// SPDX-License-Identifier: GPL-2.0
/* Copyright (c) 2018, Intel Corporation. */

/* The driver transmit and receive code */

#include <linux/prefetch.h>
#include <linux/mm.h>
#include <linux/bpf_trace.h>
#include <net/dsfield.h>
#include <net/xdp.h>
#include "ice_txrx_lib.h"
#include "ice_lib.h"
#include "ice.h"
#include "ice_trace.h"
#include "ice_dcb_lib.h"
#include "ice_xsk.h"

#define ICE_RX_HDR_SIZE		256

#define FDIR_DESC_RXDID 0x40
#define ICE_FDIR_CLEAN_DELAY 10

/**
 * ice_prgm_fdir_fltr - Program a Flow Director filter
 * @vsi: VSI to send dummy packet
 * @fdir_desc: flow director descriptor
 * @raw_packet: allocated buffer for flow director
 */
int
ice_prgm_fdir_fltr(struct ice_vsi *vsi, struct ice_fltr_desc *fdir_desc,
		   u8 *raw_packet)
{
	struct ice_tx_buf *tx_buf, *first;
	struct ice_fltr_desc *f_desc;
	struct ice_tx_desc *tx_desc;
	struct ice_ring *tx_ring;
	struct device *dev;
	dma_addr_t dma;
	u32 td_cmd;
	u16 i;

	/* VSI and Tx ring */
	if (!vsi)
		return -ENOENT;
	tx_ring = vsi->tx_rings[0];
	if (!tx_ring || !tx_ring->desc)
		return -ENOENT;
	dev = tx_ring->dev;

	/* we are using two descriptors to add/del a filter and we can wait */
	for (i = ICE_FDIR_CLEAN_DELAY; ICE_DESC_UNUSED(tx_ring) < 2; i--) {
		if (!i)
			return -EAGAIN;
		msleep_interruptible(1);
	}

	dma = dma_map_single(dev, raw_packet, ICE_FDIR_MAX_RAW_PKT_SIZE,
			     DMA_TO_DEVICE);

	if (dma_mapping_error(dev, dma))
		return -EINVAL;

	/* grab the next descriptor */
	i = tx_ring->next_to_use;
	first = &tx_ring->tx_buf[i];
	f_desc = ICE_TX_FDIRDESC(tx_ring, i);
	memcpy(f_desc, fdir_desc, sizeof(*f_desc));

	i++;
	i = (i < tx_ring->count) ? i : 0;
	tx_desc = ICE_TX_DESC(tx_ring, i);
	tx_buf = &tx_ring->tx_buf[i];

	i++;
	tx_ring->next_to_use = (i < tx_ring->count) ? i : 0;

	memset(tx_buf, 0, sizeof(*tx_buf));
	dma_unmap_len_set(tx_buf, len, ICE_FDIR_MAX_RAW_PKT_SIZE);
	dma_unmap_addr_set(tx_buf, dma, dma);

	tx_desc->buf_addr = cpu_to_le64(dma);
	td_cmd = ICE_TXD_LAST_DESC_CMD | ICE_TX_DESC_CMD_DUMMY |
		 ICE_TX_DESC_CMD_RE;

	tx_buf->tx_flags = ICE_TX_FLAGS_DUMMY_PKT;
	tx_buf->raw_buf = raw_packet;

	tx_desc->cmd_type_offset_bsz =
		ice_build_ctob(td_cmd, 0, ICE_FDIR_MAX_RAW_PKT_SIZE, 0);

	/* Force memory write to complete before letting h/w know
	 * there are new descriptors to fetch.
	 */
	wmb();

	/* mark the data descriptor to be watched */
	first->next_to_watch = tx_desc;

	writel(tx_ring->next_to_use, tx_ring->tail);

	return 0;
}
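
/* Note: raw_packet ownership moves to the ring in ice_prgm_fdir_fltr()
 * above: the buffer is flagged ICE_TX_FLAGS_DUMMY_PKT, so
 * ice_unmap_and_free_tx_buf() below releases it with devm_kfree() instead
 * of treating it as an skb.
 */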

/**
 * ice_unmap_and_free_tx_buf - Release a Tx buffer
 * @ring: the ring that owns the buffer
 * @tx_buf: the buffer to free
 */
static void
ice_unmap_and_free_tx_buf(struct ice_ring *ring, struct ice_tx_buf *tx_buf)
{
	if (tx_buf->skb) {
		if (tx_buf->tx_flags & ICE_TX_FLAGS_DUMMY_PKT)
			devm_kfree(ring->dev, tx_buf->raw_buf);
		else if (ice_ring_is_xdp(ring))
			page_frag_free(tx_buf->raw_buf);
		else
			dev_kfree_skb_any(tx_buf->skb);
		if (dma_unmap_len(tx_buf, len))
			dma_unmap_single(ring->dev,
					 dma_unmap_addr(tx_buf, dma),
					 dma_unmap_len(tx_buf, len),
					 DMA_TO_DEVICE);
	} else if (dma_unmap_len(tx_buf, len)) {
		dma_unmap_page(ring->dev,
			       dma_unmap_addr(tx_buf, dma),
			       dma_unmap_len(tx_buf, len),
			       DMA_TO_DEVICE);
	}

	tx_buf->next_to_watch = NULL;
	tx_buf->skb = NULL;
	dma_unmap_len_set(tx_buf, len, 0);
	/* tx_buf must be completely set up in the transmit path */
}

static struct netdev_queue *txring_txq(const struct ice_ring *ring)
{
	return netdev_get_tx_queue(ring->netdev, ring->q_index);
}

/**
 * ice_clean_tx_ring - Free any Tx buffers
 * @tx_ring: ring to be cleaned
 */
void ice_clean_tx_ring(struct ice_ring *tx_ring)
{
	u16 i;

	if (ice_ring_is_xdp(tx_ring) && tx_ring->xsk_pool) {
		ice_xsk_clean_xdp_ring(tx_ring);
		goto tx_skip_free;
	}

	/* ring already cleared, nothing to do */
	if (!tx_ring->tx_buf)
		return;

	/* Free all the Tx ring sk_buffs */
	for (i = 0; i < tx_ring->count; i++)
		ice_unmap_and_free_tx_buf(tx_ring, &tx_ring->tx_buf[i]);

tx_skip_free:
	memset(tx_ring->tx_buf, 0, sizeof(*tx_ring->tx_buf) * tx_ring->count);

	/* Zero out the descriptor ring */
	memset(tx_ring->desc, 0, tx_ring->size);

	tx_ring->next_to_use = 0;
	tx_ring->next_to_clean = 0;

	if (!tx_ring->netdev)
		return;

	/* cleanup Tx queue statistics */
	netdev_tx_reset_queue(txring_txq(tx_ring));
}

/**
 * ice_free_tx_ring - Free Tx resources per queue
 * @tx_ring: Tx descriptor ring for a specific queue
 *
 * Free all transmit software resources
 */
void ice_free_tx_ring(struct ice_ring *tx_ring)
{
	ice_clean_tx_ring(tx_ring);
	devm_kfree(tx_ring->dev, tx_ring->tx_buf);
	tx_ring->tx_buf = NULL;

	if (tx_ring->desc) {
		dmam_free_coherent(tx_ring->dev, tx_ring->size,
				   tx_ring->desc, tx_ring->dma);
		tx_ring->desc = NULL;
	}
}

/**
 * ice_clean_tx_irq - Reclaim resources after transmit completes
 * @tx_ring: Tx ring to clean
 * @napi_budget: Used to determine if we are in netpoll
 *
 * Returns true if there's any budget left (i.e.
the clean is finished) 204 */ 205 static bool ice_clean_tx_irq(struct ice_ring *tx_ring, int napi_budget) 206 { 207 unsigned int total_bytes = 0, total_pkts = 0; 208 unsigned int budget = ICE_DFLT_IRQ_WORK; 209 struct ice_vsi *vsi = tx_ring->vsi; 210 s16 i = tx_ring->next_to_clean; 211 struct ice_tx_desc *tx_desc; 212 struct ice_tx_buf *tx_buf; 213 214 tx_buf = &tx_ring->tx_buf[i]; 215 tx_desc = ICE_TX_DESC(tx_ring, i); 216 i -= tx_ring->count; 217 218 prefetch(&vsi->state); 219 220 do { 221 struct ice_tx_desc *eop_desc = tx_buf->next_to_watch; 222 223 /* if next_to_watch is not set then there is no work pending */ 224 if (!eop_desc) 225 break; 226 227 smp_rmb(); /* prevent any other reads prior to eop_desc */ 228 229 ice_trace(clean_tx_irq, tx_ring, tx_desc, tx_buf); 230 /* if the descriptor isn't done, no work yet to do */ 231 if (!(eop_desc->cmd_type_offset_bsz & 232 cpu_to_le64(ICE_TX_DESC_DTYPE_DESC_DONE))) 233 break; 234 235 /* clear next_to_watch to prevent false hangs */ 236 tx_buf->next_to_watch = NULL; 237 238 /* update the statistics for this packet */ 239 total_bytes += tx_buf->bytecount; 240 total_pkts += tx_buf->gso_segs; 241 242 if (ice_ring_is_xdp(tx_ring)) 243 page_frag_free(tx_buf->raw_buf); 244 else 245 /* free the skb */ 246 napi_consume_skb(tx_buf->skb, napi_budget); 247 248 /* unmap skb header data */ 249 dma_unmap_single(tx_ring->dev, 250 dma_unmap_addr(tx_buf, dma), 251 dma_unmap_len(tx_buf, len), 252 DMA_TO_DEVICE); 253 254 /* clear tx_buf data */ 255 tx_buf->skb = NULL; 256 dma_unmap_len_set(tx_buf, len, 0); 257 258 /* unmap remaining buffers */ 259 while (tx_desc != eop_desc) { 260 ice_trace(clean_tx_irq_unmap, tx_ring, tx_desc, tx_buf); 261 tx_buf++; 262 tx_desc++; 263 i++; 264 if (unlikely(!i)) { 265 i -= tx_ring->count; 266 tx_buf = tx_ring->tx_buf; 267 tx_desc = ICE_TX_DESC(tx_ring, 0); 268 } 269 270 /* unmap any remaining paged data */ 271 if (dma_unmap_len(tx_buf, len)) { 272 dma_unmap_page(tx_ring->dev, 273 dma_unmap_addr(tx_buf, dma), 274 dma_unmap_len(tx_buf, len), 275 DMA_TO_DEVICE); 276 dma_unmap_len_set(tx_buf, len, 0); 277 } 278 } 279 ice_trace(clean_tx_irq_unmap_eop, tx_ring, tx_desc, tx_buf); 280 281 /* move us one more past the eop_desc for start of next pkt */ 282 tx_buf++; 283 tx_desc++; 284 i++; 285 if (unlikely(!i)) { 286 i -= tx_ring->count; 287 tx_buf = tx_ring->tx_buf; 288 tx_desc = ICE_TX_DESC(tx_ring, 0); 289 } 290 291 prefetch(tx_desc); 292 293 /* update budget accounting */ 294 budget--; 295 } while (likely(budget)); 296 297 i += tx_ring->count; 298 tx_ring->next_to_clean = i; 299 300 ice_update_tx_ring_stats(tx_ring, total_pkts, total_bytes); 301 302 if (ice_ring_is_xdp(tx_ring)) 303 return !!budget; 304 305 netdev_tx_completed_queue(txring_txq(tx_ring), total_pkts, 306 total_bytes); 307 308 #define TX_WAKE_THRESHOLD ((s16)(DESC_NEEDED * 2)) 309 if (unlikely(total_pkts && netif_carrier_ok(tx_ring->netdev) && 310 (ICE_DESC_UNUSED(tx_ring) >= TX_WAKE_THRESHOLD))) { 311 /* Make sure that anybody stopping the queue after this 312 * sees the new next_to_clean. 
313 */ 314 smp_mb(); 315 if (__netif_subqueue_stopped(tx_ring->netdev, 316 tx_ring->q_index) && 317 !test_bit(ICE_VSI_DOWN, vsi->state)) { 318 netif_wake_subqueue(tx_ring->netdev, 319 tx_ring->q_index); 320 ++tx_ring->tx_stats.restart_q; 321 } 322 } 323 324 return !!budget; 325 } 326 327 /** 328 * ice_setup_tx_ring - Allocate the Tx descriptors 329 * @tx_ring: the Tx ring to set up 330 * 331 * Return 0 on success, negative on error 332 */ 333 int ice_setup_tx_ring(struct ice_ring *tx_ring) 334 { 335 struct device *dev = tx_ring->dev; 336 337 if (!dev) 338 return -ENOMEM; 339 340 /* warn if we are about to overwrite the pointer */ 341 WARN_ON(tx_ring->tx_buf); 342 tx_ring->tx_buf = 343 devm_kzalloc(dev, sizeof(*tx_ring->tx_buf) * tx_ring->count, 344 GFP_KERNEL); 345 if (!tx_ring->tx_buf) 346 return -ENOMEM; 347 348 /* round up to nearest page */ 349 tx_ring->size = ALIGN(tx_ring->count * sizeof(struct ice_tx_desc), 350 PAGE_SIZE); 351 tx_ring->desc = dmam_alloc_coherent(dev, tx_ring->size, &tx_ring->dma, 352 GFP_KERNEL); 353 if (!tx_ring->desc) { 354 dev_err(dev, "Unable to allocate memory for the Tx descriptor ring, size=%d\n", 355 tx_ring->size); 356 goto err; 357 } 358 359 tx_ring->next_to_use = 0; 360 tx_ring->next_to_clean = 0; 361 tx_ring->tx_stats.prev_pkt = -1; 362 return 0; 363 364 err: 365 devm_kfree(dev, tx_ring->tx_buf); 366 tx_ring->tx_buf = NULL; 367 return -ENOMEM; 368 } 369 370 /** 371 * ice_clean_rx_ring - Free Rx buffers 372 * @rx_ring: ring to be cleaned 373 */ 374 void ice_clean_rx_ring(struct ice_ring *rx_ring) 375 { 376 struct device *dev = rx_ring->dev; 377 u16 i; 378 379 /* ring already cleared, nothing to do */ 380 if (!rx_ring->rx_buf) 381 return; 382 383 if (rx_ring->skb) { 384 dev_kfree_skb(rx_ring->skb); 385 rx_ring->skb = NULL; 386 } 387 388 if (rx_ring->xsk_pool) { 389 ice_xsk_clean_rx_ring(rx_ring); 390 goto rx_skip_free; 391 } 392 393 /* Free all the Rx ring sk_buffs */ 394 for (i = 0; i < rx_ring->count; i++) { 395 struct ice_rx_buf *rx_buf = &rx_ring->rx_buf[i]; 396 397 if (!rx_buf->page) 398 continue; 399 400 /* Invalidate cache lines that may have been written to by 401 * device so that we avoid corrupting memory. 
402 */ 403 dma_sync_single_range_for_cpu(dev, rx_buf->dma, 404 rx_buf->page_offset, 405 rx_ring->rx_buf_len, 406 DMA_FROM_DEVICE); 407 408 /* free resources associated with mapping */ 409 dma_unmap_page_attrs(dev, rx_buf->dma, ice_rx_pg_size(rx_ring), 410 DMA_FROM_DEVICE, ICE_RX_DMA_ATTR); 411 __page_frag_cache_drain(rx_buf->page, rx_buf->pagecnt_bias); 412 413 rx_buf->page = NULL; 414 rx_buf->page_offset = 0; 415 } 416 417 rx_skip_free: 418 memset(rx_ring->rx_buf, 0, sizeof(*rx_ring->rx_buf) * rx_ring->count); 419 420 /* Zero out the descriptor ring */ 421 memset(rx_ring->desc, 0, rx_ring->size); 422 423 rx_ring->next_to_alloc = 0; 424 rx_ring->next_to_clean = 0; 425 rx_ring->next_to_use = 0; 426 } 427 428 /** 429 * ice_free_rx_ring - Free Rx resources 430 * @rx_ring: ring to clean the resources from 431 * 432 * Free all receive software resources 433 */ 434 void ice_free_rx_ring(struct ice_ring *rx_ring) 435 { 436 ice_clean_rx_ring(rx_ring); 437 if (rx_ring->vsi->type == ICE_VSI_PF) 438 if (xdp_rxq_info_is_reg(&rx_ring->xdp_rxq)) 439 xdp_rxq_info_unreg(&rx_ring->xdp_rxq); 440 rx_ring->xdp_prog = NULL; 441 devm_kfree(rx_ring->dev, rx_ring->rx_buf); 442 rx_ring->rx_buf = NULL; 443 444 if (rx_ring->desc) { 445 dmam_free_coherent(rx_ring->dev, rx_ring->size, 446 rx_ring->desc, rx_ring->dma); 447 rx_ring->desc = NULL; 448 } 449 } 450 451 /** 452 * ice_setup_rx_ring - Allocate the Rx descriptors 453 * @rx_ring: the Rx ring to set up 454 * 455 * Return 0 on success, negative on error 456 */ 457 int ice_setup_rx_ring(struct ice_ring *rx_ring) 458 { 459 struct device *dev = rx_ring->dev; 460 461 if (!dev) 462 return -ENOMEM; 463 464 /* warn if we are about to overwrite the pointer */ 465 WARN_ON(rx_ring->rx_buf); 466 rx_ring->rx_buf = 467 devm_kzalloc(dev, sizeof(*rx_ring->rx_buf) * rx_ring->count, 468 GFP_KERNEL); 469 if (!rx_ring->rx_buf) 470 return -ENOMEM; 471 472 /* round up to nearest page */ 473 rx_ring->size = ALIGN(rx_ring->count * sizeof(union ice_32byte_rx_desc), 474 PAGE_SIZE); 475 rx_ring->desc = dmam_alloc_coherent(dev, rx_ring->size, &rx_ring->dma, 476 GFP_KERNEL); 477 if (!rx_ring->desc) { 478 dev_err(dev, "Unable to allocate memory for the Rx descriptor ring, size=%d\n", 479 rx_ring->size); 480 goto err; 481 } 482 483 rx_ring->next_to_use = 0; 484 rx_ring->next_to_clean = 0; 485 486 if (ice_is_xdp_ena_vsi(rx_ring->vsi)) 487 WRITE_ONCE(rx_ring->xdp_prog, rx_ring->vsi->xdp_prog); 488 489 if (rx_ring->vsi->type == ICE_VSI_PF && 490 !xdp_rxq_info_is_reg(&rx_ring->xdp_rxq)) 491 if (xdp_rxq_info_reg(&rx_ring->xdp_rxq, rx_ring->netdev, 492 rx_ring->q_index, rx_ring->q_vector->napi.napi_id)) 493 goto err; 494 return 0; 495 496 err: 497 devm_kfree(dev, rx_ring->rx_buf); 498 rx_ring->rx_buf = NULL; 499 return -ENOMEM; 500 } 501 502 static unsigned int 503 ice_rx_frame_truesize(struct ice_ring *rx_ring, unsigned int __maybe_unused size) 504 { 505 unsigned int truesize; 506 507 #if (PAGE_SIZE < 8192) 508 truesize = ice_rx_pg_size(rx_ring) / 2; /* Must be power-of-2 */ 509 #else 510 truesize = rx_ring->rx_offset ? 
		   SKB_DATA_ALIGN(rx_ring->rx_offset + size) +
		   SKB_DATA_ALIGN(sizeof(struct skb_shared_info)) :
		   SKB_DATA_ALIGN(size);
#endif
	return truesize;
}

/**
 * ice_run_xdp - Executes an XDP program on initialized xdp_buff
 * @rx_ring: Rx ring
 * @xdp: xdp_buff used as input to the XDP program
 * @xdp_prog: XDP program to run
 *
 * Returns any of ICE_XDP_{PASS, CONSUMED, TX, REDIR}
 */
static int
ice_run_xdp(struct ice_ring *rx_ring, struct xdp_buff *xdp,
	    struct bpf_prog *xdp_prog)
{
	struct ice_ring *xdp_ring;
	int err, result;
	u32 act;

	act = bpf_prog_run_xdp(xdp_prog, xdp);
	switch (act) {
	case XDP_PASS:
		return ICE_XDP_PASS;
	case XDP_TX:
		xdp_ring = rx_ring->vsi->xdp_rings[smp_processor_id()];
		result = ice_xmit_xdp_buff(xdp, xdp_ring);
		if (result == ICE_XDP_CONSUMED)
			goto out_failure;
		return result;
	case XDP_REDIRECT:
		err = xdp_do_redirect(rx_ring->netdev, xdp, xdp_prog);
		if (err)
			goto out_failure;
		return ICE_XDP_REDIR;
	default:
		bpf_warn_invalid_xdp_action(act);
		fallthrough;
	case XDP_ABORTED:
out_failure:
		trace_xdp_exception(rx_ring->netdev, xdp_prog, act);
		fallthrough;
	case XDP_DROP:
		return ICE_XDP_CONSUMED;
	}
}

/**
 * ice_xdp_xmit - submit packets to XDP ring for transmission
 * @dev: netdev
 * @n: number of XDP frames to be transmitted
 * @frames: XDP frames to be transmitted
 * @flags: transmit flags
 *
 * Returns number of frames successfully sent. Failed frames
 * will be freed by the XDP core.
 * For error cases, a negative errno code is returned and no frames
 * are transmitted (caller must handle freeing frames).
 */
int
ice_xdp_xmit(struct net_device *dev, int n, struct xdp_frame **frames,
	     u32 flags)
{
	struct ice_netdev_priv *np = netdev_priv(dev);
	unsigned int queue_index = smp_processor_id();
	struct ice_vsi *vsi = np->vsi;
	struct ice_ring *xdp_ring;
	int nxmit = 0, i;

	if (test_bit(ICE_VSI_DOWN, vsi->state))
		return -ENETDOWN;

	if (!ice_is_xdp_ena_vsi(vsi) || queue_index >= vsi->num_xdp_txq)
		return -ENXIO;

	if (unlikely(flags & ~XDP_XMIT_FLAGS_MASK))
		return -EINVAL;

	xdp_ring = vsi->xdp_rings[queue_index];
	for (i = 0; i < n; i++) {
		struct xdp_frame *xdpf = frames[i];
		int err;

		err = ice_xmit_xdp_ring(xdpf->data, xdpf->len, xdp_ring);
		if (err != ICE_XDP_TX)
			break;
		nxmit++;
	}

	if (unlikely(flags & XDP_XMIT_FLUSH))
		ice_xdp_ring_update_tail(xdp_ring);

	return nxmit;
}

/**
 * ice_alloc_mapped_page - recycle or make a new page
 * @rx_ring: ring to use
 * @bi: rx_buf struct to modify
 *
 * Returns true if the page was successfully allocated or
 * reused.
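 *
 * Note: the page is DMA-mapped once here and kept for recycling. The
 * driver takes USHRT_MAX - 1 extra page references up front and tracks
 * how many it still owns in pagecnt_bias, which is what
 * ice_can_reuse_rx_page() compares against the page refcount when
 * deciding whether the page can be reused.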
616 */ 617 static bool 618 ice_alloc_mapped_page(struct ice_ring *rx_ring, struct ice_rx_buf *bi) 619 { 620 struct page *page = bi->page; 621 dma_addr_t dma; 622 623 /* since we are recycling buffers we should seldom need to alloc */ 624 if (likely(page)) 625 return true; 626 627 /* alloc new page for storage */ 628 page = dev_alloc_pages(ice_rx_pg_order(rx_ring)); 629 if (unlikely(!page)) { 630 rx_ring->rx_stats.alloc_page_failed++; 631 return false; 632 } 633 634 /* map page for use */ 635 dma = dma_map_page_attrs(rx_ring->dev, page, 0, ice_rx_pg_size(rx_ring), 636 DMA_FROM_DEVICE, ICE_RX_DMA_ATTR); 637 638 /* if mapping failed free memory back to system since 639 * there isn't much point in holding memory we can't use 640 */ 641 if (dma_mapping_error(rx_ring->dev, dma)) { 642 __free_pages(page, ice_rx_pg_order(rx_ring)); 643 rx_ring->rx_stats.alloc_page_failed++; 644 return false; 645 } 646 647 bi->dma = dma; 648 bi->page = page; 649 bi->page_offset = rx_ring->rx_offset; 650 page_ref_add(page, USHRT_MAX - 1); 651 bi->pagecnt_bias = USHRT_MAX; 652 653 return true; 654 } 655 656 /** 657 * ice_alloc_rx_bufs - Replace used receive buffers 658 * @rx_ring: ring to place buffers on 659 * @cleaned_count: number of buffers to replace 660 * 661 * Returns false if all allocations were successful, true if any fail. Returning 662 * true signals to the caller that we didn't replace cleaned_count buffers and 663 * there is more work to do. 664 * 665 * First, try to clean "cleaned_count" Rx buffers. Then refill the cleaned Rx 666 * buffers. Then bump tail at most one time. Grouping like this lets us avoid 667 * multiple tail writes per call. 668 */ 669 bool ice_alloc_rx_bufs(struct ice_ring *rx_ring, u16 cleaned_count) 670 { 671 union ice_32b_rx_flex_desc *rx_desc; 672 u16 ntu = rx_ring->next_to_use; 673 struct ice_rx_buf *bi; 674 675 /* do nothing if no valid netdev defined */ 676 if ((!rx_ring->netdev && rx_ring->vsi->type != ICE_VSI_CTRL) || 677 !cleaned_count) 678 return false; 679 680 /* get the Rx descriptor and buffer based on next_to_use */ 681 rx_desc = ICE_RX_DESC(rx_ring, ntu); 682 bi = &rx_ring->rx_buf[ntu]; 683 684 do { 685 /* if we fail here, we have work remaining */ 686 if (!ice_alloc_mapped_page(rx_ring, bi)) 687 break; 688 689 /* sync the buffer for use by the device */ 690 dma_sync_single_range_for_device(rx_ring->dev, bi->dma, 691 bi->page_offset, 692 rx_ring->rx_buf_len, 693 DMA_FROM_DEVICE); 694 695 /* Refresh the desc even if buffer_addrs didn't change 696 * because each write-back erases this info. 697 */ 698 rx_desc->read.pkt_addr = cpu_to_le64(bi->dma + bi->page_offset); 699 700 rx_desc++; 701 bi++; 702 ntu++; 703 if (unlikely(ntu == rx_ring->count)) { 704 rx_desc = ICE_RX_DESC(rx_ring, 0); 705 bi = rx_ring->rx_buf; 706 ntu = 0; 707 } 708 709 /* clear the status bits for the next_to_use descriptor */ 710 rx_desc->wb.status_error0 = 0; 711 712 cleaned_count--; 713 } while (cleaned_count); 714 715 if (rx_ring->next_to_use != ntu) 716 ice_release_rx_desc(rx_ring, ntu); 717 718 return !!cleaned_count; 719 } 720 721 /** 722 * ice_rx_buf_adjust_pg_offset - Prepare Rx buffer for reuse 723 * @rx_buf: Rx buffer to adjust 724 * @size: Size of adjustment 725 * 726 * Update the offset within page so that Rx buf will be ready to be reused. 
727 * For systems with PAGE_SIZE < 8192 this function will flip the page offset 728 * so the second half of page assigned to Rx buffer will be used, otherwise 729 * the offset is moved by "size" bytes 730 */ 731 static void 732 ice_rx_buf_adjust_pg_offset(struct ice_rx_buf *rx_buf, unsigned int size) 733 { 734 #if (PAGE_SIZE < 8192) 735 /* flip page offset to other buffer */ 736 rx_buf->page_offset ^= size; 737 #else 738 /* move offset up to the next cache line */ 739 rx_buf->page_offset += size; 740 #endif 741 } 742 743 /** 744 * ice_can_reuse_rx_page - Determine if page can be reused for another Rx 745 * @rx_buf: buffer containing the page 746 * @rx_buf_pgcnt: rx_buf page refcount pre xdp_do_redirect() call 747 * 748 * If page is reusable, we have a green light for calling ice_reuse_rx_page, 749 * which will assign the current buffer to the buffer that next_to_alloc is 750 * pointing to; otherwise, the DMA mapping needs to be destroyed and 751 * page freed 752 */ 753 static bool 754 ice_can_reuse_rx_page(struct ice_rx_buf *rx_buf, int rx_buf_pgcnt) 755 { 756 unsigned int pagecnt_bias = rx_buf->pagecnt_bias; 757 struct page *page = rx_buf->page; 758 759 /* avoid re-using remote and pfmemalloc pages */ 760 if (!dev_page_is_reusable(page)) 761 return false; 762 763 #if (PAGE_SIZE < 8192) 764 /* if we are only owner of page we can reuse it */ 765 if (unlikely((rx_buf_pgcnt - pagecnt_bias) > 1)) 766 return false; 767 #else 768 #define ICE_LAST_OFFSET \ 769 (SKB_WITH_OVERHEAD(PAGE_SIZE) - ICE_RXBUF_2048) 770 if (rx_buf->page_offset > ICE_LAST_OFFSET) 771 return false; 772 #endif /* PAGE_SIZE < 8192) */ 773 774 /* If we have drained the page fragment pool we need to update 775 * the pagecnt_bias and page count so that we fully restock the 776 * number of references the driver holds. 777 */ 778 if (unlikely(pagecnt_bias == 1)) { 779 page_ref_add(page, USHRT_MAX - 1); 780 rx_buf->pagecnt_bias = USHRT_MAX; 781 } 782 783 return true; 784 } 785 786 /** 787 * ice_add_rx_frag - Add contents of Rx buffer to sk_buff as a frag 788 * @rx_ring: Rx descriptor ring to transact packets on 789 * @rx_buf: buffer containing page to add 790 * @skb: sk_buff to place the data into 791 * @size: packet length from rx_desc 792 * 793 * This function will add the data contained in rx_buf->page to the skb. 794 * It will just attach the page as a frag to the skb. 795 * The function will then update the page offset. 
796 */ 797 static void 798 ice_add_rx_frag(struct ice_ring *rx_ring, struct ice_rx_buf *rx_buf, 799 struct sk_buff *skb, unsigned int size) 800 { 801 #if (PAGE_SIZE >= 8192) 802 unsigned int truesize = SKB_DATA_ALIGN(size + rx_ring->rx_offset); 803 #else 804 unsigned int truesize = ice_rx_pg_size(rx_ring) / 2; 805 #endif 806 807 if (!size) 808 return; 809 skb_add_rx_frag(skb, skb_shinfo(skb)->nr_frags, rx_buf->page, 810 rx_buf->page_offset, size, truesize); 811 812 /* page is being used so we must update the page offset */ 813 ice_rx_buf_adjust_pg_offset(rx_buf, truesize); 814 } 815 816 /** 817 * ice_reuse_rx_page - page flip buffer and store it back on the ring 818 * @rx_ring: Rx descriptor ring to store buffers on 819 * @old_buf: donor buffer to have page reused 820 * 821 * Synchronizes page for reuse by the adapter 822 */ 823 static void 824 ice_reuse_rx_page(struct ice_ring *rx_ring, struct ice_rx_buf *old_buf) 825 { 826 u16 nta = rx_ring->next_to_alloc; 827 struct ice_rx_buf *new_buf; 828 829 new_buf = &rx_ring->rx_buf[nta]; 830 831 /* update, and store next to alloc */ 832 nta++; 833 rx_ring->next_to_alloc = (nta < rx_ring->count) ? nta : 0; 834 835 /* Transfer page from old buffer to new buffer. 836 * Move each member individually to avoid possible store 837 * forwarding stalls and unnecessary copy of skb. 838 */ 839 new_buf->dma = old_buf->dma; 840 new_buf->page = old_buf->page; 841 new_buf->page_offset = old_buf->page_offset; 842 new_buf->pagecnt_bias = old_buf->pagecnt_bias; 843 } 844 845 /** 846 * ice_get_rx_buf - Fetch Rx buffer and synchronize data for use 847 * @rx_ring: Rx descriptor ring to transact packets on 848 * @size: size of buffer to add to skb 849 * @rx_buf_pgcnt: rx_buf page refcount 850 * 851 * This function will pull an Rx buffer from the ring and synchronize it 852 * for use by the CPU. 853 */ 854 static struct ice_rx_buf * 855 ice_get_rx_buf(struct ice_ring *rx_ring, const unsigned int size, 856 int *rx_buf_pgcnt) 857 { 858 struct ice_rx_buf *rx_buf; 859 860 rx_buf = &rx_ring->rx_buf[rx_ring->next_to_clean]; 861 *rx_buf_pgcnt = 862 #if (PAGE_SIZE < 8192) 863 page_count(rx_buf->page); 864 #else 865 0; 866 #endif 867 prefetchw(rx_buf->page); 868 869 if (!size) 870 return rx_buf; 871 /* we are reusing so sync this buffer for CPU use */ 872 dma_sync_single_range_for_cpu(rx_ring->dev, rx_buf->dma, 873 rx_buf->page_offset, size, 874 DMA_FROM_DEVICE); 875 876 /* We have pulled a buffer for use, so decrement pagecnt_bias */ 877 rx_buf->pagecnt_bias--; 878 879 return rx_buf; 880 } 881 882 /** 883 * ice_build_skb - Build skb around an existing buffer 884 * @rx_ring: Rx descriptor ring to transact packets on 885 * @rx_buf: Rx buffer to pull data from 886 * @xdp: xdp_buff pointing to the data 887 * 888 * This function builds an skb around an existing Rx buffer, taking care 889 * to set up the skb correctly and avoid any memcpy overhead. 890 */ 891 static struct sk_buff * 892 ice_build_skb(struct ice_ring *rx_ring, struct ice_rx_buf *rx_buf, 893 struct xdp_buff *xdp) 894 { 895 u8 metasize = xdp->data - xdp->data_meta; 896 #if (PAGE_SIZE < 8192) 897 unsigned int truesize = ice_rx_pg_size(rx_ring) / 2; 898 #else 899 unsigned int truesize = SKB_DATA_ALIGN(sizeof(struct skb_shared_info)) + 900 SKB_DATA_ALIGN(xdp->data_end - 901 xdp->data_hard_start); 902 #endif 903 struct sk_buff *skb; 904 905 /* Prefetch first cache line of first page. 
If xdp->data_meta
	 * is unused, this points exactly at xdp->data, otherwise we
	 * likely have a consumer accessing the first few bytes of meta
	 * data, and then the actual data.
	 */
	net_prefetch(xdp->data_meta);
	/* build an skb around the page buffer */
	skb = build_skb(xdp->data_hard_start, truesize);
	if (unlikely(!skb))
		return NULL;

	/* we must record the Rx queue, otherwise OS features such as
	 * symmetric queues won't work
	 */
	skb_record_rx_queue(skb, rx_ring->q_index);

	/* update pointers within the skb to store the data */
	skb_reserve(skb, xdp->data - xdp->data_hard_start);
	__skb_put(skb, xdp->data_end - xdp->data);
	if (metasize)
		skb_metadata_set(skb, metasize);

	/* buffer is used by skb, update page_offset */
	ice_rx_buf_adjust_pg_offset(rx_buf, truesize);

	return skb;
}

/**
 * ice_construct_skb - Allocate skb and populate it
 * @rx_ring: Rx descriptor ring to transact packets on
 * @rx_buf: Rx buffer to pull data from
 * @xdp: xdp_buff pointing to the data
 *
 * This function allocates an skb. It then populates it with the page
 * data from the current receive descriptor, taking care to set up the
 * skb correctly.
 */
static struct sk_buff *
ice_construct_skb(struct ice_ring *rx_ring, struct ice_rx_buf *rx_buf,
		  struct xdp_buff *xdp)
{
	unsigned int size = xdp->data_end - xdp->data;
	unsigned int headlen;
	struct sk_buff *skb;

	/* prefetch first cache line of first page */
	net_prefetch(xdp->data);

	/* allocate a skb to store the frags */
	skb = __napi_alloc_skb(&rx_ring->q_vector->napi, ICE_RX_HDR_SIZE,
			       GFP_ATOMIC | __GFP_NOWARN);
	if (unlikely(!skb))
		return NULL;

	skb_record_rx_queue(skb, rx_ring->q_index);
	/* Determine available headroom for copy */
	headlen = size;
	if (headlen > ICE_RX_HDR_SIZE)
		headlen = eth_get_headlen(skb->dev, xdp->data, ICE_RX_HDR_SIZE);

	/* align pull length to size of long to optimize memcpy performance */
	memcpy(__skb_put(skb, headlen), xdp->data, ALIGN(headlen,
							 sizeof(long)));

	/* if we exhaust the linear part then add what is left as a frag */
	size -= headlen;
	if (size) {
#if (PAGE_SIZE >= 8192)
		unsigned int truesize = SKB_DATA_ALIGN(size);
#else
		unsigned int truesize = ice_rx_pg_size(rx_ring) / 2;
#endif
		skb_add_rx_frag(skb, 0, rx_buf->page,
				rx_buf->page_offset + headlen, size, truesize);
		/* buffer is used by skb, update page_offset */
		ice_rx_buf_adjust_pg_offset(rx_buf, truesize);
	} else {
		/* buffer is unused, reset bias back to rx_buf; data was copied
		 * onto skb's linear part so there's no need for adjusting
		 * page offset and we can reuse this buffer as-is
		 */
		rx_buf->pagecnt_bias++;
	}

	return skb;
}

/**
 * ice_put_rx_buf - Clean up used buffer and either recycle or free
 * @rx_ring: Rx descriptor ring to transact packets on
 * @rx_buf: Rx buffer to pull data from
 * @rx_buf_pgcnt: Rx buffer page count pre xdp_do_redirect()
 *
 * This function will update next_to_clean and then clean up the contents
 * of the rx_buf. It will either recycle the buffer or unmap it and free
 * the associated resources.
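 *
 * @rx_buf may be NULL (the flow director / control VSI path in
 * ice_clean_rx_irq() passes NULL); in that case only next_to_clean is
 * advanced.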
1002 */ 1003 static void 1004 ice_put_rx_buf(struct ice_ring *rx_ring, struct ice_rx_buf *rx_buf, 1005 int rx_buf_pgcnt) 1006 { 1007 u16 ntc = rx_ring->next_to_clean + 1; 1008 1009 /* fetch, update, and store next to clean */ 1010 ntc = (ntc < rx_ring->count) ? ntc : 0; 1011 rx_ring->next_to_clean = ntc; 1012 1013 if (!rx_buf) 1014 return; 1015 1016 if (ice_can_reuse_rx_page(rx_buf, rx_buf_pgcnt)) { 1017 /* hand second half of page back to the ring */ 1018 ice_reuse_rx_page(rx_ring, rx_buf); 1019 } else { 1020 /* we are not reusing the buffer so unmap it */ 1021 dma_unmap_page_attrs(rx_ring->dev, rx_buf->dma, 1022 ice_rx_pg_size(rx_ring), DMA_FROM_DEVICE, 1023 ICE_RX_DMA_ATTR); 1024 __page_frag_cache_drain(rx_buf->page, rx_buf->pagecnt_bias); 1025 } 1026 1027 /* clear contents of buffer_info */ 1028 rx_buf->page = NULL; 1029 } 1030 1031 /** 1032 * ice_is_non_eop - process handling of non-EOP buffers 1033 * @rx_ring: Rx ring being processed 1034 * @rx_desc: Rx descriptor for current buffer 1035 * 1036 * If the buffer is an EOP buffer, this function exits returning false, 1037 * otherwise return true indicating that this is in fact a non-EOP buffer. 1038 */ 1039 static bool 1040 ice_is_non_eop(struct ice_ring *rx_ring, union ice_32b_rx_flex_desc *rx_desc) 1041 { 1042 /* if we are the last buffer then there is nothing else to do */ 1043 #define ICE_RXD_EOF BIT(ICE_RX_FLEX_DESC_STATUS0_EOF_S) 1044 if (likely(ice_test_staterr(rx_desc, ICE_RXD_EOF))) 1045 return false; 1046 1047 rx_ring->rx_stats.non_eop_descs++; 1048 1049 return true; 1050 } 1051 1052 /** 1053 * ice_clean_rx_irq - Clean completed descriptors from Rx ring - bounce buf 1054 * @rx_ring: Rx descriptor ring to transact packets on 1055 * @budget: Total limit on number of packets to process 1056 * 1057 * This function provides a "bounce buffer" approach to Rx interrupt 1058 * processing. The advantage to this is that on systems that have 1059 * expensive overhead for IOMMU access this provides a means of avoiding 1060 * it by maintaining the mapping of the page to the system. 
1061 * 1062 * Returns amount of work completed 1063 */ 1064 int ice_clean_rx_irq(struct ice_ring *rx_ring, int budget) 1065 { 1066 unsigned int total_rx_bytes = 0, total_rx_pkts = 0, frame_sz = 0; 1067 u16 cleaned_count = ICE_DESC_UNUSED(rx_ring); 1068 unsigned int offset = rx_ring->rx_offset; 1069 unsigned int xdp_res, xdp_xmit = 0; 1070 struct sk_buff *skb = rx_ring->skb; 1071 struct bpf_prog *xdp_prog = NULL; 1072 struct xdp_buff xdp; 1073 bool failure; 1074 1075 /* Frame size depend on rx_ring setup when PAGE_SIZE=4K */ 1076 #if (PAGE_SIZE < 8192) 1077 frame_sz = ice_rx_frame_truesize(rx_ring, 0); 1078 #endif 1079 xdp_init_buff(&xdp, frame_sz, &rx_ring->xdp_rxq); 1080 1081 /* start the loop to process Rx packets bounded by 'budget' */ 1082 while (likely(total_rx_pkts < (unsigned int)budget)) { 1083 union ice_32b_rx_flex_desc *rx_desc; 1084 struct ice_rx_buf *rx_buf; 1085 unsigned char *hard_start; 1086 unsigned int size; 1087 u16 stat_err_bits; 1088 int rx_buf_pgcnt; 1089 u16 vlan_tag = 0; 1090 u16 rx_ptype; 1091 1092 /* get the Rx desc from Rx ring based on 'next_to_clean' */ 1093 rx_desc = ICE_RX_DESC(rx_ring, rx_ring->next_to_clean); 1094 1095 /* status_error_len will always be zero for unused descriptors 1096 * because it's cleared in cleanup, and overlaps with hdr_addr 1097 * which is always zero because packet split isn't used, if the 1098 * hardware wrote DD then it will be non-zero 1099 */ 1100 stat_err_bits = BIT(ICE_RX_FLEX_DESC_STATUS0_DD_S); 1101 if (!ice_test_staterr(rx_desc, stat_err_bits)) 1102 break; 1103 1104 /* This memory barrier is needed to keep us from reading 1105 * any other fields out of the rx_desc until we know the 1106 * DD bit is set. 1107 */ 1108 dma_rmb(); 1109 1110 ice_trace(clean_rx_irq, rx_ring, rx_desc); 1111 if (rx_desc->wb.rxdid == FDIR_DESC_RXDID || !rx_ring->netdev) { 1112 struct ice_vsi *ctrl_vsi = rx_ring->vsi; 1113 1114 if (rx_desc->wb.rxdid == FDIR_DESC_RXDID && 1115 ctrl_vsi->vf_id != ICE_INVAL_VFID) 1116 ice_vc_fdir_irq_handler(ctrl_vsi, rx_desc); 1117 ice_put_rx_buf(rx_ring, NULL, 0); 1118 cleaned_count++; 1119 continue; 1120 } 1121 1122 size = le16_to_cpu(rx_desc->wb.pkt_len) & 1123 ICE_RX_FLX_DESC_PKT_LEN_M; 1124 1125 /* retrieve a buffer from the ring */ 1126 rx_buf = ice_get_rx_buf(rx_ring, size, &rx_buf_pgcnt); 1127 1128 if (!size) { 1129 xdp.data = NULL; 1130 xdp.data_end = NULL; 1131 xdp.data_hard_start = NULL; 1132 xdp.data_meta = NULL; 1133 goto construct_skb; 1134 } 1135 1136 hard_start = page_address(rx_buf->page) + rx_buf->page_offset - 1137 offset; 1138 xdp_prepare_buff(&xdp, hard_start, offset, size, true); 1139 #if (PAGE_SIZE > 4096) 1140 /* At larger PAGE_SIZE, frame_sz depend on len size */ 1141 xdp.frame_sz = ice_rx_frame_truesize(rx_ring, size); 1142 #endif 1143 1144 xdp_prog = READ_ONCE(rx_ring->xdp_prog); 1145 if (!xdp_prog) 1146 goto construct_skb; 1147 1148 xdp_res = ice_run_xdp(rx_ring, &xdp, xdp_prog); 1149 if (!xdp_res) 1150 goto construct_skb; 1151 if (xdp_res & (ICE_XDP_TX | ICE_XDP_REDIR)) { 1152 xdp_xmit |= xdp_res; 1153 ice_rx_buf_adjust_pg_offset(rx_buf, xdp.frame_sz); 1154 } else { 1155 rx_buf->pagecnt_bias++; 1156 } 1157 total_rx_bytes += size; 1158 total_rx_pkts++; 1159 1160 cleaned_count++; 1161 ice_put_rx_buf(rx_ring, rx_buf, rx_buf_pgcnt); 1162 continue; 1163 construct_skb: 1164 if (skb) { 1165 ice_add_rx_frag(rx_ring, rx_buf, skb, size); 1166 } else if (likely(xdp.data)) { 1167 if (ice_ring_uses_build_skb(rx_ring)) 1168 skb = ice_build_skb(rx_ring, rx_buf, &xdp); 1169 else 1170 skb = 
ice_construct_skb(rx_ring, rx_buf, &xdp); 1171 } 1172 /* exit if we failed to retrieve a buffer */ 1173 if (!skb) { 1174 rx_ring->rx_stats.alloc_buf_failed++; 1175 if (rx_buf) 1176 rx_buf->pagecnt_bias++; 1177 break; 1178 } 1179 1180 ice_put_rx_buf(rx_ring, rx_buf, rx_buf_pgcnt); 1181 cleaned_count++; 1182 1183 /* skip if it is NOP desc */ 1184 if (ice_is_non_eop(rx_ring, rx_desc)) 1185 continue; 1186 1187 stat_err_bits = BIT(ICE_RX_FLEX_DESC_STATUS0_RXE_S); 1188 if (unlikely(ice_test_staterr(rx_desc, stat_err_bits))) { 1189 dev_kfree_skb_any(skb); 1190 continue; 1191 } 1192 1193 stat_err_bits = BIT(ICE_RX_FLEX_DESC_STATUS0_L2TAG1P_S); 1194 if (ice_test_staterr(rx_desc, stat_err_bits)) 1195 vlan_tag = le16_to_cpu(rx_desc->wb.l2tag1); 1196 1197 /* pad the skb if needed, to make a valid ethernet frame */ 1198 if (eth_skb_pad(skb)) { 1199 skb = NULL; 1200 continue; 1201 } 1202 1203 /* probably a little skewed due to removing CRC */ 1204 total_rx_bytes += skb->len; 1205 1206 /* populate checksum, VLAN, and protocol */ 1207 rx_ptype = le16_to_cpu(rx_desc->wb.ptype_flex_flags0) & 1208 ICE_RX_FLEX_DESC_PTYPE_M; 1209 1210 ice_process_skb_fields(rx_ring, rx_desc, skb, rx_ptype); 1211 1212 ice_trace(clean_rx_irq_indicate, rx_ring, rx_desc, skb); 1213 /* send completed skb up the stack */ 1214 ice_receive_skb(rx_ring, skb, vlan_tag); 1215 skb = NULL; 1216 1217 /* update budget accounting */ 1218 total_rx_pkts++; 1219 } 1220 1221 /* return up to cleaned_count buffers to hardware */ 1222 failure = ice_alloc_rx_bufs(rx_ring, cleaned_count); 1223 1224 if (xdp_prog) 1225 ice_finalize_xdp_rx(rx_ring, xdp_xmit); 1226 rx_ring->skb = skb; 1227 1228 ice_update_rx_ring_stats(rx_ring, total_rx_pkts, total_rx_bytes); 1229 1230 /* guarantee a trip back through this routine if there was a failure */ 1231 return failure ? budget : (int)total_rx_pkts; 1232 } 1233 1234 /** 1235 * ice_net_dim - Update net DIM algorithm 1236 * @q_vector: the vector associated with the interrupt 1237 * 1238 * Create a DIM sample and notify net_dim() so that it can possibly decide 1239 * a new ITR value based on incoming packets, bytes, and interrupts. 1240 * 1241 * This function is a no-op if the ring is not configured to dynamic ITR. 
1242 */ 1243 static void ice_net_dim(struct ice_q_vector *q_vector) 1244 { 1245 struct ice_ring_container *tx = &q_vector->tx; 1246 struct ice_ring_container *rx = &q_vector->rx; 1247 1248 if (ITR_IS_DYNAMIC(tx)) { 1249 struct dim_sample dim_sample = {}; 1250 u64 packets = 0, bytes = 0; 1251 struct ice_ring *ring; 1252 1253 ice_for_each_ring(ring, q_vector->tx) { 1254 packets += ring->stats.pkts; 1255 bytes += ring->stats.bytes; 1256 } 1257 1258 dim_update_sample(q_vector->total_events, packets, bytes, 1259 &dim_sample); 1260 1261 net_dim(&tx->dim, dim_sample); 1262 } 1263 1264 if (ITR_IS_DYNAMIC(rx)) { 1265 struct dim_sample dim_sample = {}; 1266 u64 packets = 0, bytes = 0; 1267 struct ice_ring *ring; 1268 1269 ice_for_each_ring(ring, q_vector->rx) { 1270 packets += ring->stats.pkts; 1271 bytes += ring->stats.bytes; 1272 } 1273 1274 dim_update_sample(q_vector->total_events, packets, bytes, 1275 &dim_sample); 1276 1277 net_dim(&rx->dim, dim_sample); 1278 } 1279 } 1280 1281 /** 1282 * ice_buildreg_itr - build value for writing to the GLINT_DYN_CTL register 1283 * @itr_idx: interrupt throttling index 1284 * @itr: interrupt throttling value in usecs 1285 */ 1286 static u32 ice_buildreg_itr(u16 itr_idx, u16 itr) 1287 { 1288 /* The ITR value is reported in microseconds, and the register value is 1289 * recorded in 2 microsecond units. For this reason we only need to 1290 * shift by the GLINT_DYN_CTL_INTERVAL_S - ICE_ITR_GRAN_S to apply this 1291 * granularity as a shift instead of division. The mask makes sure the 1292 * ITR value is never odd so we don't accidentally write into the field 1293 * prior to the ITR field. 1294 */ 1295 itr &= ICE_ITR_MASK; 1296 1297 return GLINT_DYN_CTL_INTENA_M | GLINT_DYN_CTL_CLEARPBA_M | 1298 (itr_idx << GLINT_DYN_CTL_ITR_INDX_S) | 1299 (itr << (GLINT_DYN_CTL_INTERVAL_S - ICE_ITR_GRAN_S)); 1300 } 1301 1302 /** 1303 * ice_update_ena_itr - Update ITR moderation and re-enable MSI-X interrupt 1304 * @q_vector: the vector associated with the interrupt to enable 1305 * 1306 * Update the net_dim() algorithm and re-enable the interrupt associated with 1307 * this vector. 1308 * 1309 * If the VSI is down, the interrupt will not be re-enabled. 1310 */ 1311 static void ice_update_ena_itr(struct ice_q_vector *q_vector) 1312 { 1313 struct ice_vsi *vsi = q_vector->vsi; 1314 bool wb_en = q_vector->wb_on_itr; 1315 u32 itr_val; 1316 1317 if (test_bit(ICE_DOWN, vsi->state)) 1318 return; 1319 1320 /* When exiting WB_ON_ITR, let ITR resume its normal 1321 * interrupts-enabled path. 1322 */ 1323 if (wb_en) 1324 q_vector->wb_on_itr = false; 1325 1326 /* This will do nothing if dynamic updates are not enabled. */ 1327 ice_net_dim(q_vector); 1328 1329 /* net_dim() updates ITR out-of-band using a work item */ 1330 itr_val = ice_buildreg_itr(ICE_ITR_NONE, 0); 1331 /* trigger an immediate software interrupt when exiting 1332 * busy poll, to make sure to catch any pending cleanups 1333 * that might have been missed due to interrupt state 1334 * transition. 1335 */ 1336 if (wb_en) { 1337 itr_val |= GLINT_DYN_CTL_SWINT_TRIG_M | 1338 GLINT_DYN_CTL_SW_ITR_INDX_M | 1339 GLINT_DYN_CTL_SW_ITR_INDX_ENA_M; 1340 } 1341 wr32(&vsi->back->hw, GLINT_DYN_CTL(q_vector->reg_idx), itr_val); 1342 } 1343 1344 /** 1345 * ice_set_wb_on_itr - set WB_ON_ITR for this q_vector 1346 * @q_vector: q_vector to set WB_ON_ITR on 1347 * 1348 * We need to tell hardware to write-back completed descriptors even when 1349 * interrupts are disabled. 
Descriptors will be written back on cache line 1350 * boundaries without WB_ON_ITR enabled, but if we don't enable WB_ON_ITR 1351 * descriptors may not be written back if they don't fill a cache line until 1352 * the next interrupt. 1353 * 1354 * This sets the write-back frequency to whatever was set previously for the 1355 * ITR indices. Also, set the INTENA_MSK bit to make sure hardware knows we 1356 * aren't meddling with the INTENA_M bit. 1357 */ 1358 static void ice_set_wb_on_itr(struct ice_q_vector *q_vector) 1359 { 1360 struct ice_vsi *vsi = q_vector->vsi; 1361 1362 /* already in wb_on_itr mode no need to change it */ 1363 if (q_vector->wb_on_itr) 1364 return; 1365 1366 /* use previously set ITR values for all of the ITR indices by 1367 * specifying ICE_ITR_NONE, which will vary in adaptive (AIM) mode and 1368 * be static in non-adaptive mode (user configured) 1369 */ 1370 wr32(&vsi->back->hw, GLINT_DYN_CTL(q_vector->reg_idx), 1371 ((ICE_ITR_NONE << GLINT_DYN_CTL_ITR_INDX_S) & 1372 GLINT_DYN_CTL_ITR_INDX_M) | GLINT_DYN_CTL_INTENA_MSK_M | 1373 GLINT_DYN_CTL_WB_ON_ITR_M); 1374 1375 q_vector->wb_on_itr = true; 1376 } 1377 1378 /** 1379 * ice_napi_poll - NAPI polling Rx/Tx cleanup routine 1380 * @napi: napi struct with our devices info in it 1381 * @budget: amount of work driver is allowed to do this pass, in packets 1382 * 1383 * This function will clean all queues associated with a q_vector. 1384 * 1385 * Returns the amount of work done 1386 */ 1387 int ice_napi_poll(struct napi_struct *napi, int budget) 1388 { 1389 struct ice_q_vector *q_vector = 1390 container_of(napi, struct ice_q_vector, napi); 1391 bool clean_complete = true; 1392 struct ice_ring *ring; 1393 int budget_per_ring; 1394 int work_done = 0; 1395 1396 /* Since the actual Tx work is minimal, we can give the Tx a larger 1397 * budget and be more aggressive about cleaning up the Tx descriptors. 1398 */ 1399 ice_for_each_ring(ring, q_vector->tx) { 1400 bool wd = ring->xsk_pool ? 1401 ice_clean_tx_irq_zc(ring, budget) : 1402 ice_clean_tx_irq(ring, budget); 1403 1404 if (!wd) 1405 clean_complete = false; 1406 } 1407 1408 /* Handle case where we are called by netpoll with a budget of 0 */ 1409 if (unlikely(budget <= 0)) 1410 return budget; 1411 1412 /* normally we have 1 Rx ring per q_vector */ 1413 if (unlikely(q_vector->num_ring_rx > 1)) 1414 /* We attempt to distribute budget to each Rx queue fairly, but 1415 * don't allow the budget to go below 1 because that would exit 1416 * polling early. 1417 */ 1418 budget_per_ring = max_t(int, budget / q_vector->num_ring_rx, 1); 1419 else 1420 /* Max of 1 Rx ring in this q_vector so give it the budget */ 1421 budget_per_ring = budget; 1422 1423 ice_for_each_ring(ring, q_vector->rx) { 1424 int cleaned; 1425 1426 /* A dedicated path for zero-copy allows making a single 1427 * comparison in the irq context instead of many inside the 1428 * ice_clean_rx_irq function and makes the codebase cleaner. 1429 */ 1430 cleaned = ring->xsk_pool ? 1431 ice_clean_rx_irq_zc(ring, budget_per_ring) : 1432 ice_clean_rx_irq(ring, budget_per_ring); 1433 work_done += cleaned; 1434 /* if we clean as many as budgeted, we must not be done */ 1435 if (cleaned >= budget_per_ring) 1436 clean_complete = false; 1437 } 1438 1439 /* If work not completed, return budget and polling will return */ 1440 if (!clean_complete) { 1441 /* Set the writeback on ITR so partial completions of 1442 * cache-lines will still continue even if we're polling. 
1443 */ 1444 ice_set_wb_on_itr(q_vector); 1445 return budget; 1446 } 1447 1448 /* Exit the polling mode, but don't re-enable interrupts if stack might 1449 * poll us due to busy-polling 1450 */ 1451 if (likely(napi_complete_done(napi, work_done))) 1452 ice_update_ena_itr(q_vector); 1453 else 1454 ice_set_wb_on_itr(q_vector); 1455 1456 return min_t(int, work_done, budget - 1); 1457 } 1458 1459 /** 1460 * __ice_maybe_stop_tx - 2nd level check for Tx stop conditions 1461 * @tx_ring: the ring to be checked 1462 * @size: the size buffer we want to assure is available 1463 * 1464 * Returns -EBUSY if a stop is needed, else 0 1465 */ 1466 static int __ice_maybe_stop_tx(struct ice_ring *tx_ring, unsigned int size) 1467 { 1468 netif_stop_subqueue(tx_ring->netdev, tx_ring->q_index); 1469 /* Memory barrier before checking head and tail */ 1470 smp_mb(); 1471 1472 /* Check again in a case another CPU has just made room available. */ 1473 if (likely(ICE_DESC_UNUSED(tx_ring) < size)) 1474 return -EBUSY; 1475 1476 /* A reprieve! - use start_subqueue because it doesn't call schedule */ 1477 netif_start_subqueue(tx_ring->netdev, tx_ring->q_index); 1478 ++tx_ring->tx_stats.restart_q; 1479 return 0; 1480 } 1481 1482 /** 1483 * ice_maybe_stop_tx - 1st level check for Tx stop conditions 1484 * @tx_ring: the ring to be checked 1485 * @size: the size buffer we want to assure is available 1486 * 1487 * Returns 0 if stop is not needed 1488 */ 1489 static int ice_maybe_stop_tx(struct ice_ring *tx_ring, unsigned int size) 1490 { 1491 if (likely(ICE_DESC_UNUSED(tx_ring) >= size)) 1492 return 0; 1493 1494 return __ice_maybe_stop_tx(tx_ring, size); 1495 } 1496 1497 /** 1498 * ice_tx_map - Build the Tx descriptor 1499 * @tx_ring: ring to send buffer on 1500 * @first: first buffer info buffer to use 1501 * @off: pointer to struct that holds offload parameters 1502 * 1503 * This function loops over the skb data pointed to by *first 1504 * and gets a physical address for each memory location and programs 1505 * it and the length into the transmit descriptor. 
1506 */ 1507 static void 1508 ice_tx_map(struct ice_ring *tx_ring, struct ice_tx_buf *first, 1509 struct ice_tx_offload_params *off) 1510 { 1511 u64 td_offset, td_tag, td_cmd; 1512 u16 i = tx_ring->next_to_use; 1513 unsigned int data_len, size; 1514 struct ice_tx_desc *tx_desc; 1515 struct ice_tx_buf *tx_buf; 1516 struct sk_buff *skb; 1517 skb_frag_t *frag; 1518 dma_addr_t dma; 1519 1520 td_tag = off->td_l2tag1; 1521 td_cmd = off->td_cmd; 1522 td_offset = off->td_offset; 1523 skb = first->skb; 1524 1525 data_len = skb->data_len; 1526 size = skb_headlen(skb); 1527 1528 tx_desc = ICE_TX_DESC(tx_ring, i); 1529 1530 if (first->tx_flags & ICE_TX_FLAGS_HW_VLAN) { 1531 td_cmd |= (u64)ICE_TX_DESC_CMD_IL2TAG1; 1532 td_tag = (first->tx_flags & ICE_TX_FLAGS_VLAN_M) >> 1533 ICE_TX_FLAGS_VLAN_S; 1534 } 1535 1536 dma = dma_map_single(tx_ring->dev, skb->data, size, DMA_TO_DEVICE); 1537 1538 tx_buf = first; 1539 1540 for (frag = &skb_shinfo(skb)->frags[0];; frag++) { 1541 unsigned int max_data = ICE_MAX_DATA_PER_TXD_ALIGNED; 1542 1543 if (dma_mapping_error(tx_ring->dev, dma)) 1544 goto dma_error; 1545 1546 /* record length, and DMA address */ 1547 dma_unmap_len_set(tx_buf, len, size); 1548 dma_unmap_addr_set(tx_buf, dma, dma); 1549 1550 /* align size to end of page */ 1551 max_data += -dma & (ICE_MAX_READ_REQ_SIZE - 1); 1552 tx_desc->buf_addr = cpu_to_le64(dma); 1553 1554 /* account for data chunks larger than the hardware 1555 * can handle 1556 */ 1557 while (unlikely(size > ICE_MAX_DATA_PER_TXD)) { 1558 tx_desc->cmd_type_offset_bsz = 1559 ice_build_ctob(td_cmd, td_offset, max_data, 1560 td_tag); 1561 1562 tx_desc++; 1563 i++; 1564 1565 if (i == tx_ring->count) { 1566 tx_desc = ICE_TX_DESC(tx_ring, 0); 1567 i = 0; 1568 } 1569 1570 dma += max_data; 1571 size -= max_data; 1572 1573 max_data = ICE_MAX_DATA_PER_TXD_ALIGNED; 1574 tx_desc->buf_addr = cpu_to_le64(dma); 1575 } 1576 1577 if (likely(!data_len)) 1578 break; 1579 1580 tx_desc->cmd_type_offset_bsz = ice_build_ctob(td_cmd, td_offset, 1581 size, td_tag); 1582 1583 tx_desc++; 1584 i++; 1585 1586 if (i == tx_ring->count) { 1587 tx_desc = ICE_TX_DESC(tx_ring, 0); 1588 i = 0; 1589 } 1590 1591 size = skb_frag_size(frag); 1592 data_len -= size; 1593 1594 dma = skb_frag_dma_map(tx_ring->dev, frag, 0, size, 1595 DMA_TO_DEVICE); 1596 1597 tx_buf = &tx_ring->tx_buf[i]; 1598 } 1599 1600 /* record bytecount for BQL */ 1601 netdev_tx_sent_queue(txring_txq(tx_ring), first->bytecount); 1602 1603 /* record SW timestamp if HW timestamp is not available */ 1604 skb_tx_timestamp(first->skb); 1605 1606 i++; 1607 if (i == tx_ring->count) 1608 i = 0; 1609 1610 /* write last descriptor with RS and EOP bits */ 1611 td_cmd |= (u64)ICE_TXD_LAST_DESC_CMD; 1612 tx_desc->cmd_type_offset_bsz = 1613 ice_build_ctob(td_cmd, td_offset, size, td_tag); 1614 1615 /* Force memory writes to complete before letting h/w know there 1616 * are new descriptors to fetch. 1617 * 1618 * We also use this memory barrier to make certain all of the 1619 * status bits have been updated before next_to_watch is written. 
1620 */ 1621 wmb(); 1622 1623 /* set next_to_watch value indicating a packet is present */ 1624 first->next_to_watch = tx_desc; 1625 1626 tx_ring->next_to_use = i; 1627 1628 ice_maybe_stop_tx(tx_ring, DESC_NEEDED); 1629 1630 /* notify HW of packet */ 1631 if (netif_xmit_stopped(txring_txq(tx_ring)) || !netdev_xmit_more()) 1632 writel(i, tx_ring->tail); 1633 1634 return; 1635 1636 dma_error: 1637 /* clear DMA mappings for failed tx_buf map */ 1638 for (;;) { 1639 tx_buf = &tx_ring->tx_buf[i]; 1640 ice_unmap_and_free_tx_buf(tx_ring, tx_buf); 1641 if (tx_buf == first) 1642 break; 1643 if (i == 0) 1644 i = tx_ring->count; 1645 i--; 1646 } 1647 1648 tx_ring->next_to_use = i; 1649 } 1650 1651 /** 1652 * ice_tx_csum - Enable Tx checksum offloads 1653 * @first: pointer to the first descriptor 1654 * @off: pointer to struct that holds offload parameters 1655 * 1656 * Returns 0 or error (negative) if checksum offload can't happen, 1 otherwise. 1657 */ 1658 static 1659 int ice_tx_csum(struct ice_tx_buf *first, struct ice_tx_offload_params *off) 1660 { 1661 u32 l4_len = 0, l3_len = 0, l2_len = 0; 1662 struct sk_buff *skb = first->skb; 1663 union { 1664 struct iphdr *v4; 1665 struct ipv6hdr *v6; 1666 unsigned char *hdr; 1667 } ip; 1668 union { 1669 struct tcphdr *tcp; 1670 unsigned char *hdr; 1671 } l4; 1672 __be16 frag_off, protocol; 1673 unsigned char *exthdr; 1674 u32 offset, cmd = 0; 1675 u8 l4_proto = 0; 1676 1677 if (skb->ip_summed != CHECKSUM_PARTIAL) 1678 return 0; 1679 1680 ip.hdr = skb_network_header(skb); 1681 l4.hdr = skb_transport_header(skb); 1682 1683 /* compute outer L2 header size */ 1684 l2_len = ip.hdr - skb->data; 1685 offset = (l2_len / 2) << ICE_TX_DESC_LEN_MACLEN_S; 1686 1687 protocol = vlan_get_protocol(skb); 1688 1689 if (protocol == htons(ETH_P_IP)) 1690 first->tx_flags |= ICE_TX_FLAGS_IPV4; 1691 else if (protocol == htons(ETH_P_IPV6)) 1692 first->tx_flags |= ICE_TX_FLAGS_IPV6; 1693 1694 if (skb->encapsulation) { 1695 bool gso_ena = false; 1696 u32 tunnel = 0; 1697 1698 /* define outer network header type */ 1699 if (first->tx_flags & ICE_TX_FLAGS_IPV4) { 1700 tunnel |= (first->tx_flags & ICE_TX_FLAGS_TSO) ? 
1701 ICE_TX_CTX_EIPT_IPV4 : 1702 ICE_TX_CTX_EIPT_IPV4_NO_CSUM; 1703 l4_proto = ip.v4->protocol; 1704 } else if (first->tx_flags & ICE_TX_FLAGS_IPV6) { 1705 int ret; 1706 1707 tunnel |= ICE_TX_CTX_EIPT_IPV6; 1708 exthdr = ip.hdr + sizeof(*ip.v6); 1709 l4_proto = ip.v6->nexthdr; 1710 ret = ipv6_skip_exthdr(skb, exthdr - skb->data, 1711 &l4_proto, &frag_off); 1712 if (ret < 0) 1713 return -1; 1714 } 1715 1716 /* define outer transport */ 1717 switch (l4_proto) { 1718 case IPPROTO_UDP: 1719 tunnel |= ICE_TXD_CTX_UDP_TUNNELING; 1720 first->tx_flags |= ICE_TX_FLAGS_TUNNEL; 1721 break; 1722 case IPPROTO_GRE: 1723 tunnel |= ICE_TXD_CTX_GRE_TUNNELING; 1724 first->tx_flags |= ICE_TX_FLAGS_TUNNEL; 1725 break; 1726 case IPPROTO_IPIP: 1727 case IPPROTO_IPV6: 1728 first->tx_flags |= ICE_TX_FLAGS_TUNNEL; 1729 l4.hdr = skb_inner_network_header(skb); 1730 break; 1731 default: 1732 if (first->tx_flags & ICE_TX_FLAGS_TSO) 1733 return -1; 1734 1735 skb_checksum_help(skb); 1736 return 0; 1737 } 1738 1739 /* compute outer L3 header size */ 1740 tunnel |= ((l4.hdr - ip.hdr) / 4) << 1741 ICE_TXD_CTX_QW0_EIPLEN_S; 1742 1743 /* switch IP header pointer from outer to inner header */ 1744 ip.hdr = skb_inner_network_header(skb); 1745 1746 /* compute tunnel header size */ 1747 tunnel |= ((ip.hdr - l4.hdr) / 2) << 1748 ICE_TXD_CTX_QW0_NATLEN_S; 1749 1750 gso_ena = skb_shinfo(skb)->gso_type & SKB_GSO_PARTIAL; 1751 /* indicate if we need to offload outer UDP header */ 1752 if ((first->tx_flags & ICE_TX_FLAGS_TSO) && !gso_ena && 1753 (skb_shinfo(skb)->gso_type & SKB_GSO_UDP_TUNNEL_CSUM)) 1754 tunnel |= ICE_TXD_CTX_QW0_L4T_CS_M; 1755 1756 /* record tunnel offload values */ 1757 off->cd_tunnel_params |= tunnel; 1758 1759 /* set DTYP=1 to indicate that it's an Tx context descriptor 1760 * in IPsec tunnel mode with Tx offloads in Quad word 1 1761 */ 1762 off->cd_qw1 |= (u64)ICE_TX_DESC_DTYPE_CTX; 1763 1764 /* switch L4 header pointer from outer to inner */ 1765 l4.hdr = skb_inner_transport_header(skb); 1766 l4_proto = 0; 1767 1768 /* reset type as we transition from outer to inner headers */ 1769 first->tx_flags &= ~(ICE_TX_FLAGS_IPV4 | ICE_TX_FLAGS_IPV6); 1770 if (ip.v4->version == 4) 1771 first->tx_flags |= ICE_TX_FLAGS_IPV4; 1772 if (ip.v6->version == 6) 1773 first->tx_flags |= ICE_TX_FLAGS_IPV6; 1774 } 1775 1776 /* Enable IP checksum offloads */ 1777 if (first->tx_flags & ICE_TX_FLAGS_IPV4) { 1778 l4_proto = ip.v4->protocol; 1779 /* the stack computes the IP header already, the only time we 1780 * need the hardware to recompute it is in the case of TSO. 
		 */
		if (first->tx_flags & ICE_TX_FLAGS_TSO)
			cmd |= ICE_TX_DESC_CMD_IIPT_IPV4_CSUM;
		else
			cmd |= ICE_TX_DESC_CMD_IIPT_IPV4;

	} else if (first->tx_flags & ICE_TX_FLAGS_IPV6) {
		cmd |= ICE_TX_DESC_CMD_IIPT_IPV6;
		exthdr = ip.hdr + sizeof(*ip.v6);
		l4_proto = ip.v6->nexthdr;
		if (l4.hdr != exthdr)
			ipv6_skip_exthdr(skb, exthdr - skb->data, &l4_proto,
					 &frag_off);
	} else {
		return -1;
	}

	/* compute inner L3 header size */
	l3_len = l4.hdr - ip.hdr;
	offset |= (l3_len / 4) << ICE_TX_DESC_LEN_IPLEN_S;

	/* Enable L4 checksum offloads */
	switch (l4_proto) {
	case IPPROTO_TCP:
		/* enable checksum offloads */
		cmd |= ICE_TX_DESC_CMD_L4T_EOFT_TCP;
		l4_len = l4.tcp->doff;
		offset |= l4_len << ICE_TX_DESC_LEN_L4_LEN_S;
		break;
	case IPPROTO_UDP:
		/* enable UDP checksum offload */
		cmd |= ICE_TX_DESC_CMD_L4T_EOFT_UDP;
		l4_len = (sizeof(struct udphdr) >> 2);
		offset |= l4_len << ICE_TX_DESC_LEN_L4_LEN_S;
		break;
	case IPPROTO_SCTP:
		/* enable SCTP checksum offload */
		cmd |= ICE_TX_DESC_CMD_L4T_EOFT_SCTP;
		l4_len = sizeof(struct sctphdr) >> 2;
		offset |= l4_len << ICE_TX_DESC_LEN_L4_LEN_S;
		break;

	default:
		if (first->tx_flags & ICE_TX_FLAGS_TSO)
			return -1;
		skb_checksum_help(skb);
		return 0;
	}

	off->td_cmd |= cmd;
	off->td_offset |= offset;
	return 1;
}

/**
 * ice_tx_prepare_vlan_flags - prepare generic Tx VLAN tagging flags for HW
 * @tx_ring: ring to send buffer on
 * @first: pointer to struct ice_tx_buf
 *
 * Checks the skb and sets up the corresponding generic transmit flags
 * related to VLAN tagging for the HW, such as VLAN and DCB.
 */
static void
ice_tx_prepare_vlan_flags(struct ice_ring *tx_ring, struct ice_tx_buf *first)
{
	struct sk_buff *skb = first->skb;

	/* nothing left to do, software offloaded VLAN */
	if (!skb_vlan_tag_present(skb) && eth_type_vlan(skb->protocol))
		return;

	/* currently, we always assume 802.1Q for VLAN insertion as VLAN
	 * insertion for 802.1AD is not supported
	 */
	if (skb_vlan_tag_present(skb)) {
		first->tx_flags |= skb_vlan_tag_get(skb) << ICE_TX_FLAGS_VLAN_S;
		first->tx_flags |= ICE_TX_FLAGS_HW_VLAN;
	}

	ice_tx_prepare_vlan_flags_dcb(tx_ring, first);
}

/**
 * ice_tso - computes mss and TSO length to prepare for TSO
 * @first: pointer to struct ice_tx_buf
 * @off: pointer to struct that holds offload parameters
 *
 * Returns 0 or error (negative) if TSO can't happen, 1 otherwise.
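 *
 * (A return of 0 means no TSO is needed for this skb, a negative value is
 * an error from skb_cow_head(), and 1 means the TSO context descriptor
 * parameters were recorded in @off, mirroring ice_tx_csum() above.)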
1869 */ 1870 static 1871 int ice_tso(struct ice_tx_buf *first, struct ice_tx_offload_params *off) 1872 { 1873 struct sk_buff *skb = first->skb; 1874 union { 1875 struct iphdr *v4; 1876 struct ipv6hdr *v6; 1877 unsigned char *hdr; 1878 } ip; 1879 union { 1880 struct tcphdr *tcp; 1881 struct udphdr *udp; 1882 unsigned char *hdr; 1883 } l4; 1884 u64 cd_mss, cd_tso_len; 1885 u32 paylen; 1886 u8 l4_start; 1887 int err; 1888 1889 if (skb->ip_summed != CHECKSUM_PARTIAL) 1890 return 0; 1891 1892 if (!skb_is_gso(skb)) 1893 return 0; 1894 1895 err = skb_cow_head(skb, 0); 1896 if (err < 0) 1897 return err; 1898 1899 /* cppcheck-suppress unreadVariable */ 1900 ip.hdr = skb_network_header(skb); 1901 l4.hdr = skb_transport_header(skb); 1902 1903 /* initialize outer IP header fields */ 1904 if (ip.v4->version == 4) { 1905 ip.v4->tot_len = 0; 1906 ip.v4->check = 0; 1907 } else { 1908 ip.v6->payload_len = 0; 1909 } 1910 1911 if (skb_shinfo(skb)->gso_type & (SKB_GSO_GRE | 1912 SKB_GSO_GRE_CSUM | 1913 SKB_GSO_IPXIP4 | 1914 SKB_GSO_IPXIP6 | 1915 SKB_GSO_UDP_TUNNEL | 1916 SKB_GSO_UDP_TUNNEL_CSUM)) { 1917 if (!(skb_shinfo(skb)->gso_type & SKB_GSO_PARTIAL) && 1918 (skb_shinfo(skb)->gso_type & SKB_GSO_UDP_TUNNEL_CSUM)) { 1919 l4.udp->len = 0; 1920 1921 /* determine offset of outer transport header */ 1922 l4_start = (u8)(l4.hdr - skb->data); 1923 1924 /* remove payload length from outer checksum */ 1925 paylen = skb->len - l4_start; 1926 csum_replace_by_diff(&l4.udp->check, 1927 (__force __wsum)htonl(paylen)); 1928 } 1929 1930 /* reset pointers to inner headers */ 1931 1932 /* cppcheck-suppress unreadVariable */ 1933 ip.hdr = skb_inner_network_header(skb); 1934 l4.hdr = skb_inner_transport_header(skb); 1935 1936 /* initialize inner IP header fields */ 1937 if (ip.v4->version == 4) { 1938 ip.v4->tot_len = 0; 1939 ip.v4->check = 0; 1940 } else { 1941 ip.v6->payload_len = 0; 1942 } 1943 } 1944 1945 /* determine offset of transport header */ 1946 l4_start = (u8)(l4.hdr - skb->data); 1947 1948 /* remove payload length from checksum */ 1949 paylen = skb->len - l4_start; 1950 1951 if (skb_shinfo(skb)->gso_type & SKB_GSO_UDP_L4) { 1952 csum_replace_by_diff(&l4.udp->check, 1953 (__force __wsum)htonl(paylen)); 1954 /* compute length of UDP segmentation header */ 1955 off->header_len = (u8)sizeof(l4.udp) + l4_start; 1956 } else { 1957 csum_replace_by_diff(&l4.tcp->check, 1958 (__force __wsum)htonl(paylen)); 1959 /* compute length of TCP segmentation header */ 1960 off->header_len = (u8)((l4.tcp->doff * 4) + l4_start); 1961 } 1962 1963 /* update gso_segs and bytecount */ 1964 first->gso_segs = skb_shinfo(skb)->gso_segs; 1965 first->bytecount += (first->gso_segs - 1) * off->header_len; 1966 1967 cd_tso_len = skb->len - off->header_len; 1968 cd_mss = skb_shinfo(skb)->gso_size; 1969 1970 /* record cdesc_qw1 with TSO parameters */ 1971 off->cd_qw1 |= (u64)(ICE_TX_DESC_DTYPE_CTX | 1972 (ICE_TX_CTX_DESC_TSO << ICE_TXD_CTX_QW1_CMD_S) | 1973 (cd_tso_len << ICE_TXD_CTX_QW1_TSO_LEN_S) | 1974 (cd_mss << ICE_TXD_CTX_QW1_MSS_S)); 1975 first->tx_flags |= ICE_TX_FLAGS_TSO; 1976 return 1; 1977 } 1978 1979 /** 1980 * ice_txd_use_count - estimate the number of descriptors needed for Tx 1981 * @size: transmit request size in bytes 1982 * 1983 * Due to hardware alignment restrictions (4K alignment), we need to 1984 * assume that we can have no more than 12K of data per descriptor, even 1985 * though each descriptor can take up to 16K - 1 bytes of aligned memory. 1986 * Thus, we need to divide by 12K. But division is slow! 
/**
 * ice_txd_use_count - estimate the number of descriptors needed for Tx
 * @size: transmit request size in bytes
 *
 * Due to hardware alignment restrictions (4K alignment), we need to
 * assume that we can have no more than 12K of data per descriptor, even
 * though each descriptor can take up to 16K - 1 bytes of aligned memory.
 * Thus, we need to divide by 12K. But division is slow! Instead,
 * we decompose the operation into shifts and one relatively cheap
 * multiply operation.
 *
 * To divide by 12K, we first divide by 4K, then divide by 3:
 *     To divide by 4K, shift right by 12 bits
 *     To divide by 3, multiply by 85, then divide by 256
 *     (Divide by 256 is done by shifting right by 8 bits)
 * Finally, we add one to round up. Because 256 isn't an exact multiple of
 * 3, we'll underestimate near each multiple of 12K. This is actually more
 * accurate as we have 4K - 1 of wiggle room that we can fit into the last
 * segment. For our purposes this is accurate out to 1M which is orders of
 * magnitude greater than our largest possible GSO size.
 *
 * This would then be implemented as:
 *     return (((size >> 12) * 85) >> 8) + ICE_DESCS_FOR_SKB_DATA_PTR;
 *
 * Since multiplication and division are commutative, we can reorder
 * operations into:
 *     return ((size * 85) >> 20) + ICE_DESCS_FOR_SKB_DATA_PTR;
 */
static unsigned int ice_txd_use_count(unsigned int size)
{
	return ((size * 85) >> 20) + ICE_DESCS_FOR_SKB_DATA_PTR;
}
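/*
 * Quick sanity check of the approximation above (illustrative arithmetic,
 * with ICE_DESCS_FOR_SKB_DATA_PTR == 1):
 *
 *	ice_txd_use_count(4096)  = (( 4096 * 85) >> 20) + 1 = 0 + 1 = 1
 *	ice_txd_use_count(13000) = ((13000 * 85) >> 20) + 1 = 1 + 1 = 2
 *	ice_txd_use_count(24576) = ((24576 * 85) >> 20) + 1 = 1 + 1 = 2
 *
 * The last case is the documented underestimate right at a multiple of 12K:
 * a strict divide-by-12K-and-add-one would claim 3, but 24576 bytes really
 * do fit in two descriptors since each can hold up to 16K - 1 aligned bytes.
 */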
/**
 * ice_xmit_desc_count - calculate number of Tx descriptors needed
 * @skb: send buffer
 *
 * Returns number of data descriptors needed for this skb.
 */
static unsigned int ice_xmit_desc_count(struct sk_buff *skb)
{
	const skb_frag_t *frag = &skb_shinfo(skb)->frags[0];
	unsigned int nr_frags = skb_shinfo(skb)->nr_frags;
	unsigned int count = 0, size = skb_headlen(skb);

	for (;;) {
		count += ice_txd_use_count(size);

		if (!nr_frags--)
			break;

		size = skb_frag_size(frag++);
	}

	return count;
}

/**
 * __ice_chk_linearize - Check if there are more than 8 buffers per packet
 * @skb: send buffer
 *
 * Note: This HW can't DMA more than 8 buffers to build a packet on the wire
 * and so we need to figure out the cases where we need to linearize the skb.
 *
 * For TSO we need to count the TSO header and segment payload separately.
 * As such we need to check cases where we have 7 fragments or more as we
 * can potentially require 9 DMA transactions, 1 for the TSO header, 1 for
 * the segment payload in the first descriptor, and another 7 for the
 * fragments.
 */
static bool __ice_chk_linearize(struct sk_buff *skb)
{
	const skb_frag_t *frag, *stale;
	int nr_frags, sum;

	/* no need to check if number of frags is less than 7 */
	nr_frags = skb_shinfo(skb)->nr_frags;
	if (nr_frags < (ICE_MAX_BUF_TXD - 1))
		return false;

	/* We need to walk through the list and validate that each group
	 * of 6 fragments totals at least gso_size.
	 */
	nr_frags -= ICE_MAX_BUF_TXD - 2;
	frag = &skb_shinfo(skb)->frags[0];

	/* Initialize sum to 1 - gso_size, i.e. the negative of
	 * (gso_size - 1). We use this as the worst case scenario in which
	 * the frag ahead of us only provides one byte, which is why we are
	 * limited to 6 descriptors for a single transmit as the header and
	 * previous fragment are already consuming 2 descriptors.
	 */
	sum = 1 - skb_shinfo(skb)->gso_size;

	/* Add size of frags 0 through 4 to create our initial sum */
	sum += skb_frag_size(frag++);
	sum += skb_frag_size(frag++);
	sum += skb_frag_size(frag++);
	sum += skb_frag_size(frag++);
	sum += skb_frag_size(frag++);

	/* Walk through fragments adding latest fragment, testing it, and
	 * then removing stale fragments from the sum.
	 */
	for (stale = &skb_shinfo(skb)->frags[0];; stale++) {
		int stale_size = skb_frag_size(stale);

		sum += skb_frag_size(frag++);

		/* The stale fragment may present us with a smaller
		 * descriptor than the actual fragment size. To account
		 * for that we need to remove all the data on the front and
		 * figure out what the remainder would be in the last
		 * descriptor associated with the fragment.
		 */
		if (stale_size > ICE_MAX_DATA_PER_TXD) {
			int align_pad = -(skb_frag_off(stale)) &
					(ICE_MAX_READ_REQ_SIZE - 1);

			sum -= align_pad;
			stale_size -= align_pad;

			do {
				sum -= ICE_MAX_DATA_PER_TXD_ALIGNED;
				stale_size -= ICE_MAX_DATA_PER_TXD_ALIGNED;
			} while (stale_size > ICE_MAX_DATA_PER_TXD);
		}

		/* if sum is negative we failed to make sufficient progress */
		if (sum < 0)
			return true;

		if (!nr_frags--)
			break;

		sum -= stale_size;
	}

	return false;
}
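/*
 * Illustrative case for the sliding-window check above (assumed sizes): a
 * TSO skb with gso_size 4000 and eight 512 byte fragments. After seeding
 * sum = 1 - 4000 and adding fragments 0 through 4 (5 * 512 = 2560), the
 * first loop iteration adds fragment 5 and leaves sum at -927: no window of
 * six consecutive fragments can carry a full 4000 byte segment, so a single
 * segment could need more than 8 DMA buffers and the skb is linearized.
 */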
/**
 * ice_chk_linearize - Check if there are more than 8 fragments per packet
 * @skb: send buffer
 * @count: number of buffers used
 *
 * Note: Our HW can't scatter-gather more than 8 fragments to build
 * a packet on the wire and so we need to figure out the cases where we
 * need to linearize the skb.
 */
static bool ice_chk_linearize(struct sk_buff *skb, unsigned int count)
{
	/* Both TSO and single send will work if count is less than 8 */
	if (likely(count < ICE_MAX_BUF_TXD))
		return false;

	if (skb_is_gso(skb))
		return __ice_chk_linearize(skb);

	/* we can support up to 8 data buffers for a single send */
	return count != ICE_MAX_BUF_TXD;
}

/**
 * ice_tstamp - set up context descriptor for hardware timestamp
 * @tx_ring: pointer to the Tx ring to send buffer on
 * @skb: pointer to the SKB we're sending
 * @first: Tx buffer
 * @off: Tx offload parameters
 */
static void
ice_tstamp(struct ice_ring *tx_ring, struct sk_buff *skb,
	   struct ice_tx_buf *first, struct ice_tx_offload_params *off)
{
	s8 idx;

	/* only timestamp the outbound packet if the user has requested it */
	if (likely(!(skb_shinfo(skb)->tx_flags & SKBTX_HW_TSTAMP)))
		return;

	if (!tx_ring->ptp_tx)
		return;

	/* Tx timestamps cannot be sampled when doing TSO */
	if (first->tx_flags & ICE_TX_FLAGS_TSO)
		return;

	/* Grab an open timestamp slot */
	idx = ice_ptp_request_ts(tx_ring->tx_tstamps, skb);
	if (idx < 0)
		return;

	off->cd_qw1 |= (u64)(ICE_TX_DESC_DTYPE_CTX |
			     (ICE_TX_CTX_DESC_TSYN << ICE_TXD_CTX_QW1_CMD_S) |
			     ((u64)idx << ICE_TXD_CTX_QW1_TSO_LEN_S));
	first->tx_flags |= ICE_TX_FLAGS_TSYN;
}
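/*
 * Note on the TSYN encoding above: when the TSYN command is set, the
 * timestamp slot index returned by ice_ptp_request_ts() is carried in the
 * same QW1 bits that the TSO path uses for the TSO length
 * (ICE_TXD_CTX_QW1_TSO_LEN_S), so the PTP code can later match the latched
 * hardware timestamp for that slot back to the stored skb.
 */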
/**
 * ice_xmit_frame_ring - Sends buffer on Tx ring
 * @skb: send buffer
 * @tx_ring: ring to send buffer on
 *
 * Returns NETDEV_TX_OK if sent, else an error code
 */
static netdev_tx_t
ice_xmit_frame_ring(struct sk_buff *skb, struct ice_ring *tx_ring)
{
	struct ice_tx_offload_params offload = { 0 };
	struct ice_vsi *vsi = tx_ring->vsi;
	struct ice_tx_buf *first;
	struct ethhdr *eth;
	unsigned int count;
	int tso, csum;

	ice_trace(xmit_frame_ring, tx_ring, skb);

	count = ice_xmit_desc_count(skb);
	if (ice_chk_linearize(skb, count)) {
		if (__skb_linearize(skb))
			goto out_drop;
		count = ice_txd_use_count(skb->len);
		tx_ring->tx_stats.tx_linearize++;
	}

	/* need: 1 descriptor per page * PAGE_SIZE/ICE_MAX_DATA_PER_TXD,
	 *       + 1 desc for skb_head_len/ICE_MAX_DATA_PER_TXD,
	 *       + 4 desc gap to avoid the cache line where head is,
	 *       + 1 desc for context descriptor,
	 * otherwise try next time
	 */
	if (ice_maybe_stop_tx(tx_ring, count + ICE_DESCS_PER_CACHE_LINE +
			      ICE_DESCS_FOR_CTX_DESC)) {
		tx_ring->tx_stats.tx_busy++;
		return NETDEV_TX_BUSY;
	}

	offload.tx_ring = tx_ring;

	/* record the location of the first descriptor for this packet */
	first = &tx_ring->tx_buf[tx_ring->next_to_use];
	first->skb = skb;
	first->bytecount = max_t(unsigned int, skb->len, ETH_ZLEN);
	first->gso_segs = 1;
	first->tx_flags = 0;

	/* prepare the VLAN tagging flags for Tx */
	ice_tx_prepare_vlan_flags(tx_ring, first);

	/* set up TSO offload */
	tso = ice_tso(first, &offload);
	if (tso < 0)
		goto out_drop;

	/* always set up Tx checksum offload */
	csum = ice_tx_csum(first, &offload);
	if (csum < 0)
		goto out_drop;

	/* allow CONTROL frames egress from main VSI if FW LLDP disabled */
	eth = (struct ethhdr *)skb_mac_header(skb);
	if (unlikely((skb->priority == TC_PRIO_CONTROL ||
		      eth->h_proto == htons(ETH_P_LLDP)) &&
		     vsi->type == ICE_VSI_PF &&
		     vsi->port_info->qos_cfg.is_sw_lldp))
		offload.cd_qw1 |= (u64)(ICE_TX_DESC_DTYPE_CTX |
					ICE_TX_CTX_DESC_SWTCH_UPLINK <<
					ICE_TXD_CTX_QW1_CMD_S);

	ice_tstamp(tx_ring, skb, first, &offload);

	if (offload.cd_qw1 & ICE_TX_DESC_DTYPE_CTX) {
		struct ice_tx_ctx_desc *cdesc;
		u16 i = tx_ring->next_to_use;

		/* grab the next descriptor */
		cdesc = ICE_TX_CTX_DESC(tx_ring, i);
		i++;
		tx_ring->next_to_use = (i < tx_ring->count) ? i : 0;

		/* setup context descriptor */
		cdesc->tunneling_params = cpu_to_le32(offload.cd_tunnel_params);
		cdesc->l2tag2 = cpu_to_le16(offload.cd_l2tag2);
		cdesc->rsvd = cpu_to_le16(0);
		cdesc->qw1 = cpu_to_le64(offload.cd_qw1);
	}

	ice_tx_map(tx_ring, first, &offload);
	return NETDEV_TX_OK;

out_drop:
	ice_trace(xmit_frame_ring_drop, tx_ring, skb);
	dev_kfree_skb_any(skb);
	return NETDEV_TX_OK;
}

/**
 * ice_start_xmit - Selects the correct VSI and Tx queue to send buffer
 * @skb: send buffer
 * @netdev: network interface device structure
 *
 * Returns NETDEV_TX_OK if sent, else an error code
 */
netdev_tx_t ice_start_xmit(struct sk_buff *skb, struct net_device *netdev)
{
	struct ice_netdev_priv *np = netdev_priv(netdev);
	struct ice_vsi *vsi = np->vsi;
	struct ice_ring *tx_ring;

	tx_ring = vsi->tx_rings[skb->queue_mapping];

	/* hardware can't handle really short frames, hardware padding works
	 * beyond this point
	 */
	if (skb_put_padto(skb, ICE_MIN_TX_LEN))
		return NETDEV_TX_OK;

	return ice_xmit_frame_ring(skb, tx_ring);
}

/**
 * ice_get_dscp_up - return the UP/TC value for a SKB
 * @dcbcfg: DCB config that contains DSCP to UP/TC mapping
 * @skb: SKB to query for info to determine UP/TC
 *
 * This function should only be called when the PF is in L3 DSCP PFC mode.
 */
static u8 ice_get_dscp_up(struct ice_dcbx_cfg *dcbcfg, struct sk_buff *skb)
{
	u8 dscp = 0;

	if (skb->protocol == htons(ETH_P_IP))
		dscp = ipv4_get_dsfield(ip_hdr(skb)) >> 2;
	else if (skb->protocol == htons(ETH_P_IPV6))
		dscp = ipv6_get_dsfield(ipv6_hdr(skb)) >> 2;

	return dcbcfg->dscp_map[dscp];
}

/**
 * ice_select_queue - select a Tx queue for the given skb
 * @netdev: network interface device structure
 * @skb: send buffer
 * @sb_dev: subordinate device, if any
 *
 * When the PF is in DSCP PFC mode, derive skb->priority from the packet's
 * DSCP field before deferring to the standard queue selection.
 */
u16
ice_select_queue(struct net_device *netdev, struct sk_buff *skb,
		 struct net_device *sb_dev)
{
	struct ice_pf *pf = ice_netdev_to_pf(netdev);
	struct ice_dcbx_cfg *dcbcfg;

	dcbcfg = &pf->hw.port_info->qos_cfg.local_dcbx_cfg;
	if (dcbcfg->pfc_mode == ICE_QOS_MODE_DSCP)
		skb->priority = ice_get_dscp_up(dcbcfg, skb);

	return netdev_pick_tx(netdev, skb, sb_dev);
}
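/*
 * Illustrative DSCP mapping (table contents assumed): an IPv4 packet whose
 * ToS/DS byte is 0xb8 carries DSCP 46 (0xb8 >> 2), so in DSCP PFC mode
 * ice_select_queue() sets skb->priority = dcbcfg->dscp_map[46] before
 * handing queue selection to netdev_pick_tx().
 */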
/**
 * ice_clean_ctrl_tx_irq - interrupt handler for flow director Tx queue
 * @tx_ring: tx_ring to clean
 */
void ice_clean_ctrl_tx_irq(struct ice_ring *tx_ring)
{
	struct ice_vsi *vsi = tx_ring->vsi;
	s16 i = tx_ring->next_to_clean;
	int budget = ICE_DFLT_IRQ_WORK;
	struct ice_tx_desc *tx_desc;
	struct ice_tx_buf *tx_buf;

	tx_buf = &tx_ring->tx_buf[i];
	tx_desc = ICE_TX_DESC(tx_ring, i);
	i -= tx_ring->count;

	do {
		struct ice_tx_desc *eop_desc = tx_buf->next_to_watch;

		/* if next_to_watch is not set then there is no pending work */
		if (!eop_desc)
			break;

		/* prevent any other reads prior to eop_desc */
		smp_rmb();

		/* if the descriptor isn't done, no work to do */
		if (!(eop_desc->cmd_type_offset_bsz &
		      cpu_to_le64(ICE_TX_DESC_DTYPE_DESC_DONE)))
			break;

		/* clear next_to_watch to prevent false hangs */
		tx_buf->next_to_watch = NULL;
		tx_desc->buf_addr = 0;
		tx_desc->cmd_type_offset_bsz = 0;

		/* move past filter desc */
		tx_buf++;
		tx_desc++;
		i++;
		if (unlikely(!i)) {
			i -= tx_ring->count;
			tx_buf = tx_ring->tx_buf;
			tx_desc = ICE_TX_DESC(tx_ring, 0);
		}

		/* unmap the data header */
		if (dma_unmap_len(tx_buf, len))
			dma_unmap_single(tx_ring->dev,
					 dma_unmap_addr(tx_buf, dma),
					 dma_unmap_len(tx_buf, len),
					 DMA_TO_DEVICE);
		if (tx_buf->tx_flags & ICE_TX_FLAGS_DUMMY_PKT)
			devm_kfree(tx_ring->dev, tx_buf->raw_buf);

		/* reset the buffer and descriptor for reuse */
		tx_buf->raw_buf = NULL;
		tx_buf->tx_flags = 0;
		tx_buf->next_to_watch = NULL;
		dma_unmap_len_set(tx_buf, len, 0);
		tx_desc->buf_addr = 0;
		tx_desc->cmd_type_offset_bsz = 0;

		/* move past eop_desc for start of next FD desc */
		tx_buf++;
		tx_desc++;
		i++;
		if (unlikely(!i)) {
			i -= tx_ring->count;
			tx_buf = tx_ring->tx_buf;
			tx_desc = ICE_TX_DESC(tx_ring, 0);
		}

		budget--;
	} while (likely(budget));

	i += tx_ring->count;
	tx_ring->next_to_clean = i;

	/* re-enable interrupt if needed */
	ice_irq_dynamic_ena(&vsi->back->hw, vsi, vsi->q_vectors[0]);
}