1 /**************************************************************************** 2 * Driver for Solarflare network controllers and boards 3 * Copyright 2005-2006 Fen Systems Ltd. 4 * Copyright 2005-2013 Solarflare Communications Inc. 5 * 6 * This program is free software; you can redistribute it and/or modify it 7 * under the terms of the GNU General Public License version 2 as published 8 * by the Free Software Foundation, incorporated herein by reference. 9 */ 10 11 #include <linux/pci.h> 12 #include <linux/tcp.h> 13 #include <linux/ip.h> 14 #include <linux/in.h> 15 #include <linux/ipv6.h> 16 #include <linux/slab.h> 17 #include <net/ipv6.h> 18 #include <linux/if_ether.h> 19 #include <linux/highmem.h> 20 #include "net_driver.h" 21 #include "efx.h" 22 #include "nic.h" 23 #include "workarounds.h" 24 25 static void efx_dequeue_buffer(struct efx_tx_queue *tx_queue, 26 struct efx_tx_buffer *buffer, 27 unsigned int *pkts_compl, 28 unsigned int *bytes_compl) 29 { 30 if (buffer->unmap_len) { 31 struct device *dma_dev = &tx_queue->efx->pci_dev->dev; 32 dma_addr_t unmap_addr = (buffer->dma_addr + buffer->len - 33 buffer->unmap_len); 34 if (buffer->flags & EFX_TX_BUF_MAP_SINGLE) 35 dma_unmap_single(dma_dev, unmap_addr, buffer->unmap_len, 36 DMA_TO_DEVICE); 37 else 38 dma_unmap_page(dma_dev, unmap_addr, buffer->unmap_len, 39 DMA_TO_DEVICE); 40 buffer->unmap_len = 0; 41 } 42 43 if (buffer->flags & EFX_TX_BUF_SKB) { 44 (*pkts_compl)++; 45 (*bytes_compl) += buffer->skb->len; 46 dev_kfree_skb_any((struct sk_buff *) buffer->skb); 47 netif_vdbg(tx_queue->efx, tx_done, tx_queue->efx->net_dev, 48 "TX queue %d transmission id %x complete\n", 49 tx_queue->queue, tx_queue->read_count); 50 } else if (buffer->flags & EFX_TX_BUF_HEAP) { 51 kfree(buffer->heap_buf); 52 } 53 54 buffer->len = 0; 55 buffer->flags = 0; 56 } 57 58 static int efx_enqueue_skb_tso(struct efx_tx_queue *tx_queue, 59 struct sk_buff *skb); 60 61 static inline unsigned 62 efx_max_tx_len(struct efx_nic *efx, dma_addr_t dma_addr) 63 { 64 /* Depending on the NIC revision, we can use descriptor 65 * lengths up to 8K or 8K-1. However, since PCI Express 66 * devices must split read requests at 4K boundaries, there is 67 * little benefit from using descriptors that cross those 68 * boundaries and we keep things simple by not doing so. 69 */ 70 unsigned len = (~dma_addr & (EFX_PAGE_SIZE - 1)) + 1; 71 72 /* Work around hardware bug for unaligned buffers. */ 73 if (EFX_WORKAROUND_5391(efx) && (dma_addr & 0xf)) 74 len = min_t(unsigned, len, 512 - (dma_addr & 0xf)); 75 76 return len; 77 } 78 79 unsigned int efx_tx_max_skb_descs(struct efx_nic *efx) 80 { 81 /* Header and payload descriptor for each output segment, plus 82 * one for every input fragment boundary within a segment 83 */ 84 unsigned int max_descs = EFX_TSO_MAX_SEGS * 2 + MAX_SKB_FRAGS; 85 86 /* Possibly one more per segment for the alignment workaround */ 87 if (EFX_WORKAROUND_5391(efx)) 88 max_descs += EFX_TSO_MAX_SEGS; 89 90 /* Possibly more for PCIe page boundaries within input fragments */ 91 if (PAGE_SIZE > EFX_PAGE_SIZE) 92 max_descs += max_t(unsigned int, MAX_SKB_FRAGS, 93 DIV_ROUND_UP(GSO_MAX_SIZE, EFX_PAGE_SIZE)); 94 95 return max_descs; 96 } 97 98 /* Get partner of a TX queue, seen as part of the same net core queue */ 99 static struct efx_tx_queue *efx_tx_queue_partner(struct efx_tx_queue *tx_queue) 100 { 101 if (tx_queue->queue & EFX_TXQ_TYPE_OFFLOAD) 102 return tx_queue - EFX_TXQ_TYPE_OFFLOAD; 103 else 104 return tx_queue + EFX_TXQ_TYPE_OFFLOAD; 105 } 106 107 static void efx_tx_maybe_stop_queue(struct efx_tx_queue *txq1) 108 { 109 /* We need to consider both queues that the net core sees as one */ 110 struct efx_tx_queue *txq2 = efx_tx_queue_partner(txq1); 111 struct efx_nic *efx = txq1->efx; 112 unsigned int fill_level; 113 114 fill_level = max(txq1->insert_count - txq1->old_read_count, 115 txq2->insert_count - txq2->old_read_count); 116 if (likely(fill_level < efx->txq_stop_thresh)) 117 return; 118 119 /* We used the stale old_read_count above, which gives us a 120 * pessimistic estimate of the fill level (which may even 121 * validly be >= efx->txq_entries). Now try again using 122 * read_count (more likely to be a cache miss). 123 * 124 * If we read read_count and then conditionally stop the 125 * queue, it is possible for the completion path to race with 126 * us and complete all outstanding descriptors in the middle, 127 * after which there will be no more completions to wake it. 128 * Therefore we stop the queue first, then read read_count 129 * (with a memory barrier to ensure the ordering), then 130 * restart the queue if the fill level turns out to be low 131 * enough. 132 */ 133 netif_tx_stop_queue(txq1->core_txq); 134 smp_mb(); 135 txq1->old_read_count = ACCESS_ONCE(txq1->read_count); 136 txq2->old_read_count = ACCESS_ONCE(txq2->read_count); 137 138 fill_level = max(txq1->insert_count - txq1->old_read_count, 139 txq2->insert_count - txq2->old_read_count); 140 EFX_BUG_ON_PARANOID(fill_level >= efx->txq_entries); 141 if (likely(fill_level < efx->txq_stop_thresh)) { 142 smp_mb(); 143 if (likely(!efx->loopback_selftest)) 144 netif_tx_start_queue(txq1->core_txq); 145 } 146 } 147 148 /* 149 * Add a socket buffer to a TX queue 150 * 151 * This maps all fragments of a socket buffer for DMA and adds them to 152 * the TX queue. The queue's insert pointer will be incremented by 153 * the number of fragments in the socket buffer. 154 * 155 * If any DMA mapping fails, any mapped fragments will be unmapped, 156 * the queue's insert pointer will be restored to its original value. 157 * 158 * This function is split out from efx_hard_start_xmit to allow the 159 * loopback test to direct packets via specific TX queues. 160 * 161 * Returns NETDEV_TX_OK. 162 * You must hold netif_tx_lock() to call this function. 163 */ 164 netdev_tx_t efx_enqueue_skb(struct efx_tx_queue *tx_queue, struct sk_buff *skb) 165 { 166 struct efx_nic *efx = tx_queue->efx; 167 struct device *dma_dev = &efx->pci_dev->dev; 168 struct efx_tx_buffer *buffer; 169 skb_frag_t *fragment; 170 unsigned int len, unmap_len = 0, insert_ptr; 171 dma_addr_t dma_addr, unmap_addr = 0; 172 unsigned int dma_len; 173 unsigned short dma_flags; 174 int i = 0; 175 176 EFX_BUG_ON_PARANOID(tx_queue->write_count != tx_queue->insert_count); 177 178 if (skb_shinfo(skb)->gso_size) 179 return efx_enqueue_skb_tso(tx_queue, skb); 180 181 /* Get size of the initial fragment */ 182 len = skb_headlen(skb); 183 184 /* Pad if necessary */ 185 if (EFX_WORKAROUND_15592(efx) && skb->len <= 32) { 186 EFX_BUG_ON_PARANOID(skb->data_len); 187 len = 32 + 1; 188 if (skb_pad(skb, len - skb->len)) 189 return NETDEV_TX_OK; 190 } 191 192 /* Map for DMA. Use dma_map_single rather than dma_map_page 193 * since this is more efficient on machines with sparse 194 * memory. 195 */ 196 dma_flags = EFX_TX_BUF_MAP_SINGLE; 197 dma_addr = dma_map_single(dma_dev, skb->data, len, PCI_DMA_TODEVICE); 198 199 /* Process all fragments */ 200 while (1) { 201 if (unlikely(dma_mapping_error(dma_dev, dma_addr))) 202 goto dma_err; 203 204 /* Store fields for marking in the per-fragment final 205 * descriptor */ 206 unmap_len = len; 207 unmap_addr = dma_addr; 208 209 /* Add to TX queue, splitting across DMA boundaries */ 210 do { 211 insert_ptr = tx_queue->insert_count & tx_queue->ptr_mask; 212 buffer = &tx_queue->buffer[insert_ptr]; 213 EFX_BUG_ON_PARANOID(buffer->flags); 214 EFX_BUG_ON_PARANOID(buffer->len); 215 EFX_BUG_ON_PARANOID(buffer->unmap_len); 216 217 dma_len = efx_max_tx_len(efx, dma_addr); 218 if (likely(dma_len >= len)) 219 dma_len = len; 220 221 /* Fill out per descriptor fields */ 222 buffer->len = dma_len; 223 buffer->dma_addr = dma_addr; 224 buffer->flags = EFX_TX_BUF_CONT; 225 len -= dma_len; 226 dma_addr += dma_len; 227 ++tx_queue->insert_count; 228 } while (len); 229 230 /* Transfer ownership of the unmapping to the final buffer */ 231 buffer->flags = EFX_TX_BUF_CONT | dma_flags; 232 buffer->unmap_len = unmap_len; 233 unmap_len = 0; 234 235 /* Get address and size of next fragment */ 236 if (i >= skb_shinfo(skb)->nr_frags) 237 break; 238 fragment = &skb_shinfo(skb)->frags[i]; 239 len = skb_frag_size(fragment); 240 i++; 241 /* Map for DMA */ 242 dma_flags = 0; 243 dma_addr = skb_frag_dma_map(dma_dev, fragment, 0, len, 244 DMA_TO_DEVICE); 245 } 246 247 /* Transfer ownership of the skb to the final buffer */ 248 buffer->skb = skb; 249 buffer->flags = EFX_TX_BUF_SKB | dma_flags; 250 251 netdev_tx_sent_queue(tx_queue->core_txq, skb->len); 252 253 /* Pass off to hardware */ 254 efx_nic_push_buffers(tx_queue); 255 256 efx_tx_maybe_stop_queue(tx_queue); 257 258 return NETDEV_TX_OK; 259 260 dma_err: 261 netif_err(efx, tx_err, efx->net_dev, 262 " TX queue %d could not map skb with %d bytes %d " 263 "fragments for DMA\n", tx_queue->queue, skb->len, 264 skb_shinfo(skb)->nr_frags + 1); 265 266 /* Mark the packet as transmitted, and free the SKB ourselves */ 267 dev_kfree_skb_any(skb); 268 269 /* Work backwards until we hit the original insert pointer value */ 270 while (tx_queue->insert_count != tx_queue->write_count) { 271 unsigned int pkts_compl = 0, bytes_compl = 0; 272 --tx_queue->insert_count; 273 insert_ptr = tx_queue->insert_count & tx_queue->ptr_mask; 274 buffer = &tx_queue->buffer[insert_ptr]; 275 efx_dequeue_buffer(tx_queue, buffer, &pkts_compl, &bytes_compl); 276 } 277 278 /* Free the fragment we were mid-way through pushing */ 279 if (unmap_len) { 280 if (dma_flags & EFX_TX_BUF_MAP_SINGLE) 281 dma_unmap_single(dma_dev, unmap_addr, unmap_len, 282 DMA_TO_DEVICE); 283 else 284 dma_unmap_page(dma_dev, unmap_addr, unmap_len, 285 DMA_TO_DEVICE); 286 } 287 288 return NETDEV_TX_OK; 289 } 290 291 /* Remove packets from the TX queue 292 * 293 * This removes packets from the TX queue, up to and including the 294 * specified index. 295 */ 296 static void efx_dequeue_buffers(struct efx_tx_queue *tx_queue, 297 unsigned int index, 298 unsigned int *pkts_compl, 299 unsigned int *bytes_compl) 300 { 301 struct efx_nic *efx = tx_queue->efx; 302 unsigned int stop_index, read_ptr; 303 304 stop_index = (index + 1) & tx_queue->ptr_mask; 305 read_ptr = tx_queue->read_count & tx_queue->ptr_mask; 306 307 while (read_ptr != stop_index) { 308 struct efx_tx_buffer *buffer = &tx_queue->buffer[read_ptr]; 309 310 if (!(buffer->flags & EFX_TX_BUF_OPTION) && 311 unlikely(buffer->len == 0)) { 312 netif_err(efx, tx_err, efx->net_dev, 313 "TX queue %d spurious TX completion id %x\n", 314 tx_queue->queue, read_ptr); 315 efx_schedule_reset(efx, RESET_TYPE_TX_SKIP); 316 return; 317 } 318 319 efx_dequeue_buffer(tx_queue, buffer, pkts_compl, bytes_compl); 320 321 ++tx_queue->read_count; 322 read_ptr = tx_queue->read_count & tx_queue->ptr_mask; 323 } 324 } 325 326 /* Initiate a packet transmission. We use one channel per CPU 327 * (sharing when we have more CPUs than channels). On Falcon, the TX 328 * completion events will be directed back to the CPU that transmitted 329 * the packet, which should be cache-efficient. 330 * 331 * Context: non-blocking. 332 * Note that returning anything other than NETDEV_TX_OK will cause the 333 * OS to free the skb. 334 */ 335 netdev_tx_t efx_hard_start_xmit(struct sk_buff *skb, 336 struct net_device *net_dev) 337 { 338 struct efx_nic *efx = netdev_priv(net_dev); 339 struct efx_tx_queue *tx_queue; 340 unsigned index, type; 341 342 EFX_WARN_ON_PARANOID(!netif_device_present(net_dev)); 343 344 /* PTP "event" packet */ 345 if (unlikely(efx_xmit_with_hwtstamp(skb)) && 346 unlikely(efx_ptp_is_ptp_tx(efx, skb))) { 347 return efx_ptp_tx(efx, skb); 348 } 349 350 index = skb_get_queue_mapping(skb); 351 type = skb->ip_summed == CHECKSUM_PARTIAL ? EFX_TXQ_TYPE_OFFLOAD : 0; 352 if (index >= efx->n_tx_channels) { 353 index -= efx->n_tx_channels; 354 type |= EFX_TXQ_TYPE_HIGHPRI; 355 } 356 tx_queue = efx_get_tx_queue(efx, index, type); 357 358 return efx_enqueue_skb(tx_queue, skb); 359 } 360 361 void efx_init_tx_queue_core_txq(struct efx_tx_queue *tx_queue) 362 { 363 struct efx_nic *efx = tx_queue->efx; 364 365 /* Must be inverse of queue lookup in efx_hard_start_xmit() */ 366 tx_queue->core_txq = 367 netdev_get_tx_queue(efx->net_dev, 368 tx_queue->queue / EFX_TXQ_TYPES + 369 ((tx_queue->queue & EFX_TXQ_TYPE_HIGHPRI) ? 370 efx->n_tx_channels : 0)); 371 } 372 373 int efx_setup_tc(struct net_device *net_dev, u8 num_tc) 374 { 375 struct efx_nic *efx = netdev_priv(net_dev); 376 struct efx_channel *channel; 377 struct efx_tx_queue *tx_queue; 378 unsigned tc; 379 int rc; 380 381 if (efx_nic_rev(efx) < EFX_REV_FALCON_B0 || num_tc > EFX_MAX_TX_TC) 382 return -EINVAL; 383 384 if (num_tc == net_dev->num_tc) 385 return 0; 386 387 for (tc = 0; tc < num_tc; tc++) { 388 net_dev->tc_to_txq[tc].offset = tc * efx->n_tx_channels; 389 net_dev->tc_to_txq[tc].count = efx->n_tx_channels; 390 } 391 392 if (num_tc > net_dev->num_tc) { 393 /* Initialise high-priority queues as necessary */ 394 efx_for_each_channel(channel, efx) { 395 efx_for_each_possible_channel_tx_queue(tx_queue, 396 channel) { 397 if (!(tx_queue->queue & EFX_TXQ_TYPE_HIGHPRI)) 398 continue; 399 if (!tx_queue->buffer) { 400 rc = efx_probe_tx_queue(tx_queue); 401 if (rc) 402 return rc; 403 } 404 if (!tx_queue->initialised) 405 efx_init_tx_queue(tx_queue); 406 efx_init_tx_queue_core_txq(tx_queue); 407 } 408 } 409 } else { 410 /* Reduce number of classes before number of queues */ 411 net_dev->num_tc = num_tc; 412 } 413 414 rc = netif_set_real_num_tx_queues(net_dev, 415 max_t(int, num_tc, 1) * 416 efx->n_tx_channels); 417 if (rc) 418 return rc; 419 420 /* Do not destroy high-priority queues when they become 421 * unused. We would have to flush them first, and it is 422 * fairly difficult to flush a subset of TX queues. Leave 423 * it to efx_fini_channels(). 424 */ 425 426 net_dev->num_tc = num_tc; 427 return 0; 428 } 429 430 void efx_xmit_done(struct efx_tx_queue *tx_queue, unsigned int index) 431 { 432 unsigned fill_level; 433 struct efx_nic *efx = tx_queue->efx; 434 struct efx_tx_queue *txq2; 435 unsigned int pkts_compl = 0, bytes_compl = 0; 436 437 EFX_BUG_ON_PARANOID(index > tx_queue->ptr_mask); 438 439 efx_dequeue_buffers(tx_queue, index, &pkts_compl, &bytes_compl); 440 netdev_tx_completed_queue(tx_queue->core_txq, pkts_compl, bytes_compl); 441 442 if (pkts_compl > 1) 443 ++tx_queue->merge_events; 444 445 /* See if we need to restart the netif queue. This memory 446 * barrier ensures that we write read_count (inside 447 * efx_dequeue_buffers()) before reading the queue status. 448 */ 449 smp_mb(); 450 if (unlikely(netif_tx_queue_stopped(tx_queue->core_txq)) && 451 likely(efx->port_enabled) && 452 likely(netif_device_present(efx->net_dev))) { 453 txq2 = efx_tx_queue_partner(tx_queue); 454 fill_level = max(tx_queue->insert_count - tx_queue->read_count, 455 txq2->insert_count - txq2->read_count); 456 if (fill_level <= efx->txq_wake_thresh) 457 netif_tx_wake_queue(tx_queue->core_txq); 458 } 459 460 /* Check whether the hardware queue is now empty */ 461 if ((int)(tx_queue->read_count - tx_queue->old_write_count) >= 0) { 462 tx_queue->old_write_count = ACCESS_ONCE(tx_queue->write_count); 463 if (tx_queue->read_count == tx_queue->old_write_count) { 464 smp_mb(); 465 tx_queue->empty_read_count = 466 tx_queue->read_count | EFX_EMPTY_COUNT_VALID; 467 } 468 } 469 } 470 471 /* Size of page-based TSO header buffers. Larger blocks must be 472 * allocated from the heap. 473 */ 474 #define TSOH_STD_SIZE 128 475 #define TSOH_PER_PAGE (PAGE_SIZE / TSOH_STD_SIZE) 476 477 /* At most half the descriptors in the queue at any time will refer to 478 * a TSO header buffer, since they must always be followed by a 479 * payload descriptor referring to an skb. 480 */ 481 static unsigned int efx_tsoh_page_count(struct efx_tx_queue *tx_queue) 482 { 483 return DIV_ROUND_UP(tx_queue->ptr_mask + 1, 2 * TSOH_PER_PAGE); 484 } 485 486 int efx_probe_tx_queue(struct efx_tx_queue *tx_queue) 487 { 488 struct efx_nic *efx = tx_queue->efx; 489 unsigned int entries; 490 int rc; 491 492 /* Create the smallest power-of-two aligned ring */ 493 entries = max(roundup_pow_of_two(efx->txq_entries), EFX_MIN_DMAQ_SIZE); 494 EFX_BUG_ON_PARANOID(entries > EFX_MAX_DMAQ_SIZE); 495 tx_queue->ptr_mask = entries - 1; 496 497 netif_dbg(efx, probe, efx->net_dev, 498 "creating TX queue %d size %#x mask %#x\n", 499 tx_queue->queue, efx->txq_entries, tx_queue->ptr_mask); 500 501 /* Allocate software ring */ 502 tx_queue->buffer = kcalloc(entries, sizeof(*tx_queue->buffer), 503 GFP_KERNEL); 504 if (!tx_queue->buffer) 505 return -ENOMEM; 506 507 if (tx_queue->queue & EFX_TXQ_TYPE_OFFLOAD) { 508 tx_queue->tsoh_page = 509 kcalloc(efx_tsoh_page_count(tx_queue), 510 sizeof(tx_queue->tsoh_page[0]), GFP_KERNEL); 511 if (!tx_queue->tsoh_page) { 512 rc = -ENOMEM; 513 goto fail1; 514 } 515 } 516 517 /* Allocate hardware ring */ 518 rc = efx_nic_probe_tx(tx_queue); 519 if (rc) 520 goto fail2; 521 522 return 0; 523 524 fail2: 525 kfree(tx_queue->tsoh_page); 526 tx_queue->tsoh_page = NULL; 527 fail1: 528 kfree(tx_queue->buffer); 529 tx_queue->buffer = NULL; 530 return rc; 531 } 532 533 void efx_init_tx_queue(struct efx_tx_queue *tx_queue) 534 { 535 netif_dbg(tx_queue->efx, drv, tx_queue->efx->net_dev, 536 "initialising TX queue %d\n", tx_queue->queue); 537 538 tx_queue->insert_count = 0; 539 tx_queue->write_count = 0; 540 tx_queue->old_write_count = 0; 541 tx_queue->read_count = 0; 542 tx_queue->old_read_count = 0; 543 tx_queue->empty_read_count = 0 | EFX_EMPTY_COUNT_VALID; 544 545 /* Set up TX descriptor ring */ 546 efx_nic_init_tx(tx_queue); 547 548 tx_queue->initialised = true; 549 } 550 551 void efx_fini_tx_queue(struct efx_tx_queue *tx_queue) 552 { 553 struct efx_tx_buffer *buffer; 554 555 netif_dbg(tx_queue->efx, drv, tx_queue->efx->net_dev, 556 "shutting down TX queue %d\n", tx_queue->queue); 557 558 if (!tx_queue->buffer) 559 return; 560 561 /* Free any buffers left in the ring */ 562 while (tx_queue->read_count != tx_queue->write_count) { 563 unsigned int pkts_compl = 0, bytes_compl = 0; 564 buffer = &tx_queue->buffer[tx_queue->read_count & tx_queue->ptr_mask]; 565 efx_dequeue_buffer(tx_queue, buffer, &pkts_compl, &bytes_compl); 566 567 ++tx_queue->read_count; 568 } 569 netdev_tx_reset_queue(tx_queue->core_txq); 570 } 571 572 void efx_remove_tx_queue(struct efx_tx_queue *tx_queue) 573 { 574 int i; 575 576 if (!tx_queue->buffer) 577 return; 578 579 netif_dbg(tx_queue->efx, drv, tx_queue->efx->net_dev, 580 "destroying TX queue %d\n", tx_queue->queue); 581 efx_nic_remove_tx(tx_queue); 582 583 if (tx_queue->tsoh_page) { 584 for (i = 0; i < efx_tsoh_page_count(tx_queue); i++) 585 efx_nic_free_buffer(tx_queue->efx, 586 &tx_queue->tsoh_page[i]); 587 kfree(tx_queue->tsoh_page); 588 tx_queue->tsoh_page = NULL; 589 } 590 591 kfree(tx_queue->buffer); 592 tx_queue->buffer = NULL; 593 } 594 595 596 /* Efx TCP segmentation acceleration. 597 * 598 * Why? Because by doing it here in the driver we can go significantly 599 * faster than the GSO. 600 * 601 * Requires TX checksum offload support. 602 */ 603 604 /* Number of bytes inserted at the start of a TSO header buffer, 605 * similar to NET_IP_ALIGN. 606 */ 607 #ifdef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS 608 #define TSOH_OFFSET 0 609 #else 610 #define TSOH_OFFSET NET_IP_ALIGN 611 #endif 612 613 #define PTR_DIFF(p1, p2) ((u8 *)(p1) - (u8 *)(p2)) 614 615 /** 616 * struct tso_state - TSO state for an SKB 617 * @out_len: Remaining length in current segment 618 * @seqnum: Current sequence number 619 * @ipv4_id: Current IPv4 ID, host endian 620 * @packet_space: Remaining space in current packet 621 * @dma_addr: DMA address of current position 622 * @in_len: Remaining length in current SKB fragment 623 * @unmap_len: Length of SKB fragment 624 * @unmap_addr: DMA address of SKB fragment 625 * @dma_flags: TX buffer flags for DMA mapping - %EFX_TX_BUF_MAP_SINGLE or 0 626 * @protocol: Network protocol (after any VLAN header) 627 * @ip_off: Offset of IP header 628 * @tcp_off: Offset of TCP header 629 * @header_len: Number of bytes of header 630 * @ip_base_len: IPv4 tot_len or IPv6 payload_len, before TCP payload 631 * 632 * The state used during segmentation. It is put into this data structure 633 * just to make it easy to pass into inline functions. 634 */ 635 struct tso_state { 636 /* Output position */ 637 unsigned out_len; 638 unsigned seqnum; 639 unsigned ipv4_id; 640 unsigned packet_space; 641 642 /* Input position */ 643 dma_addr_t dma_addr; 644 unsigned in_len; 645 unsigned unmap_len; 646 dma_addr_t unmap_addr; 647 unsigned short dma_flags; 648 649 __be16 protocol; 650 unsigned int ip_off; 651 unsigned int tcp_off; 652 unsigned header_len; 653 unsigned int ip_base_len; 654 }; 655 656 657 /* 658 * Verify that our various assumptions about sk_buffs and the conditions 659 * under which TSO will be attempted hold true. Return the protocol number. 660 */ 661 static __be16 efx_tso_check_protocol(struct sk_buff *skb) 662 { 663 __be16 protocol = skb->protocol; 664 665 EFX_BUG_ON_PARANOID(((struct ethhdr *)skb->data)->h_proto != 666 protocol); 667 if (protocol == htons(ETH_P_8021Q)) { 668 struct vlan_ethhdr *veh = (struct vlan_ethhdr *)skb->data; 669 protocol = veh->h_vlan_encapsulated_proto; 670 } 671 672 if (protocol == htons(ETH_P_IP)) { 673 EFX_BUG_ON_PARANOID(ip_hdr(skb)->protocol != IPPROTO_TCP); 674 } else { 675 EFX_BUG_ON_PARANOID(protocol != htons(ETH_P_IPV6)); 676 EFX_BUG_ON_PARANOID(ipv6_hdr(skb)->nexthdr != NEXTHDR_TCP); 677 } 678 EFX_BUG_ON_PARANOID((PTR_DIFF(tcp_hdr(skb), skb->data) 679 + (tcp_hdr(skb)->doff << 2u)) > 680 skb_headlen(skb)); 681 682 return protocol; 683 } 684 685 static u8 *efx_tsoh_get_buffer(struct efx_tx_queue *tx_queue, 686 struct efx_tx_buffer *buffer, unsigned int len) 687 { 688 u8 *result; 689 690 EFX_BUG_ON_PARANOID(buffer->len); 691 EFX_BUG_ON_PARANOID(buffer->flags); 692 EFX_BUG_ON_PARANOID(buffer->unmap_len); 693 694 if (likely(len <= TSOH_STD_SIZE - TSOH_OFFSET)) { 695 unsigned index = 696 (tx_queue->insert_count & tx_queue->ptr_mask) / 2; 697 struct efx_buffer *page_buf = 698 &tx_queue->tsoh_page[index / TSOH_PER_PAGE]; 699 unsigned offset = 700 TSOH_STD_SIZE * (index % TSOH_PER_PAGE) + TSOH_OFFSET; 701 702 if (unlikely(!page_buf->addr) && 703 efx_nic_alloc_buffer(tx_queue->efx, page_buf, PAGE_SIZE, 704 GFP_ATOMIC)) 705 return NULL; 706 707 result = (u8 *)page_buf->addr + offset; 708 buffer->dma_addr = page_buf->dma_addr + offset; 709 buffer->flags = EFX_TX_BUF_CONT; 710 } else { 711 tx_queue->tso_long_headers++; 712 713 buffer->heap_buf = kmalloc(TSOH_OFFSET + len, GFP_ATOMIC); 714 if (unlikely(!buffer->heap_buf)) 715 return NULL; 716 result = (u8 *)buffer->heap_buf + TSOH_OFFSET; 717 buffer->flags = EFX_TX_BUF_CONT | EFX_TX_BUF_HEAP; 718 } 719 720 buffer->len = len; 721 722 return result; 723 } 724 725 /** 726 * efx_tx_queue_insert - push descriptors onto the TX queue 727 * @tx_queue: Efx TX queue 728 * @dma_addr: DMA address of fragment 729 * @len: Length of fragment 730 * @final_buffer: The final buffer inserted into the queue 731 * 732 * Push descriptors onto the TX queue. 733 */ 734 static void efx_tx_queue_insert(struct efx_tx_queue *tx_queue, 735 dma_addr_t dma_addr, unsigned len, 736 struct efx_tx_buffer **final_buffer) 737 { 738 struct efx_tx_buffer *buffer; 739 struct efx_nic *efx = tx_queue->efx; 740 unsigned dma_len, insert_ptr; 741 742 EFX_BUG_ON_PARANOID(len <= 0); 743 744 while (1) { 745 insert_ptr = tx_queue->insert_count & tx_queue->ptr_mask; 746 buffer = &tx_queue->buffer[insert_ptr]; 747 ++tx_queue->insert_count; 748 749 EFX_BUG_ON_PARANOID(tx_queue->insert_count - 750 tx_queue->read_count >= 751 efx->txq_entries); 752 753 EFX_BUG_ON_PARANOID(buffer->len); 754 EFX_BUG_ON_PARANOID(buffer->unmap_len); 755 EFX_BUG_ON_PARANOID(buffer->flags); 756 757 buffer->dma_addr = dma_addr; 758 759 dma_len = efx_max_tx_len(efx, dma_addr); 760 761 /* If there is enough space to send then do so */ 762 if (dma_len >= len) 763 break; 764 765 buffer->len = dma_len; 766 buffer->flags = EFX_TX_BUF_CONT; 767 dma_addr += dma_len; 768 len -= dma_len; 769 } 770 771 EFX_BUG_ON_PARANOID(!len); 772 buffer->len = len; 773 *final_buffer = buffer; 774 } 775 776 777 /* 778 * Put a TSO header into the TX queue. 779 * 780 * This is special-cased because we know that it is small enough to fit in 781 * a single fragment, and we know it doesn't cross a page boundary. It 782 * also allows us to not worry about end-of-packet etc. 783 */ 784 static int efx_tso_put_header(struct efx_tx_queue *tx_queue, 785 struct efx_tx_buffer *buffer, u8 *header) 786 { 787 if (unlikely(buffer->flags & EFX_TX_BUF_HEAP)) { 788 buffer->dma_addr = dma_map_single(&tx_queue->efx->pci_dev->dev, 789 header, buffer->len, 790 DMA_TO_DEVICE); 791 if (unlikely(dma_mapping_error(&tx_queue->efx->pci_dev->dev, 792 buffer->dma_addr))) { 793 kfree(buffer->heap_buf); 794 buffer->len = 0; 795 buffer->flags = 0; 796 return -ENOMEM; 797 } 798 buffer->unmap_len = buffer->len; 799 buffer->flags |= EFX_TX_BUF_MAP_SINGLE; 800 } 801 802 ++tx_queue->insert_count; 803 return 0; 804 } 805 806 807 /* Remove buffers put into a tx_queue. None of the buffers must have 808 * an skb attached. 809 */ 810 static void efx_enqueue_unwind(struct efx_tx_queue *tx_queue) 811 { 812 struct efx_tx_buffer *buffer; 813 814 /* Work backwards until we hit the original insert pointer value */ 815 while (tx_queue->insert_count != tx_queue->write_count) { 816 --tx_queue->insert_count; 817 buffer = &tx_queue->buffer[tx_queue->insert_count & 818 tx_queue->ptr_mask]; 819 efx_dequeue_buffer(tx_queue, buffer, NULL, NULL); 820 } 821 } 822 823 824 /* Parse the SKB header and initialise state. */ 825 static void tso_start(struct tso_state *st, const struct sk_buff *skb) 826 { 827 st->ip_off = skb_network_header(skb) - skb->data; 828 st->tcp_off = skb_transport_header(skb) - skb->data; 829 st->header_len = st->tcp_off + (tcp_hdr(skb)->doff << 2u); 830 if (st->protocol == htons(ETH_P_IP)) { 831 st->ip_base_len = st->header_len - st->ip_off; 832 st->ipv4_id = ntohs(ip_hdr(skb)->id); 833 } else { 834 st->ip_base_len = st->header_len - st->tcp_off; 835 st->ipv4_id = 0; 836 } 837 st->seqnum = ntohl(tcp_hdr(skb)->seq); 838 839 EFX_BUG_ON_PARANOID(tcp_hdr(skb)->urg); 840 EFX_BUG_ON_PARANOID(tcp_hdr(skb)->syn); 841 EFX_BUG_ON_PARANOID(tcp_hdr(skb)->rst); 842 843 st->out_len = skb->len - st->header_len; 844 st->unmap_len = 0; 845 st->dma_flags = 0; 846 } 847 848 static int tso_get_fragment(struct tso_state *st, struct efx_nic *efx, 849 skb_frag_t *frag) 850 { 851 st->unmap_addr = skb_frag_dma_map(&efx->pci_dev->dev, frag, 0, 852 skb_frag_size(frag), DMA_TO_DEVICE); 853 if (likely(!dma_mapping_error(&efx->pci_dev->dev, st->unmap_addr))) { 854 st->dma_flags = 0; 855 st->unmap_len = skb_frag_size(frag); 856 st->in_len = skb_frag_size(frag); 857 st->dma_addr = st->unmap_addr; 858 return 0; 859 } 860 return -ENOMEM; 861 } 862 863 static int tso_get_head_fragment(struct tso_state *st, struct efx_nic *efx, 864 const struct sk_buff *skb) 865 { 866 int hl = st->header_len; 867 int len = skb_headlen(skb) - hl; 868 869 st->unmap_addr = dma_map_single(&efx->pci_dev->dev, skb->data + hl, 870 len, DMA_TO_DEVICE); 871 if (likely(!dma_mapping_error(&efx->pci_dev->dev, st->unmap_addr))) { 872 st->dma_flags = EFX_TX_BUF_MAP_SINGLE; 873 st->unmap_len = len; 874 st->in_len = len; 875 st->dma_addr = st->unmap_addr; 876 return 0; 877 } 878 return -ENOMEM; 879 } 880 881 882 /** 883 * tso_fill_packet_with_fragment - form descriptors for the current fragment 884 * @tx_queue: Efx TX queue 885 * @skb: Socket buffer 886 * @st: TSO state 887 * 888 * Form descriptors for the current fragment, until we reach the end 889 * of fragment or end-of-packet. 890 */ 891 static void tso_fill_packet_with_fragment(struct efx_tx_queue *tx_queue, 892 const struct sk_buff *skb, 893 struct tso_state *st) 894 { 895 struct efx_tx_buffer *buffer; 896 int n; 897 898 if (st->in_len == 0) 899 return; 900 if (st->packet_space == 0) 901 return; 902 903 EFX_BUG_ON_PARANOID(st->in_len <= 0); 904 EFX_BUG_ON_PARANOID(st->packet_space <= 0); 905 906 n = min(st->in_len, st->packet_space); 907 908 st->packet_space -= n; 909 st->out_len -= n; 910 st->in_len -= n; 911 912 efx_tx_queue_insert(tx_queue, st->dma_addr, n, &buffer); 913 914 if (st->out_len == 0) { 915 /* Transfer ownership of the skb */ 916 buffer->skb = skb; 917 buffer->flags = EFX_TX_BUF_SKB; 918 } else if (st->packet_space != 0) { 919 buffer->flags = EFX_TX_BUF_CONT; 920 } 921 922 if (st->in_len == 0) { 923 /* Transfer ownership of the DMA mapping */ 924 buffer->unmap_len = st->unmap_len; 925 buffer->flags |= st->dma_flags; 926 st->unmap_len = 0; 927 } 928 929 st->dma_addr += n; 930 } 931 932 933 /** 934 * tso_start_new_packet - generate a new header and prepare for the new packet 935 * @tx_queue: Efx TX queue 936 * @skb: Socket buffer 937 * @st: TSO state 938 * 939 * Generate a new header and prepare for the new packet. Return 0 on 940 * success, or -%ENOMEM if failed to alloc header. 941 */ 942 static int tso_start_new_packet(struct efx_tx_queue *tx_queue, 943 const struct sk_buff *skb, 944 struct tso_state *st) 945 { 946 struct efx_tx_buffer *buffer = 947 &tx_queue->buffer[tx_queue->insert_count & tx_queue->ptr_mask]; 948 struct tcphdr *tsoh_th; 949 unsigned ip_length; 950 u8 *header; 951 int rc; 952 953 /* Allocate and insert a DMA-mapped header buffer. */ 954 header = efx_tsoh_get_buffer(tx_queue, buffer, st->header_len); 955 if (!header) 956 return -ENOMEM; 957 958 tsoh_th = (struct tcphdr *)(header + st->tcp_off); 959 960 /* Copy and update the headers. */ 961 memcpy(header, skb->data, st->header_len); 962 963 tsoh_th->seq = htonl(st->seqnum); 964 st->seqnum += skb_shinfo(skb)->gso_size; 965 if (st->out_len > skb_shinfo(skb)->gso_size) { 966 /* This packet will not finish the TSO burst. */ 967 st->packet_space = skb_shinfo(skb)->gso_size; 968 tsoh_th->fin = 0; 969 tsoh_th->psh = 0; 970 } else { 971 /* This packet will be the last in the TSO burst. */ 972 st->packet_space = st->out_len; 973 tsoh_th->fin = tcp_hdr(skb)->fin; 974 tsoh_th->psh = tcp_hdr(skb)->psh; 975 } 976 ip_length = st->ip_base_len + st->packet_space; 977 978 if (st->protocol == htons(ETH_P_IP)) { 979 struct iphdr *tsoh_iph = (struct iphdr *)(header + st->ip_off); 980 981 tsoh_iph->tot_len = htons(ip_length); 982 983 /* Linux leaves suitable gaps in the IP ID space for us to fill. */ 984 tsoh_iph->id = htons(st->ipv4_id); 985 st->ipv4_id++; 986 } else { 987 struct ipv6hdr *tsoh_iph = 988 (struct ipv6hdr *)(header + st->ip_off); 989 990 tsoh_iph->payload_len = htons(ip_length); 991 } 992 993 rc = efx_tso_put_header(tx_queue, buffer, header); 994 if (unlikely(rc)) 995 return rc; 996 997 ++tx_queue->tso_packets; 998 999 return 0; 1000 } 1001 1002 1003 /** 1004 * efx_enqueue_skb_tso - segment and transmit a TSO socket buffer 1005 * @tx_queue: Efx TX queue 1006 * @skb: Socket buffer 1007 * 1008 * Context: You must hold netif_tx_lock() to call this function. 1009 * 1010 * Add socket buffer @skb to @tx_queue, doing TSO or return != 0 if 1011 * @skb was not enqueued. In all cases @skb is consumed. Return 1012 * %NETDEV_TX_OK. 1013 */ 1014 static int efx_enqueue_skb_tso(struct efx_tx_queue *tx_queue, 1015 struct sk_buff *skb) 1016 { 1017 struct efx_nic *efx = tx_queue->efx; 1018 int frag_i, rc; 1019 struct tso_state state; 1020 1021 /* Find the packet protocol and sanity-check it */ 1022 state.protocol = efx_tso_check_protocol(skb); 1023 1024 EFX_BUG_ON_PARANOID(tx_queue->write_count != tx_queue->insert_count); 1025 1026 tso_start(&state, skb); 1027 1028 /* Assume that skb header area contains exactly the headers, and 1029 * all payload is in the frag list. 1030 */ 1031 if (skb_headlen(skb) == state.header_len) { 1032 /* Grab the first payload fragment. */ 1033 EFX_BUG_ON_PARANOID(skb_shinfo(skb)->nr_frags < 1); 1034 frag_i = 0; 1035 rc = tso_get_fragment(&state, efx, 1036 skb_shinfo(skb)->frags + frag_i); 1037 if (rc) 1038 goto mem_err; 1039 } else { 1040 rc = tso_get_head_fragment(&state, efx, skb); 1041 if (rc) 1042 goto mem_err; 1043 frag_i = -1; 1044 } 1045 1046 if (tso_start_new_packet(tx_queue, skb, &state) < 0) 1047 goto mem_err; 1048 1049 while (1) { 1050 tso_fill_packet_with_fragment(tx_queue, skb, &state); 1051 1052 /* Move onto the next fragment? */ 1053 if (state.in_len == 0) { 1054 if (++frag_i >= skb_shinfo(skb)->nr_frags) 1055 /* End of payload reached. */ 1056 break; 1057 rc = tso_get_fragment(&state, efx, 1058 skb_shinfo(skb)->frags + frag_i); 1059 if (rc) 1060 goto mem_err; 1061 } 1062 1063 /* Start at new packet? */ 1064 if (state.packet_space == 0 && 1065 tso_start_new_packet(tx_queue, skb, &state) < 0) 1066 goto mem_err; 1067 } 1068 1069 netdev_tx_sent_queue(tx_queue->core_txq, skb->len); 1070 1071 /* Pass off to hardware */ 1072 efx_nic_push_buffers(tx_queue); 1073 1074 efx_tx_maybe_stop_queue(tx_queue); 1075 1076 tx_queue->tso_bursts++; 1077 return NETDEV_TX_OK; 1078 1079 mem_err: 1080 netif_err(efx, tx_err, efx->net_dev, 1081 "Out of memory for TSO headers, or DMA mapping error\n"); 1082 dev_kfree_skb_any(skb); 1083 1084 /* Free the DMA mapping we were in the process of writing out */ 1085 if (state.unmap_len) { 1086 if (state.dma_flags & EFX_TX_BUF_MAP_SINGLE) 1087 dma_unmap_single(&efx->pci_dev->dev, state.unmap_addr, 1088 state.unmap_len, DMA_TO_DEVICE); 1089 else 1090 dma_unmap_page(&efx->pci_dev->dev, state.unmap_addr, 1091 state.unmap_len, DMA_TO_DEVICE); 1092 } 1093 1094 efx_enqueue_unwind(tx_queue); 1095 return NETDEV_TX_OK; 1096 } 1097