// SPDX-License-Identifier: (GPL-2.0 OR MIT)
/* Google virtual Ethernet (gve) driver
 *
 * Copyright (C) 2015-2021 Google, Inc.
 */

#include "gve.h"
#include "gve_adminq.h"
#include "gve_utils.h"
#include "gve_dqo.h"
#include <linux/tcp.h>
#include <linux/slab.h>
#include <linux/skbuff.h>

/* Returns true if a gve_tx_pending_packet_dqo object is available. */
static bool gve_has_pending_packet(struct gve_tx_ring *tx)
{
	/* Check TX path's list. */
	if (tx->dqo_tx.free_pending_packets != -1)
		return true;

	/* Check completion handler's list. */
	if (atomic_read_acquire(&tx->dqo_compl.free_pending_packets) != -1)
		return true;

	return false;
}

static struct gve_tx_pending_packet_dqo *
gve_alloc_pending_packet(struct gve_tx_ring *tx)
{
	struct gve_tx_pending_packet_dqo *pending_packet;
	s16 index;

	index = tx->dqo_tx.free_pending_packets;

	/* No pending_packets available, try to steal the list from the
	 * completion handler.
	 */
	if (unlikely(index == -1)) {
		tx->dqo_tx.free_pending_packets =
			atomic_xchg(&tx->dqo_compl.free_pending_packets, -1);
		index = tx->dqo_tx.free_pending_packets;

		if (unlikely(index == -1))
			return NULL;
	}

	pending_packet = &tx->dqo.pending_packets[index];

	/* Remove pending_packet from free list */
	tx->dqo_tx.free_pending_packets = pending_packet->next;
	pending_packet->state = GVE_PACKET_STATE_PENDING_DATA_COMPL;

	return pending_packet;
}

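/* Returns a pending packet to the completion handler's free list.
 *
 * The list head is updated with cmpxchg so that a concurrent push, or the
 * TX path stealing the whole list with atomic_xchg() in
 * gve_alloc_pending_packet(), cannot be lost.
 */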
static void
gve_free_pending_packet(struct gve_tx_ring *tx,
			struct gve_tx_pending_packet_dqo *pending_packet)
{
	s16 index = pending_packet - tx->dqo.pending_packets;

	pending_packet->state = GVE_PACKET_STATE_UNALLOCATED;
	while (true) {
		s16 old_head = atomic_read_acquire(&tx->dqo_compl.free_pending_packets);

		pending_packet->next = old_head;
		if (atomic_cmpxchg(&tx->dqo_compl.free_pending_packets,
				   old_head, index) == old_head) {
			break;
		}
	}
}

/* gve_tx_clean_pending_packets - Cleans up all pending TX requests and
 * buffers.
 */
static void gve_tx_clean_pending_packets(struct gve_tx_ring *tx)
{
	int i;

	for (i = 0; i < tx->dqo.num_pending_packets; i++) {
		struct gve_tx_pending_packet_dqo *cur_state =
			&tx->dqo.pending_packets[i];
		int j;

		for (j = 0; j < cur_state->num_bufs; j++) {
			struct gve_tx_dma_buf *buf = &cur_state->bufs[j];

			if (j == 0) {
				dma_unmap_single(tx->dev,
						 dma_unmap_addr(buf, dma),
						 dma_unmap_len(buf, len),
						 DMA_TO_DEVICE);
			} else {
				dma_unmap_page(tx->dev,
					       dma_unmap_addr(buf, dma),
					       dma_unmap_len(buf, len),
					       DMA_TO_DEVICE);
			}
		}
		if (cur_state->skb) {
			dev_consume_skb_any(cur_state->skb);
			cur_state->skb = NULL;
		}
	}
}

static void gve_tx_free_ring_dqo(struct gve_priv *priv, int idx)
{
	struct gve_tx_ring *tx = &priv->tx[idx];
	struct device *hdev = &priv->pdev->dev;
	size_t bytes;

	gve_tx_remove_from_block(priv, idx);

	if (tx->q_resources) {
		dma_free_coherent(hdev, sizeof(*tx->q_resources),
				  tx->q_resources, tx->q_resources_bus);
		tx->q_resources = NULL;
	}

	if (tx->dqo.compl_ring) {
		bytes = sizeof(tx->dqo.compl_ring[0]) *
			(tx->dqo.complq_mask + 1);
		dma_free_coherent(hdev, bytes, tx->dqo.compl_ring,
				  tx->complq_bus_dqo);
		tx->dqo.compl_ring = NULL;
	}

	if (tx->dqo.tx_ring) {
		bytes = sizeof(tx->dqo.tx_ring[0]) * (tx->mask + 1);
		dma_free_coherent(hdev, bytes, tx->dqo.tx_ring, tx->bus);
		tx->dqo.tx_ring = NULL;
	}

	kvfree(tx->dqo.pending_packets);
	tx->dqo.pending_packets = NULL;

	netif_dbg(priv, drv, priv->dev, "freed tx queue %d\n", idx);
}

static int gve_tx_alloc_ring_dqo(struct gve_priv *priv, int idx)
{
	struct gve_tx_ring *tx = &priv->tx[idx];
	struct device *hdev = &priv->pdev->dev;
	int num_pending_packets;
	size_t bytes;
	int i;

	memset(tx, 0, sizeof(*tx));
	tx->q_num = idx;
	tx->dev = &priv->pdev->dev;
	tx->netdev_txq = netdev_get_tx_queue(priv->dev, idx);
	atomic_set_release(&tx->dqo_compl.hw_tx_head, 0);

	/* Queue sizes must be a power of 2 */
	tx->mask = priv->tx_desc_cnt - 1;
	tx->dqo.complq_mask = priv->options_dqo_rda.tx_comp_ring_entries - 1;

	/* The max number of pending packets determines the maximum number of
	 * descriptors which may be written to the completion queue.
	 *
	 * We must set the number small enough to make sure we never overrun the
	 * completion queue.
	 */
	num_pending_packets = tx->dqo.complq_mask + 1;

	/* Reserve space for descriptor completions, which will be reported at
	 * most every GVE_TX_MIN_RE_INTERVAL packets.
	 */
	num_pending_packets -=
		(tx->dqo.complq_mask + 1) / GVE_TX_MIN_RE_INTERVAL;

	/* Each packet may have at most 2 buffer completions if it receives both
	 * a miss and a reinjection completion.
	 */
	num_pending_packets /= 2;

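	/* Worked example (illustrative numbers only): with a 1024-entry
	 * completion queue and an assumed GVE_TX_MIN_RE_INTERVAL of 32, this
	 * gives 1024 - 1024 / 32 = 992, then 992 / 2 = 496 pending packets.
	 */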
	tx->dqo.num_pending_packets = min_t(int, num_pending_packets, S16_MAX);
	tx->dqo.pending_packets = kvcalloc(tx->dqo.num_pending_packets,
					   sizeof(tx->dqo.pending_packets[0]),
					   GFP_KERNEL);
	if (!tx->dqo.pending_packets)
		goto err;

	/* Set up linked list of pending packets */
	for (i = 0; i < tx->dqo.num_pending_packets - 1; i++)
		tx->dqo.pending_packets[i].next = i + 1;

	tx->dqo.pending_packets[tx->dqo.num_pending_packets - 1].next = -1;
	atomic_set_release(&tx->dqo_compl.free_pending_packets, -1);
	tx->dqo_compl.miss_completions.head = -1;
	tx->dqo_compl.miss_completions.tail = -1;
	tx->dqo_compl.timed_out_completions.head = -1;
	tx->dqo_compl.timed_out_completions.tail = -1;

	bytes = sizeof(tx->dqo.tx_ring[0]) * (tx->mask + 1);
	tx->dqo.tx_ring = dma_alloc_coherent(hdev, bytes, &tx->bus, GFP_KERNEL);
	if (!tx->dqo.tx_ring)
		goto err;

	bytes = sizeof(tx->dqo.compl_ring[0]) * (tx->dqo.complq_mask + 1);
	tx->dqo.compl_ring = dma_alloc_coherent(hdev, bytes,
						&tx->complq_bus_dqo,
						GFP_KERNEL);
	if (!tx->dqo.compl_ring)
		goto err;

	tx->q_resources = dma_alloc_coherent(hdev, sizeof(*tx->q_resources),
					     &tx->q_resources_bus, GFP_KERNEL);
	if (!tx->q_resources)
		goto err;

	gve_tx_add_to_block(priv, idx);

	return 0;

err:
	gve_tx_free_ring_dqo(priv, idx);
	return -ENOMEM;
}

int gve_tx_alloc_rings_dqo(struct gve_priv *priv)
{
	int err = 0;
	int i;

	for (i = 0; i < priv->tx_cfg.num_queues; i++) {
		err = gve_tx_alloc_ring_dqo(priv, i);
		if (err) {
			netif_err(priv, drv, priv->dev,
				  "Failed to alloc tx ring=%d: err=%d\n",
				  i, err);
			goto err;
		}
	}

	return 0;

err:
	for (i--; i >= 0; i--)
		gve_tx_free_ring_dqo(priv, i);

	return err;
}

void gve_tx_free_rings_dqo(struct gve_priv *priv)
{
	int i;

	for (i = 0; i < priv->tx_cfg.num_queues; i++) {
		struct gve_tx_ring *tx = &priv->tx[i];

		gve_clean_tx_done_dqo(priv, tx, /*napi=*/NULL);
		netdev_tx_reset_queue(tx->netdev_txq);
		gve_tx_clean_pending_packets(tx);

		gve_tx_free_ring_dqo(priv, i);
	}
}

/* Returns the number of slots available in the ring */
static u32 num_avail_tx_slots(const struct gve_tx_ring *tx)
{
	u32 num_used = (tx->dqo_tx.tail - tx->dqo_tx.head) & tx->mask;

	return tx->mask - num_used;
}

/* Stops the queue if the number of available descriptors is less than 'count'.
 * Return: 0 if stop is not required.
 */
static int gve_maybe_stop_tx_dqo(struct gve_tx_ring *tx, int count)
{
	if (likely(gve_has_pending_packet(tx) &&
		   num_avail_tx_slots(tx) >= count))
		return 0;

	/* Update cached TX head pointer */
	tx->dqo_tx.head = atomic_read_acquire(&tx->dqo_compl.hw_tx_head);

	if (likely(gve_has_pending_packet(tx) &&
		   num_avail_tx_slots(tx) >= count))
		return 0;

	/* No space, so stop the queue */
	tx->stop_queue++;
	netif_tx_stop_queue(tx->netdev_txq);

	/* Sync with restarting queue in `gve_tx_poll_dqo()` */
	mb();

	/* After stopping queue, check if we can transmit again in order to
	 * avoid a TOCTOU bug.
	 */
	tx->dqo_tx.head = atomic_read_acquire(&tx->dqo_compl.hw_tx_head);

	if (likely(!gve_has_pending_packet(tx) ||
		   num_avail_tx_slots(tx) < count))
		return -EBUSY;

	netif_tx_start_queue(tx->netdev_txq);
	tx->wake_queue++;
	return 0;
}

static void gve_extract_tx_metadata_dqo(const struct sk_buff *skb,
					struct gve_tx_metadata_dqo *metadata)
{
	memset(metadata, 0, sizeof(*metadata));
	metadata->version = GVE_TX_METADATA_VERSION_DQO;

	if (skb->l4_hash) {
		u16 path_hash = skb->hash ^ (skb->hash >> 16);

		path_hash &= (1 << 15) - 1;
		if (unlikely(path_hash == 0))
			path_hash = ~path_hash;

		metadata->path_hash = path_hash;
	}
}

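/* Writes the packet descriptors for one mapped buffer. Buffers larger than
 * GVE_TX_MAX_BUF_SIZE_DQO are split across multiple descriptors; only the
 * descriptor covering the last bytes of the packet's final buffer has
 * end_of_packet set.
 */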
static void gve_tx_fill_pkt_desc_dqo(struct gve_tx_ring *tx, u32 *desc_idx,
				     struct sk_buff *skb, u32 len, u64 addr,
				     s16 compl_tag, bool eop, bool is_gso)
{
	const bool checksum_offload_en = skb->ip_summed == CHECKSUM_PARTIAL;

	while (len > 0) {
		struct gve_tx_pkt_desc_dqo *desc =
			&tx->dqo.tx_ring[*desc_idx].pkt;
		u32 cur_len = min_t(u32, len, GVE_TX_MAX_BUF_SIZE_DQO);
		bool cur_eop = eop && cur_len == len;

		*desc = (struct gve_tx_pkt_desc_dqo){
			.buf_addr = cpu_to_le64(addr),
			.dtype = GVE_TX_PKT_DESC_DTYPE_DQO,
			.end_of_packet = cur_eop,
			.checksum_offload_enable = checksum_offload_en,
			.compl_tag = cpu_to_le16(compl_tag),
			.buf_size = cur_len,
		};

		addr += cur_len;
		len -= cur_len;
		*desc_idx = (*desc_idx + 1) & tx->mask;
	}
}

/* Validates and prepares `skb` for TSO.
 *
 * Returns header length, or < 0 if invalid.
 */
static int gve_prep_tso(struct sk_buff *skb)
{
	struct tcphdr *tcp;
	int header_len;
	u32 paylen;
	int err;

	/* Note: HW requires MSS (gso_size) to be <= 9728 and the total length
	 * of the TSO to be <= 262143.
	 *
	 * However, we don't validate these because:
	 * - Hypervisor enforces a limit of 9K MTU
	 * - Kernel will not produce a TSO larger than 64k
	 */

	if (unlikely(skb_shinfo(skb)->gso_size < GVE_TX_MIN_TSO_MSS_DQO))
		return -1;

	/* Needed because we will modify header. */
	err = skb_cow_head(skb, 0);
	if (err < 0)
		return err;

	tcp = tcp_hdr(skb);

	/* Remove payload length from checksum. */
	paylen = skb->len - skb_transport_offset(skb);

	switch (skb_shinfo(skb)->gso_type) {
	case SKB_GSO_TCPV4:
	case SKB_GSO_TCPV6:
		csum_replace_by_diff(&tcp->check,
				     (__force __wsum)htonl(paylen));

		/* Compute length of segmentation header. */
		header_len = skb_transport_offset(skb) + tcp_hdrlen(skb);
		break;
	default:
		return -EINVAL;
	}

	if (unlikely(header_len > GVE_TX_MAX_HDR_SIZE_DQO))
		return -EINVAL;

	return header_len;
}

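/* Fills the TSO context descriptor: header length and MSS used by the HW for
 * segmentation, the total payload length, and the flex metadata bytes.
 */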
static void gve_tx_fill_tso_ctx_desc(struct gve_tx_tso_context_desc_dqo *desc,
				     const struct sk_buff *skb,
				     const struct gve_tx_metadata_dqo *metadata,
				     int header_len)
{
	*desc = (struct gve_tx_tso_context_desc_dqo){
		.header_len = header_len,
		.cmd_dtype = {
			.dtype = GVE_TX_TSO_CTX_DESC_DTYPE_DQO,
			.tso = 1,
		},
		.flex0 = metadata->bytes[0],
		.flex5 = metadata->bytes[5],
		.flex6 = metadata->bytes[6],
		.flex7 = metadata->bytes[7],
		.flex8 = metadata->bytes[8],
		.flex9 = metadata->bytes[9],
		.flex10 = metadata->bytes[10],
		.flex11 = metadata->bytes[11],
	};
	desc->tso_total_len = skb->len - header_len;
	desc->mss = skb_shinfo(skb)->gso_size;
}

static void
gve_tx_fill_general_ctx_desc(struct gve_tx_general_context_desc_dqo *desc,
			     const struct gve_tx_metadata_dqo *metadata)
{
	*desc = (struct gve_tx_general_context_desc_dqo){
		.flex0 = metadata->bytes[0],
		.flex1 = metadata->bytes[1],
		.flex2 = metadata->bytes[2],
		.flex3 = metadata->bytes[3],
		.flex4 = metadata->bytes[4],
		.flex5 = metadata->bytes[5],
		.flex6 = metadata->bytes[6],
		.flex7 = metadata->bytes[7],
		.flex8 = metadata->bytes[8],
		.flex9 = metadata->bytes[9],
		.flex10 = metadata->bytes[10],
		.flex11 = metadata->bytes[11],
		.cmd_dtype = {.dtype = GVE_TX_GENERAL_CTX_DESC_DTYPE_DQO},
	};
}

/* Returns 0 on success, or < 0 on error.
 *
 * Before this function is called, the caller must ensure
 * gve_has_pending_packet(tx) returns true.
 */
static int gve_tx_add_skb_no_copy_dqo(struct gve_tx_ring *tx,
				      struct sk_buff *skb)
{
	const struct skb_shared_info *shinfo = skb_shinfo(skb);
	const bool is_gso = skb_is_gso(skb);
	u32 desc_idx = tx->dqo_tx.tail;

	struct gve_tx_pending_packet_dqo *pending_packet;
	struct gve_tx_metadata_dqo metadata;
	s16 completion_tag;
	int i;

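	/* The pending packet's index in tx->dqo.pending_packets doubles as the
	 * completion tag that the HW echoes back, so the completion path can
	 * look this state up directly.
	 */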
	pending_packet = gve_alloc_pending_packet(tx);
	pending_packet->skb = skb;
	pending_packet->num_bufs = 0;
	completion_tag = pending_packet - tx->dqo.pending_packets;

	gve_extract_tx_metadata_dqo(skb, &metadata);
	if (is_gso) {
		int header_len = gve_prep_tso(skb);

		if (unlikely(header_len < 0))
			goto err;

		gve_tx_fill_tso_ctx_desc(&tx->dqo.tx_ring[desc_idx].tso_ctx,
					 skb, &metadata, header_len);
		desc_idx = (desc_idx + 1) & tx->mask;
	}

	gve_tx_fill_general_ctx_desc(&tx->dqo.tx_ring[desc_idx].general_ctx,
				     &metadata);
	desc_idx = (desc_idx + 1) & tx->mask;

	/* Note: HW requires that the size of a non-TSO packet be within the
	 * range of [17, 9728].
	 *
	 * We don't double check because
	 * - We limited `netdev->min_mtu` to ETH_MIN_MTU.
	 * - Hypervisor won't allow MTU larger than 9216.
	 */

	/* Map the linear portion of skb */
	{
		struct gve_tx_dma_buf *buf =
			&pending_packet->bufs[pending_packet->num_bufs];
		u32 len = skb_headlen(skb);
		dma_addr_t addr;

		addr = dma_map_single(tx->dev, skb->data, len, DMA_TO_DEVICE);
		if (unlikely(dma_mapping_error(tx->dev, addr)))
			goto err;

		dma_unmap_len_set(buf, len, len);
		dma_unmap_addr_set(buf, dma, addr);
		++pending_packet->num_bufs;

		gve_tx_fill_pkt_desc_dqo(tx, &desc_idx, skb, len, addr,
					 completion_tag,
					 /*eop=*/shinfo->nr_frags == 0, is_gso);
	}

	for (i = 0; i < shinfo->nr_frags; i++) {
		struct gve_tx_dma_buf *buf =
			&pending_packet->bufs[pending_packet->num_bufs];
		const skb_frag_t *frag = &shinfo->frags[i];
		bool is_eop = i == (shinfo->nr_frags - 1);
		u32 len = skb_frag_size(frag);
		dma_addr_t addr;

		addr = skb_frag_dma_map(tx->dev, frag, 0, len, DMA_TO_DEVICE);
		if (unlikely(dma_mapping_error(tx->dev, addr)))
			goto err;

		dma_unmap_len_set(buf, len, len);
		dma_unmap_addr_set(buf, dma, addr);
		++pending_packet->num_bufs;

		gve_tx_fill_pkt_desc_dqo(tx, &desc_idx, skb, len, addr,
					 completion_tag, is_eop, is_gso);
	}

	/* Commit the changes to our state */
	tx->dqo_tx.tail = desc_idx;

	/* Request a descriptor completion on the last descriptor of the
	 * packet if we are allowed to by the HW enforced interval.
	 */
	{
		u32 last_desc_idx = (desc_idx - 1) & tx->mask;
		u32 last_report_event_interval =
			(last_desc_idx - tx->dqo_tx.last_re_idx) & tx->mask;

		if (unlikely(last_report_event_interval >=
			     GVE_TX_MIN_RE_INTERVAL)) {
			tx->dqo.tx_ring[last_desc_idx].pkt.report_event = true;
			tx->dqo_tx.last_re_idx = last_desc_idx;
		}
	}

	return 0;

err:
	for (i = 0; i < pending_packet->num_bufs; i++) {
		struct gve_tx_dma_buf *buf = &pending_packet->bufs[i];

		if (i == 0) {
			dma_unmap_single(tx->dev, dma_unmap_addr(buf, dma),
					 dma_unmap_len(buf, len),
					 DMA_TO_DEVICE);
		} else {
			dma_unmap_page(tx->dev, dma_unmap_addr(buf, dma),
				       dma_unmap_len(buf, len), DMA_TO_DEVICE);
		}
	}

	pending_packet->skb = NULL;
	pending_packet->num_bufs = 0;
	gve_free_pending_packet(tx, pending_packet);

	return -1;
}

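/* Number of data descriptors needed to send `size` bytes, given that each
 * descriptor may cover at most GVE_TX_MAX_BUF_SIZE_DQO bytes.
 */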
static int gve_num_descs_per_buf(size_t size)
{
	return DIV_ROUND_UP(size, GVE_TX_MAX_BUF_SIZE_DQO);
}

static int gve_num_buffer_descs_needed(const struct sk_buff *skb)
{
	const struct skb_shared_info *shinfo = skb_shinfo(skb);
	int num_descs;
	int i;

	num_descs = gve_num_descs_per_buf(skb_headlen(skb));

	for (i = 0; i < shinfo->nr_frags; i++) {
		unsigned int frag_size = skb_frag_size(&shinfo->frags[i]);

		num_descs += gve_num_descs_per_buf(frag_size);
	}

	return num_descs;
}

/* Returns true if HW is capable of sending TSO represented by `skb`.
 *
 * Each segment must not span more than GVE_TX_MAX_DATA_DESCS buffers.
 * - The header is counted as one buffer for every single segment.
 * - A buffer which is split between two segments is counted for both.
 * - If a buffer contains both header and payload, it is counted as two
 *   buffers.
 */
static bool gve_can_send_tso(const struct sk_buff *skb)
{
	const int header_len = skb_checksum_start_offset(skb) + tcp_hdrlen(skb);
	const int max_bufs_per_seg = GVE_TX_MAX_DATA_DESCS - 1;
	const struct skb_shared_info *shinfo = skb_shinfo(skb);
	const int gso_size = shinfo->gso_size;
	int cur_seg_num_bufs;
	int cur_seg_size;
	int i;

	cur_seg_size = skb_headlen(skb) - header_len;
	cur_seg_num_bufs = cur_seg_size > 0;

	for (i = 0; i < shinfo->nr_frags; i++) {
		if (cur_seg_size >= gso_size) {
			cur_seg_size %= gso_size;
			cur_seg_num_bufs = cur_seg_size > 0;
		}

		if (unlikely(++cur_seg_num_bufs > max_bufs_per_seg))
			return false;

		cur_seg_size += skb_frag_size(&shinfo->frags[i]);
	}

	return true;
}

/* Attempt to transmit specified SKB.
 *
 * Returns 0 if the SKB was transmitted or dropped.
 * Returns -1 if there is not currently enough space to transmit the SKB.
 */
static int gve_try_tx_skb(struct gve_priv *priv, struct gve_tx_ring *tx,
			  struct sk_buff *skb)
{
	int num_buffer_descs;
	int total_num_descs;

	if (skb_is_gso(skb)) {
		/* If TSO doesn't meet HW requirements, attempt to linearize the
		 * packet.
		 */
		if (unlikely(!gve_can_send_tso(skb) &&
			     skb_linearize(skb) < 0)) {
			net_err_ratelimited("%s: Failed to transmit TSO packet\n",
					    priv->dev->name);
			goto drop;
		}

		num_buffer_descs = gve_num_buffer_descs_needed(skb);
	} else {
		num_buffer_descs = gve_num_buffer_descs_needed(skb);

		if (unlikely(num_buffer_descs > GVE_TX_MAX_DATA_DESCS)) {
			if (unlikely(skb_linearize(skb) < 0))
				goto drop;

			num_buffer_descs = 1;
		}
	}

	/* Metadata + (optional TSO) + data descriptors. */
	total_num_descs = 1 + skb_is_gso(skb) + num_buffer_descs;
	if (unlikely(gve_maybe_stop_tx_dqo(tx, total_num_descs +
			GVE_TX_MIN_DESC_PREVENT_CACHE_OVERLAP))) {
		return -1;
	}

	if (unlikely(gve_tx_add_skb_no_copy_dqo(tx, skb) < 0))
		goto drop;

	netdev_tx_sent_queue(tx->netdev_txq, skb->len);
	skb_tx_timestamp(skb);
	return 0;

drop:
	tx->dropped_pkt++;
	dev_kfree_skb_any(skb);
	return 0;
}

/* Transmit a given skb and ring the doorbell. */
netdev_tx_t gve_tx_dqo(struct sk_buff *skb, struct net_device *dev)
{
	struct gve_priv *priv = netdev_priv(dev);
	struct gve_tx_ring *tx;

	tx = &priv->tx[skb_get_queue_mapping(skb)];
	if (unlikely(gve_try_tx_skb(priv, tx, skb) < 0)) {
		/* We need to ring the txq doorbell -- we have stopped the Tx
		 * queue for want of resources, but prior calls to gve_tx()
		 * may have added descriptors without ringing the doorbell.
		 */
		gve_tx_put_doorbell_dqo(priv, tx->q_resources, tx->dqo_tx.tail);
		return NETDEV_TX_BUSY;
	}

	if (!netif_xmit_stopped(tx->netdev_txq) && netdev_xmit_more())
		return NETDEV_TX_OK;

	gve_tx_put_doorbell_dqo(priv, tx->q_resources, tx->dqo_tx.tail);
	return NETDEV_TX_OK;
}

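/* The miss and timed-out completion lists are doubly linked through the
 * pending_packets array itself: `prev` and `next` hold array indices and -1
 * terminates the list, so no separate list nodes are allocated.
 */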
static void add_to_list(struct gve_tx_ring *tx, struct gve_index_list *list,
			struct gve_tx_pending_packet_dqo *pending_packet)
{
	s16 old_tail, index;

	index = pending_packet - tx->dqo.pending_packets;
	old_tail = list->tail;
	list->tail = index;
	if (old_tail == -1)
		list->head = index;
	else
		tx->dqo.pending_packets[old_tail].next = index;

	pending_packet->next = -1;
	pending_packet->prev = old_tail;
}

static void remove_from_list(struct gve_tx_ring *tx,
			     struct gve_index_list *list,
			     struct gve_tx_pending_packet_dqo *pending_packet)
{
	s16 prev_index, next_index;

	prev_index = pending_packet->prev;
	next_index = pending_packet->next;

	if (prev_index == -1) {
		/* Node is head */
		list->head = next_index;
	} else {
		tx->dqo.pending_packets[prev_index].next = next_index;
	}
	if (next_index == -1) {
		/* Node is tail */
		list->tail = prev_index;
	} else {
		tx->dqo.pending_packets[next_index].prev = prev_index;
	}
}

static void gve_unmap_packet(struct device *dev,
			     struct gve_tx_pending_packet_dqo *pending_packet)
{
	struct gve_tx_dma_buf *buf;
	int i;

	/* SKB linear portion is guaranteed to be mapped */
	buf = &pending_packet->bufs[0];
	dma_unmap_single(dev, dma_unmap_addr(buf, dma),
			 dma_unmap_len(buf, len), DMA_TO_DEVICE);
	for (i = 1; i < pending_packet->num_bufs; i++) {
		buf = &pending_packet->bufs[i];
		dma_unmap_page(dev, dma_unmap_addr(buf, dma),
			       dma_unmap_len(buf, len), DMA_TO_DEVICE);
	}
	pending_packet->num_bufs = 0;
}

/* Completion types and expected behavior:
 * No Miss compl + Packet compl = Packet completed normally.
 * Miss compl + Re-inject compl = Packet completed normally.
 * No Miss compl + Re-inject compl = Skipped i.e. packet not completed.
 * Miss compl + Packet compl = Skipped i.e. packet not completed.
 */
static void gve_handle_packet_completion(struct gve_priv *priv,
					 struct gve_tx_ring *tx, bool is_napi,
					 u16 compl_tag, u64 *bytes, u64 *pkts,
					 bool is_reinjection)
{
	struct gve_tx_pending_packet_dqo *pending_packet;

	if (unlikely(compl_tag >= tx->dqo.num_pending_packets)) {
		net_err_ratelimited("%s: Invalid TX completion tag: %d\n",
				    priv->dev->name, (int)compl_tag);
		return;
	}

	pending_packet = &tx->dqo.pending_packets[compl_tag];

	if (unlikely(is_reinjection)) {
		if (unlikely(pending_packet->state ==
			     GVE_PACKET_STATE_TIMED_OUT_COMPL)) {
			net_err_ratelimited("%s: Re-injection completion: %d received after timeout.\n",
					    priv->dev->name, (int)compl_tag);
			/* Packet was already completed as a result of timeout,
			 * so just remove from list and free pending packet.
			 */
			remove_from_list(tx,
					 &tx->dqo_compl.timed_out_completions,
					 pending_packet);
			gve_free_pending_packet(tx, pending_packet);
			return;
		}
		if (unlikely(pending_packet->state !=
			     GVE_PACKET_STATE_PENDING_REINJECT_COMPL)) {
			/* No outstanding miss completion but packet allocated
			 * implies packet receives a re-injection completion
			 * without a prior miss completion. Return without
			 * completing the packet.
			 */
			net_err_ratelimited("%s: Re-injection completion received without corresponding miss completion: %d\n",
					    priv->dev->name, (int)compl_tag);
			return;
		}
		remove_from_list(tx, &tx->dqo_compl.miss_completions,
				 pending_packet);
	} else {
		/* Packet is allocated but not a pending data completion. */
		if (unlikely(pending_packet->state !=
			     GVE_PACKET_STATE_PENDING_DATA_COMPL)) {
			net_err_ratelimited("%s: No pending data completion: %d\n",
					    priv->dev->name, (int)compl_tag);
			return;
		}
	}
	gve_unmap_packet(tx->dev, pending_packet);

	*bytes += pending_packet->skb->len;
	(*pkts)++;
	napi_consume_skb(pending_packet->skb, is_napi);
	pending_packet->skb = NULL;
	gve_free_pending_packet(tx, pending_packet);
}

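/* Handles a miss completion: the packet is not freed here. Its state moves to
 * PENDING_REINJECT_COMPL and it is queued on the miss_completions list with a
 * timeout, to be completed either by a later reinjection completion or by
 * remove_miss_completions() when the timeout expires.
 */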
static void gve_handle_miss_completion(struct gve_priv *priv,
				       struct gve_tx_ring *tx, u16 compl_tag,
				       u64 *bytes, u64 *pkts)
{
	struct gve_tx_pending_packet_dqo *pending_packet;

	if (unlikely(compl_tag >= tx->dqo.num_pending_packets)) {
		net_err_ratelimited("%s: Invalid TX completion tag: %d\n",
				    priv->dev->name, (int)compl_tag);
		return;
	}

	pending_packet = &tx->dqo.pending_packets[compl_tag];
	if (unlikely(pending_packet->state !=
		     GVE_PACKET_STATE_PENDING_DATA_COMPL)) {
		net_err_ratelimited("%s: Unexpected packet state: %d for completion tag : %d\n",
				    priv->dev->name, (int)pending_packet->state,
				    (int)compl_tag);
		return;
	}

	pending_packet->state = GVE_PACKET_STATE_PENDING_REINJECT_COMPL;
	/* jiffies can wrap around but time comparisons can handle overflows. */
	pending_packet->timeout_jiffies =
			jiffies +
			msecs_to_jiffies(GVE_REINJECT_COMPL_TIMEOUT *
					 MSEC_PER_SEC);
	add_to_list(tx, &tx->dqo_compl.miss_completions, pending_packet);

	*bytes += pending_packet->skb->len;
	(*pkts)++;
}

static void remove_miss_completions(struct gve_priv *priv,
				    struct gve_tx_ring *tx)
{
	struct gve_tx_pending_packet_dqo *pending_packet;
	s16 next_index;

	next_index = tx->dqo_compl.miss_completions.head;
	while (next_index != -1) {
		pending_packet = &tx->dqo.pending_packets[next_index];
		next_index = pending_packet->next;
		/* Break early because packets should time out in order. */
		if (time_is_after_jiffies(pending_packet->timeout_jiffies))
			break;

		remove_from_list(tx, &tx->dqo_compl.miss_completions,
				 pending_packet);
		/* Unmap buffers and free skb but do not unallocate packet i.e.
		 * the completion tag is not freed to ensure that the driver
		 * can take appropriate action if a corresponding valid
		 * completion is received later.
		 */
		gve_unmap_packet(tx->dev, pending_packet);
		/* This indicates the packet was dropped. */
		dev_kfree_skb_any(pending_packet->skb);
		pending_packet->skb = NULL;
		tx->dropped_pkt++;
		net_err_ratelimited("%s: No reinjection completion was received for: %d.\n",
				    priv->dev->name,
				    (int)(pending_packet - tx->dqo.pending_packets));

		pending_packet->state = GVE_PACKET_STATE_TIMED_OUT_COMPL;
		pending_packet->timeout_jiffies =
				jiffies +
				msecs_to_jiffies(GVE_DEALLOCATE_COMPL_TIMEOUT *
						 MSEC_PER_SEC);
		/* Maintain pending packet in another list so the packet can be
		 * unallocated at a later time.
		 */
		add_to_list(tx, &tx->dqo_compl.timed_out_completions,
			    pending_packet);
	}
}

static void remove_timed_out_completions(struct gve_priv *priv,
					 struct gve_tx_ring *tx)
{
	struct gve_tx_pending_packet_dqo *pending_packet;
	s16 next_index;

	next_index = tx->dqo_compl.timed_out_completions.head;
	while (next_index != -1) {
		pending_packet = &tx->dqo.pending_packets[next_index];
		next_index = pending_packet->next;
		/* Break early because packets should time out in order. */
		if (time_is_after_jiffies(pending_packet->timeout_jiffies))
			break;

		remove_from_list(tx, &tx->dqo_compl.timed_out_completions,
				 pending_packet);
		gve_free_pending_packet(tx, pending_packet);
	}
}

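/* Processes completion descriptors until one that the HW has not yet written
 * (generation bit mismatch) is reached, or the NAPI budget is exhausted.
 * Returns the number of completion descriptors processed.
 */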
int gve_clean_tx_done_dqo(struct gve_priv *priv, struct gve_tx_ring *tx,
			  struct napi_struct *napi)
{
	u64 reinject_compl_bytes = 0;
	u64 reinject_compl_pkts = 0;
	int num_descs_cleaned = 0;
	u64 miss_compl_bytes = 0;
	u64 miss_compl_pkts = 0;
	u64 pkt_compl_bytes = 0;
	u64 pkt_compl_pkts = 0;

	/* Limit in order to avoid blocking for too long */
	while (!napi || pkt_compl_pkts < napi->weight) {
		struct gve_tx_compl_desc *compl_desc =
			&tx->dqo.compl_ring[tx->dqo_compl.head];
		u16 type;

		if (compl_desc->generation == tx->dqo_compl.cur_gen_bit)
			break;

		/* Prefetch the next descriptor. */
		prefetch(&tx->dqo.compl_ring[(tx->dqo_compl.head + 1) &
				tx->dqo.complq_mask]);

		/* Do not read data until we own the descriptor */
		dma_rmb();
		type = compl_desc->type;

		if (type == GVE_COMPL_TYPE_DQO_DESC) {
			/* This is the last descriptor fetched by HW plus one */
			u16 tx_head = le16_to_cpu(compl_desc->tx_head);

			atomic_set_release(&tx->dqo_compl.hw_tx_head, tx_head);
		} else if (type == GVE_COMPL_TYPE_DQO_PKT) {
			u16 compl_tag = le16_to_cpu(compl_desc->completion_tag);

			gve_handle_packet_completion(priv, tx, !!napi,
						     compl_tag,
						     &pkt_compl_bytes,
						     &pkt_compl_pkts,
						     /*is_reinjection=*/false);
		} else if (type == GVE_COMPL_TYPE_DQO_MISS) {
			u16 compl_tag = le16_to_cpu(compl_desc->completion_tag);

			gve_handle_miss_completion(priv, tx, compl_tag,
						   &miss_compl_bytes,
						   &miss_compl_pkts);
		} else if (type == GVE_COMPL_TYPE_DQO_REINJECTION) {
			u16 compl_tag = le16_to_cpu(compl_desc->completion_tag);

			gve_handle_packet_completion(priv, tx, !!napi,
						     compl_tag,
						     &reinject_compl_bytes,
						     &reinject_compl_pkts,
						     /*is_reinjection=*/true);
		}

		tx->dqo_compl.head =
			(tx->dqo_compl.head + 1) & tx->dqo.complq_mask;
		/* Flip the generation bit when we wrap around */
		tx->dqo_compl.cur_gen_bit ^= tx->dqo_compl.head == 0;
		num_descs_cleaned++;
	}

	netdev_tx_completed_queue(tx->netdev_txq,
				  pkt_compl_pkts + miss_compl_pkts,
				  pkt_compl_bytes + miss_compl_bytes);

	remove_miss_completions(priv, tx);
	remove_timed_out_completions(priv, tx);

	u64_stats_update_begin(&tx->statss);
	tx->bytes_done += pkt_compl_bytes + reinject_compl_bytes;
	tx->pkt_done += pkt_compl_pkts + reinject_compl_pkts;
	u64_stats_update_end(&tx->statss);
	return num_descs_cleaned;
}

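/* TX NAPI poll handler. When `do_clean` is set, completions are processed and
 * a stopped queue may be restarted. Returns true if unprocessed completion
 * descriptors remain.
 */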
bool gve_tx_poll_dqo(struct gve_notify_block *block, bool do_clean)
{
	struct gve_tx_compl_desc *compl_desc;
	struct gve_tx_ring *tx = block->tx;
	struct gve_priv *priv = block->priv;

	if (do_clean) {
		int num_descs_cleaned = gve_clean_tx_done_dqo(priv, tx,
							      &block->napi);

		/* Sync with queue being stopped in `gve_maybe_stop_tx_dqo()` */
		mb();

		if (netif_tx_queue_stopped(tx->netdev_txq) &&
		    num_descs_cleaned > 0) {
			tx->wake_queue++;
			netif_tx_wake_queue(tx->netdev_txq);
		}
	}

	/* Return true if we still have work. */
	compl_desc = &tx->dqo.compl_ring[tx->dqo_compl.head];
	return compl_desc->generation != tx->dqo_compl.cur_gen_bit;
}