// SPDX-License-Identifier: (GPL-2.0 OR MIT)
/* Google virtual Ethernet (gve) driver
 *
 * Copyright (C) 2015-2021 Google, Inc.
 */

#include "gve.h"
#include "gve_adminq.h"
#include "gve_utils.h"
#include "gve_dqo.h"
#include <net/ip.h>
#include <linux/tcp.h>
#include <linux/slab.h>
#include <linux/skbuff.h>

/* Returns true if a gve_tx_pending_packet_dqo object is available. */
static bool gve_has_pending_packet(struct gve_tx_ring *tx)
{
        /* Check TX path's list. */
        if (tx->dqo_tx.free_pending_packets != -1)
                return true;

        /* Check completion handler's list. */
        if (atomic_read_acquire(&tx->dqo_compl.free_pending_packets) != -1)
                return true;

        return false;
}

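/* Pending packets are kept on two free lists threaded by index through the
 * pending_packets array, with -1 terminating a list. The TX path owns
 * dqo_tx.free_pending_packets and can pop from it without atomics; the
 * completion path pushes freed entries onto dqo_compl.free_pending_packets
 * with a cmpxchg loop. When the TX path's list runs dry, it steals the
 * completion path's entire list with a single atomic_xchg (see below).
 */
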
static struct gve_tx_pending_packet_dqo *
gve_alloc_pending_packet(struct gve_tx_ring *tx)
{
        struct gve_tx_pending_packet_dqo *pending_packet;
        s16 index;

        index = tx->dqo_tx.free_pending_packets;

        /* No pending_packets available, try to steal the list from the
         * completion handler.
         */
        if (unlikely(index == -1)) {
                tx->dqo_tx.free_pending_packets =
                        atomic_xchg(&tx->dqo_compl.free_pending_packets, -1);
                index = tx->dqo_tx.free_pending_packets;

                if (unlikely(index == -1))
                        return NULL;
        }

        pending_packet = &tx->dqo.pending_packets[index];

        /* Remove pending_packet from free list */
        tx->dqo_tx.free_pending_packets = pending_packet->next;
        pending_packet->state = GVE_PACKET_STATE_PENDING_DATA_COMPL;

        return pending_packet;
}

static void
gve_free_pending_packet(struct gve_tx_ring *tx,
                        struct gve_tx_pending_packet_dqo *pending_packet)
{
        s16 index = pending_packet - tx->dqo.pending_packets;

        pending_packet->state = GVE_PACKET_STATE_UNALLOCATED;
        while (true) {
                s16 old_head = atomic_read_acquire(&tx->dqo_compl.free_pending_packets);

                pending_packet->next = old_head;
                if (atomic_cmpxchg(&tx->dqo_compl.free_pending_packets,
                                   old_head, index) == old_head) {
                        break;
                }
        }
}

/* gve_tx_clean_pending_packets - Cleans up all pending tx requests and buffers.
 */
static void gve_tx_clean_pending_packets(struct gve_tx_ring *tx)
{
        int i;

        for (i = 0; i < tx->dqo.num_pending_packets; i++) {
                struct gve_tx_pending_packet_dqo *cur_state =
                        &tx->dqo.pending_packets[i];
                int j;

                for (j = 0; j < cur_state->num_bufs; j++) {
                        if (j == 0) {
                                dma_unmap_single(tx->dev,
                                                 dma_unmap_addr(cur_state, dma[j]),
                                                 dma_unmap_len(cur_state, len[j]),
                                                 DMA_TO_DEVICE);
                        } else {
                                dma_unmap_page(tx->dev,
                                               dma_unmap_addr(cur_state, dma[j]),
                                               dma_unmap_len(cur_state, len[j]),
                                               DMA_TO_DEVICE);
                        }
                }
                if (cur_state->skb) {
                        dev_consume_skb_any(cur_state->skb);
                        cur_state->skb = NULL;
                }
        }
}

static void gve_tx_free_ring_dqo(struct gve_priv *priv, int idx)
{
        struct gve_tx_ring *tx = &priv->tx[idx];
        struct device *hdev = &priv->pdev->dev;
        size_t bytes;

        gve_tx_remove_from_block(priv, idx);

        if (tx->q_resources) {
                dma_free_coherent(hdev, sizeof(*tx->q_resources),
                                  tx->q_resources, tx->q_resources_bus);
                tx->q_resources = NULL;
        }

        if (tx->dqo.compl_ring) {
                bytes = sizeof(tx->dqo.compl_ring[0]) *
                        (tx->dqo.complq_mask + 1);
                dma_free_coherent(hdev, bytes, tx->dqo.compl_ring,
                                  tx->complq_bus_dqo);
                tx->dqo.compl_ring = NULL;
        }

        if (tx->dqo.tx_ring) {
                bytes = sizeof(tx->dqo.tx_ring[0]) * (tx->mask + 1);
                dma_free_coherent(hdev, bytes, tx->dqo.tx_ring, tx->bus);
                tx->dqo.tx_ring = NULL;
        }

        kvfree(tx->dqo.pending_packets);
        tx->dqo.pending_packets = NULL;

        netif_dbg(priv, drv, priv->dev, "freed tx queue %d\n", idx);
}

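/* The pending-packet budget below is derived from the completion queue
 * size: space is reserved for descriptor (report-event) completions, and
 * the remainder is halved because a packet may consume two completion
 * entries (a miss completion plus a reinjection completion). As an
 * illustrative example only (the real values come from the device/driver
 * headers), a 1024-entry completion queue with GVE_TX_MIN_RE_INTERVAL == 32
 * would leave (1024 - 1024 / 32) / 2 = 496 pending-packet slots.
 */
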
static int gve_tx_alloc_ring_dqo(struct gve_priv *priv, int idx)
{
        struct gve_tx_ring *tx = &priv->tx[idx];
        struct device *hdev = &priv->pdev->dev;
        int num_pending_packets;
        size_t bytes;
        int i;

        memset(tx, 0, sizeof(*tx));
        tx->q_num = idx;
        tx->dev = &priv->pdev->dev;
        tx->netdev_txq = netdev_get_tx_queue(priv->dev, idx);
        atomic_set_release(&tx->dqo_compl.hw_tx_head, 0);

        /* Queue sizes must be a power of 2 */
        tx->mask = priv->tx_desc_cnt - 1;
        tx->dqo.complq_mask = priv->options_dqo_rda.tx_comp_ring_entries - 1;

        /* The max number of pending packets determines the maximum number of
         * descriptors which may be written to the completion queue.
         *
         * We must set the number small enough to make sure we never overrun the
         * completion queue.
         */
        num_pending_packets = tx->dqo.complq_mask + 1;

        /* Reserve space for descriptor completions, which will be reported at
         * most every GVE_TX_MIN_RE_INTERVAL packets.
         */
        num_pending_packets -=
                (tx->dqo.complq_mask + 1) / GVE_TX_MIN_RE_INTERVAL;

        /* Each packet may have at most 2 buffer completions if it receives both
         * a miss and reinjection completion.
         */
        num_pending_packets /= 2;

        tx->dqo.num_pending_packets = min_t(int, num_pending_packets, S16_MAX);
        tx->dqo.pending_packets = kvcalloc(tx->dqo.num_pending_packets,
                                           sizeof(tx->dqo.pending_packets[0]),
                                           GFP_KERNEL);
        if (!tx->dqo.pending_packets)
                goto err;

        /* Set up linked list of pending packets */
        for (i = 0; i < tx->dqo.num_pending_packets - 1; i++)
                tx->dqo.pending_packets[i].next = i + 1;

        tx->dqo.pending_packets[tx->dqo.num_pending_packets - 1].next = -1;
        atomic_set_release(&tx->dqo_compl.free_pending_packets, -1);
        tx->dqo_compl.miss_completions.head = -1;
        tx->dqo_compl.miss_completions.tail = -1;
        tx->dqo_compl.timed_out_completions.head = -1;
        tx->dqo_compl.timed_out_completions.tail = -1;

        bytes = sizeof(tx->dqo.tx_ring[0]) * (tx->mask + 1);
        tx->dqo.tx_ring = dma_alloc_coherent(hdev, bytes, &tx->bus, GFP_KERNEL);
        if (!tx->dqo.tx_ring)
                goto err;

        bytes = sizeof(tx->dqo.compl_ring[0]) * (tx->dqo.complq_mask + 1);
        tx->dqo.compl_ring = dma_alloc_coherent(hdev, bytes,
                                                &tx->complq_bus_dqo,
                                                GFP_KERNEL);
        if (!tx->dqo.compl_ring)
                goto err;

        tx->q_resources = dma_alloc_coherent(hdev, sizeof(*tx->q_resources),
                                             &tx->q_resources_bus, GFP_KERNEL);
        if (!tx->q_resources)
                goto err;

        gve_tx_add_to_block(priv, idx);

        return 0;

err:
        gve_tx_free_ring_dqo(priv, idx);
        return -ENOMEM;
}

int gve_tx_alloc_rings_dqo(struct gve_priv *priv)
{
        int err = 0;
        int i;

        for (i = 0; i < priv->tx_cfg.num_queues; i++) {
                err = gve_tx_alloc_ring_dqo(priv, i);
                if (err) {
                        netif_err(priv, drv, priv->dev,
                                  "Failed to alloc tx ring=%d: err=%d\n",
                                  i, err);
                        goto err;
                }
        }

        return 0;

err:
        for (i--; i >= 0; i--)
                gve_tx_free_ring_dqo(priv, i);

        return err;
}

void gve_tx_free_rings_dqo(struct gve_priv *priv)
{
        int i;

        for (i = 0; i < priv->tx_cfg.num_queues; i++) {
                struct gve_tx_ring *tx = &priv->tx[i];

                gve_clean_tx_done_dqo(priv, tx, /*napi=*/NULL);
                netdev_tx_reset_queue(tx->netdev_txq);
                gve_tx_clean_pending_packets(tx);

                gve_tx_free_ring_dqo(priv, i);
        }
}

/* Returns the number of slots available in the ring */
static u32 num_avail_tx_slots(const struct gve_tx_ring *tx)
{
        u32 num_used = (tx->dqo_tx.tail - tx->dqo_tx.head) & tx->mask;

        return tx->mask - num_used;
}

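/* Stopping and waking the queue races with the completion path: the mb()
 * below orders "queue stopped" before the re-read of hw_tx_head, and pairs
 * with the mb() in gve_tx_poll_dqo(), which orders the head update from
 * cleaning against its check of netif_tx_queue_stopped(). Whichever side
 * runs last therefore observes the other's update, so a wakeup cannot be
 * lost.
 */
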
/* Stops the queue if the number of available descriptors is less than
 * 'count'.
 * Return: 0 if stop is not required.
 */
static int gve_maybe_stop_tx_dqo(struct gve_tx_ring *tx, int count)
{
        if (likely(gve_has_pending_packet(tx) &&
                   num_avail_tx_slots(tx) >= count))
                return 0;

        /* Update cached TX head pointer */
        tx->dqo_tx.head = atomic_read_acquire(&tx->dqo_compl.hw_tx_head);

        if (likely(gve_has_pending_packet(tx) &&
                   num_avail_tx_slots(tx) >= count))
                return 0;

        /* No space, so stop the queue */
        tx->stop_queue++;
        netif_tx_stop_queue(tx->netdev_txq);

        /* Sync with restarting queue in `gve_tx_poll_dqo()` */
        mb();

        /* After stopping queue, check if we can transmit again in order to
         * avoid TOCTOU bug.
         */
        tx->dqo_tx.head = atomic_read_acquire(&tx->dqo_compl.hw_tx_head);

        if (likely(!gve_has_pending_packet(tx) ||
                   num_avail_tx_slots(tx) < count))
                return -EBUSY;

        netif_tx_start_queue(tx->netdev_txq);
        tx->wake_queue++;
        return 0;
}

static void gve_extract_tx_metadata_dqo(const struct sk_buff *skb,
                                        struct gve_tx_metadata_dqo *metadata)
{
        memset(metadata, 0, sizeof(*metadata));
        metadata->version = GVE_TX_METADATA_VERSION_DQO;

        if (skb->l4_hash) {
                u16 path_hash = skb->hash ^ (skb->hash >> 16);

                path_hash &= (1 << 15) - 1;
                if (unlikely(path_hash == 0))
                        path_hash = ~path_hash;

                metadata->path_hash = path_hash;
        }
}

static void gve_tx_fill_pkt_desc_dqo(struct gve_tx_ring *tx, u32 *desc_idx,
                                     struct sk_buff *skb, u32 len, u64 addr,
                                     s16 compl_tag, bool eop, bool is_gso)
{
        const bool checksum_offload_en = skb->ip_summed == CHECKSUM_PARTIAL;

        while (len > 0) {
                struct gve_tx_pkt_desc_dqo *desc =
                        &tx->dqo.tx_ring[*desc_idx].pkt;
                u32 cur_len = min_t(u32, len, GVE_TX_MAX_BUF_SIZE_DQO);
                bool cur_eop = eop && cur_len == len;

                *desc = (struct gve_tx_pkt_desc_dqo){
                        .buf_addr = cpu_to_le64(addr),
                        .dtype = GVE_TX_PKT_DESC_DTYPE_DQO,
                        .end_of_packet = cur_eop,
                        .checksum_offload_enable = checksum_offload_en,
                        .compl_tag = cpu_to_le16(compl_tag),
                        .buf_size = cur_len,
                };

                addr += cur_len;
                len -= cur_len;
                *desc_idx = (*desc_idx + 1) & tx->mask;
        }
}

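/* For CHECKSUM_PARTIAL TSO skbs, the stack seeds tcp->check with a
 * pseudo-header checksum that includes the full TCP length. gve_prep_tso()
 * below subtracts that length again (csum_replace_by_diff), leaving a
 * length-free pseudo-header checksum that the device can extend with each
 * segment's own length when it checksums the generated segments.
 */
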
/* Validates and prepares `skb` for TSO.
 *
 * Returns header length, or < 0 if invalid.
 */
static int gve_prep_tso(struct sk_buff *skb)
{
        struct tcphdr *tcp;
        int header_len;
        u32 paylen;
        int err;

        /* Note: HW requires MSS (gso_size) to be <= 9728 and the total length
         * of the TSO to be <= 262143.
         *
         * However, we don't validate these because:
         * - Hypervisor enforces a limit of 9K MTU
         * - Kernel will not produce a TSO larger than 64k
         */

        if (unlikely(skb_shinfo(skb)->gso_size < GVE_TX_MIN_TSO_MSS_DQO))
                return -1;

        /* Needed because we will modify header. */
        err = skb_cow_head(skb, 0);
        if (err < 0)
                return err;

        tcp = tcp_hdr(skb);

        /* Remove payload length from checksum. */
        paylen = skb->len - skb_transport_offset(skb);

        switch (skb_shinfo(skb)->gso_type) {
        case SKB_GSO_TCPV4:
        case SKB_GSO_TCPV6:
                csum_replace_by_diff(&tcp->check,
                                     (__force __wsum)htonl(paylen));

                /* Compute length of segmentation header. */
                header_len = skb_tcp_all_headers(skb);
                break;
        default:
                return -EINVAL;
        }

        if (unlikely(header_len > GVE_TX_MAX_HDR_SIZE_DQO))
                return -EINVAL;

        return header_len;
}

static void gve_tx_fill_tso_ctx_desc(struct gve_tx_tso_context_desc_dqo *desc,
                                     const struct sk_buff *skb,
                                     const struct gve_tx_metadata_dqo *metadata,
                                     int header_len)
{
        *desc = (struct gve_tx_tso_context_desc_dqo){
                .header_len = header_len,
                .cmd_dtype = {
                        .dtype = GVE_TX_TSO_CTX_DESC_DTYPE_DQO,
                        .tso = 1,
                },
                .flex0 = metadata->bytes[0],
                .flex5 = metadata->bytes[5],
                .flex6 = metadata->bytes[6],
                .flex7 = metadata->bytes[7],
                .flex8 = metadata->bytes[8],
                .flex9 = metadata->bytes[9],
                .flex10 = metadata->bytes[10],
                .flex11 = metadata->bytes[11],
        };
        desc->tso_total_len = skb->len - header_len;
        desc->mss = skb_shinfo(skb)->gso_size;
}

static void
gve_tx_fill_general_ctx_desc(struct gve_tx_general_context_desc_dqo *desc,
                             const struct gve_tx_metadata_dqo *metadata)
{
        *desc = (struct gve_tx_general_context_desc_dqo){
                .flex0 = metadata->bytes[0],
                .flex1 = metadata->bytes[1],
                .flex2 = metadata->bytes[2],
                .flex3 = metadata->bytes[3],
                .flex4 = metadata->bytes[4],
                .flex5 = metadata->bytes[5],
                .flex6 = metadata->bytes[6],
                .flex7 = metadata->bytes[7],
                .flex8 = metadata->bytes[8],
                .flex9 = metadata->bytes[9],
                .flex10 = metadata->bytes[10],
                .flex11 = metadata->bytes[11],
                .cmd_dtype = {.dtype = GVE_TX_GENERAL_CTX_DESC_DTYPE_DQO},
        };
}

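/* A transmitted packet occupies a contiguous run of descriptors: an
 * optional TSO context descriptor, a general context descriptor carrying
 * the metadata bytes, then one packet descriptor per mapped buffer (split
 * further if a buffer exceeds GVE_TX_MAX_BUF_SIZE_DQO). Every packet
 * descriptor carries the same completion tag, which is simply the packet's
 * index in tx->dqo.pending_packets, so the completion handler can find the
 * matching state.
 */
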
/* Returns 0 on success, or < 0 on error.
 *
 * Before this function is called, the caller must ensure
 * gve_has_pending_packet(tx) returns true.
 */
static int gve_tx_add_skb_no_copy_dqo(struct gve_tx_ring *tx,
                                      struct sk_buff *skb)
{
        const struct skb_shared_info *shinfo = skb_shinfo(skb);
        const bool is_gso = skb_is_gso(skb);
        u32 desc_idx = tx->dqo_tx.tail;

        struct gve_tx_pending_packet_dqo *pkt;
        struct gve_tx_metadata_dqo metadata;
        s16 completion_tag;
        int i;

        pkt = gve_alloc_pending_packet(tx);
        pkt->skb = skb;
        pkt->num_bufs = 0;
        completion_tag = pkt - tx->dqo.pending_packets;

        gve_extract_tx_metadata_dqo(skb, &metadata);
        if (is_gso) {
                int header_len = gve_prep_tso(skb);

                if (unlikely(header_len < 0))
                        goto err;

                gve_tx_fill_tso_ctx_desc(&tx->dqo.tx_ring[desc_idx].tso_ctx,
                                         skb, &metadata, header_len);
                desc_idx = (desc_idx + 1) & tx->mask;
        }

        gve_tx_fill_general_ctx_desc(&tx->dqo.tx_ring[desc_idx].general_ctx,
                                     &metadata);
        desc_idx = (desc_idx + 1) & tx->mask;

        /* Note: HW requires that the size of a non-TSO packet be within the
         * range of [17, 9728].
         *
         * We don't double check because
         * - We limited `netdev->min_mtu` to ETH_MIN_MTU.
         * - Hypervisor won't allow MTU larger than 9216.
         */

        /* Map the linear portion of skb */
        {
                u32 len = skb_headlen(skb);
                dma_addr_t addr;

                addr = dma_map_single(tx->dev, skb->data, len, DMA_TO_DEVICE);
                if (unlikely(dma_mapping_error(tx->dev, addr)))
                        goto err;

                dma_unmap_len_set(pkt, len[pkt->num_bufs], len);
                dma_unmap_addr_set(pkt, dma[pkt->num_bufs], addr);
                ++pkt->num_bufs;

                gve_tx_fill_pkt_desc_dqo(tx, &desc_idx, skb, len, addr,
                                         completion_tag,
                                         /*eop=*/shinfo->nr_frags == 0, is_gso);
        }

        for (i = 0; i < shinfo->nr_frags; i++) {
                const skb_frag_t *frag = &shinfo->frags[i];
                bool is_eop = i == (shinfo->nr_frags - 1);
                u32 len = skb_frag_size(frag);
                dma_addr_t addr;

                addr = skb_frag_dma_map(tx->dev, frag, 0, len, DMA_TO_DEVICE);
                if (unlikely(dma_mapping_error(tx->dev, addr)))
                        goto err;

                dma_unmap_len_set(pkt, len[pkt->num_bufs], len);
                dma_unmap_addr_set(pkt, dma[pkt->num_bufs], addr);
                ++pkt->num_bufs;

                gve_tx_fill_pkt_desc_dqo(tx, &desc_idx, skb, len, addr,
                                         completion_tag, is_eop, is_gso);
        }

        /* Commit the changes to our state */
        tx->dqo_tx.tail = desc_idx;

        /* Request a descriptor completion on the last descriptor of the
         * packet if we are allowed to by the HW enforced interval.
         */
        {
                u32 last_desc_idx = (desc_idx - 1) & tx->mask;
                u32 last_report_event_interval =
                        (last_desc_idx - tx->dqo_tx.last_re_idx) & tx->mask;

                if (unlikely(last_report_event_interval >=
                             GVE_TX_MIN_RE_INTERVAL)) {
                        tx->dqo.tx_ring[last_desc_idx].pkt.report_event = true;
                        tx->dqo_tx.last_re_idx = last_desc_idx;
                }
        }

        return 0;

err:
        for (i = 0; i < pkt->num_bufs; i++) {
                if (i == 0) {
                        dma_unmap_single(tx->dev,
                                         dma_unmap_addr(pkt, dma[i]),
                                         dma_unmap_len(pkt, len[i]),
                                         DMA_TO_DEVICE);
                } else {
                        dma_unmap_page(tx->dev,
                                       dma_unmap_addr(pkt, dma[i]),
                                       dma_unmap_len(pkt, len[i]),
                                       DMA_TO_DEVICE);
                }
        }

        pkt->skb = NULL;
        pkt->num_bufs = 0;
        gve_free_pending_packet(tx, pkt);

        return -1;
}

static int gve_num_descs_per_buf(size_t size)
{
        return DIV_ROUND_UP(size, GVE_TX_MAX_BUF_SIZE_DQO);
}

static int gve_num_buffer_descs_needed(const struct sk_buff *skb)
{
        const struct skb_shared_info *shinfo = skb_shinfo(skb);
        int num_descs;
        int i;

        num_descs = gve_num_descs_per_buf(skb_headlen(skb));

        for (i = 0; i < shinfo->nr_frags; i++) {
                unsigned int frag_size = skb_frag_size(&shinfo->frags[i]);

                num_descs += gve_num_descs_per_buf(frag_size);
        }

        return num_descs;
}

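/* Illustrative example for the bookkeeping below (the MSS value is made up,
 * not a driver constant): with gso_size == 1400, a 200-byte linear payload
 * and page frags of 1000 and 1000 bytes, the first segment is built from
 * three buffers (200 + 1000 + the first 200 bytes of the second frag), so
 * cur_seg_num_bufs reaches 3 before the running segment size wraps.
 */
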
/* Returns true if HW is capable of sending TSO represented by `skb`.
 *
 * Each segment must not span more than GVE_TX_MAX_DATA_DESCS buffers.
 * - The header is counted as one buffer for every single segment.
 * - A buffer which is split between two segments is counted for both.
 * - If a buffer contains both header and payload, it is counted as two
 *   buffers.
 */
static bool gve_can_send_tso(const struct sk_buff *skb)
{
        const int max_bufs_per_seg = GVE_TX_MAX_DATA_DESCS - 1;
        const struct skb_shared_info *shinfo = skb_shinfo(skb);
        const int header_len = skb_tcp_all_headers(skb);
        const int gso_size = shinfo->gso_size;
        int cur_seg_num_bufs;
        int cur_seg_size;
        int i;

        cur_seg_size = skb_headlen(skb) - header_len;
        cur_seg_num_bufs = cur_seg_size > 0;

        for (i = 0; i < shinfo->nr_frags; i++) {
                if (cur_seg_size >= gso_size) {
                        cur_seg_size %= gso_size;
                        cur_seg_num_bufs = cur_seg_size > 0;
                }

                if (unlikely(++cur_seg_num_bufs > max_bufs_per_seg))
                        return false;

                cur_seg_size += skb_frag_size(&shinfo->frags[i]);
        }

        return true;
}

/* Attempt to transmit specified SKB.
 *
 * Returns 0 if the SKB was transmitted or dropped.
 * Returns -1 if there is not currently enough space to transmit the SKB.
 */
static int gve_try_tx_skb(struct gve_priv *priv, struct gve_tx_ring *tx,
                          struct sk_buff *skb)
{
        int num_buffer_descs;
        int total_num_descs;

        if (skb_is_gso(skb)) {
                /* If TSO doesn't meet HW requirements, attempt to linearize the
                 * packet.
                 */
                if (unlikely(!gve_can_send_tso(skb) &&
                             skb_linearize(skb) < 0)) {
                        net_err_ratelimited("%s: Failed to transmit TSO packet\n",
                                            priv->dev->name);
                        goto drop;
                }

                if (unlikely(ipv6_hopopt_jumbo_remove(skb)))
                        goto drop;

                num_buffer_descs = gve_num_buffer_descs_needed(skb);
        } else {
                num_buffer_descs = gve_num_buffer_descs_needed(skb);

                if (unlikely(num_buffer_descs > GVE_TX_MAX_DATA_DESCS)) {
                        if (unlikely(skb_linearize(skb) < 0))
                                goto drop;

                        num_buffer_descs = 1;
                }
        }

        /* Metadata + (optional TSO) + data descriptors. */
        total_num_descs = 1 + skb_is_gso(skb) + num_buffer_descs;
        if (unlikely(gve_maybe_stop_tx_dqo(tx, total_num_descs +
                                           GVE_TX_MIN_DESC_PREVENT_CACHE_OVERLAP))) {
                return -1;
        }

        if (unlikely(gve_tx_add_skb_no_copy_dqo(tx, skb) < 0))
                goto drop;

        netdev_tx_sent_queue(tx->netdev_txq, skb->len);
        skb_tx_timestamp(skb);
        return 0;

drop:
        tx->dropped_pkt++;
        dev_kfree_skb_any(skb);
        return 0;
}

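/* The doorbell write below is batched: while the stack reports more frames
 * pending (netdev_xmit_more()) and the queue has not been stopped, the
 * doorbell write is deferred so several queued skbs are flushed to the
 * device with a single MMIO write.
 */
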
/* Transmit a given skb and ring the doorbell. */
netdev_tx_t gve_tx_dqo(struct sk_buff *skb, struct net_device *dev)
{
        struct gve_priv *priv = netdev_priv(dev);
        struct gve_tx_ring *tx;

        tx = &priv->tx[skb_get_queue_mapping(skb)];
        if (unlikely(gve_try_tx_skb(priv, tx, skb) < 0)) {
                /* We need to ring the txq doorbell -- we have stopped the Tx
                 * queue for want of resources, but prior calls to gve_tx()
                 * may have added descriptors without ringing the doorbell.
                 */
                gve_tx_put_doorbell_dqo(priv, tx->q_resources, tx->dqo_tx.tail);
                return NETDEV_TX_BUSY;
        }

        if (!netif_xmit_stopped(tx->netdev_txq) && netdev_xmit_more())
                return NETDEV_TX_OK;

        gve_tx_put_doorbell_dqo(priv, tx->q_resources, tx->dqo_tx.tail);
        return NETDEV_TX_OK;
}

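/* Miss and timed-out completions are tracked on small intrusive lists:
 * struct gve_index_list holds head/tail as s16 indices into
 * tx->dqo.pending_packets, the entries are linked through their prev/next
 * fields, and -1 acts as the NULL index. These lists are only manipulated
 * from the completion-processing path, so plain (non-atomic) updates
 * suffice.
 */
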
static void add_to_list(struct gve_tx_ring *tx, struct gve_index_list *list,
                        struct gve_tx_pending_packet_dqo *pending_packet)
{
        s16 old_tail, index;

        index = pending_packet - tx->dqo.pending_packets;
        old_tail = list->tail;
        list->tail = index;
        if (old_tail == -1)
                list->head = index;
        else
                tx->dqo.pending_packets[old_tail].next = index;

        pending_packet->next = -1;
        pending_packet->prev = old_tail;
}

static void remove_from_list(struct gve_tx_ring *tx,
                             struct gve_index_list *list,
                             struct gve_tx_pending_packet_dqo *pkt)
{
        s16 prev_index, next_index;

        prev_index = pkt->prev;
        next_index = pkt->next;

        if (prev_index == -1) {
                /* Node is head */
                list->head = next_index;
        } else {
                tx->dqo.pending_packets[prev_index].next = next_index;
        }
        if (next_index == -1) {
                /* Node is tail */
                list->tail = prev_index;
        } else {
                tx->dqo.pending_packets[next_index].prev = prev_index;
        }
}

static void gve_unmap_packet(struct device *dev,
                             struct gve_tx_pending_packet_dqo *pkt)
{
        int i;

        /* SKB linear portion is guaranteed to be mapped */
        dma_unmap_single(dev, dma_unmap_addr(pkt, dma[0]),
                         dma_unmap_len(pkt, len[0]), DMA_TO_DEVICE);
        for (i = 1; i < pkt->num_bufs; i++) {
                dma_unmap_page(dev, dma_unmap_addr(pkt, dma[i]),
                               dma_unmap_len(pkt, len[i]), DMA_TO_DEVICE);
        }
        pkt->num_bufs = 0;
}

/* Completion types and expected behavior:
 * No Miss compl + Packet compl = Packet completed normally.
 * Miss compl + Re-inject compl = Packet completed normally.
 * No Miss compl + Re-inject compl = Skipped i.e. packet not completed.
 * Miss compl + Packet compl = Skipped i.e. packet not completed.
 */
static void gve_handle_packet_completion(struct gve_priv *priv,
                                         struct gve_tx_ring *tx, bool is_napi,
                                         u16 compl_tag, u64 *bytes, u64 *pkts,
                                         bool is_reinjection)
{
        struct gve_tx_pending_packet_dqo *pending_packet;

        if (unlikely(compl_tag >= tx->dqo.num_pending_packets)) {
                net_err_ratelimited("%s: Invalid TX completion tag: %d\n",
                                    priv->dev->name, (int)compl_tag);
                return;
        }

        pending_packet = &tx->dqo.pending_packets[compl_tag];

        if (unlikely(is_reinjection)) {
                if (unlikely(pending_packet->state ==
                             GVE_PACKET_STATE_TIMED_OUT_COMPL)) {
                        net_err_ratelimited("%s: Re-injection completion: %d received after timeout.\n",
                                            priv->dev->name, (int)compl_tag);
                        /* Packet was already completed as a result of timeout,
                         * so just remove from list and free pending packet.
                         */
                        remove_from_list(tx,
                                         &tx->dqo_compl.timed_out_completions,
                                         pending_packet);
                        gve_free_pending_packet(tx, pending_packet);
                        return;
                }
                if (unlikely(pending_packet->state !=
                             GVE_PACKET_STATE_PENDING_REINJECT_COMPL)) {
                        /* No outstanding miss completion but packet allocated
                         * implies packet receives a re-injection completion
                         * without a prior miss completion. Return without
                         * completing the packet.
                         */
                        net_err_ratelimited("%s: Re-injection completion received without corresponding miss completion: %d\n",
                                            priv->dev->name, (int)compl_tag);
                        return;
                }
                remove_from_list(tx, &tx->dqo_compl.miss_completions,
                                 pending_packet);
        } else {
                /* Packet is allocated but not a pending data completion. */
                if (unlikely(pending_packet->state !=
                             GVE_PACKET_STATE_PENDING_DATA_COMPL)) {
                        net_err_ratelimited("%s: No pending data completion: %d\n",
                                            priv->dev->name, (int)compl_tag);
                        return;
                }
        }
        gve_unmap_packet(tx->dev, pending_packet);

        *bytes += pending_packet->skb->len;
        (*pkts)++;
        napi_consume_skb(pending_packet->skb, is_napi);
        pending_packet->skb = NULL;
        gve_free_pending_packet(tx, pending_packet);
}

static void gve_handle_miss_completion(struct gve_priv *priv,
                                       struct gve_tx_ring *tx, u16 compl_tag,
                                       u64 *bytes, u64 *pkts)
{
        struct gve_tx_pending_packet_dqo *pending_packet;

        if (unlikely(compl_tag >= tx->dqo.num_pending_packets)) {
                net_err_ratelimited("%s: Invalid TX completion tag: %d\n",
                                    priv->dev->name, (int)compl_tag);
                return;
        }

        pending_packet = &tx->dqo.pending_packets[compl_tag];
        if (unlikely(pending_packet->state !=
                     GVE_PACKET_STATE_PENDING_DATA_COMPL)) {
                net_err_ratelimited("%s: Unexpected packet state: %d for completion tag : %d\n",
                                    priv->dev->name, (int)pending_packet->state,
                                    (int)compl_tag);
                return;
        }

        pending_packet->state = GVE_PACKET_STATE_PENDING_REINJECT_COMPL;
        /* jiffies can wrap around, but time comparisons can handle overflows. */
        pending_packet->timeout_jiffies =
                        jiffies +
                        msecs_to_jiffies(GVE_REINJECT_COMPL_TIMEOUT *
                                         MSEC_PER_SEC);
        add_to_list(tx, &tx->dqo_compl.miss_completions, pending_packet);

        *bytes += pending_packet->skb->len;
        (*pkts)++;
}

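/* Timeout handling is two-staged. A packet that got a miss completion sits
 * on the miss_completions list waiting for its re-injection completion; if
 * GVE_REINJECT_COMPL_TIMEOUT expires first, remove_miss_completions() frees
 * the skb and counts the packet as dropped, but keeps the completion tag
 * allocated on the timed_out_completions list so a late re-injection
 * completion can still be recognized. Only after the second
 * (GVE_DEALLOCATE_COMPL_TIMEOUT) window does the tag return to the free
 * list.
 */
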
static void remove_miss_completions(struct gve_priv *priv,
                                    struct gve_tx_ring *tx)
{
        struct gve_tx_pending_packet_dqo *pending_packet;
        s16 next_index;

        next_index = tx->dqo_compl.miss_completions.head;
        while (next_index != -1) {
                pending_packet = &tx->dqo.pending_packets[next_index];
                next_index = pending_packet->next;
                /* Break early because packets should timeout in order. */
                if (time_is_after_jiffies(pending_packet->timeout_jiffies))
                        break;

                remove_from_list(tx, &tx->dqo_compl.miss_completions,
                                 pending_packet);
                /* Unmap buffers and free skb but do not unallocate packet i.e.
                 * the completion tag is not freed to ensure that the driver
                 * can take appropriate action if a corresponding valid
                 * completion is received later.
                 */
                gve_unmap_packet(tx->dev, pending_packet);
                /* This indicates the packet was dropped. */
                dev_kfree_skb_any(pending_packet->skb);
                pending_packet->skb = NULL;
                tx->dropped_pkt++;
                net_err_ratelimited("%s: No reinjection completion was received for: %d.\n",
                                    priv->dev->name,
                                    (int)(pending_packet - tx->dqo.pending_packets));

                pending_packet->state = GVE_PACKET_STATE_TIMED_OUT_COMPL;
                pending_packet->timeout_jiffies =
                                jiffies +
                                msecs_to_jiffies(GVE_DEALLOCATE_COMPL_TIMEOUT *
                                                 MSEC_PER_SEC);
                /* Maintain pending packet in another list so the packet can be
                 * unallocated at a later time.
                 */
                add_to_list(tx, &tx->dqo_compl.timed_out_completions,
                            pending_packet);
        }
}

static void remove_timed_out_completions(struct gve_priv *priv,
                                         struct gve_tx_ring *tx)
{
        struct gve_tx_pending_packet_dqo *pending_packet;
        s16 next_index;

        next_index = tx->dqo_compl.timed_out_completions.head;
        while (next_index != -1) {
                pending_packet = &tx->dqo.pending_packets[next_index];
                next_index = pending_packet->next;
                /* Break early because packets should timeout in order. */
                if (time_is_after_jiffies(pending_packet->timeout_jiffies))
                        break;

                remove_from_list(tx, &tx->dqo_compl.timed_out_completions,
                                 pending_packet);
                gve_free_pending_packet(tx, pending_packet);
        }
}

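/* Ownership of completion ring entries is tracked with a generation bit:
 * an entry belongs to the driver once its generation field differs from
 * dqo_compl.cur_gen_bit, which the driver toggles every time the head
 * wraps. The dma_rmb() below ensures the rest of the descriptor is only
 * read after the generation check has observed that the device wrote it.
 */
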
int gve_clean_tx_done_dqo(struct gve_priv *priv, struct gve_tx_ring *tx,
                          struct napi_struct *napi)
{
        u64 reinject_compl_bytes = 0;
        u64 reinject_compl_pkts = 0;
        int num_descs_cleaned = 0;
        u64 miss_compl_bytes = 0;
        u64 miss_compl_pkts = 0;
        u64 pkt_compl_bytes = 0;
        u64 pkt_compl_pkts = 0;

        /* Limit in order to avoid blocking for too long */
        while (!napi || pkt_compl_pkts < napi->weight) {
                struct gve_tx_compl_desc *compl_desc =
                        &tx->dqo.compl_ring[tx->dqo_compl.head];
                u16 type;

                if (compl_desc->generation == tx->dqo_compl.cur_gen_bit)
                        break;

                /* Prefetch the next descriptor. */
                prefetch(&tx->dqo.compl_ring[(tx->dqo_compl.head + 1) &
                                tx->dqo.complq_mask]);

                /* Do not read data until we own the descriptor */
                dma_rmb();
                type = compl_desc->type;

                if (type == GVE_COMPL_TYPE_DQO_DESC) {
                        /* This is the last descriptor fetched by HW plus one */
                        u16 tx_head = le16_to_cpu(compl_desc->tx_head);

                        atomic_set_release(&tx->dqo_compl.hw_tx_head, tx_head);
                } else if (type == GVE_COMPL_TYPE_DQO_PKT) {
                        u16 compl_tag = le16_to_cpu(compl_desc->completion_tag);

                        if (compl_tag & GVE_ALT_MISS_COMPL_BIT) {
                                compl_tag &= ~GVE_ALT_MISS_COMPL_BIT;
                                gve_handle_miss_completion(priv, tx, compl_tag,
                                                           &miss_compl_bytes,
                                                           &miss_compl_pkts);
                        } else {
                                gve_handle_packet_completion(priv, tx, !!napi,
                                                             compl_tag,
                                                             &pkt_compl_bytes,
                                                             &pkt_compl_pkts,
                                                             false);
                        }
                } else if (type == GVE_COMPL_TYPE_DQO_MISS) {
                        u16 compl_tag = le16_to_cpu(compl_desc->completion_tag);

                        gve_handle_miss_completion(priv, tx, compl_tag,
                                                   &miss_compl_bytes,
                                                   &miss_compl_pkts);
                } else if (type == GVE_COMPL_TYPE_DQO_REINJECTION) {
                        u16 compl_tag = le16_to_cpu(compl_desc->completion_tag);

                        gve_handle_packet_completion(priv, tx, !!napi,
                                                     compl_tag,
                                                     &reinject_compl_bytes,
                                                     &reinject_compl_pkts,
                                                     true);
                }

                tx->dqo_compl.head =
                        (tx->dqo_compl.head + 1) & tx->dqo.complq_mask;
                /* Flip the generation bit when we wrap around */
                tx->dqo_compl.cur_gen_bit ^= tx->dqo_compl.head == 0;
                num_descs_cleaned++;
        }

        netdev_tx_completed_queue(tx->netdev_txq,
                                  pkt_compl_pkts + miss_compl_pkts,
                                  pkt_compl_bytes + miss_compl_bytes);

        remove_miss_completions(priv, tx);
        remove_timed_out_completions(priv, tx);

        u64_stats_update_begin(&tx->statss);
        tx->bytes_done += pkt_compl_bytes + reinject_compl_bytes;
        tx->pkt_done += pkt_compl_pkts + reinject_compl_pkts;
        u64_stats_update_end(&tx->statss);
        return num_descs_cleaned;
}

bool gve_tx_poll_dqo(struct gve_notify_block *block, bool do_clean)
{
        struct gve_tx_compl_desc *compl_desc;
        struct gve_tx_ring *tx = block->tx;
        struct gve_priv *priv = block->priv;

        if (do_clean) {
                int num_descs_cleaned = gve_clean_tx_done_dqo(priv, tx,
                                                              &block->napi);

                /* Sync with queue being stopped in `gve_maybe_stop_tx_dqo()` */
                mb();

                if (netif_tx_queue_stopped(tx->netdev_txq) &&
                    num_descs_cleaned > 0) {
                        tx->wake_queue++;
                        netif_tx_wake_queue(tx->netdev_txq);
                }
        }

        /* Return true if we still have work. */
        compl_desc = &tx->dqo.compl_ring[tx->dqo_compl.head];
        return compl_desc->generation != tx->dqo_compl.cur_gen_bit;
}