1 // SPDX-License-Identifier: (GPL-2.0 OR MIT) 2 /* Google virtual Ethernet (gve) driver 3 * 4 * Copyright (C) 2015-2021 Google, Inc. 5 */ 6 7 #include "gve.h" 8 #include "gve_adminq.h" 9 #include "gve_utils.h" 10 #include "gve_dqo.h" 11 #include <linux/tcp.h> 12 #include <linux/slab.h> 13 #include <linux/skbuff.h> 14 15 /* Returns true if a gve_tx_pending_packet_dqo object is available. */ 16 static bool gve_has_pending_packet(struct gve_tx_ring *tx) 17 { 18 /* Check TX path's list. */ 19 if (tx->dqo_tx.free_pending_packets != -1) 20 return true; 21 22 /* Check completion handler's list. */ 23 if (atomic_read_acquire(&tx->dqo_compl.free_pending_packets) != -1) 24 return true; 25 26 return false; 27 } 28 29 static struct gve_tx_pending_packet_dqo * 30 gve_alloc_pending_packet(struct gve_tx_ring *tx) 31 { 32 struct gve_tx_pending_packet_dqo *pending_packet; 33 s16 index; 34 35 index = tx->dqo_tx.free_pending_packets; 36 37 /* No pending_packets available, try to steal the list from the 38 * completion handler. 39 */ 40 if (unlikely(index == -1)) { 41 tx->dqo_tx.free_pending_packets = 42 atomic_xchg(&tx->dqo_compl.free_pending_packets, -1); 43 index = tx->dqo_tx.free_pending_packets; 44 45 if (unlikely(index == -1)) 46 return NULL; 47 } 48 49 pending_packet = &tx->dqo.pending_packets[index]; 50 51 /* Remove pending_packet from free list */ 52 tx->dqo_tx.free_pending_packets = pending_packet->next; 53 pending_packet->state = GVE_PACKET_STATE_PENDING_DATA_COMPL; 54 55 return pending_packet; 56 } 57 58 static void 59 gve_free_pending_packet(struct gve_tx_ring *tx, 60 struct gve_tx_pending_packet_dqo *pending_packet) 61 { 62 s16 index = pending_packet - tx->dqo.pending_packets; 63 64 pending_packet->state = GVE_PACKET_STATE_UNALLOCATED; 65 while (true) { 66 s16 old_head = atomic_read_acquire(&tx->dqo_compl.free_pending_packets); 67 68 pending_packet->next = old_head; 69 if (atomic_cmpxchg(&tx->dqo_compl.free_pending_packets, 70 old_head, index) == old_head) { 71 break; 72 } 73 } 74 } 75 76 /* gve_tx_free_desc - Cleans up all pending tx requests and buffers. 77 */ 78 static void gve_tx_clean_pending_packets(struct gve_tx_ring *tx) 79 { 80 int i; 81 82 for (i = 0; i < tx->dqo.num_pending_packets; i++) { 83 struct gve_tx_pending_packet_dqo *cur_state = 84 &tx->dqo.pending_packets[i]; 85 int j; 86 87 for (j = 0; j < cur_state->num_bufs; j++) { 88 if (j == 0) { 89 dma_unmap_single(tx->dev, 90 dma_unmap_addr(cur_state, dma[j]), 91 dma_unmap_len(cur_state, len[j]), 92 DMA_TO_DEVICE); 93 } else { 94 dma_unmap_page(tx->dev, 95 dma_unmap_addr(cur_state, dma[j]), 96 dma_unmap_len(cur_state, len[j]), 97 DMA_TO_DEVICE); 98 } 99 } 100 if (cur_state->skb) { 101 dev_consume_skb_any(cur_state->skb); 102 cur_state->skb = NULL; 103 } 104 } 105 } 106 107 static void gve_tx_free_ring_dqo(struct gve_priv *priv, int idx) 108 { 109 struct gve_tx_ring *tx = &priv->tx[idx]; 110 struct device *hdev = &priv->pdev->dev; 111 size_t bytes; 112 113 gve_tx_remove_from_block(priv, idx); 114 115 if (tx->q_resources) { 116 dma_free_coherent(hdev, sizeof(*tx->q_resources), 117 tx->q_resources, tx->q_resources_bus); 118 tx->q_resources = NULL; 119 } 120 121 if (tx->dqo.compl_ring) { 122 bytes = sizeof(tx->dqo.compl_ring[0]) * 123 (tx->dqo.complq_mask + 1); 124 dma_free_coherent(hdev, bytes, tx->dqo.compl_ring, 125 tx->complq_bus_dqo); 126 tx->dqo.compl_ring = NULL; 127 } 128 129 if (tx->dqo.tx_ring) { 130 bytes = sizeof(tx->dqo.tx_ring[0]) * (tx->mask + 1); 131 dma_free_coherent(hdev, bytes, tx->dqo.tx_ring, tx->bus); 132 tx->dqo.tx_ring = NULL; 133 } 134 135 kvfree(tx->dqo.pending_packets); 136 tx->dqo.pending_packets = NULL; 137 138 netif_dbg(priv, drv, priv->dev, "freed tx queue %d\n", idx); 139 } 140 141 static int gve_tx_alloc_ring_dqo(struct gve_priv *priv, int idx) 142 { 143 struct gve_tx_ring *tx = &priv->tx[idx]; 144 struct device *hdev = &priv->pdev->dev; 145 int num_pending_packets; 146 size_t bytes; 147 int i; 148 149 memset(tx, 0, sizeof(*tx)); 150 tx->q_num = idx; 151 tx->dev = &priv->pdev->dev; 152 tx->netdev_txq = netdev_get_tx_queue(priv->dev, idx); 153 atomic_set_release(&tx->dqo_compl.hw_tx_head, 0); 154 155 /* Queue sizes must be a power of 2 */ 156 tx->mask = priv->tx_desc_cnt - 1; 157 tx->dqo.complq_mask = priv->options_dqo_rda.tx_comp_ring_entries - 1; 158 159 /* The max number of pending packets determines the maximum number of 160 * descriptors which maybe written to the completion queue. 161 * 162 * We must set the number small enough to make sure we never overrun the 163 * completion queue. 164 */ 165 num_pending_packets = tx->dqo.complq_mask + 1; 166 167 /* Reserve space for descriptor completions, which will be reported at 168 * most every GVE_TX_MIN_RE_INTERVAL packets. 169 */ 170 num_pending_packets -= 171 (tx->dqo.complq_mask + 1) / GVE_TX_MIN_RE_INTERVAL; 172 173 /* Each packet may have at most 2 buffer completions if it receives both 174 * a miss and reinjection completion. 175 */ 176 num_pending_packets /= 2; 177 178 tx->dqo.num_pending_packets = min_t(int, num_pending_packets, S16_MAX); 179 tx->dqo.pending_packets = kvcalloc(tx->dqo.num_pending_packets, 180 sizeof(tx->dqo.pending_packets[0]), 181 GFP_KERNEL); 182 if (!tx->dqo.pending_packets) 183 goto err; 184 185 /* Set up linked list of pending packets */ 186 for (i = 0; i < tx->dqo.num_pending_packets - 1; i++) 187 tx->dqo.pending_packets[i].next = i + 1; 188 189 tx->dqo.pending_packets[tx->dqo.num_pending_packets - 1].next = -1; 190 atomic_set_release(&tx->dqo_compl.free_pending_packets, -1); 191 tx->dqo_compl.miss_completions.head = -1; 192 tx->dqo_compl.miss_completions.tail = -1; 193 tx->dqo_compl.timed_out_completions.head = -1; 194 tx->dqo_compl.timed_out_completions.tail = -1; 195 196 bytes = sizeof(tx->dqo.tx_ring[0]) * (tx->mask + 1); 197 tx->dqo.tx_ring = dma_alloc_coherent(hdev, bytes, &tx->bus, GFP_KERNEL); 198 if (!tx->dqo.tx_ring) 199 goto err; 200 201 bytes = sizeof(tx->dqo.compl_ring[0]) * (tx->dqo.complq_mask + 1); 202 tx->dqo.compl_ring = dma_alloc_coherent(hdev, bytes, 203 &tx->complq_bus_dqo, 204 GFP_KERNEL); 205 if (!tx->dqo.compl_ring) 206 goto err; 207 208 tx->q_resources = dma_alloc_coherent(hdev, sizeof(*tx->q_resources), 209 &tx->q_resources_bus, GFP_KERNEL); 210 if (!tx->q_resources) 211 goto err; 212 213 gve_tx_add_to_block(priv, idx); 214 215 return 0; 216 217 err: 218 gve_tx_free_ring_dqo(priv, idx); 219 return -ENOMEM; 220 } 221 222 int gve_tx_alloc_rings_dqo(struct gve_priv *priv) 223 { 224 int err = 0; 225 int i; 226 227 for (i = 0; i < priv->tx_cfg.num_queues; i++) { 228 err = gve_tx_alloc_ring_dqo(priv, i); 229 if (err) { 230 netif_err(priv, drv, priv->dev, 231 "Failed to alloc tx ring=%d: err=%d\n", 232 i, err); 233 goto err; 234 } 235 } 236 237 return 0; 238 239 err: 240 for (i--; i >= 0; i--) 241 gve_tx_free_ring_dqo(priv, i); 242 243 return err; 244 } 245 246 void gve_tx_free_rings_dqo(struct gve_priv *priv) 247 { 248 int i; 249 250 for (i = 0; i < priv->tx_cfg.num_queues; i++) { 251 struct gve_tx_ring *tx = &priv->tx[i]; 252 253 gve_clean_tx_done_dqo(priv, tx, /*napi=*/NULL); 254 netdev_tx_reset_queue(tx->netdev_txq); 255 gve_tx_clean_pending_packets(tx); 256 257 gve_tx_free_ring_dqo(priv, i); 258 } 259 } 260 261 /* Returns the number of slots available in the ring */ 262 static u32 num_avail_tx_slots(const struct gve_tx_ring *tx) 263 { 264 u32 num_used = (tx->dqo_tx.tail - tx->dqo_tx.head) & tx->mask; 265 266 return tx->mask - num_used; 267 } 268 269 /* Stops the queue if available descriptors is less than 'count'. 270 * Return: 0 if stop is not required. 271 */ 272 static int gve_maybe_stop_tx_dqo(struct gve_tx_ring *tx, int count) 273 { 274 if (likely(gve_has_pending_packet(tx) && 275 num_avail_tx_slots(tx) >= count)) 276 return 0; 277 278 /* Update cached TX head pointer */ 279 tx->dqo_tx.head = atomic_read_acquire(&tx->dqo_compl.hw_tx_head); 280 281 if (likely(gve_has_pending_packet(tx) && 282 num_avail_tx_slots(tx) >= count)) 283 return 0; 284 285 /* No space, so stop the queue */ 286 tx->stop_queue++; 287 netif_tx_stop_queue(tx->netdev_txq); 288 289 /* Sync with restarting queue in `gve_tx_poll_dqo()` */ 290 mb(); 291 292 /* After stopping queue, check if we can transmit again in order to 293 * avoid TOCTOU bug. 294 */ 295 tx->dqo_tx.head = atomic_read_acquire(&tx->dqo_compl.hw_tx_head); 296 297 if (likely(!gve_has_pending_packet(tx) || 298 num_avail_tx_slots(tx) < count)) 299 return -EBUSY; 300 301 netif_tx_start_queue(tx->netdev_txq); 302 tx->wake_queue++; 303 return 0; 304 } 305 306 static void gve_extract_tx_metadata_dqo(const struct sk_buff *skb, 307 struct gve_tx_metadata_dqo *metadata) 308 { 309 memset(metadata, 0, sizeof(*metadata)); 310 metadata->version = GVE_TX_METADATA_VERSION_DQO; 311 312 if (skb->l4_hash) { 313 u16 path_hash = skb->hash ^ (skb->hash >> 16); 314 315 path_hash &= (1 << 15) - 1; 316 if (unlikely(path_hash == 0)) 317 path_hash = ~path_hash; 318 319 metadata->path_hash = path_hash; 320 } 321 } 322 323 static void gve_tx_fill_pkt_desc_dqo(struct gve_tx_ring *tx, u32 *desc_idx, 324 struct sk_buff *skb, u32 len, u64 addr, 325 s16 compl_tag, bool eop, bool is_gso) 326 { 327 const bool checksum_offload_en = skb->ip_summed == CHECKSUM_PARTIAL; 328 329 while (len > 0) { 330 struct gve_tx_pkt_desc_dqo *desc = 331 &tx->dqo.tx_ring[*desc_idx].pkt; 332 u32 cur_len = min_t(u32, len, GVE_TX_MAX_BUF_SIZE_DQO); 333 bool cur_eop = eop && cur_len == len; 334 335 *desc = (struct gve_tx_pkt_desc_dqo){ 336 .buf_addr = cpu_to_le64(addr), 337 .dtype = GVE_TX_PKT_DESC_DTYPE_DQO, 338 .end_of_packet = cur_eop, 339 .checksum_offload_enable = checksum_offload_en, 340 .compl_tag = cpu_to_le16(compl_tag), 341 .buf_size = cur_len, 342 }; 343 344 addr += cur_len; 345 len -= cur_len; 346 *desc_idx = (*desc_idx + 1) & tx->mask; 347 } 348 } 349 350 /* Validates and prepares `skb` for TSO. 351 * 352 * Returns header length, or < 0 if invalid. 353 */ 354 static int gve_prep_tso(struct sk_buff *skb) 355 { 356 struct tcphdr *tcp; 357 int header_len; 358 u32 paylen; 359 int err; 360 361 /* Note: HW requires MSS (gso_size) to be <= 9728 and the total length 362 * of the TSO to be <= 262143. 363 * 364 * However, we don't validate these because: 365 * - Hypervisor enforces a limit of 9K MTU 366 * - Kernel will not produce a TSO larger than 64k 367 */ 368 369 if (unlikely(skb_shinfo(skb)->gso_size < GVE_TX_MIN_TSO_MSS_DQO)) 370 return -1; 371 372 /* Needed because we will modify header. */ 373 err = skb_cow_head(skb, 0); 374 if (err < 0) 375 return err; 376 377 tcp = tcp_hdr(skb); 378 379 /* Remove payload length from checksum. */ 380 paylen = skb->len - skb_transport_offset(skb); 381 382 switch (skb_shinfo(skb)->gso_type) { 383 case SKB_GSO_TCPV4: 384 case SKB_GSO_TCPV6: 385 csum_replace_by_diff(&tcp->check, 386 (__force __wsum)htonl(paylen)); 387 388 /* Compute length of segmentation header. */ 389 header_len = skb_transport_offset(skb) + tcp_hdrlen(skb); 390 break; 391 default: 392 return -EINVAL; 393 } 394 395 if (unlikely(header_len > GVE_TX_MAX_HDR_SIZE_DQO)) 396 return -EINVAL; 397 398 return header_len; 399 } 400 401 static void gve_tx_fill_tso_ctx_desc(struct gve_tx_tso_context_desc_dqo *desc, 402 const struct sk_buff *skb, 403 const struct gve_tx_metadata_dqo *metadata, 404 int header_len) 405 { 406 *desc = (struct gve_tx_tso_context_desc_dqo){ 407 .header_len = header_len, 408 .cmd_dtype = { 409 .dtype = GVE_TX_TSO_CTX_DESC_DTYPE_DQO, 410 .tso = 1, 411 }, 412 .flex0 = metadata->bytes[0], 413 .flex5 = metadata->bytes[5], 414 .flex6 = metadata->bytes[6], 415 .flex7 = metadata->bytes[7], 416 .flex8 = metadata->bytes[8], 417 .flex9 = metadata->bytes[9], 418 .flex10 = metadata->bytes[10], 419 .flex11 = metadata->bytes[11], 420 }; 421 desc->tso_total_len = skb->len - header_len; 422 desc->mss = skb_shinfo(skb)->gso_size; 423 } 424 425 static void 426 gve_tx_fill_general_ctx_desc(struct gve_tx_general_context_desc_dqo *desc, 427 const struct gve_tx_metadata_dqo *metadata) 428 { 429 *desc = (struct gve_tx_general_context_desc_dqo){ 430 .flex0 = metadata->bytes[0], 431 .flex1 = metadata->bytes[1], 432 .flex2 = metadata->bytes[2], 433 .flex3 = metadata->bytes[3], 434 .flex4 = metadata->bytes[4], 435 .flex5 = metadata->bytes[5], 436 .flex6 = metadata->bytes[6], 437 .flex7 = metadata->bytes[7], 438 .flex8 = metadata->bytes[8], 439 .flex9 = metadata->bytes[9], 440 .flex10 = metadata->bytes[10], 441 .flex11 = metadata->bytes[11], 442 .cmd_dtype = {.dtype = GVE_TX_GENERAL_CTX_DESC_DTYPE_DQO}, 443 }; 444 } 445 446 /* Returns 0 on success, or < 0 on error. 447 * 448 * Before this function is called, the caller must ensure 449 * gve_has_pending_packet(tx) returns true. 450 */ 451 static int gve_tx_add_skb_no_copy_dqo(struct gve_tx_ring *tx, 452 struct sk_buff *skb) 453 { 454 const struct skb_shared_info *shinfo = skb_shinfo(skb); 455 const bool is_gso = skb_is_gso(skb); 456 u32 desc_idx = tx->dqo_tx.tail; 457 458 struct gve_tx_pending_packet_dqo *pkt; 459 struct gve_tx_metadata_dqo metadata; 460 s16 completion_tag; 461 int i; 462 463 pkt = gve_alloc_pending_packet(tx); 464 pkt->skb = skb; 465 pkt->num_bufs = 0; 466 completion_tag = pkt - tx->dqo.pending_packets; 467 468 gve_extract_tx_metadata_dqo(skb, &metadata); 469 if (is_gso) { 470 int header_len = gve_prep_tso(skb); 471 472 if (unlikely(header_len < 0)) 473 goto err; 474 475 gve_tx_fill_tso_ctx_desc(&tx->dqo.tx_ring[desc_idx].tso_ctx, 476 skb, &metadata, header_len); 477 desc_idx = (desc_idx + 1) & tx->mask; 478 } 479 480 gve_tx_fill_general_ctx_desc(&tx->dqo.tx_ring[desc_idx].general_ctx, 481 &metadata); 482 desc_idx = (desc_idx + 1) & tx->mask; 483 484 /* Note: HW requires that the size of a non-TSO packet be within the 485 * range of [17, 9728]. 486 * 487 * We don't double check because 488 * - We limited `netdev->min_mtu` to ETH_MIN_MTU. 489 * - Hypervisor won't allow MTU larger than 9216. 490 */ 491 492 /* Map the linear portion of skb */ 493 { 494 u32 len = skb_headlen(skb); 495 dma_addr_t addr; 496 497 addr = dma_map_single(tx->dev, skb->data, len, DMA_TO_DEVICE); 498 if (unlikely(dma_mapping_error(tx->dev, addr))) 499 goto err; 500 501 dma_unmap_len_set(pkt, len[pkt->num_bufs], len); 502 dma_unmap_addr_set(pkt, dma[pkt->num_bufs], addr); 503 ++pkt->num_bufs; 504 505 gve_tx_fill_pkt_desc_dqo(tx, &desc_idx, skb, len, addr, 506 completion_tag, 507 /*eop=*/shinfo->nr_frags == 0, is_gso); 508 } 509 510 for (i = 0; i < shinfo->nr_frags; i++) { 511 const skb_frag_t *frag = &shinfo->frags[i]; 512 bool is_eop = i == (shinfo->nr_frags - 1); 513 u32 len = skb_frag_size(frag); 514 dma_addr_t addr; 515 516 addr = skb_frag_dma_map(tx->dev, frag, 0, len, DMA_TO_DEVICE); 517 if (unlikely(dma_mapping_error(tx->dev, addr))) 518 goto err; 519 520 dma_unmap_len_set(pkt, len[pkt->num_bufs], len); 521 dma_unmap_addr_set(pkt, dma[pkt->num_bufs], addr); 522 ++pkt->num_bufs; 523 524 gve_tx_fill_pkt_desc_dqo(tx, &desc_idx, skb, len, addr, 525 completion_tag, is_eop, is_gso); 526 } 527 528 /* Commit the changes to our state */ 529 tx->dqo_tx.tail = desc_idx; 530 531 /* Request a descriptor completion on the last descriptor of the 532 * packet if we are allowed to by the HW enforced interval. 533 */ 534 { 535 u32 last_desc_idx = (desc_idx - 1) & tx->mask; 536 u32 last_report_event_interval = 537 (last_desc_idx - tx->dqo_tx.last_re_idx) & tx->mask; 538 539 if (unlikely(last_report_event_interval >= 540 GVE_TX_MIN_RE_INTERVAL)) { 541 tx->dqo.tx_ring[last_desc_idx].pkt.report_event = true; 542 tx->dqo_tx.last_re_idx = last_desc_idx; 543 } 544 } 545 546 return 0; 547 548 err: 549 for (i = 0; i < pkt->num_bufs; i++) { 550 if (i == 0) { 551 dma_unmap_single(tx->dev, 552 dma_unmap_addr(pkt, dma[i]), 553 dma_unmap_len(pkt, len[i]), 554 DMA_TO_DEVICE); 555 } else { 556 dma_unmap_page(tx->dev, 557 dma_unmap_addr(pkt, dma[i]), 558 dma_unmap_len(pkt, len[i]), 559 DMA_TO_DEVICE); 560 } 561 } 562 563 pkt->skb = NULL; 564 pkt->num_bufs = 0; 565 gve_free_pending_packet(tx, pkt); 566 567 return -1; 568 } 569 570 static int gve_num_descs_per_buf(size_t size) 571 { 572 return DIV_ROUND_UP(size, GVE_TX_MAX_BUF_SIZE_DQO); 573 } 574 575 static int gve_num_buffer_descs_needed(const struct sk_buff *skb) 576 { 577 const struct skb_shared_info *shinfo = skb_shinfo(skb); 578 int num_descs; 579 int i; 580 581 num_descs = gve_num_descs_per_buf(skb_headlen(skb)); 582 583 for (i = 0; i < shinfo->nr_frags; i++) { 584 unsigned int frag_size = skb_frag_size(&shinfo->frags[i]); 585 586 num_descs += gve_num_descs_per_buf(frag_size); 587 } 588 589 return num_descs; 590 } 591 592 /* Returns true if HW is capable of sending TSO represented by `skb`. 593 * 594 * Each segment must not span more than GVE_TX_MAX_DATA_DESCS buffers. 595 * - The header is counted as one buffer for every single segment. 596 * - A buffer which is split between two segments is counted for both. 597 * - If a buffer contains both header and payload, it is counted as two buffers. 598 */ 599 static bool gve_can_send_tso(const struct sk_buff *skb) 600 { 601 const int header_len = skb_checksum_start_offset(skb) + tcp_hdrlen(skb); 602 const int max_bufs_per_seg = GVE_TX_MAX_DATA_DESCS - 1; 603 const struct skb_shared_info *shinfo = skb_shinfo(skb); 604 const int gso_size = shinfo->gso_size; 605 int cur_seg_num_bufs; 606 int cur_seg_size; 607 int i; 608 609 cur_seg_size = skb_headlen(skb) - header_len; 610 cur_seg_num_bufs = cur_seg_size > 0; 611 612 for (i = 0; i < shinfo->nr_frags; i++) { 613 if (cur_seg_size >= gso_size) { 614 cur_seg_size %= gso_size; 615 cur_seg_num_bufs = cur_seg_size > 0; 616 } 617 618 if (unlikely(++cur_seg_num_bufs > max_bufs_per_seg)) 619 return false; 620 621 cur_seg_size += skb_frag_size(&shinfo->frags[i]); 622 } 623 624 return true; 625 } 626 627 /* Attempt to transmit specified SKB. 628 * 629 * Returns 0 if the SKB was transmitted or dropped. 630 * Returns -1 if there is not currently enough space to transmit the SKB. 631 */ 632 static int gve_try_tx_skb(struct gve_priv *priv, struct gve_tx_ring *tx, 633 struct sk_buff *skb) 634 { 635 int num_buffer_descs; 636 int total_num_descs; 637 638 if (skb_is_gso(skb)) { 639 /* If TSO doesn't meet HW requirements, attempt to linearize the 640 * packet. 641 */ 642 if (unlikely(!gve_can_send_tso(skb) && 643 skb_linearize(skb) < 0)) { 644 net_err_ratelimited("%s: Failed to transmit TSO packet\n", 645 priv->dev->name); 646 goto drop; 647 } 648 649 num_buffer_descs = gve_num_buffer_descs_needed(skb); 650 } else { 651 num_buffer_descs = gve_num_buffer_descs_needed(skb); 652 653 if (unlikely(num_buffer_descs > GVE_TX_MAX_DATA_DESCS)) { 654 if (unlikely(skb_linearize(skb) < 0)) 655 goto drop; 656 657 num_buffer_descs = 1; 658 } 659 } 660 661 /* Metadata + (optional TSO) + data descriptors. */ 662 total_num_descs = 1 + skb_is_gso(skb) + num_buffer_descs; 663 if (unlikely(gve_maybe_stop_tx_dqo(tx, total_num_descs + 664 GVE_TX_MIN_DESC_PREVENT_CACHE_OVERLAP))) { 665 return -1; 666 } 667 668 if (unlikely(gve_tx_add_skb_no_copy_dqo(tx, skb) < 0)) 669 goto drop; 670 671 netdev_tx_sent_queue(tx->netdev_txq, skb->len); 672 skb_tx_timestamp(skb); 673 return 0; 674 675 drop: 676 tx->dropped_pkt++; 677 dev_kfree_skb_any(skb); 678 return 0; 679 } 680 681 /* Transmit a given skb and ring the doorbell. */ 682 netdev_tx_t gve_tx_dqo(struct sk_buff *skb, struct net_device *dev) 683 { 684 struct gve_priv *priv = netdev_priv(dev); 685 struct gve_tx_ring *tx; 686 687 tx = &priv->tx[skb_get_queue_mapping(skb)]; 688 if (unlikely(gve_try_tx_skb(priv, tx, skb) < 0)) { 689 /* We need to ring the txq doorbell -- we have stopped the Tx 690 * queue for want of resources, but prior calls to gve_tx() 691 * may have added descriptors without ringing the doorbell. 692 */ 693 gve_tx_put_doorbell_dqo(priv, tx->q_resources, tx->dqo_tx.tail); 694 return NETDEV_TX_BUSY; 695 } 696 697 if (!netif_xmit_stopped(tx->netdev_txq) && netdev_xmit_more()) 698 return NETDEV_TX_OK; 699 700 gve_tx_put_doorbell_dqo(priv, tx->q_resources, tx->dqo_tx.tail); 701 return NETDEV_TX_OK; 702 } 703 704 static void add_to_list(struct gve_tx_ring *tx, struct gve_index_list *list, 705 struct gve_tx_pending_packet_dqo *pending_packet) 706 { 707 s16 old_tail, index; 708 709 index = pending_packet - tx->dqo.pending_packets; 710 old_tail = list->tail; 711 list->tail = index; 712 if (old_tail == -1) 713 list->head = index; 714 else 715 tx->dqo.pending_packets[old_tail].next = index; 716 717 pending_packet->next = -1; 718 pending_packet->prev = old_tail; 719 } 720 721 static void remove_from_list(struct gve_tx_ring *tx, 722 struct gve_index_list *list, 723 struct gve_tx_pending_packet_dqo *pkt) 724 { 725 s16 prev_index, next_index; 726 727 prev_index = pkt->prev; 728 next_index = pkt->next; 729 730 if (prev_index == -1) { 731 /* Node is head */ 732 list->head = next_index; 733 } else { 734 tx->dqo.pending_packets[prev_index].next = next_index; 735 } 736 if (next_index == -1) { 737 /* Node is tail */ 738 list->tail = prev_index; 739 } else { 740 tx->dqo.pending_packets[next_index].prev = prev_index; 741 } 742 } 743 744 static void gve_unmap_packet(struct device *dev, 745 struct gve_tx_pending_packet_dqo *pkt) 746 { 747 int i; 748 749 /* SKB linear portion is guaranteed to be mapped */ 750 dma_unmap_single(dev, dma_unmap_addr(pkt, dma[0]), 751 dma_unmap_len(pkt, len[0]), DMA_TO_DEVICE); 752 for (i = 1; i < pkt->num_bufs; i++) { 753 dma_unmap_page(dev, dma_unmap_addr(pkt, dma[i]), 754 dma_unmap_len(pkt, len[i]), DMA_TO_DEVICE); 755 } 756 pkt->num_bufs = 0; 757 } 758 759 /* Completion types and expected behavior: 760 * No Miss compl + Packet compl = Packet completed normally. 761 * Miss compl + Re-inject compl = Packet completed normally. 762 * No Miss compl + Re-inject compl = Skipped i.e. packet not completed. 763 * Miss compl + Packet compl = Skipped i.e. packet not completed. 764 */ 765 static void gve_handle_packet_completion(struct gve_priv *priv, 766 struct gve_tx_ring *tx, bool is_napi, 767 u16 compl_tag, u64 *bytes, u64 *pkts, 768 bool is_reinjection) 769 { 770 struct gve_tx_pending_packet_dqo *pending_packet; 771 772 if (unlikely(compl_tag >= tx->dqo.num_pending_packets)) { 773 net_err_ratelimited("%s: Invalid TX completion tag: %d\n", 774 priv->dev->name, (int)compl_tag); 775 return; 776 } 777 778 pending_packet = &tx->dqo.pending_packets[compl_tag]; 779 780 if (unlikely(is_reinjection)) { 781 if (unlikely(pending_packet->state == 782 GVE_PACKET_STATE_TIMED_OUT_COMPL)) { 783 net_err_ratelimited("%s: Re-injection completion: %d received after timeout.\n", 784 priv->dev->name, (int)compl_tag); 785 /* Packet was already completed as a result of timeout, 786 * so just remove from list and free pending packet. 787 */ 788 remove_from_list(tx, 789 &tx->dqo_compl.timed_out_completions, 790 pending_packet); 791 gve_free_pending_packet(tx, pending_packet); 792 return; 793 } 794 if (unlikely(pending_packet->state != 795 GVE_PACKET_STATE_PENDING_REINJECT_COMPL)) { 796 /* No outstanding miss completion but packet allocated 797 * implies packet receives a re-injection completion 798 * without a a prior miss completion. Return without 799 * completing the packet. 800 */ 801 net_err_ratelimited("%s: Re-injection completion received without corresponding miss completion: %d\n", 802 priv->dev->name, (int)compl_tag); 803 return; 804 } 805 remove_from_list(tx, &tx->dqo_compl.miss_completions, 806 pending_packet); 807 } else { 808 /* Packet is allocated but not a pending data completion. */ 809 if (unlikely(pending_packet->state != 810 GVE_PACKET_STATE_PENDING_DATA_COMPL)) { 811 net_err_ratelimited("%s: No pending data completion: %d\n", 812 priv->dev->name, (int)compl_tag); 813 return; 814 } 815 } 816 gve_unmap_packet(tx->dev, pending_packet); 817 818 *bytes += pending_packet->skb->len; 819 (*pkts)++; 820 napi_consume_skb(pending_packet->skb, is_napi); 821 pending_packet->skb = NULL; 822 gve_free_pending_packet(tx, pending_packet); 823 } 824 825 static void gve_handle_miss_completion(struct gve_priv *priv, 826 struct gve_tx_ring *tx, u16 compl_tag, 827 u64 *bytes, u64 *pkts) 828 { 829 struct gve_tx_pending_packet_dqo *pending_packet; 830 831 if (unlikely(compl_tag >= tx->dqo.num_pending_packets)) { 832 net_err_ratelimited("%s: Invalid TX completion tag: %d\n", 833 priv->dev->name, (int)compl_tag); 834 return; 835 } 836 837 pending_packet = &tx->dqo.pending_packets[compl_tag]; 838 if (unlikely(pending_packet->state != 839 GVE_PACKET_STATE_PENDING_DATA_COMPL)) { 840 net_err_ratelimited("%s: Unexpected packet state: %d for completion tag : %d\n", 841 priv->dev->name, (int)pending_packet->state, 842 (int)compl_tag); 843 return; 844 } 845 846 pending_packet->state = GVE_PACKET_STATE_PENDING_REINJECT_COMPL; 847 /* jiffies can wraparound but time comparisons can handle overflows. */ 848 pending_packet->timeout_jiffies = 849 jiffies + 850 msecs_to_jiffies(GVE_REINJECT_COMPL_TIMEOUT * 851 MSEC_PER_SEC); 852 add_to_list(tx, &tx->dqo_compl.miss_completions, pending_packet); 853 854 *bytes += pending_packet->skb->len; 855 (*pkts)++; 856 } 857 858 static void remove_miss_completions(struct gve_priv *priv, 859 struct gve_tx_ring *tx) 860 { 861 struct gve_tx_pending_packet_dqo *pending_packet; 862 s16 next_index; 863 864 next_index = tx->dqo_compl.miss_completions.head; 865 while (next_index != -1) { 866 pending_packet = &tx->dqo.pending_packets[next_index]; 867 next_index = pending_packet->next; 868 /* Break early because packets should timeout in order. */ 869 if (time_is_after_jiffies(pending_packet->timeout_jiffies)) 870 break; 871 872 remove_from_list(tx, &tx->dqo_compl.miss_completions, 873 pending_packet); 874 /* Unmap buffers and free skb but do not unallocate packet i.e. 875 * the completion tag is not freed to ensure that the driver 876 * can take appropriate action if a corresponding valid 877 * completion is received later. 878 */ 879 gve_unmap_packet(tx->dev, pending_packet); 880 /* This indicates the packet was dropped. */ 881 dev_kfree_skb_any(pending_packet->skb); 882 pending_packet->skb = NULL; 883 tx->dropped_pkt++; 884 net_err_ratelimited("%s: No reinjection completion was received for: %d.\n", 885 priv->dev->name, 886 (int)(pending_packet - tx->dqo.pending_packets)); 887 888 pending_packet->state = GVE_PACKET_STATE_TIMED_OUT_COMPL; 889 pending_packet->timeout_jiffies = 890 jiffies + 891 msecs_to_jiffies(GVE_DEALLOCATE_COMPL_TIMEOUT * 892 MSEC_PER_SEC); 893 /* Maintain pending packet in another list so the packet can be 894 * unallocated at a later time. 895 */ 896 add_to_list(tx, &tx->dqo_compl.timed_out_completions, 897 pending_packet); 898 } 899 } 900 901 static void remove_timed_out_completions(struct gve_priv *priv, 902 struct gve_tx_ring *tx) 903 { 904 struct gve_tx_pending_packet_dqo *pending_packet; 905 s16 next_index; 906 907 next_index = tx->dqo_compl.timed_out_completions.head; 908 while (next_index != -1) { 909 pending_packet = &tx->dqo.pending_packets[next_index]; 910 next_index = pending_packet->next; 911 /* Break early because packets should timeout in order. */ 912 if (time_is_after_jiffies(pending_packet->timeout_jiffies)) 913 break; 914 915 remove_from_list(tx, &tx->dqo_compl.timed_out_completions, 916 pending_packet); 917 gve_free_pending_packet(tx, pending_packet); 918 } 919 } 920 921 int gve_clean_tx_done_dqo(struct gve_priv *priv, struct gve_tx_ring *tx, 922 struct napi_struct *napi) 923 { 924 u64 reinject_compl_bytes = 0; 925 u64 reinject_compl_pkts = 0; 926 int num_descs_cleaned = 0; 927 u64 miss_compl_bytes = 0; 928 u64 miss_compl_pkts = 0; 929 u64 pkt_compl_bytes = 0; 930 u64 pkt_compl_pkts = 0; 931 932 /* Limit in order to avoid blocking for too long */ 933 while (!napi || pkt_compl_pkts < napi->weight) { 934 struct gve_tx_compl_desc *compl_desc = 935 &tx->dqo.compl_ring[tx->dqo_compl.head]; 936 u16 type; 937 938 if (compl_desc->generation == tx->dqo_compl.cur_gen_bit) 939 break; 940 941 /* Prefetch the next descriptor. */ 942 prefetch(&tx->dqo.compl_ring[(tx->dqo_compl.head + 1) & 943 tx->dqo.complq_mask]); 944 945 /* Do not read data until we own the descriptor */ 946 dma_rmb(); 947 type = compl_desc->type; 948 949 if (type == GVE_COMPL_TYPE_DQO_DESC) { 950 /* This is the last descriptor fetched by HW plus one */ 951 u16 tx_head = le16_to_cpu(compl_desc->tx_head); 952 953 atomic_set_release(&tx->dqo_compl.hw_tx_head, tx_head); 954 } else if (type == GVE_COMPL_TYPE_DQO_PKT) { 955 u16 compl_tag = le16_to_cpu(compl_desc->completion_tag); 956 957 gve_handle_packet_completion(priv, tx, !!napi, 958 compl_tag, 959 &pkt_compl_bytes, 960 &pkt_compl_pkts, 961 /*is_reinjection=*/false); 962 } else if (type == GVE_COMPL_TYPE_DQO_MISS) { 963 u16 compl_tag = le16_to_cpu(compl_desc->completion_tag); 964 965 gve_handle_miss_completion(priv, tx, compl_tag, 966 &miss_compl_bytes, 967 &miss_compl_pkts); 968 } else if (type == GVE_COMPL_TYPE_DQO_REINJECTION) { 969 u16 compl_tag = le16_to_cpu(compl_desc->completion_tag); 970 971 gve_handle_packet_completion(priv, tx, !!napi, 972 compl_tag, 973 &reinject_compl_bytes, 974 &reinject_compl_pkts, 975 /*is_reinjection=*/true); 976 } 977 978 tx->dqo_compl.head = 979 (tx->dqo_compl.head + 1) & tx->dqo.complq_mask; 980 /* Flip the generation bit when we wrap around */ 981 tx->dqo_compl.cur_gen_bit ^= tx->dqo_compl.head == 0; 982 num_descs_cleaned++; 983 } 984 985 netdev_tx_completed_queue(tx->netdev_txq, 986 pkt_compl_pkts + miss_compl_pkts, 987 pkt_compl_bytes + miss_compl_bytes); 988 989 remove_miss_completions(priv, tx); 990 remove_timed_out_completions(priv, tx); 991 992 u64_stats_update_begin(&tx->statss); 993 tx->bytes_done += pkt_compl_bytes + reinject_compl_bytes; 994 tx->pkt_done += pkt_compl_pkts + reinject_compl_pkts; 995 u64_stats_update_end(&tx->statss); 996 return num_descs_cleaned; 997 } 998 999 bool gve_tx_poll_dqo(struct gve_notify_block *block, bool do_clean) 1000 { 1001 struct gve_tx_compl_desc *compl_desc; 1002 struct gve_tx_ring *tx = block->tx; 1003 struct gve_priv *priv = block->priv; 1004 1005 if (do_clean) { 1006 int num_descs_cleaned = gve_clean_tx_done_dqo(priv, tx, 1007 &block->napi); 1008 1009 /* Sync with queue being stopped in `gve_maybe_stop_tx_dqo()` */ 1010 mb(); 1011 1012 if (netif_tx_queue_stopped(tx->netdev_txq) && 1013 num_descs_cleaned > 0) { 1014 tx->wake_queue++; 1015 netif_tx_wake_queue(tx->netdev_txq); 1016 } 1017 } 1018 1019 /* Return true if we still have work. */ 1020 compl_desc = &tx->dqo.compl_ring[tx->dqo_compl.head]; 1021 return compl_desc->generation != tx->dqo_compl.cur_gen_bit; 1022 } 1023