// SPDX-License-Identifier: (GPL-2.0 OR MIT)
/* Google virtual Ethernet (gve) driver
 *
 * Copyright (C) 2015-2021 Google, Inc.
 */

#include "gve.h"
#include "gve_adminq.h"
#include "gve_utils.h"
#include <linux/etherdevice.h>
#include <linux/filter.h>
#include <net/xdp.h>
#include <net/xdp_sock_drv.h>

static void gve_rx_free_buffer(struct device *dev,
			       struct gve_rx_slot_page_info *page_info,
			       union gve_rx_data_slot *data_slot)
{
	dma_addr_t dma = (dma_addr_t)(be64_to_cpu(data_slot->addr) &
				      GVE_DATA_SLOT_ADDR_PAGE_MASK);

	page_ref_sub(page_info->page, page_info->pagecnt_bias - 1);
	gve_free_page(dev, page_info->page, dma, DMA_FROM_DEVICE);
}

static void gve_rx_unfill_pages(struct gve_priv *priv, struct gve_rx_ring *rx)
{
	u32 slots = rx->mask + 1;
	int i;

	if (rx->data.raw_addressing) {
		for (i = 0; i < slots; i++)
			gve_rx_free_buffer(&priv->pdev->dev, &rx->data.page_info[i],
					   &rx->data.data_ring[i]);
	} else {
		for (i = 0; i < slots; i++)
			page_ref_sub(rx->data.page_info[i].page,
				     rx->data.page_info[i].pagecnt_bias - 1);
		gve_unassign_qpl(priv, rx->data.qpl->id);
		rx->data.qpl = NULL;

		for (i = 0; i < rx->qpl_copy_pool_mask + 1; i++) {
			page_ref_sub(rx->qpl_copy_pool[i].page,
				     rx->qpl_copy_pool[i].pagecnt_bias - 1);
			put_page(rx->qpl_copy_pool[i].page);
		}
	}
	kvfree(rx->data.page_info);
	rx->data.page_info = NULL;
}

static void gve_rx_free_ring(struct gve_priv *priv, int idx)
{
	struct gve_rx_ring *rx = &priv->rx[idx];
	struct device *dev = &priv->pdev->dev;
	u32 slots = rx->mask + 1;
	size_t bytes;

	gve_rx_remove_from_block(priv, idx);

	bytes = sizeof(struct gve_rx_desc) * priv->rx_desc_cnt;
	dma_free_coherent(dev, bytes, rx->desc.desc_ring, rx->desc.bus);
	rx->desc.desc_ring = NULL;

	dma_free_coherent(dev, sizeof(*rx->q_resources),
			  rx->q_resources, rx->q_resources_bus);
	rx->q_resources = NULL;

	gve_rx_unfill_pages(priv, rx);

	bytes = sizeof(*rx->data.data_ring) * slots;
	dma_free_coherent(dev, bytes, rx->data.data_ring,
			  rx->data.data_bus);
	rx->data.data_ring = NULL;

	kvfree(rx->qpl_copy_pool);
	rx->qpl_copy_pool = NULL;

	netif_dbg(priv, drv, priv->dev, "freed rx ring %d\n", idx);
}

static void gve_setup_rx_buffer(struct gve_rx_slot_page_info *page_info,
				dma_addr_t addr, struct page *page, __be64 *slot_addr)
{
	page_info->page = page;
	page_info->page_offset = 0;
	page_info->page_address = page_address(page);
	*slot_addr = cpu_to_be64(addr);
	/* The page already has 1 ref */
	page_ref_add(page, INT_MAX - 1);
	page_info->pagecnt_bias = INT_MAX;
}

static int gve_rx_alloc_buffer(struct gve_priv *priv, struct device *dev,
			       struct gve_rx_slot_page_info *page_info,
			       union gve_rx_data_slot *data_slot)
{
	struct page *page;
	dma_addr_t dma;
	int err;

	err = gve_alloc_page(priv, dev, &page, &dma, DMA_FROM_DEVICE,
			     GFP_ATOMIC);
	if (err)
		return err;

	gve_setup_rx_buffer(page_info, dma, page, &data_slot->addr);
	return 0;
}

static int gve_prefill_rx_pages(struct gve_rx_ring *rx)
{
	struct gve_priv *priv = rx->gve;
	u32 slots;
	int err;
	int i;
	int j;

	/* Allocate one page per Rx queue slot. Each page is split into two
	 * packet buffers, when possible we "page flip" between the two.
	 */
	slots = rx->mask + 1;

	rx->data.page_info = kvzalloc(slots *
				      sizeof(*rx->data.page_info), GFP_KERNEL);
	if (!rx->data.page_info)
		return -ENOMEM;

	if (!rx->data.raw_addressing) {
		rx->data.qpl = gve_assign_rx_qpl(priv, rx->q_num);
		if (!rx->data.qpl) {
			kvfree(rx->data.page_info);
			rx->data.page_info = NULL;
			return -ENOMEM;
		}
	}
	for (i = 0; i < slots; i++) {
		if (!rx->data.raw_addressing) {
			struct page *page = rx->data.qpl->pages[i];
			dma_addr_t addr = i * PAGE_SIZE;

			gve_setup_rx_buffer(&rx->data.page_info[i], addr, page,
					    &rx->data.data_ring[i].qpl_offset);
			continue;
		}
		err = gve_rx_alloc_buffer(priv, &priv->pdev->dev, &rx->data.page_info[i],
					  &rx->data.data_ring[i]);
		if (err)
			goto alloc_err_rda;
	}

	if (!rx->data.raw_addressing) {
		for (j = 0; j < rx->qpl_copy_pool_mask + 1; j++) {
			struct page *page = alloc_page(GFP_KERNEL);

			if (!page) {
				err = -ENOMEM;
				goto alloc_err_qpl;
			}

			rx->qpl_copy_pool[j].page = page;
			rx->qpl_copy_pool[j].page_offset = 0;
			rx->qpl_copy_pool[j].page_address = page_address(page);

			/* The page already has 1 ref. */
			page_ref_add(page, INT_MAX - 1);
			rx->qpl_copy_pool[j].pagecnt_bias = INT_MAX;
		}
	}

	return slots;

alloc_err_qpl:
	/* Fully free the copy pool pages. */
	while (j--) {
		page_ref_sub(rx->qpl_copy_pool[j].page,
			     rx->qpl_copy_pool[j].pagecnt_bias - 1);
		put_page(rx->qpl_copy_pool[j].page);
	}

	/* Do not fully free QPL pages - only remove the bias added in this
	 * function with gve_setup_rx_buffer.
	 */
	while (i--)
		page_ref_sub(rx->data.page_info[i].page,
			     rx->data.page_info[i].pagecnt_bias - 1);

	gve_unassign_qpl(priv, rx->data.qpl->id);
	rx->data.qpl = NULL;

	return err;

alloc_err_rda:
	while (i--)
		gve_rx_free_buffer(&priv->pdev->dev,
				   &rx->data.page_info[i],
				   &rx->data.data_ring[i]);
	return err;
}

static void gve_rx_ctx_clear(struct gve_rx_ctx *ctx)
{
	ctx->skb_head = NULL;
	ctx->skb_tail = NULL;
	ctx->total_size = 0;
	ctx->frag_cnt = 0;
	ctx->drop_pkt = false;
}

static int gve_rx_alloc_ring(struct gve_priv *priv, int idx)
{
	struct gve_rx_ring *rx = &priv->rx[idx];
	struct device *hdev = &priv->pdev->dev;
	u32 slots, npages;
	int filled_pages;
	size_t bytes;
	int err;

	netif_dbg(priv, drv, priv->dev, "allocating rx ring\n");
	/* Make sure everything is zeroed to start with */
	memset(rx, 0, sizeof(*rx));

	rx->gve = priv;
	rx->q_num = idx;

	slots = priv->rx_data_slot_cnt;
	rx->mask = slots - 1;
	rx->data.raw_addressing = priv->queue_format == GVE_GQI_RDA_FORMAT;

	/* alloc rx data ring */
	bytes = sizeof(*rx->data.data_ring) * slots;
	rx->data.data_ring = dma_alloc_coherent(hdev, bytes,
						&rx->data.data_bus,
						GFP_KERNEL);
	if (!rx->data.data_ring)
		return -ENOMEM;

	rx->qpl_copy_pool_mask = min_t(u32, U32_MAX, slots * 2) - 1;
	rx->qpl_copy_pool_head = 0;
	rx->qpl_copy_pool = kvcalloc(rx->qpl_copy_pool_mask + 1,
				     sizeof(rx->qpl_copy_pool[0]),
				     GFP_KERNEL);

	if (!rx->qpl_copy_pool) {
		err = -ENOMEM;
		goto abort_with_slots;
	}

	filled_pages = gve_prefill_rx_pages(rx);
	if (filled_pages < 0) {
		err = -ENOMEM;
		goto abort_with_copy_pool;
	}
	rx->fill_cnt = filled_pages;
	/* Ensure data ring slots (packet buffers) are visible. */
	dma_wmb();

	/* Alloc gve_queue_resources */
	rx->q_resources =
		dma_alloc_coherent(hdev,
				   sizeof(*rx->q_resources),
				   &rx->q_resources_bus,
				   GFP_KERNEL);
	if (!rx->q_resources) {
		err = -ENOMEM;
		goto abort_filled;
	}
	netif_dbg(priv, drv, priv->dev, "rx[%d]->data.data_bus=%lx\n", idx,
		  (unsigned long)rx->data.data_bus);

	/* alloc rx desc ring */
	bytes = sizeof(struct gve_rx_desc) * priv->rx_desc_cnt;
	npages = bytes / PAGE_SIZE;
	if (npages * PAGE_SIZE != bytes) {
		err = -EIO;
		goto abort_with_q_resources;
	}

	rx->desc.desc_ring = dma_alloc_coherent(hdev, bytes, &rx->desc.bus,
						GFP_KERNEL);
	if (!rx->desc.desc_ring) {
		err = -ENOMEM;
		goto abort_with_q_resources;
	}
	rx->cnt = 0;
	rx->db_threshold = priv->rx_desc_cnt / 2;
	rx->desc.seqno = 1;

	/* Allocating half-page buffers allows page-flipping which is faster
	 * than copying or allocating new pages.
	 */
	rx->packet_buffer_size = PAGE_SIZE / 2;
	gve_rx_ctx_clear(&rx->ctx);
	gve_rx_add_to_block(priv, idx);

	return 0;

abort_with_q_resources:
	dma_free_coherent(hdev, sizeof(*rx->q_resources),
			  rx->q_resources, rx->q_resources_bus);
	rx->q_resources = NULL;
abort_filled:
	gve_rx_unfill_pages(priv, rx);
abort_with_copy_pool:
	kvfree(rx->qpl_copy_pool);
	rx->qpl_copy_pool = NULL;
abort_with_slots:
	bytes = sizeof(*rx->data.data_ring) * slots;
	dma_free_coherent(hdev, bytes, rx->data.data_ring, rx->data.data_bus);
	rx->data.data_ring = NULL;

	return err;
}

int gve_rx_alloc_rings(struct gve_priv *priv)
{
	int err = 0;
	int i;

	for (i = 0; i < priv->rx_cfg.num_queues; i++) {
		err = gve_rx_alloc_ring(priv, i);
		if (err) {
			netif_err(priv, drv, priv->dev,
				  "Failed to alloc rx ring=%d: err=%d\n",
				  i, err);
			break;
		}
	}
	/* Unallocate if there was an error */
	if (err) {
		int j;

		for (j = 0; j < i; j++)
			gve_rx_free_ring(priv, j);
	}
	return err;
}

void gve_rx_free_rings_gqi(struct gve_priv *priv)
{
	int i;

	for (i = 0; i < priv->rx_cfg.num_queues; i++)
		gve_rx_free_ring(priv, i);
}

void gve_rx_write_doorbell(struct gve_priv *priv, struct gve_rx_ring *rx)
{
	u32 db_idx = be32_to_cpu(rx->q_resources->db_index);

	iowrite32be(rx->fill_cnt, &priv->db_bar2[db_idx]);
}

static enum pkt_hash_types gve_rss_type(__be16 pkt_flags)
{
	if (likely(pkt_flags & (GVE_RXF_TCP | GVE_RXF_UDP)))
		return PKT_HASH_TYPE_L4;
	if (pkt_flags & (GVE_RXF_IPV4 | GVE_RXF_IPV6))
		return PKT_HASH_TYPE_L3;
	return PKT_HASH_TYPE_L2;
}

static struct sk_buff *gve_rx_add_frags(struct napi_struct *napi,
					struct gve_rx_slot_page_info *page_info,
					unsigned int truesize, u16 len,
					struct gve_rx_ctx *ctx)
{
	u32 offset = page_info->page_offset + page_info->pad;
	struct sk_buff *skb = ctx->skb_tail;
	int num_frags = 0;

	if (!skb) {
		skb = napi_get_frags(napi);
		if (unlikely(!skb))
			return NULL;

		ctx->skb_head = skb;
		ctx->skb_tail = skb;
	} else {
		num_frags = skb_shinfo(ctx->skb_tail)->nr_frags;
		if (num_frags == MAX_SKB_FRAGS) {
			skb = napi_alloc_skb(napi, 0);
			if (!skb)
				return NULL;

			// We will never chain more than two SKBs: 2 * 16 * 2k > 64k
			// which is why we do not need to chain by using skb->next
			skb_shinfo(ctx->skb_tail)->frag_list = skb;

			ctx->skb_tail = skb;
			num_frags = 0;
		}
	}

	if (skb != ctx->skb_head) {
		ctx->skb_head->len += len;
		ctx->skb_head->data_len += len;
		ctx->skb_head->truesize += truesize;
	}
	skb_add_rx_frag(skb, num_frags, page_info->page,
			offset, len, truesize);

	return ctx->skb_head;
}

static void gve_rx_flip_buff(struct gve_rx_slot_page_info *page_info, __be64 *slot_addr)
{
	const __be64 offset = cpu_to_be64(PAGE_SIZE / 2);

	/* "flip" to other packet buffer on this page */
	page_info->page_offset ^= PAGE_SIZE / 2;
	*(slot_addr) ^= offset;
}

static int gve_rx_can_recycle_buffer(struct gve_rx_slot_page_info *page_info)
{
	int pagecount = page_count(page_info->page);

	/* This page is not being used by any SKBs - reuse */
	if (pagecount == page_info->pagecnt_bias)
		return 1;
	/* This page is still being used by an SKB - we can't reuse */
	else if (pagecount > page_info->pagecnt_bias)
		return 0;
	WARN(pagecount < page_info->pagecnt_bias,
	     "Pagecount should never be less than the bias.");
	return -1;
}

static struct sk_buff *
gve_rx_raw_addressing(struct device *dev, struct net_device *netdev,
		      struct gve_rx_slot_page_info *page_info, u16 len,
		      struct napi_struct *napi,
		      union gve_rx_data_slot *data_slot,
		      u16 packet_buffer_size, struct gve_rx_ctx *ctx)
{
	struct sk_buff *skb = gve_rx_add_frags(napi, page_info, packet_buffer_size, len, ctx);

	if (!skb)
		return NULL;

	/* Optimistically stop the kernel from freeing the page.
	 * We will check again in refill to determine if we need to alloc a
	 * new page.
	 */
	gve_dec_pagecnt_bias(page_info);

	return skb;
}

static struct sk_buff *gve_rx_copy_to_pool(struct gve_rx_ring *rx,
					   struct gve_rx_slot_page_info *page_info,
					   u16 len, struct napi_struct *napi)
{
	u32 pool_idx = rx->qpl_copy_pool_head & rx->qpl_copy_pool_mask;
	void *src = page_info->page_address + page_info->page_offset;
	struct gve_rx_slot_page_info *copy_page_info;
	struct gve_rx_ctx *ctx = &rx->ctx;
	bool alloc_page = false;
	struct sk_buff *skb;
	void *dst;

	copy_page_info = &rx->qpl_copy_pool[pool_idx];
	if (!copy_page_info->can_flip) {
		int recycle = gve_rx_can_recycle_buffer(copy_page_info);

		if (unlikely(recycle < 0)) {
			gve_schedule_reset(rx->gve);
			return NULL;
		}
		alloc_page = !recycle;
	}

	if (alloc_page) {
		struct gve_rx_slot_page_info alloc_page_info;
		struct page *page;

		/* The least recently used page turned out to be
		 * still in use by the kernel. Ignoring it and moving
		 * on alleviates head-of-line blocking.
		 */
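		/* The busy entry is not freed or replaced; it stays in the
		 * pool and is considered again once the head wraps around.
		 */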
		rx->qpl_copy_pool_head++;

		page = alloc_page(GFP_ATOMIC);
		if (!page)
			return NULL;

		alloc_page_info.page = page;
		alloc_page_info.page_offset = 0;
		alloc_page_info.page_address = page_address(page);
		alloc_page_info.pad = page_info->pad;

		memcpy(alloc_page_info.page_address, src, page_info->pad + len);
		skb = gve_rx_add_frags(napi, &alloc_page_info,
				       PAGE_SIZE,
				       len, ctx);

		u64_stats_update_begin(&rx->statss);
		rx->rx_frag_copy_cnt++;
		rx->rx_frag_alloc_cnt++;
		u64_stats_update_end(&rx->statss);

		return skb;
	}

	dst = copy_page_info->page_address + copy_page_info->page_offset;
	memcpy(dst, src, page_info->pad + len);
	copy_page_info->pad = page_info->pad;

	skb = gve_rx_add_frags(napi, copy_page_info,
			       rx->packet_buffer_size, len, ctx);
	if (unlikely(!skb))
		return NULL;

	gve_dec_pagecnt_bias(copy_page_info);
	copy_page_info->page_offset += rx->packet_buffer_size;
	copy_page_info->page_offset &= (PAGE_SIZE - 1);

	if (copy_page_info->can_flip) {
		/* We have used both halves of this copy page, it
		 * is time for it to go to the back of the queue.
		 */
		copy_page_info->can_flip = false;
		rx->qpl_copy_pool_head++;
		prefetch(rx->qpl_copy_pool[rx->qpl_copy_pool_head & rx->qpl_copy_pool_mask].page);
	} else {
		copy_page_info->can_flip = true;
	}

	u64_stats_update_begin(&rx->statss);
	rx->rx_frag_copy_cnt++;
	u64_stats_update_end(&rx->statss);

	return skb;
}

static struct sk_buff *
gve_rx_qpl(struct device *dev, struct net_device *netdev,
	   struct gve_rx_ring *rx, struct gve_rx_slot_page_info *page_info,
	   u16 len, struct napi_struct *napi,
	   union gve_rx_data_slot *data_slot)
{
	struct gve_rx_ctx *ctx = &rx->ctx;
	struct sk_buff *skb;

	/* if raw_addressing mode is not enabled gvnic can only receive into
	 * registered segments. If the buffer can't be recycled, our only
	 * choice is to copy the data out of it so that we can return it to the
	 * device.
	 */
	if (page_info->can_flip) {
		skb = gve_rx_add_frags(napi, page_info, rx->packet_buffer_size, len, ctx);
		/* No point in recycling if we didn't get the skb */
		if (skb) {
			/* Make sure that the page isn't freed. */
			gve_dec_pagecnt_bias(page_info);
			gve_rx_flip_buff(page_info, &data_slot->qpl_offset);
		}
	} else {
		skb = gve_rx_copy_to_pool(rx, page_info, len, napi);
	}
	return skb;
}

static struct sk_buff *gve_rx_skb(struct gve_priv *priv, struct gve_rx_ring *rx,
				  struct gve_rx_slot_page_info *page_info, struct napi_struct *napi,
				  u16 len, union gve_rx_data_slot *data_slot,
				  bool is_only_frag)
{
	struct net_device *netdev = priv->dev;
	struct gve_rx_ctx *ctx = &rx->ctx;
	struct sk_buff *skb = NULL;

	if (len <= priv->rx_copybreak && is_only_frag) {
		/* Just copy small packets */
		skb = gve_rx_copy(netdev, napi, page_info, len);
		if (skb) {
			u64_stats_update_begin(&rx->statss);
			rx->rx_copied_pkt++;
			rx->rx_frag_copy_cnt++;
			rx->rx_copybreak_pkt++;
			u64_stats_update_end(&rx->statss);
		}
	} else {
		int recycle = gve_rx_can_recycle_buffer(page_info);

		if (unlikely(recycle < 0)) {
			gve_schedule_reset(priv);
			return NULL;
		}
		page_info->can_flip = recycle;
		if (page_info->can_flip) {
			u64_stats_update_begin(&rx->statss);
			rx->rx_frag_flip_cnt++;
			u64_stats_update_end(&rx->statss);
		}

		if (rx->data.raw_addressing) {
			skb = gve_rx_raw_addressing(&priv->pdev->dev, netdev,
						    page_info, len, napi,
						    data_slot,
						    rx->packet_buffer_size, ctx);
		} else {
			skb = gve_rx_qpl(&priv->pdev->dev, netdev, rx,
					 page_info, len, napi, data_slot);
		}
	}
	return skb;
}

static int gve_xsk_pool_redirect(struct net_device *dev,
				 struct gve_rx_ring *rx,
				 void *data, int len,
				 struct bpf_prog *xdp_prog)
{
	struct xdp_buff *xdp;
	int err;

	if (rx->xsk_pool->frame_len < len)
		return -E2BIG;
	xdp = xsk_buff_alloc(rx->xsk_pool);
	if (!xdp) {
		u64_stats_update_begin(&rx->statss);
		rx->xdp_alloc_fails++;
		u64_stats_update_end(&rx->statss);
		return -ENOMEM;
	}
	xdp->data_end = xdp->data + len;
	memcpy(xdp->data, data, len);
	err = xdp_do_redirect(dev, xdp, xdp_prog);
	if (err)
		xsk_buff_free(xdp);
	return err;
}

static int gve_xdp_redirect(struct net_device *dev, struct gve_rx_ring *rx,
			    struct xdp_buff *orig, struct bpf_prog *xdp_prog)
{
	int total_len, len = orig->data_end - orig->data;
	int headroom = XDP_PACKET_HEADROOM;
	struct xdp_buff new;
	void *frame;
	int err;

	if (rx->xsk_pool)
		return gve_xsk_pool_redirect(dev, rx, orig->data,
					     len, xdp_prog);

	total_len = headroom + SKB_DATA_ALIGN(len) +
		SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
	frame = page_frag_alloc(&rx->page_cache, total_len, GFP_ATOMIC);
	if (!frame) {
		u64_stats_update_begin(&rx->statss);
		rx->xdp_alloc_fails++;
		u64_stats_update_end(&rx->statss);
		return -ENOMEM;
	}
	xdp_init_buff(&new, total_len, &rx->xdp_rxq);
	xdp_prepare_buff(&new, frame, headroom, len, false);
	memcpy(new.data, orig->data, len);

	err = xdp_do_redirect(dev, &new, xdp_prog);
	if (err)
		page_frag_free(frame);

	return err;
}

static void gve_xdp_done(struct gve_priv *priv, struct gve_rx_ring *rx,
			 struct xdp_buff *xdp, struct bpf_prog *xprog,
			 int xdp_act)
{
	struct gve_tx_ring *tx;
	int tx_qid;
	int err;

	switch (xdp_act) {
	case XDP_ABORTED:
	case XDP_DROP:
	default:
		break;
	case XDP_TX:
		tx_qid = gve_xdp_tx_queue_id(priv, rx->q_num);
		tx = &priv->tx[tx_qid];
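		/* This XDP TX queue may also be used by redirected frames
		 * arriving through ndo_xdp_xmit, so serialize the transmit
		 * with the queue's xdp_lock.
		 */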
		spin_lock(&tx->xdp_lock);
		err = gve_xdp_xmit_one(priv, tx, xdp->data,
				       xdp->data_end - xdp->data, NULL);
		spin_unlock(&tx->xdp_lock);

		if (unlikely(err)) {
			u64_stats_update_begin(&rx->statss);
			rx->xdp_tx_errors++;
			u64_stats_update_end(&rx->statss);
		}
		break;
	case XDP_REDIRECT:
		err = gve_xdp_redirect(priv->dev, rx, xdp, xprog);

		if (unlikely(err)) {
			u64_stats_update_begin(&rx->statss);
			rx->xdp_redirect_errors++;
			u64_stats_update_end(&rx->statss);
		}
		break;
	}
	u64_stats_update_begin(&rx->statss);
	if ((u32)xdp_act < GVE_XDP_ACTIONS)
		rx->xdp_actions[xdp_act]++;
	u64_stats_update_end(&rx->statss);
}

#define GVE_PKTCONT_BIT_IS_SET(x) (GVE_RXF_PKT_CONT & (x))
static void gve_rx(struct gve_rx_ring *rx, netdev_features_t feat,
		   struct gve_rx_desc *desc, u32 idx,
		   struct gve_rx_cnts *cnts)
{
	bool is_last_frag = !GVE_PKTCONT_BIT_IS_SET(desc->flags_seq);
	struct gve_rx_slot_page_info *page_info;
	u16 frag_size = be16_to_cpu(desc->len);
	struct gve_rx_ctx *ctx = &rx->ctx;
	union gve_rx_data_slot *data_slot;
	struct gve_priv *priv = rx->gve;
	struct sk_buff *skb = NULL;
	struct bpf_prog *xprog;
	struct xdp_buff xdp;
	dma_addr_t page_bus;
	void *va;

	u16 len = frag_size;
	struct napi_struct *napi = &priv->ntfy_blocks[rx->ntfy_id].napi;
	bool is_first_frag = ctx->frag_cnt == 0;

	bool is_only_frag = is_first_frag && is_last_frag;

	if (unlikely(ctx->drop_pkt))
		goto finish_frag;

	if (desc->flags_seq & GVE_RXF_ERR) {
		ctx->drop_pkt = true;
		cnts->desc_err_pkt_cnt++;
		napi_free_frags(napi);
		goto finish_frag;
	}

	if (unlikely(frag_size > rx->packet_buffer_size)) {
		netdev_warn(priv->dev, "Unexpected frag size %d, can't exceed %d, scheduling reset",
			    frag_size, rx->packet_buffer_size);
		ctx->drop_pkt = true;
		napi_free_frags(napi);
		gve_schedule_reset(rx->gve);
		goto finish_frag;
	}

	/* Prefetch two packet buffers ahead, we will need it soon. */
	page_info = &rx->data.page_info[(idx + 2) & rx->mask];
	va = page_info->page_address + page_info->page_offset;
	prefetch(page_info->page); /* Kernel page struct. */
	prefetch(va);              /* Packet header. */
	prefetch(va + 64);         /* Next cacheline too. */

	page_info = &rx->data.page_info[idx];
	data_slot = &rx->data.data_ring[idx];
	page_bus = (rx->data.raw_addressing) ?
		be64_to_cpu(data_slot->addr) - page_info->page_offset :
		rx->data.qpl->page_buses[idx];
	dma_sync_single_for_cpu(&priv->pdev->dev, page_bus,
				PAGE_SIZE, DMA_FROM_DEVICE);
	page_info->pad = is_first_frag ? GVE_RX_PAD : 0;
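	/* The length reported by the descriptor covers the pad bytes in
	 * front of the first fragment, so strip them from both counters.
	 */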
	len -= page_info->pad;
	frag_size -= page_info->pad;

	xprog = READ_ONCE(priv->xdp_prog);
	if (xprog && is_only_frag) {
		void *old_data;
		int xdp_act;

		xdp_init_buff(&xdp, rx->packet_buffer_size, &rx->xdp_rxq);
		xdp_prepare_buff(&xdp, page_info->page_address +
				 page_info->page_offset, GVE_RX_PAD,
				 len, false);
		old_data = xdp.data;
		xdp_act = bpf_prog_run_xdp(xprog, &xdp);
		if (xdp_act != XDP_PASS) {
			gve_xdp_done(priv, rx, &xdp, xprog, xdp_act);
			ctx->total_size += frag_size;
			goto finish_ok_pkt;
		}

		page_info->pad += xdp.data - old_data;
		len = xdp.data_end - xdp.data;

		u64_stats_update_begin(&rx->statss);
		rx->xdp_actions[XDP_PASS]++;
		u64_stats_update_end(&rx->statss);
	}

	skb = gve_rx_skb(priv, rx, page_info, napi, len,
			 data_slot, is_only_frag);
	if (!skb) {
		u64_stats_update_begin(&rx->statss);
		rx->rx_skb_alloc_fail++;
		u64_stats_update_end(&rx->statss);

		napi_free_frags(napi);
		ctx->drop_pkt = true;
		goto finish_frag;
	}
	ctx->total_size += frag_size;

	if (is_first_frag) {
		if (likely(feat & NETIF_F_RXCSUM)) {
			/* NIC passes up the partial sum */
			if (desc->csum)
				skb->ip_summed = CHECKSUM_COMPLETE;
			else
				skb->ip_summed = CHECKSUM_NONE;
			skb->csum = csum_unfold(desc->csum);
		}

		/* parse flags & pass relevant info up */
		if (likely(feat & NETIF_F_RXHASH) &&
		    gve_needs_rss(desc->flags_seq))
			skb_set_hash(skb, be32_to_cpu(desc->rss_hash),
				     gve_rss_type(desc->flags_seq));
	}

	if (is_last_frag) {
		skb_record_rx_queue(skb, rx->q_num);
		if (skb_is_nonlinear(skb))
			napi_gro_frags(napi);
		else
			napi_gro_receive(napi, skb);
		goto finish_ok_pkt;
	}

	goto finish_frag;

finish_ok_pkt:
	cnts->ok_pkt_bytes += ctx->total_size;
	cnts->ok_pkt_cnt++;
finish_frag:
	ctx->frag_cnt++;
	if (is_last_frag) {
		cnts->total_pkt_cnt++;
		cnts->cont_pkt_cnt += (ctx->frag_cnt > 1);
		gve_rx_ctx_clear(ctx);
	}
}

bool gve_rx_work_pending(struct gve_rx_ring *rx)
{
	struct gve_rx_desc *desc;
	__be16 flags_seq;
	u32 next_idx;

	next_idx = rx->cnt & rx->mask;
	desc = rx->desc.desc_ring + next_idx;

	flags_seq = desc->flags_seq;

	return (GVE_SEQNO(flags_seq) == rx->desc.seqno);
}

static bool gve_rx_refill_buffers(struct gve_priv *priv, struct gve_rx_ring *rx)
{
	int refill_target = rx->mask + 1;
	u32 fill_cnt = rx->fill_cnt;

	while (fill_cnt - rx->cnt < refill_target) {
		struct gve_rx_slot_page_info *page_info;
		u32 idx = fill_cnt & rx->mask;

		page_info = &rx->data.page_info[idx];
		if (page_info->can_flip) {
			/* The other half of the page is free because it was
			 * free when we processed the descriptor. Flip to it.
			 */
			union gve_rx_data_slot *data_slot =
						&rx->data.data_ring[idx];

			gve_rx_flip_buff(page_info, &data_slot->addr);
			page_info->can_flip = 0;
		} else {
			/* It is possible that the networking stack has already
			 * finished processing all outstanding packets in the buffer
			 * and it can be reused.
			 * Flipping is unnecessary here - if the networking stack still
			 * owns half the page it is impossible to tell which half. Either
			 * the whole page is free or it needs to be replaced.
			 */
			int recycle = gve_rx_can_recycle_buffer(page_info);

			if (recycle < 0) {
				if (!rx->data.raw_addressing)
					gve_schedule_reset(priv);
				return false;
			}
			if (!recycle) {
				/* We can't reuse the buffer - alloc a new one */
				union gve_rx_data_slot *data_slot =
						&rx->data.data_ring[idx];
				struct device *dev = &priv->pdev->dev;

				gve_rx_free_buffer(dev, page_info, data_slot);
				page_info->page = NULL;
				if (gve_rx_alloc_buffer(priv, dev, page_info,
							data_slot)) {
					u64_stats_update_begin(&rx->statss);
					rx->rx_buf_alloc_fail++;
					u64_stats_update_end(&rx->statss);
					break;
				}
			}
		}
		fill_cnt++;
	}
	rx->fill_cnt = fill_cnt;
	return true;
}

static int gve_clean_rx_done(struct gve_rx_ring *rx, int budget,
			     netdev_features_t feat)
{
	u64 xdp_redirects = rx->xdp_actions[XDP_REDIRECT];
	u64 xdp_txs = rx->xdp_actions[XDP_TX];
	struct gve_rx_ctx *ctx = &rx->ctx;
	struct gve_priv *priv = rx->gve;
	struct gve_rx_cnts cnts = {0};
	struct gve_rx_desc *next_desc;
	u32 idx = rx->cnt & rx->mask;
	u32 work_done = 0;

	struct gve_rx_desc *desc = &rx->desc.desc_ring[idx];

	// Exceed budget only if (and till) the inflight packet is consumed.
	while ((GVE_SEQNO(desc->flags_seq) == rx->desc.seqno) &&
	       (work_done < budget || ctx->frag_cnt)) {
		next_desc = &rx->desc.desc_ring[(idx + 1) & rx->mask];
		prefetch(next_desc);

		gve_rx(rx, feat, desc, idx, &cnts);

		rx->cnt++;
		idx = rx->cnt & rx->mask;
		desc = &rx->desc.desc_ring[idx];
		rx->desc.seqno = gve_next_seqno(rx->desc.seqno);
		work_done++;
	}

	// The device will only send whole packets.
	if (unlikely(ctx->frag_cnt)) {
		struct napi_struct *napi = &priv->ntfy_blocks[rx->ntfy_id].napi;

		napi_free_frags(napi);
		gve_rx_ctx_clear(&rx->ctx);
		netdev_warn(priv->dev, "Unexpected seq number %d with incomplete packet, expected %d, scheduling reset",
			    GVE_SEQNO(desc->flags_seq), rx->desc.seqno);
		gve_schedule_reset(rx->gve);
	}

	if (!work_done && rx->fill_cnt - rx->cnt > rx->db_threshold)
		return 0;

	if (work_done) {
		u64_stats_update_begin(&rx->statss);
		rx->rpackets += cnts.ok_pkt_cnt;
		rx->rbytes += cnts.ok_pkt_bytes;
		rx->rx_cont_packet_cnt += cnts.cont_pkt_cnt;
		rx->rx_desc_err_dropped_pkt += cnts.desc_err_pkt_cnt;
		u64_stats_update_end(&rx->statss);
	}

	if (xdp_txs != rx->xdp_actions[XDP_TX])
		gve_xdp_tx_flush(priv, rx->q_num);

	if (xdp_redirects != rx->xdp_actions[XDP_REDIRECT])
		xdp_do_flush();

	/* restock ring slots */
	if (!rx->data.raw_addressing) {
		/* In QPL mode buffs are refilled as the desc are processed */
		rx->fill_cnt += work_done;
	} else if (rx->fill_cnt - rx->cnt <= rx->db_threshold) {
		/* In raw addressing mode buffs are only refilled if the avail
		 * falls below a threshold.
		 */
		if (!gve_rx_refill_buffers(priv, rx))
			return 0;

		/* If we were not able to completely refill buffers, we'll want
		 * to schedule this queue for work again to refill buffers.
		 */
		if (rx->fill_cnt - rx->cnt <= rx->db_threshold) {
			gve_rx_write_doorbell(priv, rx);
			return budget;
		}
	}

	gve_rx_write_doorbell(priv, rx);
	return cnts.total_pkt_cnt;
}

int gve_rx_poll(struct gve_notify_block *block, int budget)
{
	struct gve_rx_ring *rx = block->rx;
	netdev_features_t feat;
	int work_done = 0;

	feat = block->napi.dev->features;

	if (budget > 0)
		work_done = gve_clean_rx_done(rx, budget, feat);

	return work_done;
}
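
/* Illustrative sketch (not compiled, not part of the driver): how the entry
 * points above are typically consumed by a NAPI poll callback. The driver's
 * real handler lives in gve_main.c and also services TX completions and the
 * per-block IRQ doorbell; the function name below is hypothetical.
 */
#if 0
static int example_gve_napi_poll(struct napi_struct *napi, int budget)
{
	struct gve_notify_block *block =
		container_of(napi, struct gve_notify_block, napi);
	int work_done = 0;

	if (block->rx)
		/* Clean completed descriptors, build skbs, restock buffers. */
		work_done = gve_rx_poll(block, budget);

	/* gve_rx_poll() may slightly overrun the budget to finish an
	 * in-flight multi-fragment packet; never report more than budget.
	 */
	if (work_done >= budget)
		return budget;

	if (napi_complete_done(napi, work_done) &&
	    block->rx && gve_rx_work_pending(block->rx))
		/* A descriptor landed while we were completing; poll again. */
		napi_schedule(napi);

	return work_done;
}
#endif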