// SPDX-License-Identifier: (GPL-2.0 OR MIT)
/* Google virtual Ethernet (gve) driver
 *
 * Copyright (C) 2015-2021 Google, Inc.
 */

#include "gve.h"
#include "gve_adminq.h"
#include "gve_utils.h"
#include <linux/etherdevice.h>
#include <linux/filter.h>
#include <net/xdp.h>
#include <net/xdp_sock_drv.h>

static void gve_rx_free_buffer(struct device *dev,
			       struct gve_rx_slot_page_info *page_info,
			       union gve_rx_data_slot *data_slot)
{
	dma_addr_t dma = (dma_addr_t)(be64_to_cpu(data_slot->addr) &
				      GVE_DATA_SLOT_ADDR_PAGE_MASK);

	page_ref_sub(page_info->page, page_info->pagecnt_bias - 1);
	gve_free_page(dev, page_info->page, dma, DMA_FROM_DEVICE);
}

static void gve_rx_unfill_pages(struct gve_priv *priv, struct gve_rx_ring *rx)
{
	u32 slots = rx->mask + 1;
	int i;

	if (rx->data.raw_addressing) {
		for (i = 0; i < slots; i++)
			gve_rx_free_buffer(&priv->pdev->dev, &rx->data.page_info[i],
					   &rx->data.data_ring[i]);
	} else {
		for (i = 0; i < slots; i++)
			page_ref_sub(rx->data.page_info[i].page,
				     rx->data.page_info[i].pagecnt_bias - 1);
		gve_unassign_qpl(priv, rx->data.qpl->id);
		rx->data.qpl = NULL;

		for (i = 0; i < rx->qpl_copy_pool_mask + 1; i++) {
			page_ref_sub(rx->qpl_copy_pool[i].page,
				     rx->qpl_copy_pool[i].pagecnt_bias - 1);
			put_page(rx->qpl_copy_pool[i].page);
		}
	}
	kvfree(rx->data.page_info);
	rx->data.page_info = NULL;
}

static void gve_rx_free_ring(struct gve_priv *priv, int idx)
{
	struct gve_rx_ring *rx = &priv->rx[idx];
	struct device *dev = &priv->pdev->dev;
	u32 slots = rx->mask + 1;
	size_t bytes;

	gve_rx_remove_from_block(priv, idx);

	bytes = sizeof(struct gve_rx_desc) * priv->rx_desc_cnt;
	dma_free_coherent(dev, bytes, rx->desc.desc_ring, rx->desc.bus);
	rx->desc.desc_ring = NULL;

	dma_free_coherent(dev, sizeof(*rx->q_resources),
			  rx->q_resources, rx->q_resources_bus);
	rx->q_resources = NULL;

	gve_rx_unfill_pages(priv, rx);

	bytes = sizeof(*rx->data.data_ring) * slots;
	dma_free_coherent(dev, bytes, rx->data.data_ring,
			  rx->data.data_bus);
	rx->data.data_ring = NULL;

	kvfree(rx->qpl_copy_pool);
	rx->qpl_copy_pool = NULL;

	netif_dbg(priv, drv, priv->dev, "freed rx ring %d\n", idx);
}

static void gve_setup_rx_buffer(struct gve_rx_slot_page_info *page_info,
				dma_addr_t addr, struct page *page, __be64 *slot_addr)
{
	page_info->page = page;
	page_info->page_offset = 0;
	page_info->page_address = page_address(page);
	*slot_addr = cpu_to_be64(addr);
	/* The page already has 1 ref */
	page_ref_add(page, INT_MAX - 1);
	page_info->pagecnt_bias = INT_MAX;
}

static int gve_rx_alloc_buffer(struct gve_priv *priv, struct device *dev,
			       struct gve_rx_slot_page_info *page_info,
			       union gve_rx_data_slot *data_slot)
{
	struct page *page;
	dma_addr_t dma;
	int err;

	err = gve_alloc_page(priv, dev, &page, &dma, DMA_FROM_DEVICE,
			     GFP_ATOMIC);
	if (err)
		return err;

	gve_setup_rx_buffer(page_info, dma, page, &data_slot->addr);
	return 0;
}

static int gve_prefill_rx_pages(struct gve_rx_ring *rx)
{
	struct gve_priv *priv = rx->gve;
	u32 slots;
	int err;
	int i;
	int j;

	/* Allocate one page per Rx queue slot. Each page is split into two
	 * packet buffers, when possible we "page flip" between the two.
	 */
	slots = rx->mask + 1;

	rx->data.page_info = kvzalloc(slots *
				      sizeof(*rx->data.page_info), GFP_KERNEL);
	if (!rx->data.page_info)
		return -ENOMEM;

	if (!rx->data.raw_addressing) {
		rx->data.qpl = gve_assign_rx_qpl(priv, rx->q_num);
		if (!rx->data.qpl) {
			kvfree(rx->data.page_info);
			rx->data.page_info = NULL;
			return -ENOMEM;
		}
	}
	for (i = 0; i < slots; i++) {
		if (!rx->data.raw_addressing) {
			struct page *page = rx->data.qpl->pages[i];
			dma_addr_t addr = i * PAGE_SIZE;

			gve_setup_rx_buffer(&rx->data.page_info[i], addr, page,
					    &rx->data.data_ring[i].qpl_offset);
			continue;
		}
		err = gve_rx_alloc_buffer(priv, &priv->pdev->dev, &rx->data.page_info[i],
					  &rx->data.data_ring[i]);
		if (err)
			goto alloc_err;
	}

	if (!rx->data.raw_addressing) {
		for (j = 0; j < rx->qpl_copy_pool_mask + 1; j++) {
			struct page *page = alloc_page(GFP_KERNEL);

			if (!page) {
				err = -ENOMEM;
				goto alloc_err_qpl;
			}

			rx->qpl_copy_pool[j].page = page;
			rx->qpl_copy_pool[j].page_offset = 0;
			rx->qpl_copy_pool[j].page_address = page_address(page);

			/* The page already has 1 ref. */
			page_ref_add(page, INT_MAX - 1);
			rx->qpl_copy_pool[j].pagecnt_bias = INT_MAX;
		}
	}

	return slots;

alloc_err_qpl:
	while (j--) {
		page_ref_sub(rx->qpl_copy_pool[j].page,
			     rx->qpl_copy_pool[j].pagecnt_bias - 1);
		put_page(rx->qpl_copy_pool[j].page);
	}
alloc_err:
	while (i--)
		gve_rx_free_buffer(&priv->pdev->dev,
				   &rx->data.page_info[i],
				   &rx->data.data_ring[i]);
	return err;
}

static void gve_rx_ctx_clear(struct gve_rx_ctx *ctx)
{
	ctx->skb_head = NULL;
	ctx->skb_tail = NULL;
	ctx->total_size = 0;
	ctx->frag_cnt = 0;
	ctx->drop_pkt = false;
}

static int gve_rx_alloc_ring(struct gve_priv *priv, int idx)
{
	struct gve_rx_ring *rx = &priv->rx[idx];
	struct device *hdev = &priv->pdev->dev;
	u32 slots, npages;
	int filled_pages;
	size_t bytes;
	int err;

	netif_dbg(priv, drv, priv->dev, "allocating rx ring\n");
	/* Make sure everything is zeroed to start with */
	memset(rx, 0, sizeof(*rx));

	rx->gve = priv;
	rx->q_num = idx;

	slots = priv->rx_data_slot_cnt;
	rx->mask = slots - 1;
	rx->data.raw_addressing = priv->queue_format == GVE_GQI_RDA_FORMAT;

	/* alloc rx data ring */
	bytes = sizeof(*rx->data.data_ring) * slots;
	rx->data.data_ring = dma_alloc_coherent(hdev, bytes,
						&rx->data.data_bus,
						GFP_KERNEL);
	if (!rx->data.data_ring)
		return -ENOMEM;

	rx->qpl_copy_pool_mask = min_t(u32, U32_MAX, slots * 2) - 1;
	rx->qpl_copy_pool_head = 0;
	rx->qpl_copy_pool = kvcalloc(rx->qpl_copy_pool_mask + 1,
				     sizeof(rx->qpl_copy_pool[0]),
				     GFP_KERNEL);

	if (!rx->qpl_copy_pool) {
		err = -ENOMEM;
		goto abort_with_slots;
	}

	filled_pages = gve_prefill_rx_pages(rx);
	if (filled_pages < 0) {
		err = -ENOMEM;
		goto abort_with_copy_pool;
	}
	rx->fill_cnt = filled_pages;
	/* Ensure data ring slots (packet buffers) are visible.
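	 * The dma_wmb() below orders the slot writes above ahead of any later
	 * doorbell write (gve_rx_write_doorbell()), so the device never sees
	 * a fill count that covers buffers it cannot yet read.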
	 */
	dma_wmb();

	/* Alloc gve_queue_resources */
	rx->q_resources =
		dma_alloc_coherent(hdev,
				   sizeof(*rx->q_resources),
				   &rx->q_resources_bus,
				   GFP_KERNEL);
	if (!rx->q_resources) {
		err = -ENOMEM;
		goto abort_filled;
	}
	netif_dbg(priv, drv, priv->dev, "rx[%d]->data.data_bus=%lx\n", idx,
		  (unsigned long)rx->data.data_bus);

	/* alloc rx desc ring */
	bytes = sizeof(struct gve_rx_desc) * priv->rx_desc_cnt;
	npages = bytes / PAGE_SIZE;
	if (npages * PAGE_SIZE != bytes) {
		err = -EIO;
		goto abort_with_q_resources;
	}

	rx->desc.desc_ring = dma_alloc_coherent(hdev, bytes, &rx->desc.bus,
						GFP_KERNEL);
	if (!rx->desc.desc_ring) {
		err = -ENOMEM;
		goto abort_with_q_resources;
	}
	rx->cnt = 0;
	rx->db_threshold = priv->rx_desc_cnt / 2;
	rx->desc.seqno = 1;

	/* Allocating half-page buffers allows page-flipping which is faster
	 * than copying or allocating new pages.
	 */
	rx->packet_buffer_size = PAGE_SIZE / 2;
	gve_rx_ctx_clear(&rx->ctx);
	gve_rx_add_to_block(priv, idx);

	return 0;

abort_with_q_resources:
	dma_free_coherent(hdev, sizeof(*rx->q_resources),
			  rx->q_resources, rx->q_resources_bus);
	rx->q_resources = NULL;
abort_filled:
	gve_rx_unfill_pages(priv, rx);
abort_with_copy_pool:
	kvfree(rx->qpl_copy_pool);
	rx->qpl_copy_pool = NULL;
abort_with_slots:
	bytes = sizeof(*rx->data.data_ring) * slots;
	dma_free_coherent(hdev, bytes, rx->data.data_ring, rx->data.data_bus);
	rx->data.data_ring = NULL;

	return err;
}

int gve_rx_alloc_rings(struct gve_priv *priv)
{
	int err = 0;
	int i;

	for (i = 0; i < priv->rx_cfg.num_queues; i++) {
		err = gve_rx_alloc_ring(priv, i);
		if (err) {
			netif_err(priv, drv, priv->dev,
				  "Failed to alloc rx ring=%d: err=%d\n",
				  i, err);
			break;
		}
	}
	/* Unallocate if there was an error */
	if (err) {
		int j;

		for (j = 0; j < i; j++)
			gve_rx_free_ring(priv, j);
	}
	return err;
}

void gve_rx_free_rings_gqi(struct gve_priv *priv)
{
	int i;

	for (i = 0; i < priv->rx_cfg.num_queues; i++)
		gve_rx_free_ring(priv, i);
}

void gve_rx_write_doorbell(struct gve_priv *priv, struct gve_rx_ring *rx)
{
	u32 db_idx = be32_to_cpu(rx->q_resources->db_index);

	iowrite32be(rx->fill_cnt, &priv->db_bar2[db_idx]);
}

static enum pkt_hash_types gve_rss_type(__be16 pkt_flags)
{
	if (likely(pkt_flags & (GVE_RXF_TCP | GVE_RXF_UDP)))
		return PKT_HASH_TYPE_L4;
	if (pkt_flags & (GVE_RXF_IPV4 | GVE_RXF_IPV6))
		return PKT_HASH_TYPE_L3;
	return PKT_HASH_TYPE_L2;
}

static struct sk_buff *gve_rx_add_frags(struct napi_struct *napi,
					struct gve_rx_slot_page_info *page_info,
					u16 packet_buffer_size, u16 len,
					struct gve_rx_ctx *ctx)
{
	u32 offset = page_info->page_offset + page_info->pad;
	struct sk_buff *skb = ctx->skb_tail;
	int num_frags = 0;

	if (!skb) {
		skb = napi_get_frags(napi);
		if (unlikely(!skb))
			return NULL;

		ctx->skb_head = skb;
		ctx->skb_tail = skb;
	} else {
		num_frags = skb_shinfo(ctx->skb_tail)->nr_frags;
		if (num_frags == MAX_SKB_FRAGS) {
			skb = napi_alloc_skb(napi, 0);
			if (!skb)
				return NULL;

			// We will never chain more than two SKBs: 2 * 16 * 2k > 64k
			// which is why we do not need to chain by using skb->next
			skb_shinfo(ctx->skb_tail)->frag_list = skb;

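			/* Subsequent fragments are attached to the new tail skb. */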
			ctx->skb_tail = skb;
			num_frags = 0;
		}
	}

	if (skb != ctx->skb_head) {
		ctx->skb_head->len += len;
		ctx->skb_head->data_len += len;
		ctx->skb_head->truesize += packet_buffer_size;
	}
	skb_add_rx_frag(skb, num_frags, page_info->page,
			offset, len, packet_buffer_size);

	return ctx->skb_head;
}

static void gve_rx_flip_buff(struct gve_rx_slot_page_info *page_info, __be64 *slot_addr)
{
	const __be64 offset = cpu_to_be64(PAGE_SIZE / 2);

	/* "flip" to other packet buffer on this page */
	page_info->page_offset ^= PAGE_SIZE / 2;
	*(slot_addr) ^= offset;
}

static int gve_rx_can_recycle_buffer(struct gve_rx_slot_page_info *page_info)
{
	int pagecount = page_count(page_info->page);

	/* This page is not being used by any SKBs - reuse */
	if (pagecount == page_info->pagecnt_bias)
		return 1;
	/* This page is still being used by an SKB - we can't reuse */
	else if (pagecount > page_info->pagecnt_bias)
		return 0;
	WARN(pagecount < page_info->pagecnt_bias,
	     "Pagecount should never be less than the bias.");
	return -1;
}

static struct sk_buff *
gve_rx_raw_addressing(struct device *dev, struct net_device *netdev,
		      struct gve_rx_slot_page_info *page_info, u16 len,
		      struct napi_struct *napi,
		      union gve_rx_data_slot *data_slot,
		      u16 packet_buffer_size, struct gve_rx_ctx *ctx)
{
	struct sk_buff *skb = gve_rx_add_frags(napi, page_info, packet_buffer_size, len, ctx);

	if (!skb)
		return NULL;

	/* Optimistically stop the kernel from freeing the page.
	 * We will check again in refill to determine if we need to alloc a
	 * new page.
	 */
	gve_dec_pagecnt_bias(page_info);

	return skb;
}

static struct sk_buff *gve_rx_copy_to_pool(struct gve_rx_ring *rx,
					   struct gve_rx_slot_page_info *page_info,
					   u16 len, struct napi_struct *napi)
{
	u32 pool_idx = rx->qpl_copy_pool_head & rx->qpl_copy_pool_mask;
	void *src = page_info->page_address + page_info->page_offset;
	struct gve_rx_slot_page_info *copy_page_info;
	struct gve_rx_ctx *ctx = &rx->ctx;
	bool alloc_page = false;
	struct sk_buff *skb;
	void *dst;

	copy_page_info = &rx->qpl_copy_pool[pool_idx];
	if (!copy_page_info->can_flip) {
		int recycle = gve_rx_can_recycle_buffer(copy_page_info);

		if (unlikely(recycle < 0)) {
			gve_schedule_reset(rx->gve);
			return NULL;
		}
		alloc_page = !recycle;
	}

	if (alloc_page) {
		struct gve_rx_slot_page_info alloc_page_info;
		struct page *page;

		/* The least recently used page turned out to be
		 * still in use by the kernel. Ignoring it and moving
		 * on alleviates head-of-line blocking.
		 */
		rx->qpl_copy_pool_head++;

		page = alloc_page(GFP_ATOMIC);
		if (!page)
			return NULL;

		alloc_page_info.page = page;
		alloc_page_info.page_offset = 0;
		alloc_page_info.page_address = page_address(page);
		alloc_page_info.pad = page_info->pad;

		memcpy(alloc_page_info.page_address, src, page_info->pad + len);
		skb = gve_rx_add_frags(napi, &alloc_page_info,
				       rx->packet_buffer_size,
				       len, ctx);

		u64_stats_update_begin(&rx->statss);
		rx->rx_frag_copy_cnt++;
		rx->rx_frag_alloc_cnt++;
		u64_stats_update_end(&rx->statss);

		return skb;
	}

	dst = copy_page_info->page_address + copy_page_info->page_offset;
	memcpy(dst, src, page_info->pad + len);
	copy_page_info->pad = page_info->pad;

	skb = gve_rx_add_frags(napi, copy_page_info,
			       rx->packet_buffer_size, len, ctx);
	if (unlikely(!skb))
		return NULL;

	gve_dec_pagecnt_bias(copy_page_info);
	copy_page_info->page_offset += rx->packet_buffer_size;
	copy_page_info->page_offset &= (PAGE_SIZE - 1);

	if (copy_page_info->can_flip) {
		/* We have used both halves of this copy page, it
		 * is time for it to go to the back of the queue.
		 */
		copy_page_info->can_flip = false;
		rx->qpl_copy_pool_head++;
		prefetch(rx->qpl_copy_pool[rx->qpl_copy_pool_head & rx->qpl_copy_pool_mask].page);
	} else {
		copy_page_info->can_flip = true;
	}

	u64_stats_update_begin(&rx->statss);
	rx->rx_frag_copy_cnt++;
	u64_stats_update_end(&rx->statss);

	return skb;
}

static struct sk_buff *
gve_rx_qpl(struct device *dev, struct net_device *netdev,
	   struct gve_rx_ring *rx, struct gve_rx_slot_page_info *page_info,
	   u16 len, struct napi_struct *napi,
	   union gve_rx_data_slot *data_slot)
{
	struct gve_rx_ctx *ctx = &rx->ctx;
	struct sk_buff *skb;

	/* if raw_addressing mode is not enabled gvnic can only receive into
	 * registered segments. If the buffer can't be recycled, our only
	 * choice is to copy the data out of it so that we can return it to the
	 * device.
	 */
	if (page_info->can_flip) {
		skb = gve_rx_add_frags(napi, page_info, rx->packet_buffer_size, len, ctx);
		/* No point in recycling if we didn't get the skb */
		if (skb) {
			/* Make sure that the page isn't freed.
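			 * Handing the fragment to the stack consumes one of the
			 * page references taken up front; account for it by
			 * decrementing pagecnt_bias rather than touching the
			 * atomic page refcount.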
			 */
			gve_dec_pagecnt_bias(page_info);
			gve_rx_flip_buff(page_info, &data_slot->qpl_offset);
		}
	} else {
		skb = gve_rx_copy_to_pool(rx, page_info, len, napi);
	}
	return skb;
}

static struct sk_buff *gve_rx_skb(struct gve_priv *priv, struct gve_rx_ring *rx,
				  struct gve_rx_slot_page_info *page_info, struct napi_struct *napi,
				  u16 len, union gve_rx_data_slot *data_slot,
				  bool is_only_frag)
{
	struct net_device *netdev = priv->dev;
	struct gve_rx_ctx *ctx = &rx->ctx;
	struct sk_buff *skb = NULL;

	if (len <= priv->rx_copybreak && is_only_frag) {
		/* Just copy small packets */
		skb = gve_rx_copy(netdev, napi, page_info, len);
		if (skb) {
			u64_stats_update_begin(&rx->statss);
			rx->rx_copied_pkt++;
			rx->rx_frag_copy_cnt++;
			rx->rx_copybreak_pkt++;
			u64_stats_update_end(&rx->statss);
		}
	} else {
		int recycle = gve_rx_can_recycle_buffer(page_info);

		if (unlikely(recycle < 0)) {
			gve_schedule_reset(priv);
			return NULL;
		}
		page_info->can_flip = recycle;
		if (page_info->can_flip) {
			u64_stats_update_begin(&rx->statss);
			rx->rx_frag_flip_cnt++;
			u64_stats_update_end(&rx->statss);
		}

		if (rx->data.raw_addressing) {
			skb = gve_rx_raw_addressing(&priv->pdev->dev, netdev,
						    page_info, len, napi,
						    data_slot,
						    rx->packet_buffer_size, ctx);
		} else {
			skb = gve_rx_qpl(&priv->pdev->dev, netdev, rx,
					 page_info, len, napi, data_slot);
		}
	}
	return skb;
}

static int gve_xsk_pool_redirect(struct net_device *dev,
				 struct gve_rx_ring *rx,
				 void *data, int len,
				 struct bpf_prog *xdp_prog)
{
	struct xdp_buff *xdp;
	int err;

	if (rx->xsk_pool->frame_len < len)
		return -E2BIG;
	xdp = xsk_buff_alloc(rx->xsk_pool);
	if (!xdp) {
		u64_stats_update_begin(&rx->statss);
		rx->xdp_alloc_fails++;
		u64_stats_update_end(&rx->statss);
		return -ENOMEM;
	}
	xdp->data_end = xdp->data + len;
	memcpy(xdp->data, data, len);
	err = xdp_do_redirect(dev, xdp, xdp_prog);
	if (err)
		xsk_buff_free(xdp);
	return err;
}

static int gve_xdp_redirect(struct net_device *dev, struct gve_rx_ring *rx,
			    struct xdp_buff *orig, struct bpf_prog *xdp_prog)
{
	int total_len, len = orig->data_end - orig->data;
	int headroom = XDP_PACKET_HEADROOM;
	struct xdp_buff new;
	void *frame;
	int err;

	if (rx->xsk_pool)
		return gve_xsk_pool_redirect(dev, rx, orig->data,
					     len, xdp_prog);

	total_len = headroom + SKB_DATA_ALIGN(len) +
		    SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
	frame = page_frag_alloc(&rx->page_cache, total_len, GFP_ATOMIC);
	if (!frame) {
		u64_stats_update_begin(&rx->statss);
		rx->xdp_alloc_fails++;
		u64_stats_update_end(&rx->statss);
		return -ENOMEM;
	}
	xdp_init_buff(&new, total_len, &rx->xdp_rxq);
	xdp_prepare_buff(&new, frame, headroom, len, false);
	memcpy(new.data, orig->data, len);

	err = xdp_do_redirect(dev, &new, xdp_prog);
	if (err)
		page_frag_free(frame);

	return err;
}

static void gve_xdp_done(struct gve_priv *priv, struct gve_rx_ring *rx,
			 struct xdp_buff *xdp, struct bpf_prog *xprog,
			 int xdp_act)
{
	struct gve_tx_ring *tx;
	int tx_qid;
	int err;

	switch (xdp_act) {
	case XDP_ABORTED:
	case XDP_DROP:
	default:
		break;
	case XDP_TX:
		tx_qid = gve_xdp_tx_queue_id(priv, rx->q_num);
		tx = &priv->tx[tx_qid];
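		/* Serialize with other producers on this XDP TX ring. */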
		spin_lock(&tx->xdp_lock);
		err = gve_xdp_xmit_one(priv, tx, xdp->data,
				       xdp->data_end - xdp->data, NULL);
		spin_unlock(&tx->xdp_lock);

		if (unlikely(err)) {
			u64_stats_update_begin(&rx->statss);
			rx->xdp_tx_errors++;
			u64_stats_update_end(&rx->statss);
		}
		break;
	case XDP_REDIRECT:
		err = gve_xdp_redirect(priv->dev, rx, xdp, xprog);

		if (unlikely(err)) {
			u64_stats_update_begin(&rx->statss);
			rx->xdp_redirect_errors++;
			u64_stats_update_end(&rx->statss);
		}
		break;
	}
	u64_stats_update_begin(&rx->statss);
	if ((u32)xdp_act < GVE_XDP_ACTIONS)
		rx->xdp_actions[xdp_act]++;
	u64_stats_update_end(&rx->statss);
}

#define GVE_PKTCONT_BIT_IS_SET(x) (GVE_RXF_PKT_CONT & (x))
static void gve_rx(struct gve_rx_ring *rx, netdev_features_t feat,
		   struct gve_rx_desc *desc, u32 idx,
		   struct gve_rx_cnts *cnts)
{
	bool is_last_frag = !GVE_PKTCONT_BIT_IS_SET(desc->flags_seq);
	struct gve_rx_slot_page_info *page_info;
	u16 frag_size = be16_to_cpu(desc->len);
	struct gve_rx_ctx *ctx = &rx->ctx;
	union gve_rx_data_slot *data_slot;
	struct gve_priv *priv = rx->gve;
	struct sk_buff *skb = NULL;
	struct bpf_prog *xprog;
	struct xdp_buff xdp;
	dma_addr_t page_bus;
	void *va;

	u16 len = frag_size;
	struct napi_struct *napi = &priv->ntfy_blocks[rx->ntfy_id].napi;
	bool is_first_frag = ctx->frag_cnt == 0;

	bool is_only_frag = is_first_frag && is_last_frag;

	if (unlikely(ctx->drop_pkt))
		goto finish_frag;

	if (desc->flags_seq & GVE_RXF_ERR) {
		ctx->drop_pkt = true;
		cnts->desc_err_pkt_cnt++;
		napi_free_frags(napi);
		goto finish_frag;
	}

	if (unlikely(frag_size > rx->packet_buffer_size)) {
		netdev_warn(priv->dev, "Unexpected frag size %d, can't exceed %d, scheduling reset",
			    frag_size, rx->packet_buffer_size);
		ctx->drop_pkt = true;
		napi_free_frags(napi);
		gve_schedule_reset(rx->gve);
		goto finish_frag;
	}

	/* Prefetch two packet buffers ahead, we will need it soon. */
	page_info = &rx->data.page_info[(idx + 2) & rx->mask];
	va = page_info->page_address + page_info->page_offset;
	prefetch(page_info->page); /* Kernel page struct. */
	prefetch(va);              /* Packet header. */
	prefetch(va + 64);         /* Next cacheline too. */

	page_info = &rx->data.page_info[idx];
	data_slot = &rx->data.data_ring[idx];
	page_bus = (rx->data.raw_addressing) ?
		   be64_to_cpu(data_slot->addr) - page_info->page_offset :
		   rx->data.qpl->page_buses[idx];
	dma_sync_single_for_cpu(&priv->pdev->dev, page_bus,
				PAGE_SIZE, DMA_FROM_DEVICE);
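	/* Only the first fragment of a packet carries the GVE_RX_PAD bytes of
	 * padding written by the device; strip it from the length.
	 */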
	page_info->pad = is_first_frag ? GVE_RX_PAD : 0;
	len -= page_info->pad;
	frag_size -= page_info->pad;

	xprog = READ_ONCE(priv->xdp_prog);
	if (xprog && is_only_frag) {
		void *old_data;
		int xdp_act;

		xdp_init_buff(&xdp, rx->packet_buffer_size, &rx->xdp_rxq);
		xdp_prepare_buff(&xdp, page_info->page_address +
				 page_info->page_offset, GVE_RX_PAD,
				 len, false);
		old_data = xdp.data;
		xdp_act = bpf_prog_run_xdp(xprog, &xdp);
		if (xdp_act != XDP_PASS) {
			gve_xdp_done(priv, rx, &xdp, xprog, xdp_act);
			ctx->total_size += frag_size;
			goto finish_ok_pkt;
		}

		page_info->pad += xdp.data - old_data;
		len = xdp.data_end - xdp.data;

		u64_stats_update_begin(&rx->statss);
		rx->xdp_actions[XDP_PASS]++;
		u64_stats_update_end(&rx->statss);
	}

	skb = gve_rx_skb(priv, rx, page_info, napi, len,
			 data_slot, is_only_frag);
	if (!skb) {
		u64_stats_update_begin(&rx->statss);
		rx->rx_skb_alloc_fail++;
		u64_stats_update_end(&rx->statss);

		napi_free_frags(napi);
		ctx->drop_pkt = true;
		goto finish_frag;
	}
	ctx->total_size += frag_size;

	if (is_first_frag) {
		if (likely(feat & NETIF_F_RXCSUM)) {
			/* NIC passes up the partial sum */
			if (desc->csum)
				skb->ip_summed = CHECKSUM_COMPLETE;
			else
				skb->ip_summed = CHECKSUM_NONE;
			skb->csum = csum_unfold(desc->csum);
		}

		/* parse flags & pass relevant info up */
		if (likely(feat & NETIF_F_RXHASH) &&
		    gve_needs_rss(desc->flags_seq))
			skb_set_hash(skb, be32_to_cpu(desc->rss_hash),
				     gve_rss_type(desc->flags_seq));
	}

	if (is_last_frag) {
		skb_record_rx_queue(skb, rx->q_num);
		if (skb_is_nonlinear(skb))
			napi_gro_frags(napi);
		else
			napi_gro_receive(napi, skb);
		goto finish_ok_pkt;
	}

	goto finish_frag;

finish_ok_pkt:
	cnts->ok_pkt_bytes += ctx->total_size;
	cnts->ok_pkt_cnt++;
finish_frag:
	ctx->frag_cnt++;
	if (is_last_frag) {
		cnts->total_pkt_cnt++;
		cnts->cont_pkt_cnt += (ctx->frag_cnt > 1);
		gve_rx_ctx_clear(ctx);
	}
}

bool gve_rx_work_pending(struct gve_rx_ring *rx)
{
	struct gve_rx_desc *desc;
	__be16 flags_seq;
	u32 next_idx;

	next_idx = rx->cnt & rx->mask;
	desc = rx->desc.desc_ring + next_idx;

	flags_seq = desc->flags_seq;

	return (GVE_SEQNO(flags_seq) == rx->desc.seqno);
}

static bool gve_rx_refill_buffers(struct gve_priv *priv, struct gve_rx_ring *rx)
{
	int refill_target = rx->mask + 1;
	u32 fill_cnt = rx->fill_cnt;

	while (fill_cnt - rx->cnt < refill_target) {
		struct gve_rx_slot_page_info *page_info;
		u32 idx = fill_cnt & rx->mask;

		page_info = &rx->data.page_info[idx];
		if (page_info->can_flip) {
			/* The other half of the page is free because it was
			 * free when we processed the descriptor. Flip to it.
			 */
			union gve_rx_data_slot *data_slot =
						&rx->data.data_ring[idx];

			gve_rx_flip_buff(page_info, &data_slot->addr);
			page_info->can_flip = 0;
		} else {
			/* It is possible that the networking stack has already
			 * finished processing all outstanding packets in the buffer
			 * and it can be reused.
			 * Flipping is unnecessary here - if the networking stack still
			 * owns half the page it is impossible to tell which half. Either
			 * the whole page is free or it needs to be replaced.
			 */
			int recycle = gve_rx_can_recycle_buffer(page_info);

			if (recycle < 0) {
				if (!rx->data.raw_addressing)
					gve_schedule_reset(priv);
				return false;
			}
			if (!recycle) {
				/* We can't reuse the buffer - alloc a new one */
				union gve_rx_data_slot *data_slot =
						&rx->data.data_ring[idx];
				struct device *dev = &priv->pdev->dev;
				gve_rx_free_buffer(dev, page_info, data_slot);
				page_info->page = NULL;
				if (gve_rx_alloc_buffer(priv, dev, page_info,
							data_slot)) {
					u64_stats_update_begin(&rx->statss);
					rx->rx_buf_alloc_fail++;
					u64_stats_update_end(&rx->statss);
					break;
				}
			}
		}
		fill_cnt++;
	}
	rx->fill_cnt = fill_cnt;
	return true;
}

static int gve_clean_rx_done(struct gve_rx_ring *rx, int budget,
			     netdev_features_t feat)
{
	u64 xdp_redirects = rx->xdp_actions[XDP_REDIRECT];
	u64 xdp_txs = rx->xdp_actions[XDP_TX];
	struct gve_rx_ctx *ctx = &rx->ctx;
	struct gve_priv *priv = rx->gve;
	struct gve_rx_cnts cnts = {0};
	struct gve_rx_desc *next_desc;
	u32 idx = rx->cnt & rx->mask;
	u32 work_done = 0;

	struct gve_rx_desc *desc = &rx->desc.desc_ring[idx];

	// Exceed budget only if (and till) the inflight packet is consumed.
	while ((GVE_SEQNO(desc->flags_seq) == rx->desc.seqno) &&
	       (work_done < budget || ctx->frag_cnt)) {
		next_desc = &rx->desc.desc_ring[(idx + 1) & rx->mask];
		prefetch(next_desc);

		gve_rx(rx, feat, desc, idx, &cnts);

		rx->cnt++;
		idx = rx->cnt & rx->mask;
		desc = &rx->desc.desc_ring[idx];
		rx->desc.seqno = gve_next_seqno(rx->desc.seqno);
		work_done++;
	}

	// The device will only send whole packets.
	if (unlikely(ctx->frag_cnt)) {
		struct napi_struct *napi = &priv->ntfy_blocks[rx->ntfy_id].napi;

		napi_free_frags(napi);
		gve_rx_ctx_clear(&rx->ctx);
		netdev_warn(priv->dev, "Unexpected seq number %d with incomplete packet, expected %d, scheduling reset",
			    GVE_SEQNO(desc->flags_seq), rx->desc.seqno);
		gve_schedule_reset(rx->gve);
	}

	if (!work_done && rx->fill_cnt - rx->cnt > rx->db_threshold)
		return 0;

	if (work_done) {
		u64_stats_update_begin(&rx->statss);
		rx->rpackets += cnts.ok_pkt_cnt;
		rx->rbytes += cnts.ok_pkt_bytes;
		rx->rx_cont_packet_cnt += cnts.cont_pkt_cnt;
		rx->rx_desc_err_dropped_pkt += cnts.desc_err_pkt_cnt;
		u64_stats_update_end(&rx->statss);
	}

	if (xdp_txs != rx->xdp_actions[XDP_TX])
		gve_xdp_tx_flush(priv, rx->q_num);

	if (xdp_redirects != rx->xdp_actions[XDP_REDIRECT])
		xdp_do_flush();

	/* restock ring slots */
	if (!rx->data.raw_addressing) {
		/* In QPL mode buffs are refilled as the desc are processed */
		rx->fill_cnt += work_done;
	} else if (rx->fill_cnt - rx->cnt <= rx->db_threshold) {
		/* In raw addressing mode buffs are only refilled if the avail
		 * falls below a threshold.
		 */
		if (!gve_rx_refill_buffers(priv, rx))
			return 0;

		/* If we were not able to completely refill buffers, we'll want
		 * to schedule this queue for work again to refill buffers.
		 */
		if (rx->fill_cnt - rx->cnt <= rx->db_threshold) {
			gve_rx_write_doorbell(priv, rx);
			return budget;
		}
	}

	gve_rx_write_doorbell(priv, rx);
	return cnts.total_pkt_cnt;
}

int gve_rx_poll(struct gve_notify_block *block, int budget)
{
	struct gve_rx_ring *rx = block->rx;
	netdev_features_t feat;
	int work_done = 0;

	feat = block->napi.dev->features;

	/* If budget is 0, do all the work */
	if (budget == 0)
		budget = INT_MAX;

	if (budget > 0)
		work_done = gve_clean_rx_done(rx, budget, feat);

	return work_done;
}