// SPDX-License-Identifier: (GPL-2.0 OR MIT)
/* Google virtual Ethernet (gve) driver
 *
 * Copyright (C) 2015-2021 Google, Inc.
 */

#include "gve.h"
#include "gve_adminq.h"
#include "gve_utils.h"
#include <linux/etherdevice.h>

static void gve_rx_free_buffer(struct device *dev,
                               struct gve_rx_slot_page_info *page_info,
                               union gve_rx_data_slot *data_slot)
{
        dma_addr_t dma = (dma_addr_t)(be64_to_cpu(data_slot->addr) &
                                      GVE_DATA_SLOT_ADDR_PAGE_MASK);

        gve_free_page(dev, page_info->page, dma, DMA_FROM_DEVICE);
}

static void gve_rx_unfill_pages(struct gve_priv *priv, struct gve_rx_ring *rx)
{
        if (rx->data.raw_addressing) {
                u32 slots = rx->mask + 1;
                int i;

                for (i = 0; i < slots; i++)
                        gve_rx_free_buffer(&priv->pdev->dev, &rx->data.page_info[i],
                                           &rx->data.data_ring[i]);
        } else {
                gve_unassign_qpl(priv, rx->data.qpl->id);
                rx->data.qpl = NULL;
        }
        kvfree(rx->data.page_info);
        rx->data.page_info = NULL;
}

static void gve_rx_free_ring(struct gve_priv *priv, int idx)
{
        struct gve_rx_ring *rx = &priv->rx[idx];
        struct device *dev = &priv->pdev->dev;
        u32 slots = rx->mask + 1;
        size_t bytes;

        gve_rx_remove_from_block(priv, idx);

        bytes = sizeof(struct gve_rx_desc) * priv->rx_desc_cnt;
        dma_free_coherent(dev, bytes, rx->desc.desc_ring, rx->desc.bus);
        rx->desc.desc_ring = NULL;

        dma_free_coherent(dev, sizeof(*rx->q_resources),
                          rx->q_resources, rx->q_resources_bus);
        rx->q_resources = NULL;

        gve_rx_unfill_pages(priv, rx);

        bytes = sizeof(*rx->data.data_ring) * slots;
        dma_free_coherent(dev, bytes, rx->data.data_ring,
                          rx->data.data_bus);
        rx->data.data_ring = NULL;
        netif_dbg(priv, drv, priv->dev, "freed rx ring %d\n", idx);
}

static void gve_setup_rx_buffer(struct gve_rx_slot_page_info *page_info,
                                dma_addr_t addr, struct page *page, __be64 *slot_addr)
{
        page_info->page = page;
        page_info->page_offset = 0;
        page_info->page_address = page_address(page);
        *slot_addr = cpu_to_be64(addr);
}

static int gve_rx_alloc_buffer(struct gve_priv *priv, struct device *dev,
                               struct gve_rx_slot_page_info *page_info,
                               union gve_rx_data_slot *data_slot)
{
        struct page *page;
        dma_addr_t dma;
        int err;

        err = gve_alloc_page(priv, dev, &page, &dma, DMA_FROM_DEVICE);
        if (err)
                return err;

        gve_setup_rx_buffer(page_info, dma, page, &data_slot->addr);
        return 0;
}
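
/* A note on the two buffer-ownership modes handled above and below: in raw
 * addressing (GQI_RDA) mode each ring slot owns a page that the driver
 * allocates and DMA-maps itself, and data_slot->addr carries the bus
 * address; in QPL mode buffers come from a queue page list pre-registered
 * with the device, data_slot->qpl_offset carries an offset into that list,
 * and teardown unassigns the QPL instead of freeing the pages.
 */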

static int gve_prefill_rx_pages(struct gve_rx_ring *rx)
{
        struct gve_priv *priv = rx->gve;
        u32 slots;
        int err;
        int i;

        /* Allocate one page per Rx queue slot. Each page is split into two
         * packet buffers, when possible we "page flip" between the two.
         */
        slots = rx->mask + 1;

        rx->data.page_info = kvzalloc(slots *
                                      sizeof(*rx->data.page_info), GFP_KERNEL);
        if (!rx->data.page_info)
                return -ENOMEM;

        if (!rx->data.raw_addressing) {
                rx->data.qpl = gve_assign_rx_qpl(priv);
                if (!rx->data.qpl) {
                        kvfree(rx->data.page_info);
                        rx->data.page_info = NULL;
                        return -ENOMEM;
                }
        }
        for (i = 0; i < slots; i++) {
                if (!rx->data.raw_addressing) {
                        struct page *page = rx->data.qpl->pages[i];
                        dma_addr_t addr = i * PAGE_SIZE;

                        gve_setup_rx_buffer(&rx->data.page_info[i], addr, page,
                                            &rx->data.data_ring[i].qpl_offset);
                        continue;
                }
                err = gve_rx_alloc_buffer(priv, &priv->pdev->dev, &rx->data.page_info[i],
                                          &rx->data.data_ring[i]);
                if (err)
                        goto alloc_err;
        }

        return slots;
alloc_err:
        while (i--)
                gve_rx_free_buffer(&priv->pdev->dev,
                                   &rx->data.page_info[i],
                                   &rx->data.data_ring[i]);
        return err;
}
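
/* gve_prefill_rx_pages() returns the number of slots it populated (or a
 * negative errno); gve_rx_alloc_ring() below uses that count to seed
 * rx->fill_cnt, which is also the value gve_rx_write_doorbell() posts to
 * the device, so fill_cnt always reflects how many buffers have been made
 * available to the NIC.
 */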

static int gve_rx_alloc_ring(struct gve_priv *priv, int idx)
{
        struct gve_rx_ring *rx = &priv->rx[idx];
        struct device *hdev = &priv->pdev->dev;
        u32 slots, npages;
        int filled_pages;
        size_t bytes;
        int err;

        netif_dbg(priv, drv, priv->dev, "allocating rx ring\n");
        /* Make sure everything is zeroed to start with */
        memset(rx, 0, sizeof(*rx));

        rx->gve = priv;
        rx->q_num = idx;

        slots = priv->rx_data_slot_cnt;
        rx->mask = slots - 1;
        rx->data.raw_addressing = priv->queue_format == GVE_GQI_RDA_FORMAT;

        /* alloc rx data ring */
        bytes = sizeof(*rx->data.data_ring) * slots;
        rx->data.data_ring = dma_alloc_coherent(hdev, bytes,
                                                &rx->data.data_bus,
                                                GFP_KERNEL);
        if (!rx->data.data_ring)
                return -ENOMEM;
        filled_pages = gve_prefill_rx_pages(rx);
        if (filled_pages < 0) {
                err = -ENOMEM;
                goto abort_with_slots;
        }
        rx->fill_cnt = filled_pages;
        /* Ensure data ring slots (packet buffers) are visible. */
        dma_wmb();

        /* Alloc gve_queue_resources */
        rx->q_resources =
                dma_alloc_coherent(hdev,
                                   sizeof(*rx->q_resources),
                                   &rx->q_resources_bus,
                                   GFP_KERNEL);
        if (!rx->q_resources) {
                err = -ENOMEM;
                goto abort_filled;
        }
        netif_dbg(priv, drv, priv->dev, "rx[%d]->data.data_bus=%lx\n", idx,
                  (unsigned long)rx->data.data_bus);

        /* alloc rx desc ring */
        bytes = sizeof(struct gve_rx_desc) * priv->rx_desc_cnt;
        npages = bytes / PAGE_SIZE;
        if (npages * PAGE_SIZE != bytes) {
                err = -EIO;
                goto abort_with_q_resources;
        }

        rx->desc.desc_ring = dma_alloc_coherent(hdev, bytes, &rx->desc.bus,
                                                GFP_KERNEL);
        if (!rx->desc.desc_ring) {
                err = -ENOMEM;
                goto abort_with_q_resources;
        }
        rx->cnt = 0;
        rx->db_threshold = priv->rx_desc_cnt / 2;
        rx->desc.seqno = 1;
        gve_rx_add_to_block(priv, idx);

        return 0;

abort_with_q_resources:
        dma_free_coherent(hdev, sizeof(*rx->q_resources),
                          rx->q_resources, rx->q_resources_bus);
        rx->q_resources = NULL;
abort_filled:
        gve_rx_unfill_pages(priv, rx);
abort_with_slots:
        bytes = sizeof(*rx->data.data_ring) * slots;
        dma_free_coherent(hdev, bytes, rx->data.data_ring, rx->data.data_bus);
        rx->data.data_ring = NULL;

        return err;
}

int gve_rx_alloc_rings(struct gve_priv *priv)
{
        int err = 0;
        int i;

        for (i = 0; i < priv->rx_cfg.num_queues; i++) {
                err = gve_rx_alloc_ring(priv, i);
                if (err) {
                        netif_err(priv, drv, priv->dev,
                                  "Failed to alloc rx ring=%d: err=%d\n",
                                  i, err);
                        break;
                }
        }
        /* Unallocate if there was an error */
        if (err) {
                int j;

                for (j = 0; j < i; j++)
                        gve_rx_free_ring(priv, j);
        }
        return err;
}

void gve_rx_free_rings_gqi(struct gve_priv *priv)
{
        int i;

        for (i = 0; i < priv->rx_cfg.num_queues; i++)
                gve_rx_free_ring(priv, i);
}

void gve_rx_write_doorbell(struct gve_priv *priv, struct gve_rx_ring *rx)
{
        u32 db_idx = be32_to_cpu(rx->q_resources->db_index);

        iowrite32be(rx->fill_cnt, &priv->db_bar2[db_idx]);
}

static enum pkt_hash_types gve_rss_type(__be16 pkt_flags)
{
        if (likely(pkt_flags & (GVE_RXF_TCP | GVE_RXF_UDP)))
                return PKT_HASH_TYPE_L4;
        if (pkt_flags & (GVE_RXF_IPV4 | GVE_RXF_IPV6))
                return PKT_HASH_TYPE_L3;
        return PKT_HASH_TYPE_L2;
}

static struct sk_buff *gve_rx_add_frags(struct napi_struct *napi,
                                        struct gve_rx_slot_page_info *page_info,
                                        u16 len)
{
        struct sk_buff *skb = napi_get_frags(napi);

        if (unlikely(!skb))
                return NULL;

        skb_add_rx_frag(skb, 0, page_info->page,
                        page_info->page_offset + GVE_RX_PAD,
                        len, PAGE_SIZE / 2);

        return skb;
}

static void gve_rx_flip_buff(struct gve_rx_slot_page_info *page_info, __be64 *slot_addr)
{
        const __be64 offset = cpu_to_be64(PAGE_SIZE / 2);

        /* "flip" to other packet buffer on this page */
        page_info->page_offset ^= PAGE_SIZE / 2;
        *(slot_addr) ^= offset;
}
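
/* A rough sketch of what gve_rx_flip_buff() does on a 4K page: page_offset
 * toggles 0 -> 2048 -> 0 on successive flips, and the slot address toggles
 * the same bit because cpu_to_be64(x) ^ cpu_to_be64(y) == cpu_to_be64(x ^ y)
 * (byte swapping only permutes bytes). Pages are page-aligned, so that
 * half-page bit of the DMA address is otherwise zero and doubles as the
 * buffer-half selector; gve_rx_free_buffer() masks it back off with
 * GVE_DATA_SLOT_ADDR_PAGE_MASK before unmapping.
 */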

static bool gve_rx_can_flip_buffers(struct net_device *netdev)
{
        return PAGE_SIZE == 4096
                ? netdev->mtu + GVE_RX_PAD + ETH_HLEN <= PAGE_SIZE / 2 : false;
}

static int gve_rx_can_recycle_buffer(struct page *page)
{
        int pagecount = page_count(page);

        /* This page is not being used by any SKBs - reuse */
        if (pagecount == 1)
                return 1;
        /* This page is still being used by an SKB - we can't reuse */
        else if (pagecount >= 2)
                return 0;
        WARN(pagecount < 1, "Pagecount should never be < 1");
        return -1;
}

static struct sk_buff *
gve_rx_raw_addressing(struct device *dev, struct net_device *netdev,
                      struct gve_rx_slot_page_info *page_info, u16 len,
                      struct napi_struct *napi,
                      union gve_rx_data_slot *data_slot)
{
        struct sk_buff *skb;

        skb = gve_rx_add_frags(napi, page_info, len);
        if (!skb)
                return NULL;

        /* Optimistically stop the kernel from freeing the page by increasing
         * the page bias. We will check the refcount in refill to determine if
         * we need to alloc a new page.
         */
        get_page(page_info->page);

        return skb;
}

static struct sk_buff *
gve_rx_qpl(struct device *dev, struct net_device *netdev,
           struct gve_rx_ring *rx, struct gve_rx_slot_page_info *page_info,
           u16 len, struct napi_struct *napi,
           union gve_rx_data_slot *data_slot)
{
        struct sk_buff *skb;

        /* if raw_addressing mode is not enabled gvnic can only receive into
         * registered segments. If the buffer can't be recycled, our only
         * choice is to copy the data out of it so that we can return it to the
         * device.
         */
        if (page_info->can_flip) {
                skb = gve_rx_add_frags(napi, page_info, len);
                /* No point in recycling if we didn't get the skb */
                if (skb) {
                        /* Make sure that the page isn't freed. */
                        get_page(page_info->page);
                        gve_rx_flip_buff(page_info, &data_slot->qpl_offset);
                }
        } else {
                skb = gve_rx_copy(netdev, napi, page_info, len, GVE_RX_PAD);
                if (skb) {
                        u64_stats_update_begin(&rx->statss);
                        rx->rx_copied_pkt++;
                        u64_stats_update_end(&rx->statss);
                }
        }
        return skb;
}
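
/* How the page refcounts are used by the receive path below: the driver
 * holds one reference of its own, and gve_rx_raw_addressing()/gve_rx_qpl()
 * take an extra one with get_page() whenever a buffer half is attached to
 * an skb. A page_count() of 1 therefore means the stack has released every
 * skb that pointed into the page and both halves may be reused; anything
 * higher means a half is still in flight, and a count below 1 is a
 * refcounting bug (reported as -1 and handled with a reset in QPL mode).
 */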

static bool gve_rx(struct gve_rx_ring *rx, struct gve_rx_desc *rx_desc,
                   netdev_features_t feat, u32 idx)
{
        struct gve_rx_slot_page_info *page_info;
        struct gve_priv *priv = rx->gve;
        struct napi_struct *napi = &priv->ntfy_blocks[rx->ntfy_id].napi;
        struct net_device *dev = priv->dev;
        union gve_rx_data_slot *data_slot;
        struct sk_buff *skb = NULL;
        dma_addr_t page_bus;
        u16 len;

        /* drop this packet */
        if (unlikely(rx_desc->flags_seq & GVE_RXF_ERR)) {
                u64_stats_update_begin(&rx->statss);
                rx->rx_desc_err_dropped_pkt++;
                u64_stats_update_end(&rx->statss);
                return false;
        }

        len = be16_to_cpu(rx_desc->len) - GVE_RX_PAD;
        page_info = &rx->data.page_info[idx];

        data_slot = &rx->data.data_ring[idx];
        page_bus = (rx->data.raw_addressing) ?
                be64_to_cpu(data_slot->addr) & GVE_DATA_SLOT_ADDR_PAGE_MASK :
                rx->data.qpl->page_buses[idx];
        dma_sync_single_for_cpu(&priv->pdev->dev, page_bus,
                                PAGE_SIZE, DMA_FROM_DEVICE);

        if (len <= priv->rx_copybreak) {
                /* Just copy small packets */
                skb = gve_rx_copy(dev, napi, page_info, len, GVE_RX_PAD);
                u64_stats_update_begin(&rx->statss);
                rx->rx_copied_pkt++;
                rx->rx_copybreak_pkt++;
                u64_stats_update_end(&rx->statss);
        } else {
                u8 can_flip = gve_rx_can_flip_buffers(dev);
                int recycle = 0;

                if (can_flip) {
                        recycle = gve_rx_can_recycle_buffer(page_info->page);
                        if (recycle < 0) {
                                if (!rx->data.raw_addressing)
                                        gve_schedule_reset(priv);
                                return false;
                        }
                }

                page_info->can_flip = can_flip && recycle;
                if (rx->data.raw_addressing) {
                        skb = gve_rx_raw_addressing(&priv->pdev->dev, dev,
                                                    page_info, len, napi,
                                                    data_slot);
                } else {
                        skb = gve_rx_qpl(&priv->pdev->dev, dev, rx,
                                         page_info, len, napi, data_slot);
                }
        }

        if (!skb) {
                u64_stats_update_begin(&rx->statss);
                rx->rx_skb_alloc_fail++;
                u64_stats_update_end(&rx->statss);
                return false;
        }

        if (likely(feat & NETIF_F_RXCSUM)) {
                /* NIC passes up the partial sum */
                if (rx_desc->csum)
                        skb->ip_summed = CHECKSUM_COMPLETE;
                else
                        skb->ip_summed = CHECKSUM_NONE;
                skb->csum = csum_unfold(rx_desc->csum);
        }

        /* parse flags & pass relevant info up */
        if (likely(feat & NETIF_F_RXHASH) &&
            gve_needs_rss(rx_desc->flags_seq))
                skb_set_hash(skb, be32_to_cpu(rx_desc->rss_hash),
                             gve_rss_type(rx_desc->flags_seq));

        if (skb_is_nonlinear(skb))
                napi_gro_frags(napi);
        else
                napi_gro_receive(napi, skb);
        return true;
}

static bool gve_rx_work_pending(struct gve_rx_ring *rx)
{
        struct gve_rx_desc *desc;
        __be16 flags_seq;
        u32 next_idx;

        next_idx = rx->cnt & rx->mask;
        desc = rx->desc.desc_ring + next_idx;

        flags_seq = desc->flags_seq;
        /* Make sure we have synchronized the seq no with the device */
        smp_rmb();

        return (GVE_SEQNO(flags_seq) == rx->desc.seqno);
}
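
/* Descriptor ownership in this queue format is tracked with the sequence
 * number embedded in flags_seq rather than a head/tail register: a
 * descriptor is ready for the driver when GVE_SEQNO(desc->flags_seq)
 * matches the ring's expected rx->desc.seqno, which is seeded to 1 at ring
 * setup and advanced by gve_next_seqno() for every descriptor consumed in
 * gve_clean_rx_done() below.
 */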

static bool gve_rx_refill_buffers(struct gve_priv *priv, struct gve_rx_ring *rx)
{
        int refill_target = rx->mask + 1;
        u32 fill_cnt = rx->fill_cnt;

        while (fill_cnt - rx->cnt < refill_target) {
                struct gve_rx_slot_page_info *page_info;
                u32 idx = fill_cnt & rx->mask;

                page_info = &rx->data.page_info[idx];
                if (page_info->can_flip) {
                        /* The other half of the page is free because it was
                         * free when we processed the descriptor. Flip to it.
                         */
                        union gve_rx_data_slot *data_slot =
                                &rx->data.data_ring[idx];

                        gve_rx_flip_buff(page_info, &data_slot->addr);
                        page_info->can_flip = 0;
                } else {
                        /* It is possible that the networking stack has already
                         * finished processing all outstanding packets in the
                         * buffer and it can be reused. Flipping is unnecessary
                         * here - if the networking stack still owns half the
                         * page it is impossible to tell which half. Either the
                         * whole page is free or it needs to be replaced.
                         */
                        int recycle = gve_rx_can_recycle_buffer(page_info->page);

                        if (recycle < 0) {
                                if (!rx->data.raw_addressing)
                                        gve_schedule_reset(priv);
                                return false;
                        }
                        if (!recycle) {
                                /* We can't reuse the buffer - alloc a new one */
                                union gve_rx_data_slot *data_slot =
                                        &rx->data.data_ring[idx];
                                struct device *dev = &priv->pdev->dev;

                                gve_rx_free_buffer(dev, page_info, data_slot);
                                page_info->page = NULL;
                                if (gve_rx_alloc_buffer(priv, dev, page_info, data_slot))
                                        break;
                        }
                }
                fill_cnt++;
        }
        rx->fill_cnt = fill_cnt;
        return true;
}

bool gve_clean_rx_done(struct gve_rx_ring *rx, int budget,
                       netdev_features_t feat)
{
        struct gve_priv *priv = rx->gve;
        u32 work_done = 0, packets = 0;
        struct gve_rx_desc *desc;
        u32 cnt = rx->cnt;
        u32 idx = cnt & rx->mask;
        u64 bytes = 0;

        desc = rx->desc.desc_ring + idx;
        while ((GVE_SEQNO(desc->flags_seq) == rx->desc.seqno) &&
               work_done < budget) {
                bool dropped;

                netif_info(priv, rx_status, priv->dev,
                           "[%d] idx=%d desc=%p desc->flags_seq=0x%x\n",
                           rx->q_num, idx, desc, desc->flags_seq);
                netif_info(priv, rx_status, priv->dev,
                           "[%d] seqno=%d rx->desc.seqno=%d\n",
                           rx->q_num, GVE_SEQNO(desc->flags_seq),
                           rx->desc.seqno);
                dropped = !gve_rx(rx, desc, feat, idx);
                if (!dropped) {
                        bytes += be16_to_cpu(desc->len) - GVE_RX_PAD;
                        packets++;
                }
                cnt++;
                idx = cnt & rx->mask;
                desc = rx->desc.desc_ring + idx;
                rx->desc.seqno = gve_next_seqno(rx->desc.seqno);
                work_done++;
        }

        if (!work_done && rx->fill_cnt - cnt > rx->db_threshold)
                return false;

        u64_stats_update_begin(&rx->statss);
        rx->rpackets += packets;
        rx->rbytes += bytes;
        u64_stats_update_end(&rx->statss);
        rx->cnt = cnt;

        /* restock ring slots */
        if (!rx->data.raw_addressing) {
                /* In QPL mode buffs are refilled as the desc are processed */
                rx->fill_cnt += work_done;
        } else if (rx->fill_cnt - cnt <= rx->db_threshold) {
                /* In raw addressing mode buffs are only refilled if the avail
                 * falls below a threshold.
                 */
                if (!gve_rx_refill_buffers(priv, rx))
                        return false;

                /* If we were not able to completely refill buffers, we'll want
                 * to schedule this queue for work again to refill buffers.
                 */
                if (rx->fill_cnt - cnt <= rx->db_threshold) {
                        gve_rx_write_doorbell(priv, rx);
                        return true;
                }
        }

        gve_rx_write_doorbell(priv, rx);
        return gve_rx_work_pending(rx);
}

bool gve_rx_poll(struct gve_notify_block *block, int budget)
{
        struct gve_rx_ring *rx = block->rx;
        netdev_features_t feat;
        bool repoll = false;

        feat = block->napi.dev->features;

        /* If budget is 0, do all the work */
        if (budget == 0)
                budget = INT_MAX;

        if (budget > 0)
                repoll |= gve_clean_rx_done(rx, budget, feat);
        else
                repoll |= gve_rx_work_pending(rx);
        return repoll;
}