/* SPDX-License-Identifier: GPL-2.0
 *
 * page_pool.c
 *	Author:	Jesper Dangaard Brouer <netoptimizer@brouer.com>
 *	Copyright (C) 2016 Red Hat, Inc.
 */

#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/device.h>

#include <net/page_pool.h>
#include <net/xdp.h>

#include <linux/dma-direction.h>
#include <linux/dma-mapping.h>
#include <linux/page-flags.h>
#include <linux/mm.h> /* for __put_page() */
#include <linux/poison.h>

#include <trace/events/page_pool.h>

#define DEFER_TIME		(msecs_to_jiffies(1000))
#define DEFER_WARN_INTERVAL	(60 * HZ)

static int page_pool_init(struct page_pool *pool,
			  const struct page_pool_params *params)
{
	unsigned int ring_qsize = 1024; /* Default */

	memcpy(&pool->p, params, sizeof(pool->p));

	/* Validate only known flags were used */
	if (pool->p.flags & ~(PP_FLAG_ALL))
		return -EINVAL;

	if (pool->p.pool_size)
		ring_qsize = pool->p.pool_size;

	/* Sanity limit mem that can be pinned down */
	if (ring_qsize > 32768)
		return -E2BIG;

	/* DMA direction is either DMA_FROM_DEVICE or DMA_BIDIRECTIONAL.
	 * DMA_BIDIRECTIONAL also allows the page to be used for DMA
	 * transmission, which is the XDP_TX use-case.
	 */
	if (pool->p.flags & PP_FLAG_DMA_MAP) {
		if ((pool->p.dma_dir != DMA_FROM_DEVICE) &&
		    (pool->p.dma_dir != DMA_BIDIRECTIONAL))
			return -EINVAL;
	}

	if (pool->p.flags & PP_FLAG_DMA_SYNC_DEV) {
		/* In order to request DMA-sync-for-device the page
		 * needs to be mapped
		 */
		if (!(pool->p.flags & PP_FLAG_DMA_MAP))
			return -EINVAL;

		if (!pool->p.max_len)
			return -EINVAL;

		/* pool->p.offset has to be set according to the address
		 * offset used by the DMA engine to start copying rx data
		 */
	}

	if (ptr_ring_init(&pool->ring, ring_qsize, GFP_KERNEL) < 0)
		return -ENOMEM;

	atomic_set(&pool->pages_state_release_cnt, 0);

	/* A driver calling page_pool_create() must also call page_pool_destroy() */
	refcount_set(&pool->user_cnt, 1);

	if (pool->p.flags & PP_FLAG_DMA_MAP)
		get_device(pool->p.dev);

	return 0;
}

struct page_pool *page_pool_create(const struct page_pool_params *params)
{
	struct page_pool *pool;
	int err;

	pool = kzalloc_node(sizeof(*pool), GFP_KERNEL, params->nid);
	if (!pool)
		return ERR_PTR(-ENOMEM);

	err = page_pool_init(pool, params);
	if (err < 0) {
		pr_warn("%s() gave up with errno %d\n", __func__, err);
		kfree(pool);
		return ERR_PTR(err);
	}

	return pool;
}
EXPORT_SYMBOL(page_pool_create);
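/* Example (illustrative sketch only, not compiled here): how a driver might
 * create a DMA-mapped pool per RX queue. The device pointer, ring size and
 * headroom below are hypothetical driver-specific values, not part of this
 * file's API.
 *
 *	struct page_pool_params pp_params = {
 *		.flags		= PP_FLAG_DMA_MAP | PP_FLAG_DMA_SYNC_DEV,
 *		.order		= 0,
 *		.pool_size	= 256,			// e.g. the RX ring size
 *		.nid		= NUMA_NO_NODE,
 *		.dev		= &pdev->dev,		// hypothetical PCI device
 *		.dma_dir	= DMA_FROM_DEVICE,	// DMA_BIDIRECTIONAL for XDP_TX
 *		.offset		= XDP_PACKET_HEADROOM,	// where HW starts writing
 *		.max_len	= PAGE_SIZE - XDP_PACKET_HEADROOM,
 *	};
 *	struct page_pool *pool;
 *
 *	pool = page_pool_create(&pp_params);
 *	if (IS_ERR(pool))
 *		return PTR_ERR(pool);
 */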
static void page_pool_return_page(struct page_pool *pool, struct page *page);

noinline
static struct page *page_pool_refill_alloc_cache(struct page_pool *pool)
{
	struct ptr_ring *r = &pool->ring;
	struct page *page;
	int pref_nid; /* preferred NUMA node */

	/* Quicker fallback, avoid locks when ring is empty */
	if (__ptr_ring_empty(r))
		return NULL;

	/* Softirq guarantees the CPU, and thus the NUMA node, is stable. This
	 * assumes the CPU refilling the driver RX-ring also runs RX-NAPI.
	 */
#ifdef CONFIG_NUMA
	pref_nid = (pool->p.nid == NUMA_NO_NODE) ? numa_mem_id() : pool->p.nid;
#else
	/* Ignore pool->p.nid setting if !CONFIG_NUMA, helps compiler */
	pref_nid = numa_mem_id(); /* will be zero like page_to_nid() */
#endif

	/* Slower-path: Get pages from locked ring queue */
	spin_lock(&r->consumer_lock);

	/* Refill alloc array, but only if NUMA match */
	do {
		page = __ptr_ring_consume(r);
		if (unlikely(!page))
			break;

		if (likely(page_to_nid(page) == pref_nid)) {
			pool->alloc.cache[pool->alloc.count++] = page;
		} else {
			/* NUMA mismatch;
			 * (1) release 1 page to page-allocator and
			 * (2) break out to fall through to alloc_pages_node.
			 * This limits stress on the page buddy allocator.
			 */
			page_pool_return_page(pool, page);
			page = NULL;
			break;
		}
	} while (pool->alloc.count < PP_ALLOC_CACHE_REFILL);

	/* Return last page */
	if (likely(pool->alloc.count > 0))
		page = pool->alloc.cache[--pool->alloc.count];

	spin_unlock(&r->consumer_lock);
	return page;
}

/* fast path */
static struct page *__page_pool_get_cached(struct page_pool *pool)
{
	struct page *page;

	/* Caller MUST guarantee safe non-concurrent access, e.g. softirq */
	if (likely(pool->alloc.count)) {
		/* Fast-path */
		page = pool->alloc.cache[--pool->alloc.count];
	} else {
		page = page_pool_refill_alloc_cache(pool);
	}

	return page;
}

static void page_pool_dma_sync_for_device(struct page_pool *pool,
					  struct page *page,
					  unsigned int dma_sync_size)
{
	dma_addr_t dma_addr = page_pool_get_dma_addr(page);

	dma_sync_size = min(dma_sync_size, pool->p.max_len);
	dma_sync_single_range_for_device(pool->p.dev, dma_addr,
					 pool->p.offset, dma_sync_size,
					 pool->p.dma_dir);
}

static bool page_pool_dma_map(struct page_pool *pool, struct page *page)
{
	dma_addr_t dma;

	/* Setup DMA mapping: use 'struct page' area for storing DMA-addr
	 * since dma_addr_t can be either 32 or 64 bits and does not always fit
	 * into page private data (i.e. a 32-bit CPU with 64-bit DMA capabilities).
	 * This mapping is kept for the lifetime of the page, until it leaves
	 * the pool.
	 */
	dma = dma_map_page_attrs(pool->p.dev, page, 0,
				 (PAGE_SIZE << pool->p.order),
				 pool->p.dma_dir, DMA_ATTR_SKIP_CPU_SYNC);
	if (dma_mapping_error(pool->p.dev, dma))
		return false;

	page_pool_set_dma_addr(page, dma);

	if (pool->p.flags & PP_FLAG_DMA_SYNC_DEV)
		page_pool_dma_sync_for_device(pool, page, pool->p.max_len);

	return true;
}
static struct page *__page_pool_alloc_page_order(struct page_pool *pool,
						 gfp_t gfp)
{
	struct page *page;

	gfp |= __GFP_COMP;
	page = alloc_pages_node(pool->p.nid, gfp, pool->p.order);
	if (unlikely(!page))
		return NULL;

	if ((pool->p.flags & PP_FLAG_DMA_MAP) &&
	    unlikely(!page_pool_dma_map(pool, page))) {
		put_page(page);
		return NULL;
	}

	page->pp_magic |= PP_SIGNATURE;

	/* Track how many pages are held 'in-flight' */
	pool->pages_state_hold_cnt++;
	trace_page_pool_state_hold(pool, page, pool->pages_state_hold_cnt);
	return page;
}

/* slow path */
noinline
static struct page *__page_pool_alloc_pages_slow(struct page_pool *pool,
						 gfp_t gfp)
{
	const int bulk = PP_ALLOC_CACHE_REFILL;
	unsigned int pp_flags = pool->p.flags;
	unsigned int pp_order = pool->p.order;
	struct page *page;
	int i, nr_pages;

	/* Don't support bulk alloc for high-order pages */
	if (unlikely(pp_order))
		return __page_pool_alloc_page_order(pool, gfp);

	/* Unnecessary as alloc cache is empty, but guarantees zero count */
	if (unlikely(pool->alloc.count > 0))
		return pool->alloc.cache[--pool->alloc.count];

	/* Mark empty alloc.cache slots "empty" for alloc_pages_bulk_array */
	memset(&pool->alloc.cache, 0, sizeof(void *) * bulk);

	nr_pages = alloc_pages_bulk_array(gfp, bulk, pool->alloc.cache);
	if (unlikely(!nr_pages))
		return NULL;

	/* Pages have been filled into the alloc.cache array, but the count is
	 * zero and the pages have not yet been (possibly) DMA mapped.
	 */
	for (i = 0; i < nr_pages; i++) {
		page = pool->alloc.cache[i];
		if ((pp_flags & PP_FLAG_DMA_MAP) &&
		    unlikely(!page_pool_dma_map(pool, page))) {
			put_page(page);
			continue;
		}
		page->pp_magic |= PP_SIGNATURE;
		pool->alloc.cache[pool->alloc.count++] = page;
		/* Track how many pages are held 'in-flight' */
		pool->pages_state_hold_cnt++;
		trace_page_pool_state_hold(pool, page,
					   pool->pages_state_hold_cnt);
	}

	/* Return last page */
	if (likely(pool->alloc.count > 0))
		page = pool->alloc.cache[--pool->alloc.count];
	else
		page = NULL;

	/* A page just allocated should/must have refcnt 1. */
	return page;
}

/* For drivers using page_pool to replace alloc_pages() API calls, with a
 * synchronization guarantee provided for the allocation side.
 */
struct page *page_pool_alloc_pages(struct page_pool *pool, gfp_t gfp)
{
	struct page *page;

	/* Fast-path: Get a page from cache */
	page = __page_pool_get_cached(pool);
	if (page)
		return page;

	/* Slow-path: cache empty, do real allocation */
	page = __page_pool_alloc_pages_slow(pool, gfp);
	return page;
}
EXPORT_SYMBOL(page_pool_alloc_pages);
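/* Example (illustrative sketch only, not compiled here): an RX ring refill
 * loop using page_pool_alloc_pages() and the DMA address stored by the pool.
 * The descriptor layout and the rxq structure are hypothetical driver
 * context.
 *
 *	while (rxq->free_descs) {
 *		struct page *page;
 *
 *		page = page_pool_alloc_pages(rxq->page_pool,
 *					     GFP_ATOMIC | __GFP_NOWARN);
 *		if (unlikely(!page))
 *			break;
 *
 *		// The pool mapped the page; the driver only reads the address
 *		rxq->desc[rxq->next].addr = page_pool_get_dma_addr(page) +
 *					    rxq->page_pool->p.offset;
 *		rxq->rx_page[rxq->next] = page;
 *		rxq->next = (rxq->next + 1) & rxq->mask;
 *		rxq->free_descs--;
 *	}
 */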
/* Calculate distance between two u32 values, valid if distance is below 2^(31)
 * https://en.wikipedia.org/wiki/Serial_number_arithmetic#General_Solution
 */
#define _distance(a, b)	(s32)((a) - (b))

static s32 page_pool_inflight(struct page_pool *pool)
{
	u32 release_cnt = atomic_read(&pool->pages_state_release_cnt);
	u32 hold_cnt = READ_ONCE(pool->pages_state_hold_cnt);
	s32 inflight;

	inflight = _distance(hold_cnt, release_cnt);

	trace_page_pool_release(pool, inflight, hold_cnt, release_cnt);
	WARN(inflight < 0, "Negative(%d) inflight packet-pages", inflight);

	return inflight;
}

/* Disconnect a page from its page_pool. API users may need to do this to
 * allow the page to be used as a regular page (one that will eventually be
 * returned to the normal page-allocator via put_page()).
 */
void page_pool_release_page(struct page_pool *pool, struct page *page)
{
	dma_addr_t dma;
	int count;

	if (!(pool->p.flags & PP_FLAG_DMA_MAP))
		/* Always account for inflight pages, even if we didn't
		 * map them
		 */
		goto skip_dma_unmap;

	dma = page_pool_get_dma_addr(page);

	/* When page is unmapped, it cannot be returned to our pool */
	dma_unmap_page_attrs(pool->p.dev, dma,
			     PAGE_SIZE << pool->p.order, pool->p.dma_dir,
			     DMA_ATTR_SKIP_CPU_SYNC);
	page_pool_set_dma_addr(page, 0);
skip_dma_unmap:
	page->pp_magic = 0;

	/* This may be the last page returned, releasing the pool, so
	 * it is not safe to reference pool afterwards.
	 */
	count = atomic_inc_return(&pool->pages_state_release_cnt);
	trace_page_pool_state_release(pool, page, count);
}
EXPORT_SYMBOL(page_pool_release_page);
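/* Example (illustrative sketch only, not compiled here): a driver handing a
 * pool page up the stack without recycling support disconnects it first, so
 * a later put_page() from the network stack frees it normally. The skb
 * construction details and the rxq/napi variables are hypothetical.
 *
 *	skb = build_skb(page_address(page), PAGE_SIZE);
 *	if (likely(skb)) {
 *		skb_reserve(skb, rxq->page_pool->p.offset);
 *		skb_put(skb, len);
 *		page_pool_release_page(rxq->page_pool, page);
 *		napi_gro_receive(napi, skb);
 *	}
 */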
/* Return a page to the page allocator, cleaning up our state */
static void page_pool_return_page(struct page_pool *pool, struct page *page)
{
	page_pool_release_page(pool, page);

	put_page(page);
	/* An optimization would be to call __free_pages(page, pool->p.order)
	 * knowing page is not part of page-cache (thus avoiding a
	 * __page_cache_release() call).
	 */
}

static bool page_pool_recycle_in_ring(struct page_pool *pool, struct page *page)
{
	int ret;
	/* BH protection not needed if current is serving softirq */
	if (in_serving_softirq())
		ret = ptr_ring_produce(&pool->ring, page);
	else
		ret = ptr_ring_produce_bh(&pool->ring, page);

	return (ret == 0) ? true : false;
}

/* Only allow direct recycling in special circumstances, into the
 * alloc side cache. E.g. during RX-NAPI processing for the XDP_DROP use-case.
 *
 * Caller must provide appropriate safe context.
 */
static bool page_pool_recycle_in_cache(struct page *page,
				       struct page_pool *pool)
{
	if (unlikely(pool->alloc.count == PP_ALLOC_CACHE_SIZE))
		return false;

	/* Caller MUST have verified/know (page_ref_count(page) == 1) */
	pool->alloc.cache[pool->alloc.count++] = page;
	return true;
}

/* If the page refcnt == 1, this will try to recycle the page.
 * If PP_FLAG_DMA_SYNC_DEV is set, we'll try to sync the DMA area for
 * the configured size min(dma_sync_size, pool->p.max_len).
 * If the page refcnt != 1, then the page will be returned to the memory
 * subsystem.
 */
static __always_inline struct page *
__page_pool_put_page(struct page_pool *pool, struct page *page,
		     unsigned int dma_sync_size, bool allow_direct)
{
	/* This allocator is optimized for the XDP mode that uses
	 * one-frame-per-page, but has fallbacks that act like the
	 * regular page allocator APIs.
	 *
	 * refcnt == 1 means page_pool owns the page, and can recycle it.
	 *
	 * The page is NOT reusable when it was allocated while the system
	 * was under memory pressure (page_is_pfmemalloc()).
	 */
	if (likely(page_ref_count(page) == 1 && !page_is_pfmemalloc(page))) {
		/* Read barrier done in page_ref_count / READ_ONCE */

		if (pool->p.flags & PP_FLAG_DMA_SYNC_DEV)
			page_pool_dma_sync_for_device(pool, page,
						      dma_sync_size);

		if (allow_direct && in_serving_softirq() &&
		    page_pool_recycle_in_cache(page, pool))
			return NULL;

		/* Page found as candidate for recycling */
		return page;
	}
	/* Fallback/non-XDP mode: API user has an elevated refcnt.
	 *
	 * Many drivers split up the page into fragments, and some
	 * want to keep doing this to save memory and do refcnt based
	 * recycling. Support this use case too, to ease drivers
	 * switching between XDP/non-XDP.
	 *
	 * In case page_pool maintains the DMA mapping, the API user must
	 * call page_pool_put_page() once. In this elevated refcnt case,
	 * the DMA mapping is unmapped/released, as the driver is likely
	 * doing refcnt based recycle tricks, meaning another process will
	 * be invoking put_page().
	 */
	/* Do not replace this with page_pool_return_page() */
	page_pool_release_page(pool, page);
	put_page(page);

	return NULL;
}

void page_pool_put_page(struct page_pool *pool, struct page *page,
			unsigned int dma_sync_size, bool allow_direct)
{
	page = __page_pool_put_page(pool, page, dma_sync_size, allow_direct);
	if (page && !page_pool_recycle_in_ring(pool, page)) {
		/* Ring full, fall back to freeing the page */
		page_pool_return_page(pool, page);
	}
}
EXPORT_SYMBOL(page_pool_put_page);
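/* Example (illustrative sketch only, not compiled here): recycling from
 * RX-NAPI on XDP_DROP, using direct recycling into the alloc-side cache.
 * The verdict switch and the rxq/pkt_len variables are hypothetical driver
 * context.
 *
 *	case XDP_DROP:
 *		// Safe: running in NAPI/softirq context and the driver owns
 *		// the page (refcnt == 1), so allow_direct can be true.
 *		page_pool_put_page(rxq->page_pool, page, pkt_len, true);
 *		break;
 *
 * Passing the actual packet length as dma_sync_size limits the
 * dma-sync-for-device to the bytes the hardware may have written; passing -1
 * syncs the full pool->p.max_len area instead.
 */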
/* Caller must not use data area after call, as this function overwrites it */
void page_pool_put_page_bulk(struct page_pool *pool, void **data,
			     int count)
{
	int i, bulk_len = 0;

	for (i = 0; i < count; i++) {
		struct page *page = virt_to_head_page(data[i]);

		page = __page_pool_put_page(pool, page, -1, false);
		/* Approved for bulk recycling in ptr_ring cache */
		if (page)
			data[bulk_len++] = page;
	}

	if (unlikely(!bulk_len))
		return;

	/* Bulk producer into ptr_ring page_pool cache */
	page_pool_ring_lock(pool);
	for (i = 0; i < bulk_len; i++) {
		if (__ptr_ring_produce(&pool->ring, data[i]))
			break; /* ring full */
	}
	page_pool_ring_unlock(pool);

	/* Hopefully all pages were returned into the ptr_ring */
	if (likely(i == bulk_len))
		return;

	/* ptr_ring cache full, free remaining pages outside producer lock
	 * since put_page() with refcnt == 1 can be an expensive operation
	 */
	for (; i < bulk_len; i++)
		page_pool_return_page(pool, data[i]);
}
EXPORT_SYMBOL(page_pool_put_page_bulk);
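/* Example (illustrative sketch only, not compiled here): bulk-returning a
 * batch of freed frame data pointers to one pool, e.g. from a TX completion
 * path that gathered them into a local array. The batch array, its size and
 * the xdpf variable are hypothetical; all entries must belong to the same
 * page_pool.
 *
 *	void *batch[XDP_BULK_QUEUE_SIZE];
 *	int n = 0;
 *
 *	// ... collect page virtual addresses of completed frames ...
 *	batch[n++] = xdpf->data;
 *
 *	if (n)
 *		page_pool_put_page_bulk(pool, batch, n);
 */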
static void page_pool_empty_ring(struct page_pool *pool)
{
	struct page *page;

	/* Empty recycle ring */
	while ((page = ptr_ring_consume_bh(&pool->ring))) {
		/* Verify the refcnt invariant of cached pages */
		if (!(page_ref_count(page) == 1))
			pr_crit("%s() page_pool refcnt %d violation\n",
				__func__, page_ref_count(page));

		page_pool_return_page(pool, page);
	}
}

static void page_pool_free(struct page_pool *pool)
{
	if (pool->disconnect)
		pool->disconnect(pool);

	ptr_ring_cleanup(&pool->ring, NULL);

	if (pool->p.flags & PP_FLAG_DMA_MAP)
		put_device(pool->p.dev);

	kfree(pool);
}

static void page_pool_empty_alloc_cache_once(struct page_pool *pool)
{
	struct page *page;

	if (pool->destroy_cnt)
		return;

	/* Empty the alloc cache, assuming the caller has made sure the pool
	 * is no longer in use, and that page_pool_alloc_pages() cannot be
	 * called concurrently.
	 */
	while (pool->alloc.count) {
		page = pool->alloc.cache[--pool->alloc.count];
		page_pool_return_page(pool, page);
	}
}

static void page_pool_scrub(struct page_pool *pool)
{
	page_pool_empty_alloc_cache_once(pool);
	pool->destroy_cnt++;

	/* No more consumers should exist, but producers could still
	 * be in-flight.
	 */
	page_pool_empty_ring(pool);
}

static int page_pool_release(struct page_pool *pool)
{
	int inflight;

	page_pool_scrub(pool);
	inflight = page_pool_inflight(pool);
	if (!inflight)
		page_pool_free(pool);

	return inflight;
}

static void page_pool_release_retry(struct work_struct *wq)
{
	struct delayed_work *dwq = to_delayed_work(wq);
	struct page_pool *pool = container_of(dwq, typeof(*pool), release_dw);
	int inflight;

	inflight = page_pool_release(pool);
	if (!inflight)
		return;

	/* Periodic warning */
	if (time_after_eq(jiffies, pool->defer_warn)) {
		int sec = (s32)((u32)jiffies - (u32)pool->defer_start) / HZ;

		pr_warn("%s() stalled pool shutdown %d inflight %d sec\n",
			__func__, inflight, sec);
		pool->defer_warn = jiffies + DEFER_WARN_INTERVAL;
	}

	/* Still not ready to be disconnected, retry later */
	schedule_delayed_work(&pool->release_dw, DEFER_TIME);
}

void page_pool_use_xdp_mem(struct page_pool *pool, void (*disconnect)(void *))
{
	refcount_inc(&pool->user_cnt);
	pool->disconnect = disconnect;
}

void page_pool_destroy(struct page_pool *pool)
{
	if (!pool)
		return;

	if (!page_pool_put(pool))
		return;

	if (!page_pool_release(pool))
		return;

	pool->defer_start = jiffies;
	pool->defer_warn  = jiffies + DEFER_WARN_INTERVAL;

	INIT_DELAYED_WORK(&pool->release_dw, page_pool_release_retry);
	schedule_delayed_work(&pool->release_dw, DEFER_TIME);
}
EXPORT_SYMBOL(page_pool_destroy);

/* Caller must provide appropriate safe context, e.g. NAPI. */
void page_pool_update_nid(struct page_pool *pool, int new_nid)
{
	struct page *page;

	trace_page_pool_update_nid(pool, new_nid);
	pool->p.nid = new_nid;

	/* Flush pool alloc cache, as refill will check NUMA node */
	while (pool->alloc.count) {
		page = pool->alloc.cache[--pool->alloc.count];
		page_pool_return_page(pool, page);
	}
}
EXPORT_SYMBOL(page_pool_update_nid);

bool page_pool_return_skb_page(struct page *page)
{
	struct page_pool *pp;

	page = compound_head(page);
	if (unlikely(page->pp_magic != PP_SIGNATURE))
		return false;

	pp = page->pp;

	/* The driver set this to the memory recycling info. Reset it on
	 * recycle. This will *not* work for a NIC using a split-page memory
	 * model. The page will be returned to the pool here regardless of
	 * whether the 'flipped' fragment is still in use.
	 */
	page->pp = NULL;
	page_pool_put_full_page(pp, page, false);

	return true;
}
EXPORT_SYMBOL(page_pool_return_skb_page);
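/* Example (illustrative sketch only, not compiled here): a teardown order a
 * driver might follow when closing an RX queue. The rxq fields and the
 * xdp_rxq_info usage are hypothetical driver context.
 *
 *	// 1. Stop the RX queue so no new pages are pulled from the pool
 *	napi_disable(&rxq->napi);
 *
 *	// 2. Return the pages still sitting in the RX ring
 *	for (i = 0; i < rxq->ring_size; i++)
 *		if (rxq->rx_page[i])
 *			page_pool_put_full_page(rxq->page_pool,
 *						rxq->rx_page[i], false);
 *
 *	// 3. Unregister the XDP memory model (drops its page_pool user
 *	//    reference), then drop the driver's reference; the pool frees
 *	//    itself once all in-flight pages have been returned, retrying
 *	//    via the deferred release work if necessary.
 *	xdp_rxq_info_unreg(&rxq->xdp_rxq);
 *	page_pool_destroy(rxq->page_pool);
 */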