/* SPDX-License-Identifier: GPL-2.0
 *
 * page_pool.c
 *	Author:	Jesper Dangaard Brouer <netoptimizer@brouer.com>
 *	Copyright (C) 2016 Red Hat, Inc.
 */

#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/device.h>

#include <net/page_pool/helpers.h>
#include <net/xdp.h>

#include <linux/dma-direction.h>
#include <linux/dma-mapping.h>
#include <linux/page-flags.h>
#include <linux/mm.h> /* for put_page() */
#include <linux/poison.h>
#include <linux/ethtool.h>
#include <linux/netdevice.h>

#include <trace/events/page_pool.h>

#define DEFER_TIME (msecs_to_jiffies(1000))
#define DEFER_WARN_INTERVAL (60 * HZ)

#define BIAS_MAX	LONG_MAX

#ifdef CONFIG_PAGE_POOL_STATS
/* alloc_stat_inc is intended to be used in softirq context */
#define alloc_stat_inc(pool, __stat)	(pool->alloc_stats.__stat++)
/* recycle_stat_inc is safe to use when preemption is possible. */
#define recycle_stat_inc(pool, __stat)						\
	do {									\
		struct page_pool_recycle_stats __percpu *s = pool->recycle_stats; \
		this_cpu_inc(s->__stat);					\
	} while (0)

#define recycle_stat_add(pool, __stat, val)					\
	do {									\
		struct page_pool_recycle_stats __percpu *s = pool->recycle_stats; \
		this_cpu_add(s->__stat, val);					\
	} while (0)

static const char pp_stats[][ETH_GSTRING_LEN] = {
	"rx_pp_alloc_fast",
	"rx_pp_alloc_slow",
	"rx_pp_alloc_slow_ho",
	"rx_pp_alloc_empty",
	"rx_pp_alloc_refill",
	"rx_pp_alloc_waive",
	"rx_pp_recycle_cached",
	"rx_pp_recycle_cache_full",
	"rx_pp_recycle_ring",
	"rx_pp_recycle_ring_full",
	"rx_pp_recycle_released_ref",
};

/**
 * page_pool_get_stats() - fetch page pool stats
 * @pool:	pool from which page was allocated
 * @stats:	struct page_pool_stats to fill in
 *
 * Retrieve statistics about the page_pool. This API is only available
 * if the kernel has been configured with ``CONFIG_PAGE_POOL_STATS=y``.
 * The caller passes in a pointer to a caller-allocated struct
 * page_pool_stats, which this API fills in. The caller can then report
 * those stats to the user (perhaps via ethtool, debugfs, etc.).
 */
bool page_pool_get_stats(struct page_pool *pool,
			 struct page_pool_stats *stats)
{
	int cpu = 0;

	if (!stats)
		return false;

	/* The caller is responsible to initialize stats. */
	stats->alloc_stats.fast += pool->alloc_stats.fast;
	stats->alloc_stats.slow += pool->alloc_stats.slow;
	stats->alloc_stats.slow_high_order += pool->alloc_stats.slow_high_order;
	stats->alloc_stats.empty += pool->alloc_stats.empty;
	stats->alloc_stats.refill += pool->alloc_stats.refill;
	stats->alloc_stats.waive += pool->alloc_stats.waive;

	for_each_possible_cpu(cpu) {
		const struct page_pool_recycle_stats *pcpu =
			per_cpu_ptr(pool->recycle_stats, cpu);

		stats->recycle_stats.cached += pcpu->cached;
		stats->recycle_stats.cache_full += pcpu->cache_full;
		stats->recycle_stats.ring += pcpu->ring;
		stats->recycle_stats.ring_full += pcpu->ring_full;
		stats->recycle_stats.released_refcnt += pcpu->released_refcnt;
	}

	return true;
}
EXPORT_SYMBOL(page_pool_get_stats);
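
/* Editor's note: a minimal usage sketch (not part of this file) of the stats
 * and ethtool helpers above, assuming a hypothetical driver whose private
 * struct "foo_priv" owns a single page_pool. All "foo_*" names are
 * illustrative only; the page_pool_* calls are the ones defined in this file.
 *
 *	static int foo_get_sset_count(struct net_device *dev, int sset)
 *	{
 *		if (sset == ETH_SS_STATS)
 *			return page_pool_ethtool_stats_get_count();
 *		return -EOPNOTSUPP;
 *	}
 *
 *	static void foo_get_strings(struct net_device *dev, u32 sset, u8 *data)
 *	{
 *		if (sset == ETH_SS_STATS)
 *			data = page_pool_ethtool_stats_get_strings(data);
 *	}
 *
 *	static void foo_get_ethtool_stats(struct net_device *dev,
 *					  struct ethtool_stats *stats, u64 *data)
 *	{
 *		struct foo_priv *priv = netdev_priv(dev);
 *		struct page_pool_stats pp_stats = { };
 *
 *		if (page_pool_get_stats(priv->page_pool, &pp_stats))
 *			data = page_pool_ethtool_stats_get(data, &pp_stats);
 *	}
 */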

u8 *page_pool_ethtool_stats_get_strings(u8 *data)
{
	int i;

	for (i = 0; i < ARRAY_SIZE(pp_stats); i++) {
		memcpy(data, pp_stats[i], ETH_GSTRING_LEN);
		data += ETH_GSTRING_LEN;
	}

	return data;
}
EXPORT_SYMBOL(page_pool_ethtool_stats_get_strings);

int page_pool_ethtool_stats_get_count(void)
{
	return ARRAY_SIZE(pp_stats);
}
EXPORT_SYMBOL(page_pool_ethtool_stats_get_count);

u64 *page_pool_ethtool_stats_get(u64 *data, void *stats)
{
	struct page_pool_stats *pool_stats = stats;

	*data++ = pool_stats->alloc_stats.fast;
	*data++ = pool_stats->alloc_stats.slow;
	*data++ = pool_stats->alloc_stats.slow_high_order;
	*data++ = pool_stats->alloc_stats.empty;
	*data++ = pool_stats->alloc_stats.refill;
	*data++ = pool_stats->alloc_stats.waive;
	*data++ = pool_stats->recycle_stats.cached;
	*data++ = pool_stats->recycle_stats.cache_full;
	*data++ = pool_stats->recycle_stats.ring;
	*data++ = pool_stats->recycle_stats.ring_full;
	*data++ = pool_stats->recycle_stats.released_refcnt;

	return data;
}
EXPORT_SYMBOL(page_pool_ethtool_stats_get);

#else
#define alloc_stat_inc(pool, __stat)
#define recycle_stat_inc(pool, __stat)
#define recycle_stat_add(pool, __stat, val)
#endif

static bool page_pool_producer_lock(struct page_pool *pool)
	__acquires(&pool->ring.producer_lock)
{
	bool in_softirq = in_softirq();

	if (in_softirq)
		spin_lock(&pool->ring.producer_lock);
	else
		spin_lock_bh(&pool->ring.producer_lock);

	return in_softirq;
}

static void page_pool_producer_unlock(struct page_pool *pool,
				      bool in_softirq)
	__releases(&pool->ring.producer_lock)
{
	if (in_softirq)
		spin_unlock(&pool->ring.producer_lock);
	else
		spin_unlock_bh(&pool->ring.producer_lock);
}

static int page_pool_init(struct page_pool *pool,
			  const struct page_pool_params *params)
{
	unsigned int ring_qsize = 1024; /* Default */

	memcpy(&pool->p, params, sizeof(pool->p));

	/* Validate only known flags were used */
	if (pool->p.flags & ~(PP_FLAG_ALL))
		return -EINVAL;

	if (pool->p.pool_size)
		ring_qsize = pool->p.pool_size;

	/* Sanity limit mem that can be pinned down */
	if (ring_qsize > 32768)
		return -E2BIG;

	/* DMA direction is either DMA_FROM_DEVICE or DMA_BIDIRECTIONAL.
	 * DMA_BIDIRECTIONAL allows the page to also be used for DMA
	 * sending, which is the XDP_TX use-case.
	 */
	if (pool->p.flags & PP_FLAG_DMA_MAP) {
		if ((pool->p.dma_dir != DMA_FROM_DEVICE) &&
		    (pool->p.dma_dir != DMA_BIDIRECTIONAL))
			return -EINVAL;
	}

	if (pool->p.flags & PP_FLAG_DMA_SYNC_DEV) {
		/* In order to request DMA-sync-for-device the page
		 * needs to be mapped
		 */
		if (!(pool->p.flags & PP_FLAG_DMA_MAP))
			return -EINVAL;

		if (!pool->p.max_len)
			return -EINVAL;

		/* pool->p.offset has to be set according to the address
		 * offset used by the DMA engine to start copying rx data
		 */
	}

	if (PAGE_POOL_DMA_USE_PP_FRAG_COUNT &&
	    pool->p.flags & PP_FLAG_PAGE_FRAG)
		return -EINVAL;

#ifdef CONFIG_PAGE_POOL_STATS
	pool->recycle_stats = alloc_percpu(struct page_pool_recycle_stats);
	if (!pool->recycle_stats)
		return -ENOMEM;
#endif

	if (ptr_ring_init(&pool->ring, ring_qsize, GFP_KERNEL) < 0) {
#ifdef CONFIG_PAGE_POOL_STATS
		free_percpu(pool->recycle_stats);
#endif
		return -ENOMEM;
	}

	atomic_set(&pool->pages_state_release_cnt, 0);

	/* The driver calling page_pool_create() also calls page_pool_destroy() */
	refcount_set(&pool->user_cnt, 1);

	if (pool->p.flags & PP_FLAG_DMA_MAP)
		get_device(pool->p.dev);

	return 0;
}

/**
 * page_pool_create() - create a page pool.
 * @params: parameters, see struct page_pool_params
 */
struct page_pool *page_pool_create(const struct page_pool_params *params)
{
	struct page_pool *pool;
	int err;

	pool = kzalloc_node(sizeof(*pool), GFP_KERNEL, params->nid);
	if (!pool)
		return ERR_PTR(-ENOMEM);

	err = page_pool_init(pool, params);
	if (err < 0) {
		pr_warn("%s() gave up with errno %d\n", __func__, err);
		kfree(pool);
		return ERR_PTR(err);
	}

	return pool;
}
EXPORT_SYMBOL(page_pool_create);
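
/* Editor's note: a hedged setup sketch (not part of this file) showing how a
 * driver might create and later tear down a pool with the API above. The
 * "rxq" structure and FOO_RX_RING_SIZE are hypothetical; the params fields
 * and error handling follow the checks in page_pool_init() above.
 *
 *	struct page_pool_params pp_params = {
 *		.flags		= PP_FLAG_DMA_MAP | PP_FLAG_DMA_SYNC_DEV,
 *		.order		= 0,
 *		.pool_size	= FOO_RX_RING_SIZE,
 *		.nid		= NUMA_NO_NODE,
 *		.dev		= &pdev->dev,
 *		.dma_dir	= DMA_FROM_DEVICE,
 *		.max_len	= PAGE_SIZE,
 *		.offset		= 0,
 *	};
 *
 *	rxq->page_pool = page_pool_create(&pp_params);
 *	if (IS_ERR(rxq->page_pool)) {
 *		err = PTR_ERR(rxq->page_pool);
 *		rxq->page_pool = NULL;
 *		return err;
 *	}
 *
 *	... and on teardown, once all pages have been returned:
 *
 *	page_pool_destroy(rxq->page_pool);
 */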

static void page_pool_return_page(struct page_pool *pool, struct page *page);

noinline
static struct page *page_pool_refill_alloc_cache(struct page_pool *pool)
{
	struct ptr_ring *r = &pool->ring;
	struct page *page;
	int pref_nid; /* preferred NUMA node */

	/* Quicker fallback, avoid locks when ring is empty */
	if (__ptr_ring_empty(r)) {
		alloc_stat_inc(pool, empty);
		return NULL;
	}

	/* Softirq guarantees that the CPU, and thus the NUMA node, is
	 * stable. This assumes the CPU refilling the driver RX-ring also
	 * runs the RX-NAPI.
	 */
#ifdef CONFIG_NUMA
	pref_nid = (pool->p.nid == NUMA_NO_NODE) ? numa_mem_id() : pool->p.nid;
#else
	/* Ignore pool->p.nid setting if !CONFIG_NUMA, helps compiler */
	pref_nid = numa_mem_id(); /* will be zero like page_to_nid() */
#endif

	/* Refill alloc array, but only if NUMA match */
	do {
		page = __ptr_ring_consume(r);
		if (unlikely(!page))
			break;

		if (likely(page_to_nid(page) == pref_nid)) {
			pool->alloc.cache[pool->alloc.count++] = page;
		} else {
			/* NUMA mismatch;
			 * (1) release 1 page to page-allocator and
			 * (2) break out to fall through to alloc_pages_node.
			 * This limits stress on the page buddy allocator.
			 */
			page_pool_return_page(pool, page);
			alloc_stat_inc(pool, waive);
			page = NULL;
			break;
		}
	} while (pool->alloc.count < PP_ALLOC_CACHE_REFILL);

	/* Return last page */
	if (likely(pool->alloc.count > 0)) {
		page = pool->alloc.cache[--pool->alloc.count];
		alloc_stat_inc(pool, refill);
	}

	return page;
}

/* fast path */
static struct page *__page_pool_get_cached(struct page_pool *pool)
{
	struct page *page;

	/* Caller MUST guarantee safe non-concurrent access, e.g. softirq */
	if (likely(pool->alloc.count)) {
		/* Fast-path */
		page = pool->alloc.cache[--pool->alloc.count];
		alloc_stat_inc(pool, fast);
	} else {
		page = page_pool_refill_alloc_cache(pool);
	}

	return page;
}

static void page_pool_dma_sync_for_device(struct page_pool *pool,
					  struct page *page,
					  unsigned int dma_sync_size)
{
	dma_addr_t dma_addr = page_pool_get_dma_addr(page);

	dma_sync_size = min(dma_sync_size, pool->p.max_len);
	dma_sync_single_range_for_device(pool->p.dev, dma_addr,
					 pool->p.offset, dma_sync_size,
					 pool->p.dma_dir);
}

static bool page_pool_dma_map(struct page_pool *pool, struct page *page)
{
	dma_addr_t dma;

	/* Setup DMA mapping: use 'struct page' area for storing DMA-addr
	 * since dma_addr_t can be either 32 or 64 bits and does not always fit
	 * into page private data (i.e. a 32-bit CPU with 64-bit DMA caps).
	 * This mapping is kept for the lifetime of the page, until it leaves
	 * the pool.
	 */
	dma = dma_map_page_attrs(pool->p.dev, page, 0,
				 (PAGE_SIZE << pool->p.order),
				 pool->p.dma_dir, DMA_ATTR_SKIP_CPU_SYNC |
						  DMA_ATTR_WEAK_ORDERING);
	if (dma_mapping_error(pool->p.dev, dma))
		return false;

	page_pool_set_dma_addr(page, dma);

	if (pool->p.flags & PP_FLAG_DMA_SYNC_DEV)
		page_pool_dma_sync_for_device(pool, page, pool->p.max_len);

	return true;
}

static void page_pool_set_pp_info(struct page_pool *pool,
				  struct page *page)
{
	page->pp = pool;
	page->pp_magic |= PP_SIGNATURE;
	if (pool->p.init_callback)
		pool->p.init_callback(page, pool->p.init_arg);
}

static void page_pool_clear_pp_info(struct page *page)
{
	page->pp_magic = 0;
	page->pp = NULL;
}

static struct page *__page_pool_alloc_page_order(struct page_pool *pool,
						 gfp_t gfp)
{
	struct page *page;

	gfp |= __GFP_COMP;
	page = alloc_pages_node(pool->p.nid, gfp, pool->p.order);
	if (unlikely(!page))
		return NULL;

	if ((pool->p.flags & PP_FLAG_DMA_MAP) &&
	    unlikely(!page_pool_dma_map(pool, page))) {
		put_page(page);
		return NULL;
	}

	alloc_stat_inc(pool, slow_high_order);
	page_pool_set_pp_info(pool, page);

	/* Track how many pages are held 'in-flight' */
	pool->pages_state_hold_cnt++;
	trace_page_pool_state_hold(pool, page, pool->pages_state_hold_cnt);
	return page;
}

/* slow path */
noinline
static struct page *__page_pool_alloc_pages_slow(struct page_pool *pool,
						 gfp_t gfp)
{
	const int bulk = PP_ALLOC_CACHE_REFILL;
	unsigned int pp_flags = pool->p.flags;
	unsigned int pp_order = pool->p.order;
	struct page *page;
	int i, nr_pages;

	/* Don't support bulk alloc for high-order pages */
	if (unlikely(pp_order))
		return __page_pool_alloc_page_order(pool, gfp);

	/* Unnecessary as alloc cache is empty, but guarantees zero count */
	if (unlikely(pool->alloc.count > 0))
		return pool->alloc.cache[--pool->alloc.count];

	/* Mark empty alloc.cache slots "empty" for alloc_pages_bulk_array */
	memset(&pool->alloc.cache, 0, sizeof(void *) * bulk);

	nr_pages = alloc_pages_bulk_array_node(gfp, pool->p.nid, bulk,
					       pool->alloc.cache);
	if (unlikely(!nr_pages))
		return NULL;

	/* Pages have been filled into the alloc.cache array, but the count is
	 * zero and the page elements have not yet been DMA mapped (when DMA
	 * mapping is required).
	 */
	for (i = 0; i < nr_pages; i++) {
		page = pool->alloc.cache[i];
		if ((pp_flags & PP_FLAG_DMA_MAP) &&
		    unlikely(!page_pool_dma_map(pool, page))) {
			put_page(page);
			continue;
		}

		page_pool_set_pp_info(pool, page);
		pool->alloc.cache[pool->alloc.count++] = page;
		/* Track how many pages are held 'in-flight' */
		pool->pages_state_hold_cnt++;
		trace_page_pool_state_hold(pool, page,
					   pool->pages_state_hold_cnt);
	}

	/* Return last page */
	if (likely(pool->alloc.count > 0)) {
		page = pool->alloc.cache[--pool->alloc.count];
		alloc_stat_inc(pool, slow);
	} else {
		page = NULL;
	}

	/* A page that was just allocated should/must have refcnt 1. */
	return page;
}

/* For using page_pool to replace alloc_pages() API calls, while providing a
 * synchronization guarantee for the allocation side.
 */
struct page *page_pool_alloc_pages(struct page_pool *pool, gfp_t gfp)
{
	struct page *page;

	/* Fast-path: Get a page from cache */
	page = __page_pool_get_cached(pool);
	if (page)
		return page;

	/* Slow-path: cache empty, do real allocation */
	page = __page_pool_alloc_pages_slow(pool, gfp);
	return page;
}
EXPORT_SYMBOL(page_pool_alloc_pages);

/* Calculate distance between two u32 values, valid if distance is below 2^(31)
 * https://en.wikipedia.org/wiki/Serial_number_arithmetic#General_Solution
 */
#define _distance(a, b)	(s32)((a) - (b))

static s32 page_pool_inflight(struct page_pool *pool)
{
	u32 release_cnt = atomic_read(&pool->pages_state_release_cnt);
	u32 hold_cnt = READ_ONCE(pool->pages_state_hold_cnt);
	s32 inflight;

	inflight = _distance(hold_cnt, release_cnt);

	trace_page_pool_release(pool, inflight, hold_cnt, release_cnt);
	WARN(inflight < 0, "Negative(%d) inflight packet-pages", inflight);

	return inflight;
}

/* Disconnects a page (from a page_pool). API users can have a need
 * to disconnect a page (from a page_pool), to allow it to be used as
 * a regular page (that will eventually be returned to the normal
 * page-allocator via put_page).
 */
static void page_pool_return_page(struct page_pool *pool, struct page *page)
{
	dma_addr_t dma;
	int count;

	if (!(pool->p.flags & PP_FLAG_DMA_MAP))
		/* Always account for inflight pages, even if we didn't
		 * map them
		 */
		goto skip_dma_unmap;

	dma = page_pool_get_dma_addr(page);

	/* When page is unmapped, it cannot be returned to our pool */
	dma_unmap_page_attrs(pool->p.dev, dma,
			     PAGE_SIZE << pool->p.order, pool->p.dma_dir,
			     DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_WEAK_ORDERING);
	page_pool_set_dma_addr(page, 0);
skip_dma_unmap:
	page_pool_clear_pp_info(page);

	/* This may be the last page returned, releasing the pool, so
	 * it is not safe to reference pool afterwards.
	 */
	count = atomic_inc_return_relaxed(&pool->pages_state_release_cnt);
	trace_page_pool_state_release(pool, page, count);

	put_page(page);
	/* An optimization would be to call __free_pages(page, pool->p.order)
	 * knowing page is not part of page-cache (thus avoiding a
	 * __page_cache_release() call).
	 */
}

static bool page_pool_recycle_in_ring(struct page_pool *pool, struct page *page)
{
	int ret;
	/* BH protection not needed if current is softirq */
	if (in_softirq())
		ret = ptr_ring_produce(&pool->ring, page);
	else
		ret = ptr_ring_produce_bh(&pool->ring, page);

	if (!ret) {
		recycle_stat_inc(pool, ring);
		return true;
	}

	return false;
}

/* Only allow direct recycling in special circumstances, into the
 * alloc side cache. E.g. during RX-NAPI processing for XDP_DROP use-case.
 *
 * Caller must provide appropriate safe context.
 */
static bool page_pool_recycle_in_cache(struct page *page,
				       struct page_pool *pool)
{
	if (unlikely(pool->alloc.count == PP_ALLOC_CACHE_SIZE)) {
		recycle_stat_inc(pool, cache_full);
		return false;
	}

	/* Caller MUST have verified/know (page_ref_count(page) == 1) */
	pool->alloc.cache[pool->alloc.count++] = page;
	recycle_stat_inc(pool, cached);
	return true;
}

/* If the page refcnt == 1, this will try to recycle the page.
 * If PP_FLAG_DMA_SYNC_DEV is set, we'll try to sync the DMA area for
 * the configured size min(dma_sync_size, pool->max_len).
 * If the page refcnt != 1, then the page will be returned to the memory
 * subsystem.
 */
static __always_inline struct page *
__page_pool_put_page(struct page_pool *pool, struct page *page,
		     unsigned int dma_sync_size, bool allow_direct)
{
	lockdep_assert_no_hardirq();

	/* This allocator is optimized for the XDP mode that uses
	 * one-frame-per-page, but has fallbacks that act like the
	 * regular page allocator APIs.
	 *
	 * refcnt == 1 means page_pool owns page, and can recycle it.
	 *
	 * A page is NOT reusable when it was allocated while the system
	 * was under memory pressure (page_is_pfmemalloc()).
	 */
	if (likely(page_ref_count(page) == 1 && !page_is_pfmemalloc(page))) {
		/* Read barrier done in page_ref_count / READ_ONCE */

		if (pool->p.flags & PP_FLAG_DMA_SYNC_DEV)
			page_pool_dma_sync_for_device(pool, page,
						      dma_sync_size);

		if (allow_direct && in_softirq() &&
		    page_pool_recycle_in_cache(page, pool))
			return NULL;

		/* Page found as candidate for recycling */
		return page;
	}
	/* Fallback/non-XDP mode: API user has an elevated refcnt.
	 *
	 * Many drivers split up the page into fragments, and some
	 * want to keep doing this to save memory and do refcnt based
	 * recycling. Support this use case too, to ease drivers
	 * switching between XDP/non-XDP.
	 *
	 * In case page_pool maintains the DMA mapping, the API user must
	 * call page_pool_put_page() once. In this elevated refcnt case,
	 * the DMA is unmapped/released, as the driver is likely doing
	 * refcnt based recycle tricks, meaning another process will be
	 * invoking put_page().
	 */
	recycle_stat_inc(pool, released_refcnt);
	page_pool_return_page(pool, page);

	return NULL;
}

void page_pool_put_defragged_page(struct page_pool *pool, struct page *page,
				  unsigned int dma_sync_size, bool allow_direct)
{
	page = __page_pool_put_page(pool, page, dma_sync_size, allow_direct);
	if (page && !page_pool_recycle_in_ring(pool, page)) {
		/* Ring full, fallback to freeing pages */
		recycle_stat_inc(pool, ring_full);
		page_pool_return_page(pool, page);
	}
}
EXPORT_SYMBOL(page_pool_put_defragged_page);
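
/* Editor's note: a hedged sketch (not part of this file) of the typical
 * recycle call from a driver's NAPI poll loop. "rxq" and the XDP verdict
 * handling are illustrative; page_pool_put_full_page() is the helper from
 * net/page_pool/helpers.h that (for the last frag user) ends up in
 * page_pool_put_defragged_page() above with dma_sync_size == -1.
 *
 *	// Inside NAPI poll (softirq context), so direct recycling is allowed:
 *	if (verdict == XDP_DROP) {
 *		page_pool_put_full_page(rxq->page_pool, page, true);
 *		continue;
 *	}
 *
 * Pages that end up in an skb are instead marked with
 * skb_mark_for_recycle(skb), letting the skb free path recycle them.
 */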

/**
 * page_pool_put_page_bulk() - release references on multiple pages
 * @pool:	pool from which pages were allocated
 * @data:	array holding page pointers
 * @count:	number of pages in @data
 *
 * Tries to refill a number of pages into the ptr_ring cache holding the
 * ptr_ring producer lock. If the ptr_ring is full, page_pool_put_page_bulk()
 * will release leftover pages to the page allocator.
 * page_pool_put_page_bulk() is suitable to be run inside the driver NAPI tx
 * completion loop for the XDP_REDIRECT use case.
 *
 * Please note the caller must not use the data area after running
 * page_pool_put_page_bulk(), as this function overwrites it.
 */
void page_pool_put_page_bulk(struct page_pool *pool, void **data,
			     int count)
{
	int i, bulk_len = 0;
	bool in_softirq;

	for (i = 0; i < count; i++) {
		struct page *page = virt_to_head_page(data[i]);

		/* It is not the last user for the page frag case */
		if (!page_pool_is_last_frag(pool, page))
			continue;

		page = __page_pool_put_page(pool, page, -1, false);
		/* Approved for bulk recycling in ptr_ring cache */
		if (page)
			data[bulk_len++] = page;
	}

	if (unlikely(!bulk_len))
		return;

	/* Bulk producer into ptr_ring page_pool cache */
	in_softirq = page_pool_producer_lock(pool);
	for (i = 0; i < bulk_len; i++) {
		if (__ptr_ring_produce(&pool->ring, data[i])) {
			/* ring full */
			recycle_stat_inc(pool, ring_full);
			break;
		}
	}
	recycle_stat_add(pool, ring, i);
	page_pool_producer_unlock(pool, in_softirq);

	/* Hopefully all pages were returned into the ptr_ring */
	if (likely(i == bulk_len))
		return;

	/* ptr_ring cache full, free remaining pages outside producer lock
	 * since put_page() with refcnt == 1 can be an expensive operation
	 */
	for (; i < bulk_len; i++)
		page_pool_return_page(pool, data[i]);
}
EXPORT_SYMBOL(page_pool_put_page_bulk);
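
/* Editor's note: a hedged usage sketch (not part of this file). In practice
 * most drivers reach this function indirectly via xdp_return_frame_bulk();
 * the direct call below, with the hypothetical Tx-completion helper and
 * FOO_BULK_SIZE, only illustrates the contract: @data holds virtual addresses
 * inside pool-owned pages (see virt_to_head_page() above) and is clobbered.
 *
 *	void *bulk[FOO_BULK_SIZE];
 *	int n = 0;
 *
 *	while ((xdpf = foo_next_completed_frame(txq))) {
 *		bulk[n++] = xdpf->data;
 *		if (n == FOO_BULK_SIZE) {
 *			page_pool_put_page_bulk(txq->page_pool, bulk, n);
 *			n = 0;
 *		}
 *	}
 *	if (n)
 *		page_pool_put_page_bulk(txq->page_pool, bulk, n);
 */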

static struct page *page_pool_drain_frag(struct page_pool *pool,
					 struct page *page)
{
	long drain_count = BIAS_MAX - pool->frag_users;

	/* Some user is still using the page frag */
	if (likely(page_pool_defrag_page(page, drain_count)))
		return NULL;

	if (page_ref_count(page) == 1 && !page_is_pfmemalloc(page)) {
		if (pool->p.flags & PP_FLAG_DMA_SYNC_DEV)
			page_pool_dma_sync_for_device(pool, page, -1);

		return page;
	}

	page_pool_return_page(pool, page);
	return NULL;
}

static void page_pool_free_frag(struct page_pool *pool)
{
	long drain_count = BIAS_MAX - pool->frag_users;
	struct page *page = pool->frag_page;

	pool->frag_page = NULL;

	if (!page || page_pool_defrag_page(page, drain_count))
		return;

	page_pool_return_page(pool, page);
}

struct page *page_pool_alloc_frag(struct page_pool *pool,
				  unsigned int *offset,
				  unsigned int size, gfp_t gfp)
{
	unsigned int max_size = PAGE_SIZE << pool->p.order;
	struct page *page = pool->frag_page;

	if (WARN_ON(!(pool->p.flags & PP_FLAG_PAGE_FRAG) ||
		    size > max_size))
		return NULL;

	size = ALIGN(size, dma_get_cache_alignment());
	*offset = pool->frag_offset;

	if (page && *offset + size > max_size) {
		page = page_pool_drain_frag(pool, page);
		if (page) {
			alloc_stat_inc(pool, fast);
			goto frag_reset;
		}
	}

	if (!page) {
		page = page_pool_alloc_pages(pool, gfp);
		if (unlikely(!page)) {
			pool->frag_page = NULL;
			return NULL;
		}

		pool->frag_page = page;

frag_reset:
		pool->frag_users = 1;
		*offset = 0;
		pool->frag_offset = size;
		page_pool_fragment_page(page, BIAS_MAX);
		return page;
	}

	pool->frag_users++;
	pool->frag_offset = *offset + size;
	alloc_stat_inc(pool, fast);
	return page;
}
EXPORT_SYMBOL(page_pool_alloc_frag);
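
/* Editor's note: a hedged sketch (not part of this file) of the frag API
 * above, for a pool created with PP_FLAG_PAGE_FRAG. FOO_RX_BUF_SIZE and
 * "rxq" are illustrative; @offset is filled in by page_pool_alloc_frag()
 * and the returned page may be shared with other frags of the same page.
 *
 *	unsigned int offset;
 *	struct page *page;
 *	dma_addr_t dma;
 *
 *	page = page_pool_alloc_frag(rxq->page_pool, &offset,
 *				    FOO_RX_BUF_SIZE, GFP_ATOMIC);
 *	if (unlikely(!page))
 *		return -ENOMEM;
 *	dma = page_pool_get_dma_addr(page) + offset;
 *	// post "dma" to the RX descriptor ring ...
 */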

static void page_pool_empty_ring(struct page_pool *pool)
{
	struct page *page;

	/* Empty recycle ring */
	while ((page = ptr_ring_consume_bh(&pool->ring))) {
		/* Verify the refcnt invariant of cached pages */
		if (!(page_ref_count(page) == 1))
			pr_crit("%s() page_pool refcnt %d violation\n",
				__func__, page_ref_count(page));

		page_pool_return_page(pool, page);
	}
}

static void page_pool_free(struct page_pool *pool)
{
	if (pool->disconnect)
		pool->disconnect(pool);

	ptr_ring_cleanup(&pool->ring, NULL);

	if (pool->p.flags & PP_FLAG_DMA_MAP)
		put_device(pool->p.dev);

#ifdef CONFIG_PAGE_POOL_STATS
	free_percpu(pool->recycle_stats);
#endif
	kfree(pool);
}

static void page_pool_empty_alloc_cache_once(struct page_pool *pool)
{
	struct page *page;

	if (pool->destroy_cnt)
		return;

	/* Empty alloc cache, assume caller made sure this is
	 * no longer in use, and page_pool_alloc_pages() cannot be
	 * called concurrently.
	 */
	while (pool->alloc.count) {
		page = pool->alloc.cache[--pool->alloc.count];
		page_pool_return_page(pool, page);
	}
}

static void page_pool_scrub(struct page_pool *pool)
{
	page_pool_empty_alloc_cache_once(pool);
	pool->destroy_cnt++;

	/* No more consumers should exist, but producers could still
	 * be in-flight.
	 */
	page_pool_empty_ring(pool);
}

static int page_pool_release(struct page_pool *pool)
{
	int inflight;

	page_pool_scrub(pool);
	inflight = page_pool_inflight(pool);
	if (!inflight)
		page_pool_free(pool);

	return inflight;
}

static void page_pool_release_retry(struct work_struct *wq)
{
	struct delayed_work *dwq = to_delayed_work(wq);
	struct page_pool *pool = container_of(dwq, typeof(*pool), release_dw);
	int inflight;

	inflight = page_pool_release(pool);
	if (!inflight)
		return;

	/* Periodic warning */
	if (time_after_eq(jiffies, pool->defer_warn)) {
		int sec = (s32)((u32)jiffies - (u32)pool->defer_start) / HZ;

		pr_warn("%s() stalled pool shutdown %d inflight %d sec\n",
			__func__, inflight, sec);
		pool->defer_warn = jiffies + DEFER_WARN_INTERVAL;
	}

	/* Still not ready to be disconnected, retry later */
	schedule_delayed_work(&pool->release_dw, DEFER_TIME);
}

void page_pool_use_xdp_mem(struct page_pool *pool, void (*disconnect)(void *),
			   struct xdp_mem_info *mem)
{
	refcount_inc(&pool->user_cnt);
	pool->disconnect = disconnect;
	pool->xdp_mem_id = mem->id;
}

void page_pool_unlink_napi(struct page_pool *pool)
{
	if (!pool->p.napi)
		return;

	/* To avoid races with recycling and additional barriers, make sure
	 * pool and NAPI are unlinked when NAPI is disabled.
	 */
	WARN_ON(!test_bit(NAPI_STATE_SCHED, &pool->p.napi->state) ||
		READ_ONCE(pool->p.napi->list_owner) != -1);

	WRITE_ONCE(pool->p.napi, NULL);
}
EXPORT_SYMBOL(page_pool_unlink_napi);

void page_pool_destroy(struct page_pool *pool)
{
	if (!pool)
		return;

	if (!page_pool_put(pool))
		return;

	page_pool_unlink_napi(pool);
	page_pool_free_frag(pool);

	if (!page_pool_release(pool))
		return;

	pool->defer_start = jiffies;
	pool->defer_warn = jiffies + DEFER_WARN_INTERVAL;

	INIT_DELAYED_WORK(&pool->release_dw, page_pool_release_retry);
	schedule_delayed_work(&pool->release_dw, DEFER_TIME);
}
EXPORT_SYMBOL(page_pool_destroy);

/* Caller must provide appropriate safe context, e.g. NAPI. */
void page_pool_update_nid(struct page_pool *pool, int new_nid)
{
	struct page *page;

	trace_page_pool_update_nid(pool, new_nid);
	pool->p.nid = new_nid;

	/* Flush pool alloc cache, as refill will check NUMA node */
	while (pool->alloc.count) {
		page = pool->alloc.cache[--pool->alloc.count];
		page_pool_return_page(pool, page);
	}
}
EXPORT_SYMBOL(page_pool_update_nid);