/* SPDX-License-Identifier: GPL-2.0
 *
 * page_pool.c
 *	Author:	Jesper Dangaard Brouer <netoptimizer@brouer.com>
 *	Copyright (C) 2016 Red Hat, Inc.
 */
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/slab.h>

#include <net/page_pool.h>
#include <linux/dma-direction.h>
#include <linux/dma-mapping.h>
#include <linux/page-flags.h>
#include <linux/mm.h> /* for __put_page() */

static int page_pool_init(struct page_pool *pool,
			  const struct page_pool_params *params)
{
	unsigned int ring_qsize = 1024; /* Default */

	memcpy(&pool->p, params, sizeof(pool->p));

	/* Validate only known flags were used */
	if (pool->p.flags & ~(PP_FLAG_ALL))
		return -EINVAL;

	if (pool->p.pool_size)
		ring_qsize = pool->p.pool_size;

	/* Sanity limit mem that can be pinned down */
	if (ring_qsize > 32768)
		return -E2BIG;

	/* DMA direction is either DMA_FROM_DEVICE or DMA_BIDIRECTIONAL.
	 * DMA_BIDIRECTIONAL allows the page to also be used for DMA
	 * sending, which is the XDP_TX use-case.
	 */
	if ((pool->p.dma_dir != DMA_FROM_DEVICE) &&
	    (pool->p.dma_dir != DMA_BIDIRECTIONAL))
		return -EINVAL;

	if (ptr_ring_init(&pool->ring, ring_qsize, GFP_KERNEL) < 0)
		return -ENOMEM;

	return 0;
}

struct page_pool *page_pool_create(const struct page_pool_params *params)
{
	struct page_pool *pool;
	int err = 0;

	pool = kzalloc_node(sizeof(*pool), GFP_KERNEL, params->nid);
	if (!pool)
		return ERR_PTR(-ENOMEM);

	err = page_pool_init(pool, params);
	if (err < 0) {
		pr_warn("%s() gave up with errno %d\n", __func__, err);
		kfree(pool);
		return ERR_PTR(err);
	}
	return pool;
}
EXPORT_SYMBOL(page_pool_create);
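
/* Usage sketch (illustrative only, not part of this file): a hypothetical
 * driver creating one pool per RX queue.  The identifiers mydrv_rxq and
 * netdev->dev.parent are assumptions for the example; the parameter fields
 * are the ones validated in page_pool_init() above.
 *
 *	struct page_pool_params pp_params = {
 *		.order		= 0,
 *		.flags		= PP_FLAG_DMA_MAP,
 *		.pool_size	= 256,
 *		.nid		= NUMA_NO_NODE,
 *		.dev		= netdev->dev.parent,
 *		.dma_dir	= DMA_FROM_DEVICE,
 *	};
 *	struct page_pool *pp = page_pool_create(&pp_params);
 *
 *	if (IS_ERR(pp))
 *		return PTR_ERR(pp);
 *	mydrv_rxq->page_pool = pp;
 */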
/* fast path */
static struct page *__page_pool_get_cached(struct page_pool *pool)
{
	struct ptr_ring *r = &pool->ring;
	struct page *page;

	/* Quicker fallback, avoid locks when ring is empty */
	if (__ptr_ring_empty(r))
		return NULL;

	/* Test for safe-context, caller should provide this guarantee */
	if (likely(in_serving_softirq())) {
		if (likely(pool->alloc.count)) {
			/* Fast-path */
			page = pool->alloc.cache[--pool->alloc.count];
			return page;
		}
		/* Slower-path: Alloc array empty, time to refill
		 *
		 * Open-coded bulk ptr_ring consumer.
		 *
		 * Discussion: the ring consumer lock is not really
		 * needed due to the softirq/NAPI protection, but we
		 * later need the ability to reclaim pages on the
		 * ring.  Thus, keep the locks.
		 */
		spin_lock(&r->consumer_lock);
		while ((page = __ptr_ring_consume(r))) {
			if (pool->alloc.count == PP_ALLOC_CACHE_REFILL)
				break;
			pool->alloc.cache[pool->alloc.count++] = page;
		}
		spin_unlock(&r->consumer_lock);
		return page;
	}

	/* Slow-path: Get page from locked ring queue */
	page = ptr_ring_consume(&pool->ring);
	return page;
}

/* slow path */
noinline
static struct page *__page_pool_alloc_pages_slow(struct page_pool *pool,
						 gfp_t _gfp)
{
	struct page *page;
	gfp_t gfp = _gfp;
	dma_addr_t dma;

	/* We could always set __GFP_COMP, and avoid this branch, as
	 * prep_new_page() can handle order-0 with __GFP_COMP.
	 */
	if (pool->p.order)
		gfp |= __GFP_COMP;

	/* FUTURE development:
	 *
	 * The current slow-path essentially falls back to single page
	 * allocations, which doesn't improve performance.  This code
	 * needs bulk allocation support from the page allocator.
	 */

	/* Cache was empty, do real allocation */
	page = alloc_pages_node(pool->p.nid, gfp, pool->p.order);
	if (!page)
		return NULL;

	if (!(pool->p.flags & PP_FLAG_DMA_MAP))
		goto skip_dma_map;

	/* Setup DMA mapping: use the 'struct page' area for storing the
	 * DMA address, since dma_addr_t can be either 32 or 64 bits and
	 * does not always fit into page private data (i.e. a 32-bit CPU
	 * with 64-bit DMA capabilities).
	 * This mapping is kept for the lifetime of the page, until it
	 * leaves the pool.
	 */
	dma = dma_map_page_attrs(pool->p.dev, page, 0,
				 (PAGE_SIZE << pool->p.order),
				 pool->p.dma_dir, DMA_ATTR_SKIP_CPU_SYNC);
	if (dma_mapping_error(pool->p.dev, dma)) {
		put_page(page);
		return NULL;
	}
	page->dma_addr = dma;

skip_dma_map:
	/* A page that was just allocated should/must have refcnt 1. */
	return page;
}

/* Meant to replace alloc_pages() API calls, but the caller must provide
 * the synchronization guarantee for the allocation side.
 */
struct page *page_pool_alloc_pages(struct page_pool *pool, gfp_t gfp)
{
	struct page *page;

	/* Fast-path: Get a page from cache */
	page = __page_pool_get_cached(pool);
	if (page)
		return page;

	/* Slow-path: cache empty, do real allocation */
	page = __page_pool_alloc_pages_slow(pool, gfp);
	return page;
}
EXPORT_SYMBOL(page_pool_alloc_pages);
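
/* Usage sketch (illustrative only, not part of this file): RX ring refill in
 * a hypothetical driver.  With PP_FLAG_DMA_MAP set, the DMA address stored in
 * page->dma_addr above can be written directly into the RX descriptor.  The
 * identifiers rxq and rx_desc are assumptions for the example.
 *
 *	struct page *page;
 *
 *	page = page_pool_alloc_pages(rxq->page_pool, GFP_ATOMIC);
 *	if (unlikely(!page))
 *		return -ENOMEM;
 *	rx_desc->addr = page->dma_addr;
 */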
/* Cleanup page_pool state from page */
static void __page_pool_clean_page(struct page_pool *pool,
				   struct page *page)
{
	dma_addr_t dma;

	if (!(pool->p.flags & PP_FLAG_DMA_MAP))
		return;

	dma = page->dma_addr;
	/* DMA unmap */
	dma_unmap_page_attrs(pool->p.dev, dma,
			     PAGE_SIZE << pool->p.order, pool->p.dma_dir,
			     DMA_ATTR_SKIP_CPU_SYNC);
	page->dma_addr = 0;
}

/* Return a page to the page allocator, cleaning up our state */
static void __page_pool_return_page(struct page_pool *pool, struct page *page)
{
	__page_pool_clean_page(pool, page);
	put_page(page);
	/* An optimization would be to call __free_pages(page, pool->p.order)
	 * knowing the page is not part of the page-cache (thus avoiding a
	 * __page_cache_release() call).
	 */
}

static bool __page_pool_recycle_into_ring(struct page_pool *pool,
					  struct page *page)
{
	int ret;

	/* BH protection not needed if current is serving softirq */
	if (in_serving_softirq())
		ret = ptr_ring_produce(&pool->ring, page);
	else
		ret = ptr_ring_produce_bh(&pool->ring, page);

	return (ret == 0) ? true : false;
}

/* Only allow direct recycling in special circumstances, into the
 * alloc side cache.  E.g. during RX-NAPI processing for the XDP_DROP
 * use-case.
 *
 * Caller must provide appropriate safe context.
 */
static bool __page_pool_recycle_direct(struct page *page,
				       struct page_pool *pool)
{
	if (unlikely(pool->alloc.count == PP_ALLOC_CACHE_SIZE))
		return false;

	/* Caller MUST have verified/know that (page_ref_count(page) == 1) */
	pool->alloc.cache[pool->alloc.count++] = page;
	return true;
}

void __page_pool_put_page(struct page_pool *pool,
			  struct page *page, bool allow_direct)
{
	/* This allocator is optimized for the XDP mode that uses
	 * one-frame-per-page, but has fallbacks that act like the
	 * regular page allocator APIs.
	 *
	 * refcnt == 1 means page_pool owns the page, and can recycle it.
	 */
	if (likely(page_ref_count(page) == 1)) {
		/* Read barrier done in page_ref_count / READ_ONCE */

		if (allow_direct && in_serving_softirq())
			if (__page_pool_recycle_direct(page, pool))
				return;

		if (!__page_pool_recycle_into_ring(pool, page)) {
			/* Cache full, fallback to freeing pages */
			__page_pool_return_page(pool, page);
		}
		return;
	}
	/* Fallback/non-XDP mode: the API user has an elevated refcnt.
	 *
	 * Many drivers split up the page into fragments, and some
	 * want to keep doing this to save memory and do refcnt based
	 * recycling.  Support this use case too, to ease drivers
	 * switching between XDP/non-XDP.
	 *
	 * In case page_pool maintains the DMA mapping, the API user
	 * must call page_pool_put_page() once.  In this elevated
	 * refcnt case, the DMA mapping is unmapped/released here, as
	 * the driver is likely doing refcnt based recycle tricks,
	 * meaning another entity will be invoking put_page().
	 */
	__page_pool_clean_page(pool, page);
	put_page(page);
}
EXPORT_SYMBOL(__page_pool_put_page);

static void __page_pool_empty_ring(struct page_pool *pool)
{
	struct page *page;

	/* Empty the recycle ring */
	while ((page = ptr_ring_consume_bh(&pool->ring))) {
		/* Verify the refcnt invariant of cached pages */
		if (!(page_ref_count(page) == 1))
			pr_crit("%s() page_pool refcnt %d violation\n",
				__func__, page_ref_count(page));

		__page_pool_return_page(pool, page);
	}
}

static void __page_pool_destroy_rcu(struct rcu_head *rcu)
{
	struct page_pool *pool;

	pool = container_of(rcu, struct page_pool, rcu);

	WARN(pool->alloc.count, "API usage violation");

	__page_pool_empty_ring(pool);
	ptr_ring_cleanup(&pool->ring, NULL);
	kfree(pool);
}

/* Cleanup and release resources */
void page_pool_destroy(struct page_pool *pool)
{
	struct page *page;

	/* Empty the alloc cache; assume the caller made sure the pool
	 * is no longer in use, and that page_pool_alloc_pages() cannot
	 * be called concurrently.
	 */
	while (pool->alloc.count) {
		page = pool->alloc.cache[--pool->alloc.count];
		__page_pool_return_page(pool, page);
	}

	/* No more consumers should exist, but producers could still
	 * be in-flight.
	 */
	__page_pool_empty_ring(pool);

	/* An xdp_mem_allocator can still hold a ref on the page_pool pointer */
	call_rcu(&pool->rcu, __page_pool_destroy_rcu);
}
EXPORT_SYMBOL(page_pool_destroy);
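
/* Usage sketch (illustrative only, not part of this file): recycling from a
 * hypothetical XDP_DROP path inside RX-NAPI, plus pool teardown.  NAPI/softirq
 * context satisfies the in_serving_softirq() check, so allow_direct recycling
 * into the alloc-side cache is safe there.  The identifiers rxq and its
 * members are assumptions for the example.
 *
 *	case XDP_DROP:
 *		__page_pool_put_page(rxq->page_pool, page, true);
 *		break;
 *
 * Teardown: quiesce the RX queue first so no allocations or consumers are
 * in flight, then release the pool:
 *
 *	napi_disable(&rxq->napi);
 *	page_pool_destroy(rxq->page_pool);
 */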