/*
 * linux/mm/slab.c
 * Written by Mark Hemment, 1996/97.
 * (markhe@nextd.demon.co.uk)
 *
 * kmem_cache_destroy() + some cleanup - 1999 Andrea Arcangeli
 *
 * Major cleanup, different bufctl logic, per-cpu arrays
 *	(c) 2000 Manfred Spraul
 *
 * Cleanup, make the head arrays unconditional, preparation for NUMA
 *	(c) 2002 Manfred Spraul
 *
 * An implementation of the Slab Allocator as described in outline in;
 *	UNIX Internals: The New Frontiers by Uresh Vahalia
 *	Pub: Prentice Hall	ISBN 0-13-101908-2
 * or with a little more detail in;
 *	The Slab Allocator: An Object-Caching Kernel Memory Allocator
 *	Jeff Bonwick (Sun Microsystems).
 *	Presented at: USENIX Summer 1994 Technical Conference
 *
 * The memory is organized in caches, one cache for each object type.
 * (e.g. inode_cache, dentry_cache, buffer_head, vm_area_struct)
 * Each cache consists of many slabs (they are small (usually one
 * page long) and always contiguous), and each slab contains multiple
 * initialized objects.
 *
 * This means that your constructor is used only for newly allocated
 * slabs and you must pass objects with the same initializations to
 * kmem_cache_free.
 *
 * Each cache can only support one memory type (GFP_DMA, GFP_HIGHMEM,
 * normal). If you need a special memory type, then you must create a new
 * cache for that memory type.
 *
 * In order to reduce fragmentation, the slabs are sorted in 3 groups:
 *   full slabs with 0 free objects
 *   partial slabs
 *   empty slabs with no allocated objects
 *
 * If partial slabs exist, then new allocations come from these slabs,
 * otherwise from empty slabs or new slabs are allocated.
 *
 * kmem_cache_destroy() CAN CRASH if you try to allocate from the cache
 * during kmem_cache_destroy(). The caller must prevent concurrent allocs.
 *
 * Each cache has a short per-cpu head array, most allocs
 * and frees go into that array, and if that array overflows, then 1/2
 * of the entries in the array are given back into the global cache.
 * The head array is strictly LIFO and should improve the cache hit rates.
 * On SMP, it additionally reduces the spinlock operations.
 *
 * The c_cpuarray may not be read with enabled local interrupts -
 * it's changed with a smp_call_function().
 *
 * SMP synchronization:
 *  constructors and destructors are called without any locking.
 *  Several members in struct kmem_cache and struct slab never change, they
 *	are accessed without any locking.
 *  The per-cpu arrays are never accessed from the wrong cpu, no locking,
 *	and local interrupts are disabled so slab code is preempt-safe.
 *  The non-constant members are protected with a per-cache irq spinlock.
 *
 * Many thanks to Mark Hemment, who wrote another per-cpu slab patch
 * in 2000 - many ideas in the current implementation are derived from
 * his patch.
 *
 * Further notes from the original documentation:
 *
 * 11 April '97.  Started multi-threading - markhe
 *	The global cache-chain is protected by the mutex 'slab_mutex'.
 *	The mutex is only needed when accessing/extending the cache-chain, which
 *	can never happen inside an interrupt (kmem_cache_create(),
 *	kmem_cache_shrink() and kmem_cache_reap()).
 *
 *	At present, each engine can be growing a cache.  This should be blocked.
 *
 * 15 March 2005. NUMA slab allocator.
 *	Shai Fultheim <shai@scalex86.org>.
 *	Shobhit Dayal <shobhit@calsoftinc.com>
 *	Alok N Kataria <alokk@calsoftinc.com>
 *	Christoph Lameter <christoph@lameter.com>
 *
 *	Modified the slab allocator to be node aware on NUMA systems.
 *	Each node has its own list of partial, free and full slabs.
 *	All object allocations for a node occur from node specific slab lists.
87 */ 88 89 #include <linux/slab.h> 90 #include <linux/mm.h> 91 #include <linux/poison.h> 92 #include <linux/swap.h> 93 #include <linux/cache.h> 94 #include <linux/interrupt.h> 95 #include <linux/init.h> 96 #include <linux/compiler.h> 97 #include <linux/cpuset.h> 98 #include <linux/proc_fs.h> 99 #include <linux/seq_file.h> 100 #include <linux/notifier.h> 101 #include <linux/kallsyms.h> 102 #include <linux/cpu.h> 103 #include <linux/sysctl.h> 104 #include <linux/module.h> 105 #include <linux/rcupdate.h> 106 #include <linux/string.h> 107 #include <linux/uaccess.h> 108 #include <linux/nodemask.h> 109 #include <linux/kmemleak.h> 110 #include <linux/mempolicy.h> 111 #include <linux/mutex.h> 112 #include <linux/fault-inject.h> 113 #include <linux/rtmutex.h> 114 #include <linux/reciprocal_div.h> 115 #include <linux/debugobjects.h> 116 #include <linux/kmemcheck.h> 117 #include <linux/memory.h> 118 #include <linux/prefetch.h> 119 120 #include <net/sock.h> 121 122 #include <asm/cacheflush.h> 123 #include <asm/tlbflush.h> 124 #include <asm/page.h> 125 126 #include <trace/events/kmem.h> 127 128 #include "internal.h" 129 130 #include "slab.h" 131 132 /* 133 * DEBUG - 1 for kmem_cache_create() to honour; SLAB_RED_ZONE & SLAB_POISON. 134 * 0 for faster, smaller code (especially in the critical paths). 135 * 136 * STATS - 1 to collect stats for /proc/slabinfo. 137 * 0 for faster, smaller code (especially in the critical paths). 138 * 139 * FORCED_DEBUG - 1 enables SLAB_RED_ZONE and SLAB_POISON (if possible) 140 */ 141 142 #ifdef CONFIG_DEBUG_SLAB 143 #define DEBUG 1 144 #define STATS 1 145 #define FORCED_DEBUG 1 146 #else 147 #define DEBUG 0 148 #define STATS 0 149 #define FORCED_DEBUG 0 150 #endif 151 152 /* Shouldn't this be in a header file somewhere? 
*/ 153 #define BYTES_PER_WORD sizeof(void *) 154 #define REDZONE_ALIGN max(BYTES_PER_WORD, __alignof__(unsigned long long)) 155 156 #ifndef ARCH_KMALLOC_FLAGS 157 #define ARCH_KMALLOC_FLAGS SLAB_HWCACHE_ALIGN 158 #endif 159 160 #define FREELIST_BYTE_INDEX (((PAGE_SIZE >> BITS_PER_BYTE) \ 161 <= SLAB_OBJ_MIN_SIZE) ? 1 : 0) 162 163 #if FREELIST_BYTE_INDEX 164 typedef unsigned char freelist_idx_t; 165 #else 166 typedef unsigned short freelist_idx_t; 167 #endif 168 169 #define SLAB_OBJ_MAX_NUM ((1 << sizeof(freelist_idx_t) * BITS_PER_BYTE) - 1) 170 171 /* 172 * true if a page was allocated from pfmemalloc reserves for network-based 173 * swap 174 */ 175 static bool pfmemalloc_active __read_mostly; 176 177 /* 178 * struct array_cache 179 * 180 * Purpose: 181 * - LIFO ordering, to hand out cache-warm objects from _alloc 182 * - reduce the number of linked list operations 183 * - reduce spinlock operations 184 * 185 * The limit is stored in the per-cpu structure to reduce the data cache 186 * footprint. 187 * 188 */ 189 struct array_cache { 190 unsigned int avail; 191 unsigned int limit; 192 unsigned int batchcount; 193 unsigned int touched; 194 void *entry[]; /* 195 * Must have this definition in here for the proper 196 * alignment of array_cache. Also simplifies accessing 197 * the entries. 
198 * 199 * Entries should not be directly dereferenced as 200 * entries belonging to slabs marked pfmemalloc will 201 * have the lower bits set SLAB_OBJ_PFMEMALLOC 202 */ 203 }; 204 205 struct alien_cache { 206 spinlock_t lock; 207 struct array_cache ac; 208 }; 209 210 #define SLAB_OBJ_PFMEMALLOC 1 211 static inline bool is_obj_pfmemalloc(void *objp) 212 { 213 return (unsigned long)objp & SLAB_OBJ_PFMEMALLOC; 214 } 215 216 static inline void set_obj_pfmemalloc(void **objp) 217 { 218 *objp = (void *)((unsigned long)*objp | SLAB_OBJ_PFMEMALLOC); 219 return; 220 } 221 222 static inline void clear_obj_pfmemalloc(void **objp) 223 { 224 *objp = (void *)((unsigned long)*objp & ~SLAB_OBJ_PFMEMALLOC); 225 } 226 227 /* 228 * bootstrap: The caches do not work without cpuarrays anymore, but the 229 * cpuarrays are allocated from the generic caches... 230 */ 231 #define BOOT_CPUCACHE_ENTRIES 1 232 struct arraycache_init { 233 struct array_cache cache; 234 void *entries[BOOT_CPUCACHE_ENTRIES]; 235 }; 236 237 /* 238 * Need this for bootstrapping a per node allocator. 
239 */ 240 #define NUM_INIT_LISTS (2 * MAX_NUMNODES) 241 static struct kmem_cache_node __initdata init_kmem_cache_node[NUM_INIT_LISTS]; 242 #define CACHE_CACHE 0 243 #define SIZE_NODE (MAX_NUMNODES) 244 245 static int drain_freelist(struct kmem_cache *cache, 246 struct kmem_cache_node *n, int tofree); 247 static void free_block(struct kmem_cache *cachep, void **objpp, int len, 248 int node, struct list_head *list); 249 static void slabs_destroy(struct kmem_cache *cachep, struct list_head *list); 250 static int enable_cpucache(struct kmem_cache *cachep, gfp_t gfp); 251 static void cache_reap(struct work_struct *unused); 252 253 static int slab_early_init = 1; 254 255 #define INDEX_NODE kmalloc_index(sizeof(struct kmem_cache_node)) 256 257 static void kmem_cache_node_init(struct kmem_cache_node *parent) 258 { 259 INIT_LIST_HEAD(&parent->slabs_full); 260 INIT_LIST_HEAD(&parent->slabs_partial); 261 INIT_LIST_HEAD(&parent->slabs_free); 262 parent->shared = NULL; 263 parent->alien = NULL; 264 parent->colour_next = 0; 265 spin_lock_init(&parent->list_lock); 266 parent->free_objects = 0; 267 parent->free_touched = 0; 268 } 269 270 #define MAKE_LIST(cachep, listp, slab, nodeid) \ 271 do { \ 272 INIT_LIST_HEAD(listp); \ 273 list_splice(&get_node(cachep, nodeid)->slab, listp); \ 274 } while (0) 275 276 #define MAKE_ALL_LISTS(cachep, ptr, nodeid) \ 277 do { \ 278 MAKE_LIST((cachep), (&(ptr)->slabs_full), slabs_full, nodeid); \ 279 MAKE_LIST((cachep), (&(ptr)->slabs_partial), slabs_partial, nodeid); \ 280 MAKE_LIST((cachep), (&(ptr)->slabs_free), slabs_free, nodeid); \ 281 } while (0) 282 283 #define CFLGS_OFF_SLAB (0x80000000UL) 284 #define OFF_SLAB(x) ((x)->flags & CFLGS_OFF_SLAB) 285 #define OFF_SLAB_MIN_SIZE (max_t(size_t, PAGE_SIZE >> 5, KMALLOC_MIN_SIZE + 1)) 286 287 #define BATCHREFILL_LIMIT 16 288 /* 289 * Optimization question: fewer reaps means less probability for unnessary 290 * cpucache drain/refill cycles. 
291 * 292 * OTOH the cpuarrays can contain lots of objects, 293 * which could lock up otherwise freeable slabs. 294 */ 295 #define REAPTIMEOUT_AC (2*HZ) 296 #define REAPTIMEOUT_NODE (4*HZ) 297 298 #if STATS 299 #define STATS_INC_ACTIVE(x) ((x)->num_active++) 300 #define STATS_DEC_ACTIVE(x) ((x)->num_active--) 301 #define STATS_INC_ALLOCED(x) ((x)->num_allocations++) 302 #define STATS_INC_GROWN(x) ((x)->grown++) 303 #define STATS_ADD_REAPED(x,y) ((x)->reaped += (y)) 304 #define STATS_SET_HIGH(x) \ 305 do { \ 306 if ((x)->num_active > (x)->high_mark) \ 307 (x)->high_mark = (x)->num_active; \ 308 } while (0) 309 #define STATS_INC_ERR(x) ((x)->errors++) 310 #define STATS_INC_NODEALLOCS(x) ((x)->node_allocs++) 311 #define STATS_INC_NODEFREES(x) ((x)->node_frees++) 312 #define STATS_INC_ACOVERFLOW(x) ((x)->node_overflow++) 313 #define STATS_SET_FREEABLE(x, i) \ 314 do { \ 315 if ((x)->max_freeable < i) \ 316 (x)->max_freeable = i; \ 317 } while (0) 318 #define STATS_INC_ALLOCHIT(x) atomic_inc(&(x)->allochit) 319 #define STATS_INC_ALLOCMISS(x) atomic_inc(&(x)->allocmiss) 320 #define STATS_INC_FREEHIT(x) atomic_inc(&(x)->freehit) 321 #define STATS_INC_FREEMISS(x) atomic_inc(&(x)->freemiss) 322 #else 323 #define STATS_INC_ACTIVE(x) do { } while (0) 324 #define STATS_DEC_ACTIVE(x) do { } while (0) 325 #define STATS_INC_ALLOCED(x) do { } while (0) 326 #define STATS_INC_GROWN(x) do { } while (0) 327 #define STATS_ADD_REAPED(x,y) do { (void)(y); } while (0) 328 #define STATS_SET_HIGH(x) do { } while (0) 329 #define STATS_INC_ERR(x) do { } while (0) 330 #define STATS_INC_NODEALLOCS(x) do { } while (0) 331 #define STATS_INC_NODEFREES(x) do { } while (0) 332 #define STATS_INC_ACOVERFLOW(x) do { } while (0) 333 #define STATS_SET_FREEABLE(x, i) do { } while (0) 334 #define STATS_INC_ALLOCHIT(x) do { } while (0) 335 #define STATS_INC_ALLOCMISS(x) do { } while (0) 336 #define STATS_INC_FREEHIT(x) do { } while (0) 337 #define STATS_INC_FREEMISS(x) do { } while (0) 338 #endif 339 340 
#if DEBUG 341 342 /* 343 * memory layout of objects: 344 * 0 : objp 345 * 0 .. cachep->obj_offset - BYTES_PER_WORD - 1: padding. This ensures that 346 * the end of an object is aligned with the end of the real 347 * allocation. Catches writes behind the end of the allocation. 348 * cachep->obj_offset - BYTES_PER_WORD .. cachep->obj_offset - 1: 349 * redzone word. 350 * cachep->obj_offset: The real object. 351 * cachep->size - 2* BYTES_PER_WORD: redzone word [BYTES_PER_WORD long] 352 * cachep->size - 1* BYTES_PER_WORD: last caller address 353 * [BYTES_PER_WORD long] 354 */ 355 static int obj_offset(struct kmem_cache *cachep) 356 { 357 return cachep->obj_offset; 358 } 359 360 static unsigned long long *dbg_redzone1(struct kmem_cache *cachep, void *objp) 361 { 362 BUG_ON(!(cachep->flags & SLAB_RED_ZONE)); 363 return (unsigned long long*) (objp + obj_offset(cachep) - 364 sizeof(unsigned long long)); 365 } 366 367 static unsigned long long *dbg_redzone2(struct kmem_cache *cachep, void *objp) 368 { 369 BUG_ON(!(cachep->flags & SLAB_RED_ZONE)); 370 if (cachep->flags & SLAB_STORE_USER) 371 return (unsigned long long *)(objp + cachep->size - 372 sizeof(unsigned long long) - 373 REDZONE_ALIGN); 374 return (unsigned long long *) (objp + cachep->size - 375 sizeof(unsigned long long)); 376 } 377 378 static void **dbg_userword(struct kmem_cache *cachep, void *objp) 379 { 380 BUG_ON(!(cachep->flags & SLAB_STORE_USER)); 381 return (void **)(objp + cachep->size - BYTES_PER_WORD); 382 } 383 384 #else 385 386 #define obj_offset(x) 0 387 #define dbg_redzone1(cachep, objp) ({BUG(); (unsigned long long *)NULL;}) 388 #define dbg_redzone2(cachep, objp) ({BUG(); (unsigned long long *)NULL;}) 389 #define dbg_userword(cachep, objp) ({BUG(); (void **)NULL;}) 390 391 #endif 392 393 #define OBJECT_FREE (0) 394 #define OBJECT_ACTIVE (1) 395 396 #ifdef CONFIG_DEBUG_SLAB_LEAK 397 398 static void set_obj_status(struct page *page, int idx, int val) 399 { 400 int freelist_size; 401 char *status; 402 
struct kmem_cache *cachep = page->slab_cache; 403 404 freelist_size = cachep->num * sizeof(freelist_idx_t); 405 status = (char *)page->freelist + freelist_size; 406 status[idx] = val; 407 } 408 409 static inline unsigned int get_obj_status(struct page *page, int idx) 410 { 411 int freelist_size; 412 char *status; 413 struct kmem_cache *cachep = page->slab_cache; 414 415 freelist_size = cachep->num * sizeof(freelist_idx_t); 416 status = (char *)page->freelist + freelist_size; 417 418 return status[idx]; 419 } 420 421 #else 422 static inline void set_obj_status(struct page *page, int idx, int val) {} 423 424 #endif 425 426 /* 427 * Do not go above this order unless 0 objects fit into the slab or 428 * overridden on the command line. 429 */ 430 #define SLAB_MAX_ORDER_HI 1 431 #define SLAB_MAX_ORDER_LO 0 432 static int slab_max_order = SLAB_MAX_ORDER_LO; 433 static bool slab_max_order_set __initdata; 434 435 static inline struct kmem_cache *virt_to_cache(const void *obj) 436 { 437 struct page *page = virt_to_head_page(obj); 438 return page->slab_cache; 439 } 440 441 static inline void *index_to_obj(struct kmem_cache *cache, struct page *page, 442 unsigned int idx) 443 { 444 return page->s_mem + cache->size * idx; 445 } 446 447 /* 448 * We want to avoid an expensive divide : (offset / cache->size) 449 * Using the fact that size is a constant for a particular cache, 450 * we can replace (offset / cache->size) by 451 * reciprocal_divide(offset, cache->reciprocal_buffer_size) 452 */ 453 static inline unsigned int obj_to_index(const struct kmem_cache *cache, 454 const struct page *page, void *obj) 455 { 456 u32 offset = (obj - page->s_mem); 457 return reciprocal_divide(offset, cache->reciprocal_buffer_size); 458 } 459 460 /* internal cache of cache description objs */ 461 static struct kmem_cache kmem_cache_boot = { 462 .batchcount = 1, 463 .limit = BOOT_CPUCACHE_ENTRIES, 464 .shared = 1, 465 .size = sizeof(struct kmem_cache), 466 .name = "kmem_cache", 467 }; 468 469 
#define BAD_ALIEN_MAGIC 0x01020304ul 470 471 static DEFINE_PER_CPU(struct delayed_work, slab_reap_work); 472 473 static inline struct array_cache *cpu_cache_get(struct kmem_cache *cachep) 474 { 475 return this_cpu_ptr(cachep->cpu_cache); 476 } 477 478 static size_t calculate_freelist_size(int nr_objs, size_t align) 479 { 480 size_t freelist_size; 481 482 freelist_size = nr_objs * sizeof(freelist_idx_t); 483 if (IS_ENABLED(CONFIG_DEBUG_SLAB_LEAK)) 484 freelist_size += nr_objs * sizeof(char); 485 486 if (align) 487 freelist_size = ALIGN(freelist_size, align); 488 489 return freelist_size; 490 } 491 492 static int calculate_nr_objs(size_t slab_size, size_t buffer_size, 493 size_t idx_size, size_t align) 494 { 495 int nr_objs; 496 size_t remained_size; 497 size_t freelist_size; 498 int extra_space = 0; 499 500 if (IS_ENABLED(CONFIG_DEBUG_SLAB_LEAK)) 501 extra_space = sizeof(char); 502 /* 503 * Ignore padding for the initial guess. The padding 504 * is at most @align-1 bytes, and @buffer_size is at 505 * least @align. In the worst case, this result will 506 * be one greater than the number of objects that fit 507 * into the memory allocation when taking the padding 508 * into account. 509 */ 510 nr_objs = slab_size / (buffer_size + idx_size + extra_space); 511 512 /* 513 * This calculated number will be either the right 514 * amount, or one greater than what we want. 515 */ 516 remained_size = slab_size - nr_objs * buffer_size; 517 freelist_size = calculate_freelist_size(nr_objs, align); 518 if (remained_size < freelist_size) 519 nr_objs--; 520 521 return nr_objs; 522 } 523 524 /* 525 * Calculate the number of objects and left-over bytes for a given buffer size. 
526 */ 527 static void cache_estimate(unsigned long gfporder, size_t buffer_size, 528 size_t align, int flags, size_t *left_over, 529 unsigned int *num) 530 { 531 int nr_objs; 532 size_t mgmt_size; 533 size_t slab_size = PAGE_SIZE << gfporder; 534 535 /* 536 * The slab management structure can be either off the slab or 537 * on it. For the latter case, the memory allocated for a 538 * slab is used for: 539 * 540 * - One unsigned int for each object 541 * - Padding to respect alignment of @align 542 * - @buffer_size bytes for each object 543 * 544 * If the slab management structure is off the slab, then the 545 * alignment will already be calculated into the size. Because 546 * the slabs are all pages aligned, the objects will be at the 547 * correct alignment when allocated. 548 */ 549 if (flags & CFLGS_OFF_SLAB) { 550 mgmt_size = 0; 551 nr_objs = slab_size / buffer_size; 552 553 } else { 554 nr_objs = calculate_nr_objs(slab_size, buffer_size, 555 sizeof(freelist_idx_t), align); 556 mgmt_size = calculate_freelist_size(nr_objs, align); 557 } 558 *num = nr_objs; 559 *left_over = slab_size - nr_objs*buffer_size - mgmt_size; 560 } 561 562 #if DEBUG 563 #define slab_error(cachep, msg) __slab_error(__func__, cachep, msg) 564 565 static void __slab_error(const char *function, struct kmem_cache *cachep, 566 char *msg) 567 { 568 printk(KERN_ERR "slab error in %s(): cache `%s': %s\n", 569 function, cachep->name, msg); 570 dump_stack(); 571 add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE); 572 } 573 #endif 574 575 /* 576 * By default on NUMA we use alien caches to stage the freeing of 577 * objects allocated from other nodes. 
This causes massive memory 578 * inefficiencies when using fake NUMA setup to split memory into a 579 * large number of small nodes, so it can be disabled on the command 580 * line 581 */ 582 583 static int use_alien_caches __read_mostly = 1; 584 static int __init noaliencache_setup(char *s) 585 { 586 use_alien_caches = 0; 587 return 1; 588 } 589 __setup("noaliencache", noaliencache_setup); 590 591 static int __init slab_max_order_setup(char *str) 592 { 593 get_option(&str, &slab_max_order); 594 slab_max_order = slab_max_order < 0 ? 0 : 595 min(slab_max_order, MAX_ORDER - 1); 596 slab_max_order_set = true; 597 598 return 1; 599 } 600 __setup("slab_max_order=", slab_max_order_setup); 601 602 #ifdef CONFIG_NUMA 603 /* 604 * Special reaping functions for NUMA systems called from cache_reap(). 605 * These take care of doing round robin flushing of alien caches (containing 606 * objects freed on different nodes from which they were allocated) and the 607 * flushing of remote pcps by calling drain_node_pages. 608 */ 609 static DEFINE_PER_CPU(unsigned long, slab_reap_node); 610 611 static void init_reap_node(int cpu) 612 { 613 int node; 614 615 node = next_node(cpu_to_mem(cpu), node_online_map); 616 if (node == MAX_NUMNODES) 617 node = first_node(node_online_map); 618 619 per_cpu(slab_reap_node, cpu) = node; 620 } 621 622 static void next_reap_node(void) 623 { 624 int node = __this_cpu_read(slab_reap_node); 625 626 node = next_node(node, node_online_map); 627 if (unlikely(node >= MAX_NUMNODES)) 628 node = first_node(node_online_map); 629 __this_cpu_write(slab_reap_node, node); 630 } 631 632 #else 633 #define init_reap_node(cpu) do { } while (0) 634 #define next_reap_node(void) do { } while (0) 635 #endif 636 637 /* 638 * Initiate the reap timer running on the target CPU. We run at around 1 to 2Hz 639 * via the workqueue/eventd. 
640 * Add the CPU number into the expiration time to minimize the possibility of 641 * the CPUs getting into lockstep and contending for the global cache chain 642 * lock. 643 */ 644 static void start_cpu_timer(int cpu) 645 { 646 struct delayed_work *reap_work = &per_cpu(slab_reap_work, cpu); 647 648 /* 649 * When this gets called from do_initcalls via cpucache_init(), 650 * init_workqueues() has already run, so keventd will be setup 651 * at that time. 652 */ 653 if (keventd_up() && reap_work->work.func == NULL) { 654 init_reap_node(cpu); 655 INIT_DEFERRABLE_WORK(reap_work, cache_reap); 656 schedule_delayed_work_on(cpu, reap_work, 657 __round_jiffies_relative(HZ, cpu)); 658 } 659 } 660 661 static void init_arraycache(struct array_cache *ac, int limit, int batch) 662 { 663 /* 664 * The array_cache structures contain pointers to free object. 665 * However, when such objects are allocated or transferred to another 666 * cache the pointers are not cleared and they could be counted as 667 * valid references during a kmemleak scan. Therefore, kmemleak must 668 * not scan such objects. 
669 */ 670 kmemleak_no_scan(ac); 671 if (ac) { 672 ac->avail = 0; 673 ac->limit = limit; 674 ac->batchcount = batch; 675 ac->touched = 0; 676 } 677 } 678 679 static struct array_cache *alloc_arraycache(int node, int entries, 680 int batchcount, gfp_t gfp) 681 { 682 size_t memsize = sizeof(void *) * entries + sizeof(struct array_cache); 683 struct array_cache *ac = NULL; 684 685 ac = kmalloc_node(memsize, gfp, node); 686 init_arraycache(ac, entries, batchcount); 687 return ac; 688 } 689 690 static inline bool is_slab_pfmemalloc(struct page *page) 691 { 692 return PageSlabPfmemalloc(page); 693 } 694 695 /* Clears pfmemalloc_active if no slabs have pfmalloc set */ 696 static void recheck_pfmemalloc_active(struct kmem_cache *cachep, 697 struct array_cache *ac) 698 { 699 struct kmem_cache_node *n = get_node(cachep, numa_mem_id()); 700 struct page *page; 701 unsigned long flags; 702 703 if (!pfmemalloc_active) 704 return; 705 706 spin_lock_irqsave(&n->list_lock, flags); 707 list_for_each_entry(page, &n->slabs_full, lru) 708 if (is_slab_pfmemalloc(page)) 709 goto out; 710 711 list_for_each_entry(page, &n->slabs_partial, lru) 712 if (is_slab_pfmemalloc(page)) 713 goto out; 714 715 list_for_each_entry(page, &n->slabs_free, lru) 716 if (is_slab_pfmemalloc(page)) 717 goto out; 718 719 pfmemalloc_active = false; 720 out: 721 spin_unlock_irqrestore(&n->list_lock, flags); 722 } 723 724 static void *__ac_get_obj(struct kmem_cache *cachep, struct array_cache *ac, 725 gfp_t flags, bool force_refill) 726 { 727 int i; 728 void *objp = ac->entry[--ac->avail]; 729 730 /* Ensure the caller is allowed to use objects from PFMEMALLOC slab */ 731 if (unlikely(is_obj_pfmemalloc(objp))) { 732 struct kmem_cache_node *n; 733 734 if (gfp_pfmemalloc_allowed(flags)) { 735 clear_obj_pfmemalloc(&objp); 736 return objp; 737 } 738 739 /* The caller cannot use PFMEMALLOC objects, find another one */ 740 for (i = 0; i < ac->avail; i++) { 741 /* If a !PFMEMALLOC object is found, swap them */ 742 if 
(!is_obj_pfmemalloc(ac->entry[i])) { 743 objp = ac->entry[i]; 744 ac->entry[i] = ac->entry[ac->avail]; 745 ac->entry[ac->avail] = objp; 746 return objp; 747 } 748 } 749 750 /* 751 * If there are empty slabs on the slabs_free list and we are 752 * being forced to refill the cache, mark this one !pfmemalloc. 753 */ 754 n = get_node(cachep, numa_mem_id()); 755 if (!list_empty(&n->slabs_free) && force_refill) { 756 struct page *page = virt_to_head_page(objp); 757 ClearPageSlabPfmemalloc(page); 758 clear_obj_pfmemalloc(&objp); 759 recheck_pfmemalloc_active(cachep, ac); 760 return objp; 761 } 762 763 /* No !PFMEMALLOC objects available */ 764 ac->avail++; 765 objp = NULL; 766 } 767 768 return objp; 769 } 770 771 static inline void *ac_get_obj(struct kmem_cache *cachep, 772 struct array_cache *ac, gfp_t flags, bool force_refill) 773 { 774 void *objp; 775 776 if (unlikely(sk_memalloc_socks())) 777 objp = __ac_get_obj(cachep, ac, flags, force_refill); 778 else 779 objp = ac->entry[--ac->avail]; 780 781 return objp; 782 } 783 784 static noinline void *__ac_put_obj(struct kmem_cache *cachep, 785 struct array_cache *ac, void *objp) 786 { 787 if (unlikely(pfmemalloc_active)) { 788 /* Some pfmemalloc slabs exist, check if this is one */ 789 struct page *page = virt_to_head_page(objp); 790 if (PageSlabPfmemalloc(page)) 791 set_obj_pfmemalloc(&objp); 792 } 793 794 return objp; 795 } 796 797 static inline void ac_put_obj(struct kmem_cache *cachep, struct array_cache *ac, 798 void *objp) 799 { 800 if (unlikely(sk_memalloc_socks())) 801 objp = __ac_put_obj(cachep, ac, objp); 802 803 ac->entry[ac->avail++] = objp; 804 } 805 806 /* 807 * Transfer objects in one arraycache to another. 808 * Locking must be handled by the caller. 809 * 810 * Return the number of entries transferred. 
811 */ 812 static int transfer_objects(struct array_cache *to, 813 struct array_cache *from, unsigned int max) 814 { 815 /* Figure out how many entries to transfer */ 816 int nr = min3(from->avail, max, to->limit - to->avail); 817 818 if (!nr) 819 return 0; 820 821 memcpy(to->entry + to->avail, from->entry + from->avail -nr, 822 sizeof(void *) *nr); 823 824 from->avail -= nr; 825 to->avail += nr; 826 return nr; 827 } 828 829 #ifndef CONFIG_NUMA 830 831 #define drain_alien_cache(cachep, alien) do { } while (0) 832 #define reap_alien(cachep, n) do { } while (0) 833 834 static inline struct alien_cache **alloc_alien_cache(int node, 835 int limit, gfp_t gfp) 836 { 837 return (struct alien_cache **)BAD_ALIEN_MAGIC; 838 } 839 840 static inline void free_alien_cache(struct alien_cache **ac_ptr) 841 { 842 } 843 844 static inline int cache_free_alien(struct kmem_cache *cachep, void *objp) 845 { 846 return 0; 847 } 848 849 static inline void *alternate_node_alloc(struct kmem_cache *cachep, 850 gfp_t flags) 851 { 852 return NULL; 853 } 854 855 static inline void *____cache_alloc_node(struct kmem_cache *cachep, 856 gfp_t flags, int nodeid) 857 { 858 return NULL; 859 } 860 861 static inline gfp_t gfp_exact_node(gfp_t flags) 862 { 863 return flags; 864 } 865 866 #else /* CONFIG_NUMA */ 867 868 static void *____cache_alloc_node(struct kmem_cache *, gfp_t, int); 869 static void *alternate_node_alloc(struct kmem_cache *, gfp_t); 870 871 static struct alien_cache *__alloc_alien_cache(int node, int entries, 872 int batch, gfp_t gfp) 873 { 874 size_t memsize = sizeof(void *) * entries + sizeof(struct alien_cache); 875 struct alien_cache *alc = NULL; 876 877 alc = kmalloc_node(memsize, gfp, node); 878 init_arraycache(&alc->ac, entries, batch); 879 spin_lock_init(&alc->lock); 880 return alc; 881 } 882 883 static struct alien_cache **alloc_alien_cache(int node, int limit, gfp_t gfp) 884 { 885 struct alien_cache **alc_ptr; 886 size_t memsize = sizeof(void *) * nr_node_ids; 887 int i; 888 
889 if (limit > 1) 890 limit = 12; 891 alc_ptr = kzalloc_node(memsize, gfp, node); 892 if (!alc_ptr) 893 return NULL; 894 895 for_each_node(i) { 896 if (i == node || !node_online(i)) 897 continue; 898 alc_ptr[i] = __alloc_alien_cache(node, limit, 0xbaadf00d, gfp); 899 if (!alc_ptr[i]) { 900 for (i--; i >= 0; i--) 901 kfree(alc_ptr[i]); 902 kfree(alc_ptr); 903 return NULL; 904 } 905 } 906 return alc_ptr; 907 } 908 909 static void free_alien_cache(struct alien_cache **alc_ptr) 910 { 911 int i; 912 913 if (!alc_ptr) 914 return; 915 for_each_node(i) 916 kfree(alc_ptr[i]); 917 kfree(alc_ptr); 918 } 919 920 static void __drain_alien_cache(struct kmem_cache *cachep, 921 struct array_cache *ac, int node, 922 struct list_head *list) 923 { 924 struct kmem_cache_node *n = get_node(cachep, node); 925 926 if (ac->avail) { 927 spin_lock(&n->list_lock); 928 /* 929 * Stuff objects into the remote nodes shared array first. 930 * That way we could avoid the overhead of putting the objects 931 * into the free lists and getting them back later. 932 */ 933 if (n->shared) 934 transfer_objects(n->shared, ac, ac->limit); 935 936 free_block(cachep, ac->entry, ac->avail, node, list); 937 ac->avail = 0; 938 spin_unlock(&n->list_lock); 939 } 940 } 941 942 /* 943 * Called from cache_reap() to regularly drain alien caches round robin. 
944 */ 945 static void reap_alien(struct kmem_cache *cachep, struct kmem_cache_node *n) 946 { 947 int node = __this_cpu_read(slab_reap_node); 948 949 if (n->alien) { 950 struct alien_cache *alc = n->alien[node]; 951 struct array_cache *ac; 952 953 if (alc) { 954 ac = &alc->ac; 955 if (ac->avail && spin_trylock_irq(&alc->lock)) { 956 LIST_HEAD(list); 957 958 __drain_alien_cache(cachep, ac, node, &list); 959 spin_unlock_irq(&alc->lock); 960 slabs_destroy(cachep, &list); 961 } 962 } 963 } 964 } 965 966 static void drain_alien_cache(struct kmem_cache *cachep, 967 struct alien_cache **alien) 968 { 969 int i = 0; 970 struct alien_cache *alc; 971 struct array_cache *ac; 972 unsigned long flags; 973 974 for_each_online_node(i) { 975 alc = alien[i]; 976 if (alc) { 977 LIST_HEAD(list); 978 979 ac = &alc->ac; 980 spin_lock_irqsave(&alc->lock, flags); 981 __drain_alien_cache(cachep, ac, i, &list); 982 spin_unlock_irqrestore(&alc->lock, flags); 983 slabs_destroy(cachep, &list); 984 } 985 } 986 } 987 988 static int __cache_free_alien(struct kmem_cache *cachep, void *objp, 989 int node, int page_node) 990 { 991 struct kmem_cache_node *n; 992 struct alien_cache *alien = NULL; 993 struct array_cache *ac; 994 LIST_HEAD(list); 995 996 n = get_node(cachep, node); 997 STATS_INC_NODEFREES(cachep); 998 if (n->alien && n->alien[page_node]) { 999 alien = n->alien[page_node]; 1000 ac = &alien->ac; 1001 spin_lock(&alien->lock); 1002 if (unlikely(ac->avail == ac->limit)) { 1003 STATS_INC_ACOVERFLOW(cachep); 1004 __drain_alien_cache(cachep, ac, page_node, &list); 1005 } 1006 ac_put_obj(cachep, ac, objp); 1007 spin_unlock(&alien->lock); 1008 slabs_destroy(cachep, &list); 1009 } else { 1010 n = get_node(cachep, page_node); 1011 spin_lock(&n->list_lock); 1012 free_block(cachep, &objp, 1, page_node, &list); 1013 spin_unlock(&n->list_lock); 1014 slabs_destroy(cachep, &list); 1015 } 1016 return 1; 1017 } 1018 1019 static inline int cache_free_alien(struct kmem_cache *cachep, void *objp) 1020 { 1021 
int page_node = page_to_nid(virt_to_page(objp)); 1022 int node = numa_mem_id(); 1023 /* 1024 * Make sure we are not freeing a object from another node to the array 1025 * cache on this cpu. 1026 */ 1027 if (likely(node == page_node)) 1028 return 0; 1029 1030 return __cache_free_alien(cachep, objp, node, page_node); 1031 } 1032 1033 /* 1034 * Construct gfp mask to allocate from a specific node but do not direct reclaim 1035 * or warn about failures. kswapd may still wake to reclaim in the background. 1036 */ 1037 static inline gfp_t gfp_exact_node(gfp_t flags) 1038 { 1039 return (flags | __GFP_THISNODE | __GFP_NOWARN) & ~__GFP_DIRECT_RECLAIM; 1040 } 1041 #endif 1042 1043 /* 1044 * Allocates and initializes node for a node on each slab cache, used for 1045 * either memory or cpu hotplug. If memory is being hot-added, the kmem_cache_node 1046 * will be allocated off-node since memory is not yet online for the new node. 1047 * When hotplugging memory or a cpu, existing node are not replaced if 1048 * already in use. 1049 * 1050 * Must hold slab_mutex. 1051 */ 1052 static int init_cache_node_node(int node) 1053 { 1054 struct kmem_cache *cachep; 1055 struct kmem_cache_node *n; 1056 const size_t memsize = sizeof(struct kmem_cache_node); 1057 1058 list_for_each_entry(cachep, &slab_caches, list) { 1059 /* 1060 * Set up the kmem_cache_node for cpu before we can 1061 * begin anything. Make sure some other cpu on this 1062 * node has not already allocated this 1063 */ 1064 n = get_node(cachep, node); 1065 if (!n) { 1066 n = kmalloc_node(memsize, GFP_KERNEL, node); 1067 if (!n) 1068 return -ENOMEM; 1069 kmem_cache_node_init(n); 1070 n->next_reap = jiffies + REAPTIMEOUT_NODE + 1071 ((unsigned long)cachep) % REAPTIMEOUT_NODE; 1072 1073 /* 1074 * The kmem_cache_nodes don't come and go as CPUs 1075 * come and go. slab_mutex is sufficient 1076 * protection here. 
1077 */ 1078 cachep->node[node] = n; 1079 } 1080 1081 spin_lock_irq(&n->list_lock); 1082 n->free_limit = 1083 (1 + nr_cpus_node(node)) * 1084 cachep->batchcount + cachep->num; 1085 spin_unlock_irq(&n->list_lock); 1086 } 1087 return 0; 1088 } 1089 1090 static inline int slabs_tofree(struct kmem_cache *cachep, 1091 struct kmem_cache_node *n) 1092 { 1093 return (n->free_objects + cachep->num - 1) / cachep->num; 1094 } 1095 1096 static void cpuup_canceled(long cpu) 1097 { 1098 struct kmem_cache *cachep; 1099 struct kmem_cache_node *n = NULL; 1100 int node = cpu_to_mem(cpu); 1101 const struct cpumask *mask = cpumask_of_node(node); 1102 1103 list_for_each_entry(cachep, &slab_caches, list) { 1104 struct array_cache *nc; 1105 struct array_cache *shared; 1106 struct alien_cache **alien; 1107 LIST_HEAD(list); 1108 1109 n = get_node(cachep, node); 1110 if (!n) 1111 continue; 1112 1113 spin_lock_irq(&n->list_lock); 1114 1115 /* Free limit for this kmem_cache_node */ 1116 n->free_limit -= cachep->batchcount; 1117 1118 /* cpu is dead; no one can alloc from it. */ 1119 nc = per_cpu_ptr(cachep->cpu_cache, cpu); 1120 if (nc) { 1121 free_block(cachep, nc->entry, nc->avail, node, &list); 1122 nc->avail = 0; 1123 } 1124 1125 if (!cpumask_empty(mask)) { 1126 spin_unlock_irq(&n->list_lock); 1127 goto free_slab; 1128 } 1129 1130 shared = n->shared; 1131 if (shared) { 1132 free_block(cachep, shared->entry, 1133 shared->avail, node, &list); 1134 n->shared = NULL; 1135 } 1136 1137 alien = n->alien; 1138 n->alien = NULL; 1139 1140 spin_unlock_irq(&n->list_lock); 1141 1142 kfree(shared); 1143 if (alien) { 1144 drain_alien_cache(cachep, alien); 1145 free_alien_cache(alien); 1146 } 1147 1148 free_slab: 1149 slabs_destroy(cachep, &list); 1150 } 1151 /* 1152 * In the previous loop, all the objects were freed to 1153 * the respective cache's slabs, now we can go ahead and 1154 * shrink each nodelist to its limit. 
1155 */ 1156 list_for_each_entry(cachep, &slab_caches, list) { 1157 n = get_node(cachep, node); 1158 if (!n) 1159 continue; 1160 drain_freelist(cachep, n, slabs_tofree(cachep, n)); 1161 } 1162 } 1163 1164 static int cpuup_prepare(long cpu) 1165 { 1166 struct kmem_cache *cachep; 1167 struct kmem_cache_node *n = NULL; 1168 int node = cpu_to_mem(cpu); 1169 int err; 1170 1171 /* 1172 * We need to do this right in the beginning since 1173 * alloc_arraycache's are going to use this list. 1174 * kmalloc_node allows us to add the slab to the right 1175 * kmem_cache_node and not this cpu's kmem_cache_node 1176 */ 1177 err = init_cache_node_node(node); 1178 if (err < 0) 1179 goto bad; 1180 1181 /* 1182 * Now we can go ahead with allocating the shared arrays and 1183 * array caches 1184 */ 1185 list_for_each_entry(cachep, &slab_caches, list) { 1186 struct array_cache *shared = NULL; 1187 struct alien_cache **alien = NULL; 1188 1189 if (cachep->shared) { 1190 shared = alloc_arraycache(node, 1191 cachep->shared * cachep->batchcount, 1192 0xbaadf00d, GFP_KERNEL); 1193 if (!shared) 1194 goto bad; 1195 } 1196 if (use_alien_caches) { 1197 alien = alloc_alien_cache(node, cachep->limit, GFP_KERNEL); 1198 if (!alien) { 1199 kfree(shared); 1200 goto bad; 1201 } 1202 } 1203 n = get_node(cachep, node); 1204 BUG_ON(!n); 1205 1206 spin_lock_irq(&n->list_lock); 1207 if (!n->shared) { 1208 /* 1209 * We are serialised from CPU_DEAD or 1210 * CPU_UP_CANCELLED by the cpucontrol lock 1211 */ 1212 n->shared = shared; 1213 shared = NULL; 1214 } 1215 #ifdef CONFIG_NUMA 1216 if (!n->alien) { 1217 n->alien = alien; 1218 alien = NULL; 1219 } 1220 #endif 1221 spin_unlock_irq(&n->list_lock); 1222 kfree(shared); 1223 free_alien_cache(alien); 1224 } 1225 1226 return 0; 1227 bad: 1228 cpuup_canceled(cpu); 1229 return -ENOMEM; 1230 } 1231 1232 static int cpuup_callback(struct notifier_block *nfb, 1233 unsigned long action, void *hcpu) 1234 { 1235 long cpu = (long)hcpu; 1236 int err = 0; 1237 1238 switch 
(action) { 1239 case CPU_UP_PREPARE: 1240 case CPU_UP_PREPARE_FROZEN: 1241 mutex_lock(&slab_mutex); 1242 err = cpuup_prepare(cpu); 1243 mutex_unlock(&slab_mutex); 1244 break; 1245 case CPU_ONLINE: 1246 case CPU_ONLINE_FROZEN: 1247 start_cpu_timer(cpu); 1248 break; 1249 #ifdef CONFIG_HOTPLUG_CPU 1250 case CPU_DOWN_PREPARE: 1251 case CPU_DOWN_PREPARE_FROZEN: 1252 /* 1253 * Shutdown cache reaper. Note that the slab_mutex is 1254 * held so that if cache_reap() is invoked it cannot do 1255 * anything expensive but will only modify reap_work 1256 * and reschedule the timer. 1257 */ 1258 cancel_delayed_work_sync(&per_cpu(slab_reap_work, cpu)); 1259 /* Now the cache_reaper is guaranteed to be not running. */ 1260 per_cpu(slab_reap_work, cpu).work.func = NULL; 1261 break; 1262 case CPU_DOWN_FAILED: 1263 case CPU_DOWN_FAILED_FROZEN: 1264 start_cpu_timer(cpu); 1265 break; 1266 case CPU_DEAD: 1267 case CPU_DEAD_FROZEN: 1268 /* 1269 * Even if all the cpus of a node are down, we don't free the 1270 * kmem_cache_node of any cache. This to avoid a race between 1271 * cpu_down, and a kmalloc allocation from another cpu for 1272 * memory from the node of the cpu going down. The node 1273 * structure is usually allocated from kmem_cache_create() and 1274 * gets destroyed at kmem_cache_destroy(). 1275 */ 1276 /* fall through */ 1277 #endif 1278 case CPU_UP_CANCELED: 1279 case CPU_UP_CANCELED_FROZEN: 1280 mutex_lock(&slab_mutex); 1281 cpuup_canceled(cpu); 1282 mutex_unlock(&slab_mutex); 1283 break; 1284 } 1285 return notifier_from_errno(err); 1286 } 1287 1288 static struct notifier_block cpucache_notifier = { 1289 &cpuup_callback, NULL, 0 1290 }; 1291 1292 #if defined(CONFIG_NUMA) && defined(CONFIG_MEMORY_HOTPLUG) 1293 /* 1294 * Drains freelist for a node on each slab cache, used for memory hot-remove. 1295 * Returns -EBUSY if all objects cannot be drained so that the node is not 1296 * removed. 1297 * 1298 * Must hold slab_mutex. 
1299 */ 1300 static int __meminit drain_cache_node_node(int node) 1301 { 1302 struct kmem_cache *cachep; 1303 int ret = 0; 1304 1305 list_for_each_entry(cachep, &slab_caches, list) { 1306 struct kmem_cache_node *n; 1307 1308 n = get_node(cachep, node); 1309 if (!n) 1310 continue; 1311 1312 drain_freelist(cachep, n, slabs_tofree(cachep, n)); 1313 1314 if (!list_empty(&n->slabs_full) || 1315 !list_empty(&n->slabs_partial)) { 1316 ret = -EBUSY; 1317 break; 1318 } 1319 } 1320 return ret; 1321 } 1322 1323 static int __meminit slab_memory_callback(struct notifier_block *self, 1324 unsigned long action, void *arg) 1325 { 1326 struct memory_notify *mnb = arg; 1327 int ret = 0; 1328 int nid; 1329 1330 nid = mnb->status_change_nid; 1331 if (nid < 0) 1332 goto out; 1333 1334 switch (action) { 1335 case MEM_GOING_ONLINE: 1336 mutex_lock(&slab_mutex); 1337 ret = init_cache_node_node(nid); 1338 mutex_unlock(&slab_mutex); 1339 break; 1340 case MEM_GOING_OFFLINE: 1341 mutex_lock(&slab_mutex); 1342 ret = drain_cache_node_node(nid); 1343 mutex_unlock(&slab_mutex); 1344 break; 1345 case MEM_ONLINE: 1346 case MEM_OFFLINE: 1347 case MEM_CANCEL_ONLINE: 1348 case MEM_CANCEL_OFFLINE: 1349 break; 1350 } 1351 out: 1352 return notifier_from_errno(ret); 1353 } 1354 #endif /* CONFIG_NUMA && CONFIG_MEMORY_HOTPLUG */ 1355 1356 /* 1357 * swap the static kmem_cache_node with kmalloced memory 1358 */ 1359 static void __init init_list(struct kmem_cache *cachep, struct kmem_cache_node *list, 1360 int nodeid) 1361 { 1362 struct kmem_cache_node *ptr; 1363 1364 ptr = kmalloc_node(sizeof(struct kmem_cache_node), GFP_NOWAIT, nodeid); 1365 BUG_ON(!ptr); 1366 1367 memcpy(ptr, list, sizeof(struct kmem_cache_node)); 1368 /* 1369 * Do not assume that spinlocks can be initialized via memcpy: 1370 */ 1371 spin_lock_init(&ptr->list_lock); 1372 1373 MAKE_ALL_LISTS(cachep, ptr, nodeid); 1374 cachep->node[nodeid] = ptr; 1375 } 1376 1377 /* 1378 * For setting up all the kmem_cache_node for cache whose buffer_size is 
same as 1379 * size of kmem_cache_node. 1380 */ 1381 static void __init set_up_node(struct kmem_cache *cachep, int index) 1382 { 1383 int node; 1384 1385 for_each_online_node(node) { 1386 cachep->node[node] = &init_kmem_cache_node[index + node]; 1387 cachep->node[node]->next_reap = jiffies + 1388 REAPTIMEOUT_NODE + 1389 ((unsigned long)cachep) % REAPTIMEOUT_NODE; 1390 } 1391 } 1392 1393 /* 1394 * Initialisation. Called after the page allocator have been initialised and 1395 * before smp_init(). 1396 */ 1397 void __init kmem_cache_init(void) 1398 { 1399 int i; 1400 1401 BUILD_BUG_ON(sizeof(((struct page *)NULL)->lru) < 1402 sizeof(struct rcu_head)); 1403 kmem_cache = &kmem_cache_boot; 1404 1405 if (num_possible_nodes() == 1) 1406 use_alien_caches = 0; 1407 1408 for (i = 0; i < NUM_INIT_LISTS; i++) 1409 kmem_cache_node_init(&init_kmem_cache_node[i]); 1410 1411 /* 1412 * Fragmentation resistance on low memory - only use bigger 1413 * page orders on machines with more than 32MB of memory if 1414 * not overridden on the command line. 1415 */ 1416 if (!slab_max_order_set && totalram_pages > (32 << 20) >> PAGE_SHIFT) 1417 slab_max_order = SLAB_MAX_ORDER_HI; 1418 1419 /* Bootstrap is tricky, because several objects are allocated 1420 * from caches that do not exist yet: 1421 * 1) initialize the kmem_cache cache: it contains the struct 1422 * kmem_cache structures of all caches, except kmem_cache itself: 1423 * kmem_cache is statically allocated. 1424 * Initially an __init data area is used for the head array and the 1425 * kmem_cache_node structures, it's replaced with a kmalloc allocated 1426 * array at the end of the bootstrap. 1427 * 2) Create the first kmalloc cache. 1428 * The struct kmem_cache for the new cache is allocated normally. 1429 * An __init data area is used for the head array. 1430 * 3) Create the remaining kmalloc caches, with minimally sized 1431 * head arrays. 
1432 * 4) Replace the __init data head arrays for kmem_cache and the first 1433 * kmalloc cache with kmalloc allocated arrays. 1434 * 5) Replace the __init data for kmem_cache_node for kmem_cache and 1435 * the other cache's with kmalloc allocated memory. 1436 * 6) Resize the head arrays of the kmalloc caches to their final sizes. 1437 */ 1438 1439 /* 1) create the kmem_cache */ 1440 1441 /* 1442 * struct kmem_cache size depends on nr_node_ids & nr_cpu_ids 1443 */ 1444 create_boot_cache(kmem_cache, "kmem_cache", 1445 offsetof(struct kmem_cache, node) + 1446 nr_node_ids * sizeof(struct kmem_cache_node *), 1447 SLAB_HWCACHE_ALIGN); 1448 list_add(&kmem_cache->list, &slab_caches); 1449 slab_state = PARTIAL; 1450 1451 /* 1452 * Initialize the caches that provide memory for the kmem_cache_node 1453 * structures first. Without this, further allocations will bug. 1454 */ 1455 kmalloc_caches[INDEX_NODE] = create_kmalloc_cache("kmalloc-node", 1456 kmalloc_size(INDEX_NODE), ARCH_KMALLOC_FLAGS); 1457 slab_state = PARTIAL_NODE; 1458 setup_kmalloc_cache_index_table(); 1459 1460 slab_early_init = 0; 1461 1462 /* 5) Replace the bootstrap kmem_cache_node */ 1463 { 1464 int nid; 1465 1466 for_each_online_node(nid) { 1467 init_list(kmem_cache, &init_kmem_cache_node[CACHE_CACHE + nid], nid); 1468 1469 init_list(kmalloc_caches[INDEX_NODE], 1470 &init_kmem_cache_node[SIZE_NODE + nid], nid); 1471 } 1472 } 1473 1474 create_kmalloc_caches(ARCH_KMALLOC_FLAGS); 1475 } 1476 1477 void __init kmem_cache_init_late(void) 1478 { 1479 struct kmem_cache *cachep; 1480 1481 slab_state = UP; 1482 1483 /* 6) resize the head arrays to their final sizes */ 1484 mutex_lock(&slab_mutex); 1485 list_for_each_entry(cachep, &slab_caches, list) 1486 if (enable_cpucache(cachep, GFP_NOWAIT)) 1487 BUG(); 1488 mutex_unlock(&slab_mutex); 1489 1490 /* Done! 
*/ 1491 slab_state = FULL; 1492 1493 /* 1494 * Register a cpu startup notifier callback that initializes 1495 * cpu_cache_get for all new cpus 1496 */ 1497 register_cpu_notifier(&cpucache_notifier); 1498 1499 #ifdef CONFIG_NUMA 1500 /* 1501 * Register a memory hotplug callback that initializes and frees 1502 * node. 1503 */ 1504 hotplug_memory_notifier(slab_memory_callback, SLAB_CALLBACK_PRI); 1505 #endif 1506 1507 /* 1508 * The reap timers are started later, with a module init call: That part 1509 * of the kernel is not yet operational. 1510 */ 1511 } 1512 1513 static int __init cpucache_init(void) 1514 { 1515 int cpu; 1516 1517 /* 1518 * Register the timers that return unneeded pages to the page allocator 1519 */ 1520 for_each_online_cpu(cpu) 1521 start_cpu_timer(cpu); 1522 1523 /* Done! */ 1524 slab_state = FULL; 1525 return 0; 1526 } 1527 __initcall(cpucache_init); 1528 1529 static noinline void 1530 slab_out_of_memory(struct kmem_cache *cachep, gfp_t gfpflags, int nodeid) 1531 { 1532 #if DEBUG 1533 struct kmem_cache_node *n; 1534 struct page *page; 1535 unsigned long flags; 1536 int node; 1537 static DEFINE_RATELIMIT_STATE(slab_oom_rs, DEFAULT_RATELIMIT_INTERVAL, 1538 DEFAULT_RATELIMIT_BURST); 1539 1540 if ((gfpflags & __GFP_NOWARN) || !__ratelimit(&slab_oom_rs)) 1541 return; 1542 1543 printk(KERN_WARNING 1544 "SLAB: Unable to allocate memory on node %d (gfp=0x%x)\n", 1545 nodeid, gfpflags); 1546 printk(KERN_WARNING " cache: %s, object size: %d, order: %d\n", 1547 cachep->name, cachep->size, cachep->gfporder); 1548 1549 for_each_kmem_cache_node(cachep, node, n) { 1550 unsigned long active_objs = 0, num_objs = 0, free_objects = 0; 1551 unsigned long active_slabs = 0, num_slabs = 0; 1552 1553 spin_lock_irqsave(&n->list_lock, flags); 1554 list_for_each_entry(page, &n->slabs_full, lru) { 1555 active_objs += cachep->num; 1556 active_slabs++; 1557 } 1558 list_for_each_entry(page, &n->slabs_partial, lru) { 1559 active_objs += page->active; 1560 active_slabs++; 1561 } 
1562 list_for_each_entry(page, &n->slabs_free, lru) 1563 num_slabs++; 1564 1565 free_objects += n->free_objects; 1566 spin_unlock_irqrestore(&n->list_lock, flags); 1567 1568 num_slabs += active_slabs; 1569 num_objs = num_slabs * cachep->num; 1570 printk(KERN_WARNING 1571 " node %d: slabs: %ld/%ld, objs: %ld/%ld, free: %ld\n", 1572 node, active_slabs, num_slabs, active_objs, num_objs, 1573 free_objects); 1574 } 1575 #endif 1576 } 1577 1578 /* 1579 * Interface to system's page allocator. No need to hold the 1580 * kmem_cache_node ->list_lock. 1581 * 1582 * If we requested dmaable memory, we will get it. Even if we 1583 * did not request dmaable memory, we might get it, but that 1584 * would be relatively rare and ignorable. 1585 */ 1586 static struct page *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, 1587 int nodeid) 1588 { 1589 struct page *page; 1590 int nr_pages; 1591 1592 flags |= cachep->allocflags; 1593 if (cachep->flags & SLAB_RECLAIM_ACCOUNT) 1594 flags |= __GFP_RECLAIMABLE; 1595 1596 page = __alloc_pages_node(nodeid, flags | __GFP_NOTRACK, cachep->gfporder); 1597 if (!page) { 1598 slab_out_of_memory(cachep, flags, nodeid); 1599 return NULL; 1600 } 1601 1602 if (memcg_charge_slab(page, flags, cachep->gfporder, cachep)) { 1603 __free_pages(page, cachep->gfporder); 1604 return NULL; 1605 } 1606 1607 /* Record if ALLOC_NO_WATERMARKS was set when allocating the slab */ 1608 if (page_is_pfmemalloc(page)) 1609 pfmemalloc_active = true; 1610 1611 nr_pages = (1 << cachep->gfporder); 1612 if (cachep->flags & SLAB_RECLAIM_ACCOUNT) 1613 add_zone_page_state(page_zone(page), 1614 NR_SLAB_RECLAIMABLE, nr_pages); 1615 else 1616 add_zone_page_state(page_zone(page), 1617 NR_SLAB_UNRECLAIMABLE, nr_pages); 1618 __SetPageSlab(page); 1619 if (page_is_pfmemalloc(page)) 1620 SetPageSlabPfmemalloc(page); 1621 1622 if (kmemcheck_enabled && !(cachep->flags & SLAB_NOTRACK)) { 1623 kmemcheck_alloc_shadow(page, cachep->gfporder, flags, nodeid); 1624 1625 if (cachep->ctor) 1626 
kmemcheck_mark_uninitialized_pages(page, nr_pages); 1627 else 1628 kmemcheck_mark_unallocated_pages(page, nr_pages); 1629 } 1630 1631 return page; 1632 } 1633 1634 /* 1635 * Interface to system's page release. 1636 */ 1637 static void kmem_freepages(struct kmem_cache *cachep, struct page *page) 1638 { 1639 const unsigned long nr_freed = (1 << cachep->gfporder); 1640 1641 kmemcheck_free_shadow(page, cachep->gfporder); 1642 1643 if (cachep->flags & SLAB_RECLAIM_ACCOUNT) 1644 sub_zone_page_state(page_zone(page), 1645 NR_SLAB_RECLAIMABLE, nr_freed); 1646 else 1647 sub_zone_page_state(page_zone(page), 1648 NR_SLAB_UNRECLAIMABLE, nr_freed); 1649 1650 BUG_ON(!PageSlab(page)); 1651 __ClearPageSlabPfmemalloc(page); 1652 __ClearPageSlab(page); 1653 page_mapcount_reset(page); 1654 page->mapping = NULL; 1655 1656 if (current->reclaim_state) 1657 current->reclaim_state->reclaimed_slab += nr_freed; 1658 __free_kmem_pages(page, cachep->gfporder); 1659 } 1660 1661 static void kmem_rcu_free(struct rcu_head *head) 1662 { 1663 struct kmem_cache *cachep; 1664 struct page *page; 1665 1666 page = container_of(head, struct page, rcu_head); 1667 cachep = page->slab_cache; 1668 1669 kmem_freepages(cachep, page); 1670 } 1671 1672 #if DEBUG 1673 1674 #ifdef CONFIG_DEBUG_PAGEALLOC 1675 static void store_stackinfo(struct kmem_cache *cachep, unsigned long *addr, 1676 unsigned long caller) 1677 { 1678 int size = cachep->object_size; 1679 1680 addr = (unsigned long *)&((char *)addr)[obj_offset(cachep)]; 1681 1682 if (size < 5 * sizeof(unsigned long)) 1683 return; 1684 1685 *addr++ = 0x12345678; 1686 *addr++ = caller; 1687 *addr++ = smp_processor_id(); 1688 size -= 3 * sizeof(unsigned long); 1689 { 1690 unsigned long *sptr = &caller; 1691 unsigned long svalue; 1692 1693 while (!kstack_end(sptr)) { 1694 svalue = *sptr++; 1695 if (kernel_text_address(svalue)) { 1696 *addr++ = svalue; 1697 size -= sizeof(unsigned long); 1698 if (size <= sizeof(unsigned long)) 1699 break; 1700 } 1701 } 1702 1703 } 
1704 *addr++ = 0x87654321; 1705 } 1706 #endif 1707 1708 static void poison_obj(struct kmem_cache *cachep, void *addr, unsigned char val) 1709 { 1710 int size = cachep->object_size; 1711 addr = &((char *)addr)[obj_offset(cachep)]; 1712 1713 memset(addr, val, size); 1714 *(unsigned char *)(addr + size - 1) = POISON_END; 1715 } 1716 1717 static void dump_line(char *data, int offset, int limit) 1718 { 1719 int i; 1720 unsigned char error = 0; 1721 int bad_count = 0; 1722 1723 printk(KERN_ERR "%03x: ", offset); 1724 for (i = 0; i < limit; i++) { 1725 if (data[offset + i] != POISON_FREE) { 1726 error = data[offset + i]; 1727 bad_count++; 1728 } 1729 } 1730 print_hex_dump(KERN_CONT, "", 0, 16, 1, 1731 &data[offset], limit, 1); 1732 1733 if (bad_count == 1) { 1734 error ^= POISON_FREE; 1735 if (!(error & (error - 1))) { 1736 printk(KERN_ERR "Single bit error detected. Probably " 1737 "bad RAM.\n"); 1738 #ifdef CONFIG_X86 1739 printk(KERN_ERR "Run memtest86+ or a similar memory " 1740 "test tool.\n"); 1741 #else 1742 printk(KERN_ERR "Run a memory test tool.\n"); 1743 #endif 1744 } 1745 } 1746 } 1747 #endif 1748 1749 #if DEBUG 1750 1751 static void print_objinfo(struct kmem_cache *cachep, void *objp, int lines) 1752 { 1753 int i, size; 1754 char *realobj; 1755 1756 if (cachep->flags & SLAB_RED_ZONE) { 1757 printk(KERN_ERR "Redzone: 0x%llx/0x%llx.\n", 1758 *dbg_redzone1(cachep, objp), 1759 *dbg_redzone2(cachep, objp)); 1760 } 1761 1762 if (cachep->flags & SLAB_STORE_USER) { 1763 printk(KERN_ERR "Last user: [<%p>](%pSR)\n", 1764 *dbg_userword(cachep, objp), 1765 *dbg_userword(cachep, objp)); 1766 } 1767 realobj = (char *)objp + obj_offset(cachep); 1768 size = cachep->object_size; 1769 for (i = 0; i < size && lines; i += 16, lines--) { 1770 int limit; 1771 limit = 16; 1772 if (i + limit > size) 1773 limit = size - i; 1774 dump_line(realobj, i, limit); 1775 } 1776 } 1777 1778 static void check_poison_obj(struct kmem_cache *cachep, void *objp) 1779 { 1780 char *realobj; 1781 int 
size, i; 1782 int lines = 0; 1783 1784 realobj = (char *)objp + obj_offset(cachep); 1785 size = cachep->object_size; 1786 1787 for (i = 0; i < size; i++) { 1788 char exp = POISON_FREE; 1789 if (i == size - 1) 1790 exp = POISON_END; 1791 if (realobj[i] != exp) { 1792 int limit; 1793 /* Mismatch ! */ 1794 /* Print header */ 1795 if (lines == 0) { 1796 printk(KERN_ERR 1797 "Slab corruption (%s): %s start=%p, len=%d\n", 1798 print_tainted(), cachep->name, realobj, size); 1799 print_objinfo(cachep, objp, 0); 1800 } 1801 /* Hexdump the affected line */ 1802 i = (i / 16) * 16; 1803 limit = 16; 1804 if (i + limit > size) 1805 limit = size - i; 1806 dump_line(realobj, i, limit); 1807 i += 16; 1808 lines++; 1809 /* Limit to 5 lines */ 1810 if (lines > 5) 1811 break; 1812 } 1813 } 1814 if (lines != 0) { 1815 /* Print some data about the neighboring objects, if they 1816 * exist: 1817 */ 1818 struct page *page = virt_to_head_page(objp); 1819 unsigned int objnr; 1820 1821 objnr = obj_to_index(cachep, page, objp); 1822 if (objnr) { 1823 objp = index_to_obj(cachep, page, objnr - 1); 1824 realobj = (char *)objp + obj_offset(cachep); 1825 printk(KERN_ERR "Prev obj: start=%p, len=%d\n", 1826 realobj, size); 1827 print_objinfo(cachep, objp, 2); 1828 } 1829 if (objnr + 1 < cachep->num) { 1830 objp = index_to_obj(cachep, page, objnr + 1); 1831 realobj = (char *)objp + obj_offset(cachep); 1832 printk(KERN_ERR "Next obj: start=%p, len=%d\n", 1833 realobj, size); 1834 print_objinfo(cachep, objp, 2); 1835 } 1836 } 1837 } 1838 #endif 1839 1840 #if DEBUG 1841 static void slab_destroy_debugcheck(struct kmem_cache *cachep, 1842 struct page *page) 1843 { 1844 int i; 1845 for (i = 0; i < cachep->num; i++) { 1846 void *objp = index_to_obj(cachep, page, i); 1847 1848 if (cachep->flags & SLAB_POISON) { 1849 #ifdef CONFIG_DEBUG_PAGEALLOC 1850 if (cachep->size % PAGE_SIZE == 0 && 1851 OFF_SLAB(cachep)) 1852 kernel_map_pages(virt_to_page(objp), 1853 cachep->size / PAGE_SIZE, 1); 1854 else 1855 
check_poison_obj(cachep, objp); 1856 #else 1857 check_poison_obj(cachep, objp); 1858 #endif 1859 } 1860 if (cachep->flags & SLAB_RED_ZONE) { 1861 if (*dbg_redzone1(cachep, objp) != RED_INACTIVE) 1862 slab_error(cachep, "start of a freed object " 1863 "was overwritten"); 1864 if (*dbg_redzone2(cachep, objp) != RED_INACTIVE) 1865 slab_error(cachep, "end of a freed object " 1866 "was overwritten"); 1867 } 1868 } 1869 } 1870 #else 1871 static void slab_destroy_debugcheck(struct kmem_cache *cachep, 1872 struct page *page) 1873 { 1874 } 1875 #endif 1876 1877 /** 1878 * slab_destroy - destroy and release all objects in a slab 1879 * @cachep: cache pointer being destroyed 1880 * @page: page pointer being destroyed 1881 * 1882 * Destroy all the objs in a slab page, and release the mem back to the system. 1883 * Before calling the slab page must have been unlinked from the cache. The 1884 * kmem_cache_node ->list_lock is not held/needed. 1885 */ 1886 static void slab_destroy(struct kmem_cache *cachep, struct page *page) 1887 { 1888 void *freelist; 1889 1890 freelist = page->freelist; 1891 slab_destroy_debugcheck(cachep, page); 1892 if (unlikely(cachep->flags & SLAB_DESTROY_BY_RCU)) 1893 call_rcu(&page->rcu_head, kmem_rcu_free); 1894 else 1895 kmem_freepages(cachep, page); 1896 1897 /* 1898 * From now on, we don't use freelist 1899 * although actual page can be freed in rcu context 1900 */ 1901 if (OFF_SLAB(cachep)) 1902 kmem_cache_free(cachep->freelist_cache, freelist); 1903 } 1904 1905 static void slabs_destroy(struct kmem_cache *cachep, struct list_head *list) 1906 { 1907 struct page *page, *n; 1908 1909 list_for_each_entry_safe(page, n, list, lru) { 1910 list_del(&page->lru); 1911 slab_destroy(cachep, page); 1912 } 1913 } 1914 1915 /** 1916 * calculate_slab_order - calculate size (page order) of slabs 1917 * @cachep: pointer to the cache that is being created 1918 * @size: size of objects to be created in this cache. 1919 * @align: required alignment for the objects. 
1920 * @flags: slab allocation flags 1921 * 1922 * Also calculates the number of objects per slab. 1923 * 1924 * This could be made much more intelligent. For now, try to avoid using 1925 * high order pages for slabs. When the gfp() functions are more friendly 1926 * towards high-order requests, this should be changed. 1927 */ 1928 static size_t calculate_slab_order(struct kmem_cache *cachep, 1929 size_t size, size_t align, unsigned long flags) 1930 { 1931 unsigned long offslab_limit; 1932 size_t left_over = 0; 1933 int gfporder; 1934 1935 for (gfporder = 0; gfporder <= KMALLOC_MAX_ORDER; gfporder++) { 1936 unsigned int num; 1937 size_t remainder; 1938 1939 cache_estimate(gfporder, size, align, flags, &remainder, &num); 1940 if (!num) 1941 continue; 1942 1943 /* Can't handle number of objects more than SLAB_OBJ_MAX_NUM */ 1944 if (num > SLAB_OBJ_MAX_NUM) 1945 break; 1946 1947 if (flags & CFLGS_OFF_SLAB) { 1948 size_t freelist_size_per_obj = sizeof(freelist_idx_t); 1949 /* 1950 * Max number of objs-per-slab for caches which 1951 * use off-slab slabs. Needed to avoid a possible 1952 * looping condition in cache_grow(). 1953 */ 1954 if (IS_ENABLED(CONFIG_DEBUG_SLAB_LEAK)) 1955 freelist_size_per_obj += sizeof(char); 1956 offslab_limit = size; 1957 offslab_limit /= freelist_size_per_obj; 1958 1959 if (num > offslab_limit) 1960 break; 1961 } 1962 1963 /* Found something acceptable - save it away */ 1964 cachep->num = num; 1965 cachep->gfporder = gfporder; 1966 left_over = remainder; 1967 1968 /* 1969 * A VFS-reclaimable slab tends to have most allocations 1970 * as GFP_NOFS and we really don't want to have to be allocating 1971 * higher-order pages when we are unable to shrink dcache. 1972 */ 1973 if (flags & SLAB_RECLAIM_ACCOUNT) 1974 break; 1975 1976 /* 1977 * Large number of objects is good, but very large slabs are 1978 * currently bad for the gfp()s. 1979 */ 1980 if (gfporder >= slab_max_order) 1981 break; 1982 1983 /* 1984 * Acceptable internal fragmentation? 
1985 */ 1986 if (left_over * 8 <= (PAGE_SIZE << gfporder)) 1987 break; 1988 } 1989 return left_over; 1990 } 1991 1992 static struct array_cache __percpu *alloc_kmem_cache_cpus( 1993 struct kmem_cache *cachep, int entries, int batchcount) 1994 { 1995 int cpu; 1996 size_t size; 1997 struct array_cache __percpu *cpu_cache; 1998 1999 size = sizeof(void *) * entries + sizeof(struct array_cache); 2000 cpu_cache = __alloc_percpu(size, sizeof(void *)); 2001 2002 if (!cpu_cache) 2003 return NULL; 2004 2005 for_each_possible_cpu(cpu) { 2006 init_arraycache(per_cpu_ptr(cpu_cache, cpu), 2007 entries, batchcount); 2008 } 2009 2010 return cpu_cache; 2011 } 2012 2013 static int __init_refok setup_cpu_cache(struct kmem_cache *cachep, gfp_t gfp) 2014 { 2015 if (slab_state >= FULL) 2016 return enable_cpucache(cachep, gfp); 2017 2018 cachep->cpu_cache = alloc_kmem_cache_cpus(cachep, 1, 1); 2019 if (!cachep->cpu_cache) 2020 return 1; 2021 2022 if (slab_state == DOWN) { 2023 /* Creation of first cache (kmem_cache). 
*/ 2024 set_up_node(kmem_cache, CACHE_CACHE); 2025 } else if (slab_state == PARTIAL) { 2026 /* For kmem_cache_node */ 2027 set_up_node(cachep, SIZE_NODE); 2028 } else { 2029 int node; 2030 2031 for_each_online_node(node) { 2032 cachep->node[node] = kmalloc_node( 2033 sizeof(struct kmem_cache_node), gfp, node); 2034 BUG_ON(!cachep->node[node]); 2035 kmem_cache_node_init(cachep->node[node]); 2036 } 2037 } 2038 2039 cachep->node[numa_mem_id()]->next_reap = 2040 jiffies + REAPTIMEOUT_NODE + 2041 ((unsigned long)cachep) % REAPTIMEOUT_NODE; 2042 2043 cpu_cache_get(cachep)->avail = 0; 2044 cpu_cache_get(cachep)->limit = BOOT_CPUCACHE_ENTRIES; 2045 cpu_cache_get(cachep)->batchcount = 1; 2046 cpu_cache_get(cachep)->touched = 0; 2047 cachep->batchcount = 1; 2048 cachep->limit = BOOT_CPUCACHE_ENTRIES; 2049 return 0; 2050 } 2051 2052 unsigned long kmem_cache_flags(unsigned long object_size, 2053 unsigned long flags, const char *name, 2054 void (*ctor)(void *)) 2055 { 2056 return flags; 2057 } 2058 2059 struct kmem_cache * 2060 __kmem_cache_alias(const char *name, size_t size, size_t align, 2061 unsigned long flags, void (*ctor)(void *)) 2062 { 2063 struct kmem_cache *cachep; 2064 2065 cachep = find_mergeable(size, align, flags, name, ctor); 2066 if (cachep) { 2067 cachep->refcount++; 2068 2069 /* 2070 * Adjust the object sizes so that we clear 2071 * the complete object on kzalloc. 2072 */ 2073 cachep->object_size = max_t(int, cachep->object_size, size); 2074 } 2075 return cachep; 2076 } 2077 2078 /** 2079 * __kmem_cache_create - Create a cache. 2080 * @cachep: cache management descriptor 2081 * @flags: SLAB flags 2082 * 2083 * Returns a ptr to the cache on success, NULL on failure. 2084 * Cannot be called within a int, but can be interrupted. 2085 * The @ctor is run when new pages are allocated by the cache. 2086 * 2087 * The flags are 2088 * 2089 * %SLAB_POISON - Poison the slab with a known test pattern (a5a5a5a5) 2090 * to catch references to uninitialised memory. 
2091 * 2092 * %SLAB_RED_ZONE - Insert `Red' zones around the allocated memory to check 2093 * for buffer overruns. 2094 * 2095 * %SLAB_HWCACHE_ALIGN - Align the objects in this cache to a hardware 2096 * cacheline. This can be beneficial if you're counting cycles as closely 2097 * as davem. 2098 */ 2099 int 2100 __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags) 2101 { 2102 size_t left_over, freelist_size; 2103 size_t ralign = BYTES_PER_WORD; 2104 gfp_t gfp; 2105 int err; 2106 size_t size = cachep->size; 2107 2108 #if DEBUG 2109 #if FORCED_DEBUG 2110 /* 2111 * Enable redzoning and last user accounting, except for caches with 2112 * large objects, if the increased size would increase the object size 2113 * above the next power of two: caches with object sizes just above a 2114 * power of two have a significant amount of internal fragmentation. 2115 */ 2116 if (size < 4096 || fls(size - 1) == fls(size-1 + REDZONE_ALIGN + 2117 2 * sizeof(unsigned long long))) 2118 flags |= SLAB_RED_ZONE | SLAB_STORE_USER; 2119 if (!(flags & SLAB_DESTROY_BY_RCU)) 2120 flags |= SLAB_POISON; 2121 #endif 2122 if (flags & SLAB_DESTROY_BY_RCU) 2123 BUG_ON(flags & SLAB_POISON); 2124 #endif 2125 2126 /* 2127 * Check that size is in terms of words. This is needed to avoid 2128 * unaligned accesses for some archs when redzoning is used, and makes 2129 * sure any on-slab bufctl's are also correctly aligned. 2130 */ 2131 if (size & (BYTES_PER_WORD - 1)) { 2132 size += (BYTES_PER_WORD - 1); 2133 size &= ~(BYTES_PER_WORD - 1); 2134 } 2135 2136 if (flags & SLAB_RED_ZONE) { 2137 ralign = REDZONE_ALIGN; 2138 /* If redzoning, ensure that the second redzone is suitably 2139 * aligned, by adjusting the object size accordingly. 
*/ 2140 size += REDZONE_ALIGN - 1; 2141 size &= ~(REDZONE_ALIGN - 1); 2142 } 2143 2144 /* 3) caller mandated alignment */ 2145 if (ralign < cachep->align) { 2146 ralign = cachep->align; 2147 } 2148 /* disable debug if necessary */ 2149 if (ralign > __alignof__(unsigned long long)) 2150 flags &= ~(SLAB_RED_ZONE | SLAB_STORE_USER); 2151 /* 2152 * 4) Store it. 2153 */ 2154 cachep->align = ralign; 2155 /* gfp is only consumed by the setup_cpu_cache() call at the end of this function: GFP_KERNEL once slab_is_available(), GFP_NOWAIT before that */ 2156 if (slab_is_available()) 2157 gfp = GFP_KERNEL; 2158 else 2159 gfp = GFP_NOWAIT; 2160 2161 #if DEBUG 2162 2163 /* 2164 * Both debugging options require word-alignment which is calculated 2165 * into align above. 2166 */ 2167 if (flags & SLAB_RED_ZONE) { 2168 /* add space for red zone words */ 2169 cachep->obj_offset += sizeof(unsigned long long); 2170 size += 2 * sizeof(unsigned long long); 2171 } 2172 if (flags & SLAB_STORE_USER) { 2173 /* user store requires one word storage behind the end of 2174 * the real object. But if the second red zone needs to be 2175 * aligned to 64 bits, we must allow that much space. 2176 */ 2177 if (flags & SLAB_RED_ZONE) 2178 size += REDZONE_ALIGN; 2179 else 2180 size += BYTES_PER_WORD; 2181 } 2182 #if FORCED_DEBUG && defined(CONFIG_DEBUG_PAGEALLOC) 2183 /* 2184 * To activate debug pagealloc, off-slab management is necessary 2185 * requirement. In early phase of initialization, small sized slab 2186 * doesn't get initialized so it would not be possible. So, we need 2187 * to check size >= 256. It guarantees that all necessary small 2188 * sized slab is initialized in current slab initialization sequence. 2189 */ 2190 if (!slab_early_init && size >= kmalloc_size(INDEX_NODE) && 2191 size >= 256 && cachep->object_size > cache_line_size() && 2192 ALIGN(size, cachep->align) < PAGE_SIZE) { /* pad the object out to a whole page; the padding is added to obj_offset, i.e. in front of the object */ 2193 cachep->obj_offset += PAGE_SIZE - ALIGN(size, cachep->align); 2194 size = PAGE_SIZE; 2195 } 2196 #endif 2197 #endif 2198 2199 /* 2200 * Determine if the slab management is 'on' or 'off' slab. 
2201 * (bootstrapping cannot cope with offslab caches so don't do 2202 * it too early on. Always use on-slab management when 2203 * SLAB_NOLEAKTRACE to avoid recursive calls into kmemleak) 2204 */ 2205 if (size >= OFF_SLAB_MIN_SIZE && !slab_early_init && 2206 !(flags & SLAB_NOLEAKTRACE)) 2207 /* 2208 * Size is large, assume best to place the slab management obj 2209 * off-slab (should allow better packing of objs). 2210 */ 2211 flags |= CFLGS_OFF_SLAB; 2212 2213 size = ALIGN(size, cachep->align); 2214 /* 2215 * We should restrict the number of objects in a slab to implement 2216 * byte sized index. Refer comment on SLAB_OBJ_MIN_SIZE definition. 2217 */ 2218 if (FREELIST_BYTE_INDEX && size < SLAB_OBJ_MIN_SIZE) 2219 size = ALIGN(SLAB_OBJ_MIN_SIZE, cachep->align); 2220 2221 left_over = calculate_slab_order(cachep, size, cachep->align, flags); 2222 /* cachep->num still zero after calculate_slab_order(): give up with -E2BIG */ 2223 if (!cachep->num) 2224 return -E2BIG; 2225 2226 freelist_size = calculate_freelist_size(cachep->num, cachep->align); 2227 2228 /* 2229 * If the slab has been placed off-slab, and we have enough space then 2230 * move it on-slab. This is at the expense of any extra colouring. 2231 */ 2232 if (flags & CFLGS_OFF_SLAB && left_over >= freelist_size) { 2233 flags &= ~CFLGS_OFF_SLAB; 2234 left_over -= freelist_size; 2235 } 2236 2237 if (flags & CFLGS_OFF_SLAB) { 2238 /* really off slab. No need for manual alignment */ 2239 freelist_size = calculate_freelist_size(cachep->num, 0); 2240 2241 #ifdef CONFIG_PAGE_POISONING 2242 /* If we're going to use the generic kernel_map_pages() 2243 * poisoning, then it's going to smash the contents of 2244 * the redzone and userword anyhow, so switch them off. 2245 */ 2246 if (size % PAGE_SIZE == 0 && flags & SLAB_POISON) 2247 flags &= ~(SLAB_RED_ZONE | SLAB_STORE_USER); 2248 #endif 2249 } 2250 /* colouring step: one cache line, raised to the object alignment just below if that is larger */ 2251 cachep->colour_off = cache_line_size(); 2252 /* Offset must be a multiple of the alignment. 
*/ 2253 if (cachep->colour_off < cachep->align) 2254 cachep->colour_off = cachep->align; /* number of colour_off-sized steps that fit into the left-over space */ 2255 cachep->colour = left_over / cachep->colour_off; 2256 cachep->freelist_size = freelist_size; 2257 cachep->flags = flags; 2258 cachep->allocflags = __GFP_COMP; 2259 if (CONFIG_ZONE_DMA_FLAG && (flags & SLAB_CACHE_DMA)) 2260 cachep->allocflags |= GFP_DMA; 2261 cachep->size = size; 2262 cachep->reciprocal_buffer_size = reciprocal_value(size); 2263 2264 if (flags & CFLGS_OFF_SLAB) { /* off-slab freelists are allocated from the kmalloc cache that fits freelist_size */ 2265 cachep->freelist_cache = kmalloc_slab(freelist_size, 0u); 2266 /* 2267 * This is a possibility for one of the kmalloc_{dma,}_caches. 2268 * But since we go off slab only for object size greater than 2269 * OFF_SLAB_MIN_SIZE, and kmalloc_{dma,}_caches get created 2270 * in ascending order, this should not happen at all. 2271 * But leave a BUG_ON for some lucky dude. 2272 */ 2273 BUG_ON(ZERO_OR_NULL_PTR(cachep->freelist_cache)); 2274 } 2275 2276 err = setup_cpu_cache(cachep, gfp); 2277 if (err) { /* undo through __kmem_cache_shutdown() and hand the error to the caller */ 2278 __kmem_cache_shutdown(cachep); 2279 return err; 2280 } 2281 2282 return 0; 2283 } 2284 /* Assertion helpers. With DEBUG they are real functions; otherwise the #else branch below turns them into empty statements. */ 2285 #if DEBUG /* check_irq_off - BUG unless local interrupts are disabled */ 2286 static void check_irq_off(void) 2287 { 2288 BUG_ON(!irqs_disabled()); 2289 } 2290 /* check_irq_on - BUG if local interrupts are disabled */ 2291 static void check_irq_on(void) 2292 { 2293 BUG_ON(irqs_disabled()); 2294 } 2295 /* check_spinlock_acquired - on CONFIG_SMP, assert that interrupts are off and that the list_lock of the node returned by numa_mem_id() is held; does nothing without CONFIG_SMP */ 2296 static void check_spinlock_acquired(struct kmem_cache *cachep) 2297 { 2298 #ifdef CONFIG_SMP 2299 check_irq_off(); 2300 assert_spin_locked(&get_node(cachep, numa_mem_id())->list_lock); 2301 #endif 2302 } 2303 /* check_spinlock_acquired_node - same check as above, but for the list_lock of the explicitly given @node */ 2304 static void check_spinlock_acquired_node(struct kmem_cache *cachep, int node) 2305 { 2306 #ifdef CONFIG_SMP 2307 check_irq_off(); 2308 assert_spin_locked(&get_node(cachep, node)->list_lock); 2309 #endif 2310 } 2311 2312 #else 2313 #define check_irq_off() do { } while(0) 2314 #define check_irq_on() do { } while(0) 2315 #define check_spinlock_acquired(x) do { } while(0) 2316 #define check_spinlock_acquired_node(x, y) do { } while(0) 2317 #endif 2318 2319 static void drain_array(struct kmem_cache *cachep, struct
kmem_cache_node *n, 2320 struct array_cache *ac, 2321 int force, int node); 2322 /* do_drain - per-CPU callback, run through on_each_cpu() by drain_cpu_caches(). @arg is the kmem_cache. Hands every object in this CPU's array cache to free_block() under the list_lock of the local node (numa_mem_id()), destroys the slabs free_block() queued on the local list, and marks the array empty. Asserts that interrupts are disabled. */ 2323 static void do_drain(void *arg) 2324 { 2325 struct kmem_cache *cachep = arg; 2326 struct array_cache *ac; 2327 int node = numa_mem_id(); 2328 struct kmem_cache_node *n; 2329 LIST_HEAD(list); 2330 2331 check_irq_off(); 2332 ac = cpu_cache_get(cachep); 2333 n = get_node(cachep, node); 2334 spin_lock(&n->list_lock); 2335 free_block(cachep, ac->entry, ac->avail, node, &list); 2336 spin_unlock(&n->list_lock); /* detached slabs are destroyed only after the node lock has been dropped */ 2337 slabs_destroy(cachep, &list); 2338 ac->avail = 0; 2339 } 2340 /* drain_cpu_caches - empty every cached-object array of @cachep: runs do_drain() on each CPU (waiting for completion), then for every node drains the alien caches, if any, and finally the shared array via drain_array() with force set. Asserts that interrupts are enabled after the cross-CPU call. */ 2341 static void drain_cpu_caches(struct kmem_cache *cachep) 2342 { 2343 struct kmem_cache_node *n; 2344 int node; 2345 2346 on_each_cpu(do_drain, cachep, 1); 2347 check_irq_on(); 2348 for_each_kmem_cache_node(cachep, node, n) 2349 if (n->alien) 2350 drain_alien_cache(cachep, n->alien); 2351 2352 for_each_kmem_cache_node(cachep, node, n) 2353 drain_array(cachep, n, n->shared, 1, node); 2354 } 2355 2356 /* 2357 * Remove slabs from the list of free slabs. 2358 * Specify the number of slabs to drain in tofree. 2359 * 2360 * Returns the actual number of slabs released. 2361 */ 2362 static int drain_freelist(struct kmem_cache *cache, 2363 struct kmem_cache_node *n, int tofree) 2364 { 2365 struct list_head *p; 2366 int nr_freed; 2367 struct page *page; 2368 2369 nr_freed = 0; /* the list_empty() test in the loop condition runs without the lock; the list is re-examined under list_lock below */ 2370 while (nr_freed < tofree && !list_empty(&n->slabs_free)) { 2371 2372 spin_lock_irq(&n->list_lock); /* take the slab at the tail of slabs_free */ 2373 p = n->slabs_free.prev; 2374 if (p == &n->slabs_free) { /* list became empty in the meantime */ 2375 spin_unlock_irq(&n->list_lock); 2376 goto out; 2377 } 2378 2379 page = list_entry(p, struct page, lru); 2380 #if DEBUG 2381 BUG_ON(page->active); 2382 #endif 2383 list_del(&page->lru); 2384 /* 2385 * Safe to drop the lock. The slab is no longer linked 2386 * to the cache. 
2387 */ 2388 n->free_objects -= cache->num; 2389 spin_unlock_irq(&n->list_lock); 2390 slab_destroy(cache, page); 2391 nr_freed++; 2392 } 2393 out: 2394 return nr_freed; 2395 } 2396 /* __kmem_cache_shrink - release as much memory as possible from @cachep. Drains the per-cpu, alien and shared arrays, then frees the free slabs of every node (count taken from slabs_tofree()). Returns 1 if any node still has slabs on its full or partial list, 0 otherwise. @deactivate is not used by this implementation. */ 2397 int __kmem_cache_shrink(struct kmem_cache *cachep, bool deactivate) 2398 { 2399 int ret = 0; 2400 int node; 2401 struct kmem_cache_node *n; 2402 2403 drain_cpu_caches(cachep); 2404 2405 check_irq_on(); 2406 for_each_kmem_cache_node(cachep, node, n) { 2407 drain_freelist(cachep, n, slabs_tofree(cachep, n)); 2408 /* count nodes that still hold objects in use */ 2409 ret += !list_empty(&n->slabs_full) || 2410 !list_empty(&n->slabs_partial); 2411 } 2412 return (ret ? 1 : 0); 2413 } 2414 /* __kmem_cache_shutdown - tear down @cachep. First shrinks the cache; if slabs are still in use the nonzero result of __kmem_cache_shrink() is returned and nothing is freed. Otherwise frees the per-cpu array and, for every node, the shared array, the alien caches and the kmem_cache_node itself, clearing the node pointer. Returns 0 on success. */ 2415 int __kmem_cache_shutdown(struct kmem_cache *cachep) 2416 { 2417 int i; 2418 struct kmem_cache_node *n; 2419 int rc = __kmem_cache_shrink(cachep, false); 2420 2421 if (rc) 2422 return rc; 2423 2424 free_percpu(cachep->cpu_cache); 2425 2426 /* NUMA: free the node structures */ 2427 for_each_kmem_cache_node(cachep, i, n) { 2428 kfree(n->shared); 2429 free_alien_cache(n->alien); 2430 kfree(n); 2431 cachep->node[i] = NULL; 2432 } 2433 return 0; 2434 } 2435 2436 /* 2437 * Get the memory for a slab management obj. 2438 * 2439 * For a slab cache when the slab descriptor is off-slab, the 2440 * slab descriptor can't come from the same cache which is being created, 2441 * Because if it is the case, that means we defer the creation of 2442 * the kmalloc_{dma,}_cache of size sizeof(slab descriptor) to this point. 2443 * And we eventually call down to __kmem_cache_create(), which 2444 * in turn looks up in the kmalloc_{dma,}_caches for the disired-size one. 2445 * This is a "chicken-and-egg" problem. 2446 * 2447 * So the off-slab slab descriptor shall come from the kmalloc_{dma,}_caches, 2448 * which are all initialized during kmem_cache_init(). 
2449 */ 2450 static void *alloc_slabmgmt(struct kmem_cache *cachep, 2451 struct page *page, int colour_off, 2452 gfp_t local_flags, int nodeid) 2453 { 2454 void *freelist; 2455 void *addr = page_address(page); 2456 2457 if (OFF_SLAB(cachep)) { 2458 /* Slab management obj is off-slab. */ 2459 freelist = kmem_cache_alloc_node(cachep->freelist_cache, 2460 local_flags, nodeid); 2461 if (!freelist) 2462 return NULL; 2463 } else { /* on-slab: the freelist sits at the colour offset and the object area starts right behind it */ 2464 freelist = addr + colour_off; 2465 colour_off += cachep->freelist_size; 2466 } 2467 page->active = 0; /* s_mem: page address plus the colour offset (already advanced past an on-slab freelist) */ 2468 page->s_mem = addr + colour_off; 2469 return freelist; 2470 } 2471 /* get_free_obj - read entry @idx of the page's freelist, treating page->freelist as an array of freelist_idx_t */ 2472 static inline freelist_idx_t get_free_obj(struct page *page, unsigned int idx) 2473 { 2474 return ((freelist_idx_t *)page->freelist)[idx]; 2475 } 2476 /* set_free_obj - store @val into entry @idx of the page's freelist array */ 2477 static inline void set_free_obj(struct page *page, 2478 unsigned int idx, freelist_idx_t val) 2479 { 2480 ((freelist_idx_t *)(page->freelist))[idx] = val; 2481 } 2482 /* cache_init_objs - initialise all cachep->num objects of a new slab @page. For each object: under DEBUG set up poison, user word and red zones and verify that the constructor did not touch the red zones; run the constructor; mark the object OBJECT_FREE and set freelist entry i to i. */ 2483 static void cache_init_objs(struct kmem_cache *cachep, 2484 struct page *page) 2485 { 2486 int i; 2487 2488 for (i = 0; i < cachep->num; i++) { 2489 void *objp = index_to_obj(cachep, page, i); 2490 #if DEBUG 2491 /* need to poison the objs? */ 2492 if (cachep->flags & SLAB_POISON) 2493 poison_obj(cachep, objp, POISON_FREE); 2494 if (cachep->flags & SLAB_STORE_USER) 2495 *dbg_userword(cachep, objp) = NULL; 2496 2497 if (cachep->flags & SLAB_RED_ZONE) { 2498 *dbg_redzone1(cachep, objp) = RED_INACTIVE; 2499 *dbg_redzone2(cachep, objp) = RED_INACTIVE; 2500 } 2501 /* 2502 * Constructors are not allowed to allocate memory from the same 2503 * cache which they are a constructor for. Otherwise, deadlock. 2504 * They must also be threaded. 
2505 */ /* for poisoned caches the constructor is not run here; cache_alloc_debugcheck_after() calls it at allocation time instead */ 2506 if (cachep->ctor && !(cachep->flags & SLAB_POISON)) 2507 cachep->ctor(objp + obj_offset(cachep)); 2508 2509 if (cachep->flags & SLAB_RED_ZONE) { 2510 if (*dbg_redzone2(cachep, objp) != RED_INACTIVE) 2511 slab_error(cachep, "constructor overwrote the" 2512 " end of an object"); 2513 if (*dbg_redzone1(cachep, objp) != RED_INACTIVE) 2514 slab_error(cachep, "constructor overwrote the" 2515 " start of an object"); 2516 } 2517 if ((cachep->size % PAGE_SIZE) == 0 && 2518 OFF_SLAB(cachep) && cachep->flags & SLAB_POISON) 2519 kernel_map_pages(virt_to_page(objp), 2520 cachep->size / PAGE_SIZE, 0); 2521 #else 2522 if (cachep->ctor) 2523 cachep->ctor(objp); 2524 #endif 2525 set_obj_status(page, i, OBJECT_FREE); /* initial freelist is the identity mapping */ 2526 set_free_obj(page, i, i); 2527 } 2528 } 2529 /* kmem_flagcheck - when CONFIG_ZONE_DMA_FLAG is nonzero, BUG unless the GFP_DMA bit of @flags agrees with the GFP_DMA bit of the cache's allocflags */ 2530 static void kmem_flagcheck(struct kmem_cache *cachep, gfp_t flags) 2531 { 2532 if (CONFIG_ZONE_DMA_FLAG) { 2533 if (flags & GFP_DMA) 2534 BUG_ON(!(cachep->allocflags & GFP_DMA)); 2535 else 2536 BUG_ON(cachep->allocflags & GFP_DMA); 2537 } 2538 } 2539 /* slab_get_obj - take one free object from slab @page. Freelist entry number page->active holds the index of the object to hand out; page->active is incremented afterwards. Under DEBUG, warns if the object's page does not belong to @nodeid. Returns the object address. */ 2540 static void *slab_get_obj(struct kmem_cache *cachep, struct page *page, 2541 int nodeid) 2542 { 2543 void *objp; 2544 2545 objp = index_to_obj(cachep, page, get_free_obj(page, page->active)); 2546 page->active++; 2547 #if DEBUG 2548 WARN_ON(page_to_nid(virt_to_page(objp)) != nodeid); 2549 #endif 2550 2551 return objp; 2552 } 2553 /* slab_put_obj - give @objp back to slab @page: page->active is decremented and the object's index is stored in that freelist entry. Under DEBUG, warns on a node mismatch and BUGs when the index is already present among the free entries (page->active up to cachep->num - 1), i.e. a double free. */ 2554 static void slab_put_obj(struct kmem_cache *cachep, struct page *page, 2555 void *objp, int nodeid) 2556 { 2557 unsigned int objnr = obj_to_index(cachep, page, objp); 2558 #if DEBUG 2559 unsigned int i; 2560 2561 /* Verify that the slab belongs to the intended node */ 2562 WARN_ON(page_to_nid(virt_to_page(objp)) != nodeid); 2563 2564 /* Verify double free bug */ 2565 for (i = page->active; i < cachep->num; i++) { 2566 if (get_free_obj(page, i) == objnr) { 2567 printk(KERN_ERR "slab: double free detected in cache " 2568 "'%s', objp %p\n", cachep->name, objp); 2569 BUG(); 2570 } 2571 } 2572 #endif 2573 page->active--; 2574
set_free_obj(page, page->active, objnr); 2575 } 2576 2577 /* 2578 * Map pages beginning at addr to the given cache and slab. This is required 2579 * for the slab allocator to be able to lookup the cache and slab of a 2580 * virtual address for kfree, ksize, and slab debugging. 2581 * 2581 * In this implementation that means recording @cache and @freelist in the struct page. 2581 */ 2582 static void slab_map_pages(struct kmem_cache *cache, struct page *page, 2583 void *freelist) 2584 { 2585 page->slab_cache = cache; 2586 page->freelist = freelist; 2587 } 2588 2589 /* 2590 * Grow (by 1) the number of slabs within a cache. This is called by 2591 * kmem_cache_alloc() when there are no active objs left in a cache. 2592 * 2592 * @page: page to build the slab in, or NULL to allocate one with kmem_getpages(). 2592 * Called with interrupts disabled; they are enabled around the allocation and 2592 * object initialisation when the gfp flags allow blocking, and disabled again before returning. 2592 * Returns 1 when a slab was added to the node's slabs_free list, 0 on failure. 2592 */ 2593 static int cache_grow(struct kmem_cache *cachep, 2594 gfp_t flags, int nodeid, struct page *page) 2595 { 2596 void *freelist; 2597 size_t offset; 2598 gfp_t local_flags; 2599 struct kmem_cache_node *n; 2600 2601 /* 2602 * Be lazy and only check for valid flags here, keeping it out of the 2603 * critical path in kmem_cache_alloc(). 2604 */ 2605 if (unlikely(flags & GFP_SLAB_BUG_MASK)) { 2606 pr_emerg("gfp: %u\n", flags & GFP_SLAB_BUG_MASK); 2607 BUG(); 2608 } 2609 local_flags = flags & (GFP_CONSTRAINT_MASK|GFP_RECLAIM_MASK); 2610 2611 /* Take the node list lock to change the colour_next on this node */ 2612 check_irq_off(); 2613 n = get_node(cachep, nodeid); 2614 spin_lock(&n->list_lock); 2615 2616 /* Get colour for the slab, and calculate the next value. */ 2617 offset = n->colour_next; 2618 n->colour_next++; 2619 if (n->colour_next >= cachep->colour) 2620 n->colour_next = 0; 2621 spin_unlock(&n->list_lock); 2622 /* turn the colour index into a byte offset */ 2623 offset *= cachep->colour_off; 2624 2625 if (gfpflags_allow_blocking(local_flags)) 2626 local_irq_enable(); 2627 2628 /* 2629 * The test for missing atomic flag is performed here, rather than 2630 * the more obvious place, simply to reduce the critical path length 2631 * in kmem_cache_alloc(). If a caller is seriously mis-behaving they 2632 * will eventually be caught here (where it matters). 
2633 */ 2634 kmem_flagcheck(cachep, flags); 2635 2636 /* 2637 * Get mem for the objs. Attempt to allocate a physical page from 2638 * 'nodeid'. 2639 */ 2640 if (!page) 2641 page = kmem_getpages(cachep, local_flags, nodeid); 2642 if (!page) 2643 goto failed; 2644 2645 /* Get slab management. */ 2646 freelist = alloc_slabmgmt(cachep, page, offset, 2647 local_flags & ~GFP_CONSTRAINT_MASK, nodeid); 2648 if (!freelist) 2649 goto opps1; 2650 2651 slab_map_pages(cachep, page, freelist); 2652 2653 cache_init_objs(cachep, page); 2654 /* back to interrupts-off before the node lists are touched */ 2655 if (gfpflags_allow_blocking(local_flags)) 2656 local_irq_disable(); 2657 check_irq_off(); 2658 spin_lock(&n->list_lock); 2659 2660 /* Make slab active. */ 2661 list_add_tail(&page->lru, &(n->slabs_free)); 2662 STATS_INC_GROWN(cachep); 2663 n->free_objects += cachep->num; 2664 spin_unlock(&n->list_lock); 2665 return 1; /* error exits: opps1 gives the page back (this includes a page passed in by the caller); both paths restore the interrupts-off state before returning 0 */ 2666 opps1: 2667 kmem_freepages(cachep, page); 2668 failed: 2669 if (gfpflags_allow_blocking(local_flags)) 2670 local_irq_disable(); 2671 return 0; 2672 } 2673 2674 #if DEBUG 2675 2676 /* 2677 * Perform extra freeing checks: 2678 * - detect bad pointers. 2679 * - POISON/RED_ZONE checking 2680 * 2680 * kfree_debugcheck() itself only covers the first item: it BUGs when 2680 * virt_addr_valid() rejects @objp. 2680 */ 2681 static void kfree_debugcheck(const void *objp) 2682 { 2683 if (!virt_addr_valid(objp)) { 2684 printk(KERN_ERR "kfree_debugcheck: out of range ptr %lxh.\n", 2685 (unsigned long)objp); 2686 BUG(); 2687 } 2688 } 2689 /* verify_redzone_free - check both red zones of @obj on the free path. Silent when both hold RED_ACTIVE; otherwise reports through slab_error() either a double free (both RED_INACTIVE) or an overwrite outside the object, and prints both values. Does not modify the red zones. */ 2690 static inline void verify_redzone_free(struct kmem_cache *cache, void *obj) 2691 { 2692 unsigned long long redzone1, redzone2; 2693 2694 redzone1 = *dbg_redzone1(cache, obj); 2695 redzone2 = *dbg_redzone2(cache, obj); 2696 2697 /* 2698 * Redzone is ok. 
2699 */ 2700 if (redzone1 == RED_ACTIVE && redzone2 == RED_ACTIVE) 2701 return; 2702 2703 if (redzone1 == RED_INACTIVE && redzone2 == RED_INACTIVE) 2704 slab_error(cache, "double free detected"); 2705 else 2706 slab_error(cache, "memory outside object was overwritten"); 2707 2708 printk(KERN_ERR "%p: redzone 1:0x%llx, redzone 2:0x%llx.\n", 2709 obj, redzone1, redzone2); 2710 } 2711 /* cache_free_debugcheck - DEBUG checks on the free path. @objp is the pointer the caller is freeing, @caller is stored in the user word when SLAB_STORE_USER is set. BUGs if the object does not belong to @cachep or is not an exact object address of its slab. Moves the pointer back by obj_offset(), verifies and deactivates the red zones, marks the object OBJECT_FREE and poisons it (or unmaps its pages in the CONFIG_DEBUG_PAGEALLOC case). Returns the adjusted pointer. */ 2712 static void *cache_free_debugcheck(struct kmem_cache *cachep, void *objp, 2713 unsigned long caller) 2714 { 2715 unsigned int objnr; 2716 struct page *page; 2717 2718 BUG_ON(virt_to_cache(objp) != cachep); 2719 /* from here on objp is the start of the debug-padded object, not the caller's pointer */ 2720 objp -= obj_offset(cachep); 2721 kfree_debugcheck(objp); 2722 page = virt_to_head_page(objp); 2723 2724 if (cachep->flags & SLAB_RED_ZONE) { 2725 verify_redzone_free(cachep, objp); 2726 *dbg_redzone1(cachep, objp) = RED_INACTIVE; 2727 *dbg_redzone2(cachep, objp) = RED_INACTIVE; 2728 } 2729 if (cachep->flags & SLAB_STORE_USER) 2730 *dbg_userword(cachep, objp) = (void *)caller; 2731 2732 objnr = obj_to_index(cachep, page, objp); 2733 /* the pointer must map to a valid index and back to exactly the same address */ 2734 BUG_ON(objnr >= cachep->num); 2735 BUG_ON(objp != index_to_obj(cachep, page, objnr)); 2736 2737 set_obj_status(page, objnr, OBJECT_FREE); 2738 if (cachep->flags & SLAB_POISON) { 2739 #ifdef CONFIG_DEBUG_PAGEALLOC 2740 if ((cachep->size % PAGE_SIZE)==0 && OFF_SLAB(cachep)) { 2741 store_stackinfo(cachep, objp, caller); 2742 kernel_map_pages(virt_to_page(objp), 2743 cachep->size / PAGE_SIZE, 0); 2744 } else { 2745 poison_obj(cachep, objp, POISON_FREE); 2746 } 2747 #else 2748 poison_obj(cachep, objp, POISON_FREE); 2749 #endif 2750 } 2751 return objp; 2752 } 2753 2754 #else 2755 #define kfree_debugcheck(x) do { } while(0) 2756 #define cache_free_debugcheck(x,objp,z) (objp) 2757 #endif 2758 /* cache_alloc_refill - refill this CPU's empty array cache and return one object from it. Objects are taken from the node's shared array if possible, otherwise from its partial and free slabs; if none are available the cache is grown with cache_grow() and the refill is retried. With @force_refill the cache is grown first. Must be called with interrupts disabled. Returns the object from ac_get_obj(), or NULL when no object could be obtained. */ 2759 static void *cache_alloc_refill(struct kmem_cache *cachep, gfp_t flags, 2760 bool force_refill) 2761 { 2762 int batchcount; 2763 struct kmem_cache_node *n; 2764 struct array_cache *ac; 2765 int node; 2766 2767 check_irq_off(); 2768 node = numa_mem_id();
2769 if (unlikely(force_refill)) 2770 goto force_grow; 2771 retry: 2772 ac = cpu_cache_get(cachep); 2773 batchcount = ac->batchcount; 2774 if (!ac->touched && batchcount > BATCHREFILL_LIMIT) { 2775 /* 2776 * If there was little recent activity on this cache, then 2777 * perform only a partial refill. Otherwise we could generate 2778 * refill bouncing. 2779 */ 2780 batchcount = BATCHREFILL_LIMIT; 2781 } 2782 n = get_node(cachep, node); 2783 /* a refill is only legal on an empty array, and the node must exist */ 2784 BUG_ON(ac->avail > 0 || !n); 2785 spin_lock(&n->list_lock); 2786 2787 /* See if we can refill from the shared array */ 2788 if (n->shared && transfer_objects(ac, n->shared, batchcount)) { 2789 n->shared->touched = 1; 2790 goto alloc_done; 2791 } 2792 2793 while (batchcount > 0) { 2794 struct list_head *entry; 2795 struct page *page; 2796 /* Get slab alloc is to come from. */ /* partial slabs are preferred; free slabs are used next; with both lists empty the loop is left through must_grow */ 2797 entry = n->slabs_partial.next; 2798 if (entry == &n->slabs_partial) { 2799 n->free_touched = 1; 2800 entry = n->slabs_free.next; 2801 if (entry == &n->slabs_free) 2802 goto must_grow; 2803 } 2804 2805 page = list_entry(entry, struct page, lru); 2806 check_spinlock_acquired(cachep); 2807 2808 /* 2809 * The slab was either on partial or free list so 2810 * there must be at least one object available for 2811 * allocation. 
2812 */ 2813 BUG_ON(page->active >= cachep->num); 2814 /* move objects from this slab into the array until the slab is exhausted or the batch is complete */ 2815 while (page->active < cachep->num && batchcount--) { 2816 STATS_INC_ALLOCED(cachep); 2817 STATS_INC_ACTIVE(cachep); 2818 STATS_SET_HIGH(cachep); 2819 2820 ac_put_obj(cachep, ac, slab_get_obj(cachep, page, 2821 node)); 2822 } 2823 2824 /* move slabp to correct slabp list: */ 2825 list_del(&page->lru); 2826 if (page->active == cachep->num) 2827 list_add(&page->lru, &n->slabs_full); 2828 else 2829 list_add(&page->lru, &n->slabs_partial); 2830 } 2831 /* reached by goto and by falling out of the loop: the objects now held in the array no longer count as free on the node. The shared-array path jumps past this to alloc_done. */ 2832 must_grow: 2833 n->free_objects -= ac->avail; 2834 alloc_done: 2835 spin_unlock(&n->list_lock); 2836 2837 if (unlikely(!ac->avail)) { 2838 int x; 2839 force_grow: 2840 x = cache_grow(cachep, gfp_exact_node(flags), node, NULL); 2841 2842 /* cache_grow can reenable interrupts, then ac could change. */ 2843 ac = cpu_cache_get(cachep); 2844 node = numa_mem_id(); 2845 2846 /* no objects in sight? abort */ 2847 if (!x && (ac->avail == 0 || force_refill)) 2848 return NULL; 2849 2850 if (!ac->avail) /* objects refilled by interrupt? 
*/ 2851 goto retry; 2852 } 2853 ac->touched = 1; 2854 2855 return ac_get_obj(cachep, ac, flags, force_refill); 2856 } 2857 /* cache_alloc_debugcheck_before - checks run before an allocation: the might_sleep_if() annotation for flags that allow blocking and, under DEBUG, the GFP_DMA consistency check of kmem_flagcheck() */ 2858 static inline void cache_alloc_debugcheck_before(struct kmem_cache *cachep, 2859 gfp_t flags) 2860 { 2861 might_sleep_if(gfpflags_allow_blocking(flags)); 2862 #if DEBUG 2863 kmem_flagcheck(cachep, flags); 2864 #endif 2865 } 2866 /* cache_alloc_debugcheck_after - DEBUG checks on a freshly allocated object. @objp is the start of the debug-padded object (NULL is passed through untouched); @caller is stored in the user word when SLAB_STORE_USER is set. Verifies the free poison and replaces it with POISON_INUSE, reports red zones that are not RED_INACTIVE and then activates them, marks the object OBJECT_ACTIVE, advances the pointer by obj_offset(), runs the constructor for poisoned caches and prints a message if the result is not ARCH_SLAB_MINALIGN aligned. Returns the pointer to hand to the caller. Without DEBUG the macro in the #else branch simply yields objp. */ 2867 #if DEBUG 2868 static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep, 2869 gfp_t flags, void *objp, unsigned long caller) 2870 { 2871 struct page *page; 2872 2873 if (!objp) 2874 return objp; 2875 if (cachep->flags & SLAB_POISON) { 2876 #ifdef CONFIG_DEBUG_PAGEALLOC 2877 if ((cachep->size % PAGE_SIZE) == 0 && OFF_SLAB(cachep)) 2878 kernel_map_pages(virt_to_page(objp), 2879 cachep->size / PAGE_SIZE, 1); 2880 else 2881 check_poison_obj(cachep, objp); 2882 #else 2883 check_poison_obj(cachep, objp); 2884 #endif 2885 poison_obj(cachep, objp, POISON_INUSE); 2886 } 2887 if (cachep->flags & SLAB_STORE_USER) 2888 *dbg_userword(cachep, objp) = (void *)caller; 2889 2890 if (cachep->flags & SLAB_RED_ZONE) { 2891 if (*dbg_redzone1(cachep, objp) != RED_INACTIVE || 2892 *dbg_redzone2(cachep, objp) != RED_INACTIVE) { 2893 slab_error(cachep, "double free, or memory outside" 2894 " object was overwritten"); 2895 printk(KERN_ERR 2896 "%p: redzone 1:0x%llx, redzone 2:0x%llx\n", 2897 objp, *dbg_redzone1(cachep, objp), 2898 *dbg_redzone2(cachep, objp)); 2899 } 2900 *dbg_redzone1(cachep, objp) = RED_ACTIVE; 2901 *dbg_redzone2(cachep, objp) = RED_ACTIVE; 2902 } 2903 2904 page = virt_to_head_page(objp); 2905 set_obj_status(page, obj_to_index(cachep, page, objp), OBJECT_ACTIVE); /* from here on objp is the caller-visible pointer */ 2906 objp += obj_offset(cachep); /* constructor run that cache_init_objs() skips for poisoned caches */ 2907 if (cachep->ctor && cachep->flags & SLAB_POISON) 2908 cachep->ctor(objp); 2909 if (ARCH_SLAB_MINALIGN && 2910 ((unsigned long)objp & (ARCH_SLAB_MINALIGN-1))) { 2911 printk(KERN_ERR "0x%p: not aligned to ARCH_SLAB_MINALIGN=%d\n", 2912 objp, (int)ARCH_SLAB_MINALIGN); 2913 } 2914 return objp; 2915 } 2916 #else
2917 #define cache_alloc_debugcheck_after(a,b,objp,d) (objp) 2918 #endif 2919 /* slab_should_failslab - decide whether this allocation should be failed artificially. Always false for the kmem_cache cache itself; otherwise the decision is delegated to should_failslab(). */ 2920 static bool slab_should_failslab(struct kmem_cache *cachep, gfp_t flags) 2921 { 2922 if (unlikely(cachep == kmem_cache)) 2923 return false; 2924 2925 return should_failslab(cachep->object_size, flags, cachep->flags); 2926 } 2927 /* ____cache_alloc - allocate from this CPU's array cache. If the array holds objects, one is taken with ac_get_obj(); when the array is empty, or ac_get_obj() returns NULL for these flags, the array is refilled through cache_alloc_refill() (with force_refill set in the latter case). Must be called with interrupts disabled. Returns the object or NULL. */ 2928 static inline void *____cache_alloc(struct kmem_cache *cachep, gfp_t flags) 2929 { 2930 void *objp; 2931 struct array_cache *ac; 2932 bool force_refill = false; 2933 2934 check_irq_off(); 2935 2936 ac = cpu_cache_get(cachep); 2937 if (likely(ac->avail)) { 2938 ac->touched = 1; 2939 objp = ac_get_obj(cachep, ac, flags, false); 2940 2941 /* 2942 * Allow for the possibility all avail objects are not allowed 2943 * by the current flags 2944 */ 2945 if (objp) { 2946 STATS_INC_ALLOCHIT(cachep); 2947 goto out; 2948 } 2949 force_refill = true; 2950 } 2951 2952 STATS_INC_ALLOCMISS(cachep); 2953 objp = cache_alloc_refill(cachep, flags, force_refill); 2954 /* 2955 * the 'ac' may be updated by cache_alloc_refill(), 2956 * and kmemleak_erase() requires its correct value. 2957 */ 2958 ac = cpu_cache_get(cachep); 2959 2960 out: 2961 /* 2962 * To avoid a false negative, if an object that is in one of the 2963 * per-CPU caches is leaked, we need to make sure kmemleak doesn't 2964 * treat the array pointers as a reference to the object. 2965 */ 2966 if (objp) 2967 kmemleak_erase(&ac->entry[ac->avail]); 2968 return objp; 2969 } 2970 2971 #ifdef CONFIG_NUMA 2972 /* 2973 * Try allocating on another node if PFA_SPREAD_SLAB or a mempolicy is set. 2974 * 2975 * If we are in_interrupt, then process context, including cpusets and 2976 * mempolicy, may not apply and should not be used for allocation policy. 
2977 * 2977 * Returns NULL, without allocating, when called in interrupt context, when 2977 * __GFP_THISNODE is set, or when the selected node is the local node. 2977 */ 2978 static void *alternate_node_alloc(struct kmem_cache *cachep, gfp_t flags) 2979 { 2980 int nid_alloc, nid_here; 2981 2982 if (in_interrupt() || (flags & __GFP_THISNODE)) 2983 return NULL; 2984 nid_alloc = nid_here = numa_mem_id(); /* cpuset spreading (for SLAB_MEM_SPREAD caches) takes precedence over the task mempolicy */ 2985 if (cpuset_do_slab_mem_spread() && (cachep->flags & SLAB_MEM_SPREAD)) 2986 nid_alloc = cpuset_slab_spread_node(); 2987 else if (current->mempolicy) 2988 nid_alloc = mempolicy_slab_node(); 2989 if (nid_alloc != nid_here) 2990 return ____cache_alloc_node(cachep, flags, nid_alloc); 2991 return NULL; 2992 } 2993 2994 /* 2995 * Fallback function if there was no memory available and no objects on a 2996 * certain node and fall back is permitted. First we scan all the 2997 * available node for available objects. If that fails then we 2998 * perform an allocation without specifying a node. This allows the page 2999 * allocator to do its reclaim / fallback magic. We then insert the 3000 * slab into the proper nodelist and then allocate from it. 3001 * 3001 * Returns NULL immediately when __GFP_THISNODE is set. 3001 */ 3002 static void *fallback_alloc(struct kmem_cache *cache, gfp_t flags) 3003 { 3004 struct zonelist *zonelist; 3005 gfp_t local_flags; 3006 struct zoneref *z; 3007 struct zone *zone; 3008 enum zone_type high_zoneidx = gfp_zone(flags); 3009 void *obj = NULL; 3010 int nid; 3011 unsigned int cpuset_mems_cookie; 3012 3013 if (flags & __GFP_THISNODE) 3014 return NULL; 3015 3016 local_flags = flags & (GFP_CONSTRAINT_MASK|GFP_RECLAIM_MASK); 3017 /* the cookie taken here is checked with read_mems_allowed_retry() before returning without an object */ 3018 retry_cpuset: 3019 cpuset_mems_cookie = read_mems_allowed_begin(); 3020 zonelist = node_zonelist(mempolicy_slab_node(), flags); 3021 3022 retry: 3023 /* 3024 * Look through allowed nodes for objects available 3025 * from existing per node queues. 
3026 */ 3027 for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) { 3028 nid = zone_to_nid(zone); 3029 /* free_objects is read here without the node's list_lock; it only decides whether to try the node, the allocation itself takes the lock */ 3030 if (cpuset_zone_allowed(zone, flags) && 3031 get_node(cache, nid) && 3032 get_node(cache, nid)->free_objects) { 3033 obj = ____cache_alloc_node(cache, 3034 gfp_exact_node(flags), nid); 3035 if (obj) 3036 break; 3037 } 3038 } 3039 3040 if (!obj) { 3041 /* 3042 * This allocation will be performed within the constraints 3043 * of the current cpuset / memory policy requirements. 3044 * We may trigger various forms of reclaim on the allowed 3045 * set and go into memory reserves if necessary. 3046 */ 3047 struct page *page; 3048 /* interrupts are enabled only around the page allocation, and only when the flags allow blocking */ 3049 if (gfpflags_allow_blocking(local_flags)) 3050 local_irq_enable(); 3051 kmem_flagcheck(cache, flags); 3052 page = kmem_getpages(cache, local_flags, numa_mem_id()); 3053 if (gfpflags_allow_blocking(local_flags)) 3054 local_irq_disable(); 3055 if (page) { 3056 /* 3057 * Insert into the appropriate per node queues 3058 */ /* the slab is added to the node the page actually belongs to, which is looked up from the page rather than assumed */ 3059 nid = page_to_nid(page); 3060 if (cache_grow(cache, flags, nid, page)) { 3061 obj = ____cache_alloc_node(cache, 3062 gfp_exact_node(flags), nid); 3063 if (!obj) 3064 /* 3065 * Another processor may allocate the 3066 * objects in the slab since we are 3067 * not holding any locks. 
3068 */ 3069 goto retry; 3070 } else { 3071 /* cache_grow already freed the page */ 3072 obj = NULL; 3073 } 3074 } 3075 } 3076 3077 if (unlikely(!obj && read_mems_allowed_retry(cpuset_mems_cookie))) 3078 goto retry_cpuset; 3079 return obj; 3080 } 3081 3082 /* 3083 * An interface to enable slab creation on nodeid 3083 * 3083 * Allocates one object straight from the slab lists of node @nodeid, without 3083 * going through the per-cpu array. If the node has neither partial nor free 3083 * slabs, the cache is grown on that node and the attempt is repeated; if 3083 * growing fails, the result of fallback_alloc() is returned. 3083 * Must be called with interrupts disabled; BUGs if the node has no 3083 * kmem_cache_node. 3084 */ 3085 static void *____cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, 3086 int nodeid) 3087 { 3088 struct list_head *entry; 3089 struct page *page; 3090 struct kmem_cache_node *n; 3091 void *obj; 3092 int x; 3093 3094 VM_BUG_ON(nodeid < 0 || nodeid >= MAX_NUMNODES); 3095 n = get_node(cachep, nodeid); 3096 BUG_ON(!n); 3097 3098 retry: 3099 check_irq_off(); 3100 spin_lock(&n->list_lock); /* partial slabs first, then free slabs */ 3101 entry = n->slabs_partial.next; 3102 if (entry == &n->slabs_partial) { 3103 n->free_touched = 1; 3104 entry = n->slabs_free.next; 3105 if (entry == &n->slabs_free) 3106 goto must_grow; 3107 } 3108 3109 page = list_entry(entry, struct page, lru); 3110 check_spinlock_acquired_node(cachep, nodeid); 3111 3112 STATS_INC_NODEALLOCS(cachep); 3113 STATS_INC_ACTIVE(cachep); 3114 STATS_SET_HIGH(cachep); 3115 3116 BUG_ON(page->active == cachep->num); 3117 3118 obj = slab_get_obj(cachep, page, nodeid); 3119 n->free_objects--; 3120 /* move slabp to correct slabp list: */ 3121 list_del(&page->lru); 3122 3123 if (page->active == cachep->num) 3124 list_add(&page->lru, &n->slabs_full); 3125 else 3126 list_add(&page->lru, &n->slabs_partial); 3127 3128 spin_unlock(&n->list_lock); 3129 goto done; 3130 /* no slab with free objects on this node: drop the lock, grow the cache and try again */ 3131 must_grow: 3132 spin_unlock(&n->list_lock); 3133 x = cache_grow(cachep, gfp_exact_node(flags), nodeid, NULL); 3134 if (x) 3135 goto retry; 3136 3137 return fallback_alloc(cachep, flags); 3138 3139 done: 3140 return obj; 3141 } 3142 /* slab_alloc_node - allocate one object, preferably from node @nodeid (NUMA_NO_NODE selects the local node). @flags is masked with gfp_allowed_mask; @caller is handed to the debug hook. Returns NULL early when slab_should_failslab() says so. The allocation runs with interrupts disabled: a node without a kmem_cache_node goes to fallback_alloc(), the local node is tried through ____cache_alloc() first, and everything else (or a local miss) goes to ____cache_alloc_node(). Afterwards the debug, kmemleak and kmemcheck hooks run and the object is zeroed for __GFP_ZERO. Returns the object or NULL. */ 3143 static __always_inline void * 3144 slab_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid, 3145 unsigned long caller) 3146 { 3147 unsigned long save_flags; 3148 void *ptr; 3149 int slab_node = numa_mem_id(); 3150 3151 flags &=
gfp_allowed_mask; 3152 3153 lockdep_trace_alloc(flags); 3154 3155 if (slab_should_failslab(cachep, flags)) 3156 return NULL; 3157 /* from here on cachep is whatever memcg_kmem_get_cache() returned; it is released with memcg_kmem_put_cache() before returning */ 3158 cachep = memcg_kmem_get_cache(cachep, flags); 3159 3160 cache_alloc_debugcheck_before(cachep, flags); 3161 local_irq_save(save_flags); 3162 3163 if (nodeid == NUMA_NO_NODE) 3164 nodeid = slab_node; 3165 3166 if (unlikely(!get_node(cachep, nodeid))) { 3167 /* Node not bootstrapped yet */ 3168 ptr = fallback_alloc(cachep, flags); 3169 goto out; 3170 } 3171 3172 if (nodeid == slab_node) { 3173 /* 3174 * Use the locally cached objects if possible. 3175 * However ____cache_alloc does not allow fallback 3176 * to other nodes. It may fail while we still have 3177 * objects on other nodes available. 3178 */ 3179 ptr = ____cache_alloc(cachep, flags); 3180 if (ptr) 3181 goto out; 3182 } 3183 /* ____cache_alloc_node can fall back to other nodes */ 3184 ptr = ____cache_alloc_node(cachep, flags, nodeid); 3185 out: 3186 local_irq_restore(save_flags); /* the debug hook may return a different (caller-visible) pointer, so ptr is reassigned */ 3187 ptr = cache_alloc_debugcheck_after(cachep, flags, ptr, caller); 3188 kmemleak_alloc_recursive(ptr, cachep->object_size, 1, cachep->flags, 3189 flags); 3190 3191 if (likely(ptr)) { 3192 kmemcheck_slab_alloc(cachep, flags, ptr, cachep->object_size); 3193 if (unlikely(flags & __GFP_ZERO)) 3194 memset(ptr, 0, cachep->object_size); 3195 } 3196 3197 memcg_kmem_put_cache(cachep); 3198 return ptr; 3199 } 3200 /* __do_cache_alloc (CONFIG_NUMA variant) - choose where to allocate from: when the task has a mempolicy or cpuset slab spreading is active, alternate_node_alloc() is tried first; then the per-cpu array via ____cache_alloc(); and if that yields nothing, ____cache_alloc_node() on the local node. Returns the object or NULL. */ 3201 static __always_inline void * 3202 __do_cache_alloc(struct kmem_cache *cache, gfp_t flags) 3203 { 3204 void *objp; 3205 3206 if (current->mempolicy || cpuset_do_slab_mem_spread()) { 3207 objp = alternate_node_alloc(cache, flags); 3208 if (objp) 3209 goto out; 3210 } 3211 objp = ____cache_alloc(cache, flags); 3212 3213 /* 3214 * We may just have run out of memory on the local node. 
3215 * ____cache_alloc_node() knows how to locate memory on other nodes 3216 */ 3217 if (!objp) 3218 objp = ____cache_alloc_node(cache, flags, numa_mem_id()); 3219 3220 out: 3221 return objp; 3222 } 3223 #else 3224 /* __do_cache_alloc (non-NUMA variant) - plain per-cpu array allocation */ 3225 static __always_inline void * 3226 __do_cache_alloc(struct kmem_cache *cachep, gfp_t flags) 3227 { 3228 return ____cache_alloc(cachep, flags); 3229 } 3230 3231 #endif /* CONFIG_NUMA */ 3232 /* slab_alloc - allocate one object from @cachep. @flags is masked with gfp_allowed_mask; @caller is handed to the debug hook. Returns NULL early when slab_should_failslab() says so. The allocation itself (__do_cache_alloc()) runs with interrupts disabled; afterwards the debug, kmemleak and kmemcheck hooks run and the object is zeroed for __GFP_ZERO. Returns the object or NULL. */ 3233 static __always_inline void * 3234 slab_alloc(struct kmem_cache *cachep, gfp_t flags, unsigned long caller) 3235 { 3236 unsigned long save_flags; 3237 void *objp; 3238 3239 flags &= gfp_allowed_mask; 3240 3241 lockdep_trace_alloc(flags); 3242 3243 if (slab_should_failslab(cachep, flags)) 3244 return NULL; 3245 /* from here on cachep is whatever memcg_kmem_get_cache() returned; it is released with memcg_kmem_put_cache() before returning */ 3246 cachep = memcg_kmem_get_cache(cachep, flags); 3247 3248 cache_alloc_debugcheck_before(cachep, flags); 3249 local_irq_save(save_flags); 3250 objp = __do_cache_alloc(cachep, flags); 3251 local_irq_restore(save_flags); /* the debug hook may return a different (caller-visible) pointer, so objp is reassigned */ 3252 objp = cache_alloc_debugcheck_after(cachep, flags, objp, caller); 3253 kmemleak_alloc_recursive(objp, cachep->object_size, 1, cachep->flags, 3254 flags); /* issued before the NULL check below */ 3255 prefetchw(objp); 3256 3257 if (likely(objp)) { 3258 kmemcheck_slab_alloc(cachep, flags, objp, cachep->object_size); 3259 if (unlikely(flags & __GFP_ZERO)) 3260 memset(objp, 0, cachep->object_size); 3261 } 3262 3263 memcg_kmem_put_cache(cachep); 3264 return objp; 3265 } 3266 3267 /* 3267 * free_block - return @nr_objects objects from the array @objpp to their slabs on node @node. 3267 * 3268 * Caller needs to acquire correct kmem_cache_node's list_lock 3269 * @list: slabs that became completely free and were detached from the node are queued here; the caller must free them 3270 */ 3271 static void free_block(struct kmem_cache *cachep, void **objpp, 3272 int nr_objects, int node, struct list_head *list) 3273 { 3274 int i; 3275 struct kmem_cache_node *n = get_node(cachep, node); 3276 3277 for (i = 0; i < nr_objects; i++) { 3278 void *objp; 3279 struct page *page; 3280 3281 clear_obj_pfmemalloc(&objpp[i]); 3282 objp = objpp[i]; 3283 3284 page = virt_to_head_page(objp); /* the slab is unlinked first and re-queued on the right list once the object has been put back */ 3285 list_del(&page->lru); 3286 check_spinlock_acquired_node(cachep,
node); 3287 slab_put_obj(cachep, page, objp, node); 3288 STATS_DEC_ACTIVE(cachep); 3289 n->free_objects++; 3290 3291 /* fixup slab chains */ 3292 if (page->active == 0) { 3293 if (n->free_objects > n->free_limit) { 3294 n->free_objects -= cachep->num; 3295 list_add_tail(&page->lru, list); 3296 } else { 3297 list_add(&page->lru, &n->slabs_free); 3298 } 3299 } else { 3300 /* Unconditionally move a slab to the end of the 3301 * partial list on free - maximum time for the 3302 * other objects to be freed, too. 3303 */ 3304 list_add_tail(&page->lru, &n->slabs_partial); 3305 } 3306 } 3307 } 3308 3309 static void cache_flusharray(struct kmem_cache *cachep, struct array_cache *ac) 3310 { 3311 int batchcount; 3312 struct kmem_cache_node *n; 3313 int node = numa_mem_id(); 3314 LIST_HEAD(list); 3315 3316 batchcount = ac->batchcount; 3317 #if DEBUG 3318 BUG_ON(!batchcount || batchcount > ac->avail); 3319 #endif 3320 check_irq_off(); 3321 n = get_node(cachep, node); 3322 spin_lock(&n->list_lock); 3323 if (n->shared) { 3324 struct array_cache *shared_array = n->shared; 3325 int max = shared_array->limit - shared_array->avail; 3326 if (max) { 3327 if (batchcount > max) 3328 batchcount = max; 3329 memcpy(&(shared_array->entry[shared_array->avail]), 3330 ac->entry, sizeof(void *) * batchcount); 3331 shared_array->avail += batchcount; 3332 goto free_done; 3333 } 3334 } 3335 3336 free_block(cachep, ac->entry, batchcount, node, &list); 3337 free_done: 3338 #if STATS 3339 { 3340 int i = 0; 3341 struct list_head *p; 3342 3343 p = n->slabs_free.next; 3344 while (p != &(n->slabs_free)) { 3345 struct page *page; 3346 3347 page = list_entry(p, struct page, lru); 3348 BUG_ON(page->active); 3349 3350 i++; 3351 p = p->next; 3352 } 3353 STATS_SET_FREEABLE(cachep, i); 3354 } 3355 #endif 3356 spin_unlock(&n->list_lock); 3357 slabs_destroy(cachep, &list); 3358 ac->avail -= batchcount; 3359 memmove(ac->entry, &(ac->entry[batchcount]), sizeof(void *)*ac->avail); 3360 } 3361 3362 /* 3363 * Release an 
obj back to its cache. If the obj has a constructed state, it must 3364 * be in this state _before_ it is released. Called with disabled ints. 3365 */ 3366 static inline void __cache_free(struct kmem_cache *cachep, void *objp, 3367 unsigned long caller) 3368 { 3369 struct array_cache *ac = cpu_cache_get(cachep); 3370 3371 check_irq_off(); 3372 kmemleak_free_recursive(objp, cachep->flags); 3373 objp = cache_free_debugcheck(cachep, objp, caller); 3374 3375 kmemcheck_slab_free(cachep, objp, cachep->object_size); 3376 3377 /* 3378 * Skip calling cache_free_alien() when the platform is not numa. 3379 * This will avoid cache misses that happen while accessing slabp (which 3380 * is per page memory reference) to get nodeid. Instead use a global 3381 * variable to skip the call, which is mostly likely to be present in 3382 * the cache. 3383 */ 3384 if (nr_online_nodes > 1 && cache_free_alien(cachep, objp)) 3385 return; 3386 3387 if (ac->avail < ac->limit) { 3388 STATS_INC_FREEHIT(cachep); 3389 } else { 3390 STATS_INC_FREEMISS(cachep); 3391 cache_flusharray(cachep, ac); 3392 } 3393 3394 ac_put_obj(cachep, ac, objp); 3395 } 3396 3397 /** 3398 * kmem_cache_alloc - Allocate an object 3399 * @cachep: The cache to allocate from. 3400 * @flags: See kmalloc(). 3401 * 3402 * Allocate an object from this cache. The flags are only relevant 3403 * if the cache has no available objects. 
3404 */ 3405 void *kmem_cache_alloc(struct kmem_cache *cachep, gfp_t flags) 3406 { 3407 void *ret = slab_alloc(cachep, flags, _RET_IP_); 3408 3409 trace_kmem_cache_alloc(_RET_IP_, ret, 3410 cachep->object_size, cachep->size, flags); 3411 3412 return ret; 3413 } 3414 EXPORT_SYMBOL(kmem_cache_alloc); 3415 3416 void kmem_cache_free_bulk(struct kmem_cache *s, size_t size, void **p) 3417 { 3418 __kmem_cache_free_bulk(s, size, p); 3419 } 3420 EXPORT_SYMBOL(kmem_cache_free_bulk); 3421 3422 int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size, 3423 void **p) 3424 { 3425 return __kmem_cache_alloc_bulk(s, flags, size, p); 3426 } 3427 EXPORT_SYMBOL(kmem_cache_alloc_bulk); 3428 3429 #ifdef CONFIG_TRACING 3430 void * 3431 kmem_cache_alloc_trace(struct kmem_cache *cachep, gfp_t flags, size_t size) 3432 { 3433 void *ret; 3434 3435 ret = slab_alloc(cachep, flags, _RET_IP_); 3436 3437 trace_kmalloc(_RET_IP_, ret, 3438 size, cachep->size, flags); 3439 return ret; 3440 } 3441 EXPORT_SYMBOL(kmem_cache_alloc_trace); 3442 #endif 3443 3444 #ifdef CONFIG_NUMA 3445 /** 3446 * kmem_cache_alloc_node - Allocate an object on the specified node 3447 * @cachep: The cache to allocate from. 3448 * @flags: See kmalloc(). 3449 * @nodeid: node number of the target node. 3450 * 3451 * Identical to kmem_cache_alloc but it will allocate memory on the given 3452 * node, which can improve the performance for cpu bound structures. 3453 * 3454 * Fallback to other node is possible if __GFP_THISNODE is not set. 
3455 */ 3456 void *kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid) 3457 { 3458 void *ret = slab_alloc_node(cachep, flags, nodeid, _RET_IP_); 3459 3460 trace_kmem_cache_alloc_node(_RET_IP_, ret, 3461 cachep->object_size, cachep->size, 3462 flags, nodeid); 3463 3464 return ret; 3465 } 3466 EXPORT_SYMBOL(kmem_cache_alloc_node); 3467 3468 #ifdef CONFIG_TRACING 3469 void *kmem_cache_alloc_node_trace(struct kmem_cache *cachep, 3470 gfp_t flags, 3471 int nodeid, 3472 size_t size) 3473 { 3474 void *ret; 3475 3476 ret = slab_alloc_node(cachep, flags, nodeid, _RET_IP_); 3477 3478 trace_kmalloc_node(_RET_IP_, ret, 3479 size, cachep->size, 3480 flags, nodeid); 3481 return ret; 3482 } 3483 EXPORT_SYMBOL(kmem_cache_alloc_node_trace); 3484 #endif 3485 3486 static __always_inline void * 3487 __do_kmalloc_node(size_t size, gfp_t flags, int node, unsigned long caller) 3488 { 3489 struct kmem_cache *cachep; 3490 3491 cachep = kmalloc_slab(size, flags); 3492 if (unlikely(ZERO_OR_NULL_PTR(cachep))) 3493 return cachep; 3494 return kmem_cache_alloc_node_trace(cachep, flags, node, size); 3495 } 3496 3497 void *__kmalloc_node(size_t size, gfp_t flags, int node) 3498 { 3499 return __do_kmalloc_node(size, flags, node, _RET_IP_); 3500 } 3501 EXPORT_SYMBOL(__kmalloc_node); 3502 3503 void *__kmalloc_node_track_caller(size_t size, gfp_t flags, 3504 int node, unsigned long caller) 3505 { 3506 return __do_kmalloc_node(size, flags, node, caller); 3507 } 3508 EXPORT_SYMBOL(__kmalloc_node_track_caller); 3509 #endif /* CONFIG_NUMA */ 3510 3511 /** 3512 * __do_kmalloc - allocate memory 3513 * @size: how many bytes of memory are required. 3514 * @flags: the type of memory to allocate (see kmalloc). 
3515 * @caller: function caller for debug tracking of the caller 3516 */ 3517 static __always_inline void *__do_kmalloc(size_t size, gfp_t flags, 3518 unsigned long caller) 3519 { 3520 struct kmem_cache *cachep; 3521 void *ret; 3522 3523 cachep = kmalloc_slab(size, flags); 3524 if (unlikely(ZERO_OR_NULL_PTR(cachep))) 3525 return cachep; 3526 ret = slab_alloc(cachep, flags, caller); 3527 3528 trace_kmalloc(caller, ret, 3529 size, cachep->size, flags); 3530 3531 return ret; 3532 } 3533 3534 void *__kmalloc(size_t size, gfp_t flags) 3535 { 3536 return __do_kmalloc(size, flags, _RET_IP_); 3537 } 3538 EXPORT_SYMBOL(__kmalloc); 3539 3540 void *__kmalloc_track_caller(size_t size, gfp_t flags, unsigned long caller) 3541 { 3542 return __do_kmalloc(size, flags, caller); 3543 } 3544 EXPORT_SYMBOL(__kmalloc_track_caller); 3545 3546 /** 3547 * kmem_cache_free - Deallocate an object 3548 * @cachep: The cache the allocation was from. 3549 * @objp: The previously allocated object. 3550 * 3551 * Free an object which was previously allocated from this 3552 * cache. 3553 */ 3554 void kmem_cache_free(struct kmem_cache *cachep, void *objp) 3555 { 3556 unsigned long flags; 3557 cachep = cache_from_obj(cachep, objp); 3558 if (!cachep) 3559 return; 3560 3561 local_irq_save(flags); 3562 debug_check_no_locks_freed(objp, cachep->object_size); 3563 if (!(cachep->flags & SLAB_DEBUG_OBJECTS)) 3564 debug_check_no_obj_freed(objp, cachep->object_size); 3565 __cache_free(cachep, objp, _RET_IP_); 3566 local_irq_restore(flags); 3567 3568 trace_kmem_cache_free(_RET_IP_, objp); 3569 } 3570 EXPORT_SYMBOL(kmem_cache_free); 3571 3572 /** 3573 * kfree - free previously allocated memory 3574 * @objp: pointer returned by kmalloc. 3575 * 3576 * If @objp is NULL, no operation is performed. 3577 * 3578 * Don't free memory not originally allocated by kmalloc() 3579 * or you will run into trouble. 
3580 */ 3581 void kfree(const void *objp) 3582 { 3583 struct kmem_cache *c; 3584 unsigned long flags; 3585 3586 trace_kfree(_RET_IP_, objp); 3587 3588 if (unlikely(ZERO_OR_NULL_PTR(objp))) 3589 return; 3590 local_irq_save(flags); 3591 kfree_debugcheck(objp); 3592 c = virt_to_cache(objp); 3593 debug_check_no_locks_freed(objp, c->object_size); 3594 3595 debug_check_no_obj_freed(objp, c->object_size); 3596 __cache_free(c, (void *)objp, _RET_IP_); 3597 local_irq_restore(flags); 3598 } 3599 EXPORT_SYMBOL(kfree); 3600 3601 /* 3602 * This initializes kmem_cache_node or resizes various caches for all nodes. 3603 */ 3604 static int alloc_kmem_cache_node(struct kmem_cache *cachep, gfp_t gfp) 3605 { 3606 int node; 3607 struct kmem_cache_node *n; 3608 struct array_cache *new_shared; 3609 struct alien_cache **new_alien = NULL; 3610 3611 for_each_online_node(node) { 3612 3613 if (use_alien_caches) { 3614 new_alien = alloc_alien_cache(node, cachep->limit, gfp); 3615 if (!new_alien) 3616 goto fail; 3617 } 3618 3619 new_shared = NULL; 3620 if (cachep->shared) { 3621 new_shared = alloc_arraycache(node, 3622 cachep->shared*cachep->batchcount, 3623 0xbaadf00d, gfp); 3624 if (!new_shared) { 3625 free_alien_cache(new_alien); 3626 goto fail; 3627 } 3628 } 3629 3630 n = get_node(cachep, node); 3631 if (n) { 3632 struct array_cache *shared = n->shared; 3633 LIST_HEAD(list); 3634 3635 spin_lock_irq(&n->list_lock); 3636 3637 if (shared) 3638 free_block(cachep, shared->entry, 3639 shared->avail, node, &list); 3640 3641 n->shared = new_shared; 3642 if (!n->alien) { 3643 n->alien = new_alien; 3644 new_alien = NULL; 3645 } 3646 n->free_limit = (1 + nr_cpus_node(node)) * 3647 cachep->batchcount + cachep->num; 3648 spin_unlock_irq(&n->list_lock); 3649 slabs_destroy(cachep, &list); 3650 kfree(shared); 3651 free_alien_cache(new_alien); 3652 continue; 3653 } 3654 n = kmalloc_node(sizeof(struct kmem_cache_node), gfp, node); 3655 if (!n) { 3656 free_alien_cache(new_alien); 3657 kfree(new_shared); 3658 
goto fail; 3659 } 3660 3661 kmem_cache_node_init(n); 3662 n->next_reap = jiffies + REAPTIMEOUT_NODE + 3663 ((unsigned long)cachep) % REAPTIMEOUT_NODE; 3664 n->shared = new_shared; 3665 n->alien = new_alien; 3666 n->free_limit = (1 + nr_cpus_node(node)) * 3667 cachep->batchcount + cachep->num; 3668 cachep->node[node] = n; 3669 } 3670 return 0; 3671 3672 fail: 3673 if (!cachep->list.next) { 3674 /* Cache is not active yet. Roll back what we did */ 3675 node--; 3676 while (node >= 0) { 3677 n = get_node(cachep, node); 3678 if (n) { 3679 kfree(n->shared); 3680 free_alien_cache(n->alien); 3681 kfree(n); 3682 cachep->node[node] = NULL; 3683 } 3684 node--; 3685 } 3686 } 3687 return -ENOMEM; 3688 } 3689 3690 /* Always called with the slab_mutex held */ 3691 static int __do_tune_cpucache(struct kmem_cache *cachep, int limit, 3692 int batchcount, int shared, gfp_t gfp) 3693 { 3694 struct array_cache __percpu *cpu_cache, *prev; 3695 int cpu; 3696 3697 cpu_cache = alloc_kmem_cache_cpus(cachep, limit, batchcount); 3698 if (!cpu_cache) 3699 return -ENOMEM; 3700 3701 prev = cachep->cpu_cache; 3702 cachep->cpu_cache = cpu_cache; 3703 kick_all_cpus_sync(); 3704 3705 check_irq_on(); 3706 cachep->batchcount = batchcount; 3707 cachep->limit = limit; 3708 cachep->shared = shared; 3709 3710 if (!prev) 3711 goto alloc_node; 3712 3713 for_each_online_cpu(cpu) { 3714 LIST_HEAD(list); 3715 int node; 3716 struct kmem_cache_node *n; 3717 struct array_cache *ac = per_cpu_ptr(prev, cpu); 3718 3719 node = cpu_to_mem(cpu); 3720 n = get_node(cachep, node); 3721 spin_lock_irq(&n->list_lock); 3722 free_block(cachep, ac->entry, ac->avail, node, &list); 3723 spin_unlock_irq(&n->list_lock); 3724 slabs_destroy(cachep, &list); 3725 } 3726 free_percpu(prev); 3727 3728 alloc_node: 3729 return alloc_kmem_cache_node(cachep, gfp); 3730 } 3731 3732 static int do_tune_cpucache(struct kmem_cache *cachep, int limit, 3733 int batchcount, int shared, gfp_t gfp) 3734 { 3735 int ret; 3736 struct kmem_cache *c; 3737 
3738 ret = __do_tune_cpucache(cachep, limit, batchcount, shared, gfp); 3739 3740 if (slab_state < FULL) 3741 return ret; 3742 3743 if ((ret < 0) || !is_root_cache(cachep)) 3744 return ret; 3745 3746 lockdep_assert_held(&slab_mutex); 3747 for_each_memcg_cache(c, cachep) { 3748 /* return value determined by the root cache only */ 3749 __do_tune_cpucache(c, limit, batchcount, shared, gfp); 3750 } 3751 3752 return ret; 3753 } 3754 3755 /* Called with slab_mutex held always */ 3756 static int enable_cpucache(struct kmem_cache *cachep, gfp_t gfp) 3757 { 3758 int err; 3759 int limit = 0; 3760 int shared = 0; 3761 int batchcount = 0; 3762 3763 if (!is_root_cache(cachep)) { 3764 struct kmem_cache *root = memcg_root_cache(cachep); 3765 limit = root->limit; 3766 shared = root->shared; 3767 batchcount = root->batchcount; 3768 } 3769 3770 if (limit && shared && batchcount) 3771 goto skip_setup; 3772 /* 3773 * The head array serves three purposes: 3774 * - create a LIFO ordering, i.e. return objects that are cache-warm 3775 * - reduce the number of spinlock operations. 3776 * - reduce the number of linked list operations on the slab and 3777 * bufctl chains: array operations are cheaper. 3778 * The numbers are guessed, we should auto-tune as described by 3779 * Bonwick. 3780 */ 3781 if (cachep->size > 131072) 3782 limit = 1; 3783 else if (cachep->size > PAGE_SIZE) 3784 limit = 8; 3785 else if (cachep->size > 1024) 3786 limit = 24; 3787 else if (cachep->size > 256) 3788 limit = 54; 3789 else 3790 limit = 120; 3791 3792 /* 3793 * CPU bound tasks (e.g. network routing) can exhibit cpu bound 3794 * allocation behaviour: Most allocs on one cpu, most free operations 3795 * on another cpu. For these cases, an efficient object passing between 3796 * cpus is necessary. This is provided by a shared array. The array 3797 * replaces Bonwick's magazine layer. 3798 * On uniprocessor, it's functionally equivalent (but less efficient) 3799 * to a larger limit. Thus disabled by default. 
3800 */ 3801 shared = 0; 3802 if (cachep->size <= PAGE_SIZE && num_possible_cpus() > 1) 3803 shared = 8; 3804 3805 #if DEBUG 3806 /* 3807 * With debugging enabled, large batchcount lead to excessively long 3808 * periods with disabled local interrupts. Limit the batchcount 3809 */ 3810 if (limit > 32) 3811 limit = 32; 3812 #endif 3813 batchcount = (limit + 1) / 2; 3814 skip_setup: 3815 err = do_tune_cpucache(cachep, limit, batchcount, shared, gfp); 3816 if (err) 3817 printk(KERN_ERR "enable_cpucache failed for %s, error %d.\n", 3818 cachep->name, -err); 3819 return err; 3820 } 3821 3822 /* 3823 * Drain an array if it contains any elements taking the node lock only if 3824 * necessary. Note that the node listlock also protects the array_cache 3825 * if drain_array() is used on the shared array. 3826 */ 3827 static void drain_array(struct kmem_cache *cachep, struct kmem_cache_node *n, 3828 struct array_cache *ac, int force, int node) 3829 { 3830 LIST_HEAD(list); 3831 int tofree; 3832 3833 if (!ac || !ac->avail) 3834 return; 3835 if (ac->touched && !force) { 3836 ac->touched = 0; 3837 } else { 3838 spin_lock_irq(&n->list_lock); 3839 if (ac->avail) { 3840 tofree = force ? ac->avail : (ac->limit + 4) / 5; 3841 if (tofree > ac->avail) 3842 tofree = (ac->avail + 1) / 2; 3843 free_block(cachep, ac->entry, tofree, node, &list); 3844 ac->avail -= tofree; 3845 memmove(ac->entry, &(ac->entry[tofree]), 3846 sizeof(void *) * ac->avail); 3847 } 3848 spin_unlock_irq(&n->list_lock); 3849 slabs_destroy(cachep, &list); 3850 } 3851 } 3852 3853 /** 3854 * cache_reap - Reclaim memory from caches. 3855 * @w: work descriptor 3856 * 3857 * Called from workqueue/eventd every few seconds. 3858 * Purpose: 3859 * - clear the per-cpu caches for this CPU. 3860 * - return freeable pages to the main free memory pool. 3861 * 3862 * If we cannot acquire the cache chain mutex then just give up - we'll try 3863 * again on the next iteration. 
3864 */ 3865 static void cache_reap(struct work_struct *w) 3866 { 3867 struct kmem_cache *searchp; 3868 struct kmem_cache_node *n; 3869 int node = numa_mem_id(); 3870 struct delayed_work *work = to_delayed_work(w); 3871 3872 if (!mutex_trylock(&slab_mutex)) 3873 /* Give up. Setup the next iteration. */ 3874 goto out; 3875 3876 list_for_each_entry(searchp, &slab_caches, list) { 3877 check_irq_on(); 3878 3879 /* 3880 * We only take the node lock if absolutely necessary and we 3881 * have established with reasonable certainty that 3882 * we can do some work if the lock was obtained. 3883 */ 3884 n = get_node(searchp, node); 3885 3886 reap_alien(searchp, n); 3887 3888 drain_array(searchp, n, cpu_cache_get(searchp), 0, node); 3889 3890 /* 3891 * These are racy checks but it does not matter 3892 * if we skip one check or scan twice. 3893 */ 3894 if (time_after(n->next_reap, jiffies)) 3895 goto next; 3896 3897 n->next_reap = jiffies + REAPTIMEOUT_NODE; 3898 3899 drain_array(searchp, n, n->shared, 0, node); 3900 3901 if (n->free_touched) 3902 n->free_touched = 0; 3903 else { 3904 int freed; 3905 3906 freed = drain_freelist(searchp, n, (n->free_limit + 3907 5 * searchp->num - 1) / (5 * searchp->num)); 3908 STATS_ADD_REAPED(searchp, freed); 3909 } 3910 next: 3911 cond_resched(); 3912 } 3913 check_irq_on(); 3914 mutex_unlock(&slab_mutex); 3915 next_reap_node(); 3916 out: 3917 /* Set up the next iteration */ 3918 schedule_delayed_work(work, round_jiffies_relative(REAPTIMEOUT_AC)); 3919 } 3920 3921 #ifdef CONFIG_SLABINFO 3922 void get_slabinfo(struct kmem_cache *cachep, struct slabinfo *sinfo) 3923 { 3924 struct page *page; 3925 unsigned long active_objs; 3926 unsigned long num_objs; 3927 unsigned long active_slabs = 0; 3928 unsigned long num_slabs, free_objects = 0, shared_avail = 0; 3929 const char *name; 3930 char *error = NULL; 3931 int node; 3932 struct kmem_cache_node *n; 3933 3934 active_objs = 0; 3935 num_slabs = 0; 3936 for_each_kmem_cache_node(cachep, node, n) { 3937 
3938 check_irq_on(); 3939 spin_lock_irq(&n->list_lock); 3940 3941 list_for_each_entry(page, &n->slabs_full, lru) { 3942 if (page->active != cachep->num && !error) 3943 error = "slabs_full accounting error"; 3944 active_objs += cachep->num; 3945 active_slabs++; 3946 } 3947 list_for_each_entry(page, &n->slabs_partial, lru) { 3948 if (page->active == cachep->num && !error) 3949 error = "slabs_partial accounting error"; 3950 if (!page->active && !error) 3951 error = "slabs_partial accounting error"; 3952 active_objs += page->active; 3953 active_slabs++; 3954 } 3955 list_for_each_entry(page, &n->slabs_free, lru) { 3956 if (page->active && !error) 3957 error = "slabs_free accounting error"; 3958 num_slabs++; 3959 } 3960 free_objects += n->free_objects; 3961 if (n->shared) 3962 shared_avail += n->shared->avail; 3963 3964 spin_unlock_irq(&n->list_lock); 3965 } 3966 num_slabs += active_slabs; 3967 num_objs = num_slabs * cachep->num; 3968 if (num_objs - active_objs != free_objects && !error) 3969 error = "free_objects accounting error"; 3970 3971 name = cachep->name; 3972 if (error) 3973 printk(KERN_ERR "slab: cache %s error: %s\n", name, error); 3974 3975 sinfo->active_objs = active_objs; 3976 sinfo->num_objs = num_objs; 3977 sinfo->active_slabs = active_slabs; 3978 sinfo->num_slabs = num_slabs; 3979 sinfo->shared_avail = shared_avail; 3980 sinfo->limit = cachep->limit; 3981 sinfo->batchcount = cachep->batchcount; 3982 sinfo->shared = cachep->shared; 3983 sinfo->objects_per_slab = cachep->num; 3984 sinfo->cache_order = cachep->gfporder; 3985 } 3986 3987 void slabinfo_show_stats(struct seq_file *m, struct kmem_cache *cachep) 3988 { 3989 #if STATS 3990 { /* node stats */ 3991 unsigned long high = cachep->high_mark; 3992 unsigned long allocs = cachep->num_allocations; 3993 unsigned long grown = cachep->grown; 3994 unsigned long reaped = cachep->reaped; 3995 unsigned long errors = cachep->errors; 3996 unsigned long max_freeable = cachep->max_freeable; 3997 unsigned long 
node_allocs = cachep->node_allocs; 3998 unsigned long node_frees = cachep->node_frees; 3999 unsigned long overflows = cachep->node_overflow; 4000 4001 seq_printf(m, " : globalstat %7lu %6lu %5lu %4lu " 4002 "%4lu %4lu %4lu %4lu %4lu", 4003 allocs, high, grown, 4004 reaped, errors, max_freeable, node_allocs, 4005 node_frees, overflows); 4006 } 4007 /* cpu stats */ 4008 { 4009 unsigned long allochit = atomic_read(&cachep->allochit); 4010 unsigned long allocmiss = atomic_read(&cachep->allocmiss); 4011 unsigned long freehit = atomic_read(&cachep->freehit); 4012 unsigned long freemiss = atomic_read(&cachep->freemiss); 4013 4014 seq_printf(m, " : cpustat %6lu %6lu %6lu %6lu", 4015 allochit, allocmiss, freehit, freemiss); 4016 } 4017 #endif 4018 } 4019 4020 #define MAX_SLABINFO_WRITE 128 4021 /** 4022 * slabinfo_write - Tuning for the slab allocator 4023 * @file: unused 4024 * @buffer: user buffer 4025 * @count: data length 4026 * @ppos: unused 4027 */ 4028 ssize_t slabinfo_write(struct file *file, const char __user *buffer, 4029 size_t count, loff_t *ppos) 4030 { 4031 char kbuf[MAX_SLABINFO_WRITE + 1], *tmp; 4032 int limit, batchcount, shared, res; 4033 struct kmem_cache *cachep; 4034 4035 if (count > MAX_SLABINFO_WRITE) 4036 return -EINVAL; 4037 if (copy_from_user(&kbuf, buffer, count)) 4038 return -EFAULT; 4039 kbuf[MAX_SLABINFO_WRITE] = '\0'; 4040 4041 tmp = strchr(kbuf, ' '); 4042 if (!tmp) 4043 return -EINVAL; 4044 *tmp = '\0'; 4045 tmp++; 4046 if (sscanf(tmp, " %d %d %d", &limit, &batchcount, &shared) != 3) 4047 return -EINVAL; 4048 4049 /* Find the cache in the chain of caches. 
*/ 4050 mutex_lock(&slab_mutex); 4051 res = -EINVAL; 4052 list_for_each_entry(cachep, &slab_caches, list) { 4053 if (!strcmp(cachep->name, kbuf)) { 4054 if (limit < 1 || batchcount < 1 || 4055 batchcount > limit || shared < 0) { 4056 res = 0; 4057 } else { 4058 res = do_tune_cpucache(cachep, limit, 4059 batchcount, shared, 4060 GFP_KERNEL); 4061 } 4062 break; 4063 } 4064 } 4065 mutex_unlock(&slab_mutex); 4066 if (res >= 0) 4067 res = count; 4068 return res; 4069 } 4070 4071 #ifdef CONFIG_DEBUG_SLAB_LEAK 4072 4073 static inline int add_caller(unsigned long *n, unsigned long v) 4074 { 4075 unsigned long *p; 4076 int l; 4077 if (!v) 4078 return 1; 4079 l = n[1]; 4080 p = n + 2; 4081 while (l) { 4082 int i = l/2; 4083 unsigned long *q = p + 2 * i; 4084 if (*q == v) { 4085 q[1]++; 4086 return 1; 4087 } 4088 if (*q > v) { 4089 l = i; 4090 } else { 4091 p = q + 2; 4092 l -= i + 1; 4093 } 4094 } 4095 if (++n[1] == n[0]) 4096 return 0; 4097 memmove(p + 2, p, n[1] * 2 * sizeof(unsigned long) - ((void *)p - (void *)n)); 4098 p[0] = v; 4099 p[1] = 1; 4100 return 1; 4101 } 4102 4103 static void handle_slab(unsigned long *n, struct kmem_cache *c, 4104 struct page *page) 4105 { 4106 void *p; 4107 int i; 4108 4109 if (n[0] == n[1]) 4110 return; 4111 for (i = 0, p = page->s_mem; i < c->num; i++, p += c->size) { 4112 if (get_obj_status(page, i) != OBJECT_ACTIVE) 4113 continue; 4114 4115 if (!add_caller(n, (unsigned long)*dbg_userword(c, p))) 4116 return; 4117 } 4118 } 4119 4120 static void show_symbol(struct seq_file *m, unsigned long address) 4121 { 4122 #ifdef CONFIG_KALLSYMS 4123 unsigned long offset, size; 4124 char modname[MODULE_NAME_LEN], name[KSYM_NAME_LEN]; 4125 4126 if (lookup_symbol_attrs(address, &size, &offset, modname, name) == 0) { 4127 seq_printf(m, "%s+%#lx/%#lx", name, offset, size); 4128 if (modname[0]) 4129 seq_printf(m, " [%s]", modname); 4130 return; 4131 } 4132 #endif 4133 seq_printf(m, "%p", (void *)address); 4134 } 4135 4136 static int leaks_show(struct 
seq_file *m, void *p) 4137 { 4138 struct kmem_cache *cachep = list_entry(p, struct kmem_cache, list); 4139 struct page *page; 4140 struct kmem_cache_node *n; 4141 const char *name; 4142 unsigned long *x = m->private; 4143 int node; 4144 int i; 4145 4146 if (!(cachep->flags & SLAB_STORE_USER)) 4147 return 0; 4148 if (!(cachep->flags & SLAB_RED_ZONE)) 4149 return 0; 4150 4151 /* OK, we can do it */ 4152 4153 x[1] = 0; 4154 4155 for_each_kmem_cache_node(cachep, node, n) { 4156 4157 check_irq_on(); 4158 spin_lock_irq(&n->list_lock); 4159 4160 list_for_each_entry(page, &n->slabs_full, lru) 4161 handle_slab(x, cachep, page); 4162 list_for_each_entry(page, &n->slabs_partial, lru) 4163 handle_slab(x, cachep, page); 4164 spin_unlock_irq(&n->list_lock); 4165 } 4166 name = cachep->name; 4167 if (x[0] == x[1]) { 4168 /* Increase the buffer size */ 4169 mutex_unlock(&slab_mutex); 4170 m->private = kzalloc(x[0] * 4 * sizeof(unsigned long), GFP_KERNEL); 4171 if (!m->private) { 4172 /* Too bad, we are really out */ 4173 m->private = x; 4174 mutex_lock(&slab_mutex); 4175 return -ENOMEM; 4176 } 4177 *(unsigned long *)m->private = x[0] * 2; 4178 kfree(x); 4179 mutex_lock(&slab_mutex); 4180 /* Now make sure this entry will be retried */ 4181 m->count = m->size; 4182 return 0; 4183 } 4184 for (i = 0; i < x[1]; i++) { 4185 seq_printf(m, "%s: %lu ", name, x[2*i+3]); 4186 show_symbol(m, x[2*i+2]); 4187 seq_putc(m, '\n'); 4188 } 4189 4190 return 0; 4191 } 4192 4193 static const struct seq_operations slabstats_op = { 4194 .start = slab_start, 4195 .next = slab_next, 4196 .stop = slab_stop, 4197 .show = leaks_show, 4198 }; 4199 4200 static int slabstats_open(struct inode *inode, struct file *file) 4201 { 4202 unsigned long *n; 4203 4204 n = __seq_open_private(file, &slabstats_op, PAGE_SIZE); 4205 if (!n) 4206 return -ENOMEM; 4207 4208 *n = PAGE_SIZE / (2 * sizeof(unsigned long)); 4209 4210 return 0; 4211 } 4212 4213 static const struct file_operations proc_slabstats_operations = { 4214 .open 
= slabstats_open, 4215 .read = seq_read, 4216 .llseek = seq_lseek, 4217 .release = seq_release_private, 4218 }; 4219 #endif 4220 4221 static int __init slab_proc_init(void) 4222 { 4223 #ifdef CONFIG_DEBUG_SLAB_LEAK 4224 proc_create("slab_allocators", 0, NULL, &proc_slabstats_operations); 4225 #endif 4226 return 0; 4227 } 4228 module_init(slab_proc_init); 4229 #endif 4230 4231 /** 4232 * ksize - get the actual amount of memory allocated for a given object 4233 * @objp: Pointer to the object 4234 * 4235 * kmalloc may internally round up allocations and return more memory 4236 * than requested. ksize() can be used to determine the actual amount of 4237 * memory allocated. The caller may use this additional memory, even though 4238 * a smaller amount of memory was initially specified with the kmalloc call. 4239 * The caller must guarantee that objp points to a valid object previously 4240 * allocated with either kmalloc() or kmem_cache_alloc(). The object 4241 * must not be freed during the duration of the call. 4242 */ 4243 size_t ksize(const void *objp) 4244 { 4245 BUG_ON(!objp); 4246 if (unlikely(objp == ZERO_SIZE_PTR)) 4247 return 0; 4248 4249 return virt_to_cache(objp)->object_size; 4250 } 4251 EXPORT_SYMBOL(ksize); 4252