/*
 * linux/mm/slab.c
 * Written by Mark Hemment, 1996/97.
 * (markhe@nextd.demon.co.uk)
 *
 * kmem_cache_destroy() + some cleanup - 1999 Andrea Arcangeli
 *
 * Major cleanup, different bufctl logic, per-cpu arrays
 *	(c) 2000 Manfred Spraul
 *
 * Cleanup, make the head arrays unconditional, preparation for NUMA
 *	(c) 2002 Manfred Spraul
 *
 * An implementation of the Slab Allocator as described in outline in;
 *	UNIX Internals: The New Frontiers by Uresh Vahalia
 *	Pub: Prentice Hall	ISBN 0-13-101908-2
 * or with a little more detail in;
 *	The Slab Allocator: An Object-Caching Kernel Memory Allocator
 *	Jeff Bonwick (Sun Microsystems).
 *	Presented at: USENIX Summer 1994 Technical Conference
 *
 * The memory is organized in caches, one cache for each object type.
 * (e.g. inode_cache, dentry_cache, buffer_head, vm_area_struct)
 * Each cache consists of many slabs (they are small (usually one
 * page long) and always contiguous), and each slab contains multiple
 * initialized objects.
 *
 * This means that your constructor is used only for newly allocated
 * slabs and you must pass objects with the same initializations to
 * kmem_cache_free.
 *
 * Each cache can only support one memory type (GFP_DMA, GFP_HIGHMEM,
 * normal). If you need a special memory type, then you must create a new
 * cache for that memory type.
 *
 * In order to reduce fragmentation, the slabs are sorted into 3 groups:
 *   full slabs with 0 free objects
 *   partial slabs
 *   empty slabs with no allocated objects
 *
 * If partial slabs exist, then new allocations come from these slabs,
 * otherwise from empty slabs or new slabs are allocated.
 *
 * kmem_cache_destroy() CAN CRASH if you try to allocate from the cache
 * during kmem_cache_destroy(). The caller must prevent concurrent allocs.
 *
 * Each cache has a short per-cpu head array, most allocs
 * and frees go into that array, and if that array overflows, then 1/2
 * of the entries in the array are given back into the global cache.
 * The head array is strictly LIFO and should improve the cache hit rates.
 * On SMP, it additionally reduces the spinlock operations.
 *
 * The c_cpuarray may not be read with enabled local interrupts -
 * it's changed with a smp_call_function().
 *
 * SMP synchronization:
 *  constructors and destructors are called without any locking.
 *  Several members in struct kmem_cache and struct slab never change, they
 *	are accessed without any locking.
 *  The per-cpu arrays are never accessed from the wrong cpu, no locking,
 *	and local interrupts are disabled so slab code is preempt-safe.
 *  The non-constant members are protected with a per-cache irq spinlock.
 *
 * Many thanks to Mark Hemment, who wrote another per-cpu slab patch
 * in 2000 - many ideas in the current implementation are derived from
 * his patch.
 *
 * Further notes from the original documentation:
 *
 * 11 April '97.  Started multi-threading - markhe
 *	The global cache-chain is protected by the mutex 'cache_chain_mutex'.
 *	The mutex is only needed when accessing/extending the cache-chain, which
 *	can never happen inside an interrupt (kmem_cache_create(),
 *	kmem_cache_shrink() and kmem_cache_reap()).
 *
 *	At present, each engine can be growing a cache.  This should be blocked.
 *
 * 15 March 2005. NUMA slab allocator.
 *	Shai Fultheim <shai@scalex86.org>.
 *	Shobhit Dayal <shobhit@calsoftinc.com>
 *	Alok N Kataria <alokk@calsoftinc.com>
 *	Christoph Lameter <christoph@lameter.com>
 *
 * Modified the slab allocator to be node aware on NUMA systems.
 * Each node has its own list of partial, free and full slabs.
 * All object allocations for a node occur from node specific slab lists.
 */
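/*
 * Illustrative sketch (not part of the original file): typical use of the
 * object-cache API described above.  The struct, names and call sites here
 * are hypothetical; only the kmem_cache_* calls themselves are real.
 */
#if 0
struct foo {
	spinlock_t lock;
	struct list_head link;
};

static struct kmem_cache *foo_cachep;

/* Runs only when a fresh slab is allocated, as the header comment notes. */
static void foo_ctor(void *obj)
{
	struct foo *f = obj;

	spin_lock_init(&f->lock);
	INIT_LIST_HEAD(&f->link);
}

static int __init foo_init(void)
{
	foo_cachep = kmem_cache_create("foo_cache", sizeof(struct foo),
				       0, SLAB_HWCACHE_ALIGN, foo_ctor);
	return foo_cachep ? 0 : -ENOMEM;
}

static void foo_example(void)
{
	struct foo *f = kmem_cache_alloc(foo_cachep, GFP_KERNEL);

	if (!f)
		return;
	/* ... use f, then restore the constructed state before freeing ... */
	kmem_cache_free(foo_cachep, f);
}

static void foo_exit(void)
{
	/* The caller must guarantee that no allocations race with this. */
	kmem_cache_destroy(foo_cachep);
}
#endif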
#include	<linux/slab.h>
#include	<linux/mm.h>
#include	<linux/poison.h>
#include	<linux/swap.h>
#include	<linux/cache.h>
#include	<linux/interrupt.h>
#include	<linux/init.h>
#include	<linux/compiler.h>
#include	<linux/cpuset.h>
#include	<linux/proc_fs.h>
#include	<linux/seq_file.h>
#include	<linux/notifier.h>
#include	<linux/kallsyms.h>
#include	<linux/cpu.h>
#include	<linux/sysctl.h>
#include	<linux/module.h>
#include	<linux/kmemtrace.h>
#include	<linux/rcupdate.h>
#include	<linux/string.h>
#include	<linux/uaccess.h>
#include	<linux/nodemask.h>
#include	<linux/kmemleak.h>
#include	<linux/mempolicy.h>
#include	<linux/mutex.h>
#include	<linux/fault-inject.h>
#include	<linux/rtmutex.h>
#include	<linux/reciprocal_div.h>
#include	<linux/debugobjects.h>
#include	<linux/kmemcheck.h>

#include	<asm/cacheflush.h>
#include	<asm/tlbflush.h>
#include	<asm/page.h>

/*
 * DEBUG	- 1 for kmem_cache_create() to honour; SLAB_RED_ZONE & SLAB_POISON.
 *		  0 for faster, smaller code (especially in the critical paths).
 *
 * STATS	- 1 to collect stats for /proc/slabinfo.
 *		  0 for faster, smaller code (especially in the critical paths).
 *
 * FORCED_DEBUG	- 1 enables SLAB_RED_ZONE and SLAB_POISON (if possible)
 */

#ifdef CONFIG_DEBUG_SLAB
#define	DEBUG		1
#define	STATS		1
#define	FORCED_DEBUG	1
#else
#define	DEBUG		0
#define	STATS		0
#define	FORCED_DEBUG	0
#endif

/* Shouldn't this be in a header file somewhere? */
#define	BYTES_PER_WORD		sizeof(void *)
#define	REDZONE_ALIGN		max(BYTES_PER_WORD, __alignof__(unsigned long long))

#ifndef ARCH_KMALLOC_MINALIGN
/*
 * Enforce a minimum alignment for the kmalloc caches.
 * Usually, the kmalloc caches are cache_line_size() aligned, except when
 * DEBUG and FORCED_DEBUG are enabled, then they are BYTES_PER_WORD aligned.
 * Some archs want to perform DMA into kmalloc caches and need a guaranteed
 * alignment larger than the alignment of a 64-bit integer.
 * ARCH_KMALLOC_MINALIGN allows that.
 * Note that increasing this value may disable some debug features.
 */
#define ARCH_KMALLOC_MINALIGN __alignof__(unsigned long long)
#endif

#ifndef ARCH_SLAB_MINALIGN
/*
 * Enforce a minimum alignment for all caches.
 * Intended for archs that get misalignment faults even for BYTES_PER_WORD
 * aligned buffers. Includes ARCH_KMALLOC_MINALIGN.
 * If possible: Do not enable this flag for CONFIG_DEBUG_SLAB, it disables
 * some debug features.
 */
#define ARCH_SLAB_MINALIGN 0
#endif

#ifndef ARCH_KMALLOC_FLAGS
#define ARCH_KMALLOC_FLAGS SLAB_HWCACHE_ALIGN
#endif
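/*
 * Illustrative sketch (not part of the original file): an architecture that
 * DMAs directly into kmalloc()ed buffers would typically raise the minimum
 * alignment from its own headers, along the lines of the hypothetical
 * override below, at the cost of some of the debug features mentioned above.
 */
#if 0
/* in an arch header, seen before this file is compiled: */
#define ARCH_KMALLOC_MINALIGN	L1_CACHE_BYTES
#endif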
/* Legal flag mask for kmem_cache_create(). */
#if DEBUG
# define CREATE_MASK	(SLAB_RED_ZONE | \
			 SLAB_POISON | SLAB_HWCACHE_ALIGN | \
			 SLAB_CACHE_DMA | \
			 SLAB_STORE_USER | \
			 SLAB_RECLAIM_ACCOUNT | SLAB_PANIC | \
			 SLAB_DESTROY_BY_RCU | SLAB_MEM_SPREAD | \
			 SLAB_DEBUG_OBJECTS | SLAB_NOLEAKTRACE | SLAB_NOTRACK)
#else
# define CREATE_MASK	(SLAB_HWCACHE_ALIGN | \
			 SLAB_CACHE_DMA | \
			 SLAB_RECLAIM_ACCOUNT | SLAB_PANIC | \
			 SLAB_DESTROY_BY_RCU | SLAB_MEM_SPREAD | \
			 SLAB_DEBUG_OBJECTS | SLAB_NOLEAKTRACE | SLAB_NOTRACK)
#endif

/*
 * kmem_bufctl_t:
 *
 * Bufctl's are used for linking objs within a slab
 *   linked offsets.
 *
 * This implementation relies on "struct page" for locating the cache &
 * slab an object belongs to.
 * This allows the bufctl structure to be small (one int), but limits
 * the number of objects a slab (not a cache) can contain when off-slab
 * bufctls are used.  The limit is the size of the largest general cache
 * that does not use off-slab slabs.
 * For 32-bit archs with 4 kB pages, this is 56.
 * This is not serious, as it is only for large objects, when it is unwise
 * to have too many per slab.
 * Note: This limit can be raised by introducing a general cache whose size
 * is less than 512 (PAGE_SIZE<<3), but greater than 256.
 */

typedef unsigned int kmem_bufctl_t;
#define BUFCTL_END	(((kmem_bufctl_t)(~0U))-0)
#define BUFCTL_FREE	(((kmem_bufctl_t)(~0U))-1)
#define	BUFCTL_ACTIVE	(((kmem_bufctl_t)(~0U))-2)
#define	SLAB_LIMIT	(((kmem_bufctl_t)(~0U))-3)

/*
 * struct slab
 *
 * Manages the objs in a slab. Placed either at the beginning of mem allocated
 * for a slab, or allocated from a general cache.
 * Slabs are chained into three lists: fully used, partial, fully free slabs.
 */
struct slab {
	struct list_head list;
	unsigned long colouroff;
	void *s_mem;		/* including colour offset */
	unsigned int inuse;	/* num of objs active in slab */
	kmem_bufctl_t free;
	unsigned short nodeid;
};

/*
 * struct slab_rcu
 *
 * slab_destroy on a SLAB_DESTROY_BY_RCU cache uses this structure to
 * arrange for kmem_freepages to be called via RCU.  This is useful if
 * we need to approach a kernel structure obliquely, from its address
 * obtained without the usual locking.  We can lock the structure to
 * stabilize it and check it's still at the given address, only if we
 * can be sure that the memory has not been meanwhile reused for some
 * other kind of object (which our subsystem's lock might corrupt).
 *
 * rcu_read_lock before reading the address, then rcu_read_unlock after
 * taking the spinlock within the structure expected at that address.
 *
 * We assume struct slab_rcu can overlay struct slab when destroying.
 */
struct slab_rcu {
	struct rcu_head head;
	struct kmem_cache *cachep;
	void *addr;
};
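/*
 * Illustrative sketch (hypothetical type, not part of the original file):
 * the lookup pattern the slab_rcu comment above refers to.  A
 * SLAB_DESTROY_BY_RCU cache only returns pages to the allocator after a
 * grace period, so while rcu_read_lock() is held the memory at @t is still
 * an object of the same cache; we may therefore take its lock and then
 * re-check that it is still the object we were looking for.
 */
#if 0
struct thing {
	spinlock_t lock;
	unsigned long key;
};

static struct thing *thing_lookup_locked(struct thing *t, unsigned long key)
{
	rcu_read_lock();
	spin_lock(&t->lock);		/* memory cannot vanish under us */
	rcu_read_unlock();
	if (t->key != key) {		/* object was freed and reused */
		spin_unlock(&t->lock);
		return NULL;
	}
	return t;			/* still the right object, locked */
}
#endif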
/*
 * struct array_cache
 *
 * Purpose:
 * - LIFO ordering, to hand out cache-warm objects from _alloc
 * - reduce the number of linked list operations
 * - reduce spinlock operations
 *
 * The limit is stored in the per-cpu structure to reduce the data cache
 * footprint.
 *
 */
struct array_cache {
	unsigned int avail;
	unsigned int limit;
	unsigned int batchcount;
	unsigned int touched;
	spinlock_t lock;
	void *entry[];	/*
			 * Must have this definition in here for the proper
			 * alignment of array_cache. Also simplifies accessing
			 * the entries.
			 */
};

/*
 * bootstrap: The caches do not work without cpuarrays anymore, but the
 * cpuarrays are allocated from the generic caches...
 */
#define BOOT_CPUCACHE_ENTRIES	1
struct arraycache_init {
	struct array_cache cache;
	void *entries[BOOT_CPUCACHE_ENTRIES];
};

/*
 * The slab lists for all objects.
 */
struct kmem_list3 {
	struct list_head slabs_partial;	/* partial list first, better asm code */
	struct list_head slabs_full;
	struct list_head slabs_free;
	unsigned long free_objects;
	unsigned int free_limit;
	unsigned int colour_next;	/* Per-node cache coloring */
	spinlock_t list_lock;
	struct array_cache *shared;	/* shared per node */
	struct array_cache **alien;	/* on other nodes */
	unsigned long next_reap;	/* updated without locking */
	int free_touched;		/* updated without locking */
};

/*
 * Need this for bootstrapping a per node allocator.
 */
#define NUM_INIT_LISTS (3 * MAX_NUMNODES)
struct kmem_list3 __initdata initkmem_list3[NUM_INIT_LISTS];
#define	CACHE_CACHE 0
#define	SIZE_AC MAX_NUMNODES
#define	SIZE_L3 (2 * MAX_NUMNODES)

static int drain_freelist(struct kmem_cache *cache,
			struct kmem_list3 *l3, int tofree);
static void free_block(struct kmem_cache *cachep, void **objpp, int len,
			int node);
static int enable_cpucache(struct kmem_cache *cachep, gfp_t gfp);
static void cache_reap(struct work_struct *unused);

/*
 * This function must be completely optimized away if a constant is passed to
 * it.  Mostly the same as what is in linux/slab.h except it returns an index.
 */
static __always_inline int index_of(const size_t size)
{
	extern void __bad_size(void);

	if (__builtin_constant_p(size)) {
		int i = 0;

#define CACHE(x) \
	if (size <= x) \
		return i; \
	else \
		i++;
#include <linux/kmalloc_sizes.h>
#undef CACHE
		__bad_size();
	} else
		__bad_size();
	return 0;
}

static int slab_early_init = 1;

#define INDEX_AC index_of(sizeof(struct arraycache_init))
#define INDEX_L3 index_of(sizeof(struct kmem_list3))
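/*
 * Worked example (illustrative, not part of the original file; assumes 4 kB
 * pages so the table in <linux/kmalloc_sizes.h> starts 32, 64, 96, 128, ...):
 * for a constant argument the CACHE(x) expansion above folds to a plain
 * constant, so INDEX_AC and INDEX_L3 are compile-time values, while a
 * non-constant size ends up calling the never-defined __bad_size() and fails
 * at link time.  For a 24-byte size the compiler effectively sees:
 */
#if 0
static __always_inline int index_of_24_bytes(void)
{
	int i = 0;

	if (24 <= 32)		/* first CACHE(x) entry: taken */
		return i;	/* -> the whole call folds to 0 */
	else
		i++;
	if (24 <= 64)
		return i;	/* would be 1 */
	else
		i++;
	/* ... remaining CACHE(x) entries from <linux/kmalloc_sizes.h> ... */
	return 0;
}
#endif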
static void kmem_list3_init(struct kmem_list3 *parent)
{
	INIT_LIST_HEAD(&parent->slabs_full);
	INIT_LIST_HEAD(&parent->slabs_partial);
	INIT_LIST_HEAD(&parent->slabs_free);
	parent->shared = NULL;
	parent->alien = NULL;
	parent->colour_next = 0;
	spin_lock_init(&parent->list_lock);
	parent->free_objects = 0;
	parent->free_touched = 0;
}

#define MAKE_LIST(cachep, listp, slab, nodeid)				\
	do {								\
		INIT_LIST_HEAD(listp);					\
		list_splice(&(cachep->nodelists[nodeid]->slab), listp);	\
	} while (0)

#define	MAKE_ALL_LISTS(cachep, ptr, nodeid)				\
	do {								\
	MAKE_LIST((cachep), (&(ptr)->slabs_full), slabs_full, nodeid);	\
	MAKE_LIST((cachep), (&(ptr)->slabs_partial), slabs_partial, nodeid); \
	MAKE_LIST((cachep), (&(ptr)->slabs_free), slabs_free, nodeid);	\
	} while (0)

#define CFLGS_OFF_SLAB		(0x80000000UL)
#define	OFF_SLAB(x)	((x)->flags & CFLGS_OFF_SLAB)

#define BATCHREFILL_LIMIT	16
/*
 * Optimization question: fewer reaps means less probability for unnecessary
 * cpucache drain/refill cycles.
 *
 * OTOH the cpuarrays can contain lots of objects,
 * which could lock up otherwise freeable slabs.
 */
#define REAPTIMEOUT_CPUC	(2*HZ)
#define REAPTIMEOUT_LIST3	(4*HZ)

#if STATS
#define	STATS_INC_ACTIVE(x)	((x)->num_active++)
#define	STATS_DEC_ACTIVE(x)	((x)->num_active--)
#define	STATS_INC_ALLOCED(x)	((x)->num_allocations++)
#define	STATS_INC_GROWN(x)	((x)->grown++)
#define	STATS_ADD_REAPED(x,y)	((x)->reaped += (y))
#define	STATS_SET_HIGH(x)						\
	do {								\
		if ((x)->num_active > (x)->high_mark)			\
			(x)->high_mark = (x)->num_active;		\
	} while (0)
#define	STATS_INC_ERR(x)	((x)->errors++)
#define	STATS_INC_NODEALLOCS(x)	((x)->node_allocs++)
#define	STATS_INC_NODEFREES(x)	((x)->node_frees++)
#define STATS_INC_ACOVERFLOW(x)	((x)->node_overflow++)
#define	STATS_SET_FREEABLE(x, i)					\
	do {								\
		if ((x)->max_freeable < i)				\
			(x)->max_freeable = i;				\
	} while (0)
#define STATS_INC_ALLOCHIT(x)	atomic_inc(&(x)->allochit)
#define STATS_INC_ALLOCMISS(x)	atomic_inc(&(x)->allocmiss)
#define STATS_INC_FREEHIT(x)	atomic_inc(&(x)->freehit)
#define STATS_INC_FREEMISS(x)	atomic_inc(&(x)->freemiss)
#else
#define	STATS_INC_ACTIVE(x)	do { } while (0)
#define	STATS_DEC_ACTIVE(x)	do { } while (0)
#define	STATS_INC_ALLOCED(x)	do { } while (0)
#define	STATS_INC_GROWN(x)	do { } while (0)
#define	STATS_ADD_REAPED(x,y)	do { } while (0)
#define	STATS_SET_HIGH(x)	do { } while (0)
#define	STATS_INC_ERR(x)	do { } while (0)
#define	STATS_INC_NODEALLOCS(x)	do { } while (0)
#define	STATS_INC_NODEFREES(x)	do { } while (0)
#define STATS_INC_ACOVERFLOW(x)	do { } while (0)
#define STATS_SET_FREEABLE(x, i) do { } while (0)
#define STATS_INC_ALLOCHIT(x)	do { } while (0)
#define STATS_INC_ALLOCMISS(x)	do { } while (0)
#define STATS_INC_FREEHIT(x)	do { } while (0)
#define STATS_INC_FREEMISS(x)	do { } while (0)
#endif
#if DEBUG

/*
 * memory layout of objects:
 * 0		: objp
 * 0 .. cachep->obj_offset - BYTES_PER_WORD - 1: padding. This ensures that
 *		the end of an object is aligned with the end of the real
 *		allocation. Catches writes behind the end of the allocation.
 * cachep->obj_offset - BYTES_PER_WORD .. cachep->obj_offset - 1:
 *		redzone word.
 * cachep->obj_offset: The real object.
 * cachep->buffer_size - 2 * BYTES_PER_WORD: redzone word [BYTES_PER_WORD long]
 * cachep->buffer_size - 1 * BYTES_PER_WORD: last caller address
 *					[BYTES_PER_WORD long]
 */
static int obj_offset(struct kmem_cache *cachep)
{
	return cachep->obj_offset;
}

static int obj_size(struct kmem_cache *cachep)
{
	return cachep->obj_size;
}

static unsigned long long *dbg_redzone1(struct kmem_cache *cachep, void *objp)
{
	BUG_ON(!(cachep->flags & SLAB_RED_ZONE));
	return (unsigned long long*) (objp + obj_offset(cachep) -
				      sizeof(unsigned long long));
}

static unsigned long long *dbg_redzone2(struct kmem_cache *cachep, void *objp)
{
	BUG_ON(!(cachep->flags & SLAB_RED_ZONE));
	if (cachep->flags & SLAB_STORE_USER)
		return (unsigned long long *)(objp + cachep->buffer_size -
					      sizeof(unsigned long long) -
					      REDZONE_ALIGN);
	return (unsigned long long *) (objp + cachep->buffer_size -
				       sizeof(unsigned long long));
}

static void **dbg_userword(struct kmem_cache *cachep, void *objp)
{
	BUG_ON(!(cachep->flags & SLAB_STORE_USER));
	return (void **)(objp + cachep->buffer_size - BYTES_PER_WORD);
}

#else

#define obj_offset(x)			0
#define obj_size(cachep)		(cachep->buffer_size)
#define dbg_redzone1(cachep, objp)	({BUG(); (unsigned long long *)NULL;})
#define dbg_redzone2(cachep, objp)	({BUG(); (unsigned long long *)NULL;})
#define dbg_userword(cachep, objp)	({BUG(); (void **)NULL;})

#endif

#ifdef CONFIG_TRACING
size_t slab_buffer_size(struct kmem_cache *cachep)
{
	return cachep->buffer_size;
}
EXPORT_SYMBOL(slab_buffer_size);
#endif

/*
 * Do not go above this order unless 0 objects fit into the slab.
 */
#define	BREAK_GFP_ORDER_HI	1
#define	BREAK_GFP_ORDER_LO	0
static int slab_break_gfp_order = BREAK_GFP_ORDER_LO;

/*
 * Functions for storing/retrieving the cachep and/or slab from the page
 * allocator.  These are used to find the slab an obj belongs to.  With kfree(),
 * these are used to find the cache which an obj belongs to.
 */
static inline void page_set_cache(struct page *page, struct kmem_cache *cache)
{
	page->lru.next = (struct list_head *)cache;
}

static inline struct kmem_cache *page_get_cache(struct page *page)
{
	page = compound_head(page);
	BUG_ON(!PageSlab(page));
	return (struct kmem_cache *)page->lru.next;
}

static inline void page_set_slab(struct page *page, struct slab *slab)
{
	page->lru.prev = (struct list_head *)slab;
}

static inline struct slab *page_get_slab(struct page *page)
{
	BUG_ON(!PageSlab(page));
	return (struct slab *)page->lru.prev;
}

static inline struct kmem_cache *virt_to_cache(const void *obj)
{
	struct page *page = virt_to_head_page(obj);
	return page_get_cache(page);
}

static inline struct slab *virt_to_slab(const void *obj)
{
	struct page *page = virt_to_head_page(obj);
	return page_get_slab(page);
}

static inline void *index_to_obj(struct kmem_cache *cache, struct slab *slab,
				 unsigned int idx)
{
	return slab->s_mem + cache->buffer_size * idx;
}

/*
 * We want to avoid an expensive divide : (offset / cache->buffer_size)
 *   Using the fact that buffer_size is a constant for a particular cache,
 *   we can replace (offset / cache->buffer_size) by
 *   reciprocal_divide(offset, cache->reciprocal_buffer_size)
 */
static inline unsigned int obj_to_index(const struct kmem_cache *cache,
					const struct slab *slab, void *obj)
{
	u32 offset = (obj - slab->s_mem);
	return reciprocal_divide(offset, cache->reciprocal_buffer_size);
}
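/*
 * Worked example (illustrative values, not part of the original file): the
 * reciprocal is computed once per cache (reciprocal_buffer_size =
 * reciprocal_value(buffer_size)), and the hot path above then needs only a
 * multiply and a shift instead of a divide.
 */
#if 0
static unsigned int reciprocal_divide_example(void)
{
	u32 recip = reciprocal_value(256);	/* done once, at cache setup */

	/* Same result as 1280 / 256, without a division instruction. */
	return reciprocal_divide(1280, recip);	/* == 5 */
}
#endif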
/*
 * These are the default caches for kmalloc. Custom caches can have other sizes.
 */
struct cache_sizes malloc_sizes[] = {
#define CACHE(x) { .cs_size = (x) },
#include <linux/kmalloc_sizes.h>
	CACHE(ULONG_MAX)
#undef CACHE
};
EXPORT_SYMBOL(malloc_sizes);

/* Must match cache_sizes above. Out of line to keep cache footprint low. */
struct cache_names {
	char *name;
	char *name_dma;
};

static struct cache_names __initdata cache_names[] = {
#define CACHE(x) { .name = "size-" #x, .name_dma = "size-" #x "(DMA)" },
#include <linux/kmalloc_sizes.h>
	{NULL,}
#undef CACHE
};

static struct arraycache_init initarray_cache __initdata =
    { {0, BOOT_CPUCACHE_ENTRIES, 1, 0} };
static struct arraycache_init initarray_generic =
    { {0, BOOT_CPUCACHE_ENTRIES, 1, 0} };

/* internal cache of cache description objs */
static struct kmem_cache cache_cache = {
	.batchcount = 1,
	.limit = BOOT_CPUCACHE_ENTRIES,
	.shared = 1,
	.buffer_size = sizeof(struct kmem_cache),
	.name = "kmem_cache",
};

#define BAD_ALIEN_MAGIC 0x01020304ul

/*
 * chicken and egg problem: delay the per-cpu array allocation
 * until the general caches are up.
 */
static enum {
	NONE,
	PARTIAL_AC,
	PARTIAL_L3,
	EARLY,
	FULL
} g_cpucache_up;

/*
 * used by boot code to determine if it can use slab based allocator
 */
int slab_is_available(void)
{
	return g_cpucache_up >= EARLY;
}

#ifdef CONFIG_LOCKDEP

/*
 * Slab sometimes uses the kmalloc slabs to store the slab headers
 * for other slabs "off slab".
632 * The locking for this is tricky in that it nests within the locks 633 * of all other slabs in a few places; to deal with this special 634 * locking we put on-slab caches into a separate lock-class. 635 * 636 * We set lock class for alien array caches which are up during init. 637 * The lock annotation will be lost if all cpus of a node goes down and 638 * then comes back up during hotplug 639 */ 640 static struct lock_class_key on_slab_l3_key; 641 static struct lock_class_key on_slab_alc_key; 642 643 static void init_node_lock_keys(int q) 644 { 645 struct cache_sizes *s = malloc_sizes; 646 647 if (g_cpucache_up != FULL) 648 return; 649 650 for (s = malloc_sizes; s->cs_size != ULONG_MAX; s++) { 651 struct array_cache **alc; 652 struct kmem_list3 *l3; 653 int r; 654 655 l3 = s->cs_cachep->nodelists[q]; 656 if (!l3 || OFF_SLAB(s->cs_cachep)) 657 continue; 658 lockdep_set_class(&l3->list_lock, &on_slab_l3_key); 659 alc = l3->alien; 660 /* 661 * FIXME: This check for BAD_ALIEN_MAGIC 662 * should go away when common slab code is taught to 663 * work even without alien caches. 664 * Currently, non NUMA code returns BAD_ALIEN_MAGIC 665 * for alloc_alien_cache, 666 */ 667 if (!alc || (unsigned long)alc == BAD_ALIEN_MAGIC) 668 continue; 669 for_each_node(r) { 670 if (alc[r]) 671 lockdep_set_class(&alc[r]->lock, 672 &on_slab_alc_key); 673 } 674 } 675 } 676 677 static inline void init_lock_keys(void) 678 { 679 int node; 680 681 for_each_node(node) 682 init_node_lock_keys(node); 683 } 684 #else 685 static void init_node_lock_keys(int q) 686 { 687 } 688 689 static inline void init_lock_keys(void) 690 { 691 } 692 #endif 693 694 /* 695 * Guard access to the cache-chain. 696 */ 697 static DEFINE_MUTEX(cache_chain_mutex); 698 static struct list_head cache_chain; 699 700 static DEFINE_PER_CPU(struct delayed_work, slab_reap_work); 701 702 static inline struct array_cache *cpu_cache_get(struct kmem_cache *cachep) 703 { 704 return cachep->array[smp_processor_id()]; 705 } 706 707 static inline struct kmem_cache *__find_general_cachep(size_t size, 708 gfp_t gfpflags) 709 { 710 struct cache_sizes *csizep = malloc_sizes; 711 712 #if DEBUG 713 /* This happens if someone tries to call 714 * kmem_cache_create(), or __kmalloc(), before 715 * the generic caches are initialized. 716 */ 717 BUG_ON(malloc_sizes[INDEX_AC].cs_cachep == NULL); 718 #endif 719 if (!size) 720 return ZERO_SIZE_PTR; 721 722 while (size > csizep->cs_size) 723 csizep++; 724 725 /* 726 * Really subtle: The last entry with cs->cs_size==ULONG_MAX 727 * has cs_{dma,}cachep==NULL. Thus no special case 728 * for large kmalloc calls required. 729 */ 730 #ifdef CONFIG_ZONE_DMA 731 if (unlikely(gfpflags & GFP_DMA)) 732 return csizep->cs_dmacachep; 733 #endif 734 return csizep->cs_cachep; 735 } 736 737 static struct kmem_cache *kmem_find_general_cachep(size_t size, gfp_t gfpflags) 738 { 739 return __find_general_cachep(size, gfpflags); 740 } 741 742 static size_t slab_mgmt_size(size_t nr_objs, size_t align) 743 { 744 return ALIGN(sizeof(struct slab)+nr_objs*sizeof(kmem_bufctl_t), align); 745 } 746 747 /* 748 * Calculate the number of objects and left-over bytes for a given buffer size. 749 */ 750 static void cache_estimate(unsigned long gfporder, size_t buffer_size, 751 size_t align, int flags, size_t *left_over, 752 unsigned int *num) 753 { 754 int nr_objs; 755 size_t mgmt_size; 756 size_t slab_size = PAGE_SIZE << gfporder; 757 758 /* 759 * The slab management structure can be either off the slab or 760 * on it. 
For the latter case, the memory allocated for a 761 * slab is used for: 762 * 763 * - The struct slab 764 * - One kmem_bufctl_t for each object 765 * - Padding to respect alignment of @align 766 * - @buffer_size bytes for each object 767 * 768 * If the slab management structure is off the slab, then the 769 * alignment will already be calculated into the size. Because 770 * the slabs are all pages aligned, the objects will be at the 771 * correct alignment when allocated. 772 */ 773 if (flags & CFLGS_OFF_SLAB) { 774 mgmt_size = 0; 775 nr_objs = slab_size / buffer_size; 776 777 if (nr_objs > SLAB_LIMIT) 778 nr_objs = SLAB_LIMIT; 779 } else { 780 /* 781 * Ignore padding for the initial guess. The padding 782 * is at most @align-1 bytes, and @buffer_size is at 783 * least @align. In the worst case, this result will 784 * be one greater than the number of objects that fit 785 * into the memory allocation when taking the padding 786 * into account. 787 */ 788 nr_objs = (slab_size - sizeof(struct slab)) / 789 (buffer_size + sizeof(kmem_bufctl_t)); 790 791 /* 792 * This calculated number will be either the right 793 * amount, or one greater than what we want. 794 */ 795 if (slab_mgmt_size(nr_objs, align) + nr_objs*buffer_size 796 > slab_size) 797 nr_objs--; 798 799 if (nr_objs > SLAB_LIMIT) 800 nr_objs = SLAB_LIMIT; 801 802 mgmt_size = slab_mgmt_size(nr_objs, align); 803 } 804 *num = nr_objs; 805 *left_over = slab_size - nr_objs*buffer_size - mgmt_size; 806 } 807 808 #define slab_error(cachep, msg) __slab_error(__func__, cachep, msg) 809 810 static void __slab_error(const char *function, struct kmem_cache *cachep, 811 char *msg) 812 { 813 printk(KERN_ERR "slab error in %s(): cache `%s': %s\n", 814 function, cachep->name, msg); 815 dump_stack(); 816 } 817 818 /* 819 * By default on NUMA we use alien caches to stage the freeing of 820 * objects allocated from other nodes. This causes massive memory 821 * inefficiencies when using fake NUMA setup to split memory into a 822 * large number of small nodes, so it can be disabled on the command 823 * line 824 */ 825 826 static int use_alien_caches __read_mostly = 1; 827 static int __init noaliencache_setup(char *s) 828 { 829 use_alien_caches = 0; 830 return 1; 831 } 832 __setup("noaliencache", noaliencache_setup); 833 834 #ifdef CONFIG_NUMA 835 /* 836 * Special reaping functions for NUMA systems called from cache_reap(). 837 * These take care of doing round robin flushing of alien caches (containing 838 * objects freed on different nodes from which they were allocated) and the 839 * flushing of remote pcps by calling drain_node_pages. 840 */ 841 static DEFINE_PER_CPU(unsigned long, slab_reap_node); 842 843 static void init_reap_node(int cpu) 844 { 845 int node; 846 847 node = next_node(cpu_to_node(cpu), node_online_map); 848 if (node == MAX_NUMNODES) 849 node = first_node(node_online_map); 850 851 per_cpu(slab_reap_node, cpu) = node; 852 } 853 854 static void next_reap_node(void) 855 { 856 int node = __get_cpu_var(slab_reap_node); 857 858 node = next_node(node, node_online_map); 859 if (unlikely(node >= MAX_NUMNODES)) 860 node = first_node(node_online_map); 861 __get_cpu_var(slab_reap_node) = node; 862 } 863 864 #else 865 #define init_reap_node(cpu) do { } while (0) 866 #define next_reap_node(void) do { } while (0) 867 #endif 868 869 /* 870 * Initiate the reap timer running on the target CPU. We run at around 1 to 2Hz 871 * via the workqueue/eventd. 
872 * Add the CPU number into the expiration time to minimize the possibility of 873 * the CPUs getting into lockstep and contending for the global cache chain 874 * lock. 875 */ 876 static void __cpuinit start_cpu_timer(int cpu) 877 { 878 struct delayed_work *reap_work = &per_cpu(slab_reap_work, cpu); 879 880 /* 881 * When this gets called from do_initcalls via cpucache_init(), 882 * init_workqueues() has already run, so keventd will be setup 883 * at that time. 884 */ 885 if (keventd_up() && reap_work->work.func == NULL) { 886 init_reap_node(cpu); 887 INIT_DELAYED_WORK(reap_work, cache_reap); 888 schedule_delayed_work_on(cpu, reap_work, 889 __round_jiffies_relative(HZ, cpu)); 890 } 891 } 892 893 static struct array_cache *alloc_arraycache(int node, int entries, 894 int batchcount, gfp_t gfp) 895 { 896 int memsize = sizeof(void *) * entries + sizeof(struct array_cache); 897 struct array_cache *nc = NULL; 898 899 nc = kmalloc_node(memsize, gfp, node); 900 /* 901 * The array_cache structures contain pointers to free object. 902 * However, when such objects are allocated or transfered to another 903 * cache the pointers are not cleared and they could be counted as 904 * valid references during a kmemleak scan. Therefore, kmemleak must 905 * not scan such objects. 906 */ 907 kmemleak_no_scan(nc); 908 if (nc) { 909 nc->avail = 0; 910 nc->limit = entries; 911 nc->batchcount = batchcount; 912 nc->touched = 0; 913 spin_lock_init(&nc->lock); 914 } 915 return nc; 916 } 917 918 /* 919 * Transfer objects in one arraycache to another. 920 * Locking must be handled by the caller. 921 * 922 * Return the number of entries transferred. 923 */ 924 static int transfer_objects(struct array_cache *to, 925 struct array_cache *from, unsigned int max) 926 { 927 /* Figure out how many entries to transfer */ 928 int nr = min(min(from->avail, max), to->limit - to->avail); 929 930 if (!nr) 931 return 0; 932 933 memcpy(to->entry + to->avail, from->entry + from->avail -nr, 934 sizeof(void *) *nr); 935 936 from->avail -= nr; 937 to->avail += nr; 938 return nr; 939 } 940 941 #ifndef CONFIG_NUMA 942 943 #define drain_alien_cache(cachep, alien) do { } while (0) 944 #define reap_alien(cachep, l3) do { } while (0) 945 946 static inline struct array_cache **alloc_alien_cache(int node, int limit, gfp_t gfp) 947 { 948 return (struct array_cache **)BAD_ALIEN_MAGIC; 949 } 950 951 static inline void free_alien_cache(struct array_cache **ac_ptr) 952 { 953 } 954 955 static inline int cache_free_alien(struct kmem_cache *cachep, void *objp) 956 { 957 return 0; 958 } 959 960 static inline void *alternate_node_alloc(struct kmem_cache *cachep, 961 gfp_t flags) 962 { 963 return NULL; 964 } 965 966 static inline void *____cache_alloc_node(struct kmem_cache *cachep, 967 gfp_t flags, int nodeid) 968 { 969 return NULL; 970 } 971 972 #else /* CONFIG_NUMA */ 973 974 static void *____cache_alloc_node(struct kmem_cache *, gfp_t, int); 975 static void *alternate_node_alloc(struct kmem_cache *, gfp_t); 976 977 static struct array_cache **alloc_alien_cache(int node, int limit, gfp_t gfp) 978 { 979 struct array_cache **ac_ptr; 980 int memsize = sizeof(void *) * nr_node_ids; 981 int i; 982 983 if (limit > 1) 984 limit = 12; 985 ac_ptr = kzalloc_node(memsize, gfp, node); 986 if (ac_ptr) { 987 for_each_node(i) { 988 if (i == node || !node_online(i)) 989 continue; 990 ac_ptr[i] = alloc_arraycache(node, limit, 0xbaadf00d, gfp); 991 if (!ac_ptr[i]) { 992 for (i--; i >= 0; i--) 993 kfree(ac_ptr[i]); 994 kfree(ac_ptr); 995 return NULL; 996 } 997 } 998 } 999 
return ac_ptr; 1000 } 1001 1002 static void free_alien_cache(struct array_cache **ac_ptr) 1003 { 1004 int i; 1005 1006 if (!ac_ptr) 1007 return; 1008 for_each_node(i) 1009 kfree(ac_ptr[i]); 1010 kfree(ac_ptr); 1011 } 1012 1013 static void __drain_alien_cache(struct kmem_cache *cachep, 1014 struct array_cache *ac, int node) 1015 { 1016 struct kmem_list3 *rl3 = cachep->nodelists[node]; 1017 1018 if (ac->avail) { 1019 spin_lock(&rl3->list_lock); 1020 /* 1021 * Stuff objects into the remote nodes shared array first. 1022 * That way we could avoid the overhead of putting the objects 1023 * into the free lists and getting them back later. 1024 */ 1025 if (rl3->shared) 1026 transfer_objects(rl3->shared, ac, ac->limit); 1027 1028 free_block(cachep, ac->entry, ac->avail, node); 1029 ac->avail = 0; 1030 spin_unlock(&rl3->list_lock); 1031 } 1032 } 1033 1034 /* 1035 * Called from cache_reap() to regularly drain alien caches round robin. 1036 */ 1037 static void reap_alien(struct kmem_cache *cachep, struct kmem_list3 *l3) 1038 { 1039 int node = __get_cpu_var(slab_reap_node); 1040 1041 if (l3->alien) { 1042 struct array_cache *ac = l3->alien[node]; 1043 1044 if (ac && ac->avail && spin_trylock_irq(&ac->lock)) { 1045 __drain_alien_cache(cachep, ac, node); 1046 spin_unlock_irq(&ac->lock); 1047 } 1048 } 1049 } 1050 1051 static void drain_alien_cache(struct kmem_cache *cachep, 1052 struct array_cache **alien) 1053 { 1054 int i = 0; 1055 struct array_cache *ac; 1056 unsigned long flags; 1057 1058 for_each_online_node(i) { 1059 ac = alien[i]; 1060 if (ac) { 1061 spin_lock_irqsave(&ac->lock, flags); 1062 __drain_alien_cache(cachep, ac, i); 1063 spin_unlock_irqrestore(&ac->lock, flags); 1064 } 1065 } 1066 } 1067 1068 static inline int cache_free_alien(struct kmem_cache *cachep, void *objp) 1069 { 1070 struct slab *slabp = virt_to_slab(objp); 1071 int nodeid = slabp->nodeid; 1072 struct kmem_list3 *l3; 1073 struct array_cache *alien = NULL; 1074 int node; 1075 1076 node = numa_node_id(); 1077 1078 /* 1079 * Make sure we are not freeing a object from another node to the array 1080 * cache on this cpu. 1081 */ 1082 if (likely(slabp->nodeid == node)) 1083 return 0; 1084 1085 l3 = cachep->nodelists[node]; 1086 STATS_INC_NODEFREES(cachep); 1087 if (l3->alien && l3->alien[nodeid]) { 1088 alien = l3->alien[nodeid]; 1089 spin_lock(&alien->lock); 1090 if (unlikely(alien->avail == alien->limit)) { 1091 STATS_INC_ACOVERFLOW(cachep); 1092 __drain_alien_cache(cachep, alien, nodeid); 1093 } 1094 alien->entry[alien->avail++] = objp; 1095 spin_unlock(&alien->lock); 1096 } else { 1097 spin_lock(&(cachep->nodelists[nodeid])->list_lock); 1098 free_block(cachep, &objp, 1, nodeid); 1099 spin_unlock(&(cachep->nodelists[nodeid])->list_lock); 1100 } 1101 return 1; 1102 } 1103 #endif 1104 1105 static void __cpuinit cpuup_canceled(long cpu) 1106 { 1107 struct kmem_cache *cachep; 1108 struct kmem_list3 *l3 = NULL; 1109 int node = cpu_to_node(cpu); 1110 const struct cpumask *mask = cpumask_of_node(node); 1111 1112 list_for_each_entry(cachep, &cache_chain, next) { 1113 struct array_cache *nc; 1114 struct array_cache *shared; 1115 struct array_cache **alien; 1116 1117 /* cpu is dead; no one can alloc from it. 
*/ 1118 nc = cachep->array[cpu]; 1119 cachep->array[cpu] = NULL; 1120 l3 = cachep->nodelists[node]; 1121 1122 if (!l3) 1123 goto free_array_cache; 1124 1125 spin_lock_irq(&l3->list_lock); 1126 1127 /* Free limit for this kmem_list3 */ 1128 l3->free_limit -= cachep->batchcount; 1129 if (nc) 1130 free_block(cachep, nc->entry, nc->avail, node); 1131 1132 if (!cpumask_empty(mask)) { 1133 spin_unlock_irq(&l3->list_lock); 1134 goto free_array_cache; 1135 } 1136 1137 shared = l3->shared; 1138 if (shared) { 1139 free_block(cachep, shared->entry, 1140 shared->avail, node); 1141 l3->shared = NULL; 1142 } 1143 1144 alien = l3->alien; 1145 l3->alien = NULL; 1146 1147 spin_unlock_irq(&l3->list_lock); 1148 1149 kfree(shared); 1150 if (alien) { 1151 drain_alien_cache(cachep, alien); 1152 free_alien_cache(alien); 1153 } 1154 free_array_cache: 1155 kfree(nc); 1156 } 1157 /* 1158 * In the previous loop, all the objects were freed to 1159 * the respective cache's slabs, now we can go ahead and 1160 * shrink each nodelist to its limit. 1161 */ 1162 list_for_each_entry(cachep, &cache_chain, next) { 1163 l3 = cachep->nodelists[node]; 1164 if (!l3) 1165 continue; 1166 drain_freelist(cachep, l3, l3->free_objects); 1167 } 1168 } 1169 1170 static int __cpuinit cpuup_prepare(long cpu) 1171 { 1172 struct kmem_cache *cachep; 1173 struct kmem_list3 *l3 = NULL; 1174 int node = cpu_to_node(cpu); 1175 const int memsize = sizeof(struct kmem_list3); 1176 1177 /* 1178 * We need to do this right in the beginning since 1179 * alloc_arraycache's are going to use this list. 1180 * kmalloc_node allows us to add the slab to the right 1181 * kmem_list3 and not this cpu's kmem_list3 1182 */ 1183 1184 list_for_each_entry(cachep, &cache_chain, next) { 1185 /* 1186 * Set up the size64 kmemlist for cpu before we can 1187 * begin anything. Make sure some other cpu on this 1188 * node has not already allocated this 1189 */ 1190 if (!cachep->nodelists[node]) { 1191 l3 = kmalloc_node(memsize, GFP_KERNEL, node); 1192 if (!l3) 1193 goto bad; 1194 kmem_list3_init(l3); 1195 l3->next_reap = jiffies + REAPTIMEOUT_LIST3 + 1196 ((unsigned long)cachep) % REAPTIMEOUT_LIST3; 1197 1198 /* 1199 * The l3s don't come and go as CPUs come and 1200 * go. cache_chain_mutex is sufficient 1201 * protection here. 
1202 */ 1203 cachep->nodelists[node] = l3; 1204 } 1205 1206 spin_lock_irq(&cachep->nodelists[node]->list_lock); 1207 cachep->nodelists[node]->free_limit = 1208 (1 + nr_cpus_node(node)) * 1209 cachep->batchcount + cachep->num; 1210 spin_unlock_irq(&cachep->nodelists[node]->list_lock); 1211 } 1212 1213 /* 1214 * Now we can go ahead with allocating the shared arrays and 1215 * array caches 1216 */ 1217 list_for_each_entry(cachep, &cache_chain, next) { 1218 struct array_cache *nc; 1219 struct array_cache *shared = NULL; 1220 struct array_cache **alien = NULL; 1221 1222 nc = alloc_arraycache(node, cachep->limit, 1223 cachep->batchcount, GFP_KERNEL); 1224 if (!nc) 1225 goto bad; 1226 if (cachep->shared) { 1227 shared = alloc_arraycache(node, 1228 cachep->shared * cachep->batchcount, 1229 0xbaadf00d, GFP_KERNEL); 1230 if (!shared) { 1231 kfree(nc); 1232 goto bad; 1233 } 1234 } 1235 if (use_alien_caches) { 1236 alien = alloc_alien_cache(node, cachep->limit, GFP_KERNEL); 1237 if (!alien) { 1238 kfree(shared); 1239 kfree(nc); 1240 goto bad; 1241 } 1242 } 1243 cachep->array[cpu] = nc; 1244 l3 = cachep->nodelists[node]; 1245 BUG_ON(!l3); 1246 1247 spin_lock_irq(&l3->list_lock); 1248 if (!l3->shared) { 1249 /* 1250 * We are serialised from CPU_DEAD or 1251 * CPU_UP_CANCELLED by the cpucontrol lock 1252 */ 1253 l3->shared = shared; 1254 shared = NULL; 1255 } 1256 #ifdef CONFIG_NUMA 1257 if (!l3->alien) { 1258 l3->alien = alien; 1259 alien = NULL; 1260 } 1261 #endif 1262 spin_unlock_irq(&l3->list_lock); 1263 kfree(shared); 1264 free_alien_cache(alien); 1265 } 1266 init_node_lock_keys(node); 1267 1268 return 0; 1269 bad: 1270 cpuup_canceled(cpu); 1271 return -ENOMEM; 1272 } 1273 1274 static int __cpuinit cpuup_callback(struct notifier_block *nfb, 1275 unsigned long action, void *hcpu) 1276 { 1277 long cpu = (long)hcpu; 1278 int err = 0; 1279 1280 switch (action) { 1281 case CPU_UP_PREPARE: 1282 case CPU_UP_PREPARE_FROZEN: 1283 mutex_lock(&cache_chain_mutex); 1284 err = cpuup_prepare(cpu); 1285 mutex_unlock(&cache_chain_mutex); 1286 break; 1287 case CPU_ONLINE: 1288 case CPU_ONLINE_FROZEN: 1289 start_cpu_timer(cpu); 1290 break; 1291 #ifdef CONFIG_HOTPLUG_CPU 1292 case CPU_DOWN_PREPARE: 1293 case CPU_DOWN_PREPARE_FROZEN: 1294 /* 1295 * Shutdown cache reaper. Note that the cache_chain_mutex is 1296 * held so that if cache_reap() is invoked it cannot do 1297 * anything expensive but will only modify reap_work 1298 * and reschedule the timer. 1299 */ 1300 cancel_rearming_delayed_work(&per_cpu(slab_reap_work, cpu)); 1301 /* Now the cache_reaper is guaranteed to be not running. */ 1302 per_cpu(slab_reap_work, cpu).work.func = NULL; 1303 break; 1304 case CPU_DOWN_FAILED: 1305 case CPU_DOWN_FAILED_FROZEN: 1306 start_cpu_timer(cpu); 1307 break; 1308 case CPU_DEAD: 1309 case CPU_DEAD_FROZEN: 1310 /* 1311 * Even if all the cpus of a node are down, we don't free the 1312 * kmem_list3 of any cache. This to avoid a race between 1313 * cpu_down, and a kmalloc allocation from another cpu for 1314 * memory from the node of the cpu going down. The list3 1315 * structure is usually allocated from kmem_cache_create() and 1316 * gets destroyed at kmem_cache_destroy(). 1317 */ 1318 /* fall through */ 1319 #endif 1320 case CPU_UP_CANCELED: 1321 case CPU_UP_CANCELED_FROZEN: 1322 mutex_lock(&cache_chain_mutex); 1323 cpuup_canceled(cpu); 1324 mutex_unlock(&cache_chain_mutex); 1325 break; 1326 } 1327 return err ? 
NOTIFY_BAD : NOTIFY_OK; 1328 } 1329 1330 static struct notifier_block __cpuinitdata cpucache_notifier = { 1331 &cpuup_callback, NULL, 0 1332 }; 1333 1334 /* 1335 * swap the static kmem_list3 with kmalloced memory 1336 */ 1337 static void init_list(struct kmem_cache *cachep, struct kmem_list3 *list, 1338 int nodeid) 1339 { 1340 struct kmem_list3 *ptr; 1341 1342 ptr = kmalloc_node(sizeof(struct kmem_list3), GFP_NOWAIT, nodeid); 1343 BUG_ON(!ptr); 1344 1345 memcpy(ptr, list, sizeof(struct kmem_list3)); 1346 /* 1347 * Do not assume that spinlocks can be initialized via memcpy: 1348 */ 1349 spin_lock_init(&ptr->list_lock); 1350 1351 MAKE_ALL_LISTS(cachep, ptr, nodeid); 1352 cachep->nodelists[nodeid] = ptr; 1353 } 1354 1355 /* 1356 * For setting up all the kmem_list3s for cache whose buffer_size is same as 1357 * size of kmem_list3. 1358 */ 1359 static void __init set_up_list3s(struct kmem_cache *cachep, int index) 1360 { 1361 int node; 1362 1363 for_each_online_node(node) { 1364 cachep->nodelists[node] = &initkmem_list3[index + node]; 1365 cachep->nodelists[node]->next_reap = jiffies + 1366 REAPTIMEOUT_LIST3 + 1367 ((unsigned long)cachep) % REAPTIMEOUT_LIST3; 1368 } 1369 } 1370 1371 /* 1372 * Initialisation. Called after the page allocator have been initialised and 1373 * before smp_init(). 1374 */ 1375 void __init kmem_cache_init(void) 1376 { 1377 size_t left_over; 1378 struct cache_sizes *sizes; 1379 struct cache_names *names; 1380 int i; 1381 int order; 1382 int node; 1383 1384 if (num_possible_nodes() == 1) 1385 use_alien_caches = 0; 1386 1387 for (i = 0; i < NUM_INIT_LISTS; i++) { 1388 kmem_list3_init(&initkmem_list3[i]); 1389 if (i < MAX_NUMNODES) 1390 cache_cache.nodelists[i] = NULL; 1391 } 1392 set_up_list3s(&cache_cache, CACHE_CACHE); 1393 1394 /* 1395 * Fragmentation resistance on low memory - only use bigger 1396 * page orders on machines with more than 32MB of memory. 1397 */ 1398 if (totalram_pages > (32 << 20) >> PAGE_SHIFT) 1399 slab_break_gfp_order = BREAK_GFP_ORDER_HI; 1400 1401 /* Bootstrap is tricky, because several objects are allocated 1402 * from caches that do not exist yet: 1403 * 1) initialize the cache_cache cache: it contains the struct 1404 * kmem_cache structures of all caches, except cache_cache itself: 1405 * cache_cache is statically allocated. 1406 * Initially an __init data area is used for the head array and the 1407 * kmem_list3 structures, it's replaced with a kmalloc allocated 1408 * array at the end of the bootstrap. 1409 * 2) Create the first kmalloc cache. 1410 * The struct kmem_cache for the new cache is allocated normally. 1411 * An __init data area is used for the head array. 1412 * 3) Create the remaining kmalloc caches, with minimally sized 1413 * head arrays. 1414 * 4) Replace the __init data head arrays for cache_cache and the first 1415 * kmalloc cache with kmalloc allocated arrays. 1416 * 5) Replace the __init data for kmem_list3 for cache_cache and 1417 * the other cache's with kmalloc allocated memory. 1418 * 6) Resize the head arrays of the kmalloc caches to their final sizes. 1419 */ 1420 1421 node = numa_node_id(); 1422 1423 /* 1) create the cache_cache */ 1424 INIT_LIST_HEAD(&cache_chain); 1425 list_add(&cache_cache.next, &cache_chain); 1426 cache_cache.colour_off = cache_line_size(); 1427 cache_cache.array[smp_processor_id()] = &initarray_cache.cache; 1428 cache_cache.nodelists[node] = &initkmem_list3[CACHE_CACHE + node]; 1429 1430 /* 1431 * struct kmem_cache size depends on nr_node_ids, which 1432 * can be less than MAX_NUMNODES. 
1433 */ 1434 cache_cache.buffer_size = offsetof(struct kmem_cache, nodelists) + 1435 nr_node_ids * sizeof(struct kmem_list3 *); 1436 #if DEBUG 1437 cache_cache.obj_size = cache_cache.buffer_size; 1438 #endif 1439 cache_cache.buffer_size = ALIGN(cache_cache.buffer_size, 1440 cache_line_size()); 1441 cache_cache.reciprocal_buffer_size = 1442 reciprocal_value(cache_cache.buffer_size); 1443 1444 for (order = 0; order < MAX_ORDER; order++) { 1445 cache_estimate(order, cache_cache.buffer_size, 1446 cache_line_size(), 0, &left_over, &cache_cache.num); 1447 if (cache_cache.num) 1448 break; 1449 } 1450 BUG_ON(!cache_cache.num); 1451 cache_cache.gfporder = order; 1452 cache_cache.colour = left_over / cache_cache.colour_off; 1453 cache_cache.slab_size = ALIGN(cache_cache.num * sizeof(kmem_bufctl_t) + 1454 sizeof(struct slab), cache_line_size()); 1455 1456 /* 2+3) create the kmalloc caches */ 1457 sizes = malloc_sizes; 1458 names = cache_names; 1459 1460 /* 1461 * Initialize the caches that provide memory for the array cache and the 1462 * kmem_list3 structures first. Without this, further allocations will 1463 * bug. 1464 */ 1465 1466 sizes[INDEX_AC].cs_cachep = kmem_cache_create(names[INDEX_AC].name, 1467 sizes[INDEX_AC].cs_size, 1468 ARCH_KMALLOC_MINALIGN, 1469 ARCH_KMALLOC_FLAGS|SLAB_PANIC, 1470 NULL); 1471 1472 if (INDEX_AC != INDEX_L3) { 1473 sizes[INDEX_L3].cs_cachep = 1474 kmem_cache_create(names[INDEX_L3].name, 1475 sizes[INDEX_L3].cs_size, 1476 ARCH_KMALLOC_MINALIGN, 1477 ARCH_KMALLOC_FLAGS|SLAB_PANIC, 1478 NULL); 1479 } 1480 1481 slab_early_init = 0; 1482 1483 while (sizes->cs_size != ULONG_MAX) { 1484 /* 1485 * For performance, all the general caches are L1 aligned. 1486 * This should be particularly beneficial on SMP boxes, as it 1487 * eliminates "false sharing". 1488 * Note for systems short on memory removing the alignment will 1489 * allow tighter packing of the smaller caches. 
1490 */ 1491 if (!sizes->cs_cachep) { 1492 sizes->cs_cachep = kmem_cache_create(names->name, 1493 sizes->cs_size, 1494 ARCH_KMALLOC_MINALIGN, 1495 ARCH_KMALLOC_FLAGS|SLAB_PANIC, 1496 NULL); 1497 } 1498 #ifdef CONFIG_ZONE_DMA 1499 sizes->cs_dmacachep = kmem_cache_create( 1500 names->name_dma, 1501 sizes->cs_size, 1502 ARCH_KMALLOC_MINALIGN, 1503 ARCH_KMALLOC_FLAGS|SLAB_CACHE_DMA| 1504 SLAB_PANIC, 1505 NULL); 1506 #endif 1507 sizes++; 1508 names++; 1509 } 1510 /* 4) Replace the bootstrap head arrays */ 1511 { 1512 struct array_cache *ptr; 1513 1514 ptr = kmalloc(sizeof(struct arraycache_init), GFP_NOWAIT); 1515 1516 BUG_ON(cpu_cache_get(&cache_cache) != &initarray_cache.cache); 1517 memcpy(ptr, cpu_cache_get(&cache_cache), 1518 sizeof(struct arraycache_init)); 1519 /* 1520 * Do not assume that spinlocks can be initialized via memcpy: 1521 */ 1522 spin_lock_init(&ptr->lock); 1523 1524 cache_cache.array[smp_processor_id()] = ptr; 1525 1526 ptr = kmalloc(sizeof(struct arraycache_init), GFP_NOWAIT); 1527 1528 BUG_ON(cpu_cache_get(malloc_sizes[INDEX_AC].cs_cachep) 1529 != &initarray_generic.cache); 1530 memcpy(ptr, cpu_cache_get(malloc_sizes[INDEX_AC].cs_cachep), 1531 sizeof(struct arraycache_init)); 1532 /* 1533 * Do not assume that spinlocks can be initialized via memcpy: 1534 */ 1535 spin_lock_init(&ptr->lock); 1536 1537 malloc_sizes[INDEX_AC].cs_cachep->array[smp_processor_id()] = 1538 ptr; 1539 } 1540 /* 5) Replace the bootstrap kmem_list3's */ 1541 { 1542 int nid; 1543 1544 for_each_online_node(nid) { 1545 init_list(&cache_cache, &initkmem_list3[CACHE_CACHE + nid], nid); 1546 1547 init_list(malloc_sizes[INDEX_AC].cs_cachep, 1548 &initkmem_list3[SIZE_AC + nid], nid); 1549 1550 if (INDEX_AC != INDEX_L3) { 1551 init_list(malloc_sizes[INDEX_L3].cs_cachep, 1552 &initkmem_list3[SIZE_L3 + nid], nid); 1553 } 1554 } 1555 } 1556 1557 g_cpucache_up = EARLY; 1558 } 1559 1560 void __init kmem_cache_init_late(void) 1561 { 1562 struct kmem_cache *cachep; 1563 1564 /* 6) resize the head arrays to their final sizes */ 1565 mutex_lock(&cache_chain_mutex); 1566 list_for_each_entry(cachep, &cache_chain, next) 1567 if (enable_cpucache(cachep, GFP_NOWAIT)) 1568 BUG(); 1569 mutex_unlock(&cache_chain_mutex); 1570 1571 /* Done! */ 1572 g_cpucache_up = FULL; 1573 1574 /* Annotate slab for lockdep -- annotate the malloc caches */ 1575 init_lock_keys(); 1576 1577 /* 1578 * Register a cpu startup notifier callback that initializes 1579 * cpu_cache_get for all new cpus 1580 */ 1581 register_cpu_notifier(&cpucache_notifier); 1582 1583 /* 1584 * The reap timers are started later, with a module init call: That part 1585 * of the kernel is not yet operational. 1586 */ 1587 } 1588 1589 static int __init cpucache_init(void) 1590 { 1591 int cpu; 1592 1593 /* 1594 * Register the timers that return unneeded pages to the page allocator 1595 */ 1596 for_each_online_cpu(cpu) 1597 start_cpu_timer(cpu); 1598 return 0; 1599 } 1600 __initcall(cpucache_init); 1601 1602 /* 1603 * Interface to system's page allocator. No need to hold the cache-lock. 1604 * 1605 * If we requested dmaable memory, we will get it. Even if we 1606 * did not request dmaable memory, we might get it, but that 1607 * would be relatively rare and ignorable. 
1608 */ 1609 static void *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, int nodeid) 1610 { 1611 struct page *page; 1612 int nr_pages; 1613 int i; 1614 1615 #ifndef CONFIG_MMU 1616 /* 1617 * Nommu uses slab's for process anonymous memory allocations, and thus 1618 * requires __GFP_COMP to properly refcount higher order allocations 1619 */ 1620 flags |= __GFP_COMP; 1621 #endif 1622 1623 flags |= cachep->gfpflags; 1624 if (cachep->flags & SLAB_RECLAIM_ACCOUNT) 1625 flags |= __GFP_RECLAIMABLE; 1626 1627 page = alloc_pages_exact_node(nodeid, flags | __GFP_NOTRACK, cachep->gfporder); 1628 if (!page) 1629 return NULL; 1630 1631 nr_pages = (1 << cachep->gfporder); 1632 if (cachep->flags & SLAB_RECLAIM_ACCOUNT) 1633 add_zone_page_state(page_zone(page), 1634 NR_SLAB_RECLAIMABLE, nr_pages); 1635 else 1636 add_zone_page_state(page_zone(page), 1637 NR_SLAB_UNRECLAIMABLE, nr_pages); 1638 for (i = 0; i < nr_pages; i++) 1639 __SetPageSlab(page + i); 1640 1641 if (kmemcheck_enabled && !(cachep->flags & SLAB_NOTRACK)) { 1642 kmemcheck_alloc_shadow(page, cachep->gfporder, flags, nodeid); 1643 1644 if (cachep->ctor) 1645 kmemcheck_mark_uninitialized_pages(page, nr_pages); 1646 else 1647 kmemcheck_mark_unallocated_pages(page, nr_pages); 1648 } 1649 1650 return page_address(page); 1651 } 1652 1653 /* 1654 * Interface to system's page release. 1655 */ 1656 static void kmem_freepages(struct kmem_cache *cachep, void *addr) 1657 { 1658 unsigned long i = (1 << cachep->gfporder); 1659 struct page *page = virt_to_page(addr); 1660 const unsigned long nr_freed = i; 1661 1662 kmemcheck_free_shadow(page, cachep->gfporder); 1663 1664 if (cachep->flags & SLAB_RECLAIM_ACCOUNT) 1665 sub_zone_page_state(page_zone(page), 1666 NR_SLAB_RECLAIMABLE, nr_freed); 1667 else 1668 sub_zone_page_state(page_zone(page), 1669 NR_SLAB_UNRECLAIMABLE, nr_freed); 1670 while (i--) { 1671 BUG_ON(!PageSlab(page)); 1672 __ClearPageSlab(page); 1673 page++; 1674 } 1675 if (current->reclaim_state) 1676 current->reclaim_state->reclaimed_slab += nr_freed; 1677 free_pages((unsigned long)addr, cachep->gfporder); 1678 } 1679 1680 static void kmem_rcu_free(struct rcu_head *head) 1681 { 1682 struct slab_rcu *slab_rcu = (struct slab_rcu *)head; 1683 struct kmem_cache *cachep = slab_rcu->cachep; 1684 1685 kmem_freepages(cachep, slab_rcu->addr); 1686 if (OFF_SLAB(cachep)) 1687 kmem_cache_free(cachep->slabp_cache, slab_rcu); 1688 } 1689 1690 #if DEBUG 1691 1692 #ifdef CONFIG_DEBUG_PAGEALLOC 1693 static void store_stackinfo(struct kmem_cache *cachep, unsigned long *addr, 1694 unsigned long caller) 1695 { 1696 int size = obj_size(cachep); 1697 1698 addr = (unsigned long *)&((char *)addr)[obj_offset(cachep)]; 1699 1700 if (size < 5 * sizeof(unsigned long)) 1701 return; 1702 1703 *addr++ = 0x12345678; 1704 *addr++ = caller; 1705 *addr++ = smp_processor_id(); 1706 size -= 3 * sizeof(unsigned long); 1707 { 1708 unsigned long *sptr = &caller; 1709 unsigned long svalue; 1710 1711 while (!kstack_end(sptr)) { 1712 svalue = *sptr++; 1713 if (kernel_text_address(svalue)) { 1714 *addr++ = svalue; 1715 size -= sizeof(unsigned long); 1716 if (size <= sizeof(unsigned long)) 1717 break; 1718 } 1719 } 1720 1721 } 1722 *addr++ = 0x87654321; 1723 } 1724 #endif 1725 1726 static void poison_obj(struct kmem_cache *cachep, void *addr, unsigned char val) 1727 { 1728 int size = obj_size(cachep); 1729 addr = &((char *)addr)[obj_offset(cachep)]; 1730 1731 memset(addr, val, size); 1732 *(unsigned char *)(addr + size - 1) = POISON_END; 1733 } 1734 1735 static void dump_line(char 
*data, int offset, int limit) 1736 { 1737 int i; 1738 unsigned char error = 0; 1739 int bad_count = 0; 1740 1741 printk(KERN_ERR "%03x:", offset); 1742 for (i = 0; i < limit; i++) { 1743 if (data[offset + i] != POISON_FREE) { 1744 error = data[offset + i]; 1745 bad_count++; 1746 } 1747 printk(" %02x", (unsigned char)data[offset + i]); 1748 } 1749 printk("\n"); 1750 1751 if (bad_count == 1) { 1752 error ^= POISON_FREE; 1753 if (!(error & (error - 1))) { 1754 printk(KERN_ERR "Single bit error detected. Probably " 1755 "bad RAM.\n"); 1756 #ifdef CONFIG_X86 1757 printk(KERN_ERR "Run memtest86+ or a similar memory " 1758 "test tool.\n"); 1759 #else 1760 printk(KERN_ERR "Run a memory test tool.\n"); 1761 #endif 1762 } 1763 } 1764 } 1765 #endif 1766 1767 #if DEBUG 1768 1769 static void print_objinfo(struct kmem_cache *cachep, void *objp, int lines) 1770 { 1771 int i, size; 1772 char *realobj; 1773 1774 if (cachep->flags & SLAB_RED_ZONE) { 1775 printk(KERN_ERR "Redzone: 0x%llx/0x%llx.\n", 1776 *dbg_redzone1(cachep, objp), 1777 *dbg_redzone2(cachep, objp)); 1778 } 1779 1780 if (cachep->flags & SLAB_STORE_USER) { 1781 printk(KERN_ERR "Last user: [<%p>]", 1782 *dbg_userword(cachep, objp)); 1783 print_symbol("(%s)", 1784 (unsigned long)*dbg_userword(cachep, objp)); 1785 printk("\n"); 1786 } 1787 realobj = (char *)objp + obj_offset(cachep); 1788 size = obj_size(cachep); 1789 for (i = 0; i < size && lines; i += 16, lines--) { 1790 int limit; 1791 limit = 16; 1792 if (i + limit > size) 1793 limit = size - i; 1794 dump_line(realobj, i, limit); 1795 } 1796 } 1797 1798 static void check_poison_obj(struct kmem_cache *cachep, void *objp) 1799 { 1800 char *realobj; 1801 int size, i; 1802 int lines = 0; 1803 1804 realobj = (char *)objp + obj_offset(cachep); 1805 size = obj_size(cachep); 1806 1807 for (i = 0; i < size; i++) { 1808 char exp = POISON_FREE; 1809 if (i == size - 1) 1810 exp = POISON_END; 1811 if (realobj[i] != exp) { 1812 int limit; 1813 /* Mismatch ! 
*/ 1814 /* Print header */ 1815 if (lines == 0) { 1816 printk(KERN_ERR 1817 "Slab corruption: %s start=%p, len=%d\n", 1818 cachep->name, realobj, size); 1819 print_objinfo(cachep, objp, 0); 1820 } 1821 /* Hexdump the affected line */ 1822 i = (i / 16) * 16; 1823 limit = 16; 1824 if (i + limit > size) 1825 limit = size - i; 1826 dump_line(realobj, i, limit); 1827 i += 16; 1828 lines++; 1829 /* Limit to 5 lines */ 1830 if (lines > 5) 1831 break; 1832 } 1833 } 1834 if (lines != 0) { 1835 /* Print some data about the neighboring objects, if they 1836 * exist: 1837 */ 1838 struct slab *slabp = virt_to_slab(objp); 1839 unsigned int objnr; 1840 1841 objnr = obj_to_index(cachep, slabp, objp); 1842 if (objnr) { 1843 objp = index_to_obj(cachep, slabp, objnr - 1); 1844 realobj = (char *)objp + obj_offset(cachep); 1845 printk(KERN_ERR "Prev obj: start=%p, len=%d\n", 1846 realobj, size); 1847 print_objinfo(cachep, objp, 2); 1848 } 1849 if (objnr + 1 < cachep->num) { 1850 objp = index_to_obj(cachep, slabp, objnr + 1); 1851 realobj = (char *)objp + obj_offset(cachep); 1852 printk(KERN_ERR "Next obj: start=%p, len=%d\n", 1853 realobj, size); 1854 print_objinfo(cachep, objp, 2); 1855 } 1856 } 1857 } 1858 #endif 1859 1860 #if DEBUG 1861 static void slab_destroy_debugcheck(struct kmem_cache *cachep, struct slab *slabp) 1862 { 1863 int i; 1864 for (i = 0; i < cachep->num; i++) { 1865 void *objp = index_to_obj(cachep, slabp, i); 1866 1867 if (cachep->flags & SLAB_POISON) { 1868 #ifdef CONFIG_DEBUG_PAGEALLOC 1869 if (cachep->buffer_size % PAGE_SIZE == 0 && 1870 OFF_SLAB(cachep)) 1871 kernel_map_pages(virt_to_page(objp), 1872 cachep->buffer_size / PAGE_SIZE, 1); 1873 else 1874 check_poison_obj(cachep, objp); 1875 #else 1876 check_poison_obj(cachep, objp); 1877 #endif 1878 } 1879 if (cachep->flags & SLAB_RED_ZONE) { 1880 if (*dbg_redzone1(cachep, objp) != RED_INACTIVE) 1881 slab_error(cachep, "start of a freed object " 1882 "was overwritten"); 1883 if (*dbg_redzone2(cachep, objp) != RED_INACTIVE) 1884 slab_error(cachep, "end of a freed object " 1885 "was overwritten"); 1886 } 1887 } 1888 } 1889 #else 1890 static void slab_destroy_debugcheck(struct kmem_cache *cachep, struct slab *slabp) 1891 { 1892 } 1893 #endif 1894 1895 /** 1896 * slab_destroy - destroy and release all objects in a slab 1897 * @cachep: cache pointer being destroyed 1898 * @slabp: slab pointer being destroyed 1899 * 1900 * Destroy all the objs in a slab, and release the mem back to the system. 1901 * Before calling the slab must have been unlinked from the cache. The 1902 * cache-lock is not held/needed. 
1903 */ 1904 static void slab_destroy(struct kmem_cache *cachep, struct slab *slabp) 1905 { 1906 void *addr = slabp->s_mem - slabp->colouroff; 1907 1908 slab_destroy_debugcheck(cachep, slabp); 1909 if (unlikely(cachep->flags & SLAB_DESTROY_BY_RCU)) { 1910 struct slab_rcu *slab_rcu; 1911 1912 slab_rcu = (struct slab_rcu *)slabp; 1913 slab_rcu->cachep = cachep; 1914 slab_rcu->addr = addr; 1915 call_rcu(&slab_rcu->head, kmem_rcu_free); 1916 } else { 1917 kmem_freepages(cachep, addr); 1918 if (OFF_SLAB(cachep)) 1919 kmem_cache_free(cachep->slabp_cache, slabp); 1920 } 1921 } 1922 1923 static void __kmem_cache_destroy(struct kmem_cache *cachep) 1924 { 1925 int i; 1926 struct kmem_list3 *l3; 1927 1928 for_each_online_cpu(i) 1929 kfree(cachep->array[i]); 1930 1931 /* NUMA: free the list3 structures */ 1932 for_each_online_node(i) { 1933 l3 = cachep->nodelists[i]; 1934 if (l3) { 1935 kfree(l3->shared); 1936 free_alien_cache(l3->alien); 1937 kfree(l3); 1938 } 1939 } 1940 kmem_cache_free(&cache_cache, cachep); 1941 } 1942 1943 1944 /** 1945 * calculate_slab_order - calculate size (page order) of slabs 1946 * @cachep: pointer to the cache that is being created 1947 * @size: size of objects to be created in this cache. 1948 * @align: required alignment for the objects. 1949 * @flags: slab allocation flags 1950 * 1951 * Also calculates the number of objects per slab. 1952 * 1953 * This could be made much more intelligent. For now, try to avoid using 1954 * high order pages for slabs. When the gfp() functions are more friendly 1955 * towards high-order requests, this should be changed. 1956 */ 1957 static size_t calculate_slab_order(struct kmem_cache *cachep, 1958 size_t size, size_t align, unsigned long flags) 1959 { 1960 unsigned long offslab_limit; 1961 size_t left_over = 0; 1962 int gfporder; 1963 1964 for (gfporder = 0; gfporder <= KMALLOC_MAX_ORDER; gfporder++) { 1965 unsigned int num; 1966 size_t remainder; 1967 1968 cache_estimate(gfporder, size, align, flags, &remainder, &num); 1969 if (!num) 1970 continue; 1971 1972 if (flags & CFLGS_OFF_SLAB) { 1973 /* 1974 * Max number of objs-per-slab for caches which 1975 * use off-slab slabs. Needed to avoid a possible 1976 * looping condition in cache_grow(). 1977 */ 1978 offslab_limit = size - sizeof(struct slab); 1979 offslab_limit /= sizeof(kmem_bufctl_t); 1980 1981 if (num > offslab_limit) 1982 break; 1983 } 1984 1985 /* Found something acceptable - save it away */ 1986 cachep->num = num; 1987 cachep->gfporder = gfporder; 1988 left_over = remainder; 1989 1990 /* 1991 * A VFS-reclaimable slab tends to have most allocations 1992 * as GFP_NOFS and we really don't want to have to be allocating 1993 * higher-order pages when we are unable to shrink dcache. 1994 */ 1995 if (flags & SLAB_RECLAIM_ACCOUNT) 1996 break; 1997 1998 /* 1999 * Large number of objects is good, but very large slabs are 2000 * currently bad for the gfp()s. 2001 */ 2002 if (gfporder >= slab_break_gfp_order) 2003 break; 2004 2005 /* 2006 * Acceptable internal fragmentation? 2007 */ 2008 if (left_over * 8 <= (PAGE_SIZE << gfporder)) 2009 break; 2010 } 2011 return left_over; 2012 } 2013 2014 static int __init_refok setup_cpu_cache(struct kmem_cache *cachep, gfp_t gfp) 2015 { 2016 if (g_cpucache_up == FULL) 2017 return enable_cpucache(cachep, gfp); 2018 2019 if (g_cpucache_up == NONE) { 2020 /* 2021 * Note: the first kmem_cache_create must create the cache 2022 * that's used by kmalloc(24), otherwise the creation of 2023 * further caches will BUG(). 
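 *
 * (Bootstrap sketch, inferred from the states used below: g_cpucache_up
 *  advances roughly NONE -> PARTIAL_AC -> PARTIAL_L3 -> FULL.  While it
 *  is NONE the new cache borrows the static initarray_generic head
 *  array; in the PARTIAL_* states the head arrays can already be
 *  kmalloc'ed but some list3s may still need set_up_list3s(); only once
 *  it is FULL does enable_cpucache() do the real per-cpu tuning.)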
2024 */ 2025 cachep->array[smp_processor_id()] = &initarray_generic.cache; 2026 2027 /* 2028 * If the cache that's used by kmalloc(sizeof(kmem_list3)) is 2029 * the first cache, then we need to set up all its list3s, 2030 * otherwise the creation of further caches will BUG(). 2031 */ 2032 set_up_list3s(cachep, SIZE_AC); 2033 if (INDEX_AC == INDEX_L3) 2034 g_cpucache_up = PARTIAL_L3; 2035 else 2036 g_cpucache_up = PARTIAL_AC; 2037 } else { 2038 cachep->array[smp_processor_id()] = 2039 kmalloc(sizeof(struct arraycache_init), gfp); 2040 2041 if (g_cpucache_up == PARTIAL_AC) { 2042 set_up_list3s(cachep, SIZE_L3); 2043 g_cpucache_up = PARTIAL_L3; 2044 } else { 2045 int node; 2046 for_each_online_node(node) { 2047 cachep->nodelists[node] = 2048 kmalloc_node(sizeof(struct kmem_list3), 2049 gfp, node); 2050 BUG_ON(!cachep->nodelists[node]); 2051 kmem_list3_init(cachep->nodelists[node]); 2052 } 2053 } 2054 } 2055 cachep->nodelists[numa_node_id()]->next_reap = 2056 jiffies + REAPTIMEOUT_LIST3 + 2057 ((unsigned long)cachep) % REAPTIMEOUT_LIST3; 2058 2059 cpu_cache_get(cachep)->avail = 0; 2060 cpu_cache_get(cachep)->limit = BOOT_CPUCACHE_ENTRIES; 2061 cpu_cache_get(cachep)->batchcount = 1; 2062 cpu_cache_get(cachep)->touched = 0; 2063 cachep->batchcount = 1; 2064 cachep->limit = BOOT_CPUCACHE_ENTRIES; 2065 return 0; 2066 } 2067 2068 /** 2069 * kmem_cache_create - Create a cache. 2070 * @name: A string which is used in /proc/slabinfo to identify this cache. 2071 * @size: The size of objects to be created in this cache. 2072 * @align: The required alignment for the objects. 2073 * @flags: SLAB flags 2074 * @ctor: A constructor for the objects. 2075 * 2076 * Returns a ptr to the cache on success, NULL on failure. 2077 * Cannot be called within a int, but can be interrupted. 2078 * The @ctor is run when new pages are allocated by the cache. 2079 * 2080 * @name must be valid until the cache is destroyed. This implies that 2081 * the module calling this has to destroy the cache before getting unloaded. 2082 * Note that kmem_cache_name() is not guaranteed to return the same pointer, 2083 * therefore applications must manage it themselves. 2084 * 2085 * The flags are 2086 * 2087 * %SLAB_POISON - Poison the slab with a known test pattern (a5a5a5a5) 2088 * to catch references to uninitialised memory. 2089 * 2090 * %SLAB_RED_ZONE - Insert `Red' zones around the allocated memory to check 2091 * for buffer overruns. 2092 * 2093 * %SLAB_HWCACHE_ALIGN - Align the objects in this cache to a hardware 2094 * cacheline. This can be beneficial if you're counting cycles as closely 2095 * as davem. 2096 */ 2097 struct kmem_cache * 2098 kmem_cache_create (const char *name, size_t size, size_t align, 2099 unsigned long flags, void (*ctor)(void *)) 2100 { 2101 size_t left_over, slab_size, ralign; 2102 struct kmem_cache *cachep = NULL, *pc; 2103 gfp_t gfp; 2104 2105 /* 2106 * Sanity checks... these are all serious usage bugs. 2107 */ 2108 if (!name || in_interrupt() || (size < BYTES_PER_WORD) || 2109 size > KMALLOC_MAX_SIZE) { 2110 printk(KERN_ERR "%s: Early error in slab %s\n", __func__, 2111 name); 2112 BUG(); 2113 } 2114 2115 /* 2116 * We use cache_chain_mutex to ensure a consistent view of 2117 * cpu_online_mask as well. 
Please see cpuup_callback 2118 */ 2119 if (slab_is_available()) { 2120 get_online_cpus(); 2121 mutex_lock(&cache_chain_mutex); 2122 } 2123 2124 list_for_each_entry(pc, &cache_chain, next) { 2125 char tmp; 2126 int res; 2127 2128 /* 2129 * This happens when the module gets unloaded and doesn't 2130 * destroy its slab cache and no-one else reuses the vmalloc 2131 * area of the module. Print a warning. 2132 */ 2133 res = probe_kernel_address(pc->name, tmp); 2134 if (res) { 2135 printk(KERN_ERR 2136 "SLAB: cache with size %d has lost its name\n", 2137 pc->buffer_size); 2138 continue; 2139 } 2140 2141 if (!strcmp(pc->name, name)) { 2142 printk(KERN_ERR 2143 "kmem_cache_create: duplicate cache %s\n", name); 2144 dump_stack(); 2145 goto oops; 2146 } 2147 } 2148 2149 #if DEBUG 2150 WARN_ON(strchr(name, ' ')); /* It confuses parsers */ 2151 #if FORCED_DEBUG 2152 /* 2153 * Enable redzoning and last user accounting, except for caches with 2154 * large objects, if the increased size would increase the object size 2155 * above the next power of two: caches with object sizes just above a 2156 * power of two have a significant amount of internal fragmentation. 2157 */ 2158 if (size < 4096 || fls(size - 1) == fls(size-1 + REDZONE_ALIGN + 2159 2 * sizeof(unsigned long long))) 2160 flags |= SLAB_RED_ZONE | SLAB_STORE_USER; 2161 if (!(flags & SLAB_DESTROY_BY_RCU)) 2162 flags |= SLAB_POISON; 2163 #endif 2164 if (flags & SLAB_DESTROY_BY_RCU) 2165 BUG_ON(flags & SLAB_POISON); 2166 #endif 2167 /* 2168 * Always checks flags, a caller might be expecting debug support which 2169 * isn't available. 2170 */ 2171 BUG_ON(flags & ~CREATE_MASK); 2172 2173 /* 2174 * Check that size is in terms of words. This is needed to avoid 2175 * unaligned accesses for some archs when redzoning is used, and makes 2176 * sure any on-slab bufctl's are also correctly aligned. 2177 */ 2178 if (size & (BYTES_PER_WORD - 1)) { 2179 size += (BYTES_PER_WORD - 1); 2180 size &= ~(BYTES_PER_WORD - 1); 2181 } 2182 2183 /* calculate the final buffer alignment: */ 2184 2185 /* 1) arch recommendation: can be overridden for debug */ 2186 if (flags & SLAB_HWCACHE_ALIGN) { 2187 /* 2188 * Default alignment: as specified by the arch code. Except if 2189 * an object is really small, then squeeze multiple objects into 2190 * one cacheline. 2191 */ 2192 ralign = cache_line_size(); 2193 while (size <= ralign / 2) 2194 ralign /= 2; 2195 } else { 2196 ralign = BYTES_PER_WORD; 2197 } 2198 2199 /* 2200 * Redzoning and user store require word alignment or possibly larger. 2201 * Note this will be overridden by architecture or caller mandated 2202 * alignment if either is greater than BYTES_PER_WORD. 2203 */ 2204 if (flags & SLAB_STORE_USER) 2205 ralign = BYTES_PER_WORD; 2206 2207 if (flags & SLAB_RED_ZONE) { 2208 ralign = REDZONE_ALIGN; 2209 /* If redzoning, ensure that the second redzone is suitably 2210 * aligned, by adjusting the object size accordingly. */ 2211 size += REDZONE_ALIGN - 1; 2212 size &= ~(REDZONE_ALIGN - 1); 2213 } 2214 2215 /* 2) arch mandated alignment */ 2216 if (ralign < ARCH_SLAB_MINALIGN) { 2217 ralign = ARCH_SLAB_MINALIGN; 2218 } 2219 /* 3) caller mandated alignment */ 2220 if (ralign < align) { 2221 ralign = align; 2222 } 2223 /* disable debug if necessary */ 2224 if (ralign > __alignof__(unsigned long long)) 2225 flags &= ~(SLAB_RED_ZONE | SLAB_STORE_USER); 2226 /* 2227 * 4) Store it. 
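 *
 *    (Worked example, assuming 64-byte cache lines, 8-byte words and no
 *     debug flags: a 20-byte object created with SLAB_HWCACHE_ALIGN was
 *     rounded up to 24 bytes above, and step 1 halves ralign from 64 to
 *     32 because 24 <= 32 but 24 > 16, so two such objects end up
 *     sharing one cache line.)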
2228 */ 2229 align = ralign; 2230 2231 if (slab_is_available()) 2232 gfp = GFP_KERNEL; 2233 else 2234 gfp = GFP_NOWAIT; 2235 2236 /* Get cache's description obj. */ 2237 cachep = kmem_cache_zalloc(&cache_cache, gfp); 2238 if (!cachep) 2239 goto oops; 2240 2241 #if DEBUG 2242 cachep->obj_size = size; 2243 2244 /* 2245 * Both debugging options require word-alignment which is calculated 2246 * into align above. 2247 */ 2248 if (flags & SLAB_RED_ZONE) { 2249 /* add space for red zone words */ 2250 cachep->obj_offset += sizeof(unsigned long long); 2251 size += 2 * sizeof(unsigned long long); 2252 } 2253 if (flags & SLAB_STORE_USER) { 2254 /* user store requires one word storage behind the end of 2255 * the real object. But if the second red zone needs to be 2256 * aligned to 64 bits, we must allow that much space. 2257 */ 2258 if (flags & SLAB_RED_ZONE) 2259 size += REDZONE_ALIGN; 2260 else 2261 size += BYTES_PER_WORD; 2262 } 2263 #if FORCED_DEBUG && defined(CONFIG_DEBUG_PAGEALLOC) 2264 if (size >= malloc_sizes[INDEX_L3 + 1].cs_size 2265 && cachep->obj_size > cache_line_size() && size < PAGE_SIZE) { 2266 cachep->obj_offset += PAGE_SIZE - size; 2267 size = PAGE_SIZE; 2268 } 2269 #endif 2270 #endif 2271 2272 /* 2273 * Determine if the slab management is 'on' or 'off' slab. 2274 * (bootstrapping cannot cope with offslab caches so don't do 2275 * it too early on. Always use on-slab management when 2276 * SLAB_NOLEAKTRACE to avoid recursive calls into kmemleak) 2277 */ 2278 if ((size >= (PAGE_SIZE >> 3)) && !slab_early_init && 2279 !(flags & SLAB_NOLEAKTRACE)) 2280 /* 2281 * Size is large, assume best to place the slab management obj 2282 * off-slab (should allow better packing of objs). 2283 */ 2284 flags |= CFLGS_OFF_SLAB; 2285 2286 size = ALIGN(size, align); 2287 2288 left_over = calculate_slab_order(cachep, size, align, flags); 2289 2290 if (!cachep->num) { 2291 printk(KERN_ERR 2292 "kmem_cache_create: couldn't create cache %s.\n", name); 2293 kmem_cache_free(&cache_cache, cachep); 2294 cachep = NULL; 2295 goto oops; 2296 } 2297 slab_size = ALIGN(cachep->num * sizeof(kmem_bufctl_t) 2298 + sizeof(struct slab), align); 2299 2300 /* 2301 * If the slab has been placed off-slab, and we have enough space then 2302 * move it on-slab. This is at the expense of any extra colouring. 2303 */ 2304 if (flags & CFLGS_OFF_SLAB && left_over >= slab_size) { 2305 flags &= ~CFLGS_OFF_SLAB; 2306 left_over -= slab_size; 2307 } 2308 2309 if (flags & CFLGS_OFF_SLAB) { 2310 /* really off slab. No need for manual alignment */ 2311 slab_size = 2312 cachep->num * sizeof(kmem_bufctl_t) + sizeof(struct slab); 2313 2314 #ifdef CONFIG_PAGE_POISONING 2315 /* If we're going to use the generic kernel_map_pages() 2316 * poisoning, then it's going to smash the contents of 2317 * the redzone and userword anyhow, so switch them off. 2318 */ 2319 if (size % PAGE_SIZE == 0 && flags & SLAB_POISON) 2320 flags &= ~(SLAB_RED_ZONE | SLAB_STORE_USER); 2321 #endif 2322 } 2323 2324 cachep->colour_off = cache_line_size(); 2325 /* Offset must be a multiple of the alignment. 
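 *
 * (Illustrative numbers, assuming 64-byte cache lines and an alignment
 *  no larger than that: with 192 bytes left over per slab, colour_off is
 *  64 and cachep->colour becomes 3, so cache_grow() places successive
 *  slabs at colour offsets 0, 64 and 128 before colour_next wraps back
 *  to 0.)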
*/ 2326 if (cachep->colour_off < align) 2327 cachep->colour_off = align; 2328 cachep->colour = left_over / cachep->colour_off; 2329 cachep->slab_size = slab_size; 2330 cachep->flags = flags; 2331 cachep->gfpflags = 0; 2332 if (CONFIG_ZONE_DMA_FLAG && (flags & SLAB_CACHE_DMA)) 2333 cachep->gfpflags |= GFP_DMA; 2334 cachep->buffer_size = size; 2335 cachep->reciprocal_buffer_size = reciprocal_value(size); 2336 2337 if (flags & CFLGS_OFF_SLAB) { 2338 cachep->slabp_cache = kmem_find_general_cachep(slab_size, 0u); 2339 /* 2340 * This is a possibility for one of the malloc_sizes caches. 2341 * But since we go off slab only for object size greater than 2342 * PAGE_SIZE/8, and malloc_sizes gets created in ascending order, 2343 * this should not happen at all. 2344 * But leave a BUG_ON for some lucky dude. 2345 */ 2346 BUG_ON(ZERO_OR_NULL_PTR(cachep->slabp_cache)); 2347 } 2348 cachep->ctor = ctor; 2349 cachep->name = name; 2350 2351 if (setup_cpu_cache(cachep, gfp)) { 2352 __kmem_cache_destroy(cachep); 2353 cachep = NULL; 2354 goto oops; 2355 } 2356 2357 /* cache setup completed, link it into the list */ 2358 list_add(&cachep->next, &cache_chain); 2359 oops: 2360 if (!cachep && (flags & SLAB_PANIC)) 2361 panic("kmem_cache_create(): failed to create slab `%s'\n", 2362 name); 2363 if (slab_is_available()) { 2364 mutex_unlock(&cache_chain_mutex); 2365 put_online_cpus(); 2366 } 2367 return cachep; 2368 } 2369 EXPORT_SYMBOL(kmem_cache_create); 2370 2371 #if DEBUG 2372 static void check_irq_off(void) 2373 { 2374 BUG_ON(!irqs_disabled()); 2375 } 2376 2377 static void check_irq_on(void) 2378 { 2379 BUG_ON(irqs_disabled()); 2380 } 2381 2382 static void check_spinlock_acquired(struct kmem_cache *cachep) 2383 { 2384 #ifdef CONFIG_SMP 2385 check_irq_off(); 2386 assert_spin_locked(&cachep->nodelists[numa_node_id()]->list_lock); 2387 #endif 2388 } 2389 2390 static void check_spinlock_acquired_node(struct kmem_cache *cachep, int node) 2391 { 2392 #ifdef CONFIG_SMP 2393 check_irq_off(); 2394 assert_spin_locked(&cachep->nodelists[node]->list_lock); 2395 #endif 2396 } 2397 2398 #else 2399 #define check_irq_off() do { } while(0) 2400 #define check_irq_on() do { } while(0) 2401 #define check_spinlock_acquired(x) do { } while(0) 2402 #define check_spinlock_acquired_node(x, y) do { } while(0) 2403 #endif 2404 2405 static void drain_array(struct kmem_cache *cachep, struct kmem_list3 *l3, 2406 struct array_cache *ac, 2407 int force, int node); 2408 2409 static void do_drain(void *arg) 2410 { 2411 struct kmem_cache *cachep = arg; 2412 struct array_cache *ac; 2413 int node = numa_node_id(); 2414 2415 check_irq_off(); 2416 ac = cpu_cache_get(cachep); 2417 spin_lock(&cachep->nodelists[node]->list_lock); 2418 free_block(cachep, ac->entry, ac->avail, node); 2419 spin_unlock(&cachep->nodelists[node]->list_lock); 2420 ac->avail = 0; 2421 } 2422 2423 static void drain_cpu_caches(struct kmem_cache *cachep) 2424 { 2425 struct kmem_list3 *l3; 2426 int node; 2427 2428 on_each_cpu(do_drain, cachep, 1); 2429 check_irq_on(); 2430 for_each_online_node(node) { 2431 l3 = cachep->nodelists[node]; 2432 if (l3 && l3->alien) 2433 drain_alien_cache(cachep, l3->alien); 2434 } 2435 2436 for_each_online_node(node) { 2437 l3 = cachep->nodelists[node]; 2438 if (l3) 2439 drain_array(cachep, l3, l3->shared, 1, node); 2440 } 2441 } 2442 2443 /* 2444 * Remove slabs from the list of free slabs. 2445 * Specify the number of slabs to drain in tofree. 2446 * 2447 * Returns the actual number of slabs released. 
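 *
 * (Both callers below follow this pattern: __cache_shrink() passes
 *  l3->free_objects to drop every free slab, while cache_reap() passes
 *  only about free_limit / (5 * num) so that periodic reaping stays
 *  cheap.)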
2448 */ 2449 static int drain_freelist(struct kmem_cache *cache, 2450 struct kmem_list3 *l3, int tofree) 2451 { 2452 struct list_head *p; 2453 int nr_freed; 2454 struct slab *slabp; 2455 2456 nr_freed = 0; 2457 while (nr_freed < tofree && !list_empty(&l3->slabs_free)) { 2458 2459 spin_lock_irq(&l3->list_lock); 2460 p = l3->slabs_free.prev; 2461 if (p == &l3->slabs_free) { 2462 spin_unlock_irq(&l3->list_lock); 2463 goto out; 2464 } 2465 2466 slabp = list_entry(p, struct slab, list); 2467 #if DEBUG 2468 BUG_ON(slabp->inuse); 2469 #endif 2470 list_del(&slabp->list); 2471 /* 2472 * Safe to drop the lock. The slab is no longer linked 2473 * to the cache. 2474 */ 2475 l3->free_objects -= cache->num; 2476 spin_unlock_irq(&l3->list_lock); 2477 slab_destroy(cache, slabp); 2478 nr_freed++; 2479 } 2480 out: 2481 return nr_freed; 2482 } 2483 2484 /* Called with cache_chain_mutex held to protect against cpu hotplug */ 2485 static int __cache_shrink(struct kmem_cache *cachep) 2486 { 2487 int ret = 0, i = 0; 2488 struct kmem_list3 *l3; 2489 2490 drain_cpu_caches(cachep); 2491 2492 check_irq_on(); 2493 for_each_online_node(i) { 2494 l3 = cachep->nodelists[i]; 2495 if (!l3) 2496 continue; 2497 2498 drain_freelist(cachep, l3, l3->free_objects); 2499 2500 ret += !list_empty(&l3->slabs_full) || 2501 !list_empty(&l3->slabs_partial); 2502 } 2503 return (ret ? 1 : 0); 2504 } 2505 2506 /** 2507 * kmem_cache_shrink - Shrink a cache. 2508 * @cachep: The cache to shrink. 2509 * 2510 * Releases as many slabs as possible for a cache. 2511 * To help debugging, a zero exit status indicates all slabs were released. 2512 */ 2513 int kmem_cache_shrink(struct kmem_cache *cachep) 2514 { 2515 int ret; 2516 BUG_ON(!cachep || in_interrupt()); 2517 2518 get_online_cpus(); 2519 mutex_lock(&cache_chain_mutex); 2520 ret = __cache_shrink(cachep); 2521 mutex_unlock(&cache_chain_mutex); 2522 put_online_cpus(); 2523 return ret; 2524 } 2525 EXPORT_SYMBOL(kmem_cache_shrink); 2526 2527 /** 2528 * kmem_cache_destroy - delete a cache 2529 * @cachep: the cache to destroy 2530 * 2531 * Remove a &struct kmem_cache object from the slab cache. 2532 * 2533 * It is expected this function will be called by a module when it is 2534 * unloaded. This will remove the cache completely, and avoid a duplicate 2535 * cache being allocated each time a module is loaded and unloaded, if the 2536 * module doesn't have persistent in-kernel storage across loads and unloads. 2537 * 2538 * The cache must be empty before calling this function. 2539 * 2540 * The caller must guarantee that noone will allocate memory from the cache 2541 * during the kmem_cache_destroy(). 2542 */ 2543 void kmem_cache_destroy(struct kmem_cache *cachep) 2544 { 2545 BUG_ON(!cachep || in_interrupt()); 2546 2547 /* Find the cache in the chain of caches. */ 2548 get_online_cpus(); 2549 mutex_lock(&cache_chain_mutex); 2550 /* 2551 * the chain is never empty, cache_cache is never destroyed 2552 */ 2553 list_del(&cachep->next); 2554 if (__cache_shrink(cachep)) { 2555 slab_error(cachep, "Can't free all objects"); 2556 list_add(&cachep->next, &cache_chain); 2557 mutex_unlock(&cache_chain_mutex); 2558 put_online_cpus(); 2559 return; 2560 } 2561 2562 if (unlikely(cachep->flags & SLAB_DESTROY_BY_RCU)) 2563 rcu_barrier(); 2564 2565 __kmem_cache_destroy(cachep); 2566 mutex_unlock(&cache_chain_mutex); 2567 put_online_cpus(); 2568 } 2569 EXPORT_SYMBOL(kmem_cache_destroy); 2570 2571 /* 2572 * Get the memory for a slab management obj. 
2573 * For a slab cache when the slab descriptor is off-slab, slab descriptors 2574 * always come from malloc_sizes caches. The slab descriptor cannot 2575 * come from the same cache which is getting created because, 2576 * when we are searching for an appropriate cache for these 2577 * descriptors in kmem_cache_create, we search through the malloc_sizes array. 2578 * If we are creating a malloc_sizes cache here it would not be visible to 2579 * kmem_find_general_cachep till the initialization is complete. 2580 * Hence we cannot have slabp_cache same as the original cache. 2581 */ 2582 static struct slab *alloc_slabmgmt(struct kmem_cache *cachep, void *objp, 2583 int colour_off, gfp_t local_flags, 2584 int nodeid) 2585 { 2586 struct slab *slabp; 2587 2588 if (OFF_SLAB(cachep)) { 2589 /* Slab management obj is off-slab. */ 2590 slabp = kmem_cache_alloc_node(cachep->slabp_cache, 2591 local_flags, nodeid); 2592 /* 2593 * If the first object in the slab is leaked (it's allocated 2594 * but no one has a reference to it), we want to make sure 2595 * kmemleak does not treat the ->s_mem pointer as a reference 2596 * to the object. Otherwise we will not report the leak. 2597 */ 2598 kmemleak_scan_area(&slabp->list, sizeof(struct list_head), 2599 local_flags); 2600 if (!slabp) 2601 return NULL; 2602 } else { 2603 slabp = objp + colour_off; 2604 colour_off += cachep->slab_size; 2605 } 2606 slabp->inuse = 0; 2607 slabp->colouroff = colour_off; 2608 slabp->s_mem = objp + colour_off; 2609 slabp->nodeid = nodeid; 2610 slabp->free = 0; 2611 return slabp; 2612 } 2613 2614 static inline kmem_bufctl_t *slab_bufctl(struct slab *slabp) 2615 { 2616 return (kmem_bufctl_t *) (slabp + 1); 2617 } 2618 2619 static void cache_init_objs(struct kmem_cache *cachep, 2620 struct slab *slabp) 2621 { 2622 int i; 2623 2624 for (i = 0; i < cachep->num; i++) { 2625 void *objp = index_to_obj(cachep, slabp, i); 2626 #if DEBUG 2627 /* need to poison the objs? */ 2628 if (cachep->flags & SLAB_POISON) 2629 poison_obj(cachep, objp, POISON_FREE); 2630 if (cachep->flags & SLAB_STORE_USER) 2631 *dbg_userword(cachep, objp) = NULL; 2632 2633 if (cachep->flags & SLAB_RED_ZONE) { 2634 *dbg_redzone1(cachep, objp) = RED_INACTIVE; 2635 *dbg_redzone2(cachep, objp) = RED_INACTIVE; 2636 } 2637 /* 2638 * Constructors are not allowed to allocate memory from the same 2639 * cache which they are a constructor for. Otherwise, deadlock. 2640 * They must also be threaded. 
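 *
 * (A well-behaved constructor - hypothetical 'foo' names - only
 *  initialises the object it is handed:
 *
 *	static void foo_ctor(void *obj)
 *	{
 *		struct foo *f = obj;
 *
 *		spin_lock_init(&f->lock);
 *		INIT_LIST_HEAD(&f->list);
 *	}
 *
 *  and never allocates from the cache it is constructing for.)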
2641 */ 2642 if (cachep->ctor && !(cachep->flags & SLAB_POISON)) 2643 cachep->ctor(objp + obj_offset(cachep)); 2644 2645 if (cachep->flags & SLAB_RED_ZONE) { 2646 if (*dbg_redzone2(cachep, objp) != RED_INACTIVE) 2647 slab_error(cachep, "constructor overwrote the" 2648 " end of an object"); 2649 if (*dbg_redzone1(cachep, objp) != RED_INACTIVE) 2650 slab_error(cachep, "constructor overwrote the" 2651 " start of an object"); 2652 } 2653 if ((cachep->buffer_size % PAGE_SIZE) == 0 && 2654 OFF_SLAB(cachep) && cachep->flags & SLAB_POISON) 2655 kernel_map_pages(virt_to_page(objp), 2656 cachep->buffer_size / PAGE_SIZE, 0); 2657 #else 2658 if (cachep->ctor) 2659 cachep->ctor(objp); 2660 #endif 2661 slab_bufctl(slabp)[i] = i + 1; 2662 } 2663 slab_bufctl(slabp)[i - 1] = BUFCTL_END; 2664 } 2665 2666 static void kmem_flagcheck(struct kmem_cache *cachep, gfp_t flags) 2667 { 2668 if (CONFIG_ZONE_DMA_FLAG) { 2669 if (flags & GFP_DMA) 2670 BUG_ON(!(cachep->gfpflags & GFP_DMA)); 2671 else 2672 BUG_ON(cachep->gfpflags & GFP_DMA); 2673 } 2674 } 2675 2676 static void *slab_get_obj(struct kmem_cache *cachep, struct slab *slabp, 2677 int nodeid) 2678 { 2679 void *objp = index_to_obj(cachep, slabp, slabp->free); 2680 kmem_bufctl_t next; 2681 2682 slabp->inuse++; 2683 next = slab_bufctl(slabp)[slabp->free]; 2684 #if DEBUG 2685 slab_bufctl(slabp)[slabp->free] = BUFCTL_FREE; 2686 WARN_ON(slabp->nodeid != nodeid); 2687 #endif 2688 slabp->free = next; 2689 2690 return objp; 2691 } 2692 2693 static void slab_put_obj(struct kmem_cache *cachep, struct slab *slabp, 2694 void *objp, int nodeid) 2695 { 2696 unsigned int objnr = obj_to_index(cachep, slabp, objp); 2697 2698 #if DEBUG 2699 /* Verify that the slab belongs to the intended node */ 2700 WARN_ON(slabp->nodeid != nodeid); 2701 2702 if (slab_bufctl(slabp)[objnr] + 1 <= SLAB_LIMIT + 1) { 2703 printk(KERN_ERR "slab: double free detected in cache " 2704 "'%s', objp %p\n", cachep->name, objp); 2705 BUG(); 2706 } 2707 #endif 2708 slab_bufctl(slabp)[objnr] = slabp->free; 2709 slabp->free = objnr; 2710 slabp->inuse--; 2711 } 2712 2713 /* 2714 * Map pages beginning at addr to the given cache and slab. This is required 2715 * for the slab allocator to be able to lookup the cache and slab of a 2716 * virtual address for kfree, ksize, kmem_ptr_validate, and slab debugging. 2717 */ 2718 static void slab_map_pages(struct kmem_cache *cache, struct slab *slab, 2719 void *addr) 2720 { 2721 int nr_pages; 2722 struct page *page; 2723 2724 page = virt_to_page(addr); 2725 2726 nr_pages = 1; 2727 if (likely(!PageCompound(page))) 2728 nr_pages <<= cache->gfporder; 2729 2730 do { 2731 page_set_cache(page, cache); 2732 page_set_slab(page, slab); 2733 page++; 2734 } while (--nr_pages); 2735 } 2736 2737 /* 2738 * Grow (by 1) the number of slabs within a cache. This is called by 2739 * kmem_cache_alloc() when there are no active objs left in a cache. 2740 */ 2741 static int cache_grow(struct kmem_cache *cachep, 2742 gfp_t flags, int nodeid, void *objp) 2743 { 2744 struct slab *slabp; 2745 size_t offset; 2746 gfp_t local_flags; 2747 struct kmem_list3 *l3; 2748 2749 /* 2750 * Be lazy and only check for valid flags here, keeping it out of the 2751 * critical path in kmem_cache_alloc(). 
2752 */ 2753 BUG_ON(flags & GFP_SLAB_BUG_MASK); 2754 local_flags = flags & (GFP_CONSTRAINT_MASK|GFP_RECLAIM_MASK); 2755 2756 /* Take the l3 list lock to change the colour_next on this node */ 2757 check_irq_off(); 2758 l3 = cachep->nodelists[nodeid]; 2759 spin_lock(&l3->list_lock); 2760 2761 /* Get colour for the slab, and cal the next value. */ 2762 offset = l3->colour_next; 2763 l3->colour_next++; 2764 if (l3->colour_next >= cachep->colour) 2765 l3->colour_next = 0; 2766 spin_unlock(&l3->list_lock); 2767 2768 offset *= cachep->colour_off; 2769 2770 if (local_flags & __GFP_WAIT) 2771 local_irq_enable(); 2772 2773 /* 2774 * The test for missing atomic flag is performed here, rather than 2775 * the more obvious place, simply to reduce the critical path length 2776 * in kmem_cache_alloc(). If a caller is seriously mis-behaving they 2777 * will eventually be caught here (where it matters). 2778 */ 2779 kmem_flagcheck(cachep, flags); 2780 2781 /* 2782 * Get mem for the objs. Attempt to allocate a physical page from 2783 * 'nodeid'. 2784 */ 2785 if (!objp) 2786 objp = kmem_getpages(cachep, local_flags, nodeid); 2787 if (!objp) 2788 goto failed; 2789 2790 /* Get slab management. */ 2791 slabp = alloc_slabmgmt(cachep, objp, offset, 2792 local_flags & ~GFP_CONSTRAINT_MASK, nodeid); 2793 if (!slabp) 2794 goto opps1; 2795 2796 slab_map_pages(cachep, slabp, objp); 2797 2798 cache_init_objs(cachep, slabp); 2799 2800 if (local_flags & __GFP_WAIT) 2801 local_irq_disable(); 2802 check_irq_off(); 2803 spin_lock(&l3->list_lock); 2804 2805 /* Make slab active. */ 2806 list_add_tail(&slabp->list, &(l3->slabs_free)); 2807 STATS_INC_GROWN(cachep); 2808 l3->free_objects += cachep->num; 2809 spin_unlock(&l3->list_lock); 2810 return 1; 2811 opps1: 2812 kmem_freepages(cachep, objp); 2813 failed: 2814 if (local_flags & __GFP_WAIT) 2815 local_irq_disable(); 2816 return 0; 2817 } 2818 2819 #if DEBUG 2820 2821 /* 2822 * Perform extra freeing checks: 2823 * - detect bad pointers. 2824 * - POISON/RED_ZONE checking 2825 */ 2826 static void kfree_debugcheck(const void *objp) 2827 { 2828 if (!virt_addr_valid(objp)) { 2829 printk(KERN_ERR "kfree_debugcheck: out of range ptr %lxh.\n", 2830 (unsigned long)objp); 2831 BUG(); 2832 } 2833 } 2834 2835 static inline void verify_redzone_free(struct kmem_cache *cache, void *obj) 2836 { 2837 unsigned long long redzone1, redzone2; 2838 2839 redzone1 = *dbg_redzone1(cache, obj); 2840 redzone2 = *dbg_redzone2(cache, obj); 2841 2842 /* 2843 * Redzone is ok. 
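 *
 * (A live object carries RED_ACTIVE in both redzone words, so a correct
 *  free sees ACTIVE/ACTIVE here; INACTIVE/INACTIVE means the object was
 *  already freed once - a double free - and any other combination means
 *  something wrote past the object's boundaries.)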
2844 */ 2845 if (redzone1 == RED_ACTIVE && redzone2 == RED_ACTIVE) 2846 return; 2847 2848 if (redzone1 == RED_INACTIVE && redzone2 == RED_INACTIVE) 2849 slab_error(cache, "double free detected"); 2850 else 2851 slab_error(cache, "memory outside object was overwritten"); 2852 2853 printk(KERN_ERR "%p: redzone 1:0x%llx, redzone 2:0x%llx.\n", 2854 obj, redzone1, redzone2); 2855 } 2856 2857 static void *cache_free_debugcheck(struct kmem_cache *cachep, void *objp, 2858 void *caller) 2859 { 2860 struct page *page; 2861 unsigned int objnr; 2862 struct slab *slabp; 2863 2864 BUG_ON(virt_to_cache(objp) != cachep); 2865 2866 objp -= obj_offset(cachep); 2867 kfree_debugcheck(objp); 2868 page = virt_to_head_page(objp); 2869 2870 slabp = page_get_slab(page); 2871 2872 if (cachep->flags & SLAB_RED_ZONE) { 2873 verify_redzone_free(cachep, objp); 2874 *dbg_redzone1(cachep, objp) = RED_INACTIVE; 2875 *dbg_redzone2(cachep, objp) = RED_INACTIVE; 2876 } 2877 if (cachep->flags & SLAB_STORE_USER) 2878 *dbg_userword(cachep, objp) = caller; 2879 2880 objnr = obj_to_index(cachep, slabp, objp); 2881 2882 BUG_ON(objnr >= cachep->num); 2883 BUG_ON(objp != index_to_obj(cachep, slabp, objnr)); 2884 2885 #ifdef CONFIG_DEBUG_SLAB_LEAK 2886 slab_bufctl(slabp)[objnr] = BUFCTL_FREE; 2887 #endif 2888 if (cachep->flags & SLAB_POISON) { 2889 #ifdef CONFIG_DEBUG_PAGEALLOC 2890 if ((cachep->buffer_size % PAGE_SIZE)==0 && OFF_SLAB(cachep)) { 2891 store_stackinfo(cachep, objp, (unsigned long)caller); 2892 kernel_map_pages(virt_to_page(objp), 2893 cachep->buffer_size / PAGE_SIZE, 0); 2894 } else { 2895 poison_obj(cachep, objp, POISON_FREE); 2896 } 2897 #else 2898 poison_obj(cachep, objp, POISON_FREE); 2899 #endif 2900 } 2901 return objp; 2902 } 2903 2904 static void check_slabp(struct kmem_cache *cachep, struct slab *slabp) 2905 { 2906 kmem_bufctl_t i; 2907 int entries = 0; 2908 2909 /* Check slab's freelist to see if this obj is there. */ 2910 for (i = slabp->free; i != BUFCTL_END; i = slab_bufctl(slabp)[i]) { 2911 entries++; 2912 if (entries > cachep->num || i >= cachep->num) 2913 goto bad; 2914 } 2915 if (entries != cachep->num - slabp->inuse) { 2916 bad: 2917 printk(KERN_ERR "slab: Internal list corruption detected in " 2918 "cache '%s'(%d), slabp %p(%d). Hexdump:\n", 2919 cachep->name, cachep->num, slabp, slabp->inuse); 2920 for (i = 0; 2921 i < sizeof(*slabp) + cachep->num * sizeof(kmem_bufctl_t); 2922 i++) { 2923 if (i % 16 == 0) 2924 printk("\n%03x:", i); 2925 printk(" %02x", ((unsigned char *)slabp)[i]); 2926 } 2927 printk("\n"); 2928 BUG(); 2929 } 2930 } 2931 #else 2932 #define kfree_debugcheck(x) do { } while(0) 2933 #define cache_free_debugcheck(x,objp,z) (objp) 2934 #define check_slabp(x,y) do { } while(0) 2935 #endif 2936 2937 static void *cache_alloc_refill(struct kmem_cache *cachep, gfp_t flags) 2938 { 2939 int batchcount; 2940 struct kmem_list3 *l3; 2941 struct array_cache *ac; 2942 int node; 2943 2944 retry: 2945 check_irq_off(); 2946 node = numa_node_id(); 2947 ac = cpu_cache_get(cachep); 2948 batchcount = ac->batchcount; 2949 if (!ac->touched && batchcount > BATCHREFILL_LIMIT) { 2950 /* 2951 * If there was little recent activity on this cache, then 2952 * perform only a partial refill. Otherwise we could generate 2953 * refill bouncing. 
2954 */ 2955 batchcount = BATCHREFILL_LIMIT; 2956 } 2957 l3 = cachep->nodelists[node]; 2958 2959 BUG_ON(ac->avail > 0 || !l3); 2960 spin_lock(&l3->list_lock); 2961 2962 /* See if we can refill from the shared array */ 2963 if (l3->shared && transfer_objects(ac, l3->shared, batchcount)) { 2964 l3->shared->touched = 1; 2965 goto alloc_done; 2966 } 2967 2968 while (batchcount > 0) { 2969 struct list_head *entry; 2970 struct slab *slabp; 2971 /* Get slab alloc is to come from. */ 2972 entry = l3->slabs_partial.next; 2973 if (entry == &l3->slabs_partial) { 2974 l3->free_touched = 1; 2975 entry = l3->slabs_free.next; 2976 if (entry == &l3->slabs_free) 2977 goto must_grow; 2978 } 2979 2980 slabp = list_entry(entry, struct slab, list); 2981 check_slabp(cachep, slabp); 2982 check_spinlock_acquired(cachep); 2983 2984 /* 2985 * The slab was either on partial or free list so 2986 * there must be at least one object available for 2987 * allocation. 2988 */ 2989 BUG_ON(slabp->inuse >= cachep->num); 2990 2991 while (slabp->inuse < cachep->num && batchcount--) { 2992 STATS_INC_ALLOCED(cachep); 2993 STATS_INC_ACTIVE(cachep); 2994 STATS_SET_HIGH(cachep); 2995 2996 ac->entry[ac->avail++] = slab_get_obj(cachep, slabp, 2997 node); 2998 } 2999 check_slabp(cachep, slabp); 3000 3001 /* move slabp to correct slabp list: */ 3002 list_del(&slabp->list); 3003 if (slabp->free == BUFCTL_END) 3004 list_add(&slabp->list, &l3->slabs_full); 3005 else 3006 list_add(&slabp->list, &l3->slabs_partial); 3007 } 3008 3009 must_grow: 3010 l3->free_objects -= ac->avail; 3011 alloc_done: 3012 spin_unlock(&l3->list_lock); 3013 3014 if (unlikely(!ac->avail)) { 3015 int x; 3016 x = cache_grow(cachep, flags | GFP_THISNODE, node, NULL); 3017 3018 /* cache_grow can reenable interrupts, then ac could change. */ 3019 ac = cpu_cache_get(cachep); 3020 if (!x && ac->avail == 0) /* no objects in sight? abort */ 3021 return NULL; 3022 3023 if (!ac->avail) /* objects refilled by interrupt? 
*/ 3024 goto retry; 3025 } 3026 ac->touched = 1; 3027 return ac->entry[--ac->avail]; 3028 } 3029 3030 static inline void cache_alloc_debugcheck_before(struct kmem_cache *cachep, 3031 gfp_t flags) 3032 { 3033 might_sleep_if(flags & __GFP_WAIT); 3034 #if DEBUG 3035 kmem_flagcheck(cachep, flags); 3036 #endif 3037 } 3038 3039 #if DEBUG 3040 static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep, 3041 gfp_t flags, void *objp, void *caller) 3042 { 3043 if (!objp) 3044 return objp; 3045 if (cachep->flags & SLAB_POISON) { 3046 #ifdef CONFIG_DEBUG_PAGEALLOC 3047 if ((cachep->buffer_size % PAGE_SIZE) == 0 && OFF_SLAB(cachep)) 3048 kernel_map_pages(virt_to_page(objp), 3049 cachep->buffer_size / PAGE_SIZE, 1); 3050 else 3051 check_poison_obj(cachep, objp); 3052 #else 3053 check_poison_obj(cachep, objp); 3054 #endif 3055 poison_obj(cachep, objp, POISON_INUSE); 3056 } 3057 if (cachep->flags & SLAB_STORE_USER) 3058 *dbg_userword(cachep, objp) = caller; 3059 3060 if (cachep->flags & SLAB_RED_ZONE) { 3061 if (*dbg_redzone1(cachep, objp) != RED_INACTIVE || 3062 *dbg_redzone2(cachep, objp) != RED_INACTIVE) { 3063 slab_error(cachep, "double free, or memory outside" 3064 " object was overwritten"); 3065 printk(KERN_ERR 3066 "%p: redzone 1:0x%llx, redzone 2:0x%llx\n", 3067 objp, *dbg_redzone1(cachep, objp), 3068 *dbg_redzone2(cachep, objp)); 3069 } 3070 *dbg_redzone1(cachep, objp) = RED_ACTIVE; 3071 *dbg_redzone2(cachep, objp) = RED_ACTIVE; 3072 } 3073 #ifdef CONFIG_DEBUG_SLAB_LEAK 3074 { 3075 struct slab *slabp; 3076 unsigned objnr; 3077 3078 slabp = page_get_slab(virt_to_head_page(objp)); 3079 objnr = (unsigned)(objp - slabp->s_mem) / cachep->buffer_size; 3080 slab_bufctl(slabp)[objnr] = BUFCTL_ACTIVE; 3081 } 3082 #endif 3083 objp += obj_offset(cachep); 3084 if (cachep->ctor && cachep->flags & SLAB_POISON) 3085 cachep->ctor(objp); 3086 #if ARCH_SLAB_MINALIGN 3087 if ((u32)objp & (ARCH_SLAB_MINALIGN-1)) { 3088 printk(KERN_ERR "0x%p: not aligned to ARCH_SLAB_MINALIGN=%d\n", 3089 objp, ARCH_SLAB_MINALIGN); 3090 } 3091 #endif 3092 return objp; 3093 } 3094 #else 3095 #define cache_alloc_debugcheck_after(a,b,objp,d) (objp) 3096 #endif 3097 3098 static bool slab_should_failslab(struct kmem_cache *cachep, gfp_t flags) 3099 { 3100 if (cachep == &cache_cache) 3101 return false; 3102 3103 return should_failslab(obj_size(cachep), flags, cachep->flags); 3104 } 3105 3106 static inline void *____cache_alloc(struct kmem_cache *cachep, gfp_t flags) 3107 { 3108 void *objp; 3109 struct array_cache *ac; 3110 3111 check_irq_off(); 3112 3113 ac = cpu_cache_get(cachep); 3114 if (likely(ac->avail)) { 3115 STATS_INC_ALLOCHIT(cachep); 3116 ac->touched = 1; 3117 objp = ac->entry[--ac->avail]; 3118 } else { 3119 STATS_INC_ALLOCMISS(cachep); 3120 objp = cache_alloc_refill(cachep, flags); 3121 /* 3122 * the 'ac' may be updated by cache_alloc_refill(), 3123 * and kmemleak_erase() requires its correct value. 3124 */ 3125 ac = cpu_cache_get(cachep); 3126 } 3127 /* 3128 * To avoid a false negative, if an object that is in one of the 3129 * per-CPU caches is leaked, we need to make sure kmemleak doesn't 3130 * treat the array pointers as a reference to the object. 3131 */ 3132 if (objp) 3133 kmemleak_erase(&ac->entry[ac->avail]); 3134 return objp; 3135 } 3136 3137 #ifdef CONFIG_NUMA 3138 /* 3139 * Try allocating on another node if PF_SPREAD_SLAB|PF_MEMPOLICY. 3140 * 3141 * If we are in_interrupt, then process context, including cpusets and 3142 * mempolicy, may not apply and should not be used for allocation policy. 
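 *
 * (Example - illustrative only: a task in a cpuset with
 *  memory_spread_slab enabled has its slab pages rotated over the
 *  cpuset's nodes via cpuset_mem_spread_node(), and a task with an
 *  interleave mempolicy is steered by slab_node(); whenever the chosen
 *  node differs from the local one the allocation is redirected through
 *  ____cache_alloc_node().)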
3143 */ 3144 static void *alternate_node_alloc(struct kmem_cache *cachep, gfp_t flags) 3145 { 3146 int nid_alloc, nid_here; 3147 3148 if (in_interrupt() || (flags & __GFP_THISNODE)) 3149 return NULL; 3150 nid_alloc = nid_here = numa_node_id(); 3151 if (cpuset_do_slab_mem_spread() && (cachep->flags & SLAB_MEM_SPREAD)) 3152 nid_alloc = cpuset_mem_spread_node(); 3153 else if (current->mempolicy) 3154 nid_alloc = slab_node(current->mempolicy); 3155 if (nid_alloc != nid_here) 3156 return ____cache_alloc_node(cachep, flags, nid_alloc); 3157 return NULL; 3158 } 3159 3160 /* 3161 * Fallback function if there was no memory available and no objects on a 3162 * certain node and fall back is permitted. First we scan all the 3163 * available nodelists for available objects. If that fails then we 3164 * perform an allocation without specifying a node. This allows the page 3165 * allocator to do its reclaim / fallback magic. We then insert the 3166 * slab into the proper nodelist and then allocate from it. 3167 */ 3168 static void *fallback_alloc(struct kmem_cache *cache, gfp_t flags) 3169 { 3170 struct zonelist *zonelist; 3171 gfp_t local_flags; 3172 struct zoneref *z; 3173 struct zone *zone; 3174 enum zone_type high_zoneidx = gfp_zone(flags); 3175 void *obj = NULL; 3176 int nid; 3177 3178 if (flags & __GFP_THISNODE) 3179 return NULL; 3180 3181 zonelist = node_zonelist(slab_node(current->mempolicy), flags); 3182 local_flags = flags & (GFP_CONSTRAINT_MASK|GFP_RECLAIM_MASK); 3183 3184 retry: 3185 /* 3186 * Look through allowed nodes for objects available 3187 * from existing per node queues. 3188 */ 3189 for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) { 3190 nid = zone_to_nid(zone); 3191 3192 if (cpuset_zone_allowed_hardwall(zone, flags) && 3193 cache->nodelists[nid] && 3194 cache->nodelists[nid]->free_objects) { 3195 obj = ____cache_alloc_node(cache, 3196 flags | GFP_THISNODE, nid); 3197 if (obj) 3198 break; 3199 } 3200 } 3201 3202 if (!obj) { 3203 /* 3204 * This allocation will be performed within the constraints 3205 * of the current cpuset / memory policy requirements. 3206 * We may trigger various forms of reclaim on the allowed 3207 * set and go into memory reserves if necessary. 3208 */ 3209 if (local_flags & __GFP_WAIT) 3210 local_irq_enable(); 3211 kmem_flagcheck(cache, flags); 3212 obj = kmem_getpages(cache, local_flags, numa_node_id()); 3213 if (local_flags & __GFP_WAIT) 3214 local_irq_disable(); 3215 if (obj) { 3216 /* 3217 * Insert into the appropriate per node queues 3218 */ 3219 nid = page_to_nid(virt_to_page(obj)); 3220 if (cache_grow(cache, flags, nid, obj)) { 3221 obj = ____cache_alloc_node(cache, 3222 flags | GFP_THISNODE, nid); 3223 if (!obj) 3224 /* 3225 * Another processor may allocate the 3226 * objects in the slab since we are 3227 * not holding any locks. 
3228 */ 3229 goto retry; 3230 } else { 3231 /* cache_grow already freed obj */ 3232 obj = NULL; 3233 } 3234 } 3235 } 3236 return obj; 3237 } 3238 3239 /* 3240 * A interface to enable slab creation on nodeid 3241 */ 3242 static void *____cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, 3243 int nodeid) 3244 { 3245 struct list_head *entry; 3246 struct slab *slabp; 3247 struct kmem_list3 *l3; 3248 void *obj; 3249 int x; 3250 3251 l3 = cachep->nodelists[nodeid]; 3252 BUG_ON(!l3); 3253 3254 retry: 3255 check_irq_off(); 3256 spin_lock(&l3->list_lock); 3257 entry = l3->slabs_partial.next; 3258 if (entry == &l3->slabs_partial) { 3259 l3->free_touched = 1; 3260 entry = l3->slabs_free.next; 3261 if (entry == &l3->slabs_free) 3262 goto must_grow; 3263 } 3264 3265 slabp = list_entry(entry, struct slab, list); 3266 check_spinlock_acquired_node(cachep, nodeid); 3267 check_slabp(cachep, slabp); 3268 3269 STATS_INC_NODEALLOCS(cachep); 3270 STATS_INC_ACTIVE(cachep); 3271 STATS_SET_HIGH(cachep); 3272 3273 BUG_ON(slabp->inuse == cachep->num); 3274 3275 obj = slab_get_obj(cachep, slabp, nodeid); 3276 check_slabp(cachep, slabp); 3277 l3->free_objects--; 3278 /* move slabp to correct slabp list: */ 3279 list_del(&slabp->list); 3280 3281 if (slabp->free == BUFCTL_END) 3282 list_add(&slabp->list, &l3->slabs_full); 3283 else 3284 list_add(&slabp->list, &l3->slabs_partial); 3285 3286 spin_unlock(&l3->list_lock); 3287 goto done; 3288 3289 must_grow: 3290 spin_unlock(&l3->list_lock); 3291 x = cache_grow(cachep, flags | GFP_THISNODE, nodeid, NULL); 3292 if (x) 3293 goto retry; 3294 3295 return fallback_alloc(cachep, flags); 3296 3297 done: 3298 return obj; 3299 } 3300 3301 /** 3302 * kmem_cache_alloc_node - Allocate an object on the specified node 3303 * @cachep: The cache to allocate from. 3304 * @flags: See kmalloc(). 3305 * @nodeid: node number of the target node. 3306 * @caller: return address of caller, used for debug information 3307 * 3308 * Identical to kmem_cache_alloc but it will allocate memory on the given 3309 * node, which can improve the performance for cpu bound structures. 3310 * 3311 * Fallback to other node is possible if __GFP_THISNODE is not set. 3312 */ 3313 static __always_inline void * 3314 __cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid, 3315 void *caller) 3316 { 3317 unsigned long save_flags; 3318 void *ptr; 3319 3320 flags &= gfp_allowed_mask; 3321 3322 lockdep_trace_alloc(flags); 3323 3324 if (slab_should_failslab(cachep, flags)) 3325 return NULL; 3326 3327 cache_alloc_debugcheck_before(cachep, flags); 3328 local_irq_save(save_flags); 3329 3330 if (nodeid == -1) 3331 nodeid = numa_node_id(); 3332 3333 if (unlikely(!cachep->nodelists[nodeid])) { 3334 /* Node not bootstrapped yet */ 3335 ptr = fallback_alloc(cachep, flags); 3336 goto out; 3337 } 3338 3339 if (nodeid == numa_node_id()) { 3340 /* 3341 * Use the locally cached objects if possible. 3342 * However ____cache_alloc does not allow fallback 3343 * to other nodes. It may fail while we still have 3344 * objects on other nodes available. 
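 *
 * (The exported kmem_cache_alloc_node() below funnels into this path; a
 *  typical caller - hypothetical names - allocates per-node state with
 *  something like:
 *
 *	s->node[nid] = kmem_cache_alloc_node(s->cache, GFP_KERNEL, nid);
 *
 *  and relies on the fallback handling above when the node has not been
 *  bootstrapped yet.)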
3345 */ 3346 ptr = ____cache_alloc(cachep, flags); 3347 if (ptr) 3348 goto out; 3349 } 3350 /* ___cache_alloc_node can fall back to other nodes */ 3351 ptr = ____cache_alloc_node(cachep, flags, nodeid); 3352 out: 3353 local_irq_restore(save_flags); 3354 ptr = cache_alloc_debugcheck_after(cachep, flags, ptr, caller); 3355 kmemleak_alloc_recursive(ptr, obj_size(cachep), 1, cachep->flags, 3356 flags); 3357 3358 if (likely(ptr)) 3359 kmemcheck_slab_alloc(cachep, flags, ptr, obj_size(cachep)); 3360 3361 if (unlikely((flags & __GFP_ZERO) && ptr)) 3362 memset(ptr, 0, obj_size(cachep)); 3363 3364 return ptr; 3365 } 3366 3367 static __always_inline void * 3368 __do_cache_alloc(struct kmem_cache *cache, gfp_t flags) 3369 { 3370 void *objp; 3371 3372 if (unlikely(current->flags & (PF_SPREAD_SLAB | PF_MEMPOLICY))) { 3373 objp = alternate_node_alloc(cache, flags); 3374 if (objp) 3375 goto out; 3376 } 3377 objp = ____cache_alloc(cache, flags); 3378 3379 /* 3380 * We may just have run out of memory on the local node. 3381 * ____cache_alloc_node() knows how to locate memory on other nodes 3382 */ 3383 if (!objp) 3384 objp = ____cache_alloc_node(cache, flags, numa_node_id()); 3385 3386 out: 3387 return objp; 3388 } 3389 #else 3390 3391 static __always_inline void * 3392 __do_cache_alloc(struct kmem_cache *cachep, gfp_t flags) 3393 { 3394 return ____cache_alloc(cachep, flags); 3395 } 3396 3397 #endif /* CONFIG_NUMA */ 3398 3399 static __always_inline void * 3400 __cache_alloc(struct kmem_cache *cachep, gfp_t flags, void *caller) 3401 { 3402 unsigned long save_flags; 3403 void *objp; 3404 3405 flags &= gfp_allowed_mask; 3406 3407 lockdep_trace_alloc(flags); 3408 3409 if (slab_should_failslab(cachep, flags)) 3410 return NULL; 3411 3412 cache_alloc_debugcheck_before(cachep, flags); 3413 local_irq_save(save_flags); 3414 objp = __do_cache_alloc(cachep, flags); 3415 local_irq_restore(save_flags); 3416 objp = cache_alloc_debugcheck_after(cachep, flags, objp, caller); 3417 kmemleak_alloc_recursive(objp, obj_size(cachep), 1, cachep->flags, 3418 flags); 3419 prefetchw(objp); 3420 3421 if (likely(objp)) 3422 kmemcheck_slab_alloc(cachep, flags, objp, obj_size(cachep)); 3423 3424 if (unlikely((flags & __GFP_ZERO) && objp)) 3425 memset(objp, 0, obj_size(cachep)); 3426 3427 return objp; 3428 } 3429 3430 /* 3431 * Caller needs to acquire correct kmem_list's list_lock 3432 */ 3433 static void free_block(struct kmem_cache *cachep, void **objpp, int nr_objects, 3434 int node) 3435 { 3436 int i; 3437 struct kmem_list3 *l3; 3438 3439 for (i = 0; i < nr_objects; i++) { 3440 void *objp = objpp[i]; 3441 struct slab *slabp; 3442 3443 slabp = virt_to_slab(objp); 3444 l3 = cachep->nodelists[node]; 3445 list_del(&slabp->list); 3446 check_spinlock_acquired_node(cachep, node); 3447 check_slabp(cachep, slabp); 3448 slab_put_obj(cachep, slabp, objp, node); 3449 STATS_DEC_ACTIVE(cachep); 3450 l3->free_objects++; 3451 check_slabp(cachep, slabp); 3452 3453 /* fixup slab chains */ 3454 if (slabp->inuse == 0) { 3455 if (l3->free_objects > l3->free_limit) { 3456 l3->free_objects -= cachep->num; 3457 /* No need to drop any previously held 3458 * lock here, even if we have a off-slab slab 3459 * descriptor it is guaranteed to come from 3460 * a different cache, refer to comments before 3461 * alloc_slabmgmt. 
3462 */ 3463 slab_destroy(cachep, slabp); 3464 } else { 3465 list_add(&slabp->list, &l3->slabs_free); 3466 } 3467 } else { 3468 /* Unconditionally move a slab to the end of the 3469 * partial list on free - maximum time for the 3470 * other objects to be freed, too. 3471 */ 3472 list_add_tail(&slabp->list, &l3->slabs_partial); 3473 } 3474 } 3475 } 3476 3477 static void cache_flusharray(struct kmem_cache *cachep, struct array_cache *ac) 3478 { 3479 int batchcount; 3480 struct kmem_list3 *l3; 3481 int node = numa_node_id(); 3482 3483 batchcount = ac->batchcount; 3484 #if DEBUG 3485 BUG_ON(!batchcount || batchcount > ac->avail); 3486 #endif 3487 check_irq_off(); 3488 l3 = cachep->nodelists[node]; 3489 spin_lock(&l3->list_lock); 3490 if (l3->shared) { 3491 struct array_cache *shared_array = l3->shared; 3492 int max = shared_array->limit - shared_array->avail; 3493 if (max) { 3494 if (batchcount > max) 3495 batchcount = max; 3496 memcpy(&(shared_array->entry[shared_array->avail]), 3497 ac->entry, sizeof(void *) * batchcount); 3498 shared_array->avail += batchcount; 3499 goto free_done; 3500 } 3501 } 3502 3503 free_block(cachep, ac->entry, batchcount, node); 3504 free_done: 3505 #if STATS 3506 { 3507 int i = 0; 3508 struct list_head *p; 3509 3510 p = l3->slabs_free.next; 3511 while (p != &(l3->slabs_free)) { 3512 struct slab *slabp; 3513 3514 slabp = list_entry(p, struct slab, list); 3515 BUG_ON(slabp->inuse); 3516 3517 i++; 3518 p = p->next; 3519 } 3520 STATS_SET_FREEABLE(cachep, i); 3521 } 3522 #endif 3523 spin_unlock(&l3->list_lock); 3524 ac->avail -= batchcount; 3525 memmove(ac->entry, &(ac->entry[batchcount]), sizeof(void *)*ac->avail); 3526 } 3527 3528 /* 3529 * Release an obj back to its cache. If the obj has a constructed state, it must 3530 * be in this state _before_ it is released. Called with disabled ints. 3531 */ 3532 static inline void __cache_free(struct kmem_cache *cachep, void *objp) 3533 { 3534 struct array_cache *ac = cpu_cache_get(cachep); 3535 3536 check_irq_off(); 3537 kmemleak_free_recursive(objp, cachep->flags); 3538 objp = cache_free_debugcheck(cachep, objp, __builtin_return_address(0)); 3539 3540 kmemcheck_slab_free(cachep, objp, obj_size(cachep)); 3541 3542 /* 3543 * Skip calling cache_free_alien() when the platform is not numa. 3544 * This will avoid cache misses that happen while accessing slabp (which 3545 * is per page memory reference) to get nodeid. Instead use a global 3546 * variable to skip the call, which is mostly likely to be present in 3547 * the cache. 3548 */ 3549 if (nr_online_nodes > 1 && cache_free_alien(cachep, objp)) 3550 return; 3551 3552 if (likely(ac->avail < ac->limit)) { 3553 STATS_INC_FREEHIT(cachep); 3554 ac->entry[ac->avail++] = objp; 3555 return; 3556 } else { 3557 STATS_INC_FREEMISS(cachep); 3558 cache_flusharray(cachep, ac); 3559 ac->entry[ac->avail++] = objp; 3560 } 3561 } 3562 3563 /** 3564 * kmem_cache_alloc - Allocate an object 3565 * @cachep: The cache to allocate from. 3566 * @flags: See kmalloc(). 3567 * 3568 * Allocate an object from this cache. The flags are only relevant 3569 * if the cache has no available objects. 
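 *
 * Minimal usage sketch (hypothetical 'foo' names, error handling elided):
 *
 *	struct kmem_cache *foo_cache;
 *	struct foo *f;
 *
 *	foo_cache = kmem_cache_create("foo", sizeof(struct foo), 0,
 *				      SLAB_HWCACHE_ALIGN, NULL);
 *	f = kmem_cache_alloc(foo_cache, GFP_KERNEL);
 *	...
 *	kmem_cache_free(foo_cache, f);
 *	kmem_cache_destroy(foo_cache);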
3570 */ 3571 void *kmem_cache_alloc(struct kmem_cache *cachep, gfp_t flags) 3572 { 3573 void *ret = __cache_alloc(cachep, flags, __builtin_return_address(0)); 3574 3575 trace_kmem_cache_alloc(_RET_IP_, ret, 3576 obj_size(cachep), cachep->buffer_size, flags); 3577 3578 return ret; 3579 } 3580 EXPORT_SYMBOL(kmem_cache_alloc); 3581 3582 #ifdef CONFIG_TRACING 3583 void *kmem_cache_alloc_notrace(struct kmem_cache *cachep, gfp_t flags) 3584 { 3585 return __cache_alloc(cachep, flags, __builtin_return_address(0)); 3586 } 3587 EXPORT_SYMBOL(kmem_cache_alloc_notrace); 3588 #endif 3589 3590 /** 3591 * kmem_ptr_validate - check if an untrusted pointer might be a slab entry. 3592 * @cachep: the cache we're checking against 3593 * @ptr: pointer to validate 3594 * 3595 * This verifies that the untrusted pointer looks sane; 3596 * it is _not_ a guarantee that the pointer is actually 3597 * part of the slab cache in question, but it at least 3598 * validates that the pointer can be dereferenced and 3599 * looks half-way sane. 3600 * 3601 * Currently only used for dentry validation. 3602 */ 3603 int kmem_ptr_validate(struct kmem_cache *cachep, const void *ptr) 3604 { 3605 unsigned long size = cachep->buffer_size; 3606 struct page *page; 3607 3608 if (unlikely(!kern_ptr_validate(ptr, size))) 3609 goto out; 3610 page = virt_to_page(ptr); 3611 if (unlikely(!PageSlab(page))) 3612 goto out; 3613 if (unlikely(page_get_cache(page) != cachep)) 3614 goto out; 3615 return 1; 3616 out: 3617 return 0; 3618 } 3619 3620 #ifdef CONFIG_NUMA 3621 void *kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid) 3622 { 3623 void *ret = __cache_alloc_node(cachep, flags, nodeid, 3624 __builtin_return_address(0)); 3625 3626 trace_kmem_cache_alloc_node(_RET_IP_, ret, 3627 obj_size(cachep), cachep->buffer_size, 3628 flags, nodeid); 3629 3630 return ret; 3631 } 3632 EXPORT_SYMBOL(kmem_cache_alloc_node); 3633 3634 #ifdef CONFIG_TRACING 3635 void *kmem_cache_alloc_node_notrace(struct kmem_cache *cachep, 3636 gfp_t flags, 3637 int nodeid) 3638 { 3639 return __cache_alloc_node(cachep, flags, nodeid, 3640 __builtin_return_address(0)); 3641 } 3642 EXPORT_SYMBOL(kmem_cache_alloc_node_notrace); 3643 #endif 3644 3645 static __always_inline void * 3646 __do_kmalloc_node(size_t size, gfp_t flags, int node, void *caller) 3647 { 3648 struct kmem_cache *cachep; 3649 void *ret; 3650 3651 cachep = kmem_find_general_cachep(size, flags); 3652 if (unlikely(ZERO_OR_NULL_PTR(cachep))) 3653 return cachep; 3654 ret = kmem_cache_alloc_node_notrace(cachep, flags, node); 3655 3656 trace_kmalloc_node((unsigned long) caller, ret, 3657 size, cachep->buffer_size, flags, node); 3658 3659 return ret; 3660 } 3661 3662 #if defined(CONFIG_DEBUG_SLAB) || defined(CONFIG_TRACING) 3663 void *__kmalloc_node(size_t size, gfp_t flags, int node) 3664 { 3665 return __do_kmalloc_node(size, flags, node, 3666 __builtin_return_address(0)); 3667 } 3668 EXPORT_SYMBOL(__kmalloc_node); 3669 3670 void *__kmalloc_node_track_caller(size_t size, gfp_t flags, 3671 int node, unsigned long caller) 3672 { 3673 return __do_kmalloc_node(size, flags, node, (void *)caller); 3674 } 3675 EXPORT_SYMBOL(__kmalloc_node_track_caller); 3676 #else 3677 void *__kmalloc_node(size_t size, gfp_t flags, int node) 3678 { 3679 return __do_kmalloc_node(size, flags, node, NULL); 3680 } 3681 EXPORT_SYMBOL(__kmalloc_node); 3682 #endif /* CONFIG_DEBUG_SLAB || CONFIG_TRACING */ 3683 #endif /* CONFIG_NUMA */ 3684 3685 /** 3686 * __do_kmalloc - allocate memory 3687 * @size: how many bytes of memory are 
required. 3688 * @flags: the type of memory to allocate (see kmalloc). 3689 * @caller: function caller for debug tracking of the caller 3690 */ 3691 static __always_inline void *__do_kmalloc(size_t size, gfp_t flags, 3692 void *caller) 3693 { 3694 struct kmem_cache *cachep; 3695 void *ret; 3696 3697 /* If you want to save a few bytes .text space: replace 3698 * __ with kmem_. 3699 * Then kmalloc uses the uninlined functions instead of the inline 3700 * functions. 3701 */ 3702 cachep = __find_general_cachep(size, flags); 3703 if (unlikely(ZERO_OR_NULL_PTR(cachep))) 3704 return cachep; 3705 ret = __cache_alloc(cachep, flags, caller); 3706 3707 trace_kmalloc((unsigned long) caller, ret, 3708 size, cachep->buffer_size, flags); 3709 3710 return ret; 3711 } 3712 3713 3714 #if defined(CONFIG_DEBUG_SLAB) || defined(CONFIG_TRACING) 3715 void *__kmalloc(size_t size, gfp_t flags) 3716 { 3717 return __do_kmalloc(size, flags, __builtin_return_address(0)); 3718 } 3719 EXPORT_SYMBOL(__kmalloc); 3720 3721 void *__kmalloc_track_caller(size_t size, gfp_t flags, unsigned long caller) 3722 { 3723 return __do_kmalloc(size, flags, (void *)caller); 3724 } 3725 EXPORT_SYMBOL(__kmalloc_track_caller); 3726 3727 #else 3728 void *__kmalloc(size_t size, gfp_t flags) 3729 { 3730 return __do_kmalloc(size, flags, NULL); 3731 } 3732 EXPORT_SYMBOL(__kmalloc); 3733 #endif 3734 3735 /** 3736 * kmem_cache_free - Deallocate an object 3737 * @cachep: The cache the allocation was from. 3738 * @objp: The previously allocated object. 3739 * 3740 * Free an object which was previously allocated from this 3741 * cache. 3742 */ 3743 void kmem_cache_free(struct kmem_cache *cachep, void *objp) 3744 { 3745 unsigned long flags; 3746 3747 local_irq_save(flags); 3748 debug_check_no_locks_freed(objp, obj_size(cachep)); 3749 if (!(cachep->flags & SLAB_DEBUG_OBJECTS)) 3750 debug_check_no_obj_freed(objp, obj_size(cachep)); 3751 __cache_free(cachep, objp); 3752 local_irq_restore(flags); 3753 3754 trace_kmem_cache_free(_RET_IP_, objp); 3755 } 3756 EXPORT_SYMBOL(kmem_cache_free); 3757 3758 /** 3759 * kfree - free previously allocated memory 3760 * @objp: pointer returned by kmalloc. 3761 * 3762 * If @objp is NULL, no operation is performed. 3763 * 3764 * Don't free memory not originally allocated by kmalloc() 3765 * or you will run into trouble. 3766 */ 3767 void kfree(const void *objp) 3768 { 3769 struct kmem_cache *c; 3770 unsigned long flags; 3771 3772 trace_kfree(_RET_IP_, objp); 3773 3774 if (unlikely(ZERO_OR_NULL_PTR(objp))) 3775 return; 3776 local_irq_save(flags); 3777 kfree_debugcheck(objp); 3778 c = virt_to_cache(objp); 3779 debug_check_no_locks_freed(objp, obj_size(c)); 3780 debug_check_no_obj_freed(objp, obj_size(c)); 3781 __cache_free(c, (void *)objp); 3782 local_irq_restore(flags); 3783 } 3784 EXPORT_SYMBOL(kfree); 3785 3786 unsigned int kmem_cache_size(struct kmem_cache *cachep) 3787 { 3788 return obj_size(cachep); 3789 } 3790 EXPORT_SYMBOL(kmem_cache_size); 3791 3792 const char *kmem_cache_name(struct kmem_cache *cachep) 3793 { 3794 return cachep->name; 3795 } 3796 EXPORT_SYMBOL_GPL(kmem_cache_name); 3797 3798 /* 3799 * This initializes kmem_list3 or resizes various caches for all nodes. 
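 *
 * (The per-node free_limit computed below, (1 + nr_cpus_node(node)) *
 *  batchcount + num, is the threshold above which free_block() destroys
 *  empty slabs instead of parking them on slabs_free; e.g. a node with 3
 *  CPUs, a batchcount of 60 and 30 objects per slab caches roughly up to
 *  270 free objects.)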
3800 */ 3801 static int alloc_kmemlist(struct kmem_cache *cachep, gfp_t gfp) 3802 { 3803 int node; 3804 struct kmem_list3 *l3; 3805 struct array_cache *new_shared; 3806 struct array_cache **new_alien = NULL; 3807 3808 for_each_online_node(node) { 3809 3810 if (use_alien_caches) { 3811 new_alien = alloc_alien_cache(node, cachep->limit, gfp); 3812 if (!new_alien) 3813 goto fail; 3814 } 3815 3816 new_shared = NULL; 3817 if (cachep->shared) { 3818 new_shared = alloc_arraycache(node, 3819 cachep->shared*cachep->batchcount, 3820 0xbaadf00d, gfp); 3821 if (!new_shared) { 3822 free_alien_cache(new_alien); 3823 goto fail; 3824 } 3825 } 3826 3827 l3 = cachep->nodelists[node]; 3828 if (l3) { 3829 struct array_cache *shared = l3->shared; 3830 3831 spin_lock_irq(&l3->list_lock); 3832 3833 if (shared) 3834 free_block(cachep, shared->entry, 3835 shared->avail, node); 3836 3837 l3->shared = new_shared; 3838 if (!l3->alien) { 3839 l3->alien = new_alien; 3840 new_alien = NULL; 3841 } 3842 l3->free_limit = (1 + nr_cpus_node(node)) * 3843 cachep->batchcount + cachep->num; 3844 spin_unlock_irq(&l3->list_lock); 3845 kfree(shared); 3846 free_alien_cache(new_alien); 3847 continue; 3848 } 3849 l3 = kmalloc_node(sizeof(struct kmem_list3), gfp, node); 3850 if (!l3) { 3851 free_alien_cache(new_alien); 3852 kfree(new_shared); 3853 goto fail; 3854 } 3855 3856 kmem_list3_init(l3); 3857 l3->next_reap = jiffies + REAPTIMEOUT_LIST3 + 3858 ((unsigned long)cachep) % REAPTIMEOUT_LIST3; 3859 l3->shared = new_shared; 3860 l3->alien = new_alien; 3861 l3->free_limit = (1 + nr_cpus_node(node)) * 3862 cachep->batchcount + cachep->num; 3863 cachep->nodelists[node] = l3; 3864 } 3865 return 0; 3866 3867 fail: 3868 if (!cachep->next.next) { 3869 /* Cache is not active yet. Roll back what we did */ 3870 node--; 3871 while (node >= 0) { 3872 if (cachep->nodelists[node]) { 3873 l3 = cachep->nodelists[node]; 3874 3875 kfree(l3->shared); 3876 free_alien_cache(l3->alien); 3877 kfree(l3); 3878 cachep->nodelists[node] = NULL; 3879 } 3880 node--; 3881 } 3882 } 3883 return -ENOMEM; 3884 } 3885 3886 struct ccupdate_struct { 3887 struct kmem_cache *cachep; 3888 struct array_cache *new[NR_CPUS]; 3889 }; 3890 3891 static void do_ccupdate_local(void *info) 3892 { 3893 struct ccupdate_struct *new = info; 3894 struct array_cache *old; 3895 3896 check_irq_off(); 3897 old = cpu_cache_get(new->cachep); 3898 3899 new->cachep->array[smp_processor_id()] = new->new[smp_processor_id()]; 3900 new->new[smp_processor_id()] = old; 3901 } 3902 3903 /* Always called with the cache_chain_mutex held */ 3904 static int do_tune_cpucache(struct kmem_cache *cachep, int limit, 3905 int batchcount, int shared, gfp_t gfp) 3906 { 3907 struct ccupdate_struct *new; 3908 int i; 3909 3910 new = kzalloc(sizeof(*new), gfp); 3911 if (!new) 3912 return -ENOMEM; 3913 3914 for_each_online_cpu(i) { 3915 new->new[i] = alloc_arraycache(cpu_to_node(i), limit, 3916 batchcount, gfp); 3917 if (!new->new[i]) { 3918 for (i--; i >= 0; i--) 3919 kfree(new->new[i]); 3920 kfree(new); 3921 return -ENOMEM; 3922 } 3923 } 3924 new->cachep = cachep; 3925 3926 on_each_cpu(do_ccupdate_local, (void *)new, 1); 3927 3928 check_irq_on(); 3929 cachep->batchcount = batchcount; 3930 cachep->limit = limit; 3931 cachep->shared = shared; 3932 3933 for_each_online_cpu(i) { 3934 struct array_cache *ccold = new->new[i]; 3935 if (!ccold) 3936 continue; 3937 spin_lock_irq(&cachep->nodelists[cpu_to_node(i)]->list_lock); 3938 free_block(cachep, ccold->entry, ccold->avail, cpu_to_node(i)); 3939 
spin_unlock_irq(&cachep->nodelists[cpu_to_node(i)]->list_lock); 3940 kfree(ccold); 3941 } 3942 kfree(new); 3943 return alloc_kmemlist(cachep, gfp); 3944 } 3945 3946 /* Called with cache_chain_mutex held always */ 3947 static int enable_cpucache(struct kmem_cache *cachep, gfp_t gfp) 3948 { 3949 int err; 3950 int limit, shared; 3951 3952 /* 3953 * The head array serves three purposes: 3954 * - create a LIFO ordering, i.e. return objects that are cache-warm 3955 * - reduce the number of spinlock operations. 3956 * - reduce the number of linked list operations on the slab and 3957 * bufctl chains: array operations are cheaper. 3958 * The numbers are guessed, we should auto-tune as described by 3959 * Bonwick. 3960 */ 3961 if (cachep->buffer_size > 131072) 3962 limit = 1; 3963 else if (cachep->buffer_size > PAGE_SIZE) 3964 limit = 8; 3965 else if (cachep->buffer_size > 1024) 3966 limit = 24; 3967 else if (cachep->buffer_size > 256) 3968 limit = 54; 3969 else 3970 limit = 120; 3971 3972 /* 3973 * CPU bound tasks (e.g. network routing) can exhibit cpu bound 3974 * allocation behaviour: Most allocs on one cpu, most free operations 3975 * on another cpu. For these cases, an efficient object passing between 3976 * cpus is necessary. This is provided by a shared array. The array 3977 * replaces Bonwick's magazine layer. 3978 * On uniprocessor, it's functionally equivalent (but less efficient) 3979 * to a larger limit. Thus disabled by default. 3980 */ 3981 shared = 0; 3982 if (cachep->buffer_size <= PAGE_SIZE && num_possible_cpus() > 1) 3983 shared = 8; 3984 3985 #if DEBUG 3986 /* 3987 * With debugging enabled, large batchcount lead to excessively long 3988 * periods with disabled local interrupts. Limit the batchcount 3989 */ 3990 if (limit > 32) 3991 limit = 32; 3992 #endif 3993 err = do_tune_cpucache(cachep, limit, (limit + 1) / 2, shared, gfp); 3994 if (err) 3995 printk(KERN_ERR "enable_cpucache failed for %s, error %d.\n", 3996 cachep->name, -err); 3997 return err; 3998 } 3999 4000 /* 4001 * Drain an array if it contains any elements taking the l3 lock only if 4002 * necessary. Note that the l3 listlock also protects the array_cache 4003 * if drain_array() is used on the shared array. 4004 */ 4005 void drain_array(struct kmem_cache *cachep, struct kmem_list3 *l3, 4006 struct array_cache *ac, int force, int node) 4007 { 4008 int tofree; 4009 4010 if (!ac || !ac->avail) 4011 return; 4012 if (ac->touched && !force) { 4013 ac->touched = 0; 4014 } else { 4015 spin_lock_irq(&l3->list_lock); 4016 if (ac->avail) { 4017 tofree = force ? ac->avail : (ac->limit + 4) / 5; 4018 if (tofree > ac->avail) 4019 tofree = (ac->avail + 1) / 2; 4020 free_block(cachep, ac->entry, tofree, node); 4021 ac->avail -= tofree; 4022 memmove(ac->entry, &(ac->entry[tofree]), 4023 sizeof(void *) * ac->avail); 4024 } 4025 spin_unlock_irq(&l3->list_lock); 4026 } 4027 } 4028 4029 /** 4030 * cache_reap - Reclaim memory from caches. 4031 * @w: work descriptor 4032 * 4033 * Called from workqueue/eventd every few seconds. 4034 * Purpose: 4035 * - clear the per-cpu caches for this CPU. 4036 * - return freeable pages to the main free memory pool. 4037 * 4038 * If we cannot acquire the cache chain mutex then just give up - we'll try 4039 * again on the next iteration. 
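 *
 * For each cache on the chain this reaps alien caches for the current node,
 * drains the calling CPU's array cache and, once the cache's per-node reap
 * timeout has expired, also drains the node's shared array and frees a
 * limited number of completely free slabs.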
4040 */ 4041 static void cache_reap(struct work_struct *w) 4042 { 4043 struct kmem_cache *searchp; 4044 struct kmem_list3 *l3; 4045 int node = numa_node_id(); 4046 struct delayed_work *work = to_delayed_work(w); 4047 4048 if (!mutex_trylock(&cache_chain_mutex)) 4049 /* Give up. Setup the next iteration. */ 4050 goto out; 4051 4052 list_for_each_entry(searchp, &cache_chain, next) { 4053 check_irq_on(); 4054 4055 /* 4056 * We only take the l3 lock if absolutely necessary and we 4057 * have established with reasonable certainty that 4058 * we can do some work if the lock was obtained. 4059 */ 4060 l3 = searchp->nodelists[node]; 4061 4062 reap_alien(searchp, l3); 4063 4064 drain_array(searchp, l3, cpu_cache_get(searchp), 0, node); 4065 4066 /* 4067 * These are racy checks but it does not matter 4068 * if we skip one check or scan twice. 4069 */ 4070 if (time_after(l3->next_reap, jiffies)) 4071 goto next; 4072 4073 l3->next_reap = jiffies + REAPTIMEOUT_LIST3; 4074 4075 drain_array(searchp, l3, l3->shared, 0, node); 4076 4077 if (l3->free_touched) 4078 l3->free_touched = 0; 4079 else { 4080 int freed; 4081 4082 freed = drain_freelist(searchp, l3, (l3->free_limit + 4083 5 * searchp->num - 1) / (5 * searchp->num)); 4084 STATS_ADD_REAPED(searchp, freed); 4085 } 4086 next: 4087 cond_resched(); 4088 } 4089 check_irq_on(); 4090 mutex_unlock(&cache_chain_mutex); 4091 next_reap_node(); 4092 out: 4093 /* Set up the next iteration */ 4094 schedule_delayed_work(work, round_jiffies_relative(REAPTIMEOUT_CPUC)); 4095 } 4096 4097 #ifdef CONFIG_SLABINFO 4098 4099 static void print_slabinfo_header(struct seq_file *m) 4100 { 4101 /* 4102 * Output format version, so at least we can change it 4103 * without _too_ many complaints. 4104 */ 4105 #if STATS 4106 seq_puts(m, "slabinfo - version: 2.1 (statistics)\n"); 4107 #else 4108 seq_puts(m, "slabinfo - version: 2.1\n"); 4109 #endif 4110 seq_puts(m, "# name <active_objs> <num_objs> <objsize> " 4111 "<objperslab> <pagesperslab>"); 4112 seq_puts(m, " : tunables <limit> <batchcount> <sharedfactor>"); 4113 seq_puts(m, " : slabdata <active_slabs> <num_slabs> <sharedavail>"); 4114 #if STATS 4115 seq_puts(m, " : globalstat <listallocs> <maxobjs> <grown> <reaped> " 4116 "<error> <maxfreeable> <nodeallocs> <remotefrees> <alienoverflow>"); 4117 seq_puts(m, " : cpustat <allochit> <allocmiss> <freehit> <freemiss>"); 4118 #endif 4119 seq_putc(m, '\n'); 4120 } 4121 4122 static void *s_start(struct seq_file *m, loff_t *pos) 4123 { 4124 loff_t n = *pos; 4125 4126 mutex_lock(&cache_chain_mutex); 4127 if (!n) 4128 print_slabinfo_header(m); 4129 4130 return seq_list_start(&cache_chain, *pos); 4131 } 4132 4133 static void *s_next(struct seq_file *m, void *p, loff_t *pos) 4134 { 4135 return seq_list_next(p, &cache_chain, pos); 4136 } 4137 4138 static void s_stop(struct seq_file *m, void *p) 4139 { 4140 mutex_unlock(&cache_chain_mutex); 4141 } 4142 4143 static int s_show(struct seq_file *m, void *p) 4144 { 4145 struct kmem_cache *cachep = list_entry(p, struct kmem_cache, next); 4146 struct slab *slabp; 4147 unsigned long active_objs; 4148 unsigned long num_objs; 4149 unsigned long active_slabs = 0; 4150 unsigned long num_slabs, free_objects = 0, shared_avail = 0; 4151 const char *name; 4152 char *error = NULL; 4153 int node; 4154 struct kmem_list3 *l3; 4155 4156 active_objs = 0; 4157 num_slabs = 0; 4158 for_each_online_node(node) { 4159 l3 = cachep->nodelists[node]; 4160 if (!l3) 4161 continue; 4162 4163 check_irq_on(); 4164 spin_lock_irq(&l3->list_lock); 4165 4166 
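		/*
		 * Walk the full, partial and free slab lists of this node,
		 * counting slabs and in-use objects; inconsistencies are
		 * reported below as accounting errors.
		 */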
list_for_each_entry(slabp, &l3->slabs_full, list) { 4167 if (slabp->inuse != cachep->num && !error) 4168 error = "slabs_full accounting error"; 4169 active_objs += cachep->num; 4170 active_slabs++; 4171 } 4172 list_for_each_entry(slabp, &l3->slabs_partial, list) { 4173 if (slabp->inuse == cachep->num && !error) 4174 error = "slabs_partial inuse accounting error"; 4175 if (!slabp->inuse && !error) 4176 error = "slabs_partial/inuse accounting error"; 4177 active_objs += slabp->inuse; 4178 active_slabs++; 4179 } 4180 list_for_each_entry(slabp, &l3->slabs_free, list) { 4181 if (slabp->inuse && !error) 4182 error = "slabs_free/inuse accounting error"; 4183 num_slabs++; 4184 } 4185 free_objects += l3->free_objects; 4186 if (l3->shared) 4187 shared_avail += l3->shared->avail; 4188 4189 spin_unlock_irq(&l3->list_lock); 4190 } 4191 num_slabs += active_slabs; 4192 num_objs = num_slabs * cachep->num; 4193 if (num_objs - active_objs != free_objects && !error) 4194 error = "free_objects accounting error"; 4195 4196 name = cachep->name; 4197 if (error) 4198 printk(KERN_ERR "slab: cache %s error: %s\n", name, error); 4199 4200 seq_printf(m, "%-17s %6lu %6lu %6u %4u %4d", 4201 name, active_objs, num_objs, cachep->buffer_size, 4202 cachep->num, (1 << cachep->gfporder)); 4203 seq_printf(m, " : tunables %4u %4u %4u", 4204 cachep->limit, cachep->batchcount, cachep->shared); 4205 seq_printf(m, " : slabdata %6lu %6lu %6lu", 4206 active_slabs, num_slabs, shared_avail); 4207 #if STATS 4208 { /* list3 stats */ 4209 unsigned long high = cachep->high_mark; 4210 unsigned long allocs = cachep->num_allocations; 4211 unsigned long grown = cachep->grown; 4212 unsigned long reaped = cachep->reaped; 4213 unsigned long errors = cachep->errors; 4214 unsigned long max_freeable = cachep->max_freeable; 4215 unsigned long node_allocs = cachep->node_allocs; 4216 unsigned long node_frees = cachep->node_frees; 4217 unsigned long overflows = cachep->node_overflow; 4218 4219 seq_printf(m, " : globalstat %7lu %6lu %5lu %4lu \ 4220 %4lu %4lu %4lu %4lu %4lu", allocs, high, grown, 4221 reaped, errors, max_freeable, node_allocs, 4222 node_frees, overflows); 4223 } 4224 /* cpu stats */ 4225 { 4226 unsigned long allochit = atomic_read(&cachep->allochit); 4227 unsigned long allocmiss = atomic_read(&cachep->allocmiss); 4228 unsigned long freehit = atomic_read(&cachep->freehit); 4229 unsigned long freemiss = atomic_read(&cachep->freemiss); 4230 4231 seq_printf(m, " : cpustat %6lu %6lu %6lu %6lu", 4232 allochit, allocmiss, freehit, freemiss); 4233 } 4234 #endif 4235 seq_putc(m, '\n'); 4236 return 0; 4237 } 4238 4239 /* 4240 * slabinfo_op - iterator that generates /proc/slabinfo 4241 * 4242 * Output layout: 4243 * cache-name 4244 * num-active-objs 4245 * total-objs 4246 * object size 4247 * num-active-slabs 4248 * total-slabs 4249 * num-pages-per-slab 4250 * + further values on SMP and with statistics enabled 4251 */ 4252 4253 static const struct seq_operations slabinfo_op = { 4254 .start = s_start, 4255 .next = s_next, 4256 .stop = s_stop, 4257 .show = s_show, 4258 }; 4259 4260 #define MAX_SLABINFO_WRITE 128 4261 /** 4262 * slabinfo_write - Tuning for the slab allocator 4263 * @file: unused 4264 * @buffer: user buffer 4265 * @count: data length 4266 * @ppos: unused 4267 */ 4268 ssize_t slabinfo_write(struct file *file, const char __user * buffer, 4269 size_t count, loff_t *ppos) 4270 { 4271 char kbuf[MAX_SLABINFO_WRITE + 1], *tmp; 4272 int limit, batchcount, shared, res; 4273 struct kmem_cache *cachep; 4274 4275 if (count > MAX_SLABINFO_WRITE) 
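		/* longer than one "name limit batchcount shared" tuning line */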
4276 return -EINVAL; 4277 if (copy_from_user(&kbuf, buffer, count)) 4278 return -EFAULT; 4279 kbuf[MAX_SLABINFO_WRITE] = '\0'; 4280 4281 tmp = strchr(kbuf, ' '); 4282 if (!tmp) 4283 return -EINVAL; 4284 *tmp = '\0'; 4285 tmp++; 4286 if (sscanf(tmp, " %d %d %d", &limit, &batchcount, &shared) != 3) 4287 return -EINVAL; 4288 4289 /* Find the cache in the chain of caches. */ 4290 mutex_lock(&cache_chain_mutex); 4291 res = -EINVAL; 4292 list_for_each_entry(cachep, &cache_chain, next) { 4293 if (!strcmp(cachep->name, kbuf)) { 4294 if (limit < 1 || batchcount < 1 || 4295 batchcount > limit || shared < 0) { 4296 res = 0; 4297 } else { 4298 res = do_tune_cpucache(cachep, limit, 4299 batchcount, shared, 4300 GFP_KERNEL); 4301 } 4302 break; 4303 } 4304 } 4305 mutex_unlock(&cache_chain_mutex); 4306 if (res >= 0) 4307 res = count; 4308 return res; 4309 } 4310 4311 static int slabinfo_open(struct inode *inode, struct file *file) 4312 { 4313 return seq_open(file, &slabinfo_op); 4314 } 4315 4316 static const struct file_operations proc_slabinfo_operations = { 4317 .open = slabinfo_open, 4318 .read = seq_read, 4319 .write = slabinfo_write, 4320 .llseek = seq_lseek, 4321 .release = seq_release, 4322 }; 4323 4324 #ifdef CONFIG_DEBUG_SLAB_LEAK 4325 4326 static void *leaks_start(struct seq_file *m, loff_t *pos) 4327 { 4328 mutex_lock(&cache_chain_mutex); 4329 return seq_list_start(&cache_chain, *pos); 4330 } 4331 4332 static inline int add_caller(unsigned long *n, unsigned long v) 4333 { 4334 unsigned long *p; 4335 int l; 4336 if (!v) 4337 return 1; 4338 l = n[1]; 4339 p = n + 2; 4340 while (l) { 4341 int i = l/2; 4342 unsigned long *q = p + 2 * i; 4343 if (*q == v) { 4344 q[1]++; 4345 return 1; 4346 } 4347 if (*q > v) { 4348 l = i; 4349 } else { 4350 p = q + 2; 4351 l -= i + 1; 4352 } 4353 } 4354 if (++n[1] == n[0]) 4355 return 0; 4356 memmove(p + 2, p, n[1] * 2 * sizeof(unsigned long) - ((void *)p - (void *)n)); 4357 p[0] = v; 4358 p[1] = 1; 4359 return 1; 4360 } 4361 4362 static void handle_slab(unsigned long *n, struct kmem_cache *c, struct slab *s) 4363 { 4364 void *p; 4365 int i; 4366 if (n[0] == n[1]) 4367 return; 4368 for (i = 0, p = s->s_mem; i < c->num; i++, p += c->buffer_size) { 4369 if (slab_bufctl(s)[i] != BUFCTL_ACTIVE) 4370 continue; 4371 if (!add_caller(n, (unsigned long)*dbg_userword(c, p))) 4372 return; 4373 } 4374 } 4375 4376 static void show_symbol(struct seq_file *m, unsigned long address) 4377 { 4378 #ifdef CONFIG_KALLSYMS 4379 unsigned long offset, size; 4380 char modname[MODULE_NAME_LEN], name[KSYM_NAME_LEN]; 4381 4382 if (lookup_symbol_attrs(address, &size, &offset, modname, name) == 0) { 4383 seq_printf(m, "%s+%#lx/%#lx", name, offset, size); 4384 if (modname[0]) 4385 seq_printf(m, " [%s]", modname); 4386 return; 4387 } 4388 #endif 4389 seq_printf(m, "%p", (void *)address); 4390 } 4391 4392 static int leaks_show(struct seq_file *m, void *p) 4393 { 4394 struct kmem_cache *cachep = list_entry(p, struct kmem_cache, next); 4395 struct slab *slabp; 4396 struct kmem_list3 *l3; 4397 const char *name; 4398 unsigned long *n = m->private; 4399 int node; 4400 int i; 4401 4402 if (!(cachep->flags & SLAB_STORE_USER)) 4403 return 0; 4404 if (!(cachep->flags & SLAB_RED_ZONE)) 4405 return 0; 4406 4407 /* OK, we can do it */ 4408 4409 n[1] = 0; 4410 4411 for_each_online_node(node) { 4412 l3 = cachep->nodelists[node]; 4413 if (!l3) 4414 continue; 4415 4416 check_irq_on(); 4417 spin_lock_irq(&l3->list_lock); 4418 4419 list_for_each_entry(slabp, &l3->slabs_full, list) 4420 handle_slab(n, cachep, slabp); 
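		/*
		 * Partially used slabs hold live objects too; record the
		 * allocation callers stored in them as well.
		 */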
4421 list_for_each_entry(slabp, &l3->slabs_partial, list) 4422 handle_slab(n, cachep, slabp); 4423 spin_unlock_irq(&l3->list_lock); 4424 } 4425 name = cachep->name; 4426 if (n[0] == n[1]) { 4427 /* Increase the buffer size */ 4428 mutex_unlock(&cache_chain_mutex); 4429 m->private = kzalloc(n[0] * 4 * sizeof(unsigned long), GFP_KERNEL); 4430 if (!m->private) { 4431 /* Too bad, we are really out */ 4432 m->private = n; 4433 mutex_lock(&cache_chain_mutex); 4434 return -ENOMEM; 4435 } 4436 *(unsigned long *)m->private = n[0] * 2; 4437 kfree(n); 4438 mutex_lock(&cache_chain_mutex); 4439 /* Now make sure this entry will be retried */ 4440 m->count = m->size; 4441 return 0; 4442 } 4443 for (i = 0; i < n[1]; i++) { 4444 seq_printf(m, "%s: %lu ", name, n[2*i+3]); 4445 show_symbol(m, n[2*i+2]); 4446 seq_putc(m, '\n'); 4447 } 4448 4449 return 0; 4450 } 4451 4452 static const struct seq_operations slabstats_op = { 4453 .start = leaks_start, 4454 .next = s_next, 4455 .stop = s_stop, 4456 .show = leaks_show, 4457 }; 4458 4459 static int slabstats_open(struct inode *inode, struct file *file) 4460 { 4461 unsigned long *n = kzalloc(PAGE_SIZE, GFP_KERNEL); 4462 int ret = -ENOMEM; 4463 if (n) { 4464 ret = seq_open(file, &slabstats_op); 4465 if (!ret) { 4466 struct seq_file *m = file->private_data; 4467 *n = PAGE_SIZE / (2 * sizeof(unsigned long)); 4468 m->private = n; 4469 n = NULL; 4470 } 4471 kfree(n); 4472 } 4473 return ret; 4474 } 4475 4476 static const struct file_operations proc_slabstats_operations = { 4477 .open = slabstats_open, 4478 .read = seq_read, 4479 .llseek = seq_lseek, 4480 .release = seq_release_private, 4481 }; 4482 #endif 4483 4484 static int __init slab_proc_init(void) 4485 { 4486 proc_create("slabinfo",S_IWUSR|S_IRUGO,NULL,&proc_slabinfo_operations); 4487 #ifdef CONFIG_DEBUG_SLAB_LEAK 4488 proc_create("slab_allocators", 0, NULL, &proc_slabstats_operations); 4489 #endif 4490 return 0; 4491 } 4492 module_init(slab_proc_init); 4493 #endif 4494 4495 /** 4496 * ksize - get the actual amount of memory allocated for a given object 4497 * @objp: Pointer to the object 4498 * 4499 * kmalloc may internally round up allocations and return more memory 4500 * than requested. ksize() can be used to determine the actual amount of 4501 * memory allocated. The caller may use this additional memory, even though 4502 * a smaller amount of memory was initially specified with the kmalloc call. 4503 * The caller must guarantee that objp points to a valid object previously 4504 * allocated with either kmalloc() or kmem_cache_alloc(). The object 4505 * must not be freed during the duration of the call. 4506 */ 4507 size_t ksize(const void *objp) 4508 { 4509 BUG_ON(!objp); 4510 if (unlikely(objp == ZERO_SIZE_PTR)) 4511 return 0; 4512 4513 return obj_size(virt_to_cache(objp)); 4514 } 4515 EXPORT_SYMBOL(ksize); 4516
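/*
 * Illustrative sketch, not part of the allocator: one way a caller could use
 * ksize() to exploit the slack space that kmalloc() may have added to an
 * allocation. The helper name below is hypothetical and the block is kept
 * under #if 0 so it serves as documentation only.
 */
#if 0
#include <linux/slab.h>

/*
 * Make sure @buf can hold at least @want bytes. When the original
 * allocation was already rounded up far enough, ksize() lets us reuse it
 * and avoid the krealloc() call entirely.
 */
static void *example_ensure_capacity(void *buf, size_t want, gfp_t gfp)
{
	if (buf && ksize(buf) >= want)
		return buf;
	return krealloc(buf, want, gfp);
}
#endif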