/*
 * linux/mm/slab.c
 * Written by Mark Hemment, 1996/97.
 * (markhe@nextd.demon.co.uk)
 *
 * kmem_cache_destroy() + some cleanup - 1999 Andrea Arcangeli
 *
 * Major cleanup, different bufctl logic, per-cpu arrays
 *	(c) 2000 Manfred Spraul
 *
 * Cleanup, make the head arrays unconditional, preparation for NUMA
 *	(c) 2002 Manfred Spraul
 *
 * An implementation of the Slab Allocator as described in outline in;
 *	UNIX Internals: The New Frontiers by Uresh Vahalia
 *	Pub: Prentice Hall	ISBN 0-13-101908-2
 * or with a little more detail in;
 *	The Slab Allocator: An Object-Caching Kernel Memory Allocator
 *	Jeff Bonwick (Sun Microsystems).
 *	Presented at: USENIX Summer 1994 Technical Conference
 *
 * The memory is organized in caches, one cache for each object type.
 * (e.g. inode_cache, dentry_cache, buffer_head, vm_area_struct)
 * Each cache consists of many slabs (they are small (usually one
 * page long) and always contiguous), and each slab contains multiple
 * initialized objects.
 *
 * This means that your constructor is used only for newly allocated
 * slabs and you must pass objects with the same initializations to
 * kmem_cache_free.
 *
 * Each cache can only support one memory type (GFP_DMA, GFP_HIGHMEM,
 * normal). If you need a special memory type, then you must create a new
 * cache for that memory type.
 *
 * In order to reduce fragmentation, the slabs are sorted into 3 groups:
 *   full slabs with 0 free objects
 *   partial slabs
 *   empty slabs with no allocated objects
 *
 * If partial slabs exist, then new allocations come from these slabs;
 * otherwise they come from empty slabs, or new slabs are allocated.
 *
 * kmem_cache_destroy() CAN CRASH if you try to allocate from the cache
 * during kmem_cache_destroy(). The caller must prevent concurrent allocs.
 *
 * Each cache has a short per-cpu head array, most allocs
 * and frees go into that array, and if that array overflows, then 1/2
 * of the entries in the array are given back into the global cache.
 * The head array is strictly LIFO and should improve the cache hit rates.
 * On SMP, it additionally reduces the spinlock operations.
 *
 * The c_cpuarray may not be read with local interrupts enabled -
 * it's changed with a smp_call_function().
 *
 * SMP synchronization:
 *  constructors and destructors are called without any locking.
 *  Several members in kmem_cache_t and struct slab never change, they
 *	are accessed without any locking.
 *  The per-cpu arrays are never accessed from the wrong cpu, no locking,
 *	and local interrupts are disabled so slab code is preempt-safe.
 *  The non-constant members are protected with a per-cache irq spinlock.
 *
 * Many thanks to Mark Hemment, who wrote another per-cpu slab patch
 * in 2000 - many ideas in the current implementation are derived from
 * his patch.
 *
 * Further notes from the original documentation:
 *
 * 11 April '97.  Started multi-threading - markhe
 *	The global cache-chain is protected by the semaphore 'cache_chain_sem'.
 *	The sem is only needed when accessing/extending the cache-chain, which
 *	can never happen inside an interrupt (kmem_cache_create(),
 *	kmem_cache_shrink() and kmem_cache_reap()).
 *
 *	At present, each engine can be growing a cache.  This should be blocked.
 *
 * 15 March 2005. NUMA slab allocator.
 *	Shai Fultheim <shai@scalex86.org>.
80 * Shobhit Dayal <shobhit@calsoftinc.com> 81 * Alok N Kataria <alokk@calsoftinc.com> 82 * Christoph Lameter <christoph@lameter.com> 83 * 84 * Modified the slab allocator to be node aware on NUMA systems. 85 * Each node has its own list of partial, free and full slabs. 86 * All object allocations for a node occur from node specific slab lists. 87 */ 88 89 #include <linux/config.h> 90 #include <linux/slab.h> 91 #include <linux/mm.h> 92 #include <linux/swap.h> 93 #include <linux/cache.h> 94 #include <linux/interrupt.h> 95 #include <linux/init.h> 96 #include <linux/compiler.h> 97 #include <linux/seq_file.h> 98 #include <linux/notifier.h> 99 #include <linux/kallsyms.h> 100 #include <linux/cpu.h> 101 #include <linux/sysctl.h> 102 #include <linux/module.h> 103 #include <linux/rcupdate.h> 104 #include <linux/string.h> 105 #include <linux/nodemask.h> 106 107 #include <asm/uaccess.h> 108 #include <asm/cacheflush.h> 109 #include <asm/tlbflush.h> 110 #include <asm/page.h> 111 112 /* 113 * DEBUG - 1 for kmem_cache_create() to honour; SLAB_DEBUG_INITIAL, 114 * SLAB_RED_ZONE & SLAB_POISON. 115 * 0 for faster, smaller code (especially in the critical paths). 116 * 117 * STATS - 1 to collect stats for /proc/slabinfo. 118 * 0 for faster, smaller code (especially in the critical paths). 119 * 120 * FORCED_DEBUG - 1 enables SLAB_RED_ZONE and SLAB_POISON (if possible) 121 */ 122 123 #ifdef CONFIG_DEBUG_SLAB 124 #define DEBUG 1 125 #define STATS 1 126 #define FORCED_DEBUG 1 127 #else 128 #define DEBUG 0 129 #define STATS 0 130 #define FORCED_DEBUG 0 131 #endif 132 133 /* Shouldn't this be in a header file somewhere? */ 134 #define BYTES_PER_WORD sizeof(void *) 135 136 #ifndef cache_line_size 137 #define cache_line_size() L1_CACHE_BYTES 138 #endif 139 140 #ifndef ARCH_KMALLOC_MINALIGN 141 /* 142 * Enforce a minimum alignment for the kmalloc caches. 143 * Usually, the kmalloc caches are cache_line_size() aligned, except when 144 * DEBUG and FORCED_DEBUG are enabled, then they are BYTES_PER_WORD aligned. 145 * Some archs want to perform DMA into kmalloc caches and need a guaranteed 146 * alignment larger than BYTES_PER_WORD. ARCH_KMALLOC_MINALIGN allows that. 147 * Note that this flag disables some debug features. 148 */ 149 #define ARCH_KMALLOC_MINALIGN 0 150 #endif 151 152 #ifndef ARCH_SLAB_MINALIGN 153 /* 154 * Enforce a minimum alignment for all caches. 155 * Intended for archs that get misalignment faults even for BYTES_PER_WORD 156 * aligned buffers. Includes ARCH_KMALLOC_MINALIGN. 157 * If possible: Do not enable this flag for CONFIG_DEBUG_SLAB, it disables 158 * some debug features. 159 */ 160 #define ARCH_SLAB_MINALIGN 0 161 #endif 162 163 #ifndef ARCH_KMALLOC_FLAGS 164 #define ARCH_KMALLOC_FLAGS SLAB_HWCACHE_ALIGN 165 #endif 166 167 /* Legal flag mask for kmem_cache_create(). */ 168 #if DEBUG 169 # define CREATE_MASK (SLAB_DEBUG_INITIAL | SLAB_RED_ZONE | \ 170 SLAB_POISON | SLAB_HWCACHE_ALIGN | \ 171 SLAB_NO_REAP | SLAB_CACHE_DMA | \ 172 SLAB_MUST_HWCACHE_ALIGN | SLAB_STORE_USER | \ 173 SLAB_RECLAIM_ACCOUNT | SLAB_PANIC | \ 174 SLAB_DESTROY_BY_RCU) 175 #else 176 # define CREATE_MASK (SLAB_HWCACHE_ALIGN | SLAB_NO_REAP | \ 177 SLAB_CACHE_DMA | SLAB_MUST_HWCACHE_ALIGN | \ 178 SLAB_RECLAIM_ACCOUNT | SLAB_PANIC | \ 179 SLAB_DESTROY_BY_RCU) 180 #endif 181 182 /* 183 * kmem_bufctl_t: 184 * 185 * Bufctl's are used for linking objs within a slab 186 * linked offsets. 187 * 188 * This implementation relies on "struct page" for locating the cache & 189 * slab an object belongs to. 
 * This allows the bufctl structure to be small (one int), but limits
 * the number of objects a slab (not a cache) can contain when off-slab
 * bufctls are used. The limit is the size of the largest general cache
 * that does not use off-slab slabs.
 * For 32bit archs with 4 kB pages, this is 56.
 * This is not serious, as it is only for large objects, when it is unwise
 * to have too many per slab.
 * Note: This limit can be raised by introducing a general cache whose size
 * is less than 512 (PAGE_SIZE<<3), but greater than 256.
 */

typedef unsigned int kmem_bufctl_t;
#define BUFCTL_END	(((kmem_bufctl_t)(~0U))-0)
#define BUFCTL_FREE	(((kmem_bufctl_t)(~0U))-1)
#define SLAB_LIMIT	(((kmem_bufctl_t)(~0U))-2)

/* Max number of objs-per-slab for caches which use off-slab slabs.
 * Needed to avoid a possible looping condition in cache_grow().
 */
static unsigned long offslab_limit;

/*
 * struct slab
 *
 * Manages the objs in a slab. Placed either at the beginning of mem allocated
 * for a slab, or allocated from a general cache.
 * Slabs are chained into three lists: fully used, partial, fully free slabs.
 */
struct slab {
	struct list_head list;
	unsigned long colouroff;
	void *s_mem;		/* including colour offset */
	unsigned int inuse;	/* num of objs active in slab */
	kmem_bufctl_t free;
	unsigned short nodeid;
};

/*
 * struct slab_rcu
 *
 * slab_destroy on a SLAB_DESTROY_BY_RCU cache uses this structure to
 * arrange for kmem_freepages to be called via RCU.  This is useful if
 * we need to approach a kernel structure obliquely, from its address
 * obtained without the usual locking.  We can lock the structure to
 * stabilize it and check it's still at the given address, only if we
 * can be sure that the memory has not been meanwhile reused for some
 * other kind of object (which our subsystem's lock might corrupt).
 *
 * rcu_read_lock before reading the address, then rcu_read_unlock after
 * taking the spinlock within the structure expected at that address.
 *
 * We assume struct slab_rcu can overlay struct slab when destroying.
 */
struct slab_rcu {
	struct rcu_head head;
	kmem_cache_t *cachep;
	void *addr;
};

/*
 * struct array_cache
 *
 * Purpose:
 * - LIFO ordering, to hand out cache-warm objects from _alloc
 * - reduce the number of linked list operations
 * - reduce spinlock operations
 *
 * The limit is stored in the per-cpu structure to reduce the data cache
 * footprint.
 *
 */
struct array_cache {
	unsigned int avail;
	unsigned int limit;
	unsigned int batchcount;
	unsigned int touched;
	spinlock_t lock;
	void *entry[0];		/*
				 * Must have this definition in here for the proper
				 * alignment of array_cache. Also simplifies accessing
				 * the entries.
				 * [0] is for gcc 2.95. It should really be [].
				 */
};

/* bootstrap: The caches do not work without cpuarrays anymore,
 * but the cpuarrays are allocated from the generic caches...
 */
#define BOOT_CPUCACHE_ENTRIES	1
struct arraycache_init {
	struct array_cache cache;
	void *entries[BOOT_CPUCACHE_ENTRIES];
};

/*
 * The slab lists for all objects.
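 *
 * Editorial sketch, not part of the original source: each node's kmem_list3
 * below holds the three slab lists described in the file header, and the
 * allocator consults them in a fixed order.  With a hypothetical helper:
 *
 *	static struct slab *pick_slab(struct kmem_list3 *l3)
 *	{
 *		if (!list_empty(&l3->slabs_partial))
 *			return list_entry(l3->slabs_partial.next,
 *					  struct slab, list);
 *		if (!list_empty(&l3->slabs_free))
 *			return list_entry(l3->slabs_free.next,
 *					  struct slab, list);
 *		return NULL;
 *	}
 *
 * A NULL return corresponds to the case where cache_grow() must allocate a
 * fresh slab; a slab moves between the lists as slabp->inuse climbs from 0
 * to cachep->num and back.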
286 */ 287 struct kmem_list3 { 288 struct list_head slabs_partial; /* partial list first, better asm code */ 289 struct list_head slabs_full; 290 struct list_head slabs_free; 291 unsigned long free_objects; 292 unsigned long next_reap; 293 int free_touched; 294 unsigned int free_limit; 295 spinlock_t list_lock; 296 struct array_cache *shared; /* shared per node */ 297 struct array_cache **alien; /* on other nodes */ 298 }; 299 300 /* 301 * Need this for bootstrapping a per node allocator. 302 */ 303 #define NUM_INIT_LISTS (2 * MAX_NUMNODES + 1) 304 struct kmem_list3 __initdata initkmem_list3[NUM_INIT_LISTS]; 305 #define CACHE_CACHE 0 306 #define SIZE_AC 1 307 #define SIZE_L3 (1 + MAX_NUMNODES) 308 309 /* 310 * This function must be completely optimized away if 311 * a constant is passed to it. Mostly the same as 312 * what is in linux/slab.h except it returns an 313 * index. 314 */ 315 static __always_inline int index_of(const size_t size) 316 { 317 if (__builtin_constant_p(size)) { 318 int i = 0; 319 320 #define CACHE(x) \ 321 if (size <=x) \ 322 return i; \ 323 else \ 324 i++; 325 #include "linux/kmalloc_sizes.h" 326 #undef CACHE 327 { 328 extern void __bad_size(void); 329 __bad_size(); 330 } 331 } else 332 BUG(); 333 return 0; 334 } 335 336 #define INDEX_AC index_of(sizeof(struct arraycache_init)) 337 #define INDEX_L3 index_of(sizeof(struct kmem_list3)) 338 339 static inline void kmem_list3_init(struct kmem_list3 *parent) 340 { 341 INIT_LIST_HEAD(&parent->slabs_full); 342 INIT_LIST_HEAD(&parent->slabs_partial); 343 INIT_LIST_HEAD(&parent->slabs_free); 344 parent->shared = NULL; 345 parent->alien = NULL; 346 spin_lock_init(&parent->list_lock); 347 parent->free_objects = 0; 348 parent->free_touched = 0; 349 } 350 351 #define MAKE_LIST(cachep, listp, slab, nodeid) \ 352 do { \ 353 INIT_LIST_HEAD(listp); \ 354 list_splice(&(cachep->nodelists[nodeid]->slab), listp); \ 355 } while (0) 356 357 #define MAKE_ALL_LISTS(cachep, ptr, nodeid) \ 358 do { \ 359 MAKE_LIST((cachep), (&(ptr)->slabs_full), slabs_full, nodeid); \ 360 MAKE_LIST((cachep), (&(ptr)->slabs_partial), slabs_partial, nodeid); \ 361 MAKE_LIST((cachep), (&(ptr)->slabs_free), slabs_free, nodeid); \ 362 } while (0) 363 364 /* 365 * kmem_cache_t 366 * 367 * manages a cache. 368 */ 369 370 struct kmem_cache { 371 /* 1) per-cpu data, touched during every alloc/free */ 372 struct array_cache *array[NR_CPUS]; 373 unsigned int batchcount; 374 unsigned int limit; 375 unsigned int shared; 376 unsigned int objsize; 377 /* 2) touched by every alloc & free from the backend */ 378 struct kmem_list3 *nodelists[MAX_NUMNODES]; 379 unsigned int flags; /* constant flags */ 380 unsigned int num; /* # of objs per slab */ 381 spinlock_t spinlock; 382 383 /* 3) cache_grow/shrink */ 384 /* order of pgs per slab (2^n) */ 385 unsigned int gfporder; 386 387 /* force GFP flags, e.g. 
GFP_DMA */ 388 gfp_t gfpflags; 389 390 size_t colour; /* cache colouring range */ 391 unsigned int colour_off; /* colour offset */ 392 unsigned int colour_next; /* cache colouring */ 393 kmem_cache_t *slabp_cache; 394 unsigned int slab_size; 395 unsigned int dflags; /* dynamic flags */ 396 397 /* constructor func */ 398 void (*ctor) (void *, kmem_cache_t *, unsigned long); 399 400 /* de-constructor func */ 401 void (*dtor) (void *, kmem_cache_t *, unsigned long); 402 403 /* 4) cache creation/removal */ 404 const char *name; 405 struct list_head next; 406 407 /* 5) statistics */ 408 #if STATS 409 unsigned long num_active; 410 unsigned long num_allocations; 411 unsigned long high_mark; 412 unsigned long grown; 413 unsigned long reaped; 414 unsigned long errors; 415 unsigned long max_freeable; 416 unsigned long node_allocs; 417 unsigned long node_frees; 418 atomic_t allochit; 419 atomic_t allocmiss; 420 atomic_t freehit; 421 atomic_t freemiss; 422 #endif 423 #if DEBUG 424 int dbghead; 425 int reallen; 426 #endif 427 }; 428 429 #define CFLGS_OFF_SLAB (0x80000000UL) 430 #define OFF_SLAB(x) ((x)->flags & CFLGS_OFF_SLAB) 431 432 #define BATCHREFILL_LIMIT 16 433 /* Optimization question: fewer reaps means less 434 * probability for unnessary cpucache drain/refill cycles. 435 * 436 * OTOH the cpuarrays can contain lots of objects, 437 * which could lock up otherwise freeable slabs. 438 */ 439 #define REAPTIMEOUT_CPUC (2*HZ) 440 #define REAPTIMEOUT_LIST3 (4*HZ) 441 442 #if STATS 443 #define STATS_INC_ACTIVE(x) ((x)->num_active++) 444 #define STATS_DEC_ACTIVE(x) ((x)->num_active--) 445 #define STATS_INC_ALLOCED(x) ((x)->num_allocations++) 446 #define STATS_INC_GROWN(x) ((x)->grown++) 447 #define STATS_INC_REAPED(x) ((x)->reaped++) 448 #define STATS_SET_HIGH(x) do { if ((x)->num_active > (x)->high_mark) \ 449 (x)->high_mark = (x)->num_active; \ 450 } while (0) 451 #define STATS_INC_ERR(x) ((x)->errors++) 452 #define STATS_INC_NODEALLOCS(x) ((x)->node_allocs++) 453 #define STATS_INC_NODEFREES(x) ((x)->node_frees++) 454 #define STATS_SET_FREEABLE(x, i) \ 455 do { if ((x)->max_freeable < i) \ 456 (x)->max_freeable = i; \ 457 } while (0) 458 459 #define STATS_INC_ALLOCHIT(x) atomic_inc(&(x)->allochit) 460 #define STATS_INC_ALLOCMISS(x) atomic_inc(&(x)->allocmiss) 461 #define STATS_INC_FREEHIT(x) atomic_inc(&(x)->freehit) 462 #define STATS_INC_FREEMISS(x) atomic_inc(&(x)->freemiss) 463 #else 464 #define STATS_INC_ACTIVE(x) do { } while (0) 465 #define STATS_DEC_ACTIVE(x) do { } while (0) 466 #define STATS_INC_ALLOCED(x) do { } while (0) 467 #define STATS_INC_GROWN(x) do { } while (0) 468 #define STATS_INC_REAPED(x) do { } while (0) 469 #define STATS_SET_HIGH(x) do { } while (0) 470 #define STATS_INC_ERR(x) do { } while (0) 471 #define STATS_INC_NODEALLOCS(x) do { } while (0) 472 #define STATS_INC_NODEFREES(x) do { } while (0) 473 #define STATS_SET_FREEABLE(x, i) \ 474 do { } while (0) 475 476 #define STATS_INC_ALLOCHIT(x) do { } while (0) 477 #define STATS_INC_ALLOCMISS(x) do { } while (0) 478 #define STATS_INC_FREEHIT(x) do { } while (0) 479 #define STATS_INC_FREEMISS(x) do { } while (0) 480 #endif 481 482 #if DEBUG 483 /* Magic nums for obj red zoning. 484 * Placed in the first word before and the first word after an obj. 
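 *
 * Editorial illustration, not in the original: with SLAB_RED_ZONE the guard
 * words bracketing an object hold RED_INACTIVE while it sits on a freelist
 * and RED_ACTIVE while it is allocated.  The debug paths perform roughly
 *
 *	if (*dbg_redzone1(cachep, objp) != RED_ACTIVE ||
 *	    *dbg_redzone2(cachep, objp) != RED_ACTIVE)
 *		slab_error(cachep, "guard word overwritten");
 *
 * at free time: a stale RED_INACTIVE suggests a double free, anything else
 * means the bytes next to the object were overwritten.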
485 */ 486 #define RED_INACTIVE 0x5A2CF071UL /* when obj is inactive */ 487 #define RED_ACTIVE 0x170FC2A5UL /* when obj is active */ 488 489 /* ...and for poisoning */ 490 #define POISON_INUSE 0x5a /* for use-uninitialised poisoning */ 491 #define POISON_FREE 0x6b /* for use-after-free poisoning */ 492 #define POISON_END 0xa5 /* end-byte of poisoning */ 493 494 /* memory layout of objects: 495 * 0 : objp 496 * 0 .. cachep->dbghead - BYTES_PER_WORD - 1: padding. This ensures that 497 * the end of an object is aligned with the end of the real 498 * allocation. Catches writes behind the end of the allocation. 499 * cachep->dbghead - BYTES_PER_WORD .. cachep->dbghead - 1: 500 * redzone word. 501 * cachep->dbghead: The real object. 502 * cachep->objsize - 2* BYTES_PER_WORD: redzone word [BYTES_PER_WORD long] 503 * cachep->objsize - 1* BYTES_PER_WORD: last caller address [BYTES_PER_WORD long] 504 */ 505 static int obj_dbghead(kmem_cache_t *cachep) 506 { 507 return cachep->dbghead; 508 } 509 510 static int obj_reallen(kmem_cache_t *cachep) 511 { 512 return cachep->reallen; 513 } 514 515 static unsigned long *dbg_redzone1(kmem_cache_t *cachep, void *objp) 516 { 517 BUG_ON(!(cachep->flags & SLAB_RED_ZONE)); 518 return (unsigned long*) (objp+obj_dbghead(cachep)-BYTES_PER_WORD); 519 } 520 521 static unsigned long *dbg_redzone2(kmem_cache_t *cachep, void *objp) 522 { 523 BUG_ON(!(cachep->flags & SLAB_RED_ZONE)); 524 if (cachep->flags & SLAB_STORE_USER) 525 return (unsigned long *)(objp + cachep->objsize - 526 2 * BYTES_PER_WORD); 527 return (unsigned long *)(objp + cachep->objsize - BYTES_PER_WORD); 528 } 529 530 static void **dbg_userword(kmem_cache_t *cachep, void *objp) 531 { 532 BUG_ON(!(cachep->flags & SLAB_STORE_USER)); 533 return (void **)(objp + cachep->objsize - BYTES_PER_WORD); 534 } 535 536 #else 537 538 #define obj_dbghead(x) 0 539 #define obj_reallen(cachep) (cachep->objsize) 540 #define dbg_redzone1(cachep, objp) ({BUG(); (unsigned long *)NULL;}) 541 #define dbg_redzone2(cachep, objp) ({BUG(); (unsigned long *)NULL;}) 542 #define dbg_userword(cachep, objp) ({BUG(); (void **)NULL;}) 543 544 #endif 545 546 /* 547 * Maximum size of an obj (in 2^order pages) 548 * and absolute limit for the gfp order. 549 */ 550 #if defined(CONFIG_LARGE_ALLOCS) 551 #define MAX_OBJ_ORDER 13 /* up to 32Mb */ 552 #define MAX_GFP_ORDER 13 /* up to 32Mb */ 553 #elif defined(CONFIG_MMU) 554 #define MAX_OBJ_ORDER 5 /* 32 pages */ 555 #define MAX_GFP_ORDER 5 /* 32 pages */ 556 #else 557 #define MAX_OBJ_ORDER 8 /* up to 1Mb */ 558 #define MAX_GFP_ORDER 8 /* up to 1Mb */ 559 #endif 560 561 /* 562 * Do not go above this order unless 0 objects fit into the slab. 563 */ 564 #define BREAK_GFP_ORDER_HI 1 565 #define BREAK_GFP_ORDER_LO 0 566 static int slab_break_gfp_order = BREAK_GFP_ORDER_LO; 567 568 /* Functions for storing/retrieving the cachep and or slab from the 569 * global 'mem_map'. These are used to find the slab an obj belongs to. 570 * With kfree(), these are used to find the cache which an obj belongs to. 
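 *
 * Editorial sketch, not in the original: this is how an object pointer is
 * mapped back to its owners, e.g. on the kfree() path:
 *
 *	struct page *page = virt_to_page(objp);
 *	kmem_cache_t *cachep = page_get_cache(page);
 *	struct slab *slabp = page_get_slab(page);
 *
 * kmem_getpages() marks every page of a slab with PageSlab, and the
 * slab-growing path records the owning cache and slab in each page via
 * page_set_cache()/page_set_slab(), so the lookup works for any address
 * inside the slab.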
571 */ 572 static inline void page_set_cache(struct page *page, struct kmem_cache *cache) 573 { 574 page->lru.next = (struct list_head *)cache; 575 } 576 577 static inline struct kmem_cache *page_get_cache(struct page *page) 578 { 579 return (struct kmem_cache *)page->lru.next; 580 } 581 582 static inline void page_set_slab(struct page *page, struct slab *slab) 583 { 584 page->lru.prev = (struct list_head *)slab; 585 } 586 587 static inline struct slab *page_get_slab(struct page *page) 588 { 589 return (struct slab *)page->lru.prev; 590 } 591 592 /* These are the default caches for kmalloc. Custom caches can have other sizes. */ 593 struct cache_sizes malloc_sizes[] = { 594 #define CACHE(x) { .cs_size = (x) }, 595 #include <linux/kmalloc_sizes.h> 596 CACHE(ULONG_MAX) 597 #undef CACHE 598 }; 599 EXPORT_SYMBOL(malloc_sizes); 600 601 /* Must match cache_sizes above. Out of line to keep cache footprint low. */ 602 struct cache_names { 603 char *name; 604 char *name_dma; 605 }; 606 607 static struct cache_names __initdata cache_names[] = { 608 #define CACHE(x) { .name = "size-" #x, .name_dma = "size-" #x "(DMA)" }, 609 #include <linux/kmalloc_sizes.h> 610 {NULL,} 611 #undef CACHE 612 }; 613 614 static struct arraycache_init initarray_cache __initdata = 615 { {0, BOOT_CPUCACHE_ENTRIES, 1, 0} }; 616 static struct arraycache_init initarray_generic = 617 { {0, BOOT_CPUCACHE_ENTRIES, 1, 0} }; 618 619 /* internal cache of cache description objs */ 620 static kmem_cache_t cache_cache = { 621 .batchcount = 1, 622 .limit = BOOT_CPUCACHE_ENTRIES, 623 .shared = 1, 624 .objsize = sizeof(kmem_cache_t), 625 .flags = SLAB_NO_REAP, 626 .spinlock = SPIN_LOCK_UNLOCKED, 627 .name = "kmem_cache", 628 #if DEBUG 629 .reallen = sizeof(kmem_cache_t), 630 #endif 631 }; 632 633 /* Guard access to the cache-chain. */ 634 static struct semaphore cache_chain_sem; 635 static struct list_head cache_chain; 636 637 /* 638 * vm_enough_memory() looks at this to determine how many 639 * slab-allocated pages are possibly freeable under pressure 640 * 641 * SLAB_RECLAIM_ACCOUNT turns this on per-slab 642 */ 643 atomic_t slab_reclaim_pages; 644 645 /* 646 * chicken and egg problem: delay the per-cpu array allocation 647 * until the general caches are up. 648 */ 649 static enum { 650 NONE, 651 PARTIAL_AC, 652 PARTIAL_L3, 653 FULL 654 } g_cpucache_up; 655 656 static DEFINE_PER_CPU(struct work_struct, reap_work); 657 658 static void free_block(kmem_cache_t *cachep, void **objpp, int len, int node); 659 static void enable_cpucache(kmem_cache_t *cachep); 660 static void cache_reap(void *unused); 661 static int __node_shrink(kmem_cache_t *cachep, int node); 662 663 static inline struct array_cache *ac_data(kmem_cache_t *cachep) 664 { 665 return cachep->array[smp_processor_id()]; 666 } 667 668 static inline kmem_cache_t *__find_general_cachep(size_t size, gfp_t gfpflags) 669 { 670 struct cache_sizes *csizep = malloc_sizes; 671 672 #if DEBUG 673 /* This happens if someone tries to call 674 * kmem_cache_create(), or __kmalloc(), before 675 * the generic caches are initialized. 676 */ 677 BUG_ON(malloc_sizes[INDEX_AC].cs_cachep == NULL); 678 #endif 679 while (size > csizep->cs_size) 680 csizep++; 681 682 /* 683 * Really subtle: The last entry with cs->cs_size==ULONG_MAX 684 * has cs_{dma,}cachep==NULL. Thus no special case 685 * for large kmalloc calls required. 
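	 *
	 * Editorial example, not in the original: the walk above stops at
	 * the first entry with cs_size >= size, so on a typical config
	 *
	 *	void *p = kmalloc(100, GFP_KERNEL);
	 *
	 * is served from the "size-128" general cache (or "size-128(DMA)"
	 * if GFP_DMA had been passed).  A request larger than every real
	 * entry lands on the ULONG_MAX sentinel and gets a NULL cachep,
	 * which the kmalloc callers treat as an ordinary failure.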
	 */
	if (unlikely(gfpflags & GFP_DMA))
		return csizep->cs_dmacachep;
	return csizep->cs_cachep;
}

kmem_cache_t *kmem_find_general_cachep(size_t size, gfp_t gfpflags)
{
	return __find_general_cachep(size, gfpflags);
}
EXPORT_SYMBOL(kmem_find_general_cachep);

/* Calculate the number of objects, wastage, and bytes left over for a
 * given slab size. */
static void cache_estimate(unsigned long gfporder, size_t size, size_t align,
			   int flags, size_t *left_over, unsigned int *num)
{
	int i;
	size_t wastage = PAGE_SIZE << gfporder;
	size_t extra = 0;
	size_t base = 0;

	if (!(flags & CFLGS_OFF_SLAB)) {
		base = sizeof(struct slab);
		extra = sizeof(kmem_bufctl_t);
	}
	i = 0;
	while (i * size + ALIGN(base + i * extra, align) <= wastage)
		i++;
	if (i > 0)
		i--;

	if (i > SLAB_LIMIT)
		i = SLAB_LIMIT;

	*num = i;
	wastage -= i * size;
	wastage -= ALIGN(base + i * extra, align);
	*left_over = wastage;
}

#define slab_error(cachep, msg) __slab_error(__FUNCTION__, cachep, msg)

static void __slab_error(const char *function, kmem_cache_t *cachep, char *msg)
{
	printk(KERN_ERR "slab error in %s(): cache `%s': %s\n",
	       function, cachep->name, msg);
	dump_stack();
}

/*
 * Initiate the reap timer running on the target CPU.  We run at around
 * 1 to 2Hz via the workqueue/eventd.
 * Add the CPU number into the expiration time to minimize the possibility of
 * the CPUs getting into lockstep and contending for the global cache chain
 * lock.
 */
static void __devinit start_cpu_timer(int cpu)
{
	struct work_struct *reap_work = &per_cpu(reap_work, cpu);

	/*
	 * When this gets called from do_initcalls via cpucache_init(),
	 * init_workqueues() has already run, so keventd will be set up
	 * at that time.
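	 *
	 * Editorial note, not in the original: the HZ + 3 * cpu delay used
	 * below is the staggering described in the comment above this
	 * function; with HZ=1000, cpu 0 first reaps at jiffies+1000, cpu 1
	 * at jiffies+1003, cpu 2 at jiffies+1006, and so on, so the per-cpu
	 * reap work never fires on all CPUs in the same tick.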
750 */ 751 if (keventd_up() && reap_work->func == NULL) { 752 INIT_WORK(reap_work, cache_reap, NULL); 753 schedule_delayed_work_on(cpu, reap_work, HZ + 3 * cpu); 754 } 755 } 756 757 static struct array_cache *alloc_arraycache(int node, int entries, 758 int batchcount) 759 { 760 int memsize = sizeof(void *) * entries + sizeof(struct array_cache); 761 struct array_cache *nc = NULL; 762 763 nc = kmalloc_node(memsize, GFP_KERNEL, node); 764 if (nc) { 765 nc->avail = 0; 766 nc->limit = entries; 767 nc->batchcount = batchcount; 768 nc->touched = 0; 769 spin_lock_init(&nc->lock); 770 } 771 return nc; 772 } 773 774 #ifdef CONFIG_NUMA 775 static inline struct array_cache **alloc_alien_cache(int node, int limit) 776 { 777 struct array_cache **ac_ptr; 778 int memsize = sizeof(void *) * MAX_NUMNODES; 779 int i; 780 781 if (limit > 1) 782 limit = 12; 783 ac_ptr = kmalloc_node(memsize, GFP_KERNEL, node); 784 if (ac_ptr) { 785 for_each_node(i) { 786 if (i == node || !node_online(i)) { 787 ac_ptr[i] = NULL; 788 continue; 789 } 790 ac_ptr[i] = alloc_arraycache(node, limit, 0xbaadf00d); 791 if (!ac_ptr[i]) { 792 for (i--; i <= 0; i--) 793 kfree(ac_ptr[i]); 794 kfree(ac_ptr); 795 return NULL; 796 } 797 } 798 } 799 return ac_ptr; 800 } 801 802 static inline void free_alien_cache(struct array_cache **ac_ptr) 803 { 804 int i; 805 806 if (!ac_ptr) 807 return; 808 809 for_each_node(i) 810 kfree(ac_ptr[i]); 811 812 kfree(ac_ptr); 813 } 814 815 static inline void __drain_alien_cache(kmem_cache_t *cachep, 816 struct array_cache *ac, int node) 817 { 818 struct kmem_list3 *rl3 = cachep->nodelists[node]; 819 820 if (ac->avail) { 821 spin_lock(&rl3->list_lock); 822 free_block(cachep, ac->entry, ac->avail, node); 823 ac->avail = 0; 824 spin_unlock(&rl3->list_lock); 825 } 826 } 827 828 static void drain_alien_cache(kmem_cache_t *cachep, struct kmem_list3 *l3) 829 { 830 int i = 0; 831 struct array_cache *ac; 832 unsigned long flags; 833 834 for_each_online_node(i) { 835 ac = l3->alien[i]; 836 if (ac) { 837 spin_lock_irqsave(&ac->lock, flags); 838 __drain_alien_cache(cachep, ac, i); 839 spin_unlock_irqrestore(&ac->lock, flags); 840 } 841 } 842 } 843 #else 844 #define alloc_alien_cache(node, limit) do { } while (0) 845 #define free_alien_cache(ac_ptr) do { } while (0) 846 #define drain_alien_cache(cachep, l3) do { } while (0) 847 #endif 848 849 static int __devinit cpuup_callback(struct notifier_block *nfb, 850 unsigned long action, void *hcpu) 851 { 852 long cpu = (long)hcpu; 853 kmem_cache_t *cachep; 854 struct kmem_list3 *l3 = NULL; 855 int node = cpu_to_node(cpu); 856 int memsize = sizeof(struct kmem_list3); 857 858 switch (action) { 859 case CPU_UP_PREPARE: 860 down(&cache_chain_sem); 861 /* we need to do this right in the beginning since 862 * alloc_arraycache's are going to use this list. 863 * kmalloc_node allows us to add the slab to the right 864 * kmem_list3 and not this cpu's kmem_list3 865 */ 866 867 list_for_each_entry(cachep, &cache_chain, next) { 868 /* setup the size64 kmemlist for cpu before we can 869 * begin anything. 
Make sure some other cpu on this 870 * node has not already allocated this 871 */ 872 if (!cachep->nodelists[node]) { 873 if (!(l3 = kmalloc_node(memsize, 874 GFP_KERNEL, node))) 875 goto bad; 876 kmem_list3_init(l3); 877 l3->next_reap = jiffies + REAPTIMEOUT_LIST3 + 878 ((unsigned long)cachep) % REAPTIMEOUT_LIST3; 879 880 cachep->nodelists[node] = l3; 881 } 882 883 spin_lock_irq(&cachep->nodelists[node]->list_lock); 884 cachep->nodelists[node]->free_limit = 885 (1 + nr_cpus_node(node)) * 886 cachep->batchcount + cachep->num; 887 spin_unlock_irq(&cachep->nodelists[node]->list_lock); 888 } 889 890 /* Now we can go ahead with allocating the shared array's 891 & array cache's */ 892 list_for_each_entry(cachep, &cache_chain, next) { 893 struct array_cache *nc; 894 895 nc = alloc_arraycache(node, cachep->limit, 896 cachep->batchcount); 897 if (!nc) 898 goto bad; 899 cachep->array[cpu] = nc; 900 901 l3 = cachep->nodelists[node]; 902 BUG_ON(!l3); 903 if (!l3->shared) { 904 if (!(nc = alloc_arraycache(node, 905 cachep->shared * 906 cachep->batchcount, 907 0xbaadf00d))) 908 goto bad; 909 910 /* we are serialised from CPU_DEAD or 911 CPU_UP_CANCELLED by the cpucontrol lock */ 912 l3->shared = nc; 913 } 914 } 915 up(&cache_chain_sem); 916 break; 917 case CPU_ONLINE: 918 start_cpu_timer(cpu); 919 break; 920 #ifdef CONFIG_HOTPLUG_CPU 921 case CPU_DEAD: 922 /* fall thru */ 923 case CPU_UP_CANCELED: 924 down(&cache_chain_sem); 925 926 list_for_each_entry(cachep, &cache_chain, next) { 927 struct array_cache *nc; 928 cpumask_t mask; 929 930 mask = node_to_cpumask(node); 931 spin_lock_irq(&cachep->spinlock); 932 /* cpu is dead; no one can alloc from it. */ 933 nc = cachep->array[cpu]; 934 cachep->array[cpu] = NULL; 935 l3 = cachep->nodelists[node]; 936 937 if (!l3) 938 goto unlock_cache; 939 940 spin_lock(&l3->list_lock); 941 942 /* Free limit for this kmem_list3 */ 943 l3->free_limit -= cachep->batchcount; 944 if (nc) 945 free_block(cachep, nc->entry, nc->avail, node); 946 947 if (!cpus_empty(mask)) { 948 spin_unlock(&l3->list_lock); 949 goto unlock_cache; 950 } 951 952 if (l3->shared) { 953 free_block(cachep, l3->shared->entry, 954 l3->shared->avail, node); 955 kfree(l3->shared); 956 l3->shared = NULL; 957 } 958 if (l3->alien) { 959 drain_alien_cache(cachep, l3); 960 free_alien_cache(l3->alien); 961 l3->alien = NULL; 962 } 963 964 /* free slabs belonging to this node */ 965 if (__node_shrink(cachep, node)) { 966 cachep->nodelists[node] = NULL; 967 spin_unlock(&l3->list_lock); 968 kfree(l3); 969 } else { 970 spin_unlock(&l3->list_lock); 971 } 972 unlock_cache: 973 spin_unlock_irq(&cachep->spinlock); 974 kfree(nc); 975 } 976 up(&cache_chain_sem); 977 break; 978 #endif 979 } 980 return NOTIFY_OK; 981 bad: 982 up(&cache_chain_sem); 983 return NOTIFY_BAD; 984 } 985 986 static struct notifier_block cpucache_notifier = { &cpuup_callback, NULL, 0 }; 987 988 /* 989 * swap the static kmem_list3 with kmalloced memory 990 */ 991 static void init_list(kmem_cache_t *cachep, struct kmem_list3 *list, int nodeid) 992 { 993 struct kmem_list3 *ptr; 994 995 BUG_ON(cachep->nodelists[nodeid] != list); 996 ptr = kmalloc_node(sizeof(struct kmem_list3), GFP_KERNEL, nodeid); 997 BUG_ON(!ptr); 998 999 local_irq_disable(); 1000 memcpy(ptr, list, sizeof(struct kmem_list3)); 1001 MAKE_ALL_LISTS(cachep, ptr, nodeid); 1002 cachep->nodelists[nodeid] = ptr; 1003 local_irq_enable(); 1004 } 1005 1006 /* Initialisation. 1007 * Called after the gfp() functions have been enabled, and before smp_init(). 
1008 */ 1009 void __init kmem_cache_init(void) 1010 { 1011 size_t left_over; 1012 struct cache_sizes *sizes; 1013 struct cache_names *names; 1014 int i; 1015 1016 for (i = 0; i < NUM_INIT_LISTS; i++) { 1017 kmem_list3_init(&initkmem_list3[i]); 1018 if (i < MAX_NUMNODES) 1019 cache_cache.nodelists[i] = NULL; 1020 } 1021 1022 /* 1023 * Fragmentation resistance on low memory - only use bigger 1024 * page orders on machines with more than 32MB of memory. 1025 */ 1026 if (num_physpages > (32 << 20) >> PAGE_SHIFT) 1027 slab_break_gfp_order = BREAK_GFP_ORDER_HI; 1028 1029 /* Bootstrap is tricky, because several objects are allocated 1030 * from caches that do not exist yet: 1031 * 1) initialize the cache_cache cache: it contains the kmem_cache_t 1032 * structures of all caches, except cache_cache itself: cache_cache 1033 * is statically allocated. 1034 * Initially an __init data area is used for the head array and the 1035 * kmem_list3 structures, it's replaced with a kmalloc allocated 1036 * array at the end of the bootstrap. 1037 * 2) Create the first kmalloc cache. 1038 * The kmem_cache_t for the new cache is allocated normally. 1039 * An __init data area is used for the head array. 1040 * 3) Create the remaining kmalloc caches, with minimally sized 1041 * head arrays. 1042 * 4) Replace the __init data head arrays for cache_cache and the first 1043 * kmalloc cache with kmalloc allocated arrays. 1044 * 5) Replace the __init data for kmem_list3 for cache_cache and 1045 * the other cache's with kmalloc allocated memory. 1046 * 6) Resize the head arrays of the kmalloc caches to their final sizes. 1047 */ 1048 1049 /* 1) create the cache_cache */ 1050 init_MUTEX(&cache_chain_sem); 1051 INIT_LIST_HEAD(&cache_chain); 1052 list_add(&cache_cache.next, &cache_chain); 1053 cache_cache.colour_off = cache_line_size(); 1054 cache_cache.array[smp_processor_id()] = &initarray_cache.cache; 1055 cache_cache.nodelists[numa_node_id()] = &initkmem_list3[CACHE_CACHE]; 1056 1057 cache_cache.objsize = ALIGN(cache_cache.objsize, cache_line_size()); 1058 1059 cache_estimate(0, cache_cache.objsize, cache_line_size(), 0, 1060 &left_over, &cache_cache.num); 1061 if (!cache_cache.num) 1062 BUG(); 1063 1064 cache_cache.colour = left_over / cache_cache.colour_off; 1065 cache_cache.colour_next = 0; 1066 cache_cache.slab_size = ALIGN(cache_cache.num * sizeof(kmem_bufctl_t) + 1067 sizeof(struct slab), cache_line_size()); 1068 1069 /* 2+3) create the kmalloc caches */ 1070 sizes = malloc_sizes; 1071 names = cache_names; 1072 1073 /* Initialize the caches that provide memory for the array cache 1074 * and the kmem_list3 structures first. 1075 * Without this, further allocations will bug 1076 */ 1077 1078 sizes[INDEX_AC].cs_cachep = kmem_cache_create(names[INDEX_AC].name, 1079 sizes[INDEX_AC].cs_size, 1080 ARCH_KMALLOC_MINALIGN, 1081 (ARCH_KMALLOC_FLAGS | 1082 SLAB_PANIC), NULL, NULL); 1083 1084 if (INDEX_AC != INDEX_L3) 1085 sizes[INDEX_L3].cs_cachep = 1086 kmem_cache_create(names[INDEX_L3].name, 1087 sizes[INDEX_L3].cs_size, 1088 ARCH_KMALLOC_MINALIGN, 1089 (ARCH_KMALLOC_FLAGS | SLAB_PANIC), NULL, 1090 NULL); 1091 1092 while (sizes->cs_size != ULONG_MAX) { 1093 /* 1094 * For performance, all the general caches are L1 aligned. 1095 * This should be particularly beneficial on SMP boxes, as it 1096 * eliminates "false sharing". 1097 * Note for systems short on memory removing the alignment will 1098 * allow tighter packing of the smaller caches. 
1099 */ 1100 if (!sizes->cs_cachep) 1101 sizes->cs_cachep = kmem_cache_create(names->name, 1102 sizes->cs_size, 1103 ARCH_KMALLOC_MINALIGN, 1104 (ARCH_KMALLOC_FLAGS 1105 | SLAB_PANIC), 1106 NULL, NULL); 1107 1108 /* Inc off-slab bufctl limit until the ceiling is hit. */ 1109 if (!(OFF_SLAB(sizes->cs_cachep))) { 1110 offslab_limit = sizes->cs_size - sizeof(struct slab); 1111 offslab_limit /= sizeof(kmem_bufctl_t); 1112 } 1113 1114 sizes->cs_dmacachep = kmem_cache_create(names->name_dma, 1115 sizes->cs_size, 1116 ARCH_KMALLOC_MINALIGN, 1117 (ARCH_KMALLOC_FLAGS | 1118 SLAB_CACHE_DMA | 1119 SLAB_PANIC), NULL, 1120 NULL); 1121 1122 sizes++; 1123 names++; 1124 } 1125 /* 4) Replace the bootstrap head arrays */ 1126 { 1127 void *ptr; 1128 1129 ptr = kmalloc(sizeof(struct arraycache_init), GFP_KERNEL); 1130 1131 local_irq_disable(); 1132 BUG_ON(ac_data(&cache_cache) != &initarray_cache.cache); 1133 memcpy(ptr, ac_data(&cache_cache), 1134 sizeof(struct arraycache_init)); 1135 cache_cache.array[smp_processor_id()] = ptr; 1136 local_irq_enable(); 1137 1138 ptr = kmalloc(sizeof(struct arraycache_init), GFP_KERNEL); 1139 1140 local_irq_disable(); 1141 BUG_ON(ac_data(malloc_sizes[INDEX_AC].cs_cachep) 1142 != &initarray_generic.cache); 1143 memcpy(ptr, ac_data(malloc_sizes[INDEX_AC].cs_cachep), 1144 sizeof(struct arraycache_init)); 1145 malloc_sizes[INDEX_AC].cs_cachep->array[smp_processor_id()] = 1146 ptr; 1147 local_irq_enable(); 1148 } 1149 /* 5) Replace the bootstrap kmem_list3's */ 1150 { 1151 int node; 1152 /* Replace the static kmem_list3 structures for the boot cpu */ 1153 init_list(&cache_cache, &initkmem_list3[CACHE_CACHE], 1154 numa_node_id()); 1155 1156 for_each_online_node(node) { 1157 init_list(malloc_sizes[INDEX_AC].cs_cachep, 1158 &initkmem_list3[SIZE_AC + node], node); 1159 1160 if (INDEX_AC != INDEX_L3) { 1161 init_list(malloc_sizes[INDEX_L3].cs_cachep, 1162 &initkmem_list3[SIZE_L3 + node], 1163 node); 1164 } 1165 } 1166 } 1167 1168 /* 6) resize the head arrays to their final sizes */ 1169 { 1170 kmem_cache_t *cachep; 1171 down(&cache_chain_sem); 1172 list_for_each_entry(cachep, &cache_chain, next) 1173 enable_cpucache(cachep); 1174 up(&cache_chain_sem); 1175 } 1176 1177 /* Done! */ 1178 g_cpucache_up = FULL; 1179 1180 /* Register a cpu startup notifier callback 1181 * that initializes ac_data for all new cpus 1182 */ 1183 register_cpu_notifier(&cpucache_notifier); 1184 1185 /* The reap timers are started later, with a module init call: 1186 * That part of the kernel is not yet operational. 1187 */ 1188 } 1189 1190 static int __init cpucache_init(void) 1191 { 1192 int cpu; 1193 1194 /* 1195 * Register the timers that return unneeded 1196 * pages to gfp. 1197 */ 1198 for_each_online_cpu(cpu) 1199 start_cpu_timer(cpu); 1200 1201 return 0; 1202 } 1203 1204 __initcall(cpucache_init); 1205 1206 /* 1207 * Interface to system's page allocator. No need to hold the cache-lock. 1208 * 1209 * If we requested dmaable memory, we will get it. Even if we 1210 * did not request dmaable memory, we might get it, but that 1211 * would be relatively rare and ignorable. 
1212 */ 1213 static void *kmem_getpages(kmem_cache_t *cachep, gfp_t flags, int nodeid) 1214 { 1215 struct page *page; 1216 void *addr; 1217 int i; 1218 1219 flags |= cachep->gfpflags; 1220 page = alloc_pages_node(nodeid, flags, cachep->gfporder); 1221 if (!page) 1222 return NULL; 1223 addr = page_address(page); 1224 1225 i = (1 << cachep->gfporder); 1226 if (cachep->flags & SLAB_RECLAIM_ACCOUNT) 1227 atomic_add(i, &slab_reclaim_pages); 1228 add_page_state(nr_slab, i); 1229 while (i--) { 1230 SetPageSlab(page); 1231 page++; 1232 } 1233 return addr; 1234 } 1235 1236 /* 1237 * Interface to system's page release. 1238 */ 1239 static void kmem_freepages(kmem_cache_t *cachep, void *addr) 1240 { 1241 unsigned long i = (1 << cachep->gfporder); 1242 struct page *page = virt_to_page(addr); 1243 const unsigned long nr_freed = i; 1244 1245 while (i--) { 1246 if (!TestClearPageSlab(page)) 1247 BUG(); 1248 page++; 1249 } 1250 sub_page_state(nr_slab, nr_freed); 1251 if (current->reclaim_state) 1252 current->reclaim_state->reclaimed_slab += nr_freed; 1253 free_pages((unsigned long)addr, cachep->gfporder); 1254 if (cachep->flags & SLAB_RECLAIM_ACCOUNT) 1255 atomic_sub(1 << cachep->gfporder, &slab_reclaim_pages); 1256 } 1257 1258 static void kmem_rcu_free(struct rcu_head *head) 1259 { 1260 struct slab_rcu *slab_rcu = (struct slab_rcu *)head; 1261 kmem_cache_t *cachep = slab_rcu->cachep; 1262 1263 kmem_freepages(cachep, slab_rcu->addr); 1264 if (OFF_SLAB(cachep)) 1265 kmem_cache_free(cachep->slabp_cache, slab_rcu); 1266 } 1267 1268 #if DEBUG 1269 1270 #ifdef CONFIG_DEBUG_PAGEALLOC 1271 static void store_stackinfo(kmem_cache_t *cachep, unsigned long *addr, 1272 unsigned long caller) 1273 { 1274 int size = obj_reallen(cachep); 1275 1276 addr = (unsigned long *)&((char *)addr)[obj_dbghead(cachep)]; 1277 1278 if (size < 5 * sizeof(unsigned long)) 1279 return; 1280 1281 *addr++ = 0x12345678; 1282 *addr++ = caller; 1283 *addr++ = smp_processor_id(); 1284 size -= 3 * sizeof(unsigned long); 1285 { 1286 unsigned long *sptr = &caller; 1287 unsigned long svalue; 1288 1289 while (!kstack_end(sptr)) { 1290 svalue = *sptr++; 1291 if (kernel_text_address(svalue)) { 1292 *addr++ = svalue; 1293 size -= sizeof(unsigned long); 1294 if (size <= sizeof(unsigned long)) 1295 break; 1296 } 1297 } 1298 1299 } 1300 *addr++ = 0x87654321; 1301 } 1302 #endif 1303 1304 static void poison_obj(kmem_cache_t *cachep, void *addr, unsigned char val) 1305 { 1306 int size = obj_reallen(cachep); 1307 addr = &((char *)addr)[obj_dbghead(cachep)]; 1308 1309 memset(addr, val, size); 1310 *(unsigned char *)(addr + size - 1) = POISON_END; 1311 } 1312 1313 static void dump_line(char *data, int offset, int limit) 1314 { 1315 int i; 1316 printk(KERN_ERR "%03x:", offset); 1317 for (i = 0; i < limit; i++) { 1318 printk(" %02x", (unsigned char)data[offset + i]); 1319 } 1320 printk("\n"); 1321 } 1322 #endif 1323 1324 #if DEBUG 1325 1326 static void print_objinfo(kmem_cache_t *cachep, void *objp, int lines) 1327 { 1328 int i, size; 1329 char *realobj; 1330 1331 if (cachep->flags & SLAB_RED_ZONE) { 1332 printk(KERN_ERR "Redzone: 0x%lx/0x%lx.\n", 1333 *dbg_redzone1(cachep, objp), 1334 *dbg_redzone2(cachep, objp)); 1335 } 1336 1337 if (cachep->flags & SLAB_STORE_USER) { 1338 printk(KERN_ERR "Last user: [<%p>]", 1339 *dbg_userword(cachep, objp)); 1340 print_symbol("(%s)", 1341 (unsigned long)*dbg_userword(cachep, objp)); 1342 printk("\n"); 1343 } 1344 realobj = (char *)objp + obj_dbghead(cachep); 1345 size = obj_reallen(cachep); 1346 for (i = 0; i < size && lines; 
i += 16, lines--) { 1347 int limit; 1348 limit = 16; 1349 if (i + limit > size) 1350 limit = size - i; 1351 dump_line(realobj, i, limit); 1352 } 1353 } 1354 1355 static void check_poison_obj(kmem_cache_t *cachep, void *objp) 1356 { 1357 char *realobj; 1358 int size, i; 1359 int lines = 0; 1360 1361 realobj = (char *)objp + obj_dbghead(cachep); 1362 size = obj_reallen(cachep); 1363 1364 for (i = 0; i < size; i++) { 1365 char exp = POISON_FREE; 1366 if (i == size - 1) 1367 exp = POISON_END; 1368 if (realobj[i] != exp) { 1369 int limit; 1370 /* Mismatch ! */ 1371 /* Print header */ 1372 if (lines == 0) { 1373 printk(KERN_ERR 1374 "Slab corruption: start=%p, len=%d\n", 1375 realobj, size); 1376 print_objinfo(cachep, objp, 0); 1377 } 1378 /* Hexdump the affected line */ 1379 i = (i / 16) * 16; 1380 limit = 16; 1381 if (i + limit > size) 1382 limit = size - i; 1383 dump_line(realobj, i, limit); 1384 i += 16; 1385 lines++; 1386 /* Limit to 5 lines */ 1387 if (lines > 5) 1388 break; 1389 } 1390 } 1391 if (lines != 0) { 1392 /* Print some data about the neighboring objects, if they 1393 * exist: 1394 */ 1395 struct slab *slabp = page_get_slab(virt_to_page(objp)); 1396 int objnr; 1397 1398 objnr = (objp - slabp->s_mem) / cachep->objsize; 1399 if (objnr) { 1400 objp = slabp->s_mem + (objnr - 1) * cachep->objsize; 1401 realobj = (char *)objp + obj_dbghead(cachep); 1402 printk(KERN_ERR "Prev obj: start=%p, len=%d\n", 1403 realobj, size); 1404 print_objinfo(cachep, objp, 2); 1405 } 1406 if (objnr + 1 < cachep->num) { 1407 objp = slabp->s_mem + (objnr + 1) * cachep->objsize; 1408 realobj = (char *)objp + obj_dbghead(cachep); 1409 printk(KERN_ERR "Next obj: start=%p, len=%d\n", 1410 realobj, size); 1411 print_objinfo(cachep, objp, 2); 1412 } 1413 } 1414 } 1415 #endif 1416 1417 /* Destroy all the objs in a slab, and release the mem back to the system. 1418 * Before calling the slab must have been unlinked from the cache. 1419 * The cache-lock is not held/needed. 
1420 */ 1421 static void slab_destroy(kmem_cache_t *cachep, struct slab *slabp) 1422 { 1423 void *addr = slabp->s_mem - slabp->colouroff; 1424 1425 #if DEBUG 1426 int i; 1427 for (i = 0; i < cachep->num; i++) { 1428 void *objp = slabp->s_mem + cachep->objsize * i; 1429 1430 if (cachep->flags & SLAB_POISON) { 1431 #ifdef CONFIG_DEBUG_PAGEALLOC 1432 if ((cachep->objsize % PAGE_SIZE) == 0 1433 && OFF_SLAB(cachep)) 1434 kernel_map_pages(virt_to_page(objp), 1435 cachep->objsize / PAGE_SIZE, 1436 1); 1437 else 1438 check_poison_obj(cachep, objp); 1439 #else 1440 check_poison_obj(cachep, objp); 1441 #endif 1442 } 1443 if (cachep->flags & SLAB_RED_ZONE) { 1444 if (*dbg_redzone1(cachep, objp) != RED_INACTIVE) 1445 slab_error(cachep, "start of a freed object " 1446 "was overwritten"); 1447 if (*dbg_redzone2(cachep, objp) != RED_INACTIVE) 1448 slab_error(cachep, "end of a freed object " 1449 "was overwritten"); 1450 } 1451 if (cachep->dtor && !(cachep->flags & SLAB_POISON)) 1452 (cachep->dtor) (objp + obj_dbghead(cachep), cachep, 0); 1453 } 1454 #else 1455 if (cachep->dtor) { 1456 int i; 1457 for (i = 0; i < cachep->num; i++) { 1458 void *objp = slabp->s_mem + cachep->objsize * i; 1459 (cachep->dtor) (objp, cachep, 0); 1460 } 1461 } 1462 #endif 1463 1464 if (unlikely(cachep->flags & SLAB_DESTROY_BY_RCU)) { 1465 struct slab_rcu *slab_rcu; 1466 1467 slab_rcu = (struct slab_rcu *)slabp; 1468 slab_rcu->cachep = cachep; 1469 slab_rcu->addr = addr; 1470 call_rcu(&slab_rcu->head, kmem_rcu_free); 1471 } else { 1472 kmem_freepages(cachep, addr); 1473 if (OFF_SLAB(cachep)) 1474 kmem_cache_free(cachep->slabp_cache, slabp); 1475 } 1476 } 1477 1478 /* For setting up all the kmem_list3s for cache whose objsize is same 1479 as size of kmem_list3. */ 1480 static inline void set_up_list3s(kmem_cache_t *cachep, int index) 1481 { 1482 int node; 1483 1484 for_each_online_node(node) { 1485 cachep->nodelists[node] = &initkmem_list3[index + node]; 1486 cachep->nodelists[node]->next_reap = jiffies + 1487 REAPTIMEOUT_LIST3 + 1488 ((unsigned long)cachep) % REAPTIMEOUT_LIST3; 1489 } 1490 } 1491 1492 /** 1493 * calculate_slab_order - calculate size (page order) of slabs and the number 1494 * of objects per slab. 1495 * 1496 * This could be made much more intelligent. For now, try to avoid using 1497 * high order pages for slabs. When the gfp() functions are more friendly 1498 * towards high-order requests, this should be changed. 1499 */ 1500 static inline size_t calculate_slab_order(kmem_cache_t *cachep, size_t size, 1501 size_t align, gfp_t flags) 1502 { 1503 size_t left_over = 0; 1504 1505 for (;; cachep->gfporder++) { 1506 unsigned int num; 1507 size_t remainder; 1508 1509 if (cachep->gfporder > MAX_GFP_ORDER) { 1510 cachep->num = 0; 1511 break; 1512 } 1513 1514 cache_estimate(cachep->gfporder, size, align, flags, 1515 &remainder, &num); 1516 if (!num) 1517 continue; 1518 /* More than offslab_limit objects will cause problems */ 1519 if (flags & CFLGS_OFF_SLAB && cachep->num > offslab_limit) 1520 break; 1521 1522 cachep->num = num; 1523 left_over = remainder; 1524 1525 /* 1526 * Large number of objects is good, but very large slabs are 1527 * currently bad for the gfp()s. 1528 */ 1529 if (cachep->gfporder >= slab_break_gfp_order) 1530 break; 1531 1532 if ((left_over * 8) <= (PAGE_SIZE << cachep->gfporder)) 1533 /* Acceptable internal fragmentation */ 1534 break; 1535 } 1536 return left_over; 1537 } 1538 1539 /** 1540 * kmem_cache_create - Create a cache. 
 * @name: A string which is used in /proc/slabinfo to identify this cache.
 * @size: The size of objects to be created in this cache.
 * @align: The required alignment for the objects.
 * @flags: SLAB flags
 * @ctor: A constructor for the objects.
 * @dtor: A destructor for the objects.
 *
 * Returns a ptr to the cache on success, NULL on failure.
 * Cannot be called within an interrupt, but can be interrupted.
 * The @ctor is run when new pages are allocated by the cache
 * and the @dtor is run before the pages are handed back.
 *
 * @name must be valid until the cache is destroyed. This implies that
 * the module calling this has to destroy the cache before getting
 * unloaded.
 *
 * The flags are
 *
 * %SLAB_POISON - Poison the slab with a known test pattern (a5a5a5a5)
 * to catch references to uninitialised memory.
 *
 * %SLAB_RED_ZONE - Insert `Red' zones around the allocated memory to check
 * for buffer overruns.
 *
 * %SLAB_NO_REAP - Don't automatically reap this cache when we're under
 * memory pressure.
 *
 * %SLAB_HWCACHE_ALIGN - Align the objects in this cache to a hardware
 * cacheline.  This can be beneficial if you're counting cycles as closely
 * as davem.
 */
kmem_cache_t *
kmem_cache_create (const char *name, size_t size, size_t align,
	unsigned long flags, void (*ctor)(void*, kmem_cache_t *, unsigned long),
	void (*dtor)(void*, kmem_cache_t *, unsigned long))
{
	size_t left_over, slab_size, ralign;
	kmem_cache_t *cachep = NULL;
	struct list_head *p;

	/*
	 * Sanity checks... these are all serious usage bugs.
	 */
	if ((!name) ||
	    in_interrupt() ||
	    (size < BYTES_PER_WORD) ||
	    (size > (1 << MAX_OBJ_ORDER) * PAGE_SIZE) || (dtor && !ctor)) {
		printk(KERN_ERR "%s: Early error in slab %s\n",
		       __FUNCTION__, name);
		BUG();
	}

	down(&cache_chain_sem);

	list_for_each(p, &cache_chain) {
		kmem_cache_t *pc = list_entry(p, kmem_cache_t, next);
		mm_segment_t old_fs = get_fs();
		char tmp;
		int res;

		/*
		 * This happens when the module gets unloaded and doesn't
		 * destroy its slab cache and no-one else reuses the vmalloc
		 * area of the module. Print a warning.
		 */
		set_fs(KERNEL_DS);
		res = __get_user(tmp, pc->name);
		set_fs(old_fs);
		if (res) {
			printk("SLAB: cache with size %d has lost its name\n",
			       pc->objsize);
			continue;
		}

		if (!strcmp(pc->name, name)) {
			printk("kmem_cache_create: duplicate cache %s\n", name);
			dump_stack();
			goto oops;
		}
	}

#if DEBUG
	WARN_ON(strchr(name, ' '));	/* It confuses parsers */
	if ((flags & SLAB_DEBUG_INITIAL) && !ctor) {
		/* No constructor, but initial state check requested */
		printk(KERN_ERR "%s: No con, but init state check "
		       "requested - %s\n", __FUNCTION__, name);
		flags &= ~SLAB_DEBUG_INITIAL;
	}
#if FORCED_DEBUG
	/*
	 * Enable redzoning and last user accounting, except for caches with
	 * large objects, if the increased size would increase the object size
	 * above the next power of two: caches with object sizes just above a
	 * power of two have a significant amount of internal fragmentation.
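	 *
	 * Editorial worked example, not in the original: with 8-byte words
	 * the debug padding is 3 * BYTES_PER_WORD = 24 bytes (two red-zone
	 * words plus the last-user pointer).  Objects under 4096 bytes are
	 * always debugged by the first test below.  A 5000-byte object has
	 * fls(4999) == fls(5023) == 13, so the padded object stays under
	 * 8192 bytes and debugging is enabled; an 8180-byte object has
	 * fls(8179) == 13 but fls(8203) == 14, so the padding would cross
	 * into the next power of two and the flags are left off.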
1636 */ 1637 if ((size < 4096 1638 || fls(size - 1) == fls(size - 1 + 3 * BYTES_PER_WORD))) 1639 flags |= SLAB_RED_ZONE | SLAB_STORE_USER; 1640 if (!(flags & SLAB_DESTROY_BY_RCU)) 1641 flags |= SLAB_POISON; 1642 #endif 1643 if (flags & SLAB_DESTROY_BY_RCU) 1644 BUG_ON(flags & SLAB_POISON); 1645 #endif 1646 if (flags & SLAB_DESTROY_BY_RCU) 1647 BUG_ON(dtor); 1648 1649 /* 1650 * Always checks flags, a caller might be expecting debug 1651 * support which isn't available. 1652 */ 1653 if (flags & ~CREATE_MASK) 1654 BUG(); 1655 1656 /* Check that size is in terms of words. This is needed to avoid 1657 * unaligned accesses for some archs when redzoning is used, and makes 1658 * sure any on-slab bufctl's are also correctly aligned. 1659 */ 1660 if (size & (BYTES_PER_WORD - 1)) { 1661 size += (BYTES_PER_WORD - 1); 1662 size &= ~(BYTES_PER_WORD - 1); 1663 } 1664 1665 /* calculate out the final buffer alignment: */ 1666 /* 1) arch recommendation: can be overridden for debug */ 1667 if (flags & SLAB_HWCACHE_ALIGN) { 1668 /* Default alignment: as specified by the arch code. 1669 * Except if an object is really small, then squeeze multiple 1670 * objects into one cacheline. 1671 */ 1672 ralign = cache_line_size(); 1673 while (size <= ralign / 2) 1674 ralign /= 2; 1675 } else { 1676 ralign = BYTES_PER_WORD; 1677 } 1678 /* 2) arch mandated alignment: disables debug if necessary */ 1679 if (ralign < ARCH_SLAB_MINALIGN) { 1680 ralign = ARCH_SLAB_MINALIGN; 1681 if (ralign > BYTES_PER_WORD) 1682 flags &= ~(SLAB_RED_ZONE | SLAB_STORE_USER); 1683 } 1684 /* 3) caller mandated alignment: disables debug if necessary */ 1685 if (ralign < align) { 1686 ralign = align; 1687 if (ralign > BYTES_PER_WORD) 1688 flags &= ~(SLAB_RED_ZONE | SLAB_STORE_USER); 1689 } 1690 /* 4) Store it. Note that the debug code below can reduce 1691 * the alignment to BYTES_PER_WORD. 1692 */ 1693 align = ralign; 1694 1695 /* Get cache's description obj. */ 1696 cachep = (kmem_cache_t *) kmem_cache_alloc(&cache_cache, SLAB_KERNEL); 1697 if (!cachep) 1698 goto oops; 1699 memset(cachep, 0, sizeof(kmem_cache_t)); 1700 1701 #if DEBUG 1702 cachep->reallen = size; 1703 1704 if (flags & SLAB_RED_ZONE) { 1705 /* redzoning only works with word aligned caches */ 1706 align = BYTES_PER_WORD; 1707 1708 /* add space for red zone words */ 1709 cachep->dbghead += BYTES_PER_WORD; 1710 size += 2 * BYTES_PER_WORD; 1711 } 1712 if (flags & SLAB_STORE_USER) { 1713 /* user store requires word alignment and 1714 * one word storage behind the end of the real 1715 * object. 1716 */ 1717 align = BYTES_PER_WORD; 1718 size += BYTES_PER_WORD; 1719 } 1720 #if FORCED_DEBUG && defined(CONFIG_DEBUG_PAGEALLOC) 1721 if (size >= malloc_sizes[INDEX_L3 + 1].cs_size 1722 && cachep->reallen > cache_line_size() && size < PAGE_SIZE) { 1723 cachep->dbghead += PAGE_SIZE - size; 1724 size = PAGE_SIZE; 1725 } 1726 #endif 1727 #endif 1728 1729 /* Determine if the slab management is 'on' or 'off' slab. */ 1730 if (size >= (PAGE_SIZE >> 3)) 1731 /* 1732 * Size is large, assume best to place the slab management obj 1733 * off-slab (should allow better packing of objs). 1734 */ 1735 flags |= CFLGS_OFF_SLAB; 1736 1737 size = ALIGN(size, align); 1738 1739 if ((flags & SLAB_RECLAIM_ACCOUNT) && size <= PAGE_SIZE) { 1740 /* 1741 * A VFS-reclaimable slab tends to have most allocations 1742 * as GFP_NOFS and we really don't want to have to be allocating 1743 * higher-order pages when we are unable to shrink dcache. 
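	 *
	 * (Editorial aside, not in the original: a worked example for the
	 * SLAB_HWCACHE_ALIGN loop earlier in this function.  With a 64-byte
	 * cache line and a 20-byte object, ralign is halved while
	 * size <= ralign / 2: 20 <= 32 gives 32, 20 <= 16 fails, so ralign
	 * settles at 32 and two objects share each cache line; a 100-byte
	 * object keeps the full 64-byte alignment.)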
	 */
		cachep->gfporder = 0;
		cache_estimate(cachep->gfporder, size, align, flags,
			       &left_over, &cachep->num);
	} else
		left_over = calculate_slab_order(cachep, size, align, flags);

	if (!cachep->num) {
		printk("kmem_cache_create: couldn't create cache %s.\n", name);
		kmem_cache_free(&cache_cache, cachep);
		cachep = NULL;
		goto oops;
	}
	slab_size = ALIGN(cachep->num * sizeof(kmem_bufctl_t)
			  + sizeof(struct slab), align);

	/*
	 * If the slab has been placed off-slab, and we have enough space then
	 * move it on-slab. This is at the expense of any extra colouring.
	 */
	if (flags & CFLGS_OFF_SLAB && left_over >= slab_size) {
		flags &= ~CFLGS_OFF_SLAB;
		left_over -= slab_size;
	}

	if (flags & CFLGS_OFF_SLAB) {
		/* really off slab. No need for manual alignment */
		slab_size =
		    cachep->num * sizeof(kmem_bufctl_t) + sizeof(struct slab);
	}

	cachep->colour_off = cache_line_size();
	/* Offset must be a multiple of the alignment. */
	if (cachep->colour_off < align)
		cachep->colour_off = align;
	cachep->colour = left_over / cachep->colour_off;
	cachep->slab_size = slab_size;
	cachep->flags = flags;
	cachep->gfpflags = 0;
	if (flags & SLAB_CACHE_DMA)
		cachep->gfpflags |= GFP_DMA;
	spin_lock_init(&cachep->spinlock);
	cachep->objsize = size;

	if (flags & CFLGS_OFF_SLAB)
		cachep->slabp_cache = kmem_find_general_cachep(slab_size, 0u);
	cachep->ctor = ctor;
	cachep->dtor = dtor;
	cachep->name = name;

	/* Don't let CPUs come and go */
	lock_cpu_hotplug();

	if (g_cpucache_up == FULL) {
		enable_cpucache(cachep);
	} else {
		if (g_cpucache_up == NONE) {
			/* Note: the first kmem_cache_create must create
			 * the cache that's used by kmalloc(24), otherwise
			 * the creation of further caches will BUG().
			 */
			cachep->array[smp_processor_id()] =
			    &initarray_generic.cache;

			/* If the cache that's used by
			 * kmalloc(sizeof(kmem_list3)) is the first cache,
			 * then we need to set up all its list3s, otherwise
			 * the creation of further caches will BUG().
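			 *
			 * Editorial summary, not in the original, of the
			 * bootstrap states used below:
			 *
			 *	NONE       - only the static __initdata
			 *	             arrays and list3s are usable
			 *	PARTIAL_AC - the cache for
			 *	             sizeof(struct arraycache_init)
			 *	             exists, so head arrays can be
			 *	             kmalloc'ed
			 *	PARTIAL_L3 - the sizeof(struct kmem_list3)
			 *	             cache exists too, so per-node
			 *	             lists can be kmalloc_node'ed
			 *	FULL       - kmem_cache_init() has finished
			 *	             and enable_cpucache() works
			 *
			 * Each kmem_cache_create() call picks its strategy
			 * from the current state, as the branches below show.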
1812 */ 1813 set_up_list3s(cachep, SIZE_AC); 1814 if (INDEX_AC == INDEX_L3) 1815 g_cpucache_up = PARTIAL_L3; 1816 else 1817 g_cpucache_up = PARTIAL_AC; 1818 } else { 1819 cachep->array[smp_processor_id()] = 1820 kmalloc(sizeof(struct arraycache_init), GFP_KERNEL); 1821 1822 if (g_cpucache_up == PARTIAL_AC) { 1823 set_up_list3s(cachep, SIZE_L3); 1824 g_cpucache_up = PARTIAL_L3; 1825 } else { 1826 int node; 1827 for_each_online_node(node) { 1828 1829 cachep->nodelists[node] = 1830 kmalloc_node(sizeof 1831 (struct kmem_list3), 1832 GFP_KERNEL, node); 1833 BUG_ON(!cachep->nodelists[node]); 1834 kmem_list3_init(cachep-> 1835 nodelists[node]); 1836 } 1837 } 1838 } 1839 cachep->nodelists[numa_node_id()]->next_reap = 1840 jiffies + REAPTIMEOUT_LIST3 + 1841 ((unsigned long)cachep) % REAPTIMEOUT_LIST3; 1842 1843 BUG_ON(!ac_data(cachep)); 1844 ac_data(cachep)->avail = 0; 1845 ac_data(cachep)->limit = BOOT_CPUCACHE_ENTRIES; 1846 ac_data(cachep)->batchcount = 1; 1847 ac_data(cachep)->touched = 0; 1848 cachep->batchcount = 1; 1849 cachep->limit = BOOT_CPUCACHE_ENTRIES; 1850 } 1851 1852 /* cache setup completed, link it into the list */ 1853 list_add(&cachep->next, &cache_chain); 1854 unlock_cpu_hotplug(); 1855 oops: 1856 if (!cachep && (flags & SLAB_PANIC)) 1857 panic("kmem_cache_create(): failed to create slab `%s'\n", 1858 name); 1859 up(&cache_chain_sem); 1860 return cachep; 1861 } 1862 EXPORT_SYMBOL(kmem_cache_create); 1863 1864 #if DEBUG 1865 static void check_irq_off(void) 1866 { 1867 BUG_ON(!irqs_disabled()); 1868 } 1869 1870 static void check_irq_on(void) 1871 { 1872 BUG_ON(irqs_disabled()); 1873 } 1874 1875 static void check_spinlock_acquired(kmem_cache_t *cachep) 1876 { 1877 #ifdef CONFIG_SMP 1878 check_irq_off(); 1879 assert_spin_locked(&cachep->nodelists[numa_node_id()]->list_lock); 1880 #endif 1881 } 1882 1883 static inline void check_spinlock_acquired_node(kmem_cache_t *cachep, int node) 1884 { 1885 #ifdef CONFIG_SMP 1886 check_irq_off(); 1887 assert_spin_locked(&cachep->nodelists[node]->list_lock); 1888 #endif 1889 } 1890 1891 #else 1892 #define check_irq_off() do { } while(0) 1893 #define check_irq_on() do { } while(0) 1894 #define check_spinlock_acquired(x) do { } while(0) 1895 #define check_spinlock_acquired_node(x, y) do { } while(0) 1896 #endif 1897 1898 /* 1899 * Waits for all CPUs to execute func(). 
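 *
 * Editorial sketch, not in the original: func() here is typically do_drain(),
 * and the per-cpu array it flushes is a plain LIFO stack on the fast paths:
 *
 *	objp = ac->entry[--ac->avail];		(allocation hot path)
 *	ac->entry[ac->avail++] = objp;		(free hot path)
 *
 * so a drain just hands entry[0..avail-1] back to the node lists via
 * free_block() and resets avail to 0.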
1900 */ 1901 static void smp_call_function_all_cpus(void (*func)(void *arg), void *arg) 1902 { 1903 check_irq_on(); 1904 preempt_disable(); 1905 1906 local_irq_disable(); 1907 func(arg); 1908 local_irq_enable(); 1909 1910 if (smp_call_function(func, arg, 1, 1)) 1911 BUG(); 1912 1913 preempt_enable(); 1914 } 1915 1916 static void drain_array_locked(kmem_cache_t *cachep, struct array_cache *ac, 1917 int force, int node); 1918 1919 static void do_drain(void *arg) 1920 { 1921 kmem_cache_t *cachep = (kmem_cache_t *) arg; 1922 struct array_cache *ac; 1923 int node = numa_node_id(); 1924 1925 check_irq_off(); 1926 ac = ac_data(cachep); 1927 spin_lock(&cachep->nodelists[node]->list_lock); 1928 free_block(cachep, ac->entry, ac->avail, node); 1929 spin_unlock(&cachep->nodelists[node]->list_lock); 1930 ac->avail = 0; 1931 } 1932 1933 static void drain_cpu_caches(kmem_cache_t *cachep) 1934 { 1935 struct kmem_list3 *l3; 1936 int node; 1937 1938 smp_call_function_all_cpus(do_drain, cachep); 1939 check_irq_on(); 1940 spin_lock_irq(&cachep->spinlock); 1941 for_each_online_node(node) { 1942 l3 = cachep->nodelists[node]; 1943 if (l3) { 1944 spin_lock(&l3->list_lock); 1945 drain_array_locked(cachep, l3->shared, 1, node); 1946 spin_unlock(&l3->list_lock); 1947 if (l3->alien) 1948 drain_alien_cache(cachep, l3); 1949 } 1950 } 1951 spin_unlock_irq(&cachep->spinlock); 1952 } 1953 1954 static int __node_shrink(kmem_cache_t *cachep, int node) 1955 { 1956 struct slab *slabp; 1957 struct kmem_list3 *l3 = cachep->nodelists[node]; 1958 int ret; 1959 1960 for (;;) { 1961 struct list_head *p; 1962 1963 p = l3->slabs_free.prev; 1964 if (p == &l3->slabs_free) 1965 break; 1966 1967 slabp = list_entry(l3->slabs_free.prev, struct slab, list); 1968 #if DEBUG 1969 if (slabp->inuse) 1970 BUG(); 1971 #endif 1972 list_del(&slabp->list); 1973 1974 l3->free_objects -= cachep->num; 1975 spin_unlock_irq(&l3->list_lock); 1976 slab_destroy(cachep, slabp); 1977 spin_lock_irq(&l3->list_lock); 1978 } 1979 ret = !list_empty(&l3->slabs_full) || !list_empty(&l3->slabs_partial); 1980 return ret; 1981 } 1982 1983 static int __cache_shrink(kmem_cache_t *cachep) 1984 { 1985 int ret = 0, i = 0; 1986 struct kmem_list3 *l3; 1987 1988 drain_cpu_caches(cachep); 1989 1990 check_irq_on(); 1991 for_each_online_node(i) { 1992 l3 = cachep->nodelists[i]; 1993 if (l3) { 1994 spin_lock_irq(&l3->list_lock); 1995 ret += __node_shrink(cachep, i); 1996 spin_unlock_irq(&l3->list_lock); 1997 } 1998 } 1999 return (ret ? 1 : 0); 2000 } 2001 2002 /** 2003 * kmem_cache_shrink - Shrink a cache. 2004 * @cachep: The cache to shrink. 2005 * 2006 * Releases as many slabs as possible for a cache. 2007 * To help debugging, a zero exit status indicates all slabs were released. 2008 */ 2009 int kmem_cache_shrink(kmem_cache_t *cachep) 2010 { 2011 if (!cachep || in_interrupt()) 2012 BUG(); 2013 2014 return __cache_shrink(cachep); 2015 } 2016 EXPORT_SYMBOL(kmem_cache_shrink); 2017 2018 /** 2019 * kmem_cache_destroy - delete a cache 2020 * @cachep: the cache to destroy 2021 * 2022 * Remove a kmem_cache_t object from the slab cache. 2023 * Returns 0 on success. 2024 * 2025 * It is expected this function will be called by a module when it is 2026 * unloaded. This will remove the cache completely, and avoid a duplicate 2027 * cache being allocated each time a module is loaded and unloaded, if the 2028 * module doesn't have persistent in-kernel storage across loads and unloads. 2029 * 2030 * The cache must be empty before calling this function. 
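 *
 * For reference, the usual lifecycle looks roughly like the
 * sketch below (cache and structure names are made up):
 *
 *	static kmem_cache_t *foo_cachep;
 *
 *	foo_cachep = kmem_cache_create("foo_cache",
 *				sizeof(struct foo), 0,
 *				SLAB_HWCACHE_ALIGN, NULL, NULL);
 *	obj = kmem_cache_alloc(foo_cachep, GFP_KERNEL);
 *	...
 *	kmem_cache_free(foo_cachep, obj);
 *	...
 *	if (kmem_cache_destroy(foo_cachep))
 *		printk(KERN_ERR "foo_cache: objects still in use\n");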
2031 * 2032 * The caller must guarantee that noone will allocate memory from the cache 2033 * during the kmem_cache_destroy(). 2034 */ 2035 int kmem_cache_destroy(kmem_cache_t *cachep) 2036 { 2037 int i; 2038 struct kmem_list3 *l3; 2039 2040 if (!cachep || in_interrupt()) 2041 BUG(); 2042 2043 /* Don't let CPUs to come and go */ 2044 lock_cpu_hotplug(); 2045 2046 /* Find the cache in the chain of caches. */ 2047 down(&cache_chain_sem); 2048 /* 2049 * the chain is never empty, cache_cache is never destroyed 2050 */ 2051 list_del(&cachep->next); 2052 up(&cache_chain_sem); 2053 2054 if (__cache_shrink(cachep)) { 2055 slab_error(cachep, "Can't free all objects"); 2056 down(&cache_chain_sem); 2057 list_add(&cachep->next, &cache_chain); 2058 up(&cache_chain_sem); 2059 unlock_cpu_hotplug(); 2060 return 1; 2061 } 2062 2063 if (unlikely(cachep->flags & SLAB_DESTROY_BY_RCU)) 2064 synchronize_rcu(); 2065 2066 for_each_online_cpu(i) 2067 kfree(cachep->array[i]); 2068 2069 /* NUMA: free the list3 structures */ 2070 for_each_online_node(i) { 2071 if ((l3 = cachep->nodelists[i])) { 2072 kfree(l3->shared); 2073 free_alien_cache(l3->alien); 2074 kfree(l3); 2075 } 2076 } 2077 kmem_cache_free(&cache_cache, cachep); 2078 2079 unlock_cpu_hotplug(); 2080 2081 return 0; 2082 } 2083 EXPORT_SYMBOL(kmem_cache_destroy); 2084 2085 /* Get the memory for a slab management obj. */ 2086 static struct slab *alloc_slabmgmt(kmem_cache_t *cachep, void *objp, 2087 int colour_off, gfp_t local_flags) 2088 { 2089 struct slab *slabp; 2090 2091 if (OFF_SLAB(cachep)) { 2092 /* Slab management obj is off-slab. */ 2093 slabp = kmem_cache_alloc(cachep->slabp_cache, local_flags); 2094 if (!slabp) 2095 return NULL; 2096 } else { 2097 slabp = objp + colour_off; 2098 colour_off += cachep->slab_size; 2099 } 2100 slabp->inuse = 0; 2101 slabp->colouroff = colour_off; 2102 slabp->s_mem = objp + colour_off; 2103 2104 return slabp; 2105 } 2106 2107 static inline kmem_bufctl_t *slab_bufctl(struct slab *slabp) 2108 { 2109 return (kmem_bufctl_t *) (slabp + 1); 2110 } 2111 2112 static void cache_init_objs(kmem_cache_t *cachep, 2113 struct slab *slabp, unsigned long ctor_flags) 2114 { 2115 int i; 2116 2117 for (i = 0; i < cachep->num; i++) { 2118 void *objp = slabp->s_mem + cachep->objsize * i; 2119 #if DEBUG 2120 /* need to poison the objs? */ 2121 if (cachep->flags & SLAB_POISON) 2122 poison_obj(cachep, objp, POISON_FREE); 2123 if (cachep->flags & SLAB_STORE_USER) 2124 *dbg_userword(cachep, objp) = NULL; 2125 2126 if (cachep->flags & SLAB_RED_ZONE) { 2127 *dbg_redzone1(cachep, objp) = RED_INACTIVE; 2128 *dbg_redzone2(cachep, objp) = RED_INACTIVE; 2129 } 2130 /* 2131 * Constructors are not allowed to allocate memory from 2132 * the same cache which they are a constructor for. 2133 * Otherwise, deadlock. They must also be threaded. 
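 * A constructor written against this interface therefore looks
 * roughly like the sketch below (the object type is made up).
 * It only initializes the object on a real construction pass,
 * never allocates from its own cache, and must not sleep when
 * SLAB_CTOR_ATOMIC is set:
 *
 *	static void foo_ctor(void *obj, kmem_cache_t *cachep,
 *			     unsigned long flags)
 *	{
 *		struct foo *f = obj;
 *
 *		if ((flags & (SLAB_CTOR_VERIFY | SLAB_CTOR_CONSTRUCTOR))
 *					!= SLAB_CTOR_CONSTRUCTOR)
 *			return;
 *		spin_lock_init(&f->lock);
 *		INIT_LIST_HEAD(&f->list);
 *	}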
2134 */ 2135 if (cachep->ctor && !(cachep->flags & SLAB_POISON)) 2136 cachep->ctor(objp + obj_dbghead(cachep), cachep, 2137 ctor_flags); 2138 2139 if (cachep->flags & SLAB_RED_ZONE) { 2140 if (*dbg_redzone2(cachep, objp) != RED_INACTIVE) 2141 slab_error(cachep, "constructor overwrote the" 2142 " end of an object"); 2143 if (*dbg_redzone1(cachep, objp) != RED_INACTIVE) 2144 slab_error(cachep, "constructor overwrote the" 2145 " start of an object"); 2146 } 2147 if ((cachep->objsize % PAGE_SIZE) == 0 && OFF_SLAB(cachep) 2148 && cachep->flags & SLAB_POISON) 2149 kernel_map_pages(virt_to_page(objp), 2150 cachep->objsize / PAGE_SIZE, 0); 2151 #else 2152 if (cachep->ctor) 2153 cachep->ctor(objp, cachep, ctor_flags); 2154 #endif 2155 slab_bufctl(slabp)[i] = i + 1; 2156 } 2157 slab_bufctl(slabp)[i - 1] = BUFCTL_END; 2158 slabp->free = 0; 2159 } 2160 2161 static void kmem_flagcheck(kmem_cache_t *cachep, gfp_t flags) 2162 { 2163 if (flags & SLAB_DMA) { 2164 if (!(cachep->gfpflags & GFP_DMA)) 2165 BUG(); 2166 } else { 2167 if (cachep->gfpflags & GFP_DMA) 2168 BUG(); 2169 } 2170 } 2171 2172 static void set_slab_attr(kmem_cache_t *cachep, struct slab *slabp, void *objp) 2173 { 2174 int i; 2175 struct page *page; 2176 2177 /* Nasty!!!!!! I hope this is OK. */ 2178 i = 1 << cachep->gfporder; 2179 page = virt_to_page(objp); 2180 do { 2181 page_set_cache(page, cachep); 2182 page_set_slab(page, slabp); 2183 page++; 2184 } while (--i); 2185 } 2186 2187 /* 2188 * Grow (by 1) the number of slabs within a cache. This is called by 2189 * kmem_cache_alloc() when there are no active objs left in a cache. 2190 */ 2191 static int cache_grow(kmem_cache_t *cachep, gfp_t flags, int nodeid) 2192 { 2193 struct slab *slabp; 2194 void *objp; 2195 size_t offset; 2196 gfp_t local_flags; 2197 unsigned long ctor_flags; 2198 struct kmem_list3 *l3; 2199 2200 /* Be lazy and only check for valid flags here, 2201 * keeping it out of the critical path in kmem_cache_alloc(). 2202 */ 2203 if (flags & ~(SLAB_DMA | SLAB_LEVEL_MASK | SLAB_NO_GROW)) 2204 BUG(); 2205 if (flags & SLAB_NO_GROW) 2206 return 0; 2207 2208 ctor_flags = SLAB_CTOR_CONSTRUCTOR; 2209 local_flags = (flags & SLAB_LEVEL_MASK); 2210 if (!(local_flags & __GFP_WAIT)) 2211 /* 2212 * Not allowed to sleep. Need to tell a constructor about 2213 * this - it might need to know... 2214 */ 2215 ctor_flags |= SLAB_CTOR_ATOMIC; 2216 2217 /* About to mess with non-constant members - lock. */ 2218 check_irq_off(); 2219 spin_lock(&cachep->spinlock); 2220 2221 /* Get colour for the slab, and cal the next value. */ 2222 offset = cachep->colour_next; 2223 cachep->colour_next++; 2224 if (cachep->colour_next >= cachep->colour) 2225 cachep->colour_next = 0; 2226 offset *= cachep->colour_off; 2227 2228 spin_unlock(&cachep->spinlock); 2229 2230 check_irq_off(); 2231 if (local_flags & __GFP_WAIT) 2232 local_irq_enable(); 2233 2234 /* 2235 * The test for missing atomic flag is performed here, rather than 2236 * the more obvious place, simply to reduce the critical path length 2237 * in kmem_cache_alloc(). If a caller is seriously mis-behaving they 2238 * will eventually be caught here (where it matters). 2239 */ 2240 kmem_flagcheck(cachep, flags); 2241 2242 /* Get mem for the objs. 2243 * Attempt to allocate a physical page from 'nodeid', 2244 */ 2245 if (!(objp = kmem_getpages(cachep, flags, nodeid))) 2246 goto failed; 2247 2248 /* Get slab management. 
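 * For an on-slab cache the management structure occupies the
 * start of the slab's own pages, just after the colour offset,
 * so the page layout is
 *
 *	| colour | struct slab | kmem_bufctl_t[num] | objects ... |
 *
 * with slab_size covering the struct slab plus the bufctl array
 * (aligned as computed in kmem_cache_create()).  For an
 * off-slab cache (CFLGS_OFF_SLAB) the management structure is
 * allocated from cachep->slabp_cache instead and the objects
 * begin right at objp plus the colour offset.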
*/ 2249 if (!(slabp = alloc_slabmgmt(cachep, objp, offset, local_flags))) 2250 goto opps1; 2251 2252 slabp->nodeid = nodeid; 2253 set_slab_attr(cachep, slabp, objp); 2254 2255 cache_init_objs(cachep, slabp, ctor_flags); 2256 2257 if (local_flags & __GFP_WAIT) 2258 local_irq_disable(); 2259 check_irq_off(); 2260 l3 = cachep->nodelists[nodeid]; 2261 spin_lock(&l3->list_lock); 2262 2263 /* Make slab active. */ 2264 list_add_tail(&slabp->list, &(l3->slabs_free)); 2265 STATS_INC_GROWN(cachep); 2266 l3->free_objects += cachep->num; 2267 spin_unlock(&l3->list_lock); 2268 return 1; 2269 opps1: 2270 kmem_freepages(cachep, objp); 2271 failed: 2272 if (local_flags & __GFP_WAIT) 2273 local_irq_disable(); 2274 return 0; 2275 } 2276 2277 #if DEBUG 2278 2279 /* 2280 * Perform extra freeing checks: 2281 * - detect bad pointers. 2282 * - POISON/RED_ZONE checking 2283 * - destructor calls, for caches with POISON+dtor 2284 */ 2285 static void kfree_debugcheck(const void *objp) 2286 { 2287 struct page *page; 2288 2289 if (!virt_addr_valid(objp)) { 2290 printk(KERN_ERR "kfree_debugcheck: out of range ptr %lxh.\n", 2291 (unsigned long)objp); 2292 BUG(); 2293 } 2294 page = virt_to_page(objp); 2295 if (!PageSlab(page)) { 2296 printk(KERN_ERR "kfree_debugcheck: bad ptr %lxh.\n", 2297 (unsigned long)objp); 2298 BUG(); 2299 } 2300 } 2301 2302 static void *cache_free_debugcheck(kmem_cache_t *cachep, void *objp, 2303 void *caller) 2304 { 2305 struct page *page; 2306 unsigned int objnr; 2307 struct slab *slabp; 2308 2309 objp -= obj_dbghead(cachep); 2310 kfree_debugcheck(objp); 2311 page = virt_to_page(objp); 2312 2313 if (page_get_cache(page) != cachep) { 2314 printk(KERN_ERR 2315 "mismatch in kmem_cache_free: expected cache %p, got %p\n", 2316 page_get_cache(page), cachep); 2317 printk(KERN_ERR "%p is %s.\n", cachep, cachep->name); 2318 printk(KERN_ERR "%p is %s.\n", page_get_cache(page), 2319 page_get_cache(page)->name); 2320 WARN_ON(1); 2321 } 2322 slabp = page_get_slab(page); 2323 2324 if (cachep->flags & SLAB_RED_ZONE) { 2325 if (*dbg_redzone1(cachep, objp) != RED_ACTIVE 2326 || *dbg_redzone2(cachep, objp) != RED_ACTIVE) { 2327 slab_error(cachep, 2328 "double free, or memory outside" 2329 " object was overwritten"); 2330 printk(KERN_ERR 2331 "%p: redzone 1: 0x%lx, redzone 2: 0x%lx.\n", 2332 objp, *dbg_redzone1(cachep, objp), 2333 *dbg_redzone2(cachep, objp)); 2334 } 2335 *dbg_redzone1(cachep, objp) = RED_INACTIVE; 2336 *dbg_redzone2(cachep, objp) = RED_INACTIVE; 2337 } 2338 if (cachep->flags & SLAB_STORE_USER) 2339 *dbg_userword(cachep, objp) = caller; 2340 2341 objnr = (objp - slabp->s_mem) / cachep->objsize; 2342 2343 BUG_ON(objnr >= cachep->num); 2344 BUG_ON(objp != slabp->s_mem + objnr * cachep->objsize); 2345 2346 if (cachep->flags & SLAB_DEBUG_INITIAL) { 2347 /* Need to call the slab's constructor so the 2348 * caller can perform a verify of its state (debugging). 2349 * Called without the cache-lock held. 
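 * Constructors that do not implement such verification are
 * expected to check for SLAB_CTOR_VERIFY and return without
 * touching the object (cf. the constructor sketch earlier in
 * this file).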
2350 */ 2351 cachep->ctor(objp + obj_dbghead(cachep), 2352 cachep, SLAB_CTOR_CONSTRUCTOR | SLAB_CTOR_VERIFY); 2353 } 2354 if (cachep->flags & SLAB_POISON && cachep->dtor) { 2355 /* we want to cache poison the object, 2356 * call the destruction callback 2357 */ 2358 cachep->dtor(objp + obj_dbghead(cachep), cachep, 0); 2359 } 2360 if (cachep->flags & SLAB_POISON) { 2361 #ifdef CONFIG_DEBUG_PAGEALLOC 2362 if ((cachep->objsize % PAGE_SIZE) == 0 && OFF_SLAB(cachep)) { 2363 store_stackinfo(cachep, objp, (unsigned long)caller); 2364 kernel_map_pages(virt_to_page(objp), 2365 cachep->objsize / PAGE_SIZE, 0); 2366 } else { 2367 poison_obj(cachep, objp, POISON_FREE); 2368 } 2369 #else 2370 poison_obj(cachep, objp, POISON_FREE); 2371 #endif 2372 } 2373 return objp; 2374 } 2375 2376 static void check_slabp(kmem_cache_t *cachep, struct slab *slabp) 2377 { 2378 kmem_bufctl_t i; 2379 int entries = 0; 2380 2381 /* Check slab's freelist to see if this obj is there. */ 2382 for (i = slabp->free; i != BUFCTL_END; i = slab_bufctl(slabp)[i]) { 2383 entries++; 2384 if (entries > cachep->num || i >= cachep->num) 2385 goto bad; 2386 } 2387 if (entries != cachep->num - slabp->inuse) { 2388 bad: 2389 printk(KERN_ERR 2390 "slab: Internal list corruption detected in cache '%s'(%d), slabp %p(%d). Hexdump:\n", 2391 cachep->name, cachep->num, slabp, slabp->inuse); 2392 for (i = 0; 2393 i < sizeof(slabp) + cachep->num * sizeof(kmem_bufctl_t); 2394 i++) { 2395 if ((i % 16) == 0) 2396 printk("\n%03x:", i); 2397 printk(" %02x", ((unsigned char *)slabp)[i]); 2398 } 2399 printk("\n"); 2400 BUG(); 2401 } 2402 } 2403 #else 2404 #define kfree_debugcheck(x) do { } while(0) 2405 #define cache_free_debugcheck(x,objp,z) (objp) 2406 #define check_slabp(x,y) do { } while(0) 2407 #endif 2408 2409 static void *cache_alloc_refill(kmem_cache_t *cachep, gfp_t flags) 2410 { 2411 int batchcount; 2412 struct kmem_list3 *l3; 2413 struct array_cache *ac; 2414 2415 check_irq_off(); 2416 ac = ac_data(cachep); 2417 retry: 2418 batchcount = ac->batchcount; 2419 if (!ac->touched && batchcount > BATCHREFILL_LIMIT) { 2420 /* if there was little recent activity on this 2421 * cache, then perform only a partial refill. 2422 * Otherwise we could generate refill bouncing. 2423 */ 2424 batchcount = BATCHREFILL_LIMIT; 2425 } 2426 l3 = cachep->nodelists[numa_node_id()]; 2427 2428 BUG_ON(ac->avail > 0 || !l3); 2429 spin_lock(&l3->list_lock); 2430 2431 if (l3->shared) { 2432 struct array_cache *shared_array = l3->shared; 2433 if (shared_array->avail) { 2434 if (batchcount > shared_array->avail) 2435 batchcount = shared_array->avail; 2436 shared_array->avail -= batchcount; 2437 ac->avail = batchcount; 2438 memcpy(ac->entry, 2439 &(shared_array->entry[shared_array->avail]), 2440 sizeof(void *) * batchcount); 2441 shared_array->touched = 1; 2442 goto alloc_done; 2443 } 2444 } 2445 while (batchcount > 0) { 2446 struct list_head *entry; 2447 struct slab *slabp; 2448 /* Get slab alloc is to come from. 
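 * Free objects are kept on an index-linked list threaded
 * through the slab's bufctl array: slabp->free holds the index
 * of the first free object and slab_bufctl(slabp)[i] holds the
 * index of the free object after i, with BUFCTL_END as the
 * terminator.  A freshly grown slab with num == 4 looks like
 *
 *	free == 0,  bufctl[] == { 1, 2, 3, BUFCTL_END }
 *
 * and handing out object 0 below simply advances free to
 * bufctl[0], i.e. 1.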
*/ 2449 entry = l3->slabs_partial.next; 2450 if (entry == &l3->slabs_partial) { 2451 l3->free_touched = 1; 2452 entry = l3->slabs_free.next; 2453 if (entry == &l3->slabs_free) 2454 goto must_grow; 2455 } 2456 2457 slabp = list_entry(entry, struct slab, list); 2458 check_slabp(cachep, slabp); 2459 check_spinlock_acquired(cachep); 2460 while (slabp->inuse < cachep->num && batchcount--) { 2461 kmem_bufctl_t next; 2462 STATS_INC_ALLOCED(cachep); 2463 STATS_INC_ACTIVE(cachep); 2464 STATS_SET_HIGH(cachep); 2465 2466 /* get obj pointer */ 2467 ac->entry[ac->avail++] = slabp->s_mem + 2468 slabp->free * cachep->objsize; 2469 2470 slabp->inuse++; 2471 next = slab_bufctl(slabp)[slabp->free]; 2472 #if DEBUG 2473 slab_bufctl(slabp)[slabp->free] = BUFCTL_FREE; 2474 WARN_ON(numa_node_id() != slabp->nodeid); 2475 #endif 2476 slabp->free = next; 2477 } 2478 check_slabp(cachep, slabp); 2479 2480 /* move slabp to correct slabp list: */ 2481 list_del(&slabp->list); 2482 if (slabp->free == BUFCTL_END) 2483 list_add(&slabp->list, &l3->slabs_full); 2484 else 2485 list_add(&slabp->list, &l3->slabs_partial); 2486 } 2487 2488 must_grow: 2489 l3->free_objects -= ac->avail; 2490 alloc_done: 2491 spin_unlock(&l3->list_lock); 2492 2493 if (unlikely(!ac->avail)) { 2494 int x; 2495 x = cache_grow(cachep, flags, numa_node_id()); 2496 2497 // cache_grow can reenable interrupts, then ac could change. 2498 ac = ac_data(cachep); 2499 if (!x && ac->avail == 0) // no objects in sight? abort 2500 return NULL; 2501 2502 if (!ac->avail) // objects refilled by interrupt? 2503 goto retry; 2504 } 2505 ac->touched = 1; 2506 return ac->entry[--ac->avail]; 2507 } 2508 2509 static inline void 2510 cache_alloc_debugcheck_before(kmem_cache_t *cachep, gfp_t flags) 2511 { 2512 might_sleep_if(flags & __GFP_WAIT); 2513 #if DEBUG 2514 kmem_flagcheck(cachep, flags); 2515 #endif 2516 } 2517 2518 #if DEBUG 2519 static void *cache_alloc_debugcheck_after(kmem_cache_t *cachep, gfp_t flags, 2520 void *objp, void *caller) 2521 { 2522 if (!objp) 2523 return objp; 2524 if (cachep->flags & SLAB_POISON) { 2525 #ifdef CONFIG_DEBUG_PAGEALLOC 2526 if ((cachep->objsize % PAGE_SIZE) == 0 && OFF_SLAB(cachep)) 2527 kernel_map_pages(virt_to_page(objp), 2528 cachep->objsize / PAGE_SIZE, 1); 2529 else 2530 check_poison_obj(cachep, objp); 2531 #else 2532 check_poison_obj(cachep, objp); 2533 #endif 2534 poison_obj(cachep, objp, POISON_INUSE); 2535 } 2536 if (cachep->flags & SLAB_STORE_USER) 2537 *dbg_userword(cachep, objp) = caller; 2538 2539 if (cachep->flags & SLAB_RED_ZONE) { 2540 if (*dbg_redzone1(cachep, objp) != RED_INACTIVE 2541 || *dbg_redzone2(cachep, objp) != RED_INACTIVE) { 2542 slab_error(cachep, 2543 "double free, or memory outside" 2544 " object was overwritten"); 2545 printk(KERN_ERR 2546 "%p: redzone 1: 0x%lx, redzone 2: 0x%lx.\n", 2547 objp, *dbg_redzone1(cachep, objp), 2548 *dbg_redzone2(cachep, objp)); 2549 } 2550 *dbg_redzone1(cachep, objp) = RED_ACTIVE; 2551 *dbg_redzone2(cachep, objp) = RED_ACTIVE; 2552 } 2553 objp += obj_dbghead(cachep); 2554 if (cachep->ctor && cachep->flags & SLAB_POISON) { 2555 unsigned long ctor_flags = SLAB_CTOR_CONSTRUCTOR; 2556 2557 if (!(flags & __GFP_WAIT)) 2558 ctor_flags |= SLAB_CTOR_ATOMIC; 2559 2560 cachep->ctor(objp, cachep, ctor_flags); 2561 } 2562 return objp; 2563 } 2564 #else 2565 #define cache_alloc_debugcheck_after(a,b,objp,d) (objp) 2566 #endif 2567 2568 static inline void *____cache_alloc(kmem_cache_t *cachep, gfp_t flags) 2569 { 2570 void *objp; 2571 struct array_cache *ac; 2572 2573 check_irq_off(); 2574 
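	/*
	 * Fast path: pop the most recently freed object off this cpu's
	 * array cache; LIFO order means it is likely still cache-warm.
	 * Fall back to cache_alloc_refill() when the array is empty.
	 */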
ac = ac_data(cachep); 2575 if (likely(ac->avail)) { 2576 STATS_INC_ALLOCHIT(cachep); 2577 ac->touched = 1; 2578 objp = ac->entry[--ac->avail]; 2579 } else { 2580 STATS_INC_ALLOCMISS(cachep); 2581 objp = cache_alloc_refill(cachep, flags); 2582 } 2583 return objp; 2584 } 2585 2586 static inline void *__cache_alloc(kmem_cache_t *cachep, gfp_t flags) 2587 { 2588 unsigned long save_flags; 2589 void *objp; 2590 2591 cache_alloc_debugcheck_before(cachep, flags); 2592 2593 local_irq_save(save_flags); 2594 objp = ____cache_alloc(cachep, flags); 2595 local_irq_restore(save_flags); 2596 objp = cache_alloc_debugcheck_after(cachep, flags, objp, 2597 __builtin_return_address(0)); 2598 prefetchw(objp); 2599 return objp; 2600 } 2601 2602 #ifdef CONFIG_NUMA 2603 /* 2604 * A interface to enable slab creation on nodeid 2605 */ 2606 static void *__cache_alloc_node(kmem_cache_t *cachep, gfp_t flags, int nodeid) 2607 { 2608 struct list_head *entry; 2609 struct slab *slabp; 2610 struct kmem_list3 *l3; 2611 void *obj; 2612 kmem_bufctl_t next; 2613 int x; 2614 2615 l3 = cachep->nodelists[nodeid]; 2616 BUG_ON(!l3); 2617 2618 retry: 2619 spin_lock(&l3->list_lock); 2620 entry = l3->slabs_partial.next; 2621 if (entry == &l3->slabs_partial) { 2622 l3->free_touched = 1; 2623 entry = l3->slabs_free.next; 2624 if (entry == &l3->slabs_free) 2625 goto must_grow; 2626 } 2627 2628 slabp = list_entry(entry, struct slab, list); 2629 check_spinlock_acquired_node(cachep, nodeid); 2630 check_slabp(cachep, slabp); 2631 2632 STATS_INC_NODEALLOCS(cachep); 2633 STATS_INC_ACTIVE(cachep); 2634 STATS_SET_HIGH(cachep); 2635 2636 BUG_ON(slabp->inuse == cachep->num); 2637 2638 /* get obj pointer */ 2639 obj = slabp->s_mem + slabp->free * cachep->objsize; 2640 slabp->inuse++; 2641 next = slab_bufctl(slabp)[slabp->free]; 2642 #if DEBUG 2643 slab_bufctl(slabp)[slabp->free] = BUFCTL_FREE; 2644 #endif 2645 slabp->free = next; 2646 check_slabp(cachep, slabp); 2647 l3->free_objects--; 2648 /* move slabp to correct slabp list: */ 2649 list_del(&slabp->list); 2650 2651 if (slabp->free == BUFCTL_END) { 2652 list_add(&slabp->list, &l3->slabs_full); 2653 } else { 2654 list_add(&slabp->list, &l3->slabs_partial); 2655 } 2656 2657 spin_unlock(&l3->list_lock); 2658 goto done; 2659 2660 must_grow: 2661 spin_unlock(&l3->list_lock); 2662 x = cache_grow(cachep, flags, nodeid); 2663 2664 if (!x) 2665 return NULL; 2666 2667 goto retry; 2668 done: 2669 return obj; 2670 } 2671 #endif 2672 2673 /* 2674 * Caller needs to acquire correct kmem_list's list_lock 2675 */ 2676 static void free_block(kmem_cache_t *cachep, void **objpp, int nr_objects, 2677 int node) 2678 { 2679 int i; 2680 struct kmem_list3 *l3; 2681 2682 for (i = 0; i < nr_objects; i++) { 2683 void *objp = objpp[i]; 2684 struct slab *slabp; 2685 unsigned int objnr; 2686 2687 slabp = page_get_slab(virt_to_page(objp)); 2688 l3 = cachep->nodelists[node]; 2689 list_del(&slabp->list); 2690 objnr = (objp - slabp->s_mem) / cachep->objsize; 2691 check_spinlock_acquired_node(cachep, node); 2692 check_slabp(cachep, slabp); 2693 2694 #if DEBUG 2695 /* Verify that the slab belongs to the intended node */ 2696 WARN_ON(slabp->nodeid != node); 2697 2698 if (slab_bufctl(slabp)[objnr] != BUFCTL_FREE) { 2699 printk(KERN_ERR "slab: double free detected in cache " 2700 "'%s', objp %p\n", cachep->name, objp); 2701 BUG(); 2702 } 2703 #endif 2704 slab_bufctl(slabp)[objnr] = slabp->free; 2705 slabp->free = objnr; 2706 STATS_DEC_ACTIVE(cachep); 2707 slabp->inuse--; 2708 l3->free_objects++; 2709 check_slabp(cachep, slabp); 2710 
2711 /* fixup slab chains */ 2712 if (slabp->inuse == 0) { 2713 if (l3->free_objects > l3->free_limit) { 2714 l3->free_objects -= cachep->num; 2715 slab_destroy(cachep, slabp); 2716 } else { 2717 list_add(&slabp->list, &l3->slabs_free); 2718 } 2719 } else { 2720 /* Unconditionally move a slab to the end of the 2721 * partial list on free - maximum time for the 2722 * other objects to be freed, too. 2723 */ 2724 list_add_tail(&slabp->list, &l3->slabs_partial); 2725 } 2726 } 2727 } 2728 2729 static void cache_flusharray(kmem_cache_t *cachep, struct array_cache *ac) 2730 { 2731 int batchcount; 2732 struct kmem_list3 *l3; 2733 int node = numa_node_id(); 2734 2735 batchcount = ac->batchcount; 2736 #if DEBUG 2737 BUG_ON(!batchcount || batchcount > ac->avail); 2738 #endif 2739 check_irq_off(); 2740 l3 = cachep->nodelists[node]; 2741 spin_lock(&l3->list_lock); 2742 if (l3->shared) { 2743 struct array_cache *shared_array = l3->shared; 2744 int max = shared_array->limit - shared_array->avail; 2745 if (max) { 2746 if (batchcount > max) 2747 batchcount = max; 2748 memcpy(&(shared_array->entry[shared_array->avail]), 2749 ac->entry, sizeof(void *) * batchcount); 2750 shared_array->avail += batchcount; 2751 goto free_done; 2752 } 2753 } 2754 2755 free_block(cachep, ac->entry, batchcount, node); 2756 free_done: 2757 #if STATS 2758 { 2759 int i = 0; 2760 struct list_head *p; 2761 2762 p = l3->slabs_free.next; 2763 while (p != &(l3->slabs_free)) { 2764 struct slab *slabp; 2765 2766 slabp = list_entry(p, struct slab, list); 2767 BUG_ON(slabp->inuse); 2768 2769 i++; 2770 p = p->next; 2771 } 2772 STATS_SET_FREEABLE(cachep, i); 2773 } 2774 #endif 2775 spin_unlock(&l3->list_lock); 2776 ac->avail -= batchcount; 2777 memmove(ac->entry, &(ac->entry[batchcount]), 2778 sizeof(void *) * ac->avail); 2779 } 2780 2781 /* 2782 * __cache_free 2783 * Release an obj back to its cache. If the obj has a constructed 2784 * state, it must be in this state _before_ it is released. 2785 * 2786 * Called with disabled ints. 2787 */ 2788 static inline void __cache_free(kmem_cache_t *cachep, void *objp) 2789 { 2790 struct array_cache *ac = ac_data(cachep); 2791 2792 check_irq_off(); 2793 objp = cache_free_debugcheck(cachep, objp, __builtin_return_address(0)); 2794 2795 /* Make sure we are not freeing a object from another 2796 * node to the array cache on this cpu. 
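 * On NUMA, l3->alien is an array of array_caches on the local
 * node, one for each other node.  An object whose home slab is
 * on node N is parked in alien[N] and pushed back to node N's
 * lists in a batch by __drain_alien_cache(), which keeps the
 * number of remote list_lock acquisitions low.  Without an
 * alien cache the object is returned to its home node directly
 * via free_block().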
2797 */ 2798 #ifdef CONFIG_NUMA 2799 { 2800 struct slab *slabp; 2801 slabp = page_get_slab(virt_to_page(objp)); 2802 if (unlikely(slabp->nodeid != numa_node_id())) { 2803 struct array_cache *alien = NULL; 2804 int nodeid = slabp->nodeid; 2805 struct kmem_list3 *l3 = 2806 cachep->nodelists[numa_node_id()]; 2807 2808 STATS_INC_NODEFREES(cachep); 2809 if (l3->alien && l3->alien[nodeid]) { 2810 alien = l3->alien[nodeid]; 2811 spin_lock(&alien->lock); 2812 if (unlikely(alien->avail == alien->limit)) 2813 __drain_alien_cache(cachep, 2814 alien, nodeid); 2815 alien->entry[alien->avail++] = objp; 2816 spin_unlock(&alien->lock); 2817 } else { 2818 spin_lock(&(cachep->nodelists[nodeid])-> 2819 list_lock); 2820 free_block(cachep, &objp, 1, nodeid); 2821 spin_unlock(&(cachep->nodelists[nodeid])-> 2822 list_lock); 2823 } 2824 return; 2825 } 2826 } 2827 #endif 2828 if (likely(ac->avail < ac->limit)) { 2829 STATS_INC_FREEHIT(cachep); 2830 ac->entry[ac->avail++] = objp; 2831 return; 2832 } else { 2833 STATS_INC_FREEMISS(cachep); 2834 cache_flusharray(cachep, ac); 2835 ac->entry[ac->avail++] = objp; 2836 } 2837 } 2838 2839 /** 2840 * kmem_cache_alloc - Allocate an object 2841 * @cachep: The cache to allocate from. 2842 * @flags: See kmalloc(). 2843 * 2844 * Allocate an object from this cache. The flags are only relevant 2845 * if the cache has no available objects. 2846 */ 2847 void *kmem_cache_alloc(kmem_cache_t *cachep, gfp_t flags) 2848 { 2849 return __cache_alloc(cachep, flags); 2850 } 2851 EXPORT_SYMBOL(kmem_cache_alloc); 2852 2853 /** 2854 * kmem_ptr_validate - check if an untrusted pointer might 2855 * be a slab entry. 2856 * @cachep: the cache we're checking against 2857 * @ptr: pointer to validate 2858 * 2859 * This verifies that the untrusted pointer looks sane: 2860 * it is _not_ a guarantee that the pointer is actually 2861 * part of the slab cache in question, but it at least 2862 * validates that the pointer can be dereferenced and 2863 * looks half-way sane. 2864 * 2865 * Currently only used for dentry validation. 2866 */ 2867 int fastcall kmem_ptr_validate(kmem_cache_t *cachep, void *ptr) 2868 { 2869 unsigned long addr = (unsigned long)ptr; 2870 unsigned long min_addr = PAGE_OFFSET; 2871 unsigned long align_mask = BYTES_PER_WORD - 1; 2872 unsigned long size = cachep->objsize; 2873 struct page *page; 2874 2875 if (unlikely(addr < min_addr)) 2876 goto out; 2877 if (unlikely(addr > (unsigned long)high_memory - size)) 2878 goto out; 2879 if (unlikely(addr & align_mask)) 2880 goto out; 2881 if (unlikely(!kern_addr_valid(addr))) 2882 goto out; 2883 if (unlikely(!kern_addr_valid(addr + size - 1))) 2884 goto out; 2885 page = virt_to_page(ptr); 2886 if (unlikely(!PageSlab(page))) 2887 goto out; 2888 if (unlikely(page_get_cache(page) != cachep)) 2889 goto out; 2890 return 1; 2891 out: 2892 return 0; 2893 } 2894 2895 #ifdef CONFIG_NUMA 2896 /** 2897 * kmem_cache_alloc_node - Allocate an object on the specified node 2898 * @cachep: The cache to allocate from. 2899 * @flags: See kmalloc(). 2900 * @nodeid: node number of the target node. 2901 * 2902 * Identical to kmem_cache_alloc, except that this function is slow 2903 * and can sleep. And it will allocate memory on the given node, which 2904 * can improve the performance for cpu bound structures. 2905 * New and improved: it will now make sure that the object gets 2906 * put on the correct node list so that there is no false sharing. 
2907 */ 2908 void *kmem_cache_alloc_node(kmem_cache_t *cachep, gfp_t flags, int nodeid) 2909 { 2910 unsigned long save_flags; 2911 void *ptr; 2912 2913 if (nodeid == -1) 2914 return __cache_alloc(cachep, flags); 2915 2916 if (unlikely(!cachep->nodelists[nodeid])) { 2917 /* Fall back to __cache_alloc if we run into trouble */ 2918 printk(KERN_WARNING 2919 "slab: not allocating in inactive node %d for cache %s\n", 2920 nodeid, cachep->name); 2921 return __cache_alloc(cachep, flags); 2922 } 2923 2924 cache_alloc_debugcheck_before(cachep, flags); 2925 local_irq_save(save_flags); 2926 if (nodeid == numa_node_id()) 2927 ptr = ____cache_alloc(cachep, flags); 2928 else 2929 ptr = __cache_alloc_node(cachep, flags, nodeid); 2930 local_irq_restore(save_flags); 2931 ptr = 2932 cache_alloc_debugcheck_after(cachep, flags, ptr, 2933 __builtin_return_address(0)); 2934 2935 return ptr; 2936 } 2937 EXPORT_SYMBOL(kmem_cache_alloc_node); 2938 2939 void *kmalloc_node(size_t size, gfp_t flags, int node) 2940 { 2941 kmem_cache_t *cachep; 2942 2943 cachep = kmem_find_general_cachep(size, flags); 2944 if (unlikely(cachep == NULL)) 2945 return NULL; 2946 return kmem_cache_alloc_node(cachep, flags, node); 2947 } 2948 EXPORT_SYMBOL(kmalloc_node); 2949 #endif 2950 2951 /** 2952 * kmalloc - allocate memory 2953 * @size: how many bytes of memory are required. 2954 * @flags: the type of memory to allocate. 2955 * 2956 * kmalloc is the normal method of allocating memory 2957 * in the kernel. 2958 * 2959 * The @flags argument may be one of: 2960 * 2961 * %GFP_USER - Allocate memory on behalf of user. May sleep. 2962 * 2963 * %GFP_KERNEL - Allocate normal kernel ram. May sleep. 2964 * 2965 * %GFP_ATOMIC - Allocation will not sleep. Use inside interrupt handlers. 2966 * 2967 * Additionally, the %GFP_DMA flag may be set to indicate the memory 2968 * must be suitable for DMA. This can mean different things on different 2969 * platforms. For example, on i386, it means that the memory must come 2970 * from the first 16MB. 2971 */ 2972 void *__kmalloc(size_t size, gfp_t flags) 2973 { 2974 kmem_cache_t *cachep; 2975 2976 /* If you want to save a few bytes .text space: replace 2977 * __ with kmem_. 2978 * Then kmalloc uses the uninlined functions instead of the inline 2979 * functions. 2980 */ 2981 cachep = __find_general_cachep(size, flags); 2982 if (unlikely(cachep == NULL)) 2983 return NULL; 2984 return __cache_alloc(cachep, flags); 2985 } 2986 EXPORT_SYMBOL(__kmalloc); 2987 2988 #ifdef CONFIG_SMP 2989 /** 2990 * __alloc_percpu - allocate one copy of the object for every present 2991 * cpu in the system, zeroing them. 2992 * Objects should be dereferenced using the per_cpu_ptr macro only. 2993 * 2994 * @size: how many bytes of memory are required. 2995 */ 2996 void *__alloc_percpu(size_t size) 2997 { 2998 int i; 2999 struct percpu_data *pdata = kmalloc(sizeof(*pdata), GFP_KERNEL); 3000 3001 if (!pdata) 3002 return NULL; 3003 3004 /* 3005 * Cannot use for_each_online_cpu since a cpu may come online 3006 * and we have no way of figuring out how to fix the array 3007 * that we have allocated then.... 
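 * So a copy is allocated for every possible cpu, node-locally
 * whenever the cpu's node is already online.  The value handed
 * back to the caller is the bitwise complement of the real
 * pdata pointer, which forces users to go through the
 * per_cpu_ptr() wrapper (see the "Catch derefs w/o wrappers"
 * note below).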
3008 */ 3009 for_each_cpu(i) { 3010 int node = cpu_to_node(i); 3011 3012 if (node_online(node)) 3013 pdata->ptrs[i] = kmalloc_node(size, GFP_KERNEL, node); 3014 else 3015 pdata->ptrs[i] = kmalloc(size, GFP_KERNEL); 3016 3017 if (!pdata->ptrs[i]) 3018 goto unwind_oom; 3019 memset(pdata->ptrs[i], 0, size); 3020 } 3021 3022 /* Catch derefs w/o wrappers */ 3023 return (void *)(~(unsigned long)pdata); 3024 3025 unwind_oom: 3026 while (--i >= 0) { 3027 if (!cpu_possible(i)) 3028 continue; 3029 kfree(pdata->ptrs[i]); 3030 } 3031 kfree(pdata); 3032 return NULL; 3033 } 3034 EXPORT_SYMBOL(__alloc_percpu); 3035 #endif 3036 3037 /** 3038 * kmem_cache_free - Deallocate an object 3039 * @cachep: The cache the allocation was from. 3040 * @objp: The previously allocated object. 3041 * 3042 * Free an object which was previously allocated from this 3043 * cache. 3044 */ 3045 void kmem_cache_free(kmem_cache_t *cachep, void *objp) 3046 { 3047 unsigned long flags; 3048 3049 local_irq_save(flags); 3050 __cache_free(cachep, objp); 3051 local_irq_restore(flags); 3052 } 3053 EXPORT_SYMBOL(kmem_cache_free); 3054 3055 /** 3056 * kfree - free previously allocated memory 3057 * @objp: pointer returned by kmalloc. 3058 * 3059 * If @objp is NULL, no operation is performed. 3060 * 3061 * Don't free memory not originally allocated by kmalloc() 3062 * or you will run into trouble. 3063 */ 3064 void kfree(const void *objp) 3065 { 3066 kmem_cache_t *c; 3067 unsigned long flags; 3068 3069 if (unlikely(!objp)) 3070 return; 3071 local_irq_save(flags); 3072 kfree_debugcheck(objp); 3073 c = page_get_cache(virt_to_page(objp)); 3074 mutex_debug_check_no_locks_freed(objp, objp+obj_reallen(c)); 3075 __cache_free(c, (void *)objp); 3076 local_irq_restore(flags); 3077 } 3078 EXPORT_SYMBOL(kfree); 3079 3080 #ifdef CONFIG_SMP 3081 /** 3082 * free_percpu - free previously allocated percpu memory 3083 * @objp: pointer returned by alloc_percpu. 3084 * 3085 * Don't free memory not originally allocated by alloc_percpu() 3086 * The complemented objp is to check for that. 3087 */ 3088 void free_percpu(const void *objp) 3089 { 3090 int i; 3091 struct percpu_data *p = (struct percpu_data *)(~(unsigned long)objp); 3092 3093 /* 3094 * We allocate for all cpus so we cannot use for online cpu here. 3095 */ 3096 for_each_cpu(i) 3097 kfree(p->ptrs[i]); 3098 kfree(p); 3099 } 3100 EXPORT_SYMBOL(free_percpu); 3101 #endif 3102 3103 unsigned int kmem_cache_size(kmem_cache_t *cachep) 3104 { 3105 return obj_reallen(cachep); 3106 } 3107 EXPORT_SYMBOL(kmem_cache_size); 3108 3109 const char *kmem_cache_name(kmem_cache_t *cachep) 3110 { 3111 return cachep->name; 3112 } 3113 EXPORT_SYMBOL_GPL(kmem_cache_name); 3114 3115 /* 3116 * This initializes kmem_list3 for all nodes. 
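 * For each online node this (re)creates the shared array cache
 * and the alien caches, and allocates the kmem_list3 itself if
 * the node does not have one yet.  The per-node free_limit is
 * derived from the tunables as
 *
 *	free_limit = (1 + nr_cpus_node(node)) * batchcount + num
 *
 * so a node can hold roughly one batch per local cpu plus one
 * slab's worth of objects on its free list before free_block()
 * starts destroying completely free slabs.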
3117 */ 3118 static int alloc_kmemlist(kmem_cache_t *cachep) 3119 { 3120 int node; 3121 struct kmem_list3 *l3; 3122 int err = 0; 3123 3124 for_each_online_node(node) { 3125 struct array_cache *nc = NULL, *new; 3126 struct array_cache **new_alien = NULL; 3127 #ifdef CONFIG_NUMA 3128 if (!(new_alien = alloc_alien_cache(node, cachep->limit))) 3129 goto fail; 3130 #endif 3131 if (!(new = alloc_arraycache(node, (cachep->shared * 3132 cachep->batchcount), 3133 0xbaadf00d))) 3134 goto fail; 3135 if ((l3 = cachep->nodelists[node])) { 3136 3137 spin_lock_irq(&l3->list_lock); 3138 3139 if ((nc = cachep->nodelists[node]->shared)) 3140 free_block(cachep, nc->entry, nc->avail, node); 3141 3142 l3->shared = new; 3143 if (!cachep->nodelists[node]->alien) { 3144 l3->alien = new_alien; 3145 new_alien = NULL; 3146 } 3147 l3->free_limit = (1 + nr_cpus_node(node)) * 3148 cachep->batchcount + cachep->num; 3149 spin_unlock_irq(&l3->list_lock); 3150 kfree(nc); 3151 free_alien_cache(new_alien); 3152 continue; 3153 } 3154 if (!(l3 = kmalloc_node(sizeof(struct kmem_list3), 3155 GFP_KERNEL, node))) 3156 goto fail; 3157 3158 kmem_list3_init(l3); 3159 l3->next_reap = jiffies + REAPTIMEOUT_LIST3 + 3160 ((unsigned long)cachep) % REAPTIMEOUT_LIST3; 3161 l3->shared = new; 3162 l3->alien = new_alien; 3163 l3->free_limit = (1 + nr_cpus_node(node)) * 3164 cachep->batchcount + cachep->num; 3165 cachep->nodelists[node] = l3; 3166 } 3167 return err; 3168 fail: 3169 err = -ENOMEM; 3170 return err; 3171 } 3172 3173 struct ccupdate_struct { 3174 kmem_cache_t *cachep; 3175 struct array_cache *new[NR_CPUS]; 3176 }; 3177 3178 static void do_ccupdate_local(void *info) 3179 { 3180 struct ccupdate_struct *new = (struct ccupdate_struct *)info; 3181 struct array_cache *old; 3182 3183 check_irq_off(); 3184 old = ac_data(new->cachep); 3185 3186 new->cachep->array[smp_processor_id()] = new->new[smp_processor_id()]; 3187 new->new[smp_processor_id()] = old; 3188 } 3189 3190 static int do_tune_cpucache(kmem_cache_t *cachep, int limit, int batchcount, 3191 int shared) 3192 { 3193 struct ccupdate_struct new; 3194 int i, err; 3195 3196 memset(&new.new, 0, sizeof(new.new)); 3197 for_each_online_cpu(i) { 3198 new.new[i] = 3199 alloc_arraycache(cpu_to_node(i), limit, batchcount); 3200 if (!new.new[i]) { 3201 for (i--; i >= 0; i--) 3202 kfree(new.new[i]); 3203 return -ENOMEM; 3204 } 3205 } 3206 new.cachep = cachep; 3207 3208 smp_call_function_all_cpus(do_ccupdate_local, (void *)&new); 3209 3210 check_irq_on(); 3211 spin_lock_irq(&cachep->spinlock); 3212 cachep->batchcount = batchcount; 3213 cachep->limit = limit; 3214 cachep->shared = shared; 3215 spin_unlock_irq(&cachep->spinlock); 3216 3217 for_each_online_cpu(i) { 3218 struct array_cache *ccold = new.new[i]; 3219 if (!ccold) 3220 continue; 3221 spin_lock_irq(&cachep->nodelists[cpu_to_node(i)]->list_lock); 3222 free_block(cachep, ccold->entry, ccold->avail, cpu_to_node(i)); 3223 spin_unlock_irq(&cachep->nodelists[cpu_to_node(i)]->list_lock); 3224 kfree(ccold); 3225 } 3226 3227 err = alloc_kmemlist(cachep); 3228 if (err) { 3229 printk(KERN_ERR "alloc_kmemlist failed for %s, error %d.\n", 3230 cachep->name, -err); 3231 BUG(); 3232 } 3233 return 0; 3234 } 3235 3236 static void enable_cpucache(kmem_cache_t *cachep) 3237 { 3238 int err; 3239 int limit, shared; 3240 3241 /* The head array serves three purposes: 3242 * - create a LIFO ordering, i.e. return objects that are cache-warm 3243 * - reduce the number of spinlock operations. 
3244 * - reduce the number of linked list operations on the slab and 3245 * bufctl chains: array operations are cheaper. 3246 * The numbers are guessed, we should auto-tune as described by 3247 * Bonwick. 3248 */ 3249 if (cachep->objsize > 131072) 3250 limit = 1; 3251 else if (cachep->objsize > PAGE_SIZE) 3252 limit = 8; 3253 else if (cachep->objsize > 1024) 3254 limit = 24; 3255 else if (cachep->objsize > 256) 3256 limit = 54; 3257 else 3258 limit = 120; 3259 3260 /* Cpu bound tasks (e.g. network routing) can exhibit cpu bound 3261 * allocation behaviour: Most allocs on one cpu, most free operations 3262 * on another cpu. For these cases, an efficient object passing between 3263 * cpus is necessary. This is provided by a shared array. The array 3264 * replaces Bonwick's magazine layer. 3265 * On uniprocessor, it's functionally equivalent (but less efficient) 3266 * to a larger limit. Thus disabled by default. 3267 */ 3268 shared = 0; 3269 #ifdef CONFIG_SMP 3270 if (cachep->objsize <= PAGE_SIZE) 3271 shared = 8; 3272 #endif 3273 3274 #if DEBUG 3275 /* With debugging enabled, large batchcount lead to excessively 3276 * long periods with disabled local interrupts. Limit the 3277 * batchcount 3278 */ 3279 if (limit > 32) 3280 limit = 32; 3281 #endif 3282 err = do_tune_cpucache(cachep, limit, (limit + 1) / 2, shared); 3283 if (err) 3284 printk(KERN_ERR "enable_cpucache failed for %s, error %d.\n", 3285 cachep->name, -err); 3286 } 3287 3288 static void drain_array_locked(kmem_cache_t *cachep, struct array_cache *ac, 3289 int force, int node) 3290 { 3291 int tofree; 3292 3293 check_spinlock_acquired_node(cachep, node); 3294 if (ac->touched && !force) { 3295 ac->touched = 0; 3296 } else if (ac->avail) { 3297 tofree = force ? ac->avail : (ac->limit + 4) / 5; 3298 if (tofree > ac->avail) { 3299 tofree = (ac->avail + 1) / 2; 3300 } 3301 free_block(cachep, ac->entry, tofree, node); 3302 ac->avail -= tofree; 3303 memmove(ac->entry, &(ac->entry[tofree]), 3304 sizeof(void *) * ac->avail); 3305 } 3306 } 3307 3308 /** 3309 * cache_reap - Reclaim memory from caches. 3310 * @unused: unused parameter 3311 * 3312 * Called from workqueue/eventd every few seconds. 3313 * Purpose: 3314 * - clear the per-cpu caches for this CPU. 3315 * - return freeable pages to the main free memory pool. 3316 * 3317 * If we cannot acquire the cache chain semaphore then just give up - we'll 3318 * try again on the next iteration. 3319 */ 3320 static void cache_reap(void *unused) 3321 { 3322 struct list_head *walk; 3323 struct kmem_list3 *l3; 3324 3325 if (down_trylock(&cache_chain_sem)) { 3326 /* Give up. Setup the next iteration. 
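 * The reap timer is per-cpu delayed work, so rescheduling here
 * only postpones this cpu's pass by REAPTIMEOUT_CPUC; nothing
 * is lost if cache_chain_sem happens to be busy.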
*/ 3327 schedule_delayed_work(&__get_cpu_var(reap_work), 3328 REAPTIMEOUT_CPUC); 3329 return; 3330 } 3331 3332 list_for_each(walk, &cache_chain) { 3333 kmem_cache_t *searchp; 3334 struct list_head *p; 3335 int tofree; 3336 struct slab *slabp; 3337 3338 searchp = list_entry(walk, kmem_cache_t, next); 3339 3340 if (searchp->flags & SLAB_NO_REAP) 3341 goto next; 3342 3343 check_irq_on(); 3344 3345 l3 = searchp->nodelists[numa_node_id()]; 3346 if (l3->alien) 3347 drain_alien_cache(searchp, l3); 3348 spin_lock_irq(&l3->list_lock); 3349 3350 drain_array_locked(searchp, ac_data(searchp), 0, 3351 numa_node_id()); 3352 3353 if (time_after(l3->next_reap, jiffies)) 3354 goto next_unlock; 3355 3356 l3->next_reap = jiffies + REAPTIMEOUT_LIST3; 3357 3358 if (l3->shared) 3359 drain_array_locked(searchp, l3->shared, 0, 3360 numa_node_id()); 3361 3362 if (l3->free_touched) { 3363 l3->free_touched = 0; 3364 goto next_unlock; 3365 } 3366 3367 tofree = 3368 (l3->free_limit + 5 * searchp->num - 3369 1) / (5 * searchp->num); 3370 do { 3371 p = l3->slabs_free.next; 3372 if (p == &(l3->slabs_free)) 3373 break; 3374 3375 slabp = list_entry(p, struct slab, list); 3376 BUG_ON(slabp->inuse); 3377 list_del(&slabp->list); 3378 STATS_INC_REAPED(searchp); 3379 3380 /* Safe to drop the lock. The slab is no longer 3381 * linked to the cache. 3382 * searchp cannot disappear, we hold 3383 * cache_chain_lock 3384 */ 3385 l3->free_objects -= searchp->num; 3386 spin_unlock_irq(&l3->list_lock); 3387 slab_destroy(searchp, slabp); 3388 spin_lock_irq(&l3->list_lock); 3389 } while (--tofree > 0); 3390 next_unlock: 3391 spin_unlock_irq(&l3->list_lock); 3392 next: 3393 cond_resched(); 3394 } 3395 check_irq_on(); 3396 up(&cache_chain_sem); 3397 drain_remote_pages(); 3398 /* Setup the next iteration */ 3399 schedule_delayed_work(&__get_cpu_var(reap_work), REAPTIMEOUT_CPUC); 3400 } 3401 3402 #ifdef CONFIG_PROC_FS 3403 3404 static void print_slabinfo_header(struct seq_file *m) 3405 { 3406 /* 3407 * Output format version, so at least we can change it 3408 * without _too_ many complaints. 3409 */ 3410 #if STATS 3411 seq_puts(m, "slabinfo - version: 2.1 (statistics)\n"); 3412 #else 3413 seq_puts(m, "slabinfo - version: 2.1\n"); 3414 #endif 3415 seq_puts(m, "# name <active_objs> <num_objs> <objsize> " 3416 "<objperslab> <pagesperslab>"); 3417 seq_puts(m, " : tunables <limit> <batchcount> <sharedfactor>"); 3418 seq_puts(m, " : slabdata <active_slabs> <num_slabs> <sharedavail>"); 3419 #if STATS 3420 seq_puts(m, " : globalstat <listallocs> <maxobjs> <grown> <reaped> " 3421 "<error> <maxfreeable> <nodeallocs> <remotefrees>"); 3422 seq_puts(m, " : cpustat <allochit> <allocmiss> <freehit> <freemiss>"); 3423 #endif 3424 seq_putc(m, '\n'); 3425 } 3426 3427 static void *s_start(struct seq_file *m, loff_t *pos) 3428 { 3429 loff_t n = *pos; 3430 struct list_head *p; 3431 3432 down(&cache_chain_sem); 3433 if (!n) 3434 print_slabinfo_header(m); 3435 p = cache_chain.next; 3436 while (n--) { 3437 p = p->next; 3438 if (p == &cache_chain) 3439 return NULL; 3440 } 3441 return list_entry(p, kmem_cache_t, next); 3442 } 3443 3444 static void *s_next(struct seq_file *m, void *p, loff_t *pos) 3445 { 3446 kmem_cache_t *cachep = p; 3447 ++*pos; 3448 return cachep->next.next == &cache_chain ? 
NULL 3449 : list_entry(cachep->next.next, kmem_cache_t, next); 3450 } 3451 3452 static void s_stop(struct seq_file *m, void *p) 3453 { 3454 up(&cache_chain_sem); 3455 } 3456 3457 static int s_show(struct seq_file *m, void *p) 3458 { 3459 kmem_cache_t *cachep = p; 3460 struct list_head *q; 3461 struct slab *slabp; 3462 unsigned long active_objs; 3463 unsigned long num_objs; 3464 unsigned long active_slabs = 0; 3465 unsigned long num_slabs, free_objects = 0, shared_avail = 0; 3466 const char *name; 3467 char *error = NULL; 3468 int node; 3469 struct kmem_list3 *l3; 3470 3471 check_irq_on(); 3472 spin_lock_irq(&cachep->spinlock); 3473 active_objs = 0; 3474 num_slabs = 0; 3475 for_each_online_node(node) { 3476 l3 = cachep->nodelists[node]; 3477 if (!l3) 3478 continue; 3479 3480 spin_lock(&l3->list_lock); 3481 3482 list_for_each(q, &l3->slabs_full) { 3483 slabp = list_entry(q, struct slab, list); 3484 if (slabp->inuse != cachep->num && !error) 3485 error = "slabs_full accounting error"; 3486 active_objs += cachep->num; 3487 active_slabs++; 3488 } 3489 list_for_each(q, &l3->slabs_partial) { 3490 slabp = list_entry(q, struct slab, list); 3491 if (slabp->inuse == cachep->num && !error) 3492 error = "slabs_partial inuse accounting error"; 3493 if (!slabp->inuse && !error) 3494 error = "slabs_partial/inuse accounting error"; 3495 active_objs += slabp->inuse; 3496 active_slabs++; 3497 } 3498 list_for_each(q, &l3->slabs_free) { 3499 slabp = list_entry(q, struct slab, list); 3500 if (slabp->inuse && !error) 3501 error = "slabs_free/inuse accounting error"; 3502 num_slabs++; 3503 } 3504 free_objects += l3->free_objects; 3505 shared_avail += l3->shared->avail; 3506 3507 spin_unlock(&l3->list_lock); 3508 } 3509 num_slabs += active_slabs; 3510 num_objs = num_slabs * cachep->num; 3511 if (num_objs - active_objs != free_objects && !error) 3512 error = "free_objects accounting error"; 3513 3514 name = cachep->name; 3515 if (error) 3516 printk(KERN_ERR "slab: cache %s error: %s\n", name, error); 3517 3518 seq_printf(m, "%-17s %6lu %6lu %6u %4u %4d", 3519 name, active_objs, num_objs, cachep->objsize, 3520 cachep->num, (1 << cachep->gfporder)); 3521 seq_printf(m, " : tunables %4u %4u %4u", 3522 cachep->limit, cachep->batchcount, cachep->shared); 3523 seq_printf(m, " : slabdata %6lu %6lu %6lu", 3524 active_slabs, num_slabs, shared_avail); 3525 #if STATS 3526 { /* list3 stats */ 3527 unsigned long high = cachep->high_mark; 3528 unsigned long allocs = cachep->num_allocations; 3529 unsigned long grown = cachep->grown; 3530 unsigned long reaped = cachep->reaped; 3531 unsigned long errors = cachep->errors; 3532 unsigned long max_freeable = cachep->max_freeable; 3533 unsigned long node_allocs = cachep->node_allocs; 3534 unsigned long node_frees = cachep->node_frees; 3535 3536 seq_printf(m, " : globalstat %7lu %6lu %5lu %4lu \ 3537 %4lu %4lu %4lu %4lu", allocs, high, grown, reaped, errors, max_freeable, node_allocs, node_frees); 3538 } 3539 /* cpu stats */ 3540 { 3541 unsigned long allochit = atomic_read(&cachep->allochit); 3542 unsigned long allocmiss = atomic_read(&cachep->allocmiss); 3543 unsigned long freehit = atomic_read(&cachep->freehit); 3544 unsigned long freemiss = atomic_read(&cachep->freemiss); 3545 3546 seq_printf(m, " : cpustat %6lu %6lu %6lu %6lu", 3547 allochit, allocmiss, freehit, freemiss); 3548 } 3549 #endif 3550 seq_putc(m, '\n'); 3551 spin_unlock_irq(&cachep->spinlock); 3552 return 0; 3553 } 3554 3555 /* 3556 * slabinfo_op - iterator that generates /proc/slabinfo 3557 * 3558 * Output layout: 3559 * 
cache-name 3560 * num-active-objs 3561 * total-objs 3562 * object size 3563 * num-active-slabs 3564 * total-slabs 3565 * num-pages-per-slab 3566 * + further values on SMP and with statistics enabled 3567 */ 3568 3569 struct seq_operations slabinfo_op = { 3570 .start = s_start, 3571 .next = s_next, 3572 .stop = s_stop, 3573 .show = s_show, 3574 }; 3575 3576 #define MAX_SLABINFO_WRITE 128 3577 /** 3578 * slabinfo_write - Tuning for the slab allocator 3579 * @file: unused 3580 * @buffer: user buffer 3581 * @count: data length 3582 * @ppos: unused 3583 */ 3584 ssize_t slabinfo_write(struct file *file, const char __user * buffer, 3585 size_t count, loff_t *ppos) 3586 { 3587 char kbuf[MAX_SLABINFO_WRITE + 1], *tmp; 3588 int limit, batchcount, shared, res; 3589 struct list_head *p; 3590 3591 if (count > MAX_SLABINFO_WRITE) 3592 return -EINVAL; 3593 if (copy_from_user(&kbuf, buffer, count)) 3594 return -EFAULT; 3595 kbuf[MAX_SLABINFO_WRITE] = '\0'; 3596 3597 tmp = strchr(kbuf, ' '); 3598 if (!tmp) 3599 return -EINVAL; 3600 *tmp = '\0'; 3601 tmp++; 3602 if (sscanf(tmp, " %d %d %d", &limit, &batchcount, &shared) != 3) 3603 return -EINVAL; 3604 3605 /* Find the cache in the chain of caches. */ 3606 down(&cache_chain_sem); 3607 res = -EINVAL; 3608 list_for_each(p, &cache_chain) { 3609 kmem_cache_t *cachep = list_entry(p, kmem_cache_t, next); 3610 3611 if (!strcmp(cachep->name, kbuf)) { 3612 if (limit < 1 || 3613 batchcount < 1 || 3614 batchcount > limit || shared < 0) { 3615 res = 0; 3616 } else { 3617 res = do_tune_cpucache(cachep, limit, 3618 batchcount, shared); 3619 } 3620 break; 3621 } 3622 } 3623 up(&cache_chain_sem); 3624 if (res >= 0) 3625 res = count; 3626 return res; 3627 } 3628 #endif 3629 3630 /** 3631 * ksize - get the actual amount of memory allocated for a given object 3632 * @objp: Pointer to the object 3633 * 3634 * kmalloc may internally round up allocations and return more memory 3635 * than requested. ksize() can be used to determine the actual amount of 3636 * memory allocated. The caller may use this additional memory, even though 3637 * a smaller amount of memory was initially specified with the kmalloc call. 3638 * The caller must guarantee that objp points to a valid object previously 3639 * allocated with either kmalloc() or kmem_cache_alloc(). The object 3640 * must not be freed during the duration of the call. 3641 */ 3642 unsigned int ksize(const void *objp) 3643 { 3644 if (unlikely(objp == NULL)) 3645 return 0; 3646 3647 return obj_reallen(page_get_cache(virt_to_page(objp))); 3648 } 3649
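/*
 * Tuning example (illustrative only, not part of the kernel
 * build): slabinfo_write() above parses a line of the form
 * "<cache name> <limit> <batchcount> <shared>", so the per-cpu
 * array sizes of e.g. the dentry cache could be adjusted at run
 * time with
 *
 *	echo "dentry_cache 128 64 8" > /proc/slabinfo
 *
 * Out-of-range values (limit < 1, batchcount < 1 or larger than
 * limit, shared < 0) are accepted but ignored; valid ones are
 * applied by do_tune_cpucache().
 */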