/*
 * linux/mm/slab.c
 * Written by Mark Hemment, 1996/97.
 * (markhe@nextd.demon.co.uk)
 *
 * kmem_cache_destroy() + some cleanup - 1999 Andrea Arcangeli
 *
 * Major cleanup, different bufctl logic, per-cpu arrays
 *	(c) 2000 Manfred Spraul
 *
 * Cleanup, make the head arrays unconditional, preparation for NUMA
 *	(c) 2002 Manfred Spraul
 *
 * An implementation of the Slab Allocator as described in outline in;
 *	UNIX Internals: The New Frontiers by Uresh Vahalia
 *	Pub: Prentice Hall	ISBN 0-13-101908-2
 * or with a little more detail in;
 *	The Slab Allocator: An Object-Caching Kernel Memory Allocator
 *	Jeff Bonwick (Sun Microsystems).
 *	Presented at: USENIX Summer 1994 Technical Conference
 *
 * The memory is organized in caches, one cache for each object type.
 * (e.g. inode_cache, dentry_cache, buffer_head, vm_area_struct)
 * Each cache consists of many slabs (they are small (usually one
 * page long) and always contiguous), and each slab contains multiple
 * initialized objects.
 *
 * This means that your constructor is used only for newly allocated
 * slabs and you must pass objects with the same initializations to
 * kmem_cache_free.
 *
 * Each cache can only support one memory type (GFP_DMA, GFP_HIGHMEM,
 * normal). If you need a special memory type, then you must create a new
 * cache for that memory type.
 *
 * In order to reduce fragmentation, the slabs are sorted in 3 groups:
 *   full slabs with 0 free objects
 *   partial slabs
 *   empty slabs with no allocated objects
 *
 * If partial slabs exist, then new allocations come from these slabs,
 * otherwise from empty slabs or new slabs are allocated.
 *
 * kmem_cache_destroy() CAN CRASH if you try to allocate from the cache
 * during kmem_cache_destroy(). The caller must prevent concurrent allocs.
 *
 * Each cache has a short per-cpu head array, most allocs
 * and frees go into that array, and if that array overflows, then 1/2
 * of the entries in the array are given back into the global cache.
 * The head array is strictly LIFO and should improve the cache hit rates.
 * On SMP, it additionally reduces the spinlock operations.
 *
 * The c_cpuarray may not be read with enabled local interrupts -
 * it's changed with a smp_call_function().
 *
 * SMP synchronization:
 *  constructors and destructors are called without any locking.
 *  Several members in kmem_cache_t and struct slab never change, they
 *	are accessed without any locking.
 *  The per-cpu arrays are never accessed from the wrong cpu, no locking,
 *	and local interrupts are disabled so slab code is preempt-safe.
 *  The non-constant members are protected with a per-cache irq spinlock.
 *
 * Many thanks to Mark Hemment, who wrote another per-cpu slab patch
 * in 2000 - many ideas in the current implementation are derived from
 * his patch.
 *
 * Further notes from the original documentation:
 *
 * 11 April '97.  Started multi-threading - markhe
 *	The global cache-chain is protected by the semaphore 'cache_chain_sem'.
 *	The sem is only needed when accessing/extending the cache-chain, which
 *	can never happen inside an interrupt (kmem_cache_create(),
 *	kmem_cache_shrink() and kmem_cache_reap()).
 *
 *	At present, each engine can be growing a cache.  This should be blocked.
 *
 * 15 March 2005. NUMA slab allocator.
 *	Shai Fultheim <shai@scalex86.org>.
 *	Shobhit Dayal <shobhit@calsoftinc.com>
 *	Alok N Kataria <alokk@calsoftinc.com>
 *	Christoph Lameter <christoph@lameter.com>
 *
 * Modified the slab allocator to be node aware on NUMA systems.
 * Each node has its own list of partial, free and full slabs.
 * All object allocations for a node occur from node specific slab lists.
 */
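
/*
 * Example usage of the object-cache API described above.  This is an
 * illustrative sketch only (not part of this file); "struct my_obj",
 * "my_ctor" and "my_cache" are hypothetical names.
 *
 *	struct my_obj {
 *		spinlock_t lock;
 *		int refcnt;
 *	};
 *
 *	static void my_ctor(void *p, kmem_cache_t *cachep, unsigned long flags)
 *	{
 *		struct my_obj *obj = p;
 *
 *		spin_lock_init(&obj->lock);
 *		obj->refcnt = 0;
 *	}
 *
 *	my_cache = kmem_cache_create("my_cache", sizeof(struct my_obj), 0,
 *				     SLAB_HWCACHE_ALIGN, my_ctor, NULL);
 *	obj = kmem_cache_alloc(my_cache, SLAB_KERNEL);
 *	...
 *	kmem_cache_free(my_cache, obj);
 *	kmem_cache_destroy(my_cache);
 *
 * Objects must be returned to kmem_cache_free() with the same
 * initializations the constructor established, and the cache must be
 * empty (and no longer used) before kmem_cache_destroy() is called.
 */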

#include	<linux/config.h>
#include	<linux/slab.h>
#include	<linux/mm.h>
#include	<linux/swap.h>
#include	<linux/cache.h>
#include	<linux/interrupt.h>
#include	<linux/init.h>
#include	<linux/compiler.h>
#include	<linux/seq_file.h>
#include	<linux/notifier.h>
#include	<linux/kallsyms.h>
#include	<linux/cpu.h>
#include	<linux/sysctl.h>
#include	<linux/module.h>
#include	<linux/rcupdate.h>
#include	<linux/string.h>
#include	<linux/nodemask.h>

#include	<asm/uaccess.h>
#include	<asm/cacheflush.h>
#include	<asm/tlbflush.h>
#include	<asm/page.h>

/*
 * DEBUG	- 1 for kmem_cache_create() to honour; SLAB_DEBUG_INITIAL,
 *		  SLAB_RED_ZONE & SLAB_POISON.
 *		  0 for faster, smaller code (especially in the critical paths).
 *
 * STATS	- 1 to collect stats for /proc/slabinfo.
 *		  0 for faster, smaller code (especially in the critical paths).
 *
 * FORCED_DEBUG	- 1 enables SLAB_RED_ZONE and SLAB_POISON (if possible)
 */

#ifdef CONFIG_DEBUG_SLAB
#define	DEBUG		1
#define	STATS		1
#define	FORCED_DEBUG	1
#else
#define	DEBUG		0
#define	STATS		0
#define	FORCED_DEBUG	0
#endif


/* Shouldn't this be in a header file somewhere? */
#define	BYTES_PER_WORD		sizeof(void *)

#ifndef cache_line_size
#define cache_line_size()	L1_CACHE_BYTES
#endif

#ifndef ARCH_KMALLOC_MINALIGN
/*
 * Enforce a minimum alignment for the kmalloc caches.
 * Usually, the kmalloc caches are cache_line_size() aligned, except when
 * DEBUG and FORCED_DEBUG are enabled, then they are BYTES_PER_WORD aligned.
 * Some archs want to perform DMA into kmalloc caches and need a guaranteed
 * alignment larger than BYTES_PER_WORD. ARCH_KMALLOC_MINALIGN allows that.
 * Note that this flag disables some debug features.
 */
#define ARCH_KMALLOC_MINALIGN 0
#endif

#ifndef ARCH_SLAB_MINALIGN
/*
 * Enforce a minimum alignment for all caches.
 * Intended for archs that get misalignment faults even for BYTES_PER_WORD
 * aligned buffers. Includes ARCH_KMALLOC_MINALIGN.
 * If possible: Do not enable this flag for CONFIG_DEBUG_SLAB, it disables
 * some debug features.
 */
#define ARCH_SLAB_MINALIGN 0
#endif

#ifndef ARCH_KMALLOC_FLAGS
#define ARCH_KMALLOC_FLAGS SLAB_HWCACHE_ALIGN
#endif

/* Legal flag mask for kmem_cache_create(). */
#if DEBUG
# define CREATE_MASK	(SLAB_DEBUG_INITIAL | SLAB_RED_ZONE | \
			 SLAB_POISON | SLAB_HWCACHE_ALIGN | \
			 SLAB_NO_REAP | SLAB_CACHE_DMA | \
			 SLAB_MUST_HWCACHE_ALIGN | SLAB_STORE_USER | \
			 SLAB_RECLAIM_ACCOUNT | SLAB_PANIC | \
			 SLAB_DESTROY_BY_RCU)
#else
# define CREATE_MASK	(SLAB_HWCACHE_ALIGN | SLAB_NO_REAP | \
			 SLAB_CACHE_DMA | SLAB_MUST_HWCACHE_ALIGN | \
			 SLAB_RECLAIM_ACCOUNT | SLAB_PANIC | \
			 SLAB_DESTROY_BY_RCU)
#endif

/*
 * kmem_bufctl_t:
 *
 * Bufctl's are used for linking objs within a slab via linked offsets.
 *
 * This implementation relies on "struct page" for locating the cache &
 * slab an object belongs to.
 * This allows the bufctl structure to be small (one int), but limits
 * the number of objects a slab (not a cache) can contain when off-slab
 * bufctls are used. The limit is the size of the largest general cache
 * that does not use off-slab slabs.
 * For 32bit archs with 4 kB pages, this is 56.
 * This is not serious, as it is only for large objects, when it is unwise
 * to have too many per slab.
 * Note: This limit can be raised by introducing a general cache whose size
 * is less than 512 (PAGE_SIZE<<3), but greater than 256.
 */

typedef unsigned int kmem_bufctl_t;
#define BUFCTL_END	(((kmem_bufctl_t)(~0U))-0)
#define	BUFCTL_FREE	(((kmem_bufctl_t)(~0U))-1)
#define	SLAB_LIMIT	(((kmem_bufctl_t)(~0U))-2)

/* Max number of objs-per-slab for caches which use off-slab slabs.
 * Needed to avoid a possible looping condition in cache_grow().
 */
static unsigned long offslab_limit;
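
/*
 * Illustrative sketch (not part of this file) of how the bufctl array,
 * reached via the slab_bufctl() helper defined further below, threads the
 * free objects of a slab into a list of indices: slabp->free holds the
 * index of the first free object, each entry holds the index of the next
 * free one, and BUFCTL_END terminates the chain.  For a freshly grown
 * slab one would expect:
 *
 *	slabp->free == 0
 *	slab_bufctl(slabp)[i] == i + 1		(for i < cachep->num - 1)
 *	slab_bufctl(slabp)[cachep->num - 1] == BUFCTL_END
 *
 * Allocation pops the head: the object at index slabp->free is handed out
 * and slabp->free is advanced to slab_bufctl(slabp)[slabp->free].
 */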

/*
 * struct slab
 *
 * Manages the objs in a slab. Placed either at the beginning of mem allocated
 * for a slab, or allocated from a general cache.
 * Slabs are chained into three lists: fully used, partial, fully free slabs.
 */
struct slab {
	struct list_head	list;
	unsigned long		colouroff;
	void			*s_mem;		/* including colour offset */
	unsigned int		inuse;		/* num of objs active in slab */
	kmem_bufctl_t		free;
	unsigned short		nodeid;
};

/*
 * struct slab_rcu
 *
 * slab_destroy on a SLAB_DESTROY_BY_RCU cache uses this structure to
 * arrange for kmem_freepages to be called via RCU.  This is useful if
 * we need to approach a kernel structure obliquely, from its address
 * obtained without the usual locking.  We can lock the structure to
 * stabilize it and check it's still at the given address, only if we
 * can be sure that the memory has not been meanwhile reused for some
 * other kind of object (which our subsystem's lock might corrupt).
 *
 * rcu_read_lock before reading the address, then rcu_read_unlock after
 * taking the spinlock within the structure expected at that address.
 *
 * We assume struct slab_rcu can overlay struct slab when destroying.
 */
struct slab_rcu {
	struct rcu_head		head;
	kmem_cache_t		*cachep;
	void			*addr;
};

/*
 * struct array_cache
 *
 * Purpose:
 * - LIFO ordering, to hand out cache-warm objects from _alloc
 * - reduce the number of linked list operations
 * - reduce spinlock operations
 *
 * The limit is stored in the per-cpu structure to reduce the data cache
 * footprint.
 *
 */
struct array_cache {
	unsigned int avail;
	unsigned int limit;
	unsigned int batchcount;
	unsigned int touched;
	spinlock_t lock;
	void *entry[0];		/*
				 * Must have this definition in here for the proper
				 * alignment of array_cache. Also simplifies accessing
				 * the entries.
				 * [0] is for gcc 2.95. It should really be [].
				 */
};
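
/*
 * A minimal sketch (illustrative only) of the LIFO behaviour described
 * above: the hot paths treat entry[] as a stack of object pointers.
 *
 *	if (likely(ac->avail)) {		-- alloc fast path
 *		ac->touched = 1;
 *		objp = ac->entry[--ac->avail];
 *	}
 *
 *	if (likely(ac->avail < ac->limit))	-- free fast path
 *		ac->entry[ac->avail++] = objp;
 *
 * Only when the array runs empty (or full) does the slow path touch the
 * per-node slab lists under their spinlock.
 */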

/* bootstrap: The caches do not work without cpuarrays anymore,
 * but the cpuarrays are allocated from the generic caches...
 */
#define BOOT_CPUCACHE_ENTRIES	1
struct arraycache_init {
	struct array_cache cache;
	void * entries[BOOT_CPUCACHE_ENTRIES];
};

/*
 * The slab lists for all objects.
 */
struct kmem_list3 {
	struct list_head	slabs_partial;	/* partial list first, better asm code */
	struct list_head	slabs_full;
	struct list_head	slabs_free;
	unsigned long	free_objects;
	unsigned long	next_reap;
	int		free_touched;
	unsigned int	free_limit;
	spinlock_t	list_lock;
	struct array_cache	*shared;	/* shared per node */
	struct array_cache	**alien;	/* on other nodes */
};

/*
 * Need this for bootstrapping a per node allocator.
 */
#define NUM_INIT_LISTS (2 * MAX_NUMNODES + 1)
struct kmem_list3 __initdata initkmem_list3[NUM_INIT_LISTS];
#define	CACHE_CACHE 0
#define	SIZE_AC 1
#define	SIZE_L3 (1 + MAX_NUMNODES)

/*
 * This function must be completely optimized away if
 * a constant is passed to it. Mostly the same as
 * what is in linux/slab.h except it returns an
 * index.
 */
static __always_inline int index_of(const size_t size)
{
	if (__builtin_constant_p(size)) {
		int i = 0;

#define CACHE(x) \
	if (size <= x) \
		return i; \
	else \
		i++;
#include "linux/kmalloc_sizes.h"
#undef CACHE
		{
			extern void __bad_size(void);
			__bad_size();
		}
	} else
		BUG();
	return 0;
}

#define INDEX_AC index_of(sizeof(struct arraycache_init))
#define INDEX_L3 index_of(sizeof(struct kmem_list3))

static inline void kmem_list3_init(struct kmem_list3 *parent)
{
	INIT_LIST_HEAD(&parent->slabs_full);
	INIT_LIST_HEAD(&parent->slabs_partial);
	INIT_LIST_HEAD(&parent->slabs_free);
	parent->shared = NULL;
	parent->alien = NULL;
	spin_lock_init(&parent->list_lock);
	parent->free_objects = 0;
	parent->free_touched = 0;
}

#define MAKE_LIST(cachep, listp, slab, nodeid)	\
	do {	\
		INIT_LIST_HEAD(listp);		\
		list_splice(&(cachep->nodelists[nodeid]->slab), listp);	\
	} while (0)

#define	MAKE_ALL_LISTS(cachep, ptr, nodeid)				\
	do {								\
	MAKE_LIST((cachep), (&(ptr)->slabs_full), slabs_full, nodeid);	\
	MAKE_LIST((cachep), (&(ptr)->slabs_partial), slabs_partial, nodeid); \
	MAKE_LIST((cachep), (&(ptr)->slabs_free), slabs_free, nodeid);	\
	} while (0)

/*
 * kmem_cache_t
 *
 * manages a cache.
 */

struct kmem_cache {
/* 1) per-cpu data, touched during every alloc/free */
	struct array_cache	*array[NR_CPUS];
	unsigned int		batchcount;
	unsigned int		limit;
	unsigned int		shared;
	unsigned int		objsize;
/* 2) touched by every alloc & free from the backend */
	struct kmem_list3	*nodelists[MAX_NUMNODES];
	unsigned int		flags;	/* constant flags */
	unsigned int		num;	/* # of objs per slab */
	spinlock_t		spinlock;

/* 3) cache_grow/shrink */
	/* order of pgs per slab (2^n) */
	unsigned int		gfporder;

	/* force GFP flags, e.g. GFP_DMA */
	gfp_t			gfpflags;

	size_t			colour;		/* cache colouring range */
	unsigned int		colour_off;	/* colour offset */
	unsigned int		colour_next;	/* cache colouring */
	kmem_cache_t		*slabp_cache;
	unsigned int		slab_size;
	unsigned int		dflags;		/* dynamic flags */

	/* constructor func */
	void (*ctor)(void *, kmem_cache_t *, unsigned long);

	/* de-constructor func */
	void (*dtor)(void *, kmem_cache_t *, unsigned long);

/* 4) cache creation/removal */
	const char		*name;
	struct list_head	next;

/* 5) statistics */
#if STATS
	unsigned long		num_active;
	unsigned long		num_allocations;
	unsigned long		high_mark;
	unsigned long		grown;
	unsigned long		reaped;
	unsigned long		errors;
	unsigned long		max_freeable;
	unsigned long		node_allocs;
	unsigned long		node_frees;
	atomic_t		allochit;
	atomic_t		allocmiss;
	atomic_t		freehit;
	atomic_t		freemiss;
#endif
#if DEBUG
	int			dbghead;
	int			reallen;
#endif
};

#define CFLGS_OFF_SLAB		(0x80000000UL)
#define	OFF_SLAB(x)	((x)->flags & CFLGS_OFF_SLAB)

#define BATCHREFILL_LIMIT	16
/* Optimization question: fewer reaps means less
 * probability for unnecessary cpucache drain/refill cycles.
 *
 * OTOH the cpuarrays can contain lots of objects,
 * which could lock up otherwise freeable slabs.
 */
#define REAPTIMEOUT_CPUC	(2*HZ)
#define REAPTIMEOUT_LIST3	(4*HZ)

#if STATS
#define	STATS_INC_ACTIVE(x)	((x)->num_active++)
#define	STATS_DEC_ACTIVE(x)	((x)->num_active--)
#define	STATS_INC_ALLOCED(x)	((x)->num_allocations++)
#define	STATS_INC_GROWN(x)	((x)->grown++)
#define	STATS_INC_REAPED(x)	((x)->reaped++)
#define	STATS_SET_HIGH(x)	do { if ((x)->num_active > (x)->high_mark) \
					(x)->high_mark = (x)->num_active; \
				} while (0)
#define	STATS_INC_ERR(x)	((x)->errors++)
#define	STATS_INC_NODEALLOCS(x)	((x)->node_allocs++)
#define	STATS_INC_NODEFREES(x)	((x)->node_frees++)
#define	STATS_SET_FREEABLE(x, i) \
				do { if ((x)->max_freeable < i) \
					(x)->max_freeable = i; \
				} while (0)

#define STATS_INC_ALLOCHIT(x)	atomic_inc(&(x)->allochit)
#define STATS_INC_ALLOCMISS(x)	atomic_inc(&(x)->allocmiss)
#define STATS_INC_FREEHIT(x)	atomic_inc(&(x)->freehit)
#define STATS_INC_FREEMISS(x)	atomic_inc(&(x)->freemiss)
#else
#define	STATS_INC_ACTIVE(x)	do { } while (0)
#define	STATS_DEC_ACTIVE(x)	do { } while (0)
#define	STATS_INC_ALLOCED(x)	do { } while (0)
#define	STATS_INC_GROWN(x)	do { } while (0)
#define	STATS_INC_REAPED(x)	do { } while (0)
#define	STATS_SET_HIGH(x)	do { } while (0)
#define	STATS_INC_ERR(x)	do { } while (0)
#define	STATS_INC_NODEALLOCS(x)	do { } while (0)
#define	STATS_INC_NODEFREES(x)	do { } while (0)
#define	STATS_SET_FREEABLE(x, i) \
				do { } while (0)

#define STATS_INC_ALLOCHIT(x)	do { } while (0)
#define STATS_INC_ALLOCMISS(x)	do { } while (0)
#define STATS_INC_FREEHIT(x)	do { } while (0)
#define STATS_INC_FREEMISS(x)	do { } while (0)
#endif

#if DEBUG
/* Magic nums for obj red zoning.
 * Placed in the first word before and the first word after an obj.
 */
#define	RED_INACTIVE	0x5A2CF071UL	/* when obj is inactive */
#define	RED_ACTIVE	0x170FC2A5UL	/* when obj is active */

/* ...and for poisoning */
#define	POISON_INUSE	0x5a	/* for use-uninitialised poisoning */
#define POISON_FREE	0x6b	/* for use-after-free poisoning */
#define	POISON_END	0xa5	/* end-byte of poisoning */

/* memory layout of objects:
 * 0		: objp
 * 0 .. cachep->dbghead - BYTES_PER_WORD - 1: padding. This ensures that
 * 		the end of an object is aligned with the end of the real
 * 		allocation. Catches writes behind the end of the allocation.
 * cachep->dbghead - BYTES_PER_WORD .. cachep->dbghead - 1:
 * 		redzone word.
 * cachep->dbghead: The real object.
 * cachep->objsize - 2* BYTES_PER_WORD: redzone word [BYTES_PER_WORD long]
 * cachep->objsize - 1* BYTES_PER_WORD: last caller address [BYTES_PER_WORD long]
 */
static int obj_dbghead(kmem_cache_t *cachep)
{
	return cachep->dbghead;
}

static int obj_reallen(kmem_cache_t *cachep)
{
	return cachep->reallen;
}

static unsigned long *dbg_redzone1(kmem_cache_t *cachep, void *objp)
{
	BUG_ON(!(cachep->flags & SLAB_RED_ZONE));
	return (unsigned long*) (objp+obj_dbghead(cachep)-BYTES_PER_WORD);
}

static unsigned long *dbg_redzone2(kmem_cache_t *cachep, void *objp)
{
	BUG_ON(!(cachep->flags & SLAB_RED_ZONE));
	if (cachep->flags & SLAB_STORE_USER)
		return (unsigned long*) (objp+cachep->objsize-2*BYTES_PER_WORD);
	return (unsigned long*) (objp+cachep->objsize-BYTES_PER_WORD);
}

static void **dbg_userword(kmem_cache_t *cachep, void *objp)
{
	BUG_ON(!(cachep->flags & SLAB_STORE_USER));
	return (void**)(objp+cachep->objsize-BYTES_PER_WORD);
}

#else

#define obj_dbghead(x)			0
#define obj_reallen(cachep)		(cachep->objsize)
#define dbg_redzone1(cachep, objp)	({BUG(); (unsigned long *)NULL;})
#define dbg_redzone2(cachep, objp)	({BUG(); (unsigned long *)NULL;})
#define dbg_userword(cachep, objp)	({BUG(); (void **)NULL;})

#endif

/*
 * Maximum size of an obj (in 2^order pages)
 * and absolute limit for the gfp order.
 */
#if defined(CONFIG_LARGE_ALLOCS)
#define	MAX_OBJ_ORDER	13	/* up to 32Mb */
#define	MAX_GFP_ORDER	13	/* up to 32Mb */
#elif defined(CONFIG_MMU)
#define	MAX_OBJ_ORDER	5	/* 32 pages */
#define	MAX_GFP_ORDER	5	/* 32 pages */
#else
#define	MAX_OBJ_ORDER	8	/* up to 1Mb */
#define	MAX_GFP_ORDER	8	/* up to 1Mb */
#endif

/*
 * Do not go above this order unless 0 objects fit into the slab.
 */
#define	BREAK_GFP_ORDER_HI	1
#define	BREAK_GFP_ORDER_LO	0
static int slab_break_gfp_order = BREAK_GFP_ORDER_LO;

/* Macros for storing/retrieving the cachep and or slab from the
 * global 'mem_map'. These are used to find the slab an obj belongs to.
 * With kfree(), these are used to find the cache which an obj belongs to.
 */
#define	SET_PAGE_CACHE(pg,x)  ((pg)->lru.next = (struct list_head *)(x))
#define	GET_PAGE_CACHE(pg)    ((kmem_cache_t *)(pg)->lru.next)
#define	SET_PAGE_SLAB(pg,x)   ((pg)->lru.prev = (struct list_head *)(x))
#define	GET_PAGE_SLAB(pg)     ((struct slab *)(pg)->lru.prev)
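
/*
 * Illustrative sketch (not part of this file): given only an object
 * pointer, e.g. on a kfree()-style path, the owning cache and slab are
 * recovered through the page that the object lives in:
 *
 *	struct page *page = virt_to_page(objp);
 *	kmem_cache_t *cachep = GET_PAGE_CACHE(page);
 *	struct slab *slabp = GET_PAGE_SLAB(page);
 *
 * The matching SET_PAGE_CACHE()/SET_PAGE_SLAB() stores are assumed to be
 * made when a slab's pages are obtained from the page allocator.
 */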

/* These are the default caches for kmalloc. Custom caches can have other sizes. */
struct cache_sizes malloc_sizes[] = {
#define CACHE(x) { .cs_size = (x) },
#include <linux/kmalloc_sizes.h>
	CACHE(ULONG_MAX)
#undef CACHE
};
EXPORT_SYMBOL(malloc_sizes);

/* Must match cache_sizes above. Out of line to keep cache footprint low. */
struct cache_names {
	char *name;
	char *name_dma;
};

static struct cache_names __initdata cache_names[] = {
#define CACHE(x) { .name = "size-" #x, .name_dma = "size-" #x "(DMA)" },
#include <linux/kmalloc_sizes.h>
	{ NULL, }
#undef CACHE
};

static struct arraycache_init initarray_cache __initdata =
	{ { 0, BOOT_CPUCACHE_ENTRIES, 1, 0} };
static struct arraycache_init initarray_generic =
	{ { 0, BOOT_CPUCACHE_ENTRIES, 1, 0} };

/* internal cache of cache description objs */
static kmem_cache_t cache_cache = {
	.batchcount	= 1,
	.limit		= BOOT_CPUCACHE_ENTRIES,
	.shared		= 1,
	.objsize	= sizeof(kmem_cache_t),
	.flags		= SLAB_NO_REAP,
	.spinlock	= SPIN_LOCK_UNLOCKED,
	.name		= "kmem_cache",
#if DEBUG
	.reallen	= sizeof(kmem_cache_t),
#endif
};

/* Guard access to the cache-chain. */
static struct semaphore	cache_chain_sem;
static struct list_head cache_chain;

/*
 * vm_enough_memory() looks at this to determine how many
 * slab-allocated pages are possibly freeable under pressure
 *
 * SLAB_RECLAIM_ACCOUNT turns this on per-slab
 */
atomic_t slab_reclaim_pages;

/*
 * chicken and egg problem: delay the per-cpu array allocation
 * until the general caches are up.
 */
static enum {
	NONE,
	PARTIAL_AC,
	PARTIAL_L3,
	FULL
} g_cpucache_up;

static DEFINE_PER_CPU(struct work_struct, reap_work);

static void free_block(kmem_cache_t* cachep, void** objpp, int len, int node);
static void enable_cpucache (kmem_cache_t *cachep);
static void cache_reap (void *unused);
static int __node_shrink(kmem_cache_t *cachep, int node);

static inline struct array_cache *ac_data(kmem_cache_t *cachep)
{
	return cachep->array[smp_processor_id()];
}

static inline kmem_cache_t *__find_general_cachep(size_t size, gfp_t gfpflags)
{
	struct cache_sizes *csizep = malloc_sizes;

#if DEBUG
	/* This happens if someone tries to call
	 * kmem_cache_create(), or __kmalloc(), before
	 * the generic caches are initialized.
	 */
	BUG_ON(malloc_sizes[INDEX_AC].cs_cachep == NULL);
#endif
	while (size > csizep->cs_size)
		csizep++;

	/*
	 * Really subtle: The last entry with cs->cs_size==ULONG_MAX
	 * has cs_{dma,}cachep==NULL. Thus no special case
	 * for large kmalloc calls required.
	 */
	if (unlikely(gfpflags & GFP_DMA))
		return csizep->cs_dmacachep;
	return csizep->cs_cachep;
}

kmem_cache_t *kmem_find_general_cachep(size_t size, gfp_t gfpflags)
{
	return __find_general_cachep(size, gfpflags);
}
EXPORT_SYMBOL(kmem_find_general_cachep);

/* Calculate the number of objects, wastage, and bytes left over for a
 * given slab size.
 */
static void cache_estimate(unsigned long gfporder, size_t size, size_t align,
		 int flags, size_t *left_over, unsigned int *num)
{
	int i;
	size_t wastage = PAGE_SIZE<<gfporder;
	size_t extra = 0;
	size_t base = 0;

	if (!(flags & CFLGS_OFF_SLAB)) {
		base = sizeof(struct slab);
		extra = sizeof(kmem_bufctl_t);
	}
	i = 0;
	while (i*size + ALIGN(base+i*extra, align) <= wastage)
		i++;
	if (i > 0)
		i--;

	if (i > SLAB_LIMIT)
		i = SLAB_LIMIT;

	*num = i;
	wastage -= i*size;
	wastage -= ALIGN(base+i*extra, align);
	*left_over = wastage;
}
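
/*
 * Worked reasoning for cache_estimate() (illustrative, exact figures
 * depend on the architecture's struct sizes and the alignment passed in):
 * with gfporder 0, 4 KiB pages, 256-byte objects and on-slab management,
 * the loop above finds the largest i such that
 *
 *	i*256 + ALIGN(sizeof(struct slab) + i*sizeof(kmem_bufctl_t), align)
 *							<= 4096
 *
 * That i becomes *num, and whatever is left of the page becomes
 * *left_over, which kmem_cache_create() later spends on cache colouring.
 */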

#define slab_error(cachep, msg) __slab_error(__FUNCTION__, cachep, msg)

static void __slab_error(const char *function, kmem_cache_t *cachep, char *msg)
{
	printk(KERN_ERR "slab error in %s(): cache `%s': %s\n",
		function, cachep->name, msg);
	dump_stack();
}

/*
 * Initiate the reap timer running on the target CPU.  We run at around 1 to 2Hz
 * via the workqueue/eventd.
 * Add the CPU number into the expiration time to minimize the possibility of
 * the CPUs getting into lockstep and contending for the global cache chain
 * lock.
 */
static void __devinit start_cpu_timer(int cpu)
{
	struct work_struct *reap_work = &per_cpu(reap_work, cpu);

	/*
	 * When this gets called from do_initcalls via cpucache_init(),
	 * init_workqueues() has already run, so keventd will be setup
	 * at that time.
	 */
	if (keventd_up() && reap_work->func == NULL) {
		INIT_WORK(reap_work, cache_reap, NULL);
		schedule_delayed_work_on(cpu, reap_work, HZ + 3 * cpu);
	}
}

static struct array_cache *alloc_arraycache(int node, int entries,
						int batchcount)
{
	int memsize = sizeof(void*)*entries+sizeof(struct array_cache);
	struct array_cache *nc = NULL;

	nc = kmalloc_node(memsize, GFP_KERNEL, node);
	if (nc) {
		nc->avail = 0;
		nc->limit = entries;
		nc->batchcount = batchcount;
		nc->touched = 0;
		spin_lock_init(&nc->lock);
	}
	return nc;
}

#ifdef CONFIG_NUMA
static inline struct array_cache **alloc_alien_cache(int node, int limit)
{
	struct array_cache **ac_ptr;
	int memsize = sizeof(void*)*MAX_NUMNODES;
	int i;

	if (limit > 1)
		limit = 12;
	ac_ptr = kmalloc_node(memsize, GFP_KERNEL, node);
	if (ac_ptr) {
		for_each_node(i) {
			if (i == node || !node_online(i)) {
				ac_ptr[i] = NULL;
				continue;
			}
			ac_ptr[i] = alloc_arraycache(node, limit, 0xbaadf00d);
			if (!ac_ptr[i]) {
				/* free the caches allocated so far */
				for (i--; i >= 0; i--)
					kfree(ac_ptr[i]);
				kfree(ac_ptr);
				return NULL;
			}
		}
	}
	return ac_ptr;
}

static inline void free_alien_cache(struct array_cache **ac_ptr)
{
	int i;

	if (!ac_ptr)
		return;

	for_each_node(i)
		kfree(ac_ptr[i]);

	kfree(ac_ptr);
}

static inline void __drain_alien_cache(kmem_cache_t *cachep, struct array_cache *ac, int node)
{
	struct kmem_list3 *rl3 = cachep->nodelists[node];

	if (ac->avail) {
		spin_lock(&rl3->list_lock);
		free_block(cachep, ac->entry, ac->avail, node);
		ac->avail = 0;
		spin_unlock(&rl3->list_lock);
	}
}

static void drain_alien_cache(kmem_cache_t *cachep, struct kmem_list3 *l3)
{
	int i = 0;
	struct array_cache *ac;
	unsigned long flags;

	for_each_online_node(i) {
		ac = l3->alien[i];
		if (ac) {
			spin_lock_irqsave(&ac->lock, flags);
			__drain_alien_cache(cachep, ac, i);
			spin_unlock_irqrestore(&ac->lock, flags);
		}
	}
}
#else
#define alloc_alien_cache(node, limit) do { } while (0)
#define free_alien_cache(ac_ptr) do { } while (0)
#define drain_alien_cache(cachep, l3) do { } while (0)
#endif

static int __devinit cpuup_callback(struct notifier_block *nfb,
				  unsigned long action, void *hcpu)
{
	long cpu = (long)hcpu;
	kmem_cache_t* cachep;
	struct kmem_list3 *l3 = NULL;
	int node = cpu_to_node(cpu);
	int memsize = sizeof(struct kmem_list3);
	struct array_cache *nc = NULL;

	switch (action) {
	case CPU_UP_PREPARE:
		down(&cache_chain_sem);
		/* we need to do this right in the beginning since
		 * alloc_arraycache's are going to use this list.
		 * kmalloc_node allows us to add the slab to the right
		 * kmem_list3 and not this cpu's kmem_list3
		 */

		list_for_each_entry(cachep, &cache_chain, next) {
			/* setup the size64 kmemlist for cpu before we can
			 * begin anything. Make sure some other cpu on this
			 * node has not already allocated this
			 */
			if (!cachep->nodelists[node]) {
				if (!(l3 = kmalloc_node(memsize,
						GFP_KERNEL, node)))
					goto bad;
				kmem_list3_init(l3);
				l3->next_reap = jiffies + REAPTIMEOUT_LIST3 +
				  ((unsigned long)cachep)%REAPTIMEOUT_LIST3;

				cachep->nodelists[node] = l3;
			}

			spin_lock_irq(&cachep->nodelists[node]->list_lock);
			cachep->nodelists[node]->free_limit =
				(1 + nr_cpus_node(node)) *
				cachep->batchcount + cachep->num;
			spin_unlock_irq(&cachep->nodelists[node]->list_lock);
		}

		/* Now we can go ahead with allocating the shared array's
		 * & array cache's */
		list_for_each_entry(cachep, &cache_chain, next) {
			nc = alloc_arraycache(node, cachep->limit,
					cachep->batchcount);
			if (!nc)
				goto bad;
			cachep->array[cpu] = nc;

			l3 = cachep->nodelists[node];
			BUG_ON(!l3);
			if (!l3->shared) {
				if (!(nc = alloc_arraycache(node,
					cachep->shared*cachep->batchcount,
					0xbaadf00d)))
					goto bad;

				/* we are serialised from CPU_DEAD or
				 * CPU_UP_CANCELLED by the cpucontrol lock */
				l3->shared = nc;
			}
		}
		up(&cache_chain_sem);
		break;
	case CPU_ONLINE:
		start_cpu_timer(cpu);
		break;
#ifdef CONFIG_HOTPLUG_CPU
	case CPU_DEAD:
		/* fall thru */
	case CPU_UP_CANCELED:
		down(&cache_chain_sem);

		list_for_each_entry(cachep, &cache_chain, next) {
			struct array_cache *nc;
			cpumask_t mask;

			mask = node_to_cpumask(node);
			spin_lock_irq(&cachep->spinlock);
			/* cpu is dead; no one can alloc from it. */
			nc = cachep->array[cpu];
			cachep->array[cpu] = NULL;
			l3 = cachep->nodelists[node];

			if (!l3)
				goto unlock_cache;

			spin_lock(&l3->list_lock);

			/* Free limit for this kmem_list3 */
			l3->free_limit -= cachep->batchcount;
			if (nc)
				free_block(cachep, nc->entry, nc->avail, node);

			if (!cpus_empty(mask)) {
				spin_unlock(&l3->list_lock);
				goto unlock_cache;
			}

			if (l3->shared) {
				free_block(cachep, l3->shared->entry,
						l3->shared->avail, node);
				kfree(l3->shared);
				l3->shared = NULL;
			}
			if (l3->alien) {
				drain_alien_cache(cachep, l3);
				free_alien_cache(l3->alien);
				l3->alien = NULL;
			}

			/* free slabs belonging to this node */
			if (__node_shrink(cachep, node)) {
				cachep->nodelists[node] = NULL;
				spin_unlock(&l3->list_lock);
				kfree(l3);
			} else {
				spin_unlock(&l3->list_lock);
			}
unlock_cache:
			spin_unlock_irq(&cachep->spinlock);
			kfree(nc);
		}
		up(&cache_chain_sem);
		break;
#endif
	}
	return NOTIFY_OK;
bad:
	up(&cache_chain_sem);
	return NOTIFY_BAD;
}

static struct notifier_block cpucache_notifier = { &cpuup_callback, NULL, 0 };

/*
 * swap the static kmem_list3 with kmalloced memory
 */
static void init_list(kmem_cache_t *cachep, struct kmem_list3 *list,
		int nodeid)
{
	struct kmem_list3 *ptr;

	BUG_ON(cachep->nodelists[nodeid] != list);
	ptr = kmalloc_node(sizeof(struct kmem_list3), GFP_KERNEL, nodeid);
	BUG_ON(!ptr);

	local_irq_disable();
	memcpy(ptr, list, sizeof(struct kmem_list3));
	MAKE_ALL_LISTS(cachep, ptr, nodeid);
	cachep->nodelists[nodeid] = ptr;
	local_irq_enable();
}

/* Initialisation.
 * Called after the gfp() functions have been enabled, and before smp_init().
 */
void __init kmem_cache_init(void)
{
	size_t left_over;
	struct cache_sizes *sizes;
	struct cache_names *names;
	int i;

	for (i = 0; i < NUM_INIT_LISTS; i++) {
		kmem_list3_init(&initkmem_list3[i]);
		if (i < MAX_NUMNODES)
			cache_cache.nodelists[i] = NULL;
	}

	/*
	 * Fragmentation resistance on low memory - only use bigger
	 * page orders on machines with more than 32MB of memory.
	 */
	if (num_physpages > (32 << 20) >> PAGE_SHIFT)
		slab_break_gfp_order = BREAK_GFP_ORDER_HI;

	/* Bootstrap is tricky, because several objects are allocated
	 * from caches that do not exist yet:
	 * 1) initialize the cache_cache cache: it contains the kmem_cache_t
	 *    structures of all caches, except cache_cache itself: cache_cache
	 *    is statically allocated.
	 *    Initially an __init data area is used for the head array and the
	 *    kmem_list3 structures, it's replaced with a kmalloc allocated
	 *    array at the end of the bootstrap.
	 * 2) Create the first kmalloc cache.
	 *    The kmem_cache_t for the new cache is allocated normally.
	 *    An __init data area is used for the head array.
	 * 3) Create the remaining kmalloc caches, with minimally sized
	 *    head arrays.
	 * 4) Replace the __init data head arrays for cache_cache and the first
	 *    kmalloc cache with kmalloc allocated arrays.
	 * 5) Replace the __init data for kmem_list3 for cache_cache and
	 *    the other cache's with kmalloc allocated memory.
	 * 6) Resize the head arrays of the kmalloc caches to their final sizes.
	 */

	/* 1) create the cache_cache */
	init_MUTEX(&cache_chain_sem);
	INIT_LIST_HEAD(&cache_chain);
	list_add(&cache_cache.next, &cache_chain);
	cache_cache.colour_off = cache_line_size();
	cache_cache.array[smp_processor_id()] = &initarray_cache.cache;
	cache_cache.nodelists[numa_node_id()] = &initkmem_list3[CACHE_CACHE];

	cache_cache.objsize = ALIGN(cache_cache.objsize, cache_line_size());

	cache_estimate(0, cache_cache.objsize, cache_line_size(), 0,
				&left_over, &cache_cache.num);
	if (!cache_cache.num)
		BUG();

	cache_cache.colour = left_over/cache_cache.colour_off;
	cache_cache.colour_next = 0;
	cache_cache.slab_size = ALIGN(cache_cache.num*sizeof(kmem_bufctl_t) +
				sizeof(struct slab), cache_line_size());

	/* 2+3) create the kmalloc caches */
	sizes = malloc_sizes;
	names = cache_names;

	/* Initialize the caches that provide memory for the array cache
	 * and the kmem_list3 structures first.
	 * Without this, further allocations will bug
	 */

	sizes[INDEX_AC].cs_cachep = kmem_cache_create(names[INDEX_AC].name,
				sizes[INDEX_AC].cs_size, ARCH_KMALLOC_MINALIGN,
				(ARCH_KMALLOC_FLAGS | SLAB_PANIC), NULL, NULL);

	if (INDEX_AC != INDEX_L3)
		sizes[INDEX_L3].cs_cachep =
			kmem_cache_create(names[INDEX_L3].name,
				sizes[INDEX_L3].cs_size, ARCH_KMALLOC_MINALIGN,
				(ARCH_KMALLOC_FLAGS | SLAB_PANIC), NULL, NULL);

	while (sizes->cs_size != ULONG_MAX) {
		/*
		 * For performance, all the general caches are L1 aligned.
		 * This should be particularly beneficial on SMP boxes, as it
		 * eliminates "false sharing".
		 * Note for systems short on memory removing the alignment will
		 * allow tighter packing of the smaller caches.
		 */
		if (!sizes->cs_cachep)
			sizes->cs_cachep = kmem_cache_create(names->name,
				sizes->cs_size, ARCH_KMALLOC_MINALIGN,
				(ARCH_KMALLOC_FLAGS | SLAB_PANIC), NULL, NULL);

		/* Inc off-slab bufctl limit until the ceiling is hit. */
		if (!(OFF_SLAB(sizes->cs_cachep))) {
			offslab_limit = sizes->cs_size-sizeof(struct slab);
			offslab_limit /= sizeof(kmem_bufctl_t);
		}

		sizes->cs_dmacachep = kmem_cache_create(names->name_dma,
			sizes->cs_size, ARCH_KMALLOC_MINALIGN,
			(ARCH_KMALLOC_FLAGS | SLAB_CACHE_DMA | SLAB_PANIC),
			NULL, NULL);

		sizes++;
		names++;
	}
	/* 4) Replace the bootstrap head arrays */
	{
		void * ptr;

		ptr = kmalloc(sizeof(struct arraycache_init), GFP_KERNEL);

		local_irq_disable();
		BUG_ON(ac_data(&cache_cache) != &initarray_cache.cache);
		memcpy(ptr, ac_data(&cache_cache),
				sizeof(struct arraycache_init));
		cache_cache.array[smp_processor_id()] = ptr;
		local_irq_enable();

		ptr = kmalloc(sizeof(struct arraycache_init), GFP_KERNEL);

		local_irq_disable();
		BUG_ON(ac_data(malloc_sizes[INDEX_AC].cs_cachep)
				!= &initarray_generic.cache);
		memcpy(ptr, ac_data(malloc_sizes[INDEX_AC].cs_cachep),
				sizeof(struct arraycache_init));
		malloc_sizes[INDEX_AC].cs_cachep->array[smp_processor_id()] =
						ptr;
		local_irq_enable();
	}
	/* 5) Replace the bootstrap kmem_list3's */
	{
		int node;
		/* Replace the static kmem_list3 structures for the boot cpu */
		init_list(&cache_cache, &initkmem_list3[CACHE_CACHE],
				numa_node_id());

		for_each_online_node(node) {
			init_list(malloc_sizes[INDEX_AC].cs_cachep,
					&initkmem_list3[SIZE_AC+node], node);

			if (INDEX_AC != INDEX_L3) {
				init_list(malloc_sizes[INDEX_L3].cs_cachep,
						&initkmem_list3[SIZE_L3+node],
						node);
			}
		}
	}

	/* 6) resize the head arrays to their final sizes */
	{
		kmem_cache_t *cachep;
		down(&cache_chain_sem);
		list_for_each_entry(cachep, &cache_chain, next)
			enable_cpucache(cachep);
		up(&cache_chain_sem);
	}

	/* Done! */
	g_cpucache_up = FULL;

	/* Register a cpu startup notifier callback
	 * that initializes ac_data for all new cpus
	 */
	register_cpu_notifier(&cpucache_notifier);

	/* The reap timers are started later, with a module init call:
	 * That part of the kernel is not yet operational.
	 */
}

static int __init cpucache_init(void)
{
	int cpu;

	/*
	 * Register the timers that return unneeded
	 * pages to gfp.
	 */
	for_each_online_cpu(cpu)
		start_cpu_timer(cpu);

	return 0;
}

__initcall(cpucache_init);

/*
 * Interface to system's page allocator. No need to hold the cache-lock.
 *
 * If we requested dmaable memory, we will get it. Even if we
 * did not request dmaable memory, we might get it, but that
 * would be relatively rare and ignorable.
 */
static void *kmem_getpages(kmem_cache_t *cachep, gfp_t flags, int nodeid)
{
	struct page *page;
	void *addr;
	int i;

	flags |= cachep->gfpflags;
	if (likely(nodeid == -1)) {
		page = alloc_pages(flags, cachep->gfporder);
	} else {
		page = alloc_pages_node(nodeid, flags, cachep->gfporder);
	}
	if (!page)
		return NULL;
	addr = page_address(page);

	i = (1 << cachep->gfporder);
	if (cachep->flags & SLAB_RECLAIM_ACCOUNT)
		atomic_add(i, &slab_reclaim_pages);
	add_page_state(nr_slab, i);
	while (i--) {
		SetPageSlab(page);
		page++;
	}
	return addr;
}

/*
 * Interface to system's page release.
 */
static void kmem_freepages(kmem_cache_t *cachep, void *addr)
{
	unsigned long i = (1<<cachep->gfporder);
	struct page *page = virt_to_page(addr);
	const unsigned long nr_freed = i;

	while (i--) {
		if (!TestClearPageSlab(page))
			BUG();
		page++;
	}
	sub_page_state(nr_slab, nr_freed);
	if (current->reclaim_state)
		current->reclaim_state->reclaimed_slab += nr_freed;
	free_pages((unsigned long)addr, cachep->gfporder);
	if (cachep->flags & SLAB_RECLAIM_ACCOUNT)
		atomic_sub(1<<cachep->gfporder, &slab_reclaim_pages);
}

static void kmem_rcu_free(struct rcu_head *head)
{
	struct slab_rcu *slab_rcu = (struct slab_rcu *) head;
	kmem_cache_t *cachep = slab_rcu->cachep;

	kmem_freepages(cachep, slab_rcu->addr);
	if (OFF_SLAB(cachep))
		kmem_cache_free(cachep->slabp_cache, slab_rcu);
}

#if DEBUG

#ifdef CONFIG_DEBUG_PAGEALLOC
static void store_stackinfo(kmem_cache_t *cachep, unsigned long *addr,
				unsigned long caller)
{
	int size = obj_reallen(cachep);

	addr = (unsigned long *)&((char*)addr)[obj_dbghead(cachep)];

	if (size < 5*sizeof(unsigned long))
		return;

	*addr++ = 0x12345678;
	*addr++ = caller;
	*addr++ = smp_processor_id();
	size -= 3*sizeof(unsigned long);
	{
		unsigned long *sptr = &caller;
		unsigned long svalue;

		while (!kstack_end(sptr)) {
			svalue = *sptr++;
			if (kernel_text_address(svalue)) {
				*addr++ = svalue;
				size -= sizeof(unsigned long);
				if (size <= sizeof(unsigned long))
					break;
			}
		}

	}
	*addr++ = 0x87654321;
}
#endif

static void poison_obj(kmem_cache_t *cachep, void *addr, unsigned char val)
{
	int size = obj_reallen(cachep);
	addr = &((char*)addr)[obj_dbghead(cachep)];

	memset(addr, val, size);
	*(unsigned char *)(addr+size-1) = POISON_END;
}

static void dump_line(char *data, int offset, int limit)
{
	int i;
	printk(KERN_ERR "%03x:", offset);
	for (i = 0; i < limit; i++) {
		printk(" %02x", (unsigned char)data[offset+i]);
	}
	printk("\n");
}
#endif

#if DEBUG

static void print_objinfo(kmem_cache_t *cachep, void *objp, int lines)
{
	int i, size;
	char *realobj;

	if (cachep->flags & SLAB_RED_ZONE) {
		printk(KERN_ERR "Redzone: 0x%lx/0x%lx.\n",
			*dbg_redzone1(cachep, objp),
			*dbg_redzone2(cachep, objp));
	}

	if (cachep->flags & SLAB_STORE_USER) {
		printk(KERN_ERR "Last user: [<%p>]",
				*dbg_userword(cachep, objp));
		print_symbol("(%s)",
				(unsigned long)*dbg_userword(cachep, objp));
		printk("\n");
	}
	realobj = (char*)objp+obj_dbghead(cachep);
	size = obj_reallen(cachep);
	for (i = 0; i < size && lines; i += 16, lines--) {
		int limit;
		limit = 16;
		if (i+limit > size)
			limit = size-i;
		dump_line(realobj, i, limit);
	}
}

static void check_poison_obj(kmem_cache_t *cachep, void *objp)
{
	char *realobj;
	int size, i;
	int lines = 0;

	realobj = (char*)objp+obj_dbghead(cachep);
	size = obj_reallen(cachep);

	for (i = 0; i < size; i++) {
		char exp = POISON_FREE;
		if (i == size-1)
			exp = POISON_END;
		if (realobj[i] != exp) {
			int limit;
			/* Mismatch ! */
			/* Print header */
			if (lines == 0) {
				printk(KERN_ERR "Slab corruption: start=%p, len=%d\n",
						realobj, size);
				print_objinfo(cachep, objp, 0);
			}
			/* Hexdump the affected line */
			i = (i/16)*16;
			limit = 16;
			if (i+limit > size)
				limit = size-i;
			dump_line(realobj, i, limit);
			i += 16;
			lines++;
			/* Limit to 5 lines */
			if (lines > 5)
				break;
		}
	}
	if (lines != 0) {
		/* Print some data about the neighboring objects, if they
		 * exist:
		 */
		struct slab *slabp = GET_PAGE_SLAB(virt_to_page(objp));
		int objnr;

		objnr = (objp-slabp->s_mem)/cachep->objsize;
		if (objnr) {
			objp = slabp->s_mem+(objnr-1)*cachep->objsize;
			realobj = (char*)objp+obj_dbghead(cachep);
			printk(KERN_ERR "Prev obj: start=%p, len=%d\n",
					realobj, size);
			print_objinfo(cachep, objp, 2);
		}
		if (objnr+1 < cachep->num) {
			objp = slabp->s_mem+(objnr+1)*cachep->objsize;
			realobj = (char*)objp+obj_dbghead(cachep);
			printk(KERN_ERR "Next obj: start=%p, len=%d\n",
					realobj, size);
			print_objinfo(cachep, objp, 2);
		}
	}
}
#endif

/* Destroy all the objs in a slab, and release the mem back to the system.
 * Before calling the slab must have been unlinked from the cache.
 * The cache-lock is not held/needed.
 */
static void slab_destroy (kmem_cache_t *cachep, struct slab *slabp)
{
	void *addr = slabp->s_mem - slabp->colouroff;

#if DEBUG
	int i;
	for (i = 0; i < cachep->num; i++) {
		void *objp = slabp->s_mem + cachep->objsize * i;

		if (cachep->flags & SLAB_POISON) {
#ifdef CONFIG_DEBUG_PAGEALLOC
			if ((cachep->objsize%PAGE_SIZE)==0 && OFF_SLAB(cachep))
				kernel_map_pages(virt_to_page(objp), cachep->objsize/PAGE_SIZE,1);
			else
				check_poison_obj(cachep, objp);
#else
			check_poison_obj(cachep, objp);
#endif
		}
		if (cachep->flags & SLAB_RED_ZONE) {
			if (*dbg_redzone1(cachep, objp) != RED_INACTIVE)
				slab_error(cachep, "start of a freed object "
							"was overwritten");
			if (*dbg_redzone2(cachep, objp) != RED_INACTIVE)
				slab_error(cachep, "end of a freed object "
							"was overwritten");
		}
		if (cachep->dtor && !(cachep->flags & SLAB_POISON))
			(cachep->dtor)(objp+obj_dbghead(cachep), cachep, 0);
	}
#else
	if (cachep->dtor) {
		int i;
		for (i = 0; i < cachep->num; i++) {
			void* objp = slabp->s_mem+cachep->objsize*i;
			(cachep->dtor)(objp, cachep, 0);
		}
	}
#endif

	if (unlikely(cachep->flags & SLAB_DESTROY_BY_RCU)) {
		struct slab_rcu *slab_rcu;

		slab_rcu = (struct slab_rcu *) slabp;
		slab_rcu->cachep = cachep;
		slab_rcu->addr = addr;
		call_rcu(&slab_rcu->head, kmem_rcu_free);
	} else {
		kmem_freepages(cachep, addr);
		if (OFF_SLAB(cachep))
			kmem_cache_free(cachep->slabp_cache, slabp);
	}
}

/* For setting up all the kmem_list3s for cache whose objsize is same
 * as size of kmem_list3. */
static inline void set_up_list3s(kmem_cache_t *cachep, int index)
{
	int node;

	for_each_online_node(node) {
		cachep->nodelists[node] = &initkmem_list3[index+node];
		cachep->nodelists[node]->next_reap = jiffies +
			REAPTIMEOUT_LIST3 +
			((unsigned long)cachep)%REAPTIMEOUT_LIST3;
	}
}

/**
 * kmem_cache_create - Create a cache.
 * @name: A string which is used in /proc/slabinfo to identify this cache.
 * @size: The size of objects to be created in this cache.
 * @align: The required alignment for the objects.
 * @flags: SLAB flags
 * @ctor: A constructor for the objects.
 * @dtor: A destructor for the objects.
 *
 * Returns a ptr to the cache on success, NULL on failure.
 * Cannot be called within an interrupt, but can be interrupted.
 * The @ctor is run when new pages are allocated by the cache
 * and the @dtor is run before the pages are handed back.
 *
 * @name must be valid until the cache is destroyed. This implies that
 * the module calling this has to destroy the cache before getting
 * unloaded.
 *
 * The flags are
 *
 * %SLAB_POISON - Poison the slab with a known test pattern (a5a5a5a5)
 * to catch references to uninitialised memory.
 *
 * %SLAB_RED_ZONE - Insert `Red' zones around the allocated memory to check
 * for buffer overruns.
 *
 * %SLAB_NO_REAP - Don't automatically reap this cache when we're under
 * memory pressure.
 *
 * %SLAB_HWCACHE_ALIGN - Align the objects in this cache to a hardware
 * cacheline.  This can be beneficial if you're counting cycles as closely
 * as davem.
 */
kmem_cache_t *
kmem_cache_create (const char *name, size_t size, size_t align,
	unsigned long flags, void (*ctor)(void*, kmem_cache_t *, unsigned long),
	void (*dtor)(void*, kmem_cache_t *, unsigned long))
{
	size_t left_over, slab_size, ralign;
	kmem_cache_t *cachep = NULL;
	struct list_head *p;

	/*
	 * Sanity checks... these are all serious usage bugs.
	 */
	if ((!name) ||
		in_interrupt() ||
		(size < BYTES_PER_WORD) ||
		(size > (1<<MAX_OBJ_ORDER)*PAGE_SIZE) ||
		(dtor && !ctor)) {
			printk(KERN_ERR "%s: Early error in slab %s\n",
					__FUNCTION__, name);
			BUG();
		}

	down(&cache_chain_sem);

	list_for_each(p, &cache_chain) {
		kmem_cache_t *pc = list_entry(p, kmem_cache_t, next);
		mm_segment_t old_fs = get_fs();
		char tmp;
		int res;

		/*
		 * This happens when the module gets unloaded and doesn't
		 * destroy its slab cache and no-one else reuses the vmalloc
		 * area of the module. Print a warning.
		 */
		set_fs(KERNEL_DS);
		res = __get_user(tmp, pc->name);
		set_fs(old_fs);
		if (res) {
			printk("SLAB: cache with size %d has lost its name\n",
					pc->objsize);
			continue;
		}

		if (!strcmp(pc->name, name)) {
			printk("kmem_cache_create: duplicate cache %s\n", name);
			dump_stack();
			goto oops;
		}
	}

#if DEBUG
	WARN_ON(strchr(name, ' '));	/* It confuses parsers */
	if ((flags & SLAB_DEBUG_INITIAL) && !ctor) {
		/* No constructor, but initial state check requested */
		printk(KERN_ERR "%s: No con, but init state check "
				"requested - %s\n", __FUNCTION__, name);
		flags &= ~SLAB_DEBUG_INITIAL;
	}

#if FORCED_DEBUG
	/*
	 * Enable redzoning and last user accounting, except for caches with
	 * large objects, if the increased size would increase the object size
	 * above the next power of two: caches with object sizes just above a
	 * power of two have a significant amount of internal fragmentation.
	 */
	if ((size < 4096 || fls(size-1) == fls(size-1+3*BYTES_PER_WORD)))
		flags |= SLAB_RED_ZONE|SLAB_STORE_USER;
	if (!(flags & SLAB_DESTROY_BY_RCU))
		flags |= SLAB_POISON;
#endif
	if (flags & SLAB_DESTROY_BY_RCU)
		BUG_ON(flags & SLAB_POISON);
#endif
	if (flags & SLAB_DESTROY_BY_RCU)
		BUG_ON(dtor);

	/*
	 * Always checks flags, a caller might be expecting debug
	 * support which isn't available.
	 */
	if (flags & ~CREATE_MASK)
		BUG();

	/* Check that size is in terms of words.  This is needed to avoid
	 * unaligned accesses for some archs when redzoning is used, and makes
	 * sure any on-slab bufctl's are also correctly aligned.
	 */
	if (size & (BYTES_PER_WORD-1)) {
		size += (BYTES_PER_WORD-1);
		size &= ~(BYTES_PER_WORD-1);
	}

	/* calculate out the final buffer alignment: */
	/* 1) arch recommendation: can be overridden for debug */
	if (flags & SLAB_HWCACHE_ALIGN) {
		/* Default alignment: as specified by the arch code.
		 * Except if an object is really small, then squeeze multiple
		 * objects into one cacheline.
		 */
		ralign = cache_line_size();
		while (size <= ralign/2)
			ralign /= 2;
	} else {
		ralign = BYTES_PER_WORD;
	}
	/* 2) arch mandated alignment: disables debug if necessary */
	if (ralign < ARCH_SLAB_MINALIGN) {
		ralign = ARCH_SLAB_MINALIGN;
		if (ralign > BYTES_PER_WORD)
			flags &= ~(SLAB_RED_ZONE|SLAB_STORE_USER);
	}
	/* 3) caller mandated alignment: disables debug if necessary */
	if (ralign < align) {
		ralign = align;
		if (ralign > BYTES_PER_WORD)
			flags &= ~(SLAB_RED_ZONE|SLAB_STORE_USER);
	}
	/* 4) Store it. Note that the debug code below can reduce
	 *    the alignment to BYTES_PER_WORD.
	 */
	align = ralign;
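
	/*
	 * Worked example for the SLAB_HWCACHE_ALIGN branch above
	 * (illustrative only, numbers depend on the architecture): with a
	 * 64-byte cache line and a 20-byte object, the loop halves ralign
	 * from 64 to 32 (20 <= 32) and stops there (20 > 16), so the object
	 * is later padded to 32 bytes and two objects share each cache line
	 * instead of one object per line.
	 */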

	/* Get cache's description obj. */
	cachep = (kmem_cache_t *) kmem_cache_alloc(&cache_cache, SLAB_KERNEL);
	if (!cachep)
		goto oops;
	memset(cachep, 0, sizeof(kmem_cache_t));

#if DEBUG
	cachep->reallen = size;

	if (flags & SLAB_RED_ZONE) {
		/* redzoning only works with word aligned caches */
		align = BYTES_PER_WORD;

		/* add space for red zone words */
		cachep->dbghead += BYTES_PER_WORD;
		size += 2*BYTES_PER_WORD;
	}
	if (flags & SLAB_STORE_USER) {
		/* user store requires word alignment and
		 * one word storage behind the end of the real
		 * object.
		 */
		align = BYTES_PER_WORD;
		size += BYTES_PER_WORD;
	}
#if FORCED_DEBUG && defined(CONFIG_DEBUG_PAGEALLOC)
	if (size >= malloc_sizes[INDEX_L3+1].cs_size && cachep->reallen > cache_line_size() && size < PAGE_SIZE) {
		cachep->dbghead += PAGE_SIZE - size;
		size = PAGE_SIZE;
	}
#endif
#endif

	/* Determine if the slab management is 'on' or 'off' slab. */
	if (size >= (PAGE_SIZE>>3))
		/*
		 * Size is large, assume best to place the slab management obj
		 * off-slab (should allow better packing of objs).
		 */
		flags |= CFLGS_OFF_SLAB;

	size = ALIGN(size, align);

	if ((flags & SLAB_RECLAIM_ACCOUNT) && size <= PAGE_SIZE) {
		/*
		 * A VFS-reclaimable slab tends to have most allocations
		 * as GFP_NOFS and we really don't want to have to be allocating
		 * higher-order pages when we are unable to shrink dcache.
		 */
		cachep->gfporder = 0;
		cache_estimate(cachep->gfporder, size, align, flags,
					&left_over, &cachep->num);
	} else {
		/*
		 * Calculate size (in pages) of slabs, and the num of objs per
		 * slab.  This could be made much more intelligent.  For now,
		 * try to avoid using high page-orders for slabs.  When the
		 * gfp() funcs are more friendly towards high-order requests,
		 * this should be changed.
		 */
		do {
			unsigned int break_flag = 0;
cal_wastage:
			cache_estimate(cachep->gfporder, size, align, flags,
						&left_over, &cachep->num);
			if (break_flag)
				break;
			if (cachep->gfporder >= MAX_GFP_ORDER)
				break;
			if (!cachep->num)
				goto next;
			if (flags & CFLGS_OFF_SLAB &&
					cachep->num > offslab_limit) {
				/* This num of objs will cause problems. */
				cachep->gfporder--;
				break_flag++;
				goto cal_wastage;
			}

			/*
			 * Large num of objs is good, but v. large slabs are
			 * currently bad for the gfp()s.
			 */
			if (cachep->gfporder >= slab_break_gfp_order)
				break;

			if ((left_over*8) <= (PAGE_SIZE<<cachep->gfporder))
				break;	/* Acceptable internal fragmentation. */
next:
			cachep->gfporder++;
		} while (1);
	}

	if (!cachep->num) {
		printk("kmem_cache_create: couldn't create cache %s.\n", name);
		kmem_cache_free(&cache_cache, cachep);
		cachep = NULL;
		goto oops;
	}
	slab_size = ALIGN(cachep->num*sizeof(kmem_bufctl_t)
				+ sizeof(struct slab), align);

	/*
	 * If the slab has been placed off-slab, and we have enough space then
	 * move it on-slab. This is at the expense of any extra colouring.
	 */
	if (flags & CFLGS_OFF_SLAB && left_over >= slab_size) {
		flags &= ~CFLGS_OFF_SLAB;
		left_over -= slab_size;
	}

	if (flags & CFLGS_OFF_SLAB) {
		/* really off slab. No need for manual alignment */
		slab_size = cachep->num*sizeof(kmem_bufctl_t)+sizeof(struct slab);
	}

	cachep->colour_off = cache_line_size();
	/* Offset must be a multiple of the alignment. */
	if (cachep->colour_off < align)
		cachep->colour_off = align;
	cachep->colour = left_over/cachep->colour_off;
	cachep->slab_size = slab_size;
	cachep->flags = flags;
	cachep->gfpflags = 0;
	if (flags & SLAB_CACHE_DMA)
		cachep->gfpflags |= GFP_DMA;
	spin_lock_init(&cachep->spinlock);
	cachep->objsize = size;

	if (flags & CFLGS_OFF_SLAB)
		cachep->slabp_cache = kmem_find_general_cachep(slab_size, 0u);
	cachep->ctor = ctor;
	cachep->dtor = dtor;
	cachep->name = name;

	/* Don't let CPUs come and go */
	lock_cpu_hotplug();

	if (g_cpucache_up == FULL) {
		enable_cpucache(cachep);
	} else {
		if (g_cpucache_up == NONE) {
			/* Note: the first kmem_cache_create must create
			 * the cache that's used by kmalloc(24), otherwise
			 * the creation of further caches will BUG().
			 */
			cachep->array[smp_processor_id()] =
				&initarray_generic.cache;

			/* If the cache that's used by
			 * kmalloc(sizeof(kmem_list3)) is the first cache,
			 * then we need to set up all its list3s, otherwise
			 * the creation of further caches will BUG().
			 */
			set_up_list3s(cachep, SIZE_AC);
			if (INDEX_AC == INDEX_L3)
				g_cpucache_up = PARTIAL_L3;
			else
				g_cpucache_up = PARTIAL_AC;
		} else {
			cachep->array[smp_processor_id()] =
				kmalloc(sizeof(struct arraycache_init),
						GFP_KERNEL);

			if (g_cpucache_up == PARTIAL_AC) {
				set_up_list3s(cachep, SIZE_L3);
				g_cpucache_up = PARTIAL_L3;
			} else {
				int node;
				for_each_online_node(node) {

					cachep->nodelists[node] =
						kmalloc_node(sizeof(struct kmem_list3),
								GFP_KERNEL, node);
					BUG_ON(!cachep->nodelists[node]);
					kmem_list3_init(cachep->nodelists[node]);
				}
			}
		}
		cachep->nodelists[numa_node_id()]->next_reap =
			jiffies + REAPTIMEOUT_LIST3 +
			((unsigned long)cachep)%REAPTIMEOUT_LIST3;

		BUG_ON(!ac_data(cachep));
		ac_data(cachep)->avail = 0;
		ac_data(cachep)->limit = BOOT_CPUCACHE_ENTRIES;
		ac_data(cachep)->batchcount = 1;
		ac_data(cachep)->touched = 0;
		cachep->batchcount = 1;
		cachep->limit = BOOT_CPUCACHE_ENTRIES;
	}

	/* cache setup completed, link it into the list */
	list_add(&cachep->next, &cache_chain);
	unlock_cpu_hotplug();
oops:
	if (!cachep && (flags & SLAB_PANIC))
		panic("kmem_cache_create(): failed to create slab `%s'\n",
			name);
	up(&cache_chain_sem);
	return cachep;
}
EXPORT_SYMBOL(kmem_cache_create);

#if DEBUG
static void check_irq_off(void)
{
	BUG_ON(!irqs_disabled());
}

static void check_irq_on(void)
{
	BUG_ON(irqs_disabled());
}

static void check_spinlock_acquired(kmem_cache_t *cachep)
{
#ifdef CONFIG_SMP
	check_irq_off();
	assert_spin_locked(&cachep->nodelists[numa_node_id()]->list_lock);
#endif
}

static inline void check_spinlock_acquired_node(kmem_cache_t *cachep, int node)
{
#ifdef CONFIG_SMP
	check_irq_off();
	assert_spin_locked(&cachep->nodelists[node]->list_lock);
#endif
}

#else
#define check_irq_off()	do { } while(0)
#define check_irq_on()	do { } while(0)
#define check_spinlock_acquired(x) do { } while(0)
#define check_spinlock_acquired_node(x, y) do { } while(0)
#endif

/*
 * Waits for all CPUs to execute func().
 */
1862 */ 1863 static void smp_call_function_all_cpus(void (*func) (void *arg), void *arg) 1864 { 1865 check_irq_on(); 1866 preempt_disable(); 1867 1868 local_irq_disable(); 1869 func(arg); 1870 local_irq_enable(); 1871 1872 if (smp_call_function(func, arg, 1, 1)) 1873 BUG(); 1874 1875 preempt_enable(); 1876 } 1877 1878 static void drain_array_locked(kmem_cache_t* cachep, 1879 struct array_cache *ac, int force, int node); 1880 1881 static void do_drain(void *arg) 1882 { 1883 kmem_cache_t *cachep = (kmem_cache_t*)arg; 1884 struct array_cache *ac; 1885 int node = numa_node_id(); 1886 1887 check_irq_off(); 1888 ac = ac_data(cachep); 1889 spin_lock(&cachep->nodelists[node]->list_lock); 1890 free_block(cachep, ac->entry, ac->avail, node); 1891 spin_unlock(&cachep->nodelists[node]->list_lock); 1892 ac->avail = 0; 1893 } 1894 1895 static void drain_cpu_caches(kmem_cache_t *cachep) 1896 { 1897 struct kmem_list3 *l3; 1898 int node; 1899 1900 smp_call_function_all_cpus(do_drain, cachep); 1901 check_irq_on(); 1902 spin_lock_irq(&cachep->spinlock); 1903 for_each_online_node(node) { 1904 l3 = cachep->nodelists[node]; 1905 if (l3) { 1906 spin_lock(&l3->list_lock); 1907 drain_array_locked(cachep, l3->shared, 1, node); 1908 spin_unlock(&l3->list_lock); 1909 if (l3->alien) 1910 drain_alien_cache(cachep, l3); 1911 } 1912 } 1913 spin_unlock_irq(&cachep->spinlock); 1914 } 1915 1916 static int __node_shrink(kmem_cache_t *cachep, int node) 1917 { 1918 struct slab *slabp; 1919 struct kmem_list3 *l3 = cachep->nodelists[node]; 1920 int ret; 1921 1922 for (;;) { 1923 struct list_head *p; 1924 1925 p = l3->slabs_free.prev; 1926 if (p == &l3->slabs_free) 1927 break; 1928 1929 slabp = list_entry(l3->slabs_free.prev, struct slab, list); 1930 #if DEBUG 1931 if (slabp->inuse) 1932 BUG(); 1933 #endif 1934 list_del(&slabp->list); 1935 1936 l3->free_objects -= cachep->num; 1937 spin_unlock_irq(&l3->list_lock); 1938 slab_destroy(cachep, slabp); 1939 spin_lock_irq(&l3->list_lock); 1940 } 1941 ret = !list_empty(&l3->slabs_full) || 1942 !list_empty(&l3->slabs_partial); 1943 return ret; 1944 } 1945 1946 static int __cache_shrink(kmem_cache_t *cachep) 1947 { 1948 int ret = 0, i = 0; 1949 struct kmem_list3 *l3; 1950 1951 drain_cpu_caches(cachep); 1952 1953 check_irq_on(); 1954 for_each_online_node(i) { 1955 l3 = cachep->nodelists[i]; 1956 if (l3) { 1957 spin_lock_irq(&l3->list_lock); 1958 ret += __node_shrink(cachep, i); 1959 spin_unlock_irq(&l3->list_lock); 1960 } 1961 } 1962 return (ret ? 1 : 0); 1963 } 1964 1965 /** 1966 * kmem_cache_shrink - Shrink a cache. 1967 * @cachep: The cache to shrink. 1968 * 1969 * Releases as many slabs as possible for a cache. 1970 * To help debugging, a zero exit status indicates all slabs were released. 1971 */ 1972 int kmem_cache_shrink(kmem_cache_t *cachep) 1973 { 1974 if (!cachep || in_interrupt()) 1975 BUG(); 1976 1977 return __cache_shrink(cachep); 1978 } 1979 EXPORT_SYMBOL(kmem_cache_shrink); 1980 1981 /** 1982 * kmem_cache_destroy - delete a cache 1983 * @cachep: the cache to destroy 1984 * 1985 * Remove a kmem_cache_t object from the slab cache. 1986 * Returns 0 on success. 1987 * 1988 * It is expected this function will be called by a module when it is 1989 * unloaded. This will remove the cache completely, and avoid a duplicate 1990 * cache being allocated each time a module is loaded and unloaded, if the 1991 * module doesn't have persistent in-kernel storage across loads and unloads. 1992 * 1993 * The cache must be empty before calling this function. 
1994 * 1995 * The caller must guarantee that noone will allocate memory from the cache 1996 * during the kmem_cache_destroy(). 1997 */ 1998 int kmem_cache_destroy(kmem_cache_t * cachep) 1999 { 2000 int i; 2001 struct kmem_list3 *l3; 2002 2003 if (!cachep || in_interrupt()) 2004 BUG(); 2005 2006 /* Don't let CPUs to come and go */ 2007 lock_cpu_hotplug(); 2008 2009 /* Find the cache in the chain of caches. */ 2010 down(&cache_chain_sem); 2011 /* 2012 * the chain is never empty, cache_cache is never destroyed 2013 */ 2014 list_del(&cachep->next); 2015 up(&cache_chain_sem); 2016 2017 if (__cache_shrink(cachep)) { 2018 slab_error(cachep, "Can't free all objects"); 2019 down(&cache_chain_sem); 2020 list_add(&cachep->next,&cache_chain); 2021 up(&cache_chain_sem); 2022 unlock_cpu_hotplug(); 2023 return 1; 2024 } 2025 2026 if (unlikely(cachep->flags & SLAB_DESTROY_BY_RCU)) 2027 synchronize_rcu(); 2028 2029 for_each_online_cpu(i) 2030 kfree(cachep->array[i]); 2031 2032 /* NUMA: free the list3 structures */ 2033 for_each_online_node(i) { 2034 if ((l3 = cachep->nodelists[i])) { 2035 kfree(l3->shared); 2036 free_alien_cache(l3->alien); 2037 kfree(l3); 2038 } 2039 } 2040 kmem_cache_free(&cache_cache, cachep); 2041 2042 unlock_cpu_hotplug(); 2043 2044 return 0; 2045 } 2046 EXPORT_SYMBOL(kmem_cache_destroy); 2047 2048 /* Get the memory for a slab management obj. */ 2049 static struct slab* alloc_slabmgmt(kmem_cache_t *cachep, void *objp, 2050 int colour_off, gfp_t local_flags) 2051 { 2052 struct slab *slabp; 2053 2054 if (OFF_SLAB(cachep)) { 2055 /* Slab management obj is off-slab. */ 2056 slabp = kmem_cache_alloc(cachep->slabp_cache, local_flags); 2057 if (!slabp) 2058 return NULL; 2059 } else { 2060 slabp = objp+colour_off; 2061 colour_off += cachep->slab_size; 2062 } 2063 slabp->inuse = 0; 2064 slabp->colouroff = colour_off; 2065 slabp->s_mem = objp+colour_off; 2066 2067 return slabp; 2068 } 2069 2070 static inline kmem_bufctl_t *slab_bufctl(struct slab *slabp) 2071 { 2072 return (kmem_bufctl_t *)(slabp+1); 2073 } 2074 2075 static void cache_init_objs(kmem_cache_t *cachep, 2076 struct slab *slabp, unsigned long ctor_flags) 2077 { 2078 int i; 2079 2080 for (i = 0; i < cachep->num; i++) { 2081 void *objp = slabp->s_mem+cachep->objsize*i; 2082 #if DEBUG 2083 /* need to poison the objs? */ 2084 if (cachep->flags & SLAB_POISON) 2085 poison_obj(cachep, objp, POISON_FREE); 2086 if (cachep->flags & SLAB_STORE_USER) 2087 *dbg_userword(cachep, objp) = NULL; 2088 2089 if (cachep->flags & SLAB_RED_ZONE) { 2090 *dbg_redzone1(cachep, objp) = RED_INACTIVE; 2091 *dbg_redzone2(cachep, objp) = RED_INACTIVE; 2092 } 2093 /* 2094 * Constructors are not allowed to allocate memory from 2095 * the same cache which they are a constructor for. 2096 * Otherwise, deadlock. They must also be threaded. 
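 *
 * A rough sketch of a well-behaved constructor (struct foo and foo_ctor
 * are hypothetical): it only initialises fields and never calls back
 * into the allocator for its own cache:
 *
 *	static void foo_ctor(void *obj, kmem_cache_t *cachep,
 *			     unsigned long flags)
 *	{
 *		struct foo *f = obj;
 *
 *		spin_lock_init(&f->lock);
 *		INIT_LIST_HEAD(&f->list);
 *	}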
2097 */ 2098 if (cachep->ctor && !(cachep->flags & SLAB_POISON)) 2099 cachep->ctor(objp+obj_dbghead(cachep), cachep, ctor_flags); 2100 2101 if (cachep->flags & SLAB_RED_ZONE) { 2102 if (*dbg_redzone2(cachep, objp) != RED_INACTIVE) 2103 slab_error(cachep, "constructor overwrote the" 2104 " end of an object"); 2105 if (*dbg_redzone1(cachep, objp) != RED_INACTIVE) 2106 slab_error(cachep, "constructor overwrote the" 2107 " start of an object"); 2108 } 2109 if ((cachep->objsize % PAGE_SIZE) == 0 && OFF_SLAB(cachep) && cachep->flags & SLAB_POISON) 2110 kernel_map_pages(virt_to_page(objp), cachep->objsize/PAGE_SIZE, 0); 2111 #else 2112 if (cachep->ctor) 2113 cachep->ctor(objp, cachep, ctor_flags); 2114 #endif 2115 slab_bufctl(slabp)[i] = i+1; 2116 } 2117 slab_bufctl(slabp)[i-1] = BUFCTL_END; 2118 slabp->free = 0; 2119 } 2120 2121 static void kmem_flagcheck(kmem_cache_t *cachep, gfp_t flags) 2122 { 2123 if (flags & SLAB_DMA) { 2124 if (!(cachep->gfpflags & GFP_DMA)) 2125 BUG(); 2126 } else { 2127 if (cachep->gfpflags & GFP_DMA) 2128 BUG(); 2129 } 2130 } 2131 2132 static void set_slab_attr(kmem_cache_t *cachep, struct slab *slabp, void *objp) 2133 { 2134 int i; 2135 struct page *page; 2136 2137 /* Nasty!!!!!! I hope this is OK. */ 2138 i = 1 << cachep->gfporder; 2139 page = virt_to_page(objp); 2140 do { 2141 SET_PAGE_CACHE(page, cachep); 2142 SET_PAGE_SLAB(page, slabp); 2143 page++; 2144 } while (--i); 2145 } 2146 2147 /* 2148 * Grow (by 1) the number of slabs within a cache. This is called by 2149 * kmem_cache_alloc() when there are no active objs left in a cache. 2150 */ 2151 static int cache_grow(kmem_cache_t *cachep, gfp_t flags, int nodeid) 2152 { 2153 struct slab *slabp; 2154 void *objp; 2155 size_t offset; 2156 gfp_t local_flags; 2157 unsigned long ctor_flags; 2158 struct kmem_list3 *l3; 2159 2160 /* Be lazy and only check for valid flags here, 2161 * keeping it out of the critical path in kmem_cache_alloc(). 2162 */ 2163 if (flags & ~(SLAB_DMA|SLAB_LEVEL_MASK|SLAB_NO_GROW)) 2164 BUG(); 2165 if (flags & SLAB_NO_GROW) 2166 return 0; 2167 2168 ctor_flags = SLAB_CTOR_CONSTRUCTOR; 2169 local_flags = (flags & SLAB_LEVEL_MASK); 2170 if (!(local_flags & __GFP_WAIT)) 2171 /* 2172 * Not allowed to sleep. Need to tell a constructor about 2173 * this - it might need to know... 2174 */ 2175 ctor_flags |= SLAB_CTOR_ATOMIC; 2176 2177 /* About to mess with non-constant members - lock. */ 2178 check_irq_off(); 2179 spin_lock(&cachep->spinlock); 2180 2181 /* Get colour for the slab, and cal the next value. */ 2182 offset = cachep->colour_next; 2183 cachep->colour_next++; 2184 if (cachep->colour_next >= cachep->colour) 2185 cachep->colour_next = 0; 2186 offset *= cachep->colour_off; 2187 2188 spin_unlock(&cachep->spinlock); 2189 2190 check_irq_off(); 2191 if (local_flags & __GFP_WAIT) 2192 local_irq_enable(); 2193 2194 /* 2195 * The test for missing atomic flag is performed here, rather than 2196 * the more obvious place, simply to reduce the critical path length 2197 * in kmem_cache_alloc(). If a caller is seriously mis-behaving they 2198 * will eventually be caught here (where it matters). 2199 */ 2200 kmem_flagcheck(cachep, flags); 2201 2202 /* Get mem for the objs. 2203 * Attempt to allocate a physical page from 'nodeid', 2204 */ 2205 if (!(objp = kmem_getpages(cachep, flags, nodeid))) 2206 goto failed; 2207 2208 /* Get slab management. 
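 * For an OFF_SLAB cache the struct slab and its bufctl array are
 * allocated from the separate slabp_cache; otherwise they are carved
 * out of the start of the slab's own pages, just past the colour
 * offset.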
*/ 2209 if (!(slabp = alloc_slabmgmt(cachep, objp, offset, local_flags))) 2210 goto opps1; 2211 2212 slabp->nodeid = nodeid; 2213 set_slab_attr(cachep, slabp, objp); 2214 2215 cache_init_objs(cachep, slabp, ctor_flags); 2216 2217 if (local_flags & __GFP_WAIT) 2218 local_irq_disable(); 2219 check_irq_off(); 2220 l3 = cachep->nodelists[nodeid]; 2221 spin_lock(&l3->list_lock); 2222 2223 /* Make slab active. */ 2224 list_add_tail(&slabp->list, &(l3->slabs_free)); 2225 STATS_INC_GROWN(cachep); 2226 l3->free_objects += cachep->num; 2227 spin_unlock(&l3->list_lock); 2228 return 1; 2229 opps1: 2230 kmem_freepages(cachep, objp); 2231 failed: 2232 if (local_flags & __GFP_WAIT) 2233 local_irq_disable(); 2234 return 0; 2235 } 2236 2237 #if DEBUG 2238 2239 /* 2240 * Perform extra freeing checks: 2241 * - detect bad pointers. 2242 * - POISON/RED_ZONE checking 2243 * - destructor calls, for caches with POISON+dtor 2244 */ 2245 static void kfree_debugcheck(const void *objp) 2246 { 2247 struct page *page; 2248 2249 if (!virt_addr_valid(objp)) { 2250 printk(KERN_ERR "kfree_debugcheck: out of range ptr %lxh.\n", 2251 (unsigned long)objp); 2252 BUG(); 2253 } 2254 page = virt_to_page(objp); 2255 if (!PageSlab(page)) { 2256 printk(KERN_ERR "kfree_debugcheck: bad ptr %lxh.\n", (unsigned long)objp); 2257 BUG(); 2258 } 2259 } 2260 2261 static void *cache_free_debugcheck(kmem_cache_t *cachep, void *objp, 2262 void *caller) 2263 { 2264 struct page *page; 2265 unsigned int objnr; 2266 struct slab *slabp; 2267 2268 objp -= obj_dbghead(cachep); 2269 kfree_debugcheck(objp); 2270 page = virt_to_page(objp); 2271 2272 if (GET_PAGE_CACHE(page) != cachep) { 2273 printk(KERN_ERR "mismatch in kmem_cache_free: expected cache %p, got %p\n", 2274 GET_PAGE_CACHE(page),cachep); 2275 printk(KERN_ERR "%p is %s.\n", cachep, cachep->name); 2276 printk(KERN_ERR "%p is %s.\n", GET_PAGE_CACHE(page), GET_PAGE_CACHE(page)->name); 2277 WARN_ON(1); 2278 } 2279 slabp = GET_PAGE_SLAB(page); 2280 2281 if (cachep->flags & SLAB_RED_ZONE) { 2282 if (*dbg_redzone1(cachep, objp) != RED_ACTIVE || *dbg_redzone2(cachep, objp) != RED_ACTIVE) { 2283 slab_error(cachep, "double free, or memory outside" 2284 " object was overwritten"); 2285 printk(KERN_ERR "%p: redzone 1: 0x%lx, redzone 2: 0x%lx.\n", 2286 objp, *dbg_redzone1(cachep, objp), *dbg_redzone2(cachep, objp)); 2287 } 2288 *dbg_redzone1(cachep, objp) = RED_INACTIVE; 2289 *dbg_redzone2(cachep, objp) = RED_INACTIVE; 2290 } 2291 if (cachep->flags & SLAB_STORE_USER) 2292 *dbg_userword(cachep, objp) = caller; 2293 2294 objnr = (objp-slabp->s_mem)/cachep->objsize; 2295 2296 BUG_ON(objnr >= cachep->num); 2297 BUG_ON(objp != slabp->s_mem + objnr*cachep->objsize); 2298 2299 if (cachep->flags & SLAB_DEBUG_INITIAL) { 2300 /* Need to call the slab's constructor so the 2301 * caller can perform a verify of its state (debugging). 2302 * Called without the cache-lock held. 
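 * The constructor can tell this call apart from a normal construction
 * because SLAB_CTOR_VERIFY is set in its flags argument; it is then
 * expected only to check the object's state, not to reinitialise it.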
2303 */ 2304 cachep->ctor(objp+obj_dbghead(cachep), 2305 cachep, SLAB_CTOR_CONSTRUCTOR|SLAB_CTOR_VERIFY); 2306 } 2307 if (cachep->flags & SLAB_POISON && cachep->dtor) { 2308 /* we want to cache poison the object, 2309 * call the destruction callback 2310 */ 2311 cachep->dtor(objp+obj_dbghead(cachep), cachep, 0); 2312 } 2313 if (cachep->flags & SLAB_POISON) { 2314 #ifdef CONFIG_DEBUG_PAGEALLOC 2315 if ((cachep->objsize % PAGE_SIZE) == 0 && OFF_SLAB(cachep)) { 2316 store_stackinfo(cachep, objp, (unsigned long)caller); 2317 kernel_map_pages(virt_to_page(objp), cachep->objsize/PAGE_SIZE, 0); 2318 } else { 2319 poison_obj(cachep, objp, POISON_FREE); 2320 } 2321 #else 2322 poison_obj(cachep, objp, POISON_FREE); 2323 #endif 2324 } 2325 return objp; 2326 } 2327 2328 static void check_slabp(kmem_cache_t *cachep, struct slab *slabp) 2329 { 2330 kmem_bufctl_t i; 2331 int entries = 0; 2332 2333 /* Check slab's freelist to see if this obj is there. */ 2334 for (i = slabp->free; i != BUFCTL_END; i = slab_bufctl(slabp)[i]) { 2335 entries++; 2336 if (entries > cachep->num || i >= cachep->num) 2337 goto bad; 2338 } 2339 if (entries != cachep->num - slabp->inuse) { 2340 bad: 2341 printk(KERN_ERR "slab: Internal list corruption detected in cache '%s'(%d), slabp %p(%d). Hexdump:\n", 2342 cachep->name, cachep->num, slabp, slabp->inuse); 2343 for (i=0;i<sizeof(slabp)+cachep->num*sizeof(kmem_bufctl_t);i++) { 2344 if ((i%16)==0) 2345 printk("\n%03x:", i); 2346 printk(" %02x", ((unsigned char*)slabp)[i]); 2347 } 2348 printk("\n"); 2349 BUG(); 2350 } 2351 } 2352 #else 2353 #define kfree_debugcheck(x) do { } while(0) 2354 #define cache_free_debugcheck(x,objp,z) (objp) 2355 #define check_slabp(x,y) do { } while(0) 2356 #endif 2357 2358 static void *cache_alloc_refill(kmem_cache_t *cachep, gfp_t flags) 2359 { 2360 int batchcount; 2361 struct kmem_list3 *l3; 2362 struct array_cache *ac; 2363 2364 check_irq_off(); 2365 ac = ac_data(cachep); 2366 retry: 2367 batchcount = ac->batchcount; 2368 if (!ac->touched && batchcount > BATCHREFILL_LIMIT) { 2369 /* if there was little recent activity on this 2370 * cache, then perform only a partial refill. 2371 * Otherwise we could generate refill bouncing. 2372 */ 2373 batchcount = BATCHREFILL_LIMIT; 2374 } 2375 l3 = cachep->nodelists[numa_node_id()]; 2376 2377 BUG_ON(ac->avail > 0 || !l3); 2378 spin_lock(&l3->list_lock); 2379 2380 if (l3->shared) { 2381 struct array_cache *shared_array = l3->shared; 2382 if (shared_array->avail) { 2383 if (batchcount > shared_array->avail) 2384 batchcount = shared_array->avail; 2385 shared_array->avail -= batchcount; 2386 ac->avail = batchcount; 2387 memcpy(ac->entry, 2388 &(shared_array->entry[shared_array->avail]), 2389 sizeof(void*)*batchcount); 2390 shared_array->touched = 1; 2391 goto alloc_done; 2392 } 2393 } 2394 while (batchcount > 0) { 2395 struct list_head *entry; 2396 struct slab *slabp; 2397 /* Get slab alloc is to come from. 
*/ 2398 entry = l3->slabs_partial.next; 2399 if (entry == &l3->slabs_partial) { 2400 l3->free_touched = 1; 2401 entry = l3->slabs_free.next; 2402 if (entry == &l3->slabs_free) 2403 goto must_grow; 2404 } 2405 2406 slabp = list_entry(entry, struct slab, list); 2407 check_slabp(cachep, slabp); 2408 check_spinlock_acquired(cachep); 2409 while (slabp->inuse < cachep->num && batchcount--) { 2410 kmem_bufctl_t next; 2411 STATS_INC_ALLOCED(cachep); 2412 STATS_INC_ACTIVE(cachep); 2413 STATS_SET_HIGH(cachep); 2414 2415 /* get obj pointer */ 2416 ac->entry[ac->avail++] = slabp->s_mem + 2417 slabp->free*cachep->objsize; 2418 2419 slabp->inuse++; 2420 next = slab_bufctl(slabp)[slabp->free]; 2421 #if DEBUG 2422 slab_bufctl(slabp)[slabp->free] = BUFCTL_FREE; 2423 WARN_ON(numa_node_id() != slabp->nodeid); 2424 #endif 2425 slabp->free = next; 2426 } 2427 check_slabp(cachep, slabp); 2428 2429 /* move slabp to correct slabp list: */ 2430 list_del(&slabp->list); 2431 if (slabp->free == BUFCTL_END) 2432 list_add(&slabp->list, &l3->slabs_full); 2433 else 2434 list_add(&slabp->list, &l3->slabs_partial); 2435 } 2436 2437 must_grow: 2438 l3->free_objects -= ac->avail; 2439 alloc_done: 2440 spin_unlock(&l3->list_lock); 2441 2442 if (unlikely(!ac->avail)) { 2443 int x; 2444 x = cache_grow(cachep, flags, numa_node_id()); 2445 2446 // cache_grow can reenable interrupts, then ac could change. 2447 ac = ac_data(cachep); 2448 if (!x && ac->avail == 0) // no objects in sight? abort 2449 return NULL; 2450 2451 if (!ac->avail) // objects refilled by interrupt? 2452 goto retry; 2453 } 2454 ac->touched = 1; 2455 return ac->entry[--ac->avail]; 2456 } 2457 2458 static inline void 2459 cache_alloc_debugcheck_before(kmem_cache_t *cachep, gfp_t flags) 2460 { 2461 might_sleep_if(flags & __GFP_WAIT); 2462 #if DEBUG 2463 kmem_flagcheck(cachep, flags); 2464 #endif 2465 } 2466 2467 #if DEBUG 2468 static void * 2469 cache_alloc_debugcheck_after(kmem_cache_t *cachep, 2470 gfp_t flags, void *objp, void *caller) 2471 { 2472 if (!objp) 2473 return objp; 2474 if (cachep->flags & SLAB_POISON) { 2475 #ifdef CONFIG_DEBUG_PAGEALLOC 2476 if ((cachep->objsize % PAGE_SIZE) == 0 && OFF_SLAB(cachep)) 2477 kernel_map_pages(virt_to_page(objp), cachep->objsize/PAGE_SIZE, 1); 2478 else 2479 check_poison_obj(cachep, objp); 2480 #else 2481 check_poison_obj(cachep, objp); 2482 #endif 2483 poison_obj(cachep, objp, POISON_INUSE); 2484 } 2485 if (cachep->flags & SLAB_STORE_USER) 2486 *dbg_userword(cachep, objp) = caller; 2487 2488 if (cachep->flags & SLAB_RED_ZONE) { 2489 if (*dbg_redzone1(cachep, objp) != RED_INACTIVE || *dbg_redzone2(cachep, objp) != RED_INACTIVE) { 2490 slab_error(cachep, "double free, or memory outside" 2491 " object was overwritten"); 2492 printk(KERN_ERR "%p: redzone 1: 0x%lx, redzone 2: 0x%lx.\n", 2493 objp, *dbg_redzone1(cachep, objp), *dbg_redzone2(cachep, objp)); 2494 } 2495 *dbg_redzone1(cachep, objp) = RED_ACTIVE; 2496 *dbg_redzone2(cachep, objp) = RED_ACTIVE; 2497 } 2498 objp += obj_dbghead(cachep); 2499 if (cachep->ctor && cachep->flags & SLAB_POISON) { 2500 unsigned long ctor_flags = SLAB_CTOR_CONSTRUCTOR; 2501 2502 if (!(flags & __GFP_WAIT)) 2503 ctor_flags |= SLAB_CTOR_ATOMIC; 2504 2505 cachep->ctor(objp, cachep, ctor_flags); 2506 } 2507 return objp; 2508 } 2509 #else 2510 #define cache_alloc_debugcheck_after(a,b,objp,d) (objp) 2511 #endif 2512 2513 static inline void *____cache_alloc(kmem_cache_t *cachep, gfp_t flags) 2514 { 2515 void* objp; 2516 struct array_cache *ac; 2517 2518 check_irq_off(); 2519 ac = ac_data(cachep); 
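	/*
	 * Fast path: pop the most recently freed object from this cpu's
	 * array cache - LIFO order, so it is likely still cache-hot.
	 * Only when the array is empty do we fall into the refill path,
	 * which has to take the per-node list_lock.
	 */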
2520 if (likely(ac->avail)) { 2521 STATS_INC_ALLOCHIT(cachep); 2522 ac->touched = 1; 2523 objp = ac->entry[--ac->avail]; 2524 } else { 2525 STATS_INC_ALLOCMISS(cachep); 2526 objp = cache_alloc_refill(cachep, flags); 2527 } 2528 return objp; 2529 } 2530 2531 static inline void *__cache_alloc(kmem_cache_t *cachep, gfp_t flags) 2532 { 2533 unsigned long save_flags; 2534 void* objp; 2535 2536 cache_alloc_debugcheck_before(cachep, flags); 2537 2538 local_irq_save(save_flags); 2539 objp = ____cache_alloc(cachep, flags); 2540 local_irq_restore(save_flags); 2541 objp = cache_alloc_debugcheck_after(cachep, flags, objp, 2542 __builtin_return_address(0)); 2543 prefetchw(objp); 2544 return objp; 2545 } 2546 2547 #ifdef CONFIG_NUMA 2548 /* 2549 * A interface to enable slab creation on nodeid 2550 */ 2551 static void *__cache_alloc_node(kmem_cache_t *cachep, gfp_t flags, int nodeid) 2552 { 2553 struct list_head *entry; 2554 struct slab *slabp; 2555 struct kmem_list3 *l3; 2556 void *obj; 2557 kmem_bufctl_t next; 2558 int x; 2559 2560 l3 = cachep->nodelists[nodeid]; 2561 BUG_ON(!l3); 2562 2563 retry: 2564 spin_lock(&l3->list_lock); 2565 entry = l3->slabs_partial.next; 2566 if (entry == &l3->slabs_partial) { 2567 l3->free_touched = 1; 2568 entry = l3->slabs_free.next; 2569 if (entry == &l3->slabs_free) 2570 goto must_grow; 2571 } 2572 2573 slabp = list_entry(entry, struct slab, list); 2574 check_spinlock_acquired_node(cachep, nodeid); 2575 check_slabp(cachep, slabp); 2576 2577 STATS_INC_NODEALLOCS(cachep); 2578 STATS_INC_ACTIVE(cachep); 2579 STATS_SET_HIGH(cachep); 2580 2581 BUG_ON(slabp->inuse == cachep->num); 2582 2583 /* get obj pointer */ 2584 obj = slabp->s_mem + slabp->free*cachep->objsize; 2585 slabp->inuse++; 2586 next = slab_bufctl(slabp)[slabp->free]; 2587 #if DEBUG 2588 slab_bufctl(slabp)[slabp->free] = BUFCTL_FREE; 2589 #endif 2590 slabp->free = next; 2591 check_slabp(cachep, slabp); 2592 l3->free_objects--; 2593 /* move slabp to correct slabp list: */ 2594 list_del(&slabp->list); 2595 2596 if (slabp->free == BUFCTL_END) { 2597 list_add(&slabp->list, &l3->slabs_full); 2598 } else { 2599 list_add(&slabp->list, &l3->slabs_partial); 2600 } 2601 2602 spin_unlock(&l3->list_lock); 2603 goto done; 2604 2605 must_grow: 2606 spin_unlock(&l3->list_lock); 2607 x = cache_grow(cachep, flags, nodeid); 2608 2609 if (!x) 2610 return NULL; 2611 2612 goto retry; 2613 done: 2614 return obj; 2615 } 2616 #endif 2617 2618 /* 2619 * Caller needs to acquire correct kmem_list's list_lock 2620 */ 2621 static void free_block(kmem_cache_t *cachep, void **objpp, int nr_objects, int node) 2622 { 2623 int i; 2624 struct kmem_list3 *l3; 2625 2626 for (i = 0; i < nr_objects; i++) { 2627 void *objp = objpp[i]; 2628 struct slab *slabp; 2629 unsigned int objnr; 2630 2631 slabp = GET_PAGE_SLAB(virt_to_page(objp)); 2632 l3 = cachep->nodelists[node]; 2633 list_del(&slabp->list); 2634 objnr = (objp - slabp->s_mem) / cachep->objsize; 2635 check_spinlock_acquired_node(cachep, node); 2636 check_slabp(cachep, slabp); 2637 2638 #if DEBUG 2639 /* Verify that the slab belongs to the intended node */ 2640 WARN_ON(slabp->nodeid != node); 2641 2642 if (slab_bufctl(slabp)[objnr] != BUFCTL_FREE) { 2643 printk(KERN_ERR "slab: double free detected in cache " 2644 "'%s', objp %p\n", cachep->name, objp); 2645 BUG(); 2646 } 2647 #endif 2648 slab_bufctl(slabp)[objnr] = slabp->free; 2649 slabp->free = objnr; 2650 STATS_DEC_ACTIVE(cachep); 2651 slabp->inuse--; 2652 l3->free_objects++; 2653 check_slabp(cachep, slabp); 2654 2655 /* fixup slab chains */ 
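		/*
		 * A slab that is now completely unused is either destroyed
		 * outright (when the node already holds more than free_limit
		 * spare objects) or parked on slabs_free for reuse; a slab
		 * that still has live objects goes to the tail of
		 * slabs_partial.
		 */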
2656 if (slabp->inuse == 0) { 2657 if (l3->free_objects > l3->free_limit) { 2658 l3->free_objects -= cachep->num; 2659 slab_destroy(cachep, slabp); 2660 } else { 2661 list_add(&slabp->list, &l3->slabs_free); 2662 } 2663 } else { 2664 /* Unconditionally move a slab to the end of the 2665 * partial list on free - maximum time for the 2666 * other objects to be freed, too. 2667 */ 2668 list_add_tail(&slabp->list, &l3->slabs_partial); 2669 } 2670 } 2671 } 2672 2673 static void cache_flusharray(kmem_cache_t *cachep, struct array_cache *ac) 2674 { 2675 int batchcount; 2676 struct kmem_list3 *l3; 2677 int node = numa_node_id(); 2678 2679 batchcount = ac->batchcount; 2680 #if DEBUG 2681 BUG_ON(!batchcount || batchcount > ac->avail); 2682 #endif 2683 check_irq_off(); 2684 l3 = cachep->nodelists[node]; 2685 spin_lock(&l3->list_lock); 2686 if (l3->shared) { 2687 struct array_cache *shared_array = l3->shared; 2688 int max = shared_array->limit-shared_array->avail; 2689 if (max) { 2690 if (batchcount > max) 2691 batchcount = max; 2692 memcpy(&(shared_array->entry[shared_array->avail]), 2693 ac->entry, 2694 sizeof(void*)*batchcount); 2695 shared_array->avail += batchcount; 2696 goto free_done; 2697 } 2698 } 2699 2700 free_block(cachep, ac->entry, batchcount, node); 2701 free_done: 2702 #if STATS 2703 { 2704 int i = 0; 2705 struct list_head *p; 2706 2707 p = l3->slabs_free.next; 2708 while (p != &(l3->slabs_free)) { 2709 struct slab *slabp; 2710 2711 slabp = list_entry(p, struct slab, list); 2712 BUG_ON(slabp->inuse); 2713 2714 i++; 2715 p = p->next; 2716 } 2717 STATS_SET_FREEABLE(cachep, i); 2718 } 2719 #endif 2720 spin_unlock(&l3->list_lock); 2721 ac->avail -= batchcount; 2722 memmove(ac->entry, &(ac->entry[batchcount]), 2723 sizeof(void*)*ac->avail); 2724 } 2725 2726 2727 /* 2728 * __cache_free 2729 * Release an obj back to its cache. If the obj has a constructed 2730 * state, it must be in this state _before_ it is released. 2731 * 2732 * Called with disabled ints. 2733 */ 2734 static inline void __cache_free(kmem_cache_t *cachep, void *objp) 2735 { 2736 struct array_cache *ac = ac_data(cachep); 2737 2738 check_irq_off(); 2739 objp = cache_free_debugcheck(cachep, objp, __builtin_return_address(0)); 2740 2741 /* Make sure we are not freeing a object from another 2742 * node to the array cache on this cpu. 2743 */ 2744 #ifdef CONFIG_NUMA 2745 { 2746 struct slab *slabp; 2747 slabp = GET_PAGE_SLAB(virt_to_page(objp)); 2748 if (unlikely(slabp->nodeid != numa_node_id())) { 2749 struct array_cache *alien = NULL; 2750 int nodeid = slabp->nodeid; 2751 struct kmem_list3 *l3 = cachep->nodelists[numa_node_id()]; 2752 2753 STATS_INC_NODEFREES(cachep); 2754 if (l3->alien && l3->alien[nodeid]) { 2755 alien = l3->alien[nodeid]; 2756 spin_lock(&alien->lock); 2757 if (unlikely(alien->avail == alien->limit)) 2758 __drain_alien_cache(cachep, 2759 alien, nodeid); 2760 alien->entry[alien->avail++] = objp; 2761 spin_unlock(&alien->lock); 2762 } else { 2763 spin_lock(&(cachep->nodelists[nodeid])-> 2764 list_lock); 2765 free_block(cachep, &objp, 1, nodeid); 2766 spin_unlock(&(cachep->nodelists[nodeid])-> 2767 list_lock); 2768 } 2769 return; 2770 } 2771 } 2772 #endif 2773 if (likely(ac->avail < ac->limit)) { 2774 STATS_INC_FREEHIT(cachep); 2775 ac->entry[ac->avail++] = objp; 2776 return; 2777 } else { 2778 STATS_INC_FREEMISS(cachep); 2779 cache_flusharray(cachep, ac); 2780 ac->entry[ac->avail++] = objp; 2781 } 2782 } 2783 2784 /** 2785 * kmem_cache_alloc - Allocate an object 2786 * @cachep: The cache to allocate from. 
2787 * @flags: See kmalloc(). 2788 * 2789 * Allocate an object from this cache. The flags are only relevant 2790 * if the cache has no available objects. 2791 */ 2792 void *kmem_cache_alloc(kmem_cache_t *cachep, gfp_t flags) 2793 { 2794 return __cache_alloc(cachep, flags); 2795 } 2796 EXPORT_SYMBOL(kmem_cache_alloc); 2797 2798 /** 2799 * kmem_ptr_validate - check if an untrusted pointer might 2800 * be a slab entry. 2801 * @cachep: the cache we're checking against 2802 * @ptr: pointer to validate 2803 * 2804 * This verifies that the untrusted pointer looks sane: 2805 * it is _not_ a guarantee that the pointer is actually 2806 * part of the slab cache in question, but it at least 2807 * validates that the pointer can be dereferenced and 2808 * looks half-way sane. 2809 * 2810 * Currently only used for dentry validation. 2811 */ 2812 int fastcall kmem_ptr_validate(kmem_cache_t *cachep, void *ptr) 2813 { 2814 unsigned long addr = (unsigned long) ptr; 2815 unsigned long min_addr = PAGE_OFFSET; 2816 unsigned long align_mask = BYTES_PER_WORD-1; 2817 unsigned long size = cachep->objsize; 2818 struct page *page; 2819 2820 if (unlikely(addr < min_addr)) 2821 goto out; 2822 if (unlikely(addr > (unsigned long)high_memory - size)) 2823 goto out; 2824 if (unlikely(addr & align_mask)) 2825 goto out; 2826 if (unlikely(!kern_addr_valid(addr))) 2827 goto out; 2828 if (unlikely(!kern_addr_valid(addr + size - 1))) 2829 goto out; 2830 page = virt_to_page(ptr); 2831 if (unlikely(!PageSlab(page))) 2832 goto out; 2833 if (unlikely(GET_PAGE_CACHE(page) != cachep)) 2834 goto out; 2835 return 1; 2836 out: 2837 return 0; 2838 } 2839 2840 #ifdef CONFIG_NUMA 2841 /** 2842 * kmem_cache_alloc_node - Allocate an object on the specified node 2843 * @cachep: The cache to allocate from. 2844 * @flags: See kmalloc(). 2845 * @nodeid: node number of the target node. 2846 * 2847 * Identical to kmem_cache_alloc, except that this function is slow 2848 * and can sleep. And it will allocate memory on the given node, which 2849 * can improve the performance for cpu bound structures. 2850 * New and improved: it will now make sure that the object gets 2851 * put on the correct node list so that there is no false sharing. 2852 */ 2853 void *kmem_cache_alloc_node(kmem_cache_t *cachep, gfp_t flags, int nodeid) 2854 { 2855 unsigned long save_flags; 2856 void *ptr; 2857 2858 if (nodeid == -1) 2859 return __cache_alloc(cachep, flags); 2860 2861 if (unlikely(!cachep->nodelists[nodeid])) { 2862 /* Fall back to __cache_alloc if we run into trouble */ 2863 printk(KERN_WARNING "slab: not allocating in inactive node %d for cache %s\n", nodeid, cachep->name); 2864 return __cache_alloc(cachep,flags); 2865 } 2866 2867 cache_alloc_debugcheck_before(cachep, flags); 2868 local_irq_save(save_flags); 2869 if (nodeid == numa_node_id()) 2870 ptr = ____cache_alloc(cachep, flags); 2871 else 2872 ptr = __cache_alloc_node(cachep, flags, nodeid); 2873 local_irq_restore(save_flags); 2874 ptr = cache_alloc_debugcheck_after(cachep, flags, ptr, __builtin_return_address(0)); 2875 2876 return ptr; 2877 } 2878 EXPORT_SYMBOL(kmem_cache_alloc_node); 2879 2880 void *kmalloc_node(size_t size, gfp_t flags, int node) 2881 { 2882 kmem_cache_t *cachep; 2883 2884 cachep = kmem_find_general_cachep(size, flags); 2885 if (unlikely(cachep == NULL)) 2886 return NULL; 2887 return kmem_cache_alloc_node(cachep, flags, node); 2888 } 2889 EXPORT_SYMBOL(kmalloc_node); 2890 #endif 2891 2892 /** 2893 * kmalloc - allocate memory 2894 * @size: how many bytes of memory are required. 
2895 * @flags: the type of memory to allocate. 2896 * 2897 * kmalloc is the normal method of allocating memory 2898 * in the kernel. 2899 * 2900 * The @flags argument may be one of: 2901 * 2902 * %GFP_USER - Allocate memory on behalf of user. May sleep. 2903 * 2904 * %GFP_KERNEL - Allocate normal kernel ram. May sleep. 2905 * 2906 * %GFP_ATOMIC - Allocation will not sleep. Use inside interrupt handlers. 2907 * 2908 * Additionally, the %GFP_DMA flag may be set to indicate the memory 2909 * must be suitable for DMA. This can mean different things on different 2910 * platforms. For example, on i386, it means that the memory must come 2911 * from the first 16MB. 2912 */ 2913 void *__kmalloc(size_t size, gfp_t flags) 2914 { 2915 kmem_cache_t *cachep; 2916 2917 /* If you want to save a few bytes .text space: replace 2918 * __ with kmem_. 2919 * Then kmalloc uses the uninlined functions instead of the inline 2920 * functions. 2921 */ 2922 cachep = __find_general_cachep(size, flags); 2923 if (unlikely(cachep == NULL)) 2924 return NULL; 2925 return __cache_alloc(cachep, flags); 2926 } 2927 EXPORT_SYMBOL(__kmalloc); 2928 2929 #ifdef CONFIG_SMP 2930 /** 2931 * __alloc_percpu - allocate one copy of the object for every present 2932 * cpu in the system, zeroing them. 2933 * Objects should be dereferenced using the per_cpu_ptr macro only. 2934 * 2935 * @size: how many bytes of memory are required. 2936 * @align: the alignment, which can't be greater than SMP_CACHE_BYTES. 2937 */ 2938 void *__alloc_percpu(size_t size, size_t align) 2939 { 2940 int i; 2941 struct percpu_data *pdata = kmalloc(sizeof (*pdata), GFP_KERNEL); 2942 2943 if (!pdata) 2944 return NULL; 2945 2946 /* 2947 * Cannot use for_each_online_cpu since a cpu may come online 2948 * and we have no way of figuring out how to fix the array 2949 * that we have allocated then.... 2950 */ 2951 for_each_cpu(i) { 2952 int node = cpu_to_node(i); 2953 2954 if (node_online(node)) 2955 pdata->ptrs[i] = kmalloc_node(size, GFP_KERNEL, node); 2956 else 2957 pdata->ptrs[i] = kmalloc(size, GFP_KERNEL); 2958 2959 if (!pdata->ptrs[i]) 2960 goto unwind_oom; 2961 memset(pdata->ptrs[i], 0, size); 2962 } 2963 2964 /* Catch derefs w/o wrappers */ 2965 return (void *) (~(unsigned long) pdata); 2966 2967 unwind_oom: 2968 while (--i >= 0) { 2969 if (!cpu_possible(i)) 2970 continue; 2971 kfree(pdata->ptrs[i]); 2972 } 2973 kfree(pdata); 2974 return NULL; 2975 } 2976 EXPORT_SYMBOL(__alloc_percpu); 2977 #endif 2978 2979 /** 2980 * kmem_cache_free - Deallocate an object 2981 * @cachep: The cache the allocation was from. 2982 * @objp: The previously allocated object. 2983 * 2984 * Free an object which was previously allocated from this 2985 * cache. 2986 */ 2987 void kmem_cache_free(kmem_cache_t *cachep, void *objp) 2988 { 2989 unsigned long flags; 2990 2991 local_irq_save(flags); 2992 __cache_free(cachep, objp); 2993 local_irq_restore(flags); 2994 } 2995 EXPORT_SYMBOL(kmem_cache_free); 2996 2997 /** 2998 * kzalloc - allocate memory. The memory is set to zero. 2999 * @size: how many bytes of memory are required. 3000 * @flags: the type of memory to allocate. 3001 */ 3002 void *kzalloc(size_t size, gfp_t flags) 3003 { 3004 void *ret = kmalloc(size, flags); 3005 if (ret) 3006 memset(ret, 0, size); 3007 return ret; 3008 } 3009 EXPORT_SYMBOL(kzalloc); 3010 3011 /** 3012 * kfree - free previously allocated memory 3013 * @objp: pointer returned by kmalloc. 3014 * 3015 * If @objp is NULL, no operation is performed. 
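 *
 * A minimal usage sketch pairing kfree() with kmalloc() (buf is
 * hypothetical); GFP_ATOMIC would replace GFP_KERNEL if the allocation
 * had to be made from interrupt context:
 *
 *	char *buf = kmalloc(64, GFP_KERNEL);
 *	if (buf) {
 *		...
 *		kfree(buf);
 *	}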
3016 * 3017 * Don't free memory not originally allocated by kmalloc() 3018 * or you will run into trouble. 3019 */ 3020 void kfree(const void *objp) 3021 { 3022 kmem_cache_t *c; 3023 unsigned long flags; 3024 3025 if (unlikely(!objp)) 3026 return; 3027 local_irq_save(flags); 3028 kfree_debugcheck(objp); 3029 c = GET_PAGE_CACHE(virt_to_page(objp)); 3030 __cache_free(c, (void*)objp); 3031 local_irq_restore(flags); 3032 } 3033 EXPORT_SYMBOL(kfree); 3034 3035 #ifdef CONFIG_SMP 3036 /** 3037 * free_percpu - free previously allocated percpu memory 3038 * @objp: pointer returned by alloc_percpu. 3039 * 3040 * Don't free memory not originally allocated by alloc_percpu() 3041 * The complemented objp is to check for that. 3042 */ 3043 void 3044 free_percpu(const void *objp) 3045 { 3046 int i; 3047 struct percpu_data *p = (struct percpu_data *) (~(unsigned long) objp); 3048 3049 /* 3050 * We allocate for all cpus so we cannot use for online cpu here. 3051 */ 3052 for_each_cpu(i) 3053 kfree(p->ptrs[i]); 3054 kfree(p); 3055 } 3056 EXPORT_SYMBOL(free_percpu); 3057 #endif 3058 3059 unsigned int kmem_cache_size(kmem_cache_t *cachep) 3060 { 3061 return obj_reallen(cachep); 3062 } 3063 EXPORT_SYMBOL(kmem_cache_size); 3064 3065 const char *kmem_cache_name(kmem_cache_t *cachep) 3066 { 3067 return cachep->name; 3068 } 3069 EXPORT_SYMBOL_GPL(kmem_cache_name); 3070 3071 /* 3072 * This initializes kmem_list3 for all nodes. 3073 */ 3074 static int alloc_kmemlist(kmem_cache_t *cachep) 3075 { 3076 int node; 3077 struct kmem_list3 *l3; 3078 int err = 0; 3079 3080 for_each_online_node(node) { 3081 struct array_cache *nc = NULL, *new; 3082 struct array_cache **new_alien = NULL; 3083 #ifdef CONFIG_NUMA 3084 if (!(new_alien = alloc_alien_cache(node, cachep->limit))) 3085 goto fail; 3086 #endif 3087 if (!(new = alloc_arraycache(node, (cachep->shared* 3088 cachep->batchcount), 0xbaadf00d))) 3089 goto fail; 3090 if ((l3 = cachep->nodelists[node])) { 3091 3092 spin_lock_irq(&l3->list_lock); 3093 3094 if ((nc = cachep->nodelists[node]->shared)) 3095 free_block(cachep, nc->entry, 3096 nc->avail, node); 3097 3098 l3->shared = new; 3099 if (!cachep->nodelists[node]->alien) { 3100 l3->alien = new_alien; 3101 new_alien = NULL; 3102 } 3103 l3->free_limit = (1 + nr_cpus_node(node))* 3104 cachep->batchcount + cachep->num; 3105 spin_unlock_irq(&l3->list_lock); 3106 kfree(nc); 3107 free_alien_cache(new_alien); 3108 continue; 3109 } 3110 if (!(l3 = kmalloc_node(sizeof(struct kmem_list3), 3111 GFP_KERNEL, node))) 3112 goto fail; 3113 3114 kmem_list3_init(l3); 3115 l3->next_reap = jiffies + REAPTIMEOUT_LIST3 + 3116 ((unsigned long)cachep)%REAPTIMEOUT_LIST3; 3117 l3->shared = new; 3118 l3->alien = new_alien; 3119 l3->free_limit = (1 + nr_cpus_node(node))* 3120 cachep->batchcount + cachep->num; 3121 cachep->nodelists[node] = l3; 3122 } 3123 return err; 3124 fail: 3125 err = -ENOMEM; 3126 return err; 3127 } 3128 3129 struct ccupdate_struct { 3130 kmem_cache_t *cachep; 3131 struct array_cache *new[NR_CPUS]; 3132 }; 3133 3134 static void do_ccupdate_local(void *info) 3135 { 3136 struct ccupdate_struct *new = (struct ccupdate_struct *)info; 3137 struct array_cache *old; 3138 3139 check_irq_off(); 3140 old = ac_data(new->cachep); 3141 3142 new->cachep->array[smp_processor_id()] = new->new[smp_processor_id()]; 3143 new->new[smp_processor_id()] = old; 3144 } 3145 3146 3147 static int do_tune_cpucache(kmem_cache_t *cachep, int limit, int batchcount, 3148 int shared) 3149 { 3150 struct ccupdate_struct new; 3151 int i, err; 3152 3153 
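	/*
	 * Allocate a fresh array_cache for every online cpu, swap them in
	 * on each cpu via do_ccupdate_local(), publish the new tunables
	 * under the cache spinlock, then return any objects still sitting
	 * in the old arrays to their node lists before freeing them and
	 * refreshing the per-node lists via alloc_kmemlist().
	 */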
memset(&new.new,0,sizeof(new.new)); 3154 for_each_online_cpu(i) { 3155 new.new[i] = alloc_arraycache(cpu_to_node(i), limit, batchcount); 3156 if (!new.new[i]) { 3157 for (i--; i >= 0; i--) kfree(new.new[i]); 3158 return -ENOMEM; 3159 } 3160 } 3161 new.cachep = cachep; 3162 3163 smp_call_function_all_cpus(do_ccupdate_local, (void *)&new); 3164 3165 check_irq_on(); 3166 spin_lock_irq(&cachep->spinlock); 3167 cachep->batchcount = batchcount; 3168 cachep->limit = limit; 3169 cachep->shared = shared; 3170 spin_unlock_irq(&cachep->spinlock); 3171 3172 for_each_online_cpu(i) { 3173 struct array_cache *ccold = new.new[i]; 3174 if (!ccold) 3175 continue; 3176 spin_lock_irq(&cachep->nodelists[cpu_to_node(i)]->list_lock); 3177 free_block(cachep, ccold->entry, ccold->avail, cpu_to_node(i)); 3178 spin_unlock_irq(&cachep->nodelists[cpu_to_node(i)]->list_lock); 3179 kfree(ccold); 3180 } 3181 3182 err = alloc_kmemlist(cachep); 3183 if (err) { 3184 printk(KERN_ERR "alloc_kmemlist failed for %s, error %d.\n", 3185 cachep->name, -err); 3186 BUG(); 3187 } 3188 return 0; 3189 } 3190 3191 3192 static void enable_cpucache(kmem_cache_t *cachep) 3193 { 3194 int err; 3195 int limit, shared; 3196 3197 /* The head array serves three purposes: 3198 * - create a LIFO ordering, i.e. return objects that are cache-warm 3199 * - reduce the number of spinlock operations. 3200 * - reduce the number of linked list operations on the slab and 3201 * bufctl chains: array operations are cheaper. 3202 * The numbers are guessed, we should auto-tune as described by 3203 * Bonwick. 3204 */ 3205 if (cachep->objsize > 131072) 3206 limit = 1; 3207 else if (cachep->objsize > PAGE_SIZE) 3208 limit = 8; 3209 else if (cachep->objsize > 1024) 3210 limit = 24; 3211 else if (cachep->objsize > 256) 3212 limit = 54; 3213 else 3214 limit = 120; 3215 3216 /* Cpu bound tasks (e.g. network routing) can exhibit cpu bound 3217 * allocation behaviour: Most allocs on one cpu, most free operations 3218 * on another cpu. For these cases, an efficient object passing between 3219 * cpus is necessary. This is provided by a shared array. The array 3220 * replaces Bonwick's magazine layer. 3221 * On uniprocessor, it's functionally equivalent (but less efficient) 3222 * to a larger limit. Thus disabled by default. 3223 */ 3224 shared = 0; 3225 #ifdef CONFIG_SMP 3226 if (cachep->objsize <= PAGE_SIZE) 3227 shared = 8; 3228 #endif 3229 3230 #if DEBUG 3231 /* With debugging enabled, large batchcount lead to excessively 3232 * long periods with disabled local interrupts. Limit the 3233 * batchcount 3234 */ 3235 if (limit > 32) 3236 limit = 32; 3237 #endif 3238 err = do_tune_cpucache(cachep, limit, (limit+1)/2, shared); 3239 if (err) 3240 printk(KERN_ERR "enable_cpucache failed for %s, error %d.\n", 3241 cachep->name, -err); 3242 } 3243 3244 static void drain_array_locked(kmem_cache_t *cachep, 3245 struct array_cache *ac, int force, int node) 3246 { 3247 int tofree; 3248 3249 check_spinlock_acquired_node(cachep, node); 3250 if (ac->touched && !force) { 3251 ac->touched = 0; 3252 } else if (ac->avail) { 3253 tofree = force ? ac->avail : (ac->limit+4)/5; 3254 if (tofree > ac->avail) { 3255 tofree = (ac->avail+1)/2; 3256 } 3257 free_block(cachep, ac->entry, tofree, node); 3258 ac->avail -= tofree; 3259 memmove(ac->entry, &(ac->entry[tofree]), 3260 sizeof(void*)*ac->avail); 3261 } 3262 } 3263 3264 /** 3265 * cache_reap - Reclaim memory from caches. 3266 * 3267 * Called from workqueue/eventd every few seconds. 
3268 * Purpose: 3269 * - clear the per-cpu caches for this CPU. 3270 * - return freeable pages to the main free memory pool. 3271 * 3272 * If we cannot acquire the cache chain semaphore then just give up - we'll 3273 * try again on the next iteration. 3274 */ 3275 static void cache_reap(void *unused) 3276 { 3277 struct list_head *walk; 3278 struct kmem_list3 *l3; 3279 3280 if (down_trylock(&cache_chain_sem)) { 3281 /* Give up. Setup the next iteration. */ 3282 schedule_delayed_work(&__get_cpu_var(reap_work), REAPTIMEOUT_CPUC); 3283 return; 3284 } 3285 3286 list_for_each(walk, &cache_chain) { 3287 kmem_cache_t *searchp; 3288 struct list_head* p; 3289 int tofree; 3290 struct slab *slabp; 3291 3292 searchp = list_entry(walk, kmem_cache_t, next); 3293 3294 if (searchp->flags & SLAB_NO_REAP) 3295 goto next; 3296 3297 check_irq_on(); 3298 3299 l3 = searchp->nodelists[numa_node_id()]; 3300 if (l3->alien) 3301 drain_alien_cache(searchp, l3); 3302 spin_lock_irq(&l3->list_lock); 3303 3304 drain_array_locked(searchp, ac_data(searchp), 0, 3305 numa_node_id()); 3306 3307 if (time_after(l3->next_reap, jiffies)) 3308 goto next_unlock; 3309 3310 l3->next_reap = jiffies + REAPTIMEOUT_LIST3; 3311 3312 if (l3->shared) 3313 drain_array_locked(searchp, l3->shared, 0, 3314 numa_node_id()); 3315 3316 if (l3->free_touched) { 3317 l3->free_touched = 0; 3318 goto next_unlock; 3319 } 3320 3321 tofree = (l3->free_limit+5*searchp->num-1)/(5*searchp->num); 3322 do { 3323 p = l3->slabs_free.next; 3324 if (p == &(l3->slabs_free)) 3325 break; 3326 3327 slabp = list_entry(p, struct slab, list); 3328 BUG_ON(slabp->inuse); 3329 list_del(&slabp->list); 3330 STATS_INC_REAPED(searchp); 3331 3332 /* Safe to drop the lock. The slab is no longer 3333 * linked to the cache. 3334 * searchp cannot disappear, we hold 3335 * cache_chain_lock 3336 */ 3337 l3->free_objects -= searchp->num; 3338 spin_unlock_irq(&l3->list_lock); 3339 slab_destroy(searchp, slabp); 3340 spin_lock_irq(&l3->list_lock); 3341 } while(--tofree > 0); 3342 next_unlock: 3343 spin_unlock_irq(&l3->list_lock); 3344 next: 3345 cond_resched(); 3346 } 3347 check_irq_on(); 3348 up(&cache_chain_sem); 3349 drain_remote_pages(); 3350 /* Setup the next iteration */ 3351 schedule_delayed_work(&__get_cpu_var(reap_work), REAPTIMEOUT_CPUC); 3352 } 3353 3354 #ifdef CONFIG_PROC_FS 3355 3356 static void *s_start(struct seq_file *m, loff_t *pos) 3357 { 3358 loff_t n = *pos; 3359 struct list_head *p; 3360 3361 down(&cache_chain_sem); 3362 if (!n) { 3363 /* 3364 * Output format version, so at least we can change it 3365 * without _too_ many complaints. 
3366 */ 3367 #if STATS 3368 seq_puts(m, "slabinfo - version: 2.1 (statistics)\n"); 3369 #else 3370 seq_puts(m, "slabinfo - version: 2.1\n"); 3371 #endif 3372 seq_puts(m, "# name <active_objs> <num_objs> <objsize> <objperslab> <pagesperslab>"); 3373 seq_puts(m, " : tunables <limit> <batchcount> <sharedfactor>"); 3374 seq_puts(m, " : slabdata <active_slabs> <num_slabs> <sharedavail>"); 3375 #if STATS 3376 seq_puts(m, " : globalstat <listallocs> <maxobjs> <grown> <reaped>" 3377 " <error> <maxfreeable> <nodeallocs> <remotefrees>"); 3378 seq_puts(m, " : cpustat <allochit> <allocmiss> <freehit> <freemiss>"); 3379 #endif 3380 seq_putc(m, '\n'); 3381 } 3382 p = cache_chain.next; 3383 while (n--) { 3384 p = p->next; 3385 if (p == &cache_chain) 3386 return NULL; 3387 } 3388 return list_entry(p, kmem_cache_t, next); 3389 } 3390 3391 static void *s_next(struct seq_file *m, void *p, loff_t *pos) 3392 { 3393 kmem_cache_t *cachep = p; 3394 ++*pos; 3395 return cachep->next.next == &cache_chain ? NULL 3396 : list_entry(cachep->next.next, kmem_cache_t, next); 3397 } 3398 3399 static void s_stop(struct seq_file *m, void *p) 3400 { 3401 up(&cache_chain_sem); 3402 } 3403 3404 static int s_show(struct seq_file *m, void *p) 3405 { 3406 kmem_cache_t *cachep = p; 3407 struct list_head *q; 3408 struct slab *slabp; 3409 unsigned long active_objs; 3410 unsigned long num_objs; 3411 unsigned long active_slabs = 0; 3412 unsigned long num_slabs, free_objects = 0, shared_avail = 0; 3413 const char *name; 3414 char *error = NULL; 3415 int node; 3416 struct kmem_list3 *l3; 3417 3418 check_irq_on(); 3419 spin_lock_irq(&cachep->spinlock); 3420 active_objs = 0; 3421 num_slabs = 0; 3422 for_each_online_node(node) { 3423 l3 = cachep->nodelists[node]; 3424 if (!l3) 3425 continue; 3426 3427 spin_lock(&l3->list_lock); 3428 3429 list_for_each(q,&l3->slabs_full) { 3430 slabp = list_entry(q, struct slab, list); 3431 if (slabp->inuse != cachep->num && !error) 3432 error = "slabs_full accounting error"; 3433 active_objs += cachep->num; 3434 active_slabs++; 3435 } 3436 list_for_each(q,&l3->slabs_partial) { 3437 slabp = list_entry(q, struct slab, list); 3438 if (slabp->inuse == cachep->num && !error) 3439 error = "slabs_partial inuse accounting error"; 3440 if (!slabp->inuse && !error) 3441 error = "slabs_partial/inuse accounting error"; 3442 active_objs += slabp->inuse; 3443 active_slabs++; 3444 } 3445 list_for_each(q,&l3->slabs_free) { 3446 slabp = list_entry(q, struct slab, list); 3447 if (slabp->inuse && !error) 3448 error = "slabs_free/inuse accounting error"; 3449 num_slabs++; 3450 } 3451 free_objects += l3->free_objects; 3452 shared_avail += l3->shared->avail; 3453 3454 spin_unlock(&l3->list_lock); 3455 } 3456 num_slabs+=active_slabs; 3457 num_objs = num_slabs*cachep->num; 3458 if (num_objs - active_objs != free_objects && !error) 3459 error = "free_objects accounting error"; 3460 3461 name = cachep->name; 3462 if (error) 3463 printk(KERN_ERR "slab: cache %s error: %s\n", name, error); 3464 3465 seq_printf(m, "%-17s %6lu %6lu %6u %4u %4d", 3466 name, active_objs, num_objs, cachep->objsize, 3467 cachep->num, (1<<cachep->gfporder)); 3468 seq_printf(m, " : tunables %4u %4u %4u", 3469 cachep->limit, cachep->batchcount, 3470 cachep->shared); 3471 seq_printf(m, " : slabdata %6lu %6lu %6lu", 3472 active_slabs, num_slabs, shared_avail); 3473 #if STATS 3474 { /* list3 stats */ 3475 unsigned long high = cachep->high_mark; 3476 unsigned long allocs = cachep->num_allocations; 3477 unsigned long grown = cachep->grown; 3478 unsigned long reaped 
= cachep->reaped; 3479 unsigned long errors = cachep->errors; 3480 unsigned long max_freeable = cachep->max_freeable; 3481 unsigned long node_allocs = cachep->node_allocs; 3482 unsigned long node_frees = cachep->node_frees; 3483 3484 seq_printf(m, " : globalstat %7lu %6lu %5lu %4lu \ 3485 %4lu %4lu %4lu %4lu", 3486 allocs, high, grown, reaped, errors, 3487 max_freeable, node_allocs, node_frees); 3488 } 3489 /* cpu stats */ 3490 { 3491 unsigned long allochit = atomic_read(&cachep->allochit); 3492 unsigned long allocmiss = atomic_read(&cachep->allocmiss); 3493 unsigned long freehit = atomic_read(&cachep->freehit); 3494 unsigned long freemiss = atomic_read(&cachep->freemiss); 3495 3496 seq_printf(m, " : cpustat %6lu %6lu %6lu %6lu", 3497 allochit, allocmiss, freehit, freemiss); 3498 } 3499 #endif 3500 seq_putc(m, '\n'); 3501 spin_unlock_irq(&cachep->spinlock); 3502 return 0; 3503 } 3504 3505 /* 3506 * slabinfo_op - iterator that generates /proc/slabinfo 3507 * 3508 * Output layout: 3509 * cache-name 3510 * num-active-objs 3511 * total-objs 3512 * object size 3513 * num-active-slabs 3514 * total-slabs 3515 * num-pages-per-slab 3516 * + further values on SMP and with statistics enabled 3517 */ 3518 3519 struct seq_operations slabinfo_op = { 3520 .start = s_start, 3521 .next = s_next, 3522 .stop = s_stop, 3523 .show = s_show, 3524 }; 3525 3526 #define MAX_SLABINFO_WRITE 128 3527 /** 3528 * slabinfo_write - Tuning for the slab allocator 3529 * @file: unused 3530 * @buffer: user buffer 3531 * @count: data length 3532 * @ppos: unused 3533 */ 3534 ssize_t slabinfo_write(struct file *file, const char __user *buffer, 3535 size_t count, loff_t *ppos) 3536 { 3537 char kbuf[MAX_SLABINFO_WRITE+1], *tmp; 3538 int limit, batchcount, shared, res; 3539 struct list_head *p; 3540 3541 if (count > MAX_SLABINFO_WRITE) 3542 return -EINVAL; 3543 if (copy_from_user(&kbuf, buffer, count)) 3544 return -EFAULT; 3545 kbuf[MAX_SLABINFO_WRITE] = '\0'; 3546 3547 tmp = strchr(kbuf, ' '); 3548 if (!tmp) 3549 return -EINVAL; 3550 *tmp = '\0'; 3551 tmp++; 3552 if (sscanf(tmp, " %d %d %d", &limit, &batchcount, &shared) != 3) 3553 return -EINVAL; 3554 3555 /* Find the cache in the chain of caches. */ 3556 down(&cache_chain_sem); 3557 res = -EINVAL; 3558 list_for_each(p,&cache_chain) { 3559 kmem_cache_t *cachep = list_entry(p, kmem_cache_t, next); 3560 3561 if (!strcmp(cachep->name, kbuf)) { 3562 if (limit < 1 || 3563 batchcount < 1 || 3564 batchcount > limit || 3565 shared < 0) { 3566 res = 0; 3567 } else { 3568 res = do_tune_cpucache(cachep, limit, 3569 batchcount, shared); 3570 } 3571 break; 3572 } 3573 } 3574 up(&cache_chain_sem); 3575 if (res >= 0) 3576 res = count; 3577 return res; 3578 } 3579 #endif 3580 3581 /** 3582 * ksize - get the actual amount of memory allocated for a given object 3583 * @objp: Pointer to the object 3584 * 3585 * kmalloc may internally round up allocations and return more memory 3586 * than requested. ksize() can be used to determine the actual amount of 3587 * memory allocated. The caller may use this additional memory, even though 3588 * a smaller amount of memory was initially specified with the kmalloc call. 3589 * The caller must guarantee that objp points to a valid object previously 3590 * allocated with either kmalloc() or kmem_cache_alloc(). The object 3591 * must not be freed during the duration of the call. 
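 *
 * For example, because kmalloc() rounds a request up to the nearest
 * general cache size, kmalloc(100, GFP_KERNEL) is typically satisfied
 * from the 128-byte cache, and ksize() on the returned pointer would
 * then report 128 (the exact figure depends on the configured kmalloc
 * sizes).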
3592 */ 3593 unsigned int ksize(const void *objp) 3594 { 3595 if (unlikely(objp == NULL)) 3596 return 0; 3597 3598 return obj_reallen(GET_PAGE_CACHE(virt_to_page(objp))); 3599 } 3600 3601 3602 /** 3603 * kstrdup - allocate space for and copy an existing string 3604 * 3605 * @s: the string to duplicate 3606 * @gfp: the GFP mask used in the kmalloc() call when allocating memory 3607 */ 3608 char *kstrdup(const char *s, gfp_t gfp) 3609 { 3610 size_t len; 3611 char *buf; 3612 3613 if (!s) 3614 return NULL; 3615 3616 len = strlen(s) + 1; 3617 buf = kmalloc(len, gfp); 3618 if (buf) 3619 memcpy(buf, s, len); 3620 return buf; 3621 } 3622 EXPORT_SYMBOL(kstrdup); 3623
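/*
 * A compact usage sketch of the interface implemented above; struct foo,
 * foo_cachep and the init/exit routines are hypothetical:
 *
 *	static kmem_cache_t *foo_cachep;
 *
 *	static int __init foo_init(void)
 *	{
 *		foo_cachep = kmem_cache_create("foo", sizeof(struct foo),
 *					       0, SLAB_HWCACHE_ALIGN,
 *					       NULL, NULL);
 *		return foo_cachep ? 0 : -ENOMEM;
 *	}
 *
 *	static void __exit foo_exit(void)
 *	{
 *		kmem_cache_destroy(foo_cachep);
 *	}
 *
 * and in the fast path:
 *
 *	struct foo *f = kmem_cache_alloc(foo_cachep, GFP_KERNEL);
 *	if (f)
 *		kmem_cache_free(foo_cachep, f);
 */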