/*
 * linux/mm/slab.c
 * Written by Mark Hemment, 1996/97.
 * (markhe@nextd.demon.co.uk)
 *
 * kmem_cache_destroy() + some cleanup - 1999 Andrea Arcangeli
 *
 * Major cleanup, different bufctl logic, per-cpu arrays
 *	(c) 2000 Manfred Spraul
 *
 * Cleanup, make the head arrays unconditional, preparation for NUMA
 *	(c) 2002 Manfred Spraul
 *
 * An implementation of the Slab Allocator as described in outline in;
 *	UNIX Internals: The New Frontiers by Uresh Vahalia
 *	Pub: Prentice Hall	ISBN 0-13-101908-2
 * or with a little more detail in;
 *	The Slab Allocator: An Object-Caching Kernel Memory Allocator
 *	Jeff Bonwick (Sun Microsystems).
 *	Presented at: USENIX Summer 1994 Technical Conference
 *
 * The memory is organized in caches, one cache for each object type.
 * (e.g. inode_cache, dentry_cache, buffer_head, vm_area_struct)
 * Each cache consists of many slabs (they are small (usually one
 * page long) and always contiguous), and each slab contains multiple
 * initialized objects.
 *
 * This means that your constructor is used only for newly allocated
 * slabs and you must pass objects with the same initializations to
 * kmem_cache_free.
 *
 * Each cache can only support one memory type (GFP_DMA, GFP_HIGHMEM,
 * normal). If you need a special memory type, then you must create a new
 * cache for that memory type.
 *
 * In order to reduce fragmentation, the slabs are sorted in 3 groups:
 *   full slabs with 0 free objects
 *   partial slabs
 *   empty slabs with no allocated objects
 *
 * If partial slabs exist, then new allocations come from these slabs,
 * otherwise from empty slabs or new slabs are allocated.
 *
 * kmem_cache_destroy() CAN CRASH if you try to allocate from the cache
 * during kmem_cache_destroy(). The caller must prevent concurrent allocs.
 *
 * Each cache has a short per-cpu head array, most allocs
 * and frees go into that array, and if that array overflows, then 1/2
 * of the entries in the array are given back into the global cache.
 * The head array is strictly LIFO and should improve the cache hit rates.
 * On SMP, it additionally reduces the spinlock operations.
 *
 * The c_cpuarray may not be read with enabled local interrupts -
 * it's changed with a smp_call_function().
 *
 * SMP synchronization:
 *  constructors and destructors are called without any locking.
 *  Several members in struct kmem_cache and struct slab never change, they
 *	are accessed without any locking.
 *  The per-cpu arrays are never accessed from the wrong cpu, no locking,
 *	and local interrupts are disabled so slab code is preempt-safe.
 *  The non-constant members are protected with a per-cache irq spinlock.
 *
 * Many thanks to Mark Hemment, who wrote another per-cpu slab patch
 * in 2000 - many ideas in the current implementation are derived from
 * his patch.
 *
 * Further notes from the original documentation:
 *
 * 11 April '97.  Started multi-threading - markhe
 *	The global cache-chain is protected by the mutex 'slab_mutex'.
 *	The sem is only needed when accessing/extending the cache-chain, which
 *	can never happen inside an interrupt (kmem_cache_create(),
 *	kmem_cache_shrink() and kmem_cache_reap()).
 *
 * At present, each engine can be growing a cache.  This should be blocked.
 *
 * 15 March 2005. NUMA slab allocator.
 *	Shai Fultheim <shai@scalex86.org>.
 *	Shobhit Dayal <shobhit@calsoftinc.com>
 *	Alok N Kataria <alokk@calsoftinc.com>
 *	Christoph Lameter <christoph@lameter.com>
 *
 *	Modified the slab allocator to be node aware on NUMA systems.
 *	Each node has its own list of partial, free and full slabs.
 *	All object allocations for a node occur from node specific slab lists.
 */

#include <linux/slab.h>
#include "slab.h"
#include <linux/mm.h>
#include <linux/poison.h>
#include <linux/swap.h>
#include <linux/cache.h>
#include <linux/interrupt.h>
#include <linux/init.h>
#include <linux/compiler.h>
#include <linux/cpuset.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/notifier.h>
#include <linux/kallsyms.h>
#include <linux/cpu.h>
#include <linux/sysctl.h>
#include <linux/module.h>
#include <linux/rcupdate.h>
#include <linux/string.h>
#include <linux/uaccess.h>
#include <linux/nodemask.h>
#include <linux/kmemleak.h>
#include <linux/mempolicy.h>
#include <linux/mutex.h>
#include <linux/fault-inject.h>
#include <linux/rtmutex.h>
#include <linux/reciprocal_div.h>
#include <linux/debugobjects.h>
#include <linux/kmemcheck.h>
#include <linux/memory.h>
#include <linux/prefetch.h>

#include <net/sock.h>

#include <asm/cacheflush.h>
#include <asm/tlbflush.h>
#include <asm/page.h>

#include <trace/events/kmem.h>

#include "internal.h"

/*
 * DEBUG	- 1 for kmem_cache_create() to honour; SLAB_RED_ZONE & SLAB_POISON.
 *		  0 for faster, smaller code (especially in the critical paths).
 *
 * STATS	- 1 to collect stats for /proc/slabinfo.
 *		  0 for faster, smaller code (especially in the critical paths).
 *
 * FORCED_DEBUG	- 1 enables SLAB_RED_ZONE and SLAB_POISON (if possible)
 */

#ifdef CONFIG_DEBUG_SLAB
#define	DEBUG		1
#define	STATS		1
#define	FORCED_DEBUG	1
#else
#define	DEBUG		0
#define	STATS		0
#define	FORCED_DEBUG	0
#endif

/* Shouldn't this be in a header file somewhere? */
#define	BYTES_PER_WORD		sizeof(void *)
#define	REDZONE_ALIGN		max(BYTES_PER_WORD, __alignof__(unsigned long long))

#ifndef ARCH_KMALLOC_FLAGS
#define ARCH_KMALLOC_FLAGS SLAB_HWCACHE_ALIGN
#endif

/*
 * true if a page was allocated from pfmemalloc reserves for network-based
 * swap
 */
static bool pfmemalloc_active __read_mostly;

/* Legal flag mask for kmem_cache_create(). */
#if DEBUG
# define CREATE_MASK	(SLAB_RED_ZONE | \
			 SLAB_POISON | SLAB_HWCACHE_ALIGN | \
			 SLAB_CACHE_DMA | \
			 SLAB_STORE_USER | \
			 SLAB_RECLAIM_ACCOUNT | SLAB_PANIC | \
			 SLAB_DESTROY_BY_RCU | SLAB_MEM_SPREAD | \
			 SLAB_DEBUG_OBJECTS | SLAB_NOLEAKTRACE | SLAB_NOTRACK)
#else
# define CREATE_MASK	(SLAB_HWCACHE_ALIGN | \
			 SLAB_CACHE_DMA | \
			 SLAB_RECLAIM_ACCOUNT | SLAB_PANIC | \
			 SLAB_DESTROY_BY_RCU | SLAB_MEM_SPREAD | \
			 SLAB_DEBUG_OBJECTS | SLAB_NOLEAKTRACE | SLAB_NOTRACK)
#endif

/*
 * kmem_bufctl_t:
 *
 * Bufctls are used for linking objects within a slab via linked offsets.
 *
 * This implementation relies on "struct page" for locating the cache &
 * slab an object belongs to.
 * This allows the bufctl structure to be small (one int), but limits
 * the number of objects a slab (not a cache) can contain when off-slab
 * bufctls are used.
 * The limit is the size of the largest general cache
 * that does not use off-slab slabs.
 * For 32bit archs with 4 kB pages, this is 56.
 * This is not serious, as it is only for large objects, when it is unwise
 * to have too many per slab.
 * Note: This limit can be raised by introducing a general cache whose size
 * is less than 512 (PAGE_SIZE<<3), but greater than 256.
 */

typedef unsigned int kmem_bufctl_t;
#define BUFCTL_END	(((kmem_bufctl_t)(~0U))-0)
#define BUFCTL_FREE	(((kmem_bufctl_t)(~0U))-1)
#define	BUFCTL_ACTIVE	(((kmem_bufctl_t)(~0U))-2)
#define	SLAB_LIMIT	(((kmem_bufctl_t)(~0U))-3)

/*
 * struct slab_rcu
 *
 * slab_destroy on a SLAB_DESTROY_BY_RCU cache uses this structure to
 * arrange for kmem_freepages to be called via RCU.  This is useful if
 * we need to approach a kernel structure obliquely, from its address
 * obtained without the usual locking.  We can lock the structure to
 * stabilize it and check it's still at the given address, only if we
 * can be sure that the memory has not been meanwhile reused for some
 * other kind of object (which our subsystem's lock might corrupt).
 *
 * rcu_read_lock before reading the address, then rcu_read_unlock after
 * taking the spinlock within the structure expected at that address.
 */
struct slab_rcu {
	struct rcu_head head;
	struct kmem_cache *cachep;
	void *addr;
};

/*
 * struct slab
 *
 * Manages the objs in a slab. Placed either at the beginning of mem allocated
 * for a slab, or allocated from a general cache.
 * Slabs are chained into three lists: fully used, partial, fully free slabs.
 */
struct slab {
	union {
		struct {
			struct list_head list;
			unsigned long colouroff;
			void *s_mem;		/* including colour offset */
			unsigned int inuse;	/* num of objs active in slab */
			kmem_bufctl_t free;
			unsigned short nodeid;
		};
		struct slab_rcu __slab_cover_slab_rcu;
	};
};

/*
 * struct array_cache
 *
 * Purpose:
 * - LIFO ordering, to hand out cache-warm objects from _alloc
 * - reduce the number of linked list operations
 * - reduce spinlock operations
 *
 * The limit is stored in the per-cpu structure to reduce the data cache
 * footprint.
 *
 */
struct array_cache {
	unsigned int avail;
	unsigned int limit;
	unsigned int batchcount;
	unsigned int touched;
	spinlock_t lock;
	void *entry[];	/*
			 * Must have this definition in here for the proper
			 * alignment of array_cache. Also simplifies accessing
			 * the entries.
			 *
			 * Entries should not be directly dereferenced as
			 * entries belonging to slabs marked pfmemalloc will
			 * have the lower bits set to SLAB_OBJ_PFMEMALLOC.
			 */
};

#define SLAB_OBJ_PFMEMALLOC	1
static inline bool is_obj_pfmemalloc(void *objp)
{
	return (unsigned long)objp & SLAB_OBJ_PFMEMALLOC;
}

static inline void set_obj_pfmemalloc(void **objp)
{
	*objp = (void *)((unsigned long)*objp | SLAB_OBJ_PFMEMALLOC);
	return;
}

static inline void clear_obj_pfmemalloc(void **objp)
{
	*objp = (void *)((unsigned long)*objp & ~SLAB_OBJ_PFMEMALLOC);
}
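
/*
 * Worked illustration of the tagging above (the address used here is
 * hypothetical): slab objects are at least word aligned, so bit 0 of an
 * object pointer is normally clear. For an object at 0xffff880012345678,
 * set_obj_pfmemalloc() stores 0xffff880012345679 in the array_cache entry,
 * is_obj_pfmemalloc() tests that low bit, and clear_obj_pfmemalloc() masks
 * it off again before the pointer may be dereferenced.
 */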

/*
 * bootstrap: The caches do not work without cpuarrays anymore, but the
 * cpuarrays are allocated from the generic caches...
 */
#define BOOT_CPUCACHE_ENTRIES	1
struct arraycache_init {
	struct array_cache cache;
	void *entries[BOOT_CPUCACHE_ENTRIES];
};

/*
 * The slab lists for all objects.
 */
struct kmem_list3 {
	struct list_head slabs_partial;	/* partial list first, better asm code */
	struct list_head slabs_full;
	struct list_head slabs_free;
	unsigned long free_objects;
	unsigned int free_limit;
	unsigned int colour_next;	/* Per-node cache coloring */
	spinlock_t list_lock;
	struct array_cache *shared;	/* shared per node */
	struct array_cache **alien;	/* on other nodes */
	unsigned long next_reap;	/* updated without locking */
	int free_touched;		/* updated without locking */
};

/*
 * Need this for bootstrapping a per node allocator.
 */
#define NUM_INIT_LISTS (3 * MAX_NUMNODES)
static struct kmem_list3 __initdata initkmem_list3[NUM_INIT_LISTS];
#define	CACHE_CACHE 0
#define	SIZE_AC MAX_NUMNODES
#define	SIZE_L3 (2 * MAX_NUMNODES)

static int drain_freelist(struct kmem_cache *cache,
			struct kmem_list3 *l3, int tofree);
static void free_block(struct kmem_cache *cachep, void **objpp, int len,
			int node);
static int enable_cpucache(struct kmem_cache *cachep, gfp_t gfp);
static void cache_reap(struct work_struct *unused);

/*
 * This function must be completely optimized away if a constant is passed to
 * it.  Mostly the same as what is in linux/slab.h except it returns an index.
 */
static __always_inline int index_of(const size_t size)
{
	extern void __bad_size(void);

	if (__builtin_constant_p(size)) {
		int i = 0;

#define CACHE(x) \
	if (size <= x) \
		return i; \
	else \
		i++;
#include <linux/kmalloc_sizes.h>
#undef CACHE
		__bad_size();
	} else
		__bad_size();
	return 0;
}

static int slab_early_init = 1;

#define INDEX_AC index_of(sizeof(struct arraycache_init))
#define INDEX_L3 index_of(sizeof(struct kmem_list3))
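
/*
 * Example: INDEX_AC above is a compile-time constant. index_of() expands the
 * CACHE(x) list from <linux/kmalloc_sizes.h> into a chain of constant
 * comparisons, so the compiler folds it to the index of the first general
 * cache whose cs_size is >= sizeof(struct arraycache_init). If the argument
 * is not a compile-time constant, the call to the deliberately undefined
 * __bad_size() survives and the build fails at link time.
 */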

static void kmem_list3_init(struct kmem_list3 *parent)
{
	INIT_LIST_HEAD(&parent->slabs_full);
	INIT_LIST_HEAD(&parent->slabs_partial);
	INIT_LIST_HEAD(&parent->slabs_free);
	parent->shared = NULL;
	parent->alien = NULL;
	parent->colour_next = 0;
	spin_lock_init(&parent->list_lock);
	parent->free_objects = 0;
	parent->free_touched = 0;
}

#define MAKE_LIST(cachep, listp, slab, nodeid)				\
	do {								\
		INIT_LIST_HEAD(listp);					\
		list_splice(&(cachep->nodelists[nodeid]->slab), listp);\
	} while (0)

#define	MAKE_ALL_LISTS(cachep, ptr, nodeid)				\
	do {								\
	MAKE_LIST((cachep), (&(ptr)->slabs_full), slabs_full, nodeid);	\
	MAKE_LIST((cachep), (&(ptr)->slabs_partial), slabs_partial, nodeid); \
	MAKE_LIST((cachep), (&(ptr)->slabs_free), slabs_free, nodeid);	\
	} while (0)

#define CFLGS_OFF_SLAB		(0x80000000UL)
#define	OFF_SLAB(x)	((x)->flags & CFLGS_OFF_SLAB)

#define BATCHREFILL_LIMIT	16
/*
 * Optimization question: fewer reaps means less probability for unnecessary
 * cpucache drain/refill cycles.
 *
 * OTOH the cpuarrays can contain lots of objects,
 * which could lock up otherwise freeable slabs.
 */
#define REAPTIMEOUT_CPUC	(2*HZ)
#define REAPTIMEOUT_LIST3	(4*HZ)

#if STATS
#define	STATS_INC_ACTIVE(x)	((x)->num_active++)
#define	STATS_DEC_ACTIVE(x)	((x)->num_active--)
#define	STATS_INC_ALLOCED(x)	((x)->num_allocations++)
#define	STATS_INC_GROWN(x)	((x)->grown++)
#define	STATS_ADD_REAPED(x,y)	((x)->reaped += (y))
#define	STATS_SET_HIGH(x)						\
	do {								\
		if ((x)->num_active > (x)->high_mark)			\
			(x)->high_mark = (x)->num_active;		\
	} while (0)
#define	STATS_INC_ERR(x)	((x)->errors++)
#define	STATS_INC_NODEALLOCS(x)	((x)->node_allocs++)
#define	STATS_INC_NODEFREES(x)	((x)->node_frees++)
#define STATS_INC_ACOVERFLOW(x)	((x)->node_overflow++)
#define	STATS_SET_FREEABLE(x, i)					\
	do {								\
		if ((x)->max_freeable < i)				\
			(x)->max_freeable = i;				\
	} while (0)
#define STATS_INC_ALLOCHIT(x)	atomic_inc(&(x)->allochit)
#define STATS_INC_ALLOCMISS(x)	atomic_inc(&(x)->allocmiss)
#define STATS_INC_FREEHIT(x)	atomic_inc(&(x)->freehit)
#define STATS_INC_FREEMISS(x)	atomic_inc(&(x)->freemiss)
#else
#define	STATS_INC_ACTIVE(x)	do { } while (0)
#define	STATS_DEC_ACTIVE(x)	do { } while (0)
#define	STATS_INC_ALLOCED(x)	do { } while (0)
#define	STATS_INC_GROWN(x)	do { } while (0)
#define	STATS_ADD_REAPED(x,y)	do { (void)(y); } while (0)
#define	STATS_SET_HIGH(x)	do { } while (0)
#define	STATS_INC_ERR(x)	do { } while (0)
#define	STATS_INC_NODEALLOCS(x)	do { } while (0)
#define	STATS_INC_NODEFREES(x)	do { } while (0)
#define STATS_INC_ACOVERFLOW(x)	do { } while (0)
#define STATS_SET_FREEABLE(x, i) do { } while (0)
#define STATS_INC_ALLOCHIT(x)	do { } while (0)
#define STATS_INC_ALLOCMISS(x)	do { } while (0)
#define STATS_INC_FREEHIT(x)	do { } while (0)
#define STATS_INC_FREEMISS(x)	do { } while (0)
#endif

#if DEBUG

/*
 * memory layout of objects:
 * 0		: objp
 * 0 .. cachep->obj_offset - BYTES_PER_WORD - 1: padding. This ensures that
 *		the end of an object is aligned with the end of the real
 *		allocation. Catches writes behind the end of the allocation.
 * cachep->obj_offset - BYTES_PER_WORD .. cachep->obj_offset - 1:
 *		redzone word.
 * cachep->obj_offset: The real object.
 * cachep->size - 2* BYTES_PER_WORD: redzone word [BYTES_PER_WORD long]
 * cachep->size - 1* BYTES_PER_WORD: last caller address
 *					[BYTES_PER_WORD long]
 */
static int obj_offset(struct kmem_cache *cachep)
{
	return cachep->obj_offset;
}

static unsigned long long *dbg_redzone1(struct kmem_cache *cachep, void *objp)
{
	BUG_ON(!(cachep->flags & SLAB_RED_ZONE));
	return (unsigned long long*) (objp + obj_offset(cachep) -
				      sizeof(unsigned long long));
}

static unsigned long long *dbg_redzone2(struct kmem_cache *cachep, void *objp)
{
	BUG_ON(!(cachep->flags & SLAB_RED_ZONE));
	if (cachep->flags & SLAB_STORE_USER)
		return (unsigned long long *)(objp + cachep->size -
					      sizeof(unsigned long long) -
					      REDZONE_ALIGN);
	return (unsigned long long *) (objp + cachep->size -
				       sizeof(unsigned long long));
}

static void **dbg_userword(struct kmem_cache *cachep, void *objp)
{
	BUG_ON(!(cachep->flags & SLAB_STORE_USER));
	return (void **)(objp + cachep->size - BYTES_PER_WORD);
}

#else

#define obj_offset(x)			0
#define dbg_redzone1(cachep, objp)	({BUG(); (unsigned long long *)NULL;})
#define dbg_redzone2(cachep, objp)	({BUG(); (unsigned long long *)NULL;})
#define dbg_userword(cachep, objp)	({BUG(); (void **)NULL;})

#endif

/*
 * Do not go above this order unless 0 objects fit into the slab or
 * overridden on the command line.
 */
#define	SLAB_MAX_ORDER_HI	1
#define	SLAB_MAX_ORDER_LO	0
static int slab_max_order = SLAB_MAX_ORDER_LO;
static bool slab_max_order_set __initdata;

static inline struct kmem_cache *virt_to_cache(const void *obj)
{
	struct page *page = virt_to_head_page(obj);
	return page->slab_cache;
}

static inline struct slab *virt_to_slab(const void *obj)
{
	struct page *page = virt_to_head_page(obj);

	VM_BUG_ON(!PageSlab(page));
	return page->slab_page;
}

static inline void *index_to_obj(struct kmem_cache *cache, struct slab *slab,
				 unsigned int idx)
{
	return slab->s_mem + cache->size * idx;
}

/*
 * We want to avoid an expensive divide : (offset / cache->size)
 *   Using the fact that size is a constant for a particular cache,
 *   we can replace (offset / cache->size) by
 *   reciprocal_divide(offset, cache->reciprocal_buffer_size)
 */
static inline unsigned int obj_to_index(const struct kmem_cache *cache,
					const struct slab *slab, void *obj)
{
	u32 offset = (obj - slab->s_mem);
	return reciprocal_divide(offset, cache->reciprocal_buffer_size);
}
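
/*
 * Example for obj_to_index() above: with cache->size == 192, the object at
 * offset 960 from s_mem is index 960 / 192 == 5. reciprocal_divide() gets the
 * same answer with a multiply and a shift, using reciprocal_buffer_size that
 * was precomputed with reciprocal_value() when the cache was set up.
 */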

/*
 * These are the default caches for kmalloc. Custom caches can have other sizes.
 */
struct cache_sizes malloc_sizes[] = {
#define CACHE(x) { .cs_size = (x) },
#include <linux/kmalloc_sizes.h>
	CACHE(ULONG_MAX)
#undef CACHE
};
EXPORT_SYMBOL(malloc_sizes);

/* Must match cache_sizes above. Out of line to keep cache footprint low. */
struct cache_names {
	char *name;
	char *name_dma;
};

static struct cache_names __initdata cache_names[] = {
#define CACHE(x) { .name = "size-" #x, .name_dma = "size-" #x "(DMA)" },
#include <linux/kmalloc_sizes.h>
	{NULL,}
#undef CACHE
};

static struct arraycache_init initarray_cache __initdata =
    { {0, BOOT_CPUCACHE_ENTRIES, 1, 0} };
static struct arraycache_init initarray_generic =
    { {0, BOOT_CPUCACHE_ENTRIES, 1, 0} };

/* internal cache of cache description objs */
static struct kmem_list3 *kmem_cache_nodelists[MAX_NUMNODES];
static struct kmem_cache kmem_cache_boot = {
	.nodelists = kmem_cache_nodelists,
	.batchcount = 1,
	.limit = BOOT_CPUCACHE_ENTRIES,
	.shared = 1,
	.size = sizeof(struct kmem_cache),
	.name = "kmem_cache",
};

#define BAD_ALIEN_MAGIC 0x01020304ul

#ifdef CONFIG_LOCKDEP

/*
 * Slab sometimes uses the kmalloc slabs to store the slab headers
 * for other slabs "off slab".
 * The locking for this is tricky in that it nests within the locks
 * of all other slabs in a few places; to deal with this special
 * locking we put on-slab caches into a separate lock-class.
 *
 * We set lock class for alien array caches which are up during init.
 * The lock annotation will be lost if all cpus of a node go down and
 * then come back up during hotplug.
 */
static struct lock_class_key on_slab_l3_key;
static struct lock_class_key on_slab_alc_key;

static struct lock_class_key debugobj_l3_key;
static struct lock_class_key debugobj_alc_key;

static void slab_set_lock_classes(struct kmem_cache *cachep,
		struct lock_class_key *l3_key, struct lock_class_key *alc_key,
		int q)
{
	struct array_cache **alc;
	struct kmem_list3 *l3;
	int r;

	l3 = cachep->nodelists[q];
	if (!l3)
		return;

	lockdep_set_class(&l3->list_lock, l3_key);
	alc = l3->alien;
	/*
	 * FIXME: This check for BAD_ALIEN_MAGIC
	 * should go away when common slab code is taught to
	 * work even without alien caches.
	 * Currently, non-NUMA code returns BAD_ALIEN_MAGIC
	 * for alloc_alien_cache().
	 */
	if (!alc || (unsigned long)alc == BAD_ALIEN_MAGIC)
		return;
	for_each_node(r) {
		if (alc[r])
			lockdep_set_class(&alc[r]->lock, alc_key);
	}
}

static void slab_set_debugobj_lock_classes_node(struct kmem_cache *cachep, int node)
{
	slab_set_lock_classes(cachep, &debugobj_l3_key, &debugobj_alc_key, node);
}

static void slab_set_debugobj_lock_classes(struct kmem_cache *cachep)
{
	int node;

	for_each_online_node(node)
		slab_set_debugobj_lock_classes_node(cachep, node);
}

static void init_node_lock_keys(int q)
{
	struct cache_sizes *s = malloc_sizes;

	if (slab_state < UP)
		return;

	for (s = malloc_sizes; s->cs_size != ULONG_MAX; s++) {
		struct kmem_list3 *l3;

		l3 = s->cs_cachep->nodelists[q];
		if (!l3 || OFF_SLAB(s->cs_cachep))
			continue;

		slab_set_lock_classes(s->cs_cachep, &on_slab_l3_key,
				&on_slab_alc_key, q);
	}
}

static inline void init_lock_keys(void)
{
	int node;

	for_each_node(node)
		init_node_lock_keys(node);
}
#else
static void init_node_lock_keys(int q)
{
}

static inline void init_lock_keys(void)
{
}

static void slab_set_debugobj_lock_classes_node(struct kmem_cache *cachep, int node)
{
}

static void slab_set_debugobj_lock_classes(struct kmem_cache *cachep)
{
}
#endif

static DEFINE_PER_CPU(struct delayed_work, slab_reap_work);

static inline struct array_cache *cpu_cache_get(struct kmem_cache *cachep)
{
	return cachep->array[smp_processor_id()];
}

static inline struct kmem_cache *__find_general_cachep(size_t size,
							gfp_t gfpflags)
{
	struct cache_sizes *csizep = malloc_sizes;

#if DEBUG
	/* This happens if someone tries to call
	 * kmem_cache_create(), or __kmalloc(), before
	 * the generic caches are initialized.
	 */
	BUG_ON(malloc_sizes[INDEX_AC].cs_cachep == NULL);
#endif
	if (!size)
		return ZERO_SIZE_PTR;

	while (size > csizep->cs_size)
		csizep++;

	/*
	 * Really subtle: The last entry with cs->cs_size==ULONG_MAX
	 * has cs_{dma,}cachep==NULL. Thus no special case
	 * for large kmalloc calls is required.
	 */
#ifdef CONFIG_ZONE_DMA
	if (unlikely(gfpflags & GFP_DMA))
		return csizep->cs_dmacachep;
#endif
	return csizep->cs_cachep;
}

static struct kmem_cache *kmem_find_general_cachep(size_t size, gfp_t gfpflags)
{
	return __find_general_cachep(size, gfpflags);
}

static size_t slab_mgmt_size(size_t nr_objs, size_t align)
{
	return ALIGN(sizeof(struct slab)+nr_objs*sizeof(kmem_bufctl_t), align);
}
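
/*
 * Worked example for cache_estimate() below, assuming a 64-bit build where
 * sizeof(struct slab) == 48 and sizeof(kmem_bufctl_t) == 4: for an order-0
 * (4096-byte) slab with on-slab management, 128-byte objects and align == 32,
 * the first guess is (4096 - 48) / (128 + 4) == 30 objects. Checking it,
 * slab_mgmt_size(30, 32) == 192 and 192 + 30 * 128 == 4032 <= 4096, so
 * *num == 30 and *left_over == 64, which cache colouring can then use.
 */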

/*
 * Calculate the number of objects and left-over bytes for a given buffer size.
 */
static void cache_estimate(unsigned long gfporder, size_t buffer_size,
			   size_t align, int flags, size_t *left_over,
			   unsigned int *num)
{
	int nr_objs;
	size_t mgmt_size;
	size_t slab_size = PAGE_SIZE << gfporder;

	/*
	 * The slab management structure can be either off the slab or
	 * on it. For the latter case, the memory allocated for a
	 * slab is used for:
	 *
	 * - The struct slab
	 * - One kmem_bufctl_t for each object
	 * - Padding to respect alignment of @align
	 * - @buffer_size bytes for each object
	 *
	 * If the slab management structure is off the slab, then the
	 * alignment will already be calculated into the size. Because
	 * the slabs are all page aligned, the objects will be at the
	 * correct alignment when allocated.
	 */
	if (flags & CFLGS_OFF_SLAB) {
		mgmt_size = 0;
		nr_objs = slab_size / buffer_size;

		if (nr_objs > SLAB_LIMIT)
			nr_objs = SLAB_LIMIT;
	} else {
		/*
		 * Ignore padding for the initial guess. The padding
		 * is at most @align-1 bytes, and @buffer_size is at
		 * least @align. In the worst case, this result will
		 * be one greater than the number of objects that fit
		 * into the memory allocation when taking the padding
		 * into account.
		 */
		nr_objs = (slab_size - sizeof(struct slab)) /
			  (buffer_size + sizeof(kmem_bufctl_t));

		/*
		 * This calculated number will be either the right
		 * amount, or one greater than what we want.
		 */
		if (slab_mgmt_size(nr_objs, align) + nr_objs*buffer_size
		       > slab_size)
			nr_objs--;

		if (nr_objs > SLAB_LIMIT)
			nr_objs = SLAB_LIMIT;

		mgmt_size = slab_mgmt_size(nr_objs, align);
	}
	*num = nr_objs;
	*left_over = slab_size - nr_objs*buffer_size - mgmt_size;
}

#if DEBUG
#define slab_error(cachep, msg) __slab_error(__func__, cachep, msg)

static void __slab_error(const char *function, struct kmem_cache *cachep,
			char *msg)
{
	printk(KERN_ERR "slab error in %s(): cache `%s': %s\n",
	       function, cachep->name, msg);
	dump_stack();
	add_taint(TAINT_BAD_PAGE);
}
#endif

/*
 * By default on NUMA we use alien caches to stage the freeing of
 * objects allocated from other nodes. This causes massive memory
 * inefficiencies when using fake NUMA setup to split memory into a
 * large number of small nodes, so it can be disabled on the command
 * line.
 */

static int use_alien_caches __read_mostly = 1;
static int __init noaliencache_setup(char *s)
{
	use_alien_caches = 0;
	return 1;
}
__setup("noaliencache", noaliencache_setup);

static int __init slab_max_order_setup(char *str)
{
	get_option(&str, &slab_max_order);
	slab_max_order = slab_max_order < 0 ? 0 :
				min(slab_max_order, MAX_ORDER - 1);
	slab_max_order_set = true;

	return 1;
}
__setup("slab_max_order=", slab_max_order_setup);

#ifdef CONFIG_NUMA
/*
 * Special reaping functions for NUMA systems called from cache_reap().
 * These take care of doing round robin flushing of alien caches (containing
 * objects freed on different nodes from which they were allocated) and the
 * flushing of remote pcps by calling drain_node_pages.
 */
static DEFINE_PER_CPU(unsigned long, slab_reap_node);

static void init_reap_node(int cpu)
{
	int node;

	node = next_node(cpu_to_mem(cpu), node_online_map);
	if (node == MAX_NUMNODES)
		node = first_node(node_online_map);

	per_cpu(slab_reap_node, cpu) = node;
}

static void next_reap_node(void)
{
	int node = __this_cpu_read(slab_reap_node);

	node = next_node(node, node_online_map);
	if (unlikely(node >= MAX_NUMNODES))
		node = first_node(node_online_map);
	__this_cpu_write(slab_reap_node, node);
}

#else
#define init_reap_node(cpu) do { } while (0)
#define next_reap_node(void) do { } while (0)
#endif

/*
 * Initiate the reap timer running on the target CPU.  We run at around 1 to 2Hz
 * via the workqueue/eventd.
 * Add the CPU number into the expiration time to minimize the possibility of
 * the CPUs getting into lockstep and contending for the global cache chain
 * lock.
 */
static void __cpuinit start_cpu_timer(int cpu)
{
	struct delayed_work *reap_work = &per_cpu(slab_reap_work, cpu);

	/*
	 * When this gets called from do_initcalls via cpucache_init(),
	 * init_workqueues() has already run, so keventd will be set up
	 * at that time.
	 */
	if (keventd_up() && reap_work->work.func == NULL) {
		init_reap_node(cpu);
		INIT_DEFERRABLE_WORK(reap_work, cache_reap);
		schedule_delayed_work_on(cpu, reap_work,
					__round_jiffies_relative(HZ, cpu));
	}
}

static struct array_cache *alloc_arraycache(int node, int entries,
					    int batchcount, gfp_t gfp)
{
	int memsize = sizeof(void *) * entries + sizeof(struct array_cache);
	struct array_cache *nc = NULL;

	nc = kmalloc_node(memsize, gfp, node);
	/*
	 * The array_cache structures contain pointers to free objects.
	 * However, when such objects are allocated or transferred to another
	 * cache the pointers are not cleared and they could be counted as
	 * valid references during a kmemleak scan. Therefore, kmemleak must
	 * not scan such objects.
	 */
	kmemleak_no_scan(nc);
	if (nc) {
		nc->avail = 0;
		nc->limit = entries;
		nc->batchcount = batchcount;
		nc->touched = 0;
		spin_lock_init(&nc->lock);
	}
	return nc;
}

static inline bool is_slab_pfmemalloc(struct slab *slabp)
{
	struct page *page = virt_to_page(slabp->s_mem);

	return PageSlabPfmemalloc(page);
}

/* Clears pfmemalloc_active if no slabs have pfmemalloc set */
static void recheck_pfmemalloc_active(struct kmem_cache *cachep,
						struct array_cache *ac)
{
	struct kmem_list3 *l3 = cachep->nodelists[numa_mem_id()];
	struct slab *slabp;
	unsigned long flags;

	if (!pfmemalloc_active)
		return;

	spin_lock_irqsave(&l3->list_lock, flags);
	list_for_each_entry(slabp, &l3->slabs_full, list)
		if (is_slab_pfmemalloc(slabp))
			goto out;

	list_for_each_entry(slabp, &l3->slabs_partial, list)
		if (is_slab_pfmemalloc(slabp))
			goto out;

	list_for_each_entry(slabp, &l3->slabs_free, list)
		if (is_slab_pfmemalloc(slabp))
			goto out;

	pfmemalloc_active = false;
out:
	spin_unlock_irqrestore(&l3->list_lock, flags);
}

static void *__ac_get_obj(struct kmem_cache *cachep, struct array_cache *ac,
						gfp_t flags, bool force_refill)
{
	int i;
	void *objp = ac->entry[--ac->avail];

	/* Ensure the caller is allowed to use objects from PFMEMALLOC slab */
	if (unlikely(is_obj_pfmemalloc(objp))) {
		struct kmem_list3 *l3;

		if (gfp_pfmemalloc_allowed(flags)) {
			clear_obj_pfmemalloc(&objp);
			return objp;
		}

		/* The caller cannot use PFMEMALLOC objects, find another one */
		for (i = 0; i < ac->avail; i++) {
			/* If a !PFMEMALLOC object is found, swap them */
			if (!is_obj_pfmemalloc(ac->entry[i])) {
				objp = ac->entry[i];
				ac->entry[i] = ac->entry[ac->avail];
				ac->entry[ac->avail] = objp;
				return objp;
			}
		}

		/*
		 * If there are empty slabs on the slabs_free list and we are
		 * being forced to refill the cache, mark this one !pfmemalloc.
		 */
		l3 = cachep->nodelists[numa_mem_id()];
		if (!list_empty(&l3->slabs_free) && force_refill) {
			struct slab *slabp = virt_to_slab(objp);
			ClearPageSlabPfmemalloc(virt_to_head_page(slabp->s_mem));
			clear_obj_pfmemalloc(&objp);
			recheck_pfmemalloc_active(cachep, ac);
			return objp;
		}

		/* No !PFMEMALLOC objects available */
		ac->avail++;
		objp = NULL;
	}

	return objp;
}

static inline void *ac_get_obj(struct kmem_cache *cachep,
			struct array_cache *ac, gfp_t flags, bool force_refill)
{
	void *objp;

	if (unlikely(sk_memalloc_socks()))
		objp = __ac_get_obj(cachep, ac, flags, force_refill);
	else
		objp = ac->entry[--ac->avail];

	return objp;
}

static void *__ac_put_obj(struct kmem_cache *cachep, struct array_cache *ac,
								void *objp)
{
	if (unlikely(pfmemalloc_active)) {
		/* Some pfmemalloc slabs exist, check if this is one */
		struct page *page = virt_to_head_page(objp);
		if (PageSlabPfmemalloc(page))
			set_obj_pfmemalloc(&objp);
	}

	return objp;
}

static inline void ac_put_obj(struct kmem_cache *cachep, struct array_cache *ac,
								void *objp)
{
	if (unlikely(sk_memalloc_socks()))
		objp = __ac_put_obj(cachep, ac, objp);

	ac->entry[ac->avail++] = objp;
}
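
/*
 * Example of the common case above: with ac->avail == 3, ac_get_obj() simply
 * returns ac->entry[2] and drops avail to 2, and a later ac_put_obj() writes
 * the freed pointer back into entry[2]. Only when sk_memalloc_socks() is true
 * do the __ac_*_obj() helpers look at the pfmemalloc bit.
 */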

/*
 * Transfer objects in one arraycache to another.
 * Locking must be handled by the caller.
 *
 * Return the number of entries transferred.
 */
static int transfer_objects(struct array_cache *to,
		struct array_cache *from, unsigned int max)
{
	/* Figure out how many entries to transfer */
	int nr = min3(from->avail, max, to->limit - to->avail);

	if (!nr)
		return 0;

	memcpy(to->entry + to->avail, from->entry + from->avail - nr,
			sizeof(void *) * nr);

	from->avail -= nr;
	to->avail += nr;
	return nr;
}

#ifndef CONFIG_NUMA

#define drain_alien_cache(cachep, alien) do { } while (0)
#define reap_alien(cachep, l3) do { } while (0)

static inline struct array_cache **alloc_alien_cache(int node, int limit, gfp_t gfp)
{
	return (struct array_cache **)BAD_ALIEN_MAGIC;
}

static inline void free_alien_cache(struct array_cache **ac_ptr)
{
}

static inline int cache_free_alien(struct kmem_cache *cachep, void *objp)
{
	return 0;
}

static inline void *alternate_node_alloc(struct kmem_cache *cachep,
		gfp_t flags)
{
	return NULL;
}

static inline void *____cache_alloc_node(struct kmem_cache *cachep,
		gfp_t flags, int nodeid)
{
	return NULL;
}

#else	/* CONFIG_NUMA */

static void *____cache_alloc_node(struct kmem_cache *, gfp_t, int);
static void *alternate_node_alloc(struct kmem_cache *, gfp_t);

static struct array_cache **alloc_alien_cache(int node, int limit, gfp_t gfp)
{
	struct array_cache **ac_ptr;
	int memsize = sizeof(void *) * nr_node_ids;
	int i;

	if (limit > 1)
		limit = 12;
	ac_ptr = kzalloc_node(memsize, gfp, node);
	if (ac_ptr) {
		for_each_node(i) {
			if (i == node || !node_online(i))
				continue;
			ac_ptr[i] = alloc_arraycache(node, limit, 0xbaadf00d, gfp);
			if (!ac_ptr[i]) {
				for (i--; i >= 0; i--)
					kfree(ac_ptr[i]);
				kfree(ac_ptr);
				return NULL;
			}
		}
	}
	return ac_ptr;
}

static void free_alien_cache(struct array_cache **ac_ptr)
{
	int i;

	if (!ac_ptr)
		return;
	for_each_node(i)
		kfree(ac_ptr[i]);
	kfree(ac_ptr);
}

static void __drain_alien_cache(struct kmem_cache *cachep,
				struct array_cache *ac, int node)
{
	struct kmem_list3 *rl3 = cachep->nodelists[node];

	if (ac->avail) {
		spin_lock(&rl3->list_lock);
		/*
		 * Stuff objects into the remote node's shared array first.
		 * That way we could avoid the overhead of putting the objects
		 * into the free lists and getting them back later.
		 */
		if (rl3->shared)
			transfer_objects(rl3->shared, ac, ac->limit);

		free_block(cachep, ac->entry, ac->avail, node);
		ac->avail = 0;
		spin_unlock(&rl3->list_lock);
	}
}

/*
 * Called from cache_reap() to regularly drain alien caches round robin.
 */
static void reap_alien(struct kmem_cache *cachep, struct kmem_list3 *l3)
{
	int node = __this_cpu_read(slab_reap_node);

	if (l3->alien) {
		struct array_cache *ac = l3->alien[node];

		if (ac && ac->avail && spin_trylock_irq(&ac->lock)) {
			__drain_alien_cache(cachep, ac, node);
			spin_unlock_irq(&ac->lock);
		}
	}
}

static void drain_alien_cache(struct kmem_cache *cachep,
				struct array_cache **alien)
{
	int i = 0;
	struct array_cache *ac;
	unsigned long flags;

	for_each_online_node(i) {
		ac = alien[i];
		if (ac) {
			spin_lock_irqsave(&ac->lock, flags);
			__drain_alien_cache(cachep, ac, i);
			spin_unlock_irqrestore(&ac->lock, flags);
		}
	}
}

static inline int cache_free_alien(struct kmem_cache *cachep, void *objp)
{
	struct slab *slabp = virt_to_slab(objp);
	int nodeid = slabp->nodeid;
	struct kmem_list3 *l3;
	struct array_cache *alien = NULL;
	int node;

	node = numa_mem_id();

	/*
	 * Make sure we are not freeing an object from another node to the
	 * array cache on this cpu.
	 */
	if (likely(slabp->nodeid == node))
		return 0;

	l3 = cachep->nodelists[node];
	STATS_INC_NODEFREES(cachep);
	if (l3->alien && l3->alien[nodeid]) {
		alien = l3->alien[nodeid];
		spin_lock(&alien->lock);
		if (unlikely(alien->avail == alien->limit)) {
			STATS_INC_ACOVERFLOW(cachep);
			__drain_alien_cache(cachep, alien, nodeid);
		}
		ac_put_obj(cachep, alien, objp);
		spin_unlock(&alien->lock);
	} else {
		spin_lock(&(cachep->nodelists[nodeid])->list_lock);
		free_block(cachep, &objp, 1, nodeid);
		spin_unlock(&(cachep->nodelists[nodeid])->list_lock);
	}
	return 1;
}
#endif

/*
 * Allocates and initializes nodelists for a node on each slab cache, used for
 * either memory or cpu hotplug.  If memory is being hot-added, the kmem_list3
 * will be allocated off-node since memory is not yet online for the new node.
 * When hotplugging memory or a cpu, existing nodelists are not replaced if
 * already in use.
 *
 * Must hold slab_mutex.
 */
static int init_cache_nodelists_node(int node)
{
	struct kmem_cache *cachep;
	struct kmem_list3 *l3;
	const int memsize = sizeof(struct kmem_list3);

	list_for_each_entry(cachep, &slab_caches, list) {
		/*
		 * Set up the size64 kmemlist for cpu before we can
		 * begin anything. Make sure some other cpu on this
		 * node has not already allocated this
		 */
		if (!cachep->nodelists[node]) {
			l3 = kmalloc_node(memsize, GFP_KERNEL, node);
			if (!l3)
				return -ENOMEM;
			kmem_list3_init(l3);
			l3->next_reap = jiffies + REAPTIMEOUT_LIST3 +
			    ((unsigned long)cachep) % REAPTIMEOUT_LIST3;

			/*
			 * The l3s don't come and go as CPUs come and
			 * go.  slab_mutex is sufficient
			 * protection here.
			 */
			cachep->nodelists[node] = l3;
		}

		spin_lock_irq(&cachep->nodelists[node]->list_lock);
		cachep->nodelists[node]->free_limit =
			(1 + nr_cpus_node(node)) *
			cachep->batchcount + cachep->num;
		spin_unlock_irq(&cachep->nodelists[node]->list_lock);
	}
	return 0;
}

static void __cpuinit cpuup_canceled(long cpu)
{
	struct kmem_cache *cachep;
	struct kmem_list3 *l3 = NULL;
	int node = cpu_to_mem(cpu);
	const struct cpumask *mask = cpumask_of_node(node);

	list_for_each_entry(cachep, &slab_caches, list) {
		struct array_cache *nc;
		struct array_cache *shared;
		struct array_cache **alien;

		/* cpu is dead; no one can alloc from it. */
		nc = cachep->array[cpu];
		cachep->array[cpu] = NULL;
		l3 = cachep->nodelists[node];

		if (!l3)
			goto free_array_cache;

		spin_lock_irq(&l3->list_lock);

		/* Free limit for this kmem_list3 */
		l3->free_limit -= cachep->batchcount;
		if (nc)
			free_block(cachep, nc->entry, nc->avail, node);

		if (!cpumask_empty(mask)) {
			spin_unlock_irq(&l3->list_lock);
			goto free_array_cache;
		}

		shared = l3->shared;
		if (shared) {
			free_block(cachep, shared->entry,
				   shared->avail, node);
			l3->shared = NULL;
		}

		alien = l3->alien;
		l3->alien = NULL;

		spin_unlock_irq(&l3->list_lock);

		kfree(shared);
		if (alien) {
			drain_alien_cache(cachep, alien);
			free_alien_cache(alien);
		}
free_array_cache:
		kfree(nc);
	}
	/*
	 * In the previous loop, all the objects were freed to
	 * the respective cache's slabs; now we can go ahead and
	 * shrink each nodelist to its limit.
	 */
	list_for_each_entry(cachep, &slab_caches, list) {
		l3 = cachep->nodelists[node];
		if (!l3)
			continue;
		drain_freelist(cachep, l3, l3->free_objects);
	}
}

static int __cpuinit cpuup_prepare(long cpu)
{
	struct kmem_cache *cachep;
	struct kmem_list3 *l3 = NULL;
	int node = cpu_to_mem(cpu);
	int err;

	/*
	 * We need to do this right in the beginning since
	 * alloc_arraycache's are going to use this list.
	 * kmalloc_node allows us to add the slab to the right
	 * kmem_list3 and not this cpu's kmem_list3
	 */
	err = init_cache_nodelists_node(node);
	if (err < 0)
		goto bad;

	/*
	 * Now we can go ahead with allocating the shared arrays and
	 * array caches
	 */
	list_for_each_entry(cachep, &slab_caches, list) {
		struct array_cache *nc;
		struct array_cache *shared = NULL;
		struct array_cache **alien = NULL;

		nc = alloc_arraycache(node, cachep->limit,
					cachep->batchcount, GFP_KERNEL);
		if (!nc)
			goto bad;
		if (cachep->shared) {
			shared = alloc_arraycache(node,
				cachep->shared * cachep->batchcount,
				0xbaadf00d, GFP_KERNEL);
			if (!shared) {
				kfree(nc);
				goto bad;
			}
		}
		if (use_alien_caches) {
			alien = alloc_alien_cache(node, cachep->limit, GFP_KERNEL);
			if (!alien) {
				kfree(shared);
				kfree(nc);
				goto bad;
			}
		}
		cachep->array[cpu] = nc;
		l3 = cachep->nodelists[node];
		BUG_ON(!l3);

		spin_lock_irq(&l3->list_lock);
		if (!l3->shared) {
			/*
			 * We are serialised from CPU_DEAD or
			 * CPU_UP_CANCELLED by the cpucontrol lock
			 */
			l3->shared = shared;
			shared = NULL;
		}
#ifdef CONFIG_NUMA
		if (!l3->alien) {
			l3->alien = alien;
			alien = NULL;
		}
#endif
		spin_unlock_irq(&l3->list_lock);
		kfree(shared);
		free_alien_cache(alien);
		if (cachep->flags & SLAB_DEBUG_OBJECTS)
			slab_set_debugobj_lock_classes_node(cachep, node);
	}
	init_node_lock_keys(node);

	return 0;
bad:
	cpuup_canceled(cpu);
	return -ENOMEM;
}

static int __cpuinit cpuup_callback(struct notifier_block *nfb,
				    unsigned long action, void *hcpu)
{
	long cpu = (long)hcpu;
	int err = 0;

	switch (action) {
	case CPU_UP_PREPARE:
	case CPU_UP_PREPARE_FROZEN:
		mutex_lock(&slab_mutex);
		err = cpuup_prepare(cpu);
		mutex_unlock(&slab_mutex);
		break;
	case CPU_ONLINE:
	case CPU_ONLINE_FROZEN:
		start_cpu_timer(cpu);
		break;
#ifdef CONFIG_HOTPLUG_CPU
	case CPU_DOWN_PREPARE:
	case CPU_DOWN_PREPARE_FROZEN:
		/*
		 * Shutdown cache reaper. Note that the slab_mutex is
		 * held so that if cache_reap() is invoked it cannot do
		 * anything expensive but will only modify reap_work
		 * and reschedule the timer.
		 */
		cancel_delayed_work_sync(&per_cpu(slab_reap_work, cpu));
		/* Now the cache_reaper is guaranteed to be not running. */
		per_cpu(slab_reap_work, cpu).work.func = NULL;
		break;
	case CPU_DOWN_FAILED:
	case CPU_DOWN_FAILED_FROZEN:
		start_cpu_timer(cpu);
		break;
	case CPU_DEAD:
	case CPU_DEAD_FROZEN:
		/*
		 * Even if all the cpus of a node are down, we don't free the
		 * kmem_list3 of any cache. This is to avoid a race between
		 * cpu_down and a kmalloc allocation from another cpu for
		 * memory from the node of the cpu going down.  The list3
		 * structure is usually allocated from kmem_cache_create() and
		 * gets destroyed at kmem_cache_destroy().
		 */
		/* fall through */
#endif
	case CPU_UP_CANCELED:
	case CPU_UP_CANCELED_FROZEN:
		mutex_lock(&slab_mutex);
		cpuup_canceled(cpu);
		mutex_unlock(&slab_mutex);
		break;
	}
	return notifier_from_errno(err);
}

static struct notifier_block __cpuinitdata cpucache_notifier = {
	&cpuup_callback, NULL, 0
};

#if defined(CONFIG_NUMA) && defined(CONFIG_MEMORY_HOTPLUG)
/*
 * Drains freelist for a node on each slab cache, used for memory hot-remove.
 * Returns -EBUSY if all objects cannot be drained so that the node is not
 * removed.
 *
 * Must hold slab_mutex.
 */
static int __meminit drain_cache_nodelists_node(int node)
{
	struct kmem_cache *cachep;
	int ret = 0;

	list_for_each_entry(cachep, &slab_caches, list) {
		struct kmem_list3 *l3;

		l3 = cachep->nodelists[node];
		if (!l3)
			continue;

		drain_freelist(cachep, l3, l3->free_objects);

		if (!list_empty(&l3->slabs_full) ||
		    !list_empty(&l3->slabs_partial)) {
			ret = -EBUSY;
			break;
		}
	}
	return ret;
}

static int __meminit slab_memory_callback(struct notifier_block *self,
					unsigned long action, void *arg)
{
	struct memory_notify *mnb = arg;
	int ret = 0;
	int nid;

	nid = mnb->status_change_nid;
	if (nid < 0)
		goto out;

	switch (action) {
	case MEM_GOING_ONLINE:
		mutex_lock(&slab_mutex);
		ret = init_cache_nodelists_node(nid);
		mutex_unlock(&slab_mutex);
		break;
	case MEM_GOING_OFFLINE:
		mutex_lock(&slab_mutex);
		ret = drain_cache_nodelists_node(nid);
		mutex_unlock(&slab_mutex);
		break;
	case MEM_ONLINE:
	case MEM_OFFLINE:
	case MEM_CANCEL_ONLINE:
	case MEM_CANCEL_OFFLINE:
		break;
	}
out:
	return notifier_from_errno(ret);
}
#endif /* CONFIG_NUMA && CONFIG_MEMORY_HOTPLUG */

/*
 * swap the static kmem_list3 with kmalloced memory
 */
static void __init init_list(struct kmem_cache *cachep, struct kmem_list3 *list,
				int nodeid)
{
	struct kmem_list3 *ptr;

	ptr = kmalloc_node(sizeof(struct kmem_list3), GFP_NOWAIT, nodeid);
	BUG_ON(!ptr);

	memcpy(ptr, list, sizeof(struct kmem_list3));
	/*
	 * Do not assume that spinlocks can be initialized via memcpy:
	 */
	spin_lock_init(&ptr->list_lock);

	MAKE_ALL_LISTS(cachep, ptr, nodeid);
	cachep->nodelists[nodeid] = ptr;
}

/*
 * For setting up all the kmem_list3s for caches whose buffer_size is the same
 * as the size of kmem_list3.
 */
static void __init set_up_list3s(struct kmem_cache *cachep, int index)
{
	int node;

	for_each_online_node(node) {
		cachep->nodelists[node] = &initkmem_list3[index + node];
		cachep->nodelists[node]->next_reap = jiffies +
		    REAPTIMEOUT_LIST3 +
		    ((unsigned long)cachep) % REAPTIMEOUT_LIST3;
	}
}
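
/*
 * Example of the bootstrap list layout used by set_up_list3s(): with
 * MAX_NUMNODES == 4, initkmem_list3[0..3] back kmem_cache itself
 * (CACHE_CACHE), [4..7] back the kmalloc cache used for struct
 * arraycache_init (SIZE_AC), and [8..11] back the kmalloc cache used for
 * struct kmem_list3 (SIZE_L3); kmem_cache_init() later swaps each entry for
 * kmalloc'ed memory via init_list().
 */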

/*
 * Initialisation. Called after the page allocator has been initialised and
 * before smp_init().
 */
void __init kmem_cache_init(void)
{
	size_t left_over;
	struct cache_sizes *sizes;
	struct cache_names *names;
	int i;
	int order;
	int node;

	kmem_cache = &kmem_cache_boot;

	if (num_possible_nodes() == 1)
		use_alien_caches = 0;

	for (i = 0; i < NUM_INIT_LISTS; i++) {
		kmem_list3_init(&initkmem_list3[i]);
		if (i < MAX_NUMNODES)
			kmem_cache->nodelists[i] = NULL;
	}
	set_up_list3s(kmem_cache, CACHE_CACHE);

	/*
	 * Fragmentation resistance on low memory - only use bigger
	 * page orders on machines with more than 32MB of memory if
	 * not overridden on the command line.
	 */
	if (!slab_max_order_set && totalram_pages > (32 << 20) >> PAGE_SHIFT)
		slab_max_order = SLAB_MAX_ORDER_HI;

	/* Bootstrap is tricky, because several objects are allocated
	 * from caches that do not exist yet:
	 * 1) initialize the kmem_cache cache: it contains the struct
	 *    kmem_cache structures of all caches, except kmem_cache itself:
	 *    kmem_cache is statically allocated.
	 *    Initially an __init data area is used for the head array and the
	 *    kmem_list3 structures, it's replaced with a kmalloc allocated
	 *    array at the end of the bootstrap.
	 * 2) Create the first kmalloc cache.
	 *    The struct kmem_cache for the new cache is allocated normally.
	 *    An __init data area is used for the head array.
	 * 3) Create the remaining kmalloc caches, with minimally sized
	 *    head arrays.
	 * 4) Replace the __init data head arrays for kmem_cache and the first
	 *    kmalloc cache with kmalloc allocated arrays.
	 * 5) Replace the __init data for kmem_list3 for kmem_cache and
	 *    the other caches with kmalloc allocated memory.
	 * 6) Resize the head arrays of the kmalloc caches to their final sizes.
	 */

	node = numa_mem_id();

	/* 1) create the kmem_cache */
	INIT_LIST_HEAD(&slab_caches);
	list_add(&kmem_cache->list, &slab_caches);
	kmem_cache->colour_off = cache_line_size();
	kmem_cache->array[smp_processor_id()] = &initarray_cache.cache;
	kmem_cache->nodelists[node] = &initkmem_list3[CACHE_CACHE + node];

	/*
	 * struct kmem_cache size depends on nr_node_ids & nr_cpu_ids
	 */
	kmem_cache->size = offsetof(struct kmem_cache, array[nr_cpu_ids]) +
				  nr_node_ids * sizeof(struct kmem_list3 *);
	kmem_cache->object_size = kmem_cache->size;
	kmem_cache->size = ALIGN(kmem_cache->object_size,
					cache_line_size());
	kmem_cache->reciprocal_buffer_size =
		reciprocal_value(kmem_cache->size);

	for (order = 0; order < MAX_ORDER; order++) {
		cache_estimate(order, kmem_cache->size,
			cache_line_size(), 0, &left_over, &kmem_cache->num);
		if (kmem_cache->num)
			break;
	}
	BUG_ON(!kmem_cache->num);
	kmem_cache->gfporder = order;
	kmem_cache->colour = left_over / kmem_cache->colour_off;
	kmem_cache->slab_size = ALIGN(kmem_cache->num * sizeof(kmem_bufctl_t) +
				      sizeof(struct slab), cache_line_size());

	/* 2+3) create the kmalloc caches */
	sizes = malloc_sizes;
	names = cache_names;

	/*
	 * Initialize the caches that provide memory for the array cache and the
	 * kmem_list3 structures first.  Without this, further allocations will
	 * bug.
	 */

	sizes[INDEX_AC].cs_cachep = kmem_cache_zalloc(kmem_cache, GFP_NOWAIT);
	sizes[INDEX_AC].cs_cachep->name = names[INDEX_AC].name;
	sizes[INDEX_AC].cs_cachep->size = sizes[INDEX_AC].cs_size;
	sizes[INDEX_AC].cs_cachep->object_size = sizes[INDEX_AC].cs_size;
	sizes[INDEX_AC].cs_cachep->align = ARCH_KMALLOC_MINALIGN;
	__kmem_cache_create(sizes[INDEX_AC].cs_cachep, ARCH_KMALLOC_FLAGS|SLAB_PANIC);
	list_add(&sizes[INDEX_AC].cs_cachep->list, &slab_caches);

	if (INDEX_AC != INDEX_L3) {
		sizes[INDEX_L3].cs_cachep = kmem_cache_zalloc(kmem_cache, GFP_NOWAIT);
		sizes[INDEX_L3].cs_cachep->name = names[INDEX_L3].name;
		sizes[INDEX_L3].cs_cachep->size = sizes[INDEX_L3].cs_size;
		sizes[INDEX_L3].cs_cachep->object_size = sizes[INDEX_L3].cs_size;
		sizes[INDEX_L3].cs_cachep->align = ARCH_KMALLOC_MINALIGN;
		__kmem_cache_create(sizes[INDEX_L3].cs_cachep, ARCH_KMALLOC_FLAGS|SLAB_PANIC);
		list_add(&sizes[INDEX_L3].cs_cachep->list, &slab_caches);
	}

	slab_early_init = 0;

	while (sizes->cs_size != ULONG_MAX) {
		/*
		 * For performance, all the general caches are L1 aligned.
		 * This should be particularly beneficial on SMP boxes, as it
		 * eliminates "false sharing".
		 * Note: for systems short on memory, removing the alignment
		 * will allow tighter packing of the smaller caches.
		 */
		if (!sizes->cs_cachep) {
			sizes->cs_cachep = kmem_cache_zalloc(kmem_cache, GFP_NOWAIT);
			sizes->cs_cachep->name = names->name;
			sizes->cs_cachep->size = sizes->cs_size;
			sizes->cs_cachep->object_size = sizes->cs_size;
			sizes->cs_cachep->align = ARCH_KMALLOC_MINALIGN;
			__kmem_cache_create(sizes->cs_cachep, ARCH_KMALLOC_FLAGS|SLAB_PANIC);
			list_add(&sizes->cs_cachep->list, &slab_caches);
		}
#ifdef CONFIG_ZONE_DMA
		sizes->cs_dmacachep = kmem_cache_zalloc(kmem_cache, GFP_NOWAIT);
		sizes->cs_dmacachep->name = names->name_dma;
		sizes->cs_dmacachep->size = sizes->cs_size;
		sizes->cs_dmacachep->object_size = sizes->cs_size;
		sizes->cs_dmacachep->align = ARCH_KMALLOC_MINALIGN;
		__kmem_cache_create(sizes->cs_dmacachep,
			       ARCH_KMALLOC_FLAGS|SLAB_CACHE_DMA|SLAB_PANIC);
		list_add(&sizes->cs_dmacachep->list, &slab_caches);
#endif
		sizes++;
		names++;
	}
	/* 4) Replace the bootstrap head arrays */
	{
		struct array_cache *ptr;

		ptr = kmalloc(sizeof(struct arraycache_init), GFP_NOWAIT);

		BUG_ON(cpu_cache_get(kmem_cache) != &initarray_cache.cache);
		memcpy(ptr, cpu_cache_get(kmem_cache),
		       sizeof(struct arraycache_init));
		/*
		 * Do not assume that spinlocks can be initialized via memcpy:
		 */
		spin_lock_init(&ptr->lock);

		kmem_cache->array[smp_processor_id()] = ptr;

		ptr = kmalloc(sizeof(struct arraycache_init), GFP_NOWAIT);

		BUG_ON(cpu_cache_get(malloc_sizes[INDEX_AC].cs_cachep)
		       != &initarray_generic.cache);
		memcpy(ptr, cpu_cache_get(malloc_sizes[INDEX_AC].cs_cachep),
		       sizeof(struct arraycache_init));
		/*
		 * Do not assume that spinlocks can be initialized via memcpy:
		 */
		spin_lock_init(&ptr->lock);

		malloc_sizes[INDEX_AC].cs_cachep->array[smp_processor_id()] =
		    ptr;
	}
	/* 5) Replace the bootstrap kmem_list3's */
	{
		int nid;

		for_each_online_node(nid) {
			init_list(kmem_cache, &initkmem_list3[CACHE_CACHE + nid], nid);

			init_list(malloc_sizes[INDEX_AC].cs_cachep,
				  &initkmem_list3[SIZE_AC + nid], nid);

			if (INDEX_AC != INDEX_L3) {
				init_list(malloc_sizes[INDEX_L3].cs_cachep,
					  &initkmem_list3[SIZE_L3 + nid], nid);
			}
		}
	}

	slab_state = UP;
}

void __init kmem_cache_init_late(void)
{
	struct kmem_cache *cachep;

	slab_state = UP;

	/* 6) resize the head arrays to their final sizes */
	mutex_lock(&slab_mutex);
	list_for_each_entry(cachep, &slab_caches, list)
		if (enable_cpucache(cachep, GFP_NOWAIT))
			BUG();
	mutex_unlock(&slab_mutex);

	/* Annotate slab for lockdep -- annotate the malloc caches */
	init_lock_keys();

	/* Done! */
	slab_state = FULL;

	/*
	 * Register a cpu startup notifier callback that initializes
	 * cpu_cache_get for all new cpus
	 */
	register_cpu_notifier(&cpucache_notifier);

#ifdef CONFIG_NUMA
	/*
	 * Register a memory hotplug callback that initializes and frees
	 * nodelists.
	 */
	hotplug_memory_notifier(slab_memory_callback, SLAB_CALLBACK_PRI);
#endif

	/*
	 * The reap timers are started later, with a module init call; that part
	 * of the kernel is not yet operational.
	 */
}

static int __init cpucache_init(void)
{
	int cpu;

	/*
	 * Register the timers that return unneeded pages to the page allocator
	 */
	for_each_online_cpu(cpu)
		start_cpu_timer(cpu);

	/* Done! */
	slab_state = FULL;
	return 0;
}
__initcall(cpucache_init);

static noinline void
slab_out_of_memory(struct kmem_cache *cachep, gfp_t gfpflags, int nodeid)
{
	struct kmem_list3 *l3;
	struct slab *slabp;
	unsigned long flags;
	int node;

	printk(KERN_WARNING
		"SLAB: Unable to allocate memory on node %d (gfp=0x%x)\n",
		nodeid, gfpflags);
	printk(KERN_WARNING "  cache: %s, object size: %d, order: %d\n",
		cachep->name, cachep->size, cachep->gfporder);

	for_each_online_node(node) {
		unsigned long active_objs = 0, num_objs = 0, free_objects = 0;
		unsigned long active_slabs = 0, num_slabs = 0;

		l3 = cachep->nodelists[node];
		if (!l3)
			continue;

		spin_lock_irqsave(&l3->list_lock, flags);
		list_for_each_entry(slabp, &l3->slabs_full, list) {
			active_objs += cachep->num;
			active_slabs++;
		}
		list_for_each_entry(slabp, &l3->slabs_partial, list) {
			active_objs += slabp->inuse;
			active_slabs++;
		}
		list_for_each_entry(slabp, &l3->slabs_free, list)
			num_slabs++;

		free_objects += l3->free_objects;
		spin_unlock_irqrestore(&l3->list_lock, flags);

		num_slabs += active_slabs;
		num_objs = num_slabs * cachep->num;
		printk(KERN_WARNING
			"  node %d: slabs: %ld/%ld, objs: %ld/%ld, free: %ld\n",
			node, active_slabs, num_slabs, active_objs, num_objs,
			free_objects);
	}
}

/*
 * Interface to system's page allocator. No need to hold the cache-lock.
 *
 * If we requested dmaable memory, we will get it. Even if we
 * did not request dmaable memory, we might get it, but that
 * would be relatively rare and ignorable.
1881 */ 1882 static void *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, int nodeid) 1883 { 1884 struct page *page; 1885 int nr_pages; 1886 int i; 1887 1888 #ifndef CONFIG_MMU 1889 /* 1890 * Nommu uses slab's for process anonymous memory allocations, and thus 1891 * requires __GFP_COMP to properly refcount higher order allocations 1892 */ 1893 flags |= __GFP_COMP; 1894 #endif 1895 1896 flags |= cachep->allocflags; 1897 if (cachep->flags & SLAB_RECLAIM_ACCOUNT) 1898 flags |= __GFP_RECLAIMABLE; 1899 1900 page = alloc_pages_exact_node(nodeid, flags | __GFP_NOTRACK, cachep->gfporder); 1901 if (!page) { 1902 if (!(flags & __GFP_NOWARN) && printk_ratelimit()) 1903 slab_out_of_memory(cachep, flags, nodeid); 1904 return NULL; 1905 } 1906 1907 /* Record if ALLOC_NO_WATERMARKS was set when allocating the slab */ 1908 if (unlikely(page->pfmemalloc)) 1909 pfmemalloc_active = true; 1910 1911 nr_pages = (1 << cachep->gfporder); 1912 if (cachep->flags & SLAB_RECLAIM_ACCOUNT) 1913 add_zone_page_state(page_zone(page), 1914 NR_SLAB_RECLAIMABLE, nr_pages); 1915 else 1916 add_zone_page_state(page_zone(page), 1917 NR_SLAB_UNRECLAIMABLE, nr_pages); 1918 for (i = 0; i < nr_pages; i++) { 1919 __SetPageSlab(page + i); 1920 1921 if (page->pfmemalloc) 1922 SetPageSlabPfmemalloc(page + i); 1923 } 1924 1925 if (kmemcheck_enabled && !(cachep->flags & SLAB_NOTRACK)) { 1926 kmemcheck_alloc_shadow(page, cachep->gfporder, flags, nodeid); 1927 1928 if (cachep->ctor) 1929 kmemcheck_mark_uninitialized_pages(page, nr_pages); 1930 else 1931 kmemcheck_mark_unallocated_pages(page, nr_pages); 1932 } 1933 1934 return page_address(page); 1935 } 1936 1937 /* 1938 * Interface to system's page release. 1939 */ 1940 static void kmem_freepages(struct kmem_cache *cachep, void *addr) 1941 { 1942 unsigned long i = (1 << cachep->gfporder); 1943 struct page *page = virt_to_page(addr); 1944 const unsigned long nr_freed = i; 1945 1946 kmemcheck_free_shadow(page, cachep->gfporder); 1947 1948 if (cachep->flags & SLAB_RECLAIM_ACCOUNT) 1949 sub_zone_page_state(page_zone(page), 1950 NR_SLAB_RECLAIMABLE, nr_freed); 1951 else 1952 sub_zone_page_state(page_zone(page), 1953 NR_SLAB_UNRECLAIMABLE, nr_freed); 1954 while (i--) { 1955 BUG_ON(!PageSlab(page)); 1956 __ClearPageSlabPfmemalloc(page); 1957 __ClearPageSlab(page); 1958 page++; 1959 } 1960 if (current->reclaim_state) 1961 current->reclaim_state->reclaimed_slab += nr_freed; 1962 free_pages((unsigned long)addr, cachep->gfporder); 1963 } 1964 1965 static void kmem_rcu_free(struct rcu_head *head) 1966 { 1967 struct slab_rcu *slab_rcu = (struct slab_rcu *)head; 1968 struct kmem_cache *cachep = slab_rcu->cachep; 1969 1970 kmem_freepages(cachep, slab_rcu->addr); 1971 if (OFF_SLAB(cachep)) 1972 kmem_cache_free(cachep->slabp_cache, slab_rcu); 1973 } 1974 1975 #if DEBUG 1976 1977 #ifdef CONFIG_DEBUG_PAGEALLOC 1978 static void store_stackinfo(struct kmem_cache *cachep, unsigned long *addr, 1979 unsigned long caller) 1980 { 1981 int size = cachep->object_size; 1982 1983 addr = (unsigned long *)&((char *)addr)[obj_offset(cachep)]; 1984 1985 if (size < 5 * sizeof(unsigned long)) 1986 return; 1987 1988 *addr++ = 0x12345678; 1989 *addr++ = caller; 1990 *addr++ = smp_processor_id(); 1991 size -= 3 * sizeof(unsigned long); 1992 { 1993 unsigned long *sptr = &caller; 1994 unsigned long svalue; 1995 1996 while (!kstack_end(sptr)) { 1997 svalue = *sptr++; 1998 if (kernel_text_address(svalue)) { 1999 *addr++ = svalue; 2000 size -= sizeof(unsigned long); 2001 if (size <= sizeof(unsigned long)) 2002 break; 2003 } 
2004 } 2005 2006 } 2007 *addr++ = 0x87654321; 2008 } 2009 #endif 2010 2011 static void poison_obj(struct kmem_cache *cachep, void *addr, unsigned char val) 2012 { 2013 int size = cachep->object_size; 2014 addr = &((char *)addr)[obj_offset(cachep)]; 2015 2016 memset(addr, val, size); 2017 *(unsigned char *)(addr + size - 1) = POISON_END; 2018 } 2019 2020 static void dump_line(char *data, int offset, int limit) 2021 { 2022 int i; 2023 unsigned char error = 0; 2024 int bad_count = 0; 2025 2026 printk(KERN_ERR "%03x: ", offset); 2027 for (i = 0; i < limit; i++) { 2028 if (data[offset + i] != POISON_FREE) { 2029 error = data[offset + i]; 2030 bad_count++; 2031 } 2032 } 2033 print_hex_dump(KERN_CONT, "", 0, 16, 1, 2034 &data[offset], limit, 1); 2035 2036 if (bad_count == 1) { 2037 error ^= POISON_FREE; 2038 if (!(error & (error - 1))) { 2039 printk(KERN_ERR "Single bit error detected. Probably " 2040 "bad RAM.\n"); 2041 #ifdef CONFIG_X86 2042 printk(KERN_ERR "Run memtest86+ or a similar memory " 2043 "test tool.\n"); 2044 #else 2045 printk(KERN_ERR "Run a memory test tool.\n"); 2046 #endif 2047 } 2048 } 2049 } 2050 #endif 2051 2052 #if DEBUG 2053 2054 static void print_objinfo(struct kmem_cache *cachep, void *objp, int lines) 2055 { 2056 int i, size; 2057 char *realobj; 2058 2059 if (cachep->flags & SLAB_RED_ZONE) { 2060 printk(KERN_ERR "Redzone: 0x%llx/0x%llx.\n", 2061 *dbg_redzone1(cachep, objp), 2062 *dbg_redzone2(cachep, objp)); 2063 } 2064 2065 if (cachep->flags & SLAB_STORE_USER) { 2066 printk(KERN_ERR "Last user: [<%p>]", 2067 *dbg_userword(cachep, objp)); 2068 print_symbol("(%s)", 2069 (unsigned long)*dbg_userword(cachep, objp)); 2070 printk("\n"); 2071 } 2072 realobj = (char *)objp + obj_offset(cachep); 2073 size = cachep->object_size; 2074 for (i = 0; i < size && lines; i += 16, lines--) { 2075 int limit; 2076 limit = 16; 2077 if (i + limit > size) 2078 limit = size - i; 2079 dump_line(realobj, i, limit); 2080 } 2081 } 2082 2083 static void check_poison_obj(struct kmem_cache *cachep, void *objp) 2084 { 2085 char *realobj; 2086 int size, i; 2087 int lines = 0; 2088 2089 realobj = (char *)objp + obj_offset(cachep); 2090 size = cachep->object_size; 2091 2092 for (i = 0; i < size; i++) { 2093 char exp = POISON_FREE; 2094 if (i == size - 1) 2095 exp = POISON_END; 2096 if (realobj[i] != exp) { 2097 int limit; 2098 /* Mismatch ! 
*/ 2099 /* Print header */ 2100 if (lines == 0) { 2101 printk(KERN_ERR 2102 "Slab corruption (%s): %s start=%p, len=%d\n", 2103 print_tainted(), cachep->name, realobj, size); 2104 print_objinfo(cachep, objp, 0); 2105 } 2106 /* Hexdump the affected line */ 2107 i = (i / 16) * 16; 2108 limit = 16; 2109 if (i + limit > size) 2110 limit = size - i; 2111 dump_line(realobj, i, limit); 2112 i += 16; 2113 lines++; 2114 /* Limit to 5 lines */ 2115 if (lines > 5) 2116 break; 2117 } 2118 } 2119 if (lines != 0) { 2120 /* Print some data about the neighboring objects, if they 2121 * exist: 2122 */ 2123 struct slab *slabp = virt_to_slab(objp); 2124 unsigned int objnr; 2125 2126 objnr = obj_to_index(cachep, slabp, objp); 2127 if (objnr) { 2128 objp = index_to_obj(cachep, slabp, objnr - 1); 2129 realobj = (char *)objp + obj_offset(cachep); 2130 printk(KERN_ERR "Prev obj: start=%p, len=%d\n", 2131 realobj, size); 2132 print_objinfo(cachep, objp, 2); 2133 } 2134 if (objnr + 1 < cachep->num) { 2135 objp = index_to_obj(cachep, slabp, objnr + 1); 2136 realobj = (char *)objp + obj_offset(cachep); 2137 printk(KERN_ERR "Next obj: start=%p, len=%d\n", 2138 realobj, size); 2139 print_objinfo(cachep, objp, 2); 2140 } 2141 } 2142 } 2143 #endif 2144 2145 #if DEBUG 2146 static void slab_destroy_debugcheck(struct kmem_cache *cachep, struct slab *slabp) 2147 { 2148 int i; 2149 for (i = 0; i < cachep->num; i++) { 2150 void *objp = index_to_obj(cachep, slabp, i); 2151 2152 if (cachep->flags & SLAB_POISON) { 2153 #ifdef CONFIG_DEBUG_PAGEALLOC 2154 if (cachep->size % PAGE_SIZE == 0 && 2155 OFF_SLAB(cachep)) 2156 kernel_map_pages(virt_to_page(objp), 2157 cachep->size / PAGE_SIZE, 1); 2158 else 2159 check_poison_obj(cachep, objp); 2160 #else 2161 check_poison_obj(cachep, objp); 2162 #endif 2163 } 2164 if (cachep->flags & SLAB_RED_ZONE) { 2165 if (*dbg_redzone1(cachep, objp) != RED_INACTIVE) 2166 slab_error(cachep, "start of a freed object " 2167 "was overwritten"); 2168 if (*dbg_redzone2(cachep, objp) != RED_INACTIVE) 2169 slab_error(cachep, "end of a freed object " 2170 "was overwritten"); 2171 } 2172 } 2173 } 2174 #else 2175 static void slab_destroy_debugcheck(struct kmem_cache *cachep, struct slab *slabp) 2176 { 2177 } 2178 #endif 2179 2180 /** 2181 * slab_destroy - destroy and release all objects in a slab 2182 * @cachep: cache pointer being destroyed 2183 * @slabp: slab pointer being destroyed 2184 * 2185 * Destroy all the objs in a slab, and release the mem back to the system. 2186 * Before calling the slab must have been unlinked from the cache. The 2187 * cache-lock is not held/needed. 2188 */ 2189 static void slab_destroy(struct kmem_cache *cachep, struct slab *slabp) 2190 { 2191 void *addr = slabp->s_mem - slabp->colouroff; 2192 2193 slab_destroy_debugcheck(cachep, slabp); 2194 if (unlikely(cachep->flags & SLAB_DESTROY_BY_RCU)) { 2195 struct slab_rcu *slab_rcu; 2196 2197 slab_rcu = (struct slab_rcu *)slabp; 2198 slab_rcu->cachep = cachep; 2199 slab_rcu->addr = addr; 2200 call_rcu(&slab_rcu->head, kmem_rcu_free); 2201 } else { 2202 kmem_freepages(cachep, addr); 2203 if (OFF_SLAB(cachep)) 2204 kmem_cache_free(cachep->slabp_cache, slabp); 2205 } 2206 } 2207 2208 /** 2209 * calculate_slab_order - calculate size (page order) of slabs 2210 * @cachep: pointer to the cache that is being created 2211 * @size: size of objects to be created in this cache. 2212 * @align: required alignment for the objects. 2213 * @flags: slab allocation flags 2214 * 2215 * Also calculates the number of objects per slab. 
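 *
 * Rough illustration (sizes are only examples): with 4096-byte pages,
 * gfporder 0 and off-slab management, 1024-byte objects give num = 4 and
 * left_over = 0, while 1500-byte objects give num = 2 and left_over = 1096;
 * the left_over bytes are later handed to cache colouring.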
2216 * 2217 * This could be made much more intelligent. For now, try to avoid using 2218 * high order pages for slabs. When the gfp() functions are more friendly 2219 * towards high-order requests, this should be changed. 2220 */ 2221 static size_t calculate_slab_order(struct kmem_cache *cachep, 2222 size_t size, size_t align, unsigned long flags) 2223 { 2224 unsigned long offslab_limit; 2225 size_t left_over = 0; 2226 int gfporder; 2227 2228 for (gfporder = 0; gfporder <= KMALLOC_MAX_ORDER; gfporder++) { 2229 unsigned int num; 2230 size_t remainder; 2231 2232 cache_estimate(gfporder, size, align, flags, &remainder, &num); 2233 if (!num) 2234 continue; 2235 2236 if (flags & CFLGS_OFF_SLAB) { 2237 /* 2238 * Max number of objs-per-slab for caches which 2239 * use off-slab slabs. Needed to avoid a possible 2240 * looping condition in cache_grow(). 2241 */ 2242 offslab_limit = size - sizeof(struct slab); 2243 offslab_limit /= sizeof(kmem_bufctl_t); 2244 2245 if (num > offslab_limit) 2246 break; 2247 } 2248 2249 /* Found something acceptable - save it away */ 2250 cachep->num = num; 2251 cachep->gfporder = gfporder; 2252 left_over = remainder; 2253 2254 /* 2255 * A VFS-reclaimable slab tends to have most allocations 2256 * as GFP_NOFS and we really don't want to have to be allocating 2257 * higher-order pages when we are unable to shrink dcache. 2258 */ 2259 if (flags & SLAB_RECLAIM_ACCOUNT) 2260 break; 2261 2262 /* 2263 * Large number of objects is good, but very large slabs are 2264 * currently bad for the gfp()s. 2265 */ 2266 if (gfporder >= slab_max_order) 2267 break; 2268 2269 /* 2270 * Acceptable internal fragmentation? 2271 */ 2272 if (left_over * 8 <= (PAGE_SIZE << gfporder)) 2273 break; 2274 } 2275 return left_over; 2276 } 2277 2278 static int __init_refok setup_cpu_cache(struct kmem_cache *cachep, gfp_t gfp) 2279 { 2280 if (slab_state >= FULL) 2281 return enable_cpucache(cachep, gfp); 2282 2283 if (slab_state == DOWN) { 2284 /* 2285 * Note: the first kmem_cache_create must create the cache 2286 * that's used by kmalloc(24), otherwise the creation of 2287 * further caches will BUG(). 2288 */ 2289 cachep->array[smp_processor_id()] = &initarray_generic.cache; 2290 2291 /* 2292 * If the cache that's used by kmalloc(sizeof(kmem_list3)) is 2293 * the first cache, then we need to set up all its list3s, 2294 * otherwise the creation of further caches will BUG(). 2295 */ 2296 set_up_list3s(cachep, SIZE_AC); 2297 if (INDEX_AC == INDEX_L3) 2298 slab_state = PARTIAL_L3; 2299 else 2300 slab_state = PARTIAL_ARRAYCACHE; 2301 } else { 2302 cachep->array[smp_processor_id()] = 2303 kmalloc(sizeof(struct arraycache_init), gfp); 2304 2305 if (slab_state == PARTIAL_ARRAYCACHE) { 2306 set_up_list3s(cachep, SIZE_L3); 2307 slab_state = PARTIAL_L3; 2308 } else { 2309 int node; 2310 for_each_online_node(node) { 2311 cachep->nodelists[node] = 2312 kmalloc_node(sizeof(struct kmem_list3), 2313 gfp, node); 2314 BUG_ON(!cachep->nodelists[node]); 2315 kmem_list3_init(cachep->nodelists[node]); 2316 } 2317 } 2318 } 2319 cachep->nodelists[numa_mem_id()]->next_reap = 2320 jiffies + REAPTIMEOUT_LIST3 + 2321 ((unsigned long)cachep) % REAPTIMEOUT_LIST3; 2322 2323 cpu_cache_get(cachep)->avail = 0; 2324 cpu_cache_get(cachep)->limit = BOOT_CPUCACHE_ENTRIES; 2325 cpu_cache_get(cachep)->batchcount = 1; 2326 cpu_cache_get(cachep)->touched = 0; 2327 cachep->batchcount = 1; 2328 cachep->limit = BOOT_CPUCACHE_ENTRIES; 2329 return 0; 2330 } 2331 2332 /** 2333 * __kmem_cache_create - Create a cache. 
2334 * @cachep: the cache management descriptor. The caller (normally 2335 * kmem_cache_create()) has already filled in the cache's name, object 2336 * size and requested alignment; this function works out the final 2337 * layout and sets the cache up for use. 2338 * @flags: SLAB flags 2339 * 2340 * Returns 0 on success and a negative error code on failure. 2341 * Cannot be called within an interrupt, but can be interrupted. 2342 * The cache's ctor is run when new pages are allocated by the cache. 2343 * 2344 * The flags are 2345 * 2346 * %SLAB_POISON - Poison the slab with a known test pattern (a5a5a5a5) 2347 * to catch references to uninitialised memory. 2348 * 2349 * %SLAB_RED_ZONE - Insert `Red' zones around the allocated memory to check 2350 * for buffer overruns. 2351 * 2352 * %SLAB_HWCACHE_ALIGN - Align the objects in this cache to a hardware 2353 * cacheline. This can be beneficial if you're counting cycles as closely 2354 * as davem. 2355 */ 2356 int 2357 __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags) 2358 { 2359 size_t left_over, slab_size, ralign; 2360 gfp_t gfp; 2361 int err; 2362 size_t size = cachep->size; 2363 2364 #if DEBUG 2365 #if FORCED_DEBUG 2366 /* 2367 * Enable redzoning and last user accounting, except for caches with 2368 * large objects, if the increased size would increase the object size 2369 * above the next power of two: caches with object sizes just above a 2370 * power of two have a significant amount of internal fragmentation. 2371 */ 2372 if (size < 4096 || fls(size - 1) == fls(size-1 + REDZONE_ALIGN + 2373 2 * sizeof(unsigned long long))) 2374 flags |= SLAB_RED_ZONE | SLAB_STORE_USER; 2375 if (!(flags & SLAB_DESTROY_BY_RCU)) 2376 flags |= SLAB_POISON; 2377 #endif 2378 if (flags & SLAB_DESTROY_BY_RCU) 2379 BUG_ON(flags & SLAB_POISON); 2380 #endif 2381 /* 2382 * Always check flags; a caller might be expecting debug support which 2383 * isn't available. 2384 */ 2385 BUG_ON(flags & ~CREATE_MASK); 2386 2387 /* 2388 * Check that size is in terms of words. This is needed to avoid 2389 * unaligned accesses for some archs when redzoning is used, and makes 2390 * sure any on-slab bufctl's are also correctly aligned. 2391 */ 2392 if (size & (BYTES_PER_WORD - 1)) { 2393 size += (BYTES_PER_WORD - 1); 2394 size &= ~(BYTES_PER_WORD - 1); 2395 } 2396 2397 /* calculate the final buffer alignment: */ 2398 2399 /* 1) arch recommendation: can be overridden for debug */ 2400 if (flags & SLAB_HWCACHE_ALIGN) { 2401 /* 2402 * Default alignment: as specified by the arch code. Except if 2403 * an object is really small, then squeeze multiple objects into 2404 * one cacheline. 2405 */ 2406 ralign = cache_line_size(); 2407 while (size <= ralign / 2) 2408 ralign /= 2; 2409 } else { 2410 ralign = BYTES_PER_WORD; 2411 } 2412 2413 /* 2414 * Redzoning and user store require word alignment or possibly larger. 2415 * Note this will be overridden by architecture or caller mandated 2416 * alignment if either is greater than BYTES_PER_WORD. 2417 */ 2418 if (flags & SLAB_STORE_USER) 2419 ralign = BYTES_PER_WORD; 2420 2421 if (flags & SLAB_RED_ZONE) { 2422 ralign = REDZONE_ALIGN; 2423 /* If redzoning, ensure that the second redzone is suitably 2424 * aligned, by adjusting the object size accordingly.
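 * (Illustrative figure: with REDZONE_ALIGN == 8, a 20-byte object is
 * rounded up to 24 bytes by the two statements below, so the trailing
 * redzone word ends up naturally aligned.)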
*/ 2425 size += REDZONE_ALIGN - 1; 2426 size &= ~(REDZONE_ALIGN - 1); 2427 } 2428 2429 /* 2) arch mandated alignment */ 2430 if (ralign < ARCH_SLAB_MINALIGN) { 2431 ralign = ARCH_SLAB_MINALIGN; 2432 } 2433 /* 3) caller mandated alignment */ 2434 if (ralign < cachep->align) { 2435 ralign = cachep->align; 2436 } 2437 /* disable debug if necessary */ 2438 if (ralign > __alignof__(unsigned long long)) 2439 flags &= ~(SLAB_RED_ZONE | SLAB_STORE_USER); 2440 /* 2441 * 4) Store it. 2442 */ 2443 cachep->align = ralign; 2444 2445 if (slab_is_available()) 2446 gfp = GFP_KERNEL; 2447 else 2448 gfp = GFP_NOWAIT; 2449 2450 cachep->nodelists = (struct kmem_list3 **)&cachep->array[nr_cpu_ids]; 2451 #if DEBUG 2452 2453 /* 2454 * Both debugging options require word-alignment which is calculated 2455 * into align above. 2456 */ 2457 if (flags & SLAB_RED_ZONE) { 2458 /* add space for red zone words */ 2459 cachep->obj_offset += sizeof(unsigned long long); 2460 size += 2 * sizeof(unsigned long long); 2461 } 2462 if (flags & SLAB_STORE_USER) { 2463 /* user store requires one word storage behind the end of 2464 * the real object. But if the second red zone needs to be 2465 * aligned to 64 bits, we must allow that much space. 2466 */ 2467 if (flags & SLAB_RED_ZONE) 2468 size += REDZONE_ALIGN; 2469 else 2470 size += BYTES_PER_WORD; 2471 } 2472 #if FORCED_DEBUG && defined(CONFIG_DEBUG_PAGEALLOC) 2473 if (size >= malloc_sizes[INDEX_L3 + 1].cs_size 2474 && cachep->object_size > cache_line_size() 2475 && ALIGN(size, cachep->align) < PAGE_SIZE) { 2476 cachep->obj_offset += PAGE_SIZE - ALIGN(size, cachep->align); 2477 size = PAGE_SIZE; 2478 } 2479 #endif 2480 #endif 2481 2482 /* 2483 * Determine if the slab management is 'on' or 'off' slab. 2484 * (bootstrapping cannot cope with offslab caches so don't do 2485 * it too early on. Always use on-slab management when 2486 * SLAB_NOLEAKTRACE to avoid recursive calls into kmemleak) 2487 */ 2488 if ((size >= (PAGE_SIZE >> 3)) && !slab_early_init && 2489 !(flags & SLAB_NOLEAKTRACE)) 2490 /* 2491 * Size is large, assume best to place the slab management obj 2492 * off-slab (should allow better packing of objs). 2493 */ 2494 flags |= CFLGS_OFF_SLAB; 2495 2496 size = ALIGN(size, cachep->align); 2497 2498 left_over = calculate_slab_order(cachep, size, cachep->align, flags); 2499 2500 if (!cachep->num) 2501 return -E2BIG; 2502 2503 slab_size = ALIGN(cachep->num * sizeof(kmem_bufctl_t) 2504 + sizeof(struct slab), cachep->align); 2505 2506 /* 2507 * If the slab has been placed off-slab, and we have enough space then 2508 * move it on-slab. This is at the expense of any extra colouring. 2509 */ 2510 if (flags & CFLGS_OFF_SLAB && left_over >= slab_size) { 2511 flags &= ~CFLGS_OFF_SLAB; 2512 left_over -= slab_size; 2513 } 2514 2515 if (flags & CFLGS_OFF_SLAB) { 2516 /* really off slab. No need for manual alignment */ 2517 slab_size = 2518 cachep->num * sizeof(kmem_bufctl_t) + sizeof(struct slab); 2519 2520 #ifdef CONFIG_PAGE_POISONING 2521 /* If we're going to use the generic kernel_map_pages() 2522 * poisoning, then it's going to smash the contents of 2523 * the redzone and userword anyhow, so switch them off. 2524 */ 2525 if (size % PAGE_SIZE == 0 && flags & SLAB_POISON) 2526 flags &= ~(SLAB_RED_ZONE | SLAB_STORE_USER); 2527 #endif 2528 } 2529 2530 cachep->colour_off = cache_line_size(); 2531 /* Offset must be a multiple of the alignment. 
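 * (Illustrative example: with colour_off == 64, i.e. one cache line, and
 * left_over == 192, cachep->colour below comes out as 3, so successive
 * slabs shift their objects by 0, 64 and 128 bytes to spread them over
 * different cache lines.)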
*/ 2532 if (cachep->colour_off < cachep->align) 2533 cachep->colour_off = cachep->align; 2534 cachep->colour = left_over / cachep->colour_off; 2535 cachep->slab_size = slab_size; 2536 cachep->flags = flags; 2537 cachep->allocflags = 0; 2538 if (CONFIG_ZONE_DMA_FLAG && (flags & SLAB_CACHE_DMA)) 2539 cachep->allocflags |= GFP_DMA; 2540 cachep->size = size; 2541 cachep->reciprocal_buffer_size = reciprocal_value(size); 2542 2543 if (flags & CFLGS_OFF_SLAB) { 2544 cachep->slabp_cache = kmem_find_general_cachep(slab_size, 0u); 2545 /* 2546 * This is a possibility for one of the malloc_sizes caches. 2547 * But since we go off slab only for object size greater than 2548 * PAGE_SIZE/8, and malloc_sizes gets created in ascending order, 2549 * this should not happen at all. 2550 * But leave a BUG_ON for some lucky dude. 2551 */ 2552 BUG_ON(ZERO_OR_NULL_PTR(cachep->slabp_cache)); 2553 } 2554 2555 err = setup_cpu_cache(cachep, gfp); 2556 if (err) { 2557 __kmem_cache_shutdown(cachep); 2558 return err; 2559 } 2560 2561 if (flags & SLAB_DEBUG_OBJECTS) { 2562 /* 2563 * Would deadlock through slab_destroy()->call_rcu()-> 2564 * debug_object_activate()->kmem_cache_alloc(). 2565 */ 2566 WARN_ON_ONCE(flags & SLAB_DESTROY_BY_RCU); 2567 2568 slab_set_debugobj_lock_classes(cachep); 2569 } 2570 2571 return 0; 2572 } 2573 2574 #if DEBUG 2575 static void check_irq_off(void) 2576 { 2577 BUG_ON(!irqs_disabled()); 2578 } 2579 2580 static void check_irq_on(void) 2581 { 2582 BUG_ON(irqs_disabled()); 2583 } 2584 2585 static void check_spinlock_acquired(struct kmem_cache *cachep) 2586 { 2587 #ifdef CONFIG_SMP 2588 check_irq_off(); 2589 assert_spin_locked(&cachep->nodelists[numa_mem_id()]->list_lock); 2590 #endif 2591 } 2592 2593 static void check_spinlock_acquired_node(struct kmem_cache *cachep, int node) 2594 { 2595 #ifdef CONFIG_SMP 2596 check_irq_off(); 2597 assert_spin_locked(&cachep->nodelists[node]->list_lock); 2598 #endif 2599 } 2600 2601 #else 2602 #define check_irq_off() do { } while(0) 2603 #define check_irq_on() do { } while(0) 2604 #define check_spinlock_acquired(x) do { } while(0) 2605 #define check_spinlock_acquired_node(x, y) do { } while(0) 2606 #endif 2607 2608 static void drain_array(struct kmem_cache *cachep, struct kmem_list3 *l3, 2609 struct array_cache *ac, 2610 int force, int node); 2611 2612 static void do_drain(void *arg) 2613 { 2614 struct kmem_cache *cachep = arg; 2615 struct array_cache *ac; 2616 int node = numa_mem_id(); 2617 2618 check_irq_off(); 2619 ac = cpu_cache_get(cachep); 2620 spin_lock(&cachep->nodelists[node]->list_lock); 2621 free_block(cachep, ac->entry, ac->avail, node); 2622 spin_unlock(&cachep->nodelists[node]->list_lock); 2623 ac->avail = 0; 2624 } 2625 2626 static void drain_cpu_caches(struct kmem_cache *cachep) 2627 { 2628 struct kmem_list3 *l3; 2629 int node; 2630 2631 on_each_cpu(do_drain, cachep, 1); 2632 check_irq_on(); 2633 for_each_online_node(node) { 2634 l3 = cachep->nodelists[node]; 2635 if (l3 && l3->alien) 2636 drain_alien_cache(cachep, l3->alien); 2637 } 2638 2639 for_each_online_node(node) { 2640 l3 = cachep->nodelists[node]; 2641 if (l3) 2642 drain_array(cachep, l3, l3->shared, 1, node); 2643 } 2644 } 2645 2646 /* 2647 * Remove slabs from the list of free slabs. 2648 * Specify the number of slabs to drain in tofree. 2649 * 2650 * Returns the actual number of slabs released. 
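 *
 * (For example, __cache_shrink() below passes l3->free_objects as tofree,
 * which amounts to "release every completely free slab on this node".)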
2651 */ 2652 static int drain_freelist(struct kmem_cache *cache, 2653 struct kmem_list3 *l3, int tofree) 2654 { 2655 struct list_head *p; 2656 int nr_freed; 2657 struct slab *slabp; 2658 2659 nr_freed = 0; 2660 while (nr_freed < tofree && !list_empty(&l3->slabs_free)) { 2661 2662 spin_lock_irq(&l3->list_lock); 2663 p = l3->slabs_free.prev; 2664 if (p == &l3->slabs_free) { 2665 spin_unlock_irq(&l3->list_lock); 2666 goto out; 2667 } 2668 2669 slabp = list_entry(p, struct slab, list); 2670 #if DEBUG 2671 BUG_ON(slabp->inuse); 2672 #endif 2673 list_del(&slabp->list); 2674 /* 2675 * Safe to drop the lock. The slab is no longer linked 2676 * to the cache. 2677 */ 2678 l3->free_objects -= cache->num; 2679 spin_unlock_irq(&l3->list_lock); 2680 slab_destroy(cache, slabp); 2681 nr_freed++; 2682 } 2683 out: 2684 return nr_freed; 2685 } 2686 2687 /* Called with slab_mutex held to protect against cpu hotplug */ 2688 static int __cache_shrink(struct kmem_cache *cachep) 2689 { 2690 int ret = 0, i = 0; 2691 struct kmem_list3 *l3; 2692 2693 drain_cpu_caches(cachep); 2694 2695 check_irq_on(); 2696 for_each_online_node(i) { 2697 l3 = cachep->nodelists[i]; 2698 if (!l3) 2699 continue; 2700 2701 drain_freelist(cachep, l3, l3->free_objects); 2702 2703 ret += !list_empty(&l3->slabs_full) || 2704 !list_empty(&l3->slabs_partial); 2705 } 2706 return (ret ? 1 : 0); 2707 } 2708 2709 /** 2710 * kmem_cache_shrink - Shrink a cache. 2711 * @cachep: The cache to shrink. 2712 * 2713 * Releases as many slabs as possible for a cache. 2714 * To help debugging, a zero exit status indicates all slabs were released. 2715 */ 2716 int kmem_cache_shrink(struct kmem_cache *cachep) 2717 { 2718 int ret; 2719 BUG_ON(!cachep || in_interrupt()); 2720 2721 get_online_cpus(); 2722 mutex_lock(&slab_mutex); 2723 ret = __cache_shrink(cachep); 2724 mutex_unlock(&slab_mutex); 2725 put_online_cpus(); 2726 return ret; 2727 } 2728 EXPORT_SYMBOL(kmem_cache_shrink); 2729 2730 int __kmem_cache_shutdown(struct kmem_cache *cachep) 2731 { 2732 int i; 2733 struct kmem_list3 *l3; 2734 int rc = __cache_shrink(cachep); 2735 2736 if (rc) 2737 return rc; 2738 2739 for_each_online_cpu(i) 2740 kfree(cachep->array[i]); 2741 2742 /* NUMA: free the list3 structures */ 2743 for_each_online_node(i) { 2744 l3 = cachep->nodelists[i]; 2745 if (l3) { 2746 kfree(l3->shared); 2747 free_alien_cache(l3->alien); 2748 kfree(l3); 2749 } 2750 } 2751 return 0; 2752 } 2753 2754 /* 2755 * Get the memory for a slab management obj. 2756 * For a slab cache when the slab descriptor is off-slab, slab descriptors 2757 * always come from malloc_sizes caches. The slab descriptor cannot 2758 * come from the same cache which is getting created because, 2759 * when we are searching for an appropriate cache for these 2760 * descriptors in kmem_cache_create, we search through the malloc_sizes array. 2761 * If we are creating a malloc_sizes cache here it would not be visible to 2762 * kmem_find_general_cachep till the initialization is complete. 2763 * Hence we cannot have slabp_cache same as the original cache. 2764 */ 2765 static struct slab *alloc_slabmgmt(struct kmem_cache *cachep, void *objp, 2766 int colour_off, gfp_t local_flags, 2767 int nodeid) 2768 { 2769 struct slab *slabp; 2770 2771 if (OFF_SLAB(cachep)) { 2772 /* Slab management obj is off-slab. 
*/ 2773 slabp = kmem_cache_alloc_node(cachep->slabp_cache, 2774 local_flags, nodeid); 2775 /* 2776 * If the first object in the slab is leaked (it's allocated 2777 * but no one has a reference to it), we want to make sure 2778 * kmemleak does not treat the ->s_mem pointer as a reference 2779 * to the object. Otherwise we will not report the leak. 2780 */ 2781 kmemleak_scan_area(&slabp->list, sizeof(struct list_head), 2782 local_flags); 2783 if (!slabp) 2784 return NULL; 2785 } else { 2786 slabp = objp + colour_off; 2787 colour_off += cachep->slab_size; 2788 } 2789 slabp->inuse = 0; 2790 slabp->colouroff = colour_off; 2791 slabp->s_mem = objp + colour_off; 2792 slabp->nodeid = nodeid; 2793 slabp->free = 0; 2794 return slabp; 2795 } 2796 2797 static inline kmem_bufctl_t *slab_bufctl(struct slab *slabp) 2798 { 2799 return (kmem_bufctl_t *) (slabp + 1); 2800 } 2801 2802 static void cache_init_objs(struct kmem_cache *cachep, 2803 struct slab *slabp) 2804 { 2805 int i; 2806 2807 for (i = 0; i < cachep->num; i++) { 2808 void *objp = index_to_obj(cachep, slabp, i); 2809 #if DEBUG 2810 /* need to poison the objs? */ 2811 if (cachep->flags & SLAB_POISON) 2812 poison_obj(cachep, objp, POISON_FREE); 2813 if (cachep->flags & SLAB_STORE_USER) 2814 *dbg_userword(cachep, objp) = NULL; 2815 2816 if (cachep->flags & SLAB_RED_ZONE) { 2817 *dbg_redzone1(cachep, objp) = RED_INACTIVE; 2818 *dbg_redzone2(cachep, objp) = RED_INACTIVE; 2819 } 2820 /* 2821 * Constructors are not allowed to allocate memory from the same 2822 * cache which they are a constructor for. Otherwise, deadlock. 2823 * They must also be threaded. 2824 */ 2825 if (cachep->ctor && !(cachep->flags & SLAB_POISON)) 2826 cachep->ctor(objp + obj_offset(cachep)); 2827 2828 if (cachep->flags & SLAB_RED_ZONE) { 2829 if (*dbg_redzone2(cachep, objp) != RED_INACTIVE) 2830 slab_error(cachep, "constructor overwrote the" 2831 " end of an object"); 2832 if (*dbg_redzone1(cachep, objp) != RED_INACTIVE) 2833 slab_error(cachep, "constructor overwrote the" 2834 " start of an object"); 2835 } 2836 if ((cachep->size % PAGE_SIZE) == 0 && 2837 OFF_SLAB(cachep) && cachep->flags & SLAB_POISON) 2838 kernel_map_pages(virt_to_page(objp), 2839 cachep->size / PAGE_SIZE, 0); 2840 #else 2841 if (cachep->ctor) 2842 cachep->ctor(objp); 2843 #endif 2844 slab_bufctl(slabp)[i] = i + 1; 2845 } 2846 slab_bufctl(slabp)[i - 1] = BUFCTL_END; 2847 } 2848 2849 static void kmem_flagcheck(struct kmem_cache *cachep, gfp_t flags) 2850 { 2851 if (CONFIG_ZONE_DMA_FLAG) { 2852 if (flags & GFP_DMA) 2853 BUG_ON(!(cachep->allocflags & GFP_DMA)); 2854 else 2855 BUG_ON(cachep->allocflags & GFP_DMA); 2856 } 2857 } 2858 2859 static void *slab_get_obj(struct kmem_cache *cachep, struct slab *slabp, 2860 int nodeid) 2861 { 2862 void *objp = index_to_obj(cachep, slabp, slabp->free); 2863 kmem_bufctl_t next; 2864 2865 slabp->inuse++; 2866 next = slab_bufctl(slabp)[slabp->free]; 2867 #if DEBUG 2868 slab_bufctl(slabp)[slabp->free] = BUFCTL_FREE; 2869 WARN_ON(slabp->nodeid != nodeid); 2870 #endif 2871 slabp->free = next; 2872 2873 return objp; 2874 } 2875 2876 static void slab_put_obj(struct kmem_cache *cachep, struct slab *slabp, 2877 void *objp, int nodeid) 2878 { 2879 unsigned int objnr = obj_to_index(cachep, slabp, objp); 2880 2881 #if DEBUG 2882 /* Verify that the slab belongs to the intended node */ 2883 WARN_ON(slabp->nodeid != nodeid); 2884 2885 if (slab_bufctl(slabp)[objnr] + 1 <= SLAB_LIMIT + 1) { 2886 printk(KERN_ERR "slab: double free detected in cache " 2887 "'%s', objp %p\n", cachep->name, 
objp); 2888 BUG(); 2889 } 2890 #endif 2891 slab_bufctl(slabp)[objnr] = slabp->free; 2892 slabp->free = objnr; 2893 slabp->inuse--; 2894 } 2895 2896 /* 2897 * Map pages beginning at addr to the given cache and slab. This is required 2898 * for the slab allocator to be able to lookup the cache and slab of a 2899 * virtual address for kfree, ksize, and slab debugging. 2900 */ 2901 static void slab_map_pages(struct kmem_cache *cache, struct slab *slab, 2902 void *addr) 2903 { 2904 int nr_pages; 2905 struct page *page; 2906 2907 page = virt_to_page(addr); 2908 2909 nr_pages = 1; 2910 if (likely(!PageCompound(page))) 2911 nr_pages <<= cache->gfporder; 2912 2913 do { 2914 page->slab_cache = cache; 2915 page->slab_page = slab; 2916 page++; 2917 } while (--nr_pages); 2918 } 2919 2920 /* 2921 * Grow (by 1) the number of slabs within a cache. This is called by 2922 * kmem_cache_alloc() when there are no active objs left in a cache. 2923 */ 2924 static int cache_grow(struct kmem_cache *cachep, 2925 gfp_t flags, int nodeid, void *objp) 2926 { 2927 struct slab *slabp; 2928 size_t offset; 2929 gfp_t local_flags; 2930 struct kmem_list3 *l3; 2931 2932 /* 2933 * Be lazy and only check for valid flags here, keeping it out of the 2934 * critical path in kmem_cache_alloc(). 2935 */ 2936 BUG_ON(flags & GFP_SLAB_BUG_MASK); 2937 local_flags = flags & (GFP_CONSTRAINT_MASK|GFP_RECLAIM_MASK); 2938 2939 /* Take the l3 list lock to change the colour_next on this node */ 2940 check_irq_off(); 2941 l3 = cachep->nodelists[nodeid]; 2942 spin_lock(&l3->list_lock); 2943 2944 /* Get colour for the slab, and cal the next value. */ 2945 offset = l3->colour_next; 2946 l3->colour_next++; 2947 if (l3->colour_next >= cachep->colour) 2948 l3->colour_next = 0; 2949 spin_unlock(&l3->list_lock); 2950 2951 offset *= cachep->colour_off; 2952 2953 if (local_flags & __GFP_WAIT) 2954 local_irq_enable(); 2955 2956 /* 2957 * The test for missing atomic flag is performed here, rather than 2958 * the more obvious place, simply to reduce the critical path length 2959 * in kmem_cache_alloc(). If a caller is seriously mis-behaving they 2960 * will eventually be caught here (where it matters). 2961 */ 2962 kmem_flagcheck(cachep, flags); 2963 2964 /* 2965 * Get mem for the objs. Attempt to allocate a physical page from 2966 * 'nodeid'. 2967 */ 2968 if (!objp) 2969 objp = kmem_getpages(cachep, local_flags, nodeid); 2970 if (!objp) 2971 goto failed; 2972 2973 /* Get slab management. */ 2974 slabp = alloc_slabmgmt(cachep, objp, offset, 2975 local_flags & ~GFP_CONSTRAINT_MASK, nodeid); 2976 if (!slabp) 2977 goto opps1; 2978 2979 slab_map_pages(cachep, slabp, objp); 2980 2981 cache_init_objs(cachep, slabp); 2982 2983 if (local_flags & __GFP_WAIT) 2984 local_irq_disable(); 2985 check_irq_off(); 2986 spin_lock(&l3->list_lock); 2987 2988 /* Make slab active. */ 2989 list_add_tail(&slabp->list, &(l3->slabs_free)); 2990 STATS_INC_GROWN(cachep); 2991 l3->free_objects += cachep->num; 2992 spin_unlock(&l3->list_lock); 2993 return 1; 2994 opps1: 2995 kmem_freepages(cachep, objp); 2996 failed: 2997 if (local_flags & __GFP_WAIT) 2998 local_irq_disable(); 2999 return 0; 3000 } 3001 3002 #if DEBUG 3003 3004 /* 3005 * Perform extra freeing checks: 3006 * - detect bad pointers. 
3007 * - POISON/RED_ZONE checking 3008 */ 3009 static void kfree_debugcheck(const void *objp) 3010 { 3011 if (!virt_addr_valid(objp)) { 3012 printk(KERN_ERR "kfree_debugcheck: out of range ptr %lxh.\n", 3013 (unsigned long)objp); 3014 BUG(); 3015 } 3016 } 3017 3018 static inline void verify_redzone_free(struct kmem_cache *cache, void *obj) 3019 { 3020 unsigned long long redzone1, redzone2; 3021 3022 redzone1 = *dbg_redzone1(cache, obj); 3023 redzone2 = *dbg_redzone2(cache, obj); 3024 3025 /* 3026 * Redzone is ok. 3027 */ 3028 if (redzone1 == RED_ACTIVE && redzone2 == RED_ACTIVE) 3029 return; 3030 3031 if (redzone1 == RED_INACTIVE && redzone2 == RED_INACTIVE) 3032 slab_error(cache, "double free detected"); 3033 else 3034 slab_error(cache, "memory outside object was overwritten"); 3035 3036 printk(KERN_ERR "%p: redzone 1:0x%llx, redzone 2:0x%llx.\n", 3037 obj, redzone1, redzone2); 3038 } 3039 3040 static void *cache_free_debugcheck(struct kmem_cache *cachep, void *objp, 3041 unsigned long caller) 3042 { 3043 struct page *page; 3044 unsigned int objnr; 3045 struct slab *slabp; 3046 3047 BUG_ON(virt_to_cache(objp) != cachep); 3048 3049 objp -= obj_offset(cachep); 3050 kfree_debugcheck(objp); 3051 page = virt_to_head_page(objp); 3052 3053 slabp = page->slab_page; 3054 3055 if (cachep->flags & SLAB_RED_ZONE) { 3056 verify_redzone_free(cachep, objp); 3057 *dbg_redzone1(cachep, objp) = RED_INACTIVE; 3058 *dbg_redzone2(cachep, objp) = RED_INACTIVE; 3059 } 3060 if (cachep->flags & SLAB_STORE_USER) 3061 *dbg_userword(cachep, objp) = (void *)caller; 3062 3063 objnr = obj_to_index(cachep, slabp, objp); 3064 3065 BUG_ON(objnr >= cachep->num); 3066 BUG_ON(objp != index_to_obj(cachep, slabp, objnr)); 3067 3068 #ifdef CONFIG_DEBUG_SLAB_LEAK 3069 slab_bufctl(slabp)[objnr] = BUFCTL_FREE; 3070 #endif 3071 if (cachep->flags & SLAB_POISON) { 3072 #ifdef CONFIG_DEBUG_PAGEALLOC 3073 if ((cachep->size % PAGE_SIZE)==0 && OFF_SLAB(cachep)) { 3074 store_stackinfo(cachep, objp, caller); 3075 kernel_map_pages(virt_to_page(objp), 3076 cachep->size / PAGE_SIZE, 0); 3077 } else { 3078 poison_obj(cachep, objp, POISON_FREE); 3079 } 3080 #else 3081 poison_obj(cachep, objp, POISON_FREE); 3082 #endif 3083 } 3084 return objp; 3085 } 3086 3087 static void check_slabp(struct kmem_cache *cachep, struct slab *slabp) 3088 { 3089 kmem_bufctl_t i; 3090 int entries = 0; 3091 3092 /* Check slab's freelist to see if this obj is there. */ 3093 for (i = slabp->free; i != BUFCTL_END; i = slab_bufctl(slabp)[i]) { 3094 entries++; 3095 if (entries > cachep->num || i >= cachep->num) 3096 goto bad; 3097 } 3098 if (entries != cachep->num - slabp->inuse) { 3099 bad: 3100 printk(KERN_ERR "slab: Internal list corruption detected in " 3101 "cache '%s'(%d), slabp %p(%d). Tainted(%s). 
Hexdump:\n", 3102 cachep->name, cachep->num, slabp, slabp->inuse, 3103 print_tainted()); 3104 print_hex_dump(KERN_ERR, "", DUMP_PREFIX_OFFSET, 16, 1, slabp, 3105 sizeof(*slabp) + cachep->num * sizeof(kmem_bufctl_t), 3106 1); 3107 BUG(); 3108 } 3109 } 3110 #else 3111 #define kfree_debugcheck(x) do { } while(0) 3112 #define cache_free_debugcheck(x,objp,z) (objp) 3113 #define check_slabp(x,y) do { } while(0) 3114 #endif 3115 3116 static void *cache_alloc_refill(struct kmem_cache *cachep, gfp_t flags, 3117 bool force_refill) 3118 { 3119 int batchcount; 3120 struct kmem_list3 *l3; 3121 struct array_cache *ac; 3122 int node; 3123 3124 check_irq_off(); 3125 node = numa_mem_id(); 3126 if (unlikely(force_refill)) 3127 goto force_grow; 3128 retry: 3129 ac = cpu_cache_get(cachep); 3130 batchcount = ac->batchcount; 3131 if (!ac->touched && batchcount > BATCHREFILL_LIMIT) { 3132 /* 3133 * If there was little recent activity on this cache, then 3134 * perform only a partial refill. Otherwise we could generate 3135 * refill bouncing. 3136 */ 3137 batchcount = BATCHREFILL_LIMIT; 3138 } 3139 l3 = cachep->nodelists[node]; 3140 3141 BUG_ON(ac->avail > 0 || !l3); 3142 spin_lock(&l3->list_lock); 3143 3144 /* See if we can refill from the shared array */ 3145 if (l3->shared && transfer_objects(ac, l3->shared, batchcount)) { 3146 l3->shared->touched = 1; 3147 goto alloc_done; 3148 } 3149 3150 while (batchcount > 0) { 3151 struct list_head *entry; 3152 struct slab *slabp; 3153 /* Get slab alloc is to come from. */ 3154 entry = l3->slabs_partial.next; 3155 if (entry == &l3->slabs_partial) { 3156 l3->free_touched = 1; 3157 entry = l3->slabs_free.next; 3158 if (entry == &l3->slabs_free) 3159 goto must_grow; 3160 } 3161 3162 slabp = list_entry(entry, struct slab, list); 3163 check_slabp(cachep, slabp); 3164 check_spinlock_acquired(cachep); 3165 3166 /* 3167 * The slab was either on partial or free list so 3168 * there must be at least one object available for 3169 * allocation. 3170 */ 3171 BUG_ON(slabp->inuse >= cachep->num); 3172 3173 while (slabp->inuse < cachep->num && batchcount--) { 3174 STATS_INC_ALLOCED(cachep); 3175 STATS_INC_ACTIVE(cachep); 3176 STATS_SET_HIGH(cachep); 3177 3178 ac_put_obj(cachep, ac, slab_get_obj(cachep, slabp, 3179 node)); 3180 } 3181 check_slabp(cachep, slabp); 3182 3183 /* move slabp to correct slabp list: */ 3184 list_del(&slabp->list); 3185 if (slabp->free == BUFCTL_END) 3186 list_add(&slabp->list, &l3->slabs_full); 3187 else 3188 list_add(&slabp->list, &l3->slabs_partial); 3189 } 3190 3191 must_grow: 3192 l3->free_objects -= ac->avail; 3193 alloc_done: 3194 spin_unlock(&l3->list_lock); 3195 3196 if (unlikely(!ac->avail)) { 3197 int x; 3198 force_grow: 3199 x = cache_grow(cachep, flags | GFP_THISNODE, node, NULL); 3200 3201 /* cache_grow can reenable interrupts, then ac could change. */ 3202 ac = cpu_cache_get(cachep); 3203 node = numa_mem_id(); 3204 3205 /* no objects in sight? abort */ 3206 if (!x && (ac->avail == 0 || force_refill)) 3207 return NULL; 3208 3209 if (!ac->avail) /* objects refilled by interrupt? 
*/ 3210 goto retry; 3211 } 3212 ac->touched = 1; 3213 3214 return ac_get_obj(cachep, ac, flags, force_refill); 3215 } 3216 3217 static inline void cache_alloc_debugcheck_before(struct kmem_cache *cachep, 3218 gfp_t flags) 3219 { 3220 might_sleep_if(flags & __GFP_WAIT); 3221 #if DEBUG 3222 kmem_flagcheck(cachep, flags); 3223 #endif 3224 } 3225 3226 #if DEBUG 3227 static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep, 3228 gfp_t flags, void *objp, unsigned long caller) 3229 { 3230 if (!objp) 3231 return objp; 3232 if (cachep->flags & SLAB_POISON) { 3233 #ifdef CONFIG_DEBUG_PAGEALLOC 3234 if ((cachep->size % PAGE_SIZE) == 0 && OFF_SLAB(cachep)) 3235 kernel_map_pages(virt_to_page(objp), 3236 cachep->size / PAGE_SIZE, 1); 3237 else 3238 check_poison_obj(cachep, objp); 3239 #else 3240 check_poison_obj(cachep, objp); 3241 #endif 3242 poison_obj(cachep, objp, POISON_INUSE); 3243 } 3244 if (cachep->flags & SLAB_STORE_USER) 3245 *dbg_userword(cachep, objp) = (void *)caller; 3246 3247 if (cachep->flags & SLAB_RED_ZONE) { 3248 if (*dbg_redzone1(cachep, objp) != RED_INACTIVE || 3249 *dbg_redzone2(cachep, objp) != RED_INACTIVE) { 3250 slab_error(cachep, "double free, or memory outside" 3251 " object was overwritten"); 3252 printk(KERN_ERR 3253 "%p: redzone 1:0x%llx, redzone 2:0x%llx\n", 3254 objp, *dbg_redzone1(cachep, objp), 3255 *dbg_redzone2(cachep, objp)); 3256 } 3257 *dbg_redzone1(cachep, objp) = RED_ACTIVE; 3258 *dbg_redzone2(cachep, objp) = RED_ACTIVE; 3259 } 3260 #ifdef CONFIG_DEBUG_SLAB_LEAK 3261 { 3262 struct slab *slabp; 3263 unsigned objnr; 3264 3265 slabp = virt_to_head_page(objp)->slab_page; 3266 objnr = (unsigned)(objp - slabp->s_mem) / cachep->size; 3267 slab_bufctl(slabp)[objnr] = BUFCTL_ACTIVE; 3268 } 3269 #endif 3270 objp += obj_offset(cachep); 3271 if (cachep->ctor && cachep->flags & SLAB_POISON) 3272 cachep->ctor(objp); 3273 if (ARCH_SLAB_MINALIGN && 3274 ((unsigned long)objp & (ARCH_SLAB_MINALIGN-1))) { 3275 printk(KERN_ERR "0x%p: not aligned to ARCH_SLAB_MINALIGN=%d\n", 3276 objp, (int)ARCH_SLAB_MINALIGN); 3277 } 3278 return objp; 3279 } 3280 #else 3281 #define cache_alloc_debugcheck_after(a,b,objp,d) (objp) 3282 #endif 3283 3284 static bool slab_should_failslab(struct kmem_cache *cachep, gfp_t flags) 3285 { 3286 if (cachep == kmem_cache) 3287 return false; 3288 3289 return should_failslab(cachep->object_size, flags, cachep->flags); 3290 } 3291 3292 static inline void *____cache_alloc(struct kmem_cache *cachep, gfp_t flags) 3293 { 3294 void *objp; 3295 struct array_cache *ac; 3296 bool force_refill = false; 3297 3298 check_irq_off(); 3299 3300 ac = cpu_cache_get(cachep); 3301 if (likely(ac->avail)) { 3302 ac->touched = 1; 3303 objp = ac_get_obj(cachep, ac, flags, false); 3304 3305 /* 3306 * Allow for the possibility all avail objects are not allowed 3307 * by the current flags 3308 */ 3309 if (objp) { 3310 STATS_INC_ALLOCHIT(cachep); 3311 goto out; 3312 } 3313 force_refill = true; 3314 } 3315 3316 STATS_INC_ALLOCMISS(cachep); 3317 objp = cache_alloc_refill(cachep, flags, force_refill); 3318 /* 3319 * the 'ac' may be updated by cache_alloc_refill(), 3320 * and kmemleak_erase() requires its correct value. 3321 */ 3322 ac = cpu_cache_get(cachep); 3323 3324 out: 3325 /* 3326 * To avoid a false negative, if an object that is in one of the 3327 * per-CPU caches is leaked, we need to make sure kmemleak doesn't 3328 * treat the array pointers as a reference to the object. 
3329 */ 3330 if (objp) 3331 kmemleak_erase(&ac->entry[ac->avail]); 3332 return objp; 3333 } 3334 3335 #ifdef CONFIG_NUMA 3336 /* 3337 * Try allocating on another node if PF_SPREAD_SLAB|PF_MEMPOLICY. 3338 * 3339 * If we are in_interrupt, then process context, including cpusets and 3340 * mempolicy, may not apply and should not be used for allocation policy. 3341 */ 3342 static void *alternate_node_alloc(struct kmem_cache *cachep, gfp_t flags) 3343 { 3344 int nid_alloc, nid_here; 3345 3346 if (in_interrupt() || (flags & __GFP_THISNODE)) 3347 return NULL; 3348 nid_alloc = nid_here = numa_mem_id(); 3349 if (cpuset_do_slab_mem_spread() && (cachep->flags & SLAB_MEM_SPREAD)) 3350 nid_alloc = cpuset_slab_spread_node(); 3351 else if (current->mempolicy) 3352 nid_alloc = slab_node(); 3353 if (nid_alloc != nid_here) 3354 return ____cache_alloc_node(cachep, flags, nid_alloc); 3355 return NULL; 3356 } 3357 3358 /* 3359 * Fallback function if there was no memory available and no objects on a 3360 * certain node and fall back is permitted. First we scan all the 3361 * available nodelists for available objects. If that fails then we 3362 * perform an allocation without specifying a node. This allows the page 3363 * allocator to do its reclaim / fallback magic. We then insert the 3364 * slab into the proper nodelist and then allocate from it. 3365 */ 3366 static void *fallback_alloc(struct kmem_cache *cache, gfp_t flags) 3367 { 3368 struct zonelist *zonelist; 3369 gfp_t local_flags; 3370 struct zoneref *z; 3371 struct zone *zone; 3372 enum zone_type high_zoneidx = gfp_zone(flags); 3373 void *obj = NULL; 3374 int nid; 3375 unsigned int cpuset_mems_cookie; 3376 3377 if (flags & __GFP_THISNODE) 3378 return NULL; 3379 3380 local_flags = flags & (GFP_CONSTRAINT_MASK|GFP_RECLAIM_MASK); 3381 3382 retry_cpuset: 3383 cpuset_mems_cookie = get_mems_allowed(); 3384 zonelist = node_zonelist(slab_node(), flags); 3385 3386 retry: 3387 /* 3388 * Look through allowed nodes for objects available 3389 * from existing per node queues. 3390 */ 3391 for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) { 3392 nid = zone_to_nid(zone); 3393 3394 if (cpuset_zone_allowed_hardwall(zone, flags) && 3395 cache->nodelists[nid] && 3396 cache->nodelists[nid]->free_objects) { 3397 obj = ____cache_alloc_node(cache, 3398 flags | GFP_THISNODE, nid); 3399 if (obj) 3400 break; 3401 } 3402 } 3403 3404 if (!obj) { 3405 /* 3406 * This allocation will be performed within the constraints 3407 * of the current cpuset / memory policy requirements. 3408 * We may trigger various forms of reclaim on the allowed 3409 * set and go into memory reserves if necessary. 3410 */ 3411 if (local_flags & __GFP_WAIT) 3412 local_irq_enable(); 3413 kmem_flagcheck(cache, flags); 3414 obj = kmem_getpages(cache, local_flags, numa_mem_id()); 3415 if (local_flags & __GFP_WAIT) 3416 local_irq_disable(); 3417 if (obj) { 3418 /* 3419 * Insert into the appropriate per node queues 3420 */ 3421 nid = page_to_nid(virt_to_page(obj)); 3422 if (cache_grow(cache, flags, nid, obj)) { 3423 obj = ____cache_alloc_node(cache, 3424 flags | GFP_THISNODE, nid); 3425 if (!obj) 3426 /* 3427 * Another processor may allocate the 3428 * objects in the slab since we are 3429 * not holding any locks. 
3430 */ 3431 goto retry; 3432 } else { 3433 /* cache_grow already freed obj */ 3434 obj = NULL; 3435 } 3436 } 3437 } 3438 3439 if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !obj)) 3440 goto retry_cpuset; 3441 return obj; 3442 } 3443 3444 /* 3445 * A interface to enable slab creation on nodeid 3446 */ 3447 static void *____cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, 3448 int nodeid) 3449 { 3450 struct list_head *entry; 3451 struct slab *slabp; 3452 struct kmem_list3 *l3; 3453 void *obj; 3454 int x; 3455 3456 l3 = cachep->nodelists[nodeid]; 3457 BUG_ON(!l3); 3458 3459 retry: 3460 check_irq_off(); 3461 spin_lock(&l3->list_lock); 3462 entry = l3->slabs_partial.next; 3463 if (entry == &l3->slabs_partial) { 3464 l3->free_touched = 1; 3465 entry = l3->slabs_free.next; 3466 if (entry == &l3->slabs_free) 3467 goto must_grow; 3468 } 3469 3470 slabp = list_entry(entry, struct slab, list); 3471 check_spinlock_acquired_node(cachep, nodeid); 3472 check_slabp(cachep, slabp); 3473 3474 STATS_INC_NODEALLOCS(cachep); 3475 STATS_INC_ACTIVE(cachep); 3476 STATS_SET_HIGH(cachep); 3477 3478 BUG_ON(slabp->inuse == cachep->num); 3479 3480 obj = slab_get_obj(cachep, slabp, nodeid); 3481 check_slabp(cachep, slabp); 3482 l3->free_objects--; 3483 /* move slabp to correct slabp list: */ 3484 list_del(&slabp->list); 3485 3486 if (slabp->free == BUFCTL_END) 3487 list_add(&slabp->list, &l3->slabs_full); 3488 else 3489 list_add(&slabp->list, &l3->slabs_partial); 3490 3491 spin_unlock(&l3->list_lock); 3492 goto done; 3493 3494 must_grow: 3495 spin_unlock(&l3->list_lock); 3496 x = cache_grow(cachep, flags | GFP_THISNODE, nodeid, NULL); 3497 if (x) 3498 goto retry; 3499 3500 return fallback_alloc(cachep, flags); 3501 3502 done: 3503 return obj; 3504 } 3505 3506 /** 3507 * kmem_cache_alloc_node - Allocate an object on the specified node 3508 * @cachep: The cache to allocate from. 3509 * @flags: See kmalloc(). 3510 * @nodeid: node number of the target node. 3511 * @caller: return address of caller, used for debug information 3512 * 3513 * Identical to kmem_cache_alloc but it will allocate memory on the given 3514 * node, which can improve the performance for cpu bound structures. 3515 * 3516 * Fallback to other node is possible if __GFP_THISNODE is not set. 3517 */ 3518 static __always_inline void * 3519 slab_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid, 3520 unsigned long caller) 3521 { 3522 unsigned long save_flags; 3523 void *ptr; 3524 int slab_node = numa_mem_id(); 3525 3526 flags &= gfp_allowed_mask; 3527 3528 lockdep_trace_alloc(flags); 3529 3530 if (slab_should_failslab(cachep, flags)) 3531 return NULL; 3532 3533 cache_alloc_debugcheck_before(cachep, flags); 3534 local_irq_save(save_flags); 3535 3536 if (nodeid == NUMA_NO_NODE) 3537 nodeid = slab_node; 3538 3539 if (unlikely(!cachep->nodelists[nodeid])) { 3540 /* Node not bootstrapped yet */ 3541 ptr = fallback_alloc(cachep, flags); 3542 goto out; 3543 } 3544 3545 if (nodeid == slab_node) { 3546 /* 3547 * Use the locally cached objects if possible. 3548 * However ____cache_alloc does not allow fallback 3549 * to other nodes. It may fail while we still have 3550 * objects on other nodes available. 
3551 */ 3552 ptr = ____cache_alloc(cachep, flags); 3553 if (ptr) 3554 goto out; 3555 } 3556 /* ___cache_alloc_node can fall back to other nodes */ 3557 ptr = ____cache_alloc_node(cachep, flags, nodeid); 3558 out: 3559 local_irq_restore(save_flags); 3560 ptr = cache_alloc_debugcheck_after(cachep, flags, ptr, caller); 3561 kmemleak_alloc_recursive(ptr, cachep->object_size, 1, cachep->flags, 3562 flags); 3563 3564 if (likely(ptr)) 3565 kmemcheck_slab_alloc(cachep, flags, ptr, cachep->object_size); 3566 3567 if (unlikely((flags & __GFP_ZERO) && ptr)) 3568 memset(ptr, 0, cachep->object_size); 3569 3570 return ptr; 3571 } 3572 3573 static __always_inline void * 3574 __do_cache_alloc(struct kmem_cache *cache, gfp_t flags) 3575 { 3576 void *objp; 3577 3578 if (unlikely(current->flags & (PF_SPREAD_SLAB | PF_MEMPOLICY))) { 3579 objp = alternate_node_alloc(cache, flags); 3580 if (objp) 3581 goto out; 3582 } 3583 objp = ____cache_alloc(cache, flags); 3584 3585 /* 3586 * We may just have run out of memory on the local node. 3587 * ____cache_alloc_node() knows how to locate memory on other nodes 3588 */ 3589 if (!objp) 3590 objp = ____cache_alloc_node(cache, flags, numa_mem_id()); 3591 3592 out: 3593 return objp; 3594 } 3595 #else 3596 3597 static __always_inline void * 3598 __do_cache_alloc(struct kmem_cache *cachep, gfp_t flags) 3599 { 3600 return ____cache_alloc(cachep, flags); 3601 } 3602 3603 #endif /* CONFIG_NUMA */ 3604 3605 static __always_inline void * 3606 slab_alloc(struct kmem_cache *cachep, gfp_t flags, unsigned long caller) 3607 { 3608 unsigned long save_flags; 3609 void *objp; 3610 3611 flags &= gfp_allowed_mask; 3612 3613 lockdep_trace_alloc(flags); 3614 3615 if (slab_should_failslab(cachep, flags)) 3616 return NULL; 3617 3618 cache_alloc_debugcheck_before(cachep, flags); 3619 local_irq_save(save_flags); 3620 objp = __do_cache_alloc(cachep, flags); 3621 local_irq_restore(save_flags); 3622 objp = cache_alloc_debugcheck_after(cachep, flags, objp, caller); 3623 kmemleak_alloc_recursive(objp, cachep->object_size, 1, cachep->flags, 3624 flags); 3625 prefetchw(objp); 3626 3627 if (likely(objp)) 3628 kmemcheck_slab_alloc(cachep, flags, objp, cachep->object_size); 3629 3630 if (unlikely((flags & __GFP_ZERO) && objp)) 3631 memset(objp, 0, cachep->object_size); 3632 3633 return objp; 3634 } 3635 3636 /* 3637 * Caller needs to acquire correct kmem_list's list_lock 3638 */ 3639 static void free_block(struct kmem_cache *cachep, void **objpp, int nr_objects, 3640 int node) 3641 { 3642 int i; 3643 struct kmem_list3 *l3; 3644 3645 for (i = 0; i < nr_objects; i++) { 3646 void *objp; 3647 struct slab *slabp; 3648 3649 clear_obj_pfmemalloc(&objpp[i]); 3650 objp = objpp[i]; 3651 3652 slabp = virt_to_slab(objp); 3653 l3 = cachep->nodelists[node]; 3654 list_del(&slabp->list); 3655 check_spinlock_acquired_node(cachep, node); 3656 check_slabp(cachep, slabp); 3657 slab_put_obj(cachep, slabp, objp, node); 3658 STATS_DEC_ACTIVE(cachep); 3659 l3->free_objects++; 3660 check_slabp(cachep, slabp); 3661 3662 /* fixup slab chains */ 3663 if (slabp->inuse == 0) { 3664 if (l3->free_objects > l3->free_limit) { 3665 l3->free_objects -= cachep->num; 3666 /* No need to drop any previously held 3667 * lock here, even if we have a off-slab slab 3668 * descriptor it is guaranteed to come from 3669 * a different cache, refer to comments before 3670 * alloc_slabmgmt. 
3671 */ 3672 slab_destroy(cachep, slabp); 3673 } else { 3674 list_add(&slabp->list, &l3->slabs_free); 3675 } 3676 } else { 3677 /* Unconditionally move a slab to the end of the 3678 * partial list on free - maximum time for the 3679 * other objects to be freed, too. 3680 */ 3681 list_add_tail(&slabp->list, &l3->slabs_partial); 3682 } 3683 } 3684 } 3685 3686 static void cache_flusharray(struct kmem_cache *cachep, struct array_cache *ac) 3687 { 3688 int batchcount; 3689 struct kmem_list3 *l3; 3690 int node = numa_mem_id(); 3691 3692 batchcount = ac->batchcount; 3693 #if DEBUG 3694 BUG_ON(!batchcount || batchcount > ac->avail); 3695 #endif 3696 check_irq_off(); 3697 l3 = cachep->nodelists[node]; 3698 spin_lock(&l3->list_lock); 3699 if (l3->shared) { 3700 struct array_cache *shared_array = l3->shared; 3701 int max = shared_array->limit - shared_array->avail; 3702 if (max) { 3703 if (batchcount > max) 3704 batchcount = max; 3705 memcpy(&(shared_array->entry[shared_array->avail]), 3706 ac->entry, sizeof(void *) * batchcount); 3707 shared_array->avail += batchcount; 3708 goto free_done; 3709 } 3710 } 3711 3712 free_block(cachep, ac->entry, batchcount, node); 3713 free_done: 3714 #if STATS 3715 { 3716 int i = 0; 3717 struct list_head *p; 3718 3719 p = l3->slabs_free.next; 3720 while (p != &(l3->slabs_free)) { 3721 struct slab *slabp; 3722 3723 slabp = list_entry(p, struct slab, list); 3724 BUG_ON(slabp->inuse); 3725 3726 i++; 3727 p = p->next; 3728 } 3729 STATS_SET_FREEABLE(cachep, i); 3730 } 3731 #endif 3732 spin_unlock(&l3->list_lock); 3733 ac->avail -= batchcount; 3734 memmove(ac->entry, &(ac->entry[batchcount]), sizeof(void *)*ac->avail); 3735 } 3736 3737 /* 3738 * Release an obj back to its cache. If the obj has a constructed state, it must 3739 * be in this state _before_ it is released. Called with disabled ints. 3740 */ 3741 static inline void __cache_free(struct kmem_cache *cachep, void *objp, 3742 unsigned long caller) 3743 { 3744 struct array_cache *ac = cpu_cache_get(cachep); 3745 3746 check_irq_off(); 3747 kmemleak_free_recursive(objp, cachep->flags); 3748 objp = cache_free_debugcheck(cachep, objp, caller); 3749 3750 kmemcheck_slab_free(cachep, objp, cachep->object_size); 3751 3752 /* 3753 * Skip calling cache_free_alien() when the platform is not numa. 3754 * This will avoid cache misses that happen while accessing slabp (which 3755 * is per page memory reference) to get nodeid. Instead use a global 3756 * variable to skip the call, which is mostly likely to be present in 3757 * the cache. 3758 */ 3759 if (nr_online_nodes > 1 && cache_free_alien(cachep, objp)) 3760 return; 3761 3762 if (likely(ac->avail < ac->limit)) { 3763 STATS_INC_FREEHIT(cachep); 3764 } else { 3765 STATS_INC_FREEMISS(cachep); 3766 cache_flusharray(cachep, ac); 3767 } 3768 3769 ac_put_obj(cachep, ac, objp); 3770 } 3771 3772 /** 3773 * kmem_cache_alloc - Allocate an object 3774 * @cachep: The cache to allocate from. 3775 * @flags: See kmalloc(). 3776 * 3777 * Allocate an object from this cache. The flags are only relevant 3778 * if the cache has no available objects. 
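 *
 * Minimal usage sketch (the cache and struct names are purely
 * illustrative):
 *
 *	struct kmem_cache *foo_cache =
 *		kmem_cache_create("foo_cache", sizeof(struct foo), 0,
 *				  SLAB_HWCACHE_ALIGN, NULL);
 *	struct foo *p = kmem_cache_alloc(foo_cache, GFP_KERNEL);
 *	...
 *	kmem_cache_free(foo_cache, p);
 *	kmem_cache_destroy(foo_cache);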
3779 */ 3780 void *kmem_cache_alloc(struct kmem_cache *cachep, gfp_t flags) 3781 { 3782 void *ret = slab_alloc(cachep, flags, _RET_IP_); 3783 3784 trace_kmem_cache_alloc(_RET_IP_, ret, 3785 cachep->object_size, cachep->size, flags); 3786 3787 return ret; 3788 } 3789 EXPORT_SYMBOL(kmem_cache_alloc); 3790 3791 #ifdef CONFIG_TRACING 3792 void * 3793 kmem_cache_alloc_trace(struct kmem_cache *cachep, gfp_t flags, size_t size) 3794 { 3795 void *ret; 3796 3797 ret = slab_alloc(cachep, flags, _RET_IP_); 3798 3799 trace_kmalloc(_RET_IP_, ret, 3800 size, cachep->size, flags); 3801 return ret; 3802 } 3803 EXPORT_SYMBOL(kmem_cache_alloc_trace); 3804 #endif 3805 3806 #ifdef CONFIG_NUMA 3807 void *kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid) 3808 { 3809 void *ret = slab_alloc_node(cachep, flags, nodeid, _RET_IP_); 3810 3811 trace_kmem_cache_alloc_node(_RET_IP_, ret, 3812 cachep->object_size, cachep->size, 3813 flags, nodeid); 3814 3815 return ret; 3816 } 3817 EXPORT_SYMBOL(kmem_cache_alloc_node); 3818 3819 #ifdef CONFIG_TRACING 3820 void *kmem_cache_alloc_node_trace(struct kmem_cache *cachep, 3821 gfp_t flags, 3822 int nodeid, 3823 size_t size) 3824 { 3825 void *ret; 3826 3827 ret = slab_alloc_node(cachep, flags, nodeid, _RET_IP_); 3828 3829 trace_kmalloc_node(_RET_IP_, ret, 3830 size, cachep->size, 3831 flags, nodeid); 3832 return ret; 3833 } 3834 EXPORT_SYMBOL(kmem_cache_alloc_node_trace); 3835 #endif 3836 3837 static __always_inline void * 3838 __do_kmalloc_node(size_t size, gfp_t flags, int node, unsigned long caller) 3839 { 3840 struct kmem_cache *cachep; 3841 3842 cachep = kmem_find_general_cachep(size, flags); 3843 if (unlikely(ZERO_OR_NULL_PTR(cachep))) 3844 return cachep; 3845 return kmem_cache_alloc_node_trace(cachep, flags, node, size); 3846 } 3847 3848 #if defined(CONFIG_DEBUG_SLAB) || defined(CONFIG_TRACING) 3849 void *__kmalloc_node(size_t size, gfp_t flags, int node) 3850 { 3851 return __do_kmalloc_node(size, flags, node, _RET_IP_); 3852 } 3853 EXPORT_SYMBOL(__kmalloc_node); 3854 3855 void *__kmalloc_node_track_caller(size_t size, gfp_t flags, 3856 int node, unsigned long caller) 3857 { 3858 return __do_kmalloc_node(size, flags, node, caller); 3859 } 3860 EXPORT_SYMBOL(__kmalloc_node_track_caller); 3861 #else 3862 void *__kmalloc_node(size_t size, gfp_t flags, int node) 3863 { 3864 return __do_kmalloc_node(size, flags, node, 0); 3865 } 3866 EXPORT_SYMBOL(__kmalloc_node); 3867 #endif /* CONFIG_DEBUG_SLAB || CONFIG_TRACING */ 3868 #endif /* CONFIG_NUMA */ 3869 3870 /** 3871 * __do_kmalloc - allocate memory 3872 * @size: how many bytes of memory are required. 3873 * @flags: the type of memory to allocate (see kmalloc). 3874 * @caller: function caller for debug tracking of the caller 3875 */ 3876 static __always_inline void *__do_kmalloc(size_t size, gfp_t flags, 3877 unsigned long caller) 3878 { 3879 struct kmem_cache *cachep; 3880 void *ret; 3881 3882 /* If you want to save a few bytes .text space: replace 3883 * __ with kmem_. 3884 * Then kmalloc uses the uninlined functions instead of the inline 3885 * functions. 
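 *
 * (Illustrative example: kmalloc(100, GFP_KERNEL) is served from the
 * general 128-byte cache that __find_general_cachep() returns below;
 * the exact size classes depend on the architecture.)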
3886 */ 3887 cachep = __find_general_cachep(size, flags); 3888 if (unlikely(ZERO_OR_NULL_PTR(cachep))) 3889 return cachep; 3890 ret = slab_alloc(cachep, flags, caller); 3891 3892 trace_kmalloc(caller, ret, 3893 size, cachep->size, flags); 3894 3895 return ret; 3896 } 3897 3898 3899 #if defined(CONFIG_DEBUG_SLAB) || defined(CONFIG_TRACING) 3900 void *__kmalloc(size_t size, gfp_t flags) 3901 { 3902 return __do_kmalloc(size, flags, _RET_IP_); 3903 } 3904 EXPORT_SYMBOL(__kmalloc); 3905 3906 void *__kmalloc_track_caller(size_t size, gfp_t flags, unsigned long caller) 3907 { 3908 return __do_kmalloc(size, flags, caller); 3909 } 3910 EXPORT_SYMBOL(__kmalloc_track_caller); 3911 3912 #else 3913 void *__kmalloc(size_t size, gfp_t flags) 3914 { 3915 return __do_kmalloc(size, flags, 0); 3916 } 3917 EXPORT_SYMBOL(__kmalloc); 3918 #endif 3919 3920 /** 3921 * kmem_cache_free - Deallocate an object 3922 * @cachep: The cache the allocation was from. 3923 * @objp: The previously allocated object. 3924 * 3925 * Free an object which was previously allocated from this 3926 * cache. 3927 */ 3928 void kmem_cache_free(struct kmem_cache *cachep, void *objp) 3929 { 3930 unsigned long flags; 3931 3932 local_irq_save(flags); 3933 debug_check_no_locks_freed(objp, cachep->object_size); 3934 if (!(cachep->flags & SLAB_DEBUG_OBJECTS)) 3935 debug_check_no_obj_freed(objp, cachep->object_size); 3936 __cache_free(cachep, objp, _RET_IP_); 3937 local_irq_restore(flags); 3938 3939 trace_kmem_cache_free(_RET_IP_, objp); 3940 } 3941 EXPORT_SYMBOL(kmem_cache_free); 3942 3943 /** 3944 * kfree - free previously allocated memory 3945 * @objp: pointer returned by kmalloc. 3946 * 3947 * If @objp is NULL, no operation is performed. 3948 * 3949 * Don't free memory not originally allocated by kmalloc() 3950 * or you will run into trouble. 3951 */ 3952 void kfree(const void *objp) 3953 { 3954 struct kmem_cache *c; 3955 unsigned long flags; 3956 3957 trace_kfree(_RET_IP_, objp); 3958 3959 if (unlikely(ZERO_OR_NULL_PTR(objp))) 3960 return; 3961 local_irq_save(flags); 3962 kfree_debugcheck(objp); 3963 c = virt_to_cache(objp); 3964 debug_check_no_locks_freed(objp, c->object_size); 3965 3966 debug_check_no_obj_freed(objp, c->object_size); 3967 __cache_free(c, (void *)objp, _RET_IP_); 3968 local_irq_restore(flags); 3969 } 3970 EXPORT_SYMBOL(kfree); 3971 3972 unsigned int kmem_cache_size(struct kmem_cache *cachep) 3973 { 3974 return cachep->object_size; 3975 } 3976 EXPORT_SYMBOL(kmem_cache_size); 3977 3978 /* 3979 * This initializes kmem_list3 or resizes various caches for all nodes. 
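 *
 * For each online node: new alien and shared array_caches are allocated
 * (sized from the cache's tunables) and installed under the node's
 * list_lock (the alien caches only if none are installed yet), and any
 * objects still held in the old shared array are freed back to the slab
 * lists. A node without a kmem_list3 gets one allocated and initialized
 * here. On allocation failure the nodelists set up so far are torn down
 * again, but only if the cache has not yet been linked into the cache
 * chain.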
3980 */ 3981 static int alloc_kmemlist(struct kmem_cache *cachep, gfp_t gfp) 3982 { 3983 int node; 3984 struct kmem_list3 *l3; 3985 struct array_cache *new_shared; 3986 struct array_cache **new_alien = NULL; 3987 3988 for_each_online_node(node) { 3989 3990 if (use_alien_caches) { 3991 new_alien = alloc_alien_cache(node, cachep->limit, gfp); 3992 if (!new_alien) 3993 goto fail; 3994 } 3995 3996 new_shared = NULL; 3997 if (cachep->shared) { 3998 new_shared = alloc_arraycache(node, 3999 cachep->shared*cachep->batchcount, 4000 0xbaadf00d, gfp); 4001 if (!new_shared) { 4002 free_alien_cache(new_alien); 4003 goto fail; 4004 } 4005 } 4006 4007 l3 = cachep->nodelists[node]; 4008 if (l3) { 4009 struct array_cache *shared = l3->shared; 4010 4011 spin_lock_irq(&l3->list_lock); 4012 4013 if (shared) 4014 free_block(cachep, shared->entry, 4015 shared->avail, node); 4016 4017 l3->shared = new_shared; 4018 if (!l3->alien) { 4019 l3->alien = new_alien; 4020 new_alien = NULL; 4021 } 4022 l3->free_limit = (1 + nr_cpus_node(node)) * 4023 cachep->batchcount + cachep->num; 4024 spin_unlock_irq(&l3->list_lock); 4025 kfree(shared); 4026 free_alien_cache(new_alien); 4027 continue; 4028 } 4029 l3 = kmalloc_node(sizeof(struct kmem_list3), gfp, node); 4030 if (!l3) { 4031 free_alien_cache(new_alien); 4032 kfree(new_shared); 4033 goto fail; 4034 } 4035 4036 kmem_list3_init(l3); 4037 l3->next_reap = jiffies + REAPTIMEOUT_LIST3 + 4038 ((unsigned long)cachep) % REAPTIMEOUT_LIST3; 4039 l3->shared = new_shared; 4040 l3->alien = new_alien; 4041 l3->free_limit = (1 + nr_cpus_node(node)) * 4042 cachep->batchcount + cachep->num; 4043 cachep->nodelists[node] = l3; 4044 } 4045 return 0; 4046 4047 fail: 4048 if (!cachep->list.next) { 4049 /* Cache is not active yet. Roll back what we did */ 4050 node--; 4051 while (node >= 0) { 4052 if (cachep->nodelists[node]) { 4053 l3 = cachep->nodelists[node]; 4054 4055 kfree(l3->shared); 4056 free_alien_cache(l3->alien); 4057 kfree(l3); 4058 cachep->nodelists[node] = NULL; 4059 } 4060 node--; 4061 } 4062 } 4063 return -ENOMEM; 4064 } 4065 4066 struct ccupdate_struct { 4067 struct kmem_cache *cachep; 4068 struct array_cache *new[0]; 4069 }; 4070 4071 static void do_ccupdate_local(void *info) 4072 { 4073 struct ccupdate_struct *new = info; 4074 struct array_cache *old; 4075 4076 check_irq_off(); 4077 old = cpu_cache_get(new->cachep); 4078 4079 new->cachep->array[smp_processor_id()] = new->new[smp_processor_id()]; 4080 new->new[smp_processor_id()] = old; 4081 } 4082 4083 /* Always called with the slab_mutex held */ 4084 static int do_tune_cpucache(struct kmem_cache *cachep, int limit, 4085 int batchcount, int shared, gfp_t gfp) 4086 { 4087 struct ccupdate_struct *new; 4088 int i; 4089 4090 new = kzalloc(sizeof(*new) + nr_cpu_ids * sizeof(struct array_cache *), 4091 gfp); 4092 if (!new) 4093 return -ENOMEM; 4094 4095 for_each_online_cpu(i) { 4096 new->new[i] = alloc_arraycache(cpu_to_mem(i), limit, 4097 batchcount, gfp); 4098 if (!new->new[i]) { 4099 for (i--; i >= 0; i--) 4100 kfree(new->new[i]); 4101 kfree(new); 4102 return -ENOMEM; 4103 } 4104 } 4105 new->cachep = cachep; 4106 4107 on_each_cpu(do_ccupdate_local, (void *)new, 1); 4108 4109 check_irq_on(); 4110 cachep->batchcount = batchcount; 4111 cachep->limit = limit; 4112 cachep->shared = shared; 4113 4114 for_each_online_cpu(i) { 4115 struct array_cache *ccold = new->new[i]; 4116 if (!ccold) 4117 continue; 4118 spin_lock_irq(&cachep->nodelists[cpu_to_mem(i)]->list_lock); 4119 free_block(cachep, ccold->entry, ccold->avail, cpu_to_mem(i)); 
		spin_unlock_irq(&cachep->nodelists[cpu_to_mem(i)]->list_lock);
		kfree(ccold);
	}
	kfree(new);
	return alloc_kmemlist(cachep, gfp);
}

/* Called with slab_mutex held always */
static int enable_cpucache(struct kmem_cache *cachep, gfp_t gfp)
{
	int err;
	int limit, shared;

	/*
	 * The head array serves three purposes:
	 * - create a LIFO ordering, i.e. return objects that are cache-warm
	 * - reduce the number of spinlock operations.
	 * - reduce the number of linked list operations on the slab and
	 *   bufctl chains: array operations are cheaper.
	 * The numbers are guessed, we should auto-tune as described by
	 * Bonwick.
	 */
	if (cachep->size > 131072)
		limit = 1;
	else if (cachep->size > PAGE_SIZE)
		limit = 8;
	else if (cachep->size > 1024)
		limit = 24;
	else if (cachep->size > 256)
		limit = 54;
	else
		limit = 120;

	/*
	 * CPU bound tasks (e.g. network routing) can exhibit CPU bound
	 * allocation behaviour: most allocs on one cpu, most free operations
	 * on another cpu. For these cases, an efficient object passing
	 * between cpus is necessary. This is provided by a shared array. The
	 * array replaces Bonwick's magazine layer.
	 * On uniprocessor, it's functionally equivalent (but less efficient)
	 * to a larger limit. Thus it is disabled by default.
	 */
	shared = 0;
	if (cachep->size <= PAGE_SIZE && num_possible_cpus() > 1)
		shared = 8;

#if DEBUG
	/*
	 * With debugging enabled, a large batchcount leads to excessively
	 * long periods with local interrupts disabled. Limit the batchcount.
	 */
	if (limit > 32)
		limit = 32;
#endif
	err = do_tune_cpucache(cachep, limit, (limit + 1) / 2, shared, gfp);
	if (err)
		printk(KERN_ERR "enable_cpucache failed for %s, error %d.\n",
		       cachep->name, -err);
	return err;
}

/*
 * Drain an array if it contains any elements, taking the l3 lock only if
 * necessary. Note that the l3 list_lock also protects the array_cache
 * if drain_array() is used on the shared array.
 */
static void drain_array(struct kmem_cache *cachep, struct kmem_list3 *l3,
			 struct array_cache *ac, int force, int node)
{
	int tofree;

	if (!ac || !ac->avail)
		return;
	if (ac->touched && !force) {
		ac->touched = 0;
	} else {
		spin_lock_irq(&l3->list_lock);
		if (ac->avail) {
			tofree = force ? ac->avail : (ac->limit + 4) / 5;
			if (tofree > ac->avail)
				tofree = (ac->avail + 1) / 2;
			free_block(cachep, ac->entry, tofree, node);
			ac->avail -= tofree;
			memmove(ac->entry, &(ac->entry[tofree]),
				sizeof(void *) * ac->avail);
		}
		spin_unlock_irq(&l3->list_lock);
	}
}

/**
 * cache_reap - Reclaim memory from caches.
 * @w: work descriptor
 *
 * Called from workqueue/eventd every few seconds.
 * Purpose:
 * - clear the per-cpu caches for this CPU.
 * - return freeable pages to the main free memory pool.
 *
 * If we cannot acquire the cache chain mutex then just give up - we'll try
 * again on the next iteration.
 */
static void cache_reap(struct work_struct *w)
{
	struct kmem_cache *searchp;
	struct kmem_list3 *l3;
	int node = numa_mem_id();
	struct delayed_work *work = to_delayed_work(w);

	if (!mutex_trylock(&slab_mutex))
		/* Give up. Set up the next iteration.
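		 * Someone else holds slab_mutex (e.g. cache creation/teardown
		 * or another CPU's reaper); the work is simply rescheduled
		 * at the out: label below.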
*/ 4231 goto out; 4232 4233 list_for_each_entry(searchp, &slab_caches, list) { 4234 check_irq_on(); 4235 4236 /* 4237 * We only take the l3 lock if absolutely necessary and we 4238 * have established with reasonable certainty that 4239 * we can do some work if the lock was obtained. 4240 */ 4241 l3 = searchp->nodelists[node]; 4242 4243 reap_alien(searchp, l3); 4244 4245 drain_array(searchp, l3, cpu_cache_get(searchp), 0, node); 4246 4247 /* 4248 * These are racy checks but it does not matter 4249 * if we skip one check or scan twice. 4250 */ 4251 if (time_after(l3->next_reap, jiffies)) 4252 goto next; 4253 4254 l3->next_reap = jiffies + REAPTIMEOUT_LIST3; 4255 4256 drain_array(searchp, l3, l3->shared, 0, node); 4257 4258 if (l3->free_touched) 4259 l3->free_touched = 0; 4260 else { 4261 int freed; 4262 4263 freed = drain_freelist(searchp, l3, (l3->free_limit + 4264 5 * searchp->num - 1) / (5 * searchp->num)); 4265 STATS_ADD_REAPED(searchp, freed); 4266 } 4267 next: 4268 cond_resched(); 4269 } 4270 check_irq_on(); 4271 mutex_unlock(&slab_mutex); 4272 next_reap_node(); 4273 out: 4274 /* Set up the next iteration */ 4275 schedule_delayed_work(work, round_jiffies_relative(REAPTIMEOUT_CPUC)); 4276 } 4277 4278 #ifdef CONFIG_SLABINFO 4279 4280 static void print_slabinfo_header(struct seq_file *m) 4281 { 4282 /* 4283 * Output format version, so at least we can change it 4284 * without _too_ many complaints. 4285 */ 4286 #if STATS 4287 seq_puts(m, "slabinfo - version: 2.1 (statistics)\n"); 4288 #else 4289 seq_puts(m, "slabinfo - version: 2.1\n"); 4290 #endif 4291 seq_puts(m, "# name <active_objs> <num_objs> <objsize> " 4292 "<objperslab> <pagesperslab>"); 4293 seq_puts(m, " : tunables <limit> <batchcount> <sharedfactor>"); 4294 seq_puts(m, " : slabdata <active_slabs> <num_slabs> <sharedavail>"); 4295 #if STATS 4296 seq_puts(m, " : globalstat <listallocs> <maxobjs> <grown> <reaped> " 4297 "<error> <maxfreeable> <nodeallocs> <remotefrees> <alienoverflow>"); 4298 seq_puts(m, " : cpustat <allochit> <allocmiss> <freehit> <freemiss>"); 4299 #endif 4300 seq_putc(m, '\n'); 4301 } 4302 4303 static void *s_start(struct seq_file *m, loff_t *pos) 4304 { 4305 loff_t n = *pos; 4306 4307 mutex_lock(&slab_mutex); 4308 if (!n) 4309 print_slabinfo_header(m); 4310 4311 return seq_list_start(&slab_caches, *pos); 4312 } 4313 4314 static void *s_next(struct seq_file *m, void *p, loff_t *pos) 4315 { 4316 return seq_list_next(p, &slab_caches, pos); 4317 } 4318 4319 static void s_stop(struct seq_file *m, void *p) 4320 { 4321 mutex_unlock(&slab_mutex); 4322 } 4323 4324 static int s_show(struct seq_file *m, void *p) 4325 { 4326 struct kmem_cache *cachep = list_entry(p, struct kmem_cache, list); 4327 struct slab *slabp; 4328 unsigned long active_objs; 4329 unsigned long num_objs; 4330 unsigned long active_slabs = 0; 4331 unsigned long num_slabs, free_objects = 0, shared_avail = 0; 4332 const char *name; 4333 char *error = NULL; 4334 int node; 4335 struct kmem_list3 *l3; 4336 4337 active_objs = 0; 4338 num_slabs = 0; 4339 for_each_online_node(node) { 4340 l3 = cachep->nodelists[node]; 4341 if (!l3) 4342 continue; 4343 4344 check_irq_on(); 4345 spin_lock_irq(&l3->list_lock); 4346 4347 list_for_each_entry(slabp, &l3->slabs_full, list) { 4348 if (slabp->inuse != cachep->num && !error) 4349 error = "slabs_full accounting error"; 4350 active_objs += cachep->num; 4351 active_slabs++; 4352 } 4353 list_for_each_entry(slabp, &l3->slabs_partial, list) { 4354 if (slabp->inuse == cachep->num && !error) 4355 error = "slabs_partial inuse 
accounting error"; 4356 if (!slabp->inuse && !error) 4357 error = "slabs_partial/inuse accounting error"; 4358 active_objs += slabp->inuse; 4359 active_slabs++; 4360 } 4361 list_for_each_entry(slabp, &l3->slabs_free, list) { 4362 if (slabp->inuse && !error) 4363 error = "slabs_free/inuse accounting error"; 4364 num_slabs++; 4365 } 4366 free_objects += l3->free_objects; 4367 if (l3->shared) 4368 shared_avail += l3->shared->avail; 4369 4370 spin_unlock_irq(&l3->list_lock); 4371 } 4372 num_slabs += active_slabs; 4373 num_objs = num_slabs * cachep->num; 4374 if (num_objs - active_objs != free_objects && !error) 4375 error = "free_objects accounting error"; 4376 4377 name = cachep->name; 4378 if (error) 4379 printk(KERN_ERR "slab: cache %s error: %s\n", name, error); 4380 4381 seq_printf(m, "%-17s %6lu %6lu %6u %4u %4d", 4382 name, active_objs, num_objs, cachep->size, 4383 cachep->num, (1 << cachep->gfporder)); 4384 seq_printf(m, " : tunables %4u %4u %4u", 4385 cachep->limit, cachep->batchcount, cachep->shared); 4386 seq_printf(m, " : slabdata %6lu %6lu %6lu", 4387 active_slabs, num_slabs, shared_avail); 4388 #if STATS 4389 { /* list3 stats */ 4390 unsigned long high = cachep->high_mark; 4391 unsigned long allocs = cachep->num_allocations; 4392 unsigned long grown = cachep->grown; 4393 unsigned long reaped = cachep->reaped; 4394 unsigned long errors = cachep->errors; 4395 unsigned long max_freeable = cachep->max_freeable; 4396 unsigned long node_allocs = cachep->node_allocs; 4397 unsigned long node_frees = cachep->node_frees; 4398 unsigned long overflows = cachep->node_overflow; 4399 4400 seq_printf(m, " : globalstat %7lu %6lu %5lu %4lu " 4401 "%4lu %4lu %4lu %4lu %4lu", 4402 allocs, high, grown, 4403 reaped, errors, max_freeable, node_allocs, 4404 node_frees, overflows); 4405 } 4406 /* cpu stats */ 4407 { 4408 unsigned long allochit = atomic_read(&cachep->allochit); 4409 unsigned long allocmiss = atomic_read(&cachep->allocmiss); 4410 unsigned long freehit = atomic_read(&cachep->freehit); 4411 unsigned long freemiss = atomic_read(&cachep->freemiss); 4412 4413 seq_printf(m, " : cpustat %6lu %6lu %6lu %6lu", 4414 allochit, allocmiss, freehit, freemiss); 4415 } 4416 #endif 4417 seq_putc(m, '\n'); 4418 return 0; 4419 } 4420 4421 /* 4422 * slabinfo_op - iterator that generates /proc/slabinfo 4423 * 4424 * Output layout: 4425 * cache-name 4426 * num-active-objs 4427 * total-objs 4428 * object size 4429 * num-active-slabs 4430 * total-slabs 4431 * num-pages-per-slab 4432 * + further values on SMP and with statistics enabled 4433 */ 4434 4435 static const struct seq_operations slabinfo_op = { 4436 .start = s_start, 4437 .next = s_next, 4438 .stop = s_stop, 4439 .show = s_show, 4440 }; 4441 4442 #define MAX_SLABINFO_WRITE 128 4443 /** 4444 * slabinfo_write - Tuning for the slab allocator 4445 * @file: unused 4446 * @buffer: user buffer 4447 * @count: data length 4448 * @ppos: unused 4449 */ 4450 static ssize_t slabinfo_write(struct file *file, const char __user *buffer, 4451 size_t count, loff_t *ppos) 4452 { 4453 char kbuf[MAX_SLABINFO_WRITE + 1], *tmp; 4454 int limit, batchcount, shared, res; 4455 struct kmem_cache *cachep; 4456 4457 if (count > MAX_SLABINFO_WRITE) 4458 return -EINVAL; 4459 if (copy_from_user(&kbuf, buffer, count)) 4460 return -EFAULT; 4461 kbuf[MAX_SLABINFO_WRITE] = '\0'; 4462 4463 tmp = strchr(kbuf, ' '); 4464 if (!tmp) 4465 return -EINVAL; 4466 *tmp = '\0'; 4467 tmp++; 4468 if (sscanf(tmp, " %d %d %d", &limit, &batchcount, &shared) != 3) 4469 return -EINVAL; 4470 4471 /* Find the 
cache in the chain of caches. */ 4472 mutex_lock(&slab_mutex); 4473 res = -EINVAL; 4474 list_for_each_entry(cachep, &slab_caches, list) { 4475 if (!strcmp(cachep->name, kbuf)) { 4476 if (limit < 1 || batchcount < 1 || 4477 batchcount > limit || shared < 0) { 4478 res = 0; 4479 } else { 4480 res = do_tune_cpucache(cachep, limit, 4481 batchcount, shared, 4482 GFP_KERNEL); 4483 } 4484 break; 4485 } 4486 } 4487 mutex_unlock(&slab_mutex); 4488 if (res >= 0) 4489 res = count; 4490 return res; 4491 } 4492 4493 static int slabinfo_open(struct inode *inode, struct file *file) 4494 { 4495 return seq_open(file, &slabinfo_op); 4496 } 4497 4498 static const struct file_operations proc_slabinfo_operations = { 4499 .open = slabinfo_open, 4500 .read = seq_read, 4501 .write = slabinfo_write, 4502 .llseek = seq_lseek, 4503 .release = seq_release, 4504 }; 4505 4506 #ifdef CONFIG_DEBUG_SLAB_LEAK 4507 4508 static void *leaks_start(struct seq_file *m, loff_t *pos) 4509 { 4510 mutex_lock(&slab_mutex); 4511 return seq_list_start(&slab_caches, *pos); 4512 } 4513 4514 static inline int add_caller(unsigned long *n, unsigned long v) 4515 { 4516 unsigned long *p; 4517 int l; 4518 if (!v) 4519 return 1; 4520 l = n[1]; 4521 p = n + 2; 4522 while (l) { 4523 int i = l/2; 4524 unsigned long *q = p + 2 * i; 4525 if (*q == v) { 4526 q[1]++; 4527 return 1; 4528 } 4529 if (*q > v) { 4530 l = i; 4531 } else { 4532 p = q + 2; 4533 l -= i + 1; 4534 } 4535 } 4536 if (++n[1] == n[0]) 4537 return 0; 4538 memmove(p + 2, p, n[1] * 2 * sizeof(unsigned long) - ((void *)p - (void *)n)); 4539 p[0] = v; 4540 p[1] = 1; 4541 return 1; 4542 } 4543 4544 static void handle_slab(unsigned long *n, struct kmem_cache *c, struct slab *s) 4545 { 4546 void *p; 4547 int i; 4548 if (n[0] == n[1]) 4549 return; 4550 for (i = 0, p = s->s_mem; i < c->num; i++, p += c->size) { 4551 if (slab_bufctl(s)[i] != BUFCTL_ACTIVE) 4552 continue; 4553 if (!add_caller(n, (unsigned long)*dbg_userword(c, p))) 4554 return; 4555 } 4556 } 4557 4558 static void show_symbol(struct seq_file *m, unsigned long address) 4559 { 4560 #ifdef CONFIG_KALLSYMS 4561 unsigned long offset, size; 4562 char modname[MODULE_NAME_LEN], name[KSYM_NAME_LEN]; 4563 4564 if (lookup_symbol_attrs(address, &size, &offset, modname, name) == 0) { 4565 seq_printf(m, "%s+%#lx/%#lx", name, offset, size); 4566 if (modname[0]) 4567 seq_printf(m, " [%s]", modname); 4568 return; 4569 } 4570 #endif 4571 seq_printf(m, "%p", (void *)address); 4572 } 4573 4574 static int leaks_show(struct seq_file *m, void *p) 4575 { 4576 struct kmem_cache *cachep = list_entry(p, struct kmem_cache, list); 4577 struct slab *slabp; 4578 struct kmem_list3 *l3; 4579 const char *name; 4580 unsigned long *n = m->private; 4581 int node; 4582 int i; 4583 4584 if (!(cachep->flags & SLAB_STORE_USER)) 4585 return 0; 4586 if (!(cachep->flags & SLAB_RED_ZONE)) 4587 return 0; 4588 4589 /* OK, we can do it */ 4590 4591 n[1] = 0; 4592 4593 for_each_online_node(node) { 4594 l3 = cachep->nodelists[node]; 4595 if (!l3) 4596 continue; 4597 4598 check_irq_on(); 4599 spin_lock_irq(&l3->list_lock); 4600 4601 list_for_each_entry(slabp, &l3->slabs_full, list) 4602 handle_slab(n, cachep, slabp); 4603 list_for_each_entry(slabp, &l3->slabs_partial, list) 4604 handle_slab(n, cachep, slabp); 4605 spin_unlock_irq(&l3->list_lock); 4606 } 4607 name = cachep->name; 4608 if (n[0] == n[1]) { 4609 /* Increase the buffer size */ 4610 mutex_unlock(&slab_mutex); 4611 m->private = kzalloc(n[0] * 4 * sizeof(unsigned long), GFP_KERNEL); 4612 if (!m->private) { 4613 /* Too 
bad, we are really out */ 4614 m->private = n; 4615 mutex_lock(&slab_mutex); 4616 return -ENOMEM; 4617 } 4618 *(unsigned long *)m->private = n[0] * 2; 4619 kfree(n); 4620 mutex_lock(&slab_mutex); 4621 /* Now make sure this entry will be retried */ 4622 m->count = m->size; 4623 return 0; 4624 } 4625 for (i = 0; i < n[1]; i++) { 4626 seq_printf(m, "%s: %lu ", name, n[2*i+3]); 4627 show_symbol(m, n[2*i+2]); 4628 seq_putc(m, '\n'); 4629 } 4630 4631 return 0; 4632 } 4633 4634 static const struct seq_operations slabstats_op = { 4635 .start = leaks_start, 4636 .next = s_next, 4637 .stop = s_stop, 4638 .show = leaks_show, 4639 }; 4640 4641 static int slabstats_open(struct inode *inode, struct file *file) 4642 { 4643 unsigned long *n = kzalloc(PAGE_SIZE, GFP_KERNEL); 4644 int ret = -ENOMEM; 4645 if (n) { 4646 ret = seq_open(file, &slabstats_op); 4647 if (!ret) { 4648 struct seq_file *m = file->private_data; 4649 *n = PAGE_SIZE / (2 * sizeof(unsigned long)); 4650 m->private = n; 4651 n = NULL; 4652 } 4653 kfree(n); 4654 } 4655 return ret; 4656 } 4657 4658 static const struct file_operations proc_slabstats_operations = { 4659 .open = slabstats_open, 4660 .read = seq_read, 4661 .llseek = seq_lseek, 4662 .release = seq_release_private, 4663 }; 4664 #endif 4665 4666 static int __init slab_proc_init(void) 4667 { 4668 proc_create("slabinfo",S_IWUSR|S_IRUSR,NULL,&proc_slabinfo_operations); 4669 #ifdef CONFIG_DEBUG_SLAB_LEAK 4670 proc_create("slab_allocators", 0, NULL, &proc_slabstats_operations); 4671 #endif 4672 return 0; 4673 } 4674 module_init(slab_proc_init); 4675 #endif 4676 4677 /** 4678 * ksize - get the actual amount of memory allocated for a given object 4679 * @objp: Pointer to the object 4680 * 4681 * kmalloc may internally round up allocations and return more memory 4682 * than requested. ksize() can be used to determine the actual amount of 4683 * memory allocated. The caller may use this additional memory, even though 4684 * a smaller amount of memory was initially specified with the kmalloc call. 4685 * The caller must guarantee that objp points to a valid object previously 4686 * allocated with either kmalloc() or kmem_cache_alloc(). The object 4687 * must not be freed during the duration of the call. 4688 */ 4689 size_t ksize(const void *objp) 4690 { 4691 BUG_ON(!objp); 4692 if (unlikely(objp == ZERO_SIZE_PTR)) 4693 return 0; 4694 4695 return virt_to_cache(objp)->object_size; 4696 } 4697 EXPORT_SYMBOL(ksize); 4698
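
/*
 * An illustrative sketch of the ksize() contract documented above (not part
 * of the allocator itself): the caller may use the full allocated size
 * rather than only the size it originally requested.
 *
 *	char *buf = kmalloc(17, GFP_KERNEL);
 *
 *	if (buf) {
 *		size_t real = ksize(buf);	(at least 17 bytes)
 *
 *		memset(buf, 0, real);		(every byte up to 'real' is usable)
 *	}
 *	kfree(buf);
 */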