/*
 * linux/mm/slab.c
 * Written by Mark Hemment, 1996/97.
 * (markhe@nextd.demon.co.uk)
 *
 * kmem_cache_destroy() + some cleanup - 1999 Andrea Arcangeli
 *
 * Major cleanup, different bufctl logic, per-cpu arrays
 *	(c) 2000 Manfred Spraul
 *
 * Cleanup, make the head arrays unconditional, preparation for NUMA
 *	(c) 2002 Manfred Spraul
 *
 * An implementation of the Slab Allocator as described in outline in;
 *	UNIX Internals: The New Frontiers by Uresh Vahalia
 *	Pub: Prentice Hall	ISBN 0-13-101908-2
 * or with a little more detail in;
 *	The Slab Allocator: An Object-Caching Kernel Memory Allocator
 *	Jeff Bonwick (Sun Microsystems).
 *	Presented at: USENIX Summer 1994 Technical Conference
 *
 * The memory is organized in caches, one cache for each object type.
 * (e.g. inode_cache, dentry_cache, buffer_head, vm_area_struct)
 * Each cache consists of many slabs (they are small (usually one
 * page long) and always contiguous), and each slab contains multiple
 * initialized objects.
 *
 * This means that your constructor is used only for newly allocated
 * slabs and you must pass objects with the same initializations to
 * kmem_cache_free.
 *
 * Each cache can only support one memory type (GFP_DMA, GFP_HIGHMEM,
 * normal). If you need a special memory type, then you must create a new
 * cache for that memory type.
 *
 * In order to reduce fragmentation, the slabs are sorted in 3 groups:
 *   full slabs with 0 free objects
 *   partial slabs
 *   empty slabs with no allocated objects
 *
 * If partial slabs exist, then new allocations come from these slabs,
 * otherwise from empty slabs or new slabs are allocated.
 *
 * kmem_cache_destroy() CAN CRASH if you try to allocate from the cache
 * during kmem_cache_destroy(). The caller must prevent concurrent allocs.
 *
 * Each cache has a short per-cpu head array, most allocs
 * and frees go into that array, and if that array overflows, then 1/2
 * of the entries in the array are given back into the global cache.
 * The head array is strictly LIFO and should improve the cache hit rates.
 * On SMP, it additionally reduces the spinlock operations.
 *
 * The c_cpuarray may not be read with enabled local interrupts -
 * it's changed with a smp_call_function().
 *
 * SMP synchronization:
 *  constructors and destructors are called without any locking.
 *  Several members in struct kmem_cache and struct slab never change, they
 *	are accessed without any locking.
 *  The per-cpu arrays are never accessed from the wrong cpu, no locking,
 *	and local interrupts are disabled so slab code is preempt-safe.
 *  The non-constant members are protected with a per-cache irq spinlock.
 *
 * Many thanks to Mark Hemment, who wrote another per-cpu slab patch
 * in 2000 - many ideas in the current implementation are derived from
 * his patch.
 *
 * Further notes from the original documentation:
 *
 * 11 April '97.  Started multi-threading - markhe
 *	The global cache-chain is protected by the mutex 'slab_mutex'.
 *	The mutex is only needed when accessing/extending the cache-chain, which
 *	can never happen inside an interrupt (kmem_cache_create(),
 *	kmem_cache_shrink() and kmem_cache_reap()).
 *
 * At present, each engine can be growing a cache.  This should be blocked.
 *
 * 15 March 2005. NUMA slab allocator.
 *	Shai Fultheim <shai@scalex86.org>.
80 * Shobhit Dayal <shobhit@calsoftinc.com> 81 * Alok N Kataria <alokk@calsoftinc.com> 82 * Christoph Lameter <christoph@lameter.com> 83 * 84 * Modified the slab allocator to be node aware on NUMA systems. 85 * Each node has its own list of partial, free and full slabs. 86 * All object allocations for a node occur from node specific slab lists. 87 */ 88 89 #include <linux/slab.h> 90 #include <linux/mm.h> 91 #include <linux/poison.h> 92 #include <linux/swap.h> 93 #include <linux/cache.h> 94 #include <linux/interrupt.h> 95 #include <linux/init.h> 96 #include <linux/compiler.h> 97 #include <linux/cpuset.h> 98 #include <linux/proc_fs.h> 99 #include <linux/seq_file.h> 100 #include <linux/notifier.h> 101 #include <linux/kallsyms.h> 102 #include <linux/cpu.h> 103 #include <linux/sysctl.h> 104 #include <linux/module.h> 105 #include <linux/rcupdate.h> 106 #include <linux/string.h> 107 #include <linux/uaccess.h> 108 #include <linux/nodemask.h> 109 #include <linux/kmemleak.h> 110 #include <linux/mempolicy.h> 111 #include <linux/mutex.h> 112 #include <linux/fault-inject.h> 113 #include <linux/rtmutex.h> 114 #include <linux/reciprocal_div.h> 115 #include <linux/debugobjects.h> 116 #include <linux/kmemcheck.h> 117 #include <linux/memory.h> 118 #include <linux/prefetch.h> 119 120 #include <net/sock.h> 121 122 #include <asm/cacheflush.h> 123 #include <asm/tlbflush.h> 124 #include <asm/page.h> 125 126 #include <trace/events/kmem.h> 127 128 #include "internal.h" 129 130 #include "slab.h" 131 132 /* 133 * DEBUG - 1 for kmem_cache_create() to honour; SLAB_RED_ZONE & SLAB_POISON. 134 * 0 for faster, smaller code (especially in the critical paths). 135 * 136 * STATS - 1 to collect stats for /proc/slabinfo. 137 * 0 for faster, smaller code (especially in the critical paths). 138 * 139 * FORCED_DEBUG - 1 enables SLAB_RED_ZONE and SLAB_POISON (if possible) 140 */ 141 142 #ifdef CONFIG_DEBUG_SLAB 143 #define DEBUG 1 144 #define STATS 1 145 #define FORCED_DEBUG 1 146 #else 147 #define DEBUG 0 148 #define STATS 0 149 #define FORCED_DEBUG 0 150 #endif 151 152 /* Shouldn't this be in a header file somewhere? */ 153 #define BYTES_PER_WORD sizeof(void *) 154 #define REDZONE_ALIGN max(BYTES_PER_WORD, __alignof__(unsigned long long)) 155 156 #ifndef ARCH_KMALLOC_FLAGS 157 #define ARCH_KMALLOC_FLAGS SLAB_HWCACHE_ALIGN 158 #endif 159 160 #define FREELIST_BYTE_INDEX (((PAGE_SIZE >> BITS_PER_BYTE) \ 161 <= SLAB_OBJ_MIN_SIZE) ? 1 : 0) 162 163 #if FREELIST_BYTE_INDEX 164 typedef unsigned char freelist_idx_t; 165 #else 166 typedef unsigned short freelist_idx_t; 167 #endif 168 169 #define SLAB_OBJ_MAX_NUM ((1 << sizeof(freelist_idx_t) * BITS_PER_BYTE) - 1) 170 171 /* 172 * true if a page was allocated from pfmemalloc reserves for network-based 173 * swap 174 */ 175 static bool pfmemalloc_active __read_mostly; 176 177 /* 178 * struct array_cache 179 * 180 * Purpose: 181 * - LIFO ordering, to hand out cache-warm objects from _alloc 182 * - reduce the number of linked list operations 183 * - reduce spinlock operations 184 * 185 * The limit is stored in the per-cpu structure to reduce the data cache 186 * footprint. 187 * 188 */ 189 struct array_cache { 190 unsigned int avail; 191 unsigned int limit; 192 unsigned int batchcount; 193 unsigned int touched; 194 spinlock_t lock; 195 void *entry[]; /* 196 * Must have this definition in here for the proper 197 * alignment of array_cache. Also simplifies accessing 198 * the entries. 
 *
 * Entries should not be directly dereferenced as
 * entries belonging to slabs marked pfmemalloc will
 * have the lower bits set SLAB_OBJ_PFMEMALLOC
 */
};

#define SLAB_OBJ_PFMEMALLOC	1
static inline bool is_obj_pfmemalloc(void *objp)
{
	return (unsigned long)objp & SLAB_OBJ_PFMEMALLOC;
}

static inline void set_obj_pfmemalloc(void **objp)
{
	*objp = (void *)((unsigned long)*objp | SLAB_OBJ_PFMEMALLOC);
	return;
}

static inline void clear_obj_pfmemalloc(void **objp)
{
	*objp = (void *)((unsigned long)*objp & ~SLAB_OBJ_PFMEMALLOC);
}

/*
 * bootstrap: The caches do not work without cpuarrays anymore, but the
 * cpuarrays are allocated from the generic caches...
 */
#define BOOT_CPUCACHE_ENTRIES	1
struct arraycache_init {
	struct array_cache cache;
	void *entries[BOOT_CPUCACHE_ENTRIES];
};

/*
 * Need this for bootstrapping a per node allocator.
 */
#define NUM_INIT_LISTS (3 * MAX_NUMNODES)
static struct kmem_cache_node __initdata init_kmem_cache_node[NUM_INIT_LISTS];
#define	CACHE_CACHE 0
#define	SIZE_AC MAX_NUMNODES
#define	SIZE_NODE (2 * MAX_NUMNODES)

static int drain_freelist(struct kmem_cache *cache,
			struct kmem_cache_node *n, int tofree);
static void free_block(struct kmem_cache *cachep, void **objpp, int len,
			int node);
static int enable_cpucache(struct kmem_cache *cachep, gfp_t gfp);
static void cache_reap(struct work_struct *unused);

static int slab_early_init = 1;

#define INDEX_AC kmalloc_index(sizeof(struct arraycache_init))
#define INDEX_NODE kmalloc_index(sizeof(struct kmem_cache_node))

static void kmem_cache_node_init(struct kmem_cache_node *parent)
{
	INIT_LIST_HEAD(&parent->slabs_full);
	INIT_LIST_HEAD(&parent->slabs_partial);
	INIT_LIST_HEAD(&parent->slabs_free);
	parent->shared = NULL;
	parent->alien = NULL;
	parent->colour_next = 0;
	spin_lock_init(&parent->list_lock);
	parent->free_objects = 0;
	parent->free_touched = 0;
}

#define MAKE_LIST(cachep, listp, slab, nodeid)				\
	do {								\
		INIT_LIST_HEAD(listp);					\
		list_splice(&(cachep->node[nodeid]->slab), listp);	\
	} while (0)

#define	MAKE_ALL_LISTS(cachep, ptr, nodeid)				\
	do {								\
	MAKE_LIST((cachep), (&(ptr)->slabs_full), slabs_full, nodeid);	\
	MAKE_LIST((cachep), (&(ptr)->slabs_partial), slabs_partial, nodeid); \
	MAKE_LIST((cachep), (&(ptr)->slabs_free), slabs_free, nodeid);	\
	} while (0)

#define CFLGS_OFF_SLAB		(0x80000000UL)
#define	OFF_SLAB(x)	((x)->flags & CFLGS_OFF_SLAB)

#define BATCHREFILL_LIMIT	16
/*
 * Optimization question: fewer reaps means less probability of unnecessary
 * cpucache drain/refill cycles.
 *
 * OTOH the cpuarrays can contain lots of objects,
 * which could lock up otherwise freeable slabs.
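 *
 * The reap timeouts defined just below reflect that trade-off: the per-cpu
 * arrays are considered for reaping roughly every 2 seconds and each node's
 * slab lists roughly every 4 seconds.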
290 */ 291 #define REAPTIMEOUT_AC (2*HZ) 292 #define REAPTIMEOUT_NODE (4*HZ) 293 294 #if STATS 295 #define STATS_INC_ACTIVE(x) ((x)->num_active++) 296 #define STATS_DEC_ACTIVE(x) ((x)->num_active--) 297 #define STATS_INC_ALLOCED(x) ((x)->num_allocations++) 298 #define STATS_INC_GROWN(x) ((x)->grown++) 299 #define STATS_ADD_REAPED(x,y) ((x)->reaped += (y)) 300 #define STATS_SET_HIGH(x) \ 301 do { \ 302 if ((x)->num_active > (x)->high_mark) \ 303 (x)->high_mark = (x)->num_active; \ 304 } while (0) 305 #define STATS_INC_ERR(x) ((x)->errors++) 306 #define STATS_INC_NODEALLOCS(x) ((x)->node_allocs++) 307 #define STATS_INC_NODEFREES(x) ((x)->node_frees++) 308 #define STATS_INC_ACOVERFLOW(x) ((x)->node_overflow++) 309 #define STATS_SET_FREEABLE(x, i) \ 310 do { \ 311 if ((x)->max_freeable < i) \ 312 (x)->max_freeable = i; \ 313 } while (0) 314 #define STATS_INC_ALLOCHIT(x) atomic_inc(&(x)->allochit) 315 #define STATS_INC_ALLOCMISS(x) atomic_inc(&(x)->allocmiss) 316 #define STATS_INC_FREEHIT(x) atomic_inc(&(x)->freehit) 317 #define STATS_INC_FREEMISS(x) atomic_inc(&(x)->freemiss) 318 #else 319 #define STATS_INC_ACTIVE(x) do { } while (0) 320 #define STATS_DEC_ACTIVE(x) do { } while (0) 321 #define STATS_INC_ALLOCED(x) do { } while (0) 322 #define STATS_INC_GROWN(x) do { } while (0) 323 #define STATS_ADD_REAPED(x,y) do { (void)(y); } while (0) 324 #define STATS_SET_HIGH(x) do { } while (0) 325 #define STATS_INC_ERR(x) do { } while (0) 326 #define STATS_INC_NODEALLOCS(x) do { } while (0) 327 #define STATS_INC_NODEFREES(x) do { } while (0) 328 #define STATS_INC_ACOVERFLOW(x) do { } while (0) 329 #define STATS_SET_FREEABLE(x, i) do { } while (0) 330 #define STATS_INC_ALLOCHIT(x) do { } while (0) 331 #define STATS_INC_ALLOCMISS(x) do { } while (0) 332 #define STATS_INC_FREEHIT(x) do { } while (0) 333 #define STATS_INC_FREEMISS(x) do { } while (0) 334 #endif 335 336 #if DEBUG 337 338 /* 339 * memory layout of objects: 340 * 0 : objp 341 * 0 .. cachep->obj_offset - BYTES_PER_WORD - 1: padding. This ensures that 342 * the end of an object is aligned with the end of the real 343 * allocation. Catches writes behind the end of the allocation. 344 * cachep->obj_offset - BYTES_PER_WORD .. cachep->obj_offset - 1: 345 * redzone word. 346 * cachep->obj_offset: The real object. 
347 * cachep->size - 2* BYTES_PER_WORD: redzone word [BYTES_PER_WORD long] 348 * cachep->size - 1* BYTES_PER_WORD: last caller address 349 * [BYTES_PER_WORD long] 350 */ 351 static int obj_offset(struct kmem_cache *cachep) 352 { 353 return cachep->obj_offset; 354 } 355 356 static unsigned long long *dbg_redzone1(struct kmem_cache *cachep, void *objp) 357 { 358 BUG_ON(!(cachep->flags & SLAB_RED_ZONE)); 359 return (unsigned long long*) (objp + obj_offset(cachep) - 360 sizeof(unsigned long long)); 361 } 362 363 static unsigned long long *dbg_redzone2(struct kmem_cache *cachep, void *objp) 364 { 365 BUG_ON(!(cachep->flags & SLAB_RED_ZONE)); 366 if (cachep->flags & SLAB_STORE_USER) 367 return (unsigned long long *)(objp + cachep->size - 368 sizeof(unsigned long long) - 369 REDZONE_ALIGN); 370 return (unsigned long long *) (objp + cachep->size - 371 sizeof(unsigned long long)); 372 } 373 374 static void **dbg_userword(struct kmem_cache *cachep, void *objp) 375 { 376 BUG_ON(!(cachep->flags & SLAB_STORE_USER)); 377 return (void **)(objp + cachep->size - BYTES_PER_WORD); 378 } 379 380 #else 381 382 #define obj_offset(x) 0 383 #define dbg_redzone1(cachep, objp) ({BUG(); (unsigned long long *)NULL;}) 384 #define dbg_redzone2(cachep, objp) ({BUG(); (unsigned long long *)NULL;}) 385 #define dbg_userword(cachep, objp) ({BUG(); (void **)NULL;}) 386 387 #endif 388 389 #define OBJECT_FREE (0) 390 #define OBJECT_ACTIVE (1) 391 392 #ifdef CONFIG_DEBUG_SLAB_LEAK 393 394 static void set_obj_status(struct page *page, int idx, int val) 395 { 396 int freelist_size; 397 char *status; 398 struct kmem_cache *cachep = page->slab_cache; 399 400 freelist_size = cachep->num * sizeof(freelist_idx_t); 401 status = (char *)page->freelist + freelist_size; 402 status[idx] = val; 403 } 404 405 static inline unsigned int get_obj_status(struct page *page, int idx) 406 { 407 int freelist_size; 408 char *status; 409 struct kmem_cache *cachep = page->slab_cache; 410 411 freelist_size = cachep->num * sizeof(freelist_idx_t); 412 status = (char *)page->freelist + freelist_size; 413 414 return status[idx]; 415 } 416 417 #else 418 static inline void set_obj_status(struct page *page, int idx, int val) {} 419 420 #endif 421 422 /* 423 * Do not go above this order unless 0 objects fit into the slab or 424 * overridden on the command line. 
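 * (The command-line override is the "slab_max_order=" boot parameter,
 * parsed by slab_max_order_setup() further down in this file.)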
425 */ 426 #define SLAB_MAX_ORDER_HI 1 427 #define SLAB_MAX_ORDER_LO 0 428 static int slab_max_order = SLAB_MAX_ORDER_LO; 429 static bool slab_max_order_set __initdata; 430 431 static inline struct kmem_cache *virt_to_cache(const void *obj) 432 { 433 struct page *page = virt_to_head_page(obj); 434 return page->slab_cache; 435 } 436 437 static inline void *index_to_obj(struct kmem_cache *cache, struct page *page, 438 unsigned int idx) 439 { 440 return page->s_mem + cache->size * idx; 441 } 442 443 /* 444 * We want to avoid an expensive divide : (offset / cache->size) 445 * Using the fact that size is a constant for a particular cache, 446 * we can replace (offset / cache->size) by 447 * reciprocal_divide(offset, cache->reciprocal_buffer_size) 448 */ 449 static inline unsigned int obj_to_index(const struct kmem_cache *cache, 450 const struct page *page, void *obj) 451 { 452 u32 offset = (obj - page->s_mem); 453 return reciprocal_divide(offset, cache->reciprocal_buffer_size); 454 } 455 456 static struct arraycache_init initarray_generic = 457 { {0, BOOT_CPUCACHE_ENTRIES, 1, 0} }; 458 459 /* internal cache of cache description objs */ 460 static struct kmem_cache kmem_cache_boot = { 461 .batchcount = 1, 462 .limit = BOOT_CPUCACHE_ENTRIES, 463 .shared = 1, 464 .size = sizeof(struct kmem_cache), 465 .name = "kmem_cache", 466 }; 467 468 #define BAD_ALIEN_MAGIC 0x01020304ul 469 470 #ifdef CONFIG_LOCKDEP 471 472 /* 473 * Slab sometimes uses the kmalloc slabs to store the slab headers 474 * for other slabs "off slab". 475 * The locking for this is tricky in that it nests within the locks 476 * of all other slabs in a few places; to deal with this special 477 * locking we put on-slab caches into a separate lock-class. 478 * 479 * We set lock class for alien array caches which are up during init. 480 * The lock annotation will be lost if all cpus of a node goes down and 481 * then comes back up during hotplug 482 */ 483 static struct lock_class_key on_slab_l3_key; 484 static struct lock_class_key on_slab_alc_key; 485 486 static struct lock_class_key debugobj_l3_key; 487 static struct lock_class_key debugobj_alc_key; 488 489 static void slab_set_lock_classes(struct kmem_cache *cachep, 490 struct lock_class_key *l3_key, struct lock_class_key *alc_key, 491 int q) 492 { 493 struct array_cache **alc; 494 struct kmem_cache_node *n; 495 int r; 496 497 n = cachep->node[q]; 498 if (!n) 499 return; 500 501 lockdep_set_class(&n->list_lock, l3_key); 502 alc = n->alien; 503 /* 504 * FIXME: This check for BAD_ALIEN_MAGIC 505 * should go away when common slab code is taught to 506 * work even without alien caches. 
507 * Currently, non NUMA code returns BAD_ALIEN_MAGIC 508 * for alloc_alien_cache, 509 */ 510 if (!alc || (unsigned long)alc == BAD_ALIEN_MAGIC) 511 return; 512 for_each_node(r) { 513 if (alc[r]) 514 lockdep_set_class(&alc[r]->lock, alc_key); 515 } 516 } 517 518 static void slab_set_debugobj_lock_classes_node(struct kmem_cache *cachep, int node) 519 { 520 slab_set_lock_classes(cachep, &debugobj_l3_key, &debugobj_alc_key, node); 521 } 522 523 static void slab_set_debugobj_lock_classes(struct kmem_cache *cachep) 524 { 525 int node; 526 527 for_each_online_node(node) 528 slab_set_debugobj_lock_classes_node(cachep, node); 529 } 530 531 static void init_node_lock_keys(int q) 532 { 533 int i; 534 535 if (slab_state < UP) 536 return; 537 538 for (i = 1; i <= KMALLOC_SHIFT_HIGH; i++) { 539 struct kmem_cache_node *n; 540 struct kmem_cache *cache = kmalloc_caches[i]; 541 542 if (!cache) 543 continue; 544 545 n = cache->node[q]; 546 if (!n || OFF_SLAB(cache)) 547 continue; 548 549 slab_set_lock_classes(cache, &on_slab_l3_key, 550 &on_slab_alc_key, q); 551 } 552 } 553 554 static void on_slab_lock_classes_node(struct kmem_cache *cachep, int q) 555 { 556 if (!cachep->node[q]) 557 return; 558 559 slab_set_lock_classes(cachep, &on_slab_l3_key, 560 &on_slab_alc_key, q); 561 } 562 563 static inline void on_slab_lock_classes(struct kmem_cache *cachep) 564 { 565 int node; 566 567 VM_BUG_ON(OFF_SLAB(cachep)); 568 for_each_node(node) 569 on_slab_lock_classes_node(cachep, node); 570 } 571 572 static inline void init_lock_keys(void) 573 { 574 int node; 575 576 for_each_node(node) 577 init_node_lock_keys(node); 578 } 579 #else 580 static void init_node_lock_keys(int q) 581 { 582 } 583 584 static inline void init_lock_keys(void) 585 { 586 } 587 588 static inline void on_slab_lock_classes(struct kmem_cache *cachep) 589 { 590 } 591 592 static inline void on_slab_lock_classes_node(struct kmem_cache *cachep, int node) 593 { 594 } 595 596 static void slab_set_debugobj_lock_classes_node(struct kmem_cache *cachep, int node) 597 { 598 } 599 600 static void slab_set_debugobj_lock_classes(struct kmem_cache *cachep) 601 { 602 } 603 #endif 604 605 static DEFINE_PER_CPU(struct delayed_work, slab_reap_work); 606 607 static inline struct array_cache *cpu_cache_get(struct kmem_cache *cachep) 608 { 609 return cachep->array[smp_processor_id()]; 610 } 611 612 static size_t calculate_freelist_size(int nr_objs, size_t align) 613 { 614 size_t freelist_size; 615 616 freelist_size = nr_objs * sizeof(freelist_idx_t); 617 if (IS_ENABLED(CONFIG_DEBUG_SLAB_LEAK)) 618 freelist_size += nr_objs * sizeof(char); 619 620 if (align) 621 freelist_size = ALIGN(freelist_size, align); 622 623 return freelist_size; 624 } 625 626 static int calculate_nr_objs(size_t slab_size, size_t buffer_size, 627 size_t idx_size, size_t align) 628 { 629 int nr_objs; 630 size_t remained_size; 631 size_t freelist_size; 632 int extra_space = 0; 633 634 if (IS_ENABLED(CONFIG_DEBUG_SLAB_LEAK)) 635 extra_space = sizeof(char); 636 /* 637 * Ignore padding for the initial guess. The padding 638 * is at most @align-1 bytes, and @buffer_size is at 639 * least @align. In the worst case, this result will 640 * be one greater than the number of objects that fit 641 * into the memory allocation when taking the padding 642 * into account. 643 */ 644 nr_objs = slab_size / (buffer_size + idx_size + extra_space); 645 646 /* 647 * This calculated number will be either the right 648 * amount, or one greater than what we want. 
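 *
 * For example (illustrative figures only, not taken from this file): with a
 * 4096-byte slab, 128-byte objects and 2-byte freelist indexes the guess is
 * 4096 / 130 = 31 objects; the 128 bytes then left over easily hold the
 * 62-byte freelist, so the guess is kept by the check below.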
649 */ 650 remained_size = slab_size - nr_objs * buffer_size; 651 freelist_size = calculate_freelist_size(nr_objs, align); 652 if (remained_size < freelist_size) 653 nr_objs--; 654 655 return nr_objs; 656 } 657 658 /* 659 * Calculate the number of objects and left-over bytes for a given buffer size. 660 */ 661 static void cache_estimate(unsigned long gfporder, size_t buffer_size, 662 size_t align, int flags, size_t *left_over, 663 unsigned int *num) 664 { 665 int nr_objs; 666 size_t mgmt_size; 667 size_t slab_size = PAGE_SIZE << gfporder; 668 669 /* 670 * The slab management structure can be either off the slab or 671 * on it. For the latter case, the memory allocated for a 672 * slab is used for: 673 * 674 * - One unsigned int for each object 675 * - Padding to respect alignment of @align 676 * - @buffer_size bytes for each object 677 * 678 * If the slab management structure is off the slab, then the 679 * alignment will already be calculated into the size. Because 680 * the slabs are all pages aligned, the objects will be at the 681 * correct alignment when allocated. 682 */ 683 if (flags & CFLGS_OFF_SLAB) { 684 mgmt_size = 0; 685 nr_objs = slab_size / buffer_size; 686 687 } else { 688 nr_objs = calculate_nr_objs(slab_size, buffer_size, 689 sizeof(freelist_idx_t), align); 690 mgmt_size = calculate_freelist_size(nr_objs, align); 691 } 692 *num = nr_objs; 693 *left_over = slab_size - nr_objs*buffer_size - mgmt_size; 694 } 695 696 #if DEBUG 697 #define slab_error(cachep, msg) __slab_error(__func__, cachep, msg) 698 699 static void __slab_error(const char *function, struct kmem_cache *cachep, 700 char *msg) 701 { 702 printk(KERN_ERR "slab error in %s(): cache `%s': %s\n", 703 function, cachep->name, msg); 704 dump_stack(); 705 add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE); 706 } 707 #endif 708 709 /* 710 * By default on NUMA we use alien caches to stage the freeing of 711 * objects allocated from other nodes. This causes massive memory 712 * inefficiencies when using fake NUMA setup to split memory into a 713 * large number of small nodes, so it can be disabled on the command 714 * line 715 */ 716 717 static int use_alien_caches __read_mostly = 1; 718 static int __init noaliencache_setup(char *s) 719 { 720 use_alien_caches = 0; 721 return 1; 722 } 723 __setup("noaliencache", noaliencache_setup); 724 725 static int __init slab_max_order_setup(char *str) 726 { 727 get_option(&str, &slab_max_order); 728 slab_max_order = slab_max_order < 0 ? 0 : 729 min(slab_max_order, MAX_ORDER - 1); 730 slab_max_order_set = true; 731 732 return 1; 733 } 734 __setup("slab_max_order=", slab_max_order_setup); 735 736 #ifdef CONFIG_NUMA 737 /* 738 * Special reaping functions for NUMA systems called from cache_reap(). 739 * These take care of doing round robin flushing of alien caches (containing 740 * objects freed on different nodes from which they were allocated) and the 741 * flushing of remote pcps by calling drain_node_pages. 
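 *
 * A per-cpu cursor (slab_reap_node, defined just below) remembers which node
 * to visit next; next_reap_node() advances it round robin over the online
 * nodes.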
 */
static DEFINE_PER_CPU(unsigned long, slab_reap_node);

static void init_reap_node(int cpu)
{
	int node;

	node = next_node(cpu_to_mem(cpu), node_online_map);
	if (node == MAX_NUMNODES)
		node = first_node(node_online_map);

	per_cpu(slab_reap_node, cpu) = node;
}

static void next_reap_node(void)
{
	int node = __this_cpu_read(slab_reap_node);

	node = next_node(node, node_online_map);
	if (unlikely(node >= MAX_NUMNODES))
		node = first_node(node_online_map);
	__this_cpu_write(slab_reap_node, node);
}

#else
#define init_reap_node(cpu) do { } while (0)
#define next_reap_node(void) do { } while (0)
#endif

/*
 * Initiate the reap timer running on the target CPU.  We run at around 1 to 2Hz
 * via the workqueue/eventd.
 * Add the CPU number into the expiration time to minimize the possibility of
 * the CPUs getting into lockstep and contending for the global cache chain
 * lock.
 */
static void start_cpu_timer(int cpu)
{
	struct delayed_work *reap_work = &per_cpu(slab_reap_work, cpu);

	/*
	 * When this gets called from do_initcalls via cpucache_init(),
	 * init_workqueues() has already run, so keventd will be setup
	 * at that time.
	 */
	if (keventd_up() && reap_work->work.func == NULL) {
		init_reap_node(cpu);
		INIT_DEFERRABLE_WORK(reap_work, cache_reap);
		schedule_delayed_work_on(cpu, reap_work,
					__round_jiffies_relative(HZ, cpu));
	}
}

static struct array_cache *alloc_arraycache(int node, int entries,
					    int batchcount, gfp_t gfp)
{
	int memsize = sizeof(void *) * entries + sizeof(struct array_cache);
	struct array_cache *nc = NULL;

	nc = kmalloc_node(memsize, gfp, node);
	/*
	 * The array_cache structures contain pointers to free objects.
	 * However, when such objects are allocated or transferred to another
	 * cache, the pointers are not cleared and they could be counted as
	 * valid references during a kmemleak scan. Therefore, kmemleak must
	 * not scan such objects.
808 */ 809 kmemleak_no_scan(nc); 810 if (nc) { 811 nc->avail = 0; 812 nc->limit = entries; 813 nc->batchcount = batchcount; 814 nc->touched = 0; 815 spin_lock_init(&nc->lock); 816 } 817 return nc; 818 } 819 820 static inline bool is_slab_pfmemalloc(struct page *page) 821 { 822 return PageSlabPfmemalloc(page); 823 } 824 825 /* Clears pfmemalloc_active if no slabs have pfmalloc set */ 826 static void recheck_pfmemalloc_active(struct kmem_cache *cachep, 827 struct array_cache *ac) 828 { 829 struct kmem_cache_node *n = cachep->node[numa_mem_id()]; 830 struct page *page; 831 unsigned long flags; 832 833 if (!pfmemalloc_active) 834 return; 835 836 spin_lock_irqsave(&n->list_lock, flags); 837 list_for_each_entry(page, &n->slabs_full, lru) 838 if (is_slab_pfmemalloc(page)) 839 goto out; 840 841 list_for_each_entry(page, &n->slabs_partial, lru) 842 if (is_slab_pfmemalloc(page)) 843 goto out; 844 845 list_for_each_entry(page, &n->slabs_free, lru) 846 if (is_slab_pfmemalloc(page)) 847 goto out; 848 849 pfmemalloc_active = false; 850 out: 851 spin_unlock_irqrestore(&n->list_lock, flags); 852 } 853 854 static void *__ac_get_obj(struct kmem_cache *cachep, struct array_cache *ac, 855 gfp_t flags, bool force_refill) 856 { 857 int i; 858 void *objp = ac->entry[--ac->avail]; 859 860 /* Ensure the caller is allowed to use objects from PFMEMALLOC slab */ 861 if (unlikely(is_obj_pfmemalloc(objp))) { 862 struct kmem_cache_node *n; 863 864 if (gfp_pfmemalloc_allowed(flags)) { 865 clear_obj_pfmemalloc(&objp); 866 return objp; 867 } 868 869 /* The caller cannot use PFMEMALLOC objects, find another one */ 870 for (i = 0; i < ac->avail; i++) { 871 /* If a !PFMEMALLOC object is found, swap them */ 872 if (!is_obj_pfmemalloc(ac->entry[i])) { 873 objp = ac->entry[i]; 874 ac->entry[i] = ac->entry[ac->avail]; 875 ac->entry[ac->avail] = objp; 876 return objp; 877 } 878 } 879 880 /* 881 * If there are empty slabs on the slabs_free list and we are 882 * being forced to refill the cache, mark this one !pfmemalloc. 883 */ 884 n = cachep->node[numa_mem_id()]; 885 if (!list_empty(&n->slabs_free) && force_refill) { 886 struct page *page = virt_to_head_page(objp); 887 ClearPageSlabPfmemalloc(page); 888 clear_obj_pfmemalloc(&objp); 889 recheck_pfmemalloc_active(cachep, ac); 890 return objp; 891 } 892 893 /* No !PFMEMALLOC objects available */ 894 ac->avail++; 895 objp = NULL; 896 } 897 898 return objp; 899 } 900 901 static inline void *ac_get_obj(struct kmem_cache *cachep, 902 struct array_cache *ac, gfp_t flags, bool force_refill) 903 { 904 void *objp; 905 906 if (unlikely(sk_memalloc_socks())) 907 objp = __ac_get_obj(cachep, ac, flags, force_refill); 908 else 909 objp = ac->entry[--ac->avail]; 910 911 return objp; 912 } 913 914 static void *__ac_put_obj(struct kmem_cache *cachep, struct array_cache *ac, 915 void *objp) 916 { 917 if (unlikely(pfmemalloc_active)) { 918 /* Some pfmemalloc slabs exist, check if this is one */ 919 struct page *page = virt_to_head_page(objp); 920 if (PageSlabPfmemalloc(page)) 921 set_obj_pfmemalloc(&objp); 922 } 923 924 return objp; 925 } 926 927 static inline void ac_put_obj(struct kmem_cache *cachep, struct array_cache *ac, 928 void *objp) 929 { 930 if (unlikely(sk_memalloc_socks())) 931 objp = __ac_put_obj(cachep, ac, objp); 932 933 ac->entry[ac->avail++] = objp; 934 } 935 936 /* 937 * Transfer objects in one arraycache to another. 938 * Locking must be handled by the caller. 939 * 940 * Return the number of entries transferred. 
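 *
 * For example, __drain_alien_cache() below calls
 * transfer_objects(n->shared, ac, ac->limit) to push cache-warm objects into
 * the node's shared array before falling back to free_block().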
941 */ 942 static int transfer_objects(struct array_cache *to, 943 struct array_cache *from, unsigned int max) 944 { 945 /* Figure out how many entries to transfer */ 946 int nr = min3(from->avail, max, to->limit - to->avail); 947 948 if (!nr) 949 return 0; 950 951 memcpy(to->entry + to->avail, from->entry + from->avail -nr, 952 sizeof(void *) *nr); 953 954 from->avail -= nr; 955 to->avail += nr; 956 return nr; 957 } 958 959 #ifndef CONFIG_NUMA 960 961 #define drain_alien_cache(cachep, alien) do { } while (0) 962 #define reap_alien(cachep, n) do { } while (0) 963 964 static inline struct array_cache **alloc_alien_cache(int node, int limit, gfp_t gfp) 965 { 966 return (struct array_cache **)BAD_ALIEN_MAGIC; 967 } 968 969 static inline void free_alien_cache(struct array_cache **ac_ptr) 970 { 971 } 972 973 static inline int cache_free_alien(struct kmem_cache *cachep, void *objp) 974 { 975 return 0; 976 } 977 978 static inline void *alternate_node_alloc(struct kmem_cache *cachep, 979 gfp_t flags) 980 { 981 return NULL; 982 } 983 984 static inline void *____cache_alloc_node(struct kmem_cache *cachep, 985 gfp_t flags, int nodeid) 986 { 987 return NULL; 988 } 989 990 #else /* CONFIG_NUMA */ 991 992 static void *____cache_alloc_node(struct kmem_cache *, gfp_t, int); 993 static void *alternate_node_alloc(struct kmem_cache *, gfp_t); 994 995 static struct array_cache **alloc_alien_cache(int node, int limit, gfp_t gfp) 996 { 997 struct array_cache **ac_ptr; 998 int memsize = sizeof(void *) * nr_node_ids; 999 int i; 1000 1001 if (limit > 1) 1002 limit = 12; 1003 ac_ptr = kzalloc_node(memsize, gfp, node); 1004 if (ac_ptr) { 1005 for_each_node(i) { 1006 if (i == node || !node_online(i)) 1007 continue; 1008 ac_ptr[i] = alloc_arraycache(node, limit, 0xbaadf00d, gfp); 1009 if (!ac_ptr[i]) { 1010 for (i--; i >= 0; i--) 1011 kfree(ac_ptr[i]); 1012 kfree(ac_ptr); 1013 return NULL; 1014 } 1015 } 1016 } 1017 return ac_ptr; 1018 } 1019 1020 static void free_alien_cache(struct array_cache **ac_ptr) 1021 { 1022 int i; 1023 1024 if (!ac_ptr) 1025 return; 1026 for_each_node(i) 1027 kfree(ac_ptr[i]); 1028 kfree(ac_ptr); 1029 } 1030 1031 static void __drain_alien_cache(struct kmem_cache *cachep, 1032 struct array_cache *ac, int node) 1033 { 1034 struct kmem_cache_node *n = cachep->node[node]; 1035 1036 if (ac->avail) { 1037 spin_lock(&n->list_lock); 1038 /* 1039 * Stuff objects into the remote nodes shared array first. 1040 * That way we could avoid the overhead of putting the objects 1041 * into the free lists and getting them back later. 1042 */ 1043 if (n->shared) 1044 transfer_objects(n->shared, ac, ac->limit); 1045 1046 free_block(cachep, ac->entry, ac->avail, node); 1047 ac->avail = 0; 1048 spin_unlock(&n->list_lock); 1049 } 1050 } 1051 1052 /* 1053 * Called from cache_reap() to regularly drain alien caches round robin. 
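 * Only the alien cache belonging to the current reap node is considered, and
 * its lock is taken with spin_trylock_irq() so a contended cache is simply
 * skipped until a later pass.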
1054 */ 1055 static void reap_alien(struct kmem_cache *cachep, struct kmem_cache_node *n) 1056 { 1057 int node = __this_cpu_read(slab_reap_node); 1058 1059 if (n->alien) { 1060 struct array_cache *ac = n->alien[node]; 1061 1062 if (ac && ac->avail && spin_trylock_irq(&ac->lock)) { 1063 __drain_alien_cache(cachep, ac, node); 1064 spin_unlock_irq(&ac->lock); 1065 } 1066 } 1067 } 1068 1069 static void drain_alien_cache(struct kmem_cache *cachep, 1070 struct array_cache **alien) 1071 { 1072 int i = 0; 1073 struct array_cache *ac; 1074 unsigned long flags; 1075 1076 for_each_online_node(i) { 1077 ac = alien[i]; 1078 if (ac) { 1079 spin_lock_irqsave(&ac->lock, flags); 1080 __drain_alien_cache(cachep, ac, i); 1081 spin_unlock_irqrestore(&ac->lock, flags); 1082 } 1083 } 1084 } 1085 1086 static inline int cache_free_alien(struct kmem_cache *cachep, void *objp) 1087 { 1088 int nodeid = page_to_nid(virt_to_page(objp)); 1089 struct kmem_cache_node *n; 1090 struct array_cache *alien = NULL; 1091 int node; 1092 1093 node = numa_mem_id(); 1094 1095 /* 1096 * Make sure we are not freeing a object from another node to the array 1097 * cache on this cpu. 1098 */ 1099 if (likely(nodeid == node)) 1100 return 0; 1101 1102 n = cachep->node[node]; 1103 STATS_INC_NODEFREES(cachep); 1104 if (n->alien && n->alien[nodeid]) { 1105 alien = n->alien[nodeid]; 1106 spin_lock(&alien->lock); 1107 if (unlikely(alien->avail == alien->limit)) { 1108 STATS_INC_ACOVERFLOW(cachep); 1109 __drain_alien_cache(cachep, alien, nodeid); 1110 } 1111 ac_put_obj(cachep, alien, objp); 1112 spin_unlock(&alien->lock); 1113 } else { 1114 spin_lock(&(cachep->node[nodeid])->list_lock); 1115 free_block(cachep, &objp, 1, nodeid); 1116 spin_unlock(&(cachep->node[nodeid])->list_lock); 1117 } 1118 return 1; 1119 } 1120 #endif 1121 1122 /* 1123 * Allocates and initializes node for a node on each slab cache, used for 1124 * either memory or cpu hotplug. If memory is being hot-added, the kmem_cache_node 1125 * will be allocated off-node since memory is not yet online for the new node. 1126 * When hotplugging memory or a cpu, existing node are not replaced if 1127 * already in use. 1128 * 1129 * Must hold slab_mutex. 1130 */ 1131 static int init_cache_node_node(int node) 1132 { 1133 struct kmem_cache *cachep; 1134 struct kmem_cache_node *n; 1135 const int memsize = sizeof(struct kmem_cache_node); 1136 1137 list_for_each_entry(cachep, &slab_caches, list) { 1138 /* 1139 * Set up the kmem_cache_node for cpu before we can 1140 * begin anything. Make sure some other cpu on this 1141 * node has not already allocated this 1142 */ 1143 if (!cachep->node[node]) { 1144 n = kmalloc_node(memsize, GFP_KERNEL, node); 1145 if (!n) 1146 return -ENOMEM; 1147 kmem_cache_node_init(n); 1148 n->next_reap = jiffies + REAPTIMEOUT_NODE + 1149 ((unsigned long)cachep) % REAPTIMEOUT_NODE; 1150 1151 /* 1152 * The kmem_cache_nodes don't come and go as CPUs 1153 * come and go. slab_mutex is sufficient 1154 * protection here. 
1155 */ 1156 cachep->node[node] = n; 1157 } 1158 1159 spin_lock_irq(&cachep->node[node]->list_lock); 1160 cachep->node[node]->free_limit = 1161 (1 + nr_cpus_node(node)) * 1162 cachep->batchcount + cachep->num; 1163 spin_unlock_irq(&cachep->node[node]->list_lock); 1164 } 1165 return 0; 1166 } 1167 1168 static inline int slabs_tofree(struct kmem_cache *cachep, 1169 struct kmem_cache_node *n) 1170 { 1171 return (n->free_objects + cachep->num - 1) / cachep->num; 1172 } 1173 1174 static void cpuup_canceled(long cpu) 1175 { 1176 struct kmem_cache *cachep; 1177 struct kmem_cache_node *n = NULL; 1178 int node = cpu_to_mem(cpu); 1179 const struct cpumask *mask = cpumask_of_node(node); 1180 1181 list_for_each_entry(cachep, &slab_caches, list) { 1182 struct array_cache *nc; 1183 struct array_cache *shared; 1184 struct array_cache **alien; 1185 1186 /* cpu is dead; no one can alloc from it. */ 1187 nc = cachep->array[cpu]; 1188 cachep->array[cpu] = NULL; 1189 n = cachep->node[node]; 1190 1191 if (!n) 1192 goto free_array_cache; 1193 1194 spin_lock_irq(&n->list_lock); 1195 1196 /* Free limit for this kmem_cache_node */ 1197 n->free_limit -= cachep->batchcount; 1198 if (nc) 1199 free_block(cachep, nc->entry, nc->avail, node); 1200 1201 if (!cpumask_empty(mask)) { 1202 spin_unlock_irq(&n->list_lock); 1203 goto free_array_cache; 1204 } 1205 1206 shared = n->shared; 1207 if (shared) { 1208 free_block(cachep, shared->entry, 1209 shared->avail, node); 1210 n->shared = NULL; 1211 } 1212 1213 alien = n->alien; 1214 n->alien = NULL; 1215 1216 spin_unlock_irq(&n->list_lock); 1217 1218 kfree(shared); 1219 if (alien) { 1220 drain_alien_cache(cachep, alien); 1221 free_alien_cache(alien); 1222 } 1223 free_array_cache: 1224 kfree(nc); 1225 } 1226 /* 1227 * In the previous loop, all the objects were freed to 1228 * the respective cache's slabs, now we can go ahead and 1229 * shrink each nodelist to its limit. 1230 */ 1231 list_for_each_entry(cachep, &slab_caches, list) { 1232 n = cachep->node[node]; 1233 if (!n) 1234 continue; 1235 drain_freelist(cachep, n, slabs_tofree(cachep, n)); 1236 } 1237 } 1238 1239 static int cpuup_prepare(long cpu) 1240 { 1241 struct kmem_cache *cachep; 1242 struct kmem_cache_node *n = NULL; 1243 int node = cpu_to_mem(cpu); 1244 int err; 1245 1246 /* 1247 * We need to do this right in the beginning since 1248 * alloc_arraycache's are going to use this list. 
1249 * kmalloc_node allows us to add the slab to the right 1250 * kmem_cache_node and not this cpu's kmem_cache_node 1251 */ 1252 err = init_cache_node_node(node); 1253 if (err < 0) 1254 goto bad; 1255 1256 /* 1257 * Now we can go ahead with allocating the shared arrays and 1258 * array caches 1259 */ 1260 list_for_each_entry(cachep, &slab_caches, list) { 1261 struct array_cache *nc; 1262 struct array_cache *shared = NULL; 1263 struct array_cache **alien = NULL; 1264 1265 nc = alloc_arraycache(node, cachep->limit, 1266 cachep->batchcount, GFP_KERNEL); 1267 if (!nc) 1268 goto bad; 1269 if (cachep->shared) { 1270 shared = alloc_arraycache(node, 1271 cachep->shared * cachep->batchcount, 1272 0xbaadf00d, GFP_KERNEL); 1273 if (!shared) { 1274 kfree(nc); 1275 goto bad; 1276 } 1277 } 1278 if (use_alien_caches) { 1279 alien = alloc_alien_cache(node, cachep->limit, GFP_KERNEL); 1280 if (!alien) { 1281 kfree(shared); 1282 kfree(nc); 1283 goto bad; 1284 } 1285 } 1286 cachep->array[cpu] = nc; 1287 n = cachep->node[node]; 1288 BUG_ON(!n); 1289 1290 spin_lock_irq(&n->list_lock); 1291 if (!n->shared) { 1292 /* 1293 * We are serialised from CPU_DEAD or 1294 * CPU_UP_CANCELLED by the cpucontrol lock 1295 */ 1296 n->shared = shared; 1297 shared = NULL; 1298 } 1299 #ifdef CONFIG_NUMA 1300 if (!n->alien) { 1301 n->alien = alien; 1302 alien = NULL; 1303 } 1304 #endif 1305 spin_unlock_irq(&n->list_lock); 1306 kfree(shared); 1307 free_alien_cache(alien); 1308 if (cachep->flags & SLAB_DEBUG_OBJECTS) 1309 slab_set_debugobj_lock_classes_node(cachep, node); 1310 else if (!OFF_SLAB(cachep) && 1311 !(cachep->flags & SLAB_DESTROY_BY_RCU)) 1312 on_slab_lock_classes_node(cachep, node); 1313 } 1314 init_node_lock_keys(node); 1315 1316 return 0; 1317 bad: 1318 cpuup_canceled(cpu); 1319 return -ENOMEM; 1320 } 1321 1322 static int cpuup_callback(struct notifier_block *nfb, 1323 unsigned long action, void *hcpu) 1324 { 1325 long cpu = (long)hcpu; 1326 int err = 0; 1327 1328 switch (action) { 1329 case CPU_UP_PREPARE: 1330 case CPU_UP_PREPARE_FROZEN: 1331 mutex_lock(&slab_mutex); 1332 err = cpuup_prepare(cpu); 1333 mutex_unlock(&slab_mutex); 1334 break; 1335 case CPU_ONLINE: 1336 case CPU_ONLINE_FROZEN: 1337 start_cpu_timer(cpu); 1338 break; 1339 #ifdef CONFIG_HOTPLUG_CPU 1340 case CPU_DOWN_PREPARE: 1341 case CPU_DOWN_PREPARE_FROZEN: 1342 /* 1343 * Shutdown cache reaper. Note that the slab_mutex is 1344 * held so that if cache_reap() is invoked it cannot do 1345 * anything expensive but will only modify reap_work 1346 * and reschedule the timer. 1347 */ 1348 cancel_delayed_work_sync(&per_cpu(slab_reap_work, cpu)); 1349 /* Now the cache_reaper is guaranteed to be not running. */ 1350 per_cpu(slab_reap_work, cpu).work.func = NULL; 1351 break; 1352 case CPU_DOWN_FAILED: 1353 case CPU_DOWN_FAILED_FROZEN: 1354 start_cpu_timer(cpu); 1355 break; 1356 case CPU_DEAD: 1357 case CPU_DEAD_FROZEN: 1358 /* 1359 * Even if all the cpus of a node are down, we don't free the 1360 * kmem_cache_node of any cache. This to avoid a race between 1361 * cpu_down, and a kmalloc allocation from another cpu for 1362 * memory from the node of the cpu going down. The node 1363 * structure is usually allocated from kmem_cache_create() and 1364 * gets destroyed at kmem_cache_destroy(). 
1365 */ 1366 /* fall through */ 1367 #endif 1368 case CPU_UP_CANCELED: 1369 case CPU_UP_CANCELED_FROZEN: 1370 mutex_lock(&slab_mutex); 1371 cpuup_canceled(cpu); 1372 mutex_unlock(&slab_mutex); 1373 break; 1374 } 1375 return notifier_from_errno(err); 1376 } 1377 1378 static struct notifier_block cpucache_notifier = { 1379 &cpuup_callback, NULL, 0 1380 }; 1381 1382 #if defined(CONFIG_NUMA) && defined(CONFIG_MEMORY_HOTPLUG) 1383 /* 1384 * Drains freelist for a node on each slab cache, used for memory hot-remove. 1385 * Returns -EBUSY if all objects cannot be drained so that the node is not 1386 * removed. 1387 * 1388 * Must hold slab_mutex. 1389 */ 1390 static int __meminit drain_cache_node_node(int node) 1391 { 1392 struct kmem_cache *cachep; 1393 int ret = 0; 1394 1395 list_for_each_entry(cachep, &slab_caches, list) { 1396 struct kmem_cache_node *n; 1397 1398 n = cachep->node[node]; 1399 if (!n) 1400 continue; 1401 1402 drain_freelist(cachep, n, slabs_tofree(cachep, n)); 1403 1404 if (!list_empty(&n->slabs_full) || 1405 !list_empty(&n->slabs_partial)) { 1406 ret = -EBUSY; 1407 break; 1408 } 1409 } 1410 return ret; 1411 } 1412 1413 static int __meminit slab_memory_callback(struct notifier_block *self, 1414 unsigned long action, void *arg) 1415 { 1416 struct memory_notify *mnb = arg; 1417 int ret = 0; 1418 int nid; 1419 1420 nid = mnb->status_change_nid; 1421 if (nid < 0) 1422 goto out; 1423 1424 switch (action) { 1425 case MEM_GOING_ONLINE: 1426 mutex_lock(&slab_mutex); 1427 ret = init_cache_node_node(nid); 1428 mutex_unlock(&slab_mutex); 1429 break; 1430 case MEM_GOING_OFFLINE: 1431 mutex_lock(&slab_mutex); 1432 ret = drain_cache_node_node(nid); 1433 mutex_unlock(&slab_mutex); 1434 break; 1435 case MEM_ONLINE: 1436 case MEM_OFFLINE: 1437 case MEM_CANCEL_ONLINE: 1438 case MEM_CANCEL_OFFLINE: 1439 break; 1440 } 1441 out: 1442 return notifier_from_errno(ret); 1443 } 1444 #endif /* CONFIG_NUMA && CONFIG_MEMORY_HOTPLUG */ 1445 1446 /* 1447 * swap the static kmem_cache_node with kmalloced memory 1448 */ 1449 static void __init init_list(struct kmem_cache *cachep, struct kmem_cache_node *list, 1450 int nodeid) 1451 { 1452 struct kmem_cache_node *ptr; 1453 1454 ptr = kmalloc_node(sizeof(struct kmem_cache_node), GFP_NOWAIT, nodeid); 1455 BUG_ON(!ptr); 1456 1457 memcpy(ptr, list, sizeof(struct kmem_cache_node)); 1458 /* 1459 * Do not assume that spinlocks can be initialized via memcpy: 1460 */ 1461 spin_lock_init(&ptr->list_lock); 1462 1463 MAKE_ALL_LISTS(cachep, ptr, nodeid); 1464 cachep->node[nodeid] = ptr; 1465 } 1466 1467 /* 1468 * For setting up all the kmem_cache_node for cache whose buffer_size is same as 1469 * size of kmem_cache_node. 1470 */ 1471 static void __init set_up_node(struct kmem_cache *cachep, int index) 1472 { 1473 int node; 1474 1475 for_each_online_node(node) { 1476 cachep->node[node] = &init_kmem_cache_node[index + node]; 1477 cachep->node[node]->next_reap = jiffies + 1478 REAPTIMEOUT_NODE + 1479 ((unsigned long)cachep) % REAPTIMEOUT_NODE; 1480 } 1481 } 1482 1483 /* 1484 * The memory after the last cpu cache pointer is used for the 1485 * the node pointer. 1486 */ 1487 static void setup_node_pointer(struct kmem_cache *cachep) 1488 { 1489 cachep->node = (struct kmem_cache_node **)&cachep->array[nr_cpu_ids]; 1490 } 1491 1492 /* 1493 * Initialisation. Called after the page allocator have been initialised and 1494 * before smp_init(). 
1495 */ 1496 void __init kmem_cache_init(void) 1497 { 1498 int i; 1499 1500 BUILD_BUG_ON(sizeof(((struct page *)NULL)->lru) < 1501 sizeof(struct rcu_head)); 1502 kmem_cache = &kmem_cache_boot; 1503 setup_node_pointer(kmem_cache); 1504 1505 if (num_possible_nodes() == 1) 1506 use_alien_caches = 0; 1507 1508 for (i = 0; i < NUM_INIT_LISTS; i++) 1509 kmem_cache_node_init(&init_kmem_cache_node[i]); 1510 1511 set_up_node(kmem_cache, CACHE_CACHE); 1512 1513 /* 1514 * Fragmentation resistance on low memory - only use bigger 1515 * page orders on machines with more than 32MB of memory if 1516 * not overridden on the command line. 1517 */ 1518 if (!slab_max_order_set && totalram_pages > (32 << 20) >> PAGE_SHIFT) 1519 slab_max_order = SLAB_MAX_ORDER_HI; 1520 1521 /* Bootstrap is tricky, because several objects are allocated 1522 * from caches that do not exist yet: 1523 * 1) initialize the kmem_cache cache: it contains the struct 1524 * kmem_cache structures of all caches, except kmem_cache itself: 1525 * kmem_cache is statically allocated. 1526 * Initially an __init data area is used for the head array and the 1527 * kmem_cache_node structures, it's replaced with a kmalloc allocated 1528 * array at the end of the bootstrap. 1529 * 2) Create the first kmalloc cache. 1530 * The struct kmem_cache for the new cache is allocated normally. 1531 * An __init data area is used for the head array. 1532 * 3) Create the remaining kmalloc caches, with minimally sized 1533 * head arrays. 1534 * 4) Replace the __init data head arrays for kmem_cache and the first 1535 * kmalloc cache with kmalloc allocated arrays. 1536 * 5) Replace the __init data for kmem_cache_node for kmem_cache and 1537 * the other cache's with kmalloc allocated memory. 1538 * 6) Resize the head arrays of the kmalloc caches to their final sizes. 1539 */ 1540 1541 /* 1) create the kmem_cache */ 1542 1543 /* 1544 * struct kmem_cache size depends on nr_node_ids & nr_cpu_ids 1545 */ 1546 create_boot_cache(kmem_cache, "kmem_cache", 1547 offsetof(struct kmem_cache, array[nr_cpu_ids]) + 1548 nr_node_ids * sizeof(struct kmem_cache_node *), 1549 SLAB_HWCACHE_ALIGN); 1550 list_add(&kmem_cache->list, &slab_caches); 1551 1552 /* 2+3) create the kmalloc caches */ 1553 1554 /* 1555 * Initialize the caches that provide memory for the array cache and the 1556 * kmem_cache_node structures first. Without this, further allocations will 1557 * bug. 
1558 */ 1559 1560 kmalloc_caches[INDEX_AC] = create_kmalloc_cache("kmalloc-ac", 1561 kmalloc_size(INDEX_AC), ARCH_KMALLOC_FLAGS); 1562 1563 if (INDEX_AC != INDEX_NODE) 1564 kmalloc_caches[INDEX_NODE] = 1565 create_kmalloc_cache("kmalloc-node", 1566 kmalloc_size(INDEX_NODE), ARCH_KMALLOC_FLAGS); 1567 1568 slab_early_init = 0; 1569 1570 /* 4) Replace the bootstrap head arrays */ 1571 { 1572 struct array_cache *ptr; 1573 1574 ptr = kmalloc(sizeof(struct arraycache_init), GFP_NOWAIT); 1575 1576 memcpy(ptr, cpu_cache_get(kmem_cache), 1577 sizeof(struct arraycache_init)); 1578 /* 1579 * Do not assume that spinlocks can be initialized via memcpy: 1580 */ 1581 spin_lock_init(&ptr->lock); 1582 1583 kmem_cache->array[smp_processor_id()] = ptr; 1584 1585 ptr = kmalloc(sizeof(struct arraycache_init), GFP_NOWAIT); 1586 1587 BUG_ON(cpu_cache_get(kmalloc_caches[INDEX_AC]) 1588 != &initarray_generic.cache); 1589 memcpy(ptr, cpu_cache_get(kmalloc_caches[INDEX_AC]), 1590 sizeof(struct arraycache_init)); 1591 /* 1592 * Do not assume that spinlocks can be initialized via memcpy: 1593 */ 1594 spin_lock_init(&ptr->lock); 1595 1596 kmalloc_caches[INDEX_AC]->array[smp_processor_id()] = ptr; 1597 } 1598 /* 5) Replace the bootstrap kmem_cache_node */ 1599 { 1600 int nid; 1601 1602 for_each_online_node(nid) { 1603 init_list(kmem_cache, &init_kmem_cache_node[CACHE_CACHE + nid], nid); 1604 1605 init_list(kmalloc_caches[INDEX_AC], 1606 &init_kmem_cache_node[SIZE_AC + nid], nid); 1607 1608 if (INDEX_AC != INDEX_NODE) { 1609 init_list(kmalloc_caches[INDEX_NODE], 1610 &init_kmem_cache_node[SIZE_NODE + nid], nid); 1611 } 1612 } 1613 } 1614 1615 create_kmalloc_caches(ARCH_KMALLOC_FLAGS); 1616 } 1617 1618 void __init kmem_cache_init_late(void) 1619 { 1620 struct kmem_cache *cachep; 1621 1622 slab_state = UP; 1623 1624 /* 6) resize the head arrays to their final sizes */ 1625 mutex_lock(&slab_mutex); 1626 list_for_each_entry(cachep, &slab_caches, list) 1627 if (enable_cpucache(cachep, GFP_NOWAIT)) 1628 BUG(); 1629 mutex_unlock(&slab_mutex); 1630 1631 /* Annotate slab for lockdep -- annotate the malloc caches */ 1632 init_lock_keys(); 1633 1634 /* Done! */ 1635 slab_state = FULL; 1636 1637 /* 1638 * Register a cpu startup notifier callback that initializes 1639 * cpu_cache_get for all new cpus 1640 */ 1641 register_cpu_notifier(&cpucache_notifier); 1642 1643 #ifdef CONFIG_NUMA 1644 /* 1645 * Register a memory hotplug callback that initializes and frees 1646 * node. 1647 */ 1648 hotplug_memory_notifier(slab_memory_callback, SLAB_CALLBACK_PRI); 1649 #endif 1650 1651 /* 1652 * The reap timers are started later, with a module init call: That part 1653 * of the kernel is not yet operational. 1654 */ 1655 } 1656 1657 static int __init cpucache_init(void) 1658 { 1659 int cpu; 1660 1661 /* 1662 * Register the timers that return unneeded pages to the page allocator 1663 */ 1664 for_each_online_cpu(cpu) 1665 start_cpu_timer(cpu); 1666 1667 /* Done! 
*/ 1668 slab_state = FULL; 1669 return 0; 1670 } 1671 __initcall(cpucache_init); 1672 1673 static noinline void 1674 slab_out_of_memory(struct kmem_cache *cachep, gfp_t gfpflags, int nodeid) 1675 { 1676 #if DEBUG 1677 struct kmem_cache_node *n; 1678 struct page *page; 1679 unsigned long flags; 1680 int node; 1681 static DEFINE_RATELIMIT_STATE(slab_oom_rs, DEFAULT_RATELIMIT_INTERVAL, 1682 DEFAULT_RATELIMIT_BURST); 1683 1684 if ((gfpflags & __GFP_NOWARN) || !__ratelimit(&slab_oom_rs)) 1685 return; 1686 1687 printk(KERN_WARNING 1688 "SLAB: Unable to allocate memory on node %d (gfp=0x%x)\n", 1689 nodeid, gfpflags); 1690 printk(KERN_WARNING " cache: %s, object size: %d, order: %d\n", 1691 cachep->name, cachep->size, cachep->gfporder); 1692 1693 for_each_online_node(node) { 1694 unsigned long active_objs = 0, num_objs = 0, free_objects = 0; 1695 unsigned long active_slabs = 0, num_slabs = 0; 1696 1697 n = cachep->node[node]; 1698 if (!n) 1699 continue; 1700 1701 spin_lock_irqsave(&n->list_lock, flags); 1702 list_for_each_entry(page, &n->slabs_full, lru) { 1703 active_objs += cachep->num; 1704 active_slabs++; 1705 } 1706 list_for_each_entry(page, &n->slabs_partial, lru) { 1707 active_objs += page->active; 1708 active_slabs++; 1709 } 1710 list_for_each_entry(page, &n->slabs_free, lru) 1711 num_slabs++; 1712 1713 free_objects += n->free_objects; 1714 spin_unlock_irqrestore(&n->list_lock, flags); 1715 1716 num_slabs += active_slabs; 1717 num_objs = num_slabs * cachep->num; 1718 printk(KERN_WARNING 1719 " node %d: slabs: %ld/%ld, objs: %ld/%ld, free: %ld\n", 1720 node, active_slabs, num_slabs, active_objs, num_objs, 1721 free_objects); 1722 } 1723 #endif 1724 } 1725 1726 /* 1727 * Interface to system's page allocator. No need to hold the cache-lock. 1728 * 1729 * If we requested dmaable memory, we will get it. Even if we 1730 * did not request dmaable memory, we might get it, but that 1731 * would be relatively rare and ignorable. 1732 */ 1733 static struct page *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, 1734 int nodeid) 1735 { 1736 struct page *page; 1737 int nr_pages; 1738 1739 flags |= cachep->allocflags; 1740 if (cachep->flags & SLAB_RECLAIM_ACCOUNT) 1741 flags |= __GFP_RECLAIMABLE; 1742 1743 if (memcg_charge_slab(cachep, flags, cachep->gfporder)) 1744 return NULL; 1745 1746 page = alloc_pages_exact_node(nodeid, flags | __GFP_NOTRACK, cachep->gfporder); 1747 if (!page) { 1748 memcg_uncharge_slab(cachep, cachep->gfporder); 1749 slab_out_of_memory(cachep, flags, nodeid); 1750 return NULL; 1751 } 1752 1753 /* Record if ALLOC_NO_WATERMARKS was set when allocating the slab */ 1754 if (unlikely(page->pfmemalloc)) 1755 pfmemalloc_active = true; 1756 1757 nr_pages = (1 << cachep->gfporder); 1758 if (cachep->flags & SLAB_RECLAIM_ACCOUNT) 1759 add_zone_page_state(page_zone(page), 1760 NR_SLAB_RECLAIMABLE, nr_pages); 1761 else 1762 add_zone_page_state(page_zone(page), 1763 NR_SLAB_UNRECLAIMABLE, nr_pages); 1764 __SetPageSlab(page); 1765 if (page->pfmemalloc) 1766 SetPageSlabPfmemalloc(page); 1767 1768 if (kmemcheck_enabled && !(cachep->flags & SLAB_NOTRACK)) { 1769 kmemcheck_alloc_shadow(page, cachep->gfporder, flags, nodeid); 1770 1771 if (cachep->ctor) 1772 kmemcheck_mark_uninitialized_pages(page, nr_pages); 1773 else 1774 kmemcheck_mark_unallocated_pages(page, nr_pages); 1775 } 1776 1777 return page; 1778 } 1779 1780 /* 1781 * Interface to system's page release. 
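 * Undoes the accounting done in kmem_getpages(): the NR_SLAB_* zone counters
 * are decremented, the memcg charge is released and the freed pages are
 * credited to the caller's reclaim_state.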
1782 */ 1783 static void kmem_freepages(struct kmem_cache *cachep, struct page *page) 1784 { 1785 const unsigned long nr_freed = (1 << cachep->gfporder); 1786 1787 kmemcheck_free_shadow(page, cachep->gfporder); 1788 1789 if (cachep->flags & SLAB_RECLAIM_ACCOUNT) 1790 sub_zone_page_state(page_zone(page), 1791 NR_SLAB_RECLAIMABLE, nr_freed); 1792 else 1793 sub_zone_page_state(page_zone(page), 1794 NR_SLAB_UNRECLAIMABLE, nr_freed); 1795 1796 BUG_ON(!PageSlab(page)); 1797 __ClearPageSlabPfmemalloc(page); 1798 __ClearPageSlab(page); 1799 page_mapcount_reset(page); 1800 page->mapping = NULL; 1801 1802 if (current->reclaim_state) 1803 current->reclaim_state->reclaimed_slab += nr_freed; 1804 __free_pages(page, cachep->gfporder); 1805 memcg_uncharge_slab(cachep, cachep->gfporder); 1806 } 1807 1808 static void kmem_rcu_free(struct rcu_head *head) 1809 { 1810 struct kmem_cache *cachep; 1811 struct page *page; 1812 1813 page = container_of(head, struct page, rcu_head); 1814 cachep = page->slab_cache; 1815 1816 kmem_freepages(cachep, page); 1817 } 1818 1819 #if DEBUG 1820 1821 #ifdef CONFIG_DEBUG_PAGEALLOC 1822 static void store_stackinfo(struct kmem_cache *cachep, unsigned long *addr, 1823 unsigned long caller) 1824 { 1825 int size = cachep->object_size; 1826 1827 addr = (unsigned long *)&((char *)addr)[obj_offset(cachep)]; 1828 1829 if (size < 5 * sizeof(unsigned long)) 1830 return; 1831 1832 *addr++ = 0x12345678; 1833 *addr++ = caller; 1834 *addr++ = smp_processor_id(); 1835 size -= 3 * sizeof(unsigned long); 1836 { 1837 unsigned long *sptr = &caller; 1838 unsigned long svalue; 1839 1840 while (!kstack_end(sptr)) { 1841 svalue = *sptr++; 1842 if (kernel_text_address(svalue)) { 1843 *addr++ = svalue; 1844 size -= sizeof(unsigned long); 1845 if (size <= sizeof(unsigned long)) 1846 break; 1847 } 1848 } 1849 1850 } 1851 *addr++ = 0x87654321; 1852 } 1853 #endif 1854 1855 static void poison_obj(struct kmem_cache *cachep, void *addr, unsigned char val) 1856 { 1857 int size = cachep->object_size; 1858 addr = &((char *)addr)[obj_offset(cachep)]; 1859 1860 memset(addr, val, size); 1861 *(unsigned char *)(addr + size - 1) = POISON_END; 1862 } 1863 1864 static void dump_line(char *data, int offset, int limit) 1865 { 1866 int i; 1867 unsigned char error = 0; 1868 int bad_count = 0; 1869 1870 printk(KERN_ERR "%03x: ", offset); 1871 for (i = 0; i < limit; i++) { 1872 if (data[offset + i] != POISON_FREE) { 1873 error = data[offset + i]; 1874 bad_count++; 1875 } 1876 } 1877 print_hex_dump(KERN_CONT, "", 0, 16, 1, 1878 &data[offset], limit, 1); 1879 1880 if (bad_count == 1) { 1881 error ^= POISON_FREE; 1882 if (!(error & (error - 1))) { 1883 printk(KERN_ERR "Single bit error detected. 
Probably " 1884 "bad RAM.\n"); 1885 #ifdef CONFIG_X86 1886 printk(KERN_ERR "Run memtest86+ or a similar memory " 1887 "test tool.\n"); 1888 #else 1889 printk(KERN_ERR "Run a memory test tool.\n"); 1890 #endif 1891 } 1892 } 1893 } 1894 #endif 1895 1896 #if DEBUG 1897 1898 static void print_objinfo(struct kmem_cache *cachep, void *objp, int lines) 1899 { 1900 int i, size; 1901 char *realobj; 1902 1903 if (cachep->flags & SLAB_RED_ZONE) { 1904 printk(KERN_ERR "Redzone: 0x%llx/0x%llx.\n", 1905 *dbg_redzone1(cachep, objp), 1906 *dbg_redzone2(cachep, objp)); 1907 } 1908 1909 if (cachep->flags & SLAB_STORE_USER) { 1910 printk(KERN_ERR "Last user: [<%p>](%pSR)\n", 1911 *dbg_userword(cachep, objp), 1912 *dbg_userword(cachep, objp)); 1913 } 1914 realobj = (char *)objp + obj_offset(cachep); 1915 size = cachep->object_size; 1916 for (i = 0; i < size && lines; i += 16, lines--) { 1917 int limit; 1918 limit = 16; 1919 if (i + limit > size) 1920 limit = size - i; 1921 dump_line(realobj, i, limit); 1922 } 1923 } 1924 1925 static void check_poison_obj(struct kmem_cache *cachep, void *objp) 1926 { 1927 char *realobj; 1928 int size, i; 1929 int lines = 0; 1930 1931 realobj = (char *)objp + obj_offset(cachep); 1932 size = cachep->object_size; 1933 1934 for (i = 0; i < size; i++) { 1935 char exp = POISON_FREE; 1936 if (i == size - 1) 1937 exp = POISON_END; 1938 if (realobj[i] != exp) { 1939 int limit; 1940 /* Mismatch ! */ 1941 /* Print header */ 1942 if (lines == 0) { 1943 printk(KERN_ERR 1944 "Slab corruption (%s): %s start=%p, len=%d\n", 1945 print_tainted(), cachep->name, realobj, size); 1946 print_objinfo(cachep, objp, 0); 1947 } 1948 /* Hexdump the affected line */ 1949 i = (i / 16) * 16; 1950 limit = 16; 1951 if (i + limit > size) 1952 limit = size - i; 1953 dump_line(realobj, i, limit); 1954 i += 16; 1955 lines++; 1956 /* Limit to 5 lines */ 1957 if (lines > 5) 1958 break; 1959 } 1960 } 1961 if (lines != 0) { 1962 /* Print some data about the neighboring objects, if they 1963 * exist: 1964 */ 1965 struct page *page = virt_to_head_page(objp); 1966 unsigned int objnr; 1967 1968 objnr = obj_to_index(cachep, page, objp); 1969 if (objnr) { 1970 objp = index_to_obj(cachep, page, objnr - 1); 1971 realobj = (char *)objp + obj_offset(cachep); 1972 printk(KERN_ERR "Prev obj: start=%p, len=%d\n", 1973 realobj, size); 1974 print_objinfo(cachep, objp, 2); 1975 } 1976 if (objnr + 1 < cachep->num) { 1977 objp = index_to_obj(cachep, page, objnr + 1); 1978 realobj = (char *)objp + obj_offset(cachep); 1979 printk(KERN_ERR "Next obj: start=%p, len=%d\n", 1980 realobj, size); 1981 print_objinfo(cachep, objp, 2); 1982 } 1983 } 1984 } 1985 #endif 1986 1987 #if DEBUG 1988 static void slab_destroy_debugcheck(struct kmem_cache *cachep, 1989 struct page *page) 1990 { 1991 int i; 1992 for (i = 0; i < cachep->num; i++) { 1993 void *objp = index_to_obj(cachep, page, i); 1994 1995 if (cachep->flags & SLAB_POISON) { 1996 #ifdef CONFIG_DEBUG_PAGEALLOC 1997 if (cachep->size % PAGE_SIZE == 0 && 1998 OFF_SLAB(cachep)) 1999 kernel_map_pages(virt_to_page(objp), 2000 cachep->size / PAGE_SIZE, 1); 2001 else 2002 check_poison_obj(cachep, objp); 2003 #else 2004 check_poison_obj(cachep, objp); 2005 #endif 2006 } 2007 if (cachep->flags & SLAB_RED_ZONE) { 2008 if (*dbg_redzone1(cachep, objp) != RED_INACTIVE) 2009 slab_error(cachep, "start of a freed object " 2010 "was overwritten"); 2011 if (*dbg_redzone2(cachep, objp) != RED_INACTIVE) 2012 slab_error(cachep, "end of a freed object " 2013 "was overwritten"); 2014 } 2015 } 2016 } 2017 #else 2018 
static void slab_destroy_debugcheck(struct kmem_cache *cachep, 2019 struct page *page) 2020 { 2021 } 2022 #endif 2023 2024 /** 2025 * slab_destroy - destroy and release all objects in a slab 2026 * @cachep: cache pointer being destroyed 2027 * @page: page pointer being destroyed 2028 * 2029 * Destroy all the objs in a slab, and release the mem back to the system. 2030 * Before calling the slab must have been unlinked from the cache. The 2031 * cache-lock is not held/needed. 2032 */ 2033 static void slab_destroy(struct kmem_cache *cachep, struct page *page) 2034 { 2035 void *freelist; 2036 2037 freelist = page->freelist; 2038 slab_destroy_debugcheck(cachep, page); 2039 if (unlikely(cachep->flags & SLAB_DESTROY_BY_RCU)) { 2040 struct rcu_head *head; 2041 2042 /* 2043 * RCU free overloads the RCU head over the LRU. 2044 * slab_page has been overloeaded over the LRU, 2045 * however it is not used from now on so that 2046 * we can use it safely. 2047 */ 2048 head = (void *)&page->rcu_head; 2049 call_rcu(head, kmem_rcu_free); 2050 2051 } else { 2052 kmem_freepages(cachep, page); 2053 } 2054 2055 /* 2056 * From now on, we don't use freelist 2057 * although actual page can be freed in rcu context 2058 */ 2059 if (OFF_SLAB(cachep)) 2060 kmem_cache_free(cachep->freelist_cache, freelist); 2061 } 2062 2063 /** 2064 * calculate_slab_order - calculate size (page order) of slabs 2065 * @cachep: pointer to the cache that is being created 2066 * @size: size of objects to be created in this cache. 2067 * @align: required alignment for the objects. 2068 * @flags: slab allocation flags 2069 * 2070 * Also calculates the number of objects per slab. 2071 * 2072 * This could be made much more intelligent. For now, try to avoid using 2073 * high order pages for slabs. When the gfp() functions are more friendly 2074 * towards high-order requests, this should be changed. 2075 */ 2076 static size_t calculate_slab_order(struct kmem_cache *cachep, 2077 size_t size, size_t align, unsigned long flags) 2078 { 2079 unsigned long offslab_limit; 2080 size_t left_over = 0; 2081 int gfporder; 2082 2083 for (gfporder = 0; gfporder <= KMALLOC_MAX_ORDER; gfporder++) { 2084 unsigned int num; 2085 size_t remainder; 2086 2087 cache_estimate(gfporder, size, align, flags, &remainder, &num); 2088 if (!num) 2089 continue; 2090 2091 /* Can't handle number of objects more than SLAB_OBJ_MAX_NUM */ 2092 if (num > SLAB_OBJ_MAX_NUM) 2093 break; 2094 2095 if (flags & CFLGS_OFF_SLAB) { 2096 size_t freelist_size_per_obj = sizeof(freelist_idx_t); 2097 /* 2098 * Max number of objs-per-slab for caches which 2099 * use off-slab slabs. Needed to avoid a possible 2100 * looping condition in cache_grow(). 2101 */ 2102 if (IS_ENABLED(CONFIG_DEBUG_SLAB_LEAK)) 2103 freelist_size_per_obj += sizeof(char); 2104 offslab_limit = size; 2105 offslab_limit /= freelist_size_per_obj; 2106 2107 if (num > offslab_limit) 2108 break; 2109 } 2110 2111 /* Found something acceptable - save it away */ 2112 cachep->num = num; 2113 cachep->gfporder = gfporder; 2114 left_over = remainder; 2115 2116 /* 2117 * A VFS-reclaimable slab tends to have most allocations 2118 * as GFP_NOFS and we really don't want to have to be allocating 2119 * higher-order pages when we are unable to shrink dcache. 2120 */ 2121 if (flags & SLAB_RECLAIM_ACCOUNT) 2122 break; 2123 2124 /* 2125 * Large number of objects is good, but very large slabs are 2126 * currently bad for the gfp()s. 2127 */ 2128 if (gfporder >= slab_max_order) 2129 break; 2130 2131 /* 2132 * Acceptable internal fragmentation? 
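 *
 * Worked example (illustrative, not part of the original file): assuming
 * a 4096-byte page at gfporder 0, the order is accepted once the unused
 * remainder is at most PAGE_SIZE / 8 = 512 bytes; a layout wasting 600
 * bytes would make the loop try the next higher order instead (unless
 * one of the earlier break conditions applied first).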
2133 */ 2134 if (left_over * 8 <= (PAGE_SIZE << gfporder)) 2135 break; 2136 } 2137 return left_over; 2138 } 2139 2140 static int __init_refok setup_cpu_cache(struct kmem_cache *cachep, gfp_t gfp) 2141 { 2142 if (slab_state >= FULL) 2143 return enable_cpucache(cachep, gfp); 2144 2145 if (slab_state == DOWN) { 2146 /* 2147 * Note: Creation of first cache (kmem_cache). 2148 * The setup_node is taken care 2149 * of by the caller of __kmem_cache_create 2150 */ 2151 cachep->array[smp_processor_id()] = &initarray_generic.cache; 2152 slab_state = PARTIAL; 2153 } else if (slab_state == PARTIAL) { 2154 /* 2155 * Note: the second kmem_cache_create must create the cache 2156 * that's used by kmalloc(24), otherwise the creation of 2157 * further caches will BUG(). 2158 */ 2159 cachep->array[smp_processor_id()] = &initarray_generic.cache; 2160 2161 /* 2162 * If the cache that's used by kmalloc(sizeof(kmem_cache_node)) is 2163 * the second cache, then we need to set up all its node/, 2164 * otherwise the creation of further caches will BUG(). 2165 */ 2166 set_up_node(cachep, SIZE_AC); 2167 if (INDEX_AC == INDEX_NODE) 2168 slab_state = PARTIAL_NODE; 2169 else 2170 slab_state = PARTIAL_ARRAYCACHE; 2171 } else { 2172 /* Remaining boot caches */ 2173 cachep->array[smp_processor_id()] = 2174 kmalloc(sizeof(struct arraycache_init), gfp); 2175 2176 if (slab_state == PARTIAL_ARRAYCACHE) { 2177 set_up_node(cachep, SIZE_NODE); 2178 slab_state = PARTIAL_NODE; 2179 } else { 2180 int node; 2181 for_each_online_node(node) { 2182 cachep->node[node] = 2183 kmalloc_node(sizeof(struct kmem_cache_node), 2184 gfp, node); 2185 BUG_ON(!cachep->node[node]); 2186 kmem_cache_node_init(cachep->node[node]); 2187 } 2188 } 2189 } 2190 cachep->node[numa_mem_id()]->next_reap = 2191 jiffies + REAPTIMEOUT_NODE + 2192 ((unsigned long)cachep) % REAPTIMEOUT_NODE; 2193 2194 cpu_cache_get(cachep)->avail = 0; 2195 cpu_cache_get(cachep)->limit = BOOT_CPUCACHE_ENTRIES; 2196 cpu_cache_get(cachep)->batchcount = 1; 2197 cpu_cache_get(cachep)->touched = 0; 2198 cachep->batchcount = 1; 2199 cachep->limit = BOOT_CPUCACHE_ENTRIES; 2200 return 0; 2201 } 2202 2203 /** 2204 * __kmem_cache_create - Create a cache. 2205 * @cachep: cache management descriptor 2206 * @flags: SLAB flags 2207 * 2208 * Returns a ptr to the cache on success, NULL on failure. 2209 * Cannot be called within a int, but can be interrupted. 2210 * The @ctor is run when new pages are allocated by the cache. 2211 * 2212 * The flags are 2213 * 2214 * %SLAB_POISON - Poison the slab with a known test pattern (a5a5a5a5) 2215 * to catch references to uninitialised memory. 2216 * 2217 * %SLAB_RED_ZONE - Insert `Red' zones around the allocated memory to check 2218 * for buffer overruns. 2219 * 2220 * %SLAB_HWCACHE_ALIGN - Align the objects in this cache to a hardware 2221 * cacheline. This can be beneficial if you're counting cycles as closely 2222 * as davem. 2223 */ 2224 int 2225 __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags) 2226 { 2227 size_t left_over, freelist_size, ralign; 2228 gfp_t gfp; 2229 int err; 2230 size_t size = cachep->size; 2231 2232 #if DEBUG 2233 #if FORCED_DEBUG 2234 /* 2235 * Enable redzoning and last user accounting, except for caches with 2236 * large objects, if the increased size would increase the object size 2237 * above the next power of two: caches with object sizes just above a 2238 * power of two have a significant amount of internal fragmentation. 
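 *
 * Worked example (illustrative, not part of the original file): the size
 * used in the check below grows by REDZONE_ALIGN + 2 * sizeof(unsigned
 * long long), i.e. about 24 bytes on a typical 64-bit build.  A
 * 5000-byte object keeps fls(size - 1) unchanged (5000 and 5024 both
 * round up to 8192), so debugging is enabled; an 8180-byte object would
 * cross into the next power of two and is left alone to avoid nearly
 * doubling its footprint.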
2239 */ 2240 if (size < 4096 || fls(size - 1) == fls(size-1 + REDZONE_ALIGN + 2241 2 * sizeof(unsigned long long))) 2242 flags |= SLAB_RED_ZONE | SLAB_STORE_USER; 2243 if (!(flags & SLAB_DESTROY_BY_RCU)) 2244 flags |= SLAB_POISON; 2245 #endif 2246 if (flags & SLAB_DESTROY_BY_RCU) 2247 BUG_ON(flags & SLAB_POISON); 2248 #endif 2249 2250 /* 2251 * Check that size is in terms of words. This is needed to avoid 2252 * unaligned accesses for some archs when redzoning is used, and makes 2253 * sure any on-slab bufctl's are also correctly aligned. 2254 */ 2255 if (size & (BYTES_PER_WORD - 1)) { 2256 size += (BYTES_PER_WORD - 1); 2257 size &= ~(BYTES_PER_WORD - 1); 2258 } 2259 2260 /* 2261 * Redzoning and user store require word alignment or possibly larger. 2262 * Note this will be overridden by architecture or caller mandated 2263 * alignment if either is greater than BYTES_PER_WORD. 2264 */ 2265 if (flags & SLAB_STORE_USER) 2266 ralign = BYTES_PER_WORD; 2267 2268 if (flags & SLAB_RED_ZONE) { 2269 ralign = REDZONE_ALIGN; 2270 /* If redzoning, ensure that the second redzone is suitably 2271 * aligned, by adjusting the object size accordingly. */ 2272 size += REDZONE_ALIGN - 1; 2273 size &= ~(REDZONE_ALIGN - 1); 2274 } 2275 2276 /* 3) caller mandated alignment */ 2277 if (ralign < cachep->align) { 2278 ralign = cachep->align; 2279 } 2280 /* disable debug if necessary */ 2281 if (ralign > __alignof__(unsigned long long)) 2282 flags &= ~(SLAB_RED_ZONE | SLAB_STORE_USER); 2283 /* 2284 * 4) Store it. 2285 */ 2286 cachep->align = ralign; 2287 2288 if (slab_is_available()) 2289 gfp = GFP_KERNEL; 2290 else 2291 gfp = GFP_NOWAIT; 2292 2293 setup_node_pointer(cachep); 2294 #if DEBUG 2295 2296 /* 2297 * Both debugging options require word-alignment which is calculated 2298 * into align above. 2299 */ 2300 if (flags & SLAB_RED_ZONE) { 2301 /* add space for red zone words */ 2302 cachep->obj_offset += sizeof(unsigned long long); 2303 size += 2 * sizeof(unsigned long long); 2304 } 2305 if (flags & SLAB_STORE_USER) { 2306 /* user store requires one word storage behind the end of 2307 * the real object. But if the second red zone needs to be 2308 * aligned to 64 bits, we must allow that much space. 2309 */ 2310 if (flags & SLAB_RED_ZONE) 2311 size += REDZONE_ALIGN; 2312 else 2313 size += BYTES_PER_WORD; 2314 } 2315 #if FORCED_DEBUG && defined(CONFIG_DEBUG_PAGEALLOC) 2316 if (size >= kmalloc_size(INDEX_NODE + 1) 2317 && cachep->object_size > cache_line_size() 2318 && ALIGN(size, cachep->align) < PAGE_SIZE) { 2319 cachep->obj_offset += PAGE_SIZE - ALIGN(size, cachep->align); 2320 size = PAGE_SIZE; 2321 } 2322 #endif 2323 #endif 2324 2325 /* 2326 * Determine if the slab management is 'on' or 'off' slab. 2327 * (bootstrapping cannot cope with offslab caches so don't do 2328 * it too early on. Always use on-slab management when 2329 * SLAB_NOLEAKTRACE to avoid recursive calls into kmemleak) 2330 */ 2331 if ((size >= (PAGE_SIZE >> 5)) && !slab_early_init && 2332 !(flags & SLAB_NOLEAKTRACE)) 2333 /* 2334 * Size is large, assume best to place the slab management obj 2335 * off-slab (should allow better packing of objs). 2336 */ 2337 flags |= CFLGS_OFF_SLAB; 2338 2339 size = ALIGN(size, cachep->align); 2340 /* 2341 * We should restrict the number of objects in a slab to implement 2342 * byte sized index. Refer comment on SLAB_OBJ_MIN_SIZE definition. 
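 *
 * Illustrative note (not part of the original file): when freelist_idx_t
 * is a single byte, a slab can describe at most SLAB_OBJ_MAX_NUM = 255
 * objects, so objects smaller than SLAB_OBJ_MIN_SIZE are padded up to
 * keep the per-slab object count representable in that byte.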
2343 */ 2344 if (FREELIST_BYTE_INDEX && size < SLAB_OBJ_MIN_SIZE) 2345 size = ALIGN(SLAB_OBJ_MIN_SIZE, cachep->align); 2346 2347 left_over = calculate_slab_order(cachep, size, cachep->align, flags); 2348 2349 if (!cachep->num) 2350 return -E2BIG; 2351 2352 freelist_size = calculate_freelist_size(cachep->num, cachep->align); 2353 2354 /* 2355 * If the slab has been placed off-slab, and we have enough space then 2356 * move it on-slab. This is at the expense of any extra colouring. 2357 */ 2358 if (flags & CFLGS_OFF_SLAB && left_over >= freelist_size) { 2359 flags &= ~CFLGS_OFF_SLAB; 2360 left_over -= freelist_size; 2361 } 2362 2363 if (flags & CFLGS_OFF_SLAB) { 2364 /* really off slab. No need for manual alignment */ 2365 freelist_size = calculate_freelist_size(cachep->num, 0); 2366 2367 #ifdef CONFIG_PAGE_POISONING 2368 /* If we're going to use the generic kernel_map_pages() 2369 * poisoning, then it's going to smash the contents of 2370 * the redzone and userword anyhow, so switch them off. 2371 */ 2372 if (size % PAGE_SIZE == 0 && flags & SLAB_POISON) 2373 flags &= ~(SLAB_RED_ZONE | SLAB_STORE_USER); 2374 #endif 2375 } 2376 2377 cachep->colour_off = cache_line_size(); 2378 /* Offset must be a multiple of the alignment. */ 2379 if (cachep->colour_off < cachep->align) 2380 cachep->colour_off = cachep->align; 2381 cachep->colour = left_over / cachep->colour_off; 2382 cachep->freelist_size = freelist_size; 2383 cachep->flags = flags; 2384 cachep->allocflags = __GFP_COMP; 2385 if (CONFIG_ZONE_DMA_FLAG && (flags & SLAB_CACHE_DMA)) 2386 cachep->allocflags |= GFP_DMA; 2387 cachep->size = size; 2388 cachep->reciprocal_buffer_size = reciprocal_value(size); 2389 2390 if (flags & CFLGS_OFF_SLAB) { 2391 cachep->freelist_cache = kmalloc_slab(freelist_size, 0u); 2392 /* 2393 * This is a possibility for one of the kmalloc_{dma,}_caches. 2394 * But since we go off slab only for object size greater than 2395 * PAGE_SIZE/8, and kmalloc_{dma,}_caches get created 2396 * in ascending order,this should not happen at all. 2397 * But leave a BUG_ON for some lucky dude. 2398 */ 2399 BUG_ON(ZERO_OR_NULL_PTR(cachep->freelist_cache)); 2400 } 2401 2402 err = setup_cpu_cache(cachep, gfp); 2403 if (err) { 2404 __kmem_cache_shutdown(cachep); 2405 return err; 2406 } 2407 2408 if (flags & SLAB_DEBUG_OBJECTS) { 2409 /* 2410 * Would deadlock through slab_destroy()->call_rcu()-> 2411 * debug_object_activate()->kmem_cache_alloc(). 
2412 */ 2413 WARN_ON_ONCE(flags & SLAB_DESTROY_BY_RCU); 2414 2415 slab_set_debugobj_lock_classes(cachep); 2416 } else if (!OFF_SLAB(cachep) && !(flags & SLAB_DESTROY_BY_RCU)) 2417 on_slab_lock_classes(cachep); 2418 2419 return 0; 2420 } 2421 2422 #if DEBUG 2423 static void check_irq_off(void) 2424 { 2425 BUG_ON(!irqs_disabled()); 2426 } 2427 2428 static void check_irq_on(void) 2429 { 2430 BUG_ON(irqs_disabled()); 2431 } 2432 2433 static void check_spinlock_acquired(struct kmem_cache *cachep) 2434 { 2435 #ifdef CONFIG_SMP 2436 check_irq_off(); 2437 assert_spin_locked(&cachep->node[numa_mem_id()]->list_lock); 2438 #endif 2439 } 2440 2441 static void check_spinlock_acquired_node(struct kmem_cache *cachep, int node) 2442 { 2443 #ifdef CONFIG_SMP 2444 check_irq_off(); 2445 assert_spin_locked(&cachep->node[node]->list_lock); 2446 #endif 2447 } 2448 2449 #else 2450 #define check_irq_off() do { } while(0) 2451 #define check_irq_on() do { } while(0) 2452 #define check_spinlock_acquired(x) do { } while(0) 2453 #define check_spinlock_acquired_node(x, y) do { } while(0) 2454 #endif 2455 2456 static void drain_array(struct kmem_cache *cachep, struct kmem_cache_node *n, 2457 struct array_cache *ac, 2458 int force, int node); 2459 2460 static void do_drain(void *arg) 2461 { 2462 struct kmem_cache *cachep = arg; 2463 struct array_cache *ac; 2464 int node = numa_mem_id(); 2465 2466 check_irq_off(); 2467 ac = cpu_cache_get(cachep); 2468 spin_lock(&cachep->node[node]->list_lock); 2469 free_block(cachep, ac->entry, ac->avail, node); 2470 spin_unlock(&cachep->node[node]->list_lock); 2471 ac->avail = 0; 2472 } 2473 2474 static void drain_cpu_caches(struct kmem_cache *cachep) 2475 { 2476 struct kmem_cache_node *n; 2477 int node; 2478 2479 on_each_cpu(do_drain, cachep, 1); 2480 check_irq_on(); 2481 for_each_online_node(node) { 2482 n = cachep->node[node]; 2483 if (n && n->alien) 2484 drain_alien_cache(cachep, n->alien); 2485 } 2486 2487 for_each_online_node(node) { 2488 n = cachep->node[node]; 2489 if (n) 2490 drain_array(cachep, n, n->shared, 1, node); 2491 } 2492 } 2493 2494 /* 2495 * Remove slabs from the list of free slabs. 2496 * Specify the number of slabs to drain in tofree. 2497 * 2498 * Returns the actual number of slabs released. 2499 */ 2500 static int drain_freelist(struct kmem_cache *cache, 2501 struct kmem_cache_node *n, int tofree) 2502 { 2503 struct list_head *p; 2504 int nr_freed; 2505 struct page *page; 2506 2507 nr_freed = 0; 2508 while (nr_freed < tofree && !list_empty(&n->slabs_free)) { 2509 2510 spin_lock_irq(&n->list_lock); 2511 p = n->slabs_free.prev; 2512 if (p == &n->slabs_free) { 2513 spin_unlock_irq(&n->list_lock); 2514 goto out; 2515 } 2516 2517 page = list_entry(p, struct page, lru); 2518 #if DEBUG 2519 BUG_ON(page->active); 2520 #endif 2521 list_del(&page->lru); 2522 /* 2523 * Safe to drop the lock. The slab is no longer linked 2524 * to the cache. 2525 */ 2526 n->free_objects -= cache->num; 2527 spin_unlock_irq(&n->list_lock); 2528 slab_destroy(cache, page); 2529 nr_freed++; 2530 } 2531 out: 2532 return nr_freed; 2533 } 2534 2535 int __kmem_cache_shrink(struct kmem_cache *cachep) 2536 { 2537 int ret = 0, i = 0; 2538 struct kmem_cache_node *n; 2539 2540 drain_cpu_caches(cachep); 2541 2542 check_irq_on(); 2543 for_each_online_node(i) { 2544 n = cachep->node[i]; 2545 if (!n) 2546 continue; 2547 2548 drain_freelist(cachep, n, slabs_tofree(cachep, n)); 2549 2550 ret += !list_empty(&n->slabs_full) || 2551 !list_empty(&n->slabs_partial); 2552 } 2553 return (ret ? 
1 : 0); 2554 } 2555 2556 int __kmem_cache_shutdown(struct kmem_cache *cachep) 2557 { 2558 int i; 2559 struct kmem_cache_node *n; 2560 int rc = __kmem_cache_shrink(cachep); 2561 2562 if (rc) 2563 return rc; 2564 2565 for_each_online_cpu(i) 2566 kfree(cachep->array[i]); 2567 2568 /* NUMA: free the node structures */ 2569 for_each_online_node(i) { 2570 n = cachep->node[i]; 2571 if (n) { 2572 kfree(n->shared); 2573 free_alien_cache(n->alien); 2574 kfree(n); 2575 } 2576 } 2577 return 0; 2578 } 2579 2580 /* 2581 * Get the memory for a slab management obj. 2582 * 2583 * For a slab cache when the slab descriptor is off-slab, the 2584 * slab descriptor can't come from the same cache which is being created, 2585 * Because if it is the case, that means we defer the creation of 2586 * the kmalloc_{dma,}_cache of size sizeof(slab descriptor) to this point. 2587 * And we eventually call down to __kmem_cache_create(), which 2588 * in turn looks up in the kmalloc_{dma,}_caches for the disired-size one. 2589 * This is a "chicken-and-egg" problem. 2590 * 2591 * So the off-slab slab descriptor shall come from the kmalloc_{dma,}_caches, 2592 * which are all initialized during kmem_cache_init(). 2593 */ 2594 static void *alloc_slabmgmt(struct kmem_cache *cachep, 2595 struct page *page, int colour_off, 2596 gfp_t local_flags, int nodeid) 2597 { 2598 void *freelist; 2599 void *addr = page_address(page); 2600 2601 if (OFF_SLAB(cachep)) { 2602 /* Slab management obj is off-slab. */ 2603 freelist = kmem_cache_alloc_node(cachep->freelist_cache, 2604 local_flags, nodeid); 2605 if (!freelist) 2606 return NULL; 2607 } else { 2608 freelist = addr + colour_off; 2609 colour_off += cachep->freelist_size; 2610 } 2611 page->active = 0; 2612 page->s_mem = addr + colour_off; 2613 return freelist; 2614 } 2615 2616 static inline freelist_idx_t get_free_obj(struct page *page, unsigned int idx) 2617 { 2618 return ((freelist_idx_t *)page->freelist)[idx]; 2619 } 2620 2621 static inline void set_free_obj(struct page *page, 2622 unsigned int idx, freelist_idx_t val) 2623 { 2624 ((freelist_idx_t *)(page->freelist))[idx] = val; 2625 } 2626 2627 static void cache_init_objs(struct kmem_cache *cachep, 2628 struct page *page) 2629 { 2630 int i; 2631 2632 for (i = 0; i < cachep->num; i++) { 2633 void *objp = index_to_obj(cachep, page, i); 2634 #if DEBUG 2635 /* need to poison the objs? */ 2636 if (cachep->flags & SLAB_POISON) 2637 poison_obj(cachep, objp, POISON_FREE); 2638 if (cachep->flags & SLAB_STORE_USER) 2639 *dbg_userword(cachep, objp) = NULL; 2640 2641 if (cachep->flags & SLAB_RED_ZONE) { 2642 *dbg_redzone1(cachep, objp) = RED_INACTIVE; 2643 *dbg_redzone2(cachep, objp) = RED_INACTIVE; 2644 } 2645 /* 2646 * Constructors are not allowed to allocate memory from the same 2647 * cache which they are a constructor for. Otherwise, deadlock. 2648 * They must also be threaded. 
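 *
 * Example (illustrative sketch, not part of the original file; the names
 * are hypothetical): a well-behaved constructor only initialises the
 * object it is given and does not allocate from the cache it constructs
 * for:
 *
 *	static void my_obj_ctor(void *obj)
 *	{
 *		struct my_obj *p = obj;
 *
 *		spin_lock_init(&p->lock);
 *		INIT_LIST_HEAD(&p->list);
 *	}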
2649 */ 2650 if (cachep->ctor && !(cachep->flags & SLAB_POISON)) 2651 cachep->ctor(objp + obj_offset(cachep)); 2652 2653 if (cachep->flags & SLAB_RED_ZONE) { 2654 if (*dbg_redzone2(cachep, objp) != RED_INACTIVE) 2655 slab_error(cachep, "constructor overwrote the" 2656 " end of an object"); 2657 if (*dbg_redzone1(cachep, objp) != RED_INACTIVE) 2658 slab_error(cachep, "constructor overwrote the" 2659 " start of an object"); 2660 } 2661 if ((cachep->size % PAGE_SIZE) == 0 && 2662 OFF_SLAB(cachep) && cachep->flags & SLAB_POISON) 2663 kernel_map_pages(virt_to_page(objp), 2664 cachep->size / PAGE_SIZE, 0); 2665 #else 2666 if (cachep->ctor) 2667 cachep->ctor(objp); 2668 #endif 2669 set_obj_status(page, i, OBJECT_FREE); 2670 set_free_obj(page, i, i); 2671 } 2672 } 2673 2674 static void kmem_flagcheck(struct kmem_cache *cachep, gfp_t flags) 2675 { 2676 if (CONFIG_ZONE_DMA_FLAG) { 2677 if (flags & GFP_DMA) 2678 BUG_ON(!(cachep->allocflags & GFP_DMA)); 2679 else 2680 BUG_ON(cachep->allocflags & GFP_DMA); 2681 } 2682 } 2683 2684 static void *slab_get_obj(struct kmem_cache *cachep, struct page *page, 2685 int nodeid) 2686 { 2687 void *objp; 2688 2689 objp = index_to_obj(cachep, page, get_free_obj(page, page->active)); 2690 page->active++; 2691 #if DEBUG 2692 WARN_ON(page_to_nid(virt_to_page(objp)) != nodeid); 2693 #endif 2694 2695 return objp; 2696 } 2697 2698 static void slab_put_obj(struct kmem_cache *cachep, struct page *page, 2699 void *objp, int nodeid) 2700 { 2701 unsigned int objnr = obj_to_index(cachep, page, objp); 2702 #if DEBUG 2703 unsigned int i; 2704 2705 /* Verify that the slab belongs to the intended node */ 2706 WARN_ON(page_to_nid(virt_to_page(objp)) != nodeid); 2707 2708 /* Verify double free bug */ 2709 for (i = page->active; i < cachep->num; i++) { 2710 if (get_free_obj(page, i) == objnr) { 2711 printk(KERN_ERR "slab: double free detected in cache " 2712 "'%s', objp %p\n", cachep->name, objp); 2713 BUG(); 2714 } 2715 } 2716 #endif 2717 page->active--; 2718 set_free_obj(page, page->active, objnr); 2719 } 2720 2721 /* 2722 * Map pages beginning at addr to the given cache and slab. This is required 2723 * for the slab allocator to be able to lookup the cache and slab of a 2724 * virtual address for kfree, ksize, and slab debugging. 2725 */ 2726 static void slab_map_pages(struct kmem_cache *cache, struct page *page, 2727 void *freelist) 2728 { 2729 page->slab_cache = cache; 2730 page->freelist = freelist; 2731 } 2732 2733 /* 2734 * Grow (by 1) the number of slabs within a cache. This is called by 2735 * kmem_cache_alloc() when there are no active objs left in a cache. 2736 */ 2737 static int cache_grow(struct kmem_cache *cachep, 2738 gfp_t flags, int nodeid, struct page *page) 2739 { 2740 void *freelist; 2741 size_t offset; 2742 gfp_t local_flags; 2743 struct kmem_cache_node *n; 2744 2745 /* 2746 * Be lazy and only check for valid flags here, keeping it out of the 2747 * critical path in kmem_cache_alloc(). 2748 */ 2749 BUG_ON(flags & GFP_SLAB_BUG_MASK); 2750 local_flags = flags & (GFP_CONSTRAINT_MASK|GFP_RECLAIM_MASK); 2751 2752 /* Take the node list lock to change the colour_next on this node */ 2753 check_irq_off(); 2754 n = cachep->node[nodeid]; 2755 spin_lock(&n->list_lock); 2756 2757 /* Get colour for the slab, and cal the next value. 
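 *
 * Illustrative example (not part of the original file): with
 * colour_off == 64 (one cache line) and cachep->colour == 4, successive
 * slabs place their first object at offsets 0, 64, 128 and 192 before
 * wrapping back to 0, so objects at the same index in different slabs
 * do not all compete for the same cache lines.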
*/ 2758 offset = n->colour_next; 2759 n->colour_next++; 2760 if (n->colour_next >= cachep->colour) 2761 n->colour_next = 0; 2762 spin_unlock(&n->list_lock); 2763 2764 offset *= cachep->colour_off; 2765 2766 if (local_flags & __GFP_WAIT) 2767 local_irq_enable(); 2768 2769 /* 2770 * The test for missing atomic flag is performed here, rather than 2771 * the more obvious place, simply to reduce the critical path length 2772 * in kmem_cache_alloc(). If a caller is seriously mis-behaving they 2773 * will eventually be caught here (where it matters). 2774 */ 2775 kmem_flagcheck(cachep, flags); 2776 2777 /* 2778 * Get mem for the objs. Attempt to allocate a physical page from 2779 * 'nodeid'. 2780 */ 2781 if (!page) 2782 page = kmem_getpages(cachep, local_flags, nodeid); 2783 if (!page) 2784 goto failed; 2785 2786 /* Get slab management. */ 2787 freelist = alloc_slabmgmt(cachep, page, offset, 2788 local_flags & ~GFP_CONSTRAINT_MASK, nodeid); 2789 if (!freelist) 2790 goto opps1; 2791 2792 slab_map_pages(cachep, page, freelist); 2793 2794 cache_init_objs(cachep, page); 2795 2796 if (local_flags & __GFP_WAIT) 2797 local_irq_disable(); 2798 check_irq_off(); 2799 spin_lock(&n->list_lock); 2800 2801 /* Make slab active. */ 2802 list_add_tail(&page->lru, &(n->slabs_free)); 2803 STATS_INC_GROWN(cachep); 2804 n->free_objects += cachep->num; 2805 spin_unlock(&n->list_lock); 2806 return 1; 2807 opps1: 2808 kmem_freepages(cachep, page); 2809 failed: 2810 if (local_flags & __GFP_WAIT) 2811 local_irq_disable(); 2812 return 0; 2813 } 2814 2815 #if DEBUG 2816 2817 /* 2818 * Perform extra freeing checks: 2819 * - detect bad pointers. 2820 * - POISON/RED_ZONE checking 2821 */ 2822 static void kfree_debugcheck(const void *objp) 2823 { 2824 if (!virt_addr_valid(objp)) { 2825 printk(KERN_ERR "kfree_debugcheck: out of range ptr %lxh.\n", 2826 (unsigned long)objp); 2827 BUG(); 2828 } 2829 } 2830 2831 static inline void verify_redzone_free(struct kmem_cache *cache, void *obj) 2832 { 2833 unsigned long long redzone1, redzone2; 2834 2835 redzone1 = *dbg_redzone1(cache, obj); 2836 redzone2 = *dbg_redzone2(cache, obj); 2837 2838 /* 2839 * Redzone is ok. 
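 *
 * Illustrative note (not part of the original file): both redzone words
 * are set to RED_ACTIVE when the object is handed out and to
 * RED_INACTIVE when it is freed, so ACTIVE/ACTIVE is the only healthy
 * state here; INACTIVE/INACTIVE means the object was already free
 * (double free), anything else means memory around the object was
 * overwritten.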
2840 */ 2841 if (redzone1 == RED_ACTIVE && redzone2 == RED_ACTIVE) 2842 return; 2843 2844 if (redzone1 == RED_INACTIVE && redzone2 == RED_INACTIVE) 2845 slab_error(cache, "double free detected"); 2846 else 2847 slab_error(cache, "memory outside object was overwritten"); 2848 2849 printk(KERN_ERR "%p: redzone 1:0x%llx, redzone 2:0x%llx.\n", 2850 obj, redzone1, redzone2); 2851 } 2852 2853 static void *cache_free_debugcheck(struct kmem_cache *cachep, void *objp, 2854 unsigned long caller) 2855 { 2856 unsigned int objnr; 2857 struct page *page; 2858 2859 BUG_ON(virt_to_cache(objp) != cachep); 2860 2861 objp -= obj_offset(cachep); 2862 kfree_debugcheck(objp); 2863 page = virt_to_head_page(objp); 2864 2865 if (cachep->flags & SLAB_RED_ZONE) { 2866 verify_redzone_free(cachep, objp); 2867 *dbg_redzone1(cachep, objp) = RED_INACTIVE; 2868 *dbg_redzone2(cachep, objp) = RED_INACTIVE; 2869 } 2870 if (cachep->flags & SLAB_STORE_USER) 2871 *dbg_userword(cachep, objp) = (void *)caller; 2872 2873 objnr = obj_to_index(cachep, page, objp); 2874 2875 BUG_ON(objnr >= cachep->num); 2876 BUG_ON(objp != index_to_obj(cachep, page, objnr)); 2877 2878 set_obj_status(page, objnr, OBJECT_FREE); 2879 if (cachep->flags & SLAB_POISON) { 2880 #ifdef CONFIG_DEBUG_PAGEALLOC 2881 if ((cachep->size % PAGE_SIZE)==0 && OFF_SLAB(cachep)) { 2882 store_stackinfo(cachep, objp, caller); 2883 kernel_map_pages(virt_to_page(objp), 2884 cachep->size / PAGE_SIZE, 0); 2885 } else { 2886 poison_obj(cachep, objp, POISON_FREE); 2887 } 2888 #else 2889 poison_obj(cachep, objp, POISON_FREE); 2890 #endif 2891 } 2892 return objp; 2893 } 2894 2895 #else 2896 #define kfree_debugcheck(x) do { } while(0) 2897 #define cache_free_debugcheck(x,objp,z) (objp) 2898 #endif 2899 2900 static void *cache_alloc_refill(struct kmem_cache *cachep, gfp_t flags, 2901 bool force_refill) 2902 { 2903 int batchcount; 2904 struct kmem_cache_node *n; 2905 struct array_cache *ac; 2906 int node; 2907 2908 check_irq_off(); 2909 node = numa_mem_id(); 2910 if (unlikely(force_refill)) 2911 goto force_grow; 2912 retry: 2913 ac = cpu_cache_get(cachep); 2914 batchcount = ac->batchcount; 2915 if (!ac->touched && batchcount > BATCHREFILL_LIMIT) { 2916 /* 2917 * If there was little recent activity on this cache, then 2918 * perform only a partial refill. Otherwise we could generate 2919 * refill bouncing. 2920 */ 2921 batchcount = BATCHREFILL_LIMIT; 2922 } 2923 n = cachep->node[node]; 2924 2925 BUG_ON(ac->avail > 0 || !n); 2926 spin_lock(&n->list_lock); 2927 2928 /* See if we can refill from the shared array */ 2929 if (n->shared && transfer_objects(ac, n->shared, batchcount)) { 2930 n->shared->touched = 1; 2931 goto alloc_done; 2932 } 2933 2934 while (batchcount > 0) { 2935 struct list_head *entry; 2936 struct page *page; 2937 /* Get slab alloc is to come from. */ 2938 entry = n->slabs_partial.next; 2939 if (entry == &n->slabs_partial) { 2940 n->free_touched = 1; 2941 entry = n->slabs_free.next; 2942 if (entry == &n->slabs_free) 2943 goto must_grow; 2944 } 2945 2946 page = list_entry(entry, struct page, lru); 2947 check_spinlock_acquired(cachep); 2948 2949 /* 2950 * The slab was either on partial or free list so 2951 * there must be at least one object available for 2952 * allocation. 
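 *
 * Illustrative note (not part of the original file): page->active counts
 * the objects currently handed out from this slab, so a slab taken from
 * the partial or free list always satisfies
 * page->active < cachep->num, which the BUG_ON below asserts.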
2953 */ 2954 BUG_ON(page->active >= cachep->num); 2955 2956 while (page->active < cachep->num && batchcount--) { 2957 STATS_INC_ALLOCED(cachep); 2958 STATS_INC_ACTIVE(cachep); 2959 STATS_SET_HIGH(cachep); 2960 2961 ac_put_obj(cachep, ac, slab_get_obj(cachep, page, 2962 node)); 2963 } 2964 2965 /* move slabp to correct slabp list: */ 2966 list_del(&page->lru); 2967 if (page->active == cachep->num) 2968 list_add(&page->lru, &n->slabs_full); 2969 else 2970 list_add(&page->lru, &n->slabs_partial); 2971 } 2972 2973 must_grow: 2974 n->free_objects -= ac->avail; 2975 alloc_done: 2976 spin_unlock(&n->list_lock); 2977 2978 if (unlikely(!ac->avail)) { 2979 int x; 2980 force_grow: 2981 x = cache_grow(cachep, flags | GFP_THISNODE, node, NULL); 2982 2983 /* cache_grow can reenable interrupts, then ac could change. */ 2984 ac = cpu_cache_get(cachep); 2985 node = numa_mem_id(); 2986 2987 /* no objects in sight? abort */ 2988 if (!x && (ac->avail == 0 || force_refill)) 2989 return NULL; 2990 2991 if (!ac->avail) /* objects refilled by interrupt? */ 2992 goto retry; 2993 } 2994 ac->touched = 1; 2995 2996 return ac_get_obj(cachep, ac, flags, force_refill); 2997 } 2998 2999 static inline void cache_alloc_debugcheck_before(struct kmem_cache *cachep, 3000 gfp_t flags) 3001 { 3002 might_sleep_if(flags & __GFP_WAIT); 3003 #if DEBUG 3004 kmem_flagcheck(cachep, flags); 3005 #endif 3006 } 3007 3008 #if DEBUG 3009 static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep, 3010 gfp_t flags, void *objp, unsigned long caller) 3011 { 3012 struct page *page; 3013 3014 if (!objp) 3015 return objp; 3016 if (cachep->flags & SLAB_POISON) { 3017 #ifdef CONFIG_DEBUG_PAGEALLOC 3018 if ((cachep->size % PAGE_SIZE) == 0 && OFF_SLAB(cachep)) 3019 kernel_map_pages(virt_to_page(objp), 3020 cachep->size / PAGE_SIZE, 1); 3021 else 3022 check_poison_obj(cachep, objp); 3023 #else 3024 check_poison_obj(cachep, objp); 3025 #endif 3026 poison_obj(cachep, objp, POISON_INUSE); 3027 } 3028 if (cachep->flags & SLAB_STORE_USER) 3029 *dbg_userword(cachep, objp) = (void *)caller; 3030 3031 if (cachep->flags & SLAB_RED_ZONE) { 3032 if (*dbg_redzone1(cachep, objp) != RED_INACTIVE || 3033 *dbg_redzone2(cachep, objp) != RED_INACTIVE) { 3034 slab_error(cachep, "double free, or memory outside" 3035 " object was overwritten"); 3036 printk(KERN_ERR 3037 "%p: redzone 1:0x%llx, redzone 2:0x%llx\n", 3038 objp, *dbg_redzone1(cachep, objp), 3039 *dbg_redzone2(cachep, objp)); 3040 } 3041 *dbg_redzone1(cachep, objp) = RED_ACTIVE; 3042 *dbg_redzone2(cachep, objp) = RED_ACTIVE; 3043 } 3044 3045 page = virt_to_head_page(objp); 3046 set_obj_status(page, obj_to_index(cachep, page, objp), OBJECT_ACTIVE); 3047 objp += obj_offset(cachep); 3048 if (cachep->ctor && cachep->flags & SLAB_POISON) 3049 cachep->ctor(objp); 3050 if (ARCH_SLAB_MINALIGN && 3051 ((unsigned long)objp & (ARCH_SLAB_MINALIGN-1))) { 3052 printk(KERN_ERR "0x%p: not aligned to ARCH_SLAB_MINALIGN=%d\n", 3053 objp, (int)ARCH_SLAB_MINALIGN); 3054 } 3055 return objp; 3056 } 3057 #else 3058 #define cache_alloc_debugcheck_after(a,b,objp,d) (objp) 3059 #endif 3060 3061 static bool slab_should_failslab(struct kmem_cache *cachep, gfp_t flags) 3062 { 3063 if (cachep == kmem_cache) 3064 return false; 3065 3066 return should_failslab(cachep->object_size, flags, cachep->flags); 3067 } 3068 3069 static inline void *____cache_alloc(struct kmem_cache *cachep, gfp_t flags) 3070 { 3071 void *objp; 3072 struct array_cache *ac; 3073 bool force_refill = false; 3074 3075 check_irq_off(); 3076 3077 ac = 
cpu_cache_get(cachep); 3078 if (likely(ac->avail)) { 3079 ac->touched = 1; 3080 objp = ac_get_obj(cachep, ac, flags, false); 3081 3082 /* 3083 * Allow for the possibility all avail objects are not allowed 3084 * by the current flags 3085 */ 3086 if (objp) { 3087 STATS_INC_ALLOCHIT(cachep); 3088 goto out; 3089 } 3090 force_refill = true; 3091 } 3092 3093 STATS_INC_ALLOCMISS(cachep); 3094 objp = cache_alloc_refill(cachep, flags, force_refill); 3095 /* 3096 * the 'ac' may be updated by cache_alloc_refill(), 3097 * and kmemleak_erase() requires its correct value. 3098 */ 3099 ac = cpu_cache_get(cachep); 3100 3101 out: 3102 /* 3103 * To avoid a false negative, if an object that is in one of the 3104 * per-CPU caches is leaked, we need to make sure kmemleak doesn't 3105 * treat the array pointers as a reference to the object. 3106 */ 3107 if (objp) 3108 kmemleak_erase(&ac->entry[ac->avail]); 3109 return objp; 3110 } 3111 3112 #ifdef CONFIG_NUMA 3113 /* 3114 * Try allocating on another node if PF_SPREAD_SLAB is a mempolicy is set. 3115 * 3116 * If we are in_interrupt, then process context, including cpusets and 3117 * mempolicy, may not apply and should not be used for allocation policy. 3118 */ 3119 static void *alternate_node_alloc(struct kmem_cache *cachep, gfp_t flags) 3120 { 3121 int nid_alloc, nid_here; 3122 3123 if (in_interrupt() || (flags & __GFP_THISNODE)) 3124 return NULL; 3125 nid_alloc = nid_here = numa_mem_id(); 3126 if (cpuset_do_slab_mem_spread() && (cachep->flags & SLAB_MEM_SPREAD)) 3127 nid_alloc = cpuset_slab_spread_node(); 3128 else if (current->mempolicy) 3129 nid_alloc = mempolicy_slab_node(); 3130 if (nid_alloc != nid_here) 3131 return ____cache_alloc_node(cachep, flags, nid_alloc); 3132 return NULL; 3133 } 3134 3135 /* 3136 * Fallback function if there was no memory available and no objects on a 3137 * certain node and fall back is permitted. First we scan all the 3138 * available node for available objects. If that fails then we 3139 * perform an allocation without specifying a node. This allows the page 3140 * allocator to do its reclaim / fallback magic. We then insert the 3141 * slab into the proper nodelist and then allocate from it. 3142 */ 3143 static void *fallback_alloc(struct kmem_cache *cache, gfp_t flags) 3144 { 3145 struct zonelist *zonelist; 3146 gfp_t local_flags; 3147 struct zoneref *z; 3148 struct zone *zone; 3149 enum zone_type high_zoneidx = gfp_zone(flags); 3150 void *obj = NULL; 3151 int nid; 3152 unsigned int cpuset_mems_cookie; 3153 3154 if (flags & __GFP_THISNODE) 3155 return NULL; 3156 3157 local_flags = flags & (GFP_CONSTRAINT_MASK|GFP_RECLAIM_MASK); 3158 3159 retry_cpuset: 3160 cpuset_mems_cookie = read_mems_allowed_begin(); 3161 zonelist = node_zonelist(mempolicy_slab_node(), flags); 3162 3163 retry: 3164 /* 3165 * Look through allowed nodes for objects available 3166 * from existing per node queues. 3167 */ 3168 for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) { 3169 nid = zone_to_nid(zone); 3170 3171 if (cpuset_zone_allowed_hardwall(zone, flags) && 3172 cache->node[nid] && 3173 cache->node[nid]->free_objects) { 3174 obj = ____cache_alloc_node(cache, 3175 flags | GFP_THISNODE, nid); 3176 if (obj) 3177 break; 3178 } 3179 } 3180 3181 if (!obj) { 3182 /* 3183 * This allocation will be performed within the constraints 3184 * of the current cpuset / memory policy requirements. 3185 * We may trigger various forms of reclaim on the allowed 3186 * set and go into memory reserves if necessary. 
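 *
 * Illustrative note (not part of the original file): at this point the
 * per-node queues of every allowed node have already been tried, so the
 * remaining strategy is to ask the page allocator for a fresh page, grow
 * the cache on whichever node that page landed on, and retry the
 * node-local allocation there.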
3187 */ 3188 struct page *page; 3189 3190 if (local_flags & __GFP_WAIT) 3191 local_irq_enable(); 3192 kmem_flagcheck(cache, flags); 3193 page = kmem_getpages(cache, local_flags, numa_mem_id()); 3194 if (local_flags & __GFP_WAIT) 3195 local_irq_disable(); 3196 if (page) { 3197 /* 3198 * Insert into the appropriate per node queues 3199 */ 3200 nid = page_to_nid(page); 3201 if (cache_grow(cache, flags, nid, page)) { 3202 obj = ____cache_alloc_node(cache, 3203 flags | GFP_THISNODE, nid); 3204 if (!obj) 3205 /* 3206 * Another processor may allocate the 3207 * objects in the slab since we are 3208 * not holding any locks. 3209 */ 3210 goto retry; 3211 } else { 3212 /* cache_grow already freed obj */ 3213 obj = NULL; 3214 } 3215 } 3216 } 3217 3218 if (unlikely(!obj && read_mems_allowed_retry(cpuset_mems_cookie))) 3219 goto retry_cpuset; 3220 return obj; 3221 } 3222 3223 /* 3224 * A interface to enable slab creation on nodeid 3225 */ 3226 static void *____cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, 3227 int nodeid) 3228 { 3229 struct list_head *entry; 3230 struct page *page; 3231 struct kmem_cache_node *n; 3232 void *obj; 3233 int x; 3234 3235 VM_BUG_ON(nodeid > num_online_nodes()); 3236 n = cachep->node[nodeid]; 3237 BUG_ON(!n); 3238 3239 retry: 3240 check_irq_off(); 3241 spin_lock(&n->list_lock); 3242 entry = n->slabs_partial.next; 3243 if (entry == &n->slabs_partial) { 3244 n->free_touched = 1; 3245 entry = n->slabs_free.next; 3246 if (entry == &n->slabs_free) 3247 goto must_grow; 3248 } 3249 3250 page = list_entry(entry, struct page, lru); 3251 check_spinlock_acquired_node(cachep, nodeid); 3252 3253 STATS_INC_NODEALLOCS(cachep); 3254 STATS_INC_ACTIVE(cachep); 3255 STATS_SET_HIGH(cachep); 3256 3257 BUG_ON(page->active == cachep->num); 3258 3259 obj = slab_get_obj(cachep, page, nodeid); 3260 n->free_objects--; 3261 /* move slabp to correct slabp list: */ 3262 list_del(&page->lru); 3263 3264 if (page->active == cachep->num) 3265 list_add(&page->lru, &n->slabs_full); 3266 else 3267 list_add(&page->lru, &n->slabs_partial); 3268 3269 spin_unlock(&n->list_lock); 3270 goto done; 3271 3272 must_grow: 3273 spin_unlock(&n->list_lock); 3274 x = cache_grow(cachep, flags | GFP_THISNODE, nodeid, NULL); 3275 if (x) 3276 goto retry; 3277 3278 return fallback_alloc(cachep, flags); 3279 3280 done: 3281 return obj; 3282 } 3283 3284 static __always_inline void * 3285 slab_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid, 3286 unsigned long caller) 3287 { 3288 unsigned long save_flags; 3289 void *ptr; 3290 int slab_node = numa_mem_id(); 3291 3292 flags &= gfp_allowed_mask; 3293 3294 lockdep_trace_alloc(flags); 3295 3296 if (slab_should_failslab(cachep, flags)) 3297 return NULL; 3298 3299 cachep = memcg_kmem_get_cache(cachep, flags); 3300 3301 cache_alloc_debugcheck_before(cachep, flags); 3302 local_irq_save(save_flags); 3303 3304 if (nodeid == NUMA_NO_NODE) 3305 nodeid = slab_node; 3306 3307 if (unlikely(!cachep->node[nodeid])) { 3308 /* Node not bootstrapped yet */ 3309 ptr = fallback_alloc(cachep, flags); 3310 goto out; 3311 } 3312 3313 if (nodeid == slab_node) { 3314 /* 3315 * Use the locally cached objects if possible. 3316 * However ____cache_alloc does not allow fallback 3317 * to other nodes. It may fail while we still have 3318 * objects on other nodes available. 
3319 */ 3320 ptr = ____cache_alloc(cachep, flags); 3321 if (ptr) 3322 goto out; 3323 } 3324 /* ___cache_alloc_node can fall back to other nodes */ 3325 ptr = ____cache_alloc_node(cachep, flags, nodeid); 3326 out: 3327 local_irq_restore(save_flags); 3328 ptr = cache_alloc_debugcheck_after(cachep, flags, ptr, caller); 3329 kmemleak_alloc_recursive(ptr, cachep->object_size, 1, cachep->flags, 3330 flags); 3331 3332 if (likely(ptr)) { 3333 kmemcheck_slab_alloc(cachep, flags, ptr, cachep->object_size); 3334 if (unlikely(flags & __GFP_ZERO)) 3335 memset(ptr, 0, cachep->object_size); 3336 } 3337 3338 return ptr; 3339 } 3340 3341 static __always_inline void * 3342 __do_cache_alloc(struct kmem_cache *cache, gfp_t flags) 3343 { 3344 void *objp; 3345 3346 if (current->mempolicy || unlikely(current->flags & PF_SPREAD_SLAB)) { 3347 objp = alternate_node_alloc(cache, flags); 3348 if (objp) 3349 goto out; 3350 } 3351 objp = ____cache_alloc(cache, flags); 3352 3353 /* 3354 * We may just have run out of memory on the local node. 3355 * ____cache_alloc_node() knows how to locate memory on other nodes 3356 */ 3357 if (!objp) 3358 objp = ____cache_alloc_node(cache, flags, numa_mem_id()); 3359 3360 out: 3361 return objp; 3362 } 3363 #else 3364 3365 static __always_inline void * 3366 __do_cache_alloc(struct kmem_cache *cachep, gfp_t flags) 3367 { 3368 return ____cache_alloc(cachep, flags); 3369 } 3370 3371 #endif /* CONFIG_NUMA */ 3372 3373 static __always_inline void * 3374 slab_alloc(struct kmem_cache *cachep, gfp_t flags, unsigned long caller) 3375 { 3376 unsigned long save_flags; 3377 void *objp; 3378 3379 flags &= gfp_allowed_mask; 3380 3381 lockdep_trace_alloc(flags); 3382 3383 if (slab_should_failslab(cachep, flags)) 3384 return NULL; 3385 3386 cachep = memcg_kmem_get_cache(cachep, flags); 3387 3388 cache_alloc_debugcheck_before(cachep, flags); 3389 local_irq_save(save_flags); 3390 objp = __do_cache_alloc(cachep, flags); 3391 local_irq_restore(save_flags); 3392 objp = cache_alloc_debugcheck_after(cachep, flags, objp, caller); 3393 kmemleak_alloc_recursive(objp, cachep->object_size, 1, cachep->flags, 3394 flags); 3395 prefetchw(objp); 3396 3397 if (likely(objp)) { 3398 kmemcheck_slab_alloc(cachep, flags, objp, cachep->object_size); 3399 if (unlikely(flags & __GFP_ZERO)) 3400 memset(objp, 0, cachep->object_size); 3401 } 3402 3403 return objp; 3404 } 3405 3406 /* 3407 * Caller needs to acquire correct kmem_cache_node's list_lock 3408 */ 3409 static void free_block(struct kmem_cache *cachep, void **objpp, int nr_objects, 3410 int node) 3411 { 3412 int i; 3413 struct kmem_cache_node *n; 3414 3415 for (i = 0; i < nr_objects; i++) { 3416 void *objp; 3417 struct page *page; 3418 3419 clear_obj_pfmemalloc(&objpp[i]); 3420 objp = objpp[i]; 3421 3422 page = virt_to_head_page(objp); 3423 n = cachep->node[node]; 3424 list_del(&page->lru); 3425 check_spinlock_acquired_node(cachep, node); 3426 slab_put_obj(cachep, page, objp, node); 3427 STATS_DEC_ACTIVE(cachep); 3428 n->free_objects++; 3429 3430 /* fixup slab chains */ 3431 if (page->active == 0) { 3432 if (n->free_objects > n->free_limit) { 3433 n->free_objects -= cachep->num; 3434 /* No need to drop any previously held 3435 * lock here, even if we have a off-slab slab 3436 * descriptor it is guaranteed to come from 3437 * a different cache, refer to comments before 3438 * alloc_slabmgmt. 
3439 */ 3440 slab_destroy(cachep, page); 3441 } else { 3442 list_add(&page->lru, &n->slabs_free); 3443 } 3444 } else { 3445 /* Unconditionally move a slab to the end of the 3446 * partial list on free - maximum time for the 3447 * other objects to be freed, too. 3448 */ 3449 list_add_tail(&page->lru, &n->slabs_partial); 3450 } 3451 } 3452 } 3453 3454 static void cache_flusharray(struct kmem_cache *cachep, struct array_cache *ac) 3455 { 3456 int batchcount; 3457 struct kmem_cache_node *n; 3458 int node = numa_mem_id(); 3459 3460 batchcount = ac->batchcount; 3461 #if DEBUG 3462 BUG_ON(!batchcount || batchcount > ac->avail); 3463 #endif 3464 check_irq_off(); 3465 n = cachep->node[node]; 3466 spin_lock(&n->list_lock); 3467 if (n->shared) { 3468 struct array_cache *shared_array = n->shared; 3469 int max = shared_array->limit - shared_array->avail; 3470 if (max) { 3471 if (batchcount > max) 3472 batchcount = max; 3473 memcpy(&(shared_array->entry[shared_array->avail]), 3474 ac->entry, sizeof(void *) * batchcount); 3475 shared_array->avail += batchcount; 3476 goto free_done; 3477 } 3478 } 3479 3480 free_block(cachep, ac->entry, batchcount, node); 3481 free_done: 3482 #if STATS 3483 { 3484 int i = 0; 3485 struct list_head *p; 3486 3487 p = n->slabs_free.next; 3488 while (p != &(n->slabs_free)) { 3489 struct page *page; 3490 3491 page = list_entry(p, struct page, lru); 3492 BUG_ON(page->active); 3493 3494 i++; 3495 p = p->next; 3496 } 3497 STATS_SET_FREEABLE(cachep, i); 3498 } 3499 #endif 3500 spin_unlock(&n->list_lock); 3501 ac->avail -= batchcount; 3502 memmove(ac->entry, &(ac->entry[batchcount]), sizeof(void *)*ac->avail); 3503 } 3504 3505 /* 3506 * Release an obj back to its cache. If the obj has a constructed state, it must 3507 * be in this state _before_ it is released. Called with disabled ints. 3508 */ 3509 static inline void __cache_free(struct kmem_cache *cachep, void *objp, 3510 unsigned long caller) 3511 { 3512 struct array_cache *ac = cpu_cache_get(cachep); 3513 3514 check_irq_off(); 3515 kmemleak_free_recursive(objp, cachep->flags); 3516 objp = cache_free_debugcheck(cachep, objp, caller); 3517 3518 kmemcheck_slab_free(cachep, objp, cachep->object_size); 3519 3520 /* 3521 * Skip calling cache_free_alien() when the platform is not numa. 3522 * This will avoid cache misses that happen while accessing slabp (which 3523 * is per page memory reference) to get nodeid. Instead use a global 3524 * variable to skip the call, which is mostly likely to be present in 3525 * the cache. 3526 */ 3527 if (nr_online_nodes > 1 && cache_free_alien(cachep, objp)) 3528 return; 3529 3530 if (likely(ac->avail < ac->limit)) { 3531 STATS_INC_FREEHIT(cachep); 3532 } else { 3533 STATS_INC_FREEMISS(cachep); 3534 cache_flusharray(cachep, ac); 3535 } 3536 3537 ac_put_obj(cachep, ac, objp); 3538 } 3539 3540 /** 3541 * kmem_cache_alloc - Allocate an object 3542 * @cachep: The cache to allocate from. 3543 * @flags: See kmalloc(). 3544 * 3545 * Allocate an object from this cache. The flags are only relevant 3546 * if the cache has no available objects. 
3547 */ 3548 void *kmem_cache_alloc(struct kmem_cache *cachep, gfp_t flags) 3549 { 3550 void *ret = slab_alloc(cachep, flags, _RET_IP_); 3551 3552 trace_kmem_cache_alloc(_RET_IP_, ret, 3553 cachep->object_size, cachep->size, flags); 3554 3555 return ret; 3556 } 3557 EXPORT_SYMBOL(kmem_cache_alloc); 3558 3559 #ifdef CONFIG_TRACING 3560 void * 3561 kmem_cache_alloc_trace(struct kmem_cache *cachep, gfp_t flags, size_t size) 3562 { 3563 void *ret; 3564 3565 ret = slab_alloc(cachep, flags, _RET_IP_); 3566 3567 trace_kmalloc(_RET_IP_, ret, 3568 size, cachep->size, flags); 3569 return ret; 3570 } 3571 EXPORT_SYMBOL(kmem_cache_alloc_trace); 3572 #endif 3573 3574 #ifdef CONFIG_NUMA 3575 /** 3576 * kmem_cache_alloc_node - Allocate an object on the specified node 3577 * @cachep: The cache to allocate from. 3578 * @flags: See kmalloc(). 3579 * @nodeid: node number of the target node. 3580 * 3581 * Identical to kmem_cache_alloc but it will allocate memory on the given 3582 * node, which can improve the performance for cpu bound structures. 3583 * 3584 * Fallback to other node is possible if __GFP_THISNODE is not set. 3585 */ 3586 void *kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid) 3587 { 3588 void *ret = slab_alloc_node(cachep, flags, nodeid, _RET_IP_); 3589 3590 trace_kmem_cache_alloc_node(_RET_IP_, ret, 3591 cachep->object_size, cachep->size, 3592 flags, nodeid); 3593 3594 return ret; 3595 } 3596 EXPORT_SYMBOL(kmem_cache_alloc_node); 3597 3598 #ifdef CONFIG_TRACING 3599 void *kmem_cache_alloc_node_trace(struct kmem_cache *cachep, 3600 gfp_t flags, 3601 int nodeid, 3602 size_t size) 3603 { 3604 void *ret; 3605 3606 ret = slab_alloc_node(cachep, flags, nodeid, _RET_IP_); 3607 3608 trace_kmalloc_node(_RET_IP_, ret, 3609 size, cachep->size, 3610 flags, nodeid); 3611 return ret; 3612 } 3613 EXPORT_SYMBOL(kmem_cache_alloc_node_trace); 3614 #endif 3615 3616 static __always_inline void * 3617 __do_kmalloc_node(size_t size, gfp_t flags, int node, unsigned long caller) 3618 { 3619 struct kmem_cache *cachep; 3620 3621 cachep = kmalloc_slab(size, flags); 3622 if (unlikely(ZERO_OR_NULL_PTR(cachep))) 3623 return cachep; 3624 return kmem_cache_alloc_node_trace(cachep, flags, node, size); 3625 } 3626 3627 #if defined(CONFIG_DEBUG_SLAB) || defined(CONFIG_TRACING) 3628 void *__kmalloc_node(size_t size, gfp_t flags, int node) 3629 { 3630 return __do_kmalloc_node(size, flags, node, _RET_IP_); 3631 } 3632 EXPORT_SYMBOL(__kmalloc_node); 3633 3634 void *__kmalloc_node_track_caller(size_t size, gfp_t flags, 3635 int node, unsigned long caller) 3636 { 3637 return __do_kmalloc_node(size, flags, node, caller); 3638 } 3639 EXPORT_SYMBOL(__kmalloc_node_track_caller); 3640 #else 3641 void *__kmalloc_node(size_t size, gfp_t flags, int node) 3642 { 3643 return __do_kmalloc_node(size, flags, node, 0); 3644 } 3645 EXPORT_SYMBOL(__kmalloc_node); 3646 #endif /* CONFIG_DEBUG_SLAB || CONFIG_TRACING */ 3647 #endif /* CONFIG_NUMA */ 3648 3649 /** 3650 * __do_kmalloc - allocate memory 3651 * @size: how many bytes of memory are required. 3652 * @flags: the type of memory to allocate (see kmalloc). 
3653 * @caller: function caller for debug tracking of the caller 3654 */ 3655 static __always_inline void *__do_kmalloc(size_t size, gfp_t flags, 3656 unsigned long caller) 3657 { 3658 struct kmem_cache *cachep; 3659 void *ret; 3660 3661 cachep = kmalloc_slab(size, flags); 3662 if (unlikely(ZERO_OR_NULL_PTR(cachep))) 3663 return cachep; 3664 ret = slab_alloc(cachep, flags, caller); 3665 3666 trace_kmalloc(caller, ret, 3667 size, cachep->size, flags); 3668 3669 return ret; 3670 } 3671 3672 3673 #if defined(CONFIG_DEBUG_SLAB) || defined(CONFIG_TRACING) 3674 void *__kmalloc(size_t size, gfp_t flags) 3675 { 3676 return __do_kmalloc(size, flags, _RET_IP_); 3677 } 3678 EXPORT_SYMBOL(__kmalloc); 3679 3680 void *__kmalloc_track_caller(size_t size, gfp_t flags, unsigned long caller) 3681 { 3682 return __do_kmalloc(size, flags, caller); 3683 } 3684 EXPORT_SYMBOL(__kmalloc_track_caller); 3685 3686 #else 3687 void *__kmalloc(size_t size, gfp_t flags) 3688 { 3689 return __do_kmalloc(size, flags, 0); 3690 } 3691 EXPORT_SYMBOL(__kmalloc); 3692 #endif 3693 3694 /** 3695 * kmem_cache_free - Deallocate an object 3696 * @cachep: The cache the allocation was from. 3697 * @objp: The previously allocated object. 3698 * 3699 * Free an object which was previously allocated from this 3700 * cache. 3701 */ 3702 void kmem_cache_free(struct kmem_cache *cachep, void *objp) 3703 { 3704 unsigned long flags; 3705 cachep = cache_from_obj(cachep, objp); 3706 if (!cachep) 3707 return; 3708 3709 local_irq_save(flags); 3710 debug_check_no_locks_freed(objp, cachep->object_size); 3711 if (!(cachep->flags & SLAB_DEBUG_OBJECTS)) 3712 debug_check_no_obj_freed(objp, cachep->object_size); 3713 __cache_free(cachep, objp, _RET_IP_); 3714 local_irq_restore(flags); 3715 3716 trace_kmem_cache_free(_RET_IP_, objp); 3717 } 3718 EXPORT_SYMBOL(kmem_cache_free); 3719 3720 /** 3721 * kfree - free previously allocated memory 3722 * @objp: pointer returned by kmalloc. 3723 * 3724 * If @objp is NULL, no operation is performed. 3725 * 3726 * Don't free memory not originally allocated by kmalloc() 3727 * or you will run into trouble. 3728 */ 3729 void kfree(const void *objp) 3730 { 3731 struct kmem_cache *c; 3732 unsigned long flags; 3733 3734 trace_kfree(_RET_IP_, objp); 3735 3736 if (unlikely(ZERO_OR_NULL_PTR(objp))) 3737 return; 3738 local_irq_save(flags); 3739 kfree_debugcheck(objp); 3740 c = virt_to_cache(objp); 3741 debug_check_no_locks_freed(objp, c->object_size); 3742 3743 debug_check_no_obj_freed(objp, c->object_size); 3744 __cache_free(c, (void *)objp, _RET_IP_); 3745 local_irq_restore(flags); 3746 } 3747 EXPORT_SYMBOL(kfree); 3748 3749 /* 3750 * This initializes kmem_cache_node or resizes various caches for all nodes. 
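 *
 * Illustrative note (not part of the original file): for every online
 * node this either updates an existing kmem_cache_node (replacing its
 * shared array and installing alien caches if they are missing) or
 * allocates and initialises a fresh kmem_cache_node on that node.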
3751 */ 3752 static int alloc_kmem_cache_node(struct kmem_cache *cachep, gfp_t gfp) 3753 { 3754 int node; 3755 struct kmem_cache_node *n; 3756 struct array_cache *new_shared; 3757 struct array_cache **new_alien = NULL; 3758 3759 for_each_online_node(node) { 3760 3761 if (use_alien_caches) { 3762 new_alien = alloc_alien_cache(node, cachep->limit, gfp); 3763 if (!new_alien) 3764 goto fail; 3765 } 3766 3767 new_shared = NULL; 3768 if (cachep->shared) { 3769 new_shared = alloc_arraycache(node, 3770 cachep->shared*cachep->batchcount, 3771 0xbaadf00d, gfp); 3772 if (!new_shared) { 3773 free_alien_cache(new_alien); 3774 goto fail; 3775 } 3776 } 3777 3778 n = cachep->node[node]; 3779 if (n) { 3780 struct array_cache *shared = n->shared; 3781 3782 spin_lock_irq(&n->list_lock); 3783 3784 if (shared) 3785 free_block(cachep, shared->entry, 3786 shared->avail, node); 3787 3788 n->shared = new_shared; 3789 if (!n->alien) { 3790 n->alien = new_alien; 3791 new_alien = NULL; 3792 } 3793 n->free_limit = (1 + nr_cpus_node(node)) * 3794 cachep->batchcount + cachep->num; 3795 spin_unlock_irq(&n->list_lock); 3796 kfree(shared); 3797 free_alien_cache(new_alien); 3798 continue; 3799 } 3800 n = kmalloc_node(sizeof(struct kmem_cache_node), gfp, node); 3801 if (!n) { 3802 free_alien_cache(new_alien); 3803 kfree(new_shared); 3804 goto fail; 3805 } 3806 3807 kmem_cache_node_init(n); 3808 n->next_reap = jiffies + REAPTIMEOUT_NODE + 3809 ((unsigned long)cachep) % REAPTIMEOUT_NODE; 3810 n->shared = new_shared; 3811 n->alien = new_alien; 3812 n->free_limit = (1 + nr_cpus_node(node)) * 3813 cachep->batchcount + cachep->num; 3814 cachep->node[node] = n; 3815 } 3816 return 0; 3817 3818 fail: 3819 if (!cachep->list.next) { 3820 /* Cache is not active yet. Roll back what we did */ 3821 node--; 3822 while (node >= 0) { 3823 if (cachep->node[node]) { 3824 n = cachep->node[node]; 3825 3826 kfree(n->shared); 3827 free_alien_cache(n->alien); 3828 kfree(n); 3829 cachep->node[node] = NULL; 3830 } 3831 node--; 3832 } 3833 } 3834 return -ENOMEM; 3835 } 3836 3837 struct ccupdate_struct { 3838 struct kmem_cache *cachep; 3839 struct array_cache *new[0]; 3840 }; 3841 3842 static void do_ccupdate_local(void *info) 3843 { 3844 struct ccupdate_struct *new = info; 3845 struct array_cache *old; 3846 3847 check_irq_off(); 3848 old = cpu_cache_get(new->cachep); 3849 3850 new->cachep->array[smp_processor_id()] = new->new[smp_processor_id()]; 3851 new->new[smp_processor_id()] = old; 3852 } 3853 3854 /* Always called with the slab_mutex held */ 3855 static int __do_tune_cpucache(struct kmem_cache *cachep, int limit, 3856 int batchcount, int shared, gfp_t gfp) 3857 { 3858 struct ccupdate_struct *new; 3859 int i; 3860 3861 new = kzalloc(sizeof(*new) + nr_cpu_ids * sizeof(struct array_cache *), 3862 gfp); 3863 if (!new) 3864 return -ENOMEM; 3865 3866 for_each_online_cpu(i) { 3867 new->new[i] = alloc_arraycache(cpu_to_mem(i), limit, 3868 batchcount, gfp); 3869 if (!new->new[i]) { 3870 for (i--; i >= 0; i--) 3871 kfree(new->new[i]); 3872 kfree(new); 3873 return -ENOMEM; 3874 } 3875 } 3876 new->cachep = cachep; 3877 3878 on_each_cpu(do_ccupdate_local, (void *)new, 1); 3879 3880 check_irq_on(); 3881 cachep->batchcount = batchcount; 3882 cachep->limit = limit; 3883 cachep->shared = shared; 3884 3885 for_each_online_cpu(i) { 3886 struct array_cache *ccold = new->new[i]; 3887 if (!ccold) 3888 continue; 3889 spin_lock_irq(&cachep->node[cpu_to_mem(i)]->list_lock); 3890 free_block(cachep, ccold->entry, ccold->avail, cpu_to_mem(i)); 3891 
spin_unlock_irq(&cachep->node[cpu_to_mem(i)]->list_lock); 3892 kfree(ccold); 3893 } 3894 kfree(new); 3895 return alloc_kmem_cache_node(cachep, gfp); 3896 } 3897 3898 static int do_tune_cpucache(struct kmem_cache *cachep, int limit, 3899 int batchcount, int shared, gfp_t gfp) 3900 { 3901 int ret; 3902 struct kmem_cache *c = NULL; 3903 int i = 0; 3904 3905 ret = __do_tune_cpucache(cachep, limit, batchcount, shared, gfp); 3906 3907 if (slab_state < FULL) 3908 return ret; 3909 3910 if ((ret < 0) || !is_root_cache(cachep)) 3911 return ret; 3912 3913 VM_BUG_ON(!mutex_is_locked(&slab_mutex)); 3914 for_each_memcg_cache_index(i) { 3915 c = cache_from_memcg_idx(cachep, i); 3916 if (c) 3917 /* return value determined by the parent cache only */ 3918 __do_tune_cpucache(c, limit, batchcount, shared, gfp); 3919 } 3920 3921 return ret; 3922 } 3923 3924 /* Called with slab_mutex held always */ 3925 static int enable_cpucache(struct kmem_cache *cachep, gfp_t gfp) 3926 { 3927 int err; 3928 int limit = 0; 3929 int shared = 0; 3930 int batchcount = 0; 3931 3932 if (!is_root_cache(cachep)) { 3933 struct kmem_cache *root = memcg_root_cache(cachep); 3934 limit = root->limit; 3935 shared = root->shared; 3936 batchcount = root->batchcount; 3937 } 3938 3939 if (limit && shared && batchcount) 3940 goto skip_setup; 3941 /* 3942 * The head array serves three purposes: 3943 * - create a LIFO ordering, i.e. return objects that are cache-warm 3944 * - reduce the number of spinlock operations. 3945 * - reduce the number of linked list operations on the slab and 3946 * bufctl chains: array operations are cheaper. 3947 * The numbers are guessed, we should auto-tune as described by 3948 * Bonwick. 3949 */ 3950 if (cachep->size > 131072) 3951 limit = 1; 3952 else if (cachep->size > PAGE_SIZE) 3953 limit = 8; 3954 else if (cachep->size > 1024) 3955 limit = 24; 3956 else if (cachep->size > 256) 3957 limit = 54; 3958 else 3959 limit = 120; 3960 3961 /* 3962 * CPU bound tasks (e.g. network routing) can exhibit cpu bound 3963 * allocation behaviour: Most allocs on one cpu, most free operations 3964 * on another cpu. For these cases, an efficient object passing between 3965 * cpus is necessary. This is provided by a shared array. The array 3966 * replaces Bonwick's magazine layer. 3967 * On uniprocessor, it's functionally equivalent (but less efficient) 3968 * to a larger limit. Thus disabled by default. 3969 */ 3970 shared = 0; 3971 if (cachep->size <= PAGE_SIZE && num_possible_cpus() > 1) 3972 shared = 8; 3973 3974 #if DEBUG 3975 /* 3976 * With debugging enabled, large batchcount lead to excessively long 3977 * periods with disabled local interrupts. Limit the batchcount 3978 */ 3979 if (limit > 32) 3980 limit = 32; 3981 #endif 3982 batchcount = (limit + 1) / 2; 3983 skip_setup: 3984 err = do_tune_cpucache(cachep, limit, batchcount, shared, gfp); 3985 if (err) 3986 printk(KERN_ERR "enable_cpucache failed for %s, error %d.\n", 3987 cachep->name, -err); 3988 return err; 3989 } 3990 3991 /* 3992 * Drain an array if it contains any elements taking the node lock only if 3993 * necessary. Note that the node listlock also protects the array_cache 3994 * if drain_array() is used on the shared array. 
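 *
 * Illustrative note (not part of the original file): a non-forced drain
 * releases at most (ac->limit + 4) / 5 entries per call (24 when the
 * limit is 120) and, if that exceeds what is currently cached, roughly
 * half of the cached entries instead, so arrays that have gone untouched
 * since the last pass are trimmed gradually rather than emptied in one
 * go.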
static void drain_array(struct kmem_cache *cachep, struct kmem_cache_node *n,
			 struct array_cache *ac, int force, int node)
{
	int tofree;

	if (!ac || !ac->avail)
		return;
	if (ac->touched && !force) {
		ac->touched = 0;
	} else {
		spin_lock_irq(&n->list_lock);
		if (ac->avail) {
			tofree = force ? ac->avail : (ac->limit + 4) / 5;
			if (tofree > ac->avail)
				tofree = (ac->avail + 1) / 2;
			free_block(cachep, ac->entry, tofree, node);
			ac->avail -= tofree;
			memmove(ac->entry, &(ac->entry[tofree]),
				sizeof(void *) * ac->avail);
		}
		spin_unlock_irq(&n->list_lock);
	}
}

/**
 * cache_reap - Reclaim memory from caches.
 * @w: work descriptor
 *
 * Called from workqueue/eventd every few seconds.
 * Purpose:
 * - clear the per-cpu caches for this CPU.
 * - return freeable pages to the main free memory pool.
 *
 * If we cannot acquire the cache chain mutex then just give up - we'll try
 * again on the next iteration.
 */
static void cache_reap(struct work_struct *w)
{
	struct kmem_cache *searchp;
	struct kmem_cache_node *n;
	int node = numa_mem_id();
	struct delayed_work *work = to_delayed_work(w);

	if (!mutex_trylock(&slab_mutex))
		/* Give up. Set up the next iteration. */
		goto out;

	list_for_each_entry(searchp, &slab_caches, list) {
		check_irq_on();

		/*
		 * We only take the node lock if absolutely necessary and we
		 * have established with reasonable certainty that
		 * we can do some work if the lock was obtained.
		 */
		n = searchp->node[node];

		reap_alien(searchp, n);

		drain_array(searchp, n, cpu_cache_get(searchp), 0, node);

		/*
		 * These are racy checks but it does not matter
		 * if we skip one check or scan twice.
		 */
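		/*
		 * The shared array and the free lists are only reaped once
		 * this node's next_reap deadline has passed; the per-cpu
		 * array above is drained on every pass.
		 */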
		if (time_after(n->next_reap, jiffies))
			goto next;

		n->next_reap = jiffies + REAPTIMEOUT_NODE;

		drain_array(searchp, n, n->shared, 0, node);

		if (n->free_touched)
			n->free_touched = 0;
		else {
			int freed;

			freed = drain_freelist(searchp, n, (n->free_limit +
				5 * searchp->num - 1) / (5 * searchp->num));
			STATS_ADD_REAPED(searchp, freed);
		}
next:
		cond_resched();
	}
	check_irq_on();
	mutex_unlock(&slab_mutex);
	next_reap_node();
out:
	/* Set up the next iteration */
	schedule_delayed_work(work, round_jiffies_relative(REAPTIMEOUT_AC));
}

#ifdef CONFIG_SLABINFO
void get_slabinfo(struct kmem_cache *cachep, struct slabinfo *sinfo)
{
	struct page *page;
	unsigned long active_objs;
	unsigned long num_objs;
	unsigned long active_slabs = 0;
	unsigned long num_slabs, free_objects = 0, shared_avail = 0;
	const char *name;
	char *error = NULL;
	int node;
	struct kmem_cache_node *n;

	active_objs = 0;
	num_slabs = 0;
	for_each_online_node(node) {
		n = cachep->node[node];
		if (!n)
			continue;

		check_irq_on();
		spin_lock_irq(&n->list_lock);

		list_for_each_entry(page, &n->slabs_full, lru) {
			if (page->active != cachep->num && !error)
				error = "slabs_full accounting error";
			active_objs += cachep->num;
			active_slabs++;
		}
		list_for_each_entry(page, &n->slabs_partial, lru) {
			if (page->active == cachep->num && !error)
				error = "slabs_partial accounting error";
			if (!page->active && !error)
				error = "slabs_partial accounting error";
			active_objs += page->active;
			active_slabs++;
		}
		list_for_each_entry(page, &n->slabs_free, lru) {
			if (page->active && !error)
				error = "slabs_free accounting error";
			num_slabs++;
		}
		free_objects += n->free_objects;
		if (n->shared)
			shared_avail += n->shared->avail;

		spin_unlock_irq(&n->list_lock);
	}
	num_slabs += active_slabs;
	num_objs = num_slabs * cachep->num;
	if (num_objs - active_objs != free_objects && !error)
		error = "free_objects accounting error";

	name = cachep->name;
	if (error)
		printk(KERN_ERR "slab: cache %s error: %s\n", name, error);

	sinfo->active_objs = active_objs;
	sinfo->num_objs = num_objs;
	sinfo->active_slabs = active_slabs;
	sinfo->num_slabs = num_slabs;
	sinfo->shared_avail = shared_avail;
	sinfo->limit = cachep->limit;
	sinfo->batchcount = cachep->batchcount;
	sinfo->shared = cachep->shared;
	sinfo->objects_per_slab = cachep->num;
	sinfo->cache_order = cachep->gfporder;
}
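/*
 * Append the STATS-only columns to a /proc/slabinfo line: the "globalstat"
 * counters (allocations, high water mark, grows, reaps, errors, maximum
 * freeable objects, remote node allocs/frees, array overflows) followed by
 * the "cpustat" hit/miss counters for the per-cpu head arrays.
 */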
void slabinfo_show_stats(struct seq_file *m, struct kmem_cache *cachep)
{
#if STATS
	{	/* node stats */
		unsigned long high = cachep->high_mark;
		unsigned long allocs = cachep->num_allocations;
		unsigned long grown = cachep->grown;
		unsigned long reaped = cachep->reaped;
		unsigned long errors = cachep->errors;
		unsigned long max_freeable = cachep->max_freeable;
		unsigned long node_allocs = cachep->node_allocs;
		unsigned long node_frees = cachep->node_frees;
		unsigned long overflows = cachep->node_overflow;

		seq_printf(m, " : globalstat %7lu %6lu %5lu %4lu "
			   "%4lu %4lu %4lu %4lu %4lu",
			   allocs, high, grown,
			   reaped, errors, max_freeable, node_allocs,
			   node_frees, overflows);
	}
	/* cpu stats */
	{
		unsigned long allochit = atomic_read(&cachep->allochit);
		unsigned long allocmiss = atomic_read(&cachep->allocmiss);
		unsigned long freehit = atomic_read(&cachep->freehit);
		unsigned long freemiss = atomic_read(&cachep->freemiss);

		seq_printf(m, " : cpustat %6lu %6lu %6lu %6lu",
			   allochit, allocmiss, freehit, freemiss);
	}
#endif
}

#define MAX_SLABINFO_WRITE 128
/**
 * slabinfo_write - Tuning for the slab allocator
 * @file: unused
 * @buffer: user buffer
 * @count: data length
 * @ppos: unused
 */
ssize_t slabinfo_write(struct file *file, const char __user *buffer,
		       size_t count, loff_t *ppos)
{
	char kbuf[MAX_SLABINFO_WRITE + 1], *tmp;
	int limit, batchcount, shared, res;
	struct kmem_cache *cachep;

	if (count > MAX_SLABINFO_WRITE)
		return -EINVAL;
	if (copy_from_user(&kbuf, buffer, count))
		return -EFAULT;
	kbuf[MAX_SLABINFO_WRITE] = '\0';

	tmp = strchr(kbuf, ' ');
	if (!tmp)
		return -EINVAL;
	*tmp = '\0';
	tmp++;
	if (sscanf(tmp, " %d %d %d", &limit, &batchcount, &shared) != 3)
		return -EINVAL;

	/* Find the cache in the chain of caches. */
	mutex_lock(&slab_mutex);
	res = -EINVAL;
	list_for_each_entry(cachep, &slab_caches, list) {
		if (!strcmp(cachep->name, kbuf)) {
			if (limit < 1 || batchcount < 1 ||
					batchcount > limit || shared < 0) {
				res = 0;
			} else {
				res = do_tune_cpucache(cachep, limit,
						       batchcount, shared,
						       GFP_KERNEL);
			}
			break;
		}
	}
	mutex_unlock(&slab_mutex);
	if (res >= 0)
		res = count;
	return res;
}

#ifdef CONFIG_DEBUG_SLAB_LEAK

static void *leaks_start(struct seq_file *m, loff_t *pos)
{
	mutex_lock(&slab_mutex);
	return seq_list_start(&slab_caches, *pos);
}

static inline int add_caller(unsigned long *n, unsigned long v)
{
	unsigned long *p;
	int l;
	if (!v)
		return 1;
	l = n[1];
	p = n + 2;
	while (l) {
		int i = l/2;
		unsigned long *q = p + 2 * i;
		if (*q == v) {
			q[1]++;
			return 1;
		}
		if (*q > v) {
			l = i;
		} else {
			p = q + 2;
			l -= i + 1;
		}
	}
	if (++n[1] == n[0])
		return 0;
	memmove(p + 2, p, n[1] * 2 * sizeof(unsigned long) - ((void *)p - (void *)n));
	p[0] = v;
	p[1] = 1;
	return 1;
}

static void handle_slab(unsigned long *n, struct kmem_cache *c,
			struct page *page)
{
	void *p;
	int i;

	if (n[0] == n[1])
		return;
	for (i = 0, p = page->s_mem; i < c->num; i++, p += c->size) {
		if (get_obj_status(page, i) != OBJECT_ACTIVE)
			continue;

		if (!add_caller(n, (unsigned long)*dbg_userword(c, p)))
			return;
	}
}

static void show_symbol(struct seq_file *m, unsigned long address)
{
#ifdef CONFIG_KALLSYMS
	unsigned long offset, size;
	char modname[MODULE_NAME_LEN], name[KSYM_NAME_LEN];

	if (lookup_symbol_attrs(address, &size, &offset, modname, name) == 0) {
		seq_printf(m, "%s+%#lx/%#lx", name, offset, size);
		if (modname[0])
			seq_printf(m, " [%s]", modname);
		return;
	}
#endif
	seq_printf(m, "%p", (void *)address);
}
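/*
 * The seq_file's private buffer is laid out as: x[0] = the maximum number
 * of (caller address, count) pairs it can hold, x[1] = the number of pairs
 * currently stored, and from x[2] onwards the pairs themselves, kept sorted
 * by caller address so add_caller() can binary search them.
 */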
static int leaks_show(struct seq_file *m, void *p)
{
	struct kmem_cache *cachep = list_entry(p, struct kmem_cache, list);
	struct page *page;
	struct kmem_cache_node *n;
	const char *name;
	unsigned long *x = m->private;
	int node;
	int i;

	if (!(cachep->flags & SLAB_STORE_USER))
		return 0;
	if (!(cachep->flags & SLAB_RED_ZONE))
		return 0;

	/* OK, we can do it */

	x[1] = 0;

	for_each_online_node(node) {
		n = cachep->node[node];
		if (!n)
			continue;

		check_irq_on();
		spin_lock_irq(&n->list_lock);

		list_for_each_entry(page, &n->slabs_full, lru)
			handle_slab(x, cachep, page);
		list_for_each_entry(page, &n->slabs_partial, lru)
			handle_slab(x, cachep, page);
		spin_unlock_irq(&n->list_lock);
	}
	name = cachep->name;
	if (x[0] == x[1]) {
		/* Increase the buffer size */
		mutex_unlock(&slab_mutex);
		m->private = kzalloc(x[0] * 4 * sizeof(unsigned long), GFP_KERNEL);
		if (!m->private) {
			/* Too bad, we are really out */
			m->private = x;
			mutex_lock(&slab_mutex);
			return -ENOMEM;
		}
		*(unsigned long *)m->private = x[0] * 2;
		kfree(x);
		mutex_lock(&slab_mutex);
		/* Now make sure this entry will be retried */
		m->count = m->size;
		return 0;
	}
	for (i = 0; i < x[1]; i++) {
		seq_printf(m, "%s: %lu ", name, x[2*i+3]);
		show_symbol(m, x[2*i+2]);
		seq_putc(m, '\n');
	}

	return 0;
}

static const struct seq_operations slabstats_op = {
	.start = leaks_start,
	.next = slab_next,
	.stop = slab_stop,
	.show = leaks_show,
};

static int slabstats_open(struct inode *inode, struct file *file)
{
	unsigned long *n = kzalloc(PAGE_SIZE, GFP_KERNEL);
	int ret = -ENOMEM;
	if (n) {
		ret = seq_open(file, &slabstats_op);
		if (!ret) {
			struct seq_file *m = file->private_data;
			*n = PAGE_SIZE / (2 * sizeof(unsigned long));
			m->private = n;
			n = NULL;
		}
		kfree(n);
	}
	return ret;
}

static const struct file_operations proc_slabstats_operations = {
	.open = slabstats_open,
	.read = seq_read,
	.llseek = seq_lseek,
	.release = seq_release_private,
};
#endif

static int __init slab_proc_init(void)
{
#ifdef CONFIG_DEBUG_SLAB_LEAK
	proc_create("slab_allocators", 0, NULL, &proc_slabstats_operations);
#endif
	return 0;
}
module_init(slab_proc_init);
#endif

/**
 * ksize - get the actual amount of memory allocated for a given object
 * @objp: Pointer to the object
 *
 * kmalloc may internally round up allocations and return more memory
 * than requested. ksize() can be used to determine the actual amount of
 * memory allocated. The caller may use this additional memory, even though
 * a smaller amount of memory was initially specified with the kmalloc call.
 * The caller must guarantee that objp points to a valid object previously
 * allocated with either kmalloc() or kmem_cache_alloc(). The object
 * must not be freed during the duration of the call.
 */
size_t ksize(const void *objp)
{
	BUG_ON(!objp);
	if (unlikely(objp == ZERO_SIZE_PTR))
		return 0;

	return virt_to_cache(objp)->object_size;
}
EXPORT_SYMBOL(ksize);
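/*
 * Example usage (illustrative): a caller may use the full rounded-up size
 * of an allocation rather than the size it originally asked for:
 *
 *	buf = kmalloc(count, GFP_KERNEL);
 *	if (buf)
 *		count = ksize(buf);
 *
 * A 100-byte request is typically satisfied from the kmalloc-128 cache,
 * in which case ksize() reports 128 usable bytes.
 */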