xref: /openbmc/linux/mm/slab.c (revision 05bcf503)
1 /*
2  * linux/mm/slab.c
3  * Written by Mark Hemment, 1996/97.
4  * (markhe@nextd.demon.co.uk)
5  *
6  * kmem_cache_destroy() + some cleanup - 1999 Andrea Arcangeli
7  *
8  * Major cleanup, different bufctl logic, per-cpu arrays
9  *	(c) 2000 Manfred Spraul
10  *
11  * Cleanup, make the head arrays unconditional, preparation for NUMA
12  * 	(c) 2002 Manfred Spraul
13  *
14  * An implementation of the Slab Allocator as described in outline in;
15  *	UNIX Internals: The New Frontiers by Uresh Vahalia
16  *	Pub: Prentice Hall	ISBN 0-13-101908-2
17  * or with a little more detail in;
18  *	The Slab Allocator: An Object-Caching Kernel Memory Allocator
19  *	Jeff Bonwick (Sun Microsystems).
20  *	Presented at: USENIX Summer 1994 Technical Conference
21  *
22  * The memory is organized in caches, one cache for each object type.
23  * (e.g. inode_cache, dentry_cache, buffer_head, vm_area_struct)
24  * Each cache consists out of many slabs (they are small (usually one
25  * page long) and always contiguous), and each slab contains multiple
26  * initialized objects.
27  *
28  * This means, that your constructor is used only for newly allocated
29  * slabs and you must pass objects with the same initializations to
30  * kmem_cache_free.
31  *
32  * Each cache can only support one memory type (GFP_DMA, GFP_HIGHMEM,
33  * normal). If you need a special memory type, then must create a new
34  * cache for that memory type.
35  *
36  * In order to reduce fragmentation, the slabs are sorted in 3 groups:
37  *   full slabs with 0 free objects
38  *   partial slabs
39  *   empty slabs with no allocated objects
40  *
41  * If partial slabs exist, then new allocations come from these slabs,
42  * otherwise from empty slabs or new slabs are allocated.
43  *
44  * kmem_cache_destroy() CAN CRASH if you try to allocate from the cache
45  * during kmem_cache_destroy(). The caller must prevent concurrent allocs.
46  *
47  * Each cache has a short per-cpu head array, most allocs
48  * and frees go into that array, and if that array overflows, then 1/2
49  * of the entries in the array are given back into the global cache.
50  * The head array is strictly LIFO and should improve the cache hit rates.
51  * On SMP, it additionally reduces the spinlock operations.
52  *
53  * The c_cpuarray may not be read with enabled local interrupts -
54  * it's changed with a smp_call_function().
55  *
56  * SMP synchronization:
57  *  constructors and destructors are called without any locking.
58  *  Several members in struct kmem_cache and struct slab never change, they
59  *	are accessed without any locking.
60  *  The per-cpu arrays are never accessed from the wrong cpu, no locking,
61  *  	and local interrupts are disabled so slab code is preempt-safe.
62  *  The non-constant members are protected with a per-cache irq spinlock.
63  *
64  * Many thanks to Mark Hemment, who wrote another per-cpu slab patch
65  * in 2000 - many ideas in the current implementation are derived from
66  * his patch.
67  *
68  * Further notes from the original documentation:
69  *
70  * 11 April '97.  Started multi-threading - markhe
71  *	The global cache-chain is protected by the mutex 'slab_mutex'.
72  *	The sem is only needed when accessing/extending the cache-chain, which
73  *	can never happen inside an interrupt (kmem_cache_create(),
74  *	kmem_cache_shrink() and kmem_cache_reap()).
75  *
76  *	At present, each engine can be growing a cache.  This should be blocked.
77  *
78  * 15 March 2005. NUMA slab allocator.
79  *	Shai Fultheim <shai@scalex86.org>.
80  *	Shobhit Dayal <shobhit@calsoftinc.com>
81  *	Alok N Kataria <alokk@calsoftinc.com>
82  *	Christoph Lameter <christoph@lameter.com>
83  *
84  *	Modified the slab allocator to be node aware on NUMA systems.
85  *	Each node has its own list of partial, free and full slabs.
86  *	All object allocations for a node occur from node specific slab lists.
87  */
88 
89 #include	<linux/slab.h>
90 #include	"slab.h"
91 #include	<linux/mm.h>
92 #include	<linux/poison.h>
93 #include	<linux/swap.h>
94 #include	<linux/cache.h>
95 #include	<linux/interrupt.h>
96 #include	<linux/init.h>
97 #include	<linux/compiler.h>
98 #include	<linux/cpuset.h>
99 #include	<linux/proc_fs.h>
100 #include	<linux/seq_file.h>
101 #include	<linux/notifier.h>
102 #include	<linux/kallsyms.h>
103 #include	<linux/cpu.h>
104 #include	<linux/sysctl.h>
105 #include	<linux/module.h>
106 #include	<linux/rcupdate.h>
107 #include	<linux/string.h>
108 #include	<linux/uaccess.h>
109 #include	<linux/nodemask.h>
110 #include	<linux/kmemleak.h>
111 #include	<linux/mempolicy.h>
112 #include	<linux/mutex.h>
113 #include	<linux/fault-inject.h>
114 #include	<linux/rtmutex.h>
115 #include	<linux/reciprocal_div.h>
116 #include	<linux/debugobjects.h>
117 #include	<linux/kmemcheck.h>
118 #include	<linux/memory.h>
119 #include	<linux/prefetch.h>
120 
121 #include	<net/sock.h>
122 
123 #include	<asm/cacheflush.h>
124 #include	<asm/tlbflush.h>
125 #include	<asm/page.h>
126 
127 #include <trace/events/kmem.h>
128 
129 #include	"internal.h"
130 
131 /*
132  * DEBUG	- 1 for kmem_cache_create() to honour; SLAB_RED_ZONE & SLAB_POISON.
133  *		  0 for faster, smaller code (especially in the critical paths).
134  *
135  * STATS	- 1 to collect stats for /proc/slabinfo.
136  *		  0 for faster, smaller code (especially in the critical paths).
137  *
138  * FORCED_DEBUG	- 1 enables SLAB_RED_ZONE and SLAB_POISON (if possible)
139  */
140 
141 #ifdef CONFIG_DEBUG_SLAB
142 #define	DEBUG		1
143 #define	STATS		1
144 #define	FORCED_DEBUG	1
145 #else
146 #define	DEBUG		0
147 #define	STATS		0
148 #define	FORCED_DEBUG	0
149 #endif
150 
151 /* Shouldn't this be in a header file somewhere? */
152 #define	BYTES_PER_WORD		sizeof(void *)
153 #define	REDZONE_ALIGN		max(BYTES_PER_WORD, __alignof__(unsigned long long))
154 
155 #ifndef ARCH_KMALLOC_FLAGS
156 #define ARCH_KMALLOC_FLAGS SLAB_HWCACHE_ALIGN
157 #endif
158 
159 /*
160  * true if a page was allocated from pfmemalloc reserves for network-based
161  * swap
162  */
163 static bool pfmemalloc_active __read_mostly;
164 
165 /* Legal flag mask for kmem_cache_create(). */
166 #if DEBUG
167 # define CREATE_MASK	(SLAB_RED_ZONE | \
168 			 SLAB_POISON | SLAB_HWCACHE_ALIGN | \
169 			 SLAB_CACHE_DMA | \
170 			 SLAB_STORE_USER | \
171 			 SLAB_RECLAIM_ACCOUNT | SLAB_PANIC | \
172 			 SLAB_DESTROY_BY_RCU | SLAB_MEM_SPREAD | \
173 			 SLAB_DEBUG_OBJECTS | SLAB_NOLEAKTRACE | SLAB_NOTRACK)
174 #else
175 # define CREATE_MASK	(SLAB_HWCACHE_ALIGN | \
176 			 SLAB_CACHE_DMA | \
177 			 SLAB_RECLAIM_ACCOUNT | SLAB_PANIC | \
178 			 SLAB_DESTROY_BY_RCU | SLAB_MEM_SPREAD | \
179 			 SLAB_DEBUG_OBJECTS | SLAB_NOLEAKTRACE | SLAB_NOTRACK)
180 #endif
181 
182 /*
183  * kmem_bufctl_t:
184  *
185  * Bufctl's are used for linking objs within a slab
186  * linked offsets.
187  *
188  * This implementation relies on "struct page" for locating the cache &
189  * slab an object belongs to.
190  * This allows the bufctl structure to be small (one int), but limits
191  * the number of objects a slab (not a cache) can contain when off-slab
192  * bufctls are used. The limit is the size of the largest general cache
193  * that does not use off-slab slabs.
194  * For 32bit archs with 4 kB pages, is this 56.
195  * This is not serious, as it is only for large objects, when it is unwise
196  * to have too many per slab.
197  * Note: This limit can be raised by introducing a general cache whose size
198  * is less than 512 (PAGE_SIZE<<3), but greater than 256.
199  */
200 
201 typedef unsigned int kmem_bufctl_t;
202 #define BUFCTL_END	(((kmem_bufctl_t)(~0U))-0)
203 #define BUFCTL_FREE	(((kmem_bufctl_t)(~0U))-1)
204 #define	BUFCTL_ACTIVE	(((kmem_bufctl_t)(~0U))-2)
205 #define	SLAB_LIMIT	(((kmem_bufctl_t)(~0U))-3)
206 
207 /*
208  * struct slab_rcu
209  *
210  * slab_destroy on a SLAB_DESTROY_BY_RCU cache uses this structure to
211  * arrange for kmem_freepages to be called via RCU.  This is useful if
212  * we need to approach a kernel structure obliquely, from its address
213  * obtained without the usual locking.  We can lock the structure to
214  * stabilize it and check it's still at the given address, only if we
215  * can be sure that the memory has not been meanwhile reused for some
216  * other kind of object (which our subsystem's lock might corrupt).
217  *
218  * rcu_read_lock before reading the address, then rcu_read_unlock after
219  * taking the spinlock within the structure expected at that address.
220  */
221 struct slab_rcu {
222 	struct rcu_head head;
223 	struct kmem_cache *cachep;
224 	void *addr;
225 };
226 
227 /*
228  * struct slab
229  *
230  * Manages the objs in a slab. Placed either at the beginning of mem allocated
231  * for a slab, or allocated from an general cache.
232  * Slabs are chained into three list: fully used, partial, fully free slabs.
233  */
234 struct slab {
235 	union {
236 		struct {
237 			struct list_head list;
238 			unsigned long colouroff;
239 			void *s_mem;		/* including colour offset */
240 			unsigned int inuse;	/* num of objs active in slab */
241 			kmem_bufctl_t free;
242 			unsigned short nodeid;
243 		};
244 		struct slab_rcu __slab_cover_slab_rcu;
245 	};
246 };
247 
248 /*
249  * struct array_cache
250  *
251  * Purpose:
252  * - LIFO ordering, to hand out cache-warm objects from _alloc
253  * - reduce the number of linked list operations
254  * - reduce spinlock operations
255  *
256  * The limit is stored in the per-cpu structure to reduce the data cache
257  * footprint.
258  *
259  */
260 struct array_cache {
261 	unsigned int avail;
262 	unsigned int limit;
263 	unsigned int batchcount;
264 	unsigned int touched;
265 	spinlock_t lock;
266 	void *entry[];	/*
267 			 * Must have this definition in here for the proper
268 			 * alignment of array_cache. Also simplifies accessing
269 			 * the entries.
270 			 *
271 			 * Entries should not be directly dereferenced as
272 			 * entries belonging to slabs marked pfmemalloc will
273 			 * have the lower bits set SLAB_OBJ_PFMEMALLOC
274 			 */
275 };
276 
277 #define SLAB_OBJ_PFMEMALLOC	1
278 static inline bool is_obj_pfmemalloc(void *objp)
279 {
280 	return (unsigned long)objp & SLAB_OBJ_PFMEMALLOC;
281 }
282 
283 static inline void set_obj_pfmemalloc(void **objp)
284 {
285 	*objp = (void *)((unsigned long)*objp | SLAB_OBJ_PFMEMALLOC);
286 	return;
287 }
288 
289 static inline void clear_obj_pfmemalloc(void **objp)
290 {
291 	*objp = (void *)((unsigned long)*objp & ~SLAB_OBJ_PFMEMALLOC);
292 }
293 
294 /*
295  * bootstrap: The caches do not work without cpuarrays anymore, but the
296  * cpuarrays are allocated from the generic caches...
297  */
298 #define BOOT_CPUCACHE_ENTRIES	1
299 struct arraycache_init {
300 	struct array_cache cache;
301 	void *entries[BOOT_CPUCACHE_ENTRIES];
302 };
303 
304 /*
305  * The slab lists for all objects.
306  */
307 struct kmem_list3 {
308 	struct list_head slabs_partial;	/* partial list first, better asm code */
309 	struct list_head slabs_full;
310 	struct list_head slabs_free;
311 	unsigned long free_objects;
312 	unsigned int free_limit;
313 	unsigned int colour_next;	/* Per-node cache coloring */
314 	spinlock_t list_lock;
315 	struct array_cache *shared;	/* shared per node */
316 	struct array_cache **alien;	/* on other nodes */
317 	unsigned long next_reap;	/* updated without locking */
318 	int free_touched;		/* updated without locking */
319 };
320 
321 /*
322  * Need this for bootstrapping a per node allocator.
323  */
324 #define NUM_INIT_LISTS (3 * MAX_NUMNODES)
325 static struct kmem_list3 __initdata initkmem_list3[NUM_INIT_LISTS];
326 #define	CACHE_CACHE 0
327 #define	SIZE_AC MAX_NUMNODES
328 #define	SIZE_L3 (2 * MAX_NUMNODES)
329 
330 static int drain_freelist(struct kmem_cache *cache,
331 			struct kmem_list3 *l3, int tofree);
332 static void free_block(struct kmem_cache *cachep, void **objpp, int len,
333 			int node);
334 static int enable_cpucache(struct kmem_cache *cachep, gfp_t gfp);
335 static void cache_reap(struct work_struct *unused);
336 
337 /*
338  * This function must be completely optimized away if a constant is passed to
339  * it.  Mostly the same as what is in linux/slab.h except it returns an index.
340  */
341 static __always_inline int index_of(const size_t size)
342 {
343 	extern void __bad_size(void);
344 
345 	if (__builtin_constant_p(size)) {
346 		int i = 0;
347 
348 #define CACHE(x) \
349 	if (size <=x) \
350 		return i; \
351 	else \
352 		i++;
353 #include <linux/kmalloc_sizes.h>
354 #undef CACHE
355 		__bad_size();
356 	} else
357 		__bad_size();
358 	return 0;
359 }
360 
361 static int slab_early_init = 1;
362 
363 #define INDEX_AC index_of(sizeof(struct arraycache_init))
364 #define INDEX_L3 index_of(sizeof(struct kmem_list3))
365 
366 static void kmem_list3_init(struct kmem_list3 *parent)
367 {
368 	INIT_LIST_HEAD(&parent->slabs_full);
369 	INIT_LIST_HEAD(&parent->slabs_partial);
370 	INIT_LIST_HEAD(&parent->slabs_free);
371 	parent->shared = NULL;
372 	parent->alien = NULL;
373 	parent->colour_next = 0;
374 	spin_lock_init(&parent->list_lock);
375 	parent->free_objects = 0;
376 	parent->free_touched = 0;
377 }
378 
379 #define MAKE_LIST(cachep, listp, slab, nodeid)				\
380 	do {								\
381 		INIT_LIST_HEAD(listp);					\
382 		list_splice(&(cachep->nodelists[nodeid]->slab), listp);	\
383 	} while (0)
384 
385 #define	MAKE_ALL_LISTS(cachep, ptr, nodeid)				\
386 	do {								\
387 	MAKE_LIST((cachep), (&(ptr)->slabs_full), slabs_full, nodeid);	\
388 	MAKE_LIST((cachep), (&(ptr)->slabs_partial), slabs_partial, nodeid); \
389 	MAKE_LIST((cachep), (&(ptr)->slabs_free), slabs_free, nodeid);	\
390 	} while (0)
391 
392 #define CFLGS_OFF_SLAB		(0x80000000UL)
393 #define	OFF_SLAB(x)	((x)->flags & CFLGS_OFF_SLAB)
394 
395 #define BATCHREFILL_LIMIT	16
396 /*
397  * Optimization question: fewer reaps means less probability for unnessary
398  * cpucache drain/refill cycles.
399  *
400  * OTOH the cpuarrays can contain lots of objects,
401  * which could lock up otherwise freeable slabs.
402  */
403 #define REAPTIMEOUT_CPUC	(2*HZ)
404 #define REAPTIMEOUT_LIST3	(4*HZ)
405 
406 #if STATS
407 #define	STATS_INC_ACTIVE(x)	((x)->num_active++)
408 #define	STATS_DEC_ACTIVE(x)	((x)->num_active--)
409 #define	STATS_INC_ALLOCED(x)	((x)->num_allocations++)
410 #define	STATS_INC_GROWN(x)	((x)->grown++)
411 #define	STATS_ADD_REAPED(x,y)	((x)->reaped += (y))
412 #define	STATS_SET_HIGH(x)						\
413 	do {								\
414 		if ((x)->num_active > (x)->high_mark)			\
415 			(x)->high_mark = (x)->num_active;		\
416 	} while (0)
417 #define	STATS_INC_ERR(x)	((x)->errors++)
418 #define	STATS_INC_NODEALLOCS(x)	((x)->node_allocs++)
419 #define	STATS_INC_NODEFREES(x)	((x)->node_frees++)
420 #define STATS_INC_ACOVERFLOW(x)   ((x)->node_overflow++)
421 #define	STATS_SET_FREEABLE(x, i)					\
422 	do {								\
423 		if ((x)->max_freeable < i)				\
424 			(x)->max_freeable = i;				\
425 	} while (0)
426 #define STATS_INC_ALLOCHIT(x)	atomic_inc(&(x)->allochit)
427 #define STATS_INC_ALLOCMISS(x)	atomic_inc(&(x)->allocmiss)
428 #define STATS_INC_FREEHIT(x)	atomic_inc(&(x)->freehit)
429 #define STATS_INC_FREEMISS(x)	atomic_inc(&(x)->freemiss)
430 #else
431 #define	STATS_INC_ACTIVE(x)	do { } while (0)
432 #define	STATS_DEC_ACTIVE(x)	do { } while (0)
433 #define	STATS_INC_ALLOCED(x)	do { } while (0)
434 #define	STATS_INC_GROWN(x)	do { } while (0)
435 #define	STATS_ADD_REAPED(x,y)	do { (void)(y); } while (0)
436 #define	STATS_SET_HIGH(x)	do { } while (0)
437 #define	STATS_INC_ERR(x)	do { } while (0)
438 #define	STATS_INC_NODEALLOCS(x)	do { } while (0)
439 #define	STATS_INC_NODEFREES(x)	do { } while (0)
440 #define STATS_INC_ACOVERFLOW(x)   do { } while (0)
441 #define	STATS_SET_FREEABLE(x, i) do { } while (0)
442 #define STATS_INC_ALLOCHIT(x)	do { } while (0)
443 #define STATS_INC_ALLOCMISS(x)	do { } while (0)
444 #define STATS_INC_FREEHIT(x)	do { } while (0)
445 #define STATS_INC_FREEMISS(x)	do { } while (0)
446 #endif
447 
448 #if DEBUG
449 
450 /*
451  * memory layout of objects:
452  * 0		: objp
453  * 0 .. cachep->obj_offset - BYTES_PER_WORD - 1: padding. This ensures that
454  * 		the end of an object is aligned with the end of the real
455  * 		allocation. Catches writes behind the end of the allocation.
456  * cachep->obj_offset - BYTES_PER_WORD .. cachep->obj_offset - 1:
457  * 		redzone word.
458  * cachep->obj_offset: The real object.
459  * cachep->size - 2* BYTES_PER_WORD: redzone word [BYTES_PER_WORD long]
460  * cachep->size - 1* BYTES_PER_WORD: last caller address
461  *					[BYTES_PER_WORD long]
462  */
463 static int obj_offset(struct kmem_cache *cachep)
464 {
465 	return cachep->obj_offset;
466 }
467 
468 static unsigned long long *dbg_redzone1(struct kmem_cache *cachep, void *objp)
469 {
470 	BUG_ON(!(cachep->flags & SLAB_RED_ZONE));
471 	return (unsigned long long*) (objp + obj_offset(cachep) -
472 				      sizeof(unsigned long long));
473 }
474 
475 static unsigned long long *dbg_redzone2(struct kmem_cache *cachep, void *objp)
476 {
477 	BUG_ON(!(cachep->flags & SLAB_RED_ZONE));
478 	if (cachep->flags & SLAB_STORE_USER)
479 		return (unsigned long long *)(objp + cachep->size -
480 					      sizeof(unsigned long long) -
481 					      REDZONE_ALIGN);
482 	return (unsigned long long *) (objp + cachep->size -
483 				       sizeof(unsigned long long));
484 }
485 
486 static void **dbg_userword(struct kmem_cache *cachep, void *objp)
487 {
488 	BUG_ON(!(cachep->flags & SLAB_STORE_USER));
489 	return (void **)(objp + cachep->size - BYTES_PER_WORD);
490 }
491 
492 #else
493 
494 #define obj_offset(x)			0
495 #define dbg_redzone1(cachep, objp)	({BUG(); (unsigned long long *)NULL;})
496 #define dbg_redzone2(cachep, objp)	({BUG(); (unsigned long long *)NULL;})
497 #define dbg_userword(cachep, objp)	({BUG(); (void **)NULL;})
498 
499 #endif
500 
501 /*
502  * Do not go above this order unless 0 objects fit into the slab or
503  * overridden on the command line.
504  */
505 #define	SLAB_MAX_ORDER_HI	1
506 #define	SLAB_MAX_ORDER_LO	0
507 static int slab_max_order = SLAB_MAX_ORDER_LO;
508 static bool slab_max_order_set __initdata;
509 
510 static inline struct kmem_cache *virt_to_cache(const void *obj)
511 {
512 	struct page *page = virt_to_head_page(obj);
513 	return page->slab_cache;
514 }
515 
516 static inline struct slab *virt_to_slab(const void *obj)
517 {
518 	struct page *page = virt_to_head_page(obj);
519 
520 	VM_BUG_ON(!PageSlab(page));
521 	return page->slab_page;
522 }
523 
524 static inline void *index_to_obj(struct kmem_cache *cache, struct slab *slab,
525 				 unsigned int idx)
526 {
527 	return slab->s_mem + cache->size * idx;
528 }
529 
530 /*
531  * We want to avoid an expensive divide : (offset / cache->size)
532  *   Using the fact that size is a constant for a particular cache,
533  *   we can replace (offset / cache->size) by
534  *   reciprocal_divide(offset, cache->reciprocal_buffer_size)
535  */
536 static inline unsigned int obj_to_index(const struct kmem_cache *cache,
537 					const struct slab *slab, void *obj)
538 {
539 	u32 offset = (obj - slab->s_mem);
540 	return reciprocal_divide(offset, cache->reciprocal_buffer_size);
541 }
542 
543 /*
544  * These are the default caches for kmalloc. Custom caches can have other sizes.
545  */
546 struct cache_sizes malloc_sizes[] = {
547 #define CACHE(x) { .cs_size = (x) },
548 #include <linux/kmalloc_sizes.h>
549 	CACHE(ULONG_MAX)
550 #undef CACHE
551 };
552 EXPORT_SYMBOL(malloc_sizes);
553 
554 /* Must match cache_sizes above. Out of line to keep cache footprint low. */
555 struct cache_names {
556 	char *name;
557 	char *name_dma;
558 };
559 
560 static struct cache_names __initdata cache_names[] = {
561 #define CACHE(x) { .name = "size-" #x, .name_dma = "size-" #x "(DMA)" },
562 #include <linux/kmalloc_sizes.h>
563 	{NULL,}
564 #undef CACHE
565 };
566 
567 static struct arraycache_init initarray_cache __initdata =
568     { {0, BOOT_CPUCACHE_ENTRIES, 1, 0} };
569 static struct arraycache_init initarray_generic =
570     { {0, BOOT_CPUCACHE_ENTRIES, 1, 0} };
571 
572 /* internal cache of cache description objs */
573 static struct kmem_list3 *kmem_cache_nodelists[MAX_NUMNODES];
574 static struct kmem_cache kmem_cache_boot = {
575 	.nodelists = kmem_cache_nodelists,
576 	.batchcount = 1,
577 	.limit = BOOT_CPUCACHE_ENTRIES,
578 	.shared = 1,
579 	.size = sizeof(struct kmem_cache),
580 	.name = "kmem_cache",
581 };
582 
583 #define BAD_ALIEN_MAGIC 0x01020304ul
584 
585 #ifdef CONFIG_LOCKDEP
586 
587 /*
588  * Slab sometimes uses the kmalloc slabs to store the slab headers
589  * for other slabs "off slab".
590  * The locking for this is tricky in that it nests within the locks
591  * of all other slabs in a few places; to deal with this special
592  * locking we put on-slab caches into a separate lock-class.
593  *
594  * We set lock class for alien array caches which are up during init.
595  * The lock annotation will be lost if all cpus of a node goes down and
596  * then comes back up during hotplug
597  */
598 static struct lock_class_key on_slab_l3_key;
599 static struct lock_class_key on_slab_alc_key;
600 
601 static struct lock_class_key debugobj_l3_key;
602 static struct lock_class_key debugobj_alc_key;
603 
604 static void slab_set_lock_classes(struct kmem_cache *cachep,
605 		struct lock_class_key *l3_key, struct lock_class_key *alc_key,
606 		int q)
607 {
608 	struct array_cache **alc;
609 	struct kmem_list3 *l3;
610 	int r;
611 
612 	l3 = cachep->nodelists[q];
613 	if (!l3)
614 		return;
615 
616 	lockdep_set_class(&l3->list_lock, l3_key);
617 	alc = l3->alien;
618 	/*
619 	 * FIXME: This check for BAD_ALIEN_MAGIC
620 	 * should go away when common slab code is taught to
621 	 * work even without alien caches.
622 	 * Currently, non NUMA code returns BAD_ALIEN_MAGIC
623 	 * for alloc_alien_cache,
624 	 */
625 	if (!alc || (unsigned long)alc == BAD_ALIEN_MAGIC)
626 		return;
627 	for_each_node(r) {
628 		if (alc[r])
629 			lockdep_set_class(&alc[r]->lock, alc_key);
630 	}
631 }
632 
633 static void slab_set_debugobj_lock_classes_node(struct kmem_cache *cachep, int node)
634 {
635 	slab_set_lock_classes(cachep, &debugobj_l3_key, &debugobj_alc_key, node);
636 }
637 
638 static void slab_set_debugobj_lock_classes(struct kmem_cache *cachep)
639 {
640 	int node;
641 
642 	for_each_online_node(node)
643 		slab_set_debugobj_lock_classes_node(cachep, node);
644 }
645 
646 static void init_node_lock_keys(int q)
647 {
648 	struct cache_sizes *s = malloc_sizes;
649 
650 	if (slab_state < UP)
651 		return;
652 
653 	for (s = malloc_sizes; s->cs_size != ULONG_MAX; s++) {
654 		struct kmem_list3 *l3;
655 
656 		l3 = s->cs_cachep->nodelists[q];
657 		if (!l3 || OFF_SLAB(s->cs_cachep))
658 			continue;
659 
660 		slab_set_lock_classes(s->cs_cachep, &on_slab_l3_key,
661 				&on_slab_alc_key, q);
662 	}
663 }
664 
665 static inline void init_lock_keys(void)
666 {
667 	int node;
668 
669 	for_each_node(node)
670 		init_node_lock_keys(node);
671 }
672 #else
673 static void init_node_lock_keys(int q)
674 {
675 }
676 
677 static inline void init_lock_keys(void)
678 {
679 }
680 
681 static void slab_set_debugobj_lock_classes_node(struct kmem_cache *cachep, int node)
682 {
683 }
684 
685 static void slab_set_debugobj_lock_classes(struct kmem_cache *cachep)
686 {
687 }
688 #endif
689 
690 static DEFINE_PER_CPU(struct delayed_work, slab_reap_work);
691 
692 static inline struct array_cache *cpu_cache_get(struct kmem_cache *cachep)
693 {
694 	return cachep->array[smp_processor_id()];
695 }
696 
697 static inline struct kmem_cache *__find_general_cachep(size_t size,
698 							gfp_t gfpflags)
699 {
700 	struct cache_sizes *csizep = malloc_sizes;
701 
702 #if DEBUG
703 	/* This happens if someone tries to call
704 	 * kmem_cache_create(), or __kmalloc(), before
705 	 * the generic caches are initialized.
706 	 */
707 	BUG_ON(malloc_sizes[INDEX_AC].cs_cachep == NULL);
708 #endif
709 	if (!size)
710 		return ZERO_SIZE_PTR;
711 
712 	while (size > csizep->cs_size)
713 		csizep++;
714 
715 	/*
716 	 * Really subtle: The last entry with cs->cs_size==ULONG_MAX
717 	 * has cs_{dma,}cachep==NULL. Thus no special case
718 	 * for large kmalloc calls required.
719 	 */
720 #ifdef CONFIG_ZONE_DMA
721 	if (unlikely(gfpflags & GFP_DMA))
722 		return csizep->cs_dmacachep;
723 #endif
724 	return csizep->cs_cachep;
725 }
726 
727 static struct kmem_cache *kmem_find_general_cachep(size_t size, gfp_t gfpflags)
728 {
729 	return __find_general_cachep(size, gfpflags);
730 }
731 
732 static size_t slab_mgmt_size(size_t nr_objs, size_t align)
733 {
734 	return ALIGN(sizeof(struct slab)+nr_objs*sizeof(kmem_bufctl_t), align);
735 }
736 
737 /*
738  * Calculate the number of objects and left-over bytes for a given buffer size.
739  */
740 static void cache_estimate(unsigned long gfporder, size_t buffer_size,
741 			   size_t align, int flags, size_t *left_over,
742 			   unsigned int *num)
743 {
744 	int nr_objs;
745 	size_t mgmt_size;
746 	size_t slab_size = PAGE_SIZE << gfporder;
747 
748 	/*
749 	 * The slab management structure can be either off the slab or
750 	 * on it. For the latter case, the memory allocated for a
751 	 * slab is used for:
752 	 *
753 	 * - The struct slab
754 	 * - One kmem_bufctl_t for each object
755 	 * - Padding to respect alignment of @align
756 	 * - @buffer_size bytes for each object
757 	 *
758 	 * If the slab management structure is off the slab, then the
759 	 * alignment will already be calculated into the size. Because
760 	 * the slabs are all pages aligned, the objects will be at the
761 	 * correct alignment when allocated.
762 	 */
763 	if (flags & CFLGS_OFF_SLAB) {
764 		mgmt_size = 0;
765 		nr_objs = slab_size / buffer_size;
766 
767 		if (nr_objs > SLAB_LIMIT)
768 			nr_objs = SLAB_LIMIT;
769 	} else {
770 		/*
771 		 * Ignore padding for the initial guess. The padding
772 		 * is at most @align-1 bytes, and @buffer_size is at
773 		 * least @align. In the worst case, this result will
774 		 * be one greater than the number of objects that fit
775 		 * into the memory allocation when taking the padding
776 		 * into account.
777 		 */
778 		nr_objs = (slab_size - sizeof(struct slab)) /
779 			  (buffer_size + sizeof(kmem_bufctl_t));
780 
781 		/*
782 		 * This calculated number will be either the right
783 		 * amount, or one greater than what we want.
784 		 */
785 		if (slab_mgmt_size(nr_objs, align) + nr_objs*buffer_size
786 		       > slab_size)
787 			nr_objs--;
788 
789 		if (nr_objs > SLAB_LIMIT)
790 			nr_objs = SLAB_LIMIT;
791 
792 		mgmt_size = slab_mgmt_size(nr_objs, align);
793 	}
794 	*num = nr_objs;
795 	*left_over = slab_size - nr_objs*buffer_size - mgmt_size;
796 }
797 
798 #if DEBUG
799 #define slab_error(cachep, msg) __slab_error(__func__, cachep, msg)
800 
801 static void __slab_error(const char *function, struct kmem_cache *cachep,
802 			char *msg)
803 {
804 	printk(KERN_ERR "slab error in %s(): cache `%s': %s\n",
805 	       function, cachep->name, msg);
806 	dump_stack();
807 	add_taint(TAINT_BAD_PAGE);
808 }
809 #endif
810 
811 /*
812  * By default on NUMA we use alien caches to stage the freeing of
813  * objects allocated from other nodes. This causes massive memory
814  * inefficiencies when using fake NUMA setup to split memory into a
815  * large number of small nodes, so it can be disabled on the command
816  * line
817   */
818 
819 static int use_alien_caches __read_mostly = 1;
820 static int __init noaliencache_setup(char *s)
821 {
822 	use_alien_caches = 0;
823 	return 1;
824 }
825 __setup("noaliencache", noaliencache_setup);
826 
827 static int __init slab_max_order_setup(char *str)
828 {
829 	get_option(&str, &slab_max_order);
830 	slab_max_order = slab_max_order < 0 ? 0 :
831 				min(slab_max_order, MAX_ORDER - 1);
832 	slab_max_order_set = true;
833 
834 	return 1;
835 }
836 __setup("slab_max_order=", slab_max_order_setup);
837 
838 #ifdef CONFIG_NUMA
839 /*
840  * Special reaping functions for NUMA systems called from cache_reap().
841  * These take care of doing round robin flushing of alien caches (containing
842  * objects freed on different nodes from which they were allocated) and the
843  * flushing of remote pcps by calling drain_node_pages.
844  */
845 static DEFINE_PER_CPU(unsigned long, slab_reap_node);
846 
847 static void init_reap_node(int cpu)
848 {
849 	int node;
850 
851 	node = next_node(cpu_to_mem(cpu), node_online_map);
852 	if (node == MAX_NUMNODES)
853 		node = first_node(node_online_map);
854 
855 	per_cpu(slab_reap_node, cpu) = node;
856 }
857 
858 static void next_reap_node(void)
859 {
860 	int node = __this_cpu_read(slab_reap_node);
861 
862 	node = next_node(node, node_online_map);
863 	if (unlikely(node >= MAX_NUMNODES))
864 		node = first_node(node_online_map);
865 	__this_cpu_write(slab_reap_node, node);
866 }
867 
868 #else
869 #define init_reap_node(cpu) do { } while (0)
870 #define next_reap_node(void) do { } while (0)
871 #endif
872 
873 /*
874  * Initiate the reap timer running on the target CPU.  We run at around 1 to 2Hz
875  * via the workqueue/eventd.
876  * Add the CPU number into the expiration time to minimize the possibility of
877  * the CPUs getting into lockstep and contending for the global cache chain
878  * lock.
879  */
880 static void __cpuinit start_cpu_timer(int cpu)
881 {
882 	struct delayed_work *reap_work = &per_cpu(slab_reap_work, cpu);
883 
884 	/*
885 	 * When this gets called from do_initcalls via cpucache_init(),
886 	 * init_workqueues() has already run, so keventd will be setup
887 	 * at that time.
888 	 */
889 	if (keventd_up() && reap_work->work.func == NULL) {
890 		init_reap_node(cpu);
891 		INIT_DEFERRABLE_WORK(reap_work, cache_reap);
892 		schedule_delayed_work_on(cpu, reap_work,
893 					__round_jiffies_relative(HZ, cpu));
894 	}
895 }
896 
897 static struct array_cache *alloc_arraycache(int node, int entries,
898 					    int batchcount, gfp_t gfp)
899 {
900 	int memsize = sizeof(void *) * entries + sizeof(struct array_cache);
901 	struct array_cache *nc = NULL;
902 
903 	nc = kmalloc_node(memsize, gfp, node);
904 	/*
905 	 * The array_cache structures contain pointers to free object.
906 	 * However, when such objects are allocated or transferred to another
907 	 * cache the pointers are not cleared and they could be counted as
908 	 * valid references during a kmemleak scan. Therefore, kmemleak must
909 	 * not scan such objects.
910 	 */
911 	kmemleak_no_scan(nc);
912 	if (nc) {
913 		nc->avail = 0;
914 		nc->limit = entries;
915 		nc->batchcount = batchcount;
916 		nc->touched = 0;
917 		spin_lock_init(&nc->lock);
918 	}
919 	return nc;
920 }
921 
922 static inline bool is_slab_pfmemalloc(struct slab *slabp)
923 {
924 	struct page *page = virt_to_page(slabp->s_mem);
925 
926 	return PageSlabPfmemalloc(page);
927 }
928 
929 /* Clears pfmemalloc_active if no slabs have pfmalloc set */
930 static void recheck_pfmemalloc_active(struct kmem_cache *cachep,
931 						struct array_cache *ac)
932 {
933 	struct kmem_list3 *l3 = cachep->nodelists[numa_mem_id()];
934 	struct slab *slabp;
935 	unsigned long flags;
936 
937 	if (!pfmemalloc_active)
938 		return;
939 
940 	spin_lock_irqsave(&l3->list_lock, flags);
941 	list_for_each_entry(slabp, &l3->slabs_full, list)
942 		if (is_slab_pfmemalloc(slabp))
943 			goto out;
944 
945 	list_for_each_entry(slabp, &l3->slabs_partial, list)
946 		if (is_slab_pfmemalloc(slabp))
947 			goto out;
948 
949 	list_for_each_entry(slabp, &l3->slabs_free, list)
950 		if (is_slab_pfmemalloc(slabp))
951 			goto out;
952 
953 	pfmemalloc_active = false;
954 out:
955 	spin_unlock_irqrestore(&l3->list_lock, flags);
956 }
957 
958 static void *__ac_get_obj(struct kmem_cache *cachep, struct array_cache *ac,
959 						gfp_t flags, bool force_refill)
960 {
961 	int i;
962 	void *objp = ac->entry[--ac->avail];
963 
964 	/* Ensure the caller is allowed to use objects from PFMEMALLOC slab */
965 	if (unlikely(is_obj_pfmemalloc(objp))) {
966 		struct kmem_list3 *l3;
967 
968 		if (gfp_pfmemalloc_allowed(flags)) {
969 			clear_obj_pfmemalloc(&objp);
970 			return objp;
971 		}
972 
973 		/* The caller cannot use PFMEMALLOC objects, find another one */
974 		for (i = 0; i < ac->avail; i++) {
975 			/* If a !PFMEMALLOC object is found, swap them */
976 			if (!is_obj_pfmemalloc(ac->entry[i])) {
977 				objp = ac->entry[i];
978 				ac->entry[i] = ac->entry[ac->avail];
979 				ac->entry[ac->avail] = objp;
980 				return objp;
981 			}
982 		}
983 
984 		/*
985 		 * If there are empty slabs on the slabs_free list and we are
986 		 * being forced to refill the cache, mark this one !pfmemalloc.
987 		 */
988 		l3 = cachep->nodelists[numa_mem_id()];
989 		if (!list_empty(&l3->slabs_free) && force_refill) {
990 			struct slab *slabp = virt_to_slab(objp);
991 			ClearPageSlabPfmemalloc(virt_to_head_page(slabp->s_mem));
992 			clear_obj_pfmemalloc(&objp);
993 			recheck_pfmemalloc_active(cachep, ac);
994 			return objp;
995 		}
996 
997 		/* No !PFMEMALLOC objects available */
998 		ac->avail++;
999 		objp = NULL;
1000 	}
1001 
1002 	return objp;
1003 }
1004 
1005 static inline void *ac_get_obj(struct kmem_cache *cachep,
1006 			struct array_cache *ac, gfp_t flags, bool force_refill)
1007 {
1008 	void *objp;
1009 
1010 	if (unlikely(sk_memalloc_socks()))
1011 		objp = __ac_get_obj(cachep, ac, flags, force_refill);
1012 	else
1013 		objp = ac->entry[--ac->avail];
1014 
1015 	return objp;
1016 }
1017 
1018 static void *__ac_put_obj(struct kmem_cache *cachep, struct array_cache *ac,
1019 								void *objp)
1020 {
1021 	if (unlikely(pfmemalloc_active)) {
1022 		/* Some pfmemalloc slabs exist, check if this is one */
1023 		struct page *page = virt_to_head_page(objp);
1024 		if (PageSlabPfmemalloc(page))
1025 			set_obj_pfmemalloc(&objp);
1026 	}
1027 
1028 	return objp;
1029 }
1030 
1031 static inline void ac_put_obj(struct kmem_cache *cachep, struct array_cache *ac,
1032 								void *objp)
1033 {
1034 	if (unlikely(sk_memalloc_socks()))
1035 		objp = __ac_put_obj(cachep, ac, objp);
1036 
1037 	ac->entry[ac->avail++] = objp;
1038 }
1039 
1040 /*
1041  * Transfer objects in one arraycache to another.
1042  * Locking must be handled by the caller.
1043  *
1044  * Return the number of entries transferred.
1045  */
1046 static int transfer_objects(struct array_cache *to,
1047 		struct array_cache *from, unsigned int max)
1048 {
1049 	/* Figure out how many entries to transfer */
1050 	int nr = min3(from->avail, max, to->limit - to->avail);
1051 
1052 	if (!nr)
1053 		return 0;
1054 
1055 	memcpy(to->entry + to->avail, from->entry + from->avail -nr,
1056 			sizeof(void *) *nr);
1057 
1058 	from->avail -= nr;
1059 	to->avail += nr;
1060 	return nr;
1061 }
1062 
1063 #ifndef CONFIG_NUMA
1064 
1065 #define drain_alien_cache(cachep, alien) do { } while (0)
1066 #define reap_alien(cachep, l3) do { } while (0)
1067 
1068 static inline struct array_cache **alloc_alien_cache(int node, int limit, gfp_t gfp)
1069 {
1070 	return (struct array_cache **)BAD_ALIEN_MAGIC;
1071 }
1072 
1073 static inline void free_alien_cache(struct array_cache **ac_ptr)
1074 {
1075 }
1076 
1077 static inline int cache_free_alien(struct kmem_cache *cachep, void *objp)
1078 {
1079 	return 0;
1080 }
1081 
1082 static inline void *alternate_node_alloc(struct kmem_cache *cachep,
1083 		gfp_t flags)
1084 {
1085 	return NULL;
1086 }
1087 
1088 static inline void *____cache_alloc_node(struct kmem_cache *cachep,
1089 		 gfp_t flags, int nodeid)
1090 {
1091 	return NULL;
1092 }
1093 
1094 #else	/* CONFIG_NUMA */
1095 
1096 static void *____cache_alloc_node(struct kmem_cache *, gfp_t, int);
1097 static void *alternate_node_alloc(struct kmem_cache *, gfp_t);
1098 
1099 static struct array_cache **alloc_alien_cache(int node, int limit, gfp_t gfp)
1100 {
1101 	struct array_cache **ac_ptr;
1102 	int memsize = sizeof(void *) * nr_node_ids;
1103 	int i;
1104 
1105 	if (limit > 1)
1106 		limit = 12;
1107 	ac_ptr = kzalloc_node(memsize, gfp, node);
1108 	if (ac_ptr) {
1109 		for_each_node(i) {
1110 			if (i == node || !node_online(i))
1111 				continue;
1112 			ac_ptr[i] = alloc_arraycache(node, limit, 0xbaadf00d, gfp);
1113 			if (!ac_ptr[i]) {
1114 				for (i--; i >= 0; i--)
1115 					kfree(ac_ptr[i]);
1116 				kfree(ac_ptr);
1117 				return NULL;
1118 			}
1119 		}
1120 	}
1121 	return ac_ptr;
1122 }
1123 
1124 static void free_alien_cache(struct array_cache **ac_ptr)
1125 {
1126 	int i;
1127 
1128 	if (!ac_ptr)
1129 		return;
1130 	for_each_node(i)
1131 	    kfree(ac_ptr[i]);
1132 	kfree(ac_ptr);
1133 }
1134 
1135 static void __drain_alien_cache(struct kmem_cache *cachep,
1136 				struct array_cache *ac, int node)
1137 {
1138 	struct kmem_list3 *rl3 = cachep->nodelists[node];
1139 
1140 	if (ac->avail) {
1141 		spin_lock(&rl3->list_lock);
1142 		/*
1143 		 * Stuff objects into the remote nodes shared array first.
1144 		 * That way we could avoid the overhead of putting the objects
1145 		 * into the free lists and getting them back later.
1146 		 */
1147 		if (rl3->shared)
1148 			transfer_objects(rl3->shared, ac, ac->limit);
1149 
1150 		free_block(cachep, ac->entry, ac->avail, node);
1151 		ac->avail = 0;
1152 		spin_unlock(&rl3->list_lock);
1153 	}
1154 }
1155 
1156 /*
1157  * Called from cache_reap() to regularly drain alien caches round robin.
1158  */
1159 static void reap_alien(struct kmem_cache *cachep, struct kmem_list3 *l3)
1160 {
1161 	int node = __this_cpu_read(slab_reap_node);
1162 
1163 	if (l3->alien) {
1164 		struct array_cache *ac = l3->alien[node];
1165 
1166 		if (ac && ac->avail && spin_trylock_irq(&ac->lock)) {
1167 			__drain_alien_cache(cachep, ac, node);
1168 			spin_unlock_irq(&ac->lock);
1169 		}
1170 	}
1171 }
1172 
1173 static void drain_alien_cache(struct kmem_cache *cachep,
1174 				struct array_cache **alien)
1175 {
1176 	int i = 0;
1177 	struct array_cache *ac;
1178 	unsigned long flags;
1179 
1180 	for_each_online_node(i) {
1181 		ac = alien[i];
1182 		if (ac) {
1183 			spin_lock_irqsave(&ac->lock, flags);
1184 			__drain_alien_cache(cachep, ac, i);
1185 			spin_unlock_irqrestore(&ac->lock, flags);
1186 		}
1187 	}
1188 }
1189 
1190 static inline int cache_free_alien(struct kmem_cache *cachep, void *objp)
1191 {
1192 	struct slab *slabp = virt_to_slab(objp);
1193 	int nodeid = slabp->nodeid;
1194 	struct kmem_list3 *l3;
1195 	struct array_cache *alien = NULL;
1196 	int node;
1197 
1198 	node = numa_mem_id();
1199 
1200 	/*
1201 	 * Make sure we are not freeing a object from another node to the array
1202 	 * cache on this cpu.
1203 	 */
1204 	if (likely(slabp->nodeid == node))
1205 		return 0;
1206 
1207 	l3 = cachep->nodelists[node];
1208 	STATS_INC_NODEFREES(cachep);
1209 	if (l3->alien && l3->alien[nodeid]) {
1210 		alien = l3->alien[nodeid];
1211 		spin_lock(&alien->lock);
1212 		if (unlikely(alien->avail == alien->limit)) {
1213 			STATS_INC_ACOVERFLOW(cachep);
1214 			__drain_alien_cache(cachep, alien, nodeid);
1215 		}
1216 		ac_put_obj(cachep, alien, objp);
1217 		spin_unlock(&alien->lock);
1218 	} else {
1219 		spin_lock(&(cachep->nodelists[nodeid])->list_lock);
1220 		free_block(cachep, &objp, 1, nodeid);
1221 		spin_unlock(&(cachep->nodelists[nodeid])->list_lock);
1222 	}
1223 	return 1;
1224 }
1225 #endif
1226 
1227 /*
1228  * Allocates and initializes nodelists for a node on each slab cache, used for
1229  * either memory or cpu hotplug.  If memory is being hot-added, the kmem_list3
1230  * will be allocated off-node since memory is not yet online for the new node.
1231  * When hotplugging memory or a cpu, existing nodelists are not replaced if
1232  * already in use.
1233  *
1234  * Must hold slab_mutex.
1235  */
1236 static int init_cache_nodelists_node(int node)
1237 {
1238 	struct kmem_cache *cachep;
1239 	struct kmem_list3 *l3;
1240 	const int memsize = sizeof(struct kmem_list3);
1241 
1242 	list_for_each_entry(cachep, &slab_caches, list) {
1243 		/*
1244 		 * Set up the size64 kmemlist for cpu before we can
1245 		 * begin anything. Make sure some other cpu on this
1246 		 * node has not already allocated this
1247 		 */
1248 		if (!cachep->nodelists[node]) {
1249 			l3 = kmalloc_node(memsize, GFP_KERNEL, node);
1250 			if (!l3)
1251 				return -ENOMEM;
1252 			kmem_list3_init(l3);
1253 			l3->next_reap = jiffies + REAPTIMEOUT_LIST3 +
1254 			    ((unsigned long)cachep) % REAPTIMEOUT_LIST3;
1255 
1256 			/*
1257 			 * The l3s don't come and go as CPUs come and
1258 			 * go.  slab_mutex is sufficient
1259 			 * protection here.
1260 			 */
1261 			cachep->nodelists[node] = l3;
1262 		}
1263 
1264 		spin_lock_irq(&cachep->nodelists[node]->list_lock);
1265 		cachep->nodelists[node]->free_limit =
1266 			(1 + nr_cpus_node(node)) *
1267 			cachep->batchcount + cachep->num;
1268 		spin_unlock_irq(&cachep->nodelists[node]->list_lock);
1269 	}
1270 	return 0;
1271 }
1272 
1273 static void __cpuinit cpuup_canceled(long cpu)
1274 {
1275 	struct kmem_cache *cachep;
1276 	struct kmem_list3 *l3 = NULL;
1277 	int node = cpu_to_mem(cpu);
1278 	const struct cpumask *mask = cpumask_of_node(node);
1279 
1280 	list_for_each_entry(cachep, &slab_caches, list) {
1281 		struct array_cache *nc;
1282 		struct array_cache *shared;
1283 		struct array_cache **alien;
1284 
1285 		/* cpu is dead; no one can alloc from it. */
1286 		nc = cachep->array[cpu];
1287 		cachep->array[cpu] = NULL;
1288 		l3 = cachep->nodelists[node];
1289 
1290 		if (!l3)
1291 			goto free_array_cache;
1292 
1293 		spin_lock_irq(&l3->list_lock);
1294 
1295 		/* Free limit for this kmem_list3 */
1296 		l3->free_limit -= cachep->batchcount;
1297 		if (nc)
1298 			free_block(cachep, nc->entry, nc->avail, node);
1299 
1300 		if (!cpumask_empty(mask)) {
1301 			spin_unlock_irq(&l3->list_lock);
1302 			goto free_array_cache;
1303 		}
1304 
1305 		shared = l3->shared;
1306 		if (shared) {
1307 			free_block(cachep, shared->entry,
1308 				   shared->avail, node);
1309 			l3->shared = NULL;
1310 		}
1311 
1312 		alien = l3->alien;
1313 		l3->alien = NULL;
1314 
1315 		spin_unlock_irq(&l3->list_lock);
1316 
1317 		kfree(shared);
1318 		if (alien) {
1319 			drain_alien_cache(cachep, alien);
1320 			free_alien_cache(alien);
1321 		}
1322 free_array_cache:
1323 		kfree(nc);
1324 	}
1325 	/*
1326 	 * In the previous loop, all the objects were freed to
1327 	 * the respective cache's slabs,  now we can go ahead and
1328 	 * shrink each nodelist to its limit.
1329 	 */
1330 	list_for_each_entry(cachep, &slab_caches, list) {
1331 		l3 = cachep->nodelists[node];
1332 		if (!l3)
1333 			continue;
1334 		drain_freelist(cachep, l3, l3->free_objects);
1335 	}
1336 }
1337 
1338 static int __cpuinit cpuup_prepare(long cpu)
1339 {
1340 	struct kmem_cache *cachep;
1341 	struct kmem_list3 *l3 = NULL;
1342 	int node = cpu_to_mem(cpu);
1343 	int err;
1344 
1345 	/*
1346 	 * We need to do this right in the beginning since
1347 	 * alloc_arraycache's are going to use this list.
1348 	 * kmalloc_node allows us to add the slab to the right
1349 	 * kmem_list3 and not this cpu's kmem_list3
1350 	 */
1351 	err = init_cache_nodelists_node(node);
1352 	if (err < 0)
1353 		goto bad;
1354 
1355 	/*
1356 	 * Now we can go ahead with allocating the shared arrays and
1357 	 * array caches
1358 	 */
1359 	list_for_each_entry(cachep, &slab_caches, list) {
1360 		struct array_cache *nc;
1361 		struct array_cache *shared = NULL;
1362 		struct array_cache **alien = NULL;
1363 
1364 		nc = alloc_arraycache(node, cachep->limit,
1365 					cachep->batchcount, GFP_KERNEL);
1366 		if (!nc)
1367 			goto bad;
1368 		if (cachep->shared) {
1369 			shared = alloc_arraycache(node,
1370 				cachep->shared * cachep->batchcount,
1371 				0xbaadf00d, GFP_KERNEL);
1372 			if (!shared) {
1373 				kfree(nc);
1374 				goto bad;
1375 			}
1376 		}
1377 		if (use_alien_caches) {
1378 			alien = alloc_alien_cache(node, cachep->limit, GFP_KERNEL);
1379 			if (!alien) {
1380 				kfree(shared);
1381 				kfree(nc);
1382 				goto bad;
1383 			}
1384 		}
1385 		cachep->array[cpu] = nc;
1386 		l3 = cachep->nodelists[node];
1387 		BUG_ON(!l3);
1388 
1389 		spin_lock_irq(&l3->list_lock);
1390 		if (!l3->shared) {
1391 			/*
1392 			 * We are serialised from CPU_DEAD or
1393 			 * CPU_UP_CANCELLED by the cpucontrol lock
1394 			 */
1395 			l3->shared = shared;
1396 			shared = NULL;
1397 		}
1398 #ifdef CONFIG_NUMA
1399 		if (!l3->alien) {
1400 			l3->alien = alien;
1401 			alien = NULL;
1402 		}
1403 #endif
1404 		spin_unlock_irq(&l3->list_lock);
1405 		kfree(shared);
1406 		free_alien_cache(alien);
1407 		if (cachep->flags & SLAB_DEBUG_OBJECTS)
1408 			slab_set_debugobj_lock_classes_node(cachep, node);
1409 	}
1410 	init_node_lock_keys(node);
1411 
1412 	return 0;
1413 bad:
1414 	cpuup_canceled(cpu);
1415 	return -ENOMEM;
1416 }
1417 
1418 static int __cpuinit cpuup_callback(struct notifier_block *nfb,
1419 				    unsigned long action, void *hcpu)
1420 {
1421 	long cpu = (long)hcpu;
1422 	int err = 0;
1423 
1424 	switch (action) {
1425 	case CPU_UP_PREPARE:
1426 	case CPU_UP_PREPARE_FROZEN:
1427 		mutex_lock(&slab_mutex);
1428 		err = cpuup_prepare(cpu);
1429 		mutex_unlock(&slab_mutex);
1430 		break;
1431 	case CPU_ONLINE:
1432 	case CPU_ONLINE_FROZEN:
1433 		start_cpu_timer(cpu);
1434 		break;
1435 #ifdef CONFIG_HOTPLUG_CPU
1436   	case CPU_DOWN_PREPARE:
1437   	case CPU_DOWN_PREPARE_FROZEN:
1438 		/*
1439 		 * Shutdown cache reaper. Note that the slab_mutex is
1440 		 * held so that if cache_reap() is invoked it cannot do
1441 		 * anything expensive but will only modify reap_work
1442 		 * and reschedule the timer.
1443 		*/
1444 		cancel_delayed_work_sync(&per_cpu(slab_reap_work, cpu));
1445 		/* Now the cache_reaper is guaranteed to be not running. */
1446 		per_cpu(slab_reap_work, cpu).work.func = NULL;
1447   		break;
1448   	case CPU_DOWN_FAILED:
1449   	case CPU_DOWN_FAILED_FROZEN:
1450 		start_cpu_timer(cpu);
1451   		break;
1452 	case CPU_DEAD:
1453 	case CPU_DEAD_FROZEN:
1454 		/*
1455 		 * Even if all the cpus of a node are down, we don't free the
1456 		 * kmem_list3 of any cache. This to avoid a race between
1457 		 * cpu_down, and a kmalloc allocation from another cpu for
1458 		 * memory from the node of the cpu going down.  The list3
1459 		 * structure is usually allocated from kmem_cache_create() and
1460 		 * gets destroyed at kmem_cache_destroy().
1461 		 */
1462 		/* fall through */
1463 #endif
1464 	case CPU_UP_CANCELED:
1465 	case CPU_UP_CANCELED_FROZEN:
1466 		mutex_lock(&slab_mutex);
1467 		cpuup_canceled(cpu);
1468 		mutex_unlock(&slab_mutex);
1469 		break;
1470 	}
1471 	return notifier_from_errno(err);
1472 }
1473 
1474 static struct notifier_block __cpuinitdata cpucache_notifier = {
1475 	&cpuup_callback, NULL, 0
1476 };
1477 
1478 #if defined(CONFIG_NUMA) && defined(CONFIG_MEMORY_HOTPLUG)
1479 /*
1480  * Drains freelist for a node on each slab cache, used for memory hot-remove.
1481  * Returns -EBUSY if all objects cannot be drained so that the node is not
1482  * removed.
1483  *
1484  * Must hold slab_mutex.
1485  */
1486 static int __meminit drain_cache_nodelists_node(int node)
1487 {
1488 	struct kmem_cache *cachep;
1489 	int ret = 0;
1490 
1491 	list_for_each_entry(cachep, &slab_caches, list) {
1492 		struct kmem_list3 *l3;
1493 
1494 		l3 = cachep->nodelists[node];
1495 		if (!l3)
1496 			continue;
1497 
1498 		drain_freelist(cachep, l3, l3->free_objects);
1499 
1500 		if (!list_empty(&l3->slabs_full) ||
1501 		    !list_empty(&l3->slabs_partial)) {
1502 			ret = -EBUSY;
1503 			break;
1504 		}
1505 	}
1506 	return ret;
1507 }
1508 
1509 static int __meminit slab_memory_callback(struct notifier_block *self,
1510 					unsigned long action, void *arg)
1511 {
1512 	struct memory_notify *mnb = arg;
1513 	int ret = 0;
1514 	int nid;
1515 
1516 	nid = mnb->status_change_nid;
1517 	if (nid < 0)
1518 		goto out;
1519 
1520 	switch (action) {
1521 	case MEM_GOING_ONLINE:
1522 		mutex_lock(&slab_mutex);
1523 		ret = init_cache_nodelists_node(nid);
1524 		mutex_unlock(&slab_mutex);
1525 		break;
1526 	case MEM_GOING_OFFLINE:
1527 		mutex_lock(&slab_mutex);
1528 		ret = drain_cache_nodelists_node(nid);
1529 		mutex_unlock(&slab_mutex);
1530 		break;
1531 	case MEM_ONLINE:
1532 	case MEM_OFFLINE:
1533 	case MEM_CANCEL_ONLINE:
1534 	case MEM_CANCEL_OFFLINE:
1535 		break;
1536 	}
1537 out:
1538 	return notifier_from_errno(ret);
1539 }
1540 #endif /* CONFIG_NUMA && CONFIG_MEMORY_HOTPLUG */
1541 
1542 /*
1543  * swap the static kmem_list3 with kmalloced memory
1544  */
1545 static void __init init_list(struct kmem_cache *cachep, struct kmem_list3 *list,
1546 				int nodeid)
1547 {
1548 	struct kmem_list3 *ptr;
1549 
1550 	ptr = kmalloc_node(sizeof(struct kmem_list3), GFP_NOWAIT, nodeid);
1551 	BUG_ON(!ptr);
1552 
1553 	memcpy(ptr, list, sizeof(struct kmem_list3));
1554 	/*
1555 	 * Do not assume that spinlocks can be initialized via memcpy:
1556 	 */
1557 	spin_lock_init(&ptr->list_lock);
1558 
1559 	MAKE_ALL_LISTS(cachep, ptr, nodeid);
1560 	cachep->nodelists[nodeid] = ptr;
1561 }
1562 
1563 /*
1564  * For setting up all the kmem_list3s for cache whose buffer_size is same as
1565  * size of kmem_list3.
1566  */
1567 static void __init set_up_list3s(struct kmem_cache *cachep, int index)
1568 {
1569 	int node;
1570 
1571 	for_each_online_node(node) {
1572 		cachep->nodelists[node] = &initkmem_list3[index + node];
1573 		cachep->nodelists[node]->next_reap = jiffies +
1574 		    REAPTIMEOUT_LIST3 +
1575 		    ((unsigned long)cachep) % REAPTIMEOUT_LIST3;
1576 	}
1577 }
1578 
1579 /*
1580  * Initialisation.  Called after the page allocator have been initialised and
1581  * before smp_init().
1582  */
1583 void __init kmem_cache_init(void)
1584 {
1585 	size_t left_over;
1586 	struct cache_sizes *sizes;
1587 	struct cache_names *names;
1588 	int i;
1589 	int order;
1590 	int node;
1591 
1592 	kmem_cache = &kmem_cache_boot;
1593 
1594 	if (num_possible_nodes() == 1)
1595 		use_alien_caches = 0;
1596 
1597 	for (i = 0; i < NUM_INIT_LISTS; i++) {
1598 		kmem_list3_init(&initkmem_list3[i]);
1599 		if (i < MAX_NUMNODES)
1600 			kmem_cache->nodelists[i] = NULL;
1601 	}
1602 	set_up_list3s(kmem_cache, CACHE_CACHE);
1603 
1604 	/*
1605 	 * Fragmentation resistance on low memory - only use bigger
1606 	 * page orders on machines with more than 32MB of memory if
1607 	 * not overridden on the command line.
1608 	 */
1609 	if (!slab_max_order_set && totalram_pages > (32 << 20) >> PAGE_SHIFT)
1610 		slab_max_order = SLAB_MAX_ORDER_HI;
1611 
1612 	/* Bootstrap is tricky, because several objects are allocated
1613 	 * from caches that do not exist yet:
1614 	 * 1) initialize the kmem_cache cache: it contains the struct
1615 	 *    kmem_cache structures of all caches, except kmem_cache itself:
1616 	 *    kmem_cache is statically allocated.
1617 	 *    Initially an __init data area is used for the head array and the
1618 	 *    kmem_list3 structures, it's replaced with a kmalloc allocated
1619 	 *    array at the end of the bootstrap.
1620 	 * 2) Create the first kmalloc cache.
1621 	 *    The struct kmem_cache for the new cache is allocated normally.
1622 	 *    An __init data area is used for the head array.
1623 	 * 3) Create the remaining kmalloc caches, with minimally sized
1624 	 *    head arrays.
1625 	 * 4) Replace the __init data head arrays for kmem_cache and the first
1626 	 *    kmalloc cache with kmalloc allocated arrays.
1627 	 * 5) Replace the __init data for kmem_list3 for kmem_cache and
1628 	 *    the other cache's with kmalloc allocated memory.
1629 	 * 6) Resize the head arrays of the kmalloc caches to their final sizes.
1630 	 */
1631 
1632 	node = numa_mem_id();
1633 
1634 	/* 1) create the kmem_cache */
1635 	INIT_LIST_HEAD(&slab_caches);
1636 	list_add(&kmem_cache->list, &slab_caches);
1637 	kmem_cache->colour_off = cache_line_size();
1638 	kmem_cache->array[smp_processor_id()] = &initarray_cache.cache;
1639 	kmem_cache->nodelists[node] = &initkmem_list3[CACHE_CACHE + node];
1640 
1641 	/*
1642 	 * struct kmem_cache size depends on nr_node_ids & nr_cpu_ids
1643 	 */
1644 	kmem_cache->size = offsetof(struct kmem_cache, array[nr_cpu_ids]) +
1645 				  nr_node_ids * sizeof(struct kmem_list3 *);
1646 	kmem_cache->object_size = kmem_cache->size;
1647 	kmem_cache->size = ALIGN(kmem_cache->object_size,
1648 					cache_line_size());
1649 	kmem_cache->reciprocal_buffer_size =
1650 		reciprocal_value(kmem_cache->size);
1651 
1652 	for (order = 0; order < MAX_ORDER; order++) {
1653 		cache_estimate(order, kmem_cache->size,
1654 			cache_line_size(), 0, &left_over, &kmem_cache->num);
1655 		if (kmem_cache->num)
1656 			break;
1657 	}
1658 	BUG_ON(!kmem_cache->num);
1659 	kmem_cache->gfporder = order;
1660 	kmem_cache->colour = left_over / kmem_cache->colour_off;
1661 	kmem_cache->slab_size = ALIGN(kmem_cache->num * sizeof(kmem_bufctl_t) +
1662 				      sizeof(struct slab), cache_line_size());
1663 
1664 	/* 2+3) create the kmalloc caches */
1665 	sizes = malloc_sizes;
1666 	names = cache_names;
1667 
1668 	/*
1669 	 * Initialize the caches that provide memory for the array cache and the
1670 	 * kmem_list3 structures first.  Without this, further allocations will
1671 	 * bug.
1672 	 */
1673 
1674 	sizes[INDEX_AC].cs_cachep = kmem_cache_zalloc(kmem_cache, GFP_NOWAIT);
1675 	sizes[INDEX_AC].cs_cachep->name = names[INDEX_AC].name;
1676 	sizes[INDEX_AC].cs_cachep->size = sizes[INDEX_AC].cs_size;
1677 	sizes[INDEX_AC].cs_cachep->object_size = sizes[INDEX_AC].cs_size;
1678 	sizes[INDEX_AC].cs_cachep->align = ARCH_KMALLOC_MINALIGN;
1679 	__kmem_cache_create(sizes[INDEX_AC].cs_cachep, ARCH_KMALLOC_FLAGS|SLAB_PANIC);
1680 	list_add(&sizes[INDEX_AC].cs_cachep->list, &slab_caches);
1681 
1682 	if (INDEX_AC != INDEX_L3) {
1683 		sizes[INDEX_L3].cs_cachep = kmem_cache_zalloc(kmem_cache, GFP_NOWAIT);
1684 		sizes[INDEX_L3].cs_cachep->name = names[INDEX_L3].name;
1685 		sizes[INDEX_L3].cs_cachep->size = sizes[INDEX_L3].cs_size;
1686 		sizes[INDEX_L3].cs_cachep->object_size = sizes[INDEX_L3].cs_size;
1687 		sizes[INDEX_L3].cs_cachep->align = ARCH_KMALLOC_MINALIGN;
1688 		__kmem_cache_create(sizes[INDEX_L3].cs_cachep, ARCH_KMALLOC_FLAGS|SLAB_PANIC);
1689 		list_add(&sizes[INDEX_L3].cs_cachep->list, &slab_caches);
1690 	}
1691 
1692 	slab_early_init = 0;
1693 
1694 	while (sizes->cs_size != ULONG_MAX) {
1695 		/*
1696 		 * For performance, all the general caches are L1 aligned.
1697 		 * This should be particularly beneficial on SMP boxes, as it
1698 		 * eliminates "false sharing".
1699 		 * Note for systems short on memory removing the alignment will
1700 		 * allow tighter packing of the smaller caches.
1701 		 */
1702 		if (!sizes->cs_cachep) {
1703 			sizes->cs_cachep = kmem_cache_zalloc(kmem_cache, GFP_NOWAIT);
1704 			sizes->cs_cachep->name = names->name;
1705 			sizes->cs_cachep->size = sizes->cs_size;
1706 			sizes->cs_cachep->object_size = sizes->cs_size;
1707 			sizes->cs_cachep->align = ARCH_KMALLOC_MINALIGN;
1708 			__kmem_cache_create(sizes->cs_cachep, ARCH_KMALLOC_FLAGS|SLAB_PANIC);
1709 			list_add(&sizes->cs_cachep->list, &slab_caches);
1710 		}
1711 #ifdef CONFIG_ZONE_DMA
1712 		sizes->cs_dmacachep = kmem_cache_zalloc(kmem_cache, GFP_NOWAIT);
1713 		sizes->cs_dmacachep->name = names->name_dma;
1714 		sizes->cs_dmacachep->size = sizes->cs_size;
1715 		sizes->cs_dmacachep->object_size = sizes->cs_size;
1716 		sizes->cs_dmacachep->align = ARCH_KMALLOC_MINALIGN;
1717 		__kmem_cache_create(sizes->cs_dmacachep,
1718 			       ARCH_KMALLOC_FLAGS|SLAB_CACHE_DMA| SLAB_PANIC);
1719 		list_add(&sizes->cs_dmacachep->list, &slab_caches);
1720 #endif
1721 		sizes++;
1722 		names++;
1723 	}
1724 	/* 4) Replace the bootstrap head arrays */
1725 	{
1726 		struct array_cache *ptr;
1727 
1728 		ptr = kmalloc(sizeof(struct arraycache_init), GFP_NOWAIT);
1729 
1730 		BUG_ON(cpu_cache_get(kmem_cache) != &initarray_cache.cache);
1731 		memcpy(ptr, cpu_cache_get(kmem_cache),
1732 		       sizeof(struct arraycache_init));
1733 		/*
1734 		 * Do not assume that spinlocks can be initialized via memcpy:
1735 		 */
1736 		spin_lock_init(&ptr->lock);
1737 
1738 		kmem_cache->array[smp_processor_id()] = ptr;
1739 
1740 		ptr = kmalloc(sizeof(struct arraycache_init), GFP_NOWAIT);
1741 
1742 		BUG_ON(cpu_cache_get(malloc_sizes[INDEX_AC].cs_cachep)
1743 		       != &initarray_generic.cache);
1744 		memcpy(ptr, cpu_cache_get(malloc_sizes[INDEX_AC].cs_cachep),
1745 		       sizeof(struct arraycache_init));
1746 		/*
1747 		 * Do not assume that spinlocks can be initialized via memcpy:
1748 		 */
1749 		spin_lock_init(&ptr->lock);
1750 
1751 		malloc_sizes[INDEX_AC].cs_cachep->array[smp_processor_id()] =
1752 		    ptr;
1753 	}
1754 	/* 5) Replace the bootstrap kmem_list3's */
1755 	{
1756 		int nid;
1757 
1758 		for_each_online_node(nid) {
1759 			init_list(kmem_cache, &initkmem_list3[CACHE_CACHE + nid], nid);
1760 
1761 			init_list(malloc_sizes[INDEX_AC].cs_cachep,
1762 				  &initkmem_list3[SIZE_AC + nid], nid);
1763 
1764 			if (INDEX_AC != INDEX_L3) {
1765 				init_list(malloc_sizes[INDEX_L3].cs_cachep,
1766 					  &initkmem_list3[SIZE_L3 + nid], nid);
1767 			}
1768 		}
1769 	}
1770 
1771 	slab_state = UP;
1772 }
1773 
1774 void __init kmem_cache_init_late(void)
1775 {
1776 	struct kmem_cache *cachep;
1777 
1778 	slab_state = UP;
1779 
1780 	/* 6) resize the head arrays to their final sizes */
1781 	mutex_lock(&slab_mutex);
1782 	list_for_each_entry(cachep, &slab_caches, list)
1783 		if (enable_cpucache(cachep, GFP_NOWAIT))
1784 			BUG();
1785 	mutex_unlock(&slab_mutex);
1786 
1787 	/* Annotate slab for lockdep -- annotate the malloc caches */
1788 	init_lock_keys();
1789 
1790 	/* Done! */
1791 	slab_state = FULL;
1792 
1793 	/*
1794 	 * Register a cpu startup notifier callback that initializes
1795 	 * cpu_cache_get for all new cpus
1796 	 */
1797 	register_cpu_notifier(&cpucache_notifier);
1798 
1799 #ifdef CONFIG_NUMA
1800 	/*
1801 	 * Register a memory hotplug callback that initializes and frees
1802 	 * nodelists.
1803 	 */
1804 	hotplug_memory_notifier(slab_memory_callback, SLAB_CALLBACK_PRI);
1805 #endif
1806 
1807 	/*
1808 	 * The reap timers are started later, with a module init call: That part
1809 	 * of the kernel is not yet operational.
1810 	 */
1811 }
1812 
1813 static int __init cpucache_init(void)
1814 {
1815 	int cpu;
1816 
1817 	/*
1818 	 * Register the timers that return unneeded pages to the page allocator
1819 	 */
1820 	for_each_online_cpu(cpu)
1821 		start_cpu_timer(cpu);
1822 
1823 	/* Done! */
1824 	slab_state = FULL;
1825 	return 0;
1826 }
1827 __initcall(cpucache_init);
1828 
1829 static noinline void
1830 slab_out_of_memory(struct kmem_cache *cachep, gfp_t gfpflags, int nodeid)
1831 {
1832 	struct kmem_list3 *l3;
1833 	struct slab *slabp;
1834 	unsigned long flags;
1835 	int node;
1836 
1837 	printk(KERN_WARNING
1838 		"SLAB: Unable to allocate memory on node %d (gfp=0x%x)\n",
1839 		nodeid, gfpflags);
1840 	printk(KERN_WARNING "  cache: %s, object size: %d, order: %d\n",
1841 		cachep->name, cachep->size, cachep->gfporder);
1842 
1843 	for_each_online_node(node) {
1844 		unsigned long active_objs = 0, num_objs = 0, free_objects = 0;
1845 		unsigned long active_slabs = 0, num_slabs = 0;
1846 
1847 		l3 = cachep->nodelists[node];
1848 		if (!l3)
1849 			continue;
1850 
1851 		spin_lock_irqsave(&l3->list_lock, flags);
1852 		list_for_each_entry(slabp, &l3->slabs_full, list) {
1853 			active_objs += cachep->num;
1854 			active_slabs++;
1855 		}
1856 		list_for_each_entry(slabp, &l3->slabs_partial, list) {
1857 			active_objs += slabp->inuse;
1858 			active_slabs++;
1859 		}
1860 		list_for_each_entry(slabp, &l3->slabs_free, list)
1861 			num_slabs++;
1862 
1863 		free_objects += l3->free_objects;
1864 		spin_unlock_irqrestore(&l3->list_lock, flags);
1865 
1866 		num_slabs += active_slabs;
1867 		num_objs = num_slabs * cachep->num;
1868 		printk(KERN_WARNING
1869 			"  node %d: slabs: %ld/%ld, objs: %ld/%ld, free: %ld\n",
1870 			node, active_slabs, num_slabs, active_objs, num_objs,
1871 			free_objects);
1872 	}
1873 }
1874 
1875 /*
1876  * Interface to system's page allocator. No need to hold the cache-lock.
1877  *
1878  * If we requested dmaable memory, we will get it. Even if we
1879  * did not request dmaable memory, we might get it, but that
1880  * would be relatively rare and ignorable.
1881  */
1882 static void *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, int nodeid)
1883 {
1884 	struct page *page;
1885 	int nr_pages;
1886 	int i;
1887 
1888 #ifndef CONFIG_MMU
1889 	/*
1890 	 * Nommu uses slab's for process anonymous memory allocations, and thus
1891 	 * requires __GFP_COMP to properly refcount higher order allocations
1892 	 */
1893 	flags |= __GFP_COMP;
1894 #endif
1895 
1896 	flags |= cachep->allocflags;
1897 	if (cachep->flags & SLAB_RECLAIM_ACCOUNT)
1898 		flags |= __GFP_RECLAIMABLE;
1899 
1900 	page = alloc_pages_exact_node(nodeid, flags | __GFP_NOTRACK, cachep->gfporder);
1901 	if (!page) {
1902 		if (!(flags & __GFP_NOWARN) && printk_ratelimit())
1903 			slab_out_of_memory(cachep, flags, nodeid);
1904 		return NULL;
1905 	}
1906 
1907 	/* Record if ALLOC_NO_WATERMARKS was set when allocating the slab */
1908 	if (unlikely(page->pfmemalloc))
1909 		pfmemalloc_active = true;
1910 
1911 	nr_pages = (1 << cachep->gfporder);
1912 	if (cachep->flags & SLAB_RECLAIM_ACCOUNT)
1913 		add_zone_page_state(page_zone(page),
1914 			NR_SLAB_RECLAIMABLE, nr_pages);
1915 	else
1916 		add_zone_page_state(page_zone(page),
1917 			NR_SLAB_UNRECLAIMABLE, nr_pages);
1918 	for (i = 0; i < nr_pages; i++) {
1919 		__SetPageSlab(page + i);
1920 
1921 		if (page->pfmemalloc)
1922 			SetPageSlabPfmemalloc(page + i);
1923 	}
1924 
1925 	if (kmemcheck_enabled && !(cachep->flags & SLAB_NOTRACK)) {
1926 		kmemcheck_alloc_shadow(page, cachep->gfporder, flags, nodeid);
1927 
1928 		if (cachep->ctor)
1929 			kmemcheck_mark_uninitialized_pages(page, nr_pages);
1930 		else
1931 			kmemcheck_mark_unallocated_pages(page, nr_pages);
1932 	}
1933 
1934 	return page_address(page);
1935 }
1936 
1937 /*
1938  * Interface to system's page release.
1939  */
1940 static void kmem_freepages(struct kmem_cache *cachep, void *addr)
1941 {
1942 	unsigned long i = (1 << cachep->gfporder);
1943 	struct page *page = virt_to_page(addr);
1944 	const unsigned long nr_freed = i;
1945 
1946 	kmemcheck_free_shadow(page, cachep->gfporder);
1947 
1948 	if (cachep->flags & SLAB_RECLAIM_ACCOUNT)
1949 		sub_zone_page_state(page_zone(page),
1950 				NR_SLAB_RECLAIMABLE, nr_freed);
1951 	else
1952 		sub_zone_page_state(page_zone(page),
1953 				NR_SLAB_UNRECLAIMABLE, nr_freed);
1954 	while (i--) {
1955 		BUG_ON(!PageSlab(page));
1956 		__ClearPageSlabPfmemalloc(page);
1957 		__ClearPageSlab(page);
1958 		page++;
1959 	}
1960 	if (current->reclaim_state)
1961 		current->reclaim_state->reclaimed_slab += nr_freed;
1962 	free_pages((unsigned long)addr, cachep->gfporder);
1963 }
1964 
1965 static void kmem_rcu_free(struct rcu_head *head)
1966 {
1967 	struct slab_rcu *slab_rcu = (struct slab_rcu *)head;
1968 	struct kmem_cache *cachep = slab_rcu->cachep;
1969 
1970 	kmem_freepages(cachep, slab_rcu->addr);
1971 	if (OFF_SLAB(cachep))
1972 		kmem_cache_free(cachep->slabp_cache, slab_rcu);
1973 }
1974 
1975 #if DEBUG
1976 
1977 #ifdef CONFIG_DEBUG_PAGEALLOC
1978 static void store_stackinfo(struct kmem_cache *cachep, unsigned long *addr,
1979 			    unsigned long caller)
1980 {
1981 	int size = cachep->object_size;
1982 
1983 	addr = (unsigned long *)&((char *)addr)[obj_offset(cachep)];
1984 
1985 	if (size < 5 * sizeof(unsigned long))
1986 		return;
1987 
1988 	*addr++ = 0x12345678;
1989 	*addr++ = caller;
1990 	*addr++ = smp_processor_id();
1991 	size -= 3 * sizeof(unsigned long);
1992 	{
1993 		unsigned long *sptr = &caller;
1994 		unsigned long svalue;
1995 
1996 		while (!kstack_end(sptr)) {
1997 			svalue = *sptr++;
1998 			if (kernel_text_address(svalue)) {
1999 				*addr++ = svalue;
2000 				size -= sizeof(unsigned long);
2001 				if (size <= sizeof(unsigned long))
2002 					break;
2003 			}
2004 		}
2005 
2006 	}
2007 	*addr++ = 0x87654321;
2008 }
2009 #endif
2010 
2011 static void poison_obj(struct kmem_cache *cachep, void *addr, unsigned char val)
2012 {
2013 	int size = cachep->object_size;
2014 	addr = &((char *)addr)[obj_offset(cachep)];
2015 
2016 	memset(addr, val, size);
2017 	*(unsigned char *)(addr + size - 1) = POISON_END;
2018 }
2019 
2020 static void dump_line(char *data, int offset, int limit)
2021 {
2022 	int i;
2023 	unsigned char error = 0;
2024 	int bad_count = 0;
2025 
2026 	printk(KERN_ERR "%03x: ", offset);
2027 	for (i = 0; i < limit; i++) {
2028 		if (data[offset + i] != POISON_FREE) {
2029 			error = data[offset + i];
2030 			bad_count++;
2031 		}
2032 	}
2033 	print_hex_dump(KERN_CONT, "", 0, 16, 1,
2034 			&data[offset], limit, 1);
2035 
2036 	if (bad_count == 1) {
2037 		error ^= POISON_FREE;
2038 		if (!(error & (error - 1))) {
2039 			printk(KERN_ERR "Single bit error detected. Probably "
2040 					"bad RAM.\n");
2041 #ifdef CONFIG_X86
2042 			printk(KERN_ERR "Run memtest86+ or a similar memory "
2043 					"test tool.\n");
2044 #else
2045 			printk(KERN_ERR "Run a memory test tool.\n");
2046 #endif
2047 		}
2048 	}
2049 }
2050 #endif
2051 
2052 #if DEBUG
2053 
2054 static void print_objinfo(struct kmem_cache *cachep, void *objp, int lines)
2055 {
2056 	int i, size;
2057 	char *realobj;
2058 
2059 	if (cachep->flags & SLAB_RED_ZONE) {
2060 		printk(KERN_ERR "Redzone: 0x%llx/0x%llx.\n",
2061 			*dbg_redzone1(cachep, objp),
2062 			*dbg_redzone2(cachep, objp));
2063 	}
2064 
2065 	if (cachep->flags & SLAB_STORE_USER) {
2066 		printk(KERN_ERR "Last user: [<%p>]",
2067 			*dbg_userword(cachep, objp));
2068 		print_symbol("(%s)",
2069 				(unsigned long)*dbg_userword(cachep, objp));
2070 		printk("\n");
2071 	}
2072 	realobj = (char *)objp + obj_offset(cachep);
2073 	size = cachep->object_size;
2074 	for (i = 0; i < size && lines; i += 16, lines--) {
2075 		int limit;
2076 		limit = 16;
2077 		if (i + limit > size)
2078 			limit = size - i;
2079 		dump_line(realobj, i, limit);
2080 	}
2081 }
2082 
2083 static void check_poison_obj(struct kmem_cache *cachep, void *objp)
2084 {
2085 	char *realobj;
2086 	int size, i;
2087 	int lines = 0;
2088 
2089 	realobj = (char *)objp + obj_offset(cachep);
2090 	size = cachep->object_size;
2091 
2092 	for (i = 0; i < size; i++) {
2093 		char exp = POISON_FREE;
2094 		if (i == size - 1)
2095 			exp = POISON_END;
2096 		if (realobj[i] != exp) {
2097 			int limit;
2098 			/* Mismatch ! */
2099 			/* Print header */
2100 			if (lines == 0) {
2101 				printk(KERN_ERR
2102 					"Slab corruption (%s): %s start=%p, len=%d\n",
2103 					print_tainted(), cachep->name, realobj, size);
2104 				print_objinfo(cachep, objp, 0);
2105 			}
2106 			/* Hexdump the affected line */
2107 			i = (i / 16) * 16;
2108 			limit = 16;
2109 			if (i + limit > size)
2110 				limit = size - i;
2111 			dump_line(realobj, i, limit);
2112 			i += 16;
2113 			lines++;
2114 			/* Limit to 5 lines */
2115 			if (lines > 5)
2116 				break;
2117 		}
2118 	}
2119 	if (lines != 0) {
2120 		/* Print some data about the neighboring objects, if they
2121 		 * exist:
2122 		 */
2123 		struct slab *slabp = virt_to_slab(objp);
2124 		unsigned int objnr;
2125 
2126 		objnr = obj_to_index(cachep, slabp, objp);
2127 		if (objnr) {
2128 			objp = index_to_obj(cachep, slabp, objnr - 1);
2129 			realobj = (char *)objp + obj_offset(cachep);
2130 			printk(KERN_ERR "Prev obj: start=%p, len=%d\n",
2131 			       realobj, size);
2132 			print_objinfo(cachep, objp, 2);
2133 		}
2134 		if (objnr + 1 < cachep->num) {
2135 			objp = index_to_obj(cachep, slabp, objnr + 1);
2136 			realobj = (char *)objp + obj_offset(cachep);
2137 			printk(KERN_ERR "Next obj: start=%p, len=%d\n",
2138 			       realobj, size);
2139 			print_objinfo(cachep, objp, 2);
2140 		}
2141 	}
2142 }
2143 #endif
2144 
2145 #if DEBUG
2146 static void slab_destroy_debugcheck(struct kmem_cache *cachep, struct slab *slabp)
2147 {
2148 	int i;
2149 	for (i = 0; i < cachep->num; i++) {
2150 		void *objp = index_to_obj(cachep, slabp, i);
2151 
2152 		if (cachep->flags & SLAB_POISON) {
2153 #ifdef CONFIG_DEBUG_PAGEALLOC
2154 			if (cachep->size % PAGE_SIZE == 0 &&
2155 					OFF_SLAB(cachep))
2156 				kernel_map_pages(virt_to_page(objp),
2157 					cachep->size / PAGE_SIZE, 1);
2158 			else
2159 				check_poison_obj(cachep, objp);
2160 #else
2161 			check_poison_obj(cachep, objp);
2162 #endif
2163 		}
2164 		if (cachep->flags & SLAB_RED_ZONE) {
2165 			if (*dbg_redzone1(cachep, objp) != RED_INACTIVE)
2166 				slab_error(cachep, "start of a freed object "
2167 					   "was overwritten");
2168 			if (*dbg_redzone2(cachep, objp) != RED_INACTIVE)
2169 				slab_error(cachep, "end of a freed object "
2170 					   "was overwritten");
2171 		}
2172 	}
2173 }
2174 #else
2175 static void slab_destroy_debugcheck(struct kmem_cache *cachep, struct slab *slabp)
2176 {
2177 }
2178 #endif
2179 
2180 /**
2181  * slab_destroy - destroy and release all objects in a slab
2182  * @cachep: cache pointer being destroyed
2183  * @slabp: slab pointer being destroyed
2184  *
2185  * Destroy all the objs in a slab, and release the mem back to the system.
2186  * Before calling the slab must have been unlinked from the cache.  The
2187  * cache-lock is not held/needed.
2188  */
2189 static void slab_destroy(struct kmem_cache *cachep, struct slab *slabp)
2190 {
2191 	void *addr = slabp->s_mem - slabp->colouroff;
2192 
2193 	slab_destroy_debugcheck(cachep, slabp);
2194 	if (unlikely(cachep->flags & SLAB_DESTROY_BY_RCU)) {
2195 		struct slab_rcu *slab_rcu;
2196 
2197 		slab_rcu = (struct slab_rcu *)slabp;
2198 		slab_rcu->cachep = cachep;
2199 		slab_rcu->addr = addr;
2200 		call_rcu(&slab_rcu->head, kmem_rcu_free);
2201 	} else {
2202 		kmem_freepages(cachep, addr);
2203 		if (OFF_SLAB(cachep))
2204 			kmem_cache_free(cachep->slabp_cache, slabp);
2205 	}
2206 }
2207 
2208 /**
2209  * calculate_slab_order - calculate size (page order) of slabs
2210  * @cachep: pointer to the cache that is being created
2211  * @size: size of objects to be created in this cache.
2212  * @align: required alignment for the objects.
2213  * @flags: slab allocation flags
2214  *
2215  * Also calculates the number of objects per slab.
2216  *
2217  * This could be made much more intelligent.  For now, try to avoid using
2218  * high order pages for slabs.  When the gfp() functions are more friendly
2219  * towards high-order requests, this should be changed.
2220  */
2221 static size_t calculate_slab_order(struct kmem_cache *cachep,
2222 			size_t size, size_t align, unsigned long flags)
2223 {
2224 	unsigned long offslab_limit;
2225 	size_t left_over = 0;
2226 	int gfporder;
2227 
2228 	for (gfporder = 0; gfporder <= KMALLOC_MAX_ORDER; gfporder++) {
2229 		unsigned int num;
2230 		size_t remainder;
2231 
2232 		cache_estimate(gfporder, size, align, flags, &remainder, &num);
2233 		if (!num)
2234 			continue;
2235 
2236 		if (flags & CFLGS_OFF_SLAB) {
2237 			/*
2238 			 * Max number of objs-per-slab for caches which
2239 			 * use off-slab slabs. Needed to avoid a possible
2240 			 * looping condition in cache_grow().
2241 			 */
2242 			offslab_limit = size - sizeof(struct slab);
2243 			offslab_limit /= sizeof(kmem_bufctl_t);
2244 
2245  			if (num > offslab_limit)
2246 				break;
2247 		}
2248 
2249 		/* Found something acceptable - save it away */
2250 		cachep->num = num;
2251 		cachep->gfporder = gfporder;
2252 		left_over = remainder;
2253 
2254 		/*
2255 		 * A VFS-reclaimable slab tends to have most allocations
2256 		 * as GFP_NOFS and we really don't want to have to be allocating
2257 		 * higher-order pages when we are unable to shrink dcache.
2258 		 */
2259 		if (flags & SLAB_RECLAIM_ACCOUNT)
2260 			break;
2261 
2262 		/*
2263 		 * Large number of objects is good, but very large slabs are
2264 		 * currently bad for the gfp()s.
2265 		 */
2266 		if (gfporder >= slab_max_order)
2267 			break;
2268 
2269 		/*
2270 		 * Acceptable internal fragmentation?
2271 		 */
2272 		if (left_over * 8 <= (PAGE_SIZE << gfporder))
2273 			break;
2274 	}
2275 	return left_over;
2276 }
2277 
2278 static int __init_refok setup_cpu_cache(struct kmem_cache *cachep, gfp_t gfp)
2279 {
2280 	if (slab_state >= FULL)
2281 		return enable_cpucache(cachep, gfp);
2282 
2283 	if (slab_state == DOWN) {
2284 		/*
2285 		 * Note: the first kmem_cache_create must create the cache
2286 		 * that's used by kmalloc(24), otherwise the creation of
2287 		 * further caches will BUG().
2288 		 */
2289 		cachep->array[smp_processor_id()] = &initarray_generic.cache;
2290 
2291 		/*
2292 		 * If the cache that's used by kmalloc(sizeof(kmem_list3)) is
2293 		 * the first cache, then we need to set up all its list3s,
2294 		 * otherwise the creation of further caches will BUG().
2295 		 */
2296 		set_up_list3s(cachep, SIZE_AC);
2297 		if (INDEX_AC == INDEX_L3)
2298 			slab_state = PARTIAL_L3;
2299 		else
2300 			slab_state = PARTIAL_ARRAYCACHE;
2301 	} else {
2302 		cachep->array[smp_processor_id()] =
2303 			kmalloc(sizeof(struct arraycache_init), gfp);
2304 
2305 		if (slab_state == PARTIAL_ARRAYCACHE) {
2306 			set_up_list3s(cachep, SIZE_L3);
2307 			slab_state = PARTIAL_L3;
2308 		} else {
2309 			int node;
2310 			for_each_online_node(node) {
2311 				cachep->nodelists[node] =
2312 				    kmalloc_node(sizeof(struct kmem_list3),
2313 						gfp, node);
2314 				BUG_ON(!cachep->nodelists[node]);
2315 				kmem_list3_init(cachep->nodelists[node]);
2316 			}
2317 		}
2318 	}
2319 	cachep->nodelists[numa_mem_id()]->next_reap =
2320 			jiffies + REAPTIMEOUT_LIST3 +
2321 			((unsigned long)cachep) % REAPTIMEOUT_LIST3;
2322 
2323 	cpu_cache_get(cachep)->avail = 0;
2324 	cpu_cache_get(cachep)->limit = BOOT_CPUCACHE_ENTRIES;
2325 	cpu_cache_get(cachep)->batchcount = 1;
2326 	cpu_cache_get(cachep)->touched = 0;
2327 	cachep->batchcount = 1;
2328 	cachep->limit = BOOT_CPUCACHE_ENTRIES;
2329 	return 0;
2330 }
2331 
2332 /**
2333  * __kmem_cache_create - Create a cache.
2334  * @name: A string which is used in /proc/slabinfo to identify this cache.
2335  * @size: The size of objects to be created in this cache.
2336  * @align: The required alignment for the objects.
2337  * @flags: SLAB flags
2338  * @ctor: A constructor for the objects.
2339  *
2340  * Returns a ptr to the cache on success, NULL on failure.
2341  * Cannot be called within a int, but can be interrupted.
2342  * The @ctor is run when new pages are allocated by the cache.
2343  *
2344  * The flags are
2345  *
2346  * %SLAB_POISON - Poison the slab with a known test pattern (a5a5a5a5)
2347  * to catch references to uninitialised memory.
2348  *
2349  * %SLAB_RED_ZONE - Insert `Red' zones around the allocated memory to check
2350  * for buffer overruns.
2351  *
2352  * %SLAB_HWCACHE_ALIGN - Align the objects in this cache to a hardware
2353  * cacheline.  This can be beneficial if you're counting cycles as closely
2354  * as davem.
2355  */
2356 int
2357 __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags)
2358 {
2359 	size_t left_over, slab_size, ralign;
2360 	gfp_t gfp;
2361 	int err;
2362 	size_t size = cachep->size;
2363 
2364 #if DEBUG
2365 #if FORCED_DEBUG
2366 	/*
2367 	 * Enable redzoning and last user accounting, except for caches with
2368 	 * large objects, if the increased size would increase the object size
2369 	 * above the next power of two: caches with object sizes just above a
2370 	 * power of two have a significant amount of internal fragmentation.
2371 	 */
2372 	if (size < 4096 || fls(size - 1) == fls(size-1 + REDZONE_ALIGN +
2373 						2 * sizeof(unsigned long long)))
2374 		flags |= SLAB_RED_ZONE | SLAB_STORE_USER;
2375 	if (!(flags & SLAB_DESTROY_BY_RCU))
2376 		flags |= SLAB_POISON;
2377 #endif
2378 	if (flags & SLAB_DESTROY_BY_RCU)
2379 		BUG_ON(flags & SLAB_POISON);
2380 #endif
2381 	/*
2382 	 * Always checks flags, a caller might be expecting debug support which
2383 	 * isn't available.
2384 	 */
2385 	BUG_ON(flags & ~CREATE_MASK);
2386 
2387 	/*
2388 	 * Check that size is in terms of words.  This is needed to avoid
2389 	 * unaligned accesses for some archs when redzoning is used, and makes
2390 	 * sure any on-slab bufctl's are also correctly aligned.
2391 	 */
2392 	if (size & (BYTES_PER_WORD - 1)) {
2393 		size += (BYTES_PER_WORD - 1);
2394 		size &= ~(BYTES_PER_WORD - 1);
2395 	}
2396 
2397 	/* calculate the final buffer alignment: */
2398 
2399 	/* 1) arch recommendation: can be overridden for debug */
2400 	if (flags & SLAB_HWCACHE_ALIGN) {
2401 		/*
2402 		 * Default alignment: as specified by the arch code.  Except if
2403 		 * an object is really small, then squeeze multiple objects into
2404 		 * one cacheline.
2405 		 */
2406 		ralign = cache_line_size();
2407 		while (size <= ralign / 2)
2408 			ralign /= 2;
2409 	} else {
2410 		ralign = BYTES_PER_WORD;
2411 	}
2412 
2413 	/*
2414 	 * Redzoning and user store require word alignment or possibly larger.
2415 	 * Note this will be overridden by architecture or caller mandated
2416 	 * alignment if either is greater than BYTES_PER_WORD.
2417 	 */
2418 	if (flags & SLAB_STORE_USER)
2419 		ralign = BYTES_PER_WORD;
2420 
2421 	if (flags & SLAB_RED_ZONE) {
2422 		ralign = REDZONE_ALIGN;
2423 		/* If redzoning, ensure that the second redzone is suitably
2424 		 * aligned, by adjusting the object size accordingly. */
2425 		size += REDZONE_ALIGN - 1;
2426 		size &= ~(REDZONE_ALIGN - 1);
2427 	}
2428 
2429 	/* 2) arch mandated alignment */
2430 	if (ralign < ARCH_SLAB_MINALIGN) {
2431 		ralign = ARCH_SLAB_MINALIGN;
2432 	}
2433 	/* 3) caller mandated alignment */
2434 	if (ralign < cachep->align) {
2435 		ralign = cachep->align;
2436 	}
2437 	/* disable debug if necessary */
2438 	if (ralign > __alignof__(unsigned long long))
2439 		flags &= ~(SLAB_RED_ZONE | SLAB_STORE_USER);
2440 	/*
2441 	 * 4) Store it.
2442 	 */
2443 	cachep->align = ralign;
2444 
2445 	if (slab_is_available())
2446 		gfp = GFP_KERNEL;
2447 	else
2448 		gfp = GFP_NOWAIT;
2449 
2450 	cachep->nodelists = (struct kmem_list3 **)&cachep->array[nr_cpu_ids];
2451 #if DEBUG
2452 
2453 	/*
2454 	 * Both debugging options require word-alignment which is calculated
2455 	 * into align above.
2456 	 */
2457 	if (flags & SLAB_RED_ZONE) {
2458 		/* add space for red zone words */
2459 		cachep->obj_offset += sizeof(unsigned long long);
2460 		size += 2 * sizeof(unsigned long long);
2461 	}
2462 	if (flags & SLAB_STORE_USER) {
2463 		/* user store requires one word storage behind the end of
2464 		 * the real object. But if the second red zone needs to be
2465 		 * aligned to 64 bits, we must allow that much space.
2466 		 */
2467 		if (flags & SLAB_RED_ZONE)
2468 			size += REDZONE_ALIGN;
2469 		else
2470 			size += BYTES_PER_WORD;
2471 	}
2472 #if FORCED_DEBUG && defined(CONFIG_DEBUG_PAGEALLOC)
2473 	if (size >= malloc_sizes[INDEX_L3 + 1].cs_size
2474 	    && cachep->object_size > cache_line_size()
2475 	    && ALIGN(size, cachep->align) < PAGE_SIZE) {
2476 		cachep->obj_offset += PAGE_SIZE - ALIGN(size, cachep->align);
2477 		size = PAGE_SIZE;
2478 	}
2479 #endif
2480 #endif
2481 
2482 	/*
2483 	 * Determine if the slab management is 'on' or 'off' slab.
2484 	 * (bootstrapping cannot cope with offslab caches so don't do
2485 	 * it too early on. Always use on-slab management when
2486 	 * SLAB_NOLEAKTRACE to avoid recursive calls into kmemleak)
2487 	 */
2488 	if ((size >= (PAGE_SIZE >> 3)) && !slab_early_init &&
2489 	    !(flags & SLAB_NOLEAKTRACE))
2490 		/*
2491 		 * Size is large, assume best to place the slab management obj
2492 		 * off-slab (should allow better packing of objs).
2493 		 */
2494 		flags |= CFLGS_OFF_SLAB;
2495 
2496 	size = ALIGN(size, cachep->align);
2497 
2498 	left_over = calculate_slab_order(cachep, size, cachep->align, flags);
2499 
2500 	if (!cachep->num)
2501 		return -E2BIG;
2502 
2503 	slab_size = ALIGN(cachep->num * sizeof(kmem_bufctl_t)
2504 			  + sizeof(struct slab), cachep->align);
2505 
2506 	/*
2507 	 * If the slab has been placed off-slab, and we have enough space then
2508 	 * move it on-slab. This is at the expense of any extra colouring.
2509 	 */
2510 	if (flags & CFLGS_OFF_SLAB && left_over >= slab_size) {
2511 		flags &= ~CFLGS_OFF_SLAB;
2512 		left_over -= slab_size;
2513 	}
2514 
2515 	if (flags & CFLGS_OFF_SLAB) {
2516 		/* really off slab. No need for manual alignment */
2517 		slab_size =
2518 		    cachep->num * sizeof(kmem_bufctl_t) + sizeof(struct slab);
2519 
2520 #ifdef CONFIG_PAGE_POISONING
2521 		/* If we're going to use the generic kernel_map_pages()
2522 		 * poisoning, then it's going to smash the contents of
2523 		 * the redzone and userword anyhow, so switch them off.
2524 		 */
2525 		if (size % PAGE_SIZE == 0 && flags & SLAB_POISON)
2526 			flags &= ~(SLAB_RED_ZONE | SLAB_STORE_USER);
2527 #endif
2528 	}
2529 
2530 	cachep->colour_off = cache_line_size();
2531 	/* Offset must be a multiple of the alignment. */
2532 	if (cachep->colour_off < cachep->align)
2533 		cachep->colour_off = cachep->align;
2534 	cachep->colour = left_over / cachep->colour_off;
2535 	cachep->slab_size = slab_size;
2536 	cachep->flags = flags;
2537 	cachep->allocflags = 0;
2538 	if (CONFIG_ZONE_DMA_FLAG && (flags & SLAB_CACHE_DMA))
2539 		cachep->allocflags |= GFP_DMA;
2540 	cachep->size = size;
2541 	cachep->reciprocal_buffer_size = reciprocal_value(size);
2542 
2543 	if (flags & CFLGS_OFF_SLAB) {
2544 		cachep->slabp_cache = kmem_find_general_cachep(slab_size, 0u);
2545 		/*
2546 		 * This is a possibility for one of the malloc_sizes caches.
2547 		 * But since we go off slab only for object size greater than
2548 		 * PAGE_SIZE/8, and malloc_sizes gets created in ascending order,
2549 		 * this should not happen at all.
2550 		 * But leave a BUG_ON for some lucky dude.
2551 		 */
2552 		BUG_ON(ZERO_OR_NULL_PTR(cachep->slabp_cache));
2553 	}
2554 
2555 	err = setup_cpu_cache(cachep, gfp);
2556 	if (err) {
2557 		__kmem_cache_shutdown(cachep);
2558 		return err;
2559 	}
2560 
2561 	if (flags & SLAB_DEBUG_OBJECTS) {
2562 		/*
2563 		 * Would deadlock through slab_destroy()->call_rcu()->
2564 		 * debug_object_activate()->kmem_cache_alloc().
2565 		 */
2566 		WARN_ON_ONCE(flags & SLAB_DESTROY_BY_RCU);
2567 
2568 		slab_set_debugobj_lock_classes(cachep);
2569 	}
2570 
2571 	return 0;
2572 }
2573 
2574 #if DEBUG
2575 static void check_irq_off(void)
2576 {
2577 	BUG_ON(!irqs_disabled());
2578 }
2579 
2580 static void check_irq_on(void)
2581 {
2582 	BUG_ON(irqs_disabled());
2583 }
2584 
2585 static void check_spinlock_acquired(struct kmem_cache *cachep)
2586 {
2587 #ifdef CONFIG_SMP
2588 	check_irq_off();
2589 	assert_spin_locked(&cachep->nodelists[numa_mem_id()]->list_lock);
2590 #endif
2591 }
2592 
2593 static void check_spinlock_acquired_node(struct kmem_cache *cachep, int node)
2594 {
2595 #ifdef CONFIG_SMP
2596 	check_irq_off();
2597 	assert_spin_locked(&cachep->nodelists[node]->list_lock);
2598 #endif
2599 }
2600 
2601 #else
2602 #define check_irq_off()	do { } while(0)
2603 #define check_irq_on()	do { } while(0)
2604 #define check_spinlock_acquired(x) do { } while(0)
2605 #define check_spinlock_acquired_node(x, y) do { } while(0)
2606 #endif
2607 
2608 static void drain_array(struct kmem_cache *cachep, struct kmem_list3 *l3,
2609 			struct array_cache *ac,
2610 			int force, int node);
2611 
2612 static void do_drain(void *arg)
2613 {
2614 	struct kmem_cache *cachep = arg;
2615 	struct array_cache *ac;
2616 	int node = numa_mem_id();
2617 
2618 	check_irq_off();
2619 	ac = cpu_cache_get(cachep);
2620 	spin_lock(&cachep->nodelists[node]->list_lock);
2621 	free_block(cachep, ac->entry, ac->avail, node);
2622 	spin_unlock(&cachep->nodelists[node]->list_lock);
2623 	ac->avail = 0;
2624 }
2625 
2626 static void drain_cpu_caches(struct kmem_cache *cachep)
2627 {
2628 	struct kmem_list3 *l3;
2629 	int node;
2630 
2631 	on_each_cpu(do_drain, cachep, 1);
2632 	check_irq_on();
2633 	for_each_online_node(node) {
2634 		l3 = cachep->nodelists[node];
2635 		if (l3 && l3->alien)
2636 			drain_alien_cache(cachep, l3->alien);
2637 	}
2638 
2639 	for_each_online_node(node) {
2640 		l3 = cachep->nodelists[node];
2641 		if (l3)
2642 			drain_array(cachep, l3, l3->shared, 1, node);
2643 	}
2644 }
2645 
2646 /*
2647  * Remove slabs from the list of free slabs.
2648  * Specify the number of slabs to drain in tofree.
2649  *
2650  * Returns the actual number of slabs released.
2651  */
2652 static int drain_freelist(struct kmem_cache *cache,
2653 			struct kmem_list3 *l3, int tofree)
2654 {
2655 	struct list_head *p;
2656 	int nr_freed;
2657 	struct slab *slabp;
2658 
2659 	nr_freed = 0;
2660 	while (nr_freed < tofree && !list_empty(&l3->slabs_free)) {
2661 
2662 		spin_lock_irq(&l3->list_lock);
2663 		p = l3->slabs_free.prev;
2664 		if (p == &l3->slabs_free) {
2665 			spin_unlock_irq(&l3->list_lock);
2666 			goto out;
2667 		}
2668 
2669 		slabp = list_entry(p, struct slab, list);
2670 #if DEBUG
2671 		BUG_ON(slabp->inuse);
2672 #endif
2673 		list_del(&slabp->list);
2674 		/*
2675 		 * Safe to drop the lock. The slab is no longer linked
2676 		 * to the cache.
2677 		 */
2678 		l3->free_objects -= cache->num;
2679 		spin_unlock_irq(&l3->list_lock);
2680 		slab_destroy(cache, slabp);
2681 		nr_freed++;
2682 	}
2683 out:
2684 	return nr_freed;
2685 }
2686 
2687 /* Called with slab_mutex held to protect against cpu hotplug */
2688 static int __cache_shrink(struct kmem_cache *cachep)
2689 {
2690 	int ret = 0, i = 0;
2691 	struct kmem_list3 *l3;
2692 
2693 	drain_cpu_caches(cachep);
2694 
2695 	check_irq_on();
2696 	for_each_online_node(i) {
2697 		l3 = cachep->nodelists[i];
2698 		if (!l3)
2699 			continue;
2700 
2701 		drain_freelist(cachep, l3, l3->free_objects);
2702 
2703 		ret += !list_empty(&l3->slabs_full) ||
2704 			!list_empty(&l3->slabs_partial);
2705 	}
2706 	return (ret ? 1 : 0);
2707 }
2708 
2709 /**
2710  * kmem_cache_shrink - Shrink a cache.
2711  * @cachep: The cache to shrink.
2712  *
2713  * Releases as many slabs as possible for a cache.
2714  * To help debugging, a zero exit status indicates all slabs were released.
2715  */
2716 int kmem_cache_shrink(struct kmem_cache *cachep)
2717 {
2718 	int ret;
2719 	BUG_ON(!cachep || in_interrupt());
2720 
2721 	get_online_cpus();
2722 	mutex_lock(&slab_mutex);
2723 	ret = __cache_shrink(cachep);
2724 	mutex_unlock(&slab_mutex);
2725 	put_online_cpus();
2726 	return ret;
2727 }
2728 EXPORT_SYMBOL(kmem_cache_shrink);
2729 
2730 int __kmem_cache_shutdown(struct kmem_cache *cachep)
2731 {
2732 	int i;
2733 	struct kmem_list3 *l3;
2734 	int rc = __cache_shrink(cachep);
2735 
2736 	if (rc)
2737 		return rc;
2738 
2739 	for_each_online_cpu(i)
2740 	    kfree(cachep->array[i]);
2741 
2742 	/* NUMA: free the list3 structures */
2743 	for_each_online_node(i) {
2744 		l3 = cachep->nodelists[i];
2745 		if (l3) {
2746 			kfree(l3->shared);
2747 			free_alien_cache(l3->alien);
2748 			kfree(l3);
2749 		}
2750 	}
2751 	return 0;
2752 }
2753 
2754 /*
2755  * Get the memory for a slab management obj.
2756  * For a slab cache when the slab descriptor is off-slab, slab descriptors
2757  * always come from malloc_sizes caches.  The slab descriptor cannot
2758  * come from the same cache which is getting created because,
2759  * when we are searching for an appropriate cache for these
2760  * descriptors in kmem_cache_create, we search through the malloc_sizes array.
2761  * If we are creating a malloc_sizes cache here it would not be visible to
2762  * kmem_find_general_cachep till the initialization is complete.
2763  * Hence we cannot have slabp_cache same as the original cache.
2764  */
2765 static struct slab *alloc_slabmgmt(struct kmem_cache *cachep, void *objp,
2766 				   int colour_off, gfp_t local_flags,
2767 				   int nodeid)
2768 {
2769 	struct slab *slabp;
2770 
2771 	if (OFF_SLAB(cachep)) {
2772 		/* Slab management obj is off-slab. */
2773 		slabp = kmem_cache_alloc_node(cachep->slabp_cache,
2774 					      local_flags, nodeid);
2775 		/*
2776 		 * If the first object in the slab is leaked (it's allocated
2777 		 * but no one has a reference to it), we want to make sure
2778 		 * kmemleak does not treat the ->s_mem pointer as a reference
2779 		 * to the object. Otherwise we will not report the leak.
2780 		 */
2781 		kmemleak_scan_area(&slabp->list, sizeof(struct list_head),
2782 				   local_flags);
2783 		if (!slabp)
2784 			return NULL;
2785 	} else {
2786 		slabp = objp + colour_off;
2787 		colour_off += cachep->slab_size;
2788 	}
2789 	slabp->inuse = 0;
2790 	slabp->colouroff = colour_off;
2791 	slabp->s_mem = objp + colour_off;
2792 	slabp->nodeid = nodeid;
2793 	slabp->free = 0;
2794 	return slabp;
2795 }
2796 
2797 static inline kmem_bufctl_t *slab_bufctl(struct slab *slabp)
2798 {
2799 	return (kmem_bufctl_t *) (slabp + 1);
2800 }
2801 
2802 static void cache_init_objs(struct kmem_cache *cachep,
2803 			    struct slab *slabp)
2804 {
2805 	int i;
2806 
2807 	for (i = 0; i < cachep->num; i++) {
2808 		void *objp = index_to_obj(cachep, slabp, i);
2809 #if DEBUG
2810 		/* need to poison the objs? */
2811 		if (cachep->flags & SLAB_POISON)
2812 			poison_obj(cachep, objp, POISON_FREE);
2813 		if (cachep->flags & SLAB_STORE_USER)
2814 			*dbg_userword(cachep, objp) = NULL;
2815 
2816 		if (cachep->flags & SLAB_RED_ZONE) {
2817 			*dbg_redzone1(cachep, objp) = RED_INACTIVE;
2818 			*dbg_redzone2(cachep, objp) = RED_INACTIVE;
2819 		}
2820 		/*
2821 		 * Constructors are not allowed to allocate memory from the same
2822 		 * cache which they are a constructor for.  Otherwise, deadlock.
2823 		 * They must also be threaded.
2824 		 */
2825 		if (cachep->ctor && !(cachep->flags & SLAB_POISON))
2826 			cachep->ctor(objp + obj_offset(cachep));
2827 
2828 		if (cachep->flags & SLAB_RED_ZONE) {
2829 			if (*dbg_redzone2(cachep, objp) != RED_INACTIVE)
2830 				slab_error(cachep, "constructor overwrote the"
2831 					   " end of an object");
2832 			if (*dbg_redzone1(cachep, objp) != RED_INACTIVE)
2833 				slab_error(cachep, "constructor overwrote the"
2834 					   " start of an object");
2835 		}
2836 		if ((cachep->size % PAGE_SIZE) == 0 &&
2837 			    OFF_SLAB(cachep) && cachep->flags & SLAB_POISON)
2838 			kernel_map_pages(virt_to_page(objp),
2839 					 cachep->size / PAGE_SIZE, 0);
2840 #else
2841 		if (cachep->ctor)
2842 			cachep->ctor(objp);
2843 #endif
2844 		slab_bufctl(slabp)[i] = i + 1;
2845 	}
2846 	slab_bufctl(slabp)[i - 1] = BUFCTL_END;
2847 }
2848 
2849 static void kmem_flagcheck(struct kmem_cache *cachep, gfp_t flags)
2850 {
2851 	if (CONFIG_ZONE_DMA_FLAG) {
2852 		if (flags & GFP_DMA)
2853 			BUG_ON(!(cachep->allocflags & GFP_DMA));
2854 		else
2855 			BUG_ON(cachep->allocflags & GFP_DMA);
2856 	}
2857 }
2858 
2859 static void *slab_get_obj(struct kmem_cache *cachep, struct slab *slabp,
2860 				int nodeid)
2861 {
2862 	void *objp = index_to_obj(cachep, slabp, slabp->free);
2863 	kmem_bufctl_t next;
2864 
2865 	slabp->inuse++;
2866 	next = slab_bufctl(slabp)[slabp->free];
2867 #if DEBUG
2868 	slab_bufctl(slabp)[slabp->free] = BUFCTL_FREE;
2869 	WARN_ON(slabp->nodeid != nodeid);
2870 #endif
2871 	slabp->free = next;
2872 
2873 	return objp;
2874 }
2875 
2876 static void slab_put_obj(struct kmem_cache *cachep, struct slab *slabp,
2877 				void *objp, int nodeid)
2878 {
2879 	unsigned int objnr = obj_to_index(cachep, slabp, objp);
2880 
2881 #if DEBUG
2882 	/* Verify that the slab belongs to the intended node */
2883 	WARN_ON(slabp->nodeid != nodeid);
2884 
2885 	if (slab_bufctl(slabp)[objnr] + 1 <= SLAB_LIMIT + 1) {
2886 		printk(KERN_ERR "slab: double free detected in cache "
2887 				"'%s', objp %p\n", cachep->name, objp);
2888 		BUG();
2889 	}
2890 #endif
2891 	slab_bufctl(slabp)[objnr] = slabp->free;
2892 	slabp->free = objnr;
2893 	slabp->inuse--;
2894 }
2895 
2896 /*
2897  * Map pages beginning at addr to the given cache and slab. This is required
2898  * for the slab allocator to be able to lookup the cache and slab of a
2899  * virtual address for kfree, ksize, and slab debugging.
2900  */
2901 static void slab_map_pages(struct kmem_cache *cache, struct slab *slab,
2902 			   void *addr)
2903 {
2904 	int nr_pages;
2905 	struct page *page;
2906 
2907 	page = virt_to_page(addr);
2908 
2909 	nr_pages = 1;
2910 	if (likely(!PageCompound(page)))
2911 		nr_pages <<= cache->gfporder;
2912 
2913 	do {
2914 		page->slab_cache = cache;
2915 		page->slab_page = slab;
2916 		page++;
2917 	} while (--nr_pages);
2918 }
2919 
2920 /*
2921  * Grow (by 1) the number of slabs within a cache.  This is called by
2922  * kmem_cache_alloc() when there are no active objs left in a cache.
2923  */
2924 static int cache_grow(struct kmem_cache *cachep,
2925 		gfp_t flags, int nodeid, void *objp)
2926 {
2927 	struct slab *slabp;
2928 	size_t offset;
2929 	gfp_t local_flags;
2930 	struct kmem_list3 *l3;
2931 
2932 	/*
2933 	 * Be lazy and only check for valid flags here,  keeping it out of the
2934 	 * critical path in kmem_cache_alloc().
2935 	 */
2936 	BUG_ON(flags & GFP_SLAB_BUG_MASK);
2937 	local_flags = flags & (GFP_CONSTRAINT_MASK|GFP_RECLAIM_MASK);
2938 
2939 	/* Take the l3 list lock to change the colour_next on this node */
2940 	check_irq_off();
2941 	l3 = cachep->nodelists[nodeid];
2942 	spin_lock(&l3->list_lock);
2943 
2944 	/* Get colour for the slab, and cal the next value. */
2945 	offset = l3->colour_next;
2946 	l3->colour_next++;
2947 	if (l3->colour_next >= cachep->colour)
2948 		l3->colour_next = 0;
2949 	spin_unlock(&l3->list_lock);
2950 
2951 	offset *= cachep->colour_off;
2952 
2953 	if (local_flags & __GFP_WAIT)
2954 		local_irq_enable();
2955 
2956 	/*
2957 	 * The test for missing atomic flag is performed here, rather than
2958 	 * the more obvious place, simply to reduce the critical path length
2959 	 * in kmem_cache_alloc(). If a caller is seriously mis-behaving they
2960 	 * will eventually be caught here (where it matters).
2961 	 */
2962 	kmem_flagcheck(cachep, flags);
2963 
2964 	/*
2965 	 * Get mem for the objs.  Attempt to allocate a physical page from
2966 	 * 'nodeid'.
2967 	 */
2968 	if (!objp)
2969 		objp = kmem_getpages(cachep, local_flags, nodeid);
2970 	if (!objp)
2971 		goto failed;
2972 
2973 	/* Get slab management. */
2974 	slabp = alloc_slabmgmt(cachep, objp, offset,
2975 			local_flags & ~GFP_CONSTRAINT_MASK, nodeid);
2976 	if (!slabp)
2977 		goto opps1;
2978 
2979 	slab_map_pages(cachep, slabp, objp);
2980 
2981 	cache_init_objs(cachep, slabp);
2982 
2983 	if (local_flags & __GFP_WAIT)
2984 		local_irq_disable();
2985 	check_irq_off();
2986 	spin_lock(&l3->list_lock);
2987 
2988 	/* Make slab active. */
2989 	list_add_tail(&slabp->list, &(l3->slabs_free));
2990 	STATS_INC_GROWN(cachep);
2991 	l3->free_objects += cachep->num;
2992 	spin_unlock(&l3->list_lock);
2993 	return 1;
2994 opps1:
2995 	kmem_freepages(cachep, objp);
2996 failed:
2997 	if (local_flags & __GFP_WAIT)
2998 		local_irq_disable();
2999 	return 0;
3000 }
3001 
3002 #if DEBUG
3003 
3004 /*
3005  * Perform extra freeing checks:
3006  * - detect bad pointers.
3007  * - POISON/RED_ZONE checking
3008  */
3009 static void kfree_debugcheck(const void *objp)
3010 {
3011 	if (!virt_addr_valid(objp)) {
3012 		printk(KERN_ERR "kfree_debugcheck: out of range ptr %lxh.\n",
3013 		       (unsigned long)objp);
3014 		BUG();
3015 	}
3016 }
3017 
3018 static inline void verify_redzone_free(struct kmem_cache *cache, void *obj)
3019 {
3020 	unsigned long long redzone1, redzone2;
3021 
3022 	redzone1 = *dbg_redzone1(cache, obj);
3023 	redzone2 = *dbg_redzone2(cache, obj);
3024 
3025 	/*
3026 	 * Redzone is ok.
3027 	 */
3028 	if (redzone1 == RED_ACTIVE && redzone2 == RED_ACTIVE)
3029 		return;
3030 
3031 	if (redzone1 == RED_INACTIVE && redzone2 == RED_INACTIVE)
3032 		slab_error(cache, "double free detected");
3033 	else
3034 		slab_error(cache, "memory outside object was overwritten");
3035 
3036 	printk(KERN_ERR "%p: redzone 1:0x%llx, redzone 2:0x%llx.\n",
3037 			obj, redzone1, redzone2);
3038 }
3039 
3040 static void *cache_free_debugcheck(struct kmem_cache *cachep, void *objp,
3041 				   unsigned long caller)
3042 {
3043 	struct page *page;
3044 	unsigned int objnr;
3045 	struct slab *slabp;
3046 
3047 	BUG_ON(virt_to_cache(objp) != cachep);
3048 
3049 	objp -= obj_offset(cachep);
3050 	kfree_debugcheck(objp);
3051 	page = virt_to_head_page(objp);
3052 
3053 	slabp = page->slab_page;
3054 
3055 	if (cachep->flags & SLAB_RED_ZONE) {
3056 		verify_redzone_free(cachep, objp);
3057 		*dbg_redzone1(cachep, objp) = RED_INACTIVE;
3058 		*dbg_redzone2(cachep, objp) = RED_INACTIVE;
3059 	}
3060 	if (cachep->flags & SLAB_STORE_USER)
3061 		*dbg_userword(cachep, objp) = (void *)caller;
3062 
3063 	objnr = obj_to_index(cachep, slabp, objp);
3064 
3065 	BUG_ON(objnr >= cachep->num);
3066 	BUG_ON(objp != index_to_obj(cachep, slabp, objnr));
3067 
3068 #ifdef CONFIG_DEBUG_SLAB_LEAK
3069 	slab_bufctl(slabp)[objnr] = BUFCTL_FREE;
3070 #endif
3071 	if (cachep->flags & SLAB_POISON) {
3072 #ifdef CONFIG_DEBUG_PAGEALLOC
3073 		if ((cachep->size % PAGE_SIZE)==0 && OFF_SLAB(cachep)) {
3074 			store_stackinfo(cachep, objp, caller);
3075 			kernel_map_pages(virt_to_page(objp),
3076 					 cachep->size / PAGE_SIZE, 0);
3077 		} else {
3078 			poison_obj(cachep, objp, POISON_FREE);
3079 		}
3080 #else
3081 		poison_obj(cachep, objp, POISON_FREE);
3082 #endif
3083 	}
3084 	return objp;
3085 }
3086 
3087 static void check_slabp(struct kmem_cache *cachep, struct slab *slabp)
3088 {
3089 	kmem_bufctl_t i;
3090 	int entries = 0;
3091 
3092 	/* Check slab's freelist to see if this obj is there. */
3093 	for (i = slabp->free; i != BUFCTL_END; i = slab_bufctl(slabp)[i]) {
3094 		entries++;
3095 		if (entries > cachep->num || i >= cachep->num)
3096 			goto bad;
3097 	}
3098 	if (entries != cachep->num - slabp->inuse) {
3099 bad:
3100 		printk(KERN_ERR "slab: Internal list corruption detected in "
3101 			"cache '%s'(%d), slabp %p(%d). Tainted(%s). Hexdump:\n",
3102 			cachep->name, cachep->num, slabp, slabp->inuse,
3103 			print_tainted());
3104 		print_hex_dump(KERN_ERR, "", DUMP_PREFIX_OFFSET, 16, 1, slabp,
3105 			sizeof(*slabp) + cachep->num * sizeof(kmem_bufctl_t),
3106 			1);
3107 		BUG();
3108 	}
3109 }
3110 #else
3111 #define kfree_debugcheck(x) do { } while(0)
3112 #define cache_free_debugcheck(x,objp,z) (objp)
3113 #define check_slabp(x,y) do { } while(0)
3114 #endif
3115 
3116 static void *cache_alloc_refill(struct kmem_cache *cachep, gfp_t flags,
3117 							bool force_refill)
3118 {
3119 	int batchcount;
3120 	struct kmem_list3 *l3;
3121 	struct array_cache *ac;
3122 	int node;
3123 
3124 	check_irq_off();
3125 	node = numa_mem_id();
3126 	if (unlikely(force_refill))
3127 		goto force_grow;
3128 retry:
3129 	ac = cpu_cache_get(cachep);
3130 	batchcount = ac->batchcount;
3131 	if (!ac->touched && batchcount > BATCHREFILL_LIMIT) {
3132 		/*
3133 		 * If there was little recent activity on this cache, then
3134 		 * perform only a partial refill.  Otherwise we could generate
3135 		 * refill bouncing.
3136 		 */
3137 		batchcount = BATCHREFILL_LIMIT;
3138 	}
3139 	l3 = cachep->nodelists[node];
3140 
3141 	BUG_ON(ac->avail > 0 || !l3);
3142 	spin_lock(&l3->list_lock);
3143 
3144 	/* See if we can refill from the shared array */
3145 	if (l3->shared && transfer_objects(ac, l3->shared, batchcount)) {
3146 		l3->shared->touched = 1;
3147 		goto alloc_done;
3148 	}
3149 
3150 	while (batchcount > 0) {
3151 		struct list_head *entry;
3152 		struct slab *slabp;
3153 		/* Get slab alloc is to come from. */
3154 		entry = l3->slabs_partial.next;
3155 		if (entry == &l3->slabs_partial) {
3156 			l3->free_touched = 1;
3157 			entry = l3->slabs_free.next;
3158 			if (entry == &l3->slabs_free)
3159 				goto must_grow;
3160 		}
3161 
3162 		slabp = list_entry(entry, struct slab, list);
3163 		check_slabp(cachep, slabp);
3164 		check_spinlock_acquired(cachep);
3165 
3166 		/*
3167 		 * The slab was either on partial or free list so
3168 		 * there must be at least one object available for
3169 		 * allocation.
3170 		 */
3171 		BUG_ON(slabp->inuse >= cachep->num);
3172 
3173 		while (slabp->inuse < cachep->num && batchcount--) {
3174 			STATS_INC_ALLOCED(cachep);
3175 			STATS_INC_ACTIVE(cachep);
3176 			STATS_SET_HIGH(cachep);
3177 
3178 			ac_put_obj(cachep, ac, slab_get_obj(cachep, slabp,
3179 									node));
3180 		}
3181 		check_slabp(cachep, slabp);
3182 
3183 		/* move slabp to correct slabp list: */
3184 		list_del(&slabp->list);
3185 		if (slabp->free == BUFCTL_END)
3186 			list_add(&slabp->list, &l3->slabs_full);
3187 		else
3188 			list_add(&slabp->list, &l3->slabs_partial);
3189 	}
3190 
3191 must_grow:
3192 	l3->free_objects -= ac->avail;
3193 alloc_done:
3194 	spin_unlock(&l3->list_lock);
3195 
3196 	if (unlikely(!ac->avail)) {
3197 		int x;
3198 force_grow:
3199 		x = cache_grow(cachep, flags | GFP_THISNODE, node, NULL);
3200 
3201 		/* cache_grow can reenable interrupts, then ac could change. */
3202 		ac = cpu_cache_get(cachep);
3203 		node = numa_mem_id();
3204 
3205 		/* no objects in sight? abort */
3206 		if (!x && (ac->avail == 0 || force_refill))
3207 			return NULL;
3208 
3209 		if (!ac->avail)		/* objects refilled by interrupt? */
3210 			goto retry;
3211 	}
3212 	ac->touched = 1;
3213 
3214 	return ac_get_obj(cachep, ac, flags, force_refill);
3215 }
3216 
3217 static inline void cache_alloc_debugcheck_before(struct kmem_cache *cachep,
3218 						gfp_t flags)
3219 {
3220 	might_sleep_if(flags & __GFP_WAIT);
3221 #if DEBUG
3222 	kmem_flagcheck(cachep, flags);
3223 #endif
3224 }
3225 
3226 #if DEBUG
3227 static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep,
3228 				gfp_t flags, void *objp, unsigned long caller)
3229 {
3230 	if (!objp)
3231 		return objp;
3232 	if (cachep->flags & SLAB_POISON) {
3233 #ifdef CONFIG_DEBUG_PAGEALLOC
3234 		if ((cachep->size % PAGE_SIZE) == 0 && OFF_SLAB(cachep))
3235 			kernel_map_pages(virt_to_page(objp),
3236 					 cachep->size / PAGE_SIZE, 1);
3237 		else
3238 			check_poison_obj(cachep, objp);
3239 #else
3240 		check_poison_obj(cachep, objp);
3241 #endif
3242 		poison_obj(cachep, objp, POISON_INUSE);
3243 	}
3244 	if (cachep->flags & SLAB_STORE_USER)
3245 		*dbg_userword(cachep, objp) = (void *)caller;
3246 
3247 	if (cachep->flags & SLAB_RED_ZONE) {
3248 		if (*dbg_redzone1(cachep, objp) != RED_INACTIVE ||
3249 				*dbg_redzone2(cachep, objp) != RED_INACTIVE) {
3250 			slab_error(cachep, "double free, or memory outside"
3251 						" object was overwritten");
3252 			printk(KERN_ERR
3253 				"%p: redzone 1:0x%llx, redzone 2:0x%llx\n",
3254 				objp, *dbg_redzone1(cachep, objp),
3255 				*dbg_redzone2(cachep, objp));
3256 		}
3257 		*dbg_redzone1(cachep, objp) = RED_ACTIVE;
3258 		*dbg_redzone2(cachep, objp) = RED_ACTIVE;
3259 	}
3260 #ifdef CONFIG_DEBUG_SLAB_LEAK
3261 	{
3262 		struct slab *slabp;
3263 		unsigned objnr;
3264 
3265 		slabp = virt_to_head_page(objp)->slab_page;
3266 		objnr = (unsigned)(objp - slabp->s_mem) / cachep->size;
3267 		slab_bufctl(slabp)[objnr] = BUFCTL_ACTIVE;
3268 	}
3269 #endif
3270 	objp += obj_offset(cachep);
3271 	if (cachep->ctor && cachep->flags & SLAB_POISON)
3272 		cachep->ctor(objp);
3273 	if (ARCH_SLAB_MINALIGN &&
3274 	    ((unsigned long)objp & (ARCH_SLAB_MINALIGN-1))) {
3275 		printk(KERN_ERR "0x%p: not aligned to ARCH_SLAB_MINALIGN=%d\n",
3276 		       objp, (int)ARCH_SLAB_MINALIGN);
3277 	}
3278 	return objp;
3279 }
3280 #else
3281 #define cache_alloc_debugcheck_after(a,b,objp,d) (objp)
3282 #endif
3283 
3284 static bool slab_should_failslab(struct kmem_cache *cachep, gfp_t flags)
3285 {
3286 	if (cachep == kmem_cache)
3287 		return false;
3288 
3289 	return should_failslab(cachep->object_size, flags, cachep->flags);
3290 }
3291 
3292 static inline void *____cache_alloc(struct kmem_cache *cachep, gfp_t flags)
3293 {
3294 	void *objp;
3295 	struct array_cache *ac;
3296 	bool force_refill = false;
3297 
3298 	check_irq_off();
3299 
3300 	ac = cpu_cache_get(cachep);
3301 	if (likely(ac->avail)) {
3302 		ac->touched = 1;
3303 		objp = ac_get_obj(cachep, ac, flags, false);
3304 
3305 		/*
3306 		 * Allow for the possibility all avail objects are not allowed
3307 		 * by the current flags
3308 		 */
3309 		if (objp) {
3310 			STATS_INC_ALLOCHIT(cachep);
3311 			goto out;
3312 		}
3313 		force_refill = true;
3314 	}
3315 
3316 	STATS_INC_ALLOCMISS(cachep);
3317 	objp = cache_alloc_refill(cachep, flags, force_refill);
3318 	/*
3319 	 * the 'ac' may be updated by cache_alloc_refill(),
3320 	 * and kmemleak_erase() requires its correct value.
3321 	 */
3322 	ac = cpu_cache_get(cachep);
3323 
3324 out:
3325 	/*
3326 	 * To avoid a false negative, if an object that is in one of the
3327 	 * per-CPU caches is leaked, we need to make sure kmemleak doesn't
3328 	 * treat the array pointers as a reference to the object.
3329 	 */
3330 	if (objp)
3331 		kmemleak_erase(&ac->entry[ac->avail]);
3332 	return objp;
3333 }
3334 
3335 #ifdef CONFIG_NUMA
3336 /*
3337  * Try allocating on another node if PF_SPREAD_SLAB|PF_MEMPOLICY.
3338  *
3339  * If we are in_interrupt, then process context, including cpusets and
3340  * mempolicy, may not apply and should not be used for allocation policy.
3341  */
3342 static void *alternate_node_alloc(struct kmem_cache *cachep, gfp_t flags)
3343 {
3344 	int nid_alloc, nid_here;
3345 
3346 	if (in_interrupt() || (flags & __GFP_THISNODE))
3347 		return NULL;
3348 	nid_alloc = nid_here = numa_mem_id();
3349 	if (cpuset_do_slab_mem_spread() && (cachep->flags & SLAB_MEM_SPREAD))
3350 		nid_alloc = cpuset_slab_spread_node();
3351 	else if (current->mempolicy)
3352 		nid_alloc = slab_node();
3353 	if (nid_alloc != nid_here)
3354 		return ____cache_alloc_node(cachep, flags, nid_alloc);
3355 	return NULL;
3356 }
3357 
3358 /*
3359  * Fallback function if there was no memory available and no objects on a
3360  * certain node and fall back is permitted. First we scan all the
3361  * available nodelists for available objects. If that fails then we
3362  * perform an allocation without specifying a node. This allows the page
3363  * allocator to do its reclaim / fallback magic. We then insert the
3364  * slab into the proper nodelist and then allocate from it.
3365  */
3366 static void *fallback_alloc(struct kmem_cache *cache, gfp_t flags)
3367 {
3368 	struct zonelist *zonelist;
3369 	gfp_t local_flags;
3370 	struct zoneref *z;
3371 	struct zone *zone;
3372 	enum zone_type high_zoneidx = gfp_zone(flags);
3373 	void *obj = NULL;
3374 	int nid;
3375 	unsigned int cpuset_mems_cookie;
3376 
3377 	if (flags & __GFP_THISNODE)
3378 		return NULL;
3379 
3380 	local_flags = flags & (GFP_CONSTRAINT_MASK|GFP_RECLAIM_MASK);
3381 
3382 retry_cpuset:
3383 	cpuset_mems_cookie = get_mems_allowed();
3384 	zonelist = node_zonelist(slab_node(), flags);
3385 
3386 retry:
3387 	/*
3388 	 * Look through allowed nodes for objects available
3389 	 * from existing per node queues.
3390 	 */
3391 	for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) {
3392 		nid = zone_to_nid(zone);
3393 
3394 		if (cpuset_zone_allowed_hardwall(zone, flags) &&
3395 			cache->nodelists[nid] &&
3396 			cache->nodelists[nid]->free_objects) {
3397 				obj = ____cache_alloc_node(cache,
3398 					flags | GFP_THISNODE, nid);
3399 				if (obj)
3400 					break;
3401 		}
3402 	}
3403 
3404 	if (!obj) {
3405 		/*
3406 		 * This allocation will be performed within the constraints
3407 		 * of the current cpuset / memory policy requirements.
3408 		 * We may trigger various forms of reclaim on the allowed
3409 		 * set and go into memory reserves if necessary.
3410 		 */
3411 		if (local_flags & __GFP_WAIT)
3412 			local_irq_enable();
3413 		kmem_flagcheck(cache, flags);
3414 		obj = kmem_getpages(cache, local_flags, numa_mem_id());
3415 		if (local_flags & __GFP_WAIT)
3416 			local_irq_disable();
3417 		if (obj) {
3418 			/*
3419 			 * Insert into the appropriate per node queues
3420 			 */
3421 			nid = page_to_nid(virt_to_page(obj));
3422 			if (cache_grow(cache, flags, nid, obj)) {
3423 				obj = ____cache_alloc_node(cache,
3424 					flags | GFP_THISNODE, nid);
3425 				if (!obj)
3426 					/*
3427 					 * Another processor may allocate the
3428 					 * objects in the slab since we are
3429 					 * not holding any locks.
3430 					 */
3431 					goto retry;
3432 			} else {
3433 				/* cache_grow already freed obj */
3434 				obj = NULL;
3435 			}
3436 		}
3437 	}
3438 
3439 	if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !obj))
3440 		goto retry_cpuset;
3441 	return obj;
3442 }
3443 
3444 /*
3445  * A interface to enable slab creation on nodeid
3446  */
3447 static void *____cache_alloc_node(struct kmem_cache *cachep, gfp_t flags,
3448 				int nodeid)
3449 {
3450 	struct list_head *entry;
3451 	struct slab *slabp;
3452 	struct kmem_list3 *l3;
3453 	void *obj;
3454 	int x;
3455 
3456 	l3 = cachep->nodelists[nodeid];
3457 	BUG_ON(!l3);
3458 
3459 retry:
3460 	check_irq_off();
3461 	spin_lock(&l3->list_lock);
3462 	entry = l3->slabs_partial.next;
3463 	if (entry == &l3->slabs_partial) {
3464 		l3->free_touched = 1;
3465 		entry = l3->slabs_free.next;
3466 		if (entry == &l3->slabs_free)
3467 			goto must_grow;
3468 	}
3469 
3470 	slabp = list_entry(entry, struct slab, list);
3471 	check_spinlock_acquired_node(cachep, nodeid);
3472 	check_slabp(cachep, slabp);
3473 
3474 	STATS_INC_NODEALLOCS(cachep);
3475 	STATS_INC_ACTIVE(cachep);
3476 	STATS_SET_HIGH(cachep);
3477 
3478 	BUG_ON(slabp->inuse == cachep->num);
3479 
3480 	obj = slab_get_obj(cachep, slabp, nodeid);
3481 	check_slabp(cachep, slabp);
3482 	l3->free_objects--;
3483 	/* move slabp to correct slabp list: */
3484 	list_del(&slabp->list);
3485 
3486 	if (slabp->free == BUFCTL_END)
3487 		list_add(&slabp->list, &l3->slabs_full);
3488 	else
3489 		list_add(&slabp->list, &l3->slabs_partial);
3490 
3491 	spin_unlock(&l3->list_lock);
3492 	goto done;
3493 
3494 must_grow:
3495 	spin_unlock(&l3->list_lock);
3496 	x = cache_grow(cachep, flags | GFP_THISNODE, nodeid, NULL);
3497 	if (x)
3498 		goto retry;
3499 
3500 	return fallback_alloc(cachep, flags);
3501 
3502 done:
3503 	return obj;
3504 }
3505 
3506 /**
3507  * kmem_cache_alloc_node - Allocate an object on the specified node
3508  * @cachep: The cache to allocate from.
3509  * @flags: See kmalloc().
3510  * @nodeid: node number of the target node.
3511  * @caller: return address of caller, used for debug information
3512  *
3513  * Identical to kmem_cache_alloc but it will allocate memory on the given
3514  * node, which can improve the performance for cpu bound structures.
3515  *
3516  * Fallback to other node is possible if __GFP_THISNODE is not set.
3517  */
3518 static __always_inline void *
3519 slab_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid,
3520 		   unsigned long caller)
3521 {
3522 	unsigned long save_flags;
3523 	void *ptr;
3524 	int slab_node = numa_mem_id();
3525 
3526 	flags &= gfp_allowed_mask;
3527 
3528 	lockdep_trace_alloc(flags);
3529 
3530 	if (slab_should_failslab(cachep, flags))
3531 		return NULL;
3532 
3533 	cache_alloc_debugcheck_before(cachep, flags);
3534 	local_irq_save(save_flags);
3535 
3536 	if (nodeid == NUMA_NO_NODE)
3537 		nodeid = slab_node;
3538 
3539 	if (unlikely(!cachep->nodelists[nodeid])) {
3540 		/* Node not bootstrapped yet */
3541 		ptr = fallback_alloc(cachep, flags);
3542 		goto out;
3543 	}
3544 
3545 	if (nodeid == slab_node) {
3546 		/*
3547 		 * Use the locally cached objects if possible.
3548 		 * However ____cache_alloc does not allow fallback
3549 		 * to other nodes. It may fail while we still have
3550 		 * objects on other nodes available.
3551 		 */
3552 		ptr = ____cache_alloc(cachep, flags);
3553 		if (ptr)
3554 			goto out;
3555 	}
3556 	/* ___cache_alloc_node can fall back to other nodes */
3557 	ptr = ____cache_alloc_node(cachep, flags, nodeid);
3558   out:
3559 	local_irq_restore(save_flags);
3560 	ptr = cache_alloc_debugcheck_after(cachep, flags, ptr, caller);
3561 	kmemleak_alloc_recursive(ptr, cachep->object_size, 1, cachep->flags,
3562 				 flags);
3563 
3564 	if (likely(ptr))
3565 		kmemcheck_slab_alloc(cachep, flags, ptr, cachep->object_size);
3566 
3567 	if (unlikely((flags & __GFP_ZERO) && ptr))
3568 		memset(ptr, 0, cachep->object_size);
3569 
3570 	return ptr;
3571 }
3572 
3573 static __always_inline void *
3574 __do_cache_alloc(struct kmem_cache *cache, gfp_t flags)
3575 {
3576 	void *objp;
3577 
3578 	if (unlikely(current->flags & (PF_SPREAD_SLAB | PF_MEMPOLICY))) {
3579 		objp = alternate_node_alloc(cache, flags);
3580 		if (objp)
3581 			goto out;
3582 	}
3583 	objp = ____cache_alloc(cache, flags);
3584 
3585 	/*
3586 	 * We may just have run out of memory on the local node.
3587 	 * ____cache_alloc_node() knows how to locate memory on other nodes
3588 	 */
3589 	if (!objp)
3590 		objp = ____cache_alloc_node(cache, flags, numa_mem_id());
3591 
3592   out:
3593 	return objp;
3594 }
3595 #else
3596 
3597 static __always_inline void *
3598 __do_cache_alloc(struct kmem_cache *cachep, gfp_t flags)
3599 {
3600 	return ____cache_alloc(cachep, flags);
3601 }
3602 
3603 #endif /* CONFIG_NUMA */
3604 
3605 static __always_inline void *
3606 slab_alloc(struct kmem_cache *cachep, gfp_t flags, unsigned long caller)
3607 {
3608 	unsigned long save_flags;
3609 	void *objp;
3610 
3611 	flags &= gfp_allowed_mask;
3612 
3613 	lockdep_trace_alloc(flags);
3614 
3615 	if (slab_should_failslab(cachep, flags))
3616 		return NULL;
3617 
3618 	cache_alloc_debugcheck_before(cachep, flags);
3619 	local_irq_save(save_flags);
3620 	objp = __do_cache_alloc(cachep, flags);
3621 	local_irq_restore(save_flags);
3622 	objp = cache_alloc_debugcheck_after(cachep, flags, objp, caller);
3623 	kmemleak_alloc_recursive(objp, cachep->object_size, 1, cachep->flags,
3624 				 flags);
3625 	prefetchw(objp);
3626 
3627 	if (likely(objp))
3628 		kmemcheck_slab_alloc(cachep, flags, objp, cachep->object_size);
3629 
3630 	if (unlikely((flags & __GFP_ZERO) && objp))
3631 		memset(objp, 0, cachep->object_size);
3632 
3633 	return objp;
3634 }
3635 
3636 /*
3637  * Caller needs to acquire correct kmem_list's list_lock
3638  */
3639 static void free_block(struct kmem_cache *cachep, void **objpp, int nr_objects,
3640 		       int node)
3641 {
3642 	int i;
3643 	struct kmem_list3 *l3;
3644 
3645 	for (i = 0; i < nr_objects; i++) {
3646 		void *objp;
3647 		struct slab *slabp;
3648 
3649 		clear_obj_pfmemalloc(&objpp[i]);
3650 		objp = objpp[i];
3651 
3652 		slabp = virt_to_slab(objp);
3653 		l3 = cachep->nodelists[node];
3654 		list_del(&slabp->list);
3655 		check_spinlock_acquired_node(cachep, node);
3656 		check_slabp(cachep, slabp);
3657 		slab_put_obj(cachep, slabp, objp, node);
3658 		STATS_DEC_ACTIVE(cachep);
3659 		l3->free_objects++;
3660 		check_slabp(cachep, slabp);
3661 
3662 		/* fixup slab chains */
3663 		if (slabp->inuse == 0) {
3664 			if (l3->free_objects > l3->free_limit) {
3665 				l3->free_objects -= cachep->num;
3666 				/* No need to drop any previously held
3667 				 * lock here, even if we have a off-slab slab
3668 				 * descriptor it is guaranteed to come from
3669 				 * a different cache, refer to comments before
3670 				 * alloc_slabmgmt.
3671 				 */
3672 				slab_destroy(cachep, slabp);
3673 			} else {
3674 				list_add(&slabp->list, &l3->slabs_free);
3675 			}
3676 		} else {
3677 			/* Unconditionally move a slab to the end of the
3678 			 * partial list on free - maximum time for the
3679 			 * other objects to be freed, too.
3680 			 */
3681 			list_add_tail(&slabp->list, &l3->slabs_partial);
3682 		}
3683 	}
3684 }
3685 
3686 static void cache_flusharray(struct kmem_cache *cachep, struct array_cache *ac)
3687 {
3688 	int batchcount;
3689 	struct kmem_list3 *l3;
3690 	int node = numa_mem_id();
3691 
3692 	batchcount = ac->batchcount;
3693 #if DEBUG
3694 	BUG_ON(!batchcount || batchcount > ac->avail);
3695 #endif
3696 	check_irq_off();
3697 	l3 = cachep->nodelists[node];
3698 	spin_lock(&l3->list_lock);
3699 	if (l3->shared) {
3700 		struct array_cache *shared_array = l3->shared;
3701 		int max = shared_array->limit - shared_array->avail;
3702 		if (max) {
3703 			if (batchcount > max)
3704 				batchcount = max;
3705 			memcpy(&(shared_array->entry[shared_array->avail]),
3706 			       ac->entry, sizeof(void *) * batchcount);
3707 			shared_array->avail += batchcount;
3708 			goto free_done;
3709 		}
3710 	}
3711 
3712 	free_block(cachep, ac->entry, batchcount, node);
3713 free_done:
3714 #if STATS
3715 	{
3716 		int i = 0;
3717 		struct list_head *p;
3718 
3719 		p = l3->slabs_free.next;
3720 		while (p != &(l3->slabs_free)) {
3721 			struct slab *slabp;
3722 
3723 			slabp = list_entry(p, struct slab, list);
3724 			BUG_ON(slabp->inuse);
3725 
3726 			i++;
3727 			p = p->next;
3728 		}
3729 		STATS_SET_FREEABLE(cachep, i);
3730 	}
3731 #endif
3732 	spin_unlock(&l3->list_lock);
3733 	ac->avail -= batchcount;
3734 	memmove(ac->entry, &(ac->entry[batchcount]), sizeof(void *)*ac->avail);
3735 }
3736 
3737 /*
3738  * Release an obj back to its cache. If the obj has a constructed state, it must
3739  * be in this state _before_ it is released.  Called with disabled ints.
3740  */
3741 static inline void __cache_free(struct kmem_cache *cachep, void *objp,
3742 				unsigned long caller)
3743 {
3744 	struct array_cache *ac = cpu_cache_get(cachep);
3745 
3746 	check_irq_off();
3747 	kmemleak_free_recursive(objp, cachep->flags);
3748 	objp = cache_free_debugcheck(cachep, objp, caller);
3749 
3750 	kmemcheck_slab_free(cachep, objp, cachep->object_size);
3751 
3752 	/*
3753 	 * Skip calling cache_free_alien() when the platform is not numa.
3754 	 * This will avoid cache misses that happen while accessing slabp (which
3755 	 * is per page memory  reference) to get nodeid. Instead use a global
3756 	 * variable to skip the call, which is mostly likely to be present in
3757 	 * the cache.
3758 	 */
3759 	if (nr_online_nodes > 1 && cache_free_alien(cachep, objp))
3760 		return;
3761 
3762 	if (likely(ac->avail < ac->limit)) {
3763 		STATS_INC_FREEHIT(cachep);
3764 	} else {
3765 		STATS_INC_FREEMISS(cachep);
3766 		cache_flusharray(cachep, ac);
3767 	}
3768 
3769 	ac_put_obj(cachep, ac, objp);
3770 }
3771 
3772 /**
3773  * kmem_cache_alloc - Allocate an object
3774  * @cachep: The cache to allocate from.
3775  * @flags: See kmalloc().
3776  *
3777  * Allocate an object from this cache.  The flags are only relevant
3778  * if the cache has no available objects.
3779  */
3780 void *kmem_cache_alloc(struct kmem_cache *cachep, gfp_t flags)
3781 {
3782 	void *ret = slab_alloc(cachep, flags, _RET_IP_);
3783 
3784 	trace_kmem_cache_alloc(_RET_IP_, ret,
3785 			       cachep->object_size, cachep->size, flags);
3786 
3787 	return ret;
3788 }
3789 EXPORT_SYMBOL(kmem_cache_alloc);
3790 
3791 #ifdef CONFIG_TRACING
3792 void *
3793 kmem_cache_alloc_trace(struct kmem_cache *cachep, gfp_t flags, size_t size)
3794 {
3795 	void *ret;
3796 
3797 	ret = slab_alloc(cachep, flags, _RET_IP_);
3798 
3799 	trace_kmalloc(_RET_IP_, ret,
3800 		      size, cachep->size, flags);
3801 	return ret;
3802 }
3803 EXPORT_SYMBOL(kmem_cache_alloc_trace);
3804 #endif
3805 
3806 #ifdef CONFIG_NUMA
3807 void *kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid)
3808 {
3809 	void *ret = slab_alloc_node(cachep, flags, nodeid, _RET_IP_);
3810 
3811 	trace_kmem_cache_alloc_node(_RET_IP_, ret,
3812 				    cachep->object_size, cachep->size,
3813 				    flags, nodeid);
3814 
3815 	return ret;
3816 }
3817 EXPORT_SYMBOL(kmem_cache_alloc_node);
3818 
3819 #ifdef CONFIG_TRACING
3820 void *kmem_cache_alloc_node_trace(struct kmem_cache *cachep,
3821 				  gfp_t flags,
3822 				  int nodeid,
3823 				  size_t size)
3824 {
3825 	void *ret;
3826 
3827 	ret = slab_alloc_node(cachep, flags, nodeid, _RET_IP_);
3828 
3829 	trace_kmalloc_node(_RET_IP_, ret,
3830 			   size, cachep->size,
3831 			   flags, nodeid);
3832 	return ret;
3833 }
3834 EXPORT_SYMBOL(kmem_cache_alloc_node_trace);
3835 #endif
3836 
3837 static __always_inline void *
3838 __do_kmalloc_node(size_t size, gfp_t flags, int node, unsigned long caller)
3839 {
3840 	struct kmem_cache *cachep;
3841 
3842 	cachep = kmem_find_general_cachep(size, flags);
3843 	if (unlikely(ZERO_OR_NULL_PTR(cachep)))
3844 		return cachep;
3845 	return kmem_cache_alloc_node_trace(cachep, flags, node, size);
3846 }
3847 
3848 #if defined(CONFIG_DEBUG_SLAB) || defined(CONFIG_TRACING)
3849 void *__kmalloc_node(size_t size, gfp_t flags, int node)
3850 {
3851 	return __do_kmalloc_node(size, flags, node, _RET_IP_);
3852 }
3853 EXPORT_SYMBOL(__kmalloc_node);
3854 
3855 void *__kmalloc_node_track_caller(size_t size, gfp_t flags,
3856 		int node, unsigned long caller)
3857 {
3858 	return __do_kmalloc_node(size, flags, node, caller);
3859 }
3860 EXPORT_SYMBOL(__kmalloc_node_track_caller);
3861 #else
3862 void *__kmalloc_node(size_t size, gfp_t flags, int node)
3863 {
3864 	return __do_kmalloc_node(size, flags, node, 0);
3865 }
3866 EXPORT_SYMBOL(__kmalloc_node);
3867 #endif /* CONFIG_DEBUG_SLAB || CONFIG_TRACING */
3868 #endif /* CONFIG_NUMA */
3869 
3870 /**
3871  * __do_kmalloc - allocate memory
3872  * @size: how many bytes of memory are required.
3873  * @flags: the type of memory to allocate (see kmalloc).
3874  * @caller: function caller for debug tracking of the caller
3875  */
3876 static __always_inline void *__do_kmalloc(size_t size, gfp_t flags,
3877 					  unsigned long caller)
3878 {
3879 	struct kmem_cache *cachep;
3880 	void *ret;
3881 
3882 	/* If you want to save a few bytes .text space: replace
3883 	 * __ with kmem_.
3884 	 * Then kmalloc uses the uninlined functions instead of the inline
3885 	 * functions.
3886 	 */
3887 	cachep = __find_general_cachep(size, flags);
3888 	if (unlikely(ZERO_OR_NULL_PTR(cachep)))
3889 		return cachep;
3890 	ret = slab_alloc(cachep, flags, caller);
3891 
3892 	trace_kmalloc(caller, ret,
3893 		      size, cachep->size, flags);
3894 
3895 	return ret;
3896 }
3897 
3898 
3899 #if defined(CONFIG_DEBUG_SLAB) || defined(CONFIG_TRACING)
3900 void *__kmalloc(size_t size, gfp_t flags)
3901 {
3902 	return __do_kmalloc(size, flags, _RET_IP_);
3903 }
3904 EXPORT_SYMBOL(__kmalloc);
3905 
3906 void *__kmalloc_track_caller(size_t size, gfp_t flags, unsigned long caller)
3907 {
3908 	return __do_kmalloc(size, flags, caller);
3909 }
3910 EXPORT_SYMBOL(__kmalloc_track_caller);
3911 
3912 #else
3913 void *__kmalloc(size_t size, gfp_t flags)
3914 {
3915 	return __do_kmalloc(size, flags, 0);
3916 }
3917 EXPORT_SYMBOL(__kmalloc);
3918 #endif
3919 
3920 /**
3921  * kmem_cache_free - Deallocate an object
3922  * @cachep: The cache the allocation was from.
3923  * @objp: The previously allocated object.
3924  *
3925  * Free an object which was previously allocated from this
3926  * cache.
3927  */
3928 void kmem_cache_free(struct kmem_cache *cachep, void *objp)
3929 {
3930 	unsigned long flags;
3931 
3932 	local_irq_save(flags);
3933 	debug_check_no_locks_freed(objp, cachep->object_size);
3934 	if (!(cachep->flags & SLAB_DEBUG_OBJECTS))
3935 		debug_check_no_obj_freed(objp, cachep->object_size);
3936 	__cache_free(cachep, objp, _RET_IP_);
3937 	local_irq_restore(flags);
3938 
3939 	trace_kmem_cache_free(_RET_IP_, objp);
3940 }
3941 EXPORT_SYMBOL(kmem_cache_free);
3942 
3943 /**
3944  * kfree - free previously allocated memory
3945  * @objp: pointer returned by kmalloc.
3946  *
3947  * If @objp is NULL, no operation is performed.
3948  *
3949  * Don't free memory not originally allocated by kmalloc()
3950  * or you will run into trouble.
3951  */
3952 void kfree(const void *objp)
3953 {
3954 	struct kmem_cache *c;
3955 	unsigned long flags;
3956 
3957 	trace_kfree(_RET_IP_, objp);
3958 
3959 	if (unlikely(ZERO_OR_NULL_PTR(objp)))
3960 		return;
3961 	local_irq_save(flags);
3962 	kfree_debugcheck(objp);
3963 	c = virt_to_cache(objp);
3964 	debug_check_no_locks_freed(objp, c->object_size);
3965 
3966 	debug_check_no_obj_freed(objp, c->object_size);
3967 	__cache_free(c, (void *)objp, _RET_IP_);
3968 	local_irq_restore(flags);
3969 }
3970 EXPORT_SYMBOL(kfree);
3971 
3972 unsigned int kmem_cache_size(struct kmem_cache *cachep)
3973 {
3974 	return cachep->object_size;
3975 }
3976 EXPORT_SYMBOL(kmem_cache_size);
3977 
3978 /*
3979  * This initializes kmem_list3 or resizes various caches for all nodes.
3980  */
3981 static int alloc_kmemlist(struct kmem_cache *cachep, gfp_t gfp)
3982 {
3983 	int node;
3984 	struct kmem_list3 *l3;
3985 	struct array_cache *new_shared;
3986 	struct array_cache **new_alien = NULL;
3987 
3988 	for_each_online_node(node) {
3989 
3990                 if (use_alien_caches) {
3991                         new_alien = alloc_alien_cache(node, cachep->limit, gfp);
3992                         if (!new_alien)
3993                                 goto fail;
3994                 }
3995 
3996 		new_shared = NULL;
3997 		if (cachep->shared) {
3998 			new_shared = alloc_arraycache(node,
3999 				cachep->shared*cachep->batchcount,
4000 					0xbaadf00d, gfp);
4001 			if (!new_shared) {
4002 				free_alien_cache(new_alien);
4003 				goto fail;
4004 			}
4005 		}
4006 
4007 		l3 = cachep->nodelists[node];
4008 		if (l3) {
4009 			struct array_cache *shared = l3->shared;
4010 
4011 			spin_lock_irq(&l3->list_lock);
4012 
4013 			if (shared)
4014 				free_block(cachep, shared->entry,
4015 						shared->avail, node);
4016 
4017 			l3->shared = new_shared;
4018 			if (!l3->alien) {
4019 				l3->alien = new_alien;
4020 				new_alien = NULL;
4021 			}
4022 			l3->free_limit = (1 + nr_cpus_node(node)) *
4023 					cachep->batchcount + cachep->num;
4024 			spin_unlock_irq(&l3->list_lock);
4025 			kfree(shared);
4026 			free_alien_cache(new_alien);
4027 			continue;
4028 		}
4029 		l3 = kmalloc_node(sizeof(struct kmem_list3), gfp, node);
4030 		if (!l3) {
4031 			free_alien_cache(new_alien);
4032 			kfree(new_shared);
4033 			goto fail;
4034 		}
4035 
4036 		kmem_list3_init(l3);
4037 		l3->next_reap = jiffies + REAPTIMEOUT_LIST3 +
4038 				((unsigned long)cachep) % REAPTIMEOUT_LIST3;
4039 		l3->shared = new_shared;
4040 		l3->alien = new_alien;
4041 		l3->free_limit = (1 + nr_cpus_node(node)) *
4042 					cachep->batchcount + cachep->num;
4043 		cachep->nodelists[node] = l3;
4044 	}
4045 	return 0;
4046 
4047 fail:
4048 	if (!cachep->list.next) {
4049 		/* Cache is not active yet. Roll back what we did */
4050 		node--;
4051 		while (node >= 0) {
4052 			if (cachep->nodelists[node]) {
4053 				l3 = cachep->nodelists[node];
4054 
4055 				kfree(l3->shared);
4056 				free_alien_cache(l3->alien);
4057 				kfree(l3);
4058 				cachep->nodelists[node] = NULL;
4059 			}
4060 			node--;
4061 		}
4062 	}
4063 	return -ENOMEM;
4064 }
4065 
4066 struct ccupdate_struct {
4067 	struct kmem_cache *cachep;
4068 	struct array_cache *new[0];
4069 };
4070 
4071 static void do_ccupdate_local(void *info)
4072 {
4073 	struct ccupdate_struct *new = info;
4074 	struct array_cache *old;
4075 
4076 	check_irq_off();
4077 	old = cpu_cache_get(new->cachep);
4078 
4079 	new->cachep->array[smp_processor_id()] = new->new[smp_processor_id()];
4080 	new->new[smp_processor_id()] = old;
4081 }
4082 
4083 /* Always called with the slab_mutex held */
4084 static int do_tune_cpucache(struct kmem_cache *cachep, int limit,
4085 				int batchcount, int shared, gfp_t gfp)
4086 {
4087 	struct ccupdate_struct *new;
4088 	int i;
4089 
4090 	new = kzalloc(sizeof(*new) + nr_cpu_ids * sizeof(struct array_cache *),
4091 		      gfp);
4092 	if (!new)
4093 		return -ENOMEM;
4094 
4095 	for_each_online_cpu(i) {
4096 		new->new[i] = alloc_arraycache(cpu_to_mem(i), limit,
4097 						batchcount, gfp);
4098 		if (!new->new[i]) {
4099 			for (i--; i >= 0; i--)
4100 				kfree(new->new[i]);
4101 			kfree(new);
4102 			return -ENOMEM;
4103 		}
4104 	}
4105 	new->cachep = cachep;
4106 
4107 	on_each_cpu(do_ccupdate_local, (void *)new, 1);
4108 
4109 	check_irq_on();
4110 	cachep->batchcount = batchcount;
4111 	cachep->limit = limit;
4112 	cachep->shared = shared;
4113 
4114 	for_each_online_cpu(i) {
4115 		struct array_cache *ccold = new->new[i];
4116 		if (!ccold)
4117 			continue;
4118 		spin_lock_irq(&cachep->nodelists[cpu_to_mem(i)]->list_lock);
4119 		free_block(cachep, ccold->entry, ccold->avail, cpu_to_mem(i));
4120 		spin_unlock_irq(&cachep->nodelists[cpu_to_mem(i)]->list_lock);
4121 		kfree(ccold);
4122 	}
4123 	kfree(new);
4124 	return alloc_kmemlist(cachep, gfp);
4125 }
4126 
4127 /* Called with slab_mutex held always */
4128 static int enable_cpucache(struct kmem_cache *cachep, gfp_t gfp)
4129 {
4130 	int err;
4131 	int limit, shared;
4132 
4133 	/*
4134 	 * The head array serves three purposes:
4135 	 * - create a LIFO ordering, i.e. return objects that are cache-warm
4136 	 * - reduce the number of spinlock operations.
4137 	 * - reduce the number of linked list operations on the slab and
4138 	 *   bufctl chains: array operations are cheaper.
4139 	 * The numbers are guessed, we should auto-tune as described by
4140 	 * Bonwick.
4141 	 */
4142 	if (cachep->size > 131072)
4143 		limit = 1;
4144 	else if (cachep->size > PAGE_SIZE)
4145 		limit = 8;
4146 	else if (cachep->size > 1024)
4147 		limit = 24;
4148 	else if (cachep->size > 256)
4149 		limit = 54;
4150 	else
4151 		limit = 120;
4152 
4153 	/*
4154 	 * CPU bound tasks (e.g. network routing) can exhibit cpu bound
4155 	 * allocation behaviour: Most allocs on one cpu, most free operations
4156 	 * on another cpu. For these cases, an efficient object passing between
4157 	 * cpus is necessary. This is provided by a shared array. The array
4158 	 * replaces Bonwick's magazine layer.
4159 	 * On uniprocessor, it's functionally equivalent (but less efficient)
4160 	 * to a larger limit. Thus disabled by default.
4161 	 */
4162 	shared = 0;
4163 	if (cachep->size <= PAGE_SIZE && num_possible_cpus() > 1)
4164 		shared = 8;
4165 
4166 #if DEBUG
4167 	/*
4168 	 * With debugging enabled, large batchcount lead to excessively long
4169 	 * periods with disabled local interrupts. Limit the batchcount
4170 	 */
4171 	if (limit > 32)
4172 		limit = 32;
4173 #endif
4174 	err = do_tune_cpucache(cachep, limit, (limit + 1) / 2, shared, gfp);
4175 	if (err)
4176 		printk(KERN_ERR "enable_cpucache failed for %s, error %d.\n",
4177 		       cachep->name, -err);
4178 	return err;
4179 }
4180 
4181 /*
4182  * Drain an array if it contains any elements taking the l3 lock only if
4183  * necessary. Note that the l3 listlock also protects the array_cache
4184  * if drain_array() is used on the shared array.
4185  */
4186 static void drain_array(struct kmem_cache *cachep, struct kmem_list3 *l3,
4187 			 struct array_cache *ac, int force, int node)
4188 {
4189 	int tofree;
4190 
4191 	if (!ac || !ac->avail)
4192 		return;
4193 	if (ac->touched && !force) {
4194 		ac->touched = 0;
4195 	} else {
4196 		spin_lock_irq(&l3->list_lock);
4197 		if (ac->avail) {
4198 			tofree = force ? ac->avail : (ac->limit + 4) / 5;
4199 			if (tofree > ac->avail)
4200 				tofree = (ac->avail + 1) / 2;
4201 			free_block(cachep, ac->entry, tofree, node);
4202 			ac->avail -= tofree;
4203 			memmove(ac->entry, &(ac->entry[tofree]),
4204 				sizeof(void *) * ac->avail);
4205 		}
4206 		spin_unlock_irq(&l3->list_lock);
4207 	}
4208 }
4209 
4210 /**
4211  * cache_reap - Reclaim memory from caches.
4212  * @w: work descriptor
4213  *
4214  * Called from workqueue/eventd every few seconds.
4215  * Purpose:
4216  * - clear the per-cpu caches for this CPU.
4217  * - return freeable pages to the main free memory pool.
4218  *
4219  * If we cannot acquire the cache chain mutex then just give up - we'll try
4220  * again on the next iteration.
4221  */
4222 static void cache_reap(struct work_struct *w)
4223 {
4224 	struct kmem_cache *searchp;
4225 	struct kmem_list3 *l3;
4226 	int node = numa_mem_id();
4227 	struct delayed_work *work = to_delayed_work(w);
4228 
4229 	if (!mutex_trylock(&slab_mutex))
4230 		/* Give up. Setup the next iteration. */
4231 		goto out;
4232 
4233 	list_for_each_entry(searchp, &slab_caches, list) {
4234 		check_irq_on();
4235 
4236 		/*
4237 		 * We only take the l3 lock if absolutely necessary and we
4238 		 * have established with reasonable certainty that
4239 		 * we can do some work if the lock was obtained.
4240 		 */
4241 		l3 = searchp->nodelists[node];
4242 
4243 		reap_alien(searchp, l3);
4244 
4245 		drain_array(searchp, l3, cpu_cache_get(searchp), 0, node);
4246 
4247 		/*
4248 		 * These are racy checks but it does not matter
4249 		 * if we skip one check or scan twice.
4250 		 */
4251 		if (time_after(l3->next_reap, jiffies))
4252 			goto next;
4253 
4254 		l3->next_reap = jiffies + REAPTIMEOUT_LIST3;
4255 
4256 		drain_array(searchp, l3, l3->shared, 0, node);
4257 
4258 		if (l3->free_touched)
4259 			l3->free_touched = 0;
4260 		else {
4261 			int freed;
4262 
4263 			freed = drain_freelist(searchp, l3, (l3->free_limit +
4264 				5 * searchp->num - 1) / (5 * searchp->num));
4265 			STATS_ADD_REAPED(searchp, freed);
4266 		}
4267 next:
4268 		cond_resched();
4269 	}
4270 	check_irq_on();
4271 	mutex_unlock(&slab_mutex);
4272 	next_reap_node();
4273 out:
4274 	/* Set up the next iteration */
4275 	schedule_delayed_work(work, round_jiffies_relative(REAPTIMEOUT_CPUC));
4276 }
4277 
4278 #ifdef CONFIG_SLABINFO
4279 
4280 static void print_slabinfo_header(struct seq_file *m)
4281 {
4282 	/*
4283 	 * Output format version, so at least we can change it
4284 	 * without _too_ many complaints.
4285 	 */
4286 #if STATS
4287 	seq_puts(m, "slabinfo - version: 2.1 (statistics)\n");
4288 #else
4289 	seq_puts(m, "slabinfo - version: 2.1\n");
4290 #endif
4291 	seq_puts(m, "# name            <active_objs> <num_objs> <objsize> "
4292 		 "<objperslab> <pagesperslab>");
4293 	seq_puts(m, " : tunables <limit> <batchcount> <sharedfactor>");
4294 	seq_puts(m, " : slabdata <active_slabs> <num_slabs> <sharedavail>");
4295 #if STATS
4296 	seq_puts(m, " : globalstat <listallocs> <maxobjs> <grown> <reaped> "
4297 		 "<error> <maxfreeable> <nodeallocs> <remotefrees> <alienoverflow>");
4298 	seq_puts(m, " : cpustat <allochit> <allocmiss> <freehit> <freemiss>");
4299 #endif
4300 	seq_putc(m, '\n');
4301 }
4302 
4303 static void *s_start(struct seq_file *m, loff_t *pos)
4304 {
4305 	loff_t n = *pos;
4306 
4307 	mutex_lock(&slab_mutex);
4308 	if (!n)
4309 		print_slabinfo_header(m);
4310 
4311 	return seq_list_start(&slab_caches, *pos);
4312 }
4313 
4314 static void *s_next(struct seq_file *m, void *p, loff_t *pos)
4315 {
4316 	return seq_list_next(p, &slab_caches, pos);
4317 }
4318 
4319 static void s_stop(struct seq_file *m, void *p)
4320 {
4321 	mutex_unlock(&slab_mutex);
4322 }
4323 
4324 static int s_show(struct seq_file *m, void *p)
4325 {
4326 	struct kmem_cache *cachep = list_entry(p, struct kmem_cache, list);
4327 	struct slab *slabp;
4328 	unsigned long active_objs;
4329 	unsigned long num_objs;
4330 	unsigned long active_slabs = 0;
4331 	unsigned long num_slabs, free_objects = 0, shared_avail = 0;
4332 	const char *name;
4333 	char *error = NULL;
4334 	int node;
4335 	struct kmem_list3 *l3;
4336 
4337 	active_objs = 0;
4338 	num_slabs = 0;
4339 	for_each_online_node(node) {
4340 		l3 = cachep->nodelists[node];
4341 		if (!l3)
4342 			continue;
4343 
4344 		check_irq_on();
4345 		spin_lock_irq(&l3->list_lock);
4346 
4347 		list_for_each_entry(slabp, &l3->slabs_full, list) {
4348 			if (slabp->inuse != cachep->num && !error)
4349 				error = "slabs_full accounting error";
4350 			active_objs += cachep->num;
4351 			active_slabs++;
4352 		}
4353 		list_for_each_entry(slabp, &l3->slabs_partial, list) {
4354 			if (slabp->inuse == cachep->num && !error)
4355 				error = "slabs_partial inuse accounting error";
4356 			if (!slabp->inuse && !error)
4357 				error = "slabs_partial/inuse accounting error";
4358 			active_objs += slabp->inuse;
4359 			active_slabs++;
4360 		}
4361 		list_for_each_entry(slabp, &l3->slabs_free, list) {
4362 			if (slabp->inuse && !error)
4363 				error = "slabs_free/inuse accounting error";
4364 			num_slabs++;
4365 		}
4366 		free_objects += l3->free_objects;
4367 		if (l3->shared)
4368 			shared_avail += l3->shared->avail;
4369 
4370 		spin_unlock_irq(&l3->list_lock);
4371 	}
4372 	num_slabs += active_slabs;
4373 	num_objs = num_slabs * cachep->num;
4374 	if (num_objs - active_objs != free_objects && !error)
4375 		error = "free_objects accounting error";
4376 
4377 	name = cachep->name;
4378 	if (error)
4379 		printk(KERN_ERR "slab: cache %s error: %s\n", name, error);
4380 
4381 	seq_printf(m, "%-17s %6lu %6lu %6u %4u %4d",
4382 		   name, active_objs, num_objs, cachep->size,
4383 		   cachep->num, (1 << cachep->gfporder));
4384 	seq_printf(m, " : tunables %4u %4u %4u",
4385 		   cachep->limit, cachep->batchcount, cachep->shared);
4386 	seq_printf(m, " : slabdata %6lu %6lu %6lu",
4387 		   active_slabs, num_slabs, shared_avail);
4388 #if STATS
4389 	{			/* list3 stats */
4390 		unsigned long high = cachep->high_mark;
4391 		unsigned long allocs = cachep->num_allocations;
4392 		unsigned long grown = cachep->grown;
4393 		unsigned long reaped = cachep->reaped;
4394 		unsigned long errors = cachep->errors;
4395 		unsigned long max_freeable = cachep->max_freeable;
4396 		unsigned long node_allocs = cachep->node_allocs;
4397 		unsigned long node_frees = cachep->node_frees;
4398 		unsigned long overflows = cachep->node_overflow;
4399 
4400 		seq_printf(m, " : globalstat %7lu %6lu %5lu %4lu "
4401 			   "%4lu %4lu %4lu %4lu %4lu",
4402 			   allocs, high, grown,
4403 			   reaped, errors, max_freeable, node_allocs,
4404 			   node_frees, overflows);
4405 	}
4406 	/* cpu stats */
4407 	{
4408 		unsigned long allochit = atomic_read(&cachep->allochit);
4409 		unsigned long allocmiss = atomic_read(&cachep->allocmiss);
4410 		unsigned long freehit = atomic_read(&cachep->freehit);
4411 		unsigned long freemiss = atomic_read(&cachep->freemiss);
4412 
4413 		seq_printf(m, " : cpustat %6lu %6lu %6lu %6lu",
4414 			   allochit, allocmiss, freehit, freemiss);
4415 	}
4416 #endif
4417 	seq_putc(m, '\n');
4418 	return 0;
4419 }
4420 
4421 /*
4422  * slabinfo_op - iterator that generates /proc/slabinfo
4423  *
4424  * Output layout:
4425  * cache-name
4426  * num-active-objs
4427  * total-objs
4428  * object size
4429  * num-active-slabs
4430  * total-slabs
4431  * num-pages-per-slab
4432  * + further values on SMP and with statistics enabled
4433  */
4434 
4435 static const struct seq_operations slabinfo_op = {
4436 	.start = s_start,
4437 	.next = s_next,
4438 	.stop = s_stop,
4439 	.show = s_show,
4440 };
4441 
4442 #define MAX_SLABINFO_WRITE 128
4443 /**
4444  * slabinfo_write - Tuning for the slab allocator
4445  * @file: unused
4446  * @buffer: user buffer
4447  * @count: data length
4448  * @ppos: unused
4449  */
4450 static ssize_t slabinfo_write(struct file *file, const char __user *buffer,
4451 		       size_t count, loff_t *ppos)
4452 {
4453 	char kbuf[MAX_SLABINFO_WRITE + 1], *tmp;
4454 	int limit, batchcount, shared, res;
4455 	struct kmem_cache *cachep;
4456 
4457 	if (count > MAX_SLABINFO_WRITE)
4458 		return -EINVAL;
4459 	if (copy_from_user(&kbuf, buffer, count))
4460 		return -EFAULT;
4461 	kbuf[MAX_SLABINFO_WRITE] = '\0';
4462 
4463 	tmp = strchr(kbuf, ' ');
4464 	if (!tmp)
4465 		return -EINVAL;
4466 	*tmp = '\0';
4467 	tmp++;
4468 	if (sscanf(tmp, " %d %d %d", &limit, &batchcount, &shared) != 3)
4469 		return -EINVAL;
4470 
4471 	/* Find the cache in the chain of caches. */
4472 	mutex_lock(&slab_mutex);
4473 	res = -EINVAL;
4474 	list_for_each_entry(cachep, &slab_caches, list) {
4475 		if (!strcmp(cachep->name, kbuf)) {
4476 			if (limit < 1 || batchcount < 1 ||
4477 					batchcount > limit || shared < 0) {
4478 				res = 0;
4479 			} else {
4480 				res = do_tune_cpucache(cachep, limit,
4481 						       batchcount, shared,
4482 						       GFP_KERNEL);
4483 			}
4484 			break;
4485 		}
4486 	}
4487 	mutex_unlock(&slab_mutex);
4488 	if (res >= 0)
4489 		res = count;
4490 	return res;
4491 }
4492 
4493 static int slabinfo_open(struct inode *inode, struct file *file)
4494 {
4495 	return seq_open(file, &slabinfo_op);
4496 }
4497 
4498 static const struct file_operations proc_slabinfo_operations = {
4499 	.open		= slabinfo_open,
4500 	.read		= seq_read,
4501 	.write		= slabinfo_write,
4502 	.llseek		= seq_lseek,
4503 	.release	= seq_release,
4504 };
4505 
4506 #ifdef CONFIG_DEBUG_SLAB_LEAK
4507 
4508 static void *leaks_start(struct seq_file *m, loff_t *pos)
4509 {
4510 	mutex_lock(&slab_mutex);
4511 	return seq_list_start(&slab_caches, *pos);
4512 }
4513 
4514 static inline int add_caller(unsigned long *n, unsigned long v)
4515 {
4516 	unsigned long *p;
4517 	int l;
4518 	if (!v)
4519 		return 1;
4520 	l = n[1];
4521 	p = n + 2;
4522 	while (l) {
4523 		int i = l/2;
4524 		unsigned long *q = p + 2 * i;
4525 		if (*q == v) {
4526 			q[1]++;
4527 			return 1;
4528 		}
4529 		if (*q > v) {
4530 			l = i;
4531 		} else {
4532 			p = q + 2;
4533 			l -= i + 1;
4534 		}
4535 	}
4536 	if (++n[1] == n[0])
4537 		return 0;
4538 	memmove(p + 2, p, n[1] * 2 * sizeof(unsigned long) - ((void *)p - (void *)n));
4539 	p[0] = v;
4540 	p[1] = 1;
4541 	return 1;
4542 }
4543 
4544 static void handle_slab(unsigned long *n, struct kmem_cache *c, struct slab *s)
4545 {
4546 	void *p;
4547 	int i;
4548 	if (n[0] == n[1])
4549 		return;
4550 	for (i = 0, p = s->s_mem; i < c->num; i++, p += c->size) {
4551 		if (slab_bufctl(s)[i] != BUFCTL_ACTIVE)
4552 			continue;
4553 		if (!add_caller(n, (unsigned long)*dbg_userword(c, p)))
4554 			return;
4555 	}
4556 }
4557 
4558 static void show_symbol(struct seq_file *m, unsigned long address)
4559 {
4560 #ifdef CONFIG_KALLSYMS
4561 	unsigned long offset, size;
4562 	char modname[MODULE_NAME_LEN], name[KSYM_NAME_LEN];
4563 
4564 	if (lookup_symbol_attrs(address, &size, &offset, modname, name) == 0) {
4565 		seq_printf(m, "%s+%#lx/%#lx", name, offset, size);
4566 		if (modname[0])
4567 			seq_printf(m, " [%s]", modname);
4568 		return;
4569 	}
4570 #endif
4571 	seq_printf(m, "%p", (void *)address);
4572 }
4573 
4574 static int leaks_show(struct seq_file *m, void *p)
4575 {
4576 	struct kmem_cache *cachep = list_entry(p, struct kmem_cache, list);
4577 	struct slab *slabp;
4578 	struct kmem_list3 *l3;
4579 	const char *name;
4580 	unsigned long *n = m->private;
4581 	int node;
4582 	int i;
4583 
4584 	if (!(cachep->flags & SLAB_STORE_USER))
4585 		return 0;
4586 	if (!(cachep->flags & SLAB_RED_ZONE))
4587 		return 0;
4588 
4589 	/* OK, we can do it */
4590 
4591 	n[1] = 0;
4592 
4593 	for_each_online_node(node) {
4594 		l3 = cachep->nodelists[node];
4595 		if (!l3)
4596 			continue;
4597 
4598 		check_irq_on();
4599 		spin_lock_irq(&l3->list_lock);
4600 
4601 		list_for_each_entry(slabp, &l3->slabs_full, list)
4602 			handle_slab(n, cachep, slabp);
4603 		list_for_each_entry(slabp, &l3->slabs_partial, list)
4604 			handle_slab(n, cachep, slabp);
4605 		spin_unlock_irq(&l3->list_lock);
4606 	}
4607 	name = cachep->name;
4608 	if (n[0] == n[1]) {
4609 		/* Increase the buffer size */
4610 		mutex_unlock(&slab_mutex);
4611 		m->private = kzalloc(n[0] * 4 * sizeof(unsigned long), GFP_KERNEL);
4612 		if (!m->private) {
4613 			/* Too bad, we are really out */
4614 			m->private = n;
4615 			mutex_lock(&slab_mutex);
4616 			return -ENOMEM;
4617 		}
4618 		*(unsigned long *)m->private = n[0] * 2;
4619 		kfree(n);
4620 		mutex_lock(&slab_mutex);
4621 		/* Now make sure this entry will be retried */
4622 		m->count = m->size;
4623 		return 0;
4624 	}
4625 	for (i = 0; i < n[1]; i++) {
4626 		seq_printf(m, "%s: %lu ", name, n[2*i+3]);
4627 		show_symbol(m, n[2*i+2]);
4628 		seq_putc(m, '\n');
4629 	}
4630 
4631 	return 0;
4632 }
4633 
4634 static const struct seq_operations slabstats_op = {
4635 	.start = leaks_start,
4636 	.next = s_next,
4637 	.stop = s_stop,
4638 	.show = leaks_show,
4639 };
4640 
4641 static int slabstats_open(struct inode *inode, struct file *file)
4642 {
4643 	unsigned long *n = kzalloc(PAGE_SIZE, GFP_KERNEL);
4644 	int ret = -ENOMEM;
4645 	if (n) {
4646 		ret = seq_open(file, &slabstats_op);
4647 		if (!ret) {
4648 			struct seq_file *m = file->private_data;
4649 			*n = PAGE_SIZE / (2 * sizeof(unsigned long));
4650 			m->private = n;
4651 			n = NULL;
4652 		}
4653 		kfree(n);
4654 	}
4655 	return ret;
4656 }
4657 
4658 static const struct file_operations proc_slabstats_operations = {
4659 	.open		= slabstats_open,
4660 	.read		= seq_read,
4661 	.llseek		= seq_lseek,
4662 	.release	= seq_release_private,
4663 };
4664 #endif
4665 
4666 static int __init slab_proc_init(void)
4667 {
4668 	proc_create("slabinfo",S_IWUSR|S_IRUSR,NULL,&proc_slabinfo_operations);
4669 #ifdef CONFIG_DEBUG_SLAB_LEAK
4670 	proc_create("slab_allocators", 0, NULL, &proc_slabstats_operations);
4671 #endif
4672 	return 0;
4673 }
4674 module_init(slab_proc_init);
4675 #endif
4676 
4677 /**
4678  * ksize - get the actual amount of memory allocated for a given object
4679  * @objp: Pointer to the object
4680  *
4681  * kmalloc may internally round up allocations and return more memory
4682  * than requested. ksize() can be used to determine the actual amount of
4683  * memory allocated. The caller may use this additional memory, even though
4684  * a smaller amount of memory was initially specified with the kmalloc call.
4685  * The caller must guarantee that objp points to a valid object previously
4686  * allocated with either kmalloc() or kmem_cache_alloc(). The object
4687  * must not be freed during the duration of the call.
4688  */
4689 size_t ksize(const void *objp)
4690 {
4691 	BUG_ON(!objp);
4692 	if (unlikely(objp == ZERO_SIZE_PTR))
4693 		return 0;
4694 
4695 	return virt_to_cache(objp)->object_size;
4696 }
4697 EXPORT_SYMBOL(ksize);
4698