xref: /openbmc/linux/mm/slab.c (revision 643d1f7f)
1 /*
2  * linux/mm/slab.c
3  * Written by Mark Hemment, 1996/97.
4  * (markhe@nextd.demon.co.uk)
5  *
6  * kmem_cache_destroy() + some cleanup - 1999 Andrea Arcangeli
7  *
8  * Major cleanup, different bufctl logic, per-cpu arrays
9  *	(c) 2000 Manfred Spraul
10  *
11  * Cleanup, make the head arrays unconditional, preparation for NUMA
12  * 	(c) 2002 Manfred Spraul
13  *
14  * An implementation of the Slab Allocator as described in outline in;
15  *	UNIX Internals: The New Frontiers by Uresh Vahalia
16  *	Pub: Prentice Hall	ISBN 0-13-101908-2
17  * or with a little more detail in;
18  *	The Slab Allocator: An Object-Caching Kernel Memory Allocator
19  *	Jeff Bonwick (Sun Microsystems).
20  *	Presented at: USENIX Summer 1994 Technical Conference
21  *
22  * The memory is organized in caches, one cache for each object type.
23  * (e.g. inode_cache, dentry_cache, buffer_head, vm_area_struct)
24  * Each cache consists of many slabs (they are small, usually one
25  * page long, and always contiguous), and each slab contains multiple
26  * initialized objects.
27  *
28  * This means that your constructor is used only for newly allocated
29  * slabs and you must pass objects with the same initializations to
30  * kmem_cache_free.
31  *
32  * Each cache can only support one memory type (GFP_DMA, GFP_HIGHMEM,
33  * normal). If you need a special memory type, then you must create a new
34  * cache for that memory type.
35  *
36  * In order to reduce fragmentation, the slabs are sorted in 3 groups:
37  *   full slabs with 0 free objects
38  *   partial slabs
39  *   empty slabs with no allocated objects
40  *
41  * If partial slabs exist, then new allocations come from these slabs,
42  * otherwise they come from empty slabs, or new slabs are allocated.
43  *
44  * kmem_cache_destroy() CAN CRASH if you try to allocate from the cache
45  * during kmem_cache_destroy(). The caller must prevent concurrent allocs.
46  *
47  * Each cache has a short per-cpu head array; most allocs
48  * and frees go into that array, and if that array overflows, then 1/2
49  * of the entries in the array are given back into the global cache.
50  * The head array is strictly LIFO and should improve the cache hit rates.
51  * On SMP, it additionally reduces the spinlock operations.
52  *
53  * The c_cpuarray may not be read with local interrupts enabled -
54  * it's changed with smp_call_function().
55  *
56  * SMP synchronization:
57  *  constructors and destructors are called without any locking.
58  *  Several members in struct kmem_cache and struct slab never change; they
59  *	are accessed without any locking.
60  *  The per-cpu arrays are never accessed from the wrong cpu, no locking,
61  *  	and local interrupts are disabled so slab code is preempt-safe.
62  *  The non-constant members are protected with a per-cache irq spinlock.
63  *
64  * Many thanks to Mark Hemment, who wrote another per-cpu slab patch
65  * in 2000 - many ideas in the current implementation are derived from
66  * his patch.
67  *
68  * Further notes from the original documentation:
69  *
70  * 11 April '97.  Started multi-threading - markhe
71  *	The global cache-chain is protected by the mutex 'cache_chain_mutex'.
72  *	The mutex is only needed when accessing/extending the cache-chain, which
73  *	can never happen inside an interrupt (kmem_cache_create(),
74  *	kmem_cache_shrink() and kmem_cache_reap()).
75  *
76  *	At present, each engine can be growing a cache.  This should be blocked.
77  *
78  * 15 March 2005. NUMA slab allocator.
79  *	Shai Fultheim <shai@scalex86.org>.
80  *	Shobhit Dayal <shobhit@calsoftinc.com>
81  *	Alok N Kataria <alokk@calsoftinc.com>
82  *	Christoph Lameter <christoph@lameter.com>
83  *
84  *	Modified the slab allocator to be node aware on NUMA systems.
85  *	Each node has its own list of partial, free and full slabs.
86  *	All object allocations for a node occur from node specific slab lists.
87  */
88 
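/*
 * A minimal usage sketch of the interface described above ("foo_cache",
 * struct foo and foo_ctor are hypothetical client-side names, not defined
 * in this file):
 *
 *	cachep = kmem_cache_create("foo_cache", sizeof(struct foo), 0,
 *				   SLAB_HWCACHE_ALIGN, foo_ctor);
 *	objp = kmem_cache_alloc(cachep, GFP_KERNEL);
 *	... use objp, restoring the constructed state before freeing it ...
 *	kmem_cache_free(cachep, objp);
 *	...
 *	kmem_cache_destroy(cachep);
 */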
89 #include	<linux/slab.h>
90 #include	<linux/mm.h>
91 #include	<linux/poison.h>
92 #include	<linux/swap.h>
93 #include	<linux/cache.h>
94 #include	<linux/interrupt.h>
95 #include	<linux/init.h>
96 #include	<linux/compiler.h>
97 #include	<linux/cpuset.h>
98 #include	<linux/seq_file.h>
99 #include	<linux/notifier.h>
100 #include	<linux/kallsyms.h>
101 #include	<linux/cpu.h>
102 #include	<linux/sysctl.h>
103 #include	<linux/module.h>
104 #include	<linux/rcupdate.h>
105 #include	<linux/string.h>
106 #include	<linux/uaccess.h>
107 #include	<linux/nodemask.h>
108 #include	<linux/mempolicy.h>
109 #include	<linux/mutex.h>
110 #include	<linux/fault-inject.h>
111 #include	<linux/rtmutex.h>
112 #include	<linux/reciprocal_div.h>
113 
114 #include	<asm/cacheflush.h>
115 #include	<asm/tlbflush.h>
116 #include	<asm/page.h>
117 
118 /*
119  * DEBUG	- 1 for kmem_cache_create() to honour; SLAB_RED_ZONE & SLAB_POISON.
120  *		  0 for faster, smaller code (especially in the critical paths).
121  *
122  * STATS	- 1 to collect stats for /proc/slabinfo.
123  *		  0 for faster, smaller code (especially in the critical paths).
124  *
125  * FORCED_DEBUG	- 1 enables SLAB_RED_ZONE and SLAB_POISON (if possible)
126  */
127 
128 #ifdef CONFIG_DEBUG_SLAB
129 #define	DEBUG		1
130 #define	STATS		1
131 #define	FORCED_DEBUG	1
132 #else
133 #define	DEBUG		0
134 #define	STATS		0
135 #define	FORCED_DEBUG	0
136 #endif
137 
138 /* Shouldn't this be in a header file somewhere? */
139 #define	BYTES_PER_WORD		sizeof(void *)
140 #define	REDZONE_ALIGN		max(BYTES_PER_WORD, __alignof__(unsigned long long))
141 
142 #ifndef cache_line_size
143 #define cache_line_size()	L1_CACHE_BYTES
144 #endif
145 
146 #ifndef ARCH_KMALLOC_MINALIGN
147 /*
148  * Enforce a minimum alignment for the kmalloc caches.
149  * Usually, the kmalloc caches are cache_line_size() aligned, except when
150  * DEBUG and FORCED_DEBUG are enabled, then they are BYTES_PER_WORD aligned.
151  * Some archs want to perform DMA into kmalloc caches and need a guaranteed
152  * alignment larger than the alignment of a 64-bit integer.
153  * ARCH_KMALLOC_MINALIGN allows that.
154  * Note that increasing this value may disable some debug features.
155  */
156 #define ARCH_KMALLOC_MINALIGN __alignof__(unsigned long long)
157 #endif
158 
159 #ifndef ARCH_SLAB_MINALIGN
160 /*
161  * Enforce a minimum alignment for all caches.
162  * Intended for archs that get misalignment faults even for BYTES_PER_WORD
163  * aligned buffers. Includes ARCH_KMALLOC_MINALIGN.
164  * If possible: Do not enable this flag for CONFIG_DEBUG_SLAB, it disables
165  * some debug features.
166  */
167 #define ARCH_SLAB_MINALIGN 0
168 #endif
169 
170 #ifndef ARCH_KMALLOC_FLAGS
171 #define ARCH_KMALLOC_FLAGS SLAB_HWCACHE_ALIGN
172 #endif
173 
174 /* Legal flag mask for kmem_cache_create(). */
175 #if DEBUG
176 # define CREATE_MASK	(SLAB_RED_ZONE | \
177 			 SLAB_POISON | SLAB_HWCACHE_ALIGN | \
178 			 SLAB_CACHE_DMA | \
179 			 SLAB_STORE_USER | \
180 			 SLAB_RECLAIM_ACCOUNT | SLAB_PANIC | \
181 			 SLAB_DESTROY_BY_RCU | SLAB_MEM_SPREAD)
182 #else
183 # define CREATE_MASK	(SLAB_HWCACHE_ALIGN | \
184 			 SLAB_CACHE_DMA | \
185 			 SLAB_RECLAIM_ACCOUNT | SLAB_PANIC | \
186 			 SLAB_DESTROY_BY_RCU | SLAB_MEM_SPREAD)
187 #endif
188 
189 /*
190  * kmem_bufctl_t:
191  *
192  * Bufctls are used for linking objs within a slab via
193  * linked offsets.
194  *
195  * This implementation relies on "struct page" for locating the cache &
196  * slab an object belongs to.
197  * This allows the bufctl structure to be small (one int), but limits
198  * the number of objects a slab (not a cache) can contain when off-slab
199  * bufctls are used. The limit is the size of the largest general cache
200  * that does not use off-slab slabs.
201  * For 32bit archs with 4 kB pages, this is 56.
202  * This is not serious, as it is only for large objects, when it is unwise
203  * to have too many per slab.
204  * Note: This limit can be raised by introducing a general cache whose size
205  * is less than 512 (PAGE_SIZE>>3 for 4 kB pages), but greater than 256.
206  */
207 
208 typedef unsigned int kmem_bufctl_t;
209 #define BUFCTL_END	(((kmem_bufctl_t)(~0U))-0)
210 #define BUFCTL_FREE	(((kmem_bufctl_t)(~0U))-1)
211 #define	BUFCTL_ACTIVE	(((kmem_bufctl_t)(~0U))-2)
212 #define	SLAB_LIMIT	(((kmem_bufctl_t)(~0U))-3)
213 
214 /*
215  * struct slab
216  *
217  * Manages the objs in a slab. Placed either at the beginning of mem allocated
218  * for a slab, or allocated from a general cache.
219  * Slabs are chained into three lists: fully used, partial, fully free slabs.
220  */
221 struct slab {
222 	struct list_head list;
223 	unsigned long colouroff;
224 	void *s_mem;		/* including colour offset */
225 	unsigned int inuse;	/* num of objs active in slab */
226 	kmem_bufctl_t free;
227 	unsigned short nodeid;
228 };
229 
230 /*
231  * struct slab_rcu
232  *
233  * slab_destroy on a SLAB_DESTROY_BY_RCU cache uses this structure to
234  * arrange for kmem_freepages to be called via RCU.  This is useful if
235  * we need to approach a kernel structure obliquely, from its address
236  * obtained without the usual locking.  We can lock the structure to
237  * stabilize it and check it's still at the given address, only if we
238  * can be sure that the memory has not been meanwhile reused for some
239  * other kind of object (which our subsystem's lock might corrupt).
240  *
241  * rcu_read_lock before reading the address, then rcu_read_unlock after
242  * taking the spinlock within the structure expected at that address.
243  *
244  * We assume struct slab_rcu can overlay struct slab when destroying.
245  */
246 struct slab_rcu {
247 	struct rcu_head head;
248 	struct kmem_cache *cachep;
249 	void *addr;
250 };
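/*
 * A rough sketch of the lookup pattern described above ("ptr" and "obj" are
 * placeholder names, not part of this file):
 *
 *	rcu_read_lock();
 *	obj = rcu_dereference(ptr);	address obtained without the usual locking
 *	spin_lock(&obj->lock);
 *	rcu_read_unlock();
 *	recheck that obj is still the object we expected; if not, drop the
 *	lock and retry
 */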
251 
252 /*
253  * struct array_cache
254  *
255  * Purpose:
256  * - LIFO ordering, to hand out cache-warm objects from _alloc
257  * - reduce the number of linked list operations
258  * - reduce spinlock operations
259  *
260  * The limit is stored in the per-cpu structure to reduce the data cache
261  * footprint.
262  *
263  */
264 struct array_cache {
265 	unsigned int avail;
266 	unsigned int limit;
267 	unsigned int batchcount;
268 	unsigned int touched;
269 	spinlock_t lock;
270 	void *entry[];	/*
271 			 * Must have this definition in here for the proper
272 			 * alignment of array_cache. Also simplifies accessing
273 			 * the entries.
274 			 */
275 };
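/*
 * Sketch of the LIFO discipline on entry[] (the real code below open-codes
 * this, with limit/batchcount checks around it):
 *
 *	ac->entry[ac->avail++] = objp;		free path: push
 *	objp = ac->entry[--ac->avail];		alloc path: pop the cache-warm top
 */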
276 
277 /*
278  * bootstrap: The caches do not work without cpuarrays anymore, but the
279  * cpuarrays are allocated from the generic caches...
280  */
281 #define BOOT_CPUCACHE_ENTRIES	1
282 struct arraycache_init {
283 	struct array_cache cache;
284 	void *entries[BOOT_CPUCACHE_ENTRIES];
285 };
286 
287 /*
288  * The slab lists for all objects.
289  */
290 struct kmem_list3 {
291 	struct list_head slabs_partial;	/* partial list first, better asm code */
292 	struct list_head slabs_full;
293 	struct list_head slabs_free;
294 	unsigned long free_objects;
295 	unsigned int free_limit;
296 	unsigned int colour_next;	/* Per-node cache coloring */
297 	spinlock_t list_lock;
298 	struct array_cache *shared;	/* shared per node */
299 	struct array_cache **alien;	/* on other nodes */
300 	unsigned long next_reap;	/* updated without locking */
301 	int free_touched;		/* updated without locking */
302 };
303 
304 /*
305  * Need this for bootstrapping a per node allocator.
306  */
307 #define NUM_INIT_LISTS (3 * MAX_NUMNODES)
308 struct kmem_list3 __initdata initkmem_list3[NUM_INIT_LISTS];
309 #define	CACHE_CACHE 0
310 #define	SIZE_AC MAX_NUMNODES
311 #define	SIZE_L3 (2 * MAX_NUMNODES)
312 
313 static int drain_freelist(struct kmem_cache *cache,
314 			struct kmem_list3 *l3, int tofree);
315 static void free_block(struct kmem_cache *cachep, void **objpp, int len,
316 			int node);
317 static int enable_cpucache(struct kmem_cache *cachep);
318 static void cache_reap(struct work_struct *unused);
319 
320 /*
321  * This function must be completely optimized away if a constant is passed to
322  * it.  Mostly the same as what is in linux/slab.h except it returns an index.
323  */
324 static __always_inline int index_of(const size_t size)
325 {
326 	extern void __bad_size(void);
327 
328 	if (__builtin_constant_p(size)) {
329 		int i = 0;
330 
331 #define CACHE(x) \
332 	if (size <= x) \
333 		return i; \
334 	else \
335 		i++;
336 #include <linux/kmalloc_sizes.h>
337 #undef CACHE
338 		__bad_size();
339 	} else
340 		__bad_size();
341 	return 0;
342 }
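/*
 * Example: assuming kmalloc_sizes.h begins with 32, 64, 96, 128 (the exact
 * table depends on PAGE_SIZE and config options), index_of(100) compiles
 * down to the constant 3, the index of the first general cache whose
 * cs_size (here 128) is >= 100.
 */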
343 
344 static int slab_early_init = 1;
345 
346 #define INDEX_AC index_of(sizeof(struct arraycache_init))
347 #define INDEX_L3 index_of(sizeof(struct kmem_list3))
348 
349 static void kmem_list3_init(struct kmem_list3 *parent)
350 {
351 	INIT_LIST_HEAD(&parent->slabs_full);
352 	INIT_LIST_HEAD(&parent->slabs_partial);
353 	INIT_LIST_HEAD(&parent->slabs_free);
354 	parent->shared = NULL;
355 	parent->alien = NULL;
356 	parent->colour_next = 0;
357 	spin_lock_init(&parent->list_lock);
358 	parent->free_objects = 0;
359 	parent->free_touched = 0;
360 }
361 
362 #define MAKE_LIST(cachep, listp, slab, nodeid)				\
363 	do {								\
364 		INIT_LIST_HEAD(listp);					\
365 		list_splice(&(cachep->nodelists[nodeid]->slab), listp);	\
366 	} while (0)
367 
368 #define	MAKE_ALL_LISTS(cachep, ptr, nodeid)				\
369 	do {								\
370 	MAKE_LIST((cachep), (&(ptr)->slabs_full), slabs_full, nodeid);	\
371 	MAKE_LIST((cachep), (&(ptr)->slabs_partial), slabs_partial, nodeid); \
372 	MAKE_LIST((cachep), (&(ptr)->slabs_free), slabs_free, nodeid);	\
373 	} while (0)
374 
375 /*
376  * struct kmem_cache
377  *
378  * manages a cache.
379  */
380 
381 struct kmem_cache {
382 /* 1) per-cpu data, touched during every alloc/free */
383 	struct array_cache *array[NR_CPUS];
384 /* 2) Cache tunables. Protected by cache_chain_mutex */
385 	unsigned int batchcount;
386 	unsigned int limit;
387 	unsigned int shared;
388 
389 	unsigned int buffer_size;
390 	u32 reciprocal_buffer_size;
391 /* 3) touched by every alloc & free from the backend */
392 
393 	unsigned int flags;		/* constant flags */
394 	unsigned int num;		/* # of objs per slab */
395 
396 /* 4) cache_grow/shrink */
397 	/* order of pgs per slab (2^n) */
398 	unsigned int gfporder;
399 
400 	/* force GFP flags, e.g. GFP_DMA */
401 	gfp_t gfpflags;
402 
403 	size_t colour;			/* cache colouring range */
404 	unsigned int colour_off;	/* colour offset */
405 	struct kmem_cache *slabp_cache;
406 	unsigned int slab_size;
407 	unsigned int dflags;		/* dynamic flags */
408 
409 	/* constructor func */
410 	void (*ctor)(struct kmem_cache *, void *);
411 
412 /* 5) cache creation/removal */
413 	const char *name;
414 	struct list_head next;
415 
416 /* 6) statistics */
417 #if STATS
418 	unsigned long num_active;
419 	unsigned long num_allocations;
420 	unsigned long high_mark;
421 	unsigned long grown;
422 	unsigned long reaped;
423 	unsigned long errors;
424 	unsigned long max_freeable;
425 	unsigned long node_allocs;
426 	unsigned long node_frees;
427 	unsigned long node_overflow;
428 	atomic_t allochit;
429 	atomic_t allocmiss;
430 	atomic_t freehit;
431 	atomic_t freemiss;
432 #endif
433 #if DEBUG
434 	/*
435 	 * If debugging is enabled, then the allocator can add additional
436 	 * fields and/or padding to every object. buffer_size contains the total
437 	 * object size including these internal fields, the following two
438 	 * variables contain the offset to the user object and its size.
439 	 */
440 	int obj_offset;
441 	int obj_size;
442 #endif
443 	/*
444 	 * We put nodelists[] at the end of kmem_cache, because we want to size
445 	 * this array to nr_node_ids slots instead of MAX_NUMNODES
446 	 * (see kmem_cache_init())
447 	 * We still use [MAX_NUMNODES] and not [1] or [0] because cache_cache
448 	 * is statically defined, so we reserve the max number of nodes.
449 	 */
450 	struct kmem_list3 *nodelists[MAX_NUMNODES];
451 	/*
452 	 * Do not add fields after nodelists[]
453 	 */
454 };
455 
456 #define CFLGS_OFF_SLAB		(0x80000000UL)
457 #define	OFF_SLAB(x)	((x)->flags & CFLGS_OFF_SLAB)
458 
459 #define BATCHREFILL_LIMIT	16
460 /*
461  * Optimization question: fewer reaps means a lower probability of unnecessary
462  * cpucache drain/refill cycles.
463  *
464  * OTOH the cpuarrays can contain lots of objects,
465  * which could lock up otherwise freeable slabs.
466  */
467 #define REAPTIMEOUT_CPUC	(2*HZ)
468 #define REAPTIMEOUT_LIST3	(4*HZ)
469 
470 #if STATS
471 #define	STATS_INC_ACTIVE(x)	((x)->num_active++)
472 #define	STATS_DEC_ACTIVE(x)	((x)->num_active--)
473 #define	STATS_INC_ALLOCED(x)	((x)->num_allocations++)
474 #define	STATS_INC_GROWN(x)	((x)->grown++)
475 #define	STATS_ADD_REAPED(x,y)	((x)->reaped += (y))
476 #define	STATS_SET_HIGH(x)						\
477 	do {								\
478 		if ((x)->num_active > (x)->high_mark)			\
479 			(x)->high_mark = (x)->num_active;		\
480 	} while (0)
481 #define	STATS_INC_ERR(x)	((x)->errors++)
482 #define	STATS_INC_NODEALLOCS(x)	((x)->node_allocs++)
483 #define	STATS_INC_NODEFREES(x)	((x)->node_frees++)
484 #define STATS_INC_ACOVERFLOW(x)   ((x)->node_overflow++)
485 #define	STATS_SET_FREEABLE(x, i)					\
486 	do {								\
487 		if ((x)->max_freeable < i)				\
488 			(x)->max_freeable = i;				\
489 	} while (0)
490 #define STATS_INC_ALLOCHIT(x)	atomic_inc(&(x)->allochit)
491 #define STATS_INC_ALLOCMISS(x)	atomic_inc(&(x)->allocmiss)
492 #define STATS_INC_FREEHIT(x)	atomic_inc(&(x)->freehit)
493 #define STATS_INC_FREEMISS(x)	atomic_inc(&(x)->freemiss)
494 #else
495 #define	STATS_INC_ACTIVE(x)	do { } while (0)
496 #define	STATS_DEC_ACTIVE(x)	do { } while (0)
497 #define	STATS_INC_ALLOCED(x)	do { } while (0)
498 #define	STATS_INC_GROWN(x)	do { } while (0)
499 #define	STATS_ADD_REAPED(x,y)	do { } while (0)
500 #define	STATS_SET_HIGH(x)	do { } while (0)
501 #define	STATS_INC_ERR(x)	do { } while (0)
502 #define	STATS_INC_NODEALLOCS(x)	do { } while (0)
503 #define	STATS_INC_NODEFREES(x)	do { } while (0)
504 #define STATS_INC_ACOVERFLOW(x)   do { } while (0)
505 #define	STATS_SET_FREEABLE(x, i) do { } while (0)
506 #define STATS_INC_ALLOCHIT(x)	do { } while (0)
507 #define STATS_INC_ALLOCMISS(x)	do { } while (0)
508 #define STATS_INC_FREEHIT(x)	do { } while (0)
509 #define STATS_INC_FREEMISS(x)	do { } while (0)
510 #endif
511 
512 #if DEBUG
513 
514 /*
515  * memory layout of objects:
516  * 0		: objp
517  * 0 .. cachep->obj_offset - BYTES_PER_WORD - 1: padding. This ensures that
518  * 		the end of an object is aligned with the end of the real
519  * 		allocation. Catches writes behind the end of the allocation.
520  * cachep->obj_offset - BYTES_PER_WORD .. cachep->obj_offset - 1:
521  * 		redzone word.
522  * cachep->obj_offset: The real object.
523  * cachep->buffer_size - 2* BYTES_PER_WORD: redzone word [BYTES_PER_WORD long]
524  * cachep->buffer_size - 1* BYTES_PER_WORD: last caller address
525  *					[BYTES_PER_WORD long]
526  */
527 static int obj_offset(struct kmem_cache *cachep)
528 {
529 	return cachep->obj_offset;
530 }
531 
532 static int obj_size(struct kmem_cache *cachep)
533 {
534 	return cachep->obj_size;
535 }
536 
537 static unsigned long long *dbg_redzone1(struct kmem_cache *cachep, void *objp)
538 {
539 	BUG_ON(!(cachep->flags & SLAB_RED_ZONE));
540 	return (unsigned long long*) (objp + obj_offset(cachep) -
541 				      sizeof(unsigned long long));
542 }
543 
544 static unsigned long long *dbg_redzone2(struct kmem_cache *cachep, void *objp)
545 {
546 	BUG_ON(!(cachep->flags & SLAB_RED_ZONE));
547 	if (cachep->flags & SLAB_STORE_USER)
548 		return (unsigned long long *)(objp + cachep->buffer_size -
549 					      sizeof(unsigned long long) -
550 					      REDZONE_ALIGN);
551 	return (unsigned long long *) (objp + cachep->buffer_size -
552 				       sizeof(unsigned long long));
553 }
554 
555 static void **dbg_userword(struct kmem_cache *cachep, void *objp)
556 {
557 	BUG_ON(!(cachep->flags & SLAB_STORE_USER));
558 	return (void **)(objp + cachep->buffer_size - BYTES_PER_WORD);
559 }
560 
561 #else
562 
563 #define obj_offset(x)			0
564 #define obj_size(cachep)		(cachep->buffer_size)
565 #define dbg_redzone1(cachep, objp)	({BUG(); (unsigned long long *)NULL;})
566 #define dbg_redzone2(cachep, objp)	({BUG(); (unsigned long long *)NULL;})
567 #define dbg_userword(cachep, objp)	({BUG(); (void **)NULL;})
568 
569 #endif
570 
571 /*
572  * Do not go above this order unless 0 objects fit into the slab.
573  */
574 #define	BREAK_GFP_ORDER_HI	1
575 #define	BREAK_GFP_ORDER_LO	0
576 static int slab_break_gfp_order = BREAK_GFP_ORDER_LO;
577 
578 /*
579  * Functions for storing/retrieving the cachep and/or slab from the page
580  * allocator.  These are used to find the slab an obj belongs to.  With kfree(),
581  * these are used to find the cache to which an obj belongs.
582  */
583 static inline void page_set_cache(struct page *page, struct kmem_cache *cache)
584 {
585 	page->lru.next = (struct list_head *)cache;
586 }
587 
588 static inline struct kmem_cache *page_get_cache(struct page *page)
589 {
590 	page = compound_head(page);
591 	BUG_ON(!PageSlab(page));
592 	return (struct kmem_cache *)page->lru.next;
593 }
594 
595 static inline void page_set_slab(struct page *page, struct slab *slab)
596 {
597 	page->lru.prev = (struct list_head *)slab;
598 }
599 
600 static inline struct slab *page_get_slab(struct page *page)
601 {
602 	BUG_ON(!PageSlab(page));
603 	return (struct slab *)page->lru.prev;
604 }
605 
606 static inline struct kmem_cache *virt_to_cache(const void *obj)
607 {
608 	struct page *page = virt_to_head_page(obj);
609 	return page_get_cache(page);
610 }
611 
612 static inline struct slab *virt_to_slab(const void *obj)
613 {
614 	struct page *page = virt_to_head_page(obj);
615 	return page_get_slab(page);
616 }
617 
618 static inline void *index_to_obj(struct kmem_cache *cache, struct slab *slab,
619 				 unsigned int idx)
620 {
621 	return slab->s_mem + cache->buffer_size * idx;
622 }
623 
624 /*
625  * We want to avoid an expensive divide : (offset / cache->buffer_size)
626  *   Using the fact that buffer_size is a constant for a particular cache,
627  *   we can replace (offset / cache->buffer_size) by
628  *   reciprocal_divide(offset, cache->reciprocal_buffer_size)
629  */
630 static inline unsigned int obj_to_index(const struct kmem_cache *cache,
631 					const struct slab *slab, void *obj)
632 {
633 	u32 offset = (obj - slab->s_mem);
634 	return reciprocal_divide(offset, cache->reciprocal_buffer_size);
635 }
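/*
 * Worked example with hypothetical numbers: for a cache with
 * buffer_size == 256, an object starting 768 bytes past slab->s_mem maps to
 * index reciprocal_divide(768, reciprocal_value(256)) == 768 / 256 == 3.
 */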
636 
637 /*
638  * These are the default caches for kmalloc. Custom caches can have other sizes.
639  */
640 struct cache_sizes malloc_sizes[] = {
641 #define CACHE(x) { .cs_size = (x) },
642 #include <linux/kmalloc_sizes.h>
643 	CACHE(ULONG_MAX)
644 #undef CACHE
645 };
646 EXPORT_SYMBOL(malloc_sizes);
647 
648 /* Must match cache_sizes above. Out of line to keep cache footprint low. */
649 struct cache_names {
650 	char *name;
651 	char *name_dma;
652 };
653 
654 static struct cache_names __initdata cache_names[] = {
655 #define CACHE(x) { .name = "size-" #x, .name_dma = "size-" #x "(DMA)" },
656 #include <linux/kmalloc_sizes.h>
657 	{NULL,}
658 #undef CACHE
659 };
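/*
 * For instance, a CACHE(32) entry in kmalloc_sizes.h (if present for this
 * configuration) expands to { .cs_size = (32) }, in malloc_sizes[] above
 * and to { .name = "size-32", .name_dma = "size-32(DMA)" }, in
 * cache_names[].
 */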
660 
661 static struct arraycache_init initarray_cache __initdata =
662     { {0, BOOT_CPUCACHE_ENTRIES, 1, 0} };
663 static struct arraycache_init initarray_generic =
664     { {0, BOOT_CPUCACHE_ENTRIES, 1, 0} };
665 
666 /* internal cache of cache description objs */
667 static struct kmem_cache cache_cache = {
668 	.batchcount = 1,
669 	.limit = BOOT_CPUCACHE_ENTRIES,
670 	.shared = 1,
671 	.buffer_size = sizeof(struct kmem_cache),
672 	.name = "kmem_cache",
673 };
674 
675 #define BAD_ALIEN_MAGIC 0x01020304ul
676 
677 #ifdef CONFIG_LOCKDEP
678 
679 /*
680  * Slab sometimes uses the kmalloc slabs to store the slab headers
681  * for other slabs "off slab".
682  * The locking for this is tricky in that it nests within the locks
683  * of all other slabs in a few places; to deal with this special
684  * locking we put on-slab caches into a separate lock-class.
685  *
686  * We set lock class for alien array caches which are up during init.
687  * The lock annotation will be lost if all cpus of a node go down and
688  * then come back up during hotplug.
689  */
690 static struct lock_class_key on_slab_l3_key;
691 static struct lock_class_key on_slab_alc_key;
692 
693 static inline void init_lock_keys(void)
694 
695 {
696 	int q;
697 	struct cache_sizes *s = malloc_sizes;
698 
699 	while (s->cs_size != ULONG_MAX) {
700 		for_each_node(q) {
701 			struct array_cache **alc;
702 			int r;
703 			struct kmem_list3 *l3 = s->cs_cachep->nodelists[q];
704 			if (!l3 || OFF_SLAB(s->cs_cachep))
705 				continue;
706 			lockdep_set_class(&l3->list_lock, &on_slab_l3_key);
707 			alc = l3->alien;
708 			/*
709 			 * FIXME: This check for BAD_ALIEN_MAGIC
710 			 * should go away when common slab code is taught to
711 			 * work even without alien caches.
712 			 * Currently, non-NUMA code returns BAD_ALIEN_MAGIC
713 			 * for alloc_alien_cache().
714 			 */
715 			if (!alc || (unsigned long)alc == BAD_ALIEN_MAGIC)
716 				continue;
717 			for_each_node(r) {
718 				if (alc[r])
719 					lockdep_set_class(&alc[r]->lock,
720 					     &on_slab_alc_key);
721 			}
722 		}
723 		s++;
724 	}
725 }
726 #else
727 static inline void init_lock_keys(void)
728 {
729 }
730 #endif
731 
732 /*
733  * Guard access to the cache-chain.
734  */
735 static DEFINE_MUTEX(cache_chain_mutex);
736 static struct list_head cache_chain;
737 
738 /*
739  * chicken and egg problem: delay the per-cpu array allocation
740  * until the general caches are up.
741  */
742 static enum {
743 	NONE,
744 	PARTIAL_AC,
745 	PARTIAL_L3,
746 	FULL
747 } g_cpucache_up;
748 
749 /*
750  * used by boot code to determine if it can use slab based allocator
751  */
752 int slab_is_available(void)
753 {
754 	return g_cpucache_up == FULL;
755 }
756 
757 static DEFINE_PER_CPU(struct delayed_work, reap_work);
758 
759 static inline struct array_cache *cpu_cache_get(struct kmem_cache *cachep)
760 {
761 	return cachep->array[smp_processor_id()];
762 }
763 
764 static inline struct kmem_cache *__find_general_cachep(size_t size,
765 							gfp_t gfpflags)
766 {
767 	struct cache_sizes *csizep = malloc_sizes;
768 
769 #if DEBUG
770 	/* This happens if someone tries to call
771 	 * kmem_cache_create(), or __kmalloc(), before
772 	 * the generic caches are initialized.
773 	 */
774 	BUG_ON(malloc_sizes[INDEX_AC].cs_cachep == NULL);
775 #endif
776 	if (!size)
777 		return ZERO_SIZE_PTR;
778 
779 	while (size > csizep->cs_size)
780 		csizep++;
781 
782 	/*
783 	 * Really subtle: The last entry with cs->cs_size==ULONG_MAX
784 	 * has cs_{dma,}cachep==NULL. Thus no special case
785 	 * for large kmalloc calls is required.
786 	 */
787 #ifdef CONFIG_ZONE_DMA
788 	if (unlikely(gfpflags & GFP_DMA))
789 		return csizep->cs_dmacachep;
790 #endif
791 	return csizep->cs_cachep;
792 }
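/*
 * Example walk, assuming the size table begins 32, 64, 96, 128: a request
 * for 100 bytes stops at the size-128 entry; with GFP_DMA set (and
 * CONFIG_ZONE_DMA enabled) the matching cs_dmacachep is returned instead of
 * cs_cachep.
 */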
793 
794 static struct kmem_cache *kmem_find_general_cachep(size_t size, gfp_t gfpflags)
795 {
796 	return __find_general_cachep(size, gfpflags);
797 }
798 
799 static size_t slab_mgmt_size(size_t nr_objs, size_t align)
800 {
801 	return ALIGN(sizeof(struct slab)+nr_objs*sizeof(kmem_bufctl_t), align);
802 }
803 
804 /*
805  * Calculate the number of objects and left-over bytes for a given buffer size.
806  */
807 static void cache_estimate(unsigned long gfporder, size_t buffer_size,
808 			   size_t align, int flags, size_t *left_over,
809 			   unsigned int *num)
810 {
811 	int nr_objs;
812 	size_t mgmt_size;
813 	size_t slab_size = PAGE_SIZE << gfporder;
814 
815 	/*
816 	 * The slab management structure can be either off the slab or
817 	 * on it. For the latter case, the memory allocated for a
818 	 * slab is used for:
819 	 *
820 	 * - The struct slab
821 	 * - One kmem_bufctl_t for each object
822 	 * - Padding to respect alignment of @align
823 	 * - @buffer_size bytes for each object
824 	 *
825 	 * If the slab management structure is off the slab, then the
826 	 * alignment will already be calculated into the size. Because
827 	 * the slabs are all pages aligned, the objects will be at the
828 	 * correct alignment when allocated.
829 	 */
830 	if (flags & CFLGS_OFF_SLAB) {
831 		mgmt_size = 0;
832 		nr_objs = slab_size / buffer_size;
833 
834 		if (nr_objs > SLAB_LIMIT)
835 			nr_objs = SLAB_LIMIT;
836 	} else {
837 		/*
838 		 * Ignore padding for the initial guess. The padding
839 		 * is at most @align-1 bytes, and @buffer_size is at
840 		 * least @align. In the worst case, this result will
841 		 * be one greater than the number of objects that fit
842 		 * into the memory allocation when taking the padding
843 		 * into account.
844 		 */
845 		nr_objs = (slab_size - sizeof(struct slab)) /
846 			  (buffer_size + sizeof(kmem_bufctl_t));
847 
848 		/*
849 		 * This calculated number will be either the right
850 		 * amount, or one greater than what we want.
851 		 */
852 		if (slab_mgmt_size(nr_objs, align) + nr_objs*buffer_size
853 		       > slab_size)
854 			nr_objs--;
855 
856 		if (nr_objs > SLAB_LIMIT)
857 			nr_objs = SLAB_LIMIT;
858 
859 		mgmt_size = slab_mgmt_size(nr_objs, align);
860 	}
861 	*num = nr_objs;
862 	*left_over = slab_size - nr_objs*buffer_size - mgmt_size;
863 }
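/*
 * Worked example with hypothetical numbers: gfporder == 0 and
 * PAGE_SIZE == 4096 give slab_size == 4096.  For on-slab management with
 * buffer_size == 128 and align == 64, and assuming sizeof(struct slab) == 48
 * and sizeof(kmem_bufctl_t) == 4, the initial guess is
 * (4096 - 48) / (128 + 4) == 30 objects.  slab_mgmt_size(30, 64) ==
 * ALIGN(48 + 30*4, 64) == 192 and 192 + 30*128 == 4032 <= 4096, so the guess
 * stands: *num == 30 and *left_over == 4096 - 3840 - 192 == 64, which is
 * later used for cache colouring.
 */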
864 
865 #define slab_error(cachep, msg) __slab_error(__FUNCTION__, cachep, msg)
866 
867 static void __slab_error(const char *function, struct kmem_cache *cachep,
868 			char *msg)
869 {
870 	printk(KERN_ERR "slab error in %s(): cache `%s': %s\n",
871 	       function, cachep->name, msg);
872 	dump_stack();
873 }
874 
875 /*
876  * By default on NUMA we use alien caches to stage the freeing of
877  * objects allocated from other nodes. This causes massive memory
878  * inefficiencies when using a fake NUMA setup to split memory into a
879  * large number of small nodes, so it can be disabled on the command
880  * line.
881  */
882 
883 static int use_alien_caches __read_mostly = 1;
884 static int numa_platform __read_mostly = 1;
885 static int __init noaliencache_setup(char *s)
886 {
887 	use_alien_caches = 0;
888 	return 1;
889 }
890 __setup("noaliencache", noaliencache_setup);
891 
892 #ifdef CONFIG_NUMA
893 /*
894  * Special reaping functions for NUMA systems called from cache_reap().
895  * These take care of doing round robin flushing of alien caches (containing
896  * objects freed on different nodes from which they were allocated) and the
897  * flushing of remote pcps by calling drain_node_pages.
898  */
899 static DEFINE_PER_CPU(unsigned long, reap_node);
900 
901 static void init_reap_node(int cpu)
902 {
903 	int node;
904 
905 	node = next_node(cpu_to_node(cpu), node_online_map);
906 	if (node == MAX_NUMNODES)
907 		node = first_node(node_online_map);
908 
909 	per_cpu(reap_node, cpu) = node;
910 }
911 
912 static void next_reap_node(void)
913 {
914 	int node = __get_cpu_var(reap_node);
915 
916 	node = next_node(node, node_online_map);
917 	if (unlikely(node >= MAX_NUMNODES))
918 		node = first_node(node_online_map);
919 	__get_cpu_var(reap_node) = node;
920 }
921 
922 #else
923 #define init_reap_node(cpu) do { } while (0)
924 #define next_reap_node(void) do { } while (0)
925 #endif
926 
927 /*
928  * Initiate the reap timer running on the target CPU.  We run at around 1 to 2Hz
929  * via the workqueue/eventd.
930  * Add the CPU number into the expiration time to minimize the possibility of
931  * the CPUs getting into lockstep and contending for the global cache chain
932  * lock.
933  */
934 static void __cpuinit start_cpu_timer(int cpu)
935 {
936 	struct delayed_work *reap_work = &per_cpu(reap_work, cpu);
937 
938 	/*
939 	 * When this gets called from do_initcalls via cpucache_init(),
940 	 * init_workqueues() has already run, so keventd will be set up
941 	 * at that time.
942 	 */
943 	if (keventd_up() && reap_work->work.func == NULL) {
944 		init_reap_node(cpu);
945 		INIT_DELAYED_WORK(reap_work, cache_reap);
946 		schedule_delayed_work_on(cpu, reap_work,
947 					__round_jiffies_relative(HZ, cpu));
948 	}
949 }
950 
951 static struct array_cache *alloc_arraycache(int node, int entries,
952 					    int batchcount)
953 {
954 	int memsize = sizeof(void *) * entries + sizeof(struct array_cache);
955 	struct array_cache *nc = NULL;
956 
957 	nc = kmalloc_node(memsize, GFP_KERNEL, node);
958 	if (nc) {
959 		nc->avail = 0;
960 		nc->limit = entries;
961 		nc->batchcount = batchcount;
962 		nc->touched = 0;
963 		spin_lock_init(&nc->lock);
964 	}
965 	return nc;
966 }
967 
968 /*
969  * Transfer objects in one arraycache to another.
970  * Locking must be handled by the caller.
971  *
972  * Return the number of entries transferred.
973  */
974 static int transfer_objects(struct array_cache *to,
975 		struct array_cache *from, unsigned int max)
976 {
977 	/* Figure out how many entries to transfer */
978 	int nr = min(min(from->avail, max), to->limit - to->avail);
979 
980 	if (!nr)
981 		return 0;
982 
983 	memcpy(to->entry + to->avail, from->entry + from->avail - nr,
984 			sizeof(void *) * nr);
985 
986 	from->avail -= nr;
987 	to->avail += nr;
988 	to->touched = 1;
989 	return nr;
990 }
991 
992 #ifndef CONFIG_NUMA
993 
994 #define drain_alien_cache(cachep, alien) do { } while (0)
995 #define reap_alien(cachep, l3) do { } while (0)
996 
997 static inline struct array_cache **alloc_alien_cache(int node, int limit)
998 {
999 	return (struct array_cache **)BAD_ALIEN_MAGIC;
1000 }
1001 
1002 static inline void free_alien_cache(struct array_cache **ac_ptr)
1003 {
1004 }
1005 
1006 static inline int cache_free_alien(struct kmem_cache *cachep, void *objp)
1007 {
1008 	return 0;
1009 }
1010 
1011 static inline void *alternate_node_alloc(struct kmem_cache *cachep,
1012 		gfp_t flags)
1013 {
1014 	return NULL;
1015 }
1016 
1017 static inline void *____cache_alloc_node(struct kmem_cache *cachep,
1018 		 gfp_t flags, int nodeid)
1019 {
1020 	return NULL;
1021 }
1022 
1023 #else	/* CONFIG_NUMA */
1024 
1025 static void *____cache_alloc_node(struct kmem_cache *, gfp_t, int);
1026 static void *alternate_node_alloc(struct kmem_cache *, gfp_t);
1027 
1028 static struct array_cache **alloc_alien_cache(int node, int limit)
1029 {
1030 	struct array_cache **ac_ptr;
1031 	int memsize = sizeof(void *) * nr_node_ids;
1032 	int i;
1033 
1034 	if (limit > 1)
1035 		limit = 12;
1036 	ac_ptr = kmalloc_node(memsize, GFP_KERNEL, node);
1037 	if (ac_ptr) {
1038 		for_each_node(i) {
1039 			if (i == node || !node_online(i)) {
1040 				ac_ptr[i] = NULL;
1041 				continue;
1042 			}
1043 			ac_ptr[i] = alloc_arraycache(node, limit, 0xbaadf00d);
1044 			if (!ac_ptr[i]) {
1045 				for (i--; i >= 0; i--)
1046 					kfree(ac_ptr[i]);
1047 				kfree(ac_ptr);
1048 				return NULL;
1049 			}
1050 		}
1051 	}
1052 	return ac_ptr;
1053 }
1054 
1055 static void free_alien_cache(struct array_cache **ac_ptr)
1056 {
1057 	int i;
1058 
1059 	if (!ac_ptr)
1060 		return;
1061 	for_each_node(i)
1062 	    kfree(ac_ptr[i]);
1063 	kfree(ac_ptr);
1064 }
1065 
1066 static void __drain_alien_cache(struct kmem_cache *cachep,
1067 				struct array_cache *ac, int node)
1068 {
1069 	struct kmem_list3 *rl3 = cachep->nodelists[node];
1070 
1071 	if (ac->avail) {
1072 		spin_lock(&rl3->list_lock);
1073 		/*
1074 		 * Stuff objects into the remote node's shared array first.
1075 		 * That way we could avoid the overhead of putting the objects
1076 		 * into the free lists and getting them back later.
1077 		 */
1078 		if (rl3->shared)
1079 			transfer_objects(rl3->shared, ac, ac->limit);
1080 
1081 		free_block(cachep, ac->entry, ac->avail, node);
1082 		ac->avail = 0;
1083 		spin_unlock(&rl3->list_lock);
1084 	}
1085 }
1086 
1087 /*
1088  * Called from cache_reap() to regularly drain alien caches round robin.
1089  */
1090 static void reap_alien(struct kmem_cache *cachep, struct kmem_list3 *l3)
1091 {
1092 	int node = __get_cpu_var(reap_node);
1093 
1094 	if (l3->alien) {
1095 		struct array_cache *ac = l3->alien[node];
1096 
1097 		if (ac && ac->avail && spin_trylock_irq(&ac->lock)) {
1098 			__drain_alien_cache(cachep, ac, node);
1099 			spin_unlock_irq(&ac->lock);
1100 		}
1101 	}
1102 }
1103 
1104 static void drain_alien_cache(struct kmem_cache *cachep,
1105 				struct array_cache **alien)
1106 {
1107 	int i = 0;
1108 	struct array_cache *ac;
1109 	unsigned long flags;
1110 
1111 	for_each_online_node(i) {
1112 		ac = alien[i];
1113 		if (ac) {
1114 			spin_lock_irqsave(&ac->lock, flags);
1115 			__drain_alien_cache(cachep, ac, i);
1116 			spin_unlock_irqrestore(&ac->lock, flags);
1117 		}
1118 	}
1119 }
1120 
1121 static inline int cache_free_alien(struct kmem_cache *cachep, void *objp)
1122 {
1123 	struct slab *slabp = virt_to_slab(objp);
1124 	int nodeid = slabp->nodeid;
1125 	struct kmem_list3 *l3;
1126 	struct array_cache *alien = NULL;
1127 	int node;
1128 
1129 	node = numa_node_id();
1130 
1131 	/*
1132 	 * Make sure we are not freeing an object from another node to the array
1133 	 * cache on this cpu.
1134 	 */
1135 	if (likely(slabp->nodeid == node))
1136 		return 0;
1137 
1138 	l3 = cachep->nodelists[node];
1139 	STATS_INC_NODEFREES(cachep);
1140 	if (l3->alien && l3->alien[nodeid]) {
1141 		alien = l3->alien[nodeid];
1142 		spin_lock(&alien->lock);
1143 		if (unlikely(alien->avail == alien->limit)) {
1144 			STATS_INC_ACOVERFLOW(cachep);
1145 			__drain_alien_cache(cachep, alien, nodeid);
1146 		}
1147 		alien->entry[alien->avail++] = objp;
1148 		spin_unlock(&alien->lock);
1149 	} else {
1150 		spin_lock(&(cachep->nodelists[nodeid])->list_lock);
1151 		free_block(cachep, &objp, 1, nodeid);
1152 		spin_unlock(&(cachep->nodelists[nodeid])->list_lock);
1153 	}
1154 	return 1;
1155 }
1156 #endif
1157 
1158 static void __cpuinit cpuup_canceled(long cpu)
1159 {
1160 	struct kmem_cache *cachep;
1161 	struct kmem_list3 *l3 = NULL;
1162 	int node = cpu_to_node(cpu);
1163 
1164 	list_for_each_entry(cachep, &cache_chain, next) {
1165 		struct array_cache *nc;
1166 		struct array_cache *shared;
1167 		struct array_cache **alien;
1168 		cpumask_t mask;
1169 
1170 		mask = node_to_cpumask(node);
1171 		/* cpu is dead; no one can alloc from it. */
1172 		nc = cachep->array[cpu];
1173 		cachep->array[cpu] = NULL;
1174 		l3 = cachep->nodelists[node];
1175 
1176 		if (!l3)
1177 			goto free_array_cache;
1178 
1179 		spin_lock_irq(&l3->list_lock);
1180 
1181 		/* Free limit for this kmem_list3 */
1182 		l3->free_limit -= cachep->batchcount;
1183 		if (nc)
1184 			free_block(cachep, nc->entry, nc->avail, node);
1185 
1186 		if (!cpus_empty(mask)) {
1187 			spin_unlock_irq(&l3->list_lock);
1188 			goto free_array_cache;
1189 		}
1190 
1191 		shared = l3->shared;
1192 		if (shared) {
1193 			free_block(cachep, shared->entry,
1194 				   shared->avail, node);
1195 			l3->shared = NULL;
1196 		}
1197 
1198 		alien = l3->alien;
1199 		l3->alien = NULL;
1200 
1201 		spin_unlock_irq(&l3->list_lock);
1202 
1203 		kfree(shared);
1204 		if (alien) {
1205 			drain_alien_cache(cachep, alien);
1206 			free_alien_cache(alien);
1207 		}
1208 free_array_cache:
1209 		kfree(nc);
1210 	}
1211 	/*
1212 	 * In the previous loop, all the objects were freed to
1213 	 * the respective cache's slabs; now we can go ahead and
1214 	 * shrink each nodelist to its limit.
1215 	 */
1216 	list_for_each_entry(cachep, &cache_chain, next) {
1217 		l3 = cachep->nodelists[node];
1218 		if (!l3)
1219 			continue;
1220 		drain_freelist(cachep, l3, l3->free_objects);
1221 	}
1222 }
1223 
1224 static int __cpuinit cpuup_prepare(long cpu)
1225 {
1226 	struct kmem_cache *cachep;
1227 	struct kmem_list3 *l3 = NULL;
1228 	int node = cpu_to_node(cpu);
1229 	const int memsize = sizeof(struct kmem_list3);
1230 
1231 	/*
1232 	 * We need to do this right in the beginning since
1233 	 * the alloc_arraycache() calls are going to use this list.
1234 	 * kmalloc_node allows us to add the slab to the right
1235 	 * kmem_list3 and not this cpu's kmem_list3.
1236 	 */
1237 
1238 	list_for_each_entry(cachep, &cache_chain, next) {
1239 		/*
1240 		 * Set up the kmem_list3 for this cpu's node before we can
1241 		 * begin anything. Make sure some other cpu on this
1242 		 * node has not already allocated this.
1243 		 */
1244 		if (!cachep->nodelists[node]) {
1245 			l3 = kmalloc_node(memsize, GFP_KERNEL, node);
1246 			if (!l3)
1247 				goto bad;
1248 			kmem_list3_init(l3);
1249 			l3->next_reap = jiffies + REAPTIMEOUT_LIST3 +
1250 			    ((unsigned long)cachep) % REAPTIMEOUT_LIST3;
1251 
1252 			/*
1253 			 * The l3s don't come and go as CPUs come and
1254 			 * go.  cache_chain_mutex is sufficient
1255 			 * protection here.
1256 			 */
1257 			cachep->nodelists[node] = l3;
1258 		}
1259 
1260 		spin_lock_irq(&cachep->nodelists[node]->list_lock);
1261 		cachep->nodelists[node]->free_limit =
1262 			(1 + nr_cpus_node(node)) *
1263 			cachep->batchcount + cachep->num;
1264 		spin_unlock_irq(&cachep->nodelists[node]->list_lock);
1265 	}
1266 
1267 	/*
1268 	 * Now we can go ahead with allocating the shared arrays and
1269 	 * array caches
1270 	 */
1271 	list_for_each_entry(cachep, &cache_chain, next) {
1272 		struct array_cache *nc;
1273 		struct array_cache *shared = NULL;
1274 		struct array_cache **alien = NULL;
1275 
1276 		nc = alloc_arraycache(node, cachep->limit,
1277 					cachep->batchcount);
1278 		if (!nc)
1279 			goto bad;
1280 		if (cachep->shared) {
1281 			shared = alloc_arraycache(node,
1282 				cachep->shared * cachep->batchcount,
1283 				0xbaadf00d);
1284 			if (!shared) {
1285 				kfree(nc);
1286 				goto bad;
1287 			}
1288 		}
1289 		if (use_alien_caches) {
1290 			alien = alloc_alien_cache(node, cachep->limit);
1291 			if (!alien) {
1292 				kfree(shared);
1293 				kfree(nc);
1294 				goto bad;
1295 			}
1296 		}
1297 		cachep->array[cpu] = nc;
1298 		l3 = cachep->nodelists[node];
1299 		BUG_ON(!l3);
1300 
1301 		spin_lock_irq(&l3->list_lock);
1302 		if (!l3->shared) {
1303 			/*
1304 			 * We are serialised from CPU_DEAD or
1305 			 * CPU_UP_CANCELLED by the cpucontrol lock
1306 			 */
1307 			l3->shared = shared;
1308 			shared = NULL;
1309 		}
1310 #ifdef CONFIG_NUMA
1311 		if (!l3->alien) {
1312 			l3->alien = alien;
1313 			alien = NULL;
1314 		}
1315 #endif
1316 		spin_unlock_irq(&l3->list_lock);
1317 		kfree(shared);
1318 		free_alien_cache(alien);
1319 	}
1320 	return 0;
1321 bad:
1322 	cpuup_canceled(cpu);
1323 	return -ENOMEM;
1324 }
1325 
1326 static int __cpuinit cpuup_callback(struct notifier_block *nfb,
1327 				    unsigned long action, void *hcpu)
1328 {
1329 	long cpu = (long)hcpu;
1330 	int err = 0;
1331 
1332 	switch (action) {
1333 	case CPU_UP_PREPARE:
1334 	case CPU_UP_PREPARE_FROZEN:
1335 		mutex_lock(&cache_chain_mutex);
1336 		err = cpuup_prepare(cpu);
1337 		mutex_unlock(&cache_chain_mutex);
1338 		break;
1339 	case CPU_ONLINE:
1340 	case CPU_ONLINE_FROZEN:
1341 		start_cpu_timer(cpu);
1342 		break;
1343 #ifdef CONFIG_HOTPLUG_CPU
1344 	case CPU_DOWN_PREPARE:
1345 	case CPU_DOWN_PREPARE_FROZEN:
1346 		/*
1347 		 * Shutdown cache reaper. Note that the cache_chain_mutex is
1348 		 * held so that if cache_reap() is invoked it cannot do
1349 		 * anything expensive but will only modify reap_work
1350 		 * and reschedule the timer.
1351 		 */
1352 		cancel_rearming_delayed_work(&per_cpu(reap_work, cpu));
1353 		/* Now the cache_reaper is guaranteed to be not running. */
1354 		per_cpu(reap_work, cpu).work.func = NULL;
1355 		break;
1356 	case CPU_DOWN_FAILED:
1357 	case CPU_DOWN_FAILED_FROZEN:
1358 		start_cpu_timer(cpu);
1359 		break;
1360 	case CPU_DEAD:
1361 	case CPU_DEAD_FROZEN:
1362 		/*
1363 		 * Even if all the cpus of a node are down, we don't free the
1364 		 * kmem_list3 of any cache. This is to avoid a race between
1365 		 * cpu_down and a kmalloc allocation from another cpu for
1366 		 * memory from the node of the cpu going down.  The list3
1367 		 * structure is usually allocated from kmem_cache_create() and
1368 		 * gets destroyed at kmem_cache_destroy().
1369 		 */
1370 		/* fall through */
1371 #endif
1372 	case CPU_UP_CANCELED:
1373 	case CPU_UP_CANCELED_FROZEN:
1374 		mutex_lock(&cache_chain_mutex);
1375 		cpuup_canceled(cpu);
1376 		mutex_unlock(&cache_chain_mutex);
1377 		break;
1378 	}
1379 	return err ? NOTIFY_BAD : NOTIFY_OK;
1380 }
1381 
1382 static struct notifier_block __cpuinitdata cpucache_notifier = {
1383 	&cpuup_callback, NULL, 0
1384 };
1385 
1386 /*
1387  * swap the static kmem_list3 with kmalloced memory
1388  */
1389 static void init_list(struct kmem_cache *cachep, struct kmem_list3 *list,
1390 			int nodeid)
1391 {
1392 	struct kmem_list3 *ptr;
1393 
1394 	ptr = kmalloc_node(sizeof(struct kmem_list3), GFP_KERNEL, nodeid);
1395 	BUG_ON(!ptr);
1396 
1397 	local_irq_disable();
1398 	memcpy(ptr, list, sizeof(struct kmem_list3));
1399 	/*
1400 	 * Do not assume that spinlocks can be initialized via memcpy:
1401 	 */
1402 	spin_lock_init(&ptr->list_lock);
1403 
1404 	MAKE_ALL_LISTS(cachep, ptr, nodeid);
1405 	cachep->nodelists[nodeid] = ptr;
1406 	local_irq_enable();
1407 }
1408 
1409 /*
1410  * For setting up all the kmem_list3s for a cache whose buffer_size is the
1411  * same as the size of kmem_list3.
1412  */
1413 static void __init set_up_list3s(struct kmem_cache *cachep, int index)
1414 {
1415 	int node;
1416 
1417 	for_each_online_node(node) {
1418 		cachep->nodelists[node] = &initkmem_list3[index + node];
1419 		cachep->nodelists[node]->next_reap = jiffies +
1420 		    REAPTIMEOUT_LIST3 +
1421 		    ((unsigned long)cachep) % REAPTIMEOUT_LIST3;
1422 	}
1423 }
1424 
1425 /*
1426  * Initialisation.  Called after the page allocator has been initialised and
1427  * before smp_init().
1428  */
1429 void __init kmem_cache_init(void)
1430 {
1431 	size_t left_over;
1432 	struct cache_sizes *sizes;
1433 	struct cache_names *names;
1434 	int i;
1435 	int order;
1436 	int node;
1437 
1438 	if (num_possible_nodes() == 1) {
1439 		use_alien_caches = 0;
1440 		numa_platform = 0;
1441 	}
1442 
1443 	for (i = 0; i < NUM_INIT_LISTS; i++) {
1444 		kmem_list3_init(&initkmem_list3[i]);
1445 		if (i < MAX_NUMNODES)
1446 			cache_cache.nodelists[i] = NULL;
1447 	}
1448 	set_up_list3s(&cache_cache, CACHE_CACHE);
1449 
1450 	/*
1451 	 * Fragmentation resistance on low memory - only use bigger
1452 	 * page orders on machines with more than 32MB of memory.
1453 	 */
1454 	if (num_physpages > (32 << 20) >> PAGE_SHIFT)
1455 		slab_break_gfp_order = BREAK_GFP_ORDER_HI;
1456 
1457 	/* Bootstrap is tricky, because several objects are allocated
1458 	 * from caches that do not exist yet:
1459 	 * 1) initialize the cache_cache cache: it contains the struct
1460 	 *    kmem_cache structures of all caches, except cache_cache itself:
1461 	 *    cache_cache is statically allocated.
1462 	 *    Initially an __init data area is used for the head array and the
1463 	 *    kmem_list3 structures, it's replaced with a kmalloc allocated
1464 	 *    array at the end of the bootstrap.
1465 	 * 2) Create the first kmalloc cache.
1466 	 *    The struct kmem_cache for the new cache is allocated normally.
1467 	 *    An __init data area is used for the head array.
1468 	 * 3) Create the remaining kmalloc caches, with minimally sized
1469 	 *    head arrays.
1470 	 * 4) Replace the __init data head arrays for cache_cache and the first
1471 	 *    kmalloc cache with kmalloc allocated arrays.
1472 	 * 5) Replace the __init data for kmem_list3 for cache_cache and
1473 	 *    the other caches with kmalloc allocated memory.
1474 	 * 6) Resize the head arrays of the kmalloc caches to their final sizes.
1475 	 */
1476 
1477 	node = numa_node_id();
1478 
1479 	/* 1) create the cache_cache */
1480 	INIT_LIST_HEAD(&cache_chain);
1481 	list_add(&cache_cache.next, &cache_chain);
1482 	cache_cache.colour_off = cache_line_size();
1483 	cache_cache.array[smp_processor_id()] = &initarray_cache.cache;
1484 	cache_cache.nodelists[node] = &initkmem_list3[CACHE_CACHE];
1485 
1486 	/*
1487 	 * struct kmem_cache size depends on nr_node_ids, which
1488 	 * can be less than MAX_NUMNODES.
1489 	 */
1490 	cache_cache.buffer_size = offsetof(struct kmem_cache, nodelists) +
1491 				 nr_node_ids * sizeof(struct kmem_list3 *);
1492 #if DEBUG
1493 	cache_cache.obj_size = cache_cache.buffer_size;
1494 #endif
1495 	cache_cache.buffer_size = ALIGN(cache_cache.buffer_size,
1496 					cache_line_size());
1497 	cache_cache.reciprocal_buffer_size =
1498 		reciprocal_value(cache_cache.buffer_size);
1499 
1500 	for (order = 0; order < MAX_ORDER; order++) {
1501 		cache_estimate(order, cache_cache.buffer_size,
1502 			cache_line_size(), 0, &left_over, &cache_cache.num);
1503 		if (cache_cache.num)
1504 			break;
1505 	}
1506 	BUG_ON(!cache_cache.num);
1507 	cache_cache.gfporder = order;
1508 	cache_cache.colour = left_over / cache_cache.colour_off;
1509 	cache_cache.slab_size = ALIGN(cache_cache.num * sizeof(kmem_bufctl_t) +
1510 				      sizeof(struct slab), cache_line_size());
1511 
1512 	/* 2+3) create the kmalloc caches */
1513 	sizes = malloc_sizes;
1514 	names = cache_names;
1515 
1516 	/*
1517 	 * Initialize the caches that provide memory for the array cache and the
1518 	 * kmem_list3 structures first.  Without this, further allocations will
1519 	 * bug.
1520 	 */
1521 
1522 	sizes[INDEX_AC].cs_cachep = kmem_cache_create(names[INDEX_AC].name,
1523 					sizes[INDEX_AC].cs_size,
1524 					ARCH_KMALLOC_MINALIGN,
1525 					ARCH_KMALLOC_FLAGS|SLAB_PANIC,
1526 					NULL);
1527 
1528 	if (INDEX_AC != INDEX_L3) {
1529 		sizes[INDEX_L3].cs_cachep =
1530 			kmem_cache_create(names[INDEX_L3].name,
1531 				sizes[INDEX_L3].cs_size,
1532 				ARCH_KMALLOC_MINALIGN,
1533 				ARCH_KMALLOC_FLAGS|SLAB_PANIC,
1534 				NULL);
1535 	}
1536 
1537 	slab_early_init = 0;
1538 
1539 	while (sizes->cs_size != ULONG_MAX) {
1540 		/*
1541 		 * For performance, all the general caches are L1 aligned.
1542 		 * This should be particularly beneficial on SMP boxes, as it
1543 		 * eliminates "false sharing".
1544 		 * Note: for systems short on memory, removing the alignment will
1545 		 * allow tighter packing of the smaller caches.
1546 		 */
1547 		if (!sizes->cs_cachep) {
1548 			sizes->cs_cachep = kmem_cache_create(names->name,
1549 					sizes->cs_size,
1550 					ARCH_KMALLOC_MINALIGN,
1551 					ARCH_KMALLOC_FLAGS|SLAB_PANIC,
1552 					NULL);
1553 		}
1554 #ifdef CONFIG_ZONE_DMA
1555 		sizes->cs_dmacachep = kmem_cache_create(
1556 					names->name_dma,
1557 					sizes->cs_size,
1558 					ARCH_KMALLOC_MINALIGN,
1559 					ARCH_KMALLOC_FLAGS|SLAB_CACHE_DMA|
1560 						SLAB_PANIC,
1561 					NULL);
1562 #endif
1563 		sizes++;
1564 		names++;
1565 	}
1566 	/* 4) Replace the bootstrap head arrays */
1567 	{
1568 		struct array_cache *ptr;
1569 
1570 		ptr = kmalloc(sizeof(struct arraycache_init), GFP_KERNEL);
1571 
1572 		local_irq_disable();
1573 		BUG_ON(cpu_cache_get(&cache_cache) != &initarray_cache.cache);
1574 		memcpy(ptr, cpu_cache_get(&cache_cache),
1575 		       sizeof(struct arraycache_init));
1576 		/*
1577 		 * Do not assume that spinlocks can be initialized via memcpy:
1578 		 */
1579 		spin_lock_init(&ptr->lock);
1580 
1581 		cache_cache.array[smp_processor_id()] = ptr;
1582 		local_irq_enable();
1583 
1584 		ptr = kmalloc(sizeof(struct arraycache_init), GFP_KERNEL);
1585 
1586 		local_irq_disable();
1587 		BUG_ON(cpu_cache_get(malloc_sizes[INDEX_AC].cs_cachep)
1588 		       != &initarray_generic.cache);
1589 		memcpy(ptr, cpu_cache_get(malloc_sizes[INDEX_AC].cs_cachep),
1590 		       sizeof(struct arraycache_init));
1591 		/*
1592 		 * Do not assume that spinlocks can be initialized via memcpy:
1593 		 */
1594 		spin_lock_init(&ptr->lock);
1595 
1596 		malloc_sizes[INDEX_AC].cs_cachep->array[smp_processor_id()] =
1597 		    ptr;
1598 		local_irq_enable();
1599 	}
1600 	/* 5) Replace the bootstrap kmem_list3's */
1601 	{
1602 		int nid;
1603 
1604 		for_each_online_node(nid) {
1605 			init_list(&cache_cache, &initkmem_list3[CACHE_CACHE], nid);
1606 
1607 			init_list(malloc_sizes[INDEX_AC].cs_cachep,
1608 				  &initkmem_list3[SIZE_AC + nid], nid);
1609 
1610 			if (INDEX_AC != INDEX_L3) {
1611 				init_list(malloc_sizes[INDEX_L3].cs_cachep,
1612 					  &initkmem_list3[SIZE_L3 + nid], nid);
1613 			}
1614 		}
1615 	}
1616 
1617 	/* 6) resize the head arrays to their final sizes */
1618 	{
1619 		struct kmem_cache *cachep;
1620 		mutex_lock(&cache_chain_mutex);
1621 		list_for_each_entry(cachep, &cache_chain, next)
1622 			if (enable_cpucache(cachep))
1623 				BUG();
1624 		mutex_unlock(&cache_chain_mutex);
1625 	}
1626 
1627 	/* Annotate slab for lockdep -- annotate the malloc caches */
1628 	init_lock_keys();
1629 
1630 
1631 	/* Done! */
1632 	g_cpucache_up = FULL;
1633 
1634 	/*
1635 	 * Register a cpu startup notifier callback that initializes
1636 	 * cpu_cache_get for all new cpus
1637 	 */
1638 	register_cpu_notifier(&cpucache_notifier);
1639 
1640 	/*
1641 	 * The reap timers are started later, with a module init call; that part
1642 	 * of the kernel is not yet operational.
1643 	 */
1644 }
1645 
1646 static int __init cpucache_init(void)
1647 {
1648 	int cpu;
1649 
1650 	/*
1651 	 * Register the timers that return unneeded pages to the page allocator
1652 	 */
1653 	for_each_online_cpu(cpu)
1654 		start_cpu_timer(cpu);
1655 	return 0;
1656 }
1657 __initcall(cpucache_init);
1658 
1659 /*
1660  * Interface to system's page allocator. No need to hold the cache-lock.
1661  *
1662  * If we requested dmaable memory, we will get it. Even if we
1663  * did not request dmaable memory, we might get it, but that
1664  * would be relatively rare and ignorable.
1665  */
1666 static void *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, int nodeid)
1667 {
1668 	struct page *page;
1669 	int nr_pages;
1670 	int i;
1671 
1672 #ifndef CONFIG_MMU
1673 	/*
1674 	 * Nommu uses slabs for process anonymous memory allocations, and thus
1675 	 * requires __GFP_COMP to properly refcount higher order allocations.
1676 	 */
1677 	flags |= __GFP_COMP;
1678 #endif
1679 
1680 	flags |= cachep->gfpflags;
1681 	if (cachep->flags & SLAB_RECLAIM_ACCOUNT)
1682 		flags |= __GFP_RECLAIMABLE;
1683 
1684 	page = alloc_pages_node(nodeid, flags, cachep->gfporder);
1685 	if (!page)
1686 		return NULL;
1687 
1688 	nr_pages = (1 << cachep->gfporder);
1689 	if (cachep->flags & SLAB_RECLAIM_ACCOUNT)
1690 		add_zone_page_state(page_zone(page),
1691 			NR_SLAB_RECLAIMABLE, nr_pages);
1692 	else
1693 		add_zone_page_state(page_zone(page),
1694 			NR_SLAB_UNRECLAIMABLE, nr_pages);
1695 	for (i = 0; i < nr_pages; i++)
1696 		__SetPageSlab(page + i);
1697 	return page_address(page);
1698 }
1699 
1700 /*
1701  * Interface to system's page release.
1702  */
1703 static void kmem_freepages(struct kmem_cache *cachep, void *addr)
1704 {
1705 	unsigned long i = (1 << cachep->gfporder);
1706 	struct page *page = virt_to_page(addr);
1707 	const unsigned long nr_freed = i;
1708 
1709 	if (cachep->flags & SLAB_RECLAIM_ACCOUNT)
1710 		sub_zone_page_state(page_zone(page),
1711 				NR_SLAB_RECLAIMABLE, nr_freed);
1712 	else
1713 		sub_zone_page_state(page_zone(page),
1714 				NR_SLAB_UNRECLAIMABLE, nr_freed);
1715 	while (i--) {
1716 		BUG_ON(!PageSlab(page));
1717 		__ClearPageSlab(page);
1718 		page++;
1719 	}
1720 	if (current->reclaim_state)
1721 		current->reclaim_state->reclaimed_slab += nr_freed;
1722 	free_pages((unsigned long)addr, cachep->gfporder);
1723 }
1724 
1725 static void kmem_rcu_free(struct rcu_head *head)
1726 {
1727 	struct slab_rcu *slab_rcu = (struct slab_rcu *)head;
1728 	struct kmem_cache *cachep = slab_rcu->cachep;
1729 
1730 	kmem_freepages(cachep, slab_rcu->addr);
1731 	if (OFF_SLAB(cachep))
1732 		kmem_cache_free(cachep->slabp_cache, slab_rcu);
1733 }
1734 
1735 #if DEBUG
1736 
1737 #ifdef CONFIG_DEBUG_PAGEALLOC
1738 static void store_stackinfo(struct kmem_cache *cachep, unsigned long *addr,
1739 			    unsigned long caller)
1740 {
1741 	int size = obj_size(cachep);
1742 
1743 	addr = (unsigned long *)&((char *)addr)[obj_offset(cachep)];
1744 
1745 	if (size < 5 * sizeof(unsigned long))
1746 		return;
1747 
1748 	*addr++ = 0x12345678;
1749 	*addr++ = caller;
1750 	*addr++ = smp_processor_id();
1751 	size -= 3 * sizeof(unsigned long);
1752 	{
1753 		unsigned long *sptr = &caller;
1754 		unsigned long svalue;
1755 
1756 		while (!kstack_end(sptr)) {
1757 			svalue = *sptr++;
1758 			if (kernel_text_address(svalue)) {
1759 				*addr++ = svalue;
1760 				size -= sizeof(unsigned long);
1761 				if (size <= sizeof(unsigned long))
1762 					break;
1763 			}
1764 		}
1765 
1766 	}
1767 	*addr++ = 0x87654321;
1768 }
1769 #endif
1770 
1771 static void poison_obj(struct kmem_cache *cachep, void *addr, unsigned char val)
1772 {
1773 	int size = obj_size(cachep);
1774 	addr = &((char *)addr)[obj_offset(cachep)];
1775 
1776 	memset(addr, val, size);
1777 	*(unsigned char *)(addr + size - 1) = POISON_END;
1778 }
1779 
1780 static void dump_line(char *data, int offset, int limit)
1781 {
1782 	int i;
1783 	unsigned char error = 0;
1784 	int bad_count = 0;
1785 
1786 	printk(KERN_ERR "%03x:", offset);
1787 	for (i = 0; i < limit; i++) {
1788 		if (data[offset + i] != POISON_FREE) {
1789 			error = data[offset + i];
1790 			bad_count++;
1791 		}
1792 		printk(" %02x", (unsigned char)data[offset + i]);
1793 	}
1794 	printk("\n");
1795 
1796 	if (bad_count == 1) {
1797 		error ^= POISON_FREE;
1798 		if (!(error & (error - 1))) {
1799 			printk(KERN_ERR "Single bit error detected. Probably "
1800 					"bad RAM.\n");
1801 #ifdef CONFIG_X86
1802 			printk(KERN_ERR "Run memtest86+ or a similar memory "
1803 					"test tool.\n");
1804 #else
1805 			printk(KERN_ERR "Run a memory test tool.\n");
1806 #endif
1807 		}
1808 	}
1809 }
1810 #endif
1811 
1812 #if DEBUG
1813 
1814 static void print_objinfo(struct kmem_cache *cachep, void *objp, int lines)
1815 {
1816 	int i, size;
1817 	char *realobj;
1818 
1819 	if (cachep->flags & SLAB_RED_ZONE) {
1820 		printk(KERN_ERR "Redzone: 0x%llx/0x%llx.\n",
1821 			*dbg_redzone1(cachep, objp),
1822 			*dbg_redzone2(cachep, objp));
1823 	}
1824 
1825 	if (cachep->flags & SLAB_STORE_USER) {
1826 		printk(KERN_ERR "Last user: [<%p>]",
1827 			*dbg_userword(cachep, objp));
1828 		print_symbol("(%s)",
1829 				(unsigned long)*dbg_userword(cachep, objp));
1830 		printk("\n");
1831 	}
1832 	realobj = (char *)objp + obj_offset(cachep);
1833 	size = obj_size(cachep);
1834 	for (i = 0; i < size && lines; i += 16, lines--) {
1835 		int limit;
1836 		limit = 16;
1837 		if (i + limit > size)
1838 			limit = size - i;
1839 		dump_line(realobj, i, limit);
1840 	}
1841 }
1842 
1843 static void check_poison_obj(struct kmem_cache *cachep, void *objp)
1844 {
1845 	char *realobj;
1846 	int size, i;
1847 	int lines = 0;
1848 
1849 	realobj = (char *)objp + obj_offset(cachep);
1850 	size = obj_size(cachep);
1851 
1852 	for (i = 0; i < size; i++) {
1853 		char exp = POISON_FREE;
1854 		if (i == size - 1)
1855 			exp = POISON_END;
1856 		if (realobj[i] != exp) {
1857 			int limit;
1858 			/* Mismatch ! */
1859 			/* Print header */
1860 			if (lines == 0) {
1861 				printk(KERN_ERR
1862 					"Slab corruption: %s start=%p, len=%d\n",
1863 					cachep->name, realobj, size);
1864 				print_objinfo(cachep, objp, 0);
1865 			}
1866 			/* Hexdump the affected line */
1867 			i = (i / 16) * 16;
1868 			limit = 16;
1869 			if (i + limit > size)
1870 				limit = size - i;
1871 			dump_line(realobj, i, limit);
1872 			i += 16;
1873 			lines++;
1874 			/* Limit to 5 lines */
1875 			if (lines > 5)
1876 				break;
1877 		}
1878 	}
1879 	if (lines != 0) {
1880 		/* Print some data about the neighboring objects, if they
1881 		 * exist:
1882 		 */
1883 		struct slab *slabp = virt_to_slab(objp);
1884 		unsigned int objnr;
1885 
1886 		objnr = obj_to_index(cachep, slabp, objp);
1887 		if (objnr) {
1888 			objp = index_to_obj(cachep, slabp, objnr - 1);
1889 			realobj = (char *)objp + obj_offset(cachep);
1890 			printk(KERN_ERR "Prev obj: start=%p, len=%d\n",
1891 			       realobj, size);
1892 			print_objinfo(cachep, objp, 2);
1893 		}
1894 		if (objnr + 1 < cachep->num) {
1895 			objp = index_to_obj(cachep, slabp, objnr + 1);
1896 			realobj = (char *)objp + obj_offset(cachep);
1897 			printk(KERN_ERR "Next obj: start=%p, len=%d\n",
1898 			       realobj, size);
1899 			print_objinfo(cachep, objp, 2);
1900 		}
1901 	}
1902 }
1903 #endif
1904 
1905 #if DEBUG
1906 /**
1907  * slab_destroy_objs - destroy a slab and its objects
1908  * @cachep: cache pointer being destroyed
1909  * @slabp: slab pointer being destroyed
1910  *
1911  * Perform the debug checks (poison and red-zone verification) on each
1912  * object in a slab that is being destroyed.
1913  */
1914 static void slab_destroy_objs(struct kmem_cache *cachep, struct slab *slabp)
1915 {
1916 	int i;
1917 	for (i = 0; i < cachep->num; i++) {
1918 		void *objp = index_to_obj(cachep, slabp, i);
1919 
1920 		if (cachep->flags & SLAB_POISON) {
1921 #ifdef CONFIG_DEBUG_PAGEALLOC
1922 			if (cachep->buffer_size % PAGE_SIZE == 0 &&
1923 					OFF_SLAB(cachep))
1924 				kernel_map_pages(virt_to_page(objp),
1925 					cachep->buffer_size / PAGE_SIZE, 1);
1926 			else
1927 				check_poison_obj(cachep, objp);
1928 #else
1929 			check_poison_obj(cachep, objp);
1930 #endif
1931 		}
1932 		if (cachep->flags & SLAB_RED_ZONE) {
1933 			if (*dbg_redzone1(cachep, objp) != RED_INACTIVE)
1934 				slab_error(cachep, "start of a freed object "
1935 					   "was overwritten");
1936 			if (*dbg_redzone2(cachep, objp) != RED_INACTIVE)
1937 				slab_error(cachep, "end of a freed object "
1938 					   "was overwritten");
1939 		}
1940 	}
1941 }
1942 #else
1943 static void slab_destroy_objs(struct kmem_cache *cachep, struct slab *slabp)
1944 {
1945 }
1946 #endif
1947 
1948 /**
1949  * slab_destroy - destroy and release all objects in a slab
1950  * @cachep: cache pointer being destroyed
1951  * @slabp: slab pointer being destroyed
1952  *
1953  * Destroy all the objs in a slab, and release the mem back to the system.
1954  * Before calling the slab must have been unlinked from the cache.  The
1955  * cache-lock is not held/needed.
1956  */
1957 static void slab_destroy(struct kmem_cache *cachep, struct slab *slabp)
1958 {
1959 	void *addr = slabp->s_mem - slabp->colouroff;
1960 
1961 	slab_destroy_objs(cachep, slabp);
1962 	if (unlikely(cachep->flags & SLAB_DESTROY_BY_RCU)) {
1963 		struct slab_rcu *slab_rcu;
1964 
1965 		slab_rcu = (struct slab_rcu *)slabp;
1966 		slab_rcu->cachep = cachep;
1967 		slab_rcu->addr = addr;
1968 		call_rcu(&slab_rcu->head, kmem_rcu_free);
1969 	} else {
1970 		kmem_freepages(cachep, addr);
1971 		if (OFF_SLAB(cachep))
1972 			kmem_cache_free(cachep->slabp_cache, slabp);
1973 	}
1974 }
1975 
1976 static void __kmem_cache_destroy(struct kmem_cache *cachep)
1977 {
1978 	int i;
1979 	struct kmem_list3 *l3;
1980 
1981 	for_each_online_cpu(i)
1982 	    kfree(cachep->array[i]);
1983 
1984 	/* NUMA: free the list3 structures */
1985 	for_each_online_node(i) {
1986 		l3 = cachep->nodelists[i];
1987 		if (l3) {
1988 			kfree(l3->shared);
1989 			free_alien_cache(l3->alien);
1990 			kfree(l3);
1991 		}
1992 	}
1993 	kmem_cache_free(&cache_cache, cachep);
1994 }
1995 
1996 
1997 /**
1998  * calculate_slab_order - calculate size (page order) of slabs
1999  * @cachep: pointer to the cache that is being created
2000  * @size: size of objects to be created in this cache.
2001  * @align: required alignment for the objects.
2002  * @flags: slab allocation flags
2003  *
2004  * Also calculates the number of objects per slab.
2005  *
2006  * This could be made much more intelligent.  For now, try to avoid using
2007  * high order pages for slabs.  When the gfp() functions are more friendly
2008  * towards high-order requests, this should be changed.
2009  */
2010 static size_t calculate_slab_order(struct kmem_cache *cachep,
2011 			size_t size, size_t align, unsigned long flags)
2012 {
2013 	unsigned long offslab_limit;
2014 	size_t left_over = 0;
2015 	int gfporder;
2016 
2017 	for (gfporder = 0; gfporder <= KMALLOC_MAX_ORDER; gfporder++) {
2018 		unsigned int num;
2019 		size_t remainder;
2020 
2021 		cache_estimate(gfporder, size, align, flags, &remainder, &num);
2022 		if (!num)
2023 			continue;
2024 
2025 		if (flags & CFLGS_OFF_SLAB) {
2026 			/*
2027 			 * Max number of objs-per-slab for caches which
2028 			 * use off-slab slabs. Needed to avoid a possible
2029 			 * looping condition in cache_grow().
2030 			 */
2031 			offslab_limit = size - sizeof(struct slab);
2032 			offslab_limit /= sizeof(kmem_bufctl_t);
2033 
2034 			if (num > offslab_limit)
2035 				break;
2036 		}
2037 
2038 		/* Found something acceptable - save it away */
2039 		cachep->num = num;
2040 		cachep->gfporder = gfporder;
2041 		left_over = remainder;
2042 
2043 		/*
2044 		 * A VFS-reclaimable slab tends to have most allocations
2045 		 * as GFP_NOFS and we really don't want to be allocating
2046 		 * higher-order pages when we are unable to shrink dcache.
2047 		 */
2048 		if (flags & SLAB_RECLAIM_ACCOUNT)
2049 			break;
2050 
2051 		/*
2052 		 * A large number of objects is good, but very large slabs are
2053 		 * currently bad for the gfp()s.
2054 		 */
2055 		if (gfporder >= slab_break_gfp_order)
2056 			break;
2057 
2058 		/*
2059 		 * Acceptable internal fragmentation?
2060 		 */
2061 		if (left_over * 8 <= (PAGE_SIZE << gfporder))
2062 			break;
2063 	}
2064 	return left_over;
2065 }
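
/*
 * Worked example of the fragmentation check above (illustrative only,
 * ignoring slab management overhead and the slab_break_gfp_order cap):
 * with 4K pages and 1500-byte objects, gfporder 0 fits num = 2 with
 * left_over = 4096 - 3000 = 1096; since 1096 * 8 = 8768 > 4096, the loop
 * moves on to gfporder 1, where num = 5 and left_over = 8192 - 7500 = 692;
 * 692 * 8 = 5536 <= 8192, so order 1 has acceptable internal fragmentation.
 */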
2066 
2067 static int __init_refok setup_cpu_cache(struct kmem_cache *cachep)
2068 {
2069 	if (g_cpucache_up == FULL)
2070 		return enable_cpucache(cachep);
2071 
2072 	if (g_cpucache_up == NONE) {
2073 		/*
2074 		 * Note: the first kmem_cache_create must create the cache
2075 		 * that's used by kmalloc(24), otherwise the creation of
2076 		 * further caches will BUG().
2077 		 */
2078 		cachep->array[smp_processor_id()] = &initarray_generic.cache;
2079 
2080 		/*
2081 		 * If the cache that's used by kmalloc(sizeof(kmem_list3)) is
2082 		 * the first cache, then we need to set up all its list3s,
2083 		 * otherwise the creation of further caches will BUG().
2084 		 */
2085 		set_up_list3s(cachep, SIZE_AC);
2086 		if (INDEX_AC == INDEX_L3)
2087 			g_cpucache_up = PARTIAL_L3;
2088 		else
2089 			g_cpucache_up = PARTIAL_AC;
2090 	} else {
2091 		cachep->array[smp_processor_id()] =
2092 			kmalloc(sizeof(struct arraycache_init), GFP_KERNEL);
2093 
2094 		if (g_cpucache_up == PARTIAL_AC) {
2095 			set_up_list3s(cachep, SIZE_L3);
2096 			g_cpucache_up = PARTIAL_L3;
2097 		} else {
2098 			int node;
2099 			for_each_online_node(node) {
2100 				cachep->nodelists[node] =
2101 				    kmalloc_node(sizeof(struct kmem_list3),
2102 						GFP_KERNEL, node);
2103 				BUG_ON(!cachep->nodelists[node]);
2104 				kmem_list3_init(cachep->nodelists[node]);
2105 			}
2106 		}
2107 	}
2108 	cachep->nodelists[numa_node_id()]->next_reap =
2109 			jiffies + REAPTIMEOUT_LIST3 +
2110 			((unsigned long)cachep) % REAPTIMEOUT_LIST3;
2111 
2112 	cpu_cache_get(cachep)->avail = 0;
2113 	cpu_cache_get(cachep)->limit = BOOT_CPUCACHE_ENTRIES;
2114 	cpu_cache_get(cachep)->batchcount = 1;
2115 	cpu_cache_get(cachep)->touched = 0;
2116 	cachep->batchcount = 1;
2117 	cachep->limit = BOOT_CPUCACHE_ENTRIES;
2118 	return 0;
2119 }
2120 
2121 /**
2122  * kmem_cache_create - Create a cache.
2123  * @name: A string which is used in /proc/slabinfo to identify this cache.
2124  * @size: The size of objects to be created in this cache.
2125  * @align: The required alignment for the objects.
2126  * @flags: SLAB flags
2127  * @ctor: A constructor for the objects.
2128  *
2129  * Returns a ptr to the cache on success, NULL on failure.
2130  * Cannot be called within an interrupt, but can be interrupted.
2131  * The @ctor is run when new pages are allocated by the cache.
2132  *
2133  * @name must be valid until the cache is destroyed. This implies that
2134  * the module calling this has to destroy the cache before getting unloaded.
2135  *
2136  * The flags are
2137  *
2138  * %SLAB_POISON - Poison the slab with a known test pattern (a5a5a5a5)
2139  * to catch references to uninitialised memory.
2140  *
2141  * %SLAB_RED_ZONE - Insert `Red' zones around the allocated memory to check
2142  * for buffer overruns.
2143  *
2144  * %SLAB_HWCACHE_ALIGN - Align the objects in this cache to a hardware
2145  * cacheline.  This can be beneficial if you're counting cycles as closely
2146  * as davem.
2147  */
2148 struct kmem_cache *
2149 kmem_cache_create (const char *name, size_t size, size_t align,
2150 	unsigned long flags,
2151 	void (*ctor)(struct kmem_cache *, void *))
2152 {
2153 	size_t left_over, slab_size, ralign;
2154 	struct kmem_cache *cachep = NULL, *pc;
2155 
2156 	/*
2157 	 * Sanity checks... these are all serious usage bugs.
2158 	 */
2159 	if (!name || in_interrupt() || (size < BYTES_PER_WORD) ||
2160 	    size > KMALLOC_MAX_SIZE) {
2161 		printk(KERN_ERR "%s: Early error in slab %s\n", __FUNCTION__,
2162 				name);
2163 		BUG();
2164 	}
2165 
2166 	/*
2167 	 * We use cache_chain_mutex to ensure a consistent view of
2168 	 * cpu_online_map as well.  Please see cpuup_callback
2169 	 */
2170 	get_online_cpus();
2171 	mutex_lock(&cache_chain_mutex);
2172 
2173 	list_for_each_entry(pc, &cache_chain, next) {
2174 		char tmp;
2175 		int res;
2176 
2177 		/*
2178 		 * This happens when the module gets unloaded and doesn't
2179 		 * destroy its slab cache and no-one else reuses the vmalloc
2180 		 * area of the module.  Print a warning.
2181 		 */
2182 		res = probe_kernel_address(pc->name, tmp);
2183 		if (res) {
2184 			printk(KERN_ERR
2185 			       "SLAB: cache with size %d has lost its name\n",
2186 			       pc->buffer_size);
2187 			continue;
2188 		}
2189 
2190 		if (!strcmp(pc->name, name)) {
2191 			printk(KERN_ERR
2192 			       "kmem_cache_create: duplicate cache %s\n", name);
2193 			dump_stack();
2194 			goto oops;
2195 		}
2196 	}
2197 
2198 #if DEBUG
2199 	WARN_ON(strchr(name, ' '));	/* It confuses parsers */
2200 #if FORCED_DEBUG
2201 	/*
2202 	 * Enable redzoning and last user accounting, except for caches with
2203 	 * large objects, if the increased size would increase the object size
2204 	 * above the next power of two: caches with object sizes just above a
2205 	 * power of two have a significant amount of internal fragmentation.
2206 	 */
2207 	if (size < 4096 || fls(size - 1) == fls(size - 1 + REDZONE_ALIGN +
2208 						2 * sizeof(unsigned long long)))
2209 		flags |= SLAB_RED_ZONE | SLAB_STORE_USER;
2210 	if (!(flags & SLAB_DESTROY_BY_RCU))
2211 		flags |= SLAB_POISON;
2212 #endif
2213 	if (flags & SLAB_DESTROY_BY_RCU)
2214 		BUG_ON(flags & SLAB_POISON);
2215 #endif
2216 	/*
2217 	 * Always check flags, since a caller might be expecting debug support
2218 	 * which isn't available.
2219 	 */
2220 	BUG_ON(flags & ~CREATE_MASK);
2221 
2222 	/*
2223 	 * Check that size is in terms of words.  This is needed to avoid
2224 	 * unaligned accesses for some archs when redzoning is used, and makes
2225 	 * sure any on-slab bufctl's are also correctly aligned.
2226 	 */
2227 	if (size & (BYTES_PER_WORD - 1)) {
2228 		size += (BYTES_PER_WORD - 1);
2229 		size &= ~(BYTES_PER_WORD - 1);
2230 	}
2231 
2232 	/* calculate the final buffer alignment: */
2233 
2234 	/* 1) arch recommendation: can be overridden for debug */
2235 	if (flags & SLAB_HWCACHE_ALIGN) {
2236 		/*
2237 		 * Default alignment: as specified by the arch code.  Except if
2238 		 * an object is really small, then squeeze multiple objects into
2239 		 * one cacheline.
2240 		 */
2241 		ralign = cache_line_size();
2242 		while (size <= ralign / 2)
2243 			ralign /= 2;
2244 	} else {
2245 		ralign = BYTES_PER_WORD;
2246 	}
2247 
2248 	/*
2249 	 * Redzoning and user store require word alignment or possibly larger.
2250 	 * Note this will be overridden by architecture or caller mandated
2251 	 * alignment if either is greater than BYTES_PER_WORD.
2252 	 */
2253 	if (flags & SLAB_STORE_USER)
2254 		ralign = BYTES_PER_WORD;
2255 
2256 	if (flags & SLAB_RED_ZONE) {
2257 		ralign = REDZONE_ALIGN;
2258 		/* If redzoning, ensure that the second redzone is suitably
2259 		 * aligned, by adjusting the object size accordingly. */
2260 		size += REDZONE_ALIGN - 1;
2261 		size &= ~(REDZONE_ALIGN - 1);
2262 	}
2263 
2264 	/* 2) arch mandated alignment */
2265 	if (ralign < ARCH_SLAB_MINALIGN) {
2266 		ralign = ARCH_SLAB_MINALIGN;
2267 	}
2268 	/* 3) caller mandated alignment */
2269 	if (ralign < align) {
2270 		ralign = align;
2271 	}
2272 	/* disable debug if necessary */
2273 	if (ralign > __alignof__(unsigned long long))
2274 		flags &= ~(SLAB_RED_ZONE | SLAB_STORE_USER);
2275 	/*
2276 	 * 4) Store it.
2277 	 */
2278 	align = ralign;
2279 
2280 	/* Get cache's description obj. */
2281 	cachep = kmem_cache_zalloc(&cache_cache, GFP_KERNEL);
2282 	if (!cachep)
2283 		goto oops;
2284 
2285 #if DEBUG
2286 	cachep->obj_size = size;
2287 
2288 	/*
2289 	 * Both debugging options require word-alignment which is calculated
2290 	 * into align above.
2291 	 */
2292 	if (flags & SLAB_RED_ZONE) {
2293 		/* add space for red zone words */
2294 		cachep->obj_offset += sizeof(unsigned long long);
2295 		size += 2 * sizeof(unsigned long long);
2296 	}
2297 	if (flags & SLAB_STORE_USER) {
2298 		/* user store requires one word of storage behind the end of
2299 		 * the real object. But if the second red zone needs to be
2300 		 * aligned to 64 bits, we must allow that much space.
2301 		 */
2302 		if (flags & SLAB_RED_ZONE)
2303 			size += REDZONE_ALIGN;
2304 		else
2305 			size += BYTES_PER_WORD;
2306 	}
2307 #if FORCED_DEBUG && defined(CONFIG_DEBUG_PAGEALLOC)
2308 	if (size >= malloc_sizes[INDEX_L3 + 1].cs_size
2309 	    && cachep->obj_size > cache_line_size() && size < PAGE_SIZE) {
2310 		cachep->obj_offset += PAGE_SIZE - size;
2311 		size = PAGE_SIZE;
2312 	}
2313 #endif
2314 #endif
2315 
2316 	/*
2317 	 * Determine if the slab management is 'on' or 'off' slab.
2318 	 * (bootstrapping cannot cope with offslab caches so don't do
2319 	 * it too early on.)
2320 	 */
2321 	if ((size >= (PAGE_SIZE >> 3)) && !slab_early_init)
2322 		/*
2323 		 * Size is large; assume it is best to place the slab management obj
2324 		 * off-slab (should allow better packing of objs).
2325 		 */
2326 		flags |= CFLGS_OFF_SLAB;
2327 
2328 	size = ALIGN(size, align);
2329 
2330 	left_over = calculate_slab_order(cachep, size, align, flags);
2331 
2332 	if (!cachep->num) {
2333 		printk(KERN_ERR
2334 		       "kmem_cache_create: couldn't create cache %s.\n", name);
2335 		kmem_cache_free(&cache_cache, cachep);
2336 		cachep = NULL;
2337 		goto oops;
2338 	}
2339 	slab_size = ALIGN(cachep->num * sizeof(kmem_bufctl_t)
2340 			  + sizeof(struct slab), align);
2341 
2342 	/*
2343 	 * If the slab has been placed off-slab, and we have enough space then
2344 	 * move it on-slab. This is at the expense of any extra colouring.
2345 	 */
2346 	if (flags & CFLGS_OFF_SLAB && left_over >= slab_size) {
2347 		flags &= ~CFLGS_OFF_SLAB;
2348 		left_over -= slab_size;
2349 	}
2350 
2351 	if (flags & CFLGS_OFF_SLAB) {
2352 		/* really off slab. No need for manual alignment */
2353 		slab_size =
2354 		    cachep->num * sizeof(kmem_bufctl_t) + sizeof(struct slab);
2355 	}
2356 
2357 	cachep->colour_off = cache_line_size();
2358 	/* Offset must be a multiple of the alignment. */
2359 	if (cachep->colour_off < align)
2360 		cachep->colour_off = align;
2361 	cachep->colour = left_over / cachep->colour_off;
2362 	cachep->slab_size = slab_size;
2363 	cachep->flags = flags;
2364 	cachep->gfpflags = 0;
2365 	if (CONFIG_ZONE_DMA_FLAG && (flags & SLAB_CACHE_DMA))
2366 		cachep->gfpflags |= GFP_DMA;
2367 	cachep->buffer_size = size;
2368 	cachep->reciprocal_buffer_size = reciprocal_value(size);
2369 
2370 	if (flags & CFLGS_OFF_SLAB) {
2371 		cachep->slabp_cache = kmem_find_general_cachep(slab_size, 0u);
2372 		/*
2373 		 * This is a possibility for one of the malloc_sizes caches.
2374 		 * But since we go off-slab only for object sizes greater than
2375 		 * PAGE_SIZE/8, and malloc_sizes is created in ascending order,
2376 		 * this should never happen.
2377 		 * Leave a BUG_ON anyway for some lucky dude.
2378 		 */
2379 		BUG_ON(ZERO_OR_NULL_PTR(cachep->slabp_cache));
2380 	}
2381 	cachep->ctor = ctor;
2382 	cachep->name = name;
2383 
2384 	if (setup_cpu_cache(cachep)) {
2385 		__kmem_cache_destroy(cachep);
2386 		cachep = NULL;
2387 		goto oops;
2388 	}
2389 
2390 	/* cache setup completed, link it into the list */
2391 	list_add(&cachep->next, &cache_chain);
2392 oops:
2393 	if (!cachep && (flags & SLAB_PANIC))
2394 		panic("kmem_cache_create(): failed to create slab `%s'\n",
2395 		      name);
2396 	mutex_unlock(&cache_chain_mutex);
2397 	put_online_cpus();
2398 	return cachep;
2399 }
2400 EXPORT_SYMBOL(kmem_cache_create);
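
/*
 * Example usage (a minimal sketch, not part of this file): a module-owned
 * cache for a hypothetical 'struct foo' with a constructor, following the
 * documentation above.  All names below are illustrative.
 *
 *	struct foo {
 *		spinlock_t lock;
 *		int state;
 *	};
 *	static struct kmem_cache *foo_cachep;
 *
 *	static void foo_ctor(struct kmem_cache *cachep, void *obj)
 *	{
 *		struct foo *f = obj;
 *
 *		spin_lock_init(&f->lock);
 *		f->state = 0;
 *	}
 *
 *	static int __init foo_init(void)
 *	{
 *		foo_cachep = kmem_cache_create("foo_cache",
 *					       sizeof(struct foo), 0,
 *					       SLAB_HWCACHE_ALIGN, foo_ctor);
 *		if (!foo_cachep)
 *			return -ENOMEM;
 *		return 0;
 *	}
 */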
2401 
2402 #if DEBUG
2403 static void check_irq_off(void)
2404 {
2405 	BUG_ON(!irqs_disabled());
2406 }
2407 
2408 static void check_irq_on(void)
2409 {
2410 	BUG_ON(irqs_disabled());
2411 }
2412 
2413 static void check_spinlock_acquired(struct kmem_cache *cachep)
2414 {
2415 #ifdef CONFIG_SMP
2416 	check_irq_off();
2417 	assert_spin_locked(&cachep->nodelists[numa_node_id()]->list_lock);
2418 #endif
2419 }
2420 
2421 static void check_spinlock_acquired_node(struct kmem_cache *cachep, int node)
2422 {
2423 #ifdef CONFIG_SMP
2424 	check_irq_off();
2425 	assert_spin_locked(&cachep->nodelists[node]->list_lock);
2426 #endif
2427 }
2428 
2429 #else
2430 #define check_irq_off()	do { } while(0)
2431 #define check_irq_on()	do { } while(0)
2432 #define check_spinlock_acquired(x) do { } while(0)
2433 #define check_spinlock_acquired_node(x, y) do { } while(0)
2434 #endif
2435 
2436 static void drain_array(struct kmem_cache *cachep, struct kmem_list3 *l3,
2437 			struct array_cache *ac,
2438 			int force, int node);
2439 
2440 static void do_drain(void *arg)
2441 {
2442 	struct kmem_cache *cachep = arg;
2443 	struct array_cache *ac;
2444 	int node = numa_node_id();
2445 
2446 	check_irq_off();
2447 	ac = cpu_cache_get(cachep);
2448 	spin_lock(&cachep->nodelists[node]->list_lock);
2449 	free_block(cachep, ac->entry, ac->avail, node);
2450 	spin_unlock(&cachep->nodelists[node]->list_lock);
2451 	ac->avail = 0;
2452 }
2453 
2454 static void drain_cpu_caches(struct kmem_cache *cachep)
2455 {
2456 	struct kmem_list3 *l3;
2457 	int node;
2458 
2459 	on_each_cpu(do_drain, cachep, 1, 1);
2460 	check_irq_on();
2461 	for_each_online_node(node) {
2462 		l3 = cachep->nodelists[node];
2463 		if (l3 && l3->alien)
2464 			drain_alien_cache(cachep, l3->alien);
2465 	}
2466 
2467 	for_each_online_node(node) {
2468 		l3 = cachep->nodelists[node];
2469 		if (l3)
2470 			drain_array(cachep, l3, l3->shared, 1, node);
2471 	}
2472 }
2473 
2474 /*
2475  * Remove slabs from the list of free slabs.
2476  * Specify the number of slabs to drain in tofree.
2477  *
2478  * Returns the actual number of slabs released.
2479  */
2480 static int drain_freelist(struct kmem_cache *cache,
2481 			struct kmem_list3 *l3, int tofree)
2482 {
2483 	struct list_head *p;
2484 	int nr_freed;
2485 	struct slab *slabp;
2486 
2487 	nr_freed = 0;
2488 	while (nr_freed < tofree && !list_empty(&l3->slabs_free)) {
2489 
2490 		spin_lock_irq(&l3->list_lock);
2491 		p = l3->slabs_free.prev;
2492 		if (p == &l3->slabs_free) {
2493 			spin_unlock_irq(&l3->list_lock);
2494 			goto out;
2495 		}
2496 
2497 		slabp = list_entry(p, struct slab, list);
2498 #if DEBUG
2499 		BUG_ON(slabp->inuse);
2500 #endif
2501 		list_del(&slabp->list);
2502 		/*
2503 		 * Safe to drop the lock. The slab is no longer linked
2504 		 * to the cache.
2505 		 */
2506 		l3->free_objects -= cache->num;
2507 		spin_unlock_irq(&l3->list_lock);
2508 		slab_destroy(cache, slabp);
2509 		nr_freed++;
2510 	}
2511 out:
2512 	return nr_freed;
2513 }
2514 
2515 /* Called with cache_chain_mutex held to protect against cpu hotplug */
2516 static int __cache_shrink(struct kmem_cache *cachep)
2517 {
2518 	int ret = 0, i = 0;
2519 	struct kmem_list3 *l3;
2520 
2521 	drain_cpu_caches(cachep);
2522 
2523 	check_irq_on();
2524 	for_each_online_node(i) {
2525 		l3 = cachep->nodelists[i];
2526 		if (!l3)
2527 			continue;
2528 
2529 		drain_freelist(cachep, l3, l3->free_objects);
2530 
2531 		ret += !list_empty(&l3->slabs_full) ||
2532 			!list_empty(&l3->slabs_partial);
2533 	}
2534 	return (ret ? 1 : 0);
2535 }
2536 
2537 /**
2538  * kmem_cache_shrink - Shrink a cache.
2539  * @cachep: The cache to shrink.
2540  *
2541  * Releases as many slabs as possible for a cache.
2542  * To help debugging, a zero exit status indicates all slabs were released.
2543  */
2544 int kmem_cache_shrink(struct kmem_cache *cachep)
2545 {
2546 	int ret;
2547 	BUG_ON(!cachep || in_interrupt());
2548 
2549 	get_online_cpus();
2550 	mutex_lock(&cache_chain_mutex);
2551 	ret = __cache_shrink(cachep);
2552 	mutex_unlock(&cache_chain_mutex);
2553 	put_online_cpus();
2554 	return ret;
2555 }
2556 EXPORT_SYMBOL(kmem_cache_shrink);
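
/*
 * Example (sketch, using the hypothetical foo_cachep from the
 * kmem_cache_create() example above): a cache owner can release free
 * slabs from its own housekeeping path; a zero return means every slab
 * was given back.
 *
 *	if (kmem_cache_shrink(foo_cachep))
 *		pr_debug("foo_cache: some slabs are still in use\n");
 */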
2557 
2558 /**
2559  * kmem_cache_destroy - delete a cache
2560  * @cachep: the cache to destroy
2561  *
2562  * Remove a &struct kmem_cache object from the slab cache.
2563  *
2564  * It is expected this function will be called by a module when it is
2565  * unloaded.  This will remove the cache completely, and avoid a duplicate
2566  * cache being allocated each time a module is loaded and unloaded, if the
2567  * module doesn't have persistent in-kernel storage across loads and unloads.
2568  *
2569  * The cache must be empty before calling this function.
2570  *
2571  * The caller must guarantee that no one will allocate memory from the cache
2572  * during the kmem_cache_destroy().
2573  */
2574 void kmem_cache_destroy(struct kmem_cache *cachep)
2575 {
2576 	BUG_ON(!cachep || in_interrupt());
2577 
2578 	/* Find the cache in the chain of caches. */
2579 	get_online_cpus();
2580 	mutex_lock(&cache_chain_mutex);
2581 	/*
2582 	 * the chain is never empty, cache_cache is never destroyed
2583 	 */
2584 	list_del(&cachep->next);
2585 	if (__cache_shrink(cachep)) {
2586 		slab_error(cachep, "Can't free all objects");
2587 		list_add(&cachep->next, &cache_chain);
2588 		mutex_unlock(&cache_chain_mutex);
2589 		put_online_cpus();
2590 		return;
2591 	}
2592 
2593 	if (unlikely(cachep->flags & SLAB_DESTROY_BY_RCU))
2594 		synchronize_rcu();
2595 
2596 	__kmem_cache_destroy(cachep);
2597 	mutex_unlock(&cache_chain_mutex);
2598 	put_online_cpus();
2599 }
2600 EXPORT_SYMBOL(kmem_cache_destroy);
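
/*
 * Example (sketch, continuing the hypothetical foo_cachep above): the
 * module unload path must free every remaining object first, then
 * destroy the cache, per the rules in the comment above.
 *
 *	static void __exit foo_exit(void)
 *	{
 *		kmem_cache_destroy(foo_cachep);
 *	}
 *	module_exit(foo_exit);
 */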
2601 
2602 /*
2603  * Get the memory for a slab management obj.
2604  * For a slab cache whose slab descriptor is off-slab, the slab descriptors
2605  * always come from the malloc_sizes caches.  The slab descriptor cannot
2606  * come from the same cache that is being created because,
2607  * when we search for an appropriate cache for these
2608  * descriptors in kmem_cache_create, we search through the malloc_sizes array.
2609  * If we are creating a malloc_sizes cache here it would not be visible to
2610  * kmem_find_general_cachep till the initialization is complete.
2611  * Hence we cannot have slabp_cache be the same as the original cache.
2612  */
2613 static struct slab *alloc_slabmgmt(struct kmem_cache *cachep, void *objp,
2614 				   int colour_off, gfp_t local_flags,
2615 				   int nodeid)
2616 {
2617 	struct slab *slabp;
2618 
2619 	if (OFF_SLAB(cachep)) {
2620 		/* Slab management obj is off-slab. */
2621 		slabp = kmem_cache_alloc_node(cachep->slabp_cache,
2622 					      local_flags & ~GFP_THISNODE, nodeid);
2623 		if (!slabp)
2624 			return NULL;
2625 	} else {
2626 		slabp = objp + colour_off;
2627 		colour_off += cachep->slab_size;
2628 	}
2629 	slabp->inuse = 0;
2630 	slabp->colouroff = colour_off;
2631 	slabp->s_mem = objp + colour_off;
2632 	slabp->nodeid = nodeid;
2633 	return slabp;
2634 }
2635 
2636 static inline kmem_bufctl_t *slab_bufctl(struct slab *slabp)
2637 {
2638 	return (kmem_bufctl_t *) (slabp + 1);
2639 }
2640 
2641 static void cache_init_objs(struct kmem_cache *cachep,
2642 			    struct slab *slabp)
2643 {
2644 	int i;
2645 
2646 	for (i = 0; i < cachep->num; i++) {
2647 		void *objp = index_to_obj(cachep, slabp, i);
2648 #if DEBUG
2649 		/* need to poison the objs? */
2650 		if (cachep->flags & SLAB_POISON)
2651 			poison_obj(cachep, objp, POISON_FREE);
2652 		if (cachep->flags & SLAB_STORE_USER)
2653 			*dbg_userword(cachep, objp) = NULL;
2654 
2655 		if (cachep->flags & SLAB_RED_ZONE) {
2656 			*dbg_redzone1(cachep, objp) = RED_INACTIVE;
2657 			*dbg_redzone2(cachep, objp) = RED_INACTIVE;
2658 		}
2659 		/*
2660 		 * Constructors are not allowed to allocate memory from the same
2661 		 * cache which they are a constructor for.  Otherwise, deadlock.
2662 		 * They must also be threaded.
2663 		 */
2664 		if (cachep->ctor && !(cachep->flags & SLAB_POISON))
2665 			cachep->ctor(cachep, objp + obj_offset(cachep));
2666 
2667 		if (cachep->flags & SLAB_RED_ZONE) {
2668 			if (*dbg_redzone2(cachep, objp) != RED_INACTIVE)
2669 				slab_error(cachep, "constructor overwrote the"
2670 					   " end of an object");
2671 			if (*dbg_redzone1(cachep, objp) != RED_INACTIVE)
2672 				slab_error(cachep, "constructor overwrote the"
2673 					   " start of an object");
2674 		}
2675 		if ((cachep->buffer_size % PAGE_SIZE) == 0 &&
2676 			    OFF_SLAB(cachep) && cachep->flags & SLAB_POISON)
2677 			kernel_map_pages(virt_to_page(objp),
2678 					 cachep->buffer_size / PAGE_SIZE, 0);
2679 #else
2680 		if (cachep->ctor)
2681 			cachep->ctor(cachep, objp);
2682 #endif
2683 		slab_bufctl(slabp)[i] = i + 1;
2684 	}
2685 	slab_bufctl(slabp)[i - 1] = BUFCTL_END;
2686 	slabp->free = 0;
2687 }
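
/*
 * The loop above threads the objects into an index-based free list: for a
 * slab with num = 4 the bufctl array ends up as { 1, 2, 3, BUFCTL_END }
 * with slabp->free = 0, i.e. object 0 is handed out first and each entry
 * names the index of the next free object.
 */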
2688 
2689 static void kmem_flagcheck(struct kmem_cache *cachep, gfp_t flags)
2690 {
2691 	if (CONFIG_ZONE_DMA_FLAG) {
2692 		if (flags & GFP_DMA)
2693 			BUG_ON(!(cachep->gfpflags & GFP_DMA));
2694 		else
2695 			BUG_ON(cachep->gfpflags & GFP_DMA);
2696 	}
2697 }
2698 
2699 static void *slab_get_obj(struct kmem_cache *cachep, struct slab *slabp,
2700 				int nodeid)
2701 {
2702 	void *objp = index_to_obj(cachep, slabp, slabp->free);
2703 	kmem_bufctl_t next;
2704 
2705 	slabp->inuse++;
2706 	next = slab_bufctl(slabp)[slabp->free];
2707 #if DEBUG
2708 	slab_bufctl(slabp)[slabp->free] = BUFCTL_FREE;
2709 	WARN_ON(slabp->nodeid != nodeid);
2710 #endif
2711 	slabp->free = next;
2712 
2713 	return objp;
2714 }
2715 
2716 static void slab_put_obj(struct kmem_cache *cachep, struct slab *slabp,
2717 				void *objp, int nodeid)
2718 {
2719 	unsigned int objnr = obj_to_index(cachep, slabp, objp);
2720 
2721 #if DEBUG
2722 	/* Verify that the slab belongs to the intended node */
2723 	WARN_ON(slabp->nodeid != nodeid);
2724 
2725 	if (slab_bufctl(slabp)[objnr] + 1 <= SLAB_LIMIT + 1) {
2726 		printk(KERN_ERR "slab: double free detected in cache "
2727 				"'%s', objp %p\n", cachep->name, objp);
2728 		BUG();
2729 	}
2730 #endif
2731 	slab_bufctl(slabp)[objnr] = slabp->free;
2732 	slabp->free = objnr;
2733 	slabp->inuse--;
2734 }
2735 
2736 /*
2737  * Map pages beginning at addr to the given cache and slab. This is required
2738  * for the slab allocator to be able to lookup the cache and slab of a
2739  * virtual address for kfree, ksize, kmem_ptr_validate, and slab debugging.
2740  */
2741 static void slab_map_pages(struct kmem_cache *cache, struct slab *slab,
2742 			   void *addr)
2743 {
2744 	int nr_pages;
2745 	struct page *page;
2746 
2747 	page = virt_to_page(addr);
2748 
2749 	nr_pages = 1;
2750 	if (likely(!PageCompound(page)))
2751 		nr_pages <<= cache->gfporder;
2752 
2753 	do {
2754 		page_set_cache(page, cache);
2755 		page_set_slab(page, slab);
2756 		page++;
2757 	} while (--nr_pages);
2758 }
2759 
2760 /*
2761  * Grow (by 1) the number of slabs within a cache.  This is called by
2762  * kmem_cache_alloc() when there are no free objs left in a cache.
2763  */
2764 static int cache_grow(struct kmem_cache *cachep,
2765 		gfp_t flags, int nodeid, void *objp)
2766 {
2767 	struct slab *slabp;
2768 	size_t offset;
2769 	gfp_t local_flags;
2770 	struct kmem_list3 *l3;
2771 
2772 	/*
2773 	 * Be lazy and only check for valid flags here, keeping it out of the
2774 	 * critical path in kmem_cache_alloc().
2775 	 */
2776 	BUG_ON(flags & GFP_SLAB_BUG_MASK);
2777 	local_flags = flags & (GFP_CONSTRAINT_MASK|GFP_RECLAIM_MASK);
2778 
2779 	/* Take the l3 list lock to change the colour_next on this node */
2780 	check_irq_off();
2781 	l3 = cachep->nodelists[nodeid];
2782 	spin_lock(&l3->list_lock);
2783 
2784 	/* Get the colour for the slab, and calculate the next value. */
2785 	offset = l3->colour_next;
2786 	l3->colour_next++;
2787 	if (l3->colour_next >= cachep->colour)
2788 		l3->colour_next = 0;
2789 	spin_unlock(&l3->list_lock);
2790 
2791 	offset *= cachep->colour_off;
2792 
2793 	if (local_flags & __GFP_WAIT)
2794 		local_irq_enable();
2795 
2796 	/*
2797 	 * The test for missing atomic flag is performed here, rather than
2798 	 * the more obvious place, simply to reduce the critical path length
2799 	 * in kmem_cache_alloc(). If a caller is seriously mis-behaving they
2800 	 * will eventually be caught here (where it matters).
2801 	 */
2802 	kmem_flagcheck(cachep, flags);
2803 
2804 	/*
2805 	 * Get mem for the objs.  Attempt to allocate a physical page from
2806 	 * 'nodeid'.
2807 	 */
2808 	if (!objp)
2809 		objp = kmem_getpages(cachep, local_flags, nodeid);
2810 	if (!objp)
2811 		goto failed;
2812 
2813 	/* Get slab management. */
2814 	slabp = alloc_slabmgmt(cachep, objp, offset,
2815 			local_flags & ~GFP_CONSTRAINT_MASK, nodeid);
2816 	if (!slabp)
2817 		goto opps1;
2818 
2819 	slabp->nodeid = nodeid;
2820 	slab_map_pages(cachep, slabp, objp);
2821 
2822 	cache_init_objs(cachep, slabp);
2823 
2824 	if (local_flags & __GFP_WAIT)
2825 		local_irq_disable();
2826 	check_irq_off();
2827 	spin_lock(&l3->list_lock);
2828 
2829 	/* Make slab active. */
2830 	list_add_tail(&slabp->list, &(l3->slabs_free));
2831 	STATS_INC_GROWN(cachep);
2832 	l3->free_objects += cachep->num;
2833 	spin_unlock(&l3->list_lock);
2834 	return 1;
2835 opps1:
2836 	kmem_freepages(cachep, objp);
2837 failed:
2838 	if (local_flags & __GFP_WAIT)
2839 		local_irq_disable();
2840 	return 0;
2841 }
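
/*
 * Colouring example (illustrative): with a 64-byte colour_off and
 * cachep->colour == 3, successive slabs grown on a node use colour
 * offsets 0, 64 and 128 and then wrap back to 0, so the first objects
 * of different slabs start on different cache lines.
 */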
2842 
2843 #if DEBUG
2844 
2845 /*
2846  * Perform extra freeing checks:
2847  * - detect bad pointers.
2848  * - POISON/RED_ZONE checking
2849  */
2850 static void kfree_debugcheck(const void *objp)
2851 {
2852 	if (!virt_addr_valid(objp)) {
2853 		printk(KERN_ERR "kfree_debugcheck: out of range ptr %lxh.\n",
2854 		       (unsigned long)objp);
2855 		BUG();
2856 	}
2857 }
2858 
2859 static inline void verify_redzone_free(struct kmem_cache *cache, void *obj)
2860 {
2861 	unsigned long long redzone1, redzone2;
2862 
2863 	redzone1 = *dbg_redzone1(cache, obj);
2864 	redzone2 = *dbg_redzone2(cache, obj);
2865 
2866 	/*
2867 	 * Redzone is ok.
2868 	 */
2869 	if (redzone1 == RED_ACTIVE && redzone2 == RED_ACTIVE)
2870 		return;
2871 
2872 	if (redzone1 == RED_INACTIVE && redzone2 == RED_INACTIVE)
2873 		slab_error(cache, "double free detected");
2874 	else
2875 		slab_error(cache, "memory outside object was overwritten");
2876 
2877 	printk(KERN_ERR "%p: redzone 1:0x%llx, redzone 2:0x%llx.\n",
2878 			obj, redzone1, redzone2);
2879 }
2880 
2881 static void *cache_free_debugcheck(struct kmem_cache *cachep, void *objp,
2882 				   void *caller)
2883 {
2884 	struct page *page;
2885 	unsigned int objnr;
2886 	struct slab *slabp;
2887 
2888 	BUG_ON(virt_to_cache(objp) != cachep);
2889 
2890 	objp -= obj_offset(cachep);
2891 	kfree_debugcheck(objp);
2892 	page = virt_to_head_page(objp);
2893 
2894 	slabp = page_get_slab(page);
2895 
2896 	if (cachep->flags & SLAB_RED_ZONE) {
2897 		verify_redzone_free(cachep, objp);
2898 		*dbg_redzone1(cachep, objp) = RED_INACTIVE;
2899 		*dbg_redzone2(cachep, objp) = RED_INACTIVE;
2900 	}
2901 	if (cachep->flags & SLAB_STORE_USER)
2902 		*dbg_userword(cachep, objp) = caller;
2903 
2904 	objnr = obj_to_index(cachep, slabp, objp);
2905 
2906 	BUG_ON(objnr >= cachep->num);
2907 	BUG_ON(objp != index_to_obj(cachep, slabp, objnr));
2908 
2909 #ifdef CONFIG_DEBUG_SLAB_LEAK
2910 	slab_bufctl(slabp)[objnr] = BUFCTL_FREE;
2911 #endif
2912 	if (cachep->flags & SLAB_POISON) {
2913 #ifdef CONFIG_DEBUG_PAGEALLOC
2914 		if ((cachep->buffer_size % PAGE_SIZE)==0 && OFF_SLAB(cachep)) {
2915 			store_stackinfo(cachep, objp, (unsigned long)caller);
2916 			kernel_map_pages(virt_to_page(objp),
2917 					 cachep->buffer_size / PAGE_SIZE, 0);
2918 		} else {
2919 			poison_obj(cachep, objp, POISON_FREE);
2920 		}
2921 #else
2922 		poison_obj(cachep, objp, POISON_FREE);
2923 #endif
2924 	}
2925 	return objp;
2926 }
2927 
2928 static void check_slabp(struct kmem_cache *cachep, struct slab *slabp)
2929 {
2930 	kmem_bufctl_t i;
2931 	int entries = 0;
2932 
2933 	/* Check slab's freelist to see if this obj is there. */
2934 	for (i = slabp->free; i != BUFCTL_END; i = slab_bufctl(slabp)[i]) {
2935 		entries++;
2936 		if (entries > cachep->num || i >= cachep->num)
2937 			goto bad;
2938 	}
2939 	if (entries != cachep->num - slabp->inuse) {
2940 bad:
2941 		printk(KERN_ERR "slab: Internal list corruption detected in "
2942 				"cache '%s'(%d), slabp %p(%d). Hexdump:\n",
2943 			cachep->name, cachep->num, slabp, slabp->inuse);
2944 		for (i = 0;
2945 		     i < sizeof(*slabp) + cachep->num * sizeof(kmem_bufctl_t);
2946 		     i++) {
2947 			if (i % 16 == 0)
2948 				printk("\n%03x:", i);
2949 			printk(" %02x", ((unsigned char *)slabp)[i]);
2950 		}
2951 		printk("\n");
2952 		BUG();
2953 	}
2954 }
2955 #else
2956 #define kfree_debugcheck(x) do { } while(0)
2957 #define cache_free_debugcheck(x,objp,z) (objp)
2958 #define check_slabp(x,y) do { } while(0)
2959 #endif
2960 
2961 static void *cache_alloc_refill(struct kmem_cache *cachep, gfp_t flags)
2962 {
2963 	int batchcount;
2964 	struct kmem_list3 *l3;
2965 	struct array_cache *ac;
2966 	int node;
2967 
2968 	node = numa_node_id();
2969 
2970 	check_irq_off();
2971 	ac = cpu_cache_get(cachep);
2972 retry:
2973 	batchcount = ac->batchcount;
2974 	if (!ac->touched && batchcount > BATCHREFILL_LIMIT) {
2975 		/*
2976 		 * If there was little recent activity on this cache, then
2977 		 * perform only a partial refill.  Otherwise we could generate
2978 		 * refill bouncing.
2979 		 */
2980 		batchcount = BATCHREFILL_LIMIT;
2981 	}
2982 	l3 = cachep->nodelists[node];
2983 
2984 	BUG_ON(ac->avail > 0 || !l3);
2985 	spin_lock(&l3->list_lock);
2986 
2987 	/* See if we can refill from the shared array */
2988 	if (l3->shared && transfer_objects(ac, l3->shared, batchcount))
2989 		goto alloc_done;
2990 
2991 	while (batchcount > 0) {
2992 		struct list_head *entry;
2993 		struct slab *slabp;
2994 		/* Get the slab that the allocation is to come from. */
2995 		entry = l3->slabs_partial.next;
2996 		if (entry == &l3->slabs_partial) {
2997 			l3->free_touched = 1;
2998 			entry = l3->slabs_free.next;
2999 			if (entry == &l3->slabs_free)
3000 				goto must_grow;
3001 		}
3002 
3003 		slabp = list_entry(entry, struct slab, list);
3004 		check_slabp(cachep, slabp);
3005 		check_spinlock_acquired(cachep);
3006 
3007 		/*
3008 		 * The slab was either on partial or free list so
3009 		 * there must be at least one object available for
3010 		 * allocation.
3011 		 */
3012 		BUG_ON(slabp->inuse < 0 || slabp->inuse >= cachep->num);
3013 
3014 		while (slabp->inuse < cachep->num && batchcount--) {
3015 			STATS_INC_ALLOCED(cachep);
3016 			STATS_INC_ACTIVE(cachep);
3017 			STATS_SET_HIGH(cachep);
3018 
3019 			ac->entry[ac->avail++] = slab_get_obj(cachep, slabp,
3020 							    node);
3021 		}
3022 		check_slabp(cachep, slabp);
3023 
3024 		/* move slabp to correct slabp list: */
3025 		list_del(&slabp->list);
3026 		if (slabp->free == BUFCTL_END)
3027 			list_add(&slabp->list, &l3->slabs_full);
3028 		else
3029 			list_add(&slabp->list, &l3->slabs_partial);
3030 	}
3031 
3032 must_grow:
3033 	l3->free_objects -= ac->avail;
3034 alloc_done:
3035 	spin_unlock(&l3->list_lock);
3036 
3037 	if (unlikely(!ac->avail)) {
3038 		int x;
3039 		x = cache_grow(cachep, flags | GFP_THISNODE, node, NULL);
3040 
3041 		/* cache_grow can reenable interrupts, then ac could change. */
3042 		ac = cpu_cache_get(cachep);
3043 		if (!x && ac->avail == 0)	/* no objects in sight? abort */
3044 			return NULL;
3045 
3046 		if (!ac->avail)		/* objects refilled by interrupt? */
3047 			goto retry;
3048 	}
3049 	ac->touched = 1;
3050 	return ac->entry[--ac->avail];
3051 }
3052 
3053 static inline void cache_alloc_debugcheck_before(struct kmem_cache *cachep,
3054 						gfp_t flags)
3055 {
3056 	might_sleep_if(flags & __GFP_WAIT);
3057 #if DEBUG
3058 	kmem_flagcheck(cachep, flags);
3059 #endif
3060 }
3061 
3062 #if DEBUG
3063 static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep,
3064 				gfp_t flags, void *objp, void *caller)
3065 {
3066 	if (!objp)
3067 		return objp;
3068 	if (cachep->flags & SLAB_POISON) {
3069 #ifdef CONFIG_DEBUG_PAGEALLOC
3070 		if ((cachep->buffer_size % PAGE_SIZE) == 0 && OFF_SLAB(cachep))
3071 			kernel_map_pages(virt_to_page(objp),
3072 					 cachep->buffer_size / PAGE_SIZE, 1);
3073 		else
3074 			check_poison_obj(cachep, objp);
3075 #else
3076 		check_poison_obj(cachep, objp);
3077 #endif
3078 		poison_obj(cachep, objp, POISON_INUSE);
3079 	}
3080 	if (cachep->flags & SLAB_STORE_USER)
3081 		*dbg_userword(cachep, objp) = caller;
3082 
3083 	if (cachep->flags & SLAB_RED_ZONE) {
3084 		if (*dbg_redzone1(cachep, objp) != RED_INACTIVE ||
3085 				*dbg_redzone2(cachep, objp) != RED_INACTIVE) {
3086 			slab_error(cachep, "double free, or memory outside"
3087 						" object was overwritten");
3088 			printk(KERN_ERR
3089 				"%p: redzone 1:0x%llx, redzone 2:0x%llx\n",
3090 				objp, *dbg_redzone1(cachep, objp),
3091 				*dbg_redzone2(cachep, objp));
3092 		}
3093 		*dbg_redzone1(cachep, objp) = RED_ACTIVE;
3094 		*dbg_redzone2(cachep, objp) = RED_ACTIVE;
3095 	}
3096 #ifdef CONFIG_DEBUG_SLAB_LEAK
3097 	{
3098 		struct slab *slabp;
3099 		unsigned objnr;
3100 
3101 		slabp = page_get_slab(virt_to_head_page(objp));
3102 		objnr = (unsigned)(objp - slabp->s_mem) / cachep->buffer_size;
3103 		slab_bufctl(slabp)[objnr] = BUFCTL_ACTIVE;
3104 	}
3105 #endif
3106 	objp += obj_offset(cachep);
3107 	if (cachep->ctor && cachep->flags & SLAB_POISON)
3108 		cachep->ctor(cachep, objp);
3109 #if ARCH_SLAB_MINALIGN
3110 	if ((u32)objp & (ARCH_SLAB_MINALIGN-1)) {
3111 		printk(KERN_ERR "0x%p: not aligned to ARCH_SLAB_MINALIGN=%d\n",
3112 		       objp, ARCH_SLAB_MINALIGN);
3113 	}
3114 #endif
3115 	return objp;
3116 }
3117 #else
3118 #define cache_alloc_debugcheck_after(a,b,objp,d) (objp)
3119 #endif
3120 
3121 #ifdef CONFIG_FAILSLAB
3122 
3123 static struct failslab_attr {
3124 
3125 	struct fault_attr attr;
3126 
3127 	u32 ignore_gfp_wait;
3128 #ifdef CONFIG_FAULT_INJECTION_DEBUG_FS
3129 	struct dentry *ignore_gfp_wait_file;
3130 #endif
3131 
3132 } failslab = {
3133 	.attr = FAULT_ATTR_INITIALIZER,
3134 	.ignore_gfp_wait = 1,
3135 };
3136 
3137 static int __init setup_failslab(char *str)
3138 {
3139 	return setup_fault_attr(&failslab.attr, str);
3140 }
3141 __setup("failslab=", setup_failslab);
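
/*
 * Example (an assumption, based on the generic fault_attr boot syntax
 * "<interval>,<probability>,<space>,<times>"): booting with
 * "failslab=1,10,0,-1" asks should_failslab() below to fail roughly 10%
 * of eligible slab allocations, with no limit on the number of failures.
 */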
3142 
3143 static int should_failslab(struct kmem_cache *cachep, gfp_t flags)
3144 {
3145 	if (cachep == &cache_cache)
3146 		return 0;
3147 	if (flags & __GFP_NOFAIL)
3148 		return 0;
3149 	if (failslab.ignore_gfp_wait && (flags & __GFP_WAIT))
3150 		return 0;
3151 
3152 	return should_fail(&failslab.attr, obj_size(cachep));
3153 }
3154 
3155 #ifdef CONFIG_FAULT_INJECTION_DEBUG_FS
3156 
3157 static int __init failslab_debugfs(void)
3158 {
3159 	mode_t mode = S_IFREG | S_IRUSR | S_IWUSR;
3160 	struct dentry *dir;
3161 	int err;
3162 
3163 	err = init_fault_attr_dentries(&failslab.attr, "failslab");
3164 	if (err)
3165 		return err;
3166 	dir = failslab.attr.dentries.dir;
3167 
3168 	failslab.ignore_gfp_wait_file =
3169 		debugfs_create_bool("ignore-gfp-wait", mode, dir,
3170 				      &failslab.ignore_gfp_wait);
3171 
3172 	if (!failslab.ignore_gfp_wait_file) {
3173 		err = -ENOMEM;
3174 		debugfs_remove(failslab.ignore_gfp_wait_file);
3175 		cleanup_fault_attr_dentries(&failslab.attr);
3176 	}
3177 
3178 	return err;
3179 }
3180 
3181 late_initcall(failslab_debugfs);
3182 
3183 #endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */
3184 
3185 #else /* CONFIG_FAILSLAB */
3186 
3187 static inline int should_failslab(struct kmem_cache *cachep, gfp_t flags)
3188 {
3189 	return 0;
3190 }
3191 
3192 #endif /* CONFIG_FAILSLAB */
3193 
3194 static inline void *____cache_alloc(struct kmem_cache *cachep, gfp_t flags)
3195 {
3196 	void *objp;
3197 	struct array_cache *ac;
3198 
3199 	check_irq_off();
3200 
3201 	ac = cpu_cache_get(cachep);
3202 	if (likely(ac->avail)) {
3203 		STATS_INC_ALLOCHIT(cachep);
3204 		ac->touched = 1;
3205 		objp = ac->entry[--ac->avail];
3206 	} else {
3207 		STATS_INC_ALLOCMISS(cachep);
3208 		objp = cache_alloc_refill(cachep, flags);
3209 	}
3210 	return objp;
3211 }
3212 
3213 #ifdef CONFIG_NUMA
3214 /*
3215  * Try allocating on another node if PF_SPREAD_SLAB|PF_MEMPOLICY.
3216  *
3217  * If we are in_interrupt, then process context, including cpusets and
3218  * mempolicy, may not apply and should not be used for allocation policy.
3219  */
3220 static void *alternate_node_alloc(struct kmem_cache *cachep, gfp_t flags)
3221 {
3222 	int nid_alloc, nid_here;
3223 
3224 	if (in_interrupt() || (flags & __GFP_THISNODE))
3225 		return NULL;
3226 	nid_alloc = nid_here = numa_node_id();
3227 	if (cpuset_do_slab_mem_spread() && (cachep->flags & SLAB_MEM_SPREAD))
3228 		nid_alloc = cpuset_mem_spread_node();
3229 	else if (current->mempolicy)
3230 		nid_alloc = slab_node(current->mempolicy);
3231 	if (nid_alloc != nid_here)
3232 		return ____cache_alloc_node(cachep, flags, nid_alloc);
3233 	return NULL;
3234 }
3235 
3236 /*
3237  * Fallback function if there was no memory available and no objects on a
3238  * certain node and fallback is permitted. First we scan all the
3239  * available nodelists for available objects. If that fails then we
3240  * perform an allocation without specifying a node. This allows the page
3241  * allocator to do its reclaim / fallback magic. We then insert the
3242  * slab into the proper nodelist and then allocate from it.
3243  */
3244 static void *fallback_alloc(struct kmem_cache *cache, gfp_t flags)
3245 {
3246 	struct zonelist *zonelist;
3247 	gfp_t local_flags;
3248 	struct zone **z;
3249 	void *obj = NULL;
3250 	int nid;
3251 
3252 	if (flags & __GFP_THISNODE)
3253 		return NULL;
3254 
3255 	zonelist = &NODE_DATA(slab_node(current->mempolicy))
3256 			->node_zonelists[gfp_zone(flags)];
3257 	local_flags = flags & (GFP_CONSTRAINT_MASK|GFP_RECLAIM_MASK);
3258 
3259 retry:
3260 	/*
3261 	 * Look through allowed nodes for objects available
3262 	 * from existing per node queues.
3263 	 */
3264 	for (z = zonelist->zones; *z && !obj; z++) {
3265 		nid = zone_to_nid(*z);
3266 
3267 		if (cpuset_zone_allowed_hardwall(*z, flags) &&
3268 			cache->nodelists[nid] &&
3269 			cache->nodelists[nid]->free_objects)
3270 				obj = ____cache_alloc_node(cache,
3271 					flags | GFP_THISNODE, nid);
3272 	}
3273 
3274 	if (!obj) {
3275 		/*
3276 		 * This allocation will be performed within the constraints
3277 		 * of the current cpuset / memory policy requirements.
3278 		 * We may trigger various forms of reclaim on the allowed
3279 		 * set and go into memory reserves if necessary.
3280 		 */
3281 		if (local_flags & __GFP_WAIT)
3282 			local_irq_enable();
3283 		kmem_flagcheck(cache, flags);
3284 		obj = kmem_getpages(cache, flags, -1);
3285 		if (local_flags & __GFP_WAIT)
3286 			local_irq_disable();
3287 		if (obj) {
3288 			/*
3289 			 * Insert into the appropriate per node queues
3290 			 */
3291 			nid = page_to_nid(virt_to_page(obj));
3292 			if (cache_grow(cache, flags, nid, obj)) {
3293 				obj = ____cache_alloc_node(cache,
3294 					flags | GFP_THISNODE, nid);
3295 				if (!obj)
3296 					/*
3297 					 * Another processor may allocate the
3298 					 * objects in the slab since we are
3299 					 * not holding any locks.
3300 					 */
3301 					goto retry;
3302 			} else {
3303 				/* cache_grow already freed obj */
3304 				obj = NULL;
3305 			}
3306 		}
3307 	}
3308 	return obj;
3309 }
3310 
3311 /*
3312  * An interface to enable slab creation on nodeid
3313  */
3314 static void *____cache_alloc_node(struct kmem_cache *cachep, gfp_t flags,
3315 				int nodeid)
3316 {
3317 	struct list_head *entry;
3318 	struct slab *slabp;
3319 	struct kmem_list3 *l3;
3320 	void *obj;
3321 	int x;
3322 
3323 	l3 = cachep->nodelists[nodeid];
3324 	BUG_ON(!l3);
3325 
3326 retry:
3327 	check_irq_off();
3328 	spin_lock(&l3->list_lock);
3329 	entry = l3->slabs_partial.next;
3330 	if (entry == &l3->slabs_partial) {
3331 		l3->free_touched = 1;
3332 		entry = l3->slabs_free.next;
3333 		if (entry == &l3->slabs_free)
3334 			goto must_grow;
3335 	}
3336 
3337 	slabp = list_entry(entry, struct slab, list);
3338 	check_spinlock_acquired_node(cachep, nodeid);
3339 	check_slabp(cachep, slabp);
3340 
3341 	STATS_INC_NODEALLOCS(cachep);
3342 	STATS_INC_ACTIVE(cachep);
3343 	STATS_SET_HIGH(cachep);
3344 
3345 	BUG_ON(slabp->inuse == cachep->num);
3346 
3347 	obj = slab_get_obj(cachep, slabp, nodeid);
3348 	check_slabp(cachep, slabp);
3349 	l3->free_objects--;
3350 	/* move slabp to correct slabp list: */
3351 	list_del(&slabp->list);
3352 
3353 	if (slabp->free == BUFCTL_END)
3354 		list_add(&slabp->list, &l3->slabs_full);
3355 	else
3356 		list_add(&slabp->list, &l3->slabs_partial);
3357 
3358 	spin_unlock(&l3->list_lock);
3359 	goto done;
3360 
3361 must_grow:
3362 	spin_unlock(&l3->list_lock);
3363 	x = cache_grow(cachep, flags | GFP_THISNODE, nodeid, NULL);
3364 	if (x)
3365 		goto retry;
3366 
3367 	return fallback_alloc(cachep, flags);
3368 
3369 done:
3370 	return obj;
3371 }
3372 
3373 /**
3374  * kmem_cache_alloc_node - Allocate an object on the specified node
3375  * @cachep: The cache to allocate from.
3376  * @flags: See kmalloc().
3377  * @nodeid: node number of the target node.
3378  * @caller: return address of caller, used for debug information
3379  *
3380  * Identical to kmem_cache_alloc but it will allocate memory on the given
3381  * node, which can improve the performance for cpu bound structures.
3382  *
3383  * Fallback to other node is possible if __GFP_THISNODE is not set.
3384  */
3385 static __always_inline void *
3386 __cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid,
3387 		   void *caller)
3388 {
3389 	unsigned long save_flags;
3390 	void *ptr;
3391 
3392 	if (should_failslab(cachep, flags))
3393 		return NULL;
3394 
3395 	cache_alloc_debugcheck_before(cachep, flags);
3396 	local_irq_save(save_flags);
3397 
3398 	if (unlikely(nodeid == -1))
3399 		nodeid = numa_node_id();
3400 
3401 	if (unlikely(!cachep->nodelists[nodeid])) {
3402 		/* Node not bootstrapped yet */
3403 		ptr = fallback_alloc(cachep, flags);
3404 		goto out;
3405 	}
3406 
3407 	if (nodeid == numa_node_id()) {
3408 		/*
3409 		 * Use the locally cached objects if possible.
3410 		 * However ____cache_alloc does not allow fallback
3411 		 * to other nodes. It may fail while we still have
3412 		 * objects on other nodes available.
3413 		 */
3414 		ptr = ____cache_alloc(cachep, flags);
3415 		if (ptr)
3416 			goto out;
3417 	}
3418 	/* ____cache_alloc_node can fall back to other nodes */
3419 	ptr = ____cache_alloc_node(cachep, flags, nodeid);
3420   out:
3421 	local_irq_restore(save_flags);
3422 	ptr = cache_alloc_debugcheck_after(cachep, flags, ptr, caller);
3423 
3424 	if (unlikely((flags & __GFP_ZERO) && ptr))
3425 		memset(ptr, 0, obj_size(cachep));
3426 
3427 	return ptr;
3428 }
3429 
3430 static __always_inline void *
3431 __do_cache_alloc(struct kmem_cache *cache, gfp_t flags)
3432 {
3433 	void *objp;
3434 
3435 	if (unlikely(current->flags & (PF_SPREAD_SLAB | PF_MEMPOLICY))) {
3436 		objp = alternate_node_alloc(cache, flags);
3437 		if (objp)
3438 			goto out;
3439 	}
3440 	objp = ____cache_alloc(cache, flags);
3441 
3442 	/*
3443 	 * We may just have run out of memory on the local node.
3444 	 * ____cache_alloc_node() knows how to locate memory on other nodes
3445 	 */
3446 	if (!objp)
3447 		objp = ____cache_alloc_node(cache, flags, numa_node_id());
3448 
3449   out:
3450 	return objp;
3451 }
3452 #else
3453 
3454 static __always_inline void *
3455 __do_cache_alloc(struct kmem_cache *cachep, gfp_t flags)
3456 {
3457 	return ____cache_alloc(cachep, flags);
3458 }
3459 
3460 #endif /* CONFIG_NUMA */
3461 
3462 static __always_inline void *
3463 __cache_alloc(struct kmem_cache *cachep, gfp_t flags, void *caller)
3464 {
3465 	unsigned long save_flags;
3466 	void *objp;
3467 
3468 	if (should_failslab(cachep, flags))
3469 		return NULL;
3470 
3471 	cache_alloc_debugcheck_before(cachep, flags);
3472 	local_irq_save(save_flags);
3473 	objp = __do_cache_alloc(cachep, flags);
3474 	local_irq_restore(save_flags);
3475 	objp = cache_alloc_debugcheck_after(cachep, flags, objp, caller);
3476 	prefetchw(objp);
3477 
3478 	if (unlikely((flags & __GFP_ZERO) && objp))
3479 		memset(objp, 0, obj_size(cachep));
3480 
3481 	return objp;
3482 }
3483 
3484 /*
3485  * Caller needs to acquire correct kmem_list's list_lock
3486  */
3487 static void free_block(struct kmem_cache *cachep, void **objpp, int nr_objects,
3488 		       int node)
3489 {
3490 	int i;
3491 	struct kmem_list3 *l3;
3492 
3493 	for (i = 0; i < nr_objects; i++) {
3494 		void *objp = objpp[i];
3495 		struct slab *slabp;
3496 
3497 		slabp = virt_to_slab(objp);
3498 		l3 = cachep->nodelists[node];
3499 		list_del(&slabp->list);
3500 		check_spinlock_acquired_node(cachep, node);
3501 		check_slabp(cachep, slabp);
3502 		slab_put_obj(cachep, slabp, objp, node);
3503 		STATS_DEC_ACTIVE(cachep);
3504 		l3->free_objects++;
3505 		check_slabp(cachep, slabp);
3506 
3507 		/* fixup slab chains */
3508 		if (slabp->inuse == 0) {
3509 			if (l3->free_objects > l3->free_limit) {
3510 				l3->free_objects -= cachep->num;
3511 				/* No need to drop any previously held
3512 				 * lock here; even if we have an off-slab slab
3513 				 * descriptor, it is guaranteed to come from
3514 				 * a different cache.  Refer to the comments
3515 				 * before alloc_slabmgmt.
3516 				 */
3517 				slab_destroy(cachep, slabp);
3518 			} else {
3519 				list_add(&slabp->list, &l3->slabs_free);
3520 			}
3521 		} else {
3522 			/* Unconditionally move a slab to the end of the
3523 			 * partial list on free - this gives the other
3524 			 * objects the maximum time to be freed, too.
3525 			 */
3526 			list_add_tail(&slabp->list, &l3->slabs_partial);
3527 		}
3528 	}
3529 }
3530 
3531 static void cache_flusharray(struct kmem_cache *cachep, struct array_cache *ac)
3532 {
3533 	int batchcount;
3534 	struct kmem_list3 *l3;
3535 	int node = numa_node_id();
3536 
3537 	batchcount = ac->batchcount;
3538 #if DEBUG
3539 	BUG_ON(!batchcount || batchcount > ac->avail);
3540 #endif
3541 	check_irq_off();
3542 	l3 = cachep->nodelists[node];
3543 	spin_lock(&l3->list_lock);
3544 	if (l3->shared) {
3545 		struct array_cache *shared_array = l3->shared;
3546 		int max = shared_array->limit - shared_array->avail;
3547 		if (max) {
3548 			if (batchcount > max)
3549 				batchcount = max;
3550 			memcpy(&(shared_array->entry[shared_array->avail]),
3551 			       ac->entry, sizeof(void *) * batchcount);
3552 			shared_array->avail += batchcount;
3553 			goto free_done;
3554 		}
3555 	}
3556 
3557 	free_block(cachep, ac->entry, batchcount, node);
3558 free_done:
3559 #if STATS
3560 	{
3561 		int i = 0;
3562 		struct list_head *p;
3563 
3564 		p = l3->slabs_free.next;
3565 		while (p != &(l3->slabs_free)) {
3566 			struct slab *slabp;
3567 
3568 			slabp = list_entry(p, struct slab, list);
3569 			BUG_ON(slabp->inuse);
3570 
3571 			i++;
3572 			p = p->next;
3573 		}
3574 		STATS_SET_FREEABLE(cachep, i);
3575 	}
3576 #endif
3577 	spin_unlock(&l3->list_lock);
3578 	ac->avail -= batchcount;
3579 	memmove(ac->entry, &(ac->entry[batchcount]), sizeof(void *)*ac->avail);
3580 }
3581 
3582 /*
3583  * Release an obj back to its cache. If the obj has a constructed state, it must
3584  * be in this state _before_ it is released.  Called with interrupts disabled.
3585  */
3586 static inline void __cache_free(struct kmem_cache *cachep, void *objp)
3587 {
3588 	struct array_cache *ac = cpu_cache_get(cachep);
3589 
3590 	check_irq_off();
3591 	objp = cache_free_debugcheck(cachep, objp, __builtin_return_address(0));
3592 
3593 	/*
3594 	 * Skip calling cache_free_alien() when the platform is not NUMA.
3595 	 * This avoids cache misses that happen while accessing slabp (which
3596 	 * is a per-page memory reference) to get the nodeid. Instead use a
3597 	 * global variable to skip the call, which is most likely to be
3598 	 * present in the cache.
3599 	 */
3600 	if (numa_platform && cache_free_alien(cachep, objp))
3601 		return;
3602 
3603 	if (likely(ac->avail < ac->limit)) {
3604 		STATS_INC_FREEHIT(cachep);
3605 		ac->entry[ac->avail++] = objp;
3606 		return;
3607 	} else {
3608 		STATS_INC_FREEMISS(cachep);
3609 		cache_flusharray(cachep, ac);
3610 		ac->entry[ac->avail++] = objp;
3611 	}
3612 }
3613 
3614 /**
3615  * kmem_cache_alloc - Allocate an object
3616  * @cachep: The cache to allocate from.
3617  * @flags: See kmalloc().
3618  *
3619  * Allocate an object from this cache.  The flags are only relevant
3620  * if the cache has no available objects.
3621  */
3622 void *kmem_cache_alloc(struct kmem_cache *cachep, gfp_t flags)
3623 {
3624 	return __cache_alloc(cachep, flags, __builtin_return_address(0));
3625 }
3626 EXPORT_SYMBOL(kmem_cache_alloc);
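
/*
 * Example (sketch, using the hypothetical foo_cachep from the
 * kmem_cache_create() example earlier): a typical allocate/free pair.
 * The flags are only relevant when the cache has no available objects
 * and a new slab has to be grown.
 *
 *	struct foo *f = kmem_cache_alloc(foo_cachep, GFP_KERNEL);
 *	if (!f)
 *		return -ENOMEM;
 *	...
 *	kmem_cache_free(foo_cachep, f);
 */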
3627 
3628 /**
3629  * kmem_ptr_validate - check if an untrusted pointer might
3630  *	be a slab entry.
3631  * @cachep: the cache we're checking against
3632  * @ptr: pointer to validate
3633  *
3634  * This verifies that the untrusted pointer looks sane:
3635  * it is _not_ a guarantee that the pointer is actually
3636  * part of the slab cache in question, but it at least
3637  * validates that the pointer can be dereferenced and
3638  * looks half-way sane.
3639  *
3640  * Currently only used for dentry validation.
3641  */
3642 int kmem_ptr_validate(struct kmem_cache *cachep, const void *ptr)
3643 {
3644 	unsigned long addr = (unsigned long)ptr;
3645 	unsigned long min_addr = PAGE_OFFSET;
3646 	unsigned long align_mask = BYTES_PER_WORD - 1;
3647 	unsigned long size = cachep->buffer_size;
3648 	struct page *page;
3649 
3650 	if (unlikely(addr < min_addr))
3651 		goto out;
3652 	if (unlikely(addr > (unsigned long)high_memory - size))
3653 		goto out;
3654 	if (unlikely(addr & align_mask))
3655 		goto out;
3656 	if (unlikely(!kern_addr_valid(addr)))
3657 		goto out;
3658 	if (unlikely(!kern_addr_valid(addr + size - 1)))
3659 		goto out;
3660 	page = virt_to_page(ptr);
3661 	if (unlikely(!PageSlab(page)))
3662 		goto out;
3663 	if (unlikely(page_get_cache(page) != cachep))
3664 		goto out;
3665 	return 1;
3666 out:
3667 	return 0;
3668 }
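
/*
 * Illustrative use (sketch only; the dcache is the real caller, 'cachep' and
 * 'ptr' stand in for its cache and the candidate pointer):
 *
 *	if (!kmem_ptr_validate(cachep, ptr))
 *		return NULL;
 */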
3669 
3670 #ifdef CONFIG_NUMA
3671 void *kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid)
3672 {
3673 	return __cache_alloc_node(cachep, flags, nodeid,
3674 			__builtin_return_address(0));
3675 }
3676 EXPORT_SYMBOL(kmem_cache_alloc_node);
3677 
3678 static __always_inline void *
3679 __do_kmalloc_node(size_t size, gfp_t flags, int node, void *caller)
3680 {
3681 	struct kmem_cache *cachep;
3682 
3683 	cachep = kmem_find_general_cachep(size, flags);
3684 	if (unlikely(ZERO_OR_NULL_PTR(cachep)))
3685 		return cachep;
3686 	return kmem_cache_alloc_node(cachep, flags, node);
3687 }
3688 
3689 #ifdef CONFIG_DEBUG_SLAB
3690 void *__kmalloc_node(size_t size, gfp_t flags, int node)
3691 {
3692 	return __do_kmalloc_node(size, flags, node,
3693 			__builtin_return_address(0));
3694 }
3695 EXPORT_SYMBOL(__kmalloc_node);
3696 
3697 void *__kmalloc_node_track_caller(size_t size, gfp_t flags,
3698 		int node, void *caller)
3699 {
3700 	return __do_kmalloc_node(size, flags, node, caller);
3701 }
3702 EXPORT_SYMBOL(__kmalloc_node_track_caller);
3703 #else
3704 void *__kmalloc_node(size_t size, gfp_t flags, int node)
3705 {
3706 	return __do_kmalloc_node(size, flags, node, NULL);
3707 }
3708 EXPORT_SYMBOL(__kmalloc_node);
3709 #endif /* CONFIG_DEBUG_SLAB */
3710 #endif /* CONFIG_NUMA */
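
/*
 * Example (illustrative sketch; 'buf', 'size' and 'nid' are placeholders):
 * callers normally use the kmalloc_node() wrapper from <linux/slab.h>, which
 * is backed by the helpers above.
 *
 *	buf = kmalloc_node(size, GFP_KERNEL, nid);
 *	if (!buf)
 *		return -ENOMEM;
 */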
3711 
3712 /**
3713  * __do_kmalloc - allocate memory
3714  * @size: how many bytes of memory are required.
3715  * @flags: the type of memory to allocate (see kmalloc).
3716  * @caller: function caller for debug tracking of the caller
3717  * @caller: return address of the caller, used for debug tracking
3718 static __always_inline void *__do_kmalloc(size_t size, gfp_t flags,
3719 					  void *caller)
3720 {
3721 	struct kmem_cache *cachep;
3722 
3723 	/* If you want to save a few bytes of .text space: replace
3724 	 * __ with kmem_.
3725 	 * kmalloc then uses the uninlined functions instead of the
3726 	 * inline ones.
3727 	 */
3728 	cachep = __find_general_cachep(size, flags);
3729 	if (unlikely(ZERO_OR_NULL_PTR(cachep)))
3730 		return cachep;
3731 	return __cache_alloc(cachep, flags, caller);
3732 }
3733 
3734 
3735 #ifdef CONFIG_DEBUG_SLAB
3736 void *__kmalloc(size_t size, gfp_t flags)
3737 {
3738 	return __do_kmalloc(size, flags, __builtin_return_address(0));
3739 }
3740 EXPORT_SYMBOL(__kmalloc);
3741 
3742 void *__kmalloc_track_caller(size_t size, gfp_t flags, void *caller)
3743 {
3744 	return __do_kmalloc(size, flags, caller);
3745 }
3746 EXPORT_SYMBOL(__kmalloc_track_caller);
3747 
3748 #else
3749 void *__kmalloc(size_t size, gfp_t flags)
3750 {
3751 	return __do_kmalloc(size, flags, NULL);
3752 }
3753 EXPORT_SYMBOL(__kmalloc);
3754 #endif
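
/*
 * Example (illustrative sketch; 'buf' and 'len' are placeholders): the
 * kmalloc() inline from <linux/slab.h> typically ends up in __kmalloc() for
 * non-constant sizes.
 *
 *	buf = kmalloc(len, GFP_KERNEL);
 *	if (!buf)
 *		return -ENOMEM;
 *	...
 *	kfree(buf);
 */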
3755 
3756 /**
3757  * kmem_cache_free - Deallocate an object
3758  * @cachep: The cache the allocation was from.
3759  * @objp: The previously allocated object.
3760  *
3761  * Free an object which was previously allocated from this
3762  * cache.
3763  */
3764 void kmem_cache_free(struct kmem_cache *cachep, void *objp)
3765 {
3766 	unsigned long flags;
3767 
3768 	local_irq_save(flags);
3769 	debug_check_no_locks_freed(objp, obj_size(cachep));
3770 	__cache_free(cachep, objp);
3771 	local_irq_restore(flags);
3772 }
3773 EXPORT_SYMBOL(kmem_cache_free);
3774 
3775 /**
3776  * kfree - free previously allocated memory
3777  * @objp: pointer returned by kmalloc.
3778  *
3779  * If @objp is NULL, no operation is performed.
3780  *
3781  * Don't free memory not originally allocated by kmalloc()
3782  * or you will run into trouble.
3783  */
3784 void kfree(const void *objp)
3785 {
3786 	struct kmem_cache *c;
3787 	unsigned long flags;
3788 
3789 	if (unlikely(ZERO_OR_NULL_PTR(objp)))
3790 		return;
3791 	local_irq_save(flags);
3792 	kfree_debugcheck(objp);
3793 	c = virt_to_cache(objp);
3794 	debug_check_no_locks_freed(objp, obj_size(c));
3795 	__cache_free(c, (void *)objp);
3796 	local_irq_restore(flags);
3797 }
3798 EXPORT_SYMBOL(kfree);
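
/*
 * Usage note (illustrative; 'buf' is a placeholder): because kfree() ignores
 * NULL and ZERO_SIZE_PTR, cleanup paths may call it unconditionally:
 *
 *	kfree(buf);
 *	buf = NULL;
 */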
3799 
3800 unsigned int kmem_cache_size(struct kmem_cache *cachep)
3801 {
3802 	return obj_size(cachep);
3803 }
3804 EXPORT_SYMBOL(kmem_cache_size);
3805 
3806 const char *kmem_cache_name(struct kmem_cache *cachep)
3807 {
3808 	return cachep->name;
3809 }
3810 EXPORT_SYMBOL_GPL(kmem_cache_name);
3811 
3812 /*
3813  * Initialize each node's kmem_list3, or resize its shared and alien caches.
3814  */
3815 static int alloc_kmemlist(struct kmem_cache *cachep)
3816 {
3817 	int node;
3818 	struct kmem_list3 *l3;
3819 	struct array_cache *new_shared;
3820 	struct array_cache **new_alien = NULL;
3821 
3822 	for_each_online_node(node) {
3823 
3824 		if (use_alien_caches) {
3825 			new_alien = alloc_alien_cache(node, cachep->limit);
3826 			if (!new_alien)
3827 				goto fail;
3828 		}
3829 
3830 		new_shared = NULL;
3831 		if (cachep->shared) {
3832 			new_shared = alloc_arraycache(node,
3833 				cachep->shared*cachep->batchcount,
3834 					0xbaadf00d);
3835 			if (!new_shared) {
3836 				free_alien_cache(new_alien);
3837 				goto fail;
3838 			}
3839 		}
3840 
3841 		l3 = cachep->nodelists[node];
3842 		if (l3) {
3843 			struct array_cache *shared = l3->shared;
3844 
3845 			spin_lock_irq(&l3->list_lock);
3846 
3847 			if (shared)
3848 				free_block(cachep, shared->entry,
3849 						shared->avail, node);
3850 
3851 			l3->shared = new_shared;
3852 			if (!l3->alien) {
3853 				l3->alien = new_alien;
3854 				new_alien = NULL;
3855 			}
3856 			l3->free_limit = (1 + nr_cpus_node(node)) *
3857 					cachep->batchcount + cachep->num;
3858 			spin_unlock_irq(&l3->list_lock);
3859 			kfree(shared);
3860 			free_alien_cache(new_alien);
3861 			continue;
3862 		}
3863 		l3 = kmalloc_node(sizeof(struct kmem_list3), GFP_KERNEL, node);
3864 		if (!l3) {
3865 			free_alien_cache(new_alien);
3866 			kfree(new_shared);
3867 			goto fail;
3868 		}
3869 
3870 		kmem_list3_init(l3);
3871 		l3->next_reap = jiffies + REAPTIMEOUT_LIST3 +
3872 				((unsigned long)cachep) % REAPTIMEOUT_LIST3;
3873 		l3->shared = new_shared;
3874 		l3->alien = new_alien;
3875 		l3->free_limit = (1 + nr_cpus_node(node)) *
3876 					cachep->batchcount + cachep->num;
3877 		cachep->nodelists[node] = l3;
3878 	}
3879 	return 0;
3880 
3881 fail:
3882 	if (!cachep->next.next) {
3883 		/* Cache is not active yet. Roll back what we did */
3884 		node--;
3885 		while (node >= 0) {
3886 			if (cachep->nodelists[node]) {
3887 				l3 = cachep->nodelists[node];
3888 
3889 				kfree(l3->shared);
3890 				free_alien_cache(l3->alien);
3891 				kfree(l3);
3892 				cachep->nodelists[node] = NULL;
3893 			}
3894 			node--;
3895 		}
3896 	}
3897 	return -ENOMEM;
3898 }
3899 
3900 struct ccupdate_struct {
3901 	struct kmem_cache *cachep;
3902 	struct array_cache *new[NR_CPUS];
3903 };
3904 
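/*
 * Runs on each cpu via on_each_cpu() with interrupts disabled: swap this
 * cpu's array_cache for the freshly allocated one in 'new', and stash the
 * old pointer back into 'new' so the caller can drain and free it.
 */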
3905 static void do_ccupdate_local(void *info)
3906 {
3907 	struct ccupdate_struct *new = info;
3908 	struct array_cache *old;
3909 
3910 	check_irq_off();
3911 	old = cpu_cache_get(new->cachep);
3912 
3913 	new->cachep->array[smp_processor_id()] = new->new[smp_processor_id()];
3914 	new->new[smp_processor_id()] = old;
3915 }
3916 
3917 /* Always called with the cache_chain_mutex held */
3918 static int do_tune_cpucache(struct kmem_cache *cachep, int limit,
3919 				int batchcount, int shared)
3920 {
3921 	struct ccupdate_struct *new;
3922 	int i;
3923 
3924 	new = kzalloc(sizeof(*new), GFP_KERNEL);
3925 	if (!new)
3926 		return -ENOMEM;
3927 
3928 	for_each_online_cpu(i) {
3929 		new->new[i] = alloc_arraycache(cpu_to_node(i), limit,
3930 						batchcount);
3931 		if (!new->new[i]) {
3932 			for (i--; i >= 0; i--)
3933 				kfree(new->new[i]);
3934 			kfree(new);
3935 			return -ENOMEM;
3936 		}
3937 	}
3938 	new->cachep = cachep;
3939 
3940 	on_each_cpu(do_ccupdate_local, (void *)new, 1, 1);
3941 
3942 	check_irq_on();
3943 	cachep->batchcount = batchcount;
3944 	cachep->limit = limit;
3945 	cachep->shared = shared;
3946 
3947 	for_each_online_cpu(i) {
3948 		struct array_cache *ccold = new->new[i];
3949 		if (!ccold)
3950 			continue;
3951 		spin_lock_irq(&cachep->nodelists[cpu_to_node(i)]->list_lock);
3952 		free_block(cachep, ccold->entry, ccold->avail, cpu_to_node(i));
3953 		spin_unlock_irq(&cachep->nodelists[cpu_to_node(i)]->list_lock);
3954 		kfree(ccold);
3955 	}
3956 	kfree(new);
3957 	return alloc_kmemlist(cachep);
3958 }
3959 
3960 /* Always called with the cache_chain_mutex held */
3961 static int enable_cpucache(struct kmem_cache *cachep)
3962 {
3963 	int err;
3964 	int limit, shared;
3965 
3966 	/*
3967 	 * The head array serves three purposes:
3968 	 * - create a LIFO ordering, i.e. return objects that are cache-warm
3969 	 * - reduce the number of spinlock operations.
3970 	 * - reduce the number of linked list operations on the slab and
3971 	 *   bufctl chains: array operations are cheaper.
3972 	 * The numbers below are guesses; we should auto-tune them as described
3973 	 * by Bonwick.
3974 	 */
3975 	if (cachep->buffer_size > 131072)
3976 		limit = 1;
3977 	else if (cachep->buffer_size > PAGE_SIZE)
3978 		limit = 8;
3979 	else if (cachep->buffer_size > 1024)
3980 		limit = 24;
3981 	else if (cachep->buffer_size > 256)
3982 		limit = 54;
3983 	else
3984 		limit = 120;
3985 
3986 	/*
3987 	 * CPU bound tasks (e.g. network routing) can exhibit cpu bound
3988 	 * CPU bound tasks (e.g. network routing) can exhibit asymmetric
3989 	 * allocation behaviour: most allocs on one cpu, most free operations
3990 	 * on another cpu. For these cases, efficient object passing between
3991 	 * cpus is necessary. This is provided by a shared array, which
3992 	 * replaces Bonwick's magazine layer.
3993 	 * On uniprocessor, it is functionally equivalent (but less efficient)
3994 	 * to a larger limit, so it is disabled by default there.
3995 	shared = 0;
3996 	if (cachep->buffer_size <= PAGE_SIZE && num_possible_cpus() > 1)
3997 		shared = 8;
3998 
3999 #if DEBUG
4000 	/*
4001 	 * With debugging enabled, a large batchcount leads to excessively long
4002 	 * periods with local interrupts disabled. Limit the batchcount.
4003 	 */
4004 	if (limit > 32)
4005 		limit = 32;
4006 #endif
4007 	err = do_tune_cpucache(cachep, limit, (limit + 1) / 2, shared);
4008 	if (err)
4009 		printk(KERN_ERR "enable_cpucache failed for %s, error %d.\n",
4010 		       cachep->name, -err);
4011 	return err;
4012 }
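
/*
 * Worked example (non-DEBUG, SMP): for a cache with buffer_size == 192 the
 * table above picks limit = 120, so do_tune_cpucache() is called with
 * limit = 120, batchcount = (120 + 1) / 2 = 60 and shared = 8.
 */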
4013 
4014 /*
4015  * Drain an array if it contains any elements, taking the l3 lock only if
4016  * necessary. Note that the l3 listlock also protects the array_cache
4017  * if drain_array() is used on the shared array.
4018  */
4019 void drain_array(struct kmem_cache *cachep, struct kmem_list3 *l3,
4020 			 struct array_cache *ac, int force, int node)
4021 {
4022 	int tofree;
4023 
4024 	if (!ac || !ac->avail)
4025 		return;
4026 	if (ac->touched && !force) {
4027 		ac->touched = 0;
4028 	} else {
4029 		spin_lock_irq(&l3->list_lock);
4030 		if (ac->avail) {
4031 			tofree = force ? ac->avail : (ac->limit + 4) / 5;
4032 			if (tofree > ac->avail)
4033 				tofree = (ac->avail + 1) / 2;
4034 			free_block(cachep, ac->entry, tofree, node);
4035 			ac->avail -= tofree;
4036 			memmove(ac->entry, &(ac->entry[tofree]),
4037 				sizeof(void *) * ac->avail);
4038 		}
4039 		spin_unlock_irq(&l3->list_lock);
4040 	}
4041 }
4042 
4043 /**
4044  * cache_reap - Reclaim memory from caches.
4045  * @w: work descriptor
4046  *
4047  * Called from workqueue/eventd every few seconds.
4048  * Purpose:
4049  * - clear the per-cpu caches for this CPU.
4050  * - return freeable pages to the main free memory pool.
4051  *
4052  * If we cannot acquire the cache chain mutex then just give up - we'll try
4053  * again on the next iteration.
4054  */
4055 static void cache_reap(struct work_struct *w)
4056 {
4057 	struct kmem_cache *searchp;
4058 	struct kmem_list3 *l3;
4059 	int node = numa_node_id();
4060 	struct delayed_work *work =
4061 		container_of(w, struct delayed_work, work);
4062 
4063 	if (!mutex_trylock(&cache_chain_mutex))
4064 		/* Give up. Set up the next iteration. */
4065 		goto out;
4066 
4067 	list_for_each_entry(searchp, &cache_chain, next) {
4068 		check_irq_on();
4069 
4070 		/*
4071 		 * We only take the l3 lock if absolutely necessary and we
4072 		 * have established with reasonable certainty that
4073 		 * we can do some work if the lock was obtained.
4074 		 */
4075 		l3 = searchp->nodelists[node];
4076 
4077 		reap_alien(searchp, l3);
4078 
4079 		drain_array(searchp, l3, cpu_cache_get(searchp), 0, node);
4080 
4081 		/*
4082 		 * These are racy checks but it does not matter
4083 		 * if we skip one check or scan twice.
4084 		 */
4085 		if (time_after(l3->next_reap, jiffies))
4086 			goto next;
4087 
4088 		l3->next_reap = jiffies + REAPTIMEOUT_LIST3;
4089 
4090 		drain_array(searchp, l3, l3->shared, 0, node);
4091 
4092 		if (l3->free_touched)
4093 			l3->free_touched = 0;
4094 		else {
4095 			int freed;
4096 
4097 			freed = drain_freelist(searchp, l3, (l3->free_limit +
4098 				5 * searchp->num - 1) / (5 * searchp->num));
4099 			STATS_ADD_REAPED(searchp, freed);
4100 		}
4101 next:
4102 		cond_resched();
4103 	}
4104 	check_irq_on();
4105 	mutex_unlock(&cache_chain_mutex);
4106 	next_reap_node();
4107 out:
4108 	/* Set up the next iteration */
4109 	schedule_delayed_work(work, round_jiffies_relative(REAPTIMEOUT_CPUC));
4110 }
4111 
4112 #ifdef CONFIG_SLABINFO
4113 
4114 static void print_slabinfo_header(struct seq_file *m)
4115 {
4116 	/*
4117 	 * Output format version, so at least we can change it
4118 	 * without _too_ many complaints.
4119 	 */
4120 #if STATS
4121 	seq_puts(m, "slabinfo - version: 2.1 (statistics)\n");
4122 #else
4123 	seq_puts(m, "slabinfo - version: 2.1\n");
4124 #endif
4125 	seq_puts(m, "# name            <active_objs> <num_objs> <objsize> "
4126 		 "<objperslab> <pagesperslab>");
4127 	seq_puts(m, " : tunables <limit> <batchcount> <sharedfactor>");
4128 	seq_puts(m, " : slabdata <active_slabs> <num_slabs> <sharedavail>");
4129 #if STATS
4130 	seq_puts(m, " : globalstat <listallocs> <maxobjs> <grown> <reaped> "
4131 		 "<error> <maxfreeable> <nodeallocs> <remotefrees> <alienoverflow>");
4132 	seq_puts(m, " : cpustat <allochit> <allocmiss> <freehit> <freemiss>");
4133 #endif
4134 	seq_putc(m, '\n');
4135 }
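
/*
 * An entry produced by s_show() below looks roughly like this (the values
 * are illustrative only):
 *
 * dentry_cache  10050  10121    132   29    1 : tunables  120   60    8 : slabdata    349    349      0
 */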
4136 
4137 static void *s_start(struct seq_file *m, loff_t *pos)
4138 {
4139 	loff_t n = *pos;
4140 
4141 	mutex_lock(&cache_chain_mutex);
4142 	if (!n)
4143 		print_slabinfo_header(m);
4144 
4145 	return seq_list_start(&cache_chain, *pos);
4146 }
4147 
4148 static void *s_next(struct seq_file *m, void *p, loff_t *pos)
4149 {
4150 	return seq_list_next(p, &cache_chain, pos);
4151 }
4152 
4153 static void s_stop(struct seq_file *m, void *p)
4154 {
4155 	mutex_unlock(&cache_chain_mutex);
4156 }
4157 
4158 static int s_show(struct seq_file *m, void *p)
4159 {
4160 	struct kmem_cache *cachep = list_entry(p, struct kmem_cache, next);
4161 	struct slab *slabp;
4162 	unsigned long active_objs;
4163 	unsigned long num_objs;
4164 	unsigned long active_slabs = 0;
4165 	unsigned long num_slabs, free_objects = 0, shared_avail = 0;
4166 	const char *name;
4167 	char *error = NULL;
4168 	int node;
4169 	struct kmem_list3 *l3;
4170 
4171 	active_objs = 0;
4172 	num_slabs = 0;
4173 	for_each_online_node(node) {
4174 		l3 = cachep->nodelists[node];
4175 		if (!l3)
4176 			continue;
4177 
4178 		check_irq_on();
4179 		spin_lock_irq(&l3->list_lock);
4180 
4181 		list_for_each_entry(slabp, &l3->slabs_full, list) {
4182 			if (slabp->inuse != cachep->num && !error)
4183 				error = "slabs_full accounting error";
4184 			active_objs += cachep->num;
4185 			active_slabs++;
4186 		}
4187 		list_for_each_entry(slabp, &l3->slabs_partial, list) {
4188 			if (slabp->inuse == cachep->num && !error)
4189 				error = "slabs_partial inuse accounting error";
4190 			if (!slabp->inuse && !error)
4191 				error = "slabs_partial/inuse accounting error";
4192 			active_objs += slabp->inuse;
4193 			active_slabs++;
4194 		}
4195 		list_for_each_entry(slabp, &l3->slabs_free, list) {
4196 			if (slabp->inuse && !error)
4197 				error = "slabs_free/inuse accounting error";
4198 			num_slabs++;
4199 		}
4200 		free_objects += l3->free_objects;
4201 		if (l3->shared)
4202 			shared_avail += l3->shared->avail;
4203 
4204 		spin_unlock_irq(&l3->list_lock);
4205 	}
4206 	num_slabs += active_slabs;
4207 	num_objs = num_slabs * cachep->num;
4208 	if (num_objs - active_objs != free_objects && !error)
4209 		error = "free_objects accounting error";
4210 
4211 	name = cachep->name;
4212 	if (error)
4213 		printk(KERN_ERR "slab: cache %s error: %s\n", name, error);
4214 
4215 	seq_printf(m, "%-17s %6lu %6lu %6u %4u %4d",
4216 		   name, active_objs, num_objs, cachep->buffer_size,
4217 		   cachep->num, (1 << cachep->gfporder));
4218 	seq_printf(m, " : tunables %4u %4u %4u",
4219 		   cachep->limit, cachep->batchcount, cachep->shared);
4220 	seq_printf(m, " : slabdata %6lu %6lu %6lu",
4221 		   active_slabs, num_slabs, shared_avail);
4222 #if STATS
4223 	{			/* list3 stats */
4224 		unsigned long high = cachep->high_mark;
4225 		unsigned long allocs = cachep->num_allocations;
4226 		unsigned long grown = cachep->grown;
4227 		unsigned long reaped = cachep->reaped;
4228 		unsigned long errors = cachep->errors;
4229 		unsigned long max_freeable = cachep->max_freeable;
4230 		unsigned long node_allocs = cachep->node_allocs;
4231 		unsigned long node_frees = cachep->node_frees;
4232 		unsigned long overflows = cachep->node_overflow;
4233 
4234 		seq_printf(m, " : globalstat %7lu %6lu %5lu %4lu "
4235 				"%4lu %4lu %4lu %4lu %4lu", allocs, high, grown,
4236 				reaped, errors, max_freeable, node_allocs,
4237 				node_frees, overflows);
4238 	}
4239 	/* cpu stats */
4240 	{
4241 		unsigned long allochit = atomic_read(&cachep->allochit);
4242 		unsigned long allocmiss = atomic_read(&cachep->allocmiss);
4243 		unsigned long freehit = atomic_read(&cachep->freehit);
4244 		unsigned long freemiss = atomic_read(&cachep->freemiss);
4245 
4246 		seq_printf(m, " : cpustat %6lu %6lu %6lu %6lu",
4247 			   allochit, allocmiss, freehit, freemiss);
4248 	}
4249 #endif
4250 	seq_putc(m, '\n');
4251 	return 0;
4252 }
4253 
4254 /*
4255  * slabinfo_op - iterator that generates /proc/slabinfo
4256  *
4257  * Output layout:
4258  * cache-name
4259  * num-active-objs
4260  * total-objs
4261  * object size
4262  * num-active-slabs
4263  * total-slabs
4264  * num-pages-per-slab
4265  * + further values on SMP and with statistics enabled
4266  */
4267 
4268 const struct seq_operations slabinfo_op = {
4269 	.start = s_start,
4270 	.next = s_next,
4271 	.stop = s_stop,
4272 	.show = s_show,
4273 };
4274 
4275 #define MAX_SLABINFO_WRITE 128
4276 /**
4277  * slabinfo_write - Tuning for the slab allocator
4278  * @file: unused
4279  * @buffer: user buffer
4280  * @count: data length
4281  * @ppos: unused
4282  */
4283 ssize_t slabinfo_write(struct file *file, const char __user *buffer,
4284 		       size_t count, loff_t *ppos)
4285 {
4286 	char kbuf[MAX_SLABINFO_WRITE + 1], *tmp;
4287 	int limit, batchcount, shared, res;
4288 	struct kmem_cache *cachep;
4289 
4290 	if (count > MAX_SLABINFO_WRITE)
4291 		return -EINVAL;
4292 	if (copy_from_user(&kbuf, buffer, count))
4293 		return -EFAULT;
4294 	kbuf[MAX_SLABINFO_WRITE] = '\0';
4295 
4296 	tmp = strchr(kbuf, ' ');
4297 	if (!tmp)
4298 		return -EINVAL;
4299 	*tmp = '\0';
4300 	tmp++;
4301 	if (sscanf(tmp, " %d %d %d", &limit, &batchcount, &shared) != 3)
4302 		return -EINVAL;
4303 
4304 	/* Find the cache in the chain of caches. */
4305 	mutex_lock(&cache_chain_mutex);
4306 	res = -EINVAL;
4307 	list_for_each_entry(cachep, &cache_chain, next) {
4308 		if (!strcmp(cachep->name, kbuf)) {
4309 			if (limit < 1 || batchcount < 1 ||
4310 					batchcount > limit || shared < 0) {
4311 				res = 0;
4312 			} else {
4313 				res = do_tune_cpucache(cachep, limit,
4314 						       batchcount, shared);
4315 			}
4316 			break;
4317 		}
4318 	}
4319 	mutex_unlock(&cache_chain_mutex);
4320 	if (res >= 0)
4321 		res = count;
4322 	return res;
4323 }
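
/*
 * Example (illustrative): tune a cache from userspace by writing
 * "<name> <limit> <batchcount> <shared>" to /proc/slabinfo, e.g.
 *
 *	echo "dentry_cache 256 128 8" > /proc/slabinfo
 */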
4324 
4325 #ifdef CONFIG_DEBUG_SLAB_LEAK
4326 
4327 static void *leaks_start(struct seq_file *m, loff_t *pos)
4328 {
4329 	mutex_lock(&cache_chain_mutex);
4330 	return seq_list_start(&cache_chain, *pos);
4331 }
4332 
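/*
 * 'n' is a caller table: n[0] holds the capacity, n[1] the number of entries,
 * and the entries follow at n[2] as (caller address, hit count) pairs sorted
 * by address. add_caller() binary-searches the table and either bumps an
 * existing count or inserts a new pair; it returns 0 once the table is full.
 */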
4333 static inline int add_caller(unsigned long *n, unsigned long v)
4334 {
4335 	unsigned long *p;
4336 	int l;
4337 	if (!v)
4338 		return 1;
4339 	l = n[1];
4340 	p = n + 2;
4341 	while (l) {
4342 		int i = l/2;
4343 		unsigned long *q = p + 2 * i;
4344 		if (*q == v) {
4345 			q[1]++;
4346 			return 1;
4347 		}
4348 		if (*q > v) {
4349 			l = i;
4350 		} else {
4351 			p = q + 2;
4352 			l -= i + 1;
4353 		}
4354 	}
4355 	if (++n[1] == n[0])
4356 		return 0;
4357 	memmove(p + 2, p, n[1] * 2 * sizeof(unsigned long) - ((void *)p - (void *)n));
4358 	p[0] = v;
4359 	p[1] = 1;
4360 	return 1;
4361 }
4362 
4363 static void handle_slab(unsigned long *n, struct kmem_cache *c, struct slab *s)
4364 {
4365 	void *p;
4366 	int i;
4367 	if (n[0] == n[1])
4368 		return;
4369 	for (i = 0, p = s->s_mem; i < c->num; i++, p += c->buffer_size) {
4370 		if (slab_bufctl(s)[i] != BUFCTL_ACTIVE)
4371 			continue;
4372 		if (!add_caller(n, (unsigned long)*dbg_userword(c, p)))
4373 			return;
4374 	}
4375 }
4376 
4377 static void show_symbol(struct seq_file *m, unsigned long address)
4378 {
4379 #ifdef CONFIG_KALLSYMS
4380 	unsigned long offset, size;
4381 	char modname[MODULE_NAME_LEN], name[KSYM_NAME_LEN];
4382 
4383 	if (lookup_symbol_attrs(address, &size, &offset, modname, name) == 0) {
4384 		seq_printf(m, "%s+%#lx/%#lx", name, offset, size);
4385 		if (modname[0])
4386 			seq_printf(m, " [%s]", modname);
4387 		return;
4388 	}
4389 #endif
4390 	seq_printf(m, "%p", (void *)address);
4391 }
4392 
4393 static int leaks_show(struct seq_file *m, void *p)
4394 {
4395 	struct kmem_cache *cachep = list_entry(p, struct kmem_cache, next);
4396 	struct slab *slabp;
4397 	struct kmem_list3 *l3;
4398 	const char *name;
4399 	unsigned long *n = m->private;
4400 	int node;
4401 	int i;
4402 
4403 	if (!(cachep->flags & SLAB_STORE_USER))
4404 		return 0;
4405 	if (!(cachep->flags & SLAB_RED_ZONE))
4406 		return 0;
4407 
4408 	/* OK, we can do it */
4409 
4410 	n[1] = 0;
4411 
4412 	for_each_online_node(node) {
4413 		l3 = cachep->nodelists[node];
4414 		if (!l3)
4415 			continue;
4416 
4417 		check_irq_on();
4418 		spin_lock_irq(&l3->list_lock);
4419 
4420 		list_for_each_entry(slabp, &l3->slabs_full, list)
4421 			handle_slab(n, cachep, slabp);
4422 		list_for_each_entry(slabp, &l3->slabs_partial, list)
4423 			handle_slab(n, cachep, slabp);
4424 		spin_unlock_irq(&l3->list_lock);
4425 	}
4426 	name = cachep->name;
4427 	if (n[0] == n[1]) {
4428 		/* Increase the buffer size */
4429 		mutex_unlock(&cache_chain_mutex);
4430 		m->private = kzalloc(n[0] * 4 * sizeof(unsigned long), GFP_KERNEL);
4431 		if (!m->private) {
4432 			/* Too bad, we are really out */
4433 			m->private = n;
4434 			mutex_lock(&cache_chain_mutex);
4435 			return -ENOMEM;
4436 		}
4437 		*(unsigned long *)m->private = n[0] * 2;
4438 		kfree(n);
4439 		mutex_lock(&cache_chain_mutex);
4440 		/* Now make sure this entry will be retried */
4441 		m->count = m->size;
4442 		return 0;
4443 	}
4444 	for (i = 0; i < n[1]; i++) {
4445 		seq_printf(m, "%s: %lu ", name, n[2*i+3]);
4446 		show_symbol(m, n[2*i+2]);
4447 		seq_putc(m, '\n');
4448 	}
4449 
4450 	return 0;
4451 }
4452 
4453 const struct seq_operations slabstats_op = {
4454 	.start = leaks_start,
4455 	.next = s_next,
4456 	.stop = s_stop,
4457 	.show = leaks_show,
4458 };
4459 #endif
4460 #endif
4461 
4462 /**
4463  * ksize - get the actual amount of memory allocated for a given object
4464  * @objp: Pointer to the object
4465  *
4466  * kmalloc may internally round up allocations and return more memory
4467  * than requested. ksize() can be used to determine the actual amount of
4468  * memory allocated. The caller may use this additional memory, even though
4469  * a smaller amount of memory was initially specified with the kmalloc call.
4470  * The caller must guarantee that objp points to a valid object previously
4471  * allocated with either kmalloc() or kmem_cache_alloc(). The object
4472  * must not be freed during the duration of the call.
4473  */
4474 size_t ksize(const void *objp)
4475 {
4476 	BUG_ON(!objp);
4477 	if (unlikely(objp == ZERO_SIZE_PTR))
4478 		return 0;
4479 
4480 	return obj_size(virt_to_cache(objp));
4481 }
4482 EXPORT_SYMBOL(ksize);
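
/*
 * Example (illustrative): a kmalloc(17, GFP_KERNEL) request is typically
 * satisfied from the 32-byte general cache, so ksize() on the returned
 * pointer reports 32 and the caller may safely use all 32 bytes.
 */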
4483