xref: /openbmc/linux/mm/slab.c (revision b454cc66)
1 /*
2  * linux/mm/slab.c
3  * Written by Mark Hemment, 1996/97.
4  * (markhe@nextd.demon.co.uk)
5  *
6  * kmem_cache_destroy() + some cleanup - 1999 Andrea Arcangeli
7  *
8  * Major cleanup, different bufctl logic, per-cpu arrays
9  *	(c) 2000 Manfred Spraul
10  *
11  * Cleanup, make the head arrays unconditional, preparation for NUMA
12  * 	(c) 2002 Manfred Spraul
13  *
14  * An implementation of the Slab Allocator as described in outline in:
15  *	UNIX Internals: The New Frontiers by Uresh Vahalia
16  *	Pub: Prentice Hall	ISBN 0-13-101908-2
17  * or with a little more detail in:
18  *	The Slab Allocator: An Object-Caching Kernel Memory Allocator
19  *	Jeff Bonwick (Sun Microsystems).
20  *	Presented at: USENIX Summer 1994 Technical Conference
21  *
22  * The memory is organized in caches, one cache for each object type.
23  * (e.g. inode_cache, dentry_cache, buffer_head, vm_area_struct)
24  * Each cache consists of many slabs (they are small (usually one
25  * page long) and always contiguous), and each slab contains multiple
26  * initialized objects.
27  *
28  * This means that your constructor is used only for newly allocated
29  * slabs and you must pass objects with the same initializations to
30  * kmem_cache_free.
31  *
32  * Each cache can only support one memory type (GFP_DMA, GFP_HIGHMEM,
33  * normal). If you need a special memory type, then you must create a new
34  * cache for that memory type.
35  *
36  * In order to reduce fragmentation, the slabs are sorted in 3 groups:
37  *   full slabs with 0 free objects
38  *   partial slabs
39  *   empty slabs with no allocated objects
40  *
41  * If partial slabs exist, then new allocations come from these slabs,
42  * otherwise they come from empty slabs, or new slabs are allocated.
43  *
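 * A minimal usage sketch (illustrative only; "struct my_obj" and the flag
 * choice are assumptions, not taken from this file):
 *
 *	struct kmem_cache *cp = kmem_cache_create("my_obj_cache",
 *			sizeof(struct my_obj), 0,
 *			SLAB_HWCACHE_ALIGN, NULL, NULL);
 *	struct my_obj *p = kmem_cache_alloc(cp, GFP_KERNEL);
 *	...
 *	kmem_cache_free(cp, p);
 *	kmem_cache_destroy(cp);
 *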
44  * kmem_cache_destroy() CAN CRASH if you try to allocate from the cache
45  * during kmem_cache_destroy(). The caller must prevent concurrent allocs.
46  *
47  * Each cache has a short per-cpu head array, most allocs
48  * and frees go into that array, and if that array overflows, then 1/2
49  * of the entries in the array are given back into the global cache.
50  * The head array is strictly LIFO and should improve the cache hit rates.
51  * On SMP, it additionally reduces the spinlock operations.
52  *
53  * The c_cpuarray may not be read with local interrupts enabled -
54  * it's changed with a smp_call_function().
55  *
56  * SMP synchronization:
57  *  constructors and destructors are called without any locking.
58  *  Several members in struct kmem_cache and struct slab never change; they
59  *	are accessed without any locking.
60  *  The per-cpu arrays are never accessed from the wrong cpu, no locking,
61  *  	and local interrupts are disabled so slab code is preempt-safe.
62  *  The non-constant members are protected with a per-cache irq spinlock.
63  *
64  * Many thanks to Mark Hemment, who wrote another per-cpu slab patch
65  * in 2000 - many ideas in the current implementation are derived from
66  * his patch.
67  *
68  * Further notes from the original documentation:
69  *
70  * 11 April '97.  Started multi-threading - markhe
71  *	The global cache-chain is protected by the mutex 'cache_chain_mutex'.
72  *	The mutex is only needed when accessing/extending the cache-chain, which
73  *	can never happen inside an interrupt (kmem_cache_create(),
74  *	kmem_cache_shrink() and kmem_cache_reap()).
75  *
76  *	At present, each engine can be growing a cache.  This should be blocked.
77  *
78  * 15 March 2005. NUMA slab allocator.
79  *	Shai Fultheim <shai@scalex86.org>.
80  *	Shobhit Dayal <shobhit@calsoftinc.com>
81  *	Alok N Kataria <alokk@calsoftinc.com>
82  *	Christoph Lameter <christoph@lameter.com>
83  *
84  *	Modified the slab allocator to be node aware on NUMA systems.
85  *	Each node has its own list of partial, free and full slabs.
86  *	All object allocations for a node occur from node specific slab lists.
87  */
88 
89 #include	<linux/slab.h>
90 #include	<linux/mm.h>
91 #include	<linux/poison.h>
92 #include	<linux/swap.h>
93 #include	<linux/cache.h>
94 #include	<linux/interrupt.h>
95 #include	<linux/init.h>
96 #include	<linux/compiler.h>
97 #include	<linux/cpuset.h>
98 #include	<linux/seq_file.h>
99 #include	<linux/notifier.h>
100 #include	<linux/kallsyms.h>
101 #include	<linux/cpu.h>
102 #include	<linux/sysctl.h>
103 #include	<linux/module.h>
104 #include	<linux/rcupdate.h>
105 #include	<linux/string.h>
106 #include	<linux/uaccess.h>
107 #include	<linux/nodemask.h>
108 #include	<linux/mempolicy.h>
109 #include	<linux/mutex.h>
110 #include	<linux/fault-inject.h>
111 #include	<linux/rtmutex.h>
112 #include	<linux/reciprocal_div.h>
113 
114 #include	<asm/cacheflush.h>
115 #include	<asm/tlbflush.h>
116 #include	<asm/page.h>
117 
118 /*
119  * DEBUG	- 1 for kmem_cache_create() to honour: SLAB_DEBUG_INITIAL,
120  *		  SLAB_RED_ZONE & SLAB_POISON.
121  *		  0 for faster, smaller code (especially in the critical paths).
122  *
123  * STATS	- 1 to collect stats for /proc/slabinfo.
124  *		  0 for faster, smaller code (especially in the critical paths).
125  *
126  * FORCED_DEBUG	- 1 enables SLAB_RED_ZONE and SLAB_POISON (if possible)
127  */
128 
129 #ifdef CONFIG_DEBUG_SLAB
130 #define	DEBUG		1
131 #define	STATS		1
132 #define	FORCED_DEBUG	1
133 #else
134 #define	DEBUG		0
135 #define	STATS		0
136 #define	FORCED_DEBUG	0
137 #endif
138 
139 /* Shouldn't this be in a header file somewhere? */
140 #define	BYTES_PER_WORD		sizeof(void *)
141 
142 #ifndef cache_line_size
143 #define cache_line_size()	L1_CACHE_BYTES
144 #endif
145 
146 #ifndef ARCH_KMALLOC_MINALIGN
147 /*
148  * Enforce a minimum alignment for the kmalloc caches.
149  * Usually, the kmalloc caches are cache_line_size() aligned, except when
150  * DEBUG and FORCED_DEBUG are enabled, in which case they are BYTES_PER_WORD aligned.
151  * Some archs want to perform DMA into kmalloc caches and need a guaranteed
152  * alignment larger than BYTES_PER_WORD. ARCH_KMALLOC_MINALIGN allows that.
153  * Note that this flag disables some debug features.
154  */
155 #define ARCH_KMALLOC_MINALIGN 0
156 #endif
157 
158 #ifndef ARCH_SLAB_MINALIGN
159 /*
160  * Enforce a minimum alignment for all caches.
161  * Intended for archs that get misalignment faults even for BYTES_PER_WORD
162  * aligned buffers. Includes ARCH_KMALLOC_MINALIGN.
163  * If possible: Do not enable this flag for CONFIG_DEBUG_SLAB, it disables
164  * some debug features.
165  */
166 #define ARCH_SLAB_MINALIGN 0
167 #endif
168 
169 #ifndef ARCH_KMALLOC_FLAGS
170 #define ARCH_KMALLOC_FLAGS SLAB_HWCACHE_ALIGN
171 #endif
172 
173 /* Legal flag mask for kmem_cache_create(). */
174 #if DEBUG
175 # define CREATE_MASK	(SLAB_DEBUG_INITIAL | SLAB_RED_ZONE | \
176 			 SLAB_POISON | SLAB_HWCACHE_ALIGN | \
177 			 SLAB_CACHE_DMA | \
178 			 SLAB_MUST_HWCACHE_ALIGN | SLAB_STORE_USER | \
179 			 SLAB_RECLAIM_ACCOUNT | SLAB_PANIC | \
180 			 SLAB_DESTROY_BY_RCU | SLAB_MEM_SPREAD)
181 #else
182 # define CREATE_MASK	(SLAB_HWCACHE_ALIGN | \
183 			 SLAB_CACHE_DMA | SLAB_MUST_HWCACHE_ALIGN | \
184 			 SLAB_RECLAIM_ACCOUNT | SLAB_PANIC | \
185 			 SLAB_DESTROY_BY_RCU | SLAB_MEM_SPREAD)
186 #endif
187 
188 /*
189  * kmem_bufctl_t:
190  *
191  * Bufctls are used for linking objs within a slab via
192  * linked offsets.
193  *
194  * This implementation relies on "struct page" for locating the cache &
195  * slab an object belongs to.
196  * This allows the bufctl structure to be small (one int), but limits
197  * the number of objects a slab (not a cache) can contain when off-slab
198  * bufctls are used. The limit is the size of the largest general cache
199  * that does not use off-slab slabs.
200  * For 32-bit archs with 4 kB pages, this is 56.
201  * This is not serious, as it is only for large objects, when it is unwise
202  * to have too many per slab.
203  * Note: This limit can be raised by introducing a general cache whose size
204  * is less than 512 (PAGE_SIZE<<3), but greater than 256.
205  */
206 
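/*
 * Illustrative sketch of one free-list step (an assumption based on the
 * on-slab layout, where the kmem_bufctl_t array immediately follows
 * struct slab):
 *
 *	kmem_bufctl_t *bufctl = (kmem_bufctl_t *)(slabp + 1);
 *	kmem_bufctl_t next = bufctl[slabp->free];	(index of next free obj)
 */
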
207 typedef unsigned int kmem_bufctl_t;
208 #define BUFCTL_END	(((kmem_bufctl_t)(~0U))-0)
209 #define BUFCTL_FREE	(((kmem_bufctl_t)(~0U))-1)
210 #define	BUFCTL_ACTIVE	(((kmem_bufctl_t)(~0U))-2)
211 #define	SLAB_LIMIT	(((kmem_bufctl_t)(~0U))-3)
212 
213 /*
214  * struct slab
215  *
216  * Manages the objs in a slab. Placed either at the beginning of mem allocated
217  * for a slab, or allocated from a general cache.
218  * Slabs are chained into three lists: fully used, partial, fully free slabs.
219  */
220 struct slab {
221 	struct list_head list;
222 	unsigned long colouroff;
223 	void *s_mem;		/* including colour offset */
224 	unsigned int inuse;	/* num of objs active in slab */
225 	kmem_bufctl_t free;
226 	unsigned short nodeid;
227 };
228 
229 /*
230  * struct slab_rcu
231  *
232  * slab_destroy on a SLAB_DESTROY_BY_RCU cache uses this structure to
233  * arrange for kmem_freepages to be called via RCU.  This is useful if
234  * we need to approach a kernel structure obliquely, from its address
235  * obtained without the usual locking.  We can lock the structure to
236  * stabilize it and check it's still at the given address, only if we
237  * can be sure that the memory has not been meanwhile reused for some
238  * other kind of object (which our subsystem's lock might corrupt).
239  *
240  * rcu_read_lock before reading the address, then rcu_read_unlock after
241  * taking the spinlock within the structure expected at that address.
242  *
243  * We assume struct slab_rcu can overlay struct slab when destroying.
244  */
245 struct slab_rcu {
246 	struct rcu_head head;
247 	struct kmem_cache *cachep;
248 	void *addr;
249 };
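
/*
 * Sketch of the stabilization pattern described above (hypothetical
 * subsystem lock and lookup; not code from this file):
 *
 *	rcu_read_lock();
 *	obj = lookup(key);
 *	spin_lock(&obj->lock);
 *	if (obj != lookup(key))
 *		back off - the slab memory was recycled
 *	rcu_read_unlock();
 */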
250 
251 /*
252  * struct array_cache
253  *
254  * Purpose:
255  * - LIFO ordering, to hand out cache-warm objects from _alloc
256  * - reduce the number of linked list operations
257  * - reduce spinlock operations
258  *
259  * The limit is stored in the per-cpu structure to reduce the data cache
260  * footprint.
261  *
262  */
263 struct array_cache {
264 	unsigned int avail;
265 	unsigned int limit;
266 	unsigned int batchcount;
267 	unsigned int touched;
268 	spinlock_t lock;
269 	void *entry[0];	/*
270 			 * Must have this definition in here for the proper
271 			 * alignment of array_cache. Also simplifies accessing
272 			 * the entries.
273 			 * [0] is for gcc 2.95. It should really be [].
274 			 */
275 };
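
/*
 * The entry[] array behaves as a plain LIFO stack; sketch of the fast
 * paths used below (run with local interrupts disabled):
 *
 *	ac->entry[ac->avail++] = objp;		push on free
 *	objp = ac->entry[--ac->avail];		pop on alloc
 */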
276 
277 /*
278  * bootstrap: The caches do not work without cpuarrays anymore, but the
279  * cpuarrays are allocated from the generic caches...
280  */
281 #define BOOT_CPUCACHE_ENTRIES	1
282 struct arraycache_init {
283 	struct array_cache cache;
284 	void *entries[BOOT_CPUCACHE_ENTRIES];
285 };
286 
287 /*
288  * The slab lists for all objects.
289  */
290 struct kmem_list3 {
291 	struct list_head slabs_partial;	/* partial list first, better asm code */
292 	struct list_head slabs_full;
293 	struct list_head slabs_free;
294 	unsigned long free_objects;
295 	unsigned int free_limit;
296 	unsigned int colour_next;	/* Per-node cache coloring */
297 	spinlock_t list_lock;
298 	struct array_cache *shared;	/* shared per node */
299 	struct array_cache **alien;	/* on other nodes */
300 	unsigned long next_reap;	/* updated without locking */
301 	int free_touched;		/* updated without locking */
302 };
303 
304 /*
305  * Need this for bootstrapping a per node allocator.
306  */
307 #define NUM_INIT_LISTS (2 * MAX_NUMNODES + 1)
308 struct kmem_list3 __initdata initkmem_list3[NUM_INIT_LISTS];
309 #define	CACHE_CACHE 0
310 #define	SIZE_AC 1
311 #define	SIZE_L3 (1 + MAX_NUMNODES)
312 
313 static int drain_freelist(struct kmem_cache *cache,
314 			struct kmem_list3 *l3, int tofree);
315 static void free_block(struct kmem_cache *cachep, void **objpp, int len,
316 			int node);
317 static int enable_cpucache(struct kmem_cache *cachep);
318 static void cache_reap(struct work_struct *unused);
319 
320 /*
321  * This function must be completely optimized away if a constant is passed to
322  * it.  Mostly the same as what is in linux/slab.h except it returns an index.
323  */
324 static __always_inline int index_of(const size_t size)
325 {
326 	extern void __bad_size(void);
327 
328 	if (__builtin_constant_p(size)) {
329 		int i = 0;
330 
331 #define CACHE(x) \
332 	if (size <= x) \
333 		return i; \
334 	else \
335 		i++;
336 #include "linux/kmalloc_sizes.h"
337 #undef CACHE
338 		__bad_size();
339 	} else
340 		__bad_size();
341 	return 0;
342 }
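
/*
 * Example (assuming a kmalloc_sizes.h table that begins 32, 64, 96, 128):
 * index_of(20) folds to the constant 0 and index_of(100) to 3, while a
 * non-constant or oversized argument leaves an unresolved call to
 * __bad_size() and thus fails at link time.
 */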
343 
344 static int slab_early_init = 1;
345 
346 #define INDEX_AC index_of(sizeof(struct arraycache_init))
347 #define INDEX_L3 index_of(sizeof(struct kmem_list3))
348 
349 static void kmem_list3_init(struct kmem_list3 *parent)
350 {
351 	INIT_LIST_HEAD(&parent->slabs_full);
352 	INIT_LIST_HEAD(&parent->slabs_partial);
353 	INIT_LIST_HEAD(&parent->slabs_free);
354 	parent->shared = NULL;
355 	parent->alien = NULL;
356 	parent->colour_next = 0;
357 	spin_lock_init(&parent->list_lock);
358 	parent->free_objects = 0;
359 	parent->free_touched = 0;
360 }
361 
362 #define MAKE_LIST(cachep, listp, slab, nodeid)				\
363 	do {								\
364 		INIT_LIST_HEAD(listp);					\
365 		list_splice(&(cachep->nodelists[nodeid]->slab), listp);	\
366 	} while (0)
367 
368 #define	MAKE_ALL_LISTS(cachep, ptr, nodeid)				\
369 	do {								\
370 	MAKE_LIST((cachep), (&(ptr)->slabs_full), slabs_full, nodeid);	\
371 	MAKE_LIST((cachep), (&(ptr)->slabs_partial), slabs_partial, nodeid); \
372 	MAKE_LIST((cachep), (&(ptr)->slabs_free), slabs_free, nodeid);	\
373 	} while (0)
374 
375 /*
376  * struct kmem_cache
377  *
378  * manages a cache.
379  */
380 
381 struct kmem_cache {
382 /* 1) per-cpu data, touched during every alloc/free */
383 	struct array_cache *array[NR_CPUS];
384 /* 2) Cache tunables. Protected by cache_chain_mutex */
385 	unsigned int batchcount;
386 	unsigned int limit;
387 	unsigned int shared;
388 
389 	unsigned int buffer_size;
390 	u32 reciprocal_buffer_size;
391 /* 3) touched by every alloc & free from the backend */
392 	struct kmem_list3 *nodelists[MAX_NUMNODES];
393 
394 	unsigned int flags;		/* constant flags */
395 	unsigned int num;		/* # of objs per slab */
396 
397 /* 4) cache_grow/shrink */
398 	/* order of pgs per slab (2^n) */
399 	unsigned int gfporder;
400 
401 	/* force GFP flags, e.g. GFP_DMA */
402 	gfp_t gfpflags;
403 
404 	size_t colour;			/* cache colouring range */
405 	unsigned int colour_off;	/* colour offset */
406 	struct kmem_cache *slabp_cache;
407 	unsigned int slab_size;
408 	unsigned int dflags;		/* dynamic flags */
409 
410 	/* constructor func */
411 	void (*ctor) (void *, struct kmem_cache *, unsigned long);
412 
413 	/* de-constructor func */
414 	void (*dtor) (void *, struct kmem_cache *, unsigned long);
415 
416 /* 5) cache creation/removal */
417 	const char *name;
418 	struct list_head next;
419 
420 /* 6) statistics */
421 #if STATS
422 	unsigned long num_active;
423 	unsigned long num_allocations;
424 	unsigned long high_mark;
425 	unsigned long grown;
426 	unsigned long reaped;
427 	unsigned long errors;
428 	unsigned long max_freeable;
429 	unsigned long node_allocs;
430 	unsigned long node_frees;
431 	unsigned long node_overflow;
432 	atomic_t allochit;
433 	atomic_t allocmiss;
434 	atomic_t freehit;
435 	atomic_t freemiss;
436 #endif
437 #if DEBUG
438 	/*
439 	 * If debugging is enabled, then the allocator can add additional
440 	 * fields and/or padding to every object. buffer_size contains the total
441 	 * object size including these internal fields, the following two
442 	 * variables contain the offset to the user object and its size.
443 	 */
444 	int obj_offset;
445 	int obj_size;
446 #endif
447 };
448 
449 #define CFLGS_OFF_SLAB		(0x80000000UL)
450 #define	OFF_SLAB(x)	((x)->flags & CFLGS_OFF_SLAB)
451 
452 #define BATCHREFILL_LIMIT	16
453 /*
454  * Optimization question: fewer reaps mean a lower probability of unnecessary
455  * cpucache drain/refill cycles.
456  *
457  * OTOH the cpuarrays can contain lots of objects,
458  * which could lock up otherwise freeable slabs.
459  */
460 #define REAPTIMEOUT_CPUC	(2*HZ)
461 #define REAPTIMEOUT_LIST3	(4*HZ)
462 
463 #if STATS
464 #define	STATS_INC_ACTIVE(x)	((x)->num_active++)
465 #define	STATS_DEC_ACTIVE(x)	((x)->num_active--)
466 #define	STATS_INC_ALLOCED(x)	((x)->num_allocations++)
467 #define	STATS_INC_GROWN(x)	((x)->grown++)
468 #define	STATS_ADD_REAPED(x,y)	((x)->reaped += (y))
469 #define	STATS_SET_HIGH(x)						\
470 	do {								\
471 		if ((x)->num_active > (x)->high_mark)			\
472 			(x)->high_mark = (x)->num_active;		\
473 	} while (0)
474 #define	STATS_INC_ERR(x)	((x)->errors++)
475 #define	STATS_INC_NODEALLOCS(x)	((x)->node_allocs++)
476 #define	STATS_INC_NODEFREES(x)	((x)->node_frees++)
477 #define STATS_INC_ACOVERFLOW(x)   ((x)->node_overflow++)
478 #define	STATS_SET_FREEABLE(x, i)					\
479 	do {								\
480 		if ((x)->max_freeable < i)				\
481 			(x)->max_freeable = i;				\
482 	} while (0)
483 #define STATS_INC_ALLOCHIT(x)	atomic_inc(&(x)->allochit)
484 #define STATS_INC_ALLOCMISS(x)	atomic_inc(&(x)->allocmiss)
485 #define STATS_INC_FREEHIT(x)	atomic_inc(&(x)->freehit)
486 #define STATS_INC_FREEMISS(x)	atomic_inc(&(x)->freemiss)
487 #else
488 #define	STATS_INC_ACTIVE(x)	do { } while (0)
489 #define	STATS_DEC_ACTIVE(x)	do { } while (0)
490 #define	STATS_INC_ALLOCED(x)	do { } while (0)
491 #define	STATS_INC_GROWN(x)	do { } while (0)
492 #define	STATS_ADD_REAPED(x,y)	do { } while (0)
493 #define	STATS_SET_HIGH(x)	do { } while (0)
494 #define	STATS_INC_ERR(x)	do { } while (0)
495 #define	STATS_INC_NODEALLOCS(x)	do { } while (0)
496 #define	STATS_INC_NODEFREES(x)	do { } while (0)
497 #define STATS_INC_ACOVERFLOW(x)   do { } while (0)
498 #define	STATS_SET_FREEABLE(x, i) do { } while (0)
499 #define STATS_INC_ALLOCHIT(x)	do { } while (0)
500 #define STATS_INC_ALLOCMISS(x)	do { } while (0)
501 #define STATS_INC_FREEHIT(x)	do { } while (0)
502 #define STATS_INC_FREEMISS(x)	do { } while (0)
503 #endif
504 
505 #if DEBUG
506 
507 /*
508  * memory layout of objects:
509  * 0		: objp
510  * 0 .. cachep->obj_offset - BYTES_PER_WORD - 1: padding. This ensures that
511  * 		the end of an object is aligned with the end of the real
512  * 		allocation. Catches writes behind the end of the allocation.
513  * cachep->obj_offset - BYTES_PER_WORD .. cachep->obj_offset - 1:
514  * 		redzone word.
515  * cachep->obj_offset: The real object.
516  * cachep->buffer_size - 2* BYTES_PER_WORD: redzone word [BYTES_PER_WORD long]
517  * cachep->buffer_size - 1* BYTES_PER_WORD: last caller address
518  *					[BYTES_PER_WORD long]
519  */
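/*
 * Worked example (figures are an assumption for illustration: 64-bit
 * arch, BYTES_PER_WORD == 8, a 16-byte object with SLAB_RED_ZONE and
 * SLAB_STORE_USER, no extra alignment padding):
 *
 *	bytes  0.. 7: redzone word 1
 *	bytes  8..23: the object itself (obj_offset == 8)
 *	bytes 24..31: redzone word 2
 *	bytes 32..39: last caller address (buffer_size == 40)
 */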
520 static int obj_offset(struct kmem_cache *cachep)
521 {
522 	return cachep->obj_offset;
523 }
524 
525 static int obj_size(struct kmem_cache *cachep)
526 {
527 	return cachep->obj_size;
528 }
529 
530 static unsigned long *dbg_redzone1(struct kmem_cache *cachep, void *objp)
531 {
532 	BUG_ON(!(cachep->flags & SLAB_RED_ZONE));
533 	return (unsigned long*) (objp+obj_offset(cachep)-BYTES_PER_WORD);
534 }
535 
536 static unsigned long *dbg_redzone2(struct kmem_cache *cachep, void *objp)
537 {
538 	BUG_ON(!(cachep->flags & SLAB_RED_ZONE));
539 	if (cachep->flags & SLAB_STORE_USER)
540 		return (unsigned long *)(objp + cachep->buffer_size -
541 					 2 * BYTES_PER_WORD);
542 	return (unsigned long *)(objp + cachep->buffer_size - BYTES_PER_WORD);
543 }
544 
545 static void **dbg_userword(struct kmem_cache *cachep, void *objp)
546 {
547 	BUG_ON(!(cachep->flags & SLAB_STORE_USER));
548 	return (void **)(objp + cachep->buffer_size - BYTES_PER_WORD);
549 }
550 
551 #else
552 
553 #define obj_offset(x)			0
554 #define obj_size(cachep)		(cachep->buffer_size)
555 #define dbg_redzone1(cachep, objp)	({BUG(); (unsigned long *)NULL;})
556 #define dbg_redzone2(cachep, objp)	({BUG(); (unsigned long *)NULL;})
557 #define dbg_userword(cachep, objp)	({BUG(); (void **)NULL;})
558 
559 #endif
560 
561 /*
562  * Maximum size of an obj (in 2^order pages) and absolute limit for the gfp
563  * order.
564  */
565 #if defined(CONFIG_LARGE_ALLOCS)
566 #define	MAX_OBJ_ORDER	13	/* up to 32Mb */
567 #define	MAX_GFP_ORDER	13	/* up to 32Mb */
568 #elif defined(CONFIG_MMU)
569 #define	MAX_OBJ_ORDER	5	/* 32 pages */
570 #define	MAX_GFP_ORDER	5	/* 32 pages */
571 #else
572 #define	MAX_OBJ_ORDER	8	/* up to 1Mb */
573 #define	MAX_GFP_ORDER	8	/* up to 1Mb */
574 #endif
575 
576 /*
577  * Do not go above this order unless 0 objects fit into the slab.
578  */
579 #define	BREAK_GFP_ORDER_HI	1
580 #define	BREAK_GFP_ORDER_LO	0
581 static int slab_break_gfp_order = BREAK_GFP_ORDER_LO;
582 
583 /*
584  * Functions for storing/retrieving the cachep and/or slab from the page
585  * allocator.  These are used to find the slab an obj belongs to.  With kfree(),
586  * these are used to find the cache which an obj belongs to.
587  */
588 static inline void page_set_cache(struct page *page, struct kmem_cache *cache)
589 {
590 	page->lru.next = (struct list_head *)cache;
591 }
592 
593 static inline struct kmem_cache *page_get_cache(struct page *page)
594 {
595 	if (unlikely(PageCompound(page)))
596 		page = (struct page *)page_private(page);
597 	BUG_ON(!PageSlab(page));
598 	return (struct kmem_cache *)page->lru.next;
599 }
600 
601 static inline void page_set_slab(struct page *page, struct slab *slab)
602 {
603 	page->lru.prev = (struct list_head *)slab;
604 }
605 
606 static inline struct slab *page_get_slab(struct page *page)
607 {
608 	if (unlikely(PageCompound(page)))
609 		page = (struct page *)page_private(page);
610 	BUG_ON(!PageSlab(page));
611 	return (struct slab *)page->lru.prev;
612 }
613 
614 static inline struct kmem_cache *virt_to_cache(const void *obj)
615 {
616 	struct page *page = virt_to_page(obj);
617 	return page_get_cache(page);
618 }
619 
620 static inline struct slab *virt_to_slab(const void *obj)
621 {
622 	struct page *page = virt_to_page(obj);
623 	return page_get_slab(page);
624 }
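
/*
 * Sketch of how kfree() recovers the owning cache and slab from a bare
 * pointer, using the page fields set by page_set_cache()/page_set_slab()
 * above:
 *
 *	struct page *page = virt_to_page(objp);
 *	struct kmem_cache *c = page_get_cache(page);	(page->lru.next)
 *	struct slab *s = page_get_slab(page);		(page->lru.prev)
 */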
625 
626 static inline void *index_to_obj(struct kmem_cache *cache, struct slab *slab,
627 				 unsigned int idx)
628 {
629 	return slab->s_mem + cache->buffer_size * idx;
630 }
631 
632 /*
633  * We want to avoid an expensive divide : (offset / cache->buffer_size)
634  *   Using the fact that buffer_size is a constant for a particular cache,
635  *   we can replace (offset / cache->buffer_size) by
636  *   reciprocal_divide(offset, cache->reciprocal_buffer_size)
637  */
638 static inline unsigned int obj_to_index(const struct kmem_cache *cache,
639 					const struct slab *slab, void *obj)
640 {
641 	u32 offset = (obj - slab->s_mem);
642 	return reciprocal_divide(offset, cache->reciprocal_buffer_size);
643 }
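
/*
 * Example: for a cache with buffer_size == 256 and an object at
 * s_mem + 768, obj_to_index() computes
 * reciprocal_divide(768, reciprocal_value(256)) == 3, the same result
 * as 768 / 256 but without a hardware divide on the hot path.
 */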
644 
645 /*
646  * These are the default caches for kmalloc. Custom caches can have other sizes.
647  */
648 struct cache_sizes malloc_sizes[] = {
649 #define CACHE(x) { .cs_size = (x) },
650 #include <linux/kmalloc_sizes.h>
651 	CACHE(ULONG_MAX)
652 #undef CACHE
653 };
654 EXPORT_SYMBOL(malloc_sizes);
655 
656 /* Must match cache_sizes above. Out of line to keep cache footprint low. */
657 struct cache_names {
658 	char *name;
659 	char *name_dma;
660 };
661 
662 static struct cache_names __initdata cache_names[] = {
663 #define CACHE(x) { .name = "size-" #x, .name_dma = "size-" #x "(DMA)" },
664 #include <linux/kmalloc_sizes.h>
665 	{NULL,}
666 #undef CACHE
667 };
668 
669 static struct arraycache_init initarray_cache __initdata =
670     { {0, BOOT_CPUCACHE_ENTRIES, 1, 0} };
671 static struct arraycache_init initarray_generic =
672     { {0, BOOT_CPUCACHE_ENTRIES, 1, 0} };
673 
674 /* internal cache of cache description objs */
675 static struct kmem_cache cache_cache = {
676 	.batchcount = 1,
677 	.limit = BOOT_CPUCACHE_ENTRIES,
678 	.shared = 1,
679 	.buffer_size = sizeof(struct kmem_cache),
680 	.name = "kmem_cache",
681 #if DEBUG
682 	.obj_size = sizeof(struct kmem_cache),
683 #endif
684 };
685 
686 #define BAD_ALIEN_MAGIC 0x01020304ul
687 
688 #ifdef CONFIG_LOCKDEP
689 
690 /*
691  * Slab sometimes uses the kmalloc slabs to store the slab headers
692  * for other slabs "off slab".
693  * The locking for this is tricky in that it nests within the locks
694  * of all other slabs in a few places; to deal with this special
695  * locking we put on-slab caches into a separate lock-class.
696  *
697  * We set lock class for alien array caches which are up during init.
698  * The lock annotation will be lost if all cpus of a node go down and
699  * then come back up during hotplug.
700  */
701 static struct lock_class_key on_slab_l3_key;
702 static struct lock_class_key on_slab_alc_key;
703 
704 static inline void init_lock_keys(void)
705 
706 {
707 	int q;
708 	struct cache_sizes *s = malloc_sizes;
709 
710 	while (s->cs_size != ULONG_MAX) {
711 		for_each_node(q) {
712 			struct array_cache **alc;
713 			int r;
714 			struct kmem_list3 *l3 = s->cs_cachep->nodelists[q];
715 			if (!l3 || OFF_SLAB(s->cs_cachep))
716 				continue;
717 			lockdep_set_class(&l3->list_lock, &on_slab_l3_key);
718 			alc = l3->alien;
719 			/*
720 			 * FIXME: This check for BAD_ALIEN_MAGIC
721 			 * should go away when common slab code is taught to
722 			 * work even without alien caches.
723 			 * Currently, non NUMA code returns BAD_ALIEN_MAGIC
724 			 * for alloc_alien_cache,
725 			 */
726 			if (!alc || (unsigned long)alc == BAD_ALIEN_MAGIC)
727 				continue;
728 			for_each_node(r) {
729 				if (alc[r])
730 					lockdep_set_class(&alc[r]->lock,
731 					     &on_slab_alc_key);
732 			}
733 		}
734 		s++;
735 	}
736 }
737 #else
738 static inline void init_lock_keys(void)
739 {
740 }
741 #endif
742 
743 /*
744  * 1. Guard access to the cache-chain.
745  * 2. Protect sanity of cpu_online_map against cpu hotplug events
746  */
747 static DEFINE_MUTEX(cache_chain_mutex);
748 static struct list_head cache_chain;
749 
750 /*
751  * chicken and egg problem: delay the per-cpu array allocation
752  * until the general caches are up.
753  */
754 static enum {
755 	NONE,
756 	PARTIAL_AC,
757 	PARTIAL_L3,
758 	FULL
759 } g_cpucache_up;
760 
761 /*
762  * used by boot code to determine if it can use slab based allocator
763  */
764 int slab_is_available(void)
765 {
766 	return g_cpucache_up == FULL;
767 }
768 
769 static DEFINE_PER_CPU(struct delayed_work, reap_work);
770 
771 static inline struct array_cache *cpu_cache_get(struct kmem_cache *cachep)
772 {
773 	return cachep->array[smp_processor_id()];
774 }
775 
776 static inline struct kmem_cache *__find_general_cachep(size_t size,
777 							gfp_t gfpflags)
778 {
779 	struct cache_sizes *csizep = malloc_sizes;
780 
781 #if DEBUG
782 	/* This happens if someone tries to call
783 	 * kmem_cache_create(), or __kmalloc(), before
784 	 * the generic caches are initialized.
785 	 */
786 	BUG_ON(malloc_sizes[INDEX_AC].cs_cachep == NULL);
787 #endif
788 	while (size > csizep->cs_size)
789 		csizep++;
790 
791 	/*
792 	 * Really subtle: The last entry with cs->cs_size==ULONG_MAX
793 	 * has cs_{dma,}cachep==NULL. Thus no special case
794 	 * for large kmalloc calls required.
795 	 */
796 	if (unlikely(gfpflags & GFP_DMA))
797 		return csizep->cs_dmacachep;
798 	return csizep->cs_cachep;
799 }
800 
801 static struct kmem_cache *kmem_find_general_cachep(size_t size, gfp_t gfpflags)
802 {
803 	return __find_general_cachep(size, gfpflags);
804 }
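
/*
 * Example (the size ladder is an assumption from a typical
 * kmalloc_sizes.h table): __find_general_cachep(100, GFP_KERNEL) walks
 * past the 32, 64 and 96 byte entries and returns the "size-128" cache;
 * with GFP_DMA set it returns "size-128(DMA)" instead.
 */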
805 
806 static size_t slab_mgmt_size(size_t nr_objs, size_t align)
807 {
808 	return ALIGN(sizeof(struct slab)+nr_objs*sizeof(kmem_bufctl_t), align);
809 }
810 
811 /*
812  * Calculate the number of objects and left-over bytes for a given buffer size.
813  */
814 static void cache_estimate(unsigned long gfporder, size_t buffer_size,
815 			   size_t align, int flags, size_t *left_over,
816 			   unsigned int *num)
817 {
818 	int nr_objs;
819 	size_t mgmt_size;
820 	size_t slab_size = PAGE_SIZE << gfporder;
821 
822 	/*
823 	 * The slab management structure can be either off the slab or
824 	 * on it. For the latter case, the memory allocated for a
825 	 * slab is used for:
826 	 *
827 	 * - The struct slab
828 	 * - One kmem_bufctl_t for each object
829 	 * - Padding to respect alignment of @align
830 	 * - @buffer_size bytes for each object
831 	 *
832 	 * If the slab management structure is off the slab, then the
833 	 * alignment will already be calculated into the size. Because
834 	 * the slabs are all pages aligned, the objects will be at the
835 	 * correct alignment when allocated.
836 	 */
837 	if (flags & CFLGS_OFF_SLAB) {
838 		mgmt_size = 0;
839 		nr_objs = slab_size / buffer_size;
840 
841 		if (nr_objs > SLAB_LIMIT)
842 			nr_objs = SLAB_LIMIT;
843 	} else {
844 		/*
845 		 * Ignore padding for the initial guess. The padding
846 		 * is at most @align-1 bytes, and @buffer_size is at
847 		 * least @align. In the worst case, this result will
848 		 * be one greater than the number of objects that fit
849 		 * into the memory allocation when taking the padding
850 		 * into account.
851 		 */
852 		nr_objs = (slab_size - sizeof(struct slab)) /
853 			  (buffer_size + sizeof(kmem_bufctl_t));
854 
855 		/*
856 		 * This calculated number will be either the right
857 		 * amount, or one greater than what we want.
858 		 */
859 		if (slab_mgmt_size(nr_objs, align) + nr_objs*buffer_size
860 		       > slab_size)
861 			nr_objs--;
862 
863 		if (nr_objs > SLAB_LIMIT)
864 			nr_objs = SLAB_LIMIT;
865 
866 		mgmt_size = slab_mgmt_size(nr_objs, align);
867 	}
868 	*num = nr_objs;
869 	*left_over = slab_size - nr_objs*buffer_size - mgmt_size;
870 }
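
/*
 * Worked example for the on-slab case (figures are an assumption for
 * illustration: gfporder == 0 so slab_size == 4096, buffer_size == 128,
 * align == 8, sizeof(struct slab) padded to 48, sizeof(kmem_bufctl_t)
 * == 4):
 *
 *	initial guess:	(4096 - 48) / (128 + 4)		= 30 objects
 *	mgmt_size:	ALIGN(48 + 30 * 4, 8)		= 168 bytes
 *	check:		168 + 30 * 128 = 4008 <= 4096	guess stands
 *	left_over:	4096 - 30 * 128 - 168		= 88 bytes
 */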
871 
872 #define slab_error(cachep, msg) __slab_error(__FUNCTION__, cachep, msg)
873 
874 static void __slab_error(const char *function, struct kmem_cache *cachep,
875 			char *msg)
876 {
877 	printk(KERN_ERR "slab error in %s(): cache `%s': %s\n",
878 	       function, cachep->name, msg);
879 	dump_stack();
880 }
881 
882 /*
883  * By default on NUMA we use alien caches to stage the freeing of
884  * objects allocated from other nodes. This causes massive memory
885  * inefficiencies when using a fake NUMA setup to split memory into a
886  * large number of small nodes, so it can be disabled on the command
887  * line.
888  */
889 
890 static int use_alien_caches __read_mostly = 1;
891 static int __init noaliencache_setup(char *s)
892 {
893 	use_alien_caches = 0;
894 	return 1;
895 }
896 __setup("noaliencache", noaliencache_setup);
897 
898 #ifdef CONFIG_NUMA
899 /*
900  * Special reaping functions for NUMA systems called from cache_reap().
901  * These take care of doing round robin flushing of alien caches (containing
902  * objects freed on different nodes from which they were allocated) and the
903  * flushing of remote pcps by calling drain_node_pages.
904  */
905 static DEFINE_PER_CPU(unsigned long, reap_node);
906 
907 static void init_reap_node(int cpu)
908 {
909 	int node;
910 
911 	node = next_node(cpu_to_node(cpu), node_online_map);
912 	if (node == MAX_NUMNODES)
913 		node = first_node(node_online_map);
914 
915 	per_cpu(reap_node, cpu) = node;
916 }
917 
918 static void next_reap_node(void)
919 {
920 	int node = __get_cpu_var(reap_node);
921 
922 	/*
923 	 * Also drain per cpu pages on remote zones
924 	 */
925 	if (node != numa_node_id())
926 		drain_node_pages(node);
927 
928 	node = next_node(node, node_online_map);
929 	if (unlikely(node >= MAX_NUMNODES))
930 		node = first_node(node_online_map);
931 	__get_cpu_var(reap_node) = node;
932 }
933 
934 #else
935 #define init_reap_node(cpu) do { } while (0)
936 #define next_reap_node(void) do { } while (0)
937 #endif
938 
939 /*
940  * Initiate the reap timer running on the target CPU.  We run at around 1 to 2Hz
941  * via the workqueue/eventd.
942  * Add the CPU number into the expiration time to minimize the possibility of
943  * the CPUs getting into lockstep and contending for the global cache chain
944  * lock.
945  */
946 static void __devinit start_cpu_timer(int cpu)
947 {
948 	struct delayed_work *reap_work = &per_cpu(reap_work, cpu);
949 
950 	/*
951 	 * When this gets called from do_initcalls via cpucache_init(),
952 	 * init_workqueues() has already run, so keventd will be set up
953 	 * at that time.
954 	 */
955 	if (keventd_up() && reap_work->work.func == NULL) {
956 		init_reap_node(cpu);
957 		INIT_DELAYED_WORK(reap_work, cache_reap);
958 		schedule_delayed_work_on(cpu, reap_work,
959 					__round_jiffies_relative(HZ, cpu));
960 	}
961 }
962 
963 static struct array_cache *alloc_arraycache(int node, int entries,
964 					    int batchcount)
965 {
966 	int memsize = sizeof(void *) * entries + sizeof(struct array_cache);
967 	struct array_cache *nc = NULL;
968 
969 	nc = kmalloc_node(memsize, GFP_KERNEL, node);
970 	if (nc) {
971 		nc->avail = 0;
972 		nc->limit = entries;
973 		nc->batchcount = batchcount;
974 		nc->touched = 0;
975 		spin_lock_init(&nc->lock);
976 	}
977 	return nc;
978 }
979 
980 /*
981  * Transfer objects in one arraycache to another.
982  * Locking must be handled by the caller.
983  *
984  * Return the number of entries transferred.
985  */
986 static int transfer_objects(struct array_cache *to,
987 		struct array_cache *from, unsigned int max)
988 {
989 	/* Figure out how many entries to transfer */
990 	int nr = min(min(from->avail, max), to->limit - to->avail);
991 
992 	if (!nr)
993 		return 0;
994 
995 	memcpy(to->entry + to->avail, from->entry + from->avail - nr,
996 			sizeof(void *) * nr);
997 
998 	from->avail -= nr;
999 	to->avail += nr;
1000 	to->touched = 1;
1001 	return nr;
1002 }
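
/*
 * Example: with from->avail == 10, max == 8 and room for only 5 entries
 * in "to", nr == 5 and the entries copied are the top of the "from"
 * stack (from->entry[5..9]), so the most recently freed (cache-warm)
 * objects move across.
 */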
1003 
1004 #ifndef CONFIG_NUMA
1005 
1006 #define drain_alien_cache(cachep, alien) do { } while (0)
1007 #define reap_alien(cachep, l3) do { } while (0)
1008 
1009 static inline struct array_cache **alloc_alien_cache(int node, int limit)
1010 {
1011 	return (struct array_cache **)BAD_ALIEN_MAGIC;
1012 }
1013 
1014 static inline void free_alien_cache(struct array_cache **ac_ptr)
1015 {
1016 }
1017 
1018 static inline int cache_free_alien(struct kmem_cache *cachep, void *objp)
1019 {
1020 	return 0;
1021 }
1022 
1023 static inline void *alternate_node_alloc(struct kmem_cache *cachep,
1024 		gfp_t flags)
1025 {
1026 	return NULL;
1027 }
1028 
1029 static inline void *____cache_alloc_node(struct kmem_cache *cachep,
1030 		 gfp_t flags, int nodeid)
1031 {
1032 	return NULL;
1033 }
1034 
1035 #else	/* CONFIG_NUMA */
1036 
1037 static void *____cache_alloc_node(struct kmem_cache *, gfp_t, int);
1038 static void *alternate_node_alloc(struct kmem_cache *, gfp_t);
1039 
1040 static struct array_cache **alloc_alien_cache(int node, int limit)
1041 {
1042 	struct array_cache **ac_ptr;
1043 	int memsize = sizeof(void *) * MAX_NUMNODES;
1044 	int i;
1045 
1046 	if (limit > 1)
1047 		limit = 12;
1048 	ac_ptr = kmalloc_node(memsize, GFP_KERNEL, node);
1049 	if (ac_ptr) {
1050 		for_each_node(i) {
1051 			if (i == node || !node_online(i)) {
1052 				ac_ptr[i] = NULL;
1053 				continue;
1054 			}
1055 			ac_ptr[i] = alloc_arraycache(node, limit, 0xbaadf00d);
1056 			if (!ac_ptr[i]) {
1057 				for (i--; i >= 0; i--)	/* unwind the entries allocated so far */
1058 					kfree(ac_ptr[i]);
1059 				kfree(ac_ptr);
1060 				return NULL;
1061 			}
1062 		}
1063 	}
1064 	return ac_ptr;
1065 }
1066 
1067 static void free_alien_cache(struct array_cache **ac_ptr)
1068 {
1069 	int i;
1070 
1071 	if (!ac_ptr)
1072 		return;
1073 	for_each_node(i)
1074 	    kfree(ac_ptr[i]);
1075 	kfree(ac_ptr);
1076 }
1077 
1078 static void __drain_alien_cache(struct kmem_cache *cachep,
1079 				struct array_cache *ac, int node)
1080 {
1081 	struct kmem_list3 *rl3 = cachep->nodelists[node];
1082 
1083 	if (ac->avail) {
1084 		spin_lock(&rl3->list_lock);
1085 		/*
1086 		 * Stuff objects into the remote node's shared array first.
1087 		 * That way we could avoid the overhead of putting the objects
1088 		 * into the free lists and getting them back later.
1089 		 */
1090 		if (rl3->shared)
1091 			transfer_objects(rl3->shared, ac, ac->limit);
1092 
1093 		free_block(cachep, ac->entry, ac->avail, node);
1094 		ac->avail = 0;
1095 		spin_unlock(&rl3->list_lock);
1096 	}
1097 }
1098 
1099 /*
1100  * Called from cache_reap() to regularly drain alien caches round robin.
1101  */
1102 static void reap_alien(struct kmem_cache *cachep, struct kmem_list3 *l3)
1103 {
1104 	int node = __get_cpu_var(reap_node);
1105 
1106 	if (l3->alien) {
1107 		struct array_cache *ac = l3->alien[node];
1108 
1109 		if (ac && ac->avail && spin_trylock_irq(&ac->lock)) {
1110 			__drain_alien_cache(cachep, ac, node);
1111 			spin_unlock_irq(&ac->lock);
1112 		}
1113 	}
1114 }
1115 
1116 static void drain_alien_cache(struct kmem_cache *cachep,
1117 				struct array_cache **alien)
1118 {
1119 	int i = 0;
1120 	struct array_cache *ac;
1121 	unsigned long flags;
1122 
1123 	for_each_online_node(i) {
1124 		ac = alien[i];
1125 		if (ac) {
1126 			spin_lock_irqsave(&ac->lock, flags);
1127 			__drain_alien_cache(cachep, ac, i);
1128 			spin_unlock_irqrestore(&ac->lock, flags);
1129 		}
1130 	}
1131 }
1132 
1133 static inline int cache_free_alien(struct kmem_cache *cachep, void *objp)
1134 {
1135 	struct slab *slabp = virt_to_slab(objp);
1136 	int nodeid = slabp->nodeid;
1137 	struct kmem_list3 *l3;
1138 	struct array_cache *alien = NULL;
1139 	int node;
1140 
1141 	node = numa_node_id();
1142 
1143 	/*
1144 	 * Make sure we are not freeing an object from another node to the array
1145 	 * cache on this cpu.
1146 	 */
1147 	if (likely(slabp->nodeid == node) || unlikely(!use_alien_caches))
1148 		return 0;
1149 
1150 	l3 = cachep->nodelists[node];
1151 	STATS_INC_NODEFREES(cachep);
1152 	if (l3->alien && l3->alien[nodeid]) {
1153 		alien = l3->alien[nodeid];
1154 		spin_lock(&alien->lock);
1155 		if (unlikely(alien->avail == alien->limit)) {
1156 			STATS_INC_ACOVERFLOW(cachep);
1157 			__drain_alien_cache(cachep, alien, nodeid);
1158 		}
1159 		alien->entry[alien->avail++] = objp;
1160 		spin_unlock(&alien->lock);
1161 	} else {
1162 		spin_lock(&(cachep->nodelists[nodeid])->list_lock);
1163 		free_block(cachep, &objp, 1, nodeid);
1164 		spin_unlock(&(cachep->nodelists[nodeid])->list_lock);
1165 	}
1166 	return 1;
1167 }
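
/*
 * Example of the decision above (node numbers are an assumption): a CPU
 * on node 1 frees an object whose slab lives on node 0. The object is
 * parked in the local node's l3->alien[0] array and only pushed back to
 * node 0's lists once that array fills up or reap_alien() drains it.
 */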
1168 #endif
1169 
1170 static int __cpuinit cpuup_callback(struct notifier_block *nfb,
1171 				    unsigned long action, void *hcpu)
1172 {
1173 	long cpu = (long)hcpu;
1174 	struct kmem_cache *cachep;
1175 	struct kmem_list3 *l3 = NULL;
1176 	int node = cpu_to_node(cpu);
1177 	int memsize = sizeof(struct kmem_list3);
1178 
1179 	switch (action) {
1180 	case CPU_UP_PREPARE:
1181 		mutex_lock(&cache_chain_mutex);
1182 		/*
1183 		 * We need to do this right in the beginning since
1184 		 * the alloc_arraycache() calls are going to use this list.
1185 		 * kmalloc_node allows us to add the slab to the right
1186 		 * kmem_list3 and not this cpu's kmem_list3.
1187 		 */
1188 
1189 		list_for_each_entry(cachep, &cache_chain, next) {
1190 			/*
1191 			 * Set up the size64 kmemlist for cpu before we can
1192 			 * begin anything. Make sure some other cpu on this
1193 			 * node has not already allocated this
1194 			 */
1195 			if (!cachep->nodelists[node]) {
1196 				l3 = kmalloc_node(memsize, GFP_KERNEL, node);
1197 				if (!l3)
1198 					goto bad;
1199 				kmem_list3_init(l3);
1200 				l3->next_reap = jiffies + REAPTIMEOUT_LIST3 +
1201 				    ((unsigned long)cachep) % REAPTIMEOUT_LIST3;
1202 
1203 				/*
1204 				 * The l3s don't come and go as CPUs come and
1205 				 * go.  cache_chain_mutex is sufficient
1206 				 * protection here.
1207 				 */
1208 				cachep->nodelists[node] = l3;
1209 			}
1210 
1211 			spin_lock_irq(&cachep->nodelists[node]->list_lock);
1212 			cachep->nodelists[node]->free_limit =
1213 				(1 + nr_cpus_node(node)) *
1214 				cachep->batchcount + cachep->num;
1215 			spin_unlock_irq(&cachep->nodelists[node]->list_lock);
1216 		}
1217 
1218 		/*
1219 		 * Now we can go ahead with allocating the shared arrays and
1220 		 * array caches
1221 		 */
1222 		list_for_each_entry(cachep, &cache_chain, next) {
1223 			struct array_cache *nc;
1224 			struct array_cache *shared;
1225 			struct array_cache **alien = NULL;
1226 
1227 			nc = alloc_arraycache(node, cachep->limit,
1228 						cachep->batchcount);
1229 			if (!nc)
1230 				goto bad;
1231 			shared = alloc_arraycache(node,
1232 					cachep->shared * cachep->batchcount,
1233 					0xbaadf00d);
1234 			if (!shared)
1235 				goto bad;
1236 
1237 			if (use_alien_caches) {
1238 				alien = alloc_alien_cache(node, cachep->limit);
1239 				if (!alien)
1240 					goto bad;
1241 			}
1242 			cachep->array[cpu] = nc;
1243 			l3 = cachep->nodelists[node];
1244 			BUG_ON(!l3);
1245 
1246 			spin_lock_irq(&l3->list_lock);
1247 			if (!l3->shared) {
1248 				/*
1249 				 * We are serialised from CPU_DEAD or
1250 				 * CPU_UP_CANCELLED by the cpucontrol lock
1251 				 */
1252 				l3->shared = shared;
1253 				shared = NULL;
1254 			}
1255 #ifdef CONFIG_NUMA
1256 			if (!l3->alien) {
1257 				l3->alien = alien;
1258 				alien = NULL;
1259 			}
1260 #endif
1261 			spin_unlock_irq(&l3->list_lock);
1262 			kfree(shared);
1263 			free_alien_cache(alien);
1264 		}
1265 		break;
1266 	case CPU_ONLINE:
1267 		mutex_unlock(&cache_chain_mutex);
1268 		start_cpu_timer(cpu);
1269 		break;
1270 #ifdef CONFIG_HOTPLUG_CPU
1271 	case CPU_DOWN_PREPARE:
1272 		mutex_lock(&cache_chain_mutex);
1273 		break;
1274 	case CPU_DOWN_FAILED:
1275 		mutex_unlock(&cache_chain_mutex);
1276 		break;
1277 	case CPU_DEAD:
1278 		/*
1279 		 * Even if all the cpus of a node are down, we don't free the
1280 		 * kmem_list3 of any cache. This is to avoid a race between
1281 		 * cpu_down, and a kmalloc allocation from another cpu for
1282 		 * memory from the node of the cpu going down.  The list3
1283 		 * structure is usually allocated from kmem_cache_create() and
1284 		 * gets destroyed at kmem_cache_destroy().
1285 		 */
1286 		/* fall thru */
1287 #endif
1288 	case CPU_UP_CANCELED:
1289 		list_for_each_entry(cachep, &cache_chain, next) {
1290 			struct array_cache *nc;
1291 			struct array_cache *shared;
1292 			struct array_cache **alien;
1293 			cpumask_t mask;
1294 
1295 			mask = node_to_cpumask(node);
1296 			/* cpu is dead; no one can alloc from it. */
1297 			nc = cachep->array[cpu];
1298 			cachep->array[cpu] = NULL;
1299 			l3 = cachep->nodelists[node];
1300 
1301 			if (!l3)
1302 				goto free_array_cache;
1303 
1304 			spin_lock_irq(&l3->list_lock);
1305 
1306 			/* Free limit for this kmem_list3 */
1307 			l3->free_limit -= cachep->batchcount;
1308 			if (nc)
1309 				free_block(cachep, nc->entry, nc->avail, node);
1310 
1311 			if (!cpus_empty(mask)) {
1312 				spin_unlock_irq(&l3->list_lock);
1313 				goto free_array_cache;
1314 			}
1315 
1316 			shared = l3->shared;
1317 			if (shared) {
1318 				free_block(cachep, l3->shared->entry,
1319 					   l3->shared->avail, node);
1320 				l3->shared = NULL;
1321 			}
1322 
1323 			alien = l3->alien;
1324 			l3->alien = NULL;
1325 
1326 			spin_unlock_irq(&l3->list_lock);
1327 
1328 			kfree(shared);
1329 			if (alien) {
1330 				drain_alien_cache(cachep, alien);
1331 				free_alien_cache(alien);
1332 			}
1333 free_array_cache:
1334 			kfree(nc);
1335 		}
1336 		/*
1337 		 * the respective cache's slabs; now we can go ahead and
1338 		 * the respective cache's slabs,  now we can go ahead and
1339 		 * shrink each nodelist to its limit.
1340 		 */
1341 		list_for_each_entry(cachep, &cache_chain, next) {
1342 			l3 = cachep->nodelists[node];
1343 			if (!l3)
1344 				continue;
1345 			drain_freelist(cachep, l3, l3->free_objects);
1346 		}
1347 		mutex_unlock(&cache_chain_mutex);
1348 		break;
1349 	}
1350 	return NOTIFY_OK;
1351 bad:
1352 	return NOTIFY_BAD;
1353 }
1354 
1355 static struct notifier_block __cpuinitdata cpucache_notifier = {
1356 	&cpuup_callback, NULL, 0
1357 };
1358 
1359 /*
1360  * swap the static kmem_list3 with kmalloced memory
1361  */
1362 static void init_list(struct kmem_cache *cachep, struct kmem_list3 *list,
1363 			int nodeid)
1364 {
1365 	struct kmem_list3 *ptr;
1366 
1367 	ptr = kmalloc_node(sizeof(struct kmem_list3), GFP_KERNEL, nodeid);
1368 	BUG_ON(!ptr);
1369 
1370 	local_irq_disable();
1371 	memcpy(ptr, list, sizeof(struct kmem_list3));
1372 	/*
1373 	 * Do not assume that spinlocks can be initialized via memcpy:
1374 	 */
1375 	spin_lock_init(&ptr->list_lock);
1376 
1377 	MAKE_ALL_LISTS(cachep, ptr, nodeid);
1378 	cachep->nodelists[nodeid] = ptr;
1379 	local_irq_enable();
1380 }
1381 
1382 /*
1383  * Initialisation.  Called after the page allocator has been initialised and
1384  * before smp_init().
1385  */
1386 void __init kmem_cache_init(void)
1387 {
1388 	size_t left_over;
1389 	struct cache_sizes *sizes;
1390 	struct cache_names *names;
1391 	int i;
1392 	int order;
1393 	int node;
1394 
1395 	for (i = 0; i < NUM_INIT_LISTS; i++) {
1396 		kmem_list3_init(&initkmem_list3[i]);
1397 		if (i < MAX_NUMNODES)
1398 			cache_cache.nodelists[i] = NULL;
1399 	}
1400 
1401 	/*
1402 	 * Fragmentation resistance on low memory - only use bigger
1403 	 * page orders on machines with more than 32MB of memory.
1404 	 */
1405 	if (num_physpages > (32 << 20) >> PAGE_SHIFT)
1406 		slab_break_gfp_order = BREAK_GFP_ORDER_HI;
1407 
1408 	/* Bootstrap is tricky, because several objects are allocated
1409 	 * from caches that do not exist yet:
1410 	 * 1) initialize the cache_cache cache: it contains the struct
1411 	 *    kmem_cache structures of all caches, except cache_cache itself:
1412 	 *    cache_cache is statically allocated.
1413 	 *    Initially an __init data area is used for the head array and the
1414 	 *    kmem_list3 structures, it's replaced with a kmalloc allocated
1415 	 *    array at the end of the bootstrap.
1416 	 * 2) Create the first kmalloc cache.
1417 	 *    The struct kmem_cache for the new cache is allocated normally.
1418 	 *    An __init data area is used for the head array.
1419 	 * 3) Create the remaining kmalloc caches, with minimally sized
1420 	 *    head arrays.
1421 	 * 4) Replace the __init data head arrays for cache_cache and the first
1422 	 *    kmalloc cache with kmalloc allocated arrays.
1423 	 * 5) Replace the __init data for kmem_list3 for cache_cache and
1424 	 *    the other cache's with kmalloc allocated memory.
1425 	 * 6) Resize the head arrays of the kmalloc caches to their final sizes.
1426 	 */
1427 
1428 	node = numa_node_id();
1429 
1430 	/* 1) create the cache_cache */
1431 	INIT_LIST_HEAD(&cache_chain);
1432 	list_add(&cache_cache.next, &cache_chain);
1433 	cache_cache.colour_off = cache_line_size();
1434 	cache_cache.array[smp_processor_id()] = &initarray_cache.cache;
1435 	cache_cache.nodelists[node] = &initkmem_list3[CACHE_CACHE];
1436 
1437 	cache_cache.buffer_size = ALIGN(cache_cache.buffer_size,
1438 					cache_line_size());
1439 	cache_cache.reciprocal_buffer_size =
1440 		reciprocal_value(cache_cache.buffer_size);
1441 
1442 	for (order = 0; order < MAX_ORDER; order++) {
1443 		cache_estimate(order, cache_cache.buffer_size,
1444 			cache_line_size(), 0, &left_over, &cache_cache.num);
1445 		if (cache_cache.num)
1446 			break;
1447 	}
1448 	BUG_ON(!cache_cache.num);
1449 	cache_cache.gfporder = order;
1450 	cache_cache.colour = left_over / cache_cache.colour_off;
1451 	cache_cache.slab_size = ALIGN(cache_cache.num * sizeof(kmem_bufctl_t) +
1452 				      sizeof(struct slab), cache_line_size());
1453 
1454 	/* 2+3) create the kmalloc caches */
1455 	sizes = malloc_sizes;
1456 	names = cache_names;
1457 
1458 	/*
1459 	 * Initialize the caches that provide memory for the array cache and the
1460 	 * kmem_list3 structures first.  Without this, further allocations will
1461 	 * bug.
1462 	 */
1463 
1464 	sizes[INDEX_AC].cs_cachep = kmem_cache_create(names[INDEX_AC].name,
1465 					sizes[INDEX_AC].cs_size,
1466 					ARCH_KMALLOC_MINALIGN,
1467 					ARCH_KMALLOC_FLAGS|SLAB_PANIC,
1468 					NULL, NULL);
1469 
1470 	if (INDEX_AC != INDEX_L3) {
1471 		sizes[INDEX_L3].cs_cachep =
1472 			kmem_cache_create(names[INDEX_L3].name,
1473 				sizes[INDEX_L3].cs_size,
1474 				ARCH_KMALLOC_MINALIGN,
1475 				ARCH_KMALLOC_FLAGS|SLAB_PANIC,
1476 				NULL, NULL);
1477 	}
1478 
1479 	slab_early_init = 0;
1480 
1481 	while (sizes->cs_size != ULONG_MAX) {
1482 		/*
1483 		 * For performance, all the general caches are L1 aligned.
1484 		 * This should be particularly beneficial on SMP boxes, as it
1485 		 * eliminates "false sharing".
1486 		 * Note for systems short on memory removing the alignment will
1487 		 * allow tighter packing of the smaller caches.
1488 		 */
1489 		if (!sizes->cs_cachep) {
1490 			sizes->cs_cachep = kmem_cache_create(names->name,
1491 					sizes->cs_size,
1492 					ARCH_KMALLOC_MINALIGN,
1493 					ARCH_KMALLOC_FLAGS|SLAB_PANIC,
1494 					NULL, NULL);
1495 		}
1496 
1497 		sizes->cs_dmacachep = kmem_cache_create(names->name_dma,
1498 					sizes->cs_size,
1499 					ARCH_KMALLOC_MINALIGN,
1500 					ARCH_KMALLOC_FLAGS|SLAB_CACHE_DMA|
1501 						SLAB_PANIC,
1502 					NULL, NULL);
1503 		sizes++;
1504 		names++;
1505 	}
1506 	/* 4) Replace the bootstrap head arrays */
1507 	{
1508 		struct array_cache *ptr;
1509 
1510 		ptr = kmalloc(sizeof(struct arraycache_init), GFP_KERNEL);
1511 
1512 		local_irq_disable();
1513 		BUG_ON(cpu_cache_get(&cache_cache) != &initarray_cache.cache);
1514 		memcpy(ptr, cpu_cache_get(&cache_cache),
1515 		       sizeof(struct arraycache_init));
1516 		/*
1517 		 * Do not assume that spinlocks can be initialized via memcpy:
1518 		 */
1519 		spin_lock_init(&ptr->lock);
1520 
1521 		cache_cache.array[smp_processor_id()] = ptr;
1522 		local_irq_enable();
1523 
1524 		ptr = kmalloc(sizeof(struct arraycache_init), GFP_KERNEL);
1525 
1526 		local_irq_disable();
1527 		BUG_ON(cpu_cache_get(malloc_sizes[INDEX_AC].cs_cachep)
1528 		       != &initarray_generic.cache);
1529 		memcpy(ptr, cpu_cache_get(malloc_sizes[INDEX_AC].cs_cachep),
1530 		       sizeof(struct arraycache_init));
1531 		/*
1532 		 * Do not assume that spinlocks can be initialized via memcpy:
1533 		 */
1534 		spin_lock_init(&ptr->lock);
1535 
1536 		malloc_sizes[INDEX_AC].cs_cachep->array[smp_processor_id()] =
1537 		    ptr;
1538 		local_irq_enable();
1539 	}
1540 	/* 5) Replace the bootstrap kmem_list3's */
1541 	{
1542 		int nid;
1543 
1544 		/* Replace the static kmem_list3 structures for the boot cpu */
1545 		init_list(&cache_cache, &initkmem_list3[CACHE_CACHE], node);
1546 
1547 		for_each_online_node(nid) {
1548 			init_list(malloc_sizes[INDEX_AC].cs_cachep,
1549 				  &initkmem_list3[SIZE_AC + nid], nid);
1550 
1551 			if (INDEX_AC != INDEX_L3) {
1552 				init_list(malloc_sizes[INDEX_L3].cs_cachep,
1553 					  &initkmem_list3[SIZE_L3 + nid], nid);
1554 			}
1555 		}
1556 	}
1557 
1558 	/* 6) resize the head arrays to their final sizes */
1559 	{
1560 		struct kmem_cache *cachep;
1561 		mutex_lock(&cache_chain_mutex);
1562 		list_for_each_entry(cachep, &cache_chain, next)
1563 			if (enable_cpucache(cachep))
1564 				BUG();
1565 		mutex_unlock(&cache_chain_mutex);
1566 	}
1567 
1568 	/* Annotate slab for lockdep -- annotate the malloc caches */
1569 	init_lock_keys();
1570 
1571 
1572 	/* Done! */
1573 	g_cpucache_up = FULL;
1574 
1575 	/*
1576 	 * Register a cpu startup notifier callback that initializes
1577 	 * cpu_cache_get for all new cpus
1578 	 */
1579 	register_cpu_notifier(&cpucache_notifier);
1580 
1581 	/*
1582 	 * The reap timers are started later, with a module init call: That part
1583 	 * of the kernel is not yet operational.
1584 	 */
1585 }
1586 
1587 static int __init cpucache_init(void)
1588 {
1589 	int cpu;
1590 
1591 	/*
1592 	 * Register the timers that return unneeded pages to the page allocator
1593 	 */
1594 	for_each_online_cpu(cpu)
1595 		start_cpu_timer(cpu);
1596 	return 0;
1597 }
1598 __initcall(cpucache_init);
1599 
1600 /*
1601  * Interface to system's page allocator. No need to hold the cache-lock.
1602  *
1603  * If we requested dmaable memory, we will get it. Even if we
1604  * did not request dmaable memory, we might get it, but that
1605  * would be relatively rare and ignorable.
1606  */
1607 static void *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, int nodeid)
1608 {
1609 	struct page *page;
1610 	int nr_pages;
1611 	int i;
1612 
1613 #ifndef CONFIG_MMU
1614 	/*
1615 	 * Nommu uses slabs for process anonymous memory allocations, and thus
1616 	 * requires __GFP_COMP to properly refcount higher-order allocations.
1617 	 */
1618 	flags |= __GFP_COMP;
1619 #endif
1620 
1621 	flags |= cachep->gfpflags;
1622 
1623 	page = alloc_pages_node(nodeid, flags, cachep->gfporder);
1624 	if (!page)
1625 		return NULL;
1626 
1627 	nr_pages = (1 << cachep->gfporder);
1628 	if (cachep->flags & SLAB_RECLAIM_ACCOUNT)
1629 		add_zone_page_state(page_zone(page),
1630 			NR_SLAB_RECLAIMABLE, nr_pages);
1631 	else
1632 		add_zone_page_state(page_zone(page),
1633 			NR_SLAB_UNRECLAIMABLE, nr_pages);
1634 	for (i = 0; i < nr_pages; i++)
1635 		__SetPageSlab(page + i);
1636 	return page_address(page);
1637 }
1638 
1639 /*
1640  * Interface to system's page release.
1641  */
1642 static void kmem_freepages(struct kmem_cache *cachep, void *addr)
1643 {
1644 	unsigned long i = (1 << cachep->gfporder);
1645 	struct page *page = virt_to_page(addr);
1646 	const unsigned long nr_freed = i;
1647 
1648 	if (cachep->flags & SLAB_RECLAIM_ACCOUNT)
1649 		sub_zone_page_state(page_zone(page),
1650 				NR_SLAB_RECLAIMABLE, nr_freed);
1651 	else
1652 		sub_zone_page_state(page_zone(page),
1653 				NR_SLAB_UNRECLAIMABLE, nr_freed);
1654 	while (i--) {
1655 		BUG_ON(!PageSlab(page));
1656 		__ClearPageSlab(page);
1657 		page++;
1658 	}
1659 	if (current->reclaim_state)
1660 		current->reclaim_state->reclaimed_slab += nr_freed;
1661 	free_pages((unsigned long)addr, cachep->gfporder);
1662 }
1663 
1664 static void kmem_rcu_free(struct rcu_head *head)
1665 {
1666 	struct slab_rcu *slab_rcu = (struct slab_rcu *)head;
1667 	struct kmem_cache *cachep = slab_rcu->cachep;
1668 
1669 	kmem_freepages(cachep, slab_rcu->addr);
1670 	if (OFF_SLAB(cachep))
1671 		kmem_cache_free(cachep->slabp_cache, slab_rcu);
1672 }
1673 
1674 #if DEBUG
1675 
1676 #ifdef CONFIG_DEBUG_PAGEALLOC
1677 static void store_stackinfo(struct kmem_cache *cachep, unsigned long *addr,
1678 			    unsigned long caller)
1679 {
1680 	int size = obj_size(cachep);
1681 
1682 	addr = (unsigned long *)&((char *)addr)[obj_offset(cachep)];
1683 
1684 	if (size < 5 * sizeof(unsigned long))
1685 		return;
1686 
1687 	*addr++ = 0x12345678;
1688 	*addr++ = caller;
1689 	*addr++ = smp_processor_id();
1690 	size -= 3 * sizeof(unsigned long);
1691 	{
1692 		unsigned long *sptr = &caller;
1693 		unsigned long svalue;
1694 
1695 		while (!kstack_end(sptr)) {
1696 			svalue = *sptr++;
1697 			if (kernel_text_address(svalue)) {
1698 				*addr++ = svalue;
1699 				size -= sizeof(unsigned long);
1700 				if (size <= sizeof(unsigned long))
1701 					break;
1702 			}
1703 		}
1704 
1705 	}
1706 	*addr++ = 0x87654321;
1707 }
1708 #endif
1709 
1710 static void poison_obj(struct kmem_cache *cachep, void *addr, unsigned char val)
1711 {
1712 	int size = obj_size(cachep);
1713 	addr = &((char *)addr)[obj_offset(cachep)];
1714 
1715 	memset(addr, val, size);
1716 	*(unsigned char *)(addr + size - 1) = POISON_END;
1717 }
1718 
1719 static void dump_line(char *data, int offset, int limit)
1720 {
1721 	int i;
1722 	unsigned char error = 0;
1723 	int bad_count = 0;
1724 
1725 	printk(KERN_ERR "%03x:", offset);
1726 	for (i = 0; i < limit; i++) {
1727 		if (data[offset + i] != POISON_FREE) {
1728 			error = data[offset + i];
1729 			bad_count++;
1730 		}
1731 		printk(" %02x", (unsigned char)data[offset + i]);
1732 	}
1733 	printk("\n");
1734 
1735 	if (bad_count == 1) {
1736 		error ^= POISON_FREE;
1737 		if (!(error & (error - 1))) {
1738 			printk(KERN_ERR "Single bit error detected. Probably "
1739 					"bad RAM.\n");
1740 #ifdef CONFIG_X86
1741 			printk(KERN_ERR "Run memtest86+ or a similar memory "
1742 					"test tool.\n");
1743 #else
1744 			printk(KERN_ERR "Run a memory test tool.\n");
1745 #endif
1746 		}
1747 	}
1748 }
1749 #endif
1750 
1751 #if DEBUG
1752 
1753 static void print_objinfo(struct kmem_cache *cachep, void *objp, int lines)
1754 {
1755 	int i, size;
1756 	char *realobj;
1757 
1758 	if (cachep->flags & SLAB_RED_ZONE) {
1759 		printk(KERN_ERR "Redzone: 0x%lx/0x%lx.\n",
1760 			*dbg_redzone1(cachep, objp),
1761 			*dbg_redzone2(cachep, objp));
1762 	}
1763 
1764 	if (cachep->flags & SLAB_STORE_USER) {
1765 		printk(KERN_ERR "Last user: [<%p>]",
1766 			*dbg_userword(cachep, objp));
1767 		print_symbol("(%s)",
1768 				(unsigned long)*dbg_userword(cachep, objp));
1769 		printk("\n");
1770 	}
1771 	realobj = (char *)objp + obj_offset(cachep);
1772 	size = obj_size(cachep);
1773 	for (i = 0; i < size && lines; i += 16, lines--) {
1774 		int limit;
1775 		limit = 16;
1776 		if (i + limit > size)
1777 			limit = size - i;
1778 		dump_line(realobj, i, limit);
1779 	}
1780 }
1781 
1782 static void check_poison_obj(struct kmem_cache *cachep, void *objp)
1783 {
1784 	char *realobj;
1785 	int size, i;
1786 	int lines = 0;
1787 
1788 	realobj = (char *)objp + obj_offset(cachep);
1789 	size = obj_size(cachep);
1790 
1791 	for (i = 0; i < size; i++) {
1792 		char exp = POISON_FREE;
1793 		if (i == size - 1)
1794 			exp = POISON_END;
1795 		if (realobj[i] != exp) {
1796 			int limit;
1797 			/* Mismatch! */
1798 			/* Print header */
1799 			if (lines == 0) {
1800 				printk(KERN_ERR
1801 					"Slab corruption: start=%p, len=%d\n",
1802 					realobj, size);
1803 				print_objinfo(cachep, objp, 0);
1804 			}
1805 			/* Hexdump the affected line */
1806 			i = (i / 16) * 16;
1807 			limit = 16;
1808 			if (i + limit > size)
1809 				limit = size - i;
1810 			dump_line(realobj, i, limit);
1811 			i += 16;
1812 			lines++;
1813 			/* Limit to 5 lines */
1814 			if (lines > 5)
1815 				break;
1816 		}
1817 	}
1818 	if (lines != 0) {
1819 		/* Print some data about the neighboring objects, if they
1820 		 * exist:
1821 		 */
1822 		struct slab *slabp = virt_to_slab(objp);
1823 		unsigned int objnr;
1824 
1825 		objnr = obj_to_index(cachep, slabp, objp);
1826 		if (objnr) {
1827 			objp = index_to_obj(cachep, slabp, objnr - 1);
1828 			realobj = (char *)objp + obj_offset(cachep);
1829 			printk(KERN_ERR "Prev obj: start=%p, len=%d\n",
1830 			       realobj, size);
1831 			print_objinfo(cachep, objp, 2);
1832 		}
1833 		if (objnr + 1 < cachep->num) {
1834 			objp = index_to_obj(cachep, slabp, objnr + 1);
1835 			realobj = (char *)objp + obj_offset(cachep);
1836 			printk(KERN_ERR "Next obj: start=%p, len=%d\n",
1837 			       realobj, size);
1838 			print_objinfo(cachep, objp, 2);
1839 		}
1840 	}
1841 }
1842 #endif
1843 
1844 #if DEBUG
1845 /**
1846  * slab_destroy_objs - run the destructor and debug checks on a slab's objects
1847  * @cachep: cache pointer being destroyed
1848  * @slabp: slab pointer being destroyed
1849  *
1850  * Call the registered destructor for each object in a slab that is being
1851  * destroyed.
1852  */
1853 static void slab_destroy_objs(struct kmem_cache *cachep, struct slab *slabp)
1854 {
1855 	int i;
1856 	for (i = 0; i < cachep->num; i++) {
1857 		void *objp = index_to_obj(cachep, slabp, i);
1858 
1859 		if (cachep->flags & SLAB_POISON) {
1860 #ifdef CONFIG_DEBUG_PAGEALLOC
1861 			if (cachep->buffer_size % PAGE_SIZE == 0 &&
1862 					OFF_SLAB(cachep))
1863 				kernel_map_pages(virt_to_page(objp),
1864 					cachep->buffer_size / PAGE_SIZE, 1);
1865 			else
1866 				check_poison_obj(cachep, objp);
1867 #else
1868 			check_poison_obj(cachep, objp);
1869 #endif
1870 		}
1871 		if (cachep->flags & SLAB_RED_ZONE) {
1872 			if (*dbg_redzone1(cachep, objp) != RED_INACTIVE)
1873 				slab_error(cachep, "start of a freed object "
1874 					   "was overwritten");
1875 			if (*dbg_redzone2(cachep, objp) != RED_INACTIVE)
1876 				slab_error(cachep, "end of a freed object "
1877 					   "was overwritten");
1878 		}
1879 		if (cachep->dtor && !(cachep->flags & SLAB_POISON))
1880 			(cachep->dtor) (objp + obj_offset(cachep), cachep, 0);
1881 	}
1882 }
1883 #else
1884 static void slab_destroy_objs(struct kmem_cache *cachep, struct slab *slabp)
1885 {
1886 	if (cachep->dtor) {
1887 		int i;
1888 		for (i = 0; i < cachep->num; i++) {
1889 			void *objp = index_to_obj(cachep, slabp, i);
1890 			(cachep->dtor) (objp, cachep, 0);
1891 		}
1892 	}
1893 }
1894 #endif
1895 
1896 /**
1897  * slab_destroy - destroy and release all objects in a slab
1898  * @cachep: cache pointer being destroyed
1899  * @slabp: slab pointer being destroyed
1900  *
1901  * Destroy all the objs in a slab, and release the mem back to the system.
1902  * Before calling the slab must have been unlinked from the cache.  The
1903  * cache-lock is not held/needed.
1904  */
1905 static void slab_destroy(struct kmem_cache *cachep, struct slab *slabp)
1906 {
1907 	void *addr = slabp->s_mem - slabp->colouroff;
1908 
1909 	slab_destroy_objs(cachep, slabp);
1910 	if (unlikely(cachep->flags & SLAB_DESTROY_BY_RCU)) {
1911 		struct slab_rcu *slab_rcu;
1912 
1913 		slab_rcu = (struct slab_rcu *)slabp;
1914 		slab_rcu->cachep = cachep;
1915 		slab_rcu->addr = addr;
1916 		call_rcu(&slab_rcu->head, kmem_rcu_free);
1917 	} else {
1918 		kmem_freepages(cachep, addr);
1919 		if (OFF_SLAB(cachep))
1920 			kmem_cache_free(cachep->slabp_cache, slabp);
1921 	}
1922 }
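
/*
 * Editor's note (illustration, not from the original file): with
 * SLAB_DESTROY_BY_RCU the struct slab storage is reused in place as a
 * struct slab_rcu, so the free path needs no extra allocation, and
 * kmem_rcu_free() above releases the pages only after a grace period.
 * A lockless reader such as
 *
 *	rcu_read_lock();
 *	obj = rcu_dereference(head);
 *	if (obj)
 *		examine(obj);	/* may be recycled, but never unmapped */
 *	rcu_read_unlock();
 *
 * (examine() and head are hypothetical names) can therefore safely
 * dereference an object that is freed concurrently.
 */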
1923 
1924 /*
1925  * For setting up all the kmem_list3s for a cache whose buffer_size is the
1926  * same as the size of kmem_list3.
1927  */
1928 static void set_up_list3s(struct kmem_cache *cachep, int index)
1929 {
1930 	int node;
1931 
1932 	for_each_online_node(node) {
1933 		cachep->nodelists[node] = &initkmem_list3[index + node];
1934 		cachep->nodelists[node]->next_reap = jiffies +
1935 		    REAPTIMEOUT_LIST3 +
1936 		    ((unsigned long)cachep) % REAPTIMEOUT_LIST3;
1937 	}
1938 }
1939 
1940 static void __kmem_cache_destroy(struct kmem_cache *cachep)
1941 {
1942 	int i;
1943 	struct kmem_list3 *l3;
1944 
1945 	for_each_online_cpu(i)
1946 	    kfree(cachep->array[i]);
1947 
1948 	/* NUMA: free the list3 structures */
1949 	for_each_online_node(i) {
1950 		l3 = cachep->nodelists[i];
1951 		if (l3) {
1952 			kfree(l3->shared);
1953 			free_alien_cache(l3->alien);
1954 			kfree(l3);
1955 		}
1956 	}
1957 	kmem_cache_free(&cache_cache, cachep);
1958 }
1959 
1960 
1961 /**
1962  * calculate_slab_order - calculate size (page order) of slabs
1963  * @cachep: pointer to the cache that is being created
1964  * @size: size of objects to be created in this cache.
1965  * @align: required alignment for the objects.
1966  * @flags: slab allocation flags
1967  *
1968  * Also calculates the number of objects per slab.
1969  *
1970  * This could be made much more intelligent.  For now, try to avoid using
1971  * high order pages for slabs.  When the gfp() functions are more friendly
1972  * towards high-order requests, this should be changed.
1973  */
1974 static size_t calculate_slab_order(struct kmem_cache *cachep,
1975 			size_t size, size_t align, unsigned long flags)
1976 {
1977 	unsigned long offslab_limit;
1978 	size_t left_over = 0;
1979 	int gfporder;
1980 
1981 	for (gfporder = 0; gfporder <= MAX_GFP_ORDER; gfporder++) {
1982 		unsigned int num;
1983 		size_t remainder;
1984 
1985 		cache_estimate(gfporder, size, align, flags, &remainder, &num);
1986 		if (!num)
1987 			continue;
1988 
1989 		if (flags & CFLGS_OFF_SLAB) {
1990 			/*
1991 			 * Max number of objs-per-slab for caches which
1992 			 * use off-slab slabs. Needed to avoid a possible
1993 			 * looping condition in cache_grow().
1994 			 */
1995 			offslab_limit = size - sizeof(struct slab);
1996 			offslab_limit /= sizeof(kmem_bufctl_t);
1997 
1998 			if (num > offslab_limit)
1999 				break;
2000 		}
2001 
2002 		/* Found something acceptable - save it away */
2003 		cachep->num = num;
2004 		cachep->gfporder = gfporder;
2005 		left_over = remainder;
2006 
2007 		/*
2008 		 * A VFS-reclaimable slab tends to have most allocations
2009 		 * as GFP_NOFS and we really don't want to have to be allocating
2010 		 * higher-order pages when we are unable to shrink dcache.
2011 		 */
2012 		if (flags & SLAB_RECLAIM_ACCOUNT)
2013 			break;
2014 
2015 		/*
2016 		 * Large number of objects is good, but very large slabs are
2017 		 * currently bad for the gfp()s.
2018 		 */
2019 		if (gfporder >= slab_break_gfp_order)
2020 			break;
2021 
2022 		/*
2023 		 * Acceptable internal fragmentation?
2024 		 */
2025 		if (left_over * 8 <= (PAGE_SIZE << gfporder))
2026 			break;
2027 	}
2028 	return left_over;
2029 }
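
/*
 * Editor's worked example (approximate, ignoring the management overhead
 * that cache_estimate() accounts for): with 4096-byte pages and 500-byte
 * objects at gfporder 0, num = 8 and left_over = 96.  Since
 * 96 * 8 = 768 <= 4096, the internal fragmentation test passes and order 0
 * is accepted without trying larger orders.
 */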
2030 
2031 static int setup_cpu_cache(struct kmem_cache *cachep)
2032 {
2033 	if (g_cpucache_up == FULL)
2034 		return enable_cpucache(cachep);
2035 
2036 	if (g_cpucache_up == NONE) {
2037 		/*
2038 		 * Note: the first kmem_cache_create must create the cache
2039 		 * that's used by kmalloc(24), otherwise the creation of
2040 		 * further caches will BUG().
2041 		 */
2042 		cachep->array[smp_processor_id()] = &initarray_generic.cache;
2043 
2044 		/*
2045 		 * If the cache that's used by kmalloc(sizeof(kmem_list3)) is
2046 		 * the first cache, then we need to set up all its list3s,
2047 		 * otherwise the creation of further caches will BUG().
2048 		 */
2049 		set_up_list3s(cachep, SIZE_AC);
2050 		if (INDEX_AC == INDEX_L3)
2051 			g_cpucache_up = PARTIAL_L3;
2052 		else
2053 			g_cpucache_up = PARTIAL_AC;
2054 	} else {
2055 		cachep->array[smp_processor_id()] =
2056 			kmalloc(sizeof(struct arraycache_init), GFP_KERNEL);
2057 
2058 		if (g_cpucache_up == PARTIAL_AC) {
2059 			set_up_list3s(cachep, SIZE_L3);
2060 			g_cpucache_up = PARTIAL_L3;
2061 		} else {
2062 			int node;
2063 			for_each_online_node(node) {
2064 				cachep->nodelists[node] =
2065 				    kmalloc_node(sizeof(struct kmem_list3),
2066 						GFP_KERNEL, node);
2067 				BUG_ON(!cachep->nodelists[node]);
2068 				kmem_list3_init(cachep->nodelists[node]);
2069 			}
2070 		}
2071 	}
2072 	cachep->nodelists[numa_node_id()]->next_reap =
2073 			jiffies + REAPTIMEOUT_LIST3 +
2074 			((unsigned long)cachep) % REAPTIMEOUT_LIST3;
2075 
2076 	cpu_cache_get(cachep)->avail = 0;
2077 	cpu_cache_get(cachep)->limit = BOOT_CPUCACHE_ENTRIES;
2078 	cpu_cache_get(cachep)->batchcount = 1;
2079 	cpu_cache_get(cachep)->touched = 0;
2080 	cachep->batchcount = 1;
2081 	cachep->limit = BOOT_CPUCACHE_ENTRIES;
2082 	return 0;
2083 }
2084 
2085 /**
2086  * kmem_cache_create - Create a cache.
2087  * @name: A string which is used in /proc/slabinfo to identify this cache.
2088  * @size: The size of objects to be created in this cache.
2089  * @align: The required alignment for the objects.
2090  * @flags: SLAB flags
2091  * @ctor: A constructor for the objects.
2092  * @dtor: A destructor for the objects.
2093  *
2094  * Returns a ptr to the cache on success, NULL on failure.
2095  * Cannot be called within an interrupt, but can be interrupted.
2096  * The @ctor is run when new pages are allocated by the cache
2097  * and the @dtor is run before the pages are handed back.
2098  *
2099  * @name must be valid until the cache is destroyed. This implies that
2100  * the module calling this has to destroy the cache before getting unloaded.
2101  *
2102  * The flags are
2103  *
2104  * %SLAB_POISON - Poison the slab with a known test pattern (a5a5a5a5)
2105  * to catch references to uninitialised memory.
2106  *
2107  * %SLAB_RED_ZONE - Insert `Red' zones around the allocated memory to check
2108  * for buffer overruns.
2109  *
2110  * %SLAB_HWCACHE_ALIGN - Align the objects in this cache to a hardware
2111  * cacheline.  This can be beneficial if you're counting cycles as closely
2112  * as davem.
2113  */
2114 struct kmem_cache *
2115 kmem_cache_create (const char *name, size_t size, size_t align,
2116 	unsigned long flags,
2117 	void (*ctor)(void*, struct kmem_cache *, unsigned long),
2118 	void (*dtor)(void*, struct kmem_cache *, unsigned long))
2119 {
2120 	size_t left_over, slab_size, ralign;
2121 	struct kmem_cache *cachep = NULL, *pc;
2122 
2123 	/*
2124 	 * Sanity checks... these are all serious usage bugs.
2125 	 */
2126 	if (!name || in_interrupt() || (size < BYTES_PER_WORD) ||
2127 	    (size > (1 << MAX_OBJ_ORDER) * PAGE_SIZE) || (dtor && !ctor)) {
2128 		printk(KERN_ERR "%s: Early error in slab %s\n", __FUNCTION__,
2129 				name);
2130 		BUG();
2131 	}
2132 
2133 	/*
2134 	 * We use cache_chain_mutex to ensure a consistent view of
2135 	 * cpu_online_map as well.  Please see cpuup_callback
2136 	 */
2137 	mutex_lock(&cache_chain_mutex);
2138 
2139 	list_for_each_entry(pc, &cache_chain, next) {
2140 		char tmp;
2141 		int res;
2142 
2143 		/*
2144 		 * This happens when the module gets unloaded and doesn't
2145 		 * destroy its slab cache and no one else reuses the vmalloc
2146 		 * area of the module.  Print a warning.
2147 		 */
2148 		res = probe_kernel_address(pc->name, tmp);
2149 		if (res) {
2150 			printk("SLAB: cache with size %d has lost its name\n",
2151 			       pc->buffer_size);
2152 			continue;
2153 		}
2154 
2155 		if (!strcmp(pc->name, name)) {
2156 			printk("kmem_cache_create: duplicate cache %s\n", name);
2157 			dump_stack();
2158 			goto oops;
2159 		}
2160 	}
2161 
2162 #if DEBUG
2163 	WARN_ON(strchr(name, ' '));	/* It confuses parsers */
2164 	if ((flags & SLAB_DEBUG_INITIAL) && !ctor) {
2165 		/* No constructor, but initial state check requested */
2166 		printk(KERN_ERR "%s: No con, but init state check "
2167 		       "requested - %s\n", __FUNCTION__, name);
2168 		flags &= ~SLAB_DEBUG_INITIAL;
2169 	}
2170 #if FORCED_DEBUG
2171 	/*
2172 	 * Enable redzoning and last user accounting, except for caches with
2173 	 * large objects, if the increased size would increase the object size
2174 	 * above the next power of two: caches with object sizes just above a
2175 	 * power of two have a significant amount of internal fragmentation.
2176 	 */
2177 	if (size < 4096 || fls(size - 1) == fls(size-1 + 3 * BYTES_PER_WORD))
2178 		flags |= SLAB_RED_ZONE | SLAB_STORE_USER;
2179 	if (!(flags & SLAB_DESTROY_BY_RCU))
2180 		flags |= SLAB_POISON;
2181 #endif
2182 	if (flags & SLAB_DESTROY_BY_RCU)
2183 		BUG_ON(flags & SLAB_POISON);
2184 #endif
2185 	if (flags & SLAB_DESTROY_BY_RCU)
2186 		BUG_ON(dtor);
2187 
2188 	/*
2189 	 * Always check flags; a caller might be expecting debug support which
2190 	 * isn't available.
2191 	 */
2192 	BUG_ON(flags & ~CREATE_MASK);
2193 
2194 	/*
2195 	 * Check that size is in terms of words.  This is needed to avoid
2196 	 * unaligned accesses for some archs when redzoning is used, and makes
2197 	 * sure any on-slab bufctl's are also correctly aligned.
2198 	 */
2199 	if (size & (BYTES_PER_WORD - 1)) {
2200 		size += (BYTES_PER_WORD - 1);
2201 		size &= ~(BYTES_PER_WORD - 1);
2202 	}
2203 
2204 	/* calculate the final buffer alignment: */
2205 
2206 	/* 1) arch recommendation: can be overridden for debug */
2207 	if (flags & SLAB_HWCACHE_ALIGN) {
2208 		/*
2209 		 * Default alignment: as specified by the arch code.  Except if
2210 		 * an object is really small, then squeeze multiple objects into
2211 		 * one cacheline.
2212 		 */
2213 		ralign = cache_line_size();
2214 		while (size <= ralign / 2)
2215 			ralign /= 2;
2216 	} else {
2217 		ralign = BYTES_PER_WORD;
2218 	}
2219 
2220 	/*
2221 	 * Redzoning and user store require word alignment. Note this will be
2222 	 * overridden by architecture or caller mandated alignment if either
2223 	 * is greater than BYTES_PER_WORD.
2224 	 */
2225 	if (flags & SLAB_RED_ZONE || flags & SLAB_STORE_USER)
2226 		ralign = BYTES_PER_WORD;
2227 
2228 	/* 2) arch mandated alignment */
2229 	if (ralign < ARCH_SLAB_MINALIGN) {
2230 		ralign = ARCH_SLAB_MINALIGN;
2231 	}
2232 	/* 3) caller mandated alignment */
2233 	if (ralign < align) {
2234 		ralign = align;
2235 	}
2236 	/* disable debug if necessary */
2237 	if (ralign > BYTES_PER_WORD)
2238 		flags &= ~(SLAB_RED_ZONE | SLAB_STORE_USER);
2239 	/*
2240 	 * 4) Store it.
2241 	 */
2242 	align = ralign;
2243 
2244 	/* Get cache's description obj. */
2245 	cachep = kmem_cache_zalloc(&cache_cache, GFP_KERNEL);
2246 	if (!cachep)
2247 		goto oops;
2248 
2249 #if DEBUG
2250 	cachep->obj_size = size;
2251 
2252 	/*
2253 	 * Both debugging options require word-alignment which is calculated
2254 	 * into align above.
2255 	 */
2256 	if (flags & SLAB_RED_ZONE) {
2257 		/* add space for red zone words */
2258 		cachep->obj_offset += BYTES_PER_WORD;
2259 		size += 2 * BYTES_PER_WORD;
2260 	}
2261 	if (flags & SLAB_STORE_USER) {
2262 		/* user store requires one word storage behind the end of
2263 		 * the real object.
2264 		 */
2265 		size += BYTES_PER_WORD;
2266 	}
2267 #if FORCED_DEBUG && defined(CONFIG_DEBUG_PAGEALLOC)
2268 	if (size >= malloc_sizes[INDEX_L3 + 1].cs_size
2269 	    && cachep->obj_size > cache_line_size() && size < PAGE_SIZE) {
2270 		cachep->obj_offset += PAGE_SIZE - size;
2271 		size = PAGE_SIZE;
2272 	}
2273 #endif
2274 #endif
2275 
2276 	/*
2277 	 * Determine if the slab management is 'on' or 'off' slab.
2278 	 * (bootstrapping cannot cope with offslab caches so don't do
2279 	 * it too early on.)
2280 	 */
2281 	if ((size >= (PAGE_SIZE >> 3)) && !slab_early_init)
2282 		/*
2283 		 * Size is large, assume best to place the slab management obj
2284 		 * off-slab (should allow better packing of objs).
2285 		 */
2286 		flags |= CFLGS_OFF_SLAB;
2287 
2288 	size = ALIGN(size, align);
2289 
2290 	left_over = calculate_slab_order(cachep, size, align, flags);
2291 
2292 	if (!cachep->num) {
2293 		printk("kmem_cache_create: couldn't create cache %s.\n", name);
2294 		kmem_cache_free(&cache_cache, cachep);
2295 		cachep = NULL;
2296 		goto oops;
2297 	}
2298 	slab_size = ALIGN(cachep->num * sizeof(kmem_bufctl_t)
2299 			  + sizeof(struct slab), align);
2300 
2301 	/*
2302 	 * If the slab has been placed off-slab, and we have enough space then
2303 	 * move it on-slab. This is at the expense of any extra colouring.
2304 	 */
2305 	if (flags & CFLGS_OFF_SLAB && left_over >= slab_size) {
2306 		flags &= ~CFLGS_OFF_SLAB;
2307 		left_over -= slab_size;
2308 	}
2309 
2310 	if (flags & CFLGS_OFF_SLAB) {
2311 		/* really off slab. No need for manual alignment */
2312 		slab_size =
2313 		    cachep->num * sizeof(kmem_bufctl_t) + sizeof(struct slab);
2314 	}
2315 
2316 	cachep->colour_off = cache_line_size();
2317 	/* Offset must be a multiple of the alignment. */
2318 	if (cachep->colour_off < align)
2319 		cachep->colour_off = align;
2320 	cachep->colour = left_over / cachep->colour_off;
2321 	cachep->slab_size = slab_size;
2322 	cachep->flags = flags;
2323 	cachep->gfpflags = 0;
2324 	if (flags & SLAB_CACHE_DMA)
2325 		cachep->gfpflags |= GFP_DMA;
2326 	cachep->buffer_size = size;
2327 	cachep->reciprocal_buffer_size = reciprocal_value(size);
2328 
2329 	if (flags & CFLGS_OFF_SLAB) {
2330 		cachep->slabp_cache = kmem_find_general_cachep(slab_size, 0u);
2331 		/*
2332 		 * This is a possibility for one of the malloc_sizes caches.
2333 		 * But since we go off slab only for object sizes of at least
2334 		 * PAGE_SIZE/8, and malloc_sizes gets created in ascending order,
2335 		 * this should not happen at all.
2336 		 * But leave a BUG_ON for some lucky dude.
2337 		 */
2338 		BUG_ON(!cachep->slabp_cache);
2339 	}
2340 	cachep->ctor = ctor;
2341 	cachep->dtor = dtor;
2342 	cachep->name = name;
2343 
2344 	if (setup_cpu_cache(cachep)) {
2345 		__kmem_cache_destroy(cachep);
2346 		cachep = NULL;
2347 		goto oops;
2348 	}
2349 
2350 	/* cache setup completed, link it into the list */
2351 	list_add(&cachep->next, &cache_chain);
2352 oops:
2353 	if (!cachep && (flags & SLAB_PANIC))
2354 		panic("kmem_cache_create(): failed to create slab `%s'\n",
2355 		      name);
2356 	mutex_unlock(&cache_chain_mutex);
2357 	return cachep;
2358 }
2359 EXPORT_SYMBOL(kmem_cache_create);
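
/*
 * Editor's usage sketch (not part of the original file; all names below are
 * hypothetical).  A typical module-side cache creation with a constructor
 * matching the signature documented above:
 */
#if 0	/* illustration only */
struct my_widget {
	int id;
	struct list_head link;
};

static struct kmem_cache *widget_cache;

static void widget_ctor(void *obj, struct kmem_cache *cachep,
			unsigned long flags)
{
	struct my_widget *w = obj;

	w->id = 0;
	INIT_LIST_HEAD(&w->link);
}

static int __init widget_init(void)
{
	widget_cache = kmem_cache_create("my_widget",
					 sizeof(struct my_widget), 0,
					 SLAB_HWCACHE_ALIGN,
					 widget_ctor, NULL);
	return widget_cache ? 0 : -ENOMEM;
}
#endif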
2360 
2361 #if DEBUG
2362 static void check_irq_off(void)
2363 {
2364 	BUG_ON(!irqs_disabled());
2365 }
2366 
2367 static void check_irq_on(void)
2368 {
2369 	BUG_ON(irqs_disabled());
2370 }
2371 
2372 static void check_spinlock_acquired(struct kmem_cache *cachep)
2373 {
2374 #ifdef CONFIG_SMP
2375 	check_irq_off();
2376 	assert_spin_locked(&cachep->nodelists[numa_node_id()]->list_lock);
2377 #endif
2378 }
2379 
2380 static void check_spinlock_acquired_node(struct kmem_cache *cachep, int node)
2381 {
2382 #ifdef CONFIG_SMP
2383 	check_irq_off();
2384 	assert_spin_locked(&cachep->nodelists[node]->list_lock);
2385 #endif
2386 }
2387 
2388 #else
2389 #define check_irq_off()	do { } while(0)
2390 #define check_irq_on()	do { } while(0)
2391 #define check_spinlock_acquired(x) do { } while(0)
2392 #define check_spinlock_acquired_node(x, y) do { } while(0)
2393 #endif
2394 
2395 static void drain_array(struct kmem_cache *cachep, struct kmem_list3 *l3,
2396 			struct array_cache *ac,
2397 			int force, int node);
2398 
2399 static void do_drain(void *arg)
2400 {
2401 	struct kmem_cache *cachep = arg;
2402 	struct array_cache *ac;
2403 	int node = numa_node_id();
2404 
2405 	check_irq_off();
2406 	ac = cpu_cache_get(cachep);
2407 	spin_lock(&cachep->nodelists[node]->list_lock);
2408 	free_block(cachep, ac->entry, ac->avail, node);
2409 	spin_unlock(&cachep->nodelists[node]->list_lock);
2410 	ac->avail = 0;
2411 }
2412 
2413 static void drain_cpu_caches(struct kmem_cache *cachep)
2414 {
2415 	struct kmem_list3 *l3;
2416 	int node;
2417 
2418 	on_each_cpu(do_drain, cachep, 1, 1);
2419 	check_irq_on();
2420 	for_each_online_node(node) {
2421 		l3 = cachep->nodelists[node];
2422 		if (l3 && l3->alien)
2423 			drain_alien_cache(cachep, l3->alien);
2424 	}
2425 
2426 	for_each_online_node(node) {
2427 		l3 = cachep->nodelists[node];
2428 		if (l3)
2429 			drain_array(cachep, l3, l3->shared, 1, node);
2430 	}
2431 }
2432 
2433 /*
2434  * Remove slabs from the list of free slabs.
2435  * Specify the number of slabs to drain in tofree.
2436  *
2437  * Returns the actual number of slabs released.
2438  */
2439 static int drain_freelist(struct kmem_cache *cache,
2440 			struct kmem_list3 *l3, int tofree)
2441 {
2442 	struct list_head *p;
2443 	int nr_freed;
2444 	struct slab *slabp;
2445 
2446 	nr_freed = 0;
2447 	while (nr_freed < tofree && !list_empty(&l3->slabs_free)) {
2448 
2449 		spin_lock_irq(&l3->list_lock);
2450 		p = l3->slabs_free.prev;
2451 		if (p == &l3->slabs_free) {
2452 			spin_unlock_irq(&l3->list_lock);
2453 			goto out;
2454 		}
2455 
2456 		slabp = list_entry(p, struct slab, list);
2457 #if DEBUG
2458 		BUG_ON(slabp->inuse);
2459 #endif
2460 		list_del(&slabp->list);
2461 		/*
2462 		 * Safe to drop the lock. The slab is no longer linked
2463 		 * to the cache.
2464 		 */
2465 		l3->free_objects -= cache->num;
2466 		spin_unlock_irq(&l3->list_lock);
2467 		slab_destroy(cache, slabp);
2468 		nr_freed++;
2469 	}
2470 out:
2471 	return nr_freed;
2472 }
2473 
2474 /* Called with cache_chain_mutex held to protect against cpu hotplug */
2475 static int __cache_shrink(struct kmem_cache *cachep)
2476 {
2477 	int ret = 0, i = 0;
2478 	struct kmem_list3 *l3;
2479 
2480 	drain_cpu_caches(cachep);
2481 
2482 	check_irq_on();
2483 	for_each_online_node(i) {
2484 		l3 = cachep->nodelists[i];
2485 		if (!l3)
2486 			continue;
2487 
2488 		drain_freelist(cachep, l3, l3->free_objects);
2489 
2490 		ret += !list_empty(&l3->slabs_full) ||
2491 			!list_empty(&l3->slabs_partial);
2492 	}
2493 	return (ret ? 1 : 0);
2494 }
2495 
2496 /**
2497  * kmem_cache_shrink - Shrink a cache.
2498  * @cachep: The cache to shrink.
2499  *
2500  * Releases as many slabs as possible for a cache.
2501  * To help debugging, a zero exit status indicates all slabs were released.
2502  */
2503 int kmem_cache_shrink(struct kmem_cache *cachep)
2504 {
2505 	int ret;
2506 	BUG_ON(!cachep || in_interrupt());
2507 
2508 	mutex_lock(&cache_chain_mutex);
2509 	ret = __cache_shrink(cachep);
2510 	mutex_unlock(&cache_chain_mutex);
2511 	return ret;
2512 }
2513 EXPORT_SYMBOL(kmem_cache_shrink);
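
/*
 * Editor's usage sketch (hypothetical cache name from the earlier example):
 * a non-zero return means some slabs were still in use and could not be
 * released.
 */
#if 0	/* illustration only */
static void widget_trim(void)
{
	if (kmem_cache_shrink(widget_cache))
		printk(KERN_DEBUG "widget_cache still has active objects\n");
}
#endif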
2514 
2515 /**
2516  * kmem_cache_destroy - delete a cache
2517  * @cachep: the cache to destroy
2518  *
2519  * Remove a struct kmem_cache object from the slab cache.
2520  *
2521  * It is expected this function will be called by a module when it is
2522  * unloaded.  This will remove the cache completely, and avoid a duplicate
2523  * cache being allocated each time a module is loaded and unloaded, if the
2524  * module doesn't have persistent in-kernel storage across loads and unloads.
2525  *
2526  * The cache must be empty before calling this function.
2527  *
2528  * The caller must guarantee that no one will allocate memory from the cache
2529  * during the kmem_cache_destroy().
2530  */
2531 void kmem_cache_destroy(struct kmem_cache *cachep)
2532 {
2533 	BUG_ON(!cachep || in_interrupt());
2534 
2535 	/* Find the cache in the chain of caches. */
2536 	mutex_lock(&cache_chain_mutex);
2537 	/*
2538 	 * the chain is never empty, cache_cache is never destroyed
2539 	 */
2540 	list_del(&cachep->next);
2541 	if (__cache_shrink(cachep)) {
2542 		slab_error(cachep, "Can't free all objects");
2543 		list_add(&cachep->next, &cache_chain);
2544 		mutex_unlock(&cache_chain_mutex);
2545 		return;
2546 	}
2547 
2548 	if (unlikely(cachep->flags & SLAB_DESTROY_BY_RCU))
2549 		synchronize_rcu();
2550 
2551 	__kmem_cache_destroy(cachep);
2552 	mutex_unlock(&cache_chain_mutex);
2553 }
2554 EXPORT_SYMBOL(kmem_cache_destroy);
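
/*
 * Editor's usage sketch (continuing the hypothetical widget example): the
 * counterpart of the creation above, typically run at module unload, after
 * every object has been returned with kmem_cache_free().
 */
#if 0	/* illustration only */
static void __exit widget_exit(void)
{
	kmem_cache_destroy(widget_cache);
}
#endif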
2555 
2556 /*
2557  * Get the memory for a slab management obj.
2558  * For a slab cache with an off-slab slab descriptor, the descriptors
2559  * always come from malloc_sizes caches.  The slab descriptor cannot
2560  * come from the same cache that is being created because, when we
2561  * search for an appropriate cache for these descriptors in
2562  * kmem_cache_create, we search through the malloc_sizes array.
2563  * If we were creating a malloc_sizes cache here, it would not be visible
2564  * to kmem_find_general_cachep until its initialization is complete.
2565  * Hence slabp_cache cannot be the same as the cache being created.
2566  */
2567 static struct slab *alloc_slabmgmt(struct kmem_cache *cachep, void *objp,
2568 				   int colour_off, gfp_t local_flags,
2569 				   int nodeid)
2570 {
2571 	struct slab *slabp;
2572 
2573 	if (OFF_SLAB(cachep)) {
2574 		/* Slab management obj is off-slab. */
2575 		slabp = kmem_cache_alloc_node(cachep->slabp_cache,
2576 					      local_flags & ~GFP_THISNODE, nodeid);
2577 		if (!slabp)
2578 			return NULL;
2579 	} else {
2580 		slabp = objp + colour_off;
2581 		colour_off += cachep->slab_size;
2582 	}
2583 	slabp->inuse = 0;
2584 	slabp->colouroff = colour_off;
2585 	slabp->s_mem = objp + colour_off;
2586 	slabp->nodeid = nodeid;
2587 	return slabp;
2588 }
2589 
2590 static inline kmem_bufctl_t *slab_bufctl(struct slab *slabp)
2591 {
2592 	return (kmem_bufctl_t *) (slabp + 1);
2593 }
2594 
2595 static void cache_init_objs(struct kmem_cache *cachep,
2596 			    struct slab *slabp, unsigned long ctor_flags)
2597 {
2598 	int i;
2599 
2600 	for (i = 0; i < cachep->num; i++) {
2601 		void *objp = index_to_obj(cachep, slabp, i);
2602 #if DEBUG
2603 		/* need to poison the objs? */
2604 		if (cachep->flags & SLAB_POISON)
2605 			poison_obj(cachep, objp, POISON_FREE);
2606 		if (cachep->flags & SLAB_STORE_USER)
2607 			*dbg_userword(cachep, objp) = NULL;
2608 
2609 		if (cachep->flags & SLAB_RED_ZONE) {
2610 			*dbg_redzone1(cachep, objp) = RED_INACTIVE;
2611 			*dbg_redzone2(cachep, objp) = RED_INACTIVE;
2612 		}
2613 		/*
2614 		 * Constructors are not allowed to allocate memory from the same
2615 		 * cache for which they are a constructor.  Otherwise, deadlock.
2616 		 * They must also be threaded.
2617 		 */
2618 		if (cachep->ctor && !(cachep->flags & SLAB_POISON))
2619 			cachep->ctor(objp + obj_offset(cachep), cachep,
2620 				     ctor_flags);
2621 
2622 		if (cachep->flags & SLAB_RED_ZONE) {
2623 			if (*dbg_redzone2(cachep, objp) != RED_INACTIVE)
2624 				slab_error(cachep, "constructor overwrote the"
2625 					   " end of an object");
2626 			if (*dbg_redzone1(cachep, objp) != RED_INACTIVE)
2627 				slab_error(cachep, "constructor overwrote the"
2628 					   " start of an object");
2629 		}
2630 		if ((cachep->buffer_size % PAGE_SIZE) == 0 &&
2631 			    OFF_SLAB(cachep) && cachep->flags & SLAB_POISON)
2632 			kernel_map_pages(virt_to_page(objp),
2633 					 cachep->buffer_size / PAGE_SIZE, 0);
2634 #else
2635 		if (cachep->ctor)
2636 			cachep->ctor(objp, cachep, ctor_flags);
2637 #endif
2638 		slab_bufctl(slabp)[i] = i + 1;
2639 	}
2640 	slab_bufctl(slabp)[i - 1] = BUFCTL_END;
2641 	slabp->free = 0;
2642 }
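
/*
 * Editor's illustration (not from the original file): for a slab with
 * num = 4 the loop above leaves the bufctl array as { 1, 2, 3, BUFCTL_END }
 * with slabp->free = 0, i.e. an index-linked free list 0 -> 1 -> 2 -> 3.
 * slab_get_obj() below pops from the head and slab_put_obj() pushes back
 * onto it.
 */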
2643 
2644 static void kmem_flagcheck(struct kmem_cache *cachep, gfp_t flags)
2645 {
2646 	if (flags & GFP_DMA)
2647 		BUG_ON(!(cachep->gfpflags & GFP_DMA));
2648 	else
2649 		BUG_ON(cachep->gfpflags & GFP_DMA);
2650 }
2651 
2652 static void *slab_get_obj(struct kmem_cache *cachep, struct slab *slabp,
2653 				int nodeid)
2654 {
2655 	void *objp = index_to_obj(cachep, slabp, slabp->free);
2656 	kmem_bufctl_t next;
2657 
2658 	slabp->inuse++;
2659 	next = slab_bufctl(slabp)[slabp->free];
2660 #if DEBUG
2661 	slab_bufctl(slabp)[slabp->free] = BUFCTL_FREE;
2662 	WARN_ON(slabp->nodeid != nodeid);
2663 #endif
2664 	slabp->free = next;
2665 
2666 	return objp;
2667 }
2668 
2669 static void slab_put_obj(struct kmem_cache *cachep, struct slab *slabp,
2670 				void *objp, int nodeid)
2671 {
2672 	unsigned int objnr = obj_to_index(cachep, slabp, objp);
2673 
2674 #if DEBUG
2675 	/* Verify that the slab belongs to the intended node */
2676 	WARN_ON(slabp->nodeid != nodeid);
2677 
2678 	if (slab_bufctl(slabp)[objnr] + 1 <= SLAB_LIMIT + 1) {
2679 		printk(KERN_ERR "slab: double free detected in cache "
2680 				"'%s', objp %p\n", cachep->name, objp);
2681 		BUG();
2682 	}
2683 #endif
2684 	slab_bufctl(slabp)[objnr] = slabp->free;
2685 	slabp->free = objnr;
2686 	slabp->inuse--;
2687 }
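
/*
 * Editor's illustration (continuing the num = 4 example above): allocating
 * object 0 advances free to bufctl[0] = 1; freeing it again stores the old
 * head in bufctl[0] (bufctl[0] = 1) and sets free = 0.  The free list is
 * therefore strictly LIFO, which keeps recently used objects cache-hot.
 */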
2688 
2689 /*
2690  * Map pages beginning at addr to the given cache and slab. This is required
2691  * for the slab allocator to be able to look up the cache and slab of a
2692  * virtual address for kfree, ksize, kmem_ptr_validate, and slab debugging.
2693  */
2694 static void slab_map_pages(struct kmem_cache *cache, struct slab *slab,
2695 			   void *addr)
2696 {
2697 	int nr_pages;
2698 	struct page *page;
2699 
2700 	page = virt_to_page(addr);
2701 
2702 	nr_pages = 1;
2703 	if (likely(!PageCompound(page)))
2704 		nr_pages <<= cache->gfporder;
2705 
2706 	do {
2707 		page_set_cache(page, cache);
2708 		page_set_slab(page, slab);
2709 		page++;
2710 	} while (--nr_pages);
2711 }
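
/*
 * Editor's illustration (not from the original file): after slab_map_pages()
 * the reverse mapping used by kfree() and friends is simply
 *
 *	struct page *page = virt_to_page(objp);
 *	struct kmem_cache *cachep = page_get_cache(page);
 *	struct slab *slabp = page_get_slab(page);
 *
 * for any objp inside the slab's pages.
 */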
2712 
2713 /*
2714  * Grow (by 1) the number of slabs within a cache.  This is called by
2715  * kmem_cache_alloc() when there are no active objs left in a cache.
2716  */
2717 static int cache_grow(struct kmem_cache *cachep,
2718 		gfp_t flags, int nodeid, void *objp)
2719 {
2720 	struct slab *slabp;
2721 	size_t offset;
2722 	gfp_t local_flags;
2723 	unsigned long ctor_flags;
2724 	struct kmem_list3 *l3;
2725 
2726 	/*
2727 	 * Be lazy and only check for valid flags here, keeping it out of the
2728 	 * critical path in kmem_cache_alloc().
2729 	 */
2730 	BUG_ON(flags & ~(GFP_DMA | GFP_LEVEL_MASK | __GFP_NO_GROW));
2731 	if (flags & __GFP_NO_GROW)
2732 		return 0;
2733 
2734 	ctor_flags = SLAB_CTOR_CONSTRUCTOR;
2735 	local_flags = (flags & GFP_LEVEL_MASK);
2736 	if (!(local_flags & __GFP_WAIT))
2737 		/*
2738 		 * Not allowed to sleep.  Need to tell a constructor about
2739 		 * this - it might need to know...
2740 		 */
2741 		ctor_flags |= SLAB_CTOR_ATOMIC;
2742 
2743 	/* Take the l3 list lock to change the colour_next on this node */
2744 	check_irq_off();
2745 	l3 = cachep->nodelists[nodeid];
2746 	spin_lock(&l3->list_lock);
2747 
2748 	/* Get the colour for the slab, and calculate the next value. */
2749 	offset = l3->colour_next;
2750 	l3->colour_next++;
2751 	if (l3->colour_next >= cachep->colour)
2752 		l3->colour_next = 0;
2753 	spin_unlock(&l3->list_lock);
2754 
2755 	offset *= cachep->colour_off;
2756 
2757 	if (local_flags & __GFP_WAIT)
2758 		local_irq_enable();
2759 
2760 	/*
2761 	 * The test for missing atomic flag is performed here, rather than
2762 	 * the more obvious place, simply to reduce the critical path length
2763 	 * in kmem_cache_alloc(). If a caller is seriously mis-behaving they
2764 	 * will eventually be caught here (where it matters).
2765 	 */
2766 	kmem_flagcheck(cachep, flags);
2767 
2768 	/*
2769 	 * Get mem for the objs.  Attempt to allocate a physical page from
2770 	 * 'nodeid'.
2771 	 */
2772 	if (!objp)
2773 		objp = kmem_getpages(cachep, flags, nodeid);
2774 	if (!objp)
2775 		goto failed;
2776 
2777 	/* Get slab management. */
2778 	slabp = alloc_slabmgmt(cachep, objp, offset,
2779 			local_flags & ~GFP_THISNODE, nodeid);
2780 	if (!slabp)
2781 		goto opps1;
2782 
2783 	slabp->nodeid = nodeid;
2784 	slab_map_pages(cachep, slabp, objp);
2785 
2786 	cache_init_objs(cachep, slabp, ctor_flags);
2787 
2788 	if (local_flags & __GFP_WAIT)
2789 		local_irq_disable();
2790 	check_irq_off();
2791 	spin_lock(&l3->list_lock);
2792 
2793 	/* Make slab active. */
2794 	list_add_tail(&slabp->list, &(l3->slabs_free));
2795 	STATS_INC_GROWN(cachep);
2796 	l3->free_objects += cachep->num;
2797 	spin_unlock(&l3->list_lock);
2798 	return 1;
2799 opps1:
2800 	kmem_freepages(cachep, objp);
2801 failed:
2802 	if (local_flags & __GFP_WAIT)
2803 		local_irq_disable();
2804 	return 0;
2805 }
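
/*
 * Editor's worked example for the colouring above (illustrative numbers):
 * with colour_off = 64 and cachep->colour = 4, successive slabs on a node
 * start their objects at offsets 0, 64, 128, 192, 0, ... so that equal
 * object indices in different slabs fall into different hardware cache sets.
 */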
2806 
2807 #if DEBUG
2808 
2809 /*
2810  * Perform extra freeing checks:
2811  * - detect bad pointers.
2812  * - POISON/RED_ZONE checking
2813  * - destructor calls, for caches with POISON+dtor
2814  */
2815 static void kfree_debugcheck(const void *objp)
2816 {
2817 	struct page *page;
2818 
2819 	if (!virt_addr_valid(objp)) {
2820 		printk(KERN_ERR "kfree_debugcheck: out of range ptr %lxh.\n",
2821 		       (unsigned long)objp);
2822 		BUG();
2823 	}
2824 	page = virt_to_page(objp);
2825 	if (!PageSlab(page)) {
2826 		printk(KERN_ERR "kfree_debugcheck: bad ptr %lxh.\n",
2827 		       (unsigned long)objp);
2828 		BUG();
2829 	}
2830 }
2831 
2832 static inline void verify_redzone_free(struct kmem_cache *cache, void *obj)
2833 {
2834 	unsigned long redzone1, redzone2;
2835 
2836 	redzone1 = *dbg_redzone1(cache, obj);
2837 	redzone2 = *dbg_redzone2(cache, obj);
2838 
2839 	/*
2840 	 * Redzone is ok.
2841 	 */
2842 	if (redzone1 == RED_ACTIVE && redzone2 == RED_ACTIVE)
2843 		return;
2844 
2845 	if (redzone1 == RED_INACTIVE && redzone2 == RED_INACTIVE)
2846 		slab_error(cache, "double free detected");
2847 	else
2848 		slab_error(cache, "memory outside object was overwritten");
2849 
2850 	printk(KERN_ERR "%p: redzone 1:0x%lx, redzone 2:0x%lx.\n",
2851 			obj, redzone1, redzone2);
2852 }
2853 
2854 static void *cache_free_debugcheck(struct kmem_cache *cachep, void *objp,
2855 				   void *caller)
2856 {
2857 	struct page *page;
2858 	unsigned int objnr;
2859 	struct slab *slabp;
2860 
2861 	objp -= obj_offset(cachep);
2862 	kfree_debugcheck(objp);
2863 	page = virt_to_page(objp);
2864 
2865 	slabp = page_get_slab(page);
2866 
2867 	if (cachep->flags & SLAB_RED_ZONE) {
2868 		verify_redzone_free(cachep, objp);
2869 		*dbg_redzone1(cachep, objp) = RED_INACTIVE;
2870 		*dbg_redzone2(cachep, objp) = RED_INACTIVE;
2871 	}
2872 	if (cachep->flags & SLAB_STORE_USER)
2873 		*dbg_userword(cachep, objp) = caller;
2874 
2875 	objnr = obj_to_index(cachep, slabp, objp);
2876 
2877 	BUG_ON(objnr >= cachep->num);
2878 	BUG_ON(objp != index_to_obj(cachep, slabp, objnr));
2879 
2880 	if (cachep->flags & SLAB_DEBUG_INITIAL) {
2881 		/*
2882 		 * Need to call the slab's constructor so the caller can
2883 		 * perform a verify of its state (debugging).  Called without
2884 		 * the cache-lock held.
2885 		 */
2886 		cachep->ctor(objp + obj_offset(cachep),
2887 			     cachep, SLAB_CTOR_CONSTRUCTOR | SLAB_CTOR_VERIFY);
2888 	}
2889 	if (cachep->flags & SLAB_POISON && cachep->dtor) {
2890 		/* We want to cache-poison the object, so
2891 		 * call the destruction callback first.
2892 		 */
2893 		cachep->dtor(objp + obj_offset(cachep), cachep, 0);
2894 	}
2895 #ifdef CONFIG_DEBUG_SLAB_LEAK
2896 	slab_bufctl(slabp)[objnr] = BUFCTL_FREE;
2897 #endif
2898 	if (cachep->flags & SLAB_POISON) {
2899 #ifdef CONFIG_DEBUG_PAGEALLOC
2900 		if ((cachep->buffer_size % PAGE_SIZE)==0 && OFF_SLAB(cachep)) {
2901 			store_stackinfo(cachep, objp, (unsigned long)caller);
2902 			kernel_map_pages(virt_to_page(objp),
2903 					 cachep->buffer_size / PAGE_SIZE, 0);
2904 		} else {
2905 			poison_obj(cachep, objp, POISON_FREE);
2906 		}
2907 #else
2908 		poison_obj(cachep, objp, POISON_FREE);
2909 #endif
2910 	}
2911 	return objp;
2912 }
2913 
2914 static void check_slabp(struct kmem_cache *cachep, struct slab *slabp)
2915 {
2916 	kmem_bufctl_t i;
2917 	int entries = 0;
2918 
2919 	/* Check slab's freelist to see if this obj is there. */
2920 	for (i = slabp->free; i != BUFCTL_END; i = slab_bufctl(slabp)[i]) {
2921 		entries++;
2922 		if (entries > cachep->num || i >= cachep->num)
2923 			goto bad;
2924 	}
2925 	if (entries != cachep->num - slabp->inuse) {
2926 bad:
2927 		printk(KERN_ERR "slab: Internal list corruption detected in "
2928 				"cache '%s'(%d), slabp %p(%d). Hexdump:\n",
2929 			cachep->name, cachep->num, slabp, slabp->inuse);
2930 		for (i = 0;
2931 		     i < sizeof(*slabp) + cachep->num * sizeof(kmem_bufctl_t);
2932 		     i++) {
2933 			if (i % 16 == 0)
2934 				printk("\n%03x:", i);
2935 			printk(" %02x", ((unsigned char *)slabp)[i]);
2936 		}
2937 		printk("\n");
2938 		BUG();
2939 	}
2940 }
2941 #else
2942 #define kfree_debugcheck(x) do { } while(0)
2943 #define cache_free_debugcheck(x,objp,z) (objp)
2944 #define check_slabp(x,y) do { } while(0)
2945 #endif
2946 
2947 static void *cache_alloc_refill(struct kmem_cache *cachep, gfp_t flags)
2948 {
2949 	int batchcount;
2950 	struct kmem_list3 *l3;
2951 	struct array_cache *ac;
2952 	int node;
2953 
2954 	node = numa_node_id();
2955 
2956 	check_irq_off();
2957 	ac = cpu_cache_get(cachep);
2958 retry:
2959 	batchcount = ac->batchcount;
2960 	if (!ac->touched && batchcount > BATCHREFILL_LIMIT) {
2961 		/*
2962 		 * If there was little recent activity on this cache, then
2963 		 * perform only a partial refill.  Otherwise we could generate
2964 		 * refill bouncing.
2965 		 */
2966 		batchcount = BATCHREFILL_LIMIT;
2967 	}
2968 	l3 = cachep->nodelists[node];
2969 
2970 	BUG_ON(ac->avail > 0 || !l3);
2971 	spin_lock(&l3->list_lock);
2972 
2973 	/* See if we can refill from the shared array */
2974 	if (l3->shared && transfer_objects(ac, l3->shared, batchcount))
2975 		goto alloc_done;
2976 
2977 	while (batchcount > 0) {
2978 		struct list_head *entry;
2979 		struct slab *slabp;
2980 		/* Get the slab the allocation is to come from. */
2981 		entry = l3->slabs_partial.next;
2982 		if (entry == &l3->slabs_partial) {
2983 			l3->free_touched = 1;
2984 			entry = l3->slabs_free.next;
2985 			if (entry == &l3->slabs_free)
2986 				goto must_grow;
2987 		}
2988 
2989 		slabp = list_entry(entry, struct slab, list);
2990 		check_slabp(cachep, slabp);
2991 		check_spinlock_acquired(cachep);
2992 		while (slabp->inuse < cachep->num && batchcount--) {
2993 			STATS_INC_ALLOCED(cachep);
2994 			STATS_INC_ACTIVE(cachep);
2995 			STATS_SET_HIGH(cachep);
2996 
2997 			ac->entry[ac->avail++] = slab_get_obj(cachep, slabp,
2998 							    node);
2999 		}
3000 		check_slabp(cachep, slabp);
3001 
3002 		/* move slabp to correct slabp list: */
3003 		list_del(&slabp->list);
3004 		if (slabp->free == BUFCTL_END)
3005 			list_add(&slabp->list, &l3->slabs_full);
3006 		else
3007 			list_add(&slabp->list, &l3->slabs_partial);
3008 	}
3009 
3010 must_grow:
3011 	l3->free_objects -= ac->avail;
3012 alloc_done:
3013 	spin_unlock(&l3->list_lock);
3014 
3015 	if (unlikely(!ac->avail)) {
3016 		int x;
3017 		x = cache_grow(cachep, flags | GFP_THISNODE, node, NULL);
3018 
3019 		/* cache_grow can reenable interrupts, then ac could change. */
3020 		ac = cpu_cache_get(cachep);
3021 		if (!x && ac->avail == 0)	/* no objects in sight? abort */
3022 			return NULL;
3023 
3024 		if (!ac->avail)		/* objects refilled by interrupt? */
3025 			goto retry;
3026 	}
3027 	ac->touched = 1;
3028 	return ac->entry[--ac->avail];
3029 }
3030 
3031 static inline void cache_alloc_debugcheck_before(struct kmem_cache *cachep,
3032 						gfp_t flags)
3033 {
3034 	might_sleep_if(flags & __GFP_WAIT);
3035 #if DEBUG
3036 	kmem_flagcheck(cachep, flags);
3037 #endif
3038 }
3039 
3040 #if DEBUG
3041 static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep,
3042 				gfp_t flags, void *objp, void *caller)
3043 {
3044 	if (!objp)
3045 		return objp;
3046 	if (cachep->flags & SLAB_POISON) {
3047 #ifdef CONFIG_DEBUG_PAGEALLOC
3048 		if ((cachep->buffer_size % PAGE_SIZE) == 0 && OFF_SLAB(cachep))
3049 			kernel_map_pages(virt_to_page(objp),
3050 					 cachep->buffer_size / PAGE_SIZE, 1);
3051 		else
3052 			check_poison_obj(cachep, objp);
3053 #else
3054 		check_poison_obj(cachep, objp);
3055 #endif
3056 		poison_obj(cachep, objp, POISON_INUSE);
3057 	}
3058 	if (cachep->flags & SLAB_STORE_USER)
3059 		*dbg_userword(cachep, objp) = caller;
3060 
3061 	if (cachep->flags & SLAB_RED_ZONE) {
3062 		if (*dbg_redzone1(cachep, objp) != RED_INACTIVE ||
3063 				*dbg_redzone2(cachep, objp) != RED_INACTIVE) {
3064 			slab_error(cachep, "double free, or memory outside"
3065 						" object was overwritten");
3066 			printk(KERN_ERR
3067 				"%p: redzone 1:0x%lx, redzone 2:0x%lx\n",
3068 				objp, *dbg_redzone1(cachep, objp),
3069 				*dbg_redzone2(cachep, objp));
3070 		}
3071 		*dbg_redzone1(cachep, objp) = RED_ACTIVE;
3072 		*dbg_redzone2(cachep, objp) = RED_ACTIVE;
3073 	}
3074 #ifdef CONFIG_DEBUG_SLAB_LEAK
3075 	{
3076 		struct slab *slabp;
3077 		unsigned objnr;
3078 
3079 		slabp = page_get_slab(virt_to_page(objp));
3080 		objnr = (unsigned)(objp - slabp->s_mem) / cachep->buffer_size;
3081 		slab_bufctl(slabp)[objnr] = BUFCTL_ACTIVE;
3082 	}
3083 #endif
3084 	objp += obj_offset(cachep);
3085 	if (cachep->ctor && cachep->flags & SLAB_POISON) {
3086 		unsigned long ctor_flags = SLAB_CTOR_CONSTRUCTOR;
3087 
3088 		if (!(flags & __GFP_WAIT))
3089 			ctor_flags |= SLAB_CTOR_ATOMIC;
3090 
3091 		cachep->ctor(objp, cachep, ctor_flags);
3092 	}
3093 #if ARCH_SLAB_MINALIGN
3094 	if ((u32)objp & (ARCH_SLAB_MINALIGN-1)) {
3095 		printk(KERN_ERR "0x%p: not aligned to ARCH_SLAB_MINALIGN=%d\n",
3096 		       objp, ARCH_SLAB_MINALIGN);
3097 	}
3098 #endif
3099 	return objp;
3100 }
3101 #else
3102 #define cache_alloc_debugcheck_after(a,b,objp,d) (objp)
3103 #endif
3104 
3105 #ifdef CONFIG_FAILSLAB
3106 
3107 static struct failslab_attr {
3108 
3109 	struct fault_attr attr;
3110 
3111 	u32 ignore_gfp_wait;
3112 #ifdef CONFIG_FAULT_INJECTION_DEBUG_FS
3113 	struct dentry *ignore_gfp_wait_file;
3114 #endif
3115 
3116 } failslab = {
3117 	.attr = FAULT_ATTR_INITIALIZER,
3118 	.ignore_gfp_wait = 1,
3119 };
3120 
3121 static int __init setup_failslab(char *str)
3122 {
3123 	return setup_fault_attr(&failslab.attr, str);
3124 }
3125 __setup("failslab=", setup_failslab);
3126 
3127 static int should_failslab(struct kmem_cache *cachep, gfp_t flags)
3128 {
3129 	if (cachep == &cache_cache)
3130 		return 0;
3131 	if (flags & __GFP_NOFAIL)
3132 		return 0;
3133 	if (failslab.ignore_gfp_wait && (flags & __GFP_WAIT))
3134 		return 0;
3135 
3136 	return should_fail(&failslab.attr, obj_size(cachep));
3137 }
3138 
3139 #ifdef CONFIG_FAULT_INJECTION_DEBUG_FS
3140 
3141 static int __init failslab_debugfs(void)
3142 {
3143 	mode_t mode = S_IFREG | S_IRUSR | S_IWUSR;
3144 	struct dentry *dir;
3145 	int err;
3146 
3147 	err = init_fault_attr_dentries(&failslab.attr, "failslab");
3148 	if (err)
3149 		return err;
3150 	dir = failslab.attr.dentries.dir;
3151 
3152 	failslab.ignore_gfp_wait_file =
3153 		debugfs_create_bool("ignore-gfp-wait", mode, dir,
3154 				      &failslab.ignore_gfp_wait);
3155 
3156 	if (!failslab.ignore_gfp_wait_file) {
3157 		err = -ENOMEM;
3158 		debugfs_remove(failslab.ignore_gfp_wait_file);
3159 		cleanup_fault_attr_dentries(&failslab.attr);
3160 	}
3161 
3162 	return err;
3163 }
3164 
3165 late_initcall(failslab_debugfs);
3166 
3167 #endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */
3168 
3169 #else /* CONFIG_FAILSLAB */
3170 
3171 static inline int should_failslab(struct kmem_cache *cachep, gfp_t flags)
3172 {
3173 	return 0;
3174 }
3175 
3176 #endif /* CONFIG_FAILSLAB */
3177 
3178 static inline void *____cache_alloc(struct kmem_cache *cachep, gfp_t flags)
3179 {
3180 	void *objp;
3181 	struct array_cache *ac;
3182 
3183 	check_irq_off();
3184 
3185 	if (should_failslab(cachep, flags))
3186 		return NULL;
3187 
3188 	ac = cpu_cache_get(cachep);
3189 	if (likely(ac->avail)) {
3190 		STATS_INC_ALLOCHIT(cachep);
3191 		ac->touched = 1;
3192 		objp = ac->entry[--ac->avail];
3193 	} else {
3194 		STATS_INC_ALLOCMISS(cachep);
3195 		objp = cache_alloc_refill(cachep, flags);
3196 	}
3197 	return objp;
3198 }
3199 
3200 static __always_inline void *__cache_alloc(struct kmem_cache *cachep,
3201 						gfp_t flags, void *caller)
3202 {
3203 	unsigned long save_flags;
3204 	void *objp = NULL;
3205 
3206 	cache_alloc_debugcheck_before(cachep, flags);
3207 
3208 	local_irq_save(save_flags);
3209 
3210 	if (unlikely(NUMA_BUILD &&
3211 			current->flags & (PF_SPREAD_SLAB | PF_MEMPOLICY)))
3212 		objp = alternate_node_alloc(cachep, flags);
3213 
3214 	if (!objp)
3215 		objp = ____cache_alloc(cachep, flags);
3216 	/*
3217 	 * We may just have run out of memory on the local node.
3218 	 * ____cache_alloc_node() knows how to locate memory on other nodes
3219 	 */
3220 	if (NUMA_BUILD && !objp)
3221 		objp = ____cache_alloc_node(cachep, flags, numa_node_id());
3222 	local_irq_restore(save_flags);
3223 	objp = cache_alloc_debugcheck_after(cachep, flags, objp,
3224 					    caller);
3225 	prefetchw(objp);
3226 	return objp;
3227 }
3228 
3229 #ifdef CONFIG_NUMA
3230 /*
3231  * Try allocating on another node if PF_SPREAD_SLAB|PF_MEMPOLICY.
3232  *
3233  * If we are in_interrupt(), the current process context, including cpusets
3234  * and mempolicy, may not apply and should not be used for allocation policy.
3235  */
3236 static void *alternate_node_alloc(struct kmem_cache *cachep, gfp_t flags)
3237 {
3238 	int nid_alloc, nid_here;
3239 
3240 	if (in_interrupt() || (flags & __GFP_THISNODE))
3241 		return NULL;
3242 	nid_alloc = nid_here = numa_node_id();
3243 	if (cpuset_do_slab_mem_spread() && (cachep->flags & SLAB_MEM_SPREAD))
3244 		nid_alloc = cpuset_mem_spread_node();
3245 	else if (current->mempolicy)
3246 		nid_alloc = slab_node(current->mempolicy);
3247 	if (nid_alloc != nid_here)
3248 		return ____cache_alloc_node(cachep, flags, nid_alloc);
3249 	return NULL;
3250 }
3251 
3252 /*
3253  * Fallback function if there was no memory available and no objects on a
3254  * certain node and falling back is permitted. First we scan all the
3255  * available nodelists for available objects. If that fails then we
3256  * perform an allocation without specifying a node. This allows the page
3257  * allocator to do its reclaim / fallback magic. We then insert the
3258  * slab into the proper nodelist and then allocate from it.
3259  */
3260 void *fallback_alloc(struct kmem_cache *cache, gfp_t flags)
3261 {
3262 	struct zonelist *zonelist = &NODE_DATA(slab_node(current->mempolicy))
3263 					->node_zonelists[gfp_zone(flags)];
3264 	struct zone **z;
3265 	void *obj = NULL;
3266 	int nid;
3267 	gfp_t local_flags = (flags & GFP_LEVEL_MASK);
3268 
3269 retry:
3270 	/*
3271 	 * Look through allowed nodes for objects available
3272 	 * from existing per node queues.
3273 	 */
3274 	for (z = zonelist->zones; *z && !obj; z++) {
3275 		nid = zone_to_nid(*z);
3276 
3277 		if (cpuset_zone_allowed_hardwall(*z, flags) &&
3278 			cache->nodelists[nid] &&
3279 			cache->nodelists[nid]->free_objects)
3280 				obj = ____cache_alloc_node(cache,
3281 					flags | GFP_THISNODE, nid);
3282 	}
3283 
3284 	if (!obj && !(flags & __GFP_NO_GROW)) {
3285 		/*
3286 		 * This allocation will be performed within the constraints
3287 		 * of the current cpuset / memory policy requirements.
3288 		 * We may trigger various forms of reclaim on the allowed
3289 		 * set and go into memory reserves if necessary.
3290 		 */
3291 		if (local_flags & __GFP_WAIT)
3292 			local_irq_enable();
3293 		kmem_flagcheck(cache, flags);
3294 		obj = kmem_getpages(cache, flags, -1);
3295 		if (local_flags & __GFP_WAIT)
3296 			local_irq_disable();
3297 		if (obj) {
3298 			/*
3299 			 * Insert into the appropriate per node queues
3300 			 */
3301 			nid = page_to_nid(virt_to_page(obj));
3302 			if (cache_grow(cache, flags, nid, obj)) {
3303 				obj = ____cache_alloc_node(cache,
3304 					flags | GFP_THISNODE, nid);
3305 				if (!obj)
3306 					/*
3307 					 * Another processor may allocate the
3308 					 * objects in the slab since we are
3309 					 * not holding any locks.
3310 					 */
3311 					goto retry;
3312 			} else {
3313 				/* cache_grow already freed obj */
3314 				obj = NULL;
3315 			}
3316 		}
3317 	}
3318 	return obj;
3319 }
3320 
3321 /*
3322  * An interface to enable slab creation on nodeid
3323  */
3324 static void *____cache_alloc_node(struct kmem_cache *cachep, gfp_t flags,
3325 				int nodeid)
3326 {
3327 	struct list_head *entry;
3328 	struct slab *slabp;
3329 	struct kmem_list3 *l3;
3330 	void *obj;
3331 	int x;
3332 
3333 	l3 = cachep->nodelists[nodeid];
3334 	BUG_ON(!l3);
3335 
3336 retry:
3337 	check_irq_off();
3338 	spin_lock(&l3->list_lock);
3339 	entry = l3->slabs_partial.next;
3340 	if (entry == &l3->slabs_partial) {
3341 		l3->free_touched = 1;
3342 		entry = l3->slabs_free.next;
3343 		if (entry == &l3->slabs_free)
3344 			goto must_grow;
3345 	}
3346 
3347 	slabp = list_entry(entry, struct slab, list);
3348 	check_spinlock_acquired_node(cachep, nodeid);
3349 	check_slabp(cachep, slabp);
3350 
3351 	STATS_INC_NODEALLOCS(cachep);
3352 	STATS_INC_ACTIVE(cachep);
3353 	STATS_SET_HIGH(cachep);
3354 
3355 	BUG_ON(slabp->inuse == cachep->num);
3356 
3357 	obj = slab_get_obj(cachep, slabp, nodeid);
3358 	check_slabp(cachep, slabp);
3359 	l3->free_objects--;
3360 	/* move slabp to correct slabp list: */
3361 	list_del(&slabp->list);
3362 
3363 	if (slabp->free == BUFCTL_END)
3364 		list_add(&slabp->list, &l3->slabs_full);
3365 	else
3366 		list_add(&slabp->list, &l3->slabs_partial);
3367 
3368 	spin_unlock(&l3->list_lock);
3369 	goto done;
3370 
3371 must_grow:
3372 	spin_unlock(&l3->list_lock);
3373 	x = cache_grow(cachep, flags | GFP_THISNODE, nodeid, NULL);
3374 	if (x)
3375 		goto retry;
3376 
3377 	if (!(flags & __GFP_THISNODE))
3378 		/* Unable to grow the cache. Fall back to other nodes. */
3379 		return fallback_alloc(cachep, flags);
3380 
3381 	return NULL;
3382 
3383 done:
3384 	return obj;
3385 }
3386 #endif
3387 
3388 /*
3389  * Caller needs to acquire the correct kmem_list3's list_lock
3390  */
3391 static void free_block(struct kmem_cache *cachep, void **objpp, int nr_objects,
3392 		       int node)
3393 {
3394 	int i;
3395 	struct kmem_list3 *l3;
3396 
3397 	for (i = 0; i < nr_objects; i++) {
3398 		void *objp = objpp[i];
3399 		struct slab *slabp;
3400 
3401 		slabp = virt_to_slab(objp);
3402 		l3 = cachep->nodelists[node];
3403 		list_del(&slabp->list);
3404 		check_spinlock_acquired_node(cachep, node);
3405 		check_slabp(cachep, slabp);
3406 		slab_put_obj(cachep, slabp, objp, node);
3407 		STATS_DEC_ACTIVE(cachep);
3408 		l3->free_objects++;
3409 		check_slabp(cachep, slabp);
3410 
3411 		/* fixup slab chains */
3412 		if (slabp->inuse == 0) {
3413 			if (l3->free_objects > l3->free_limit) {
3414 				l3->free_objects -= cachep->num;
3415 				/* No need to drop any previously held
3416 				 * lock here; even if we have an off-slab slab
3417 				 * descriptor, it is guaranteed to come from
3418 				 * a different cache - refer to the comments
3419 				 * before alloc_slabmgmt.
3420 				 */
3421 				slab_destroy(cachep, slabp);
3422 			} else {
3423 				list_add(&slabp->list, &l3->slabs_free);
3424 			}
3425 		} else {
3426 			/* Unconditionally move a slab to the end of the
3427 			 * partial list on free - this gives the other
3428 			 * objects in the slab the maximum time to be freed, too.
3429 			 */
3430 			list_add_tail(&slabp->list, &l3->slabs_partial);
3431 		}
3432 	}
3433 }
3434 
3435 static void cache_flusharray(struct kmem_cache *cachep, struct array_cache *ac)
3436 {
3437 	int batchcount;
3438 	struct kmem_list3 *l3;
3439 	int node = numa_node_id();
3440 
3441 	batchcount = ac->batchcount;
3442 #if DEBUG
3443 	BUG_ON(!batchcount || batchcount > ac->avail);
3444 #endif
3445 	check_irq_off();
3446 	l3 = cachep->nodelists[node];
3447 	spin_lock(&l3->list_lock);
3448 	if (l3->shared) {
3449 		struct array_cache *shared_array = l3->shared;
3450 		int max = shared_array->limit - shared_array->avail;
3451 		if (max) {
3452 			if (batchcount > max)
3453 				batchcount = max;
3454 			memcpy(&(shared_array->entry[shared_array->avail]),
3455 			       ac->entry, sizeof(void *) * batchcount);
3456 			shared_array->avail += batchcount;
3457 			goto free_done;
3458 		}
3459 	}
3460 
3461 	free_block(cachep, ac->entry, batchcount, node);
3462 free_done:
3463 #if STATS
3464 	{
3465 		int i = 0;
3466 		struct list_head *p;
3467 
3468 		p = l3->slabs_free.next;
3469 		while (p != &(l3->slabs_free)) {
3470 			struct slab *slabp;
3471 
3472 			slabp = list_entry(p, struct slab, list);
3473 			BUG_ON(slabp->inuse);
3474 
3475 			i++;
3476 			p = p->next;
3477 		}
3478 		STATS_SET_FREEABLE(cachep, i);
3479 	}
3480 #endif
3481 	spin_unlock(&l3->list_lock);
3482 	ac->avail -= batchcount;
3483 	memmove(ac->entry, &(ac->entry[batchcount]), sizeof(void *)*ac->avail);
3484 }
3485 
3486 /*
3487  * Release an obj back to its cache. If the obj has a constructed state, it must
3488  * be in this state _before_ it is released.  Called with interrupts disabled.
3489  */
3490 static inline void __cache_free(struct kmem_cache *cachep, void *objp)
3491 {
3492 	struct array_cache *ac = cpu_cache_get(cachep);
3493 
3494 	check_irq_off();
3495 	objp = cache_free_debugcheck(cachep, objp, __builtin_return_address(0));
3496 
3497 	if (cache_free_alien(cachep, objp))
3498 		return;
3499 
3500 	if (likely(ac->avail < ac->limit)) {
3501 		STATS_INC_FREEHIT(cachep);
3502 		ac->entry[ac->avail++] = objp;
3503 		return;
3504 	} else {
3505 		STATS_INC_FREEMISS(cachep);
3506 		cache_flusharray(cachep, ac);
3507 		ac->entry[ac->avail++] = objp;
3508 	}
3509 }
3510 
3511 /**
3512  * kmem_cache_alloc - Allocate an object
3513  * @cachep: The cache to allocate from.
3514  * @flags: See kmalloc().
3515  *
3516  * Allocate an object from this cache.  The flags are only relevant
3517  * if the cache has no available objects.
3518  */
3519 void *kmem_cache_alloc(struct kmem_cache *cachep, gfp_t flags)
3520 {
3521 	return __cache_alloc(cachep, flags, __builtin_return_address(0));
3522 }
3523 EXPORT_SYMBOL(kmem_cache_alloc);
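
/*
 * Editor's usage sketch (hypothetical cache from the earlier example):
 */
#if 0	/* illustration only */
static int widget_demo(void)
{
	struct my_widget *w;

	w = kmem_cache_alloc(widget_cache, GFP_KERNEL);
	if (!w)
		return -ENOMEM;
	/* ... use w, then hand it back in its constructed state ... */
	kmem_cache_free(widget_cache, w);
	return 0;
}
#endif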
3524 
3525 /**
3526  * kmem_cache_zalloc - Allocate an object. The memory is set to zero.
3527  * @cache: The cache to allocate from.
3528  * @flags: See kmalloc().
3529  *
3530  * Allocate an object from this cache and set the allocated memory to zero.
3531  * The flags are only relevant if the cache has no available objects.
3532  */
3533 void *kmem_cache_zalloc(struct kmem_cache *cache, gfp_t flags)
3534 {
3535 	void *ret = __cache_alloc(cache, flags, __builtin_return_address(0));
3536 	if (ret)
3537 		memset(ret, 0, obj_size(cache));
3538 	return ret;
3539 }
3540 EXPORT_SYMBOL(kmem_cache_zalloc);
3541 
3542 /**
3543  * kmem_ptr_validate - check if an untrusted pointer might be a slab entry
3544  *
3545  * @cachep: the cache we're checking against
3546  * @ptr: pointer to validate
3547  *
3548  * This verifies that the untrusted pointer looks sane:
3549  * it is _not_ a guarantee that the pointer is actually
3550  * part of the slab cache in question, but it at least
3551  * validates that the pointer can be dereferenced and
3552  * looks half-way sane.
3553  *
3554  * Currently only used for dentry validation.
3555  */
3556 int kmem_ptr_validate(struct kmem_cache *cachep, const void *ptr)
3557 {
3558 	unsigned long addr = (unsigned long)ptr;
3559 	unsigned long min_addr = PAGE_OFFSET;
3560 	unsigned long align_mask = BYTES_PER_WORD - 1;
3561 	unsigned long size = cachep->buffer_size;
3562 	struct page *page;
3563 
3564 	if (unlikely(addr < min_addr))
3565 		goto out;
3566 	if (unlikely(addr > (unsigned long)high_memory - size))
3567 		goto out;
3568 	if (unlikely(addr & align_mask))
3569 		goto out;
3570 	if (unlikely(!kern_addr_valid(addr)))
3571 		goto out;
3572 	if (unlikely(!kern_addr_valid(addr + size - 1)))
3573 		goto out;
3574 	page = virt_to_page(ptr);
3575 	if (unlikely(!PageSlab(page)))
3576 		goto out;
3577 	if (unlikely(page_get_cache(page) != cachep))
3578 		goto out;
3579 	return 1;
3580 out:
3581 	return 0;
3582 }
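
/*
 * Editor's usage sketch (hypothetical names): as noted above this is only a
 * sanity filter, not a guarantee of cache membership.
 */
#if 0	/* illustration only */
static struct my_widget *widget_check(void *ptr)
{
	if (!kmem_ptr_validate(widget_cache, ptr))
		return NULL;	/* cannot possibly be a live widget */
	return (struct my_widget *)ptr;
}
#endif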
3583 
3584 #ifdef CONFIG_NUMA
3585 /**
3586  * kmem_cache_alloc_node - Allocate an object on the specified node
3587  * @cachep: The cache to allocate from.
3588  * @flags: See kmalloc().
3589  * @nodeid: node number of the target node.
3590  * @caller: return address of caller, used for debug information
3591  *
3592  * Identical to kmem_cache_alloc but it will allocate memory on the given
3593  * node, which can improve the performance for cpu bound structures.
3594  *
3595  * Fallback to other node is possible if __GFP_THISNODE is not set.
3596  */
3597 static __always_inline void *
3598 __cache_alloc_node(struct kmem_cache *cachep, gfp_t flags,
3599 		int nodeid, void *caller)
3600 {
3601 	unsigned long save_flags;
3602 	void *ptr = NULL;
3603 
3604 	cache_alloc_debugcheck_before(cachep, flags);
3605 	local_irq_save(save_flags);
3606 
3607 	if (unlikely(nodeid == -1))
3608 		nodeid = numa_node_id();
3609 
3610 	if (likely(cachep->nodelists[nodeid])) {
3611 		if (nodeid == numa_node_id()) {
3612 			/*
3613 			 * Use the locally cached objects if possible.
3614 			 * However ____cache_alloc does not allow fallback
3615 			 * to other nodes. It may fail even while objects
3616 			 * are still available on other nodes.
3617 			 */
3618 			ptr = ____cache_alloc(cachep, flags);
3619 		}
3620 		if (!ptr) {
3621 			/* ____cache_alloc_node can fall back to other nodes */
3622 			ptr = ____cache_alloc_node(cachep, flags, nodeid);
3623 		}
3624 	} else {
3625 		/* Node not bootstrapped yet */
3626 		if (!(flags & __GFP_THISNODE))
3627 			ptr = fallback_alloc(cachep, flags);
3628 	}
3629 
3630 	local_irq_restore(save_flags);
3631 	ptr = cache_alloc_debugcheck_after(cachep, flags, ptr, caller);
3632 
3633 	return ptr;
3634 }
3635 
3636 void *kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid)
3637 {
3638 	return __cache_alloc_node(cachep, flags, nodeid,
3639 			__builtin_return_address(0));
3640 }
3641 EXPORT_SYMBOL(kmem_cache_alloc_node);
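
/*
 * Editor's note: illustrative sketch, not original source. Placing
 * an object on the node of the cpu that will use it; passing -1 as
 * nodeid means "the current node":
 *
 *	obj = kmem_cache_alloc_node(cachep, GFP_KERNEL,
 *				    cpu_to_node(target_cpu));
 */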
3642 
3643 static __always_inline void *
3644 __do_kmalloc_node(size_t size, gfp_t flags, int node, void *caller)
3645 {
3646 	struct kmem_cache *cachep;
3647 
3648 	cachep = kmem_find_general_cachep(size, flags);
3649 	if (unlikely(cachep == NULL))
3650 		return NULL;
3651 	return kmem_cache_alloc_node(cachep, flags, node);
3652 }
3653 
3654 #ifdef CONFIG_DEBUG_SLAB
3655 void *__kmalloc_node(size_t size, gfp_t flags, int node)
3656 {
3657 	return __do_kmalloc_node(size, flags, node,
3658 			__builtin_return_address(0));
3659 }
3660 EXPORT_SYMBOL(__kmalloc_node);
3661 
3662 void *__kmalloc_node_track_caller(size_t size, gfp_t flags,
3663 		int node, void *caller)
3664 {
3665 	return __do_kmalloc_node(size, flags, node, caller);
3666 }
3667 EXPORT_SYMBOL(__kmalloc_node_track_caller);
3668 #else
3669 void *__kmalloc_node(size_t size, gfp_t flags, int node)
3670 {
3671 	return __do_kmalloc_node(size, flags, node, NULL);
3672 }
3673 EXPORT_SYMBOL(__kmalloc_node);
3674 #endif /* CONFIG_DEBUG_SLAB */
3675 #endif /* CONFIG_NUMA */
3676 
3677 /**
3678  * __do_kmalloc - allocate memory
3679  * @size: how many bytes of memory are required.
3680  * @flags: the type of memory to allocate (see kmalloc).
3681  * @caller: return address of the caller, used for debug tracking
3682  */
3683 static __always_inline void *__do_kmalloc(size_t size, gfp_t flags,
3684 					  void *caller)
3685 {
3686 	struct kmem_cache *cachep;
3687 
3688 	/*
3689 	 * If you want to save a few bytes of .text space: replace
3690 	 * __ with kmem_. Then kmalloc uses the uninlined functions
3691 	 * instead of the inline functions.
3692 	 */
3693 	cachep = __find_general_cachep(size, flags);
3694 	if (unlikely(cachep == NULL))
3695 		return NULL;
3696 	return __cache_alloc(cachep, flags, caller);
3697 }
3698 
3699 
3700 #ifdef CONFIG_DEBUG_SLAB
3701 void *__kmalloc(size_t size, gfp_t flags)
3702 {
3703 	return __do_kmalloc(size, flags, __builtin_return_address(0));
3704 }
3705 EXPORT_SYMBOL(__kmalloc);
3706 
3707 void *__kmalloc_track_caller(size_t size, gfp_t flags, void *caller)
3708 {
3709 	return __do_kmalloc(size, flags, caller);
3710 }
3711 EXPORT_SYMBOL(__kmalloc_track_caller);
3712 
3713 #else
3714 void *__kmalloc(size_t size, gfp_t flags)
3715 {
3716 	return __do_kmalloc(size, flags, NULL);
3717 }
3718 EXPORT_SYMBOL(__kmalloc);
3719 #endif
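
/*
 * Editor's note: illustrative sketch, not original source. kmalloc()
 * resolves to __do_kmalloc() via the general caches, so a request is
 * rounded up to the nearest kmalloc cache size; on typical
 * configurations a 100-byte request is served from the 128-byte
 * cache:
 *
 *	char *buf = kmalloc(100, GFP_KERNEL);
 *	...
 *	kfree(buf);
 */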
3720 
3721 /**
3722  * kmem_cache_free - Deallocate an object
3723  * @cachep: The cache the allocation was from.
3724  * @objp: The previously allocated object.
3725  *
3726  * Free an object which was previously allocated from this
3727  * cache.
3728  */
3729 void kmem_cache_free(struct kmem_cache *cachep, void *objp)
3730 {
3731 	unsigned long flags;
3732 
3733 	BUG_ON(virt_to_cache(objp) != cachep);
3734 
3735 	local_irq_save(flags);
3736 	__cache_free(cachep, objp);
3737 	local_irq_restore(flags);
3738 }
3739 EXPORT_SYMBOL(kmem_cache_free);
3740 
3741 /**
3742  * kfree - free previously allocated memory
3743  * @objp: pointer returned by kmalloc.
3744  *
3745  * If @objp is NULL, no operation is performed.
3746  *
3747  * Don't free memory not originally allocated by kmalloc()
3748  * or you will run into trouble.
3749  */
3750 void kfree(const void *objp)
3751 {
3752 	struct kmem_cache *c;
3753 	unsigned long flags;
3754 
3755 	if (unlikely(!objp))
3756 		return;
3757 	local_irq_save(flags);
3758 	kfree_debugcheck(objp);
3759 	c = virt_to_cache(objp);
3760 	debug_check_no_locks_freed(objp, obj_size(c));
3761 	__cache_free(c, (void *)objp);
3762 	local_irq_restore(flags);
3763 }
3764 EXPORT_SYMBOL(kfree);
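
/*
 * Editor's note: illustrative sketch, not original source. Because
 * kfree(NULL) is a no-op, error paths may free unconditionally:
 *
 *	kfree(a);
 *	kfree(b);	(safe even if b was never allocated)
 */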
3765 
3766 unsigned int kmem_cache_size(struct kmem_cache *cachep)
3767 {
3768 	return obj_size(cachep);
3769 }
3770 EXPORT_SYMBOL(kmem_cache_size);
3771 
3772 const char *kmem_cache_name(struct kmem_cache *cachep)
3773 {
3774 	return cachep->name;
3775 }
3776 EXPORT_SYMBOL_GPL(kmem_cache_name);
3777 
3778 /*
3779  * This initializes kmem_list3 or resizes the various caches for all nodes.
3780  */
3781 static int alloc_kmemlist(struct kmem_cache *cachep)
3782 {
3783 	int node;
3784 	struct kmem_list3 *l3;
3785 	struct array_cache *new_shared;
3786 	struct array_cache **new_alien = NULL;
3787 
3788 	for_each_online_node(node) {
3789 
3790 		if (use_alien_caches) {
3791 			new_alien = alloc_alien_cache(node, cachep->limit);
3792 			if (!new_alien)
3793 				goto fail;
3794 		}
3795 
3796 		new_shared = alloc_arraycache(node,
3797 				cachep->shared * cachep->batchcount,
3798 				0xbaadf00d);
3799 		if (!new_shared) {
3800 			free_alien_cache(new_alien);
3801 			goto fail;
3802 		}
3803 
3804 		l3 = cachep->nodelists[node];
3805 		if (l3) {
3806 			struct array_cache *shared = l3->shared;
3807 
3808 			spin_lock_irq(&l3->list_lock);
3809 
3810 			if (shared)
3811 				free_block(cachep, shared->entry,
3812 						shared->avail, node);
3813 
3814 			l3->shared = new_shared;
3815 			if (!l3->alien) {
3816 				l3->alien = new_alien;
3817 				new_alien = NULL;
3818 			}
3819 			l3->free_limit = (1 + nr_cpus_node(node)) *
3820 					cachep->batchcount + cachep->num;
3821 			spin_unlock_irq(&l3->list_lock);
3822 			kfree(shared);
3823 			free_alien_cache(new_alien);
3824 			continue;
3825 		}
3826 		l3 = kmalloc_node(sizeof(struct kmem_list3), GFP_KERNEL, node);
3827 		if (!l3) {
3828 			free_alien_cache(new_alien);
3829 			kfree(new_shared);
3830 			goto fail;
3831 		}
3832 
3833 		kmem_list3_init(l3);
3834 		l3->next_reap = jiffies + REAPTIMEOUT_LIST3 +
3835 				((unsigned long)cachep) % REAPTIMEOUT_LIST3;
3836 		l3->shared = new_shared;
3837 		l3->alien = new_alien;
3838 		l3->free_limit = (1 + nr_cpus_node(node)) *
3839 					cachep->batchcount + cachep->num;
3840 		cachep->nodelists[node] = l3;
3841 	}
3842 	return 0;
3843 
3844 fail:
3845 	if (!cachep->next.next) {
3846 		/* Cache is not active yet. Roll back what we did */
3847 		node--;
3848 		while (node >= 0) {
3849 			if (cachep->nodelists[node]) {
3850 				l3 = cachep->nodelists[node];
3851 
3852 				kfree(l3->shared);
3853 				free_alien_cache(l3->alien);
3854 				kfree(l3);
3855 				cachep->nodelists[node] = NULL;
3856 			}
3857 			node--;
3858 		}
3859 	}
3860 	return -ENOMEM;
3861 }
3862 
3863 struct ccupdate_struct {
3864 	struct kmem_cache *cachep;
3865 	struct array_cache *new[NR_CPUS];
3866 };
3867 
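/*
 * Runs on every cpu via on_each_cpu(): swap this cpu's live
 * array_cache with the freshly allocated one in new->new[], leaving
 * the old array behind for the caller to drain and free.
 */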
3868 static void do_ccupdate_local(void *info)
3869 {
3870 	struct ccupdate_struct *new = info;
3871 	struct array_cache *old;
3872 
3873 	check_irq_off();
3874 	old = cpu_cache_get(new->cachep);
3875 
3876 	new->cachep->array[smp_processor_id()] = new->new[smp_processor_id()];
3877 	new->new[smp_processor_id()] = old;
3878 }
3879 
3880 /* Always called with the cache_chain_mutex held */
3881 static int do_tune_cpucache(struct kmem_cache *cachep, int limit,
3882 				int batchcount, int shared)
3883 {
3884 	struct ccupdate_struct *new;
3885 	int i;
3886 
3887 	new = kzalloc(sizeof(*new), GFP_KERNEL);
3888 	if (!new)
3889 		return -ENOMEM;
3890 
3891 	for_each_online_cpu(i) {
3892 		new->new[i] = alloc_arraycache(cpu_to_node(i), limit,
3893 						batchcount);
3894 		if (!new->new[i]) {
3895 			for (i--; i >= 0; i--)
3896 				kfree(new->new[i]);
3897 			kfree(new);
3898 			return -ENOMEM;
3899 		}
3900 	}
3901 	new->cachep = cachep;
3902 
3903 	on_each_cpu(do_ccupdate_local, (void *)new, 1, 1);
3904 
3905 	check_irq_on();
3906 	cachep->batchcount = batchcount;
3907 	cachep->limit = limit;
3908 	cachep->shared = shared;
3909 
3910 	for_each_online_cpu(i) {
3911 		struct array_cache *ccold = new->new[i];
3912 		if (!ccold)
3913 			continue;
3914 		spin_lock_irq(&cachep->nodelists[cpu_to_node(i)]->list_lock);
3915 		free_block(cachep, ccold->entry, ccold->avail, cpu_to_node(i));
3916 		spin_unlock_irq(&cachep->nodelists[cpu_to_node(i)]->list_lock);
3917 		kfree(ccold);
3918 	}
3919 	kfree(new);
3920 	return alloc_kmemlist(cachep);
3921 }
3922 
3923 /* Always called with the cache_chain_mutex held */
3924 static int enable_cpucache(struct kmem_cache *cachep)
3925 {
3926 	int err;
3927 	int limit, shared;
3928 
3929 	/*
3930 	 * The head array serves three purposes:
3931 	 * - create a LIFO ordering, i.e. return objects that are cache-warm
3932 	 * - reduce the number of spinlock operations.
3933 	 * - reduce the number of linked list operations on the slab and
3934 	 *   bufctl chains: array operations are cheaper.
3935 	 * The numbers below are guesses; ideally we should auto-tune as
3936 	 * described by Bonwick.
3937 	 */
3938 	if (cachep->buffer_size > 131072)
3939 		limit = 1;
3940 	else if (cachep->buffer_size > PAGE_SIZE)
3941 		limit = 8;
3942 	else if (cachep->buffer_size > 1024)
3943 		limit = 24;
3944 	else if (cachep->buffer_size > 256)
3945 		limit = 54;
3946 	else
3947 		limit = 120;
3948 
3949 	/*
3950 	 * CPU-bound tasks (e.g. network routing) can exhibit asymmetric
3951 	 * allocation behaviour: most allocs on one cpu, most frees on
3952 	 * another. Such workloads need efficient object passing between
3953 	 * cpus, which the shared array provides. The array replaces
3954 	 * Bonwick's magazine layer.
3955 	 * On uniprocessor it is functionally equivalent (but less
3956 	 * efficient) to a larger limit, so it is disabled by default.
3957 	 */
3958 	shared = 0;
3959 #ifdef CONFIG_SMP
3960 	if (cachep->buffer_size <= PAGE_SIZE)
3961 		shared = 8;
3962 #endif
3963 
3964 #if DEBUG
3965 	/*
3966 	 * With debugging enabled, a large batchcount leads to excessively
3967 	 * long periods with local interrupts disabled. Limit the batchcount.
3968 	 */
3969 	if (limit > 32)
3970 		limit = 32;
3971 #endif
3972 	err = do_tune_cpucache(cachep, limit, (limit + 1) / 2, shared);
3973 	if (err)
3974 		printk(KERN_ERR "enable_cpucache failed for %s, error %d.\n",
3975 		       cachep->name, -err);
3976 	return err;
3977 }
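
/*
 * Editor's note: worked example, not original source. For a cache
 * of 512-byte objects: 256 < 512 <= 1024, so limit = 54, the
 * batchcount passed to do_tune_cpucache() is (54 + 1) / 2 = 27,
 * and on SMP the shared factor is 8.
 */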
3978 
3979 /*
3980  * Drain an array if it contains any elements, taking the l3 lock
3981  * only if necessary. Note that the l3 list_lock also protects the
3982  * array_cache if drain_array() is used on the shared array.
3983  */
3984 void drain_array(struct kmem_cache *cachep, struct kmem_list3 *l3,
3985 			 struct array_cache *ac, int force, int node)
3986 {
3987 	int tofree;
3988 
3989 	if (!ac || !ac->avail)
3990 		return;
3991 	if (ac->touched && !force) {
3992 		ac->touched = 0;
3993 	} else {
3994 		spin_lock_irq(&l3->list_lock);
3995 		if (ac->avail) {
3996 			tofree = force ? ac->avail : (ac->limit + 4) / 5;
3997 			if (tofree > ac->avail)
3998 				tofree = (ac->avail + 1) / 2;
3999 			free_block(cachep, ac->entry, tofree, node);
4000 			ac->avail -= tofree;
4001 			memmove(ac->entry, &(ac->entry[tofree]),
4002 				sizeof(void *) * ac->avail);
4003 		}
4004 		spin_unlock_irq(&l3->list_lock);
4005 	}
4006 }
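
/*
 * Editor's note: worked example, not original source. With the
 * default limit of 120 and force == 0, tofree = (120 + 4) / 5 = 24
 * objects, about a fifth of the array per pass; if fewer than 24
 * are available, at most half of them are freed.
 */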
4007 
4008 /**
4009  * cache_reap - Reclaim memory from caches.
4010  * @unused: unused parameter
4011  *
4012  * Called from workqueue/eventd every few seconds.
4013  * Purpose:
4014  * - clear the per-cpu caches for this CPU.
4015  * - return freeable pages to the main free memory pool.
4016  *
4017  * If we cannot acquire the cache chain mutex then just give up - we'll try
4018  * again on the next iteration.
4019  */
4020 static void cache_reap(struct work_struct *unused)
4021 {
4022 	struct kmem_cache *searchp;
4023 	struct kmem_list3 *l3;
4024 	int node = numa_node_id();
4025 
4026 	if (!mutex_trylock(&cache_chain_mutex)) {
4027 		/* Give up. Set up the next iteration. */
4028 		schedule_delayed_work(&__get_cpu_var(reap_work),
4029 				      round_jiffies_relative(REAPTIMEOUT_CPUC));
4030 		return;
4031 	}
4032 
4033 	list_for_each_entry(searchp, &cache_chain, next) {
4034 		check_irq_on();
4035 
4036 		/*
4037 		 * We only take the l3 lock if absolutely necessary and we
4038 		 * have established with reasonable certainty that
4039 		 * we can do some work if the lock was obtained.
4040 		 */
4041 		l3 = searchp->nodelists[node];
4042 
4043 		reap_alien(searchp, l3);
4044 
4045 		drain_array(searchp, l3, cpu_cache_get(searchp), 0, node);
4046 
4047 		/*
4048 		 * These are racy checks but it does not matter
4049 		 * if we skip one check or scan twice.
4050 		 */
4051 		if (time_after(l3->next_reap, jiffies))
4052 			goto next;
4053 
4054 		l3->next_reap = jiffies + REAPTIMEOUT_LIST3;
4055 
4056 		drain_array(searchp, l3, l3->shared, 0, node);
4057 
4058 		if (l3->free_touched)
4059 			l3->free_touched = 0;
4060 		else {
4061 			int freed;
4062 
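			/*
			 * Free about a fifth of the freeable slabs:
			 * ceil(free_limit / (5 * num)) per pass.
			 */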
4063 			freed = drain_freelist(searchp, l3, (l3->free_limit +
4064 				5 * searchp->num - 1) / (5 * searchp->num));
4065 			STATS_ADD_REAPED(searchp, freed);
4066 		}
4067 next:
4068 		cond_resched();
4069 	}
4070 	check_irq_on();
4071 	mutex_unlock(&cache_chain_mutex);
4072 	next_reap_node();
4073 	refresh_cpu_vm_stats(smp_processor_id());
4074 	/* Set up the next iteration */
4075 	schedule_delayed_work(&__get_cpu_var(reap_work),
4076 		round_jiffies_relative(REAPTIMEOUT_CPUC));
4077 }
4078 
4079 #ifdef CONFIG_PROC_FS
4080 
4081 static void print_slabinfo_header(struct seq_file *m)
4082 {
4083 	/*
4084 	 * Output format version, so at least we can change it
4085 	 * without _too_ many complaints.
4086 	 */
4087 #if STATS
4088 	seq_puts(m, "slabinfo - version: 2.1 (statistics)\n");
4089 #else
4090 	seq_puts(m, "slabinfo - version: 2.1\n");
4091 #endif
4092 	seq_puts(m, "# name            <active_objs> <num_objs> <objsize> "
4093 		 "<objperslab> <pagesperslab>");
4094 	seq_puts(m, " : tunables <limit> <batchcount> <sharedfactor>");
4095 	seq_puts(m, " : slabdata <active_slabs> <num_slabs> <sharedavail>");
4096 #if STATS
4097 	seq_puts(m, " : globalstat <listallocs> <maxobjs> <grown> <reaped> "
4098 		 "<error> <maxfreeable> <nodeallocs> <remotefrees> <alienoverflow>");
4099 	seq_puts(m, " : cpustat <allochit> <allocmiss> <freehit> <freemiss>");
4100 #endif
4101 	seq_putc(m, '\n');
4102 }
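
/*
 * Editor's note: an illustrative line (made-up numbers) as emitted
 * by s_show() below, wrapped here for readability:
 *
 *	dentry_cache   1200   1740    132   29    1
 *	    : tunables  120   60    8 : slabdata     60     60      0
 */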
4103 
4104 static void *s_start(struct seq_file *m, loff_t *pos)
4105 {
4106 	loff_t n = *pos;
4107 	struct list_head *p;
4108 
4109 	mutex_lock(&cache_chain_mutex);
4110 	if (!n)
4111 		print_slabinfo_header(m);
4112 	p = cache_chain.next;
4113 	while (n--) {
4114 		p = p->next;
4115 		if (p == &cache_chain)
4116 			return NULL;
4117 	}
4118 	return list_entry(p, struct kmem_cache, next);
4119 }
4120 
4121 static void *s_next(struct seq_file *m, void *p, loff_t *pos)
4122 {
4123 	struct kmem_cache *cachep = p;
4124 	++*pos;
4125 	return cachep->next.next == &cache_chain ?
4126 		NULL : list_entry(cachep->next.next, struct kmem_cache, next);
4127 }
4128 
4129 static void s_stop(struct seq_file *m, void *p)
4130 {
4131 	mutex_unlock(&cache_chain_mutex);
4132 }
4133 
4134 static int s_show(struct seq_file *m, void *p)
4135 {
4136 	struct kmem_cache *cachep = p;
4137 	struct slab *slabp;
4138 	unsigned long active_objs;
4139 	unsigned long num_objs;
4140 	unsigned long active_slabs = 0;
4141 	unsigned long num_slabs, free_objects = 0, shared_avail = 0;
4142 	const char *name;
4143 	char *error = NULL;
4144 	int node;
4145 	struct kmem_list3 *l3;
4146 
4147 	active_objs = 0;
4148 	num_slabs = 0;
4149 	for_each_online_node(node) {
4150 		l3 = cachep->nodelists[node];
4151 		if (!l3)
4152 			continue;
4153 
4154 		check_irq_on();
4155 		spin_lock_irq(&l3->list_lock);
4156 
4157 		list_for_each_entry(slabp, &l3->slabs_full, list) {
4158 			if (slabp->inuse != cachep->num && !error)
4159 				error = "slabs_full accounting error";
4160 			active_objs += cachep->num;
4161 			active_slabs++;
4162 		}
4163 		list_for_each_entry(slabp, &l3->slabs_partial, list) {
4164 			if (slabp->inuse == cachep->num && !error)
4165 				error = "slabs_partial inuse accounting error";
4166 			if (!slabp->inuse && !error)
4167 				error = "slabs_partial/inuse accounting error";
4168 			active_objs += slabp->inuse;
4169 			active_slabs++;
4170 		}
4171 		list_for_each_entry(slabp, &l3->slabs_free, list) {
4172 			if (slabp->inuse && !error)
4173 				error = "slabs_free/inuse accounting error";
4174 			num_slabs++;
4175 		}
4176 		free_objects += l3->free_objects;
4177 		if (l3->shared)
4178 			shared_avail += l3->shared->avail;
4179 
4180 		spin_unlock_irq(&l3->list_lock);
4181 	}
4182 	num_slabs += active_slabs;
4183 	num_objs = num_slabs * cachep->num;
4184 	if (num_objs - active_objs != free_objects && !error)
4185 		error = "free_objects accounting error";
4186 
4187 	name = cachep->name;
4188 	if (error)
4189 		printk(KERN_ERR "slab: cache %s error: %s\n", name, error);
4190 
4191 	seq_printf(m, "%-17s %6lu %6lu %6u %4u %4d",
4192 		   name, active_objs, num_objs, cachep->buffer_size,
4193 		   cachep->num, (1 << cachep->gfporder));
4194 	seq_printf(m, " : tunables %4u %4u %4u",
4195 		   cachep->limit, cachep->batchcount, cachep->shared);
4196 	seq_printf(m, " : slabdata %6lu %6lu %6lu",
4197 		   active_slabs, num_slabs, shared_avail);
4198 #if STATS
4199 	{			/* list3 stats */
4200 		unsigned long high = cachep->high_mark;
4201 		unsigned long allocs = cachep->num_allocations;
4202 		unsigned long grown = cachep->grown;
4203 		unsigned long reaped = cachep->reaped;
4204 		unsigned long errors = cachep->errors;
4205 		unsigned long max_freeable = cachep->max_freeable;
4206 		unsigned long node_allocs = cachep->node_allocs;
4207 		unsigned long node_frees = cachep->node_frees;
4208 		unsigned long overflows = cachep->node_overflow;
4209 
4210 		seq_printf(m, " : globalstat %7lu %6lu %5lu %4lu "
4211 				"%4lu %4lu %4lu %4lu %4lu",
4212 				allocs, high, grown, reaped, errors,
4213 				max_freeable, node_allocs, node_frees, overflows);
4214 	}
4215 	/* cpu stats */
4216 	{
4217 		unsigned long allochit = atomic_read(&cachep->allochit);
4218 		unsigned long allocmiss = atomic_read(&cachep->allocmiss);
4219 		unsigned long freehit = atomic_read(&cachep->freehit);
4220 		unsigned long freemiss = atomic_read(&cachep->freemiss);
4221 
4222 		seq_printf(m, " : cpustat %6lu %6lu %6lu %6lu",
4223 			   allochit, allocmiss, freehit, freemiss);
4224 	}
4225 #endif
4226 	seq_putc(m, '\n');
4227 	return 0;
4228 }
4229 
4230 /*
4231  * slabinfo_op - iterator that generates /proc/slabinfo
4232  *
4233  * Output layout:
4234  * cache-name
4235  * num-active-objs
4236  * total-objs
4237  * object size
4238  * num-objs-per-slab
4239  * num-pages-per-slab
4240  * num-active-slabs and total-slabs (in the slabdata section)
4241  * + further values on SMP and with statistics enabled
4242  */
4243 
4244 const struct seq_operations slabinfo_op = {
4245 	.start = s_start,
4246 	.next = s_next,
4247 	.stop = s_stop,
4248 	.show = s_show,
4249 };
4250 
4251 #define MAX_SLABINFO_WRITE 128
4252 /**
4253  * slabinfo_write - Tuning for the slab allocator
4254  * @file: unused
4255  * @buffer: user buffer
4256  * @count: data length
4257  * @ppos: unused
4258  */
4259 ssize_t slabinfo_write(struct file *file, const char __user * buffer,
4260 		       size_t count, loff_t *ppos)
4261 {
4262 	char kbuf[MAX_SLABINFO_WRITE + 1], *tmp;
4263 	int limit, batchcount, shared, res;
4264 	struct kmem_cache *cachep;
4265 
4266 	if (count > MAX_SLABINFO_WRITE)
4267 		return -EINVAL;
4268 	if (copy_from_user(&kbuf, buffer, count))
4269 		return -EFAULT;
4270 	kbuf[MAX_SLABINFO_WRITE] = '\0';
4271 
4272 	tmp = strchr(kbuf, ' ');
4273 	if (!tmp)
4274 		return -EINVAL;
4275 	*tmp = '\0';
4276 	tmp++;
4277 	if (sscanf(tmp, " %d %d %d", &limit, &batchcount, &shared) != 3)
4278 		return -EINVAL;
4279 
4280 	/* Find the cache in the chain of caches. */
4281 	mutex_lock(&cache_chain_mutex);
4282 	res = -EINVAL;
4283 	list_for_each_entry(cachep, &cache_chain, next) {
4284 		if (!strcmp(cachep->name, kbuf)) {
4285 			if (limit < 1 || batchcount < 1 ||
4286 					batchcount > limit || shared < 0) {
4287 				res = 0;
4288 			} else {
4289 				res = do_tune_cpucache(cachep, limit,
4290 						       batchcount, shared);
4291 			}
4292 			break;
4293 		}
4294 	}
4295 	mutex_unlock(&cache_chain_mutex);
4296 	if (res >= 0)
4297 		res = count;
4298 	return res;
4299 }
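
/*
 * Editor's note: usage sketch, not original source. The expected
 * input is "<name> <limit> <batchcount> <shared>", e.g. from a
 * shell:
 *
 *	echo "dentry_cache 256 128 8" > /proc/slabinfo
 *
 * which invokes do_tune_cpucache(cachep, 256, 128, 8) on a match.
 */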
4300 
4301 #ifdef CONFIG_DEBUG_SLAB_LEAK
4302 
4303 static void *leaks_start(struct seq_file *m, loff_t *pos)
4304 {
4305 	loff_t n = *pos;
4306 	struct list_head *p;
4307 
4308 	mutex_lock(&cache_chain_mutex);
4309 	p = cache_chain.next;
4310 	while (n--) {
4311 		p = p->next;
4312 		if (p == &cache_chain)
4313 			return NULL;
4314 	}
4315 	return list_entry(p, struct kmem_cache, next);
4316 }
4317 
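/*
 * Binary-search the sorted (caller address, count) pairs stored at
 * n[2..]: n[0] holds the capacity, n[1] the entry count. Bump the
 * count on a match, otherwise shift and insert a new pair; return 0
 * once the buffer is full.
 */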
4318 static inline int add_caller(unsigned long *n, unsigned long v)
4319 {
4320 	unsigned long *p;
4321 	int l;
4322 	if (!v)
4323 		return 1;
4324 	l = n[1];
4325 	p = n + 2;
4326 	while (l) {
4327 		int i = l/2;
4328 		unsigned long *q = p + 2 * i;
4329 		if (*q == v) {
4330 			q[1]++;
4331 			return 1;
4332 		}
4333 		if (*q > v) {
4334 			l = i;
4335 		} else {
4336 			p = q + 2;
4337 			l -= i + 1;
4338 		}
4339 	}
4340 	if (++n[1] == n[0])
4341 		return 0;
4342 	memmove(p + 2, p, n[1] * 2 * sizeof(unsigned long) - ((void *)p - (void *)n));
4343 	p[0] = v;
4344 	p[1] = 1;
4345 	return 1;
4346 }
4347 
4348 static void handle_slab(unsigned long *n, struct kmem_cache *c, struct slab *s)
4349 {
4350 	void *p;
4351 	int i;
4352 	if (n[0] == n[1])
4353 		return;
4354 	for (i = 0, p = s->s_mem; i < c->num; i++, p += c->buffer_size) {
4355 		if (slab_bufctl(s)[i] != BUFCTL_ACTIVE)
4356 			continue;
4357 		if (!add_caller(n, (unsigned long)*dbg_userword(c, p)))
4358 			return;
4359 	}
4360 }
4361 
4362 static void show_symbol(struct seq_file *m, unsigned long address)
4363 {
4364 #ifdef CONFIG_KALLSYMS
4365 	char *modname;
4366 	const char *name;
4367 	unsigned long offset, size;
4368 	char namebuf[KSYM_NAME_LEN+1];
4369 
4370 	name = kallsyms_lookup(address, &size, &offset, &modname, namebuf);
4371 
4372 	if (name) {
4373 		seq_printf(m, "%s+%#lx/%#lx", name, offset, size);
4374 		if (modname)
4375 			seq_printf(m, " [%s]", modname);
4376 		return;
4377 	}
4378 #endif
4379 	seq_printf(m, "%p", (void *)address);
4380 }
4381 
4382 static int leaks_show(struct seq_file *m, void *p)
4383 {
4384 	struct kmem_cache *cachep = p;
4385 	struct slab *slabp;
4386 	struct kmem_list3 *l3;
4387 	const char *name;
4388 	unsigned long *n = m->private;
4389 	int node;
4390 	int i;
4391 
4392 	if (!(cachep->flags & SLAB_STORE_USER))
4393 		return 0;
4394 	if (!(cachep->flags & SLAB_RED_ZONE))
4395 		return 0;
4396 
4397 	/* OK, we can do it */
4398 
4399 	n[1] = 0;
4400 
4401 	for_each_online_node(node) {
4402 		l3 = cachep->nodelists[node];
4403 		if (!l3)
4404 			continue;
4405 
4406 		check_irq_on();
4407 		spin_lock_irq(&l3->list_lock);
4408 
4409 		list_for_each_entry(slabp, &l3->slabs_full, list)
4410 			handle_slab(n, cachep, slabp);
4411 		list_for_each_entry(slabp, &l3->slabs_partial, list)
4412 			handle_slab(n, cachep, slabp);
4413 		spin_unlock_irq(&l3->list_lock);
4414 	}
4415 	name = cachep->name;
4416 	if (n[0] == n[1]) {
4417 		/* Increase the buffer size */
4418 		mutex_unlock(&cache_chain_mutex);
4419 		m->private = kzalloc(n[0] * 4 * sizeof(unsigned long), GFP_KERNEL);
4420 		if (!m->private) {
4421 			/* Too bad, we are really out of memory */
4422 			m->private = n;
4423 			mutex_lock(&cache_chain_mutex);
4424 			return -ENOMEM;
4425 		}
4426 		*(unsigned long *)m->private = n[0] * 2;
4427 		kfree(n);
4428 		mutex_lock(&cache_chain_mutex);
4429 		/* Now make sure this entry will be retried */
4430 		m->count = m->size;
4431 		return 0;
4432 	}
4433 	for (i = 0; i < n[1]; i++) {
4434 		seq_printf(m, "%s: %lu ", name, n[2*i+3]);
4435 		show_symbol(m, n[2*i+2]);
4436 		seq_putc(m, '\n');
4437 	}
4438 
4439 	return 0;
4440 }
4441 
4442 const struct seq_operations slabstats_op = {
4443 	.start = leaks_start,
4444 	.next = s_next,
4445 	.stop = s_stop,
4446 	.show = leaks_show,
4447 };
4448 #endif
4449 #endif
4450 
4451 /**
4452  * ksize - get the actual amount of memory allocated for a given object
4453  * @objp: Pointer to the object
4454  *
4455  * kmalloc may internally round up allocations and return more memory
4456  * than requested. ksize() can be used to determine the actual amount of
4457  * memory allocated. The caller may use this additional memory, even though
4458  * a smaller amount of memory was initially specified with the kmalloc call.
4459  * The caller must guarantee that objp points to a valid object previously
4460  * allocated with either kmalloc() or kmem_cache_alloc(). The object
4461  * must not be freed during the duration of the call.
4462  */
4463 unsigned int ksize(const void *objp)
4464 {
4465 	if (unlikely(objp == NULL))
4466 		return 0;
4467 
4468 	return obj_size(virt_to_cache(objp));
4469 }
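
/*
 * Editor's note: illustrative sketch, not original source. A caller
 * may use the slack that kmalloc rounded up to:
 *
 *	buf = kmalloc(100, GFP_KERNEL);
 *	if (buf && ksize(buf) >= 128)
 *		(the full 128 bytes are really available)
 */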
4470