xref: /openbmc/linux/mm/slab.c (revision 87c2ce3b)
1 /*
2  * linux/mm/slab.c
3  * Written by Mark Hemment, 1996/97.
4  * (markhe@nextd.demon.co.uk)
5  *
6  * kmem_cache_destroy() + some cleanup - 1999 Andrea Arcangeli
7  *
8  * Major cleanup, different bufctl logic, per-cpu arrays
9  *	(c) 2000 Manfred Spraul
10  *
11  * Cleanup, make the head arrays unconditional, preparation for NUMA
12  * 	(c) 2002 Manfred Spraul
13  *
14  * An implementation of the Slab Allocator as described in outline in:
15  *	UNIX Internals: The New Frontiers by Uresh Vahalia
16  *	Pub: Prentice Hall	ISBN 0-13-101908-2
17  * or with a little more detail in:
18  *	The Slab Allocator: An Object-Caching Kernel Memory Allocator
19  *	Jeff Bonwick (Sun Microsystems).
20  *	Presented at: USENIX Summer 1994 Technical Conference
21  *
22  * The memory is organized in caches, one cache for each object type.
23  * (e.g. inode_cache, dentry_cache, buffer_head, vm_area_struct)
24  * Each cache consists of many slabs (they are small (usually one
25  * page long) and always contiguous), and each slab contains multiple
26  * initialized objects.
27  *
28  * This means that your constructor is used only for newly allocated
29  * slabs and you must pass objects with the same initializations to
30  * kmem_cache_free.
31  *
32  * Each cache can only support one memory type (GFP_DMA, GFP_HIGHMEM,
33  * normal). If you need a special memory type, then you must create a new
34  * cache for that memory type.
35  *
36  * In order to reduce fragmentation, the slabs are sorted in 3 groups:
37  *   full slabs with 0 free objects
38  *   partial slabs
39  *   empty slabs with no allocated objects
40  *
41  * If partial slabs exist, then new allocations come from these slabs;
42  * otherwise they come from empty slabs, or new slabs are allocated.
43  *
44  * kmem_cache_destroy() CAN CRASH if you try to allocate from the cache
45  * during kmem_cache_destroy(). The caller must prevent concurrent allocs.
46  *
47  * Each cache has a short per-cpu head array; most allocs
48  * and frees go through that array, and if that array overflows, then half
49  * of the entries in the array are given back to the global cache.
50  * The head array is strictly LIFO and should improve the cache hit rates.
51  * On SMP, it additionally reduces the spinlock operations.
52  *
53  * The c_cpuarray may not be read with enabled local interrupts -
54  * it's changed with a smp_call_function().
55  *
56  * SMP synchronization:
57  *  constructors and destructors are called without any locking.
58  *  Several members in kmem_cache_t and struct slab never change; they
59  *	are accessed without any locking.
60  *  The per-cpu arrays are never accessed from the wrong cpu, no locking,
61  *  	and local interrupts are disabled so slab code is preempt-safe.
62  *  The non-constant members are protected with a per-cache irq spinlock.
63  *
64  * Many thanks to Mark Hemment, who wrote another per-cpu slab patch
65  * in 2000 - many ideas in the current implementation are derived from
66  * his patch.
67  *
68  * Further notes from the original documentation:
69  *
70  * 11 April '97.  Started multi-threading - markhe
71  *	The global cache-chain is protected by the semaphore 'cache_chain_sem'.
72  *	The sem is only needed when accessing/extending the cache-chain, which
73  *	can never happen inside an interrupt (kmem_cache_create(),
74  *	kmem_cache_shrink() and kmem_cache_reap()).
75  *
76  *	At present, each engine can be growing a cache.  This should be blocked.
77  *
78  * 15 March 2005. NUMA slab allocator.
79  *	Shai Fultheim <shai@scalex86.org>.
80  *	Shobhit Dayal <shobhit@calsoftinc.com>
81  *	Alok N Kataria <alokk@calsoftinc.com>
82  *	Christoph Lameter <christoph@lameter.com>
83  *
84  *	Modified the slab allocator to be node aware on NUMA systems.
85  *	Each node has its own list of partial, free and full slabs.
86  *	All object allocations for a node occur from node specific slab lists.
87  */
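/*
 * Illustrative sketch of how a subsystem typically uses the interface
 * described above.  The struct, cache name and constructor are hypothetical;
 * the calls are the ones implemented in this file, shown without their
 * surrounding init/teardown functions:
 *
 *	struct foo {
 *		spinlock_t lock;
 *		int refcnt;
 *	};
 *	static kmem_cache_t *foo_cachep;
 *
 *	static void foo_ctor(void *obj, kmem_cache_t *cachep, unsigned long flags)
 *	{
 *		struct foo *f = obj;
 *
 *		spin_lock_init(&f->lock);
 *		f->refcnt = 0;
 *	}
 *
 *	foo_cachep = kmem_cache_create("foo_cache", sizeof(struct foo), 0,
 *				       SLAB_HWCACHE_ALIGN, foo_ctor, NULL);
 *
 *	f = kmem_cache_alloc(foo_cachep, GFP_KERNEL);
 *	...
 *	kmem_cache_free(foo_cachep, f);		objects must go back with the
 *						same initialization, see above
 */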
88 
89 #include	<linux/config.h>
90 #include	<linux/slab.h>
91 #include	<linux/mm.h>
92 #include	<linux/swap.h>
93 #include	<linux/cache.h>
94 #include	<linux/interrupt.h>
95 #include	<linux/init.h>
96 #include	<linux/compiler.h>
97 #include	<linux/seq_file.h>
98 #include	<linux/notifier.h>
99 #include	<linux/kallsyms.h>
100 #include	<linux/cpu.h>
101 #include	<linux/sysctl.h>
102 #include	<linux/module.h>
103 #include	<linux/rcupdate.h>
104 #include	<linux/string.h>
105 #include	<linux/nodemask.h>
106 
107 #include	<asm/uaccess.h>
108 #include	<asm/cacheflush.h>
109 #include	<asm/tlbflush.h>
110 #include	<asm/page.h>
111 
112 /*
113  * DEBUG	- 1 for kmem_cache_create() to honour: SLAB_DEBUG_INITIAL,
114  *		  SLAB_RED_ZONE & SLAB_POISON.
115  *		  0 for faster, smaller code (especially in the critical paths).
116  *
117  * STATS	- 1 to collect stats for /proc/slabinfo.
118  *		  0 for faster, smaller code (especially in the critical paths).
119  *
120  * FORCED_DEBUG	- 1 enables SLAB_RED_ZONE and SLAB_POISON (if possible)
121  */
122 
123 #ifdef CONFIG_DEBUG_SLAB
124 #define	DEBUG		1
125 #define	STATS		1
126 #define	FORCED_DEBUG	1
127 #else
128 #define	DEBUG		0
129 #define	STATS		0
130 #define	FORCED_DEBUG	0
131 #endif
132 
133 /* Shouldn't this be in a header file somewhere? */
134 #define	BYTES_PER_WORD		sizeof(void *)
135 
136 #ifndef cache_line_size
137 #define cache_line_size()	L1_CACHE_BYTES
138 #endif
139 
140 #ifndef ARCH_KMALLOC_MINALIGN
141 /*
142  * Enforce a minimum alignment for the kmalloc caches.
143  * Usually, the kmalloc caches are cache_line_size() aligned, except when
144  * DEBUG and FORCED_DEBUG are enabled, then they are BYTES_PER_WORD aligned.
145  * Some archs want to perform DMA into kmalloc caches and need a guaranteed
146  * alignment larger than BYTES_PER_WORD. ARCH_KMALLOC_MINALIGN allows that.
147  * Note that this flag disables some debug features.
148  */
149 #define ARCH_KMALLOC_MINALIGN 0
150 #endif
151 
152 #ifndef ARCH_SLAB_MINALIGN
153 /*
154  * Enforce a minimum alignment for all caches.
155  * Intended for archs that get misalignment faults even for BYTES_PER_WORD
156  * aligned buffers. Includes ARCH_KMALLOC_MINALIGN.
157  * If possible: Do not enable this flag for CONFIG_DEBUG_SLAB, it disables
158  * some debug features.
159  */
160 #define ARCH_SLAB_MINALIGN 0
161 #endif
162 
163 #ifndef ARCH_KMALLOC_FLAGS
164 #define ARCH_KMALLOC_FLAGS SLAB_HWCACHE_ALIGN
165 #endif
166 
167 /* Legal flag mask for kmem_cache_create(). */
168 #if DEBUG
169 # define CREATE_MASK	(SLAB_DEBUG_INITIAL | SLAB_RED_ZONE | \
170 			 SLAB_POISON | SLAB_HWCACHE_ALIGN | \
171 			 SLAB_NO_REAP | SLAB_CACHE_DMA | \
172 			 SLAB_MUST_HWCACHE_ALIGN | SLAB_STORE_USER | \
173 			 SLAB_RECLAIM_ACCOUNT | SLAB_PANIC | \
174 			 SLAB_DESTROY_BY_RCU)
175 #else
176 # define CREATE_MASK	(SLAB_HWCACHE_ALIGN | SLAB_NO_REAP | \
177 			 SLAB_CACHE_DMA | SLAB_MUST_HWCACHE_ALIGN | \
178 			 SLAB_RECLAIM_ACCOUNT | SLAB_PANIC | \
179 			 SLAB_DESTROY_BY_RCU)
180 #endif
181 
182 /*
183  * kmem_bufctl_t:
184  *
185  * Bufctls are used for linking objs within a slab,
186  * via linked offsets.
187  *
188  * This implementation relies on "struct page" for locating the cache &
189  * slab an object belongs to.
190  * This allows the bufctl structure to be small (one int), but limits
191  * the number of objects a slab (not a cache) can contain when off-slab
192  * bufctls are used. The limit is the size of the largest general cache
193  * that does not use off-slab slabs.
194  * For 32bit archs with 4 kB pages, this is 56.
195  * This is not serious, as it is only for large objects, when it is unwise
196  * to have too many per slab.
197  * Note: This limit can be raised by introducing a general cache whose size
198  * is less than 512 (PAGE_SIZE<<3), but greater than 256.
199  */
200 
201 typedef unsigned int kmem_bufctl_t;
202 #define BUFCTL_END	(((kmem_bufctl_t)(~0U))-0)
203 #define BUFCTL_FREE	(((kmem_bufctl_t)(~0U))-1)
204 #define	SLAB_LIMIT	(((kmem_bufctl_t)(~0U))-2)
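/*
 * Illustrative sketch of the free list built from these values: the
 * kmem_bufctl_t array sits right behind struct slab (see slab_bufctl()
 * further down in this file), and entry i holds the index of the next free
 * object.  Allocation pops the head index, roughly:
 *
 *	next = slabp->free;
 *	objp = slabp->s_mem + cachep->objsize * next;
 *	slabp->free = slab_bufctl(slabp)[next];	   may now be BUFCTL_END
 */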
205 
206 /* Max number of objs-per-slab for caches which use off-slab slabs.
207  * Needed to avoid a possible looping condition in cache_grow().
208  */
209 static unsigned long offslab_limit;
210 
211 /*
212  * struct slab
213  *
214  * Manages the objs in a slab. Placed either at the beginning of mem allocated
215  * for a slab, or allocated from a general cache.
216  * Slabs are chained into three lists: fully used, partial, fully free slabs.
217  */
218 struct slab {
219 	struct list_head list;
220 	unsigned long colouroff;
221 	void *s_mem;		/* including colour offset */
222 	unsigned int inuse;	/* num of objs active in slab */
223 	kmem_bufctl_t free;
224 	unsigned short nodeid;
225 };
226 
227 /*
228  * struct slab_rcu
229  *
230  * slab_destroy on a SLAB_DESTROY_BY_RCU cache uses this structure to
231  * arrange for kmem_freepages to be called via RCU.  This is useful if
232  * we need to approach a kernel structure obliquely, from its address
233  * obtained without the usual locking.  We can lock the structure to
234  * stabilize it and check it's still at the given address, only if we
235  * can be sure that the memory has not been meanwhile reused for some
236  * other kind of object (which our subsystem's lock might corrupt).
237  *
238  * rcu_read_lock before reading the address, then rcu_read_unlock after
239  * taking the spinlock within the structure expected at that address.
240  *
241  * We assume struct slab_rcu can overlay struct slab when destroying.
242  */
243 struct slab_rcu {
244 	struct rcu_head head;
245 	kmem_cache_t *cachep;
246 	void *addr;
247 };
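/*
 * Illustrative sketch of the lookup pattern described above; the lookup
 * function, lock and identity check are hypothetical:
 *
 *	rcu_read_lock();
 *	obj = lockless_lookup(key);	 address obtained without locking
 *	if (obj) {
 *		spin_lock(&obj->lock);	 memory cannot go back to the page
 *					 allocator while inside the RCU
 *					 read-side section
 *		if (obj_is_still(obj, key))	 re-check: the slab object may
 *			do_something(obj);	 have been reused meanwhile
 *		spin_unlock(&obj->lock);
 *	}
 *	rcu_read_unlock();
 */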
248 
249 /*
250  * struct array_cache
251  *
252  * Purpose:
253  * - LIFO ordering, to hand out cache-warm objects from _alloc
254  * - reduce the number of linked list operations
255  * - reduce spinlock operations
256  *
257  * The limit is stored in the per-cpu structure to reduce the data cache
258  * footprint.
259  *
260  */
261 struct array_cache {
262 	unsigned int avail;
263 	unsigned int limit;
264 	unsigned int batchcount;
265 	unsigned int touched;
266 	spinlock_t lock;
267 	void *entry[0];		/*
268 				 * Must have this definition in here for the proper
269 				 * alignment of array_cache. Also simplifies accessing
270 				 * the entries.
271 				 * [0] is for gcc 2.95. It should really be [].
272 				 */
273 };
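/*
 * Illustrative sketch, simplified from the alloc/free fast paths later in
 * this file: entry[] is used as a LIFO stack indexed by avail, so the most
 * recently freed (and therefore cache-warm) object is handed out first:
 *
 *	alloc:	if (likely(ac->avail))
 *			objp = ac->entry[--ac->avail];
 *	free:	if (likely(ac->avail < ac->limit))
 *			ac->entry[ac->avail++] = objp;
 */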
274 
275 /* bootstrap: The caches do not work without cpuarrays anymore,
276  * but the cpuarrays are allocated from the generic caches...
277  */
278 #define BOOT_CPUCACHE_ENTRIES	1
279 struct arraycache_init {
280 	struct array_cache cache;
281 	void *entries[BOOT_CPUCACHE_ENTRIES];
282 };
283 
284 /*
285  * The slab lists for all objects.
286  */
287 struct kmem_list3 {
288 	struct list_head slabs_partial;	/* partial list first, better asm code */
289 	struct list_head slabs_full;
290 	struct list_head slabs_free;
291 	unsigned long free_objects;
292 	unsigned long next_reap;
293 	int free_touched;
294 	unsigned int free_limit;
295 	spinlock_t list_lock;
296 	struct array_cache *shared;	/* shared per node */
297 	struct array_cache **alien;	/* on other nodes */
298 };
299 
300 /*
301  * Need this for bootstrapping a per node allocator.
302  */
303 #define NUM_INIT_LISTS (2 * MAX_NUMNODES + 1)
304 struct kmem_list3 __initdata initkmem_list3[NUM_INIT_LISTS];
305 #define	CACHE_CACHE 0
306 #define	SIZE_AC 1
307 #define	SIZE_L3 (1 + MAX_NUMNODES)
308 
309 /*
310  * This function must be completely optimized away if
311  * a constant is passed to it. Mostly the same as
312  * what is in linux/slab.h except it returns an
313  * index.
314  */
315 static __always_inline int index_of(const size_t size)
316 {
317 	if (__builtin_constant_p(size)) {
318 		int i = 0;
319 
320 #define CACHE(x) \
321 	if (size <= x) \
322 		return i; \
323 	else \
324 		i++;
325 #include "linux/kmalloc_sizes.h"
326 #undef CACHE
327 		{
328 			extern void __bad_size(void);
329 			__bad_size();
330 		}
331 	} else
332 		BUG();
333 	return 0;
334 }
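/*
 * Illustrative expansion: with the sizes from linux/kmalloc_sizes.h (which
 * depend on the configuration), the CACHE() cascade above unrolls for a
 * constant argument into something like
 *
 *	if (size <= 32)  return 0;
 *	if (size <= 64)  return 1;
 *	if (size <= 96)  return 2;
 *	...
 *
 * which the compiler folds into a single constant, as used by INDEX_AC and
 * INDEX_L3 below.
 */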
335 
336 #define INDEX_AC index_of(sizeof(struct arraycache_init))
337 #define INDEX_L3 index_of(sizeof(struct kmem_list3))
338 
339 static inline void kmem_list3_init(struct kmem_list3 *parent)
340 {
341 	INIT_LIST_HEAD(&parent->slabs_full);
342 	INIT_LIST_HEAD(&parent->slabs_partial);
343 	INIT_LIST_HEAD(&parent->slabs_free);
344 	parent->shared = NULL;
345 	parent->alien = NULL;
346 	spin_lock_init(&parent->list_lock);
347 	parent->free_objects = 0;
348 	parent->free_touched = 0;
349 }
350 
351 #define MAKE_LIST(cachep, listp, slab, nodeid)	\
352 	do {	\
353 		INIT_LIST_HEAD(listp);		\
354 		list_splice(&(cachep->nodelists[nodeid]->slab), listp); \
355 	} while (0)
356 
357 #define	MAKE_ALL_LISTS(cachep, ptr, nodeid)			\
358 	do {					\
359 	MAKE_LIST((cachep), (&(ptr)->slabs_full), slabs_full, nodeid);	\
360 	MAKE_LIST((cachep), (&(ptr)->slabs_partial), slabs_partial, nodeid); \
361 	MAKE_LIST((cachep), (&(ptr)->slabs_free), slabs_free, nodeid);	\
362 	} while (0)
363 
364 /*
365  * kmem_cache_t
366  *
367  * manages a cache.
368  */
369 
370 struct kmem_cache {
371 /* 1) per-cpu data, touched during every alloc/free */
372 	struct array_cache *array[NR_CPUS];
373 	unsigned int batchcount;
374 	unsigned int limit;
375 	unsigned int shared;
376 	unsigned int objsize;
377 /* 2) touched by every alloc & free from the backend */
378 	struct kmem_list3 *nodelists[MAX_NUMNODES];
379 	unsigned int flags;	/* constant flags */
380 	unsigned int num;	/* # of objs per slab */
381 	spinlock_t spinlock;
382 
383 /* 3) cache_grow/shrink */
384 	/* order of pgs per slab (2^n) */
385 	unsigned int gfporder;
386 
387 	/* force GFP flags, e.g. GFP_DMA */
388 	gfp_t gfpflags;
389 
390 	size_t colour;		/* cache colouring range */
391 	unsigned int colour_off;	/* colour offset */
392 	unsigned int colour_next;	/* cache colouring */
393 	kmem_cache_t *slabp_cache;
394 	unsigned int slab_size;
395 	unsigned int dflags;	/* dynamic flags */
396 
397 	/* constructor func */
398 	void (*ctor) (void *, kmem_cache_t *, unsigned long);
399 
400 	/* de-constructor func */
401 	void (*dtor) (void *, kmem_cache_t *, unsigned long);
402 
403 /* 4) cache creation/removal */
404 	const char *name;
405 	struct list_head next;
406 
407 /* 5) statistics */
408 #if STATS
409 	unsigned long num_active;
410 	unsigned long num_allocations;
411 	unsigned long high_mark;
412 	unsigned long grown;
413 	unsigned long reaped;
414 	unsigned long errors;
415 	unsigned long max_freeable;
416 	unsigned long node_allocs;
417 	unsigned long node_frees;
418 	atomic_t allochit;
419 	atomic_t allocmiss;
420 	atomic_t freehit;
421 	atomic_t freemiss;
422 #endif
423 #if DEBUG
424 	int dbghead;
425 	int reallen;
426 #endif
427 };
428 
429 #define CFLGS_OFF_SLAB		(0x80000000UL)
430 #define	OFF_SLAB(x)	((x)->flags & CFLGS_OFF_SLAB)
431 
432 #define BATCHREFILL_LIMIT	16
433 /* Optimization question: fewer reaps means less
434  * probability of unnecessary cpucache drain/refill cycles.
435  *
436  * OTOH the cpuarrays can contain lots of objects,
437  * which could lock up otherwise freeable slabs.
438  */
439 #define REAPTIMEOUT_CPUC	(2*HZ)
440 #define REAPTIMEOUT_LIST3	(4*HZ)
441 
442 #if STATS
443 #define	STATS_INC_ACTIVE(x)	((x)->num_active++)
444 #define	STATS_DEC_ACTIVE(x)	((x)->num_active--)
445 #define	STATS_INC_ALLOCED(x)	((x)->num_allocations++)
446 #define	STATS_INC_GROWN(x)	((x)->grown++)
447 #define	STATS_INC_REAPED(x)	((x)->reaped++)
448 #define	STATS_SET_HIGH(x)	do { if ((x)->num_active > (x)->high_mark) \
449 					(x)->high_mark = (x)->num_active; \
450 				} while (0)
451 #define	STATS_INC_ERR(x)	((x)->errors++)
452 #define	STATS_INC_NODEALLOCS(x)	((x)->node_allocs++)
453 #define	STATS_INC_NODEFREES(x)	((x)->node_frees++)
454 #define	STATS_SET_FREEABLE(x, i) \
455 				do { if ((x)->max_freeable < i) \
456 					(x)->max_freeable = i; \
457 				} while (0)
458 
459 #define STATS_INC_ALLOCHIT(x)	atomic_inc(&(x)->allochit)
460 #define STATS_INC_ALLOCMISS(x)	atomic_inc(&(x)->allocmiss)
461 #define STATS_INC_FREEHIT(x)	atomic_inc(&(x)->freehit)
462 #define STATS_INC_FREEMISS(x)	atomic_inc(&(x)->freemiss)
463 #else
464 #define	STATS_INC_ACTIVE(x)	do { } while (0)
465 #define	STATS_DEC_ACTIVE(x)	do { } while (0)
466 #define	STATS_INC_ALLOCED(x)	do { } while (0)
467 #define	STATS_INC_GROWN(x)	do { } while (0)
468 #define	STATS_INC_REAPED(x)	do { } while (0)
469 #define	STATS_SET_HIGH(x)	do { } while (0)
470 #define	STATS_INC_ERR(x)	do { } while (0)
471 #define	STATS_INC_NODEALLOCS(x)	do { } while (0)
472 #define	STATS_INC_NODEFREES(x)	do { } while (0)
473 #define	STATS_SET_FREEABLE(x, i) \
474 				do { } while (0)
475 
476 #define STATS_INC_ALLOCHIT(x)	do { } while (0)
477 #define STATS_INC_ALLOCMISS(x)	do { } while (0)
478 #define STATS_INC_FREEHIT(x)	do { } while (0)
479 #define STATS_INC_FREEMISS(x)	do { } while (0)
480 #endif
481 
482 #if DEBUG
483 /* Magic nums for obj red zoning.
484  * Placed in the first word before and the first word after an obj.
485  */
486 #define	RED_INACTIVE	0x5A2CF071UL	/* when obj is inactive */
487 #define	RED_ACTIVE	0x170FC2A5UL	/* when obj is active */
488 
489 /* ...and for poisoning */
490 #define	POISON_INUSE	0x5a	/* for use-uninitialised poisoning */
491 #define POISON_FREE	0x6b	/* for use-after-free poisoning */
492 #define	POISON_END	0xa5	/* end-byte of poisoning */
493 
494 /* memory layout of objects:
495  * 0		: objp
496  * 0 .. cachep->dbghead - BYTES_PER_WORD - 1: padding. This ensures that
497  * 		the end of an object is aligned with the end of the real
498  * 		allocation. Catches writes behind the end of the allocation.
499  * cachep->dbghead - BYTES_PER_WORD .. cachep->dbghead - 1:
500  * 		redzone word.
501  * cachep->dbghead: The real object.
502  * cachep->objsize - 2* BYTES_PER_WORD: redzone word [BYTES_PER_WORD long]
503  * cachep->objsize - 1* BYTES_PER_WORD: last caller address [BYTES_PER_WORD long]
504  */
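/* Worked example, assuming BYTES_PER_WORD == 4 and a cache created with
 * reallen == 100 plus both SLAB_RED_ZONE and SLAB_STORE_USER (the size
 * adjustments are made in kmem_cache_create() below):
 *
 *	dbghead == 4, objsize == 100 + 2*4 + 4 == 112
 *
 *	[  0 ..   3]	redzone word 1
 *	[  4 .. 103]	the real object (100 bytes)
 *	[104 .. 107]	redzone word 2
 *	[108 .. 111]	last caller address
 */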
505 static int obj_dbghead(kmem_cache_t *cachep)
506 {
507 	return cachep->dbghead;
508 }
509 
510 static int obj_reallen(kmem_cache_t *cachep)
511 {
512 	return cachep->reallen;
513 }
514 
515 static unsigned long *dbg_redzone1(kmem_cache_t *cachep, void *objp)
516 {
517 	BUG_ON(!(cachep->flags & SLAB_RED_ZONE));
518 	return (unsigned long*) (objp+obj_dbghead(cachep)-BYTES_PER_WORD);
519 }
520 
521 static unsigned long *dbg_redzone2(kmem_cache_t *cachep, void *objp)
522 {
523 	BUG_ON(!(cachep->flags & SLAB_RED_ZONE));
524 	if (cachep->flags & SLAB_STORE_USER)
525 		return (unsigned long *)(objp + cachep->objsize -
526 					 2 * BYTES_PER_WORD);
527 	return (unsigned long *)(objp + cachep->objsize - BYTES_PER_WORD);
528 }
529 
530 static void **dbg_userword(kmem_cache_t *cachep, void *objp)
531 {
532 	BUG_ON(!(cachep->flags & SLAB_STORE_USER));
533 	return (void **)(objp + cachep->objsize - BYTES_PER_WORD);
534 }
535 
536 #else
537 
538 #define obj_dbghead(x)			0
539 #define obj_reallen(cachep)		(cachep->objsize)
540 #define dbg_redzone1(cachep, objp)	({BUG(); (unsigned long *)NULL;})
541 #define dbg_redzone2(cachep, objp)	({BUG(); (unsigned long *)NULL;})
542 #define dbg_userword(cachep, objp)	({BUG(); (void **)NULL;})
543 
544 #endif
545 
546 /*
547  * Maximum size of an obj (in 2^order pages)
548  * and absolute limit for the gfp order.
549  */
550 #if defined(CONFIG_LARGE_ALLOCS)
551 #define	MAX_OBJ_ORDER	13	/* up to 32Mb */
552 #define	MAX_GFP_ORDER	13	/* up to 32Mb */
553 #elif defined(CONFIG_MMU)
554 #define	MAX_OBJ_ORDER	5	/* 32 pages */
555 #define	MAX_GFP_ORDER	5	/* 32 pages */
556 #else
557 #define	MAX_OBJ_ORDER	8	/* up to 1Mb */
558 #define	MAX_GFP_ORDER	8	/* up to 1Mb */
559 #endif
560 
561 /*
562  * Do not go above this order unless 0 objects fit into the slab.
563  */
564 #define	BREAK_GFP_ORDER_HI	1
565 #define	BREAK_GFP_ORDER_LO	0
566 static int slab_break_gfp_order = BREAK_GFP_ORDER_LO;
567 
568 /* Functions for storing/retrieving the cachep and/or slab from the
569  * global 'mem_map'. These are used to find the slab an obj belongs to.
570  * With kfree(), these are used to find the cache which an obj belongs to.
571  */
572 static inline void page_set_cache(struct page *page, struct kmem_cache *cache)
573 {
574 	page->lru.next = (struct list_head *)cache;
575 }
576 
577 static inline struct kmem_cache *page_get_cache(struct page *page)
578 {
579 	return (struct kmem_cache *)page->lru.next;
580 }
581 
582 static inline void page_set_slab(struct page *page, struct slab *slab)
583 {
584 	page->lru.prev = (struct list_head *)slab;
585 }
586 
587 static inline struct slab *page_get_slab(struct page *page)
588 {
589 	return (struct slab *)page->lru.prev;
590 }
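/*
 * Illustrative sketch: given an object pointer, the owning cache and slab
 * are recovered through its struct page, e.g.
 *
 *	struct page *page = virt_to_page(objp);
 *	kmem_cache_t *cachep = page_get_cache(page);
 *	struct slab *slabp = page_get_slab(page);
 *
 * which is how kfree() finds its way back from a bare pointer.
 */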
591 
592 /* These are the default caches for kmalloc. Custom caches can have other sizes. */
593 struct cache_sizes malloc_sizes[] = {
594 #define CACHE(x) { .cs_size = (x) },
595 #include <linux/kmalloc_sizes.h>
596 	CACHE(ULONG_MAX)
597 #undef CACHE
598 };
599 EXPORT_SYMBOL(malloc_sizes);
600 
601 /* Must match cache_sizes above. Out of line to keep cache footprint low. */
602 struct cache_names {
603 	char *name;
604 	char *name_dma;
605 };
606 
607 static struct cache_names __initdata cache_names[] = {
608 #define CACHE(x) { .name = "size-" #x, .name_dma = "size-" #x "(DMA)" },
609 #include <linux/kmalloc_sizes.h>
610 	{NULL,}
611 #undef CACHE
612 };
613 
614 static struct arraycache_init initarray_cache __initdata =
615     { {0, BOOT_CPUCACHE_ENTRIES, 1, 0} };
616 static struct arraycache_init initarray_generic =
617     { {0, BOOT_CPUCACHE_ENTRIES, 1, 0} };
618 
619 /* internal cache of cache description objs */
620 static kmem_cache_t cache_cache = {
621 	.batchcount = 1,
622 	.limit = BOOT_CPUCACHE_ENTRIES,
623 	.shared = 1,
624 	.objsize = sizeof(kmem_cache_t),
625 	.flags = SLAB_NO_REAP,
626 	.spinlock = SPIN_LOCK_UNLOCKED,
627 	.name = "kmem_cache",
628 #if DEBUG
629 	.reallen = sizeof(kmem_cache_t),
630 #endif
631 };
632 
633 /* Guard access to the cache-chain. */
634 static struct semaphore cache_chain_sem;
635 static struct list_head cache_chain;
636 
637 /*
638  * vm_enough_memory() looks at this to determine how many
639  * slab-allocated pages are possibly freeable under pressure
640  *
641  * SLAB_RECLAIM_ACCOUNT turns this on per-slab
642  */
643 atomic_t slab_reclaim_pages;
644 
645 /*
646  * chicken and egg problem: delay the per-cpu array allocation
647  * until the general caches are up.
648  */
649 static enum {
650 	NONE,
651 	PARTIAL_AC,
652 	PARTIAL_L3,
653 	FULL
654 } g_cpucache_up;
655 
656 static DEFINE_PER_CPU(struct work_struct, reap_work);
657 
658 static void free_block(kmem_cache_t *cachep, void **objpp, int len, int node);
659 static void enable_cpucache(kmem_cache_t *cachep);
660 static void cache_reap(void *unused);
661 static int __node_shrink(kmem_cache_t *cachep, int node);
662 
663 static inline struct array_cache *ac_data(kmem_cache_t *cachep)
664 {
665 	return cachep->array[smp_processor_id()];
666 }
667 
668 static inline kmem_cache_t *__find_general_cachep(size_t size, gfp_t gfpflags)
669 {
670 	struct cache_sizes *csizep = malloc_sizes;
671 
672 #if DEBUG
673 	/* This happens if someone tries to call
674 	 * kmem_cache_create(), or __kmalloc(), before
675 	 * the generic caches are initialized.
676 	 */
677 	BUG_ON(malloc_sizes[INDEX_AC].cs_cachep == NULL);
678 #endif
679 	while (size > csizep->cs_size)
680 		csizep++;
681 
682 	/*
683 	 * Really subtle: The last entry with cs->cs_size==ULONG_MAX
684 	 * has cs_{dma,}cachep==NULL. Thus no special case
685 	 * for large kmalloc calls required.
686 	 */
687 	if (unlikely(gfpflags & GFP_DMA))
688 		return csizep->cs_dmacachep;
689 	return csizep->cs_cachep;
690 }
691 
692 kmem_cache_t *kmem_find_general_cachep(size_t size, gfp_t gfpflags)
693 {
694 	return __find_general_cachep(size, gfpflags);
695 }
696 EXPORT_SYMBOL(kmem_find_general_cachep);
697 
698 /* Calculate the number of objs, wastage, and bytes left over for a given slab size. */
699 static void cache_estimate(unsigned long gfporder, size_t size, size_t align,
700 			   int flags, size_t *left_over, unsigned int *num)
701 {
702 	int i;
703 	size_t wastage = PAGE_SIZE << gfporder;
704 	size_t extra = 0;
705 	size_t base = 0;
706 
707 	if (!(flags & CFLGS_OFF_SLAB)) {
708 		base = sizeof(struct slab);
709 		extra = sizeof(kmem_bufctl_t);
710 	}
711 	i = 0;
712 	while (i * size + ALIGN(base + i * extra, align) <= wastage)
713 		i++;
714 	if (i > 0)
715 		i--;
716 
717 	if (i > SLAB_LIMIT)
718 		i = SLAB_LIMIT;
719 
720 	*num = i;
721 	wastage -= i * size;
722 	wastage -= ALIGN(base + i * extra, align);
723 	*left_over = wastage;
724 }
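/*
 * Worked example, assuming 4 kB pages, gfporder 0, on-slab management with
 * sizeof(struct slab) == 28 and sizeof(kmem_bufctl_t) == 4, and objects of
 * size 256 with align 32:
 *
 *	i = 15:  15*256 + ALIGN(28 + 15*4, 32) = 3840 + 96 = 3936 <= 4096
 *	i = 16:  16*256 + ALIGN(28 + 16*4, 32) = 4096 + 96 = 4192 >  4096
 *
 * so *num = 15 objects and *left_over = 4096 - 3840 - 96 = 160 bytes,
 * which is what remains for cache colouring.
 */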
725 
726 #define slab_error(cachep, msg) __slab_error(__FUNCTION__, cachep, msg)
727 
728 static void __slab_error(const char *function, kmem_cache_t *cachep, char *msg)
729 {
730 	printk(KERN_ERR "slab error in %s(): cache `%s': %s\n",
731 	       function, cachep->name, msg);
732 	dump_stack();
733 }
734 
735 /*
736  * Initiate the reap timer running on the target CPU.  We run at around 1 to 2Hz
737  * via the workqueue/eventd.
738  * Add the CPU number into the expiration time to minimize the possibility of
739  * the CPUs getting into lockstep and contending for the global cache chain
740  * lock.
741  */
742 static void __devinit start_cpu_timer(int cpu)
743 {
744 	struct work_struct *reap_work = &per_cpu(reap_work, cpu);
745 
746 	/*
747 	 * When this gets called from do_initcalls via cpucache_init(),
748 	 * init_workqueues() has already run, so keventd will be setup
749 	 * at that time.
750 	 */
751 	if (keventd_up() && reap_work->func == NULL) {
752 		INIT_WORK(reap_work, cache_reap, NULL);
753 		schedule_delayed_work_on(cpu, reap_work, HZ + 3 * cpu);
754 	}
755 }
756 
757 static struct array_cache *alloc_arraycache(int node, int entries,
758 					    int batchcount)
759 {
760 	int memsize = sizeof(void *) * entries + sizeof(struct array_cache);
761 	struct array_cache *nc = NULL;
762 
763 	nc = kmalloc_node(memsize, GFP_KERNEL, node);
764 	if (nc) {
765 		nc->avail = 0;
766 		nc->limit = entries;
767 		nc->batchcount = batchcount;
768 		nc->touched = 0;
769 		spin_lock_init(&nc->lock);
770 	}
771 	return nc;
772 }
773 
774 #ifdef CONFIG_NUMA
775 static inline struct array_cache **alloc_alien_cache(int node, int limit)
776 {
777 	struct array_cache **ac_ptr;
778 	int memsize = sizeof(void *) * MAX_NUMNODES;
779 	int i;
780 
781 	if (limit > 1)
782 		limit = 12;
783 	ac_ptr = kmalloc_node(memsize, GFP_KERNEL, node);
784 	if (ac_ptr) {
785 		for_each_node(i) {
786 			if (i == node || !node_online(i)) {
787 				ac_ptr[i] = NULL;
788 				continue;
789 			}
790 			ac_ptr[i] = alloc_arraycache(node, limit, 0xbaadf00d);
791 			if (!ac_ptr[i]) {
792 				for (i--; i >= 0; i--)
793 					kfree(ac_ptr[i]);
794 				kfree(ac_ptr);
795 				return NULL;
796 			}
797 		}
798 	}
799 	return ac_ptr;
800 }
801 
802 static inline void free_alien_cache(struct array_cache **ac_ptr)
803 {
804 	int i;
805 
806 	if (!ac_ptr)
807 		return;
808 
809 	for_each_node(i)
810 	    kfree(ac_ptr[i]);
811 
812 	kfree(ac_ptr);
813 }
814 
815 static inline void __drain_alien_cache(kmem_cache_t *cachep,
816 				       struct array_cache *ac, int node)
817 {
818 	struct kmem_list3 *rl3 = cachep->nodelists[node];
819 
820 	if (ac->avail) {
821 		spin_lock(&rl3->list_lock);
822 		free_block(cachep, ac->entry, ac->avail, node);
823 		ac->avail = 0;
824 		spin_unlock(&rl3->list_lock);
825 	}
826 }
827 
828 static void drain_alien_cache(kmem_cache_t *cachep, struct kmem_list3 *l3)
829 {
830 	int i = 0;
831 	struct array_cache *ac;
832 	unsigned long flags;
833 
834 	for_each_online_node(i) {
835 		ac = l3->alien[i];
836 		if (ac) {
837 			spin_lock_irqsave(&ac->lock, flags);
838 			__drain_alien_cache(cachep, ac, i);
839 			spin_unlock_irqrestore(&ac->lock, flags);
840 		}
841 	}
842 }
843 #else
844 #define alloc_alien_cache(node, limit) do { } while (0)
845 #define free_alien_cache(ac_ptr) do { } while (0)
846 #define drain_alien_cache(cachep, l3) do { } while (0)
847 #endif
848 
849 static int __devinit cpuup_callback(struct notifier_block *nfb,
850 				    unsigned long action, void *hcpu)
851 {
852 	long cpu = (long)hcpu;
853 	kmem_cache_t *cachep;
854 	struct kmem_list3 *l3 = NULL;
855 	int node = cpu_to_node(cpu);
856 	int memsize = sizeof(struct kmem_list3);
857 
858 	switch (action) {
859 	case CPU_UP_PREPARE:
860 		down(&cache_chain_sem);
861 		/* we need to do this right in the beginning since
862 		 * alloc_arraycache's are going to use this list.
863 		 * kmalloc_node allows us to add the slab to the right
864 		 * kmem_list3 and not this cpu's kmem_list3
865 		 */
866 
867 		list_for_each_entry(cachep, &cache_chain, next) {
868 			/* setup the size64 kmemlist for cpu before we can
869 			 * begin anything. Make sure some other cpu on this
870 			 * node has not already allocated this
871 			 */
872 			if (!cachep->nodelists[node]) {
873 				if (!(l3 = kmalloc_node(memsize,
874 							GFP_KERNEL, node)))
875 					goto bad;
876 				kmem_list3_init(l3);
877 				l3->next_reap = jiffies + REAPTIMEOUT_LIST3 +
878 				    ((unsigned long)cachep) % REAPTIMEOUT_LIST3;
879 
880 				cachep->nodelists[node] = l3;
881 			}
882 
883 			spin_lock_irq(&cachep->nodelists[node]->list_lock);
884 			cachep->nodelists[node]->free_limit =
885 			    (1 + nr_cpus_node(node)) *
886 			    cachep->batchcount + cachep->num;
887 			spin_unlock_irq(&cachep->nodelists[node]->list_lock);
888 		}
889 
890 		/* Now we can go ahead with allocating the shared arrays
891 		   & array caches */
892 		list_for_each_entry(cachep, &cache_chain, next) {
893 			struct array_cache *nc;
894 
895 			nc = alloc_arraycache(node, cachep->limit,
896 					      cachep->batchcount);
897 			if (!nc)
898 				goto bad;
899 			cachep->array[cpu] = nc;
900 
901 			l3 = cachep->nodelists[node];
902 			BUG_ON(!l3);
903 			if (!l3->shared) {
904 				if (!(nc = alloc_arraycache(node,
905 							    cachep->shared *
906 							    cachep->batchcount,
907 							    0xbaadf00d)))
908 					goto bad;
909 
910 				/* we are serialised from CPU_DEAD or
911 				   CPU_UP_CANCELLED by the cpucontrol lock */
912 				l3->shared = nc;
913 			}
914 		}
915 		up(&cache_chain_sem);
916 		break;
917 	case CPU_ONLINE:
918 		start_cpu_timer(cpu);
919 		break;
920 #ifdef CONFIG_HOTPLUG_CPU
921 	case CPU_DEAD:
922 		/* fall thru */
923 	case CPU_UP_CANCELED:
924 		down(&cache_chain_sem);
925 
926 		list_for_each_entry(cachep, &cache_chain, next) {
927 			struct array_cache *nc;
928 			cpumask_t mask;
929 
930 			mask = node_to_cpumask(node);
931 			spin_lock_irq(&cachep->spinlock);
932 			/* cpu is dead; no one can alloc from it. */
933 			nc = cachep->array[cpu];
934 			cachep->array[cpu] = NULL;
935 			l3 = cachep->nodelists[node];
936 
937 			if (!l3)
938 				goto unlock_cache;
939 
940 			spin_lock(&l3->list_lock);
941 
942 			/* Free limit for this kmem_list3 */
943 			l3->free_limit -= cachep->batchcount;
944 			if (nc)
945 				free_block(cachep, nc->entry, nc->avail, node);
946 
947 			if (!cpus_empty(mask)) {
948 				spin_unlock(&l3->list_lock);
949 				goto unlock_cache;
950 			}
951 
952 			if (l3->shared) {
953 				free_block(cachep, l3->shared->entry,
954 					   l3->shared->avail, node);
955 				kfree(l3->shared);
956 				l3->shared = NULL;
957 			}
958 			if (l3->alien) {
959 				drain_alien_cache(cachep, l3);
960 				free_alien_cache(l3->alien);
961 				l3->alien = NULL;
962 			}
963 
964 			/* free slabs belonging to this node */
965 			if (__node_shrink(cachep, node)) {
966 				cachep->nodelists[node] = NULL;
967 				spin_unlock(&l3->list_lock);
968 				kfree(l3);
969 			} else {
970 				spin_unlock(&l3->list_lock);
971 			}
972 		      unlock_cache:
973 			spin_unlock_irq(&cachep->spinlock);
974 			kfree(nc);
975 		}
976 		up(&cache_chain_sem);
977 		break;
978 #endif
979 	}
980 	return NOTIFY_OK;
981       bad:
982 	up(&cache_chain_sem);
983 	return NOTIFY_BAD;
984 }
985 
986 static struct notifier_block cpucache_notifier = { &cpuup_callback, NULL, 0 };
987 
988 /*
989  * swap the static kmem_list3 with kmalloced memory
990  */
991 static void init_list(kmem_cache_t *cachep, struct kmem_list3 *list, int nodeid)
992 {
993 	struct kmem_list3 *ptr;
994 
995 	BUG_ON(cachep->nodelists[nodeid] != list);
996 	ptr = kmalloc_node(sizeof(struct kmem_list3), GFP_KERNEL, nodeid);
997 	BUG_ON(!ptr);
998 
999 	local_irq_disable();
1000 	memcpy(ptr, list, sizeof(struct kmem_list3));
1001 	MAKE_ALL_LISTS(cachep, ptr, nodeid);
1002 	cachep->nodelists[nodeid] = ptr;
1003 	local_irq_enable();
1004 }
1005 
1006 /* Initialisation.
1007  * Called after the gfp() functions have been enabled, and before smp_init().
1008  */
1009 void __init kmem_cache_init(void)
1010 {
1011 	size_t left_over;
1012 	struct cache_sizes *sizes;
1013 	struct cache_names *names;
1014 	int i;
1015 
1016 	for (i = 0; i < NUM_INIT_LISTS; i++) {
1017 		kmem_list3_init(&initkmem_list3[i]);
1018 		if (i < MAX_NUMNODES)
1019 			cache_cache.nodelists[i] = NULL;
1020 	}
1021 
1022 	/*
1023 	 * Fragmentation resistance on low memory - only use bigger
1024 	 * page orders on machines with more than 32MB of memory.
1025 	 */
1026 	if (num_physpages > (32 << 20) >> PAGE_SHIFT)
1027 		slab_break_gfp_order = BREAK_GFP_ORDER_HI;
1028 
1029 	/* Bootstrap is tricky, because several objects are allocated
1030 	 * from caches that do not exist yet:
1031 	 * 1) initialize the cache_cache cache: it contains the kmem_cache_t
1032 	 *    structures of all caches, except cache_cache itself: cache_cache
1033 	 *    is statically allocated.
1034 	 *    Initially an __init data area is used for the head array and the
1035 	 *    kmem_list3 structures, it's replaced with a kmalloc allocated
1036 	 *    array at the end of the bootstrap.
1037 	 * 2) Create the first kmalloc cache.
1038 	 *    The kmem_cache_t for the new cache is allocated normally.
1039 	 *    An __init data area is used for the head array.
1040 	 * 3) Create the remaining kmalloc caches, with minimally sized
1041 	 *    head arrays.
1042 	 * 4) Replace the __init data head arrays for cache_cache and the first
1043 	 *    kmalloc cache with kmalloc allocated arrays.
1044 	 * 5) Replace the __init data for kmem_list3 for cache_cache and
1045 	 *    the other cache's with kmalloc allocated memory.
1046 	 * 6) Resize the head arrays of the kmalloc caches to their final sizes.
1047 	 */
1048 
1049 	/* 1) create the cache_cache */
1050 	init_MUTEX(&cache_chain_sem);
1051 	INIT_LIST_HEAD(&cache_chain);
1052 	list_add(&cache_cache.next, &cache_chain);
1053 	cache_cache.colour_off = cache_line_size();
1054 	cache_cache.array[smp_processor_id()] = &initarray_cache.cache;
1055 	cache_cache.nodelists[numa_node_id()] = &initkmem_list3[CACHE_CACHE];
1056 
1057 	cache_cache.objsize = ALIGN(cache_cache.objsize, cache_line_size());
1058 
1059 	cache_estimate(0, cache_cache.objsize, cache_line_size(), 0,
1060 		       &left_over, &cache_cache.num);
1061 	if (!cache_cache.num)
1062 		BUG();
1063 
1064 	cache_cache.colour = left_over / cache_cache.colour_off;
1065 	cache_cache.colour_next = 0;
1066 	cache_cache.slab_size = ALIGN(cache_cache.num * sizeof(kmem_bufctl_t) +
1067 				      sizeof(struct slab), cache_line_size());
1068 
1069 	/* 2+3) create the kmalloc caches */
1070 	sizes = malloc_sizes;
1071 	names = cache_names;
1072 
1073 	/* Initialize the caches that provide memory for the array cache
1074 	 * and the kmem_list3 structures first.
1075 	 * Without this, further allocations will BUG()
1076 	 */
1077 
1078 	sizes[INDEX_AC].cs_cachep = kmem_cache_create(names[INDEX_AC].name,
1079 						      sizes[INDEX_AC].cs_size,
1080 						      ARCH_KMALLOC_MINALIGN,
1081 						      (ARCH_KMALLOC_FLAGS |
1082 						       SLAB_PANIC), NULL, NULL);
1083 
1084 	if (INDEX_AC != INDEX_L3)
1085 		sizes[INDEX_L3].cs_cachep =
1086 		    kmem_cache_create(names[INDEX_L3].name,
1087 				      sizes[INDEX_L3].cs_size,
1088 				      ARCH_KMALLOC_MINALIGN,
1089 				      (ARCH_KMALLOC_FLAGS | SLAB_PANIC), NULL,
1090 				      NULL);
1091 
1092 	while (sizes->cs_size != ULONG_MAX) {
1093 		/*
1094 		 * For performance, all the general caches are L1 aligned.
1095 		 * This should be particularly beneficial on SMP boxes, as it
1096 		 * eliminates "false sharing".
1097 		 * Note for systems short on memory removing the alignment will
1098 		 * allow tighter packing of the smaller caches.
1099 		 */
1100 		if (!sizes->cs_cachep)
1101 			sizes->cs_cachep = kmem_cache_create(names->name,
1102 							     sizes->cs_size,
1103 							     ARCH_KMALLOC_MINALIGN,
1104 							     (ARCH_KMALLOC_FLAGS
1105 							      | SLAB_PANIC),
1106 							     NULL, NULL);
1107 
1108 		/* Inc off-slab bufctl limit until the ceiling is hit. */
1109 		if (!(OFF_SLAB(sizes->cs_cachep))) {
1110 			offslab_limit = sizes->cs_size - sizeof(struct slab);
1111 			offslab_limit /= sizeof(kmem_bufctl_t);
1112 		}
1113 
1114 		sizes->cs_dmacachep = kmem_cache_create(names->name_dma,
1115 							sizes->cs_size,
1116 							ARCH_KMALLOC_MINALIGN,
1117 							(ARCH_KMALLOC_FLAGS |
1118 							 SLAB_CACHE_DMA |
1119 							 SLAB_PANIC), NULL,
1120 							NULL);
1121 
1122 		sizes++;
1123 		names++;
1124 	}
1125 	/* 4) Replace the bootstrap head arrays */
1126 	{
1127 		void *ptr;
1128 
1129 		ptr = kmalloc(sizeof(struct arraycache_init), GFP_KERNEL);
1130 
1131 		local_irq_disable();
1132 		BUG_ON(ac_data(&cache_cache) != &initarray_cache.cache);
1133 		memcpy(ptr, ac_data(&cache_cache),
1134 		       sizeof(struct arraycache_init));
1135 		cache_cache.array[smp_processor_id()] = ptr;
1136 		local_irq_enable();
1137 
1138 		ptr = kmalloc(sizeof(struct arraycache_init), GFP_KERNEL);
1139 
1140 		local_irq_disable();
1141 		BUG_ON(ac_data(malloc_sizes[INDEX_AC].cs_cachep)
1142 		       != &initarray_generic.cache);
1143 		memcpy(ptr, ac_data(malloc_sizes[INDEX_AC].cs_cachep),
1144 		       sizeof(struct arraycache_init));
1145 		malloc_sizes[INDEX_AC].cs_cachep->array[smp_processor_id()] =
1146 		    ptr;
1147 		local_irq_enable();
1148 	}
1149 	/* 5) Replace the bootstrap kmem_list3's */
1150 	{
1151 		int node;
1152 		/* Replace the static kmem_list3 structures for the boot cpu */
1153 		init_list(&cache_cache, &initkmem_list3[CACHE_CACHE],
1154 			  numa_node_id());
1155 
1156 		for_each_online_node(node) {
1157 			init_list(malloc_sizes[INDEX_AC].cs_cachep,
1158 				  &initkmem_list3[SIZE_AC + node], node);
1159 
1160 			if (INDEX_AC != INDEX_L3) {
1161 				init_list(malloc_sizes[INDEX_L3].cs_cachep,
1162 					  &initkmem_list3[SIZE_L3 + node],
1163 					  node);
1164 			}
1165 		}
1166 	}
1167 
1168 	/* 6) resize the head arrays to their final sizes */
1169 	{
1170 		kmem_cache_t *cachep;
1171 		down(&cache_chain_sem);
1172 		list_for_each_entry(cachep, &cache_chain, next)
1173 		    enable_cpucache(cachep);
1174 		up(&cache_chain_sem);
1175 	}
1176 
1177 	/* Done! */
1178 	g_cpucache_up = FULL;
1179 
1180 	/* Register a cpu startup notifier callback
1181 	 * that initializes ac_data for all new cpus
1182 	 */
1183 	register_cpu_notifier(&cpucache_notifier);
1184 
1185 	/* The reap timers are started later, with a module init call:
1186 	 * That part of the kernel is not yet operational.
1187 	 */
1188 }
1189 
1190 static int __init cpucache_init(void)
1191 {
1192 	int cpu;
1193 
1194 	/*
1195 	 * Register the timers that return unneeded
1196 	 * pages to gfp.
1197 	 */
1198 	for_each_online_cpu(cpu)
1199 	    start_cpu_timer(cpu);
1200 
1201 	return 0;
1202 }
1203 
1204 __initcall(cpucache_init);
1205 
1206 /*
1207  * Interface to system's page allocator. No need to hold the cache-lock.
1208  *
1209  * If we requested dmaable memory, we will get it. Even if we
1210  * did not request dmaable memory, we might get it, but that
1211  * would be relatively rare and ignorable.
1212  */
1213 static void *kmem_getpages(kmem_cache_t *cachep, gfp_t flags, int nodeid)
1214 {
1215 	struct page *page;
1216 	void *addr;
1217 	int i;
1218 
1219 	flags |= cachep->gfpflags;
1220 	page = alloc_pages_node(nodeid, flags, cachep->gfporder);
1221 	if (!page)
1222 		return NULL;
1223 	addr = page_address(page);
1224 
1225 	i = (1 << cachep->gfporder);
1226 	if (cachep->flags & SLAB_RECLAIM_ACCOUNT)
1227 		atomic_add(i, &slab_reclaim_pages);
1228 	add_page_state(nr_slab, i);
1229 	while (i--) {
1230 		SetPageSlab(page);
1231 		page++;
1232 	}
1233 	return addr;
1234 }
1235 
1236 /*
1237  * Interface to system's page release.
1238  */
1239 static void kmem_freepages(kmem_cache_t *cachep, void *addr)
1240 {
1241 	unsigned long i = (1 << cachep->gfporder);
1242 	struct page *page = virt_to_page(addr);
1243 	const unsigned long nr_freed = i;
1244 
1245 	while (i--) {
1246 		if (!TestClearPageSlab(page))
1247 			BUG();
1248 		page++;
1249 	}
1250 	sub_page_state(nr_slab, nr_freed);
1251 	if (current->reclaim_state)
1252 		current->reclaim_state->reclaimed_slab += nr_freed;
1253 	free_pages((unsigned long)addr, cachep->gfporder);
1254 	if (cachep->flags & SLAB_RECLAIM_ACCOUNT)
1255 		atomic_sub(1 << cachep->gfporder, &slab_reclaim_pages);
1256 }
1257 
1258 static void kmem_rcu_free(struct rcu_head *head)
1259 {
1260 	struct slab_rcu *slab_rcu = (struct slab_rcu *)head;
1261 	kmem_cache_t *cachep = slab_rcu->cachep;
1262 
1263 	kmem_freepages(cachep, slab_rcu->addr);
1264 	if (OFF_SLAB(cachep))
1265 		kmem_cache_free(cachep->slabp_cache, slab_rcu);
1266 }
1267 
1268 #if DEBUG
1269 
1270 #ifdef CONFIG_DEBUG_PAGEALLOC
1271 static void store_stackinfo(kmem_cache_t *cachep, unsigned long *addr,
1272 			    unsigned long caller)
1273 {
1274 	int size = obj_reallen(cachep);
1275 
1276 	addr = (unsigned long *)&((char *)addr)[obj_dbghead(cachep)];
1277 
1278 	if (size < 5 * sizeof(unsigned long))
1279 		return;
1280 
1281 	*addr++ = 0x12345678;
1282 	*addr++ = caller;
1283 	*addr++ = smp_processor_id();
1284 	size -= 3 * sizeof(unsigned long);
1285 	{
1286 		unsigned long *sptr = &caller;
1287 		unsigned long svalue;
1288 
1289 		while (!kstack_end(sptr)) {
1290 			svalue = *sptr++;
1291 			if (kernel_text_address(svalue)) {
1292 				*addr++ = svalue;
1293 				size -= sizeof(unsigned long);
1294 				if (size <= sizeof(unsigned long))
1295 					break;
1296 			}
1297 		}
1298 
1299 	}
1300 	*addr++ = 0x87654321;
1301 }
1302 #endif
1303 
1304 static void poison_obj(kmem_cache_t *cachep, void *addr, unsigned char val)
1305 {
1306 	int size = obj_reallen(cachep);
1307 	addr = &((char *)addr)[obj_dbghead(cachep)];
1308 
1309 	memset(addr, val, size);
1310 	*(unsigned char *)(addr + size - 1) = POISON_END;
1311 }
1312 
1313 static void dump_line(char *data, int offset, int limit)
1314 {
1315 	int i;
1316 	printk(KERN_ERR "%03x:", offset);
1317 	for (i = 0; i < limit; i++) {
1318 		printk(" %02x", (unsigned char)data[offset + i]);
1319 	}
1320 	printk("\n");
1321 }
1322 #endif
1323 
1324 #if DEBUG
1325 
1326 static void print_objinfo(kmem_cache_t *cachep, void *objp, int lines)
1327 {
1328 	int i, size;
1329 	char *realobj;
1330 
1331 	if (cachep->flags & SLAB_RED_ZONE) {
1332 		printk(KERN_ERR "Redzone: 0x%lx/0x%lx.\n",
1333 		       *dbg_redzone1(cachep, objp),
1334 		       *dbg_redzone2(cachep, objp));
1335 	}
1336 
1337 	if (cachep->flags & SLAB_STORE_USER) {
1338 		printk(KERN_ERR "Last user: [<%p>]",
1339 		       *dbg_userword(cachep, objp));
1340 		print_symbol("(%s)",
1341 			     (unsigned long)*dbg_userword(cachep, objp));
1342 		printk("\n");
1343 	}
1344 	realobj = (char *)objp + obj_dbghead(cachep);
1345 	size = obj_reallen(cachep);
1346 	for (i = 0; i < size && lines; i += 16, lines--) {
1347 		int limit;
1348 		limit = 16;
1349 		if (i + limit > size)
1350 			limit = size - i;
1351 		dump_line(realobj, i, limit);
1352 	}
1353 }
1354 
1355 static void check_poison_obj(kmem_cache_t *cachep, void *objp)
1356 {
1357 	char *realobj;
1358 	int size, i;
1359 	int lines = 0;
1360 
1361 	realobj = (char *)objp + obj_dbghead(cachep);
1362 	size = obj_reallen(cachep);
1363 
1364 	for (i = 0; i < size; i++) {
1365 		char exp = POISON_FREE;
1366 		if (i == size - 1)
1367 			exp = POISON_END;
1368 		if (realobj[i] != exp) {
1369 			int limit;
1370 			/* Mismatch ! */
1371 			/* Print header */
1372 			if (lines == 0) {
1373 				printk(KERN_ERR
1374 				       "Slab corruption: start=%p, len=%d\n",
1375 				       realobj, size);
1376 				print_objinfo(cachep, objp, 0);
1377 			}
1378 			/* Hexdump the affected line */
1379 			i = (i / 16) * 16;
1380 			limit = 16;
1381 			if (i + limit > size)
1382 				limit = size - i;
1383 			dump_line(realobj, i, limit);
1384 			i += 16;
1385 			lines++;
1386 			/* Limit to 5 lines */
1387 			if (lines > 5)
1388 				break;
1389 		}
1390 	}
1391 	if (lines != 0) {
1392 		/* Print some data about the neighboring objects, if they
1393 		 * exist:
1394 		 */
1395 		struct slab *slabp = page_get_slab(virt_to_page(objp));
1396 		int objnr;
1397 
1398 		objnr = (objp - slabp->s_mem) / cachep->objsize;
1399 		if (objnr) {
1400 			objp = slabp->s_mem + (objnr - 1) * cachep->objsize;
1401 			realobj = (char *)objp + obj_dbghead(cachep);
1402 			printk(KERN_ERR "Prev obj: start=%p, len=%d\n",
1403 			       realobj, size);
1404 			print_objinfo(cachep, objp, 2);
1405 		}
1406 		if (objnr + 1 < cachep->num) {
1407 			objp = slabp->s_mem + (objnr + 1) * cachep->objsize;
1408 			realobj = (char *)objp + obj_dbghead(cachep);
1409 			printk(KERN_ERR "Next obj: start=%p, len=%d\n",
1410 			       realobj, size);
1411 			print_objinfo(cachep, objp, 2);
1412 		}
1413 	}
1414 }
1415 #endif
1416 
1417 /* Destroy all the objs in a slab, and release the mem back to the system.
1418  * Before calling, the slab must have been unlinked from the cache.
1419  * The cache-lock is not held/needed.
1420  */
1421 static void slab_destroy(kmem_cache_t *cachep, struct slab *slabp)
1422 {
1423 	void *addr = slabp->s_mem - slabp->colouroff;
1424 
1425 #if DEBUG
1426 	int i;
1427 	for (i = 0; i < cachep->num; i++) {
1428 		void *objp = slabp->s_mem + cachep->objsize * i;
1429 
1430 		if (cachep->flags & SLAB_POISON) {
1431 #ifdef CONFIG_DEBUG_PAGEALLOC
1432 			if ((cachep->objsize % PAGE_SIZE) == 0
1433 			    && OFF_SLAB(cachep))
1434 				kernel_map_pages(virt_to_page(objp),
1435 						 cachep->objsize / PAGE_SIZE,
1436 						 1);
1437 			else
1438 				check_poison_obj(cachep, objp);
1439 #else
1440 			check_poison_obj(cachep, objp);
1441 #endif
1442 		}
1443 		if (cachep->flags & SLAB_RED_ZONE) {
1444 			if (*dbg_redzone1(cachep, objp) != RED_INACTIVE)
1445 				slab_error(cachep, "start of a freed object "
1446 					   "was overwritten");
1447 			if (*dbg_redzone2(cachep, objp) != RED_INACTIVE)
1448 				slab_error(cachep, "end of a freed object "
1449 					   "was overwritten");
1450 		}
1451 		if (cachep->dtor && !(cachep->flags & SLAB_POISON))
1452 			(cachep->dtor) (objp + obj_dbghead(cachep), cachep, 0);
1453 	}
1454 #else
1455 	if (cachep->dtor) {
1456 		int i;
1457 		for (i = 0; i < cachep->num; i++) {
1458 			void *objp = slabp->s_mem + cachep->objsize * i;
1459 			(cachep->dtor) (objp, cachep, 0);
1460 		}
1461 	}
1462 #endif
1463 
1464 	if (unlikely(cachep->flags & SLAB_DESTROY_BY_RCU)) {
1465 		struct slab_rcu *slab_rcu;
1466 
1467 		slab_rcu = (struct slab_rcu *)slabp;
1468 		slab_rcu->cachep = cachep;
1469 		slab_rcu->addr = addr;
1470 		call_rcu(&slab_rcu->head, kmem_rcu_free);
1471 	} else {
1472 		kmem_freepages(cachep, addr);
1473 		if (OFF_SLAB(cachep))
1474 			kmem_cache_free(cachep->slabp_cache, slabp);
1475 	}
1476 }
1477 
1478 /* For setting up all the kmem_list3s for a cache whose objsize is the same
1479    as the size of kmem_list3. */
1480 static inline void set_up_list3s(kmem_cache_t *cachep, int index)
1481 {
1482 	int node;
1483 
1484 	for_each_online_node(node) {
1485 		cachep->nodelists[node] = &initkmem_list3[index + node];
1486 		cachep->nodelists[node]->next_reap = jiffies +
1487 		    REAPTIMEOUT_LIST3 +
1488 		    ((unsigned long)cachep) % REAPTIMEOUT_LIST3;
1489 	}
1490 }
1491 
1492 /**
1493  * calculate_slab_order - calculate size (page order) of slabs and the number
1494  *                        of objects per slab.
1495  *
1496  * This could be made much more intelligent.  For now, try to avoid using
1497  * high order pages for slabs.  When the gfp() functions are more friendly
1498  * towards high-order requests, this should be changed.
1499  */
1500 static inline size_t calculate_slab_order(kmem_cache_t *cachep, size_t size,
1501 					  size_t align, gfp_t flags)
1502 {
1503 	size_t left_over = 0;
1504 
1505 	for (;; cachep->gfporder++) {
1506 		unsigned int num;
1507 		size_t remainder;
1508 
1509 		if (cachep->gfporder > MAX_GFP_ORDER) {
1510 			cachep->num = 0;
1511 			break;
1512 		}
1513 
1514 		cache_estimate(cachep->gfporder, size, align, flags,
1515 			       &remainder, &num);
1516 		if (!num)
1517 			continue;
1518 		/* More than offslab_limit objects will cause problems */
1519 		if (flags & CFLGS_OFF_SLAB && cachep->num > offslab_limit)
1520 			break;
1521 
1522 		cachep->num = num;
1523 		left_over = remainder;
1524 
1525 		/*
1526 		 * Large number of objects is good, but very large slabs are
1527 		 * currently bad for the gfp()s.
1528 		 */
1529 		if (cachep->gfporder >= slab_break_gfp_order)
1530 			break;
1531 
1532 		if ((left_over * 8) <= (PAGE_SIZE << cachep->gfporder))
1533 			/* Acceptable internal fragmentation */
1534 			break;
1535 	}
1536 	return left_over;
1537 }
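/*
 * Worked example for a hypothetical 2100-byte object with off-slab
 * management (so cache_estimate() uses base == extra == 0), 4 kB pages and
 * slab_break_gfp_order == BREAK_GFP_ORDER_HI:
 *
 *	order 0: num = 1, left_over = 4096 - 2100 = 1996;
 *		 1996*8 > 4096, so try the next order
 *	order 1: num = 3, left_over = 8192 - 6300 = 1892;
 *		 gfporder >= slab_break_gfp_order, so stop here
 *
 * i.e. two pages per slab, holding three objects each.
 */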
1538 
1539 /**
1540  * kmem_cache_create - Create a cache.
1541  * @name: A string which is used in /proc/slabinfo to identify this cache.
1542  * @size: The size of objects to be created in this cache.
1543  * @align: The required alignment for the objects.
1544  * @flags: SLAB flags
1545  * @ctor: A constructor for the objects.
1546  * @dtor: A destructor for the objects.
1547  *
1548  * Returns a ptr to the cache on success, NULL on failure.
1549  * Cannot be called within an interrupt, but can be interrupted.
1550  * The @ctor is run when new pages are allocated by the cache
1551  * and the @dtor is run before the pages are handed back.
1552  *
1553  * @name must be valid until the cache is destroyed. This implies that
1554  * the module calling this has to destroy the cache before getting
1555  * unloaded.
1556  *
1557  * The flags are
1558  *
1559  * %SLAB_POISON - Poison the slab with a known test pattern (a5a5a5a5)
1560  * to catch references to uninitialised memory.
1561  *
1562  * %SLAB_RED_ZONE - Insert `Red' zones around the allocated memory to check
1563  * for buffer overruns.
1564  *
1565  * %SLAB_NO_REAP - Don't automatically reap this cache when we're under
1566  * memory pressure.
1567  *
1568  * %SLAB_HWCACHE_ALIGN - Align the objects in this cache to a hardware
1569  * cacheline.  This can be beneficial if you're counting cycles as closely
1570  * as davem.
1571  */
1572 kmem_cache_t *
1573 kmem_cache_create (const char *name, size_t size, size_t align,
1574 	unsigned long flags, void (*ctor)(void*, kmem_cache_t *, unsigned long),
1575 	void (*dtor)(void*, kmem_cache_t *, unsigned long))
1576 {
1577 	size_t left_over, slab_size, ralign;
1578 	kmem_cache_t *cachep = NULL;
1579 	struct list_head *p;
1580 
1581 	/*
1582 	 * Sanity checks... these are all serious usage bugs.
1583 	 */
1584 	if ((!name) ||
1585 	    in_interrupt() ||
1586 	    (size < BYTES_PER_WORD) ||
1587 	    (size > (1 << MAX_OBJ_ORDER) * PAGE_SIZE) || (dtor && !ctor)) {
1588 		printk(KERN_ERR "%s: Early error in slab %s\n",
1589 		       __FUNCTION__, name);
1590 		BUG();
1591 	}
1592 
1593 	down(&cache_chain_sem);
1594 
1595 	list_for_each(p, &cache_chain) {
1596 		kmem_cache_t *pc = list_entry(p, kmem_cache_t, next);
1597 		mm_segment_t old_fs = get_fs();
1598 		char tmp;
1599 		int res;
1600 
1601 		/*
1602 		 * This happens when the module gets unloaded and doesn't
1603 		 * destroy its slab cache and no-one else reuses the vmalloc
1604 		 * area of the module.  Print a warning.
1605 		 */
1606 		set_fs(KERNEL_DS);
1607 		res = __get_user(tmp, pc->name);
1608 		set_fs(old_fs);
1609 		if (res) {
1610 			printk("SLAB: cache with size %d has lost its name\n",
1611 			       pc->objsize);
1612 			continue;
1613 		}
1614 
1615 		if (!strcmp(pc->name, name)) {
1616 			printk("kmem_cache_create: duplicate cache %s\n", name);
1617 			dump_stack();
1618 			goto oops;
1619 		}
1620 	}
1621 
1622 #if DEBUG
1623 	WARN_ON(strchr(name, ' '));	/* It confuses parsers */
1624 	if ((flags & SLAB_DEBUG_INITIAL) && !ctor) {
1625 		/* No constructor, but initial state check requested */
1626 		printk(KERN_ERR "%s: No con, but init state check "
1627 		       "requested - %s\n", __FUNCTION__, name);
1628 		flags &= ~SLAB_DEBUG_INITIAL;
1629 	}
1630 #if FORCED_DEBUG
1631 	/*
1632 	 * Enable redzoning and last user accounting, except for caches with
1633 	 * large objects, if the increased size would increase the object size
1634 	 * above the next power of two: caches with object sizes just above a
1635 	 * power of two have a significant amount of internal fragmentation.
1636 	 */
1637 	if ((size < 4096
1638 	     || fls(size - 1) == fls(size - 1 + 3 * BYTES_PER_WORD)))
1639 		flags |= SLAB_RED_ZONE | SLAB_STORE_USER;
1640 	if (!(flags & SLAB_DESTROY_BY_RCU))
1641 		flags |= SLAB_POISON;
1642 #endif
1643 	if (flags & SLAB_DESTROY_BY_RCU)
1644 		BUG_ON(flags & SLAB_POISON);
1645 #endif
1646 	if (flags & SLAB_DESTROY_BY_RCU)
1647 		BUG_ON(dtor);
1648 
1649 	/*
1650 	 * Always check flags; a caller might be expecting debug
1651 	 * support which isn't available.
1652 	 */
1653 	if (flags & ~CREATE_MASK)
1654 		BUG();
1655 
1656 	/* Check that size is in terms of words.  This is needed to avoid
1657 	 * unaligned accesses for some archs when redzoning is used, and makes
1658 	 * sure any on-slab bufctl's are also correctly aligned.
1659 	 */
1660 	if (size & (BYTES_PER_WORD - 1)) {
1661 		size += (BYTES_PER_WORD - 1);
1662 		size &= ~(BYTES_PER_WORD - 1);
1663 	}
1664 
1665 	/* calculate the final buffer alignment: */
1666 	/* 1) arch recommendation: can be overridden for debug */
1667 	if (flags & SLAB_HWCACHE_ALIGN) {
1668 		/* Default alignment: as specified by the arch code.
1669 		 * Except if an object is really small, then squeeze multiple
1670 		 * objects into one cacheline.
1671 		 */
1672 		ralign = cache_line_size();
1673 		while (size <= ralign / 2)
1674 			ralign /= 2;
1675 	} else {
1676 		ralign = BYTES_PER_WORD;
1677 	}
1678 	/* 2) arch mandated alignment: disables debug if necessary */
1679 	if (ralign < ARCH_SLAB_MINALIGN) {
1680 		ralign = ARCH_SLAB_MINALIGN;
1681 		if (ralign > BYTES_PER_WORD)
1682 			flags &= ~(SLAB_RED_ZONE | SLAB_STORE_USER);
1683 	}
1684 	/* 3) caller mandated alignment: disables debug if necessary */
1685 	if (ralign < align) {
1686 		ralign = align;
1687 		if (ralign > BYTES_PER_WORD)
1688 			flags &= ~(SLAB_RED_ZONE | SLAB_STORE_USER);
1689 	}
1690 	/* 4) Store it. Note that the debug code below can reduce
1691 	 *    the alignment to BYTES_PER_WORD.
1692 	 */
1693 	align = ralign;
1694 
1695 	/* Get cache's description obj. */
1696 	cachep = (kmem_cache_t *) kmem_cache_alloc(&cache_cache, SLAB_KERNEL);
1697 	if (!cachep)
1698 		goto oops;
1699 	memset(cachep, 0, sizeof(kmem_cache_t));
1700 
1701 #if DEBUG
1702 	cachep->reallen = size;
1703 
1704 	if (flags & SLAB_RED_ZONE) {
1705 		/* redzoning only works with word aligned caches */
1706 		align = BYTES_PER_WORD;
1707 
1708 		/* add space for red zone words */
1709 		cachep->dbghead += BYTES_PER_WORD;
1710 		size += 2 * BYTES_PER_WORD;
1711 	}
1712 	if (flags & SLAB_STORE_USER) {
1713 		/* user store requires word alignment and
1714 		 * one word storage behind the end of the real
1715 		 * object.
1716 		 */
1717 		align = BYTES_PER_WORD;
1718 		size += BYTES_PER_WORD;
1719 	}
1720 #if FORCED_DEBUG && defined(CONFIG_DEBUG_PAGEALLOC)
1721 	if (size >= malloc_sizes[INDEX_L3 + 1].cs_size
1722 	    && cachep->reallen > cache_line_size() && size < PAGE_SIZE) {
1723 		cachep->dbghead += PAGE_SIZE - size;
1724 		size = PAGE_SIZE;
1725 	}
1726 #endif
1727 #endif
1728 
1729 	/* Determine if the slab management is 'on' or 'off' slab. */
1730 	if (size >= (PAGE_SIZE >> 3))
1731 		/*
1732 		 * Size is large, assume best to place the slab management obj
1733 		 * off-slab (should allow better packing of objs).
1734 		 */
1735 		flags |= CFLGS_OFF_SLAB;
1736 
1737 	size = ALIGN(size, align);
1738 
1739 	if ((flags & SLAB_RECLAIM_ACCOUNT) && size <= PAGE_SIZE) {
1740 		/*
1741 		 * A VFS-reclaimable slab tends to have most allocations
1742 		 * as GFP_NOFS and we really don't want to have to be allocating
1743 		 * higher-order pages when we are unable to shrink dcache.
1744 		 */
1745 		cachep->gfporder = 0;
1746 		cache_estimate(cachep->gfporder, size, align, flags,
1747 			       &left_over, &cachep->num);
1748 	} else
1749 		left_over = calculate_slab_order(cachep, size, align, flags);
1750 
1751 	if (!cachep->num) {
1752 		printk(KERN_ERR "kmem_cache_create: couldn't create cache %s.\n", name);
1753 		kmem_cache_free(&cache_cache, cachep);
1754 		cachep = NULL;
1755 		goto oops;
1756 	}
1757 	slab_size = ALIGN(cachep->num * sizeof(kmem_bufctl_t)
1758 			  + sizeof(struct slab), align);
1759 
1760 	/*
1761 	 * If the slab has been placed off-slab, and we have enough space then
1762 	 * move it on-slab. This is at the expense of any extra colouring.
1763 	 */
1764 	if (flags & CFLGS_OFF_SLAB && left_over >= slab_size) {
1765 		flags &= ~CFLGS_OFF_SLAB;
1766 		left_over -= slab_size;
1767 	}
1768 
1769 	if (flags & CFLGS_OFF_SLAB) {
1770 		/* really off slab. No need for manual alignment */
1771 		slab_size = cachep->num * sizeof(kmem_bufctl_t) +
1772 			    sizeof(struct slab);
1773 	}
1774 
1775 	cachep->colour_off = cache_line_size();
1776 	/* Offset must be a multiple of the alignment. */
1777 	if (cachep->colour_off < align)
1778 		cachep->colour_off = align;
1779 	cachep->colour = left_over / cachep->colour_off;
1780 	cachep->slab_size = slab_size;
1781 	cachep->flags = flags;
1782 	cachep->gfpflags = 0;
1783 	if (flags & SLAB_CACHE_DMA)
1784 		cachep->gfpflags |= GFP_DMA;
1785 	spin_lock_init(&cachep->spinlock);
1786 	cachep->objsize = size;
1787 
1788 	if (flags & CFLGS_OFF_SLAB)
1789 		cachep->slabp_cache = kmem_find_general_cachep(slab_size, 0u);
1790 	cachep->ctor = ctor;
1791 	cachep->dtor = dtor;
1792 	cachep->name = name;
1793 
1794 	/* Don't let CPUs come and go */
1795 	lock_cpu_hotplug();
1796 
1797 	if (g_cpucache_up == FULL) {
1798 		enable_cpucache(cachep);
1799 	} else {
1800 		if (g_cpucache_up == NONE) {
1801 			/* Note: the first kmem_cache_create must create
1802 			 * the cache that's used by kmalloc(24), otherwise
1803 			 * the creation of further caches will BUG().
1804 			 */
1805 			cachep->array[smp_processor_id()] =
1806 			    &initarray_generic.cache;
1807 
1808 			/* If the cache that's used by
1809 			 * kmalloc(sizeof(kmem_list3)) is the first cache,
1810 			 * then we need to set up all its list3s, otherwise
1811 			 * the creation of further caches will BUG().
1812 			 */
1813 			set_up_list3s(cachep, SIZE_AC);
1814 			if (INDEX_AC == INDEX_L3)
1815 				g_cpucache_up = PARTIAL_L3;
1816 			else
1817 				g_cpucache_up = PARTIAL_AC;
1818 		} else {
1819 			cachep->array[smp_processor_id()] =
1820 			    kmalloc(sizeof(struct arraycache_init), GFP_KERNEL);
1821 
1822 			if (g_cpucache_up == PARTIAL_AC) {
1823 				set_up_list3s(cachep, SIZE_L3);
1824 				g_cpucache_up = PARTIAL_L3;
1825 			} else {
1826 				int node;
1827 				for_each_online_node(node) {
1828 
1829 					cachep->nodelists[node] =
1830 					    kmalloc_node(sizeof(struct kmem_list3),
1831 							 GFP_KERNEL, node);
1832 					BUG_ON(!cachep->nodelists[node]);
1833 					kmem_list3_init(cachep->nodelists[node]);
1837 			}
1838 		}
1839 		cachep->nodelists[numa_node_id()]->next_reap =
1840 		    jiffies + REAPTIMEOUT_LIST3 +
1841 		    ((unsigned long)cachep) % REAPTIMEOUT_LIST3;
1842 
1843 		BUG_ON(!ac_data(cachep));
1844 		ac_data(cachep)->avail = 0;
1845 		ac_data(cachep)->limit = BOOT_CPUCACHE_ENTRIES;
1846 		ac_data(cachep)->batchcount = 1;
1847 		ac_data(cachep)->touched = 0;
1848 		cachep->batchcount = 1;
1849 		cachep->limit = BOOT_CPUCACHE_ENTRIES;
1850 	}
1851 
1852 	/* cache setup completed, link it into the list */
1853 	list_add(&cachep->next, &cache_chain);
1854 	unlock_cpu_hotplug();
1855       oops:
1856 	if (!cachep && (flags & SLAB_PANIC))
1857 		panic("kmem_cache_create(): failed to create slab `%s'\n",
1858 		      name);
1859 	up(&cache_chain_sem);
1860 	return cachep;
1861 }
1862 EXPORT_SYMBOL(kmem_cache_create);
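
/*
 * Illustrative sketch of typical kmem_cache_create() usage; the
 * 'struct foo' type and 'foo_cache' variable below are hypothetical:
 *
 *	static kmem_cache_t *foo_cache;
 *
 *	foo_cache = kmem_cache_create("foo", sizeof(struct foo), 0,
 *				      SLAB_HWCACHE_ALIGN, NULL, NULL);
 *	if (!foo_cache)
 *		return -ENOMEM;
 */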
1863 
1864 #if DEBUG
1865 static void check_irq_off(void)
1866 {
1867 	BUG_ON(!irqs_disabled());
1868 }
1869 
1870 static void check_irq_on(void)
1871 {
1872 	BUG_ON(irqs_disabled());
1873 }
1874 
1875 static void check_spinlock_acquired(kmem_cache_t *cachep)
1876 {
1877 #ifdef CONFIG_SMP
1878 	check_irq_off();
1879 	assert_spin_locked(&cachep->nodelists[numa_node_id()]->list_lock);
1880 #endif
1881 }
1882 
1883 static inline void check_spinlock_acquired_node(kmem_cache_t *cachep, int node)
1884 {
1885 #ifdef CONFIG_SMP
1886 	check_irq_off();
1887 	assert_spin_locked(&cachep->nodelists[node]->list_lock);
1888 #endif
1889 }
1890 
1891 #else
1892 #define check_irq_off()	do { } while(0)
1893 #define check_irq_on()	do { } while(0)
1894 #define check_spinlock_acquired(x) do { } while(0)
1895 #define check_spinlock_acquired_node(x, y) do { } while(0)
1896 #endif
1897 
1898 /*
1899  * Waits for all CPUs to execute func().
1900  */
1901 static void smp_call_function_all_cpus(void (*func)(void *arg), void *arg)
1902 {
1903 	check_irq_on();
1904 	preempt_disable();
1905 
1906 	local_irq_disable();
1907 	func(arg);
1908 	local_irq_enable();
1909 
1910 	if (smp_call_function(func, arg, 1, 1))
1911 		BUG();
1912 
1913 	preempt_enable();
1914 }
1915 
1916 static void drain_array_locked(kmem_cache_t *cachep, struct array_cache *ac,
1917 				int force, int node);
1918 
1919 static void do_drain(void *arg)
1920 {
1921 	kmem_cache_t *cachep = (kmem_cache_t *) arg;
1922 	struct array_cache *ac;
1923 	int node = numa_node_id();
1924 
1925 	check_irq_off();
1926 	ac = ac_data(cachep);
1927 	spin_lock(&cachep->nodelists[node]->list_lock);
1928 	free_block(cachep, ac->entry, ac->avail, node);
1929 	spin_unlock(&cachep->nodelists[node]->list_lock);
1930 	ac->avail = 0;
1931 }
1932 
1933 static void drain_cpu_caches(kmem_cache_t *cachep)
1934 {
1935 	struct kmem_list3 *l3;
1936 	int node;
1937 
1938 	smp_call_function_all_cpus(do_drain, cachep);
1939 	check_irq_on();
1940 	spin_lock_irq(&cachep->spinlock);
1941 	for_each_online_node(node) {
1942 		l3 = cachep->nodelists[node];
1943 		if (l3) {
1944 			spin_lock(&l3->list_lock);
1945 			drain_array_locked(cachep, l3->shared, 1, node);
1946 			spin_unlock(&l3->list_lock);
1947 			if (l3->alien)
1948 				drain_alien_cache(cachep, l3);
1949 		}
1950 	}
1951 	spin_unlock_irq(&cachep->spinlock);
1952 }
1953 
1954 static int __node_shrink(kmem_cache_t *cachep, int node)
1955 {
1956 	struct slab *slabp;
1957 	struct kmem_list3 *l3 = cachep->nodelists[node];
1958 	int ret;
1959 
1960 	for (;;) {
1961 		struct list_head *p;
1962 
1963 		p = l3->slabs_free.prev;
1964 		if (p == &l3->slabs_free)
1965 			break;
1966 
1967 		slabp = list_entry(l3->slabs_free.prev, struct slab, list);
1968 #if DEBUG
1969 		if (slabp->inuse)
1970 			BUG();
1971 #endif
1972 		list_del(&slabp->list);
1973 
1974 		l3->free_objects -= cachep->num;
1975 		spin_unlock_irq(&l3->list_lock);
1976 		slab_destroy(cachep, slabp);
1977 		spin_lock_irq(&l3->list_lock);
1978 	}
1979 	ret = !list_empty(&l3->slabs_full) || !list_empty(&l3->slabs_partial);
1980 	return ret;
1981 }
1982 
1983 static int __cache_shrink(kmem_cache_t *cachep)
1984 {
1985 	int ret = 0, i = 0;
1986 	struct kmem_list3 *l3;
1987 
1988 	drain_cpu_caches(cachep);
1989 
1990 	check_irq_on();
1991 	for_each_online_node(i) {
1992 		l3 = cachep->nodelists[i];
1993 		if (l3) {
1994 			spin_lock_irq(&l3->list_lock);
1995 			ret += __node_shrink(cachep, i);
1996 			spin_unlock_irq(&l3->list_lock);
1997 		}
1998 	}
1999 	return (ret ? 1 : 0);
2000 }
2001 
2002 /**
2003  * kmem_cache_shrink - Shrink a cache.
2004  * @cachep: The cache to shrink.
2005  *
2006  * Releases as many slabs as possible for a cache.
2007  * To help debugging, a zero exit status indicates all slabs were released.
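 *
 * A minimal sketch of how a caller might use the return value (the
 * 'foo_cache' variable is hypothetical):
 *
 *	if (kmem_cache_shrink(foo_cache) != 0)
 *		printk(KERN_INFO "foo_cache: some slabs still in use\n");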
2008  */
2009 int kmem_cache_shrink(kmem_cache_t *cachep)
2010 {
2011 	BUG_ON(!cachep || in_interrupt());
2013 
2014 	return __cache_shrink(cachep);
2015 }
2016 EXPORT_SYMBOL(kmem_cache_shrink);
2017 
2018 /**
2019  * kmem_cache_destroy - delete a cache
2020  * @cachep: the cache to destroy
2021  *
2022  * Remove a kmem_cache_t object from the slab cache.
2023  * Returns 0 on success.
2024  *
2025  * It is expected this function will be called by a module when it is
2026  * unloaded.  This will remove the cache completely, and avoid a duplicate
2027  * cache being allocated each time a module is loaded and unloaded, if the
2028  * module doesn't have persistent in-kernel storage across loads and unloads.
2029  *
2030  * The cache must be empty before calling this function.
2031  *
2032  * The caller must guarantee that no one will allocate memory from the cache
2033  * during the kmem_cache_destroy().
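 *
 * A minimal module-exit sketch (the 'foo_cache' variable is hypothetical):
 *
 *	if (kmem_cache_destroy(foo_cache))
 *		printk(KERN_ERR "foo_cache: objects still allocated\n");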
2034  */
2035 int kmem_cache_destroy(kmem_cache_t *cachep)
2036 {
2037 	int i;
2038 	struct kmem_list3 *l3;
2039 
2040 	BUG_ON(!cachep || in_interrupt());
2042 
2043 	/* Don't let CPUs come and go */
2044 	lock_cpu_hotplug();
2045 
2046 	/* Find the cache in the chain of caches. */
2047 	down(&cache_chain_sem);
2048 	/*
2049 	 * the chain is never empty, cache_cache is never destroyed
2050 	 */
2051 	list_del(&cachep->next);
2052 	up(&cache_chain_sem);
2053 
2054 	if (__cache_shrink(cachep)) {
2055 		slab_error(cachep, "Can't free all objects");
2056 		down(&cache_chain_sem);
2057 		list_add(&cachep->next, &cache_chain);
2058 		up(&cache_chain_sem);
2059 		unlock_cpu_hotplug();
2060 		return 1;
2061 	}
2062 
2063 	if (unlikely(cachep->flags & SLAB_DESTROY_BY_RCU))
2064 		synchronize_rcu();
2065 
2066 	for_each_online_cpu(i)
2067 	    kfree(cachep->array[i]);
2068 
2069 	/* NUMA: free the list3 structures */
2070 	for_each_online_node(i) {
2071 		if ((l3 = cachep->nodelists[i])) {
2072 			kfree(l3->shared);
2073 			free_alien_cache(l3->alien);
2074 			kfree(l3);
2075 		}
2076 	}
2077 	kmem_cache_free(&cache_cache, cachep);
2078 
2079 	unlock_cpu_hotplug();
2080 
2081 	return 0;
2082 }
2083 EXPORT_SYMBOL(kmem_cache_destroy);
2084 
2085 /* Get the memory for a slab management obj. */
2086 static struct slab *alloc_slabmgmt(kmem_cache_t *cachep, void *objp,
2087 				   int colour_off, gfp_t local_flags)
2088 {
2089 	struct slab *slabp;
2090 
2091 	if (OFF_SLAB(cachep)) {
2092 		/* Slab management obj is off-slab. */
2093 		slabp = kmem_cache_alloc(cachep->slabp_cache, local_flags);
2094 		if (!slabp)
2095 			return NULL;
2096 	} else {
2097 		slabp = objp + colour_off;
2098 		colour_off += cachep->slab_size;
2099 	}
2100 	slabp->inuse = 0;
2101 	slabp->colouroff = colour_off;
2102 	slabp->s_mem = objp + colour_off;
2103 
2104 	return slabp;
2105 }
2106 
2107 static inline kmem_bufctl_t *slab_bufctl(struct slab *slabp)
2108 {
2109 	return (kmem_bufctl_t *) (slabp + 1);
2110 }
2111 
2112 static void cache_init_objs(kmem_cache_t *cachep,
2113 			    struct slab *slabp, unsigned long ctor_flags)
2114 {
2115 	int i;
2116 
2117 	for (i = 0; i < cachep->num; i++) {
2118 		void *objp = slabp->s_mem + cachep->objsize * i;
2119 #if DEBUG
2120 		/* need to poison the objs? */
2121 		if (cachep->flags & SLAB_POISON)
2122 			poison_obj(cachep, objp, POISON_FREE);
2123 		if (cachep->flags & SLAB_STORE_USER)
2124 			*dbg_userword(cachep, objp) = NULL;
2125 
2126 		if (cachep->flags & SLAB_RED_ZONE) {
2127 			*dbg_redzone1(cachep, objp) = RED_INACTIVE;
2128 			*dbg_redzone2(cachep, objp) = RED_INACTIVE;
2129 		}
2130 		/*
2131 		 * Constructors are not allowed to allocate memory from the
2132 		 * same cache which they are a constructor for - otherwise
2133 		 * they would deadlock. They must also be threaded.
2134 		 */
2135 		if (cachep->ctor && !(cachep->flags & SLAB_POISON))
2136 			cachep->ctor(objp + obj_dbghead(cachep), cachep,
2137 				     ctor_flags);
2138 
2139 		if (cachep->flags & SLAB_RED_ZONE) {
2140 			if (*dbg_redzone2(cachep, objp) != RED_INACTIVE)
2141 				slab_error(cachep, "constructor overwrote the"
2142 					   " end of an object");
2143 			if (*dbg_redzone1(cachep, objp) != RED_INACTIVE)
2144 				slab_error(cachep, "constructor overwrote the"
2145 					   " start of an object");
2146 		}
2147 		if ((cachep->objsize % PAGE_SIZE) == 0 && OFF_SLAB(cachep)
2148 		    && cachep->flags & SLAB_POISON)
2149 			kernel_map_pages(virt_to_page(objp),
2150 					 cachep->objsize / PAGE_SIZE, 0);
2151 #else
2152 		if (cachep->ctor)
2153 			cachep->ctor(objp, cachep, ctor_flags);
2154 #endif
2155 		slab_bufctl(slabp)[i] = i + 1;
2156 	}
2157 	slab_bufctl(slabp)[i - 1] = BUFCTL_END;
2158 	slabp->free = 0;
2159 }
2160 
2161 static void kmem_flagcheck(kmem_cache_t *cachep, gfp_t flags)
2162 {
2163 	if (flags & SLAB_DMA) {
2164 		if (!(cachep->gfpflags & GFP_DMA))
2165 			BUG();
2166 	} else {
2167 		if (cachep->gfpflags & GFP_DMA)
2168 			BUG();
2169 	}
2170 }
2171 
2172 static void set_slab_attr(kmem_cache_t *cachep, struct slab *slabp, void *objp)
2173 {
2174 	int i;
2175 	struct page *page;
2176 
2177 	/* Nasty!!!!!! I hope this is OK. */
2178 	i = 1 << cachep->gfporder;
2179 	page = virt_to_page(objp);
2180 	do {
2181 		page_set_cache(page, cachep);
2182 		page_set_slab(page, slabp);
2183 		page++;
2184 	} while (--i);
2185 }
2186 
2187 /*
2188  * Grow (by 1) the number of slabs within a cache.  This is called by
2189  * kmem_cache_alloc() when there are no active objs left in a cache.
2190  */
2191 static int cache_grow(kmem_cache_t *cachep, gfp_t flags, int nodeid)
2192 {
2193 	struct slab *slabp;
2194 	void *objp;
2195 	size_t offset;
2196 	gfp_t local_flags;
2197 	unsigned long ctor_flags;
2198 	struct kmem_list3 *l3;
2199 
2200 	/* Be lazy and only check for valid flags here,
2201 	 * keeping it out of the critical path in kmem_cache_alloc().
2202 	 */
2203 	BUG_ON(flags & ~(SLAB_DMA | SLAB_LEVEL_MASK | SLAB_NO_GROW));
2205 	if (flags & SLAB_NO_GROW)
2206 		return 0;
2207 
2208 	ctor_flags = SLAB_CTOR_CONSTRUCTOR;
2209 	local_flags = (flags & SLAB_LEVEL_MASK);
2210 	if (!(local_flags & __GFP_WAIT))
2211 		/*
2212 		 * Not allowed to sleep.  Need to tell a constructor about
2213 		 * this - it might need to know...
2214 		 */
2215 		ctor_flags |= SLAB_CTOR_ATOMIC;
2216 
2217 	/* About to mess with non-constant members - lock. */
2218 	check_irq_off();
2219 	spin_lock(&cachep->spinlock);
2220 
2221 	/* Get the colour for this slab, and calculate the next value. */
2222 	offset = cachep->colour_next;
2223 	cachep->colour_next++;
2224 	if (cachep->colour_next >= cachep->colour)
2225 		cachep->colour_next = 0;
2226 	offset *= cachep->colour_off;
2227 
2228 	spin_unlock(&cachep->spinlock);
2229 
2230 	check_irq_off();
2231 	if (local_flags & __GFP_WAIT)
2232 		local_irq_enable();
2233 
2234 	/*
2235 	 * The test for missing atomic flag is performed here, rather than
2236 	 * the more obvious place, simply to reduce the critical path length
2237 	 * in kmem_cache_alloc(). If a caller is seriously mis-behaving they
2238 	 * will eventually be caught here (where it matters).
2239 	 */
2240 	kmem_flagcheck(cachep, flags);
2241 
2242 	/* Get memory for the objects.
2243 	 * Attempt to allocate a physical page from 'nodeid'.
2244 	 */
2245 	if (!(objp = kmem_getpages(cachep, flags, nodeid)))
2246 		goto failed;
2247 
2248 	/* Get slab management. */
2249 	if (!(slabp = alloc_slabmgmt(cachep, objp, offset, local_flags)))
2250 		goto opps1;
2251 
2252 	slabp->nodeid = nodeid;
2253 	set_slab_attr(cachep, slabp, objp);
2254 
2255 	cache_init_objs(cachep, slabp, ctor_flags);
2256 
2257 	if (local_flags & __GFP_WAIT)
2258 		local_irq_disable();
2259 	check_irq_off();
2260 	l3 = cachep->nodelists[nodeid];
2261 	spin_lock(&l3->list_lock);
2262 
2263 	/* Make slab active. */
2264 	list_add_tail(&slabp->list, &(l3->slabs_free));
2265 	STATS_INC_GROWN(cachep);
2266 	l3->free_objects += cachep->num;
2267 	spin_unlock(&l3->list_lock);
2268 	return 1;
2269       opps1:
2270 	kmem_freepages(cachep, objp);
2271       failed:
2272 	if (local_flags & __GFP_WAIT)
2273 		local_irq_disable();
2274 	return 0;
2275 }
2276 
2277 #if DEBUG
2278 
2279 /*
2280  * Perform extra freeing checks:
2281  * - detect bad pointers.
2282  * - POISON/RED_ZONE checking
2283  * - destructor calls, for caches with POISON+dtor
2284  */
2285 static void kfree_debugcheck(const void *objp)
2286 {
2287 	struct page *page;
2288 
2289 	if (!virt_addr_valid(objp)) {
2290 		printk(KERN_ERR "kfree_debugcheck: out of range ptr %lxh.\n",
2291 		       (unsigned long)objp);
2292 		BUG();
2293 	}
2294 	page = virt_to_page(objp);
2295 	if (!PageSlab(page)) {
2296 		printk(KERN_ERR "kfree_debugcheck: bad ptr %lxh.\n",
2297 		       (unsigned long)objp);
2298 		BUG();
2299 	}
2300 }
2301 
2302 static void *cache_free_debugcheck(kmem_cache_t *cachep, void *objp,
2303 				   void *caller)
2304 {
2305 	struct page *page;
2306 	unsigned int objnr;
2307 	struct slab *slabp;
2308 
2309 	objp -= obj_dbghead(cachep);
2310 	kfree_debugcheck(objp);
2311 	page = virt_to_page(objp);
2312 
2313 	if (page_get_cache(page) != cachep) {
2314 		printk(KERN_ERR
2315 		       "mismatch in kmem_cache_free: expected cache %p, got %p\n",
2316 		       page_get_cache(page), cachep);
2317 		printk(KERN_ERR "%p is %s.\n", cachep, cachep->name);
2318 		printk(KERN_ERR "%p is %s.\n", page_get_cache(page),
2319 		       page_get_cache(page)->name);
2320 		WARN_ON(1);
2321 	}
2322 	slabp = page_get_slab(page);
2323 
2324 	if (cachep->flags & SLAB_RED_ZONE) {
2325 		if (*dbg_redzone1(cachep, objp) != RED_ACTIVE
2326 		    || *dbg_redzone2(cachep, objp) != RED_ACTIVE) {
2327 			slab_error(cachep,
2328 				   "double free, or memory outside"
2329 				   " object was overwritten");
2330 			printk(KERN_ERR
2331 			       "%p: redzone 1: 0x%lx, redzone 2: 0x%lx.\n",
2332 			       objp, *dbg_redzone1(cachep, objp),
2333 			       *dbg_redzone2(cachep, objp));
2334 		}
2335 		*dbg_redzone1(cachep, objp) = RED_INACTIVE;
2336 		*dbg_redzone2(cachep, objp) = RED_INACTIVE;
2337 	}
2338 	if (cachep->flags & SLAB_STORE_USER)
2339 		*dbg_userword(cachep, objp) = caller;
2340 
2341 	objnr = (objp - slabp->s_mem) / cachep->objsize;
2342 
2343 	BUG_ON(objnr >= cachep->num);
2344 	BUG_ON(objp != slabp->s_mem + objnr * cachep->objsize);
2345 
2346 	if (cachep->flags & SLAB_DEBUG_INITIAL) {
2347 		/* Need to call the slab's constructor so the
2348 		 * caller can perform a verify of its state (debugging).
2349 		 * Called without the cache-lock held.
2350 		 */
2351 		cachep->ctor(objp + obj_dbghead(cachep),
2352 			     cachep, SLAB_CTOR_CONSTRUCTOR | SLAB_CTOR_VERIFY);
2353 	}
2354 	if (cachep->flags & SLAB_POISON && cachep->dtor) {
2355 		/* We want to cache-poison the object;
2356 		 * call the destructor callback first.
2357 		 */
2358 		cachep->dtor(objp + obj_dbghead(cachep), cachep, 0);
2359 	}
2360 	if (cachep->flags & SLAB_POISON) {
2361 #ifdef CONFIG_DEBUG_PAGEALLOC
2362 		if ((cachep->objsize % PAGE_SIZE) == 0 && OFF_SLAB(cachep)) {
2363 			store_stackinfo(cachep, objp, (unsigned long)caller);
2364 			kernel_map_pages(virt_to_page(objp),
2365 					 cachep->objsize / PAGE_SIZE, 0);
2366 		} else {
2367 			poison_obj(cachep, objp, POISON_FREE);
2368 		}
2369 #else
2370 		poison_obj(cachep, objp, POISON_FREE);
2371 #endif
2372 	}
2373 	return objp;
2374 }
2375 
2376 static void check_slabp(kmem_cache_t *cachep, struct slab *slabp)
2377 {
2378 	kmem_bufctl_t i;
2379 	int entries = 0;
2380 
2381 	/* Check slab's freelist to see if this obj is there. */
2382 	for (i = slabp->free; i != BUFCTL_END; i = slab_bufctl(slabp)[i]) {
2383 		entries++;
2384 		if (entries > cachep->num || i >= cachep->num)
2385 			goto bad;
2386 	}
2387 	if (entries != cachep->num - slabp->inuse) {
2388 	      bad:
2389 		printk(KERN_ERR
2390 		       "slab: Internal list corruption detected in cache '%s'(%d), slabp %p(%d). Hexdump:\n",
2391 		       cachep->name, cachep->num, slabp, slabp->inuse);
2392 		for (i = 0;
2393 		     i < sizeof(slabp) + cachep->num * sizeof(kmem_bufctl_t);
2394 		     i++) {
2395 			if ((i % 16) == 0)
2396 				printk("\n%03x:", i);
2397 			printk(" %02x", ((unsigned char *)slabp)[i]);
2398 		}
2399 		printk("\n");
2400 		BUG();
2401 	}
2402 }
2403 #else
2404 #define kfree_debugcheck(x) do { } while(0)
2405 #define cache_free_debugcheck(x,objp,z) (objp)
2406 #define check_slabp(x,y) do { } while(0)
2407 #endif
2408 
2409 static void *cache_alloc_refill(kmem_cache_t *cachep, gfp_t flags)
2410 {
2411 	int batchcount;
2412 	struct kmem_list3 *l3;
2413 	struct array_cache *ac;
2414 
2415 	check_irq_off();
2416 	ac = ac_data(cachep);
2417       retry:
2418 	batchcount = ac->batchcount;
2419 	if (!ac->touched && batchcount > BATCHREFILL_LIMIT) {
2420 		/* if there was little recent activity on this
2421 		 * cache, then perform only a partial refill.
2422 		 * Otherwise we could generate refill bouncing.
2423 		 */
2424 		batchcount = BATCHREFILL_LIMIT;
2425 	}
2426 	l3 = cachep->nodelists[numa_node_id()];
2427 
2428 	BUG_ON(ac->avail > 0 || !l3);
2429 	spin_lock(&l3->list_lock);
2430 
2431 	if (l3->shared) {
2432 		struct array_cache *shared_array = l3->shared;
2433 		if (shared_array->avail) {
2434 			if (batchcount > shared_array->avail)
2435 				batchcount = shared_array->avail;
2436 			shared_array->avail -= batchcount;
2437 			ac->avail = batchcount;
2438 			memcpy(ac->entry,
2439 			       &(shared_array->entry[shared_array->avail]),
2440 			       sizeof(void *) * batchcount);
2441 			shared_array->touched = 1;
2442 			goto alloc_done;
2443 		}
2444 	}
2445 	while (batchcount > 0) {
2446 		struct list_head *entry;
2447 		struct slab *slabp;
2448 		/* Get the slab that the allocation is to come from. */
2449 		entry = l3->slabs_partial.next;
2450 		if (entry == &l3->slabs_partial) {
2451 			l3->free_touched = 1;
2452 			entry = l3->slabs_free.next;
2453 			if (entry == &l3->slabs_free)
2454 				goto must_grow;
2455 		}
2456 
2457 		slabp = list_entry(entry, struct slab, list);
2458 		check_slabp(cachep, slabp);
2459 		check_spinlock_acquired(cachep);
2460 		while (slabp->inuse < cachep->num && batchcount--) {
2461 			kmem_bufctl_t next;
2462 			STATS_INC_ALLOCED(cachep);
2463 			STATS_INC_ACTIVE(cachep);
2464 			STATS_SET_HIGH(cachep);
2465 
2466 			/* get obj pointer */
2467 			ac->entry[ac->avail++] = slabp->s_mem +
2468 			    slabp->free * cachep->objsize;
2469 
2470 			slabp->inuse++;
2471 			next = slab_bufctl(slabp)[slabp->free];
2472 #if DEBUG
2473 			slab_bufctl(slabp)[slabp->free] = BUFCTL_FREE;
2474 			WARN_ON(numa_node_id() != slabp->nodeid);
2475 #endif
2476 			slabp->free = next;
2477 		}
2478 		check_slabp(cachep, slabp);
2479 
2480 		/* move slabp to correct slabp list: */
2481 		list_del(&slabp->list);
2482 		if (slabp->free == BUFCTL_END)
2483 			list_add(&slabp->list, &l3->slabs_full);
2484 		else
2485 			list_add(&slabp->list, &l3->slabs_partial);
2486 	}
2487 
2488       must_grow:
2489 	l3->free_objects -= ac->avail;
2490       alloc_done:
2491 	spin_unlock(&l3->list_lock);
2492 
2493 	if (unlikely(!ac->avail)) {
2494 		int x;
2495 		x = cache_grow(cachep, flags, numa_node_id());
2496 
2497 		/* cache_grow can reenable interrupts, then ac could change. */
2498 		ac = ac_data(cachep);
2499 		if (!x && ac->avail == 0)	/* no objects in sight? abort */
2500 			return NULL;
2501 
2502 		if (!ac->avail)	/* objects refilled by interrupt? */
2503 			goto retry;
2504 	}
2505 	ac->touched = 1;
2506 	return ac->entry[--ac->avail];
2507 }
2508 
2509 static inline void
2510 cache_alloc_debugcheck_before(kmem_cache_t *cachep, gfp_t flags)
2511 {
2512 	might_sleep_if(flags & __GFP_WAIT);
2513 #if DEBUG
2514 	kmem_flagcheck(cachep, flags);
2515 #endif
2516 }
2517 
2518 #if DEBUG
2519 static void *cache_alloc_debugcheck_after(kmem_cache_t *cachep, gfp_t flags,
2520 					void *objp, void *caller)
2521 {
2522 	if (!objp)
2523 		return objp;
2524 	if (cachep->flags & SLAB_POISON) {
2525 #ifdef CONFIG_DEBUG_PAGEALLOC
2526 		if ((cachep->objsize % PAGE_SIZE) == 0 && OFF_SLAB(cachep))
2527 			kernel_map_pages(virt_to_page(objp),
2528 					 cachep->objsize / PAGE_SIZE, 1);
2529 		else
2530 			check_poison_obj(cachep, objp);
2531 #else
2532 		check_poison_obj(cachep, objp);
2533 #endif
2534 		poison_obj(cachep, objp, POISON_INUSE);
2535 	}
2536 	if (cachep->flags & SLAB_STORE_USER)
2537 		*dbg_userword(cachep, objp) = caller;
2538 
2539 	if (cachep->flags & SLAB_RED_ZONE) {
2540 		if (*dbg_redzone1(cachep, objp) != RED_INACTIVE
2541 		    || *dbg_redzone2(cachep, objp) != RED_INACTIVE) {
2542 			slab_error(cachep,
2543 				   "double free, or memory outside"
2544 				   " object was overwritten");
2545 			printk(KERN_ERR
2546 			       "%p: redzone 1: 0x%lx, redzone 2: 0x%lx.\n",
2547 			       objp, *dbg_redzone1(cachep, objp),
2548 			       *dbg_redzone2(cachep, objp));
2549 		}
2550 		*dbg_redzone1(cachep, objp) = RED_ACTIVE;
2551 		*dbg_redzone2(cachep, objp) = RED_ACTIVE;
2552 	}
2553 	objp += obj_dbghead(cachep);
2554 	if (cachep->ctor && cachep->flags & SLAB_POISON) {
2555 		unsigned long ctor_flags = SLAB_CTOR_CONSTRUCTOR;
2556 
2557 		if (!(flags & __GFP_WAIT))
2558 			ctor_flags |= SLAB_CTOR_ATOMIC;
2559 
2560 		cachep->ctor(objp, cachep, ctor_flags);
2561 	}
2562 	return objp;
2563 }
2564 #else
2565 #define cache_alloc_debugcheck_after(a,b,objp,d) (objp)
2566 #endif
2567 
2568 static inline void *____cache_alloc(kmem_cache_t *cachep, gfp_t flags)
2569 {
2570 	void *objp;
2571 	struct array_cache *ac;
2572 
2573 	check_irq_off();
2574 	ac = ac_data(cachep);
2575 	if (likely(ac->avail)) {
2576 		STATS_INC_ALLOCHIT(cachep);
2577 		ac->touched = 1;
2578 		objp = ac->entry[--ac->avail];
2579 	} else {
2580 		STATS_INC_ALLOCMISS(cachep);
2581 		objp = cache_alloc_refill(cachep, flags);
2582 	}
2583 	return objp;
2584 }
2585 
2586 static inline void *__cache_alloc(kmem_cache_t *cachep, gfp_t flags)
2587 {
2588 	unsigned long save_flags;
2589 	void *objp;
2590 
2591 	cache_alloc_debugcheck_before(cachep, flags);
2592 
2593 	local_irq_save(save_flags);
2594 	objp = ____cache_alloc(cachep, flags);
2595 	local_irq_restore(save_flags);
2596 	objp = cache_alloc_debugcheck_after(cachep, flags, objp,
2597 					    __builtin_return_address(0));
2598 	prefetchw(objp);
2599 	return objp;
2600 }
2601 
2602 #ifdef CONFIG_NUMA
2603 /*
2604  * An interface to enable slab creation on a specific node (nodeid).
2605  */
2606 static void *__cache_alloc_node(kmem_cache_t *cachep, gfp_t flags, int nodeid)
2607 {
2608 	struct list_head *entry;
2609 	struct slab *slabp;
2610 	struct kmem_list3 *l3;
2611 	void *obj;
2612 	kmem_bufctl_t next;
2613 	int x;
2614 
2615 	l3 = cachep->nodelists[nodeid];
2616 	BUG_ON(!l3);
2617 
2618       retry:
2619 	spin_lock(&l3->list_lock);
2620 	entry = l3->slabs_partial.next;
2621 	if (entry == &l3->slabs_partial) {
2622 		l3->free_touched = 1;
2623 		entry = l3->slabs_free.next;
2624 		if (entry == &l3->slabs_free)
2625 			goto must_grow;
2626 	}
2627 
2628 	slabp = list_entry(entry, struct slab, list);
2629 	check_spinlock_acquired_node(cachep, nodeid);
2630 	check_slabp(cachep, slabp);
2631 
2632 	STATS_INC_NODEALLOCS(cachep);
2633 	STATS_INC_ACTIVE(cachep);
2634 	STATS_SET_HIGH(cachep);
2635 
2636 	BUG_ON(slabp->inuse == cachep->num);
2637 
2638 	/* get obj pointer */
2639 	obj = slabp->s_mem + slabp->free * cachep->objsize;
2640 	slabp->inuse++;
2641 	next = slab_bufctl(slabp)[slabp->free];
2642 #if DEBUG
2643 	slab_bufctl(slabp)[slabp->free] = BUFCTL_FREE;
2644 #endif
2645 	slabp->free = next;
2646 	check_slabp(cachep, slabp);
2647 	l3->free_objects--;
2648 	/* move slabp to correct slabp list: */
2649 	list_del(&slabp->list);
2650 
2651 	if (slabp->free == BUFCTL_END) {
2652 		list_add(&slabp->list, &l3->slabs_full);
2653 	} else {
2654 		list_add(&slabp->list, &l3->slabs_partial);
2655 	}
2656 
2657 	spin_unlock(&l3->list_lock);
2658 	goto done;
2659 
2660       must_grow:
2661 	spin_unlock(&l3->list_lock);
2662 	x = cache_grow(cachep, flags, nodeid);
2663 
2664 	if (!x)
2665 		return NULL;
2666 
2667 	goto retry;
2668       done:
2669 	return obj;
2670 }
2671 #endif
2672 
2673 /*
2674  * The caller must hold the correct kmem_list3 list_lock.
2675  */
2676 static void free_block(kmem_cache_t *cachep, void **objpp, int nr_objects,
2677 		       int node)
2678 {
2679 	int i;
2680 	struct kmem_list3 *l3;
2681 
2682 	for (i = 0; i < nr_objects; i++) {
2683 		void *objp = objpp[i];
2684 		struct slab *slabp;
2685 		unsigned int objnr;
2686 
2687 		slabp = page_get_slab(virt_to_page(objp));
2688 		l3 = cachep->nodelists[node];
2689 		list_del(&slabp->list);
2690 		objnr = (objp - slabp->s_mem) / cachep->objsize;
2691 		check_spinlock_acquired_node(cachep, node);
2692 		check_slabp(cachep, slabp);
2693 
2694 #if DEBUG
2695 		/* Verify that the slab belongs to the intended node */
2696 		WARN_ON(slabp->nodeid != node);
2697 
2698 		if (slab_bufctl(slabp)[objnr] != BUFCTL_FREE) {
2699 			printk(KERN_ERR "slab: double free detected in cache "
2700 			       "'%s', objp %p\n", cachep->name, objp);
2701 			BUG();
2702 		}
2703 #endif
2704 		slab_bufctl(slabp)[objnr] = slabp->free;
2705 		slabp->free = objnr;
2706 		STATS_DEC_ACTIVE(cachep);
2707 		slabp->inuse--;
2708 		l3->free_objects++;
2709 		check_slabp(cachep, slabp);
2710 
2711 		/* fixup slab chains */
2712 		if (slabp->inuse == 0) {
2713 			if (l3->free_objects > l3->free_limit) {
2714 				l3->free_objects -= cachep->num;
2715 				slab_destroy(cachep, slabp);
2716 			} else {
2717 				list_add(&slabp->list, &l3->slabs_free);
2718 			}
2719 		} else {
2720 			/* Unconditionally move a slab to the end of the
2721 			 * partial list on free - maximum time for the
2722 			 * other objects to be freed, too.
2723 			 */
2724 			list_add_tail(&slabp->list, &l3->slabs_partial);
2725 		}
2726 	}
2727 }
2728 
2729 static void cache_flusharray(kmem_cache_t *cachep, struct array_cache *ac)
2730 {
2731 	int batchcount;
2732 	struct kmem_list3 *l3;
2733 	int node = numa_node_id();
2734 
2735 	batchcount = ac->batchcount;
2736 #if DEBUG
2737 	BUG_ON(!batchcount || batchcount > ac->avail);
2738 #endif
2739 	check_irq_off();
2740 	l3 = cachep->nodelists[node];
2741 	spin_lock(&l3->list_lock);
2742 	if (l3->shared) {
2743 		struct array_cache *shared_array = l3->shared;
2744 		int max = shared_array->limit - shared_array->avail;
2745 		if (max) {
2746 			if (batchcount > max)
2747 				batchcount = max;
2748 			memcpy(&(shared_array->entry[shared_array->avail]),
2749 			       ac->entry, sizeof(void *) * batchcount);
2750 			shared_array->avail += batchcount;
2751 			goto free_done;
2752 		}
2753 	}
2754 
2755 	free_block(cachep, ac->entry, batchcount, node);
2756       free_done:
2757 #if STATS
2758 	{
2759 		int i = 0;
2760 		struct list_head *p;
2761 
2762 		p = l3->slabs_free.next;
2763 		while (p != &(l3->slabs_free)) {
2764 			struct slab *slabp;
2765 
2766 			slabp = list_entry(p, struct slab, list);
2767 			BUG_ON(slabp->inuse);
2768 
2769 			i++;
2770 			p = p->next;
2771 		}
2772 		STATS_SET_FREEABLE(cachep, i);
2773 	}
2774 #endif
2775 	spin_unlock(&l3->list_lock);
2776 	ac->avail -= batchcount;
2777 	memmove(ac->entry, &(ac->entry[batchcount]),
2778 		sizeof(void *) * ac->avail);
2779 }
2780 
2781 /*
2782  * __cache_free
2783  * Release an obj back to its cache. If the obj has a constructed
2784  * state, it must be in this state _before_ it is released.
2785  *
2786  * Called with interrupts disabled.
2787  */
2788 static inline void __cache_free(kmem_cache_t *cachep, void *objp)
2789 {
2790 	struct array_cache *ac = ac_data(cachep);
2791 
2792 	check_irq_off();
2793 	objp = cache_free_debugcheck(cachep, objp, __builtin_return_address(0));
2794 
2795 	/* Make sure we are not freeing an object from another
2796 	 * node to the array cache on this cpu.
2797 	 */
2798 #ifdef CONFIG_NUMA
2799 	{
2800 		struct slab *slabp;
2801 		slabp = page_get_slab(virt_to_page(objp));
2802 		if (unlikely(slabp->nodeid != numa_node_id())) {
2803 			struct array_cache *alien = NULL;
2804 			int nodeid = slabp->nodeid;
2805 			struct kmem_list3 *l3 =
2806 			    cachep->nodelists[numa_node_id()];
2807 
2808 			STATS_INC_NODEFREES(cachep);
2809 			if (l3->alien && l3->alien[nodeid]) {
2810 				alien = l3->alien[nodeid];
2811 				spin_lock(&alien->lock);
2812 				if (unlikely(alien->avail == alien->limit))
2813 					__drain_alien_cache(cachep,
2814 							    alien, nodeid);
2815 				alien->entry[alien->avail++] = objp;
2816 				spin_unlock(&alien->lock);
2817 			} else {
2818 				spin_lock(&(cachep->nodelists[nodeid])->
2819 					  list_lock);
2820 				free_block(cachep, &objp, 1, nodeid);
2821 				spin_unlock(&(cachep->nodelists[nodeid])->
2822 					    list_lock);
2823 			}
2824 			return;
2825 		}
2826 	}
2827 #endif
2828 	if (likely(ac->avail < ac->limit)) {
2829 		STATS_INC_FREEHIT(cachep);
2830 		ac->entry[ac->avail++] = objp;
2831 		return;
2832 	} else {
2833 		STATS_INC_FREEMISS(cachep);
2834 		cache_flusharray(cachep, ac);
2835 		ac->entry[ac->avail++] = objp;
2836 	}
2837 }
2838 
2839 /**
2840  * kmem_cache_alloc - Allocate an object
2841  * @cachep: The cache to allocate from.
2842  * @flags: See kmalloc().
2843  *
2844  * Allocate an object from this cache.  The flags are only relevant
2845  * if the cache has no available objects.
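 *
 * For example (illustrative only; 'foo_cache' and 'struct foo' are
 * hypothetical):
 *
 *	struct foo *f = kmem_cache_alloc(foo_cache, GFP_KERNEL);
 *	if (!f)
 *		return -ENOMEM;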
2846  */
2847 void *kmem_cache_alloc(kmem_cache_t *cachep, gfp_t flags)
2848 {
2849 	return __cache_alloc(cachep, flags);
2850 }
2851 EXPORT_SYMBOL(kmem_cache_alloc);
2852 
2853 /**
2854  * kmem_ptr_validate - check if an untrusted pointer might be a slab entry.
2856  * @cachep: the cache we're checking against
2857  * @ptr: pointer to validate
2858  *
2859  * This verifies that the untrusted pointer looks sane:
2860  * it is _not_ a guarantee that the pointer is actually
2861  * part of the slab cache in question, but it at least
2862  * validates that the pointer can be dereferenced and
2863  * looks half-way sane.
2864  *
2865  * Currently only used for dentry validation.
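 *
 * Illustrative use (the 'ptr' pointer and the cache checked against are
 * just placeholders here):
 *
 *	if (!kmem_ptr_validate(dentry_cache, ptr))
 *		return NULL;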
2866  */
2867 int fastcall kmem_ptr_validate(kmem_cache_t *cachep, void *ptr)
2868 {
2869 	unsigned long addr = (unsigned long)ptr;
2870 	unsigned long min_addr = PAGE_OFFSET;
2871 	unsigned long align_mask = BYTES_PER_WORD - 1;
2872 	unsigned long size = cachep->objsize;
2873 	struct page *page;
2874 
2875 	if (unlikely(addr < min_addr))
2876 		goto out;
2877 	if (unlikely(addr > (unsigned long)high_memory - size))
2878 		goto out;
2879 	if (unlikely(addr & align_mask))
2880 		goto out;
2881 	if (unlikely(!kern_addr_valid(addr)))
2882 		goto out;
2883 	if (unlikely(!kern_addr_valid(addr + size - 1)))
2884 		goto out;
2885 	page = virt_to_page(ptr);
2886 	if (unlikely(!PageSlab(page)))
2887 		goto out;
2888 	if (unlikely(page_get_cache(page) != cachep))
2889 		goto out;
2890 	return 1;
2891       out:
2892 	return 0;
2893 }
2894 
2895 #ifdef CONFIG_NUMA
2896 /**
2897  * kmem_cache_alloc_node - Allocate an object on the specified node
2898  * @cachep: The cache to allocate from.
2899  * @flags: See kmalloc().
2900  * @nodeid: node number of the target node.
2901  *
2902  * Identical to kmem_cache_alloc, except that this function is slow
2903  * and can sleep, and that it allocates memory on the given node,
2904  * which can improve performance for cpu-bound structures.
2905  * New and improved: it will now make sure that the object gets
2906  * put on the correct node list so that there is no false sharing.
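 *
 * For example (illustrative only; 'foo_cache', 'struct foo' and 'nid'
 * are hypothetical):
 *
 *	struct foo *f = kmem_cache_alloc_node(foo_cache, GFP_KERNEL, nid);
 *	if (!f)
 *		return -ENOMEM;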
2907  */
2908 void *kmem_cache_alloc_node(kmem_cache_t *cachep, gfp_t flags, int nodeid)
2909 {
2910 	unsigned long save_flags;
2911 	void *ptr;
2912 
2913 	if (nodeid == -1)
2914 		return __cache_alloc(cachep, flags);
2915 
2916 	if (unlikely(!cachep->nodelists[nodeid])) {
2917 		/* Fall back to __cache_alloc if we run into trouble */
2918 		printk(KERN_WARNING
2919 		       "slab: not allocating in inactive node %d for cache %s\n",
2920 		       nodeid, cachep->name);
2921 		return __cache_alloc(cachep, flags);
2922 	}
2923 
2924 	cache_alloc_debugcheck_before(cachep, flags);
2925 	local_irq_save(save_flags);
2926 	if (nodeid == numa_node_id())
2927 		ptr = ____cache_alloc(cachep, flags);
2928 	else
2929 		ptr = __cache_alloc_node(cachep, flags, nodeid);
2930 	local_irq_restore(save_flags);
2931 	ptr = cache_alloc_debugcheck_after(cachep, flags, ptr,
2932 					   __builtin_return_address(0));
2934 
2935 	return ptr;
2936 }
2937 EXPORT_SYMBOL(kmem_cache_alloc_node);
2938 
2939 void *kmalloc_node(size_t size, gfp_t flags, int node)
2940 {
2941 	kmem_cache_t *cachep;
2942 
2943 	cachep = kmem_find_general_cachep(size, flags);
2944 	if (unlikely(cachep == NULL))
2945 		return NULL;
2946 	return kmem_cache_alloc_node(cachep, flags, node);
2947 }
2948 EXPORT_SYMBOL(kmalloc_node);
2949 #endif
2950 
2951 /**
2952  * kmalloc - allocate memory
2953  * @size: how many bytes of memory are required.
2954  * @flags: the type of memory to allocate.
2955  *
2956  * kmalloc is the normal method of allocating memory
2957  * in the kernel.
2958  *
2959  * The @flags argument may be one of:
2960  *
2961  * %GFP_USER - Allocate memory on behalf of user.  May sleep.
2962  *
2963  * %GFP_KERNEL - Allocate normal kernel ram.  May sleep.
2964  *
2965  * %GFP_ATOMIC - Allocation will not sleep.  Use inside interrupt handlers.
2966  *
2967  * Additionally, the %GFP_DMA flag may be set to indicate the memory
2968  * must be suitable for DMA.  This can mean different things on different
2969  * platforms.  For example, on i386, it means that the memory must come
2970  * from the first 16MB.
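 *
 * For example (illustrative only):
 *
 *	char *buf = kmalloc(128, GFP_KERNEL);
 *	if (!buf)
 *		return -ENOMEM;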
2971  */
2972 void *__kmalloc(size_t size, gfp_t flags)
2973 {
2974 	kmem_cache_t *cachep;
2975 
2976 	/* If you want to save a few bytes of .text space: replace
2977 	 * __ with kmem_.
2978 	 * Then kmalloc uses the uninlined functions instead of the inline
2979 	 * functions.
2980 	 */
2981 	cachep = __find_general_cachep(size, flags);
2982 	if (unlikely(cachep == NULL))
2983 		return NULL;
2984 	return __cache_alloc(cachep, flags);
2985 }
2986 EXPORT_SYMBOL(__kmalloc);
2987 
2988 #ifdef CONFIG_SMP
2989 /**
2990  * __alloc_percpu - allocate one copy of the object for every present
2991  * cpu in the system, zeroing them.
2992  * @size: how many bytes of memory are required.
2993  *
2994  * Objects should be dereferenced using the per_cpu_ptr macro only.
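 *
 * A minimal sketch (the 'struct counter' type and its 'hits' field are
 * hypothetical):
 *
 *	struct counter *c = __alloc_percpu(sizeof(struct counter));
 *
 *	if (c) {
 *		per_cpu_ptr(c, get_cpu())->hits++;
 *		put_cpu();
 *	}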
2995  */
2996 void *__alloc_percpu(size_t size)
2997 {
2998 	int i;
2999 	struct percpu_data *pdata = kmalloc(sizeof(*pdata), GFP_KERNEL);
3000 
3001 	if (!pdata)
3002 		return NULL;
3003 
3004 	/*
3005 	 * Cannot use for_each_online_cpu since a cpu may come online
3006 	 * later and we would have no way of fixing up the array we
3007 	 * have already allocated.
3008 	 */
3009 	for_each_cpu(i) {
3010 		int node = cpu_to_node(i);
3011 
3012 		if (node_online(node))
3013 			pdata->ptrs[i] = kmalloc_node(size, GFP_KERNEL, node);
3014 		else
3015 			pdata->ptrs[i] = kmalloc(size, GFP_KERNEL);
3016 
3017 		if (!pdata->ptrs[i])
3018 			goto unwind_oom;
3019 		memset(pdata->ptrs[i], 0, size);
3020 	}
3021 
3022 	/* Catch derefs w/o wrappers */
3023 	return (void *)(~(unsigned long)pdata);
3024 
3025       unwind_oom:
3026 	while (--i >= 0) {
3027 		if (!cpu_possible(i))
3028 			continue;
3029 		kfree(pdata->ptrs[i]);
3030 	}
3031 	kfree(pdata);
3032 	return NULL;
3033 }
3034 EXPORT_SYMBOL(__alloc_percpu);
3035 #endif
3036 
3037 /**
3038  * kmem_cache_free - Deallocate an object
3039  * @cachep: The cache the allocation was from.
3040  * @objp: The previously allocated object.
3041  *
3042  * Free an object which was previously allocated from this
3043  * cache.
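 *
 * For example (illustrative only; 'foo_cache' and 'f' are hypothetical
 * and 'f' was returned by kmem_cache_alloc(foo_cache, ...)):
 *
 *	kmem_cache_free(foo_cache, f);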
3044  */
3045 void kmem_cache_free(kmem_cache_t *cachep, void *objp)
3046 {
3047 	unsigned long flags;
3048 
3049 	local_irq_save(flags);
3050 	__cache_free(cachep, objp);
3051 	local_irq_restore(flags);
3052 }
3053 EXPORT_SYMBOL(kmem_cache_free);
3054 
3055 /**
3056  * kfree - free previously allocated memory
3057  * @objp: pointer returned by kmalloc.
3058  *
3059  * If @objp is NULL, no operation is performed.
3060  *
3061  * Don't free memory not originally allocated by kmalloc()
3062  * or you will run into trouble.
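 *
 * For example (illustrative only):
 *
 *	char *buf = kmalloc(128, GFP_KERNEL);
 *	kfree(buf);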
3063  */
3064 void kfree(const void *objp)
3065 {
3066 	kmem_cache_t *c;
3067 	unsigned long flags;
3068 
3069 	if (unlikely(!objp))
3070 		return;
3071 	local_irq_save(flags);
3072 	kfree_debugcheck(objp);
3073 	c = page_get_cache(virt_to_page(objp));
3074 	mutex_debug_check_no_locks_freed(objp, objp+obj_reallen(c));
3075 	__cache_free(c, (void *)objp);
3076 	local_irq_restore(flags);
3077 }
3078 EXPORT_SYMBOL(kfree);
3079 
3080 #ifdef CONFIG_SMP
3081 /**
3082  * free_percpu - free previously allocated percpu memory
3083  * @objp: pointer returned by alloc_percpu.
3084  *
3085  * Don't free memory not originally allocated by alloc_percpu();
3086  * the complemented objp is used to detect that.
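 *
 * A minimal sketch (assuming 'c' is a pointer previously returned by
 * __alloc_percpu()):
 *
 *	free_percpu(c);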
3087  */
3088 void free_percpu(const void *objp)
3089 {
3090 	int i;
3091 	struct percpu_data *p = (struct percpu_data *)(~(unsigned long)objp);
3092 
3093 	/*
3094 	 * We allocate for all cpus, so we cannot iterate over online cpus only.
3095 	 */
3096 	for_each_cpu(i)
3097 	    kfree(p->ptrs[i]);
3098 	kfree(p);
3099 }
3100 EXPORT_SYMBOL(free_percpu);
3101 #endif
3102 
3103 unsigned int kmem_cache_size(kmem_cache_t *cachep)
3104 {
3105 	return obj_reallen(cachep);
3106 }
3107 EXPORT_SYMBOL(kmem_cache_size);
3108 
3109 const char *kmem_cache_name(kmem_cache_t *cachep)
3110 {
3111 	return cachep->name;
3112 }
3113 EXPORT_SYMBOL_GPL(kmem_cache_name);
3114 
3115 /*
3116  * This initializes kmem_list3 for all nodes.
3117  */
3118 static int alloc_kmemlist(kmem_cache_t *cachep)
3119 {
3120 	int node;
3121 	struct kmem_list3 *l3;
3122 	int err = 0;
3123 
3124 	for_each_online_node(node) {
3125 		struct array_cache *nc = NULL, *new;
3126 		struct array_cache **new_alien = NULL;
3127 #ifdef CONFIG_NUMA
3128 		if (!(new_alien = alloc_alien_cache(node, cachep->limit)))
3129 			goto fail;
3130 #endif
3131 		if (!(new = alloc_arraycache(node, (cachep->shared *
3132 						    cachep->batchcount),
3133 					     0xbaadf00d)))
3134 			goto fail;
3135 		if ((l3 = cachep->nodelists[node])) {
3136 
3137 			spin_lock_irq(&l3->list_lock);
3138 
3139 			if ((nc = cachep->nodelists[node]->shared))
3140 				free_block(cachep, nc->entry, nc->avail, node);
3141 
3142 			l3->shared = new;
3143 			if (!cachep->nodelists[node]->alien) {
3144 				l3->alien = new_alien;
3145 				new_alien = NULL;
3146 			}
3147 			l3->free_limit = (1 + nr_cpus_node(node)) *
3148 			    cachep->batchcount + cachep->num;
3149 			spin_unlock_irq(&l3->list_lock);
3150 			kfree(nc);
3151 			free_alien_cache(new_alien);
3152 			continue;
3153 		}
3154 		if (!(l3 = kmalloc_node(sizeof(struct kmem_list3),
3155 					GFP_KERNEL, node)))
3156 			goto fail;
3157 
3158 		kmem_list3_init(l3);
3159 		l3->next_reap = jiffies + REAPTIMEOUT_LIST3 +
3160 		    ((unsigned long)cachep) % REAPTIMEOUT_LIST3;
3161 		l3->shared = new;
3162 		l3->alien = new_alien;
3163 		l3->free_limit = (1 + nr_cpus_node(node)) *
3164 		    cachep->batchcount + cachep->num;
3165 		cachep->nodelists[node] = l3;
3166 	}
3167 	return err;
3168       fail:
3169 	err = -ENOMEM;
3170 	return err;
3171 }
3172 
3173 struct ccupdate_struct {
3174 	kmem_cache_t *cachep;
3175 	struct array_cache *new[NR_CPUS];
3176 };
3177 
3178 static void do_ccupdate_local(void *info)
3179 {
3180 	struct ccupdate_struct *new = (struct ccupdate_struct *)info;
3181 	struct array_cache *old;
3182 
3183 	check_irq_off();
3184 	old = ac_data(new->cachep);
3185 
3186 	new->cachep->array[smp_processor_id()] = new->new[smp_processor_id()];
3187 	new->new[smp_processor_id()] = old;
3188 }
3189 
3190 static int do_tune_cpucache(kmem_cache_t *cachep, int limit, int batchcount,
3191 			    int shared)
3192 {
3193 	struct ccupdate_struct new;
3194 	int i, err;
3195 
3196 	memset(&new.new, 0, sizeof(new.new));
3197 	for_each_online_cpu(i) {
3198 		new.new[i] =
3199 		    alloc_arraycache(cpu_to_node(i), limit, batchcount);
3200 		if (!new.new[i]) {
3201 			for (i--; i >= 0; i--)
3202 				kfree(new.new[i]);
3203 			return -ENOMEM;
3204 		}
3205 	}
3206 	new.cachep = cachep;
3207 
3208 	smp_call_function_all_cpus(do_ccupdate_local, (void *)&new);
3209 
3210 	check_irq_on();
3211 	spin_lock_irq(&cachep->spinlock);
3212 	cachep->batchcount = batchcount;
3213 	cachep->limit = limit;
3214 	cachep->shared = shared;
3215 	spin_unlock_irq(&cachep->spinlock);
3216 
3217 	for_each_online_cpu(i) {
3218 		struct array_cache *ccold = new.new[i];
3219 		if (!ccold)
3220 			continue;
3221 		spin_lock_irq(&cachep->nodelists[cpu_to_node(i)]->list_lock);
3222 		free_block(cachep, ccold->entry, ccold->avail, cpu_to_node(i));
3223 		spin_unlock_irq(&cachep->nodelists[cpu_to_node(i)]->list_lock);
3224 		kfree(ccold);
3225 	}
3226 
3227 	err = alloc_kmemlist(cachep);
3228 	if (err) {
3229 		printk(KERN_ERR "alloc_kmemlist failed for %s, error %d.\n",
3230 		       cachep->name, -err);
3231 		BUG();
3232 	}
3233 	return 0;
3234 }
3235 
3236 static void enable_cpucache(kmem_cache_t *cachep)
3237 {
3238 	int err;
3239 	int limit, shared;
3240 
3241 	/* The head array serves three purposes:
3242 	 * - create a LIFO ordering, i.e. return objects that are cache-warm
3243 	 * - reduce the number of spinlock operations.
3244 	 * - reduce the number of linked list operations on the slab and
3245 	 *   bufctl chains: array operations are cheaper.
3246 	 * The numbers are guessed, we should auto-tune as described by
3247 	 * Bonwick.
3248 	 */
3249 	if (cachep->objsize > 131072)
3250 		limit = 1;
3251 	else if (cachep->objsize > PAGE_SIZE)
3252 		limit = 8;
3253 	else if (cachep->objsize > 1024)
3254 		limit = 24;
3255 	else if (cachep->objsize > 256)
3256 		limit = 54;
3257 	else
3258 		limit = 120;
3259 
3260 	/* Cpu-bound tasks (e.g. network routing) can exhibit cpu-bound
3261 	 * allocation behaviour: most allocs on one cpu, most free operations
3262 	 * on another cpu. For these cases, an efficient object passing between
3263 	 * cpus is necessary. This is provided by a shared array. The array
3264 	 * replaces Bonwick's magazine layer.
3265 	 * On uniprocessor, it's functionally equivalent (but less efficient)
3266 	 * to a larger limit. Thus disabled by default.
3267 	 */
3268 	shared = 0;
3269 #ifdef CONFIG_SMP
3270 	if (cachep->objsize <= PAGE_SIZE)
3271 		shared = 8;
3272 #endif
3273 
3274 #if DEBUG
3275 	/* With debugging enabled, large batchcounts lead to excessively
3276 	 * long periods with disabled local interrupts. Limit the
3277 	 * batchcount.
3278 	 */
3279 	if (limit > 32)
3280 		limit = 32;
3281 #endif
3282 	err = do_tune_cpucache(cachep, limit, (limit + 1) / 2, shared);
3283 	if (err)
3284 		printk(KERN_ERR "enable_cpucache failed for %s, error %d.\n",
3285 		       cachep->name, -err);
3286 }
3287 
3288 static void drain_array_locked(kmem_cache_t *cachep, struct array_cache *ac,
3289 				int force, int node)
3290 {
3291 	int tofree;
3292 
3293 	check_spinlock_acquired_node(cachep, node);
3294 	if (ac->touched && !force) {
3295 		ac->touched = 0;
3296 	} else if (ac->avail) {
3297 		tofree = force ? ac->avail : (ac->limit + 4) / 5;
3298 		if (tofree > ac->avail) {
3299 			tofree = (ac->avail + 1) / 2;
3300 		}
3301 		free_block(cachep, ac->entry, tofree, node);
3302 		ac->avail -= tofree;
3303 		memmove(ac->entry, &(ac->entry[tofree]),
3304 			sizeof(void *) * ac->avail);
3305 	}
3306 }
3307 
3308 /**
3309  * cache_reap - Reclaim memory from caches.
3310  * @unused: unused parameter
3311  *
3312  * Called from workqueue/eventd every few seconds.
3313  * Purpose:
3314  * - clear the per-cpu caches for this CPU.
3315  * - return freeable pages to the main free memory pool.
3316  *
3317  * If we cannot acquire the cache chain semaphore then just give up - we'll
3318  * try again on the next iteration.
3319  */
3320 static void cache_reap(void *unused)
3321 {
3322 	struct list_head *walk;
3323 	struct kmem_list3 *l3;
3324 
3325 	if (down_trylock(&cache_chain_sem)) {
3326 		/* Give up. Setup the next iteration. */
3327 		schedule_delayed_work(&__get_cpu_var(reap_work),
3328 				      REAPTIMEOUT_CPUC);
3329 		return;
3330 	}
3331 
3332 	list_for_each(walk, &cache_chain) {
3333 		kmem_cache_t *searchp;
3334 		struct list_head *p;
3335 		int tofree;
3336 		struct slab *slabp;
3337 
3338 		searchp = list_entry(walk, kmem_cache_t, next);
3339 
3340 		if (searchp->flags & SLAB_NO_REAP)
3341 			goto next;
3342 
3343 		check_irq_on();
3344 
3345 		l3 = searchp->nodelists[numa_node_id()];
3346 		if (l3->alien)
3347 			drain_alien_cache(searchp, l3);
3348 		spin_lock_irq(&l3->list_lock);
3349 
3350 		drain_array_locked(searchp, ac_data(searchp), 0,
3351 				   numa_node_id());
3352 
3353 		if (time_after(l3->next_reap, jiffies))
3354 			goto next_unlock;
3355 
3356 		l3->next_reap = jiffies + REAPTIMEOUT_LIST3;
3357 
3358 		if (l3->shared)
3359 			drain_array_locked(searchp, l3->shared, 0,
3360 					   numa_node_id());
3361 
3362 		if (l3->free_touched) {
3363 			l3->free_touched = 0;
3364 			goto next_unlock;
3365 		}
3366 
3367 		tofree = (l3->free_limit + 5 * searchp->num - 1) /
3368 			 (5 * searchp->num);
3370 		do {
3371 			p = l3->slabs_free.next;
3372 			if (p == &(l3->slabs_free))
3373 				break;
3374 
3375 			slabp = list_entry(p, struct slab, list);
3376 			BUG_ON(slabp->inuse);
3377 			list_del(&slabp->list);
3378 			STATS_INC_REAPED(searchp);
3379 
3380 			/* Safe to drop the lock. The slab is no longer
3381 			 * linked to the cache. searchp cannot disappear,
3382 			 * we hold cache_chain_sem.
3384 			 */
3385 			l3->free_objects -= searchp->num;
3386 			spin_unlock_irq(&l3->list_lock);
3387 			slab_destroy(searchp, slabp);
3388 			spin_lock_irq(&l3->list_lock);
3389 		} while (--tofree > 0);
3390 	      next_unlock:
3391 		spin_unlock_irq(&l3->list_lock);
3392 	      next:
3393 		cond_resched();
3394 	}
3395 	check_irq_on();
3396 	up(&cache_chain_sem);
3397 	drain_remote_pages();
3398 	/* Setup the next iteration */
3399 	schedule_delayed_work(&__get_cpu_var(reap_work), REAPTIMEOUT_CPUC);
3400 }
3401 
3402 #ifdef CONFIG_PROC_FS
3403 
3404 static void print_slabinfo_header(struct seq_file *m)
3405 {
3406 	/*
3407 	 * Output format version, so at least we can change it
3408 	 * without _too_ many complaints.
3409 	 */
3410 #if STATS
3411 	seq_puts(m, "slabinfo - version: 2.1 (statistics)\n");
3412 #else
3413 	seq_puts(m, "slabinfo - version: 2.1\n");
3414 #endif
3415 	seq_puts(m, "# name            <active_objs> <num_objs> <objsize> "
3416 		 "<objperslab> <pagesperslab>");
3417 	seq_puts(m, " : tunables <limit> <batchcount> <sharedfactor>");
3418 	seq_puts(m, " : slabdata <active_slabs> <num_slabs> <sharedavail>");
3419 #if STATS
3420 	seq_puts(m, " : globalstat <listallocs> <maxobjs> <grown> <reaped> "
3421 		 "<error> <maxfreeable> <nodeallocs> <remotefrees>");
3422 	seq_puts(m, " : cpustat <allochit> <allocmiss> <freehit> <freemiss>");
3423 #endif
3424 	seq_putc(m, '\n');
3425 }
3426 
3427 static void *s_start(struct seq_file *m, loff_t *pos)
3428 {
3429 	loff_t n = *pos;
3430 	struct list_head *p;
3431 
3432 	down(&cache_chain_sem);
3433 	if (!n)
3434 		print_slabinfo_header(m);
3435 	p = cache_chain.next;
3436 	while (n--) {
3437 		p = p->next;
3438 		if (p == &cache_chain)
3439 			return NULL;
3440 	}
3441 	return list_entry(p, kmem_cache_t, next);
3442 }
3443 
3444 static void *s_next(struct seq_file *m, void *p, loff_t *pos)
3445 {
3446 	kmem_cache_t *cachep = p;
3447 	++*pos;
3448 	return cachep->next.next == &cache_chain ? NULL
3449 	    : list_entry(cachep->next.next, kmem_cache_t, next);
3450 }
3451 
3452 static void s_stop(struct seq_file *m, void *p)
3453 {
3454 	up(&cache_chain_sem);
3455 }
3456 
3457 static int s_show(struct seq_file *m, void *p)
3458 {
3459 	kmem_cache_t *cachep = p;
3460 	struct list_head *q;
3461 	struct slab *slabp;
3462 	unsigned long active_objs;
3463 	unsigned long num_objs;
3464 	unsigned long active_slabs = 0;
3465 	unsigned long num_slabs, free_objects = 0, shared_avail = 0;
3466 	const char *name;
3467 	char *error = NULL;
3468 	int node;
3469 	struct kmem_list3 *l3;
3470 
3471 	check_irq_on();
3472 	spin_lock_irq(&cachep->spinlock);
3473 	active_objs = 0;
3474 	num_slabs = 0;
3475 	for_each_online_node(node) {
3476 		l3 = cachep->nodelists[node];
3477 		if (!l3)
3478 			continue;
3479 
3480 		spin_lock(&l3->list_lock);
3481 
3482 		list_for_each(q, &l3->slabs_full) {
3483 			slabp = list_entry(q, struct slab, list);
3484 			if (slabp->inuse != cachep->num && !error)
3485 				error = "slabs_full accounting error";
3486 			active_objs += cachep->num;
3487 			active_slabs++;
3488 		}
3489 		list_for_each(q, &l3->slabs_partial) {
3490 			slabp = list_entry(q, struct slab, list);
3491 			if (slabp->inuse == cachep->num && !error)
3492 				error = "slabs_partial inuse accounting error";
3493 			if (!slabp->inuse && !error)
3494 				error = "slabs_partial/inuse accounting error";
3495 			active_objs += slabp->inuse;
3496 			active_slabs++;
3497 		}
3498 		list_for_each(q, &l3->slabs_free) {
3499 			slabp = list_entry(q, struct slab, list);
3500 			if (slabp->inuse && !error)
3501 				error = "slabs_free/inuse accounting error";
3502 			num_slabs++;
3503 		}
3504 		free_objects += l3->free_objects;
3505 		shared_avail += l3->shared->avail;
3506 
3507 		spin_unlock(&l3->list_lock);
3508 	}
3509 	num_slabs += active_slabs;
3510 	num_objs = num_slabs * cachep->num;
3511 	if (num_objs - active_objs != free_objects && !error)
3512 		error = "free_objects accounting error";
3513 
3514 	name = cachep->name;
3515 	if (error)
3516 		printk(KERN_ERR "slab: cache %s error: %s\n", name, error);
3517 
3518 	seq_printf(m, "%-17s %6lu %6lu %6u %4u %4d",
3519 		   name, active_objs, num_objs, cachep->objsize,
3520 		   cachep->num, (1 << cachep->gfporder));
3521 	seq_printf(m, " : tunables %4u %4u %4u",
3522 		   cachep->limit, cachep->batchcount, cachep->shared);
3523 	seq_printf(m, " : slabdata %6lu %6lu %6lu",
3524 		   active_slabs, num_slabs, shared_avail);
3525 #if STATS
3526 	{			/* list3 stats */
3527 		unsigned long high = cachep->high_mark;
3528 		unsigned long allocs = cachep->num_allocations;
3529 		unsigned long grown = cachep->grown;
3530 		unsigned long reaped = cachep->reaped;
3531 		unsigned long errors = cachep->errors;
3532 		unsigned long max_freeable = cachep->max_freeable;
3533 		unsigned long node_allocs = cachep->node_allocs;
3534 		unsigned long node_frees = cachep->node_frees;
3535 
3536 		seq_printf(m, " : globalstat %7lu %6lu %5lu %4lu "
3537 				"%4lu %4lu %4lu %4lu", allocs, high, grown, reaped, errors, max_freeable, node_allocs, node_frees);
3538 	}
3539 	/* cpu stats */
3540 	{
3541 		unsigned long allochit = atomic_read(&cachep->allochit);
3542 		unsigned long allocmiss = atomic_read(&cachep->allocmiss);
3543 		unsigned long freehit = atomic_read(&cachep->freehit);
3544 		unsigned long freemiss = atomic_read(&cachep->freemiss);
3545 
3546 		seq_printf(m, " : cpustat %6lu %6lu %6lu %6lu",
3547 			   allochit, allocmiss, freehit, freemiss);
3548 	}
3549 #endif
3550 	seq_putc(m, '\n');
3551 	spin_unlock_irq(&cachep->spinlock);
3552 	return 0;
3553 }
3554 
3555 /*
3556  * slabinfo_op - iterator that generates /proc/slabinfo
3557  *
3558  * Output layout:
3559  * cache-name
3560  * num-active-objs
3561  * total-objs
3562  * object size
3563  * num-active-slabs
3564  * total-slabs
3565  * num-pages-per-slab
3566  * + further values on SMP and with statistics enabled
3567  */
3568 
3569 struct seq_operations slabinfo_op = {
3570 	.start = s_start,
3571 	.next = s_next,
3572 	.stop = s_stop,
3573 	.show = s_show,
3574 };
3575 
3576 #define MAX_SLABINFO_WRITE 128
3577 /**
3578  * slabinfo_write - Tuning for the slab allocator
3579  * @file: unused
3580  * @buffer: user buffer
3581  * @count: data length
3582  * @ppos: unused
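 *
 * The expected input is a single line of the form
 * "cache-name limit batchcount shared" written to /proc/slabinfo,
 * e.g. the string "dentry_cache 120 60 8" (the tuning values here are
 * illustrative only).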
3583  */
3584 ssize_t slabinfo_write(struct file *file, const char __user * buffer,
3585 		       size_t count, loff_t *ppos)
3586 {
3587 	char kbuf[MAX_SLABINFO_WRITE + 1], *tmp;
3588 	int limit, batchcount, shared, res;
3589 	struct list_head *p;
3590 
3591 	if (count > MAX_SLABINFO_WRITE)
3592 		return -EINVAL;
3593 	if (copy_from_user(&kbuf, buffer, count))
3594 		return -EFAULT;
3595 	kbuf[MAX_SLABINFO_WRITE] = '\0';
3596 
3597 	tmp = strchr(kbuf, ' ');
3598 	if (!tmp)
3599 		return -EINVAL;
3600 	*tmp = '\0';
3601 	tmp++;
3602 	if (sscanf(tmp, " %d %d %d", &limit, &batchcount, &shared) != 3)
3603 		return -EINVAL;
3604 
3605 	/* Find the cache in the chain of caches. */
3606 	down(&cache_chain_sem);
3607 	res = -EINVAL;
3608 	list_for_each(p, &cache_chain) {
3609 		kmem_cache_t *cachep = list_entry(p, kmem_cache_t, next);
3610 
3611 		if (!strcmp(cachep->name, kbuf)) {
3612 			if (limit < 1 ||
3613 			    batchcount < 1 ||
3614 			    batchcount > limit || shared < 0) {
3615 				res = 0;
3616 			} else {
3617 				res = do_tune_cpucache(cachep, limit,
3618 						       batchcount, shared);
3619 			}
3620 			break;
3621 		}
3622 	}
3623 	up(&cache_chain_sem);
3624 	if (res >= 0)
3625 		res = count;
3626 	return res;
3627 }
3628 #endif
3629 
3630 /**
3631  * ksize - get the actual amount of memory allocated for a given object
3632  * @objp: Pointer to the object
3633  *
3634  * kmalloc may internally round up allocations and return more memory
3635  * than requested. ksize() can be used to determine the actual amount of
3636  * memory allocated. The caller may use this additional memory, even though
3637  * a smaller amount of memory was initially specified with the kmalloc call.
3638  * The caller must guarantee that objp points to a valid object previously
3639  * allocated with either kmalloc() or kmem_cache_alloc(). The object
3640  * must not be freed during the duration of the call.
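 *
 * For example (illustrative only):
 *
 *	char *buf = kmalloc(100, GFP_KERNEL);
 *	unsigned int real = buf ? ksize(buf) : 0;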
3641  */
3642 unsigned int ksize(const void *objp)
3643 {
3644 	if (unlikely(objp == NULL))
3645 		return 0;
3646 
3647 	return obj_reallen(page_get_cache(virt_to_page(objp)));
3648 }
3649