xref: /openbmc/linux/mm/slab.c (revision b28a02de8c70d41d6b6ba8911e83ed3ccf2e13f8)
1 /*
2  * linux/mm/slab.c
3  * Written by Mark Hemment, 1996/97.
4  * (markhe@nextd.demon.co.uk)
5  *
6  * kmem_cache_destroy() + some cleanup - 1999 Andrea Arcangeli
7  *
8  * Major cleanup, different bufctl logic, per-cpu arrays
9  *	(c) 2000 Manfred Spraul
10  *
11  * Cleanup, make the head arrays unconditional, preparation for NUMA
12  * 	(c) 2002 Manfred Spraul
13  *
14  * An implementation of the Slab Allocator as described in outline in:
15  *	UNIX Internals: The New Frontiers by Uresh Vahalia
16  *	Pub: Prentice Hall	ISBN 0-13-101908-2
17  * or with a little more detail in:
18  *	The Slab Allocator: An Object-Caching Kernel Memory Allocator
19  *	Jeff Bonwick (Sun Microsystems).
20  *	Presented at: USENIX Summer 1994 Technical Conference
21  *
22  * The memory is organized in caches, one cache for each object type.
23  * (e.g. inode_cache, dentry_cache, buffer_head, vm_area_struct)
24  * Each cache consists of many slabs (they are small (usually one
25  * page long) and always contiguous), and each slab contains multiple
26  * initialized objects.
27  *
28  * This means that your constructor is used only for newly allocated
29  * slabs and you must pass objects with the same initializations to
30  * kmem_cache_free.
31  *
32  * Each cache can only support one memory type (GFP_DMA, GFP_HIGHMEM,
33  * normal). If you need a special memory type, then you must create a new
34  * cache for that memory type.
35  *
36  * In order to reduce fragmentation, the slabs are sorted in 3 groups:
37  *   full slabs with 0 free objects
38  *   partial slabs
39  *   empty slabs with no allocated objects
40  *
41  * If partial slabs exist, then new allocations come from these slabs,
42  * otherwise they come from empty slabs, or new slabs are allocated.
43  *
44  * kmem_cache_destroy() CAN CRASH if you try to allocate from the cache
45  * during kmem_cache_destroy(). The caller must prevent concurrent allocs.
46  *
47  * Each cache has a short per-cpu head array, most allocs
48  * and frees go into that array, and if that array overflows, then 1/2
49  * of the entries in the array are given back into the global cache.
50  * The head array is strictly LIFO and should improve the cache hit rates.
51  * On SMP, it additionally reduces the spinlock operations.
52  *
53  * The c_cpuarray may not be read with enabled local interrupts -
54  * it's changed with a smp_call_function().
55  *
56  * SMP synchronization:
57  *  constructors and destructors are called without any locking.
58  *  Several members in kmem_cache_t and struct slab never change, they
59  *	are accessed without any locking.
60  *  The per-cpu arrays are never accessed from the wrong cpu, no locking,
61  *  	and local interrupts are disabled so slab code is preempt-safe.
62  *  The non-constant members are protected with a per-cache irq spinlock.
63  *
64  * Many thanks to Mark Hemment, who wrote another per-cpu slab patch
65  * in 2000 - many ideas in the current implementation are derived from
66  * his patch.
67  *
68  * Further notes from the original documentation:
69  *
70  * 11 April '97.  Started multi-threading - markhe
71  *	The global cache-chain is protected by the semaphore 'cache_chain_sem'.
72  *	The sem is only needed when accessing/extending the cache-chain, which
73  *	can never happen inside an interrupt (kmem_cache_create(),
74  *	kmem_cache_shrink() and kmem_cache_reap()).
75  *
76  *	At present, each engine can be growing a cache.  This should be blocked.
77  *
78  * 15 March 2005. NUMA slab allocator.
79  *	Shai Fultheim <shai@scalex86.org>.
80  *	Shobhit Dayal <shobhit@calsoftinc.com>
81  *	Alok N Kataria <alokk@calsoftinc.com>
82  *	Christoph Lameter <christoph@lameter.com>
83  *
84  *	Modified the slab allocator to be node aware on NUMA systems.
85  *	Each node has its own list of partial, free and full slabs.
86  *	All object allocations for a node occur from node specific slab lists.
87  */
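
/*
 * Illustrative use of the interface described above -- a minimal sketch,
 * not code from this file.  "struct foo" and foo_cachep are hypothetical
 * names; the calls shown (kmem_cache_create/alloc/free/destroy) are the
 * ones documented in the header comment.
 *
 *	static kmem_cache_t *foo_cachep;
 *
 *	foo_cachep = kmem_cache_create("foo_cache", sizeof(struct foo), 0,
 *				       SLAB_HWCACHE_ALIGN, NULL, NULL);
 *
 *	struct foo *f = kmem_cache_alloc(foo_cachep, GFP_KERNEL);
 *	...
 *	kmem_cache_free(foo_cachep, f);
 *
 *	kmem_cache_destroy(foo_cachep);	(caller prevents concurrent allocs)
 */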
88 
89 #include	<linux/config.h>
90 #include	<linux/slab.h>
91 #include	<linux/mm.h>
92 #include	<linux/swap.h>
93 #include	<linux/cache.h>
94 #include	<linux/interrupt.h>
95 #include	<linux/init.h>
96 #include	<linux/compiler.h>
97 #include	<linux/seq_file.h>
98 #include	<linux/notifier.h>
99 #include	<linux/kallsyms.h>
100 #include	<linux/cpu.h>
101 #include	<linux/sysctl.h>
102 #include	<linux/module.h>
103 #include	<linux/rcupdate.h>
104 #include	<linux/string.h>
105 #include	<linux/nodemask.h>
106 
107 #include	<asm/uaccess.h>
108 #include	<asm/cacheflush.h>
109 #include	<asm/tlbflush.h>
110 #include	<asm/page.h>
111 
112 /*
113  * DEBUG	- 1 for kmem_cache_create() to honour; SLAB_DEBUG_INITIAL,
114  *		  SLAB_RED_ZONE & SLAB_POISON.
115  *		  0 for faster, smaller code (especially in the critical paths).
116  *
117  * STATS	- 1 to collect stats for /proc/slabinfo.
118  *		  0 for faster, smaller code (especially in the critical paths).
119  *
120  * FORCED_DEBUG	- 1 enables SLAB_RED_ZONE and SLAB_POISON (if possible)
121  */
122 
123 #ifdef CONFIG_DEBUG_SLAB
124 #define	DEBUG		1
125 #define	STATS		1
126 #define	FORCED_DEBUG	1
127 #else
128 #define	DEBUG		0
129 #define	STATS		0
130 #define	FORCED_DEBUG	0
131 #endif
132 
133 /* Shouldn't this be in a header file somewhere? */
134 #define	BYTES_PER_WORD		sizeof(void *)
135 
136 #ifndef cache_line_size
137 #define cache_line_size()	L1_CACHE_BYTES
138 #endif
139 
140 #ifndef ARCH_KMALLOC_MINALIGN
141 /*
142  * Enforce a minimum alignment for the kmalloc caches.
143  * Usually, the kmalloc caches are cache_line_size() aligned, except when
144  * DEBUG and FORCED_DEBUG are enabled, then they are BYTES_PER_WORD aligned.
145  * Some archs want to perform DMA into kmalloc caches and need a guaranteed
146  * alignment larger than BYTES_PER_WORD. ARCH_KMALLOC_MINALIGN allows that.
147  * Note that this flag disables some debug features.
148  */
149 #define ARCH_KMALLOC_MINALIGN 0
150 #endif
151 
152 #ifndef ARCH_SLAB_MINALIGN
153 /*
154  * Enforce a minimum alignment for all caches.
155  * Intended for archs that get misalignment faults even for BYTES_PER_WORD
156  * aligned buffers. Includes ARCH_KMALLOC_MINALIGN.
157  * If possible: Do not enable this flag for CONFIG_DEBUG_SLAB, it disables
158  * some debug features.
159  */
160 #define ARCH_SLAB_MINALIGN 0
161 #endif
162 
163 #ifndef ARCH_KMALLOC_FLAGS
164 #define ARCH_KMALLOC_FLAGS SLAB_HWCACHE_ALIGN
165 #endif
166 
167 /* Legal flag mask for kmem_cache_create(). */
168 #if DEBUG
169 # define CREATE_MASK	(SLAB_DEBUG_INITIAL | SLAB_RED_ZONE | \
170 			 SLAB_POISON | SLAB_HWCACHE_ALIGN | \
171 			 SLAB_NO_REAP | SLAB_CACHE_DMA | \
172 			 SLAB_MUST_HWCACHE_ALIGN | SLAB_STORE_USER | \
173 			 SLAB_RECLAIM_ACCOUNT | SLAB_PANIC | \
174 			 SLAB_DESTROY_BY_RCU)
175 #else
176 # define CREATE_MASK	(SLAB_HWCACHE_ALIGN | SLAB_NO_REAP | \
177 			 SLAB_CACHE_DMA | SLAB_MUST_HWCACHE_ALIGN | \
178 			 SLAB_RECLAIM_ACCOUNT | SLAB_PANIC | \
179 			 SLAB_DESTROY_BY_RCU)
180 #endif
181 
182 /*
183  * kmem_bufctl_t:
184  *
185  * Bufctls are used for linking objs within a slab, using
186  * linked offsets.
187  *
188  * This implementation relies on "struct page" for locating the cache &
189  * slab an object belongs to.
190  * This allows the bufctl structure to be small (one int), but limits
191  * the number of objects a slab (not a cache) can contain when off-slab
192  * bufctls are used. The limit is the size of the largest general cache
193  * that does not use off-slab slabs.
194  * For 32-bit archs with 4 kB pages, this is 56.
195  * This is not serious, as it is only for large objects, when it is unwise
196  * to have too many per slab.
197  * Note: This limit can be raised by introducing a general cache whose size
198  * is less than 512 (PAGE_SIZE<<3), but greater than 256.
199  */
200 
201 typedef unsigned int kmem_bufctl_t;
202 #define BUFCTL_END	(((kmem_bufctl_t)(~0U))-0)
203 #define BUFCTL_FREE	(((kmem_bufctl_t)(~0U))-1)
204 #define	SLAB_LIMIT	(((kmem_bufctl_t)(~0U))-2)
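
/*
 * Worked example of the encoding (illustrative, made-up object count): for a
 * slab holding four objects where objects 1 and 3 are currently allocated,
 * the free list could look like
 *
 *	slabp->free == 0
 *	bufctl[] == { 2, BUFCTL_FREE, BUFCTL_END, BUFCTL_FREE }
 *
 * i.e. object 0 is handed out first, then object 2, and BUFCTL_END marks the
 * end of the chain.  (BUFCTL_FREE in the allocated slots is only what the
 * DEBUG checks expect; in non-debug builds those entries are simply unused.)
 */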
205 
206 /* Max number of objs-per-slab for caches which use off-slab slabs.
207  * Needed to avoid a possible looping condition in cache_grow().
208  */
209 static unsigned long offslab_limit;
210 
211 /*
212  * struct slab
213  *
214  * Manages the objs in a slab. Placed either at the beginning of mem allocated
215  * for a slab, or allocated from a general cache.
216  * Slabs are chained into three lists: fully used, partial, and fully free slabs.
217  */
218 struct slab {
219 	struct list_head list;
220 	unsigned long colouroff;
221 	void *s_mem;		/* including colour offset */
222 	unsigned int inuse;	/* num of objs active in slab */
223 	kmem_bufctl_t free;
224 	unsigned short nodeid;
225 };
226 
227 /*
228  * struct slab_rcu
229  *
230  * slab_destroy on a SLAB_DESTROY_BY_RCU cache uses this structure to
231  * arrange for kmem_freepages to be called via RCU.  This is useful if
232  * we need to approach a kernel structure obliquely, from its address
233  * obtained without the usual locking.  We can lock the structure to
234  * stabilize it and check it's still at the given address, only if we
235  * can be sure that the memory has not been meanwhile reused for some
236  * other kind of object (which our subsystem's lock might corrupt).
237  *
238  * rcu_read_lock before reading the address, then rcu_read_unlock after
239  * taking the spinlock within the structure expected at that address.
240  *
241  * We assume struct slab_rcu can overlay struct slab when destroying.
242  */
243 struct slab_rcu {
244 	struct rcu_head head;
245 	kmem_cache_t *cachep;
246 	void *addr;
247 };
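
/*
 * Sketch of the lookup pattern described above, from the point of view of a
 * hypothetical subsystem (none of the names below exist in this file):
 *
 *	rcu_read_lock();
 *	obj = table[i];			(address obtained without locking)
 *	if (obj) {
 *		spin_lock(&obj->lock);
 *		if (table[i] == obj)	(still the object we expected; its
 *			use(obj);	 memory was not reused meanwhile)
 *		spin_unlock(&obj->lock);
 *	}
 *	rcu_read_unlock();
 */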
248 
249 /*
250  * struct array_cache
251  *
252  * Purpose:
253  * - LIFO ordering, to hand out cache-warm objects from _alloc
254  * - reduce the number of linked list operations
255  * - reduce spinlock operations
256  *
257  * The limit is stored in the per-cpu structure to reduce the data cache
258  * footprint.
259  *
260  */
261 struct array_cache {
262 	unsigned int avail;
263 	unsigned int limit;
264 	unsigned int batchcount;
265 	unsigned int touched;
266 	spinlock_t lock;
267 	void *entry[0];		/*
268 				 * Must have this definition in here for the proper
269 				 * alignment of array_cache. Also simplifies accessing
270 				 * the entries.
271 				 * [0] is for gcc 2.95. It should really be [].
272 				 */
273 };
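
/*
 * The hot paths treat entry[] as a per-cpu LIFO stack.  A rough sketch of
 * what the alloc/free fast paths further down do with it (stats and refill
 * logic omitted):
 *
 *	alloc hit:	if (likely(ac->avail)) {
 *				ac->touched = 1;
 *				objp = ac->entry[--ac->avail];
 *			}
 *	free hit:	if (likely(ac->avail < ac->limit))
 *				ac->entry[ac->avail++] = objp;
 *
 * Popping the most recently freed object is what makes the handed-out
 * objects cache-warm.
 */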
274 
275 /* bootstrap: The caches do not work without cpuarrays anymore,
276  * but the cpuarrays are allocated from the generic caches...
277  */
278 #define BOOT_CPUCACHE_ENTRIES	1
279 struct arraycache_init {
280 	struct array_cache cache;
281 	void *entries[BOOT_CPUCACHE_ENTRIES];
282 };
283 
284 /*
285  * The slab lists for all objects.
286  */
287 struct kmem_list3 {
288 	struct list_head slabs_partial;	/* partial list first, better asm code */
289 	struct list_head slabs_full;
290 	struct list_head slabs_free;
291 	unsigned long free_objects;
292 	unsigned long next_reap;
293 	int free_touched;
294 	unsigned int free_limit;
295 	spinlock_t list_lock;
296 	struct array_cache *shared;	/* shared per node */
297 	struct array_cache **alien;	/* on other nodes */
298 };
299 
300 /*
301  * Need this for bootstrapping a per node allocator.
302  */
303 #define NUM_INIT_LISTS (2 * MAX_NUMNODES + 1)
304 struct kmem_list3 __initdata initkmem_list3[NUM_INIT_LISTS];
305 #define	CACHE_CACHE 0
306 #define	SIZE_AC 1
307 #define	SIZE_L3 (1 + MAX_NUMNODES)
308 
309 /*
310  * This function must be completely optimized away if
311  * a constant is passed to it. Mostly the same as
312  * what is in linux/slab.h except it returns an
313  * index.
314  */
315 static __always_inline int index_of(const size_t size)
316 {
317 	if (__builtin_constant_p(size)) {
318 		int i = 0;
319 
320 #define CACHE(x) \
321 	if (size <= x) \
322 		return i; \
323 	else \
324 		i++;
325 #include "linux/kmalloc_sizes.h"
326 #undef CACHE
327 		{
328 			extern void __bad_size(void);
329 			__bad_size();
330 		}
331 	} else
332 		BUG();
333 	return 0;
334 }
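
/*
 * For a constant size the CACHE() expansion above unrolls into a chain of
 * compile-time comparisons, roughly (the actual sizes come from
 * linux/kmalloc_sizes.h; the ones shown are only illustrative):
 *
 *	if (size <= 32) return i; else i++;	(i == 0 here)
 *	if (size <= 64) return i; else i++;	(i == 1 here)
 *	if (size <= 96) return i; else i++;	(i == 2 here)
 *	...
 *
 * so INDEX_AC and INDEX_L3 below collapse to compile-time constants.
 */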
335 
336 #define INDEX_AC index_of(sizeof(struct arraycache_init))
337 #define INDEX_L3 index_of(sizeof(struct kmem_list3))
338 
339 static inline void kmem_list3_init(struct kmem_list3 *parent)
340 {
341 	INIT_LIST_HEAD(&parent->slabs_full);
342 	INIT_LIST_HEAD(&parent->slabs_partial);
343 	INIT_LIST_HEAD(&parent->slabs_free);
344 	parent->shared = NULL;
345 	parent->alien = NULL;
346 	spin_lock_init(&parent->list_lock);
347 	parent->free_objects = 0;
348 	parent->free_touched = 0;
349 }
350 
351 #define MAKE_LIST(cachep, listp, slab, nodeid)	\
352 	do {	\
353 		INIT_LIST_HEAD(listp);		\
354 		list_splice(&(cachep->nodelists[nodeid]->slab), listp); \
355 	} while (0)
356 
357 #define	MAKE_ALL_LISTS(cachep, ptr, nodeid)			\
358 	do {					\
359 	MAKE_LIST((cachep), (&(ptr)->slabs_full), slabs_full, nodeid);	\
360 	MAKE_LIST((cachep), (&(ptr)->slabs_partial), slabs_partial, nodeid); \
361 	MAKE_LIST((cachep), (&(ptr)->slabs_free), slabs_free, nodeid);	\
362 	} while (0)
363 
364 /*
365  * kmem_cache_t
366  *
367  * manages a cache.
368  */
369 
370 struct kmem_cache {
371 /* 1) per-cpu data, touched during every alloc/free */
372 	struct array_cache *array[NR_CPUS];
373 	unsigned int batchcount;
374 	unsigned int limit;
375 	unsigned int shared;
376 	unsigned int objsize;
377 /* 2) touched by every alloc & free from the backend */
378 	struct kmem_list3 *nodelists[MAX_NUMNODES];
379 	unsigned int flags;	/* constant flags */
380 	unsigned int num;	/* # of objs per slab */
381 	spinlock_t spinlock;
382 
383 /* 3) cache_grow/shrink */
384 	/* order of pgs per slab (2^n) */
385 	unsigned int gfporder;
386 
387 	/* force GFP flags, e.g. GFP_DMA */
388 	gfp_t gfpflags;
389 
390 	size_t colour;		/* cache colouring range */
391 	unsigned int colour_off;	/* colour offset */
392 	unsigned int colour_next;	/* cache colouring */
393 	kmem_cache_t *slabp_cache;
394 	unsigned int slab_size;
395 	unsigned int dflags;	/* dynamic flags */
396 
397 	/* constructor func */
398 	void (*ctor) (void *, kmem_cache_t *, unsigned long);
399 
400 	/* de-constructor func */
401 	void (*dtor) (void *, kmem_cache_t *, unsigned long);
402 
403 /* 4) cache creation/removal */
404 	const char *name;
405 	struct list_head next;
406 
407 /* 5) statistics */
408 #if STATS
409 	unsigned long num_active;
410 	unsigned long num_allocations;
411 	unsigned long high_mark;
412 	unsigned long grown;
413 	unsigned long reaped;
414 	unsigned long errors;
415 	unsigned long max_freeable;
416 	unsigned long node_allocs;
417 	unsigned long node_frees;
418 	atomic_t allochit;
419 	atomic_t allocmiss;
420 	atomic_t freehit;
421 	atomic_t freemiss;
422 #endif
423 #if DEBUG
424 	int dbghead;
425 	int reallen;
426 #endif
427 };
428 
429 #define CFLGS_OFF_SLAB		(0x80000000UL)
430 #define	OFF_SLAB(x)	((x)->flags & CFLGS_OFF_SLAB)
431 
432 #define BATCHREFILL_LIMIT	16
433 /* Optimization question: fewer reaps means less
434  * probability of unnecessary cpucache drain/refill cycles.
435  *
436  * OTOH the cpuarrays can contain lots of objects,
437  * which could lock up otherwise freeable slabs.
438  */
439 #define REAPTIMEOUT_CPUC	(2*HZ)
440 #define REAPTIMEOUT_LIST3	(4*HZ)
441 
442 #if STATS
443 #define	STATS_INC_ACTIVE(x)	((x)->num_active++)
444 #define	STATS_DEC_ACTIVE(x)	((x)->num_active--)
445 #define	STATS_INC_ALLOCED(x)	((x)->num_allocations++)
446 #define	STATS_INC_GROWN(x)	((x)->grown++)
447 #define	STATS_INC_REAPED(x)	((x)->reaped++)
448 #define	STATS_SET_HIGH(x)	do { if ((x)->num_active > (x)->high_mark) \
449 					(x)->high_mark = (x)->num_active; \
450 				} while (0)
451 #define	STATS_INC_ERR(x)	((x)->errors++)
452 #define	STATS_INC_NODEALLOCS(x)	((x)->node_allocs++)
453 #define	STATS_INC_NODEFREES(x)	((x)->node_frees++)
454 #define	STATS_SET_FREEABLE(x, i) \
455 				do { if ((x)->max_freeable < i) \
456 					(x)->max_freeable = i; \
457 				} while (0)
458 
459 #define STATS_INC_ALLOCHIT(x)	atomic_inc(&(x)->allochit)
460 #define STATS_INC_ALLOCMISS(x)	atomic_inc(&(x)->allocmiss)
461 #define STATS_INC_FREEHIT(x)	atomic_inc(&(x)->freehit)
462 #define STATS_INC_FREEMISS(x)	atomic_inc(&(x)->freemiss)
463 #else
464 #define	STATS_INC_ACTIVE(x)	do { } while (0)
465 #define	STATS_DEC_ACTIVE(x)	do { } while (0)
466 #define	STATS_INC_ALLOCED(x)	do { } while (0)
467 #define	STATS_INC_GROWN(x)	do { } while (0)
468 #define	STATS_INC_REAPED(x)	do { } while (0)
469 #define	STATS_SET_HIGH(x)	do { } while (0)
470 #define	STATS_INC_ERR(x)	do { } while (0)
471 #define	STATS_INC_NODEALLOCS(x)	do { } while (0)
472 #define	STATS_INC_NODEFREES(x)	do { } while (0)
473 #define	STATS_SET_FREEABLE(x, i) \
474 				do { } while (0)
475 
476 #define STATS_INC_ALLOCHIT(x)	do { } while (0)
477 #define STATS_INC_ALLOCMISS(x)	do { } while (0)
478 #define STATS_INC_FREEHIT(x)	do { } while (0)
479 #define STATS_INC_FREEMISS(x)	do { } while (0)
480 #endif
481 
482 #if DEBUG
483 /* Magic nums for obj red zoning.
484  * Placed in the first word before and the first word after an obj.
485  */
486 #define	RED_INACTIVE	0x5A2CF071UL	/* when obj is inactive */
487 #define	RED_ACTIVE	0x170FC2A5UL	/* when obj is active */
488 
489 /* ...and for poisoning */
490 #define	POISON_INUSE	0x5a	/* for use-uninitialised poisoning */
491 #define POISON_FREE	0x6b	/* for use-after-free poisoning */
492 #define	POISON_END	0xa5	/* end-byte of poisoning */
493 
494 /* memory layout of objects:
495  * 0		: objp
496  * 0 .. cachep->dbghead - BYTES_PER_WORD - 1: padding. This ensures that
497  * 		the end of an object is aligned with the end of the real
498  * 		allocation. Catches writes behind the end of the allocation.
499  * cachep->dbghead - BYTES_PER_WORD .. cachep->dbghead - 1:
500  * 		redzone word.
501  * cachep->dbghead: The real object.
502  * cachep->objsize - 2* BYTES_PER_WORD: redzone word [BYTES_PER_WORD long]
503  * cachep->objsize - 1* BYTES_PER_WORD: last caller address [BYTES_PER_WORD long]
504  */
505 static int obj_dbghead(kmem_cache_t *cachep)
506 {
507 	return cachep->dbghead;
508 }
509 
510 static int obj_reallen(kmem_cache_t *cachep)
511 {
512 	return cachep->reallen;
513 }
514 
515 static unsigned long *dbg_redzone1(kmem_cache_t *cachep, void *objp)
516 {
517 	BUG_ON(!(cachep->flags & SLAB_RED_ZONE));
518 	return (unsigned long*) (objp+obj_dbghead(cachep)-BYTES_PER_WORD);
519 }
520 
521 static unsigned long *dbg_redzone2(kmem_cache_t *cachep, void *objp)
522 {
523 	BUG_ON(!(cachep->flags & SLAB_RED_ZONE));
524 	if (cachep->flags & SLAB_STORE_USER)
525 		return (unsigned long *)(objp + cachep->objsize -
526 					 2 * BYTES_PER_WORD);
527 	return (unsigned long *)(objp + cachep->objsize - BYTES_PER_WORD);
528 }
529 
530 static void **dbg_userword(kmem_cache_t *cachep, void *objp)
531 {
532 	BUG_ON(!(cachep->flags & SLAB_STORE_USER));
533 	return (void **)(objp + cachep->objsize - BYTES_PER_WORD);
534 }
535 
536 #else
537 
538 #define obj_dbghead(x)			0
539 #define obj_reallen(cachep)		(cachep->objsize)
540 #define dbg_redzone1(cachep, objp)	({BUG(); (unsigned long *)NULL;})
541 #define dbg_redzone2(cachep, objp)	({BUG(); (unsigned long *)NULL;})
542 #define dbg_userword(cachep, objp)	({BUG(); (void **)NULL;})
543 
544 #endif
545 
546 /*
547  * Maximum size of an obj (in 2^order pages)
548  * and absolute limit for the gfp order.
549  */
550 #if defined(CONFIG_LARGE_ALLOCS)
551 #define	MAX_OBJ_ORDER	13	/* up to 32Mb */
552 #define	MAX_GFP_ORDER	13	/* up to 32Mb */
553 #elif defined(CONFIG_MMU)
554 #define	MAX_OBJ_ORDER	5	/* 32 pages */
555 #define	MAX_GFP_ORDER	5	/* 32 pages */
556 #else
557 #define	MAX_OBJ_ORDER	8	/* up to 1Mb */
558 #define	MAX_GFP_ORDER	8	/* up to 1Mb */
559 #endif
560 
561 /*
562  * Do not go above this order unless 0 objects fit into the slab.
563  */
564 #define	BREAK_GFP_ORDER_HI	1
565 #define	BREAK_GFP_ORDER_LO	0
566 static int slab_break_gfp_order = BREAK_GFP_ORDER_LO;
567 
568 /* Functions for storing/retrieving the cachep and/or slab from the
569  * global 'mem_map'. These are used to find the slab an obj belongs to.
570  * With kfree(), these are used to find the cache which an obj belongs to.
571  */
572 static inline void page_set_cache(struct page *page, struct kmem_cache *cache)
573 {
574 	page->lru.next = (struct list_head *)cache;
575 }
576 
577 static inline struct kmem_cache *page_get_cache(struct page *page)
578 {
579 	return (struct kmem_cache *)page->lru.next;
580 }
581 
582 static inline void page_set_slab(struct page *page, struct slab *slab)
583 {
584 	page->lru.prev = (struct list_head *)slab;
585 }
586 
587 static inline struct slab *page_get_slab(struct page *page)
588 {
589 	return (struct slab *)page->lru.prev;
590 }
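
/*
 * Sketch of how the helpers above are used on the free path (simplified
 * from what kfree()/free_block() further down actually do):
 *
 *	struct page *page = virt_to_page(objp);
 *	kmem_cache_t *cachep = page_get_cache(page);
 *	struct slab *slabp = page_get_slab(page);
 *
 * i.e. the object's cache and slab are recovered from struct page alone,
 * with no lookup tables or locks.
 */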
591 
592 /* These are the default caches for kmalloc. Custom caches can have other sizes. */
593 struct cache_sizes malloc_sizes[] = {
594 #define CACHE(x) { .cs_size = (x) },
595 #include <linux/kmalloc_sizes.h>
596 	CACHE(ULONG_MAX)
597 #undef CACHE
598 };
599 EXPORT_SYMBOL(malloc_sizes);
600 
601 /* Must match cache_sizes above. Out of line to keep cache footprint low. */
602 struct cache_names {
603 	char *name;
604 	char *name_dma;
605 };
606 
607 static struct cache_names __initdata cache_names[] = {
608 #define CACHE(x) { .name = "size-" #x, .name_dma = "size-" #x "(DMA)" },
609 #include <linux/kmalloc_sizes.h>
610 	{NULL,}
611 #undef CACHE
612 };
613 
614 static struct arraycache_init initarray_cache __initdata =
615     { {0, BOOT_CPUCACHE_ENTRIES, 1, 0} };
616 static struct arraycache_init initarray_generic =
617     { {0, BOOT_CPUCACHE_ENTRIES, 1, 0} };
618 
619 /* internal cache of cache description objs */
620 static kmem_cache_t cache_cache = {
621 	.batchcount = 1,
622 	.limit = BOOT_CPUCACHE_ENTRIES,
623 	.shared = 1,
624 	.objsize = sizeof(kmem_cache_t),
625 	.flags = SLAB_NO_REAP,
626 	.spinlock = SPIN_LOCK_UNLOCKED,
627 	.name = "kmem_cache",
628 #if DEBUG
629 	.reallen = sizeof(kmem_cache_t),
630 #endif
631 };
632 
633 /* Guard access to the cache-chain. */
634 static struct semaphore cache_chain_sem;
635 static struct list_head cache_chain;
636 
637 /*
638  * vm_enough_memory() looks at this to determine how many
639  * slab-allocated pages are possibly freeable under pressure
640  *
641  * SLAB_RECLAIM_ACCOUNT turns this on per-slab
642  */
643 atomic_t slab_reclaim_pages;
644 
645 /*
646  * chicken and egg problem: delay the per-cpu array allocation
647  * until the general caches are up.
648  */
649 static enum {
650 	NONE,
651 	PARTIAL_AC,
652 	PARTIAL_L3,
653 	FULL
654 } g_cpucache_up;
655 
656 static DEFINE_PER_CPU(struct work_struct, reap_work);
657 
658 static void free_block(kmem_cache_t *cachep, void **objpp, int len, int node);
659 static void enable_cpucache(kmem_cache_t *cachep);
660 static void cache_reap(void *unused);
661 static int __node_shrink(kmem_cache_t *cachep, int node);
662 
663 static inline struct array_cache *ac_data(kmem_cache_t *cachep)
664 {
665 	return cachep->array[smp_processor_id()];
666 }
667 
668 static inline kmem_cache_t *__find_general_cachep(size_t size, gfp_t gfpflags)
669 {
670 	struct cache_sizes *csizep = malloc_sizes;
671 
672 #if DEBUG
673 	/* This happens if someone tries to call
674 	 * kmem_cache_create(), or __kmalloc(), before
675 	 * the generic caches are initialized.
676 	 */
677 	BUG_ON(malloc_sizes[INDEX_AC].cs_cachep == NULL);
678 #endif
679 	while (size > csizep->cs_size)
680 		csizep++;
681 
682 	/*
683 	 * Really subtle: The last entry with cs->cs_size==ULONG_MAX
684 	 * has cs_{dma,}cachep==NULL. Thus no special case
685 	 * for large kmalloc calls required.
686 	 */
687 	if (unlikely(gfpflags & GFP_DMA))
688 		return csizep->cs_dmacachep;
689 	return csizep->cs_cachep;
690 }
691 
692 kmem_cache_t *kmem_find_general_cachep(size_t size, gfp_t gfpflags)
693 {
694 	return __find_general_cachep(size, gfpflags);
695 }
696 EXPORT_SYMBOL(kmem_find_general_cachep);
697 
698 /* Calculate the number of objs, wastage, and bytes left over for a given slab size. */
699 static void cache_estimate(unsigned long gfporder, size_t size, size_t align,
700 			   int flags, size_t *left_over, unsigned int *num)
701 {
702 	int i;
703 	size_t wastage = PAGE_SIZE << gfporder;
704 	size_t extra = 0;
705 	size_t base = 0;
706 
707 	if (!(flags & CFLGS_OFF_SLAB)) {
708 		base = sizeof(struct slab);
709 		extra = sizeof(kmem_bufctl_t);
710 	}
711 	i = 0;
712 	while (i * size + ALIGN(base + i * extra, align) <= wastage)
713 		i++;
714 	if (i > 0)
715 		i--;
716 
717 	if (i > SLAB_LIMIT)
718 		i = SLAB_LIMIT;
719 
720 	*num = i;
721 	wastage -= i * size;
722 	wastage -= ALIGN(base + i * extra, align);
723 	*left_over = wastage;
724 }
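
/*
 * Worked example (illustrative numbers; 4 kB pages, 4-byte words, on-slab
 * management, align == 32): for a 256-byte object, each candidate count i
 * costs i*256 bytes for the objects plus ALIGN(sizeof(struct slab) + i *
 * sizeof(kmem_bufctl_t), 32) bytes of management.  The loop stops at
 * i == 15, since 16 objects alone would already fill the 4096-byte slab,
 * and the remaining ~160 bytes are reported in *left_over for colouring.
 */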
725 
726 #define slab_error(cachep, msg) __slab_error(__FUNCTION__, cachep, msg)
727 
728 static void __slab_error(const char *function, kmem_cache_t *cachep, char *msg)
729 {
730 	printk(KERN_ERR "slab error in %s(): cache `%s': %s\n",
731 	       function, cachep->name, msg);
732 	dump_stack();
733 }
734 
735 /*
736  * Initiate the reap timer running on the target CPU.  We run at around 1 to 2Hz
737  * via the workqueue/eventd.
738  * Add the CPU number into the expiration time to minimize the possibility of
739  * the CPUs getting into lockstep and contending for the global cache chain
740  * lock.
741  */
742 static void __devinit start_cpu_timer(int cpu)
743 {
744 	struct work_struct *reap_work = &per_cpu(reap_work, cpu);
745 
746 	/*
747 	 * When this gets called from do_initcalls via cpucache_init(),
748 	 * init_workqueues() has already run, so keventd will be setup
749 	 * at that time.
750 	 */
751 	if (keventd_up() && reap_work->func == NULL) {
752 		INIT_WORK(reap_work, cache_reap, NULL);
753 		schedule_delayed_work_on(cpu, reap_work, HZ + 3 * cpu);
754 	}
755 }
756 
757 static struct array_cache *alloc_arraycache(int node, int entries,
758 					    int batchcount)
759 {
760 	int memsize = sizeof(void *) * entries + sizeof(struct array_cache);
761 	struct array_cache *nc = NULL;
762 
763 	nc = kmalloc_node(memsize, GFP_KERNEL, node);
764 	if (nc) {
765 		nc->avail = 0;
766 		nc->limit = entries;
767 		nc->batchcount = batchcount;
768 		nc->touched = 0;
769 		spin_lock_init(&nc->lock);
770 	}
771 	return nc;
772 }
773 
774 #ifdef CONFIG_NUMA
775 static inline struct array_cache **alloc_alien_cache(int node, int limit)
776 {
777 	struct array_cache **ac_ptr;
778 	int memsize = sizeof(void *) * MAX_NUMNODES;
779 	int i;
780 
781 	if (limit > 1)
782 		limit = 12;
783 	ac_ptr = kmalloc_node(memsize, GFP_KERNEL, node);
784 	if (ac_ptr) {
785 		for_each_node(i) {
786 			if (i == node || !node_online(i)) {
787 				ac_ptr[i] = NULL;
788 				continue;
789 			}
790 			ac_ptr[i] = alloc_arraycache(node, limit, 0xbaadf00d);
791 			if (!ac_ptr[i]) {
792 				for (i--; i >= 0; i--)
793 					kfree(ac_ptr[i]);
794 				kfree(ac_ptr);
795 				return NULL;
796 			}
797 		}
798 	}
799 	return ac_ptr;
800 }
801 
802 static inline void free_alien_cache(struct array_cache **ac_ptr)
803 {
804 	int i;
805 
806 	if (!ac_ptr)
807 		return;
808 
809 	for_each_node(i)
810 	    kfree(ac_ptr[i]);
811 
812 	kfree(ac_ptr);
813 }
814 
815 static inline void __drain_alien_cache(kmem_cache_t *cachep,
816 				       struct array_cache *ac, int node)
817 {
818 	struct kmem_list3 *rl3 = cachep->nodelists[node];
819 
820 	if (ac->avail) {
821 		spin_lock(&rl3->list_lock);
822 		free_block(cachep, ac->entry, ac->avail, node);
823 		ac->avail = 0;
824 		spin_unlock(&rl3->list_lock);
825 	}
826 }
827 
828 static void drain_alien_cache(kmem_cache_t *cachep, struct kmem_list3 *l3)
829 {
830 	int i = 0;
831 	struct array_cache *ac;
832 	unsigned long flags;
833 
834 	for_each_online_node(i) {
835 		ac = l3->alien[i];
836 		if (ac) {
837 			spin_lock_irqsave(&ac->lock, flags);
838 			__drain_alien_cache(cachep, ac, i);
839 			spin_unlock_irqrestore(&ac->lock, flags);
840 		}
841 	}
842 }
843 #else
844 #define alloc_alien_cache(node, limit) do { } while (0)
845 #define free_alien_cache(ac_ptr) do { } while (0)
846 #define drain_alien_cache(cachep, l3) do { } while (0)
847 #endif
848 
849 static int __devinit cpuup_callback(struct notifier_block *nfb,
850 				    unsigned long action, void *hcpu)
851 {
852 	long cpu = (long)hcpu;
853 	kmem_cache_t *cachep;
854 	struct kmem_list3 *l3 = NULL;
855 	int node = cpu_to_node(cpu);
856 	int memsize = sizeof(struct kmem_list3);
857 	struct array_cache *nc = NULL;
858 
859 	switch (action) {
860 	case CPU_UP_PREPARE:
861 		down(&cache_chain_sem);
862 		/* we need to do this right in the beginning since
863 		 * alloc_arraycache's are going to use this list.
864 		 * kmalloc_node allows us to add the slab to the right
865 		 * kmem_list3 and not this cpu's kmem_list3
866 		 */
867 
868 		list_for_each_entry(cachep, &cache_chain, next) {
869 			/* setup the size64 kmemlist for cpu before we can
870 			 * begin anything. Make sure some other cpu on this
871 			 * node has not already allocated this
872 			 */
873 			if (!cachep->nodelists[node]) {
874 				if (!(l3 = kmalloc_node(memsize,
875 							GFP_KERNEL, node)))
876 					goto bad;
877 				kmem_list3_init(l3);
878 				l3->next_reap = jiffies + REAPTIMEOUT_LIST3 +
879 				    ((unsigned long)cachep) % REAPTIMEOUT_LIST3;
880 
881 				cachep->nodelists[node] = l3;
882 			}
883 
884 			spin_lock_irq(&cachep->nodelists[node]->list_lock);
885 			cachep->nodelists[node]->free_limit =
886 			    (1 + nr_cpus_node(node)) *
887 			    cachep->batchcount + cachep->num;
888 			spin_unlock_irq(&cachep->nodelists[node]->list_lock);
889 		}
890 
891 		/* Now we can go ahead with allocating the shared arrays
892 		   & array caches */
893 		list_for_each_entry(cachep, &cache_chain, next) {
894 			nc = alloc_arraycache(node, cachep->limit,
895 					      cachep->batchcount);
896 			if (!nc)
897 				goto bad;
898 			cachep->array[cpu] = nc;
899 
900 			l3 = cachep->nodelists[node];
901 			BUG_ON(!l3);
902 			if (!l3->shared) {
903 				if (!(nc = alloc_arraycache(node,
904 							    cachep->shared *
905 							    cachep->batchcount,
906 							    0xbaadf00d)))
907 					goto bad;
908 
909 				/* we are serialised from CPU_DEAD or
910 				   CPU_UP_CANCELLED by the cpucontrol lock */
911 				l3->shared = nc;
912 			}
913 		}
914 		up(&cache_chain_sem);
915 		break;
916 	case CPU_ONLINE:
917 		start_cpu_timer(cpu);
918 		break;
919 #ifdef CONFIG_HOTPLUG_CPU
920 	case CPU_DEAD:
921 		/* fall thru */
922 	case CPU_UP_CANCELED:
923 		down(&cache_chain_sem);
924 
925 		list_for_each_entry(cachep, &cache_chain, next) {
926 			struct array_cache *nc;
927 			cpumask_t mask;
928 
929 			mask = node_to_cpumask(node);
930 			spin_lock_irq(&cachep->spinlock);
931 			/* cpu is dead; no one can alloc from it. */
932 			nc = cachep->array[cpu];
933 			cachep->array[cpu] = NULL;
934 			l3 = cachep->nodelists[node];
935 
936 			if (!l3)
937 				goto unlock_cache;
938 
939 			spin_lock(&l3->list_lock);
940 
941 			/* Free limit for this kmem_list3 */
942 			l3->free_limit -= cachep->batchcount;
943 			if (nc)
944 				free_block(cachep, nc->entry, nc->avail, node);
945 
946 			if (!cpus_empty(mask)) {
947 				spin_unlock(&l3->list_lock);
948 				goto unlock_cache;
949 			}
950 
951 			if (l3->shared) {
952 				free_block(cachep, l3->shared->entry,
953 					   l3->shared->avail, node);
954 				kfree(l3->shared);
955 				l3->shared = NULL;
956 			}
957 			if (l3->alien) {
958 				drain_alien_cache(cachep, l3);
959 				free_alien_cache(l3->alien);
960 				l3->alien = NULL;
961 			}
962 
963 			/* free slabs belonging to this node */
964 			if (__node_shrink(cachep, node)) {
965 				cachep->nodelists[node] = NULL;
966 				spin_unlock(&l3->list_lock);
967 				kfree(l3);
968 			} else {
969 				spin_unlock(&l3->list_lock);
970 			}
971 		      unlock_cache:
972 			spin_unlock_irq(&cachep->spinlock);
973 			kfree(nc);
974 		}
975 		up(&cache_chain_sem);
976 		break;
977 #endif
978 	}
979 	return NOTIFY_OK;
980       bad:
981 	up(&cache_chain_sem);
982 	return NOTIFY_BAD;
983 }
984 
985 static struct notifier_block cpucache_notifier = { &cpuup_callback, NULL, 0 };
986 
987 /*
988  * swap the static kmem_list3 with kmalloced memory
989  */
990 static void init_list(kmem_cache_t *cachep, struct kmem_list3 *list, int nodeid)
991 {
992 	struct kmem_list3 *ptr;
993 
994 	BUG_ON(cachep->nodelists[nodeid] != list);
995 	ptr = kmalloc_node(sizeof(struct kmem_list3), GFP_KERNEL, nodeid);
996 	BUG_ON(!ptr);
997 
998 	local_irq_disable();
999 	memcpy(ptr, list, sizeof(struct kmem_list3));
1000 	MAKE_ALL_LISTS(cachep, ptr, nodeid);
1001 	cachep->nodelists[nodeid] = ptr;
1002 	local_irq_enable();
1003 }
1004 
1005 /* Initialisation.
1006  * Called after the gfp() functions have been enabled, and before smp_init().
1007  */
1008 void __init kmem_cache_init(void)
1009 {
1010 	size_t left_over;
1011 	struct cache_sizes *sizes;
1012 	struct cache_names *names;
1013 	int i;
1014 
1015 	for (i = 0; i < NUM_INIT_LISTS; i++) {
1016 		kmem_list3_init(&initkmem_list3[i]);
1017 		if (i < MAX_NUMNODES)
1018 			cache_cache.nodelists[i] = NULL;
1019 	}
1020 
1021 	/*
1022 	 * Fragmentation resistance on low memory - only use bigger
1023 	 * page orders on machines with more than 32MB of memory.
1024 	 */
1025 	if (num_physpages > (32 << 20) >> PAGE_SHIFT)
1026 		slab_break_gfp_order = BREAK_GFP_ORDER_HI;
1027 
1028 	/* Bootstrap is tricky, because several objects are allocated
1029 	 * from caches that do not exist yet:
1030 	 * 1) initialize the cache_cache cache: it contains the kmem_cache_t
1031 	 *    structures of all caches, except cache_cache itself: cache_cache
1032 	 *    is statically allocated.
1033 	 *    Initially an __init data area is used for the head array and the
1034 	 *    kmem_list3 structures, it's replaced with a kmalloc allocated
1035 	 *    array at the end of the bootstrap.
1036 	 * 2) Create the first kmalloc cache.
1037 	 *    The kmem_cache_t for the new cache is allocated normally.
1038 	 *    An __init data area is used for the head array.
1039 	 * 3) Create the remaining kmalloc caches, with minimally sized
1040 	 *    head arrays.
1041 	 * 4) Replace the __init data head arrays for cache_cache and the first
1042 	 *    kmalloc cache with kmalloc allocated arrays.
1043 	 * 5) Replace the __init data for kmem_list3 for cache_cache and
1044 	 *    the other caches with kmalloc allocated memory.
1045 	 * 6) Resize the head arrays of the kmalloc caches to their final sizes.
1046 	 */
1047 
1048 	/* 1) create the cache_cache */
1049 	init_MUTEX(&cache_chain_sem);
1050 	INIT_LIST_HEAD(&cache_chain);
1051 	list_add(&cache_cache.next, &cache_chain);
1052 	cache_cache.colour_off = cache_line_size();
1053 	cache_cache.array[smp_processor_id()] = &initarray_cache.cache;
1054 	cache_cache.nodelists[numa_node_id()] = &initkmem_list3[CACHE_CACHE];
1055 
1056 	cache_cache.objsize = ALIGN(cache_cache.objsize, cache_line_size());
1057 
1058 	cache_estimate(0, cache_cache.objsize, cache_line_size(), 0,
1059 		       &left_over, &cache_cache.num);
1060 	if (!cache_cache.num)
1061 		BUG();
1062 
1063 	cache_cache.colour = left_over / cache_cache.colour_off;
1064 	cache_cache.colour_next = 0;
1065 	cache_cache.slab_size = ALIGN(cache_cache.num * sizeof(kmem_bufctl_t) +
1066 				      sizeof(struct slab), cache_line_size());
1067 
1068 	/* 2+3) create the kmalloc caches */
1069 	sizes = malloc_sizes;
1070 	names = cache_names;
1071 
1072 	/* Initialize the caches that provide memory for the array cache
1073 	 * and the kmem_list3 structures first.
1074 	 * Without this, further allocations will bug
1075 	 * Without this, further allocations will BUG().
1076 
1077 	sizes[INDEX_AC].cs_cachep = kmem_cache_create(names[INDEX_AC].name,
1078 						      sizes[INDEX_AC].cs_size,
1079 						      ARCH_KMALLOC_MINALIGN,
1080 						      (ARCH_KMALLOC_FLAGS |
1081 						       SLAB_PANIC), NULL, NULL);
1082 
1083 	if (INDEX_AC != INDEX_L3)
1084 		sizes[INDEX_L3].cs_cachep =
1085 		    kmem_cache_create(names[INDEX_L3].name,
1086 				      sizes[INDEX_L3].cs_size,
1087 				      ARCH_KMALLOC_MINALIGN,
1088 				      (ARCH_KMALLOC_FLAGS | SLAB_PANIC), NULL,
1089 				      NULL);
1090 
1091 	while (sizes->cs_size != ULONG_MAX) {
1092 		/*
1093 		 * For performance, all the general caches are L1 aligned.
1094 		 * This should be particularly beneficial on SMP boxes, as it
1095 		 * eliminates "false sharing".
1096 		 * Note for systems short on memory removing the alignment will
1097 		 * allow tighter packing of the smaller caches.
1098 		 */
1099 		if (!sizes->cs_cachep)
1100 			sizes->cs_cachep = kmem_cache_create(names->name,
1101 							     sizes->cs_size,
1102 							     ARCH_KMALLOC_MINALIGN,
1103 							     (ARCH_KMALLOC_FLAGS
1104 							      | SLAB_PANIC),
1105 							     NULL, NULL);
1106 
1107 		/* Inc off-slab bufctl limit until the ceiling is hit. */
1108 		if (!(OFF_SLAB(sizes->cs_cachep))) {
1109 			offslab_limit = sizes->cs_size - sizeof(struct slab);
1110 			offslab_limit /= sizeof(kmem_bufctl_t);
1111 		}
1112 
1113 		sizes->cs_dmacachep = kmem_cache_create(names->name_dma,
1114 							sizes->cs_size,
1115 							ARCH_KMALLOC_MINALIGN,
1116 							(ARCH_KMALLOC_FLAGS |
1117 							 SLAB_CACHE_DMA |
1118 							 SLAB_PANIC), NULL,
1119 							NULL);
1120 
1121 		sizes++;
1122 		names++;
1123 	}
1124 	/* 4) Replace the bootstrap head arrays */
1125 	{
1126 		void *ptr;
1127 
1128 		ptr = kmalloc(sizeof(struct arraycache_init), GFP_KERNEL);
1129 
1130 		local_irq_disable();
1131 		BUG_ON(ac_data(&cache_cache) != &initarray_cache.cache);
1132 		memcpy(ptr, ac_data(&cache_cache),
1133 		       sizeof(struct arraycache_init));
1134 		cache_cache.array[smp_processor_id()] = ptr;
1135 		local_irq_enable();
1136 
1137 		ptr = kmalloc(sizeof(struct arraycache_init), GFP_KERNEL);
1138 
1139 		local_irq_disable();
1140 		BUG_ON(ac_data(malloc_sizes[INDEX_AC].cs_cachep)
1141 		       != &initarray_generic.cache);
1142 		memcpy(ptr, ac_data(malloc_sizes[INDEX_AC].cs_cachep),
1143 		       sizeof(struct arraycache_init));
1144 		malloc_sizes[INDEX_AC].cs_cachep->array[smp_processor_id()] =
1145 		    ptr;
1146 		local_irq_enable();
1147 	}
1148 	/* 5) Replace the bootstrap kmem_list3's */
1149 	{
1150 		int node;
1151 		/* Replace the static kmem_list3 structures for the boot cpu */
1152 		init_list(&cache_cache, &initkmem_list3[CACHE_CACHE],
1153 			  numa_node_id());
1154 
1155 		for_each_online_node(node) {
1156 			init_list(malloc_sizes[INDEX_AC].cs_cachep,
1157 				  &initkmem_list3[SIZE_AC + node], node);
1158 
1159 			if (INDEX_AC != INDEX_L3) {
1160 				init_list(malloc_sizes[INDEX_L3].cs_cachep,
1161 					  &initkmem_list3[SIZE_L3 + node],
1162 					  node);
1163 			}
1164 		}
1165 	}
1166 
1167 	/* 6) resize the head arrays to their final sizes */
1168 	{
1169 		kmem_cache_t *cachep;
1170 		down(&cache_chain_sem);
1171 		list_for_each_entry(cachep, &cache_chain, next)
1172 		    enable_cpucache(cachep);
1173 		up(&cache_chain_sem);
1174 	}
1175 
1176 	/* Done! */
1177 	g_cpucache_up = FULL;
1178 
1179 	/* Register a cpu startup notifier callback
1180 	 * that initializes ac_data for all new cpus
1181 	 */
1182 	register_cpu_notifier(&cpucache_notifier);
1183 
1184 	/* The reap timers are started later, with a module init call:
1185 	 * That part of the kernel is not yet operational.
1186 	 */
1187 }
1188 
1189 static int __init cpucache_init(void)
1190 {
1191 	int cpu;
1192 
1193 	/*
1194 	 * Register the timers that return unneeded
1195 	 * pages to gfp.
1196 	 */
1197 	for_each_online_cpu(cpu)
1198 	    start_cpu_timer(cpu);
1199 
1200 	return 0;
1201 }
1202 
1203 __initcall(cpucache_init);
1204 
1205 /*
1206  * Interface to system's page allocator. No need to hold the cache-lock.
1207  *
1208  * If we requested dmaable memory, we will get it. Even if we
1209  * did not request dmaable memory, we might get it, but that
1210  * would be relatively rare and ignorable.
1211  */
1212 static void *kmem_getpages(kmem_cache_t *cachep, gfp_t flags, int nodeid)
1213 {
1214 	struct page *page;
1215 	void *addr;
1216 	int i;
1217 
1218 	flags |= cachep->gfpflags;
1219 	page = alloc_pages_node(nodeid, flags, cachep->gfporder);
1220 	if (!page)
1221 		return NULL;
1222 	addr = page_address(page);
1223 
1224 	i = (1 << cachep->gfporder);
1225 	if (cachep->flags & SLAB_RECLAIM_ACCOUNT)
1226 		atomic_add(i, &slab_reclaim_pages);
1227 	add_page_state(nr_slab, i);
1228 	while (i--) {
1229 		SetPageSlab(page);
1230 		page++;
1231 	}
1232 	return addr;
1233 }
1234 
1235 /*
1236  * Interface to system's page release.
1237  */
1238 static void kmem_freepages(kmem_cache_t *cachep, void *addr)
1239 {
1240 	unsigned long i = (1 << cachep->gfporder);
1241 	struct page *page = virt_to_page(addr);
1242 	const unsigned long nr_freed = i;
1243 
1244 	while (i--) {
1245 		if (!TestClearPageSlab(page))
1246 			BUG();
1247 		page++;
1248 	}
1249 	sub_page_state(nr_slab, nr_freed);
1250 	if (current->reclaim_state)
1251 		current->reclaim_state->reclaimed_slab += nr_freed;
1252 	free_pages((unsigned long)addr, cachep->gfporder);
1253 	if (cachep->flags & SLAB_RECLAIM_ACCOUNT)
1254 		atomic_sub(1 << cachep->gfporder, &slab_reclaim_pages);
1255 }
1256 
1257 static void kmem_rcu_free(struct rcu_head *head)
1258 {
1259 	struct slab_rcu *slab_rcu = (struct slab_rcu *)head;
1260 	kmem_cache_t *cachep = slab_rcu->cachep;
1261 
1262 	kmem_freepages(cachep, slab_rcu->addr);
1263 	if (OFF_SLAB(cachep))
1264 		kmem_cache_free(cachep->slabp_cache, slab_rcu);
1265 }
1266 
1267 #if DEBUG
1268 
1269 #ifdef CONFIG_DEBUG_PAGEALLOC
1270 static void store_stackinfo(kmem_cache_t *cachep, unsigned long *addr,
1271 			    unsigned long caller)
1272 {
1273 	int size = obj_reallen(cachep);
1274 
1275 	addr = (unsigned long *)&((char *)addr)[obj_dbghead(cachep)];
1276 
1277 	if (size < 5 * sizeof(unsigned long))
1278 		return;
1279 
1280 	*addr++ = 0x12345678;
1281 	*addr++ = caller;
1282 	*addr++ = smp_processor_id();
1283 	size -= 3 * sizeof(unsigned long);
1284 	{
1285 		unsigned long *sptr = &caller;
1286 		unsigned long svalue;
1287 
1288 		while (!kstack_end(sptr)) {
1289 			svalue = *sptr++;
1290 			if (kernel_text_address(svalue)) {
1291 				*addr++ = svalue;
1292 				size -= sizeof(unsigned long);
1293 				if (size <= sizeof(unsigned long))
1294 					break;
1295 			}
1296 		}
1297 
1298 	}
1299 	*addr++ = 0x87654321;
1300 }
1301 #endif
1302 
1303 static void poison_obj(kmem_cache_t *cachep, void *addr, unsigned char val)
1304 {
1305 	int size = obj_reallen(cachep);
1306 	addr = &((char *)addr)[obj_dbghead(cachep)];
1307 
1308 	memset(addr, val, size);
1309 	*(unsigned char *)(addr + size - 1) = POISON_END;
1310 }
1311 
1312 static void dump_line(char *data, int offset, int limit)
1313 {
1314 	int i;
1315 	printk(KERN_ERR "%03x:", offset);
1316 	for (i = 0; i < limit; i++) {
1317 		printk(" %02x", (unsigned char)data[offset + i]);
1318 	}
1319 	printk("\n");
1320 }
1321 #endif
1322 
1323 #if DEBUG
1324 
1325 static void print_objinfo(kmem_cache_t *cachep, void *objp, int lines)
1326 {
1327 	int i, size;
1328 	char *realobj;
1329 
1330 	if (cachep->flags & SLAB_RED_ZONE) {
1331 		printk(KERN_ERR "Redzone: 0x%lx/0x%lx.\n",
1332 		       *dbg_redzone1(cachep, objp),
1333 		       *dbg_redzone2(cachep, objp));
1334 	}
1335 
1336 	if (cachep->flags & SLAB_STORE_USER) {
1337 		printk(KERN_ERR "Last user: [<%p>]",
1338 		       *dbg_userword(cachep, objp));
1339 		print_symbol("(%s)",
1340 			     (unsigned long)*dbg_userword(cachep, objp));
1341 		printk("\n");
1342 	}
1343 	realobj = (char *)objp + obj_dbghead(cachep);
1344 	size = obj_reallen(cachep);
1345 	for (i = 0; i < size && lines; i += 16, lines--) {
1346 		int limit;
1347 		limit = 16;
1348 		if (i + limit > size)
1349 			limit = size - i;
1350 		dump_line(realobj, i, limit);
1351 	}
1352 }
1353 
1354 static void check_poison_obj(kmem_cache_t *cachep, void *objp)
1355 {
1356 	char *realobj;
1357 	int size, i;
1358 	int lines = 0;
1359 
1360 	realobj = (char *)objp + obj_dbghead(cachep);
1361 	size = obj_reallen(cachep);
1362 
1363 	for (i = 0; i < size; i++) {
1364 		char exp = POISON_FREE;
1365 		if (i == size - 1)
1366 			exp = POISON_END;
1367 		if (realobj[i] != exp) {
1368 			int limit;
1369 			/* Mismatch ! */
1370 			/* Print header */
1371 			if (lines == 0) {
1372 				printk(KERN_ERR
1373 				       "Slab corruption: start=%p, len=%d\n",
1374 				       realobj, size);
1375 				print_objinfo(cachep, objp, 0);
1376 			}
1377 			/* Hexdump the affected line */
1378 			i = (i / 16) * 16;
1379 			limit = 16;
1380 			if (i + limit > size)
1381 				limit = size - i;
1382 			dump_line(realobj, i, limit);
1383 			i += 16;
1384 			lines++;
1385 			/* Limit to 5 lines */
1386 			if (lines > 5)
1387 				break;
1388 		}
1389 	}
1390 	if (lines != 0) {
1391 		/* Print some data about the neighboring objects, if they
1392 		 * exist:
1393 		 */
1394 		struct slab *slabp = page_get_slab(virt_to_page(objp));
1395 		int objnr;
1396 
1397 		objnr = (objp - slabp->s_mem) / cachep->objsize;
1398 		if (objnr) {
1399 			objp = slabp->s_mem + (objnr - 1) * cachep->objsize;
1400 			realobj = (char *)objp + obj_dbghead(cachep);
1401 			printk(KERN_ERR "Prev obj: start=%p, len=%d\n",
1402 			       realobj, size);
1403 			print_objinfo(cachep, objp, 2);
1404 		}
1405 		if (objnr + 1 < cachep->num) {
1406 			objp = slabp->s_mem + (objnr + 1) * cachep->objsize;
1407 			realobj = (char *)objp + obj_dbghead(cachep);
1408 			printk(KERN_ERR "Next obj: start=%p, len=%d\n",
1409 			       realobj, size);
1410 			print_objinfo(cachep, objp, 2);
1411 		}
1412 	}
1413 }
1414 #endif
1415 
1416 /* Destroy all the objs in a slab, and release the mem back to the system.
1417  * Before calling, the slab must have been unlinked from the cache.
1418  * The cache-lock is not held/needed.
1419  */
1420 static void slab_destroy(kmem_cache_t *cachep, struct slab *slabp)
1421 {
1422 	void *addr = slabp->s_mem - slabp->colouroff;
1423 
1424 #if DEBUG
1425 	int i;
1426 	for (i = 0; i < cachep->num; i++) {
1427 		void *objp = slabp->s_mem + cachep->objsize * i;
1428 
1429 		if (cachep->flags & SLAB_POISON) {
1430 #ifdef CONFIG_DEBUG_PAGEALLOC
1431 			if ((cachep->objsize % PAGE_SIZE) == 0
1432 			    && OFF_SLAB(cachep))
1433 				kernel_map_pages(virt_to_page(objp),
1434 						 cachep->objsize / PAGE_SIZE,
1435 						 1);
1436 			else
1437 				check_poison_obj(cachep, objp);
1438 #else
1439 			check_poison_obj(cachep, objp);
1440 #endif
1441 		}
1442 		if (cachep->flags & SLAB_RED_ZONE) {
1443 			if (*dbg_redzone1(cachep, objp) != RED_INACTIVE)
1444 				slab_error(cachep, "start of a freed object "
1445 					   "was overwritten");
1446 			if (*dbg_redzone2(cachep, objp) != RED_INACTIVE)
1447 				slab_error(cachep, "end of a freed object "
1448 					   "was overwritten");
1449 		}
1450 		if (cachep->dtor && !(cachep->flags & SLAB_POISON))
1451 			(cachep->dtor) (objp + obj_dbghead(cachep), cachep, 0);
1452 	}
1453 #else
1454 	if (cachep->dtor) {
1455 		int i;
1456 		for (i = 0; i < cachep->num; i++) {
1457 			void *objp = slabp->s_mem + cachep->objsize * i;
1458 			(cachep->dtor) (objp, cachep, 0);
1459 		}
1460 	}
1461 #endif
1462 
1463 	if (unlikely(cachep->flags & SLAB_DESTROY_BY_RCU)) {
1464 		struct slab_rcu *slab_rcu;
1465 
1466 		slab_rcu = (struct slab_rcu *)slabp;
1467 		slab_rcu->cachep = cachep;
1468 		slab_rcu->addr = addr;
1469 		call_rcu(&slab_rcu->head, kmem_rcu_free);
1470 	} else {
1471 		kmem_freepages(cachep, addr);
1472 		if (OFF_SLAB(cachep))
1473 			kmem_cache_free(cachep->slabp_cache, slabp);
1474 	}
1475 }
1476 
1477 /* For setting up all the kmem_list3s for a cache whose objsize is the same
1478    as the size of kmem_list3. */
1479 static inline void set_up_list3s(kmem_cache_t *cachep, int index)
1480 {
1481 	int node;
1482 
1483 	for_each_online_node(node) {
1484 		cachep->nodelists[node] = &initkmem_list3[index + node];
1485 		cachep->nodelists[node]->next_reap = jiffies +
1486 		    REAPTIMEOUT_LIST3 +
1487 		    ((unsigned long)cachep) % REAPTIMEOUT_LIST3;
1488 	}
1489 }
1490 
1491 /**
1492  * calculate_slab_order - calculate size (page order) of slabs and the number
1493  *                        of objects per slab.
1494  *
1495  * This could be made much more intelligent.  For now, try to avoid using
1496  * high order pages for slabs.  When the gfp() functions are more friendly
1497  * towards high-order requests, this should be changed.
1498  */
1499 static inline size_t calculate_slab_order(kmem_cache_t *cachep, size_t size,
1500 					  size_t align, gfp_t flags)
1501 {
1502 	size_t left_over = 0;
1503 
1504 	for (;; cachep->gfporder++) {
1505 		unsigned int num;
1506 		size_t remainder;
1507 
1508 		if (cachep->gfporder > MAX_GFP_ORDER) {
1509 			cachep->num = 0;
1510 			break;
1511 		}
1512 
1513 		cache_estimate(cachep->gfporder, size, align, flags,
1514 			       &remainder, &num);
1515 		if (!num)
1516 			continue;
1517 		/* More than offslab_limit objects will cause problems */
1518 		if (flags & CFLGS_OFF_SLAB && cachep->num > offslab_limit)
1519 			break;
1520 
1521 		cachep->num = num;
1522 		left_over = remainder;
1523 
1524 		/*
1525 		 * Large number of objects is good, but very large slabs are
1526 		 * currently bad for the gfp()s.
1527 		 */
1528 		if (cachep->gfporder >= slab_break_gfp_order)
1529 			break;
1530 
1531 		if ((left_over * 8) <= (PAGE_SIZE << cachep->gfporder))
1532 			/* Acceptable internal fragmentation */
1533 			break;
1534 	}
1535 	return left_over;
1536 }
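
/*
 * Example of the trade-off (illustrative object size): for a hypothetical
 * 1100-byte object with off-slab management on 4 kB pages, order 0 fits
 * only 3 objects and leaves ~796 bytes unused (more than 1/8 of the slab),
 * so the loop tries order 1, which fits 7 objects with ~492 bytes left over
 * and also reaches the slab_break_gfp_order limit, so the search stops
 * there.
 */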
1537 
1538 /**
1539  * kmem_cache_create - Create a cache.
1540  * @name: A string which is used in /proc/slabinfo to identify this cache.
1541  * @size: The size of objects to be created in this cache.
1542  * @align: The required alignment for the objects.
1543  * @flags: SLAB flags
1544  * @ctor: A constructor for the objects.
1545  * @dtor: A destructor for the objects.
1546  *
1547  * Returns a ptr to the cache on success, NULL on failure.
1548  * Cannot be called within an interrupt, but can be interrupted.
1549  * The @ctor is run when new pages are allocated by the cache
1550  * and the @dtor is run before the pages are handed back.
1551  *
1552  * @name must be valid until the cache is destroyed. This implies that
1553  * the module calling this has to destroy the cache before getting
1554  * unloaded.
1555  *
1556  * The flags are
1557  *
1558  * %SLAB_POISON - Poison the slab with a known test pattern (a5a5a5a5)
1559  * to catch references to uninitialised memory.
1560  *
1561  * %SLAB_RED_ZONE - Insert `Red' zones around the allocated memory to check
1562  * for buffer overruns.
1563  *
1564  * %SLAB_NO_REAP - Don't automatically reap this cache when we're under
1565  * memory pressure.
1566  *
1567  * %SLAB_HWCACHE_ALIGN - Align the objects in this cache to a hardware
1568  * cacheline.  This can be beneficial if you're counting cycles as closely
1569  * as davem.
1570  */
1571 kmem_cache_t *
1572 kmem_cache_create (const char *name, size_t size, size_t align,
1573 	unsigned long flags, void (*ctor)(void*, kmem_cache_t *, unsigned long),
1574 	void (*dtor)(void*, kmem_cache_t *, unsigned long))
1575 {
1576 	size_t left_over, slab_size, ralign;
1577 	kmem_cache_t *cachep = NULL;
1578 	struct list_head *p;
1579 
1580 	/*
1581 	 * Sanity checks... these are all serious usage bugs.
1582 	 */
1583 	if ((!name) ||
1584 	    in_interrupt() ||
1585 	    (size < BYTES_PER_WORD) ||
1586 	    (size > (1 << MAX_OBJ_ORDER) * PAGE_SIZE) || (dtor && !ctor)) {
1587 		printk(KERN_ERR "%s: Early error in slab %s\n",
1588 		       __FUNCTION__, name);
1589 		BUG();
1590 	}
1591 
1592 	down(&cache_chain_sem);
1593 
1594 	list_for_each(p, &cache_chain) {
1595 		kmem_cache_t *pc = list_entry(p, kmem_cache_t, next);
1596 		mm_segment_t old_fs = get_fs();
1597 		char tmp;
1598 		int res;
1599 
1600 		/*
1601 		 * This happens when the module gets unloaded and doesn't
1602 		 * destroy its slab cache and no-one else reuses the vmalloc
1603 		 * area of the module.  Print a warning.
1604 		 */
1605 		set_fs(KERNEL_DS);
1606 		res = __get_user(tmp, pc->name);
1607 		set_fs(old_fs);
1608 		if (res) {
1609 			printk("SLAB: cache with size %d has lost its name\n",
1610 			       pc->objsize);
1611 			continue;
1612 		}
1613 
1614 		if (!strcmp(pc->name, name)) {
1615 			printk("kmem_cache_create: duplicate cache %s\n", name);
1616 			dump_stack();
1617 			goto oops;
1618 		}
1619 	}
1620 
1621 #if DEBUG
1622 	WARN_ON(strchr(name, ' '));	/* It confuses parsers */
1623 	if ((flags & SLAB_DEBUG_INITIAL) && !ctor) {
1624 		/* No constructor, but initial state check requested */
1625 		printk(KERN_ERR "%s: No con, but init state check "
1626 		       "requested - %s\n", __FUNCTION__, name);
1627 		flags &= ~SLAB_DEBUG_INITIAL;
1628 	}
1629 #if FORCED_DEBUG
1630 	/*
1631 	 * Enable redzoning and last user accounting, except for caches with
1632 	 * large objects, if the increased size would increase the object size
1633 	 * above the next power of two: caches with object sizes just above a
1634 	 * power of two have a significant amount of internal fragmentation.
1635 	 */
1636 	if ((size < 4096
1637 	     || fls(size - 1) == fls(size - 1 + 3 * BYTES_PER_WORD)))
1638 		flags |= SLAB_RED_ZONE | SLAB_STORE_USER;
1639 	if (!(flags & SLAB_DESTROY_BY_RCU))
1640 		flags |= SLAB_POISON;
1641 #endif
1642 	if (flags & SLAB_DESTROY_BY_RCU)
1643 		BUG_ON(flags & SLAB_POISON);
1644 #endif
1645 	if (flags & SLAB_DESTROY_BY_RCU)
1646 		BUG_ON(dtor);
1647 
1648 	/*
1649 	 * Always checks flags; a caller might be expecting debug
1650 	 * support which isn't available.
1651 	 */
1652 	if (flags & ~CREATE_MASK)
1653 		BUG();
1654 
1655 	/* Check that size is in terms of words.  This is needed to avoid
1656 	 * unaligned accesses for some archs when redzoning is used, and makes
1657 	 * sure any on-slab bufctl's are also correctly aligned.
1658 	 */
1659 	if (size & (BYTES_PER_WORD - 1)) {
1660 		size += (BYTES_PER_WORD - 1);
1661 		size &= ~(BYTES_PER_WORD - 1);
1662 	}
1663 
1664 	/* calculate the final buffer alignment: */
1665 	/* 1) arch recommendation: can be overridden for debug */
1666 	if (flags & SLAB_HWCACHE_ALIGN) {
1667 		/* Default alignment: as specified by the arch code.
1668 		 * Except if an object is really small, then squeeze multiple
1669 		 * objects into one cacheline.
1670 		 */
1671 		ralign = cache_line_size();
1672 		while (size <= ralign / 2)
1673 			ralign /= 2;
1674 	} else {
1675 		ralign = BYTES_PER_WORD;
1676 	}
1677 	/* 2) arch mandated alignment: disables debug if necessary */
1678 	if (ralign < ARCH_SLAB_MINALIGN) {
1679 		ralign = ARCH_SLAB_MINALIGN;
1680 		if (ralign > BYTES_PER_WORD)
1681 			flags &= ~(SLAB_RED_ZONE | SLAB_STORE_USER);
1682 	}
1683 	/* 3) caller mandated alignment: disables debug if necessary */
1684 	if (ralign < align) {
1685 		ralign = align;
1686 		if (ralign > BYTES_PER_WORD)
1687 			flags &= ~(SLAB_RED_ZONE | SLAB_STORE_USER);
1688 	}
1689 	/* 4) Store it. Note that the debug code below can reduce
1690 	 *    the alignment to BYTES_PER_WORD.
1691 	 */
1692 	align = ralign;
1693 
1694 	/* Get cache's description obj. */
1695 	cachep = (kmem_cache_t *) kmem_cache_alloc(&cache_cache, SLAB_KERNEL);
1696 	if (!cachep)
1697 		goto oops;
1698 	memset(cachep, 0, sizeof(kmem_cache_t));
1699 
1700 #if DEBUG
1701 	cachep->reallen = size;
1702 
1703 	if (flags & SLAB_RED_ZONE) {
1704 		/* redzoning only works with word aligned caches */
1705 		align = BYTES_PER_WORD;
1706 
1707 		/* add space for red zone words */
1708 		cachep->dbghead += BYTES_PER_WORD;
1709 		size += 2 * BYTES_PER_WORD;
1710 	}
1711 	if (flags & SLAB_STORE_USER) {
1712 		/* user store requires word alignment and
1713 		 * one word storage behind the end of the real
1714 		 * object.
1715 		 */
1716 		align = BYTES_PER_WORD;
1717 		size += BYTES_PER_WORD;
1718 	}
1719 #if FORCED_DEBUG && defined(CONFIG_DEBUG_PAGEALLOC)
1720 	if (size >= malloc_sizes[INDEX_L3 + 1].cs_size
1721 	    && cachep->reallen > cache_line_size() && size < PAGE_SIZE) {
1722 		cachep->dbghead += PAGE_SIZE - size;
1723 		size = PAGE_SIZE;
1724 	}
1725 #endif
1726 #endif
1727 
1728 	/* Determine if the slab management is 'on' or 'off' slab. */
1729 	if (size >= (PAGE_SIZE >> 3))
1730 		/*
1731 		 * Size is large, assume best to place the slab management obj
1732 		 * off-slab (should allow better packing of objs).
1733 		 */
1734 		flags |= CFLGS_OFF_SLAB;
1735 
1736 	size = ALIGN(size, align);
1737 
1738 	if ((flags & SLAB_RECLAIM_ACCOUNT) && size <= PAGE_SIZE) {
1739 		/*
1740 		 * A VFS-reclaimable slab tends to have most allocations
1741 		 * as GFP_NOFS and we really don't want to have to be allocating
1742 		 * higher-order pages when we are unable to shrink dcache.
1743 		 */
1744 		cachep->gfporder = 0;
1745 		cache_estimate(cachep->gfporder, size, align, flags,
1746 			       &left_over, &cachep->num);
1747 	} else
1748 		left_over = calculate_slab_order(cachep, size, align, flags);
1749 
1750 	if (!cachep->num) {
1751 		printk(KERN_ERR "kmem_cache_create: couldn't create cache %s.\n", name);
1752 		kmem_cache_free(&cache_cache, cachep);
1753 		cachep = NULL;
1754 		goto oops;
1755 	}
1756 	slab_size = ALIGN(cachep->num * sizeof(kmem_bufctl_t)
1757 			  + sizeof(struct slab), align);
1758 
1759 	/*
1760 	 * If the slab has been placed off-slab, and we have enough space then
1761 	 * move it on-slab. This is at the expense of any extra colouring.
1762 	 */
1763 	if (flags & CFLGS_OFF_SLAB && left_over >= slab_size) {
1764 		flags &= ~CFLGS_OFF_SLAB;
1765 		left_over -= slab_size;
1766 	}
1767 
1768 	if (flags & CFLGS_OFF_SLAB) {
1769 		/* really off slab. No need for manual alignment */
1770 		slab_size =
1771 		    cachep->num * sizeof(kmem_bufctl_t) + sizeof(struct slab);
1772 	}
1773 
1774 	cachep->colour_off = cache_line_size();
1775 	/* Offset must be a multiple of the alignment. */
1776 	if (cachep->colour_off < align)
1777 		cachep->colour_off = align;
1778 	cachep->colour = left_over / cachep->colour_off;
1779 	cachep->slab_size = slab_size;
1780 	cachep->flags = flags;
1781 	cachep->gfpflags = 0;
1782 	if (flags & SLAB_CACHE_DMA)
1783 		cachep->gfpflags |= GFP_DMA;
1784 	spin_lock_init(&cachep->spinlock);
1785 	cachep->objsize = size;
1786 
1787 	if (flags & CFLGS_OFF_SLAB)
1788 		cachep->slabp_cache = kmem_find_general_cachep(slab_size, 0u);
1789 	cachep->ctor = ctor;
1790 	cachep->dtor = dtor;
1791 	cachep->name = name;
1792 
1793 	/* Don't let CPUs come and go */
1794 	lock_cpu_hotplug();
1795 
1796 	if (g_cpucache_up == FULL) {
1797 		enable_cpucache(cachep);
1798 	} else {
1799 		if (g_cpucache_up == NONE) {
1800 			/* Note: the first kmem_cache_create must create
1801 			 * the cache that's used by kmalloc(24), otherwise
1802 			 * the creation of further caches will BUG().
1803 			 */
1804 			cachep->array[smp_processor_id()] =
1805 			    &initarray_generic.cache;
1806 
1807 			/* If the cache that's used by
1808 			 * kmalloc(sizeof(kmem_list3)) is the first cache,
1809 			 * then we need to set up all its list3s, otherwise
1810 			 * the creation of further caches will BUG().
1811 			 */
1812 			set_up_list3s(cachep, SIZE_AC);
1813 			if (INDEX_AC == INDEX_L3)
1814 				g_cpucache_up = PARTIAL_L3;
1815 			else
1816 				g_cpucache_up = PARTIAL_AC;
1817 		} else {
1818 			cachep->array[smp_processor_id()] =
1819 			    kmalloc(sizeof(struct arraycache_init), GFP_KERNEL);
1820 
1821 			if (g_cpucache_up == PARTIAL_AC) {
1822 				set_up_list3s(cachep, SIZE_L3);
1823 				g_cpucache_up = PARTIAL_L3;
1824 			} else {
1825 				int node;
1826 				for_each_online_node(node) {
1827 
1828 					cachep->nodelists[node] =
1829 					    kmalloc_node(sizeof(struct kmem_list3),
1830 							 GFP_KERNEL, node);
1831 					BUG_ON(!cachep->nodelists[node]);
1832 					kmem_list3_init(cachep->nodelists[node]);
1835 				}
1836 			}
1837 		}
1838 		cachep->nodelists[numa_node_id()]->next_reap =
1839 		    jiffies + REAPTIMEOUT_LIST3 +
1840 		    ((unsigned long)cachep) % REAPTIMEOUT_LIST3;
1841 
1842 		BUG_ON(!ac_data(cachep));
1843 		ac_data(cachep)->avail = 0;
1844 		ac_data(cachep)->limit = BOOT_CPUCACHE_ENTRIES;
1845 		ac_data(cachep)->batchcount = 1;
1846 		ac_data(cachep)->touched = 0;
1847 		cachep->batchcount = 1;
1848 		cachep->limit = BOOT_CPUCACHE_ENTRIES;
1849 	}
1850 
1851 	/* cache setup completed, link it into the list */
1852 	list_add(&cachep->next, &cache_chain);
1853 	unlock_cpu_hotplug();
1854       oops:
1855 	if (!cachep && (flags & SLAB_PANIC))
1856 		panic("kmem_cache_create(): failed to create slab `%s'\n",
1857 		      name);
1858 	up(&cache_chain_sem);
1859 	return cachep;
1860 }
1861 EXPORT_SYMBOL(kmem_cache_create);
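/*
 * Usage sketch (not part of this file): a typical client creates its cache
 * once, e.g. at module init, and allocates from it later.  The struct name
 * "foo" and the cache name string below are hypothetical.
 *
 *	static kmem_cache_t *foo_cachep;
 *
 *	foo_cachep = kmem_cache_create("foo_cache", sizeof(struct foo), 0,
 *				       SLAB_HWCACHE_ALIGN, NULL, NULL);
 *	if (!foo_cachep)
 *		return -ENOMEM;
 */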
1862 
1863 #if DEBUG
1864 static void check_irq_off(void)
1865 {
1866 	BUG_ON(!irqs_disabled());
1867 }
1868 
1869 static void check_irq_on(void)
1870 {
1871 	BUG_ON(irqs_disabled());
1872 }
1873 
1874 static void check_spinlock_acquired(kmem_cache_t *cachep)
1875 {
1876 #ifdef CONFIG_SMP
1877 	check_irq_off();
1878 	assert_spin_locked(&cachep->nodelists[numa_node_id()]->list_lock);
1879 #endif
1880 }
1881 
1882 static inline void check_spinlock_acquired_node(kmem_cache_t *cachep, int node)
1883 {
1884 #ifdef CONFIG_SMP
1885 	check_irq_off();
1886 	assert_spin_locked(&cachep->nodelists[node]->list_lock);
1887 #endif
1888 }
1889 
1890 #else
1891 #define check_irq_off()	do { } while(0)
1892 #define check_irq_on()	do { } while(0)
1893 #define check_spinlock_acquired(x) do { } while(0)
1894 #define check_spinlock_acquired_node(x, y) do { } while(0)
1895 #endif
1896 
1897 /*
1898  * Waits for all CPUs to execute func().
1899  */
1900 static void smp_call_function_all_cpus(void (*func)(void *arg), void *arg)
1901 {
1902 	check_irq_on();
1903 	preempt_disable();
1904 
1905 	local_irq_disable();
1906 	func(arg);
1907 	local_irq_enable();
1908 
1909 	if (smp_call_function(func, arg, 1, 1))
1910 		BUG();
1911 
1912 	preempt_enable();
1913 }
1914 
1915 static void drain_array_locked(kmem_cache_t *cachep, struct array_cache *ac,
1916 				int force, int node);
1917 
1918 static void do_drain(void *arg)
1919 {
1920 	kmem_cache_t *cachep = (kmem_cache_t *) arg;
1921 	struct array_cache *ac;
1922 	int node = numa_node_id();
1923 
1924 	check_irq_off();
1925 	ac = ac_data(cachep);
1926 	spin_lock(&cachep->nodelists[node]->list_lock);
1927 	free_block(cachep, ac->entry, ac->avail, node);
1928 	spin_unlock(&cachep->nodelists[node]->list_lock);
1929 	ac->avail = 0;
1930 }
1931 
1932 static void drain_cpu_caches(kmem_cache_t *cachep)
1933 {
1934 	struct kmem_list3 *l3;
1935 	int node;
1936 
1937 	smp_call_function_all_cpus(do_drain, cachep);
1938 	check_irq_on();
1939 	spin_lock_irq(&cachep->spinlock);
1940 	for_each_online_node(node) {
1941 		l3 = cachep->nodelists[node];
1942 		if (l3) {
1943 			spin_lock(&l3->list_lock);
1944 			drain_array_locked(cachep, l3->shared, 1, node);
1945 			spin_unlock(&l3->list_lock);
1946 			if (l3->alien)
1947 				drain_alien_cache(cachep, l3);
1948 		}
1949 	}
1950 	spin_unlock_irq(&cachep->spinlock);
1951 }
1952 
1953 static int __node_shrink(kmem_cache_t *cachep, int node)
1954 {
1955 	struct slab *slabp;
1956 	struct kmem_list3 *l3 = cachep->nodelists[node];
1957 	int ret;
1958 
1959 	for (;;) {
1960 		struct list_head *p;
1961 
1962 		p = l3->slabs_free.prev;
1963 		if (p == &l3->slabs_free)
1964 			break;
1965 
1966 		slabp = list_entry(l3->slabs_free.prev, struct slab, list);
1967 #if DEBUG
1968 		if (slabp->inuse)
1969 			BUG();
1970 #endif
1971 		list_del(&slabp->list);
1972 
1973 		l3->free_objects -= cachep->num;
1974 		spin_unlock_irq(&l3->list_lock);
1975 		slab_destroy(cachep, slabp);
1976 		spin_lock_irq(&l3->list_lock);
1977 	}
1978 	ret = !list_empty(&l3->slabs_full) || !list_empty(&l3->slabs_partial);
1979 	return ret;
1980 }
1981 
1982 static int __cache_shrink(kmem_cache_t *cachep)
1983 {
1984 	int ret = 0, i = 0;
1985 	struct kmem_list3 *l3;
1986 
1987 	drain_cpu_caches(cachep);
1988 
1989 	check_irq_on();
1990 	for_each_online_node(i) {
1991 		l3 = cachep->nodelists[i];
1992 		if (l3) {
1993 			spin_lock_irq(&l3->list_lock);
1994 			ret += __node_shrink(cachep, i);
1995 			spin_unlock_irq(&l3->list_lock);
1996 		}
1997 	}
1998 	return (ret ? 1 : 0);
1999 }
2000 
2001 /**
2002  * kmem_cache_shrink - Shrink a cache.
2003  * @cachep: The cache to shrink.
2004  *
2005  * Releases as many slabs as possible for a cache.
2006  * To help debugging, a zero exit status indicates all slabs were released.
2007  */
2008 int kmem_cache_shrink(kmem_cache_t *cachep)
2009 {
2010 	if (!cachep || in_interrupt())
2011 		BUG();
2012 
2013 	return __cache_shrink(cachep);
2014 }
2015 EXPORT_SYMBOL(kmem_cache_shrink);
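/*
 * Illustrative call (the cache pointer is hypothetical): a client that has
 * just freed many objects can ask for the empty slabs to be given back.
 *
 *	if (kmem_cache_shrink(foo_cachep) != 0)
 *		printk(KERN_DEBUG "foo_cache: some slabs are still in use\n");
 */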
2016 
2017 /**
2018  * kmem_cache_destroy - delete a cache
2019  * @cachep: the cache to destroy
2020  *
2021  * Remove a kmem_cache_t object from the slab cache.
2022  * Returns 0 on success.
2023  *
2024  * It is expected this function will be called by a module when it is
2025  * unloaded.  This will remove the cache completely, and avoid a duplicate
2026  * cache being allocated each time a module is loaded and unloaded, if the
2027  * module doesn't have persistent in-kernel storage across loads and unloads.
2028  *
2029  * The cache must be empty before calling this function.
2030  *
2031  * The caller must guarantee that no one will allocate memory from the cache
2032  * during the kmem_cache_destroy().
2033  */
2034 int kmem_cache_destroy(kmem_cache_t *cachep)
2035 {
2036 	int i;
2037 	struct kmem_list3 *l3;
2038 
2039 	if (!cachep || in_interrupt())
2040 		BUG();
2041 
2042 	/* Don't let CPUs come and go */
2043 	lock_cpu_hotplug();
2044 
2045 	/* Find the cache in the chain of caches. */
2046 	down(&cache_chain_sem);
2047 	/*
2048 	 * the chain is never empty, cache_cache is never destroyed
2049 	 */
2050 	list_del(&cachep->next);
2051 	up(&cache_chain_sem);
2052 
2053 	if (__cache_shrink(cachep)) {
2054 		slab_error(cachep, "Can't free all objects");
2055 		down(&cache_chain_sem);
2056 		list_add(&cachep->next, &cache_chain);
2057 		up(&cache_chain_sem);
2058 		unlock_cpu_hotplug();
2059 		return 1;
2060 	}
2061 
2062 	if (unlikely(cachep->flags & SLAB_DESTROY_BY_RCU))
2063 		synchronize_rcu();
2064 
2065 	for_each_online_cpu(i)
2066 		kfree(cachep->array[i]);
2067 
2068 	/* NUMA: free the list3 structures */
2069 	for_each_online_node(i) {
2070 		if ((l3 = cachep->nodelists[i])) {
2071 			kfree(l3->shared);
2072 			free_alien_cache(l3->alien);
2073 			kfree(l3);
2074 		}
2075 	}
2076 	kmem_cache_free(&cache_cache, cachep);
2077 
2078 	unlock_cpu_hotplug();
2079 
2080 	return 0;
2081 }
2082 EXPORT_SYMBOL(kmem_cache_destroy);
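/*
 * Usage sketch (hypothetical module): the cache created at init time is
 * destroyed on unload, after every object has been freed back to it.
 *
 *	static void __exit foo_exit(void)
 *	{
 *		if (kmem_cache_destroy(foo_cachep))
 *			printk(KERN_ERR "foo_cache: objects still in use\n");
 *	}
 */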
2083 
2084 /* Get the memory for a slab management obj. */
2085 static struct slab *alloc_slabmgmt(kmem_cache_t *cachep, void *objp,
2086 				   int colour_off, gfp_t local_flags)
2087 {
2088 	struct slab *slabp;
2089 
2090 	if (OFF_SLAB(cachep)) {
2091 		/* Slab management obj is off-slab. */
2092 		slabp = kmem_cache_alloc(cachep->slabp_cache, local_flags);
2093 		if (!slabp)
2094 			return NULL;
2095 	} else {
2096 		slabp = objp + colour_off;
2097 		colour_off += cachep->slab_size;
2098 	}
2099 	slabp->inuse = 0;
2100 	slabp->colouroff = colour_off;
2101 	slabp->s_mem = objp + colour_off;
2102 
2103 	return slabp;
2104 }
2105 
2106 static inline kmem_bufctl_t *slab_bufctl(struct slab *slabp)
2107 {
2108 	return (kmem_bufctl_t *) (slabp + 1);
2109 }
2110 
2111 static void cache_init_objs(kmem_cache_t *cachep,
2112 			    struct slab *slabp, unsigned long ctor_flags)
2113 {
2114 	int i;
2115 
2116 	for (i = 0; i < cachep->num; i++) {
2117 		void *objp = slabp->s_mem + cachep->objsize * i;
2118 #if DEBUG
2119 		/* need to poison the objs? */
2120 		if (cachep->flags & SLAB_POISON)
2121 			poison_obj(cachep, objp, POISON_FREE);
2122 		if (cachep->flags & SLAB_STORE_USER)
2123 			*dbg_userword(cachep, objp) = NULL;
2124 
2125 		if (cachep->flags & SLAB_RED_ZONE) {
2126 			*dbg_redzone1(cachep, objp) = RED_INACTIVE;
2127 			*dbg_redzone2(cachep, objp) = RED_INACTIVE;
2128 		}
2129 		/*
2130 		 * Constructors are not allowed to allocate memory from
2131 		 * the same cache which they are a constructor for.
2132 		 * Otherwise, deadlock. They must also be threaded.
2133 		 */
2134 		if (cachep->ctor && !(cachep->flags & SLAB_POISON))
2135 			cachep->ctor(objp + obj_dbghead(cachep), cachep,
2136 				     ctor_flags);
2137 
2138 		if (cachep->flags & SLAB_RED_ZONE) {
2139 			if (*dbg_redzone2(cachep, objp) != RED_INACTIVE)
2140 				slab_error(cachep, "constructor overwrote the"
2141 					   " end of an object");
2142 			if (*dbg_redzone1(cachep, objp) != RED_INACTIVE)
2143 				slab_error(cachep, "constructor overwrote the"
2144 					   " start of an object");
2145 		}
2146 		if ((cachep->objsize % PAGE_SIZE) == 0 && OFF_SLAB(cachep)
2147 		    && cachep->flags & SLAB_POISON)
2148 			kernel_map_pages(virt_to_page(objp),
2149 					 cachep->objsize / PAGE_SIZE, 0);
2150 #else
2151 		if (cachep->ctor)
2152 			cachep->ctor(objp, cachep, ctor_flags);
2153 #endif
2154 		slab_bufctl(slabp)[i] = i + 1;
2155 	}
2156 	slab_bufctl(slabp)[i - 1] = BUFCTL_END;
2157 	slabp->free = 0;
2158 }
2159 
2160 static void kmem_flagcheck(kmem_cache_t *cachep, gfp_t flags)
2161 {
2162 	if (flags & SLAB_DMA) {
2163 		if (!(cachep->gfpflags & GFP_DMA))
2164 			BUG();
2165 	} else {
2166 		if (cachep->gfpflags & GFP_DMA)
2167 			BUG();
2168 	}
2169 }
2170 
2171 static void set_slab_attr(kmem_cache_t *cachep, struct slab *slabp, void *objp)
2172 {
2173 	int i;
2174 	struct page *page;
2175 
2176 	/* Nasty!!!!!! I hope this is OK. */
2177 	i = 1 << cachep->gfporder;
2178 	page = virt_to_page(objp);
2179 	do {
2180 		page_set_cache(page, cachep);
2181 		page_set_slab(page, slabp);
2182 		page++;
2183 	} while (--i);
2184 }
2185 
2186 /*
2187  * Grow (by 1) the number of slabs within a cache.  This is called by
2188  * kmem_cache_alloc() when there are no free objs left in a cache.
2189  */
2190 static int cache_grow(kmem_cache_t *cachep, gfp_t flags, int nodeid)
2191 {
2192 	struct slab *slabp;
2193 	void *objp;
2194 	size_t offset;
2195 	gfp_t local_flags;
2196 	unsigned long ctor_flags;
2197 	struct kmem_list3 *l3;
2198 
2199 	/* Be lazy and only check for valid flags here,
2200 	 * keeping it out of the critical path in kmem_cache_alloc().
2201 	 */
2202 	if (flags & ~(SLAB_DMA | SLAB_LEVEL_MASK | SLAB_NO_GROW))
2203 		BUG();
2204 	if (flags & SLAB_NO_GROW)
2205 		return 0;
2206 
2207 	ctor_flags = SLAB_CTOR_CONSTRUCTOR;
2208 	local_flags = (flags & SLAB_LEVEL_MASK);
2209 	if (!(local_flags & __GFP_WAIT))
2210 		/*
2211 		 * Not allowed to sleep.  Need to tell a constructor about
2212 		 * this - it might need to know...
2213 		 */
2214 		ctor_flags |= SLAB_CTOR_ATOMIC;
2215 
2216 	/* About to mess with non-constant members - lock. */
2217 	check_irq_off();
2218 	spin_lock(&cachep->spinlock);
2219 
2220 	/* Get the colour for the slab, and calculate the next value. */
2221 	offset = cachep->colour_next;
2222 	cachep->colour_next++;
2223 	if (cachep->colour_next >= cachep->colour)
2224 		cachep->colour_next = 0;
2225 	offset *= cachep->colour_off;
2226 
2227 	spin_unlock(&cachep->spinlock);
2228 
2229 	check_irq_off();
2230 	if (local_flags & __GFP_WAIT)
2231 		local_irq_enable();
2232 
2233 	/*
2234 	 * The test for missing atomic flag is performed here, rather than
2235 	 * the more obvious place, simply to reduce the critical path length
2236 	 * in kmem_cache_alloc(). If a caller is seriously mis-behaving they
2237 	 * will eventually be caught here (where it matters).
2238 	 */
2239 	kmem_flagcheck(cachep, flags);
2240 
2241 	/* Get mem for the objs.
2242 	 * Attempt to allocate a physical page from 'nodeid'.
2243 	 */
2244 	if (!(objp = kmem_getpages(cachep, flags, nodeid)))
2245 		goto failed;
2246 
2247 	/* Get slab management. */
2248 	if (!(slabp = alloc_slabmgmt(cachep, objp, offset, local_flags)))
2249 		goto opps1;
2250 
2251 	slabp->nodeid = nodeid;
2252 	set_slab_attr(cachep, slabp, objp);
2253 
2254 	cache_init_objs(cachep, slabp, ctor_flags);
2255 
2256 	if (local_flags & __GFP_WAIT)
2257 		local_irq_disable();
2258 	check_irq_off();
2259 	l3 = cachep->nodelists[nodeid];
2260 	spin_lock(&l3->list_lock);
2261 
2262 	/* Make slab active. */
2263 	list_add_tail(&slabp->list, &(l3->slabs_free));
2264 	STATS_INC_GROWN(cachep);
2265 	l3->free_objects += cachep->num;
2266 	spin_unlock(&l3->list_lock);
2267 	return 1;
2268       opps1:
2269 	kmem_freepages(cachep, objp);
2270       failed:
2271 	if (local_flags & __GFP_WAIT)
2272 		local_irq_disable();
2273 	return 0;
2274 }
2275 
2276 #if DEBUG
2277 
2278 /*
2279  * Perform extra freeing checks:
2280  * - detect bad pointers.
2281  * - POISON/RED_ZONE checking
2282  * - destructor calls, for caches with POISON+dtor
2283  */
2284 static void kfree_debugcheck(const void *objp)
2285 {
2286 	struct page *page;
2287 
2288 	if (!virt_addr_valid(objp)) {
2289 		printk(KERN_ERR "kfree_debugcheck: out of range ptr %lxh.\n",
2290 		       (unsigned long)objp);
2291 		BUG();
2292 	}
2293 	page = virt_to_page(objp);
2294 	if (!PageSlab(page)) {
2295 		printk(KERN_ERR "kfree_debugcheck: bad ptr %lxh.\n",
2296 		       (unsigned long)objp);
2297 		BUG();
2298 	}
2299 }
2300 
2301 static void *cache_free_debugcheck(kmem_cache_t *cachep, void *objp,
2302 				   void *caller)
2303 {
2304 	struct page *page;
2305 	unsigned int objnr;
2306 	struct slab *slabp;
2307 
2308 	objp -= obj_dbghead(cachep);
2309 	kfree_debugcheck(objp);
2310 	page = virt_to_page(objp);
2311 
2312 	if (page_get_cache(page) != cachep) {
2313 		printk(KERN_ERR
2314 		       "mismatch in kmem_cache_free: expected cache %p, got %p\n",
2315 		       page_get_cache(page), cachep);
2316 		printk(KERN_ERR "%p is %s.\n", cachep, cachep->name);
2317 		printk(KERN_ERR "%p is %s.\n", page_get_cache(page),
2318 		       page_get_cache(page)->name);
2319 		WARN_ON(1);
2320 	}
2321 	slabp = page_get_slab(page);
2322 
2323 	if (cachep->flags & SLAB_RED_ZONE) {
2324 		if (*dbg_redzone1(cachep, objp) != RED_ACTIVE
2325 		    || *dbg_redzone2(cachep, objp) != RED_ACTIVE) {
2326 			slab_error(cachep,
2327 				   "double free, or memory outside"
2328 				   " object was overwritten");
2329 			printk(KERN_ERR
2330 			       "%p: redzone 1: 0x%lx, redzone 2: 0x%lx.\n",
2331 			       objp, *dbg_redzone1(cachep, objp),
2332 			       *dbg_redzone2(cachep, objp));
2333 		}
2334 		*dbg_redzone1(cachep, objp) = RED_INACTIVE;
2335 		*dbg_redzone2(cachep, objp) = RED_INACTIVE;
2336 	}
2337 	if (cachep->flags & SLAB_STORE_USER)
2338 		*dbg_userword(cachep, objp) = caller;
2339 
2340 	objnr = (objp - slabp->s_mem) / cachep->objsize;
2341 
2342 	BUG_ON(objnr >= cachep->num);
2343 	BUG_ON(objp != slabp->s_mem + objnr * cachep->objsize);
2344 
2345 	if (cachep->flags & SLAB_DEBUG_INITIAL) {
2346 		/* Need to call the slab's constructor so the
2347 		 * caller can perform a verify of its state (debugging).
2348 		 * Called without the cache-lock held.
2349 		 */
2350 		cachep->ctor(objp + obj_dbghead(cachep),
2351 			     cachep, SLAB_CTOR_CONSTRUCTOR | SLAB_CTOR_VERIFY);
2352 	}
2353 	if (cachep->flags & SLAB_POISON && cachep->dtor) {
2354 		/* we want to cache poison the object,
2355 		 * call the destruction callback
2356 		 */
2357 		cachep->dtor(objp + obj_dbghead(cachep), cachep, 0);
2358 	}
2359 	if (cachep->flags & SLAB_POISON) {
2360 #ifdef CONFIG_DEBUG_PAGEALLOC
2361 		if ((cachep->objsize % PAGE_SIZE) == 0 && OFF_SLAB(cachep)) {
2362 			store_stackinfo(cachep, objp, (unsigned long)caller);
2363 			kernel_map_pages(virt_to_page(objp),
2364 					 cachep->objsize / PAGE_SIZE, 0);
2365 		} else {
2366 			poison_obj(cachep, objp, POISON_FREE);
2367 		}
2368 #else
2369 		poison_obj(cachep, objp, POISON_FREE);
2370 #endif
2371 	}
2372 	return objp;
2373 }
2374 
2375 static void check_slabp(kmem_cache_t *cachep, struct slab *slabp)
2376 {
2377 	kmem_bufctl_t i;
2378 	int entries = 0;
2379 
2380 	/* Check slab's freelist to see if this obj is there. */
2381 	for (i = slabp->free; i != BUFCTL_END; i = slab_bufctl(slabp)[i]) {
2382 		entries++;
2383 		if (entries > cachep->num || i >= cachep->num)
2384 			goto bad;
2385 	}
2386 	if (entries != cachep->num - slabp->inuse) {
2387 	      bad:
2388 		printk(KERN_ERR
2389 		       "slab: Internal list corruption detected in cache '%s'(%d), slabp %p(%d). Hexdump:\n",
2390 		       cachep->name, cachep->num, slabp, slabp->inuse);
2391 		for (i = 0;
2392 		     i < sizeof(slabp) + cachep->num * sizeof(kmem_bufctl_t);
2393 		     i++) {
2394 			if ((i % 16) == 0)
2395 				printk("\n%03x:", i);
2396 			printk(" %02x", ((unsigned char *)slabp)[i]);
2397 		}
2398 		printk("\n");
2399 		BUG();
2400 	}
2401 }
2402 #else
2403 #define kfree_debugcheck(x) do { } while(0)
2404 #define cache_free_debugcheck(x,objp,z) (objp)
2405 #define check_slabp(x,y) do { } while(0)
2406 #endif
2407 
2408 static void *cache_alloc_refill(kmem_cache_t *cachep, gfp_t flags)
2409 {
2410 	int batchcount;
2411 	struct kmem_list3 *l3;
2412 	struct array_cache *ac;
2413 
2414 	check_irq_off();
2415 	ac = ac_data(cachep);
2416       retry:
2417 	batchcount = ac->batchcount;
2418 	if (!ac->touched && batchcount > BATCHREFILL_LIMIT) {
2419 		/* if there was little recent activity on this
2420 		 * cache, then perform only a partial refill.
2421 		 * Otherwise we could generate refill bouncing.
2422 		 */
2423 		batchcount = BATCHREFILL_LIMIT;
2424 	}
2425 	l3 = cachep->nodelists[numa_node_id()];
2426 
2427 	BUG_ON(ac->avail > 0 || !l3);
2428 	spin_lock(&l3->list_lock);
2429 
2430 	if (l3->shared) {
2431 		struct array_cache *shared_array = l3->shared;
2432 		if (shared_array->avail) {
2433 			if (batchcount > shared_array->avail)
2434 				batchcount = shared_array->avail;
2435 			shared_array->avail -= batchcount;
2436 			ac->avail = batchcount;
2437 			memcpy(ac->entry,
2438 			       &(shared_array->entry[shared_array->avail]),
2439 			       sizeof(void *) * batchcount);
2440 			shared_array->touched = 1;
2441 			goto alloc_done;
2442 		}
2443 	}
2444 	while (batchcount > 0) {
2445 		struct list_head *entry;
2446 		struct slab *slabp;
2447 		/* Get the slab the allocation is to come from. */
2448 		entry = l3->slabs_partial.next;
2449 		if (entry == &l3->slabs_partial) {
2450 			l3->free_touched = 1;
2451 			entry = l3->slabs_free.next;
2452 			if (entry == &l3->slabs_free)
2453 				goto must_grow;
2454 		}
2455 
2456 		slabp = list_entry(entry, struct slab, list);
2457 		check_slabp(cachep, slabp);
2458 		check_spinlock_acquired(cachep);
2459 		while (slabp->inuse < cachep->num && batchcount--) {
2460 			kmem_bufctl_t next;
2461 			STATS_INC_ALLOCED(cachep);
2462 			STATS_INC_ACTIVE(cachep);
2463 			STATS_SET_HIGH(cachep);
2464 
2465 			/* get obj pointer */
2466 			ac->entry[ac->avail++] = slabp->s_mem +
2467 			    slabp->free * cachep->objsize;
2468 
2469 			slabp->inuse++;
2470 			next = slab_bufctl(slabp)[slabp->free];
2471 #if DEBUG
2472 			slab_bufctl(slabp)[slabp->free] = BUFCTL_FREE;
2473 			WARN_ON(numa_node_id() != slabp->nodeid);
2474 #endif
2475 			slabp->free = next;
2476 		}
2477 		check_slabp(cachep, slabp);
2478 
2479 		/* move slabp to correct slabp list: */
2480 		list_del(&slabp->list);
2481 		if (slabp->free == BUFCTL_END)
2482 			list_add(&slabp->list, &l3->slabs_full);
2483 		else
2484 			list_add(&slabp->list, &l3->slabs_partial);
2485 	}
2486 
2487       must_grow:
2488 	l3->free_objects -= ac->avail;
2489       alloc_done:
2490 	spin_unlock(&l3->list_lock);
2491 
2492 	if (unlikely(!ac->avail)) {
2493 		int x;
2494 		x = cache_grow(cachep, flags, numa_node_id());
2495 
2496 		/* cache_grow can reenable interrupts, then ac could change. */
2497 		ac = ac_data(cachep);
2498 		if (!x && ac->avail == 0)	/* no objects in sight? abort */
2499 			return NULL;
2500 
2501 		if (!ac->avail)	/* objects refilled by interrupt? */
2502 			goto retry;
2503 	}
2504 	ac->touched = 1;
2505 	return ac->entry[--ac->avail];
2506 }
2507 
2508 static inline void
2509 cache_alloc_debugcheck_before(kmem_cache_t *cachep, gfp_t flags)
2510 {
2511 	might_sleep_if(flags & __GFP_WAIT);
2512 #if DEBUG
2513 	kmem_flagcheck(cachep, flags);
2514 #endif
2515 }
2516 
2517 #if DEBUG
2518 static void *cache_alloc_debugcheck_after(kmem_cache_t *cachep, gfp_t flags,
2519 					void *objp, void *caller)
2520 {
2521 	if (!objp)
2522 		return objp;
2523 	if (cachep->flags & SLAB_POISON) {
2524 #ifdef CONFIG_DEBUG_PAGEALLOC
2525 		if ((cachep->objsize % PAGE_SIZE) == 0 && OFF_SLAB(cachep))
2526 			kernel_map_pages(virt_to_page(objp),
2527 					 cachep->objsize / PAGE_SIZE, 1);
2528 		else
2529 			check_poison_obj(cachep, objp);
2530 #else
2531 		check_poison_obj(cachep, objp);
2532 #endif
2533 		poison_obj(cachep, objp, POISON_INUSE);
2534 	}
2535 	if (cachep->flags & SLAB_STORE_USER)
2536 		*dbg_userword(cachep, objp) = caller;
2537 
2538 	if (cachep->flags & SLAB_RED_ZONE) {
2539 		if (*dbg_redzone1(cachep, objp) != RED_INACTIVE
2540 		    || *dbg_redzone2(cachep, objp) != RED_INACTIVE) {
2541 			slab_error(cachep,
2542 				   "double free, or memory outside"
2543 				   " object was overwritten");
2544 			printk(KERN_ERR
2545 			       "%p: redzone 1: 0x%lx, redzone 2: 0x%lx.\n",
2546 			       objp, *dbg_redzone1(cachep, objp),
2547 			       *dbg_redzone2(cachep, objp));
2548 		}
2549 		*dbg_redzone1(cachep, objp) = RED_ACTIVE;
2550 		*dbg_redzone2(cachep, objp) = RED_ACTIVE;
2551 	}
2552 	objp += obj_dbghead(cachep);
2553 	if (cachep->ctor && cachep->flags & SLAB_POISON) {
2554 		unsigned long ctor_flags = SLAB_CTOR_CONSTRUCTOR;
2555 
2556 		if (!(flags & __GFP_WAIT))
2557 			ctor_flags |= SLAB_CTOR_ATOMIC;
2558 
2559 		cachep->ctor(objp, cachep, ctor_flags);
2560 	}
2561 	return objp;
2562 }
2563 #else
2564 #define cache_alloc_debugcheck_after(a,b,objp,d) (objp)
2565 #endif
2566 
2567 static inline void *____cache_alloc(kmem_cache_t *cachep, gfp_t flags)
2568 {
2569 	void *objp;
2570 	struct array_cache *ac;
2571 
2572 	check_irq_off();
2573 	ac = ac_data(cachep);
2574 	if (likely(ac->avail)) {
2575 		STATS_INC_ALLOCHIT(cachep);
2576 		ac->touched = 1;
2577 		objp = ac->entry[--ac->avail];
2578 	} else {
2579 		STATS_INC_ALLOCMISS(cachep);
2580 		objp = cache_alloc_refill(cachep, flags);
2581 	}
2582 	return objp;
2583 }
2584 
2585 static inline void *__cache_alloc(kmem_cache_t *cachep, gfp_t flags)
2586 {
2587 	unsigned long save_flags;
2588 	void *objp;
2589 
2590 	cache_alloc_debugcheck_before(cachep, flags);
2591 
2592 	local_irq_save(save_flags);
2593 	objp = ____cache_alloc(cachep, flags);
2594 	local_irq_restore(save_flags);
2595 	objp = cache_alloc_debugcheck_after(cachep, flags, objp,
2596 					    __builtin_return_address(0));
2597 	prefetchw(objp);
2598 	return objp;
2599 }
2600 
2601 #ifdef CONFIG_NUMA
2602 /*
2603  * An interface to enable slab creation on nodeid
2604  */
2605 static void *__cache_alloc_node(kmem_cache_t *cachep, gfp_t flags, int nodeid)
2606 {
2607 	struct list_head *entry;
2608 	struct slab *slabp;
2609 	struct kmem_list3 *l3;
2610 	void *obj;
2611 	kmem_bufctl_t next;
2612 	int x;
2613 
2614 	l3 = cachep->nodelists[nodeid];
2615 	BUG_ON(!l3);
2616 
2617       retry:
2618 	spin_lock(&l3->list_lock);
2619 	entry = l3->slabs_partial.next;
2620 	if (entry == &l3->slabs_partial) {
2621 		l3->free_touched = 1;
2622 		entry = l3->slabs_free.next;
2623 		if (entry == &l3->slabs_free)
2624 			goto must_grow;
2625 	}
2626 
2627 	slabp = list_entry(entry, struct slab, list);
2628 	check_spinlock_acquired_node(cachep, nodeid);
2629 	check_slabp(cachep, slabp);
2630 
2631 	STATS_INC_NODEALLOCS(cachep);
2632 	STATS_INC_ACTIVE(cachep);
2633 	STATS_SET_HIGH(cachep);
2634 
2635 	BUG_ON(slabp->inuse == cachep->num);
2636 
2637 	/* get obj pointer */
2638 	obj = slabp->s_mem + slabp->free * cachep->objsize;
2639 	slabp->inuse++;
2640 	next = slab_bufctl(slabp)[slabp->free];
2641 #if DEBUG
2642 	slab_bufctl(slabp)[slabp->free] = BUFCTL_FREE;
2643 #endif
2644 	slabp->free = next;
2645 	check_slabp(cachep, slabp);
2646 	l3->free_objects--;
2647 	/* move slabp to correct slabp list: */
2648 	list_del(&slabp->list);
2649 
2650 	if (slabp->free == BUFCTL_END) {
2651 		list_add(&slabp->list, &l3->slabs_full);
2652 	} else {
2653 		list_add(&slabp->list, &l3->slabs_partial);
2654 	}
2655 
2656 	spin_unlock(&l3->list_lock);
2657 	goto done;
2658 
2659       must_grow:
2660 	spin_unlock(&l3->list_lock);
2661 	x = cache_grow(cachep, flags, nodeid);
2662 
2663 	if (!x)
2664 		return NULL;
2665 
2666 	goto retry;
2667       done:
2668 	return obj;
2669 }
2670 #endif
2671 
2672 /*
2673  * Caller needs to acquire correct kmem_list's list_lock
2674  */
2675 static void free_block(kmem_cache_t *cachep, void **objpp, int nr_objects,
2676 		       int node)
2677 {
2678 	int i;
2679 	struct kmem_list3 *l3;
2680 
2681 	for (i = 0; i < nr_objects; i++) {
2682 		void *objp = objpp[i];
2683 		struct slab *slabp;
2684 		unsigned int objnr;
2685 
2686 		slabp = page_get_slab(virt_to_page(objp));
2687 		l3 = cachep->nodelists[node];
2688 		list_del(&slabp->list);
2689 		objnr = (objp - slabp->s_mem) / cachep->objsize;
2690 		check_spinlock_acquired_node(cachep, node);
2691 		check_slabp(cachep, slabp);
2692 
2693 #if DEBUG
2694 		/* Verify that the slab belongs to the intended node */
2695 		WARN_ON(slabp->nodeid != node);
2696 
2697 		if (slab_bufctl(slabp)[objnr] != BUFCTL_FREE) {
2698 			printk(KERN_ERR "slab: double free detected in cache "
2699 			       "'%s', objp %p\n", cachep->name, objp);
2700 			BUG();
2701 		}
2702 #endif
2703 		slab_bufctl(slabp)[objnr] = slabp->free;
2704 		slabp->free = objnr;
2705 		STATS_DEC_ACTIVE(cachep);
2706 		slabp->inuse--;
2707 		l3->free_objects++;
2708 		check_slabp(cachep, slabp);
2709 
2710 		/* fixup slab chains */
2711 		if (slabp->inuse == 0) {
2712 			if (l3->free_objects > l3->free_limit) {
2713 				l3->free_objects -= cachep->num;
2714 				slab_destroy(cachep, slabp);
2715 			} else {
2716 				list_add(&slabp->list, &l3->slabs_free);
2717 			}
2718 		} else {
2719 			/* Unconditionally move a slab to the end of the
2720 			 * partial list on free - maximum time for the
2721 			 * other objects to be freed, too.
2722 			 */
2723 			list_add_tail(&slabp->list, &l3->slabs_partial);
2724 		}
2725 	}
2726 }
2727 
2728 static void cache_flusharray(kmem_cache_t *cachep, struct array_cache *ac)
2729 {
2730 	int batchcount;
2731 	struct kmem_list3 *l3;
2732 	int node = numa_node_id();
2733 
2734 	batchcount = ac->batchcount;
2735 #if DEBUG
2736 	BUG_ON(!batchcount || batchcount > ac->avail);
2737 #endif
2738 	check_irq_off();
2739 	l3 = cachep->nodelists[node];
2740 	spin_lock(&l3->list_lock);
2741 	if (l3->shared) {
2742 		struct array_cache *shared_array = l3->shared;
2743 		int max = shared_array->limit - shared_array->avail;
2744 		if (max) {
2745 			if (batchcount > max)
2746 				batchcount = max;
2747 			memcpy(&(shared_array->entry[shared_array->avail]),
2748 			       ac->entry, sizeof(void *) * batchcount);
2749 			shared_array->avail += batchcount;
2750 			goto free_done;
2751 		}
2752 	}
2753 
2754 	free_block(cachep, ac->entry, batchcount, node);
2755       free_done:
2756 #if STATS
2757 	{
2758 		int i = 0;
2759 		struct list_head *p;
2760 
2761 		p = l3->slabs_free.next;
2762 		while (p != &(l3->slabs_free)) {
2763 			struct slab *slabp;
2764 
2765 			slabp = list_entry(p, struct slab, list);
2766 			BUG_ON(slabp->inuse);
2767 
2768 			i++;
2769 			p = p->next;
2770 		}
2771 		STATS_SET_FREEABLE(cachep, i);
2772 	}
2773 #endif
2774 	spin_unlock(&l3->list_lock);
2775 	ac->avail -= batchcount;
2776 	memmove(ac->entry, &(ac->entry[batchcount]),
2777 		sizeof(void *) * ac->avail);
2778 }
2779 
2780 /*
2781  * __cache_free
2782  * Release an obj back to its cache. If the obj has a constructed
2783  * state, it must be in this state _before_ it is released.
2784  *
2785  * Called with disabled ints.
2786  */
2787 static inline void __cache_free(kmem_cache_t *cachep, void *objp)
2788 {
2789 	struct array_cache *ac = ac_data(cachep);
2790 
2791 	check_irq_off();
2792 	objp = cache_free_debugcheck(cachep, objp, __builtin_return_address(0));
2793 
2794 	/* Make sure we are not freeing an object from another
2795 	 * node to the array cache on this cpu.
2796 	 */
2797 #ifdef CONFIG_NUMA
2798 	{
2799 		struct slab *slabp;
2800 		slabp = page_get_slab(virt_to_page(objp));
2801 		if (unlikely(slabp->nodeid != numa_node_id())) {
2802 			struct array_cache *alien = NULL;
2803 			int nodeid = slabp->nodeid;
2804 			struct kmem_list3 *l3 =
2805 			    cachep->nodelists[numa_node_id()];
2806 
2807 			STATS_INC_NODEFREES(cachep);
2808 			if (l3->alien && l3->alien[nodeid]) {
2809 				alien = l3->alien[nodeid];
2810 				spin_lock(&alien->lock);
2811 				if (unlikely(alien->avail == alien->limit))
2812 					__drain_alien_cache(cachep,
2813 							    alien, nodeid);
2814 				alien->entry[alien->avail++] = objp;
2815 				spin_unlock(&alien->lock);
2816 			} else {
2817 				spin_lock(&cachep->nodelists[nodeid]->list_lock);
2818 				free_block(cachep, &objp, 1, nodeid);
2819 				spin_unlock(&cachep->nodelists[nodeid]->list_lock);
2822 			}
2823 			return;
2824 		}
2825 	}
2826 #endif
2827 	if (likely(ac->avail < ac->limit)) {
2828 		STATS_INC_FREEHIT(cachep);
2829 		ac->entry[ac->avail++] = objp;
2830 		return;
2831 	} else {
2832 		STATS_INC_FREEMISS(cachep);
2833 		cache_flusharray(cachep, ac);
2834 		ac->entry[ac->avail++] = objp;
2835 	}
2836 }
2837 
2838 /**
2839  * kmem_cache_alloc - Allocate an object
2840  * @cachep: The cache to allocate from.
2841  * @flags: See kmalloc().
2842  *
2843  * Allocate an object from this cache.  The flags are only relevant
2844  * if the cache has no available objects.
2845  */
2846 void *kmem_cache_alloc(kmem_cache_t *cachep, gfp_t flags)
2847 {
2848 	return __cache_alloc(cachep, flags);
2849 }
2850 EXPORT_SYMBOL(kmem_cache_alloc);
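/*
 * Usage sketch (cache and struct are hypothetical): objects come from and
 * return to the same cache; SLAB_KERNEL may sleep, like GFP_KERNEL.
 *
 *	struct foo *f = kmem_cache_alloc(foo_cachep, SLAB_KERNEL);
 *	if (!f)
 *		return -ENOMEM;
 *	...
 *	kmem_cache_free(foo_cachep, f);
 */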
2851 
2852 /**
2853  * kmem_ptr_validate - check if an untrusted pointer might
2854  *	be a slab entry.
2855  * @cachep: the cache we're checking against
2856  * @ptr: pointer to validate
2857  *
2858  * This verifies that the untrusted pointer looks sane:
2859  * it is _not_ a guarantee that the pointer is actually
2860  * part of the slab cache in question, but it at least
2861  * validates that the pointer can be dereferenced and
2862  * looks half-way sane.
2863  *
2864  * Currently only used for dentry validation.
2865  */
2866 int fastcall kmem_ptr_validate(kmem_cache_t *cachep, void *ptr)
2867 {
2868 	unsigned long addr = (unsigned long)ptr;
2869 	unsigned long min_addr = PAGE_OFFSET;
2870 	unsigned long align_mask = BYTES_PER_WORD - 1;
2871 	unsigned long size = cachep->objsize;
2872 	struct page *page;
2873 
2874 	if (unlikely(addr < min_addr))
2875 		goto out;
2876 	if (unlikely(addr > (unsigned long)high_memory - size))
2877 		goto out;
2878 	if (unlikely(addr & align_mask))
2879 		goto out;
2880 	if (unlikely(!kern_addr_valid(addr)))
2881 		goto out;
2882 	if (unlikely(!kern_addr_valid(addr + size - 1)))
2883 		goto out;
2884 	page = virt_to_page(ptr);
2885 	if (unlikely(!PageSlab(page)))
2886 		goto out;
2887 	if (unlikely(page_get_cache(page) != cachep))
2888 		goto out;
2889 	return 1;
2890       out:
2891 	return 0;
2892 }
2893 
2894 #ifdef CONFIG_NUMA
2895 /**
2896  * kmem_cache_alloc_node - Allocate an object on the specified node
2897  * @cachep: The cache to allocate from.
2898  * @flags: See kmalloc().
2899  * @nodeid: node number of the target node.
2900  *
2901  * Identical to kmem_cache_alloc, except that this function is slow
2902  * and can sleep. And it will allocate memory on the given node, which
2903  * can improve the performance for cpu bound structures.
2904  * New and improved: it will now make sure that the object gets
2905  * put on the correct node list so that there is no false sharing.
2906  */
2907 void *kmem_cache_alloc_node(kmem_cache_t *cachep, gfp_t flags, int nodeid)
2908 {
2909 	unsigned long save_flags;
2910 	void *ptr;
2911 
2912 	if (nodeid == -1)
2913 		return __cache_alloc(cachep, flags);
2914 
2915 	if (unlikely(!cachep->nodelists[nodeid])) {
2916 		/* Fall back to __cache_alloc if we run into trouble */
2917 		printk(KERN_WARNING
2918 		       "slab: not allocating in inactive node %d for cache %s\n",
2919 		       nodeid, cachep->name);
2920 		return __cache_alloc(cachep, flags);
2921 	}
2922 
2923 	cache_alloc_debugcheck_before(cachep, flags);
2924 	local_irq_save(save_flags);
2925 	if (nodeid == numa_node_id())
2926 		ptr = ____cache_alloc(cachep, flags);
2927 	else
2928 		ptr = __cache_alloc_node(cachep, flags, nodeid);
2929 	local_irq_restore(save_flags);
2930 	ptr = cache_alloc_debugcheck_after(cachep, flags, ptr,
2931 					   __builtin_return_address(0));
2933 
2934 	return ptr;
2935 }
2936 EXPORT_SYMBOL(kmem_cache_alloc_node);
2937 
2938 void *kmalloc_node(size_t size, gfp_t flags, int node)
2939 {
2940 	kmem_cache_t *cachep;
2941 
2942 	cachep = kmem_find_general_cachep(size, flags);
2943 	if (unlikely(cachep == NULL))
2944 		return NULL;
2945 	return kmem_cache_alloc_node(cachep, flags, node);
2946 }
2947 EXPORT_SYMBOL(kmalloc_node);
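/*
 * Usage sketch (struct and variables are hypothetical): per-node data is
 * best allocated on the node that will touch it most; passing nodeid == -1
 * falls back to an ordinary allocation.
 *
 *	struct foo_node *fn = kmalloc_node(sizeof(*fn), GFP_KERNEL, node);
 *	if (!fn)
 *		fn = kmalloc(sizeof(*fn), GFP_KERNEL);
 */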
2948 #endif
2949 
2950 /**
2951  * kmalloc - allocate memory
2952  * @size: how many bytes of memory are required.
2953  * @flags: the type of memory to allocate.
2954  *
2955  * kmalloc is the normal method of allocating memory
2956  * in the kernel.
2957  *
2958  * The @flags argument may be one of:
2959  *
2960  * %GFP_USER - Allocate memory on behalf of user.  May sleep.
2961  *
2962  * %GFP_KERNEL - Allocate normal kernel ram.  May sleep.
2963  *
2964  * %GFP_ATOMIC - Allocation will not sleep.  Use inside interrupt handlers.
2965  *
2966  * Additionally, the %GFP_DMA flag may be set to indicate the memory
2967  * must be suitable for DMA.  This can mean different things on different
2968  * platforms.  For example, on i386, it means that the memory must come
2969  * from the first 16MB.
2970  */
2971 void *__kmalloc(size_t size, gfp_t flags)
2972 {
2973 	kmem_cache_t *cachep;
2974 
2975 	/* If you want to save a few bytes .text space: replace
2976 	 * __ with kmem_.
2977 	 * Then kmalloc uses the uninlined functions instead of the inline
2978 	 * functions.
2979 	 */
2980 	cachep = __find_general_cachep(size, flags);
2981 	if (unlikely(cachep == NULL))
2982 		return NULL;
2983 	return __cache_alloc(cachep, flags);
2984 }
2985 EXPORT_SYMBOL(__kmalloc);
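/*
 * Usage sketch: callers normally go through the kmalloc() inline wrapper.
 * The buffer length below is illustrative only; GFP_KERNEL may sleep and
 * GFP_ATOMIC must be used from interrupt context.
 *
 *	char *buf = kmalloc(len, GFP_KERNEL);
 *	if (!buf)
 *		return -ENOMEM;
 *	...
 *	kfree(buf);
 */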
2986 
2987 #ifdef CONFIG_SMP
2988 /**
2989  * __alloc_percpu - allocate one copy of the object for every present
2990  * cpu in the system, zeroing them.
2991  * Objects should be dereferenced using the per_cpu_ptr macro only.
2992  *
2993  * @size: how many bytes of memory are required.
2994  */
2995 void *__alloc_percpu(size_t size)
2996 {
2997 	int i;
2998 	struct percpu_data *pdata = kmalloc(sizeof(*pdata), GFP_KERNEL);
2999 
3000 	if (!pdata)
3001 		return NULL;
3002 
3003 	/*
3004 	 * Cannot use for_each_online_cpu since a cpu may come online
3005 	 * later and we would have no way of fixing up the array
3006 	 * that we have already allocated.
3007 	 */
3008 	for_each_cpu(i) {
3009 		int node = cpu_to_node(i);
3010 
3011 		if (node_online(node))
3012 			pdata->ptrs[i] = kmalloc_node(size, GFP_KERNEL, node);
3013 		else
3014 			pdata->ptrs[i] = kmalloc(size, GFP_KERNEL);
3015 
3016 		if (!pdata->ptrs[i])
3017 			goto unwind_oom;
3018 		memset(pdata->ptrs[i], 0, size);
3019 	}
3020 
3021 	/* Catch derefs w/o wrappers */
3022 	return (void *)(~(unsigned long)pdata);
3023 
3024       unwind_oom:
3025 	while (--i >= 0) {
3026 		if (!cpu_possible(i))
3027 			continue;
3028 		kfree(pdata->ptrs[i]);
3029 	}
3030 	kfree(pdata);
3031 	return NULL;
3032 }
3033 EXPORT_SYMBOL(__alloc_percpu);
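/*
 * Usage sketch (the struct and its count member are hypothetical): the
 * returned cookie is only valid through per_cpu_ptr(), never dereferenced
 * directly.
 *
 *	struct foo_stats *stats = __alloc_percpu(sizeof(struct foo_stats));
 *	if (stats) {
 *		per_cpu_ptr(stats, smp_processor_id())->count++;
 *		...
 *		free_percpu(stats);
 *	}
 */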
3034 #endif
3035 
3036 /**
3037  * kmem_cache_free - Deallocate an object
3038  * @cachep: The cache the allocation was from.
3039  * @objp: The previously allocated object.
3040  *
3041  * Free an object which was previously allocated from this
3042  * cache.
3043  */
3044 void kmem_cache_free(kmem_cache_t *cachep, void *objp)
3045 {
3046 	unsigned long flags;
3047 
3048 	local_irq_save(flags);
3049 	__cache_free(cachep, objp);
3050 	local_irq_restore(flags);
3051 }
3052 EXPORT_SYMBOL(kmem_cache_free);
3053 
3054 /**
3055  * kzalloc - allocate memory. The memory is set to zero.
3056  * @size: how many bytes of memory are required.
3057  * @flags: the type of memory to allocate.
3058  */
3059 void *kzalloc(size_t size, gfp_t flags)
3060 {
3061 	void *ret = kmalloc(size, flags);
3062 	if (ret)
3063 		memset(ret, 0, size);
3064 	return ret;
3065 }
3066 EXPORT_SYMBOL(kzalloc);
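/*
 * Illustrative use (struct name is hypothetical): kzalloc() replaces the
 * common kmalloc() + memset() pair.
 *
 *	struct foo *f = kzalloc(sizeof(*f), GFP_KERNEL);
 */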
3067 
3068 /**
3069  * kfree - free previously allocated memory
3070  * @objp: pointer returned by kmalloc.
3071  *
3072  * If @objp is NULL, no operation is performed.
3073  *
3074  * Don't free memory not originally allocated by kmalloc()
3075  * or you will run into trouble.
3076  */
3077 void kfree(const void *objp)
3078 {
3079 	kmem_cache_t *c;
3080 	unsigned long flags;
3081 
3082 	if (unlikely(!objp))
3083 		return;
3084 	local_irq_save(flags);
3085 	kfree_debugcheck(objp);
3086 	c = page_get_cache(virt_to_page(objp));
3087 	__cache_free(c, (void *)objp);
3088 	local_irq_restore(flags);
3089 }
3090 EXPORT_SYMBOL(kfree);
3091 
3092 #ifdef CONFIG_SMP
3093 /**
3094  * free_percpu - free previously allocated percpu memory
3095  * @objp: pointer returned by alloc_percpu.
3096  *
3097  * Don't free memory not originally allocated by alloc_percpu()
3098  * The complemented objp is used to check for that.
3099  */
3100 void free_percpu(const void *objp)
3101 {
3102 	int i;
3103 	struct percpu_data *p = (struct percpu_data *)(~(unsigned long)objp);
3104 
3105 	/*
3106 	 * We allocate for all cpus, so we cannot use for_each_online_cpu here.
3107 	 */
3108 	for_each_cpu(i)
3109 		kfree(p->ptrs[i]);
3110 	kfree(p);
3111 }
3112 EXPORT_SYMBOL(free_percpu);
3113 #endif
3114 
3115 unsigned int kmem_cache_size(kmem_cache_t *cachep)
3116 {
3117 	return obj_reallen(cachep);
3118 }
3119 EXPORT_SYMBOL(kmem_cache_size);
3120 
3121 const char *kmem_cache_name(kmem_cache_t *cachep)
3122 {
3123 	return cachep->name;
3124 }
3125 EXPORT_SYMBOL_GPL(kmem_cache_name);
3126 
3127 /*
3128  * This initializes kmem_list3 for all nodes.
3129  */
3130 static int alloc_kmemlist(kmem_cache_t *cachep)
3131 {
3132 	int node;
3133 	struct kmem_list3 *l3;
3134 	int err = 0;
3135 
3136 	for_each_online_node(node) {
3137 		struct array_cache *nc = NULL, *new;
3138 		struct array_cache **new_alien = NULL;
3139 #ifdef CONFIG_NUMA
3140 		if (!(new_alien = alloc_alien_cache(node, cachep->limit)))
3141 			goto fail;
3142 #endif
3143 		if (!(new = alloc_arraycache(node, (cachep->shared *
3144 						    cachep->batchcount),
3145 					     0xbaadf00d)))
3146 			goto fail;
3147 		if ((l3 = cachep->nodelists[node])) {
3148 
3149 			spin_lock_irq(&l3->list_lock);
3150 
3151 			if ((nc = cachep->nodelists[node]->shared))
3152 				free_block(cachep, nc->entry, nc->avail, node);
3153 
3154 			l3->shared = new;
3155 			if (!cachep->nodelists[node]->alien) {
3156 				l3->alien = new_alien;
3157 				new_alien = NULL;
3158 			}
3159 			l3->free_limit = (1 + nr_cpus_node(node)) *
3160 			    cachep->batchcount + cachep->num;
3161 			spin_unlock_irq(&l3->list_lock);
3162 			kfree(nc);
3163 			free_alien_cache(new_alien);
3164 			continue;
3165 		}
3166 		if (!(l3 = kmalloc_node(sizeof(struct kmem_list3),
3167 					GFP_KERNEL, node)))
3168 			goto fail;
3169 
3170 		kmem_list3_init(l3);
3171 		l3->next_reap = jiffies + REAPTIMEOUT_LIST3 +
3172 		    ((unsigned long)cachep) % REAPTIMEOUT_LIST3;
3173 		l3->shared = new;
3174 		l3->alien = new_alien;
3175 		l3->free_limit = (1 + nr_cpus_node(node)) *
3176 		    cachep->batchcount + cachep->num;
3177 		cachep->nodelists[node] = l3;
3178 	}
3179 	return err;
3180       fail:
3181 	err = -ENOMEM;
3182 	return err;
3183 }
3184 
3185 struct ccupdate_struct {
3186 	kmem_cache_t *cachep;
3187 	struct array_cache *new[NR_CPUS];
3188 };
3189 
3190 static void do_ccupdate_local(void *info)
3191 {
3192 	struct ccupdate_struct *new = (struct ccupdate_struct *)info;
3193 	struct array_cache *old;
3194 
3195 	check_irq_off();
3196 	old = ac_data(new->cachep);
3197 
3198 	new->cachep->array[smp_processor_id()] = new->new[smp_processor_id()];
3199 	new->new[smp_processor_id()] = old;
3200 }
3201 
3202 static int do_tune_cpucache(kmem_cache_t *cachep, int limit, int batchcount,
3203 			    int shared)
3204 {
3205 	struct ccupdate_struct new;
3206 	int i, err;
3207 
3208 	memset(&new.new, 0, sizeof(new.new));
3209 	for_each_online_cpu(i) {
3210 		new.new[i] =
3211 		    alloc_arraycache(cpu_to_node(i), limit, batchcount);
3212 		if (!new.new[i]) {
3213 			for (i--; i >= 0; i--)
3214 				kfree(new.new[i]);
3215 			return -ENOMEM;
3216 		}
3217 	}
3218 	new.cachep = cachep;
3219 
3220 	smp_call_function_all_cpus(do_ccupdate_local, (void *)&new);
3221 
3222 	check_irq_on();
3223 	spin_lock_irq(&cachep->spinlock);
3224 	cachep->batchcount = batchcount;
3225 	cachep->limit = limit;
3226 	cachep->shared = shared;
3227 	spin_unlock_irq(&cachep->spinlock);
3228 
3229 	for_each_online_cpu(i) {
3230 		struct array_cache *ccold = new.new[i];
3231 		if (!ccold)
3232 			continue;
3233 		spin_lock_irq(&cachep->nodelists[cpu_to_node(i)]->list_lock);
3234 		free_block(cachep, ccold->entry, ccold->avail, cpu_to_node(i));
3235 		spin_unlock_irq(&cachep->nodelists[cpu_to_node(i)]->list_lock);
3236 		kfree(ccold);
3237 	}
3238 
3239 	err = alloc_kmemlist(cachep);
3240 	if (err) {
3241 		printk(KERN_ERR "alloc_kmemlist failed for %s, error %d.\n",
3242 		       cachep->name, -err);
3243 		BUG();
3244 	}
3245 	return 0;
3246 }
3247 
3248 static void enable_cpucache(kmem_cache_t *cachep)
3249 {
3250 	int err;
3251 	int limit, shared;
3252 
3253 	/* The head array serves three purposes:
3254 	 * - create a LIFO ordering, i.e. return objects that are cache-warm
3255 	 * - reduce the number of spinlock operations.
3256 	 * - reduce the number of linked list operations on the slab and
3257 	 *   bufctl chains: array operations are cheaper.
3258 	 * The numbers are guessed, we should auto-tune as described by
3259 	 * Bonwick.
3260 	 */
3261 	if (cachep->objsize > 131072)
3262 		limit = 1;
3263 	else if (cachep->objsize > PAGE_SIZE)
3264 		limit = 8;
3265 	else if (cachep->objsize > 1024)
3266 		limit = 24;
3267 	else if (cachep->objsize > 256)
3268 		limit = 54;
3269 	else
3270 		limit = 120;
3271 
3272 	/* Cpu bound tasks (e.g. network routing) can exhibit cpu bound
3273 	 * allocation behaviour: Most allocs on one cpu, most free operations
3274 	 * on another cpu. For these cases, an efficient object passing between
3275 	 * cpus is necessary. This is provided by a shared array. The array
3276 	 * replaces Bonwick's magazine layer.
3277 	 * On uniprocessor, it's functionally equivalent (but less efficient)
3278 	 * to a larger limit. Thus disabled by default.
3279 	 */
3280 	shared = 0;
3281 #ifdef CONFIG_SMP
3282 	if (cachep->objsize <= PAGE_SIZE)
3283 		shared = 8;
3284 #endif
3285 
3286 #if DEBUG
3287 	/* With debugging enabled, a large batchcount leads to excessively
3288 	 * long periods with disabled local interrupts. Limit the
3289 	 * batchcount.
3290 	 */
3291 	if (limit > 32)
3292 		limit = 32;
3293 #endif
3294 	err = do_tune_cpucache(cachep, limit, (limit + 1) / 2, shared);
3295 	if (err)
3296 		printk(KERN_ERR "enable_cpucache failed for %s, error %d.\n",
3297 		       cachep->name, -err);
3298 }
3299 
3300 static void drain_array_locked(kmem_cache_t *cachep, struct array_cache *ac,
3301 				int force, int node)
3302 {
3303 	int tofree;
3304 
3305 	check_spinlock_acquired_node(cachep, node);
3306 	if (ac->touched && !force) {
3307 		ac->touched = 0;
3308 	} else if (ac->avail) {
3309 		tofree = force ? ac->avail : (ac->limit + 4) / 5;
3310 		if (tofree > ac->avail) {
3311 			tofree = (ac->avail + 1) / 2;
3312 		}
3313 		free_block(cachep, ac->entry, tofree, node);
3314 		ac->avail -= tofree;
3315 		memmove(ac->entry, &(ac->entry[tofree]),
3316 			sizeof(void *) * ac->avail);
3317 	}
3318 }
3319 
3320 /**
3321  * cache_reap - Reclaim memory from caches.
3322  * @unused: unused parameter
3323  *
3324  * Called from workqueue/eventd every few seconds.
3325  * Purpose:
3326  * - clear the per-cpu caches for this CPU.
3327  * - return freeable pages to the main free memory pool.
3328  *
3329  * If we cannot acquire the cache chain semaphore then just give up - we'll
3330  * try again on the next iteration.
3331  */
3332 static void cache_reap(void *unused)
3333 {
3334 	struct list_head *walk;
3335 	struct kmem_list3 *l3;
3336 
3337 	if (down_trylock(&cache_chain_sem)) {
3338 		/* Give up. Setup the next iteration. */
3339 		schedule_delayed_work(&__get_cpu_var(reap_work),
3340 				      REAPTIMEOUT_CPUC);
3341 		return;
3342 	}
3343 
3344 	list_for_each(walk, &cache_chain) {
3345 		kmem_cache_t *searchp;
3346 		struct list_head *p;
3347 		int tofree;
3348 		struct slab *slabp;
3349 
3350 		searchp = list_entry(walk, kmem_cache_t, next);
3351 
3352 		if (searchp->flags & SLAB_NO_REAP)
3353 			goto next;
3354 
3355 		check_irq_on();
3356 
3357 		l3 = searchp->nodelists[numa_node_id()];
3358 		if (l3->alien)
3359 			drain_alien_cache(searchp, l3);
3360 		spin_lock_irq(&l3->list_lock);
3361 
3362 		drain_array_locked(searchp, ac_data(searchp), 0,
3363 				   numa_node_id());
3364 
3365 		if (time_after(l3->next_reap, jiffies))
3366 			goto next_unlock;
3367 
3368 		l3->next_reap = jiffies + REAPTIMEOUT_LIST3;
3369 
3370 		if (l3->shared)
3371 			drain_array_locked(searchp, l3->shared, 0,
3372 					   numa_node_id());
3373 
3374 		if (l3->free_touched) {
3375 			l3->free_touched = 0;
3376 			goto next_unlock;
3377 		}
3378 
3379 		tofree = (l3->free_limit + 5 * searchp->num - 1) /
3380 			 (5 * searchp->num);
3382 		do {
3383 			p = l3->slabs_free.next;
3384 			if (p == &(l3->slabs_free))
3385 				break;
3386 
3387 			slabp = list_entry(p, struct slab, list);
3388 			BUG_ON(slabp->inuse);
3389 			list_del(&slabp->list);
3390 			STATS_INC_REAPED(searchp);
3391 
3392 			/* Safe to drop the lock. The slab is no longer
3393 			 * linked to the cache.
3394 			 * searchp cannot disappear, we hold
3395 			 * cache_chain_sem.
3396 			 */
3397 			l3->free_objects -= searchp->num;
3398 			spin_unlock_irq(&l3->list_lock);
3399 			slab_destroy(searchp, slabp);
3400 			spin_lock_irq(&l3->list_lock);
3401 		} while (--tofree > 0);
3402 	      next_unlock:
3403 		spin_unlock_irq(&l3->list_lock);
3404 	      next:
3405 		cond_resched();
3406 	}
3407 	check_irq_on();
3408 	up(&cache_chain_sem);
3409 	drain_remote_pages();
3410 	/* Setup the next iteration */
3411 	schedule_delayed_work(&__get_cpu_var(reap_work), REAPTIMEOUT_CPUC);
3412 }
3413 
3414 #ifdef CONFIG_PROC_FS
3415 
3416 static void print_slabinfo_header(struct seq_file *m)
3417 {
3418 	/*
3419 	 * Output format version, so at least we can change it
3420 	 * without _too_ many complaints.
3421 	 */
3422 #if STATS
3423 	seq_puts(m, "slabinfo - version: 2.1 (statistics)\n");
3424 #else
3425 	seq_puts(m, "slabinfo - version: 2.1\n");
3426 #endif
3427 	seq_puts(m, "# name            <active_objs> <num_objs> <objsize> "
3428 		 "<objperslab> <pagesperslab>");
3429 	seq_puts(m, " : tunables <limit> <batchcount> <sharedfactor>");
3430 	seq_puts(m, " : slabdata <active_slabs> <num_slabs> <sharedavail>");
3431 #if STATS
3432 	seq_puts(m, " : globalstat <listallocs> <maxobjs> <grown> <reaped> "
3433 		 "<error> <maxfreeable> <nodeallocs> <remotefrees>");
3434 	seq_puts(m, " : cpustat <allochit> <allocmiss> <freehit> <freemiss>");
3435 #endif
3436 	seq_putc(m, '\n');
3437 }
3438 
3439 static void *s_start(struct seq_file *m, loff_t *pos)
3440 {
3441 	loff_t n = *pos;
3442 	struct list_head *p;
3443 
3444 	down(&cache_chain_sem);
3445 	if (!n)
3446 		print_slabinfo_header(m);
3447 	p = cache_chain.next;
3448 	while (n--) {
3449 		p = p->next;
3450 		if (p == &cache_chain)
3451 			return NULL;
3452 	}
3453 	return list_entry(p, kmem_cache_t, next);
3454 }
3455 
3456 static void *s_next(struct seq_file *m, void *p, loff_t *pos)
3457 {
3458 	kmem_cache_t *cachep = p;
3459 	++*pos;
3460 	return cachep->next.next == &cache_chain ? NULL
3461 	    : list_entry(cachep->next.next, kmem_cache_t, next);
3462 }
3463 
3464 static void s_stop(struct seq_file *m, void *p)
3465 {
3466 	up(&cache_chain_sem);
3467 }
3468 
3469 static int s_show(struct seq_file *m, void *p)
3470 {
3471 	kmem_cache_t *cachep = p;
3472 	struct list_head *q;
3473 	struct slab *slabp;
3474 	unsigned long active_objs;
3475 	unsigned long num_objs;
3476 	unsigned long active_slabs = 0;
3477 	unsigned long num_slabs, free_objects = 0, shared_avail = 0;
3478 	const char *name;
3479 	char *error = NULL;
3480 	int node;
3481 	struct kmem_list3 *l3;
3482 
3483 	check_irq_on();
3484 	spin_lock_irq(&cachep->spinlock);
3485 	active_objs = 0;
3486 	num_slabs = 0;
3487 	for_each_online_node(node) {
3488 		l3 = cachep->nodelists[node];
3489 		if (!l3)
3490 			continue;
3491 
3492 		spin_lock(&l3->list_lock);
3493 
3494 		list_for_each(q, &l3->slabs_full) {
3495 			slabp = list_entry(q, struct slab, list);
3496 			if (slabp->inuse != cachep->num && !error)
3497 				error = "slabs_full accounting error";
3498 			active_objs += cachep->num;
3499 			active_slabs++;
3500 		}
3501 		list_for_each(q, &l3->slabs_partial) {
3502 			slabp = list_entry(q, struct slab, list);
3503 			if (slabp->inuse == cachep->num && !error)
3504 				error = "slabs_partial inuse accounting error";
3505 			if (!slabp->inuse && !error)
3506 				error = "slabs_partial/inuse accounting error";
3507 			active_objs += slabp->inuse;
3508 			active_slabs++;
3509 		}
3510 		list_for_each(q, &l3->slabs_free) {
3511 			slabp = list_entry(q, struct slab, list);
3512 			if (slabp->inuse && !error)
3513 				error = "slabs_free/inuse accounting error";
3514 			num_slabs++;
3515 		}
3516 		free_objects += l3->free_objects;
3517 		shared_avail += l3->shared->avail;
3518 
3519 		spin_unlock(&l3->list_lock);
3520 	}
3521 	num_slabs += active_slabs;
3522 	num_objs = num_slabs * cachep->num;
3523 	if (num_objs - active_objs != free_objects && !error)
3524 		error = "free_objects accounting error";
3525 
3526 	name = cachep->name;
3527 	if (error)
3528 		printk(KERN_ERR "slab: cache %s error: %s\n", name, error);
3529 
3530 	seq_printf(m, "%-17s %6lu %6lu %6u %4u %4d",
3531 		   name, active_objs, num_objs, cachep->objsize,
3532 		   cachep->num, (1 << cachep->gfporder));
3533 	seq_printf(m, " : tunables %4u %4u %4u",
3534 		   cachep->limit, cachep->batchcount, cachep->shared);
3535 	seq_printf(m, " : slabdata %6lu %6lu %6lu",
3536 		   active_slabs, num_slabs, shared_avail);
3537 #if STATS
3538 	{			/* list3 stats */
3539 		unsigned long high = cachep->high_mark;
3540 		unsigned long allocs = cachep->num_allocations;
3541 		unsigned long grown = cachep->grown;
3542 		unsigned long reaped = cachep->reaped;
3543 		unsigned long errors = cachep->errors;
3544 		unsigned long max_freeable = cachep->max_freeable;
3545 		unsigned long node_allocs = cachep->node_allocs;
3546 		unsigned long node_frees = cachep->node_frees;
3547 
3548 		seq_printf(m, " : globalstat %7lu %6lu %5lu %4lu %4lu %4lu %4lu %4lu",
3549 			   allocs, high, grown, reaped, errors, max_freeable, node_allocs, node_frees);
3550 	}
3551 	/* cpu stats */
3552 	{
3553 		unsigned long allochit = atomic_read(&cachep->allochit);
3554 		unsigned long allocmiss = atomic_read(&cachep->allocmiss);
3555 		unsigned long freehit = atomic_read(&cachep->freehit);
3556 		unsigned long freemiss = atomic_read(&cachep->freemiss);
3557 
3558 		seq_printf(m, " : cpustat %6lu %6lu %6lu %6lu",
3559 			   allochit, allocmiss, freehit, freemiss);
3560 	}
3561 #endif
3562 	seq_putc(m, '\n');
3563 	spin_unlock_irq(&cachep->spinlock);
3564 	return 0;
3565 }
3566 
3567 /*
3568  * slabinfo_op - iterator that generates /proc/slabinfo
3569  *
3570  * Output layout:
3571  * cache-name
3572  * num-active-objs
3573  * total-objs
3574  * object size
3575  * num-active-slabs
3576  * total-slabs
3577  * num-pages-per-slab
3578  * + further values on SMP and with statistics enabled
3579  */
3580 
3581 struct seq_operations slabinfo_op = {
3582 	.start = s_start,
3583 	.next = s_next,
3584 	.stop = s_stop,
3585 	.show = s_show,
3586 };
3587 
3588 #define MAX_SLABINFO_WRITE 128
3589 /**
3590  * slabinfo_write - Tuning for the slab allocator
3591  * @file: unused
3592  * @buffer: user buffer
3593  * @count: data length
3594  * @ppos: unused
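 *
 * The buffer must hold a line of the form "cache-name limit batchcount shared";
 * for example (the cache name and values are purely illustrative):
 *
 *	echo "dentry_cache 128 64 8" > /proc/slabinfo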
3595  */
3596 ssize_t slabinfo_write(struct file *file, const char __user * buffer,
3597 		       size_t count, loff_t *ppos)
3598 {
3599 	char kbuf[MAX_SLABINFO_WRITE + 1], *tmp;
3600 	int limit, batchcount, shared, res;
3601 	struct list_head *p;
3602 
3603 	if (count > MAX_SLABINFO_WRITE)
3604 		return -EINVAL;
3605 	if (copy_from_user(&kbuf, buffer, count))
3606 		return -EFAULT;
3607 	kbuf[MAX_SLABINFO_WRITE] = '\0';
3608 
3609 	tmp = strchr(kbuf, ' ');
3610 	if (!tmp)
3611 		return -EINVAL;
3612 	*tmp = '\0';
3613 	tmp++;
3614 	if (sscanf(tmp, " %d %d %d", &limit, &batchcount, &shared) != 3)
3615 		return -EINVAL;
3616 
3617 	/* Find the cache in the chain of caches. */
3618 	down(&cache_chain_sem);
3619 	res = -EINVAL;
3620 	list_for_each(p, &cache_chain) {
3621 		kmem_cache_t *cachep = list_entry(p, kmem_cache_t, next);
3622 
3623 		if (!strcmp(cachep->name, kbuf)) {
3624 			if (limit < 1 ||
3625 			    batchcount < 1 ||
3626 			    batchcount > limit || shared < 0) {
3627 				res = 0;
3628 			} else {
3629 				res = do_tune_cpucache(cachep, limit,
3630 						       batchcount, shared);
3631 			}
3632 			break;
3633 		}
3634 	}
3635 	up(&cache_chain_sem);
3636 	if (res >= 0)
3637 		res = count;
3638 	return res;
3639 }
3640 #endif
3641 
3642 /**
3643  * ksize - get the actual amount of memory allocated for a given object
3644  * @objp: Pointer to the object
3645  *
3646  * kmalloc may internally round up allocations and return more memory
3647  * than requested. ksize() can be used to determine the actual amount of
3648  * memory allocated. The caller may use this additional memory, even though
3649  * a smaller amount of memory was initially specified with the kmalloc call.
3650  * The caller must guarantee that objp points to a valid object previously
3651  * allocated with either kmalloc() or kmem_cache_alloc(). The object
3652  * must not be freed during the duration of the call.
3653  */
3654 unsigned int ksize(const void *objp)
3655 {
3656 	if (unlikely(objp == NULL))
3657 		return 0;
3658 
3659 	return obj_reallen(page_get_cache(virt_to_page(objp)));
3660 }
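/*
 * Illustrative use of ksize() (the result shown is an assumption; it depends
 * on the kmalloc size classes of the running kernel):
 *
 *	void *p = kmalloc(17, GFP_KERNEL);
 *	size_t n = ksize(p);	at least 17, typically 32 here
 */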
3661 
3662 
3663 /**
3664  * kstrdup - allocate space for and copy an existing string
3665  *
3666  * @s: the string to duplicate
3667  * @gfp: the GFP mask used in the kmalloc() call when allocating memory
3668  */
3669 char *kstrdup(const char *s, gfp_t gfp)
3670 {
3671 	size_t len;
3672 	char *buf;
3673 
3674 	if (!s)
3675 		return NULL;
3676 
3677 	len = strlen(s) + 1;
3678 	buf = kmalloc(len, gfp);
3679 	if (buf)
3680 		memcpy(buf, s, len);
3681 	return buf;
3682 }
3683 EXPORT_SYMBOL(kstrdup);
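/*
 * Usage sketch (the source string is hypothetical): the copy is obtained
 * with kmalloc() and must be released with kfree().
 *
 *	char *copy = kstrdup(dev_name, GFP_KERNEL);
 *	if (!copy)
 *		return -ENOMEM;
 *	...
 *	kfree(copy);
 */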
3684