xref: /openbmc/linux/mm/slab.c (revision 6ab3d562)
1 /*
2  * linux/mm/slab.c
3  * Written by Mark Hemment, 1996/97.
4  * (markhe@nextd.demon.co.uk)
5  *
6  * kmem_cache_destroy() + some cleanup - 1999 Andrea Arcangeli
7  *
8  * Major cleanup, different bufctl logic, per-cpu arrays
9  *	(c) 2000 Manfred Spraul
10  *
11  * Cleanup, make the head arrays unconditional, preparation for NUMA
12  * 	(c) 2002 Manfred Spraul
13  *
14  * An implementation of the Slab Allocator as described in outline in;
15  *	UNIX Internals: The New Frontiers by Uresh Vahalia
16  *	Pub: Prentice Hall	ISBN 0-13-101908-2
17  * or with a little more detail in;
18  *	The Slab Allocator: An Object-Caching Kernel Memory Allocator
19  *	Jeff Bonwick (Sun Microsystems).
20  *	Presented at: USENIX Summer 1994 Technical Conference
21  *
22  * The memory is organized in caches, one cache for each object type.
23  * (e.g. inode_cache, dentry_cache, buffer_head, vm_area_struct)
24  * Each cache consists out of many slabs (they are small (usually one
25  * page long) and always contiguous), and each slab contains multiple
26  * initialized objects.
27  *
28  * This means, that your constructor is used only for newly allocated
29  * slabs and you must pass objects with the same intializations to
30  * kmem_cache_free.
31  *
32  * Each cache can only support one memory type (GFP_DMA, GFP_HIGHMEM,
33  * normal). If you need a special memory type, then must create a new
34  * cache for that memory type.
35  *
36  * In order to reduce fragmentation, the slabs are sorted in 3 groups:
37  *   full slabs with 0 free objects
38  *   partial slabs
39  *   empty slabs with no allocated objects
40  *
41  * If partial slabs exist, then new allocations come from these slabs,
42  * otherwise from empty slabs or new slabs are allocated.
43  *
44  * kmem_cache_destroy() CAN CRASH if you try to allocate from the cache
45  * during kmem_cache_destroy(). The caller must prevent concurrent allocs.
46  *
47  * Each cache has a short per-cpu head array, most allocs
48  * and frees go into that array, and if that array overflows, then 1/2
49  * of the entries in the array are given back into the global cache.
50  * The head array is strictly LIFO and should improve the cache hit rates.
51  * On SMP, it additionally reduces the spinlock operations.
52  *
53  * The c_cpuarray may not be read with enabled local interrupts -
54  * it's changed with a smp_call_function().
55  *
56  * SMP synchronization:
57  *  constructors and destructors are called without any locking.
58  *  Several members in struct kmem_cache and struct slab never change, they
59  *	are accessed without any locking.
60  *  The per-cpu arrays are never accessed from the wrong cpu, no locking,
61  *  	and local interrupts are disabled so slab code is preempt-safe.
62  *  The non-constant members are protected with a per-cache irq spinlock.
63  *
64  * Many thanks to Mark Hemment, who wrote another per-cpu slab patch
65  * in 2000 - many ideas in the current implementation are derived from
66  * his patch.
67  *
68  * Further notes from the original documentation:
69  *
70  * 11 April '97.  Started multi-threading - markhe
71  *	The global cache-chain is protected by the mutex 'cache_chain_mutex'.
72  *	The sem is only needed when accessing/extending the cache-chain, which
73  *	can never happen inside an interrupt (kmem_cache_create(),
74  *	kmem_cache_shrink() and kmem_cache_reap()).
75  *
76  *	At present, each engine can be growing a cache.  This should be blocked.
77  *
78  * 15 March 2005. NUMA slab allocator.
79  *	Shai Fultheim <shai@scalex86.org>.
80  *	Shobhit Dayal <shobhit@calsoftinc.com>
81  *	Alok N Kataria <alokk@calsoftinc.com>
82  *	Christoph Lameter <christoph@lameter.com>
83  *
84  *	Modified the slab allocator to be node aware on NUMA systems.
85  *	Each node has its own list of partial, free and full slabs.
86  *	All object allocations for a node occur from node specific slab lists.
87  */
88 
89 #include	<linux/config.h>
90 #include	<linux/slab.h>
91 #include	<linux/mm.h>
92 #include	<linux/poison.h>
93 #include	<linux/swap.h>
94 #include	<linux/cache.h>
95 #include	<linux/interrupt.h>
96 #include	<linux/init.h>
97 #include	<linux/compiler.h>
98 #include	<linux/cpuset.h>
99 #include	<linux/seq_file.h>
100 #include	<linux/notifier.h>
101 #include	<linux/kallsyms.h>
102 #include	<linux/cpu.h>
103 #include	<linux/sysctl.h>
104 #include	<linux/module.h>
105 #include	<linux/rcupdate.h>
106 #include	<linux/string.h>
107 #include	<linux/nodemask.h>
108 #include	<linux/mempolicy.h>
109 #include	<linux/mutex.h>
110 #include	<linux/rtmutex.h>
111 
112 #include	<asm/uaccess.h>
113 #include	<asm/cacheflush.h>
114 #include	<asm/tlbflush.h>
115 #include	<asm/page.h>
116 
117 /*
118  * DEBUG	- 1 for kmem_cache_create() to honour; SLAB_DEBUG_INITIAL,
119  *		  SLAB_RED_ZONE & SLAB_POISON.
120  *		  0 for faster, smaller code (especially in the critical paths).
121  *
122  * STATS	- 1 to collect stats for /proc/slabinfo.
123  *		  0 for faster, smaller code (especially in the critical paths).
124  *
125  * FORCED_DEBUG	- 1 enables SLAB_RED_ZONE and SLAB_POISON (if possible)
126  */
127 
128 #ifdef CONFIG_DEBUG_SLAB
129 #define	DEBUG		1
130 #define	STATS		1
131 #define	FORCED_DEBUG	1
132 #else
133 #define	DEBUG		0
134 #define	STATS		0
135 #define	FORCED_DEBUG	0
136 #endif
137 
138 /* Shouldn't this be in a header file somewhere? */
139 #define	BYTES_PER_WORD		sizeof(void *)
140 
141 #ifndef cache_line_size
142 #define cache_line_size()	L1_CACHE_BYTES
143 #endif
144 
145 #ifndef ARCH_KMALLOC_MINALIGN
146 /*
147  * Enforce a minimum alignment for the kmalloc caches.
148  * Usually, the kmalloc caches are cache_line_size() aligned, except when
149  * DEBUG and FORCED_DEBUG are enabled, then they are BYTES_PER_WORD aligned.
150  * Some archs want to perform DMA into kmalloc caches and need a guaranteed
151  * alignment larger than BYTES_PER_WORD. ARCH_KMALLOC_MINALIGN allows that.
152  * Note that this flag disables some debug features.
153  */
154 #define ARCH_KMALLOC_MINALIGN 0
155 #endif
156 
157 #ifndef ARCH_SLAB_MINALIGN
158 /*
159  * Enforce a minimum alignment for all caches.
160  * Intended for archs that get misalignment faults even for BYTES_PER_WORD
161  * aligned buffers. Includes ARCH_KMALLOC_MINALIGN.
162  * If possible: Do not enable this flag for CONFIG_DEBUG_SLAB, it disables
163  * some debug features.
164  */
165 #define ARCH_SLAB_MINALIGN 0
166 #endif
167 
168 #ifndef ARCH_KMALLOC_FLAGS
169 #define ARCH_KMALLOC_FLAGS SLAB_HWCACHE_ALIGN
170 #endif
171 
172 /* Legal flag mask for kmem_cache_create(). */
173 #if DEBUG
174 # define CREATE_MASK	(SLAB_DEBUG_INITIAL | SLAB_RED_ZONE | \
175 			 SLAB_POISON | SLAB_HWCACHE_ALIGN | \
176 			 SLAB_CACHE_DMA | \
177 			 SLAB_MUST_HWCACHE_ALIGN | SLAB_STORE_USER | \
178 			 SLAB_RECLAIM_ACCOUNT | SLAB_PANIC | \
179 			 SLAB_DESTROY_BY_RCU | SLAB_MEM_SPREAD)
180 #else
181 # define CREATE_MASK	(SLAB_HWCACHE_ALIGN | \
182 			 SLAB_CACHE_DMA | SLAB_MUST_HWCACHE_ALIGN | \
183 			 SLAB_RECLAIM_ACCOUNT | SLAB_PANIC | \
184 			 SLAB_DESTROY_BY_RCU | SLAB_MEM_SPREAD)
185 #endif
186 
187 /*
188  * kmem_bufctl_t:
189  *
190  * Bufctl's are used for linking objs within a slab
191  * linked offsets.
192  *
193  * This implementation relies on "struct page" for locating the cache &
194  * slab an object belongs to.
195  * This allows the bufctl structure to be small (one int), but limits
196  * the number of objects a slab (not a cache) can contain when off-slab
197  * bufctls are used. The limit is the size of the largest general cache
198  * that does not use off-slab slabs.
199  * For 32bit archs with 4 kB pages, is this 56.
200  * This is not serious, as it is only for large objects, when it is unwise
201  * to have too many per slab.
202  * Note: This limit can be raised by introducing a general cache whose size
203  * is less than 512 (PAGE_SIZE<<3), but greater than 256.
204  */
205 
206 typedef unsigned int kmem_bufctl_t;
207 #define BUFCTL_END	(((kmem_bufctl_t)(~0U))-0)
208 #define BUFCTL_FREE	(((kmem_bufctl_t)(~0U))-1)
209 #define	BUFCTL_ACTIVE	(((kmem_bufctl_t)(~0U))-2)
210 #define	SLAB_LIMIT	(((kmem_bufctl_t)(~0U))-3)
211 
212 /*
213  * struct slab
214  *
215  * Manages the objs in a slab. Placed either at the beginning of mem allocated
216  * for a slab, or allocated from an general cache.
217  * Slabs are chained into three list: fully used, partial, fully free slabs.
218  */
219 struct slab {
220 	struct list_head list;
221 	unsigned long colouroff;
222 	void *s_mem;		/* including colour offset */
223 	unsigned int inuse;	/* num of objs active in slab */
224 	kmem_bufctl_t free;
225 	unsigned short nodeid;
226 };
227 
228 /*
229  * struct slab_rcu
230  *
231  * slab_destroy on a SLAB_DESTROY_BY_RCU cache uses this structure to
232  * arrange for kmem_freepages to be called via RCU.  This is useful if
233  * we need to approach a kernel structure obliquely, from its address
234  * obtained without the usual locking.  We can lock the structure to
235  * stabilize it and check it's still at the given address, only if we
236  * can be sure that the memory has not been meanwhile reused for some
237  * other kind of object (which our subsystem's lock might corrupt).
238  *
239  * rcu_read_lock before reading the address, then rcu_read_unlock after
240  * taking the spinlock within the structure expected at that address.
241  *
242  * We assume struct slab_rcu can overlay struct slab when destroying.
243  */
244 struct slab_rcu {
245 	struct rcu_head head;
246 	struct kmem_cache *cachep;
247 	void *addr;
248 };
249 
250 /*
251  * struct array_cache
252  *
253  * Purpose:
254  * - LIFO ordering, to hand out cache-warm objects from _alloc
255  * - reduce the number of linked list operations
256  * - reduce spinlock operations
257  *
258  * The limit is stored in the per-cpu structure to reduce the data cache
259  * footprint.
260  *
261  */
262 struct array_cache {
263 	unsigned int avail;
264 	unsigned int limit;
265 	unsigned int batchcount;
266 	unsigned int touched;
267 	spinlock_t lock;
268 	void *entry[0];	/*
269 			 * Must have this definition in here for the proper
270 			 * alignment of array_cache. Also simplifies accessing
271 			 * the entries.
272 			 * [0] is for gcc 2.95. It should really be [].
273 			 */
274 };
275 
276 /*
277  * bootstrap: The caches do not work without cpuarrays anymore, but the
278  * cpuarrays are allocated from the generic caches...
279  */
280 #define BOOT_CPUCACHE_ENTRIES	1
281 struct arraycache_init {
282 	struct array_cache cache;
283 	void *entries[BOOT_CPUCACHE_ENTRIES];
284 };
285 
286 /*
287  * The slab lists for all objects.
288  */
289 struct kmem_list3 {
290 	struct list_head slabs_partial;	/* partial list first, better asm code */
291 	struct list_head slabs_full;
292 	struct list_head slabs_free;
293 	unsigned long free_objects;
294 	unsigned int free_limit;
295 	unsigned int colour_next;	/* Per-node cache coloring */
296 	spinlock_t list_lock;
297 	struct array_cache *shared;	/* shared per node */
298 	struct array_cache **alien;	/* on other nodes */
299 	unsigned long next_reap;	/* updated without locking */
300 	int free_touched;		/* updated without locking */
301 };
302 
303 /*
304  * Need this for bootstrapping a per node allocator.
305  */
306 #define NUM_INIT_LISTS (2 * MAX_NUMNODES + 1)
307 struct kmem_list3 __initdata initkmem_list3[NUM_INIT_LISTS];
308 #define	CACHE_CACHE 0
309 #define	SIZE_AC 1
310 #define	SIZE_L3 (1 + MAX_NUMNODES)
311 
312 /*
313  * This function must be completely optimized away if a constant is passed to
314  * it.  Mostly the same as what is in linux/slab.h except it returns an index.
315  */
316 static __always_inline int index_of(const size_t size)
317 {
318 	extern void __bad_size(void);
319 
320 	if (__builtin_constant_p(size)) {
321 		int i = 0;
322 
323 #define CACHE(x) \
324 	if (size <=x) \
325 		return i; \
326 	else \
327 		i++;
328 #include "linux/kmalloc_sizes.h"
329 #undef CACHE
330 		__bad_size();
331 	} else
332 		__bad_size();
333 	return 0;
334 }
335 
336 static int slab_early_init = 1;
337 
338 #define INDEX_AC index_of(sizeof(struct arraycache_init))
339 #define INDEX_L3 index_of(sizeof(struct kmem_list3))
340 
341 static void kmem_list3_init(struct kmem_list3 *parent)
342 {
343 	INIT_LIST_HEAD(&parent->slabs_full);
344 	INIT_LIST_HEAD(&parent->slabs_partial);
345 	INIT_LIST_HEAD(&parent->slabs_free);
346 	parent->shared = NULL;
347 	parent->alien = NULL;
348 	parent->colour_next = 0;
349 	spin_lock_init(&parent->list_lock);
350 	parent->free_objects = 0;
351 	parent->free_touched = 0;
352 }
353 
354 #define MAKE_LIST(cachep, listp, slab, nodeid)				\
355 	do {								\
356 		INIT_LIST_HEAD(listp);					\
357 		list_splice(&(cachep->nodelists[nodeid]->slab), listp);	\
358 	} while (0)
359 
360 #define	MAKE_ALL_LISTS(cachep, ptr, nodeid)				\
361 	do {								\
362 	MAKE_LIST((cachep), (&(ptr)->slabs_full), slabs_full, nodeid);	\
363 	MAKE_LIST((cachep), (&(ptr)->slabs_partial), slabs_partial, nodeid); \
364 	MAKE_LIST((cachep), (&(ptr)->slabs_free), slabs_free, nodeid);	\
365 	} while (0)
366 
367 /*
368  * struct kmem_cache
369  *
370  * manages a cache.
371  */
372 
373 struct kmem_cache {
374 /* 1) per-cpu data, touched during every alloc/free */
375 	struct array_cache *array[NR_CPUS];
376 /* 2) Cache tunables. Protected by cache_chain_mutex */
377 	unsigned int batchcount;
378 	unsigned int limit;
379 	unsigned int shared;
380 
381 	unsigned int buffer_size;
382 /* 3) touched by every alloc & free from the backend */
383 	struct kmem_list3 *nodelists[MAX_NUMNODES];
384 
385 	unsigned int flags;		/* constant flags */
386 	unsigned int num;		/* # of objs per slab */
387 
388 /* 4) cache_grow/shrink */
389 	/* order of pgs per slab (2^n) */
390 	unsigned int gfporder;
391 
392 	/* force GFP flags, e.g. GFP_DMA */
393 	gfp_t gfpflags;
394 
395 	size_t colour;			/* cache colouring range */
396 	unsigned int colour_off;	/* colour offset */
397 	struct kmem_cache *slabp_cache;
398 	unsigned int slab_size;
399 	unsigned int dflags;		/* dynamic flags */
400 
401 	/* constructor func */
402 	void (*ctor) (void *, struct kmem_cache *, unsigned long);
403 
404 	/* de-constructor func */
405 	void (*dtor) (void *, struct kmem_cache *, unsigned long);
406 
407 /* 5) cache creation/removal */
408 	const char *name;
409 	struct list_head next;
410 
411 /* 6) statistics */
412 #if STATS
413 	unsigned long num_active;
414 	unsigned long num_allocations;
415 	unsigned long high_mark;
416 	unsigned long grown;
417 	unsigned long reaped;
418 	unsigned long errors;
419 	unsigned long max_freeable;
420 	unsigned long node_allocs;
421 	unsigned long node_frees;
422 	unsigned long node_overflow;
423 	atomic_t allochit;
424 	atomic_t allocmiss;
425 	atomic_t freehit;
426 	atomic_t freemiss;
427 #endif
428 #if DEBUG
429 	/*
430 	 * If debugging is enabled, then the allocator can add additional
431 	 * fields and/or padding to every object. buffer_size contains the total
432 	 * object size including these internal fields, the following two
433 	 * variables contain the offset to the user object and its size.
434 	 */
435 	int obj_offset;
436 	int obj_size;
437 #endif
438 };
439 
440 #define CFLGS_OFF_SLAB		(0x80000000UL)
441 #define	OFF_SLAB(x)	((x)->flags & CFLGS_OFF_SLAB)
442 
443 #define BATCHREFILL_LIMIT	16
444 /*
445  * Optimization question: fewer reaps means less probability for unnessary
446  * cpucache drain/refill cycles.
447  *
448  * OTOH the cpuarrays can contain lots of objects,
449  * which could lock up otherwise freeable slabs.
450  */
451 #define REAPTIMEOUT_CPUC	(2*HZ)
452 #define REAPTIMEOUT_LIST3	(4*HZ)
453 
454 #if STATS
455 #define	STATS_INC_ACTIVE(x)	((x)->num_active++)
456 #define	STATS_DEC_ACTIVE(x)	((x)->num_active--)
457 #define	STATS_INC_ALLOCED(x)	((x)->num_allocations++)
458 #define	STATS_INC_GROWN(x)	((x)->grown++)
459 #define	STATS_INC_REAPED(x)	((x)->reaped++)
460 #define	STATS_SET_HIGH(x)						\
461 	do {								\
462 		if ((x)->num_active > (x)->high_mark)			\
463 			(x)->high_mark = (x)->num_active;		\
464 	} while (0)
465 #define	STATS_INC_ERR(x)	((x)->errors++)
466 #define	STATS_INC_NODEALLOCS(x)	((x)->node_allocs++)
467 #define	STATS_INC_NODEFREES(x)	((x)->node_frees++)
468 #define STATS_INC_ACOVERFLOW(x)   ((x)->node_overflow++)
469 #define	STATS_SET_FREEABLE(x, i)					\
470 	do {								\
471 		if ((x)->max_freeable < i)				\
472 			(x)->max_freeable = i;				\
473 	} while (0)
474 #define STATS_INC_ALLOCHIT(x)	atomic_inc(&(x)->allochit)
475 #define STATS_INC_ALLOCMISS(x)	atomic_inc(&(x)->allocmiss)
476 #define STATS_INC_FREEHIT(x)	atomic_inc(&(x)->freehit)
477 #define STATS_INC_FREEMISS(x)	atomic_inc(&(x)->freemiss)
478 #else
479 #define	STATS_INC_ACTIVE(x)	do { } while (0)
480 #define	STATS_DEC_ACTIVE(x)	do { } while (0)
481 #define	STATS_INC_ALLOCED(x)	do { } while (0)
482 #define	STATS_INC_GROWN(x)	do { } while (0)
483 #define	STATS_INC_REAPED(x)	do { } while (0)
484 #define	STATS_SET_HIGH(x)	do { } while (0)
485 #define	STATS_INC_ERR(x)	do { } while (0)
486 #define	STATS_INC_NODEALLOCS(x)	do { } while (0)
487 #define	STATS_INC_NODEFREES(x)	do { } while (0)
488 #define STATS_INC_ACOVERFLOW(x)   do { } while (0)
489 #define	STATS_SET_FREEABLE(x, i) do { } while (0)
490 #define STATS_INC_ALLOCHIT(x)	do { } while (0)
491 #define STATS_INC_ALLOCMISS(x)	do { } while (0)
492 #define STATS_INC_FREEHIT(x)	do { } while (0)
493 #define STATS_INC_FREEMISS(x)	do { } while (0)
494 #endif
495 
496 #if DEBUG
497 
498 /*
499  * memory layout of objects:
500  * 0		: objp
501  * 0 .. cachep->obj_offset - BYTES_PER_WORD - 1: padding. This ensures that
502  * 		the end of an object is aligned with the end of the real
503  * 		allocation. Catches writes behind the end of the allocation.
504  * cachep->obj_offset - BYTES_PER_WORD .. cachep->obj_offset - 1:
505  * 		redzone word.
506  * cachep->obj_offset: The real object.
507  * cachep->buffer_size - 2* BYTES_PER_WORD: redzone word [BYTES_PER_WORD long]
508  * cachep->buffer_size - 1* BYTES_PER_WORD: last caller address
509  *					[BYTES_PER_WORD long]
510  */
511 static int obj_offset(struct kmem_cache *cachep)
512 {
513 	return cachep->obj_offset;
514 }
515 
516 static int obj_size(struct kmem_cache *cachep)
517 {
518 	return cachep->obj_size;
519 }
520 
521 static unsigned long *dbg_redzone1(struct kmem_cache *cachep, void *objp)
522 {
523 	BUG_ON(!(cachep->flags & SLAB_RED_ZONE));
524 	return (unsigned long*) (objp+obj_offset(cachep)-BYTES_PER_WORD);
525 }
526 
527 static unsigned long *dbg_redzone2(struct kmem_cache *cachep, void *objp)
528 {
529 	BUG_ON(!(cachep->flags & SLAB_RED_ZONE));
530 	if (cachep->flags & SLAB_STORE_USER)
531 		return (unsigned long *)(objp + cachep->buffer_size -
532 					 2 * BYTES_PER_WORD);
533 	return (unsigned long *)(objp + cachep->buffer_size - BYTES_PER_WORD);
534 }
535 
536 static void **dbg_userword(struct kmem_cache *cachep, void *objp)
537 {
538 	BUG_ON(!(cachep->flags & SLAB_STORE_USER));
539 	return (void **)(objp + cachep->buffer_size - BYTES_PER_WORD);
540 }
541 
542 #else
543 
544 #define obj_offset(x)			0
545 #define obj_size(cachep)		(cachep->buffer_size)
546 #define dbg_redzone1(cachep, objp)	({BUG(); (unsigned long *)NULL;})
547 #define dbg_redzone2(cachep, objp)	({BUG(); (unsigned long *)NULL;})
548 #define dbg_userword(cachep, objp)	({BUG(); (void **)NULL;})
549 
550 #endif
551 
552 /*
553  * Maximum size of an obj (in 2^order pages) and absolute limit for the gfp
554  * order.
555  */
556 #if defined(CONFIG_LARGE_ALLOCS)
557 #define	MAX_OBJ_ORDER	13	/* up to 32Mb */
558 #define	MAX_GFP_ORDER	13	/* up to 32Mb */
559 #elif defined(CONFIG_MMU)
560 #define	MAX_OBJ_ORDER	5	/* 32 pages */
561 #define	MAX_GFP_ORDER	5	/* 32 pages */
562 #else
563 #define	MAX_OBJ_ORDER	8	/* up to 1Mb */
564 #define	MAX_GFP_ORDER	8	/* up to 1Mb */
565 #endif
566 
567 /*
568  * Do not go above this order unless 0 objects fit into the slab.
569  */
570 #define	BREAK_GFP_ORDER_HI	1
571 #define	BREAK_GFP_ORDER_LO	0
572 static int slab_break_gfp_order = BREAK_GFP_ORDER_LO;
573 
574 /*
575  * Functions for storing/retrieving the cachep and or slab from the page
576  * allocator.  These are used to find the slab an obj belongs to.  With kfree(),
577  * these are used to find the cache which an obj belongs to.
578  */
579 static inline void page_set_cache(struct page *page, struct kmem_cache *cache)
580 {
581 	page->lru.next = (struct list_head *)cache;
582 }
583 
584 static inline struct kmem_cache *page_get_cache(struct page *page)
585 {
586 	if (unlikely(PageCompound(page)))
587 		page = (struct page *)page_private(page);
588 	BUG_ON(!PageSlab(page));
589 	return (struct kmem_cache *)page->lru.next;
590 }
591 
592 static inline void page_set_slab(struct page *page, struct slab *slab)
593 {
594 	page->lru.prev = (struct list_head *)slab;
595 }
596 
597 static inline struct slab *page_get_slab(struct page *page)
598 {
599 	if (unlikely(PageCompound(page)))
600 		page = (struct page *)page_private(page);
601 	BUG_ON(!PageSlab(page));
602 	return (struct slab *)page->lru.prev;
603 }
604 
605 static inline struct kmem_cache *virt_to_cache(const void *obj)
606 {
607 	struct page *page = virt_to_page(obj);
608 	return page_get_cache(page);
609 }
610 
611 static inline struct slab *virt_to_slab(const void *obj)
612 {
613 	struct page *page = virt_to_page(obj);
614 	return page_get_slab(page);
615 }
616 
617 static inline void *index_to_obj(struct kmem_cache *cache, struct slab *slab,
618 				 unsigned int idx)
619 {
620 	return slab->s_mem + cache->buffer_size * idx;
621 }
622 
623 static inline unsigned int obj_to_index(struct kmem_cache *cache,
624 					struct slab *slab, void *obj)
625 {
626 	return (unsigned)(obj - slab->s_mem) / cache->buffer_size;
627 }
628 
629 /*
630  * These are the default caches for kmalloc. Custom caches can have other sizes.
631  */
632 struct cache_sizes malloc_sizes[] = {
633 #define CACHE(x) { .cs_size = (x) },
634 #include <linux/kmalloc_sizes.h>
635 	CACHE(ULONG_MAX)
636 #undef CACHE
637 };
638 EXPORT_SYMBOL(malloc_sizes);
639 
640 /* Must match cache_sizes above. Out of line to keep cache footprint low. */
641 struct cache_names {
642 	char *name;
643 	char *name_dma;
644 };
645 
646 static struct cache_names __initdata cache_names[] = {
647 #define CACHE(x) { .name = "size-" #x, .name_dma = "size-" #x "(DMA)" },
648 #include <linux/kmalloc_sizes.h>
649 	{NULL,}
650 #undef CACHE
651 };
652 
653 static struct arraycache_init initarray_cache __initdata =
654     { {0, BOOT_CPUCACHE_ENTRIES, 1, 0} };
655 static struct arraycache_init initarray_generic =
656     { {0, BOOT_CPUCACHE_ENTRIES, 1, 0} };
657 
658 /* internal cache of cache description objs */
659 static struct kmem_cache cache_cache = {
660 	.batchcount = 1,
661 	.limit = BOOT_CPUCACHE_ENTRIES,
662 	.shared = 1,
663 	.buffer_size = sizeof(struct kmem_cache),
664 	.name = "kmem_cache",
665 #if DEBUG
666 	.obj_size = sizeof(struct kmem_cache),
667 #endif
668 };
669 
670 /* Guard access to the cache-chain. */
671 static DEFINE_MUTEX(cache_chain_mutex);
672 static struct list_head cache_chain;
673 
674 /*
675  * vm_enough_memory() looks at this to determine how many slab-allocated pages
676  * are possibly freeable under pressure
677  *
678  * SLAB_RECLAIM_ACCOUNT turns this on per-slab
679  */
680 atomic_t slab_reclaim_pages;
681 
682 /*
683  * chicken and egg problem: delay the per-cpu array allocation
684  * until the general caches are up.
685  */
686 static enum {
687 	NONE,
688 	PARTIAL_AC,
689 	PARTIAL_L3,
690 	FULL
691 } g_cpucache_up;
692 
693 /*
694  * used by boot code to determine if it can use slab based allocator
695  */
696 int slab_is_available(void)
697 {
698 	return g_cpucache_up == FULL;
699 }
700 
701 static DEFINE_PER_CPU(struct work_struct, reap_work);
702 
703 static void free_block(struct kmem_cache *cachep, void **objpp, int len,
704 			int node);
705 static void enable_cpucache(struct kmem_cache *cachep);
706 static void cache_reap(void *unused);
707 static int __node_shrink(struct kmem_cache *cachep, int node);
708 
709 static inline struct array_cache *cpu_cache_get(struct kmem_cache *cachep)
710 {
711 	return cachep->array[smp_processor_id()];
712 }
713 
714 static inline struct kmem_cache *__find_general_cachep(size_t size,
715 							gfp_t gfpflags)
716 {
717 	struct cache_sizes *csizep = malloc_sizes;
718 
719 #if DEBUG
720 	/* This happens if someone tries to call
721 	 * kmem_cache_create(), or __kmalloc(), before
722 	 * the generic caches are initialized.
723 	 */
724 	BUG_ON(malloc_sizes[INDEX_AC].cs_cachep == NULL);
725 #endif
726 	while (size > csizep->cs_size)
727 		csizep++;
728 
729 	/*
730 	 * Really subtle: The last entry with cs->cs_size==ULONG_MAX
731 	 * has cs_{dma,}cachep==NULL. Thus no special case
732 	 * for large kmalloc calls required.
733 	 */
734 	if (unlikely(gfpflags & GFP_DMA))
735 		return csizep->cs_dmacachep;
736 	return csizep->cs_cachep;
737 }
738 
739 struct kmem_cache *kmem_find_general_cachep(size_t size, gfp_t gfpflags)
740 {
741 	return __find_general_cachep(size, gfpflags);
742 }
743 EXPORT_SYMBOL(kmem_find_general_cachep);
744 
745 static size_t slab_mgmt_size(size_t nr_objs, size_t align)
746 {
747 	return ALIGN(sizeof(struct slab)+nr_objs*sizeof(kmem_bufctl_t), align);
748 }
749 
750 /*
751  * Calculate the number of objects and left-over bytes for a given buffer size.
752  */
753 static void cache_estimate(unsigned long gfporder, size_t buffer_size,
754 			   size_t align, int flags, size_t *left_over,
755 			   unsigned int *num)
756 {
757 	int nr_objs;
758 	size_t mgmt_size;
759 	size_t slab_size = PAGE_SIZE << gfporder;
760 
761 	/*
762 	 * The slab management structure can be either off the slab or
763 	 * on it. For the latter case, the memory allocated for a
764 	 * slab is used for:
765 	 *
766 	 * - The struct slab
767 	 * - One kmem_bufctl_t for each object
768 	 * - Padding to respect alignment of @align
769 	 * - @buffer_size bytes for each object
770 	 *
771 	 * If the slab management structure is off the slab, then the
772 	 * alignment will already be calculated into the size. Because
773 	 * the slabs are all pages aligned, the objects will be at the
774 	 * correct alignment when allocated.
775 	 */
776 	if (flags & CFLGS_OFF_SLAB) {
777 		mgmt_size = 0;
778 		nr_objs = slab_size / buffer_size;
779 
780 		if (nr_objs > SLAB_LIMIT)
781 			nr_objs = SLAB_LIMIT;
782 	} else {
783 		/*
784 		 * Ignore padding for the initial guess. The padding
785 		 * is at most @align-1 bytes, and @buffer_size is at
786 		 * least @align. In the worst case, this result will
787 		 * be one greater than the number of objects that fit
788 		 * into the memory allocation when taking the padding
789 		 * into account.
790 		 */
791 		nr_objs = (slab_size - sizeof(struct slab)) /
792 			  (buffer_size + sizeof(kmem_bufctl_t));
793 
794 		/*
795 		 * This calculated number will be either the right
796 		 * amount, or one greater than what we want.
797 		 */
798 		if (slab_mgmt_size(nr_objs, align) + nr_objs*buffer_size
799 		       > slab_size)
800 			nr_objs--;
801 
802 		if (nr_objs > SLAB_LIMIT)
803 			nr_objs = SLAB_LIMIT;
804 
805 		mgmt_size = slab_mgmt_size(nr_objs, align);
806 	}
807 	*num = nr_objs;
808 	*left_over = slab_size - nr_objs*buffer_size - mgmt_size;
809 }
810 
811 #define slab_error(cachep, msg) __slab_error(__FUNCTION__, cachep, msg)
812 
813 static void __slab_error(const char *function, struct kmem_cache *cachep,
814 			char *msg)
815 {
816 	printk(KERN_ERR "slab error in %s(): cache `%s': %s\n",
817 	       function, cachep->name, msg);
818 	dump_stack();
819 }
820 
821 #ifdef CONFIG_NUMA
822 /*
823  * Special reaping functions for NUMA systems called from cache_reap().
824  * These take care of doing round robin flushing of alien caches (containing
825  * objects freed on different nodes from which they were allocated) and the
826  * flushing of remote pcps by calling drain_node_pages.
827  */
828 static DEFINE_PER_CPU(unsigned long, reap_node);
829 
830 static void init_reap_node(int cpu)
831 {
832 	int node;
833 
834 	node = next_node(cpu_to_node(cpu), node_online_map);
835 	if (node == MAX_NUMNODES)
836 		node = first_node(node_online_map);
837 
838 	__get_cpu_var(reap_node) = node;
839 }
840 
841 static void next_reap_node(void)
842 {
843 	int node = __get_cpu_var(reap_node);
844 
845 	/*
846 	 * Also drain per cpu pages on remote zones
847 	 */
848 	if (node != numa_node_id())
849 		drain_node_pages(node);
850 
851 	node = next_node(node, node_online_map);
852 	if (unlikely(node >= MAX_NUMNODES))
853 		node = first_node(node_online_map);
854 	__get_cpu_var(reap_node) = node;
855 }
856 
857 #else
858 #define init_reap_node(cpu) do { } while (0)
859 #define next_reap_node(void) do { } while (0)
860 #endif
861 
862 /*
863  * Initiate the reap timer running on the target CPU.  We run at around 1 to 2Hz
864  * via the workqueue/eventd.
865  * Add the CPU number into the expiration time to minimize the possibility of
866  * the CPUs getting into lockstep and contending for the global cache chain
867  * lock.
868  */
869 static void __devinit start_cpu_timer(int cpu)
870 {
871 	struct work_struct *reap_work = &per_cpu(reap_work, cpu);
872 
873 	/*
874 	 * When this gets called from do_initcalls via cpucache_init(),
875 	 * init_workqueues() has already run, so keventd will be setup
876 	 * at that time.
877 	 */
878 	if (keventd_up() && reap_work->func == NULL) {
879 		init_reap_node(cpu);
880 		INIT_WORK(reap_work, cache_reap, NULL);
881 		schedule_delayed_work_on(cpu, reap_work, HZ + 3 * cpu);
882 	}
883 }
884 
885 static struct array_cache *alloc_arraycache(int node, int entries,
886 					    int batchcount)
887 {
888 	int memsize = sizeof(void *) * entries + sizeof(struct array_cache);
889 	struct array_cache *nc = NULL;
890 
891 	nc = kmalloc_node(memsize, GFP_KERNEL, node);
892 	if (nc) {
893 		nc->avail = 0;
894 		nc->limit = entries;
895 		nc->batchcount = batchcount;
896 		nc->touched = 0;
897 		spin_lock_init(&nc->lock);
898 	}
899 	return nc;
900 }
901 
902 /*
903  * Transfer objects in one arraycache to another.
904  * Locking must be handled by the caller.
905  *
906  * Return the number of entries transferred.
907  */
908 static int transfer_objects(struct array_cache *to,
909 		struct array_cache *from, unsigned int max)
910 {
911 	/* Figure out how many entries to transfer */
912 	int nr = min(min(from->avail, max), to->limit - to->avail);
913 
914 	if (!nr)
915 		return 0;
916 
917 	memcpy(to->entry + to->avail, from->entry + from->avail -nr,
918 			sizeof(void *) *nr);
919 
920 	from->avail -= nr;
921 	to->avail += nr;
922 	to->touched = 1;
923 	return nr;
924 }
925 
926 #ifdef CONFIG_NUMA
927 static void *__cache_alloc_node(struct kmem_cache *, gfp_t, int);
928 static void *alternate_node_alloc(struct kmem_cache *, gfp_t);
929 
930 static struct array_cache **alloc_alien_cache(int node, int limit)
931 {
932 	struct array_cache **ac_ptr;
933 	int memsize = sizeof(void *) * MAX_NUMNODES;
934 	int i;
935 
936 	if (limit > 1)
937 		limit = 12;
938 	ac_ptr = kmalloc_node(memsize, GFP_KERNEL, node);
939 	if (ac_ptr) {
940 		for_each_node(i) {
941 			if (i == node || !node_online(i)) {
942 				ac_ptr[i] = NULL;
943 				continue;
944 			}
945 			ac_ptr[i] = alloc_arraycache(node, limit, 0xbaadf00d);
946 			if (!ac_ptr[i]) {
947 				for (i--; i <= 0; i--)
948 					kfree(ac_ptr[i]);
949 				kfree(ac_ptr);
950 				return NULL;
951 			}
952 		}
953 	}
954 	return ac_ptr;
955 }
956 
957 static void free_alien_cache(struct array_cache **ac_ptr)
958 {
959 	int i;
960 
961 	if (!ac_ptr)
962 		return;
963 	for_each_node(i)
964 	    kfree(ac_ptr[i]);
965 	kfree(ac_ptr);
966 }
967 
968 static void __drain_alien_cache(struct kmem_cache *cachep,
969 				struct array_cache *ac, int node)
970 {
971 	struct kmem_list3 *rl3 = cachep->nodelists[node];
972 
973 	if (ac->avail) {
974 		spin_lock(&rl3->list_lock);
975 		/*
976 		 * Stuff objects into the remote nodes shared array first.
977 		 * That way we could avoid the overhead of putting the objects
978 		 * into the free lists and getting them back later.
979 		 */
980 		if (rl3->shared)
981 			transfer_objects(rl3->shared, ac, ac->limit);
982 
983 		free_block(cachep, ac->entry, ac->avail, node);
984 		ac->avail = 0;
985 		spin_unlock(&rl3->list_lock);
986 	}
987 }
988 
989 /*
990  * Called from cache_reap() to regularly drain alien caches round robin.
991  */
992 static void reap_alien(struct kmem_cache *cachep, struct kmem_list3 *l3)
993 {
994 	int node = __get_cpu_var(reap_node);
995 
996 	if (l3->alien) {
997 		struct array_cache *ac = l3->alien[node];
998 
999 		if (ac && ac->avail && spin_trylock_irq(&ac->lock)) {
1000 			__drain_alien_cache(cachep, ac, node);
1001 			spin_unlock_irq(&ac->lock);
1002 		}
1003 	}
1004 }
1005 
1006 static void drain_alien_cache(struct kmem_cache *cachep,
1007 				struct array_cache **alien)
1008 {
1009 	int i = 0;
1010 	struct array_cache *ac;
1011 	unsigned long flags;
1012 
1013 	for_each_online_node(i) {
1014 		ac = alien[i];
1015 		if (ac) {
1016 			spin_lock_irqsave(&ac->lock, flags);
1017 			__drain_alien_cache(cachep, ac, i);
1018 			spin_unlock_irqrestore(&ac->lock, flags);
1019 		}
1020 	}
1021 }
1022 
1023 static inline int cache_free_alien(struct kmem_cache *cachep, void *objp)
1024 {
1025 	struct slab *slabp = virt_to_slab(objp);
1026 	int nodeid = slabp->nodeid;
1027 	struct kmem_list3 *l3;
1028 	struct array_cache *alien = NULL;
1029 
1030 	/*
1031 	 * Make sure we are not freeing a object from another node to the array
1032 	 * cache on this cpu.
1033 	 */
1034 	if (likely(slabp->nodeid == numa_node_id()))
1035 		return 0;
1036 
1037 	l3 = cachep->nodelists[numa_node_id()];
1038 	STATS_INC_NODEFREES(cachep);
1039 	if (l3->alien && l3->alien[nodeid]) {
1040 		alien = l3->alien[nodeid];
1041 		spin_lock(&alien->lock);
1042 		if (unlikely(alien->avail == alien->limit)) {
1043 			STATS_INC_ACOVERFLOW(cachep);
1044 			__drain_alien_cache(cachep, alien, nodeid);
1045 		}
1046 		alien->entry[alien->avail++] = objp;
1047 		spin_unlock(&alien->lock);
1048 	} else {
1049 		spin_lock(&(cachep->nodelists[nodeid])->list_lock);
1050 		free_block(cachep, &objp, 1, nodeid);
1051 		spin_unlock(&(cachep->nodelists[nodeid])->list_lock);
1052 	}
1053 	return 1;
1054 }
1055 
1056 #else
1057 
1058 #define drain_alien_cache(cachep, alien) do { } while (0)
1059 #define reap_alien(cachep, l3) do { } while (0)
1060 
1061 static inline struct array_cache **alloc_alien_cache(int node, int limit)
1062 {
1063 	return (struct array_cache **) 0x01020304ul;
1064 }
1065 
1066 static inline void free_alien_cache(struct array_cache **ac_ptr)
1067 {
1068 }
1069 
1070 static inline int cache_free_alien(struct kmem_cache *cachep, void *objp)
1071 {
1072 	return 0;
1073 }
1074 
1075 #endif
1076 
1077 static int __devinit cpuup_callback(struct notifier_block *nfb,
1078 				    unsigned long action, void *hcpu)
1079 {
1080 	long cpu = (long)hcpu;
1081 	struct kmem_cache *cachep;
1082 	struct kmem_list3 *l3 = NULL;
1083 	int node = cpu_to_node(cpu);
1084 	int memsize = sizeof(struct kmem_list3);
1085 
1086 	switch (action) {
1087 	case CPU_UP_PREPARE:
1088 		mutex_lock(&cache_chain_mutex);
1089 		/*
1090 		 * We need to do this right in the beginning since
1091 		 * alloc_arraycache's are going to use this list.
1092 		 * kmalloc_node allows us to add the slab to the right
1093 		 * kmem_list3 and not this cpu's kmem_list3
1094 		 */
1095 
1096 		list_for_each_entry(cachep, &cache_chain, next) {
1097 			/*
1098 			 * Set up the size64 kmemlist for cpu before we can
1099 			 * begin anything. Make sure some other cpu on this
1100 			 * node has not already allocated this
1101 			 */
1102 			if (!cachep->nodelists[node]) {
1103 				l3 = kmalloc_node(memsize, GFP_KERNEL, node);
1104 				if (!l3)
1105 					goto bad;
1106 				kmem_list3_init(l3);
1107 				l3->next_reap = jiffies + REAPTIMEOUT_LIST3 +
1108 				    ((unsigned long)cachep) % REAPTIMEOUT_LIST3;
1109 
1110 				/*
1111 				 * The l3s don't come and go as CPUs come and
1112 				 * go.  cache_chain_mutex is sufficient
1113 				 * protection here.
1114 				 */
1115 				cachep->nodelists[node] = l3;
1116 			}
1117 
1118 			spin_lock_irq(&cachep->nodelists[node]->list_lock);
1119 			cachep->nodelists[node]->free_limit =
1120 				(1 + nr_cpus_node(node)) *
1121 				cachep->batchcount + cachep->num;
1122 			spin_unlock_irq(&cachep->nodelists[node]->list_lock);
1123 		}
1124 
1125 		/*
1126 		 * Now we can go ahead with allocating the shared arrays and
1127 		 * array caches
1128 		 */
1129 		list_for_each_entry(cachep, &cache_chain, next) {
1130 			struct array_cache *nc;
1131 			struct array_cache *shared;
1132 			struct array_cache **alien;
1133 
1134 			nc = alloc_arraycache(node, cachep->limit,
1135 						cachep->batchcount);
1136 			if (!nc)
1137 				goto bad;
1138 			shared = alloc_arraycache(node,
1139 					cachep->shared * cachep->batchcount,
1140 					0xbaadf00d);
1141 			if (!shared)
1142 				goto bad;
1143 
1144 			alien = alloc_alien_cache(node, cachep->limit);
1145 			if (!alien)
1146 				goto bad;
1147 			cachep->array[cpu] = nc;
1148 			l3 = cachep->nodelists[node];
1149 			BUG_ON(!l3);
1150 
1151 			spin_lock_irq(&l3->list_lock);
1152 			if (!l3->shared) {
1153 				/*
1154 				 * We are serialised from CPU_DEAD or
1155 				 * CPU_UP_CANCELLED by the cpucontrol lock
1156 				 */
1157 				l3->shared = shared;
1158 				shared = NULL;
1159 			}
1160 #ifdef CONFIG_NUMA
1161 			if (!l3->alien) {
1162 				l3->alien = alien;
1163 				alien = NULL;
1164 			}
1165 #endif
1166 			spin_unlock_irq(&l3->list_lock);
1167 			kfree(shared);
1168 			free_alien_cache(alien);
1169 		}
1170 		mutex_unlock(&cache_chain_mutex);
1171 		break;
1172 	case CPU_ONLINE:
1173 		start_cpu_timer(cpu);
1174 		break;
1175 #ifdef CONFIG_HOTPLUG_CPU
1176 	case CPU_DEAD:
1177 		/*
1178 		 * Even if all the cpus of a node are down, we don't free the
1179 		 * kmem_list3 of any cache. This to avoid a race between
1180 		 * cpu_down, and a kmalloc allocation from another cpu for
1181 		 * memory from the node of the cpu going down.  The list3
1182 		 * structure is usually allocated from kmem_cache_create() and
1183 		 * gets destroyed at kmem_cache_destroy().
1184 		 */
1185 		/* fall thru */
1186 	case CPU_UP_CANCELED:
1187 		mutex_lock(&cache_chain_mutex);
1188 		list_for_each_entry(cachep, &cache_chain, next) {
1189 			struct array_cache *nc;
1190 			struct array_cache *shared;
1191 			struct array_cache **alien;
1192 			cpumask_t mask;
1193 
1194 			mask = node_to_cpumask(node);
1195 			/* cpu is dead; no one can alloc from it. */
1196 			nc = cachep->array[cpu];
1197 			cachep->array[cpu] = NULL;
1198 			l3 = cachep->nodelists[node];
1199 
1200 			if (!l3)
1201 				goto free_array_cache;
1202 
1203 			spin_lock_irq(&l3->list_lock);
1204 
1205 			/* Free limit for this kmem_list3 */
1206 			l3->free_limit -= cachep->batchcount;
1207 			if (nc)
1208 				free_block(cachep, nc->entry, nc->avail, node);
1209 
1210 			if (!cpus_empty(mask)) {
1211 				spin_unlock_irq(&l3->list_lock);
1212 				goto free_array_cache;
1213 			}
1214 
1215 			shared = l3->shared;
1216 			if (shared) {
1217 				free_block(cachep, l3->shared->entry,
1218 					   l3->shared->avail, node);
1219 				l3->shared = NULL;
1220 			}
1221 
1222 			alien = l3->alien;
1223 			l3->alien = NULL;
1224 
1225 			spin_unlock_irq(&l3->list_lock);
1226 
1227 			kfree(shared);
1228 			if (alien) {
1229 				drain_alien_cache(cachep, alien);
1230 				free_alien_cache(alien);
1231 			}
1232 free_array_cache:
1233 			kfree(nc);
1234 		}
1235 		/*
1236 		 * In the previous loop, all the objects were freed to
1237 		 * the respective cache's slabs,  now we can go ahead and
1238 		 * shrink each nodelist to its limit.
1239 		 */
1240 		list_for_each_entry(cachep, &cache_chain, next) {
1241 			l3 = cachep->nodelists[node];
1242 			if (!l3)
1243 				continue;
1244 			spin_lock_irq(&l3->list_lock);
1245 			/* free slabs belonging to this node */
1246 			__node_shrink(cachep, node);
1247 			spin_unlock_irq(&l3->list_lock);
1248 		}
1249 		mutex_unlock(&cache_chain_mutex);
1250 		break;
1251 #endif
1252 	}
1253 	return NOTIFY_OK;
1254 bad:
1255 	mutex_unlock(&cache_chain_mutex);
1256 	return NOTIFY_BAD;
1257 }
1258 
1259 static struct notifier_block __cpuinitdata cpucache_notifier = {
1260 	&cpuup_callback, NULL, 0
1261 };
1262 
1263 /*
1264  * swap the static kmem_list3 with kmalloced memory
1265  */
1266 static void init_list(struct kmem_cache *cachep, struct kmem_list3 *list,
1267 			int nodeid)
1268 {
1269 	struct kmem_list3 *ptr;
1270 
1271 	BUG_ON(cachep->nodelists[nodeid] != list);
1272 	ptr = kmalloc_node(sizeof(struct kmem_list3), GFP_KERNEL, nodeid);
1273 	BUG_ON(!ptr);
1274 
1275 	local_irq_disable();
1276 	memcpy(ptr, list, sizeof(struct kmem_list3));
1277 	MAKE_ALL_LISTS(cachep, ptr, nodeid);
1278 	cachep->nodelists[nodeid] = ptr;
1279 	local_irq_enable();
1280 }
1281 
1282 /*
1283  * Initialisation.  Called after the page allocator have been initialised and
1284  * before smp_init().
1285  */
1286 void __init kmem_cache_init(void)
1287 {
1288 	size_t left_over;
1289 	struct cache_sizes *sizes;
1290 	struct cache_names *names;
1291 	int i;
1292 	int order;
1293 
1294 	for (i = 0; i < NUM_INIT_LISTS; i++) {
1295 		kmem_list3_init(&initkmem_list3[i]);
1296 		if (i < MAX_NUMNODES)
1297 			cache_cache.nodelists[i] = NULL;
1298 	}
1299 
1300 	/*
1301 	 * Fragmentation resistance on low memory - only use bigger
1302 	 * page orders on machines with more than 32MB of memory.
1303 	 */
1304 	if (num_physpages > (32 << 20) >> PAGE_SHIFT)
1305 		slab_break_gfp_order = BREAK_GFP_ORDER_HI;
1306 
1307 	/* Bootstrap is tricky, because several objects are allocated
1308 	 * from caches that do not exist yet:
1309 	 * 1) initialize the cache_cache cache: it contains the struct
1310 	 *    kmem_cache structures of all caches, except cache_cache itself:
1311 	 *    cache_cache is statically allocated.
1312 	 *    Initially an __init data area is used for the head array and the
1313 	 *    kmem_list3 structures, it's replaced with a kmalloc allocated
1314 	 *    array at the end of the bootstrap.
1315 	 * 2) Create the first kmalloc cache.
1316 	 *    The struct kmem_cache for the new cache is allocated normally.
1317 	 *    An __init data area is used for the head array.
1318 	 * 3) Create the remaining kmalloc caches, with minimally sized
1319 	 *    head arrays.
1320 	 * 4) Replace the __init data head arrays for cache_cache and the first
1321 	 *    kmalloc cache with kmalloc allocated arrays.
1322 	 * 5) Replace the __init data for kmem_list3 for cache_cache and
1323 	 *    the other cache's with kmalloc allocated memory.
1324 	 * 6) Resize the head arrays of the kmalloc caches to their final sizes.
1325 	 */
1326 
1327 	/* 1) create the cache_cache */
1328 	INIT_LIST_HEAD(&cache_chain);
1329 	list_add(&cache_cache.next, &cache_chain);
1330 	cache_cache.colour_off = cache_line_size();
1331 	cache_cache.array[smp_processor_id()] = &initarray_cache.cache;
1332 	cache_cache.nodelists[numa_node_id()] = &initkmem_list3[CACHE_CACHE];
1333 
1334 	cache_cache.buffer_size = ALIGN(cache_cache.buffer_size,
1335 					cache_line_size());
1336 
1337 	for (order = 0; order < MAX_ORDER; order++) {
1338 		cache_estimate(order, cache_cache.buffer_size,
1339 			cache_line_size(), 0, &left_over, &cache_cache.num);
1340 		if (cache_cache.num)
1341 			break;
1342 	}
1343 	BUG_ON(!cache_cache.num);
1344 	cache_cache.gfporder = order;
1345 	cache_cache.colour = left_over / cache_cache.colour_off;
1346 	cache_cache.slab_size = ALIGN(cache_cache.num * sizeof(kmem_bufctl_t) +
1347 				      sizeof(struct slab), cache_line_size());
1348 
1349 	/* 2+3) create the kmalloc caches */
1350 	sizes = malloc_sizes;
1351 	names = cache_names;
1352 
1353 	/*
1354 	 * Initialize the caches that provide memory for the array cache and the
1355 	 * kmem_list3 structures first.  Without this, further allocations will
1356 	 * bug.
1357 	 */
1358 
1359 	sizes[INDEX_AC].cs_cachep = kmem_cache_create(names[INDEX_AC].name,
1360 					sizes[INDEX_AC].cs_size,
1361 					ARCH_KMALLOC_MINALIGN,
1362 					ARCH_KMALLOC_FLAGS|SLAB_PANIC,
1363 					NULL, NULL);
1364 
1365 	if (INDEX_AC != INDEX_L3) {
1366 		sizes[INDEX_L3].cs_cachep =
1367 			kmem_cache_create(names[INDEX_L3].name,
1368 				sizes[INDEX_L3].cs_size,
1369 				ARCH_KMALLOC_MINALIGN,
1370 				ARCH_KMALLOC_FLAGS|SLAB_PANIC,
1371 				NULL, NULL);
1372 	}
1373 
1374 	slab_early_init = 0;
1375 
1376 	while (sizes->cs_size != ULONG_MAX) {
1377 		/*
1378 		 * For performance, all the general caches are L1 aligned.
1379 		 * This should be particularly beneficial on SMP boxes, as it
1380 		 * eliminates "false sharing".
1381 		 * Note for systems short on memory removing the alignment will
1382 		 * allow tighter packing of the smaller caches.
1383 		 */
1384 		if (!sizes->cs_cachep) {
1385 			sizes->cs_cachep = kmem_cache_create(names->name,
1386 					sizes->cs_size,
1387 					ARCH_KMALLOC_MINALIGN,
1388 					ARCH_KMALLOC_FLAGS|SLAB_PANIC,
1389 					NULL, NULL);
1390 		}
1391 
1392 		sizes->cs_dmacachep = kmem_cache_create(names->name_dma,
1393 					sizes->cs_size,
1394 					ARCH_KMALLOC_MINALIGN,
1395 					ARCH_KMALLOC_FLAGS|SLAB_CACHE_DMA|
1396 						SLAB_PANIC,
1397 					NULL, NULL);
1398 		sizes++;
1399 		names++;
1400 	}
1401 	/* 4) Replace the bootstrap head arrays */
1402 	{
1403 		void *ptr;
1404 
1405 		ptr = kmalloc(sizeof(struct arraycache_init), GFP_KERNEL);
1406 
1407 		local_irq_disable();
1408 		BUG_ON(cpu_cache_get(&cache_cache) != &initarray_cache.cache);
1409 		memcpy(ptr, cpu_cache_get(&cache_cache),
1410 		       sizeof(struct arraycache_init));
1411 		cache_cache.array[smp_processor_id()] = ptr;
1412 		local_irq_enable();
1413 
1414 		ptr = kmalloc(sizeof(struct arraycache_init), GFP_KERNEL);
1415 
1416 		local_irq_disable();
1417 		BUG_ON(cpu_cache_get(malloc_sizes[INDEX_AC].cs_cachep)
1418 		       != &initarray_generic.cache);
1419 		memcpy(ptr, cpu_cache_get(malloc_sizes[INDEX_AC].cs_cachep),
1420 		       sizeof(struct arraycache_init));
1421 		malloc_sizes[INDEX_AC].cs_cachep->array[smp_processor_id()] =
1422 		    ptr;
1423 		local_irq_enable();
1424 	}
1425 	/* 5) Replace the bootstrap kmem_list3's */
1426 	{
1427 		int node;
1428 		/* Replace the static kmem_list3 structures for the boot cpu */
1429 		init_list(&cache_cache, &initkmem_list3[CACHE_CACHE],
1430 			  numa_node_id());
1431 
1432 		for_each_online_node(node) {
1433 			init_list(malloc_sizes[INDEX_AC].cs_cachep,
1434 				  &initkmem_list3[SIZE_AC + node], node);
1435 
1436 			if (INDEX_AC != INDEX_L3) {
1437 				init_list(malloc_sizes[INDEX_L3].cs_cachep,
1438 					  &initkmem_list3[SIZE_L3 + node],
1439 					  node);
1440 			}
1441 		}
1442 	}
1443 
1444 	/* 6) resize the head arrays to their final sizes */
1445 	{
1446 		struct kmem_cache *cachep;
1447 		mutex_lock(&cache_chain_mutex);
1448 		list_for_each_entry(cachep, &cache_chain, next)
1449 			enable_cpucache(cachep);
1450 		mutex_unlock(&cache_chain_mutex);
1451 	}
1452 
1453 	/* Done! */
1454 	g_cpucache_up = FULL;
1455 
1456 	/*
1457 	 * Register a cpu startup notifier callback that initializes
1458 	 * cpu_cache_get for all new cpus
1459 	 */
1460 	register_cpu_notifier(&cpucache_notifier);
1461 
1462 	/*
1463 	 * The reap timers are started later, with a module init call: That part
1464 	 * of the kernel is not yet operational.
1465 	 */
1466 }
1467 
1468 static int __init cpucache_init(void)
1469 {
1470 	int cpu;
1471 
1472 	/*
1473 	 * Register the timers that return unneeded pages to the page allocator
1474 	 */
1475 	for_each_online_cpu(cpu)
1476 		start_cpu_timer(cpu);
1477 	return 0;
1478 }
1479 __initcall(cpucache_init);
1480 
1481 /*
1482  * Interface to system's page allocator. No need to hold the cache-lock.
1483  *
1484  * If we requested dmaable memory, we will get it. Even if we
1485  * did not request dmaable memory, we might get it, but that
1486  * would be relatively rare and ignorable.
1487  */
1488 static void *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, int nodeid)
1489 {
1490 	struct page *page;
1491 	int nr_pages;
1492 	int i;
1493 
1494 #ifndef CONFIG_MMU
1495 	/*
1496 	 * Nommu uses slab's for process anonymous memory allocations, and thus
1497 	 * requires __GFP_COMP to properly refcount higher order allocations
1498 	 */
1499 	flags |= __GFP_COMP;
1500 #endif
1501 	flags |= cachep->gfpflags;
1502 
1503 	page = alloc_pages_node(nodeid, flags, cachep->gfporder);
1504 	if (!page)
1505 		return NULL;
1506 
1507 	nr_pages = (1 << cachep->gfporder);
1508 	if (cachep->flags & SLAB_RECLAIM_ACCOUNT)
1509 		atomic_add(nr_pages, &slab_reclaim_pages);
1510 	add_page_state(nr_slab, nr_pages);
1511 	for (i = 0; i < nr_pages; i++)
1512 		__SetPageSlab(page + i);
1513 	return page_address(page);
1514 }
1515 
1516 /*
1517  * Interface to system's page release.
1518  */
1519 static void kmem_freepages(struct kmem_cache *cachep, void *addr)
1520 {
1521 	unsigned long i = (1 << cachep->gfporder);
1522 	struct page *page = virt_to_page(addr);
1523 	const unsigned long nr_freed = i;
1524 
1525 	while (i--) {
1526 		BUG_ON(!PageSlab(page));
1527 		__ClearPageSlab(page);
1528 		page++;
1529 	}
1530 	sub_page_state(nr_slab, nr_freed);
1531 	if (current->reclaim_state)
1532 		current->reclaim_state->reclaimed_slab += nr_freed;
1533 	free_pages((unsigned long)addr, cachep->gfporder);
1534 	if (cachep->flags & SLAB_RECLAIM_ACCOUNT)
1535 		atomic_sub(1 << cachep->gfporder, &slab_reclaim_pages);
1536 }
1537 
1538 static void kmem_rcu_free(struct rcu_head *head)
1539 {
1540 	struct slab_rcu *slab_rcu = (struct slab_rcu *)head;
1541 	struct kmem_cache *cachep = slab_rcu->cachep;
1542 
1543 	kmem_freepages(cachep, slab_rcu->addr);
1544 	if (OFF_SLAB(cachep))
1545 		kmem_cache_free(cachep->slabp_cache, slab_rcu);
1546 }
1547 
1548 #if DEBUG
1549 
1550 #ifdef CONFIG_DEBUG_PAGEALLOC
1551 static void store_stackinfo(struct kmem_cache *cachep, unsigned long *addr,
1552 			    unsigned long caller)
1553 {
1554 	int size = obj_size(cachep);
1555 
1556 	addr = (unsigned long *)&((char *)addr)[obj_offset(cachep)];
1557 
1558 	if (size < 5 * sizeof(unsigned long))
1559 		return;
1560 
1561 	*addr++ = 0x12345678;
1562 	*addr++ = caller;
1563 	*addr++ = smp_processor_id();
1564 	size -= 3 * sizeof(unsigned long);
1565 	{
1566 		unsigned long *sptr = &caller;
1567 		unsigned long svalue;
1568 
1569 		while (!kstack_end(sptr)) {
1570 			svalue = *sptr++;
1571 			if (kernel_text_address(svalue)) {
1572 				*addr++ = svalue;
1573 				size -= sizeof(unsigned long);
1574 				if (size <= sizeof(unsigned long))
1575 					break;
1576 			}
1577 		}
1578 
1579 	}
1580 	*addr++ = 0x87654321;
1581 }
1582 #endif
1583 
1584 static void poison_obj(struct kmem_cache *cachep, void *addr, unsigned char val)
1585 {
1586 	int size = obj_size(cachep);
1587 	addr = &((char *)addr)[obj_offset(cachep)];
1588 
1589 	memset(addr, val, size);
1590 	*(unsigned char *)(addr + size - 1) = POISON_END;
1591 }
1592 
1593 static void dump_line(char *data, int offset, int limit)
1594 {
1595 	int i;
1596 	printk(KERN_ERR "%03x:", offset);
1597 	for (i = 0; i < limit; i++)
1598 		printk(" %02x", (unsigned char)data[offset + i]);
1599 	printk("\n");
1600 }
1601 #endif
1602 
1603 #if DEBUG
1604 
1605 static void print_objinfo(struct kmem_cache *cachep, void *objp, int lines)
1606 {
1607 	int i, size;
1608 	char *realobj;
1609 
1610 	if (cachep->flags & SLAB_RED_ZONE) {
1611 		printk(KERN_ERR "Redzone: 0x%lx/0x%lx.\n",
1612 			*dbg_redzone1(cachep, objp),
1613 			*dbg_redzone2(cachep, objp));
1614 	}
1615 
1616 	if (cachep->flags & SLAB_STORE_USER) {
1617 		printk(KERN_ERR "Last user: [<%p>]",
1618 			*dbg_userword(cachep, objp));
1619 		print_symbol("(%s)",
1620 				(unsigned long)*dbg_userword(cachep, objp));
1621 		printk("\n");
1622 	}
1623 	realobj = (char *)objp + obj_offset(cachep);
1624 	size = obj_size(cachep);
1625 	for (i = 0; i < size && lines; i += 16, lines--) {
1626 		int limit;
1627 		limit = 16;
1628 		if (i + limit > size)
1629 			limit = size - i;
1630 		dump_line(realobj, i, limit);
1631 	}
1632 }
1633 
1634 static void check_poison_obj(struct kmem_cache *cachep, void *objp)
1635 {
1636 	char *realobj;
1637 	int size, i;
1638 	int lines = 0;
1639 
1640 	realobj = (char *)objp + obj_offset(cachep);
1641 	size = obj_size(cachep);
1642 
1643 	for (i = 0; i < size; i++) {
1644 		char exp = POISON_FREE;
1645 		if (i == size - 1)
1646 			exp = POISON_END;
1647 		if (realobj[i] != exp) {
1648 			int limit;
1649 			/* Mismatch ! */
1650 			/* Print header */
1651 			if (lines == 0) {
1652 				printk(KERN_ERR
1653 					"Slab corruption: start=%p, len=%d\n",
1654 					realobj, size);
1655 				print_objinfo(cachep, objp, 0);
1656 			}
1657 			/* Hexdump the affected line */
1658 			i = (i / 16) * 16;
1659 			limit = 16;
1660 			if (i + limit > size)
1661 				limit = size - i;
1662 			dump_line(realobj, i, limit);
1663 			i += 16;
1664 			lines++;
1665 			/* Limit to 5 lines */
1666 			if (lines > 5)
1667 				break;
1668 		}
1669 	}
1670 	if (lines != 0) {
1671 		/* Print some data about the neighboring objects, if they
1672 		 * exist:
1673 		 */
1674 		struct slab *slabp = virt_to_slab(objp);
1675 		unsigned int objnr;
1676 
1677 		objnr = obj_to_index(cachep, slabp, objp);
1678 		if (objnr) {
1679 			objp = index_to_obj(cachep, slabp, objnr - 1);
1680 			realobj = (char *)objp + obj_offset(cachep);
1681 			printk(KERN_ERR "Prev obj: start=%p, len=%d\n",
1682 			       realobj, size);
1683 			print_objinfo(cachep, objp, 2);
1684 		}
1685 		if (objnr + 1 < cachep->num) {
1686 			objp = index_to_obj(cachep, slabp, objnr + 1);
1687 			realobj = (char *)objp + obj_offset(cachep);
1688 			printk(KERN_ERR "Next obj: start=%p, len=%d\n",
1689 			       realobj, size);
1690 			print_objinfo(cachep, objp, 2);
1691 		}
1692 	}
1693 }
1694 #endif
1695 
1696 #if DEBUG
1697 /**
1698  * slab_destroy_objs - destroy a slab and its objects
1699  * @cachep: cache pointer being destroyed
1700  * @slabp: slab pointer being destroyed
1701  *
1702  * Call the registered destructor for each object in a slab that is being
1703  * destroyed.
1704  */
1705 static void slab_destroy_objs(struct kmem_cache *cachep, struct slab *slabp)
1706 {
1707 	int i;
1708 	for (i = 0; i < cachep->num; i++) {
1709 		void *objp = index_to_obj(cachep, slabp, i);
1710 
1711 		if (cachep->flags & SLAB_POISON) {
1712 #ifdef CONFIG_DEBUG_PAGEALLOC
1713 			if (cachep->buffer_size % PAGE_SIZE == 0 &&
1714 					OFF_SLAB(cachep))
1715 				kernel_map_pages(virt_to_page(objp),
1716 					cachep->buffer_size / PAGE_SIZE, 1);
1717 			else
1718 				check_poison_obj(cachep, objp);
1719 #else
1720 			check_poison_obj(cachep, objp);
1721 #endif
1722 		}
1723 		if (cachep->flags & SLAB_RED_ZONE) {
1724 			if (*dbg_redzone1(cachep, objp) != RED_INACTIVE)
1725 				slab_error(cachep, "start of a freed object "
1726 					   "was overwritten");
1727 			if (*dbg_redzone2(cachep, objp) != RED_INACTIVE)
1728 				slab_error(cachep, "end of a freed object "
1729 					   "was overwritten");
1730 		}
1731 		if (cachep->dtor && !(cachep->flags & SLAB_POISON))
1732 			(cachep->dtor) (objp + obj_offset(cachep), cachep, 0);
1733 	}
1734 }
1735 #else
1736 static void slab_destroy_objs(struct kmem_cache *cachep, struct slab *slabp)
1737 {
1738 	if (cachep->dtor) {
1739 		int i;
1740 		for (i = 0; i < cachep->num; i++) {
1741 			void *objp = index_to_obj(cachep, slabp, i);
1742 			(cachep->dtor) (objp, cachep, 0);
1743 		}
1744 	}
1745 }
1746 #endif
1747 
1748 /**
1749  * slab_destroy - destroy and release all objects in a slab
1750  * @cachep: cache pointer being destroyed
1751  * @slabp: slab pointer being destroyed
1752  *
1753  * Destroy all the objs in a slab, and release the mem back to the system.
1754  * Before calling the slab must have been unlinked from the cache.  The
1755  * cache-lock is not held/needed.
1756  */
1757 static void slab_destroy(struct kmem_cache *cachep, struct slab *slabp)
1758 {
1759 	void *addr = slabp->s_mem - slabp->colouroff;
1760 
1761 	slab_destroy_objs(cachep, slabp);
1762 	if (unlikely(cachep->flags & SLAB_DESTROY_BY_RCU)) {
1763 		struct slab_rcu *slab_rcu;
1764 
1765 		slab_rcu = (struct slab_rcu *)slabp;
1766 		slab_rcu->cachep = cachep;
1767 		slab_rcu->addr = addr;
1768 		call_rcu(&slab_rcu->head, kmem_rcu_free);
1769 	} else {
1770 		kmem_freepages(cachep, addr);
1771 		if (OFF_SLAB(cachep))
1772 			kmem_cache_free(cachep->slabp_cache, slabp);
1773 	}
1774 }
1775 
1776 /*
1777  * For setting up all the kmem_list3s for cache whose buffer_size is same as
1778  * size of kmem_list3.
1779  */
1780 static void set_up_list3s(struct kmem_cache *cachep, int index)
1781 {
1782 	int node;
1783 
1784 	for_each_online_node(node) {
1785 		cachep->nodelists[node] = &initkmem_list3[index + node];
1786 		cachep->nodelists[node]->next_reap = jiffies +
1787 		    REAPTIMEOUT_LIST3 +
1788 		    ((unsigned long)cachep) % REAPTIMEOUT_LIST3;
1789 	}
1790 }
1791 
1792 /**
1793  * calculate_slab_order - calculate size (page order) of slabs
1794  * @cachep: pointer to the cache that is being created
1795  * @size: size of objects to be created in this cache.
1796  * @align: required alignment for the objects.
1797  * @flags: slab allocation flags
1798  *
1799  * Also calculates the number of objects per slab.
1800  *
1801  * This could be made much more intelligent.  For now, try to avoid using
1802  * high order pages for slabs.  When the gfp() functions are more friendly
1803  * towards high-order requests, this should be changed.
1804  */
1805 static size_t calculate_slab_order(struct kmem_cache *cachep,
1806 			size_t size, size_t align, unsigned long flags)
1807 {
1808 	unsigned long offslab_limit;
1809 	size_t left_over = 0;
1810 	int gfporder;
1811 
1812 	for (gfporder = 0; gfporder <= MAX_GFP_ORDER; gfporder++) {
1813 		unsigned int num;
1814 		size_t remainder;
1815 
1816 		cache_estimate(gfporder, size, align, flags, &remainder, &num);
1817 		if (!num)
1818 			continue;
1819 
1820 		if (flags & CFLGS_OFF_SLAB) {
1821 			/*
1822 			 * Max number of objs-per-slab for caches which
1823 			 * use off-slab slabs. Needed to avoid a possible
1824 			 * looping condition in cache_grow().
1825 			 */
1826 			offslab_limit = size - sizeof(struct slab);
1827 			offslab_limit /= sizeof(kmem_bufctl_t);
1828 
1829  			if (num > offslab_limit)
1830 				break;
1831 		}
1832 
1833 		/* Found something acceptable - save it away */
1834 		cachep->num = num;
1835 		cachep->gfporder = gfporder;
1836 		left_over = remainder;
1837 
1838 		/*
1839 		 * A VFS-reclaimable slab tends to have most allocations
1840 		 * as GFP_NOFS and we really don't want to have to be allocating
1841 		 * higher-order pages when we are unable to shrink dcache.
1842 		 */
1843 		if (flags & SLAB_RECLAIM_ACCOUNT)
1844 			break;
1845 
1846 		/*
1847 		 * Large number of objects is good, but very large slabs are
1848 		 * currently bad for the gfp()s.
1849 		 */
1850 		if (gfporder >= slab_break_gfp_order)
1851 			break;
1852 
1853 		/*
1854 		 * Acceptable internal fragmentation?
1855 		 */
1856 		if (left_over * 8 <= (PAGE_SIZE << gfporder))
1857 			break;
1858 	}
1859 	return left_over;
1860 }
1861 
1862 static void setup_cpu_cache(struct kmem_cache *cachep)
1863 {
1864 	if (g_cpucache_up == FULL) {
1865 		enable_cpucache(cachep);
1866 		return;
1867 	}
1868 	if (g_cpucache_up == NONE) {
1869 		/*
1870 		 * Note: the first kmem_cache_create must create the cache
1871 		 * that's used by kmalloc(24), otherwise the creation of
1872 		 * further caches will BUG().
1873 		 */
1874 		cachep->array[smp_processor_id()] = &initarray_generic.cache;
1875 
1876 		/*
1877 		 * If the cache that's used by kmalloc(sizeof(kmem_list3)) is
1878 		 * the first cache, then we need to set up all its list3s,
1879 		 * otherwise the creation of further caches will BUG().
1880 		 */
1881 		set_up_list3s(cachep, SIZE_AC);
1882 		if (INDEX_AC == INDEX_L3)
1883 			g_cpucache_up = PARTIAL_L3;
1884 		else
1885 			g_cpucache_up = PARTIAL_AC;
1886 	} else {
1887 		cachep->array[smp_processor_id()] =
1888 			kmalloc(sizeof(struct arraycache_init), GFP_KERNEL);
1889 
1890 		if (g_cpucache_up == PARTIAL_AC) {
1891 			set_up_list3s(cachep, SIZE_L3);
1892 			g_cpucache_up = PARTIAL_L3;
1893 		} else {
1894 			int node;
1895 			for_each_online_node(node) {
1896 				cachep->nodelists[node] =
1897 				    kmalloc_node(sizeof(struct kmem_list3),
1898 						GFP_KERNEL, node);
1899 				BUG_ON(!cachep->nodelists[node]);
1900 				kmem_list3_init(cachep->nodelists[node]);
1901 			}
1902 		}
1903 	}
1904 	cachep->nodelists[numa_node_id()]->next_reap =
1905 			jiffies + REAPTIMEOUT_LIST3 +
1906 			((unsigned long)cachep) % REAPTIMEOUT_LIST3;
1907 
1908 	cpu_cache_get(cachep)->avail = 0;
1909 	cpu_cache_get(cachep)->limit = BOOT_CPUCACHE_ENTRIES;
1910 	cpu_cache_get(cachep)->batchcount = 1;
1911 	cpu_cache_get(cachep)->touched = 0;
1912 	cachep->batchcount = 1;
1913 	cachep->limit = BOOT_CPUCACHE_ENTRIES;
1914 }
1915 
1916 /**
1917  * kmem_cache_create - Create a cache.
1918  * @name: A string which is used in /proc/slabinfo to identify this cache.
1919  * @size: The size of objects to be created in this cache.
1920  * @align: The required alignment for the objects.
1921  * @flags: SLAB flags
1922  * @ctor: A constructor for the objects.
1923  * @dtor: A destructor for the objects.
1924  *
1925  * Returns a ptr to the cache on success, NULL on failure.
1926  * Cannot be called within a int, but can be interrupted.
1927  * The @ctor is run when new pages are allocated by the cache
1928  * and the @dtor is run before the pages are handed back.
1929  *
1930  * @name must be valid until the cache is destroyed. This implies that
1931  * the module calling this has to destroy the cache before getting unloaded.
1932  *
1933  * The flags are
1934  *
1935  * %SLAB_POISON - Poison the slab with a known test pattern (a5a5a5a5)
1936  * to catch references to uninitialised memory.
1937  *
1938  * %SLAB_RED_ZONE - Insert `Red' zones around the allocated memory to check
1939  * for buffer overruns.
1940  *
1941  * %SLAB_HWCACHE_ALIGN - Align the objects in this cache to a hardware
1942  * cacheline.  This can be beneficial if you're counting cycles as closely
1943  * as davem.
1944  */
1945 struct kmem_cache *
1946 kmem_cache_create (const char *name, size_t size, size_t align,
1947 	unsigned long flags,
1948 	void (*ctor)(void*, struct kmem_cache *, unsigned long),
1949 	void (*dtor)(void*, struct kmem_cache *, unsigned long))
1950 {
1951 	size_t left_over, slab_size, ralign;
1952 	struct kmem_cache *cachep = NULL, *pc;
1953 
1954 	/*
1955 	 * Sanity checks... these are all serious usage bugs.
1956 	 */
1957 	if (!name || in_interrupt() || (size < BYTES_PER_WORD) ||
1958 	    (size > (1 << MAX_OBJ_ORDER) * PAGE_SIZE) || (dtor && !ctor)) {
1959 		printk(KERN_ERR "%s: Early error in slab %s\n", __FUNCTION__,
1960 				name);
1961 		BUG();
1962 	}
1963 
1964 	/*
1965 	 * Prevent CPUs from coming and going.
1966 	 * lock_cpu_hotplug() nests outside cache_chain_mutex
1967 	 */
1968 	lock_cpu_hotplug();
1969 
1970 	mutex_lock(&cache_chain_mutex);
1971 
1972 	list_for_each_entry(pc, &cache_chain, next) {
1973 		mm_segment_t old_fs = get_fs();
1974 		char tmp;
1975 		int res;
1976 
1977 		/*
1978 		 * This happens when the module gets unloaded and doesn't
1979 		 * destroy its slab cache and no-one else reuses the vmalloc
1980 		 * area of the module.  Print a warning.
1981 		 */
1982 		set_fs(KERNEL_DS);
1983 		res = __get_user(tmp, pc->name);
1984 		set_fs(old_fs);
1985 		if (res) {
1986 			printk("SLAB: cache with size %d has lost its name\n",
1987 			       pc->buffer_size);
1988 			continue;
1989 		}
1990 
1991 		if (!strcmp(pc->name, name)) {
1992 			printk("kmem_cache_create: duplicate cache %s\n", name);
1993 			dump_stack();
1994 			goto oops;
1995 		}
1996 	}
1997 
1998 #if DEBUG
1999 	WARN_ON(strchr(name, ' '));	/* It confuses parsers */
2000 	if ((flags & SLAB_DEBUG_INITIAL) && !ctor) {
2001 		/* No constructor, but inital state check requested */
2002 		printk(KERN_ERR "%s: No con, but init state check "
2003 		       "requested - %s\n", __FUNCTION__, name);
2004 		flags &= ~SLAB_DEBUG_INITIAL;
2005 	}
2006 #if FORCED_DEBUG
2007 	/*
2008 	 * Enable redzoning and last user accounting, except for caches with
2009 	 * large objects, if the increased size would increase the object size
2010 	 * above the next power of two: caches with object sizes just above a
2011 	 * power of two have a significant amount of internal fragmentation.
2012 	 */
2013 	if (size < 4096 || fls(size - 1) == fls(size-1 + 3 * BYTES_PER_WORD))
2014 		flags |= SLAB_RED_ZONE | SLAB_STORE_USER;
2015 	if (!(flags & SLAB_DESTROY_BY_RCU))
2016 		flags |= SLAB_POISON;
2017 #endif
2018 	if (flags & SLAB_DESTROY_BY_RCU)
2019 		BUG_ON(flags & SLAB_POISON);
2020 #endif
2021 	if (flags & SLAB_DESTROY_BY_RCU)
2022 		BUG_ON(dtor);
2023 
2024 	/*
2025 	 * Always checks flags, a caller might be expecting debug support which
2026 	 * isn't available.
2027 	 */
2028 	BUG_ON(flags & ~CREATE_MASK);
2029 
2030 	/*
2031 	 * Check that size is in terms of words.  This is needed to avoid
2032 	 * unaligned accesses for some archs when redzoning is used, and makes
2033 	 * sure any on-slab bufctl's are also correctly aligned.
2034 	 */
2035 	if (size & (BYTES_PER_WORD - 1)) {
2036 		size += (BYTES_PER_WORD - 1);
2037 		size &= ~(BYTES_PER_WORD - 1);
2038 	}
2039 
2040 	/* calculate the final buffer alignment: */
2041 
2042 	/* 1) arch recommendation: can be overridden for debug */
2043 	if (flags & SLAB_HWCACHE_ALIGN) {
2044 		/*
2045 		 * Default alignment: as specified by the arch code.  Except if
2046 		 * an object is really small, then squeeze multiple objects into
2047 		 * one cacheline.
2048 		 */
2049 		ralign = cache_line_size();
2050 		while (size <= ralign / 2)
2051 			ralign /= 2;
2052 	} else {
2053 		ralign = BYTES_PER_WORD;
2054 	}
2055 	/* 2) arch mandated alignment: disables debug if necessary */
2056 	if (ralign < ARCH_SLAB_MINALIGN) {
2057 		ralign = ARCH_SLAB_MINALIGN;
2058 		if (ralign > BYTES_PER_WORD)
2059 			flags &= ~(SLAB_RED_ZONE | SLAB_STORE_USER);
2060 	}
2061 	/* 3) caller mandated alignment: disables debug if necessary */
2062 	if (ralign < align) {
2063 		ralign = align;
2064 		if (ralign > BYTES_PER_WORD)
2065 			flags &= ~(SLAB_RED_ZONE | SLAB_STORE_USER);
2066 	}
2067 	/*
2068 	 * 4) Store it. Note that the debug code below can reduce
2069 	 *    the alignment to BYTES_PER_WORD.
2070 	 */
2071 	align = ralign;
2072 
2073 	/* Get cache's description obj. */
2074 	cachep = kmem_cache_zalloc(&cache_cache, SLAB_KERNEL);
2075 	if (!cachep)
2076 		goto oops;
2077 
2078 #if DEBUG
2079 	cachep->obj_size = size;
2080 
2081 	if (flags & SLAB_RED_ZONE) {
2082 		/* redzoning only works with word aligned caches */
2083 		align = BYTES_PER_WORD;
2084 
2085 		/* add space for red zone words */
2086 		cachep->obj_offset += BYTES_PER_WORD;
2087 		size += 2 * BYTES_PER_WORD;
2088 	}
2089 	if (flags & SLAB_STORE_USER) {
2090 		/* user store requires word alignment and
2091 		 * one word storage behind the end of the real
2092 		 * object.
2093 		 */
2094 		align = BYTES_PER_WORD;
2095 		size += BYTES_PER_WORD;
2096 	}
2097 #if FORCED_DEBUG && defined(CONFIG_DEBUG_PAGEALLOC)
2098 	if (size >= malloc_sizes[INDEX_L3 + 1].cs_size
2099 	    && cachep->obj_size > cache_line_size() && size < PAGE_SIZE) {
2100 		cachep->obj_offset += PAGE_SIZE - size;
2101 		size = PAGE_SIZE;
2102 	}
2103 #endif
2104 #endif
2105 
2106 	/*
2107 	 * Determine if the slab management is 'on' or 'off' slab.
2108 	 * (bootstrapping cannot cope with offslab caches so don't do
2109 	 * it too early on.)
2110 	 */
2111 	if ((size >= (PAGE_SIZE >> 3)) && !slab_early_init)
2112 		/*
2113 		 * Size is large, assume best to place the slab management obj
2114 		 * off-slab (should allow better packing of objs).
2115 		 */
2116 		flags |= CFLGS_OFF_SLAB;
2117 
2118 	size = ALIGN(size, align);
2119 
2120 	left_over = calculate_slab_order(cachep, size, align, flags);
2121 
2122 	if (!cachep->num) {
2123 		printk("kmem_cache_create: couldn't create cache %s.\n", name);
2124 		kmem_cache_free(&cache_cache, cachep);
2125 		cachep = NULL;
2126 		goto oops;
2127 	}
2128 	slab_size = ALIGN(cachep->num * sizeof(kmem_bufctl_t)
2129 			  + sizeof(struct slab), align);
2130 
2131 	/*
2132 	 * If the slab has been placed off-slab, and we have enough space then
2133 	 * move it on-slab. This is at the expense of any extra colouring.
2134 	 */
2135 	if (flags & CFLGS_OFF_SLAB && left_over >= slab_size) {
2136 		flags &= ~CFLGS_OFF_SLAB;
2137 		left_over -= slab_size;
2138 	}
2139 
2140 	if (flags & CFLGS_OFF_SLAB) {
2141 		/* really off slab. No need for manual alignment */
2142 		slab_size =
2143 		    cachep->num * sizeof(kmem_bufctl_t) + sizeof(struct slab);
2144 	}
2145 
2146 	cachep->colour_off = cache_line_size();
2147 	/* Offset must be a multiple of the alignment. */
2148 	if (cachep->colour_off < align)
2149 		cachep->colour_off = align;
2150 	cachep->colour = left_over / cachep->colour_off;
2151 	cachep->slab_size = slab_size;
2152 	cachep->flags = flags;
2153 	cachep->gfpflags = 0;
2154 	if (flags & SLAB_CACHE_DMA)
2155 		cachep->gfpflags |= GFP_DMA;
2156 	cachep->buffer_size = size;
2157 
2158 	if (flags & CFLGS_OFF_SLAB)
2159 		cachep->slabp_cache = kmem_find_general_cachep(slab_size, 0u);
2160 	cachep->ctor = ctor;
2161 	cachep->dtor = dtor;
2162 	cachep->name = name;
2163 
2164 
2165 	setup_cpu_cache(cachep);
2166 
2167 	/* cache setup completed, link it into the list */
2168 	list_add(&cachep->next, &cache_chain);
2169 oops:
2170 	if (!cachep && (flags & SLAB_PANIC))
2171 		panic("kmem_cache_create(): failed to create slab `%s'\n",
2172 		      name);
2173 	mutex_unlock(&cache_chain_mutex);
2174 	unlock_cpu_hotplug();
2175 	return cachep;
2176 }
2177 EXPORT_SYMBOL(kmem_cache_create);
2178 
2179 #if DEBUG
2180 static void check_irq_off(void)
2181 {
2182 	BUG_ON(!irqs_disabled());
2183 }
2184 
2185 static void check_irq_on(void)
2186 {
2187 	BUG_ON(irqs_disabled());
2188 }
2189 
2190 static void check_spinlock_acquired(struct kmem_cache *cachep)
2191 {
2192 #ifdef CONFIG_SMP
2193 	check_irq_off();
2194 	assert_spin_locked(&cachep->nodelists[numa_node_id()]->list_lock);
2195 #endif
2196 }
2197 
2198 static void check_spinlock_acquired_node(struct kmem_cache *cachep, int node)
2199 {
2200 #ifdef CONFIG_SMP
2201 	check_irq_off();
2202 	assert_spin_locked(&cachep->nodelists[node]->list_lock);
2203 #endif
2204 }
2205 
2206 #else
2207 #define check_irq_off()	do { } while(0)
2208 #define check_irq_on()	do { } while(0)
2209 #define check_spinlock_acquired(x) do { } while(0)
2210 #define check_spinlock_acquired_node(x, y) do { } while(0)
2211 #endif
2212 
2213 static void drain_array(struct kmem_cache *cachep, struct kmem_list3 *l3,
2214 			struct array_cache *ac,
2215 			int force, int node);
2216 
2217 static void do_drain(void *arg)
2218 {
2219 	struct kmem_cache *cachep = arg;
2220 	struct array_cache *ac;
2221 	int node = numa_node_id();
2222 
2223 	check_irq_off();
2224 	ac = cpu_cache_get(cachep);
2225 	spin_lock(&cachep->nodelists[node]->list_lock);
2226 	free_block(cachep, ac->entry, ac->avail, node);
2227 	spin_unlock(&cachep->nodelists[node]->list_lock);
2228 	ac->avail = 0;
2229 }
2230 
2231 static void drain_cpu_caches(struct kmem_cache *cachep)
2232 {
2233 	struct kmem_list3 *l3;
2234 	int node;
2235 
2236 	on_each_cpu(do_drain, cachep, 1, 1);
2237 	check_irq_on();
2238 	for_each_online_node(node) {
2239 		l3 = cachep->nodelists[node];
2240 		if (l3 && l3->alien)
2241 			drain_alien_cache(cachep, l3->alien);
2242 	}
2243 
2244 	for_each_online_node(node) {
2245 		l3 = cachep->nodelists[node];
2246 		if (l3)
2247 			drain_array(cachep, l3, l3->shared, 1, node);
2248 	}
2249 }
2250 
2251 static int __node_shrink(struct kmem_cache *cachep, int node)
2252 {
2253 	struct slab *slabp;
2254 	struct kmem_list3 *l3 = cachep->nodelists[node];
2255 	int ret;
2256 
2257 	for (;;) {
2258 		struct list_head *p;
2259 
2260 		p = l3->slabs_free.prev;
2261 		if (p == &l3->slabs_free)
2262 			break;
2263 
2264 		slabp = list_entry(l3->slabs_free.prev, struct slab, list);
2265 #if DEBUG
2266 		BUG_ON(slabp->inuse);
2267 #endif
2268 		list_del(&slabp->list);
2269 
2270 		l3->free_objects -= cachep->num;
2271 		spin_unlock_irq(&l3->list_lock);
2272 		slab_destroy(cachep, slabp);
2273 		spin_lock_irq(&l3->list_lock);
2274 	}
2275 	ret = !list_empty(&l3->slabs_full) || !list_empty(&l3->slabs_partial);
2276 	return ret;
2277 }
2278 
2279 static int __cache_shrink(struct kmem_cache *cachep)
2280 {
2281 	int ret = 0, i = 0;
2282 	struct kmem_list3 *l3;
2283 
2284 	drain_cpu_caches(cachep);
2285 
2286 	check_irq_on();
2287 	for_each_online_node(i) {
2288 		l3 = cachep->nodelists[i];
2289 		if (l3) {
2290 			spin_lock_irq(&l3->list_lock);
2291 			ret += __node_shrink(cachep, i);
2292 			spin_unlock_irq(&l3->list_lock);
2293 		}
2294 	}
2295 	return (ret ? 1 : 0);
2296 }
2297 
2298 /**
2299  * kmem_cache_shrink - Shrink a cache.
2300  * @cachep: The cache to shrink.
2301  *
2302  * Releases as many slabs as possible for a cache.
2303  * To help debugging, a zero exit status indicates all slabs were released.
2304  */
2305 int kmem_cache_shrink(struct kmem_cache *cachep)
2306 {
2307 	BUG_ON(!cachep || in_interrupt());
2308 
2309 	return __cache_shrink(cachep);
2310 }
2311 EXPORT_SYMBOL(kmem_cache_shrink);
2312 
2313 /**
2314  * kmem_cache_destroy - delete a cache
2315  * @cachep: the cache to destroy
2316  *
2317  * Remove a struct kmem_cache object from the slab cache.
2318  * Returns 0 on success.
2319  *
2320  * It is expected this function will be called by a module when it is
2321  * unloaded.  This will remove the cache completely, and avoid a duplicate
2322  * cache being allocated each time a module is loaded and unloaded, if the
2323  * module doesn't have persistent in-kernel storage across loads and unloads.
2324  *
2325  * The cache must be empty before calling this function.
2326  *
2327  * The caller must guarantee that noone will allocate memory from the cache
2328  * during the kmem_cache_destroy().
2329  */
2330 int kmem_cache_destroy(struct kmem_cache *cachep)
2331 {
2332 	int i;
2333 	struct kmem_list3 *l3;
2334 
2335 	BUG_ON(!cachep || in_interrupt());
2336 
2337 	/* Don't let CPUs to come and go */
2338 	lock_cpu_hotplug();
2339 
2340 	/* Find the cache in the chain of caches. */
2341 	mutex_lock(&cache_chain_mutex);
2342 	/*
2343 	 * the chain is never empty, cache_cache is never destroyed
2344 	 */
2345 	list_del(&cachep->next);
2346 	mutex_unlock(&cache_chain_mutex);
2347 
2348 	if (__cache_shrink(cachep)) {
2349 		slab_error(cachep, "Can't free all objects");
2350 		mutex_lock(&cache_chain_mutex);
2351 		list_add(&cachep->next, &cache_chain);
2352 		mutex_unlock(&cache_chain_mutex);
2353 		unlock_cpu_hotplug();
2354 		return 1;
2355 	}
2356 
2357 	if (unlikely(cachep->flags & SLAB_DESTROY_BY_RCU))
2358 		synchronize_rcu();
2359 
2360 	for_each_online_cpu(i)
2361 	    kfree(cachep->array[i]);
2362 
2363 	/* NUMA: free the list3 structures */
2364 	for_each_online_node(i) {
2365 		l3 = cachep->nodelists[i];
2366 		if (l3) {
2367 			kfree(l3->shared);
2368 			free_alien_cache(l3->alien);
2369 			kfree(l3);
2370 		}
2371 	}
2372 	kmem_cache_free(&cache_cache, cachep);
2373 	unlock_cpu_hotplug();
2374 	return 0;
2375 }
2376 EXPORT_SYMBOL(kmem_cache_destroy);
2377 
2378 /* Get the memory for a slab management obj. */
2379 static struct slab *alloc_slabmgmt(struct kmem_cache *cachep, void *objp,
2380 				   int colour_off, gfp_t local_flags,
2381 				   int nodeid)
2382 {
2383 	struct slab *slabp;
2384 
2385 	if (OFF_SLAB(cachep)) {
2386 		/* Slab management obj is off-slab. */
2387 		slabp = kmem_cache_alloc_node(cachep->slabp_cache,
2388 					      local_flags, nodeid);
2389 		if (!slabp)
2390 			return NULL;
2391 	} else {
2392 		slabp = objp + colour_off;
2393 		colour_off += cachep->slab_size;
2394 	}
2395 	slabp->inuse = 0;
2396 	slabp->colouroff = colour_off;
2397 	slabp->s_mem = objp + colour_off;
2398 	slabp->nodeid = nodeid;
2399 	return slabp;
2400 }
2401 
2402 static inline kmem_bufctl_t *slab_bufctl(struct slab *slabp)
2403 {
2404 	return (kmem_bufctl_t *) (slabp + 1);
2405 }
2406 
2407 static void cache_init_objs(struct kmem_cache *cachep,
2408 			    struct slab *slabp, unsigned long ctor_flags)
2409 {
2410 	int i;
2411 
2412 	for (i = 0; i < cachep->num; i++) {
2413 		void *objp = index_to_obj(cachep, slabp, i);
2414 #if DEBUG
2415 		/* need to poison the objs? */
2416 		if (cachep->flags & SLAB_POISON)
2417 			poison_obj(cachep, objp, POISON_FREE);
2418 		if (cachep->flags & SLAB_STORE_USER)
2419 			*dbg_userword(cachep, objp) = NULL;
2420 
2421 		if (cachep->flags & SLAB_RED_ZONE) {
2422 			*dbg_redzone1(cachep, objp) = RED_INACTIVE;
2423 			*dbg_redzone2(cachep, objp) = RED_INACTIVE;
2424 		}
2425 		/*
2426 		 * Constructors are not allowed to allocate memory from the same
2427 		 * cache which they are a constructor for.  Otherwise, deadlock.
2428 		 * They must also be threaded.
2429 		 */
2430 		if (cachep->ctor && !(cachep->flags & SLAB_POISON))
2431 			cachep->ctor(objp + obj_offset(cachep), cachep,
2432 				     ctor_flags);
2433 
2434 		if (cachep->flags & SLAB_RED_ZONE) {
2435 			if (*dbg_redzone2(cachep, objp) != RED_INACTIVE)
2436 				slab_error(cachep, "constructor overwrote the"
2437 					   " end of an object");
2438 			if (*dbg_redzone1(cachep, objp) != RED_INACTIVE)
2439 				slab_error(cachep, "constructor overwrote the"
2440 					   " start of an object");
2441 		}
2442 		if ((cachep->buffer_size % PAGE_SIZE) == 0 &&
2443 			    OFF_SLAB(cachep) && cachep->flags & SLAB_POISON)
2444 			kernel_map_pages(virt_to_page(objp),
2445 					 cachep->buffer_size / PAGE_SIZE, 0);
2446 #else
2447 		if (cachep->ctor)
2448 			cachep->ctor(objp, cachep, ctor_flags);
2449 #endif
2450 		slab_bufctl(slabp)[i] = i + 1;
2451 	}
2452 	slab_bufctl(slabp)[i - 1] = BUFCTL_END;
2453 	slabp->free = 0;
2454 }
2455 
2456 static void kmem_flagcheck(struct kmem_cache *cachep, gfp_t flags)
2457 {
2458 	if (flags & SLAB_DMA)
2459 		BUG_ON(!(cachep->gfpflags & GFP_DMA));
2460 	else
2461 		BUG_ON(cachep->gfpflags & GFP_DMA);
2462 }
2463 
2464 static void *slab_get_obj(struct kmem_cache *cachep, struct slab *slabp,
2465 				int nodeid)
2466 {
2467 	void *objp = index_to_obj(cachep, slabp, slabp->free);
2468 	kmem_bufctl_t next;
2469 
2470 	slabp->inuse++;
2471 	next = slab_bufctl(slabp)[slabp->free];
2472 #if DEBUG
2473 	slab_bufctl(slabp)[slabp->free] = BUFCTL_FREE;
2474 	WARN_ON(slabp->nodeid != nodeid);
2475 #endif
2476 	slabp->free = next;
2477 
2478 	return objp;
2479 }
2480 
2481 static void slab_put_obj(struct kmem_cache *cachep, struct slab *slabp,
2482 				void *objp, int nodeid)
2483 {
2484 	unsigned int objnr = obj_to_index(cachep, slabp, objp);
2485 
2486 #if DEBUG
2487 	/* Verify that the slab belongs to the intended node */
2488 	WARN_ON(slabp->nodeid != nodeid);
2489 
2490 	if (slab_bufctl(slabp)[objnr] + 1 <= SLAB_LIMIT + 1) {
2491 		printk(KERN_ERR "slab: double free detected in cache "
2492 				"'%s', objp %p\n", cachep->name, objp);
2493 		BUG();
2494 	}
2495 #endif
2496 	slab_bufctl(slabp)[objnr] = slabp->free;
2497 	slabp->free = objnr;
2498 	slabp->inuse--;
2499 }
2500 
2501 /*
2502  * Map pages beginning at addr to the given cache and slab. This is required
2503  * for the slab allocator to be able to lookup the cache and slab of a
2504  * virtual address for kfree, ksize, kmem_ptr_validate, and slab debugging.
2505  */
2506 static void slab_map_pages(struct kmem_cache *cache, struct slab *slab,
2507 			   void *addr)
2508 {
2509 	int nr_pages;
2510 	struct page *page;
2511 
2512 	page = virt_to_page(addr);
2513 
2514 	nr_pages = 1;
2515 	if (likely(!PageCompound(page)))
2516 		nr_pages <<= cache->gfporder;
2517 
2518 	do {
2519 		page_set_cache(page, cache);
2520 		page_set_slab(page, slab);
2521 		page++;
2522 	} while (--nr_pages);
2523 }
2524 
2525 /*
2526  * Grow (by 1) the number of slabs within a cache.  This is called by
2527  * kmem_cache_alloc() when there are no active objs left in a cache.
2528  */
2529 static int cache_grow(struct kmem_cache *cachep, gfp_t flags, int nodeid)
2530 {
2531 	struct slab *slabp;
2532 	void *objp;
2533 	size_t offset;
2534 	gfp_t local_flags;
2535 	unsigned long ctor_flags;
2536 	struct kmem_list3 *l3;
2537 
2538 	/*
2539 	 * Be lazy and only check for valid flags here,  keeping it out of the
2540 	 * critical path in kmem_cache_alloc().
2541 	 */
2542 	BUG_ON(flags & ~(SLAB_DMA | SLAB_LEVEL_MASK | SLAB_NO_GROW));
2543 	if (flags & SLAB_NO_GROW)
2544 		return 0;
2545 
2546 	ctor_flags = SLAB_CTOR_CONSTRUCTOR;
2547 	local_flags = (flags & SLAB_LEVEL_MASK);
2548 	if (!(local_flags & __GFP_WAIT))
2549 		/*
2550 		 * Not allowed to sleep.  Need to tell a constructor about
2551 		 * this - it might need to know...
2552 		 */
2553 		ctor_flags |= SLAB_CTOR_ATOMIC;
2554 
2555 	/* Take the l3 list lock to change the colour_next on this node */
2556 	check_irq_off();
2557 	l3 = cachep->nodelists[nodeid];
2558 	spin_lock(&l3->list_lock);
2559 
2560 	/* Get colour for the slab, and cal the next value. */
2561 	offset = l3->colour_next;
2562 	l3->colour_next++;
2563 	if (l3->colour_next >= cachep->colour)
2564 		l3->colour_next = 0;
2565 	spin_unlock(&l3->list_lock);
2566 
2567 	offset *= cachep->colour_off;
2568 
2569 	if (local_flags & __GFP_WAIT)
2570 		local_irq_enable();
2571 
2572 	/*
2573 	 * The test for missing atomic flag is performed here, rather than
2574 	 * the more obvious place, simply to reduce the critical path length
2575 	 * in kmem_cache_alloc(). If a caller is seriously mis-behaving they
2576 	 * will eventually be caught here (where it matters).
2577 	 */
2578 	kmem_flagcheck(cachep, flags);
2579 
2580 	/*
2581 	 * Get mem for the objs.  Attempt to allocate a physical page from
2582 	 * 'nodeid'.
2583 	 */
2584 	objp = kmem_getpages(cachep, flags, nodeid);
2585 	if (!objp)
2586 		goto failed;
2587 
2588 	/* Get slab management. */
2589 	slabp = alloc_slabmgmt(cachep, objp, offset, local_flags, nodeid);
2590 	if (!slabp)
2591 		goto opps1;
2592 
2593 	slabp->nodeid = nodeid;
2594 	slab_map_pages(cachep, slabp, objp);
2595 
2596 	cache_init_objs(cachep, slabp, ctor_flags);
2597 
2598 	if (local_flags & __GFP_WAIT)
2599 		local_irq_disable();
2600 	check_irq_off();
2601 	spin_lock(&l3->list_lock);
2602 
2603 	/* Make slab active. */
2604 	list_add_tail(&slabp->list, &(l3->slabs_free));
2605 	STATS_INC_GROWN(cachep);
2606 	l3->free_objects += cachep->num;
2607 	spin_unlock(&l3->list_lock);
2608 	return 1;
2609 opps1:
2610 	kmem_freepages(cachep, objp);
2611 failed:
2612 	if (local_flags & __GFP_WAIT)
2613 		local_irq_disable();
2614 	return 0;
2615 }
2616 
2617 #if DEBUG
2618 
2619 /*
2620  * Perform extra freeing checks:
2621  * - detect bad pointers.
2622  * - POISON/RED_ZONE checking
2623  * - destructor calls, for caches with POISON+dtor
2624  */
2625 static void kfree_debugcheck(const void *objp)
2626 {
2627 	struct page *page;
2628 
2629 	if (!virt_addr_valid(objp)) {
2630 		printk(KERN_ERR "kfree_debugcheck: out of range ptr %lxh.\n",
2631 		       (unsigned long)objp);
2632 		BUG();
2633 	}
2634 	page = virt_to_page(objp);
2635 	if (!PageSlab(page)) {
2636 		printk(KERN_ERR "kfree_debugcheck: bad ptr %lxh.\n",
2637 		       (unsigned long)objp);
2638 		BUG();
2639 	}
2640 }
2641 
2642 static inline void verify_redzone_free(struct kmem_cache *cache, void *obj)
2643 {
2644 	unsigned long redzone1, redzone2;
2645 
2646 	redzone1 = *dbg_redzone1(cache, obj);
2647 	redzone2 = *dbg_redzone2(cache, obj);
2648 
2649 	/*
2650 	 * Redzone is ok.
2651 	 */
2652 	if (redzone1 == RED_ACTIVE && redzone2 == RED_ACTIVE)
2653 		return;
2654 
2655 	if (redzone1 == RED_INACTIVE && redzone2 == RED_INACTIVE)
2656 		slab_error(cache, "double free detected");
2657 	else
2658 		slab_error(cache, "memory outside object was overwritten");
2659 
2660 	printk(KERN_ERR "%p: redzone 1:0x%lx, redzone 2:0x%lx.\n",
2661 			obj, redzone1, redzone2);
2662 }
2663 
2664 static void *cache_free_debugcheck(struct kmem_cache *cachep, void *objp,
2665 				   void *caller)
2666 {
2667 	struct page *page;
2668 	unsigned int objnr;
2669 	struct slab *slabp;
2670 
2671 	objp -= obj_offset(cachep);
2672 	kfree_debugcheck(objp);
2673 	page = virt_to_page(objp);
2674 
2675 	slabp = page_get_slab(page);
2676 
2677 	if (cachep->flags & SLAB_RED_ZONE) {
2678 		verify_redzone_free(cachep, objp);
2679 		*dbg_redzone1(cachep, objp) = RED_INACTIVE;
2680 		*dbg_redzone2(cachep, objp) = RED_INACTIVE;
2681 	}
2682 	if (cachep->flags & SLAB_STORE_USER)
2683 		*dbg_userword(cachep, objp) = caller;
2684 
2685 	objnr = obj_to_index(cachep, slabp, objp);
2686 
2687 	BUG_ON(objnr >= cachep->num);
2688 	BUG_ON(objp != index_to_obj(cachep, slabp, objnr));
2689 
2690 	if (cachep->flags & SLAB_DEBUG_INITIAL) {
2691 		/*
2692 		 * Need to call the slab's constructor so the caller can
2693 		 * perform a verify of its state (debugging).  Called without
2694 		 * the cache-lock held.
2695 		 */
2696 		cachep->ctor(objp + obj_offset(cachep),
2697 			     cachep, SLAB_CTOR_CONSTRUCTOR | SLAB_CTOR_VERIFY);
2698 	}
2699 	if (cachep->flags & SLAB_POISON && cachep->dtor) {
2700 		/* we want to cache poison the object,
2701 		 * call the destruction callback
2702 		 */
2703 		cachep->dtor(objp + obj_offset(cachep), cachep, 0);
2704 	}
2705 #ifdef CONFIG_DEBUG_SLAB_LEAK
2706 	slab_bufctl(slabp)[objnr] = BUFCTL_FREE;
2707 #endif
2708 	if (cachep->flags & SLAB_POISON) {
2709 #ifdef CONFIG_DEBUG_PAGEALLOC
2710 		if ((cachep->buffer_size % PAGE_SIZE)==0 && OFF_SLAB(cachep)) {
2711 			store_stackinfo(cachep, objp, (unsigned long)caller);
2712 			kernel_map_pages(virt_to_page(objp),
2713 					 cachep->buffer_size / PAGE_SIZE, 0);
2714 		} else {
2715 			poison_obj(cachep, objp, POISON_FREE);
2716 		}
2717 #else
2718 		poison_obj(cachep, objp, POISON_FREE);
2719 #endif
2720 	}
2721 	return objp;
2722 }
2723 
2724 static void check_slabp(struct kmem_cache *cachep, struct slab *slabp)
2725 {
2726 	kmem_bufctl_t i;
2727 	int entries = 0;
2728 
2729 	/* Check slab's freelist to see if this obj is there. */
2730 	for (i = slabp->free; i != BUFCTL_END; i = slab_bufctl(slabp)[i]) {
2731 		entries++;
2732 		if (entries > cachep->num || i >= cachep->num)
2733 			goto bad;
2734 	}
2735 	if (entries != cachep->num - slabp->inuse) {
2736 bad:
2737 		printk(KERN_ERR "slab: Internal list corruption detected in "
2738 				"cache '%s'(%d), slabp %p(%d). Hexdump:\n",
2739 			cachep->name, cachep->num, slabp, slabp->inuse);
2740 		for (i = 0;
2741 		     i < sizeof(*slabp) + cachep->num * sizeof(kmem_bufctl_t);
2742 		     i++) {
2743 			if (i % 16 == 0)
2744 				printk("\n%03x:", i);
2745 			printk(" %02x", ((unsigned char *)slabp)[i]);
2746 		}
2747 		printk("\n");
2748 		BUG();
2749 	}
2750 }
2751 #else
2752 #define kfree_debugcheck(x) do { } while(0)
2753 #define cache_free_debugcheck(x,objp,z) (objp)
2754 #define check_slabp(x,y) do { } while(0)
2755 #endif
2756 
2757 static void *cache_alloc_refill(struct kmem_cache *cachep, gfp_t flags)
2758 {
2759 	int batchcount;
2760 	struct kmem_list3 *l3;
2761 	struct array_cache *ac;
2762 
2763 	check_irq_off();
2764 	ac = cpu_cache_get(cachep);
2765 retry:
2766 	batchcount = ac->batchcount;
2767 	if (!ac->touched && batchcount > BATCHREFILL_LIMIT) {
2768 		/*
2769 		 * If there was little recent activity on this cache, then
2770 		 * perform only a partial refill.  Otherwise we could generate
2771 		 * refill bouncing.
2772 		 */
2773 		batchcount = BATCHREFILL_LIMIT;
2774 	}
2775 	l3 = cachep->nodelists[numa_node_id()];
2776 
2777 	BUG_ON(ac->avail > 0 || !l3);
2778 	spin_lock(&l3->list_lock);
2779 
2780 	/* See if we can refill from the shared array */
2781 	if (l3->shared && transfer_objects(ac, l3->shared, batchcount))
2782 		goto alloc_done;
2783 
2784 	while (batchcount > 0) {
2785 		struct list_head *entry;
2786 		struct slab *slabp;
2787 		/* Get slab alloc is to come from. */
2788 		entry = l3->slabs_partial.next;
2789 		if (entry == &l3->slabs_partial) {
2790 			l3->free_touched = 1;
2791 			entry = l3->slabs_free.next;
2792 			if (entry == &l3->slabs_free)
2793 				goto must_grow;
2794 		}
2795 
2796 		slabp = list_entry(entry, struct slab, list);
2797 		check_slabp(cachep, slabp);
2798 		check_spinlock_acquired(cachep);
2799 		while (slabp->inuse < cachep->num && batchcount--) {
2800 			STATS_INC_ALLOCED(cachep);
2801 			STATS_INC_ACTIVE(cachep);
2802 			STATS_SET_HIGH(cachep);
2803 
2804 			ac->entry[ac->avail++] = slab_get_obj(cachep, slabp,
2805 							    numa_node_id());
2806 		}
2807 		check_slabp(cachep, slabp);
2808 
2809 		/* move slabp to correct slabp list: */
2810 		list_del(&slabp->list);
2811 		if (slabp->free == BUFCTL_END)
2812 			list_add(&slabp->list, &l3->slabs_full);
2813 		else
2814 			list_add(&slabp->list, &l3->slabs_partial);
2815 	}
2816 
2817 must_grow:
2818 	l3->free_objects -= ac->avail;
2819 alloc_done:
2820 	spin_unlock(&l3->list_lock);
2821 
2822 	if (unlikely(!ac->avail)) {
2823 		int x;
2824 		x = cache_grow(cachep, flags, numa_node_id());
2825 
2826 		/* cache_grow can reenable interrupts, then ac could change. */
2827 		ac = cpu_cache_get(cachep);
2828 		if (!x && ac->avail == 0)	/* no objects in sight? abort */
2829 			return NULL;
2830 
2831 		if (!ac->avail)		/* objects refilled by interrupt? */
2832 			goto retry;
2833 	}
2834 	ac->touched = 1;
2835 	return ac->entry[--ac->avail];
2836 }
2837 
2838 static inline void cache_alloc_debugcheck_before(struct kmem_cache *cachep,
2839 						gfp_t flags)
2840 {
2841 	might_sleep_if(flags & __GFP_WAIT);
2842 #if DEBUG
2843 	kmem_flagcheck(cachep, flags);
2844 #endif
2845 }
2846 
2847 #if DEBUG
2848 static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep,
2849 				gfp_t flags, void *objp, void *caller)
2850 {
2851 	if (!objp)
2852 		return objp;
2853 	if (cachep->flags & SLAB_POISON) {
2854 #ifdef CONFIG_DEBUG_PAGEALLOC
2855 		if ((cachep->buffer_size % PAGE_SIZE) == 0 && OFF_SLAB(cachep))
2856 			kernel_map_pages(virt_to_page(objp),
2857 					 cachep->buffer_size / PAGE_SIZE, 1);
2858 		else
2859 			check_poison_obj(cachep, objp);
2860 #else
2861 		check_poison_obj(cachep, objp);
2862 #endif
2863 		poison_obj(cachep, objp, POISON_INUSE);
2864 	}
2865 	if (cachep->flags & SLAB_STORE_USER)
2866 		*dbg_userword(cachep, objp) = caller;
2867 
2868 	if (cachep->flags & SLAB_RED_ZONE) {
2869 		if (*dbg_redzone1(cachep, objp) != RED_INACTIVE ||
2870 				*dbg_redzone2(cachep, objp) != RED_INACTIVE) {
2871 			slab_error(cachep, "double free, or memory outside"
2872 						" object was overwritten");
2873 			printk(KERN_ERR
2874 				"%p: redzone 1:0x%lx, redzone 2:0x%lx\n",
2875 				objp, *dbg_redzone1(cachep, objp),
2876 				*dbg_redzone2(cachep, objp));
2877 		}
2878 		*dbg_redzone1(cachep, objp) = RED_ACTIVE;
2879 		*dbg_redzone2(cachep, objp) = RED_ACTIVE;
2880 	}
2881 #ifdef CONFIG_DEBUG_SLAB_LEAK
2882 	{
2883 		struct slab *slabp;
2884 		unsigned objnr;
2885 
2886 		slabp = page_get_slab(virt_to_page(objp));
2887 		objnr = (unsigned)(objp - slabp->s_mem) / cachep->buffer_size;
2888 		slab_bufctl(slabp)[objnr] = BUFCTL_ACTIVE;
2889 	}
2890 #endif
2891 	objp += obj_offset(cachep);
2892 	if (cachep->ctor && cachep->flags & SLAB_POISON) {
2893 		unsigned long ctor_flags = SLAB_CTOR_CONSTRUCTOR;
2894 
2895 		if (!(flags & __GFP_WAIT))
2896 			ctor_flags |= SLAB_CTOR_ATOMIC;
2897 
2898 		cachep->ctor(objp, cachep, ctor_flags);
2899 	}
2900 	return objp;
2901 }
2902 #else
2903 #define cache_alloc_debugcheck_after(a,b,objp,d) (objp)
2904 #endif
2905 
2906 static inline void *____cache_alloc(struct kmem_cache *cachep, gfp_t flags)
2907 {
2908 	void *objp;
2909 	struct array_cache *ac;
2910 
2911 #ifdef CONFIG_NUMA
2912 	if (unlikely(current->flags & (PF_SPREAD_SLAB | PF_MEMPOLICY))) {
2913 		objp = alternate_node_alloc(cachep, flags);
2914 		if (objp != NULL)
2915 			return objp;
2916 	}
2917 #endif
2918 
2919 	check_irq_off();
2920 	ac = cpu_cache_get(cachep);
2921 	if (likely(ac->avail)) {
2922 		STATS_INC_ALLOCHIT(cachep);
2923 		ac->touched = 1;
2924 		objp = ac->entry[--ac->avail];
2925 	} else {
2926 		STATS_INC_ALLOCMISS(cachep);
2927 		objp = cache_alloc_refill(cachep, flags);
2928 	}
2929 	return objp;
2930 }
2931 
2932 static __always_inline void *__cache_alloc(struct kmem_cache *cachep,
2933 						gfp_t flags, void *caller)
2934 {
2935 	unsigned long save_flags;
2936 	void *objp;
2937 
2938 	cache_alloc_debugcheck_before(cachep, flags);
2939 
2940 	local_irq_save(save_flags);
2941 	objp = ____cache_alloc(cachep, flags);
2942 	local_irq_restore(save_flags);
2943 	objp = cache_alloc_debugcheck_after(cachep, flags, objp,
2944 					    caller);
2945 	prefetchw(objp);
2946 	return objp;
2947 }
2948 
2949 #ifdef CONFIG_NUMA
2950 /*
2951  * Try allocating on another node if PF_SPREAD_SLAB|PF_MEMPOLICY.
2952  *
2953  * If we are in_interrupt, then process context, including cpusets and
2954  * mempolicy, may not apply and should not be used for allocation policy.
2955  */
2956 static void *alternate_node_alloc(struct kmem_cache *cachep, gfp_t flags)
2957 {
2958 	int nid_alloc, nid_here;
2959 
2960 	if (in_interrupt())
2961 		return NULL;
2962 	nid_alloc = nid_here = numa_node_id();
2963 	if (cpuset_do_slab_mem_spread() && (cachep->flags & SLAB_MEM_SPREAD))
2964 		nid_alloc = cpuset_mem_spread_node();
2965 	else if (current->mempolicy)
2966 		nid_alloc = slab_node(current->mempolicy);
2967 	if (nid_alloc != nid_here)
2968 		return __cache_alloc_node(cachep, flags, nid_alloc);
2969 	return NULL;
2970 }
2971 
2972 /*
2973  * A interface to enable slab creation on nodeid
2974  */
2975 static void *__cache_alloc_node(struct kmem_cache *cachep, gfp_t flags,
2976 				int nodeid)
2977 {
2978 	struct list_head *entry;
2979 	struct slab *slabp;
2980 	struct kmem_list3 *l3;
2981 	void *obj;
2982 	int x;
2983 
2984 	l3 = cachep->nodelists[nodeid];
2985 	BUG_ON(!l3);
2986 
2987 retry:
2988 	check_irq_off();
2989 	spin_lock(&l3->list_lock);
2990 	entry = l3->slabs_partial.next;
2991 	if (entry == &l3->slabs_partial) {
2992 		l3->free_touched = 1;
2993 		entry = l3->slabs_free.next;
2994 		if (entry == &l3->slabs_free)
2995 			goto must_grow;
2996 	}
2997 
2998 	slabp = list_entry(entry, struct slab, list);
2999 	check_spinlock_acquired_node(cachep, nodeid);
3000 	check_slabp(cachep, slabp);
3001 
3002 	STATS_INC_NODEALLOCS(cachep);
3003 	STATS_INC_ACTIVE(cachep);
3004 	STATS_SET_HIGH(cachep);
3005 
3006 	BUG_ON(slabp->inuse == cachep->num);
3007 
3008 	obj = slab_get_obj(cachep, slabp, nodeid);
3009 	check_slabp(cachep, slabp);
3010 	l3->free_objects--;
3011 	/* move slabp to correct slabp list: */
3012 	list_del(&slabp->list);
3013 
3014 	if (slabp->free == BUFCTL_END)
3015 		list_add(&slabp->list, &l3->slabs_full);
3016 	else
3017 		list_add(&slabp->list, &l3->slabs_partial);
3018 
3019 	spin_unlock(&l3->list_lock);
3020 	goto done;
3021 
3022 must_grow:
3023 	spin_unlock(&l3->list_lock);
3024 	x = cache_grow(cachep, flags, nodeid);
3025 
3026 	if (!x)
3027 		return NULL;
3028 
3029 	goto retry;
3030 done:
3031 	return obj;
3032 }
3033 #endif
3034 
3035 /*
3036  * Caller needs to acquire correct kmem_list's list_lock
3037  */
3038 static void free_block(struct kmem_cache *cachep, void **objpp, int nr_objects,
3039 		       int node)
3040 {
3041 	int i;
3042 	struct kmem_list3 *l3;
3043 
3044 	for (i = 0; i < nr_objects; i++) {
3045 		void *objp = objpp[i];
3046 		struct slab *slabp;
3047 
3048 		slabp = virt_to_slab(objp);
3049 		l3 = cachep->nodelists[node];
3050 		list_del(&slabp->list);
3051 		check_spinlock_acquired_node(cachep, node);
3052 		check_slabp(cachep, slabp);
3053 		slab_put_obj(cachep, slabp, objp, node);
3054 		STATS_DEC_ACTIVE(cachep);
3055 		l3->free_objects++;
3056 		check_slabp(cachep, slabp);
3057 
3058 		/* fixup slab chains */
3059 		if (slabp->inuse == 0) {
3060 			if (l3->free_objects > l3->free_limit) {
3061 				l3->free_objects -= cachep->num;
3062 				slab_destroy(cachep, slabp);
3063 			} else {
3064 				list_add(&slabp->list, &l3->slabs_free);
3065 			}
3066 		} else {
3067 			/* Unconditionally move a slab to the end of the
3068 			 * partial list on free - maximum time for the
3069 			 * other objects to be freed, too.
3070 			 */
3071 			list_add_tail(&slabp->list, &l3->slabs_partial);
3072 		}
3073 	}
3074 }
3075 
3076 static void cache_flusharray(struct kmem_cache *cachep, struct array_cache *ac)
3077 {
3078 	int batchcount;
3079 	struct kmem_list3 *l3;
3080 	int node = numa_node_id();
3081 
3082 	batchcount = ac->batchcount;
3083 #if DEBUG
3084 	BUG_ON(!batchcount || batchcount > ac->avail);
3085 #endif
3086 	check_irq_off();
3087 	l3 = cachep->nodelists[node];
3088 	spin_lock(&l3->list_lock);
3089 	if (l3->shared) {
3090 		struct array_cache *shared_array = l3->shared;
3091 		int max = shared_array->limit - shared_array->avail;
3092 		if (max) {
3093 			if (batchcount > max)
3094 				batchcount = max;
3095 			memcpy(&(shared_array->entry[shared_array->avail]),
3096 			       ac->entry, sizeof(void *) * batchcount);
3097 			shared_array->avail += batchcount;
3098 			goto free_done;
3099 		}
3100 	}
3101 
3102 	free_block(cachep, ac->entry, batchcount, node);
3103 free_done:
3104 #if STATS
3105 	{
3106 		int i = 0;
3107 		struct list_head *p;
3108 
3109 		p = l3->slabs_free.next;
3110 		while (p != &(l3->slabs_free)) {
3111 			struct slab *slabp;
3112 
3113 			slabp = list_entry(p, struct slab, list);
3114 			BUG_ON(slabp->inuse);
3115 
3116 			i++;
3117 			p = p->next;
3118 		}
3119 		STATS_SET_FREEABLE(cachep, i);
3120 	}
3121 #endif
3122 	spin_unlock(&l3->list_lock);
3123 	ac->avail -= batchcount;
3124 	memmove(ac->entry, &(ac->entry[batchcount]), sizeof(void *)*ac->avail);
3125 }
3126 
3127 /*
3128  * Release an obj back to its cache. If the obj has a constructed state, it must
3129  * be in this state _before_ it is released.  Called with disabled ints.
3130  */
3131 static inline void __cache_free(struct kmem_cache *cachep, void *objp)
3132 {
3133 	struct array_cache *ac = cpu_cache_get(cachep);
3134 
3135 	check_irq_off();
3136 	objp = cache_free_debugcheck(cachep, objp, __builtin_return_address(0));
3137 
3138 	if (cache_free_alien(cachep, objp))
3139 		return;
3140 
3141 	if (likely(ac->avail < ac->limit)) {
3142 		STATS_INC_FREEHIT(cachep);
3143 		ac->entry[ac->avail++] = objp;
3144 		return;
3145 	} else {
3146 		STATS_INC_FREEMISS(cachep);
3147 		cache_flusharray(cachep, ac);
3148 		ac->entry[ac->avail++] = objp;
3149 	}
3150 }
3151 
3152 /**
3153  * kmem_cache_alloc - Allocate an object
3154  * @cachep: The cache to allocate from.
3155  * @flags: See kmalloc().
3156  *
3157  * Allocate an object from this cache.  The flags are only relevant
3158  * if the cache has no available objects.
3159  */
3160 void *kmem_cache_alloc(struct kmem_cache *cachep, gfp_t flags)
3161 {
3162 	return __cache_alloc(cachep, flags, __builtin_return_address(0));
3163 }
3164 EXPORT_SYMBOL(kmem_cache_alloc);
3165 
3166 /**
3167  * kmem_cache_alloc - Allocate an object. The memory is set to zero.
3168  * @cache: The cache to allocate from.
3169  * @flags: See kmalloc().
3170  *
3171  * Allocate an object from this cache and set the allocated memory to zero.
3172  * The flags are only relevant if the cache has no available objects.
3173  */
3174 void *kmem_cache_zalloc(struct kmem_cache *cache, gfp_t flags)
3175 {
3176 	void *ret = __cache_alloc(cache, flags, __builtin_return_address(0));
3177 	if (ret)
3178 		memset(ret, 0, obj_size(cache));
3179 	return ret;
3180 }
3181 EXPORT_SYMBOL(kmem_cache_zalloc);
3182 
3183 /**
3184  * kmem_ptr_validate - check if an untrusted pointer might
3185  *	be a slab entry.
3186  * @cachep: the cache we're checking against
3187  * @ptr: pointer to validate
3188  *
3189  * This verifies that the untrusted pointer looks sane:
3190  * it is _not_ a guarantee that the pointer is actually
3191  * part of the slab cache in question, but it at least
3192  * validates that the pointer can be dereferenced and
3193  * looks half-way sane.
3194  *
3195  * Currently only used for dentry validation.
3196  */
3197 int fastcall kmem_ptr_validate(struct kmem_cache *cachep, void *ptr)
3198 {
3199 	unsigned long addr = (unsigned long)ptr;
3200 	unsigned long min_addr = PAGE_OFFSET;
3201 	unsigned long align_mask = BYTES_PER_WORD - 1;
3202 	unsigned long size = cachep->buffer_size;
3203 	struct page *page;
3204 
3205 	if (unlikely(addr < min_addr))
3206 		goto out;
3207 	if (unlikely(addr > (unsigned long)high_memory - size))
3208 		goto out;
3209 	if (unlikely(addr & align_mask))
3210 		goto out;
3211 	if (unlikely(!kern_addr_valid(addr)))
3212 		goto out;
3213 	if (unlikely(!kern_addr_valid(addr + size - 1)))
3214 		goto out;
3215 	page = virt_to_page(ptr);
3216 	if (unlikely(!PageSlab(page)))
3217 		goto out;
3218 	if (unlikely(page_get_cache(page) != cachep))
3219 		goto out;
3220 	return 1;
3221 out:
3222 	return 0;
3223 }
3224 
3225 #ifdef CONFIG_NUMA
3226 /**
3227  * kmem_cache_alloc_node - Allocate an object on the specified node
3228  * @cachep: The cache to allocate from.
3229  * @flags: See kmalloc().
3230  * @nodeid: node number of the target node.
3231  *
3232  * Identical to kmem_cache_alloc, except that this function is slow
3233  * and can sleep. And it will allocate memory on the given node, which
3234  * can improve the performance for cpu bound structures.
3235  * New and improved: it will now make sure that the object gets
3236  * put on the correct node list so that there is no false sharing.
3237  */
3238 void *kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid)
3239 {
3240 	unsigned long save_flags;
3241 	void *ptr;
3242 
3243 	cache_alloc_debugcheck_before(cachep, flags);
3244 	local_irq_save(save_flags);
3245 
3246 	if (nodeid == -1 || nodeid == numa_node_id() ||
3247 			!cachep->nodelists[nodeid])
3248 		ptr = ____cache_alloc(cachep, flags);
3249 	else
3250 		ptr = __cache_alloc_node(cachep, flags, nodeid);
3251 	local_irq_restore(save_flags);
3252 
3253 	ptr = cache_alloc_debugcheck_after(cachep, flags, ptr,
3254 					   __builtin_return_address(0));
3255 
3256 	return ptr;
3257 }
3258 EXPORT_SYMBOL(kmem_cache_alloc_node);
3259 
3260 void *kmalloc_node(size_t size, gfp_t flags, int node)
3261 {
3262 	struct kmem_cache *cachep;
3263 
3264 	cachep = kmem_find_general_cachep(size, flags);
3265 	if (unlikely(cachep == NULL))
3266 		return NULL;
3267 	return kmem_cache_alloc_node(cachep, flags, node);
3268 }
3269 EXPORT_SYMBOL(kmalloc_node);
3270 #endif
3271 
3272 /**
3273  * __do_kmalloc - allocate memory
3274  * @size: how many bytes of memory are required.
3275  * @flags: the type of memory to allocate (see kmalloc).
3276  * @caller: function caller for debug tracking of the caller
3277  */
3278 static __always_inline void *__do_kmalloc(size_t size, gfp_t flags,
3279 					  void *caller)
3280 {
3281 	struct kmem_cache *cachep;
3282 
3283 	/* If you want to save a few bytes .text space: replace
3284 	 * __ with kmem_.
3285 	 * Then kmalloc uses the uninlined functions instead of the inline
3286 	 * functions.
3287 	 */
3288 	cachep = __find_general_cachep(size, flags);
3289 	if (unlikely(cachep == NULL))
3290 		return NULL;
3291 	return __cache_alloc(cachep, flags, caller);
3292 }
3293 
3294 
3295 void *__kmalloc(size_t size, gfp_t flags)
3296 {
3297 #ifndef CONFIG_DEBUG_SLAB
3298 	return __do_kmalloc(size, flags, NULL);
3299 #else
3300 	return __do_kmalloc(size, flags, __builtin_return_address(0));
3301 #endif
3302 }
3303 EXPORT_SYMBOL(__kmalloc);
3304 
3305 #ifdef CONFIG_DEBUG_SLAB
3306 void *__kmalloc_track_caller(size_t size, gfp_t flags, void *caller)
3307 {
3308 	return __do_kmalloc(size, flags, caller);
3309 }
3310 EXPORT_SYMBOL(__kmalloc_track_caller);
3311 #endif
3312 
3313 #ifdef CONFIG_SMP
3314 /**
3315  * __alloc_percpu - allocate one copy of the object for every present
3316  * cpu in the system, zeroing them.
3317  * Objects should be dereferenced using the per_cpu_ptr macro only.
3318  *
3319  * @size: how many bytes of memory are required.
3320  */
3321 void *__alloc_percpu(size_t size)
3322 {
3323 	int i;
3324 	struct percpu_data *pdata = kmalloc(sizeof(*pdata), GFP_KERNEL);
3325 
3326 	if (!pdata)
3327 		return NULL;
3328 
3329 	/*
3330 	 * Cannot use for_each_online_cpu since a cpu may come online
3331 	 * and we have no way of figuring out how to fix the array
3332 	 * that we have allocated then....
3333 	 */
3334 	for_each_possible_cpu(i) {
3335 		int node = cpu_to_node(i);
3336 
3337 		if (node_online(node))
3338 			pdata->ptrs[i] = kmalloc_node(size, GFP_KERNEL, node);
3339 		else
3340 			pdata->ptrs[i] = kmalloc(size, GFP_KERNEL);
3341 
3342 		if (!pdata->ptrs[i])
3343 			goto unwind_oom;
3344 		memset(pdata->ptrs[i], 0, size);
3345 	}
3346 
3347 	/* Catch derefs w/o wrappers */
3348 	return (void *)(~(unsigned long)pdata);
3349 
3350 unwind_oom:
3351 	while (--i >= 0) {
3352 		if (!cpu_possible(i))
3353 			continue;
3354 		kfree(pdata->ptrs[i]);
3355 	}
3356 	kfree(pdata);
3357 	return NULL;
3358 }
3359 EXPORT_SYMBOL(__alloc_percpu);
3360 #endif
3361 
3362 /**
3363  * kmem_cache_free - Deallocate an object
3364  * @cachep: The cache the allocation was from.
3365  * @objp: The previously allocated object.
3366  *
3367  * Free an object which was previously allocated from this
3368  * cache.
3369  */
3370 void kmem_cache_free(struct kmem_cache *cachep, void *objp)
3371 {
3372 	unsigned long flags;
3373 
3374 	BUG_ON(virt_to_cache(objp) != cachep);
3375 
3376 	local_irq_save(flags);
3377 	__cache_free(cachep, objp);
3378 	local_irq_restore(flags);
3379 }
3380 EXPORT_SYMBOL(kmem_cache_free);
3381 
3382 /**
3383  * kfree - free previously allocated memory
3384  * @objp: pointer returned by kmalloc.
3385  *
3386  * If @objp is NULL, no operation is performed.
3387  *
3388  * Don't free memory not originally allocated by kmalloc()
3389  * or you will run into trouble.
3390  */
3391 void kfree(const void *objp)
3392 {
3393 	struct kmem_cache *c;
3394 	unsigned long flags;
3395 
3396 	if (unlikely(!objp))
3397 		return;
3398 	local_irq_save(flags);
3399 	kfree_debugcheck(objp);
3400 	c = virt_to_cache(objp);
3401 	debug_check_no_locks_freed(objp, obj_size(c));
3402 	__cache_free(c, (void *)objp);
3403 	local_irq_restore(flags);
3404 }
3405 EXPORT_SYMBOL(kfree);
3406 
3407 #ifdef CONFIG_SMP
3408 /**
3409  * free_percpu - free previously allocated percpu memory
3410  * @objp: pointer returned by alloc_percpu.
3411  *
3412  * Don't free memory not originally allocated by alloc_percpu()
3413  * The complemented objp is to check for that.
3414  */
3415 void free_percpu(const void *objp)
3416 {
3417 	int i;
3418 	struct percpu_data *p = (struct percpu_data *)(~(unsigned long)objp);
3419 
3420 	/*
3421 	 * We allocate for all cpus so we cannot use for online cpu here.
3422 	 */
3423 	for_each_possible_cpu(i)
3424 	    kfree(p->ptrs[i]);
3425 	kfree(p);
3426 }
3427 EXPORT_SYMBOL(free_percpu);
3428 #endif
3429 
3430 unsigned int kmem_cache_size(struct kmem_cache *cachep)
3431 {
3432 	return obj_size(cachep);
3433 }
3434 EXPORT_SYMBOL(kmem_cache_size);
3435 
3436 const char *kmem_cache_name(struct kmem_cache *cachep)
3437 {
3438 	return cachep->name;
3439 }
3440 EXPORT_SYMBOL_GPL(kmem_cache_name);
3441 
3442 /*
3443  * This initializes kmem_list3 or resizes varioius caches for all nodes.
3444  */
3445 static int alloc_kmemlist(struct kmem_cache *cachep)
3446 {
3447 	int node;
3448 	struct kmem_list3 *l3;
3449 	struct array_cache *new_shared;
3450 	struct array_cache **new_alien;
3451 
3452 	for_each_online_node(node) {
3453 
3454 		new_alien = alloc_alien_cache(node, cachep->limit);
3455 		if (!new_alien)
3456 			goto fail;
3457 
3458 		new_shared = alloc_arraycache(node,
3459 				cachep->shared*cachep->batchcount,
3460 					0xbaadf00d);
3461 		if (!new_shared) {
3462 			free_alien_cache(new_alien);
3463 			goto fail;
3464 		}
3465 
3466 		l3 = cachep->nodelists[node];
3467 		if (l3) {
3468 			struct array_cache *shared = l3->shared;
3469 
3470 			spin_lock_irq(&l3->list_lock);
3471 
3472 			if (shared)
3473 				free_block(cachep, shared->entry,
3474 						shared->avail, node);
3475 
3476 			l3->shared = new_shared;
3477 			if (!l3->alien) {
3478 				l3->alien = new_alien;
3479 				new_alien = NULL;
3480 			}
3481 			l3->free_limit = (1 + nr_cpus_node(node)) *
3482 					cachep->batchcount + cachep->num;
3483 			spin_unlock_irq(&l3->list_lock);
3484 			kfree(shared);
3485 			free_alien_cache(new_alien);
3486 			continue;
3487 		}
3488 		l3 = kmalloc_node(sizeof(struct kmem_list3), GFP_KERNEL, node);
3489 		if (!l3) {
3490 			free_alien_cache(new_alien);
3491 			kfree(new_shared);
3492 			goto fail;
3493 		}
3494 
3495 		kmem_list3_init(l3);
3496 		l3->next_reap = jiffies + REAPTIMEOUT_LIST3 +
3497 				((unsigned long)cachep) % REAPTIMEOUT_LIST3;
3498 		l3->shared = new_shared;
3499 		l3->alien = new_alien;
3500 		l3->free_limit = (1 + nr_cpus_node(node)) *
3501 					cachep->batchcount + cachep->num;
3502 		cachep->nodelists[node] = l3;
3503 	}
3504 	return 0;
3505 
3506 fail:
3507 	if (!cachep->next.next) {
3508 		/* Cache is not active yet. Roll back what we did */
3509 		node--;
3510 		while (node >= 0) {
3511 			if (cachep->nodelists[node]) {
3512 				l3 = cachep->nodelists[node];
3513 
3514 				kfree(l3->shared);
3515 				free_alien_cache(l3->alien);
3516 				kfree(l3);
3517 				cachep->nodelists[node] = NULL;
3518 			}
3519 			node--;
3520 		}
3521 	}
3522 	return -ENOMEM;
3523 }
3524 
3525 struct ccupdate_struct {
3526 	struct kmem_cache *cachep;
3527 	struct array_cache *new[NR_CPUS];
3528 };
3529 
3530 static void do_ccupdate_local(void *info)
3531 {
3532 	struct ccupdate_struct *new = info;
3533 	struct array_cache *old;
3534 
3535 	check_irq_off();
3536 	old = cpu_cache_get(new->cachep);
3537 
3538 	new->cachep->array[smp_processor_id()] = new->new[smp_processor_id()];
3539 	new->new[smp_processor_id()] = old;
3540 }
3541 
3542 /* Always called with the cache_chain_mutex held */
3543 static int do_tune_cpucache(struct kmem_cache *cachep, int limit,
3544 				int batchcount, int shared)
3545 {
3546 	struct ccupdate_struct new;
3547 	int i, err;
3548 
3549 	memset(&new.new, 0, sizeof(new.new));
3550 	for_each_online_cpu(i) {
3551 		new.new[i] = alloc_arraycache(cpu_to_node(i), limit,
3552 						batchcount);
3553 		if (!new.new[i]) {
3554 			for (i--; i >= 0; i--)
3555 				kfree(new.new[i]);
3556 			return -ENOMEM;
3557 		}
3558 	}
3559 	new.cachep = cachep;
3560 
3561 	on_each_cpu(do_ccupdate_local, (void *)&new, 1, 1);
3562 
3563 	check_irq_on();
3564 	cachep->batchcount = batchcount;
3565 	cachep->limit = limit;
3566 	cachep->shared = shared;
3567 
3568 	for_each_online_cpu(i) {
3569 		struct array_cache *ccold = new.new[i];
3570 		if (!ccold)
3571 			continue;
3572 		spin_lock_irq(&cachep->nodelists[cpu_to_node(i)]->list_lock);
3573 		free_block(cachep, ccold->entry, ccold->avail, cpu_to_node(i));
3574 		spin_unlock_irq(&cachep->nodelists[cpu_to_node(i)]->list_lock);
3575 		kfree(ccold);
3576 	}
3577 
3578 	err = alloc_kmemlist(cachep);
3579 	if (err) {
3580 		printk(KERN_ERR "alloc_kmemlist failed for %s, error %d.\n",
3581 		       cachep->name, -err);
3582 		BUG();
3583 	}
3584 	return 0;
3585 }
3586 
3587 /* Called with cache_chain_mutex held always */
3588 static void enable_cpucache(struct kmem_cache *cachep)
3589 {
3590 	int err;
3591 	int limit, shared;
3592 
3593 	/*
3594 	 * The head array serves three purposes:
3595 	 * - create a LIFO ordering, i.e. return objects that are cache-warm
3596 	 * - reduce the number of spinlock operations.
3597 	 * - reduce the number of linked list operations on the slab and
3598 	 *   bufctl chains: array operations are cheaper.
3599 	 * The numbers are guessed, we should auto-tune as described by
3600 	 * Bonwick.
3601 	 */
3602 	if (cachep->buffer_size > 131072)
3603 		limit = 1;
3604 	else if (cachep->buffer_size > PAGE_SIZE)
3605 		limit = 8;
3606 	else if (cachep->buffer_size > 1024)
3607 		limit = 24;
3608 	else if (cachep->buffer_size > 256)
3609 		limit = 54;
3610 	else
3611 		limit = 120;
3612 
3613 	/*
3614 	 * CPU bound tasks (e.g. network routing) can exhibit cpu bound
3615 	 * allocation behaviour: Most allocs on one cpu, most free operations
3616 	 * on another cpu. For these cases, an efficient object passing between
3617 	 * cpus is necessary. This is provided by a shared array. The array
3618 	 * replaces Bonwick's magazine layer.
3619 	 * On uniprocessor, it's functionally equivalent (but less efficient)
3620 	 * to a larger limit. Thus disabled by default.
3621 	 */
3622 	shared = 0;
3623 #ifdef CONFIG_SMP
3624 	if (cachep->buffer_size <= PAGE_SIZE)
3625 		shared = 8;
3626 #endif
3627 
3628 #if DEBUG
3629 	/*
3630 	 * With debugging enabled, large batchcount lead to excessively long
3631 	 * periods with disabled local interrupts. Limit the batchcount
3632 	 */
3633 	if (limit > 32)
3634 		limit = 32;
3635 #endif
3636 	err = do_tune_cpucache(cachep, limit, (limit + 1) / 2, shared);
3637 	if (err)
3638 		printk(KERN_ERR "enable_cpucache failed for %s, error %d.\n",
3639 		       cachep->name, -err);
3640 }
3641 
3642 /*
3643  * Drain an array if it contains any elements taking the l3 lock only if
3644  * necessary. Note that the l3 listlock also protects the array_cache
3645  * if drain_array() is used on the shared array.
3646  */
3647 void drain_array(struct kmem_cache *cachep, struct kmem_list3 *l3,
3648 			 struct array_cache *ac, int force, int node)
3649 {
3650 	int tofree;
3651 
3652 	if (!ac || !ac->avail)
3653 		return;
3654 	if (ac->touched && !force) {
3655 		ac->touched = 0;
3656 	} else {
3657 		spin_lock_irq(&l3->list_lock);
3658 		if (ac->avail) {
3659 			tofree = force ? ac->avail : (ac->limit + 4) / 5;
3660 			if (tofree > ac->avail)
3661 				tofree = (ac->avail + 1) / 2;
3662 			free_block(cachep, ac->entry, tofree, node);
3663 			ac->avail -= tofree;
3664 			memmove(ac->entry, &(ac->entry[tofree]),
3665 				sizeof(void *) * ac->avail);
3666 		}
3667 		spin_unlock_irq(&l3->list_lock);
3668 	}
3669 }
3670 
3671 /**
3672  * cache_reap - Reclaim memory from caches.
3673  * @unused: unused parameter
3674  *
3675  * Called from workqueue/eventd every few seconds.
3676  * Purpose:
3677  * - clear the per-cpu caches for this CPU.
3678  * - return freeable pages to the main free memory pool.
3679  *
3680  * If we cannot acquire the cache chain mutex then just give up - we'll try
3681  * again on the next iteration.
3682  */
3683 static void cache_reap(void *unused)
3684 {
3685 	struct kmem_cache *searchp;
3686 	struct kmem_list3 *l3;
3687 	int node = numa_node_id();
3688 
3689 	if (!mutex_trylock(&cache_chain_mutex)) {
3690 		/* Give up. Setup the next iteration. */
3691 		schedule_delayed_work(&__get_cpu_var(reap_work),
3692 				      REAPTIMEOUT_CPUC);
3693 		return;
3694 	}
3695 
3696 	list_for_each_entry(searchp, &cache_chain, next) {
3697 		struct list_head *p;
3698 		int tofree;
3699 		struct slab *slabp;
3700 
3701 		check_irq_on();
3702 
3703 		/*
3704 		 * We only take the l3 lock if absolutely necessary and we
3705 		 * have established with reasonable certainty that
3706 		 * we can do some work if the lock was obtained.
3707 		 */
3708 		l3 = searchp->nodelists[node];
3709 
3710 		reap_alien(searchp, l3);
3711 
3712 		drain_array(searchp, l3, cpu_cache_get(searchp), 0, node);
3713 
3714 		/*
3715 		 * These are racy checks but it does not matter
3716 		 * if we skip one check or scan twice.
3717 		 */
3718 		if (time_after(l3->next_reap, jiffies))
3719 			goto next;
3720 
3721 		l3->next_reap = jiffies + REAPTIMEOUT_LIST3;
3722 
3723 		drain_array(searchp, l3, l3->shared, 0, node);
3724 
3725 		if (l3->free_touched) {
3726 			l3->free_touched = 0;
3727 			goto next;
3728 		}
3729 
3730 		tofree = (l3->free_limit + 5 * searchp->num - 1) /
3731 				(5 * searchp->num);
3732 		do {
3733 			/*
3734 			 * Do not lock if there are no free blocks.
3735 			 */
3736 			if (list_empty(&l3->slabs_free))
3737 				break;
3738 
3739 			spin_lock_irq(&l3->list_lock);
3740 			p = l3->slabs_free.next;
3741 			if (p == &(l3->slabs_free)) {
3742 				spin_unlock_irq(&l3->list_lock);
3743 				break;
3744 			}
3745 
3746 			slabp = list_entry(p, struct slab, list);
3747 			BUG_ON(slabp->inuse);
3748 			list_del(&slabp->list);
3749 			STATS_INC_REAPED(searchp);
3750 
3751 			/*
3752 			 * Safe to drop the lock. The slab is no longer linked
3753 			 * to the cache. searchp cannot disappear, we hold
3754 			 * cache_chain_lock
3755 			 */
3756 			l3->free_objects -= searchp->num;
3757 			spin_unlock_irq(&l3->list_lock);
3758 			slab_destroy(searchp, slabp);
3759 		} while (--tofree > 0);
3760 next:
3761 		cond_resched();
3762 	}
3763 	check_irq_on();
3764 	mutex_unlock(&cache_chain_mutex);
3765 	next_reap_node();
3766 	/* Set up the next iteration */
3767 	schedule_delayed_work(&__get_cpu_var(reap_work), REAPTIMEOUT_CPUC);
3768 }
3769 
3770 #ifdef CONFIG_PROC_FS
3771 
3772 static void print_slabinfo_header(struct seq_file *m)
3773 {
3774 	/*
3775 	 * Output format version, so at least we can change it
3776 	 * without _too_ many complaints.
3777 	 */
3778 #if STATS
3779 	seq_puts(m, "slabinfo - version: 2.1 (statistics)\n");
3780 #else
3781 	seq_puts(m, "slabinfo - version: 2.1\n");
3782 #endif
3783 	seq_puts(m, "# name            <active_objs> <num_objs> <objsize> "
3784 		 "<objperslab> <pagesperslab>");
3785 	seq_puts(m, " : tunables <limit> <batchcount> <sharedfactor>");
3786 	seq_puts(m, " : slabdata <active_slabs> <num_slabs> <sharedavail>");
3787 #if STATS
3788 	seq_puts(m, " : globalstat <listallocs> <maxobjs> <grown> <reaped> "
3789 		 "<error> <maxfreeable> <nodeallocs> <remotefrees> <alienoverflow>");
3790 	seq_puts(m, " : cpustat <allochit> <allocmiss> <freehit> <freemiss>");
3791 #endif
3792 	seq_putc(m, '\n');
3793 }
3794 
3795 static void *s_start(struct seq_file *m, loff_t *pos)
3796 {
3797 	loff_t n = *pos;
3798 	struct list_head *p;
3799 
3800 	mutex_lock(&cache_chain_mutex);
3801 	if (!n)
3802 		print_slabinfo_header(m);
3803 	p = cache_chain.next;
3804 	while (n--) {
3805 		p = p->next;
3806 		if (p == &cache_chain)
3807 			return NULL;
3808 	}
3809 	return list_entry(p, struct kmem_cache, next);
3810 }
3811 
3812 static void *s_next(struct seq_file *m, void *p, loff_t *pos)
3813 {
3814 	struct kmem_cache *cachep = p;
3815 	++*pos;
3816 	return cachep->next.next == &cache_chain ?
3817 		NULL : list_entry(cachep->next.next, struct kmem_cache, next);
3818 }
3819 
3820 static void s_stop(struct seq_file *m, void *p)
3821 {
3822 	mutex_unlock(&cache_chain_mutex);
3823 }
3824 
3825 static int s_show(struct seq_file *m, void *p)
3826 {
3827 	struct kmem_cache *cachep = p;
3828 	struct slab *slabp;
3829 	unsigned long active_objs;
3830 	unsigned long num_objs;
3831 	unsigned long active_slabs = 0;
3832 	unsigned long num_slabs, free_objects = 0, shared_avail = 0;
3833 	const char *name;
3834 	char *error = NULL;
3835 	int node;
3836 	struct kmem_list3 *l3;
3837 
3838 	active_objs = 0;
3839 	num_slabs = 0;
3840 	for_each_online_node(node) {
3841 		l3 = cachep->nodelists[node];
3842 		if (!l3)
3843 			continue;
3844 
3845 		check_irq_on();
3846 		spin_lock_irq(&l3->list_lock);
3847 
3848 		list_for_each_entry(slabp, &l3->slabs_full, list) {
3849 			if (slabp->inuse != cachep->num && !error)
3850 				error = "slabs_full accounting error";
3851 			active_objs += cachep->num;
3852 			active_slabs++;
3853 		}
3854 		list_for_each_entry(slabp, &l3->slabs_partial, list) {
3855 			if (slabp->inuse == cachep->num && !error)
3856 				error = "slabs_partial inuse accounting error";
3857 			if (!slabp->inuse && !error)
3858 				error = "slabs_partial/inuse accounting error";
3859 			active_objs += slabp->inuse;
3860 			active_slabs++;
3861 		}
3862 		list_for_each_entry(slabp, &l3->slabs_free, list) {
3863 			if (slabp->inuse && !error)
3864 				error = "slabs_free/inuse accounting error";
3865 			num_slabs++;
3866 		}
3867 		free_objects += l3->free_objects;
3868 		if (l3->shared)
3869 			shared_avail += l3->shared->avail;
3870 
3871 		spin_unlock_irq(&l3->list_lock);
3872 	}
3873 	num_slabs += active_slabs;
3874 	num_objs = num_slabs * cachep->num;
3875 	if (num_objs - active_objs != free_objects && !error)
3876 		error = "free_objects accounting error";
3877 
3878 	name = cachep->name;
3879 	if (error)
3880 		printk(KERN_ERR "slab: cache %s error: %s\n", name, error);
3881 
3882 	seq_printf(m, "%-17s %6lu %6lu %6u %4u %4d",
3883 		   name, active_objs, num_objs, cachep->buffer_size,
3884 		   cachep->num, (1 << cachep->gfporder));
3885 	seq_printf(m, " : tunables %4u %4u %4u",
3886 		   cachep->limit, cachep->batchcount, cachep->shared);
3887 	seq_printf(m, " : slabdata %6lu %6lu %6lu",
3888 		   active_slabs, num_slabs, shared_avail);
3889 #if STATS
3890 	{			/* list3 stats */
3891 		unsigned long high = cachep->high_mark;
3892 		unsigned long allocs = cachep->num_allocations;
3893 		unsigned long grown = cachep->grown;
3894 		unsigned long reaped = cachep->reaped;
3895 		unsigned long errors = cachep->errors;
3896 		unsigned long max_freeable = cachep->max_freeable;
3897 		unsigned long node_allocs = cachep->node_allocs;
3898 		unsigned long node_frees = cachep->node_frees;
3899 		unsigned long overflows = cachep->node_overflow;
3900 
3901 		seq_printf(m, " : globalstat %7lu %6lu %5lu %4lu \
3902 				%4lu %4lu %4lu %4lu %4lu", allocs, high, grown,
3903 				reaped, errors, max_freeable, node_allocs,
3904 				node_frees, overflows);
3905 	}
3906 	/* cpu stats */
3907 	{
3908 		unsigned long allochit = atomic_read(&cachep->allochit);
3909 		unsigned long allocmiss = atomic_read(&cachep->allocmiss);
3910 		unsigned long freehit = atomic_read(&cachep->freehit);
3911 		unsigned long freemiss = atomic_read(&cachep->freemiss);
3912 
3913 		seq_printf(m, " : cpustat %6lu %6lu %6lu %6lu",
3914 			   allochit, allocmiss, freehit, freemiss);
3915 	}
3916 #endif
3917 	seq_putc(m, '\n');
3918 	return 0;
3919 }
3920 
3921 /*
3922  * slabinfo_op - iterator that generates /proc/slabinfo
3923  *
3924  * Output layout:
3925  * cache-name
3926  * num-active-objs
3927  * total-objs
3928  * object size
3929  * num-active-slabs
3930  * total-slabs
3931  * num-pages-per-slab
3932  * + further values on SMP and with statistics enabled
3933  */
3934 
3935 struct seq_operations slabinfo_op = {
3936 	.start = s_start,
3937 	.next = s_next,
3938 	.stop = s_stop,
3939 	.show = s_show,
3940 };
3941 
3942 #define MAX_SLABINFO_WRITE 128
3943 /**
3944  * slabinfo_write - Tuning for the slab allocator
3945  * @file: unused
3946  * @buffer: user buffer
3947  * @count: data length
3948  * @ppos: unused
3949  */
3950 ssize_t slabinfo_write(struct file *file, const char __user * buffer,
3951 		       size_t count, loff_t *ppos)
3952 {
3953 	char kbuf[MAX_SLABINFO_WRITE + 1], *tmp;
3954 	int limit, batchcount, shared, res;
3955 	struct kmem_cache *cachep;
3956 
3957 	if (count > MAX_SLABINFO_WRITE)
3958 		return -EINVAL;
3959 	if (copy_from_user(&kbuf, buffer, count))
3960 		return -EFAULT;
3961 	kbuf[MAX_SLABINFO_WRITE] = '\0';
3962 
3963 	tmp = strchr(kbuf, ' ');
3964 	if (!tmp)
3965 		return -EINVAL;
3966 	*tmp = '\0';
3967 	tmp++;
3968 	if (sscanf(tmp, " %d %d %d", &limit, &batchcount, &shared) != 3)
3969 		return -EINVAL;
3970 
3971 	/* Find the cache in the chain of caches. */
3972 	mutex_lock(&cache_chain_mutex);
3973 	res = -EINVAL;
3974 	list_for_each_entry(cachep, &cache_chain, next) {
3975 		if (!strcmp(cachep->name, kbuf)) {
3976 			if (limit < 1 || batchcount < 1 ||
3977 					batchcount > limit || shared < 0) {
3978 				res = 0;
3979 			} else {
3980 				res = do_tune_cpucache(cachep, limit,
3981 						       batchcount, shared);
3982 			}
3983 			break;
3984 		}
3985 	}
3986 	mutex_unlock(&cache_chain_mutex);
3987 	if (res >= 0)
3988 		res = count;
3989 	return res;
3990 }
3991 
3992 #ifdef CONFIG_DEBUG_SLAB_LEAK
3993 
3994 static void *leaks_start(struct seq_file *m, loff_t *pos)
3995 {
3996 	loff_t n = *pos;
3997 	struct list_head *p;
3998 
3999 	mutex_lock(&cache_chain_mutex);
4000 	p = cache_chain.next;
4001 	while (n--) {
4002 		p = p->next;
4003 		if (p == &cache_chain)
4004 			return NULL;
4005 	}
4006 	return list_entry(p, struct kmem_cache, next);
4007 }
4008 
4009 static inline int add_caller(unsigned long *n, unsigned long v)
4010 {
4011 	unsigned long *p;
4012 	int l;
4013 	if (!v)
4014 		return 1;
4015 	l = n[1];
4016 	p = n + 2;
4017 	while (l) {
4018 		int i = l/2;
4019 		unsigned long *q = p + 2 * i;
4020 		if (*q == v) {
4021 			q[1]++;
4022 			return 1;
4023 		}
4024 		if (*q > v) {
4025 			l = i;
4026 		} else {
4027 			p = q + 2;
4028 			l -= i + 1;
4029 		}
4030 	}
4031 	if (++n[1] == n[0])
4032 		return 0;
4033 	memmove(p + 2, p, n[1] * 2 * sizeof(unsigned long) - ((void *)p - (void *)n));
4034 	p[0] = v;
4035 	p[1] = 1;
4036 	return 1;
4037 }
4038 
4039 static void handle_slab(unsigned long *n, struct kmem_cache *c, struct slab *s)
4040 {
4041 	void *p;
4042 	int i;
4043 	if (n[0] == n[1])
4044 		return;
4045 	for (i = 0, p = s->s_mem; i < c->num; i++, p += c->buffer_size) {
4046 		if (slab_bufctl(s)[i] != BUFCTL_ACTIVE)
4047 			continue;
4048 		if (!add_caller(n, (unsigned long)*dbg_userword(c, p)))
4049 			return;
4050 	}
4051 }
4052 
4053 static void show_symbol(struct seq_file *m, unsigned long address)
4054 {
4055 #ifdef CONFIG_KALLSYMS
4056 	char *modname;
4057 	const char *name;
4058 	unsigned long offset, size;
4059 	char namebuf[KSYM_NAME_LEN+1];
4060 
4061 	name = kallsyms_lookup(address, &size, &offset, &modname, namebuf);
4062 
4063 	if (name) {
4064 		seq_printf(m, "%s+%#lx/%#lx", name, offset, size);
4065 		if (modname)
4066 			seq_printf(m, " [%s]", modname);
4067 		return;
4068 	}
4069 #endif
4070 	seq_printf(m, "%p", (void *)address);
4071 }
4072 
4073 static int leaks_show(struct seq_file *m, void *p)
4074 {
4075 	struct kmem_cache *cachep = p;
4076 	struct slab *slabp;
4077 	struct kmem_list3 *l3;
4078 	const char *name;
4079 	unsigned long *n = m->private;
4080 	int node;
4081 	int i;
4082 
4083 	if (!(cachep->flags & SLAB_STORE_USER))
4084 		return 0;
4085 	if (!(cachep->flags & SLAB_RED_ZONE))
4086 		return 0;
4087 
4088 	/* OK, we can do it */
4089 
4090 	n[1] = 0;
4091 
4092 	for_each_online_node(node) {
4093 		l3 = cachep->nodelists[node];
4094 		if (!l3)
4095 			continue;
4096 
4097 		check_irq_on();
4098 		spin_lock_irq(&l3->list_lock);
4099 
4100 		list_for_each_entry(slabp, &l3->slabs_full, list)
4101 			handle_slab(n, cachep, slabp);
4102 		list_for_each_entry(slabp, &l3->slabs_partial, list)
4103 			handle_slab(n, cachep, slabp);
4104 		spin_unlock_irq(&l3->list_lock);
4105 	}
4106 	name = cachep->name;
4107 	if (n[0] == n[1]) {
4108 		/* Increase the buffer size */
4109 		mutex_unlock(&cache_chain_mutex);
4110 		m->private = kzalloc(n[0] * 4 * sizeof(unsigned long), GFP_KERNEL);
4111 		if (!m->private) {
4112 			/* Too bad, we are really out */
4113 			m->private = n;
4114 			mutex_lock(&cache_chain_mutex);
4115 			return -ENOMEM;
4116 		}
4117 		*(unsigned long *)m->private = n[0] * 2;
4118 		kfree(n);
4119 		mutex_lock(&cache_chain_mutex);
4120 		/* Now make sure this entry will be retried */
4121 		m->count = m->size;
4122 		return 0;
4123 	}
4124 	for (i = 0; i < n[1]; i++) {
4125 		seq_printf(m, "%s: %lu ", name, n[2*i+3]);
4126 		show_symbol(m, n[2*i+2]);
4127 		seq_putc(m, '\n');
4128 	}
4129 	return 0;
4130 }
4131 
4132 struct seq_operations slabstats_op = {
4133 	.start = leaks_start,
4134 	.next = s_next,
4135 	.stop = s_stop,
4136 	.show = leaks_show,
4137 };
4138 #endif
4139 #endif
4140 
4141 /**
4142  * ksize - get the actual amount of memory allocated for a given object
4143  * @objp: Pointer to the object
4144  *
4145  * kmalloc may internally round up allocations and return more memory
4146  * than requested. ksize() can be used to determine the actual amount of
4147  * memory allocated. The caller may use this additional memory, even though
4148  * a smaller amount of memory was initially specified with the kmalloc call.
4149  * The caller must guarantee that objp points to a valid object previously
4150  * allocated with either kmalloc() or kmem_cache_alloc(). The object
4151  * must not be freed during the duration of the call.
4152  */
4153 unsigned int ksize(const void *objp)
4154 {
4155 	if (unlikely(objp == NULL))
4156 		return 0;
4157 
4158 	return obj_size(virt_to_cache(objp));
4159 }
4160