xref: /openbmc/linux/mm/slab.c (revision 37569287cba1246a5057de32ac42d6c8941c714b)
1  // SPDX-License-Identifier: GPL-2.0
2  /*
3   * linux/mm/slab.c
4   * Written by Mark Hemment, 1996/97.
5   * (markhe@nextd.demon.co.uk)
6   *
7   * kmem_cache_destroy() + some cleanup - 1999 Andrea Arcangeli
8   *
9   * Major cleanup, different bufctl logic, per-cpu arrays
10   *	(c) 2000 Manfred Spraul
11   *
12   * Cleanup, make the head arrays unconditional, preparation for NUMA
13   * 	(c) 2002 Manfred Spraul
14   *
15   * An implementation of the Slab Allocator as described in outline in;
16   *	UNIX Internals: The New Frontiers by Uresh Vahalia
17   *	Pub: Prentice Hall	ISBN 0-13-101908-2
18   * or with a little more detail in;
19   *	The Slab Allocator: An Object-Caching Kernel Memory Allocator
20   *	Jeff Bonwick (Sun Microsystems).
21   *	Presented at: USENIX Summer 1994 Technical Conference
22   *
23   * The memory is organized in caches, one cache for each object type.
24   * (e.g. inode_cache, dentry_cache, buffer_head, vm_area_struct)
25   * Each cache consists of many slabs (they are small (usually one
26   * page long) and always contiguous), and each slab contains multiple
27   * initialized objects.
28   *
29   * This means, that your constructor is used only for newly allocated
30   * slabs and you must pass objects with the same initializations to
31   * kmem_cache_free.
32   *
33   * Each cache can only support one memory type (GFP_DMA, GFP_HIGHMEM,
34   * normal). If you need a special memory type, then you must create a new
35   * cache for that memory type.
36   *
37   * In order to reduce fragmentation, the slabs are sorted in 3 groups:
38   *   full slabs with 0 free objects
39   *   partial slabs
40   *   empty slabs with no allocated objects
41   *
42   * If partial slabs exist, then new allocations come from these slabs,
43   * otherwise from empty slabs or new slabs are allocated.
44   *
45   * kmem_cache_destroy() CAN CRASH if you try to allocate from the cache
46   * during kmem_cache_destroy(). The caller must prevent concurrent allocs.
47   *
48   * Each cache has a short per-cpu head array, most allocs
49   * and frees go into that array, and if that array overflows, then 1/2
50   * of the entries in the array are given back into the global cache.
51   * The head array is strictly LIFO and should improve the cache hit rates.
52   * On SMP, it additionally reduces the spinlock operations.
53   *
54   * The c_cpuarray may not be read with enabled local interrupts -
55   * it's changed with a smp_call_function().
56   *
57   * SMP synchronization:
58   *  constructors and destructors are called without any locking.
59   *  Several members in struct kmem_cache and struct slab never change, they
60   *	are accessed without any locking.
61   *  The per-cpu arrays are never accessed from the wrong cpu, no locking,
62   *  	and local interrupts are disabled so slab code is preempt-safe.
63   *  The non-constant members are protected with a per-cache irq spinlock.
64   *
65   * Many thanks to Mark Hemment, who wrote another per-cpu slab patch
66   * in 2000 - many ideas in the current implementation are derived from
67   * his patch.
68   *
69   * Further notes from the original documentation:
70   *
71   * 11 April '97.  Started multi-threading - markhe
72   *	The global cache-chain is protected by the mutex 'slab_mutex'.
73   *	The sem is only needed when accessing/extending the cache-chain, which
74   *	can never happen inside an interrupt (kmem_cache_create(),
75   *	kmem_cache_shrink() and kmem_cache_reap()).
76   *
77   *	At present, each engine can be growing a cache.  This should be blocked.
78   *
79   * 15 March 2005. NUMA slab allocator.
80   *	Shai Fultheim <shai@scalex86.org>.
81   *	Shobhit Dayal <shobhit@calsoftinc.com>
82   *	Alok N Kataria <alokk@calsoftinc.com>
83   *	Christoph Lameter <christoph@lameter.com>
84   *
85   *	Modified the slab allocator to be node aware on NUMA systems.
86   *	Each node has its own list of partial, free and full slabs.
87   *	All object allocations for a node occur from node specific slab lists.
88   */
89  
90  #include	<linux/slab.h>
91  #include	<linux/mm.h>
92  #include	<linux/poison.h>
93  #include	<linux/swap.h>
94  #include	<linux/cache.h>
95  #include	<linux/interrupt.h>
96  #include	<linux/init.h>
97  #include	<linux/compiler.h>
98  #include	<linux/cpuset.h>
99  #include	<linux/proc_fs.h>
100  #include	<linux/seq_file.h>
101  #include	<linux/notifier.h>
102  #include	<linux/kallsyms.h>
103  #include	<linux/kfence.h>
104  #include	<linux/cpu.h>
105  #include	<linux/sysctl.h>
106  #include	<linux/module.h>
107  #include	<linux/rcupdate.h>
108  #include	<linux/string.h>
109  #include	<linux/uaccess.h>
110  #include	<linux/nodemask.h>
111  #include	<linux/kmemleak.h>
112  #include	<linux/mempolicy.h>
113  #include	<linux/mutex.h>
114  #include	<linux/fault-inject.h>
115  #include	<linux/rtmutex.h>
116  #include	<linux/reciprocal_div.h>
117  #include	<linux/debugobjects.h>
118  #include	<linux/memory.h>
119  #include	<linux/prefetch.h>
120  #include	<linux/sched/task_stack.h>
121  
122  #include	<net/sock.h>
123  
124  #include	<asm/cacheflush.h>
125  #include	<asm/tlbflush.h>
126  #include	<asm/page.h>
127  
128  #include <trace/events/kmem.h>
129  
130  #include	"internal.h"
131  
132  #include	"slab.h"
133  
/*
 * Compile-time switches, all three driven by CONFIG_DEBUG_SLAB:
 *
 * DEBUG	- 1: kmem_cache_create() honours SLAB_RED_ZONE and SLAB_POISON.
 *		  0: faster, smaller code (especially in the critical paths).
 *
 * STATS	- 1: collect statistics for /proc/slabinfo.
 *		  0: faster, smaller code (especially in the critical paths).
 *
 * FORCED_DEBUG	- 1: enables SLAB_RED_ZONE and SLAB_POISON (if possible).
 */

#if defined(CONFIG_DEBUG_SLAB)
# define DEBUG		1
# define STATS		1
# define FORCED_DEBUG	1
#else
# define DEBUG		0
# define STATS		0
# define FORCED_DEBUG	0
#endif
153  
/* Shouldn't this be in a header file somewhere? */
/* Size of one machine word, taken as the size of a pointer. */
#define	BYTES_PER_WORD		sizeof(void *)
/* The larger of the word size and the alignment of unsigned long long. */
#define	REDZONE_ALIGN		max(BYTES_PER_WORD, __alignof__(unsigned long long))

/* Fallback used when nothing else has defined ARCH_KMALLOC_FLAGS. */
#ifndef ARCH_KMALLOC_FLAGS
#define ARCH_KMALLOC_FLAGS SLAB_HWCACHE_ALIGN
#endif

/*
 * Evaluates to 1 when (PAGE_SIZE >> BITS_PER_BYTE) does not exceed
 * SLAB_OBJ_MIN_SIZE, and to 0 otherwise. It selects the width of
 * freelist_idx_t below.
 */
#define FREELIST_BYTE_INDEX (((PAGE_SIZE >> BITS_PER_BYTE) \
				<= SLAB_OBJ_MIN_SIZE) ? 1 : 0)

/* Index type stored in a slab's freelist: one byte if it suffices, else two. */
#if FREELIST_BYTE_INDEX
typedef unsigned char freelist_idx_t;
#else
typedef unsigned short freelist_idx_t;
#endif

/* Largest value a freelist_idx_t can hold (all of its bits set). */
#define SLAB_OBJ_MAX_NUM ((1 << sizeof(freelist_idx_t) * BITS_PER_BYTE) - 1)
172  
/*
 * struct array_cache
 *
 * Purpose:
 * - LIFO ordering, to hand out cache-warm objects from _alloc
 * - reduce the number of linked list operations
 * - reduce spinlock operations
 *
 * The limit is stored in the per-cpu structure to reduce the data cache
 * footprint.
 *
 * The structure is allocated with room for the entry[] pointers directly
 * behind the header (see alloc_arraycache()).
 */
struct array_cache {
	unsigned int avail;	/* number of object pointers held in entry[] */
	unsigned int limit;	/* capacity; avail == limit means it is full */
	unsigned int batchcount; /* NOTE(review): presumably the refill/flush
				  * chunk size; not used in the code shown here
				  */
	unsigned int touched;	/* NOTE(review): presumably a recent-use flag
				 * for the reaper; confirm against its users
				 */
	void *entry[];	/*
			 * Must have this definition in here for the proper
			 * alignment of array_cache. Also simplifies accessing
			 * the entries.
			 */
};
196  
/*
 * An array_cache paired with its own spinlock. The alien-cache code takes
 * @lock before touching @ac (see __cache_free_alien() and
 * drain_alien_cache()). @ac must stay the last member because it ends in a
 * flexible array.
 */
struct alien_cache {
	spinlock_t lock;
	struct array_cache ac;
};
201  
/*
 * Need this for bootstrapping a per node allocator.
 *
 * init_kmem_cache_node[] is a static, __initdata array of
 * 2 * MAX_NUMNODES kmem_cache_node structures.
 * NOTE(review): CACHE_CACHE and SIZE_NODE look like base offsets of the two
 * MAX_NUMNODES-sized halves of that array; confirm against their users.
 */
#define NUM_INIT_LISTS (2 * MAX_NUMNODES)
static struct kmem_cache_node __initdata init_kmem_cache_node[NUM_INIT_LISTS];
#define	CACHE_CACHE 0
#define	SIZE_NODE (MAX_NUMNODES)

/* Forward declarations of helpers that are defined later in this file. */
static int drain_freelist(struct kmem_cache *cache,
			struct kmem_cache_node *n, int tofree);
static void free_block(struct kmem_cache *cachep, void **objpp, int len,
			int node, struct list_head *list);
static void slabs_destroy(struct kmem_cache *cachep, struct list_head *list);
static int enable_cpucache(struct kmem_cache *cachep, gfp_t gfp);
static void cache_reap(struct work_struct *unused);

static inline void fixup_objfreelist_debug(struct kmem_cache *cachep,
						void **list);
static inline void fixup_slab_list(struct kmem_cache *cachep,
				struct kmem_cache_node *n, struct page *page,
				void **list);
/* Starts out as 1. NOTE(review): presumably cleared once early boot is over. */
static int slab_early_init = 1;

/* kmalloc size-class index that fits a struct kmem_cache_node. */
#define INDEX_NODE kmalloc_index(sizeof(struct kmem_cache_node))
226  
227  static void kmem_cache_node_init(struct kmem_cache_node *parent)
228  {
229  	INIT_LIST_HEAD(&parent->slabs_full);
230  	INIT_LIST_HEAD(&parent->slabs_partial);
231  	INIT_LIST_HEAD(&parent->slabs_free);
232  	parent->total_slabs = 0;
233  	parent->free_slabs = 0;
234  	parent->shared = NULL;
235  	parent->alien = NULL;
236  	parent->colour_next = 0;
237  	spin_lock_init(&parent->list_lock);
238  	parent->free_objects = 0;
239  	parent->free_touched = 0;
240  }
241  
/*
 * MAKE_LIST - initialise @head as an empty list and splice onto it the list
 * called @member that lives in the kmem_cache_node of @cache for node @nid.
 * @head is expanded twice, so it must not have side effects.
 */
#define MAKE_LIST(cache, head, member, nid)				\
	do {								\
		INIT_LIST_HEAD(head);					\
		list_splice(&get_node(cache, nid)->member, head);	\
	} while (0)

/*
 * MAKE_ALL_LISTS - apply MAKE_LIST to the full, partial and free slab lists
 * of @dst, taking the entries from the node @nid lists of @cache.
 */
#define	MAKE_ALL_LISTS(cache, dst, nid)					\
	do {								\
		MAKE_LIST((cache), (&(dst)->slabs_full),		\
			  slabs_full, nid);				\
		MAKE_LIST((cache), (&(dst)->slabs_partial),		\
			  slabs_partial, nid);				\
		MAKE_LIST((cache), (&(dst)->slabs_free),		\
			  slabs_free, nid);				\
	} while (0)
254  
/*
 * Internal cache flags kept in the two top bits of kmem_cache->flags.
 * cache_estimate() treats both the same way: when either is set, no
 * per-object freelist index is budgeted inside the slab.
 */
#define CFLGS_OBJFREELIST_SLAB	((slab_flags_t __force)0x40000000U)
#define CFLGS_OFF_SLAB		((slab_flags_t __force)0x80000000U)
/* Test whether cache @x has the corresponding internal flag set. */
#define	OBJFREELIST_SLAB(x)	((x)->flags & CFLGS_OBJFREELIST_SLAB)
#define	OFF_SLAB(x)	((x)->flags & CFLGS_OFF_SLAB)

#define BATCHREFILL_LIMIT	16
/*
 * Optimization question: fewer reaps means less probability for unnecessary
 * cpucache drain/refill cycles.
 *
 * OTOH the cpuarrays can contain lots of objects,
 * which could lock up otherwise freeable slabs.
 */
/* Reap intervals in jiffies: 2 seconds and 4 seconds respectively. */
#define REAPTIMEOUT_AC		(2*HZ)
#define REAPTIMEOUT_NODE	(4*HZ)
270  
/*
 * Statistics hooks.
 *
 * With STATS enabled each macro updates the matching counter in the
 * kmem_cache @x; the hit/miss counters are atomics, the rest are plain
 * fields. With STATS disabled every macro expands to an empty statement,
 * except STATS_ADD_REAPED(), which still evaluates @y.
 *
 * Every macro argument is parenthesised in the expansion so that callers
 * may pass arbitrary expressions (e.g. "a | b") without precedence
 * surprises.
 */
#if STATS
#define	STATS_INC_ACTIVE(x)	((x)->num_active++)
#define	STATS_DEC_ACTIVE(x)	((x)->num_active--)
#define	STATS_INC_ALLOCED(x)	((x)->num_allocations++)
#define	STATS_INC_GROWN(x)	((x)->grown++)
#define	STATS_ADD_REAPED(x, y)	((x)->reaped += (y))
/* Raise the high-water mark when num_active exceeds it. */
#define	STATS_SET_HIGH(x)						\
	do {								\
		if ((x)->num_active > (x)->high_mark)			\
			(x)->high_mark = (x)->num_active;		\
	} while (0)
#define	STATS_INC_ERR(x)	((x)->errors++)
#define	STATS_INC_NODEALLOCS(x)	((x)->node_allocs++)
#define	STATS_INC_NODEFREES(x)	((x)->node_frees++)
#define STATS_INC_ACOVERFLOW(x)   ((x)->node_overflow++)
/* Record @i as the new maximum if it exceeds max_freeable. */
#define	STATS_SET_FREEABLE(x, i)					\
	do {								\
		if ((x)->max_freeable < (i))				\
			(x)->max_freeable = (i);			\
	} while (0)
#define STATS_INC_ALLOCHIT(x)	atomic_inc(&(x)->allochit)
#define STATS_INC_ALLOCMISS(x)	atomic_inc(&(x)->allocmiss)
#define STATS_INC_FREEHIT(x)	atomic_inc(&(x)->freehit)
#define STATS_INC_FREEMISS(x)	atomic_inc(&(x)->freemiss)
#else
#define	STATS_INC_ACTIVE(x)	do { } while (0)
#define	STATS_DEC_ACTIVE(x)	do { } while (0)
#define	STATS_INC_ALLOCED(x)	do { } while (0)
#define	STATS_INC_GROWN(x)	do { } while (0)
#define	STATS_ADD_REAPED(x, y)	do { (void)(y); } while (0)
#define	STATS_SET_HIGH(x)	do { } while (0)
#define	STATS_INC_ERR(x)	do { } while (0)
#define	STATS_INC_NODEALLOCS(x)	do { } while (0)
#define	STATS_INC_NODEFREES(x)	do { } while (0)
#define STATS_INC_ACOVERFLOW(x)   do { } while (0)
#define	STATS_SET_FREEABLE(x, i) do { } while (0)
#define STATS_INC_ALLOCHIT(x)	do { } while (0)
#define STATS_INC_ALLOCMISS(x)	do { } while (0)
#define STATS_INC_FREEHIT(x)	do { } while (0)
#define STATS_INC_FREEMISS(x)	do { } while (0)
#endif
312  
313  #if DEBUG
314  
/*
 * memory layout of objects:
 * 0		: objp
 * 0 .. cachep->obj_offset - BYTES_PER_WORD - 1: padding. This ensures that
 * 		the end of an object is aligned with the end of the real
 * 		allocation. Catches writes behind the end of the allocation.
 * cachep->obj_offset - BYTES_PER_WORD .. cachep->obj_offset - 1:
 * 		redzone word.
 * cachep->obj_offset: The real object.
 * cachep->size - 2* BYTES_PER_WORD: redzone word [BYTES_PER_WORD long]
 * cachep->size - 1* BYTES_PER_WORD: last caller address
 *					[BYTES_PER_WORD long]
 *
 * NOTE(review): the accessors dbg_redzone1()/dbg_redzone2() locate the
 * redzone words using sizeof(unsigned long long) and REDZONE_ALIGN rather
 * than BYTES_PER_WORD, so the offsets above are only exact where those
 * sizes coincide.
 */
/* Return the offset of the real object from the start of its debug buffer. */
static int obj_offset(struct kmem_cache *cachep)
{
	return cachep->obj_offset;
}
332  
333  static unsigned long long *dbg_redzone1(struct kmem_cache *cachep, void *objp)
334  {
335  	BUG_ON(!(cachep->flags & SLAB_RED_ZONE));
336  	return (unsigned long long *) (objp + obj_offset(cachep) -
337  				      sizeof(unsigned long long));
338  }
339  
340  static unsigned long long *dbg_redzone2(struct kmem_cache *cachep, void *objp)
341  {
342  	BUG_ON(!(cachep->flags & SLAB_RED_ZONE));
343  	if (cachep->flags & SLAB_STORE_USER)
344  		return (unsigned long long *)(objp + cachep->size -
345  					      sizeof(unsigned long long) -
346  					      REDZONE_ALIGN);
347  	return (unsigned long long *) (objp + cachep->size -
348  				       sizeof(unsigned long long));
349  }
350  
351  static void **dbg_userword(struct kmem_cache *cachep, void *objp)
352  {
353  	BUG_ON(!(cachep->flags & SLAB_STORE_USER));
354  	return (void **)(objp + cachep->size - BYTES_PER_WORD);
355  }
356  
357  #else
358  
/*
 * !DEBUG variants: objects carry no debug padding, so the offset is 0, and
 * the redzone/user-word accessors must never be reached; they BUG() and
 * evaluate to a NULL pointer of the right type so callers still compile.
 */
#define obj_offset(x)			0
#define dbg_redzone1(cachep, objp)	({BUG(); (unsigned long long *)NULL;})
#define dbg_redzone2(cachep, objp)	({BUG(); (unsigned long long *)NULL;})
#define dbg_userword(cachep, objp)	({BUG(); (void **)NULL;})
363  
364  #endif
365  
/*
 * Do not go above this order unless 0 objects fit into the slab or
 * overridden on the command line.
 *
 * slab_max_order starts at SLAB_MAX_ORDER_LO and can be changed with the
 * "slab_max_order=" boot parameter, which also sets slab_max_order_set.
 */
#define	SLAB_MAX_ORDER_HI	1
#define	SLAB_MAX_ORDER_LO	0
static int slab_max_order = SLAB_MAX_ORDER_LO;
/* True once "slab_max_order=" was seen on the command line. */
static bool slab_max_order_set __initdata;
374  
375  static inline void *index_to_obj(struct kmem_cache *cache, struct page *page,
376  				 unsigned int idx)
377  {
378  	return page->s_mem + cache->size * idx;
379  }
380  
/* Per-cpu array limit used by the statically defined boot cache below. */
#define BOOT_CPUCACHE_ENTRIES	1
/* internal cache of cache description objs */
/*
 * Statically initialised cache named "kmem_cache" whose objects are
 * struct kmem_cache themselves; batchcount, limit and shared all start
 * at 1.
 */
static struct kmem_cache kmem_cache_boot = {
	.batchcount = 1,
	.limit = BOOT_CPUCACHE_ENTRIES,
	.shared = 1,
	.size = sizeof(struct kmem_cache),
	.name = "kmem_cache",
};

/* One delayed work item per CPU; start_cpu_timer() arms it with cache_reap(). */
static DEFINE_PER_CPU(struct delayed_work, slab_reap_work);
392  
393  static inline struct array_cache *cpu_cache_get(struct kmem_cache *cachep)
394  {
395  	return this_cpu_ptr(cachep->cpu_cache);
396  }
397  
398  /*
399   * Calculate the number of objects and left-over bytes for a given buffer size.
400   */
401  static unsigned int cache_estimate(unsigned long gfporder, size_t buffer_size,
402  		slab_flags_t flags, size_t *left_over)
403  {
404  	unsigned int num;
405  	size_t slab_size = PAGE_SIZE << gfporder;
406  
407  	/*
408  	 * The slab management structure can be either off the slab or
409  	 * on it. For the latter case, the memory allocated for a
410  	 * slab is used for:
411  	 *
412  	 * - @buffer_size bytes for each object
413  	 * - One freelist_idx_t for each object
414  	 *
415  	 * We don't need to consider alignment of freelist because
416  	 * freelist will be at the end of slab page. The objects will be
417  	 * at the correct alignment.
418  	 *
419  	 * If the slab management structure is off the slab, then the
420  	 * alignment will already be calculated into the size. Because
421  	 * the slabs are all pages aligned, the objects will be at the
422  	 * correct alignment when allocated.
423  	 */
424  	if (flags & (CFLGS_OBJFREELIST_SLAB | CFLGS_OFF_SLAB)) {
425  		num = slab_size / buffer_size;
426  		*left_over = slab_size % buffer_size;
427  	} else {
428  		num = slab_size / (buffer_size + sizeof(freelist_idx_t));
429  		*left_over = slab_size %
430  			(buffer_size + sizeof(freelist_idx_t));
431  	}
432  
433  	return num;
434  }
435  
#if DEBUG
/* Report @msg for @cachep, tagging the report with the calling function. */
#define slab_error(cachep, msg) __slab_error(__func__, cachep, msg)

/*
 * Log a slab error naming @function and the cache, dump the current stack
 * and taint the kernel with TAINT_BAD_PAGE.
 */
static void __slab_error(const char *function, struct kmem_cache *cachep,
			char *msg)
{
	pr_err("slab error in %s(): cache `%s': %s\n",
	       function, cachep->name, msg);
	dump_stack();
	add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
}
#endif
448  
/*
 * By default on NUMA we use alien caches to stage the freeing of
 * objects allocated from other nodes. This causes massive memory
 * inefficiencies when using fake NUMA setup to split memory into a
 * large number of small nodes, so it can be disabled on the command
 * line with "noaliencache".
 */

static int use_alien_caches __read_mostly = 1;
/* Handler for the "noaliencache" boot parameter; @s is ignored. */
static int __init noaliencache_setup(char *s)
{
	use_alien_caches = 0;
	return 1;
}
__setup("noaliencache", noaliencache_setup);
464  
465  static int __init slab_max_order_setup(char *str)
466  {
467  	get_option(&str, &slab_max_order);
468  	slab_max_order = slab_max_order < 0 ? 0 :
469  				min(slab_max_order, MAX_ORDER - 1);
470  	slab_max_order_set = true;
471  
472  	return 1;
473  }
474  __setup("slab_max_order=", slab_max_order_setup);
475  
#ifdef CONFIG_NUMA
/*
 * Special reaping functions for NUMA systems called from cache_reap().
 * These take care of doing round robin flushing of alien caches (containing
 * objects freed on different nodes from which they were allocated) and the
 * flushing of remote pcps by calling drain_node_pages.
 *
 * slab_reap_node holds, per CPU, the node whose alien cache is reaped next.
 */
static DEFINE_PER_CPU(unsigned long, slab_reap_node);

/* Start @cpu's rotation at the online node following its memory node. */
static void init_reap_node(int cpu)
{
	int mem_node = cpu_to_mem(cpu);

	per_cpu(slab_reap_node, cpu) = next_node_in(mem_node, node_online_map);
}

/* Advance the executing CPU's reap node to the next online node. */
static void next_reap_node(void)
{
	int current_node = __this_cpu_read(slab_reap_node);
	int following = next_node_in(current_node, node_online_map);

	__this_cpu_write(slab_reap_node, following);
}

#else
#define init_reap_node(cpu) do { } while (0)
#define next_reap_node(void) do { } while (0)
#endif
503  
504  /*
505   * Initiate the reap timer running on the target CPU.  We run at around 1 to 2Hz
506   * via the workqueue/eventd.
507   * Add the CPU number into the expiration time to minimize the possibility of
508   * the CPUs getting into lockstep and contending for the global cache chain
509   * lock.
510   */
511  static void start_cpu_timer(int cpu)
512  {
513  	struct delayed_work *reap_work = &per_cpu(slab_reap_work, cpu);
514  
515  	if (reap_work->work.func == NULL) {
516  		init_reap_node(cpu);
517  		INIT_DEFERRABLE_WORK(reap_work, cache_reap);
518  		schedule_delayed_work_on(cpu, reap_work,
519  					__round_jiffies_relative(HZ, cpu));
520  	}
521  }
522  
523  static void init_arraycache(struct array_cache *ac, int limit, int batch)
524  {
525  	if (ac) {
526  		ac->avail = 0;
527  		ac->limit = limit;
528  		ac->batchcount = batch;
529  		ac->touched = 0;
530  	}
531  }
532  
533  static struct array_cache *alloc_arraycache(int node, int entries,
534  					    int batchcount, gfp_t gfp)
535  {
536  	size_t memsize = sizeof(void *) * entries + sizeof(struct array_cache);
537  	struct array_cache *ac = NULL;
538  
539  	ac = kmalloc_node(memsize, gfp, node);
540  	/*
541  	 * The array_cache structures contain pointers to free object.
542  	 * However, when such objects are allocated or transferred to another
543  	 * cache the pointers are not cleared and they could be counted as
544  	 * valid references during a kmemleak scan. Therefore, kmemleak must
545  	 * not scan such objects.
546  	 */
547  	kmemleak_no_scan(ac);
548  	init_arraycache(ac, entries, batchcount);
549  	return ac;
550  }
551  
552  static noinline void cache_free_pfmemalloc(struct kmem_cache *cachep,
553  					struct page *page, void *objp)
554  {
555  	struct kmem_cache_node *n;
556  	int page_node;
557  	LIST_HEAD(list);
558  
559  	page_node = page_to_nid(page);
560  	n = get_node(cachep, page_node);
561  
562  	spin_lock(&n->list_lock);
563  	free_block(cachep, &objp, 1, page_node, &list);
564  	spin_unlock(&n->list_lock);
565  
566  	slabs_destroy(cachep, &list);
567  }
568  
569  /*
570   * Transfer objects in one arraycache to another.
571   * Locking must be handled by the caller.
572   *
573   * Return the number of entries transferred.
574   */
575  static int transfer_objects(struct array_cache *to,
576  		struct array_cache *from, unsigned int max)
577  {
578  	/* Figure out how many entries to transfer */
579  	int nr = min3(from->avail, max, to->limit - to->avail);
580  
581  	if (!nr)
582  		return 0;
583  
584  	memcpy(to->entry + to->avail, from->entry + from->avail - nr,
585  			sizeof(void *) *nr);
586  
587  	from->avail -= nr;
588  	to->avail += nr;
589  	return nr;
590  }
591  
592  /* &alien->lock must be held by alien callers. */
593  static __always_inline void __free_one(struct array_cache *ac, void *objp)
594  {
595  	/* Avoid trivial double-free. */
596  	if (IS_ENABLED(CONFIG_SLAB_FREELIST_HARDENED) &&
597  	    WARN_ON_ONCE(ac->avail > 0 && ac->entry[ac->avail - 1] == objp))
598  		return;
599  	ac->entry[ac->avail++] = objp;
600  }
601  
602  #ifndef CONFIG_NUMA
603  
/*
 * !CONFIG_NUMA stubs: there are no alien caches and no node-specific
 * allocation on such builds, so each helper below is a no-op or returns
 * a "nothing done" value.
 */
#define drain_alien_cache(cachep, alien) do { } while (0)
#define reap_alien(cachep, n) do { } while (0)

/* No alien caches without NUMA: always returns NULL. */
static inline struct alien_cache **alloc_alien_cache(int node,
						int limit, gfp_t gfp)
{
	return NULL;
}

/* Nothing was allocated, so there is nothing to free. */
static inline void free_alien_cache(struct alien_cache **ac_ptr)
{
}

/* Always returns 0, i.e. the object was not consumed here. */
static inline int cache_free_alien(struct kmem_cache *cachep, void *objp)
{
	return 0;
}

/* No alternate node to try: always returns NULL. */
static inline void *alternate_node_alloc(struct kmem_cache *cachep,
		gfp_t flags)
{
	return NULL;
}

/* Node-specific allocation is unavailable: always returns NULL. */
static inline void *____cache_alloc_node(struct kmem_cache *cachep,
		 gfp_t flags, int nodeid)
{
	return NULL;
}

/* Only strips __GFP_NOFAIL from @flags; no node bits are added here. */
static inline gfp_t gfp_exact_node(gfp_t flags)
{
	return flags & ~__GFP_NOFAIL;
}
638  
639  #else	/* CONFIG_NUMA */
640  
641  static void *____cache_alloc_node(struct kmem_cache *, gfp_t, int);
642  static void *alternate_node_alloc(struct kmem_cache *, gfp_t);
643  
644  static struct alien_cache *__alloc_alien_cache(int node, int entries,
645  						int batch, gfp_t gfp)
646  {
647  	size_t memsize = sizeof(void *) * entries + sizeof(struct alien_cache);
648  	struct alien_cache *alc = NULL;
649  
650  	alc = kmalloc_node(memsize, gfp, node);
651  	if (alc) {
652  		kmemleak_no_scan(alc);
653  		init_arraycache(&alc->ac, entries, batch);
654  		spin_lock_init(&alc->lock);
655  	}
656  	return alc;
657  }
658  
659  static struct alien_cache **alloc_alien_cache(int node, int limit, gfp_t gfp)
660  {
661  	struct alien_cache **alc_ptr;
662  	int i;
663  
664  	if (limit > 1)
665  		limit = 12;
666  	alc_ptr = kcalloc_node(nr_node_ids, sizeof(void *), gfp, node);
667  	if (!alc_ptr)
668  		return NULL;
669  
670  	for_each_node(i) {
671  		if (i == node || !node_online(i))
672  			continue;
673  		alc_ptr[i] = __alloc_alien_cache(node, limit, 0xbaadf00d, gfp);
674  		if (!alc_ptr[i]) {
675  			for (i--; i >= 0; i--)
676  				kfree(alc_ptr[i]);
677  			kfree(alc_ptr);
678  			return NULL;
679  		}
680  	}
681  	return alc_ptr;
682  }
683  
684  static void free_alien_cache(struct alien_cache **alc_ptr)
685  {
686  	int i;
687  
688  	if (!alc_ptr)
689  		return;
690  	for_each_node(i)
691  	    kfree(alc_ptr[i]);
692  	kfree(alc_ptr);
693  }
694  
695  static void __drain_alien_cache(struct kmem_cache *cachep,
696  				struct array_cache *ac, int node,
697  				struct list_head *list)
698  {
699  	struct kmem_cache_node *n = get_node(cachep, node);
700  
701  	if (ac->avail) {
702  		spin_lock(&n->list_lock);
703  		/*
704  		 * Stuff objects into the remote nodes shared array first.
705  		 * That way we could avoid the overhead of putting the objects
706  		 * into the free lists and getting them back later.
707  		 */
708  		if (n->shared)
709  			transfer_objects(n->shared, ac, ac->limit);
710  
711  		free_block(cachep, ac->entry, ac->avail, node, list);
712  		ac->avail = 0;
713  		spin_unlock(&n->list_lock);
714  	}
715  }
716  
717  /*
718   * Called from cache_reap() to regularly drain alien caches round robin.
719   */
720  static void reap_alien(struct kmem_cache *cachep, struct kmem_cache_node *n)
721  {
722  	int node = __this_cpu_read(slab_reap_node);
723  
724  	if (n->alien) {
725  		struct alien_cache *alc = n->alien[node];
726  		struct array_cache *ac;
727  
728  		if (alc) {
729  			ac = &alc->ac;
730  			if (ac->avail && spin_trylock_irq(&alc->lock)) {
731  				LIST_HEAD(list);
732  
733  				__drain_alien_cache(cachep, ac, node, &list);
734  				spin_unlock_irq(&alc->lock);
735  				slabs_destroy(cachep, &list);
736  			}
737  		}
738  	}
739  }
740  
741  static void drain_alien_cache(struct kmem_cache *cachep,
742  				struct alien_cache **alien)
743  {
744  	int i = 0;
745  	struct alien_cache *alc;
746  	struct array_cache *ac;
747  	unsigned long flags;
748  
749  	for_each_online_node(i) {
750  		alc = alien[i];
751  		if (alc) {
752  			LIST_HEAD(list);
753  
754  			ac = &alc->ac;
755  			spin_lock_irqsave(&alc->lock, flags);
756  			__drain_alien_cache(cachep, ac, i, &list);
757  			spin_unlock_irqrestore(&alc->lock, flags);
758  			slabs_destroy(cachep, &list);
759  		}
760  	}
761  }
762  
763  static int __cache_free_alien(struct kmem_cache *cachep, void *objp,
764  				int node, int page_node)
765  {
766  	struct kmem_cache_node *n;
767  	struct alien_cache *alien = NULL;
768  	struct array_cache *ac;
769  	LIST_HEAD(list);
770  
771  	n = get_node(cachep, node);
772  	STATS_INC_NODEFREES(cachep);
773  	if (n->alien && n->alien[page_node]) {
774  		alien = n->alien[page_node];
775  		ac = &alien->ac;
776  		spin_lock(&alien->lock);
777  		if (unlikely(ac->avail == ac->limit)) {
778  			STATS_INC_ACOVERFLOW(cachep);
779  			__drain_alien_cache(cachep, ac, page_node, &list);
780  		}
781  		__free_one(ac, objp);
782  		spin_unlock(&alien->lock);
783  		slabs_destroy(cachep, &list);
784  	} else {
785  		n = get_node(cachep, page_node);
786  		spin_lock(&n->list_lock);
787  		free_block(cachep, &objp, 1, page_node, &list);
788  		spin_unlock(&n->list_lock);
789  		slabs_destroy(cachep, &list);
790  	}
791  	return 1;
792  }
793  
/*
 * Divert the free of @objp if it belongs to a node other than the one we
 * are running on, so that a remote object never lands in this CPU's array
 * cache.
 *
 * Returns 0 when the object is local and was left untouched, otherwise the
 * result of __cache_free_alien().
 */
static inline int cache_free_alien(struct kmem_cache *cachep, void *objp)
{
	int page_node = page_to_nid(virt_to_page(objp));
	int node = numa_mem_id();

	if (unlikely(node != page_node))
		return __cache_free_alien(cachep, objp, node, page_node);

	return 0;
}
807  
808  /*
809   * Construct gfp mask to allocate from a specific node but do not reclaim or
810   * warn about failures.
811   */
812  static inline gfp_t gfp_exact_node(gfp_t flags)
813  {
814  	return (flags | __GFP_THISNODE | __GFP_NOWARN) & ~(__GFP_RECLAIM|__GFP_NOFAIL);
815  }
816  #endif
817  
818  static int init_cache_node(struct kmem_cache *cachep, int node, gfp_t gfp)
819  {
820  	struct kmem_cache_node *n;
821  
822  	/*
823  	 * Set up the kmem_cache_node for cpu before we can
824  	 * begin anything. Make sure some other cpu on this
825  	 * node has not already allocated this
826  	 */
827  	n = get_node(cachep, node);
828  	if (n) {
829  		spin_lock_irq(&n->list_lock);
830  		n->free_limit = (1 + nr_cpus_node(node)) * cachep->batchcount +
831  				cachep->num;
832  		spin_unlock_irq(&n->list_lock);
833  
834  		return 0;
835  	}
836  
837  	n = kmalloc_node(sizeof(struct kmem_cache_node), gfp, node);
838  	if (!n)
839  		return -ENOMEM;
840  
841  	kmem_cache_node_init(n);
842  	n->next_reap = jiffies + REAPTIMEOUT_NODE +
843  		    ((unsigned long)cachep) % REAPTIMEOUT_NODE;
844  
845  	n->free_limit =
846  		(1 + nr_cpus_node(node)) * cachep->batchcount + cachep->num;
847  
848  	/*
849  	 * The kmem_cache_nodes don't come and go as CPUs
850  	 * come and go.  slab_mutex is sufficient
851  	 * protection here.
852  	 */
853  	cachep->node[node] = n;
854  
855  	return 0;
856  }
857  
#if (defined(CONFIG_NUMA) && defined(CONFIG_MEMORY_HOTPLUG)) || defined(CONFIG_SMP)
/*
 * Ensure every slab cache has a kmem_cache_node for @node; used for both
 * memory and cpu hotplug. When memory is being hot-added the
 * kmem_cache_node is allocated off-node, because the new node's memory is
 * not online yet. Nodes that already exist are kept, not replaced.
 *
 * Must hold slab_mutex.
 *
 * Returns 0 on success, or the first error from init_cache_node().
 */
static int init_cache_node_node(int node)
{
	struct kmem_cache *cachep;

	list_for_each_entry(cachep, &slab_caches, list) {
		int ret = init_cache_node(cachep, node, GFP_KERNEL);

		if (ret)
			return ret;
	}

	return 0;
}
#endif
882  
/*
 * Set up the per-node state of @cachep for @node: the kmem_cache_node
 * itself plus its shared array cache and alien cache table.
 *
 * New shared/alien caches are allocated up front, before any lock is
 * taken. An existing shared array is only replaced when @force_change is
 * set (its objects are returned to the slabs first); an existing alien
 * table is never replaced. Anything that was allocated but not installed
 * is freed again before returning.
 *
 * Returns 0 on success, -ENOMEM if an allocation failed, or the error
 * returned by init_cache_node().
 */
static int setup_kmem_cache_node(struct kmem_cache *cachep,
				int node, gfp_t gfp, bool force_change)
{
	int ret = -ENOMEM;
	struct kmem_cache_node *n;
	struct array_cache *old_shared = NULL;
	struct array_cache *new_shared = NULL;
	struct alien_cache **new_alien = NULL;
	LIST_HEAD(list);

	if (use_alien_caches) {
		new_alien = alloc_alien_cache(node, cachep->limit, gfp);
		if (!new_alien)
			goto fail;
	}

	/* A shared array is only wanted when cachep->shared is non-zero. */
	if (cachep->shared) {
		new_shared = alloc_arraycache(node,
			cachep->shared * cachep->batchcount, 0xbaadf00d, gfp);
		if (!new_shared)
			goto fail;
	}

	ret = init_cache_node(cachep, node, gfp);
	if (ret)
		goto fail;

	n = get_node(cachep, node);
	spin_lock_irq(&n->list_lock);
	/* Replacing the shared array: give its objects back to the slabs. */
	if (n->shared && force_change) {
		free_block(cachep, n->shared->entry,
				n->shared->avail, node, &list);
		n->shared->avail = 0;
	}

	/*
	 * Install the new shared array; new_shared is cleared so that the
	 * cleanup below does not free the array now in use.
	 */
	if (!n->shared || force_change) {
		old_shared = n->shared;
		n->shared = new_shared;
		new_shared = NULL;
	}

	/* The alien table is installed only if the node has none yet. */
	if (!n->alien) {
		n->alien = new_alien;
		new_alien = NULL;
	}

	spin_unlock_irq(&n->list_lock);
	slabs_destroy(cachep, &list);

	/*
	 * To protect lockless access to n->shared during irq disabled context.
	 * If n->shared isn't NULL in irq disabled context, accessing to it is
	 * guaranteed to be valid until irq is re-enabled, because it will be
	 * freed after synchronize_rcu().
	 */
	if (old_shared && force_change)
		synchronize_rcu();

	/*
	 * Common exit for success and failure: free the replaced shared
	 * array and whatever was allocated above but never installed.
	 */
fail:
	kfree(old_shared);
	kfree(new_shared);
	free_alien_cache(new_alien);

	return ret;
}
948  
949  #ifdef CONFIG_SMP
950  
951  static void cpuup_canceled(long cpu)
952  {
953  	struct kmem_cache *cachep;
954  	struct kmem_cache_node *n = NULL;
955  	int node = cpu_to_mem(cpu);
956  	const struct cpumask *mask = cpumask_of_node(node);
957  
958  	list_for_each_entry(cachep, &slab_caches, list) {
959  		struct array_cache *nc;
960  		struct array_cache *shared;
961  		struct alien_cache **alien;
962  		LIST_HEAD(list);
963  
964  		n = get_node(cachep, node);
965  		if (!n)
966  			continue;
967  
968  		spin_lock_irq(&n->list_lock);
969  
970  		/* Free limit for this kmem_cache_node */
971  		n->free_limit -= cachep->batchcount;
972  
973  		/* cpu is dead; no one can alloc from it. */
974  		nc = per_cpu_ptr(cachep->cpu_cache, cpu);
975  		free_block(cachep, nc->entry, nc->avail, node, &list);
976  		nc->avail = 0;
977  
978  		if (!cpumask_empty(mask)) {
979  			spin_unlock_irq(&n->list_lock);
980  			goto free_slab;
981  		}
982  
983  		shared = n->shared;
984  		if (shared) {
985  			free_block(cachep, shared->entry,
986  				   shared->avail, node, &list);
987  			n->shared = NULL;
988  		}
989  
990  		alien = n->alien;
991  		n->alien = NULL;
992  
993  		spin_unlock_irq(&n->list_lock);
994  
995  		kfree(shared);
996  		if (alien) {
997  			drain_alien_cache(cachep, alien);
998  			free_alien_cache(alien);
999  		}
1000  
1001  free_slab:
1002  		slabs_destroy(cachep, &list);
1003  	}
1004  	/*
1005  	 * In the previous loop, all the objects were freed to
1006  	 * the respective cache's slabs,  now we can go ahead and
1007  	 * shrink each nodelist to its limit.
1008  	 */
1009  	list_for_each_entry(cachep, &slab_caches, list) {
1010  		n = get_node(cachep, node);
1011  		if (!n)
1012  			continue;
1013  		drain_freelist(cachep, n, INT_MAX);
1014  	}
1015  }
1016  
1017  static int cpuup_prepare(long cpu)
1018  {
1019  	struct kmem_cache *cachep;
1020  	int node = cpu_to_mem(cpu);
1021  	int err;
1022  
1023  	/*
1024  	 * We need to do this right in the beginning since
1025  	 * alloc_arraycache's are going to use this list.
1026  	 * kmalloc_node allows us to add the slab to the right
1027  	 * kmem_cache_node and not this cpu's kmem_cache_node
1028  	 */
1029  	err = init_cache_node_node(node);
1030  	if (err < 0)
1031  		goto bad;
1032  
1033  	/*
1034  	 * Now we can go ahead with allocating the shared arrays and
1035  	 * array caches
1036  	 */
1037  	list_for_each_entry(cachep, &slab_caches, list) {
1038  		err = setup_kmem_cache_node(cachep, node, GFP_KERNEL, false);
1039  		if (err)
1040  			goto bad;
1041  	}
1042  
1043  	return 0;
1044  bad:
1045  	cpuup_canceled(cpu);
1046  	return -ENOMEM;
1047  }
1048  
1049  int slab_prepare_cpu(unsigned int cpu)
1050  {
1051  	int err;
1052  
1053  	mutex_lock(&slab_mutex);
1054  	err = cpuup_prepare(cpu);
1055  	mutex_unlock(&slab_mutex);
1056  	return err;
1057  }
1058  
1059  /*
1060   * This is called for a failed online attempt and for a successful
1061   * offline.
1062   *
1063   * Even if all the cpus of a node are down, we don't free the
1064   * kmem_cache_node of any cache. This to avoid a race between cpu_down, and
1065   * a kmalloc allocation from another cpu for memory from the node of
1066   * the cpu going down.  The kmem_cache_node structure is usually allocated from
1067   * kmem_cache_create() and gets destroyed at kmem_cache_destroy().
1068   */
1069  int slab_dead_cpu(unsigned int cpu)
1070  {
1071  	mutex_lock(&slab_mutex);
1072  	cpuup_canceled(cpu);
1073  	mutex_unlock(&slab_mutex);
1074  	return 0;
1075  }
1076  #endif
1077  
/* Online callback: start the per-cpu timer for @cpu. Always returns 0. */
static int slab_online_cpu(unsigned int cpu)
{
	start_cpu_timer(cpu);

	return 0;
}
1083  
1084  static int slab_offline_cpu(unsigned int cpu)
1085  {
1086  	/*
1087  	 * Shutdown cache reaper. Note that the slab_mutex is held so
1088  	 * that if cache_reap() is invoked it cannot do anything
1089  	 * expensive but will only modify reap_work and reschedule the
1090  	 * timer.
1091  	 */
1092  	cancel_delayed_work_sync(&per_cpu(slab_reap_work, cpu));
1093  	/* Now the cache_reaper is guaranteed to be not running. */
1094  	per_cpu(slab_reap_work, cpu).work.func = NULL;
1095  	return 0;
1096  }
1097  
1098  #if defined(CONFIG_NUMA) && defined(CONFIG_MEMORY_HOTPLUG)
1099  /*
1100   * Drains freelist for a node on each slab cache, used for memory hot-remove.
1101   * Returns -EBUSY if all objects cannot be drained so that the node is not
1102   * removed.
1103   *
1104   * Must hold slab_mutex.
1105   */
1106  static int __meminit drain_cache_node_node(int node)
1107  {
1108  	struct kmem_cache *cachep;
1109  	int ret = 0;
1110  
1111  	list_for_each_entry(cachep, &slab_caches, list) {
1112  		struct kmem_cache_node *n;
1113  
1114  		n = get_node(cachep, node);
1115  		if (!n)
1116  			continue;
1117  
1118  		drain_freelist(cachep, n, INT_MAX);
1119  
1120  		if (!list_empty(&n->slabs_full) ||
1121  		    !list_empty(&n->slabs_partial)) {
1122  			ret = -EBUSY;
1123  			break;
1124  		}
1125  	}
1126  	return ret;
1127  }
1128  
1129  static int __meminit slab_memory_callback(struct notifier_block *self,
1130  					unsigned long action, void *arg)
1131  {
1132  	struct memory_notify *mnb = arg;
1133  	int ret = 0;
1134  	int nid;
1135  
1136  	nid = mnb->status_change_nid;
1137  	if (nid < 0)
1138  		goto out;
1139  
1140  	switch (action) {
1141  	case MEM_GOING_ONLINE:
1142  		mutex_lock(&slab_mutex);
1143  		ret = init_cache_node_node(nid);
1144  		mutex_unlock(&slab_mutex);
1145  		break;
1146  	case MEM_GOING_OFFLINE:
1147  		mutex_lock(&slab_mutex);
1148  		ret = drain_cache_node_node(nid);
1149  		mutex_unlock(&slab_mutex);
1150  		break;
1151  	case MEM_ONLINE:
1152  	case MEM_OFFLINE:
1153  	case MEM_CANCEL_ONLINE:
1154  	case MEM_CANCEL_OFFLINE:
1155  		break;
1156  	}
1157  out:
1158  	return notifier_from_errno(ret);
1159  }
1160  #endif /* CONFIG_NUMA && CONFIG_MEMORY_HOTPLUG */
1161  
1162  /*
1163   * swap the static kmem_cache_node with kmalloced memory
1164   */
1165  static void __init init_list(struct kmem_cache *cachep, struct kmem_cache_node *list,
1166  				int nodeid)
1167  {
1168  	struct kmem_cache_node *ptr;
1169  
1170  	ptr = kmalloc_node(sizeof(struct kmem_cache_node), GFP_NOWAIT, nodeid);
1171  	BUG_ON(!ptr);
1172  
1173  	memcpy(ptr, list, sizeof(struct kmem_cache_node));
1174  	/*
1175  	 * Do not assume that spinlocks can be initialized via memcpy:
1176  	 */
1177  	spin_lock_init(&ptr->list_lock);
1178  
1179  	MAKE_ALL_LISTS(cachep, ptr, nodeid);
1180  	cachep->node[nodeid] = ptr;
1181  }
1182  
1183  /*
1184   * For setting up all the kmem_cache_node for cache whose buffer_size is same as
1185   * size of kmem_cache_node.
1186   */
1187  static void __init set_up_node(struct kmem_cache *cachep, int index)
1188  {
1189  	int node;
1190  
1191  	for_each_online_node(node) {
1192  		cachep->node[node] = &init_kmem_cache_node[index + node];
1193  		cachep->node[node]->next_reap = jiffies +
1194  		    REAPTIMEOUT_NODE +
1195  		    ((unsigned long)cachep) % REAPTIMEOUT_NODE;
1196  	}
1197  }
1198  
1199  /*
1200   * Initialisation.  Called after the page allocator have been initialised and
1201   * before smp_init().
1202   */
1203  void __init kmem_cache_init(void)
1204  {
1205  	int i;
1206  
1207  	kmem_cache = &kmem_cache_boot;
1208  
1209  	if (!IS_ENABLED(CONFIG_NUMA) || num_possible_nodes() == 1)
1210  		use_alien_caches = 0;
1211  
1212  	for (i = 0; i < NUM_INIT_LISTS; i++)
1213  		kmem_cache_node_init(&init_kmem_cache_node[i]);
1214  
1215  	/*
1216  	 * Fragmentation resistance on low memory - only use bigger
1217  	 * page orders on machines with more than 32MB of memory if
1218  	 * not overridden on the command line.
1219  	 */
1220  	if (!slab_max_order_set && totalram_pages() > (32 << 20) >> PAGE_SHIFT)
1221  		slab_max_order = SLAB_MAX_ORDER_HI;
1222  
1223  	/* Bootstrap is tricky, because several objects are allocated
1224  	 * from caches that do not exist yet:
1225  	 * 1) initialize the kmem_cache cache: it contains the struct
1226  	 *    kmem_cache structures of all caches, except kmem_cache itself:
1227  	 *    kmem_cache is statically allocated.
1228  	 *    Initially an __init data area is used for the head array and the
1229  	 *    kmem_cache_node structures, it's replaced with a kmalloc allocated
1230  	 *    array at the end of the bootstrap.
1231  	 * 2) Create the first kmalloc cache.
1232  	 *    The struct kmem_cache for the new cache is allocated normally.
1233  	 *    An __init data area is used for the head array.
1234  	 * 3) Create the remaining kmalloc caches, with minimally sized
1235  	 *    head arrays.
1236  	 * 4) Replace the __init data head arrays for kmem_cache and the first
1237  	 *    kmalloc cache with kmalloc allocated arrays.
1238  	 * 5) Replace the __init data for kmem_cache_node for kmem_cache and
1239  	 *    the other cache's with kmalloc allocated memory.
1240  	 * 6) Resize the head arrays of the kmalloc caches to their final sizes.
1241  	 */
1242  
1243  	/* 1) create the kmem_cache */
1244  
1245  	/*
1246  	 * struct kmem_cache size depends on nr_node_ids & nr_cpu_ids
1247  	 */
1248  	create_boot_cache(kmem_cache, "kmem_cache",
1249  		offsetof(struct kmem_cache, node) +
1250  				  nr_node_ids * sizeof(struct kmem_cache_node *),
1251  				  SLAB_HWCACHE_ALIGN, 0, 0);
1252  	list_add(&kmem_cache->list, &slab_caches);
1253  	slab_state = PARTIAL;
1254  
1255  	/*
1256  	 * Initialize the caches that provide memory for the  kmem_cache_node
1257  	 * structures first.  Without this, further allocations will bug.
1258  	 */
1259  	kmalloc_caches[KMALLOC_NORMAL][INDEX_NODE] = create_kmalloc_cache(
1260  				kmalloc_info[INDEX_NODE].name[KMALLOC_NORMAL],
1261  				kmalloc_info[INDEX_NODE].size,
1262  				ARCH_KMALLOC_FLAGS, 0,
1263  				kmalloc_info[INDEX_NODE].size);
1264  	slab_state = PARTIAL_NODE;
1265  	setup_kmalloc_cache_index_table();
1266  
1267  	slab_early_init = 0;
1268  
1269  	/* 5) Replace the bootstrap kmem_cache_node */
1270  	{
1271  		int nid;
1272  
1273  		for_each_online_node(nid) {
1274  			init_list(kmem_cache, &init_kmem_cache_node[CACHE_CACHE + nid], nid);
1275  
1276  			init_list(kmalloc_caches[KMALLOC_NORMAL][INDEX_NODE],
1277  					  &init_kmem_cache_node[SIZE_NODE + nid], nid);
1278  		}
1279  	}
1280  
1281  	create_kmalloc_caches(ARCH_KMALLOC_FLAGS);
1282  }
1283  
1284  void __init kmem_cache_init_late(void)
1285  {
1286  	struct kmem_cache *cachep;
1287  
1288  	/* 6) resize the head arrays to their final sizes */
1289  	mutex_lock(&slab_mutex);
1290  	list_for_each_entry(cachep, &slab_caches, list)
1291  		if (enable_cpucache(cachep, GFP_NOWAIT))
1292  			BUG();
1293  	mutex_unlock(&slab_mutex);
1294  
1295  	/* Done! */
1296  	slab_state = FULL;
1297  
1298  #ifdef CONFIG_NUMA
1299  	/*
1300  	 * Register a memory hotplug callback that initializes and frees
1301  	 * node.
1302  	 */
1303  	hotplug_memory_notifier(slab_memory_callback, SLAB_CALLBACK_PRI);
1304  #endif
1305  
1306  	/*
1307  	 * The reap timers are started later, with a module init call: That part
1308  	 * of the kernel is not yet operational.
1309  	 */
1310  }
1311  
1312  static int __init cpucache_init(void)
1313  {
1314  	int ret;
1315  
1316  	/*
1317  	 * Register the timers that return unneeded pages to the page allocator
1318  	 */
1319  	ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "SLAB online",
1320  				slab_online_cpu, slab_offline_cpu);
1321  	WARN_ON(ret < 0);
1322  
1323  	return 0;
1324  }
1325  __initcall(cpucache_init);
1326  
1327  static noinline void
1328  slab_out_of_memory(struct kmem_cache *cachep, gfp_t gfpflags, int nodeid)
1329  {
1330  #if DEBUG
1331  	struct kmem_cache_node *n;
1332  	unsigned long flags;
1333  	int node;
1334  	static DEFINE_RATELIMIT_STATE(slab_oom_rs, DEFAULT_RATELIMIT_INTERVAL,
1335  				      DEFAULT_RATELIMIT_BURST);
1336  
1337  	if ((gfpflags & __GFP_NOWARN) || !__ratelimit(&slab_oom_rs))
1338  		return;
1339  
1340  	pr_warn("SLAB: Unable to allocate memory on node %d, gfp=%#x(%pGg)\n",
1341  		nodeid, gfpflags, &gfpflags);
1342  	pr_warn("  cache: %s, object size: %d, order: %d\n",
1343  		cachep->name, cachep->size, cachep->gfporder);
1344  
1345  	for_each_kmem_cache_node(cachep, node, n) {
1346  		unsigned long total_slabs, free_slabs, free_objs;
1347  
1348  		spin_lock_irqsave(&n->list_lock, flags);
1349  		total_slabs = n->total_slabs;
1350  		free_slabs = n->free_slabs;
1351  		free_objs = n->free_objects;
1352  		spin_unlock_irqrestore(&n->list_lock, flags);
1353  
1354  		pr_warn("  node %d: slabs: %ld/%ld, objs: %ld/%ld\n",
1355  			node, total_slabs - free_slabs, total_slabs,
1356  			(total_slabs * cachep->num) - free_objs,
1357  			total_slabs * cachep->num);
1358  	}
1359  #endif
1360  }
1361  
1362  /*
1363   * Interface to system's page allocator. No need to hold the
1364   * kmem_cache_node ->list_lock.
1365   *
1366   * If we requested dmaable memory, we will get it. Even if we
1367   * did not request dmaable memory, we might get it, but that
1368   * would be relatively rare and ignorable.
1369   */
1370  static struct page *kmem_getpages(struct kmem_cache *cachep, gfp_t flags,
1371  								int nodeid)
1372  {
1373  	struct page *page;
1374  
1375  	flags |= cachep->allocflags;
1376  
1377  	page = __alloc_pages_node(nodeid, flags, cachep->gfporder);
1378  	if (!page) {
1379  		slab_out_of_memory(cachep, flags, nodeid);
1380  		return NULL;
1381  	}
1382  
1383  	account_slab_page(page, cachep->gfporder, cachep, flags);
1384  	__SetPageSlab(page);
1385  	/* Record if ALLOC_NO_WATERMARKS was set when allocating the slab */
1386  	if (sk_memalloc_socks() && page_is_pfmemalloc(page))
1387  		SetPageSlabPfmemalloc(page);
1388  
1389  	return page;
1390  }
1391  
1392  /*
1393   * Interface to system's page release.
1394   */
1395  static void kmem_freepages(struct kmem_cache *cachep, struct page *page)
1396  {
1397  	int order = cachep->gfporder;
1398  
1399  	BUG_ON(!PageSlab(page));
1400  	__ClearPageSlabPfmemalloc(page);
1401  	__ClearPageSlab(page);
1402  	page_mapcount_reset(page);
1403  	/* In union with page->mapping where page allocator expects NULL */
1404  	page->slab_cache = NULL;
1405  
1406  	if (current->reclaim_state)
1407  		current->reclaim_state->reclaimed_slab += 1 << order;
1408  	unaccount_slab_page(page, order, cachep);
1409  	__free_pages(page, order);
1410  }
1411  
1412  static void kmem_rcu_free(struct rcu_head *head)
1413  {
1414  	struct kmem_cache *cachep;
1415  	struct page *page;
1416  
1417  	page = container_of(head, struct page, rcu_head);
1418  	cachep = page->slab_cache;
1419  
1420  	kmem_freepages(cachep, page);
1421  }
1422  
1423  #if DEBUG
1424  static bool is_debug_pagealloc_cache(struct kmem_cache *cachep)
1425  {
1426  	if (debug_pagealloc_enabled_static() && OFF_SLAB(cachep) &&
1427  		(cachep->size % PAGE_SIZE) == 0)
1428  		return true;
1429  
1430  	return false;
1431  }
1432  
#ifdef CONFIG_DEBUG_PAGEALLOC
/*
 * For debug-pagealloc caches, pass the pages backing @objp to
 * __kernel_map_pages() with @map; a no-op for every other cache.
 */
static void slab_kernel_map(struct kmem_cache *cachep, void *objp, int map)
{
	if (is_debug_pagealloc_cache(cachep))
		__kernel_map_pages(virt_to_page(objp),
				   cachep->size / PAGE_SIZE, map);
}

#else
/* Without CONFIG_DEBUG_PAGEALLOC there is nothing to do. */
static inline void slab_kernel_map(struct kmem_cache *cachep, void *objp,
				   int map)
{
}

#endif
1447  
1448  static void poison_obj(struct kmem_cache *cachep, void *addr, unsigned char val)
1449  {
1450  	int size = cachep->object_size;
1451  	addr = &((char *)addr)[obj_offset(cachep)];
1452  
1453  	memset(addr, val, size);
1454  	*(unsigned char *)(addr + size - 1) = POISON_END;
1455  }
1456  
1457  static void dump_line(char *data, int offset, int limit)
1458  {
1459  	int i;
1460  	unsigned char error = 0;
1461  	int bad_count = 0;
1462  
1463  	pr_err("%03x: ", offset);
1464  	for (i = 0; i < limit; i++) {
1465  		if (data[offset + i] != POISON_FREE) {
1466  			error = data[offset + i];
1467  			bad_count++;
1468  		}
1469  	}
1470  	print_hex_dump(KERN_CONT, "", 0, 16, 1,
1471  			&data[offset], limit, 1);
1472  
1473  	if (bad_count == 1) {
1474  		error ^= POISON_FREE;
1475  		if (!(error & (error - 1))) {
1476  			pr_err("Single bit error detected. Probably bad RAM.\n");
1477  #ifdef CONFIG_X86
1478  			pr_err("Run memtest86+ or a similar memory test tool.\n");
1479  #else
1480  			pr_err("Run a memory test tool.\n");
1481  #endif
1482  		}
1483  	}
1484  }
1485  #endif
1486  
1487  #if DEBUG
1488  
1489  static void print_objinfo(struct kmem_cache *cachep, void *objp, int lines)
1490  {
1491  	int i, size;
1492  	char *realobj;
1493  
1494  	if (cachep->flags & SLAB_RED_ZONE) {
1495  		pr_err("Redzone: 0x%llx/0x%llx\n",
1496  		       *dbg_redzone1(cachep, objp),
1497  		       *dbg_redzone2(cachep, objp));
1498  	}
1499  
1500  	if (cachep->flags & SLAB_STORE_USER)
1501  		pr_err("Last user: (%pSR)\n", *dbg_userword(cachep, objp));
1502  	realobj = (char *)objp + obj_offset(cachep);
1503  	size = cachep->object_size;
1504  	for (i = 0; i < size && lines; i += 16, lines--) {
1505  		int limit;
1506  		limit = 16;
1507  		if (i + limit > size)
1508  			limit = size - i;
1509  		dump_line(realobj, i, limit);
1510  	}
1511  }
1512  
1513  static void check_poison_obj(struct kmem_cache *cachep, void *objp)
1514  {
1515  	char *realobj;
1516  	int size, i;
1517  	int lines = 0;
1518  
1519  	if (is_debug_pagealloc_cache(cachep))
1520  		return;
1521  
1522  	realobj = (char *)objp + obj_offset(cachep);
1523  	size = cachep->object_size;
1524  
1525  	for (i = 0; i < size; i++) {
1526  		char exp = POISON_FREE;
1527  		if (i == size - 1)
1528  			exp = POISON_END;
1529  		if (realobj[i] != exp) {
1530  			int limit;
1531  			/* Mismatch ! */
1532  			/* Print header */
1533  			if (lines == 0) {
1534  				pr_err("Slab corruption (%s): %s start=%px, len=%d\n",
1535  				       print_tainted(), cachep->name,
1536  				       realobj, size);
1537  				print_objinfo(cachep, objp, 0);
1538  			}
1539  			/* Hexdump the affected line */
1540  			i = (i / 16) * 16;
1541  			limit = 16;
1542  			if (i + limit > size)
1543  				limit = size - i;
1544  			dump_line(realobj, i, limit);
1545  			i += 16;
1546  			lines++;
1547  			/* Limit to 5 lines */
1548  			if (lines > 5)
1549  				break;
1550  		}
1551  	}
1552  	if (lines != 0) {
1553  		/* Print some data about the neighboring objects, if they
1554  		 * exist:
1555  		 */
1556  		struct page *page = virt_to_head_page(objp);
1557  		unsigned int objnr;
1558  
1559  		objnr = obj_to_index(cachep, page, objp);
1560  		if (objnr) {
1561  			objp = index_to_obj(cachep, page, objnr - 1);
1562  			realobj = (char *)objp + obj_offset(cachep);
1563  			pr_err("Prev obj: start=%px, len=%d\n", realobj, size);
1564  			print_objinfo(cachep, objp, 2);
1565  		}
1566  		if (objnr + 1 < cachep->num) {
1567  			objp = index_to_obj(cachep, page, objnr + 1);
1568  			realobj = (char *)objp + obj_offset(cachep);
1569  			pr_err("Next obj: start=%px, len=%d\n", realobj, size);
1570  			print_objinfo(cachep, objp, 2);
1571  		}
1572  	}
1573  }
1574  #endif
1575  
#if DEBUG
/*
 * Run the debug checks on every object of slab @page before it is destroyed:
 * the poison pattern for SLAB_POISON caches and both red zone words for
 * SLAB_RED_ZONE caches.
 */
static void slab_destroy_debugcheck(struct kmem_cache *cachep,
				    struct page *page)
{
	int idx;

	/*
	 * For a poisoned OBJFREELIST_SLAB cache, first re-poison the object
	 * located at page->freelist so that the check below does not trip
	 * over it (presumably it held the freelist itself).
	 */
	if (OBJFREELIST_SLAB(cachep) && (cachep->flags & SLAB_POISON))
		poison_obj(cachep, page->freelist - obj_offset(cachep),
			   POISON_FREE);

	for (idx = 0; idx < cachep->num; idx++) {
		void *obj = index_to_obj(cachep, page, idx);

		if (cachep->flags & SLAB_POISON) {
			check_poison_obj(cachep, obj);
			slab_kernel_map(cachep, obj, 1);
		}

		if (!(cachep->flags & SLAB_RED_ZONE))
			continue;

		if (*dbg_redzone1(cachep, obj) != RED_INACTIVE)
			slab_error(cachep, "start of a freed object was overwritten");
		if (*dbg_redzone2(cachep, obj) != RED_INACTIVE)
			slab_error(cachep, "end of a freed object was overwritten");
	}
}
#else
/* Non-DEBUG builds perform no checks. */
static void slab_destroy_debugcheck(struct kmem_cache *cachep,
				    struct page *page)
{
}
#endif
1608  
1609  /**
1610   * slab_destroy - destroy and release all objects in a slab
1611   * @cachep: cache pointer being destroyed
1612   * @page: page pointer being destroyed
1613   *
1614   * Destroy all the objs in a slab page, and release the mem back to the system.
1615   * Before calling the slab page must have been unlinked from the cache. The
1616   * kmem_cache_node ->list_lock is not held/needed.
1617   */
1618  static void slab_destroy(struct kmem_cache *cachep, struct page *page)
1619  {
1620  	void *freelist;
1621  
1622  	freelist = page->freelist;
1623  	slab_destroy_debugcheck(cachep, page);
1624  	if (unlikely(cachep->flags & SLAB_TYPESAFE_BY_RCU))
1625  		call_rcu(&page->rcu_head, kmem_rcu_free);
1626  	else
1627  		kmem_freepages(cachep, page);
1628  
1629  	/*
1630  	 * From now on, we don't use freelist
1631  	 * although actual page can be freed in rcu context
1632  	 */
1633  	if (OFF_SLAB(cachep))
1634  		kmem_cache_free(cachep->freelist_cache, freelist);
1635  }
1636  
1637  /*
1638   * Update the size of the caches before calling slabs_destroy as it may
1639   * recursively call kfree.
1640   */
1641  static void slabs_destroy(struct kmem_cache *cachep, struct list_head *list)
1642  {
1643  	struct page *page, *n;
1644  
1645  	list_for_each_entry_safe(page, n, list, slab_list) {
1646  		list_del(&page->slab_list);
1647  		slab_destroy(cachep, page);
1648  	}
1649  }
1650  
1651  /**
1652   * calculate_slab_order - calculate size (page order) of slabs
1653   * @cachep: pointer to the cache that is being created
1654   * @size: size of objects to be created in this cache.
1655   * @flags: slab allocation flags
1656   *
1657   * Also calculates the number of objects per slab.
1658   *
1659   * This could be made much more intelligent.  For now, try to avoid using
1660   * high order pages for slabs.  When the gfp() functions are more friendly
1661   * towards high-order requests, this should be changed.
1662   *
1663   * Return: number of left-over bytes in a slab
1664   */
1665  static size_t calculate_slab_order(struct kmem_cache *cachep,
1666  				size_t size, slab_flags_t flags)
1667  {
1668  	size_t left_over = 0;
1669  	int gfporder;
1670  
1671  	for (gfporder = 0; gfporder <= KMALLOC_MAX_ORDER; gfporder++) {
1672  		unsigned int num;
1673  		size_t remainder;
1674  
1675  		num = cache_estimate(gfporder, size, flags, &remainder);
1676  		if (!num)
1677  			continue;
1678  
1679  		/* Can't handle number of objects more than SLAB_OBJ_MAX_NUM */
1680  		if (num > SLAB_OBJ_MAX_NUM)
1681  			break;
1682  
1683  		if (flags & CFLGS_OFF_SLAB) {
1684  			struct kmem_cache *freelist_cache;
1685  			size_t freelist_size;
1686  
1687  			freelist_size = num * sizeof(freelist_idx_t);
1688  			freelist_cache = kmalloc_slab(freelist_size, 0u);
1689  			if (!freelist_cache)
1690  				continue;
1691  
1692  			/*
1693  			 * Needed to avoid possible looping condition
1694  			 * in cache_grow_begin()
1695  			 */
1696  			if (OFF_SLAB(freelist_cache))
1697  				continue;
1698  
1699  			/* check if off slab has enough benefit */
1700  			if (freelist_cache->size > cachep->size / 2)
1701  				continue;
1702  		}
1703  
1704  		/* Found something acceptable - save it away */
1705  		cachep->num = num;
1706  		cachep->gfporder = gfporder;
1707  		left_over = remainder;
1708  
1709  		/*
1710  		 * A VFS-reclaimable slab tends to have most allocations
1711  		 * as GFP_NOFS and we really don't want to have to be allocating
1712  		 * higher-order pages when we are unable to shrink dcache.
1713  		 */
1714  		if (flags & SLAB_RECLAIM_ACCOUNT)
1715  			break;
1716  
1717  		/*
1718  		 * Large number of objects is good, but very large slabs are
1719  		 * currently bad for the gfp()s.
1720  		 */
1721  		if (gfporder >= slab_max_order)
1722  			break;
1723  
1724  		/*
1725  		 * Acceptable internal fragmentation?
1726  		 */
1727  		if (left_over * 8 <= (PAGE_SIZE << gfporder))
1728  			break;
1729  	}
1730  	return left_over;
1731  }
1732  
1733  static struct array_cache __percpu *alloc_kmem_cache_cpus(
1734  		struct kmem_cache *cachep, int entries, int batchcount)
1735  {
1736  	int cpu;
1737  	size_t size;
1738  	struct array_cache __percpu *cpu_cache;
1739  
1740  	size = sizeof(void *) * entries + sizeof(struct array_cache);
1741  	cpu_cache = __alloc_percpu(size, sizeof(void *));
1742  
1743  	if (!cpu_cache)
1744  		return NULL;
1745  
1746  	for_each_possible_cpu(cpu) {
1747  		init_arraycache(per_cpu_ptr(cpu_cache, cpu),
1748  				entries, batchcount);
1749  	}
1750  
1751  	return cpu_cache;
1752  }
1753  
1754  static int __ref setup_cpu_cache(struct kmem_cache *cachep, gfp_t gfp)
1755  {
1756  	if (slab_state >= FULL)
1757  		return enable_cpucache(cachep, gfp);
1758  
1759  	cachep->cpu_cache = alloc_kmem_cache_cpus(cachep, 1, 1);
1760  	if (!cachep->cpu_cache)
1761  		return 1;
1762  
1763  	if (slab_state == DOWN) {
1764  		/* Creation of first cache (kmem_cache). */
1765  		set_up_node(kmem_cache, CACHE_CACHE);
1766  	} else if (slab_state == PARTIAL) {
1767  		/* For kmem_cache_node */
1768  		set_up_node(cachep, SIZE_NODE);
1769  	} else {
1770  		int node;
1771  
1772  		for_each_online_node(node) {
1773  			cachep->node[node] = kmalloc_node(
1774  				sizeof(struct kmem_cache_node), gfp, node);
1775  			BUG_ON(!cachep->node[node]);
1776  			kmem_cache_node_init(cachep->node[node]);
1777  		}
1778  	}
1779  
1780  	cachep->node[numa_mem_id()]->next_reap =
1781  			jiffies + REAPTIMEOUT_NODE +
1782  			((unsigned long)cachep) % REAPTIMEOUT_NODE;
1783  
1784  	cpu_cache_get(cachep)->avail = 0;
1785  	cpu_cache_get(cachep)->limit = BOOT_CPUCACHE_ENTRIES;
1786  	cpu_cache_get(cachep)->batchcount = 1;
1787  	cpu_cache_get(cachep)->touched = 0;
1788  	cachep->batchcount = 1;
1789  	cachep->limit = BOOT_CPUCACHE_ENTRIES;
1790  	return 0;
1791  }
1792  
1793  slab_flags_t kmem_cache_flags(unsigned int object_size,
1794  	slab_flags_t flags, const char *name)
1795  {
1796  	return flags;
1797  }
1798  
1799  struct kmem_cache *
1800  __kmem_cache_alias(const char *name, unsigned int size, unsigned int align,
1801  		   slab_flags_t flags, void (*ctor)(void *))
1802  {
1803  	struct kmem_cache *cachep;
1804  
1805  	cachep = find_mergeable(size, align, flags, name, ctor);
1806  	if (cachep) {
1807  		cachep->refcount++;
1808  
1809  		/*
1810  		 * Adjust the object sizes so that we clear
1811  		 * the complete object on kzalloc.
1812  		 */
1813  		cachep->object_size = max_t(int, cachep->object_size, size);
1814  	}
1815  	return cachep;
1816  }
1817  
1818  static bool set_objfreelist_slab_cache(struct kmem_cache *cachep,
1819  			size_t size, slab_flags_t flags)
1820  {
1821  	size_t left;
1822  
1823  	cachep->num = 0;
1824  
1825  	/*
1826  	 * If slab auto-initialization on free is enabled, store the freelist
1827  	 * off-slab, so that its contents don't end up in one of the allocated
1828  	 * objects.
1829  	 */
1830  	if (unlikely(slab_want_init_on_free(cachep)))
1831  		return false;
1832  
1833  	if (cachep->ctor || flags & SLAB_TYPESAFE_BY_RCU)
1834  		return false;
1835  
1836  	left = calculate_slab_order(cachep, size,
1837  			flags | CFLGS_OBJFREELIST_SLAB);
1838  	if (!cachep->num)
1839  		return false;
1840  
1841  	if (cachep->num * sizeof(freelist_idx_t) > cachep->object_size)
1842  		return false;
1843  
1844  	cachep->colour = left / cachep->colour_off;
1845  
1846  	return true;
1847  }
1848  
1849  static bool set_off_slab_cache(struct kmem_cache *cachep,
1850  			size_t size, slab_flags_t flags)
1851  {
1852  	size_t left;
1853  
1854  	cachep->num = 0;
1855  
1856  	/*
1857  	 * Always use on-slab management when SLAB_NOLEAKTRACE
1858  	 * to avoid recursive calls into kmemleak.
1859  	 */
1860  	if (flags & SLAB_NOLEAKTRACE)
1861  		return false;
1862  
1863  	/*
1864  	 * Size is large, assume best to place the slab management obj
1865  	 * off-slab (should allow better packing of objs).
1866  	 */
1867  	left = calculate_slab_order(cachep, size, flags | CFLGS_OFF_SLAB);
1868  	if (!cachep->num)
1869  		return false;
1870  
1871  	/*
1872  	 * If the slab has been placed off-slab, and we have enough space then
1873  	 * move it on-slab. This is at the expense of any extra colouring.
1874  	 */
1875  	if (left >= cachep->num * sizeof(freelist_idx_t))
1876  		return false;
1877  
1878  	cachep->colour = left / cachep->colour_off;
1879  
1880  	return true;
1881  }
1882  
1883  static bool set_on_slab_cache(struct kmem_cache *cachep,
1884  			size_t size, slab_flags_t flags)
1885  {
1886  	size_t left;
1887  
1888  	cachep->num = 0;
1889  
1890  	left = calculate_slab_order(cachep, size, flags);
1891  	if (!cachep->num)
1892  		return false;
1893  
1894  	cachep->colour = left / cachep->colour_off;
1895  
1896  	return true;
1897  }
1898  
1899  /**
1900   * __kmem_cache_create - Create a cache.
1901   * @cachep: cache management descriptor
1902   * @flags: SLAB flags
1903   *
 * Returns an int status code, not a cache pointer (see "Return:" below).
 * Cannot be called from interrupt context, but can be interrupted.
 * The @ctor is run when new pages are allocated by the cache.
 *
 * The flags are
 *
 * %SLAB_POISON - Poison the slab with a known test pattern (a5a5a5a5)
 * to catch references to uninitialised memory.
 *
 * %SLAB_RED_ZONE - Insert `Red' zones around the allocated memory to check
 * for buffer overruns.
 *
 * %SLAB_HWCACHE_ALIGN - Align the objects in this cache to a hardware
 * cacheline.  This can be beneficial if you're counting cycles as closely
 * as davem.
 *
 * Return: 0 on success, or a negative errno (for example -E2BIG when no
 * slab layout fits the object size) in case of error
1921   */
1922  int __kmem_cache_create(struct kmem_cache *cachep, slab_flags_t flags)
1923  {
1924  	size_t ralign = BYTES_PER_WORD;
1925  	gfp_t gfp;
1926  	int err;
1927  	unsigned int size = cachep->size;
1928  
1929  #if DEBUG
1930  #if FORCED_DEBUG
1931  	/*
1932  	 * Enable redzoning and last user accounting, except for caches with
1933  	 * large objects, if the increased size would increase the object size
1934  	 * above the next power of two: caches with object sizes just above a
1935  	 * power of two have a significant amount of internal fragmentation.
1936  	 */
1937  	if (size < 4096 || fls(size - 1) == fls(size-1 + REDZONE_ALIGN +
1938  						2 * sizeof(unsigned long long)))
1939  		flags |= SLAB_RED_ZONE | SLAB_STORE_USER;
1940  	if (!(flags & SLAB_TYPESAFE_BY_RCU))
1941  		flags |= SLAB_POISON;
1942  #endif
1943  #endif
1944  
1945  	/*
1946  	 * Check that size is in terms of words.  This is needed to avoid
1947  	 * unaligned accesses for some archs when redzoning is used, and makes
1948  	 * sure any on-slab bufctl's are also correctly aligned.
1949  	 */
1950  	size = ALIGN(size, BYTES_PER_WORD);
1951  
1952  	if (flags & SLAB_RED_ZONE) {
1953  		ralign = REDZONE_ALIGN;
1954  		/* If redzoning, ensure that the second redzone is suitably
1955  		 * aligned, by adjusting the object size accordingly. */
1956  		size = ALIGN(size, REDZONE_ALIGN);
1957  	}
1958  
1959  	/* 3) caller mandated alignment */
1960  	if (ralign < cachep->align) {
1961  		ralign = cachep->align;
1962  	}
1963  	/* disable debug if necessary */
1964  	if (ralign > __alignof__(unsigned long long))
1965  		flags &= ~(SLAB_RED_ZONE | SLAB_STORE_USER);
1966  	/*
1967  	 * 4) Store it.
1968  	 */
1969  	cachep->align = ralign;
1970  	cachep->colour_off = cache_line_size();
1971  	/* Offset must be a multiple of the alignment. */
1972  	if (cachep->colour_off < cachep->align)
1973  		cachep->colour_off = cachep->align;
1974  
1975  	if (slab_is_available())
1976  		gfp = GFP_KERNEL;
1977  	else
1978  		gfp = GFP_NOWAIT;
1979  
1980  #if DEBUG
1981  
1982  	/*
1983  	 * Both debugging options require word-alignment which is calculated
1984  	 * into align above.
1985  	 */
1986  	if (flags & SLAB_RED_ZONE) {
1987  		/* add space for red zone words */
1988  		cachep->obj_offset += sizeof(unsigned long long);
1989  		size += 2 * sizeof(unsigned long long);
1990  	}
1991  	if (flags & SLAB_STORE_USER) {
1992  		/* user store requires one word storage behind the end of
1993  		 * the real object. But if the second red zone needs to be
1994  		 * aligned to 64 bits, we must allow that much space.
1995  		 */
1996  		if (flags & SLAB_RED_ZONE)
1997  			size += REDZONE_ALIGN;
1998  		else
1999  			size += BYTES_PER_WORD;
2000  	}
2001  #endif
2002  
2003  	kasan_cache_create(cachep, &size, &flags);
2004  
2005  	size = ALIGN(size, cachep->align);
2006  	/*
2007  	 * We should restrict the number of objects in a slab to implement
2008  	 * byte sized index. Refer comment on SLAB_OBJ_MIN_SIZE definition.
2009  	 */
2010  	if (FREELIST_BYTE_INDEX && size < SLAB_OBJ_MIN_SIZE)
2011  		size = ALIGN(SLAB_OBJ_MIN_SIZE, cachep->align);
2012  
2013  #if DEBUG
2014  	/*
2015  	 * To activate debug pagealloc, off-slab management is necessary
2016  	 * requirement. In early phase of initialization, small sized slab
2017  	 * doesn't get initialized so it would not be possible. So, we need
2018  	 * to check size >= 256. It guarantees that all necessary small
2019  	 * sized slab is initialized in current slab initialization sequence.
2020  	 */
2021  	if (debug_pagealloc_enabled_static() && (flags & SLAB_POISON) &&
2022  		size >= 256 && cachep->object_size > cache_line_size()) {
2023  		if (size < PAGE_SIZE || size % PAGE_SIZE == 0) {
2024  			size_t tmp_size = ALIGN(size, PAGE_SIZE);
2025  
2026  			if (set_off_slab_cache(cachep, tmp_size, flags)) {
2027  				flags |= CFLGS_OFF_SLAB;
2028  				cachep->obj_offset += tmp_size - size;
2029  				size = tmp_size;
2030  				goto done;
2031  			}
2032  		}
2033  	}
2034  #endif
2035  
2036  	if (set_objfreelist_slab_cache(cachep, size, flags)) {
2037  		flags |= CFLGS_OBJFREELIST_SLAB;
2038  		goto done;
2039  	}
2040  
2041  	if (set_off_slab_cache(cachep, size, flags)) {
2042  		flags |= CFLGS_OFF_SLAB;
2043  		goto done;
2044  	}
2045  
2046  	if (set_on_slab_cache(cachep, size, flags))
2047  		goto done;
2048  
2049  	return -E2BIG;
2050  
2051  done:
2052  	cachep->freelist_size = cachep->num * sizeof(freelist_idx_t);
2053  	cachep->flags = flags;
2054  	cachep->allocflags = __GFP_COMP;
2055  	if (flags & SLAB_CACHE_DMA)
2056  		cachep->allocflags |= GFP_DMA;
2057  	if (flags & SLAB_CACHE_DMA32)
2058  		cachep->allocflags |= GFP_DMA32;
2059  	if (flags & SLAB_RECLAIM_ACCOUNT)
2060  		cachep->allocflags |= __GFP_RECLAIMABLE;
2061  	cachep->size = size;
2062  	cachep->reciprocal_buffer_size = reciprocal_value(size);
2063  
2064  #if DEBUG
2065  	/*
2066  	 * If we're going to use the generic kernel_map_pages()
2067  	 * poisoning, then it's going to smash the contents of
2068  	 * the redzone and userword anyhow, so switch them off.
2069  	 */
2070  	if (IS_ENABLED(CONFIG_PAGE_POISONING) &&
2071  		(cachep->flags & SLAB_POISON) &&
2072  		is_debug_pagealloc_cache(cachep))
2073  		cachep->flags &= ~(SLAB_RED_ZONE | SLAB_STORE_USER);
2074  #endif
2075  
2076  	if (OFF_SLAB(cachep)) {
2077  		cachep->freelist_cache =
2078  			kmalloc_slab(cachep->freelist_size, 0u);
2079  	}
2080  
2081  	err = setup_cpu_cache(cachep, gfp);
2082  	if (err) {
2083  		__kmem_cache_release(cachep);
2084  		return err;
2085  	}
2086  
2087  	return 0;
2088  }
2089  
2090  #if DEBUG
/* Debug assertion: BUG if irqs_disabled() reports that interrupts are on. */
static void check_irq_off(void)
{
	BUG_ON(!irqs_disabled());
}
2095  
/* Debug assertion: BUG if irqs_disabled() reports that interrupts are off. */
static void check_irq_on(void)
{
	BUG_ON(irqs_disabled());
}
2100  
/* Debug assertion: BUG unless slab_mutex is currently locked. */
static void check_mutex_acquired(void)
{
	BUG_ON(!mutex_is_locked(&slab_mutex));
}
2105  
/*
 * Debug assertion, active only with CONFIG_SMP: interrupts must be off and
 * the list_lock of @cachep's kmem_cache_node for numa_mem_id() must be held.
 * Without CONFIG_SMP the body is empty.
 */
static void check_spinlock_acquired(struct kmem_cache *cachep)
{
#ifdef CONFIG_SMP
	check_irq_off();
	assert_spin_locked(&get_node(cachep, numa_mem_id())->list_lock);
#endif
}
2113  
/*
 * Same assertion as check_spinlock_acquired(), but for the kmem_cache_node
 * of an explicitly given @node instead of the local one.
 * Without CONFIG_SMP the body is empty.
 */
static void check_spinlock_acquired_node(struct kmem_cache *cachep, int node)
{
#ifdef CONFIG_SMP
	check_irq_off();
	assert_spin_locked(&get_node(cachep, node)->list_lock);
#endif
}
2121  
2122  #else
/* Non-DEBUG builds: every check_*() helper expands to an empty statement. */
#define check_irq_off()	do { } while(0)
#define check_irq_on()	do { } while(0)
#define check_mutex_acquired()	do { } while(0)
#define check_spinlock_acquired(x) do { } while(0)
#define check_spinlock_acquired_node(x, y) do { } while(0)
2128  #endif
2129  
2130  static void drain_array_locked(struct kmem_cache *cachep, struct array_cache *ac,
2131  				int node, bool free_all, struct list_head *list)
2132  {
2133  	int tofree;
2134  
2135  	if (!ac || !ac->avail)
2136  		return;
2137  
2138  	tofree = free_all ? ac->avail : (ac->limit + 4) / 5;
2139  	if (tofree > ac->avail)
2140  		tofree = (ac->avail + 1) / 2;
2141  
2142  	free_block(cachep, ac->entry, tofree, node, list);
2143  	ac->avail -= tofree;
2144  	memmove(ac->entry, &(ac->entry[tofree]), sizeof(void *) * ac->avail);
2145  }
2146  
2147  static void do_drain(void *arg)
2148  {
2149  	struct kmem_cache *cachep = arg;
2150  	struct array_cache *ac;
2151  	int node = numa_mem_id();
2152  	struct kmem_cache_node *n;
2153  	LIST_HEAD(list);
2154  
2155  	check_irq_off();
2156  	ac = cpu_cache_get(cachep);
2157  	n = get_node(cachep, node);
2158  	spin_lock(&n->list_lock);
2159  	free_block(cachep, ac->entry, ac->avail, node, &list);
2160  	spin_unlock(&n->list_lock);
2161  	ac->avail = 0;
2162  	slabs_destroy(cachep, &list);
2163  }
2164  
2165  static void drain_cpu_caches(struct kmem_cache *cachep)
2166  {
2167  	struct kmem_cache_node *n;
2168  	int node;
2169  	LIST_HEAD(list);
2170  
2171  	on_each_cpu(do_drain, cachep, 1);
2172  	check_irq_on();
2173  	for_each_kmem_cache_node(cachep, node, n)
2174  		if (n->alien)
2175  			drain_alien_cache(cachep, n->alien);
2176  
2177  	for_each_kmem_cache_node(cachep, node, n) {
2178  		spin_lock_irq(&n->list_lock);
2179  		drain_array_locked(cachep, n->shared, node, true, &list);
2180  		spin_unlock_irq(&n->list_lock);
2181  
2182  		slabs_destroy(cachep, &list);
2183  	}
2184  }
2185  
2186  /*
2187   * Remove slabs from the list of free slabs.
2188   * Specify the number of slabs to drain in tofree.
2189   *
2190   * Returns the actual number of slabs released.
2191   */
2192  static int drain_freelist(struct kmem_cache *cache,
2193  			struct kmem_cache_node *n, int tofree)
2194  {
2195  	struct list_head *p;
2196  	int nr_freed;
2197  	struct page *page;
2198  
2199  	nr_freed = 0;
2200  	while (nr_freed < tofree && !list_empty(&n->slabs_free)) {
2201  
2202  		spin_lock_irq(&n->list_lock);
2203  		p = n->slabs_free.prev;
2204  		if (p == &n->slabs_free) {
2205  			spin_unlock_irq(&n->list_lock);
2206  			goto out;
2207  		}
2208  
2209  		page = list_entry(p, struct page, slab_list);
2210  		list_del(&page->slab_list);
2211  		n->free_slabs--;
2212  		n->total_slabs--;
2213  		/*
2214  		 * Safe to drop the lock. The slab is no longer linked
2215  		 * to the cache.
2216  		 */
2217  		n->free_objects -= cache->num;
2218  		spin_unlock_irq(&n->list_lock);
2219  		slab_destroy(cache, page);
2220  		nr_freed++;
2221  	}
2222  out:
2223  	return nr_freed;
2224  }
2225  
2226  bool __kmem_cache_empty(struct kmem_cache *s)
2227  {
2228  	int node;
2229  	struct kmem_cache_node *n;
2230  
2231  	for_each_kmem_cache_node(s, node, n)
2232  		if (!list_empty(&n->slabs_full) ||
2233  		    !list_empty(&n->slabs_partial))
2234  			return false;
2235  	return true;
2236  }
2237  
2238  int __kmem_cache_shrink(struct kmem_cache *cachep)
2239  {
2240  	int ret = 0;
2241  	int node;
2242  	struct kmem_cache_node *n;
2243  
2244  	drain_cpu_caches(cachep);
2245  
2246  	check_irq_on();
2247  	for_each_kmem_cache_node(cachep, node, n) {
2248  		drain_freelist(cachep, n, INT_MAX);
2249  
2250  		ret += !list_empty(&n->slabs_full) ||
2251  			!list_empty(&n->slabs_partial);
2252  	}
2253  	return (ret ? 1 : 0);
2254  }
2255  
/*
 * Shut down @cachep.  This simply forwards to __kmem_cache_shrink() and
 * returns its result (non-zero when slabs with live objects remain).
 */
int __kmem_cache_shutdown(struct kmem_cache *cachep)
{
	return __kmem_cache_shrink(cachep);
}
2260  
2261  void __kmem_cache_release(struct kmem_cache *cachep)
2262  {
2263  	int i;
2264  	struct kmem_cache_node *n;
2265  
2266  	cache_random_seq_destroy(cachep);
2267  
2268  	free_percpu(cachep->cpu_cache);
2269  
2270  	/* NUMA: free the node structures */
2271  	for_each_kmem_cache_node(cachep, i, n) {
2272  		kfree(n->shared);
2273  		free_alien_cache(n->alien);
2274  		kfree(n);
2275  		cachep->node[i] = NULL;
2276  	}
2277  }
2278  
2279  /*
2280   * Get the memory for a slab management obj.
2281   *
2282   * For a slab cache when the slab descriptor is off-slab, the
2283   * slab descriptor can't come from the same cache which is being created,
2284   * Because if it is the case, that means we defer the creation of
2285   * the kmalloc_{dma,}_cache of size sizeof(slab descriptor) to this point.
2286   * And we eventually call down to __kmem_cache_create(), which
2287   * in turn looks up in the kmalloc_{dma,}_caches for the disired-size one.
2288   * This is a "chicken-and-egg" problem.
2289   *
2290   * So the off-slab slab descriptor shall come from the kmalloc_{dma,}_caches,
2291   * which are all initialized during kmem_cache_init().
2292   */
2293  static void *alloc_slabmgmt(struct kmem_cache *cachep,
2294  				   struct page *page, int colour_off,
2295  				   gfp_t local_flags, int nodeid)
2296  {
2297  	void *freelist;
2298  	void *addr = page_address(page);
2299  
2300  	page->s_mem = addr + colour_off;
2301  	page->active = 0;
2302  
2303  	if (OBJFREELIST_SLAB(cachep))
2304  		freelist = NULL;
2305  	else if (OFF_SLAB(cachep)) {
2306  		/* Slab management obj is off-slab. */
2307  		freelist = kmem_cache_alloc_node(cachep->freelist_cache,
2308  					      local_flags, nodeid);
2309  	} else {
2310  		/* We will use last bytes at the slab for freelist */
2311  		freelist = addr + (PAGE_SIZE << cachep->gfporder) -
2312  				cachep->freelist_size;
2313  	}
2314  
2315  	return freelist;
2316  }
2317  
2318  static inline freelist_idx_t get_free_obj(struct page *page, unsigned int idx)
2319  {
2320  	return ((freelist_idx_t *)page->freelist)[idx];
2321  }
2322  
2323  static inline void set_free_obj(struct page *page,
2324  					unsigned int idx, freelist_idx_t val)
2325  {
2326  	((freelist_idx_t *)(page->freelist))[idx] = val;
2327  }
2328  
/*
 * DEBUG-build initialisation of every object in a new slab @page of
 * @cachep; compiles to an empty function when DEBUG is 0.
 *
 * Per object: clear the user word, arm both red zones as inactive, run the
 * constructor (only when the cache is not poisoned), verify the constructor
 * left the red zones intact, and finally poison and unmap poisoned objects.
 */
static void cache_init_objs_debug(struct kmem_cache *cachep, struct page *page)
{
#if DEBUG
	int idx;

	for (idx = 0; idx < cachep->num; idx++) {
		void *objp = index_to_obj(cachep, page, idx);

		if (cachep->flags & SLAB_STORE_USER)
			*dbg_userword(cachep, objp) = NULL;

		if (cachep->flags & SLAB_RED_ZONE) {
			*dbg_redzone1(cachep, objp) = RED_INACTIVE;
			*dbg_redzone2(cachep, objp) = RED_INACTIVE;
		}

		/*
		 * Constructors are not allowed to allocate memory from the same
		 * cache which they are a constructor for.  Otherwise, deadlock.
		 */
		if (cachep->ctor && !(cachep->flags & SLAB_POISON)) {
			void *payload = objp + obj_offset(cachep);

			kasan_unpoison_object_data(cachep, payload);
			cachep->ctor(payload);
			kasan_poison_object_data(cachep, payload);
		}

		if (cachep->flags & SLAB_RED_ZONE) {
			if (*dbg_redzone2(cachep, objp) != RED_INACTIVE)
				slab_error(cachep, "constructor overwrote the end of an object");
			if (*dbg_redzone1(cachep, objp) != RED_INACTIVE)
				slab_error(cachep, "constructor overwrote the start of an object");
		}

		/* Poisoned caches get their objects poisoned and unmapped. */
		if (cachep->flags & SLAB_POISON) {
			poison_obj(cachep, objp, POISON_FREE);
			slab_kernel_map(cachep, objp, 0);
		}
	}
#endif
}
2371  
2372  #ifdef CONFIG_SLAB_FREELIST_RANDOM
/*
 * Hold information during a freelist initialization.
 *
 * Only one of the two members is in use at a time: the anonymous struct
 * when the cache has a pre-computed random sequence, rnd_state otherwise
 * (see freelist_state_initialize()).
 */
union freelist_init_state {
	struct {
		unsigned int pos;	/* index of the next entry to hand out */
		unsigned int *list;	/* the cache's pre-computed sequence */
		unsigned int count;	/* entries in list; pos wraps to 0 here */
	};
	struct rnd_state rnd_state;	/* pseudo-random state for the dynamic shuffle */
};
2382  
2383  /*
2384   * Initialize the state based on the randomization methode available.
2385   * return true if the pre-computed list is available, false otherwize.
2386   */
2387  static bool freelist_state_initialize(union freelist_init_state *state,
2388  				struct kmem_cache *cachep,
2389  				unsigned int count)
2390  {
2391  	bool ret;
2392  	unsigned int rand;
2393  
2394  	/* Use best entropy available to define a random shift */
2395  	rand = get_random_int();
2396  
2397  	/* Use a random state if the pre-computed list is not available */
2398  	if (!cachep->random_seq) {
2399  		prandom_seed_state(&state->rnd_state, rand);
2400  		ret = false;
2401  	} else {
2402  		state->list = cachep->random_seq;
2403  		state->count = count;
2404  		state->pos = rand % count;
2405  		ret = true;
2406  	}
2407  	return ret;
2408  }
2409  
2410  /* Get the next entry on the list and randomize it using a random shift */
2411  static freelist_idx_t next_random_slot(union freelist_init_state *state)
2412  {
2413  	if (state->pos >= state->count)
2414  		state->pos = 0;
2415  	return state->list[state->pos++];
2416  }
2417  
2418  /* Swap two freelist entries */
2419  static void swap_free_obj(struct page *page, unsigned int a, unsigned int b)
2420  {
2421  	swap(((freelist_idx_t *)page->freelist)[a],
2422  		((freelist_idx_t *)page->freelist)[b]);
2423  }
2424  
2425  /*
2426   * Shuffle the freelist initialization state based on pre-computed lists.
2427   * return true if the list was successfully shuffled, false otherwise.
2428   */
2429  static bool shuffle_freelist(struct kmem_cache *cachep, struct page *page)
2430  {
2431  	unsigned int objfreelist = 0, i, rand, count = cachep->num;
2432  	union freelist_init_state state;
2433  	bool precomputed;
2434  
2435  	if (count < 2)
2436  		return false;
2437  
2438  	precomputed = freelist_state_initialize(&state, cachep, count);
2439  
2440  	/* Take a random entry as the objfreelist */
2441  	if (OBJFREELIST_SLAB(cachep)) {
2442  		if (!precomputed)
2443  			objfreelist = count - 1;
2444  		else
2445  			objfreelist = next_random_slot(&state);
2446  		page->freelist = index_to_obj(cachep, page, objfreelist) +
2447  						obj_offset(cachep);
2448  		count--;
2449  	}
2450  
2451  	/*
2452  	 * On early boot, generate the list dynamically.
2453  	 * Later use a pre-computed list for speed.
2454  	 */
2455  	if (!precomputed) {
2456  		for (i = 0; i < count; i++)
2457  			set_free_obj(page, i, i);
2458  
2459  		/* Fisher-Yates shuffle */
2460  		for (i = count - 1; i > 0; i--) {
2461  			rand = prandom_u32_state(&state.rnd_state);
2462  			rand %= (i + 1);
2463  			swap_free_obj(page, i, rand);
2464  		}
2465  	} else {
2466  		for (i = 0; i < count; i++)
2467  			set_free_obj(page, i, next_random_slot(&state));
2468  	}
2469  
2470  	if (OBJFREELIST_SLAB(cachep))
2471  		set_free_obj(page, cachep->num - 1, objfreelist);
2472  
2473  	return true;
2474  }
2475  #else
/*
 * Stub used when CONFIG_SLAB_FREELIST_RANDOM is not set: never shuffles and
 * always returns false, so the caller builds the freelist itself.
 */
static inline bool shuffle_freelist(struct kmem_cache *cachep,
				struct page *page)
{
	return false;
}
2481  #endif /* CONFIG_SLAB_FREELIST_RANDOM */
2482  
2483  static void cache_init_objs(struct kmem_cache *cachep,
2484  			    struct page *page)
2485  {
2486  	int i;
2487  	void *objp;
2488  	bool shuffled;
2489  
2490  	cache_init_objs_debug(cachep, page);
2491  
2492  	/* Try to randomize the freelist if enabled */
2493  	shuffled = shuffle_freelist(cachep, page);
2494  
2495  	if (!shuffled && OBJFREELIST_SLAB(cachep)) {
2496  		page->freelist = index_to_obj(cachep, page, cachep->num - 1) +
2497  						obj_offset(cachep);
2498  	}
2499  
2500  	for (i = 0; i < cachep->num; i++) {
2501  		objp = index_to_obj(cachep, page, i);
2502  		objp = kasan_init_slab_obj(cachep, objp);
2503  
2504  		/* constructor could break poison info */
2505  		if (DEBUG == 0 && cachep->ctor) {
2506  			kasan_unpoison_object_data(cachep, objp);
2507  			cachep->ctor(objp);
2508  			kasan_poison_object_data(cachep, objp);
2509  		}
2510  
2511  		if (!shuffled)
2512  			set_free_obj(page, i, i);
2513  	}
2514  }
2515  
2516  static void *slab_get_obj(struct kmem_cache *cachep, struct page *page)
2517  {
2518  	void *objp;
2519  
2520  	objp = index_to_obj(cachep, page, get_free_obj(page, page->active));
2521  	page->active++;
2522  
2523  	return objp;
2524  }
2525  
2526  static void slab_put_obj(struct kmem_cache *cachep,
2527  			struct page *page, void *objp)
2528  {
2529  	unsigned int objnr = obj_to_index(cachep, page, objp);
2530  #if DEBUG
2531  	unsigned int i;
2532  
2533  	/* Verify double free bug */
2534  	for (i = page->active; i < cachep->num; i++) {
2535  		if (get_free_obj(page, i) == objnr) {
2536  			pr_err("slab: double free detected in cache '%s', objp %px\n",
2537  			       cachep->name, objp);
2538  			BUG();
2539  		}
2540  	}
2541  #endif
2542  	page->active--;
2543  	if (!page->freelist)
2544  		page->freelist = objp + obj_offset(cachep);
2545  
2546  	set_free_obj(page, page->active, objnr);
2547  }
2548  
2549  /*
2550   * Map pages beginning at addr to the given cache and slab. This is required
2551   * for the slab allocator to be able to lookup the cache and slab of a
2552   * virtual address for kfree, ksize, and slab debugging.
2553   */
2554  static void slab_map_pages(struct kmem_cache *cache, struct page *page,
2555  			   void *freelist)
2556  {
2557  	page->slab_cache = cache;
2558  	page->freelist = freelist;
2559  }
2560  
/*
 * Grow (by 1) the number of slabs within a cache.  This is called by
 * kmem_cache_alloc() when there are no active objs left in a cache.
 *
 * Allocates the pages for one new slab, picks its colour offset, obtains
 * the freelist storage and initialises all objects.  The slab is not put on
 * any kmem_cache_node list here; cache_grow_end() does that.
 *
 * Must be entered with interrupts off.  They are enabled for the duration
 * of the work when the gfp flags allow blocking, and are off again on every
 * return path.
 *
 * Returns the new slab page, or NULL if the pages or an off-slab freelist
 * could not be allocated.
 */
static struct page *cache_grow_begin(struct kmem_cache *cachep,
				gfp_t flags, int nodeid)
{
	void *freelist;
	size_t offset;
	gfp_t local_flags;
	int page_node;
	struct kmem_cache_node *n;
	struct page *page;

	/*
	 * Be lazy and only check for valid flags here,  keeping it out of the
	 * critical path in kmem_cache_alloc().
	 */
	if (unlikely(flags & GFP_SLAB_BUG_MASK))
		flags = kmalloc_fix_flags(flags);

	/* Warn once if a cache with a constructor is asked for zeroed memory. */
	WARN_ON_ONCE(cachep->ctor && (flags & __GFP_ZERO));
	local_flags = flags & (GFP_CONSTRAINT_MASK|GFP_RECLAIM_MASK);

	check_irq_off();
	if (gfpflags_allow_blocking(local_flags))
		local_irq_enable();

	/*
	 * Get mem for the objs.  Attempt to allocate a physical page from
	 * 'nodeid'.
	 */
	page = kmem_getpages(cachep, local_flags, nodeid);
	if (!page)
		goto failed;

	/* Use the node the page actually came from for all bookkeeping. */
	page_node = page_to_nid(page);
	n = get_node(cachep, page_node);

	/* Get colour for the slab, and calculate the next value. */
	n->colour_next++;
	if (n->colour_next >= cachep->colour)
		n->colour_next = 0;

	/* Re-check the bound on the local copy before scaling it. */
	offset = n->colour_next;
	if (offset >= cachep->colour)
		offset = 0;

	offset *= cachep->colour_off;

	/*
	 * Call kasan_poison_slab() before calling alloc_slabmgmt(), so
	 * page_address() in the latter returns a non-tagged pointer,
	 * as it should be for slab pages.
	 */
	kasan_poison_slab(page);

	/* Get slab management. */
	freelist = alloc_slabmgmt(cachep, page, offset,
			local_flags & ~GFP_CONSTRAINT_MASK, page_node);
	/* A NULL freelist is only an error for off-slab caches. */
	if (OFF_SLAB(cachep) && !freelist)
		goto opps1;

	slab_map_pages(cachep, page, freelist);

	cache_init_objs(cachep, page);

	if (gfpflags_allow_blocking(local_flags))
		local_irq_disable();

	return page;

opps1:
	/* Freelist allocation failed: give the pages back. */
	kmem_freepages(cachep, page);
failed:
	if (gfpflags_allow_blocking(local_flags))
		local_irq_disable();
	return NULL;
}
2640  
2641  static void cache_grow_end(struct kmem_cache *cachep, struct page *page)
2642  {
2643  	struct kmem_cache_node *n;
2644  	void *list = NULL;
2645  
2646  	check_irq_off();
2647  
2648  	if (!page)
2649  		return;
2650  
2651  	INIT_LIST_HEAD(&page->slab_list);
2652  	n = get_node(cachep, page_to_nid(page));
2653  
2654  	spin_lock(&n->list_lock);
2655  	n->total_slabs++;
2656  	if (!page->active) {
2657  		list_add_tail(&page->slab_list, &n->slabs_free);
2658  		n->free_slabs++;
2659  	} else
2660  		fixup_slab_list(cachep, n, page, &list);
2661  
2662  	STATS_INC_GROWN(cachep);
2663  	n->free_objects += cachep->num - page->active;
2664  	spin_unlock(&n->list_lock);
2665  
2666  	fixup_objfreelist_debug(cachep, &list);
2667  }
2668  
2669  #if DEBUG
2670  
2671  /*
2672   * Perform extra freeing checks:
2673   * - detect bad pointers.
2674   * - POISON/RED_ZONE checking
2675   */
/* Log and BUG() if @objp is not a valid kernel virtual address. */
static void kfree_debugcheck(const void *objp)
{
	if (virt_addr_valid(objp))
		return;

	pr_err("kfree_debugcheck: out of range ptr %lxh\n",
	       (unsigned long)objp);
	BUG();
}
2684  
2685  static inline void verify_redzone_free(struct kmem_cache *cache, void *obj)
2686  {
2687  	unsigned long long redzone1, redzone2;
2688  
2689  	redzone1 = *dbg_redzone1(cache, obj);
2690  	redzone2 = *dbg_redzone2(cache, obj);
2691  
2692  	/*
2693  	 * Redzone is ok.
2694  	 */
2695  	if (redzone1 == RED_ACTIVE && redzone2 == RED_ACTIVE)
2696  		return;
2697  
2698  	if (redzone1 == RED_INACTIVE && redzone2 == RED_INACTIVE)
2699  		slab_error(cache, "double free detected");
2700  	else
2701  		slab_error(cache, "memory outside object was overwritten");
2702  
2703  	pr_err("%px: redzone 1:0x%llx, redzone 2:0x%llx\n",
2704  	       obj, redzone1, redzone2);
2705  }
2706  
/*
 * DEBUG-build validation of an object that is being freed to @cachep.
 *
 * @objp is first moved back by obj_offset(cachep); all checks and the
 * return value use that adjusted pointer.  @caller is recorded in the
 * object's user word when SLAB_STORE_USER is set.
 *
 * Returns @objp minus obj_offset(cachep).
 */
static void *cache_free_debugcheck(struct kmem_cache *cachep, void *objp,
				   unsigned long caller)
{
	unsigned int objnr;
	struct page *page;

	/* The object must belong to the cache it is being freed to. */
	BUG_ON(virt_to_cache(objp) != cachep);

	objp -= obj_offset(cachep);
	kfree_debugcheck(objp);
	page = virt_to_head_page(objp);

	if (cachep->flags & SLAB_RED_ZONE) {
		/* Report bad red zones, then mark both as inactive. */
		verify_redzone_free(cachep, objp);
		*dbg_redzone1(cachep, objp) = RED_INACTIVE;
		*dbg_redzone2(cachep, objp) = RED_INACTIVE;
	}
	if (cachep->flags & SLAB_STORE_USER)
		*dbg_userword(cachep, objp) = (void *)caller;

	objnr = obj_to_index(cachep, page, objp);

	/* The pointer must map to an in-range index and exactly back again. */
	BUG_ON(objnr >= cachep->num);
	BUG_ON(objp != index_to_obj(cachep, page, objnr));

	if (cachep->flags & SLAB_POISON) {
		poison_obj(cachep, objp, POISON_FREE);
		slab_kernel_map(cachep, objp, 0);
	}
	return objp;
}
2738  
2739  #else
/*
 * Non-DEBUG builds: kfree_debugcheck() does nothing and
 * cache_free_debugcheck() evaluates to its object pointer unchanged.
 */
#define kfree_debugcheck(x) do { } while(0)
#define cache_free_debugcheck(x, objp, z) (objp)
2742  #endif
2743  
/*
 * DEBUG builds: walk the singly linked chain starting at *@list, where each
 * element's first word points to the next element, and poison every object
 * on it with POISON_FREE.  The link is read before the object is poisoned.
 * fixup_slab_list() is what builds this chain.  Empty when DEBUG is 0.
 */
static inline void fixup_objfreelist_debug(struct kmem_cache *cachep,
						void **list)
{
#if DEBUG
	void *cur;
	void *following;

	for (cur = *list; cur; cur = following) {
		void *objp = cur - obj_offset(cachep);

		following = *(void **)cur;
		poison_obj(cachep, objp, POISON_FREE);
	}
#endif
}
2758  
2759  static inline void fixup_slab_list(struct kmem_cache *cachep,
2760  				struct kmem_cache_node *n, struct page *page,
2761  				void **list)
2762  {
2763  	/* move slabp to correct slabp list: */
2764  	list_del(&page->slab_list);
2765  	if (page->active == cachep->num) {
2766  		list_add(&page->slab_list, &n->slabs_full);
2767  		if (OBJFREELIST_SLAB(cachep)) {
2768  #if DEBUG
2769  			/* Poisoning will be done without holding the lock */
2770  			if (cachep->flags & SLAB_POISON) {
2771  				void **objp = page->freelist;
2772  
2773  				*objp = *list;
2774  				*list = objp;
2775  			}
2776  #endif
2777  			page->freelist = NULL;
2778  		}
2779  	} else
2780  		list_add(&page->slab_list, &n->slabs_partial);
2781  }
2782  
2783  /* Try to find non-pfmemalloc slab if needed */
2784  static noinline struct page *get_valid_first_slab(struct kmem_cache_node *n,
2785  					struct page *page, bool pfmemalloc)
2786  {
2787  	if (!page)
2788  		return NULL;
2789  
2790  	if (pfmemalloc)
2791  		return page;
2792  
2793  	if (!PageSlabPfmemalloc(page))
2794  		return page;
2795  
2796  	/* No need to keep pfmemalloc slab if we have enough free objects */
2797  	if (n->free_objects > n->free_limit) {
2798  		ClearPageSlabPfmemalloc(page);
2799  		return page;
2800  	}
2801  
2802  	/* Move pfmemalloc slab to the end of list to speed up next search */
2803  	list_del(&page->slab_list);
2804  	if (!page->active) {
2805  		list_add_tail(&page->slab_list, &n->slabs_free);
2806  		n->free_slabs++;
2807  	} else
2808  		list_add_tail(&page->slab_list, &n->slabs_partial);
2809  
2810  	list_for_each_entry(page, &n->slabs_partial, slab_list) {
2811  		if (!PageSlabPfmemalloc(page))
2812  			return page;
2813  	}
2814  
2815  	n->free_touched = 1;
2816  	list_for_each_entry(page, &n->slabs_free, slab_list) {
2817  		if (!PageSlabPfmemalloc(page)) {
2818  			n->free_slabs--;
2819  			return page;
2820  		}
2821  	}
2822  
2823  	return NULL;
2824  }
2825  
2826  static struct page *get_first_slab(struct kmem_cache_node *n, bool pfmemalloc)
2827  {
2828  	struct page *page;
2829  
2830  	assert_spin_locked(&n->list_lock);
2831  	page = list_first_entry_or_null(&n->slabs_partial, struct page,
2832  					slab_list);
2833  	if (!page) {
2834  		n->free_touched = 1;
2835  		page = list_first_entry_or_null(&n->slabs_free, struct page,
2836  						slab_list);
2837  		if (page)
2838  			n->free_slabs--;
2839  	}
2840  
2841  	if (sk_memalloc_socks())
2842  		page = get_valid_first_slab(n, page, pfmemalloc);
2843  
2844  	return page;
2845  }
2846  
2847  static noinline void *cache_alloc_pfmemalloc(struct kmem_cache *cachep,
2848  				struct kmem_cache_node *n, gfp_t flags)
2849  {
2850  	struct page *page;
2851  	void *obj;
2852  	void *list = NULL;
2853  
2854  	if (!gfp_pfmemalloc_allowed(flags))
2855  		return NULL;
2856  
2857  	spin_lock(&n->list_lock);
2858  	page = get_first_slab(n, true);
2859  	if (!page) {
2860  		spin_unlock(&n->list_lock);
2861  		return NULL;
2862  	}
2863  
2864  	obj = slab_get_obj(cachep, page);
2865  	n->free_objects--;
2866  
2867  	fixup_slab_list(cachep, n, page, &list);
2868  
2869  	spin_unlock(&n->list_lock);
2870  	fixup_objfreelist_debug(cachep, &list);
2871  
2872  	return obj;
2873  }
2874  
2875  /*
2876   * Slab list should be fixed up by fixup_slab_list() for existing slab
2877   * or cache_grow_end() for new slab
2878   */
2879  static __always_inline int alloc_block(struct kmem_cache *cachep,
2880  		struct array_cache *ac, struct page *page, int batchcount)
2881  {
2882  	/*
2883  	 * There must be at least one object available for
2884  	 * allocation.
2885  	 */
2886  	BUG_ON(page->active >= cachep->num);
2887  
2888  	while (page->active < cachep->num && batchcount--) {
2889  		STATS_INC_ALLOCED(cachep);
2890  		STATS_INC_ACTIVE(cachep);
2891  		STATS_SET_HIGH(cachep);
2892  
2893  		ac->entry[ac->avail++] = slab_get_obj(cachep, page);
2894  	}
2895  
2896  	return batchcount;
2897  }
2898  
/*
 * Refill the calling CPU's array cache of @cachep and hand out one object.
 *
 * Sources are tried in this order:
 *   1. the local node's shared array (transfer_objects()),
 *   2. slabs on the local node's partial/free lists,
 *   3. if the array is still empty: an object from a pfmemalloc slab (only
 *      when sk_memalloc_socks() is true), returned directly,
 *   4. if the array is still empty: a newly grown slab.
 *
 * Must be called with interrupts off and an empty per-CPU array (both are
 * asserted).  Returns an object, or NULL if none could be obtained.
 */
static void *cache_alloc_refill(struct kmem_cache *cachep, gfp_t flags)
{
	int batchcount;
	struct kmem_cache_node *n;
	struct array_cache *ac, *shared;
	int node;
	void *list = NULL;
	struct page *page;

	check_irq_off();
	node = numa_mem_id();

	ac = cpu_cache_get(cachep);
	batchcount = ac->batchcount;
	if (!ac->touched && batchcount > BATCHREFILL_LIMIT) {
		/*
		 * If there was little recent activity on this cache, then
		 * perform only a partial refill.  Otherwise we could generate
		 * refill bouncing.
		 */
		batchcount = BATCHREFILL_LIMIT;
	}
	n = get_node(cachep, node);

	BUG_ON(ac->avail > 0 || !n);
	/* Unlocked peek: skip the lock when the node clearly has nothing. */
	shared = READ_ONCE(n->shared);
	if (!n->free_objects && (!shared || !shared->avail))
		goto direct_grow;

	spin_lock(&n->list_lock);
	/* Re-read the shared array pointer now that the lock is held. */
	shared = READ_ONCE(n->shared);

	/* See if we can refill from the shared array */
	if (shared && transfer_objects(ac, shared, batchcount)) {
		shared->touched = 1;
		goto alloc_done;
	}

	while (batchcount > 0) {
		/* Get slab alloc is to come from. */
		page = get_first_slab(n, false);
		if (!page)
			goto must_grow;

		check_spinlock_acquired(cachep);

		/* alloc_block() returns what is left of the batch. */
		batchcount = alloc_block(cachep, ac, page, batchcount);
		fixup_slab_list(cachep, n, page, &list);
	}

must_grow:
	/* ac was empty on entry, so ac->avail is what was taken from slabs. */
	n->free_objects -= ac->avail;
alloc_done:
	spin_unlock(&n->list_lock);
	fixup_objfreelist_debug(cachep, &list);

direct_grow:
	if (unlikely(!ac->avail)) {
		/* Check if we can use obj in pfmemalloc slab */
		if (sk_memalloc_socks()) {
			void *obj = cache_alloc_pfmemalloc(cachep, n, flags);

			if (obj)
				return obj;
		}

		page = cache_grow_begin(cachep, gfp_exact_node(flags), node);

		/*
		 * cache_grow_begin() can reenable interrupts,
		 * then ac could change.
		 */
		ac = cpu_cache_get(cachep);
		if (!ac->avail && page)
			alloc_block(cachep, ac, page, batchcount);
		/* cache_grow_end() accepts a NULL page. */
		cache_grow_end(cachep, page);

		if (!ac->avail)
			return NULL;
	}
	ac->touched = 1;

	return ac->entry[--ac->avail];
}
2983  
/*
 * Pre-allocation check: annotate that the caller may sleep whenever @flags
 * allow blocking.  @cachep is not used.
 */
static inline void cache_alloc_debugcheck_before(struct kmem_cache *cachep,
						gfp_t flags)
{
	might_sleep_if(gfpflags_allow_blocking(flags));
}
2989  
2990  #if DEBUG
/*
 * DEBUG-build post-processing of a freshly allocated object.
 *
 * @objp is passed through untouched when it is NULL or a kfence address.
 * Otherwise, depending on the cache's debug flags, the poison pattern is
 * verified and replaced, @caller is recorded in the user word, and the red
 * zones are checked and switched to active.
 *
 * Returns @objp advanced by obj_offset(cachep); for poisoned caches with a
 * constructor, the constructor is run on that pointer first.
 */
static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep,
				gfp_t flags, void *objp, unsigned long caller)
{
	/* Warn once if a cache with a constructor is asked for zeroed memory. */
	WARN_ON_ONCE(cachep->ctor && (flags & __GFP_ZERO));
	if (!objp || is_kfence_address(objp))
		return objp;
	if (cachep->flags & SLAB_POISON) {
		check_poison_obj(cachep, objp);
		slab_kernel_map(cachep, objp, 1);
		poison_obj(cachep, objp, POISON_INUSE);
	}
	if (cachep->flags & SLAB_STORE_USER)
		*dbg_userword(cachep, objp) = (void *)caller;

	if (cachep->flags & SLAB_RED_ZONE) {
		/* A free object must have both red zones inactive. */
		if (*dbg_redzone1(cachep, objp) != RED_INACTIVE ||
				*dbg_redzone2(cachep, objp) != RED_INACTIVE) {
			slab_error(cachep, "double free, or memory outside object was overwritten");
			pr_err("%px: redzone 1:0x%llx, redzone 2:0x%llx\n",
			       objp, *dbg_redzone1(cachep, objp),
			       *dbg_redzone2(cachep, objp));
		}
		*dbg_redzone1(cachep, objp) = RED_ACTIVE;
		*dbg_redzone2(cachep, objp) = RED_ACTIVE;
	}

	/* From here on objp is the pointer handed to the caller. */
	objp += obj_offset(cachep);
	if (cachep->ctor && cachep->flags & SLAB_POISON)
		cachep->ctor(objp);
	/* Only logged, not fatal. */
	if (ARCH_SLAB_MINALIGN &&
	    ((unsigned long)objp & (ARCH_SLAB_MINALIGN-1))) {
		pr_err("0x%px: not aligned to ARCH_SLAB_MINALIGN=%d\n",
		       objp, (int)ARCH_SLAB_MINALIGN);
	}
	return objp;
}
3027  #else
/* Non-DEBUG builds: evaluates to the object pointer unchanged. */
#define cache_alloc_debugcheck_after(a, b, objp, d) (objp)
3029  #endif
3030  
3031  static inline void *____cache_alloc(struct kmem_cache *cachep, gfp_t flags)
3032  {
3033  	void *objp;
3034  	struct array_cache *ac;
3035  
3036  	check_irq_off();
3037  
3038  	ac = cpu_cache_get(cachep);
3039  	if (likely(ac->avail)) {
3040  		ac->touched = 1;
3041  		objp = ac->entry[--ac->avail];
3042  
3043  		STATS_INC_ALLOCHIT(cachep);
3044  		goto out;
3045  	}
3046  
3047  	STATS_INC_ALLOCMISS(cachep);
3048  	objp = cache_alloc_refill(cachep, flags);
3049  	/*
3050  	 * the 'ac' may be updated by cache_alloc_refill(),
3051  	 * and kmemleak_erase() requires its correct value.
3052  	 */
3053  	ac = cpu_cache_get(cachep);
3054  
3055  out:
3056  	/*
3057  	 * To avoid a false negative, if an object that is in one of the
3058  	 * per-CPU caches is leaked, we need to make sure kmemleak doesn't
3059  	 * treat the array pointers as a reference to the object.
3060  	 */
3061  	if (objp)
3062  		kmemleak_erase(&ac->entry[ac->avail]);
3063  	return objp;
3064  }
3065  
3066  #ifdef CONFIG_NUMA
3067  /*
3068   * Try allocating on another node if PFA_SPREAD_SLAB is a mempolicy is set.
3069   *
3070   * If we are in_interrupt, then process context, including cpusets and
3071   * mempolicy, may not apply and should not be used for allocation policy.
3072   */
3073  static void *alternate_node_alloc(struct kmem_cache *cachep, gfp_t flags)
3074  {
3075  	int nid_alloc, nid_here;
3076  
3077  	if (in_interrupt() || (flags & __GFP_THISNODE))
3078  		return NULL;
3079  	nid_alloc = nid_here = numa_mem_id();
3080  	if (cpuset_do_slab_mem_spread() && (cachep->flags & SLAB_MEM_SPREAD))
3081  		nid_alloc = cpuset_slab_spread_node();
3082  	else if (current->mempolicy)
3083  		nid_alloc = mempolicy_slab_node();
3084  	if (nid_alloc != nid_here)
3085  		return ____cache_alloc_node(cachep, flags, nid_alloc);
3086  	return NULL;
3087  }
3088  
/*
 * Fallback function if there was no memory available and no objects on a
 * certain node and fall back is permitted. First we scan all the
 * available nodes for available objects. If that fails then we
 * perform an allocation without specifying a node. This allows the page
 * allocator to do its reclaim / fallback magic. We then insert the
 * slab into the proper nodelist and then allocate from it.
 *
 * Returns the allocated object, or NULL. __GFP_THISNODE requests are never
 * served from here and always yield NULL.
 */
static void *fallback_alloc(struct kmem_cache *cache, gfp_t flags)
{
	struct zonelist *zonelist;
	struct zoneref *z;
	struct zone *zone;
	enum zone_type highest_zoneidx = gfp_zone(flags);
	void *obj = NULL;
	struct page *page;
	int nid;
	unsigned int cpuset_mems_cookie;

	if (flags & __GFP_THISNODE)
		return NULL;

retry_cpuset:
	/*
	 * The cookie is checked by read_mems_allowed_retry() at the bottom;
	 * if it asks for a retry and we still have no object, start over
	 * from here with a fresh zonelist.
	 */
	cpuset_mems_cookie = read_mems_allowed_begin();
	zonelist = node_zonelist(mempolicy_slab_node(), flags);

retry:
	/*
	 * Look through allowed nodes for objects available
	 * from existing per node queues.
	 */
	for_each_zone_zonelist(zone, z, zonelist, highest_zoneidx) {
		nid = zone_to_nid(zone);

		if (cpuset_zone_allowed(zone, flags) &&
			get_node(cache, nid) &&
			get_node(cache, nid)->free_objects) {
				obj = ____cache_alloc_node(cache,
					gfp_exact_node(flags), nid);
				if (obj)
					break;
		}
	}

	if (!obj) {
		/*
		 * This allocation will be performed within the constraints
		 * of the current cpuset / memory policy requirements.
		 * We may trigger various forms of reclaim on the allowed
		 * set and go into memory reserves if necessary.
		 */
		page = cache_grow_begin(cache, flags, numa_mem_id());
		cache_grow_end(cache, page);
		if (page) {
			/*
			 * Allocate from the node the new slab page actually
			 * belongs to, as reported by page_to_nid().
			 */
			nid = page_to_nid(page);
			obj = ____cache_alloc_node(cache,
				gfp_exact_node(flags), nid);

			/*
			 * Another processor may allocate the objects in
			 * the slab since we are not holding any locks.
			 */
			if (!obj)
				goto retry;
		}
	}

	if (unlikely(!obj && read_mems_allowed_retry(cpuset_mems_cookie)))
		goto retry_cpuset;
	return obj;
}
3160  
3161  /*
3162   * A interface to enable slab creation on nodeid
3163   */
3164  static void *____cache_alloc_node(struct kmem_cache *cachep, gfp_t flags,
3165  				int nodeid)
3166  {
3167  	struct page *page;
3168  	struct kmem_cache_node *n;
3169  	void *obj = NULL;
3170  	void *list = NULL;
3171  
3172  	VM_BUG_ON(nodeid < 0 || nodeid >= MAX_NUMNODES);
3173  	n = get_node(cachep, nodeid);
3174  	BUG_ON(!n);
3175  
3176  	check_irq_off();
3177  	spin_lock(&n->list_lock);
3178  	page = get_first_slab(n, false);
3179  	if (!page)
3180  		goto must_grow;
3181  
3182  	check_spinlock_acquired_node(cachep, nodeid);
3183  
3184  	STATS_INC_NODEALLOCS(cachep);
3185  	STATS_INC_ACTIVE(cachep);
3186  	STATS_SET_HIGH(cachep);
3187  
3188  	BUG_ON(page->active == cachep->num);
3189  
3190  	obj = slab_get_obj(cachep, page);
3191  	n->free_objects--;
3192  
3193  	fixup_slab_list(cachep, n, page, &list);
3194  
3195  	spin_unlock(&n->list_lock);
3196  	fixup_objfreelist_debug(cachep, &list);
3197  	return obj;
3198  
3199  must_grow:
3200  	spin_unlock(&n->list_lock);
3201  	page = cache_grow_begin(cachep, gfp_exact_node(flags), nodeid);
3202  	if (page) {
3203  		/* This slab isn't counted yet so don't update free_objects */
3204  		obj = slab_get_obj(cachep, page);
3205  	}
3206  	cache_grow_end(cachep, page);
3207  
3208  	return obj ? obj : fallback_alloc(cachep, flags);
3209  }
3210  
3211  static __always_inline void *
3212  slab_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid, size_t orig_size,
3213  		   unsigned long caller)
3214  {
3215  	unsigned long save_flags;
3216  	void *ptr;
3217  	int slab_node = numa_mem_id();
3218  	struct obj_cgroup *objcg = NULL;
3219  
3220  	flags &= gfp_allowed_mask;
3221  	cachep = slab_pre_alloc_hook(cachep, &objcg, 1, flags);
3222  	if (unlikely(!cachep))
3223  		return NULL;
3224  
3225  	ptr = kfence_alloc(cachep, orig_size, flags);
3226  	if (unlikely(ptr))
3227  		goto out_hooks;
3228  
3229  	cache_alloc_debugcheck_before(cachep, flags);
3230  	local_irq_save(save_flags);
3231  
3232  	if (nodeid == NUMA_NO_NODE)
3233  		nodeid = slab_node;
3234  
3235  	if (unlikely(!get_node(cachep, nodeid))) {
3236  		/* Node not bootstrapped yet */
3237  		ptr = fallback_alloc(cachep, flags);
3238  		goto out;
3239  	}
3240  
3241  	if (nodeid == slab_node) {
3242  		/*
3243  		 * Use the locally cached objects if possible.
3244  		 * However ____cache_alloc does not allow fallback
3245  		 * to other nodes. It may fail while we still have
3246  		 * objects on other nodes available.
3247  		 */
3248  		ptr = ____cache_alloc(cachep, flags);
3249  		if (ptr)
3250  			goto out;
3251  	}
3252  	/* ___cache_alloc_node can fall back to other nodes */
3253  	ptr = ____cache_alloc_node(cachep, flags, nodeid);
3254    out:
3255  	local_irq_restore(save_flags);
3256  	ptr = cache_alloc_debugcheck_after(cachep, flags, ptr, caller);
3257  
3258  	if (unlikely(slab_want_init_on_alloc(flags, cachep)) && ptr)
3259  		memset(ptr, 0, cachep->object_size);
3260  
3261  out_hooks:
3262  	slab_post_alloc_hook(cachep, objcg, flags, 1, &ptr);
3263  	return ptr;
3264  }
3265  
3266  static __always_inline void *
3267  __do_cache_alloc(struct kmem_cache *cache, gfp_t flags)
3268  {
3269  	void *objp;
3270  
3271  	if (current->mempolicy || cpuset_do_slab_mem_spread()) {
3272  		objp = alternate_node_alloc(cache, flags);
3273  		if (objp)
3274  			goto out;
3275  	}
3276  	objp = ____cache_alloc(cache, flags);
3277  
3278  	/*
3279  	 * We may just have run out of memory on the local node.
3280  	 * ____cache_alloc_node() knows how to locate memory on other nodes
3281  	 */
3282  	if (!objp)
3283  		objp = ____cache_alloc_node(cache, flags, numa_mem_id());
3284  
3285    out:
3286  	return objp;
3287  }
3288  #else
3289  
/*
 * Variant built when CONFIG_NUMA is not set: it simply forwards to
 * ____cache_alloc().
 */
static __always_inline void *
__do_cache_alloc(struct kmem_cache *cachep, gfp_t flags)
{
	return ____cache_alloc(cachep, flags);
}
3295  
3296  #endif /* CONFIG_NUMA */
3297  
/*
 * Common allocation path for callers that do not name a node.
 *
 * @orig_size is what is passed to kfence_alloc(); @caller is forwarded to
 * the post-allocation debug check.  Returns the object, or NULL when the
 * pre-allocation hook rejects the request or no object could be obtained.
 */
static __always_inline void *
slab_alloc(struct kmem_cache *cachep, gfp_t flags, size_t orig_size, unsigned long caller)
{
	unsigned long save_flags;
	void *objp;
	struct obj_cgroup *objcg = NULL;

	flags &= gfp_allowed_mask;
	cachep = slab_pre_alloc_hook(cachep, &objcg, 1, flags);
	if (unlikely(!cachep))
		return NULL;

	/*
	 * An object supplied by KFENCE skips the slab path, the debug checks
	 * and the init-on-alloc memset below; only the post hook runs.
	 */
	objp = kfence_alloc(cachep, orig_size, flags);
	if (unlikely(objp))
		goto out;

	cache_alloc_debugcheck_before(cachep, flags);
	/* __do_cache_alloc() runs with local interrupts disabled. */
	local_irq_save(save_flags);
	objp = __do_cache_alloc(cachep, flags);
	local_irq_restore(save_flags);
	objp = cache_alloc_debugcheck_after(cachep, flags, objp, caller);
	prefetchw(objp);

	/* objp may be NULL here, hence the extra check before zeroing. */
	if (unlikely(slab_want_init_on_alloc(flags, cachep)) && objp)
		memset(objp, 0, cachep->object_size);

out:
	slab_post_alloc_hook(cachep, objcg, flags, 1, &objp);
	return objp;
}
3328  
3329  /*
3330   * Caller needs to acquire correct kmem_cache_node's list_lock
3331   * @list: List of detached free slabs should be freed by caller
3332   */
3333  static void free_block(struct kmem_cache *cachep, void **objpp,
3334  			int nr_objects, int node, struct list_head *list)
3335  {
3336  	int i;
3337  	struct kmem_cache_node *n = get_node(cachep, node);
3338  	struct page *page;
3339  
3340  	n->free_objects += nr_objects;
3341  
3342  	for (i = 0; i < nr_objects; i++) {
3343  		void *objp;
3344  		struct page *page;
3345  
3346  		objp = objpp[i];
3347  
3348  		page = virt_to_head_page(objp);
3349  		list_del(&page->slab_list);
3350  		check_spinlock_acquired_node(cachep, node);
3351  		slab_put_obj(cachep, page, objp);
3352  		STATS_DEC_ACTIVE(cachep);
3353  
3354  		/* fixup slab chains */
3355  		if (page->active == 0) {
3356  			list_add(&page->slab_list, &n->slabs_free);
3357  			n->free_slabs++;
3358  		} else {
3359  			/* Unconditionally move a slab to the end of the
3360  			 * partial list on free - maximum time for the
3361  			 * other objects to be freed, too.
3362  			 */
3363  			list_add_tail(&page->slab_list, &n->slabs_partial);
3364  		}
3365  	}
3366  
3367  	while (n->free_objects > n->free_limit && !list_empty(&n->slabs_free)) {
3368  		n->free_objects -= cachep->num;
3369  
3370  		page = list_last_entry(&n->slabs_free, struct page, slab_list);
3371  		list_move(&page->slab_list, list);
3372  		n->free_slabs--;
3373  		n->total_slabs--;
3374  	}
3375  }
3376  
/*
 * Make room in the per-CPU array @ac by moving up to ac->batchcount objects
 * out of its front.  The objects go to the local node's shared array when
 * that exists and has free slots (the batch is clipped to the space left);
 * otherwise they are returned to the slabs through free_block().  The
 * remaining entries of @ac are then shifted down to index 0.
 */
static void cache_flusharray(struct kmem_cache *cachep, struct array_cache *ac)
{
	int batchcount;
	struct kmem_cache_node *n;
	int node = numa_mem_id();
	LIST_HEAD(list);

	batchcount = ac->batchcount;

	check_irq_off();
	n = get_node(cachep, node);
	spin_lock(&n->list_lock);
	if (n->shared) {
		struct array_cache *shared_array = n->shared;
		int max = shared_array->limit - shared_array->avail;
		if (max) {
			if (batchcount > max)
				batchcount = max;
			memcpy(&(shared_array->entry[shared_array->avail]),
			       ac->entry, sizeof(void *) * batchcount);
			shared_array->avail += batchcount;
			goto free_done;
		}
	}

	free_block(cachep, ac->entry, batchcount, node, &list);
free_done:
#if STATS
	{
		/* Count the slabs on the free list and record it as a statistic. */
		int i = 0;
		struct page *page;

		list_for_each_entry(page, &n->slabs_free, slab_list) {
			BUG_ON(page->active);

			i++;
		}
		STATS_SET_FREEABLE(cachep, i);
	}
#endif
	spin_unlock(&n->list_lock);
	/* Drop the flushed entries and compact what is left. */
	ac->avail -= batchcount;
	memmove(ac->entry, &(ac->entry[batchcount]), sizeof(void *)*ac->avail);
	/* Slabs detached by free_block() are destroyed outside the lock. */
	slabs_destroy(cachep, &list);
}
3422  
/*
 * Release an obj back to its cache. If the obj has a constructed state, it must
 * be in this state _before_ it is released.  Called with disabled ints.
 *
 * KFENCE addresses are handed to __kfence_free() and never reach the slab
 * free path.  For everything else the object is optionally zeroed, offered
 * to KASAN, and then passed to ___cache_free() unless KASAN kept it.
 */
static __always_inline void __cache_free(struct kmem_cache *cachep, void *objp,
					 unsigned long caller)
{
	if (is_kfence_address(objp)) {
		kmemleak_free_recursive(objp, cachep->flags);
		__kfence_free(objp);
		return;
	}

	if (unlikely(slab_want_init_on_free(cachep)))
		memset(objp, 0, cachep->object_size);

	/* Put the object into the quarantine, don't touch it for now. */
	if (kasan_slab_free(cachep, objp))
		return;

	/* Use KCSAN to help debug racy use-after-free. */
	if (!(cachep->flags & SLAB_TYPESAFE_BY_RCU))
		__kcsan_check_access(objp, cachep->object_size,
				     KCSAN_ACCESS_WRITE | KCSAN_ACCESS_ASSERT);

	___cache_free(cachep, objp, caller);
}
3450  
/*
 * Second stage of freeing an object: run the kmemleak, debug and memcg
 * free hooks, then hand the object either to cache_free_alien(), to
 * cache_free_pfmemalloc(), or to the current CPU's array cache (flushing
 * that array first when it is already at its limit).  Expects interrupts
 * to be disabled (check_irq_off()).
 */
void ___cache_free(struct kmem_cache *cachep, void *objp,
		unsigned long caller)
{
	struct array_cache *ac = cpu_cache_get(cachep);

	check_irq_off();
	kmemleak_free_recursive(objp, cachep->flags);
	objp = cache_free_debugcheck(cachep, objp, caller);
	memcg_slab_free_hook(cachep, &objp, 1);

	/*
	 * Skip calling cache_free_alien() when the platform is not numa.
	 * This will avoid cache misses that happen while accessing slabp (which
	 * is per page memory  reference) to get nodeid. Instead use a global
	 * variable to skip the call, which is most likely to be present in
	 * the cache.
	 */
	if (nr_online_nodes > 1 && cache_free_alien(cachep, objp))
		return;

	if (ac->avail < ac->limit) {
		STATS_INC_FREEHIT(cachep);
	} else {
		/* Array is full: make room before storing the object. */
		STATS_INC_FREEMISS(cachep);
		cache_flusharray(cachep, ac);
	}

	if (sk_memalloc_socks()) {
		struct page *page = virt_to_head_page(objp);

		/* Objects from pfmemalloc slabs take a separate free path. */
		if (unlikely(PageSlabPfmemalloc(page))) {
			cache_free_pfmemalloc(cachep, page, objp);
			return;
		}
	}

	__free_one(ac, objp);
}
3489  
3490  /**
3491   * kmem_cache_alloc - Allocate an object
3492   * @cachep: The cache to allocate from.
3493   * @flags: See kmalloc().
3494   *
3495   * Allocate an object from this cache.  The flags are only relevant
3496   * if the cache has no available objects.
3497   *
3498   * Return: pointer to the new object or %NULL in case of error
3499   */
3500  void *kmem_cache_alloc(struct kmem_cache *cachep, gfp_t flags)
3501  {
3502  	void *ret = slab_alloc(cachep, flags, cachep->object_size, _RET_IP_);
3503  
3504  	trace_kmem_cache_alloc(_RET_IP_, ret,
3505  			       cachep->object_size, cachep->size, flags);
3506  
3507  	return ret;
3508  }
3509  EXPORT_SYMBOL(kmem_cache_alloc);
3510  
3511  static __always_inline void
3512  cache_alloc_debugcheck_after_bulk(struct kmem_cache *s, gfp_t flags,
3513  				  size_t size, void **p, unsigned long caller)
3514  {
3515  	size_t i;
3516  
3517  	for (i = 0; i < size; i++)
3518  		p[i] = cache_alloc_debugcheck_after(s, flags, p[i], caller);
3519  }
3520  
/*
 * Allocate @size objects from @s into the array @p.
 *
 * All-or-nothing: returns @size when every object was allocated; if any
 * single allocation fails, the objects obtained so far are freed again and
 * 0 is returned.  0 is also returned when the pre-allocation hook yields no
 * cache.
 */
int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size,
			  void **p)
{
	size_t i;
	struct obj_cgroup *objcg = NULL;

	s = slab_pre_alloc_hook(s, &objcg, size, flags);
	if (!s)
		return 0;

	cache_alloc_debugcheck_before(s, flags);

	local_irq_disable();
	for (i = 0; i < size; i++) {
		/* Prefer a KFENCE object; otherwise use the regular path. */
		void *objp = kfence_alloc(s, s->object_size, flags) ?: __do_cache_alloc(s, flags);

		if (unlikely(!objp))
			goto error;
		p[i] = objp;
	}
	local_irq_enable();

	cache_alloc_debugcheck_after_bulk(s, flags, size, p, _RET_IP_);

	/* Clear memory outside IRQ disabled section */
	if (unlikely(slab_want_init_on_alloc(flags, s)))
		for (i = 0; i < size; i++)
			memset(p[i], 0, s->object_size);

	slab_post_alloc_hook(s, objcg, flags, size, p);
	/* FIXME: Trace call missing. Christoph would like a bulk variant */
	return size;
error:
	/*
	 * Only the first i entries of p[] are valid here: finish their
	 * post-allocation processing, then give them back.
	 */
	local_irq_enable();
	cache_alloc_debugcheck_after_bulk(s, flags, i, p, _RET_IP_);
	slab_post_alloc_hook(s, objcg, flags, i, p);
	__kmem_cache_free_bulk(s, i, p);
	return 0;
}
EXPORT_SYMBOL(kmem_cache_alloc_bulk);
3561  
3562  #ifdef CONFIG_TRACING
/*
 * Allocate an object from @cachep on behalf of a request for @size bytes:
 * @size is what gets passed to slab_alloc(), kasan_kmalloc() and the kmalloc
 * trace event.  Returns the value produced by kasan_kmalloc().
 */
void *
kmem_cache_alloc_trace(struct kmem_cache *cachep, gfp_t flags, size_t size)
{
	void *ret;

	ret = slab_alloc(cachep, flags, size, _RET_IP_);

	ret = kasan_kmalloc(cachep, ret, size, flags);
	trace_kmalloc(_RET_IP_, ret,
		      size, cachep->size, flags);
	return ret;
}
EXPORT_SYMBOL(kmem_cache_alloc_trace);
3576  #endif
3577  
3578  #ifdef CONFIG_NUMA
3579  /**
3580   * kmem_cache_alloc_node - Allocate an object on the specified node
3581   * @cachep: The cache to allocate from.
3582   * @flags: See kmalloc().
3583   * @nodeid: node number of the target node.
3584   *
3585   * Identical to kmem_cache_alloc but it will allocate memory on the given
3586   * node, which can improve the performance for cpu bound structures.
3587   *
3588   * Fallback to other node is possible if __GFP_THISNODE is not set.
3589   *
3590   * Return: pointer to the new object or %NULL in case of error
3591   */
3592  void *kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid)
3593  {
3594  	void *ret = slab_alloc_node(cachep, flags, nodeid, cachep->object_size, _RET_IP_);
3595  
3596  	trace_kmem_cache_alloc_node(_RET_IP_, ret,
3597  				    cachep->object_size, cachep->size,
3598  				    flags, nodeid);
3599  
3600  	return ret;
3601  }
3602  EXPORT_SYMBOL(kmem_cache_alloc_node);
3603  
3604  #ifdef CONFIG_TRACING
/*
 * Node-aware counterpart of kmem_cache_alloc_trace(): allocate from @cachep
 * on @nodeid for a request of @size bytes, pass the result through
 * kasan_kmalloc() and emit the kmalloc_node trace event.
 */
void *kmem_cache_alloc_node_trace(struct kmem_cache *cachep,
				  gfp_t flags,
				  int nodeid,
				  size_t size)
{
	void *ret;

	ret = slab_alloc_node(cachep, flags, nodeid, size, _RET_IP_);

	ret = kasan_kmalloc(cachep, ret, size, flags);
	trace_kmalloc_node(_RET_IP_, ret,
			   size, cachep->size,
			   flags, nodeid);
	return ret;
}
EXPORT_SYMBOL(kmem_cache_alloc_node_trace);
3621  #endif
3622  
3623  static __always_inline void *
3624  __do_kmalloc_node(size_t size, gfp_t flags, int node, unsigned long caller)
3625  {
3626  	struct kmem_cache *cachep;
3627  	void *ret;
3628  
3629  	if (unlikely(size > KMALLOC_MAX_CACHE_SIZE))
3630  		return NULL;
3631  	cachep = kmalloc_slab(size, flags);
3632  	if (unlikely(ZERO_OR_NULL_PTR(cachep)))
3633  		return cachep;
3634  	ret = kmem_cache_alloc_node_trace(cachep, flags, node, size);
3635  	ret = kasan_kmalloc(cachep, ret, size, flags);
3636  
3637  	return ret;
3638  }
3639  
/*
 * Exported node-aware kmalloc entry point; passes its own return address
 * (_RET_IP_) as the caller argument of __do_kmalloc_node().
 */
void *__kmalloc_node(size_t size, gfp_t flags, int node)
{
	return __do_kmalloc_node(size, flags, node, _RET_IP_);
}
EXPORT_SYMBOL(__kmalloc_node);
3645  
/*
 * Same as __kmalloc_node(), except that the caller address is supplied
 * explicitly through @caller instead of being taken from _RET_IP_.
 */
void *__kmalloc_node_track_caller(size_t size, gfp_t flags,
		int node, unsigned long caller)
{
	return __do_kmalloc_node(size, flags, node, caller);
}
EXPORT_SYMBOL(__kmalloc_node_track_caller);
3652  #endif /* CONFIG_NUMA */
3653  
3654  void kmem_obj_info(struct kmem_obj_info *kpp, void *object, struct page *page)
3655  {
3656  	struct kmem_cache *cachep;
3657  	unsigned int objnr;
3658  	void *objp;
3659  
3660  	kpp->kp_ptr = object;
3661  	kpp->kp_page = page;
3662  	cachep = page->slab_cache;
3663  	kpp->kp_slab_cache = cachep;
3664  	objp = object - obj_offset(cachep);
3665  	kpp->kp_data_offset = obj_offset(cachep);
3666  	page = virt_to_head_page(objp);
3667  	objnr = obj_to_index(cachep, page, objp);
3668  	objp = index_to_obj(cachep, page, objnr);
3669  	kpp->kp_objp = objp;
3670  	if (DEBUG && cachep->flags & SLAB_STORE_USER)
3671  		kpp->kp_ret = *dbg_userword(cachep, objp);
3672  }
3673  
/**
 * __do_kmalloc - allocate memory
 * @size: how many bytes of memory are required.
 * @flags: the type of memory to allocate (see kmalloc).
 * @caller: function caller for debug tracking of the caller
 *
 * Requests larger than KMALLOC_MAX_CACHE_SIZE are refused with %NULL.  When
 * kmalloc_slab() hands back a ZERO_OR_NULL_PTR value, that value is returned
 * unchanged.
 *
 * Return: pointer to the allocated memory or %NULL in case of error
 */
static __always_inline void *__do_kmalloc(size_t size, gfp_t flags,
					  unsigned long caller)
{
	struct kmem_cache *cachep;
	void *ret;

	if (unlikely(size > KMALLOC_MAX_CACHE_SIZE))
		return NULL;
	cachep = kmalloc_slab(size, flags);
	if (unlikely(ZERO_OR_NULL_PTR(cachep)))
		return cachep;
	ret = slab_alloc(cachep, flags, size, caller);

	ret = kasan_kmalloc(cachep, ret, size, flags);
	/* Report the requested size and the cache's size. */
	trace_kmalloc(caller, ret,
		      size, cachep->size, flags);

	return ret;
}
3701  
/*
 * Exported kmalloc entry point; passes its own return address (_RET_IP_)
 * as the caller argument of __do_kmalloc().
 */
void *__kmalloc(size_t size, gfp_t flags)
{
	return __do_kmalloc(size, flags, _RET_IP_);
}
EXPORT_SYMBOL(__kmalloc);
3707  
/*
 * Same as __kmalloc(), except that the caller address is supplied
 * explicitly through @caller instead of being taken from _RET_IP_.
 */
void *__kmalloc_track_caller(size_t size, gfp_t flags, unsigned long caller)
{
	return __do_kmalloc(size, flags, caller);
}
EXPORT_SYMBOL(__kmalloc_track_caller);
3713  
/**
 * kmem_cache_free - Deallocate an object
 * @cachep: The cache the allocation was from.
 * @objp: The previously allocated object.
 *
 * Free an object which was previously allocated from this
 * cache.  Nothing is freed if cache_from_obj() returns %NULL for the
 * (@cachep, @objp) pair.
 */
void kmem_cache_free(struct kmem_cache *cachep, void *objp)
{
	unsigned long flags;
	cachep = cache_from_obj(cachep, objp);
	if (!cachep)
		return;

	/* __cache_free() must be called with interrupts disabled. */
	local_irq_save(flags);
	debug_check_no_locks_freed(objp, cachep->object_size);
	if (!(cachep->flags & SLAB_DEBUG_OBJECTS))
		debug_check_no_obj_freed(objp, cachep->object_size);
	__cache_free(cachep, objp, _RET_IP_);
	local_irq_restore(flags);

	trace_kmem_cache_free(_RET_IP_, objp, cachep->name);
}
EXPORT_SYMBOL(kmem_cache_free);
3739  
3740  void kmem_cache_free_bulk(struct kmem_cache *orig_s, size_t size, void **p)
3741  {
3742  	struct kmem_cache *s;
3743  	size_t i;
3744  
3745  	local_irq_disable();
3746  	for (i = 0; i < size; i++) {
3747  		void *objp = p[i];
3748  
3749  		if (!orig_s) /* called via kfree_bulk */
3750  			s = virt_to_cache(objp);
3751  		else
3752  			s = cache_from_obj(orig_s, objp);
3753  		if (!s)
3754  			continue;
3755  
3756  		debug_check_no_locks_freed(objp, s->object_size);
3757  		if (!(s->flags & SLAB_DEBUG_OBJECTS))
3758  			debug_check_no_obj_freed(objp, s->object_size);
3759  
3760  		__cache_free(s, objp, _RET_IP_);
3761  	}
3762  	local_irq_enable();
3763  
3764  	/* FIXME: add tracing */
3765  }
3766  EXPORT_SYMBOL(kmem_cache_free_bulk);
3767  
3768  /**
3769   * kfree - free previously allocated memory
3770   * @objp: pointer returned by kmalloc.
3771   *
3772   * If @objp is NULL, no operation is performed.
3773   *
3774   * Don't free memory not originally allocated by kmalloc()
3775   * or you will run into trouble.
3776   */
3777  void kfree(const void *objp)
3778  {
3779  	struct kmem_cache *c;
3780  	unsigned long flags;
3781  
3782  	trace_kfree(_RET_IP_, objp);
3783  
3784  	if (unlikely(ZERO_OR_NULL_PTR(objp)))
3785  		return;
3786  	local_irq_save(flags);
3787  	kfree_debugcheck(objp);
3788  	c = virt_to_cache(objp);
3789  	if (!c) {
3790  		local_irq_restore(flags);
3791  		return;
3792  	}
3793  	debug_check_no_locks_freed(objp, c->object_size);
3794  
3795  	debug_check_no_obj_freed(objp, c->object_size);
3796  	__cache_free(c, (void *)objp, _RET_IP_);
3797  	local_irq_restore(flags);
3798  }
3799  EXPORT_SYMBOL(kfree);
3800  
3801  /*
3802   * This initializes kmem_cache_node or resizes various caches for all nodes.
3803   */
3804  static int setup_kmem_cache_nodes(struct kmem_cache *cachep, gfp_t gfp)
3805  {
3806  	int ret;
3807  	int node;
3808  	struct kmem_cache_node *n;
3809  
3810  	for_each_online_node(node) {
3811  		ret = setup_kmem_cache_node(cachep, node, gfp, true);
3812  		if (ret)
3813  			goto fail;
3814  
3815  	}
3816  
3817  	return 0;
3818  
3819  fail:
3820  	if (!cachep->list.next) {
3821  		/* Cache is not active yet. Roll back what we did */
3822  		node--;
3823  		while (node >= 0) {
3824  			n = get_node(cachep, node);
3825  			if (n) {
3826  				kfree(n->shared);
3827  				free_alien_cache(n->alien);
3828  				kfree(n);
3829  				cachep->node[node] = NULL;
3830  			}
3831  			node--;
3832  		}
3833  	}
3834  	return -ENOMEM;
3835  }
3836  
/*
 * Install freshly allocated per-CPU arrays with the given @limit and
 * @batchcount, record the new tunables in @cachep, hand the objects held by
 * the previous arrays (if any) back to their nodes, and finally (re)build
 * the per-node structures.
 *
 * Always called with the slab_mutex held.
 *
 * Returns -ENOMEM when the new per-CPU arrays cannot be allocated,
 * otherwise the result of setup_kmem_cache_nodes().
 */
static int do_tune_cpucache(struct kmem_cache *cachep, int limit,
			    int batchcount, int shared, gfp_t gfp)
{
	struct array_cache __percpu *cpu_cache, *prev;
	int cpu;

	cpu_cache = alloc_kmem_cache_cpus(cachep, limit, batchcount);
	if (!cpu_cache)
		return -ENOMEM;

	prev = cachep->cpu_cache;
	cachep->cpu_cache = cpu_cache;
	/*
	 * Without a previous cpu_cache there's no need to synchronize remote
	 * cpus, so skip the IPIs.
	 */
	if (prev)
		kick_all_cpus_sync();

	check_irq_on();
	cachep->batchcount = batchcount;
	cachep->limit = limit;
	cachep->shared = shared;

	if (!prev)
		goto setup_node;

	/* Give every object still parked in the old arrays back to its node. */
	for_each_online_cpu(cpu) {
		LIST_HEAD(list);
		int node;
		struct kmem_cache_node *n;
		struct array_cache *ac = per_cpu_ptr(prev, cpu);

		node = cpu_to_mem(cpu);
		n = get_node(cachep, node);
		spin_lock_irq(&n->list_lock);
		free_block(cachep, ac->entry, ac->avail, node, &list);
		spin_unlock_irq(&n->list_lock);
		/* Slabs detached by free_block() are destroyed outside the lock. */
		slabs_destroy(cachep, &list);
	}
	free_percpu(prev);

setup_node:
	return setup_kmem_cache_nodes(cachep, gfp);
}
3883  
3884  /* Called with slab_mutex held always */
3885  static int enable_cpucache(struct kmem_cache *cachep, gfp_t gfp)
3886  {
3887  	int err;
3888  	int limit = 0;
3889  	int shared = 0;
3890  	int batchcount = 0;
3891  
3892  	err = cache_random_seq_create(cachep, cachep->num, gfp);
3893  	if (err)
3894  		goto end;
3895  
3896  	if (limit && shared && batchcount)
3897  		goto skip_setup;
3898  	/*
3899  	 * The head array serves three purposes:
3900  	 * - create a LIFO ordering, i.e. return objects that are cache-warm
3901  	 * - reduce the number of spinlock operations.
3902  	 * - reduce the number of linked list operations on the slab and
3903  	 *   bufctl chains: array operations are cheaper.
3904  	 * The numbers are guessed, we should auto-tune as described by
3905  	 * Bonwick.
3906  	 */
3907  	if (cachep->size > 131072)
3908  		limit = 1;
3909  	else if (cachep->size > PAGE_SIZE)
3910  		limit = 8;
3911  	else if (cachep->size > 1024)
3912  		limit = 24;
3913  	else if (cachep->size > 256)
3914  		limit = 54;
3915  	else
3916  		limit = 120;
3917  
3918  	/*
3919  	 * CPU bound tasks (e.g. network routing) can exhibit cpu bound
3920  	 * allocation behaviour: Most allocs on one cpu, most free operations
3921  	 * on another cpu. For these cases, an efficient object passing between
3922  	 * cpus is necessary. This is provided by a shared array. The array
3923  	 * replaces Bonwick's magazine layer.
3924  	 * On uniprocessor, it's functionally equivalent (but less efficient)
3925  	 * to a larger limit. Thus disabled by default.
3926  	 */
3927  	shared = 0;
3928  	if (cachep->size <= PAGE_SIZE && num_possible_cpus() > 1)
3929  		shared = 8;
3930  
3931  #if DEBUG
3932  	/*
3933  	 * With debugging enabled, large batchcount lead to excessively long
3934  	 * periods with disabled local interrupts. Limit the batchcount
3935  	 */
3936  	if (limit > 32)
3937  		limit = 32;
3938  #endif
3939  	batchcount = (limit + 1) / 2;
3940  skip_setup:
3941  	err = do_tune_cpucache(cachep, limit, batchcount, shared, gfp);
3942  end:
3943  	if (err)
3944  		pr_err("enable_cpucache failed for %s, error %d\n",
3945  		       cachep->name, -err);
3946  	return err;
3947  }
3948  
3949  /*
3950   * Drain an array if it contains any elements taking the node lock only if
3951   * necessary. Note that the node listlock also protects the array_cache
3952   * if drain_array() is used on the shared array.
3953   */
3954  static void drain_array(struct kmem_cache *cachep, struct kmem_cache_node *n,
3955  			 struct array_cache *ac, int node)
3956  {
3957  	LIST_HEAD(list);
3958  
3959  	/* ac from n->shared can be freed if we don't hold the slab_mutex. */
3960  	check_mutex_acquired();
3961  
3962  	if (!ac || !ac->avail)
3963  		return;
3964  
3965  	if (ac->touched) {
3966  		ac->touched = 0;
3967  		return;
3968  	}
3969  
3970  	spin_lock_irq(&n->list_lock);
3971  	drain_array_locked(cachep, ac, node, false, &list);
3972  	spin_unlock_irq(&n->list_lock);
3973  
3974  	slabs_destroy(cachep, &list);
3975  }
3976  
/**
 * cache_reap - Reclaim memory from caches.
 * @w: work descriptor
 *
 * Called from workqueue/eventd every few seconds.
 * Purpose:
 * - clear the per-cpu caches for this CPU.
 * - return freeable pages to the main free memory pool.
 *
 * If we cannot acquire the cache chain mutex then just give up - we'll try
 * again on the next iteration.
 *
 * The work item always re-arms itself on the current CPU before returning.
 */
static void cache_reap(struct work_struct *w)
{
	struct kmem_cache *searchp;
	struct kmem_cache_node *n;
	int node = numa_mem_id();
	struct delayed_work *work = to_delayed_work(w);

	if (!mutex_trylock(&slab_mutex))
		/* Give up. Setup the next iteration. */
		goto out;

	list_for_each_entry(searchp, &slab_caches, list) {
		check_irq_on();

		/*
		 * We only take the node lock if absolutely necessary and we
		 * have established with reasonable certainty that
		 * we can do some work if the lock was obtained.
		 */
		n = get_node(searchp, node);

		reap_alien(searchp, n);

		drain_array(searchp, n, cpu_cache_get(searchp), node);

		/*
		 * These are racy checks but it does not matter
		 * if we skip one check or scan twice.
		 */
		if (time_after(n->next_reap, jiffies))
			goto next;

		n->next_reap = jiffies + REAPTIMEOUT_NODE;

		drain_array(searchp, n, n->shared, node);

		/*
		 * A node whose free list was touched since the last pass is
		 * spared once; otherwise ask drain_freelist() to release
		 * about a fifth of free_limit, expressed in slabs (rounded up).
		 */
		if (n->free_touched)
			n->free_touched = 0;
		else {
			int freed;

			freed = drain_freelist(searchp, n, (n->free_limit +
				5 * searchp->num - 1) / (5 * searchp->num));
			STATS_ADD_REAPED(searchp, freed);
		}
next:
		cond_resched();
	}
	check_irq_on();
	mutex_unlock(&slab_mutex);
	next_reap_node();
out:
	/* Set up the next iteration */
	schedule_delayed_work_on(smp_processor_id(), work,
				round_jiffies_relative(REAPTIMEOUT_AC));
}
4045  
4046  void get_slabinfo(struct kmem_cache *cachep, struct slabinfo *sinfo)
4047  {
4048  	unsigned long active_objs, num_objs, active_slabs;
4049  	unsigned long total_slabs = 0, free_objs = 0, shared_avail = 0;
4050  	unsigned long free_slabs = 0;
4051  	int node;
4052  	struct kmem_cache_node *n;
4053  
4054  	for_each_kmem_cache_node(cachep, node, n) {
4055  		check_irq_on();
4056  		spin_lock_irq(&n->list_lock);
4057  
4058  		total_slabs += n->total_slabs;
4059  		free_slabs += n->free_slabs;
4060  		free_objs += n->free_objects;
4061  
4062  		if (n->shared)
4063  			shared_avail += n->shared->avail;
4064  
4065  		spin_unlock_irq(&n->list_lock);
4066  	}
4067  	num_objs = total_slabs * cachep->num;
4068  	active_slabs = total_slabs - free_slabs;
4069  	active_objs = num_objs - free_objs;
4070  
4071  	sinfo->active_objs = active_objs;
4072  	sinfo->num_objs = num_objs;
4073  	sinfo->active_slabs = active_slabs;
4074  	sinfo->num_slabs = total_slabs;
4075  	sinfo->shared_avail = shared_avail;
4076  	sinfo->limit = cachep->limit;
4077  	sinfo->batchcount = cachep->batchcount;
4078  	sinfo->shared = cachep->shared;
4079  	sinfo->objects_per_slab = cachep->num;
4080  	sinfo->cache_order = cachep->gfporder;
4081  }
4082  
/*
 * Append the extended statistics of @cachep to @m: a " : globalstat" group
 * followed by a " : cpustat" group.  Prints nothing unless STATS is enabled.
 */
void slabinfo_show_stats(struct seq_file *m, struct kmem_cache *cachep)
{
#if STATS
	{			/* node stats */
		unsigned long high = cachep->high_mark;
		unsigned long allocs = cachep->num_allocations;
		unsigned long grown = cachep->grown;
		unsigned long reaped = cachep->reaped;
		unsigned long errors = cachep->errors;
		unsigned long max_freeable = cachep->max_freeable;
		unsigned long node_allocs = cachep->node_allocs;
		unsigned long node_frees = cachep->node_frees;
		unsigned long overflows = cachep->node_overflow;

		seq_printf(m, " : globalstat %7lu %6lu %5lu %4lu %4lu %4lu %4lu %4lu %4lu",
			   allocs, high, grown,
			   reaped, errors, max_freeable, node_allocs,
			   node_frees, overflows);
	}
	/* cpu stats */
	{
		/* Copied into unsigned long locals to match the %lu format. */
		unsigned long allochit = atomic_read(&cachep->allochit);
		unsigned long allocmiss = atomic_read(&cachep->allocmiss);
		unsigned long freehit = atomic_read(&cachep->freehit);
		unsigned long freemiss = atomic_read(&cachep->freemiss);

		seq_printf(m, " : cpustat %6lu %6lu %6lu %6lu",
			   allochit, allocmiss, freehit, freemiss);
	}
#endif
}
4114  
4115  #define MAX_SLABINFO_WRITE 128
4116  /**
4117   * slabinfo_write - Tuning for the slab allocator
4118   * @file: unused
4119   * @buffer: user buffer
4120   * @count: data length
4121   * @ppos: unused
4122   *
4123   * Return: %0 on success, negative error code otherwise.
4124   */
4125  ssize_t slabinfo_write(struct file *file, const char __user *buffer,
4126  		       size_t count, loff_t *ppos)
4127  {
4128  	char kbuf[MAX_SLABINFO_WRITE + 1], *tmp;
4129  	int limit, batchcount, shared, res;
4130  	struct kmem_cache *cachep;
4131  
4132  	if (count > MAX_SLABINFO_WRITE)
4133  		return -EINVAL;
4134  	if (copy_from_user(&kbuf, buffer, count))
4135  		return -EFAULT;
4136  	kbuf[MAX_SLABINFO_WRITE] = '\0';
4137  
4138  	tmp = strchr(kbuf, ' ');
4139  	if (!tmp)
4140  		return -EINVAL;
4141  	*tmp = '\0';
4142  	tmp++;
4143  	if (sscanf(tmp, " %d %d %d", &limit, &batchcount, &shared) != 3)
4144  		return -EINVAL;
4145  
4146  	/* Find the cache in the chain of caches. */
4147  	mutex_lock(&slab_mutex);
4148  	res = -EINVAL;
4149  	list_for_each_entry(cachep, &slab_caches, list) {
4150  		if (!strcmp(cachep->name, kbuf)) {
4151  			if (limit < 1 || batchcount < 1 ||
4152  					batchcount > limit || shared < 0) {
4153  				res = 0;
4154  			} else {
4155  				res = do_tune_cpucache(cachep, limit,
4156  						       batchcount, shared,
4157  						       GFP_KERNEL);
4158  			}
4159  			break;
4160  		}
4161  	}
4162  	mutex_unlock(&slab_mutex);
4163  	if (res >= 0)
4164  		res = count;
4165  	return res;
4166  }
4167  
#ifdef CONFIG_HARDENED_USERCOPY
/*
 * Check a copy of @n bytes starting at @ptr, to or from userspace, against
 * the slab object that contains @ptr.
 *
 * The function simply returns when the range lies entirely inside the
 * cache's usercopy region (useroffset/usersize). A range that misses that
 * region but still fits inside the object is reported with usercopy_warn()
 * when usercopy_fallback is set. Every other range is handed to
 * usercopy_abort(). An object index outside the slab triggers BUG_ON().
 */
void __check_heap_object(const void *ptr, unsigned long n, struct page *page,
			 bool to_user)
{
	struct kmem_cache *cachep;
	unsigned long offset;
	unsigned int objnr;
	bool in_usercopy_region;

	ptr = kasan_reset_tag(ptr);

	/* Look up the owning cache and the index of the object in its slab. */
	cachep = page->slab_cache;
	objnr = obj_to_index(cachep, page, (void *)ptr);
	BUG_ON(objnr >= cachep->num);

	/* Work out how far into the object @ptr points. */
	if (is_kfence_address(ptr)) {
		offset = ptr - kfence_object_start(ptr);
	} else {
		offset = ptr - index_to_obj(cachep, page, objnr);
		offset -= obj_offset(cachep);
	}

	/* The copy is fine if it stays entirely within the usercopy region. */
	in_usercopy_region =
		offset >= cachep->useroffset &&
		offset - cachep->useroffset <= cachep->usersize &&
		n <= cachep->useroffset - offset + cachep->usersize;
	if (in_usercopy_region)
		return;

	/*
	 * Outside the region. Unless the fallback is enabled and the copy
	 * still fits inside the allocated object, reject it.
	 */
	if (!usercopy_fallback ||
	    offset > cachep->object_size ||
	    n > cachep->object_size - offset) {
		usercopy_abort("SLAB object", cachep->name, to_user, offset, n);
		return;
	}

	/*
	 * Fallback: only warn. This is intended to be a temporary method
	 * to find any missing usercopy whitelists.
	 */
	usercopy_warn("SLAB object", cachep->name, to_user, offset, n);
}
#endif /* CONFIG_HARDENED_USERCOPY */
4219  
4220  /**
4221   * __ksize -- Uninstrumented ksize.
4222   * @objp: pointer to the object
4223   *
4224   * Unlike ksize(), __ksize() is uninstrumented, and does not provide the same
4225   * safety checks as ksize() with KASAN instrumentation enabled.
4226   *
4227   * Return: size of the actual memory used by @objp in bytes
4228   */
4229  size_t __ksize(const void *objp)
4230  {
4231  	struct kmem_cache *c;
4232  	size_t size;
4233  
4234  	BUG_ON(!objp);
4235  	if (unlikely(objp == ZERO_SIZE_PTR))
4236  		return 0;
4237  
4238  	c = virt_to_cache(objp);
4239  	size = c ? c->object_size : 0;
4240  
4241  	return size;
4242  }
4243  EXPORT_SYMBOL(__ksize);
4244