xref: /openbmc/linux/mm/slub.c (revision ec8f24b7faaf3d4799a7c3f4c1b87f6b02778ad1)
1  // SPDX-License-Identifier: GPL-2.0
2  /*
3   * SLUB: A slab allocator that limits cache line use instead of queuing
4   * objects in per cpu and per node lists.
5   *
6   * The allocator synchronizes using per slab locks or atomic operations
7   * and only uses a centralized lock to manage a pool of partial slabs.
8   *
9   * (C) 2007 SGI, Christoph Lameter
10   * (C) 2011 Linux Foundation, Christoph Lameter
11   */
12  
13  #include <linux/mm.h>
14  #include <linux/swap.h> /* struct reclaim_state */
15  #include <linux/module.h>
16  #include <linux/bit_spinlock.h>
17  #include <linux/interrupt.h>
18  #include <linux/bitops.h>
19  #include <linux/slab.h>
20  #include "slab.h"
21  #include <linux/proc_fs.h>
22  #include <linux/seq_file.h>
23  #include <linux/kasan.h>
24  #include <linux/cpu.h>
25  #include <linux/cpuset.h>
26  #include <linux/mempolicy.h>
27  #include <linux/ctype.h>
28  #include <linux/debugobjects.h>
29  #include <linux/kallsyms.h>
30  #include <linux/memory.h>
31  #include <linux/math64.h>
32  #include <linux/fault-inject.h>
33  #include <linux/stacktrace.h>
34  #include <linux/prefetch.h>
35  #include <linux/memcontrol.h>
36  #include <linux/random.h>
37  
38  #include <trace/events/kmem.h>
39  
40  #include "internal.h"
41  
42  /*
43   * Lock order:
44   *   1. slab_mutex (Global Mutex)
45   *   2. node->list_lock
46   *   3. slab_lock(page) (Only on some arches and for debugging)
47   *
48   *   slab_mutex
49   *
50   *   The role of the slab_mutex is to protect the list of all the slabs
51   *   and to synchronize major metadata changes to slab cache structures.
52   *
53   *   The slab_lock is only used for debugging and on arches that do not
54   *   have the ability to do a cmpxchg_double. It only protects:
55   *	A. page->freelist	-> List of free objects in a page
56   *	B. page->inuse		-> Number of objects in use
57   *	C. page->objects	-> Number of objects in page
58   *	D. page->frozen		-> frozen state
59   *
60   *   If a slab is frozen then it is exempt from list management. It is not
61   *   on any list except per cpu partial list. The processor that froze the
62   *   slab is the one who can perform list operations on the page. Other
63   *   processors may put objects onto the freelist but the processor that
64   *   froze the slab is the only one that can retrieve the objects from the
65   *   page's freelist.
66   *
67   *   The list_lock protects the partial and full list on each node and
68   *   the partial slab counter. If taken then no new slabs may be added to or
69   *   removed from the lists, nor may the number of partial slabs be modified.
70   *   (Note that the total number of slabs is an atomic value that may be
71   *   modified without taking the list lock).
72   *
73   *   The list_lock is a centralized lock and thus we avoid taking it as
74   *   much as possible. As long as SLUB does not have to handle partial
75   *   slabs, operations can continue without any centralized lock. F.e.
76   *   allocating a long series of objects that fill up slabs does not require
77   *   the list lock.
78   *   Interrupts are disabled during allocation and deallocation in order to
79   *   make the slab allocator safe to use in the context of an irq. In addition
80   *   interrupts are disabled to ensure that the processor does not change
81   *   while handling per_cpu slabs, due to kernel preemption.
82   *
83   * SLUB assigns one slab for allocation to each processor.
84   * Allocations only occur from these slabs called cpu slabs.
85   *
86   * Slabs with free elements are kept on a partial list and during regular
87   * operations no list for full slabs is used. If an object in a full slab is
88   * freed then the slab will show up again on the partial lists.
89   * We track full slabs for debugging purposes though because otherwise we
90   * cannot scan all objects.
91   *
92   * Slabs are freed when they become empty. Teardown and setup is
93   * minimal so we rely on the page allocators per cpu caches for
94   * fast frees and allocs.
95   *
96   * Overloading of page flags that are otherwise used for LRU management.
97   *
98   * PageActive 		The slab is frozen and exempt from list processing.
99   * 			This means that the slab is dedicated to a purpose
100   * 			such as satisfying allocations for a specific
101   * 			processor. Objects may be freed in the slab while
102   * 			it is frozen but slab_free will then skip the usual
103   * 			list operations. It is up to the processor holding
104   * 			the slab to integrate the slab into the slab lists
105   * 			when the slab is no longer needed.
106   *
107   * 			One use of this flag is to mark slabs that are
108   * 			used for allocations. Then such a slab becomes a cpu
109   * 			slab. The cpu slab may be equipped with an additional
110   * 			freelist that allows lockless access to
111   * 			free objects in addition to the regular freelist
112   * 			that requires the slab lock.
113   *
114   * PageError		Slab requires special handling due to debug
115   * 			options set. This moves	slab handling out of
116   * 			the fast path and disables lockless freelists.
117   */
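
/*
 * Editor's illustration (standalone userspace sketch, not part of slub.c):
 * a minimal model of the "frozen" rule described above.  All names below
 * (toy_slab, toy_free) are hypothetical.  A free against a frozen slab only
 * pushes the object onto the page's freelist; list management is left to the
 * processor that froze the slab, exactly as the comment explains.
 */
#include <stdbool.h>
#include <stdio.h>

struct toy_slab {
	void *freelist;		/* singly linked list of free objects   */
	unsigned int inuse;	/* objects currently handed out         */
	bool frozen;		/* exempt from partial-list management? */
};

static void toy_free(struct toy_slab *slab, void **object)
{
	*object = slab->freelist;	/* link the object back in */
	slab->freelist = object;
	slab->inuse--;

	if (slab->frozen)
		printf("frozen: owning cpu handles list operations later\n");
	else
		printf("unfrozen: would be moved between partial/full lists now\n");
}

int main(void)
{
	void *obj = NULL;
	struct toy_slab slab = { .freelist = NULL, .inuse = 1, .frozen = true };

	toy_free(&slab, &obj);
	return 0;
}
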
118  
119  static inline int kmem_cache_debug(struct kmem_cache *s)
120  {
121  #ifdef CONFIG_SLUB_DEBUG
122  	return unlikely(s->flags & SLAB_DEBUG_FLAGS);
123  #else
124  	return 0;
125  #endif
126  }
127  
128  void *fixup_red_left(struct kmem_cache *s, void *p)
129  {
130  	if (kmem_cache_debug(s) && s->flags & SLAB_RED_ZONE)
131  		p += s->red_left_pad;
132  
133  	return p;
134  }
135  
136  static inline bool kmem_cache_has_cpu_partial(struct kmem_cache *s)
137  {
138  #ifdef CONFIG_SLUB_CPU_PARTIAL
139  	return !kmem_cache_debug(s);
140  #else
141  	return false;
142  #endif
143  }
144  
145  /*
146   * Issues still to be resolved:
147   *
148   * - Support PAGE_ALLOC_DEBUG. Should be easy to do.
149   *
150   * - Variable sizing of the per node arrays
151   */
152  
153  /* Enable to test recovery from slab corruption on boot */
154  #undef SLUB_RESILIENCY_TEST
155  
156  /* Enable to log cmpxchg failures */
157  #undef SLUB_DEBUG_CMPXCHG
158  
159  /*
160   * Minimum number of partial slabs. These will be left on the partial
161   * lists even if they are empty. kmem_cache_shrink may reclaim them.
162   */
163  #define MIN_PARTIAL 5
164  
165  /*
166   * Maximum number of desirable partial slabs.
167   * The existence of more partial slabs makes kmem_cache_shrink
168   * sort the partial list by the number of objects in use.
169   */
170  #define MAX_PARTIAL 10
171  
172  #define DEBUG_DEFAULT_FLAGS (SLAB_CONSISTENCY_CHECKS | SLAB_RED_ZONE | \
173  				SLAB_POISON | SLAB_STORE_USER)
174  
175  /*
176   * These debug flags cannot use CMPXCHG because there might be consistency
177   * issues when checking or reading debug information
178   */
179  #define SLAB_NO_CMPXCHG (SLAB_CONSISTENCY_CHECKS | SLAB_STORE_USER | \
180  				SLAB_TRACE)
181  
182  
183  /*
184   * Debugging flags that require metadata to be stored in the slab.  These get
185   * disabled when slub_debug=O is used and a cache's min order increases with
186   * metadata.
187   */
188  #define DEBUG_METADATA_FLAGS (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER)
189  
190  #define OO_SHIFT	16
191  #define OO_MASK		((1 << OO_SHIFT) - 1)
192  #define MAX_OBJS_PER_PAGE	32767 /* since page.objects is u15 */
193  
194  /* Internal SLUB flags */
195  /* Poison object */
196  #define __OBJECT_POISON		((slab_flags_t __force)0x80000000U)
197  /* Use cmpxchg_double */
198  #define __CMPXCHG_DOUBLE	((slab_flags_t __force)0x40000000U)
199  
200  /*
201   * Tracking user of a slab.
202   */
203  #define TRACK_ADDRS_COUNT 16
204  struct track {
205  	unsigned long addr;	/* Called from address */
206  #ifdef CONFIG_STACKTRACE
207  	unsigned long addrs[TRACK_ADDRS_COUNT];	/* Stack trace of caller addresses */
208  #endif
209  	int cpu;		/* Was running on cpu */
210  	int pid;		/* Pid context */
211  	unsigned long when;	/* When did the operation occur */
212  };
213  
214  enum track_item { TRACK_ALLOC, TRACK_FREE };
215  
216  #ifdef CONFIG_SYSFS
217  static int sysfs_slab_add(struct kmem_cache *);
218  static int sysfs_slab_alias(struct kmem_cache *, const char *);
219  static void memcg_propagate_slab_attrs(struct kmem_cache *s);
220  static void sysfs_slab_remove(struct kmem_cache *s);
221  #else
222  static inline int sysfs_slab_add(struct kmem_cache *s) { return 0; }
223  static inline int sysfs_slab_alias(struct kmem_cache *s, const char *p)
224  							{ return 0; }
225  static inline void memcg_propagate_slab_attrs(struct kmem_cache *s) { }
226  static inline void sysfs_slab_remove(struct kmem_cache *s) { }
227  #endif
228  
229  static inline void stat(const struct kmem_cache *s, enum stat_item si)
230  {
231  #ifdef CONFIG_SLUB_STATS
232  	/*
233  	 * The rmw is racy on a preemptible kernel but this is acceptable, so
234  	 * avoid this_cpu_add()'s irq-disable overhead.
235  	 */
236  	raw_cpu_inc(s->cpu_slab->stat[si]);
237  #endif
238  }
239  
240  /********************************************************************
241   * 			Core slab cache functions
242   *******************************************************************/
243  
244  /*
245   * Returns freelist pointer (ptr). With hardening, this is obfuscated
246   * with an XOR of the address where the pointer is held and a per-cache
247   * random number.
248   */
249  static inline void *freelist_ptr(const struct kmem_cache *s, void *ptr,
250  				 unsigned long ptr_addr)
251  {
252  #ifdef CONFIG_SLAB_FREELIST_HARDENED
253  	/*
254  	 * When CONFIG_KASAN_SW_TAGS is enabled, ptr_addr might be tagged.
255  	 * Normally, this doesn't cause any issues, as both set_freepointer()
256  	 * and get_freepointer() are called with a pointer with the same tag.
257  	 * However, there are some issues with CONFIG_SLUB_DEBUG code. For
258  	 * example, when __free_slab() iterates over objects in a cache, it
259  	 * passes untagged pointers to check_object(). check_object() in turn
260  	 * calls get_freepointer() with an untagged pointer, which causes the
261  	 * freepointer to be restored incorrectly.
262  	 */
263  	return (void *)((unsigned long)ptr ^ s->random ^
264  			(unsigned long)kasan_reset_tag((void *)ptr_addr));
265  #else
266  	return ptr;
267  #endif
268  }
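
/*
 * Editor's illustration (standalone sketch, hypothetical names): the
 * CONFIG_SLAB_FREELIST_HARDENED obfuscation above in isolation.  The stored
 * free pointer is XORed with a per-cache secret and the address it is kept
 * at, so a blind heap overwrite no longer yields a usable pointer; applying
 * the same XOR twice recovers the original value.
 */
#include <stdint.h>
#include <stdio.h>

/* Same transform in both directions: ptr ^ secret ^ storage address. */
static void *obfuscate(void *ptr, uintptr_t cache_random, uintptr_t ptr_addr)
{
	return (void *)((uintptr_t)ptr ^ cache_random ^ ptr_addr);
}

int main(void)
{
	uintptr_t cache_random = 0x5a5a5a5aUL;	/* per-cache secret      */
	void *next_free = (void *)0x1000;	/* pretend object addr   */
	void *slot;				/* where the fp is kept  */

	/* set_freepointer(): store the obfuscated value ...              */
	slot = obfuscate(next_free, cache_random, (uintptr_t)&slot);
	/* get_freepointer(): ... applying the same XOR recovers it.      */
	printf("round trip ok: %d\n",
	       obfuscate(slot, cache_random, (uintptr_t)&slot) == next_free);
	return 0;
}
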
269  
270  /* Returns the freelist pointer recorded at location ptr_addr. */
271  static inline void *freelist_dereference(const struct kmem_cache *s,
272  					 void *ptr_addr)
273  {
274  	return freelist_ptr(s, (void *)*(unsigned long *)(ptr_addr),
275  			    (unsigned long)ptr_addr);
276  }
277  
278  static inline void *get_freepointer(struct kmem_cache *s, void *object)
279  {
280  	return freelist_dereference(s, object + s->offset);
281  }
282  
283  static void prefetch_freepointer(const struct kmem_cache *s, void *object)
284  {
285  	prefetch(object + s->offset);
286  }
287  
288  static inline void *get_freepointer_safe(struct kmem_cache *s, void *object)
289  {
290  	unsigned long freepointer_addr;
291  	void *p;
292  
293  	if (!debug_pagealloc_enabled())
294  		return get_freepointer(s, object);
295  
296  	freepointer_addr = (unsigned long)object + s->offset;
297  	probe_kernel_read(&p, (void **)freepointer_addr, sizeof(p));
298  	return freelist_ptr(s, p, freepointer_addr);
299  }
300  
301  static inline void set_freepointer(struct kmem_cache *s, void *object, void *fp)
302  {
303  	unsigned long freeptr_addr = (unsigned long)object + s->offset;
304  
305  #ifdef CONFIG_SLAB_FREELIST_HARDENED
306  	BUG_ON(object == fp); /* naive detection of double free or corruption */
307  #endif
308  
309  	*(void **)freeptr_addr = freelist_ptr(s, fp, freeptr_addr);
310  }
311  
312  /* Loop over all objects in a slab */
313  #define for_each_object(__p, __s, __addr, __objects) \
314  	for (__p = fixup_red_left(__s, __addr); \
315  		__p < (__addr) + (__objects) * (__s)->size; \
316  		__p += (__s)->size)
317  
318  /* Determine object index from a given position */
319  static inline unsigned int slab_index(void *p, struct kmem_cache *s, void *addr)
320  {
321  	return (kasan_reset_tag(p) - addr) / s->size;
322  }
323  
324  static inline unsigned int order_objects(unsigned int order, unsigned int size)
325  {
326  	return ((unsigned int)PAGE_SIZE << order) / size;
327  }
328  
329  static inline struct kmem_cache_order_objects oo_make(unsigned int order,
330  		unsigned int size)
331  {
332  	struct kmem_cache_order_objects x = {
333  		(order << OO_SHIFT) + order_objects(order, size)
334  	};
335  
336  	return x;
337  }
338  
339  static inline unsigned int oo_order(struct kmem_cache_order_objects x)
340  {
341  	return x.x >> OO_SHIFT;
342  }
343  
344  static inline unsigned int oo_objects(struct kmem_cache_order_objects x)
345  {
346  	return x.x & OO_MASK;
347  }
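
/*
 * Editor's illustration of the order/objects packing used by oo_make(),
 * oo_order() and oo_objects() above.  Standalone sketch assuming a 4 KiB
 * page and a made-up 256-byte object size; the TOY_* names are hypothetical.
 */
#include <stdio.h>

#define TOY_PAGE_SIZE	4096u
#define TOY_OO_SHIFT	16
#define TOY_OO_MASK	((1u << TOY_OO_SHIFT) - 1)

static unsigned int toy_order_objects(unsigned int order, unsigned int size)
{
	return (TOY_PAGE_SIZE << order) / size;
}

int main(void)
{
	unsigned int order = 1, size = 256;	/* 2 pages, 256-byte objects */
	unsigned int oo = (order << TOY_OO_SHIFT) + toy_order_objects(order, size);

	/* Unpack: order from the high bits, object count from the low bits. */
	printf("order=%u objects=%u\n", oo >> TOY_OO_SHIFT, oo & TOY_OO_MASK);
	/* Prints "order=1 objects=32": 8192 bytes / 256 bytes per object.   */
	return 0;
}
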
348  
349  /*
350   * Per slab locking using the pagelock
351   */
352  static __always_inline void slab_lock(struct page *page)
353  {
354  	VM_BUG_ON_PAGE(PageTail(page), page);
355  	bit_spin_lock(PG_locked, &page->flags);
356  }
357  
358  static __always_inline void slab_unlock(struct page *page)
359  {
360  	VM_BUG_ON_PAGE(PageTail(page), page);
361  	__bit_spin_unlock(PG_locked, &page->flags);
362  }
363  
364  /* Interrupts must be disabled (for the fallback code to work right) */
365  static inline bool __cmpxchg_double_slab(struct kmem_cache *s, struct page *page,
366  		void *freelist_old, unsigned long counters_old,
367  		void *freelist_new, unsigned long counters_new,
368  		const char *n)
369  {
370  	VM_BUG_ON(!irqs_disabled());
371  #if defined(CONFIG_HAVE_CMPXCHG_DOUBLE) && \
372      defined(CONFIG_HAVE_ALIGNED_STRUCT_PAGE)
373  	if (s->flags & __CMPXCHG_DOUBLE) {
374  		if (cmpxchg_double(&page->freelist, &page->counters,
375  				   freelist_old, counters_old,
376  				   freelist_new, counters_new))
377  			return true;
378  	} else
379  #endif
380  	{
381  		slab_lock(page);
382  		if (page->freelist == freelist_old &&
383  					page->counters == counters_old) {
384  			page->freelist = freelist_new;
385  			page->counters = counters_new;
386  			slab_unlock(page);
387  			return true;
388  		}
389  		slab_unlock(page);
390  	}
391  
392  	cpu_relax();
393  	stat(s, CMPXCHG_DOUBLE_FAIL);
394  
395  #ifdef SLUB_DEBUG_CMPXCHG
396  	pr_info("%s %s: cmpxchg double redo ", n, s->name);
397  #endif
398  
399  	return false;
400  }
401  
402  static inline bool cmpxchg_double_slab(struct kmem_cache *s, struct page *page,
403  		void *freelist_old, unsigned long counters_old,
404  		void *freelist_new, unsigned long counters_new,
405  		const char *n)
406  {
407  #if defined(CONFIG_HAVE_CMPXCHG_DOUBLE) && \
408      defined(CONFIG_HAVE_ALIGNED_STRUCT_PAGE)
409  	if (s->flags & __CMPXCHG_DOUBLE) {
410  		if (cmpxchg_double(&page->freelist, &page->counters,
411  				   freelist_old, counters_old,
412  				   freelist_new, counters_new))
413  			return true;
414  	} else
415  #endif
416  	{
417  		unsigned long flags;
418  
419  		local_irq_save(flags);
420  		slab_lock(page);
421  		if (page->freelist == freelist_old &&
422  					page->counters == counters_old) {
423  			page->freelist = freelist_new;
424  			page->counters = counters_new;
425  			slab_unlock(page);
426  			local_irq_restore(flags);
427  			return true;
428  		}
429  		slab_unlock(page);
430  		local_irq_restore(flags);
431  	}
432  
433  	cpu_relax();
434  	stat(s, CMPXCHG_DOUBLE_FAIL);
435  
436  #ifdef SLUB_DEBUG_CMPXCHG
437  	pr_info("%s %s: cmpxchg double redo ", n, s->name);
438  #endif
439  
440  	return false;
441  }
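
/*
 * Editor's illustration (standalone sketch, hypothetical names): the
 * compare-both-then-swap pattern used by the two helpers above, with a plain
 * pthread mutex standing in for slab_lock()/cmpxchg_double().  The update
 * only succeeds if both the freelist and the counters word still hold the
 * values the caller saw; otherwise the caller must re-read and retry.
 */
#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

struct toy_page {
	void *freelist;
	unsigned long counters;
	pthread_mutex_t lock;		/* stands in for slab_lock() */
};

static bool toy_cmpxchg_double(struct toy_page *page,
			       void *freelist_old, unsigned long counters_old,
			       void *freelist_new, unsigned long counters_new)
{
	bool ok = false;

	pthread_mutex_lock(&page->lock);
	if (page->freelist == freelist_old && page->counters == counters_old) {
		page->freelist = freelist_new;
		page->counters = counters_new;
		ok = true;
	}
	pthread_mutex_unlock(&page->lock);
	return ok;			/* false => caller re-reads and retries */
}

int main(void)
{
	static struct toy_page page = { NULL, 3, PTHREAD_MUTEX_INITIALIZER };

	printf("first try:  %d\n",
	       toy_cmpxchg_double(&page, NULL, 3, (void *)0x1000, 2));	/* 1 */
	printf("stale view: %d\n",
	       toy_cmpxchg_double(&page, NULL, 3, (void *)0x2000, 1));	/* 0 */
	return 0;
}
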
442  
443  #ifdef CONFIG_SLUB_DEBUG
444  /*
445   * Determine a map of objects in use on a page.
446   *
447   * The node's list_lock must be held to guarantee that the page does
448   * not vanish from under us.
449   */
450  static void get_map(struct kmem_cache *s, struct page *page, unsigned long *map)
451  {
452  	void *p;
453  	void *addr = page_address(page);
454  
455  	for (p = page->freelist; p; p = get_freepointer(s, p))
456  		set_bit(slab_index(p, s, addr), map);
457  }
458  
459  static inline unsigned int size_from_object(struct kmem_cache *s)
460  {
461  	if (s->flags & SLAB_RED_ZONE)
462  		return s->size - s->red_left_pad;
463  
464  	return s->size;
465  }
466  
467  static inline void *restore_red_left(struct kmem_cache *s, void *p)
468  {
469  	if (s->flags & SLAB_RED_ZONE)
470  		p -= s->red_left_pad;
471  
472  	return p;
473  }
474  
475  /*
476   * Debug settings:
477   */
478  #if defined(CONFIG_SLUB_DEBUG_ON)
479  static slab_flags_t slub_debug = DEBUG_DEFAULT_FLAGS;
480  #else
481  static slab_flags_t slub_debug;
482  #endif
483  
484  static char *slub_debug_slabs;
485  static int disable_higher_order_debug;
486  
487  /*
488   * slub is about to manipulate internal object metadata.  This memory lies
489   * outside the range of the allocated object, so accessing it would normally
490   * be reported by kasan as a bounds error.  metadata_access_enable() is used
491   * to tell kasan that these accesses are OK.
492   */
493  static inline void metadata_access_enable(void)
494  {
495  	kasan_disable_current();
496  }
497  
498  static inline void metadata_access_disable(void)
499  {
500  	kasan_enable_current();
501  }
502  
503  /*
504   * Object debugging
505   */
506  
507  /* Verify that a pointer has an address that is valid within a slab page */
508  static inline int check_valid_pointer(struct kmem_cache *s,
509  				struct page *page, void *object)
510  {
511  	void *base;
512  
513  	if (!object)
514  		return 1;
515  
516  	base = page_address(page);
517  	object = kasan_reset_tag(object);
518  	object = restore_red_left(s, object);
519  	if (object < base || object >= base + page->objects * s->size ||
520  		(object - base) % s->size) {
521  		return 0;
522  	}
523  
524  	return 1;
525  }
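
/*
 * Editor's illustration of the validity test above: a pointer is accepted
 * only if it lies inside the slab page and its offset from the page base is
 * an exact multiple of the object size.  Standalone sketch with made-up
 * addresses and sizes.
 */
#include <stdbool.h>
#include <stdio.h>

static bool toy_valid_pointer(unsigned long base, unsigned int objects,
			      unsigned int size, unsigned long object)
{
	if (object < base || object >= base + (unsigned long)objects * size)
		return false;			/* outside the slab page   */
	return (object - base) % size == 0;	/* on an object boundary   */
}

int main(void)
{
	/* page at 0x10000, 32 objects of 256 bytes */
	printf("%d %d %d\n",
	       toy_valid_pointer(0x10000, 32, 256, 0x10000 + 2 * 256),	 /* 1 */
	       toy_valid_pointer(0x10000, 32, 256, 0x10000 + 300),	 /* 0: misaligned */
	       toy_valid_pointer(0x10000, 32, 256, 0x10000 + 32 * 256)); /* 0: past end   */
	return 0;
}
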
526  
527  static void print_section(char *level, char *text, u8 *addr,
528  			  unsigned int length)
529  {
530  	metadata_access_enable();
531  	print_hex_dump(level, text, DUMP_PREFIX_ADDRESS, 16, 1, addr,
532  			length, 1);
533  	metadata_access_disable();
534  }
535  
536  static struct track *get_track(struct kmem_cache *s, void *object,
537  	enum track_item alloc)
538  {
539  	struct track *p;
540  
541  	if (s->offset)
542  		p = object + s->offset + sizeof(void *);
543  	else
544  		p = object + s->inuse;
545  
546  	return p + alloc;
547  }
548  
549  static void set_track(struct kmem_cache *s, void *object,
550  			enum track_item alloc, unsigned long addr)
551  {
552  	struct track *p = get_track(s, object, alloc);
553  
554  	if (addr) {
555  #ifdef CONFIG_STACKTRACE
556  		unsigned int nr_entries;
557  
558  		metadata_access_enable();
559  		nr_entries = stack_trace_save(p->addrs, TRACK_ADDRS_COUNT, 3);
560  		metadata_access_disable();
561  
562  		if (nr_entries < TRACK_ADDRS_COUNT)
563  			p->addrs[nr_entries] = 0;
564  #endif
565  		p->addr = addr;
566  		p->cpu = smp_processor_id();
567  		p->pid = current->pid;
568  		p->when = jiffies;
569  	} else {
570  		memset(p, 0, sizeof(struct track));
571  	}
572  }
573  
574  static void init_tracking(struct kmem_cache *s, void *object)
575  {
576  	if (!(s->flags & SLAB_STORE_USER))
577  		return;
578  
579  	set_track(s, object, TRACK_FREE, 0UL);
580  	set_track(s, object, TRACK_ALLOC, 0UL);
581  }
582  
583  static void print_track(const char *s, struct track *t, unsigned long pr_time)
584  {
585  	if (!t->addr)
586  		return;
587  
588  	pr_err("INFO: %s in %pS age=%lu cpu=%u pid=%d\n",
589  	       s, (void *)t->addr, pr_time - t->when, t->cpu, t->pid);
590  #ifdef CONFIG_STACKTRACE
591  	{
592  		int i;
593  		for (i = 0; i < TRACK_ADDRS_COUNT; i++)
594  			if (t->addrs[i])
595  				pr_err("\t%pS\n", (void *)t->addrs[i]);
596  			else
597  				break;
598  	}
599  #endif
600  }
601  
602  static void print_tracking(struct kmem_cache *s, void *object)
603  {
604  	unsigned long pr_time = jiffies;
605  	if (!(s->flags & SLAB_STORE_USER))
606  		return;
607  
608  	print_track("Allocated", get_track(s, object, TRACK_ALLOC), pr_time);
609  	print_track("Freed", get_track(s, object, TRACK_FREE), pr_time);
610  }
611  
612  static void print_page_info(struct page *page)
613  {
614  	pr_err("INFO: Slab 0x%p objects=%u used=%u fp=0x%p flags=0x%04lx\n",
615  	       page, page->objects, page->inuse, page->freelist, page->flags);
616  
617  }
618  
619  static void slab_bug(struct kmem_cache *s, char *fmt, ...)
620  {
621  	struct va_format vaf;
622  	va_list args;
623  
624  	va_start(args, fmt);
625  	vaf.fmt = fmt;
626  	vaf.va = &args;
627  	pr_err("=============================================================================\n");
628  	pr_err("BUG %s (%s): %pV\n", s->name, print_tainted(), &vaf);
629  	pr_err("-----------------------------------------------------------------------------\n\n");
630  
631  	add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
632  	va_end(args);
633  }
634  
635  static void slab_fix(struct kmem_cache *s, char *fmt, ...)
636  {
637  	struct va_format vaf;
638  	va_list args;
639  
640  	va_start(args, fmt);
641  	vaf.fmt = fmt;
642  	vaf.va = &args;
643  	pr_err("FIX %s: %pV\n", s->name, &vaf);
644  	va_end(args);
645  }
646  
647  static void print_trailer(struct kmem_cache *s, struct page *page, u8 *p)
648  {
649  	unsigned int off;	/* Offset of last byte */
650  	u8 *addr = page_address(page);
651  
652  	print_tracking(s, p);
653  
654  	print_page_info(page);
655  
656  	pr_err("INFO: Object 0x%p @offset=%tu fp=0x%p\n\n",
657  	       p, p - addr, get_freepointer(s, p));
658  
659  	if (s->flags & SLAB_RED_ZONE)
660  		print_section(KERN_ERR, "Redzone ", p - s->red_left_pad,
661  			      s->red_left_pad);
662  	else if (p > addr + 16)
663  		print_section(KERN_ERR, "Bytes b4 ", p - 16, 16);
664  
665  	print_section(KERN_ERR, "Object ", p,
666  		      min_t(unsigned int, s->object_size, PAGE_SIZE));
667  	if (s->flags & SLAB_RED_ZONE)
668  		print_section(KERN_ERR, "Redzone ", p + s->object_size,
669  			s->inuse - s->object_size);
670  
671  	if (s->offset)
672  		off = s->offset + sizeof(void *);
673  	else
674  		off = s->inuse;
675  
676  	if (s->flags & SLAB_STORE_USER)
677  		off += 2 * sizeof(struct track);
678  
679  	off += kasan_metadata_size(s);
680  
681  	if (off != size_from_object(s))
682  		/* Beginning of the filler is the free pointer */
683  		print_section(KERN_ERR, "Padding ", p + off,
684  			      size_from_object(s) - off);
685  
686  	dump_stack();
687  }
688  
689  void object_err(struct kmem_cache *s, struct page *page,
690  			u8 *object, char *reason)
691  {
692  	slab_bug(s, "%s", reason);
693  	print_trailer(s, page, object);
694  }
695  
696  static __printf(3, 4) void slab_err(struct kmem_cache *s, struct page *page,
697  			const char *fmt, ...)
698  {
699  	va_list args;
700  	char buf[100];
701  
702  	va_start(args, fmt);
703  	vsnprintf(buf, sizeof(buf), fmt, args);
704  	va_end(args);
705  	slab_bug(s, "%s", buf);
706  	print_page_info(page);
707  	dump_stack();
708  }
709  
710  static void init_object(struct kmem_cache *s, void *object, u8 val)
711  {
712  	u8 *p = object;
713  
714  	if (s->flags & SLAB_RED_ZONE)
715  		memset(p - s->red_left_pad, val, s->red_left_pad);
716  
717  	if (s->flags & __OBJECT_POISON) {
718  		memset(p, POISON_FREE, s->object_size - 1);
719  		p[s->object_size - 1] = POISON_END;
720  	}
721  
722  	if (s->flags & SLAB_RED_ZONE)
723  		memset(p + s->object_size, val, s->inuse - s->object_size);
724  }
725  
726  static void restore_bytes(struct kmem_cache *s, char *message, u8 data,
727  						void *from, void *to)
728  {
729  	slab_fix(s, "Restoring 0x%p-0x%p=0x%x\n", from, to - 1, data);
730  	memset(from, data, to - from);
731  }
732  
733  static int check_bytes_and_report(struct kmem_cache *s, struct page *page,
734  			u8 *object, char *what,
735  			u8 *start, unsigned int value, unsigned int bytes)
736  {
737  	u8 *fault;
738  	u8 *end;
739  
740  	metadata_access_enable();
741  	fault = memchr_inv(start, value, bytes);
742  	metadata_access_disable();
743  	if (!fault)
744  		return 1;
745  
746  	end = start + bytes;
747  	while (end > fault && end[-1] == value)
748  		end--;
749  
750  	slab_bug(s, "%s overwritten", what);
751  	pr_err("INFO: 0x%p-0x%p. First byte 0x%x instead of 0x%x\n",
752  					fault, end - 1, fault[0], value);
753  	print_trailer(s, page, object);
754  
755  	restore_bytes(s, what, value, fault, end);
756  	return 0;
757  }
758  
759  /*
760   * Object layout:
761   *
762   * object address
763   * 	Bytes of the object to be managed.
764   * 	If the freepointer may overlay the object then the free
765   * 	pointer is the first word of the object.
766   *
767   * 	Poisoning uses 0x6b (POISON_FREE) and the last byte is
768   * 	0xa5 (POISON_END)
769   *
770   * object + s->object_size
771   * 	Padding to reach word boundary. This is also used for Redzoning.
772   * 	Padding is extended by another word if Redzoning is enabled and
773   * 	object_size == inuse.
774   *
775   * 	We fill with 0xbb (RED_INACTIVE) for inactive objects and with
776   * 	0xcc (RED_ACTIVE) for objects in use.
777   *
778   * object + s->inuse
779   * 	Meta data starts here.
780   *
781   * 	A. Free pointer (if we cannot overwrite object on free)
782   * 	B. Tracking data for SLAB_STORE_USER
783   * 	C. Padding to reach required alignment boundary or at minimum
784   * 		one word if debugging is on to be able to detect writes
785   * 		before the word boundary.
786   *
787   *	Padding is done using 0x5a (POISON_INUSE)
788   *
789   * object + s->size
790   * 	Nothing is used beyond s->size.
791   *
792   * If slabcaches are merged then the object_size and inuse boundaries are mostly
793   * ignored. And therefore no slab options that rely on these boundaries
794   * may be used with merged slabcaches.
795   */
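
/*
 * Editor's worked example of the layout described above, as a standalone
 * sketch.  It assumes a hypothetical cache with red zoning and user tracking
 * enabled and the free pointer stored after the object (s->offset != 0);
 * struct toy_track merely stands in for SLUB's struct track, which also
 * records a stack trace when CONFIG_STACKTRACE is on.  The arithmetic mirrors
 * what check_pad_bytes() and print_trailer() compute.
 */
#include <stdio.h>

struct toy_track { unsigned long addr, when; int cpu, pid; };

int main(void)
{
	unsigned int object_size = 24;	/* payload the caller sees           */
	unsigned int inuse = 32;	/* 24 is already word aligned, so one
					 * extra word of right red zone       */
	unsigned int off = inuse;

	off += sizeof(void *);			/* free pointer after object */
	off += 2 * sizeof(struct toy_track);	/* TRACK_ALLOC + TRACK_FREE  */

	printf("metadata ends at offset %u; bytes up to s->size are POISON_INUSE padding\n",
	       off);
	return 0;
}
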
796  
797  static int check_pad_bytes(struct kmem_cache *s, struct page *page, u8 *p)
798  {
799  	unsigned long off = s->inuse;	/* The end of info */
800  
801  	if (s->offset)
802  		/* Freepointer is placed after the object. */
803  		off += sizeof(void *);
804  
805  	if (s->flags & SLAB_STORE_USER)
806  		/* We also have user information there */
807  		off += 2 * sizeof(struct track);
808  
809  	off += kasan_metadata_size(s);
810  
811  	if (size_from_object(s) == off)
812  		return 1;
813  
814  	return check_bytes_and_report(s, page, p, "Object padding",
815  			p + off, POISON_INUSE, size_from_object(s) - off);
816  }
817  
818  /* Check the pad bytes at the end of a slab page */
819  static int slab_pad_check(struct kmem_cache *s, struct page *page)
820  {
821  	u8 *start;
822  	u8 *fault;
823  	u8 *end;
824  	u8 *pad;
825  	int length;
826  	int remainder;
827  
828  	if (!(s->flags & SLAB_POISON))
829  		return 1;
830  
831  	start = page_address(page);
832  	length = PAGE_SIZE << compound_order(page);
833  	end = start + length;
834  	remainder = length % s->size;
835  	if (!remainder)
836  		return 1;
837  
838  	pad = end - remainder;
839  	metadata_access_enable();
840  	fault = memchr_inv(pad, POISON_INUSE, remainder);
841  	metadata_access_disable();
842  	if (!fault)
843  		return 1;
844  	while (end > fault && end[-1] == POISON_INUSE)
845  		end--;
846  
847  	slab_err(s, page, "Padding overwritten. 0x%p-0x%p", fault, end - 1);
848  	print_section(KERN_ERR, "Padding ", pad, remainder);
849  
850  	restore_bytes(s, "slab padding", POISON_INUSE, fault, end);
851  	return 0;
852  }
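
/*
 * Editor's worked number for the check above: the tail of a slab page that
 * cannot hold another whole object is filled with POISON_INUSE, so its length
 * is simply the page length modulo the object size.  Sizes below are made up.
 */
#include <stdio.h>

int main(void)
{
	unsigned int length = 4096 << 1;	/* order-1 slab page: 8192 bytes */
	unsigned int size = 376;		/* hypothetical object size      */

	/* 8192 / 376 = 21 objects, 8192 - 21 * 376 = 296 poisoned pad bytes */
	printf("objects=%u pad=%u\n", length / size, length % size);
	return 0;
}
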
853  
854  static int check_object(struct kmem_cache *s, struct page *page,
855  					void *object, u8 val)
856  {
857  	u8 *p = object;
858  	u8 *endobject = object + s->object_size;
859  
860  	if (s->flags & SLAB_RED_ZONE) {
861  		if (!check_bytes_and_report(s, page, object, "Redzone",
862  			object - s->red_left_pad, val, s->red_left_pad))
863  			return 0;
864  
865  		if (!check_bytes_and_report(s, page, object, "Redzone",
866  			endobject, val, s->inuse - s->object_size))
867  			return 0;
868  	} else {
869  		if ((s->flags & SLAB_POISON) && s->object_size < s->inuse) {
870  			check_bytes_and_report(s, page, p, "Alignment padding",
871  				endobject, POISON_INUSE,
872  				s->inuse - s->object_size);
873  		}
874  	}
875  
876  	if (s->flags & SLAB_POISON) {
877  		if (val != SLUB_RED_ACTIVE && (s->flags & __OBJECT_POISON) &&
878  			(!check_bytes_and_report(s, page, p, "Poison", p,
879  					POISON_FREE, s->object_size - 1) ||
880  			 !check_bytes_and_report(s, page, p, "Poison",
881  				p + s->object_size - 1, POISON_END, 1)))
882  			return 0;
883  		/*
884  		 * check_pad_bytes cleans up on its own.
885  		 */
886  		check_pad_bytes(s, page, p);
887  	}
888  
889  	if (!s->offset && val == SLUB_RED_ACTIVE)
890  		/*
891  		 * Object and freepointer overlap. Cannot check
892  		 * freepointer while object is allocated.
893  		 */
894  		return 1;
895  
896  	/* Check free pointer validity */
897  	if (!check_valid_pointer(s, page, get_freepointer(s, p))) {
898  		object_err(s, page, p, "Freepointer corrupt");
899  		/*
900  		 * No choice but to zap it and thus lose the remainder
901  		 * of the free objects in this slab. May cause
902  		 * another error because the object count is now wrong.
903  		 */
904  		set_freepointer(s, p, NULL);
905  		return 0;
906  	}
907  	return 1;
908  }
909  
910  static int check_slab(struct kmem_cache *s, struct page *page)
911  {
912  	int maxobj;
913  
914  	VM_BUG_ON(!irqs_disabled());
915  
916  	if (!PageSlab(page)) {
917  		slab_err(s, page, "Not a valid slab page");
918  		return 0;
919  	}
920  
921  	maxobj = order_objects(compound_order(page), s->size);
922  	if (page->objects > maxobj) {
923  		slab_err(s, page, "objects %u > max %u",
924  			page->objects, maxobj);
925  		return 0;
926  	}
927  	if (page->inuse > page->objects) {
928  		slab_err(s, page, "inuse %u > max %u",
929  			page->inuse, page->objects);
930  		return 0;
931  	}
932  	/* slab_pad_check() fixes things up after itself */
933  	slab_pad_check(s, page);
934  	return 1;
935  }
936  
937  /*
938   * Determine if a certain object on a page is on the freelist. Must hold the
939   * slab lock to guarantee that the chains are in a consistent state.
940   */
941  static int on_freelist(struct kmem_cache *s, struct page *page, void *search)
942  {
943  	int nr = 0;
944  	void *fp;
945  	void *object = NULL;
946  	int max_objects;
947  
948  	fp = page->freelist;
949  	while (fp && nr <= page->objects) {
950  		if (fp == search)
951  			return 1;
952  		if (!check_valid_pointer(s, page, fp)) {
953  			if (object) {
954  				object_err(s, page, object,
955  					"Freechain corrupt");
956  				set_freepointer(s, object, NULL);
957  			} else {
958  				slab_err(s, page, "Freepointer corrupt");
959  				page->freelist = NULL;
960  				page->inuse = page->objects;
961  				slab_fix(s, "Freelist cleared");
962  				return 0;
963  			}
964  			break;
965  		}
966  		object = fp;
967  		fp = get_freepointer(s, object);
968  		nr++;
969  	}
970  
971  	max_objects = order_objects(compound_order(page), s->size);
972  	if (max_objects > MAX_OBJS_PER_PAGE)
973  		max_objects = MAX_OBJS_PER_PAGE;
974  
975  	if (page->objects != max_objects) {
976  		slab_err(s, page, "Wrong number of objects. Found %d but should be %d",
977  			 page->objects, max_objects);
978  		page->objects = max_objects;
979  		slab_fix(s, "Number of objects adjusted.");
980  	}
981  	if (page->inuse != page->objects - nr) {
982  		slab_err(s, page, "Wrong object count. Counter is %d but counted were %d",
983  			 page->inuse, page->objects - nr);
984  		page->inuse = page->objects - nr;
985  		slab_fix(s, "Object count adjusted.");
986  	}
987  	return search == NULL;
988  }
989  
990  static void trace(struct kmem_cache *s, struct page *page, void *object,
991  								int alloc)
992  {
993  	if (s->flags & SLAB_TRACE) {
994  		pr_info("TRACE %s %s 0x%p inuse=%d fp=0x%p\n",
995  			s->name,
996  			alloc ? "alloc" : "free",
997  			object, page->inuse,
998  			page->freelist);
999  
1000  		if (!alloc)
1001  			print_section(KERN_INFO, "Object ", (void *)object,
1002  					s->object_size);
1003  
1004  		dump_stack();
1005  	}
1006  }
1007  
1008  /*
1009   * Tracking of fully allocated slabs for debugging purposes.
1010   */
1011  static void add_full(struct kmem_cache *s,
1012  	struct kmem_cache_node *n, struct page *page)
1013  {
1014  	if (!(s->flags & SLAB_STORE_USER))
1015  		return;
1016  
1017  	lockdep_assert_held(&n->list_lock);
1018  	list_add(&page->slab_list, &n->full);
1019  }
1020  
1021  static void remove_full(struct kmem_cache *s, struct kmem_cache_node *n, struct page *page)
1022  {
1023  	if (!(s->flags & SLAB_STORE_USER))
1024  		return;
1025  
1026  	lockdep_assert_held(&n->list_lock);
1027  	list_del(&page->slab_list);
1028  }
1029  
1030  /* Tracking of the number of slabs for debugging purposes */
1031  static inline unsigned long slabs_node(struct kmem_cache *s, int node)
1032  {
1033  	struct kmem_cache_node *n = get_node(s, node);
1034  
1035  	return atomic_long_read(&n->nr_slabs);
1036  }
1037  
1038  static inline unsigned long node_nr_slabs(struct kmem_cache_node *n)
1039  {
1040  	return atomic_long_read(&n->nr_slabs);
1041  }
1042  
1043  static inline void inc_slabs_node(struct kmem_cache *s, int node, int objects)
1044  {
1045  	struct kmem_cache_node *n = get_node(s, node);
1046  
1047  	/*
1048  	 * May be called early in order to allocate a slab for the
1049  	 * kmem_cache_node structure. Solve the chicken-egg
1050  	 * dilemma by deferring the increment of the count during
1051  	 * bootstrap (see early_kmem_cache_node_alloc).
1052  	 */
1053  	if (likely(n)) {
1054  		atomic_long_inc(&n->nr_slabs);
1055  		atomic_long_add(objects, &n->total_objects);
1056  	}
1057  }
1058  static inline void dec_slabs_node(struct kmem_cache *s, int node, int objects)
1059  {
1060  	struct kmem_cache_node *n = get_node(s, node);
1061  
1062  	atomic_long_dec(&n->nr_slabs);
1063  	atomic_long_sub(objects, &n->total_objects);
1064  }
1065  
1066  /* Object debug checks for alloc/free paths */
1067  static void setup_object_debug(struct kmem_cache *s, struct page *page,
1068  								void *object)
1069  {
1070  	if (!(s->flags & (SLAB_STORE_USER|SLAB_RED_ZONE|__OBJECT_POISON)))
1071  		return;
1072  
1073  	init_object(s, object, SLUB_RED_INACTIVE);
1074  	init_tracking(s, object);
1075  }
1076  
1077  static void setup_page_debug(struct kmem_cache *s, void *addr, int order)
1078  {
1079  	if (!(s->flags & SLAB_POISON))
1080  		return;
1081  
1082  	metadata_access_enable();
1083  	memset(addr, POISON_INUSE, PAGE_SIZE << order);
1084  	metadata_access_disable();
1085  }
1086  
1087  static inline int alloc_consistency_checks(struct kmem_cache *s,
1088  					struct page *page, void *object)
1089  {
1090  	if (!check_slab(s, page))
1091  		return 0;
1092  
1093  	if (!check_valid_pointer(s, page, object)) {
1094  		object_err(s, page, object, "Freelist Pointer check fails");
1095  		return 0;
1096  	}
1097  
1098  	if (!check_object(s, page, object, SLUB_RED_INACTIVE))
1099  		return 0;
1100  
1101  	return 1;
1102  }
1103  
1104  static noinline int alloc_debug_processing(struct kmem_cache *s,
1105  					struct page *page,
1106  					void *object, unsigned long addr)
1107  {
1108  	if (s->flags & SLAB_CONSISTENCY_CHECKS) {
1109  		if (!alloc_consistency_checks(s, page, object))
1110  			goto bad;
1111  	}
1112  
1113  	/* Success. Perform special debug activities for allocs. */
1114  	if (s->flags & SLAB_STORE_USER)
1115  		set_track(s, object, TRACK_ALLOC, addr);
1116  	trace(s, page, object, 1);
1117  	init_object(s, object, SLUB_RED_ACTIVE);
1118  	return 1;
1119  
1120  bad:
1121  	if (PageSlab(page)) {
1122  		/*
1123  		 * If this is a slab page then let's do the best we can
1124  		 * to avoid issues in the future. Marking all objects
1125  		 * as used avoids touching the remaining objects.
1126  		 */
1127  		slab_fix(s, "Marking all objects used");
1128  		page->inuse = page->objects;
1129  		page->freelist = NULL;
1130  	}
1131  	return 0;
1132  }
1133  
1134  static inline int free_consistency_checks(struct kmem_cache *s,
1135  		struct page *page, void *object, unsigned long addr)
1136  {
1137  	if (!check_valid_pointer(s, page, object)) {
1138  		slab_err(s, page, "Invalid object pointer 0x%p", object);
1139  		return 0;
1140  	}
1141  
1142  	if (on_freelist(s, page, object)) {
1143  		object_err(s, page, object, "Object already free");
1144  		return 0;
1145  	}
1146  
1147  	if (!check_object(s, page, object, SLUB_RED_ACTIVE))
1148  		return 0;
1149  
1150  	if (unlikely(s != page->slab_cache)) {
1151  		if (!PageSlab(page)) {
1152  			slab_err(s, page, "Attempt to free object(0x%p) outside of slab",
1153  				 object);
1154  		} else if (!page->slab_cache) {
1155  			pr_err("SLUB <none>: no slab for object 0x%p.\n",
1156  			       object);
1157  			dump_stack();
1158  		} else
1159  			object_err(s, page, object,
1160  					"page slab pointer corrupt.");
1161  		return 0;
1162  	}
1163  	return 1;
1164  }
1165  
1166  /* Supports checking bulk free of a constructed freelist */
1167  static noinline int free_debug_processing(
1168  	struct kmem_cache *s, struct page *page,
1169  	void *head, void *tail, int bulk_cnt,
1170  	unsigned long addr)
1171  {
1172  	struct kmem_cache_node *n = get_node(s, page_to_nid(page));
1173  	void *object = head;
1174  	int cnt = 0;
1175  	unsigned long uninitialized_var(flags);
1176  	int ret = 0;
1177  
1178  	spin_lock_irqsave(&n->list_lock, flags);
1179  	slab_lock(page);
1180  
1181  	if (s->flags & SLAB_CONSISTENCY_CHECKS) {
1182  		if (!check_slab(s, page))
1183  			goto out;
1184  	}
1185  
1186  next_object:
1187  	cnt++;
1188  
1189  	if (s->flags & SLAB_CONSISTENCY_CHECKS) {
1190  		if (!free_consistency_checks(s, page, object, addr))
1191  			goto out;
1192  	}
1193  
1194  	if (s->flags & SLAB_STORE_USER)
1195  		set_track(s, object, TRACK_FREE, addr);
1196  	trace(s, page, object, 0);
1197  	/* Freepointer not overwritten by init_object(), SLAB_POISON moved it */
1198  	init_object(s, object, SLUB_RED_INACTIVE);
1199  
1200  	/* Reached end of constructed freelist yet? */
1201  	if (object != tail) {
1202  		object = get_freepointer(s, object);
1203  		goto next_object;
1204  	}
1205  	ret = 1;
1206  
1207  out:
1208  	if (cnt != bulk_cnt)
1209  		slab_err(s, page, "Bulk freelist count(%d) invalid(%d)\n",
1210  			 bulk_cnt, cnt);
1211  
1212  	slab_unlock(page);
1213  	spin_unlock_irqrestore(&n->list_lock, flags);
1214  	if (!ret)
1215  		slab_fix(s, "Object at 0x%p not freed", object);
1216  	return ret;
1217  }
1218  
1219  static int __init setup_slub_debug(char *str)
1220  {
1221  	slub_debug = DEBUG_DEFAULT_FLAGS;
1222  	if (*str++ != '=' || !*str)
1223  		/*
1224  		 * No options specified. Switch on full debugging.
1225  		 */
1226  		goto out;
1227  
1228  	if (*str == ',')
1229  		/*
1230  		 * No options but restriction on slabs. This means full
1231  		 * debugging for slabs matching a pattern.
1232  		 */
1233  		goto check_slabs;
1234  
1235  	slub_debug = 0;
1236  	if (*str == '-')
1237  		/*
1238  		 * Switch off all debugging measures.
1239  		 */
1240  		goto out;
1241  
1242  	/*
1243  	 * Determine which debug features should be switched on
1244  	 */
1245  	for (; *str && *str != ','; str++) {
1246  		switch (tolower(*str)) {
1247  		case 'f':
1248  			slub_debug |= SLAB_CONSISTENCY_CHECKS;
1249  			break;
1250  		case 'z':
1251  			slub_debug |= SLAB_RED_ZONE;
1252  			break;
1253  		case 'p':
1254  			slub_debug |= SLAB_POISON;
1255  			break;
1256  		case 'u':
1257  			slub_debug |= SLAB_STORE_USER;
1258  			break;
1259  		case 't':
1260  			slub_debug |= SLAB_TRACE;
1261  			break;
1262  		case 'a':
1263  			slub_debug |= SLAB_FAILSLAB;
1264  			break;
1265  		case 'o':
1266  			/*
1267  			 * Avoid enabling debugging on a cache if its minimum
1268  			 * order would increase as a result.
1269  			 */
1270  			disable_higher_order_debug = 1;
1271  			break;
1272  		default:
1273  			pr_err("slub_debug option '%c' unknown. skipped\n",
1274  			       *str);
1275  		}
1276  	}
1277  
1278  check_slabs:
1279  	if (*str == ',')
1280  		slub_debug_slabs = str + 1;
1281  out:
1282  	return 1;
1283  }
1284  
1285  __setup("slub_debug", setup_slub_debug);
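
/*
 * Editor's illustration of the option-letter parsing in setup_slub_debug()
 * above, as a standalone sketch.  The TOY_* flag values are arbitrary bits,
 * not the real SLAB_* flags, and only a subset of the letters is handled.
 */
#include <stdio.h>

enum {				/* arbitrary illustration bits */
	TOY_CONSISTENCY	= 1 << 0,	/* 'f' */
	TOY_REDZONE	= 1 << 1,	/* 'z' */
	TOY_POISON	= 1 << 2,	/* 'p' */
	TOY_STORE_USER	= 1 << 3,	/* 'u' */
	TOY_TRACE	= 1 << 4	/* 't' */
};

static unsigned int toy_parse(const char *str)
{
	unsigned int flags = 0;

	/* Every character before the first comma selects one debug flag. */
	for (; *str && *str != ','; str++) {
		switch (*str) {
		case 'f': flags |= TOY_CONSISTENCY; break;
		case 'z': flags |= TOY_REDZONE; break;
		case 'p': flags |= TOY_POISON; break;
		case 'u': flags |= TOY_STORE_USER; break;
		case 't': flags |= TOY_TRACE; break;
		}
	}
	return flags;
}

int main(void)
{
	/* "slub_debug=zpu,dentry" -> red zoning, poisoning, user tracking */
	printf("flags=0x%x\n", toy_parse("zpu,dentry"));	/* prints 0xe */
	return 0;
}
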
1286  
1287  /*
1288   * kmem_cache_flags - apply debugging options to the cache
1289   * @object_size:	the size of an object without meta data
1290   * @flags:		flags to set
1291   * @name:		name of the cache
1292   * @ctor:		constructor function
1293   *
1294   * Debug option(s) are applied to @flags. In addition to the debug
1295   * option(s), if a slab name (or multiple) is specified i.e.
1296   * slub_debug=<Debug-Options>,<slab name1>,<slab name2> ...
1297   * then only the select slabs will receive the debug option(s).
1298   */
1299  slab_flags_t kmem_cache_flags(unsigned int object_size,
1300  	slab_flags_t flags, const char *name,
1301  	void (*ctor)(void *))
1302  {
1303  	char *iter;
1304  	size_t len;
1305  
1306  	/* If slub_debug = 0, it folds into the if conditional. */
1307  	if (!slub_debug_slabs)
1308  		return flags | slub_debug;
1309  
1310  	len = strlen(name);
1311  	iter = slub_debug_slabs;
1312  	while (*iter) {
1313  		char *end, *glob;
1314  		size_t cmplen;
1315  
1316  		end = strchr(iter, ',');
1317  		if (!end)
1318  			end = iter + strlen(iter);
1319  
1320  		glob = strnchr(iter, end - iter, '*');
1321  		if (glob)
1322  			cmplen = glob - iter;
1323  		else
1324  			cmplen = max_t(size_t, len, (end - iter));
1325  
1326  		if (!strncmp(name, iter, cmplen)) {
1327  			flags |= slub_debug;
1328  			break;
1329  		}
1330  
1331  		if (!*end)
1332  			break;
1333  		iter = end + 1;
1334  	}
1335  
1336  	return flags;
1337  }
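
/*
 * Editor's illustration (standalone, simplified) of the cache-name matching
 * in kmem_cache_flags() above: slub_debug=<options>,<name1>,<name2> restricts
 * the options to the listed caches, and a '*' in an entry turns it into a
 * prefix match.  toy_match() only handles a single entry and uses an exact
 * comparison for the non-glob case.
 */
#include <stdbool.h>
#include <stdio.h>
#include <string.h>

/* Does one comma-separated pattern entry match this cache name? */
static bool toy_match(const char *pattern, const char *name)
{
	const char *glob = strchr(pattern, '*');

	if (glob)				/* "kmalloc-*" style prefix match */
		return strncmp(name, pattern, glob - pattern) == 0;
	return strcmp(name, pattern) == 0;	/* exact match otherwise          */
}

int main(void)
{
	printf("%d\n", toy_match("kmalloc-*", "kmalloc-128"));	/* 1 */
	printf("%d\n", toy_match("dentry", "dentry"));		/* 1 */
	printf("%d\n", toy_match("dentry", "inode_cache"));	/* 0 */
	return 0;
}
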
1338  #else /* !CONFIG_SLUB_DEBUG */
1339  static inline void setup_object_debug(struct kmem_cache *s,
1340  			struct page *page, void *object) {}
1341  static inline void setup_page_debug(struct kmem_cache *s,
1342  			void *addr, int order) {}
1343  
1344  static inline int alloc_debug_processing(struct kmem_cache *s,
1345  	struct page *page, void *object, unsigned long addr) { return 0; }
1346  
1347  static inline int free_debug_processing(
1348  	struct kmem_cache *s, struct page *page,
1349  	void *head, void *tail, int bulk_cnt,
1350  	unsigned long addr) { return 0; }
1351  
1352  static inline int slab_pad_check(struct kmem_cache *s, struct page *page)
1353  			{ return 1; }
1354  static inline int check_object(struct kmem_cache *s, struct page *page,
1355  			void *object, u8 val) { return 1; }
1356  static inline void add_full(struct kmem_cache *s, struct kmem_cache_node *n,
1357  					struct page *page) {}
1358  static inline void remove_full(struct kmem_cache *s, struct kmem_cache_node *n,
1359  					struct page *page) {}
1360  slab_flags_t kmem_cache_flags(unsigned int object_size,
1361  	slab_flags_t flags, const char *name,
1362  	void (*ctor)(void *))
1363  {
1364  	return flags;
1365  }
1366  #define slub_debug 0
1367  
1368  #define disable_higher_order_debug 0
1369  
1370  static inline unsigned long slabs_node(struct kmem_cache *s, int node)
1371  							{ return 0; }
1372  static inline unsigned long node_nr_slabs(struct kmem_cache_node *n)
1373  							{ return 0; }
1374  static inline void inc_slabs_node(struct kmem_cache *s, int node,
1375  							int objects) {}
1376  static inline void dec_slabs_node(struct kmem_cache *s, int node,
1377  							int objects) {}
1378  
1379  #endif /* CONFIG_SLUB_DEBUG */
1380  
1381  /*
1382   * Hooks for other subsystems that check memory allocations. In a typical
1383   * production configuration these hooks all should produce no code at all.
1384   */
1385  static inline void *kmalloc_large_node_hook(void *ptr, size_t size, gfp_t flags)
1386  {
1387  	ptr = kasan_kmalloc_large(ptr, size, flags);
1388  	/* As ptr might get tagged, call kmemleak hook after KASAN. */
1389  	kmemleak_alloc(ptr, size, 1, flags);
1390  	return ptr;
1391  }
1392  
1393  static __always_inline void kfree_hook(void *x)
1394  {
1395  	kmemleak_free(x);
1396  	kasan_kfree_large(x, _RET_IP_);
1397  }
1398  
1399  static __always_inline bool slab_free_hook(struct kmem_cache *s, void *x)
1400  {
1401  	kmemleak_free_recursive(x, s->flags);
1402  
1403  	/*
1404  	 * The trouble is that we may no longer disable interrupts in the fast
1405  	 * path. So in order to make the debug calls that expect irqs to be
1406  	 * disabled we need to disable interrupts temporarily.
1407  	 */
1408  #ifdef CONFIG_LOCKDEP
1409  	{
1410  		unsigned long flags;
1411  
1412  		local_irq_save(flags);
1413  		debug_check_no_locks_freed(x, s->object_size);
1414  		local_irq_restore(flags);
1415  	}
1416  #endif
1417  	if (!(s->flags & SLAB_DEBUG_OBJECTS))
1418  		debug_check_no_obj_freed(x, s->object_size);
1419  
1420  	/* KASAN might put x into memory quarantine, delaying its reuse */
1421  	return kasan_slab_free(s, x, _RET_IP_);
1422  }
1423  
1424  static inline bool slab_free_freelist_hook(struct kmem_cache *s,
1425  					   void **head, void **tail)
1426  {
1427  /*
1428   * The compiler cannot detect that this function can be removed if slab_free_hook()
1429   * evaluates to nothing.  Thus, catch all relevant config debug options here.
1430   */
1431  #if defined(CONFIG_LOCKDEP)	||		\
1432  	defined(CONFIG_DEBUG_KMEMLEAK) ||	\
1433  	defined(CONFIG_DEBUG_OBJECTS_FREE) ||	\
1434  	defined(CONFIG_KASAN)
1435  
1436  	void *object;
1437  	void *next = *head;
1438  	void *old_tail = *tail ? *tail : *head;
1439  
1440  	/* Head and tail of the reconstructed freelist */
1441  	*head = NULL;
1442  	*tail = NULL;
1443  
1444  	do {
1445  		object = next;
1446  		next = get_freepointer(s, object);
1447  		/* If object's reuse doesn't have to be delayed */
1448  		if (!slab_free_hook(s, object)) {
1449  			/* Move object to the new freelist */
1450  			set_freepointer(s, object, *head);
1451  			*head = object;
1452  			if (!*tail)
1453  				*tail = object;
1454  		}
1455  	} while (object != old_tail);
1456  
1457  	if (*head == *tail)
1458  		*tail = NULL;
1459  
1460  	return *head != NULL;
1461  #else
1462  	return true;
1463  #endif
1464  }
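
/*
 * Editor's illustration of the freelist filtering in slab_free_freelist_hook()
 * above: walk the detached list and rebuild the head from only those objects
 * whose reuse is not delayed.  Standalone sketch; toy_delay_reuse() stands in
 * for slab_free_hook() (e.g. a KASAN-style quarantine keeping odd objects).
 */
#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

struct node { struct node *next; int id; };

/* Stand-in for slab_free_hook(): true means "delay reuse, drop from list". */
static bool toy_delay_reuse(struct node *n)
{
	return n->id % 2 != 0;		/* pretend odd objects are quarantined */
}

/* Rebuild the list keeping only objects that may be freed right away. */
static struct node *toy_filter(struct node *list)
{
	struct node *head = NULL;

	while (list) {
		struct node *next = list->next;

		if (!toy_delay_reuse(list)) {
			list->next = head;	/* push onto the rebuilt list */
			head = list;
		}
		list = next;
	}
	return head;
}

int main(void)
{
	struct node n3 = { NULL, 3 }, n2 = { &n3, 2 }, n1 = { &n2, 1 };

	for (struct node *n = toy_filter(&n1); n; n = n->next)
		printf("kept %d\n", n->id);	/* prints only the even id */
	return 0;
}
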
1465  
1466  static void *setup_object(struct kmem_cache *s, struct page *page,
1467  				void *object)
1468  {
1469  	setup_object_debug(s, page, object);
1470  	object = kasan_init_slab_obj(s, object);
1471  	if (unlikely(s->ctor)) {
1472  		kasan_unpoison_object_data(s, object);
1473  		s->ctor(object);
1474  		kasan_poison_object_data(s, object);
1475  	}
1476  	return object;
1477  }
1478  
1479  /*
1480   * Slab allocation and freeing
1481   */
1482  static inline struct page *alloc_slab_page(struct kmem_cache *s,
1483  		gfp_t flags, int node, struct kmem_cache_order_objects oo)
1484  {
1485  	struct page *page;
1486  	unsigned int order = oo_order(oo);
1487  
1488  	if (node == NUMA_NO_NODE)
1489  		page = alloc_pages(flags, order);
1490  	else
1491  		page = __alloc_pages_node(node, flags, order);
1492  
1493  	if (page && memcg_charge_slab(page, flags, order, s)) {
1494  		__free_pages(page, order);
1495  		page = NULL;
1496  	}
1497  
1498  	return page;
1499  }
1500  
1501  #ifdef CONFIG_SLAB_FREELIST_RANDOM
1502  /* Pre-initialize the random sequence cache */
1503  static int init_cache_random_seq(struct kmem_cache *s)
1504  {
1505  	unsigned int count = oo_objects(s->oo);
1506  	int err;
1507  
1508  	/* Bailout if already initialised */
1509  	if (s->random_seq)
1510  		return 0;
1511  
1512  	err = cache_random_seq_create(s, count, GFP_KERNEL);
1513  	if (err) {
1514  		pr_err("SLUB: Unable to initialize free list for %s\n",
1515  			s->name);
1516  		return err;
1517  	}
1518  
1519  	/* Transform to an offset on the set of pages */
1520  	if (s->random_seq) {
1521  		unsigned int i;
1522  
1523  		for (i = 0; i < count; i++)
1524  			s->random_seq[i] *= s->size;
1525  	}
1526  	return 0;
1527  }
1528  
1529  /* Initialize each random sequence freelist per cache */
1530  static void __init init_freelist_randomization(void)
1531  {
1532  	struct kmem_cache *s;
1533  
1534  	mutex_lock(&slab_mutex);
1535  
1536  	list_for_each_entry(s, &slab_caches, list)
1537  		init_cache_random_seq(s);
1538  
1539  	mutex_unlock(&slab_mutex);
1540  }
1541  
1542  /* Get the next entry on the pre-computed freelist randomized */
1543  static void *next_freelist_entry(struct kmem_cache *s, struct page *page,
1544  				unsigned long *pos, void *start,
1545  				unsigned long page_limit,
1546  				unsigned long freelist_count)
1547  {
1548  	unsigned int idx;
1549  
1550  	/*
1551  	 * If the target page allocation failed, the number of objects on the
1552  	 * page might be smaller than the usual size defined by the cache.
1553  	 */
1554  	do {
1555  		idx = s->random_seq[*pos];
1556  		*pos += 1;
1557  		if (*pos >= freelist_count)
1558  			*pos = 0;
1559  	} while (unlikely(idx >= page_limit));
1560  
1561  	return (char *)start + idx;
1562  }
1563  
1564  /* Shuffle the single linked freelist based on a random pre-computed sequence */
1565  static bool shuffle_freelist(struct kmem_cache *s, struct page *page)
1566  {
1567  	void *start;
1568  	void *cur;
1569  	void *next;
1570  	unsigned long idx, pos, page_limit, freelist_count;
1571  
1572  	if (page->objects < 2 || !s->random_seq)
1573  		return false;
1574  
1575  	freelist_count = oo_objects(s->oo);
1576  	pos = get_random_int() % freelist_count;
1577  
1578  	page_limit = page->objects * s->size;
1579  	start = fixup_red_left(s, page_address(page));
1580  
1581  	/* First entry is used as the base of the freelist */
1582  	cur = next_freelist_entry(s, page, &pos, start, page_limit,
1583  				freelist_count);
1584  	cur = setup_object(s, page, cur);
1585  	page->freelist = cur;
1586  
1587  	for (idx = 1; idx < page->objects; idx++) {
1588  		next = next_freelist_entry(s, page, &pos, start, page_limit,
1589  			freelist_count);
1590  		next = setup_object(s, page, next);
1591  		set_freepointer(s, cur, next);
1592  		cur = next;
1593  	}
1594  	set_freepointer(s, cur, NULL);
1595  
1596  	return true;
1597  }
1598  #else
1599  static inline int init_cache_random_seq(struct kmem_cache *s)
1600  {
1601  	return 0;
1602  }
1603  static inline void init_freelist_randomization(void) { }
1604  static inline bool shuffle_freelist(struct kmem_cache *s, struct page *page)
1605  {
1606  	return false;
1607  }
1608  #endif /* CONFIG_SLAB_FREELIST_RANDOM */
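
/*
 * Editor's illustration of the shuffling above: given a precomputed random
 * permutation of byte offsets (the role of s->random_seq), the freelist is
 * linked together in that order instead of address order.  Standalone sketch
 * with a made-up permutation and object size.
 */
#include <stdio.h>
#include <string.h>

#define OBJ_SIZE 32
#define NR_OBJ   4

int main(void)
{
	/* The union only forces pointer alignment for the fake slab page.    */
	union { void *align; unsigned char bytes[NR_OBJ * OBJ_SIZE]; } page;
	/* Stand-in for s->random_seq: a precomputed permutation of offsets.  */
	unsigned int seq[NR_OBJ] = { 2 * OBJ_SIZE, 0, 3 * OBJ_SIZE, 1 * OBJ_SIZE };
	void *freelist = NULL;

	memset(page.bytes, 0, sizeof(page.bytes));

	/* Link objects in sequence order; each object's first word acts as
	 * its free pointer, as in an unhardened SLUB freelist.               */
	for (int i = NR_OBJ - 1; i >= 0; i--) {
		void **obj = (void **)(page.bytes + seq[i]);

		*obj = freelist;
		freelist = obj;
	}

	for (void **p = freelist; p; p = *p)
		printf("object at offset %td\n", (unsigned char *)p - page.bytes);
	return 0;
}
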
1609  
1610  static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
1611  {
1612  	struct page *page;
1613  	struct kmem_cache_order_objects oo = s->oo;
1614  	gfp_t alloc_gfp;
1615  	void *start, *p, *next;
1616  	int idx, order;
1617  	bool shuffle;
1618  
1619  	flags &= gfp_allowed_mask;
1620  
1621  	if (gfpflags_allow_blocking(flags))
1622  		local_irq_enable();
1623  
1624  	flags |= s->allocflags;
1625  
1626  	/*
1627  	 * Let the initial higher-order allocation fail under memory pressure
1628  	 * so we fall back to the minimum order allocation.
1629  	 */
1630  	alloc_gfp = (flags | __GFP_NOWARN | __GFP_NORETRY) & ~__GFP_NOFAIL;
1631  	if ((alloc_gfp & __GFP_DIRECT_RECLAIM) && oo_order(oo) > oo_order(s->min))
1632  		alloc_gfp = (alloc_gfp | __GFP_NOMEMALLOC) & ~(__GFP_RECLAIM|__GFP_NOFAIL);
1633  
1634  	page = alloc_slab_page(s, alloc_gfp, node, oo);
1635  	if (unlikely(!page)) {
1636  		oo = s->min;
1637  		alloc_gfp = flags;
1638  		/*
1639  		 * Allocation may have failed due to fragmentation.
1640  		 * Try a lower order alloc if possible
1641  		 */
1642  		page = alloc_slab_page(s, alloc_gfp, node, oo);
1643  		if (unlikely(!page))
1644  			goto out;
1645  		stat(s, ORDER_FALLBACK);
1646  	}
1647  
1648  	page->objects = oo_objects(oo);
1649  
1650  	order = compound_order(page);
1651  	page->slab_cache = s;
1652  	__SetPageSlab(page);
1653  	if (page_is_pfmemalloc(page))
1654  		SetPageSlabPfmemalloc(page);
1655  
1656  	kasan_poison_slab(page);
1657  
1658  	start = page_address(page);
1659  
1660  	setup_page_debug(s, start, order);
1661  
1662  	shuffle = shuffle_freelist(s, page);
1663  
1664  	if (!shuffle) {
1665  		start = fixup_red_left(s, start);
1666  		start = setup_object(s, page, start);
1667  		page->freelist = start;
1668  		for (idx = 0, p = start; idx < page->objects - 1; idx++) {
1669  			next = p + s->size;
1670  			next = setup_object(s, page, next);
1671  			set_freepointer(s, p, next);
1672  			p = next;
1673  		}
1674  		set_freepointer(s, p, NULL);
1675  	}
1676  
1677  	page->inuse = page->objects;
1678  	page->frozen = 1;
1679  
1680  out:
1681  	if (gfpflags_allow_blocking(flags))
1682  		local_irq_disable();
1683  	if (!page)
1684  		return NULL;
1685  
1686  	mod_lruvec_page_state(page,
1687  		(s->flags & SLAB_RECLAIM_ACCOUNT) ?
1688  		NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE,
1689  		1 << oo_order(oo));
1690  
1691  	inc_slabs_node(s, page_to_nid(page), page->objects);
1692  
1693  	return page;
1694  }
1695  
1696  static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node)
1697  {
1698  	if (unlikely(flags & GFP_SLAB_BUG_MASK)) {
1699  		gfp_t invalid_mask = flags & GFP_SLAB_BUG_MASK;
1700  		flags &= ~GFP_SLAB_BUG_MASK;
1701  		pr_warn("Unexpected gfp: %#x (%pGg). Fixing up to gfp: %#x (%pGg). Fix your code!\n",
1702  				invalid_mask, &invalid_mask, flags, &flags);
1703  		dump_stack();
1704  	}
1705  
1706  	return allocate_slab(s,
1707  		flags & (GFP_RECLAIM_MASK | GFP_CONSTRAINT_MASK), node);
1708  }
1709  
1710  static void __free_slab(struct kmem_cache *s, struct page *page)
1711  {
1712  	int order = compound_order(page);
1713  	int pages = 1 << order;
1714  
1715  	if (s->flags & SLAB_CONSISTENCY_CHECKS) {
1716  		void *p;
1717  
1718  		slab_pad_check(s, page);
1719  		for_each_object(p, s, page_address(page),
1720  						page->objects)
1721  			check_object(s, page, p, SLUB_RED_INACTIVE);
1722  	}
1723  
1724  	mod_lruvec_page_state(page,
1725  		(s->flags & SLAB_RECLAIM_ACCOUNT) ?
1726  		NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE,
1727  		-pages);
1728  
1729  	__ClearPageSlabPfmemalloc(page);
1730  	__ClearPageSlab(page);
1731  
1732  	page->mapping = NULL;
1733  	if (current->reclaim_state)
1734  		current->reclaim_state->reclaimed_slab += pages;
1735  	memcg_uncharge_slab(page, order, s);
1736  	__free_pages(page, order);
1737  }
1738  
1739  static void rcu_free_slab(struct rcu_head *h)
1740  {
1741  	struct page *page = container_of(h, struct page, rcu_head);
1742  
1743  	__free_slab(page->slab_cache, page);
1744  }
1745  
1746  static void free_slab(struct kmem_cache *s, struct page *page)
1747  {
1748  	if (unlikely(s->flags & SLAB_TYPESAFE_BY_RCU)) {
1749  		call_rcu(&page->rcu_head, rcu_free_slab);
1750  	} else
1751  		__free_slab(s, page);
1752  }
1753  
1754  static void discard_slab(struct kmem_cache *s, struct page *page)
1755  {
1756  	dec_slabs_node(s, page_to_nid(page), page->objects);
1757  	free_slab(s, page);
1758  }
1759  
1760  /*
1761   * Management of partially allocated slabs.
1762   */
1763  static inline void
1764  __add_partial(struct kmem_cache_node *n, struct page *page, int tail)
1765  {
1766  	n->nr_partial++;
1767  	if (tail == DEACTIVATE_TO_TAIL)
1768  		list_add_tail(&page->slab_list, &n->partial);
1769  	else
1770  		list_add(&page->slab_list, &n->partial);
1771  }
1772  
1773  static inline void add_partial(struct kmem_cache_node *n,
1774  				struct page *page, int tail)
1775  {
1776  	lockdep_assert_held(&n->list_lock);
1777  	__add_partial(n, page, tail);
1778  }
1779  
1780  static inline void remove_partial(struct kmem_cache_node *n,
1781  					struct page *page)
1782  {
1783  	lockdep_assert_held(&n->list_lock);
1784  	list_del(&page->slab_list);
1785  	n->nr_partial--;
1786  }
1787  
1788  /*
1789   * Remove slab from the partial list, freeze it and
1790   * return the pointer to the freelist.
1791   *
1792   * Returns a list of objects or NULL if it fails.
1793   */
1794  static inline void *acquire_slab(struct kmem_cache *s,
1795  		struct kmem_cache_node *n, struct page *page,
1796  		int mode, int *objects)
1797  {
1798  	void *freelist;
1799  	unsigned long counters;
1800  	struct page new;
1801  
1802  	lockdep_assert_held(&n->list_lock);
1803  
1804  	/*
1805  	 * Zap the freelist and set the frozen bit.
1806  	 * The old freelist is the list of objects for the
1807  	 * per cpu allocation list.
1808  	 */
1809  	freelist = page->freelist;
1810  	counters = page->counters;
1811  	new.counters = counters;
1812  	*objects = new.objects - new.inuse;
1813  	if (mode) {
1814  		new.inuse = page->objects;
1815  		new.freelist = NULL;
1816  	} else {
1817  		new.freelist = freelist;
1818  	}
1819  
1820  	VM_BUG_ON(new.frozen);
1821  	new.frozen = 1;
1822  
1823  	if (!__cmpxchg_double_slab(s, page,
1824  			freelist, counters,
1825  			new.freelist, new.counters,
1826  			"acquire_slab"))
1827  		return NULL;
1828  
1829  	remove_partial(n, page);
1830  	WARN_ON(!freelist);
1831  	return freelist;
1832  }
1833  
1834  static void put_cpu_partial(struct kmem_cache *s, struct page *page, int drain);
1835  static inline bool pfmemalloc_match(struct page *page, gfp_t gfpflags);
1836  
1837  /*
1838   * Try to allocate a partial slab from a specific node.
1839   */
1840  static void *get_partial_node(struct kmem_cache *s, struct kmem_cache_node *n,
1841  				struct kmem_cache_cpu *c, gfp_t flags)
1842  {
1843  	struct page *page, *page2;
1844  	void *object = NULL;
1845  	unsigned int available = 0;
1846  	int objects;
1847  
1848  	/*
1849  	 * Racy check. If we mistakenly see no partial slabs then we
1850  	 * just allocate an empty slab. If we mistakenly try to get a
1851  	 * partial slab and there is none available then we will simply
1852  	 * return NULL.
1853  	 */
1854  	if (!n || !n->nr_partial)
1855  		return NULL;
1856  
1857  	spin_lock(&n->list_lock);
1858  	list_for_each_entry_safe(page, page2, &n->partial, slab_list) {
1859  		void *t;
1860  
1861  		if (!pfmemalloc_match(page, flags))
1862  			continue;
1863  
1864  		t = acquire_slab(s, n, page, object == NULL, &objects);
1865  		if (!t)
1866  			break;
1867  
1868  		available += objects;
1869  		if (!object) {
1870  			c->page = page;
1871  			stat(s, ALLOC_FROM_PARTIAL);
1872  			object = t;
1873  		} else {
1874  			put_cpu_partial(s, page, 0);
1875  			stat(s, CPU_PARTIAL_NODE);
1876  		}
1877  		if (!kmem_cache_has_cpu_partial(s)
1878  			|| available > slub_cpu_partial(s) / 2)
1879  			break;
1880  
1881  	}
1882  	spin_unlock(&n->list_lock);
1883  	return object;
1884  }
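/*
 * Illustrative example (assuming the default cpu_partial tuning done in
 * set_cpu_partial() below): for a cache with cpu_partial == 30 the loop above
 * keeps acquiring partial slabs until more than 15 free objects have been
 * gathered. The first acquired slab refills c->page, every further one is
 * queued on the per cpu partial list via put_cpu_partial().
 */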
1885  
1886  /*
1887   * Get a page from somewhere. Search in increasing NUMA distances.
1888   */
1889  static void *get_any_partial(struct kmem_cache *s, gfp_t flags,
1890  		struct kmem_cache_cpu *c)
1891  {
1892  #ifdef CONFIG_NUMA
1893  	struct zonelist *zonelist;
1894  	struct zoneref *z;
1895  	struct zone *zone;
1896  	enum zone_type high_zoneidx = gfp_zone(flags);
1897  	void *object;
1898  	unsigned int cpuset_mems_cookie;
1899  
1900  	/*
1901  	 * The defrag ratio allows a configuration of the tradeoffs between
1902  	 * inter node defragmentation and node local allocations. A lower
1903  	 * defrag_ratio increases the tendency to do local allocations
1904  	 * instead of attempting to obtain partial slabs from other nodes.
1905  	 *
1906  	 * If the defrag_ratio is set to 0 then kmalloc() always
1907  	 * returns node local objects. If the ratio is higher then kmalloc()
1908  	 * may return off node objects because partial slabs are obtained
1909  	 * from other nodes and filled up.
1910  	 *
1911  	 * If /sys/kernel/slab/xx/remote_node_defrag_ratio is set to 100
1912  	 * (which makes defrag_ratio = 1000) then every (well almost)
1913  	 * allocation will first attempt to defrag slab caches on other nodes.
1914  	 * This means scanning over all nodes to look for partial slabs which
1915  	 * may be expensive if we do it every time we are trying to find a slab
1916  	 * with available objects.
1917  	 */
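	/*
	 * Worked example (figures are illustrative only): writing 100 to the
	 * sysfs file stores remote_node_defrag_ratio = 1000 internally, and
	 * get_cycles() % 1024 exceeds 1000 for only 23 of 1024 possible
	 * values (~2%), so roughly 98% of these slow path invocations go on
	 * to scan remote nodes. A sysfs value of 50 (ratio 500) lets about
	 * half of them do so.
	 */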
1918  	if (!s->remote_node_defrag_ratio ||
1919  			get_cycles() % 1024 > s->remote_node_defrag_ratio)
1920  		return NULL;
1921  
1922  	do {
1923  		cpuset_mems_cookie = read_mems_allowed_begin();
1924  		zonelist = node_zonelist(mempolicy_slab_node(), flags);
1925  		for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) {
1926  			struct kmem_cache_node *n;
1927  
1928  			n = get_node(s, zone_to_nid(zone));
1929  
1930  			if (n && cpuset_zone_allowed(zone, flags) &&
1931  					n->nr_partial > s->min_partial) {
1932  				object = get_partial_node(s, n, c, flags);
1933  				if (object) {
1934  					/*
1935  					 * Don't check read_mems_allowed_retry()
1936  					 * here - if mems_allowed was updated in
1937  					 * parallel, that was a harmless race
1938  					 * between allocation and the cpuset
1939  					 * update
1940  					 */
1941  					return object;
1942  				}
1943  			}
1944  		}
1945  	} while (read_mems_allowed_retry(cpuset_mems_cookie));
1946  #endif	/* CONFIG_NUMA */
1947  	return NULL;
1948  }
1949  
1950  /*
1951   * Get a partial page, lock it and return it.
1952   */
1953  static void *get_partial(struct kmem_cache *s, gfp_t flags, int node,
1954  		struct kmem_cache_cpu *c)
1955  {
1956  	void *object;
1957  	int searchnode = node;
1958  
1959  	if (node == NUMA_NO_NODE)
1960  		searchnode = numa_mem_id();
1961  	else if (!node_present_pages(node))
1962  		searchnode = node_to_mem_node(node);
1963  
1964  	object = get_partial_node(s, get_node(s, searchnode), c, flags);
1965  	if (object || node != NUMA_NO_NODE)
1966  		return object;
1967  
1968  	return get_any_partial(s, flags, c);
1969  }
1970  
1971  #ifdef CONFIG_PREEMPT
1972  /*
1973   * Calculate the next globally unique transaction id for disambiguation
1974   * during cmpxchg. The transaction ids start with the cpu number and are then
1975   * incremented by CONFIG_NR_CPUS.
1976   */
1977  #define TID_STEP  roundup_pow_of_two(CONFIG_NR_CPUS)
1978  #else
1979  /*
1980   * No preemption is supported, therefore there is also no need to
1981   * check for different cpus.
1982   */
1983  #define TID_STEP 1
1984  #endif
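/*
 * Illustrative sketch (not part of the allocator logic): with
 * CONFIG_NR_CPUS = 6, TID_STEP rounds up to 8. The tid sequence on cpu 3 is
 * then 3, 11, 19, ..., so tid_to_cpu(19) == 3, tid_to_event(19) == 2 and
 * next_tid(19) == 27. Operations on different cpus can therefore never
 * produce the same tid, which is what lets the cmpxchg fastpath detect a cpu
 * change or an interleaved operation.
 */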
1985  
1986  static inline unsigned long next_tid(unsigned long tid)
1987  {
1988  	return tid + TID_STEP;
1989  }
1990  
1991  static inline unsigned int tid_to_cpu(unsigned long tid)
1992  {
1993  	return tid % TID_STEP;
1994  }
1995  
1996  static inline unsigned long tid_to_event(unsigned long tid)
1997  {
1998  	return tid / TID_STEP;
1999  }
2000  
2001  static inline unsigned int init_tid(int cpu)
2002  {
2003  	return cpu;
2004  }
2005  
2006  static inline void note_cmpxchg_failure(const char *n,
2007  		const struct kmem_cache *s, unsigned long tid)
2008  {
2009  #ifdef SLUB_DEBUG_CMPXCHG
2010  	unsigned long actual_tid = __this_cpu_read(s->cpu_slab->tid);
2011  
2012  	pr_info("%s %s: cmpxchg redo ", n, s->name);
2013  
2014  #ifdef CONFIG_PREEMPT
2015  	if (tid_to_cpu(tid) != tid_to_cpu(actual_tid))
2016  		pr_warn("due to cpu change %d -> %d\n",
2017  			tid_to_cpu(tid), tid_to_cpu(actual_tid));
2018  	else
2019  #endif
2020  	if (tid_to_event(tid) != tid_to_event(actual_tid))
2021  		pr_warn("due to cpu running other code. Event %ld->%ld\n",
2022  			tid_to_event(tid), tid_to_event(actual_tid));
2023  	else
2024  		pr_warn("for unknown reason: actual=%lx was=%lx target=%lx\n",
2025  			actual_tid, tid, next_tid(tid));
2026  #endif
2027  	stat(s, CMPXCHG_DOUBLE_CPU_FAIL);
2028  }
2029  
2030  static void init_kmem_cache_cpus(struct kmem_cache *s)
2031  {
2032  	int cpu;
2033  
2034  	for_each_possible_cpu(cpu)
2035  		per_cpu_ptr(s->cpu_slab, cpu)->tid = init_tid(cpu);
2036  }
2037  
2038  /*
2039   * Remove the cpu slab
2040   */
2041  static void deactivate_slab(struct kmem_cache *s, struct page *page,
2042  				void *freelist, struct kmem_cache_cpu *c)
2043  {
2044  	enum slab_modes { M_NONE, M_PARTIAL, M_FULL, M_FREE };
2045  	struct kmem_cache_node *n = get_node(s, page_to_nid(page));
2046  	int lock = 0;
2047  	enum slab_modes l = M_NONE, m = M_NONE;
2048  	void *nextfree;
2049  	int tail = DEACTIVATE_TO_HEAD;
2050  	struct page new;
2051  	struct page old;
2052  
2053  	if (page->freelist) {
2054  		stat(s, DEACTIVATE_REMOTE_FREES);
2055  		tail = DEACTIVATE_TO_TAIL;
2056  	}
2057  
2058  	/*
2059  	 * Stage one: Free all available per cpu objects back
2060  	 * to the page freelist while it is still frozen. Leave the
2061  	 * last one.
2062  	 *
2063  	 * There is no need to take the list->lock because the page
2064  	 * is still frozen.
2065  	 */
2066  	while (freelist && (nextfree = get_freepointer(s, freelist))) {
2067  		void *prior;
2068  		unsigned long counters;
2069  
2070  		do {
2071  			prior = page->freelist;
2072  			counters = page->counters;
2073  			set_freepointer(s, freelist, prior);
2074  			new.counters = counters;
2075  			new.inuse--;
2076  			VM_BUG_ON(!new.frozen);
2077  
2078  		} while (!__cmpxchg_double_slab(s, page,
2079  			prior, counters,
2080  			freelist, new.counters,
2081  			"drain percpu freelist"));
2082  
2083  		freelist = nextfree;
2084  	}
2085  
2086  	/*
2087  	 * Stage two: Ensure that the page is unfrozen while the
2088  	 * list presence reflects the actual number of objects
2089  	 * during unfreeze.
2090  	 *
2091  	 * We setup the list membership and then perform a cmpxchg
2092  	 * with the count. If there is a mismatch then the page
2093  	 * is not unfrozen but the page is on the wrong list.
2094  	 *
2095  	 * Then we restart the process which may have to remove
2096  	 * the page from the list that we just put it on again
2097  	 * because the number of objects in the slab may have
2098  	 * changed.
2099  	 */
2100  redo:
2101  
2102  	old.freelist = page->freelist;
2103  	old.counters = page->counters;
2104  	VM_BUG_ON(!old.frozen);
2105  
2106  	/* Determine target state of the slab */
2107  	new.counters = old.counters;
2108  	if (freelist) {
2109  		new.inuse--;
2110  		set_freepointer(s, freelist, old.freelist);
2111  		new.freelist = freelist;
2112  	} else
2113  		new.freelist = old.freelist;
2114  
2115  	new.frozen = 0;
2116  
2117  	if (!new.inuse && n->nr_partial >= s->min_partial)
2118  		m = M_FREE;
2119  	else if (new.freelist) {
2120  		m = M_PARTIAL;
2121  		if (!lock) {
2122  			lock = 1;
2123  			/*
2124  			 * Taking the spinlock removes the possibility
2125  			 * that acquire_slab() will see a slab page that
2126  			 * is frozen
2127  			 */
2128  			spin_lock(&n->list_lock);
2129  		}
2130  	} else {
2131  		m = M_FULL;
2132  		if (kmem_cache_debug(s) && !lock) {
2133  			lock = 1;
2134  			/*
2135  			 * This also ensures that the scanning of full
2136  			 * slabs from diagnostic functions will not see
2137  			 * any frozen slabs.
2138  			 */
2139  			spin_lock(&n->list_lock);
2140  		}
2141  	}
2142  
2143  	if (l != m) {
2144  		if (l == M_PARTIAL)
2145  			remove_partial(n, page);
2146  		else if (l == M_FULL)
2147  			remove_full(s, n, page);
2148  
2149  		if (m == M_PARTIAL)
2150  			add_partial(n, page, tail);
2151  		else if (m == M_FULL)
2152  			add_full(s, n, page);
2153  	}
2154  
2155  	l = m;
2156  	if (!__cmpxchg_double_slab(s, page,
2157  				old.freelist, old.counters,
2158  				new.freelist, new.counters,
2159  				"unfreezing slab"))
2160  		goto redo;
2161  
2162  	if (lock)
2163  		spin_unlock(&n->list_lock);
2164  
2165  	if (m == M_PARTIAL)
2166  		stat(s, tail);
2167  	else if (m == M_FULL)
2168  		stat(s, DEACTIVATE_FULL);
2169  	else if (m == M_FREE) {
2170  		stat(s, DEACTIVATE_EMPTY);
2171  		discard_slab(s, page);
2172  		stat(s, FREE_SLAB);
2173  	}
2174  
2175  	c->page = NULL;
2176  	c->freelist = NULL;
2177  }
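/*
 * Example walk-through (illustration only, no additional logic): deactivating
 * a cpu slab with two remaining per cpu objects while other objects in the
 * page are still allocated. Stage one pushes one object back onto
 * page->freelist; stage two links the last object in as well, clears the
 * frozen bit and, since new.freelist is non-NULL, picks M_PARTIAL, takes
 * n->list_lock and places the page on the node partial list. If the final
 * cmpxchg loses a race with a concurrent remote free, the redo label re-reads
 * the counters and may move the page between lists before retrying.
 */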
2178  
2179  /*
2180   * Unfreeze all the cpu partial slabs.
2181   *
2182   * This function must be called with interrupts disabled
2183   * for the cpu using c (or some other guarantee must exist
2184   * that prevents concurrent accesses).
2185   */
2186  static void unfreeze_partials(struct kmem_cache *s,
2187  		struct kmem_cache_cpu *c)
2188  {
2189  #ifdef CONFIG_SLUB_CPU_PARTIAL
2190  	struct kmem_cache_node *n = NULL, *n2 = NULL;
2191  	struct page *page, *discard_page = NULL;
2192  
2193  	while ((page = c->partial)) {
2194  		struct page new;
2195  		struct page old;
2196  
2197  		c->partial = page->next;
2198  
2199  		n2 = get_node(s, page_to_nid(page));
2200  		if (n != n2) {
2201  			if (n)
2202  				spin_unlock(&n->list_lock);
2203  
2204  			n = n2;
2205  			spin_lock(&n->list_lock);
2206  		}
2207  
2208  		do {
2209  
2210  			old.freelist = page->freelist;
2211  			old.counters = page->counters;
2212  			VM_BUG_ON(!old.frozen);
2213  
2214  			new.counters = old.counters;
2215  			new.freelist = old.freelist;
2216  
2217  			new.frozen = 0;
2218  
2219  		} while (!__cmpxchg_double_slab(s, page,
2220  				old.freelist, old.counters,
2221  				new.freelist, new.counters,
2222  				"unfreezing slab"));
2223  
2224  		if (unlikely(!new.inuse && n->nr_partial >= s->min_partial)) {
2225  			page->next = discard_page;
2226  			discard_page = page;
2227  		} else {
2228  			add_partial(n, page, DEACTIVATE_TO_TAIL);
2229  			stat(s, FREE_ADD_PARTIAL);
2230  		}
2231  	}
2232  
2233  	if (n)
2234  		spin_unlock(&n->list_lock);
2235  
2236  	while (discard_page) {
2237  		page = discard_page;
2238  		discard_page = discard_page->next;
2239  
2240  		stat(s, DEACTIVATE_EMPTY);
2241  		discard_slab(s, page);
2242  		stat(s, FREE_SLAB);
2243  	}
2244  #endif	/* CONFIG_SLUB_CPU_PARTIAL */
2245  }
2246  
2247  /*
2248   * Put a page that was just frozen (in __slab_free|get_partial_node) into a
2249   * partial page slot if available.
2250   *
2251   * If the per cpu partial list is already full then first move all of its
2252   * slabs to the per node partial list.
2253   */
2254  static void put_cpu_partial(struct kmem_cache *s, struct page *page, int drain)
2255  {
2256  #ifdef CONFIG_SLUB_CPU_PARTIAL
2257  	struct page *oldpage;
2258  	int pages;
2259  	int pobjects;
2260  
2261  	preempt_disable();
2262  	do {
2263  		pages = 0;
2264  		pobjects = 0;
2265  		oldpage = this_cpu_read(s->cpu_slab->partial);
2266  
2267  		if (oldpage) {
2268  			pobjects = oldpage->pobjects;
2269  			pages = oldpage->pages;
2270  			if (drain && pobjects > s->cpu_partial) {
2271  				unsigned long flags;
2272  				/*
2273  				 * partial array is full. Move the existing
2274  				 * set to the per node partial list.
2275  				 */
2276  				local_irq_save(flags);
2277  				unfreeze_partials(s, this_cpu_ptr(s->cpu_slab));
2278  				local_irq_restore(flags);
2279  				oldpage = NULL;
2280  				pobjects = 0;
2281  				pages = 0;
2282  				stat(s, CPU_PARTIAL_DRAIN);
2283  			}
2284  		}
2285  
2286  		pages++;
2287  		pobjects += page->objects - page->inuse;
2288  
2289  		page->pages = pages;
2290  		page->pobjects = pobjects;
2291  		page->next = oldpage;
2292  
2293  	} while (this_cpu_cmpxchg(s->cpu_slab->partial, oldpage, page)
2294  								!= oldpage);
2295  	if (unlikely(!s->cpu_partial)) {
2296  		unsigned long flags;
2297  
2298  		local_irq_save(flags);
2299  		unfreeze_partials(s, this_cpu_ptr(s->cpu_slab));
2300  		local_irq_restore(flags);
2301  	}
2302  	preempt_enable();
2303  #endif	/* CONFIG_SLUB_CPU_PARTIAL */
2304  }
2305  
2306  static inline void flush_slab(struct kmem_cache *s, struct kmem_cache_cpu *c)
2307  {
2308  	stat(s, CPUSLAB_FLUSH);
2309  	deactivate_slab(s, c->page, c->freelist, c);
2310  
2311  	c->tid = next_tid(c->tid);
2312  }
2313  
2314  /*
2315   * Flush cpu slab.
2316   *
2317   * Called from IPI handler with interrupts disabled.
2318   */
2319  static inline void __flush_cpu_slab(struct kmem_cache *s, int cpu)
2320  {
2321  	struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab, cpu);
2322  
2323  	if (c->page)
2324  		flush_slab(s, c);
2325  
2326  	unfreeze_partials(s, c);
2327  }
2328  
2329  static void flush_cpu_slab(void *d)
2330  {
2331  	struct kmem_cache *s = d;
2332  
2333  	__flush_cpu_slab(s, smp_processor_id());
2334  }
2335  
2336  static bool has_cpu_slab(int cpu, void *info)
2337  {
2338  	struct kmem_cache *s = info;
2339  	struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab, cpu);
2340  
2341  	return c->page || slub_percpu_partial(c);
2342  }
2343  
2344  static void flush_all(struct kmem_cache *s)
2345  {
2346  	on_each_cpu_cond(has_cpu_slab, flush_cpu_slab, s, 1, GFP_ATOMIC);
2347  }
2348  
2349  /*
2350   * Use the cpu hotplug callback to ensure that the cpu slabs are flushed when
2351   * necessary.
2352   */
2353  static int slub_cpu_dead(unsigned int cpu)
2354  {
2355  	struct kmem_cache *s;
2356  	unsigned long flags;
2357  
2358  	mutex_lock(&slab_mutex);
2359  	list_for_each_entry(s, &slab_caches, list) {
2360  		local_irq_save(flags);
2361  		__flush_cpu_slab(s, cpu);
2362  		local_irq_restore(flags);
2363  	}
2364  	mutex_unlock(&slab_mutex);
2365  	return 0;
2366  }
2367  
2368  /*
2369   * Check if the objects in a per cpu structure fit numa
2370   * locality expectations.
2371   */
2372  static inline int node_match(struct page *page, int node)
2373  {
2374  #ifdef CONFIG_NUMA
2375  	if (node != NUMA_NO_NODE && page_to_nid(page) != node)
2376  		return 0;
2377  #endif
2378  	return 1;
2379  }
2380  
2381  #ifdef CONFIG_SLUB_DEBUG
2382  static int count_free(struct page *page)
2383  {
2384  	return page->objects - page->inuse;
2385  }
2386  
2387  static inline unsigned long node_nr_objs(struct kmem_cache_node *n)
2388  {
2389  	return atomic_long_read(&n->total_objects);
2390  }
2391  #endif /* CONFIG_SLUB_DEBUG */
2392  
2393  #if defined(CONFIG_SLUB_DEBUG) || defined(CONFIG_SYSFS)
2394  static unsigned long count_partial(struct kmem_cache_node *n,
2395  					int (*get_count)(struct page *))
2396  {
2397  	unsigned long flags;
2398  	unsigned long x = 0;
2399  	struct page *page;
2400  
2401  	spin_lock_irqsave(&n->list_lock, flags);
2402  	list_for_each_entry(page, &n->partial, slab_list)
2403  		x += get_count(page);
2404  	spin_unlock_irqrestore(&n->list_lock, flags);
2405  	return x;
2406  }
2407  #endif /* CONFIG_SLUB_DEBUG || CONFIG_SYSFS */
2408  
2409  static noinline void
2410  slab_out_of_memory(struct kmem_cache *s, gfp_t gfpflags, int nid)
2411  {
2412  #ifdef CONFIG_SLUB_DEBUG
2413  	static DEFINE_RATELIMIT_STATE(slub_oom_rs, DEFAULT_RATELIMIT_INTERVAL,
2414  				      DEFAULT_RATELIMIT_BURST);
2415  	int node;
2416  	struct kmem_cache_node *n;
2417  
2418  	if ((gfpflags & __GFP_NOWARN) || !__ratelimit(&slub_oom_rs))
2419  		return;
2420  
2421  	pr_warn("SLUB: Unable to allocate memory on node %d, gfp=%#x(%pGg)\n",
2422  		nid, gfpflags, &gfpflags);
2423  	pr_warn("  cache: %s, object size: %u, buffer size: %u, default order: %u, min order: %u\n",
2424  		s->name, s->object_size, s->size, oo_order(s->oo),
2425  		oo_order(s->min));
2426  
2427  	if (oo_order(s->min) > get_order(s->object_size))
2428  		pr_warn("  %s debugging increased min order, use slub_debug=O to disable.\n",
2429  			s->name);
2430  
2431  	for_each_kmem_cache_node(s, node, n) {
2432  		unsigned long nr_slabs;
2433  		unsigned long nr_objs;
2434  		unsigned long nr_free;
2435  
2436  		nr_free  = count_partial(n, count_free);
2437  		nr_slabs = node_nr_slabs(n);
2438  		nr_objs  = node_nr_objs(n);
2439  
2440  		pr_warn("  node %d: slabs: %ld, objs: %ld, free: %ld\n",
2441  			node, nr_slabs, nr_objs, nr_free);
2442  	}
2443  #endif
2444  }
2445  
2446  static inline void *new_slab_objects(struct kmem_cache *s, gfp_t flags,
2447  			int node, struct kmem_cache_cpu **pc)
2448  {
2449  	void *freelist;
2450  	struct kmem_cache_cpu *c = *pc;
2451  	struct page *page;
2452  
2453  	WARN_ON_ONCE(s->ctor && (flags & __GFP_ZERO));
2454  
2455  	freelist = get_partial(s, flags, node, c);
2456  
2457  	if (freelist)
2458  		return freelist;
2459  
2460  	page = new_slab(s, flags, node);
2461  	if (page) {
2462  		c = raw_cpu_ptr(s->cpu_slab);
2463  		if (c->page)
2464  			flush_slab(s, c);
2465  
2466  		/*
2467  		 * No other reference to the page yet so we can
2468  		 * muck around with it freely without cmpxchg
2469  		 */
2470  		freelist = page->freelist;
2471  		page->freelist = NULL;
2472  
2473  		stat(s, ALLOC_SLAB);
2474  		c->page = page;
2475  		*pc = c;
2476  	}
2477  
2478  	return freelist;
2479  }
2480  
2481  static inline bool pfmemalloc_match(struct page *page, gfp_t gfpflags)
2482  {
2483  	if (unlikely(PageSlabPfmemalloc(page)))
2484  		return gfp_pfmemalloc_allowed(gfpflags);
2485  
2486  	return true;
2487  }
2488  
2489  /*
2490   * Check the page->freelist of a page and either transfer the freelist to the
2491   * per cpu freelist or deactivate the page.
2492   *
2493   * The page is still frozen if the return value is not NULL.
2494   *
2495   * If this function returns NULL then the page has been unfrozen.
2496   *
2497   * This function must be called with interrupts disabled.
2498   */
2499  static inline void *get_freelist(struct kmem_cache *s, struct page *page)
2500  {
2501  	struct page new;
2502  	unsigned long counters;
2503  	void *freelist;
2504  
2505  	do {
2506  		freelist = page->freelist;
2507  		counters = page->counters;
2508  
2509  		new.counters = counters;
2510  		VM_BUG_ON(!new.frozen);
2511  
2512  		new.inuse = page->objects;
2513  		new.frozen = freelist != NULL;
2514  
2515  	} while (!__cmpxchg_double_slab(s, page,
2516  		freelist, counters,
2517  		NULL, new.counters,
2518  		"get_freelist"));
2519  
2520  	return freelist;
2521  }
2522  
2523  /*
2524   * Slow path. The lockless freelist is empty or we need to perform
2525   * debugging duties.
2526   *
2527   * Processing is still very fast if new objects have been freed to the
2528   * regular freelist. In that case we simply take over the regular freelist
2529   * as the lockless freelist and zap the regular freelist.
2530   *
2531   * If that is not working then we fall back to the partial lists. We take the
2532   * first element of the freelist as the object to allocate now and move the
2533   * rest of the freelist to the lockless freelist.
2534   *
2535   * And if we were unable to get a new slab from the partial slab lists then
2536   * we need to allocate a new slab. This is the slowest path since it involves
2537   * a call to the page allocator and the setup of a new slab.
2538   *
2539   * Version of __slab_alloc to use when we know that interrupts are
2540   * already disabled (which is the case for bulk allocation).
2541   */
2542  static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
2543  			  unsigned long addr, struct kmem_cache_cpu *c)
2544  {
2545  	void *freelist;
2546  	struct page *page;
2547  
2548  	page = c->page;
2549  	if (!page)
2550  		goto new_slab;
2551  redo:
2552  
2553  	if (unlikely(!node_match(page, node))) {
2554  		int searchnode = node;
2555  
2556  		if (node != NUMA_NO_NODE && !node_present_pages(node))
2557  			searchnode = node_to_mem_node(node);
2558  
2559  		if (unlikely(!node_match(page, searchnode))) {
2560  			stat(s, ALLOC_NODE_MISMATCH);
2561  			deactivate_slab(s, page, c->freelist, c);
2562  			goto new_slab;
2563  		}
2564  	}
2565  
2566  	/*
2567  	 * By rights, we should be searching for a slab page that was
2568  	 * PFMEMALLOC but right now, we are losing the pfmemalloc
2569  	 * information when the page leaves the per-cpu allocator
2570  	 */
2571  	if (unlikely(!pfmemalloc_match(page, gfpflags))) {
2572  		deactivate_slab(s, page, c->freelist, c);
2573  		goto new_slab;
2574  	}
2575  
2576  	/* must check again c->freelist in case of cpu migration or IRQ */
2577  	freelist = c->freelist;
2578  	if (freelist)
2579  		goto load_freelist;
2580  
2581  	freelist = get_freelist(s, page);
2582  
2583  	if (!freelist) {
2584  		c->page = NULL;
2585  		stat(s, DEACTIVATE_BYPASS);
2586  		goto new_slab;
2587  	}
2588  
2589  	stat(s, ALLOC_REFILL);
2590  
2591  load_freelist:
2592  	/*
2593  	 * freelist is pointing to the list of objects to be used.
2594  	 * page is pointing to the page from which the objects are obtained.
2595  	 * That page must be frozen for per cpu allocations to work.
2596  	 */
2597  	VM_BUG_ON(!c->page->frozen);
2598  	c->freelist = get_freepointer(s, freelist);
2599  	c->tid = next_tid(c->tid);
2600  	return freelist;
2601  
2602  new_slab:
2603  
2604  	if (slub_percpu_partial(c)) {
2605  		page = c->page = slub_percpu_partial(c);
2606  		slub_set_percpu_partial(c, page);
2607  		stat(s, CPU_PARTIAL_ALLOC);
2608  		goto redo;
2609  	}
2610  
2611  	freelist = new_slab_objects(s, gfpflags, node, &c);
2612  
2613  	if (unlikely(!freelist)) {
2614  		slab_out_of_memory(s, gfpflags, node);
2615  		return NULL;
2616  	}
2617  
2618  	page = c->page;
2619  	if (likely(!kmem_cache_debug(s) && pfmemalloc_match(page, gfpflags)))
2620  		goto load_freelist;
2621  
2622  	/* Only entered in the debug case */
2623  	if (kmem_cache_debug(s) &&
2624  			!alloc_debug_processing(s, page, freelist, addr))
2625  		goto new_slab;	/* Slab failed checks. Next slab needed */
2626  
2627  	deactivate_slab(s, page, get_freepointer(s, freelist), c);
2628  	return freelist;
2629  }
2630  
2631  /*
2632   * Another variant that disables interrupts and compensates for possible
2633   * cpu changes by refetching the per cpu area pointer.
2634   */
2635  static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
2636  			  unsigned long addr, struct kmem_cache_cpu *c)
2637  {
2638  	void *p;
2639  	unsigned long flags;
2640  
2641  	local_irq_save(flags);
2642  #ifdef CONFIG_PREEMPT
2643  	/*
2644  	 * We may have been preempted and rescheduled on a different
2645  	 * cpu before disabling interrupts. Need to reload cpu area
2646  	 * pointer.
2647  	 */
2648  	c = this_cpu_ptr(s->cpu_slab);
2649  #endif
2650  
2651  	p = ___slab_alloc(s, gfpflags, node, addr, c);
2652  	local_irq_restore(flags);
2653  	return p;
2654  }
2655  
2656  /*
2657   * Inlined fastpath so that allocation functions (kmalloc, kmem_cache_alloc)
2658   * have the fastpath folded into their functions. So no function call
2659   * overhead for requests that can be satisfied on the fastpath.
2660   *
2661   * The fastpath works by first checking if the lockless freelist can be used.
2662   * If not then __slab_alloc is called for slow processing.
2663   *
2664   * Otherwise we can simply pick the next object from the lockless free list.
2665   */
2666  static __always_inline void *slab_alloc_node(struct kmem_cache *s,
2667  		gfp_t gfpflags, int node, unsigned long addr)
2668  {
2669  	void *object;
2670  	struct kmem_cache_cpu *c;
2671  	struct page *page;
2672  	unsigned long tid;
2673  
2674  	s = slab_pre_alloc_hook(s, gfpflags);
2675  	if (!s)
2676  		return NULL;
2677  redo:
2678  	/*
2679  	 * Must read kmem_cache cpu data via this cpu ptr. Preemption is
2680  	 * enabled. We may switch back and forth between cpus while
2681  	 * reading from one cpu area. That does not matter as long
2682  	 * as we end up on the original cpu again when doing the cmpxchg.
2683  	 *
2684  	 * We should guarantee that tid and kmem_cache are retrieved on
2685  	 * the same cpu. They could differ on a CONFIG_PREEMPT kernel, so we need
2686  	 * to check that they match.
2687  	 */
2688  	do {
2689  		tid = this_cpu_read(s->cpu_slab->tid);
2690  		c = raw_cpu_ptr(s->cpu_slab);
2691  	} while (IS_ENABLED(CONFIG_PREEMPT) &&
2692  		 unlikely(tid != READ_ONCE(c->tid)));
2693  
2694  	/*
2695  	 * Irqless object alloc/free algorithm used here depends on sequence
2696  	 * of fetching cpu_slab's data. tid should be fetched before anything
2697  	 * on c to guarantee that object and page associated with previous tid
2698  	 * won't be used with current tid. If we fetch tid first, object and
2699  	 * page could be the ones associated with the next tid and our alloc/free
2700  	 * request would fail. In that case we simply retry, so this is not a problem.
2701  	 */
2702  	barrier();
2703  
2704  	/*
2705  	 * The transaction ids are globally unique per cpu and per operation on
2706  	 * a per cpu queue. Thus they can guarantee that the cmpxchg_double
2707  	 * occurs on the right processor and that there was no operation on the
2708  	 * linked list in between.
2709  	 */
2710  
2711  	object = c->freelist;
2712  	page = c->page;
2713  	if (unlikely(!object || !node_match(page, node))) {
2714  		object = __slab_alloc(s, gfpflags, node, addr, c);
2715  		stat(s, ALLOC_SLOWPATH);
2716  	} else {
2717  		void *next_object = get_freepointer_safe(s, object);
2718  
2719  		/*
2720  		 * The cmpxchg will only match if there was no additional
2721  		 * operation and if we are on the right processor.
2722  		 *
2723  		 * The cmpxchg does the following atomically (without lock
2724  		 * semantics!)
2725  		 * 1. Relocate first pointer to the current per cpu area.
2726  		 * 2. Verify that tid and freelist have not been changed
2727  		 * 3. If they were not changed replace tid and freelist
2728  		 *
2729  		 * Since this is without lock semantics the protection is only
2730  		 * against code executing on this cpu *not* from access by
2731  		 * other cpus.
2732  		 */
2733  		if (unlikely(!this_cpu_cmpxchg_double(
2734  				s->cpu_slab->freelist, s->cpu_slab->tid,
2735  				object, tid,
2736  				next_object, next_tid(tid)))) {
2737  
2738  			note_cmpxchg_failure("slab_alloc", s, tid);
2739  			goto redo;
2740  		}
2741  		prefetch_freepointer(s, next_object);
2742  		stat(s, ALLOC_FASTPATH);
2743  	}
2744  
2745  	if (unlikely(gfpflags & __GFP_ZERO) && object)
2746  		memset(object, 0, s->object_size);
2747  
2748  	slab_post_alloc_hook(s, gfpflags, 1, &object);
2749  
2750  	return object;
2751  }
2752  
2753  static __always_inline void *slab_alloc(struct kmem_cache *s,
2754  		gfp_t gfpflags, unsigned long addr)
2755  {
2756  	return slab_alloc_node(s, gfpflags, NUMA_NO_NODE, addr);
2757  }
2758  
2759  void *kmem_cache_alloc(struct kmem_cache *s, gfp_t gfpflags)
2760  {
2761  	void *ret = slab_alloc(s, gfpflags, _RET_IP_);
2762  
2763  	trace_kmem_cache_alloc(_RET_IP_, ret, s->object_size,
2764  				s->size, gfpflags);
2765  
2766  	return ret;
2767  }
2768  EXPORT_SYMBOL(kmem_cache_alloc);
2769  
2770  #ifdef CONFIG_TRACING
2771  void *kmem_cache_alloc_trace(struct kmem_cache *s, gfp_t gfpflags, size_t size)
2772  {
2773  	void *ret = slab_alloc(s, gfpflags, _RET_IP_);
2774  	trace_kmalloc(_RET_IP_, ret, size, s->size, gfpflags);
2775  	ret = kasan_kmalloc(s, ret, size, gfpflags);
2776  	return ret;
2777  }
2778  EXPORT_SYMBOL(kmem_cache_alloc_trace);
2779  #endif
2780  
2781  #ifdef CONFIG_NUMA
2782  void *kmem_cache_alloc_node(struct kmem_cache *s, gfp_t gfpflags, int node)
2783  {
2784  	void *ret = slab_alloc_node(s, gfpflags, node, _RET_IP_);
2785  
2786  	trace_kmem_cache_alloc_node(_RET_IP_, ret,
2787  				    s->object_size, s->size, gfpflags, node);
2788  
2789  	return ret;
2790  }
2791  EXPORT_SYMBOL(kmem_cache_alloc_node);
2792  
2793  #ifdef CONFIG_TRACING
2794  void *kmem_cache_alloc_node_trace(struct kmem_cache *s,
2795  				    gfp_t gfpflags,
2796  				    int node, size_t size)
2797  {
2798  	void *ret = slab_alloc_node(s, gfpflags, node, _RET_IP_);
2799  
2800  	trace_kmalloc_node(_RET_IP_, ret,
2801  			   size, s->size, gfpflags, node);
2802  
2803  	ret = kasan_kmalloc(s, ret, size, gfpflags);
2804  	return ret;
2805  }
2806  EXPORT_SYMBOL(kmem_cache_alloc_node_trace);
2807  #endif
2808  #endif	/* CONFIG_NUMA */
2809  
2810  /*
2811   * Slow path handling. This may still be called frequently since objects
2812   * have a longer lifetime than the cpu slabs in most processing loads.
2813   *
2814   * So we still attempt to reduce cache line usage. Just take the slab
2815   * lock and free the item. If there is no additional partial page
2816   * handling required then we can return immediately.
2817   */
2818  static void __slab_free(struct kmem_cache *s, struct page *page,
2819  			void *head, void *tail, int cnt,
2820  			unsigned long addr)
2821  
2822  {
2823  	void *prior;
2824  	int was_frozen;
2825  	struct page new;
2826  	unsigned long counters;
2827  	struct kmem_cache_node *n = NULL;
2828  	unsigned long uninitialized_var(flags);
2829  
2830  	stat(s, FREE_SLOWPATH);
2831  
2832  	if (kmem_cache_debug(s) &&
2833  	    !free_debug_processing(s, page, head, tail, cnt, addr))
2834  		return;
2835  
2836  	do {
2837  		if (unlikely(n)) {
2838  			spin_unlock_irqrestore(&n->list_lock, flags);
2839  			n = NULL;
2840  		}
2841  		prior = page->freelist;
2842  		counters = page->counters;
2843  		set_freepointer(s, tail, prior);
2844  		new.counters = counters;
2845  		was_frozen = new.frozen;
2846  		new.inuse -= cnt;
2847  		if ((!new.inuse || !prior) && !was_frozen) {
2848  
2849  			if (kmem_cache_has_cpu_partial(s) && !prior) {
2850  
2851  				/*
2852  				 * Slab was on no list before and will be
2853  				 * partially empty
2854  				 * We can defer the list move and instead
2855  				 * freeze it.
2856  				 */
2857  				new.frozen = 1;
2858  
2859  			} else { /* Needs to be taken off a list */
2860  
2861  				n = get_node(s, page_to_nid(page));
2862  				/*
2863  				 * Speculatively acquire the list_lock.
2864  				 * If the cmpxchg does not succeed then we may
2865  				 * drop the list_lock without any processing.
2866  				 *
2867  				 * Otherwise the list_lock will synchronize with
2868  				 * other processors updating the list of slabs.
2869  				 */
2870  				spin_lock_irqsave(&n->list_lock, flags);
2871  
2872  			}
2873  		}
2874  
2875  	} while (!cmpxchg_double_slab(s, page,
2876  		prior, counters,
2877  		head, new.counters,
2878  		"__slab_free"));
2879  
2880  	if (likely(!n)) {
2881  
2882  		/*
2883  		 * If we just froze the page then put it onto the
2884  		 * per cpu partial list.
2885  		 */
2886  		if (new.frozen && !was_frozen) {
2887  			put_cpu_partial(s, page, 1);
2888  			stat(s, CPU_PARTIAL_FREE);
2889  		}
2890  		/*
2891  		 * The list lock was not taken, therefore no list
2892  		 * activity is necessary.
2893  		 */
2894  		if (was_frozen)
2895  			stat(s, FREE_FROZEN);
2896  		return;
2897  	}
2898  
2899  	if (unlikely(!new.inuse && n->nr_partial >= s->min_partial))
2900  		goto slab_empty;
2901  
2902  	/*
2903  	 * Objects left in the slab. If it was not on the partial list before
2904  	 * then add it.
2905  	 */
2906  	if (!kmem_cache_has_cpu_partial(s) && unlikely(!prior)) {
2907  		remove_full(s, n, page);
2908  		add_partial(n, page, DEACTIVATE_TO_TAIL);
2909  		stat(s, FREE_ADD_PARTIAL);
2910  	}
2911  	spin_unlock_irqrestore(&n->list_lock, flags);
2912  	return;
2913  
2914  slab_empty:
2915  	if (prior) {
2916  		/*
2917  		 * Slab on the partial list.
2918  		 */
2919  		remove_partial(n, page);
2920  		stat(s, FREE_REMOVE_PARTIAL);
2921  	} else {
2922  		/* Slab must be on the full list */
2923  		remove_full(s, n, page);
2924  	}
2925  
2926  	spin_unlock_irqrestore(&n->list_lock, flags);
2927  	stat(s, FREE_SLAB);
2928  	discard_slab(s, page);
2929  }
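/*
 * Two example outcomes of the paths above (illustrative): freeing into a full
 * slab of a cache with cpu partial support leaves n == NULL and merely sets
 * new.frozen, so the page is handed to put_cpu_partial() without touching any
 * list. Freeing the last in-use object of a slab that already sits on a node
 * partial list with nr_partial >= min_partial takes the slab_empty path
 * instead: the page is removed from the partial list and discarded.
 */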
2930  
2931  /*
2932   * Fastpath with forced inlining to produce a kfree and kmem_cache_free that
2933   * can perform fastpath freeing without additional function calls.
2934   *
2935   * The fastpath is only possible if we are freeing to the current cpu slab
2936   * of this processor. This is typically the case if we have just allocated
2937   * the item before.
2938   *
2939   * If fastpath is not possible then fall back to __slab_free where we deal
2940   * with all sorts of special processing.
2941   *
2942   * Bulk free of a freelist with several objects (all pointing to the
2943   * same page) possible by specifying head and tail ptr, plus objects
2944   * count (cnt). Bulk free indicated by tail pointer being set.
2945   */
2946  static __always_inline void do_slab_free(struct kmem_cache *s,
2947  				struct page *page, void *head, void *tail,
2948  				int cnt, unsigned long addr)
2949  {
2950  	void *tail_obj = tail ? : head;
2951  	struct kmem_cache_cpu *c;
2952  	unsigned long tid;
2953  redo:
2954  	/*
2955  	 * Determine the current cpu's per cpu slab.
2956  	 * The cpu may change afterward. However that does not matter since
2957  	 * data is retrieved via this pointer. If we are on the same cpu
2958  	 * during the cmpxchg then the free will succeed.
2959  	 */
2960  	do {
2961  		tid = this_cpu_read(s->cpu_slab->tid);
2962  		c = raw_cpu_ptr(s->cpu_slab);
2963  	} while (IS_ENABLED(CONFIG_PREEMPT) &&
2964  		 unlikely(tid != READ_ONCE(c->tid)));
2965  
2966  	/* Same with comment on barrier() in slab_alloc_node() */
2967  	barrier();
2968  
2969  	if (likely(page == c->page)) {
2970  		set_freepointer(s, tail_obj, c->freelist);
2971  
2972  		if (unlikely(!this_cpu_cmpxchg_double(
2973  				s->cpu_slab->freelist, s->cpu_slab->tid,
2974  				c->freelist, tid,
2975  				head, next_tid(tid)))) {
2976  
2977  			note_cmpxchg_failure("slab_free", s, tid);
2978  			goto redo;
2979  		}
2980  		stat(s, FREE_FASTPATH);
2981  	} else
2982  		__slab_free(s, page, head, tail_obj, cnt, addr);
2983  
2984  }
2985  
2986  static __always_inline void slab_free(struct kmem_cache *s, struct page *page,
2987  				      void *head, void *tail, int cnt,
2988  				      unsigned long addr)
2989  {
2990  	/*
2991  	 * With KASAN enabled slab_free_freelist_hook modifies the freelist
2992  	 * to remove objects, whose reuse must be delayed.
2993  	 */
2994  	if (slab_free_freelist_hook(s, &head, &tail))
2995  		do_slab_free(s, page, head, tail, cnt, addr);
2996  }
2997  
2998  #ifdef CONFIG_KASAN_GENERIC
2999  void ___cache_free(struct kmem_cache *cache, void *x, unsigned long addr)
3000  {
3001  	do_slab_free(cache, virt_to_head_page(x), x, NULL, 1, addr);
3002  }
3003  #endif
3004  
3005  void kmem_cache_free(struct kmem_cache *s, void *x)
3006  {
3007  	s = cache_from_obj(s, x);
3008  	if (!s)
3009  		return;
3010  	slab_free(s, virt_to_head_page(x), x, NULL, 1, _RET_IP_);
3011  	trace_kmem_cache_free(_RET_IP_, x);
3012  }
3013  EXPORT_SYMBOL(kmem_cache_free);
3014  
3015  struct detached_freelist {
3016  	struct page *page;
3017  	void *tail;
3018  	void *freelist;
3019  	int cnt;
3020  	struct kmem_cache *s;
3021  };
3022  
3023  /*
3024   * This function progressively scans the array with free objects (with
3025   * a limited look ahead) and extracts objects belonging to the same
3026   * page.  It builds a detached freelist directly within the given
3027   * page/objects.  This can happen without any need for
3028   * synchronization, because the objects are owned by the running process.
3029   * The freelist is built up as a singly linked list in the objects.
3030   * The idea is that this detached freelist can then be bulk
3031   * transferred to the real freelist(s) while requiring only a single
3032   * synchronization primitive.  Look ahead in the array is limited for
3033   * performance reasons.
3034   */
3035  static inline
3036  int build_detached_freelist(struct kmem_cache *s, size_t size,
3037  			    void **p, struct detached_freelist *df)
3038  {
3039  	size_t first_skipped_index = 0;
3040  	int lookahead = 3;
3041  	void *object;
3042  	struct page *page;
3043  
3044  	/* Always re-init detached_freelist */
3045  	df->page = NULL;
3046  
3047  	do {
3048  		object = p[--size];
3049  		/* Do we need !ZERO_OR_NULL_PTR(object) here? (for kfree) */
3050  	} while (!object && size);
3051  
3052  	if (!object)
3053  		return 0;
3054  
3055  	page = virt_to_head_page(object);
3056  	if (!s) {
3057  		/* Handle kmalloc'ed objects */
3058  		if (unlikely(!PageSlab(page))) {
3059  			BUG_ON(!PageCompound(page));
3060  			kfree_hook(object);
3061  			__free_pages(page, compound_order(page));
3062  			p[size] = NULL; /* mark object processed */
3063  			return size;
3064  		}
3065  		/* Derive kmem_cache from object */
3066  		df->s = page->slab_cache;
3067  	} else {
3068  		df->s = cache_from_obj(s, object); /* Support for memcg */
3069  	}
3070  
3071  	/* Start new detached freelist */
3072  	df->page = page;
3073  	set_freepointer(df->s, object, NULL);
3074  	df->tail = object;
3075  	df->freelist = object;
3076  	p[size] = NULL; /* mark object processed */
3077  	df->cnt = 1;
3078  
3079  	while (size) {
3080  		object = p[--size];
3081  		if (!object)
3082  			continue; /* Skip processed objects */
3083  
3084  		/* df->page is always set at this point */
3085  		if (df->page == virt_to_head_page(object)) {
3086  			/* Opportunistically build the freelist */
3087  			set_freepointer(df->s, object, df->freelist);
3088  			df->freelist = object;
3089  			df->cnt++;
3090  			p[size] = NULL; /* mark object processed */
3091  
3092  			continue;
3093  		}
3094  
3095  		/* Limit look ahead search */
3096  		if (!--lookahead)
3097  			break;
3098  
3099  		if (!first_skipped_index)
3100  			first_skipped_index = size + 1;
3101  	}
3102  
3103  	return first_skipped_index;
3104  }
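/*
 * Illustrative example (not additional code): for p[] = { a0, b0, a1, a2 }
 * where the a* objects live in one slab page and b0 in another, the first
 * call builds the detached freelist a0 -> a1 -> a2 (tail a2, cnt == 3),
 * NULLs those array slots and returns 2, the index just past b0.
 * kmem_cache_free_bulk() then frees the whole chain with one slab_free()
 * call and loops again to pick up b0.
 */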
3105  
3106  /* Note that interrupts must be enabled when calling this function. */
3107  void kmem_cache_free_bulk(struct kmem_cache *s, size_t size, void **p)
3108  {
3109  	if (WARN_ON(!size))
3110  		return;
3111  
3112  	do {
3113  		struct detached_freelist df;
3114  
3115  		size = build_detached_freelist(s, size, p, &df);
3116  		if (!df.page)
3117  			continue;
3118  
3119  		slab_free(df.s, df.page, df.freelist, df.tail, df.cnt,_RET_IP_);
3120  	} while (likely(size));
3121  }
3122  EXPORT_SYMBOL(kmem_cache_free_bulk);
3123  
3124  /* Note that interrupts must be enabled when calling this function. */
3125  int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size,
3126  			  void **p)
3127  {
3128  	struct kmem_cache_cpu *c;
3129  	int i;
3130  
3131  	/* memcg and kmem_cache debug support */
3132  	s = slab_pre_alloc_hook(s, flags);
3133  	if (unlikely(!s))
3134  		return false;
3135  	/*
3136  	 * Drain objects in the per cpu slab, while disabling local
3137  	 * IRQs, which protects against PREEMPT and interrupt
3138  	 * handlers invoking the normal fastpath.
3139  	 */
3140  	local_irq_disable();
3141  	c = this_cpu_ptr(s->cpu_slab);
3142  
3143  	for (i = 0; i < size; i++) {
3144  		void *object = c->freelist;
3145  
3146  		if (unlikely(!object)) {
3147  			/*
3148  			 * Invoking the slow path likely has the side effect
3149  			 * of re-populating the per CPU c->freelist
3150  			 */
3151  			p[i] = ___slab_alloc(s, flags, NUMA_NO_NODE,
3152  					    _RET_IP_, c);
3153  			if (unlikely(!p[i]))
3154  				goto error;
3155  
3156  			c = this_cpu_ptr(s->cpu_slab);
3157  			continue; /* goto for-loop */
3158  		}
3159  		c->freelist = get_freepointer(s, object);
3160  		p[i] = object;
3161  	}
3162  	c->tid = next_tid(c->tid);
3163  	local_irq_enable();
3164  
3165  	/* Clear memory outside IRQ disabled fastpath loop */
3166  	if (unlikely(flags & __GFP_ZERO)) {
3167  		int j;
3168  
3169  		for (j = 0; j < i; j++)
3170  			memset(p[j], 0, s->object_size);
3171  	}
3172  
3173  	/* memcg and kmem_cache debug support */
3174  	slab_post_alloc_hook(s, flags, size, p);
3175  	return i;
3176  error:
3177  	local_irq_enable();
3178  	slab_post_alloc_hook(s, flags, i, p);
3179  	__kmem_cache_free_bulk(s, i, p);
3180  	return 0;
3181  }
3182  EXPORT_SYMBOL(kmem_cache_alloc_bulk);
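/*
 * Usage sketch for the bulk API (hypothetical caller, not part of this file):
 *
 *	void *objs[16];
 *	int n = kmem_cache_alloc_bulk(cache, GFP_KERNEL, ARRAY_SIZE(objs), objs);
 *
 *	if (!n)
 *		return -ENOMEM;
 *	...
 *	kmem_cache_free_bulk(cache, n, objs);
 *
 * kmem_cache_alloc_bulk() either fills the whole array and returns its size,
 * or frees everything it managed to allocate and returns 0, so callers never
 * see a partially populated array.
 */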
3183  
3184  
3185  /*
3186   * Object placement in a slab is made very easy because we always start at
3187   * offset 0. If we tune the size of the object to the alignment then we can
3188   * get the required alignment by putting one properly sized object after
3189   * another.
3190   *
3191   * Notice that the allocation order determines the sizes of the per cpu
3192   * caches. Each processor always has one slab available for allocations.
3193   * Increasing the allocation order reduces the number of times that slabs
3194   * must be moved on and off the partial lists and is therefore a factor in
3195   * locking overhead.
3196   */
3197  
3198  /*
3199   * Minimum / Maximum order of slab pages. This influences locking overhead
3200   * and slab fragmentation. A higher order reduces the number of partial slabs
3201   * and increases the number of allocations possible without having to
3202   * take the list_lock.
3203   */
3204  static unsigned int slub_min_order;
3205  static unsigned int slub_max_order = PAGE_ALLOC_COSTLY_ORDER;
3206  static unsigned int slub_min_objects;
3207  
3208  /*
3209   * Calculate the order of allocation given a slab object size.
3210   *
3211   * The order of allocation has significant impact on performance and other
3212   * system components. Generally order 0 allocations should be preferred since
3213   * order 0 does not cause fragmentation in the page allocator. Larger objects
3214   * be problematic to put into order 0 slabs because there may be too much
3215   * unused space left. We go to a higher order if more than 1/16th of the slab
3216   * would be wasted.
3217   *
3218   * In order to reach satisfactory performance we must ensure that a minimum
3219   * number of objects is in one slab. Otherwise we may generate too much
3220   * activity on the partial lists which requires taking the list_lock. This is
3221   * less a concern for large slabs though which are rarely used.
3222   *
3223   * slub_max_order specifies the order where we begin to stop considering the
3224   * number of objects in a slab as critical. If we reach slub_max_order then
3225   * we try to keep the page order as low as possible. So we accept more waste
3226   * of space in favor of a small page order.
3227   *
3228   * Higher order allocations also allow the placement of more objects in a
3229   * slab and thereby reduce object handling overhead. If the user has
3230   * requested a higher minimum order then we start with that one instead of
3231   * the smallest order which will fit the object.
3232   */
3233  static inline unsigned int slab_order(unsigned int size,
3234  		unsigned int min_objects, unsigned int max_order,
3235  		unsigned int fract_leftover)
3236  {
3237  	unsigned int min_order = slub_min_order;
3238  	unsigned int order;
3239  
3240  	if (order_objects(min_order, size) > MAX_OBJS_PER_PAGE)
3241  		return get_order(size * MAX_OBJS_PER_PAGE) - 1;
3242  
3243  	for (order = max(min_order, (unsigned int)get_order(min_objects * size));
3244  			order <= max_order; order++) {
3245  
3246  		unsigned int slab_size = (unsigned int)PAGE_SIZE << order;
3247  		unsigned int rem;
3248  
3249  		rem = slab_size % size;
3250  
3251  		if (rem <= slab_size / fract_leftover)
3252  			break;
3253  	}
3254  
3255  	return order;
3256  }
3257  
3258  static inline int calculate_order(unsigned int size)
3259  {
3260  	unsigned int order;
3261  	unsigned int min_objects;
3262  	unsigned int max_objects;
3263  
3264  	/*
3265  	 * Attempt to find best configuration for a slab. This
3266  	 * works by first attempting to generate a layout with
3267  	 * the best configuration and backing off gradually.
3268  	 *
3269  	 * First we increase the acceptable waste in a slab. Then
3270  	 * we reduce the minimum objects required in a slab.
3271  	 */
3272  	min_objects = slub_min_objects;
3273  	if (!min_objects)
3274  		min_objects = 4 * (fls(nr_cpu_ids) + 1);
3275  	max_objects = order_objects(slub_max_order, size);
3276  	min_objects = min(min_objects, max_objects);
3277  
3278  	while (min_objects > 1) {
3279  		unsigned int fraction;
3280  
3281  		fraction = 16;
3282  		while (fraction >= 4) {
3283  			order = slab_order(size, min_objects,
3284  					slub_max_order, fraction);
3285  			if (order <= slub_max_order)
3286  				return order;
3287  			fraction /= 2;
3288  		}
3289  		min_objects--;
3290  	}
3291  
3292  	/*
3293  	 * We were unable to place multiple objects in a slab. Now
3294  	 * lets see if we can place a single object there.
3295  	 */
3296  	order = slab_order(size, 1, slub_max_order, 1);
3297  	if (order <= slub_max_order)
3298  		return order;
3299  
3300  	/*
3301  	 * Doh this slab cannot be placed using slub_max_order.
3302  	 */
3303  	order = slab_order(size, 1, MAX_ORDER, 1);
3304  	if (order < MAX_ORDER)
3305  		return order;
3306  	return -ENOSYS;
3307  }
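/*
 * Worked example (illustrative, assuming 4KiB pages, the default
 * slub_max_order of PAGE_ALLOC_COSTLY_ORDER == 3 and a single possible cpu so
 * that min_objects == 4 * (fls(1) + 1) == 8): for size == 256, slab_order()
 * starts at order 0 where 16 objects fit with no leftover, so order 0 is
 * chosen. For size == 720 it starts at order 1, the smallest order holding
 * 8 objects; an order-1 slab holds 11 objects and wastes 272 bytes, which is
 * below 8192 / 16, so order 1 is accepted on the first fraction == 16 pass.
 */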
3308  
3309  static void
3310  init_kmem_cache_node(struct kmem_cache_node *n)
3311  {
3312  	n->nr_partial = 0;
3313  	spin_lock_init(&n->list_lock);
3314  	INIT_LIST_HEAD(&n->partial);
3315  #ifdef CONFIG_SLUB_DEBUG
3316  	atomic_long_set(&n->nr_slabs, 0);
3317  	atomic_long_set(&n->total_objects, 0);
3318  	INIT_LIST_HEAD(&n->full);
3319  #endif
3320  }
3321  
3322  static inline int alloc_kmem_cache_cpus(struct kmem_cache *s)
3323  {
3324  	BUILD_BUG_ON(PERCPU_DYNAMIC_EARLY_SIZE <
3325  			KMALLOC_SHIFT_HIGH * sizeof(struct kmem_cache_cpu));
3326  
3327  	/*
3328  	 * Must align to double word boundary for the double cmpxchg
3329  	 * instructions to work; see __pcpu_double_call_return_bool().
3330  	 */
3331  	s->cpu_slab = __alloc_percpu(sizeof(struct kmem_cache_cpu),
3332  				     2 * sizeof(void *));
3333  
3334  	if (!s->cpu_slab)
3335  		return 0;
3336  
3337  	init_kmem_cache_cpus(s);
3338  
3339  	return 1;
3340  }
3341  
3342  static struct kmem_cache *kmem_cache_node;
3343  
3344  /*
3345   * No kmalloc_node yet so do it by hand. We know that this is the first
3346   * slab on the node for this slabcache. There are no concurrent accesses
3347   * possible.
3348   *
3349   * Note that this function only works on the kmem_cache_node
3350   * when allocating for the kmem_cache_node. This is used for bootstrapping
3351   * memory on a fresh node that has no slab structures yet.
3352   */
3353  static void early_kmem_cache_node_alloc(int node)
3354  {
3355  	struct page *page;
3356  	struct kmem_cache_node *n;
3357  
3358  	BUG_ON(kmem_cache_node->size < sizeof(struct kmem_cache_node));
3359  
3360  	page = new_slab(kmem_cache_node, GFP_NOWAIT, node);
3361  
3362  	BUG_ON(!page);
3363  	if (page_to_nid(page) != node) {
3364  		pr_err("SLUB: Unable to allocate memory from node %d\n", node);
3365  		pr_err("SLUB: Allocating a useless per node structure in order to be able to continue\n");
3366  	}
3367  
3368  	n = page->freelist;
3369  	BUG_ON(!n);
3370  #ifdef CONFIG_SLUB_DEBUG
3371  	init_object(kmem_cache_node, n, SLUB_RED_ACTIVE);
3372  	init_tracking(kmem_cache_node, n);
3373  #endif
3374  	n = kasan_kmalloc(kmem_cache_node, n, sizeof(struct kmem_cache_node),
3375  		      GFP_KERNEL);
3376  	page->freelist = get_freepointer(kmem_cache_node, n);
3377  	page->inuse = 1;
3378  	page->frozen = 0;
3379  	kmem_cache_node->node[node] = n;
3380  	init_kmem_cache_node(n);
3381  	inc_slabs_node(kmem_cache_node, node, page->objects);
3382  
3383  	/*
3384  	 * No locks need to be taken here as it has just been
3385  	 * initialized and there is no concurrent access.
3386  	 */
3387  	__add_partial(n, page, DEACTIVATE_TO_HEAD);
3388  }
3389  
3390  static void free_kmem_cache_nodes(struct kmem_cache *s)
3391  {
3392  	int node;
3393  	struct kmem_cache_node *n;
3394  
3395  	for_each_kmem_cache_node(s, node, n) {
3396  		s->node[node] = NULL;
3397  		kmem_cache_free(kmem_cache_node, n);
3398  	}
3399  }
3400  
3401  void __kmem_cache_release(struct kmem_cache *s)
3402  {
3403  	cache_random_seq_destroy(s);
3404  	free_percpu(s->cpu_slab);
3405  	free_kmem_cache_nodes(s);
3406  }
3407  
3408  static int init_kmem_cache_nodes(struct kmem_cache *s)
3409  {
3410  	int node;
3411  
3412  	for_each_node_state(node, N_NORMAL_MEMORY) {
3413  		struct kmem_cache_node *n;
3414  
3415  		if (slab_state == DOWN) {
3416  			early_kmem_cache_node_alloc(node);
3417  			continue;
3418  		}
3419  		n = kmem_cache_alloc_node(kmem_cache_node,
3420  						GFP_KERNEL, node);
3421  
3422  		if (!n) {
3423  			free_kmem_cache_nodes(s);
3424  			return 0;
3425  		}
3426  
3427  		init_kmem_cache_node(n);
3428  		s->node[node] = n;
3429  	}
3430  	return 1;
3431  }
3432  
3433  static void set_min_partial(struct kmem_cache *s, unsigned long min)
3434  {
3435  	if (min < MIN_PARTIAL)
3436  		min = MIN_PARTIAL;
3437  	else if (min > MAX_PARTIAL)
3438  		min = MAX_PARTIAL;
3439  	s->min_partial = min;
3440  }
3441  
3442  static void set_cpu_partial(struct kmem_cache *s)
3443  {
3444  #ifdef CONFIG_SLUB_CPU_PARTIAL
3445  	/*
3446  	 * cpu_partial determines the maximum number of objects kept in the
3447  	 * per cpu partial lists of a processor.
3448  	 *
3449  	 * Per cpu partial lists mainly contain slabs that just have one
3450  	 * object freed. If they are used for allocation then they can be
3451  	 * filled up again with minimal effort. The slab will never hit the
3452  	 * per node partial lists and therefore no locking will be required.
3453  	 *
3454  	 * This setting also determines
3455  	 *
3456  	 * A) The number of objects from per cpu partial slabs dumped to the
3457  	 *    per node list when we reach the limit.
3458  	 * B) The number of objects in cpu partial slabs to extract from the
3459  	 *    per node list when we run out of per cpu objects. We only fetch
3460  	 *    50% to keep some capacity around for frees.
3461  	 */
3462  	if (!kmem_cache_has_cpu_partial(s))
3463  		s->cpu_partial = 0;
3464  	else if (s->size >= PAGE_SIZE)
3465  		s->cpu_partial = 2;
3466  	else if (s->size >= 1024)
3467  		s->cpu_partial = 6;
3468  	else if (s->size >= 256)
3469  		s->cpu_partial = 13;
3470  	else
3471  		s->cpu_partial = 30;
3472  #endif
3473  }
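/*
 * Illustrative figures derived from the table above: a cache with size < 256
 * keeps up to roughly 30 free objects on a cpu's partial slabs before
 * put_cpu_partial() drains them to the node list, while a cache with
 * size >= PAGE_SIZE keeps only about 2, since each such slab already
 * represents a large amount of memory.
 */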
3474  
3475  /*
3476   * calculate_sizes() determines the order and the distribution of data within
3477   * a slab object.
3478   */
3479  static int calculate_sizes(struct kmem_cache *s, int forced_order)
3480  {
3481  	slab_flags_t flags = s->flags;
3482  	unsigned int size = s->object_size;
3483  	unsigned int order;
3484  
3485  	/*
3486  	 * Round up object size to the next word boundary. We can only
3487  	 * place the free pointer at word boundaries and this determines
3488  	 * the possible location of the free pointer.
3489  	 */
3490  	size = ALIGN(size, sizeof(void *));
3491  
3492  #ifdef CONFIG_SLUB_DEBUG
3493  	/*
3494  	 * Determine if we can poison the object itself. If the user of
3495  	 * the slab may touch the object after free or before allocation
3496  	 * then we should never poison the object itself.
3497  	 */
3498  	if ((flags & SLAB_POISON) && !(flags & SLAB_TYPESAFE_BY_RCU) &&
3499  			!s->ctor)
3500  		s->flags |= __OBJECT_POISON;
3501  	else
3502  		s->flags &= ~__OBJECT_POISON;
3503  
3504  
3505  	/*
3506  	 * If we are Redzoning then check if there is some space between the
3507  	 * end of the object and the free pointer. If not then add an
3508  	 * additional word to have some bytes to store Redzone information.
3509  	 */
3510  	if ((flags & SLAB_RED_ZONE) && size == s->object_size)
3511  		size += sizeof(void *);
3512  #endif
3513  
3514  	/*
3515  	 * With that we have determined the number of bytes in actual use
3516  	 * by the object. This is the potential offset to the free pointer.
3517  	 */
3518  	s->inuse = size;
3519  
3520  	if (((flags & (SLAB_TYPESAFE_BY_RCU | SLAB_POISON)) ||
3521  		s->ctor)) {
3522  		/*
3523  		 * Relocate free pointer after the object if it is not
3524  		 * permitted to overwrite the first word of the object on
3525  		 * kmem_cache_free.
3526  		 *
3527  		 * This is the case if we do RCU, have a constructor or
3528  		 * destructor or are poisoning the objects.
3529  		 */
3530  		s->offset = size;
3531  		size += sizeof(void *);
3532  	}
3533  
3534  #ifdef CONFIG_SLUB_DEBUG
3535  	if (flags & SLAB_STORE_USER)
3536  		/*
3537  		 * Need to store information about allocs and frees after
3538  		 * the object.
3539  		 */
3540  		size += 2 * sizeof(struct track);
3541  #endif
3542  
3543  	kasan_cache_create(s, &size, &s->flags);
3544  #ifdef CONFIG_SLUB_DEBUG
3545  	if (flags & SLAB_RED_ZONE) {
3546  		/*
3547  		 * Add some empty padding so that we can catch
3548  		 * overwrites from earlier objects rather than let
3549  		 * tracking information or the free pointer be
3550  		 * corrupted if a user writes before the start
3551  		 * of the object.
3552  		 */
3553  		size += sizeof(void *);
3554  
3555  		s->red_left_pad = sizeof(void *);
3556  		s->red_left_pad = ALIGN(s->red_left_pad, s->align);
3557  		size += s->red_left_pad;
3558  	}
3559  #endif
3560  
3561  	/*
3562  	 * SLUB stores one object immediately after another beginning from
3563  	 * offset 0. In order to align the objects we have to simply size
3564  	 * each object to conform to the alignment.
3565  	 */
3566  	size = ALIGN(size, s->align);
3567  	s->size = size;
3568  	if (forced_order >= 0)
3569  		order = forced_order;
3570  	else
3571  		order = calculate_order(size);
3572  
3573  	if ((int)order < 0)
3574  		return 0;
3575  
3576  	s->allocflags = 0;
3577  	if (order)
3578  		s->allocflags |= __GFP_COMP;
3579  
3580  	if (s->flags & SLAB_CACHE_DMA)
3581  		s->allocflags |= GFP_DMA;
3582  
3583  	if (s->flags & SLAB_CACHE_DMA32)
3584  		s->allocflags |= GFP_DMA32;
3585  
3586  	if (s->flags & SLAB_RECLAIM_ACCOUNT)
3587  		s->allocflags |= __GFP_RECLAIMABLE;
3588  
3589  	/*
3590  	 * Determine the number of objects per slab
3591  	 */
3592  	s->oo = oo_make(order, size);
3593  	s->min = oo_make(get_order(size), size);
3594  	if (oo_objects(s->oo) > oo_objects(s->max))
3595  		s->max = s->oo;
3596  
3597  	return !!oo_objects(s->oo);
3598  }
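/*
 * Layout sketch (illustrative, assuming sizeof(void *) == 8 and an 8 byte
 * alignment): for a cache with object_size == 20, no constructor and no debug
 * flags, calculate_sizes() rounds the object up to 24 bytes, leaves s->offset
 * at 0 so the free pointer overlays the first word of a free object, and
 * s->size stays 24. With SLAB_POISON or a constructor the free pointer may no
 * longer share the object, so s->offset becomes 24 and s->size grows to 32.
 */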
3599  
3600  static int kmem_cache_open(struct kmem_cache *s, slab_flags_t flags)
3601  {
3602  	s->flags = kmem_cache_flags(s->size, flags, s->name, s->ctor);
3603  #ifdef CONFIG_SLAB_FREELIST_HARDENED
3604  	s->random = get_random_long();
3605  #endif
3606  
3607  	if (!calculate_sizes(s, -1))
3608  		goto error;
3609  	if (disable_higher_order_debug) {
3610  		/*
3611  		 * Disable debugging flags that store metadata if the min slab
3612  		 * order increased.
3613  		 */
3614  		if (get_order(s->size) > get_order(s->object_size)) {
3615  			s->flags &= ~DEBUG_METADATA_FLAGS;
3616  			s->offset = 0;
3617  			if (!calculate_sizes(s, -1))
3618  				goto error;
3619  		}
3620  	}
3621  
3622  #if defined(CONFIG_HAVE_CMPXCHG_DOUBLE) && \
3623      defined(CONFIG_HAVE_ALIGNED_STRUCT_PAGE)
3624  	if (system_has_cmpxchg_double() && (s->flags & SLAB_NO_CMPXCHG) == 0)
3625  		/* Enable fast mode */
3626  		s->flags |= __CMPXCHG_DOUBLE;
3627  #endif
3628  
3629  	/*
3630  	 * The larger the object size is, the more pages we want on the partial
3631  	 * list to avoid pounding the page allocator excessively.
3632  	 */
3633  	set_min_partial(s, ilog2(s->size) / 2);
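	/*
	 * Worked example (illustrative): for s->size == 4096, ilog2(4096) / 2
	 * is 6, so an empty slab is only returned to the page allocator once
	 * the node already holds at least six partial slabs; set_min_partial()
	 * may additionally clamp the value to its MIN_PARTIAL/MAX_PARTIAL
	 * bounds.
	 */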
3634  
3635  	set_cpu_partial(s);
3636  
3637  #ifdef CONFIG_NUMA
3638  	s->remote_node_defrag_ratio = 1000;
3639  #endif
3640  
3641  	/* Initialize the pre-computed randomized freelist if slab is up */
3642  	if (slab_state >= UP) {
3643  		if (init_cache_random_seq(s))
3644  			goto error;
3645  	}
3646  
3647  	if (!init_kmem_cache_nodes(s))
3648  		goto error;
3649  
3650  	if (alloc_kmem_cache_cpus(s))
3651  		return 0;
3652  
3653  	free_kmem_cache_nodes(s);
3654  error:
3655  	if (flags & SLAB_PANIC)
3656  		panic("Cannot create slab %s size=%u realsize=%u order=%u offset=%u flags=%lx\n",
3657  		      s->name, s->size, s->size,
3658  		      oo_order(s->oo), s->offset, (unsigned long)flags);
3659  	return -EINVAL;
3660  }
3661  
3662  static void list_slab_objects(struct kmem_cache *s, struct page *page,
3663  							const char *text)
3664  {
3665  #ifdef CONFIG_SLUB_DEBUG
3666  	void *addr = page_address(page);
3667  	void *p;
3668  	unsigned long *map = bitmap_zalloc(page->objects, GFP_ATOMIC);
3669  	if (!map)
3670  		return;
3671  	slab_err(s, page, text, s->name);
3672  	slab_lock(page);
3673  
3674  	get_map(s, page, map);
3675  	for_each_object(p, s, addr, page->objects) {
3676  
3677  		if (!test_bit(slab_index(p, s, addr), map)) {
3678  			pr_err("INFO: Object 0x%p @offset=%tu\n", p, p - addr);
3679  			print_tracking(s, p);
3680  		}
3681  	}
3682  	slab_unlock(page);
3683  	bitmap_free(map);
3684  #endif
3685  }
3686  
3687  /*
3688   * Attempt to free all partial slabs on a node.
3689   * This is called from __kmem_cache_shutdown(). We must take list_lock
3690   * because a sysfs file might still access the partial list after shutdown.
3691   */
3692  static void free_partial(struct kmem_cache *s, struct kmem_cache_node *n)
3693  {
3694  	LIST_HEAD(discard);
3695  	struct page *page, *h;
3696  
3697  	BUG_ON(irqs_disabled());
3698  	spin_lock_irq(&n->list_lock);
3699  	list_for_each_entry_safe(page, h, &n->partial, slab_list) {
3700  		if (!page->inuse) {
3701  			remove_partial(n, page);
3702  			list_add(&page->slab_list, &discard);
3703  		} else {
3704  			list_slab_objects(s, page,
3705  			"Objects remaining in %s on __kmem_cache_shutdown()");
3706  		}
3707  	}
3708  	spin_unlock_irq(&n->list_lock);
3709  
3710  	list_for_each_entry_safe(page, h, &discard, slab_list)
3711  		discard_slab(s, page);
3712  }
3713  
3714  bool __kmem_cache_empty(struct kmem_cache *s)
3715  {
3716  	int node;
3717  	struct kmem_cache_node *n;
3718  
3719  	for_each_kmem_cache_node(s, node, n)
3720  		if (n->nr_partial || slabs_node(s, node))
3721  			return false;
3722  	return true;
3723  }
3724  
3725  /*
3726   * Release all resources used by a slab cache.
3727   */
3728  int __kmem_cache_shutdown(struct kmem_cache *s)
3729  {
3730  	int node;
3731  	struct kmem_cache_node *n;
3732  
3733  	flush_all(s);
3734  	/* Attempt to free all objects */
3735  	for_each_kmem_cache_node(s, node, n) {
3736  		free_partial(s, n);
3737  		if (n->nr_partial || slabs_node(s, node))
3738  			return 1;
3739  	}
3740  	sysfs_slab_remove(s);
3741  	return 0;
3742  }
3743  
3744  /********************************************************************
3745   *		Kmalloc subsystem
3746   *******************************************************************/
3747  
3748  static int __init setup_slub_min_order(char *str)
3749  {
3750  	get_option(&str, (int *)&slub_min_order);
3751  
3752  	return 1;
3753  }
3754  
3755  __setup("slub_min_order=", setup_slub_min_order);
3756  
3757  static int __init setup_slub_max_order(char *str)
3758  {
3759  	get_option(&str, (int *)&slub_max_order);
3760  	slub_max_order = min(slub_max_order, (unsigned int)MAX_ORDER - 1);
3761  
3762  	return 1;
3763  }
3764  
3765  __setup("slub_max_order=", setup_slub_max_order);
3766  
3767  static int __init setup_slub_min_objects(char *str)
3768  {
3769  	get_option(&str, (int *)&slub_min_objects);
3770  
3771  	return 1;
3772  }
3773  
3774  __setup("slub_min_objects=", setup_slub_min_objects);
3775  
3776  void *__kmalloc(size_t size, gfp_t flags)
3777  {
3778  	struct kmem_cache *s;
3779  	void *ret;
3780  
3781  	if (unlikely(size > KMALLOC_MAX_CACHE_SIZE))
3782  		return kmalloc_large(size, flags);
3783  
3784  	s = kmalloc_slab(size, flags);
3785  
3786  	if (unlikely(ZERO_OR_NULL_PTR(s)))
3787  		return s;
3788  
3789  	ret = slab_alloc(s, flags, _RET_IP_);
3790  
3791  	trace_kmalloc(_RET_IP_, ret, size, s->size, flags);
3792  
3793  	ret = kasan_kmalloc(s, ret, size, flags);
3794  
3795  	return ret;
3796  }
3797  EXPORT_SYMBOL(__kmalloc);
3798  
3799  #ifdef CONFIG_NUMA
3800  static void *kmalloc_large_node(size_t size, gfp_t flags, int node)
3801  {
3802  	struct page *page;
3803  	void *ptr = NULL;
3804  
3805  	flags |= __GFP_COMP;
3806  	page = alloc_pages_node(node, flags, get_order(size));
3807  	if (page)
3808  		ptr = page_address(page);
3809  
3810  	return kmalloc_large_node_hook(ptr, size, flags);
3811  }
3812  
3813  void *__kmalloc_node(size_t size, gfp_t flags, int node)
3814  {
3815  	struct kmem_cache *s;
3816  	void *ret;
3817  
3818  	if (unlikely(size > KMALLOC_MAX_CACHE_SIZE)) {
3819  		ret = kmalloc_large_node(size, flags, node);
3820  
3821  		trace_kmalloc_node(_RET_IP_, ret,
3822  				   size, PAGE_SIZE << get_order(size),
3823  				   flags, node);
3824  
3825  		return ret;
3826  	}
3827  
3828  	s = kmalloc_slab(size, flags);
3829  
3830  	if (unlikely(ZERO_OR_NULL_PTR(s)))
3831  		return s;
3832  
3833  	ret = slab_alloc_node(s, flags, node, _RET_IP_);
3834  
3835  	trace_kmalloc_node(_RET_IP_, ret, size, s->size, flags, node);
3836  
3837  	ret = kasan_kmalloc(s, ret, size, flags);
3838  
3839  	return ret;
3840  }
3841  EXPORT_SYMBOL(__kmalloc_node);
3842  #endif	/* CONFIG_NUMA */
3843  
3844  #ifdef CONFIG_HARDENED_USERCOPY
3845  /*
3846   * Rejects incorrectly sized objects and objects that are to be copied
3847   * to/from userspace but do not fall entirely within the containing slab
3848   * cache's usercopy region.
3849   *
3850   * Returns NULL if check passes, otherwise const char * to name of cache
3851   * to indicate an error.
3852   */
3853  void __check_heap_object(const void *ptr, unsigned long n, struct page *page,
3854  			 bool to_user)
3855  {
3856  	struct kmem_cache *s;
3857  	unsigned int offset;
3858  	size_t object_size;
3859  
3860  	ptr = kasan_reset_tag(ptr);
3861  
3862  	/* Find object and usable object size. */
3863  	s = page->slab_cache;
3864  
3865  	/* Reject impossible pointers. */
3866  	if (ptr < page_address(page))
3867  		usercopy_abort("SLUB object not in SLUB page?!", NULL,
3868  			       to_user, 0, n);
3869  
3870  	/* Find offset within object. */
3871  	offset = (ptr - page_address(page)) % s->size;
3872  
3873  	/* Adjust for redzone and reject if within the redzone. */
3874  	if (kmem_cache_debug(s) && s->flags & SLAB_RED_ZONE) {
3875  		if (offset < s->red_left_pad)
3876  			usercopy_abort("SLUB object in left red zone",
3877  				       s->name, to_user, offset, n);
3878  		offset -= s->red_left_pad;
3879  	}
3880  
3881  	/* Allow address range falling entirely within usercopy region. */
3882  	if (offset >= s->useroffset &&
3883  	    offset - s->useroffset <= s->usersize &&
3884  	    n <= s->useroffset - offset + s->usersize)
3885  		return;
3886  
3887  	/*
3888  	 * If the copy is still within the allocated object, produce
3889  	 * a warning instead of rejecting the copy. This is intended
3890  	 * to be a temporary method to find any missing usercopy
3891  	 * whitelists.
3892  	 */
3893  	object_size = slab_ksize(s);
3894  	if (usercopy_fallback &&
3895  	    offset <= object_size && n <= object_size - offset) {
3896  		usercopy_warn("SLUB object", s->name, to_user, offset, n);
3897  		return;
3898  	}
3899  
3900  	usercopy_abort("SLUB object", s->name, to_user, offset, n);
3901  }
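/*
 * Worked example with made-up numbers: for a cache with s->size == 256,
 * s->useroffset == 16 and s->usersize == 64, a copy starting at object
 * offset 32 is accepted for n <= 48 (16 - 32 + 64), while a copy of
 * n == 64 from the same offset misses the whitelist and ends up in the
 * usercopy_fallback warning or in usercopy_abort().
 */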
3902  #endif /* CONFIG_HARDENED_USERCOPY */
3903  
3904  static size_t __ksize(const void *object)
3905  {
3906  	struct page *page;
3907  
3908  	if (unlikely(object == ZERO_SIZE_PTR))
3909  		return 0;
3910  
3911  	page = virt_to_head_page(object);
3912  
3913  	if (unlikely(!PageSlab(page))) {
3914  		WARN_ON(!PageCompound(page));
3915  		return PAGE_SIZE << compound_order(page);
3916  	}
3917  
3918  	return slab_ksize(page->slab_cache);
3919  }
3920  
3921  size_t ksize(const void *object)
3922  {
3923  	size_t size = __ksize(object);
3924  	/* We assume that ksize callers could use whole allocated area,
3925  	 * so we need to unpoison this area.
3926  	 */
3927  	kasan_unpoison_shadow(object, size);
3928  	return size;
3929  }
3930  EXPORT_SYMBOL(ksize);
3931  
3932  void kfree(const void *x)
3933  {
3934  	struct page *page;
3935  	void *object = (void *)x;
3936  
3937  	trace_kfree(_RET_IP_, x);
3938  
3939  	if (unlikely(ZERO_OR_NULL_PTR(x)))
3940  		return;
3941  
3942  	page = virt_to_head_page(x);
3943  	if (unlikely(!PageSlab(page))) {
3944  		BUG_ON(!PageCompound(page));
3945  		kfree_hook(object);
3946  		__free_pages(page, compound_order(page));
3947  		return;
3948  	}
3949  	slab_free(page->slab_cache, page, object, NULL, 1, _RET_IP_);
3950  }
3951  EXPORT_SYMBOL(kfree);
3952  
3953  #define SHRINK_PROMOTE_MAX 32
3954  
3955  /*
3956   * kmem_cache_shrink discards empty slabs and promotes the slabs filled
3957   * up most to the head of the partial lists. New allocations will then
3958   * fill those up and thus they can be removed from the partial lists.
3959   *
3960   * The slabs with the fewest objects in use are placed last. This means
3961   * they are allocated from last, which increases the chance that their
3962   * remaining objects are freed and the slabs can then be discarded.
3963   */
3964  int __kmem_cache_shrink(struct kmem_cache *s)
3965  {
3966  	int node;
3967  	int i;
3968  	struct kmem_cache_node *n;
3969  	struct page *page;
3970  	struct page *t;
3971  	struct list_head discard;
3972  	struct list_head promote[SHRINK_PROMOTE_MAX];
3973  	unsigned long flags;
3974  	int ret = 0;
3975  
3976  	flush_all(s);
3977  	for_each_kmem_cache_node(s, node, n) {
3978  		INIT_LIST_HEAD(&discard);
3979  		for (i = 0; i < SHRINK_PROMOTE_MAX; i++)
3980  			INIT_LIST_HEAD(promote + i);
3981  
3982  		spin_lock_irqsave(&n->list_lock, flags);
3983  
3984  		/*
3985  		 * Build lists of slabs to discard or promote.
3986  		 *
3987  		 * Note that concurrent frees may occur while we hold the
3988  		 * list_lock. page->inuse here is the upper limit.
3989  		 */
3990  		list_for_each_entry_safe(page, t, &n->partial, slab_list) {
3991  			int free = page->objects - page->inuse;
3992  
3993  			/* Do not reread page->inuse */
3994  			barrier();
3995  
3996  			/* We do not keep full slabs on the list */
3997  			BUG_ON(free <= 0);
3998  
3999  			if (free == page->objects) {
4000  				list_move(&page->slab_list, &discard);
4001  				n->nr_partial--;
4002  			} else if (free <= SHRINK_PROMOTE_MAX)
4003  				list_move(&page->slab_list, promote + free - 1);
4004  		}
4005  
4006  		/*
4007  		 * Promote the slabs filled up most to the head of the
4008  		 * partial list.
4009  		 */
4010  		for (i = SHRINK_PROMOTE_MAX - 1; i >= 0; i--)
4011  			list_splice(promote + i, &n->partial);
4012  
4013  		spin_unlock_irqrestore(&n->list_lock, flags);
4014  
4015  		/* Release empty slabs */
4016  		list_for_each_entry_safe(page, t, &discard, slab_list)
4017  			discard_slab(s, page);
4018  
4019  		if (slabs_node(s, node))
4020  			ret = 1;
4021  	}
4022  
4023  	return ret;
4024  }
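/*
 * Illustrative example of the bucketing above: a partial slab with
 * page->objects == 32 and page->inuse == 31 has one free object, lands on
 * promote[0] and, because promote[0] is spliced last, ends up right at the
 * head of n->partial; a slab with inuse == 0 is moved to the discard list
 * and freed once list_lock has been dropped.
 */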
4025  
4026  #ifdef CONFIG_MEMCG
4027  static void kmemcg_cache_deact_after_rcu(struct kmem_cache *s)
4028  {
4029  	/*
4030  	 * Called with all the locks held after a sched RCU grace period.
4031  	 * Even if @s becomes empty after shrinking, we can't know that @s
4032  	 * doesn't have allocations already in-flight and thus can't
4033  	 * destroy @s until the associated memcg is released.
4034  	 *
4035  	 * However, let's remove the sysfs files for empty caches here.
4036  	 * Each cache has a lot of interface files which aren't
4037  	 * particularly useful for empty, draining caches; without removing
4038  	 * them we can easily end up with millions of unnecessary sysfs files
4039  	 * on systems which have a lot of memory and transient cgroups.
4040  	 */
4041  	if (!__kmem_cache_shrink(s))
4042  		sysfs_slab_remove(s);
4043  }
4044  
4045  void __kmemcg_cache_deactivate(struct kmem_cache *s)
4046  {
4047  	/*
4048  	 * Disable empty slabs caching. Used to avoid pinning offline
4049  	 * memory cgroups by kmem pages that can be freed.
4050  	 */
4051  	slub_set_cpu_partial(s, 0);
4052  	s->min_partial = 0;
4053  
4054  	/*
4055  	 * s->cpu_partial is checked locklessly (see put_cpu_partial), so
4056  	 * we have to make sure the change is visible before shrinking.
4057  	 */
4058  	slab_deactivate_memcg_cache_rcu_sched(s, kmemcg_cache_deact_after_rcu);
4059  }
4060  #endif	/* CONFIG_MEMCG */
4061  
4062  static int slab_mem_going_offline_callback(void *arg)
4063  {
4064  	struct kmem_cache *s;
4065  
4066  	mutex_lock(&slab_mutex);
4067  	list_for_each_entry(s, &slab_caches, list)
4068  		__kmem_cache_shrink(s);
4069  	mutex_unlock(&slab_mutex);
4070  
4071  	return 0;
4072  }
4073  
4074  static void slab_mem_offline_callback(void *arg)
4075  {
4076  	struct kmem_cache_node *n;
4077  	struct kmem_cache *s;
4078  	struct memory_notify *marg = arg;
4079  	int offline_node;
4080  
4081  	offline_node = marg->status_change_nid_normal;
4082  
4083  	/*
4084  	 * If the node still has available normal memory, we still need the
4085  	 * kmem_cache_node for it, so there is nothing to do.
4086  	 */
4087  	if (offline_node < 0)
4088  		return;
4089  
4090  	mutex_lock(&slab_mutex);
4091  	list_for_each_entry(s, &slab_caches, list) {
4092  		n = get_node(s, offline_node);
4093  		if (n) {
4094  			/*
4095  			 * If n->nr_slabs > 0, slabs still exist on the node
4096  			 * that is going down. We were unable to free them,
4097  			 * and offline_pages() should not have invoked this
4098  			 * callback. So, we must fail.
4099  			 */
4100  			BUG_ON(slabs_node(s, offline_node));
4101  
4102  			s->node[offline_node] = NULL;
4103  			kmem_cache_free(kmem_cache_node, n);
4104  		}
4105  	}
4106  	mutex_unlock(&slab_mutex);
4107  }
4108  
4109  static int slab_mem_going_online_callback(void *arg)
4110  {
4111  	struct kmem_cache_node *n;
4112  	struct kmem_cache *s;
4113  	struct memory_notify *marg = arg;
4114  	int nid = marg->status_change_nid_normal;
4115  	int ret = 0;
4116  
4117  	/*
4118  	 * If the node's memory is already available, then kmem_cache_node is
4119  	 * already created. Nothing to do.
4120  	 */
4121  	if (nid < 0)
4122  		return 0;
4123  
4124  	/*
4125  	 * We are bringing a node online. No memory is available yet. We must
4126  	 * allocate a kmem_cache_node structure in order to bring the node
4127  	 * online.
4128  	 */
4129  	mutex_lock(&slab_mutex);
4130  	list_for_each_entry(s, &slab_caches, list) {
4131  		/*
4132  		 * XXX: kmem_cache_alloc_node will fall back to other nodes
4133  		 *      since memory is not yet available from the node that
4134  		 *      is brought up.
4135  		 */
4136  		n = kmem_cache_alloc(kmem_cache_node, GFP_KERNEL);
4137  		if (!n) {
4138  			ret = -ENOMEM;
4139  			goto out;
4140  		}
4141  		init_kmem_cache_node(n);
4142  		s->node[nid] = n;
4143  	}
4144  out:
4145  	mutex_unlock(&slab_mutex);
4146  	return ret;
4147  }
4148  
4149  static int slab_memory_callback(struct notifier_block *self,
4150  				unsigned long action, void *arg)
4151  {
4152  	int ret = 0;
4153  
4154  	switch (action) {
4155  	case MEM_GOING_ONLINE:
4156  		ret = slab_mem_going_online_callback(arg);
4157  		break;
4158  	case MEM_GOING_OFFLINE:
4159  		ret = slab_mem_going_offline_callback(arg);
4160  		break;
4161  	case MEM_OFFLINE:
4162  	case MEM_CANCEL_ONLINE:
4163  		slab_mem_offline_callback(arg);
4164  		break;
4165  	case MEM_ONLINE:
4166  	case MEM_CANCEL_OFFLINE:
4167  		break;
4168  	}
4169  	if (ret)
4170  		ret = notifier_from_errno(ret);
4171  	else
4172  		ret = NOTIFY_OK;
4173  	return ret;
4174  }
4175  
4176  static struct notifier_block slab_memory_callback_nb = {
4177  	.notifier_call = slab_memory_callback,
4178  	.priority = SLAB_CALLBACK_PRI,
4179  };
4180  
4181  /********************************************************************
4182   *			Basic setup of slabs
4183   *******************************************************************/
4184  
4185  /*
4186   * Used for early kmem_cache structures that were statically allocated at
4187   * boot. Allocate them properly, then fix up the pointers that may be
4188   * pointing to the wrong kmem_cache structure.
4189   */
4190  
4191  static struct kmem_cache * __init bootstrap(struct kmem_cache *static_cache)
4192  {
4193  	int node;
4194  	struct kmem_cache *s = kmem_cache_zalloc(kmem_cache, GFP_NOWAIT);
4195  	struct kmem_cache_node *n;
4196  
4197  	memcpy(s, static_cache, kmem_cache->object_size);
4198  
4199  	/*
4200  	 * This runs very early, and only the boot processor is supposed to be
4201  	 * up.  Even if that were not true, interrupts are not enabled yet, so
4202  	 * we could not fire IPIs around anyway.
4203  	 */
4204  	__flush_cpu_slab(s, smp_processor_id());
4205  	for_each_kmem_cache_node(s, node, n) {
4206  		struct page *p;
4207  
4208  		list_for_each_entry(p, &n->partial, slab_list)
4209  			p->slab_cache = s;
4210  
4211  #ifdef CONFIG_SLUB_DEBUG
4212  		list_for_each_entry(p, &n->full, slab_list)
4213  			p->slab_cache = s;
4214  #endif
4215  	}
4216  	slab_init_memcg_params(s);
4217  	list_add(&s->list, &slab_caches);
4218  	memcg_link_cache(s);
4219  	return s;
4220  }
4221  
4222  void __init kmem_cache_init(void)
4223  {
4224  	static __initdata struct kmem_cache boot_kmem_cache,
4225  		boot_kmem_cache_node;
4226  
4227  	if (debug_guardpage_minorder())
4228  		slub_max_order = 0;
4229  
4230  	kmem_cache_node = &boot_kmem_cache_node;
4231  	kmem_cache = &boot_kmem_cache;
4232  
4233  	create_boot_cache(kmem_cache_node, "kmem_cache_node",
4234  		sizeof(struct kmem_cache_node), SLAB_HWCACHE_ALIGN, 0, 0);
4235  
4236  	register_hotmemory_notifier(&slab_memory_callback_nb);
4237  
4238  	/* Able to allocate the per node structures */
4239  	slab_state = PARTIAL;
4240  
4241  	create_boot_cache(kmem_cache, "kmem_cache",
4242  			offsetof(struct kmem_cache, node) +
4243  				nr_node_ids * sizeof(struct kmem_cache_node *),
4244  		       SLAB_HWCACHE_ALIGN, 0, 0);
4245  
4246  	kmem_cache = bootstrap(&boot_kmem_cache);
4247  	kmem_cache_node = bootstrap(&boot_kmem_cache_node);
4248  
4249  	/* Now we can use the kmem_cache to allocate kmalloc slabs */
4250  	setup_kmalloc_cache_index_table();
4251  	create_kmalloc_caches(0);
4252  
4253  	/* Setup random freelists for each cache */
4254  	init_freelist_randomization();
4255  
4256  	cpuhp_setup_state_nocalls(CPUHP_SLUB_DEAD, "slub:dead", NULL,
4257  				  slub_cpu_dead);
4258  
4259  	pr_info("SLUB: HWalign=%d, Order=%u-%u, MinObjects=%u, CPUs=%u, Nodes=%u\n",
4260  		cache_line_size(),
4261  		slub_min_order, slub_max_order, slub_min_objects,
4262  		nr_cpu_ids, nr_node_ids);
4263  }
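/*
 * The banner printed above typically looks something like the following
 * (values depend on the machine and the command line; shown only as an
 * illustration):
 *
 *	SLUB: HWalign=64, Order=0-3, MinObjects=0, CPUs=8, Nodes=1
 */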
4264  
4265  void __init kmem_cache_init_late(void)
4266  {
4267  }
4268  
4269  struct kmem_cache *
4270  __kmem_cache_alias(const char *name, unsigned int size, unsigned int align,
4271  		   slab_flags_t flags, void (*ctor)(void *))
4272  {
4273  	struct kmem_cache *s, *c;
4274  
4275  	s = find_mergeable(size, align, flags, name, ctor);
4276  	if (s) {
4277  		s->refcount++;
4278  
4279  		/*
4280  		 * Adjust the object sizes so that we clear
4281  		 * the complete object on kzalloc.
4282  		 */
4283  		s->object_size = max(s->object_size, size);
4284  		s->inuse = max(s->inuse, ALIGN(size, sizeof(void *)));
4285  
4286  		for_each_memcg_cache(c, s) {
4287  			c->object_size = s->object_size;
4288  			c->inuse = max(c->inuse, ALIGN(size, sizeof(void *)));
4289  		}
4290  
4291  		if (sysfs_slab_alias(s, name)) {
4292  			s->refcount--;
4293  			s = NULL;
4294  		}
4295  	}
4296  
4297  	return s;
4298  }
4299  
4300  int __kmem_cache_create(struct kmem_cache *s, slab_flags_t flags)
4301  {
4302  	int err;
4303  
4304  	err = kmem_cache_open(s, flags);
4305  	if (err)
4306  		return err;
4307  
4308  	/* Mutex is not taken during early boot */
4309  	if (slab_state <= UP)
4310  		return 0;
4311  
4312  	memcg_propagate_slab_attrs(s);
4313  	err = sysfs_slab_add(s);
4314  	if (err)
4315  		__kmem_cache_release(s);
4316  
4317  	return err;
4318  }
4319  
4320  void *__kmalloc_track_caller(size_t size, gfp_t gfpflags, unsigned long caller)
4321  {
4322  	struct kmem_cache *s;
4323  	void *ret;
4324  
4325  	if (unlikely(size > KMALLOC_MAX_CACHE_SIZE))
4326  		return kmalloc_large(size, gfpflags);
4327  
4328  	s = kmalloc_slab(size, gfpflags);
4329  
4330  	if (unlikely(ZERO_OR_NULL_PTR(s)))
4331  		return s;
4332  
4333  	ret = slab_alloc(s, gfpflags, caller);
4334  
4335  	/* Honor the call site pointer we received. */
4336  	trace_kmalloc(caller, ret, size, s->size, gfpflags);
4337  
4338  	return ret;
4339  }
4340  
4341  #ifdef CONFIG_NUMA
4342  void *__kmalloc_node_track_caller(size_t size, gfp_t gfpflags,
4343  					int node, unsigned long caller)
4344  {
4345  	struct kmem_cache *s;
4346  	void *ret;
4347  
4348  	if (unlikely(size > KMALLOC_MAX_CACHE_SIZE)) {
4349  		ret = kmalloc_large_node(size, gfpflags, node);
4350  
4351  		trace_kmalloc_node(caller, ret,
4352  				   size, PAGE_SIZE << get_order(size),
4353  				   gfpflags, node);
4354  
4355  		return ret;
4356  	}
4357  
4358  	s = kmalloc_slab(size, gfpflags);
4359  
4360  	if (unlikely(ZERO_OR_NULL_PTR(s)))
4361  		return s;
4362  
4363  	ret = slab_alloc_node(s, gfpflags, node, caller);
4364  
4365  	/* Honor the call site pointer we received. */
4366  	trace_kmalloc_node(caller, ret, size, s->size, gfpflags, node);
4367  
4368  	return ret;
4369  }
4370  #endif
4371  
4372  #ifdef CONFIG_SYSFS
4373  static int count_inuse(struct page *page)
4374  {
4375  	return page->inuse;
4376  }
4377  
4378  static int count_total(struct page *page)
4379  {
4380  	return page->objects;
4381  }
4382  #endif
4383  
4384  #ifdef CONFIG_SLUB_DEBUG
4385  static int validate_slab(struct kmem_cache *s, struct page *page,
4386  						unsigned long *map)
4387  {
4388  	void *p;
4389  	void *addr = page_address(page);
4390  
4391  	if (!check_slab(s, page) ||
4392  			!on_freelist(s, page, NULL))
4393  		return 0;
4394  
4395  	/* Now we know that a valid freelist exists */
4396  	bitmap_zero(map, page->objects);
4397  
4398  	get_map(s, page, map);
4399  	for_each_object(p, s, addr, page->objects) {
4400  		if (test_bit(slab_index(p, s, addr), map))
4401  			if (!check_object(s, page, p, SLUB_RED_INACTIVE))
4402  				return 0;
4403  	}
4404  
4405  	for_each_object(p, s, addr, page->objects)
4406  		if (!test_bit(slab_index(p, s, addr), map))
4407  			if (!check_object(s, page, p, SLUB_RED_ACTIVE))
4408  				return 0;
4409  	return 1;
4410  }
4411  
4412  static void validate_slab_slab(struct kmem_cache *s, struct page *page,
4413  						unsigned long *map)
4414  {
4415  	slab_lock(page);
4416  	validate_slab(s, page, map);
4417  	slab_unlock(page);
4418  }
4419  
4420  static int validate_slab_node(struct kmem_cache *s,
4421  		struct kmem_cache_node *n, unsigned long *map)
4422  {
4423  	unsigned long count = 0;
4424  	struct page *page;
4425  	unsigned long flags;
4426  
4427  	spin_lock_irqsave(&n->list_lock, flags);
4428  
4429  	list_for_each_entry(page, &n->partial, slab_list) {
4430  		validate_slab_slab(s, page, map);
4431  		count++;
4432  	}
4433  	if (count != n->nr_partial)
4434  		pr_err("SLUB %s: %ld partial slabs counted but counter=%ld\n",
4435  		       s->name, count, n->nr_partial);
4436  
4437  	if (!(s->flags & SLAB_STORE_USER))
4438  		goto out;
4439  
4440  	list_for_each_entry(page, &n->full, slab_list) {
4441  		validate_slab_slab(s, page, map);
4442  		count++;
4443  	}
4444  	if (count != atomic_long_read(&n->nr_slabs))
4445  		pr_err("SLUB: %s %ld slabs counted but counter=%ld\n",
4446  		       s->name, count, atomic_long_read(&n->nr_slabs));
4447  
4448  out:
4449  	spin_unlock_irqrestore(&n->list_lock, flags);
4450  	return count;
4451  }
4452  
4453  static long validate_slab_cache(struct kmem_cache *s)
4454  {
4455  	int node;
4456  	unsigned long count = 0;
4457  	struct kmem_cache_node *n;
4458  	unsigned long *map = bitmap_alloc(oo_objects(s->max), GFP_KERNEL);
4459  
4460  	if (!map)
4461  		return -ENOMEM;
4462  
4463  	flush_all(s);
4464  	for_each_kmem_cache_node(s, node, n)
4465  		count += validate_slab_node(s, n, map);
4466  	bitmap_free(map);
4467  	return count;
4468  }
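/*
 * validate_slab_cache() backs the "validate" sysfs attribute defined
 * further below: writing 1 to /sys/kernel/slab/<cache>/validate walks every
 * slab of the cache and re-checks its metadata. Example (illustrative):
 *
 *	echo 1 > /sys/kernel/slab/kmalloc-64/validate
 */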
4469  /*
4470   * Generate lists of code addresses where slabcache objects are allocated
4471   * and freed.
4472   */
4473  
4474  struct location {
4475  	unsigned long count;
4476  	unsigned long addr;
4477  	long long sum_time;
4478  	long min_time;
4479  	long max_time;
4480  	long min_pid;
4481  	long max_pid;
4482  	DECLARE_BITMAP(cpus, NR_CPUS);
4483  	nodemask_t nodes;
4484  };
4485  
4486  struct loc_track {
4487  	unsigned long max;
4488  	unsigned long count;
4489  	struct location *loc;
4490  };
4491  
4492  static void free_loc_track(struct loc_track *t)
4493  {
4494  	if (t->max)
4495  		free_pages((unsigned long)t->loc,
4496  			get_order(sizeof(struct location) * t->max));
4497  }
4498  
4499  static int alloc_loc_track(struct loc_track *t, unsigned long max, gfp_t flags)
4500  {
4501  	struct location *l;
4502  	int order;
4503  
4504  	order = get_order(sizeof(struct location) * max);
4505  
4506  	l = (void *)__get_free_pages(flags, order);
4507  	if (!l)
4508  		return 0;
4509  
4510  	if (t->count) {
4511  		memcpy(l, t->loc, sizeof(struct location) * t->count);
4512  		free_loc_track(t);
4513  	}
4514  	t->max = max;
4515  	t->loc = l;
4516  	return 1;
4517  }
4518  
4519  static int add_location(struct loc_track *t, struct kmem_cache *s,
4520  				const struct track *track)
4521  {
4522  	long start, end, pos;
4523  	struct location *l;
4524  	unsigned long caddr;
4525  	unsigned long age = jiffies - track->when;
4526  
4527  	start = -1;
4528  	end = t->count;
4529  
4530  	for ( ; ; ) {
4531  		pos = start + (end - start + 1) / 2;
4532  
4533  		/*
4534  		 * There is nothing at "end". If we end up there
4535  		 * we need to insert before "end".
4536  		 */
4537  		if (pos == end)
4538  			break;
4539  
4540  		caddr = t->loc[pos].addr;
4541  		if (track->addr == caddr) {
4542  
4543  			l = &t->loc[pos];
4544  			l->count++;
4545  			if (track->when) {
4546  				l->sum_time += age;
4547  				if (age < l->min_time)
4548  					l->min_time = age;
4549  				if (age > l->max_time)
4550  					l->max_time = age;
4551  
4552  				if (track->pid < l->min_pid)
4553  					l->min_pid = track->pid;
4554  				if (track->pid > l->max_pid)
4555  					l->max_pid = track->pid;
4556  
4557  				cpumask_set_cpu(track->cpu,
4558  						to_cpumask(l->cpus));
4559  			}
4560  			node_set(page_to_nid(virt_to_page(track)), l->nodes);
4561  			return 1;
4562  		}
4563  
4564  		if (track->addr < caddr)
4565  			end = pos;
4566  		else
4567  			start = pos;
4568  	}
4569  
4570  	/*
4571  	 * Not found. Insert new tracking element.
4572  	 */
4573  	if (t->count >= t->max && !alloc_loc_track(t, 2 * t->max, GFP_ATOMIC))
4574  		return 0;
4575  
4576  	l = t->loc + pos;
4577  	if (pos < t->count)
4578  		memmove(l + 1, l,
4579  			(t->count - pos) * sizeof(struct location));
4580  	t->count++;
4581  	l->count = 1;
4582  	l->addr = track->addr;
4583  	l->sum_time = age;
4584  	l->min_time = age;
4585  	l->max_time = age;
4586  	l->min_pid = track->pid;
4587  	l->max_pid = track->pid;
4588  	cpumask_clear(to_cpumask(l->cpus));
4589  	cpumask_set_cpu(track->cpu, to_cpumask(l->cpus));
4590  	nodes_clear(l->nodes);
4591  	node_set(page_to_nid(virt_to_page(track)), l->nodes);
4592  	return 1;
4593  }
4594  
4595  static void process_slab(struct loc_track *t, struct kmem_cache *s,
4596  		struct page *page, enum track_item alloc,
4597  		unsigned long *map)
4598  {
4599  	void *addr = page_address(page);
4600  	void *p;
4601  
4602  	bitmap_zero(map, page->objects);
4603  	get_map(s, page, map);
4604  
4605  	for_each_object(p, s, addr, page->objects)
4606  		if (!test_bit(slab_index(p, s, addr), map))
4607  			add_location(t, s, get_track(s, p, alloc));
4608  }
4609  
4610  static int list_locations(struct kmem_cache *s, char *buf,
4611  					enum track_item alloc)
4612  {
4613  	int len = 0;
4614  	unsigned long i;
4615  	struct loc_track t = { 0, 0, NULL };
4616  	int node;
4617  	struct kmem_cache_node *n;
4618  	unsigned long *map = bitmap_alloc(oo_objects(s->max), GFP_KERNEL);
4619  
4620  	if (!map || !alloc_loc_track(&t, PAGE_SIZE / sizeof(struct location),
4621  				     GFP_KERNEL)) {
4622  		bitmap_free(map);
4623  		return sprintf(buf, "Out of memory\n");
4624  	}
4625  	/* Push back cpu slabs */
4626  	flush_all(s);
4627  
4628  	for_each_kmem_cache_node(s, node, n) {
4629  		unsigned long flags;
4630  		struct page *page;
4631  
4632  		if (!atomic_long_read(&n->nr_slabs))
4633  			continue;
4634  
4635  		spin_lock_irqsave(&n->list_lock, flags);
4636  		list_for_each_entry(page, &n->partial, slab_list)
4637  			process_slab(&t, s, page, alloc, map);
4638  		list_for_each_entry(page, &n->full, slab_list)
4639  			process_slab(&t, s, page, alloc, map);
4640  		spin_unlock_irqrestore(&n->list_lock, flags);
4641  	}
4642  
4643  	for (i = 0; i < t.count; i++) {
4644  		struct location *l = &t.loc[i];
4645  
4646  		if (len > PAGE_SIZE - KSYM_SYMBOL_LEN - 100)
4647  			break;
4648  		len += sprintf(buf + len, "%7ld ", l->count);
4649  
4650  		if (l->addr)
4651  			len += sprintf(buf + len, "%pS", (void *)l->addr);
4652  		else
4653  			len += sprintf(buf + len, "<not-available>");
4654  
4655  		if (l->sum_time != l->min_time) {
4656  			len += sprintf(buf + len, " age=%ld/%ld/%ld",
4657  				l->min_time,
4658  				(long)div_u64(l->sum_time, l->count),
4659  				l->max_time);
4660  		} else
4661  			len += sprintf(buf + len, " age=%ld",
4662  				l->min_time);
4663  
4664  		if (l->min_pid != l->max_pid)
4665  			len += sprintf(buf + len, " pid=%ld-%ld",
4666  				l->min_pid, l->max_pid);
4667  		else
4668  			len += sprintf(buf + len, " pid=%ld",
4669  				l->min_pid);
4670  
4671  		if (num_online_cpus() > 1 &&
4672  				!cpumask_empty(to_cpumask(l->cpus)) &&
4673  				len < PAGE_SIZE - 60)
4674  			len += scnprintf(buf + len, PAGE_SIZE - len - 50,
4675  					 " cpus=%*pbl",
4676  					 cpumask_pr_args(to_cpumask(l->cpus)));
4677  
4678  		if (nr_online_nodes > 1 && !nodes_empty(l->nodes) &&
4679  				len < PAGE_SIZE - 60)
4680  			len += scnprintf(buf + len, PAGE_SIZE - len - 50,
4681  					 " nodes=%*pbl",
4682  					 nodemask_pr_args(&l->nodes));
4683  
4684  		len += sprintf(buf + len, "\n");
4685  	}
4686  
4687  	free_loc_track(&t);
4688  	bitmap_free(map);
4689  	if (!t.count)
4690  		len += sprintf(buf, "No data\n");
4691  	return len;
4692  }
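/*
 * Each line emitted by list_locations() has roughly the following shape
 * (the values here are made up):
 *
 *	    345 some_caller+0x1a2/0x2c0 age=4/1203/9001 pid=1-4213 cpus=0-3 nodes=0
 *
 * i.e. hit count, call site, min/avg/max object age in jiffies, the pid
 * range and the CPUs/nodes the call site was seen on.
 */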
4693  #endif	/* CONFIG_SLUB_DEBUG */
4694  
4695  #ifdef SLUB_RESILIENCY_TEST
4696  static void __init resiliency_test(void)
4697  {
4698  	u8 *p;
4699  	int type = KMALLOC_NORMAL;
4700  
4701  	BUILD_BUG_ON(KMALLOC_MIN_SIZE > 16 || KMALLOC_SHIFT_HIGH < 10);
4702  
4703  	pr_err("SLUB resiliency testing\n");
4704  	pr_err("-----------------------\n");
4705  	pr_err("A. Corruption after allocation\n");
4706  
4707  	p = kzalloc(16, GFP_KERNEL);
4708  	p[16] = 0x12;
4709  	pr_err("\n1. kmalloc-16: Clobber Redzone/next pointer 0x12->0x%p\n\n",
4710  	       p + 16);
4711  
4712  	validate_slab_cache(kmalloc_caches[type][4]);
4713  
4714  	/* Hmmm... The next two are dangerous */
4715  	p = kzalloc(32, GFP_KERNEL);
4716  	p[32 + sizeof(void *)] = 0x34;
4717  	pr_err("\n2. kmalloc-32: Clobber next pointer/next slab 0x34 -> -0x%p\n",
4718  	       p);
4719  	pr_err("If allocated object is overwritten then not detectable\n\n");
4720  
4721  	validate_slab_cache(kmalloc_caches[type][5]);
4722  	p = kzalloc(64, GFP_KERNEL);
4723  	p += 64 + (get_cycles() & 0xff) * sizeof(void *);
4724  	*p = 0x56;
4725  	pr_err("\n3. kmalloc-64: corrupting random byte 0x56->0x%p\n",
4726  	       p);
4727  	pr_err("If allocated object is overwritten then not detectable\n\n");
4728  	validate_slab_cache(kmalloc_caches[type][6]);
4729  
4730  	pr_err("\nB. Corruption after free\n");
4731  	p = kzalloc(128, GFP_KERNEL);
4732  	kfree(p);
4733  	*p = 0x78;
4734  	pr_err("1. kmalloc-128: Clobber first word 0x78->0x%p\n\n", p);
4735  	validate_slab_cache(kmalloc_caches[type][7]);
4736  
4737  	p = kzalloc(256, GFP_KERNEL);
4738  	kfree(p);
4739  	p[50] = 0x9a;
4740  	pr_err("\n2. kmalloc-256: Clobber 50th byte 0x9a->0x%p\n\n", p);
4741  	validate_slab_cache(kmalloc_caches[type][8]);
4742  
4743  	p = kzalloc(512, GFP_KERNEL);
4744  	kfree(p);
4745  	p[512] = 0xab;
4746  	pr_err("\n3. kmalloc-512: Clobber redzone 0xab->0x%p\n\n", p);
4747  	validate_slab_cache(kmalloc_caches[type][9]);
4748  }
4749  #else
4750  #ifdef CONFIG_SYSFS
4751  static void resiliency_test(void) {}
4752  #endif
4753  #endif	/* SLUB_RESILIENCY_TEST */
4754  
4755  #ifdef CONFIG_SYSFS
4756  enum slab_stat_type {
4757  	SL_ALL,			/* All slabs */
4758  	SL_PARTIAL,		/* Only partially allocated slabs */
4759  	SL_CPU,			/* Only slabs used for cpu caches */
4760  	SL_OBJECTS,		/* Determine allocated objects not slabs */
4761  	SL_TOTAL		/* Determine object capacity not slabs */
4762  };
4763  
4764  #define SO_ALL		(1 << SL_ALL)
4765  #define SO_PARTIAL	(1 << SL_PARTIAL)
4766  #define SO_CPU		(1 << SL_CPU)
4767  #define SO_OBJECTS	(1 << SL_OBJECTS)
4768  #define SO_TOTAL	(1 << SL_TOTAL)
4769  
4770  #ifdef CONFIG_MEMCG
4771  static bool memcg_sysfs_enabled = IS_ENABLED(CONFIG_SLUB_MEMCG_SYSFS_ON);
4772  
4773  static int __init setup_slub_memcg_sysfs(char *str)
4774  {
4775  	int v;
4776  
4777  	if (get_option(&str, &v) > 0)
4778  		memcg_sysfs_enabled = v;
4779  
4780  	return 1;
4781  }
4782  
4783  __setup("slub_memcg_sysfs=", setup_slub_memcg_sysfs);
4784  #endif
4785  
4786  static ssize_t show_slab_objects(struct kmem_cache *s,
4787  			    char *buf, unsigned long flags)
4788  {
4789  	unsigned long total = 0;
4790  	int node;
4791  	int x;
4792  	unsigned long *nodes;
4793  
4794  	nodes = kcalloc(nr_node_ids, sizeof(unsigned long), GFP_KERNEL);
4795  	if (!nodes)
4796  		return -ENOMEM;
4797  
4798  	if (flags & SO_CPU) {
4799  		int cpu;
4800  
4801  		for_each_possible_cpu(cpu) {
4802  			struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab,
4803  							       cpu);
4804  			int node;
4805  			struct page *page;
4806  
4807  			page = READ_ONCE(c->page);
4808  			if (!page)
4809  				continue;
4810  
4811  			node = page_to_nid(page);
4812  			if (flags & SO_TOTAL)
4813  				x = page->objects;
4814  			else if (flags & SO_OBJECTS)
4815  				x = page->inuse;
4816  			else
4817  				x = 1;
4818  
4819  			total += x;
4820  			nodes[node] += x;
4821  
4822  			page = slub_percpu_partial_read_once(c);
4823  			if (page) {
4824  				node = page_to_nid(page);
4825  				if (flags & SO_TOTAL)
4826  					WARN_ON_ONCE(1);
4827  				else if (flags & SO_OBJECTS)
4828  					WARN_ON_ONCE(1);
4829  				else
4830  					x = page->pages;
4831  				total += x;
4832  				nodes[node] += x;
4833  			}
4834  		}
4835  	}
4836  
4837  	get_online_mems();
4838  #ifdef CONFIG_SLUB_DEBUG
4839  	if (flags & SO_ALL) {
4840  		struct kmem_cache_node *n;
4841  
4842  		for_each_kmem_cache_node(s, node, n) {
4843  
4844  			if (flags & SO_TOTAL)
4845  				x = atomic_long_read(&n->total_objects);
4846  			else if (flags & SO_OBJECTS)
4847  				x = atomic_long_read(&n->total_objects) -
4848  					count_partial(n, count_free);
4849  			else
4850  				x = atomic_long_read(&n->nr_slabs);
4851  			total += x;
4852  			nodes[node] += x;
4853  		}
4854  
4855  	} else
4856  #endif
4857  	if (flags & SO_PARTIAL) {
4858  		struct kmem_cache_node *n;
4859  
4860  		for_each_kmem_cache_node(s, node, n) {
4861  			if (flags & SO_TOTAL)
4862  				x = count_partial(n, count_total);
4863  			else if (flags & SO_OBJECTS)
4864  				x = count_partial(n, count_inuse);
4865  			else
4866  				x = n->nr_partial;
4867  			total += x;
4868  			nodes[node] += x;
4869  		}
4870  	}
4871  	x = sprintf(buf, "%lu", total);
4872  #ifdef CONFIG_NUMA
4873  	for (node = 0; node < nr_node_ids; node++)
4874  		if (nodes[node])
4875  			x += sprintf(buf + x, " N%d=%lu",
4876  					node, nodes[node]);
4877  #endif
4878  	put_online_mems();
4879  	kfree(nodes);
4880  	return x + sprintf(buf + x, "\n");
4881  }
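/*
 * The buffer produced by show_slab_objects() is a total followed by
 * optional per-node counts, e.g. (illustrative):
 *
 *	8192 N0=4096 N1=4096
 */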
4882  
4883  #ifdef CONFIG_SLUB_DEBUG
4884  static int any_slab_objects(struct kmem_cache *s)
4885  {
4886  	int node;
4887  	struct kmem_cache_node *n;
4888  
4889  	for_each_kmem_cache_node(s, node, n)
4890  		if (atomic_long_read(&n->total_objects))
4891  			return 1;
4892  
4893  	return 0;
4894  }
4895  #endif
4896  
4897  #define to_slab_attr(n) container_of(n, struct slab_attribute, attr)
4898  #define to_slab(n) container_of(n, struct kmem_cache, kobj)
4899  
4900  struct slab_attribute {
4901  	struct attribute attr;
4902  	ssize_t (*show)(struct kmem_cache *s, char *buf);
4903  	ssize_t (*store)(struct kmem_cache *s, const char *x, size_t count);
4904  };
4905  
4906  #define SLAB_ATTR_RO(_name) \
4907  	static struct slab_attribute _name##_attr = \
4908  	__ATTR(_name, 0400, _name##_show, NULL)
4909  
4910  #define SLAB_ATTR(_name) \
4911  	static struct slab_attribute _name##_attr =  \
4912  	__ATTR(_name, 0600, _name##_show, _name##_store)
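/*
 * For reference: SLAB_ATTR_RO(foo) creates a read-only (0400) sysfs file
 * named "foo" backed by foo_show(), while SLAB_ATTR(foo) additionally wires
 * up foo_store() and makes the file writable by root (0600).
 */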
4913  
4914  static ssize_t slab_size_show(struct kmem_cache *s, char *buf)
4915  {
4916  	return sprintf(buf, "%u\n", s->size);
4917  }
4918  SLAB_ATTR_RO(slab_size);
4919  
4920  static ssize_t align_show(struct kmem_cache *s, char *buf)
4921  {
4922  	return sprintf(buf, "%u\n", s->align);
4923  }
4924  SLAB_ATTR_RO(align);
4925  
4926  static ssize_t object_size_show(struct kmem_cache *s, char *buf)
4927  {
4928  	return sprintf(buf, "%u\n", s->object_size);
4929  }
4930  SLAB_ATTR_RO(object_size);
4931  
4932  static ssize_t objs_per_slab_show(struct kmem_cache *s, char *buf)
4933  {
4934  	return sprintf(buf, "%u\n", oo_objects(s->oo));
4935  }
4936  SLAB_ATTR_RO(objs_per_slab);
4937  
4938  static ssize_t order_store(struct kmem_cache *s,
4939  				const char *buf, size_t length)
4940  {
4941  	unsigned int order;
4942  	int err;
4943  
4944  	err = kstrtouint(buf, 10, &order);
4945  	if (err)
4946  		return err;
4947  
4948  	if (order > slub_max_order || order < slub_min_order)
4949  		return -EINVAL;
4950  
4951  	calculate_sizes(s, order);
4952  	return length;
4953  }
4954  
4955  static ssize_t order_show(struct kmem_cache *s, char *buf)
4956  {
4957  	return sprintf(buf, "%u\n", oo_order(s->oo));
4958  }
4959  SLAB_ATTR(order);
4960  
4961  static ssize_t min_partial_show(struct kmem_cache *s, char *buf)
4962  {
4963  	return sprintf(buf, "%lu\n", s->min_partial);
4964  }
4965  
4966  static ssize_t min_partial_store(struct kmem_cache *s, const char *buf,
4967  				 size_t length)
4968  {
4969  	unsigned long min;
4970  	int err;
4971  
4972  	err = kstrtoul(buf, 10, &min);
4973  	if (err)
4974  		return err;
4975  
4976  	set_min_partial(s, min);
4977  	return length;
4978  }
4979  SLAB_ATTR(min_partial);
4980  
4981  static ssize_t cpu_partial_show(struct kmem_cache *s, char *buf)
4982  {
4983  	return sprintf(buf, "%u\n", slub_cpu_partial(s));
4984  }
4985  
4986  static ssize_t cpu_partial_store(struct kmem_cache *s, const char *buf,
4987  				 size_t length)
4988  {
4989  	unsigned int objects;
4990  	int err;
4991  
4992  	err = kstrtouint(buf, 10, &objects);
4993  	if (err)
4994  		return err;
4995  	if (objects && !kmem_cache_has_cpu_partial(s))
4996  		return -EINVAL;
4997  
4998  	slub_set_cpu_partial(s, objects);
4999  	flush_all(s);
5000  	return length;
5001  }
5002  SLAB_ATTR(cpu_partial);
5003  
5004  static ssize_t ctor_show(struct kmem_cache *s, char *buf)
5005  {
5006  	if (!s->ctor)
5007  		return 0;
5008  	return sprintf(buf, "%pS\n", s->ctor);
5009  }
5010  SLAB_ATTR_RO(ctor);
5011  
5012  static ssize_t aliases_show(struct kmem_cache *s, char *buf)
5013  {
5014  	return sprintf(buf, "%d\n", s->refcount < 0 ? 0 : s->refcount - 1);
5015  }
5016  SLAB_ATTR_RO(aliases);
5017  
5018  static ssize_t partial_show(struct kmem_cache *s, char *buf)
5019  {
5020  	return show_slab_objects(s, buf, SO_PARTIAL);
5021  }
5022  SLAB_ATTR_RO(partial);
5023  
5024  static ssize_t cpu_slabs_show(struct kmem_cache *s, char *buf)
5025  {
5026  	return show_slab_objects(s, buf, SO_CPU);
5027  }
5028  SLAB_ATTR_RO(cpu_slabs);
5029  
5030  static ssize_t objects_show(struct kmem_cache *s, char *buf)
5031  {
5032  	return show_slab_objects(s, buf, SO_ALL|SO_OBJECTS);
5033  }
5034  SLAB_ATTR_RO(objects);
5035  
5036  static ssize_t objects_partial_show(struct kmem_cache *s, char *buf)
5037  {
5038  	return show_slab_objects(s, buf, SO_PARTIAL|SO_OBJECTS);
5039  }
5040  SLAB_ATTR_RO(objects_partial);
5041  
5042  static ssize_t slabs_cpu_partial_show(struct kmem_cache *s, char *buf)
5043  {
5044  	int objects = 0;
5045  	int pages = 0;
5046  	int cpu;
5047  	int len;
5048  
5049  	for_each_online_cpu(cpu) {
5050  		struct page *page;
5051  
5052  		page = slub_percpu_partial(per_cpu_ptr(s->cpu_slab, cpu));
5053  
5054  		if (page) {
5055  			pages += page->pages;
5056  			objects += page->pobjects;
5057  		}
5058  	}
5059  
5060  	len = sprintf(buf, "%d(%d)", objects, pages);
5061  
5062  #ifdef CONFIG_SMP
5063  	for_each_online_cpu(cpu) {
5064  		struct page *page;
5065  
5066  		page = slub_percpu_partial(per_cpu_ptr(s->cpu_slab, cpu));
5067  
5068  		if (page && len < PAGE_SIZE - 20)
5069  			len += sprintf(buf + len, " C%d=%d(%d)", cpu,
5070  				page->pobjects, page->pages);
5071  	}
5072  #endif
5073  	return len + sprintf(buf + len, "\n");
5074  }
5075  SLAB_ATTR_RO(slabs_cpu_partial);
5076  
5077  static ssize_t reclaim_account_show(struct kmem_cache *s, char *buf)
5078  {
5079  	return sprintf(buf, "%d\n", !!(s->flags & SLAB_RECLAIM_ACCOUNT));
5080  }
5081  
5082  static ssize_t reclaim_account_store(struct kmem_cache *s,
5083  				const char *buf, size_t length)
5084  {
5085  	s->flags &= ~SLAB_RECLAIM_ACCOUNT;
5086  	if (buf[0] == '1')
5087  		s->flags |= SLAB_RECLAIM_ACCOUNT;
5088  	return length;
5089  }
5090  SLAB_ATTR(reclaim_account);
5091  
5092  static ssize_t hwcache_align_show(struct kmem_cache *s, char *buf)
5093  {
5094  	return sprintf(buf, "%d\n", !!(s->flags & SLAB_HWCACHE_ALIGN));
5095  }
5096  SLAB_ATTR_RO(hwcache_align);
5097  
5098  #ifdef CONFIG_ZONE_DMA
5099  static ssize_t cache_dma_show(struct kmem_cache *s, char *buf)
5100  {
5101  	return sprintf(buf, "%d\n", !!(s->flags & SLAB_CACHE_DMA));
5102  }
5103  SLAB_ATTR_RO(cache_dma);
5104  #endif
5105  
5106  static ssize_t usersize_show(struct kmem_cache *s, char *buf)
5107  {
5108  	return sprintf(buf, "%u\n", s->usersize);
5109  }
5110  SLAB_ATTR_RO(usersize);
5111  
5112  static ssize_t destroy_by_rcu_show(struct kmem_cache *s, char *buf)
5113  {
5114  	return sprintf(buf, "%d\n", !!(s->flags & SLAB_TYPESAFE_BY_RCU));
5115  }
5116  SLAB_ATTR_RO(destroy_by_rcu);
5117  
5118  #ifdef CONFIG_SLUB_DEBUG
5119  static ssize_t slabs_show(struct kmem_cache *s, char *buf)
5120  {
5121  	return show_slab_objects(s, buf, SO_ALL);
5122  }
5123  SLAB_ATTR_RO(slabs);
5124  
5125  static ssize_t total_objects_show(struct kmem_cache *s, char *buf)
5126  {
5127  	return show_slab_objects(s, buf, SO_ALL|SO_TOTAL);
5128  }
5129  SLAB_ATTR_RO(total_objects);
5130  
5131  static ssize_t sanity_checks_show(struct kmem_cache *s, char *buf)
5132  {
5133  	return sprintf(buf, "%d\n", !!(s->flags & SLAB_CONSISTENCY_CHECKS));
5134  }
5135  
5136  static ssize_t sanity_checks_store(struct kmem_cache *s,
5137  				const char *buf, size_t length)
5138  {
5139  	s->flags &= ~SLAB_CONSISTENCY_CHECKS;
5140  	if (buf[0] == '1') {
5141  		s->flags &= ~__CMPXCHG_DOUBLE;
5142  		s->flags |= SLAB_CONSISTENCY_CHECKS;
5143  	}
5144  	return length;
5145  }
5146  SLAB_ATTR(sanity_checks);
5147  
5148  static ssize_t trace_show(struct kmem_cache *s, char *buf)
5149  {
5150  	return sprintf(buf, "%d\n", !!(s->flags & SLAB_TRACE));
5151  }
5152  
5153  static ssize_t trace_store(struct kmem_cache *s, const char *buf,
5154  							size_t length)
5155  {
5156  	/*
5157  	 * Tracing a merged cache is going to give confusing results
5158  	 * as well as cause other issues like converting a mergeable
5159  	 * cache into an unmergeable one.
5160  	 */
5161  	if (s->refcount > 1)
5162  		return -EINVAL;
5163  
5164  	s->flags &= ~SLAB_TRACE;
5165  	if (buf[0] == '1') {
5166  		s->flags &= ~__CMPXCHG_DOUBLE;
5167  		s->flags |= SLAB_TRACE;
5168  	}
5169  	return length;
5170  }
5171  SLAB_ATTR(trace);
5172  
5173  static ssize_t red_zone_show(struct kmem_cache *s, char *buf)
5174  {
5175  	return sprintf(buf, "%d\n", !!(s->flags & SLAB_RED_ZONE));
5176  }
5177  
5178  static ssize_t red_zone_store(struct kmem_cache *s,
5179  				const char *buf, size_t length)
5180  {
5181  	if (any_slab_objects(s))
5182  		return -EBUSY;
5183  
5184  	s->flags &= ~SLAB_RED_ZONE;
5185  	if (buf[0] == '1') {
5186  		s->flags |= SLAB_RED_ZONE;
5187  	}
5188  	calculate_sizes(s, -1);
5189  	return length;
5190  }
5191  SLAB_ATTR(red_zone);
5192  
5193  static ssize_t poison_show(struct kmem_cache *s, char *buf)
5194  {
5195  	return sprintf(buf, "%d\n", !!(s->flags & SLAB_POISON));
5196  }
5197  
5198  static ssize_t poison_store(struct kmem_cache *s,
5199  				const char *buf, size_t length)
5200  {
5201  	if (any_slab_objects(s))
5202  		return -EBUSY;
5203  
5204  	s->flags &= ~SLAB_POISON;
5205  	if (buf[0] == '1') {
5206  		s->flags |= SLAB_POISON;
5207  	}
5208  	calculate_sizes(s, -1);
5209  	return length;
5210  }
5211  SLAB_ATTR(poison);
5212  
5213  static ssize_t store_user_show(struct kmem_cache *s, char *buf)
5214  {
5215  	return sprintf(buf, "%d\n", !!(s->flags & SLAB_STORE_USER));
5216  }
5217  
5218  static ssize_t store_user_store(struct kmem_cache *s,
5219  				const char *buf, size_t length)
5220  {
5221  	if (any_slab_objects(s))
5222  		return -EBUSY;
5223  
5224  	s->flags &= ~SLAB_STORE_USER;
5225  	if (buf[0] == '1') {
5226  		s->flags &= ~__CMPXCHG_DOUBLE;
5227  		s->flags |= SLAB_STORE_USER;
5228  	}
5229  	calculate_sizes(s, -1);
5230  	return length;
5231  }
5232  SLAB_ATTR(store_user);
5233  
5234  static ssize_t validate_show(struct kmem_cache *s, char *buf)
5235  {
5236  	return 0;
5237  }
5238  
5239  static ssize_t validate_store(struct kmem_cache *s,
5240  			const char *buf, size_t length)
5241  {
5242  	int ret = -EINVAL;
5243  
5244  	if (buf[0] == '1') {
5245  		ret = validate_slab_cache(s);
5246  		if (ret >= 0)
5247  			ret = length;
5248  	}
5249  	return ret;
5250  }
5251  SLAB_ATTR(validate);
5252  
5253  static ssize_t alloc_calls_show(struct kmem_cache *s, char *buf)
5254  {
5255  	if (!(s->flags & SLAB_STORE_USER))
5256  		return -ENOSYS;
5257  	return list_locations(s, buf, TRACK_ALLOC);
5258  }
5259  SLAB_ATTR_RO(alloc_calls);
5260  
5261  static ssize_t free_calls_show(struct kmem_cache *s, char *buf)
5262  {
5263  	if (!(s->flags & SLAB_STORE_USER))
5264  		return -ENOSYS;
5265  	return list_locations(s, buf, TRACK_FREE);
5266  }
5267  SLAB_ATTR_RO(free_calls);
5268  #endif /* CONFIG_SLUB_DEBUG */
5269  
5270  #ifdef CONFIG_FAILSLAB
5271  static ssize_t failslab_show(struct kmem_cache *s, char *buf)
5272  {
5273  	return sprintf(buf, "%d\n", !!(s->flags & SLAB_FAILSLAB));
5274  }
5275  
5276  static ssize_t failslab_store(struct kmem_cache *s, const char *buf,
5277  							size_t length)
5278  {
5279  	if (s->refcount > 1)
5280  		return -EINVAL;
5281  
5282  	s->flags &= ~SLAB_FAILSLAB;
5283  	if (buf[0] == '1')
5284  		s->flags |= SLAB_FAILSLAB;
5285  	return length;
5286  }
5287  SLAB_ATTR(failslab);
5288  #endif
5289  
5290  static ssize_t shrink_show(struct kmem_cache *s, char *buf)
5291  {
5292  	return 0;
5293  }
5294  
5295  static ssize_t shrink_store(struct kmem_cache *s,
5296  			const char *buf, size_t length)
5297  {
5298  	if (buf[0] == '1')
5299  		kmem_cache_shrink(s);
5300  	else
5301  		return -EINVAL;
5302  	return length;
5303  }
5304  SLAB_ATTR(shrink);
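/*
 * Usage example (illustrative): writing 1 to the per-cache "shrink" file
 * releases empty slabs back to the page allocator, e.g.
 *
 *	echo 1 > /sys/kernel/slab/dentry/shrink
 */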
5305  
5306  #ifdef CONFIG_NUMA
5307  static ssize_t remote_node_defrag_ratio_show(struct kmem_cache *s, char *buf)
5308  {
5309  	return sprintf(buf, "%u\n", s->remote_node_defrag_ratio / 10);
5310  }
5311  
5312  static ssize_t remote_node_defrag_ratio_store(struct kmem_cache *s,
5313  				const char *buf, size_t length)
5314  {
5315  	unsigned int ratio;
5316  	int err;
5317  
5318  	err = kstrtouint(buf, 10, &ratio);
5319  	if (err)
5320  		return err;
5321  	if (ratio > 100)
5322  		return -ERANGE;
5323  
5324  	s->remote_node_defrag_ratio = ratio * 10;
5325  
5326  	return length;
5327  }
5328  SLAB_ATTR(remote_node_defrag_ratio);
5329  #endif
5330  
5331  #ifdef CONFIG_SLUB_STATS
5332  static int show_stat(struct kmem_cache *s, char *buf, enum stat_item si)
5333  {
5334  	unsigned long sum  = 0;
5335  	int cpu;
5336  	int len;
5337  	int *data = kmalloc_array(nr_cpu_ids, sizeof(int), GFP_KERNEL);
5338  
5339  	if (!data)
5340  		return -ENOMEM;
5341  
5342  	for_each_online_cpu(cpu) {
5343  		unsigned x = per_cpu_ptr(s->cpu_slab, cpu)->stat[si];
5344  
5345  		data[cpu] = x;
5346  		sum += x;
5347  	}
5348  
5349  	len = sprintf(buf, "%lu", sum);
5350  
5351  #ifdef CONFIG_SMP
5352  	for_each_online_cpu(cpu) {
5353  		if (data[cpu] && len < PAGE_SIZE - 20)
5354  			len += sprintf(buf + len, " C%d=%u", cpu, data[cpu]);
5355  	}
5356  #endif
5357  	kfree(data);
5358  	return len + sprintf(buf + len, "\n");
5359  }
5360  
5361  static void clear_stat(struct kmem_cache *s, enum stat_item si)
5362  {
5363  	int cpu;
5364  
5365  	for_each_online_cpu(cpu)
5366  		per_cpu_ptr(s->cpu_slab, cpu)->stat[si] = 0;
5367  }
5368  
5369  #define STAT_ATTR(si, text) 					\
5370  static ssize_t text##_show(struct kmem_cache *s, char *buf)	\
5371  {								\
5372  	return show_stat(s, buf, si);				\
5373  }								\
5374  static ssize_t text##_store(struct kmem_cache *s,		\
5375  				const char *buf, size_t length)	\
5376  {								\
5377  	if (buf[0] != '0')					\
5378  		return -EINVAL;					\
5379  	clear_stat(s, si);					\
5380  	return length;						\
5381  }								\
5382  SLAB_ATTR(text);						\
5383  
5384  STAT_ATTR(ALLOC_FASTPATH, alloc_fastpath);
5385  STAT_ATTR(ALLOC_SLOWPATH, alloc_slowpath);
5386  STAT_ATTR(FREE_FASTPATH, free_fastpath);
5387  STAT_ATTR(FREE_SLOWPATH, free_slowpath);
5388  STAT_ATTR(FREE_FROZEN, free_frozen);
5389  STAT_ATTR(FREE_ADD_PARTIAL, free_add_partial);
5390  STAT_ATTR(FREE_REMOVE_PARTIAL, free_remove_partial);
5391  STAT_ATTR(ALLOC_FROM_PARTIAL, alloc_from_partial);
5392  STAT_ATTR(ALLOC_SLAB, alloc_slab);
5393  STAT_ATTR(ALLOC_REFILL, alloc_refill);
5394  STAT_ATTR(ALLOC_NODE_MISMATCH, alloc_node_mismatch);
5395  STAT_ATTR(FREE_SLAB, free_slab);
5396  STAT_ATTR(CPUSLAB_FLUSH, cpuslab_flush);
5397  STAT_ATTR(DEACTIVATE_FULL, deactivate_full);
5398  STAT_ATTR(DEACTIVATE_EMPTY, deactivate_empty);
5399  STAT_ATTR(DEACTIVATE_TO_HEAD, deactivate_to_head);
5400  STAT_ATTR(DEACTIVATE_TO_TAIL, deactivate_to_tail);
5401  STAT_ATTR(DEACTIVATE_REMOTE_FREES, deactivate_remote_frees);
5402  STAT_ATTR(DEACTIVATE_BYPASS, deactivate_bypass);
5403  STAT_ATTR(ORDER_FALLBACK, order_fallback);
5404  STAT_ATTR(CMPXCHG_DOUBLE_CPU_FAIL, cmpxchg_double_cpu_fail);
5405  STAT_ATTR(CMPXCHG_DOUBLE_FAIL, cmpxchg_double_fail);
5406  STAT_ATTR(CPU_PARTIAL_ALLOC, cpu_partial_alloc);
5407  STAT_ATTR(CPU_PARTIAL_FREE, cpu_partial_free);
5408  STAT_ATTR(CPU_PARTIAL_NODE, cpu_partial_node);
5409  STAT_ATTR(CPU_PARTIAL_DRAIN, cpu_partial_drain);
5410  #endif	/* CONFIG_SLUB_STATS */
5411  
5412  static struct attribute *slab_attrs[] = {
5413  	&slab_size_attr.attr,
5414  	&object_size_attr.attr,
5415  	&objs_per_slab_attr.attr,
5416  	&order_attr.attr,
5417  	&min_partial_attr.attr,
5418  	&cpu_partial_attr.attr,
5419  	&objects_attr.attr,
5420  	&objects_partial_attr.attr,
5421  	&partial_attr.attr,
5422  	&cpu_slabs_attr.attr,
5423  	&ctor_attr.attr,
5424  	&aliases_attr.attr,
5425  	&align_attr.attr,
5426  	&hwcache_align_attr.attr,
5427  	&reclaim_account_attr.attr,
5428  	&destroy_by_rcu_attr.attr,
5429  	&shrink_attr.attr,
5430  	&slabs_cpu_partial_attr.attr,
5431  #ifdef CONFIG_SLUB_DEBUG
5432  	&total_objects_attr.attr,
5433  	&slabs_attr.attr,
5434  	&sanity_checks_attr.attr,
5435  	&trace_attr.attr,
5436  	&red_zone_attr.attr,
5437  	&poison_attr.attr,
5438  	&store_user_attr.attr,
5439  	&validate_attr.attr,
5440  	&alloc_calls_attr.attr,
5441  	&free_calls_attr.attr,
5442  #endif
5443  #ifdef CONFIG_ZONE_DMA
5444  	&cache_dma_attr.attr,
5445  #endif
5446  #ifdef CONFIG_NUMA
5447  	&remote_node_defrag_ratio_attr.attr,
5448  #endif
5449  #ifdef CONFIG_SLUB_STATS
5450  	&alloc_fastpath_attr.attr,
5451  	&alloc_slowpath_attr.attr,
5452  	&free_fastpath_attr.attr,
5453  	&free_slowpath_attr.attr,
5454  	&free_frozen_attr.attr,
5455  	&free_add_partial_attr.attr,
5456  	&free_remove_partial_attr.attr,
5457  	&alloc_from_partial_attr.attr,
5458  	&alloc_slab_attr.attr,
5459  	&alloc_refill_attr.attr,
5460  	&alloc_node_mismatch_attr.attr,
5461  	&free_slab_attr.attr,
5462  	&cpuslab_flush_attr.attr,
5463  	&deactivate_full_attr.attr,
5464  	&deactivate_empty_attr.attr,
5465  	&deactivate_to_head_attr.attr,
5466  	&deactivate_to_tail_attr.attr,
5467  	&deactivate_remote_frees_attr.attr,
5468  	&deactivate_bypass_attr.attr,
5469  	&order_fallback_attr.attr,
5470  	&cmpxchg_double_fail_attr.attr,
5471  	&cmpxchg_double_cpu_fail_attr.attr,
5472  	&cpu_partial_alloc_attr.attr,
5473  	&cpu_partial_free_attr.attr,
5474  	&cpu_partial_node_attr.attr,
5475  	&cpu_partial_drain_attr.attr,
5476  #endif
5477  #ifdef CONFIG_FAILSLAB
5478  	&failslab_attr.attr,
5479  #endif
5480  	&usersize_attr.attr,
5481  
5482  	NULL
5483  };
5484  
5485  static const struct attribute_group slab_attr_group = {
5486  	.attrs = slab_attrs,
5487  };
5488  
5489  static ssize_t slab_attr_show(struct kobject *kobj,
5490  				struct attribute *attr,
5491  				char *buf)
5492  {
5493  	struct slab_attribute *attribute;
5494  	struct kmem_cache *s;
5495  	int err;
5496  
5497  	attribute = to_slab_attr(attr);
5498  	s = to_slab(kobj);
5499  
5500  	if (!attribute->show)
5501  		return -EIO;
5502  
5503  	err = attribute->show(s, buf);
5504  
5505  	return err;
5506  }
5507  
5508  static ssize_t slab_attr_store(struct kobject *kobj,
5509  				struct attribute *attr,
5510  				const char *buf, size_t len)
5511  {
5512  	struct slab_attribute *attribute;
5513  	struct kmem_cache *s;
5514  	int err;
5515  
5516  	attribute = to_slab_attr(attr);
5517  	s = to_slab(kobj);
5518  
5519  	if (!attribute->store)
5520  		return -EIO;
5521  
5522  	err = attribute->store(s, buf, len);
5523  #ifdef CONFIG_MEMCG
5524  	if (slab_state >= FULL && err >= 0 && is_root_cache(s)) {
5525  		struct kmem_cache *c;
5526  
5527  		mutex_lock(&slab_mutex);
5528  		if (s->max_attr_size < len)
5529  			s->max_attr_size = len;
5530  
5531  		/*
5532  		 * This is a best-effort propagation, so this function's return
5533  		 * value is determined by the parent cache only. This is
5534  		 * basically because not all attributes have well-defined
5535  		 * semantics for rollbacks - most of the actions have permanent
5536  		 * effects.
5537  		 *
5538  		 * Returning the error value of any child that fails is not well
5539  		 * defined either, in the sense that a user seeing the error
5540  		 * code cannot tell anything about the state of the cache.
5541  		 *
5542  		 * Only returning the error code for the parent cache has
5543  		 * well-defined semantics: the cache that was written to
5544  		 * directly either failed or succeeded, and if it succeeded we
5545  		 * loop through the descendants with best-effort propagation.
5547  		 */
5548  		for_each_memcg_cache(c, s)
5549  			attribute->store(c, buf, len);
5550  		mutex_unlock(&slab_mutex);
5551  	}
5552  #endif
5553  	return err;
5554  }
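
/*
 * For example, a write of "1" to /sys/kernel/slab/<root-cache>/trace is
 * applied to the root cache first; only if that succeeds is the same
 * buffer replayed onto every memcg child of <root-cache>, and only the
 * root cache's return value is reported back to the writer.
 */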
5555  
5556  static void memcg_propagate_slab_attrs(struct kmem_cache *s)
5557  {
5558  #ifdef CONFIG_MEMCG
5559  	int i;
5560  	char *buffer = NULL;
5561  	struct kmem_cache *root_cache;
5562  
5563  	if (is_root_cache(s))
5564  		return;
5565  
5566  	root_cache = s->memcg_params.root_cache;
5567  
5568  	/*
5569  	 * This means no attribute has ever been written to this cache, so
5570  	 * there is no point in copying default values around.
5571  	 */
5572  	if (!root_cache->max_attr_size)
5573  		return;
5574  
5575  	for (i = 0; i < ARRAY_SIZE(slab_attrs); i++) {
5576  		char mbuf[64];
5577  		char *buf;
5578  		struct slab_attribute *attr = to_slab_attr(slab_attrs[i]);
5579  		ssize_t len;
5580  
5581  		if (!attr || !attr->store || !attr->show)
5582  			continue;
5583  
5584  		/*
5585  		 * It is really bad that we have to allocate here, so we do it
5586  		 * only as a fallback. If we do allocate, though, we can keep
5587  		 * using the allocated buffer for the remaining attributes.
5588  		 *
5589  		 * Most SLUB attributes tend to be very small, but sysfs
5590  		 * allows buffers up to a page, so page-sized values can
5591  		 * theoretically occur.
5592  		 */
5593  		if (buffer)
5594  			buf = buffer;
5595  		else if (root_cache->max_attr_size < ARRAY_SIZE(mbuf))
5596  			buf = mbuf;
5597  		else {
5598  			buffer = (char *) get_zeroed_page(GFP_KERNEL);
5599  			if (WARN_ON(!buffer))
5600  				continue;
5601  			buf = buffer;
5602  		}
5603  
5604  		len = attr->show(root_cache, buf);
5605  		if (len > 0)
5606  			attr->store(s, buf, len);
5607  	}
5608  
5609  	if (buffer)
5610  		free_page((unsigned long)buffer);
5611  #endif	/* CONFIG_MEMCG */
5612  }
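
/*
 * In other words, when a new memcg child cache is created, any attribute
 * previously written to its root cache (tracked via max_attr_size) is read
 * back through ->show() on the root and applied to the child through
 * ->store(), so the child starts out with the root's current settings.
 */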
5613  
5614  static void kmem_cache_release(struct kobject *k)
5615  {
5616  	slab_kmem_cache_release(to_slab(k));
5617  }
5618  
5619  static const struct sysfs_ops slab_sysfs_ops = {
5620  	.show = slab_attr_show,
5621  	.store = slab_attr_store,
5622  };
5623  
5624  static struct kobj_type slab_ktype = {
5625  	.sysfs_ops = &slab_sysfs_ops,
5626  	.release = kmem_cache_release,
5627  };
5628  
5629  static int uevent_filter(struct kset *kset, struct kobject *kobj)
5630  {
5631  	struct kobj_type *ktype = get_ktype(kobj);
5632  
5633  	if (ktype == &slab_ktype)
5634  		return 1;
5635  	return 0;
5636  }
5637  
5638  static const struct kset_uevent_ops slab_uevent_ops = {
5639  	.filter = uevent_filter,
5640  };
5641  
5642  static struct kset *slab_kset;
5643  
5644  static inline struct kset *cache_kset(struct kmem_cache *s)
5645  {
5646  #ifdef CONFIG_MEMCG
5647  	if (!is_root_cache(s))
5648  		return s->memcg_params.root_cache->memcg_kset;
5649  #endif
5650  	return slab_kset;
5651  }
5652  
5653  #define ID_STR_LENGTH 64
5654  
5655  /* Create a unique string id for a slab cache:
5656   *
5657   * Format	:[flags-]size
5658   */
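/*
 * For example, a mergeable cache with SLAB_RECLAIM_ACCOUNT and s->size of
 * 192 bytes gets the id ":a-0000192"; one with none of the flags below and
 * s->size of 4096 gets ":0004096".
 */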
5659  static char *create_unique_id(struct kmem_cache *s)
5660  {
5661  	char *name = kmalloc(ID_STR_LENGTH, GFP_KERNEL);
5662  	char *p = name;
5663  
5664  	BUG_ON(!name);
5665  
5666  	*p++ = ':';
5667  	/*
5668  	 * Emit first the flags that affect slab cache operations. We
5669  	 * only get here for aliasable slabs, so we do not need to
5670  	 * support too many flags. The flags encoded here must cover
5671  	 * all flags that are matched during merging to guarantee
5672  	 * that the id is unique.
5673  	 */
5674  	if (s->flags & SLAB_CACHE_DMA)
5675  		*p++ = 'd';
5676  	if (s->flags & SLAB_CACHE_DMA32)
5677  		*p++ = 'D';
5678  	if (s->flags & SLAB_RECLAIM_ACCOUNT)
5679  		*p++ = 'a';
5680  	if (s->flags & SLAB_CONSISTENCY_CHECKS)
5681  		*p++ = 'F';
5682  	if (s->flags & SLAB_ACCOUNT)
5683  		*p++ = 'A';
5684  	if (p != name + 1)
5685  		*p++ = '-';
5686  	p += sprintf(p, "%07u", s->size);
5687  
5688  	BUG_ON(p > name + ID_STR_LENGTH - 1);
5689  	return name;
5690  }
5691  
5692  static void sysfs_slab_remove_workfn(struct work_struct *work)
5693  {
5694  	struct kmem_cache *s =
5695  		container_of(work, struct kmem_cache, kobj_remove_work);
5696  
5697  	if (!s->kobj.state_in_sysfs)
5698  		/*
5699  		 * For a memcg cache, this may be called during
5700  		 * deactivation and again on shutdown.  Remove only once.
5701  		 * A cache is never shut down before deactivation is
5702  		 * complete, so no need to worry about synchronization.
5703  		 */
5704  		goto out;
5705  
5706  #ifdef CONFIG_MEMCG
5707  	kset_unregister(s->memcg_kset);
5708  #endif
5709  	kobject_uevent(&s->kobj, KOBJ_REMOVE);
5710  out:
5711  	kobject_put(&s->kobj);
5712  }
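
/*
 * The kobject_put() above pairs with the kobject_get() taken in
 * sysfs_slab_remove() before this work item was scheduled.
 */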
5713  
5714  static int sysfs_slab_add(struct kmem_cache *s)
5715  {
5716  	int err;
5717  	const char *name;
5718  	struct kset *kset = cache_kset(s);
5719  	int unmergeable = slab_unmergeable(s);
5720  
5721  	INIT_WORK(&s->kobj_remove_work, sysfs_slab_remove_workfn);
5722  
5723  	if (!kset) {
5724  		kobject_init(&s->kobj, &slab_ktype);
5725  		return 0;
5726  	}
5727  
5728  	if (!unmergeable && disable_higher_order_debug &&
5729  			(slub_debug & DEBUG_METADATA_FLAGS))
5730  		unmergeable = 1;
5731  
5732  	if (unmergeable) {
5733  		/*
5734  		 * The slab cache can never be merged, so we can use its name
5735  		 * directly. This is typically the case in debug situations,
5736  		 * where it lets us catch duplicate names easily.
5737  		 */
5738  		sysfs_remove_link(&slab_kset->kobj, s->name);
5739  		name = s->name;
5740  	} else {
5741  		/*
5742  		 * Create a unique name for the slab as a target
5743  		 * for the symlinks.
5744  		 */
5745  		name = create_unique_id(s);
5746  	}
5747  
5748  	s->kobj.kset = kset;
5749  	err = kobject_init_and_add(&s->kobj, &slab_ktype, NULL, "%s", name);
5750  	if (err)
5751  		goto out;
5752  
5753  	err = sysfs_create_group(&s->kobj, &slab_attr_group);
5754  	if (err)
5755  		goto out_del_kobj;
5756  
5757  #ifdef CONFIG_MEMCG
5758  	if (is_root_cache(s) && memcg_sysfs_enabled) {
5759  		s->memcg_kset = kset_create_and_add("cgroup", NULL, &s->kobj);
5760  		if (!s->memcg_kset) {
5761  			err = -ENOMEM;
5762  			goto out_del_kobj;
5763  		}
5764  	}
5765  #endif
5766  
5767  	kobject_uevent(&s->kobj, KOBJ_ADD);
5768  	if (!unmergeable) {
5769  		/* Setup first alias */
5770  		sysfs_slab_alias(s, s->name);
5771  	}
5772  out:
5773  	if (!unmergeable)
5774  		kfree(name);
5775  	return err;
5776  out_del_kobj:
5777  	kobject_del(&s->kobj);
5778  	goto out;
5779  }
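
/*
 * Resulting sysfs layout: an unmergeable cache gets a directory named after
 * s->name, while a mergeable cache gets a directory named by
 * create_unique_id() plus a symlink alias under its proper name. A root
 * cache with memcg sysfs support enabled also gets a "cgroup" kset that
 * holds the entries of its memcg children.
 */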
5780  
5781  static void sysfs_slab_remove(struct kmem_cache *s)
5782  {
5783  	if (slab_state < FULL)
5784  		/*
5785  		 * Sysfs has not been set up yet, so there is no need to
5786  		 * remove the cache from sysfs.
5787  		 */
5788  		return;
5789  
5790  	kobject_get(&s->kobj);
5791  	schedule_work(&s->kobj_remove_work);
5792  }
5793  
5794  void sysfs_slab_unlink(struct kmem_cache *s)
5795  {
5796  	if (slab_state >= FULL)
5797  		kobject_del(&s->kobj);
5798  }
5799  
5800  void sysfs_slab_release(struct kmem_cache *s)
5801  {
5802  	if (slab_state >= FULL)
5803  		kobject_put(&s->kobj);
5804  }
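
/*
 * sysfs_slab_unlink() removes the directory from sysfs during shutdown;
 * sysfs_slab_release() then drops the kobject reference, and once the last
 * reference is gone kmem_cache_release() frees the cache via
 * slab_kmem_cache_release().
 */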
5805  
5806  /*
5807   * Aliases created during bootup need to be buffered until sysfs becomes
5808   * available; otherwise that information would be lost.
5809   */
5810  struct saved_alias {
5811  	struct kmem_cache *s;
5812  	const char *name;
5813  	struct saved_alias *next;
5814  };
5815  
5816  static struct saved_alias *alias_list;
5817  
5818  static int sysfs_slab_alias(struct kmem_cache *s, const char *name)
5819  {
5820  	struct saved_alias *al;
5821  
5822  	if (slab_state == FULL) {
5823  		/*
5824  		 * If we have a leftover link then remove it.
5825  		 */
5826  		sysfs_remove_link(&slab_kset->kobj, name);
5827  		return sysfs_create_link(&slab_kset->kobj, &s->kobj, name);
5828  	}
5829  
5830  	al = kmalloc(sizeof(struct saved_alias), GFP_KERNEL);
5831  	if (!al)
5832  		return -ENOMEM;
5833  
5834  	al->s = s;
5835  	al->name = name;
5836  	al->next = alias_list;
5837  	alias_list = al;
5838  	return 0;
5839  }
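
/*
 * Once slab_state is FULL, an alias is just a symlink in /sys/kernel/slab/
 * pointing at the merged cache's directory; aliases requested earlier are
 * queued on alias_list and created later by slab_sysfs_init().
 */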
5840  
5841  static int __init slab_sysfs_init(void)
5842  {
5843  	struct kmem_cache *s;
5844  	int err;
5845  
5846  	mutex_lock(&slab_mutex);
5847  
5848  	slab_kset = kset_create_and_add("slab", &slab_uevent_ops, kernel_kobj);
5849  	if (!slab_kset) {
5850  		mutex_unlock(&slab_mutex);
5851  		pr_err("Cannot register slab subsystem.\n");
5852  		return -ENOSYS;
5853  	}
5854  
5855  	slab_state = FULL;
5856  
5857  	list_for_each_entry(s, &slab_caches, list) {
5858  		err = sysfs_slab_add(s);
5859  		if (err)
5860  			pr_err("SLUB: Unable to add boot slab %s to sysfs\n",
5861  			       s->name);
5862  	}
5863  
5864  	while (alias_list) {
5865  		struct saved_alias *al = alias_list;
5866  
5867  		alias_list = alias_list->next;
5868  		err = sysfs_slab_alias(al->s, al->name);
5869  		if (err)
5870  			pr_err("SLUB: Unable to add boot slab alias %s to sysfs\n",
5871  			       al->name);
5872  		kfree(al);
5873  	}
5874  
5875  	mutex_unlock(&slab_mutex);
5876  	resiliency_test();
5877  	return 0;
5878  }
5879  
5880  __initcall(slab_sysfs_init);
5881  #endif /* CONFIG_SYSFS */
5882  
5883  /*
5884   * The /proc/slabinfo ABI
5885   */
5886  #ifdef CONFIG_SLUB_DEBUG
5887  void get_slabinfo(struct kmem_cache *s, struct slabinfo *sinfo)
5888  {
5889  	unsigned long nr_slabs = 0;
5890  	unsigned long nr_objs = 0;
5891  	unsigned long nr_free = 0;
5892  	int node;
5893  	struct kmem_cache_node *n;
5894  
5895  	for_each_kmem_cache_node(s, node, n) {
5896  		nr_slabs += node_nr_slabs(n);
5897  		nr_objs += node_nr_objs(n);
5898  		nr_free += count_partial(n, count_free);
5899  	}
5900  
5901  	sinfo->active_objs = nr_objs - nr_free;
5902  	sinfo->num_objs = nr_objs;
5903  	sinfo->active_slabs = nr_slabs;
5904  	sinfo->num_slabs = nr_slabs;
5905  	sinfo->objects_per_slab = oo_objects(s->oo);
5906  	sinfo->cache_order = oo_order(s->oo);
5907  }
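
/*
 * Worked example: with 10 slabs of 32 objects each and 40 free objects on
 * partial slabs, num_objs = 320 and active_objs = 320 - 40 = 280. SLUB does
 * not track active and total slabs separately, so active_slabs and
 * num_slabs both report 10.
 */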
5908  
5909  void slabinfo_show_stats(struct seq_file *m, struct kmem_cache *s)
5910  {
5911  }
5912  
5913  ssize_t slabinfo_write(struct file *file, const char __user *buffer,
5914  		       size_t count, loff_t *ppos)
5915  {
5916  	return -EIO;
5917  }
5918  #endif /* CONFIG_SLUB_DEBUG */
5919