xref: /openbmc/linux/mm/slub.c (revision 1b69c6d0ae90b7f1a4f61d5c8209d5cb7a55f849)
1  /*
2   * SLUB: A slab allocator that limits cache line use instead of queuing
3   * objects in per cpu and per node lists.
4   *
5   * The allocator synchronizes using per slab locks or atomic operations
6   * and only uses a centralized lock to manage a pool of partial slabs.
7   *
8   * (C) 2007 SGI, Christoph Lameter
9   * (C) 2011 Linux Foundation, Christoph Lameter
10   */
11  
12  #include <linux/mm.h>
13  #include <linux/swap.h> /* struct reclaim_state */
14  #include <linux/module.h>
15  #include <linux/bit_spinlock.h>
16  #include <linux/interrupt.h>
17  #include <linux/bitops.h>
18  #include <linux/slab.h>
19  #include "slab.h"
20  #include <linux/proc_fs.h>
21  #include <linux/notifier.h>
22  #include <linux/seq_file.h>
23  #include <linux/kasan.h>
24  #include <linux/kmemcheck.h>
25  #include <linux/cpu.h>
26  #include <linux/cpuset.h>
27  #include <linux/mempolicy.h>
28  #include <linux/ctype.h>
29  #include <linux/debugobjects.h>
30  #include <linux/kallsyms.h>
31  #include <linux/memory.h>
32  #include <linux/math64.h>
33  #include <linux/fault-inject.h>
34  #include <linux/stacktrace.h>
35  #include <linux/prefetch.h>
36  #include <linux/memcontrol.h>
37  
38  #include <trace/events/kmem.h>
39  
40  #include "internal.h"
41  
42  /*
43   * Lock order:
44   *   1. slab_mutex (Global Mutex)
45   *   2. node->list_lock
46   *   3. slab_lock(page) (Only on some arches and for debugging)
47   *
48   *   slab_mutex
49   *
50   *   The role of the slab_mutex is to protect the list of all the slabs
51   *   and to synchronize major metadata changes to slab cache structures.
52   *
53   *   The slab_lock is only used for debugging and on arches that do not
54   *   have the ability to do a cmpxchg_double. It only protects the second
55   *   double word in the page struct. Meaning
56   *	A. page->freelist	-> List of free objects in a page
57   *	B. page->counters	-> Counters of objects
58   *	C. page->frozen		-> frozen state
59   *
60   *   If a slab is frozen then it is exempt from list management. It is not
61   *   on any list. The processor that froze the slab is the one who can
62   *   perform list operations on the page. Other processors may put objects
63   *   onto the freelist but the processor that froze the slab is the only
64   *   one that can retrieve the objects from the page's freelist.
65   *
66   *   The list_lock protects the partial and full list on each node and
67   *   the partial slab counter. If taken then no new slabs may be added to
68   *   or removed from the lists, nor may the number of partial slabs be modified.
69   *   (Note that the total number of slabs is an atomic value that may be
70   *   modified without taking the list lock).
71   *
72   *   The list_lock is a centralized lock and thus we avoid taking it as
73   *   much as possible. As long as SLUB does not have to handle partial
74   *   slabs, operations can continue without any centralized lock. F.e.
75   *   allocating a long series of objects that fill up slabs does not require
76   *   the list lock.
77   *   Interrupts are disabled during allocation and deallocation in order to
78   *   make the slab allocator safe to use in the context of an irq. In addition
79   *   interrupts are disabled to ensure that the processor does not change
80   *   while handling per_cpu slabs, due to kernel preemption.
81   *
82   * SLUB assigns one slab for allocation to each processor.
83   * Allocations only occur from these slabs called cpu slabs.
84   *
85   * Slabs with free elements are kept on a partial list and during regular
86   * operations no list for full slabs is used. If an object in a full slab is
87   * freed then the slab will show up again on the partial lists.
88   * We track full slabs for debugging purposes though because otherwise we
89   * cannot scan all objects.
90   *
91   * Slabs are freed when they become empty. Teardown and setup is
92   * minimal so we rely on the page allocators per cpu caches for
93   * fast frees and allocs.
94   *
95   * Overloading of page flags that are otherwise used for LRU management.
96   *
97   * PageActive 		The slab is frozen and exempt from list processing.
98   * 			This means that the slab is dedicated to a purpose
99   * 			such as satisfying allocations for a specific
100   * 			processor. Objects may be freed in the slab while
101   * 			it is frozen but slab_free will then skip the usual
102   * 			list operations. It is up to the processor holding
103   * 			the slab to integrate the slab into the slab lists
104   * 			when the slab is no longer needed.
105   *
106   * 			One use of this flag is to mark slabs that are
107   * 			used for allocations. Then such a slab becomes a cpu
108   * 			slab. The cpu slab may be equipped with an additional
109   * 			freelist that allows lockless access to
110   * 			free objects in addition to the regular freelist
111   * 			that requires the slab lock.
112   *
113   * PageError		Slab requires special handling due to debug
114   * 			options set. This moves	slab handling out of
115   * 			the fast path and disables lockless freelists.
116   */
117  
118  static inline int kmem_cache_debug(struct kmem_cache *s)
119  {
120  #ifdef CONFIG_SLUB_DEBUG
121  	return unlikely(s->flags & SLAB_DEBUG_FLAGS);
122  #else
123  	return 0;
124  #endif
125  }
126  
127  static inline bool kmem_cache_has_cpu_partial(struct kmem_cache *s)
128  {
129  #ifdef CONFIG_SLUB_CPU_PARTIAL
130  	return !kmem_cache_debug(s);
131  #else
132  	return false;
133  #endif
134  }
135  
136  /*
137   * Issues still to be resolved:
138   *
139   * - Support PAGE_ALLOC_DEBUG. Should be easy to do.
140   *
141   * - Variable sizing of the per node arrays
142   */
143  
144  /* Enable to test recovery from slab corruption on boot */
145  #undef SLUB_RESILIENCY_TEST
146  
147  /* Enable to log cmpxchg failures */
148  #undef SLUB_DEBUG_CMPXCHG
149  
150  /*
151   * Minimum number of partial slabs. These will be left on the partial
152   * lists even if they are empty. kmem_cache_shrink may reclaim them.
153   */
154  #define MIN_PARTIAL 5
155  
156  /*
157   * Maximum number of desirable partial slabs.
158   * The existence of more partial slabs makes kmem_cache_shrink
159   * sort the partial list by the number of objects in use.
160   */
161  #define MAX_PARTIAL 10
162  
163  #define DEBUG_DEFAULT_FLAGS (SLAB_DEBUG_FREE | SLAB_RED_ZONE | \
164  				SLAB_POISON | SLAB_STORE_USER)
165  
166  /*
167   * Debugging flags that require metadata to be stored in the slab.  These get
168   * disabled when slub_debug=O is used and a cache's min order increases with
169   * metadata.
170   */
171  #define DEBUG_METADATA_FLAGS (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER)
172  
173  #define OO_SHIFT	16
174  #define OO_MASK		((1 << OO_SHIFT) - 1)
175  #define MAX_OBJS_PER_PAGE	32767 /* since page.objects is u15 */
176  
177  /* Internal SLUB flags */
178  #define __OBJECT_POISON		0x80000000UL /* Poison object */
179  #define __CMPXCHG_DOUBLE	0x40000000UL /* Use cmpxchg_double */
180  
181  #ifdef CONFIG_SMP
182  static struct notifier_block slab_notifier;
183  #endif
184  
185  /*
186   * Tracking user of a slab.
187   */
188  #define TRACK_ADDRS_COUNT 16
189  struct track {
190  	unsigned long addr;	/* Called from address */
191  #ifdef CONFIG_STACKTRACE
192  	unsigned long addrs[TRACK_ADDRS_COUNT];	/* Called from address */
193  #endif
194  	int cpu;		/* Was running on cpu */
195  	int pid;		/* Pid context */
196  	unsigned long when;	/* When did the operation occur */
197  };
198  
199  enum track_item { TRACK_ALLOC, TRACK_FREE };
200  
201  #ifdef CONFIG_SYSFS
202  static int sysfs_slab_add(struct kmem_cache *);
203  static int sysfs_slab_alias(struct kmem_cache *, const char *);
204  static void memcg_propagate_slab_attrs(struct kmem_cache *s);
205  #else
206  static inline int sysfs_slab_add(struct kmem_cache *s) { return 0; }
207  static inline int sysfs_slab_alias(struct kmem_cache *s, const char *p)
208  							{ return 0; }
209  static inline void memcg_propagate_slab_attrs(struct kmem_cache *s) { }
210  #endif
211  
212  static inline void stat(const struct kmem_cache *s, enum stat_item si)
213  {
214  #ifdef CONFIG_SLUB_STATS
215  	/*
216  	 * The rmw is racy on a preemptible kernel but this is acceptable, so
217  	 * avoid this_cpu_add()'s irq-disable overhead.
218  	 */
219  	raw_cpu_inc(s->cpu_slab->stat[si]);
220  #endif
221  }
222  
223  /********************************************************************
224   * 			Core slab cache functions
225   *******************************************************************/
226  
227  /* Verify that a pointer has an address that is valid within a slab page */
228  static inline int check_valid_pointer(struct kmem_cache *s,
229  				struct page *page, const void *object)
230  {
231  	void *base;
232  
233  	if (!object)
234  		return 1;
235  
236  	base = page_address(page);
237  	if (object < base || object >= base + page->objects * s->size ||
238  		(object - base) % s->size) {
239  		return 0;
240  	}
241  
242  	return 1;
243  }
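
/*
 * Worked example (illustrative): with s->size == 64 and page->objects == 64,
 * the only valid object addresses are
 *
 *	base + 0, base + 64, base + 128, ..., base + 63 * 64
 *
 * A pointer such as base + 96 lies inside the slab page but fails the
 * (object - base) % s->size test above, so check_valid_pointer() rejects it.
 */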
244  
245  static inline void *get_freepointer(struct kmem_cache *s, void *object)
246  {
247  	return *(void **)(object + s->offset);
248  }
249  
250  static void prefetch_freepointer(const struct kmem_cache *s, void *object)
251  {
252  	prefetch(object + s->offset);
253  }
254  
255  static inline void *get_freepointer_safe(struct kmem_cache *s, void *object)
256  {
257  	void *p;
258  
259  #ifdef CONFIG_DEBUG_PAGEALLOC
260  	probe_kernel_read(&p, (void **)(object + s->offset), sizeof(p));
261  #else
262  	p = get_freepointer(s, object);
263  #endif
264  	return p;
265  }
266  
267  static inline void set_freepointer(struct kmem_cache *s, void *object, void *fp)
268  {
269  	*(void **)(object + s->offset) = fp;
270  }
271  
272  /* Loop over all objects in a slab */
273  #define for_each_object(__p, __s, __addr, __objects) \
274  	for (__p = (__addr); __p < (__addr) + (__objects) * (__s)->size;\
275  			__p += (__s)->size)
276  
277  #define for_each_object_idx(__p, __idx, __s, __addr, __objects) \
278  	for (__p = (__addr), __idx = 1; __idx <= __objects;\
279  			__p += (__s)->size, __idx++)
280  
281  /* Determine object index from a given position */
282  static inline int slab_index(void *p, struct kmem_cache *s, void *addr)
283  {
284  	return (p - addr) / s->size;
285  }
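
/*
 * Example (illustrative): for a slab starting at addr with s->size == 128,
 * for_each_object() visits addr, addr + 128, addr + 256, ... and
 * slab_index() maps those positions back to 0, 1, 2, ...; e.g.
 * slab_index(addr + 384, s, addr) == 3, which is the bit number used by the
 * debug bitmap helpers later in this file.
 */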
286  
287  static inline size_t slab_ksize(const struct kmem_cache *s)
288  {
289  #ifdef CONFIG_SLUB_DEBUG
290  	/*
291  	 * Debugging requires use of the padding between object
292  	 * and whatever may come after it.
293  	 */
294  	if (s->flags & (SLAB_RED_ZONE | SLAB_POISON))
295  		return s->object_size;
296  
297  #endif
298  	/*
299  	 * If we need to store the freelist pointer
300  	 * back there or track user information then we can
301  	 * only use the space before that information.
302  	 */
303  	if (s->flags & (SLAB_DESTROY_BY_RCU | SLAB_STORE_USER))
304  		return s->inuse;
305  	/*
306  	 * Else we can use all the padding etc for the allocation
307  	 */
308  	return s->size;
309  }
310  
311  static inline int order_objects(int order, unsigned long size, int reserved)
312  {
313  	return ((PAGE_SIZE << order) - reserved) / size;
314  }
315  
316  static inline struct kmem_cache_order_objects oo_make(int order,
317  		unsigned long size, int reserved)
318  {
319  	struct kmem_cache_order_objects x = {
320  		(order << OO_SHIFT) + order_objects(order, size, reserved)
321  	};
322  
323  	return x;
324  }
325  
326  static inline int oo_order(struct kmem_cache_order_objects x)
327  {
328  	return x.x >> OO_SHIFT;
329  }
330  
331  static inline int oo_objects(struct kmem_cache_order_objects x)
332  {
333  	return x.x & OO_MASK;
334  }
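
/*
 * Worked example (illustrative) of the order/objects packing on a system
 * with 4K pages, for order == 1, size == 256 and reserved == 0:
 *
 *	order_objects(1, 256, 0) == ((4096 << 1) - 0) / 256 == 32
 *	oo_make(1, 256, 0).x     == (1 << OO_SHIFT) + 32
 *
 * oo_order() then recovers 1 from the bits above OO_SHIFT and oo_objects()
 * recovers 32 from the low OO_MASK bits.
 */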
335  
336  /*
337   * Per slab locking using the pagelock
338   */
339  static __always_inline void slab_lock(struct page *page)
340  {
341  	bit_spin_lock(PG_locked, &page->flags);
342  }
343  
344  static __always_inline void slab_unlock(struct page *page)
345  {
346  	__bit_spin_unlock(PG_locked, &page->flags);
347  }
348  
349  static inline void set_page_slub_counters(struct page *page, unsigned long counters_new)
350  {
351  	struct page tmp;
352  	tmp.counters = counters_new;
353  	/*
354  	 * page->counters can cover frozen/inuse/objects as well
355  	 * as page->_count.  If we assign to ->counters directly
356  	 * we run the risk of losing updates to page->_count, so
357  	 * be careful and only assign to the fields we need.
358  	 */
359  	page->frozen  = tmp.frozen;
360  	page->inuse   = tmp.inuse;
361  	page->objects = tmp.objects;
362  }
363  
364  /* Interrupts must be disabled (for the fallback code to work right) */
365  static inline bool __cmpxchg_double_slab(struct kmem_cache *s, struct page *page,
366  		void *freelist_old, unsigned long counters_old,
367  		void *freelist_new, unsigned long counters_new,
368  		const char *n)
369  {
370  	VM_BUG_ON(!irqs_disabled());
371  #if defined(CONFIG_HAVE_CMPXCHG_DOUBLE) && \
372      defined(CONFIG_HAVE_ALIGNED_STRUCT_PAGE)
373  	if (s->flags & __CMPXCHG_DOUBLE) {
374  		if (cmpxchg_double(&page->freelist, &page->counters,
375  				   freelist_old, counters_old,
376  				   freelist_new, counters_new))
377  			return true;
378  	} else
379  #endif
380  	{
381  		slab_lock(page);
382  		if (page->freelist == freelist_old &&
383  					page->counters == counters_old) {
384  			page->freelist = freelist_new;
385  			set_page_slub_counters(page, counters_new);
386  			slab_unlock(page);
387  			return true;
388  		}
389  		slab_unlock(page);
390  	}
391  
392  	cpu_relax();
393  	stat(s, CMPXCHG_DOUBLE_FAIL);
394  
395  #ifdef SLUB_DEBUG_CMPXCHG
396  	pr_info("%s %s: cmpxchg double redo ", n, s->name);
397  #endif
398  
399  	return false;
400  }
401  
402  static inline bool cmpxchg_double_slab(struct kmem_cache *s, struct page *page,
403  		void *freelist_old, unsigned long counters_old,
404  		void *freelist_new, unsigned long counters_new,
405  		const char *n)
406  {
407  #if defined(CONFIG_HAVE_CMPXCHG_DOUBLE) && \
408      defined(CONFIG_HAVE_ALIGNED_STRUCT_PAGE)
409  	if (s->flags & __CMPXCHG_DOUBLE) {
410  		if (cmpxchg_double(&page->freelist, &page->counters,
411  				   freelist_old, counters_old,
412  				   freelist_new, counters_new))
413  			return true;
414  	} else
415  #endif
416  	{
417  		unsigned long flags;
418  
419  		local_irq_save(flags);
420  		slab_lock(page);
421  		if (page->freelist == freelist_old &&
422  					page->counters == counters_old) {
423  			page->freelist = freelist_new;
424  			set_page_slub_counters(page, counters_new);
425  			slab_unlock(page);
426  			local_irq_restore(flags);
427  			return true;
428  		}
429  		slab_unlock(page);
430  		local_irq_restore(flags);
431  	}
432  
433  	cpu_relax();
434  	stat(s, CMPXCHG_DOUBLE_FAIL);
435  
436  #ifdef SLUB_DEBUG_CMPXCHG
437  	pr_info("%s %s: cmpxchg double redo ", n, s->name);
438  #endif
439  
440  	return false;
441  }
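
/*
 * Callers use the two helpers above in a read/modify/retry loop: read the
 * current freelist and counters, build the desired values in a local
 * struct page, and retry until both words are swapped atomically.  A minimal
 * sketch of that pattern (illustrative only; acquire_slab() and
 * deactivate_slab() below are the real users):
 *
 *	do {
 *		old.freelist = page->freelist;
 *		old.counters = page->counters;
 *		new.counters = old.counters;
 *		... adjust new.inuse / new.frozen / new.freelist ...
 *	} while (!cmpxchg_double_slab(s, page,
 *			old.freelist, old.counters,
 *			new.freelist, new.counters,
 *			"example"));
 */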
442  
443  #ifdef CONFIG_SLUB_DEBUG
444  /*
445   * Determine a map of objects in use on a page.
446   *
447   * Node listlock must be held to guarantee that the page does
448   * not vanish from under us.
449   */
450  static void get_map(struct kmem_cache *s, struct page *page, unsigned long *map)
451  {
452  	void *p;
453  	void *addr = page_address(page);
454  
455  	for (p = page->freelist; p; p = get_freepointer(s, p))
456  		set_bit(slab_index(p, s, addr), map);
457  }
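
/*
 * Usage sketch (illustrative): callers hold n->list_lock, clear a bitmap of
 * page->objects bits, fill it with get_map() and then treat every object
 * whose bit is clear as currently allocated:
 *
 *	bitmap_zero(map, page->objects);
 *	get_map(s, page, map);
 *	for_each_object(p, s, addr, page->objects)
 *		if (!test_bit(slab_index(p, s, addr), map))
 *			... p is an object in use ...
 */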
458  
459  /*
460   * Debug settings:
461   */
462  #ifdef CONFIG_SLUB_DEBUG_ON
463  static int slub_debug = DEBUG_DEFAULT_FLAGS;
464  #else
465  static int slub_debug;
466  #endif
467  
468  static char *slub_debug_slabs;
469  static int disable_higher_order_debug;
470  
471  /*
472   * slub is about to manipulate internal object metadata.  This memory lies
473   * outside the range of the allocated object, so accessing it would normally
474   * be reported by kasan as a bounds error.  metadata_access_enable() is used
475   * to tell kasan that these accesses are OK.
476   */
477  static inline void metadata_access_enable(void)
478  {
479  	kasan_disable_current();
480  }
481  
482  static inline void metadata_access_disable(void)
483  {
484  	kasan_enable_current();
485  }
486  
487  /*
488   * Object debugging
489   */
490  static void print_section(char *text, u8 *addr, unsigned int length)
491  {
492  	metadata_access_enable();
493  	print_hex_dump(KERN_ERR, text, DUMP_PREFIX_ADDRESS, 16, 1, addr,
494  			length, 1);
495  	metadata_access_disable();
496  }
497  
498  static struct track *get_track(struct kmem_cache *s, void *object,
499  	enum track_item alloc)
500  {
501  	struct track *p;
502  
503  	if (s->offset)
504  		p = object + s->offset + sizeof(void *);
505  	else
506  		p = object + s->inuse;
507  
508  	return p + alloc;
509  }
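
/*
 * Layout note (illustrative): the two struct track records live back to
 * back in the object's metadata area, so with s->offset == 0 the
 * TRACK_ALLOC record starts at object + s->inuse and the TRACK_FREE record
 * immediately follows it.
 */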
510  
511  static void set_track(struct kmem_cache *s, void *object,
512  			enum track_item alloc, unsigned long addr)
513  {
514  	struct track *p = get_track(s, object, alloc);
515  
516  	if (addr) {
517  #ifdef CONFIG_STACKTRACE
518  		struct stack_trace trace;
519  		int i;
520  
521  		trace.nr_entries = 0;
522  		trace.max_entries = TRACK_ADDRS_COUNT;
523  		trace.entries = p->addrs;
524  		trace.skip = 3;
525  		metadata_access_enable();
526  		save_stack_trace(&trace);
527  		metadata_access_disable();
528  
529  		/* See rant in lockdep.c */
530  		if (trace.nr_entries != 0 &&
531  		    trace.entries[trace.nr_entries - 1] == ULONG_MAX)
532  			trace.nr_entries--;
533  
534  		for (i = trace.nr_entries; i < TRACK_ADDRS_COUNT; i++)
535  			p->addrs[i] = 0;
536  #endif
537  		p->addr = addr;
538  		p->cpu = smp_processor_id();
539  		p->pid = current->pid;
540  		p->when = jiffies;
541  	} else
542  		memset(p, 0, sizeof(struct track));
543  }
544  
545  static void init_tracking(struct kmem_cache *s, void *object)
546  {
547  	if (!(s->flags & SLAB_STORE_USER))
548  		return;
549  
550  	set_track(s, object, TRACK_FREE, 0UL);
551  	set_track(s, object, TRACK_ALLOC, 0UL);
552  }
553  
554  static void print_track(const char *s, struct track *t)
555  {
556  	if (!t->addr)
557  		return;
558  
559  	pr_err("INFO: %s in %pS age=%lu cpu=%u pid=%d\n",
560  	       s, (void *)t->addr, jiffies - t->when, t->cpu, t->pid);
561  #ifdef CONFIG_STACKTRACE
562  	{
563  		int i;
564  		for (i = 0; i < TRACK_ADDRS_COUNT; i++)
565  			if (t->addrs[i])
566  				pr_err("\t%pS\n", (void *)t->addrs[i]);
567  			else
568  				break;
569  	}
570  #endif
571  }
572  
573  static void print_tracking(struct kmem_cache *s, void *object)
574  {
575  	if (!(s->flags & SLAB_STORE_USER))
576  		return;
577  
578  	print_track("Allocated", get_track(s, object, TRACK_ALLOC));
579  	print_track("Freed", get_track(s, object, TRACK_FREE));
580  }
581  
582  static void print_page_info(struct page *page)
583  {
584  	pr_err("INFO: Slab 0x%p objects=%u used=%u fp=0x%p flags=0x%04lx\n",
585  	       page, page->objects, page->inuse, page->freelist, page->flags);
586  
587  }
588  
589  static void slab_bug(struct kmem_cache *s, char *fmt, ...)
590  {
591  	struct va_format vaf;
592  	va_list args;
593  
594  	va_start(args, fmt);
595  	vaf.fmt = fmt;
596  	vaf.va = &args;
597  	pr_err("=============================================================================\n");
598  	pr_err("BUG %s (%s): %pV\n", s->name, print_tainted(), &vaf);
599  	pr_err("-----------------------------------------------------------------------------\n\n");
600  
601  	add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
602  	va_end(args);
603  }
604  
605  static void slab_fix(struct kmem_cache *s, char *fmt, ...)
606  {
607  	struct va_format vaf;
608  	va_list args;
609  
610  	va_start(args, fmt);
611  	vaf.fmt = fmt;
612  	vaf.va = &args;
613  	pr_err("FIX %s: %pV\n", s->name, &vaf);
614  	va_end(args);
615  }
616  
617  static void print_trailer(struct kmem_cache *s, struct page *page, u8 *p)
618  {
619  	unsigned int off;	/* Offset of last byte */
620  	u8 *addr = page_address(page);
621  
622  	print_tracking(s, p);
623  
624  	print_page_info(page);
625  
626  	pr_err("INFO: Object 0x%p @offset=%tu fp=0x%p\n\n",
627  	       p, p - addr, get_freepointer(s, p));
628  
629  	if (p > addr + 16)
630  		print_section("Bytes b4 ", p - 16, 16);
631  
632  	print_section("Object ", p, min_t(unsigned long, s->object_size,
633  				PAGE_SIZE));
634  	if (s->flags & SLAB_RED_ZONE)
635  		print_section("Redzone ", p + s->object_size,
636  			s->inuse - s->object_size);
637  
638  	if (s->offset)
639  		off = s->offset + sizeof(void *);
640  	else
641  		off = s->inuse;
642  
643  	if (s->flags & SLAB_STORE_USER)
644  		off += 2 * sizeof(struct track);
645  
646  	if (off != s->size)
647  		/* Beginning of the filler is the free pointer */
648  		print_section("Padding ", p + off, s->size - off);
649  
650  	dump_stack();
651  }
652  
653  void object_err(struct kmem_cache *s, struct page *page,
654  			u8 *object, char *reason)
655  {
656  	slab_bug(s, "%s", reason);
657  	print_trailer(s, page, object);
658  }
659  
660  static void slab_err(struct kmem_cache *s, struct page *page,
661  			const char *fmt, ...)
662  {
663  	va_list args;
664  	char buf[100];
665  
666  	va_start(args, fmt);
667  	vsnprintf(buf, sizeof(buf), fmt, args);
668  	va_end(args);
669  	slab_bug(s, "%s", buf);
670  	print_page_info(page);
671  	dump_stack();
672  }
673  
674  static void init_object(struct kmem_cache *s, void *object, u8 val)
675  {
676  	u8 *p = object;
677  
678  	if (s->flags & __OBJECT_POISON) {
679  		memset(p, POISON_FREE, s->object_size - 1);
680  		p[s->object_size - 1] = POISON_END;
681  	}
682  
683  	if (s->flags & SLAB_RED_ZONE)
684  		memset(p + s->object_size, val, s->inuse - s->object_size);
685  }
686  
687  static void restore_bytes(struct kmem_cache *s, char *message, u8 data,
688  						void *from, void *to)
689  {
690  	slab_fix(s, "Restoring 0x%p-0x%p=0x%x\n", from, to - 1, data);
691  	memset(from, data, to - from);
692  }
693  
694  static int check_bytes_and_report(struct kmem_cache *s, struct page *page,
695  			u8 *object, char *what,
696  			u8 *start, unsigned int value, unsigned int bytes)
697  {
698  	u8 *fault;
699  	u8 *end;
700  
701  	metadata_access_enable();
702  	fault = memchr_inv(start, value, bytes);
703  	metadata_access_disable();
704  	if (!fault)
705  		return 1;
706  
707  	end = start + bytes;
708  	while (end > fault && end[-1] == value)
709  		end--;
710  
711  	slab_bug(s, "%s overwritten", what);
712  	pr_err("INFO: 0x%p-0x%p. First byte 0x%x instead of 0x%x\n",
713  					fault, end - 1, fault[0], value);
714  	print_trailer(s, page, object);
715  
716  	restore_bytes(s, what, value, fault, end);
717  	return 0;
718  }
719  
720  /*
721   * Object layout:
722   *
723   * object address
724   * 	Bytes of the object to be managed.
725   * 	If the freepointer may overlay the object then the free
726   * 	pointer is the first word of the object.
727   *
728   * 	Poisoning uses 0x6b (POISON_FREE) and the last byte is
729   * 	0xa5 (POISON_END)
730   *
731   * object + s->object_size
732   * 	Padding to reach word boundary. This is also used for Redzoning.
733   * 	Padding is extended by another word if Redzoning is enabled and
734   * 	object_size == inuse.
735   *
736   * 	We fill with 0xbb (RED_INACTIVE) for inactive objects and with
737   * 	0xcc (RED_ACTIVE) for objects in use.
738   *
739   * object + s->inuse
740   * 	Meta data starts here.
741   *
742   * 	A. Free pointer (if we cannot overwrite object on free)
743   * 	B. Tracking data for SLAB_STORE_USER
744   * 	C. Padding to reach required alignment boundary or at minimum
745   * 		one word if debugging is on to be able to detect writes
746   * 		before the word boundary.
747   *
748   *	Padding is done using 0x5a (POISON_INUSE)
749   *
750   * object + s->size
751   * 	Nothing is used beyond s->size.
752   *
753   * If slabcaches are merged then the object_size and inuse boundaries are mostly
754   * ignored. And therefore no slab options that rely on these boundaries
755   * may be used with merged slabcaches.
756   */
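
/*
 * Worked example (illustrative): object_size == 20 on a 64-bit machine.
 * Bytes [20, 24) pad the object to the word boundary and double as the red
 * zone, so inuse == 24.  With SLAB_STORE_USER, and assuming the free
 * pointer can overlay the object, the two struct track records follow at
 * offset 24, and any bytes left before s->size are POISON_INUSE padding.
 */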
757  
758  static int check_pad_bytes(struct kmem_cache *s, struct page *page, u8 *p)
759  {
760  	unsigned long off = s->inuse;	/* The end of info */
761  
762  	if (s->offset)
763  		/* Freepointer is placed after the object. */
764  		off += sizeof(void *);
765  
766  	if (s->flags & SLAB_STORE_USER)
767  		/* We also have user information there */
768  		off += 2 * sizeof(struct track);
769  
770  	if (s->size == off)
771  		return 1;
772  
773  	return check_bytes_and_report(s, page, p, "Object padding",
774  				p + off, POISON_INUSE, s->size - off);
775  }
776  
777  /* Check the pad bytes at the end of a slab page */
778  static int slab_pad_check(struct kmem_cache *s, struct page *page)
779  {
780  	u8 *start;
781  	u8 *fault;
782  	u8 *end;
783  	int length;
784  	int remainder;
785  
786  	if (!(s->flags & SLAB_POISON))
787  		return 1;
788  
789  	start = page_address(page);
790  	length = (PAGE_SIZE << compound_order(page)) - s->reserved;
791  	end = start + length;
792  	remainder = length % s->size;
793  	if (!remainder)
794  		return 1;
795  
796  	metadata_access_enable();
797  	fault = memchr_inv(end - remainder, POISON_INUSE, remainder);
798  	metadata_access_disable();
799  	if (!fault)
800  		return 1;
801  	while (end > fault && end[-1] == POISON_INUSE)
802  		end--;
803  
804  	slab_err(s, page, "Padding overwritten. 0x%p-0x%p", fault, end - 1);
805  	print_section("Padding ", end - remainder, remainder);
806  
807  	restore_bytes(s, "slab padding", POISON_INUSE, end - remainder, end);
808  	return 0;
809  }
810  
811  static int check_object(struct kmem_cache *s, struct page *page,
812  					void *object, u8 val)
813  {
814  	u8 *p = object;
815  	u8 *endobject = object + s->object_size;
816  
817  	if (s->flags & SLAB_RED_ZONE) {
818  		if (!check_bytes_and_report(s, page, object, "Redzone",
819  			endobject, val, s->inuse - s->object_size))
820  			return 0;
821  	} else {
822  		if ((s->flags & SLAB_POISON) && s->object_size < s->inuse) {
823  			check_bytes_and_report(s, page, p, "Alignment padding",
824  				endobject, POISON_INUSE,
825  				s->inuse - s->object_size);
826  		}
827  	}
828  
829  	if (s->flags & SLAB_POISON) {
830  		if (val != SLUB_RED_ACTIVE && (s->flags & __OBJECT_POISON) &&
831  			(!check_bytes_and_report(s, page, p, "Poison", p,
832  					POISON_FREE, s->object_size - 1) ||
833  			 !check_bytes_and_report(s, page, p, "Poison",
834  				p + s->object_size - 1, POISON_END, 1)))
835  			return 0;
836  		/*
837  		 * check_pad_bytes cleans up on its own.
838  		 */
839  		check_pad_bytes(s, page, p);
840  	}
841  
842  	if (!s->offset && val == SLUB_RED_ACTIVE)
843  		/*
844  		 * Object and freepointer overlap. Cannot check
845  		 * freepointer while object is allocated.
846  		 */
847  		return 1;
848  
849  	/* Check free pointer validity */
850  	if (!check_valid_pointer(s, page, get_freepointer(s, p))) {
851  		object_err(s, page, p, "Freepointer corrupt");
852  		/*
853  		 * No choice but to zap it and thus lose the remainder
854  		 * of the free objects in this slab. May cause
855  		 * another error because the object count is now wrong.
856  		 */
857  		set_freepointer(s, p, NULL);
858  		return 0;
859  	}
860  	return 1;
861  }
862  
863  static int check_slab(struct kmem_cache *s, struct page *page)
864  {
865  	int maxobj;
866  
867  	VM_BUG_ON(!irqs_disabled());
868  
869  	if (!PageSlab(page)) {
870  		slab_err(s, page, "Not a valid slab page");
871  		return 0;
872  	}
873  
874  	maxobj = order_objects(compound_order(page), s->size, s->reserved);
875  	if (page->objects > maxobj) {
876  		slab_err(s, page, "objects %u > max %u",
877  			page->objects, maxobj);
878  		return 0;
879  	}
880  	if (page->inuse > page->objects) {
881  		slab_err(s, page, "inuse %u > max %u",
882  			page->inuse, page->objects);
883  		return 0;
884  	}
885  	/* slab_pad_check() fixes things up after itself */
886  	slab_pad_check(s, page);
887  	return 1;
888  }
889  
890  /*
891   * Determine if a certain object on a page is on the freelist. Must hold the
892   * slab lock to guarantee that the chains are in a consistent state.
893   */
894  static int on_freelist(struct kmem_cache *s, struct page *page, void *search)
895  {
896  	int nr = 0;
897  	void *fp;
898  	void *object = NULL;
899  	int max_objects;
900  
901  	fp = page->freelist;
902  	while (fp && nr <= page->objects) {
903  		if (fp == search)
904  			return 1;
905  		if (!check_valid_pointer(s, page, fp)) {
906  			if (object) {
907  				object_err(s, page, object,
908  					"Freechain corrupt");
909  				set_freepointer(s, object, NULL);
910  			} else {
911  				slab_err(s, page, "Freepointer corrupt");
912  				page->freelist = NULL;
913  				page->inuse = page->objects;
914  				slab_fix(s, "Freelist cleared");
915  				return 0;
916  			}
917  			break;
918  		}
919  		object = fp;
920  		fp = get_freepointer(s, object);
921  		nr++;
922  	}
923  
924  	max_objects = order_objects(compound_order(page), s->size, s->reserved);
925  	if (max_objects > MAX_OBJS_PER_PAGE)
926  		max_objects = MAX_OBJS_PER_PAGE;
927  
928  	if (page->objects != max_objects) {
929  		slab_err(s, page, "Wrong number of objects. Found %d but "
930  			"should be %d", page->objects, max_objects);
931  		page->objects = max_objects;
932  		slab_fix(s, "Number of objects adjusted.");
933  	}
934  	if (page->inuse != page->objects - nr) {
935  		slab_err(s, page, "Wrong object count. Counter is %d but "
936  			"counted were %d", page->inuse, page->objects - nr);
937  		page->inuse = page->objects - nr;
938  		slab_fix(s, "Object count adjusted.");
939  	}
940  	return search == NULL;
941  }
942  
943  static void trace(struct kmem_cache *s, struct page *page, void *object,
944  								int alloc)
945  {
946  	if (s->flags & SLAB_TRACE) {
947  		pr_info("TRACE %s %s 0x%p inuse=%d fp=0x%p\n",
948  			s->name,
949  			alloc ? "alloc" : "free",
950  			object, page->inuse,
951  			page->freelist);
952  
953  		if (!alloc)
954  			print_section("Object ", (void *)object,
955  					s->object_size);
956  
957  		dump_stack();
958  	}
959  }
960  
961  /*
962   * Tracking of fully allocated slabs for debugging purposes.
963   */
964  static void add_full(struct kmem_cache *s,
965  	struct kmem_cache_node *n, struct page *page)
966  {
967  	if (!(s->flags & SLAB_STORE_USER))
968  		return;
969  
970  	lockdep_assert_held(&n->list_lock);
971  	list_add(&page->lru, &n->full);
972  }
973  
974  static void remove_full(struct kmem_cache *s, struct kmem_cache_node *n, struct page *page)
975  {
976  	if (!(s->flags & SLAB_STORE_USER))
977  		return;
978  
979  	lockdep_assert_held(&n->list_lock);
980  	list_del(&page->lru);
981  }
982  
983  /* Tracking of the number of slabs for debugging purposes */
984  static inline unsigned long slabs_node(struct kmem_cache *s, int node)
985  {
986  	struct kmem_cache_node *n = get_node(s, node);
987  
988  	return atomic_long_read(&n->nr_slabs);
989  }
990  
991  static inline unsigned long node_nr_slabs(struct kmem_cache_node *n)
992  {
993  	return atomic_long_read(&n->nr_slabs);
994  }
995  
996  static inline void inc_slabs_node(struct kmem_cache *s, int node, int objects)
997  {
998  	struct kmem_cache_node *n = get_node(s, node);
999  
1000  	/*
1001  	 * May be called early in order to allocate a slab for the
1002  	 * kmem_cache_node structure. Solve the chicken-egg
1003  	 * dilemma by deferring the increment of the count during
1004  	 * bootstrap (see early_kmem_cache_node_alloc).
1005  	 */
1006  	if (likely(n)) {
1007  		atomic_long_inc(&n->nr_slabs);
1008  		atomic_long_add(objects, &n->total_objects);
1009  	}
1010  }
1011  static inline void dec_slabs_node(struct kmem_cache *s, int node, int objects)
1012  {
1013  	struct kmem_cache_node *n = get_node(s, node);
1014  
1015  	atomic_long_dec(&n->nr_slabs);
1016  	atomic_long_sub(objects, &n->total_objects);
1017  }
1018  
1019  /* Object debug checks for alloc/free paths */
1020  static void setup_object_debug(struct kmem_cache *s, struct page *page,
1021  								void *object)
1022  {
1023  	if (!(s->flags & (SLAB_STORE_USER|SLAB_RED_ZONE|__OBJECT_POISON)))
1024  		return;
1025  
1026  	init_object(s, object, SLUB_RED_INACTIVE);
1027  	init_tracking(s, object);
1028  }
1029  
1030  static noinline int alloc_debug_processing(struct kmem_cache *s,
1031  					struct page *page,
1032  					void *object, unsigned long addr)
1033  {
1034  	if (!check_slab(s, page))
1035  		goto bad;
1036  
1037  	if (!check_valid_pointer(s, page, object)) {
1038  		object_err(s, page, object, "Freelist Pointer check fails");
1039  		goto bad;
1040  	}
1041  
1042  	if (!check_object(s, page, object, SLUB_RED_INACTIVE))
1043  		goto bad;
1044  
1045  	/* Success. Perform special debug activities for allocs */
1046  	if (s->flags & SLAB_STORE_USER)
1047  		set_track(s, object, TRACK_ALLOC, addr);
1048  	trace(s, page, object, 1);
1049  	init_object(s, object, SLUB_RED_ACTIVE);
1050  	return 1;
1051  
1052  bad:
1053  	if (PageSlab(page)) {
1054  		/*
1055  		 * If this is a slab page then let's do the best we can
1056  		 * to avoid issues in the future. Marking all objects
1057  		 * as used avoids touching the remaining objects.
1058  		 */
1059  		slab_fix(s, "Marking all objects used");
1060  		page->inuse = page->objects;
1061  		page->freelist = NULL;
1062  	}
1063  	return 0;
1064  }
1065  
1066  static noinline struct kmem_cache_node *free_debug_processing(
1067  	struct kmem_cache *s, struct page *page, void *object,
1068  	unsigned long addr, unsigned long *flags)
1069  {
1070  	struct kmem_cache_node *n = get_node(s, page_to_nid(page));
1071  
1072  	spin_lock_irqsave(&n->list_lock, *flags);
1073  	slab_lock(page);
1074  
1075  	if (!check_slab(s, page))
1076  		goto fail;
1077  
1078  	if (!check_valid_pointer(s, page, object)) {
1079  		slab_err(s, page, "Invalid object pointer 0x%p", object);
1080  		goto fail;
1081  	}
1082  
1083  	if (on_freelist(s, page, object)) {
1084  		object_err(s, page, object, "Object already free");
1085  		goto fail;
1086  	}
1087  
1088  	if (!check_object(s, page, object, SLUB_RED_ACTIVE))
1089  		goto out;
1090  
1091  	if (unlikely(s != page->slab_cache)) {
1092  		if (!PageSlab(page)) {
1093  			slab_err(s, page, "Attempt to free object(0x%p) "
1094  				"outside of slab", object);
1095  		} else if (!page->slab_cache) {
1096  			pr_err("SLUB <none>: no slab for object 0x%p.\n",
1097  			       object);
1098  			dump_stack();
1099  		} else
1100  			object_err(s, page, object,
1101  					"page slab pointer corrupt.");
1102  		goto fail;
1103  	}
1104  
1105  	if (s->flags & SLAB_STORE_USER)
1106  		set_track(s, object, TRACK_FREE, addr);
1107  	trace(s, page, object, 0);
1108  	init_object(s, object, SLUB_RED_INACTIVE);
1109  out:
1110  	slab_unlock(page);
1111  	/*
1112  	 * Keep node_lock to preserve integrity
1113  	 * until the object is actually freed
1114  	 */
1115  	return n;
1116  
1117  fail:
1118  	slab_unlock(page);
1119  	spin_unlock_irqrestore(&n->list_lock, *flags);
1120  	slab_fix(s, "Object at 0x%p not freed", object);
1121  	return NULL;
1122  }
1123  
1124  static int __init setup_slub_debug(char *str)
1125  {
1126  	slub_debug = DEBUG_DEFAULT_FLAGS;
1127  	if (*str++ != '=' || !*str)
1128  		/*
1129  		 * No options specified. Switch on full debugging.
1130  		 */
1131  		goto out;
1132  
1133  	if (*str == ',')
1134  		/*
1135  		 * No options but restriction on slabs. This means full
1136  		 * debugging for slabs matching a pattern.
1137  		 */
1138  		goto check_slabs;
1139  
1140  	slub_debug = 0;
1141  	if (*str == '-')
1142  		/*
1143  		 * Switch off all debugging measures.
1144  		 */
1145  		goto out;
1146  
1147  	/*
1148  	 * Determine which debug features should be switched on
1149  	 */
1150  	for (; *str && *str != ','; str++) {
1151  		switch (tolower(*str)) {
1152  		case 'f':
1153  			slub_debug |= SLAB_DEBUG_FREE;
1154  			break;
1155  		case 'z':
1156  			slub_debug |= SLAB_RED_ZONE;
1157  			break;
1158  		case 'p':
1159  			slub_debug |= SLAB_POISON;
1160  			break;
1161  		case 'u':
1162  			slub_debug |= SLAB_STORE_USER;
1163  			break;
1164  		case 't':
1165  			slub_debug |= SLAB_TRACE;
1166  			break;
1167  		case 'a':
1168  			slub_debug |= SLAB_FAILSLAB;
1169  			break;
1170  		case 'o':
1171  			/*
1172  			 * Avoid enabling debugging on caches if their minimum
1173  			 * order would increase as a result.
1174  			 */
1175  			disable_higher_order_debug = 1;
1176  			break;
1177  		default:
1178  			pr_err("slub_debug option '%c' unknown. skipped\n",
1179  			       *str);
1180  		}
1181  	}
1182  
1183  check_slabs:
1184  	if (*str == ',')
1185  		slub_debug_slabs = str + 1;
1186  out:
1187  	return 1;
1188  }
1189  
1190  __setup("slub_debug", setup_slub_debug);
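
/*
 * Boot parameter examples (illustrative), matching the option letters
 * parsed above:
 *
 *	slub_debug		enable full debugging for all caches
 *	slub_debug=FZP		sanity checks, red zoning and poisoning
 *	slub_debug=U,dentry	user tracking only for caches whose name
 *				starts with "dentry"
 *	slub_debug=-		switch all debugging off
 */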
1191  
1192  unsigned long kmem_cache_flags(unsigned long object_size,
1193  	unsigned long flags, const char *name,
1194  	void (*ctor)(void *))
1195  {
1196  	/*
1197  	 * Enable debugging if selected on the kernel command line.
1198  	 */
1199  	if (slub_debug && (!slub_debug_slabs || (name &&
1200  		!strncmp(slub_debug_slabs, name, strlen(slub_debug_slabs)))))
1201  		flags |= slub_debug;
1202  
1203  	return flags;
1204  }
1205  #else
1206  static inline void setup_object_debug(struct kmem_cache *s,
1207  			struct page *page, void *object) {}
1208  
1209  static inline int alloc_debug_processing(struct kmem_cache *s,
1210  	struct page *page, void *object, unsigned long addr) { return 0; }
1211  
1212  static inline struct kmem_cache_node *free_debug_processing(
1213  	struct kmem_cache *s, struct page *page, void *object,
1214  	unsigned long addr, unsigned long *flags) { return NULL; }
1215  
1216  static inline int slab_pad_check(struct kmem_cache *s, struct page *page)
1217  			{ return 1; }
1218  static inline int check_object(struct kmem_cache *s, struct page *page,
1219  			void *object, u8 val) { return 1; }
1220  static inline void add_full(struct kmem_cache *s, struct kmem_cache_node *n,
1221  					struct page *page) {}
1222  static inline void remove_full(struct kmem_cache *s, struct kmem_cache_node *n,
1223  					struct page *page) {}
1224  unsigned long kmem_cache_flags(unsigned long object_size,
1225  	unsigned long flags, const char *name,
1226  	void (*ctor)(void *))
1227  {
1228  	return flags;
1229  }
1230  #define slub_debug 0
1231  
1232  #define disable_higher_order_debug 0
1233  
1234  static inline unsigned long slabs_node(struct kmem_cache *s, int node)
1235  							{ return 0; }
1236  static inline unsigned long node_nr_slabs(struct kmem_cache_node *n)
1237  							{ return 0; }
1238  static inline void inc_slabs_node(struct kmem_cache *s, int node,
1239  							int objects) {}
1240  static inline void dec_slabs_node(struct kmem_cache *s, int node,
1241  							int objects) {}
1242  
1243  #endif /* CONFIG_SLUB_DEBUG */
1244  
1245  /*
1246   * Hooks for other subsystems that check memory allocations. In a typical
1247   * production configuration these hooks should all produce no code at all.
1248   */
1249  static inline void kmalloc_large_node_hook(void *ptr, size_t size, gfp_t flags)
1250  {
1251  	kmemleak_alloc(ptr, size, 1, flags);
1252  	kasan_kmalloc_large(ptr, size);
1253  }
1254  
1255  static inline void kfree_hook(const void *x)
1256  {
1257  	kmemleak_free(x);
1258  	kasan_kfree_large(x);
1259  }
1260  
1261  static inline struct kmem_cache *slab_pre_alloc_hook(struct kmem_cache *s,
1262  						     gfp_t flags)
1263  {
1264  	flags &= gfp_allowed_mask;
1265  	lockdep_trace_alloc(flags);
1266  	might_sleep_if(flags & __GFP_WAIT);
1267  
1268  	if (should_failslab(s->object_size, flags, s->flags))
1269  		return NULL;
1270  
1271  	return memcg_kmem_get_cache(s, flags);
1272  }
1273  
1274  static inline void slab_post_alloc_hook(struct kmem_cache *s,
1275  					gfp_t flags, void *object)
1276  {
1277  	flags &= gfp_allowed_mask;
1278  	kmemcheck_slab_alloc(s, flags, object, slab_ksize(s));
1279  	kmemleak_alloc_recursive(object, s->object_size, 1, s->flags, flags);
1280  	memcg_kmem_put_cache(s);
1281  	kasan_slab_alloc(s, object);
1282  }
1283  
1284  static inline void slab_free_hook(struct kmem_cache *s, void *x)
1285  {
1286  	kmemleak_free_recursive(x, s->flags);
1287  
1288  	/*
1289  	 * The trouble is that we may no longer disable interrupts in the fast path,
1290  	 * so in order to make the debug calls that expect irqs to be
1291  	 * disabled we need to disable interrupts temporarily.
1292  	 */
1293  #if defined(CONFIG_KMEMCHECK) || defined(CONFIG_LOCKDEP)
1294  	{
1295  		unsigned long flags;
1296  
1297  		local_irq_save(flags);
1298  		kmemcheck_slab_free(s, x, s->object_size);
1299  		debug_check_no_locks_freed(x, s->object_size);
1300  		local_irq_restore(flags);
1301  	}
1302  #endif
1303  	if (!(s->flags & SLAB_DEBUG_OBJECTS))
1304  		debug_check_no_obj_freed(x, s->object_size);
1305  
1306  	kasan_slab_free(s, x);
1307  }
1308  
1309  static void setup_object(struct kmem_cache *s, struct page *page,
1310  				void *object)
1311  {
1312  	setup_object_debug(s, page, object);
1313  	if (unlikely(s->ctor)) {
1314  		kasan_unpoison_object_data(s, object);
1315  		s->ctor(object);
1316  		kasan_poison_object_data(s, object);
1317  	}
1318  }
1319  
1320  /*
1321   * Slab allocation and freeing
1322   */
1323  static inline struct page *alloc_slab_page(struct kmem_cache *s,
1324  		gfp_t flags, int node, struct kmem_cache_order_objects oo)
1325  {
1326  	struct page *page;
1327  	int order = oo_order(oo);
1328  
1329  	flags |= __GFP_NOTRACK;
1330  
1331  	if (memcg_charge_slab(s, flags, order))
1332  		return NULL;
1333  
1334  	if (node == NUMA_NO_NODE)
1335  		page = alloc_pages(flags, order);
1336  	else
1337  		page = __alloc_pages_node(node, flags, order);
1338  
1339  	if (!page)
1340  		memcg_uncharge_slab(s, order);
1341  
1342  	return page;
1343  }
1344  
1345  static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
1346  {
1347  	struct page *page;
1348  	struct kmem_cache_order_objects oo = s->oo;
1349  	gfp_t alloc_gfp;
1350  	void *start, *p;
1351  	int idx, order;
1352  
1353  	flags &= gfp_allowed_mask;
1354  
1355  	if (flags & __GFP_WAIT)
1356  		local_irq_enable();
1357  
1358  	flags |= s->allocflags;
1359  
1360  	/*
1361  	 * Let the initial higher-order allocation fail under memory pressure
1362  	 * so we fall back to the minimum order allocation.
1363  	 */
1364  	alloc_gfp = (flags | __GFP_NOWARN | __GFP_NORETRY) & ~__GFP_NOFAIL;
1365  	if ((alloc_gfp & __GFP_WAIT) && oo_order(oo) > oo_order(s->min))
1366  		alloc_gfp = (alloc_gfp | __GFP_NOMEMALLOC) & ~__GFP_WAIT;
1367  
1368  	page = alloc_slab_page(s, alloc_gfp, node, oo);
1369  	if (unlikely(!page)) {
1370  		oo = s->min;
1371  		alloc_gfp = flags;
1372  		/*
1373  		 * Allocation may have failed due to fragmentation.
1374  		 * Try a lower order alloc if possible
1375  		 */
1376  		page = alloc_slab_page(s, alloc_gfp, node, oo);
1377  		if (unlikely(!page))
1378  			goto out;
1379  		stat(s, ORDER_FALLBACK);
1380  	}
1381  
1382  	if (kmemcheck_enabled &&
1383  	    !(s->flags & (SLAB_NOTRACK | DEBUG_DEFAULT_FLAGS))) {
1384  		int pages = 1 << oo_order(oo);
1385  
1386  		kmemcheck_alloc_shadow(page, oo_order(oo), alloc_gfp, node);
1387  
1388  		/*
1389  		 * Objects from caches that have a constructor don't get
1390  		 * cleared when they're allocated, so we need to do it here.
1391  		 */
1392  		if (s->ctor)
1393  			kmemcheck_mark_uninitialized_pages(page, pages);
1394  		else
1395  			kmemcheck_mark_unallocated_pages(page, pages);
1396  	}
1397  
1398  	page->objects = oo_objects(oo);
1399  
1400  	order = compound_order(page);
1401  	page->slab_cache = s;
1402  	__SetPageSlab(page);
1403  	if (page_is_pfmemalloc(page))
1404  		SetPageSlabPfmemalloc(page);
1405  
1406  	start = page_address(page);
1407  
1408  	if (unlikely(s->flags & SLAB_POISON))
1409  		memset(start, POISON_INUSE, PAGE_SIZE << order);
1410  
1411  	kasan_poison_slab(page);
1412  
1413  	for_each_object_idx(p, idx, s, start, page->objects) {
1414  		setup_object(s, page, p);
1415  		if (likely(idx < page->objects))
1416  			set_freepointer(s, p, p + s->size);
1417  		else
1418  			set_freepointer(s, p, NULL);
1419  	}
1420  
1421  	page->freelist = start;
1422  	page->inuse = page->objects;
1423  	page->frozen = 1;
1424  
1425  out:
1426  	if (flags & __GFP_WAIT)
1427  		local_irq_disable();
1428  	if (!page)
1429  		return NULL;
1430  
1431  	mod_zone_page_state(page_zone(page),
1432  		(s->flags & SLAB_RECLAIM_ACCOUNT) ?
1433  		NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE,
1434  		1 << oo_order(oo));
1435  
1436  	inc_slabs_node(s, page_to_nid(page), page->objects);
1437  
1438  	return page;
1439  }
1440  
1441  static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node)
1442  {
1443  	if (unlikely(flags & GFP_SLAB_BUG_MASK)) {
1444  		pr_emerg("gfp: %u\n", flags & GFP_SLAB_BUG_MASK);
1445  		BUG();
1446  	}
1447  
1448  	return allocate_slab(s,
1449  		flags & (GFP_RECLAIM_MASK | GFP_CONSTRAINT_MASK), node);
1450  }
1451  
1452  static void __free_slab(struct kmem_cache *s, struct page *page)
1453  {
1454  	int order = compound_order(page);
1455  	int pages = 1 << order;
1456  
1457  	if (kmem_cache_debug(s)) {
1458  		void *p;
1459  
1460  		slab_pad_check(s, page);
1461  		for_each_object(p, s, page_address(page),
1462  						page->objects)
1463  			check_object(s, page, p, SLUB_RED_INACTIVE);
1464  	}
1465  
1466  	kmemcheck_free_shadow(page, compound_order(page));
1467  
1468  	mod_zone_page_state(page_zone(page),
1469  		(s->flags & SLAB_RECLAIM_ACCOUNT) ?
1470  		NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE,
1471  		-pages);
1472  
1473  	__ClearPageSlabPfmemalloc(page);
1474  	__ClearPageSlab(page);
1475  
1476  	page_mapcount_reset(page);
1477  	if (current->reclaim_state)
1478  		current->reclaim_state->reclaimed_slab += pages;
1479  	__free_pages(page, order);
1480  	memcg_uncharge_slab(s, order);
1481  }
1482  
1483  #define need_reserve_slab_rcu						\
1484  	(sizeof(((struct page *)NULL)->lru) < sizeof(struct rcu_head))
1485  
1486  static void rcu_free_slab(struct rcu_head *h)
1487  {
1488  	struct page *page;
1489  
1490  	if (need_reserve_slab_rcu)
1491  		page = virt_to_head_page(h);
1492  	else
1493  		page = container_of((struct list_head *)h, struct page, lru);
1494  
1495  	__free_slab(page->slab_cache, page);
1496  }
1497  
1498  static void free_slab(struct kmem_cache *s, struct page *page)
1499  {
1500  	if (unlikely(s->flags & SLAB_DESTROY_BY_RCU)) {
1501  		struct rcu_head *head;
1502  
1503  		if (need_reserve_slab_rcu) {
1504  			int order = compound_order(page);
1505  			int offset = (PAGE_SIZE << order) - s->reserved;
1506  
1507  			VM_BUG_ON(s->reserved != sizeof(*head));
1508  			head = page_address(page) + offset;
1509  		} else {
1510  			/*
1511  			 * RCU free overloads the RCU head over the LRU
1512  			 */
1513  			head = (void *)&page->lru;
1514  		}
1515  
1516  		call_rcu(head, rcu_free_slab);
1517  	} else
1518  		__free_slab(s, page);
1519  }
1520  
1521  static void discard_slab(struct kmem_cache *s, struct page *page)
1522  {
1523  	dec_slabs_node(s, page_to_nid(page), page->objects);
1524  	free_slab(s, page);
1525  }
1526  
1527  /*
1528   * Management of partially allocated slabs.
1529   */
1530  static inline void
1531  __add_partial(struct kmem_cache_node *n, struct page *page, int tail)
1532  {
1533  	n->nr_partial++;
1534  	if (tail == DEACTIVATE_TO_TAIL)
1535  		list_add_tail(&page->lru, &n->partial);
1536  	else
1537  		list_add(&page->lru, &n->partial);
1538  }
1539  
1540  static inline void add_partial(struct kmem_cache_node *n,
1541  				struct page *page, int tail)
1542  {
1543  	lockdep_assert_held(&n->list_lock);
1544  	__add_partial(n, page, tail);
1545  }
1546  
1547  static inline void
1548  __remove_partial(struct kmem_cache_node *n, struct page *page)
1549  {
1550  	list_del(&page->lru);
1551  	n->nr_partial--;
1552  }
1553  
1554  static inline void remove_partial(struct kmem_cache_node *n,
1555  					struct page *page)
1556  {
1557  	lockdep_assert_held(&n->list_lock);
1558  	__remove_partial(n, page);
1559  }
1560  
1561  /*
1562   * Remove slab from the partial list, freeze it and
1563   * return the pointer to the freelist.
1564   *
1565   * Returns a list of objects or NULL if it fails.
1566   */
1567  static inline void *acquire_slab(struct kmem_cache *s,
1568  		struct kmem_cache_node *n, struct page *page,
1569  		int mode, int *objects)
1570  {
1571  	void *freelist;
1572  	unsigned long counters;
1573  	struct page new;
1574  
1575  	lockdep_assert_held(&n->list_lock);
1576  
1577  	/*
1578  	 * Zap the freelist and set the frozen bit.
1579  	 * The old freelist is the list of objects for the
1580  	 * per cpu allocation list.
1581  	 */
1582  	freelist = page->freelist;
1583  	counters = page->counters;
1584  	new.counters = counters;
1585  	*objects = new.objects - new.inuse;
1586  	if (mode) {
1587  		new.inuse = page->objects;
1588  		new.freelist = NULL;
1589  	} else {
1590  		new.freelist = freelist;
1591  	}
1592  
1593  	VM_BUG_ON(new.frozen);
1594  	new.frozen = 1;
1595  
1596  	if (!__cmpxchg_double_slab(s, page,
1597  			freelist, counters,
1598  			new.freelist, new.counters,
1599  			"acquire_slab"))
1600  		return NULL;
1601  
1602  	remove_partial(n, page);
1603  	WARN_ON(!freelist);
1604  	return freelist;
1605  }
1606  
1607  static void put_cpu_partial(struct kmem_cache *s, struct page *page, int drain);
1608  static inline bool pfmemalloc_match(struct page *page, gfp_t gfpflags);
1609  
1610  /*
1611   * Try to allocate a partial slab from a specific node.
1612   */
1613  static void *get_partial_node(struct kmem_cache *s, struct kmem_cache_node *n,
1614  				struct kmem_cache_cpu *c, gfp_t flags)
1615  {
1616  	struct page *page, *page2;
1617  	void *object = NULL;
1618  	int available = 0;
1619  	int objects;
1620  
1621  	/*
1622  	 * Racy check. If we mistakenly see no partial slabs then we
1623  	 * just allocate an empty slab. If we mistakenly try to get a
1624   * partial slab and there is none available then get_partial_node()
1625  	 * will return NULL.
1626  	 */
1627  	if (!n || !n->nr_partial)
1628  		return NULL;
1629  
1630  	spin_lock(&n->list_lock);
1631  	list_for_each_entry_safe(page, page2, &n->partial, lru) {
1632  		void *t;
1633  
1634  		if (!pfmemalloc_match(page, flags))
1635  			continue;
1636  
1637  		t = acquire_slab(s, n, page, object == NULL, &objects);
1638  		if (!t)
1639  			break;
1640  
1641  		available += objects;
1642  		if (!object) {
1643  			c->page = page;
1644  			stat(s, ALLOC_FROM_PARTIAL);
1645  			object = t;
1646  		} else {
1647  			put_cpu_partial(s, page, 0);
1648  			stat(s, CPU_PARTIAL_NODE);
1649  		}
1650  		if (!kmem_cache_has_cpu_partial(s)
1651  			|| available > s->cpu_partial / 2)
1652  			break;
1653  
1654  	}
1655  	spin_unlock(&n->list_lock);
1656  	return object;
1657  }
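
/*
 * Example (illustrative): with s->cpu_partial == 30 the loop above keeps
 * pulling partial slabs until more than 15 free objects have been gathered;
 * the first slab becomes the new cpu slab and the rest are parked on the
 * per cpu partial list via put_cpu_partial().
 */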
1658  
1659  /*
1660   * Get a page from somewhere. Search in increasing NUMA distances.
1661   */
1662  static void *get_any_partial(struct kmem_cache *s, gfp_t flags,
1663  		struct kmem_cache_cpu *c)
1664  {
1665  #ifdef CONFIG_NUMA
1666  	struct zonelist *zonelist;
1667  	struct zoneref *z;
1668  	struct zone *zone;
1669  	enum zone_type high_zoneidx = gfp_zone(flags);
1670  	void *object;
1671  	unsigned int cpuset_mems_cookie;
1672  
1673  	/*
1674  	 * The defrag ratio allows configuration of the tradeoff between
1675  	 * inter-node defragmentation and node-local allocations. A lower
1676  	 * defrag_ratio increases the tendency to do local allocations
1677  	 * instead of attempting to obtain partial slabs from other nodes.
1678  	 *
1679  	 * If the defrag_ratio is set to 0 then kmalloc() always
1680  	 * returns node local objects. If the ratio is higher then kmalloc()
1681  	 * may return off node objects because partial slabs are obtained
1682  	 * from other nodes and filled up.
1683  	 *
1684  	 * If /sys/kernel/slab/xx/defrag_ratio is set to 100 (which makes
1685  	 * defrag_ratio = 1000) then every (well almost) allocation will
1686  	 * first attempt to defrag slab caches on other nodes. This means
1687  	 * scanning over all nodes to look for partial slabs which may be
1688  	 * expensive if we do it every time we are trying to find a slab
1689  	 * with available objects.
1690  	 */
1691  	if (!s->remote_node_defrag_ratio ||
1692  			get_cycles() % 1024 > s->remote_node_defrag_ratio)
1693  		return NULL;
1694  
1695  	do {
1696  		cpuset_mems_cookie = read_mems_allowed_begin();
1697  		zonelist = node_zonelist(mempolicy_slab_node(), flags);
1698  		for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) {
1699  			struct kmem_cache_node *n;
1700  
1701  			n = get_node(s, zone_to_nid(zone));
1702  
1703  			if (n && cpuset_zone_allowed(zone, flags) &&
1704  					n->nr_partial > s->min_partial) {
1705  				object = get_partial_node(s, n, c, flags);
1706  				if (object) {
1707  					/*
1708  					 * Don't check read_mems_allowed_retry()
1709  					 * here - if mems_allowed was updated in
1710  					 * parallel, that was a harmless race
1711  					 * between allocation and the cpuset
1712  					 * update
1713  					 */
1714  					return object;
1715  				}
1716  			}
1717  		}
1718  	} while (read_mems_allowed_retry(cpuset_mems_cookie));
1719  #endif
1720  	return NULL;
1721  }
1722  
1723  /*
1724   * Get a partial page, lock it and return it.
1725   */
1726  static void *get_partial(struct kmem_cache *s, gfp_t flags, int node,
1727  		struct kmem_cache_cpu *c)
1728  {
1729  	void *object;
1730  	int searchnode = node;
1731  
1732  	if (node == NUMA_NO_NODE)
1733  		searchnode = numa_mem_id();
1734  	else if (!node_present_pages(node))
1735  		searchnode = node_to_mem_node(node);
1736  
1737  	object = get_partial_node(s, get_node(s, searchnode), c, flags);
1738  	if (object || node != NUMA_NO_NODE)
1739  		return object;
1740  
1741  	return get_any_partial(s, flags, c);
1742  }
1743  
1744  #ifdef CONFIG_PREEMPT
1745  /*
1746   * Calculate the next globally unique transaction for disambiguation
1747   * during cmpxchg. The transactions start with the cpu number and are then
1748   * incremented by CONFIG_NR_CPUS.
1749   */
1750  #define TID_STEP  roundup_pow_of_two(CONFIG_NR_CPUS)
1751  #else
1752  /*
1753   * No preemption is supported, therefore there is also no need to check for
1754   * different cpus.
1755   */
1756  #define TID_STEP 1
1757  #endif
1758  
1759  static inline unsigned long next_tid(unsigned long tid)
1760  {
1761  	return tid + TID_STEP;
1762  }
1763  
1764  static inline unsigned int tid_to_cpu(unsigned long tid)
1765  {
1766  	return tid % TID_STEP;
1767  }
1768  
1769  static inline unsigned long tid_to_event(unsigned long tid)
1770  {
1771  	return tid / TID_STEP;
1772  }
1773  
1774  static inline unsigned int init_tid(int cpu)
1775  {
1776  	return cpu;
1777  }
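
/*
 * Worked example (illustrative): with CONFIG_PREEMPT and
 * CONFIG_NR_CPUS == 4, TID_STEP is 4.  cpu 3 starts at tid 3 (init_tid)
 * and advances through 3, 7, 11, ... as transactions complete, so
 * tid_to_cpu(11) == 3 and tid_to_event(11) == 2.
 */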
1778  
1779  static inline void note_cmpxchg_failure(const char *n,
1780  		const struct kmem_cache *s, unsigned long tid)
1781  {
1782  #ifdef SLUB_DEBUG_CMPXCHG
1783  	unsigned long actual_tid = __this_cpu_read(s->cpu_slab->tid);
1784  
1785  	pr_info("%s %s: cmpxchg redo ", n, s->name);
1786  
1787  #ifdef CONFIG_PREEMPT
1788  	if (tid_to_cpu(tid) != tid_to_cpu(actual_tid))
1789  		pr_warn("due to cpu change %d -> %d\n",
1790  			tid_to_cpu(tid), tid_to_cpu(actual_tid));
1791  	else
1792  #endif
1793  	if (tid_to_event(tid) != tid_to_event(actual_tid))
1794  		pr_warn("due to cpu running other code. Event %ld->%ld\n",
1795  			tid_to_event(tid), tid_to_event(actual_tid));
1796  	else
1797  		pr_warn("for unknown reason: actual=%lx was=%lx target=%lx\n",
1798  			actual_tid, tid, next_tid(tid));
1799  #endif
1800  	stat(s, CMPXCHG_DOUBLE_CPU_FAIL);
1801  }
1802  
1803  static void init_kmem_cache_cpus(struct kmem_cache *s)
1804  {
1805  	int cpu;
1806  
1807  	for_each_possible_cpu(cpu)
1808  		per_cpu_ptr(s->cpu_slab, cpu)->tid = init_tid(cpu);
1809  }
1810  
1811  /*
1812   * Remove the cpu slab
1813   */
1814  static void deactivate_slab(struct kmem_cache *s, struct page *page,
1815  				void *freelist)
1816  {
1817  	enum slab_modes { M_NONE, M_PARTIAL, M_FULL, M_FREE };
1818  	struct kmem_cache_node *n = get_node(s, page_to_nid(page));
1819  	int lock = 0;
1820  	enum slab_modes l = M_NONE, m = M_NONE;
1821  	void *nextfree;
1822  	int tail = DEACTIVATE_TO_HEAD;
1823  	struct page new;
1824  	struct page old;
1825  
1826  	if (page->freelist) {
1827  		stat(s, DEACTIVATE_REMOTE_FREES);
1828  		tail = DEACTIVATE_TO_TAIL;
1829  	}
1830  
1831  	/*
1832  	 * Stage one: Free all available per cpu objects back
1833  	 * to the page freelist while it is still frozen. Leave the
1834  	 * last one.
1835  	 *
1836  	 * There is no need to take the node's list_lock because the page
1837  	 * is still frozen.
1838  	 */
1839  	while (freelist && (nextfree = get_freepointer(s, freelist))) {
1840  		void *prior;
1841  		unsigned long counters;
1842  
1843  		do {
1844  			prior = page->freelist;
1845  			counters = page->counters;
1846  			set_freepointer(s, freelist, prior);
1847  			new.counters = counters;
1848  			new.inuse--;
1849  			VM_BUG_ON(!new.frozen);
1850  
1851  		} while (!__cmpxchg_double_slab(s, page,
1852  			prior, counters,
1853  			freelist, new.counters,
1854  			"drain percpu freelist"));
1855  
1856  		freelist = nextfree;
1857  	}
1858  
1859  	/*
1860  	 * Stage two: Ensure that the page is unfrozen while the
1861  	 * list presence reflects the actual number of objects
1862  	 * during unfreeze.
1863  	 *
1864  	 * We set up the list membership and then perform a cmpxchg
1865  	 * with the count. If the cmpxchg fails then the page has not
1866  	 * been unfrozen, but it may already sit on the wrong list.
1867  	 *
1868  	 * Then we restart the process, which may have to remove
1869  	 * the page from the list that we just put it on again,
1870  	 * because the number of objects in the slab may have
1871  	 * changed.
1872  	 */
1873  redo:
1874  
1875  	old.freelist = page->freelist;
1876  	old.counters = page->counters;
1877  	VM_BUG_ON(!old.frozen);
1878  
1879  	/* Determine target state of the slab */
1880  	new.counters = old.counters;
1881  	if (freelist) {
1882  		new.inuse--;
1883  		set_freepointer(s, freelist, old.freelist);
1884  		new.freelist = freelist;
1885  	} else
1886  		new.freelist = old.freelist;
1887  
1888  	new.frozen = 0;
1889  
1890  	if (!new.inuse && n->nr_partial >= s->min_partial)
1891  		m = M_FREE;
1892  	else if (new.freelist) {
1893  		m = M_PARTIAL;
1894  		if (!lock) {
1895  			lock = 1;
1896  			/*
1897  			 * Taking the spinlock removes the possibility
1898  			 * that acquire_slab() will see a slab page that
1899  			 * is still frozen.
1900  			 */
1901  			spin_lock(&n->list_lock);
1902  		}
1903  	} else {
1904  		m = M_FULL;
1905  		if (kmem_cache_debug(s) && !lock) {
1906  			lock = 1;
1907  			/*
1908  			 * This also ensures that the scanning of full
1909  			 * slabs from diagnostic functions will not see
1910  			 * any frozen slabs.
1911  			 */
1912  			spin_lock(&n->list_lock);
1913  		}
1914  	}
1915  
1916  	if (l != m) {
1917  
1918  		if (l == M_PARTIAL)
1919  
1920  			remove_partial(n, page);
1921  
1922  		else if (l == M_FULL)
1923  
1924  			remove_full(s, n, page);
1925  
1926  		if (m == M_PARTIAL) {
1927  
1928  			add_partial(n, page, tail);
1929  			stat(s, tail);
1930  
1931  		} else if (m == M_FULL) {
1932  
1933  			stat(s, DEACTIVATE_FULL);
1934  			add_full(s, n, page);
1935  
1936  		}
1937  	}
1938  
1939  	l = m;
1940  	if (!__cmpxchg_double_slab(s, page,
1941  				old.freelist, old.counters,
1942  				new.freelist, new.counters,
1943  				"unfreezing slab"))
1944  		goto redo;
1945  
1946  	if (lock)
1947  		spin_unlock(&n->list_lock);
1948  
1949  	if (m == M_FREE) {
1950  		stat(s, DEACTIVATE_EMPTY);
1951  		discard_slab(s, page);
1952  		stat(s, FREE_SLAB);
1953  	}
1954  }
1955  
1956  /*
1957   * Unfreeze all the cpu partial slabs.
1958   *
1959   * This function must be called with interrupts disabled
1960   * for the cpu using c (or some other guarantee must exist that
1961   * there are no concurrent accesses).
1962   */
1963  static void unfreeze_partials(struct kmem_cache *s,
1964  		struct kmem_cache_cpu *c)
1965  {
1966  #ifdef CONFIG_SLUB_CPU_PARTIAL
1967  	struct kmem_cache_node *n = NULL, *n2 = NULL;
1968  	struct page *page, *discard_page = NULL;
1969  
1970  	while ((page = c->partial)) {
1971  		struct page new;
1972  		struct page old;
1973  
1974  		c->partial = page->next;
1975  
1976  		n2 = get_node(s, page_to_nid(page));
1977  		if (n != n2) {
1978  			if (n)
1979  				spin_unlock(&n->list_lock);
1980  
1981  			n = n2;
1982  			spin_lock(&n->list_lock);
1983  		}
1984  
1985  		do {
1986  
1987  			old.freelist = page->freelist;
1988  			old.counters = page->counters;
1989  			VM_BUG_ON(!old.frozen);
1990  
1991  			new.counters = old.counters;
1992  			new.freelist = old.freelist;
1993  
1994  			new.frozen = 0;
1995  
1996  		} while (!__cmpxchg_double_slab(s, page,
1997  				old.freelist, old.counters,
1998  				new.freelist, new.counters,
1999  				"unfreezing slab"));
2000  
2001  		if (unlikely(!new.inuse && n->nr_partial >= s->min_partial)) {
2002  			page->next = discard_page;
2003  			discard_page = page;
2004  		} else {
2005  			add_partial(n, page, DEACTIVATE_TO_TAIL);
2006  			stat(s, FREE_ADD_PARTIAL);
2007  		}
2008  	}
2009  
2010  	if (n)
2011  		spin_unlock(&n->list_lock);
2012  
2013  	while (discard_page) {
2014  		page = discard_page;
2015  		discard_page = discard_page->next;
2016  
2017  		stat(s, DEACTIVATE_EMPTY);
2018  		discard_slab(s, page);
2019  		stat(s, FREE_SLAB);
2020  	}
2021  #endif
2022  }
2023  
2024  /*
2025   * Put a page that was just frozen (in __slab_free) into a partial page
2026   * slot if available. This is done without disabling interrupts. The
2027   * cmpxchg is racy and may put the partial page onto a random cpu's
2028   * partial slot.
2029   *
2030   * If we did not find a slot then simply move all the partials to the
2031   * per node partial list.
2032   */
2033  static void put_cpu_partial(struct kmem_cache *s, struct page *page, int drain)
2034  {
2035  #ifdef CONFIG_SLUB_CPU_PARTIAL
2036  	struct page *oldpage;
2037  	int pages;
2038  	int pobjects;
2039  
2040  	preempt_disable();
2041  	do {
2042  		pages = 0;
2043  		pobjects = 0;
2044  		oldpage = this_cpu_read(s->cpu_slab->partial);
2045  
2046  		if (oldpage) {
2047  			pobjects = oldpage->pobjects;
2048  			pages = oldpage->pages;
2049  			if (drain && pobjects > s->cpu_partial) {
2050  				unsigned long flags;
2051  				/*
2052  				 * partial array is full. Move the existing
2053  				 * set to the per node partial list.
2054  				 */
2055  				local_irq_save(flags);
2056  				unfreeze_partials(s, this_cpu_ptr(s->cpu_slab));
2057  				local_irq_restore(flags);
2058  				oldpage = NULL;
2059  				pobjects = 0;
2060  				pages = 0;
2061  				stat(s, CPU_PARTIAL_DRAIN);
2062  			}
2063  		}
2064  
2065  		pages++;
2066  		pobjects += page->objects - page->inuse;
2067  
2068  		page->pages = pages;
2069  		page->pobjects = pobjects;
2070  		page->next = oldpage;
2071  
2072  	} while (this_cpu_cmpxchg(s->cpu_slab->partial, oldpage, page)
2073  								!= oldpage);
2074  	if (unlikely(!s->cpu_partial)) {
2075  		unsigned long flags;
2076  
2077  		local_irq_save(flags);
2078  		unfreeze_partials(s, this_cpu_ptr(s->cpu_slab));
2079  		local_irq_restore(flags);
2080  	}
2081  	preempt_enable();
2082  #endif
2083  }
2084  
2085  static inline void flush_slab(struct kmem_cache *s, struct kmem_cache_cpu *c)
2086  {
2087  	stat(s, CPUSLAB_FLUSH);
2088  	deactivate_slab(s, c->page, c->freelist);
2089  
2090  	c->tid = next_tid(c->tid);
2091  	c->page = NULL;
2092  	c->freelist = NULL;
2093  }
2094  
2095  /*
2096   * Flush cpu slab.
2097   *
2098   * Called from IPI handler with interrupts disabled.
2099   */
2100  static inline void __flush_cpu_slab(struct kmem_cache *s, int cpu)
2101  {
2102  	struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab, cpu);
2103  
2104  	if (likely(c)) {
2105  		if (c->page)
2106  			flush_slab(s, c);
2107  
2108  		unfreeze_partials(s, c);
2109  	}
2110  }
2111  
2112  static void flush_cpu_slab(void *d)
2113  {
2114  	struct kmem_cache *s = d;
2115  
2116  	__flush_cpu_slab(s, smp_processor_id());
2117  }
2118  
2119  static bool has_cpu_slab(int cpu, void *info)
2120  {
2121  	struct kmem_cache *s = info;
2122  	struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab, cpu);
2123  
2124  	return c->page || c->partial;
2125  }
2126  
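/*
 * Flush the cpu slab and the cpu partial list on every cpu that actually
 * has something to flush. on_each_cpu_cond() evaluates has_cpu_slab() as
 * the predicate so that idle caches do not receive a needless IPI.
 */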
2127  static void flush_all(struct kmem_cache *s)
2128  {
2129  	on_each_cpu_cond(has_cpu_slab, flush_cpu_slab, s, 1, GFP_ATOMIC);
2130  }
2131  
2132  /*
2133   * Check if the objects in a per cpu structure fit numa
2134   * locality expectations.
2135   */
2136  static inline int node_match(struct page *page, int node)
2137  {
2138  #ifdef CONFIG_NUMA
2139  	if (!page || (node != NUMA_NO_NODE && page_to_nid(page) != node))
2140  		return 0;
2141  #endif
2142  	return 1;
2143  }
2144  
2145  #ifdef CONFIG_SLUB_DEBUG
2146  static int count_free(struct page *page)
2147  {
2148  	return page->objects - page->inuse;
2149  }
2150  
2151  static inline unsigned long node_nr_objs(struct kmem_cache_node *n)
2152  {
2153  	return atomic_long_read(&n->total_objects);
2154  }
2155  #endif /* CONFIG_SLUB_DEBUG */
2156  
2157  #if defined(CONFIG_SLUB_DEBUG) || defined(CONFIG_SYSFS)
2158  static unsigned long count_partial(struct kmem_cache_node *n,
2159  					int (*get_count)(struct page *))
2160  {
2161  	unsigned long flags;
2162  	unsigned long x = 0;
2163  	struct page *page;
2164  
2165  	spin_lock_irqsave(&n->list_lock, flags);
2166  	list_for_each_entry(page, &n->partial, lru)
2167  		x += get_count(page);
2168  	spin_unlock_irqrestore(&n->list_lock, flags);
2169  	return x;
2170  }
2171  #endif /* CONFIG_SLUB_DEBUG || CONFIG_SYSFS */
2172  
2173  static noinline void
2174  slab_out_of_memory(struct kmem_cache *s, gfp_t gfpflags, int nid)
2175  {
2176  #ifdef CONFIG_SLUB_DEBUG
2177  	static DEFINE_RATELIMIT_STATE(slub_oom_rs, DEFAULT_RATELIMIT_INTERVAL,
2178  				      DEFAULT_RATELIMIT_BURST);
2179  	int node;
2180  	struct kmem_cache_node *n;
2181  
2182  	if ((gfpflags & __GFP_NOWARN) || !__ratelimit(&slub_oom_rs))
2183  		return;
2184  
2185  	pr_warn("SLUB: Unable to allocate memory on node %d (gfp=0x%x)\n",
2186  		nid, gfpflags);
2187  	pr_warn("  cache: %s, object size: %d, buffer size: %d, default order: %d, min order: %d\n",
2188  		s->name, s->object_size, s->size, oo_order(s->oo),
2189  		oo_order(s->min));
2190  
2191  	if (oo_order(s->min) > get_order(s->object_size))
2192  		pr_warn("  %s debugging increased min order, use slub_debug=O to disable.\n",
2193  			s->name);
2194  
2195  	for_each_kmem_cache_node(s, node, n) {
2196  		unsigned long nr_slabs;
2197  		unsigned long nr_objs;
2198  		unsigned long nr_free;
2199  
2200  		nr_free  = count_partial(n, count_free);
2201  		nr_slabs = node_nr_slabs(n);
2202  		nr_objs  = node_nr_objs(n);
2203  
2204  		pr_warn("  node %d: slabs: %ld, objs: %ld, free: %ld\n",
2205  			node, nr_slabs, nr_objs, nr_free);
2206  	}
2207  #endif
2208  }
2209  
2210  static inline void *new_slab_objects(struct kmem_cache *s, gfp_t flags,
2211  			int node, struct kmem_cache_cpu **pc)
2212  {
2213  	void *freelist;
2214  	struct kmem_cache_cpu *c = *pc;
2215  	struct page *page;
2216  
2217  	freelist = get_partial(s, flags, node, c);
2218  
2219  	if (freelist)
2220  		return freelist;
2221  
2222  	page = new_slab(s, flags, node);
2223  	if (page) {
2224  		c = raw_cpu_ptr(s->cpu_slab);
2225  		if (c->page)
2226  			flush_slab(s, c);
2227  
2228  		/*
2229  		 * No other reference to the page yet so we can
2230  		 * muck around with it freely without cmpxchg
2231  		 */
2232  		freelist = page->freelist;
2233  		page->freelist = NULL;
2234  
2235  		stat(s, ALLOC_SLAB);
2236  		c->page = page;
2237  		*pc = c;
2238  	} else
2239  		freelist = NULL;
2240  
2241  	return freelist;
2242  }
2243  
2244  static inline bool pfmemalloc_match(struct page *page, gfp_t gfpflags)
2245  {
2246  	if (unlikely(PageSlabPfmemalloc(page)))
2247  		return gfp_pfmemalloc_allowed(gfpflags);
2248  
2249  	return true;
2250  }
2251  
2252  /*
2253   * Check the page->freelist of a page and either transfer the freelist to the
2254   * per cpu freelist or deactivate the page.
2255   *
2256   * The page is still frozen if the return value is not NULL.
2257   *
2258   * If this function returns NULL then the page has been unfrozen.
2259   *
2260   * This function must be called with interrupts disabled.
2261   */
2262  static inline void *get_freelist(struct kmem_cache *s, struct page *page)
2263  {
2264  	struct page new;
2265  	unsigned long counters;
2266  	void *freelist;
2267  
2268  	do {
2269  		freelist = page->freelist;
2270  		counters = page->counters;
2271  
2272  		new.counters = counters;
2273  		VM_BUG_ON(!new.frozen);
2274  
2275  		new.inuse = page->objects;
2276  		new.frozen = freelist != NULL;
2277  
2278  	} while (!__cmpxchg_double_slab(s, page,
2279  		freelist, counters,
2280  		NULL, new.counters,
2281  		"get_freelist"));
2282  
2283  	return freelist;
2284  }
2285  
2286  /*
2287   * Slow path. The lockless freelist is empty or we need to perform
2288   * debugging duties.
2289   *
2290   * Processing is still very fast if new objects have been freed to the
2291   * regular freelist. In that case we simply take over the regular freelist
2292   * as the lockless freelist and zap the regular freelist.
2293   *
2294   * If that is not working then we fall back to the partial lists. We take the
2295   * first element of the freelist as the object to allocate now and move the
2296   * rest of the freelist to the lockless freelist.
2297   *
2298   * And if we were unable to get a new slab from the partial slab lists then
2299   * we need to allocate a new slab. This is the slowest path since it involves
2300   * a call to the page allocator and the setup of a new slab.
2301   */
2302  static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
2303  			  unsigned long addr, struct kmem_cache_cpu *c)
2304  {
2305  	void *freelist;
2306  	struct page *page;
2307  	unsigned long flags;
2308  
2309  	local_irq_save(flags);
2310  #ifdef CONFIG_PREEMPT
2311  	/*
2312  	 * We may have been preempted and rescheduled on a different
2313  	 * cpu before disabling interrupts. Need to reload cpu area
2314  	 * pointer.
2315  	 */
2316  	c = this_cpu_ptr(s->cpu_slab);
2317  #endif
2318  
2319  	page = c->page;
2320  	if (!page)
2321  		goto new_slab;
2322  redo:
2323  
2324  	if (unlikely(!node_match(page, node))) {
2325  		int searchnode = node;
2326  
2327  		if (node != NUMA_NO_NODE && !node_present_pages(node))
2328  			searchnode = node_to_mem_node(node);
2329  
2330  		if (unlikely(!node_match(page, searchnode))) {
2331  			stat(s, ALLOC_NODE_MISMATCH);
2332  			deactivate_slab(s, page, c->freelist);
2333  			c->page = NULL;
2334  			c->freelist = NULL;
2335  			goto new_slab;
2336  		}
2337  	}
2338  
2339  	/*
2340  	 * By rights, we should be searching for a slab page that was
2341  	 * PFMEMALLOC but right now, we are losing the pfmemalloc
2342  	 * information when the page leaves the per-cpu allocator
2343  	 */
2344  	if (unlikely(!pfmemalloc_match(page, gfpflags))) {
2345  		deactivate_slab(s, page, c->freelist);
2346  		c->page = NULL;
2347  		c->freelist = NULL;
2348  		goto new_slab;
2349  	}
2350  
2351  	/* must check c->freelist again in case of cpu migration or IRQ */
2352  	freelist = c->freelist;
2353  	if (freelist)
2354  		goto load_freelist;
2355  
2356  	freelist = get_freelist(s, page);
2357  
2358  	if (!freelist) {
2359  		c->page = NULL;
2360  		stat(s, DEACTIVATE_BYPASS);
2361  		goto new_slab;
2362  	}
2363  
2364  	stat(s, ALLOC_REFILL);
2365  
2366  load_freelist:
2367  	/*
2368  	 * freelist is pointing to the list of objects to be used.
2369  	 * page is pointing to the page from which the objects are obtained.
2370  	 * That page must be frozen for per cpu allocations to work.
2371  	 */
2372  	VM_BUG_ON(!c->page->frozen);
2373  	c->freelist = get_freepointer(s, freelist);
2374  	c->tid = next_tid(c->tid);
2375  	local_irq_restore(flags);
2376  	return freelist;
2377  
2378  new_slab:
2379  
2380  	if (c->partial) {
2381  		page = c->page = c->partial;
2382  		c->partial = page->next;
2383  		stat(s, CPU_PARTIAL_ALLOC);
2384  		c->freelist = NULL;
2385  		goto redo;
2386  	}
2387  
2388  	freelist = new_slab_objects(s, gfpflags, node, &c);
2389  
2390  	if (unlikely(!freelist)) {
2391  		slab_out_of_memory(s, gfpflags, node);
2392  		local_irq_restore(flags);
2393  		return NULL;
2394  	}
2395  
2396  	page = c->page;
2397  	if (likely(!kmem_cache_debug(s) && pfmemalloc_match(page, gfpflags)))
2398  		goto load_freelist;
2399  
2400  	/* Only entered in the debug case */
2401  	if (kmem_cache_debug(s) &&
2402  			!alloc_debug_processing(s, page, freelist, addr))
2403  		goto new_slab;	/* Slab failed checks. Next slab needed */
2404  
2405  	deactivate_slab(s, page, get_freepointer(s, freelist));
2406  	c->page = NULL;
2407  	c->freelist = NULL;
2408  	local_irq_restore(flags);
2409  	return freelist;
2410  }
2411  
2412  /*
2413   * Inlined fastpath so that allocation functions (kmalloc, kmem_cache_alloc)
2414   * have the fastpath folded into their functions. So no function call
2415   * overhead for requests that can be satisfied on the fastpath.
2416   *
2417   * The fastpath works by first checking if the lockless freelist can be used.
2418   * If not then __slab_alloc is called for slow processing.
2419   *
2420   * Otherwise we can simply pick the next object from the lockless free list.
2421   */
2422  static __always_inline void *slab_alloc_node(struct kmem_cache *s,
2423  		gfp_t gfpflags, int node, unsigned long addr)
2424  {
2425  	void **object;
2426  	struct kmem_cache_cpu *c;
2427  	struct page *page;
2428  	unsigned long tid;
2429  
2430  	s = slab_pre_alloc_hook(s, gfpflags);
2431  	if (!s)
2432  		return NULL;
2433  redo:
2434  	/*
2435  	 * Must read kmem_cache cpu data via this cpu ptr. Preemption is
2436  	 * enabled. We may switch back and forth between cpus while
2437  	 * reading from one cpu area. That does not matter as long
2438  	 * as we end up on the original cpu again when doing the cmpxchg.
2439  	 *
2440  	 * We should guarantee that tid and kmem_cache_cpu are retrieved on
2441  	 * the same cpu. They could differ if CONFIG_PREEMPT is set, so we
2442  	 * need to check that they match.
2443  	 */
2444  	do {
2445  		tid = this_cpu_read(s->cpu_slab->tid);
2446  		c = raw_cpu_ptr(s->cpu_slab);
2447  	} while (IS_ENABLED(CONFIG_PREEMPT) &&
2448  		 unlikely(tid != READ_ONCE(c->tid)));
2449  
2450  	/*
2451  	 * The irqless object alloc/free algorithm used here depends on the
2452  	 * sequence of fetching cpu_slab's data. tid should be fetched before
2453  	 * anything on c to guarantee that the object and page associated with
2454  	 * the previous tid won't be used with the current tid. If we fetch tid
2455  	 * first, the object and page could be ones associated with the next tid
2456  	 * and our alloc/free request will fail. In that case we simply retry.
2457  	 */
2458  	barrier();
2459  
2460  	/*
2461  	 * The transaction ids are globally unique per cpu and per operation on
2462  	 * a per cpu queue. Thus they guarantee that the cmpxchg_double
2463  	 * occurs on the right processor and that there was no operation on the
2464  	 * linked list in between.
2465  	 */
2466  
2467  	object = c->freelist;
2468  	page = c->page;
2469  	if (unlikely(!object || !node_match(page, node))) {
2470  		object = __slab_alloc(s, gfpflags, node, addr, c);
2471  		stat(s, ALLOC_SLOWPATH);
2472  	} else {
2473  		void *next_object = get_freepointer_safe(s, object);
2474  
2475  		/*
2476  		 * The cmpxchg will only match if there was no additional
2477  		 * operation and if we are on the right processor.
2478  		 *
2479  		 * The cmpxchg does the following atomically (without lock
2480  		 * semantics!)
2481  		 * 1. Relocate first pointer to the current per cpu area.
2482  		 * 2. Verify that tid and freelist have not been changed
2483  		 * 3. If they were not changed replace tid and freelist
2484  		 *
2485  		 * Since this is without lock semantics the protection is only
2486  		 * against code executing on this cpu *not* from access by
2487  		 * other cpus.
2488  		 */
2489  		if (unlikely(!this_cpu_cmpxchg_double(
2490  				s->cpu_slab->freelist, s->cpu_slab->tid,
2491  				object, tid,
2492  				next_object, next_tid(tid)))) {
2493  
2494  			note_cmpxchg_failure("slab_alloc", s, tid);
2495  			goto redo;
2496  		}
2497  		prefetch_freepointer(s, next_object);
2498  		stat(s, ALLOC_FASTPATH);
2499  	}
2500  
2501  	if (unlikely(gfpflags & __GFP_ZERO) && object)
2502  		memset(object, 0, s->object_size);
2503  
2504  	slab_post_alloc_hook(s, gfpflags, object);
2505  
2506  	return object;
2507  }
2508  
2509  static __always_inline void *slab_alloc(struct kmem_cache *s,
2510  		gfp_t gfpflags, unsigned long addr)
2511  {
2512  	return slab_alloc_node(s, gfpflags, NUMA_NO_NODE, addr);
2513  }
2514  
2515  void *kmem_cache_alloc(struct kmem_cache *s, gfp_t gfpflags)
2516  {
2517  	void *ret = slab_alloc(s, gfpflags, _RET_IP_);
2518  
2519  	trace_kmem_cache_alloc(_RET_IP_, ret, s->object_size,
2520  				s->size, gfpflags);
2521  
2522  	return ret;
2523  }
2524  EXPORT_SYMBOL(kmem_cache_alloc);
2525  
2526  #ifdef CONFIG_TRACING
2527  void *kmem_cache_alloc_trace(struct kmem_cache *s, gfp_t gfpflags, size_t size)
2528  {
2529  	void *ret = slab_alloc(s, gfpflags, _RET_IP_);
2530  	trace_kmalloc(_RET_IP_, ret, size, s->size, gfpflags);
2531  	kasan_kmalloc(s, ret, size);
2532  	return ret;
2533  }
2534  EXPORT_SYMBOL(kmem_cache_alloc_trace);
2535  #endif
2536  
2537  #ifdef CONFIG_NUMA
2538  void *kmem_cache_alloc_node(struct kmem_cache *s, gfp_t gfpflags, int node)
2539  {
2540  	void *ret = slab_alloc_node(s, gfpflags, node, _RET_IP_);
2541  
2542  	trace_kmem_cache_alloc_node(_RET_IP_, ret,
2543  				    s->object_size, s->size, gfpflags, node);
2544  
2545  	return ret;
2546  }
2547  EXPORT_SYMBOL(kmem_cache_alloc_node);
2548  
2549  #ifdef CONFIG_TRACING
2550  void *kmem_cache_alloc_node_trace(struct kmem_cache *s,
2551  				    gfp_t gfpflags,
2552  				    int node, size_t size)
2553  {
2554  	void *ret = slab_alloc_node(s, gfpflags, node, _RET_IP_);
2555  
2556  	trace_kmalloc_node(_RET_IP_, ret,
2557  			   size, s->size, gfpflags, node);
2558  
2559  	kasan_kmalloc(s, ret, size);
2560  	return ret;
2561  }
2562  EXPORT_SYMBOL(kmem_cache_alloc_node_trace);
2563  #endif
2564  #endif
2565  
2566  /*
2567   * Slow path handling. This may still be called frequently since objects
2568   * have a longer lifetime than the cpu slabs in most processing loads.
2569   *
2570   * So we still attempt to reduce cache line usage. Just take the slab
2571   * lock and free the item. If there is no additional partial page
2572   * handling required then we can return immediately.
2573   */
2574  static void __slab_free(struct kmem_cache *s, struct page *page,
2575  			void *x, unsigned long addr)
2576  {
2577  	void *prior;
2578  	void **object = (void *)x;
2579  	int was_frozen;
2580  	struct page new;
2581  	unsigned long counters;
2582  	struct kmem_cache_node *n = NULL;
2583  	unsigned long uninitialized_var(flags);
2584  
2585  	stat(s, FREE_SLOWPATH);
2586  
2587  	if (kmem_cache_debug(s) &&
2588  		!(n = free_debug_processing(s, page, x, addr, &flags)))
2589  		return;
2590  
2591  	do {
2592  		if (unlikely(n)) {
2593  			spin_unlock_irqrestore(&n->list_lock, flags);
2594  			n = NULL;
2595  		}
2596  		prior = page->freelist;
2597  		counters = page->counters;
2598  		set_freepointer(s, object, prior);
2599  		new.counters = counters;
2600  		was_frozen = new.frozen;
2601  		new.inuse--;
2602  		if ((!new.inuse || !prior) && !was_frozen) {
2603  
2604  			if (kmem_cache_has_cpu_partial(s) && !prior) {
2605  
2606  				/*
2607  				 * Slab was on no list before and will be
2608  				 * partially empty.
2609  				 * We can defer the list move and instead
2610  				 * freeze it.
2611  				 */
2612  				new.frozen = 1;
2613  
2614  			} else { /* Needs to be taken off a list */
2615  
2616  				n = get_node(s, page_to_nid(page));
2617  				/*
2618  				 * Speculatively acquire the list_lock.
2619  				 * If the cmpxchg does not succeed then we may
2620  				 * drop the list_lock without any processing.
2621  				 *
2622  				 * Otherwise the list_lock will synchronize with
2623  				 * other processors updating the list of slabs.
2624  				 */
2625  				spin_lock_irqsave(&n->list_lock, flags);
2626  
2627  			}
2628  		}
2629  
2630  	} while (!cmpxchg_double_slab(s, page,
2631  		prior, counters,
2632  		object, new.counters,
2633  		"__slab_free"));
2634  
2635  	if (likely(!n)) {
2636  
2637  		/*
2638  		 * If we just froze the page then put it onto the
2639  		 * per cpu partial list.
2640  		 */
2641  		if (new.frozen && !was_frozen) {
2642  			put_cpu_partial(s, page, 1);
2643  			stat(s, CPU_PARTIAL_FREE);
2644  		}
2645  		/*
2646  		 * The list lock was not taken therefore no list
2647  		 * activity can be necessary.
2648  		 */
2649  		if (was_frozen)
2650  			stat(s, FREE_FROZEN);
2651  		return;
2652  	}
2653  
2654  	if (unlikely(!new.inuse && n->nr_partial >= s->min_partial))
2655  		goto slab_empty;
2656  
2657  	/*
2658  	 * Objects left in the slab. If it was not on the partial list before
2659  	 * then add it.
2660  	 */
2661  	if (!kmem_cache_has_cpu_partial(s) && unlikely(!prior)) {
2662  		if (kmem_cache_debug(s))
2663  			remove_full(s, n, page);
2664  		add_partial(n, page, DEACTIVATE_TO_TAIL);
2665  		stat(s, FREE_ADD_PARTIAL);
2666  	}
2667  	spin_unlock_irqrestore(&n->list_lock, flags);
2668  	return;
2669  
2670  slab_empty:
2671  	if (prior) {
2672  		/*
2673  		 * Slab on the partial list.
2674  		 */
2675  		remove_partial(n, page);
2676  		stat(s, FREE_REMOVE_PARTIAL);
2677  	} else {
2678  		/* Slab must be on the full list */
2679  		remove_full(s, n, page);
2680  	}
2681  
2682  	spin_unlock_irqrestore(&n->list_lock, flags);
2683  	stat(s, FREE_SLAB);
2684  	discard_slab(s, page);
2685  }
2686  
2687  /*
2688   * Fastpath with forced inlining to produce a kfree and kmem_cache_free that
2689   * can perform fastpath freeing without additional function calls.
2690   *
2691   * The fastpath is only possible if we are freeing to the current cpu slab
2692   * of this processor. This is typically the case if we have just allocated
2693   * the item before.
2694   *
2695   * If fastpath is not possible then fall back to __slab_free where we deal
2696   * with all sorts of special processing.
2697   */
2698  static __always_inline void slab_free(struct kmem_cache *s,
2699  			struct page *page, void *x, unsigned long addr)
2700  {
2701  	void **object = (void *)x;
2702  	struct kmem_cache_cpu *c;
2703  	unsigned long tid;
2704  
2705  	slab_free_hook(s, x);
2706  
2707  redo:
2708  	/*
2709  	 * Determine the current cpu's per cpu slab.
2710  	 * The cpu may change afterward. However that does not matter since
2711  	 * data is retrieved via this pointer. If we are on the same cpu
2712  	 * during the cmpxchg then the free will succeed.
2713  	 */
2714  	do {
2715  		tid = this_cpu_read(s->cpu_slab->tid);
2716  		c = raw_cpu_ptr(s->cpu_slab);
2717  	} while (IS_ENABLED(CONFIG_PREEMPT) &&
2718  		 unlikely(tid != READ_ONCE(c->tid)));
2719  
2720  	/* Same with comment on barrier() in slab_alloc_node() */
2721  	barrier();
2722  
2723  	if (likely(page == c->page)) {
2724  		set_freepointer(s, object, c->freelist);
2725  
2726  		if (unlikely(!this_cpu_cmpxchg_double(
2727  				s->cpu_slab->freelist, s->cpu_slab->tid,
2728  				c->freelist, tid,
2729  				object, next_tid(tid)))) {
2730  
2731  			note_cmpxchg_failure("slab_free", s, tid);
2732  			goto redo;
2733  		}
2734  		stat(s, FREE_FASTPATH);
2735  	} else
2736  		__slab_free(s, page, x, addr);
2737  
2738  }
2739  
2740  void kmem_cache_free(struct kmem_cache *s, void *x)
2741  {
2742  	s = cache_from_obj(s, x);
2743  	if (!s)
2744  		return;
2745  	slab_free(s, virt_to_head_page(x), x, _RET_IP_);
2746  	trace_kmem_cache_free(_RET_IP_, x);
2747  }
2748  EXPORT_SYMBOL(kmem_cache_free);
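/*
 * Illustrative usage sketch (not part of the allocator; the cache name
 * and struct foo are made up):
 *
 *	struct kmem_cache *foo_cache = kmem_cache_create("foo",
 *					sizeof(struct foo), 0, 0, NULL);
 *	struct foo *f = kmem_cache_alloc(foo_cache, GFP_KERNEL);
 *	...
 *	kmem_cache_free(foo_cache, f);
 */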
2749  
2750  /* Note that interrupts must be enabled when calling this function. */
2751  void kmem_cache_free_bulk(struct kmem_cache *s, size_t size, void **p)
2752  {
2753  	struct kmem_cache_cpu *c;
2754  	struct page *page;
2755  	int i;
2756  
2757  	local_irq_disable();
2758  	c = this_cpu_ptr(s->cpu_slab);
2759  
2760  	for (i = 0; i < size; i++) {
2761  		void *object = p[i];
2762  
2763  		BUG_ON(!object);
2764  		/* kmem cache debug support */
2765  		s = cache_from_obj(s, object);
2766  		if (unlikely(!s))
2767  			goto exit;
2768  		slab_free_hook(s, object);
2769  
2770  		page = virt_to_head_page(object);
2771  
2772  		if (c->page == page) {
2773  			/* Fastpath: local CPU free */
2774  			set_freepointer(s, object, c->freelist);
2775  			c->freelist = object;
2776  		} else {
2777  			c->tid = next_tid(c->tid);
2778  			local_irq_enable();
2779  			/* Slowpath: overhead locked cmpxchg_double_slab */
2780  			__slab_free(s, page, object, _RET_IP_);
2781  			local_irq_disable();
2782  			c = this_cpu_ptr(s->cpu_slab);
2783  		}
2784  	}
2785  exit:
2786  	c->tid = next_tid(c->tid);
2787  	local_irq_enable();
2788  }
2789  EXPORT_SYMBOL(kmem_cache_free_bulk);
2790  
2791  /* Note that interrupts must be enabled when calling this function. */
2792  bool kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size,
2793  			   void **p)
2794  {
2795  	struct kmem_cache_cpu *c;
2796  	int i;
2797  
2798  	/*
2799  	 * Drain objects in the per cpu slab, while disabling local
2800  	 * IRQs, which protects against PREEMPT and interrupt
2801  	 * handlers invoking the normal fastpath.
2802  	 */
2803  	local_irq_disable();
2804  	c = this_cpu_ptr(s->cpu_slab);
2805  
2806  	for (i = 0; i < size; i++) {
2807  		void *object = c->freelist;
2808  
2809  		if (unlikely(!object)) {
2810  			local_irq_enable();
2811  			/*
2812  			 * Invoking the slow path likely has the side effect
2813  			 * of re-populating the per CPU c->freelist.
2814  			 */
2815  			p[i] = __slab_alloc(s, flags, NUMA_NO_NODE,
2816  					    _RET_IP_, c);
2817  			if (unlikely(!p[i])) {
2818  				__kmem_cache_free_bulk(s, i, p);
2819  				return false;
2820  			}
2821  			local_irq_disable();
2822  			c = this_cpu_ptr(s->cpu_slab);
2823  			continue; /* goto for-loop */
2824  		}
2825  
2826  		/* kmem_cache debug support */
2827  		s = slab_pre_alloc_hook(s, flags);
2828  		if (unlikely(!s)) {
2829  			__kmem_cache_free_bulk(s, i, p);
2830  			c->tid = next_tid(c->tid);
2831  			local_irq_enable();
2832  			return false;
2833  		}
2834  
2835  		c->freelist = get_freepointer(s, object);
2836  		p[i] = object;
2837  
2838  		/* kmem_cache debug support */
2839  		slab_post_alloc_hook(s, flags, object);
2840  	}
2841  	c->tid = next_tid(c->tid);
2842  	local_irq_enable();
2843  
2844  	/* Clear memory outside IRQ disabled fastpath loop */
2845  	if (unlikely(flags & __GFP_ZERO)) {
2846  		int j;
2847  
2848  		for (j = 0; j < i; j++)
2849  			memset(p[j], 0, s->object_size);
2850  	}
2851  
2852  	return true;
2853  }
2854  EXPORT_SYMBOL(kmem_cache_alloc_bulk);
2855  
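/*
 * Illustrative bulk API usage (sketch only; foo_cache as above is made
 * up and error handling beyond the boolean return is up to the caller):
 *
 *	void *objs[16];
 *
 *	if (kmem_cache_alloc_bulk(foo_cache, GFP_KERNEL, 16, objs)) {
 *		...
 *		kmem_cache_free_bulk(foo_cache, 16, objs);
 *	}
 */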
2856  
2857  /*
2858   * Object placement in a slab is made very easy because we always start at
2859   * offset 0. If we tune the size of the object to the alignment then we can
2860   * get the required alignment by putting one properly sized object after
2861   * another.
2862   *
2863   * Notice that the allocation order determines the sizes of the per cpu
2864   * caches. Each processor always has one slab available for allocations.
2865   * Increasing the allocation order reduces the number of times that slabs
2866   * must be moved on and off the partial lists and is therefore a factor in
2867   * locking overhead.
2868   */
2869  
2870  /*
2871   * Minimum / Maximum order of slab pages. This influences locking overhead
2872   * and slab fragmentation. A higher order reduces the number of partial slabs
2873   * and increases the number of allocations possible without having to
2874   * take the list_lock.
2875   */
2876  static int slub_min_order;
2877  static int slub_max_order = PAGE_ALLOC_COSTLY_ORDER;
2878  static int slub_min_objects;
2879  
2880  /*
2881   * Calculate the order of allocation given a slab object size.
2882   *
2883   * The order of allocation has significant impact on performance and other
2884   * system components. Generally order 0 allocations should be preferred since
2885   * order 0 does not cause fragmentation in the page allocator. Larger objects
2886   * can be problematic to put into order 0 slabs because there may be too much
2887   * unused space left. We go to a higher order if more than 1/16th of the slab
2888   * would be wasted.
2889   *
2890   * In order to reach satisfactory performance we must ensure that a minimum
2891   * number of objects is in one slab. Otherwise we may generate too much
2892   * activity on the partial lists which requires taking the list_lock. This is
2893   * less a concern for large slabs though which are rarely used.
2894   *
2895   * slub_max_order specifies the order at which we stop considering the
2896   * number of objects in a slab as critical. If we reach slub_max_order then
2897   * we try to keep the page order as low as possible. So we accept more waste
2898   * of space in favor of a small page order.
2899   *
2900   * Higher order allocations also allow the placement of more objects in a
2901   * slab and thereby reduce object handling overhead. If the user has
2902   * requested a higher minimum order then we start with that one instead of
2903   * the smallest order which will fit the object.
2904   */
2905  static inline int slab_order(int size, int min_objects,
2906  				int max_order, int fract_leftover, int reserved)
2907  {
2908  	int order;
2909  	int rem;
2910  	int min_order = slub_min_order;
2911  
2912  	if (order_objects(min_order, size, reserved) > MAX_OBJS_PER_PAGE)
2913  		return get_order(size * MAX_OBJS_PER_PAGE) - 1;
2914  
2915  	for (order = max(min_order,
2916  				fls(min_objects * size - 1) - PAGE_SHIFT);
2917  			order <= max_order; order++) {
2918  
2919  		unsigned long slab_size = PAGE_SIZE << order;
2920  
2921  		if (slab_size < min_objects * size + reserved)
2922  			continue;
2923  
2924  		rem = (slab_size - reserved) % size;
2925  
2926  		if (rem <= slab_size / fract_leftover)
2927  			break;
2928  
2929  	}
2930  
2931  	return order;
2932  }
2933  
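/*
 * Worked example of the search done by slab_order()/calculate_order()
 * (illustrative only; assumes 4KiB pages and reserved == 0): for a
 * 700 byte object with min_objects == 8, an order-0 slab holds only
 * 5 objects, so the loop starts at order 1. An order-1 slab holds 11
 * objects and wastes 8192 - 11 * 700 = 492 bytes, which is below
 * 8192 / 16, so order 1 is accepted at fract_leftover == 16.
 */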
2934  static inline int calculate_order(int size, int reserved)
2935  {
2936  	int order;
2937  	int min_objects;
2938  	int fraction;
2939  	int max_objects;
2940  
2941  	/*
2942  	 * Attempt to find best configuration for a slab. This
2943  	 * works by first attempting to generate a layout with
2944  	 * the best configuration and backing off gradually.
2945  	 *
2946  	 * First we reduce the acceptable waste in a slab. Then
2947  	 * we reduce the minimum objects required in a slab.
2948  	 */
2949  	min_objects = slub_min_objects;
2950  	if (!min_objects)
2951  		min_objects = 4 * (fls(nr_cpu_ids) + 1);
2952  	max_objects = order_objects(slub_max_order, size, reserved);
2953  	min_objects = min(min_objects, max_objects);
2954  
2955  	while (min_objects > 1) {
2956  		fraction = 16;
2957  		while (fraction >= 4) {
2958  			order = slab_order(size, min_objects,
2959  					slub_max_order, fraction, reserved);
2960  			if (order <= slub_max_order)
2961  				return order;
2962  			fraction /= 2;
2963  		}
2964  		min_objects--;
2965  	}
2966  
2967  	/*
2968  	 * We were unable to place multiple objects in a slab. Now
2969  	 * let's see if we can place a single object there.
2970  	 */
2971  	order = slab_order(size, 1, slub_max_order, 1, reserved);
2972  	if (order <= slub_max_order)
2973  		return order;
2974  
2975  	/*
2976  	 * Doh this slab cannot be placed using slub_max_order.
2977  	 */
2978  	order = slab_order(size, 1, MAX_ORDER, 1, reserved);
2979  	if (order < MAX_ORDER)
2980  		return order;
2981  	return -ENOSYS;
2982  }
2983  
2984  static void
2985  init_kmem_cache_node(struct kmem_cache_node *n)
2986  {
2987  	n->nr_partial = 0;
2988  	spin_lock_init(&n->list_lock);
2989  	INIT_LIST_HEAD(&n->partial);
2990  #ifdef CONFIG_SLUB_DEBUG
2991  	atomic_long_set(&n->nr_slabs, 0);
2992  	atomic_long_set(&n->total_objects, 0);
2993  	INIT_LIST_HEAD(&n->full);
2994  #endif
2995  }
2996  
2997  static inline int alloc_kmem_cache_cpus(struct kmem_cache *s)
2998  {
2999  	BUILD_BUG_ON(PERCPU_DYNAMIC_EARLY_SIZE <
3000  			KMALLOC_SHIFT_HIGH * sizeof(struct kmem_cache_cpu));
3001  
3002  	/*
3003  	 * Must align to double word boundary for the double cmpxchg
3004  	 * instructions to work; see __pcpu_double_call_return_bool().
3005  	 */
3006  	s->cpu_slab = __alloc_percpu(sizeof(struct kmem_cache_cpu),
3007  				     2 * sizeof(void *));
3008  
3009  	if (!s->cpu_slab)
3010  		return 0;
3011  
3012  	init_kmem_cache_cpus(s);
3013  
3014  	return 1;
3015  }
3016  
3017  static struct kmem_cache *kmem_cache_node;
3018  
3019  /*
3020   * No kmalloc_node yet so do it by hand. We know that this is the first
3021   * slab on the node for this slabcache. There are no concurrent accesses
3022   * possible.
3023   *
3024   * Note that this function only works on the kmem_cache_node
3025   * when allocating for the kmem_cache_node. This is used for bootstrapping
3026   * memory on a fresh node that has no slab structures yet.
3027   */
3028  static void early_kmem_cache_node_alloc(int node)
3029  {
3030  	struct page *page;
3031  	struct kmem_cache_node *n;
3032  
3033  	BUG_ON(kmem_cache_node->size < sizeof(struct kmem_cache_node));
3034  
3035  	page = new_slab(kmem_cache_node, GFP_NOWAIT, node);
3036  
3037  	BUG_ON(!page);
3038  	if (page_to_nid(page) != node) {
3039  		pr_err("SLUB: Unable to allocate memory from node %d\n", node);
3040  		pr_err("SLUB: Allocating a useless per node structure in order to be able to continue\n");
3041  	}
3042  
3043  	n = page->freelist;
3044  	BUG_ON(!n);
3045  	page->freelist = get_freepointer(kmem_cache_node, n);
3046  	page->inuse = 1;
3047  	page->frozen = 0;
3048  	kmem_cache_node->node[node] = n;
3049  #ifdef CONFIG_SLUB_DEBUG
3050  	init_object(kmem_cache_node, n, SLUB_RED_ACTIVE);
3051  	init_tracking(kmem_cache_node, n);
3052  #endif
3053  	kasan_kmalloc(kmem_cache_node, n, sizeof(struct kmem_cache_node));
3054  	init_kmem_cache_node(n);
3055  	inc_slabs_node(kmem_cache_node, node, page->objects);
3056  
3057  	/*
3058  	 * No locks need to be taken here as it has just been
3059  	 * initialized and there is no concurrent access.
3060  	 */
3061  	__add_partial(n, page, DEACTIVATE_TO_HEAD);
3062  }
3063  
3064  static void free_kmem_cache_nodes(struct kmem_cache *s)
3065  {
3066  	int node;
3067  	struct kmem_cache_node *n;
3068  
3069  	for_each_kmem_cache_node(s, node, n) {
3070  		kmem_cache_free(kmem_cache_node, n);
3071  		s->node[node] = NULL;
3072  	}
3073  }
3074  
3075  static int init_kmem_cache_nodes(struct kmem_cache *s)
3076  {
3077  	int node;
3078  
3079  	for_each_node_state(node, N_NORMAL_MEMORY) {
3080  		struct kmem_cache_node *n;
3081  
3082  		if (slab_state == DOWN) {
3083  			early_kmem_cache_node_alloc(node);
3084  			continue;
3085  		}
3086  		n = kmem_cache_alloc_node(kmem_cache_node,
3087  						GFP_KERNEL, node);
3088  
3089  		if (!n) {
3090  			free_kmem_cache_nodes(s);
3091  			return 0;
3092  		}
3093  
3094  		s->node[node] = n;
3095  		init_kmem_cache_node(n);
3096  	}
3097  	return 1;
3098  }
3099  
3100  static void set_min_partial(struct kmem_cache *s, unsigned long min)
3101  {
3102  	if (min < MIN_PARTIAL)
3103  		min = MIN_PARTIAL;
3104  	else if (min > MAX_PARTIAL)
3105  		min = MAX_PARTIAL;
3106  	s->min_partial = min;
3107  }
3108  
3109  /*
3110   * calculate_sizes() determines the order and the distribution of data within
3111   * a slab object.
3112   */
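/*
 * Rough sketch of the resulting per object layout (the bracketed parts
 * are only present when the corresponding flags or conditions apply):
 *
 *	object data, rounded up to a word boundary
 *	[red zone word]				SLAB_RED_ZONE
 *	[free pointer at s->offset]		if it may not overlay the object
 *	[2 * struct track]			SLAB_STORE_USER
 *	[red zone padding word]			SLAB_RED_ZONE
 *	padding up to s->align, giving s->size
 */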
3113  static int calculate_sizes(struct kmem_cache *s, int forced_order)
3114  {
3115  	unsigned long flags = s->flags;
3116  	unsigned long size = s->object_size;
3117  	int order;
3118  
3119  	/*
3120  	 * Round up object size to the next word boundary. We can only
3121  	 * place the free pointer at word boundaries and this determines
3122  	 * the possible location of the free pointer.
3123  	 */
3124  	size = ALIGN(size, sizeof(void *));
3125  
3126  #ifdef CONFIG_SLUB_DEBUG
3127  	/*
3128  	 * Determine if we can poison the object itself. If the user of
3129  	 * the slab may touch the object after free or before allocation
3130  	 * then we should never poison the object itself.
3131  	 */
3132  	if ((flags & SLAB_POISON) && !(flags & SLAB_DESTROY_BY_RCU) &&
3133  			!s->ctor)
3134  		s->flags |= __OBJECT_POISON;
3135  	else
3136  		s->flags &= ~__OBJECT_POISON;
3137  
3138  
3139  	/*
3140  	 * If we are Redzoning then check if there is some space between the
3141  	 * end of the object and the free pointer. If not then add an
3142  	 * additional word to have some bytes to store Redzone information.
3143  	 */
3144  	if ((flags & SLAB_RED_ZONE) && size == s->object_size)
3145  		size += sizeof(void *);
3146  #endif
3147  
3148  	/*
3149  	 * With that we have determined the number of bytes in actual use
3150  	 * by the object. This is the potential offset to the free pointer.
3151  	 */
3152  	s->inuse = size;
3153  
3154  	if (((flags & (SLAB_DESTROY_BY_RCU | SLAB_POISON)) ||
3155  		s->ctor)) {
3156  		/*
3157  		 * Relocate free pointer after the object if it is not
3158  		 * permitted to overwrite the first word of the object on
3159  		 * kmem_cache_free.
3160  		 *
3161  		 * This is the case if we do RCU, have a constructor or
3162  		 * destructor or are poisoning the objects.
3163  		 */
3164  		s->offset = size;
3165  		size += sizeof(void *);
3166  	}
3167  
3168  #ifdef CONFIG_SLUB_DEBUG
3169  	if (flags & SLAB_STORE_USER)
3170  		/*
3171  		 * Need to store information about allocs and frees after
3172  		 * the object.
3173  		 */
3174  		size += 2 * sizeof(struct track);
3175  
3176  	if (flags & SLAB_RED_ZONE)
3177  		/*
3178  		 * Add some empty padding so that we can catch
3179  		 * overwrites from earlier objects rather than let
3180  		 * tracking information or the free pointer be
3181  		 * corrupted if a user writes before the start
3182  		 * of the object.
3183  		 */
3184  		size += sizeof(void *);
3185  #endif
3186  
3187  	/*
3188  	 * SLUB stores one object immediately after another beginning from
3189  	 * offset 0. In order to align the objects we have to simply size
3190  	 * each object to conform to the alignment.
3191  	 */
3192  	size = ALIGN(size, s->align);
3193  	s->size = size;
3194  	if (forced_order >= 0)
3195  		order = forced_order;
3196  	else
3197  		order = calculate_order(size, s->reserved);
3198  
3199  	if (order < 0)
3200  		return 0;
3201  
3202  	s->allocflags = 0;
3203  	if (order)
3204  		s->allocflags |= __GFP_COMP;
3205  
3206  	if (s->flags & SLAB_CACHE_DMA)
3207  		s->allocflags |= GFP_DMA;
3208  
3209  	if (s->flags & SLAB_RECLAIM_ACCOUNT)
3210  		s->allocflags |= __GFP_RECLAIMABLE;
3211  
3212  	/*
3213  	 * Determine the number of objects per slab
3214  	 */
3215  	s->oo = oo_make(order, size, s->reserved);
3216  	s->min = oo_make(get_order(size), size, s->reserved);
3217  	if (oo_objects(s->oo) > oo_objects(s->max))
3218  		s->max = s->oo;
3219  
3220  	return !!oo_objects(s->oo);
3221  }
3222  
3223  static int kmem_cache_open(struct kmem_cache *s, unsigned long flags)
3224  {
3225  	s->flags = kmem_cache_flags(s->size, flags, s->name, s->ctor);
3226  	s->reserved = 0;
3227  
3228  	if (need_reserve_slab_rcu && (s->flags & SLAB_DESTROY_BY_RCU))
3229  		s->reserved = sizeof(struct rcu_head);
3230  
3231  	if (!calculate_sizes(s, -1))
3232  		goto error;
3233  	if (disable_higher_order_debug) {
3234  		/*
3235  		 * Disable debugging flags that store metadata if the min slab
3236  		 * order increased.
3237  		 */
3238  		if (get_order(s->size) > get_order(s->object_size)) {
3239  			s->flags &= ~DEBUG_METADATA_FLAGS;
3240  			s->offset = 0;
3241  			if (!calculate_sizes(s, -1))
3242  				goto error;
3243  		}
3244  	}
3245  
3246  #if defined(CONFIG_HAVE_CMPXCHG_DOUBLE) && \
3247      defined(CONFIG_HAVE_ALIGNED_STRUCT_PAGE)
3248  	if (system_has_cmpxchg_double() && (s->flags & SLAB_DEBUG_FLAGS) == 0)
3249  		/* Enable fast mode */
3250  		s->flags |= __CMPXCHG_DOUBLE;
3251  #endif
3252  
3253  	/*
3254  	 * The larger the object size is, the more pages we want on the partial
3255  	 * list to avoid pounding the page allocator excessively.
3256  	 */
3257  	set_min_partial(s, ilog2(s->size) / 2);
3258  
3259  	/*
3260  	 * cpu_partial determines the maximum number of objects kept in the
3261  	 * per cpu partial lists of a processor.
3262  	 *
3263  	 * Per cpu partial lists mainly contain slabs that just have one
3264  	 * object freed. If they are used for allocation then they can be
3265  	 * filled up again with minimal effort. The slab will never hit the
3266  	 * per node partial lists and therefore no locking will be required.
3267  	 *
3268  	 * This setting also determines
3269  	 *
3270  	 * A) The number of objects from per cpu partial slabs dumped to the
3271  	 *    per node list when we reach the limit.
3272  	 * B) The number of objects in cpu partial slabs to extract from the
3273  	 *    per node list when we run out of per cpu objects. We only fetch
3274  	 *    50% to keep some capacity around for frees.
3275  	 */
3276  	if (!kmem_cache_has_cpu_partial(s))
3277  		s->cpu_partial = 0;
3278  	else if (s->size >= PAGE_SIZE)
3279  		s->cpu_partial = 2;
3280  	else if (s->size >= 1024)
3281  		s->cpu_partial = 6;
3282  	else if (s->size >= 256)
3283  		s->cpu_partial = 13;
3284  	else
3285  		s->cpu_partial = 30;
3286  
3287  #ifdef CONFIG_NUMA
3288  	s->remote_node_defrag_ratio = 1000;
3289  #endif
3290  	if (!init_kmem_cache_nodes(s))
3291  		goto error;
3292  
3293  	if (alloc_kmem_cache_cpus(s))
3294  		return 0;
3295  
3296  	free_kmem_cache_nodes(s);
3297  error:
3298  	if (flags & SLAB_PANIC)
3299  		panic("Cannot create slab %s size=%lu realsize=%u "
3300  			"order=%u offset=%u flags=%lx\n",
3301  			s->name, (unsigned long)s->size, s->size,
3302  			oo_order(s->oo), s->offset, flags);
3303  	return -EINVAL;
3304  }
3305  
3306  static void list_slab_objects(struct kmem_cache *s, struct page *page,
3307  							const char *text)
3308  {
3309  #ifdef CONFIG_SLUB_DEBUG
3310  	void *addr = page_address(page);
3311  	void *p;
3312  	unsigned long *map = kzalloc(BITS_TO_LONGS(page->objects) *
3313  				     sizeof(long), GFP_ATOMIC);
3314  	if (!map)
3315  		return;
3316  	slab_err(s, page, text, s->name);
3317  	slab_lock(page);
3318  
3319  	get_map(s, page, map);
3320  	for_each_object(p, s, addr, page->objects) {
3321  
3322  		if (!test_bit(slab_index(p, s, addr), map)) {
3323  			pr_err("INFO: Object 0x%p @offset=%tu\n", p, p - addr);
3324  			print_tracking(s, p);
3325  		}
3326  	}
3327  	slab_unlock(page);
3328  	kfree(map);
3329  #endif
3330  }
3331  
3332  /*
3333   * Attempt to free all partial slabs on a node.
3334   * This is called from kmem_cache_close(). We must be the last thread
3335   * using the cache and therefore we do not need to lock anymore.
3336   */
3337  static void free_partial(struct kmem_cache *s, struct kmem_cache_node *n)
3338  {
3339  	struct page *page, *h;
3340  
3341  	list_for_each_entry_safe(page, h, &n->partial, lru) {
3342  		if (!page->inuse) {
3343  			__remove_partial(n, page);
3344  			discard_slab(s, page);
3345  		} else {
3346  			list_slab_objects(s, page,
3347  			"Objects remaining in %s on kmem_cache_close()");
3348  		}
3349  	}
3350  }
3351  
3352  /*
3353   * Release all resources used by a slab cache.
3354   */
3355  static inline int kmem_cache_close(struct kmem_cache *s)
3356  {
3357  	int node;
3358  	struct kmem_cache_node *n;
3359  
3360  	flush_all(s);
3361  	/* Attempt to free all objects */
3362  	for_each_kmem_cache_node(s, node, n) {
3363  		free_partial(s, n);
3364  		if (n->nr_partial || slabs_node(s, node))
3365  			return 1;
3366  	}
3367  	free_percpu(s->cpu_slab);
3368  	free_kmem_cache_nodes(s);
3369  	return 0;
3370  }
3371  
3372  int __kmem_cache_shutdown(struct kmem_cache *s)
3373  {
3374  	return kmem_cache_close(s);
3375  }
3376  
3377  /********************************************************************
3378   *		Kmalloc subsystem
3379   *******************************************************************/
3380  
3381  static int __init setup_slub_min_order(char *str)
3382  {
3383  	get_option(&str, &slub_min_order);
3384  
3385  	return 1;
3386  }
3387  
3388  __setup("slub_min_order=", setup_slub_min_order);
3389  
3390  static int __init setup_slub_max_order(char *str)
3391  {
3392  	get_option(&str, &slub_max_order);
3393  	slub_max_order = min(slub_max_order, MAX_ORDER - 1);
3394  
3395  	return 1;
3396  }
3397  
3398  __setup("slub_max_order=", setup_slub_max_order);
3399  
3400  static int __init setup_slub_min_objects(char *str)
3401  {
3402  	get_option(&str, &slub_min_objects);
3403  
3404  	return 1;
3405  }
3406  
3407  __setup("slub_min_objects=", setup_slub_min_objects);
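/*
 * Example: booting with "slub_min_order=3 slub_max_order=4
 * slub_min_objects=16" on the kernel command line forces at least
 * order-3 slabs, caps the order search at 4 and aims for at least
 * 16 objects per slab (all still subject to the limits applied in
 * calculate_order()).
 */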
3408  
3409  void *__kmalloc(size_t size, gfp_t flags)
3410  {
3411  	struct kmem_cache *s;
3412  	void *ret;
3413  
3414  	if (unlikely(size > KMALLOC_MAX_CACHE_SIZE))
3415  		return kmalloc_large(size, flags);
3416  
3417  	s = kmalloc_slab(size, flags);
3418  
3419  	if (unlikely(ZERO_OR_NULL_PTR(s)))
3420  		return s;
3421  
3422  	ret = slab_alloc(s, flags, _RET_IP_);
3423  
3424  	trace_kmalloc(_RET_IP_, ret, size, s->size, flags);
3425  
3426  	kasan_kmalloc(s, ret, size);
3427  
3428  	return ret;
3429  }
3430  EXPORT_SYMBOL(__kmalloc);
3431  
3432  #ifdef CONFIG_NUMA
3433  static void *kmalloc_large_node(size_t size, gfp_t flags, int node)
3434  {
3435  	struct page *page;
3436  	void *ptr = NULL;
3437  
3438  	flags |= __GFP_COMP | __GFP_NOTRACK;
3439  	page = alloc_kmem_pages_node(node, flags, get_order(size));
3440  	if (page)
3441  		ptr = page_address(page);
3442  
3443  	kmalloc_large_node_hook(ptr, size, flags);
3444  	return ptr;
3445  }
3446  
3447  void *__kmalloc_node(size_t size, gfp_t flags, int node)
3448  {
3449  	struct kmem_cache *s;
3450  	void *ret;
3451  
3452  	if (unlikely(size > KMALLOC_MAX_CACHE_SIZE)) {
3453  		ret = kmalloc_large_node(size, flags, node);
3454  
3455  		trace_kmalloc_node(_RET_IP_, ret,
3456  				   size, PAGE_SIZE << get_order(size),
3457  				   flags, node);
3458  
3459  		return ret;
3460  	}
3461  
3462  	s = kmalloc_slab(size, flags);
3463  
3464  	if (unlikely(ZERO_OR_NULL_PTR(s)))
3465  		return s;
3466  
3467  	ret = slab_alloc_node(s, flags, node, _RET_IP_);
3468  
3469  	trace_kmalloc_node(_RET_IP_, ret, size, s->size, flags, node);
3470  
3471  	kasan_kmalloc(s, ret, size);
3472  
3473  	return ret;
3474  }
3475  EXPORT_SYMBOL(__kmalloc_node);
3476  #endif
3477  
3478  static size_t __ksize(const void *object)
3479  {
3480  	struct page *page;
3481  
3482  	if (unlikely(object == ZERO_SIZE_PTR))
3483  		return 0;
3484  
3485  	page = virt_to_head_page(object);
3486  
3487  	if (unlikely(!PageSlab(page))) {
3488  		WARN_ON(!PageCompound(page));
3489  		return PAGE_SIZE << compound_order(page);
3490  	}
3491  
3492  	return slab_ksize(page->slab_cache);
3493  }
3494  
3495  size_t ksize(const void *object)
3496  {
3497  	size_t size = __ksize(object);
3498  	/* We assume that ksize callers could use the whole allocated area,
3499  	   so we need to unpoison this area. */
3500  	kasan_krealloc(object, size);
3501  	return size;
3502  }
3503  EXPORT_SYMBOL(ksize);
3504  
3505  void kfree(const void *x)
3506  {
3507  	struct page *page;
3508  	void *object = (void *)x;
3509  
3510  	trace_kfree(_RET_IP_, x);
3511  
3512  	if (unlikely(ZERO_OR_NULL_PTR(x)))
3513  		return;
3514  
3515  	page = virt_to_head_page(x);
3516  	if (unlikely(!PageSlab(page))) {
3517  		BUG_ON(!PageCompound(page));
3518  		kfree_hook(x);
3519  		__free_kmem_pages(page, compound_order(page));
3520  		return;
3521  	}
3522  	slab_free(page->slab_cache, page, object, _RET_IP_);
3523  }
3524  EXPORT_SYMBOL(kfree);
3525  
3526  #define SHRINK_PROMOTE_MAX 32
3527  
3528  /*
3529   * kmem_cache_shrink discards empty slabs and promotes the slabs filled
3530   * up most to the head of the partial lists. New allocations will then
3531   * fill those up and thus they can be removed from the partial lists.
3532   *
3533   * The slabs with the least items are placed last. This results in them
3534   * being allocated from last, increasing the chance that the last objects
3535   * are freed in them.
3536   */
3537  int __kmem_cache_shrink(struct kmem_cache *s, bool deactivate)
3538  {
3539  	int node;
3540  	int i;
3541  	struct kmem_cache_node *n;
3542  	struct page *page;
3543  	struct page *t;
3544  	struct list_head discard;
3545  	struct list_head promote[SHRINK_PROMOTE_MAX];
3546  	unsigned long flags;
3547  	int ret = 0;
3548  
3549  	if (deactivate) {
3550  		/*
3551  		 * Disable empty slabs caching. Used to avoid pinning offline
3552  		 * memory cgroups by kmem pages that can be freed.
3553  		 */
3554  		s->cpu_partial = 0;
3555  		s->min_partial = 0;
3556  
3557  		/*
3558  		 * s->cpu_partial is checked locklessly (see put_cpu_partial),
3559  		 * so we have to make sure the change is visible.
3560  		 */
3561  		kick_all_cpus_sync();
3562  	}
3563  
3564  	flush_all(s);
3565  	for_each_kmem_cache_node(s, node, n) {
3566  		INIT_LIST_HEAD(&discard);
3567  		for (i = 0; i < SHRINK_PROMOTE_MAX; i++)
3568  			INIT_LIST_HEAD(promote + i);
3569  
3570  		spin_lock_irqsave(&n->list_lock, flags);
3571  
3572  		/*
3573  		 * Build lists of slabs to discard or promote.
3574  		 *
3575  		 * Note that concurrent frees may occur while we hold the
3576  		 * list_lock. page->inuse here is the upper limit.
3577  		 */
3578  		list_for_each_entry_safe(page, t, &n->partial, lru) {
3579  			int free = page->objects - page->inuse;
3580  
3581  			/* Do not reread page->inuse */
3582  			barrier();
3583  
3584  			/* We do not keep full slabs on the list */
3585  			BUG_ON(free <= 0);
3586  
3587  			if (free == page->objects) {
3588  				list_move(&page->lru, &discard);
3589  				n->nr_partial--;
3590  			} else if (free <= SHRINK_PROMOTE_MAX)
3591  				list_move(&page->lru, promote + free - 1);
3592  		}
3593  
3594  		/*
3595  		 * Promote the slabs filled up most to the head of the
3596  		 * partial list.
3597  		 */
3598  		for (i = SHRINK_PROMOTE_MAX - 1; i >= 0; i--)
3599  			list_splice(promote + i, &n->partial);
3600  
3601  		spin_unlock_irqrestore(&n->list_lock, flags);
3602  
3603  		/* Release empty slabs */
3604  		list_for_each_entry_safe(page, t, &discard, lru)
3605  			discard_slab(s, page);
3606  
3607  		if (slabs_node(s, node))
3608  			ret = 1;
3609  	}
3610  
3611  	return ret;
3612  }
3613  
3614  static int slab_mem_going_offline_callback(void *arg)
3615  {
3616  	struct kmem_cache *s;
3617  
3618  	mutex_lock(&slab_mutex);
3619  	list_for_each_entry(s, &slab_caches, list)
3620  		__kmem_cache_shrink(s, false);
3621  	mutex_unlock(&slab_mutex);
3622  
3623  	return 0;
3624  }
3625  
3626  static void slab_mem_offline_callback(void *arg)
3627  {
3628  	struct kmem_cache_node *n;
3629  	struct kmem_cache *s;
3630  	struct memory_notify *marg = arg;
3631  	int offline_node;
3632  
3633  	offline_node = marg->status_change_nid_normal;
3634  
3635  	/*
3636  	 * If the node still has available memory, we still need the
3637  	 * kmem_cache_node structure for it.
3638  	 */
3639  	if (offline_node < 0)
3640  		return;
3641  
3642  	mutex_lock(&slab_mutex);
3643  	list_for_each_entry(s, &slab_caches, list) {
3644  		n = get_node(s, offline_node);
3645  		if (n) {
3646  			/*
3647  			 * if n->nr_slabs > 0, slabs still exist on the node
3648  			 * that is going down. We were unable to free them,
3649  			 * and offline_pages() function shouldn't call this
3650  			 * callback. So, we must fail.
3651  			 */
3652  			BUG_ON(slabs_node(s, offline_node));
3653  
3654  			s->node[offline_node] = NULL;
3655  			kmem_cache_free(kmem_cache_node, n);
3656  		}
3657  	}
3658  	mutex_unlock(&slab_mutex);
3659  }
3660  
3661  static int slab_mem_going_online_callback(void *arg)
3662  {
3663  	struct kmem_cache_node *n;
3664  	struct kmem_cache *s;
3665  	struct memory_notify *marg = arg;
3666  	int nid = marg->status_change_nid_normal;
3667  	int ret = 0;
3668  
3669  	/*
3670  	 * If the node's memory is already available, then kmem_cache_node is
3671  	 * already created. Nothing to do.
3672  	 */
3673  	if (nid < 0)
3674  		return 0;
3675  
3676  	/*
3677  	 * We are bringing a node online. No memory is available yet. We must
3678  	 * allocate a kmem_cache_node structure in order to bring the node
3679  	 * online.
3680  	 */
3681  	mutex_lock(&slab_mutex);
3682  	list_for_each_entry(s, &slab_caches, list) {
3683  		/*
3684  		 * XXX: kmem_cache_alloc_node() will fall back to other nodes
3685  		 *      since no memory is available yet on the node that is
3686  		 *      being brought up.
3687  		 */
3688  		n = kmem_cache_alloc(kmem_cache_node, GFP_KERNEL);
3689  		if (!n) {
3690  			ret = -ENOMEM;
3691  			goto out;
3692  		}
3693  		init_kmem_cache_node(n);
3694  		s->node[nid] = n;
3695  	}
3696  out:
3697  	mutex_unlock(&slab_mutex);
3698  	return ret;
3699  }
3700  
3701  static int slab_memory_callback(struct notifier_block *self,
3702  				unsigned long action, void *arg)
3703  {
3704  	int ret = 0;
3705  
3706  	switch (action) {
3707  	case MEM_GOING_ONLINE:
3708  		ret = slab_mem_going_online_callback(arg);
3709  		break;
3710  	case MEM_GOING_OFFLINE:
3711  		ret = slab_mem_going_offline_callback(arg);
3712  		break;
3713  	case MEM_OFFLINE:
3714  	case MEM_CANCEL_ONLINE:
3715  		slab_mem_offline_callback(arg);
3716  		break;
3717  	case MEM_ONLINE:
3718  	case MEM_CANCEL_OFFLINE:
3719  		break;
3720  	}
3721  	if (ret)
3722  		ret = notifier_from_errno(ret);
3723  	else
3724  		ret = NOTIFY_OK;
3725  	return ret;
3726  }
3727  
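/*
 * Registered from kmem_cache_init() via register_hotmemory_notifier() so
 * that the per node kmem_cache_node structures follow memory hotplug.
 */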
3728  static struct notifier_block slab_memory_callback_nb = {
3729  	.notifier_call = slab_memory_callback,
3730  	.priority = SLAB_CALLBACK_PRI,
3731  };
3732  
3733  /********************************************************************
3734   *			Basic setup of slabs
3735   *******************************************************************/
3736  
3737  /*
3738   * Used for early kmem_cache structures that were allocated using
3739   * the page allocator. Allocate them properly then fix up the pointers
3740   * that may be pointing to the wrong kmem_cache structure.
3741   */
3742  
3743  static struct kmem_cache * __init bootstrap(struct kmem_cache *static_cache)
3744  {
3745  	int node;
3746  	struct kmem_cache *s = kmem_cache_zalloc(kmem_cache, GFP_NOWAIT);
3747  	struct kmem_cache_node *n;
3748  
3749  	memcpy(s, static_cache, kmem_cache->object_size);
3750  
3751  	/*
3752  	 * This runs very early, and only the boot processor is supposed to be
3753  	 * up.  Even if that were not the case, IRQs are not yet enabled so
3754  	 * we could not fire IPIs around anyway.
3755  	 */
3756  	__flush_cpu_slab(s, smp_processor_id());
3757  	for_each_kmem_cache_node(s, node, n) {
3758  		struct page *p;
3759  
3760  		list_for_each_entry(p, &n->partial, lru)
3761  			p->slab_cache = s;
3762  
3763  #ifdef CONFIG_SLUB_DEBUG
3764  		list_for_each_entry(p, &n->full, lru)
3765  			p->slab_cache = s;
3766  #endif
3767  	}
3768  	slab_init_memcg_params(s);
3769  	list_add(&s->list, &slab_caches);
3770  	return s;
3771  }
3772  
3773  void __init kmem_cache_init(void)
3774  {
3775  	static __initdata struct kmem_cache boot_kmem_cache,
3776  		boot_kmem_cache_node;
3777  
3778  	if (debug_guardpage_minorder())
3779  		slub_max_order = 0;
3780  
3781  	kmem_cache_node = &boot_kmem_cache_node;
3782  	kmem_cache = &boot_kmem_cache;
3783  
3784  	create_boot_cache(kmem_cache_node, "kmem_cache_node",
3785  		sizeof(struct kmem_cache_node), SLAB_HWCACHE_ALIGN);
3786  
3787  	register_hotmemory_notifier(&slab_memory_callback_nb);
3788  
3789  	/* Able to allocate the per node structures */
3790  	slab_state = PARTIAL;
3791  
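	/*
	 * struct kmem_cache ends in a node[] array; size the boot cache so
	 * that it can hold one kmem_cache_node pointer per possible node.
	 */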
3792  	create_boot_cache(kmem_cache, "kmem_cache",
3793  			offsetof(struct kmem_cache, node) +
3794  				nr_node_ids * sizeof(struct kmem_cache_node *),
3795  		       SLAB_HWCACHE_ALIGN);
3796  
3797  	kmem_cache = bootstrap(&boot_kmem_cache);
3798  
3799  	/*
3800  	 * Allocate kmem_cache_node properly from the kmem_cache slab.
3801  	 * kmem_cache_node is separately allocated so no need to
3802  	 * update any list pointers.
3803  	 */
3804  	kmem_cache_node = bootstrap(&boot_kmem_cache_node);
3805  
3806  	/* Now we can use the kmem_cache to allocate kmalloc slabs */
3807  	setup_kmalloc_cache_index_table();
3808  	create_kmalloc_caches(0);
3809  
3810  #ifdef CONFIG_SMP
3811  	register_cpu_notifier(&slab_notifier);
3812  #endif
3813  
3814  	pr_info("SLUB: HWalign=%d, Order=%d-%d, MinObjects=%d, CPUs=%d, Nodes=%d\n",
3815  		cache_line_size(),
3816  		slub_min_order, slub_max_order, slub_min_objects,
3817  		nr_cpu_ids, nr_node_ids);
3818  }
3819  
3820  void __init kmem_cache_init_late(void)
3821  {
3822  }
3823  
3824  struct kmem_cache *
3825  __kmem_cache_alias(const char *name, size_t size, size_t align,
3826  		   unsigned long flags, void (*ctor)(void *))
3827  {
3828  	struct kmem_cache *s, *c;
3829  
3830  	s = find_mergeable(size, align, flags, name, ctor);
3831  	if (s) {
3832  		s->refcount++;
3833  
3834  		/*
3835  		 * Adjust the object sizes so that we clear
3836  		 * the complete object on kzalloc.
3837  		 */
3838  		s->object_size = max(s->object_size, (int)size);
3839  		s->inuse = max_t(int, s->inuse, ALIGN(size, sizeof(void *)));
3840  
3841  		for_each_memcg_cache(c, s) {
3842  			c->object_size = s->object_size;
3843  			c->inuse = max_t(int, c->inuse,
3844  					 ALIGN(size, sizeof(void *)));
3845  		}
3846  
3847  		if (sysfs_slab_alias(s, name)) {
3848  			s->refcount--;
3849  			s = NULL;
3850  		}
3851  	}
3852  
3853  	return s;
3854  }
3855  
3856  int __kmem_cache_create(struct kmem_cache *s, unsigned long flags)
3857  {
3858  	int err;
3859  
3860  	err = kmem_cache_open(s, flags);
3861  	if (err)
3862  		return err;
3863  
3864  	/* Mutex is not taken during early boot */
3865  	if (slab_state <= UP)
3866  		return 0;
3867  
3868  	memcg_propagate_slab_attrs(s);
3869  	err = sysfs_slab_add(s);
3870  	if (err)
3871  		kmem_cache_close(s);
3872  
3873  	return err;
3874  }
3875  
3876  #ifdef CONFIG_SMP
3877  /*
3878   * Use the cpu notifier to ensure that the cpu slabs are flushed when
3879   * necessary.
3880   */
3881  static int slab_cpuup_callback(struct notifier_block *nfb,
3882  		unsigned long action, void *hcpu)
3883  {
3884  	long cpu = (long)hcpu;
3885  	struct kmem_cache *s;
3886  	unsigned long flags;
3887  
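	/*
	 * A CPU that is going away can no longer drain its own cpu slab and
	 * per cpu partial list, so flush them back to the node lists here.
	 */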
3888  	switch (action) {
3889  	case CPU_UP_CANCELED:
3890  	case CPU_UP_CANCELED_FROZEN:
3891  	case CPU_DEAD:
3892  	case CPU_DEAD_FROZEN:
3893  		mutex_lock(&slab_mutex);
3894  		list_for_each_entry(s, &slab_caches, list) {
3895  			local_irq_save(flags);
3896  			__flush_cpu_slab(s, cpu);
3897  			local_irq_restore(flags);
3898  		}
3899  		mutex_unlock(&slab_mutex);
3900  		break;
3901  	default:
3902  		break;
3903  	}
3904  	return NOTIFY_OK;
3905  }
3906  
3907  static struct notifier_block slab_notifier = {
3908  	.notifier_call = slab_cpuup_callback
3909  };
3910  
3911  #endif
3912  
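/*
 * kmalloc variant that attributes the allocation to an explicit call site,
 * so that wrappers such as kstrdup() report their own caller in the
 * tracepoints and in the SLAB_STORE_USER tracking data.
 */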
3913  void *__kmalloc_track_caller(size_t size, gfp_t gfpflags, unsigned long caller)
3914  {
3915  	struct kmem_cache *s;
3916  	void *ret;
3917  
3918  	if (unlikely(size > KMALLOC_MAX_CACHE_SIZE))
3919  		return kmalloc_large(size, gfpflags);
3920  
3921  	s = kmalloc_slab(size, gfpflags);
3922  
3923  	if (unlikely(ZERO_OR_NULL_PTR(s)))
3924  		return s;
3925  
3926  	ret = slab_alloc(s, gfpflags, caller);
3927  
3928  	/* Honor the call site pointer we received. */
3929  	trace_kmalloc(caller, ret, size, s->size, gfpflags);
3930  
3931  	return ret;
3932  }
3933  
3934  #ifdef CONFIG_NUMA
3935  void *__kmalloc_node_track_caller(size_t size, gfp_t gfpflags,
3936  					int node, unsigned long caller)
3937  {
3938  	struct kmem_cache *s;
3939  	void *ret;
3940  
3941  	if (unlikely(size > KMALLOC_MAX_CACHE_SIZE)) {
3942  		ret = kmalloc_large_node(size, gfpflags, node);
3943  
3944  		trace_kmalloc_node(caller, ret,
3945  				   size, PAGE_SIZE << get_order(size),
3946  				   gfpflags, node);
3947  
3948  		return ret;
3949  	}
3950  
3951  	s = kmalloc_slab(size, gfpflags);
3952  
3953  	if (unlikely(ZERO_OR_NULL_PTR(s)))
3954  		return s;
3955  
3956  	ret = slab_alloc_node(s, gfpflags, node, caller);
3957  
3958  	/* Honor the call site pointer we received. */
3959  	trace_kmalloc_node(caller, ret, size, s->size, gfpflags, node);
3960  
3961  	return ret;
3962  }
3963  #endif
3964  
3965  #ifdef CONFIG_SYSFS
3966  static int count_inuse(struct page *page)
3967  {
3968  	return page->inuse;
3969  }
3970  
3971  static int count_total(struct page *page)
3972  {
3973  	return page->objects;
3974  }
3975  #endif
3976  
3977  #ifdef CONFIG_SLUB_DEBUG
3978  static int validate_slab(struct kmem_cache *s, struct page *page,
3979  						unsigned long *map)
3980  {
3981  	void *p;
3982  	void *addr = page_address(page);
3983  
3984  	if (!check_slab(s, page) ||
3985  			!on_freelist(s, page, NULL))
3986  		return 0;
3987  
3988  	/* Now we know that a valid freelist exists */
3989  	bitmap_zero(map, page->objects);
3990  
3991  	get_map(s, page, map);
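	/*
	 * get_map() sets a bit for every object that sits on the freelist:
	 * set bits must therefore carry the inactive red zone, clear bits
	 * (allocated objects) the active one.
	 */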
3992  	for_each_object(p, s, addr, page->objects) {
3993  		if (test_bit(slab_index(p, s, addr), map))
3994  			if (!check_object(s, page, p, SLUB_RED_INACTIVE))
3995  				return 0;
3996  	}
3997  
3998  	for_each_object(p, s, addr, page->objects)
3999  		if (!test_bit(slab_index(p, s, addr), map))
4000  			if (!check_object(s, page, p, SLUB_RED_ACTIVE))
4001  				return 0;
4002  	return 1;
4003  }
4004  
4005  static void validate_slab_slab(struct kmem_cache *s, struct page *page,
4006  						unsigned long *map)
4007  {
4008  	slab_lock(page);
4009  	validate_slab(s, page, map);
4010  	slab_unlock(page);
4011  }
4012  
4013  static int validate_slab_node(struct kmem_cache *s,
4014  		struct kmem_cache_node *n, unsigned long *map)
4015  {
4016  	unsigned long count = 0;
4017  	struct page *page;
4018  	unsigned long flags;
4019  
4020  	spin_lock_irqsave(&n->list_lock, flags);
4021  
4022  	list_for_each_entry(page, &n->partial, lru) {
4023  		validate_slab_slab(s, page, map);
4024  		count++;
4025  	}
4026  	if (count != n->nr_partial)
4027  		pr_err("SLUB %s: %ld partial slabs counted but counter=%ld\n",
4028  		       s->name, count, n->nr_partial);
4029  
4030  	if (!(s->flags & SLAB_STORE_USER))
4031  		goto out;
4032  
4033  	list_for_each_entry(page, &n->full, lru) {
4034  		validate_slab_slab(s, page, map);
4035  		count++;
4036  	}
4037  	if (count != atomic_long_read(&n->nr_slabs))
4038  		pr_err("SLUB: %s %ld slabs counted but counter=%ld\n",
4039  		       s->name, count, atomic_long_read(&n->nr_slabs));
4040  
4041  out:
4042  	spin_unlock_irqrestore(&n->list_lock, flags);
4043  	return count;
4044  }
4045  
4046  static long validate_slab_cache(struct kmem_cache *s)
4047  {
4048  	int node;
4049  	unsigned long count = 0;
4050  	unsigned long *map = kmalloc(BITS_TO_LONGS(oo_objects(s->max)) *
4051  				sizeof(unsigned long), GFP_KERNEL);
4052  	struct kmem_cache_node *n;
4053  
4054  	if (!map)
4055  		return -ENOMEM;
4056  
4057  	flush_all(s);
4058  	for_each_kmem_cache_node(s, node, n)
4059  		count += validate_slab_node(s, n, map);
4060  	kfree(map);
4061  	return count;
4062  }
4063  /*
4064   * Generate lists of code addresses where slabcache objects are allocated
4065   * and freed.
4066   */
4067  
4068  struct location {
4069  	unsigned long count;
4070  	unsigned long addr;
4071  	long long sum_time;
4072  	long min_time;
4073  	long max_time;
4074  	long min_pid;
4075  	long max_pid;
4076  	DECLARE_BITMAP(cpus, NR_CPUS);
4077  	nodemask_t nodes;
4078  };
4079  
4080  struct loc_track {
4081  	unsigned long max;
4082  	unsigned long count;
4083  	struct location *loc;
4084  };
4085  
4086  static void free_loc_track(struct loc_track *t)
4087  {
4088  	if (t->max)
4089  		free_pages((unsigned long)t->loc,
4090  			get_order(sizeof(struct location) * t->max));
4091  }
4092  
4093  static int alloc_loc_track(struct loc_track *t, unsigned long max, gfp_t flags)
4094  {
4095  	struct location *l;
4096  	int order;
4097  
4098  	order = get_order(sizeof(struct location) * max);
4099  
4100  	l = (void *)__get_free_pages(flags, order);
4101  	if (!l)
4102  		return 0;
4103  
4104  	if (t->count) {
4105  		memcpy(l, t->loc, sizeof(struct location) * t->count);
4106  		free_loc_track(t);
4107  	}
4108  	t->max = max;
4109  	t->loc = l;
4110  	return 1;
4111  }
4112  
4113  static int add_location(struct loc_track *t, struct kmem_cache *s,
4114  				const struct track *track)
4115  {
4116  	long start, end, pos;
4117  	struct location *l;
4118  	unsigned long caddr;
4119  	unsigned long age = jiffies - track->when;
4120  
4121  	start = -1;
4122  	end = t->count;
4123  
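	/*
	 * t->loc[] is kept sorted by call site address. Binary search for a
	 * matching entry; if there is none, "pos" ends up at the slot where
	 * a new entry has to be inserted.
	 */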
4124  	for ( ; ; ) {
4125  		pos = start + (end - start + 1) / 2;
4126  
4127  		/*
4128  		 * There is nothing at "end". If we end up there
4129  		 * we need to insert the new element before "end".
4130  		 */
4131  		if (pos == end)
4132  			break;
4133  
4134  		caddr = t->loc[pos].addr;
4135  		if (track->addr == caddr) {
4136  
4137  			l = &t->loc[pos];
4138  			l->count++;
4139  			if (track->when) {
4140  				l->sum_time += age;
4141  				if (age < l->min_time)
4142  					l->min_time = age;
4143  				if (age > l->max_time)
4144  					l->max_time = age;
4145  
4146  				if (track->pid < l->min_pid)
4147  					l->min_pid = track->pid;
4148  				if (track->pid > l->max_pid)
4149  					l->max_pid = track->pid;
4150  
4151  				cpumask_set_cpu(track->cpu,
4152  						to_cpumask(l->cpus));
4153  			}
4154  			node_set(page_to_nid(virt_to_page(track)), l->nodes);
4155  			return 1;
4156  		}
4157  
4158  		if (track->addr < caddr)
4159  			end = pos;
4160  		else
4161  			start = pos;
4162  	}
4163  
4164  	/*
4165  	 * Not found. Insert new tracking element.
4166  	 */
4167  	if (t->count >= t->max && !alloc_loc_track(t, 2 * t->max, GFP_ATOMIC))
4168  		return 0;
4169  
4170  	l = t->loc + pos;
4171  	if (pos < t->count)
4172  		memmove(l + 1, l,
4173  			(t->count - pos) * sizeof(struct location));
4174  	t->count++;
4175  	l->count = 1;
4176  	l->addr = track->addr;
4177  	l->sum_time = age;
4178  	l->min_time = age;
4179  	l->max_time = age;
4180  	l->min_pid = track->pid;
4181  	l->max_pid = track->pid;
4182  	cpumask_clear(to_cpumask(l->cpus));
4183  	cpumask_set_cpu(track->cpu, to_cpumask(l->cpus));
4184  	nodes_clear(l->nodes);
4185  	node_set(page_to_nid(virt_to_page(track)), l->nodes);
4186  	return 1;
4187  }
4188  
4189  static void process_slab(struct loc_track *t, struct kmem_cache *s,
4190  		struct page *page, enum track_item alloc,
4191  		unsigned long *map)
4192  {
4193  	void *addr = page_address(page);
4194  	void *p;
4195  
4196  	bitmap_zero(map, page->objects);
4197  	get_map(s, page, map);
4198  
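	/* Only currently allocated objects (bit clear in map) are recorded. */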
4199  	for_each_object(p, s, addr, page->objects)
4200  		if (!test_bit(slab_index(p, s, addr), map))
4201  			add_location(t, s, get_track(s, p, alloc));
4202  }
4203  
4204  static int list_locations(struct kmem_cache *s, char *buf,
4205  					enum track_item alloc)
4206  {
4207  	int len = 0;
4208  	unsigned long i;
4209  	struct loc_track t = { 0, 0, NULL };
4210  	int node;
4211  	unsigned long *map = kmalloc(BITS_TO_LONGS(oo_objects(s->max)) *
4212  				     sizeof(unsigned long), GFP_KERNEL);
4213  	struct kmem_cache_node *n;
4214  
4215  	if (!map || !alloc_loc_track(&t, PAGE_SIZE / sizeof(struct location),
4216  				     GFP_TEMPORARY)) {
4217  		kfree(map);
4218  		return sprintf(buf, "Out of memory\n");
4219  	}
4220  	/* Push back cpu slabs */
4221  	flush_all(s);
4222  
4223  	for_each_kmem_cache_node(s, node, n) {
4224  		unsigned long flags;
4225  		struct page *page;
4226  
4227  		if (!atomic_long_read(&n->nr_slabs))
4228  			continue;
4229  
4230  		spin_lock_irqsave(&n->list_lock, flags);
4231  		list_for_each_entry(page, &n->partial, lru)
4232  			process_slab(&t, s, page, alloc, map);
4233  		list_for_each_entry(page, &n->full, lru)
4234  			process_slab(&t, s, page, alloc, map);
4235  		spin_unlock_irqrestore(&n->list_lock, flags);
4236  	}
4237  
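	/*
	 * Each line of output has roughly the form
	 *	<count> <call site> age=<min>/<avg>/<max> pid=<min>-<max>
	 *		cpus=<list> nodes=<list>
	 * where fields collapse to single values or are omitted when they
	 * would carry no extra information.
	 */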
4238  	for (i = 0; i < t.count; i++) {
4239  		struct location *l = &t.loc[i];
4240  
4241  		if (len > PAGE_SIZE - KSYM_SYMBOL_LEN - 100)
4242  			break;
4243  		len += sprintf(buf + len, "%7ld ", l->count);
4244  
4245  		if (l->addr)
4246  			len += sprintf(buf + len, "%pS", (void *)l->addr);
4247  		else
4248  			len += sprintf(buf + len, "<not-available>");
4249  
4250  		if (l->sum_time != l->min_time) {
4251  			len += sprintf(buf + len, " age=%ld/%ld/%ld",
4252  				l->min_time,
4253  				(long)div_u64(l->sum_time, l->count),
4254  				l->max_time);
4255  		} else
4256  			len += sprintf(buf + len, " age=%ld",
4257  				l->min_time);
4258  
4259  		if (l->min_pid != l->max_pid)
4260  			len += sprintf(buf + len, " pid=%ld-%ld",
4261  				l->min_pid, l->max_pid);
4262  		else
4263  			len += sprintf(buf + len, " pid=%ld",
4264  				l->min_pid);
4265  
4266  		if (num_online_cpus() > 1 &&
4267  				!cpumask_empty(to_cpumask(l->cpus)) &&
4268  				len < PAGE_SIZE - 60)
4269  			len += scnprintf(buf + len, PAGE_SIZE - len - 50,
4270  					 " cpus=%*pbl",
4271  					 cpumask_pr_args(to_cpumask(l->cpus)));
4272  
4273  		if (nr_online_nodes > 1 && !nodes_empty(l->nodes) &&
4274  				len < PAGE_SIZE - 60)
4275  			len += scnprintf(buf + len, PAGE_SIZE - len - 50,
4276  					 " nodes=%*pbl",
4277  					 nodemask_pr_args(&l->nodes));
4278  
4279  		len += sprintf(buf + len, "\n");
4280  	}
4281  
4282  	free_loc_track(&t);
4283  	kfree(map);
4284  	if (!t.count)
4285  		len += sprintf(buf, "No data\n");
4286  	return len;
4287  }
4288  #endif
4289  
4290  #ifdef SLUB_RESILIENCY_TEST
4291  static void __init resiliency_test(void)
4292  {
4293  	u8 *p;
4294  
4295  	BUILD_BUG_ON(KMALLOC_MIN_SIZE > 16 || KMALLOC_SHIFT_HIGH < 10);
4296  
4297  	pr_err("SLUB resiliency testing\n");
4298  	pr_err("-----------------------\n");
4299  	pr_err("A. Corruption after allocation\n");
4300  
4301  	p = kzalloc(16, GFP_KERNEL);
4302  	p[16] = 0x12;
4303  	pr_err("\n1. kmalloc-16: Clobber Redzone/next pointer 0x12->0x%p\n\n",
4304  	       p + 16);
4305  
4306  	validate_slab_cache(kmalloc_caches[4]);
4307  
4308  	/* Hmmm... The next two are dangerous */
4309  	p = kzalloc(32, GFP_KERNEL);
4310  	p[32 + sizeof(void *)] = 0x34;
4311  	pr_err("\n2. kmalloc-32: Clobber next pointer/next slab 0x34 -> 0x%p\n",
4312  	       p);
4313  	pr_err("If allocated object is overwritten then not detectable\n\n");
4314  
4315  	validate_slab_cache(kmalloc_caches[5]);
4316  	p = kzalloc(64, GFP_KERNEL);
4317  	p += 64 + (get_cycles() & 0xff) * sizeof(void *);
4318  	*p = 0x56;
4319  	pr_err("\n3. kmalloc-64: corrupting random byte 0x56->0x%p\n",
4320  	       p);
4321  	pr_err("If allocated object is overwritten then not detectable\n\n");
4322  	validate_slab_cache(kmalloc_caches[6]);
4323  
4324  	pr_err("\nB. Corruption after free\n");
4325  	p = kzalloc(128, GFP_KERNEL);
4326  	kfree(p);
4327  	*p = 0x78;
4328  	pr_err("1. kmalloc-128: Clobber first word 0x78->0x%p\n\n", p);
4329  	validate_slab_cache(kmalloc_caches[7]);
4330  
4331  	p = kzalloc(256, GFP_KERNEL);
4332  	kfree(p);
4333  	p[50] = 0x9a;
4334  	pr_err("\n2. kmalloc-256: Clobber 50th byte 0x9a->0x%p\n\n", p);
4335  	validate_slab_cache(kmalloc_caches[8]);
4336  
4337  	p = kzalloc(512, GFP_KERNEL);
4338  	kfree(p);
4339  	p[512] = 0xab;
4340  	pr_err("\n3. kmalloc-512: Clobber redzone 0xab->0x%p\n\n", p);
4341  	validate_slab_cache(kmalloc_caches[9]);
4342  }
4343  #else
4344  #ifdef CONFIG_SYSFS
4345  static void resiliency_test(void) {}
4346  #endif
4347  #endif
4348  
4349  #ifdef CONFIG_SYSFS
4350  enum slab_stat_type {
4351  	SL_ALL,			/* All slabs */
4352  	SL_PARTIAL,		/* Only partially allocated slabs */
4353  	SL_CPU,			/* Only slabs used for cpu caches */
4354  	SL_OBJECTS,		/* Determine allocated objects not slabs */
4355  	SL_TOTAL		/* Determine object capacity not slabs */
4356  };
4357  
4358  #define SO_ALL		(1 << SL_ALL)
4359  #define SO_PARTIAL	(1 << SL_PARTIAL)
4360  #define SO_CPU		(1 << SL_CPU)
4361  #define SO_OBJECTS	(1 << SL_OBJECTS)
4362  #define SO_TOTAL	(1 << SL_TOTAL)
4363  
4364  static ssize_t show_slab_objects(struct kmem_cache *s,
4365  			    char *buf, unsigned long flags)
4366  {
4367  	unsigned long total = 0;
4368  	int node;
4369  	int x;
4370  	unsigned long *nodes;
4371  
4372  	nodes = kzalloc(sizeof(unsigned long) * nr_node_ids, GFP_KERNEL);
4373  	if (!nodes)
4374  		return -ENOMEM;
4375  
4376  	if (flags & SO_CPU) {
4377  		int cpu;
4378  
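		/*
		 * The cpu slabs keep changing under us, so c->page and
		 * c->partial are sampled with READ_ONCE() and the resulting
		 * counts are only approximate.
		 */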
4379  		for_each_possible_cpu(cpu) {
4380  			struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab,
4381  							       cpu);
4382  			int node;
4383  			struct page *page;
4384  
4385  			page = READ_ONCE(c->page);
4386  			if (!page)
4387  				continue;
4388  
4389  			node = page_to_nid(page);
4390  			if (flags & SO_TOTAL)
4391  				x = page->objects;
4392  			else if (flags & SO_OBJECTS)
4393  				x = page->inuse;
4394  			else
4395  				x = 1;
4396  
4397  			total += x;
4398  			nodes[node] += x;
4399  
4400  			page = READ_ONCE(c->partial);
4401  			if (page) {
4402  				node = page_to_nid(page);
4403  				if (flags & SO_TOTAL)
4404  					WARN_ON_ONCE(1);
4405  				else if (flags & SO_OBJECTS)
4406  					WARN_ON_ONCE(1);
4407  				else
4408  					x = page->pages;
4409  				total += x;
4410  				nodes[node] += x;
4411  			}
4412  		}
4413  	}
4414  
4415  	get_online_mems();
4416  #ifdef CONFIG_SLUB_DEBUG
4417  	if (flags & SO_ALL) {
4418  		struct kmem_cache_node *n;
4419  
4420  		for_each_kmem_cache_node(s, node, n) {
4421  
4422  			if (flags & SO_TOTAL)
4423  				x = atomic_long_read(&n->total_objects);
4424  			else if (flags & SO_OBJECTS)
4425  				x = atomic_long_read(&n->total_objects) -
4426  					count_partial(n, count_free);
4427  			else
4428  				x = atomic_long_read(&n->nr_slabs);
4429  			total += x;
4430  			nodes[node] += x;
4431  		}
4432  
4433  	} else
4434  #endif
4435  	if (flags & SO_PARTIAL) {
4436  		struct kmem_cache_node *n;
4437  
4438  		for_each_kmem_cache_node(s, node, n) {
4439  			if (flags & SO_TOTAL)
4440  				x = count_partial(n, count_total);
4441  			else if (flags & SO_OBJECTS)
4442  				x = count_partial(n, count_inuse);
4443  			else
4444  				x = n->nr_partial;
4445  			total += x;
4446  			nodes[node] += x;
4447  		}
4448  	}
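	/* Output format: "<total> N0=<count> N1=<count> ..." on NUMA builds. */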
4449  	x = sprintf(buf, "%lu", total);
4450  #ifdef CONFIG_NUMA
4451  	for (node = 0; node < nr_node_ids; node++)
4452  		if (nodes[node])
4453  			x += sprintf(buf + x, " N%d=%lu",
4454  					node, nodes[node]);
4455  #endif
4456  	put_online_mems();
4457  	kfree(nodes);
4458  	return x + sprintf(buf + x, "\n");
4459  }
4460  
4461  #ifdef CONFIG_SLUB_DEBUG
4462  static int any_slab_objects(struct kmem_cache *s)
4463  {
4464  	int node;
4465  	struct kmem_cache_node *n;
4466  
4467  	for_each_kmem_cache_node(s, node, n)
4468  		if (atomic_long_read(&n->total_objects))
4469  			return 1;
4470  
4471  	return 0;
4472  }
4473  #endif
4474  
4475  #define to_slab_attr(n) container_of(n, struct slab_attribute, attr)
4476  #define to_slab(n) container_of(n, struct kmem_cache, kobj)
4477  
4478  struct slab_attribute {
4479  	struct attribute attr;
4480  	ssize_t (*show)(struct kmem_cache *s, char *buf);
4481  	ssize_t (*store)(struct kmem_cache *s, const char *x, size_t count);
4482  };
4483  
4484  #define SLAB_ATTR_RO(_name) \
4485  	static struct slab_attribute _name##_attr = \
4486  	__ATTR(_name, 0400, _name##_show, NULL)
4487  
4488  #define SLAB_ATTR(_name) \
4489  	static struct slab_attribute _name##_attr =  \
4490  	__ATTR(_name, 0600, _name##_show, _name##_store)
4491  
4492  static ssize_t slab_size_show(struct kmem_cache *s, char *buf)
4493  {
4494  	return sprintf(buf, "%d\n", s->size);
4495  }
4496  SLAB_ATTR_RO(slab_size);
4497  
4498  static ssize_t align_show(struct kmem_cache *s, char *buf)
4499  {
4500  	return sprintf(buf, "%d\n", s->align);
4501  }
4502  SLAB_ATTR_RO(align);
4503  
4504  static ssize_t object_size_show(struct kmem_cache *s, char *buf)
4505  {
4506  	return sprintf(buf, "%d\n", s->object_size);
4507  }
4508  SLAB_ATTR_RO(object_size);
4509  
4510  static ssize_t objs_per_slab_show(struct kmem_cache *s, char *buf)
4511  {
4512  	return sprintf(buf, "%d\n", oo_objects(s->oo));
4513  }
4514  SLAB_ATTR_RO(objs_per_slab);
4515  
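/*
 * The page allocation order of a cache can be forced from user space, e.g.
 *	echo 2 > /sys/kernel/slab/<cache>/order
 * The value must lie between slub_min_order and slub_max_order.
 */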
4516  static ssize_t order_store(struct kmem_cache *s,
4517  				const char *buf, size_t length)
4518  {
4519  	unsigned long order;
4520  	int err;
4521  
4522  	err = kstrtoul(buf, 10, &order);
4523  	if (err)
4524  		return err;
4525  
4526  	if (order > slub_max_order || order < slub_min_order)
4527  		return -EINVAL;
4528  
4529  	calculate_sizes(s, order);
4530  	return length;
4531  }
4532  
4533  static ssize_t order_show(struct kmem_cache *s, char *buf)
4534  {
4535  	return sprintf(buf, "%d\n", oo_order(s->oo));
4536  }
4537  SLAB_ATTR(order);
4538  
4539  static ssize_t min_partial_show(struct kmem_cache *s, char *buf)
4540  {
4541  	return sprintf(buf, "%lu\n", s->min_partial);
4542  }
4543  
4544  static ssize_t min_partial_store(struct kmem_cache *s, const char *buf,
4545  				 size_t length)
4546  {
4547  	unsigned long min;
4548  	int err;
4549  
4550  	err = kstrtoul(buf, 10, &min);
4551  	if (err)
4552  		return err;
4553  
4554  	set_min_partial(s, min);
4555  	return length;
4556  }
4557  SLAB_ATTR(min_partial);
4558  
4559  static ssize_t cpu_partial_show(struct kmem_cache *s, char *buf)
4560  {
4561  	return sprintf(buf, "%u\n", s->cpu_partial);
4562  }
4563  
4564  static ssize_t cpu_partial_store(struct kmem_cache *s, const char *buf,
4565  				 size_t length)
4566  {
4567  	unsigned long objects;
4568  	int err;
4569  
4570  	err = kstrtoul(buf, 10, &objects);
4571  	if (err)
4572  		return err;
4573  	if (objects && !kmem_cache_has_cpu_partial(s))
4574  		return -EINVAL;
4575  
4576  	s->cpu_partial = objects;
4577  	flush_all(s);
4578  	return length;
4579  }
4580  SLAB_ATTR(cpu_partial);
4581  
4582  static ssize_t ctor_show(struct kmem_cache *s, char *buf)
4583  {
4584  	if (!s->ctor)
4585  		return 0;
4586  	return sprintf(buf, "%pS\n", s->ctor);
4587  }
4588  SLAB_ATTR_RO(ctor);
4589  
4590  static ssize_t aliases_show(struct kmem_cache *s, char *buf)
4591  {
4592  	return sprintf(buf, "%d\n", s->refcount < 0 ? 0 : s->refcount - 1);
4593  }
4594  SLAB_ATTR_RO(aliases);
4595  
4596  static ssize_t partial_show(struct kmem_cache *s, char *buf)
4597  {
4598  	return show_slab_objects(s, buf, SO_PARTIAL);
4599  }
4600  SLAB_ATTR_RO(partial);
4601  
4602  static ssize_t cpu_slabs_show(struct kmem_cache *s, char *buf)
4603  {
4604  	return show_slab_objects(s, buf, SO_CPU);
4605  }
4606  SLAB_ATTR_RO(cpu_slabs);
4607  
4608  static ssize_t objects_show(struct kmem_cache *s, char *buf)
4609  {
4610  	return show_slab_objects(s, buf, SO_ALL|SO_OBJECTS);
4611  }
4612  SLAB_ATTR_RO(objects);
4613  
4614  static ssize_t objects_partial_show(struct kmem_cache *s, char *buf)
4615  {
4616  	return show_slab_objects(s, buf, SO_PARTIAL|SO_OBJECTS);
4617  }
4618  SLAB_ATTR_RO(objects_partial);
4619  
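/*
 * Reports the per cpu partial lists as "<objects>(<pages>)", followed on
 * SMP by one " C<cpu>=<objects>(<pages>)" entry per online cpu.
 */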
4620  static ssize_t slabs_cpu_partial_show(struct kmem_cache *s, char *buf)
4621  {
4622  	int objects = 0;
4623  	int pages = 0;
4624  	int cpu;
4625  	int len;
4626  
4627  	for_each_online_cpu(cpu) {
4628  		struct page *page = per_cpu_ptr(s->cpu_slab, cpu)->partial;
4629  
4630  		if (page) {
4631  			pages += page->pages;
4632  			objects += page->pobjects;
4633  		}
4634  	}
4635  
4636  	len = sprintf(buf, "%d(%d)", objects, pages);
4637  
4638  #ifdef CONFIG_SMP
4639  	for_each_online_cpu(cpu) {
4640  		struct page *page = per_cpu_ptr(s->cpu_slab, cpu)->partial;
4641  
4642  		if (page && len < PAGE_SIZE - 20)
4643  			len += sprintf(buf + len, " C%d=%d(%d)", cpu,
4644  				page->pobjects, page->pages);
4645  	}
4646  #endif
4647  	return len + sprintf(buf + len, "\n");
4648  }
4649  SLAB_ATTR_RO(slabs_cpu_partial);
4650  
4651  static ssize_t reclaim_account_show(struct kmem_cache *s, char *buf)
4652  {
4653  	return sprintf(buf, "%d\n", !!(s->flags & SLAB_RECLAIM_ACCOUNT));
4654  }
4655  
4656  static ssize_t reclaim_account_store(struct kmem_cache *s,
4657  				const char *buf, size_t length)
4658  {
4659  	s->flags &= ~SLAB_RECLAIM_ACCOUNT;
4660  	if (buf[0] == '1')
4661  		s->flags |= SLAB_RECLAIM_ACCOUNT;
4662  	return length;
4663  }
4664  SLAB_ATTR(reclaim_account);
4665  
4666  static ssize_t hwcache_align_show(struct kmem_cache *s, char *buf)
4667  {
4668  	return sprintf(buf, "%d\n", !!(s->flags & SLAB_HWCACHE_ALIGN));
4669  }
4670  SLAB_ATTR_RO(hwcache_align);
4671  
4672  #ifdef CONFIG_ZONE_DMA
4673  static ssize_t cache_dma_show(struct kmem_cache *s, char *buf)
4674  {
4675  	return sprintf(buf, "%d\n", !!(s->flags & SLAB_CACHE_DMA));
4676  }
4677  SLAB_ATTR_RO(cache_dma);
4678  #endif
4679  
4680  static ssize_t destroy_by_rcu_show(struct kmem_cache *s, char *buf)
4681  {
4682  	return sprintf(buf, "%d\n", !!(s->flags & SLAB_DESTROY_BY_RCU));
4683  }
4684  SLAB_ATTR_RO(destroy_by_rcu);
4685  
4686  static ssize_t reserved_show(struct kmem_cache *s, char *buf)
4687  {
4688  	return sprintf(buf, "%d\n", s->reserved);
4689  }
4690  SLAB_ATTR_RO(reserved);
4691  
4692  #ifdef CONFIG_SLUB_DEBUG
4693  static ssize_t slabs_show(struct kmem_cache *s, char *buf)
4694  {
4695  	return show_slab_objects(s, buf, SO_ALL);
4696  }
4697  SLAB_ATTR_RO(slabs);
4698  
4699  static ssize_t total_objects_show(struct kmem_cache *s, char *buf)
4700  {
4701  	return show_slab_objects(s, buf, SO_ALL|SO_TOTAL);
4702  }
4703  SLAB_ATTR_RO(total_objects);
4704  
4705  static ssize_t sanity_checks_show(struct kmem_cache *s, char *buf)
4706  {
4707  	return sprintf(buf, "%d\n", !!(s->flags & SLAB_DEBUG_FREE));
4708  }
4709  
4710  static ssize_t sanity_checks_store(struct kmem_cache *s,
4711  				const char *buf, size_t length)
4712  {
4713  	s->flags &= ~SLAB_DEBUG_FREE;
4714  	if (buf[0] == '1') {
4715  		s->flags &= ~__CMPXCHG_DOUBLE;
4716  		s->flags |= SLAB_DEBUG_FREE;
4717  	}
4718  	return length;
4719  }
4720  SLAB_ATTR(sanity_checks);
4721  
4722  static ssize_t trace_show(struct kmem_cache *s, char *buf)
4723  {
4724  	return sprintf(buf, "%d\n", !!(s->flags & SLAB_TRACE));
4725  }
4726  
4727  static ssize_t trace_store(struct kmem_cache *s, const char *buf,
4728  							size_t length)
4729  {
4730  	/*
4731  	 * Tracing a merged cache is going to give confusing results
4732  	 * as well as cause other issues like converting a mergeable
4733  	 * cache into an unmergeable one.
4734  	 */
4735  	if (s->refcount > 1)
4736  		return -EINVAL;
4737  
4738  	s->flags &= ~SLAB_TRACE;
4739  	if (buf[0] == '1') {
4740  		s->flags &= ~__CMPXCHG_DOUBLE;
4741  		s->flags |= SLAB_TRACE;
4742  	}
4743  	return length;
4744  }
4745  SLAB_ATTR(trace);
4746  
4747  static ssize_t red_zone_show(struct kmem_cache *s, char *buf)
4748  {
4749  	return sprintf(buf, "%d\n", !!(s->flags & SLAB_RED_ZONE));
4750  }
4751  
4752  static ssize_t red_zone_store(struct kmem_cache *s,
4753  				const char *buf, size_t length)
4754  {
4755  	if (any_slab_objects(s))
4756  		return -EBUSY;
4757  
4758  	s->flags &= ~SLAB_RED_ZONE;
4759  	if (buf[0] == '1') {
4760  		s->flags &= ~__CMPXCHG_DOUBLE;
4761  		s->flags |= SLAB_RED_ZONE;
4762  	}
4763  	calculate_sizes(s, -1);
4764  	return length;
4765  }
4766  SLAB_ATTR(red_zone);
4767  
4768  static ssize_t poison_show(struct kmem_cache *s, char *buf)
4769  {
4770  	return sprintf(buf, "%d\n", !!(s->flags & SLAB_POISON));
4771  }
4772  
4773  static ssize_t poison_store(struct kmem_cache *s,
4774  				const char *buf, size_t length)
4775  {
4776  	if (any_slab_objects(s))
4777  		return -EBUSY;
4778  
4779  	s->flags &= ~SLAB_POISON;
4780  	if (buf[0] == '1') {
4781  		s->flags &= ~__CMPXCHG_DOUBLE;
4782  		s->flags |= SLAB_POISON;
4783  	}
4784  	calculate_sizes(s, -1);
4785  	return length;
4786  }
4787  SLAB_ATTR(poison);
4788  
4789  static ssize_t store_user_show(struct kmem_cache *s, char *buf)
4790  {
4791  	return sprintf(buf, "%d\n", !!(s->flags & SLAB_STORE_USER));
4792  }
4793  
4794  static ssize_t store_user_store(struct kmem_cache *s,
4795  				const char *buf, size_t length)
4796  {
4797  	if (any_slab_objects(s))
4798  		return -EBUSY;
4799  
4800  	s->flags &= ~SLAB_STORE_USER;
4801  	if (buf[0] == '1') {
4802  		s->flags &= ~__CMPXCHG_DOUBLE;
4803  		s->flags |= SLAB_STORE_USER;
4804  	}
4805  	calculate_sizes(s, -1);
4806  	return length;
4807  }
4808  SLAB_ATTR(store_user);
4809  
4810  static ssize_t validate_show(struct kmem_cache *s, char *buf)
4811  {
4812  	return 0;
4813  }
4814  
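/*
 * Writing "1" to /sys/kernel/slab/<cache>/validate runs validate_slab_cache()
 * over the cache; any inconsistency found is reported to the kernel log.
 */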
4815  static ssize_t validate_store(struct kmem_cache *s,
4816  			const char *buf, size_t length)
4817  {
4818  	int ret = -EINVAL;
4819  
4820  	if (buf[0] == '1') {
4821  		ret = validate_slab_cache(s);
4822  		if (ret >= 0)
4823  			ret = length;
4824  	}
4825  	return ret;
4826  }
4827  SLAB_ATTR(validate);
4828  
4829  static ssize_t alloc_calls_show(struct kmem_cache *s, char *buf)
4830  {
4831  	if (!(s->flags & SLAB_STORE_USER))
4832  		return -ENOSYS;
4833  	return list_locations(s, buf, TRACK_ALLOC);
4834  }
4835  SLAB_ATTR_RO(alloc_calls);
4836  
4837  static ssize_t free_calls_show(struct kmem_cache *s, char *buf)
4838  {
4839  	if (!(s->flags & SLAB_STORE_USER))
4840  		return -ENOSYS;
4841  	return list_locations(s, buf, TRACK_FREE);
4842  }
4843  SLAB_ATTR_RO(free_calls);
4844  #endif /* CONFIG_SLUB_DEBUG */
4845  
4846  #ifdef CONFIG_FAILSLAB
4847  static ssize_t failslab_show(struct kmem_cache *s, char *buf)
4848  {
4849  	return sprintf(buf, "%d\n", !!(s->flags & SLAB_FAILSLAB));
4850  }
4851  
4852  static ssize_t failslab_store(struct kmem_cache *s, const char *buf,
4853  							size_t length)
4854  {
4855  	if (s->refcount > 1)
4856  		return -EINVAL;
4857  
4858  	s->flags &= ~SLAB_FAILSLAB;
4859  	if (buf[0] == '1')
4860  		s->flags |= SLAB_FAILSLAB;
4861  	return length;
4862  }
4863  SLAB_ATTR(failslab);
4864  #endif
4865  
4866  static ssize_t shrink_show(struct kmem_cache *s, char *buf)
4867  {
4868  	return 0;
4869  }
4870  
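/* Writing "1" to /sys/kernel/slab/<cache>/shrink calls kmem_cache_shrink(). */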
4871  static ssize_t shrink_store(struct kmem_cache *s,
4872  			const char *buf, size_t length)
4873  {
4874  	if (buf[0] == '1')
4875  		kmem_cache_shrink(s);
4876  	else
4877  		return -EINVAL;
4878  	return length;
4879  }
4880  SLAB_ATTR(shrink);
4881  
4882  #ifdef CONFIG_NUMA
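/*
 * remote_node_defrag_ratio is exposed to user space as a percentage
 * (0-100) but stored internally scaled by a factor of 10.
 */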
4883  static ssize_t remote_node_defrag_ratio_show(struct kmem_cache *s, char *buf)
4884  {
4885  	return sprintf(buf, "%d\n", s->remote_node_defrag_ratio / 10);
4886  }
4887  
4888  static ssize_t remote_node_defrag_ratio_store(struct kmem_cache *s,
4889  				const char *buf, size_t length)
4890  {
4891  	unsigned long ratio;
4892  	int err;
4893  
4894  	err = kstrtoul(buf, 10, &ratio);
4895  	if (err)
4896  		return err;
4897  
4898  	if (ratio <= 100)
4899  		s->remote_node_defrag_ratio = ratio * 10;
4900  
4901  	return length;
4902  }
4903  SLAB_ATTR(remote_node_defrag_ratio);
4904  #endif
4905  
4906  #ifdef CONFIG_SLUB_STATS
4907  static int show_stat(struct kmem_cache *s, char *buf, enum stat_item si)
4908  {
4909  	unsigned long sum  = 0;
4910  	int cpu;
4911  	int len;
4912  	int *data = kmalloc(nr_cpu_ids * sizeof(int), GFP_KERNEL);
4913  
4914  	if (!data)
4915  		return -ENOMEM;
4916  
4917  	for_each_online_cpu(cpu) {
4918  		unsigned x = per_cpu_ptr(s->cpu_slab, cpu)->stat[si];
4919  
4920  		data[cpu] = x;
4921  		sum += x;
4922  	}
4923  
4924  	len = sprintf(buf, "%lu", sum);
4925  
4926  #ifdef CONFIG_SMP
4927  	for_each_online_cpu(cpu) {
4928  		if (data[cpu] && len < PAGE_SIZE - 20)
4929  			len += sprintf(buf + len, " C%d=%u", cpu, data[cpu]);
4930  	}
4931  #endif
4932  	kfree(data);
4933  	return len + sprintf(buf + len, "\n");
4934  }
4935  
4936  static void clear_stat(struct kmem_cache *s, enum stat_item si)
4937  {
4938  	int cpu;
4939  
4940  	for_each_online_cpu(cpu)
4941  		per_cpu_ptr(s->cpu_slab, cpu)->stat[si] = 0;
4942  }
4943  
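/*
 * Each statistics file shows the summed count followed by per cpu values;
 * writing "0" to it clears the counters, any other value is rejected.
 */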
4944  #define STAT_ATTR(si, text) 					\
4945  static ssize_t text##_show(struct kmem_cache *s, char *buf)	\
4946  {								\
4947  	return show_stat(s, buf, si);				\
4948  }								\
4949  static ssize_t text##_store(struct kmem_cache *s,		\
4950  				const char *buf, size_t length)	\
4951  {								\
4952  	if (buf[0] != '0')					\
4953  		return -EINVAL;					\
4954  	clear_stat(s, si);					\
4955  	return length;						\
4956  }								\
4957  SLAB_ATTR(text);						\
4958  
4959  STAT_ATTR(ALLOC_FASTPATH, alloc_fastpath);
4960  STAT_ATTR(ALLOC_SLOWPATH, alloc_slowpath);
4961  STAT_ATTR(FREE_FASTPATH, free_fastpath);
4962  STAT_ATTR(FREE_SLOWPATH, free_slowpath);
4963  STAT_ATTR(FREE_FROZEN, free_frozen);
4964  STAT_ATTR(FREE_ADD_PARTIAL, free_add_partial);
4965  STAT_ATTR(FREE_REMOVE_PARTIAL, free_remove_partial);
4966  STAT_ATTR(ALLOC_FROM_PARTIAL, alloc_from_partial);
4967  STAT_ATTR(ALLOC_SLAB, alloc_slab);
4968  STAT_ATTR(ALLOC_REFILL, alloc_refill);
4969  STAT_ATTR(ALLOC_NODE_MISMATCH, alloc_node_mismatch);
4970  STAT_ATTR(FREE_SLAB, free_slab);
4971  STAT_ATTR(CPUSLAB_FLUSH, cpuslab_flush);
4972  STAT_ATTR(DEACTIVATE_FULL, deactivate_full);
4973  STAT_ATTR(DEACTIVATE_EMPTY, deactivate_empty);
4974  STAT_ATTR(DEACTIVATE_TO_HEAD, deactivate_to_head);
4975  STAT_ATTR(DEACTIVATE_TO_TAIL, deactivate_to_tail);
4976  STAT_ATTR(DEACTIVATE_REMOTE_FREES, deactivate_remote_frees);
4977  STAT_ATTR(DEACTIVATE_BYPASS, deactivate_bypass);
4978  STAT_ATTR(ORDER_FALLBACK, order_fallback);
4979  STAT_ATTR(CMPXCHG_DOUBLE_CPU_FAIL, cmpxchg_double_cpu_fail);
4980  STAT_ATTR(CMPXCHG_DOUBLE_FAIL, cmpxchg_double_fail);
4981  STAT_ATTR(CPU_PARTIAL_ALLOC, cpu_partial_alloc);
4982  STAT_ATTR(CPU_PARTIAL_FREE, cpu_partial_free);
4983  STAT_ATTR(CPU_PARTIAL_NODE, cpu_partial_node);
4984  STAT_ATTR(CPU_PARTIAL_DRAIN, cpu_partial_drain);
4985  #endif
4986  
4987  static struct attribute *slab_attrs[] = {
4988  	&slab_size_attr.attr,
4989  	&object_size_attr.attr,
4990  	&objs_per_slab_attr.attr,
4991  	&order_attr.attr,
4992  	&min_partial_attr.attr,
4993  	&cpu_partial_attr.attr,
4994  	&objects_attr.attr,
4995  	&objects_partial_attr.attr,
4996  	&partial_attr.attr,
4997  	&cpu_slabs_attr.attr,
4998  	&ctor_attr.attr,
4999  	&aliases_attr.attr,
5000  	&align_attr.attr,
5001  	&hwcache_align_attr.attr,
5002  	&reclaim_account_attr.attr,
5003  	&destroy_by_rcu_attr.attr,
5004  	&shrink_attr.attr,
5005  	&reserved_attr.attr,
5006  	&slabs_cpu_partial_attr.attr,
5007  #ifdef CONFIG_SLUB_DEBUG
5008  	&total_objects_attr.attr,
5009  	&slabs_attr.attr,
5010  	&sanity_checks_attr.attr,
5011  	&trace_attr.attr,
5012  	&red_zone_attr.attr,
5013  	&poison_attr.attr,
5014  	&store_user_attr.attr,
5015  	&validate_attr.attr,
5016  	&alloc_calls_attr.attr,
5017  	&free_calls_attr.attr,
5018  #endif
5019  #ifdef CONFIG_ZONE_DMA
5020  	&cache_dma_attr.attr,
5021  #endif
5022  #ifdef CONFIG_NUMA
5023  	&remote_node_defrag_ratio_attr.attr,
5024  #endif
5025  #ifdef CONFIG_SLUB_STATS
5026  	&alloc_fastpath_attr.attr,
5027  	&alloc_slowpath_attr.attr,
5028  	&free_fastpath_attr.attr,
5029  	&free_slowpath_attr.attr,
5030  	&free_frozen_attr.attr,
5031  	&free_add_partial_attr.attr,
5032  	&free_remove_partial_attr.attr,
5033  	&alloc_from_partial_attr.attr,
5034  	&alloc_slab_attr.attr,
5035  	&alloc_refill_attr.attr,
5036  	&alloc_node_mismatch_attr.attr,
5037  	&free_slab_attr.attr,
5038  	&cpuslab_flush_attr.attr,
5039  	&deactivate_full_attr.attr,
5040  	&deactivate_empty_attr.attr,
5041  	&deactivate_to_head_attr.attr,
5042  	&deactivate_to_tail_attr.attr,
5043  	&deactivate_remote_frees_attr.attr,
5044  	&deactivate_bypass_attr.attr,
5045  	&order_fallback_attr.attr,
5046  	&cmpxchg_double_fail_attr.attr,
5047  	&cmpxchg_double_cpu_fail_attr.attr,
5048  	&cpu_partial_alloc_attr.attr,
5049  	&cpu_partial_free_attr.attr,
5050  	&cpu_partial_node_attr.attr,
5051  	&cpu_partial_drain_attr.attr,
5052  #endif
5053  #ifdef CONFIG_FAILSLAB
5054  	&failslab_attr.attr,
5055  #endif
5056  
5057  	NULL
5058  };
5059  
5060  static struct attribute_group slab_attr_group = {
5061  	.attrs = slab_attrs,
5062  };
5063  
5064  static ssize_t slab_attr_show(struct kobject *kobj,
5065  				struct attribute *attr,
5066  				char *buf)
5067  {
5068  	struct slab_attribute *attribute;
5069  	struct kmem_cache *s;
5070  	int err;
5071  
5072  	attribute = to_slab_attr(attr);
5073  	s = to_slab(kobj);
5074  
5075  	if (!attribute->show)
5076  		return -EIO;
5077  
5078  	err = attribute->show(s, buf);
5079  
5080  	return err;
5081  }
5082  
5083  static ssize_t slab_attr_store(struct kobject *kobj,
5084  				struct attribute *attr,
5085  				const char *buf, size_t len)
5086  {
5087  	struct slab_attribute *attribute;
5088  	struct kmem_cache *s;
5089  	int err;
5090  
5091  	attribute = to_slab_attr(attr);
5092  	s = to_slab(kobj);
5093  
5094  	if (!attribute->store)
5095  		return -EIO;
5096  
5097  	err = attribute->store(s, buf, len);
5098  #ifdef CONFIG_MEMCG_KMEM
5099  	if (slab_state >= FULL && err >= 0 && is_root_cache(s)) {
5100  		struct kmem_cache *c;
5101  
5102  		mutex_lock(&slab_mutex);
5103  		if (s->max_attr_size < len)
5104  			s->max_attr_size = len;
5105  
5106  		/*
5107  		 * This is a best effort propagation, so this function's return
5108  		 * value will be determined by the parent cache only. This is
5109  		 * basically because not all attributes will have a well
5110  		 * defined semantics for rollbacks - most of the actions will
5111  		 * have permanent effects.
5112  		 *
5113  		 * Returning the error value of any child that fails is not
5114  		 * well defined, in the sense that a user seeing the error
5115  		 * code would not be able to tell anything about the state of
5116  		 * the cache.
5117  		 *
5118  		 * Only returning the error code for the parent cache at least
5119  		 * has well defined semantics: the cache written to directly
5120  		 * either failed or succeeded, and on success we loop through
5121  		 * the descendants with best-effort propagation.
5122  		 */
5123  		for_each_memcg_cache(c, s)
5124  			attribute->store(c, buf, len);
5125  		mutex_unlock(&slab_mutex);
5126  	}
5127  #endif
5128  	return err;
5129  }
5130  
5131  static void memcg_propagate_slab_attrs(struct kmem_cache *s)
5132  {
5133  #ifdef CONFIG_MEMCG_KMEM
5134  	int i;
5135  	char *buffer = NULL;
5136  	struct kmem_cache *root_cache;
5137  
5138  	if (is_root_cache(s))
5139  		return;
5140  
5141  	root_cache = s->memcg_params.root_cache;
5142  
5143  	/*
5144  	 * This means the root cache had no attribute written to it. There is
5145  	 * therefore no point in copying default values around.
5146  	 */
5147  	if (!root_cache->max_attr_size)
5148  		return;
5149  
5150  	for (i = 0; i < ARRAY_SIZE(slab_attrs); i++) {
5151  		char mbuf[64];
5152  		char *buf;
5153  		struct slab_attribute *attr = to_slab_attr(slab_attrs[i]);
5154  
5155  		if (!attr || !attr->store || !attr->show)
5156  			continue;
5157  
5158  		/*
5159  		 * It is really bad that we have to allocate here, so we will
5160  		 * do it only as a fallback. If we actually allocate, though,
5161  		 * we can just use the allocated buffer until the end.
5162  		 *
5163  		 * Most of the slub attributes will tend to be very small in
5164  		 * size, but sysfs allows buffers up to a page, so page-sized
5165  		 * values can theoretically occur.
5166  		 */
5167  		if (buffer)
5168  			buf = buffer;
5169  		else if (root_cache->max_attr_size < ARRAY_SIZE(mbuf))
5170  			buf = mbuf;
5171  		else {
5172  			buffer = (char *) get_zeroed_page(GFP_KERNEL);
5173  			if (WARN_ON(!buffer))
5174  				continue;
5175  			buf = buffer;
5176  		}
5177  
5178  		attr->show(root_cache, buf);
5179  		attr->store(s, buf, strlen(buf));
5180  	}
5181  
5182  	if (buffer)
5183  		free_page((unsigned long)buffer);
5184  #endif
5185  }
5186  
5187  static void kmem_cache_release(struct kobject *k)
5188  {
5189  	slab_kmem_cache_release(to_slab(k));
5190  }
5191  
5192  static const struct sysfs_ops slab_sysfs_ops = {
5193  	.show = slab_attr_show,
5194  	.store = slab_attr_store,
5195  };
5196  
5197  static struct kobj_type slab_ktype = {
5198  	.sysfs_ops = &slab_sysfs_ops,
5199  	.release = kmem_cache_release,
5200  };
5201  
5202  static int uevent_filter(struct kset *kset, struct kobject *kobj)
5203  {
5204  	struct kobj_type *ktype = get_ktype(kobj);
5205  
5206  	if (ktype == &slab_ktype)
5207  		return 1;
5208  	return 0;
5209  }
5210  
5211  static const struct kset_uevent_ops slab_uevent_ops = {
5212  	.filter = uevent_filter,
5213  };
5214  
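/* Root of the /sys/kernel/slab hierarchy, created by slab_sysfs_init(). */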
5215  static struct kset *slab_kset;
5216  
5217  static inline struct kset *cache_kset(struct kmem_cache *s)
5218  {
5219  #ifdef CONFIG_MEMCG_KMEM
5220  	if (!is_root_cache(s))
5221  		return s->memcg_params.root_cache->memcg_kset;
5222  #endif
5223  	return slab_kset;
5224  }
5225  
5226  #define ID_STR_LENGTH 64
5227  
5228  /* Create a unique string id for a slab cache:
5229   *
5230   * Format	:[flags-]size
5231   */
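/*
 * Example: a 192 byte cache with none of the flags handled below set is
 * simply named ":0000192"; if any of those flags are set, their letters
 * and a '-' precede the size.
 */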
5232  static char *create_unique_id(struct kmem_cache *s)
5233  {
5234  	char *name = kmalloc(ID_STR_LENGTH, GFP_KERNEL);
5235  	char *p = name;
5236  
5237  	BUG_ON(!name);
5238  
5239  	*p++ = ':';
5240  	/*
5241  	 * First flags affecting slabcache operations. We will only
5242  	 * get here for aliasable slabs so we do not need to support
5243  	 * too many flags. The flags here must cover all flags that
5244  	 * are matched during merging to guarantee that the id is
5245  	 * unique.
5246  	 */
5247  	if (s->flags & SLAB_CACHE_DMA)
5248  		*p++ = 'd';
5249  	if (s->flags & SLAB_RECLAIM_ACCOUNT)
5250  		*p++ = 'a';
5251  	if (s->flags & SLAB_DEBUG_FREE)
5252  		*p++ = 'F';
5253  	if (!(s->flags & SLAB_NOTRACK))
5254  		*p++ = 't';
5255  	if (p != name + 1)
5256  		*p++ = '-';
5257  	p += sprintf(p, "%07d", s->size);
5258  
5259  	BUG_ON(p > name + ID_STR_LENGTH - 1);
5260  	return name;
5261  }
5262  
5263  static int sysfs_slab_add(struct kmem_cache *s)
5264  {
5265  	int err;
5266  	const char *name;
5267  	int unmergeable = slab_unmergeable(s);
5268  
5269  	if (unmergeable) {
5270  		/*
5271  		 * Slabcache can never be merged so we can use the name proper.
5272  		 * This is typically the case for debug situations. In that
5273  		 * case we can catch duplicate names easily.
5274  		 */
5275  		sysfs_remove_link(&slab_kset->kobj, s->name);
5276  		name = s->name;
5277  	} else {
5278  		/*
5279  		 * Create a unique name for the slab as a target
5280  		 * for the symlinks.
5281  		 */
5282  		name = create_unique_id(s);
5283  	}
5284  
5285  	s->kobj.kset = cache_kset(s);
5286  	err = kobject_init_and_add(&s->kobj, &slab_ktype, NULL, "%s", name);
5287  	if (err)
5288  		goto out;
5289  
5290  	err = sysfs_create_group(&s->kobj, &slab_attr_group);
5291  	if (err)
5292  		goto out_del_kobj;
5293  
5294  #ifdef CONFIG_MEMCG_KMEM
5295  	if (is_root_cache(s)) {
5296  		s->memcg_kset = kset_create_and_add("cgroup", NULL, &s->kobj);
5297  		if (!s->memcg_kset) {
5298  			err = -ENOMEM;
5299  			goto out_del_kobj;
5300  		}
5301  	}
5302  #endif
5303  
5304  	kobject_uevent(&s->kobj, KOBJ_ADD);
5305  	if (!unmergeable) {
5306  		/* Setup first alias */
5307  		sysfs_slab_alias(s, s->name);
5308  	}
5309  out:
5310  	if (!unmergeable)
5311  		kfree(name);
5312  	return err;
5313  out_del_kobj:
5314  	kobject_del(&s->kobj);
5315  	goto out;
5316  }
5317  
5318  void sysfs_slab_remove(struct kmem_cache *s)
5319  {
5320  	if (slab_state < FULL)
5321  		/*
5322  		 * Sysfs has not been setup yet so no need to remove the
5323  		 * cache from sysfs.
5324  		 */
5325  		return;
5326  
5327  #ifdef CONFIG_MEMCG_KMEM
5328  	kset_unregister(s->memcg_kset);
5329  #endif
5330  	kobject_uevent(&s->kobj, KOBJ_REMOVE);
5331  	kobject_del(&s->kobj);
5332  	kobject_put(&s->kobj);
5333  }
5334  
5335  /*
5336   * Need to buffer aliases during bootup until sysfs becomes
5337   * available lest we lose that information.
5338   */
5339  struct saved_alias {
5340  	struct kmem_cache *s;
5341  	const char *name;
5342  	struct saved_alias *next;
5343  };
5344  
5345  static struct saved_alias *alias_list;
5346  
5347  static int sysfs_slab_alias(struct kmem_cache *s, const char *name)
5348  {
5349  	struct saved_alias *al;
5350  
5351  	if (slab_state == FULL) {
5352  		/*
5353  		 * If we have a leftover link then remove it.
5354  		 */
5355  		sysfs_remove_link(&slab_kset->kobj, name);
5356  		return sysfs_create_link(&slab_kset->kobj, &s->kobj, name);
5357  	}
5358  
5359  	al = kmalloc(sizeof(struct saved_alias), GFP_KERNEL);
5360  	if (!al)
5361  		return -ENOMEM;
5362  
5363  	al->s = s;
5364  	al->name = name;
5365  	al->next = alias_list;
5366  	alias_list = al;
5367  	return 0;
5368  }
5369  
5370  static int __init slab_sysfs_init(void)
5371  {
5372  	struct kmem_cache *s;
5373  	int err;
5374  
5375  	mutex_lock(&slab_mutex);
5376  
5377  	slab_kset = kset_create_and_add("slab", &slab_uevent_ops, kernel_kobj);
5378  	if (!slab_kset) {
5379  		mutex_unlock(&slab_mutex);
5380  		pr_err("Cannot register slab subsystem.\n");
5381  		return -ENOSYS;
5382  	}
5383  
5384  	slab_state = FULL;
5385  
5386  	list_for_each_entry(s, &slab_caches, list) {
5387  		err = sysfs_slab_add(s);
5388  		if (err)
5389  			pr_err("SLUB: Unable to add boot slab %s to sysfs\n",
5390  			       s->name);
5391  	}
5392  
5393  	while (alias_list) {
5394  		struct saved_alias *al = alias_list;
5395  
5396  		alias_list = alias_list->next;
5397  		err = sysfs_slab_alias(al->s, al->name);
5398  		if (err)
5399  			pr_err("SLUB: Unable to add boot slab alias %s to sysfs\n",
5400  			       al->name);
5401  		kfree(al);
5402  	}
5403  
5404  	mutex_unlock(&slab_mutex);
5405  	resiliency_test();
5406  	return 0;
5407  }
5408  
5409  __initcall(slab_sysfs_init);
5410  #endif /* CONFIG_SYSFS */
5411  
5412  /*
5413   * The /proc/slabinfo ABI
5414   */
5415  #ifdef CONFIG_SLABINFO
5416  void get_slabinfo(struct kmem_cache *s, struct slabinfo *sinfo)
5417  {
5418  	unsigned long nr_slabs = 0;
5419  	unsigned long nr_objs = 0;
5420  	unsigned long nr_free = 0;
5421  	int node;
5422  	struct kmem_cache_node *n;
5423  
5424  	for_each_kmem_cache_node(s, node, n) {
5425  		nr_slabs += node_nr_slabs(n);
5426  		nr_objs += node_nr_objs(n);
5427  		nr_free += count_partial(n, count_free);
5428  	}
5429  
5430  	sinfo->active_objs = nr_objs - nr_free;
5431  	sinfo->num_objs = nr_objs;
5432  	sinfo->active_slabs = nr_slabs;
5433  	sinfo->num_slabs = nr_slabs;
5434  	sinfo->objects_per_slab = oo_objects(s->oo);
5435  	sinfo->cache_order = oo_order(s->oo);
5436  }
5437  
5438  void slabinfo_show_stats(struct seq_file *m, struct kmem_cache *s)
5439  {
5440  }
5441  
5442  ssize_t slabinfo_write(struct file *file, const char __user *buffer,
5443  		       size_t count, loff_t *ppos)
5444  {
5445  	return -EIO;
5446  }
5447  #endif /* CONFIG_SLABINFO */
5448