xref: /openbmc/linux/mm/zsmalloc.c (revision fa7f32422ea1ac276b45b96a540ed5981caaa61f)
1  /*
2   * zsmalloc memory allocator
3   *
4   * Copyright (C) 2011  Nitin Gupta
5   * Copyright (C) 2012, 2013 Minchan Kim
6   *
7   * This code is released using a dual license strategy: BSD/GPL
8   * You can choose the license that better fits your requirements.
9   *
10   * Released under the terms of 3-clause BSD License
11   * Released under the terms of GNU General Public License Version 2.0
12   */
13  
14  /*
15   * Following is how we use various fields and flags of underlying
16   * struct page(s) to form a zspage.
17   *
18   * Usage of struct page fields:
19   *	page->private: points to zspage
20   *	page->freelist(index): links together all component pages of a zspage
21   *		For the huge page, this is always 0, so we use this field
22   *		to store handle.
23   *	page->units: first object offset in a subpage of zspage
24   *
25   * Usage of struct page flags:
26   *	PG_private: identifies the first component page
27   *	PG_private2: identifies the last component page
28   *	PG_owner_priv_1: indentifies the huge component page
29   *
30   */
31  
32  #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
33  
34  #include <linux/module.h>
35  #include <linux/kernel.h>
36  #include <linux/sched.h>
37  #include <linux/bitops.h>
38  #include <linux/errno.h>
39  #include <linux/highmem.h>
40  #include <linux/string.h>
41  #include <linux/slab.h>
42  #include <asm/tlbflush.h>
43  #include <asm/pgtable.h>
44  #include <linux/cpumask.h>
45  #include <linux/cpu.h>
46  #include <linux/vmalloc.h>
47  #include <linux/preempt.h>
48  #include <linux/spinlock.h>
49  #include <linux/types.h>
50  #include <linux/debugfs.h>
51  #include <linux/zsmalloc.h>
52  #include <linux/zpool.h>
53  #include <linux/mount.h>
54  #include <linux/migrate.h>
55  #include <linux/pagemap.h>
56  
57  #define ZSPAGE_MAGIC	0x58
58  
59  /*
60   * This must be power of 2 and greater than of equal to sizeof(link_free).
61   * These two conditions ensure that any 'struct link_free' itself doesn't
62   * span more than 1 page which avoids complex case of mapping 2 pages simply
63   * to restore link_free pointer values.
64   */
65  #define ZS_ALIGN		8
66  
67  /*
68   * A single 'zspage' is composed of up to 2^N discontiguous 0-order (single)
69   * pages. ZS_MAX_ZSPAGE_ORDER defines upper limit on N.
70   */
71  #define ZS_MAX_ZSPAGE_ORDER 2
72  #define ZS_MAX_PAGES_PER_ZSPAGE (_AC(1, UL) << ZS_MAX_ZSPAGE_ORDER)
73  
74  #define ZS_HANDLE_SIZE (sizeof(unsigned long))
75  
76  /*
77   * Object location (<PFN>, <obj_idx>) is encoded as
78   * as single (unsigned long) handle value.
79   *
80   * Note that object index <obj_idx> starts from 0.
81   *
82   * This is made more complicated by various memory models and PAE.
83   */
84  
85  #ifndef MAX_PHYSMEM_BITS
86  #ifdef CONFIG_HIGHMEM64G
87  #define MAX_PHYSMEM_BITS 36
88  #else /* !CONFIG_HIGHMEM64G */
89  /*
90   * If this definition of MAX_PHYSMEM_BITS is used, OBJ_INDEX_BITS will just
91   * be PAGE_SHIFT
92   */
93  #define MAX_PHYSMEM_BITS BITS_PER_LONG
94  #endif
95  #endif
96  #define _PFN_BITS		(MAX_PHYSMEM_BITS - PAGE_SHIFT)
97  
98  /*
99   * Memory for allocating for handle keeps object position by
100   * encoding <page, obj_idx> and the encoded value has a room
101   * in least bit(ie, look at obj_to_location).
102   * We use the bit to synchronize between object access by
103   * user and migration.
104   */
105  #define HANDLE_PIN_BIT	0
106  
107  /*
108   * Head in allocated object should have OBJ_ALLOCATED_TAG
109   * to identify the object was allocated or not.
110   * It's okay to add the status bit in the least bit because
111   * header keeps handle which is 4byte-aligned address so we
112   * have room for two bit at least.
113   */
114  #define OBJ_ALLOCATED_TAG 1
115  #define OBJ_TAG_BITS 1
116  #define OBJ_INDEX_BITS	(BITS_PER_LONG - _PFN_BITS - OBJ_TAG_BITS)
117  #define OBJ_INDEX_MASK	((_AC(1, UL) << OBJ_INDEX_BITS) - 1)
118  
119  #define MAX(a, b) ((a) >= (b) ? (a) : (b))
120  /* ZS_MIN_ALLOC_SIZE must be multiple of ZS_ALIGN */
121  #define ZS_MIN_ALLOC_SIZE \
122  	MAX(32, (ZS_MAX_PAGES_PER_ZSPAGE << PAGE_SHIFT >> OBJ_INDEX_BITS))
123  /* each chunk includes extra space to keep handle */
124  #define ZS_MAX_ALLOC_SIZE	PAGE_SIZE
125  
126  /*
127   * On systems with 4K page size, this gives 255 size classes! There is a
128   * trader-off here:
129   *  - Large number of size classes is potentially wasteful as free page are
130   *    spread across these classes
131   *  - Small number of size classes causes large internal fragmentation
132   *  - Probably its better to use specific size classes (empirically
133   *    determined). NOTE: all those class sizes must be set as multiple of
134   *    ZS_ALIGN to make sure link_free itself never has to span 2 pages.
135   *
136   *  ZS_MIN_ALLOC_SIZE and ZS_SIZE_CLASS_DELTA must be multiple of ZS_ALIGN
137   *  (reason above)
138   */
139  #define ZS_SIZE_CLASS_DELTA	(PAGE_SIZE >> CLASS_BITS)
140  
141  enum fullness_group {
142  	ZS_EMPTY,
143  	ZS_ALMOST_EMPTY,
144  	ZS_ALMOST_FULL,
145  	ZS_FULL,
146  	NR_ZS_FULLNESS,
147  };
148  
149  enum zs_stat_type {
150  	CLASS_EMPTY,
151  	CLASS_ALMOST_EMPTY,
152  	CLASS_ALMOST_FULL,
153  	CLASS_FULL,
154  	OBJ_ALLOCATED,
155  	OBJ_USED,
156  	NR_ZS_STAT_TYPE,
157  };
158  
159  struct zs_size_stat {
160  	unsigned long objs[NR_ZS_STAT_TYPE];
161  };
162  
163  #ifdef CONFIG_ZSMALLOC_STAT
164  static struct dentry *zs_stat_root;
165  #endif
166  
167  #ifdef CONFIG_COMPACTION
168  static struct vfsmount *zsmalloc_mnt;
169  #endif
170  
171  /*
172   * number of size_classes
173   */
174  static int zs_size_classes;
175  
176  /*
177   * We assign a page to ZS_ALMOST_EMPTY fullness group when:
178   *	n <= N / f, where
179   * n = number of allocated objects
180   * N = total number of objects zspage can store
181   * f = fullness_threshold_frac
182   *
183   * Similarly, we assign zspage to:
184   *	ZS_ALMOST_FULL	when n > N / f
185   *	ZS_EMPTY	when n == 0
186   *	ZS_FULL		when n == N
187   *
188   * (see: fix_fullness_group())
189   */
190  static const int fullness_threshold_frac = 4;
191  
192  struct size_class {
193  	spinlock_t lock;
194  	struct list_head fullness_list[NR_ZS_FULLNESS];
195  	/*
196  	 * Size of objects stored in this class. Must be multiple
197  	 * of ZS_ALIGN.
198  	 */
199  	int size;
200  	int objs_per_zspage;
201  	/* Number of PAGE_SIZE sized pages to combine to form a 'zspage' */
202  	int pages_per_zspage;
203  
204  	unsigned int index;
205  	struct zs_size_stat stats;
206  };
207  
208  /* huge object: pages_per_zspage == 1 && maxobj_per_zspage == 1 */
209  static void SetPageHugeObject(struct page *page)
210  {
211  	SetPageOwnerPriv1(page);
212  }
213  
214  static void ClearPageHugeObject(struct page *page)
215  {
216  	ClearPageOwnerPriv1(page);
217  }
218  
219  static int PageHugeObject(struct page *page)
220  {
221  	return PageOwnerPriv1(page);
222  }
223  
224  /*
225   * Placed within free objects to form a singly linked list.
226   * For every zspage, zspage->freeobj gives head of this list.
227   *
228   * This must be power of 2 and less than or equal to ZS_ALIGN
229   */
230  struct link_free {
231  	union {
232  		/*
233  		 * Free object index;
234  		 * It's valid for non-allocated object
235  		 */
236  		unsigned long next;
237  		/*
238  		 * Handle of allocated object.
239  		 */
240  		unsigned long handle;
241  	};
242  };
243  
244  struct zs_pool {
245  	const char *name;
246  
247  	struct size_class **size_class;
248  	struct kmem_cache *handle_cachep;
249  	struct kmem_cache *zspage_cachep;
250  
251  	atomic_long_t pages_allocated;
252  
253  	struct zs_pool_stats stats;
254  
255  	/* Compact classes */
256  	struct shrinker shrinker;
257  	/*
258  	 * To signify that register_shrinker() was successful
259  	 * and unregister_shrinker() will not Oops.
260  	 */
261  	bool shrinker_enabled;
262  #ifdef CONFIG_ZSMALLOC_STAT
263  	struct dentry *stat_dentry;
264  #endif
265  #ifdef CONFIG_COMPACTION
266  	struct inode *inode;
267  	struct work_struct free_work;
268  #endif
269  };
270  
271  /*
272   * A zspage's class index and fullness group
273   * are encoded in its (first)page->mapping
274   */
275  #define FULLNESS_BITS	2
276  #define CLASS_BITS	8
277  #define ISOLATED_BITS	3
278  #define MAGIC_VAL_BITS	8
279  
280  struct zspage {
281  	struct {
282  		unsigned int fullness:FULLNESS_BITS;
283  		unsigned int class:CLASS_BITS;
284  		unsigned int isolated:ISOLATED_BITS;
285  		unsigned int magic:MAGIC_VAL_BITS;
286  	};
287  	unsigned int inuse;
288  	unsigned int freeobj;
289  	struct page *first_page;
290  	struct list_head list; /* fullness list */
291  #ifdef CONFIG_COMPACTION
292  	rwlock_t lock;
293  #endif
294  };
295  
296  struct mapping_area {
297  #ifdef CONFIG_PGTABLE_MAPPING
298  	struct vm_struct *vm; /* vm area for mapping object that span pages */
299  #else
300  	char *vm_buf; /* copy buffer for objects that span pages */
301  #endif
302  	char *vm_addr; /* address of kmap_atomic()'ed pages */
303  	enum zs_mapmode vm_mm; /* mapping mode */
304  };
305  
306  #ifdef CONFIG_COMPACTION
307  static int zs_register_migration(struct zs_pool *pool);
308  static void zs_unregister_migration(struct zs_pool *pool);
309  static void migrate_lock_init(struct zspage *zspage);
310  static void migrate_read_lock(struct zspage *zspage);
311  static void migrate_read_unlock(struct zspage *zspage);
312  static void kick_deferred_free(struct zs_pool *pool);
313  static void init_deferred_free(struct zs_pool *pool);
314  static void SetZsPageMovable(struct zs_pool *pool, struct zspage *zspage);
315  #else
316  static int zsmalloc_mount(void) { return 0; }
317  static void zsmalloc_unmount(void) {}
318  static int zs_register_migration(struct zs_pool *pool) { return 0; }
319  static void zs_unregister_migration(struct zs_pool *pool) {}
320  static void migrate_lock_init(struct zspage *zspage) {}
321  static void migrate_read_lock(struct zspage *zspage) {}
322  static void migrate_read_unlock(struct zspage *zspage) {}
323  static void kick_deferred_free(struct zs_pool *pool) {}
324  static void init_deferred_free(struct zs_pool *pool) {}
325  static void SetZsPageMovable(struct zs_pool *pool, struct zspage *zspage) {}
326  #endif
327  
328  static int create_cache(struct zs_pool *pool)
329  {
330  	pool->handle_cachep = kmem_cache_create("zs_handle", ZS_HANDLE_SIZE,
331  					0, 0, NULL);
332  	if (!pool->handle_cachep)
333  		return 1;
334  
335  	pool->zspage_cachep = kmem_cache_create("zspage", sizeof(struct zspage),
336  					0, 0, NULL);
337  	if (!pool->zspage_cachep) {
338  		kmem_cache_destroy(pool->handle_cachep);
339  		pool->handle_cachep = NULL;
340  		return 1;
341  	}
342  
343  	return 0;
344  }
345  
346  static void destroy_cache(struct zs_pool *pool)
347  {
348  	kmem_cache_destroy(pool->handle_cachep);
349  	kmem_cache_destroy(pool->zspage_cachep);
350  }
351  
352  static unsigned long cache_alloc_handle(struct zs_pool *pool, gfp_t gfp)
353  {
354  	return (unsigned long)kmem_cache_alloc(pool->handle_cachep,
355  			gfp & ~(__GFP_HIGHMEM|__GFP_MOVABLE));
356  }
357  
358  static void cache_free_handle(struct zs_pool *pool, unsigned long handle)
359  {
360  	kmem_cache_free(pool->handle_cachep, (void *)handle);
361  }
362  
363  static struct zspage *cache_alloc_zspage(struct zs_pool *pool, gfp_t flags)
364  {
365  	return kmem_cache_alloc(pool->zspage_cachep,
366  			flags & ~(__GFP_HIGHMEM|__GFP_MOVABLE));
367  };
368  
369  static void cache_free_zspage(struct zs_pool *pool, struct zspage *zspage)
370  {
371  	kmem_cache_free(pool->zspage_cachep, zspage);
372  }
373  
374  static void record_obj(unsigned long handle, unsigned long obj)
375  {
376  	/*
377  	 * lsb of @obj represents handle lock while other bits
378  	 * represent object value the handle is pointing so
379  	 * updating shouldn't do store tearing.
380  	 */
381  	WRITE_ONCE(*(unsigned long *)handle, obj);
382  }
383  
384  /* zpool driver */
385  
386  #ifdef CONFIG_ZPOOL
387  
388  static void *zs_zpool_create(const char *name, gfp_t gfp,
389  			     const struct zpool_ops *zpool_ops,
390  			     struct zpool *zpool)
391  {
392  	/*
393  	 * Ignore global gfp flags: zs_malloc() may be invoked from
394  	 * different contexts and its caller must provide a valid
395  	 * gfp mask.
396  	 */
397  	return zs_create_pool(name);
398  }
399  
400  static void zs_zpool_destroy(void *pool)
401  {
402  	zs_destroy_pool(pool);
403  }
404  
405  static int zs_zpool_malloc(void *pool, size_t size, gfp_t gfp,
406  			unsigned long *handle)
407  {
408  	*handle = zs_malloc(pool, size, gfp);
409  	return *handle ? 0 : -1;
410  }
411  static void zs_zpool_free(void *pool, unsigned long handle)
412  {
413  	zs_free(pool, handle);
414  }
415  
416  static int zs_zpool_shrink(void *pool, unsigned int pages,
417  			unsigned int *reclaimed)
418  {
419  	return -EINVAL;
420  }
421  
422  static void *zs_zpool_map(void *pool, unsigned long handle,
423  			enum zpool_mapmode mm)
424  {
425  	enum zs_mapmode zs_mm;
426  
427  	switch (mm) {
428  	case ZPOOL_MM_RO:
429  		zs_mm = ZS_MM_RO;
430  		break;
431  	case ZPOOL_MM_WO:
432  		zs_mm = ZS_MM_WO;
433  		break;
434  	case ZPOOL_MM_RW: /* fallthru */
435  	default:
436  		zs_mm = ZS_MM_RW;
437  		break;
438  	}
439  
440  	return zs_map_object(pool, handle, zs_mm);
441  }
442  static void zs_zpool_unmap(void *pool, unsigned long handle)
443  {
444  	zs_unmap_object(pool, handle);
445  }
446  
447  static u64 zs_zpool_total_size(void *pool)
448  {
449  	return zs_get_total_pages(pool) << PAGE_SHIFT;
450  }
451  
452  static struct zpool_driver zs_zpool_driver = {
453  	.type =		"zsmalloc",
454  	.owner =	THIS_MODULE,
455  	.create =	zs_zpool_create,
456  	.destroy =	zs_zpool_destroy,
457  	.malloc =	zs_zpool_malloc,
458  	.free =		zs_zpool_free,
459  	.shrink =	zs_zpool_shrink,
460  	.map =		zs_zpool_map,
461  	.unmap =	zs_zpool_unmap,
462  	.total_size =	zs_zpool_total_size,
463  };
464  
465  MODULE_ALIAS("zpool-zsmalloc");
466  #endif /* CONFIG_ZPOOL */
467  
468  /* per-cpu VM mapping areas for zspage accesses that cross page boundaries */
469  static DEFINE_PER_CPU(struct mapping_area, zs_map_area);
470  
471  static bool is_zspage_isolated(struct zspage *zspage)
472  {
473  	return zspage->isolated;
474  }
475  
476  static int is_first_page(struct page *page)
477  {
478  	return PagePrivate(page);
479  }
480  
481  /* Protected by class->lock */
482  static inline int get_zspage_inuse(struct zspage *zspage)
483  {
484  	return zspage->inuse;
485  }
486  
487  static inline void set_zspage_inuse(struct zspage *zspage, int val)
488  {
489  	zspage->inuse = val;
490  }
491  
492  static inline void mod_zspage_inuse(struct zspage *zspage, int val)
493  {
494  	zspage->inuse += val;
495  }
496  
497  static inline struct page *get_first_page(struct zspage *zspage)
498  {
499  	struct page *first_page = zspage->first_page;
500  
501  	VM_BUG_ON_PAGE(!is_first_page(first_page), first_page);
502  	return first_page;
503  }
504  
505  static inline int get_first_obj_offset(struct page *page)
506  {
507  	return page->units;
508  }
509  
510  static inline void set_first_obj_offset(struct page *page, int offset)
511  {
512  	page->units = offset;
513  }
514  
515  static inline unsigned int get_freeobj(struct zspage *zspage)
516  {
517  	return zspage->freeobj;
518  }
519  
520  static inline void set_freeobj(struct zspage *zspage, unsigned int obj)
521  {
522  	zspage->freeobj = obj;
523  }
524  
525  static void get_zspage_mapping(struct zspage *zspage,
526  				unsigned int *class_idx,
527  				enum fullness_group *fullness)
528  {
529  	BUG_ON(zspage->magic != ZSPAGE_MAGIC);
530  
531  	*fullness = zspage->fullness;
532  	*class_idx = zspage->class;
533  }
534  
535  static void set_zspage_mapping(struct zspage *zspage,
536  				unsigned int class_idx,
537  				enum fullness_group fullness)
538  {
539  	zspage->class = class_idx;
540  	zspage->fullness = fullness;
541  }
542  
543  /*
544   * zsmalloc divides the pool into various size classes where each
545   * class maintains a list of zspages where each zspage is divided
546   * into equal sized chunks. Each allocation falls into one of these
547   * classes depending on its size. This function returns index of the
548   * size class which has chunk size big enough to hold the give size.
549   */
550  static int get_size_class_index(int size)
551  {
552  	int idx = 0;
553  
554  	if (likely(size > ZS_MIN_ALLOC_SIZE))
555  		idx = DIV_ROUND_UP(size - ZS_MIN_ALLOC_SIZE,
556  				ZS_SIZE_CLASS_DELTA);
557  
558  	return min(zs_size_classes - 1, idx);
559  }
560  
561  static inline void zs_stat_inc(struct size_class *class,
562  				enum zs_stat_type type, unsigned long cnt)
563  {
564  	class->stats.objs[type] += cnt;
565  }
566  
567  static inline void zs_stat_dec(struct size_class *class,
568  				enum zs_stat_type type, unsigned long cnt)
569  {
570  	class->stats.objs[type] -= cnt;
571  }
572  
573  static inline unsigned long zs_stat_get(struct size_class *class,
574  				enum zs_stat_type type)
575  {
576  	return class->stats.objs[type];
577  }
578  
579  #ifdef CONFIG_ZSMALLOC_STAT
580  
581  static void __init zs_stat_init(void)
582  {
583  	if (!debugfs_initialized()) {
584  		pr_warn("debugfs not available, stat dir not created\n");
585  		return;
586  	}
587  
588  	zs_stat_root = debugfs_create_dir("zsmalloc", NULL);
589  	if (!zs_stat_root)
590  		pr_warn("debugfs 'zsmalloc' stat dir creation failed\n");
591  }
592  
593  static void __exit zs_stat_exit(void)
594  {
595  	debugfs_remove_recursive(zs_stat_root);
596  }
597  
598  static unsigned long zs_can_compact(struct size_class *class);
599  
600  static int zs_stats_size_show(struct seq_file *s, void *v)
601  {
602  	int i;
603  	struct zs_pool *pool = s->private;
604  	struct size_class *class;
605  	int objs_per_zspage;
606  	unsigned long class_almost_full, class_almost_empty;
607  	unsigned long obj_allocated, obj_used, pages_used, freeable;
608  	unsigned long total_class_almost_full = 0, total_class_almost_empty = 0;
609  	unsigned long total_objs = 0, total_used_objs = 0, total_pages = 0;
610  	unsigned long total_freeable = 0;
611  
612  	seq_printf(s, " %5s %5s %11s %12s %13s %10s %10s %16s %8s\n",
613  			"class", "size", "almost_full", "almost_empty",
614  			"obj_allocated", "obj_used", "pages_used",
615  			"pages_per_zspage", "freeable");
616  
617  	for (i = 0; i < zs_size_classes; i++) {
618  		class = pool->size_class[i];
619  
620  		if (class->index != i)
621  			continue;
622  
623  		spin_lock(&class->lock);
624  		class_almost_full = zs_stat_get(class, CLASS_ALMOST_FULL);
625  		class_almost_empty = zs_stat_get(class, CLASS_ALMOST_EMPTY);
626  		obj_allocated = zs_stat_get(class, OBJ_ALLOCATED);
627  		obj_used = zs_stat_get(class, OBJ_USED);
628  		freeable = zs_can_compact(class);
629  		spin_unlock(&class->lock);
630  
631  		objs_per_zspage = class->objs_per_zspage;
632  		pages_used = obj_allocated / objs_per_zspage *
633  				class->pages_per_zspage;
634  
635  		seq_printf(s, " %5u %5u %11lu %12lu %13lu"
636  				" %10lu %10lu %16d %8lu\n",
637  			i, class->size, class_almost_full, class_almost_empty,
638  			obj_allocated, obj_used, pages_used,
639  			class->pages_per_zspage, freeable);
640  
641  		total_class_almost_full += class_almost_full;
642  		total_class_almost_empty += class_almost_empty;
643  		total_objs += obj_allocated;
644  		total_used_objs += obj_used;
645  		total_pages += pages_used;
646  		total_freeable += freeable;
647  	}
648  
649  	seq_puts(s, "\n");
650  	seq_printf(s, " %5s %5s %11lu %12lu %13lu %10lu %10lu %16s %8lu\n",
651  			"Total", "", total_class_almost_full,
652  			total_class_almost_empty, total_objs,
653  			total_used_objs, total_pages, "", total_freeable);
654  
655  	return 0;
656  }
657  
658  static int zs_stats_size_open(struct inode *inode, struct file *file)
659  {
660  	return single_open(file, zs_stats_size_show, inode->i_private);
661  }
662  
663  static const struct file_operations zs_stat_size_ops = {
664  	.open           = zs_stats_size_open,
665  	.read           = seq_read,
666  	.llseek         = seq_lseek,
667  	.release        = single_release,
668  };
669  
670  static void zs_pool_stat_create(struct zs_pool *pool, const char *name)
671  {
672  	struct dentry *entry;
673  
674  	if (!zs_stat_root) {
675  		pr_warn("no root stat dir, not creating <%s> stat dir\n", name);
676  		return;
677  	}
678  
679  	entry = debugfs_create_dir(name, zs_stat_root);
680  	if (!entry) {
681  		pr_warn("debugfs dir <%s> creation failed\n", name);
682  		return;
683  	}
684  	pool->stat_dentry = entry;
685  
686  	entry = debugfs_create_file("classes", S_IFREG | S_IRUGO,
687  			pool->stat_dentry, pool, &zs_stat_size_ops);
688  	if (!entry) {
689  		pr_warn("%s: debugfs file entry <%s> creation failed\n",
690  				name, "classes");
691  		debugfs_remove_recursive(pool->stat_dentry);
692  		pool->stat_dentry = NULL;
693  	}
694  }
695  
696  static void zs_pool_stat_destroy(struct zs_pool *pool)
697  {
698  	debugfs_remove_recursive(pool->stat_dentry);
699  }
700  
701  #else /* CONFIG_ZSMALLOC_STAT */
702  static void __init zs_stat_init(void)
703  {
704  }
705  
706  static void __exit zs_stat_exit(void)
707  {
708  }
709  
710  static inline void zs_pool_stat_create(struct zs_pool *pool, const char *name)
711  {
712  }
713  
714  static inline void zs_pool_stat_destroy(struct zs_pool *pool)
715  {
716  }
717  #endif
718  
719  
720  /*
721   * For each size class, zspages are divided into different groups
722   * depending on how "full" they are. This was done so that we could
723   * easily find empty or nearly empty zspages when we try to shrink
724   * the pool (not yet implemented). This function returns fullness
725   * status of the given page.
726   */
727  static enum fullness_group get_fullness_group(struct size_class *class,
728  						struct zspage *zspage)
729  {
730  	int inuse, objs_per_zspage;
731  	enum fullness_group fg;
732  
733  	inuse = get_zspage_inuse(zspage);
734  	objs_per_zspage = class->objs_per_zspage;
735  
736  	if (inuse == 0)
737  		fg = ZS_EMPTY;
738  	else if (inuse == objs_per_zspage)
739  		fg = ZS_FULL;
740  	else if (inuse <= 3 * objs_per_zspage / fullness_threshold_frac)
741  		fg = ZS_ALMOST_EMPTY;
742  	else
743  		fg = ZS_ALMOST_FULL;
744  
745  	return fg;
746  }
747  
748  /*
749   * Each size class maintains various freelists and zspages are assigned
750   * to one of these freelists based on the number of live objects they
751   * have. This functions inserts the given zspage into the freelist
752   * identified by <class, fullness_group>.
753   */
754  static void insert_zspage(struct size_class *class,
755  				struct zspage *zspage,
756  				enum fullness_group fullness)
757  {
758  	struct zspage *head;
759  
760  	zs_stat_inc(class, fullness, 1);
761  	head = list_first_entry_or_null(&class->fullness_list[fullness],
762  					struct zspage, list);
763  	/*
764  	 * We want to see more ZS_FULL pages and less almost empty/full.
765  	 * Put pages with higher ->inuse first.
766  	 */
767  	if (head) {
768  		if (get_zspage_inuse(zspage) < get_zspage_inuse(head)) {
769  			list_add(&zspage->list, &head->list);
770  			return;
771  		}
772  	}
773  	list_add(&zspage->list, &class->fullness_list[fullness]);
774  }
775  
776  /*
777   * This function removes the given zspage from the freelist identified
778   * by <class, fullness_group>.
779   */
780  static void remove_zspage(struct size_class *class,
781  				struct zspage *zspage,
782  				enum fullness_group fullness)
783  {
784  	VM_BUG_ON(list_empty(&class->fullness_list[fullness]));
785  	VM_BUG_ON(is_zspage_isolated(zspage));
786  
787  	list_del_init(&zspage->list);
788  	zs_stat_dec(class, fullness, 1);
789  }
790  
791  /*
792   * Each size class maintains zspages in different fullness groups depending
793   * on the number of live objects they contain. When allocating or freeing
794   * objects, the fullness status of the page can change, say, from ALMOST_FULL
795   * to ALMOST_EMPTY when freeing an object. This function checks if such
796   * a status change has occurred for the given page and accordingly moves the
797   * page from the freelist of the old fullness group to that of the new
798   * fullness group.
799   */
800  static enum fullness_group fix_fullness_group(struct size_class *class,
801  						struct zspage *zspage)
802  {
803  	int class_idx;
804  	enum fullness_group currfg, newfg;
805  
806  	get_zspage_mapping(zspage, &class_idx, &currfg);
807  	newfg = get_fullness_group(class, zspage);
808  	if (newfg == currfg)
809  		goto out;
810  
811  	if (!is_zspage_isolated(zspage)) {
812  		remove_zspage(class, zspage, currfg);
813  		insert_zspage(class, zspage, newfg);
814  	}
815  
816  	set_zspage_mapping(zspage, class_idx, newfg);
817  
818  out:
819  	return newfg;
820  }
821  
822  /*
823   * We have to decide on how many pages to link together
824   * to form a zspage for each size class. This is important
825   * to reduce wastage due to unusable space left at end of
826   * each zspage which is given as:
827   *     wastage = Zp % class_size
828   *     usage = Zp - wastage
829   * where Zp = zspage size = k * PAGE_SIZE where k = 1, 2, ...
830   *
831   * For example, for size class of 3/8 * PAGE_SIZE, we should
832   * link together 3 PAGE_SIZE sized pages to form a zspage
833   * since then we can perfectly fit in 8 such objects.
834   */
835  static int get_pages_per_zspage(int class_size)
836  {
837  	int i, max_usedpc = 0;
838  	/* zspage order which gives maximum used size per KB */
839  	int max_usedpc_order = 1;
840  
841  	for (i = 1; i <= ZS_MAX_PAGES_PER_ZSPAGE; i++) {
842  		int zspage_size;
843  		int waste, usedpc;
844  
845  		zspage_size = i * PAGE_SIZE;
846  		waste = zspage_size % class_size;
847  		usedpc = (zspage_size - waste) * 100 / zspage_size;
848  
849  		if (usedpc > max_usedpc) {
850  			max_usedpc = usedpc;
851  			max_usedpc_order = i;
852  		}
853  	}
854  
855  	return max_usedpc_order;
856  }
857  
858  static struct zspage *get_zspage(struct page *page)
859  {
860  	struct zspage *zspage = (struct zspage *)page->private;
861  
862  	BUG_ON(zspage->magic != ZSPAGE_MAGIC);
863  	return zspage;
864  }
865  
866  static struct page *get_next_page(struct page *page)
867  {
868  	if (unlikely(PageHugeObject(page)))
869  		return NULL;
870  
871  	return page->freelist;
872  }
873  
874  /**
875   * obj_to_location - get (<page>, <obj_idx>) from encoded object value
876   * @page: page object resides in zspage
877   * @obj_idx: object index
878   */
879  static void obj_to_location(unsigned long obj, struct page **page,
880  				unsigned int *obj_idx)
881  {
882  	obj >>= OBJ_TAG_BITS;
883  	*page = pfn_to_page(obj >> OBJ_INDEX_BITS);
884  	*obj_idx = (obj & OBJ_INDEX_MASK);
885  }
886  
887  /**
888   * location_to_obj - get obj value encoded from (<page>, <obj_idx>)
889   * @page: page object resides in zspage
890   * @obj_idx: object index
891   */
892  static unsigned long location_to_obj(struct page *page, unsigned int obj_idx)
893  {
894  	unsigned long obj;
895  
896  	obj = page_to_pfn(page) << OBJ_INDEX_BITS;
897  	obj |= obj_idx & OBJ_INDEX_MASK;
898  	obj <<= OBJ_TAG_BITS;
899  
900  	return obj;
901  }
902  
903  static unsigned long handle_to_obj(unsigned long handle)
904  {
905  	return *(unsigned long *)handle;
906  }
907  
908  static unsigned long obj_to_head(struct page *page, void *obj)
909  {
910  	if (unlikely(PageHugeObject(page))) {
911  		VM_BUG_ON_PAGE(!is_first_page(page), page);
912  		return page->index;
913  	} else
914  		return *(unsigned long *)obj;
915  }
916  
917  static inline int testpin_tag(unsigned long handle)
918  {
919  	return bit_spin_is_locked(HANDLE_PIN_BIT, (unsigned long *)handle);
920  }
921  
922  static inline int trypin_tag(unsigned long handle)
923  {
924  	return bit_spin_trylock(HANDLE_PIN_BIT, (unsigned long *)handle);
925  }
926  
927  static void pin_tag(unsigned long handle)
928  {
929  	bit_spin_lock(HANDLE_PIN_BIT, (unsigned long *)handle);
930  }
931  
932  static void unpin_tag(unsigned long handle)
933  {
934  	bit_spin_unlock(HANDLE_PIN_BIT, (unsigned long *)handle);
935  }
936  
937  static void reset_page(struct page *page)
938  {
939  	__ClearPageMovable(page);
940  	ClearPagePrivate(page);
941  	ClearPagePrivate2(page);
942  	set_page_private(page, 0);
943  	page_mapcount_reset(page);
944  	ClearPageHugeObject(page);
945  	page->freelist = NULL;
946  }
947  
948  /*
949   * To prevent zspage destroy during migration, zspage freeing should
950   * hold locks of all pages in the zspage.
951   */
952  void lock_zspage(struct zspage *zspage)
953  {
954  	struct page *page = get_first_page(zspage);
955  
956  	do {
957  		lock_page(page);
958  	} while ((page = get_next_page(page)) != NULL);
959  }
960  
961  int trylock_zspage(struct zspage *zspage)
962  {
963  	struct page *cursor, *fail;
964  
965  	for (cursor = get_first_page(zspage); cursor != NULL; cursor =
966  					get_next_page(cursor)) {
967  		if (!trylock_page(cursor)) {
968  			fail = cursor;
969  			goto unlock;
970  		}
971  	}
972  
973  	return 1;
974  unlock:
975  	for (cursor = get_first_page(zspage); cursor != fail; cursor =
976  					get_next_page(cursor))
977  		unlock_page(cursor);
978  
979  	return 0;
980  }
981  
982  static void __free_zspage(struct zs_pool *pool, struct size_class *class,
983  				struct zspage *zspage)
984  {
985  	struct page *page, *next;
986  	enum fullness_group fg;
987  	unsigned int class_idx;
988  
989  	get_zspage_mapping(zspage, &class_idx, &fg);
990  
991  	assert_spin_locked(&class->lock);
992  
993  	VM_BUG_ON(get_zspage_inuse(zspage));
994  	VM_BUG_ON(fg != ZS_EMPTY);
995  
996  	next = page = get_first_page(zspage);
997  	do {
998  		VM_BUG_ON_PAGE(!PageLocked(page), page);
999  		next = get_next_page(page);
1000  		reset_page(page);
1001  		unlock_page(page);
1002  		dec_zone_page_state(page, NR_ZSPAGES);
1003  		put_page(page);
1004  		page = next;
1005  	} while (page != NULL);
1006  
1007  	cache_free_zspage(pool, zspage);
1008  
1009  	zs_stat_dec(class, OBJ_ALLOCATED, class->objs_per_zspage);
1010  	atomic_long_sub(class->pages_per_zspage,
1011  					&pool->pages_allocated);
1012  }
1013  
1014  static void free_zspage(struct zs_pool *pool, struct size_class *class,
1015  				struct zspage *zspage)
1016  {
1017  	VM_BUG_ON(get_zspage_inuse(zspage));
1018  	VM_BUG_ON(list_empty(&zspage->list));
1019  
1020  	if (!trylock_zspage(zspage)) {
1021  		kick_deferred_free(pool);
1022  		return;
1023  	}
1024  
1025  	remove_zspage(class, zspage, ZS_EMPTY);
1026  	__free_zspage(pool, class, zspage);
1027  }
1028  
1029  /* Initialize a newly allocated zspage */
1030  static void init_zspage(struct size_class *class, struct zspage *zspage)
1031  {
1032  	unsigned int freeobj = 1;
1033  	unsigned long off = 0;
1034  	struct page *page = get_first_page(zspage);
1035  
1036  	while (page) {
1037  		struct page *next_page;
1038  		struct link_free *link;
1039  		void *vaddr;
1040  
1041  		set_first_obj_offset(page, off);
1042  
1043  		vaddr = kmap_atomic(page);
1044  		link = (struct link_free *)vaddr + off / sizeof(*link);
1045  
1046  		while ((off += class->size) < PAGE_SIZE) {
1047  			link->next = freeobj++ << OBJ_TAG_BITS;
1048  			link += class->size / sizeof(*link);
1049  		}
1050  
1051  		/*
1052  		 * We now come to the last (full or partial) object on this
1053  		 * page, which must point to the first object on the next
1054  		 * page (if present)
1055  		 */
1056  		next_page = get_next_page(page);
1057  		if (next_page) {
1058  			link->next = freeobj++ << OBJ_TAG_BITS;
1059  		} else {
1060  			/*
1061  			 * Reset OBJ_TAG_BITS bit to last link to tell
1062  			 * whether it's allocated object or not.
1063  			 */
1064  			link->next = -1 << OBJ_TAG_BITS;
1065  		}
1066  		kunmap_atomic(vaddr);
1067  		page = next_page;
1068  		off %= PAGE_SIZE;
1069  	}
1070  
1071  	set_freeobj(zspage, 0);
1072  }
1073  
1074  static void create_page_chain(struct size_class *class, struct zspage *zspage,
1075  				struct page *pages[])
1076  {
1077  	int i;
1078  	struct page *page;
1079  	struct page *prev_page = NULL;
1080  	int nr_pages = class->pages_per_zspage;
1081  
1082  	/*
1083  	 * Allocate individual pages and link them together as:
1084  	 * 1. all pages are linked together using page->freelist
1085  	 * 2. each sub-page point to zspage using page->private
1086  	 *
1087  	 * we set PG_private to identify the first page (i.e. no other sub-page
1088  	 * has this flag set) and PG_private_2 to identify the last page.
1089  	 */
1090  	for (i = 0; i < nr_pages; i++) {
1091  		page = pages[i];
1092  		set_page_private(page, (unsigned long)zspage);
1093  		page->freelist = NULL;
1094  		if (i == 0) {
1095  			zspage->first_page = page;
1096  			SetPagePrivate(page);
1097  			if (unlikely(class->objs_per_zspage == 1 &&
1098  					class->pages_per_zspage == 1))
1099  				SetPageHugeObject(page);
1100  		} else {
1101  			prev_page->freelist = page;
1102  		}
1103  		if (i == nr_pages - 1)
1104  			SetPagePrivate2(page);
1105  		prev_page = page;
1106  	}
1107  }
1108  
1109  /*
1110   * Allocate a zspage for the given size class
1111   */
1112  static struct zspage *alloc_zspage(struct zs_pool *pool,
1113  					struct size_class *class,
1114  					gfp_t gfp)
1115  {
1116  	int i;
1117  	struct page *pages[ZS_MAX_PAGES_PER_ZSPAGE];
1118  	struct zspage *zspage = cache_alloc_zspage(pool, gfp);
1119  
1120  	if (!zspage)
1121  		return NULL;
1122  
1123  	memset(zspage, 0, sizeof(struct zspage));
1124  	zspage->magic = ZSPAGE_MAGIC;
1125  	migrate_lock_init(zspage);
1126  
1127  	for (i = 0; i < class->pages_per_zspage; i++) {
1128  		struct page *page;
1129  
1130  		page = alloc_page(gfp);
1131  		if (!page) {
1132  			while (--i >= 0) {
1133  				dec_zone_page_state(pages[i], NR_ZSPAGES);
1134  				__free_page(pages[i]);
1135  			}
1136  			cache_free_zspage(pool, zspage);
1137  			return NULL;
1138  		}
1139  
1140  		inc_zone_page_state(page, NR_ZSPAGES);
1141  		pages[i] = page;
1142  	}
1143  
1144  	create_page_chain(class, zspage, pages);
1145  	init_zspage(class, zspage);
1146  
1147  	return zspage;
1148  }
1149  
1150  static struct zspage *find_get_zspage(struct size_class *class)
1151  {
1152  	int i;
1153  	struct zspage *zspage;
1154  
1155  	for (i = ZS_ALMOST_FULL; i >= ZS_EMPTY; i--) {
1156  		zspage = list_first_entry_or_null(&class->fullness_list[i],
1157  				struct zspage, list);
1158  		if (zspage)
1159  			break;
1160  	}
1161  
1162  	return zspage;
1163  }
1164  
1165  #ifdef CONFIG_PGTABLE_MAPPING
1166  static inline int __zs_cpu_up(struct mapping_area *area)
1167  {
1168  	/*
1169  	 * Make sure we don't leak memory if a cpu UP notification
1170  	 * and zs_init() race and both call zs_cpu_up() on the same cpu
1171  	 */
1172  	if (area->vm)
1173  		return 0;
1174  	area->vm = alloc_vm_area(PAGE_SIZE * 2, NULL);
1175  	if (!area->vm)
1176  		return -ENOMEM;
1177  	return 0;
1178  }
1179  
1180  static inline void __zs_cpu_down(struct mapping_area *area)
1181  {
1182  	if (area->vm)
1183  		free_vm_area(area->vm);
1184  	area->vm = NULL;
1185  }
1186  
1187  static inline void *__zs_map_object(struct mapping_area *area,
1188  				struct page *pages[2], int off, int size)
1189  {
1190  	BUG_ON(map_vm_area(area->vm, PAGE_KERNEL, pages));
1191  	area->vm_addr = area->vm->addr;
1192  	return area->vm_addr + off;
1193  }
1194  
1195  static inline void __zs_unmap_object(struct mapping_area *area,
1196  				struct page *pages[2], int off, int size)
1197  {
1198  	unsigned long addr = (unsigned long)area->vm_addr;
1199  
1200  	unmap_kernel_range(addr, PAGE_SIZE * 2);
1201  }
1202  
1203  #else /* CONFIG_PGTABLE_MAPPING */
1204  
1205  static inline int __zs_cpu_up(struct mapping_area *area)
1206  {
1207  	/*
1208  	 * Make sure we don't leak memory if a cpu UP notification
1209  	 * and zs_init() race and both call zs_cpu_up() on the same cpu
1210  	 */
1211  	if (area->vm_buf)
1212  		return 0;
1213  	area->vm_buf = kmalloc(ZS_MAX_ALLOC_SIZE, GFP_KERNEL);
1214  	if (!area->vm_buf)
1215  		return -ENOMEM;
1216  	return 0;
1217  }
1218  
1219  static inline void __zs_cpu_down(struct mapping_area *area)
1220  {
1221  	kfree(area->vm_buf);
1222  	area->vm_buf = NULL;
1223  }
1224  
1225  static void *__zs_map_object(struct mapping_area *area,
1226  			struct page *pages[2], int off, int size)
1227  {
1228  	int sizes[2];
1229  	void *addr;
1230  	char *buf = area->vm_buf;
1231  
1232  	/* disable page faults to match kmap_atomic() return conditions */
1233  	pagefault_disable();
1234  
1235  	/* no read fastpath */
1236  	if (area->vm_mm == ZS_MM_WO)
1237  		goto out;
1238  
1239  	sizes[0] = PAGE_SIZE - off;
1240  	sizes[1] = size - sizes[0];
1241  
1242  	/* copy object to per-cpu buffer */
1243  	addr = kmap_atomic(pages[0]);
1244  	memcpy(buf, addr + off, sizes[0]);
1245  	kunmap_atomic(addr);
1246  	addr = kmap_atomic(pages[1]);
1247  	memcpy(buf + sizes[0], addr, sizes[1]);
1248  	kunmap_atomic(addr);
1249  out:
1250  	return area->vm_buf;
1251  }
1252  
1253  static void __zs_unmap_object(struct mapping_area *area,
1254  			struct page *pages[2], int off, int size)
1255  {
1256  	int sizes[2];
1257  	void *addr;
1258  	char *buf;
1259  
1260  	/* no write fastpath */
1261  	if (area->vm_mm == ZS_MM_RO)
1262  		goto out;
1263  
1264  	buf = area->vm_buf;
1265  	buf = buf + ZS_HANDLE_SIZE;
1266  	size -= ZS_HANDLE_SIZE;
1267  	off += ZS_HANDLE_SIZE;
1268  
1269  	sizes[0] = PAGE_SIZE - off;
1270  	sizes[1] = size - sizes[0];
1271  
1272  	/* copy per-cpu buffer to object */
1273  	addr = kmap_atomic(pages[0]);
1274  	memcpy(addr + off, buf, sizes[0]);
1275  	kunmap_atomic(addr);
1276  	addr = kmap_atomic(pages[1]);
1277  	memcpy(addr, buf + sizes[0], sizes[1]);
1278  	kunmap_atomic(addr);
1279  
1280  out:
1281  	/* enable page faults to match kunmap_atomic() return conditions */
1282  	pagefault_enable();
1283  }
1284  
1285  #endif /* CONFIG_PGTABLE_MAPPING */
1286  
1287  static int zs_cpu_notifier(struct notifier_block *nb, unsigned long action,
1288  				void *pcpu)
1289  {
1290  	int ret, cpu = (long)pcpu;
1291  	struct mapping_area *area;
1292  
1293  	switch (action) {
1294  	case CPU_UP_PREPARE:
1295  		area = &per_cpu(zs_map_area, cpu);
1296  		ret = __zs_cpu_up(area);
1297  		if (ret)
1298  			return notifier_from_errno(ret);
1299  		break;
1300  	case CPU_DEAD:
1301  	case CPU_UP_CANCELED:
1302  		area = &per_cpu(zs_map_area, cpu);
1303  		__zs_cpu_down(area);
1304  		break;
1305  	}
1306  
1307  	return NOTIFY_OK;
1308  }
1309  
1310  static struct notifier_block zs_cpu_nb = {
1311  	.notifier_call = zs_cpu_notifier
1312  };
1313  
1314  static int zs_register_cpu_notifier(void)
1315  {
1316  	int cpu, uninitialized_var(ret);
1317  
1318  	cpu_notifier_register_begin();
1319  
1320  	__register_cpu_notifier(&zs_cpu_nb);
1321  	for_each_online_cpu(cpu) {
1322  		ret = zs_cpu_notifier(NULL, CPU_UP_PREPARE, (void *)(long)cpu);
1323  		if (notifier_to_errno(ret))
1324  			break;
1325  	}
1326  
1327  	cpu_notifier_register_done();
1328  	return notifier_to_errno(ret);
1329  }
1330  
1331  static void zs_unregister_cpu_notifier(void)
1332  {
1333  	int cpu;
1334  
1335  	cpu_notifier_register_begin();
1336  
1337  	for_each_online_cpu(cpu)
1338  		zs_cpu_notifier(NULL, CPU_DEAD, (void *)(long)cpu);
1339  	__unregister_cpu_notifier(&zs_cpu_nb);
1340  
1341  	cpu_notifier_register_done();
1342  }
1343  
1344  static void __init init_zs_size_classes(void)
1345  {
1346  	int nr;
1347  
1348  	nr = (ZS_MAX_ALLOC_SIZE - ZS_MIN_ALLOC_SIZE) / ZS_SIZE_CLASS_DELTA + 1;
1349  	if ((ZS_MAX_ALLOC_SIZE - ZS_MIN_ALLOC_SIZE) % ZS_SIZE_CLASS_DELTA)
1350  		nr += 1;
1351  
1352  	zs_size_classes = nr;
1353  }
1354  
1355  static bool can_merge(struct size_class *prev, int pages_per_zspage,
1356  					int objs_per_zspage)
1357  {
1358  	if (prev->pages_per_zspage == pages_per_zspage &&
1359  		prev->objs_per_zspage == objs_per_zspage)
1360  		return true;
1361  
1362  	return false;
1363  }
1364  
1365  static bool zspage_full(struct size_class *class, struct zspage *zspage)
1366  {
1367  	return get_zspage_inuse(zspage) == class->objs_per_zspage;
1368  }
1369  
1370  unsigned long zs_get_total_pages(struct zs_pool *pool)
1371  {
1372  	return atomic_long_read(&pool->pages_allocated);
1373  }
1374  EXPORT_SYMBOL_GPL(zs_get_total_pages);
1375  
1376  /**
1377   * zs_map_object - get address of allocated object from handle.
1378   * @pool: pool from which the object was allocated
1379   * @handle: handle returned from zs_malloc
1380   *
1381   * Before using an object allocated from zs_malloc, it must be mapped using
1382   * this function. When done with the object, it must be unmapped using
1383   * zs_unmap_object.
1384   *
1385   * Only one object can be mapped per cpu at a time. There is no protection
1386   * against nested mappings.
1387   *
1388   * This function returns with preemption and page faults disabled.
1389   */
1390  void *zs_map_object(struct zs_pool *pool, unsigned long handle,
1391  			enum zs_mapmode mm)
1392  {
1393  	struct zspage *zspage;
1394  	struct page *page;
1395  	unsigned long obj, off;
1396  	unsigned int obj_idx;
1397  
1398  	unsigned int class_idx;
1399  	enum fullness_group fg;
1400  	struct size_class *class;
1401  	struct mapping_area *area;
1402  	struct page *pages[2];
1403  	void *ret;
1404  
1405  	/*
1406  	 * Because we use per-cpu mapping areas shared among the
1407  	 * pools/users, we can't allow mapping in interrupt context
1408  	 * because it can corrupt another users mappings.
1409  	 */
1410  	WARN_ON_ONCE(in_interrupt());
1411  
1412  	/* From now on, migration cannot move the object */
1413  	pin_tag(handle);
1414  
1415  	obj = handle_to_obj(handle);
1416  	obj_to_location(obj, &page, &obj_idx);
1417  	zspage = get_zspage(page);
1418  
1419  	/* migration cannot move any subpage in this zspage */
1420  	migrate_read_lock(zspage);
1421  
1422  	get_zspage_mapping(zspage, &class_idx, &fg);
1423  	class = pool->size_class[class_idx];
1424  	off = (class->size * obj_idx) & ~PAGE_MASK;
1425  
1426  	area = &get_cpu_var(zs_map_area);
1427  	area->vm_mm = mm;
1428  	if (off + class->size <= PAGE_SIZE) {
1429  		/* this object is contained entirely within a page */
1430  		area->vm_addr = kmap_atomic(page);
1431  		ret = area->vm_addr + off;
1432  		goto out;
1433  	}
1434  
1435  	/* this object spans two pages */
1436  	pages[0] = page;
1437  	pages[1] = get_next_page(page);
1438  	BUG_ON(!pages[1]);
1439  
1440  	ret = __zs_map_object(area, pages, off, class->size);
1441  out:
1442  	if (likely(!PageHugeObject(page)))
1443  		ret += ZS_HANDLE_SIZE;
1444  
1445  	return ret;
1446  }
1447  EXPORT_SYMBOL_GPL(zs_map_object);
1448  
1449  void zs_unmap_object(struct zs_pool *pool, unsigned long handle)
1450  {
1451  	struct zspage *zspage;
1452  	struct page *page;
1453  	unsigned long obj, off;
1454  	unsigned int obj_idx;
1455  
1456  	unsigned int class_idx;
1457  	enum fullness_group fg;
1458  	struct size_class *class;
1459  	struct mapping_area *area;
1460  
1461  	obj = handle_to_obj(handle);
1462  	obj_to_location(obj, &page, &obj_idx);
1463  	zspage = get_zspage(page);
1464  	get_zspage_mapping(zspage, &class_idx, &fg);
1465  	class = pool->size_class[class_idx];
1466  	off = (class->size * obj_idx) & ~PAGE_MASK;
1467  
1468  	area = this_cpu_ptr(&zs_map_area);
1469  	if (off + class->size <= PAGE_SIZE)
1470  		kunmap_atomic(area->vm_addr);
1471  	else {
1472  		struct page *pages[2];
1473  
1474  		pages[0] = page;
1475  		pages[1] = get_next_page(page);
1476  		BUG_ON(!pages[1]);
1477  
1478  		__zs_unmap_object(area, pages, off, class->size);
1479  	}
1480  	put_cpu_var(zs_map_area);
1481  
1482  	migrate_read_unlock(zspage);
1483  	unpin_tag(handle);
1484  }
1485  EXPORT_SYMBOL_GPL(zs_unmap_object);
1486  
1487  static unsigned long obj_malloc(struct size_class *class,
1488  				struct zspage *zspage, unsigned long handle)
1489  {
1490  	int i, nr_page, offset;
1491  	unsigned long obj;
1492  	struct link_free *link;
1493  
1494  	struct page *m_page;
1495  	unsigned long m_offset;
1496  	void *vaddr;
1497  
1498  	handle |= OBJ_ALLOCATED_TAG;
1499  	obj = get_freeobj(zspage);
1500  
1501  	offset = obj * class->size;
1502  	nr_page = offset >> PAGE_SHIFT;
1503  	m_offset = offset & ~PAGE_MASK;
1504  	m_page = get_first_page(zspage);
1505  
1506  	for (i = 0; i < nr_page; i++)
1507  		m_page = get_next_page(m_page);
1508  
1509  	vaddr = kmap_atomic(m_page);
1510  	link = (struct link_free *)vaddr + m_offset / sizeof(*link);
1511  	set_freeobj(zspage, link->next >> OBJ_TAG_BITS);
1512  	if (likely(!PageHugeObject(m_page)))
1513  		/* record handle in the header of allocated chunk */
1514  		link->handle = handle;
1515  	else
1516  		/* record handle to page->index */
1517  		zspage->first_page->index = handle;
1518  
1519  	kunmap_atomic(vaddr);
1520  	mod_zspage_inuse(zspage, 1);
1521  	zs_stat_inc(class, OBJ_USED, 1);
1522  
1523  	obj = location_to_obj(m_page, obj);
1524  
1525  	return obj;
1526  }
1527  
1528  
1529  /**
1530   * zs_malloc - Allocate block of given size from pool.
1531   * @pool: pool to allocate from
1532   * @size: size of block to allocate
1533   * @gfp: gfp flags when allocating object
1534   *
1535   * On success, handle to the allocated object is returned,
1536   * otherwise 0.
1537   * Allocation requests with size > ZS_MAX_ALLOC_SIZE will fail.
1538   */
1539  unsigned long zs_malloc(struct zs_pool *pool, size_t size, gfp_t gfp)
1540  {
1541  	unsigned long handle, obj;
1542  	struct size_class *class;
1543  	enum fullness_group newfg;
1544  	struct zspage *zspage;
1545  
1546  	if (unlikely(!size || size > ZS_MAX_ALLOC_SIZE))
1547  		return 0;
1548  
1549  	handle = cache_alloc_handle(pool, gfp);
1550  	if (!handle)
1551  		return 0;
1552  
1553  	/* extra space in chunk to keep the handle */
1554  	size += ZS_HANDLE_SIZE;
1555  	class = pool->size_class[get_size_class_index(size)];
1556  
1557  	spin_lock(&class->lock);
1558  	zspage = find_get_zspage(class);
1559  	if (likely(zspage)) {
1560  		obj = obj_malloc(class, zspage, handle);
1561  		/* Now move the zspage to another fullness group, if required */
1562  		fix_fullness_group(class, zspage);
1563  		record_obj(handle, obj);
1564  		spin_unlock(&class->lock);
1565  
1566  		return handle;
1567  	}
1568  
1569  	spin_unlock(&class->lock);
1570  
1571  	zspage = alloc_zspage(pool, class, gfp);
1572  	if (!zspage) {
1573  		cache_free_handle(pool, handle);
1574  		return 0;
1575  	}
1576  
1577  	spin_lock(&class->lock);
1578  	obj = obj_malloc(class, zspage, handle);
1579  	newfg = get_fullness_group(class, zspage);
1580  	insert_zspage(class, zspage, newfg);
1581  	set_zspage_mapping(zspage, class->index, newfg);
1582  	record_obj(handle, obj);
1583  	atomic_long_add(class->pages_per_zspage,
1584  				&pool->pages_allocated);
1585  	zs_stat_inc(class, OBJ_ALLOCATED, class->objs_per_zspage);
1586  
1587  	/* We completely set up zspage so mark them as movable */
1588  	SetZsPageMovable(pool, zspage);
1589  	spin_unlock(&class->lock);
1590  
1591  	return handle;
1592  }
1593  EXPORT_SYMBOL_GPL(zs_malloc);
1594  
1595  static void obj_free(struct size_class *class, unsigned long obj)
1596  {
1597  	struct link_free *link;
1598  	struct zspage *zspage;
1599  	struct page *f_page;
1600  	unsigned long f_offset;
1601  	unsigned int f_objidx;
1602  	void *vaddr;
1603  
1604  	obj &= ~OBJ_ALLOCATED_TAG;
1605  	obj_to_location(obj, &f_page, &f_objidx);
1606  	f_offset = (class->size * f_objidx) & ~PAGE_MASK;
1607  	zspage = get_zspage(f_page);
1608  
1609  	vaddr = kmap_atomic(f_page);
1610  
1611  	/* Insert this object in containing zspage's freelist */
1612  	link = (struct link_free *)(vaddr + f_offset);
1613  	link->next = get_freeobj(zspage) << OBJ_TAG_BITS;
1614  	kunmap_atomic(vaddr);
1615  	set_freeobj(zspage, f_objidx);
1616  	mod_zspage_inuse(zspage, -1);
1617  	zs_stat_dec(class, OBJ_USED, 1);
1618  }
1619  
1620  void zs_free(struct zs_pool *pool, unsigned long handle)
1621  {
1622  	struct zspage *zspage;
1623  	struct page *f_page;
1624  	unsigned long obj;
1625  	unsigned int f_objidx;
1626  	int class_idx;
1627  	struct size_class *class;
1628  	enum fullness_group fullness;
1629  	bool isolated;
1630  
1631  	if (unlikely(!handle))
1632  		return;
1633  
1634  	pin_tag(handle);
1635  	obj = handle_to_obj(handle);
1636  	obj_to_location(obj, &f_page, &f_objidx);
1637  	zspage = get_zspage(f_page);
1638  
1639  	migrate_read_lock(zspage);
1640  
1641  	get_zspage_mapping(zspage, &class_idx, &fullness);
1642  	class = pool->size_class[class_idx];
1643  
1644  	spin_lock(&class->lock);
1645  	obj_free(class, obj);
1646  	fullness = fix_fullness_group(class, zspage);
1647  	if (fullness != ZS_EMPTY) {
1648  		migrate_read_unlock(zspage);
1649  		goto out;
1650  	}
1651  
1652  	isolated = is_zspage_isolated(zspage);
1653  	migrate_read_unlock(zspage);
1654  	/* If zspage is isolated, zs_page_putback will free the zspage */
1655  	if (likely(!isolated))
1656  		free_zspage(pool, class, zspage);
1657  out:
1658  
1659  	spin_unlock(&class->lock);
1660  	unpin_tag(handle);
1661  	cache_free_handle(pool, handle);
1662  }
1663  EXPORT_SYMBOL_GPL(zs_free);
1664  
1665  static void zs_object_copy(struct size_class *class, unsigned long dst,
1666  				unsigned long src)
1667  {
1668  	struct page *s_page, *d_page;
1669  	unsigned int s_objidx, d_objidx;
1670  	unsigned long s_off, d_off;
1671  	void *s_addr, *d_addr;
1672  	int s_size, d_size, size;
1673  	int written = 0;
1674  
1675  	s_size = d_size = class->size;
1676  
1677  	obj_to_location(src, &s_page, &s_objidx);
1678  	obj_to_location(dst, &d_page, &d_objidx);
1679  
1680  	s_off = (class->size * s_objidx) & ~PAGE_MASK;
1681  	d_off = (class->size * d_objidx) & ~PAGE_MASK;
1682  
1683  	if (s_off + class->size > PAGE_SIZE)
1684  		s_size = PAGE_SIZE - s_off;
1685  
1686  	if (d_off + class->size > PAGE_SIZE)
1687  		d_size = PAGE_SIZE - d_off;
1688  
1689  	s_addr = kmap_atomic(s_page);
1690  	d_addr = kmap_atomic(d_page);
1691  
1692  	while (1) {
1693  		size = min(s_size, d_size);
1694  		memcpy(d_addr + d_off, s_addr + s_off, size);
1695  		written += size;
1696  
1697  		if (written == class->size)
1698  			break;
1699  
1700  		s_off += size;
1701  		s_size -= size;
1702  		d_off += size;
1703  		d_size -= size;
1704  
1705  		if (s_off >= PAGE_SIZE) {
1706  			kunmap_atomic(d_addr);
1707  			kunmap_atomic(s_addr);
1708  			s_page = get_next_page(s_page);
1709  			s_addr = kmap_atomic(s_page);
1710  			d_addr = kmap_atomic(d_page);
1711  			s_size = class->size - written;
1712  			s_off = 0;
1713  		}
1714  
1715  		if (d_off >= PAGE_SIZE) {
1716  			kunmap_atomic(d_addr);
1717  			d_page = get_next_page(d_page);
1718  			d_addr = kmap_atomic(d_page);
1719  			d_size = class->size - written;
1720  			d_off = 0;
1721  		}
1722  	}
1723  
1724  	kunmap_atomic(d_addr);
1725  	kunmap_atomic(s_addr);
1726  }
1727  
1728  /*
1729   * Find alloced object in zspage from index object and
1730   * return handle.
1731   */
1732  static unsigned long find_alloced_obj(struct size_class *class,
1733  					struct page *page, int *obj_idx)
1734  {
1735  	unsigned long head;
1736  	int offset = 0;
1737  	int index = *obj_idx;
1738  	unsigned long handle = 0;
1739  	void *addr = kmap_atomic(page);
1740  
1741  	offset = get_first_obj_offset(page);
1742  	offset += class->size * index;
1743  
1744  	while (offset < PAGE_SIZE) {
1745  		head = obj_to_head(page, addr + offset);
1746  		if (head & OBJ_ALLOCATED_TAG) {
1747  			handle = head & ~OBJ_ALLOCATED_TAG;
1748  			if (trypin_tag(handle))
1749  				break;
1750  			handle = 0;
1751  		}
1752  
1753  		offset += class->size;
1754  		index++;
1755  	}
1756  
1757  	kunmap_atomic(addr);
1758  
1759  	*obj_idx = index;
1760  
1761  	return handle;
1762  }
1763  
1764  struct zs_compact_control {
1765  	/* Source spage for migration which could be a subpage of zspage */
1766  	struct page *s_page;
1767  	/* Destination page for migration which should be a first page
1768  	 * of zspage. */
1769  	struct page *d_page;
1770  	 /* Starting object index within @s_page which used for live object
1771  	  * in the subpage. */
1772  	int obj_idx;
1773  };
1774  
1775  static int migrate_zspage(struct zs_pool *pool, struct size_class *class,
1776  				struct zs_compact_control *cc)
1777  {
1778  	unsigned long used_obj, free_obj;
1779  	unsigned long handle;
1780  	struct page *s_page = cc->s_page;
1781  	struct page *d_page = cc->d_page;
1782  	int obj_idx = cc->obj_idx;
1783  	int ret = 0;
1784  
1785  	while (1) {
1786  		handle = find_alloced_obj(class, s_page, &obj_idx);
1787  		if (!handle) {
1788  			s_page = get_next_page(s_page);
1789  			if (!s_page)
1790  				break;
1791  			obj_idx = 0;
1792  			continue;
1793  		}
1794  
1795  		/* Stop if there is no more space */
1796  		if (zspage_full(class, get_zspage(d_page))) {
1797  			unpin_tag(handle);
1798  			ret = -ENOMEM;
1799  			break;
1800  		}
1801  
1802  		used_obj = handle_to_obj(handle);
1803  		free_obj = obj_malloc(class, get_zspage(d_page), handle);
1804  		zs_object_copy(class, free_obj, used_obj);
1805  		obj_idx++;
1806  		/*
1807  		 * record_obj updates handle's value to free_obj and it will
1808  		 * invalidate lock bit(ie, HANDLE_PIN_BIT) of handle, which
1809  		 * breaks synchronization using pin_tag(e,g, zs_free) so
1810  		 * let's keep the lock bit.
1811  		 */
1812  		free_obj |= BIT(HANDLE_PIN_BIT);
1813  		record_obj(handle, free_obj);
1814  		unpin_tag(handle);
1815  		obj_free(class, used_obj);
1816  	}
1817  
1818  	/* Remember last position in this iteration */
1819  	cc->s_page = s_page;
1820  	cc->obj_idx = obj_idx;
1821  
1822  	return ret;
1823  }
1824  
1825  static struct zspage *isolate_zspage(struct size_class *class, bool source)
1826  {
1827  	int i;
1828  	struct zspage *zspage;
1829  	enum fullness_group fg[2] = {ZS_ALMOST_EMPTY, ZS_ALMOST_FULL};
1830  
1831  	if (!source) {
1832  		fg[0] = ZS_ALMOST_FULL;
1833  		fg[1] = ZS_ALMOST_EMPTY;
1834  	}
1835  
1836  	for (i = 0; i < 2; i++) {
1837  		zspage = list_first_entry_or_null(&class->fullness_list[fg[i]],
1838  							struct zspage, list);
1839  		if (zspage) {
1840  			VM_BUG_ON(is_zspage_isolated(zspage));
1841  			remove_zspage(class, zspage, fg[i]);
1842  			return zspage;
1843  		}
1844  	}
1845  
1846  	return zspage;
1847  }
1848  
1849  /*
1850   * putback_zspage - add @zspage into right class's fullness list
1851   * @class: destination class
1852   * @zspage: target page
1853   *
1854   * Return @zspage's fullness_group
1855   */
1856  static enum fullness_group putback_zspage(struct size_class *class,
1857  			struct zspage *zspage)
1858  {
1859  	enum fullness_group fullness;
1860  
1861  	VM_BUG_ON(is_zspage_isolated(zspage));
1862  
1863  	fullness = get_fullness_group(class, zspage);
1864  	insert_zspage(class, zspage, fullness);
1865  	set_zspage_mapping(zspage, class->index, fullness);
1866  
1867  	return fullness;
1868  }
1869  
1870  #ifdef CONFIG_COMPACTION
1871  static struct dentry *zs_mount(struct file_system_type *fs_type,
1872  				int flags, const char *dev_name, void *data)
1873  {
1874  	static const struct dentry_operations ops = {
1875  		.d_dname = simple_dname,
1876  	};
1877  
1878  	return mount_pseudo(fs_type, "zsmalloc:", NULL, &ops, ZSMALLOC_MAGIC);
1879  }
1880  
1881  static struct file_system_type zsmalloc_fs = {
1882  	.name		= "zsmalloc",
1883  	.mount		= zs_mount,
1884  	.kill_sb	= kill_anon_super,
1885  };
1886  
1887  static int zsmalloc_mount(void)
1888  {
1889  	int ret = 0;
1890  
1891  	zsmalloc_mnt = kern_mount(&zsmalloc_fs);
1892  	if (IS_ERR(zsmalloc_mnt))
1893  		ret = PTR_ERR(zsmalloc_mnt);
1894  
1895  	return ret;
1896  }
1897  
1898  static void zsmalloc_unmount(void)
1899  {
1900  	kern_unmount(zsmalloc_mnt);
1901  }
1902  
1903  static void migrate_lock_init(struct zspage *zspage)
1904  {
1905  	rwlock_init(&zspage->lock);
1906  }
1907  
1908  static void migrate_read_lock(struct zspage *zspage)
1909  {
1910  	read_lock(&zspage->lock);
1911  }
1912  
1913  static void migrate_read_unlock(struct zspage *zspage)
1914  {
1915  	read_unlock(&zspage->lock);
1916  }
1917  
1918  static void migrate_write_lock(struct zspage *zspage)
1919  {
1920  	write_lock(&zspage->lock);
1921  }
1922  
1923  static void migrate_write_unlock(struct zspage *zspage)
1924  {
1925  	write_unlock(&zspage->lock);
1926  }
1927  
1928  /* Number of isolated subpage for *page migration* in this zspage */
1929  static void inc_zspage_isolation(struct zspage *zspage)
1930  {
1931  	zspage->isolated++;
1932  }
1933  
1934  static void dec_zspage_isolation(struct zspage *zspage)
1935  {
1936  	zspage->isolated--;
1937  }
1938  
1939  static void replace_sub_page(struct size_class *class, struct zspage *zspage,
1940  				struct page *newpage, struct page *oldpage)
1941  {
1942  	struct page *page;
1943  	struct page *pages[ZS_MAX_PAGES_PER_ZSPAGE] = {NULL, };
1944  	int idx = 0;
1945  
1946  	page = get_first_page(zspage);
1947  	do {
1948  		if (page == oldpage)
1949  			pages[idx] = newpage;
1950  		else
1951  			pages[idx] = page;
1952  		idx++;
1953  	} while ((page = get_next_page(page)) != NULL);
1954  
1955  	create_page_chain(class, zspage, pages);
1956  	set_first_obj_offset(newpage, get_first_obj_offset(oldpage));
1957  	if (unlikely(PageHugeObject(oldpage)))
1958  		newpage->index = oldpage->index;
1959  	__SetPageMovable(newpage, page_mapping(oldpage));
1960  }
1961  
1962  bool zs_page_isolate(struct page *page, isolate_mode_t mode)
1963  {
1964  	struct zs_pool *pool;
1965  	struct size_class *class;
1966  	int class_idx;
1967  	enum fullness_group fullness;
1968  	struct zspage *zspage;
1969  	struct address_space *mapping;
1970  
1971  	/*
1972  	 * Page is locked so zspage couldn't be destroyed. For detail, look at
1973  	 * lock_zspage in free_zspage.
1974  	 */
1975  	VM_BUG_ON_PAGE(!PageMovable(page), page);
1976  	VM_BUG_ON_PAGE(PageIsolated(page), page);
1977  
1978  	zspage = get_zspage(page);
1979  
1980  	/*
1981  	 * Without class lock, fullness could be stale while class_idx is okay
1982  	 * because class_idx is constant unless page is freed so we should get
1983  	 * fullness again under class lock.
1984  	 */
1985  	get_zspage_mapping(zspage, &class_idx, &fullness);
1986  	mapping = page_mapping(page);
1987  	pool = mapping->private_data;
1988  	class = pool->size_class[class_idx];
1989  
1990  	spin_lock(&class->lock);
1991  	if (get_zspage_inuse(zspage) == 0) {
1992  		spin_unlock(&class->lock);
1993  		return false;
1994  	}
1995  
1996  	/* zspage is isolated for object migration */
1997  	if (list_empty(&zspage->list) && !is_zspage_isolated(zspage)) {
1998  		spin_unlock(&class->lock);
1999  		return false;
2000  	}
2001  
2002  	/*
2003  	 * If this is first time isolation for the zspage, isolate zspage from
2004  	 * size_class to prevent further object allocation from the zspage.
2005  	 */
2006  	if (!list_empty(&zspage->list) && !is_zspage_isolated(zspage)) {
2007  		get_zspage_mapping(zspage, &class_idx, &fullness);
2008  		remove_zspage(class, zspage, fullness);
2009  	}
2010  
2011  	inc_zspage_isolation(zspage);
2012  	spin_unlock(&class->lock);
2013  
2014  	return true;
2015  }
2016  
2017  int zs_page_migrate(struct address_space *mapping, struct page *newpage,
2018  		struct page *page, enum migrate_mode mode)
2019  {
2020  	struct zs_pool *pool;
2021  	struct size_class *class;
2022  	int class_idx;
2023  	enum fullness_group fullness;
2024  	struct zspage *zspage;
2025  	struct page *dummy;
2026  	void *s_addr, *d_addr, *addr;
2027  	int offset, pos;
2028  	unsigned long handle, head;
2029  	unsigned long old_obj, new_obj;
2030  	unsigned int obj_idx;
2031  	int ret = -EAGAIN;
2032  
2033  	VM_BUG_ON_PAGE(!PageMovable(page), page);
2034  	VM_BUG_ON_PAGE(!PageIsolated(page), page);
2035  
2036  	zspage = get_zspage(page);
2037  
2038  	/* Concurrent compactor cannot migrate any subpage in zspage */
2039  	migrate_write_lock(zspage);
2040  	get_zspage_mapping(zspage, &class_idx, &fullness);
2041  	pool = mapping->private_data;
2042  	class = pool->size_class[class_idx];
2043  	offset = get_first_obj_offset(page);
2044  
2045  	spin_lock(&class->lock);
2046  	if (!get_zspage_inuse(zspage)) {
2047  		ret = -EBUSY;
2048  		goto unlock_class;
2049  	}
2050  
2051  	pos = offset;
2052  	s_addr = kmap_atomic(page);
2053  	while (pos < PAGE_SIZE) {
2054  		head = obj_to_head(page, s_addr + pos);
2055  		if (head & OBJ_ALLOCATED_TAG) {
2056  			handle = head & ~OBJ_ALLOCATED_TAG;
2057  			if (!trypin_tag(handle))
2058  				goto unpin_objects;
2059  		}
2060  		pos += class->size;
2061  	}
2062  
2063  	/*
2064  	 * Here, any user cannot access all objects in the zspage so let's move.
2065  	 */
2066  	d_addr = kmap_atomic(newpage);
2067  	memcpy(d_addr, s_addr, PAGE_SIZE);
2068  	kunmap_atomic(d_addr);
2069  
2070  	for (addr = s_addr + offset; addr < s_addr + pos;
2071  					addr += class->size) {
2072  		head = obj_to_head(page, addr);
2073  		if (head & OBJ_ALLOCATED_TAG) {
2074  			handle = head & ~OBJ_ALLOCATED_TAG;
2075  			if (!testpin_tag(handle))
2076  				BUG();
2077  
2078  			old_obj = handle_to_obj(handle);
2079  			obj_to_location(old_obj, &dummy, &obj_idx);
2080  			new_obj = (unsigned long)location_to_obj(newpage,
2081  								obj_idx);
2082  			new_obj |= BIT(HANDLE_PIN_BIT);
2083  			record_obj(handle, new_obj);
2084  		}
2085  	}
2086  
2087  	replace_sub_page(class, zspage, newpage, page);
2088  	get_page(newpage);
2089  
2090  	dec_zspage_isolation(zspage);
2091  
2092  	/*
2093  	 * Page migration is done so let's putback isolated zspage to
2094  	 * the list if @page is final isolated subpage in the zspage.
2095  	 */
2096  	if (!is_zspage_isolated(zspage))
2097  		putback_zspage(class, zspage);
2098  
2099  	reset_page(page);
2100  	put_page(page);
2101  	page = newpage;
2102  
2103  	ret = MIGRATEPAGE_SUCCESS;
2104  unpin_objects:
2105  	for (addr = s_addr + offset; addr < s_addr + pos;
2106  						addr += class->size) {
2107  		head = obj_to_head(page, addr);
2108  		if (head & OBJ_ALLOCATED_TAG) {
2109  			handle = head & ~OBJ_ALLOCATED_TAG;
2110  			if (!testpin_tag(handle))
2111  				BUG();
2112  			unpin_tag(handle);
2113  		}
2114  	}
2115  	kunmap_atomic(s_addr);
2116  unlock_class:
2117  	spin_unlock(&class->lock);
2118  	migrate_write_unlock(zspage);
2119  
2120  	return ret;
2121  }
2122  
2123  void zs_page_putback(struct page *page)
2124  {
2125  	struct zs_pool *pool;
2126  	struct size_class *class;
2127  	int class_idx;
2128  	enum fullness_group fg;
2129  	struct address_space *mapping;
2130  	struct zspage *zspage;
2131  
2132  	VM_BUG_ON_PAGE(!PageMovable(page), page);
2133  	VM_BUG_ON_PAGE(!PageIsolated(page), page);
2134  
2135  	zspage = get_zspage(page);
2136  	get_zspage_mapping(zspage, &class_idx, &fg);
2137  	mapping = page_mapping(page);
2138  	pool = mapping->private_data;
2139  	class = pool->size_class[class_idx];
2140  
2141  	spin_lock(&class->lock);
2142  	dec_zspage_isolation(zspage);
2143  	if (!is_zspage_isolated(zspage)) {
2144  		fg = putback_zspage(class, zspage);
2145  		/*
2146  		 * Due to page_lock, we cannot free zspage immediately
2147  		 * so let's defer.
2148  		 */
2149  		if (fg == ZS_EMPTY)
2150  			schedule_work(&pool->free_work);
2151  	}
2152  	spin_unlock(&class->lock);
2153  }
2154  
2155  const struct address_space_operations zsmalloc_aops = {
2156  	.isolate_page = zs_page_isolate,
2157  	.migratepage = zs_page_migrate,
2158  	.putback_page = zs_page_putback,
2159  };
2160  
2161  static int zs_register_migration(struct zs_pool *pool)
2162  {
2163  	pool->inode = alloc_anon_inode(zsmalloc_mnt->mnt_sb);
2164  	if (IS_ERR(pool->inode)) {
2165  		pool->inode = NULL;
2166  		return 1;
2167  	}
2168  
2169  	pool->inode->i_mapping->private_data = pool;
2170  	pool->inode->i_mapping->a_ops = &zsmalloc_aops;
2171  	return 0;
2172  }
2173  
2174  static void zs_unregister_migration(struct zs_pool *pool)
2175  {
2176  	flush_work(&pool->free_work);
2177  	iput(pool->inode);
2178  }
2179  
2180  /*
2181   * Caller should hold page_lock of all pages in the zspage
2182   * In here, we cannot use zspage meta data.
2183   */
2184  static void async_free_zspage(struct work_struct *work)
2185  {
2186  	int i;
2187  	struct size_class *class;
2188  	unsigned int class_idx;
2189  	enum fullness_group fullness;
2190  	struct zspage *zspage, *tmp;
2191  	LIST_HEAD(free_pages);
2192  	struct zs_pool *pool = container_of(work, struct zs_pool,
2193  					free_work);
2194  
2195  	for (i = 0; i < zs_size_classes; i++) {
2196  		class = pool->size_class[i];
2197  		if (class->index != i)
2198  			continue;
2199  
2200  		spin_lock(&class->lock);
2201  		list_splice_init(&class->fullness_list[ZS_EMPTY], &free_pages);
2202  		spin_unlock(&class->lock);
2203  	}
2204  
2205  
2206  	list_for_each_entry_safe(zspage, tmp, &free_pages, list) {
2207  		list_del(&zspage->list);
2208  		lock_zspage(zspage);
2209  
2210  		get_zspage_mapping(zspage, &class_idx, &fullness);
2211  		VM_BUG_ON(fullness != ZS_EMPTY);
2212  		class = pool->size_class[class_idx];
2213  		spin_lock(&class->lock);
2214  		__free_zspage(pool, pool->size_class[class_idx], zspage);
2215  		spin_unlock(&class->lock);
2216  	}
2217  };
2218  
2219  static void kick_deferred_free(struct zs_pool *pool)
2220  {
2221  	schedule_work(&pool->free_work);
2222  }
2223  
2224  static void init_deferred_free(struct zs_pool *pool)
2225  {
2226  	INIT_WORK(&pool->free_work, async_free_zspage);
2227  }
2228  
2229  static void SetZsPageMovable(struct zs_pool *pool, struct zspage *zspage)
2230  {
2231  	struct page *page = get_first_page(zspage);
2232  
2233  	do {
2234  		WARN_ON(!trylock_page(page));
2235  		__SetPageMovable(page, pool->inode->i_mapping);
2236  		unlock_page(page);
2237  	} while ((page = get_next_page(page)) != NULL);
2238  }
2239  #endif
2240  
2241  /*
2242   *
2243   * Based on the number of unused allocated objects calculate
2244   * and return the number of pages that we can free.
2245   */
2246  static unsigned long zs_can_compact(struct size_class *class)
2247  {
2248  	unsigned long obj_wasted;
2249  	unsigned long obj_allocated = zs_stat_get(class, OBJ_ALLOCATED);
2250  	unsigned long obj_used = zs_stat_get(class, OBJ_USED);
2251  
2252  	if (obj_allocated <= obj_used)
2253  		return 0;
2254  
2255  	obj_wasted = obj_allocated - obj_used;
2256  	obj_wasted /= class->objs_per_zspage;
2257  
2258  	return obj_wasted * class->pages_per_zspage;
2259  }
2260  
2261  static void __zs_compact(struct zs_pool *pool, struct size_class *class)
2262  {
2263  	struct zs_compact_control cc;
2264  	struct zspage *src_zspage;
2265  	struct zspage *dst_zspage = NULL;
2266  
2267  	spin_lock(&class->lock);
2268  	while ((src_zspage = isolate_zspage(class, true))) {
2269  
2270  		if (!zs_can_compact(class))
2271  			break;
2272  
2273  		cc.obj_idx = 0;
2274  		cc.s_page = get_first_page(src_zspage);
2275  
2276  		while ((dst_zspage = isolate_zspage(class, false))) {
2277  			cc.d_page = get_first_page(dst_zspage);
2278  			/*
2279  			 * If there is no more space in dst_page, resched
2280  			 * and see if anyone had allocated another zspage.
2281  			 */
2282  			if (!migrate_zspage(pool, class, &cc))
2283  				break;
2284  
2285  			putback_zspage(class, dst_zspage);
2286  		}
2287  
2288  		/* Stop if we couldn't find slot */
2289  		if (dst_zspage == NULL)
2290  			break;
2291  
2292  		putback_zspage(class, dst_zspage);
2293  		if (putback_zspage(class, src_zspage) == ZS_EMPTY) {
2294  			free_zspage(pool, class, src_zspage);
2295  			pool->stats.pages_compacted += class->pages_per_zspage;
2296  		}
2297  		spin_unlock(&class->lock);
2298  		cond_resched();
2299  		spin_lock(&class->lock);
2300  	}
2301  
2302  	if (src_zspage)
2303  		putback_zspage(class, src_zspage);
2304  
2305  	spin_unlock(&class->lock);
2306  }
2307  
2308  unsigned long zs_compact(struct zs_pool *pool)
2309  {
2310  	int i;
2311  	struct size_class *class;
2312  
2313  	for (i = zs_size_classes - 1; i >= 0; i--) {
2314  		class = pool->size_class[i];
2315  		if (!class)
2316  			continue;
2317  		if (class->index != i)
2318  			continue;
2319  		__zs_compact(pool, class);
2320  	}
2321  
2322  	return pool->stats.pages_compacted;
2323  }
2324  EXPORT_SYMBOL_GPL(zs_compact);
2325  
2326  void zs_pool_stats(struct zs_pool *pool, struct zs_pool_stats *stats)
2327  {
2328  	memcpy(stats, &pool->stats, sizeof(struct zs_pool_stats));
2329  }
2330  EXPORT_SYMBOL_GPL(zs_pool_stats);
2331  
2332  static unsigned long zs_shrinker_scan(struct shrinker *shrinker,
2333  		struct shrink_control *sc)
2334  {
2335  	unsigned long pages_freed;
2336  	struct zs_pool *pool = container_of(shrinker, struct zs_pool,
2337  			shrinker);
2338  
2339  	pages_freed = pool->stats.pages_compacted;
2340  	/*
2341  	 * Compact classes and calculate compaction delta.
2342  	 * Can run concurrently with a manually triggered
2343  	 * (by user) compaction.
2344  	 */
2345  	pages_freed = zs_compact(pool) - pages_freed;
2346  
2347  	return pages_freed ? pages_freed : SHRINK_STOP;
2348  }
2349  
2350  static unsigned long zs_shrinker_count(struct shrinker *shrinker,
2351  		struct shrink_control *sc)
2352  {
2353  	int i;
2354  	struct size_class *class;
2355  	unsigned long pages_to_free = 0;
2356  	struct zs_pool *pool = container_of(shrinker, struct zs_pool,
2357  			shrinker);
2358  
2359  	for (i = zs_size_classes - 1; i >= 0; i--) {
2360  		class = pool->size_class[i];
2361  		if (!class)
2362  			continue;
2363  		if (class->index != i)
2364  			continue;
2365  
2366  		pages_to_free += zs_can_compact(class);
2367  	}
2368  
2369  	return pages_to_free;
2370  }
2371  
2372  static void zs_unregister_shrinker(struct zs_pool *pool)
2373  {
2374  	if (pool->shrinker_enabled) {
2375  		unregister_shrinker(&pool->shrinker);
2376  		pool->shrinker_enabled = false;
2377  	}
2378  }
2379  
2380  static int zs_register_shrinker(struct zs_pool *pool)
2381  {
2382  	pool->shrinker.scan_objects = zs_shrinker_scan;
2383  	pool->shrinker.count_objects = zs_shrinker_count;
2384  	pool->shrinker.batch = 0;
2385  	pool->shrinker.seeks = DEFAULT_SEEKS;
2386  
2387  	return register_shrinker(&pool->shrinker);
2388  }
2389  
2390  /**
2391   * zs_create_pool - Creates an allocation pool to work from.
2392   * @name: pool name to be created
2393   *
2394   * This function must be called before anything when using
2395   * the zsmalloc allocator.
2396   *
2397   * On success, a pointer to the newly created pool is returned,
2398   * otherwise NULL.
2399   */
2400  struct zs_pool *zs_create_pool(const char *name)
2401  {
2402  	int i;
2403  	struct zs_pool *pool;
2404  	struct size_class *prev_class = NULL;
2405  
2406  	pool = kzalloc(sizeof(*pool), GFP_KERNEL);
2407  	if (!pool)
2408  		return NULL;
2409  
2410  	init_deferred_free(pool);
2411  	pool->size_class = kcalloc(zs_size_classes, sizeof(struct size_class *),
2412  			GFP_KERNEL);
2413  	if (!pool->size_class) {
2414  		kfree(pool);
2415  		return NULL;
2416  	}
2417  
2418  	pool->name = kstrdup(name, GFP_KERNEL);
2419  	if (!pool->name)
2420  		goto err;
2421  
2422  	if (create_cache(pool))
2423  		goto err;
2424  
2425  	/*
2426  	 * Iterate reversly, because, size of size_class that we want to use
2427  	 * for merging should be larger or equal to current size.
2428  	 */
2429  	for (i = zs_size_classes - 1; i >= 0; i--) {
2430  		int size;
2431  		int pages_per_zspage;
2432  		int objs_per_zspage;
2433  		struct size_class *class;
2434  		int fullness = 0;
2435  
2436  		size = ZS_MIN_ALLOC_SIZE + i * ZS_SIZE_CLASS_DELTA;
2437  		if (size > ZS_MAX_ALLOC_SIZE)
2438  			size = ZS_MAX_ALLOC_SIZE;
2439  		pages_per_zspage = get_pages_per_zspage(size);
2440  		objs_per_zspage = pages_per_zspage * PAGE_SIZE / size;
2441  
2442  		/*
2443  		 * size_class is used for normal zsmalloc operation such
2444  		 * as alloc/free for that size. Although it is natural that we
2445  		 * have one size_class for each size, there is a chance that we
2446  		 * can get more memory utilization if we use one size_class for
2447  		 * many different sizes whose size_class have same
2448  		 * characteristics. So, we makes size_class point to
2449  		 * previous size_class if possible.
2450  		 */
2451  		if (prev_class) {
2452  			if (can_merge(prev_class, pages_per_zspage, objs_per_zspage)) {
2453  				pool->size_class[i] = prev_class;
2454  				continue;
2455  			}
2456  		}
2457  
2458  		class = kzalloc(sizeof(struct size_class), GFP_KERNEL);
2459  		if (!class)
2460  			goto err;
2461  
2462  		class->size = size;
2463  		class->index = i;
2464  		class->pages_per_zspage = pages_per_zspage;
2465  		class->objs_per_zspage = objs_per_zspage;
2466  		spin_lock_init(&class->lock);
2467  		pool->size_class[i] = class;
2468  		for (fullness = ZS_EMPTY; fullness < NR_ZS_FULLNESS;
2469  							fullness++)
2470  			INIT_LIST_HEAD(&class->fullness_list[fullness]);
2471  
2472  		prev_class = class;
2473  	}
2474  
2475  	/* debug only, don't abort if it fails */
2476  	zs_pool_stat_create(pool, name);
2477  
2478  	if (zs_register_migration(pool))
2479  		goto err;
2480  
2481  	/*
2482  	 * Not critical, we still can use the pool
2483  	 * and user can trigger compaction manually.
2484  	 */
2485  	if (zs_register_shrinker(pool) == 0)
2486  		pool->shrinker_enabled = true;
2487  	return pool;
2488  
2489  err:
2490  	zs_destroy_pool(pool);
2491  	return NULL;
2492  }
2493  EXPORT_SYMBOL_GPL(zs_create_pool);
2494  
2495  void zs_destroy_pool(struct zs_pool *pool)
2496  {
2497  	int i;
2498  
2499  	zs_unregister_shrinker(pool);
2500  	zs_unregister_migration(pool);
2501  	zs_pool_stat_destroy(pool);
2502  
2503  	for (i = 0; i < zs_size_classes; i++) {
2504  		int fg;
2505  		struct size_class *class = pool->size_class[i];
2506  
2507  		if (!class)
2508  			continue;
2509  
2510  		if (class->index != i)
2511  			continue;
2512  
2513  		for (fg = ZS_EMPTY; fg < NR_ZS_FULLNESS; fg++) {
2514  			if (!list_empty(&class->fullness_list[fg])) {
2515  				pr_info("Freeing non-empty class with size %db, fullness group %d\n",
2516  					class->size, fg);
2517  			}
2518  		}
2519  		kfree(class);
2520  	}
2521  
2522  	destroy_cache(pool);
2523  	kfree(pool->size_class);
2524  	kfree(pool->name);
2525  	kfree(pool);
2526  }
2527  EXPORT_SYMBOL_GPL(zs_destroy_pool);
2528  
2529  static int __init zs_init(void)
2530  {
2531  	int ret;
2532  
2533  	ret = zsmalloc_mount();
2534  	if (ret)
2535  		goto out;
2536  
2537  	ret = zs_register_cpu_notifier();
2538  
2539  	if (ret)
2540  		goto notifier_fail;
2541  
2542  	init_zs_size_classes();
2543  
2544  #ifdef CONFIG_ZPOOL
2545  	zpool_register_driver(&zs_zpool_driver);
2546  #endif
2547  
2548  	zs_stat_init();
2549  
2550  	return 0;
2551  
2552  notifier_fail:
2553  	zs_unregister_cpu_notifier();
2554  	zsmalloc_unmount();
2555  out:
2556  	return ret;
2557  }
2558  
2559  static void __exit zs_exit(void)
2560  {
2561  #ifdef CONFIG_ZPOOL
2562  	zpool_unregister_driver(&zs_zpool_driver);
2563  #endif
2564  	zsmalloc_unmount();
2565  	zs_unregister_cpu_notifier();
2566  
2567  	zs_stat_exit();
2568  }
2569  
2570  module_init(zs_init);
2571  module_exit(zs_exit);
2572  
2573  MODULE_LICENSE("Dual BSD/GPL");
2574  MODULE_AUTHOR("Nitin Gupta <ngupta@vflare.org>");
2575