1  // SPDX-License-Identifier: GPL-2.0-only
2  /*
3   * z3fold.c
4   *
5   * Author: Vitaly Wool <vitaly.wool@konsulko.com>
6   * Copyright (C) 2016, Sony Mobile Communications Inc.
7   *
8   * This implementation is based on zbud written by Seth Jennings.
9   *
 * z3fold is a special purpose allocator for storing compressed pages. It
 * can store up to three compressed pages per page, which improves the
 * compression ratio of zbud while retaining its main concepts (e.g. always
 * storing an integral number of objects per page) and its simplicity.
 * It still has simple and deterministic reclaim properties that make it
 * preferable to a higher density approach (with no requirement on an
 * integral number of objects per page) when reclaim is used.
17   *
18   * As in zbud, pages are divided into "chunks".  The size of the chunks is
19   * fixed at compile time and is determined by NCHUNKS_ORDER below.
20   *
21   * z3fold doesn't export any API and is meant to be used via zpool API.
22   */
23  
24  #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
25  
26  #include <linux/atomic.h>
27  #include <linux/sched.h>
28  #include <linux/cpumask.h>
29  #include <linux/list.h>
30  #include <linux/mm.h>
31  #include <linux/module.h>
32  #include <linux/page-flags.h>
33  #include <linux/migrate.h>
34  #include <linux/node.h>
35  #include <linux/compaction.h>
36  #include <linux/percpu.h>
37  #include <linux/mount.h>
38  #include <linux/pseudo_fs.h>
39  #include <linux/fs.h>
40  #include <linux/preempt.h>
41  #include <linux/workqueue.h>
42  #include <linux/slab.h>
43  #include <linux/spinlock.h>
44  #include <linux/zpool.h>
45  #include <linux/magic.h>
46  #include <linux/kmemleak.h>
47  
48  /*
49   * NCHUNKS_ORDER determines the internal allocation granularity, effectively
50   * adjusting internal fragmentation.  It also determines the number of
51   * freelists maintained in each pool. NCHUNKS_ORDER of 6 means that the
52   * allocation granularity will be in chunks of size PAGE_SIZE/64. Some chunks
 * at the beginning of an allocated page are occupied by the z3fold header,
 * so NCHUNKS will be calculated to 63 (or 62 in case CONFIG_DEBUG_SPINLOCK=y),
 * which is the maximum number of free chunks in a z3fold page; accordingly,
 * there will be 63 (or 62, respectively) freelists per pool.
 */
58  #define NCHUNKS_ORDER	6
59  
60  #define CHUNK_SHIFT	(PAGE_SHIFT - NCHUNKS_ORDER)
61  #define CHUNK_SIZE	(1 << CHUNK_SHIFT)
62  #define ZHDR_SIZE_ALIGNED round_up(sizeof(struct z3fold_header), CHUNK_SIZE)
63  #define ZHDR_CHUNKS	(ZHDR_SIZE_ALIGNED >> CHUNK_SHIFT)
64  #define TOTAL_CHUNKS	(PAGE_SIZE >> CHUNK_SHIFT)
65  #define NCHUNKS		((PAGE_SIZE - ZHDR_SIZE_ALIGNED) >> CHUNK_SHIFT)
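
/*
 * Worked example (a sketch, assuming PAGE_SIZE == 4096 and NCHUNKS_ORDER == 6):
 * CHUNK_SHIFT is 6, so CHUNK_SIZE is 64 bytes and TOTAL_CHUNKS is 64.
 * ZHDR_SIZE_ALIGNED rounds sizeof(struct z3fold_header) up to a whole number
 * of chunks, and NCHUNKS is whatever remains of the page once those header
 * chunks are set aside.
 */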
66  
67  #define BUDDY_MASK	(0x3)
68  #define BUDDY_SHIFT	2
69  #define SLOTS_ALIGN	(0x40)
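
/*
 * Handle layout notes (descriptive, see __encode_handle() for the source of
 * truth): a non-headless handle is the address of an entry in a struct
 * z3fold_buddy_slots, which is allocated SLOTS_ALIGN-aligned, so
 * handle_to_slots() recovers the containing structure by masking the low
 * bits.  The value stored in that entry is the z3fold page address plus the
 * buddy index (covered by BUDDY_MASK); for the LAST buddy, the object size
 * in chunks is additionally stored above BUDDY_SHIFT.
 */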
70  
71  /*****************
72   * Structures
73  *****************/
74  struct z3fold_pool;
75  struct z3fold_ops {
76  	int (*evict)(struct z3fold_pool *pool, unsigned long handle);
77  };
78  
79  enum buddy {
80  	HEADLESS = 0,
81  	FIRST,
82  	MIDDLE,
83  	LAST,
84  	BUDDIES_MAX = LAST
85  };
86  
87  struct z3fold_buddy_slots {
88  	/*
89  	 * we are using BUDDY_MASK in handle_to_buddy etc. so there should
90  	 * be enough slots to hold all possible variants
91  	 */
92  	unsigned long slot[BUDDY_MASK + 1];
93  	unsigned long pool; /* back link */
94  	rwlock_t lock;
95  };
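
/*
 * The low bits of the @pool back link double as handle flag storage (see
 * enum z3fold_handle_flags below); slots_to_pool() masks them off with
 * HANDLE_FLAG_MASK to recover the actual pool pointer.
 */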
96  #define HANDLE_FLAG_MASK	(0x03)
97  
98  /*
99   * struct z3fold_header - z3fold page metadata occupying first chunks of each
100   *			z3fold page, except for HEADLESS pages
101   * @buddy:		links the z3fold page into the relevant list in the
102   *			pool
103   * @page_lock:		per-page lock
104   * @refcount:		reference count for the z3fold page
105   * @work:		work_struct for page layout optimization
106   * @slots:		pointer to the structure holding buddy slots
107   * @pool:		pointer to the containing pool
108   * @cpu:		CPU which this page "belongs" to
109   * @first_chunks:	the size of the first buddy in chunks, 0 if free
110   * @middle_chunks:	the size of the middle buddy in chunks, 0 if free
 * @last_chunks:	the size of the last buddy in chunks, 0 if free
 * @start_middle:	index of the first chunk occupied by the middle buddy
 * @first_num:		the starting number (for the first handle)
 * @mapped_count:	the number of objects currently mapped
 * @foreign_handles:	the number of objects in this page whose handles are
 *			stored in another page's slots
 */
115  struct z3fold_header {
116  	struct list_head buddy;
117  	spinlock_t page_lock;
118  	struct kref refcount;
119  	struct work_struct work;
120  	struct z3fold_buddy_slots *slots;
121  	struct z3fold_pool *pool;
122  	short cpu;
123  	unsigned short first_chunks;
124  	unsigned short middle_chunks;
125  	unsigned short last_chunks;
126  	unsigned short start_middle;
127  	unsigned short first_num:2;
128  	unsigned short mapped_count:2;
129  	unsigned short foreign_handles:2;
130  };
131  
132  /**
133   * struct z3fold_pool - stores metadata for each z3fold pool
134   * @name:	pool name
135   * @lock:	protects pool unbuddied/lru lists
136   * @stale_lock:	protects pool stale page list
137   * @unbuddied:	per-cpu array of lists tracking z3fold pages that contain 2-
138   *		buddies; the list each z3fold page is added to depends on
139   *		the size of its free region.
140   * @lru:	list tracking the z3fold pages in LRU order by most recently
141   *		added buddy.
142   * @stale:	list of pages marked for freeing
143   * @pages_nr:	number of z3fold pages in the pool.
144   * @c_handle:	cache for z3fold_buddy_slots allocation
 * @ops:	pointer to a structure of user defined operations specified at
 *		pool creation time.
 * @zpool:	zpool driver
 * @zpool_ops:	zpool operations structure with an evict callback
147   * @compact_wq:	workqueue for page layout background optimization
148   * @release_wq:	workqueue for safe page release
149   * @work:	work_struct for safe page release
150   * @inode:	inode for z3fold pseudo filesystem
151   *
152   * This structure is allocated at pool creation time and maintains metadata
153   * pertaining to a particular z3fold pool.
154   */
155  struct z3fold_pool {
156  	const char *name;
157  	spinlock_t lock;
158  	spinlock_t stale_lock;
159  	struct list_head *unbuddied;
160  	struct list_head lru;
161  	struct list_head stale;
162  	atomic64_t pages_nr;
163  	struct kmem_cache *c_handle;
164  	const struct z3fold_ops *ops;
165  	struct zpool *zpool;
166  	const struct zpool_ops *zpool_ops;
167  	struct workqueue_struct *compact_wq;
168  	struct workqueue_struct *release_wq;
169  	struct work_struct work;
170  	struct inode *inode;
171  };
172  
173  /*
174   * Internal z3fold page flags
175   */
176  enum z3fold_page_flags {
177  	PAGE_HEADLESS = 0,
178  	MIDDLE_CHUNK_MAPPED,
179  	NEEDS_COMPACTING,
180  	PAGE_STALE,
181  	PAGE_CLAIMED, /* by either reclaim or free */
182  };
183  
/*
 * Handle flags, stored in the bits of slots->pool covered by HANDLE_FLAG_MASK
 */
187  enum z3fold_handle_flags {
188  	HANDLES_NOFREE = 0,
189  };
190  
191  /*
192   * Forward declarations
193   */
194  static struct z3fold_header *__z3fold_alloc(struct z3fold_pool *, size_t, bool);
195  static void compact_page_work(struct work_struct *w);
196  
197  /*****************
198   * Helpers
199  *****************/
200  
201  /* Converts an allocation size in bytes to size in z3fold chunks */
202  static int size_to_chunks(size_t size)
203  {
204  	return (size + CHUNK_SIZE - 1) >> CHUNK_SHIFT;
205  }
206  
207  #define for_each_unbuddied_list(_iter, _begin) \
208  	for ((_iter) = (_begin); (_iter) < NCHUNKS; (_iter)++)
209  
210  static inline struct z3fold_buddy_slots *alloc_slots(struct z3fold_pool *pool,
211  							gfp_t gfp)
212  {
213  	struct z3fold_buddy_slots *slots;
214  
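	/*
	 * Strip the page-allocation-only flags __GFP_HIGHMEM and
	 * __GFP_MOVABLE before allocating from the slab cache.
	 */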
215  	slots = kmem_cache_zalloc(pool->c_handle,
216  				 (gfp & ~(__GFP_HIGHMEM | __GFP_MOVABLE)));
217  
218  	if (slots) {
219  		/* It will be freed separately in free_handle(). */
220  		kmemleak_not_leak(slots);
221  		slots->pool = (unsigned long)pool;
222  		rwlock_init(&slots->lock);
223  	}
224  
225  	return slots;
226  }
227  
228  static inline struct z3fold_pool *slots_to_pool(struct z3fold_buddy_slots *s)
229  {
230  	return (struct z3fold_pool *)(s->pool & ~HANDLE_FLAG_MASK);
231  }
232  
233  static inline struct z3fold_buddy_slots *handle_to_slots(unsigned long handle)
234  {
235  	return (struct z3fold_buddy_slots *)(handle & ~(SLOTS_ALIGN - 1));
236  }
237  
238  /* Lock a z3fold page */
239  static inline void z3fold_page_lock(struct z3fold_header *zhdr)
240  {
241  	spin_lock(&zhdr->page_lock);
242  }
243  
244  /* Try to lock a z3fold page */
245  static inline int z3fold_page_trylock(struct z3fold_header *zhdr)
246  {
247  	return spin_trylock(&zhdr->page_lock);
248  }
249  
250  /* Unlock a z3fold page */
251  static inline void z3fold_page_unlock(struct z3fold_header *zhdr)
252  {
253  	spin_unlock(&zhdr->page_lock);
254  }
255  
256  
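/*
 * Look up the z3fold header for @handle.  If @lock is true, also acquire the
 * z3fold page lock, spinning on trylock while holding the slots read lock so
 * that the slot contents stay stable while they are being dereferenced.
 * Headless handles encode the page address directly and need no locking here.
 */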
257  static inline struct z3fold_header *__get_z3fold_header(unsigned long handle,
258  							bool lock)
259  {
260  	struct z3fold_buddy_slots *slots;
261  	struct z3fold_header *zhdr;
262  	int locked = 0;
263  
264  	if (!(handle & (1 << PAGE_HEADLESS))) {
265  		slots = handle_to_slots(handle);
266  		do {
267  			unsigned long addr;
268  
269  			read_lock(&slots->lock);
270  			addr = *(unsigned long *)handle;
271  			zhdr = (struct z3fold_header *)(addr & PAGE_MASK);
272  			if (lock)
273  				locked = z3fold_page_trylock(zhdr);
274  			read_unlock(&slots->lock);
275  			if (locked)
276  				break;
277  			cpu_relax();
278  		} while (lock);
279  	} else {
280  		zhdr = (struct z3fold_header *)(handle & PAGE_MASK);
281  	}
282  
283  	return zhdr;
284  }
285  
286  /* Returns the z3fold page where a given handle is stored */
287  static inline struct z3fold_header *handle_to_z3fold_header(unsigned long h)
288  {
289  	return __get_z3fold_header(h, false);
290  }
291  
/* Return the z3fold header; the page comes back locked unless it is headless */
293  static inline struct z3fold_header *get_z3fold_header(unsigned long h)
294  {
295  	return __get_z3fold_header(h, true);
296  }
297  
298  static inline void put_z3fold_header(struct z3fold_header *zhdr)
299  {
300  	struct page *page = virt_to_page(zhdr);
301  
302  	if (!test_bit(PAGE_HEADLESS, &page->private))
303  		z3fold_page_unlock(zhdr);
304  }
305  
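/*
 * Release the slot backing @handle.  Once every slot in the owning
 * z3fold_buddy_slots is zero and HANDLES_NOFREE is not set (it is set for
 * the on-stack slots used by reclaim), the slots structure itself is
 * returned to the handle cache.
 */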
306  static inline void free_handle(unsigned long handle, struct z3fold_header *zhdr)
307  {
308  	struct z3fold_buddy_slots *slots;
309  	int i;
310  	bool is_free;
311  
312  	if (handle & (1 << PAGE_HEADLESS))
313  		return;
314  
315  	if (WARN_ON(*(unsigned long *)handle == 0))
316  		return;
317  
318  	slots = handle_to_slots(handle);
319  	write_lock(&slots->lock);
320  	*(unsigned long *)handle = 0;
321  
322  	if (test_bit(HANDLES_NOFREE, &slots->pool)) {
323  		write_unlock(&slots->lock);
324  		return; /* simple case, nothing else to do */
325  	}
326  
327  	if (zhdr->slots != slots)
328  		zhdr->foreign_handles--;
329  
330  	is_free = true;
331  	for (i = 0; i <= BUDDY_MASK; i++) {
332  		if (slots->slot[i]) {
333  			is_free = false;
334  			break;
335  		}
336  	}
337  	write_unlock(&slots->lock);
338  
339  	if (is_free) {
340  		struct z3fold_pool *pool = slots_to_pool(slots);
341  
342  		if (zhdr->slots == slots)
343  			zhdr->slots = NULL;
344  		kmem_cache_free(pool->c_handle, slots);
345  	}
346  }
347  
348  static int z3fold_init_fs_context(struct fs_context *fc)
349  {
350  	return init_pseudo(fc, Z3FOLD_MAGIC) ? 0 : -ENOMEM;
351  }
352  
353  static struct file_system_type z3fold_fs = {
354  	.name		= "z3fold",
355  	.init_fs_context = z3fold_init_fs_context,
356  	.kill_sb	= kill_anon_super,
357  };
358  
359  static struct vfsmount *z3fold_mnt;
360  static int z3fold_mount(void)
361  {
362  	int ret = 0;
363  
364  	z3fold_mnt = kern_mount(&z3fold_fs);
365  	if (IS_ERR(z3fold_mnt))
366  		ret = PTR_ERR(z3fold_mnt);
367  
368  	return ret;
369  }
370  
371  static void z3fold_unmount(void)
372  {
373  	kern_unmount(z3fold_mnt);
374  }
375  
376  static const struct address_space_operations z3fold_aops;
377  static int z3fold_register_migration(struct z3fold_pool *pool)
378  {
379  	pool->inode = alloc_anon_inode(z3fold_mnt->mnt_sb);
380  	if (IS_ERR(pool->inode)) {
381  		pool->inode = NULL;
382  		return 1;
383  	}
384  
385  	pool->inode->i_mapping->private_data = pool;
386  	pool->inode->i_mapping->a_ops = &z3fold_aops;
387  	return 0;
388  }
389  
390  static void z3fold_unregister_migration(struct z3fold_pool *pool)
391  {
392  	if (pool->inode)
393  		iput(pool->inode);
}
395  
396  /* Initializes the z3fold header of a newly allocated z3fold page */
397  static struct z3fold_header *init_z3fold_page(struct page *page, bool headless,
398  					struct z3fold_pool *pool, gfp_t gfp)
399  {
400  	struct z3fold_header *zhdr = page_address(page);
401  	struct z3fold_buddy_slots *slots;
402  
403  	INIT_LIST_HEAD(&page->lru);
404  	clear_bit(PAGE_HEADLESS, &page->private);
405  	clear_bit(MIDDLE_CHUNK_MAPPED, &page->private);
406  	clear_bit(NEEDS_COMPACTING, &page->private);
407  	clear_bit(PAGE_STALE, &page->private);
408  	clear_bit(PAGE_CLAIMED, &page->private);
409  	if (headless)
410  		return zhdr;
411  
412  	slots = alloc_slots(pool, gfp);
413  	if (!slots)
414  		return NULL;
415  
416  	memset(zhdr, 0, sizeof(*zhdr));
417  	spin_lock_init(&zhdr->page_lock);
418  	kref_init(&zhdr->refcount);
419  	zhdr->cpu = -1;
420  	zhdr->slots = slots;
421  	zhdr->pool = pool;
422  	INIT_LIST_HEAD(&zhdr->buddy);
423  	INIT_WORK(&zhdr->work, compact_page_work);
424  	return zhdr;
425  }
426  
427  /* Resets the struct page fields and frees the page */
428  static void free_z3fold_page(struct page *page, bool headless)
429  {
430  	if (!headless) {
431  		lock_page(page);
432  		__ClearPageMovable(page);
433  		unlock_page(page);
434  	}
435  	ClearPagePrivate(page);
436  	__free_page(page);
437  }
438  
439  /* Helper function to build the index */
440  static inline int __idx(struct z3fold_header *zhdr, enum buddy bud)
441  {
442  	return (bud + zhdr->first_num) & BUDDY_MASK;
443  }
444  
445  /*
446   * Encodes the handle of a particular buddy within a z3fold page
447   * Pool lock should be held as this function accesses first_num
448   */
449  static unsigned long __encode_handle(struct z3fold_header *zhdr,
450  				struct z3fold_buddy_slots *slots,
451  				enum buddy bud)
452  {
453  	unsigned long h = (unsigned long)zhdr;
454  	int idx = 0;
455  
456  	/*
457  	 * For a headless page, its handle is its pointer with the extra
458  	 * PAGE_HEADLESS bit set
459  	 */
460  	if (bud == HEADLESS)
461  		return h | (1 << PAGE_HEADLESS);
462  
463  	/* otherwise, return pointer to encoded handle */
464  	idx = __idx(zhdr, bud);
465  	h += idx;
466  	if (bud == LAST)
467  		h |= (zhdr->last_chunks << BUDDY_SHIFT);
468  
469  	write_lock(&slots->lock);
470  	slots->slot[idx] = h;
471  	write_unlock(&slots->lock);
472  	return (unsigned long)&slots->slot[idx];
473  }
474  
475  static unsigned long encode_handle(struct z3fold_header *zhdr, enum buddy bud)
476  {
477  	return __encode_handle(zhdr, zhdr->slots, bud);
478  }
479  
480  /* only for LAST bud, returns zero otherwise */
481  static unsigned short handle_to_chunks(unsigned long handle)
482  {
483  	struct z3fold_buddy_slots *slots = handle_to_slots(handle);
484  	unsigned long addr;
485  
486  	read_lock(&slots->lock);
487  	addr = *(unsigned long *)handle;
488  	read_unlock(&slots->lock);
489  	return (addr & ~PAGE_MASK) >> BUDDY_SHIFT;
490  }
491  
/*
 * (handle & BUDDY_MASK) < zhdr->first_num is possible in encode_handle,
 * but that doesn't matter because the masking will result in the
 * correct buddy number.
 */
497  static enum buddy handle_to_buddy(unsigned long handle)
498  {
499  	struct z3fold_header *zhdr;
500  	struct z3fold_buddy_slots *slots = handle_to_slots(handle);
501  	unsigned long addr;
502  
503  	read_lock(&slots->lock);
504  	WARN_ON(handle & (1 << PAGE_HEADLESS));
505  	addr = *(unsigned long *)handle;
506  	read_unlock(&slots->lock);
507  	zhdr = (struct z3fold_header *)(addr & PAGE_MASK);
508  	return (addr - zhdr->first_num) & BUDDY_MASK;
509  }
510  
511  static inline struct z3fold_pool *zhdr_to_pool(struct z3fold_header *zhdr)
512  {
513  	return zhdr->pool;
514  }
515  
516  static void __release_z3fold_page(struct z3fold_header *zhdr, bool locked)
517  {
518  	struct page *page = virt_to_page(zhdr);
519  	struct z3fold_pool *pool = zhdr_to_pool(zhdr);
520  
521  	WARN_ON(!list_empty(&zhdr->buddy));
522  	set_bit(PAGE_STALE, &page->private);
523  	clear_bit(NEEDS_COMPACTING, &page->private);
524  	spin_lock(&pool->lock);
525  	if (!list_empty(&page->lru))
526  		list_del_init(&page->lru);
527  	spin_unlock(&pool->lock);
528  
529  	if (locked)
530  		z3fold_page_unlock(zhdr);
531  
532  	spin_lock(&pool->stale_lock);
533  	list_add(&zhdr->buddy, &pool->stale);
534  	queue_work(pool->release_wq, &pool->work);
535  	spin_unlock(&pool->stale_lock);
536  }
537  
538  static void release_z3fold_page(struct kref *ref)
539  {
540  	struct z3fold_header *zhdr = container_of(ref, struct z3fold_header,
541  						refcount);
542  	__release_z3fold_page(zhdr, false);
543  }
544  
545  static void release_z3fold_page_locked(struct kref *ref)
546  {
547  	struct z3fold_header *zhdr = container_of(ref, struct z3fold_header,
548  						refcount);
549  	WARN_ON(z3fold_page_trylock(zhdr));
550  	__release_z3fold_page(zhdr, true);
551  }
552  
553  static void release_z3fold_page_locked_list(struct kref *ref)
554  {
555  	struct z3fold_header *zhdr = container_of(ref, struct z3fold_header,
556  					       refcount);
557  	struct z3fold_pool *pool = zhdr_to_pool(zhdr);
558  
559  	spin_lock(&pool->lock);
560  	list_del_init(&zhdr->buddy);
561  	spin_unlock(&pool->lock);
562  
563  	WARN_ON(z3fold_page_trylock(zhdr));
564  	__release_z3fold_page(zhdr, true);
565  }
566  
567  static void free_pages_work(struct work_struct *w)
568  {
569  	struct z3fold_pool *pool = container_of(w, struct z3fold_pool, work);
570  
571  	spin_lock(&pool->stale_lock);
572  	while (!list_empty(&pool->stale)) {
573  		struct z3fold_header *zhdr = list_first_entry(&pool->stale,
574  						struct z3fold_header, buddy);
575  		struct page *page = virt_to_page(zhdr);
576  
577  		list_del(&zhdr->buddy);
578  		if (WARN_ON(!test_bit(PAGE_STALE, &page->private)))
579  			continue;
580  		spin_unlock(&pool->stale_lock);
581  		cancel_work_sync(&zhdr->work);
582  		free_z3fold_page(page, false);
583  		cond_resched();
584  		spin_lock(&pool->stale_lock);
585  	}
586  	spin_unlock(&pool->stale_lock);
587  }
588  
589  /*
590   * Returns the number of free chunks in a z3fold page.
591   * NB: can't be used with HEADLESS pages.
592   */
593  static int num_free_chunks(struct z3fold_header *zhdr)
594  {
595  	int nfree;
596  	/*
597  	 * If there is a middle object, pick up the bigger free space
598  	 * either before or after it. Otherwise just subtract the number
599  	 * of chunks occupied by the first and the last objects.
600  	 */
601  	if (zhdr->middle_chunks != 0) {
602  		int nfree_before = zhdr->first_chunks ?
603  			0 : zhdr->start_middle - ZHDR_CHUNKS;
604  		int nfree_after = zhdr->last_chunks ?
605  			0 : TOTAL_CHUNKS -
606  				(zhdr->start_middle + zhdr->middle_chunks);
607  		nfree = max(nfree_before, nfree_after);
608  	} else
609  		nfree = NCHUNKS - zhdr->first_chunks - zhdr->last_chunks;
610  	return nfree;
611  }
612  
613  /* Add to the appropriate unbuddied list */
614  static inline void add_to_unbuddied(struct z3fold_pool *pool,
615  				struct z3fold_header *zhdr)
616  {
617  	if (zhdr->first_chunks == 0 || zhdr->last_chunks == 0 ||
618  			zhdr->middle_chunks == 0) {
619  		struct list_head *unbuddied;
620  		int freechunks = num_free_chunks(zhdr);
621  
622  		migrate_disable();
623  		unbuddied = this_cpu_ptr(pool->unbuddied);
624  		spin_lock(&pool->lock);
625  		list_add(&zhdr->buddy, &unbuddied[freechunks]);
626  		spin_unlock(&pool->lock);
627  		zhdr->cpu = smp_processor_id();
628  		migrate_enable();
629  	}
630  }
631  
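/*
 * Pick a free buddy slot for an allocation of @chunks chunks; returns
 * HEADLESS if no suitable buddy is free.
 */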
632  static inline enum buddy get_free_buddy(struct z3fold_header *zhdr, int chunks)
633  {
634  	enum buddy bud = HEADLESS;
635  
636  	if (zhdr->middle_chunks) {
637  		if (!zhdr->first_chunks &&
638  		    chunks <= zhdr->start_middle - ZHDR_CHUNKS)
639  			bud = FIRST;
640  		else if (!zhdr->last_chunks)
641  			bud = LAST;
642  	} else {
643  		if (!zhdr->first_chunks)
644  			bud = FIRST;
645  		else if (!zhdr->last_chunks)
646  			bud = LAST;
647  		else
648  			bud = MIDDLE;
649  	}
650  
651  	return bud;
652  }
653  
654  static inline void *mchunk_memmove(struct z3fold_header *zhdr,
655  				unsigned short dst_chunk)
656  {
657  	void *beg = zhdr;
658  	return memmove(beg + (dst_chunk << CHUNK_SHIFT),
659  		       beg + (zhdr->start_middle << CHUNK_SHIFT),
660  		       zhdr->middle_chunks << CHUNK_SHIFT);
661  }
662  
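/* True if at most one of the three buddies in the page is in use */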
663  static inline bool buddy_single(struct z3fold_header *zhdr)
664  {
665  	return !((zhdr->first_chunks && zhdr->middle_chunks) ||
666  			(zhdr->first_chunks && zhdr->last_chunks) ||
667  			(zhdr->middle_chunks && zhdr->last_chunks));
668  }
669  
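/*
 * Try to move the single used buddy of @zhdr into another, partially filled
 * page so that @zhdr's page can be released.  The moved object keeps its old
 * handle: the handle slot (still owned by @zhdr) is rewritten to point into
 * the new page, which therefore gains a foreign handle.  Returns the new
 * page's header on success, NULL otherwise.
 */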
670  static struct z3fold_header *compact_single_buddy(struct z3fold_header *zhdr)
671  {
672  	struct z3fold_pool *pool = zhdr_to_pool(zhdr);
673  	void *p = zhdr;
674  	unsigned long old_handle = 0;
675  	size_t sz = 0;
676  	struct z3fold_header *new_zhdr = NULL;
677  	int first_idx = __idx(zhdr, FIRST);
678  	int middle_idx = __idx(zhdr, MIDDLE);
679  	int last_idx = __idx(zhdr, LAST);
680  	unsigned short *moved_chunks = NULL;
681  
682  	/*
683  	 * No need to protect slots here -- all the slots are "local" and
684  	 * the page lock is already taken
685  	 */
686  	if (zhdr->first_chunks && zhdr->slots->slot[first_idx]) {
687  		p += ZHDR_SIZE_ALIGNED;
688  		sz = zhdr->first_chunks << CHUNK_SHIFT;
689  		old_handle = (unsigned long)&zhdr->slots->slot[first_idx];
690  		moved_chunks = &zhdr->first_chunks;
691  	} else if (zhdr->middle_chunks && zhdr->slots->slot[middle_idx]) {
692  		p += zhdr->start_middle << CHUNK_SHIFT;
693  		sz = zhdr->middle_chunks << CHUNK_SHIFT;
694  		old_handle = (unsigned long)&zhdr->slots->slot[middle_idx];
695  		moved_chunks = &zhdr->middle_chunks;
696  	} else if (zhdr->last_chunks && zhdr->slots->slot[last_idx]) {
697  		p += PAGE_SIZE - (zhdr->last_chunks << CHUNK_SHIFT);
698  		sz = zhdr->last_chunks << CHUNK_SHIFT;
699  		old_handle = (unsigned long)&zhdr->slots->slot[last_idx];
700  		moved_chunks = &zhdr->last_chunks;
701  	}
702  
703  	if (sz > 0) {
704  		enum buddy new_bud = HEADLESS;
705  		short chunks = size_to_chunks(sz);
706  		void *q;
707  
708  		new_zhdr = __z3fold_alloc(pool, sz, false);
709  		if (!new_zhdr)
710  			return NULL;
711  
712  		if (WARN_ON(new_zhdr == zhdr))
713  			goto out_fail;
714  
715  		new_bud = get_free_buddy(new_zhdr, chunks);
716  		q = new_zhdr;
717  		switch (new_bud) {
718  		case FIRST:
719  			new_zhdr->first_chunks = chunks;
720  			q += ZHDR_SIZE_ALIGNED;
721  			break;
722  		case MIDDLE:
723  			new_zhdr->middle_chunks = chunks;
724  			new_zhdr->start_middle =
725  				new_zhdr->first_chunks + ZHDR_CHUNKS;
726  			q += new_zhdr->start_middle << CHUNK_SHIFT;
727  			break;
728  		case LAST:
729  			new_zhdr->last_chunks = chunks;
730  			q += PAGE_SIZE - (new_zhdr->last_chunks << CHUNK_SHIFT);
731  			break;
732  		default:
733  			goto out_fail;
734  		}
735  		new_zhdr->foreign_handles++;
736  		memcpy(q, p, sz);
737  		write_lock(&zhdr->slots->lock);
738  		*(unsigned long *)old_handle = (unsigned long)new_zhdr +
739  			__idx(new_zhdr, new_bud);
740  		if (new_bud == LAST)
741  			*(unsigned long *)old_handle |=
742  					(new_zhdr->last_chunks << BUDDY_SHIFT);
743  		write_unlock(&zhdr->slots->lock);
744  		add_to_unbuddied(pool, new_zhdr);
745  		z3fold_page_unlock(new_zhdr);
746  
747  		*moved_chunks = 0;
748  	}
749  
750  	return new_zhdr;
751  
752  out_fail:
753  	if (new_zhdr) {
754  		if (kref_put(&new_zhdr->refcount, release_z3fold_page_locked))
755  			atomic64_dec(&pool->pages_nr);
756  		else {
757  			add_to_unbuddied(pool, new_zhdr);
758  			z3fold_page_unlock(new_zhdr);
759  		}
760  	}
761  	return NULL;
762  
763  }
764  
765  #define BIG_CHUNK_GAP	3
766  /* Has to be called with lock held */
767  static int z3fold_compact_page(struct z3fold_header *zhdr)
768  {
769  	struct page *page = virt_to_page(zhdr);
770  
771  	if (test_bit(MIDDLE_CHUNK_MAPPED, &page->private))
772  		return 0; /* can't move middle chunk, it's used */
773  
774  	if (unlikely(PageIsolated(page)))
775  		return 0;
776  
777  	if (zhdr->middle_chunks == 0)
778  		return 0; /* nothing to compact */
779  
780  	if (zhdr->first_chunks == 0 && zhdr->last_chunks == 0) {
781  		/* move to the beginning */
782  		mchunk_memmove(zhdr, ZHDR_CHUNKS);
783  		zhdr->first_chunks = zhdr->middle_chunks;
784  		zhdr->middle_chunks = 0;
785  		zhdr->start_middle = 0;
786  		zhdr->first_num++;
787  		return 1;
788  	}
789  
790  	/*
791  	 * moving data is expensive, so let's only do that if
792  	 * there's substantial gain (at least BIG_CHUNK_GAP chunks)
793  	 */
794  	if (zhdr->first_chunks != 0 && zhdr->last_chunks == 0 &&
795  	    zhdr->start_middle - (zhdr->first_chunks + ZHDR_CHUNKS) >=
796  			BIG_CHUNK_GAP) {
797  		mchunk_memmove(zhdr, zhdr->first_chunks + ZHDR_CHUNKS);
798  		zhdr->start_middle = zhdr->first_chunks + ZHDR_CHUNKS;
799  		return 1;
800  	} else if (zhdr->last_chunks != 0 && zhdr->first_chunks == 0 &&
801  		   TOTAL_CHUNKS - (zhdr->last_chunks + zhdr->start_middle
802  					+ zhdr->middle_chunks) >=
803  			BIG_CHUNK_GAP) {
804  		unsigned short new_start = TOTAL_CHUNKS - zhdr->last_chunks -
805  			zhdr->middle_chunks;
806  		mchunk_memmove(zhdr, new_start);
807  		zhdr->start_middle = new_start;
808  		return 1;
809  	}
810  
811  	return 0;
812  }
813  
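/*
 * Compact a page that has NEEDS_COMPACTING set: either move its only used
 * buddy elsewhere via compact_single_buddy(), or squeeze the middle buddy
 * with z3fold_compact_page(), and put the page back on an unbuddied list.
 * Expects the page lock to be held already if @locked is true.
 */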
814  static void do_compact_page(struct z3fold_header *zhdr, bool locked)
815  {
816  	struct z3fold_pool *pool = zhdr_to_pool(zhdr);
817  	struct page *page;
818  
819  	page = virt_to_page(zhdr);
820  	if (locked)
821  		WARN_ON(z3fold_page_trylock(zhdr));
822  	else
823  		z3fold_page_lock(zhdr);
824  	if (WARN_ON(!test_and_clear_bit(NEEDS_COMPACTING, &page->private))) {
825  		z3fold_page_unlock(zhdr);
826  		return;
827  	}
828  	spin_lock(&pool->lock);
829  	list_del_init(&zhdr->buddy);
830  	spin_unlock(&pool->lock);
831  
832  	if (kref_put(&zhdr->refcount, release_z3fold_page_locked)) {
833  		atomic64_dec(&pool->pages_nr);
834  		return;
835  	}
836  
837  	if (test_bit(PAGE_STALE, &page->private) ||
838  	    test_and_set_bit(PAGE_CLAIMED, &page->private)) {
839  		z3fold_page_unlock(zhdr);
840  		return;
841  	}
842  
843  	if (!zhdr->foreign_handles && buddy_single(zhdr) &&
844  	    zhdr->mapped_count == 0 && compact_single_buddy(zhdr)) {
845  		if (kref_put(&zhdr->refcount, release_z3fold_page_locked))
846  			atomic64_dec(&pool->pages_nr);
847  		else {
848  			clear_bit(PAGE_CLAIMED, &page->private);
849  			z3fold_page_unlock(zhdr);
850  		}
851  		return;
852  	}
853  
854  	z3fold_compact_page(zhdr);
855  	add_to_unbuddied(pool, zhdr);
856  	clear_bit(PAGE_CLAIMED, &page->private);
857  	z3fold_page_unlock(zhdr);
858  }
859  
860  static void compact_page_work(struct work_struct *w)
861  {
862  	struct z3fold_header *zhdr = container_of(w, struct z3fold_header,
863  						work);
864  
865  	do_compact_page(zhdr, false);
866  }
867  
868  /* returns _locked_ z3fold page header or NULL */
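/*
 * The search is two-phase: first try this CPU's unbuddied lists for any page
 * with at least @size bytes free, then fall back to looking for an exact
 * size match on the other CPUs' lists.
 */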
869  static inline struct z3fold_header *__z3fold_alloc(struct z3fold_pool *pool,
870  						size_t size, bool can_sleep)
871  {
872  	struct z3fold_header *zhdr = NULL;
873  	struct page *page;
874  	struct list_head *unbuddied;
875  	int chunks = size_to_chunks(size), i;
876  
877  lookup:
878  	migrate_disable();
879  	/* First, try to find an unbuddied z3fold page. */
880  	unbuddied = this_cpu_ptr(pool->unbuddied);
881  	for_each_unbuddied_list(i, chunks) {
882  		struct list_head *l = &unbuddied[i];
883  
884  		zhdr = list_first_entry_or_null(READ_ONCE(l),
885  					struct z3fold_header, buddy);
886  
887  		if (!zhdr)
888  			continue;
889  
890  		/* Re-check under lock. */
891  		spin_lock(&pool->lock);
892  		l = &unbuddied[i];
893  		if (unlikely(zhdr != list_first_entry(READ_ONCE(l),
894  						struct z3fold_header, buddy)) ||
895  		    !z3fold_page_trylock(zhdr)) {
896  			spin_unlock(&pool->lock);
897  			zhdr = NULL;
898  			migrate_enable();
899  			if (can_sleep)
900  				cond_resched();
901  			goto lookup;
902  		}
903  		list_del_init(&zhdr->buddy);
904  		zhdr->cpu = -1;
905  		spin_unlock(&pool->lock);
906  
907  		page = virt_to_page(zhdr);
908  		if (test_bit(NEEDS_COMPACTING, &page->private) ||
909  		    test_bit(PAGE_CLAIMED, &page->private)) {
910  			z3fold_page_unlock(zhdr);
911  			zhdr = NULL;
912  			migrate_enable();
913  			if (can_sleep)
914  				cond_resched();
915  			goto lookup;
916  		}
917  
918  		/*
		 * This page could not have been removed from its unbuddied
		 * list while the pool lock was held, and we have taken the
		 * page lock since then, so kref_put() could not have been
		 * called before we got here; it's safe to just call kref_get()
923  		 */
924  		kref_get(&zhdr->refcount);
925  		break;
926  	}
927  	migrate_enable();
928  
929  	if (!zhdr) {
930  		int cpu;
931  
932  		/* look for _exact_ match on other cpus' lists */
933  		for_each_online_cpu(cpu) {
934  			struct list_head *l;
935  
936  			unbuddied = per_cpu_ptr(pool->unbuddied, cpu);
937  			spin_lock(&pool->lock);
938  			l = &unbuddied[chunks];
939  
940  			zhdr = list_first_entry_or_null(READ_ONCE(l),
941  						struct z3fold_header, buddy);
942  
943  			if (!zhdr || !z3fold_page_trylock(zhdr)) {
944  				spin_unlock(&pool->lock);
945  				zhdr = NULL;
946  				continue;
947  			}
948  			list_del_init(&zhdr->buddy);
949  			zhdr->cpu = -1;
950  			spin_unlock(&pool->lock);
951  
952  			page = virt_to_page(zhdr);
953  			if (test_bit(NEEDS_COMPACTING, &page->private) ||
954  			    test_bit(PAGE_CLAIMED, &page->private)) {
955  				z3fold_page_unlock(zhdr);
956  				zhdr = NULL;
957  				if (can_sleep)
958  					cond_resched();
959  				continue;
960  			}
961  			kref_get(&zhdr->refcount);
962  			break;
963  		}
964  	}
965  
966  	if (zhdr && !zhdr->slots)
967  		zhdr->slots = alloc_slots(pool,
968  					can_sleep ? GFP_NOIO : GFP_ATOMIC);
969  	return zhdr;
970  }
971  
972  /*
973   * API Functions
974   */
975  
976  /**
977   * z3fold_create_pool() - create a new z3fold pool
978   * @name:	pool name
979   * @gfp:	gfp flags when allocating the z3fold pool structure
980   * @ops:	user-defined operations for the z3fold pool
981   *
982   * Return: pointer to the new z3fold pool or NULL if the metadata allocation
983   * failed.
984   */
985  static struct z3fold_pool *z3fold_create_pool(const char *name, gfp_t gfp,
986  		const struct z3fold_ops *ops)
987  {
988  	struct z3fold_pool *pool = NULL;
989  	int i, cpu;
990  
991  	pool = kzalloc(sizeof(struct z3fold_pool), gfp);
992  	if (!pool)
993  		goto out;
994  	pool->c_handle = kmem_cache_create("z3fold_handle",
995  				sizeof(struct z3fold_buddy_slots),
996  				SLOTS_ALIGN, 0, NULL);
997  	if (!pool->c_handle)
998  		goto out_c;
999  	spin_lock_init(&pool->lock);
1000  	spin_lock_init(&pool->stale_lock);
1001  	pool->unbuddied = __alloc_percpu(sizeof(struct list_head)*NCHUNKS, 2);
1002  	if (!pool->unbuddied)
1003  		goto out_pool;
1004  	for_each_possible_cpu(cpu) {
1005  		struct list_head *unbuddied =
1006  				per_cpu_ptr(pool->unbuddied, cpu);
1007  		for_each_unbuddied_list(i, 0)
1008  			INIT_LIST_HEAD(&unbuddied[i]);
1009  	}
1010  	INIT_LIST_HEAD(&pool->lru);
1011  	INIT_LIST_HEAD(&pool->stale);
1012  	atomic64_set(&pool->pages_nr, 0);
1013  	pool->name = name;
1014  	pool->compact_wq = create_singlethread_workqueue(pool->name);
1015  	if (!pool->compact_wq)
1016  		goto out_unbuddied;
1017  	pool->release_wq = create_singlethread_workqueue(pool->name);
1018  	if (!pool->release_wq)
1019  		goto out_wq;
1020  	if (z3fold_register_migration(pool))
1021  		goto out_rwq;
1022  	INIT_WORK(&pool->work, free_pages_work);
1023  	pool->ops = ops;
1024  	return pool;
1025  
1026  out_rwq:
1027  	destroy_workqueue(pool->release_wq);
1028  out_wq:
1029  	destroy_workqueue(pool->compact_wq);
1030  out_unbuddied:
1031  	free_percpu(pool->unbuddied);
1032  out_pool:
1033  	kmem_cache_destroy(pool->c_handle);
1034  out_c:
1035  	kfree(pool);
1036  out:
1037  	return NULL;
1038  }
1039  
1040  /**
1041   * z3fold_destroy_pool() - destroys an existing z3fold pool
1042   * @pool:	the z3fold pool to be destroyed
1043   *
1044   * The pool should be emptied before this function is called.
1045   */
1046  static void z3fold_destroy_pool(struct z3fold_pool *pool)
1047  {
1048  	kmem_cache_destroy(pool->c_handle);
1049  
1050  	/*
1051  	 * We need to destroy pool->compact_wq before pool->release_wq,
1052  	 * as any pending work on pool->compact_wq will call
1053  	 * queue_work(pool->release_wq, &pool->work).
1054  	 *
1055  	 * There are still outstanding pages until both workqueues are drained,
1056  	 * so we cannot unregister migration until then.
1057  	 */
1058  
1059  	destroy_workqueue(pool->compact_wq);
1060  	destroy_workqueue(pool->release_wq);
1061  	z3fold_unregister_migration(pool);
1062  	kfree(pool);
1063  }
1064  
1065  /**
1066   * z3fold_alloc() - allocates a region of a given size
1067   * @pool:	z3fold pool from which to allocate
1068   * @size:	size in bytes of the desired allocation
1069   * @gfp:	gfp flags used if the pool needs to grow
1070   * @handle:	handle of the new allocation
1071   *
1072   * This function will attempt to find a free region in the pool large enough to
1073   * satisfy the allocation request.  A search of the unbuddied lists is
1074   * performed first. If no suitable free region is found, then a new page is
1075   * allocated and added to the pool to satisfy the request.
1076   *
1077   * gfp should not set __GFP_HIGHMEM as highmem pages cannot be used
1078   * as z3fold pool pages.
1079   *
1080   * Return: 0 if success and handle is set, otherwise -EINVAL if the size or
1081   * gfp arguments are invalid or -ENOMEM if the pool was unable to allocate
1082   * a new page.
1083   */
1084  static int z3fold_alloc(struct z3fold_pool *pool, size_t size, gfp_t gfp,
1085  			unsigned long *handle)
1086  {
1087  	int chunks = size_to_chunks(size);
1088  	struct z3fold_header *zhdr = NULL;
1089  	struct page *page = NULL;
1090  	enum buddy bud;
1091  	bool can_sleep = gfpflags_allow_blocking(gfp);
1092  
1093  	if (!size)
1094  		return -EINVAL;
1095  
1096  	if (size > PAGE_SIZE)
1097  		return -ENOSPC;
1098  
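	/*
	 * An object that would not leave room for the z3fold header plus at
	 * least one more free chunk gets a whole page to itself and is
	 * tracked as "headless".
	 */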
1099  	if (size > PAGE_SIZE - ZHDR_SIZE_ALIGNED - CHUNK_SIZE)
1100  		bud = HEADLESS;
1101  	else {
1102  retry:
1103  		zhdr = __z3fold_alloc(pool, size, can_sleep);
1104  		if (zhdr) {
1105  			bud = get_free_buddy(zhdr, chunks);
1106  			if (bud == HEADLESS) {
1107  				if (kref_put(&zhdr->refcount,
1108  					     release_z3fold_page_locked))
1109  					atomic64_dec(&pool->pages_nr);
1110  				else
1111  					z3fold_page_unlock(zhdr);
1112  				pr_err("No free chunks in unbuddied\n");
1113  				WARN_ON(1);
1114  				goto retry;
1115  			}
1116  			page = virt_to_page(zhdr);
1117  			goto found;
1118  		}
1119  		bud = FIRST;
1120  	}
1121  
1122  	page = NULL;
1123  	if (can_sleep) {
1124  		spin_lock(&pool->stale_lock);
1125  		zhdr = list_first_entry_or_null(&pool->stale,
1126  						struct z3fold_header, buddy);
1127  		/*
1128  		 * Before allocating a page, let's see if we can take one from
1129  		 * the stale pages list. cancel_work_sync() can sleep so we
1130  		 * limit this case to the contexts where we can sleep
1131  		 */
1132  		if (zhdr) {
1133  			list_del(&zhdr->buddy);
1134  			spin_unlock(&pool->stale_lock);
1135  			cancel_work_sync(&zhdr->work);
1136  			page = virt_to_page(zhdr);
1137  		} else {
1138  			spin_unlock(&pool->stale_lock);
1139  		}
1140  	}
1141  	if (!page)
1142  		page = alloc_page(gfp);
1143  
1144  	if (!page)
1145  		return -ENOMEM;
1146  
1147  	zhdr = init_z3fold_page(page, bud == HEADLESS, pool, gfp);
1148  	if (!zhdr) {
1149  		__free_page(page);
1150  		return -ENOMEM;
1151  	}
1152  	atomic64_inc(&pool->pages_nr);
1153  
1154  	if (bud == HEADLESS) {
1155  		set_bit(PAGE_HEADLESS, &page->private);
1156  		goto headless;
1157  	}
1158  	if (can_sleep) {
1159  		lock_page(page);
1160  		__SetPageMovable(page, pool->inode->i_mapping);
1161  		unlock_page(page);
1162  	} else {
1163  		if (trylock_page(page)) {
1164  			__SetPageMovable(page, pool->inode->i_mapping);
1165  			unlock_page(page);
1166  		}
1167  	}
1168  	z3fold_page_lock(zhdr);
1169  
1170  found:
1171  	if (bud == FIRST)
1172  		zhdr->first_chunks = chunks;
1173  	else if (bud == LAST)
1174  		zhdr->last_chunks = chunks;
1175  	else {
1176  		zhdr->middle_chunks = chunks;
1177  		zhdr->start_middle = zhdr->first_chunks + ZHDR_CHUNKS;
1178  	}
1179  	add_to_unbuddied(pool, zhdr);
1180  
1181  headless:
1182  	spin_lock(&pool->lock);
1183  	/* Add/move z3fold page to beginning of LRU */
1184  	if (!list_empty(&page->lru))
1185  		list_del(&page->lru);
1186  
1187  	list_add(&page->lru, &pool->lru);
1188  
1189  	*handle = encode_handle(zhdr, bud);
1190  	spin_unlock(&pool->lock);
1191  	if (bud != HEADLESS)
1192  		z3fold_page_unlock(zhdr);
1193  
1194  	return 0;
1195  }
1196  
1197  /**
1198   * z3fold_free() - frees the allocation associated with the given handle
1199   * @pool:	pool in which the allocation resided
1200   * @handle:	handle associated with the allocation returned by z3fold_alloc()
1201   *
1202   * In the case that the z3fold page in which the allocation resides is under
 * reclaim, as indicated by the PAGE_CLAIMED flag being set, this function
 * only sets the corresponding first|middle|last_chunks to 0.  The page is
 * actually freed once all buddies are evicted (see z3fold_reclaim_page()
 * below).
1206   */
1207  static void z3fold_free(struct z3fold_pool *pool, unsigned long handle)
1208  {
1209  	struct z3fold_header *zhdr;
1210  	struct page *page;
1211  	enum buddy bud;
1212  	bool page_claimed;
1213  
1214  	zhdr = get_z3fold_header(handle);
1215  	page = virt_to_page(zhdr);
1216  	page_claimed = test_and_set_bit(PAGE_CLAIMED, &page->private);
1217  
1218  	if (test_bit(PAGE_HEADLESS, &page->private)) {
1219  		/* if a headless page is under reclaim, just leave.
1220  		 * NB: we use test_and_set_bit for a reason: if the bit
1221  		 * has not been set before, we release this page
1222  		 * immediately so we don't care about its value any more.
1223  		 */
1224  		if (!page_claimed) {
1225  			spin_lock(&pool->lock);
1226  			list_del(&page->lru);
1227  			spin_unlock(&pool->lock);
1228  			put_z3fold_header(zhdr);
1229  			free_z3fold_page(page, true);
1230  			atomic64_dec(&pool->pages_nr);
1231  		}
1232  		return;
1233  	}
1234  
1235  	/* Non-headless case */
1236  	bud = handle_to_buddy(handle);
1237  
1238  	switch (bud) {
1239  	case FIRST:
1240  		zhdr->first_chunks = 0;
1241  		break;
1242  	case MIDDLE:
1243  		zhdr->middle_chunks = 0;
1244  		break;
1245  	case LAST:
1246  		zhdr->last_chunks = 0;
1247  		break;
1248  	default:
1249  		pr_err("%s: unknown bud %d\n", __func__, bud);
1250  		WARN_ON(1);
1251  		put_z3fold_header(zhdr);
1252  		return;
1253  	}
1254  
1255  	if (!page_claimed)
1256  		free_handle(handle, zhdr);
1257  	if (kref_put(&zhdr->refcount, release_z3fold_page_locked_list)) {
1258  		atomic64_dec(&pool->pages_nr);
1259  		return;
1260  	}
1261  	if (page_claimed) {
1262  		/* the page has not been claimed by us */
1263  		z3fold_page_unlock(zhdr);
1264  		return;
1265  	}
1266  	if (test_and_set_bit(NEEDS_COMPACTING, &page->private)) {
1267  		put_z3fold_header(zhdr);
1268  		clear_bit(PAGE_CLAIMED, &page->private);
1269  		return;
1270  	}
1271  	if (zhdr->cpu < 0 || !cpu_online(zhdr->cpu)) {
1272  		spin_lock(&pool->lock);
1273  		list_del_init(&zhdr->buddy);
1274  		spin_unlock(&pool->lock);
1275  		zhdr->cpu = -1;
1276  		kref_get(&zhdr->refcount);
1277  		clear_bit(PAGE_CLAIMED, &page->private);
1278  		do_compact_page(zhdr, true);
1279  		return;
1280  	}
1281  	kref_get(&zhdr->refcount);
1282  	clear_bit(PAGE_CLAIMED, &page->private);
1283  	queue_work_on(zhdr->cpu, pool->compact_wq, &zhdr->work);
1284  	put_z3fold_header(zhdr);
1285  }
1286  
1287  /**
1288   * z3fold_reclaim_page() - evicts allocations from a pool page and frees it
1289   * @pool:	pool from which a page will attempt to be evicted
1290   * @retries:	number of pages on the LRU list for which eviction will
1291   *		be attempted before failing
1292   *
1293   * z3fold reclaim is different from normal system reclaim in that it is done
1294   * from the bottom, up. This is because only the bottom layer, z3fold, has
1295   * information on how the allocations are organized within each z3fold page.
1296   * This has the potential to create interesting locking situations between
1297   * z3fold and the user, however.
1298   *
1299   * To avoid these, this is how z3fold_reclaim_page() should be called:
1300   *
1301   * The user detects a page should be reclaimed and calls z3fold_reclaim_page().
1302   * z3fold_reclaim_page() will remove a z3fold page from the pool LRU list and
1303   * call the user-defined eviction handler with the pool and handle as
1304   * arguments.
1305   *
1306   * If the handle can not be evicted, the eviction handler should return
1307   * non-zero. z3fold_reclaim_page() will add the z3fold page back to the
1308   * appropriate list and try the next z3fold page on the LRU up to
1309   * a user defined number of retries.
1310   *
1311   * If the handle is successfully evicted, the eviction handler should
1312   * return 0 _and_ should have called z3fold_free() on the handle. z3fold_free()
 * contains logic to delay freeing the page if the page is under reclaim,
 * as indicated by the PAGE_CLAIMED flag being set on the underlying page.
1315   *
1316   * If all buddies in the z3fold page are successfully evicted, then the
1317   * z3fold page can be freed.
1318   *
1319   * Returns: 0 if page is successfully freed, otherwise -EINVAL if there are
1320   * no pages to evict or an eviction handler is not registered, -EAGAIN if
1321   * the retry limit was hit.
1322   */
1323  static int z3fold_reclaim_page(struct z3fold_pool *pool, unsigned int retries)
1324  {
1325  	int i, ret = -1;
1326  	struct z3fold_header *zhdr = NULL;
1327  	struct page *page = NULL;
1328  	struct list_head *pos;
1329  	unsigned long first_handle = 0, middle_handle = 0, last_handle = 0;
1330  	struct z3fold_buddy_slots slots __attribute__((aligned(SLOTS_ALIGN)));
1331  
1332  	rwlock_init(&slots.lock);
1333  	slots.pool = (unsigned long)pool | (1 << HANDLES_NOFREE);
1334  
1335  	spin_lock(&pool->lock);
1336  	if (!pool->ops || !pool->ops->evict || retries == 0) {
1337  		spin_unlock(&pool->lock);
1338  		return -EINVAL;
1339  	}
1340  	for (i = 0; i < retries; i++) {
1341  		if (list_empty(&pool->lru)) {
1342  			spin_unlock(&pool->lock);
1343  			return -EINVAL;
1344  		}
1345  		list_for_each_prev(pos, &pool->lru) {
1346  			page = list_entry(pos, struct page, lru);
1347  
1348  			zhdr = page_address(page);
1349  			if (test_bit(PAGE_HEADLESS, &page->private))
1350  				break;
1351  
1352  			if (kref_get_unless_zero(&zhdr->refcount) == 0) {
1353  				zhdr = NULL;
1354  				break;
1355  			}
1356  			if (!z3fold_page_trylock(zhdr)) {
1357  				if (kref_put(&zhdr->refcount,
1358  						release_z3fold_page))
1359  					atomic64_dec(&pool->pages_nr);
1360  				zhdr = NULL;
1361  				continue; /* can't evict at this point */
1362  			}
1363  
1364  			/* test_and_set_bit is of course atomic, but we still
1365  			 * need to do it under page lock, otherwise checking
1366  			 * that bit in __z3fold_alloc wouldn't make sense
1367  			 */
1368  			if (zhdr->foreign_handles ||
1369  			    test_and_set_bit(PAGE_CLAIMED, &page->private)) {
1370  				if (kref_put(&zhdr->refcount,
1371  						release_z3fold_page))
1372  					atomic64_dec(&pool->pages_nr);
1373  				else
1374  					z3fold_page_unlock(zhdr);
1375  				zhdr = NULL;
1376  				continue; /* can't evict such page */
1377  			}
1378  			list_del_init(&zhdr->buddy);
1379  			zhdr->cpu = -1;
1380  			break;
1381  		}
1382  
1383  		if (!zhdr)
1384  			break;
1385  
1386  		list_del_init(&page->lru);
1387  		spin_unlock(&pool->lock);
1388  
1389  		if (!test_bit(PAGE_HEADLESS, &page->private)) {
1390  			/*
			 * We need to encode the handles before unlocking, and
1392  			 * use our local slots structure because z3fold_free
1393  			 * can zero out zhdr->slots and we can't do much
1394  			 * about that
1395  			 */
1396  			first_handle = 0;
1397  			last_handle = 0;
1398  			middle_handle = 0;
1399  			memset(slots.slot, 0, sizeof(slots.slot));
1400  			if (zhdr->first_chunks)
1401  				first_handle = __encode_handle(zhdr, &slots,
1402  								FIRST);
1403  			if (zhdr->middle_chunks)
1404  				middle_handle = __encode_handle(zhdr, &slots,
1405  								MIDDLE);
1406  			if (zhdr->last_chunks)
1407  				last_handle = __encode_handle(zhdr, &slots,
1408  								LAST);
1409  			/*
1410  			 * it's safe to unlock here because we hold a
1411  			 * reference to this page
1412  			 */
1413  			z3fold_page_unlock(zhdr);
1414  		} else {
1415  			first_handle = encode_handle(zhdr, HEADLESS);
1416  			last_handle = middle_handle = 0;
1417  		}
1418  		/* Issue the eviction callback(s) */
1419  		if (middle_handle) {
1420  			ret = pool->ops->evict(pool, middle_handle);
1421  			if (ret)
1422  				goto next;
1423  		}
1424  		if (first_handle) {
1425  			ret = pool->ops->evict(pool, first_handle);
1426  			if (ret)
1427  				goto next;
1428  		}
1429  		if (last_handle) {
1430  			ret = pool->ops->evict(pool, last_handle);
1431  			if (ret)
1432  				goto next;
1433  		}
1434  next:
1435  		if (test_bit(PAGE_HEADLESS, &page->private)) {
1436  			if (ret == 0) {
1437  				free_z3fold_page(page, true);
1438  				atomic64_dec(&pool->pages_nr);
1439  				return 0;
1440  			}
1441  			spin_lock(&pool->lock);
1442  			list_add(&page->lru, &pool->lru);
1443  			spin_unlock(&pool->lock);
1444  			clear_bit(PAGE_CLAIMED, &page->private);
1445  		} else {
1446  			struct z3fold_buddy_slots *slots = zhdr->slots;
1447  			z3fold_page_lock(zhdr);
1448  			if (kref_put(&zhdr->refcount,
1449  					release_z3fold_page_locked)) {
1450  				kmem_cache_free(pool->c_handle, slots);
1451  				atomic64_dec(&pool->pages_nr);
1452  				return 0;
1453  			}
1454  			/*
			 * If we are here, the page is still not completely
			 * free, so take the global pool lock to be able to
			 * add it back to the LRU list.
1458  			 */
1459  			spin_lock(&pool->lock);
1460  			list_add(&page->lru, &pool->lru);
1461  			spin_unlock(&pool->lock);
1462  			z3fold_page_unlock(zhdr);
1463  			clear_bit(PAGE_CLAIMED, &page->private);
1464  		}
1465  
		/* We started off locked, so we need to take the pool lock again */
1467  		spin_lock(&pool->lock);
1468  	}
1469  	spin_unlock(&pool->lock);
1470  	return -EAGAIN;
1471  }
1472  
1473  /**
1474   * z3fold_map() - maps the allocation associated with the given handle
1475   * @pool:	pool in which the allocation resides
1476   * @handle:	handle associated with the allocation to be mapped
1477   *
1478   * Extracts the buddy number from handle and constructs the pointer to the
1479   * correct starting chunk within the page.
1480   *
1481   * Returns: a pointer to the mapped allocation
1482   */
1483  static void *z3fold_map(struct z3fold_pool *pool, unsigned long handle)
1484  {
1485  	struct z3fold_header *zhdr;
1486  	struct page *page;
1487  	void *addr;
1488  	enum buddy buddy;
1489  
1490  	zhdr = get_z3fold_header(handle);
1491  	addr = zhdr;
1492  	page = virt_to_page(zhdr);
1493  
1494  	if (test_bit(PAGE_HEADLESS, &page->private))
1495  		goto out;
1496  
1497  	buddy = handle_to_buddy(handle);
1498  	switch (buddy) {
1499  	case FIRST:
1500  		addr += ZHDR_SIZE_ALIGNED;
1501  		break;
1502  	case MIDDLE:
1503  		addr += zhdr->start_middle << CHUNK_SHIFT;
1504  		set_bit(MIDDLE_CHUNK_MAPPED, &page->private);
1505  		break;
1506  	case LAST:
1507  		addr += PAGE_SIZE - (handle_to_chunks(handle) << CHUNK_SHIFT);
1508  		break;
1509  	default:
1510  		pr_err("unknown buddy id %d\n", buddy);
1511  		WARN_ON(1);
1512  		addr = NULL;
1513  		break;
1514  	}
1515  
1516  	if (addr)
1517  		zhdr->mapped_count++;
1518  out:
1519  	put_z3fold_header(zhdr);
1520  	return addr;
1521  }
1522  
1523  /**
1524   * z3fold_unmap() - unmaps the allocation associated with the given handle
1525   * @pool:	pool in which the allocation resides
1526   * @handle:	handle associated with the allocation to be unmapped
1527   */
1528  static void z3fold_unmap(struct z3fold_pool *pool, unsigned long handle)
1529  {
1530  	struct z3fold_header *zhdr;
1531  	struct page *page;
1532  	enum buddy buddy;
1533  
1534  	zhdr = get_z3fold_header(handle);
1535  	page = virt_to_page(zhdr);
1536  
1537  	if (test_bit(PAGE_HEADLESS, &page->private))
1538  		return;
1539  
1540  	buddy = handle_to_buddy(handle);
1541  	if (buddy == MIDDLE)
1542  		clear_bit(MIDDLE_CHUNK_MAPPED, &page->private);
1543  	zhdr->mapped_count--;
1544  	put_z3fold_header(zhdr);
1545  }
1546  
1547  /**
1548   * z3fold_get_pool_size() - gets the z3fold pool size in pages
1549   * @pool:	pool whose size is being queried
1550   *
1551   * Returns: size in pages of the given pool.
1552   */
1553  static u64 z3fold_get_pool_size(struct z3fold_pool *pool)
1554  {
1555  	return atomic64_read(&pool->pages_nr);
1556  }
1557  
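/*
 * Movable-page callbacks, wired up via z3fold_aops below.  A z3fold page may
 * only be isolated for migration when none of its objects are currently
 * mapped and it holds no foreign handles.
 */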
1558  static bool z3fold_page_isolate(struct page *page, isolate_mode_t mode)
1559  {
1560  	struct z3fold_header *zhdr;
1561  	struct z3fold_pool *pool;
1562  
1563  	VM_BUG_ON_PAGE(!PageMovable(page), page);
1564  	VM_BUG_ON_PAGE(PageIsolated(page), page);
1565  
1566  	if (test_bit(PAGE_HEADLESS, &page->private))
1567  		return false;
1568  
1569  	zhdr = page_address(page);
1570  	z3fold_page_lock(zhdr);
1571  	if (test_bit(NEEDS_COMPACTING, &page->private) ||
1572  	    test_bit(PAGE_STALE, &page->private))
1573  		goto out;
1574  
1575  	if (zhdr->mapped_count != 0 || zhdr->foreign_handles != 0)
1576  		goto out;
1577  
1578  	if (test_and_set_bit(PAGE_CLAIMED, &page->private))
1579  		goto out;
1580  	pool = zhdr_to_pool(zhdr);
1581  	spin_lock(&pool->lock);
1582  	if (!list_empty(&zhdr->buddy))
1583  		list_del_init(&zhdr->buddy);
1584  	if (!list_empty(&page->lru))
1585  		list_del_init(&page->lru);
1586  	spin_unlock(&pool->lock);
1587  
1588  	kref_get(&zhdr->refcount);
1589  	z3fold_page_unlock(zhdr);
1590  	return true;
1591  
1592  out:
1593  	z3fold_page_unlock(zhdr);
1594  	return false;
1595  }
1596  
1597  static int z3fold_page_migrate(struct address_space *mapping, struct page *newpage,
1598  			       struct page *page, enum migrate_mode mode)
1599  {
1600  	struct z3fold_header *zhdr, *new_zhdr;
1601  	struct z3fold_pool *pool;
1602  	struct address_space *new_mapping;
1603  
1604  	VM_BUG_ON_PAGE(!PageMovable(page), page);
1605  	VM_BUG_ON_PAGE(!PageIsolated(page), page);
1606  	VM_BUG_ON_PAGE(!test_bit(PAGE_CLAIMED, &page->private), page);
1607  	VM_BUG_ON_PAGE(!PageLocked(newpage), newpage);
1608  
1609  	zhdr = page_address(page);
1610  	pool = zhdr_to_pool(zhdr);
1611  
1612  	if (!z3fold_page_trylock(zhdr))
1613  		return -EAGAIN;
1614  	if (zhdr->mapped_count != 0 || zhdr->foreign_handles != 0) {
1615  		z3fold_page_unlock(zhdr);
1616  		clear_bit(PAGE_CLAIMED, &page->private);
1617  		return -EBUSY;
1618  	}
1619  	if (work_pending(&zhdr->work)) {
1620  		z3fold_page_unlock(zhdr);
1621  		return -EAGAIN;
1622  	}
1623  	new_zhdr = page_address(newpage);
1624  	memcpy(new_zhdr, zhdr, PAGE_SIZE);
1625  	newpage->private = page->private;
1626  	page->private = 0;
1627  	z3fold_page_unlock(zhdr);
1628  	spin_lock_init(&new_zhdr->page_lock);
1629  	INIT_WORK(&new_zhdr->work, compact_page_work);
1630  	/*
1631  	 * z3fold_page_isolate() ensures that new_zhdr->buddy is empty,
1632  	 * so we only have to reinitialize it.
1633  	 */
1634  	INIT_LIST_HEAD(&new_zhdr->buddy);
1635  	new_mapping = page_mapping(page);
1636  	__ClearPageMovable(page);
1637  	ClearPagePrivate(page);
1638  
1639  	get_page(newpage);
1640  	z3fold_page_lock(new_zhdr);
1641  	if (new_zhdr->first_chunks)
1642  		encode_handle(new_zhdr, FIRST);
1643  	if (new_zhdr->last_chunks)
1644  		encode_handle(new_zhdr, LAST);
1645  	if (new_zhdr->middle_chunks)
1646  		encode_handle(new_zhdr, MIDDLE);
1647  	set_bit(NEEDS_COMPACTING, &newpage->private);
1648  	new_zhdr->cpu = smp_processor_id();
1649  	spin_lock(&pool->lock);
1650  	list_add(&newpage->lru, &pool->lru);
1651  	spin_unlock(&pool->lock);
1652  	__SetPageMovable(newpage, new_mapping);
1653  	z3fold_page_unlock(new_zhdr);
1654  
1655  	queue_work_on(new_zhdr->cpu, pool->compact_wq, &new_zhdr->work);
1656  
1657  	page_mapcount_reset(page);
1658  	clear_bit(PAGE_CLAIMED, &page->private);
1659  	put_page(page);
1660  	return 0;
1661  }
1662  
1663  static void z3fold_page_putback(struct page *page)
1664  {
1665  	struct z3fold_header *zhdr;
1666  	struct z3fold_pool *pool;
1667  
1668  	zhdr = page_address(page);
1669  	pool = zhdr_to_pool(zhdr);
1670  
1671  	z3fold_page_lock(zhdr);
1672  	if (!list_empty(&zhdr->buddy))
1673  		list_del_init(&zhdr->buddy);
1674  	INIT_LIST_HEAD(&page->lru);
1675  	if (kref_put(&zhdr->refcount, release_z3fold_page_locked)) {
1676  		atomic64_dec(&pool->pages_nr);
1677  		return;
1678  	}
1679  	spin_lock(&pool->lock);
1680  	list_add(&page->lru, &pool->lru);
1681  	spin_unlock(&pool->lock);
1682  	clear_bit(PAGE_CLAIMED, &page->private);
1683  	z3fold_page_unlock(zhdr);
1684  }
1685  
1686  static const struct address_space_operations z3fold_aops = {
1687  	.isolate_page = z3fold_page_isolate,
1688  	.migratepage = z3fold_page_migrate,
1689  	.putback_page = z3fold_page_putback,
1690  };
1691  
1692  /*****************
1693   * zpool
1694   ****************/
1695  
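/*
 * Thin wrappers exposing the z3fold pool through the generic zpool API;
 * they are registered below as the "z3fold" zpool driver.
 */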
1696  static int z3fold_zpool_evict(struct z3fold_pool *pool, unsigned long handle)
1697  {
1698  	if (pool->zpool && pool->zpool_ops && pool->zpool_ops->evict)
1699  		return pool->zpool_ops->evict(pool->zpool, handle);
1700  	else
1701  		return -ENOENT;
1702  }
1703  
1704  static const struct z3fold_ops z3fold_zpool_ops = {
1705  	.evict =	z3fold_zpool_evict
1706  };
1707  
1708  static void *z3fold_zpool_create(const char *name, gfp_t gfp,
1709  			       const struct zpool_ops *zpool_ops,
1710  			       struct zpool *zpool)
1711  {
1712  	struct z3fold_pool *pool;
1713  
1714  	pool = z3fold_create_pool(name, gfp,
1715  				zpool_ops ? &z3fold_zpool_ops : NULL);
1716  	if (pool) {
1717  		pool->zpool = zpool;
1718  		pool->zpool_ops = zpool_ops;
1719  	}
1720  	return pool;
1721  }
1722  
1723  static void z3fold_zpool_destroy(void *pool)
1724  {
1725  	z3fold_destroy_pool(pool);
1726  }
1727  
1728  static int z3fold_zpool_malloc(void *pool, size_t size, gfp_t gfp,
1729  			unsigned long *handle)
1730  {
1731  	return z3fold_alloc(pool, size, gfp, handle);
1732  }
1733  static void z3fold_zpool_free(void *pool, unsigned long handle)
1734  {
1735  	z3fold_free(pool, handle);
1736  }
1737  
1738  static int z3fold_zpool_shrink(void *pool, unsigned int pages,
1739  			unsigned int *reclaimed)
1740  {
1741  	unsigned int total = 0;
1742  	int ret = -EINVAL;
1743  
1744  	while (total < pages) {
1745  		ret = z3fold_reclaim_page(pool, 8);
1746  		if (ret < 0)
1747  			break;
1748  		total++;
1749  	}
1750  
1751  	if (reclaimed)
1752  		*reclaimed = total;
1753  
1754  	return ret;
1755  }
1756  
1757  static void *z3fold_zpool_map(void *pool, unsigned long handle,
1758  			enum zpool_mapmode mm)
1759  {
1760  	return z3fold_map(pool, handle);
1761  }
1762  static void z3fold_zpool_unmap(void *pool, unsigned long handle)
1763  {
1764  	z3fold_unmap(pool, handle);
1765  }
1766  
1767  static u64 z3fold_zpool_total_size(void *pool)
1768  {
1769  	return z3fold_get_pool_size(pool) * PAGE_SIZE;
1770  }
1771  
1772  static struct zpool_driver z3fold_zpool_driver = {
1773  	.type =		"z3fold",
1774  	.sleep_mapped = true,
1775  	.owner =	THIS_MODULE,
1776  	.create =	z3fold_zpool_create,
1777  	.destroy =	z3fold_zpool_destroy,
1778  	.malloc =	z3fold_zpool_malloc,
1779  	.free =		z3fold_zpool_free,
1780  	.shrink =	z3fold_zpool_shrink,
1781  	.map =		z3fold_zpool_map,
1782  	.unmap =	z3fold_zpool_unmap,
1783  	.total_size =	z3fold_zpool_total_size,
1784  };
1785  
1786  MODULE_ALIAS("zpool-z3fold");
1787  
1788  static int __init init_z3fold(void)
1789  {
1790  	int ret;
1791  
1792  	/* Make sure the z3fold header is not larger than the page size */
1793  	BUILD_BUG_ON(ZHDR_SIZE_ALIGNED > PAGE_SIZE);
1794  	ret = z3fold_mount();
1795  	if (ret)
1796  		return ret;
1797  
1798  	zpool_register_driver(&z3fold_zpool_driver);
1799  
1800  	return 0;
1801  }
1802  
1803  static void __exit exit_z3fold(void)
1804  {
1805  	z3fold_unmount();
1806  	zpool_unregister_driver(&z3fold_zpool_driver);
1807  }
1808  
1809  module_init(init_z3fold);
1810  module_exit(exit_z3fold);
1811  
1812  MODULE_LICENSE("GPL");
1813  MODULE_AUTHOR("Vitaly Wool <vitalywool@gmail.com>");
1814  MODULE_DESCRIPTION("3-Fold Allocator for Compressed Pages");
1815