xref: /openbmc/linux/mm/zswap.c (revision 70d49bbf)
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * zswap.c - zswap driver file
4  *
5  * zswap is a backend for frontswap that takes pages that are in the process
6  * of being swapped out and attempts to compress and store them in a
7  * RAM-based memory pool.  This can result in a significant I/O reduction on
8  * the swap device and, in the case where decompressing from RAM is faster
9  * than reading from the swap device, can also improve workload performance.
10  *
11  * Copyright (C) 2012  Seth Jennings <sjenning@linux.vnet.ibm.com>
12 */
13 
14 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
15 
16 #include <linux/module.h>
17 #include <linux/cpu.h>
18 #include <linux/highmem.h>
19 #include <linux/slab.h>
20 #include <linux/spinlock.h>
21 #include <linux/types.h>
22 #include <linux/atomic.h>
23 #include <linux/frontswap.h>
24 #include <linux/rbtree.h>
25 #include <linux/swap.h>
26 #include <linux/crypto.h>
27 #include <linux/scatterlist.h>
28 #include <linux/mempool.h>
29 #include <linux/zpool.h>
30 #include <crypto/acompress.h>
31 
32 #include <linux/mm_types.h>
33 #include <linux/page-flags.h>
34 #include <linux/swapops.h>
35 #include <linux/writeback.h>
36 #include <linux/pagemap.h>
37 #include <linux/workqueue.h>
38 
39 #include "swap.h"
40 #include "internal.h"
41 
42 /*********************************
43 * statistics
44 **********************************/
45 /* Total bytes used by the compressed storage */
46 u64 zswap_pool_total_size;
47 /* The number of compressed pages currently stored in zswap */
48 atomic_t zswap_stored_pages = ATOMIC_INIT(0);
49 /* The number of same-value filled pages currently stored in zswap */
50 static atomic_t zswap_same_filled_pages = ATOMIC_INIT(0);
51 
/*
 * The statistics below are not protected from concurrent access for
 * performance reasons, so they may not be 100% accurate.  However,
 * they do provide useful information on roughly how many times a
 * certain event is occurring.
 */
58 
59 /* Pool limit was hit (see zswap_max_pool_percent) */
60 static u64 zswap_pool_limit_hit;
61 /* Pages written back when pool limit was reached */
62 static u64 zswap_written_back_pages;
63 /* Store failed due to a reclaim failure after pool limit was reached */
64 static u64 zswap_reject_reclaim_fail;
65 /* Compressed page was too big for the allocator to (optimally) store */
66 static u64 zswap_reject_compress_poor;
67 /* Store failed because underlying allocator could not get memory */
68 static u64 zswap_reject_alloc_fail;
69 /* Store failed because the entry metadata could not be allocated (rare) */
70 static u64 zswap_reject_kmemcache_fail;
71 /* Duplicate store was encountered (rare) */
72 static u64 zswap_duplicate_entry;
73 
74 /* Shrinker work queue */
75 static struct workqueue_struct *shrink_wq;
76 /* Pool limit was hit, we need to calm down */
77 static bool zswap_pool_reached_full;
78 
79 /*********************************
80 * tunables
81 **********************************/
82 
83 #define ZSWAP_PARAM_UNSET ""
84 
85 static int zswap_setup(void);
86 
87 /* Enable/disable zswap */
88 static bool zswap_enabled = IS_ENABLED(CONFIG_ZSWAP_DEFAULT_ON);
89 static int zswap_enabled_param_set(const char *,
90 				   const struct kernel_param *);
91 static const struct kernel_param_ops zswap_enabled_param_ops = {
92 	.set =		zswap_enabled_param_set,
93 	.get =		param_get_bool,
94 };
95 module_param_cb(enabled, &zswap_enabled_param_ops, &zswap_enabled, 0644);
96 
97 /* Crypto compressor to use */
98 static char *zswap_compressor = CONFIG_ZSWAP_COMPRESSOR_DEFAULT;
99 static int zswap_compressor_param_set(const char *,
100 				      const struct kernel_param *);
101 static const struct kernel_param_ops zswap_compressor_param_ops = {
102 	.set =		zswap_compressor_param_set,
103 	.get =		param_get_charp,
104 	.free =		param_free_charp,
105 };
106 module_param_cb(compressor, &zswap_compressor_param_ops,
107 		&zswap_compressor, 0644);
108 
109 /* Compressed storage zpool to use */
110 static char *zswap_zpool_type = CONFIG_ZSWAP_ZPOOL_DEFAULT;
111 static int zswap_zpool_param_set(const char *, const struct kernel_param *);
112 static const struct kernel_param_ops zswap_zpool_param_ops = {
113 	.set =		zswap_zpool_param_set,
114 	.get =		param_get_charp,
115 	.free =		param_free_charp,
116 };
117 module_param_cb(zpool, &zswap_zpool_param_ops, &zswap_zpool_type, 0644);
118 
119 /* The maximum percentage of memory that the compressed pool can occupy */
120 static unsigned int zswap_max_pool_percent = 20;
121 module_param_named(max_pool_percent, zswap_max_pool_percent, uint, 0644);
122 
123 /* The threshold for accepting new pages after the max_pool_percent was hit */
124 static unsigned int zswap_accept_thr_percent = 90; /* of max pool size */
125 module_param_named(accept_threshold_percent, zswap_accept_thr_percent,
126 		   uint, 0644);
127 
128 /*
129  * Enable/disable handling same-value filled pages (enabled by default).
130  * If disabled every page is considered non-same-value filled.
131  */
132 static bool zswap_same_filled_pages_enabled = true;
133 module_param_named(same_filled_pages_enabled, zswap_same_filled_pages_enabled,
134 		   bool, 0644);
135 
136 /* Enable/disable handling non-same-value filled pages (enabled by default) */
137 static bool zswap_non_same_filled_pages_enabled = true;
138 module_param_named(non_same_filled_pages_enabled, zswap_non_same_filled_pages_enabled,
139 		   bool, 0644);
140 
/*
 * Enable/disable exclusive loads.  The default comes from
 * CONFIG_ZSWAP_EXCLUSIVE_LOADS_DEFAULT_ON and the value is exposed as the
 * "exclusive_loads" module parameter (mode 0644).
 * NOTE(review): the load path that consumes this flag is outside this
 * view - presumably a loaded page is dropped from zswap; confirm there.
 */
static bool zswap_exclusive_loads_enabled = IS_ENABLED(
		CONFIG_ZSWAP_EXCLUSIVE_LOADS_DEFAULT_ON);
module_param_named(exclusive_loads, zswap_exclusive_loads_enabled, bool, 0644);
144 
145 /*********************************
146 * data structures
147 **********************************/
148 
/*
 * struct crypto_acomp_ctx
 *
 * Per-CPU compression context of a zswap_pool, filled in by
 * zswap_cpu_comp_prepare().
 *
 * acomp - compression transform allocated for the pool's tfm_name
 * req - request allocated from acomp
 * wait - wait object handed to the request's crypto_req_done callback
 * dstmem - this CPU's zswap_dstmem buffer
 * mutex - this CPU's zswap_mutex
 */
struct crypto_acomp_ctx {
	struct crypto_acomp *acomp;
	struct acomp_req *req;
	struct crypto_wait wait;
	u8 *dstmem;
	struct mutex *mutex;
};
156 
/*
 * struct zswap_pool
 *
 * zpool - backing allocator created by zpool_create_pool()
 * acomp_ctx - per-CPU compression contexts
 * kref - reference count; being the current pool holds one reference
 * list - links the pool into the zswap_pools list
 * release_work - deferred destruction, runs __zswap_pool_release()
 * shrink_work - deferred reclaim, runs shrink_worker()
 * node - instance node registered for CPUHP_MM_ZSWP_POOL_PREPARE
 * tfm_name - name of the compressor used by this pool
 * lru - entries stored in this pool; reclaim takes from the tail
 * lru_lock - protects lru
 *
 * The lock ordering is zswap_tree.lock -> zswap_pool.lru_lock.
 * The only case where lru_lock is not acquired while holding tree.lock is
 * when a zswap_entry is taken off the lru for writeback, in that case it
 * needs to be verified that it's still valid in the tree.
 */
struct zswap_pool {
	struct zpool *zpool;
	struct crypto_acomp_ctx __percpu *acomp_ctx;
	struct kref kref;
	struct list_head list;
	struct work_struct release_work;
	struct work_struct shrink_work;
	struct hlist_node node;
	char tfm_name[CRYPTO_MAX_ALG_NAME];
	struct list_head lru;
	spinlock_t lru_lock;
};
175 
/*
 * struct zswap_entry
 *
 * This structure contains the metadata for tracking a single compressed
 * page within zswap.
 *
 * rbnode - links the entry into red-black tree for the appropriate swap type
 * swpentry - the swap entry of the stored page.  Its swap offset is the
 *            index into the red-black tree.
 * refcount - the number of outstanding references to the entry. This is
 *            needed to protect against premature freeing of the entry by
 *            concurrent calls to load, invalidate, and writeback.  The lock
 *            for the zswap_tree structure that contains the entry must
 *            be held while changing the refcount.  Since the lock must
 *            be held, there is no reason to also make refcount atomic.
 * length - the length in bytes of the compressed page data.  Needed during
 *          decompression. For a same value filled page length is 0, and both
 *          pool and lru are invalid and must be ignored.
 * pool - the zswap_pool the entry's data is in
 * handle - zpool allocation handle that stores the compressed page data
 * value - value of the same-value filled pages which have same content
 *         (shares storage with handle)
 * objcg - the obj_cgroup charged for the entry, or NULL.  It is uncharged
 *         and put when the entry is freed.
 * lru - handle to the pool's lru used to evict pages.
 */
struct zswap_entry {
	struct rb_node rbnode;
	swp_entry_t swpentry;
	int refcount;
	unsigned int length;
	struct zswap_pool *pool;
	union {
		unsigned long handle;
		unsigned long value;
	};
	struct obj_cgroup *objcg;
	struct list_head lru;
};
211 
/*
 * struct zswap_tree
 *
 * Red-black tree of zswap entries for one swap type; the trees are kept in
 * zswap_trees[], indexed by swap type.
 *
 * The tree lock in the zswap_tree struct protects a few things:
 * - the rbtree
 * - the refcount field of each entry in the tree
 */
struct zswap_tree {
	struct rb_root rbroot;
	spinlock_t lock;
};
221 
222 static struct zswap_tree *zswap_trees[MAX_SWAPFILES];
223 
224 /* RCU-protected iteration */
225 static LIST_HEAD(zswap_pools);
226 /* protects zswap_pools list modification */
227 static DEFINE_SPINLOCK(zswap_pools_lock);
228 /* pool counter to provide unique names to zpool */
229 static atomic_t zswap_pools_count = ATOMIC_INIT(0);
230 
/* Initialization state of zswap, tracked in zswap_init_state */
enum zswap_init_type {
	/* init has not run yet; param writes only record the value */
	ZSWAP_UNINIT,
	/* init completed; param writes may create or switch pools */
	ZSWAP_INIT_SUCCEED,
	/* init failed; param writes are rejected with -ENODEV */
	ZSWAP_INIT_FAILED
};
236 
237 static enum zswap_init_type zswap_init_state;
238 
239 /* used to ensure the integrity of initialization */
240 static DEFINE_MUTEX(zswap_init_lock);
241 
/*
 * false while no pool exists, e.g. when init completed but couldn't create
 * the initial pool; set to true once a pool is installed
 */
243 static bool zswap_has_pool;
244 
245 /*********************************
246 * helpers and fwd declarations
247 **********************************/
248 
/*
 * Emit a debug message of the form "<msg> pool <compressor>/<zpool type>"
 * for pool @p.
 */
#define zswap_pool_debug(msg, p)				\
	pr_debug("%s pool %s/%s\n", msg, (p)->tfm_name,		\
		 zpool_get_type((p)->zpool))
252 
253 static int zswap_writeback_entry(struct zswap_entry *entry,
254 				 struct zswap_tree *tree);
255 static int zswap_pool_get(struct zswap_pool *pool);
256 static void zswap_pool_put(struct zswap_pool *pool);
257 
258 static bool zswap_is_full(void)
259 {
260 	return totalram_pages() * zswap_max_pool_percent / 100 <
261 			DIV_ROUND_UP(zswap_pool_total_size, PAGE_SIZE);
262 }
263 
264 static bool zswap_can_accept(void)
265 {
266 	return totalram_pages() * zswap_accept_thr_percent / 100 *
267 				zswap_max_pool_percent / 100 >
268 			DIV_ROUND_UP(zswap_pool_total_size, PAGE_SIZE);
269 }
270 
271 static void zswap_update_total_size(void)
272 {
273 	struct zswap_pool *pool;
274 	u64 total = 0;
275 
276 	rcu_read_lock();
277 
278 	list_for_each_entry_rcu(pool, &zswap_pools, list)
279 		total += zpool_get_total_size(pool->zpool);
280 
281 	rcu_read_unlock();
282 
283 	zswap_pool_total_size = total;
284 }
285 
286 /*********************************
287 * zswap entry functions
288 **********************************/
289 static struct kmem_cache *zswap_entry_cache;
290 
291 static struct zswap_entry *zswap_entry_cache_alloc(gfp_t gfp)
292 {
293 	struct zswap_entry *entry;
294 	entry = kmem_cache_alloc(zswap_entry_cache, gfp);
295 	if (!entry)
296 		return NULL;
297 	entry->refcount = 1;
298 	RB_CLEAR_NODE(&entry->rbnode);
299 	return entry;
300 }
301 
/* Return @entry to zswap_entry_cache */
static void zswap_entry_cache_free(struct zswap_entry *entry)
{
	kmem_cache_free(zswap_entry_cache, entry);
}
306 
307 /*********************************
308 * rbtree functions
309 **********************************/
310 static struct zswap_entry *zswap_rb_search(struct rb_root *root, pgoff_t offset)
311 {
312 	struct rb_node *node = root->rb_node;
313 	struct zswap_entry *entry;
314 	pgoff_t entry_offset;
315 
316 	while (node) {
317 		entry = rb_entry(node, struct zswap_entry, rbnode);
318 		entry_offset = swp_offset(entry->swpentry);
319 		if (entry_offset > offset)
320 			node = node->rb_left;
321 		else if (entry_offset < offset)
322 			node = node->rb_right;
323 		else
324 			return entry;
325 	}
326 	return NULL;
327 }
328 
329 /*
330  * In the case that a entry with the same offset is found, a pointer to
331  * the existing entry is stored in dupentry and the function returns -EEXIST
332  */
333 static int zswap_rb_insert(struct rb_root *root, struct zswap_entry *entry,
334 			struct zswap_entry **dupentry)
335 {
336 	struct rb_node **link = &root->rb_node, *parent = NULL;
337 	struct zswap_entry *myentry;
338 	pgoff_t myentry_offset, entry_offset = swp_offset(entry->swpentry);
339 
340 	while (*link) {
341 		parent = *link;
342 		myentry = rb_entry(parent, struct zswap_entry, rbnode);
343 		myentry_offset = swp_offset(myentry->swpentry);
344 		if (myentry_offset > entry_offset)
345 			link = &(*link)->rb_left;
346 		else if (myentry_offset < entry_offset)
347 			link = &(*link)->rb_right;
348 		else {
349 			*dupentry = myentry;
350 			return -EEXIST;
351 		}
352 	}
353 	rb_link_node(&entry->rbnode, parent, link);
354 	rb_insert_color(&entry->rbnode, root);
355 	return 0;
356 }
357 
358 static bool zswap_rb_erase(struct rb_root *root, struct zswap_entry *entry)
359 {
360 	if (!RB_EMPTY_NODE(&entry->rbnode)) {
361 		rb_erase(&entry->rbnode, root);
362 		RB_CLEAR_NODE(&entry->rbnode);
363 		return true;
364 	}
365 	return false;
366 }
367 
368 /*
369  * Carries out the common pattern of freeing and entry's zpool allocation,
370  * freeing the entry itself, and decrementing the number of stored pages.
371  */
372 static void zswap_free_entry(struct zswap_entry *entry)
373 {
374 	if (entry->objcg) {
375 		obj_cgroup_uncharge_zswap(entry->objcg, entry->length);
376 		obj_cgroup_put(entry->objcg);
377 	}
378 	if (!entry->length)
379 		atomic_dec(&zswap_same_filled_pages);
380 	else {
381 		spin_lock(&entry->pool->lru_lock);
382 		list_del(&entry->lru);
383 		spin_unlock(&entry->pool->lru_lock);
384 		zpool_free(entry->pool->zpool, entry->handle);
385 		zswap_pool_put(entry->pool);
386 	}
387 	zswap_entry_cache_free(entry);
388 	atomic_dec(&zswap_stored_pages);
389 	zswap_update_total_size();
390 }
391 
/*
 * Take an additional reference on @entry.
 * caller must hold the tree lock
 */
static void zswap_entry_get(struct zswap_entry *entry)
{
	entry->refcount++;
}
397 
398 /* caller must hold the tree lock
399 * remove from the tree and free it, if nobody reference the entry
400 */
401 static void zswap_entry_put(struct zswap_tree *tree,
402 			struct zswap_entry *entry)
403 {
404 	int refcount = --entry->refcount;
405 
406 	BUG_ON(refcount < 0);
407 	if (refcount == 0) {
408 		zswap_rb_erase(&tree->rbroot, entry);
409 		zswap_free_entry(entry);
410 	}
411 }
412 
413 /* caller must hold the tree lock */
414 static struct zswap_entry *zswap_entry_find_get(struct rb_root *root,
415 				pgoff_t offset)
416 {
417 	struct zswap_entry *entry;
418 
419 	entry = zswap_rb_search(root, offset);
420 	if (entry)
421 		zswap_entry_get(entry);
422 
423 	return entry;
424 }
425 
426 /*********************************
427 * per-cpu code
428 **********************************/
429 static DEFINE_PER_CPU(u8 *, zswap_dstmem);
/*
 * If users dynamically change the zpool type and compressor at runtime, i.e.
 * while zswap is running, zswap can have more than one zpool on one cpu, but
 * they all share that cpu's dstmem. So we need this mutex to be per-cpu.
 */
435 static DEFINE_PER_CPU(struct mutex *, zswap_mutex);
436 
437 static int zswap_dstmem_prepare(unsigned int cpu)
438 {
439 	struct mutex *mutex;
440 	u8 *dst;
441 
442 	dst = kmalloc_node(PAGE_SIZE * 2, GFP_KERNEL, cpu_to_node(cpu));
443 	if (!dst)
444 		return -ENOMEM;
445 
446 	mutex = kmalloc_node(sizeof(*mutex), GFP_KERNEL, cpu_to_node(cpu));
447 	if (!mutex) {
448 		kfree(dst);
449 		return -ENOMEM;
450 	}
451 
452 	mutex_init(mutex);
453 	per_cpu(zswap_dstmem, cpu) = dst;
454 	per_cpu(zswap_mutex, cpu) = mutex;
455 	return 0;
456 }
457 
458 static int zswap_dstmem_dead(unsigned int cpu)
459 {
460 	struct mutex *mutex;
461 	u8 *dst;
462 
463 	mutex = per_cpu(zswap_mutex, cpu);
464 	kfree(mutex);
465 	per_cpu(zswap_mutex, cpu) = NULL;
466 
467 	dst = per_cpu(zswap_dstmem, cpu);
468 	kfree(dst);
469 	per_cpu(zswap_dstmem, cpu) = NULL;
470 
471 	return 0;
472 }
473 
474 static int zswap_cpu_comp_prepare(unsigned int cpu, struct hlist_node *node)
475 {
476 	struct zswap_pool *pool = hlist_entry(node, struct zswap_pool, node);
477 	struct crypto_acomp_ctx *acomp_ctx = per_cpu_ptr(pool->acomp_ctx, cpu);
478 	struct crypto_acomp *acomp;
479 	struct acomp_req *req;
480 
481 	acomp = crypto_alloc_acomp_node(pool->tfm_name, 0, 0, cpu_to_node(cpu));
482 	if (IS_ERR(acomp)) {
483 		pr_err("could not alloc crypto acomp %s : %ld\n",
484 				pool->tfm_name, PTR_ERR(acomp));
485 		return PTR_ERR(acomp);
486 	}
487 	acomp_ctx->acomp = acomp;
488 
489 	req = acomp_request_alloc(acomp_ctx->acomp);
490 	if (!req) {
491 		pr_err("could not alloc crypto acomp_request %s\n",
492 		       pool->tfm_name);
493 		crypto_free_acomp(acomp_ctx->acomp);
494 		return -ENOMEM;
495 	}
496 	acomp_ctx->req = req;
497 
498 	crypto_init_wait(&acomp_ctx->wait);
499 	/*
500 	 * if the backend of acomp is async zip, crypto_req_done() will wakeup
501 	 * crypto_wait_req(); if the backend of acomp is scomp, the callback
502 	 * won't be called, crypto_wait_req() will return without blocking.
503 	 */
504 	acomp_request_set_callback(req, CRYPTO_TFM_REQ_MAY_BACKLOG,
505 				   crypto_req_done, &acomp_ctx->wait);
506 
507 	acomp_ctx->mutex = per_cpu(zswap_mutex, cpu);
508 	acomp_ctx->dstmem = per_cpu(zswap_dstmem, cpu);
509 
510 	return 0;
511 }
512 
513 static int zswap_cpu_comp_dead(unsigned int cpu, struct hlist_node *node)
514 {
515 	struct zswap_pool *pool = hlist_entry(node, struct zswap_pool, node);
516 	struct crypto_acomp_ctx *acomp_ctx = per_cpu_ptr(pool->acomp_ctx, cpu);
517 
518 	if (!IS_ERR_OR_NULL(acomp_ctx)) {
519 		if (!IS_ERR_OR_NULL(acomp_ctx->req))
520 			acomp_request_free(acomp_ctx->req);
521 		if (!IS_ERR_OR_NULL(acomp_ctx->acomp))
522 			crypto_free_acomp(acomp_ctx->acomp);
523 	}
524 
525 	return 0;
526 }
527 
528 /*********************************
529 * pool functions
530 **********************************/
531 
/*
 * Return the current pool, i.e. the first pool on the zswap_pools list, or
 * NULL if the list is empty.  Warns once if the list is empty although
 * zswap_has_pool is set.  No reference is taken.
 */
static struct zswap_pool *__zswap_pool_current(void)
{
	struct zswap_pool *pool;

	pool = list_first_or_null_rcu(&zswap_pools, typeof(*pool), list);
	WARN_ONCE(!pool && zswap_has_pool,
		  "%s: no page storage pool!\n", __func__);

	return pool;
}
542 
/*
 * Return the current pool without taking a reference.
 * caller must hold zswap_pools_lock
 */
static struct zswap_pool *zswap_pool_current(void)
{
	assert_spin_locked(&zswap_pools_lock);

	return __zswap_pool_current();
}
549 
550 static struct zswap_pool *zswap_pool_current_get(void)
551 {
552 	struct zswap_pool *pool;
553 
554 	rcu_read_lock();
555 
556 	pool = __zswap_pool_current();
557 	if (!zswap_pool_get(pool))
558 		pool = NULL;
559 
560 	rcu_read_unlock();
561 
562 	return pool;
563 }
564 
565 static struct zswap_pool *zswap_pool_last_get(void)
566 {
567 	struct zswap_pool *pool, *last = NULL;
568 
569 	rcu_read_lock();
570 
571 	list_for_each_entry_rcu(pool, &zswap_pools, list)
572 		last = pool;
573 	WARN_ONCE(!last && zswap_has_pool,
574 		  "%s: no page storage pool!\n", __func__);
575 	if (!zswap_pool_get(last))
576 		last = NULL;
577 
578 	rcu_read_unlock();
579 
580 	return last;
581 }
582 
583 /* type and compressor must be null-terminated */
584 static struct zswap_pool *zswap_pool_find_get(char *type, char *compressor)
585 {
586 	struct zswap_pool *pool;
587 
588 	assert_spin_locked(&zswap_pools_lock);
589 
590 	list_for_each_entry_rcu(pool, &zswap_pools, list) {
591 		if (strcmp(pool->tfm_name, compressor))
592 			continue;
593 		if (strcmp(zpool_get_type(pool->zpool), type))
594 			continue;
595 		/* if we can't get it, it's about to be destroyed */
596 		if (!zswap_pool_get(pool))
597 			continue;
598 		return pool;
599 	}
600 
601 	return NULL;
602 }
603 
604 /*
605  * If the entry is still valid in the tree, drop the initial ref and remove it
606  * from the tree. This function must be called with an additional ref held,
607  * otherwise it may race with another invalidation freeing the entry.
608  */
609 static void zswap_invalidate_entry(struct zswap_tree *tree,
610 				   struct zswap_entry *entry)
611 {
612 	if (zswap_rb_erase(&tree->rbroot, entry))
613 		zswap_entry_put(tree, entry);
614 }
615 
/*
 * Take the entry at the tail of @pool's LRU and try to write it back.
 *
 * The entry is removed from the LRU under lru_lock, then revalidated against
 * its tree under tree->lock before it is touched again.  A reference is held
 * across zswap_writeback_entry(), which runs with tree->lock released.
 *
 * Returns 0 if writeback was started, -EINVAL if the LRU is empty, and
 * -EAGAIN if the entry was no longer in the tree or writeback failed (in the
 * latter case the entry is put back on the LRU).
 */
static int zswap_reclaim_entry(struct zswap_pool *pool)
{
	struct zswap_entry *entry;
	struct zswap_tree *tree;
	pgoff_t swpoffset;
	int ret;

	/* Get an entry off the LRU */
	spin_lock(&pool->lru_lock);
	if (list_empty(&pool->lru)) {
		spin_unlock(&pool->lru_lock);
		return -EINVAL;
	}
	entry = list_last_entry(&pool->lru, struct zswap_entry, lru);
	list_del_init(&entry->lru);
	/*
	 * Once the lru lock is dropped, the entry might get freed. The
	 * swpoffset is copied to the stack, and entry isn't deref'd again
	 * until the entry is verified to still be alive in the tree.
	 */
	swpoffset = swp_offset(entry->swpentry);
	tree = zswap_trees[swp_type(entry->swpentry)];
	spin_unlock(&pool->lru_lock);

	/* Check for invalidate() race */
	spin_lock(&tree->lock);
	if (entry != zswap_rb_search(&tree->rbroot, swpoffset)) {
		ret = -EAGAIN;
		goto unlock;
	}
	/* Hold a reference to prevent a free during writeback */
	zswap_entry_get(entry);
	spin_unlock(&tree->lock);

	ret = zswap_writeback_entry(entry, tree);

	spin_lock(&tree->lock);
	if (ret) {
		/* Writeback failed, put entry back on LRU */
		spin_lock(&pool->lru_lock);
		list_move(&entry->lru, &pool->lru);
		spin_unlock(&pool->lru_lock);
		goto put_unlock;
	}

	/*
	 * Writeback started successfully, the page now belongs to the
	 * swapcache. Drop the entry from zswap - unless invalidate already
	 * took it out while we had the tree->lock released for IO.
	 */
	zswap_invalidate_entry(tree, entry);

put_unlock:
	/* Drop local reference */
	zswap_entry_put(tree, entry);
unlock:
	spin_unlock(&tree->lock);
	/* any failure is reported to the caller as -EAGAIN */
	return ret ? -EAGAIN : 0;
}
675 
676 static void shrink_worker(struct work_struct *w)
677 {
678 	struct zswap_pool *pool = container_of(w, typeof(*pool),
679 						shrink_work);
680 	int ret, failures = 0;
681 
682 	do {
683 		ret = zswap_reclaim_entry(pool);
684 		if (ret) {
685 			zswap_reject_reclaim_fail++;
686 			if (ret != -EAGAIN)
687 				break;
688 			if (++failures == MAX_RECLAIM_RETRIES)
689 				break;
690 		}
691 		cond_resched();
692 	} while (!zswap_can_accept());
693 	zswap_pool_put(pool);
694 }
695 
696 static struct zswap_pool *zswap_pool_create(char *type, char *compressor)
697 {
698 	struct zswap_pool *pool;
699 	char name[38]; /* 'zswap' + 32 char (max) num + \0 */
700 	gfp_t gfp = __GFP_NORETRY | __GFP_NOWARN | __GFP_KSWAPD_RECLAIM;
701 	int ret;
702 
703 	if (!zswap_has_pool) {
704 		/* if either are unset, pool initialization failed, and we
705 		 * need both params to be set correctly before trying to
706 		 * create a pool.
707 		 */
708 		if (!strcmp(type, ZSWAP_PARAM_UNSET))
709 			return NULL;
710 		if (!strcmp(compressor, ZSWAP_PARAM_UNSET))
711 			return NULL;
712 	}
713 
714 	pool = kzalloc(sizeof(*pool), GFP_KERNEL);
715 	if (!pool)
716 		return NULL;
717 
718 	/* unique name for each pool specifically required by zsmalloc */
719 	snprintf(name, 38, "zswap%x", atomic_inc_return(&zswap_pools_count));
720 
721 	pool->zpool = zpool_create_pool(type, name, gfp);
722 	if (!pool->zpool) {
723 		pr_err("%s zpool not available\n", type);
724 		goto error;
725 	}
726 	pr_debug("using %s zpool\n", zpool_get_type(pool->zpool));
727 
728 	strscpy(pool->tfm_name, compressor, sizeof(pool->tfm_name));
729 
730 	pool->acomp_ctx = alloc_percpu(*pool->acomp_ctx);
731 	if (!pool->acomp_ctx) {
732 		pr_err("percpu alloc failed\n");
733 		goto error;
734 	}
735 
736 	ret = cpuhp_state_add_instance(CPUHP_MM_ZSWP_POOL_PREPARE,
737 				       &pool->node);
738 	if (ret)
739 		goto error;
740 	pr_debug("using %s compressor\n", pool->tfm_name);
741 
742 	/* being the current pool takes 1 ref; this func expects the
743 	 * caller to always add the new pool as the current pool
744 	 */
745 	kref_init(&pool->kref);
746 	INIT_LIST_HEAD(&pool->list);
747 	INIT_LIST_HEAD(&pool->lru);
748 	spin_lock_init(&pool->lru_lock);
749 	INIT_WORK(&pool->shrink_work, shrink_worker);
750 
751 	zswap_pool_debug("created", pool);
752 
753 	return pool;
754 
755 error:
756 	if (pool->acomp_ctx)
757 		free_percpu(pool->acomp_ctx);
758 	if (pool->zpool)
759 		zpool_destroy_pool(pool->zpool);
760 	kfree(pool);
761 	return NULL;
762 }
763 
764 static struct zswap_pool *__zswap_pool_create_fallback(void)
765 {
766 	bool has_comp, has_zpool;
767 
768 	has_comp = crypto_has_acomp(zswap_compressor, 0, 0);
769 	if (!has_comp && strcmp(zswap_compressor,
770 				CONFIG_ZSWAP_COMPRESSOR_DEFAULT)) {
771 		pr_err("compressor %s not available, using default %s\n",
772 		       zswap_compressor, CONFIG_ZSWAP_COMPRESSOR_DEFAULT);
773 		param_free_charp(&zswap_compressor);
774 		zswap_compressor = CONFIG_ZSWAP_COMPRESSOR_DEFAULT;
775 		has_comp = crypto_has_acomp(zswap_compressor, 0, 0);
776 	}
777 	if (!has_comp) {
778 		pr_err("default compressor %s not available\n",
779 		       zswap_compressor);
780 		param_free_charp(&zswap_compressor);
781 		zswap_compressor = ZSWAP_PARAM_UNSET;
782 	}
783 
784 	has_zpool = zpool_has_pool(zswap_zpool_type);
785 	if (!has_zpool && strcmp(zswap_zpool_type,
786 				 CONFIG_ZSWAP_ZPOOL_DEFAULT)) {
787 		pr_err("zpool %s not available, using default %s\n",
788 		       zswap_zpool_type, CONFIG_ZSWAP_ZPOOL_DEFAULT);
789 		param_free_charp(&zswap_zpool_type);
790 		zswap_zpool_type = CONFIG_ZSWAP_ZPOOL_DEFAULT;
791 		has_zpool = zpool_has_pool(zswap_zpool_type);
792 	}
793 	if (!has_zpool) {
794 		pr_err("default zpool %s not available\n",
795 		       zswap_zpool_type);
796 		param_free_charp(&zswap_zpool_type);
797 		zswap_zpool_type = ZSWAP_PARAM_UNSET;
798 	}
799 
800 	if (!has_comp || !has_zpool)
801 		return NULL;
802 
803 	return zswap_pool_create(zswap_zpool_type, zswap_compressor);
804 }
805 
/*
 * Tear down @pool: unregister its CPU hotplug instance, free the per-CPU
 * compression contexts, destroy the zpool and free the pool itself.
 */
static void zswap_pool_destroy(struct zswap_pool *pool)
{
	zswap_pool_debug("destroying", pool);

	cpuhp_state_remove_instance(CPUHP_MM_ZSWP_POOL_PREPARE, &pool->node);
	free_percpu(pool->acomp_ctx);
	zpool_destroy_pool(pool->zpool);
	kfree(pool);
}
815 
816 static int __must_check zswap_pool_get(struct zswap_pool *pool)
817 {
818 	if (!pool)
819 		return 0;
820 
821 	return kref_get_unless_zero(&pool->kref);
822 }
823 
824 static void __zswap_pool_release(struct work_struct *work)
825 {
826 	struct zswap_pool *pool = container_of(work, typeof(*pool),
827 						release_work);
828 
829 	synchronize_rcu();
830 
831 	/* nobody should have been able to get a kref... */
832 	WARN_ON(kref_get_unless_zero(&pool->kref));
833 
834 	/* pool is now off zswap_pools list and has no references. */
835 	zswap_pool_destroy(pool);
836 }
837 
838 static void __zswap_pool_empty(struct kref *kref)
839 {
840 	struct zswap_pool *pool;
841 
842 	pool = container_of(kref, typeof(*pool), kref);
843 
844 	spin_lock(&zswap_pools_lock);
845 
846 	WARN_ON(pool == zswap_pool_current());
847 
848 	list_del_rcu(&pool->list);
849 
850 	INIT_WORK(&pool->release_work, __zswap_pool_release);
851 	schedule_work(&pool->release_work);
852 
853 	spin_unlock(&zswap_pools_lock);
854 }
855 
/*
 * Drop a reference on @pool; __zswap_pool_empty() runs when the last
 * reference goes away.
 */
static void zswap_pool_put(struct zswap_pool *pool)
{
	kref_put(&pool->kref, __zswap_pool_empty);
}
860 
861 /*********************************
862 * param callbacks
863 **********************************/
864 
865 static bool zswap_pool_changed(const char *s, const struct kernel_param *kp)
866 {
867 	/* no change required */
868 	if (!strcmp(s, *(char **)kp->arg) && zswap_has_pool)
869 		return false;
870 	return true;
871 }
872 
/*
 * Common handler for writes to the "compressor" and "zpool" params.
 *
 * @val: the new value; it is stripped of surrounding whitespace in place
 * @kp: the param being written
 * @type: zpool type to use, or NULL when the zpool param is the one being
 *        changed (the type is then taken from @val)
 * @compressor: compressor to use, or NULL when the compressor param is the
 *              one being changed (the compressor is then taken from @val)
 *
 * Before init the value is only recorded.  After a successful init, a change
 * switches the current pool: a matching existing pool is reused, otherwise a
 * new one is created, and the previous current pool loses its reference.
 *
 * Returns 0 on success or a negative errno: -ENODEV if initialization
 * failed, -ENOENT if the requested zpool/compressor is not available,
 * -EINVAL if both @type and @compressor are given or no pool could be
 * created, or the error returned by param_set_charp().
 *
 * val must be a null-terminated string
 */
static int __zswap_param_set(const char *val, const struct kernel_param *kp,
			     char *type, char *compressor)
{
	struct zswap_pool *pool, *put_pool = NULL;
	char *s = strstrip((char *)val);
	int ret = 0;
	bool new_pool = false;

	mutex_lock(&zswap_init_lock);
	switch (zswap_init_state) {
	case ZSWAP_UNINIT:
		/* if this is load-time (pre-init) param setting,
		 * don't create a pool; that's done during init.
		 */
		ret = param_set_charp(s, kp);
		break;
	case ZSWAP_INIT_SUCCEED:
		new_pool = zswap_pool_changed(s, kp);
		break;
	case ZSWAP_INIT_FAILED:
		pr_err("can't set param, initialization failed\n");
		ret = -ENODEV;
	}
	mutex_unlock(&zswap_init_lock);

	/* no need to create a new pool, return directly */
	if (!new_pool)
		return ret;

	/* the NULL argument identifies which param is being changed */
	if (!type) {
		if (!zpool_has_pool(s)) {
			pr_err("zpool %s not available\n", s);
			return -ENOENT;
		}
		type = s;
	} else if (!compressor) {
		if (!crypto_has_acomp(s, 0, 0)) {
			pr_err("compressor %s not available\n", s);
			return -ENOENT;
		}
		compressor = s;
	} else {
		WARN_ON(1);
		return -EINVAL;
	}

	spin_lock(&zswap_pools_lock);

	/* reuse an existing pool with this combination if there is one */
	pool = zswap_pool_find_get(type, compressor);
	if (pool) {
		zswap_pool_debug("using existing", pool);
		WARN_ON(pool == zswap_pool_current());
		list_del_rcu(&pool->list);
	}

	spin_unlock(&zswap_pools_lock);

	if (!pool)
		pool = zswap_pool_create(type, compressor);

	if (pool)
		ret = param_set_charp(s, kp);
	else
		ret = -EINVAL;

	spin_lock(&zswap_pools_lock);

	if (!ret) {
		/* the new pool goes to the head of the list, i.e. becomes current */
		put_pool = zswap_pool_current();
		list_add_rcu(&pool->list, &zswap_pools);
		zswap_has_pool = true;
	} else if (pool) {
		/* add the possibly pre-existing pool to the end of the pools
		 * list; if it's new (and empty) then it'll be removed and
		 * destroyed by the put after we drop the lock
		 */
		list_add_tail_rcu(&pool->list, &zswap_pools);
		put_pool = pool;
	}

	spin_unlock(&zswap_pools_lock);

	if (!zswap_has_pool && !pool) {
		/* if initial pool creation failed, and this pool creation also
		 * failed, maybe both compressor and zpool params were bad.
		 * Allow changing this param, so pool creation will succeed
		 * when the other param is changed. We already verified this
		 * param is ok in the zpool_has_pool() or crypto_has_acomp()
		 * checks above.
		 */
		ret = param_set_charp(s, kp);
	}

	/* drop the ref from either the old current pool,
	 * or the new pool we failed to add
	 */
	if (put_pool)
		zswap_pool_put(put_pool);

	return ret;
}
975 
/*
 * .set handler of the "compressor" param: keeps the current zpool type and
 * takes the compressor from @val.
 */
static int zswap_compressor_param_set(const char *val,
				      const struct kernel_param *kp)
{
	return __zswap_param_set(val, kp, zswap_zpool_type, NULL);
}
981 
/*
 * .set handler of the "zpool" param: keeps the current compressor and takes
 * the zpool type from @val.
 */
static int zswap_zpool_param_set(const char *val,
				 const struct kernel_param *kp)
{
	return __zswap_param_set(val, kp, NULL, zswap_compressor);
}
987 
/*
 * .set handler of the "enabled" param.
 *
 * While the system is not yet running the value is only recorded.  Later
 * writes run zswap_setup() first if zswap is still uninitialized, and the
 * value is only stored when a pool exists.
 *
 * Returns the result of param_set_bool(), or -ENODEV if setup fails, no
 * pool is configured, or initialization had failed earlier.
 */
static int zswap_enabled_param_set(const char *val,
				   const struct kernel_param *kp)
{
	int ret = -ENODEV;

	/* if this is load-time (pre-init) param setting, only set param. */
	if (system_state != SYSTEM_RUNNING)
		return param_set_bool(val, kp);

	mutex_lock(&zswap_init_lock);
	switch (zswap_init_state) {
	case ZSWAP_UNINIT:
		/* a failed setup leaves ret at -ENODEV */
		if (zswap_setup())
			break;
		fallthrough;
	case ZSWAP_INIT_SUCCEED:
		if (!zswap_has_pool)
			pr_err("can't enable, no pool configured\n");
		else
			ret = param_set_bool(val, kp);
		break;
	case ZSWAP_INIT_FAILED:
		pr_err("can't enable, initialization failed\n");
	}
	mutex_unlock(&zswap_init_lock);

	return ret;
}
1016 
1017 /*********************************
1018 * writeback code
1019 **********************************/
/* return enum for zswap_get_swap_cache_page */
enum zswap_get_swap_ret {
	/* a new page was allocated and needs to be populated */
	ZSWAP_SWAPCACHE_NEW,
	/* the page was already in the swap cache */
	ZSWAP_SWAPCACHE_EXIST,
	/* no page could be obtained */
	ZSWAP_SWAPCACHE_FAIL,
};
1026 
1027 /*
1028  * zswap_get_swap_cache_page
1029  *
1030  * This is an adaption of read_swap_cache_async()
1031  *
1032  * This function tries to find a page with the given swap entry
1033  * in the swapper_space address space (the swap cache).  If the page
1034  * is found, it is returned in retpage.  Otherwise, a page is allocated,
1035  * added to the swap cache, and returned in retpage.
1036  *
1037  * If success, the swap cache page is returned in retpage
1038  * Returns ZSWAP_SWAPCACHE_EXIST if page was already in the swap cache
1039  * Returns ZSWAP_SWAPCACHE_NEW if the new page needs to be populated,
1040  *     the new page is added to swapcache and locked
1041  * Returns ZSWAP_SWAPCACHE_FAIL on error
1042  */
1043 static int zswap_get_swap_cache_page(swp_entry_t entry,
1044 				struct page **retpage)
1045 {
1046 	bool page_was_allocated;
1047 
1048 	*retpage = __read_swap_cache_async(entry, GFP_KERNEL,
1049 			NULL, 0, &page_was_allocated);
1050 	if (page_was_allocated)
1051 		return ZSWAP_SWAPCACHE_NEW;
1052 	if (!*retpage)
1053 		return ZSWAP_SWAPCACHE_FAIL;
1054 	return ZSWAP_SWAPCACHE_EXIST;
1055 }
1056 
1057 /*
1058  * Attempts to free an entry by adding a page to the swap cache,
1059  * decompressing the entry data into the page, and issuing a
1060  * bio write to write the page back to the swap device.
1061  *
1062  * This can be thought of as a "resumed writeback" of the page
1063  * to the swap device.  We are basically resuming the same swap
1064  * writeback path that was intercepted with the frontswap_store()
1065  * in the first place.  After the page has been decompressed into
1066  * the swap cache, the compressed version stored by zswap can be
1067  * freed.
1068  */
1069 static int zswap_writeback_entry(struct zswap_entry *entry,
1070 				 struct zswap_tree *tree)
1071 {
1072 	swp_entry_t swpentry = entry->swpentry;
1073 	struct page *page;
1074 	struct scatterlist input, output;
1075 	struct crypto_acomp_ctx *acomp_ctx;
1076 	struct zpool *pool = entry->pool->zpool;
1077 
1078 	u8 *src, *tmp = NULL;
1079 	unsigned int dlen;
1080 	int ret;
1081 	struct writeback_control wbc = {
1082 		.sync_mode = WB_SYNC_NONE,
1083 	};
1084 
1085 	if (!zpool_can_sleep_mapped(pool)) {
1086 		tmp = kmalloc(PAGE_SIZE, GFP_KERNEL);
1087 		if (!tmp)
1088 			return -ENOMEM;
1089 	}
1090 
1091 	/* try to allocate swap cache page */
1092 	switch (zswap_get_swap_cache_page(swpentry, &page)) {
1093 	case ZSWAP_SWAPCACHE_FAIL: /* no memory or invalidate happened */
1094 		ret = -ENOMEM;
1095 		goto fail;
1096 
1097 	case ZSWAP_SWAPCACHE_EXIST:
1098 		/* page is already in the swap cache, ignore for now */
1099 		put_page(page);
1100 		ret = -EEXIST;
1101 		goto fail;
1102 
1103 	case ZSWAP_SWAPCACHE_NEW: /* page is locked */
1104 		/*
1105 		 * Having a local reference to the zswap entry doesn't exclude
1106 		 * swapping from invalidating and recycling the swap slot. Once
1107 		 * the swapcache is secured against concurrent swapping to and
1108 		 * from the slot, recheck that the entry is still current before
1109 		 * writing.
1110 		 */
1111 		spin_lock(&tree->lock);
1112 		if (zswap_rb_search(&tree->rbroot, swp_offset(entry->swpentry)) != entry) {
1113 			spin_unlock(&tree->lock);
1114 			delete_from_swap_cache(page_folio(page));
1115 			ret = -ENOMEM;
1116 			goto fail;
1117 		}
1118 		spin_unlock(&tree->lock);
1119 
1120 		/* decompress */
1121 		acomp_ctx = raw_cpu_ptr(entry->pool->acomp_ctx);
1122 		dlen = PAGE_SIZE;
1123 
1124 		src = zpool_map_handle(pool, entry->handle, ZPOOL_MM_RO);
1125 		if (!zpool_can_sleep_mapped(pool)) {
1126 			memcpy(tmp, src, entry->length);
1127 			src = tmp;
1128 			zpool_unmap_handle(pool, entry->handle);
1129 		}
1130 
1131 		mutex_lock(acomp_ctx->mutex);
1132 		sg_init_one(&input, src, entry->length);
1133 		sg_init_table(&output, 1);
1134 		sg_set_page(&output, page, PAGE_SIZE, 0);
1135 		acomp_request_set_params(acomp_ctx->req, &input, &output, entry->length, dlen);
1136 		ret = crypto_wait_req(crypto_acomp_decompress(acomp_ctx->req), &acomp_ctx->wait);
1137 		dlen = acomp_ctx->req->dlen;
1138 		mutex_unlock(acomp_ctx->mutex);
1139 
1140 		if (!zpool_can_sleep_mapped(pool))
1141 			kfree(tmp);
1142 		else
1143 			zpool_unmap_handle(pool, entry->handle);
1144 
1145 		BUG_ON(ret);
1146 		BUG_ON(dlen != PAGE_SIZE);
1147 
1148 		/* page is up to date */
1149 		SetPageUptodate(page);
1150 	}
1151 
1152 	/* move it to the tail of the inactive list after end_writeback */
1153 	SetPageReclaim(page);
1154 
1155 	/* start writeback */
1156 	__swap_writepage(page, &wbc);
1157 	put_page(page);
1158 	zswap_written_back_pages++;
1159 
1160 	return ret;
1161 fail:
1162 	if (!zpool_can_sleep_mapped(pool))
1163 		kfree(tmp);
1164 
1165 	/*
1166 	* if we get here due to ZSWAP_SWAPCACHE_EXIST
1167 	* a load may be happening concurrently.
1168 	* it is safe and okay to not free the entry.
1169 	* it is also okay to return !0
1170 	*/
1171 	return ret;
1172 }
1173 
1174 static int zswap_is_page_same_filled(void *ptr, unsigned long *value)
1175 {
1176 	unsigned long *page;
1177 	unsigned long val;
1178 	unsigned int pos, last_pos = PAGE_SIZE / sizeof(*page) - 1;
1179 
1180 	page = (unsigned long *)ptr;
1181 	val = page[0];
1182 
1183 	if (val != page[last_pos])
1184 		return 0;
1185 
1186 	for (pos = 1; pos < last_pos; pos++) {
1187 		if (val != page[pos])
1188 			return 0;
1189 	}
1190 
1191 	*value = val;
1192 
1193 	return 1;
1194 }
1195 
1196 static void zswap_fill_page(void *ptr, unsigned long value)
1197 {
1198 	unsigned long *page;
1199 
1200 	page = (unsigned long *)ptr;
1201 	memset_l(page, value, PAGE_SIZE / sizeof(unsigned long));
1202 }
1203 
1204 /*********************************
1205 * frontswap hooks
1206 **********************************/
1207 /* attempts to compress and store an single page */
1208 static int zswap_frontswap_store(unsigned type, pgoff_t offset,
1209 				struct page *page)
1210 {
1211 	struct zswap_tree *tree = zswap_trees[type];
1212 	struct zswap_entry *entry, *dupentry;
1213 	struct scatterlist input, output;
1214 	struct crypto_acomp_ctx *acomp_ctx;
1215 	struct obj_cgroup *objcg = NULL;
1216 	struct zswap_pool *pool;
1217 	int ret;
1218 	unsigned int dlen = PAGE_SIZE;
1219 	unsigned long handle, value;
1220 	char *buf;
1221 	u8 *src, *dst;
1222 	gfp_t gfp;
1223 
1224 	/* THP isn't supported */
1225 	if (PageTransHuge(page)) {
1226 		ret = -EINVAL;
1227 		goto reject;
1228 	}
1229 
1230 	if (!zswap_enabled || !tree) {
1231 		ret = -ENODEV;
1232 		goto reject;
1233 	}
1234 
1235 	/*
1236 	 * XXX: zswap reclaim does not work with cgroups yet. Without a
1237 	 * cgroup-aware entry LRU, we will push out entries system-wide based on
1238 	 * local cgroup limits.
1239 	 */
1240 	objcg = get_obj_cgroup_from_page(page);
1241 	if (objcg && !obj_cgroup_may_zswap(objcg)) {
1242 		ret = -ENOMEM;
1243 		goto reject;
1244 	}
1245 
1246 	/* reclaim space if needed */
1247 	if (zswap_is_full()) {
1248 		zswap_pool_limit_hit++;
1249 		zswap_pool_reached_full = true;
1250 		goto shrink;
1251 	}
1252 
1253 	if (zswap_pool_reached_full) {
1254 	       if (!zswap_can_accept()) {
1255 			ret = -ENOMEM;
1256 			goto shrink;
1257 		} else
1258 			zswap_pool_reached_full = false;
1259 	}
1260 
1261 	/* allocate entry */
1262 	entry = zswap_entry_cache_alloc(GFP_KERNEL);
1263 	if (!entry) {
1264 		zswap_reject_kmemcache_fail++;
1265 		ret = -ENOMEM;
1266 		goto reject;
1267 	}
1268 
1269 	if (zswap_same_filled_pages_enabled) {
1270 		src = kmap_atomic(page);
1271 		if (zswap_is_page_same_filled(src, &value)) {
1272 			kunmap_atomic(src);
1273 			entry->swpentry = swp_entry(type, offset);
1274 			entry->length = 0;
1275 			entry->value = value;
1276 			atomic_inc(&zswap_same_filled_pages);
1277 			goto insert_entry;
1278 		}
1279 		kunmap_atomic(src);
1280 	}
1281 
1282 	if (!zswap_non_same_filled_pages_enabled) {
1283 		ret = -EINVAL;
1284 		goto freepage;
1285 	}
1286 
1287 	/* if entry is successfully added, it keeps the reference */
1288 	entry->pool = zswap_pool_current_get();
1289 	if (!entry->pool) {
1290 		ret = -EINVAL;
1291 		goto freepage;
1292 	}
1293 
1294 	/* compress */
1295 	acomp_ctx = raw_cpu_ptr(entry->pool->acomp_ctx);
1296 
1297 	mutex_lock(acomp_ctx->mutex);
1298 
1299 	dst = acomp_ctx->dstmem;
1300 	sg_init_table(&input, 1);
1301 	sg_set_page(&input, page, PAGE_SIZE, 0);
1302 
1303 	/* zswap_dstmem is of size (PAGE_SIZE * 2). Reflect same in sg_list */
1304 	sg_init_one(&output, dst, PAGE_SIZE * 2);
1305 	acomp_request_set_params(acomp_ctx->req, &input, &output, PAGE_SIZE, dlen);
1306 	/*
1307 	 * it maybe looks a little bit silly that we send an asynchronous request,
1308 	 * then wait for its completion synchronously. This makes the process look
1309 	 * synchronous in fact.
1310 	 * Theoretically, acomp supports users send multiple acomp requests in one
1311 	 * acomp instance, then get those requests done simultaneously. but in this
1312 	 * case, frontswap actually does store and load page by page, there is no
1313 	 * existing method to send the second page before the first page is done
1314 	 * in one thread doing frontswap.
1315 	 * but in different threads running on different cpu, we have different
1316 	 * acomp instance, so multiple threads can do (de)compression in parallel.
1317 	 */
1318 	ret = crypto_wait_req(crypto_acomp_compress(acomp_ctx->req), &acomp_ctx->wait);
1319 	dlen = acomp_ctx->req->dlen;
1320 
1321 	if (ret) {
1322 		ret = -EINVAL;
1323 		goto put_dstmem;
1324 	}
1325 
1326 	/* store */
1327 	gfp = __GFP_NORETRY | __GFP_NOWARN | __GFP_KSWAPD_RECLAIM;
1328 	if (zpool_malloc_support_movable(entry->pool->zpool))
1329 		gfp |= __GFP_HIGHMEM | __GFP_MOVABLE;
1330 	ret = zpool_malloc(entry->pool->zpool, dlen, gfp, &handle);
1331 	if (ret == -ENOSPC) {
1332 		zswap_reject_compress_poor++;
1333 		goto put_dstmem;
1334 	}
1335 	if (ret) {
1336 		zswap_reject_alloc_fail++;
1337 		goto put_dstmem;
1338 	}
1339 	buf = zpool_map_handle(entry->pool->zpool, handle, ZPOOL_MM_WO);
1340 	memcpy(buf, dst, dlen);
1341 	zpool_unmap_handle(entry->pool->zpool, handle);
1342 	mutex_unlock(acomp_ctx->mutex);
1343 
1344 	/* populate entry */
1345 	entry->swpentry = swp_entry(type, offset);
1346 	entry->handle = handle;
1347 	entry->length = dlen;
1348 
1349 insert_entry:
1350 	entry->objcg = objcg;
1351 	if (objcg) {
1352 		obj_cgroup_charge_zswap(objcg, entry->length);
1353 		/* Account before objcg ref is moved to tree */
1354 		count_objcg_event(objcg, ZSWPOUT);
1355 	}
1356 
1357 	/* map */
1358 	spin_lock(&tree->lock);
1359 	do {
1360 		ret = zswap_rb_insert(&tree->rbroot, entry, &dupentry);
1361 		if (ret == -EEXIST) {
1362 			zswap_duplicate_entry++;
1363 			/* remove from rbtree */
1364 			zswap_rb_erase(&tree->rbroot, dupentry);
1365 			zswap_entry_put(tree, dupentry);
1366 		}
1367 	} while (ret == -EEXIST);
1368 	if (entry->length) {
1369 		spin_lock(&entry->pool->lru_lock);
1370 		list_add(&entry->lru, &entry->pool->lru);
1371 		spin_unlock(&entry->pool->lru_lock);
1372 	}
1373 	spin_unlock(&tree->lock);
1374 
1375 	/* update stats */
1376 	atomic_inc(&zswap_stored_pages);
1377 	zswap_update_total_size();
1378 	count_vm_event(ZSWPOUT);
1379 
1380 	return 0;
1381 
1382 put_dstmem:
1383 	mutex_unlock(acomp_ctx->mutex);
1384 	zswap_pool_put(entry->pool);
1385 freepage:
1386 	zswap_entry_cache_free(entry);
1387 reject:
1388 	if (objcg)
1389 		obj_cgroup_put(objcg);
1390 	return ret;
1391 
1392 shrink:
1393 	pool = zswap_pool_last_get();
1394 	if (pool)
1395 		queue_work(shrink_wq, &pool->shrink_work);
1396 	ret = -ENOMEM;
1397 	goto reject;
1398 }
1399 
1400 /*
1401  * returns 0 if the page was successfully decompressed
1402  * return -1 on entry not found or error
1403 */
1404 static int zswap_frontswap_load(unsigned type, pgoff_t offset,
1405 				struct page *page, bool *exclusive)
1406 {
1407 	struct zswap_tree *tree = zswap_trees[type];
1408 	struct zswap_entry *entry;
1409 	struct scatterlist input, output;
1410 	struct crypto_acomp_ctx *acomp_ctx;
1411 	u8 *src, *dst, *tmp;
1412 	unsigned int dlen;
1413 	int ret;
1414 
1415 	/* find */
1416 	spin_lock(&tree->lock);
1417 	entry = zswap_entry_find_get(&tree->rbroot, offset);
1418 	if (!entry) {
1419 		/* entry was written back */
1420 		spin_unlock(&tree->lock);
1421 		return -1;
1422 	}
1423 	spin_unlock(&tree->lock);
1424 
1425 	if (!entry->length) {
1426 		dst = kmap_atomic(page);
1427 		zswap_fill_page(dst, entry->value);
1428 		kunmap_atomic(dst);
1429 		ret = 0;
1430 		goto stats;
1431 	}
1432 
1433 	if (!zpool_can_sleep_mapped(entry->pool->zpool)) {
1434 		tmp = kmalloc(entry->length, GFP_KERNEL);
1435 		if (!tmp) {
1436 			ret = -ENOMEM;
1437 			goto freeentry;
1438 		}
1439 	}
1440 
1441 	/* decompress */
1442 	dlen = PAGE_SIZE;
1443 	src = zpool_map_handle(entry->pool->zpool, entry->handle, ZPOOL_MM_RO);
1444 
1445 	if (!zpool_can_sleep_mapped(entry->pool->zpool)) {
1446 		memcpy(tmp, src, entry->length);
1447 		src = tmp;
1448 		zpool_unmap_handle(entry->pool->zpool, entry->handle);
1449 	}
1450 
1451 	acomp_ctx = raw_cpu_ptr(entry->pool->acomp_ctx);
1452 	mutex_lock(acomp_ctx->mutex);
1453 	sg_init_one(&input, src, entry->length);
1454 	sg_init_table(&output, 1);
1455 	sg_set_page(&output, page, PAGE_SIZE, 0);
1456 	acomp_request_set_params(acomp_ctx->req, &input, &output, entry->length, dlen);
1457 	ret = crypto_wait_req(crypto_acomp_decompress(acomp_ctx->req), &acomp_ctx->wait);
1458 	mutex_unlock(acomp_ctx->mutex);
1459 
1460 	if (zpool_can_sleep_mapped(entry->pool->zpool))
1461 		zpool_unmap_handle(entry->pool->zpool, entry->handle);
1462 	else
1463 		kfree(tmp);
1464 
1465 	BUG_ON(ret);
1466 stats:
1467 	count_vm_event(ZSWPIN);
1468 	if (entry->objcg)
1469 		count_objcg_event(entry->objcg, ZSWPIN);
1470 freeentry:
1471 	spin_lock(&tree->lock);
1472 	if (!ret && zswap_exclusive_loads_enabled) {
1473 		zswap_invalidate_entry(tree, entry);
1474 		*exclusive = true;
1475 	} else if (entry->length) {
1476 		spin_lock(&entry->pool->lru_lock);
1477 		list_move(&entry->lru, &entry->pool->lru);
1478 		spin_unlock(&entry->pool->lru_lock);
1479 	}
1480 	zswap_entry_put(tree, entry);
1481 	spin_unlock(&tree->lock);
1482 
1483 	return ret;
1484 }
1485 
1486 /* frees an entry in zswap */
1487 static void zswap_frontswap_invalidate_page(unsigned type, pgoff_t offset)
1488 {
1489 	struct zswap_tree *tree = zswap_trees[type];
1490 	struct zswap_entry *entry;
1491 
1492 	/* find */
1493 	spin_lock(&tree->lock);
1494 	entry = zswap_rb_search(&tree->rbroot, offset);
1495 	if (!entry) {
1496 		/* entry was written back */
1497 		spin_unlock(&tree->lock);
1498 		return;
1499 	}
1500 	zswap_invalidate_entry(tree, entry);
1501 	spin_unlock(&tree->lock);
1502 }
1503 
1504 /* frees all zswap entries for the given swap type */
1505 static void zswap_frontswap_invalidate_area(unsigned type)
1506 {
1507 	struct zswap_tree *tree = zswap_trees[type];
1508 	struct zswap_entry *entry, *n;
1509 
1510 	if (!tree)
1511 		return;
1512 
1513 	/* walk the tree and free everything */
1514 	spin_lock(&tree->lock);
1515 	rbtree_postorder_for_each_entry_safe(entry, n, &tree->rbroot, rbnode)
1516 		zswap_free_entry(entry);
1517 	tree->rbroot = RB_ROOT;
1518 	spin_unlock(&tree->lock);
1519 	kfree(tree);
1520 	zswap_trees[type] = NULL;
1521 }
1522 
1523 static void zswap_frontswap_init(unsigned type)
1524 {
1525 	struct zswap_tree *tree;
1526 
1527 	tree = kzalloc(sizeof(*tree), GFP_KERNEL);
1528 	if (!tree) {
1529 		pr_err("alloc failed, zswap disabled for swap type %d\n", type);
1530 		return;
1531 	}
1532 
1533 	tree->rbroot = RB_ROOT;
1534 	spin_lock_init(&tree->lock);
1535 	zswap_trees[type] = tree;
1536 }
1537 
1538 static const struct frontswap_ops zswap_frontswap_ops = {
1539 	.store = zswap_frontswap_store,
1540 	.load = zswap_frontswap_load,
1541 	.invalidate_page = zswap_frontswap_invalidate_page,
1542 	.invalidate_area = zswap_frontswap_invalidate_area,
1543 	.init = zswap_frontswap_init
1544 };
1545 
1546 /*********************************
1547 * debugfs functions
1548 **********************************/
#ifdef CONFIG_DEBUG_FS
#include <linux/debugfs.h>

static struct dentry *zswap_debugfs_root;

/*
 * Creates the "zswap" debugfs directory and one read-only (0444) file per
 * statistic.
 *
 * Returns -ENODEV if debugfs is not initialized, 0 otherwise.
 */
static int zswap_debugfs_init(void)
{
	struct dentry *root;

	if (!debugfs_initialized())
		return -ENODEV;

	root = debugfs_create_dir("zswap", NULL);
	zswap_debugfs_root = root;

	/* u64 event counters and pool size */
	debugfs_create_u64("pool_limit_hit", 0444, root,
			   &zswap_pool_limit_hit);
	debugfs_create_u64("reject_reclaim_fail", 0444, root,
			   &zswap_reject_reclaim_fail);
	debugfs_create_u64("reject_alloc_fail", 0444, root,
			   &zswap_reject_alloc_fail);
	debugfs_create_u64("reject_kmemcache_fail", 0444, root,
			   &zswap_reject_kmemcache_fail);
	debugfs_create_u64("reject_compress_poor", 0444, root,
			   &zswap_reject_compress_poor);
	debugfs_create_u64("written_back_pages", 0444, root,
			   &zswap_written_back_pages);
	debugfs_create_u64("duplicate_entry", 0444, root,
			   &zswap_duplicate_entry);
	debugfs_create_u64("pool_total_size", 0444, root,
			   &zswap_pool_total_size);

	/* atomic page counts */
	debugfs_create_atomic_t("stored_pages", 0444, root,
				&zswap_stored_pages);
	debugfs_create_atomic_t("same_filled_pages", 0444, root,
				&zswap_same_filled_pages);

	return 0;
}
#else
/* debugfs support is compiled out: nothing to create */
static int zswap_debugfs_init(void)
{
	return 0;
}
#endif
1590 
1591 /*********************************
1592 * module init and exit
1593 **********************************/
1594 static int zswap_setup(void)
1595 {
1596 	struct zswap_pool *pool;
1597 	int ret;
1598 
1599 	zswap_entry_cache = KMEM_CACHE(zswap_entry, 0);
1600 	if (!zswap_entry_cache) {
1601 		pr_err("entry cache creation failed\n");
1602 		goto cache_fail;
1603 	}
1604 
1605 	ret = cpuhp_setup_state(CPUHP_MM_ZSWP_MEM_PREPARE, "mm/zswap:prepare",
1606 				zswap_dstmem_prepare, zswap_dstmem_dead);
1607 	if (ret) {
1608 		pr_err("dstmem alloc failed\n");
1609 		goto dstmem_fail;
1610 	}
1611 
1612 	ret = cpuhp_setup_state_multi(CPUHP_MM_ZSWP_POOL_PREPARE,
1613 				      "mm/zswap_pool:prepare",
1614 				      zswap_cpu_comp_prepare,
1615 				      zswap_cpu_comp_dead);
1616 	if (ret)
1617 		goto hp_fail;
1618 
1619 	pool = __zswap_pool_create_fallback();
1620 	if (pool) {
1621 		pr_info("loaded using pool %s/%s\n", pool->tfm_name,
1622 			zpool_get_type(pool->zpool));
1623 		list_add(&pool->list, &zswap_pools);
1624 		zswap_has_pool = true;
1625 	} else {
1626 		pr_err("pool creation failed\n");
1627 		zswap_enabled = false;
1628 	}
1629 
1630 	shrink_wq = create_workqueue("zswap-shrink");
1631 	if (!shrink_wq)
1632 		goto fallback_fail;
1633 
1634 	ret = frontswap_register_ops(&zswap_frontswap_ops);
1635 	if (ret)
1636 		goto destroy_wq;
1637 	if (zswap_debugfs_init())
1638 		pr_warn("debugfs initialization failed\n");
1639 	zswap_init_state = ZSWAP_INIT_SUCCEED;
1640 	return 0;
1641 
1642 destroy_wq:
1643 	destroy_workqueue(shrink_wq);
1644 fallback_fail:
1645 	if (pool)
1646 		zswap_pool_destroy(pool);
1647 hp_fail:
1648 	cpuhp_remove_state(CPUHP_MM_ZSWP_MEM_PREPARE);
1649 dstmem_fail:
1650 	kmem_cache_destroy(zswap_entry_cache);
1651 cache_fail:
1652 	/* if built-in, we aren't unloaded on failure; don't allow use */
1653 	zswap_init_state = ZSWAP_INIT_FAILED;
1654 	zswap_enabled = false;
1655 	return -ENOMEM;
1656 }
1657 
1658 static int __init zswap_init(void)
1659 {
1660 	if (!zswap_enabled)
1661 		return 0;
1662 	return zswap_setup();
1663 }
1664 /* must be late so crypto has time to come up */
1665 late_initcall(zswap_init);
1666 
1667 MODULE_AUTHOR("Seth Jennings <sjennings@variantweb.net>");
1668 MODULE_DESCRIPTION("Compressed cache for swap pages");
1669