1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * zswap.c - zswap driver file
4  *
5  * zswap is a backend for frontswap that takes pages that are in the process
6  * of being swapped out and attempts to compress and store them in a
7  * RAM-based memory pool.  This can result in a significant I/O reduction on
8  * the swap device and, in the case where decompressing from RAM is faster
9  * than reading from the swap device, can also improve workload performance.
10  *
11  * Copyright (C) 2012  Seth Jennings <sjenning@linux.vnet.ibm.com>
12 */
13 
14 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
15 
16 #include <linux/module.h>
17 #include <linux/cpu.h>
18 #include <linux/highmem.h>
19 #include <linux/slab.h>
20 #include <linux/spinlock.h>
21 #include <linux/types.h>
22 #include <linux/atomic.h>
23 #include <linux/frontswap.h>
24 #include <linux/rbtree.h>
25 #include <linux/swap.h>
26 #include <linux/crypto.h>
27 #include <linux/scatterlist.h>
28 #include <linux/mempool.h>
29 #include <linux/zpool.h>
30 #include <crypto/acompress.h>
31 
32 #include <linux/mm_types.h>
33 #include <linux/page-flags.h>
34 #include <linux/swapops.h>
35 #include <linux/writeback.h>
36 #include <linux/pagemap.h>
37 #include <linux/workqueue.h>
38 
39 #include "swap.h"
40 #include "internal.h"
41 
42 /*********************************
43 * statistics
44 **********************************/
45 /* Total bytes used by the compressed storage */
46 u64 zswap_pool_total_size;
47 /* The number of compressed pages currently stored in zswap */
48 atomic_t zswap_stored_pages = ATOMIC_INIT(0);
49 /* The number of same-value filled pages currently stored in zswap */
50 static atomic_t zswap_same_filled_pages = ATOMIC_INIT(0);
51 
52 /*
53  * The statistics below are not protected from concurrent access for
54  * performance reasons, so they may not be 100% accurate.  However,
55  * they do provide useful information on roughly how often a
56  * certain event occurs.
57  */
58 
59 /* Pool limit was hit (see zswap_max_pool_percent) */
60 static u64 zswap_pool_limit_hit;
61 /* Pages written back when pool limit was reached */
62 static u64 zswap_written_back_pages;
63 /* Store failed due to a reclaim failure after pool limit was reached */
64 static u64 zswap_reject_reclaim_fail;
65 /* Compressed page was too big for the allocator to (optimally) store */
66 static u64 zswap_reject_compress_poor;
67 /* Store failed because underlying allocator could not get memory */
68 static u64 zswap_reject_alloc_fail;
69 /* Store failed because the entry metadata could not be allocated (rare) */
70 static u64 zswap_reject_kmemcache_fail;
71 /* Duplicate store was encountered (rare) */
72 static u64 zswap_duplicate_entry;
73 
74 /* Shrinker work queue */
75 static struct workqueue_struct *shrink_wq;
76 /* Pool limit was hit, we need to calm down */
77 static bool zswap_pool_reached_full;
78 
79 /*********************************
80 * tunables
81 **********************************/
82 
83 #define ZSWAP_PARAM_UNSET ""
84 
85 static int zswap_setup(void);
86 
87 /* Enable/disable zswap */
88 static bool zswap_enabled = IS_ENABLED(CONFIG_ZSWAP_DEFAULT_ON);
89 static int zswap_enabled_param_set(const char *,
90 				   const struct kernel_param *);
91 static const struct kernel_param_ops zswap_enabled_param_ops = {
92 	.set =		zswap_enabled_param_set,
93 	.get =		param_get_bool,
94 };
95 module_param_cb(enabled, &zswap_enabled_param_ops, &zswap_enabled, 0644);
96 
97 /* Crypto compressor to use */
98 static char *zswap_compressor = CONFIG_ZSWAP_COMPRESSOR_DEFAULT;
99 static int zswap_compressor_param_set(const char *,
100 				      const struct kernel_param *);
101 static const struct kernel_param_ops zswap_compressor_param_ops = {
102 	.set =		zswap_compressor_param_set,
103 	.get =		param_get_charp,
104 	.free =		param_free_charp,
105 };
106 module_param_cb(compressor, &zswap_compressor_param_ops,
107 		&zswap_compressor, 0644);
108 
109 /* Compressed storage zpool to use */
110 static char *zswap_zpool_type = CONFIG_ZSWAP_ZPOOL_DEFAULT;
111 static int zswap_zpool_param_set(const char *, const struct kernel_param *);
112 static const struct kernel_param_ops zswap_zpool_param_ops = {
113 	.set =		zswap_zpool_param_set,
114 	.get =		param_get_charp,
115 	.free =		param_free_charp,
116 };
117 module_param_cb(zpool, &zswap_zpool_param_ops, &zswap_zpool_type, 0644);
118 
119 /* The maximum percentage of memory that the compressed pool can occupy */
120 static unsigned int zswap_max_pool_percent = 20;
121 module_param_named(max_pool_percent, zswap_max_pool_percent, uint, 0644);
122 
123 /* The threshold for accepting new pages after the max_pool_percent was hit */
124 static unsigned int zswap_accept_thr_percent = 90; /* of max pool size */
125 module_param_named(accept_threshold_percent, zswap_accept_thr_percent,
126 		   uint, 0644);
127 
128 /*
129  * Enable/disable handling same-value filled pages (enabled by default).
130  * If disabled, every page is considered non-same-value filled.
131  */
132 static bool zswap_same_filled_pages_enabled = true;
133 module_param_named(same_filled_pages_enabled, zswap_same_filled_pages_enabled,
134 		   bool, 0644);
135 
136 /* Enable/disable handling non-same-value filled pages (enabled by default) */
137 static bool zswap_non_same_filled_pages_enabled = true;
138 module_param_named(non_same_filled_pages_enabled, zswap_non_same_filled_pages_enabled,
139 		   bool, 0644);
140 
141 static bool zswap_exclusive_loads_enabled = IS_ENABLED(
142 		CONFIG_ZSWAP_EXCLUSIVE_LOADS_DEFAULT_ON);
143 module_param_named(exclusive_loads, zswap_exclusive_loads_enabled, bool, 0644);
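
/*
 * Illustrative usage: these knobs (mode 0644) appear under
 * /sys/module/zswap/parameters/ and can be changed at runtime, e.g.
 * (the example values assume the matching compressor and zpool
 * drivers are built in):
 *
 *   echo 1        > /sys/module/zswap/parameters/enabled
 *   echo zstd     > /sys/module/zswap/parameters/compressor
 *   echo zsmalloc > /sys/module/zswap/parameters/zpool
 *   echo 25       > /sys/module/zswap/parameters/max_pool_percent
 *
 * Writes to compressor/zpool go through the param callbacks below, which
 * switch to (or create) a matching pool before committing the new value.
 */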
144 
145 /*********************************
146 * data structures
147 **********************************/
148 
149 struct crypto_acomp_ctx {
150 	struct crypto_acomp *acomp;
151 	struct acomp_req *req;
152 	struct crypto_wait wait;
153 	u8 *dstmem;
154 	struct mutex *mutex;
155 };
156 
157 /*
158  * The lock ordering is zswap_tree.lock -> zswap_pool.lru_lock.
159  * The only case where lru_lock is not acquired while holding tree.lock is
160  * when a zswap_entry is taken off the lru for writeback; in that case, it
161  * must be verified that it's still valid in the tree.
162  */
163 struct zswap_pool {
164 	struct zpool *zpool;
165 	struct crypto_acomp_ctx __percpu *acomp_ctx;
166 	struct kref kref;
167 	struct list_head list;
168 	struct work_struct release_work;
169 	struct work_struct shrink_work;
170 	struct hlist_node node;
171 	char tfm_name[CRYPTO_MAX_ALG_NAME];
172 	struct list_head lru;
173 	spinlock_t lru_lock;
174 };
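
/*
 * Sketch of the writeback exception noted above (cf. zswap_reclaim_entry()):
 *
 *   spin_lock(&pool->lru_lock);
 *   entry = list_last_entry(&pool->lru, ...);    // pop the LRU tail
 *   swpoffset = swp_offset(entry->swpentry);     // copy key to the stack
 *   spin_unlock(&pool->lru_lock);                // entry may be freed now,
 *   spin_lock(&tree->lock);                      // so re-validate under the
 *   if (entry != zswap_rb_search(..., swpoffset))// tree lock before any
 *           bail;                                // further dereference
 *   spin_unlock(&tree->lock);
 */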
175 
176 /*
177  * struct zswap_entry
178  *
179  * This structure contains the metadata for tracking a single compressed
180  * page within zswap.
181  *
182  * rbnode - links the entry into red-black tree for the appropriate swap type
183  * swpentry - the swap entry for this entry; its offset indexes the rbtree.
184  * refcount - the number of outstanding references to the entry. This is
185  *            needed to protect against premature freeing of the entry by
186  *            concurrent calls to load, invalidate, and writeback.  The lock
187  *            for the zswap_tree structure that contains the entry must
188  *            be held while changing the refcount.  Since the lock must
189  *            be held, there is no reason to also make refcount atomic.
190  * length - the length in bytes of the compressed page data.  Needed during
191  *          decompression. For a same-value filled page, length is 0, and both
192  *          pool and lru are invalid and must be ignored.
193  * pool - the zswap_pool the entry's data is in
194  * handle - zpool allocation handle that stores the compressed page data
195  * value - the value of a same-value filled page (every word identical)
196  * lru - handle to the pool's lru used to evict pages.
197  */
198 struct zswap_entry {
199 	struct rb_node rbnode;
200 	swp_entry_t swpentry;
201 	int refcount;
202 	unsigned int length;
203 	struct zswap_pool *pool;
204 	union {
205 		unsigned long handle;
206 		unsigned long value;
207 	};
208 	struct obj_cgroup *objcg;
209 	struct list_head lru;
210 };
211 
212 /*
213  * The tree lock in the zswap_tree struct protects a few things:
214  * - the rbtree
215  * - the refcount field of each entry in the tree
216  */
217 struct zswap_tree {
218 	struct rb_root rbroot;
219 	spinlock_t lock;
220 };
221 
222 static struct zswap_tree *zswap_trees[MAX_SWAPFILES];
223 
224 /* RCU-protected iteration */
225 static LIST_HEAD(zswap_pools);
226 /* protects zswap_pools list modification */
227 static DEFINE_SPINLOCK(zswap_pools_lock);
228 /* pool counter to provide unique names to zpool */
229 static atomic_t zswap_pools_count = ATOMIC_INIT(0);
230 
231 enum zswap_init_type {
232 	ZSWAP_UNINIT,
233 	ZSWAP_INIT_SUCCEED,
234 	ZSWAP_INIT_FAILED
235 };
236 
237 static enum zswap_init_type zswap_init_state;
238 
239 /* used to ensure the integrity of initialization */
240 static DEFINE_MUTEX(zswap_init_lock);
241 
242 /* init completed, but couldn't create the initial pool */
243 static bool zswap_has_pool;
244 
245 /*********************************
246 * helpers and fwd declarations
247 **********************************/
248 
249 #define zswap_pool_debug(msg, p)				\
250 	pr_debug("%s pool %s/%s\n", msg, (p)->tfm_name,		\
251 		 zpool_get_type((p)->zpool))
252 
253 static int zswap_writeback_entry(struct zswap_entry *entry,
254 				 struct zswap_tree *tree);
255 static int zswap_pool_get(struct zswap_pool *pool);
256 static void zswap_pool_put(struct zswap_pool *pool);
257 
258 static bool zswap_is_full(void)
259 {
260 	return totalram_pages() * zswap_max_pool_percent / 100 <
261 			DIV_ROUND_UP(zswap_pool_total_size, PAGE_SIZE);
262 }
263 
264 static bool zswap_can_accept(void)
265 {
266 	return totalram_pages() * zswap_accept_thr_percent / 100 *
267 				zswap_max_pool_percent / 100 >
268 			DIV_ROUND_UP(zswap_pool_total_size, PAGE_SIZE);
269 }
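
/*
 * Worked example, assuming 4 KiB pages and 16 GiB of RAM
 * (totalram_pages() ~= 4194304): with max_pool_percent = 20, the pool is
 * "full" once it holds more than ~838860 page-sized units (~3.2 GiB).
 * With accept_thr_percent = 90, new stores are then refused until the
 * pool shrinks back below ~754974 units (~2.9 GiB); the gap between the
 * two thresholds provides hysteresis so stores don't flap around the
 * limit.
 */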
270 
271 static void zswap_update_total_size(void)
272 {
273 	struct zswap_pool *pool;
274 	u64 total = 0;
275 
276 	rcu_read_lock();
277 
278 	list_for_each_entry_rcu(pool, &zswap_pools, list)
279 		total += zpool_get_total_size(pool->zpool);
280 
281 	rcu_read_unlock();
282 
283 	zswap_pool_total_size = total;
284 }
285 
286 /*********************************
287 * zswap entry functions
288 **********************************/
289 static struct kmem_cache *zswap_entry_cache;
290 
291 static struct zswap_entry *zswap_entry_cache_alloc(gfp_t gfp)
292 {
293 	struct zswap_entry *entry;
294 	entry = kmem_cache_alloc(zswap_entry_cache, gfp);
295 	if (!entry)
296 		return NULL;
297 	entry->refcount = 1;
298 	RB_CLEAR_NODE(&entry->rbnode);
299 	return entry;
300 }
301 
302 static void zswap_entry_cache_free(struct zswap_entry *entry)
303 {
304 	kmem_cache_free(zswap_entry_cache, entry);
305 }
306 
307 /*********************************
308 * rbtree functions
309 **********************************/
310 static struct zswap_entry *zswap_rb_search(struct rb_root *root, pgoff_t offset)
311 {
312 	struct rb_node *node = root->rb_node;
313 	struct zswap_entry *entry;
314 	pgoff_t entry_offset;
315 
316 	while (node) {
317 		entry = rb_entry(node, struct zswap_entry, rbnode);
318 		entry_offset = swp_offset(entry->swpentry);
319 		if (entry_offset > offset)
320 			node = node->rb_left;
321 		else if (entry_offset < offset)
322 			node = node->rb_right;
323 		else
324 			return entry;
325 	}
326 	return NULL;
327 }
328 
329 /*
330  * In the case that an entry with the same offset is found, a pointer to
331  * the existing entry is stored in dupentry and the function returns -EEXIST.
332  */
333 static int zswap_rb_insert(struct rb_root *root, struct zswap_entry *entry,
334 			struct zswap_entry **dupentry)
335 {
336 	struct rb_node **link = &root->rb_node, *parent = NULL;
337 	struct zswap_entry *myentry;
338 	pgoff_t myentry_offset, entry_offset = swp_offset(entry->swpentry);
339 
340 	while (*link) {
341 		parent = *link;
342 		myentry = rb_entry(parent, struct zswap_entry, rbnode);
343 		myentry_offset = swp_offset(myentry->swpentry);
344 		if (myentry_offset > entry_offset)
345 			link = &(*link)->rb_left;
346 		else if (myentry_offset < entry_offset)
347 			link = &(*link)->rb_right;
348 		else {
349 			*dupentry = myentry;
350 			return -EEXIST;
351 		}
352 	}
353 	rb_link_node(&entry->rbnode, parent, link);
354 	rb_insert_color(&entry->rbnode, root);
355 	return 0;
356 }
357 
358 static void zswap_rb_erase(struct rb_root *root, struct zswap_entry *entry)
359 {
360 	if (!RB_EMPTY_NODE(&entry->rbnode)) {
361 		rb_erase(&entry->rbnode, root);
362 		RB_CLEAR_NODE(&entry->rbnode);
363 	}
364 }
365 
366 /*
367  * Carries out the common pattern of freeing an entry's zpool allocation,
368  * freeing the entry itself, and decrementing the number of stored pages.
369  */
370 static void zswap_free_entry(struct zswap_entry *entry)
371 {
372 	if (entry->objcg) {
373 		obj_cgroup_uncharge_zswap(entry->objcg, entry->length);
374 		obj_cgroup_put(entry->objcg);
375 	}
376 	if (!entry->length)
377 		atomic_dec(&zswap_same_filled_pages);
378 	else {
379 		spin_lock(&entry->pool->lru_lock);
380 		list_del(&entry->lru);
381 		spin_unlock(&entry->pool->lru_lock);
382 		zpool_free(entry->pool->zpool, entry->handle);
383 		zswap_pool_put(entry->pool);
384 	}
385 	zswap_entry_cache_free(entry);
386 	atomic_dec(&zswap_stored_pages);
387 	zswap_update_total_size();
388 }
389 
390 /* caller must hold the tree lock */
391 static void zswap_entry_get(struct zswap_entry *entry)
392 {
393 	entry->refcount++;
394 }
395 
396 /* caller must hold the tree lock
397  * remove from the tree and free it, if nobody references the entry
398  */
399 static void zswap_entry_put(struct zswap_tree *tree,
400 			struct zswap_entry *entry)
401 {
402 	int refcount = --entry->refcount;
403 
404 	BUG_ON(refcount < 0);
405 	if (refcount == 0) {
406 		zswap_rb_erase(&tree->rbroot, entry);
407 		zswap_free_entry(entry);
408 	}
409 }
410 
411 /* caller must hold the tree lock */
412 static struct zswap_entry *zswap_entry_find_get(struct rb_root *root,
413 				pgoff_t offset)
414 {
415 	struct zswap_entry *entry;
416 
417 	entry = zswap_rb_search(root, offset);
418 	if (entry)
419 		zswap_entry_get(entry);
420 
421 	return entry;
422 }
423 
424 /*********************************
425 * per-cpu code
426 **********************************/
427 static DEFINE_PER_CPU(u8 *, zswap_dstmem);
428 /*
429  * If users dynamically change the zpool type or compressor at runtime, i.e.
430  * while zswap is running, zswap can have more than one zpool on one cpu, but
431  * they share dstmem. So we need this mutex to be per-cpu.
432  */
433 static DEFINE_PER_CPU(struct mutex *, zswap_mutex);
434 
435 static int zswap_dstmem_prepare(unsigned int cpu)
436 {
437 	struct mutex *mutex;
438 	u8 *dst;
439 
440 	dst = kmalloc_node(PAGE_SIZE * 2, GFP_KERNEL, cpu_to_node(cpu));
441 	if (!dst)
442 		return -ENOMEM;
443 
444 	mutex = kmalloc_node(sizeof(*mutex), GFP_KERNEL, cpu_to_node(cpu));
445 	if (!mutex) {
446 		kfree(dst);
447 		return -ENOMEM;
448 	}
449 
450 	mutex_init(mutex);
451 	per_cpu(zswap_dstmem, cpu) = dst;
452 	per_cpu(zswap_mutex, cpu) = mutex;
453 	return 0;
454 }
455 
456 static int zswap_dstmem_dead(unsigned int cpu)
457 {
458 	struct mutex *mutex;
459 	u8 *dst;
460 
461 	mutex = per_cpu(zswap_mutex, cpu);
462 	kfree(mutex);
463 	per_cpu(zswap_mutex, cpu) = NULL;
464 
465 	dst = per_cpu(zswap_dstmem, cpu);
466 	kfree(dst);
467 	per_cpu(zswap_dstmem, cpu) = NULL;
468 
469 	return 0;
470 }
471 
472 static int zswap_cpu_comp_prepare(unsigned int cpu, struct hlist_node *node)
473 {
474 	struct zswap_pool *pool = hlist_entry(node, struct zswap_pool, node);
475 	struct crypto_acomp_ctx *acomp_ctx = per_cpu_ptr(pool->acomp_ctx, cpu);
476 	struct crypto_acomp *acomp;
477 	struct acomp_req *req;
478 
479 	acomp = crypto_alloc_acomp_node(pool->tfm_name, 0, 0, cpu_to_node(cpu));
480 	if (IS_ERR(acomp)) {
481 		pr_err("could not alloc crypto acomp %s : %ld\n",
482 				pool->tfm_name, PTR_ERR(acomp));
483 		return PTR_ERR(acomp);
484 	}
485 	acomp_ctx->acomp = acomp;
486 
487 	req = acomp_request_alloc(acomp_ctx->acomp);
488 	if (!req) {
489 		pr_err("could not alloc crypto acomp_request %s\n",
490 		       pool->tfm_name);
491 		crypto_free_acomp(acomp_ctx->acomp);
492 		return -ENOMEM;
493 	}
494 	acomp_ctx->req = req;
495 
496 	crypto_init_wait(&acomp_ctx->wait);
497 	/*
498 	 * if the backend of acomp is an async compressor, crypto_req_done() will
499 	 * wake up crypto_wait_req(); if the backend of acomp is scomp, the
500 	 * callback won't be called and crypto_wait_req() will return without blocking.
501 	 */
502 	acomp_request_set_callback(req, CRYPTO_TFM_REQ_MAY_BACKLOG,
503 				   crypto_req_done, &acomp_ctx->wait);
504 
505 	acomp_ctx->mutex = per_cpu(zswap_mutex, cpu);
506 	acomp_ctx->dstmem = per_cpu(zswap_dstmem, cpu);
507 
508 	return 0;
509 }
510 
511 static int zswap_cpu_comp_dead(unsigned int cpu, struct hlist_node *node)
512 {
513 	struct zswap_pool *pool = hlist_entry(node, struct zswap_pool, node);
514 	struct crypto_acomp_ctx *acomp_ctx = per_cpu_ptr(pool->acomp_ctx, cpu);
515 
516 	if (!IS_ERR_OR_NULL(acomp_ctx)) {
517 		if (!IS_ERR_OR_NULL(acomp_ctx->req))
518 			acomp_request_free(acomp_ctx->req);
519 		if (!IS_ERR_OR_NULL(acomp_ctx->acomp))
520 			crypto_free_acomp(acomp_ctx->acomp);
521 	}
522 
523 	return 0;
524 }
525 
526 /*********************************
527 * pool functions
528 **********************************/
529 
530 static struct zswap_pool *__zswap_pool_current(void)
531 {
532 	struct zswap_pool *pool;
533 
534 	pool = list_first_or_null_rcu(&zswap_pools, typeof(*pool), list);
535 	WARN_ONCE(!pool && zswap_has_pool,
536 		  "%s: no page storage pool!\n", __func__);
537 
538 	return pool;
539 }
540 
541 static struct zswap_pool *zswap_pool_current(void)
542 {
543 	assert_spin_locked(&zswap_pools_lock);
544 
545 	return __zswap_pool_current();
546 }
547 
548 static struct zswap_pool *zswap_pool_current_get(void)
549 {
550 	struct zswap_pool *pool;
551 
552 	rcu_read_lock();
553 
554 	pool = __zswap_pool_current();
555 	if (!zswap_pool_get(pool))
556 		pool = NULL;
557 
558 	rcu_read_unlock();
559 
560 	return pool;
561 }
562 
563 static struct zswap_pool *zswap_pool_last_get(void)
564 {
565 	struct zswap_pool *pool, *last = NULL;
566 
567 	rcu_read_lock();
568 
569 	list_for_each_entry_rcu(pool, &zswap_pools, list)
570 		last = pool;
571 	WARN_ONCE(!last && zswap_has_pool,
572 		  "%s: no page storage pool!\n", __func__);
573 	if (!zswap_pool_get(last))
574 		last = NULL;
575 
576 	rcu_read_unlock();
577 
578 	return last;
579 }
580 
581 /* type and compressor must be null-terminated */
582 static struct zswap_pool *zswap_pool_find_get(char *type, char *compressor)
583 {
584 	struct zswap_pool *pool;
585 
586 	assert_spin_locked(&zswap_pools_lock);
587 
588 	list_for_each_entry_rcu(pool, &zswap_pools, list) {
589 		if (strcmp(pool->tfm_name, compressor))
590 			continue;
591 		if (strcmp(zpool_get_type(pool->zpool), type))
592 			continue;
593 		/* if we can't get it, it's about to be destroyed */
594 		if (!zswap_pool_get(pool))
595 			continue;
596 		return pool;
597 	}
598 
599 	return NULL;
600 }
601 
602 static void zswap_invalidate_entry(struct zswap_tree *tree,
603 				   struct zswap_entry *entry)
604 {
605 	/* remove from rbtree */
606 	zswap_rb_erase(&tree->rbroot, entry);
607 
608 	/* drop the initial reference from entry creation */
609 	zswap_entry_put(tree, entry);
610 }
611 
612 static int zswap_reclaim_entry(struct zswap_pool *pool)
613 {
614 	struct zswap_entry *entry;
615 	struct zswap_tree *tree;
616 	pgoff_t swpoffset;
617 	int ret;
618 
619 	/* Get an entry off the LRU */
620 	spin_lock(&pool->lru_lock);
621 	if (list_empty(&pool->lru)) {
622 		spin_unlock(&pool->lru_lock);
623 		return -EINVAL;
624 	}
625 	entry = list_last_entry(&pool->lru, struct zswap_entry, lru);
626 	list_del_init(&entry->lru);
627 	/*
628 	 * Once the lru lock is dropped, the entry might get freed. The
629 	 * swpoffset is copied to the stack, and entry isn't deref'd again
630 	 * until the entry is verified to still be alive in the tree.
631 	 */
632 	swpoffset = swp_offset(entry->swpentry);
633 	tree = zswap_trees[swp_type(entry->swpentry)];
634 	spin_unlock(&pool->lru_lock);
635 
636 	/* Check for invalidate() race */
637 	spin_lock(&tree->lock);
638 	if (entry != zswap_rb_search(&tree->rbroot, swpoffset)) {
639 		ret = -EAGAIN;
640 		goto unlock;
641 	}
642 	/* Hold a reference to prevent a free during writeback */
643 	zswap_entry_get(entry);
644 	spin_unlock(&tree->lock);
645 
646 	ret = zswap_writeback_entry(entry, tree);
647 
648 	spin_lock(&tree->lock);
649 	if (ret) {
650 		/* Writeback failed, put entry back on LRU */
651 		spin_lock(&pool->lru_lock);
652 		list_move(&entry->lru, &pool->lru);
653 		spin_unlock(&pool->lru_lock);
654 		goto put_unlock;
655 	}
656 
657 	/*
658 	 * Writeback started successfully, the page now belongs to the
659 	 * swapcache. Drop the entry from zswap - unless invalidate already
660 	 * took it out while we had the tree->lock released for IO.
661 	 */
662 	if (entry == zswap_rb_search(&tree->rbroot, swpoffset))
663 		zswap_invalidate_entry(tree, entry);
664 
665 put_unlock:
666 	/* Drop local reference */
667 	zswap_entry_put(tree, entry);
668 unlock:
669 	spin_unlock(&tree->lock);
670 	return ret ? -EAGAIN : 0;
671 }
672 
673 static void shrink_worker(struct work_struct *w)
674 {
675 	struct zswap_pool *pool = container_of(w, typeof(*pool),
676 						shrink_work);
677 	int ret, failures = 0;
678 
679 	do {
680 		ret = zswap_reclaim_entry(pool);
681 		if (ret) {
682 			zswap_reject_reclaim_fail++;
683 			if (ret != -EAGAIN)
684 				break;
685 			if (++failures == MAX_RECLAIM_RETRIES)
686 				break;
687 		}
688 		cond_resched();
689 	} while (!zswap_can_accept());
690 	zswap_pool_put(pool);
691 }
692 
693 static struct zswap_pool *zswap_pool_create(char *type, char *compressor)
694 {
695 	struct zswap_pool *pool;
696 	char name[38]; /* 'zswap' + 32 char (max) num + \0 */
697 	gfp_t gfp = __GFP_NORETRY | __GFP_NOWARN | __GFP_KSWAPD_RECLAIM;
698 	int ret;
699 
700 	if (!zswap_has_pool) {
701 		/* if either is unset, pool initialization failed, and we
702 		 * need both params to be set correctly before trying to
703 		 * create a pool.
704 		 */
705 		if (!strcmp(type, ZSWAP_PARAM_UNSET))
706 			return NULL;
707 		if (!strcmp(compressor, ZSWAP_PARAM_UNSET))
708 			return NULL;
709 	}
710 
711 	pool = kzalloc(sizeof(*pool), GFP_KERNEL);
712 	if (!pool)
713 		return NULL;
714 
715 	/* unique name for each pool, specifically required by zsmalloc */
716 	snprintf(name, 38, "zswap%x", atomic_inc_return(&zswap_pools_count));
717 
718 	pool->zpool = zpool_create_pool(type, name, gfp);
719 	if (!pool->zpool) {
720 		pr_err("%s zpool not available\n", type);
721 		goto error;
722 	}
723 	pr_debug("using %s zpool\n", zpool_get_type(pool->zpool));
724 
725 	strscpy(pool->tfm_name, compressor, sizeof(pool->tfm_name));
726 
727 	pool->acomp_ctx = alloc_percpu(*pool->acomp_ctx);
728 	if (!pool->acomp_ctx) {
729 		pr_err("percpu alloc failed\n");
730 		goto error;
731 	}
732 
733 	ret = cpuhp_state_add_instance(CPUHP_MM_ZSWP_POOL_PREPARE,
734 				       &pool->node);
735 	if (ret)
736 		goto error;
737 	pr_debug("using %s compressor\n", pool->tfm_name);
738 
739 	/* being the current pool takes 1 ref; this func expects the
740 	 * caller to always add the new pool as the current pool
741 	 */
742 	kref_init(&pool->kref);
743 	INIT_LIST_HEAD(&pool->list);
744 	INIT_LIST_HEAD(&pool->lru);
745 	spin_lock_init(&pool->lru_lock);
746 	INIT_WORK(&pool->shrink_work, shrink_worker);
747 
748 	zswap_pool_debug("created", pool);
749 
750 	return pool;
751 
752 error:
753 	if (pool->acomp_ctx)
754 		free_percpu(pool->acomp_ctx);
755 	if (pool->zpool)
756 		zpool_destroy_pool(pool->zpool);
757 	kfree(pool);
758 	return NULL;
759 }
760 
761 static struct zswap_pool *__zswap_pool_create_fallback(void)
762 {
763 	bool has_comp, has_zpool;
764 
765 	has_comp = crypto_has_acomp(zswap_compressor, 0, 0);
766 	if (!has_comp && strcmp(zswap_compressor,
767 				CONFIG_ZSWAP_COMPRESSOR_DEFAULT)) {
768 		pr_err("compressor %s not available, using default %s\n",
769 		       zswap_compressor, CONFIG_ZSWAP_COMPRESSOR_DEFAULT);
770 		param_free_charp(&zswap_compressor);
771 		zswap_compressor = CONFIG_ZSWAP_COMPRESSOR_DEFAULT;
772 		has_comp = crypto_has_acomp(zswap_compressor, 0, 0);
773 	}
774 	if (!has_comp) {
775 		pr_err("default compressor %s not available\n",
776 		       zswap_compressor);
777 		param_free_charp(&zswap_compressor);
778 		zswap_compressor = ZSWAP_PARAM_UNSET;
779 	}
780 
781 	has_zpool = zpool_has_pool(zswap_zpool_type);
782 	if (!has_zpool && strcmp(zswap_zpool_type,
783 				 CONFIG_ZSWAP_ZPOOL_DEFAULT)) {
784 		pr_err("zpool %s not available, using default %s\n",
785 		       zswap_zpool_type, CONFIG_ZSWAP_ZPOOL_DEFAULT);
786 		param_free_charp(&zswap_zpool_type);
787 		zswap_zpool_type = CONFIG_ZSWAP_ZPOOL_DEFAULT;
788 		has_zpool = zpool_has_pool(zswap_zpool_type);
789 	}
790 	if (!has_zpool) {
791 		pr_err("default zpool %s not available\n",
792 		       zswap_zpool_type);
793 		param_free_charp(&zswap_zpool_type);
794 		zswap_zpool_type = ZSWAP_PARAM_UNSET;
795 	}
796 
797 	if (!has_comp || !has_zpool)
798 		return NULL;
799 
800 	return zswap_pool_create(zswap_zpool_type, zswap_compressor);
801 }
802 
803 static void zswap_pool_destroy(struct zswap_pool *pool)
804 {
805 	zswap_pool_debug("destroying", pool);
806 
807 	cpuhp_state_remove_instance(CPUHP_MM_ZSWP_POOL_PREPARE, &pool->node);
808 	free_percpu(pool->acomp_ctx);
809 	zpool_destroy_pool(pool->zpool);
810 	kfree(pool);
811 }
812 
813 static int __must_check zswap_pool_get(struct zswap_pool *pool)
814 {
815 	if (!pool)
816 		return 0;
817 
818 	return kref_get_unless_zero(&pool->kref);
819 }
820 
821 static void __zswap_pool_release(struct work_struct *work)
822 {
823 	struct zswap_pool *pool = container_of(work, typeof(*pool),
824 						release_work);
825 
826 	synchronize_rcu();
827 
828 	/* nobody should have been able to get a kref... */
829 	WARN_ON(kref_get_unless_zero(&pool->kref));
830 
831 	/* pool is now off zswap_pools list and has no references. */
832 	zswap_pool_destroy(pool);
833 }
834 
835 static void __zswap_pool_empty(struct kref *kref)
836 {
837 	struct zswap_pool *pool;
838 
839 	pool = container_of(kref, typeof(*pool), kref);
840 
841 	spin_lock(&zswap_pools_lock);
842 
843 	WARN_ON(pool == zswap_pool_current());
844 
845 	list_del_rcu(&pool->list);
846 
847 	INIT_WORK(&pool->release_work, __zswap_pool_release);
848 	schedule_work(&pool->release_work);
849 
850 	spin_unlock(&zswap_pools_lock);
851 }
852 
853 static void zswap_pool_put(struct zswap_pool *pool)
854 {
855 	kref_put(&pool->kref, __zswap_pool_empty);
856 }
857 
858 /*********************************
859 * param callbacks
860 **********************************/
861 
862 static bool zswap_pool_changed(const char *s, const struct kernel_param *kp)
863 {
864 	/* no change required */
865 	if (!strcmp(s, *(char **)kp->arg) && zswap_has_pool)
866 		return false;
867 	return true;
868 }
869 
870 /* val must be a null-terminated string */
871 static int __zswap_param_set(const char *val, const struct kernel_param *kp,
872 			     char *type, char *compressor)
873 {
874 	struct zswap_pool *pool, *put_pool = NULL;
875 	char *s = strstrip((char *)val);
876 	int ret = 0;
877 	bool new_pool = false;
878 
879 	mutex_lock(&zswap_init_lock);
880 	switch (zswap_init_state) {
881 	case ZSWAP_UNINIT:
882 		/* if this is load-time (pre-init) param setting,
883 		 * don't create a pool; that's done during init.
884 		 */
885 		ret = param_set_charp(s, kp);
886 		break;
887 	case ZSWAP_INIT_SUCCEED:
888 		new_pool = zswap_pool_changed(s, kp);
889 		break;
890 	case ZSWAP_INIT_FAILED:
891 		pr_err("can't set param, initialization failed\n");
892 		ret = -ENODEV;
893 	}
894 	mutex_unlock(&zswap_init_lock);
895 
896 	/* no need to create a new pool, return directly */
897 	if (!new_pool)
898 		return ret;
899 
900 	if (!type) {
901 		if (!zpool_has_pool(s)) {
902 			pr_err("zpool %s not available\n", s);
903 			return -ENOENT;
904 		}
905 		type = s;
906 	} else if (!compressor) {
907 		if (!crypto_has_acomp(s, 0, 0)) {
908 			pr_err("compressor %s not available\n", s);
909 			return -ENOENT;
910 		}
911 		compressor = s;
912 	} else {
913 		WARN_ON(1);
914 		return -EINVAL;
915 	}
916 
917 	spin_lock(&zswap_pools_lock);
918 
919 	pool = zswap_pool_find_get(type, compressor);
920 	if (pool) {
921 		zswap_pool_debug("using existing", pool);
922 		WARN_ON(pool == zswap_pool_current());
923 		list_del_rcu(&pool->list);
924 	}
925 
926 	spin_unlock(&zswap_pools_lock);
927 
928 	if (!pool)
929 		pool = zswap_pool_create(type, compressor);
930 
931 	if (pool)
932 		ret = param_set_charp(s, kp);
933 	else
934 		ret = -EINVAL;
935 
936 	spin_lock(&zswap_pools_lock);
937 
938 	if (!ret) {
939 		put_pool = zswap_pool_current();
940 		list_add_rcu(&pool->list, &zswap_pools);
941 		zswap_has_pool = true;
942 	} else if (pool) {
943 		/* add the possibly pre-existing pool to the end of the pools
944 		 * list; if it's new (and empty) then it'll be removed and
945 		 * destroyed by the put after we drop the lock
946 		 */
947 		list_add_tail_rcu(&pool->list, &zswap_pools);
948 		put_pool = pool;
949 	}
950 
951 	spin_unlock(&zswap_pools_lock);
952 
953 	if (!zswap_has_pool && !pool) {
954 		/* if initial pool creation failed, and this pool creation also
955 		 * failed, maybe both compressor and zpool params were bad.
956 		 * Allow changing this param, so pool creation will succeed
957 		 * when the other param is changed. We already verified this
958 		 * param is ok in the zpool_has_pool() or crypto_has_acomp()
959 		 * checks above.
960 		 */
961 		ret = param_set_charp(s, kp);
962 	}
963 
964 	/* drop the ref from either the old current pool,
965 	 * or the new pool we failed to add
966 	 */
967 	if (put_pool)
968 		zswap_pool_put(put_pool);
969 
970 	return ret;
971 }
972 
973 static int zswap_compressor_param_set(const char *val,
974 				      const struct kernel_param *kp)
975 {
976 	return __zswap_param_set(val, kp, zswap_zpool_type, NULL);
977 }
978 
979 static int zswap_zpool_param_set(const char *val,
980 				 const struct kernel_param *kp)
981 {
982 	return __zswap_param_set(val, kp, NULL, zswap_compressor);
983 }
984 
985 static int zswap_enabled_param_set(const char *val,
986 				   const struct kernel_param *kp)
987 {
988 	int ret = -ENODEV;
989 
990 	/* if this is load-time (pre-init) param setting, only set param. */
991 	if (system_state != SYSTEM_RUNNING)
992 		return param_set_bool(val, kp);
993 
994 	mutex_lock(&zswap_init_lock);
995 	switch (zswap_init_state) {
996 	case ZSWAP_UNINIT:
997 		if (zswap_setup())
998 			break;
999 		fallthrough;
1000 	case ZSWAP_INIT_SUCCEED:
1001 		if (!zswap_has_pool)
1002 			pr_err("can't enable, no pool configured\n");
1003 		else
1004 			ret = param_set_bool(val, kp);
1005 		break;
1006 	case ZSWAP_INIT_FAILED:
1007 		pr_err("can't enable, initialization failed\n");
1008 	}
1009 	mutex_unlock(&zswap_init_lock);
1010 
1011 	return ret;
1012 }
1013 
1014 /*********************************
1015 * writeback code
1016 **********************************/
1017 /* return enum for zswap_get_swap_cache_page */
1018 enum zswap_get_swap_ret {
1019 	ZSWAP_SWAPCACHE_NEW,
1020 	ZSWAP_SWAPCACHE_EXIST,
1021 	ZSWAP_SWAPCACHE_FAIL,
1022 };
1023 
1024 /*
1025  * zswap_get_swap_cache_page
1026  *
1027  * This is an adaptation of read_swap_cache_async()
1028  *
1029  * This function tries to find a page with the given swap entry
1030  * in the swapper_space address space (the swap cache).  If the page
1031  * is found, it is returned in retpage.  Otherwise, a page is allocated,
1032  * added to the swap cache, and returned in retpage.
1033  *
1034  * On success, the swap cache page is returned in retpage
1035  * Returns ZSWAP_SWAPCACHE_EXIST if page was already in the swap cache
1036  * Returns ZSWAP_SWAPCACHE_NEW if the new page needs to be populated,
1037  *     the new page is added to swapcache and locked
1038  * Returns ZSWAP_SWAPCACHE_FAIL on error
1039  */
1040 static int zswap_get_swap_cache_page(swp_entry_t entry,
1041 				struct page **retpage)
1042 {
1043 	bool page_was_allocated;
1044 
1045 	*retpage = __read_swap_cache_async(entry, GFP_KERNEL,
1046 			NULL, 0, &page_was_allocated);
1047 	if (page_was_allocated)
1048 		return ZSWAP_SWAPCACHE_NEW;
1049 	if (!*retpage)
1050 		return ZSWAP_SWAPCACHE_FAIL;
1051 	return ZSWAP_SWAPCACHE_EXIST;
1052 }
1053 
1054 /*
1055  * Attempts to free an entry by adding a page to the swap cache,
1056  * decompressing the entry data into the page, and issuing a
1057  * bio write to write the page back to the swap device.
1058  *
1059  * This can be thought of as a "resumed writeback" of the page
1060  * to the swap device.  We are basically resuming the same swap
1061  * writeback path that was intercepted with the frontswap_store()
1062  * in the first place.  After the page has been decompressed into
1063  * the swap cache, the compressed version stored by zswap can be
1064  * freed.
1065  */
1066 static int zswap_writeback_entry(struct zswap_entry *entry,
1067 				 struct zswap_tree *tree)
1068 {
1069 	swp_entry_t swpentry = entry->swpentry;
1070 	struct page *page;
1071 	struct scatterlist input, output;
1072 	struct crypto_acomp_ctx *acomp_ctx;
1073 	struct zpool *pool = entry->pool->zpool;
1074 
1075 	u8 *src, *tmp = NULL;
1076 	unsigned int dlen;
1077 	int ret;
1078 	struct writeback_control wbc = {
1079 		.sync_mode = WB_SYNC_NONE,
1080 	};
1081 
1082 	if (!zpool_can_sleep_mapped(pool)) {
1083 		tmp = kmalloc(PAGE_SIZE, GFP_KERNEL);
1084 		if (!tmp)
1085 			return -ENOMEM;
1086 	}
1087 
1088 	/* try to allocate swap cache page */
1089 	switch (zswap_get_swap_cache_page(swpentry, &page)) {
1090 	case ZSWAP_SWAPCACHE_FAIL: /* no memory or invalidate happened */
1091 		ret = -ENOMEM;
1092 		goto fail;
1093 
1094 	case ZSWAP_SWAPCACHE_EXIST:
1095 		/* page is already in the swap cache, ignore for now */
1096 		put_page(page);
1097 		ret = -EEXIST;
1098 		goto fail;
1099 
1100 	case ZSWAP_SWAPCACHE_NEW: /* page is locked */
1101 		/*
1102 		 * Having a local reference to the zswap entry doesn't exclude
1103 		 * swapping from invalidating and recycling the swap slot. Once
1104 		 * the swapcache is secured against concurrent swapping to and
1105 		 * from the slot, recheck that the entry is still current before
1106 		 * writing.
1107 		 */
1108 		spin_lock(&tree->lock);
1109 		if (zswap_rb_search(&tree->rbroot, swp_offset(entry->swpentry)) != entry) {
1110 			spin_unlock(&tree->lock);
1111 			delete_from_swap_cache(page_folio(page));
1112 			ret = -ENOMEM;
1113 			goto fail;
1114 		}
1115 		spin_unlock(&tree->lock);
1116 
1117 		/* decompress */
1118 		acomp_ctx = raw_cpu_ptr(entry->pool->acomp_ctx);
1119 		dlen = PAGE_SIZE;
1120 
1121 		src = zpool_map_handle(pool, entry->handle, ZPOOL_MM_RO);
1122 		if (!zpool_can_sleep_mapped(pool)) {
1123 			memcpy(tmp, src, entry->length);
1124 			src = tmp;
1125 			zpool_unmap_handle(pool, entry->handle);
1126 		}
1127 
1128 		mutex_lock(acomp_ctx->mutex);
1129 		sg_init_one(&input, src, entry->length);
1130 		sg_init_table(&output, 1);
1131 		sg_set_page(&output, page, PAGE_SIZE, 0);
1132 		acomp_request_set_params(acomp_ctx->req, &input, &output, entry->length, dlen);
1133 		ret = crypto_wait_req(crypto_acomp_decompress(acomp_ctx->req), &acomp_ctx->wait);
1134 		dlen = acomp_ctx->req->dlen;
1135 		mutex_unlock(acomp_ctx->mutex);
1136 
1137 		if (!zpool_can_sleep_mapped(pool))
1138 			kfree(tmp);
1139 		else
1140 			zpool_unmap_handle(pool, entry->handle);
1141 
1142 		BUG_ON(ret);
1143 		BUG_ON(dlen != PAGE_SIZE);
1144 
1145 		/* page is up to date */
1146 		SetPageUptodate(page);
1147 	}
1148 
1149 	/* move it to the tail of the inactive list after end_writeback */
1150 	SetPageReclaim(page);
1151 
1152 	/* start writeback */
1153 	__swap_writepage(page, &wbc);
1154 	put_page(page);
1155 	zswap_written_back_pages++;
1156 
1157 	return ret;
1158 fail:
1159 	if (!zpool_can_sleep_mapped(pool))
1160 		kfree(tmp);
1161 
1162 	/*
1163 	 * if we get here due to ZSWAP_SWAPCACHE_EXIST,
1164 	 * a load may be happening concurrently;
1165 	 * it is safe and okay to not free the entry,
1166 	 * and it is also okay to return !0.
1167 	 */
1168 	return ret;
1169 }
1170 
1171 static int zswap_is_page_same_filled(void *ptr, unsigned long *value)
1172 {
1173 	unsigned long *page;
1174 	unsigned long val;
1175 	unsigned int pos, last_pos = PAGE_SIZE / sizeof(*page) - 1;
1176 
1177 	page = (unsigned long *)ptr;
1178 	val = page[0];
1179 
1180 	if (val != page[last_pos])
1181 		return 0;
1182 
1183 	for (pos = 1; pos < last_pos; pos++) {
1184 		if (val != page[pos])
1185 			return 0;
1186 	}
1187 
1188 	*value = val;
1189 
1190 	return 1;
1191 }
1192 
1193 static void zswap_fill_page(void *ptr, unsigned long value)
1194 {
1195 	unsigned long *page;
1196 
1197 	page = (unsigned long *)ptr;
1198 	memset_l(page, value, PAGE_SIZE / sizeof(unsigned long));
1199 }
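
/*
 * Example: a zeroed page, or one filled with a single repeating machine
 * word, is caught by zswap_is_page_same_filled(); the store path then
 * keeps only entry->value with entry->length = 0, consuming no zpool
 * space, and a later load rebuilds the page via zswap_fill_page()
 * instead of decompressing.
 */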
1200 
1201 /*********************************
1202 * frontswap hooks
1203 **********************************/
1204 /* attempts to compress and store a single page */
1205 static int zswap_frontswap_store(unsigned type, pgoff_t offset,
1206 				struct page *page)
1207 {
1208 	struct zswap_tree *tree = zswap_trees[type];
1209 	struct zswap_entry *entry, *dupentry;
1210 	struct scatterlist input, output;
1211 	struct crypto_acomp_ctx *acomp_ctx;
1212 	struct obj_cgroup *objcg = NULL;
1213 	struct zswap_pool *pool;
1214 	int ret;
1215 	unsigned int dlen = PAGE_SIZE;
1216 	unsigned long handle, value;
1217 	char *buf;
1218 	u8 *src, *dst;
1219 	gfp_t gfp;
1220 
1221 	/* THP isn't supported */
1222 	if (PageTransHuge(page)) {
1223 		ret = -EINVAL;
1224 		goto reject;
1225 	}
1226 
1227 	if (!zswap_enabled || !tree) {
1228 		ret = -ENODEV;
1229 		goto reject;
1230 	}
1231 
1232 	/*
1233 	 * XXX: zswap reclaim does not work with cgroups yet. Without a
1234 	 * cgroup-aware entry LRU, we will push out entries system-wide based on
1235 	 * local cgroup limits.
1236 	 */
1237 	objcg = get_obj_cgroup_from_page(page);
1238 	if (objcg && !obj_cgroup_may_zswap(objcg)) {
1239 		ret = -ENOMEM;
1240 		goto reject;
1241 	}
1242 
1243 	/* reclaim space if needed */
1244 	if (zswap_is_full()) {
1245 		zswap_pool_limit_hit++;
1246 		zswap_pool_reached_full = true;
1247 		goto shrink;
1248 	}
1249 
1250 	if (zswap_pool_reached_full) {
1251 		if (!zswap_can_accept()) {
1252 			ret = -ENOMEM;
1253 			goto shrink;
1254 		} else
1255 			zswap_pool_reached_full = false;
1256 	}
1257 
1258 	/* allocate entry */
1259 	entry = zswap_entry_cache_alloc(GFP_KERNEL);
1260 	if (!entry) {
1261 		zswap_reject_kmemcache_fail++;
1262 		ret = -ENOMEM;
1263 		goto reject;
1264 	}
1265 
1266 	if (zswap_same_filled_pages_enabled) {
1267 		src = kmap_atomic(page);
1268 		if (zswap_is_page_same_filled(src, &value)) {
1269 			kunmap_atomic(src);
1270 			entry->swpentry = swp_entry(type, offset);
1271 			entry->length = 0;
1272 			entry->value = value;
1273 			atomic_inc(&zswap_same_filled_pages);
1274 			goto insert_entry;
1275 		}
1276 		kunmap_atomic(src);
1277 	}
1278 
1279 	if (!zswap_non_same_filled_pages_enabled) {
1280 		ret = -EINVAL;
1281 		goto freepage;
1282 	}
1283 
1284 	/* if entry is successfully added, it keeps the reference */
1285 	entry->pool = zswap_pool_current_get();
1286 	if (!entry->pool) {
1287 		ret = -EINVAL;
1288 		goto freepage;
1289 	}
1290 
1291 	/* compress */
1292 	acomp_ctx = raw_cpu_ptr(entry->pool->acomp_ctx);
1293 
1294 	mutex_lock(acomp_ctx->mutex);
1295 
1296 	dst = acomp_ctx->dstmem;
1297 	sg_init_table(&input, 1);
1298 	sg_set_page(&input, page, PAGE_SIZE, 0);
1299 
1300 	/* zswap_dstmem is of size (PAGE_SIZE * 2); reflect this in the sg_list */
1301 	sg_init_one(&output, dst, PAGE_SIZE * 2);
1302 	acomp_request_set_params(acomp_ctx->req, &input, &output, PAGE_SIZE, dlen);
1303 	/*
1304 	 * It may look a little silly that we send an asynchronous request and
1305 	 * then wait for its completion synchronously; this makes the process
1306 	 * synchronous in fact.
1307 	 * Theoretically, acomp supports users sending multiple acomp requests to
1308 	 * one acomp instance and having those requests completed simultaneously.
1309 	 * But in this case, frontswap actually stores and loads page by page;
1310 	 * there is no existing method to send the second page before the first
1311 	 * page is done in one thread doing frontswap.
1312 	 * But in different threads running on different cpus, we have different
1313 	 * acomp instances, so multiple threads can do (de)compression in parallel.
1314 	 */
1315 	ret = crypto_wait_req(crypto_acomp_compress(acomp_ctx->req), &acomp_ctx->wait);
1316 	dlen = acomp_ctx->req->dlen;
1317 
1318 	if (ret) {
1319 		ret = -EINVAL;
1320 		goto put_dstmem;
1321 	}
1322 
1323 	/* store */
1324 	gfp = __GFP_NORETRY | __GFP_NOWARN | __GFP_KSWAPD_RECLAIM;
1325 	if (zpool_malloc_support_movable(entry->pool->zpool))
1326 		gfp |= __GFP_HIGHMEM | __GFP_MOVABLE;
1327 	ret = zpool_malloc(entry->pool->zpool, dlen, gfp, &handle);
1328 	if (ret == -ENOSPC) {
1329 		zswap_reject_compress_poor++;
1330 		goto put_dstmem;
1331 	}
1332 	if (ret) {
1333 		zswap_reject_alloc_fail++;
1334 		goto put_dstmem;
1335 	}
1336 	buf = zpool_map_handle(entry->pool->zpool, handle, ZPOOL_MM_WO);
1337 	memcpy(buf, dst, dlen);
1338 	zpool_unmap_handle(entry->pool->zpool, handle);
1339 	mutex_unlock(acomp_ctx->mutex);
1340 
1341 	/* populate entry */
1342 	entry->swpentry = swp_entry(type, offset);
1343 	entry->handle = handle;
1344 	entry->length = dlen;
1345 
1346 insert_entry:
1347 	entry->objcg = objcg;
1348 	if (objcg) {
1349 		obj_cgroup_charge_zswap(objcg, entry->length);
1350 		/* Account before objcg ref is moved to tree */
1351 		count_objcg_event(objcg, ZSWPOUT);
1352 	}
1353 
1354 	/* map */
1355 	spin_lock(&tree->lock);
1356 	do {
1357 		ret = zswap_rb_insert(&tree->rbroot, entry, &dupentry);
1358 		if (ret == -EEXIST) {
1359 			zswap_duplicate_entry++;
1360 			/* remove from rbtree */
1361 			zswap_rb_erase(&tree->rbroot, dupentry);
1362 			zswap_entry_put(tree, dupentry);
1363 		}
1364 	} while (ret == -EEXIST);
1365 	if (entry->length) {
1366 		spin_lock(&entry->pool->lru_lock);
1367 		list_add(&entry->lru, &entry->pool->lru);
1368 		spin_unlock(&entry->pool->lru_lock);
1369 	}
1370 	spin_unlock(&tree->lock);
1371 
1372 	/* update stats */
1373 	atomic_inc(&zswap_stored_pages);
1374 	zswap_update_total_size();
1375 	count_vm_event(ZSWPOUT);
1376 
1377 	return 0;
1378 
1379 put_dstmem:
1380 	mutex_unlock(acomp_ctx->mutex);
1381 	zswap_pool_put(entry->pool);
1382 freepage:
1383 	zswap_entry_cache_free(entry);
1384 reject:
1385 	if (objcg)
1386 		obj_cgroup_put(objcg);
1387 	return ret;
1388 
1389 shrink:
1390 	pool = zswap_pool_last_get();
1391 	if (pool)
1392 		queue_work(shrink_wq, &pool->shrink_work);
1393 	ret = -ENOMEM;
1394 	goto reject;
1395 }
1396 
1397 /*
1398  * returns 0 if the page was successfully decompressed
1399  * returns -1 if the entry was not found or on error
1400  */
1401 static int zswap_frontswap_load(unsigned type, pgoff_t offset,
1402 				struct page *page, bool *exclusive)
1403 {
1404 	struct zswap_tree *tree = zswap_trees[type];
1405 	struct zswap_entry *entry;
1406 	struct scatterlist input, output;
1407 	struct crypto_acomp_ctx *acomp_ctx;
1408 	u8 *src, *dst, *tmp;
1409 	unsigned int dlen;
1410 	int ret;
1411 
1412 	/* find */
1413 	spin_lock(&tree->lock);
1414 	entry = zswap_entry_find_get(&tree->rbroot, offset);
1415 	if (!entry) {
1416 		/* entry was written back */
1417 		spin_unlock(&tree->lock);
1418 		return -1;
1419 	}
1420 	spin_unlock(&tree->lock);
1421 
1422 	if (!entry->length) {
1423 		dst = kmap_atomic(page);
1424 		zswap_fill_page(dst, entry->value);
1425 		kunmap_atomic(dst);
1426 		ret = 0;
1427 		goto stats;
1428 	}
1429 
1430 	if (!zpool_can_sleep_mapped(entry->pool->zpool)) {
1431 		tmp = kmalloc(entry->length, GFP_KERNEL);
1432 		if (!tmp) {
1433 			ret = -ENOMEM;
1434 			goto freeentry;
1435 		}
1436 	}
1437 
1438 	/* decompress */
1439 	dlen = PAGE_SIZE;
1440 	src = zpool_map_handle(entry->pool->zpool, entry->handle, ZPOOL_MM_RO);
1441 
1442 	if (!zpool_can_sleep_mapped(entry->pool->zpool)) {
1443 		memcpy(tmp, src, entry->length);
1444 		src = tmp;
1445 		zpool_unmap_handle(entry->pool->zpool, entry->handle);
1446 	}
1447 
1448 	acomp_ctx = raw_cpu_ptr(entry->pool->acomp_ctx);
1449 	mutex_lock(acomp_ctx->mutex);
1450 	sg_init_one(&input, src, entry->length);
1451 	sg_init_table(&output, 1);
1452 	sg_set_page(&output, page, PAGE_SIZE, 0);
1453 	acomp_request_set_params(acomp_ctx->req, &input, &output, entry->length, dlen);
1454 	ret = crypto_wait_req(crypto_acomp_decompress(acomp_ctx->req), &acomp_ctx->wait);
1455 	mutex_unlock(acomp_ctx->mutex);
1456 
1457 	if (zpool_can_sleep_mapped(entry->pool->zpool))
1458 		zpool_unmap_handle(entry->pool->zpool, entry->handle);
1459 	else
1460 		kfree(tmp);
1461 
1462 	BUG_ON(ret);
1463 stats:
1464 	count_vm_event(ZSWPIN);
1465 	if (entry->objcg)
1466 		count_objcg_event(entry->objcg, ZSWPIN);
1467 freeentry:
1468 	spin_lock(&tree->lock);
1469 	zswap_entry_put(tree, entry);
1470 	if (!ret && zswap_exclusive_loads_enabled) {
1471 		zswap_invalidate_entry(tree, entry);
1472 		*exclusive = true;
1473 	} else if (entry->length) {
1474 		spin_lock(&entry->pool->lru_lock);
1475 		list_move(&entry->lru, &entry->pool->lru);
1476 		spin_unlock(&entry->pool->lru_lock);
1477 	}
1478 	spin_unlock(&tree->lock);
1479 
1480 	return ret;
1481 }
1482 
1483 /* frees an entry in zswap */
1484 static void zswap_frontswap_invalidate_page(unsigned type, pgoff_t offset)
1485 {
1486 	struct zswap_tree *tree = zswap_trees[type];
1487 	struct zswap_entry *entry;
1488 
1489 	/* find */
1490 	spin_lock(&tree->lock);
1491 	entry = zswap_rb_search(&tree->rbroot, offset);
1492 	if (!entry) {
1493 		/* entry was written back */
1494 		spin_unlock(&tree->lock);
1495 		return;
1496 	}
1497 	zswap_invalidate_entry(tree, entry);
1498 	spin_unlock(&tree->lock);
1499 }
1500 
1501 /* frees all zswap entries for the given swap type */
1502 static void zswap_frontswap_invalidate_area(unsigned type)
1503 {
1504 	struct zswap_tree *tree = zswap_trees[type];
1505 	struct zswap_entry *entry, *n;
1506 
1507 	if (!tree)
1508 		return;
1509 
1510 	/* walk the tree and free everything */
1511 	spin_lock(&tree->lock);
1512 	rbtree_postorder_for_each_entry_safe(entry, n, &tree->rbroot, rbnode)
1513 		zswap_free_entry(entry);
1514 	tree->rbroot = RB_ROOT;
1515 	spin_unlock(&tree->lock);
1516 	kfree(tree);
1517 	zswap_trees[type] = NULL;
1518 }
1519 
1520 static void zswap_frontswap_init(unsigned type)
1521 {
1522 	struct zswap_tree *tree;
1523 
1524 	tree = kzalloc(sizeof(*tree), GFP_KERNEL);
1525 	if (!tree) {
1526 		pr_err("alloc failed, zswap disabled for swap type %d\n", type);
1527 		return;
1528 	}
1529 
1530 	tree->rbroot = RB_ROOT;
1531 	spin_lock_init(&tree->lock);
1532 	zswap_trees[type] = tree;
1533 }
1534 
1535 static const struct frontswap_ops zswap_frontswap_ops = {
1536 	.store = zswap_frontswap_store,
1537 	.load = zswap_frontswap_load,
1538 	.invalidate_page = zswap_frontswap_invalidate_page,
1539 	.invalidate_area = zswap_frontswap_invalidate_area,
1540 	.init = zswap_frontswap_init
1541 };
1542 
1543 /*********************************
1544 * debugfs functions
1545 **********************************/
1546 #ifdef CONFIG_DEBUG_FS
1547 #include <linux/debugfs.h>
1548 
1549 static struct dentry *zswap_debugfs_root;
1550 
1551 static int zswap_debugfs_init(void)
1552 {
1553 	if (!debugfs_initialized())
1554 		return -ENODEV;
1555 
1556 	zswap_debugfs_root = debugfs_create_dir("zswap", NULL);
1557 
1558 	debugfs_create_u64("pool_limit_hit", 0444,
1559 			   zswap_debugfs_root, &zswap_pool_limit_hit);
1560 	debugfs_create_u64("reject_reclaim_fail", 0444,
1561 			   zswap_debugfs_root, &zswap_reject_reclaim_fail);
1562 	debugfs_create_u64("reject_alloc_fail", 0444,
1563 			   zswap_debugfs_root, &zswap_reject_alloc_fail);
1564 	debugfs_create_u64("reject_kmemcache_fail", 0444,
1565 			   zswap_debugfs_root, &zswap_reject_kmemcache_fail);
1566 	debugfs_create_u64("reject_compress_poor", 0444,
1567 			   zswap_debugfs_root, &zswap_reject_compress_poor);
1568 	debugfs_create_u64("written_back_pages", 0444,
1569 			   zswap_debugfs_root, &zswap_written_back_pages);
1570 	debugfs_create_u64("duplicate_entry", 0444,
1571 			   zswap_debugfs_root, &zswap_duplicate_entry);
1572 	debugfs_create_u64("pool_total_size", 0444,
1573 			   zswap_debugfs_root, &zswap_pool_total_size);
1574 	debugfs_create_atomic_t("stored_pages", 0444,
1575 				zswap_debugfs_root, &zswap_stored_pages);
1576 	debugfs_create_atomic_t("same_filled_pages", 0444,
1577 				zswap_debugfs_root, &zswap_same_filled_pages);
1578 
1579 	return 0;
1580 }
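
/*
 * Illustrative usage: with debugfs mounted at /sys/kernel/debug, the
 * counters above can be read from userspace, e.g.:
 *
 *   cat /sys/kernel/debug/zswap/pool_total_size
 *   cat /sys/kernel/debug/zswap/stored_pages
 *
 * A rough compression ratio is (stored_pages * PAGE_SIZE) /
 * pool_total_size (note that same-filled pages count as stored but use
 * no pool space).
 */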
1581 #else
1582 static int zswap_debugfs_init(void)
1583 {
1584 	return 0;
1585 }
1586 #endif
1587 
1588 /*********************************
1589 * module init and exit
1590 **********************************/
1591 static int zswap_setup(void)
1592 {
1593 	struct zswap_pool *pool;
1594 	int ret;
1595 
1596 	zswap_entry_cache = KMEM_CACHE(zswap_entry, 0);
1597 	if (!zswap_entry_cache) {
1598 		pr_err("entry cache creation failed\n");
1599 		goto cache_fail;
1600 	}
1601 
1602 	ret = cpuhp_setup_state(CPUHP_MM_ZSWP_MEM_PREPARE, "mm/zswap:prepare",
1603 				zswap_dstmem_prepare, zswap_dstmem_dead);
1604 	if (ret) {
1605 		pr_err("dstmem alloc failed\n");
1606 		goto dstmem_fail;
1607 	}
1608 
1609 	ret = cpuhp_setup_state_multi(CPUHP_MM_ZSWP_POOL_PREPARE,
1610 				      "mm/zswap_pool:prepare",
1611 				      zswap_cpu_comp_prepare,
1612 				      zswap_cpu_comp_dead);
1613 	if (ret)
1614 		goto hp_fail;
1615 
1616 	pool = __zswap_pool_create_fallback();
1617 	if (pool) {
1618 		pr_info("loaded using pool %s/%s\n", pool->tfm_name,
1619 			zpool_get_type(pool->zpool));
1620 		list_add(&pool->list, &zswap_pools);
1621 		zswap_has_pool = true;
1622 	} else {
1623 		pr_err("pool creation failed\n");
1624 		zswap_enabled = false;
1625 	}
1626 
1627 	shrink_wq = create_workqueue("zswap-shrink");
1628 	if (!shrink_wq)
1629 		goto fallback_fail;
1630 
1631 	ret = frontswap_register_ops(&zswap_frontswap_ops);
1632 	if (ret)
1633 		goto destroy_wq;
1634 	if (zswap_debugfs_init())
1635 		pr_warn("debugfs initialization failed\n");
1636 	zswap_init_state = ZSWAP_INIT_SUCCEED;
1637 	return 0;
1638 
1639 destroy_wq:
1640 	destroy_workqueue(shrink_wq);
1641 fallback_fail:
1642 	if (pool)
1643 		zswap_pool_destroy(pool);
1644 hp_fail:
1645 	cpuhp_remove_state(CPUHP_MM_ZSWP_MEM_PREPARE);
1646 dstmem_fail:
1647 	kmem_cache_destroy(zswap_entry_cache);
1648 cache_fail:
1649 	/* if built-in, we aren't unloaded on failure; don't allow use */
1650 	zswap_init_state = ZSWAP_INIT_FAILED;
1651 	zswap_enabled = false;
1652 	return -ENOMEM;
1653 }
1654 
1655 static int __init zswap_init(void)
1656 {
1657 	if (!zswap_enabled)
1658 		return 0;
1659 	return zswap_setup();
1660 }
1661 /* must be late so crypto has time to come up */
1662 late_initcall(zswap_init);
1663 
1664 MODULE_AUTHOR("Seth Jennings <sjennings@variantweb.net>");
1665 MODULE_DESCRIPTION("Compressed cache for swap pages");
1666