xref: /openbmc/linux/mm/zswap.c (revision c33c7948)
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * zswap.c - zswap driver file
4  *
5  * zswap is a backend for frontswap that takes pages that are in the process
6  * of being swapped out and attempts to compress and store them in a
7  * RAM-based memory pool.  This can result in a significant I/O reduction on
8  * the swap device and, in the case where decompressing from RAM is faster
9  * than reading from the swap device, can also improve workload performance.
10  *
11  * Copyright (C) 2012  Seth Jennings <sjenning@linux.vnet.ibm.com>
12 */
13 
14 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
15 
16 #include <linux/module.h>
17 #include <linux/cpu.h>
18 #include <linux/highmem.h>
19 #include <linux/slab.h>
20 #include <linux/spinlock.h>
21 #include <linux/types.h>
22 #include <linux/atomic.h>
23 #include <linux/frontswap.h>
24 #include <linux/rbtree.h>
25 #include <linux/swap.h>
26 #include <linux/crypto.h>
27 #include <linux/scatterlist.h>
28 #include <linux/mempool.h>
29 #include <linux/zpool.h>
30 #include <crypto/acompress.h>
31 
32 #include <linux/mm_types.h>
33 #include <linux/page-flags.h>
34 #include <linux/swapops.h>
35 #include <linux/writeback.h>
36 #include <linux/pagemap.h>
37 #include <linux/workqueue.h>
38 
39 #include "swap.h"
40 #include "internal.h"
41 
42 /*********************************
43 * statistics
44 **********************************/
45 /* Total bytes used by the compressed storage */
46 u64 zswap_pool_total_size;
47 /* The number of compressed pages currently stored in zswap */
48 atomic_t zswap_stored_pages = ATOMIC_INIT(0);
49 /* The number of same-value filled pages currently stored in zswap */
50 static atomic_t zswap_same_filled_pages = ATOMIC_INIT(0);
51 
52 /*
53  * The statistics below are not protected from concurrent access for
54  * performance reasons, so they may not be 100% accurate.  However,
55  * they do provide useful information on roughly how many times a
56  * certain event is occurring.
57 */
58 
59 /* Pool limit was hit (see zswap_max_pool_percent) */
60 static u64 zswap_pool_limit_hit;
61 /* Pages written back when pool limit was reached */
62 static u64 zswap_written_back_pages;
63 /* Store failed due to a reclaim failure after pool limit was reached */
64 static u64 zswap_reject_reclaim_fail;
65 /* Compressed page was too big for the allocator to (optimally) store */
66 static u64 zswap_reject_compress_poor;
67 /* Store failed because underlying allocator could not get memory */
68 static u64 zswap_reject_alloc_fail;
69 /* Store failed because the entry metadata could not be allocated (rare) */
70 static u64 zswap_reject_kmemcache_fail;
71 /* Duplicate store was encountered (rare) */
72 static u64 zswap_duplicate_entry;
73 
74 /* Shrinker work queue */
75 static struct workqueue_struct *shrink_wq;
76 /* Pool limit was hit, we need to calm down */
77 static bool zswap_pool_reached_full;
78 
79 /*********************************
80 * tunables
81 **********************************/
82 
83 #define ZSWAP_PARAM_UNSET ""
84 
85 static int zswap_setup(void);
86 
87 /* Enable/disable zswap */
88 static bool zswap_enabled = IS_ENABLED(CONFIG_ZSWAP_DEFAULT_ON);
89 static int zswap_enabled_param_set(const char *,
90 				   const struct kernel_param *);
91 static const struct kernel_param_ops zswap_enabled_param_ops = {
92 	.set =		zswap_enabled_param_set,
93 	.get =		param_get_bool,
94 };
95 module_param_cb(enabled, &zswap_enabled_param_ops, &zswap_enabled, 0644);
96 
97 /* Crypto compressor to use */
98 static char *zswap_compressor = CONFIG_ZSWAP_COMPRESSOR_DEFAULT;
99 static int zswap_compressor_param_set(const char *,
100 				      const struct kernel_param *);
101 static const struct kernel_param_ops zswap_compressor_param_ops = {
102 	.set =		zswap_compressor_param_set,
103 	.get =		param_get_charp,
104 	.free =		param_free_charp,
105 };
106 module_param_cb(compressor, &zswap_compressor_param_ops,
107 		&zswap_compressor, 0644);
108 
109 /* Compressed storage zpool to use */
110 static char *zswap_zpool_type = CONFIG_ZSWAP_ZPOOL_DEFAULT;
111 static int zswap_zpool_param_set(const char *, const struct kernel_param *);
112 static const struct kernel_param_ops zswap_zpool_param_ops = {
113 	.set =		zswap_zpool_param_set,
114 	.get =		param_get_charp,
115 	.free =		param_free_charp,
116 };
117 module_param_cb(zpool, &zswap_zpool_param_ops, &zswap_zpool_type, 0644);
118 
119 /* The maximum percentage of memory that the compressed pool can occupy */
120 static unsigned int zswap_max_pool_percent = 20;
121 module_param_named(max_pool_percent, zswap_max_pool_percent, uint, 0644);
122 
123 /* The threshold for accepting new pages after the max_pool_percent was hit */
124 static unsigned int zswap_accept_thr_percent = 90; /* of max pool size */
125 module_param_named(accept_threshold_percent, zswap_accept_thr_percent,
126 		   uint, 0644);
127 
128 /*
129  * Enable/disable handling same-value filled pages (enabled by default).
130  * If disabled, every page is considered non-same-value filled.
131  */
132 static bool zswap_same_filled_pages_enabled = true;
133 module_param_named(same_filled_pages_enabled, zswap_same_filled_pages_enabled,
134 		   bool, 0644);
135 
136 /* Enable/disable handling non-same-value filled pages (enabled by default) */
137 static bool zswap_non_same_filled_pages_enabled = true;
138 module_param_named(non_same_filled_pages_enabled, zswap_non_same_filled_pages_enabled,
139 		   bool, 0644);
140 
141 static bool zswap_exclusive_loads_enabled = IS_ENABLED(
142 		CONFIG_ZSWAP_EXCLUSIVE_LOADS_DEFAULT_ON);
143 module_param_named(exclusive_loads, zswap_exclusive_loads_enabled, bool, 0644);
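
/*
 * Usage note (illustrative values, not taken from this file): all of the
 * tunables above are module parameters, so they can be set at runtime via
 * /sys/module/zswap/parameters/ or on the kernel command line, e.g.
 *   echo lz4 > /sys/module/zswap/parameters/compressor
 *   zswap.enabled=1 zswap.zpool=zsmalloc zswap.max_pool_percent=25
 */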
144 
145 /*********************************
146 * data structures
147 **********************************/
148 
149 struct crypto_acomp_ctx {
150 	struct crypto_acomp *acomp;
151 	struct acomp_req *req;
152 	struct crypto_wait wait;
153 	u8 *dstmem;
154 	struct mutex *mutex;
155 };
156 
157 struct zswap_pool {
158 	struct zpool *zpool;
159 	struct crypto_acomp_ctx __percpu *acomp_ctx;
160 	struct kref kref;
161 	struct list_head list;
162 	struct work_struct release_work;
163 	struct work_struct shrink_work;
164 	struct hlist_node node;
165 	char tfm_name[CRYPTO_MAX_ALG_NAME];
166 };
167 
168 /*
169  * struct zswap_entry
170  *
171  * This structure contains the metadata for tracking a single compressed
172  * page within zswap.
173  *
174  * rbnode - links the entry into the red-black tree for the appropriate swap type
175  * offset - the swap offset for the entry.  Index into the red-black tree.
176  * refcount - the number of outstanding references to the entry. This is needed
177  *            to protect against premature freeing of the entry by concurrent
178  *            calls to load, invalidate, and writeback.  The lock
179  *            for the zswap_tree structure that contains the entry must
180  *            be held while changing the refcount.  Since the lock must
181  *            be held, there is no reason to also make refcount atomic.
182  * length - the length in bytes of the compressed page data.  Needed during
183  *          decompression. For a same-value filled page, length is 0.
184  * pool - the zswap_pool the entry's data is in
185  * handle - zpool allocation handle that stores the compressed page data
186  * value - the repeated word value of a same-value filled page
187  */
188 struct zswap_entry {
189 	struct rb_node rbnode;
190 	pgoff_t offset;
191 	int refcount;
192 	unsigned int length;
193 	struct zswap_pool *pool;
194 	union {
195 		unsigned long handle;
196 		unsigned long value;
197 	};
198 	struct obj_cgroup *objcg;
199 };
200 
201 struct zswap_header {
202 	swp_entry_t swpentry;
203 };
204 
205 /*
206  * The tree lock in the zswap_tree struct protects a few things:
207  * - the rbtree
208  * - the refcount field of each entry in the tree
209  */
210 struct zswap_tree {
211 	struct rb_root rbroot;
212 	spinlock_t lock;
213 };
214 
215 static struct zswap_tree *zswap_trees[MAX_SWAPFILES];
216 
217 /* RCU-protected iteration */
218 static LIST_HEAD(zswap_pools);
219 /* protects zswap_pools list modification */
220 static DEFINE_SPINLOCK(zswap_pools_lock);
221 /* pool counter to provide unique names to zpool */
222 static atomic_t zswap_pools_count = ATOMIC_INIT(0);
223 
224 enum zswap_init_type {
225 	ZSWAP_UNINIT,
226 	ZSWAP_INIT_SUCCEED,
227 	ZSWAP_INIT_FAILED
228 };
229 
230 static enum zswap_init_type zswap_init_state;
231 
232 /* used to ensure the integrity of initialization */
233 static DEFINE_MUTEX(zswap_init_lock);
234 
235 /* set once a pool has been created; init may complete without one */
236 static bool zswap_has_pool;
237 
238 /*********************************
239 * helpers and fwd declarations
240 **********************************/
241 
242 #define zswap_pool_debug(msg, p)				\
243 	pr_debug("%s pool %s/%s\n", msg, (p)->tfm_name,		\
244 		 zpool_get_type((p)->zpool))
245 
246 static int zswap_writeback_entry(struct zpool *pool, unsigned long handle);
247 static int zswap_pool_get(struct zswap_pool *pool);
248 static void zswap_pool_put(struct zswap_pool *pool);
249 
250 static const struct zpool_ops zswap_zpool_ops = {
251 	.evict = zswap_writeback_entry
252 };
253 
254 static bool zswap_is_full(void)
255 {
256 	return totalram_pages() * zswap_max_pool_percent / 100 <
257 			DIV_ROUND_UP(zswap_pool_total_size, PAGE_SIZE);
258 }
259 
260 static bool zswap_can_accept(void)
261 {
262 	return totalram_pages() * zswap_accept_thr_percent / 100 *
263 				zswap_max_pool_percent / 100 >
264 			DIV_ROUND_UP(zswap_pool_total_size, PAGE_SIZE);
265 }
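
/*
 * Worked example (illustrative numbers): with 4 GiB of RAM (1048576 4K
 * pages), max_pool_percent=20 and accept_threshold_percent=90,
 * zswap_is_full() reports full once the pool exceeds 209715 pages
 * (~819 MiB), and zswap_can_accept() lets new stores in again only after
 * the pool has shrunk below 188743 pages (~737 MiB).
 */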
266 
267 static void zswap_update_total_size(void)
268 {
269 	struct zswap_pool *pool;
270 	u64 total = 0;
271 
272 	rcu_read_lock();
273 
274 	list_for_each_entry_rcu(pool, &zswap_pools, list)
275 		total += zpool_get_total_size(pool->zpool);
276 
277 	rcu_read_unlock();
278 
279 	zswap_pool_total_size = total;
280 }
281 
282 /*********************************
283 * zswap entry functions
284 **********************************/
285 static struct kmem_cache *zswap_entry_cache;
286 
287 static struct zswap_entry *zswap_entry_cache_alloc(gfp_t gfp)
288 {
289 	struct zswap_entry *entry;
290 	entry = kmem_cache_alloc(zswap_entry_cache, gfp);
291 	if (!entry)
292 		return NULL;
293 	entry->refcount = 1;
294 	RB_CLEAR_NODE(&entry->rbnode);
295 	return entry;
296 }
297 
298 static void zswap_entry_cache_free(struct zswap_entry *entry)
299 {
300 	kmem_cache_free(zswap_entry_cache, entry);
301 }
302 
303 /*********************************
304 * rbtree functions
305 **********************************/
306 static struct zswap_entry *zswap_rb_search(struct rb_root *root, pgoff_t offset)
307 {
308 	struct rb_node *node = root->rb_node;
309 	struct zswap_entry *entry;
310 
311 	while (node) {
312 		entry = rb_entry(node, struct zswap_entry, rbnode);
313 		if (entry->offset > offset)
314 			node = node->rb_left;
315 		else if (entry->offset < offset)
316 			node = node->rb_right;
317 		else
318 			return entry;
319 	}
320 	return NULL;
321 }
322 
323 /*
324  * In the case that an entry with the same offset is found, a pointer to
325  * the existing entry is stored in dupentry and the function returns -EEXIST
326  */
327 static int zswap_rb_insert(struct rb_root *root, struct zswap_entry *entry,
328 			struct zswap_entry **dupentry)
329 {
330 	struct rb_node **link = &root->rb_node, *parent = NULL;
331 	struct zswap_entry *myentry;
332 
333 	while (*link) {
334 		parent = *link;
335 		myentry = rb_entry(parent, struct zswap_entry, rbnode);
336 		if (myentry->offset > entry->offset)
337 			link = &(*link)->rb_left;
338 		else if (myentry->offset < entry->offset)
339 			link = &(*link)->rb_right;
340 		else {
341 			*dupentry = myentry;
342 			return -EEXIST;
343 		}
344 	}
345 	rb_link_node(&entry->rbnode, parent, link);
346 	rb_insert_color(&entry->rbnode, root);
347 	return 0;
348 }
349 
350 static void zswap_rb_erase(struct rb_root *root, struct zswap_entry *entry)
351 {
352 	if (!RB_EMPTY_NODE(&entry->rbnode)) {
353 		rb_erase(&entry->rbnode, root);
354 		RB_CLEAR_NODE(&entry->rbnode);
355 	}
356 }
357 
358 /*
359  * Carries out the common pattern of freeing an entry's zpool allocation,
360  * freeing the entry itself, and decrementing the number of stored pages.
361  */
362 static void zswap_free_entry(struct zswap_entry *entry)
363 {
364 	if (entry->objcg) {
365 		obj_cgroup_uncharge_zswap(entry->objcg, entry->length);
366 		obj_cgroup_put(entry->objcg);
367 	}
368 	if (!entry->length)
369 		atomic_dec(&zswap_same_filled_pages);
370 	else {
371 		zpool_free(entry->pool->zpool, entry->handle);
372 		zswap_pool_put(entry->pool);
373 	}
374 	zswap_entry_cache_free(entry);
375 	atomic_dec(&zswap_stored_pages);
376 	zswap_update_total_size();
377 }
378 
379 /* caller must hold the tree lock */
380 static void zswap_entry_get(struct zswap_entry *entry)
381 {
382 	entry->refcount++;
383 }
384 
385 /* caller must hold the tree lock
386  * remove from the tree and free it if nobody references the entry
387  */
388 static void zswap_entry_put(struct zswap_tree *tree,
389 			struct zswap_entry *entry)
390 {
391 	int refcount = --entry->refcount;
392 
393 	BUG_ON(refcount < 0);
394 	if (refcount == 0) {
395 		zswap_rb_erase(&tree->rbroot, entry);
396 		zswap_free_entry(entry);
397 	}
398 }
399 
400 /* caller must hold the tree lock */
401 static struct zswap_entry *zswap_entry_find_get(struct rb_root *root,
402 				pgoff_t offset)
403 {
404 	struct zswap_entry *entry;
405 
406 	entry = zswap_rb_search(root, offset);
407 	if (entry)
408 		zswap_entry_get(entry);
409 
410 	return entry;
411 }
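
/*
 * A sketch of the lookup pattern used by the load/invalidate/writeback
 * paths below (existing calls only, nothing new):
 *
 *	spin_lock(&tree->lock);
 *	entry = zswap_entry_find_get(&tree->rbroot, offset);
 *	spin_unlock(&tree->lock);
 *	if (entry) {
 *		... use entry->handle or entry->value ...
 *		spin_lock(&tree->lock);
 *		zswap_entry_put(tree, entry);	// drop the local reference
 *		spin_unlock(&tree->lock);
 *	}
 */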
412 
413 /*********************************
414 * per-cpu code
415 **********************************/
416 static DEFINE_PER_CPU(u8 *, zswap_dstmem);
417 /*
418  * If users dynamically change the zpool type and compressor at runtime, i.e.
419  * while zswap is running, zswap can have more than one zpool on one cpu, but
420  * they share dstmem. So we need this mutex to be per-cpu.
421  */
422 static DEFINE_PER_CPU(struct mutex *, zswap_mutex);
423 
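/*
 * Note: the per-cpu dstmem buffer below is two pages so that compressor
 * output larger than PAGE_SIZE (a poorly compressible page) still has room
 * to land; whether such a result is worth keeping is decided later, when
 * zpool_malloc() is asked to store it.
 */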
424 static int zswap_dstmem_prepare(unsigned int cpu)
425 {
426 	struct mutex *mutex;
427 	u8 *dst;
428 
429 	dst = kmalloc_node(PAGE_SIZE * 2, GFP_KERNEL, cpu_to_node(cpu));
430 	if (!dst)
431 		return -ENOMEM;
432 
433 	mutex = kmalloc_node(sizeof(*mutex), GFP_KERNEL, cpu_to_node(cpu));
434 	if (!mutex) {
435 		kfree(dst);
436 		return -ENOMEM;
437 	}
438 
439 	mutex_init(mutex);
440 	per_cpu(zswap_dstmem, cpu) = dst;
441 	per_cpu(zswap_mutex, cpu) = mutex;
442 	return 0;
443 }
444 
445 static int zswap_dstmem_dead(unsigned int cpu)
446 {
447 	struct mutex *mutex;
448 	u8 *dst;
449 
450 	mutex = per_cpu(zswap_mutex, cpu);
451 	kfree(mutex);
452 	per_cpu(zswap_mutex, cpu) = NULL;
453 
454 	dst = per_cpu(zswap_dstmem, cpu);
455 	kfree(dst);
456 	per_cpu(zswap_dstmem, cpu) = NULL;
457 
458 	return 0;
459 }
460 
461 static int zswap_cpu_comp_prepare(unsigned int cpu, struct hlist_node *node)
462 {
463 	struct zswap_pool *pool = hlist_entry(node, struct zswap_pool, node);
464 	struct crypto_acomp_ctx *acomp_ctx = per_cpu_ptr(pool->acomp_ctx, cpu);
465 	struct crypto_acomp *acomp;
466 	struct acomp_req *req;
467 
468 	acomp = crypto_alloc_acomp_node(pool->tfm_name, 0, 0, cpu_to_node(cpu));
469 	if (IS_ERR(acomp)) {
470 		pr_err("could not alloc crypto acomp %s : %ld\n",
471 				pool->tfm_name, PTR_ERR(acomp));
472 		return PTR_ERR(acomp);
473 	}
474 	acomp_ctx->acomp = acomp;
475 
476 	req = acomp_request_alloc(acomp_ctx->acomp);
477 	if (!req) {
478 		pr_err("could not alloc crypto acomp_request %s\n",
479 		       pool->tfm_name);
480 		crypto_free_acomp(acomp_ctx->acomp);
481 		return -ENOMEM;
482 	}
483 	acomp_ctx->req = req;
484 
485 	crypto_init_wait(&acomp_ctx->wait);
486 	/*
487 	 * if the backend of acomp is an async zip, crypto_req_done() will wake up
488 	 * crypto_wait_req(); if the backend of acomp is scomp, the callback
489 	 * won't be called and crypto_wait_req() will return without blocking.
490 	 */
491 	acomp_request_set_callback(req, CRYPTO_TFM_REQ_MAY_BACKLOG,
492 				   crypto_req_done, &acomp_ctx->wait);
493 
494 	acomp_ctx->mutex = per_cpu(zswap_mutex, cpu);
495 	acomp_ctx->dstmem = per_cpu(zswap_dstmem, cpu);
496 
497 	return 0;
498 }
499 
500 static int zswap_cpu_comp_dead(unsigned int cpu, struct hlist_node *node)
501 {
502 	struct zswap_pool *pool = hlist_entry(node, struct zswap_pool, node);
503 	struct crypto_acomp_ctx *acomp_ctx = per_cpu_ptr(pool->acomp_ctx, cpu);
504 
505 	if (!IS_ERR_OR_NULL(acomp_ctx)) {
506 		if (!IS_ERR_OR_NULL(acomp_ctx->req))
507 			acomp_request_free(acomp_ctx->req);
508 		if (!IS_ERR_OR_NULL(acomp_ctx->acomp))
509 			crypto_free_acomp(acomp_ctx->acomp);
510 	}
511 
512 	return 0;
513 }
514 
515 /*********************************
516 * pool functions
517 **********************************/
518 
519 static struct zswap_pool *__zswap_pool_current(void)
520 {
521 	struct zswap_pool *pool;
522 
523 	pool = list_first_or_null_rcu(&zswap_pools, typeof(*pool), list);
524 	WARN_ONCE(!pool && zswap_has_pool,
525 		  "%s: no page storage pool!\n", __func__);
526 
527 	return pool;
528 }
529 
530 static struct zswap_pool *zswap_pool_current(void)
531 {
532 	assert_spin_locked(&zswap_pools_lock);
533 
534 	return __zswap_pool_current();
535 }
536 
537 static struct zswap_pool *zswap_pool_current_get(void)
538 {
539 	struct zswap_pool *pool;
540 
541 	rcu_read_lock();
542 
543 	pool = __zswap_pool_current();
544 	if (!zswap_pool_get(pool))
545 		pool = NULL;
546 
547 	rcu_read_unlock();
548 
549 	return pool;
550 }
551 
552 static struct zswap_pool *zswap_pool_last_get(void)
553 {
554 	struct zswap_pool *pool, *last = NULL;
555 
556 	rcu_read_lock();
557 
558 	list_for_each_entry_rcu(pool, &zswap_pools, list)
559 		last = pool;
560 	WARN_ONCE(!last && zswap_has_pool,
561 		  "%s: no page storage pool!\n", __func__);
562 	if (!zswap_pool_get(last))
563 		last = NULL;
564 
565 	rcu_read_unlock();
566 
567 	return last;
568 }
569 
570 /* type and compressor must be null-terminated */
571 static struct zswap_pool *zswap_pool_find_get(char *type, char *compressor)
572 {
573 	struct zswap_pool *pool;
574 
575 	assert_spin_locked(&zswap_pools_lock);
576 
577 	list_for_each_entry_rcu(pool, &zswap_pools, list) {
578 		if (strcmp(pool->tfm_name, compressor))
579 			continue;
580 		if (strcmp(zpool_get_type(pool->zpool), type))
581 			continue;
582 		/* if we can't get it, it's about to be destroyed */
583 		if (!zswap_pool_get(pool))
584 			continue;
585 		return pool;
586 	}
587 
588 	return NULL;
589 }
590 
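/*
 * shrink_worker() runs on shrink_wq and is queued from the store path when
 * the pool limit (or a memcg's zswap limit) is hit.  It asks the zpool to
 * evict (write back) stored objects one at a time until zswap_can_accept()
 * returns true, giving up on any error other than -EAGAIN or after
 * MAX_RECLAIM_RETRIES failed attempts.
 */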
591 static void shrink_worker(struct work_struct *w)
592 {
593 	struct zswap_pool *pool = container_of(w, typeof(*pool),
594 						shrink_work);
595 	int ret, failures = 0;
596 
597 	do {
598 		ret = zpool_shrink(pool->zpool, 1, NULL);
599 		if (ret) {
600 			zswap_reject_reclaim_fail++;
601 			if (ret != -EAGAIN)
602 				break;
603 			if (++failures == MAX_RECLAIM_RETRIES)
604 				break;
605 		}
606 		cond_resched();
607 	} while (!zswap_can_accept());
608 	zswap_pool_put(pool);
609 }
610 
611 static struct zswap_pool *zswap_pool_create(char *type, char *compressor)
612 {
613 	struct zswap_pool *pool;
614 	char name[38]; /* 'zswap' + 32 char (max) num + \0 */
615 	gfp_t gfp = __GFP_NORETRY | __GFP_NOWARN | __GFP_KSWAPD_RECLAIM;
616 	int ret;
617 
618 	if (!zswap_has_pool) {
619 		/* if either is unset, pool initialization failed, and we
620 		 * need both params to be set correctly before trying to
621 		 * create a pool.
622 		 */
623 		if (!strcmp(type, ZSWAP_PARAM_UNSET))
624 			return NULL;
625 		if (!strcmp(compressor, ZSWAP_PARAM_UNSET))
626 			return NULL;
627 	}
628 
629 	pool = kzalloc(sizeof(*pool), GFP_KERNEL);
630 	if (!pool)
631 		return NULL;
632 
633 	/* unique name for each pool specifically required by zsmalloc */
634 	snprintf(name, 38, "zswap%x", atomic_inc_return(&zswap_pools_count));
635 
636 	pool->zpool = zpool_create_pool(type, name, gfp, &zswap_zpool_ops);
637 	if (!pool->zpool) {
638 		pr_err("%s zpool not available\n", type);
639 		goto error;
640 	}
641 	pr_debug("using %s zpool\n", zpool_get_type(pool->zpool));
642 
643 	strscpy(pool->tfm_name, compressor, sizeof(pool->tfm_name));
644 
645 	pool->acomp_ctx = alloc_percpu(*pool->acomp_ctx);
646 	if (!pool->acomp_ctx) {
647 		pr_err("percpu alloc failed\n");
648 		goto error;
649 	}
650 
651 	ret = cpuhp_state_add_instance(CPUHP_MM_ZSWP_POOL_PREPARE,
652 				       &pool->node);
653 	if (ret)
654 		goto error;
655 	pr_debug("using %s compressor\n", pool->tfm_name);
656 
657 	/* being the current pool takes 1 ref; this func expects the
658 	 * caller to always add the new pool as the current pool
659 	 */
660 	kref_init(&pool->kref);
661 	INIT_LIST_HEAD(&pool->list);
662 	INIT_WORK(&pool->shrink_work, shrink_worker);
663 
664 	zswap_pool_debug("created", pool);
665 
666 	return pool;
667 
668 error:
669 	if (pool->acomp_ctx)
670 		free_percpu(pool->acomp_ctx);
671 	if (pool->zpool)
672 		zpool_destroy_pool(pool->zpool);
673 	kfree(pool);
674 	return NULL;
675 }
676 
677 static struct zswap_pool *__zswap_pool_create_fallback(void)
678 {
679 	bool has_comp, has_zpool;
680 
681 	has_comp = crypto_has_acomp(zswap_compressor, 0, 0);
682 	if (!has_comp && strcmp(zswap_compressor,
683 				CONFIG_ZSWAP_COMPRESSOR_DEFAULT)) {
684 		pr_err("compressor %s not available, using default %s\n",
685 		       zswap_compressor, CONFIG_ZSWAP_COMPRESSOR_DEFAULT);
686 		param_free_charp(&zswap_compressor);
687 		zswap_compressor = CONFIG_ZSWAP_COMPRESSOR_DEFAULT;
688 		has_comp = crypto_has_acomp(zswap_compressor, 0, 0);
689 	}
690 	if (!has_comp) {
691 		pr_err("default compressor %s not available\n",
692 		       zswap_compressor);
693 		param_free_charp(&zswap_compressor);
694 		zswap_compressor = ZSWAP_PARAM_UNSET;
695 	}
696 
697 	has_zpool = zpool_has_pool(zswap_zpool_type);
698 	if (!has_zpool && strcmp(zswap_zpool_type,
699 				 CONFIG_ZSWAP_ZPOOL_DEFAULT)) {
700 		pr_err("zpool %s not available, using default %s\n",
701 		       zswap_zpool_type, CONFIG_ZSWAP_ZPOOL_DEFAULT);
702 		param_free_charp(&zswap_zpool_type);
703 		zswap_zpool_type = CONFIG_ZSWAP_ZPOOL_DEFAULT;
704 		has_zpool = zpool_has_pool(zswap_zpool_type);
705 	}
706 	if (!has_zpool) {
707 		pr_err("default zpool %s not available\n",
708 		       zswap_zpool_type);
709 		param_free_charp(&zswap_zpool_type);
710 		zswap_zpool_type = ZSWAP_PARAM_UNSET;
711 	}
712 
713 	if (!has_comp || !has_zpool)
714 		return NULL;
715 
716 	return zswap_pool_create(zswap_zpool_type, zswap_compressor);
717 }
718 
719 static void zswap_pool_destroy(struct zswap_pool *pool)
720 {
721 	zswap_pool_debug("destroying", pool);
722 
723 	cpuhp_state_remove_instance(CPUHP_MM_ZSWP_POOL_PREPARE, &pool->node);
724 	free_percpu(pool->acomp_ctx);
725 	zpool_destroy_pool(pool->zpool);
726 	kfree(pool);
727 }
728 
729 static int __must_check zswap_pool_get(struct zswap_pool *pool)
730 {
731 	if (!pool)
732 		return 0;
733 
734 	return kref_get_unless_zero(&pool->kref);
735 }
736 
737 static void __zswap_pool_release(struct work_struct *work)
738 {
739 	struct zswap_pool *pool = container_of(work, typeof(*pool),
740 						release_work);
741 
742 	synchronize_rcu();
743 
744 	/* nobody should have been able to get a kref... */
745 	WARN_ON(kref_get_unless_zero(&pool->kref));
746 
747 	/* pool is now off zswap_pools list and has no references. */
748 	zswap_pool_destroy(pool);
749 }
750 
751 static void __zswap_pool_empty(struct kref *kref)
752 {
753 	struct zswap_pool *pool;
754 
755 	pool = container_of(kref, typeof(*pool), kref);
756 
757 	spin_lock(&zswap_pools_lock);
758 
759 	WARN_ON(pool == zswap_pool_current());
760 
761 	list_del_rcu(&pool->list);
762 
763 	INIT_WORK(&pool->release_work, __zswap_pool_release);
764 	schedule_work(&pool->release_work);
765 
766 	spin_unlock(&zswap_pools_lock);
767 }
768 
769 static void zswap_pool_put(struct zswap_pool *pool)
770 {
771 	kref_put(&pool->kref, __zswap_pool_empty);
772 }
773 
774 /*********************************
775 * param callbacks
776 **********************************/
777 
778 static bool zswap_pool_changed(const char *s, const struct kernel_param *kp)
779 {
780 	/* no change required */
781 	if (!strcmp(s, *(char **)kp->arg) && zswap_has_pool)
782 		return false;
783 	return true;
784 }
785 
786 /* val must be a null-terminated string */
787 static int __zswap_param_set(const char *val, const struct kernel_param *kp,
788 			     char *type, char *compressor)
789 {
790 	struct zswap_pool *pool, *put_pool = NULL;
791 	char *s = strstrip((char *)val);
792 	int ret = 0;
793 	bool new_pool = false;
794 
795 	mutex_lock(&zswap_init_lock);
796 	switch (zswap_init_state) {
797 	case ZSWAP_UNINIT:
798 		/* if this is load-time (pre-init) param setting,
799 		 * don't create a pool; that's done during init.
800 		 */
801 		ret = param_set_charp(s, kp);
802 		break;
803 	case ZSWAP_INIT_SUCCEED:
804 		new_pool = zswap_pool_changed(s, kp);
805 		break;
806 	case ZSWAP_INIT_FAILED:
807 		pr_err("can't set param, initialization failed\n");
808 		ret = -ENODEV;
809 	}
810 	mutex_unlock(&zswap_init_lock);
811 
812 	/* no need to create a new pool, return directly */
813 	if (!new_pool)
814 		return ret;
815 
816 	if (!type) {
817 		if (!zpool_has_pool(s)) {
818 			pr_err("zpool %s not available\n", s);
819 			return -ENOENT;
820 		}
821 		type = s;
822 	} else if (!compressor) {
823 		if (!crypto_has_acomp(s, 0, 0)) {
824 			pr_err("compressor %s not available\n", s);
825 			return -ENOENT;
826 		}
827 		compressor = s;
828 	} else {
829 		WARN_ON(1);
830 		return -EINVAL;
831 	}
832 
833 	spin_lock(&zswap_pools_lock);
834 
835 	pool = zswap_pool_find_get(type, compressor);
836 	if (pool) {
837 		zswap_pool_debug("using existing", pool);
838 		WARN_ON(pool == zswap_pool_current());
839 		list_del_rcu(&pool->list);
840 	}
841 
842 	spin_unlock(&zswap_pools_lock);
843 
844 	if (!pool)
845 		pool = zswap_pool_create(type, compressor);
846 
847 	if (pool)
848 		ret = param_set_charp(s, kp);
849 	else
850 		ret = -EINVAL;
851 
852 	spin_lock(&zswap_pools_lock);
853 
854 	if (!ret) {
855 		put_pool = zswap_pool_current();
856 		list_add_rcu(&pool->list, &zswap_pools);
857 		zswap_has_pool = true;
858 	} else if (pool) {
859 		/* add the possibly pre-existing pool to the end of the pools
860 		 * list; if it's new (and empty) then it'll be removed and
861 		 * destroyed by the put after we drop the lock
862 		 */
863 		list_add_tail_rcu(&pool->list, &zswap_pools);
864 		put_pool = pool;
865 	}
866 
867 	spin_unlock(&zswap_pools_lock);
868 
869 	if (!zswap_has_pool && !pool) {
870 		/* if initial pool creation failed, and this pool creation also
871 		 * failed, maybe both compressor and zpool params were bad.
872 		 * Allow changing this param, so pool creation will succeed
873 		 * when the other param is changed. We already verified this
874 		 * param is ok in the zpool_has_pool() or crypto_has_acomp()
875 		 * checks above.
876 		 */
877 		ret = param_set_charp(s, kp);
878 	}
879 
880 	/* drop the ref from either the old current pool,
881 	 * or the new pool we failed to add
882 	 */
883 	if (put_pool)
884 		zswap_pool_put(put_pool);
885 
886 	return ret;
887 }
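
/*
 * Example of a runtime switch (the compressor name is illustrative): writing
 * "zstd" to /sys/module/zswap/parameters/compressor goes through
 * zswap_compressor_param_set() below into __zswap_param_set() above; a pool
 * using the current zpool type and the new compressor becomes the current
 * pool, while the old one stays on zswap_pools until the entries that still
 * reference it are gone, and is then destroyed.
 */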
888 
889 static int zswap_compressor_param_set(const char *val,
890 				      const struct kernel_param *kp)
891 {
892 	return __zswap_param_set(val, kp, zswap_zpool_type, NULL);
893 }
894 
895 static int zswap_zpool_param_set(const char *val,
896 				 const struct kernel_param *kp)
897 {
898 	return __zswap_param_set(val, kp, NULL, zswap_compressor);
899 }
900 
901 static int zswap_enabled_param_set(const char *val,
902 				   const struct kernel_param *kp)
903 {
904 	int ret = -ENODEV;
905 
906 	/* if this is load-time (pre-init) param setting, only set param. */
907 	if (system_state != SYSTEM_RUNNING)
908 		return param_set_bool(val, kp);
909 
910 	mutex_lock(&zswap_init_lock);
911 	switch (zswap_init_state) {
912 	case ZSWAP_UNINIT:
913 		if (zswap_setup())
914 			break;
915 		fallthrough;
916 	case ZSWAP_INIT_SUCCEED:
917 		if (!zswap_has_pool)
918 			pr_err("can't enable, no pool configured\n");
919 		else
920 			ret = param_set_bool(val, kp);
921 		break;
922 	case ZSWAP_INIT_FAILED:
923 		pr_err("can't enable, initialization failed\n");
924 	}
925 	mutex_unlock(&zswap_init_lock);
926 
927 	return ret;
928 }
929 
930 /*********************************
931 * writeback code
932 **********************************/
933 /* return enum for zswap_get_swap_cache_page */
934 enum zswap_get_swap_ret {
935 	ZSWAP_SWAPCACHE_NEW,
936 	ZSWAP_SWAPCACHE_EXIST,
937 	ZSWAP_SWAPCACHE_FAIL,
938 };
939 
940 /*
941  * zswap_get_swap_cache_page
942  *
943  * This is an adaptation of read_swap_cache_async()
944  *
945  * This function tries to find a page with the given swap entry
946  * in the swapper_space address space (the swap cache).  If the page
947  * is found, it is returned in retpage.  Otherwise, a page is allocated,
948  * added to the swap cache, and returned in retpage.
949  *
950  * On success, the swap cache page is returned in retpage.
951  * Returns ZSWAP_SWAPCACHE_EXIST if the page was already in the swap cache.
952  * Returns ZSWAP_SWAPCACHE_NEW if the new page needs to be populated;
953  *     the new page is added to the swap cache and locked.
954  * Returns ZSWAP_SWAPCACHE_FAIL on error.
955  */
956 static int zswap_get_swap_cache_page(swp_entry_t entry,
957 				struct page **retpage)
958 {
959 	bool page_was_allocated;
960 
961 	*retpage = __read_swap_cache_async(entry, GFP_KERNEL,
962 			NULL, 0, &page_was_allocated);
963 	if (page_was_allocated)
964 		return ZSWAP_SWAPCACHE_NEW;
965 	if (!*retpage)
966 		return ZSWAP_SWAPCACHE_FAIL;
967 	return ZSWAP_SWAPCACHE_EXIST;
968 }
969 
970 /*
971  * Attempts to free an entry by adding a page to the swap cache,
972  * decompressing the entry data into the page, and issuing a
973  * bio write to write the page back to the swap device.
974  *
975  * This can be thought of as a "resumed writeback" of the page
976  * to the swap device.  We are basically resuming the same swap
977  * writeback path that was intercepted with the frontswap_store()
978  * in the first place.  After the page has been decompressed into
979  * the swap cache, the compressed version stored by zswap can be
980  * freed.
981  */
982 static int zswap_writeback_entry(struct zpool *pool, unsigned long handle)
983 {
984 	struct zswap_header *zhdr;
985 	swp_entry_t swpentry;
986 	struct zswap_tree *tree;
987 	pgoff_t offset;
988 	struct zswap_entry *entry;
989 	struct page *page;
990 	struct scatterlist input, output;
991 	struct crypto_acomp_ctx *acomp_ctx;
992 
993 	u8 *src, *tmp = NULL;
994 	unsigned int dlen;
995 	int ret;
996 	struct writeback_control wbc = {
997 		.sync_mode = WB_SYNC_NONE,
998 	};
999 
1000 	if (!zpool_can_sleep_mapped(pool)) {
1001 		tmp = kmalloc(PAGE_SIZE, GFP_KERNEL);
1002 		if (!tmp)
1003 			return -ENOMEM;
1004 	}
1005 
1006 	/* extract swpentry from data */
1007 	zhdr = zpool_map_handle(pool, handle, ZPOOL_MM_RO);
1008 	swpentry = zhdr->swpentry; /* here */
1009 	tree = zswap_trees[swp_type(swpentry)];
1010 	offset = swp_offset(swpentry);
1011 	zpool_unmap_handle(pool, handle);
1012 
1013 	/* find and ref zswap entry */
1014 	spin_lock(&tree->lock);
1015 	entry = zswap_entry_find_get(&tree->rbroot, offset);
1016 	if (!entry) {
1017 		/* entry was invalidated */
1018 		spin_unlock(&tree->lock);
1019 		kfree(tmp);
1020 		return 0;
1021 	}
1022 	spin_unlock(&tree->lock);
1023 	BUG_ON(offset != entry->offset);
1024 
1025 	/* try to allocate swap cache page */
1026 	switch (zswap_get_swap_cache_page(swpentry, &page)) {
1027 	case ZSWAP_SWAPCACHE_FAIL: /* no memory or invalidate happened */
1028 		ret = -ENOMEM;
1029 		goto fail;
1030 
1031 	case ZSWAP_SWAPCACHE_EXIST:
1032 		/* page is already in the swap cache, ignore for now */
1033 		put_page(page);
1034 		ret = -EEXIST;
1035 		goto fail;
1036 
1037 	case ZSWAP_SWAPCACHE_NEW: /* page is locked */
1038 		/*
1039 		 * Having a local reference to the zswap entry doesn't exclude
1040 		 * swapping from invalidating and recycling the swap slot. Once
1041 		 * the swapcache is secured against concurrent swapping to and
1042 		 * from the slot, recheck that the entry is still current before
1043 		 * writing.
1044 		 */
1045 		spin_lock(&tree->lock);
1046 		if (zswap_rb_search(&tree->rbroot, entry->offset) != entry) {
1047 			spin_unlock(&tree->lock);
1048 			delete_from_swap_cache(page_folio(page));
1049 			ret = -ENOMEM;
1050 			goto fail;
1051 		}
1052 		spin_unlock(&tree->lock);
1053 
1054 		/* decompress */
1055 		acomp_ctx = raw_cpu_ptr(entry->pool->acomp_ctx);
1056 		dlen = PAGE_SIZE;
1057 
1058 		zhdr = zpool_map_handle(pool, handle, ZPOOL_MM_RO);
1059 		src = (u8 *)zhdr + sizeof(struct zswap_header);
1060 		if (!zpool_can_sleep_mapped(pool)) {
1061 			memcpy(tmp, src, entry->length);
1062 			src = tmp;
1063 			zpool_unmap_handle(pool, handle);
1064 		}
1065 
1066 		mutex_lock(acomp_ctx->mutex);
1067 		sg_init_one(&input, src, entry->length);
1068 		sg_init_table(&output, 1);
1069 		sg_set_page(&output, page, PAGE_SIZE, 0);
1070 		acomp_request_set_params(acomp_ctx->req, &input, &output, entry->length, dlen);
1071 		ret = crypto_wait_req(crypto_acomp_decompress(acomp_ctx->req), &acomp_ctx->wait);
1072 		dlen = acomp_ctx->req->dlen;
1073 		mutex_unlock(acomp_ctx->mutex);
1074 
1075 		if (!zpool_can_sleep_mapped(pool))
1076 			kfree(tmp);
1077 		else
1078 			zpool_unmap_handle(pool, handle);
1079 
1080 		BUG_ON(ret);
1081 		BUG_ON(dlen != PAGE_SIZE);
1082 
1083 		/* page is up to date */
1084 		SetPageUptodate(page);
1085 	}
1086 
1087 	/* move it to the tail of the inactive list after end_writeback */
1088 	SetPageReclaim(page);
1089 
1090 	/* start writeback */
1091 	__swap_writepage(page, &wbc);
1092 	put_page(page);
1093 	zswap_written_back_pages++;
1094 
1095 	spin_lock(&tree->lock);
1096 	/* drop local reference */
1097 	zswap_entry_put(tree, entry);
1098 
1099 	/*
1100 	 * There are two possible situations for the entry here:
1101 	 * (1) refcount is 1 (normal case): the entry is valid and on the tree
1102 	 * (2) refcount is 0: the entry was freed and removed from the tree
1103 	 *     because an invalidate happened during writeback;
1104 	 * search the tree and, if the entry is still there, free it.
1105 	 */
1106 	if (entry == zswap_rb_search(&tree->rbroot, offset))
1107 		zswap_entry_put(tree, entry);
1108 	spin_unlock(&tree->lock);
1109 
1110 	return ret;
1111 
1112 fail:
1113 	if (!zpool_can_sleep_mapped(pool))
1114 		kfree(tmp);
1115 
1116 	/*
1117 	 * If we get here due to ZSWAP_SWAPCACHE_EXIST,
1118 	 * a load may be happening concurrently,
1119 	 * so it is safe and okay to not free the entry.
1120 	 * If we do free the entry in the following put,
1121 	 * it is also okay to return !0.
1122 	 */
1123 	spin_lock(&tree->lock);
1124 	zswap_entry_put(tree, entry);
1125 	spin_unlock(&tree->lock);
1126 
1127 	return ret;
1128 }
1129 
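/*
 * Same-value detection: a page whose every machine word equals the first one
 * (e.g. a page of all zeroes) is not compressed at all; only that single
 * unsigned long is recorded in entry->value and expanded again by
 * zswap_fill_page() on load.
 */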
1130 static int zswap_is_page_same_filled(void *ptr, unsigned long *value)
1131 {
1132 	unsigned long *page;
1133 	unsigned long val;
1134 	unsigned int pos, last_pos = PAGE_SIZE / sizeof(*page) - 1;
1135 
1136 	page = (unsigned long *)ptr;
1137 	val = page[0];
1138 
1139 	if (val != page[last_pos])
1140 		return 0;
1141 
1142 	for (pos = 1; pos < last_pos; pos++) {
1143 		if (val != page[pos])
1144 			return 0;
1145 	}
1146 
1147 	*value = val;
1148 
1149 	return 1;
1150 }
1151 
1152 static void zswap_fill_page(void *ptr, unsigned long value)
1153 {
1154 	unsigned long *page;
1155 
1156 	page = (unsigned long *)ptr;
1157 	memset_l(page, value, PAGE_SIZE / sizeof(unsigned long));
1158 }
1159 
1160 /*********************************
1161 * frontswap hooks
1162 **********************************/
1163 /* attempts to compress and store a single page */
1164 static int zswap_frontswap_store(unsigned type, pgoff_t offset,
1165 				struct page *page)
1166 {
1167 	struct zswap_tree *tree = zswap_trees[type];
1168 	struct zswap_entry *entry, *dupentry;
1169 	struct scatterlist input, output;
1170 	struct crypto_acomp_ctx *acomp_ctx;
1171 	struct obj_cgroup *objcg = NULL;
1172 	struct zswap_pool *pool;
1173 	int ret;
1174 	unsigned int hlen, dlen = PAGE_SIZE;
1175 	unsigned long handle, value;
1176 	char *buf;
1177 	u8 *src, *dst;
1178 	struct zswap_header zhdr = { .swpentry = swp_entry(type, offset) };
1179 	gfp_t gfp;
1180 
1181 	/* THP isn't supported */
1182 	if (PageTransHuge(page)) {
1183 		ret = -EINVAL;
1184 		goto reject;
1185 	}
1186 
1187 	if (!zswap_enabled || !tree) {
1188 		ret = -ENODEV;
1189 		goto reject;
1190 	}
1191 
1192 	objcg = get_obj_cgroup_from_page(page);
1193 	if (objcg && !obj_cgroup_may_zswap(objcg))
1194 		goto shrink;
1195 
1196 	/* reclaim space if needed */
1197 	if (zswap_is_full()) {
1198 		zswap_pool_limit_hit++;
1199 		zswap_pool_reached_full = true;
1200 		goto shrink;
1201 	}
1202 
1203 	if (zswap_pool_reached_full) {
1204 		if (!zswap_can_accept()) {
1205 			ret = -ENOMEM;
1206 			goto shrink;
1207 		} else
1208 			zswap_pool_reached_full = false;
1209 	}
1210 
1211 	/* allocate entry */
1212 	entry = zswap_entry_cache_alloc(GFP_KERNEL);
1213 	if (!entry) {
1214 		zswap_reject_kmemcache_fail++;
1215 		ret = -ENOMEM;
1216 		goto reject;
1217 	}
1218 
1219 	if (zswap_same_filled_pages_enabled) {
1220 		src = kmap_atomic(page);
1221 		if (zswap_is_page_same_filled(src, &value)) {
1222 			kunmap_atomic(src);
1223 			entry->offset = offset;
1224 			entry->length = 0;
1225 			entry->value = value;
1226 			atomic_inc(&zswap_same_filled_pages);
1227 			goto insert_entry;
1228 		}
1229 		kunmap_atomic(src);
1230 	}
1231 
1232 	if (!zswap_non_same_filled_pages_enabled) {
1233 		ret = -EINVAL;
1234 		goto freepage;
1235 	}
1236 
1237 	/* if entry is successfully added, it keeps the reference */
1238 	entry->pool = zswap_pool_current_get();
1239 	if (!entry->pool) {
1240 		ret = -EINVAL;
1241 		goto freepage;
1242 	}
1243 
1244 	/* compress */
1245 	acomp_ctx = raw_cpu_ptr(entry->pool->acomp_ctx);
1246 
1247 	mutex_lock(acomp_ctx->mutex);
1248 
1249 	dst = acomp_ctx->dstmem;
1250 	sg_init_table(&input, 1);
1251 	sg_set_page(&input, page, PAGE_SIZE, 0);
1252 
1253 	/* zswap_dstmem is of size (PAGE_SIZE * 2). Reflect the same in the sg_list */
1254 	sg_init_one(&output, dst, PAGE_SIZE * 2);
1255 	acomp_request_set_params(acomp_ctx->req, &input, &output, PAGE_SIZE, dlen);
1256 	/*
1257 	 * It may look a little silly that we submit an asynchronous request and
1258 	 * then wait for its completion synchronously; in effect this makes the
1259 	 * operation synchronous.
1260 	 * Theoretically, acomp supports submitting multiple requests on one
1261 	 * acomp instance and having them completed concurrently.  But in this
1262 	 * case frontswap stores and loads pages one at a time, so a thread
1263 	 * doing frontswap has no way to submit a second page before the first
1264 	 * one is done.
1265 	 * Threads running on different cpus use different acomp instances,
1266 	 * however, so multiple threads can do (de)compression in parallel.
1267 	 */
1268 	ret = crypto_wait_req(crypto_acomp_compress(acomp_ctx->req), &acomp_ctx->wait);
1269 	dlen = acomp_ctx->req->dlen;
1270 
1271 	if (ret) {
1272 		ret = -EINVAL;
1273 		goto put_dstmem;
1274 	}
1275 
1276 	/* store */
1277 	hlen = zpool_evictable(entry->pool->zpool) ? sizeof(zhdr) : 0;
1278 	gfp = __GFP_NORETRY | __GFP_NOWARN | __GFP_KSWAPD_RECLAIM;
1279 	if (zpool_malloc_support_movable(entry->pool->zpool))
1280 		gfp |= __GFP_HIGHMEM | __GFP_MOVABLE;
1281 	ret = zpool_malloc(entry->pool->zpool, hlen + dlen, gfp, &handle);
1282 	if (ret == -ENOSPC) {
1283 		zswap_reject_compress_poor++;
1284 		goto put_dstmem;
1285 	}
1286 	if (ret) {
1287 		zswap_reject_alloc_fail++;
1288 		goto put_dstmem;
1289 	}
1290 	buf = zpool_map_handle(entry->pool->zpool, handle, ZPOOL_MM_WO);
1291 	memcpy(buf, &zhdr, hlen);
1292 	memcpy(buf + hlen, dst, dlen);
1293 	zpool_unmap_handle(entry->pool->zpool, handle);
1294 	mutex_unlock(acomp_ctx->mutex);
1295 
1296 	/* populate entry */
1297 	entry->offset = offset;
1298 	entry->handle = handle;
1299 	entry->length = dlen;
1300 
1301 insert_entry:
1302 	entry->objcg = objcg;
1303 	if (objcg) {
1304 		obj_cgroup_charge_zswap(objcg, entry->length);
1305 		/* Account before objcg ref is moved to tree */
1306 		count_objcg_event(objcg, ZSWPOUT);
1307 	}
1308 
1309 	/* map */
1310 	spin_lock(&tree->lock);
1311 	do {
1312 		ret = zswap_rb_insert(&tree->rbroot, entry, &dupentry);
1313 		if (ret == -EEXIST) {
1314 			zswap_duplicate_entry++;
1315 			/* remove from rbtree */
1316 			zswap_rb_erase(&tree->rbroot, dupentry);
1317 			zswap_entry_put(tree, dupentry);
1318 		}
1319 	} while (ret == -EEXIST);
1320 	spin_unlock(&tree->lock);
1321 
1322 	/* update stats */
1323 	atomic_inc(&zswap_stored_pages);
1324 	zswap_update_total_size();
1325 	count_vm_event(ZSWPOUT);
1326 
1327 	return 0;
1328 
1329 put_dstmem:
1330 	mutex_unlock(acomp_ctx->mutex);
1331 	zswap_pool_put(entry->pool);
1332 freepage:
1333 	zswap_entry_cache_free(entry);
1334 reject:
1335 	if (objcg)
1336 		obj_cgroup_put(objcg);
1337 	return ret;
1338 
1339 shrink:
1340 	pool = zswap_pool_last_get();
1341 	if (pool)
1342 		queue_work(shrink_wq, &pool->shrink_work);
1343 	ret = -ENOMEM;
1344 	goto reject;
1345 }
1346 
1347 static void zswap_invalidate_entry(struct zswap_tree *tree,
1348 				   struct zswap_entry *entry)
1349 {
1350 	/* remove from rbtree */
1351 	zswap_rb_erase(&tree->rbroot, entry);
1352 
1353 	/* drop the initial reference from entry creation */
1354 	zswap_entry_put(tree, entry);
1355 }
1356 
1357 /*
1358  * Returns 0 if the page was successfully decompressed.
1359  * Returns -1 if the entry was not found or on error.
1360  */
1361 static int zswap_frontswap_load(unsigned type, pgoff_t offset,
1362 				struct page *page, bool *exclusive)
1363 {
1364 	struct zswap_tree *tree = zswap_trees[type];
1365 	struct zswap_entry *entry;
1366 	struct scatterlist input, output;
1367 	struct crypto_acomp_ctx *acomp_ctx;
1368 	u8 *src, *dst, *tmp;
1369 	unsigned int dlen;
1370 	int ret;
1371 
1372 	/* find */
1373 	spin_lock(&tree->lock);
1374 	entry = zswap_entry_find_get(&tree->rbroot, offset);
1375 	if (!entry) {
1376 		/* entry was written back */
1377 		spin_unlock(&tree->lock);
1378 		return -1;
1379 	}
1380 	spin_unlock(&tree->lock);
1381 
1382 	if (!entry->length) {
1383 		dst = kmap_atomic(page);
1384 		zswap_fill_page(dst, entry->value);
1385 		kunmap_atomic(dst);
1386 		ret = 0;
1387 		goto stats;
1388 	}
1389 
1390 	if (!zpool_can_sleep_mapped(entry->pool->zpool)) {
1391 		tmp = kmalloc(entry->length, GFP_KERNEL);
1392 		if (!tmp) {
1393 			ret = -ENOMEM;
1394 			goto freeentry;
1395 		}
1396 	}
1397 
1398 	/* decompress */
1399 	dlen = PAGE_SIZE;
1400 	src = zpool_map_handle(entry->pool->zpool, entry->handle, ZPOOL_MM_RO);
1401 	if (zpool_evictable(entry->pool->zpool))
1402 		src += sizeof(struct zswap_header);
1403 
1404 	if (!zpool_can_sleep_mapped(entry->pool->zpool)) {
1405 		memcpy(tmp, src, entry->length);
1406 		src = tmp;
1407 		zpool_unmap_handle(entry->pool->zpool, entry->handle);
1408 	}
1409 
1410 	acomp_ctx = raw_cpu_ptr(entry->pool->acomp_ctx);
1411 	mutex_lock(acomp_ctx->mutex);
1412 	sg_init_one(&input, src, entry->length);
1413 	sg_init_table(&output, 1);
1414 	sg_set_page(&output, page, PAGE_SIZE, 0);
1415 	acomp_request_set_params(acomp_ctx->req, &input, &output, entry->length, dlen);
1416 	ret = crypto_wait_req(crypto_acomp_decompress(acomp_ctx->req), &acomp_ctx->wait);
1417 	mutex_unlock(acomp_ctx->mutex);
1418 
1419 	if (zpool_can_sleep_mapped(entry->pool->zpool))
1420 		zpool_unmap_handle(entry->pool->zpool, entry->handle);
1421 	else
1422 		kfree(tmp);
1423 
1424 	BUG_ON(ret);
1425 stats:
1426 	count_vm_event(ZSWPIN);
1427 	if (entry->objcg)
1428 		count_objcg_event(entry->objcg, ZSWPIN);
1429 freeentry:
1430 	spin_lock(&tree->lock);
1431 	zswap_entry_put(tree, entry);
1432 	if (!ret && zswap_exclusive_loads_enabled) {
1433 		zswap_invalidate_entry(tree, entry);
1434 		*exclusive = true;
1435 	}
1436 	spin_unlock(&tree->lock);
1437 
1438 	return ret;
1439 }
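
/*
 * Note on exclusive loads: when zswap_exclusive_loads_enabled, a successful
 * load also invalidates the zswap entry and sets *exclusive, so the data is
 * kept only in the swap cache from then on rather than in both places.
 */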
1440 
1441 /* frees an entry in zswap */
1442 static void zswap_frontswap_invalidate_page(unsigned type, pgoff_t offset)
1443 {
1444 	struct zswap_tree *tree = zswap_trees[type];
1445 	struct zswap_entry *entry;
1446 
1447 	/* find */
1448 	spin_lock(&tree->lock);
1449 	entry = zswap_rb_search(&tree->rbroot, offset);
1450 	if (!entry) {
1451 		/* entry was written back */
1452 		spin_unlock(&tree->lock);
1453 		return;
1454 	}
1455 	zswap_invalidate_entry(tree, entry);
1456 	spin_unlock(&tree->lock);
1457 }
1458 
1459 /* frees all zswap entries for the given swap type */
1460 static void zswap_frontswap_invalidate_area(unsigned type)
1461 {
1462 	struct zswap_tree *tree = zswap_trees[type];
1463 	struct zswap_entry *entry, *n;
1464 
1465 	if (!tree)
1466 		return;
1467 
1468 	/* walk the tree and free everything */
1469 	spin_lock(&tree->lock);
1470 	rbtree_postorder_for_each_entry_safe(entry, n, &tree->rbroot, rbnode)
1471 		zswap_free_entry(entry);
1472 	tree->rbroot = RB_ROOT;
1473 	spin_unlock(&tree->lock);
1474 	kfree(tree);
1475 	zswap_trees[type] = NULL;
1476 }
1477 
1478 static void zswap_frontswap_init(unsigned type)
1479 {
1480 	struct zswap_tree *tree;
1481 
1482 	tree = kzalloc(sizeof(*tree), GFP_KERNEL);
1483 	if (!tree) {
1484 		pr_err("alloc failed, zswap disabled for swap type %d\n", type);
1485 		return;
1486 	}
1487 
1488 	tree->rbroot = RB_ROOT;
1489 	spin_lock_init(&tree->lock);
1490 	zswap_trees[type] = tree;
1491 }
1492 
1493 static const struct frontswap_ops zswap_frontswap_ops = {
1494 	.store = zswap_frontswap_store,
1495 	.load = zswap_frontswap_load,
1496 	.invalidate_page = zswap_frontswap_invalidate_page,
1497 	.invalidate_area = zswap_frontswap_invalidate_area,
1498 	.init = zswap_frontswap_init
1499 };
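
/*
 * These hooks are driven by the frontswap layer: roughly, .store from the
 * swap-out path, .load from swap-in, .invalidate_page when a swap slot is
 * freed, and .invalidate_area/.init at swapoff/swapon.  See mm/frontswap.c
 * for the exact call sites.
 */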
1500 
1501 /*********************************
1502 * debugfs functions
1503 **********************************/
1504 #ifdef CONFIG_DEBUG_FS
1505 #include <linux/debugfs.h>
1506 
1507 static struct dentry *zswap_debugfs_root;
1508 
1509 static int zswap_debugfs_init(void)
1510 {
1511 	if (!debugfs_initialized())
1512 		return -ENODEV;
1513 
1514 	zswap_debugfs_root = debugfs_create_dir("zswap", NULL);
1515 
1516 	debugfs_create_u64("pool_limit_hit", 0444,
1517 			   zswap_debugfs_root, &zswap_pool_limit_hit);
1518 	debugfs_create_u64("reject_reclaim_fail", 0444,
1519 			   zswap_debugfs_root, &zswap_reject_reclaim_fail);
1520 	debugfs_create_u64("reject_alloc_fail", 0444,
1521 			   zswap_debugfs_root, &zswap_reject_alloc_fail);
1522 	debugfs_create_u64("reject_kmemcache_fail", 0444,
1523 			   zswap_debugfs_root, &zswap_reject_kmemcache_fail);
1524 	debugfs_create_u64("reject_compress_poor", 0444,
1525 			   zswap_debugfs_root, &zswap_reject_compress_poor);
1526 	debugfs_create_u64("written_back_pages", 0444,
1527 			   zswap_debugfs_root, &zswap_written_back_pages);
1528 	debugfs_create_u64("duplicate_entry", 0444,
1529 			   zswap_debugfs_root, &zswap_duplicate_entry);
1530 	debugfs_create_u64("pool_total_size", 0444,
1531 			   zswap_debugfs_root, &zswap_pool_total_size);
1532 	debugfs_create_atomic_t("stored_pages", 0444,
1533 				zswap_debugfs_root, &zswap_stored_pages);
1534 	debugfs_create_atomic_t("same_filled_pages", 0444,
1535 				zswap_debugfs_root, &zswap_same_filled_pages);
1536 
1537 	return 0;
1538 }
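
/*
 * Example (assuming debugfs is mounted at /sys/kernel/debug):
 *   grep . /sys/kernel/debug/zswap/*
 * dumps the counters registered above; stored_pages * PAGE_SIZE divided by
 * pool_total_size gives a rough compression ratio for the pool.
 */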
1539 #else
1540 static int zswap_debugfs_init(void)
1541 {
1542 	return 0;
1543 }
1544 #endif
1545 
1546 /*********************************
1547 * module init and exit
1548 **********************************/
1549 static int zswap_setup(void)
1550 {
1551 	struct zswap_pool *pool;
1552 	int ret;
1553 
1554 	zswap_entry_cache = KMEM_CACHE(zswap_entry, 0);
1555 	if (!zswap_entry_cache) {
1556 		pr_err("entry cache creation failed\n");
1557 		goto cache_fail;
1558 	}
1559 
1560 	ret = cpuhp_setup_state(CPUHP_MM_ZSWP_MEM_PREPARE, "mm/zswap:prepare",
1561 				zswap_dstmem_prepare, zswap_dstmem_dead);
1562 	if (ret) {
1563 		pr_err("dstmem alloc failed\n");
1564 		goto dstmem_fail;
1565 	}
1566 
1567 	ret = cpuhp_setup_state_multi(CPUHP_MM_ZSWP_POOL_PREPARE,
1568 				      "mm/zswap_pool:prepare",
1569 				      zswap_cpu_comp_prepare,
1570 				      zswap_cpu_comp_dead);
1571 	if (ret)
1572 		goto hp_fail;
1573 
1574 	pool = __zswap_pool_create_fallback();
1575 	if (pool) {
1576 		pr_info("loaded using pool %s/%s\n", pool->tfm_name,
1577 			zpool_get_type(pool->zpool));
1578 		list_add(&pool->list, &zswap_pools);
1579 		zswap_has_pool = true;
1580 	} else {
1581 		pr_err("pool creation failed\n");
1582 		zswap_enabled = false;
1583 	}
1584 
1585 	shrink_wq = create_workqueue("zswap-shrink");
1586 	if (!shrink_wq)
1587 		goto fallback_fail;
1588 
1589 	ret = frontswap_register_ops(&zswap_frontswap_ops);
1590 	if (ret)
1591 		goto destroy_wq;
1592 	if (zswap_debugfs_init())
1593 		pr_warn("debugfs initialization failed\n");
1594 	zswap_init_state = ZSWAP_INIT_SUCCEED;
1595 	return 0;
1596 
1597 destroy_wq:
1598 	destroy_workqueue(shrink_wq);
1599 fallback_fail:
1600 	if (pool)
1601 		zswap_pool_destroy(pool);
1602 hp_fail:
1603 	cpuhp_remove_state(CPUHP_MM_ZSWP_MEM_PREPARE);
1604 dstmem_fail:
1605 	kmem_cache_destroy(zswap_entry_cache);
1606 cache_fail:
1607 	/* if built-in, we aren't unloaded on failure; don't allow use */
1608 	zswap_init_state = ZSWAP_INIT_FAILED;
1609 	zswap_enabled = false;
1610 	return -ENOMEM;
1611 }
1612 
1613 static int __init zswap_init(void)
1614 {
1615 	if (!zswap_enabled)
1616 		return 0;
1617 	return zswap_setup();
1618 }
1619 /* must be late so crypto has time to come up */
1620 late_initcall(zswap_init);
1621 
1622 MODULE_AUTHOR("Seth Jennings <sjennings@variantweb.net>");
1623 MODULE_DESCRIPTION("Compressed cache for swap pages");
1624