// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * zswap.c - zswap driver file
 *
 * zswap is a cache that takes pages that are in the process
 * of being swapped out and attempts to compress and store them in a
 * RAM-based memory pool.  This can result in a significant I/O reduction on
 * the swap device and, in the case where decompressing from RAM is faster
 * than reading from the swap device, can also improve workload performance.
 *
 * Copyright (C) 2012  Seth Jennings <sjenning@linux.vnet.ibm.com>
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/module.h>
#include <linux/cpu.h>
#include <linux/highmem.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/types.h>
#include <linux/atomic.h>
#include <linux/rbtree.h>
#include <linux/swap.h>
#include <linux/crypto.h>
#include <linux/scatterlist.h>
#include <linux/mempool.h>
#include <linux/zpool.h>
#include <crypto/acompress.h>
#include <linux/zswap.h>
#include <linux/mm_types.h>
#include <linux/page-flags.h>
#include <linux/swapops.h>
#include <linux/writeback.h>
#include <linux/pagemap.h>
#include <linux/workqueue.h>

#include "swap.h"
#include "internal.h"

/*********************************
* statistics
**********************************/
/* Total bytes used by the compressed storage */
u64 zswap_pool_total_size;
/* The number of compressed pages currently stored in zswap */
atomic_t zswap_stored_pages = ATOMIC_INIT(0);
/* The number of same-value filled pages currently stored in zswap */
static atomic_t zswap_same_filled_pages = ATOMIC_INIT(0);

/*
 * The statistics below are not protected from concurrent access for
 * performance reasons, so they may not be 100% accurate.  However,
 * they do provide useful information on roughly how many times a
 * certain event is occurring.
 */

/* Pool limit was hit (see zswap_max_pool_percent) */
static u64 zswap_pool_limit_hit;
/* Pages written back when pool limit was reached */
static u64 zswap_written_back_pages;
/* Store failed due to a reclaim failure after pool limit was reached */
static u64 zswap_reject_reclaim_fail;
/* Compressed page was too big for the allocator to (optimally) store */
static u64 zswap_reject_compress_poor;
/* Store failed because underlying allocator could not get memory */
static u64 zswap_reject_alloc_fail;
/* Store failed because the entry metadata could not be allocated (rare) */
static u64 zswap_reject_kmemcache_fail;
/* Duplicate store was encountered (rare) */
static u64 zswap_duplicate_entry;

/* Shrinker work queue */
static struct workqueue_struct *shrink_wq;
/* Pool limit was hit, we need to calm down */
static bool zswap_pool_reached_full;

/*********************************
* tunables
**********************************/

/* Sentinel stored in a string parameter when no usable value is available */
#define ZSWAP_PARAM_UNSET ""

static int zswap_setup(void);

/* Enable/disable zswap */
static bool zswap_enabled = IS_ENABLED(CONFIG_ZSWAP_DEFAULT_ON);
static int zswap_enabled_param_set(const char *,
				   const struct kernel_param *);
static const struct kernel_param_ops zswap_enabled_param_ops = {
	.set =		zswap_enabled_param_set,
	.get =		param_get_bool,
};
module_param_cb(enabled, &zswap_enabled_param_ops, &zswap_enabled, 0644);

/* Crypto compressor to use */
static char *zswap_compressor = CONFIG_ZSWAP_COMPRESSOR_DEFAULT;
static int zswap_compressor_param_set(const char *,
				      const struct kernel_param *);
static const struct kernel_param_ops zswap_compressor_param_ops = {
	.set =		zswap_compressor_param_set,
	.get =		param_get_charp,
	.free =		param_free_charp,
};
module_param_cb(compressor, &zswap_compressor_param_ops,
		&zswap_compressor, 0644);

/* Compressed storage zpool to use */
static
char *zswap_zpool_type = CONFIG_ZSWAP_ZPOOL_DEFAULT;
static int zswap_zpool_param_set(const char *, const struct kernel_param *);
static const struct kernel_param_ops zswap_zpool_param_ops = {
	.set =		zswap_zpool_param_set,
	.get =		param_get_charp,
	.free =		param_free_charp,
};
module_param_cb(zpool, &zswap_zpool_param_ops, &zswap_zpool_type, 0644);

/* The maximum percentage of memory that the compressed pool can occupy */
static unsigned int zswap_max_pool_percent = 20;
module_param_named(max_pool_percent, zswap_max_pool_percent, uint, 0644);

/* The threshold for accepting new pages after the max_pool_percent was hit */
static unsigned int zswap_accept_thr_percent = 90; /* of max pool size */
module_param_named(accept_threshold_percent, zswap_accept_thr_percent,
		   uint, 0644);

/*
 * Enable/disable handling same-value filled pages (enabled by default).
 * If disabled every page is considered non-same-value filled.
 */
static bool zswap_same_filled_pages_enabled = true;
module_param_named(same_filled_pages_enabled, zswap_same_filled_pages_enabled,
		   bool, 0644);

/* Enable/disable handling non-same-value filled pages (enabled by default) */
static bool zswap_non_same_filled_pages_enabled = true;
module_param_named(non_same_filled_pages_enabled, zswap_non_same_filled_pages_enabled,
		   bool, 0644);

/*
 * "exclusive_loads" module parameter; its default follows
 * CONFIG_ZSWAP_EXCLUSIVE_LOADS_DEFAULT_ON.
 */
static bool zswap_exclusive_loads_enabled = IS_ENABLED(
		CONFIG_ZSWAP_EXCLUSIVE_LOADS_DEFAULT_ON);
module_param_named(exclusive_loads, zswap_exclusive_loads_enabled, bool, 0644);

/* Number of zpools in zswap_pool (empirically determined for scalability) */
#define ZSWAP_NR_ZPOOLS 32

/*********************************
* data structures
**********************************/

/*
 * Per-CPU compression context of a pool: the acomp transform and request,
 * the wait object used to wait for a request to complete, and pointers to
 * the per-CPU destination buffer and the mutex protecting it.
 */
struct crypto_acomp_ctx {
	struct crypto_acomp *acomp;
	struct acomp_req *req;
	struct crypto_wait wait;
	u8 *dstmem;
	struct mutex *mutex;
};

/*
 * The lock ordering is zswap_tree.lock -> zswap_pool.lru_lock.
 * The only case where lru_lock is not acquired while holding tree.lock is
 * when a zswap_entry is taken off the lru for writeback, in that case it
 * needs to be verified that it's still valid in the tree.
 */
struct zswap_pool {
	struct zpool *zpools[ZSWAP_NR_ZPOOLS];
	struct crypto_acomp_ctx __percpu *acomp_ctx;
	struct kref kref;
	struct list_head list;
	struct work_struct release_work;
	struct work_struct shrink_work;
	struct hlist_node node;
	char tfm_name[CRYPTO_MAX_ALG_NAME];
	struct list_head lru;
	spinlock_t lru_lock;
};

/*
 * struct zswap_entry
 *
 * This structure contains the metadata for tracking a single compressed
 * page within zswap.
 *
 * rbnode - links the entry into red-black tree for the appropriate swap type
 * swpentry - associated swap entry, the offset indexes into the red-black tree
 * refcount - the number of outstanding references to the entry. This is needed
 *            to protect against premature freeing of the entry by
 *            concurrent calls to load, invalidate, and writeback.  The lock
 *            for the zswap_tree structure that contains the entry must
 *            be held while changing the refcount.  Since the lock must
 *            be held, there is no reason to also make refcount atomic.
 * length - the length in bytes of the compressed page data.  Needed during
 *          decompression. For a same value filled page length is 0, and both
 *          pool and lru are invalid and must be ignored.
 * pool - the zswap_pool the entry's data is in
 * handle - zpool allocation handle that stores the compressed page data
 * value - value of the same-value filled pages which have same content
 * objcg - the obj_cgroup that the compressed memory is charged to
 * lru - handle to the pool's lru used to evict pages.
200 */ 201 struct zswap_entry { 202 struct rb_node rbnode; 203 swp_entry_t swpentry; 204 int refcount; 205 unsigned int length; 206 struct zswap_pool *pool; 207 union { 208 unsigned long handle; 209 unsigned long value; 210 }; 211 struct obj_cgroup *objcg; 212 struct list_head lru; 213 }; 214 215 /* 216 * The tree lock in the zswap_tree struct protects a few things: 217 * - the rbtree 218 * - the refcount field of each entry in the tree 219 */ 220 struct zswap_tree { 221 struct rb_root rbroot; 222 spinlock_t lock; 223 }; 224 225 static struct zswap_tree *zswap_trees[MAX_SWAPFILES]; 226 227 /* RCU-protected iteration */ 228 static LIST_HEAD(zswap_pools); 229 /* protects zswap_pools list modification */ 230 static DEFINE_SPINLOCK(zswap_pools_lock); 231 /* pool counter to provide unique names to zpool */ 232 static atomic_t zswap_pools_count = ATOMIC_INIT(0); 233 234 enum zswap_init_type { 235 ZSWAP_UNINIT, 236 ZSWAP_INIT_SUCCEED, 237 ZSWAP_INIT_FAILED 238 }; 239 240 static enum zswap_init_type zswap_init_state; 241 242 /* used to ensure the integrity of initialization */ 243 static DEFINE_MUTEX(zswap_init_lock); 244 245 /* init completed, but couldn't create the initial pool */ 246 static bool zswap_has_pool; 247 248 /********************************* 249 * helpers and fwd declarations 250 **********************************/ 251 252 #define zswap_pool_debug(msg, p) \ 253 pr_debug("%s pool %s/%s\n", msg, (p)->tfm_name, \ 254 zpool_get_type((p)->zpools[0])) 255 256 static int zswap_writeback_entry(struct zswap_entry *entry, 257 struct zswap_tree *tree); 258 static int zswap_pool_get(struct zswap_pool *pool); 259 static void zswap_pool_put(struct zswap_pool *pool); 260 261 static bool zswap_is_full(void) 262 { 263 return totalram_pages() * zswap_max_pool_percent / 100 < 264 DIV_ROUND_UP(zswap_pool_total_size, PAGE_SIZE); 265 } 266 267 static bool zswap_can_accept(void) 268 { 269 return totalram_pages() * zswap_accept_thr_percent / 100 * 270 zswap_max_pool_percent / 100 
> 271 DIV_ROUND_UP(zswap_pool_total_size, PAGE_SIZE); 272 } 273 274 static void zswap_update_total_size(void) 275 { 276 struct zswap_pool *pool; 277 u64 total = 0; 278 int i; 279 280 rcu_read_lock(); 281 282 list_for_each_entry_rcu(pool, &zswap_pools, list) 283 for (i = 0; i < ZSWAP_NR_ZPOOLS; i++) 284 total += zpool_get_total_size(pool->zpools[i]); 285 286 rcu_read_unlock(); 287 288 zswap_pool_total_size = total; 289 } 290 291 /********************************* 292 * zswap entry functions 293 **********************************/ 294 static struct kmem_cache *zswap_entry_cache; 295 296 static struct zswap_entry *zswap_entry_cache_alloc(gfp_t gfp) 297 { 298 struct zswap_entry *entry; 299 entry = kmem_cache_alloc(zswap_entry_cache, gfp); 300 if (!entry) 301 return NULL; 302 entry->refcount = 1; 303 RB_CLEAR_NODE(&entry->rbnode); 304 return entry; 305 } 306 307 static void zswap_entry_cache_free(struct zswap_entry *entry) 308 { 309 kmem_cache_free(zswap_entry_cache, entry); 310 } 311 312 /********************************* 313 * rbtree functions 314 **********************************/ 315 static struct zswap_entry *zswap_rb_search(struct rb_root *root, pgoff_t offset) 316 { 317 struct rb_node *node = root->rb_node; 318 struct zswap_entry *entry; 319 pgoff_t entry_offset; 320 321 while (node) { 322 entry = rb_entry(node, struct zswap_entry, rbnode); 323 entry_offset = swp_offset(entry->swpentry); 324 if (entry_offset > offset) 325 node = node->rb_left; 326 else if (entry_offset < offset) 327 node = node->rb_right; 328 else 329 return entry; 330 } 331 return NULL; 332 } 333 334 /* 335 * In the case that a entry with the same offset is found, a pointer to 336 * the existing entry is stored in dupentry and the function returns -EEXIST 337 */ 338 static int zswap_rb_insert(struct rb_root *root, struct zswap_entry *entry, 339 struct zswap_entry **dupentry) 340 { 341 struct rb_node **link = &root->rb_node, *parent = NULL; 342 struct zswap_entry *myentry; 343 pgoff_t 
myentry_offset, entry_offset = swp_offset(entry->swpentry); 344 345 while (*link) { 346 parent = *link; 347 myentry = rb_entry(parent, struct zswap_entry, rbnode); 348 myentry_offset = swp_offset(myentry->swpentry); 349 if (myentry_offset > entry_offset) 350 link = &(*link)->rb_left; 351 else if (myentry_offset < entry_offset) 352 link = &(*link)->rb_right; 353 else { 354 *dupentry = myentry; 355 return -EEXIST; 356 } 357 } 358 rb_link_node(&entry->rbnode, parent, link); 359 rb_insert_color(&entry->rbnode, root); 360 return 0; 361 } 362 363 static bool zswap_rb_erase(struct rb_root *root, struct zswap_entry *entry) 364 { 365 if (!RB_EMPTY_NODE(&entry->rbnode)) { 366 rb_erase(&entry->rbnode, root); 367 RB_CLEAR_NODE(&entry->rbnode); 368 return true; 369 } 370 return false; 371 } 372 373 static struct zpool *zswap_find_zpool(struct zswap_entry *entry) 374 { 375 int i = 0; 376 377 if (ZSWAP_NR_ZPOOLS > 1) 378 i = hash_ptr(entry, ilog2(ZSWAP_NR_ZPOOLS)); 379 380 return entry->pool->zpools[i]; 381 } 382 383 /* 384 * Carries out the common pattern of freeing and entry's zpool allocation, 385 * freeing the entry itself, and decrementing the number of stored pages. 
386 */ 387 static void zswap_free_entry(struct zswap_entry *entry) 388 { 389 if (entry->objcg) { 390 obj_cgroup_uncharge_zswap(entry->objcg, entry->length); 391 obj_cgroup_put(entry->objcg); 392 } 393 if (!entry->length) 394 atomic_dec(&zswap_same_filled_pages); 395 else { 396 spin_lock(&entry->pool->lru_lock); 397 list_del(&entry->lru); 398 spin_unlock(&entry->pool->lru_lock); 399 zpool_free(zswap_find_zpool(entry), entry->handle); 400 zswap_pool_put(entry->pool); 401 } 402 zswap_entry_cache_free(entry); 403 atomic_dec(&zswap_stored_pages); 404 zswap_update_total_size(); 405 } 406 407 /* caller must hold the tree lock */ 408 static void zswap_entry_get(struct zswap_entry *entry) 409 { 410 entry->refcount++; 411 } 412 413 /* caller must hold the tree lock 414 * remove from the tree and free it, if nobody reference the entry 415 */ 416 static void zswap_entry_put(struct zswap_tree *tree, 417 struct zswap_entry *entry) 418 { 419 int refcount = --entry->refcount; 420 421 WARN_ON_ONCE(refcount < 0); 422 if (refcount == 0) { 423 WARN_ON_ONCE(!RB_EMPTY_NODE(&entry->rbnode)); 424 zswap_free_entry(entry); 425 } 426 } 427 428 /* caller must hold the tree lock */ 429 static struct zswap_entry *zswap_entry_find_get(struct rb_root *root, 430 pgoff_t offset) 431 { 432 struct zswap_entry *entry; 433 434 entry = zswap_rb_search(root, offset); 435 if (entry) 436 zswap_entry_get(entry); 437 438 return entry; 439 } 440 441 /********************************* 442 * per-cpu code 443 **********************************/ 444 static DEFINE_PER_CPU(u8 *, zswap_dstmem); 445 /* 446 * If users dynamically change the zpool type and compressor at runtime, i.e. 447 * zswap is running, zswap can have more than one zpool on one cpu, but they 448 * are sharing dtsmem. So we need this mutex to be per-cpu. 
*/
static DEFINE_PER_CPU(struct mutex *, zswap_mutex);

/*
 * Allocate the destination buffer (two pages) and the mutex for @cpu on that
 * CPU's NUMA node and publish them in zswap_dstmem / zswap_mutex.
 * Returns 0 on success or -ENOMEM if either allocation fails.
 */
static int zswap_dstmem_prepare(unsigned int cpu)
{
	struct mutex *mutex;
	u8 *dst;

	dst = kmalloc_node(PAGE_SIZE * 2, GFP_KERNEL, cpu_to_node(cpu));
	if (!dst)
		return -ENOMEM;

	mutex = kmalloc_node(sizeof(*mutex), GFP_KERNEL, cpu_to_node(cpu));
	if (!mutex) {
		kfree(dst);
		return -ENOMEM;
	}

	mutex_init(mutex);
	per_cpu(zswap_dstmem, cpu) = dst;
	per_cpu(zswap_mutex, cpu) = mutex;
	return 0;
}

/*
 * Undo zswap_dstmem_prepare() for @cpu: free the mutex and the destination
 * buffer and reset both per-CPU pointers to NULL.  Always returns 0.
 */
static int zswap_dstmem_dead(unsigned int cpu)
{
	struct mutex *mutex;
	u8 *dst;

	mutex = per_cpu(zswap_mutex, cpu);
	kfree(mutex);
	per_cpu(zswap_mutex, cpu) = NULL;

	dst = per_cpu(zswap_dstmem, cpu);
	kfree(dst);
	per_cpu(zswap_dstmem, cpu) = NULL;

	return 0;
}

/*
 * Set up the compression context of the pool embedding @node for @cpu:
 * allocate the acomp transform named by pool->tfm_name and a request for it,
 * install the completion callback, and point the context at the CPU's shared
 * mutex and destination buffer.
 * Returns 0 on success or a negative errno if an allocation fails.
 */
static int zswap_cpu_comp_prepare(unsigned int cpu, struct hlist_node *node)
{
	struct zswap_pool *pool = hlist_entry(node, struct zswap_pool, node);
	struct crypto_acomp_ctx *acomp_ctx = per_cpu_ptr(pool->acomp_ctx, cpu);
	struct crypto_acomp *acomp;
	struct acomp_req *req;

	acomp = crypto_alloc_acomp_node(pool->tfm_name, 0, 0, cpu_to_node(cpu));
	if (IS_ERR(acomp)) {
		pr_err("could not alloc crypto acomp %s : %ld\n",
				pool->tfm_name, PTR_ERR(acomp));
		return PTR_ERR(acomp);
	}
	acomp_ctx->acomp = acomp;

	req = acomp_request_alloc(acomp_ctx->acomp);
	if (!req) {
		pr_err("could not alloc crypto acomp_request %s\n",
		       pool->tfm_name);
		crypto_free_acomp(acomp_ctx->acomp);
		return -ENOMEM;
	}
	acomp_ctx->req = req;

	crypto_init_wait(&acomp_ctx->wait);
	/*
	 * if the backend of acomp is async zip, crypto_req_done() will wakeup
	 * crypto_wait_req(); if the backend of acomp is scomp, the callback
	 * won't be called, crypto_wait_req() will return without blocking.
	 */
	acomp_request_set_callback(req, CRYPTO_TFM_REQ_MAY_BACKLOG,
				   crypto_req_done, &acomp_ctx->wait);

	acomp_ctx->mutex = per_cpu(zswap_mutex, cpu);
	acomp_ctx->dstmem = per_cpu(zswap_dstmem, cpu);

	return 0;
}

/*
 * Release the request and transform of the pool's compression context for
 * @cpu, skipping any that are NULL or an error pointer.  Always returns 0.
 */
static int zswap_cpu_comp_dead(unsigned int cpu, struct hlist_node *node)
{
	struct zswap_pool *pool = hlist_entry(node, struct zswap_pool, node);
	struct crypto_acomp_ctx *acomp_ctx = per_cpu_ptr(pool->acomp_ctx, cpu);

	if (!IS_ERR_OR_NULL(acomp_ctx)) {
		if (!IS_ERR_OR_NULL(acomp_ctx->req))
			acomp_request_free(acomp_ctx->req);
		if (!IS_ERR_OR_NULL(acomp_ctx->acomp))
			crypto_free_acomp(acomp_ctx->acomp);
	}

	return 0;
}

/*********************************
* pool functions
**********************************/

/*
 * Return the first pool on the zswap_pools list (the current pool), or NULL
 * if the list is empty.  Warns once if the list is empty although
 * zswap_has_pool is set.  No reference is taken.
 */
static struct zswap_pool *__zswap_pool_current(void)
{
	struct zswap_pool *pool;

	pool = list_first_or_null_rcu(&zswap_pools, typeof(*pool), list);
	WARN_ONCE(!pool && zswap_has_pool,
		  "%s: no page storage pool!\n", __func__);

	return pool;
}

/* Same as __zswap_pool_current(); the caller must hold zswap_pools_lock. */
static struct zswap_pool *zswap_pool_current(void)
{
	assert_spin_locked(&zswap_pools_lock);

	return __zswap_pool_current();
}

/*
 * Return the current pool with a reference taken, or NULL if there is no
 * pool or its reference could not be obtained.
 */
static struct zswap_pool *zswap_pool_current_get(void)
{
	struct zswap_pool *pool;

	rcu_read_lock();

	pool = __zswap_pool_current();
	if (!zswap_pool_get(pool))
		pool = NULL;

	rcu_read_unlock();

	return pool;
}

/*
 * Return the last pool on the zswap_pools list with a reference taken, or
 * NULL if the list is empty or the reference could not be obtained.
 */
static struct zswap_pool *zswap_pool_last_get(void)
{
	struct zswap_pool *pool, *last = NULL;

	rcu_read_lock();

	list_for_each_entry_rcu(pool, &zswap_pools, list)
		last = pool;
	WARN_ONCE(!last && zswap_has_pool,
		  "%s: no page storage pool!\n", __func__);
	if (!zswap_pool_get(last))
		last = NULL;

	rcu_read_unlock();

	return last;
}

/* type and compressor must be null-terminated */
static struct
zswap_pool *zswap_pool_find_get(char *type, char *compressor)
{
	struct zswap_pool *pool;

	assert_spin_locked(&zswap_pools_lock);

	list_for_each_entry_rcu(pool, &zswap_pools, list) {
		if (strcmp(pool->tfm_name, compressor))
			continue;
		/* all zpools share the same type */
		if (strcmp(zpool_get_type(pool->zpools[0]), type))
			continue;
		/* if we can't get it, it's about to be destroyed */
		if (!zswap_pool_get(pool))
			continue;
		return pool;
	}

	return NULL;
}

/*
 * If the entry is still valid in the tree, drop the initial ref and remove it
 * from the tree. This function must be called with an additional ref held,
 * otherwise it may race with another invalidation freeing the entry.
 */
static void zswap_invalidate_entry(struct zswap_tree *tree,
				   struct zswap_entry *entry)
{
	if (zswap_rb_erase(&tree->rbroot, entry))
		zswap_entry_put(tree, entry);
}

/*
 * Take the entry at the tail of @pool's LRU and try to write it back.
 *
 * Returns 0 if writeback was started and the entry was dropped from zswap,
 * -EINVAL if the LRU is empty, and -EAGAIN if the entry was invalidated
 * concurrently or writeback failed (in the latter case the entry is put back
 * on the LRU).
 */
static int zswap_reclaim_entry(struct zswap_pool *pool)
{
	struct zswap_entry *entry;
	struct zswap_tree *tree;
	pgoff_t swpoffset;
	int ret;

	/* Get an entry off the LRU */
	spin_lock(&pool->lru_lock);
	if (list_empty(&pool->lru)) {
		spin_unlock(&pool->lru_lock);
		return -EINVAL;
	}
	entry = list_last_entry(&pool->lru, struct zswap_entry, lru);
	list_del_init(&entry->lru);
	/*
	 * Once the lru lock is dropped, the entry might get freed. The
	 * swpoffset is copied to the stack, and entry isn't deref'd again
	 * until the entry is verified to still be alive in the tree.
	 */
	swpoffset = swp_offset(entry->swpentry);
	tree = zswap_trees[swp_type(entry->swpentry)];
	spin_unlock(&pool->lru_lock);

	/* Check for invalidate() race */
	spin_lock(&tree->lock);
	if (entry != zswap_rb_search(&tree->rbroot, swpoffset)) {
		ret = -EAGAIN;
		goto unlock;
	}
	/* Hold a reference to prevent a free during writeback */
	zswap_entry_get(entry);
	spin_unlock(&tree->lock);

	ret = zswap_writeback_entry(entry, tree);

	spin_lock(&tree->lock);
	if (ret) {
		/* Writeback failed, put entry back on LRU */
		spin_lock(&pool->lru_lock);
		list_move(&entry->lru, &pool->lru);
		spin_unlock(&pool->lru_lock);
		goto put_unlock;
	}

	/*
	 * Writeback started successfully, the page now belongs to the
	 * swapcache. Drop the entry from zswap - unless invalidate already
	 * took it out while we had the tree->lock released for IO.
	 */
	zswap_invalidate_entry(tree, entry);

put_unlock:
	/* Drop local reference */
	zswap_entry_put(tree, entry);
unlock:
	spin_unlock(&tree->lock);
	return ret ? -EAGAIN : 0;
}

/*
 * Work function of zswap_pool.shrink_work: reclaim entries from the pool
 * until zswap_can_accept() is true, a failure other than -EAGAIN occurs, or
 * MAX_RECLAIM_RETRIES -EAGAIN failures have accumulated.  Every failure is
 * counted in zswap_reject_reclaim_fail.  Drops one pool reference on exit
 * (presumably the one taken by whoever queued the work; the queuing site is
 * not in this part of the file).
 */
static void shrink_worker(struct work_struct *w)
{
	struct zswap_pool *pool = container_of(w, typeof(*pool),
						shrink_work);
	int ret, failures = 0;

	do {
		ret = zswap_reclaim_entry(pool);
		if (ret) {
			zswap_reject_reclaim_fail++;
			if (ret != -EAGAIN)
				break;
			if (++failures == MAX_RECLAIM_RETRIES)
				break;
		}
		cond_resched();
	} while (!zswap_can_accept());
	zswap_pool_put(pool);
}

/*
 * Create a pool using zpool type @type and compressor @compressor: allocate
 * ZSWAP_NR_ZPOOLS zpools, the per-CPU compression contexts, and register the
 * pool with the CPU hotplug state.  The new pool is returned with its kref
 * initialised and is NOT yet on the zswap_pools list.
 * Returns NULL on any failure, after releasing what was allocated.
 */
static struct zswap_pool *zswap_pool_create(char *type, char *compressor)
{
	int i;
	struct zswap_pool *pool;
	char name[38]; /* 'zswap' + 32 char (max) num + \0 */
	gfp_t gfp = __GFP_NORETRY | __GFP_NOWARN | __GFP_KSWAPD_RECLAIM;
	int ret;

	if (!zswap_has_pool) {
		/* if either are unset, pool initialization failed, and we
		 * need both params to be set correctly before trying to
		 * create a pool.
		 */
		if (!strcmp(type, ZSWAP_PARAM_UNSET))
			return NULL;
		if (!strcmp(compressor, ZSWAP_PARAM_UNSET))
			return NULL;
	}

	pool = kzalloc(sizeof(*pool), GFP_KERNEL);
	if (!pool)
		return NULL;

	for (i = 0; i < ZSWAP_NR_ZPOOLS; i++) {
		/* unique name for each pool specifically required by zsmalloc */
		snprintf(name, 38, "zswap%x",
			 atomic_inc_return(&zswap_pools_count));

		pool->zpools[i] = zpool_create_pool(type, name, gfp);
		if (!pool->zpools[i]) {
			pr_err("%s zpool not available\n", type);
			goto error;
		}
	}
	pr_debug("using %s zpool\n", zpool_get_type(pool->zpools[0]));

	strscpy(pool->tfm_name, compressor, sizeof(pool->tfm_name));

	pool->acomp_ctx = alloc_percpu(*pool->acomp_ctx);
	if (!pool->acomp_ctx) {
		pr_err("percpu alloc failed\n");
		goto error;
	}

	ret = cpuhp_state_add_instance(CPUHP_MM_ZSWP_POOL_PREPARE,
				       &pool->node);
	if (ret)
		goto error;
	pr_debug("using %s compressor\n", pool->tfm_name);

	/* being the current pool takes 1 ref; this func expects the
	 * caller to always add the new pool as the current pool
	 */
	kref_init(&pool->kref);
	INIT_LIST_HEAD(&pool->list);
	INIT_LIST_HEAD(&pool->lru);
	spin_lock_init(&pool->lru_lock);
	INIT_WORK(&pool->shrink_work, shrink_worker);

	zswap_pool_debug("created", pool);

	return pool;

error:
	if (pool->acomp_ctx)
		free_percpu(pool->acomp_ctx);
	/* i is the number of zpools successfully created so far */
	while (i--)
		zpool_destroy_pool(pool->zpools[i]);
	kfree(pool);
	return NULL;
}

/*
 * Create a pool from the zswap_compressor / zswap_zpool_type parameters.
 * A parameter naming something unavailable is replaced by its compile-time
 * default; if the default is unavailable too, the parameter is set to
 * ZSWAP_PARAM_UNSET.  Returns NULL if either ends up unavailable or pool
 * creation fails.
 */
static struct zswap_pool *__zswap_pool_create_fallback(void)
{
	bool has_comp, has_zpool;

	has_comp = crypto_has_acomp(zswap_compressor, 0, 0);
	if (!has_comp && strcmp(zswap_compressor,
				CONFIG_ZSWAP_COMPRESSOR_DEFAULT)) {
		pr_err("compressor %s not available, using default %s\n",
		       zswap_compressor, CONFIG_ZSWAP_COMPRESSOR_DEFAULT);
		param_free_charp(&zswap_compressor);
		zswap_compressor = CONFIG_ZSWAP_COMPRESSOR_DEFAULT;
		has_comp = crypto_has_acomp(zswap_compressor, 0, 0);
	}
	if (!has_comp) {
		pr_err("default compressor %s not available\n",
		       zswap_compressor);
		param_free_charp(&zswap_compressor);
		zswap_compressor = ZSWAP_PARAM_UNSET;
	}

	has_zpool = zpool_has_pool(zswap_zpool_type);
	if (!has_zpool && strcmp(zswap_zpool_type,
				 CONFIG_ZSWAP_ZPOOL_DEFAULT)) {
		pr_err("zpool %s not available, using default %s\n",
		       zswap_zpool_type, CONFIG_ZSWAP_ZPOOL_DEFAULT);
		param_free_charp(&zswap_zpool_type);
		zswap_zpool_type = CONFIG_ZSWAP_ZPOOL_DEFAULT;
		has_zpool = zpool_has_pool(zswap_zpool_type);
	}
	if (!has_zpool) {
		pr_err("default zpool %s not available\n",
		       zswap_zpool_type);
		param_free_charp(&zswap_zpool_type);
		zswap_zpool_type = ZSWAP_PARAM_UNSET;
	}

	if (!has_comp || !has_zpool)
		return NULL;

	return zswap_pool_create(zswap_zpool_type, zswap_compressor);
}

/*
 * Tear down a pool: unregister it from the CPU hotplug state, free the
 * per-CPU contexts, destroy every zpool and free the pool itself.
 */
static void zswap_pool_destroy(struct zswap_pool *pool)
{
	int i;

	zswap_pool_debug("destroying", pool);

	cpuhp_state_remove_instance(CPUHP_MM_ZSWP_POOL_PREPARE, &pool->node);
	free_percpu(pool->acomp_ctx);
	for (i = 0; i < ZSWAP_NR_ZPOOLS; i++)
		zpool_destroy_pool(pool->zpools[i]);
	kfree(pool);
}

/*
 * Try to take a reference on @pool.  Returns 0 if @pool is NULL, otherwise
 * the result of kref_get_unless_zero() (non-zero on success).
 */
static int __must_check zswap_pool_get(struct zswap_pool *pool)
{
	if (!pool)
		return 0;

	return kref_get_unless_zero(&pool->kref);
}

/*
 * Work function scheduled by __zswap_pool_empty(): wait for an RCU grace
 * period, then destroy the pool.
 */
static void __zswap_pool_release(struct work_struct *work)
{
	struct zswap_pool *pool = container_of(work, typeof(*pool),
						release_work);

	synchronize_rcu();

	/* nobody should have been able to get a kref... */
	WARN_ON(kref_get_unless_zero(&pool->kref));

	/* pool is now off zswap_pools list and has no references. */
	zswap_pool_destroy(pool);
}

/*
 * kref release callback: unlink the pool from zswap_pools under
 * zswap_pools_lock and defer its destruction to a work item.
 */
static void __zswap_pool_empty(struct kref *kref)
{
	struct zswap_pool *pool;

	pool = container_of(kref, typeof(*pool), kref);

	spin_lock(&zswap_pools_lock);

	WARN_ON(pool == zswap_pool_current());

	list_del_rcu(&pool->list);

	INIT_WORK(&pool->release_work, __zswap_pool_release);
	schedule_work(&pool->release_work);

	spin_unlock(&zswap_pools_lock);
}

/* Drop a reference on @pool; the last put triggers __zswap_pool_empty(). */
static void zswap_pool_put(struct zswap_pool *pool)
{
	kref_put(&pool->kref, __zswap_pool_empty);
}

/*********************************
* param callbacks
**********************************/

/*
 * Returns false only when @s equals the parameter's current value and a pool
 * already exists; true otherwise.
 */
static bool zswap_pool_changed(const char *s, const struct kernel_param *kp)
{
	/* no change required */
	if (!strcmp(s, *(char **)kp->arg) && zswap_has_pool)
		return false;
	return true;
}

/*
 * Common setter for the "zpool" and "compressor" parameters.  Exactly one of
 * @type / @compressor is NULL; the NULL one is the parameter being changed
 * and takes its new value from @val.
 */
/* val must be a null-terminated string */
static int __zswap_param_set(const char *val, const struct kernel_param *kp,
			     char *type, char *compressor)
{
	struct zswap_pool *pool, *put_pool = NULL;
	char *s = strstrip((char *)val);
	int ret = 0;
	bool new_pool = false;

	mutex_lock(&zswap_init_lock);
	switch (zswap_init_state) {
	case ZSWAP_UNINIT:
		/* if this is load-time (pre-init) param setting,
		 * don't create a pool; that's done during init.
		 */
		ret = param_set_charp(s, kp);
		break;
	case ZSWAP_INIT_SUCCEED:
		new_pool = zswap_pool_changed(s, kp);
		break;
	case ZSWAP_INIT_FAILED:
		pr_err("can't set param, initialization failed\n");
		ret = -ENODEV;
	}
	mutex_unlock(&zswap_init_lock);

	/* no need to create a new pool, return directly */
	if (!new_pool)
		return ret;

	if (!type) {
		if (!zpool_has_pool(s)) {
			pr_err("zpool %s not available\n", s);
			return -ENOENT;
		}
		type = s;
	} else if (!compressor) {
		if (!crypto_has_acomp(s, 0, 0)) {
			pr_err("compressor %s not available\n", s);
			return -ENOENT;
		}
		compressor = s;
	} else {
		WARN_ON(1);
		return -EINVAL;
	}

	spin_lock(&zswap_pools_lock);

	pool = zswap_pool_find_get(type, compressor);
	if (pool) {
		zswap_pool_debug("using existing", pool);
		WARN_ON(pool == zswap_pool_current());
		list_del_rcu(&pool->list);
	}

	spin_unlock(&zswap_pools_lock);

	if (!pool)
		pool = zswap_pool_create(type, compressor);

	if (pool)
		ret = param_set_charp(s, kp);
	else
		ret = -EINVAL;

	spin_lock(&zswap_pools_lock);

	if (!ret) {
		put_pool = zswap_pool_current();
		list_add_rcu(&pool->list, &zswap_pools);
		zswap_has_pool = true;
	} else if (pool) {
		/* add the possibly pre-existing pool to the end of the pools
		 * list; if it's new (and empty) then it'll be removed and
		 * destroyed by the put after we drop the lock
		 */
		list_add_tail_rcu(&pool->list, &zswap_pools);
		put_pool = pool;
	}

	spin_unlock(&zswap_pools_lock);

	if (!zswap_has_pool && !pool) {
		/* if initial pool creation failed, and this pool creation also
		 * failed, maybe both compressor and zpool params were bad.
		 * Allow changing this param, so pool creation will succeed
		 * when the other param is changed. We already verified this
		 * param is ok in the zpool_has_pool() or crypto_has_acomp()
		 * checks above.
		 */
		ret = param_set_charp(s, kp);
	}

	/* drop the ref from either the old current pool,
	 * or the new pool we failed to add
	 */
	if (put_pool)
		zswap_pool_put(put_pool);

	return ret;
}

/* "compressor" setter: keeps the current zpool type, changes the compressor */
static int zswap_compressor_param_set(const char *val,
				      const struct kernel_param *kp)
{
	return __zswap_param_set(val, kp, zswap_zpool_type, NULL);
}

/* "zpool" setter: keeps the current compressor, changes the zpool type */
static int zswap_zpool_param_set(const char *val,
				 const struct kernel_param *kp)
{
	return __zswap_param_set(val, kp, NULL, zswap_compressor);
}

/*
 * "enabled" setter.  Before the system is running only the parameter is
 * stored.  Afterwards zswap_setup() is run first if zswap is still
 * uninitialised, and the parameter is only updated when a pool exists;
 * otherwise -ENODEV is returned.
 */
static int zswap_enabled_param_set(const char *val,
				   const struct kernel_param *kp)
{
	int ret = -ENODEV;

	/* if this is load-time (pre-init) param setting, only set param. */
	if (system_state != SYSTEM_RUNNING)
		return param_set_bool(val, kp);

	mutex_lock(&zswap_init_lock);
	switch (zswap_init_state) {
	case ZSWAP_UNINIT:
		if (zswap_setup())
			break;
		fallthrough;
	case ZSWAP_INIT_SUCCEED:
		if (!zswap_has_pool)
			pr_err("can't enable, no pool configured\n");
		else
			ret = param_set_bool(val, kp);
		break;
	case ZSWAP_INIT_FAILED:
		pr_err("can't enable, initialization failed\n");
	}
	mutex_unlock(&zswap_init_lock);

	return ret;
}

/*********************************
* writeback code
**********************************/
/*
 * Attempts to free an entry by adding a page to the swap cache,
 * decompressing the entry data into the page, and issuing a
 * bio write to write the page back to the swap device.
 *
 * This can be thought of as a "resumed writeback" of the page
 * to the swap device.
We are basically resuming the same swap 1050 * writeback path that was intercepted with the zswap_store() 1051 * in the first place. After the page has been decompressed into 1052 * the swap cache, the compressed version stored by zswap can be 1053 * freed. 1054 */ 1055 static int zswap_writeback_entry(struct zswap_entry *entry, 1056 struct zswap_tree *tree) 1057 { 1058 swp_entry_t swpentry = entry->swpentry; 1059 struct page *page; 1060 struct scatterlist input, output; 1061 struct crypto_acomp_ctx *acomp_ctx; 1062 struct zpool *pool = zswap_find_zpool(entry); 1063 bool page_was_allocated; 1064 u8 *src, *tmp = NULL; 1065 unsigned int dlen; 1066 int ret; 1067 struct writeback_control wbc = { 1068 .sync_mode = WB_SYNC_NONE, 1069 }; 1070 1071 if (!zpool_can_sleep_mapped(pool)) { 1072 tmp = kmalloc(PAGE_SIZE, GFP_KERNEL); 1073 if (!tmp) 1074 return -ENOMEM; 1075 } 1076 1077 /* try to allocate swap cache page */ 1078 page = __read_swap_cache_async(swpentry, GFP_KERNEL, NULL, 0, 1079 &page_was_allocated); 1080 if (!page) { 1081 ret = -ENOMEM; 1082 goto fail; 1083 } 1084 1085 /* Found an existing page, we raced with load/swapin */ 1086 if (!page_was_allocated) { 1087 put_page(page); 1088 ret = -EEXIST; 1089 goto fail; 1090 } 1091 1092 /* 1093 * Page is locked, and the swapcache is now secured against 1094 * concurrent swapping to and from the slot. Verify that the 1095 * swap entry hasn't been invalidated and recycled behind our 1096 * backs (our zswap_entry reference doesn't prevent that), to 1097 * avoid overwriting a new swap page with old compressed data. 
1098 */ 1099 spin_lock(&tree->lock); 1100 if (zswap_rb_search(&tree->rbroot, swp_offset(entry->swpentry)) != entry) { 1101 spin_unlock(&tree->lock); 1102 delete_from_swap_cache(page_folio(page)); 1103 ret = -ENOMEM; 1104 goto fail; 1105 } 1106 spin_unlock(&tree->lock); 1107 1108 /* decompress */ 1109 acomp_ctx = raw_cpu_ptr(entry->pool->acomp_ctx); 1110 dlen = PAGE_SIZE; 1111 1112 src = zpool_map_handle(pool, entry->handle, ZPOOL_MM_RO); 1113 if (!zpool_can_sleep_mapped(pool)) { 1114 memcpy(tmp, src, entry->length); 1115 src = tmp; 1116 zpool_unmap_handle(pool, entry->handle); 1117 } 1118 1119 mutex_lock(acomp_ctx->mutex); 1120 sg_init_one(&input, src, entry->length); 1121 sg_init_table(&output, 1); 1122 sg_set_page(&output, page, PAGE_SIZE, 0); 1123 acomp_request_set_params(acomp_ctx->req, &input, &output, entry->length, dlen); 1124 ret = crypto_wait_req(crypto_acomp_decompress(acomp_ctx->req), &acomp_ctx->wait); 1125 dlen = acomp_ctx->req->dlen; 1126 mutex_unlock(acomp_ctx->mutex); 1127 1128 if (!zpool_can_sleep_mapped(pool)) 1129 kfree(tmp); 1130 else 1131 zpool_unmap_handle(pool, entry->handle); 1132 1133 BUG_ON(ret); 1134 BUG_ON(dlen != PAGE_SIZE); 1135 1136 /* page is up to date */ 1137 SetPageUptodate(page); 1138 1139 /* move it to the tail of the inactive list after end_writeback */ 1140 SetPageReclaim(page); 1141 1142 /* start writeback */ 1143 __swap_writepage(page, &wbc); 1144 put_page(page); 1145 zswap_written_back_pages++; 1146 1147 return ret; 1148 1149 fail: 1150 if (!zpool_can_sleep_mapped(pool)) 1151 kfree(tmp); 1152 1153 /* 1154 * If we get here because the page is already in swapcache, a 1155 * load may be happening concurrently. It is safe and okay to 1156 * not free the entry. It is also okay to return !0. 
1157 */ 1158 return ret; 1159 } 1160 1161 static int zswap_is_page_same_filled(void *ptr, unsigned long *value) 1162 { 1163 unsigned long *page; 1164 unsigned long val; 1165 unsigned int pos, last_pos = PAGE_SIZE / sizeof(*page) - 1; 1166 1167 page = (unsigned long *)ptr; 1168 val = page[0]; 1169 1170 if (val != page[last_pos]) 1171 return 0; 1172 1173 for (pos = 1; pos < last_pos; pos++) { 1174 if (val != page[pos]) 1175 return 0; 1176 } 1177 1178 *value = val; 1179 1180 return 1; 1181 } 1182 1183 static void zswap_fill_page(void *ptr, unsigned long value) 1184 { 1185 unsigned long *page; 1186 1187 page = (unsigned long *)ptr; 1188 memset_l(page, value, PAGE_SIZE / sizeof(unsigned long)); 1189 } 1190 1191 bool zswap_store(struct folio *folio) 1192 { 1193 swp_entry_t swp = folio->swap; 1194 int type = swp_type(swp); 1195 pgoff_t offset = swp_offset(swp); 1196 struct page *page = &folio->page; 1197 struct zswap_tree *tree = zswap_trees[type]; 1198 struct zswap_entry *entry, *dupentry; 1199 struct scatterlist input, output; 1200 struct crypto_acomp_ctx *acomp_ctx; 1201 struct obj_cgroup *objcg = NULL; 1202 struct zswap_pool *pool; 1203 struct zpool *zpool; 1204 unsigned int dlen = PAGE_SIZE; 1205 unsigned long handle, value; 1206 char *buf; 1207 u8 *src, *dst; 1208 gfp_t gfp; 1209 int ret; 1210 1211 VM_WARN_ON_ONCE(!folio_test_locked(folio)); 1212 VM_WARN_ON_ONCE(!folio_test_swapcache(folio)); 1213 1214 /* Large folios aren't supported */ 1215 if (folio_test_large(folio)) 1216 return false; 1217 1218 if (!zswap_enabled || !tree) 1219 return false; 1220 1221 /* 1222 * If this is a duplicate, it must be removed before attempting to store 1223 * it, otherwise, if the store fails the old page won't be removed from 1224 * the tree, and it might be written back overriding the new data. 
1225 */ 1226 spin_lock(&tree->lock); 1227 dupentry = zswap_rb_search(&tree->rbroot, offset); 1228 if (dupentry) { 1229 zswap_duplicate_entry++; 1230 zswap_invalidate_entry(tree, dupentry); 1231 } 1232 spin_unlock(&tree->lock); 1233 1234 /* 1235 * XXX: zswap reclaim does not work with cgroups yet. Without a 1236 * cgroup-aware entry LRU, we will push out entries system-wide based on 1237 * local cgroup limits. 1238 */ 1239 objcg = get_obj_cgroup_from_folio(folio); 1240 if (objcg && !obj_cgroup_may_zswap(objcg)) 1241 goto reject; 1242 1243 /* reclaim space if needed */ 1244 if (zswap_is_full()) { 1245 zswap_pool_limit_hit++; 1246 zswap_pool_reached_full = true; 1247 goto shrink; 1248 } 1249 1250 if (zswap_pool_reached_full) { 1251 if (!zswap_can_accept()) 1252 goto shrink; 1253 else 1254 zswap_pool_reached_full = false; 1255 } 1256 1257 /* allocate entry */ 1258 entry = zswap_entry_cache_alloc(GFP_KERNEL); 1259 if (!entry) { 1260 zswap_reject_kmemcache_fail++; 1261 goto reject; 1262 } 1263 1264 if (zswap_same_filled_pages_enabled) { 1265 src = kmap_atomic(page); 1266 if (zswap_is_page_same_filled(src, &value)) { 1267 kunmap_atomic(src); 1268 entry->swpentry = swp_entry(type, offset); 1269 entry->length = 0; 1270 entry->value = value; 1271 atomic_inc(&zswap_same_filled_pages); 1272 goto insert_entry; 1273 } 1274 kunmap_atomic(src); 1275 } 1276 1277 if (!zswap_non_same_filled_pages_enabled) 1278 goto freepage; 1279 1280 /* if entry is successfully added, it keeps the reference */ 1281 entry->pool = zswap_pool_current_get(); 1282 if (!entry->pool) 1283 goto freepage; 1284 1285 /* compress */ 1286 acomp_ctx = raw_cpu_ptr(entry->pool->acomp_ctx); 1287 1288 mutex_lock(acomp_ctx->mutex); 1289 1290 dst = acomp_ctx->dstmem; 1291 sg_init_table(&input, 1); 1292 sg_set_page(&input, page, PAGE_SIZE, 0); 1293 1294 /* zswap_dstmem is of size (PAGE_SIZE * 2). 
Reflect same in sg_list */ 1295 sg_init_one(&output, dst, PAGE_SIZE * 2); 1296 acomp_request_set_params(acomp_ctx->req, &input, &output, PAGE_SIZE, dlen); 1297 /* 1298 * it maybe looks a little bit silly that we send an asynchronous request, 1299 * then wait for its completion synchronously. This makes the process look 1300 * synchronous in fact. 1301 * Theoretically, acomp supports users send multiple acomp requests in one 1302 * acomp instance, then get those requests done simultaneously. but in this 1303 * case, zswap actually does store and load page by page, there is no 1304 * existing method to send the second page before the first page is done 1305 * in one thread doing zwap. 1306 * but in different threads running on different cpu, we have different 1307 * acomp instance, so multiple threads can do (de)compression in parallel. 1308 */ 1309 ret = crypto_wait_req(crypto_acomp_compress(acomp_ctx->req), &acomp_ctx->wait); 1310 dlen = acomp_ctx->req->dlen; 1311 1312 if (ret) 1313 goto put_dstmem; 1314 1315 /* store */ 1316 zpool = zswap_find_zpool(entry); 1317 gfp = __GFP_NORETRY | __GFP_NOWARN | __GFP_KSWAPD_RECLAIM; 1318 if (zpool_malloc_support_movable(zpool)) 1319 gfp |= __GFP_HIGHMEM | __GFP_MOVABLE; 1320 ret = zpool_malloc(zpool, dlen, gfp, &handle); 1321 if (ret == -ENOSPC) { 1322 zswap_reject_compress_poor++; 1323 goto put_dstmem; 1324 } 1325 if (ret) { 1326 zswap_reject_alloc_fail++; 1327 goto put_dstmem; 1328 } 1329 buf = zpool_map_handle(zpool, handle, ZPOOL_MM_WO); 1330 memcpy(buf, dst, dlen); 1331 zpool_unmap_handle(zpool, handle); 1332 mutex_unlock(acomp_ctx->mutex); 1333 1334 /* populate entry */ 1335 entry->swpentry = swp_entry(type, offset); 1336 entry->handle = handle; 1337 entry->length = dlen; 1338 1339 insert_entry: 1340 entry->objcg = objcg; 1341 if (objcg) { 1342 obj_cgroup_charge_zswap(objcg, entry->length); 1343 /* Account before objcg ref is moved to tree */ 1344 count_objcg_event(objcg, ZSWPOUT); 1345 } 1346 1347 /* map */ 1348 
spin_lock(&tree->lock); 1349 /* 1350 * A duplicate entry should have been removed at the beginning of this 1351 * function. Since the swap entry should be pinned, if a duplicate is 1352 * found again here it means that something went wrong in the swap 1353 * cache. 1354 */ 1355 while (zswap_rb_insert(&tree->rbroot, entry, &dupentry) == -EEXIST) { 1356 WARN_ON(1); 1357 zswap_duplicate_entry++; 1358 zswap_invalidate_entry(tree, dupentry); 1359 } 1360 if (entry->length) { 1361 spin_lock(&entry->pool->lru_lock); 1362 list_add(&entry->lru, &entry->pool->lru); 1363 spin_unlock(&entry->pool->lru_lock); 1364 } 1365 spin_unlock(&tree->lock); 1366 1367 /* update stats */ 1368 atomic_inc(&zswap_stored_pages); 1369 zswap_update_total_size(); 1370 count_vm_event(ZSWPOUT); 1371 1372 return true; 1373 1374 put_dstmem: 1375 mutex_unlock(acomp_ctx->mutex); 1376 zswap_pool_put(entry->pool); 1377 freepage: 1378 zswap_entry_cache_free(entry); 1379 reject: 1380 if (objcg) 1381 obj_cgroup_put(objcg); 1382 return false; 1383 1384 shrink: 1385 pool = zswap_pool_last_get(); 1386 if (pool) 1387 queue_work(shrink_wq, &pool->shrink_work); 1388 goto reject; 1389 } 1390 1391 bool zswap_load(struct folio *folio) 1392 { 1393 swp_entry_t swp = folio->swap; 1394 int type = swp_type(swp); 1395 pgoff_t offset = swp_offset(swp); 1396 struct page *page = &folio->page; 1397 struct zswap_tree *tree = zswap_trees[type]; 1398 struct zswap_entry *entry; 1399 struct scatterlist input, output; 1400 struct crypto_acomp_ctx *acomp_ctx; 1401 u8 *src, *dst, *tmp; 1402 struct zpool *zpool; 1403 unsigned int dlen; 1404 bool ret; 1405 1406 VM_WARN_ON_ONCE(!folio_test_locked(folio)); 1407 1408 /* find */ 1409 spin_lock(&tree->lock); 1410 entry = zswap_entry_find_get(&tree->rbroot, offset); 1411 if (!entry) { 1412 spin_unlock(&tree->lock); 1413 return false; 1414 } 1415 spin_unlock(&tree->lock); 1416 1417 if (!entry->length) { 1418 dst = kmap_atomic(page); 1419 zswap_fill_page(dst, entry->value); 1420 
kunmap_atomic(dst); 1421 ret = true; 1422 goto stats; 1423 } 1424 1425 zpool = zswap_find_zpool(entry); 1426 if (!zpool_can_sleep_mapped(zpool)) { 1427 tmp = kmalloc(entry->length, GFP_KERNEL); 1428 if (!tmp) { 1429 ret = false; 1430 goto freeentry; 1431 } 1432 } 1433 1434 /* decompress */ 1435 dlen = PAGE_SIZE; 1436 src = zpool_map_handle(zpool, entry->handle, ZPOOL_MM_RO); 1437 1438 if (!zpool_can_sleep_mapped(zpool)) { 1439 memcpy(tmp, src, entry->length); 1440 src = tmp; 1441 zpool_unmap_handle(zpool, entry->handle); 1442 } 1443 1444 acomp_ctx = raw_cpu_ptr(entry->pool->acomp_ctx); 1445 mutex_lock(acomp_ctx->mutex); 1446 sg_init_one(&input, src, entry->length); 1447 sg_init_table(&output, 1); 1448 sg_set_page(&output, page, PAGE_SIZE, 0); 1449 acomp_request_set_params(acomp_ctx->req, &input, &output, entry->length, dlen); 1450 if (crypto_wait_req(crypto_acomp_decompress(acomp_ctx->req), &acomp_ctx->wait)) 1451 WARN_ON(1); 1452 mutex_unlock(acomp_ctx->mutex); 1453 1454 if (zpool_can_sleep_mapped(zpool)) 1455 zpool_unmap_handle(zpool, entry->handle); 1456 else 1457 kfree(tmp); 1458 1459 ret = true; 1460 stats: 1461 count_vm_event(ZSWPIN); 1462 if (entry->objcg) 1463 count_objcg_event(entry->objcg, ZSWPIN); 1464 freeentry: 1465 spin_lock(&tree->lock); 1466 if (ret && zswap_exclusive_loads_enabled) { 1467 zswap_invalidate_entry(tree, entry); 1468 folio_mark_dirty(folio); 1469 } else if (entry->length) { 1470 spin_lock(&entry->pool->lru_lock); 1471 list_move(&entry->lru, &entry->pool->lru); 1472 spin_unlock(&entry->pool->lru_lock); 1473 } 1474 zswap_entry_put(tree, entry); 1475 spin_unlock(&tree->lock); 1476 1477 return ret; 1478 } 1479 1480 void zswap_invalidate(int type, pgoff_t offset) 1481 { 1482 struct zswap_tree *tree = zswap_trees[type]; 1483 struct zswap_entry *entry; 1484 1485 /* find */ 1486 spin_lock(&tree->lock); 1487 entry = zswap_rb_search(&tree->rbroot, offset); 1488 if (!entry) { 1489 /* entry was written back */ 1490 spin_unlock(&tree->lock); 1491 
return; 1492 } 1493 zswap_invalidate_entry(tree, entry); 1494 spin_unlock(&tree->lock); 1495 } 1496 1497 void zswap_swapon(int type) 1498 { 1499 struct zswap_tree *tree; 1500 1501 tree = kzalloc(sizeof(*tree), GFP_KERNEL); 1502 if (!tree) { 1503 pr_err("alloc failed, zswap disabled for swap type %d\n", type); 1504 return; 1505 } 1506 1507 tree->rbroot = RB_ROOT; 1508 spin_lock_init(&tree->lock); 1509 zswap_trees[type] = tree; 1510 } 1511 1512 void zswap_swapoff(int type) 1513 { 1514 struct zswap_tree *tree = zswap_trees[type]; 1515 struct zswap_entry *entry, *n; 1516 1517 if (!tree) 1518 return; 1519 1520 /* walk the tree and free everything */ 1521 spin_lock(&tree->lock); 1522 rbtree_postorder_for_each_entry_safe(entry, n, &tree->rbroot, rbnode) 1523 zswap_free_entry(entry); 1524 tree->rbroot = RB_ROOT; 1525 spin_unlock(&tree->lock); 1526 kfree(tree); 1527 zswap_trees[type] = NULL; 1528 } 1529 1530 /********************************* 1531 * debugfs functions 1532 **********************************/ 1533 #ifdef CONFIG_DEBUG_FS 1534 #include <linux/debugfs.h> 1535 1536 static struct dentry *zswap_debugfs_root; 1537 1538 static int zswap_debugfs_init(void) 1539 { 1540 if (!debugfs_initialized()) 1541 return -ENODEV; 1542 1543 zswap_debugfs_root = debugfs_create_dir("zswap", NULL); 1544 1545 debugfs_create_u64("pool_limit_hit", 0444, 1546 zswap_debugfs_root, &zswap_pool_limit_hit); 1547 debugfs_create_u64("reject_reclaim_fail", 0444, 1548 zswap_debugfs_root, &zswap_reject_reclaim_fail); 1549 debugfs_create_u64("reject_alloc_fail", 0444, 1550 zswap_debugfs_root, &zswap_reject_alloc_fail); 1551 debugfs_create_u64("reject_kmemcache_fail", 0444, 1552 zswap_debugfs_root, &zswap_reject_kmemcache_fail); 1553 debugfs_create_u64("reject_compress_poor", 0444, 1554 zswap_debugfs_root, &zswap_reject_compress_poor); 1555 debugfs_create_u64("written_back_pages", 0444, 1556 zswap_debugfs_root, &zswap_written_back_pages); 1557 debugfs_create_u64("duplicate_entry", 0444, 1558 
zswap_debugfs_root, &zswap_duplicate_entry); 1559 debugfs_create_u64("pool_total_size", 0444, 1560 zswap_debugfs_root, &zswap_pool_total_size); 1561 debugfs_create_atomic_t("stored_pages", 0444, 1562 zswap_debugfs_root, &zswap_stored_pages); 1563 debugfs_create_atomic_t("same_filled_pages", 0444, 1564 zswap_debugfs_root, &zswap_same_filled_pages); 1565 1566 return 0; 1567 } 1568 #else 1569 static int zswap_debugfs_init(void) 1570 { 1571 return 0; 1572 } 1573 #endif 1574 1575 /********************************* 1576 * module init and exit 1577 **********************************/ 1578 static int zswap_setup(void) 1579 { 1580 struct zswap_pool *pool; 1581 int ret; 1582 1583 zswap_entry_cache = KMEM_CACHE(zswap_entry, 0); 1584 if (!zswap_entry_cache) { 1585 pr_err("entry cache creation failed\n"); 1586 goto cache_fail; 1587 } 1588 1589 ret = cpuhp_setup_state(CPUHP_MM_ZSWP_MEM_PREPARE, "mm/zswap:prepare", 1590 zswap_dstmem_prepare, zswap_dstmem_dead); 1591 if (ret) { 1592 pr_err("dstmem alloc failed\n"); 1593 goto dstmem_fail; 1594 } 1595 1596 ret = cpuhp_setup_state_multi(CPUHP_MM_ZSWP_POOL_PREPARE, 1597 "mm/zswap_pool:prepare", 1598 zswap_cpu_comp_prepare, 1599 zswap_cpu_comp_dead); 1600 if (ret) 1601 goto hp_fail; 1602 1603 pool = __zswap_pool_create_fallback(); 1604 if (pool) { 1605 pr_info("loaded using pool %s/%s\n", pool->tfm_name, 1606 zpool_get_type(pool->zpools[0])); 1607 list_add(&pool->list, &zswap_pools); 1608 zswap_has_pool = true; 1609 } else { 1610 pr_err("pool creation failed\n"); 1611 zswap_enabled = false; 1612 } 1613 1614 shrink_wq = create_workqueue("zswap-shrink"); 1615 if (!shrink_wq) 1616 goto fallback_fail; 1617 1618 if (zswap_debugfs_init()) 1619 pr_warn("debugfs initialization failed\n"); 1620 zswap_init_state = ZSWAP_INIT_SUCCEED; 1621 return 0; 1622 1623 fallback_fail: 1624 if (pool) 1625 zswap_pool_destroy(pool); 1626 hp_fail: 1627 cpuhp_remove_state(CPUHP_MM_ZSWP_MEM_PREPARE); 1628 dstmem_fail: 1629 kmem_cache_destroy(zswap_entry_cache); 
1630 cache_fail: 1631 /* if built-in, we aren't unloaded on failure; don't allow use */ 1632 zswap_init_state = ZSWAP_INIT_FAILED; 1633 zswap_enabled = false; 1634 return -ENOMEM; 1635 } 1636 1637 static int __init zswap_init(void) 1638 { 1639 if (!zswap_enabled) 1640 return 0; 1641 return zswap_setup(); 1642 } 1643 /* must be late so crypto has time to come up */ 1644 late_initcall(zswap_init); 1645 1646 MODULE_AUTHOR("Seth Jennings <sjennings@variantweb.net>"); 1647 MODULE_DESCRIPTION("Compressed cache for swap pages"); 1648