// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * zswap.c - zswap driver file
 *
 * zswap is a backend for frontswap that takes pages that are in the process
 * of being swapped out and attempts to compress and store them in a
 * RAM-based memory pool.  This can result in a significant I/O reduction on
 * the swap device and, in the case where decompressing from RAM is faster
 * than reading from the swap device, can also improve workload performance.
 *
 * Copyright (C) 2012  Seth Jennings <sjenning@linux.vnet.ibm.com>
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/module.h>
#include <linux/cpu.h>
#include <linux/highmem.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/types.h>
#include <linux/atomic.h>
#include <linux/frontswap.h>
#include <linux/rbtree.h>
#include <linux/swap.h>
#include <linux/crypto.h>
#include <linux/scatterlist.h>
#include <linux/mempool.h>
#include <linux/zpool.h>
#include <crypto/acompress.h>

#include <linux/mm_types.h>
#include <linux/page-flags.h>
#include <linux/swapops.h>
#include <linux/writeback.h>
#include <linux/pagemap.h>
#include <linux/workqueue.h>

#include "swap.h"
#include "internal.h"

/*********************************
* statistics
**********************************/
/* Total bytes used by the compressed storage */
u64 zswap_pool_total_size;
/* The number of compressed pages currently stored in zswap */
atomic_t zswap_stored_pages = ATOMIC_INIT(0);
/* The number of same-value filled pages currently stored in zswap */
static atomic_t zswap_same_filled_pages = ATOMIC_INIT(0);

/*
 * The statistics below are not protected from concurrent access for
 * performance reasons so they may not be 100% accurate.  However,
 * they do provide useful information on roughly how many times a
 * certain event is occurring.
 */
/* Pool limit was hit (see zswap_max_pool_percent) */
static u64 zswap_pool_limit_hit;
/* Pages written back when pool limit was reached */
static u64 zswap_written_back_pages;
/* Store failed due to a reclaim failure after pool limit was reached */
static u64 zswap_reject_reclaim_fail;
/* Compressed page was too big for the allocator to (optimally) store */
static u64 zswap_reject_compress_poor;
/* Store failed because underlying allocator could not get memory */
static u64 zswap_reject_alloc_fail;
/* Store failed because the entry metadata could not be allocated (rare) */
static u64 zswap_reject_kmemcache_fail;
/* Duplicate store was encountered (rare) */
static u64 zswap_duplicate_entry;

/* Shrinker work queue */
static struct workqueue_struct *shrink_wq;
/* Pool limit was hit, we need to calm down */
static bool zswap_pool_reached_full;

/*********************************
* tunables
**********************************/

#define ZSWAP_PARAM_UNSET ""

static int zswap_setup(void);

/* Enable/disable zswap */
static bool zswap_enabled = IS_ENABLED(CONFIG_ZSWAP_DEFAULT_ON);
static int zswap_enabled_param_set(const char *,
                                   const struct kernel_param *);
static const struct kernel_param_ops zswap_enabled_param_ops = {
        .set = zswap_enabled_param_set,
        .get = param_get_bool,
};
module_param_cb(enabled, &zswap_enabled_param_ops, &zswap_enabled, 0644);

/* Crypto compressor to use */
static char *zswap_compressor = CONFIG_ZSWAP_COMPRESSOR_DEFAULT;
static int zswap_compressor_param_set(const char *,
                                      const struct kernel_param *);
static const struct kernel_param_ops zswap_compressor_param_ops = {
        .set = zswap_compressor_param_set,
        .get = param_get_charp,
        .free = param_free_charp,
};
module_param_cb(compressor, &zswap_compressor_param_ops,
                &zswap_compressor, 0644);

/* Compressed storage zpool to use */
static char *zswap_zpool_type = CONFIG_ZSWAP_ZPOOL_DEFAULT;
static int zswap_zpool_param_set(const char *, const struct kernel_param *);
static const struct kernel_param_ops zswap_zpool_param_ops = {
        .set = zswap_zpool_param_set,
        .get = param_get_charp,
        .free = param_free_charp,
};
module_param_cb(zpool, &zswap_zpool_param_ops, &zswap_zpool_type, 0644);

/* The maximum percentage of memory that the compressed pool can occupy */
static unsigned int zswap_max_pool_percent = 20;
module_param_named(max_pool_percent, zswap_max_pool_percent, uint, 0644);

/* The threshold for accepting new pages after the max_pool_percent was hit */
static unsigned int zswap_accept_thr_percent = 90; /* of max pool size */
module_param_named(accept_threshold_percent, zswap_accept_thr_percent,
                   uint, 0644);
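/*
 * Worked example for the two tunables above (illustrative numbers only):
 * on a machine with 8 GiB of RAM (about 2097152 4 KiB pages), the default
 * max_pool_percent of 20 makes zswap_is_full() trip once the compressed
 * pool exceeds roughly 1.6 GiB, and the default accept_threshold_percent
 * of 90 means zswap_can_accept() only allows new stores again once
 * writeback has shrunk the pool below roughly 1.44 GiB.  The parameters
 * can also be set at boot, e.g. zswap.enabled=1 zswap.max_pool_percent=25
 * on the kernel command line.
 */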
/*
 * Enable/disable handling same-value filled pages (enabled by default).
 * If disabled every page is considered non-same-value filled.
 */
static bool zswap_same_filled_pages_enabled = true;
module_param_named(same_filled_pages_enabled, zswap_same_filled_pages_enabled,
                   bool, 0644);

/* Enable/disable handling non-same-value filled pages (enabled by default) */
static bool zswap_non_same_filled_pages_enabled = true;
module_param_named(non_same_filled_pages_enabled, zswap_non_same_filled_pages_enabled,
                   bool, 0644);

static bool zswap_exclusive_loads_enabled = IS_ENABLED(
                CONFIG_ZSWAP_EXCLUSIVE_LOADS_DEFAULT_ON);
module_param_named(exclusive_loads, zswap_exclusive_loads_enabled, bool, 0644);

/*********************************
* data structures
**********************************/

struct crypto_acomp_ctx {
        struct crypto_acomp *acomp;
        struct acomp_req *req;
        struct crypto_wait wait;
        u8 *dstmem;
        struct mutex *mutex;
};

/*
 * The lock ordering is zswap_tree.lock -> zswap_pool.lru_lock.
 * The only case where lru_lock is not acquired while holding tree.lock is
 * when a zswap_entry is taken off the lru for writeback; in that case it
 * needs to be verified that it's still valid in the tree.
 */
struct zswap_pool {
        struct zpool *zpool;
        struct crypto_acomp_ctx __percpu *acomp_ctx;
        struct kref kref;
        struct list_head list;
        struct work_struct release_work;
        struct work_struct shrink_work;
        struct hlist_node node;
        char tfm_name[CRYPTO_MAX_ALG_NAME];
        struct list_head lru;
        spinlock_t lru_lock;
};
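/*
 * Pool lifetime sketch (a summary of the kref usage further down, not new
 * behaviour): the currently selected pool at the head of zswap_pools holds
 * one reference; stores, loads and writeback take short-lived references
 * through zswap_pool_get()/zswap_pool_put(); once the last reference is
 * dropped, __zswap_pool_empty() unlinks the pool and __zswap_pool_release()
 * destroys it after an RCU grace period.
 */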
/*
 * struct zswap_entry
 *
 * This structure contains the metadata for tracking a single compressed
 * page within zswap.
 *
 * rbnode - links the entry into red-black tree for the appropriate swap type
 * offset - the swap offset for the entry.  Index into the red-black tree.
 * refcount - the number of outstanding references to the entry. This is
 *            needed to protect against premature freeing of the entry by
 *            concurrent calls to load, invalidate, and writeback.  The lock
 *            for the zswap_tree structure that contains the entry must
 *            be held while changing the refcount.  Since the lock must
 *            be held, there is no reason to also make refcount atomic.
 * length - the length in bytes of the compressed page data.  Needed during
 *          decompression.  For a same-value filled page length is 0, and
 *          both pool and lru are invalid and must be ignored.
 * pool - the zswap_pool the entry's data is in
 * handle - zpool allocation handle that stores the compressed page data
 * value - value of the same-value filled pages which have the same content
 * lru - handle to the pool's lru used to evict pages.
 */
struct zswap_entry {
        struct rb_node rbnode;
        swp_entry_t swpentry;
        int refcount;
        unsigned int length;
        struct zswap_pool *pool;
        union {
                unsigned long handle;
                unsigned long value;
        };
        struct obj_cgroup *objcg;
        struct list_head lru;
};

/*
 * The tree lock in the zswap_tree struct protects a few things:
 * - the rbtree
 * - the refcount field of each entry in the tree
 */
struct zswap_tree {
        struct rb_root rbroot;
        spinlock_t lock;
};

static struct zswap_tree *zswap_trees[MAX_SWAPFILES];

/* RCU-protected iteration */
static LIST_HEAD(zswap_pools);
/* protects zswap_pools list modification */
static DEFINE_SPINLOCK(zswap_pools_lock);
/* pool counter to provide unique names to zpool */
static atomic_t zswap_pools_count = ATOMIC_INIT(0);

enum zswap_init_type {
        ZSWAP_UNINIT,
        ZSWAP_INIT_SUCCEED,
        ZSWAP_INIT_FAILED
};

static enum zswap_init_type zswap_init_state;

/* used to ensure the integrity of initialization */
static DEFINE_MUTEX(zswap_init_lock);

/* init completed, but couldn't create the initial pool */
static bool zswap_has_pool;

/*********************************
* helpers and fwd declarations
**********************************/

#define zswap_pool_debug(msg, p)                                \
        pr_debug("%s pool %s/%s\n", msg, (p)->tfm_name,         \
                 zpool_get_type((p)->zpool))

static int zswap_writeback_entry(struct zswap_entry *entry,
                                 struct zswap_tree *tree);
static int zswap_pool_get(struct zswap_pool *pool);
static void zswap_pool_put(struct zswap_pool *pool);

static bool zswap_is_full(void)
{
        return totalram_pages() * zswap_max_pool_percent / 100 <
                        DIV_ROUND_UP(zswap_pool_total_size, PAGE_SIZE);
}

static bool zswap_can_accept(void)
{
        return totalram_pages() * zswap_accept_thr_percent / 100 *
                                zswap_max_pool_percent / 100 >
                        DIV_ROUND_UP(zswap_pool_total_size, PAGE_SIZE);
}

static void zswap_update_total_size(void)
{
        struct zswap_pool *pool;
        u64 total = 0;

        rcu_read_lock();

        list_for_each_entry_rcu(pool, &zswap_pools, list)
                total += zpool_get_total_size(pool->zpool);

        rcu_read_unlock();

        zswap_pool_total_size = total;
}

/*********************************
* zswap entry functions
**********************************/
static struct kmem_cache *zswap_entry_cache;

static struct zswap_entry *zswap_entry_cache_alloc(gfp_t gfp)
{
        struct zswap_entry *entry;
        entry = kmem_cache_alloc(zswap_entry_cache, gfp);
        if (!entry)
                return NULL;
        entry->refcount = 1;
        RB_CLEAR_NODE(&entry->rbnode);
        return entry;
}

static void zswap_entry_cache_free(struct zswap_entry *entry)
{
        kmem_cache_free(zswap_entry_cache, entry);
}

/*********************************
* rbtree functions
**********************************/
static struct zswap_entry *zswap_rb_search(struct rb_root *root, pgoff_t offset)
{
        struct rb_node *node = root->rb_node;
        struct zswap_entry *entry;
        pgoff_t entry_offset;

        while (node) {
                entry = rb_entry(node, struct zswap_entry, rbnode);
                entry_offset = swp_offset(entry->swpentry);
                if (entry_offset > offset)
                        node = node->rb_left;
                else if (entry_offset < offset)
                        node = node->rb_right;
                else
                        return entry;
        }
        return NULL;
}
/*
 * In the case that an entry with the same offset is found, a pointer to
 * the existing entry is stored in dupentry and the function returns -EEXIST
 */
static int zswap_rb_insert(struct rb_root *root, struct zswap_entry *entry,
                           struct zswap_entry **dupentry)
{
        struct rb_node **link = &root->rb_node, *parent = NULL;
        struct zswap_entry *myentry;
        pgoff_t myentry_offset, entry_offset = swp_offset(entry->swpentry);

        while (*link) {
                parent = *link;
                myentry = rb_entry(parent, struct zswap_entry, rbnode);
                myentry_offset = swp_offset(myentry->swpentry);
                if (myentry_offset > entry_offset)
                        link = &(*link)->rb_left;
                else if (myentry_offset < entry_offset)
                        link = &(*link)->rb_right;
                else {
                        *dupentry = myentry;
                        return -EEXIST;
                }
        }
        rb_link_node(&entry->rbnode, parent, link);
        rb_insert_color(&entry->rbnode, root);
        return 0;
}

static void zswap_rb_erase(struct rb_root *root, struct zswap_entry *entry)
{
        if (!RB_EMPTY_NODE(&entry->rbnode)) {
                rb_erase(&entry->rbnode, root);
                RB_CLEAR_NODE(&entry->rbnode);
        }
}

/*
 * Carries out the common pattern of freeing an entry's zpool allocation,
 * freeing the entry itself, and decrementing the number of stored pages.
 */
static void zswap_free_entry(struct zswap_entry *entry)
{
        if (entry->objcg) {
                obj_cgroup_uncharge_zswap(entry->objcg, entry->length);
                obj_cgroup_put(entry->objcg);
        }
        if (!entry->length)
                atomic_dec(&zswap_same_filled_pages);
        else {
                spin_lock(&entry->pool->lru_lock);
                list_del(&entry->lru);
                spin_unlock(&entry->pool->lru_lock);
                zpool_free(entry->pool->zpool, entry->handle);
                zswap_pool_put(entry->pool);
        }
        zswap_entry_cache_free(entry);
        atomic_dec(&zswap_stored_pages);
        zswap_update_total_size();
}

/* caller must hold the tree lock */
static void zswap_entry_get(struct zswap_entry *entry)
{
        entry->refcount++;
}

/* caller must hold the tree lock
 * remove from the tree and free it, if nobody references the entry
 */
static void zswap_entry_put(struct zswap_tree *tree,
                            struct zswap_entry *entry)
{
        int refcount = --entry->refcount;

        BUG_ON(refcount < 0);
        if (refcount == 0) {
                zswap_rb_erase(&tree->rbroot, entry);
                zswap_free_entry(entry);
        }
}

/* caller must hold the tree lock */
static struct zswap_entry *zswap_entry_find_get(struct rb_root *root,
                                                pgoff_t offset)
{
        struct zswap_entry *entry;

        entry = zswap_rb_search(root, offset);
        if (entry)
                zswap_entry_get(entry);

        return entry;
}
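/*
 * Reference-count sketch (a restatement of the helpers above, not new
 * rules): a stored entry starts with refcount 1 held on behalf of the
 * rbtree; load and writeback take an extra reference with
 * zswap_entry_get() under the tree lock and drop it with
 * zswap_entry_put(); the entry is only freed once the tree reference has
 * also been dropped, e.g. via zswap_invalidate_entry().
 */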
/*********************************
* per-cpu code
**********************************/
static DEFINE_PER_CPU(u8 *, zswap_dstmem);
/*
 * If users dynamically change the zpool type and compressor at runtime, i.e.
 * while zswap is running, zswap can have more than one zpool on one cpu, but
 * they are sharing dstmem.  So we need this mutex to be per-cpu.
 */
static DEFINE_PER_CPU(struct mutex *, zswap_mutex);

static int zswap_dstmem_prepare(unsigned int cpu)
{
        struct mutex *mutex;
        u8 *dst;

        dst = kmalloc_node(PAGE_SIZE * 2, GFP_KERNEL, cpu_to_node(cpu));
        if (!dst)
                return -ENOMEM;

        mutex = kmalloc_node(sizeof(*mutex), GFP_KERNEL, cpu_to_node(cpu));
        if (!mutex) {
                kfree(dst);
                return -ENOMEM;
        }

        mutex_init(mutex);
        per_cpu(zswap_dstmem, cpu) = dst;
        per_cpu(zswap_mutex, cpu) = mutex;
        return 0;
}

static int zswap_dstmem_dead(unsigned int cpu)
{
        struct mutex *mutex;
        u8 *dst;

        mutex = per_cpu(zswap_mutex, cpu);
        kfree(mutex);
        per_cpu(zswap_mutex, cpu) = NULL;

        dst = per_cpu(zswap_dstmem, cpu);
        kfree(dst);
        per_cpu(zswap_dstmem, cpu) = NULL;

        return 0;
}
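/*
 * Note (added for clarity): the two callbacks above are registered through
 * cpuhp_setup_state(CPUHP_MM_ZSWP_MEM_PREPARE, ...) in zswap_setup(), so
 * every CPU that comes online gets its own two-page dstmem scratch buffer
 * and mutex before compression can run on it, and both are freed again
 * when the CPU goes away.
 */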
static int zswap_cpu_comp_prepare(unsigned int cpu, struct hlist_node *node)
{
        struct zswap_pool *pool = hlist_entry(node, struct zswap_pool, node);
        struct crypto_acomp_ctx *acomp_ctx = per_cpu_ptr(pool->acomp_ctx, cpu);
        struct crypto_acomp *acomp;
        struct acomp_req *req;

        acomp = crypto_alloc_acomp_node(pool->tfm_name, 0, 0, cpu_to_node(cpu));
        if (IS_ERR(acomp)) {
                pr_err("could not alloc crypto acomp %s : %ld\n",
                       pool->tfm_name, PTR_ERR(acomp));
                return PTR_ERR(acomp);
        }
        acomp_ctx->acomp = acomp;

        req = acomp_request_alloc(acomp_ctx->acomp);
        if (!req) {
                pr_err("could not alloc crypto acomp_request %s\n",
                       pool->tfm_name);
                crypto_free_acomp(acomp_ctx->acomp);
                return -ENOMEM;
        }
        acomp_ctx->req = req;

        crypto_init_wait(&acomp_ctx->wait);
        /*
         * if the backend of acomp is async zip, crypto_req_done() will wake
         * up crypto_wait_req(); if the backend of acomp is scomp, the
         * callback won't be called, and crypto_wait_req() will return
         * without blocking.
         */
        acomp_request_set_callback(req, CRYPTO_TFM_REQ_MAY_BACKLOG,
                                   crypto_req_done, &acomp_ctx->wait);

        acomp_ctx->mutex = per_cpu(zswap_mutex, cpu);
        acomp_ctx->dstmem = per_cpu(zswap_dstmem, cpu);

        return 0;
}

static int zswap_cpu_comp_dead(unsigned int cpu, struct hlist_node *node)
{
        struct zswap_pool *pool = hlist_entry(node, struct zswap_pool, node);
        struct crypto_acomp_ctx *acomp_ctx = per_cpu_ptr(pool->acomp_ctx, cpu);

        if (!IS_ERR_OR_NULL(acomp_ctx)) {
                if (!IS_ERR_OR_NULL(acomp_ctx->req))
                        acomp_request_free(acomp_ctx->req);
                if (!IS_ERR_OR_NULL(acomp_ctx->acomp))
                        crypto_free_acomp(acomp_ctx->acomp);
        }

        return 0;
}

/*********************************
* pool functions
**********************************/

static struct zswap_pool *__zswap_pool_current(void)
{
        struct zswap_pool *pool;

        pool = list_first_or_null_rcu(&zswap_pools, typeof(*pool), list);
        WARN_ONCE(!pool && zswap_has_pool,
                  "%s: no page storage pool!\n", __func__);

        return pool;
}

static struct zswap_pool *zswap_pool_current(void)
{
        assert_spin_locked(&zswap_pools_lock);

        return __zswap_pool_current();
}

static struct zswap_pool *zswap_pool_current_get(void)
{
        struct zswap_pool *pool;

        rcu_read_lock();

        pool = __zswap_pool_current();
        if (!zswap_pool_get(pool))
                pool = NULL;

        rcu_read_unlock();

        return pool;
}

static struct zswap_pool *zswap_pool_last_get(void)
{
        struct zswap_pool *pool, *last = NULL;

        rcu_read_lock();

        list_for_each_entry_rcu(pool, &zswap_pools, list)
                last = pool;
        WARN_ONCE(!last && zswap_has_pool,
                  "%s: no page storage pool!\n", __func__);
        if (!zswap_pool_get(last))
                last = NULL;

        rcu_read_unlock();

        return last;
}

/* type and compressor must be null-terminated */
static struct zswap_pool *zswap_pool_find_get(char *type, char *compressor)
{
        struct zswap_pool *pool;

        assert_spin_locked(&zswap_pools_lock);

        list_for_each_entry_rcu(pool, &zswap_pools, list) {
                if (strcmp(pool->tfm_name, compressor))
                        continue;
                if (strcmp(zpool_get_type(pool->zpool), type))
                        continue;
                /* if we can't get it, it's about to be destroyed */
                if (!zswap_pool_get(pool))
                        continue;
                return pool;
        }

        return NULL;
}

static void zswap_invalidate_entry(struct zswap_tree *tree,
                                   struct zswap_entry *entry)
{
        /* remove from rbtree */
        zswap_rb_erase(&tree->rbroot, entry);

        /* drop the initial reference from entry creation */
        zswap_entry_put(tree, entry);
}
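/*
 * Reclaim path overview (a summary of the code below, not new behaviour):
 * when the pool is full, shrink_worker() repeatedly calls
 * zswap_reclaim_entry(), which takes the oldest entry off the pool LRU,
 * revalidates it under the tree lock, and hands it to
 * zswap_writeback_entry() to be decompressed into the swapcache and
 * written out to the backing swap device.
 */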
static int zswap_reclaim_entry(struct zswap_pool *pool)
{
        struct zswap_entry *entry;
        struct zswap_tree *tree;
        pgoff_t swpoffset;
        int ret;

        /* Get an entry off the LRU */
        spin_lock(&pool->lru_lock);
        if (list_empty(&pool->lru)) {
                spin_unlock(&pool->lru_lock);
                return -EINVAL;
        }
        entry = list_last_entry(&pool->lru, struct zswap_entry, lru);
        list_del_init(&entry->lru);
        /*
         * Once the lru lock is dropped, the entry might get freed. The
         * swpoffset is copied to the stack, and entry isn't deref'd again
         * until the entry is verified to still be alive in the tree.
         */
        swpoffset = swp_offset(entry->swpentry);
        tree = zswap_trees[swp_type(entry->swpentry)];
        spin_unlock(&pool->lru_lock);

        /* Check for invalidate() race */
        spin_lock(&tree->lock);
        if (entry != zswap_rb_search(&tree->rbroot, swpoffset)) {
                ret = -EAGAIN;
                goto unlock;
        }
        /* Hold a reference to prevent a free during writeback */
        zswap_entry_get(entry);
        spin_unlock(&tree->lock);

        ret = zswap_writeback_entry(entry, tree);

        spin_lock(&tree->lock);
        if (ret) {
                /* Writeback failed, put entry back on LRU */
                spin_lock(&pool->lru_lock);
                list_move(&entry->lru, &pool->lru);
                spin_unlock(&pool->lru_lock);
                goto put_unlock;
        }

        /*
         * Writeback started successfully, the page now belongs to the
         * swapcache. Drop the entry from zswap - unless invalidate already
         * took it out while we had the tree->lock released for IO.
         */
        if (entry == zswap_rb_search(&tree->rbroot, swpoffset))
                zswap_invalidate_entry(tree, entry);

put_unlock:
        /* Drop local reference */
        zswap_entry_put(tree, entry);
unlock:
        spin_unlock(&tree->lock);
        return ret ? -EAGAIN : 0;
}

static void shrink_worker(struct work_struct *w)
{
        struct zswap_pool *pool = container_of(w, typeof(*pool),
                                               shrink_work);
        int ret, failures = 0;

        do {
                ret = zswap_reclaim_entry(pool);
                if (ret) {
                        zswap_reject_reclaim_fail++;
                        if (ret != -EAGAIN)
                                break;
                        if (++failures == MAX_RECLAIM_RETRIES)
                                break;
                }
                cond_resched();
        } while (!zswap_can_accept());
        zswap_pool_put(pool);
}
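/*
 * Illustration (assumes the common Kconfig defaults of "zbud" and "lzo";
 * other configurations behave analogously): the fallback path further down
 * ends up calling zswap_pool_create("zbud", "lzo"), which creates a zpool
 * named "zswap1", allocates the per-CPU acomp contexts and attaches the
 * pool to the CPU hotplug state before the caller makes it current.
 */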
static struct zswap_pool *zswap_pool_create(char *type, char *compressor)
{
        struct zswap_pool *pool;
        char name[38]; /* 'zswap' + 32 char (max) num + \0 */
        gfp_t gfp = __GFP_NORETRY | __GFP_NOWARN | __GFP_KSWAPD_RECLAIM;
        int ret;

        if (!zswap_has_pool) {
                /* if either is unset, pool initialization failed, and we
                 * need both params to be set correctly before trying to
                 * create a pool.
                 */
                if (!strcmp(type, ZSWAP_PARAM_UNSET))
                        return NULL;
                if (!strcmp(compressor, ZSWAP_PARAM_UNSET))
                        return NULL;
        }

        pool = kzalloc(sizeof(*pool), GFP_KERNEL);
        if (!pool)
                return NULL;

        /* unique name for each pool specifically required by zsmalloc */
        snprintf(name, 38, "zswap%x", atomic_inc_return(&zswap_pools_count));

        pool->zpool = zpool_create_pool(type, name, gfp);
        if (!pool->zpool) {
                pr_err("%s zpool not available\n", type);
                goto error;
        }
        pr_debug("using %s zpool\n", zpool_get_type(pool->zpool));

        strscpy(pool->tfm_name, compressor, sizeof(pool->tfm_name));

        pool->acomp_ctx = alloc_percpu(*pool->acomp_ctx);
        if (!pool->acomp_ctx) {
                pr_err("percpu alloc failed\n");
                goto error;
        }

        ret = cpuhp_state_add_instance(CPUHP_MM_ZSWP_POOL_PREPARE,
                                       &pool->node);
        if (ret)
                goto error;
        pr_debug("using %s compressor\n", pool->tfm_name);

        /* being the current pool takes 1 ref; this func expects the
         * caller to always add the new pool as the current pool
         */
        kref_init(&pool->kref);
        INIT_LIST_HEAD(&pool->list);
        INIT_LIST_HEAD(&pool->lru);
        spin_lock_init(&pool->lru_lock);
        INIT_WORK(&pool->shrink_work, shrink_worker);

        zswap_pool_debug("created", pool);

        return pool;

error:
        if (pool->acomp_ctx)
                free_percpu(pool->acomp_ctx);
        if (pool->zpool)
                zpool_destroy_pool(pool->zpool);
        kfree(pool);
        return NULL;
}

static struct zswap_pool *__zswap_pool_create_fallback(void)
{
        bool has_comp, has_zpool;

        has_comp = crypto_has_acomp(zswap_compressor, 0, 0);
        if (!has_comp && strcmp(zswap_compressor,
                                CONFIG_ZSWAP_COMPRESSOR_DEFAULT)) {
                pr_err("compressor %s not available, using default %s\n",
                       zswap_compressor, CONFIG_ZSWAP_COMPRESSOR_DEFAULT);
                param_free_charp(&zswap_compressor);
                zswap_compressor = CONFIG_ZSWAP_COMPRESSOR_DEFAULT;
                has_comp = crypto_has_acomp(zswap_compressor, 0, 0);
        }
        if (!has_comp) {
                pr_err("default compressor %s not available\n",
                       zswap_compressor);
                param_free_charp(&zswap_compressor);
                zswap_compressor = ZSWAP_PARAM_UNSET;
        }

        has_zpool = zpool_has_pool(zswap_zpool_type);
        if (!has_zpool && strcmp(zswap_zpool_type,
                                 CONFIG_ZSWAP_ZPOOL_DEFAULT)) {
                pr_err("zpool %s not available, using default %s\n",
                       zswap_zpool_type, CONFIG_ZSWAP_ZPOOL_DEFAULT);
                param_free_charp(&zswap_zpool_type);
                zswap_zpool_type = CONFIG_ZSWAP_ZPOOL_DEFAULT;
                has_zpool = zpool_has_pool(zswap_zpool_type);
        }
        if (!has_zpool) {
                pr_err("default zpool %s not available\n",
                       zswap_zpool_type);
                param_free_charp(&zswap_zpool_type);
                zswap_zpool_type = ZSWAP_PARAM_UNSET;
        }

        if (!has_comp || !has_zpool)
                return NULL;

        return zswap_pool_create(zswap_zpool_type, zswap_compressor);
}

static void zswap_pool_destroy(struct zswap_pool *pool)
{
        zswap_pool_debug("destroying", pool);

        cpuhp_state_remove_instance(CPUHP_MM_ZSWP_POOL_PREPARE, &pool->node);
        free_percpu(pool->acomp_ctx);
        zpool_destroy_pool(pool->zpool);
        kfree(pool);
}

static int __must_check zswap_pool_get(struct zswap_pool *pool)
{
        if (!pool)
                return 0;

        return kref_get_unless_zero(&pool->kref);
}
static void __zswap_pool_release(struct work_struct *work)
{
        struct zswap_pool *pool = container_of(work, typeof(*pool),
                                               release_work);

        synchronize_rcu();

        /* nobody should have been able to get a kref... */
        WARN_ON(kref_get_unless_zero(&pool->kref));

        /* pool is now off zswap_pools list and has no references. */
        zswap_pool_destroy(pool);
}

static void __zswap_pool_empty(struct kref *kref)
{
        struct zswap_pool *pool;

        pool = container_of(kref, typeof(*pool), kref);

        spin_lock(&zswap_pools_lock);

        WARN_ON(pool == zswap_pool_current());

        list_del_rcu(&pool->list);

        INIT_WORK(&pool->release_work, __zswap_pool_release);
        schedule_work(&pool->release_work);

        spin_unlock(&zswap_pools_lock);
}

static void zswap_pool_put(struct zswap_pool *pool)
{
        kref_put(&pool->kref, __zswap_pool_empty);
}

/*********************************
* param callbacks
**********************************/

static bool zswap_pool_changed(const char *s, const struct kernel_param *kp)
{
        /* no change required */
        if (!strcmp(s, *(char **)kp->arg) && zswap_has_pool)
                return false;
        return true;
}
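/*
 * Usage example (illustrative): writing "zsmalloc" to
 * /sys/module/zswap/parameters/zpool lands in zswap_zpool_param_set(),
 * which calls __zswap_param_set() below with the current compressor.  A
 * matching pool is reused or a new one is created, it is made current at
 * the head of zswap_pools, and the previous current pool is put so it can
 * be destroyed once its stored entries are gone.
 */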
/* val must be a null-terminated string */
static int __zswap_param_set(const char *val, const struct kernel_param *kp,
                             char *type, char *compressor)
{
        struct zswap_pool *pool, *put_pool = NULL;
        char *s = strstrip((char *)val);
        int ret = 0;
        bool new_pool = false;

        mutex_lock(&zswap_init_lock);
        switch (zswap_init_state) {
        case ZSWAP_UNINIT:
                /* if this is load-time (pre-init) param setting,
                 * don't create a pool; that's done during init.
                 */
                ret = param_set_charp(s, kp);
                break;
        case ZSWAP_INIT_SUCCEED:
                new_pool = zswap_pool_changed(s, kp);
                break;
        case ZSWAP_INIT_FAILED:
                pr_err("can't set param, initialization failed\n");
                ret = -ENODEV;
        }
        mutex_unlock(&zswap_init_lock);

        /* no need to create a new pool, return directly */
        if (!new_pool)
                return ret;

        if (!type) {
                if (!zpool_has_pool(s)) {
                        pr_err("zpool %s not available\n", s);
                        return -ENOENT;
                }
                type = s;
        } else if (!compressor) {
                if (!crypto_has_acomp(s, 0, 0)) {
                        pr_err("compressor %s not available\n", s);
                        return -ENOENT;
                }
                compressor = s;
        } else {
                WARN_ON(1);
                return -EINVAL;
        }

        spin_lock(&zswap_pools_lock);

        pool = zswap_pool_find_get(type, compressor);
        if (pool) {
                zswap_pool_debug("using existing", pool);
                WARN_ON(pool == zswap_pool_current());
                list_del_rcu(&pool->list);
        }

        spin_unlock(&zswap_pools_lock);

        if (!pool)
                pool = zswap_pool_create(type, compressor);

        if (pool)
                ret = param_set_charp(s, kp);
        else
                ret = -EINVAL;

        spin_lock(&zswap_pools_lock);

        if (!ret) {
                put_pool = zswap_pool_current();
                list_add_rcu(&pool->list, &zswap_pools);
                zswap_has_pool = true;
        } else if (pool) {
                /* add the possibly pre-existing pool to the end of the pools
                 * list; if it's new (and empty) then it'll be removed and
                 * destroyed by the put after we drop the lock
                 */
                list_add_tail_rcu(&pool->list, &zswap_pools);
                put_pool = pool;
        }

        spin_unlock(&zswap_pools_lock);

        if (!zswap_has_pool && !pool) {
                /* if initial pool creation failed, and this pool creation also
                 * failed, maybe both compressor and zpool params were bad.
                 * Allow changing this param, so pool creation will succeed
                 * when the other param is changed. We already verified this
                 * param is ok in the zpool_has_pool() or crypto_has_acomp()
                 * checks above.
                 */
                ret = param_set_charp(s, kp);
        }

        /* drop the ref from either the old current pool,
         * or the new pool we failed to add
         */
        if (put_pool)
                zswap_pool_put(put_pool);

        return ret;
}

static int zswap_compressor_param_set(const char *val,
                                      const struct kernel_param *kp)
{
        return __zswap_param_set(val, kp, zswap_zpool_type, NULL);
}

static int zswap_zpool_param_set(const char *val,
                                 const struct kernel_param *kp)
{
        return __zswap_param_set(val, kp, NULL, zswap_compressor);
}

static int zswap_enabled_param_set(const char *val,
                                   const struct kernel_param *kp)
{
        int ret = -ENODEV;

        /* if this is load-time (pre-init) param setting, only set param. */
        if (system_state != SYSTEM_RUNNING)
                return param_set_bool(val, kp);

        mutex_lock(&zswap_init_lock);
        switch (zswap_init_state) {
        case ZSWAP_UNINIT:
                if (zswap_setup())
                        break;
                fallthrough;
        case ZSWAP_INIT_SUCCEED:
                if (!zswap_has_pool)
                        pr_err("can't enable, no pool configured\n");
                else
                        ret = param_set_bool(val, kp);
                break;
        case ZSWAP_INIT_FAILED:
                pr_err("can't enable, initialization failed\n");
        }
        mutex_unlock(&zswap_init_lock);

        return ret;
}

/*********************************
* writeback code
**********************************/
/* return enum for zswap_get_swap_cache_page */
enum zswap_get_swap_ret {
        ZSWAP_SWAPCACHE_NEW,
        ZSWAP_SWAPCACHE_EXIST,
        ZSWAP_SWAPCACHE_FAIL,
};

/*
 * zswap_get_swap_cache_page
 *
 * This is an adaptation of read_swap_cache_async()
 *
 * This function tries to find a page with the given swap entry
 * in the swapper_space address space (the swap cache).  If the page
 * is found, it is returned in retpage.  Otherwise, a page is allocated,
 * added to the swap cache, and returned in retpage.
 *
 * On success, the swap cache page is returned in retpage.
 * Returns ZSWAP_SWAPCACHE_EXIST if page was already in the swap cache
 * Returns ZSWAP_SWAPCACHE_NEW if the new page needs to be populated,
 *     the new page is added to swapcache and locked
 * Returns ZSWAP_SWAPCACHE_FAIL on error
 */
static int zswap_get_swap_cache_page(swp_entry_t entry,
                                     struct page **retpage)
{
        bool page_was_allocated;

        *retpage = __read_swap_cache_async(entry, GFP_KERNEL,
                        NULL, 0, &page_was_allocated);
        if (page_was_allocated)
                return ZSWAP_SWAPCACHE_NEW;
        if (!*retpage)
                return ZSWAP_SWAPCACHE_FAIL;
        return ZSWAP_SWAPCACHE_EXIST;
}
/*
 * Attempts to free an entry by adding a page to the swap cache,
 * decompressing the entry data into the page, and issuing a
 * bio write to write the page back to the swap device.
 *
 * This can be thought of as a "resumed writeback" of the page
 * to the swap device.  We are basically resuming the same swap
 * writeback path that was intercepted with the frontswap_store()
 * in the first place.  After the page has been decompressed into
 * the swap cache, the compressed version stored by zswap can be
 * freed.
 */
static int zswap_writeback_entry(struct zswap_entry *entry,
                                 struct zswap_tree *tree)
{
        swp_entry_t swpentry = entry->swpentry;
        struct page *page;
        struct scatterlist input, output;
        struct crypto_acomp_ctx *acomp_ctx;
        struct zpool *pool = entry->pool->zpool;

        u8 *src, *tmp = NULL;
        unsigned int dlen;
        int ret;
        struct writeback_control wbc = {
                .sync_mode = WB_SYNC_NONE,
        };

        if (!zpool_can_sleep_mapped(pool)) {
                tmp = kmalloc(PAGE_SIZE, GFP_KERNEL);
                if (!tmp)
                        return -ENOMEM;
        }

        /* try to allocate swap cache page */
        switch (zswap_get_swap_cache_page(swpentry, &page)) {
        case ZSWAP_SWAPCACHE_FAIL: /* no memory or invalidate happened */
                ret = -ENOMEM;
                goto fail;

        case ZSWAP_SWAPCACHE_EXIST:
                /* page is already in the swap cache, ignore for now */
                put_page(page);
                ret = -EEXIST;
                goto fail;

        case ZSWAP_SWAPCACHE_NEW: /* page is locked */
                /*
                 * Having a local reference to the zswap entry doesn't exclude
                 * swapping from invalidating and recycling the swap slot. Once
                 * the swapcache is secured against concurrent swapping to and
                 * from the slot, recheck that the entry is still current before
                 * writing.
                 */
                spin_lock(&tree->lock);
                if (zswap_rb_search(&tree->rbroot, swp_offset(entry->swpentry)) != entry) {
                        spin_unlock(&tree->lock);
                        delete_from_swap_cache(page_folio(page));
                        ret = -ENOMEM;
                        goto fail;
                }
                spin_unlock(&tree->lock);

                /* decompress */
                acomp_ctx = raw_cpu_ptr(entry->pool->acomp_ctx);
                dlen = PAGE_SIZE;

                src = zpool_map_handle(pool, entry->handle, ZPOOL_MM_RO);
                if (!zpool_can_sleep_mapped(pool)) {
                        memcpy(tmp, src, entry->length);
                        src = tmp;
                        zpool_unmap_handle(pool, entry->handle);
                }

                mutex_lock(acomp_ctx->mutex);
                sg_init_one(&input, src, entry->length);
                sg_init_table(&output, 1);
                sg_set_page(&output, page, PAGE_SIZE, 0);
                acomp_request_set_params(acomp_ctx->req, &input, &output, entry->length, dlen);
                ret = crypto_wait_req(crypto_acomp_decompress(acomp_ctx->req), &acomp_ctx->wait);
                dlen = acomp_ctx->req->dlen;
                mutex_unlock(acomp_ctx->mutex);

                if (!zpool_can_sleep_mapped(pool))
                        kfree(tmp);
                else
                        zpool_unmap_handle(pool, entry->handle);

                BUG_ON(ret);
                BUG_ON(dlen != PAGE_SIZE);

                /* page is up to date */
                SetPageUptodate(page);
        }

        /* move it to the tail of the inactive list after end_writeback */
        SetPageReclaim(page);

        /* start writeback */
        __swap_writepage(page, &wbc);
        put_page(page);
        zswap_written_back_pages++;

        return ret;
fail:
        if (!zpool_can_sleep_mapped(pool))
                kfree(tmp);

        /*
         * If we get here due to ZSWAP_SWAPCACHE_EXIST, a load may be
         * happening concurrently.  It is safe and okay to not free the
         * entry.  It is also okay to return !0.
         */
        return ret;
}

static int zswap_is_page_same_filled(void *ptr, unsigned long *value)
{
        unsigned long *page;
        unsigned long val;
        unsigned int pos, last_pos = PAGE_SIZE / sizeof(*page) - 1;

        page = (unsigned long *)ptr;
        val = page[0];

        if (val != page[last_pos])
                return 0;

        for (pos = 1; pos < last_pos; pos++) {
                if (val != page[pos])
                        return 0;
        }

        *value = val;

        return 1;
}

static void zswap_fill_page(void *ptr, unsigned long value)
{
        unsigned long *page;

        page = (unsigned long *)ptr;
        memset_l(page, value, PAGE_SIZE / sizeof(unsigned long));
}
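/*
 * Example (illustrative): a page that is entirely zero, or filled with one
 * repeating unsigned long pattern such as 0xdeadbeefdeadbeef on 64-bit, is
 * caught by zswap_is_page_same_filled() above; only the pattern is kept in
 * entry->value, no zpool memory is allocated, and zswap_fill_page()
 * recreates the page with memset_l() on load.
 *
 * Store path overview (a summary of zswap_frontswap_store() below, not new
 * behaviour): reject THP and over-limit conditions, detect same-filled
 * pages, otherwise compress the page into the per-CPU dstmem buffer, copy
 * the result into a zpool allocation, then insert the entry into the
 * per-type rbtree and onto the pool LRU.
 */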
/*********************************
* frontswap hooks
**********************************/
/* attempts to compress and store a single page */
static int zswap_frontswap_store(unsigned type, pgoff_t offset,
                                 struct page *page)
{
        struct zswap_tree *tree = zswap_trees[type];
        struct zswap_entry *entry, *dupentry;
        struct scatterlist input, output;
        struct crypto_acomp_ctx *acomp_ctx;
        struct obj_cgroup *objcg = NULL;
        struct zswap_pool *pool;
        int ret;
        unsigned int dlen = PAGE_SIZE;
        unsigned long handle, value;
        char *buf;
        u8 *src, *dst;
        gfp_t gfp;

        /* THP isn't supported */
        if (PageTransHuge(page)) {
                ret = -EINVAL;
                goto reject;
        }

        if (!zswap_enabled || !tree) {
                ret = -ENODEV;
                goto reject;
        }

        /*
         * XXX: zswap reclaim does not work with cgroups yet. Without a
         * cgroup-aware entry LRU, we will push out entries system-wide based on
         * local cgroup limits.
         */
        objcg = get_obj_cgroup_from_page(page);
        if (objcg && !obj_cgroup_may_zswap(objcg)) {
                ret = -ENOMEM;
                goto reject;
        }

        /* reclaim space if needed */
        if (zswap_is_full()) {
                zswap_pool_limit_hit++;
                zswap_pool_reached_full = true;
                goto shrink;
        }

        if (zswap_pool_reached_full) {
                if (!zswap_can_accept()) {
                        ret = -ENOMEM;
                        goto shrink;
                } else
                        zswap_pool_reached_full = false;
        }

        /* allocate entry */
        entry = zswap_entry_cache_alloc(GFP_KERNEL);
        if (!entry) {
                zswap_reject_kmemcache_fail++;
                ret = -ENOMEM;
                goto reject;
        }

        if (zswap_same_filled_pages_enabled) {
                src = kmap_atomic(page);
                if (zswap_is_page_same_filled(src, &value)) {
                        kunmap_atomic(src);
                        entry->swpentry = swp_entry(type, offset);
                        entry->length = 0;
                        entry->value = value;
                        atomic_inc(&zswap_same_filled_pages);
                        goto insert_entry;
                }
                kunmap_atomic(src);
        }

        if (!zswap_non_same_filled_pages_enabled) {
                ret = -EINVAL;
                goto freepage;
        }

        /* if entry is successfully added, it keeps the reference */
        entry->pool = zswap_pool_current_get();
        if (!entry->pool) {
                ret = -EINVAL;
                goto freepage;
        }

        /* compress */
        acomp_ctx = raw_cpu_ptr(entry->pool->acomp_ctx);

        mutex_lock(acomp_ctx->mutex);

        dst = acomp_ctx->dstmem;
        sg_init_table(&input, 1);
        sg_set_page(&input, page, PAGE_SIZE, 0);

        /* zswap_dstmem is of size (PAGE_SIZE * 2). Reflect the same in the sg_list */
        sg_init_one(&output, dst, PAGE_SIZE * 2);
        acomp_request_set_params(acomp_ctx->req, &input, &output, PAGE_SIZE, dlen);
        /*
         * It may look a little silly that we send an asynchronous request and
         * then wait for its completion synchronously; this makes the process
         * effectively synchronous.
         * Theoretically, acomp supports submitting multiple requests on one
         * acomp instance and having them completed concurrently.  But in this
         * case frontswap actually stores and loads page by page; there is no
         * existing way for one thread doing frontswap to send a second page
         * before the first page is done.
         * Different threads running on different CPUs use different acomp
         * instances, though, so multiple threads can do (de)compression in
         * parallel.
         */
        ret = crypto_wait_req(crypto_acomp_compress(acomp_ctx->req), &acomp_ctx->wait);
        dlen = acomp_ctx->req->dlen;

        if (ret) {
                ret = -EINVAL;
                goto put_dstmem;
        }

        /* store */
        gfp = __GFP_NORETRY | __GFP_NOWARN | __GFP_KSWAPD_RECLAIM;
        if (zpool_malloc_support_movable(entry->pool->zpool))
                gfp |= __GFP_HIGHMEM | __GFP_MOVABLE;
        ret = zpool_malloc(entry->pool->zpool, dlen, gfp, &handle);
        if (ret == -ENOSPC) {
                zswap_reject_compress_poor++;
                goto put_dstmem;
        }
        if (ret) {
                zswap_reject_alloc_fail++;
                goto put_dstmem;
        }
        buf = zpool_map_handle(entry->pool->zpool, handle, ZPOOL_MM_WO);
        memcpy(buf, dst, dlen);
        zpool_unmap_handle(entry->pool->zpool, handle);
        mutex_unlock(acomp_ctx->mutex);

        /* populate entry */
        entry->swpentry = swp_entry(type, offset);
        entry->handle = handle;
        entry->length = dlen;

insert_entry:
        entry->objcg = objcg;
        if (objcg) {
                obj_cgroup_charge_zswap(objcg, entry->length);
                /* Account before objcg ref is moved to tree */
                count_objcg_event(objcg, ZSWPOUT);
        }

        /* map */
        spin_lock(&tree->lock);
        do {
                ret = zswap_rb_insert(&tree->rbroot, entry, &dupentry);
                if (ret == -EEXIST) {
                        zswap_duplicate_entry++;
                        /* remove from rbtree */
                        zswap_rb_erase(&tree->rbroot, dupentry);
                        zswap_entry_put(tree, dupentry);
                }
        } while (ret == -EEXIST);
        if (entry->length) {
                spin_lock(&entry->pool->lru_lock);
                list_add(&entry->lru, &entry->pool->lru);
                spin_unlock(&entry->pool->lru_lock);
        }
        spin_unlock(&tree->lock);

        /* update stats */
        atomic_inc(&zswap_stored_pages);
        zswap_update_total_size();
        count_vm_event(ZSWPOUT);

        return 0;

put_dstmem:
        mutex_unlock(acomp_ctx->mutex);
        zswap_pool_put(entry->pool);
freepage:
        zswap_entry_cache_free(entry);
reject:
        if (objcg)
                obj_cgroup_put(objcg);
        return ret;

shrink:
        pool = zswap_pool_last_get();
        if (pool)
                queue_work(shrink_wq, &pool->shrink_work);
        ret = -ENOMEM;
        goto reject;
}
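/*
 * Note on exclusive loads (behaviour of zswap_frontswap_load() below):
 * when zswap_exclusive_loads_enabled is set, a successful load also
 * invalidates the zswap entry, so the compressed copy is dropped as soon
 * as the page is back in the swapcache rather than being kept as a
 * second copy.
 */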
/*
 * Returns 0 if the page was successfully decompressed.
 * Returns -1 if the entry was not found or on error.
 */
static int zswap_frontswap_load(unsigned type, pgoff_t offset,
                                struct page *page, bool *exclusive)
{
        struct zswap_tree *tree = zswap_trees[type];
        struct zswap_entry *entry;
        struct scatterlist input, output;
        struct crypto_acomp_ctx *acomp_ctx;
        u8 *src, *dst, *tmp;
        unsigned int dlen;
        int ret;

        /* find */
        spin_lock(&tree->lock);
        entry = zswap_entry_find_get(&tree->rbroot, offset);
        if (!entry) {
                /* entry was written back */
                spin_unlock(&tree->lock);
                return -1;
        }
        spin_unlock(&tree->lock);

        if (!entry->length) {
                dst = kmap_atomic(page);
                zswap_fill_page(dst, entry->value);
                kunmap_atomic(dst);
                ret = 0;
                goto stats;
        }

        if (!zpool_can_sleep_mapped(entry->pool->zpool)) {
                tmp = kmalloc(entry->length, GFP_KERNEL);
                if (!tmp) {
                        ret = -ENOMEM;
                        goto freeentry;
                }
        }

        /* decompress */
        dlen = PAGE_SIZE;
        src = zpool_map_handle(entry->pool->zpool, entry->handle, ZPOOL_MM_RO);

        if (!zpool_can_sleep_mapped(entry->pool->zpool)) {
                memcpy(tmp, src, entry->length);
                src = tmp;
                zpool_unmap_handle(entry->pool->zpool, entry->handle);
        }

        acomp_ctx = raw_cpu_ptr(entry->pool->acomp_ctx);
        mutex_lock(acomp_ctx->mutex);
        sg_init_one(&input, src, entry->length);
        sg_init_table(&output, 1);
        sg_set_page(&output, page, PAGE_SIZE, 0);
        acomp_request_set_params(acomp_ctx->req, &input, &output, entry->length, dlen);
        ret = crypto_wait_req(crypto_acomp_decompress(acomp_ctx->req), &acomp_ctx->wait);
        mutex_unlock(acomp_ctx->mutex);

        if (zpool_can_sleep_mapped(entry->pool->zpool))
                zpool_unmap_handle(entry->pool->zpool, entry->handle);
        else
                kfree(tmp);

        BUG_ON(ret);
stats:
        count_vm_event(ZSWPIN);
        if (entry->objcg)
                count_objcg_event(entry->objcg, ZSWPIN);
freeentry:
        spin_lock(&tree->lock);
        zswap_entry_put(tree, entry);
        if (!ret && zswap_exclusive_loads_enabled) {
                zswap_invalidate_entry(tree, entry);
                *exclusive = true;
        } else if (entry->length) {
                spin_lock(&entry->pool->lru_lock);
                list_move(&entry->lru, &entry->pool->lru);
                spin_unlock(&entry->pool->lru_lock);
        }
        spin_unlock(&tree->lock);

        return ret;
}

/* frees an entry in zswap */
static void zswap_frontswap_invalidate_page(unsigned type, pgoff_t offset)
{
        struct zswap_tree *tree = zswap_trees[type];
        struct zswap_entry *entry;

        /* find */
        spin_lock(&tree->lock);
        entry = zswap_rb_search(&tree->rbroot, offset);
        if (!entry) {
                /* entry was written back */
                spin_unlock(&tree->lock);
                return;
        }
        zswap_invalidate_entry(tree, entry);
        spin_unlock(&tree->lock);
}

/* frees all zswap entries for the given swap type */
static void zswap_frontswap_invalidate_area(unsigned type)
{
        struct zswap_tree *tree = zswap_trees[type];
        struct zswap_entry *entry, *n;

        if (!tree)
                return;

        /* walk the tree and free everything */
        spin_lock(&tree->lock);
        rbtree_postorder_for_each_entry_safe(entry, n, &tree->rbroot, rbnode)
                zswap_free_entry(entry);
        tree->rbroot = RB_ROOT;
        spin_unlock(&tree->lock);
        kfree(tree);
        zswap_trees[type] = NULL;
}

static void zswap_frontswap_init(unsigned type)
{
        struct zswap_tree *tree;

        tree = kzalloc(sizeof(*tree), GFP_KERNEL);
        if (!tree) {
                pr_err("alloc failed, zswap disabled for swap type %d\n", type);
                return;
        }

        tree->rbroot = RB_ROOT;
        spin_lock_init(&tree->lock);
        zswap_trees[type] = tree;
}
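/*
 * The ops below are handed to frontswap_register_ops() in zswap_setup();
 * from then on the swap code calls back into zswap for every store, load
 * and invalidate on a frontswap-enabled swap device.
 */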
static const struct frontswap_ops zswap_frontswap_ops = {
        .store = zswap_frontswap_store,
        .load = zswap_frontswap_load,
        .invalidate_page = zswap_frontswap_invalidate_page,
        .invalidate_area = zswap_frontswap_invalidate_area,
        .init = zswap_frontswap_init
};

/*********************************
* debugfs functions
**********************************/
#ifdef CONFIG_DEBUG_FS
#include <linux/debugfs.h>

static struct dentry *zswap_debugfs_root;

static int zswap_debugfs_init(void)
{
        if (!debugfs_initialized())
                return -ENODEV;

        zswap_debugfs_root = debugfs_create_dir("zswap", NULL);

        debugfs_create_u64("pool_limit_hit", 0444,
                           zswap_debugfs_root, &zswap_pool_limit_hit);
        debugfs_create_u64("reject_reclaim_fail", 0444,
                           zswap_debugfs_root, &zswap_reject_reclaim_fail);
        debugfs_create_u64("reject_alloc_fail", 0444,
                           zswap_debugfs_root, &zswap_reject_alloc_fail);
        debugfs_create_u64("reject_kmemcache_fail", 0444,
                           zswap_debugfs_root, &zswap_reject_kmemcache_fail);
        debugfs_create_u64("reject_compress_poor", 0444,
                           zswap_debugfs_root, &zswap_reject_compress_poor);
        debugfs_create_u64("written_back_pages", 0444,
                           zswap_debugfs_root, &zswap_written_back_pages);
        debugfs_create_u64("duplicate_entry", 0444,
                           zswap_debugfs_root, &zswap_duplicate_entry);
        debugfs_create_u64("pool_total_size", 0444,
                           zswap_debugfs_root, &zswap_pool_total_size);
        debugfs_create_atomic_t("stored_pages", 0444,
                                zswap_debugfs_root, &zswap_stored_pages);
        debugfs_create_atomic_t("same_filled_pages", 0444,
                                zswap_debugfs_root, &zswap_same_filled_pages);

        return 0;
}
#else
static int zswap_debugfs_init(void)
{
        return 0;
}
#endif
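/*
 * Example (illustrative): with debugfs mounted in its usual location the
 * counters above show up under /sys/kernel/debug/zswap/, e.g.
 * /sys/kernel/debug/zswap/pool_total_size and
 * /sys/kernel/debug/zswap/written_back_pages.
 */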
/*********************************
* module init and exit
**********************************/
static int zswap_setup(void)
{
        struct zswap_pool *pool;
        int ret;

        zswap_entry_cache = KMEM_CACHE(zswap_entry, 0);
        if (!zswap_entry_cache) {
                pr_err("entry cache creation failed\n");
                goto cache_fail;
        }

        ret = cpuhp_setup_state(CPUHP_MM_ZSWP_MEM_PREPARE, "mm/zswap:prepare",
                                zswap_dstmem_prepare, zswap_dstmem_dead);
        if (ret) {
                pr_err("dstmem alloc failed\n");
                goto dstmem_fail;
        }

        ret = cpuhp_setup_state_multi(CPUHP_MM_ZSWP_POOL_PREPARE,
                                      "mm/zswap_pool:prepare",
                                      zswap_cpu_comp_prepare,
                                      zswap_cpu_comp_dead);
        if (ret)
                goto hp_fail;

        pool = __zswap_pool_create_fallback();
        if (pool) {
                pr_info("loaded using pool %s/%s\n", pool->tfm_name,
                        zpool_get_type(pool->zpool));
                list_add(&pool->list, &zswap_pools);
                zswap_has_pool = true;
        } else {
                pr_err("pool creation failed\n");
                zswap_enabled = false;
        }

        shrink_wq = create_workqueue("zswap-shrink");
        if (!shrink_wq)
                goto fallback_fail;

        ret = frontswap_register_ops(&zswap_frontswap_ops);
        if (ret)
                goto destroy_wq;
        if (zswap_debugfs_init())
                pr_warn("debugfs initialization failed\n");
        zswap_init_state = ZSWAP_INIT_SUCCEED;
        return 0;

destroy_wq:
        destroy_workqueue(shrink_wq);
fallback_fail:
        if (pool)
                zswap_pool_destroy(pool);
hp_fail:
        cpuhp_remove_state(CPUHP_MM_ZSWP_MEM_PREPARE);
dstmem_fail:
        kmem_cache_destroy(zswap_entry_cache);
cache_fail:
        /* if built-in, we aren't unloaded on failure; don't allow use */
        zswap_init_state = ZSWAP_INIT_FAILED;
        zswap_enabled = false;
        return -ENOMEM;
}

static int __init zswap_init(void)
{
        if (!zswap_enabled)
                return 0;
        return zswap_setup();
}
/* must be late so crypto has time to come up */
late_initcall(zswap_init);

MODULE_AUTHOR("Seth Jennings <sjennings@variantweb.net>");
MODULE_DESCRIPTION("Compressed cache for swap pages");