1 // SPDX-License-Identifier: GPL-2.0-or-later 2 /* 3 * zswap.c - zswap driver file 4 * 5 * zswap is a backend for frontswap that takes pages that are in the process 6 * of being swapped out and attempts to compress and store them in a 7 * RAM-based memory pool. This can result in a significant I/O reduction on 8 * the swap device and, in the case where decompressing from RAM is faster 9 * than reading from the swap device, can also improve workload performance. 10 * 11 * Copyright (C) 2012 Seth Jennings <sjenning@linux.vnet.ibm.com> 12 */ 13 14 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt 15 16 #include <linux/module.h> 17 #include <linux/cpu.h> 18 #include <linux/highmem.h> 19 #include <linux/slab.h> 20 #include <linux/spinlock.h> 21 #include <linux/types.h> 22 #include <linux/atomic.h> 23 #include <linux/frontswap.h> 24 #include <linux/rbtree.h> 25 #include <linux/swap.h> 26 #include <linux/crypto.h> 27 #include <linux/mempool.h> 28 #include <linux/zpool.h> 29 30 #include <linux/mm_types.h> 31 #include <linux/page-flags.h> 32 #include <linux/swapops.h> 33 #include <linux/writeback.h> 34 #include <linux/pagemap.h> 35 #include <linux/workqueue.h> 36 37 /********************************* 38 * statistics 39 **********************************/ 40 /* Total bytes used by the compressed storage */ 41 static u64 zswap_pool_total_size; 42 /* The number of compressed pages currently stored in zswap */ 43 static atomic_t zswap_stored_pages = ATOMIC_INIT(0); 44 /* The number of same-value filled pages currently stored in zswap */ 45 static atomic_t zswap_same_filled_pages = ATOMIC_INIT(0); 46 47 /* 48 * The statistics below are not protected from concurrent access for 49 * performance reasons so they may not be a 100% accurate. However, 50 * they do provide useful information on roughly how many times a 51 * certain event is occurring. 52 */ 53 54 /* Pool limit was hit (see zswap_max_pool_percent) */ 55 static u64 zswap_pool_limit_hit; 56 /* Pages written back when pool limit was reached */ 57 static u64 zswap_written_back_pages; 58 /* Store failed due to a reclaim failure after pool limit was reached */ 59 static u64 zswap_reject_reclaim_fail; 60 /* Compressed page was too big for the allocator to (optimally) store */ 61 static u64 zswap_reject_compress_poor; 62 /* Store failed because underlying allocator could not get memory */ 63 static u64 zswap_reject_alloc_fail; 64 /* Store failed because the entry metadata could not be allocated (rare) */ 65 static u64 zswap_reject_kmemcache_fail; 66 /* Duplicate store was encountered (rare) */ 67 static u64 zswap_duplicate_entry; 68 69 /* Shrinker work queue */ 70 static struct workqueue_struct *shrink_wq; 71 /* Pool limit was hit, we need to calm down */ 72 static bool zswap_pool_reached_full; 73 74 /********************************* 75 * tunables 76 **********************************/ 77 78 #define ZSWAP_PARAM_UNSET "" 79 80 /* Enable/disable zswap (disabled by default) */ 81 static bool zswap_enabled; 82 static int zswap_enabled_param_set(const char *, 83 const struct kernel_param *); 84 static struct kernel_param_ops zswap_enabled_param_ops = { 85 .set = zswap_enabled_param_set, 86 .get = param_get_bool, 87 }; 88 module_param_cb(enabled, &zswap_enabled_param_ops, &zswap_enabled, 0644); 89 90 /* Crypto compressor to use */ 91 #define ZSWAP_COMPRESSOR_DEFAULT "lzo" 92 static char *zswap_compressor = ZSWAP_COMPRESSOR_DEFAULT; 93 static int zswap_compressor_param_set(const char *, 94 const struct kernel_param *); 95 static struct kernel_param_ops zswap_compressor_param_ops = { 96 .set = zswap_compressor_param_set, 97 .get = param_get_charp, 98 .free = param_free_charp, 99 }; 100 module_param_cb(compressor, &zswap_compressor_param_ops, 101 &zswap_compressor, 0644); 102 103 /* Compressed storage zpool to use */ 104 #define ZSWAP_ZPOOL_DEFAULT "zbud" 105 static char *zswap_zpool_type = ZSWAP_ZPOOL_DEFAULT; 106 static int zswap_zpool_param_set(const char *, const struct kernel_param *); 107 static struct kernel_param_ops zswap_zpool_param_ops = { 108 .set = zswap_zpool_param_set, 109 .get = param_get_charp, 110 .free = param_free_charp, 111 }; 112 module_param_cb(zpool, &zswap_zpool_param_ops, &zswap_zpool_type, 0644); 113 114 /* The maximum percentage of memory that the compressed pool can occupy */ 115 static unsigned int zswap_max_pool_percent = 20; 116 module_param_named(max_pool_percent, zswap_max_pool_percent, uint, 0644); 117 118 /* The threshold for accepting new pages after the max_pool_percent was hit */ 119 static unsigned int zswap_accept_thr_percent = 90; /* of max pool size */ 120 module_param_named(accept_threshold_percent, zswap_accept_thr_percent, 121 uint, 0644); 122 123 /* Enable/disable handling same-value filled pages (enabled by default) */ 124 static bool zswap_same_filled_pages_enabled = true; 125 module_param_named(same_filled_pages_enabled, zswap_same_filled_pages_enabled, 126 bool, 0644); 127 128 /********************************* 129 * data structures 130 **********************************/ 131 132 struct zswap_pool { 133 struct zpool *zpool; 134 struct crypto_comp * __percpu *tfm; 135 struct kref kref; 136 struct list_head list; 137 struct work_struct release_work; 138 struct work_struct shrink_work; 139 struct hlist_node node; 140 char tfm_name[CRYPTO_MAX_ALG_NAME]; 141 }; 142 143 /* 144 * struct zswap_entry 145 * 146 * This structure contains the metadata for tracking a single compressed 147 * page within zswap. 148 * 149 * rbnode - links the entry into red-black tree for the appropriate swap type 150 * offset - the swap offset for the entry. Index into the red-black tree. 151 * refcount - the number of outstanding reference to the entry. This is needed 152 * to protect against premature freeing of the entry by code 153 * concurrent calls to load, invalidate, and writeback. The lock 154 * for the zswap_tree structure that contains the entry must 155 * be held while changing the refcount. Since the lock must 156 * be held, there is no reason to also make refcount atomic. 157 * length - the length in bytes of the compressed page data. Needed during 158 * decompression. For a same value filled page length is 0. 159 * pool - the zswap_pool the entry's data is in 160 * handle - zpool allocation handle that stores the compressed page data 161 * value - value of the same-value filled pages which have same content 162 */ 163 struct zswap_entry { 164 struct rb_node rbnode; 165 pgoff_t offset; 166 int refcount; 167 unsigned int length; 168 struct zswap_pool *pool; 169 union { 170 unsigned long handle; 171 unsigned long value; 172 }; 173 }; 174 175 struct zswap_header { 176 swp_entry_t swpentry; 177 }; 178 179 /* 180 * The tree lock in the zswap_tree struct protects a few things: 181 * - the rbtree 182 * - the refcount field of each entry in the tree 183 */ 184 struct zswap_tree { 185 struct rb_root rbroot; 186 spinlock_t lock; 187 }; 188 189 static struct zswap_tree *zswap_trees[MAX_SWAPFILES]; 190 191 /* RCU-protected iteration */ 192 static LIST_HEAD(zswap_pools); 193 /* protects zswap_pools list modification */ 194 static DEFINE_SPINLOCK(zswap_pools_lock); 195 /* pool counter to provide unique names to zpool */ 196 static atomic_t zswap_pools_count = ATOMIC_INIT(0); 197 198 /* used by param callback function */ 199 static bool zswap_init_started; 200 201 /* fatal error during init */ 202 static bool zswap_init_failed; 203 204 /* init completed, but couldn't create the initial pool */ 205 static bool zswap_has_pool; 206 207 /********************************* 208 * helpers and fwd declarations 209 **********************************/ 210 211 #define zswap_pool_debug(msg, p) \ 212 pr_debug("%s pool %s/%s\n", msg, (p)->tfm_name, \ 213 zpool_get_type((p)->zpool)) 214 215 static int zswap_writeback_entry(struct zpool *pool, unsigned long handle); 216 static int zswap_pool_get(struct zswap_pool *pool); 217 static void zswap_pool_put(struct zswap_pool *pool); 218 219 static const struct zpool_ops zswap_zpool_ops = { 220 .evict = zswap_writeback_entry 221 }; 222 223 static bool zswap_is_full(void) 224 { 225 return totalram_pages() * zswap_max_pool_percent / 100 < 226 DIV_ROUND_UP(zswap_pool_total_size, PAGE_SIZE); 227 } 228 229 static bool zswap_can_accept(void) 230 { 231 return totalram_pages() * zswap_accept_thr_percent / 100 * 232 zswap_max_pool_percent / 100 > 233 DIV_ROUND_UP(zswap_pool_total_size, PAGE_SIZE); 234 } 235 236 static void zswap_update_total_size(void) 237 { 238 struct zswap_pool *pool; 239 u64 total = 0; 240 241 rcu_read_lock(); 242 243 list_for_each_entry_rcu(pool, &zswap_pools, list) 244 total += zpool_get_total_size(pool->zpool); 245 246 rcu_read_unlock(); 247 248 zswap_pool_total_size = total; 249 } 250 251 /********************************* 252 * zswap entry functions 253 **********************************/ 254 static struct kmem_cache *zswap_entry_cache; 255 256 static int __init zswap_entry_cache_create(void) 257 { 258 zswap_entry_cache = KMEM_CACHE(zswap_entry, 0); 259 return zswap_entry_cache == NULL; 260 } 261 262 static void __init zswap_entry_cache_destroy(void) 263 { 264 kmem_cache_destroy(zswap_entry_cache); 265 } 266 267 static struct zswap_entry *zswap_entry_cache_alloc(gfp_t gfp) 268 { 269 struct zswap_entry *entry; 270 entry = kmem_cache_alloc(zswap_entry_cache, gfp); 271 if (!entry) 272 return NULL; 273 entry->refcount = 1; 274 RB_CLEAR_NODE(&entry->rbnode); 275 return entry; 276 } 277 278 static void zswap_entry_cache_free(struct zswap_entry *entry) 279 { 280 kmem_cache_free(zswap_entry_cache, entry); 281 } 282 283 /********************************* 284 * rbtree functions 285 **********************************/ 286 static struct zswap_entry *zswap_rb_search(struct rb_root *root, pgoff_t offset) 287 { 288 struct rb_node *node = root->rb_node; 289 struct zswap_entry *entry; 290 291 while (node) { 292 entry = rb_entry(node, struct zswap_entry, rbnode); 293 if (entry->offset > offset) 294 node = node->rb_left; 295 else if (entry->offset < offset) 296 node = node->rb_right; 297 else 298 return entry; 299 } 300 return NULL; 301 } 302 303 /* 304 * In the case that a entry with the same offset is found, a pointer to 305 * the existing entry is stored in dupentry and the function returns -EEXIST 306 */ 307 static int zswap_rb_insert(struct rb_root *root, struct zswap_entry *entry, 308 struct zswap_entry **dupentry) 309 { 310 struct rb_node **link = &root->rb_node, *parent = NULL; 311 struct zswap_entry *myentry; 312 313 while (*link) { 314 parent = *link; 315 myentry = rb_entry(parent, struct zswap_entry, rbnode); 316 if (myentry->offset > entry->offset) 317 link = &(*link)->rb_left; 318 else if (myentry->offset < entry->offset) 319 link = &(*link)->rb_right; 320 else { 321 *dupentry = myentry; 322 return -EEXIST; 323 } 324 } 325 rb_link_node(&entry->rbnode, parent, link); 326 rb_insert_color(&entry->rbnode, root); 327 return 0; 328 } 329 330 static void zswap_rb_erase(struct rb_root *root, struct zswap_entry *entry) 331 { 332 if (!RB_EMPTY_NODE(&entry->rbnode)) { 333 rb_erase(&entry->rbnode, root); 334 RB_CLEAR_NODE(&entry->rbnode); 335 } 336 } 337 338 /* 339 * Carries out the common pattern of freeing and entry's zpool allocation, 340 * freeing the entry itself, and decrementing the number of stored pages. 341 */ 342 static void zswap_free_entry(struct zswap_entry *entry) 343 { 344 if (!entry->length) 345 atomic_dec(&zswap_same_filled_pages); 346 else { 347 zpool_free(entry->pool->zpool, entry->handle); 348 zswap_pool_put(entry->pool); 349 } 350 zswap_entry_cache_free(entry); 351 atomic_dec(&zswap_stored_pages); 352 zswap_update_total_size(); 353 } 354 355 /* caller must hold the tree lock */ 356 static void zswap_entry_get(struct zswap_entry *entry) 357 { 358 entry->refcount++; 359 } 360 361 /* caller must hold the tree lock 362 * remove from the tree and free it, if nobody reference the entry 363 */ 364 static void zswap_entry_put(struct zswap_tree *tree, 365 struct zswap_entry *entry) 366 { 367 int refcount = --entry->refcount; 368 369 BUG_ON(refcount < 0); 370 if (refcount == 0) { 371 zswap_rb_erase(&tree->rbroot, entry); 372 zswap_free_entry(entry); 373 } 374 } 375 376 /* caller must hold the tree lock */ 377 static struct zswap_entry *zswap_entry_find_get(struct rb_root *root, 378 pgoff_t offset) 379 { 380 struct zswap_entry *entry; 381 382 entry = zswap_rb_search(root, offset); 383 if (entry) 384 zswap_entry_get(entry); 385 386 return entry; 387 } 388 389 /********************************* 390 * per-cpu code 391 **********************************/ 392 static DEFINE_PER_CPU(u8 *, zswap_dstmem); 393 394 static int zswap_dstmem_prepare(unsigned int cpu) 395 { 396 u8 *dst; 397 398 dst = kmalloc_node(PAGE_SIZE * 2, GFP_KERNEL, cpu_to_node(cpu)); 399 if (!dst) 400 return -ENOMEM; 401 402 per_cpu(zswap_dstmem, cpu) = dst; 403 return 0; 404 } 405 406 static int zswap_dstmem_dead(unsigned int cpu) 407 { 408 u8 *dst; 409 410 dst = per_cpu(zswap_dstmem, cpu); 411 kfree(dst); 412 per_cpu(zswap_dstmem, cpu) = NULL; 413 414 return 0; 415 } 416 417 static int zswap_cpu_comp_prepare(unsigned int cpu, struct hlist_node *node) 418 { 419 struct zswap_pool *pool = hlist_entry(node, struct zswap_pool, node); 420 struct crypto_comp *tfm; 421 422 if (WARN_ON(*per_cpu_ptr(pool->tfm, cpu))) 423 return 0; 424 425 tfm = crypto_alloc_comp(pool->tfm_name, 0, 0); 426 if (IS_ERR_OR_NULL(tfm)) { 427 pr_err("could not alloc crypto comp %s : %ld\n", 428 pool->tfm_name, PTR_ERR(tfm)); 429 return -ENOMEM; 430 } 431 *per_cpu_ptr(pool->tfm, cpu) = tfm; 432 return 0; 433 } 434 435 static int zswap_cpu_comp_dead(unsigned int cpu, struct hlist_node *node) 436 { 437 struct zswap_pool *pool = hlist_entry(node, struct zswap_pool, node); 438 struct crypto_comp *tfm; 439 440 tfm = *per_cpu_ptr(pool->tfm, cpu); 441 if (!IS_ERR_OR_NULL(tfm)) 442 crypto_free_comp(tfm); 443 *per_cpu_ptr(pool->tfm, cpu) = NULL; 444 return 0; 445 } 446 447 /********************************* 448 * pool functions 449 **********************************/ 450 451 static struct zswap_pool *__zswap_pool_current(void) 452 { 453 struct zswap_pool *pool; 454 455 pool = list_first_or_null_rcu(&zswap_pools, typeof(*pool), list); 456 WARN_ONCE(!pool && zswap_has_pool, 457 "%s: no page storage pool!\n", __func__); 458 459 return pool; 460 } 461 462 static struct zswap_pool *zswap_pool_current(void) 463 { 464 assert_spin_locked(&zswap_pools_lock); 465 466 return __zswap_pool_current(); 467 } 468 469 static struct zswap_pool *zswap_pool_current_get(void) 470 { 471 struct zswap_pool *pool; 472 473 rcu_read_lock(); 474 475 pool = __zswap_pool_current(); 476 if (!zswap_pool_get(pool)) 477 pool = NULL; 478 479 rcu_read_unlock(); 480 481 return pool; 482 } 483 484 static struct zswap_pool *zswap_pool_last_get(void) 485 { 486 struct zswap_pool *pool, *last = NULL; 487 488 rcu_read_lock(); 489 490 list_for_each_entry_rcu(pool, &zswap_pools, list) 491 last = pool; 492 WARN_ONCE(!last && zswap_has_pool, 493 "%s: no page storage pool!\n", __func__); 494 if (!zswap_pool_get(last)) 495 last = NULL; 496 497 rcu_read_unlock(); 498 499 return last; 500 } 501 502 /* type and compressor must be null-terminated */ 503 static struct zswap_pool *zswap_pool_find_get(char *type, char *compressor) 504 { 505 struct zswap_pool *pool; 506 507 assert_spin_locked(&zswap_pools_lock); 508 509 list_for_each_entry_rcu(pool, &zswap_pools, list) { 510 if (strcmp(pool->tfm_name, compressor)) 511 continue; 512 if (strcmp(zpool_get_type(pool->zpool), type)) 513 continue; 514 /* if we can't get it, it's about to be destroyed */ 515 if (!zswap_pool_get(pool)) 516 continue; 517 return pool; 518 } 519 520 return NULL; 521 } 522 523 static void shrink_worker(struct work_struct *w) 524 { 525 struct zswap_pool *pool = container_of(w, typeof(*pool), 526 shrink_work); 527 528 if (zpool_shrink(pool->zpool, 1, NULL)) 529 zswap_reject_reclaim_fail++; 530 zswap_pool_put(pool); 531 } 532 533 static struct zswap_pool *zswap_pool_create(char *type, char *compressor) 534 { 535 struct zswap_pool *pool; 536 char name[38]; /* 'zswap' + 32 char (max) num + \0 */ 537 gfp_t gfp = __GFP_NORETRY | __GFP_NOWARN | __GFP_KSWAPD_RECLAIM; 538 int ret; 539 540 if (!zswap_has_pool) { 541 /* if either are unset, pool initialization failed, and we 542 * need both params to be set correctly before trying to 543 * create a pool. 544 */ 545 if (!strcmp(type, ZSWAP_PARAM_UNSET)) 546 return NULL; 547 if (!strcmp(compressor, ZSWAP_PARAM_UNSET)) 548 return NULL; 549 } 550 551 pool = kzalloc(sizeof(*pool), GFP_KERNEL); 552 if (!pool) 553 return NULL; 554 555 /* unique name for each pool specifically required by zsmalloc */ 556 snprintf(name, 38, "zswap%x", atomic_inc_return(&zswap_pools_count)); 557 558 pool->zpool = zpool_create_pool(type, name, gfp, &zswap_zpool_ops); 559 if (!pool->zpool) { 560 pr_err("%s zpool not available\n", type); 561 goto error; 562 } 563 pr_debug("using %s zpool\n", zpool_get_type(pool->zpool)); 564 565 strlcpy(pool->tfm_name, compressor, sizeof(pool->tfm_name)); 566 pool->tfm = alloc_percpu(struct crypto_comp *); 567 if (!pool->tfm) { 568 pr_err("percpu alloc failed\n"); 569 goto error; 570 } 571 572 ret = cpuhp_state_add_instance(CPUHP_MM_ZSWP_POOL_PREPARE, 573 &pool->node); 574 if (ret) 575 goto error; 576 pr_debug("using %s compressor\n", pool->tfm_name); 577 578 /* being the current pool takes 1 ref; this func expects the 579 * caller to always add the new pool as the current pool 580 */ 581 kref_init(&pool->kref); 582 INIT_LIST_HEAD(&pool->list); 583 INIT_WORK(&pool->shrink_work, shrink_worker); 584 585 zswap_pool_debug("created", pool); 586 587 return pool; 588 589 error: 590 free_percpu(pool->tfm); 591 if (pool->zpool) 592 zpool_destroy_pool(pool->zpool); 593 kfree(pool); 594 return NULL; 595 } 596 597 static __init struct zswap_pool *__zswap_pool_create_fallback(void) 598 { 599 bool has_comp, has_zpool; 600 601 has_comp = crypto_has_comp(zswap_compressor, 0, 0); 602 if (!has_comp && strcmp(zswap_compressor, ZSWAP_COMPRESSOR_DEFAULT)) { 603 pr_err("compressor %s not available, using default %s\n", 604 zswap_compressor, ZSWAP_COMPRESSOR_DEFAULT); 605 param_free_charp(&zswap_compressor); 606 zswap_compressor = ZSWAP_COMPRESSOR_DEFAULT; 607 has_comp = crypto_has_comp(zswap_compressor, 0, 0); 608 } 609 if (!has_comp) { 610 pr_err("default compressor %s not available\n", 611 zswap_compressor); 612 param_free_charp(&zswap_compressor); 613 zswap_compressor = ZSWAP_PARAM_UNSET; 614 } 615 616 has_zpool = zpool_has_pool(zswap_zpool_type); 617 if (!has_zpool && strcmp(zswap_zpool_type, ZSWAP_ZPOOL_DEFAULT)) { 618 pr_err("zpool %s not available, using default %s\n", 619 zswap_zpool_type, ZSWAP_ZPOOL_DEFAULT); 620 param_free_charp(&zswap_zpool_type); 621 zswap_zpool_type = ZSWAP_ZPOOL_DEFAULT; 622 has_zpool = zpool_has_pool(zswap_zpool_type); 623 } 624 if (!has_zpool) { 625 pr_err("default zpool %s not available\n", 626 zswap_zpool_type); 627 param_free_charp(&zswap_zpool_type); 628 zswap_zpool_type = ZSWAP_PARAM_UNSET; 629 } 630 631 if (!has_comp || !has_zpool) 632 return NULL; 633 634 return zswap_pool_create(zswap_zpool_type, zswap_compressor); 635 } 636 637 static void zswap_pool_destroy(struct zswap_pool *pool) 638 { 639 zswap_pool_debug("destroying", pool); 640 641 cpuhp_state_remove_instance(CPUHP_MM_ZSWP_POOL_PREPARE, &pool->node); 642 free_percpu(pool->tfm); 643 zpool_destroy_pool(pool->zpool); 644 kfree(pool); 645 } 646 647 static int __must_check zswap_pool_get(struct zswap_pool *pool) 648 { 649 if (!pool) 650 return 0; 651 652 return kref_get_unless_zero(&pool->kref); 653 } 654 655 static void __zswap_pool_release(struct work_struct *work) 656 { 657 struct zswap_pool *pool = container_of(work, typeof(*pool), 658 release_work); 659 660 synchronize_rcu(); 661 662 /* nobody should have been able to get a kref... */ 663 WARN_ON(kref_get_unless_zero(&pool->kref)); 664 665 /* pool is now off zswap_pools list and has no references. */ 666 zswap_pool_destroy(pool); 667 } 668 669 static void __zswap_pool_empty(struct kref *kref) 670 { 671 struct zswap_pool *pool; 672 673 pool = container_of(kref, typeof(*pool), kref); 674 675 spin_lock(&zswap_pools_lock); 676 677 WARN_ON(pool == zswap_pool_current()); 678 679 list_del_rcu(&pool->list); 680 681 INIT_WORK(&pool->release_work, __zswap_pool_release); 682 schedule_work(&pool->release_work); 683 684 spin_unlock(&zswap_pools_lock); 685 } 686 687 static void zswap_pool_put(struct zswap_pool *pool) 688 { 689 kref_put(&pool->kref, __zswap_pool_empty); 690 } 691 692 /********************************* 693 * param callbacks 694 **********************************/ 695 696 /* val must be a null-terminated string */ 697 static int __zswap_param_set(const char *val, const struct kernel_param *kp, 698 char *type, char *compressor) 699 { 700 struct zswap_pool *pool, *put_pool = NULL; 701 char *s = strstrip((char *)val); 702 int ret; 703 704 if (zswap_init_failed) { 705 pr_err("can't set param, initialization failed\n"); 706 return -ENODEV; 707 } 708 709 /* no change required */ 710 if (!strcmp(s, *(char **)kp->arg) && zswap_has_pool) 711 return 0; 712 713 /* if this is load-time (pre-init) param setting, 714 * don't create a pool; that's done during init. 715 */ 716 if (!zswap_init_started) 717 return param_set_charp(s, kp); 718 719 if (!type) { 720 if (!zpool_has_pool(s)) { 721 pr_err("zpool %s not available\n", s); 722 return -ENOENT; 723 } 724 type = s; 725 } else if (!compressor) { 726 if (!crypto_has_comp(s, 0, 0)) { 727 pr_err("compressor %s not available\n", s); 728 return -ENOENT; 729 } 730 compressor = s; 731 } else { 732 WARN_ON(1); 733 return -EINVAL; 734 } 735 736 spin_lock(&zswap_pools_lock); 737 738 pool = zswap_pool_find_get(type, compressor); 739 if (pool) { 740 zswap_pool_debug("using existing", pool); 741 WARN_ON(pool == zswap_pool_current()); 742 list_del_rcu(&pool->list); 743 } 744 745 spin_unlock(&zswap_pools_lock); 746 747 if (!pool) 748 pool = zswap_pool_create(type, compressor); 749 750 if (pool) 751 ret = param_set_charp(s, kp); 752 else 753 ret = -EINVAL; 754 755 spin_lock(&zswap_pools_lock); 756 757 if (!ret) { 758 put_pool = zswap_pool_current(); 759 list_add_rcu(&pool->list, &zswap_pools); 760 zswap_has_pool = true; 761 } else if (pool) { 762 /* add the possibly pre-existing pool to the end of the pools 763 * list; if it's new (and empty) then it'll be removed and 764 * destroyed by the put after we drop the lock 765 */ 766 list_add_tail_rcu(&pool->list, &zswap_pools); 767 put_pool = pool; 768 } 769 770 spin_unlock(&zswap_pools_lock); 771 772 if (!zswap_has_pool && !pool) { 773 /* if initial pool creation failed, and this pool creation also 774 * failed, maybe both compressor and zpool params were bad. 775 * Allow changing this param, so pool creation will succeed 776 * when the other param is changed. We already verified this 777 * param is ok in the zpool_has_pool() or crypto_has_comp() 778 * checks above. 779 */ 780 ret = param_set_charp(s, kp); 781 } 782 783 /* drop the ref from either the old current pool, 784 * or the new pool we failed to add 785 */ 786 if (put_pool) 787 zswap_pool_put(put_pool); 788 789 return ret; 790 } 791 792 static int zswap_compressor_param_set(const char *val, 793 const struct kernel_param *kp) 794 { 795 return __zswap_param_set(val, kp, zswap_zpool_type, NULL); 796 } 797 798 static int zswap_zpool_param_set(const char *val, 799 const struct kernel_param *kp) 800 { 801 return __zswap_param_set(val, kp, NULL, zswap_compressor); 802 } 803 804 static int zswap_enabled_param_set(const char *val, 805 const struct kernel_param *kp) 806 { 807 if (zswap_init_failed) { 808 pr_err("can't enable, initialization failed\n"); 809 return -ENODEV; 810 } 811 if (!zswap_has_pool && zswap_init_started) { 812 pr_err("can't enable, no pool configured\n"); 813 return -ENODEV; 814 } 815 816 return param_set_bool(val, kp); 817 } 818 819 /********************************* 820 * writeback code 821 **********************************/ 822 /* return enum for zswap_get_swap_cache_page */ 823 enum zswap_get_swap_ret { 824 ZSWAP_SWAPCACHE_NEW, 825 ZSWAP_SWAPCACHE_EXIST, 826 ZSWAP_SWAPCACHE_FAIL, 827 }; 828 829 /* 830 * zswap_get_swap_cache_page 831 * 832 * This is an adaption of read_swap_cache_async() 833 * 834 * This function tries to find a page with the given swap entry 835 * in the swapper_space address space (the swap cache). If the page 836 * is found, it is returned in retpage. Otherwise, a page is allocated, 837 * added to the swap cache, and returned in retpage. 838 * 839 * If success, the swap cache page is returned in retpage 840 * Returns ZSWAP_SWAPCACHE_EXIST if page was already in the swap cache 841 * Returns ZSWAP_SWAPCACHE_NEW if the new page needs to be populated, 842 * the new page is added to swapcache and locked 843 * Returns ZSWAP_SWAPCACHE_FAIL on error 844 */ 845 static int zswap_get_swap_cache_page(swp_entry_t entry, 846 struct page **retpage) 847 { 848 bool page_was_allocated; 849 850 *retpage = __read_swap_cache_async(entry, GFP_KERNEL, 851 NULL, 0, &page_was_allocated); 852 if (page_was_allocated) 853 return ZSWAP_SWAPCACHE_NEW; 854 if (!*retpage) 855 return ZSWAP_SWAPCACHE_FAIL; 856 return ZSWAP_SWAPCACHE_EXIST; 857 } 858 859 /* 860 * Attempts to free an entry by adding a page to the swap cache, 861 * decompressing the entry data into the page, and issuing a 862 * bio write to write the page back to the swap device. 863 * 864 * This can be thought of as a "resumed writeback" of the page 865 * to the swap device. We are basically resuming the same swap 866 * writeback path that was intercepted with the frontswap_store() 867 * in the first place. After the page has been decompressed into 868 * the swap cache, the compressed version stored by zswap can be 869 * freed. 870 */ 871 static int zswap_writeback_entry(struct zpool *pool, unsigned long handle) 872 { 873 struct zswap_header *zhdr; 874 swp_entry_t swpentry; 875 struct zswap_tree *tree; 876 pgoff_t offset; 877 struct zswap_entry *entry; 878 struct page *page; 879 struct crypto_comp *tfm; 880 u8 *src, *dst; 881 unsigned int dlen; 882 int ret; 883 struct writeback_control wbc = { 884 .sync_mode = WB_SYNC_NONE, 885 }; 886 887 /* extract swpentry from data */ 888 zhdr = zpool_map_handle(pool, handle, ZPOOL_MM_RO); 889 swpentry = zhdr->swpentry; /* here */ 890 tree = zswap_trees[swp_type(swpentry)]; 891 offset = swp_offset(swpentry); 892 893 /* find and ref zswap entry */ 894 spin_lock(&tree->lock); 895 entry = zswap_entry_find_get(&tree->rbroot, offset); 896 if (!entry) { 897 /* entry was invalidated */ 898 spin_unlock(&tree->lock); 899 zpool_unmap_handle(pool, handle); 900 return 0; 901 } 902 spin_unlock(&tree->lock); 903 BUG_ON(offset != entry->offset); 904 905 /* try to allocate swap cache page */ 906 switch (zswap_get_swap_cache_page(swpentry, &page)) { 907 case ZSWAP_SWAPCACHE_FAIL: /* no memory or invalidate happened */ 908 ret = -ENOMEM; 909 goto fail; 910 911 case ZSWAP_SWAPCACHE_EXIST: 912 /* page is already in the swap cache, ignore for now */ 913 put_page(page); 914 ret = -EEXIST; 915 goto fail; 916 917 case ZSWAP_SWAPCACHE_NEW: /* page is locked */ 918 /* decompress */ 919 dlen = PAGE_SIZE; 920 src = (u8 *)zhdr + sizeof(struct zswap_header); 921 dst = kmap_atomic(page); 922 tfm = *get_cpu_ptr(entry->pool->tfm); 923 ret = crypto_comp_decompress(tfm, src, entry->length, 924 dst, &dlen); 925 put_cpu_ptr(entry->pool->tfm); 926 kunmap_atomic(dst); 927 BUG_ON(ret); 928 BUG_ON(dlen != PAGE_SIZE); 929 930 /* page is up to date */ 931 SetPageUptodate(page); 932 } 933 934 /* move it to the tail of the inactive list after end_writeback */ 935 SetPageReclaim(page); 936 937 /* start writeback */ 938 __swap_writepage(page, &wbc, end_swap_bio_write); 939 put_page(page); 940 zswap_written_back_pages++; 941 942 spin_lock(&tree->lock); 943 /* drop local reference */ 944 zswap_entry_put(tree, entry); 945 946 /* 947 * There are two possible situations for entry here: 948 * (1) refcount is 1(normal case), entry is valid and on the tree 949 * (2) refcount is 0, entry is freed and not on the tree 950 * because invalidate happened during writeback 951 * search the tree and free the entry if find entry 952 */ 953 if (entry == zswap_rb_search(&tree->rbroot, offset)) 954 zswap_entry_put(tree, entry); 955 spin_unlock(&tree->lock); 956 957 goto end; 958 959 /* 960 * if we get here due to ZSWAP_SWAPCACHE_EXIST 961 * a load may happening concurrently 962 * it is safe and okay to not free the entry 963 * if we free the entry in the following put 964 * it it either okay to return !0 965 */ 966 fail: 967 spin_lock(&tree->lock); 968 zswap_entry_put(tree, entry); 969 spin_unlock(&tree->lock); 970 971 end: 972 zpool_unmap_handle(pool, handle); 973 return ret; 974 } 975 976 static int zswap_is_page_same_filled(void *ptr, unsigned long *value) 977 { 978 unsigned int pos; 979 unsigned long *page; 980 981 page = (unsigned long *)ptr; 982 for (pos = 1; pos < PAGE_SIZE / sizeof(*page); pos++) { 983 if (page[pos] != page[0]) 984 return 0; 985 } 986 *value = page[0]; 987 return 1; 988 } 989 990 static void zswap_fill_page(void *ptr, unsigned long value) 991 { 992 unsigned long *page; 993 994 page = (unsigned long *)ptr; 995 memset_l(page, value, PAGE_SIZE / sizeof(unsigned long)); 996 } 997 998 /********************************* 999 * frontswap hooks 1000 **********************************/ 1001 /* attempts to compress and store an single page */ 1002 static int zswap_frontswap_store(unsigned type, pgoff_t offset, 1003 struct page *page) 1004 { 1005 struct zswap_tree *tree = zswap_trees[type]; 1006 struct zswap_entry *entry, *dupentry; 1007 struct crypto_comp *tfm; 1008 int ret; 1009 unsigned int hlen, dlen = PAGE_SIZE; 1010 unsigned long handle, value; 1011 char *buf; 1012 u8 *src, *dst; 1013 struct zswap_header zhdr = { .swpentry = swp_entry(type, offset) }; 1014 gfp_t gfp; 1015 1016 /* THP isn't supported */ 1017 if (PageTransHuge(page)) { 1018 ret = -EINVAL; 1019 goto reject; 1020 } 1021 1022 if (!zswap_enabled || !tree) { 1023 ret = -ENODEV; 1024 goto reject; 1025 } 1026 1027 /* reclaim space if needed */ 1028 if (zswap_is_full()) { 1029 struct zswap_pool *pool; 1030 1031 zswap_pool_limit_hit++; 1032 zswap_pool_reached_full = true; 1033 pool = zswap_pool_last_get(); 1034 if (pool) 1035 queue_work(shrink_wq, &pool->shrink_work); 1036 ret = -ENOMEM; 1037 goto reject; 1038 } 1039 1040 if (zswap_pool_reached_full) { 1041 if (!zswap_can_accept()) { 1042 ret = -ENOMEM; 1043 goto reject; 1044 } else 1045 zswap_pool_reached_full = false; 1046 } 1047 1048 /* allocate entry */ 1049 entry = zswap_entry_cache_alloc(GFP_KERNEL); 1050 if (!entry) { 1051 zswap_reject_kmemcache_fail++; 1052 ret = -ENOMEM; 1053 goto reject; 1054 } 1055 1056 if (zswap_same_filled_pages_enabled) { 1057 src = kmap_atomic(page); 1058 if (zswap_is_page_same_filled(src, &value)) { 1059 kunmap_atomic(src); 1060 entry->offset = offset; 1061 entry->length = 0; 1062 entry->value = value; 1063 atomic_inc(&zswap_same_filled_pages); 1064 goto insert_entry; 1065 } 1066 kunmap_atomic(src); 1067 } 1068 1069 /* if entry is successfully added, it keeps the reference */ 1070 entry->pool = zswap_pool_current_get(); 1071 if (!entry->pool) { 1072 ret = -EINVAL; 1073 goto freepage; 1074 } 1075 1076 /* compress */ 1077 dst = get_cpu_var(zswap_dstmem); 1078 tfm = *get_cpu_ptr(entry->pool->tfm); 1079 src = kmap_atomic(page); 1080 ret = crypto_comp_compress(tfm, src, PAGE_SIZE, dst, &dlen); 1081 kunmap_atomic(src); 1082 put_cpu_ptr(entry->pool->tfm); 1083 if (ret) { 1084 ret = -EINVAL; 1085 goto put_dstmem; 1086 } 1087 1088 /* store */ 1089 hlen = zpool_evictable(entry->pool->zpool) ? sizeof(zhdr) : 0; 1090 gfp = __GFP_NORETRY | __GFP_NOWARN | __GFP_KSWAPD_RECLAIM; 1091 if (zpool_malloc_support_movable(entry->pool->zpool)) 1092 gfp |= __GFP_HIGHMEM | __GFP_MOVABLE; 1093 ret = zpool_malloc(entry->pool->zpool, hlen + dlen, gfp, &handle); 1094 if (ret == -ENOSPC) { 1095 zswap_reject_compress_poor++; 1096 goto put_dstmem; 1097 } 1098 if (ret) { 1099 zswap_reject_alloc_fail++; 1100 goto put_dstmem; 1101 } 1102 buf = zpool_map_handle(entry->pool->zpool, handle, ZPOOL_MM_RW); 1103 memcpy(buf, &zhdr, hlen); 1104 memcpy(buf + hlen, dst, dlen); 1105 zpool_unmap_handle(entry->pool->zpool, handle); 1106 put_cpu_var(zswap_dstmem); 1107 1108 /* populate entry */ 1109 entry->offset = offset; 1110 entry->handle = handle; 1111 entry->length = dlen; 1112 1113 insert_entry: 1114 /* map */ 1115 spin_lock(&tree->lock); 1116 do { 1117 ret = zswap_rb_insert(&tree->rbroot, entry, &dupentry); 1118 if (ret == -EEXIST) { 1119 zswap_duplicate_entry++; 1120 /* remove from rbtree */ 1121 zswap_rb_erase(&tree->rbroot, dupentry); 1122 zswap_entry_put(tree, dupentry); 1123 } 1124 } while (ret == -EEXIST); 1125 spin_unlock(&tree->lock); 1126 1127 /* update stats */ 1128 atomic_inc(&zswap_stored_pages); 1129 zswap_update_total_size(); 1130 1131 return 0; 1132 1133 put_dstmem: 1134 put_cpu_var(zswap_dstmem); 1135 zswap_pool_put(entry->pool); 1136 freepage: 1137 zswap_entry_cache_free(entry); 1138 reject: 1139 return ret; 1140 } 1141 1142 /* 1143 * returns 0 if the page was successfully decompressed 1144 * return -1 on entry not found or error 1145 */ 1146 static int zswap_frontswap_load(unsigned type, pgoff_t offset, 1147 struct page *page) 1148 { 1149 struct zswap_tree *tree = zswap_trees[type]; 1150 struct zswap_entry *entry; 1151 struct crypto_comp *tfm; 1152 u8 *src, *dst; 1153 unsigned int dlen; 1154 int ret; 1155 1156 /* find */ 1157 spin_lock(&tree->lock); 1158 entry = zswap_entry_find_get(&tree->rbroot, offset); 1159 if (!entry) { 1160 /* entry was written back */ 1161 spin_unlock(&tree->lock); 1162 return -1; 1163 } 1164 spin_unlock(&tree->lock); 1165 1166 if (!entry->length) { 1167 dst = kmap_atomic(page); 1168 zswap_fill_page(dst, entry->value); 1169 kunmap_atomic(dst); 1170 goto freeentry; 1171 } 1172 1173 /* decompress */ 1174 dlen = PAGE_SIZE; 1175 src = zpool_map_handle(entry->pool->zpool, entry->handle, ZPOOL_MM_RO); 1176 if (zpool_evictable(entry->pool->zpool)) 1177 src += sizeof(struct zswap_header); 1178 dst = kmap_atomic(page); 1179 tfm = *get_cpu_ptr(entry->pool->tfm); 1180 ret = crypto_comp_decompress(tfm, src, entry->length, dst, &dlen); 1181 put_cpu_ptr(entry->pool->tfm); 1182 kunmap_atomic(dst); 1183 zpool_unmap_handle(entry->pool->zpool, entry->handle); 1184 BUG_ON(ret); 1185 1186 freeentry: 1187 spin_lock(&tree->lock); 1188 zswap_entry_put(tree, entry); 1189 spin_unlock(&tree->lock); 1190 1191 return 0; 1192 } 1193 1194 /* frees an entry in zswap */ 1195 static void zswap_frontswap_invalidate_page(unsigned type, pgoff_t offset) 1196 { 1197 struct zswap_tree *tree = zswap_trees[type]; 1198 struct zswap_entry *entry; 1199 1200 /* find */ 1201 spin_lock(&tree->lock); 1202 entry = zswap_rb_search(&tree->rbroot, offset); 1203 if (!entry) { 1204 /* entry was written back */ 1205 spin_unlock(&tree->lock); 1206 return; 1207 } 1208 1209 /* remove from rbtree */ 1210 zswap_rb_erase(&tree->rbroot, entry); 1211 1212 /* drop the initial reference from entry creation */ 1213 zswap_entry_put(tree, entry); 1214 1215 spin_unlock(&tree->lock); 1216 } 1217 1218 /* frees all zswap entries for the given swap type */ 1219 static void zswap_frontswap_invalidate_area(unsigned type) 1220 { 1221 struct zswap_tree *tree = zswap_trees[type]; 1222 struct zswap_entry *entry, *n; 1223 1224 if (!tree) 1225 return; 1226 1227 /* walk the tree and free everything */ 1228 spin_lock(&tree->lock); 1229 rbtree_postorder_for_each_entry_safe(entry, n, &tree->rbroot, rbnode) 1230 zswap_free_entry(entry); 1231 tree->rbroot = RB_ROOT; 1232 spin_unlock(&tree->lock); 1233 kfree(tree); 1234 zswap_trees[type] = NULL; 1235 } 1236 1237 static void zswap_frontswap_init(unsigned type) 1238 { 1239 struct zswap_tree *tree; 1240 1241 tree = kzalloc(sizeof(*tree), GFP_KERNEL); 1242 if (!tree) { 1243 pr_err("alloc failed, zswap disabled for swap type %d\n", type); 1244 return; 1245 } 1246 1247 tree->rbroot = RB_ROOT; 1248 spin_lock_init(&tree->lock); 1249 zswap_trees[type] = tree; 1250 } 1251 1252 static struct frontswap_ops zswap_frontswap_ops = { 1253 .store = zswap_frontswap_store, 1254 .load = zswap_frontswap_load, 1255 .invalidate_page = zswap_frontswap_invalidate_page, 1256 .invalidate_area = zswap_frontswap_invalidate_area, 1257 .init = zswap_frontswap_init 1258 }; 1259 1260 /********************************* 1261 * debugfs functions 1262 **********************************/ 1263 #ifdef CONFIG_DEBUG_FS 1264 #include <linux/debugfs.h> 1265 1266 static struct dentry *zswap_debugfs_root; 1267 1268 static int __init zswap_debugfs_init(void) 1269 { 1270 if (!debugfs_initialized()) 1271 return -ENODEV; 1272 1273 zswap_debugfs_root = debugfs_create_dir("zswap", NULL); 1274 1275 debugfs_create_u64("pool_limit_hit", 0444, 1276 zswap_debugfs_root, &zswap_pool_limit_hit); 1277 debugfs_create_u64("reject_reclaim_fail", 0444, 1278 zswap_debugfs_root, &zswap_reject_reclaim_fail); 1279 debugfs_create_u64("reject_alloc_fail", 0444, 1280 zswap_debugfs_root, &zswap_reject_alloc_fail); 1281 debugfs_create_u64("reject_kmemcache_fail", 0444, 1282 zswap_debugfs_root, &zswap_reject_kmemcache_fail); 1283 debugfs_create_u64("reject_compress_poor", 0444, 1284 zswap_debugfs_root, &zswap_reject_compress_poor); 1285 debugfs_create_u64("written_back_pages", 0444, 1286 zswap_debugfs_root, &zswap_written_back_pages); 1287 debugfs_create_u64("duplicate_entry", 0444, 1288 zswap_debugfs_root, &zswap_duplicate_entry); 1289 debugfs_create_u64("pool_total_size", 0444, 1290 zswap_debugfs_root, &zswap_pool_total_size); 1291 debugfs_create_atomic_t("stored_pages", 0444, 1292 zswap_debugfs_root, &zswap_stored_pages); 1293 debugfs_create_atomic_t("same_filled_pages", 0444, 1294 zswap_debugfs_root, &zswap_same_filled_pages); 1295 1296 return 0; 1297 } 1298 1299 static void __exit zswap_debugfs_exit(void) 1300 { 1301 debugfs_remove_recursive(zswap_debugfs_root); 1302 } 1303 #else 1304 static int __init zswap_debugfs_init(void) 1305 { 1306 return 0; 1307 } 1308 1309 static void __exit zswap_debugfs_exit(void) { } 1310 #endif 1311 1312 /********************************* 1313 * module init and exit 1314 **********************************/ 1315 static int __init init_zswap(void) 1316 { 1317 struct zswap_pool *pool; 1318 int ret; 1319 1320 zswap_init_started = true; 1321 1322 if (zswap_entry_cache_create()) { 1323 pr_err("entry cache creation failed\n"); 1324 goto cache_fail; 1325 } 1326 1327 ret = cpuhp_setup_state(CPUHP_MM_ZSWP_MEM_PREPARE, "mm/zswap:prepare", 1328 zswap_dstmem_prepare, zswap_dstmem_dead); 1329 if (ret) { 1330 pr_err("dstmem alloc failed\n"); 1331 goto dstmem_fail; 1332 } 1333 1334 ret = cpuhp_setup_state_multi(CPUHP_MM_ZSWP_POOL_PREPARE, 1335 "mm/zswap_pool:prepare", 1336 zswap_cpu_comp_prepare, 1337 zswap_cpu_comp_dead); 1338 if (ret) 1339 goto hp_fail; 1340 1341 pool = __zswap_pool_create_fallback(); 1342 if (pool) { 1343 pr_info("loaded using pool %s/%s\n", pool->tfm_name, 1344 zpool_get_type(pool->zpool)); 1345 list_add(&pool->list, &zswap_pools); 1346 zswap_has_pool = true; 1347 } else { 1348 pr_err("pool creation failed\n"); 1349 zswap_enabled = false; 1350 } 1351 1352 shrink_wq = create_workqueue("zswap-shrink"); 1353 if (!shrink_wq) 1354 goto fallback_fail; 1355 1356 frontswap_register_ops(&zswap_frontswap_ops); 1357 if (zswap_debugfs_init()) 1358 pr_warn("debugfs initialization failed\n"); 1359 return 0; 1360 1361 fallback_fail: 1362 if (pool) 1363 zswap_pool_destroy(pool); 1364 hp_fail: 1365 cpuhp_remove_state(CPUHP_MM_ZSWP_MEM_PREPARE); 1366 dstmem_fail: 1367 zswap_entry_cache_destroy(); 1368 cache_fail: 1369 /* if built-in, we aren't unloaded on failure; don't allow use */ 1370 zswap_init_failed = true; 1371 zswap_enabled = false; 1372 return -ENOMEM; 1373 } 1374 /* must be late so crypto has time to come up */ 1375 late_initcall(init_zswap); 1376 1377 MODULE_LICENSE("GPL"); 1378 MODULE_AUTHOR("Seth Jennings <sjennings@variantweb.net>"); 1379 MODULE_DESCRIPTION("Compressed cache for swap pages"); 1380