1 // SPDX-License-Identifier: GPL-2.0-or-later 2 /* 3 * zswap.c - zswap driver file 4 * 5 * zswap is a backend for frontswap that takes pages that are in the process 6 * of being swapped out and attempts to compress and store them in a 7 * RAM-based memory pool. This can result in a significant I/O reduction on 8 * the swap device and, in the case where decompressing from RAM is faster 9 * than reading from the swap device, can also improve workload performance. 10 * 11 * Copyright (C) 2012 Seth Jennings <sjenning@linux.vnet.ibm.com> 12 */ 13 14 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt 15 16 #include <linux/module.h> 17 #include <linux/cpu.h> 18 #include <linux/highmem.h> 19 #include <linux/slab.h> 20 #include <linux/spinlock.h> 21 #include <linux/types.h> 22 #include <linux/atomic.h> 23 #include <linux/frontswap.h> 24 #include <linux/rbtree.h> 25 #include <linux/swap.h> 26 #include <linux/crypto.h> 27 #include <linux/mempool.h> 28 #include <linux/zpool.h> 29 30 #include <linux/mm_types.h> 31 #include <linux/page-flags.h> 32 #include <linux/swapops.h> 33 #include <linux/writeback.h> 34 #include <linux/pagemap.h> 35 #include <linux/workqueue.h> 36 37 /********************************* 38 * statistics 39 **********************************/ 40 /* Total bytes used by the compressed storage */ 41 static u64 zswap_pool_total_size; 42 /* The number of compressed pages currently stored in zswap */ 43 static atomic_t zswap_stored_pages = ATOMIC_INIT(0); 44 /* The number of same-value filled pages currently stored in zswap */ 45 static atomic_t zswap_same_filled_pages = ATOMIC_INIT(0); 46 47 /* 48 * The statistics below are not protected from concurrent access for 49 * performance reasons so they may not be a 100% accurate. However, 50 * they do provide useful information on roughly how many times a 51 * certain event is occurring. 52 */ 53 54 /* Pool limit was hit (see zswap_max_pool_percent) */ 55 static u64 zswap_pool_limit_hit; 56 /* Pages written back when pool limit was reached */ 57 static u64 zswap_written_back_pages; 58 /* Store failed due to a reclaim failure after pool limit was reached */ 59 static u64 zswap_reject_reclaim_fail; 60 /* Compressed page was too big for the allocator to (optimally) store */ 61 static u64 zswap_reject_compress_poor; 62 /* Store failed because underlying allocator could not get memory */ 63 static u64 zswap_reject_alloc_fail; 64 /* Store failed because the entry metadata could not be allocated (rare) */ 65 static u64 zswap_reject_kmemcache_fail; 66 /* Duplicate store was encountered (rare) */ 67 static u64 zswap_duplicate_entry; 68 69 /* Shrinker work queue */ 70 static struct workqueue_struct *shrink_wq; 71 /* Pool limit was hit, we need to calm down */ 72 static bool zswap_pool_reached_full; 73 74 /********************************* 75 * tunables 76 **********************************/ 77 78 #define ZSWAP_PARAM_UNSET "" 79 80 /* Enable/disable zswap */ 81 static bool zswap_enabled = IS_ENABLED(CONFIG_ZSWAP_DEFAULT_ON); 82 static int zswap_enabled_param_set(const char *, 83 const struct kernel_param *); 84 static struct kernel_param_ops zswap_enabled_param_ops = { 85 .set = zswap_enabled_param_set, 86 .get = param_get_bool, 87 }; 88 module_param_cb(enabled, &zswap_enabled_param_ops, &zswap_enabled, 0644); 89 90 /* Crypto compressor to use */ 91 static char *zswap_compressor = CONFIG_ZSWAP_COMPRESSOR_DEFAULT; 92 static int zswap_compressor_param_set(const char *, 93 const struct kernel_param *); 94 static struct kernel_param_ops zswap_compressor_param_ops = { 95 .set = zswap_compressor_param_set, 96 .get = param_get_charp, 97 .free = param_free_charp, 98 }; 99 module_param_cb(compressor, &zswap_compressor_param_ops, 100 &zswap_compressor, 0644); 101 102 /* Compressed storage zpool to use */ 103 static char *zswap_zpool_type = CONFIG_ZSWAP_ZPOOL_DEFAULT; 104 static int zswap_zpool_param_set(const char *, const struct kernel_param *); 105 static struct kernel_param_ops zswap_zpool_param_ops = { 106 .set = zswap_zpool_param_set, 107 .get = param_get_charp, 108 .free = param_free_charp, 109 }; 110 module_param_cb(zpool, &zswap_zpool_param_ops, &zswap_zpool_type, 0644); 111 112 /* The maximum percentage of memory that the compressed pool can occupy */ 113 static unsigned int zswap_max_pool_percent = 20; 114 module_param_named(max_pool_percent, zswap_max_pool_percent, uint, 0644); 115 116 /* The threshold for accepting new pages after the max_pool_percent was hit */ 117 static unsigned int zswap_accept_thr_percent = 90; /* of max pool size */ 118 module_param_named(accept_threshold_percent, zswap_accept_thr_percent, 119 uint, 0644); 120 121 /* Enable/disable handling same-value filled pages (enabled by default) */ 122 static bool zswap_same_filled_pages_enabled = true; 123 module_param_named(same_filled_pages_enabled, zswap_same_filled_pages_enabled, 124 bool, 0644); 125 126 /********************************* 127 * data structures 128 **********************************/ 129 130 struct zswap_pool { 131 struct zpool *zpool; 132 struct crypto_comp * __percpu *tfm; 133 struct kref kref; 134 struct list_head list; 135 struct work_struct release_work; 136 struct work_struct shrink_work; 137 struct hlist_node node; 138 char tfm_name[CRYPTO_MAX_ALG_NAME]; 139 }; 140 141 /* 142 * struct zswap_entry 143 * 144 * This structure contains the metadata for tracking a single compressed 145 * page within zswap. 146 * 147 * rbnode - links the entry into red-black tree for the appropriate swap type 148 * offset - the swap offset for the entry. Index into the red-black tree. 149 * refcount - the number of outstanding reference to the entry. This is needed 150 * to protect against premature freeing of the entry by code 151 * concurrent calls to load, invalidate, and writeback. The lock 152 * for the zswap_tree structure that contains the entry must 153 * be held while changing the refcount. Since the lock must 154 * be held, there is no reason to also make refcount atomic. 155 * length - the length in bytes of the compressed page data. Needed during 156 * decompression. For a same value filled page length is 0. 157 * pool - the zswap_pool the entry's data is in 158 * handle - zpool allocation handle that stores the compressed page data 159 * value - value of the same-value filled pages which have same content 160 */ 161 struct zswap_entry { 162 struct rb_node rbnode; 163 pgoff_t offset; 164 int refcount; 165 unsigned int length; 166 struct zswap_pool *pool; 167 union { 168 unsigned long handle; 169 unsigned long value; 170 }; 171 }; 172 173 struct zswap_header { 174 swp_entry_t swpentry; 175 }; 176 177 /* 178 * The tree lock in the zswap_tree struct protects a few things: 179 * - the rbtree 180 * - the refcount field of each entry in the tree 181 */ 182 struct zswap_tree { 183 struct rb_root rbroot; 184 spinlock_t lock; 185 }; 186 187 static struct zswap_tree *zswap_trees[MAX_SWAPFILES]; 188 189 /* RCU-protected iteration */ 190 static LIST_HEAD(zswap_pools); 191 /* protects zswap_pools list modification */ 192 static DEFINE_SPINLOCK(zswap_pools_lock); 193 /* pool counter to provide unique names to zpool */ 194 static atomic_t zswap_pools_count = ATOMIC_INIT(0); 195 196 /* used by param callback function */ 197 static bool zswap_init_started; 198 199 /* fatal error during init */ 200 static bool zswap_init_failed; 201 202 /* init completed, but couldn't create the initial pool */ 203 static bool zswap_has_pool; 204 205 /********************************* 206 * helpers and fwd declarations 207 **********************************/ 208 209 #define zswap_pool_debug(msg, p) \ 210 pr_debug("%s pool %s/%s\n", msg, (p)->tfm_name, \ 211 zpool_get_type((p)->zpool)) 212 213 static int zswap_writeback_entry(struct zpool *pool, unsigned long handle); 214 static int zswap_pool_get(struct zswap_pool *pool); 215 static void zswap_pool_put(struct zswap_pool *pool); 216 217 static const struct zpool_ops zswap_zpool_ops = { 218 .evict = zswap_writeback_entry 219 }; 220 221 static bool zswap_is_full(void) 222 { 223 return totalram_pages() * zswap_max_pool_percent / 100 < 224 DIV_ROUND_UP(zswap_pool_total_size, PAGE_SIZE); 225 } 226 227 static bool zswap_can_accept(void) 228 { 229 return totalram_pages() * zswap_accept_thr_percent / 100 * 230 zswap_max_pool_percent / 100 > 231 DIV_ROUND_UP(zswap_pool_total_size, PAGE_SIZE); 232 } 233 234 static void zswap_update_total_size(void) 235 { 236 struct zswap_pool *pool; 237 u64 total = 0; 238 239 rcu_read_lock(); 240 241 list_for_each_entry_rcu(pool, &zswap_pools, list) 242 total += zpool_get_total_size(pool->zpool); 243 244 rcu_read_unlock(); 245 246 zswap_pool_total_size = total; 247 } 248 249 /********************************* 250 * zswap entry functions 251 **********************************/ 252 static struct kmem_cache *zswap_entry_cache; 253 254 static int __init zswap_entry_cache_create(void) 255 { 256 zswap_entry_cache = KMEM_CACHE(zswap_entry, 0); 257 return zswap_entry_cache == NULL; 258 } 259 260 static void __init zswap_entry_cache_destroy(void) 261 { 262 kmem_cache_destroy(zswap_entry_cache); 263 } 264 265 static struct zswap_entry *zswap_entry_cache_alloc(gfp_t gfp) 266 { 267 struct zswap_entry *entry; 268 entry = kmem_cache_alloc(zswap_entry_cache, gfp); 269 if (!entry) 270 return NULL; 271 entry->refcount = 1; 272 RB_CLEAR_NODE(&entry->rbnode); 273 return entry; 274 } 275 276 static void zswap_entry_cache_free(struct zswap_entry *entry) 277 { 278 kmem_cache_free(zswap_entry_cache, entry); 279 } 280 281 /********************************* 282 * rbtree functions 283 **********************************/ 284 static struct zswap_entry *zswap_rb_search(struct rb_root *root, pgoff_t offset) 285 { 286 struct rb_node *node = root->rb_node; 287 struct zswap_entry *entry; 288 289 while (node) { 290 entry = rb_entry(node, struct zswap_entry, rbnode); 291 if (entry->offset > offset) 292 node = node->rb_left; 293 else if (entry->offset < offset) 294 node = node->rb_right; 295 else 296 return entry; 297 } 298 return NULL; 299 } 300 301 /* 302 * In the case that a entry with the same offset is found, a pointer to 303 * the existing entry is stored in dupentry and the function returns -EEXIST 304 */ 305 static int zswap_rb_insert(struct rb_root *root, struct zswap_entry *entry, 306 struct zswap_entry **dupentry) 307 { 308 struct rb_node **link = &root->rb_node, *parent = NULL; 309 struct zswap_entry *myentry; 310 311 while (*link) { 312 parent = *link; 313 myentry = rb_entry(parent, struct zswap_entry, rbnode); 314 if (myentry->offset > entry->offset) 315 link = &(*link)->rb_left; 316 else if (myentry->offset < entry->offset) 317 link = &(*link)->rb_right; 318 else { 319 *dupentry = myentry; 320 return -EEXIST; 321 } 322 } 323 rb_link_node(&entry->rbnode, parent, link); 324 rb_insert_color(&entry->rbnode, root); 325 return 0; 326 } 327 328 static void zswap_rb_erase(struct rb_root *root, struct zswap_entry *entry) 329 { 330 if (!RB_EMPTY_NODE(&entry->rbnode)) { 331 rb_erase(&entry->rbnode, root); 332 RB_CLEAR_NODE(&entry->rbnode); 333 } 334 } 335 336 /* 337 * Carries out the common pattern of freeing and entry's zpool allocation, 338 * freeing the entry itself, and decrementing the number of stored pages. 339 */ 340 static void zswap_free_entry(struct zswap_entry *entry) 341 { 342 if (!entry->length) 343 atomic_dec(&zswap_same_filled_pages); 344 else { 345 zpool_free(entry->pool->zpool, entry->handle); 346 zswap_pool_put(entry->pool); 347 } 348 zswap_entry_cache_free(entry); 349 atomic_dec(&zswap_stored_pages); 350 zswap_update_total_size(); 351 } 352 353 /* caller must hold the tree lock */ 354 static void zswap_entry_get(struct zswap_entry *entry) 355 { 356 entry->refcount++; 357 } 358 359 /* caller must hold the tree lock 360 * remove from the tree and free it, if nobody reference the entry 361 */ 362 static void zswap_entry_put(struct zswap_tree *tree, 363 struct zswap_entry *entry) 364 { 365 int refcount = --entry->refcount; 366 367 BUG_ON(refcount < 0); 368 if (refcount == 0) { 369 zswap_rb_erase(&tree->rbroot, entry); 370 zswap_free_entry(entry); 371 } 372 } 373 374 /* caller must hold the tree lock */ 375 static struct zswap_entry *zswap_entry_find_get(struct rb_root *root, 376 pgoff_t offset) 377 { 378 struct zswap_entry *entry; 379 380 entry = zswap_rb_search(root, offset); 381 if (entry) 382 zswap_entry_get(entry); 383 384 return entry; 385 } 386 387 /********************************* 388 * per-cpu code 389 **********************************/ 390 static DEFINE_PER_CPU(u8 *, zswap_dstmem); 391 392 static int zswap_dstmem_prepare(unsigned int cpu) 393 { 394 u8 *dst; 395 396 dst = kmalloc_node(PAGE_SIZE * 2, GFP_KERNEL, cpu_to_node(cpu)); 397 if (!dst) 398 return -ENOMEM; 399 400 per_cpu(zswap_dstmem, cpu) = dst; 401 return 0; 402 } 403 404 static int zswap_dstmem_dead(unsigned int cpu) 405 { 406 u8 *dst; 407 408 dst = per_cpu(zswap_dstmem, cpu); 409 kfree(dst); 410 per_cpu(zswap_dstmem, cpu) = NULL; 411 412 return 0; 413 } 414 415 static int zswap_cpu_comp_prepare(unsigned int cpu, struct hlist_node *node) 416 { 417 struct zswap_pool *pool = hlist_entry(node, struct zswap_pool, node); 418 struct crypto_comp *tfm; 419 420 if (WARN_ON(*per_cpu_ptr(pool->tfm, cpu))) 421 return 0; 422 423 tfm = crypto_alloc_comp(pool->tfm_name, 0, 0); 424 if (IS_ERR_OR_NULL(tfm)) { 425 pr_err("could not alloc crypto comp %s : %ld\n", 426 pool->tfm_name, PTR_ERR(tfm)); 427 return -ENOMEM; 428 } 429 *per_cpu_ptr(pool->tfm, cpu) = tfm; 430 return 0; 431 } 432 433 static int zswap_cpu_comp_dead(unsigned int cpu, struct hlist_node *node) 434 { 435 struct zswap_pool *pool = hlist_entry(node, struct zswap_pool, node); 436 struct crypto_comp *tfm; 437 438 tfm = *per_cpu_ptr(pool->tfm, cpu); 439 if (!IS_ERR_OR_NULL(tfm)) 440 crypto_free_comp(tfm); 441 *per_cpu_ptr(pool->tfm, cpu) = NULL; 442 return 0; 443 } 444 445 /********************************* 446 * pool functions 447 **********************************/ 448 449 static struct zswap_pool *__zswap_pool_current(void) 450 { 451 struct zswap_pool *pool; 452 453 pool = list_first_or_null_rcu(&zswap_pools, typeof(*pool), list); 454 WARN_ONCE(!pool && zswap_has_pool, 455 "%s: no page storage pool!\n", __func__); 456 457 return pool; 458 } 459 460 static struct zswap_pool *zswap_pool_current(void) 461 { 462 assert_spin_locked(&zswap_pools_lock); 463 464 return __zswap_pool_current(); 465 } 466 467 static struct zswap_pool *zswap_pool_current_get(void) 468 { 469 struct zswap_pool *pool; 470 471 rcu_read_lock(); 472 473 pool = __zswap_pool_current(); 474 if (!zswap_pool_get(pool)) 475 pool = NULL; 476 477 rcu_read_unlock(); 478 479 return pool; 480 } 481 482 static struct zswap_pool *zswap_pool_last_get(void) 483 { 484 struct zswap_pool *pool, *last = NULL; 485 486 rcu_read_lock(); 487 488 list_for_each_entry_rcu(pool, &zswap_pools, list) 489 last = pool; 490 WARN_ONCE(!last && zswap_has_pool, 491 "%s: no page storage pool!\n", __func__); 492 if (!zswap_pool_get(last)) 493 last = NULL; 494 495 rcu_read_unlock(); 496 497 return last; 498 } 499 500 /* type and compressor must be null-terminated */ 501 static struct zswap_pool *zswap_pool_find_get(char *type, char *compressor) 502 { 503 struct zswap_pool *pool; 504 505 assert_spin_locked(&zswap_pools_lock); 506 507 list_for_each_entry_rcu(pool, &zswap_pools, list) { 508 if (strcmp(pool->tfm_name, compressor)) 509 continue; 510 if (strcmp(zpool_get_type(pool->zpool), type)) 511 continue; 512 /* if we can't get it, it's about to be destroyed */ 513 if (!zswap_pool_get(pool)) 514 continue; 515 return pool; 516 } 517 518 return NULL; 519 } 520 521 static void shrink_worker(struct work_struct *w) 522 { 523 struct zswap_pool *pool = container_of(w, typeof(*pool), 524 shrink_work); 525 526 if (zpool_shrink(pool->zpool, 1, NULL)) 527 zswap_reject_reclaim_fail++; 528 zswap_pool_put(pool); 529 } 530 531 static struct zswap_pool *zswap_pool_create(char *type, char *compressor) 532 { 533 struct zswap_pool *pool; 534 char name[38]; /* 'zswap' + 32 char (max) num + \0 */ 535 gfp_t gfp = __GFP_NORETRY | __GFP_NOWARN | __GFP_KSWAPD_RECLAIM; 536 int ret; 537 538 if (!zswap_has_pool) { 539 /* if either are unset, pool initialization failed, and we 540 * need both params to be set correctly before trying to 541 * create a pool. 542 */ 543 if (!strcmp(type, ZSWAP_PARAM_UNSET)) 544 return NULL; 545 if (!strcmp(compressor, ZSWAP_PARAM_UNSET)) 546 return NULL; 547 } 548 549 pool = kzalloc(sizeof(*pool), GFP_KERNEL); 550 if (!pool) 551 return NULL; 552 553 /* unique name for each pool specifically required by zsmalloc */ 554 snprintf(name, 38, "zswap%x", atomic_inc_return(&zswap_pools_count)); 555 556 pool->zpool = zpool_create_pool(type, name, gfp, &zswap_zpool_ops); 557 if (!pool->zpool) { 558 pr_err("%s zpool not available\n", type); 559 goto error; 560 } 561 pr_debug("using %s zpool\n", zpool_get_type(pool->zpool)); 562 563 strlcpy(pool->tfm_name, compressor, sizeof(pool->tfm_name)); 564 pool->tfm = alloc_percpu(struct crypto_comp *); 565 if (!pool->tfm) { 566 pr_err("percpu alloc failed\n"); 567 goto error; 568 } 569 570 ret = cpuhp_state_add_instance(CPUHP_MM_ZSWP_POOL_PREPARE, 571 &pool->node); 572 if (ret) 573 goto error; 574 pr_debug("using %s compressor\n", pool->tfm_name); 575 576 /* being the current pool takes 1 ref; this func expects the 577 * caller to always add the new pool as the current pool 578 */ 579 kref_init(&pool->kref); 580 INIT_LIST_HEAD(&pool->list); 581 INIT_WORK(&pool->shrink_work, shrink_worker); 582 583 zswap_pool_debug("created", pool); 584 585 return pool; 586 587 error: 588 free_percpu(pool->tfm); 589 if (pool->zpool) 590 zpool_destroy_pool(pool->zpool); 591 kfree(pool); 592 return NULL; 593 } 594 595 static __init struct zswap_pool *__zswap_pool_create_fallback(void) 596 { 597 bool has_comp, has_zpool; 598 599 has_comp = crypto_has_comp(zswap_compressor, 0, 0); 600 if (!has_comp && strcmp(zswap_compressor, 601 CONFIG_ZSWAP_COMPRESSOR_DEFAULT)) { 602 pr_err("compressor %s not available, using default %s\n", 603 zswap_compressor, CONFIG_ZSWAP_COMPRESSOR_DEFAULT); 604 param_free_charp(&zswap_compressor); 605 zswap_compressor = CONFIG_ZSWAP_COMPRESSOR_DEFAULT; 606 has_comp = crypto_has_comp(zswap_compressor, 0, 0); 607 } 608 if (!has_comp) { 609 pr_err("default compressor %s not available\n", 610 zswap_compressor); 611 param_free_charp(&zswap_compressor); 612 zswap_compressor = ZSWAP_PARAM_UNSET; 613 } 614 615 has_zpool = zpool_has_pool(zswap_zpool_type); 616 if (!has_zpool && strcmp(zswap_zpool_type, 617 CONFIG_ZSWAP_ZPOOL_DEFAULT)) { 618 pr_err("zpool %s not available, using default %s\n", 619 zswap_zpool_type, CONFIG_ZSWAP_ZPOOL_DEFAULT); 620 param_free_charp(&zswap_zpool_type); 621 zswap_zpool_type = CONFIG_ZSWAP_ZPOOL_DEFAULT; 622 has_zpool = zpool_has_pool(zswap_zpool_type); 623 } 624 if (!has_zpool) { 625 pr_err("default zpool %s not available\n", 626 zswap_zpool_type); 627 param_free_charp(&zswap_zpool_type); 628 zswap_zpool_type = ZSWAP_PARAM_UNSET; 629 } 630 631 if (!has_comp || !has_zpool) 632 return NULL; 633 634 return zswap_pool_create(zswap_zpool_type, zswap_compressor); 635 } 636 637 static void zswap_pool_destroy(struct zswap_pool *pool) 638 { 639 zswap_pool_debug("destroying", pool); 640 641 cpuhp_state_remove_instance(CPUHP_MM_ZSWP_POOL_PREPARE, &pool->node); 642 free_percpu(pool->tfm); 643 zpool_destroy_pool(pool->zpool); 644 kfree(pool); 645 } 646 647 static int __must_check zswap_pool_get(struct zswap_pool *pool) 648 { 649 if (!pool) 650 return 0; 651 652 return kref_get_unless_zero(&pool->kref); 653 } 654 655 static void __zswap_pool_release(struct work_struct *work) 656 { 657 struct zswap_pool *pool = container_of(work, typeof(*pool), 658 release_work); 659 660 synchronize_rcu(); 661 662 /* nobody should have been able to get a kref... */ 663 WARN_ON(kref_get_unless_zero(&pool->kref)); 664 665 /* pool is now off zswap_pools list and has no references. */ 666 zswap_pool_destroy(pool); 667 } 668 669 static void __zswap_pool_empty(struct kref *kref) 670 { 671 struct zswap_pool *pool; 672 673 pool = container_of(kref, typeof(*pool), kref); 674 675 spin_lock(&zswap_pools_lock); 676 677 WARN_ON(pool == zswap_pool_current()); 678 679 list_del_rcu(&pool->list); 680 681 INIT_WORK(&pool->release_work, __zswap_pool_release); 682 schedule_work(&pool->release_work); 683 684 spin_unlock(&zswap_pools_lock); 685 } 686 687 static void zswap_pool_put(struct zswap_pool *pool) 688 { 689 kref_put(&pool->kref, __zswap_pool_empty); 690 } 691 692 /********************************* 693 * param callbacks 694 **********************************/ 695 696 /* val must be a null-terminated string */ 697 static int __zswap_param_set(const char *val, const struct kernel_param *kp, 698 char *type, char *compressor) 699 { 700 struct zswap_pool *pool, *put_pool = NULL; 701 char *s = strstrip((char *)val); 702 int ret; 703 704 if (zswap_init_failed) { 705 pr_err("can't set param, initialization failed\n"); 706 return -ENODEV; 707 } 708 709 /* no change required */ 710 if (!strcmp(s, *(char **)kp->arg) && zswap_has_pool) 711 return 0; 712 713 /* if this is load-time (pre-init) param setting, 714 * don't create a pool; that's done during init. 715 */ 716 if (!zswap_init_started) 717 return param_set_charp(s, kp); 718 719 if (!type) { 720 if (!zpool_has_pool(s)) { 721 pr_err("zpool %s not available\n", s); 722 return -ENOENT; 723 } 724 type = s; 725 } else if (!compressor) { 726 if (!crypto_has_comp(s, 0, 0)) { 727 pr_err("compressor %s not available\n", s); 728 return -ENOENT; 729 } 730 compressor = s; 731 } else { 732 WARN_ON(1); 733 return -EINVAL; 734 } 735 736 spin_lock(&zswap_pools_lock); 737 738 pool = zswap_pool_find_get(type, compressor); 739 if (pool) { 740 zswap_pool_debug("using existing", pool); 741 WARN_ON(pool == zswap_pool_current()); 742 list_del_rcu(&pool->list); 743 } 744 745 spin_unlock(&zswap_pools_lock); 746 747 if (!pool) 748 pool = zswap_pool_create(type, compressor); 749 750 if (pool) 751 ret = param_set_charp(s, kp); 752 else 753 ret = -EINVAL; 754 755 spin_lock(&zswap_pools_lock); 756 757 if (!ret) { 758 put_pool = zswap_pool_current(); 759 list_add_rcu(&pool->list, &zswap_pools); 760 zswap_has_pool = true; 761 } else if (pool) { 762 /* add the possibly pre-existing pool to the end of the pools 763 * list; if it's new (and empty) then it'll be removed and 764 * destroyed by the put after we drop the lock 765 */ 766 list_add_tail_rcu(&pool->list, &zswap_pools); 767 put_pool = pool; 768 } 769 770 spin_unlock(&zswap_pools_lock); 771 772 if (!zswap_has_pool && !pool) { 773 /* if initial pool creation failed, and this pool creation also 774 * failed, maybe both compressor and zpool params were bad. 775 * Allow changing this param, so pool creation will succeed 776 * when the other param is changed. We already verified this 777 * param is ok in the zpool_has_pool() or crypto_has_comp() 778 * checks above. 779 */ 780 ret = param_set_charp(s, kp); 781 } 782 783 /* drop the ref from either the old current pool, 784 * or the new pool we failed to add 785 */ 786 if (put_pool) 787 zswap_pool_put(put_pool); 788 789 return ret; 790 } 791 792 static int zswap_compressor_param_set(const char *val, 793 const struct kernel_param *kp) 794 { 795 return __zswap_param_set(val, kp, zswap_zpool_type, NULL); 796 } 797 798 static int zswap_zpool_param_set(const char *val, 799 const struct kernel_param *kp) 800 { 801 return __zswap_param_set(val, kp, NULL, zswap_compressor); 802 } 803 804 static int zswap_enabled_param_set(const char *val, 805 const struct kernel_param *kp) 806 { 807 if (zswap_init_failed) { 808 pr_err("can't enable, initialization failed\n"); 809 return -ENODEV; 810 } 811 if (!zswap_has_pool && zswap_init_started) { 812 pr_err("can't enable, no pool configured\n"); 813 return -ENODEV; 814 } 815 816 return param_set_bool(val, kp); 817 } 818 819 /********************************* 820 * writeback code 821 **********************************/ 822 /* return enum for zswap_get_swap_cache_page */ 823 enum zswap_get_swap_ret { 824 ZSWAP_SWAPCACHE_NEW, 825 ZSWAP_SWAPCACHE_EXIST, 826 ZSWAP_SWAPCACHE_FAIL, 827 }; 828 829 /* 830 * zswap_get_swap_cache_page 831 * 832 * This is an adaption of read_swap_cache_async() 833 * 834 * This function tries to find a page with the given swap entry 835 * in the swapper_space address space (the swap cache). If the page 836 * is found, it is returned in retpage. Otherwise, a page is allocated, 837 * added to the swap cache, and returned in retpage. 838 * 839 * If success, the swap cache page is returned in retpage 840 * Returns ZSWAP_SWAPCACHE_EXIST if page was already in the swap cache 841 * Returns ZSWAP_SWAPCACHE_NEW if the new page needs to be populated, 842 * the new page is added to swapcache and locked 843 * Returns ZSWAP_SWAPCACHE_FAIL on error 844 */ 845 static int zswap_get_swap_cache_page(swp_entry_t entry, 846 struct page **retpage) 847 { 848 bool page_was_allocated; 849 850 *retpage = __read_swap_cache_async(entry, GFP_KERNEL, 851 NULL, 0, &page_was_allocated); 852 if (page_was_allocated) 853 return ZSWAP_SWAPCACHE_NEW; 854 if (!*retpage) 855 return ZSWAP_SWAPCACHE_FAIL; 856 return ZSWAP_SWAPCACHE_EXIST; 857 } 858 859 /* 860 * Attempts to free an entry by adding a page to the swap cache, 861 * decompressing the entry data into the page, and issuing a 862 * bio write to write the page back to the swap device. 863 * 864 * This can be thought of as a "resumed writeback" of the page 865 * to the swap device. We are basically resuming the same swap 866 * writeback path that was intercepted with the frontswap_store() 867 * in the first place. After the page has been decompressed into 868 * the swap cache, the compressed version stored by zswap can be 869 * freed. 870 */ 871 static int zswap_writeback_entry(struct zpool *pool, unsigned long handle) 872 { 873 struct zswap_header *zhdr; 874 swp_entry_t swpentry; 875 struct zswap_tree *tree; 876 pgoff_t offset; 877 struct zswap_entry *entry; 878 struct page *page; 879 struct crypto_comp *tfm; 880 u8 *src, *dst; 881 unsigned int dlen; 882 int ret; 883 struct writeback_control wbc = { 884 .sync_mode = WB_SYNC_NONE, 885 }; 886 887 /* extract swpentry from data */ 888 zhdr = zpool_map_handle(pool, handle, ZPOOL_MM_RO); 889 swpentry = zhdr->swpentry; /* here */ 890 tree = zswap_trees[swp_type(swpentry)]; 891 offset = swp_offset(swpentry); 892 893 /* find and ref zswap entry */ 894 spin_lock(&tree->lock); 895 entry = zswap_entry_find_get(&tree->rbroot, offset); 896 if (!entry) { 897 /* entry was invalidated */ 898 spin_unlock(&tree->lock); 899 zpool_unmap_handle(pool, handle); 900 return 0; 901 } 902 spin_unlock(&tree->lock); 903 BUG_ON(offset != entry->offset); 904 905 /* try to allocate swap cache page */ 906 switch (zswap_get_swap_cache_page(swpentry, &page)) { 907 case ZSWAP_SWAPCACHE_FAIL: /* no memory or invalidate happened */ 908 ret = -ENOMEM; 909 goto fail; 910 911 case ZSWAP_SWAPCACHE_EXIST: 912 /* page is already in the swap cache, ignore for now */ 913 put_page(page); 914 ret = -EEXIST; 915 goto fail; 916 917 case ZSWAP_SWAPCACHE_NEW: /* page is locked */ 918 /* decompress */ 919 dlen = PAGE_SIZE; 920 src = (u8 *)zhdr + sizeof(struct zswap_header); 921 dst = kmap_atomic(page); 922 tfm = *get_cpu_ptr(entry->pool->tfm); 923 ret = crypto_comp_decompress(tfm, src, entry->length, 924 dst, &dlen); 925 put_cpu_ptr(entry->pool->tfm); 926 kunmap_atomic(dst); 927 BUG_ON(ret); 928 BUG_ON(dlen != PAGE_SIZE); 929 930 /* page is up to date */ 931 SetPageUptodate(page); 932 } 933 934 /* move it to the tail of the inactive list after end_writeback */ 935 SetPageReclaim(page); 936 937 /* start writeback */ 938 __swap_writepage(page, &wbc, end_swap_bio_write); 939 put_page(page); 940 zswap_written_back_pages++; 941 942 spin_lock(&tree->lock); 943 /* drop local reference */ 944 zswap_entry_put(tree, entry); 945 946 /* 947 * There are two possible situations for entry here: 948 * (1) refcount is 1(normal case), entry is valid and on the tree 949 * (2) refcount is 0, entry is freed and not on the tree 950 * because invalidate happened during writeback 951 * search the tree and free the entry if find entry 952 */ 953 if (entry == zswap_rb_search(&tree->rbroot, offset)) 954 zswap_entry_put(tree, entry); 955 spin_unlock(&tree->lock); 956 957 goto end; 958 959 /* 960 * if we get here due to ZSWAP_SWAPCACHE_EXIST 961 * a load may happening concurrently 962 * it is safe and okay to not free the entry 963 * if we free the entry in the following put 964 * it it either okay to return !0 965 */ 966 fail: 967 spin_lock(&tree->lock); 968 zswap_entry_put(tree, entry); 969 spin_unlock(&tree->lock); 970 971 end: 972 zpool_unmap_handle(pool, handle); 973 return ret; 974 } 975 976 static int zswap_is_page_same_filled(void *ptr, unsigned long *value) 977 { 978 unsigned int pos; 979 unsigned long *page; 980 981 page = (unsigned long *)ptr; 982 for (pos = 1; pos < PAGE_SIZE / sizeof(*page); pos++) { 983 if (page[pos] != page[0]) 984 return 0; 985 } 986 *value = page[0]; 987 return 1; 988 } 989 990 static void zswap_fill_page(void *ptr, unsigned long value) 991 { 992 unsigned long *page; 993 994 page = (unsigned long *)ptr; 995 memset_l(page, value, PAGE_SIZE / sizeof(unsigned long)); 996 } 997 998 /********************************* 999 * frontswap hooks 1000 **********************************/ 1001 /* attempts to compress and store an single page */ 1002 static int zswap_frontswap_store(unsigned type, pgoff_t offset, 1003 struct page *page) 1004 { 1005 struct zswap_tree *tree = zswap_trees[type]; 1006 struct zswap_entry *entry, *dupentry; 1007 struct crypto_comp *tfm; 1008 int ret; 1009 unsigned int hlen, dlen = PAGE_SIZE; 1010 unsigned long handle, value; 1011 char *buf; 1012 u8 *src, *dst; 1013 struct zswap_header zhdr = { .swpentry = swp_entry(type, offset) }; 1014 gfp_t gfp; 1015 1016 /* THP isn't supported */ 1017 if (PageTransHuge(page)) { 1018 ret = -EINVAL; 1019 goto reject; 1020 } 1021 1022 if (!zswap_enabled || !tree) { 1023 ret = -ENODEV; 1024 goto reject; 1025 } 1026 1027 /* reclaim space if needed */ 1028 if (zswap_is_full()) { 1029 struct zswap_pool *pool; 1030 1031 zswap_pool_limit_hit++; 1032 zswap_pool_reached_full = true; 1033 pool = zswap_pool_last_get(); 1034 if (pool) 1035 queue_work(shrink_wq, &pool->shrink_work); 1036 ret = -ENOMEM; 1037 goto reject; 1038 } 1039 1040 if (zswap_pool_reached_full) { 1041 if (!zswap_can_accept()) { 1042 ret = -ENOMEM; 1043 goto reject; 1044 } else 1045 zswap_pool_reached_full = false; 1046 } 1047 1048 /* allocate entry */ 1049 entry = zswap_entry_cache_alloc(GFP_KERNEL); 1050 if (!entry) { 1051 zswap_reject_kmemcache_fail++; 1052 ret = -ENOMEM; 1053 goto reject; 1054 } 1055 1056 if (zswap_same_filled_pages_enabled) { 1057 src = kmap_atomic(page); 1058 if (zswap_is_page_same_filled(src, &value)) { 1059 kunmap_atomic(src); 1060 entry->offset = offset; 1061 entry->length = 0; 1062 entry->value = value; 1063 atomic_inc(&zswap_same_filled_pages); 1064 goto insert_entry; 1065 } 1066 kunmap_atomic(src); 1067 } 1068 1069 /* if entry is successfully added, it keeps the reference */ 1070 entry->pool = zswap_pool_current_get(); 1071 if (!entry->pool) { 1072 ret = -EINVAL; 1073 goto freepage; 1074 } 1075 1076 /* compress */ 1077 dst = get_cpu_var(zswap_dstmem); 1078 tfm = *get_cpu_ptr(entry->pool->tfm); 1079 src = kmap_atomic(page); 1080 ret = crypto_comp_compress(tfm, src, PAGE_SIZE, dst, &dlen); 1081 kunmap_atomic(src); 1082 put_cpu_ptr(entry->pool->tfm); 1083 if (ret) { 1084 ret = -EINVAL; 1085 goto put_dstmem; 1086 } 1087 1088 /* store */ 1089 hlen = zpool_evictable(entry->pool->zpool) ? sizeof(zhdr) : 0; 1090 gfp = __GFP_NORETRY | __GFP_NOWARN | __GFP_KSWAPD_RECLAIM; 1091 if (zpool_malloc_support_movable(entry->pool->zpool)) 1092 gfp |= __GFP_HIGHMEM | __GFP_MOVABLE; 1093 ret = zpool_malloc(entry->pool->zpool, hlen + dlen, gfp, &handle); 1094 if (ret == -ENOSPC) { 1095 zswap_reject_compress_poor++; 1096 goto put_dstmem; 1097 } 1098 if (ret) { 1099 zswap_reject_alloc_fail++; 1100 goto put_dstmem; 1101 } 1102 buf = zpool_map_handle(entry->pool->zpool, handle, ZPOOL_MM_RW); 1103 memcpy(buf, &zhdr, hlen); 1104 memcpy(buf + hlen, dst, dlen); 1105 zpool_unmap_handle(entry->pool->zpool, handle); 1106 put_cpu_var(zswap_dstmem); 1107 1108 /* populate entry */ 1109 entry->offset = offset; 1110 entry->handle = handle; 1111 entry->length = dlen; 1112 1113 insert_entry: 1114 /* map */ 1115 spin_lock(&tree->lock); 1116 do { 1117 ret = zswap_rb_insert(&tree->rbroot, entry, &dupentry); 1118 if (ret == -EEXIST) { 1119 zswap_duplicate_entry++; 1120 /* remove from rbtree */ 1121 zswap_rb_erase(&tree->rbroot, dupentry); 1122 zswap_entry_put(tree, dupentry); 1123 } 1124 } while (ret == -EEXIST); 1125 spin_unlock(&tree->lock); 1126 1127 /* update stats */ 1128 atomic_inc(&zswap_stored_pages); 1129 zswap_update_total_size(); 1130 1131 return 0; 1132 1133 put_dstmem: 1134 put_cpu_var(zswap_dstmem); 1135 zswap_pool_put(entry->pool); 1136 freepage: 1137 zswap_entry_cache_free(entry); 1138 reject: 1139 return ret; 1140 } 1141 1142 /* 1143 * returns 0 if the page was successfully decompressed 1144 * return -1 on entry not found or error 1145 */ 1146 static int zswap_frontswap_load(unsigned type, pgoff_t offset, 1147 struct page *page) 1148 { 1149 struct zswap_tree *tree = zswap_trees[type]; 1150 struct zswap_entry *entry; 1151 struct crypto_comp *tfm; 1152 u8 *src, *dst; 1153 unsigned int dlen; 1154 int ret; 1155 1156 /* find */ 1157 spin_lock(&tree->lock); 1158 entry = zswap_entry_find_get(&tree->rbroot, offset); 1159 if (!entry) { 1160 /* entry was written back */ 1161 spin_unlock(&tree->lock); 1162 return -1; 1163 } 1164 spin_unlock(&tree->lock); 1165 1166 if (!entry->length) { 1167 dst = kmap_atomic(page); 1168 zswap_fill_page(dst, entry->value); 1169 kunmap_atomic(dst); 1170 goto freeentry; 1171 } 1172 1173 /* decompress */ 1174 dlen = PAGE_SIZE; 1175 src = zpool_map_handle(entry->pool->zpool, entry->handle, ZPOOL_MM_RO); 1176 if (zpool_evictable(entry->pool->zpool)) 1177 src += sizeof(struct zswap_header); 1178 dst = kmap_atomic(page); 1179 tfm = *get_cpu_ptr(entry->pool->tfm); 1180 ret = crypto_comp_decompress(tfm, src, entry->length, dst, &dlen); 1181 put_cpu_ptr(entry->pool->tfm); 1182 kunmap_atomic(dst); 1183 zpool_unmap_handle(entry->pool->zpool, entry->handle); 1184 BUG_ON(ret); 1185 1186 freeentry: 1187 spin_lock(&tree->lock); 1188 zswap_entry_put(tree, entry); 1189 spin_unlock(&tree->lock); 1190 1191 return 0; 1192 } 1193 1194 /* frees an entry in zswap */ 1195 static void zswap_frontswap_invalidate_page(unsigned type, pgoff_t offset) 1196 { 1197 struct zswap_tree *tree = zswap_trees[type]; 1198 struct zswap_entry *entry; 1199 1200 /* find */ 1201 spin_lock(&tree->lock); 1202 entry = zswap_rb_search(&tree->rbroot, offset); 1203 if (!entry) { 1204 /* entry was written back */ 1205 spin_unlock(&tree->lock); 1206 return; 1207 } 1208 1209 /* remove from rbtree */ 1210 zswap_rb_erase(&tree->rbroot, entry); 1211 1212 /* drop the initial reference from entry creation */ 1213 zswap_entry_put(tree, entry); 1214 1215 spin_unlock(&tree->lock); 1216 } 1217 1218 /* frees all zswap entries for the given swap type */ 1219 static void zswap_frontswap_invalidate_area(unsigned type) 1220 { 1221 struct zswap_tree *tree = zswap_trees[type]; 1222 struct zswap_entry *entry, *n; 1223 1224 if (!tree) 1225 return; 1226 1227 /* walk the tree and free everything */ 1228 spin_lock(&tree->lock); 1229 rbtree_postorder_for_each_entry_safe(entry, n, &tree->rbroot, rbnode) 1230 zswap_free_entry(entry); 1231 tree->rbroot = RB_ROOT; 1232 spin_unlock(&tree->lock); 1233 kfree(tree); 1234 zswap_trees[type] = NULL; 1235 } 1236 1237 static void zswap_frontswap_init(unsigned type) 1238 { 1239 struct zswap_tree *tree; 1240 1241 tree = kzalloc(sizeof(*tree), GFP_KERNEL); 1242 if (!tree) { 1243 pr_err("alloc failed, zswap disabled for swap type %d\n", type); 1244 return; 1245 } 1246 1247 tree->rbroot = RB_ROOT; 1248 spin_lock_init(&tree->lock); 1249 zswap_trees[type] = tree; 1250 } 1251 1252 static struct frontswap_ops zswap_frontswap_ops = { 1253 .store = zswap_frontswap_store, 1254 .load = zswap_frontswap_load, 1255 .invalidate_page = zswap_frontswap_invalidate_page, 1256 .invalidate_area = zswap_frontswap_invalidate_area, 1257 .init = zswap_frontswap_init 1258 }; 1259 1260 /********************************* 1261 * debugfs functions 1262 **********************************/ 1263 #ifdef CONFIG_DEBUG_FS 1264 #include <linux/debugfs.h> 1265 1266 static struct dentry *zswap_debugfs_root; 1267 1268 static int __init zswap_debugfs_init(void) 1269 { 1270 if (!debugfs_initialized()) 1271 return -ENODEV; 1272 1273 zswap_debugfs_root = debugfs_create_dir("zswap", NULL); 1274 1275 debugfs_create_u64("pool_limit_hit", 0444, 1276 zswap_debugfs_root, &zswap_pool_limit_hit); 1277 debugfs_create_u64("reject_reclaim_fail", 0444, 1278 zswap_debugfs_root, &zswap_reject_reclaim_fail); 1279 debugfs_create_u64("reject_alloc_fail", 0444, 1280 zswap_debugfs_root, &zswap_reject_alloc_fail); 1281 debugfs_create_u64("reject_kmemcache_fail", 0444, 1282 zswap_debugfs_root, &zswap_reject_kmemcache_fail); 1283 debugfs_create_u64("reject_compress_poor", 0444, 1284 zswap_debugfs_root, &zswap_reject_compress_poor); 1285 debugfs_create_u64("written_back_pages", 0444, 1286 zswap_debugfs_root, &zswap_written_back_pages); 1287 debugfs_create_u64("duplicate_entry", 0444, 1288 zswap_debugfs_root, &zswap_duplicate_entry); 1289 debugfs_create_u64("pool_total_size", 0444, 1290 zswap_debugfs_root, &zswap_pool_total_size); 1291 debugfs_create_atomic_t("stored_pages", 0444, 1292 zswap_debugfs_root, &zswap_stored_pages); 1293 debugfs_create_atomic_t("same_filled_pages", 0444, 1294 zswap_debugfs_root, &zswap_same_filled_pages); 1295 1296 return 0; 1297 } 1298 1299 static void __exit zswap_debugfs_exit(void) 1300 { 1301 debugfs_remove_recursive(zswap_debugfs_root); 1302 } 1303 #else 1304 static int __init zswap_debugfs_init(void) 1305 { 1306 return 0; 1307 } 1308 1309 static void __exit zswap_debugfs_exit(void) { } 1310 #endif 1311 1312 /********************************* 1313 * module init and exit 1314 **********************************/ 1315 static int __init init_zswap(void) 1316 { 1317 struct zswap_pool *pool; 1318 int ret; 1319 1320 zswap_init_started = true; 1321 1322 if (zswap_entry_cache_create()) { 1323 pr_err("entry cache creation failed\n"); 1324 goto cache_fail; 1325 } 1326 1327 ret = cpuhp_setup_state(CPUHP_MM_ZSWP_MEM_PREPARE, "mm/zswap:prepare", 1328 zswap_dstmem_prepare, zswap_dstmem_dead); 1329 if (ret) { 1330 pr_err("dstmem alloc failed\n"); 1331 goto dstmem_fail; 1332 } 1333 1334 ret = cpuhp_setup_state_multi(CPUHP_MM_ZSWP_POOL_PREPARE, 1335 "mm/zswap_pool:prepare", 1336 zswap_cpu_comp_prepare, 1337 zswap_cpu_comp_dead); 1338 if (ret) 1339 goto hp_fail; 1340 1341 pool = __zswap_pool_create_fallback(); 1342 if (pool) { 1343 pr_info("loaded using pool %s/%s\n", pool->tfm_name, 1344 zpool_get_type(pool->zpool)); 1345 list_add(&pool->list, &zswap_pools); 1346 zswap_has_pool = true; 1347 } else { 1348 pr_err("pool creation failed\n"); 1349 zswap_enabled = false; 1350 } 1351 1352 shrink_wq = create_workqueue("zswap-shrink"); 1353 if (!shrink_wq) 1354 goto fallback_fail; 1355 1356 frontswap_register_ops(&zswap_frontswap_ops); 1357 if (zswap_debugfs_init()) 1358 pr_warn("debugfs initialization failed\n"); 1359 return 0; 1360 1361 fallback_fail: 1362 if (pool) 1363 zswap_pool_destroy(pool); 1364 hp_fail: 1365 cpuhp_remove_state(CPUHP_MM_ZSWP_MEM_PREPARE); 1366 dstmem_fail: 1367 zswap_entry_cache_destroy(); 1368 cache_fail: 1369 /* if built-in, we aren't unloaded on failure; don't allow use */ 1370 zswap_init_failed = true; 1371 zswap_enabled = false; 1372 return -ENOMEM; 1373 } 1374 /* must be late so crypto has time to come up */ 1375 late_initcall(init_zswap); 1376 1377 MODULE_LICENSE("GPL"); 1378 MODULE_AUTHOR("Seth Jennings <sjennings@variantweb.net>"); 1379 MODULE_DESCRIPTION("Compressed cache for swap pages"); 1380