/*
 * zswap.c - zswap driver file
 *
 * zswap is a backend for frontswap that takes pages that are in the process
 * of being swapped out and attempts to compress and store them in a
 * RAM-based memory pool.  This can result in a significant I/O reduction on
 * the swap device and, in the case where decompressing from RAM is faster
 * than reading from the swap device, can also improve workload performance.
 *
 * Copyright (C) 2012  Seth Jennings <sjenning@linux.vnet.ibm.com>
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version 2
 * of the License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/module.h>
#include <linux/cpu.h>
#include <linux/highmem.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/types.h>
#include <linux/atomic.h>
#include <linux/frontswap.h>
#include <linux/rbtree.h>
#include <linux/swap.h>
#include <linux/crypto.h>
#include <linux/mempool.h>
#include <linux/zpool.h>

#include <linux/mm_types.h>
#include <linux/page-flags.h>
#include <linux/swapops.h>
#include <linux/writeback.h>
#include <linux/pagemap.h>

/*********************************
* statistics
**********************************/
/* Total bytes used by the compressed storage */
static u64 zswap_pool_total_size;
/* The number of compressed pages currently stored in zswap */
static atomic_t zswap_stored_pages = ATOMIC_INIT(0);

/*
 * The statistics below are not protected from concurrent access for
 * performance reasons so they may not be 100% accurate.  However,
 * they do provide useful information on roughly how many times a
 * certain event is occurring.
 */

/* Pool limit was hit (see zswap_max_pool_percent) */
static u64 zswap_pool_limit_hit;
/* Pages written back when pool limit was reached */
static u64 zswap_written_back_pages;
/* Store failed due to a reclaim failure after pool limit was reached */
static u64 zswap_reject_reclaim_fail;
/* Compressed page was too big for the allocator to (optimally) store */
static u64 zswap_reject_compress_poor;
/* Store failed because underlying allocator could not get memory */
static u64 zswap_reject_alloc_fail;
/* Store failed because the entry metadata could not be allocated (rare) */
static u64 zswap_reject_kmemcache_fail;
/* Duplicate store was encountered (rare) */
static u64 zswap_duplicate_entry;

/*********************************
* tunables
**********************************/

#define ZSWAP_PARAM_UNSET ""

/* Enable/disable zswap (disabled by default) */
static bool zswap_enabled;
static int zswap_enabled_param_set(const char *,
				   const struct kernel_param *);
static struct kernel_param_ops zswap_enabled_param_ops = {
	.set = zswap_enabled_param_set,
	.get = param_get_bool,
};
module_param_cb(enabled, &zswap_enabled_param_ops, &zswap_enabled, 0644);

/* Crypto compressor to use */
#define ZSWAP_COMPRESSOR_DEFAULT "lzo"
static char *zswap_compressor = ZSWAP_COMPRESSOR_DEFAULT;
static int zswap_compressor_param_set(const char *,
				      const struct kernel_param *);
static struct kernel_param_ops zswap_compressor_param_ops = {
	.set = zswap_compressor_param_set,
	.get = param_get_charp,
	.free = param_free_charp,
};
module_param_cb(compressor, &zswap_compressor_param_ops,
		&zswap_compressor, 0644);

/* Compressed storage zpool to use */
#define ZSWAP_ZPOOL_DEFAULT "zbud"
static char *zswap_zpool_type = ZSWAP_ZPOOL_DEFAULT;
static int zswap_zpool_param_set(const char *, const struct kernel_param *);
static struct kernel_param_ops zswap_zpool_param_ops = {
	.set = zswap_zpool_param_set,
	.get = param_get_charp,
	.free = param_free_charp,
};
module_param_cb(zpool, &zswap_zpool_param_ops, &zswap_zpool_type, 0644);

/* The maximum percentage of memory that the compressed pool can occupy */
static unsigned int zswap_max_pool_percent = 20;
module_param_named(max_pool_percent, zswap_max_pool_percent, uint, 0644);
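
/*
 * Illustrative example (not part of the driver): because the parameters
 * above are registered with mode 0644, they can typically be changed at
 * runtime through sysfs, e.g.:
 *
 *   echo 1    > /sys/module/zswap/parameters/enabled
 *   echo lz4  > /sys/module/zswap/parameters/compressor
 *   echo zbud > /sys/module/zswap/parameters/zpool
 *   echo 25   > /sys/module/zswap/parameters/max_pool_percent
 *
 * or set at boot on the kernel command line (zswap.enabled=1 and so on).
 * The compressor and zpool names must be known to the crypto API and the
 * zpool layer respectively; "lz4" here is only an example and may not be
 * available in every kernel configuration.
 */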

/*********************************
* data structures
**********************************/

struct zswap_pool {
	struct zpool *zpool;
	struct crypto_comp * __percpu *tfm;
	struct kref kref;
	struct list_head list;
	struct work_struct work;
	struct hlist_node node;
	char tfm_name[CRYPTO_MAX_ALG_NAME];
};

/*
 * struct zswap_entry
 *
 * This structure contains the metadata for tracking a single compressed
 * page within zswap.
 *
 * rbnode - links the entry into red-black tree for the appropriate swap type
 * offset - the swap offset for the entry.  Index into the red-black tree.
 * refcount - the number of outstanding references to the entry. This is
 *            needed to protect against premature freeing of the entry by
 *            concurrent calls to load, invalidate, and writeback.  The lock
 *            for the zswap_tree structure that contains the entry must
 *            be held while changing the refcount.  Since the lock must
 *            be held, there is no reason to also make refcount atomic.
 * length - the length in bytes of the compressed page data.  Needed during
 *          decompression
 * pool - the zswap_pool the entry's data is in
 * handle - zpool allocation handle that stores the compressed page data
 */
struct zswap_entry {
	struct rb_node rbnode;
	pgoff_t offset;
	int refcount;
	unsigned int length;
	struct zswap_pool *pool;
	unsigned long handle;
};

struct zswap_header {
	swp_entry_t swpentry;
};

/*
 * The tree lock in the zswap_tree struct protects a few things:
 * - the rbtree
 * - the refcount field of each entry in the tree
 */
struct zswap_tree {
	struct rb_root rbroot;
	spinlock_t lock;
};

static struct zswap_tree *zswap_trees[MAX_SWAPFILES];

/* RCU-protected iteration */
static LIST_HEAD(zswap_pools);
/* protects zswap_pools list modification */
static DEFINE_SPINLOCK(zswap_pools_lock);
/* pool counter to provide unique names to zpool */
static atomic_t zswap_pools_count = ATOMIC_INIT(0);

/* used by param callback function */
static bool zswap_init_started;

/* fatal error during init */
static bool zswap_init_failed;

/* init completed, but couldn't create the initial pool */
static bool zswap_has_pool;

/*********************************
* helpers and fwd declarations
**********************************/

#define zswap_pool_debug(msg, p)				\
	pr_debug("%s pool %s/%s\n", msg, (p)->tfm_name,		\
		 zpool_get_type((p)->zpool))

static int zswap_writeback_entry(struct zpool *pool, unsigned long handle);
static int zswap_pool_get(struct zswap_pool *pool);
static void zswap_pool_put(struct zswap_pool *pool);

static const struct zpool_ops zswap_zpool_ops = {
	.evict = zswap_writeback_entry
};

static bool zswap_is_full(void)
{
	return totalram_pages * zswap_max_pool_percent / 100 <
		DIV_ROUND_UP(zswap_pool_total_size, PAGE_SIZE);
}
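
/*
 * Worked example with illustrative numbers (not part of the driver): on a
 * machine with 2,097,152 4 KiB pages (8 GiB) and the default
 * max_pool_percent of 20, the limit above evaluates to
 * 2097152 * 20 / 100 = 419430 pages, so zswap_is_full() reports the pool
 * as full once the compressed storage occupies more than 419430 pages,
 * roughly 1.6 GiB of RAM.
 */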

static void zswap_update_total_size(void)
{
	struct zswap_pool *pool;
	u64 total = 0;

	rcu_read_lock();

	list_for_each_entry_rcu(pool, &zswap_pools, list)
		total += zpool_get_total_size(pool->zpool);

	rcu_read_unlock();

	zswap_pool_total_size = total;
}

/*********************************
* zswap entry functions
**********************************/
static struct kmem_cache *zswap_entry_cache;

static int __init zswap_entry_cache_create(void)
{
	zswap_entry_cache = KMEM_CACHE(zswap_entry, 0);
	return zswap_entry_cache == NULL;
}

static void __init zswap_entry_cache_destroy(void)
{
	kmem_cache_destroy(zswap_entry_cache);
}

static struct zswap_entry *zswap_entry_cache_alloc(gfp_t gfp)
{
	struct zswap_entry *entry;
	entry = kmem_cache_alloc(zswap_entry_cache, gfp);
	if (!entry)
		return NULL;
	entry->refcount = 1;
	RB_CLEAR_NODE(&entry->rbnode);
	return entry;
}

static void zswap_entry_cache_free(struct zswap_entry *entry)
{
	kmem_cache_free(zswap_entry_cache, entry);
}

/*********************************
* rbtree functions
**********************************/
static struct zswap_entry *zswap_rb_search(struct rb_root *root, pgoff_t offset)
{
	struct rb_node *node = root->rb_node;
	struct zswap_entry *entry;

	while (node) {
		entry = rb_entry(node, struct zswap_entry, rbnode);
		if (entry->offset > offset)
			node = node->rb_left;
		else if (entry->offset < offset)
			node = node->rb_right;
		else
			return entry;
	}
	return NULL;
}

/*
 * In the case that an entry with the same offset is found, a pointer to
 * the existing entry is stored in dupentry and the function returns -EEXIST
 */
static int zswap_rb_insert(struct rb_root *root, struct zswap_entry *entry,
			struct zswap_entry **dupentry)
{
	struct rb_node **link = &root->rb_node, *parent = NULL;
	struct zswap_entry *myentry;

	while (*link) {
		parent = *link;
		myentry = rb_entry(parent, struct zswap_entry, rbnode);
		if (myentry->offset > entry->offset)
			link = &(*link)->rb_left;
		else if (myentry->offset < entry->offset)
			link = &(*link)->rb_right;
		else {
			*dupentry = myentry;
			return -EEXIST;
		}
	}
	rb_link_node(&entry->rbnode, parent, link);
	rb_insert_color(&entry->rbnode, root);
	return 0;
}

static void zswap_rb_erase(struct rb_root *root, struct zswap_entry *entry)
{
	if (!RB_EMPTY_NODE(&entry->rbnode)) {
		rb_erase(&entry->rbnode, root);
		RB_CLEAR_NODE(&entry->rbnode);
	}
}

/*
 * Carries out the common pattern of freeing an entry's zpool allocation,
 * freeing the entry itself, and decrementing the number of stored pages.
 */
static void zswap_free_entry(struct zswap_entry *entry)
{
	zpool_free(entry->pool->zpool, entry->handle);
	zswap_pool_put(entry->pool);
	zswap_entry_cache_free(entry);
	atomic_dec(&zswap_stored_pages);
	zswap_update_total_size();
}

/* caller must hold the tree lock */
static void zswap_entry_get(struct zswap_entry *entry)
{
	entry->refcount++;
}

/* caller must hold the tree lock
 * remove from the tree and free it, if nobody references the entry
 */
static void zswap_entry_put(struct zswap_tree *tree,
			struct zswap_entry *entry)
{
	int refcount = --entry->refcount;

	BUG_ON(refcount < 0);
	if (refcount == 0) {
		zswap_rb_erase(&tree->rbroot, entry);
		zswap_free_entry(entry);
	}
}

/* caller must hold the tree lock */
static struct zswap_entry *zswap_entry_find_get(struct rb_root *root,
				pgoff_t offset)
{
	struct zswap_entry *entry;

	entry = zswap_rb_search(root, offset);
	if (entry)
		zswap_entry_get(entry);

	return entry;
}
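
/*
 * Typical lookup pattern, as used by the load and writeback paths below
 * (shown here only as an illustration of how the helpers above compose):
 *
 *	spin_lock(&tree->lock);
 *	entry = zswap_entry_find_get(&tree->rbroot, offset);
 *	spin_unlock(&tree->lock);
 *	... use entry->handle, entry->length, etc. ...
 *	spin_lock(&tree->lock);
 *	zswap_entry_put(tree, entry);
 *	spin_unlock(&tree->lock);
 *
 * The reference taken by zswap_entry_find_get() keeps the entry alive even
 * if a concurrent invalidate removes it from the tree in between.
 */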

/*********************************
* per-cpu code
**********************************/
static DEFINE_PER_CPU(u8 *, zswap_dstmem);

static int zswap_dstmem_prepare(unsigned int cpu)
{
	u8 *dst;

	dst = kmalloc_node(PAGE_SIZE * 2, GFP_KERNEL, cpu_to_node(cpu));
	if (!dst)
		return -ENOMEM;

	per_cpu(zswap_dstmem, cpu) = dst;
	return 0;
}

static int zswap_dstmem_dead(unsigned int cpu)
{
	u8 *dst;

	dst = per_cpu(zswap_dstmem, cpu);
	kfree(dst);
	per_cpu(zswap_dstmem, cpu) = NULL;

	return 0;
}

static int zswap_cpu_comp_prepare(unsigned int cpu, struct hlist_node *node)
{
	struct zswap_pool *pool = hlist_entry(node, struct zswap_pool, node);
	struct crypto_comp *tfm;

	if (WARN_ON(*per_cpu_ptr(pool->tfm, cpu)))
		return 0;

	tfm = crypto_alloc_comp(pool->tfm_name, 0, 0);
	if (IS_ERR_OR_NULL(tfm)) {
		pr_err("could not alloc crypto comp %s : %ld\n",
		       pool->tfm_name, PTR_ERR(tfm));
		return -ENOMEM;
	}
	*per_cpu_ptr(pool->tfm, cpu) = tfm;
	return 0;
}

static int zswap_cpu_comp_dead(unsigned int cpu, struct hlist_node *node)
{
	struct zswap_pool *pool = hlist_entry(node, struct zswap_pool, node);
	struct crypto_comp *tfm;

	tfm = *per_cpu_ptr(pool->tfm, cpu);
	if (!IS_ERR_OR_NULL(tfm))
		crypto_free_comp(tfm);
	*per_cpu_ptr(pool->tfm, cpu) = NULL;
	return 0;
}

/*********************************
* pool functions
**********************************/

static struct zswap_pool *__zswap_pool_current(void)
{
	struct zswap_pool *pool;

	pool = list_first_or_null_rcu(&zswap_pools, typeof(*pool), list);
	WARN_ONCE(!pool && zswap_has_pool,
		  "%s: no page storage pool!\n", __func__);

	return pool;
}

static struct zswap_pool *zswap_pool_current(void)
{
	assert_spin_locked(&zswap_pools_lock);

	return __zswap_pool_current();
}

static struct zswap_pool *zswap_pool_current_get(void)
{
	struct zswap_pool *pool;

	rcu_read_lock();

	pool = __zswap_pool_current();
	if (!zswap_pool_get(pool))
		pool = NULL;

	rcu_read_unlock();

	return pool;
}

static struct zswap_pool *zswap_pool_last_get(void)
{
	struct zswap_pool *pool, *last = NULL;

	rcu_read_lock();

	list_for_each_entry_rcu(pool, &zswap_pools, list)
		last = pool;
	WARN_ONCE(!last && zswap_has_pool,
		  "%s: no page storage pool!\n", __func__);
	if (!zswap_pool_get(last))
		last = NULL;

	rcu_read_unlock();

	return last;
}
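
/*
 * Note on list ordering (descriptive only): zswap_pools is kept with the
 * current pool at the head (new pools are added with list_add_rcu() in the
 * param callbacks and in init), so __zswap_pool_current() takes the first
 * element, while zswap_pool_last_get() walks to the last element, i.e. the
 * oldest pool.  zswap_shrink() below reclaims from that oldest pool first.
 */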

/* type and compressor must be null-terminated */
static struct zswap_pool *zswap_pool_find_get(char *type, char *compressor)
{
	struct zswap_pool *pool;

	assert_spin_locked(&zswap_pools_lock);

	list_for_each_entry_rcu(pool, &zswap_pools, list) {
		if (strcmp(pool->tfm_name, compressor))
			continue;
		if (strcmp(zpool_get_type(pool->zpool), type))
			continue;
		/* if we can't get it, it's about to be destroyed */
		if (!zswap_pool_get(pool))
			continue;
		return pool;
	}

	return NULL;
}

static struct zswap_pool *zswap_pool_create(char *type, char *compressor)
{
	struct zswap_pool *pool;
	char name[38]; /* 'zswap' + 32 char (max) num + \0 */
	gfp_t gfp = __GFP_NORETRY | __GFP_NOWARN | __GFP_KSWAPD_RECLAIM;
	int ret;

	if (!zswap_has_pool) {
		/* if either are unset, pool initialization failed, and we
		 * need both params to be set correctly before trying to
		 * create a pool.
		 */
		if (!strcmp(type, ZSWAP_PARAM_UNSET))
			return NULL;
		if (!strcmp(compressor, ZSWAP_PARAM_UNSET))
			return NULL;
	}

	pool = kzalloc(sizeof(*pool), GFP_KERNEL);
	if (!pool)
		return NULL;

	/* unique name for each pool specifically required by zsmalloc */
	snprintf(name, 38, "zswap%x", atomic_inc_return(&zswap_pools_count));

	pool->zpool = zpool_create_pool(type, name, gfp, &zswap_zpool_ops);
	if (!pool->zpool) {
		pr_err("%s zpool not available\n", type);
		goto error;
	}
	pr_debug("using %s zpool\n", zpool_get_type(pool->zpool));

	strlcpy(pool->tfm_name, compressor, sizeof(pool->tfm_name));
	pool->tfm = alloc_percpu(struct crypto_comp *);
	if (!pool->tfm) {
		pr_err("percpu alloc failed\n");
		goto error;
	}

	ret = cpuhp_state_add_instance(CPUHP_MM_ZSWP_POOL_PREPARE,
				       &pool->node);
	if (ret)
		goto error;
	pr_debug("using %s compressor\n", pool->tfm_name);

	/* being the current pool takes 1 ref; this func expects the
	 * caller to always add the new pool as the current pool
	 */
	kref_init(&pool->kref);
	INIT_LIST_HEAD(&pool->list);

	zswap_pool_debug("created", pool);

	return pool;

error:
	free_percpu(pool->tfm);
	if (pool->zpool)
		zpool_destroy_pool(pool->zpool);
	kfree(pool);
	return NULL;
}

static __init struct zswap_pool *__zswap_pool_create_fallback(void)
{
	bool has_comp, has_zpool;

	has_comp = crypto_has_comp(zswap_compressor, 0, 0);
	if (!has_comp && strcmp(zswap_compressor, ZSWAP_COMPRESSOR_DEFAULT)) {
		pr_err("compressor %s not available, using default %s\n",
		       zswap_compressor, ZSWAP_COMPRESSOR_DEFAULT);
		param_free_charp(&zswap_compressor);
		zswap_compressor = ZSWAP_COMPRESSOR_DEFAULT;
		has_comp = crypto_has_comp(zswap_compressor, 0, 0);
	}
	if (!has_comp) {
		pr_err("default compressor %s not available\n",
		       zswap_compressor);
		param_free_charp(&zswap_compressor);
		zswap_compressor = ZSWAP_PARAM_UNSET;
	}

	has_zpool = zpool_has_pool(zswap_zpool_type);
	if (!has_zpool && strcmp(zswap_zpool_type, ZSWAP_ZPOOL_DEFAULT)) {
		pr_err("zpool %s not available, using default %s\n",
		       zswap_zpool_type, ZSWAP_ZPOOL_DEFAULT);
		param_free_charp(&zswap_zpool_type);
		zswap_zpool_type = ZSWAP_ZPOOL_DEFAULT;
		has_zpool = zpool_has_pool(zswap_zpool_type);
	}
	if (!has_zpool) {
		pr_err("default zpool %s not available\n",
		       zswap_zpool_type);
		param_free_charp(&zswap_zpool_type);
		zswap_zpool_type = ZSWAP_PARAM_UNSET;
	}

	if (!has_comp || !has_zpool)
		return NULL;

	return zswap_pool_create(zswap_zpool_type, zswap_compressor);
}

static void zswap_pool_destroy(struct zswap_pool *pool)
{
	zswap_pool_debug("destroying", pool);

	cpuhp_state_remove_instance(CPUHP_MM_ZSWP_POOL_PREPARE, &pool->node);
	free_percpu(pool->tfm);
	zpool_destroy_pool(pool->zpool);
	kfree(pool);
}

static int __must_check zswap_pool_get(struct zswap_pool *pool)
{
	if (!pool)
		return 0;

	return kref_get_unless_zero(&pool->kref);
}

static void __zswap_pool_release(struct work_struct *work)
{
	struct zswap_pool *pool = container_of(work, typeof(*pool), work);

	synchronize_rcu();

	/* nobody should have been able to get a kref... */
	WARN_ON(kref_get_unless_zero(&pool->kref));

	/* pool is now off zswap_pools list and has no references. */
	zswap_pool_destroy(pool);
}

static void __zswap_pool_empty(struct kref *kref)
{
	struct zswap_pool *pool;

	pool = container_of(kref, typeof(*pool), kref);

	spin_lock(&zswap_pools_lock);

	WARN_ON(pool == zswap_pool_current());

	list_del_rcu(&pool->list);

	INIT_WORK(&pool->work, __zswap_pool_release);
	schedule_work(&pool->work);

	spin_unlock(&zswap_pools_lock);
}

static void zswap_pool_put(struct zswap_pool *pool)
{
	kref_put(&pool->kref, __zswap_pool_empty);
}

/*********************************
* param callbacks
**********************************/

/* val must be a null-terminated string */
static int __zswap_param_set(const char *val, const struct kernel_param *kp,
			     char *type, char *compressor)
{
	struct zswap_pool *pool, *put_pool = NULL;
	char *s = strstrip((char *)val);
	int ret;

	if (zswap_init_failed) {
		pr_err("can't set param, initialization failed\n");
		return -ENODEV;
	}

	/* no change required */
	if (!strcmp(s, *(char **)kp->arg) && zswap_has_pool)
		return 0;

	/* if this is load-time (pre-init) param setting,
	 * don't create a pool; that's done during init.
	 */
	if (!zswap_init_started)
		return param_set_charp(s, kp);

	if (!type) {
		if (!zpool_has_pool(s)) {
			pr_err("zpool %s not available\n", s);
			return -ENOENT;
		}
		type = s;
	} else if (!compressor) {
		if (!crypto_has_comp(s, 0, 0)) {
			pr_err("compressor %s not available\n", s);
			return -ENOENT;
		}
		compressor = s;
	} else {
		WARN_ON(1);
		return -EINVAL;
	}

	spin_lock(&zswap_pools_lock);

	pool = zswap_pool_find_get(type, compressor);
	if (pool) {
		zswap_pool_debug("using existing", pool);
		WARN_ON(pool == zswap_pool_current());
		list_del_rcu(&pool->list);
	}

	spin_unlock(&zswap_pools_lock);

	if (!pool)
		pool = zswap_pool_create(type, compressor);

	if (pool)
		ret = param_set_charp(s, kp);
	else
		ret = -EINVAL;

	spin_lock(&zswap_pools_lock);

	if (!ret) {
		put_pool = zswap_pool_current();
		list_add_rcu(&pool->list, &zswap_pools);
		zswap_has_pool = true;
	} else if (pool) {
		/* add the possibly pre-existing pool to the end of the pools
		 * list; if it's new (and empty) then it'll be removed and
		 * destroyed by the put after we drop the lock
		 */
		list_add_tail_rcu(&pool->list, &zswap_pools);
		put_pool = pool;
	}

	spin_unlock(&zswap_pools_lock);

	if (!zswap_has_pool && !pool) {
		/* if initial pool creation failed, and this pool creation also
		 * failed, maybe both compressor and zpool params were bad.
		 * Allow changing this param, so pool creation will succeed
		 * when the other param is changed. We already verified this
		 * param is ok in the zpool_has_pool() or crypto_has_comp()
		 * checks above.
		 */
		ret = param_set_charp(s, kp);
	}

	/* drop the ref from either the old current pool,
	 * or the new pool we failed to add
	 */
	if (put_pool)
		zswap_pool_put(put_pool);

	return ret;
}
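
/*
 * Descriptive note on runtime switching: when a new compressor or zpool is
 * written via the callbacks below, a matching pool is looked up or created
 * and placed at the head of zswap_pools, making it the current pool for new
 * stores.  The previous current pool keeps serving loads for pages it
 * already holds; it is destroyed only after its last reference (held by its
 * remaining entries) is dropped.
 */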

static int zswap_compressor_param_set(const char *val,
				      const struct kernel_param *kp)
{
	return __zswap_param_set(val, kp, zswap_zpool_type, NULL);
}

static int zswap_zpool_param_set(const char *val,
				 const struct kernel_param *kp)
{
	return __zswap_param_set(val, kp, NULL, zswap_compressor);
}

static int zswap_enabled_param_set(const char *val,
				   const struct kernel_param *kp)
{
	if (zswap_init_failed) {
		pr_err("can't enable, initialization failed\n");
		return -ENODEV;
	}
	if (!zswap_has_pool && zswap_init_started) {
		pr_err("can't enable, no pool configured\n");
		return -ENODEV;
	}

	return param_set_bool(val, kp);
}

/*********************************
* writeback code
**********************************/
/* return enum for zswap_get_swap_cache_page */
enum zswap_get_swap_ret {
	ZSWAP_SWAPCACHE_NEW,
	ZSWAP_SWAPCACHE_EXIST,
	ZSWAP_SWAPCACHE_FAIL,
};

/*
 * zswap_get_swap_cache_page
 *
 * This is an adaptation of read_swap_cache_async()
 *
 * This function tries to find a page with the given swap entry
 * in the swapper_space address space (the swap cache).  If the page
 * is found, it is returned in retpage.  Otherwise, a page is allocated,
 * added to the swap cache, and returned in retpage.
 *
 * On success, the swap cache page is returned in retpage
 * Returns ZSWAP_SWAPCACHE_EXIST if page was already in the swap cache
 * Returns ZSWAP_SWAPCACHE_NEW if the new page needs to be populated,
 *     the new page is added to swapcache and locked
 * Returns ZSWAP_SWAPCACHE_FAIL on error
 */
static int zswap_get_swap_cache_page(swp_entry_t entry,
				struct page **retpage)
{
	bool page_was_allocated;

	*retpage = __read_swap_cache_async(entry, GFP_KERNEL,
			NULL, 0, &page_was_allocated);
	if (page_was_allocated)
		return ZSWAP_SWAPCACHE_NEW;
	if (!*retpage)
		return ZSWAP_SWAPCACHE_FAIL;
	return ZSWAP_SWAPCACHE_EXIST;
}

/*
 * Attempts to free an entry by adding a page to the swap cache,
 * decompressing the entry data into the page, and issuing a
 * bio write to write the page back to the swap device.
 *
 * This can be thought of as a "resumed writeback" of the page
 * to the swap device.  We are basically resuming the same swap
 * writeback path that was intercepted with the frontswap_store()
 * in the first place.  After the page has been decompressed into
 * the swap cache, the compressed version stored by zswap can be
 * freed.
 */
static int zswap_writeback_entry(struct zpool *pool, unsigned long handle)
{
	struct zswap_header *zhdr;
	swp_entry_t swpentry;
	struct zswap_tree *tree;
	pgoff_t offset;
	struct zswap_entry *entry;
	struct page *page;
	struct crypto_comp *tfm;
	u8 *src, *dst;
	unsigned int dlen;
	int ret;
	struct writeback_control wbc = {
		.sync_mode = WB_SYNC_NONE,
	};

	/* extract swpentry from data */
	zhdr = zpool_map_handle(pool, handle, ZPOOL_MM_RO);
	swpentry = zhdr->swpentry; /* here */
	zpool_unmap_handle(pool, handle);
	tree = zswap_trees[swp_type(swpentry)];
	offset = swp_offset(swpentry);

	/* find and ref zswap entry */
	spin_lock(&tree->lock);
	entry = zswap_entry_find_get(&tree->rbroot, offset);
	if (!entry) {
		/* entry was invalidated */
		spin_unlock(&tree->lock);
		return 0;
	}
	spin_unlock(&tree->lock);
	BUG_ON(offset != entry->offset);

	/* try to allocate swap cache page */
	switch (zswap_get_swap_cache_page(swpentry, &page)) {
	case ZSWAP_SWAPCACHE_FAIL: /* no memory or invalidate happened */
		ret = -ENOMEM;
		goto fail;

	case ZSWAP_SWAPCACHE_EXIST:
		/* page is already in the swap cache, ignore for now */
		put_page(page);
		ret = -EEXIST;
		goto fail;

	case ZSWAP_SWAPCACHE_NEW: /* page is locked */
		/* decompress */
		dlen = PAGE_SIZE;
		src = (u8 *)zpool_map_handle(entry->pool->zpool, entry->handle,
				ZPOOL_MM_RO) + sizeof(struct zswap_header);
		dst = kmap_atomic(page);
		tfm = *get_cpu_ptr(entry->pool->tfm);
		ret = crypto_comp_decompress(tfm, src, entry->length,
					     dst, &dlen);
		put_cpu_ptr(entry->pool->tfm);
		kunmap_atomic(dst);
		zpool_unmap_handle(entry->pool->zpool, entry->handle);
		BUG_ON(ret);
		BUG_ON(dlen != PAGE_SIZE);

		/* page is up to date */
		SetPageUptodate(page);
	}

	/* move it to the tail of the inactive list after end_writeback */
	SetPageReclaim(page);

	/* start writeback */
	__swap_writepage(page, &wbc, end_swap_bio_write);
	put_page(page);
	zswap_written_back_pages++;

	spin_lock(&tree->lock);
	/* drop local reference */
	zswap_entry_put(tree, entry);

	/*
	 * There are two possible situations for entry here:
	 * (1) refcount is 1 (normal case), entry is valid and on the tree
	 * (2) refcount is 0, entry is freed and not on the tree
	 *     because invalidate happened during writeback
	 * Search the tree and free the entry if it is still found there.
	 */
	if (entry == zswap_rb_search(&tree->rbroot, offset))
		zswap_entry_put(tree, entry);
	spin_unlock(&tree->lock);

	goto end;

	/*
	 * If we get here because of ZSWAP_SWAPCACHE_EXIST, a load may be
	 * happening concurrently, so it is safe and okay to not free the
	 * entry.  Even if the following put does free it, returning a
	 * nonzero value here is still correct.
	 */
fail:
	spin_lock(&tree->lock);
	zswap_entry_put(tree, entry);
	spin_unlock(&tree->lock);

end:
	return ret;
}

static int zswap_shrink(void)
{
	struct zswap_pool *pool;
	int ret;

	pool = zswap_pool_last_get();
	if (!pool)
		return -ENOENT;

	ret = zpool_shrink(pool->zpool, 1, NULL);

	zswap_pool_put(pool);

	return ret;
}
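
/*
 * Descriptive note: zpool_shrink() asks the zpool driver to reclaim one
 * page of storage from the oldest pool; the driver does this by invoking
 * the ->evict callback registered in zswap_zpool_ops, which is
 * zswap_writeback_entry() above, so shrinking the pool turns into resumed
 * writeback of individual entries to the swap device.
 */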

/*********************************
* frontswap hooks
**********************************/
/* attempts to compress and store a single page */
static int zswap_frontswap_store(unsigned type, pgoff_t offset,
				struct page *page)
{
	struct zswap_tree *tree = zswap_trees[type];
	struct zswap_entry *entry, *dupentry;
	struct crypto_comp *tfm;
	int ret;
	unsigned int dlen = PAGE_SIZE, len;
	unsigned long handle;
	char *buf;
	u8 *src, *dst;
	struct zswap_header *zhdr;

	if (!zswap_enabled || !tree) {
		ret = -ENODEV;
		goto reject;
	}

	/* reclaim space if needed */
	if (zswap_is_full()) {
		zswap_pool_limit_hit++;
		if (zswap_shrink()) {
			zswap_reject_reclaim_fail++;
			ret = -ENOMEM;
			goto reject;
		}
	}

	/* allocate entry */
	entry = zswap_entry_cache_alloc(GFP_KERNEL);
	if (!entry) {
		zswap_reject_kmemcache_fail++;
		ret = -ENOMEM;
		goto reject;
	}

	/* if entry is successfully added, it keeps the reference */
	entry->pool = zswap_pool_current_get();
	if (!entry->pool) {
		ret = -EINVAL;
		goto freepage;
	}

	/* compress */
	dst = get_cpu_var(zswap_dstmem);
	tfm = *get_cpu_ptr(entry->pool->tfm);
	src = kmap_atomic(page);
	ret = crypto_comp_compress(tfm, src, PAGE_SIZE, dst, &dlen);
	kunmap_atomic(src);
	put_cpu_ptr(entry->pool->tfm);
	if (ret) {
		ret = -EINVAL;
		goto put_dstmem;
	}

	/* store */
	len = dlen + sizeof(struct zswap_header);
	ret = zpool_malloc(entry->pool->zpool, len,
			   __GFP_NORETRY | __GFP_NOWARN | __GFP_KSWAPD_RECLAIM,
			   &handle);
	if (ret == -ENOSPC) {
		zswap_reject_compress_poor++;
		goto put_dstmem;
	}
	if (ret) {
		zswap_reject_alloc_fail++;
		goto put_dstmem;
	}
	zhdr = zpool_map_handle(entry->pool->zpool, handle, ZPOOL_MM_RW);
	zhdr->swpentry = swp_entry(type, offset);
	buf = (u8 *)(zhdr + 1);
	memcpy(buf, dst, dlen);
	zpool_unmap_handle(entry->pool->zpool, handle);
	put_cpu_var(zswap_dstmem);

	/* populate entry */
	entry->offset = offset;
	entry->handle = handle;
	entry->length = dlen;

	/* map */
	spin_lock(&tree->lock);
	do {
		ret = zswap_rb_insert(&tree->rbroot, entry, &dupentry);
		if (ret == -EEXIST) {
			zswap_duplicate_entry++;
			/* remove from rbtree */
			zswap_rb_erase(&tree->rbroot, dupentry);
			zswap_entry_put(tree, dupentry);
		}
	} while (ret == -EEXIST);
	spin_unlock(&tree->lock);

	/* update stats */
	atomic_inc(&zswap_stored_pages);
	zswap_update_total_size();

	return 0;

put_dstmem:
	put_cpu_var(zswap_dstmem);
	zswap_pool_put(entry->pool);
freepage:
	zswap_entry_cache_free(entry);
reject:
	return ret;
}
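
/*
 * Layout note: the zpool allocation made in the store path above holds a
 * struct zswap_header (the original swp_entry_t, needed so writeback can
 * locate the tree and offset) immediately followed by the compressed data.
 * The load and writeback paths therefore skip sizeof(struct zswap_header)
 * when mapping the handle to reach the compressed data.
 */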

/*
 * returns 0 if the page was successfully decompressed
 * returns -1 on entry not found or error
 */
static int zswap_frontswap_load(unsigned type, pgoff_t offset,
				struct page *page)
{
	struct zswap_tree *tree = zswap_trees[type];
	struct zswap_entry *entry;
	struct crypto_comp *tfm;
	u8 *src, *dst;
	unsigned int dlen;
	int ret;

	/* find */
	spin_lock(&tree->lock);
	entry = zswap_entry_find_get(&tree->rbroot, offset);
	if (!entry) {
		/* entry was written back */
		spin_unlock(&tree->lock);
		return -1;
	}
	spin_unlock(&tree->lock);

	/* decompress */
	dlen = PAGE_SIZE;
	src = (u8 *)zpool_map_handle(entry->pool->zpool, entry->handle,
			ZPOOL_MM_RO) + sizeof(struct zswap_header);
	dst = kmap_atomic(page);
	tfm = *get_cpu_ptr(entry->pool->tfm);
	ret = crypto_comp_decompress(tfm, src, entry->length, dst, &dlen);
	put_cpu_ptr(entry->pool->tfm);
	kunmap_atomic(dst);
	zpool_unmap_handle(entry->pool->zpool, entry->handle);
	BUG_ON(ret);

	spin_lock(&tree->lock);
	zswap_entry_put(tree, entry);
	spin_unlock(&tree->lock);

	return 0;
}

/* frees an entry in zswap */
static void zswap_frontswap_invalidate_page(unsigned type, pgoff_t offset)
{
	struct zswap_tree *tree = zswap_trees[type];
	struct zswap_entry *entry;

	/* find */
	spin_lock(&tree->lock);
	entry = zswap_rb_search(&tree->rbroot, offset);
	if (!entry) {
		/* entry was written back */
		spin_unlock(&tree->lock);
		return;
	}

	/* remove from rbtree */
	zswap_rb_erase(&tree->rbroot, entry);

	/* drop the initial reference from entry creation */
	zswap_entry_put(tree, entry);

	spin_unlock(&tree->lock);
}

/* frees all zswap entries for the given swap type */
static void zswap_frontswap_invalidate_area(unsigned type)
{
	struct zswap_tree *tree = zswap_trees[type];
	struct zswap_entry *entry, *n;

	if (!tree)
		return;

	/* walk the tree and free everything */
	spin_lock(&tree->lock);
	rbtree_postorder_for_each_entry_safe(entry, n, &tree->rbroot, rbnode)
		zswap_free_entry(entry);
	tree->rbroot = RB_ROOT;
	spin_unlock(&tree->lock);
	kfree(tree);
	zswap_trees[type] = NULL;
}

static void zswap_frontswap_init(unsigned type)
{
	struct zswap_tree *tree;

	tree = kzalloc(sizeof(*tree), GFP_KERNEL);
	if (!tree) {
		pr_err("alloc failed, zswap disabled for swap type %d\n", type);
		return;
	}

	tree->rbroot = RB_ROOT;
	spin_lock_init(&tree->lock);
	zswap_trees[type] = tree;
}

static struct frontswap_ops zswap_frontswap_ops = {
	.store = zswap_frontswap_store,
	.load = zswap_frontswap_load,
	.invalidate_page = zswap_frontswap_invalidate_page,
	.invalidate_area = zswap_frontswap_invalidate_area,
	.init = zswap_frontswap_init
};

/*********************************
* debugfs functions
**********************************/
#ifdef CONFIG_DEBUG_FS
#include <linux/debugfs.h>

static struct dentry *zswap_debugfs_root;

static int __init zswap_debugfs_init(void)
{
	if (!debugfs_initialized())
		return -ENODEV;

	zswap_debugfs_root = debugfs_create_dir("zswap", NULL);
	if (!zswap_debugfs_root)
		return -ENOMEM;

	debugfs_create_u64("pool_limit_hit", S_IRUGO,
			   zswap_debugfs_root, &zswap_pool_limit_hit);
	debugfs_create_u64("reject_reclaim_fail", S_IRUGO,
			   zswap_debugfs_root, &zswap_reject_reclaim_fail);
	debugfs_create_u64("reject_alloc_fail", S_IRUGO,
			   zswap_debugfs_root, &zswap_reject_alloc_fail);
	debugfs_create_u64("reject_kmemcache_fail", S_IRUGO,
			   zswap_debugfs_root, &zswap_reject_kmemcache_fail);
	debugfs_create_u64("reject_compress_poor", S_IRUGO,
			   zswap_debugfs_root, &zswap_reject_compress_poor);
	debugfs_create_u64("written_back_pages", S_IRUGO,
			   zswap_debugfs_root, &zswap_written_back_pages);
	debugfs_create_u64("duplicate_entry", S_IRUGO,
			   zswap_debugfs_root, &zswap_duplicate_entry);
	debugfs_create_u64("pool_total_size", S_IRUGO,
			   zswap_debugfs_root, &zswap_pool_total_size);
	debugfs_create_atomic_t("stored_pages", S_IRUGO,
				zswap_debugfs_root, &zswap_stored_pages);

	return 0;
}
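
/*
 * Illustrative example (not part of the driver): with CONFIG_DEBUG_FS and
 * debugfs mounted at the usual /sys/kernel/debug, the counters created
 * above can be inspected with e.g.:
 *
 *   grep -r . /sys/kernel/debug/zswap/
 *   cat /sys/kernel/debug/zswap/pool_total_size
 */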

static void __exit zswap_debugfs_exit(void)
{
	debugfs_remove_recursive(zswap_debugfs_root);
}
#else
static int __init zswap_debugfs_init(void)
{
	return 0;
}

static void __exit zswap_debugfs_exit(void) { }
#endif

/*********************************
* module init and exit
**********************************/
static int __init init_zswap(void)
{
	struct zswap_pool *pool;
	int ret;

	zswap_init_started = true;

	if (zswap_entry_cache_create()) {
		pr_err("entry cache creation failed\n");
		goto cache_fail;
	}

	ret = cpuhp_setup_state(CPUHP_MM_ZSWP_MEM_PREPARE, "mm/zswap:prepare",
				zswap_dstmem_prepare, zswap_dstmem_dead);
	if (ret) {
		pr_err("dstmem alloc failed\n");
		goto dstmem_fail;
	}

	ret = cpuhp_setup_state_multi(CPUHP_MM_ZSWP_POOL_PREPARE,
				      "mm/zswap_pool:prepare",
				      zswap_cpu_comp_prepare,
				      zswap_cpu_comp_dead);
	if (ret)
		goto hp_fail;

	pool = __zswap_pool_create_fallback();
	if (pool) {
		pr_info("loaded using pool %s/%s\n", pool->tfm_name,
			zpool_get_type(pool->zpool));
		list_add(&pool->list, &zswap_pools);
		zswap_has_pool = true;
	} else {
		pr_err("pool creation failed\n");
		zswap_enabled = false;
	}

	frontswap_register_ops(&zswap_frontswap_ops);
	if (zswap_debugfs_init())
		pr_warn("debugfs initialization failed\n");
	return 0;

hp_fail:
	cpuhp_remove_state(CPUHP_MM_ZSWP_MEM_PREPARE);
dstmem_fail:
	zswap_entry_cache_destroy();
cache_fail:
	/* if built-in, we aren't unloaded on failure; don't allow use */
	zswap_init_failed = true;
	zswap_enabled = false;
	return -ENOMEM;
}
/* must be late so crypto has time to come up */
late_initcall(init_zswap);

MODULE_LICENSE("GPL");
MODULE_AUTHOR("Seth Jennings <sjennings@variantweb.net>");
MODULE_DESCRIPTION("Compressed cache for swap pages");