1 /* 2 * zswap.c - zswap driver file 3 * 4 * zswap is a backend for frontswap that takes pages that are in the process 5 * of being swapped out and attempts to compress and store them in a 6 * RAM-based memory pool. This can result in a significant I/O reduction on 7 * the swap device and, in the case where decompressing from RAM is faster 8 * than reading from the swap device, can also improve workload performance. 9 * 10 * Copyright (C) 2012 Seth Jennings <sjenning@linux.vnet.ibm.com> 11 * 12 * This program is free software; you can redistribute it and/or 13 * modify it under the terms of the GNU General Public License 14 * as published by the Free Software Foundation; either version 2 15 * of the License, or (at your option) any later version. 16 * 17 * This program is distributed in the hope that it will be useful, 18 * but WITHOUT ANY WARRANTY; without even the implied warranty of 19 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 20 * GNU General Public License for more details. 
21 */ 22 23 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt 24 25 #include <linux/module.h> 26 #include <linux/cpu.h> 27 #include <linux/highmem.h> 28 #include <linux/slab.h> 29 #include <linux/spinlock.h> 30 #include <linux/types.h> 31 #include <linux/atomic.h> 32 #include <linux/frontswap.h> 33 #include <linux/rbtree.h> 34 #include <linux/swap.h> 35 #include <linux/crypto.h> 36 #include <linux/mempool.h> 37 #include <linux/zpool.h> 38 39 #include <linux/mm_types.h> 40 #include <linux/page-flags.h> 41 #include <linux/swapops.h> 42 #include <linux/writeback.h> 43 #include <linux/pagemap.h> 44 45 /********************************* 46 * statistics 47 **********************************/ 48 /* Total bytes used by the compressed storage */ 49 static u64 zswap_pool_total_size; 50 /* The number of compressed pages currently stored in zswap */ 51 static atomic_t zswap_stored_pages = ATOMIC_INIT(0); 52 /* The number of same-value filled pages currently stored in zswap */ 53 static atomic_t zswap_same_filled_pages = ATOMIC_INIT(0); 54 55 /* 56 * The statistics below are not protected from concurrent access for 57 * performance reasons so they may not be a 100% accurate. However, 58 * they do provide useful information on roughly how many times a 59 * certain event is occurring. 
60 */ 61 62 /* Pool limit was hit (see zswap_max_pool_percent) */ 63 static u64 zswap_pool_limit_hit; 64 /* Pages written back when pool limit was reached */ 65 static u64 zswap_written_back_pages; 66 /* Store failed due to a reclaim failure after pool limit was reached */ 67 static u64 zswap_reject_reclaim_fail; 68 /* Compressed page was too big for the allocator to (optimally) store */ 69 static u64 zswap_reject_compress_poor; 70 /* Store failed because underlying allocator could not get memory */ 71 static u64 zswap_reject_alloc_fail; 72 /* Store failed because the entry metadata could not be allocated (rare) */ 73 static u64 zswap_reject_kmemcache_fail; 74 /* Duplicate store was encountered (rare) */ 75 static u64 zswap_duplicate_entry; 76 77 /********************************* 78 * tunables 79 **********************************/ 80 81 #define ZSWAP_PARAM_UNSET "" 82 83 /* Enable/disable zswap (disabled by default) */ 84 static bool zswap_enabled; 85 static int zswap_enabled_param_set(const char *, 86 const struct kernel_param *); 87 static struct kernel_param_ops zswap_enabled_param_ops = { 88 .set = zswap_enabled_param_set, 89 .get = param_get_bool, 90 }; 91 module_param_cb(enabled, &zswap_enabled_param_ops, &zswap_enabled, 0644); 92 93 /* Crypto compressor to use */ 94 #define ZSWAP_COMPRESSOR_DEFAULT "lzo" 95 static char *zswap_compressor = ZSWAP_COMPRESSOR_DEFAULT; 96 static int zswap_compressor_param_set(const char *, 97 const struct kernel_param *); 98 static struct kernel_param_ops zswap_compressor_param_ops = { 99 .set = zswap_compressor_param_set, 100 .get = param_get_charp, 101 .free = param_free_charp, 102 }; 103 module_param_cb(compressor, &zswap_compressor_param_ops, 104 &zswap_compressor, 0644); 105 106 /* Compressed storage zpool to use */ 107 #define ZSWAP_ZPOOL_DEFAULT "zbud" 108 static char *zswap_zpool_type = ZSWAP_ZPOOL_DEFAULT; 109 static int zswap_zpool_param_set(const char *, const struct kernel_param *); 110 static struct 
kernel_param_ops zswap_zpool_param_ops = { 111 .set = zswap_zpool_param_set, 112 .get = param_get_charp, 113 .free = param_free_charp, 114 }; 115 module_param_cb(zpool, &zswap_zpool_param_ops, &zswap_zpool_type, 0644); 116 117 /* The maximum percentage of memory that the compressed pool can occupy */ 118 static unsigned int zswap_max_pool_percent = 20; 119 module_param_named(max_pool_percent, zswap_max_pool_percent, uint, 0644); 120 121 /* Enable/disable handling same-value filled pages (enabled by default) */ 122 static bool zswap_same_filled_pages_enabled = true; 123 module_param_named(same_filled_pages_enabled, zswap_same_filled_pages_enabled, 124 bool, 0644); 125 126 /********************************* 127 * data structures 128 **********************************/ 129 130 struct zswap_pool { 131 struct zpool *zpool; 132 struct crypto_comp * __percpu *tfm; 133 struct kref kref; 134 struct list_head list; 135 struct work_struct work; 136 struct hlist_node node; 137 char tfm_name[CRYPTO_MAX_ALG_NAME]; 138 }; 139 140 /* 141 * struct zswap_entry 142 * 143 * This structure contains the metadata for tracking a single compressed 144 * page within zswap. 145 * 146 * rbnode - links the entry into red-black tree for the appropriate swap type 147 * offset - the swap offset for the entry. Index into the red-black tree. 148 * refcount - the number of outstanding reference to the entry. This is needed 149 * to protect against premature freeing of the entry by code 150 * concurrent calls to load, invalidate, and writeback. The lock 151 * for the zswap_tree structure that contains the entry must 152 * be held while changing the refcount. Since the lock must 153 * be held, there is no reason to also make refcount atomic. 154 * length - the length in bytes of the compressed page data. Needed during 155 * decompression. For a same value filled page length is 0. 
/*
 * struct zswap_entry
 *
 * This structure contains the metadata for tracking a single compressed
 * page within zswap.
 *
 * rbnode - links the entry into red-black tree for the appropriate swap type
 * offset - the swap offset for the entry.  Index into the red-black tree.
 * refcount - the number of outstanding references to the entry. This is
 *            needed to protect against premature freeing of the entry by
 *            concurrent calls to load, invalidate, and writeback.  The lock
 *            for the zswap_tree structure that contains the entry must
 *            be held while changing the refcount.  Since the lock must
 *            be held, there is no reason to also make refcount atomic.
 * length - the length in bytes of the compressed page data.  Needed during
 *          decompression. For a same-value filled page, length is 0.
 * pool - the zswap_pool the entry's data is in (not used for same-value
 *        filled pages)
 * handle - zpool allocation handle that stores the compressed page data
 * value - the repeated word of a same-value filled page; shares storage
 *         with handle, so only one of the two is meaningful per entry
 */
struct zswap_entry {
	struct rb_node rbnode;
	pgoff_t offset;
	int refcount;
	unsigned int length;
	struct zswap_pool *pool;
	union {
		unsigned long handle;
		unsigned long value;
	};
};

/*
 * Header stored in front of the compressed data when the zpool is
 * evictable (see zswap_frontswap_store()).  zswap_writeback_entry() reads
 * it back to find which swap entry a zpool handle belongs to.
 */
struct zswap_header {
	swp_entry_t swpentry;
};

/*
 * The tree lock in the zswap_tree struct protects a few things:
 * - the rbtree
 * - the refcount field of each entry in the tree
 */
struct zswap_tree {
	struct rb_root rbroot;
	spinlock_t lock;
};

/* one tree per swap type, indexed by the frontswap "type" argument */
static struct zswap_tree *zswap_trees[MAX_SWAPFILES];

/* RCU-protected iteration */
static LIST_HEAD(zswap_pools);
/* protects zswap_pools list modification */
static DEFINE_SPINLOCK(zswap_pools_lock);
/* pool counter to provide unique names to zpool */
static atomic_t zswap_pools_count = ATOMIC_INIT(0);

/* used by param callback function */
static bool zswap_init_started;

/* fatal error during init */
static bool zswap_init_failed;

/*
 * Stays false if init completed but couldn't create the initial pool;
 * set to true once a pool has been installed by __zswap_param_set().
 */
static bool zswap_has_pool;

/*********************************
* helpers and fwd declarations
**********************************/

/* debug-print a pool as "<msg> pool <compressor>/<zpool type>" */
#define zswap_pool_debug(msg, p)				\
	pr_debug("%s pool %s/%s\n", msg, (p)->tfm_name,		\
		 zpool_get_type((p)->zpool))

static int zswap_writeback_entry(struct zpool *pool, unsigned long handle);
static int zswap_pool_get(struct zswap_pool *pool);
static void zswap_pool_put(struct zswap_pool *pool);

/* eviction callback handed to every zpool created by zswap_pool_create() */
static const struct zpool_ops zswap_zpool_ops = {
	.evict = zswap_writeback_entry
};

/*
 * Returns true when the compressed pool, measured in pages (rounded up),
 * is larger than zswap_max_pool_percent percent of totalram_pages.
 */
static bool zswap_is_full(void)
{
	return totalram_pages * zswap_max_pool_percent / 100 <
			DIV_ROUND_UP(zswap_pool_total_size, PAGE_SIZE);
}
DIV_ROUND_UP(zswap_pool_total_size, PAGE_SIZE); 224 } 225 226 static void zswap_update_total_size(void) 227 { 228 struct zswap_pool *pool; 229 u64 total = 0; 230 231 rcu_read_lock(); 232 233 list_for_each_entry_rcu(pool, &zswap_pools, list) 234 total += zpool_get_total_size(pool->zpool); 235 236 rcu_read_unlock(); 237 238 zswap_pool_total_size = total; 239 } 240 241 /********************************* 242 * zswap entry functions 243 **********************************/ 244 static struct kmem_cache *zswap_entry_cache; 245 246 static int __init zswap_entry_cache_create(void) 247 { 248 zswap_entry_cache = KMEM_CACHE(zswap_entry, 0); 249 return zswap_entry_cache == NULL; 250 } 251 252 static void __init zswap_entry_cache_destroy(void) 253 { 254 kmem_cache_destroy(zswap_entry_cache); 255 } 256 257 static struct zswap_entry *zswap_entry_cache_alloc(gfp_t gfp) 258 { 259 struct zswap_entry *entry; 260 entry = kmem_cache_alloc(zswap_entry_cache, gfp); 261 if (!entry) 262 return NULL; 263 entry->refcount = 1; 264 RB_CLEAR_NODE(&entry->rbnode); 265 return entry; 266 } 267 268 static void zswap_entry_cache_free(struct zswap_entry *entry) 269 { 270 kmem_cache_free(zswap_entry_cache, entry); 271 } 272 273 /********************************* 274 * rbtree functions 275 **********************************/ 276 static struct zswap_entry *zswap_rb_search(struct rb_root *root, pgoff_t offset) 277 { 278 struct rb_node *node = root->rb_node; 279 struct zswap_entry *entry; 280 281 while (node) { 282 entry = rb_entry(node, struct zswap_entry, rbnode); 283 if (entry->offset > offset) 284 node = node->rb_left; 285 else if (entry->offset < offset) 286 node = node->rb_right; 287 else 288 return entry; 289 } 290 return NULL; 291 } 292 293 /* 294 * In the case that a entry with the same offset is found, a pointer to 295 * the existing entry is stored in dupentry and the function returns -EEXIST 296 */ 297 static int zswap_rb_insert(struct rb_root *root, struct zswap_entry *entry, 298 struct 
zswap_entry **dupentry) 299 { 300 struct rb_node **link = &root->rb_node, *parent = NULL; 301 struct zswap_entry *myentry; 302 303 while (*link) { 304 parent = *link; 305 myentry = rb_entry(parent, struct zswap_entry, rbnode); 306 if (myentry->offset > entry->offset) 307 link = &(*link)->rb_left; 308 else if (myentry->offset < entry->offset) 309 link = &(*link)->rb_right; 310 else { 311 *dupentry = myentry; 312 return -EEXIST; 313 } 314 } 315 rb_link_node(&entry->rbnode, parent, link); 316 rb_insert_color(&entry->rbnode, root); 317 return 0; 318 } 319 320 static void zswap_rb_erase(struct rb_root *root, struct zswap_entry *entry) 321 { 322 if (!RB_EMPTY_NODE(&entry->rbnode)) { 323 rb_erase(&entry->rbnode, root); 324 RB_CLEAR_NODE(&entry->rbnode); 325 } 326 } 327 328 /* 329 * Carries out the common pattern of freeing and entry's zpool allocation, 330 * freeing the entry itself, and decrementing the number of stored pages. 331 */ 332 static void zswap_free_entry(struct zswap_entry *entry) 333 { 334 if (!entry->length) 335 atomic_dec(&zswap_same_filled_pages); 336 else { 337 zpool_free(entry->pool->zpool, entry->handle); 338 zswap_pool_put(entry->pool); 339 } 340 zswap_entry_cache_free(entry); 341 atomic_dec(&zswap_stored_pages); 342 zswap_update_total_size(); 343 } 344 345 /* caller must hold the tree lock */ 346 static void zswap_entry_get(struct zswap_entry *entry) 347 { 348 entry->refcount++; 349 } 350 351 /* caller must hold the tree lock 352 * remove from the tree and free it, if nobody reference the entry 353 */ 354 static void zswap_entry_put(struct zswap_tree *tree, 355 struct zswap_entry *entry) 356 { 357 int refcount = --entry->refcount; 358 359 BUG_ON(refcount < 0); 360 if (refcount == 0) { 361 zswap_rb_erase(&tree->rbroot, entry); 362 zswap_free_entry(entry); 363 } 364 } 365 366 /* caller must hold the tree lock */ 367 static struct zswap_entry *zswap_entry_find_get(struct rb_root *root, 368 pgoff_t offset) 369 { 370 struct zswap_entry *entry; 371 372 
entry = zswap_rb_search(root, offset); 373 if (entry) 374 zswap_entry_get(entry); 375 376 return entry; 377 } 378 379 /********************************* 380 * per-cpu code 381 **********************************/ 382 static DEFINE_PER_CPU(u8 *, zswap_dstmem); 383 384 static int zswap_dstmem_prepare(unsigned int cpu) 385 { 386 u8 *dst; 387 388 dst = kmalloc_node(PAGE_SIZE * 2, GFP_KERNEL, cpu_to_node(cpu)); 389 if (!dst) 390 return -ENOMEM; 391 392 per_cpu(zswap_dstmem, cpu) = dst; 393 return 0; 394 } 395 396 static int zswap_dstmem_dead(unsigned int cpu) 397 { 398 u8 *dst; 399 400 dst = per_cpu(zswap_dstmem, cpu); 401 kfree(dst); 402 per_cpu(zswap_dstmem, cpu) = NULL; 403 404 return 0; 405 } 406 407 static int zswap_cpu_comp_prepare(unsigned int cpu, struct hlist_node *node) 408 { 409 struct zswap_pool *pool = hlist_entry(node, struct zswap_pool, node); 410 struct crypto_comp *tfm; 411 412 if (WARN_ON(*per_cpu_ptr(pool->tfm, cpu))) 413 return 0; 414 415 tfm = crypto_alloc_comp(pool->tfm_name, 0, 0); 416 if (IS_ERR_OR_NULL(tfm)) { 417 pr_err("could not alloc crypto comp %s : %ld\n", 418 pool->tfm_name, PTR_ERR(tfm)); 419 return -ENOMEM; 420 } 421 *per_cpu_ptr(pool->tfm, cpu) = tfm; 422 return 0; 423 } 424 425 static int zswap_cpu_comp_dead(unsigned int cpu, struct hlist_node *node) 426 { 427 struct zswap_pool *pool = hlist_entry(node, struct zswap_pool, node); 428 struct crypto_comp *tfm; 429 430 tfm = *per_cpu_ptr(pool->tfm, cpu); 431 if (!IS_ERR_OR_NULL(tfm)) 432 crypto_free_comp(tfm); 433 *per_cpu_ptr(pool->tfm, cpu) = NULL; 434 return 0; 435 } 436 437 /********************************* 438 * pool functions 439 **********************************/ 440 441 static struct zswap_pool *__zswap_pool_current(void) 442 { 443 struct zswap_pool *pool; 444 445 pool = list_first_or_null_rcu(&zswap_pools, typeof(*pool), list); 446 WARN_ONCE(!pool && zswap_has_pool, 447 "%s: no page storage pool!\n", __func__); 448 449 return pool; 450 } 451 452 static struct zswap_pool 
*zswap_pool_current(void) 453 { 454 assert_spin_locked(&zswap_pools_lock); 455 456 return __zswap_pool_current(); 457 } 458 459 static struct zswap_pool *zswap_pool_current_get(void) 460 { 461 struct zswap_pool *pool; 462 463 rcu_read_lock(); 464 465 pool = __zswap_pool_current(); 466 if (!zswap_pool_get(pool)) 467 pool = NULL; 468 469 rcu_read_unlock(); 470 471 return pool; 472 } 473 474 static struct zswap_pool *zswap_pool_last_get(void) 475 { 476 struct zswap_pool *pool, *last = NULL; 477 478 rcu_read_lock(); 479 480 list_for_each_entry_rcu(pool, &zswap_pools, list) 481 last = pool; 482 WARN_ONCE(!last && zswap_has_pool, 483 "%s: no page storage pool!\n", __func__); 484 if (!zswap_pool_get(last)) 485 last = NULL; 486 487 rcu_read_unlock(); 488 489 return last; 490 } 491 492 /* type and compressor must be null-terminated */ 493 static struct zswap_pool *zswap_pool_find_get(char *type, char *compressor) 494 { 495 struct zswap_pool *pool; 496 497 assert_spin_locked(&zswap_pools_lock); 498 499 list_for_each_entry_rcu(pool, &zswap_pools, list) { 500 if (strcmp(pool->tfm_name, compressor)) 501 continue; 502 if (strcmp(zpool_get_type(pool->zpool), type)) 503 continue; 504 /* if we can't get it, it's about to be destroyed */ 505 if (!zswap_pool_get(pool)) 506 continue; 507 return pool; 508 } 509 510 return NULL; 511 } 512 513 static struct zswap_pool *zswap_pool_create(char *type, char *compressor) 514 { 515 struct zswap_pool *pool; 516 char name[38]; /* 'zswap' + 32 char (max) num + \0 */ 517 gfp_t gfp = __GFP_NORETRY | __GFP_NOWARN | __GFP_KSWAPD_RECLAIM; 518 int ret; 519 520 if (!zswap_has_pool) { 521 /* if either are unset, pool initialization failed, and we 522 * need both params to be set correctly before trying to 523 * create a pool. 
524 */ 525 if (!strcmp(type, ZSWAP_PARAM_UNSET)) 526 return NULL; 527 if (!strcmp(compressor, ZSWAP_PARAM_UNSET)) 528 return NULL; 529 } 530 531 pool = kzalloc(sizeof(*pool), GFP_KERNEL); 532 if (!pool) 533 return NULL; 534 535 /* unique name for each pool specifically required by zsmalloc */ 536 snprintf(name, 38, "zswap%x", atomic_inc_return(&zswap_pools_count)); 537 538 pool->zpool = zpool_create_pool(type, name, gfp, &zswap_zpool_ops); 539 if (!pool->zpool) { 540 pr_err("%s zpool not available\n", type); 541 goto error; 542 } 543 pr_debug("using %s zpool\n", zpool_get_type(pool->zpool)); 544 545 strlcpy(pool->tfm_name, compressor, sizeof(pool->tfm_name)); 546 pool->tfm = alloc_percpu(struct crypto_comp *); 547 if (!pool->tfm) { 548 pr_err("percpu alloc failed\n"); 549 goto error; 550 } 551 552 ret = cpuhp_state_add_instance(CPUHP_MM_ZSWP_POOL_PREPARE, 553 &pool->node); 554 if (ret) 555 goto error; 556 pr_debug("using %s compressor\n", pool->tfm_name); 557 558 /* being the current pool takes 1 ref; this func expects the 559 * caller to always add the new pool as the current pool 560 */ 561 kref_init(&pool->kref); 562 INIT_LIST_HEAD(&pool->list); 563 564 zswap_pool_debug("created", pool); 565 566 return pool; 567 568 error: 569 free_percpu(pool->tfm); 570 if (pool->zpool) 571 zpool_destroy_pool(pool->zpool); 572 kfree(pool); 573 return NULL; 574 } 575 576 static __init struct zswap_pool *__zswap_pool_create_fallback(void) 577 { 578 bool has_comp, has_zpool; 579 580 has_comp = crypto_has_comp(zswap_compressor, 0, 0); 581 if (!has_comp && strcmp(zswap_compressor, ZSWAP_COMPRESSOR_DEFAULT)) { 582 pr_err("compressor %s not available, using default %s\n", 583 zswap_compressor, ZSWAP_COMPRESSOR_DEFAULT); 584 param_free_charp(&zswap_compressor); 585 zswap_compressor = ZSWAP_COMPRESSOR_DEFAULT; 586 has_comp = crypto_has_comp(zswap_compressor, 0, 0); 587 } 588 if (!has_comp) { 589 pr_err("default compressor %s not available\n", 590 zswap_compressor); 591 
param_free_charp(&zswap_compressor); 592 zswap_compressor = ZSWAP_PARAM_UNSET; 593 } 594 595 has_zpool = zpool_has_pool(zswap_zpool_type); 596 if (!has_zpool && strcmp(zswap_zpool_type, ZSWAP_ZPOOL_DEFAULT)) { 597 pr_err("zpool %s not available, using default %s\n", 598 zswap_zpool_type, ZSWAP_ZPOOL_DEFAULT); 599 param_free_charp(&zswap_zpool_type); 600 zswap_zpool_type = ZSWAP_ZPOOL_DEFAULT; 601 has_zpool = zpool_has_pool(zswap_zpool_type); 602 } 603 if (!has_zpool) { 604 pr_err("default zpool %s not available\n", 605 zswap_zpool_type); 606 param_free_charp(&zswap_zpool_type); 607 zswap_zpool_type = ZSWAP_PARAM_UNSET; 608 } 609 610 if (!has_comp || !has_zpool) 611 return NULL; 612 613 return zswap_pool_create(zswap_zpool_type, zswap_compressor); 614 } 615 616 static void zswap_pool_destroy(struct zswap_pool *pool) 617 { 618 zswap_pool_debug("destroying", pool); 619 620 cpuhp_state_remove_instance(CPUHP_MM_ZSWP_POOL_PREPARE, &pool->node); 621 free_percpu(pool->tfm); 622 zpool_destroy_pool(pool->zpool); 623 kfree(pool); 624 } 625 626 static int __must_check zswap_pool_get(struct zswap_pool *pool) 627 { 628 if (!pool) 629 return 0; 630 631 return kref_get_unless_zero(&pool->kref); 632 } 633 634 static void __zswap_pool_release(struct work_struct *work) 635 { 636 struct zswap_pool *pool = container_of(work, typeof(*pool), work); 637 638 synchronize_rcu(); 639 640 /* nobody should have been able to get a kref... */ 641 WARN_ON(kref_get_unless_zero(&pool->kref)); 642 643 /* pool is now off zswap_pools list and has no references. 
/*
 * kref release callback (see zswap_pool_put()): takes the pool off the
 * zswap_pools list under zswap_pools_lock and schedules
 * __zswap_pool_release() to destroy it from a workqueue.
 */
static void __zswap_pool_empty(struct kref *kref)
{
	struct zswap_pool *pool;

	pool = container_of(kref, typeof(*pool), kref);

	spin_lock(&zswap_pools_lock);

	/* the current pool holds a reference, so it should never get here */
	WARN_ON(pool == zswap_pool_current());

	list_del_rcu(&pool->list);

	INIT_WORK(&pool->work, __zswap_pool_release);
	schedule_work(&pool->work);

	spin_unlock(&zswap_pools_lock);
}

/* Drop one reference on @pool; the last put triggers __zswap_pool_empty(). */
static void zswap_pool_put(struct zswap_pool *pool)
{
	kref_put(&pool->kref, __zswap_pool_empty);
}

/*********************************
* param callbacks
**********************************/

/*
 * Common handler for the "compressor" and "zpool" parameters.
 *
 * val must be a null-terminated string; it is stripped of surrounding
 * whitespace in place.  Exactly one of @type / @compressor is NULL: the
 * NULL one is the parameter being changed (it takes the stripped value),
 * the other carries the current value of the parameter that stays as is.
 *
 * After init has started, a pool matching the new type/compressor pair is
 * looked up or created and made the current pool.
 *
 * Returns 0 on success, -ENODEV if initialization failed, -ENOENT if the
 * requested zpool/compressor is not available, -EINVAL if both or neither
 * of @type / @compressor are given or no pool could be obtained, or the
 * error returned by param_set_charp().
 */
static int __zswap_param_set(const char *val, const struct kernel_param *kp,
			     char *type, char *compressor)
{
	struct zswap_pool *pool, *put_pool = NULL;
	char *s = strstrip((char *)val);
	int ret;

	if (zswap_init_failed) {
		pr_err("can't set param, initialization failed\n");
		return -ENODEV;
	}

	/* no change required */
	if (!strcmp(s, *(char **)kp->arg) && zswap_has_pool)
		return 0;

	/* if this is load-time (pre-init) param setting,
	 * don't create a pool; that's done during init.
	 */
	if (!zswap_init_started)
		return param_set_charp(s, kp);

	if (!type) {
		/* changing the zpool type */
		if (!zpool_has_pool(s)) {
			pr_err("zpool %s not available\n", s);
			return -ENOENT;
		}
		type = s;
	} else if (!compressor) {
		/* changing the compressor */
		if (!crypto_has_comp(s, 0, 0)) {
			pr_err("compressor %s not available\n", s);
			return -ENOENT;
		}
		compressor = s;
	} else {
		WARN_ON(1);
		return -EINVAL;
	}

	spin_lock(&zswap_pools_lock);

	/* reuse a matching pool if one exists; unlink it so that it can be
	 * re-added at the right position below
	 */
	pool = zswap_pool_find_get(type, compressor);
	if (pool) {
		zswap_pool_debug("using existing", pool);
		WARN_ON(pool == zswap_pool_current());
		list_del_rcu(&pool->list);
	}

	spin_unlock(&zswap_pools_lock);

	if (!pool)
		pool = zswap_pool_create(type, compressor);

	if (pool)
		ret = param_set_charp(s, kp);
	else
		ret = -EINVAL;

	spin_lock(&zswap_pools_lock);

	if (!ret) {
		/* new pool becomes current (head of the list); the previous
		 * current pool loses its "current" reference below
		 */
		put_pool = zswap_pool_current();
		list_add_rcu(&pool->list, &zswap_pools);
		zswap_has_pool = true;
	} else if (pool) {
		/* add the possibly pre-existing pool to the end of the pools
		 * list; if it's new (and empty) then it'll be removed and
		 * destroyed by the put after we drop the lock
		 */
		list_add_tail_rcu(&pool->list, &zswap_pools);
		put_pool = pool;
	}

	spin_unlock(&zswap_pools_lock);

	if (!zswap_has_pool && !pool) {
		/* if initial pool creation failed, and this pool creation also
		 * failed, maybe both compressor and zpool params were bad.
		 * Allow changing this param, so pool creation will succeed
		 * when the other param is changed. We already verified this
		 * param is ok in the zpool_has_pool() or crypto_has_comp()
		 * checks above.
		 */
		ret = param_set_charp(s, kp);
	}

	/* drop the ref from either the old current pool,
	 * or the new pool we failed to add
	 */
	if (put_pool)
		zswap_pool_put(put_pool);

	return ret;
}
757 */ 758 ret = param_set_charp(s, kp); 759 } 760 761 /* drop the ref from either the old current pool, 762 * or the new pool we failed to add 763 */ 764 if (put_pool) 765 zswap_pool_put(put_pool); 766 767 return ret; 768 } 769 770 static int zswap_compressor_param_set(const char *val, 771 const struct kernel_param *kp) 772 { 773 return __zswap_param_set(val, kp, zswap_zpool_type, NULL); 774 } 775 776 static int zswap_zpool_param_set(const char *val, 777 const struct kernel_param *kp) 778 { 779 return __zswap_param_set(val, kp, NULL, zswap_compressor); 780 } 781 782 static int zswap_enabled_param_set(const char *val, 783 const struct kernel_param *kp) 784 { 785 if (zswap_init_failed) { 786 pr_err("can't enable, initialization failed\n"); 787 return -ENODEV; 788 } 789 if (!zswap_has_pool && zswap_init_started) { 790 pr_err("can't enable, no pool configured\n"); 791 return -ENODEV; 792 } 793 794 return param_set_bool(val, kp); 795 } 796 797 /********************************* 798 * writeback code 799 **********************************/ 800 /* return enum for zswap_get_swap_cache_page */ 801 enum zswap_get_swap_ret { 802 ZSWAP_SWAPCACHE_NEW, 803 ZSWAP_SWAPCACHE_EXIST, 804 ZSWAP_SWAPCACHE_FAIL, 805 }; 806 807 /* 808 * zswap_get_swap_cache_page 809 * 810 * This is an adaption of read_swap_cache_async() 811 * 812 * This function tries to find a page with the given swap entry 813 * in the swapper_space address space (the swap cache). If the page 814 * is found, it is returned in retpage. Otherwise, a page is allocated, 815 * added to the swap cache, and returned in retpage. 
816 * 817 * If success, the swap cache page is returned in retpage 818 * Returns ZSWAP_SWAPCACHE_EXIST if page was already in the swap cache 819 * Returns ZSWAP_SWAPCACHE_NEW if the new page needs to be populated, 820 * the new page is added to swapcache and locked 821 * Returns ZSWAP_SWAPCACHE_FAIL on error 822 */ 823 static int zswap_get_swap_cache_page(swp_entry_t entry, 824 struct page **retpage) 825 { 826 bool page_was_allocated; 827 828 *retpage = __read_swap_cache_async(entry, GFP_KERNEL, 829 NULL, 0, &page_was_allocated); 830 if (page_was_allocated) 831 return ZSWAP_SWAPCACHE_NEW; 832 if (!*retpage) 833 return ZSWAP_SWAPCACHE_FAIL; 834 return ZSWAP_SWAPCACHE_EXIST; 835 } 836 837 /* 838 * Attempts to free an entry by adding a page to the swap cache, 839 * decompressing the entry data into the page, and issuing a 840 * bio write to write the page back to the swap device. 841 * 842 * This can be thought of as a "resumed writeback" of the page 843 * to the swap device. We are basically resuming the same swap 844 * writeback path that was intercepted with the frontswap_store() 845 * in the first place. After the page has been decompressed into 846 * the swap cache, the compressed version stored by zswap can be 847 * freed. 
/*
 * Attempts to free an entry by adding a page to the swap cache,
 * decompressing the entry data into the page, and issuing a
 * bio write to write the page back to the swap device.
 *
 * This can be thought of as a "resumed writeback" of the page
 * to the swap device.  We are basically resuming the same swap
 * writeback path that was intercepted with the frontswap_store()
 * in the first place.  After the page has been decompressed into
 * the swap cache, the compressed version stored by zswap can be
 * freed.
 *
 * This is the .evict callback of zswap_zpool_ops.  @handle is expected to
 * start with a struct zswap_header identifying the swap entry.
 *
 * Returns 0 if the page was queued for writeback or if the entry had
 * already been invalidated, -ENOMEM if no swap cache page could be
 * obtained, -EEXIST if the page is already in the swap cache.
 */
static int zswap_writeback_entry(struct zpool *pool, unsigned long handle)
{
	struct zswap_header *zhdr;
	swp_entry_t swpentry;
	struct zswap_tree *tree;
	pgoff_t offset;
	struct zswap_entry *entry;
	struct page *page;
	struct crypto_comp *tfm;
	u8 *src, *dst;
	unsigned int dlen;
	int ret;
	struct writeback_control wbc = {
		.sync_mode = WB_SYNC_NONE,
	};

	/* extract swpentry from data */
	zhdr = zpool_map_handle(pool, handle, ZPOOL_MM_RO);
	swpentry = zhdr->swpentry; /* here */
	zpool_unmap_handle(pool, handle);
	tree = zswap_trees[swp_type(swpentry)];
	offset = swp_offset(swpentry);

	/* find and ref zswap entry */
	spin_lock(&tree->lock);
	entry = zswap_entry_find_get(&tree->rbroot, offset);
	if (!entry) {
		/* entry was invalidated */
		spin_unlock(&tree->lock);
		return 0;
	}
	spin_unlock(&tree->lock);
	BUG_ON(offset != entry->offset);

	/* try to allocate swap cache page */
	switch (zswap_get_swap_cache_page(swpentry, &page)) {
	case ZSWAP_SWAPCACHE_FAIL: /* no memory or invalidate happened */
		ret = -ENOMEM;
		goto fail;

	case ZSWAP_SWAPCACHE_EXIST:
		/* page is already in the swap cache, ignore for now */
		put_page(page);
		ret = -EEXIST;
		goto fail;

	case ZSWAP_SWAPCACHE_NEW: /* page is locked */
		/* decompress; the compressed data follows the zswap_header */
		dlen = PAGE_SIZE;
		src = (u8 *)zpool_map_handle(entry->pool->zpool, entry->handle,
				ZPOOL_MM_RO) + sizeof(struct zswap_header);
		dst = kmap_atomic(page);
		tfm = *get_cpu_ptr(entry->pool->tfm);
		ret = crypto_comp_decompress(tfm, src, entry->length,
					     dst, &dlen);
		put_cpu_ptr(entry->pool->tfm);
		kunmap_atomic(dst);
		zpool_unmap_handle(entry->pool->zpool, entry->handle);
		BUG_ON(ret);
		BUG_ON(dlen != PAGE_SIZE);

		/* page is up to date */
		SetPageUptodate(page);
	}

	/* move it to the tail of the inactive list after end_writeback */
	SetPageReclaim(page);

	/* start writeback */
	__swap_writepage(page, &wbc, end_swap_bio_write);
	put_page(page);
	zswap_written_back_pages++;

	spin_lock(&tree->lock);
	/* drop local reference */
	zswap_entry_put(tree, entry);

	/*
	* There are two possible situations for entry here:
	* (1) refcount is 1 (normal case), entry is valid and on the tree
	* (2) refcount is 0, entry is freed and not on the tree
	*     because invalidate happened during writeback
	* Search the tree and, only if the entry is still found there,
	* drop the tree's reference to free it.
	*/
	if (entry == zswap_rb_search(&tree->rbroot, offset))
		zswap_entry_put(tree, entry);
	spin_unlock(&tree->lock);

	/* ret is 0 here: the decompress result was checked by BUG_ON() */
	goto end;

	/*
	* If we get here due to ZSWAP_SWAPCACHE_EXIST,
	* a load may be happening concurrently.
	* It is safe and okay to not free the entry;
	* if we do free the entry in the following put,
	* it is also okay to return !0.
	*/
fail:
	spin_lock(&tree->lock);
	zswap_entry_put(tree, entry);
	spin_unlock(&tree->lock);

end:
	return ret;
}
SetPageReclaim(page); 916 917 /* start writeback */ 918 __swap_writepage(page, &wbc, end_swap_bio_write); 919 put_page(page); 920 zswap_written_back_pages++; 921 922 spin_lock(&tree->lock); 923 /* drop local reference */ 924 zswap_entry_put(tree, entry); 925 926 /* 927 * There are two possible situations for entry here: 928 * (1) refcount is 1(normal case), entry is valid and on the tree 929 * (2) refcount is 0, entry is freed and not on the tree 930 * because invalidate happened during writeback 931 * search the tree and free the entry if find entry 932 */ 933 if (entry == zswap_rb_search(&tree->rbroot, offset)) 934 zswap_entry_put(tree, entry); 935 spin_unlock(&tree->lock); 936 937 goto end; 938 939 /* 940 * if we get here due to ZSWAP_SWAPCACHE_EXIST 941 * a load may happening concurrently 942 * it is safe and okay to not free the entry 943 * if we free the entry in the following put 944 * it it either okay to return !0 945 */ 946 fail: 947 spin_lock(&tree->lock); 948 zswap_entry_put(tree, entry); 949 spin_unlock(&tree->lock); 950 951 end: 952 return ret; 953 } 954 955 static int zswap_shrink(void) 956 { 957 struct zswap_pool *pool; 958 int ret; 959 960 pool = zswap_pool_last_get(); 961 if (!pool) 962 return -ENOENT; 963 964 ret = zpool_shrink(pool->zpool, 1, NULL); 965 966 zswap_pool_put(pool); 967 968 return ret; 969 } 970 971 static int zswap_is_page_same_filled(void *ptr, unsigned long *value) 972 { 973 unsigned int pos; 974 unsigned long *page; 975 976 page = (unsigned long *)ptr; 977 for (pos = 1; pos < PAGE_SIZE / sizeof(*page); pos++) { 978 if (page[pos] != page[0]) 979 return 0; 980 } 981 *value = page[0]; 982 return 1; 983 } 984 985 static void zswap_fill_page(void *ptr, unsigned long value) 986 { 987 unsigned long *page; 988 989 page = (unsigned long *)ptr; 990 memset_l(page, value, PAGE_SIZE / sizeof(unsigned long)); 991 } 992 993 /********************************* 994 * frontswap hooks 995 **********************************/ 996 /* attempts to 
/*********************************
* frontswap hooks
**********************************/
/*
 * Attempts to compress and store a single page.
 *
 * Returns 0 if the page is now held by zswap, otherwise a negative errno:
 * -EINVAL for a transparent huge page, when no pool is available or when
 * compression fails; -ENODEV when zswap is disabled or the swap type has
 * no tree; -ENOMEM when the pool is full and could not be shrunk below the
 * limit or the entry could not be allocated; or the error returned by
 * zpool_malloc().
 */
static int zswap_frontswap_store(unsigned type, pgoff_t offset,
				struct page *page)
{
	struct zswap_tree *tree = zswap_trees[type];
	struct zswap_entry *entry, *dupentry;
	struct crypto_comp *tfm;
	int ret;
	unsigned int hlen, dlen = PAGE_SIZE;
	unsigned long handle, value;
	char *buf;
	u8 *src, *dst;
	struct zswap_header zhdr = { .swpentry = swp_entry(type, offset) };

	/* THP isn't supported */
	if (PageTransHuge(page)) {
		ret = -EINVAL;
		goto reject;
	}

	if (!zswap_enabled || !tree) {
		ret = -ENODEV;
		goto reject;
	}

	/* reclaim space if needed */
	if (zswap_is_full()) {
		zswap_pool_limit_hit++;
		if (zswap_shrink()) {
			zswap_reject_reclaim_fail++;
			ret = -ENOMEM;
			goto reject;
		}

		/* A second zswap_is_full() check after
		 * zswap_shrink() to make sure it's now
		 * under the max_pool_percent
		 */
		if (zswap_is_full()) {
			ret = -ENOMEM;
			goto reject;
		}
	}

	/* allocate entry */
	entry = zswap_entry_cache_alloc(GFP_KERNEL);
	if (!entry) {
		zswap_reject_kmemcache_fail++;
		ret = -ENOMEM;
		goto reject;
	}

	if (zswap_same_filled_pages_enabled) {
		src = kmap_atomic(page);
		if (zswap_is_page_same_filled(src, &value)) {
			/* only the repeated value is kept; length 0 marks
			 * the entry as same-value filled
			 */
			kunmap_atomic(src);
			entry->offset = offset;
			entry->length = 0;
			entry->value = value;
			atomic_inc(&zswap_same_filled_pages);
			goto insert_entry;
		}
		kunmap_atomic(src);
	}

	/* if entry is successfully added, it keeps the reference */
	entry->pool = zswap_pool_current_get();
	if (!entry->pool) {
		ret = -EINVAL;
		goto freepage;
	}

	/* compress into this cpu's zswap_dstmem buffer */
	dst = get_cpu_var(zswap_dstmem);
	tfm = *get_cpu_ptr(entry->pool->tfm);
	src = kmap_atomic(page);
	ret = crypto_comp_compress(tfm, src, PAGE_SIZE, dst, &dlen);
	kunmap_atomic(src);
	put_cpu_ptr(entry->pool->tfm);
	if (ret) {
		ret = -EINVAL;
		goto put_dstmem;
	}

	/* store; a zswap_header is prepended only for evictable zpools */
	hlen = zpool_evictable(entry->pool->zpool) ? sizeof(zhdr) : 0;
	ret = zpool_malloc(entry->pool->zpool, hlen + dlen,
			   __GFP_NORETRY | __GFP_NOWARN | __GFP_KSWAPD_RECLAIM,
			   &handle);
	if (ret == -ENOSPC) {
		zswap_reject_compress_poor++;
		goto put_dstmem;
	}
	if (ret) {
		zswap_reject_alloc_fail++;
		goto put_dstmem;
	}
	buf = zpool_map_handle(entry->pool->zpool, handle, ZPOOL_MM_RW);
	memcpy(buf, &zhdr, hlen);
	memcpy(buf + hlen, dst, dlen);
	zpool_unmap_handle(entry->pool->zpool, handle);
	put_cpu_var(zswap_dstmem);

	/* populate entry */
	entry->offset = offset;
	entry->handle = handle;
	entry->length = dlen;

insert_entry:
	/* map; an older entry at the same offset is dropped and replaced */
	spin_lock(&tree->lock);
	do {
		ret = zswap_rb_insert(&tree->rbroot, entry, &dupentry);
		if (ret == -EEXIST) {
			zswap_duplicate_entry++;
			/* remove from rbtree */
			zswap_rb_erase(&tree->rbroot, dupentry);
			zswap_entry_put(tree, dupentry);
		}
	} while (ret == -EEXIST);
	spin_unlock(&tree->lock);

	/* update stats */
	atomic_inc(&zswap_stored_pages);
	zswap_update_total_size();

	return 0;

put_dstmem:
	put_cpu_var(zswap_dstmem);
	zswap_pool_put(entry->pool);
freepage:
	zswap_entry_cache_free(entry);
reject:
	return ret;
}
/*
 * Fill @page with the data stored for (@type, @offset).
 *
 * returns 0 if the page was successfully filled or decompressed
 * returns -1 if no entry is found for the offset
 * A decompression failure is not reported to the caller; it triggers
 * BUG_ON().
 */
static int zswap_frontswap_load(unsigned type, pgoff_t offset,
				struct page *page)
{
	struct zswap_tree *tree = zswap_trees[type];
	struct zswap_entry *entry;
	struct crypto_comp *tfm;
	u8 *src, *dst;
	unsigned int dlen;
	int ret;

	/* find */
	spin_lock(&tree->lock);
	entry = zswap_entry_find_get(&tree->rbroot, offset);
	if (!entry) {
		/* entry was written back */
		spin_unlock(&tree->lock);
		return -1;
	}
	spin_unlock(&tree->lock);

	/* length 0 marks a same-value filled page: regenerate it */
	if (!entry->length) {
		dst = kmap_atomic(page);
		zswap_fill_page(dst, entry->value);
		kunmap_atomic(dst);
		goto freeentry;
	}

	/* decompress */
	dlen = PAGE_SIZE;
	src = zpool_map_handle(entry->pool->zpool, entry->handle, ZPOOL_MM_RO);
	/* skip the header that store prepends for evictable zpools */
	if (zpool_evictable(entry->pool->zpool))
		src += sizeof(struct zswap_header);
	dst = kmap_atomic(page);
	tfm = *get_cpu_ptr(entry->pool->tfm);
	ret = crypto_comp_decompress(tfm, src, entry->length, dst, &dlen);
	put_cpu_ptr(entry->pool->tfm);
	kunmap_atomic(dst);
	zpool_unmap_handle(entry->pool->zpool, entry->handle);
	BUG_ON(ret);

freeentry:
	/* drop the reference taken by zswap_entry_find_get() */
	spin_lock(&tree->lock);
	zswap_entry_put(tree, entry);
	spin_unlock(&tree->lock);

	return 0;
}
zswap_entry_find_get(&tree->rbroot, offset); 1150 if (!entry) { 1151 /* entry was written back */ 1152 spin_unlock(&tree->lock); 1153 return -1; 1154 } 1155 spin_unlock(&tree->lock); 1156 1157 if (!entry->length) { 1158 dst = kmap_atomic(page); 1159 zswap_fill_page(dst, entry->value); 1160 kunmap_atomic(dst); 1161 goto freeentry; 1162 } 1163 1164 /* decompress */ 1165 dlen = PAGE_SIZE; 1166 src = zpool_map_handle(entry->pool->zpool, entry->handle, ZPOOL_MM_RO); 1167 if (zpool_evictable(entry->pool->zpool)) 1168 src += sizeof(struct zswap_header); 1169 dst = kmap_atomic(page); 1170 tfm = *get_cpu_ptr(entry->pool->tfm); 1171 ret = crypto_comp_decompress(tfm, src, entry->length, dst, &dlen); 1172 put_cpu_ptr(entry->pool->tfm); 1173 kunmap_atomic(dst); 1174 zpool_unmap_handle(entry->pool->zpool, entry->handle); 1175 BUG_ON(ret); 1176 1177 freeentry: 1178 spin_lock(&tree->lock); 1179 zswap_entry_put(tree, entry); 1180 spin_unlock(&tree->lock); 1181 1182 return 0; 1183 } 1184 1185 /* frees an entry in zswap */ 1186 static void zswap_frontswap_invalidate_page(unsigned type, pgoff_t offset) 1187 { 1188 struct zswap_tree *tree = zswap_trees[type]; 1189 struct zswap_entry *entry; 1190 1191 /* find */ 1192 spin_lock(&tree->lock); 1193 entry = zswap_rb_search(&tree->rbroot, offset); 1194 if (!entry) { 1195 /* entry was written back */ 1196 spin_unlock(&tree->lock); 1197 return; 1198 } 1199 1200 /* remove from rbtree */ 1201 zswap_rb_erase(&tree->rbroot, entry); 1202 1203 /* drop the initial reference from entry creation */ 1204 zswap_entry_put(tree, entry); 1205 1206 spin_unlock(&tree->lock); 1207 } 1208 1209 /* frees all zswap entries for the given swap type */ 1210 static void zswap_frontswap_invalidate_area(unsigned type) 1211 { 1212 struct zswap_tree *tree = zswap_trees[type]; 1213 struct zswap_entry *entry, *n; 1214 1215 if (!tree) 1216 return; 1217 1218 /* walk the tree and free everything */ 1219 spin_lock(&tree->lock); 1220 rbtree_postorder_for_each_entry_safe(entry, 
n, &tree->rbroot, rbnode) 1221 zswap_free_entry(entry); 1222 tree->rbroot = RB_ROOT; 1223 spin_unlock(&tree->lock); 1224 kfree(tree); 1225 zswap_trees[type] = NULL; 1226 } 1227 1228 static void zswap_frontswap_init(unsigned type) 1229 { 1230 struct zswap_tree *tree; 1231 1232 tree = kzalloc(sizeof(*tree), GFP_KERNEL); 1233 if (!tree) { 1234 pr_err("alloc failed, zswap disabled for swap type %d\n", type); 1235 return; 1236 } 1237 1238 tree->rbroot = RB_ROOT; 1239 spin_lock_init(&tree->lock); 1240 zswap_trees[type] = tree; 1241 } 1242 1243 static struct frontswap_ops zswap_frontswap_ops = { 1244 .store = zswap_frontswap_store, 1245 .load = zswap_frontswap_load, 1246 .invalidate_page = zswap_frontswap_invalidate_page, 1247 .invalidate_area = zswap_frontswap_invalidate_area, 1248 .init = zswap_frontswap_init 1249 }; 1250 1251 /********************************* 1252 * debugfs functions 1253 **********************************/ 1254 #ifdef CONFIG_DEBUG_FS 1255 #include <linux/debugfs.h> 1256 1257 static struct dentry *zswap_debugfs_root; 1258 1259 static int __init zswap_debugfs_init(void) 1260 { 1261 if (!debugfs_initialized()) 1262 return -ENODEV; 1263 1264 zswap_debugfs_root = debugfs_create_dir("zswap", NULL); 1265 if (!zswap_debugfs_root) 1266 return -ENOMEM; 1267 1268 debugfs_create_u64("pool_limit_hit", 0444, 1269 zswap_debugfs_root, &zswap_pool_limit_hit); 1270 debugfs_create_u64("reject_reclaim_fail", 0444, 1271 zswap_debugfs_root, &zswap_reject_reclaim_fail); 1272 debugfs_create_u64("reject_alloc_fail", 0444, 1273 zswap_debugfs_root, &zswap_reject_alloc_fail); 1274 debugfs_create_u64("reject_kmemcache_fail", 0444, 1275 zswap_debugfs_root, &zswap_reject_kmemcache_fail); 1276 debugfs_create_u64("reject_compress_poor", 0444, 1277 zswap_debugfs_root, &zswap_reject_compress_poor); 1278 debugfs_create_u64("written_back_pages", 0444, 1279 zswap_debugfs_root, &zswap_written_back_pages); 1280 debugfs_create_u64("duplicate_entry", 0444, 1281 zswap_debugfs_root, 
&zswap_duplicate_entry); 1282 debugfs_create_u64("pool_total_size", 0444, 1283 zswap_debugfs_root, &zswap_pool_total_size); 1284 debugfs_create_atomic_t("stored_pages", 0444, 1285 zswap_debugfs_root, &zswap_stored_pages); 1286 debugfs_create_atomic_t("same_filled_pages", 0444, 1287 zswap_debugfs_root, &zswap_same_filled_pages); 1288 1289 return 0; 1290 } 1291 1292 static void __exit zswap_debugfs_exit(void) 1293 { 1294 debugfs_remove_recursive(zswap_debugfs_root); 1295 } 1296 #else 1297 static int __init zswap_debugfs_init(void) 1298 { 1299 return 0; 1300 } 1301 1302 static void __exit zswap_debugfs_exit(void) { } 1303 #endif 1304 1305 /********************************* 1306 * module init and exit 1307 **********************************/ 1308 static int __init init_zswap(void) 1309 { 1310 struct zswap_pool *pool; 1311 int ret; 1312 1313 zswap_init_started = true; 1314 1315 if (zswap_entry_cache_create()) { 1316 pr_err("entry cache creation failed\n"); 1317 goto cache_fail; 1318 } 1319 1320 ret = cpuhp_setup_state(CPUHP_MM_ZSWP_MEM_PREPARE, "mm/zswap:prepare", 1321 zswap_dstmem_prepare, zswap_dstmem_dead); 1322 if (ret) { 1323 pr_err("dstmem alloc failed\n"); 1324 goto dstmem_fail; 1325 } 1326 1327 ret = cpuhp_setup_state_multi(CPUHP_MM_ZSWP_POOL_PREPARE, 1328 "mm/zswap_pool:prepare", 1329 zswap_cpu_comp_prepare, 1330 zswap_cpu_comp_dead); 1331 if (ret) 1332 goto hp_fail; 1333 1334 pool = __zswap_pool_create_fallback(); 1335 if (pool) { 1336 pr_info("loaded using pool %s/%s\n", pool->tfm_name, 1337 zpool_get_type(pool->zpool)); 1338 list_add(&pool->list, &zswap_pools); 1339 zswap_has_pool = true; 1340 } else { 1341 pr_err("pool creation failed\n"); 1342 zswap_enabled = false; 1343 } 1344 1345 frontswap_register_ops(&zswap_frontswap_ops); 1346 if (zswap_debugfs_init()) 1347 pr_warn("debugfs initialization failed\n"); 1348 return 0; 1349 1350 hp_fail: 1351 cpuhp_remove_state(CPUHP_MM_ZSWP_MEM_PREPARE); 1352 dstmem_fail: 1353 zswap_entry_cache_destroy(); 1354 
cache_fail: 1355 /* if built-in, we aren't unloaded on failure; don't allow use */ 1356 zswap_init_failed = true; 1357 zswap_enabled = false; 1358 return -ENOMEM; 1359 } 1360 /* must be late so crypto has time to come up */ 1361 late_initcall(init_zswap); 1362 1363 MODULE_LICENSE("GPL"); 1364 MODULE_AUTHOR("Seth Jennings <sjennings@variantweb.net>"); 1365 MODULE_DESCRIPTION("Compressed cache for swap pages"); 1366