/*
 * zswap.c - zswap driver file
 *
 * zswap is a backend for frontswap that takes pages that are in the process
 * of being swapped out and attempts to compress and store them in a
 * RAM-based memory pool.  This can result in a significant I/O reduction on
 * the swap device and, in the case where decompressing from RAM is faster
 * than reading from the swap device, can also improve workload performance.
 *
 * Copyright (C) 2012  Seth Jennings <sjenning@linux.vnet.ibm.com>
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version 2
 * of the License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/module.h>
#include <linux/cpu.h>
#include <linux/highmem.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/types.h>
#include <linux/atomic.h>
#include <linux/frontswap.h>
#include <linux/rbtree.h>
#include <linux/swap.h>
#include <linux/crypto.h>
#include <linux/mempool.h>
#include <linux/zpool.h>

#include <linux/mm_types.h>
#include <linux/page-flags.h>
#include <linux/swapops.h>
#include <linux/writeback.h>
#include <linux/pagemap.h>

/*********************************
* statistics
**********************************/
/* Total bytes used by the compressed storage */
static u64 zswap_pool_total_size;
/* The number of compressed pages currently stored in zswap */
static atomic_t zswap_stored_pages = ATOMIC_INIT(0);

/*
 * The statistics below are not protected from concurrent access for
 * performance reasons so they may not be 100% accurate.  However,
 * they do provide useful information on roughly how many times a
 * certain event is occurring.
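 *
 * As a rough example, the overall compression ratio can be estimated from
 * the two counters above as:
 *
 *	zswap_stored_pages * PAGE_SIZE / zswap_pool_total_size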
 */

/* Pool limit was hit (see zswap_max_pool_percent) */
static u64 zswap_pool_limit_hit;
/* Pages written back when pool limit was reached */
static u64 zswap_written_back_pages;
/* Store failed due to a reclaim failure after pool limit was reached */
static u64 zswap_reject_reclaim_fail;
/* Compressed page was too big for the allocator to (optimally) store */
static u64 zswap_reject_compress_poor;
/* Store failed because underlying allocator could not get memory */
static u64 zswap_reject_alloc_fail;
/* Store failed because the entry metadata could not be allocated (rare) */
static u64 zswap_reject_kmemcache_fail;
/* Duplicate store was encountered (rare) */
static u64 zswap_duplicate_entry;

/*********************************
* tunables
**********************************/
/* Enable/disable zswap (disabled by default, fixed at boot for now) */
static bool zswap_enabled __read_mostly;
module_param_named(enabled, zswap_enabled, bool, 0444);

/* Compressor to be used by zswap (fixed at boot for now) */
#define ZSWAP_COMPRESSOR_DEFAULT "lzo"
static char *zswap_compressor = ZSWAP_COMPRESSOR_DEFAULT;
module_param_named(compressor, zswap_compressor, charp, 0444);

/* The maximum percentage of memory that the compressed pool can occupy */
static unsigned int zswap_max_pool_percent = 20;
module_param_named(max_pool_percent,
		zswap_max_pool_percent, uint, 0644);

/* Compressed storage to use */
#define ZSWAP_ZPOOL_DEFAULT "zbud"
static char *zswap_zpool_type = ZSWAP_ZPOOL_DEFAULT;
module_param_named(zpool, zswap_zpool_type, charp, 0444);

/* A single zpool is shared by the whole zswap backend */
static struct zpool *zswap_pool;
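
/*
 * For example, since zswap is built into the kernel, these module parameters
 * are typically set on the kernel command line, e.g.:
 *
 *	zswap.enabled=1 zswap.compressor=lzo zswap.zpool=zbud
 *	zswap.max_pool_percent=20
 *
 * max_pool_percent (mode 0644) can also be changed at runtime via
 * /sys/module/zswap/parameters/max_pool_percent.
 */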

/*********************************
* compression functions
**********************************/
/* per-cpu compression transforms */
static struct crypto_comp * __percpu *zswap_comp_pcpu_tfms;

enum comp_op {
	ZSWAP_COMPOP_COMPRESS,
	ZSWAP_COMPOP_DECOMPRESS
};

static int zswap_comp_op(enum comp_op op, const u8 *src, unsigned int slen,
				u8 *dst, unsigned int *dlen)
{
	struct crypto_comp *tfm;
	int ret;

	tfm = *per_cpu_ptr(zswap_comp_pcpu_tfms, get_cpu());
	switch (op) {
	case ZSWAP_COMPOP_COMPRESS:
		ret = crypto_comp_compress(tfm, src, slen, dst, dlen);
		break;
	case ZSWAP_COMPOP_DECOMPRESS:
		ret = crypto_comp_decompress(tfm, src, slen, dst, dlen);
		break;
	default:
		ret = -EINVAL;
	}

	put_cpu();
	return ret;
}

static int __init zswap_comp_init(void)
{
	if (!crypto_has_comp(zswap_compressor, 0, 0)) {
		pr_info("%s compressor not available\n", zswap_compressor);
		/* fall back to default compressor */
		zswap_compressor = ZSWAP_COMPRESSOR_DEFAULT;
		if (!crypto_has_comp(zswap_compressor, 0, 0))
			/* can't even load the default compressor */
			return -ENODEV;
	}
	pr_info("using %s compressor\n", zswap_compressor);

	/* alloc percpu transforms */
	zswap_comp_pcpu_tfms = alloc_percpu(struct crypto_comp *);
	if (!zswap_comp_pcpu_tfms)
		return -ENOMEM;
	return 0;
}

static void zswap_comp_exit(void)
{
	/* free percpu transforms */
	if (zswap_comp_pcpu_tfms)
		free_percpu(zswap_comp_pcpu_tfms);
}

/*********************************
* data structures
**********************************/
/*
 * struct zswap_entry
 *
 * This structure contains the metadata for tracking a single compressed
 * page within zswap.
 *
 * rbnode - links the entry into red-black tree for the appropriate swap type
 * refcount - the number of outstanding references to the entry. This is
 *            needed to protect against premature freeing of the entry by
 *            concurrent calls to load, invalidate, and writeback.  The lock
 *            for the zswap_tree structure that contains the entry must
 *            be held while changing the refcount.  Since the lock must
 *            be held, there is no reason to also make refcount atomic.
 * offset - the swap offset for the entry.  Index into the red-black tree.
 * handle - zpool allocation handle that stores the compressed page data
 * length - the length in bytes of the compressed page data.  Needed during
 *          decompression
 */
struct zswap_entry {
	struct rb_node rbnode;
	pgoff_t offset;
	int refcount;
	unsigned int length;
	unsigned long handle;
};

struct zswap_header {
	swp_entry_t swpentry;
};

/*
 * The tree lock in the zswap_tree struct protects a few things:
 * - the rbtree
 * - the refcount field of each entry in the tree
 */
struct zswap_tree {
	struct rb_root rbroot;
	spinlock_t lock;
};

static struct zswap_tree *zswap_trees[MAX_SWAPFILES];

/*********************************
* zswap entry functions
**********************************/
static struct kmem_cache *zswap_entry_cache;

static int zswap_entry_cache_create(void)
{
	zswap_entry_cache = KMEM_CACHE(zswap_entry, 0);
	return zswap_entry_cache == NULL;
}

static void __init zswap_entry_cache_destroy(void)
{
	kmem_cache_destroy(zswap_entry_cache);
}

static struct zswap_entry *zswap_entry_cache_alloc(gfp_t gfp)
{
	struct zswap_entry *entry;
	entry = kmem_cache_alloc(zswap_entry_cache, gfp);
	if (!entry)
		return NULL;
	entry->refcount = 1;
	RB_CLEAR_NODE(&entry->rbnode);
	return entry;
}

static void zswap_entry_cache_free(struct zswap_entry *entry)
{
	kmem_cache_free(zswap_entry_cache, entry);
}

/*********************************
* rbtree functions
**********************************/
static struct zswap_entry *zswap_rb_search(struct rb_root *root, pgoff_t offset)
{
	struct rb_node *node = root->rb_node;
	struct zswap_entry *entry;

	while (node) {
		entry = rb_entry(node, struct zswap_entry, rbnode);
		if (entry->offset > offset)
			node = node->rb_left;
		else if (entry->offset < offset)
			node = node->rb_right;
		else
			return entry;
	}
	return NULL;
}

/*
 * In the case that an entry with the same offset is found, a pointer to
 * the existing entry is stored in dupentry and the function returns -EEXIST
 */
static int zswap_rb_insert(struct rb_root *root, struct zswap_entry *entry,
			struct zswap_entry **dupentry)
{
	struct rb_node **link = &root->rb_node, *parent = NULL;
	struct zswap_entry *myentry;

	while (*link) {
		parent = *link;
		myentry = rb_entry(parent, struct zswap_entry, rbnode);
		if (myentry->offset > entry->offset)
			link = &(*link)->rb_left;
		else if (myentry->offset < entry->offset)
			link = &(*link)->rb_right;
		else {
			*dupentry = myentry;
			return -EEXIST;
		}
	}
	rb_link_node(&entry->rbnode, parent, link);
	rb_insert_color(&entry->rbnode, root);
	return 0;
}
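
/*
 * Remove the entry from the tree if it is still linked; RB_CLEAR_NODE()
 * makes a second erase of the same entry a harmless no-op.
 */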
static void zswap_rb_erase(struct rb_root *root, struct zswap_entry *entry)
{
	if (!RB_EMPTY_NODE(&entry->rbnode)) {
		rb_erase(&entry->rbnode, root);
		RB_CLEAR_NODE(&entry->rbnode);
	}
}

/*
 * Carries out the common pattern of freeing an entry's zpool allocation,
 * freeing the entry itself, and decrementing the number of stored pages.
 */
static void zswap_free_entry(struct zswap_entry *entry)
{
	zpool_free(zswap_pool, entry->handle);
	zswap_entry_cache_free(entry);
	atomic_dec(&zswap_stored_pages);
	zswap_pool_total_size = zpool_get_total_size(zswap_pool);
}

/* caller must hold the tree lock */
static void zswap_entry_get(struct zswap_entry *entry)
{
	entry->refcount++;
}

/* caller must hold the tree lock
 * remove from the tree and free it, if nobody references the entry
 */
static void zswap_entry_put(struct zswap_tree *tree,
			struct zswap_entry *entry)
{
	int refcount = --entry->refcount;

	BUG_ON(refcount < 0);
	if (refcount == 0) {
		zswap_rb_erase(&tree->rbroot, entry);
		zswap_free_entry(entry);
	}
}

/* caller must hold the tree lock */
static struct zswap_entry *zswap_entry_find_get(struct rb_root *root,
				pgoff_t offset)
{
	struct zswap_entry *entry = NULL;

	entry = zswap_rb_search(root, offset);
	if (entry)
		zswap_entry_get(entry);

	return entry;
}

/*********************************
* per-cpu code
**********************************/
static DEFINE_PER_CPU(u8 *, zswap_dstmem);

static int __zswap_cpu_notifier(unsigned long action, unsigned long cpu)
{
	struct crypto_comp *tfm;
	u8 *dst;

	switch (action) {
	case CPU_UP_PREPARE:
		tfm = crypto_alloc_comp(zswap_compressor, 0, 0);
		if (IS_ERR(tfm)) {
			pr_err("can't allocate compressor transform\n");
			return NOTIFY_BAD;
		}
		*per_cpu_ptr(zswap_comp_pcpu_tfms, cpu) = tfm;
		dst = kmalloc_node(PAGE_SIZE * 2, GFP_KERNEL, cpu_to_node(cpu));
		if (!dst) {
			pr_err("can't allocate compressor buffer\n");
			crypto_free_comp(tfm);
			*per_cpu_ptr(zswap_comp_pcpu_tfms, cpu) = NULL;
			return NOTIFY_BAD;
		}
		per_cpu(zswap_dstmem, cpu) = dst;
		break;
	case CPU_DEAD:
	case CPU_UP_CANCELED:
		tfm = *per_cpu_ptr(zswap_comp_pcpu_tfms, cpu);
		if (tfm) {
			crypto_free_comp(tfm);
			*per_cpu_ptr(zswap_comp_pcpu_tfms, cpu) = NULL;
		}
		dst = per_cpu(zswap_dstmem, cpu);
		kfree(dst);
		per_cpu(zswap_dstmem, cpu) = NULL;
		break;
	default:
		break;
	}
	return NOTIFY_OK;
}

static int zswap_cpu_notifier(struct notifier_block *nb,
				unsigned long action, void *pcpu)
{
	unsigned long cpu = (unsigned long)pcpu;
	return __zswap_cpu_notifier(action, cpu);
}

static struct notifier_block zswap_cpu_notifier_block = {
	.notifier_call = zswap_cpu_notifier
};

static int zswap_cpu_init(void)
{
	unsigned long cpu;

	cpu_notifier_register_begin();
	for_each_online_cpu(cpu)
		if (__zswap_cpu_notifier(CPU_UP_PREPARE, cpu) != NOTIFY_OK)
			goto cleanup;
	__register_cpu_notifier(&zswap_cpu_notifier_block);
	cpu_notifier_register_done();
	return 0;

cleanup:
	for_each_online_cpu(cpu)
		__zswap_cpu_notifier(CPU_UP_CANCELED, cpu);
	cpu_notifier_register_done();
	return -ENOMEM;
}

/*********************************
* helpers
**********************************/
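/*
 * Returns true once the compressed pool has grown beyond
 * zswap_max_pool_percent percent of total system RAM.
 */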
static bool zswap_is_full(void)
{
	return totalram_pages * zswap_max_pool_percent / 100 <
			DIV_ROUND_UP(zswap_pool_total_size, PAGE_SIZE);
}

/*********************************
* writeback code
**********************************/
/* return enum for zswap_get_swap_cache_page */
enum zswap_get_swap_ret {
	ZSWAP_SWAPCACHE_NEW,
	ZSWAP_SWAPCACHE_EXIST,
	ZSWAP_SWAPCACHE_FAIL,
};

/*
 * zswap_get_swap_cache_page
 *
 * This is an adaptation of read_swap_cache_async()
 *
 * This function tries to find a page with the given swap entry
 * in the swapper_space address space (the swap cache).  If the page
 * is found, it is returned in retpage.  Otherwise, a page is allocated,
 * added to the swap cache, and returned in retpage.
 *
 * On success, the swap cache page is returned in retpage
 * Returns ZSWAP_SWAPCACHE_EXIST if page was already in the swap cache
 * Returns ZSWAP_SWAPCACHE_NEW if the new page needs to be populated,
 *     the new page is added to swapcache and locked
 * Returns ZSWAP_SWAPCACHE_FAIL on error
 */
static int zswap_get_swap_cache_page(swp_entry_t entry,
				struct page **retpage)
{
	struct page *found_page, *new_page = NULL;
	struct address_space *swapper_space = swap_address_space(entry);
	int err;

	*retpage = NULL;
	do {
		/*
		 * First check the swap cache.  Since this is normally
		 * called after lookup_swap_cache() failed, re-calling
		 * that would confuse statistics.
		 */
		found_page = find_get_page(swapper_space, entry.val);
		if (found_page)
			break;

		/*
		 * Get a new page to read into from swap.
		 */
		if (!new_page) {
			new_page = alloc_page(GFP_KERNEL);
			if (!new_page)
				break; /* Out of memory */
		}

		/*
		 * call radix_tree_preload() while we can wait.
		 */
		err = radix_tree_preload(GFP_KERNEL);
		if (err)
			break;

		/*
		 * Swap entry may have been freed since our caller observed it.
		 */
		err = swapcache_prepare(entry);
		if (err == -EEXIST) { /* seems racy */
			radix_tree_preload_end();
			continue;
		}
		if (err) { /* swp entry is obsolete ? */
			radix_tree_preload_end();
			break;
		}

		/* May fail (-ENOMEM) if radix-tree node allocation failed. */
		__set_page_locked(new_page);
		SetPageSwapBacked(new_page);
		err = __add_to_swap_cache(new_page, entry);
		if (likely(!err)) {
			radix_tree_preload_end();
			lru_cache_add_anon(new_page);
			*retpage = new_page;
			return ZSWAP_SWAPCACHE_NEW;
		}
		radix_tree_preload_end();
		ClearPageSwapBacked(new_page);
		__clear_page_locked(new_page);
		/*
		 * add_to_swap_cache() doesn't return -EEXIST, so we can safely
		 * clear SWAP_HAS_CACHE flag.
		 */
		swapcache_free(entry);
	} while (err != -ENOMEM);

	if (new_page)
		page_cache_release(new_page);
	if (!found_page)
		return ZSWAP_SWAPCACHE_FAIL;
	*retpage = found_page;
	return ZSWAP_SWAPCACHE_EXIST;
}

/*
 * Attempts to free an entry by adding a page to the swap cache,
 * decompressing the entry data into the page, and issuing a
 * bio write to write the page back to the swap device.
 *
 * This can be thought of as a "resumed writeback" of the page
 * to the swap device.  We are basically resuming the same swap
 * writeback path that was intercepted with the frontswap_store()
 * in the first place.  After the page has been decompressed into
 * the swap cache, the compressed version stored by zswap can be
 * freed.
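 *
 * This function is wired up as the eviction callback in zswap_zpool_ops,
 * so the zpool driver calls it when zswap asks the pool to shrink.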
 */
static int zswap_writeback_entry(struct zpool *pool, unsigned long handle)
{
	struct zswap_header *zhdr;
	swp_entry_t swpentry;
	struct zswap_tree *tree;
	pgoff_t offset;
	struct zswap_entry *entry;
	struct page *page;
	u8 *src, *dst;
	unsigned int dlen;
	int ret;
	struct writeback_control wbc = {
		.sync_mode = WB_SYNC_NONE,
	};

	/* extract swpentry from data */
	zhdr = zpool_map_handle(pool, handle, ZPOOL_MM_RO);
	swpentry = zhdr->swpentry; /* here */
	zpool_unmap_handle(pool, handle);
	tree = zswap_trees[swp_type(swpentry)];
	offset = swp_offset(swpentry);

	/* find and ref zswap entry */
	spin_lock(&tree->lock);
	entry = zswap_entry_find_get(&tree->rbroot, offset);
	if (!entry) {
		/* entry was invalidated */
		spin_unlock(&tree->lock);
		return 0;
	}
	spin_unlock(&tree->lock);
	BUG_ON(offset != entry->offset);

	/* try to allocate swap cache page */
	switch (zswap_get_swap_cache_page(swpentry, &page)) {
	case ZSWAP_SWAPCACHE_FAIL: /* no memory or invalidate happened */
		ret = -ENOMEM;
		goto fail;

	case ZSWAP_SWAPCACHE_EXIST:
		/* page is already in the swap cache, ignore for now */
		page_cache_release(page);
		ret = -EEXIST;
		goto fail;

	case ZSWAP_SWAPCACHE_NEW: /* page is locked */
		/* decompress */
		dlen = PAGE_SIZE;
		src = (u8 *)zpool_map_handle(zswap_pool, entry->handle,
				ZPOOL_MM_RO) + sizeof(struct zswap_header);
		dst = kmap_atomic(page);
		ret = zswap_comp_op(ZSWAP_COMPOP_DECOMPRESS, src,
				entry->length, dst, &dlen);
		kunmap_atomic(dst);
		zpool_unmap_handle(zswap_pool, entry->handle);
		BUG_ON(ret);
		BUG_ON(dlen != PAGE_SIZE);

		/* page is up to date */
		SetPageUptodate(page);
	}

	/* move it to the tail of the inactive list after end_writeback */
	SetPageReclaim(page);

	/* start writeback */
	__swap_writepage(page, &wbc, end_swap_bio_write);
	page_cache_release(page);
	zswap_written_back_pages++;

	spin_lock(&tree->lock);
	/* drop local reference */
	zswap_entry_put(tree, entry);

	/*
	 * There are two possible situations for the entry here:
	 * (1) refcount is 1 (normal case), entry is valid and on the tree
	 * (2) refcount is 0, entry is freed and not on the tree
	 *     because invalidate happened during writeback
	 * Search the tree and free the entry if it is still found.
	 */
	if (entry == zswap_rb_search(&tree->rbroot, offset))
		zswap_entry_put(tree, entry);
	spin_unlock(&tree->lock);

	goto end;

	/*
	 * If we get here due to ZSWAP_SWAPCACHE_EXIST, a load may be happening
	 * concurrently, so it is safe and okay to not free the entry here.
	 * If the following put does end up freeing the entry, it is still
	 * okay to return !0.
	 */
fail:
	spin_lock(&tree->lock);
	zswap_entry_put(tree, entry);
	spin_unlock(&tree->lock);

end:
	return ret;
}

/*********************************
* frontswap hooks
**********************************/
/* attempts to compress and store a single page */
static int zswap_frontswap_store(unsigned type, pgoff_t offset,
				struct page *page)
{
	struct zswap_tree *tree = zswap_trees[type];
	struct zswap_entry *entry, *dupentry;
	int ret;
	unsigned int dlen = PAGE_SIZE, len;
	unsigned long handle;
	char *buf;
	u8 *src, *dst;
	struct zswap_header *zhdr;

	if (!tree) {
		ret = -ENODEV;
		goto reject;
	}
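
	/*
	 * When the pool is over its size limit, zpool_shrink() below asks the
	 * allocator to evict one entry, which ends up calling
	 * zswap_writeback_entry().  If that fails, the store is rejected and
	 * the page takes the normal (uncompressed) swap path instead.
	 */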
	/* reclaim space if needed */
	if (zswap_is_full()) {
		zswap_pool_limit_hit++;
		if (zpool_shrink(zswap_pool, 1, NULL)) {
			zswap_reject_reclaim_fail++;
			ret = -ENOMEM;
			goto reject;
		}
	}

	/* allocate entry */
	entry = zswap_entry_cache_alloc(GFP_KERNEL);
	if (!entry) {
		zswap_reject_kmemcache_fail++;
		ret = -ENOMEM;
		goto reject;
	}

	/* compress */
	dst = get_cpu_var(zswap_dstmem);
	src = kmap_atomic(page);
	ret = zswap_comp_op(ZSWAP_COMPOP_COMPRESS, src, PAGE_SIZE, dst, &dlen);
	kunmap_atomic(src);
	if (ret) {
		ret = -EINVAL;
		goto freepage;
	}

	/* store */
	len = dlen + sizeof(struct zswap_header);
	ret = zpool_malloc(zswap_pool, len, __GFP_NORETRY | __GFP_NOWARN,
			&handle);
	if (ret == -ENOSPC) {
		zswap_reject_compress_poor++;
		goto freepage;
	}
	if (ret) {
		zswap_reject_alloc_fail++;
		goto freepage;
	}
	zhdr = zpool_map_handle(zswap_pool, handle, ZPOOL_MM_RW);
	zhdr->swpentry = swp_entry(type, offset);
	buf = (u8 *)(zhdr + 1);
	memcpy(buf, dst, dlen);
	zpool_unmap_handle(zswap_pool, handle);
	put_cpu_var(zswap_dstmem);

	/* populate entry */
	entry->offset = offset;
	entry->handle = handle;
	entry->length = dlen;

	/* map */
	spin_lock(&tree->lock);
	do {
		ret = zswap_rb_insert(&tree->rbroot, entry, &dupentry);
		if (ret == -EEXIST) {
			zswap_duplicate_entry++;
			/* remove from rbtree */
			zswap_rb_erase(&tree->rbroot, dupentry);
			zswap_entry_put(tree, dupentry);
		}
	} while (ret == -EEXIST);
	spin_unlock(&tree->lock);

	/* update stats */
	atomic_inc(&zswap_stored_pages);
	zswap_pool_total_size = zpool_get_total_size(zswap_pool);

	return 0;

freepage:
	put_cpu_var(zswap_dstmem);
	zswap_entry_cache_free(entry);
reject:
	return ret;
}

/*
 * returns 0 if the page was successfully decompressed
 * returns -1 on entry not found or error
 */
static int zswap_frontswap_load(unsigned type, pgoff_t offset,
				struct page *page)
{
	struct zswap_tree *tree = zswap_trees[type];
	struct zswap_entry *entry;
	u8 *src, *dst;
	unsigned int dlen;
	int ret;

	/* find */
	spin_lock(&tree->lock);
	entry = zswap_entry_find_get(&tree->rbroot, offset);
	if (!entry) {
		/* entry was written back */
		spin_unlock(&tree->lock);
		return -1;
	}
	spin_unlock(&tree->lock);

	/* decompress */
	dlen = PAGE_SIZE;
	src = (u8 *)zpool_map_handle(zswap_pool, entry->handle,
			ZPOOL_MM_RO) + sizeof(struct zswap_header);
	dst = kmap_atomic(page);
	ret = zswap_comp_op(ZSWAP_COMPOP_DECOMPRESS, src, entry->length,
		dst, &dlen);
	kunmap_atomic(dst);
	zpool_unmap_handle(zswap_pool, entry->handle);
	BUG_ON(ret);

	spin_lock(&tree->lock);
	zswap_entry_put(tree, entry);
	spin_unlock(&tree->lock);

	return 0;
}

/* frees an entry in zswap */
static void zswap_frontswap_invalidate_page(unsigned type, pgoff_t offset)
{
	struct zswap_tree *tree = zswap_trees[type];
	struct zswap_entry *entry;

	/* find */
	spin_lock(&tree->lock);
	entry = zswap_rb_search(&tree->rbroot, offset);
	if (!entry) {
		/* entry was written back */
		spin_unlock(&tree->lock);
		return;
	}

	/* remove from rbtree */
	zswap_rb_erase(&tree->rbroot, entry);

	/* drop the initial reference from entry creation */
	zswap_entry_put(tree, entry);
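
	/*
	 * Unless a concurrent load or writeback still holds a reference,
	 * the put above has already freed the entry and its zpool space.
	 */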
	spin_unlock(&tree->lock);
}

/* frees all zswap entries for the given swap type */
static void zswap_frontswap_invalidate_area(unsigned type)
{
	struct zswap_tree *tree = zswap_trees[type];
	struct zswap_entry *entry, *n;

	if (!tree)
		return;

	/* walk the tree and free everything */
	spin_lock(&tree->lock);
	rbtree_postorder_for_each_entry_safe(entry, n, &tree->rbroot, rbnode)
		zswap_free_entry(entry);
	tree->rbroot = RB_ROOT;
	spin_unlock(&tree->lock);
	kfree(tree);
	zswap_trees[type] = NULL;
}

static struct zpool_ops zswap_zpool_ops = {
	.evict = zswap_writeback_entry
};

static void zswap_frontswap_init(unsigned type)
{
	struct zswap_tree *tree;

	tree = kzalloc(sizeof(struct zswap_tree), GFP_KERNEL);
	if (!tree) {
		pr_err("alloc failed, zswap disabled for swap type %d\n", type);
		return;
	}

	tree->rbroot = RB_ROOT;
	spin_lock_init(&tree->lock);
	zswap_trees[type] = tree;
}

static struct frontswap_ops zswap_frontswap_ops = {
	.store = zswap_frontswap_store,
	.load = zswap_frontswap_load,
	.invalidate_page = zswap_frontswap_invalidate_page,
	.invalidate_area = zswap_frontswap_invalidate_area,
	.init = zswap_frontswap_init
};

/*********************************
* debugfs functions
**********************************/
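/*
 * With CONFIG_DEBUG_FS enabled, the counters declared at the top of this
 * file are exported read-only under the debugfs mount point, typically
 * /sys/kernel/debug/zswap/ (e.g. pool_total_size and stored_pages).
 */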
#ifdef CONFIG_DEBUG_FS
#include <linux/debugfs.h>

static struct dentry *zswap_debugfs_root;

static int __init zswap_debugfs_init(void)
{
	if (!debugfs_initialized())
		return -ENODEV;

	zswap_debugfs_root = debugfs_create_dir("zswap", NULL);
	if (!zswap_debugfs_root)
		return -ENOMEM;

	debugfs_create_u64("pool_limit_hit", S_IRUGO,
			zswap_debugfs_root, &zswap_pool_limit_hit);
	debugfs_create_u64("reject_reclaim_fail", S_IRUGO,
			zswap_debugfs_root, &zswap_reject_reclaim_fail);
	debugfs_create_u64("reject_alloc_fail", S_IRUGO,
			zswap_debugfs_root, &zswap_reject_alloc_fail);
	debugfs_create_u64("reject_kmemcache_fail", S_IRUGO,
			zswap_debugfs_root, &zswap_reject_kmemcache_fail);
	debugfs_create_u64("reject_compress_poor", S_IRUGO,
			zswap_debugfs_root, &zswap_reject_compress_poor);
	debugfs_create_u64("written_back_pages", S_IRUGO,
			zswap_debugfs_root, &zswap_written_back_pages);
	debugfs_create_u64("duplicate_entry", S_IRUGO,
			zswap_debugfs_root, &zswap_duplicate_entry);
	debugfs_create_u64("pool_total_size", S_IRUGO,
			zswap_debugfs_root, &zswap_pool_total_size);
	debugfs_create_atomic_t("stored_pages", S_IRUGO,
			zswap_debugfs_root, &zswap_stored_pages);

	return 0;
}

static void __exit zswap_debugfs_exit(void)
{
	debugfs_remove_recursive(zswap_debugfs_root);
}
#else
static int __init zswap_debugfs_init(void)
{
	return 0;
}

static void __exit zswap_debugfs_exit(void) { }
#endif

/*********************************
* module init and exit
**********************************/
static int __init init_zswap(void)
{
	gfp_t gfp = __GFP_NORETRY | __GFP_NOWARN;

	if (!zswap_enabled)
		return 0;

	pr_info("loading zswap\n");

	zswap_pool = zpool_create_pool(zswap_zpool_type, gfp, &zswap_zpool_ops);
	if (!zswap_pool && strcmp(zswap_zpool_type, ZSWAP_ZPOOL_DEFAULT)) {
		pr_info("%s zpool not available\n", zswap_zpool_type);
		zswap_zpool_type = ZSWAP_ZPOOL_DEFAULT;
		zswap_pool = zpool_create_pool(zswap_zpool_type, gfp,
					&zswap_zpool_ops);
	}
	if (!zswap_pool) {
		pr_err("%s zpool not available\n", zswap_zpool_type);
		pr_err("zpool creation failed\n");
		goto error;
	}
	pr_info("using %s pool\n", zswap_zpool_type);

	if (zswap_entry_cache_create()) {
		pr_err("entry cache creation failed\n");
		goto cachefail;
	}
	if (zswap_comp_init()) {
		pr_err("compressor initialization failed\n");
		goto compfail;
	}
	if (zswap_cpu_init()) {
		pr_err("per-cpu initialization failed\n");
		goto pcpufail;
	}

	frontswap_register_ops(&zswap_frontswap_ops);
	if (zswap_debugfs_init())
		pr_warn("debugfs initialization failed\n");
	return 0;
pcpufail:
	zswap_comp_exit();
compfail:
	zswap_entry_cache_destroy();
cachefail:
	zpool_destroy_pool(zswap_pool);
error:
	return -ENOMEM;
}
/* must be late so crypto has time to come up */
late_initcall(init_zswap);

MODULE_LICENSE("GPL");
MODULE_AUTHOR("Seth Jennings <sjenning@linux.vnet.ibm.com>");
MODULE_DESCRIPTION("Compressed cache for swap pages");