// SPDX-License-Identifier: GPL-2.0
/*
 *  linux/mm/swap_state.c
 *
 *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
 *  Swap reorganised 29.12.95, Stephen Tweedie
 *
 *  Rewritten to use page cache, (C) 1998 Stephen Tweedie
 */
#include <linux/mm.h>
#include <linux/gfp.h>
#include <linux/kernel_stat.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/init.h>
#include <linux/pagemap.h>
#include <linux/backing-dev.h>
#include <linux/blkdev.h>
#include <linux/pagevec.h>
#include <linux/migrate.h>
#include <linux/vmalloc.h>
#include <linux/swap_slots.h>
#include <linux/huge_mm.h>

#include <asm/pgtable.h>

/*
 * swapper_space is a fiction, retained to simplify the path through
 * vmscan's shrink_page_list.
 */
static const struct address_space_operations swap_aops = {
	.writepage	= swap_writepage,
	.set_page_dirty	= swap_set_page_dirty,
#ifdef CONFIG_MIGRATION
	.migratepage	= migrate_page,
#endif
};

struct address_space *swapper_spaces[MAX_SWAPFILES] __read_mostly;
static unsigned int nr_swapper_spaces[MAX_SWAPFILES] __read_mostly;
bool swap_vma_readahead __read_mostly = true;

#define SWAP_RA_WIN_SHIFT	(PAGE_SHIFT / 2)
#define SWAP_RA_HITS_MASK	((1UL << SWAP_RA_WIN_SHIFT) - 1)
#define SWAP_RA_HITS_MAX	SWAP_RA_HITS_MASK
#define SWAP_RA_WIN_MASK	(~PAGE_MASK & ~SWAP_RA_HITS_MASK)

#define SWAP_RA_HITS(v)		((v) & SWAP_RA_HITS_MASK)
#define SWAP_RA_WIN(v)		(((v) & SWAP_RA_WIN_MASK) >> SWAP_RA_WIN_SHIFT)
#define SWAP_RA_ADDR(v)		((v) & PAGE_MASK)

#define SWAP_RA_VAL(addr, win, hits)				\
	(((addr) & PAGE_MASK) |					\
	 (((win) << SWAP_RA_WIN_SHIFT) & SWAP_RA_WIN_MASK) |	\
	 ((hits) & SWAP_RA_HITS_MASK))

/* Initial readahead hits is 4 to start up with a small window */
#define GET_SWAP_RA_VAL(vma)					\
	(atomic_long_read(&(vma)->swap_readahead_info) ? : 4)
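
/*
 * Layout of the value packed by SWAP_RA_VAL() above, as an example: with
 * PAGE_SHIFT == 12, SWAP_RA_WIN_SHIFT is 6, so bits 0-5 of
 * vma->swap_readahead_info hold the readahead hit count (capped at
 * SWAP_RA_HITS_MAX == 63), bits 6-11 hold the previous readahead window
 * size, and the remaining high bits hold the page-aligned address of the
 * previous swapin fault.
 */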

#define INC_CACHE_INFO(x)	do { swap_cache_info.x++; } while (0)
#define ADD_CACHE_INFO(x, nr)	do { swap_cache_info.x += (nr); } while (0)

static struct {
	unsigned long add_total;
	unsigned long del_total;
	unsigned long find_success;
	unsigned long find_total;
} swap_cache_info;

unsigned long total_swapcache_pages(void)
{
	unsigned int i, j, nr;
	unsigned long ret = 0;
	struct address_space *spaces;

	rcu_read_lock();
	for (i = 0; i < MAX_SWAPFILES; i++) {
		/*
		 * The corresponding entries in nr_swapper_spaces and
		 * swapper_spaces will be reused only after at least
		 * one grace period.  So it is impossible for them
		 * to belong to different usages.
		 */
		nr = nr_swapper_spaces[i];
		spaces = rcu_dereference(swapper_spaces[i]);
		if (!nr || !spaces)
			continue;
		for (j = 0; j < nr; j++)
			ret += spaces[j].nrpages;
	}
	rcu_read_unlock();
	return ret;
}

static atomic_t swapin_readahead_hits = ATOMIC_INIT(4);

void show_swap_cache_info(void)
{
	printk("%lu pages in swap cache\n", total_swapcache_pages());
	printk("Swap cache stats: add %lu, delete %lu, find %lu/%lu\n",
		swap_cache_info.add_total, swap_cache_info.del_total,
		swap_cache_info.find_success, swap_cache_info.find_total);
	printk("Free swap  = %ldkB\n",
		get_nr_swap_pages() << (PAGE_SHIFT - 10));
	printk("Total swap = %lukB\n", total_swap_pages << (PAGE_SHIFT - 10));
}

/*
 * __add_to_swap_cache resembles add_to_page_cache_locked on swapper_space,
 * but sets SwapCache flag and private instead of mapping and index.
 */
int __add_to_swap_cache(struct page *page, swp_entry_t entry)
{
	int error, i, nr = hpage_nr_pages(page);
	struct address_space *address_space;
	pgoff_t idx = swp_offset(entry);

	VM_BUG_ON_PAGE(!PageLocked(page), page);
	VM_BUG_ON_PAGE(PageSwapCache(page), page);
	VM_BUG_ON_PAGE(!PageSwapBacked(page), page);

	page_ref_add(page, nr);
	SetPageSwapCache(page);

	address_space = swap_address_space(entry);
	spin_lock_irq(&address_space->tree_lock);
	for (i = 0; i < nr; i++) {
		set_page_private(page + i, entry.val + i);
		error = radix_tree_insert(&address_space->page_tree,
					  idx + i, page + i);
		if (unlikely(error))
			break;
	}
	if (likely(!error)) {
		address_space->nrpages += nr;
		__mod_node_page_state(page_pgdat(page), NR_FILE_PAGES, nr);
		ADD_CACHE_INFO(add_total, nr);
	} else {
		/*
		 * Only the context which has set the SWAP_HAS_CACHE flag
		 * would call add_to_swap_cache(), so add_to_swap_cache()
		 * doesn't return -EEXIST.
		 */
		VM_BUG_ON(error == -EEXIST);
		set_page_private(page + i, 0UL);
		while (i--) {
			radix_tree_delete(&address_space->page_tree, idx + i);
			set_page_private(page + i, 0UL);
		}
		ClearPageSwapCache(page);
		page_ref_sub(page, nr);
	}
	spin_unlock_irq(&address_space->tree_lock);

	return error;
}
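
/*
 * add_to_swap_cache() preloads enough radix tree nodes for the page's
 * compound order, so that a THP's slots can all be populated under the
 * tree_lock in __add_to_swap_cache() above without having to allocate
 * nodes with atomic GFP flags.
 */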

int add_to_swap_cache(struct page *page, swp_entry_t entry, gfp_t gfp_mask)
{
	int error;

	error = radix_tree_maybe_preload_order(gfp_mask, compound_order(page));
	if (!error) {
		error = __add_to_swap_cache(page, entry);
		radix_tree_preload_end();
	}
	return error;
}

/*
 * This must be called only on pages that have
 * been verified to be in the swap cache.
 */
void __delete_from_swap_cache(struct page *page)
{
	struct address_space *address_space;
	int i, nr = hpage_nr_pages(page);
	swp_entry_t entry;
	pgoff_t idx;

	VM_BUG_ON_PAGE(!PageLocked(page), page);
	VM_BUG_ON_PAGE(!PageSwapCache(page), page);
	VM_BUG_ON_PAGE(PageWriteback(page), page);

	entry.val = page_private(page);
	address_space = swap_address_space(entry);
	idx = swp_offset(entry);
	for (i = 0; i < nr; i++) {
		radix_tree_delete(&address_space->page_tree, idx + i);
		set_page_private(page + i, 0);
	}
	ClearPageSwapCache(page);
	address_space->nrpages -= nr;
	__mod_node_page_state(page_pgdat(page), NR_FILE_PAGES, -nr);
	ADD_CACHE_INFO(del_total, nr);
}

/**
 * add_to_swap - allocate swap space for a page
 * @page: page we want to move to swap
 *
 * Allocate swap space for the page and add the page to the
 * swap cache.  Caller needs to hold the page lock.
 */
int add_to_swap(struct page *page)
{
	swp_entry_t entry;
	int err;

	VM_BUG_ON_PAGE(!PageLocked(page), page);
	VM_BUG_ON_PAGE(!PageUptodate(page), page);

	entry = get_swap_page(page);
	if (!entry.val)
		return 0;

	if (mem_cgroup_try_charge_swap(page, entry))
		goto fail;

	/*
	 * Radix-tree node allocations from PF_MEMALLOC contexts could
	 * completely exhaust the page allocator.  __GFP_NOMEMALLOC
	 * stops emergency reserves from being allocated.
	 *
	 * TODO: this could cause a theoretical memory reclaim
	 * deadlock in the swap out path.
	 */
	/*
	 * Add it to the swap cache.
	 */
	err = add_to_swap_cache(page, entry,
			__GFP_HIGH|__GFP_NOMEMALLOC|__GFP_NOWARN);
	/* -ENOMEM radix-tree allocation failure */
	if (err)
		/*
		 * add_to_swap_cache() doesn't return -EEXIST, so we can safely
		 * clear the SWAP_HAS_CACHE flag.
		 */
		goto fail;
	/*
	 * Normally the page will be dirtied in unmap because its pte should be
	 * dirty.  A special case is an MADV_FREE page.  The page's pte could
	 * have the dirty bit cleared while the page's SwapBacked bit is still
	 * set, because clearing the dirty bit and the SwapBacked bit is not
	 * protected by a lock.  For such a page, unmap will not set the dirty
	 * bit for it, so page reclaim will not write the page out.  This can
	 * cause data corruption when the page is swapped in later.  Always
	 * setting the dirty bit for the page solves the problem.
	 */
	set_page_dirty(page);

	return 1;

fail:
	put_swap_page(page, entry);
	return 0;
}
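
/*
 * Sketch of the typical caller, page reclaim (simplified from
 * shrink_page_list() in mm/vmscan.c): an anonymous, swap-backed page that
 * is not yet in the swap cache is handed to add_to_swap(); on failure the
 * page is kept and re-activated, on success reclaim goes on to unmap it
 * and write it out via pageout() -> swap_writepage().
 *
 *	if (PageAnon(page) && PageSwapBacked(page) && !PageSwapCache(page)) {
 *		if (!add_to_swap(page))
 *			goto activate_locked;
 *	}
 */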

/*
 * This must be called only on pages that have
 * been verified to be in the swap cache and locked.
 * It will never put the page into the free list,
 * the caller has a reference on the page.
 */
void delete_from_swap_cache(struct page *page)
{
	swp_entry_t entry;
	struct address_space *address_space;

	entry.val = page_private(page);

	address_space = swap_address_space(entry);
	spin_lock_irq(&address_space->tree_lock);
	__delete_from_swap_cache(page);
	spin_unlock_irq(&address_space->tree_lock);

	put_swap_page(page, entry);
	page_ref_sub(page, hpage_nr_pages(page));
}

/*
 * If we are the only user, then try to free up the swap cache.
 *
 * It's OK to check for PageSwapCache without the page lock
 * here because we are going to recheck again inside
 * try_to_free_swap() _with_ the lock.
 * 					- Marcelo
 */
static inline void free_swap_cache(struct page *page)
{
	if (PageSwapCache(page) && !page_mapped(page) && trylock_page(page)) {
		try_to_free_swap(page);
		unlock_page(page);
	}
}

/*
 * Perform a free_page(), also freeing any swap cache associated with
 * this page if it is the last user of the page.
 */
void free_page_and_swap_cache(struct page *page)
{
	free_swap_cache(page);
	if (!is_huge_zero_page(page))
		put_page(page);
}

/*
 * Passed an array of pages, drop them all from swapcache and then release
 * them.  They are removed from the LRU and freed if this is their last use.
 */
void free_pages_and_swap_cache(struct page **pages, int nr)
{
	struct page **pagep = pages;
	int i;

	lru_add_drain();
	for (i = 0; i < nr; i++)
		free_swap_cache(pagep[i]);
	release_pages(pagep, nr);
}

/*
 * Lookup a swap entry in the swap cache.  A found page will be returned
 * unlocked and with its refcount incremented - we rely on the kernel
 * lock getting page table operations atomic even if we drop the page
 * lock before returning.
 */
struct page *lookup_swap_cache(swp_entry_t entry, struct vm_area_struct *vma,
			       unsigned long addr)
{
	struct page *page;
	unsigned long ra_info;
	int win, hits, readahead;

	page = find_get_page(swap_address_space(entry), swp_offset(entry));

	INC_CACHE_INFO(find_total);
	if (page) {
		INC_CACHE_INFO(find_success);
		if (unlikely(PageTransCompound(page)))
			return page;
		readahead = TestClearPageReadahead(page);
		if (vma) {
			ra_info = GET_SWAP_RA_VAL(vma);
			win = SWAP_RA_WIN(ra_info);
			hits = SWAP_RA_HITS(ra_info);
			if (readahead)
				hits = min_t(int, hits + 1, SWAP_RA_HITS_MAX);
			atomic_long_set(&vma->swap_readahead_info,
					SWAP_RA_VAL(addr, win, hits));
		}
		if (readahead) {
			count_vm_event(SWAP_RA_HIT);
			if (!vma)
				atomic_inc(&swapin_readahead_hits);
		}
	}
	return page;
}
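
/*
 * __read_swap_cache_async() looks up @entry in the swap cache and returns
 * the page if it is already there.  Otherwise it allocates a new page,
 * claims the swap slot with swapcache_prepare() (setting SWAP_HAS_CACHE),
 * adds the page to the swap cache and the LRU, sets *new_page_allocated
 * and returns the page locked; the caller is then expected to start the
 * actual I/O.  A NULL return means the entry is no longer in use or the
 * page allocation failed.
 */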

struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
			struct vm_area_struct *vma, unsigned long addr,
			bool *new_page_allocated)
{
	struct page *found_page, *new_page = NULL;
	struct address_space *swapper_space = swap_address_space(entry);
	int err;
	*new_page_allocated = false;

	do {
		/*
		 * First check the swap cache.  Since this is normally
		 * called after lookup_swap_cache() failed, re-calling
		 * that would confuse statistics.
		 */
		found_page = find_get_page(swapper_space, swp_offset(entry));
		if (found_page)
			break;

		/*
		 * Just skip read ahead for an unused swap slot.
		 * During swap_off, when swap_slot_cache is disabled,
		 * we have to handle the race between putting the
		 * swap entry in the swap cache and marking the swap slot
		 * as SWAP_HAS_CACHE.  That's done in a later part of the
		 * code or else swap_off will be aborted if we return NULL.
		 */
		if (!__swp_swapcount(entry) && swap_slot_cache_enabled)
			break;

		/*
		 * Get a new page to read into from swap.
		 */
		if (!new_page) {
			new_page = alloc_page_vma(gfp_mask, vma, addr);
			if (!new_page)
				break;		/* Out of memory */
		}

		/*
		 * call radix_tree_preload() while we can wait.
		 */
		err = radix_tree_maybe_preload(gfp_mask & GFP_KERNEL);
		if (err)
			break;

		/*
		 * Swap entry may have been freed since our caller observed it.
		 */
		err = swapcache_prepare(entry);
		if (err == -EEXIST) {
			radix_tree_preload_end();
			/*
			 * We might race against get_swap_page() and stumble
			 * across a SWAP_HAS_CACHE swap_map entry whose page
			 * has not been brought into the swapcache yet.
			 */
			cond_resched();
			continue;
		}
		if (err) {		/* swp entry is obsolete ? */
			radix_tree_preload_end();
			break;
		}

		/* May fail (-ENOMEM) if radix-tree node allocation failed. */
		__SetPageLocked(new_page);
		__SetPageSwapBacked(new_page);
		err = __add_to_swap_cache(new_page, entry);
		if (likely(!err)) {
			radix_tree_preload_end();
			/*
			 * Initiate read into locked page and return.
			 */
			lru_cache_add_anon(new_page);
			*new_page_allocated = true;
			return new_page;
		}
		radix_tree_preload_end();
		__ClearPageLocked(new_page);
		/*
		 * add_to_swap_cache() doesn't return -EEXIST, so we can safely
		 * clear the SWAP_HAS_CACHE flag.
		 */
		put_swap_page(new_page, entry);
	} while (err != -ENOMEM);

	if (new_page)
		put_page(new_page);
	return found_page;
}

/*
 * Locate a page of swap in physical memory, reserving swap cache space
 * and reading the disk if it is not already cached.
 * A failure return means that either the page allocation failed or that
 * the swap entry is no longer in use.
 */
struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
			struct vm_area_struct *vma, unsigned long addr,
			bool do_poll)
{
	bool page_was_allocated;
	struct page *retpage = __read_swap_cache_async(entry, gfp_mask,
			vma, addr, &page_was_allocated);

	if (page_was_allocated)
		swap_readpage(retpage, do_poll);

	return retpage;
}

static unsigned int __swapin_nr_pages(unsigned long prev_offset,
				      unsigned long offset,
				      int hits,
				      int max_pages,
				      int prev_win)
{
	unsigned int pages, last_ra;

	/*
	 * This heuristic has been found to work well on both sequential and
	 * random loads, swapping to hard disk or to SSD: please don't ask
	 * what the "+ 2" means, it just happens to work well, that's all.
	 */
	pages = hits + 2;
	if (pages == 2) {
		/*
		 * We can have no readahead hits to judge by: but must not get
		 * stuck here forever, so check for an adjacent offset instead
		 * (and don't even bother to check whether swap type is same).
		 */
		if (offset != prev_offset + 1 && offset != prev_offset - 1)
			pages = 1;
	} else {
		unsigned int roundup = 4;
		while (roundup < pages)
			roundup <<= 1;
		pages = roundup;
	}

	if (pages > max_pages)
		pages = max_pages;

	/* Don't shrink readahead too fast */
	last_ra = prev_win / 2;
	if (pages < last_ra)
		pages = last_ra;

	return pages;
}
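
/*
 * Worked example of the heuristic above: after 5 readahead hits the window
 * is hits + 2 = 7, rounded up to the next power of two, i.e. 8 pages, and
 * then capped at max_pages.  With no hits at all the window collapses to a
 * single page unless the fault is adjacent to the previous one, and it
 * never drops below half of the previous window.
 */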

static unsigned long swapin_nr_pages(unsigned long offset)
{
	static unsigned long prev_offset;
	unsigned int hits, pages, max_pages;
	static atomic_t last_readahead_pages;

	max_pages = 1 << READ_ONCE(page_cluster);
	if (max_pages <= 1)
		return 1;

	hits = atomic_xchg(&swapin_readahead_hits, 0);
	pages = __swapin_nr_pages(prev_offset, offset, hits, max_pages,
				  atomic_read(&last_readahead_pages));
	if (!hits)
		prev_offset = offset;
	atomic_set(&last_readahead_pages, pages);

	return pages;
}

/**
 * swapin_readahead - swap in pages in hope we need them soon
 * @entry: swap entry of this memory
 * @gfp_mask: memory allocation flags
 * @vma: user vma this address belongs to
 * @addr: target address for mempolicy
 *
 * Returns the struct page for entry and addr, after queueing swapin.
 *
 * Primitive swap readahead code.  We simply read an aligned block of
 * (1 << page_cluster) entries in the swap area.  This method is chosen
 * because it doesn't cost us any seek time.  We also make sure to queue
 * the 'original' request together with the readahead ones...
 *
 * This has been extended to use the NUMA policies from the mm triggering
 * the readahead.
 *
 * Caller must hold down_read on the mmap_sem of vma->vm_mm if vma is not
 * NULL.
 */
struct page *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask,
			struct vm_area_struct *vma, unsigned long addr)
{
	struct page *page;
	unsigned long entry_offset = swp_offset(entry);
	unsigned long offset = entry_offset;
	unsigned long start_offset, end_offset;
	unsigned long mask;
	struct swap_info_struct *si = swp_swap_info(entry);
	struct blk_plug plug;
	bool do_poll = true, page_allocated;

	mask = swapin_nr_pages(offset) - 1;
	if (!mask)
		goto skip;

	do_poll = false;
	/* Read a page_cluster sized and aligned cluster around offset. */
	start_offset = offset & ~mask;
	end_offset = offset | mask;
	if (!start_offset)	/* First page is swap header. */
		start_offset++;
	if (end_offset >= si->max)
		end_offset = si->max - 1;

	blk_start_plug(&plug);
	for (offset = start_offset; offset <= end_offset ; offset++) {
		/* Ok, do the async read-ahead now */
		page = __read_swap_cache_async(
			swp_entry(swp_type(entry), offset),
			gfp_mask, vma, addr, &page_allocated);
		if (!page)
			continue;
		if (page_allocated) {
			swap_readpage(page, false);
			if (offset != entry_offset &&
			    likely(!PageTransCompound(page))) {
				SetPageReadahead(page);
				count_vm_event(SWAP_RA);
			}
		}
		put_page(page);
	}
	blk_finish_plug(&plug);

	lru_add_drain();	/* Push any new pages onto the LRU now */
skip:
	return read_swap_cache_async(entry, gfp_mask, vma, addr, do_poll);
}
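
/*
 * For example, with a window of 8 pages, swapin_readahead() above turns a
 * fault on swap offset 0x123 into reads of the aligned cluster of slots
 * 0x120 through 0x127 (skipping slot 0, which holds the swap header, and
 * clamping at the end of the device), so the faulting entry is always
 * queued together with its neighbours.
 */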

int init_swap_address_space(unsigned int type, unsigned long nr_pages)
{
	struct address_space *spaces, *space;
	unsigned int i, nr;

	nr = DIV_ROUND_UP(nr_pages, SWAP_ADDRESS_SPACE_PAGES);
	spaces = kvzalloc(sizeof(struct address_space) * nr, GFP_KERNEL);
	if (!spaces)
		return -ENOMEM;
	for (i = 0; i < nr; i++) {
		space = spaces + i;
		INIT_RADIX_TREE(&space->page_tree, GFP_ATOMIC|__GFP_NOWARN);
		atomic_set(&space->i_mmap_writable, 0);
		space->a_ops = &swap_aops;
		/* swap cache doesn't use writeback related tags */
		mapping_set_no_writeback_tags(space);
		spin_lock_init(&space->tree_lock);
	}
	nr_swapper_spaces[type] = nr;
	rcu_assign_pointer(swapper_spaces[type], spaces);

	return 0;
}

void exit_swap_address_space(unsigned int type)
{
	struct address_space *spaces;

	spaces = swapper_spaces[type];
	nr_swapper_spaces[type] = 0;
	rcu_assign_pointer(swapper_spaces[type], NULL);
	synchronize_rcu();
	kvfree(spaces);
}

static inline void swap_ra_clamp_pfn(struct vm_area_struct *vma,
				     unsigned long faddr,
				     unsigned long lpfn,
				     unsigned long rpfn,
				     unsigned long *start,
				     unsigned long *end)
{
	*start = max3(lpfn, PFN_DOWN(vma->vm_start),
		      PFN_DOWN(faddr & PMD_MASK));
	*end = min3(rpfn, PFN_DOWN(vma->vm_end),
		    PFN_DOWN((faddr & PMD_MASK) + PMD_SIZE));
}

struct page *swap_readahead_detect(struct vm_fault *vmf,
				   struct vma_swap_readahead *swap_ra)
{
	struct vm_area_struct *vma = vmf->vma;
	unsigned long swap_ra_info;
	struct page *page;
	swp_entry_t entry;
	unsigned long faddr, pfn, fpfn;
	unsigned long start, end;
	pte_t *pte;
	unsigned int max_win, hits, prev_win, win, left;
#ifndef CONFIG_64BIT
	pte_t *tpte;
#endif

	max_win = 1 << min_t(unsigned int, READ_ONCE(page_cluster),
			     SWAP_RA_ORDER_CEILING);
	if (max_win == 1) {
		swap_ra->win = 1;
		return NULL;
	}

	faddr = vmf->address;
	entry = pte_to_swp_entry(vmf->orig_pte);
	if (unlikely(non_swap_entry(entry)))
		return NULL;
	page = lookup_swap_cache(entry, vma, faddr);
	if (page)
		return page;

	fpfn = PFN_DOWN(faddr);
	swap_ra_info = GET_SWAP_RA_VAL(vma);
	pfn = PFN_DOWN(SWAP_RA_ADDR(swap_ra_info));
	prev_win = SWAP_RA_WIN(swap_ra_info);
	hits = SWAP_RA_HITS(swap_ra_info);
	swap_ra->win = win = __swapin_nr_pages(pfn, fpfn, hits,
					       max_win, prev_win);
	atomic_long_set(&vma->swap_readahead_info,
			SWAP_RA_VAL(faddr, win, 0));

	if (win == 1)
		return NULL;

	/* Copy the PTEs because the page table may be unmapped */
	if (fpfn == pfn + 1)
		swap_ra_clamp_pfn(vma, faddr, fpfn, fpfn + win, &start, &end);
	else if (pfn == fpfn + 1)
		swap_ra_clamp_pfn(vma, faddr, fpfn - win + 1, fpfn + 1,
				  &start, &end);
	else {
		left = (win - 1) / 2;
		swap_ra_clamp_pfn(vma, faddr, fpfn - left, fpfn + win - left,
				  &start, &end);
	}
	swap_ra->nr_pte = end - start;
	swap_ra->offset = fpfn - start;
	pte = vmf->pte - swap_ra->offset;
#ifdef CONFIG_64BIT
	swap_ra->ptes = pte;
#else
	tpte = swap_ra->ptes;
	for (pfn = start; pfn != end; pfn++)
		*tpte++ = *pte++;
#endif

	return NULL;
}

struct page *do_swap_page_readahead(swp_entry_t fentry, gfp_t gfp_mask,
				    struct vm_fault *vmf,
				    struct vma_swap_readahead *swap_ra)
{
	struct blk_plug plug;
	struct vm_area_struct *vma = vmf->vma;
	struct page *page;
	pte_t *pte, pentry;
	swp_entry_t entry;
	unsigned int i;
	bool page_allocated;

	if (swap_ra->win == 1)
		goto skip;

	blk_start_plug(&plug);
	for (i = 0, pte = swap_ra->ptes; i < swap_ra->nr_pte;
	     i++, pte++) {
		pentry = *pte;
		if (pte_none(pentry))
			continue;
		if (pte_present(pentry))
			continue;
		entry = pte_to_swp_entry(pentry);
		if (unlikely(non_swap_entry(entry)))
			continue;
		page = __read_swap_cache_async(entry, gfp_mask, vma,
					       vmf->address, &page_allocated);
		if (!page)
			continue;
		if (page_allocated) {
			swap_readpage(page, false);
			if (i != swap_ra->offset &&
			    likely(!PageTransCompound(page))) {
				SetPageReadahead(page);
				count_vm_event(SWAP_RA);
			}
		}
		put_page(page);
	}
	blk_finish_plug(&plug);
	lru_add_drain();
skip:
	return read_swap_cache_async(fentry, gfp_mask, vma, vmf->address,
				     swap_ra->win == 1);
}
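
/*
 * VMA based swap readahead can be toggled at run time through the sysfs
 * knob created below, /sys/kernel/mm/swap/vma_ra_enabled (mm_kobj is the
 * "mm" kobject under /sys/kernel), which simply flips swap_vma_readahead.
 * For example:  echo false > /sys/kernel/mm/swap/vma_ra_enabled
 */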
"true" : "false"); 772 } 773 static ssize_t vma_ra_enabled_store(struct kobject *kobj, 774 struct kobj_attribute *attr, 775 const char *buf, size_t count) 776 { 777 if (!strncmp(buf, "true", 4) || !strncmp(buf, "1", 1)) 778 swap_vma_readahead = true; 779 else if (!strncmp(buf, "false", 5) || !strncmp(buf, "0", 1)) 780 swap_vma_readahead = false; 781 else 782 return -EINVAL; 783 784 return count; 785 } 786 static struct kobj_attribute vma_ra_enabled_attr = 787 __ATTR(vma_ra_enabled, 0644, vma_ra_enabled_show, 788 vma_ra_enabled_store); 789 790 static struct attribute *swap_attrs[] = { 791 &vma_ra_enabled_attr.attr, 792 NULL, 793 }; 794 795 static struct attribute_group swap_attr_group = { 796 .attrs = swap_attrs, 797 }; 798 799 static int __init swap_init_sysfs(void) 800 { 801 int err; 802 struct kobject *swap_kobj; 803 804 swap_kobj = kobject_create_and_add("swap", mm_kobj); 805 if (!swap_kobj) { 806 pr_err("failed to create swap kobject\n"); 807 return -ENOMEM; 808 } 809 err = sysfs_create_group(swap_kobj, &swap_attr_group); 810 if (err) { 811 pr_err("failed to register swap group\n"); 812 goto delete_obj; 813 } 814 return 0; 815 816 delete_obj: 817 kobject_put(swap_kobj); 818 return err; 819 } 820 subsys_initcall(swap_init_sysfs); 821 #endif 822