/*
 *  linux/mm/swap_state.c
 *
 *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
 *  Swap reorganised 29.12.95, Stephen Tweedie
 *
 *  Rewritten to use page cache, (C) 1998 Stephen Tweedie
 */
#include <linux/mm.h>
#include <linux/gfp.h>
#include <linux/kernel_stat.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/init.h>
#include <linux/pagemap.h>
#include <linux/backing-dev.h>
#include <linux/blkdev.h>
#include <linux/pagevec.h>
#include <linux/migrate.h>
#include <linux/vmalloc.h>
#include <linux/swap_slots.h>
#include <linux/huge_mm.h>

#include <asm/pgtable.h>

/*
 * swapper_space is a fiction, retained to simplify the path through
 * vmscan's shrink_page_list.
 */
static const struct address_space_operations swap_aops = {
	.writepage	= swap_writepage,
	.set_page_dirty	= swap_set_page_dirty,
#ifdef CONFIG_MIGRATION
	.migratepage	= migrate_page,
#endif
};

struct address_space *swapper_spaces[MAX_SWAPFILES];
static unsigned int nr_swapper_spaces[MAX_SWAPFILES];
bool swap_vma_readahead = true;

#define SWAP_RA_MAX_ORDER_DEFAULT	3

static int swap_ra_max_order = SWAP_RA_MAX_ORDER_DEFAULT;

#define SWAP_RA_WIN_SHIFT	(PAGE_SHIFT / 2)
#define SWAP_RA_HITS_MASK	((1UL << SWAP_RA_WIN_SHIFT) - 1)
#define SWAP_RA_HITS_MAX	SWAP_RA_HITS_MASK
#define SWAP_RA_WIN_MASK	(~PAGE_MASK & ~SWAP_RA_HITS_MASK)

#define SWAP_RA_HITS(v)		((v) & SWAP_RA_HITS_MASK)
#define SWAP_RA_WIN(v)		(((v) & SWAP_RA_WIN_MASK) >> SWAP_RA_WIN_SHIFT)
#define SWAP_RA_ADDR(v)		((v) & PAGE_MASK)

#define SWAP_RA_VAL(addr, win, hits)				\
	(((addr) & PAGE_MASK) |					\
	 (((win) << SWAP_RA_WIN_SHIFT) & SWAP_RA_WIN_MASK) |	\
	 ((hits) & SWAP_RA_HITS_MASK))

/* Initial readahead hits is 4 to start up with a small window */
#define GET_SWAP_RA_VAL(vma)					\
	(atomic_long_read(&(vma)->swap_readahead_info) ? : 4)
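/*
 * Illustrative layout (assuming PAGE_SHIFT == 12, so SWAP_RA_WIN_SHIFT
 * is 6): bits 0-5 of vma->swap_readahead_info hold the readahead hit
 * count, bits 6-11 hold the last readahead window size, and the remaining
 * high bits hold the page-aligned faulting address.  For example,
 * SWAP_RA_VAL(0x7f0001234000, 4, 2) packs to 0x7f0001234000 | (4 << 6) | 2,
 * and SWAP_RA_ADDR()/SWAP_RA_WIN()/SWAP_RA_HITS() recover the three fields
 * from that single long.
 */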
#define INC_CACHE_INFO(x)	do { swap_cache_info.x++; } while (0)
#define ADD_CACHE_INFO(x, nr)	do { swap_cache_info.x += (nr); } while (0)

static struct {
	unsigned long add_total;
	unsigned long del_total;
	unsigned long find_success;
	unsigned long find_total;
} swap_cache_info;

unsigned long total_swapcache_pages(void)
{
	unsigned int i, j, nr;
	unsigned long ret = 0;
	struct address_space *spaces;

	rcu_read_lock();
	for (i = 0; i < MAX_SWAPFILES; i++) {
		/*
		 * The corresponding entries in nr_swapper_spaces and
		 * swapper_spaces will be reused only after at least
		 * one grace period.  So the two values read here cannot
		 * belong to different usages.
		 */
		nr = nr_swapper_spaces[i];
		spaces = rcu_dereference(swapper_spaces[i]);
		if (!nr || !spaces)
			continue;
		for (j = 0; j < nr; j++)
			ret += spaces[j].nrpages;
	}
	rcu_read_unlock();
	return ret;
}

static atomic_t swapin_readahead_hits = ATOMIC_INIT(4);

void show_swap_cache_info(void)
{
	printk("%lu pages in swap cache\n", total_swapcache_pages());
	printk("Swap cache stats: add %lu, delete %lu, find %lu/%lu\n",
		swap_cache_info.add_total, swap_cache_info.del_total,
		swap_cache_info.find_success, swap_cache_info.find_total);
	printk("Free swap  = %ldkB\n",
		get_nr_swap_pages() << (PAGE_SHIFT - 10));
	printk("Total swap = %lukB\n", total_swap_pages << (PAGE_SHIFT - 10));
}

/*
 * __add_to_swap_cache resembles add_to_page_cache_locked on swapper_space,
 * but sets SwapCache flag and private instead of mapping and index.
 */
int __add_to_swap_cache(struct page *page, swp_entry_t entry)
{
	int error, i, nr = hpage_nr_pages(page);
	struct address_space *address_space;
	pgoff_t idx = swp_offset(entry);

	VM_BUG_ON_PAGE(!PageLocked(page), page);
	VM_BUG_ON_PAGE(PageSwapCache(page), page);
	VM_BUG_ON_PAGE(!PageSwapBacked(page), page);

	page_ref_add(page, nr);
	SetPageSwapCache(page);

	address_space = swap_address_space(entry);
	spin_lock_irq(&address_space->tree_lock);
	for (i = 0; i < nr; i++) {
		set_page_private(page + i, entry.val + i);
		error = radix_tree_insert(&address_space->page_tree,
					  idx + i, page + i);
		if (unlikely(error))
			break;
	}
	if (likely(!error)) {
		address_space->nrpages += nr;
		__mod_node_page_state(page_pgdat(page), NR_FILE_PAGES, nr);
		ADD_CACHE_INFO(add_total, nr);
	} else {
		/*
		 * Only the context which has set the SWAP_HAS_CACHE flag
		 * would call add_to_swap_cache(), so add_to_swap_cache()
		 * doesn't return -EEXIST.
		 */
		VM_BUG_ON(error == -EEXIST);
		set_page_private(page + i, 0UL);
		while (i--) {
			radix_tree_delete(&address_space->page_tree, idx + i);
			set_page_private(page + i, 0UL);
		}
		ClearPageSwapCache(page);
		page_ref_sub(page, nr);
	}
	spin_unlock_irq(&address_space->tree_lock);

	return error;
}


int add_to_swap_cache(struct page *page, swp_entry_t entry, gfp_t gfp_mask)
{
	int error;

	error = radix_tree_maybe_preload_order(gfp_mask, compound_order(page));
	if (!error) {
		error = __add_to_swap_cache(page, entry);
		radix_tree_preload_end();
	}
	return error;
}
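/*
 * Note on the split between add_to_swap_cache() and __add_to_swap_cache():
 * radix tree nodes are preloaded with the caller's gfp_mask while it is
 * still legal to sleep, so the insertions done under the IRQ-disabled
 * tree_lock in __add_to_swap_cache() do not have to allocate memory
 * themselves.  For a THP, all nr subpages are inserted as nr consecutive
 * radix tree slots under a single acquisition of the lock.
 */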
/*
 * This must be called only on pages that have
 * been verified to be in the swap cache.
 */
void __delete_from_swap_cache(struct page *page)
{
	struct address_space *address_space;
	int i, nr = hpage_nr_pages(page);
	swp_entry_t entry;
	pgoff_t idx;

	VM_BUG_ON_PAGE(!PageLocked(page), page);
	VM_BUG_ON_PAGE(!PageSwapCache(page), page);
	VM_BUG_ON_PAGE(PageWriteback(page), page);

	entry.val = page_private(page);
	address_space = swap_address_space(entry);
	idx = swp_offset(entry);
	for (i = 0; i < nr; i++) {
		radix_tree_delete(&address_space->page_tree, idx + i);
		set_page_private(page + i, 0);
	}
	ClearPageSwapCache(page);
	address_space->nrpages -= nr;
	__mod_node_page_state(page_pgdat(page), NR_FILE_PAGES, -nr);
	ADD_CACHE_INFO(del_total, nr);
}

/**
 * add_to_swap - allocate swap space for a page
 * @page: page we want to move to swap
 *
 * Allocate swap space for the page and add the page to the
 * swap cache.  Caller needs to hold the page lock.
 */
int add_to_swap(struct page *page)
{
	swp_entry_t entry;
	int err;

	VM_BUG_ON_PAGE(!PageLocked(page), page);
	VM_BUG_ON_PAGE(!PageUptodate(page), page);

	entry = get_swap_page(page);
	if (!entry.val)
		return 0;

	if (mem_cgroup_try_charge_swap(page, entry))
		goto fail;

	/*
	 * Radix-tree node allocations from PF_MEMALLOC contexts could
	 * completely exhaust the page allocator.  __GFP_NOMEMALLOC
	 * stops emergency reserves from being allocated.
	 *
	 * TODO: this could cause a theoretical memory reclaim
	 * deadlock in the swap out path.
	 */
	/*
	 * Add it to the swap cache.
	 */
	err = add_to_swap_cache(page, entry,
			__GFP_HIGH|__GFP_NOMEMALLOC|__GFP_NOWARN);
	/* -ENOMEM radix-tree allocation failure */
	if (err)
		/*
		 * add_to_swap_cache() doesn't return -EEXIST, so we can safely
		 * clear SWAP_HAS_CACHE flag.
		 */
		goto fail;
	/*
	 * Normally the page will be dirtied in unmap because its pte should be
	 * dirty.  A special case is an MADV_FREE page.  Such a page's pte can
	 * have its dirty bit cleared while the page's SwapBacked bit is still
	 * set, because the dirty bit and the SwapBacked bit are not cleared
	 * under a common lock.  For such a page, unmap will not set the dirty
	 * bit, so page reclaim will not write the page out.  This can cause
	 * data corruption when the page is swapped in later.  Always setting
	 * the dirty bit for the page solves the problem.
	 */
	set_page_dirty(page);

	return 1;

fail:
	put_swap_page(page, entry);
	return 0;
}

/*
 * This must be called only on pages that have
 * been verified to be in the swap cache and locked.
 * It will never put the page into the free list,
 * the caller has a reference on the page.
 */
void delete_from_swap_cache(struct page *page)
{
	swp_entry_t entry;
	struct address_space *address_space;

	entry.val = page_private(page);

	address_space = swap_address_space(entry);
	spin_lock_irq(&address_space->tree_lock);
	__delete_from_swap_cache(page);
	spin_unlock_irq(&address_space->tree_lock);

	put_swap_page(page, entry);
	page_ref_sub(page, hpage_nr_pages(page));
}
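/*
 * Reference counting convention for the functions above, for reference:
 * __add_to_swap_cache() takes one page reference per subpage and sets
 * PG_swapcache, __delete_from_swap_cache() removes the radix tree entries
 * and clears the flag, and delete_from_swap_cache() additionally drops
 * those references.  So, while a page sits in the swap cache, the cache
 * holds hpage_nr_pages() references to it on top of whatever references
 * the callers hold.
 */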
/*
 * If we are the only user, then try to free up the swap cache.
 *
 * It's ok to check for PageSwapCache without the page lock
 * here because we are going to recheck again inside
 * try_to_free_swap() _with_ the lock.
 *					- Marcelo
 */
static inline void free_swap_cache(struct page *page)
{
	if (PageSwapCache(page) && !page_mapped(page) && trylock_page(page)) {
		try_to_free_swap(page);
		unlock_page(page);
	}
}

/*
 * Perform a free_page(), also freeing any swap cache associated with
 * this page if it is the last user of the page.
 */
void free_page_and_swap_cache(struct page *page)
{
	free_swap_cache(page);
	if (!is_huge_zero_page(page))
		put_page(page);
}

/*
 * Passed an array of pages, drop them all from swapcache and then release
 * them.  They are removed from the LRU and freed if this is their last use.
 */
void free_pages_and_swap_cache(struct page **pages, int nr)
{
	struct page **pagep = pages;
	int i;

	lru_add_drain();
	for (i = 0; i < nr; i++)
		free_swap_cache(pagep[i]);
	release_pages(pagep, nr, false);
}

/*
 * Lookup a swap entry in the swap cache. A found page will be returned
 * unlocked and with its refcount incremented - we rely on the kernel
 * lock getting page table operations atomic even if we drop the page
 * lock before returning.
 */
struct page *lookup_swap_cache(swp_entry_t entry, struct vm_area_struct *vma,
			       unsigned long addr)
{
	struct page *page;
	unsigned long ra_info;
	int win, hits, readahead;

	page = find_get_page(swap_address_space(entry), swp_offset(entry));

	INC_CACHE_INFO(find_total);
	if (page) {
		INC_CACHE_INFO(find_success);
		if (unlikely(PageTransCompound(page)))
			return page;
		readahead = TestClearPageReadahead(page);
		if (vma) {
			ra_info = GET_SWAP_RA_VAL(vma);
			win = SWAP_RA_WIN(ra_info);
			hits = SWAP_RA_HITS(ra_info);
			if (readahead)
				hits = min_t(int, hits + 1, SWAP_RA_HITS_MAX);
			atomic_long_set(&vma->swap_readahead_info,
					SWAP_RA_VAL(addr, win, hits));
		}
		if (readahead) {
			count_vm_event(SWAP_RA_HIT);
			if (!vma)
				atomic_inc(&swapin_readahead_hits);
		}
	}
	return page;
}
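/*
 * Readahead hit accounting, in brief: pages brought in speculatively are
 * tagged with PG_readahead, and when lookup_swap_cache() finds such a page
 * it bumps either the per-VMA hit count in swap_readahead_info (VMA-based
 * readahead) or the global swapin_readahead_hits counter (cluster-based
 * readahead).  Those hit counts are what __swapin_nr_pages() below uses to
 * grow or shrink the next readahead window.
 */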
struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
			struct vm_area_struct *vma, unsigned long addr,
			bool *new_page_allocated)
{
	struct page *found_page, *new_page = NULL;
	struct address_space *swapper_space = swap_address_space(entry);
	int err;
	*new_page_allocated = false;

	do {
		/*
		 * First check the swap cache.  Since this is normally
		 * called after lookup_swap_cache() failed, re-calling
		 * that would confuse statistics.
		 */
		found_page = find_get_page(swapper_space, swp_offset(entry));
		if (found_page)
			break;

		/*
		 * Just skip readahead for an unused swap slot.
		 * During swapoff, when swap_slot_cache is disabled,
		 * we have to handle the race between putting the
		 * swap entry in the swap cache and marking the swap slot
		 * as SWAP_HAS_CACHE.  That's done in the later part of this
		 * function, or else swapoff would be aborted if we returned
		 * NULL here.
		 */
		if (!__swp_swapcount(entry) && swap_slot_cache_enabled)
			break;

		/*
		 * Get a new page to read into from swap.
		 */
		if (!new_page) {
			new_page = alloc_page_vma(gfp_mask, vma, addr);
			if (!new_page)
				break;		/* Out of memory */
		}

		/*
		 * call radix_tree_preload() while we can wait.
		 */
		err = radix_tree_maybe_preload(gfp_mask & GFP_KERNEL);
		if (err)
			break;

		/*
		 * Swap entry may have been freed since our caller observed it.
		 */
		err = swapcache_prepare(entry);
		if (err == -EEXIST) {
			radix_tree_preload_end();
			/*
			 * We might race against get_swap_page() and stumble
			 * across a SWAP_HAS_CACHE swap_map entry whose page
			 * has not been brought into the swapcache yet.
			 */
			cond_resched();
			continue;
		}
		if (err) {		/* swp entry is obsolete ? */
			radix_tree_preload_end();
			break;
		}

		/* May fail (-ENOMEM) if radix-tree node allocation failed. */
		__SetPageLocked(new_page);
		__SetPageSwapBacked(new_page);
		err = __add_to_swap_cache(new_page, entry);
		if (likely(!err)) {
			radix_tree_preload_end();
			/*
			 * Initiate read into locked page and return.
			 */
			lru_cache_add_anon(new_page);
			*new_page_allocated = true;
			return new_page;
		}
		radix_tree_preload_end();
		__ClearPageLocked(new_page);
		/*
		 * add_to_swap_cache() doesn't return -EEXIST, so we can safely
		 * clear SWAP_HAS_CACHE flag.
		 */
		put_swap_page(new_page, entry);
	} while (err != -ENOMEM);

	if (new_page)
		put_page(new_page);
	return found_page;
}

/*
 * Locate a page of swap in physical memory, reserving swap cache space
 * and reading the disk if it is not already cached.
 * A failure return means that either the page allocation failed or that
 * the swap entry is no longer in use.
 */
struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
		struct vm_area_struct *vma, unsigned long addr, bool do_poll)
{
	bool page_was_allocated;
	struct page *retpage = __read_swap_cache_async(entry, gfp_mask,
			vma, addr, &page_was_allocated);

	if (page_was_allocated)
		swap_readpage(retpage, do_poll);

	return retpage;
}
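/*
 * Note that read_swap_cache_async() may return a page whose read is still
 * in flight: swap_readpage() only submits the I/O here.  Callers such as
 * do_swap_page() are expected to lock the page and check PageUptodate()
 * before using its contents.
 */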
static unsigned int __swapin_nr_pages(unsigned long prev_offset,
				      unsigned long offset,
				      int hits,
				      int max_pages,
				      int prev_win)
{
	unsigned int pages, last_ra;

	/*
	 * This heuristic has been found to work well on both sequential and
	 * random loads, swapping to hard disk or to SSD: please don't ask
	 * what the "+ 2" means, it just happens to work well, that's all.
	 */
	pages = hits + 2;
	if (pages == 2) {
		/*
		 * We can have no readahead hits to judge by: but must not get
		 * stuck here forever, so check for an adjacent offset instead
		 * (and don't even bother to check whether swap type is same).
		 */
		if (offset != prev_offset + 1 && offset != prev_offset - 1)
			pages = 1;
	} else {
		unsigned int roundup = 4;
		while (roundup < pages)
			roundup <<= 1;
		pages = roundup;
	}

	if (pages > max_pages)
		pages = max_pages;

	/* Don't shrink readahead too fast */
	last_ra = prev_win / 2;
	if (pages < last_ra)
		pages = last_ra;

	return pages;
}
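/*
 * Worked example for the heuristic above: with hits == 5, pages starts at
 * 7 and is rounded up to the next power of two, 8, then capped at
 * max_pages.  With no hits at all, pages drops to 1 unless the fault is
 * adjacent to the previous offset, in which case a window of 2 is kept;
 * either way the "don't shrink readahead too fast" clamp keeps the result
 * at no less than half of the previous window.
 */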
static unsigned long swapin_nr_pages(unsigned long offset)
{
	static unsigned long prev_offset;
	unsigned int hits, pages, max_pages;
	static atomic_t last_readahead_pages;

	max_pages = 1 << READ_ONCE(page_cluster);
	if (max_pages <= 1)
		return 1;

	hits = atomic_xchg(&swapin_readahead_hits, 0);
	pages = __swapin_nr_pages(prev_offset, offset, hits, max_pages,
				  atomic_read(&last_readahead_pages));
	if (!hits)
		prev_offset = offset;
	atomic_set(&last_readahead_pages, pages);

	return pages;
}

/**
 * swapin_readahead - swap in pages in hope we need them soon
 * @entry: swap entry of this memory
 * @gfp_mask: memory allocation flags
 * @vma: user vma this address belongs to
 * @addr: target address for mempolicy
 *
 * Returns the struct page for entry and addr, after queueing swapin.
 *
 * Primitive swap readahead code. We simply read an aligned block of
 * (1 << page_cluster) entries in the swap area. This method is chosen
 * because it doesn't cost us any seek time.  We also make sure to queue
 * the 'original' request together with the readahead ones...
 *
 * This has been extended to use the NUMA policies from the mm triggering
 * the readahead.
 *
 * Caller must hold down_read on the vma->vm_mm if vma is not NULL.
 */
struct page *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask,
			struct vm_area_struct *vma, unsigned long addr)
{
	struct page *page;
	unsigned long entry_offset = swp_offset(entry);
	unsigned long offset = entry_offset;
	unsigned long start_offset, end_offset;
	unsigned long mask;
	struct blk_plug plug;
	bool do_poll = true, page_allocated;

	mask = swapin_nr_pages(offset) - 1;
	if (!mask)
		goto skip;

	do_poll = false;
	/* Read a page_cluster sized and aligned cluster around offset. */
	start_offset = offset & ~mask;
	end_offset = offset | mask;
	if (!start_offset)	/* First page is swap header. */
		start_offset++;

	blk_start_plug(&plug);
	for (offset = start_offset; offset <= end_offset ; offset++) {
		/* Ok, do the async read-ahead now */
		page = __read_swap_cache_async(
			swp_entry(swp_type(entry), offset),
			gfp_mask, vma, addr, &page_allocated);
		if (!page)
			continue;
		if (page_allocated) {
			swap_readpage(page, false);
			if (offset != entry_offset &&
			    likely(!PageTransCompound(page))) {
				SetPageReadahead(page);
				count_vm_event(SWAP_RA);
			}
		}
		put_page(page);
	}
	blk_finish_plug(&plug);

	lru_add_drain();	/* Push any new pages onto the LRU now */
skip:
	return read_swap_cache_async(entry, gfp_mask, vma, addr, do_poll);
}

int init_swap_address_space(unsigned int type, unsigned long nr_pages)
{
	struct address_space *spaces, *space;
	unsigned int i, nr;

	nr = DIV_ROUND_UP(nr_pages, SWAP_ADDRESS_SPACE_PAGES);
	spaces = kvzalloc(sizeof(struct address_space) * nr, GFP_KERNEL);
	if (!spaces)
		return -ENOMEM;
	for (i = 0; i < nr; i++) {
		space = spaces + i;
		INIT_RADIX_TREE(&space->page_tree, GFP_ATOMIC|__GFP_NOWARN);
		atomic_set(&space->i_mmap_writable, 0);
		space->a_ops = &swap_aops;
		/* swap cache doesn't use writeback related tags */
		mapping_set_no_writeback_tags(space);
		spin_lock_init(&space->tree_lock);
	}
	nr_swapper_spaces[type] = nr;
	rcu_assign_pointer(swapper_spaces[type], spaces);

	return 0;
}

void exit_swap_address_space(unsigned int type)
{
	struct address_space *spaces;

	spaces = swapper_spaces[type];
	nr_swapper_spaces[type] = 0;
	rcu_assign_pointer(swapper_spaces[type], NULL);
	synchronize_rcu();
	kvfree(spaces);
}
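/*
 * The swap cache for one swap device is deliberately split into several
 * struct address_space instances, one per SWAP_ADDRESS_SPACE_PAGES worth
 * of swap slots; swap_address_space() picks the right one from the swap
 * offset.  Each chunk has its own page_tree and tree_lock, which spreads
 * tree_lock contention when many CPUs swap in and out concurrently.
 */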
static inline void swap_ra_clamp_pfn(struct vm_area_struct *vma,
				     unsigned long faddr,
				     unsigned long lpfn,
				     unsigned long rpfn,
				     unsigned long *start,
				     unsigned long *end)
{
	*start = max3(lpfn, PFN_DOWN(vma->vm_start),
		      PFN_DOWN(faddr & PMD_MASK));
	*end = min3(rpfn, PFN_DOWN(vma->vm_end),
		    PFN_DOWN((faddr & PMD_MASK) + PMD_SIZE));
}

struct page *swap_readahead_detect(struct vm_fault *vmf,
				   struct vma_swap_readahead *swap_ra)
{
	struct vm_area_struct *vma = vmf->vma;
	unsigned long swap_ra_info;
	struct page *page;
	swp_entry_t entry;
	unsigned long faddr, pfn, fpfn;
	unsigned long start, end;
	pte_t *pte;
	unsigned int max_win, hits, prev_win, win, left;
#ifndef CONFIG_64BIT
	pte_t *tpte;
#endif

	faddr = vmf->address;
	entry = pte_to_swp_entry(vmf->orig_pte);
	if ((unlikely(non_swap_entry(entry))))
		return NULL;
	page = lookup_swap_cache(entry, vma, faddr);
	if (page)
		return page;

	max_win = 1 << READ_ONCE(swap_ra_max_order);
	if (max_win == 1) {
		swap_ra->win = 1;
		return NULL;
	}

	fpfn = PFN_DOWN(faddr);
	swap_ra_info = GET_SWAP_RA_VAL(vma);
	pfn = PFN_DOWN(SWAP_RA_ADDR(swap_ra_info));
	prev_win = SWAP_RA_WIN(swap_ra_info);
	hits = SWAP_RA_HITS(swap_ra_info);
	swap_ra->win = win = __swapin_nr_pages(pfn, fpfn, hits,
					       max_win, prev_win);
	atomic_long_set(&vma->swap_readahead_info,
			SWAP_RA_VAL(faddr, win, 0));

	if (win == 1)
		return NULL;

	/* Copy the PTEs because the page table may be unmapped */
	if (fpfn == pfn + 1)
		swap_ra_clamp_pfn(vma, faddr, fpfn, fpfn + win, &start, &end);
	else if (pfn == fpfn + 1)
		swap_ra_clamp_pfn(vma, faddr, fpfn - win + 1, fpfn + 1,
				  &start, &end);
	else {
		left = (win - 1) / 2;
		swap_ra_clamp_pfn(vma, faddr, fpfn - left, fpfn + win - left,
				  &start, &end);
	}
	swap_ra->nr_pte = end - start;
	swap_ra->offset = fpfn - start;
	pte = vmf->pte - swap_ra->offset;
#ifdef CONFIG_64BIT
	swap_ra->ptes = pte;
#else
	tpte = swap_ra->ptes;
	for (pfn = start; pfn != end; pfn++)
		*tpte++ = *pte++;
#endif

	return NULL;
}

struct page *do_swap_page_readahead(swp_entry_t fentry, gfp_t gfp_mask,
				    struct vm_fault *vmf,
				    struct vma_swap_readahead *swap_ra)
{
	struct blk_plug plug;
	struct vm_area_struct *vma = vmf->vma;
	struct page *page;
	pte_t *pte, pentry;
	swp_entry_t entry;
	unsigned int i;
	bool page_allocated;

	if (swap_ra->win == 1)
		goto skip;

	blk_start_plug(&plug);
	for (i = 0, pte = swap_ra->ptes; i < swap_ra->nr_pte;
	     i++, pte++) {
		pentry = *pte;
		if (pte_none(pentry))
			continue;
		if (pte_present(pentry))
			continue;
		entry = pte_to_swp_entry(pentry);
		if (unlikely(non_swap_entry(entry)))
			continue;
		page = __read_swap_cache_async(entry, gfp_mask, vma,
					       vmf->address, &page_allocated);
		if (!page)
			continue;
		if (page_allocated) {
			swap_readpage(page, false);
			if (i != swap_ra->offset &&
			    likely(!PageTransCompound(page))) {
				SetPageReadahead(page);
				count_vm_event(SWAP_RA);
			}
		}
		put_page(page);
	}
	blk_finish_plug(&plug);
	lru_add_drain();
skip:
	return read_swap_cache_async(fentry, gfp_mask, vma, vmf->address,
				     swap_ra->win == 1);
}
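/*
 * The two sysfs attributes below are created under the "swap" kobject of
 * mm_kobj, i.e. /sys/kernel/mm/swap/vma_ra_enabled to switch between
 * VMA-based and cluster-based swap readahead, and
 * /sys/kernel/mm/swap/vma_ra_max_order to limit the VMA readahead window
 * to at most 1 << vma_ra_max_order pages.
 */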
"true" : "false"); 771 } 772 static ssize_t vma_ra_enabled_store(struct kobject *kobj, 773 struct kobj_attribute *attr, 774 const char *buf, size_t count) 775 { 776 if (!strncmp(buf, "true", 4) || !strncmp(buf, "1", 1)) 777 swap_vma_readahead = true; 778 else if (!strncmp(buf, "false", 5) || !strncmp(buf, "0", 1)) 779 swap_vma_readahead = false; 780 else 781 return -EINVAL; 782 783 return count; 784 } 785 static struct kobj_attribute vma_ra_enabled_attr = 786 __ATTR(vma_ra_enabled, 0644, vma_ra_enabled_show, 787 vma_ra_enabled_store); 788 789 static ssize_t vma_ra_max_order_show(struct kobject *kobj, 790 struct kobj_attribute *attr, char *buf) 791 { 792 return sprintf(buf, "%d\n", swap_ra_max_order); 793 } 794 static ssize_t vma_ra_max_order_store(struct kobject *kobj, 795 struct kobj_attribute *attr, 796 const char *buf, size_t count) 797 { 798 int err, v; 799 800 err = kstrtoint(buf, 10, &v); 801 if (err || v > SWAP_RA_ORDER_CEILING || v <= 0) 802 return -EINVAL; 803 804 swap_ra_max_order = v; 805 806 return count; 807 } 808 static struct kobj_attribute vma_ra_max_order_attr = 809 __ATTR(vma_ra_max_order, 0644, vma_ra_max_order_show, 810 vma_ra_max_order_store); 811 812 static struct attribute *swap_attrs[] = { 813 &vma_ra_enabled_attr.attr, 814 &vma_ra_max_order_attr.attr, 815 NULL, 816 }; 817 818 static struct attribute_group swap_attr_group = { 819 .attrs = swap_attrs, 820 }; 821 822 static int __init swap_init_sysfs(void) 823 { 824 int err; 825 struct kobject *swap_kobj; 826 827 swap_kobj = kobject_create_and_add("swap", mm_kobj); 828 if (!swap_kobj) { 829 pr_err("failed to create swap kobject\n"); 830 return -ENOMEM; 831 } 832 err = sysfs_create_group(swap_kobj, &swap_attr_group); 833 if (err) { 834 pr_err("failed to register swap group\n"); 835 goto delete_obj; 836 } 837 return 0; 838 839 delete_obj: 840 kobject_put(swap_kobj); 841 return err; 842 } 843 subsys_initcall(swap_init_sysfs); 844 #endif 845