/*
 *  linux/mm/swap_state.c
 *
 *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
 *  Swap reorganised 29.12.95, Stephen Tweedie
 *
 *  Rewritten to use page cache, (C) 1998 Stephen Tweedie
 */
#include <linux/mm.h>
#include <linux/gfp.h>
#include <linux/kernel_stat.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/init.h>
#include <linux/pagemap.h>
#include <linux/backing-dev.h>
#include <linux/blkdev.h>
#include <linux/pagevec.h>
#include <linux/migrate.h>
#include <linux/vmalloc.h>
#include <linux/swap_slots.h>
#include <linux/huge_mm.h>

#include <asm/pgtable.h>

/*
 * swapper_space is a fiction, retained to simplify the path through
 * vmscan's shrink_page_list.
 */
static const struct address_space_operations swap_aops = {
	.writepage	= swap_writepage,
	.set_page_dirty	= swap_set_page_dirty,
#ifdef CONFIG_MIGRATION
	.migratepage	= migrate_page,
#endif
};

struct address_space *swapper_spaces[MAX_SWAPFILES];
static unsigned int nr_swapper_spaces[MAX_SWAPFILES];
bool swap_vma_readahead = true;

#define SWAP_RA_MAX_ORDER_DEFAULT	3

static int swap_ra_max_order = SWAP_RA_MAX_ORDER_DEFAULT;
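
/*
 * Per-VMA swap readahead state is packed into a single unsigned long
 * (vma->swap_readahead_info) by the SWAP_RA_* macros below:
 *
 *	bits [PAGE_SHIFT, BITS_PER_LONG)	page aligned faulting address
 *	bits [PAGE_SHIFT/2, PAGE_SHIFT)		readahead window size (pages)
 *	bits [0, PAGE_SHIFT/2)			number of readahead hits
 *
 * With 4K pages, for example, that leaves 6 bits for the window and 6
 * bits for the hit count, which also bounds SWAP_RA_HITS_MAX at 63.
 */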
#define SWAP_RA_WIN_SHIFT	(PAGE_SHIFT / 2)
#define SWAP_RA_HITS_MASK	((1UL << SWAP_RA_WIN_SHIFT) - 1)
#define SWAP_RA_HITS_MAX	SWAP_RA_HITS_MASK
#define SWAP_RA_WIN_MASK	(~PAGE_MASK & ~SWAP_RA_HITS_MASK)

#define SWAP_RA_HITS(v)		((v) & SWAP_RA_HITS_MASK)
#define SWAP_RA_WIN(v)		(((v) & SWAP_RA_WIN_MASK) >> SWAP_RA_WIN_SHIFT)
#define SWAP_RA_ADDR(v)		((v) & PAGE_MASK)

#define SWAP_RA_VAL(addr, win, hits)				\
	(((addr) & PAGE_MASK) |					\
	 (((win) << SWAP_RA_WIN_SHIFT) & SWAP_RA_WIN_MASK) |	\
	 ((hits) & SWAP_RA_HITS_MASK))

/* Initial readahead hits is 4 to start up with a small window */
#define GET_SWAP_RA_VAL(vma)					\
	(atomic_long_read(&(vma)->swap_readahead_info) ? : 4)

#define INC_CACHE_INFO(x)	do { swap_cache_info.x++; } while (0)
#define ADD_CACHE_INFO(x, nr)	do { swap_cache_info.x += (nr); } while (0)

static struct {
	unsigned long add_total;
	unsigned long del_total;
	unsigned long find_success;
	unsigned long find_total;
} swap_cache_info;

unsigned long total_swapcache_pages(void)
{
	unsigned int i, j, nr;
	unsigned long ret = 0;
	struct address_space *spaces;

	rcu_read_lock();
	for (i = 0; i < MAX_SWAPFILES; i++) {
		/*
		 * The corresponding entries in nr_swapper_spaces and
		 * swapper_spaces will be reused only after at least
		 * one grace period.  So it is impossible for them
		 * to belong to different usage.
		 */
		nr = nr_swapper_spaces[i];
		spaces = rcu_dereference(swapper_spaces[i]);
		if (!nr || !spaces)
			continue;
		for (j = 0; j < nr; j++)
			ret += spaces[j].nrpages;
	}
	rcu_read_unlock();
	return ret;
}

static atomic_t swapin_readahead_hits = ATOMIC_INIT(4);

void show_swap_cache_info(void)
{
	printk("%lu pages in swap cache\n", total_swapcache_pages());
	printk("Swap cache stats: add %lu, delete %lu, find %lu/%lu\n",
		swap_cache_info.add_total, swap_cache_info.del_total,
		swap_cache_info.find_success, swap_cache_info.find_total);
	printk("Free swap  = %ldkB\n",
		get_nr_swap_pages() << (PAGE_SHIFT - 10));
	printk("Total swap = %lukB\n", total_swap_pages << (PAGE_SHIFT - 10));
}

/*
 * __add_to_swap_cache resembles add_to_page_cache_locked on swapper_space,
 * but sets SwapCache flag and private instead of mapping and index.
 */
int __add_to_swap_cache(struct page *page, swp_entry_t entry)
{
	int error, i, nr = hpage_nr_pages(page);
	struct address_space *address_space;
	pgoff_t idx = swp_offset(entry);

	VM_BUG_ON_PAGE(!PageLocked(page), page);
	VM_BUG_ON_PAGE(PageSwapCache(page), page);
	VM_BUG_ON_PAGE(!PageSwapBacked(page), page);

	page_ref_add(page, nr);
	SetPageSwapCache(page);

	address_space = swap_address_space(entry);
	spin_lock_irq(&address_space->tree_lock);
	for (i = 0; i < nr; i++) {
		set_page_private(page + i, entry.val + i);
		error = radix_tree_insert(&address_space->page_tree,
					  idx + i, page + i);
		if (unlikely(error))
			break;
	}
	if (likely(!error)) {
		address_space->nrpages += nr;
		__mod_node_page_state(page_pgdat(page), NR_FILE_PAGES, nr);
		ADD_CACHE_INFO(add_total, nr);
	} else {
		/*
		 * Only the context which has set SWAP_HAS_CACHE flag
		 * would call add_to_swap_cache().
		 * So add_to_swap_cache() doesn't return -EEXIST.
		 */
		VM_BUG_ON(error == -EEXIST);
		set_page_private(page + i, 0UL);
		while (i--) {
			radix_tree_delete(&address_space->page_tree, idx + i);
			set_page_private(page + i, 0UL);
		}
		ClearPageSwapCache(page);
		page_ref_sub(page, nr);
	}
	spin_unlock_irq(&address_space->tree_lock);

	return error;
}

int add_to_swap_cache(struct page *page, swp_entry_t entry, gfp_t gfp_mask)
{
	int error;

	error = radix_tree_maybe_preload_order(gfp_mask, compound_order(page));
	if (!error) {
		error = __add_to_swap_cache(page, entry);
		radix_tree_preload_end();
	}
	return error;
}

/*
 * This must be called only on pages that have
 * been verified to be in the swap cache.
 */
void __delete_from_swap_cache(struct page *page)
{
	struct address_space *address_space;
	int i, nr = hpage_nr_pages(page);
	swp_entry_t entry;
	pgoff_t idx;

	VM_BUG_ON_PAGE(!PageLocked(page), page);
	VM_BUG_ON_PAGE(!PageSwapCache(page), page);
	VM_BUG_ON_PAGE(PageWriteback(page), page);

	entry.val = page_private(page);
	address_space = swap_address_space(entry);
	idx = swp_offset(entry);
	for (i = 0; i < nr; i++) {
		radix_tree_delete(&address_space->page_tree, idx + i);
		set_page_private(page + i, 0);
	}
	ClearPageSwapCache(page);
	address_space->nrpages -= nr;
	__mod_node_page_state(page_pgdat(page), NR_FILE_PAGES, -nr);
	ADD_CACHE_INFO(del_total, nr);
}

/**
 * add_to_swap - allocate swap space for a page
 * @page: page we want to move to swap
 *
 * Allocate swap space for the page and add the page to the
 * swap cache.  Caller needs to hold the page lock.
 */
int add_to_swap(struct page *page)
{
	swp_entry_t entry;
	int err;

	VM_BUG_ON_PAGE(!PageLocked(page), page);
	VM_BUG_ON_PAGE(!PageUptodate(page), page);

	entry = get_swap_page(page);
	if (!entry.val)
		return 0;

	if (mem_cgroup_try_charge_swap(page, entry))
		goto fail;

	/*
	 * Radix-tree node allocations from PF_MEMALLOC contexts could
	 * completely exhaust the page allocator.  __GFP_NOMEMALLOC
	 * stops emergency reserves from being allocated.
	 *
	 * TODO: this could cause a theoretical memory reclaim
	 * deadlock in the swap out path.
	 */
	/*
	 * Add it to the swap cache.
	 */
	err = add_to_swap_cache(page, entry,
			__GFP_HIGH|__GFP_NOMEMALLOC|__GFP_NOWARN);
	/* -ENOMEM radix-tree allocation failure */
	if (err)
		/*
		 * add_to_swap_cache() doesn't return -EEXIST, so we can safely
		 * clear SWAP_HAS_CACHE flag.
		 */
		goto fail;

	return 1;

fail:
	put_swap_page(page, entry);
	return 0;
}

/*
 * This must be called only on pages that have
 * been verified to be in the swap cache and locked.
 * It will never put the page into the free list,
 * the caller has a reference on the page.
 */
void delete_from_swap_cache(struct page *page)
{
	swp_entry_t entry;
	struct address_space *address_space;

	entry.val = page_private(page);

	address_space = swap_address_space(entry);
	spin_lock_irq(&address_space->tree_lock);
	__delete_from_swap_cache(page);
	spin_unlock_irq(&address_space->tree_lock);

	put_swap_page(page, entry);
	page_ref_sub(page, hpage_nr_pages(page));
}

/*
 * If we are the only user, then try to free up the swap cache.
 *
 * It's ok to check for PageSwapCache without the page lock
 * here because we are going to recheck again inside
 * try_to_free_swap() _with_ the lock.
 * 					- Marcelo
 */
static inline void free_swap_cache(struct page *page)
{
	if (PageSwapCache(page) && !page_mapped(page) && trylock_page(page)) {
		try_to_free_swap(page);
		unlock_page(page);
	}
}

/*
 * Perform a free_page(), also freeing any swap cache associated with
 * this page if it is the last user of the page.
 */
void free_page_and_swap_cache(struct page *page)
{
	free_swap_cache(page);
	if (!is_huge_zero_page(page))
		put_page(page);
}

/*
 * Passed an array of pages, drop them all from swapcache and then release
 * them.  They are removed from the LRU and freed if this is their last use.
 */
void free_pages_and_swap_cache(struct page **pages, int nr)
{
	struct page **pagep = pages;
	int i;

	lru_add_drain();
	for (i = 0; i < nr; i++)
		free_swap_cache(pagep[i]);
	release_pages(pagep, nr, false);
}

/*
 * Lookup a swap entry in the swap cache. A found page will be returned
 * unlocked and with its refcount incremented - we rely on the kernel
 * lock getting page table operations atomic even if we drop the page
 * lock before returning.
 */
struct page *lookup_swap_cache(swp_entry_t entry, struct vm_area_struct *vma,
			       unsigned long addr)
{
	struct page *page;
	unsigned long ra_info;
	int win, hits, readahead;

	page = find_get_page(swap_address_space(entry), swp_offset(entry));

	INC_CACHE_INFO(find_total);
	if (page) {
		INC_CACHE_INFO(find_success);
		if (unlikely(PageTransCompound(page)))
			return page;
		readahead = TestClearPageReadahead(page);
		if (vma) {
			ra_info = GET_SWAP_RA_VAL(vma);
			win = SWAP_RA_WIN(ra_info);
			hits = SWAP_RA_HITS(ra_info);
			if (readahead)
				hits = min_t(int, hits + 1, SWAP_RA_HITS_MAX);
			atomic_long_set(&vma->swap_readahead_info,
					SWAP_RA_VAL(addr, win, hits));
		}
		if (readahead) {
			count_vm_event(SWAP_RA_HIT);
			if (!vma)
				atomic_inc(&swapin_readahead_hits);
		}
	}
	return page;
}
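
/*
 * __read_swap_cache_async - look up or allocate the page for a swap entry
 *
 * Returns the page for @entry if it is already in the swap cache.
 * Otherwise a new page is allocated, added to the swap cache and returned
 * locked with *new_page_allocated set to true, so the caller can start the
 * read.  Returns NULL if the entry is no longer in use or the allocation
 * failed.
 */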
struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
			struct vm_area_struct *vma, unsigned long addr,
			bool *new_page_allocated)
{
	struct page *found_page, *new_page = NULL;
	struct address_space *swapper_space = swap_address_space(entry);
	int err;
	*new_page_allocated = false;

	do {
		/*
		 * First check the swap cache.  Since this is normally
		 * called after lookup_swap_cache() failed, re-calling
		 * that would confuse statistics.
		 */
		found_page = find_get_page(swapper_space, swp_offset(entry));
		if (found_page)
			break;

		/*
		 * Just skip read ahead for unused swap slot.
		 * During swap_off when swap_slot_cache is disabled,
		 * we have to handle the race between putting
		 * swap entry in swap cache and marking swap slot
		 * as SWAP_HAS_CACHE.  That's done in later part of code or
		 * else swap_off will be aborted if we return NULL.
		 */
		if (!__swp_swapcount(entry) && swap_slot_cache_enabled)
			break;

		/*
		 * Get a new page to read into from swap.
		 */
		if (!new_page) {
			new_page = alloc_page_vma(gfp_mask, vma, addr);
			if (!new_page)
				break;		/* Out of memory */
		}

		/*
		 * call radix_tree_preload() while we can wait.
		 */
		err = radix_tree_maybe_preload(gfp_mask & GFP_KERNEL);
		if (err)
			break;

		/*
		 * Swap entry may have been freed since our caller observed it.
		 */
		err = swapcache_prepare(entry);
		if (err == -EEXIST) {
			radix_tree_preload_end();
			/*
			 * We might race against get_swap_page() and stumble
			 * across a SWAP_HAS_CACHE swap_map entry whose page
			 * has not been brought into the swapcache yet.
			 */
			cond_resched();
			continue;
		}
		if (err) {		/* swp entry is obsolete ? */
			radix_tree_preload_end();
			break;
		}

		/* May fail (-ENOMEM) if radix-tree node allocation failed. */
		__SetPageLocked(new_page);
		__SetPageSwapBacked(new_page);
		err = __add_to_swap_cache(new_page, entry);
		if (likely(!err)) {
			radix_tree_preload_end();
			/*
			 * Initiate read into locked page and return.
			 */
			lru_cache_add_anon(new_page);
			*new_page_allocated = true;
			return new_page;
		}
		radix_tree_preload_end();
		__ClearPageLocked(new_page);
		/*
		 * add_to_swap_cache() doesn't return -EEXIST, so we can safely
		 * clear SWAP_HAS_CACHE flag.
		 */
		put_swap_page(new_page, entry);
	} while (err != -ENOMEM);

	if (new_page)
		put_page(new_page);
	return found_page;
}

/*
 * Locate a page of swap in physical memory, reserving swap cache space
 * and reading the disk if it is not already cached.
 * A failure return means that either the page allocation failed or that
 * the swap entry is no longer in use.
 */
struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
		struct vm_area_struct *vma, unsigned long addr, bool do_poll)
{
	bool page_was_allocated;
	struct page *retpage = __read_swap_cache_async(entry, gfp_mask,
			vma, addr, &page_was_allocated);

	if (page_was_allocated)
		swap_readpage(retpage, do_poll);

	return retpage;
}
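
/*
 * Size the next readahead window from the number of readahead hits seen
 * since the last fault.  For example, 5 hits gives 5 + 2 = 7 pages, which
 * is rounded up to the next power of two (8); the result is then clamped
 * to max_pages and never allowed to drop below half of the previous
 * window, so the window grows quickly but shrinks gradually.  Used by
 * both the swap-offset based and the VMA based readahead paths.
 */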
static unsigned int __swapin_nr_pages(unsigned long prev_offset,
				      unsigned long offset,
				      int hits,
				      int max_pages,
				      int prev_win)
{
	unsigned int pages, last_ra;

	/*
	 * This heuristic has been found to work well on both sequential and
	 * random loads, swapping to hard disk or to SSD: please don't ask
	 * what the "+ 2" means, it just happens to work well, that's all.
	 */
	pages = hits + 2;
	if (pages == 2) {
		/*
		 * We can have no readahead hits to judge by: but must not get
		 * stuck here forever, so check for an adjacent offset instead
		 * (and don't even bother to check whether swap type is same).
		 */
		if (offset != prev_offset + 1 && offset != prev_offset - 1)
			pages = 1;
	} else {
		unsigned int roundup = 4;
		while (roundup < pages)
			roundup <<= 1;
		pages = roundup;
	}

	if (pages > max_pages)
		pages = max_pages;

	/* Don't shrink readahead too fast */
	last_ra = prev_win / 2;
	if (pages < last_ra)
		pages = last_ra;

	return pages;
}
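
/*
 * Readahead state for the swap-offset based path is kept globally rather
 * than per VMA: prev_offset remembers the last faulting offset seen with
 * no readahead hits, and last_readahead_pages remembers the previous
 * window so it is not shrunk too quickly on the next fault.
 */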
static unsigned long swapin_nr_pages(unsigned long offset)
{
	static unsigned long prev_offset;
	unsigned int hits, pages, max_pages;
	static atomic_t last_readahead_pages;

	max_pages = 1 << READ_ONCE(page_cluster);
	if (max_pages <= 1)
		return 1;

	hits = atomic_xchg(&swapin_readahead_hits, 0);
	pages = __swapin_nr_pages(prev_offset, offset, hits, max_pages,
				  atomic_read(&last_readahead_pages));
	if (!hits)
		prev_offset = offset;
	atomic_set(&last_readahead_pages, pages);

	return pages;
}

/**
 * swapin_readahead - swap in pages in hope we need them soon
 * @entry: swap entry of this memory
 * @gfp_mask: memory allocation flags
 * @vma: user vma this address belongs to
 * @addr: target address for mempolicy
 *
 * Returns the struct page for entry and addr, after queueing swapin.
 *
 * Primitive swap readahead code. We simply read an aligned block of
 * (1 << page_cluster) entries in the swap area. This method is chosen
 * because it doesn't cost us any seek time.  We also make sure to queue
 * the 'original' request together with the readahead ones...
 *
 * This has been extended to use the NUMA policies from the mm triggering
 * the readahead.
 *
 * Caller must hold down_read on the vma->vm_mm if vma is not NULL.
 */
struct page *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask,
			struct vm_area_struct *vma, unsigned long addr)
{
	struct page *page;
	unsigned long entry_offset = swp_offset(entry);
	unsigned long offset = entry_offset;
	unsigned long start_offset, end_offset;
	unsigned long mask;
	struct blk_plug plug;
	bool do_poll = true, page_allocated;

	mask = swapin_nr_pages(offset) - 1;
	if (!mask)
		goto skip;

	do_poll = false;
	/* Read a page_cluster sized and aligned cluster around offset. */
	start_offset = offset & ~mask;
	end_offset = offset | mask;
	if (!start_offset)	/* First page is swap header. */
		start_offset++;

	blk_start_plug(&plug);
	for (offset = start_offset; offset <= end_offset ; offset++) {
		/* Ok, do the async read-ahead now */
		page = __read_swap_cache_async(
			swp_entry(swp_type(entry), offset),
			gfp_mask, vma, addr, &page_allocated);
		if (!page)
			continue;
		if (page_allocated) {
			swap_readpage(page, false);
			if (offset != entry_offset &&
			    likely(!PageTransCompound(page))) {
				SetPageReadahead(page);
				count_vm_event(SWAP_RA);
			}
		}
		put_page(page);
	}
	blk_finish_plug(&plug);

	lru_add_drain();	/* Push any new pages onto the LRU now */
skip:
	return read_swap_cache_async(entry, gfp_mask, vma, addr, do_poll);
}

int init_swap_address_space(unsigned int type, unsigned long nr_pages)
{
	struct address_space *spaces, *space;
	unsigned int i, nr;

	nr = DIV_ROUND_UP(nr_pages, SWAP_ADDRESS_SPACE_PAGES);
	spaces = kvzalloc(sizeof(struct address_space) * nr, GFP_KERNEL);
	if (!spaces)
		return -ENOMEM;
	for (i = 0; i < nr; i++) {
		space = spaces + i;
		INIT_RADIX_TREE(&space->page_tree, GFP_ATOMIC|__GFP_NOWARN);
		atomic_set(&space->i_mmap_writable, 0);
		space->a_ops = &swap_aops;
		/* swap cache doesn't use writeback related tags */
		mapping_set_no_writeback_tags(space);
		spin_lock_init(&space->tree_lock);
	}
	nr_swapper_spaces[type] = nr;
	rcu_assign_pointer(swapper_spaces[type], spaces);

	return 0;
}

void exit_swap_address_space(unsigned int type)
{
	struct address_space *spaces;

	spaces = swapper_spaces[type];
	nr_swapper_spaces[type] = 0;
	rcu_assign_pointer(swapper_spaces[type], NULL);
	synchronize_rcu();
	kvfree(spaces);
}

static inline void swap_ra_clamp_pfn(struct vm_area_struct *vma,
				     unsigned long faddr,
				     unsigned long lpfn,
				     unsigned long rpfn,
				     unsigned long *start,
				     unsigned long *end)
{
	*start = max3(lpfn, PFN_DOWN(vma->vm_start),
		      PFN_DOWN(faddr & PMD_MASK));
	*end = min3(rpfn, PFN_DOWN(vma->vm_end),
		    PFN_DOWN((faddr & PMD_MASK) + PMD_SIZE));
}
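
/*
 * swap_readahead_detect - set up VMA based swap readahead for a fault
 *
 * If the faulting entry is already in the swap cache, the cached page is
 * returned.  Otherwise the per-VMA readahead state is used to size a
 * readahead window around the faulting address, clamped to the VMA and to
 * the PMD containing the fault, and the PTEs covering that window are made
 * available through @swap_ra for do_swap_page_readahead().  Returns NULL
 * when no page was found in the swap cache; swap_ra->win tells the caller
 * how big a window to read (1 means no readahead).
 */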
struct page *swap_readahead_detect(struct vm_fault *vmf,
				   struct vma_swap_readahead *swap_ra)
{
	struct vm_area_struct *vma = vmf->vma;
	unsigned long swap_ra_info;
	struct page *page;
	swp_entry_t entry;
	unsigned long faddr, pfn, fpfn;
	unsigned long start, end;
	pte_t *pte;
	unsigned int max_win, hits, prev_win, win, left;
#ifndef CONFIG_64BIT
	pte_t *tpte;
#endif

	faddr = vmf->address;
	entry = pte_to_swp_entry(vmf->orig_pte);
	if ((unlikely(non_swap_entry(entry))))
		return NULL;
	page = lookup_swap_cache(entry, vma, faddr);
	if (page)
		return page;

	max_win = 1 << READ_ONCE(swap_ra_max_order);
	if (max_win == 1) {
		swap_ra->win = 1;
		return NULL;
	}

	fpfn = PFN_DOWN(faddr);
	swap_ra_info = GET_SWAP_RA_VAL(vma);
	pfn = PFN_DOWN(SWAP_RA_ADDR(swap_ra_info));
	prev_win = SWAP_RA_WIN(swap_ra_info);
	hits = SWAP_RA_HITS(swap_ra_info);
	swap_ra->win = win = __swapin_nr_pages(pfn, fpfn, hits,
					       max_win, prev_win);
	atomic_long_set(&vma->swap_readahead_info,
			SWAP_RA_VAL(faddr, win, 0));

	if (win == 1)
		return NULL;

	/* Copy the PTEs because the page table may be unmapped */
	if (fpfn == pfn + 1)
		swap_ra_clamp_pfn(vma, faddr, fpfn, fpfn + win, &start, &end);
	else if (pfn == fpfn + 1)
		swap_ra_clamp_pfn(vma, faddr, fpfn - win + 1, fpfn + 1,
				  &start, &end);
	else {
		left = (win - 1) / 2;
		swap_ra_clamp_pfn(vma, faddr, fpfn - left, fpfn + win - left,
				  &start, &end);
	}
	swap_ra->nr_pte = end - start;
	swap_ra->offset = fpfn - start;
	pte = vmf->pte - swap_ra->offset;
#ifdef CONFIG_64BIT
	swap_ra->ptes = pte;
#else
	tpte = swap_ra->ptes;
	for (pfn = start; pfn != end; pfn++)
		*tpte++ = *pte++;
#endif

	return NULL;
}

struct page *do_swap_page_readahead(swp_entry_t fentry, gfp_t gfp_mask,
				    struct vm_fault *vmf,
				    struct vma_swap_readahead *swap_ra)
{
	struct blk_plug plug;
	struct vm_area_struct *vma = vmf->vma;
	struct page *page;
	pte_t *pte, pentry;
	swp_entry_t entry;
	unsigned int i;
	bool page_allocated;

	if (swap_ra->win == 1)
		goto skip;

	blk_start_plug(&plug);
	for (i = 0, pte = swap_ra->ptes; i < swap_ra->nr_pte;
	     i++, pte++) {
		pentry = *pte;
		if (pte_none(pentry))
			continue;
		if (pte_present(pentry))
			continue;
		entry = pte_to_swp_entry(pentry);
		if (unlikely(non_swap_entry(entry)))
			continue;
		page = __read_swap_cache_async(entry, gfp_mask, vma,
					       vmf->address, &page_allocated);
		if (!page)
			continue;
		if (page_allocated) {
			swap_readpage(page, false);
			if (i != swap_ra->offset &&
			    likely(!PageTransCompound(page))) {
				SetPageReadahead(page);
				count_vm_event(SWAP_RA);
			}
		}
		put_page(page);
	}
	blk_finish_plug(&plug);
	lru_add_drain();
skip:
	return read_swap_cache_async(fentry, gfp_mask, vma, vmf->address,
				     swap_ra->win == 1);
}
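
/*
 * VMA based swap readahead can be tuned at run time via two sysfs files
 * under the "swap" kobject created below (i.e. /sys/kernel/mm/swap/):
 * vma_ra_enabled turns it on or off, and vma_ra_max_order caps the
 * readahead window order.
 */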
"true" : "false"); 760 } 761 static ssize_t vma_ra_enabled_store(struct kobject *kobj, 762 struct kobj_attribute *attr, 763 const char *buf, size_t count) 764 { 765 if (!strncmp(buf, "true", 4) || !strncmp(buf, "1", 1)) 766 swap_vma_readahead = true; 767 else if (!strncmp(buf, "false", 5) || !strncmp(buf, "0", 1)) 768 swap_vma_readahead = false; 769 else 770 return -EINVAL; 771 772 return count; 773 } 774 static struct kobj_attribute vma_ra_enabled_attr = 775 __ATTR(vma_ra_enabled, 0644, vma_ra_enabled_show, 776 vma_ra_enabled_store); 777 778 static ssize_t vma_ra_max_order_show(struct kobject *kobj, 779 struct kobj_attribute *attr, char *buf) 780 { 781 return sprintf(buf, "%d\n", swap_ra_max_order); 782 } 783 static ssize_t vma_ra_max_order_store(struct kobject *kobj, 784 struct kobj_attribute *attr, 785 const char *buf, size_t count) 786 { 787 int err, v; 788 789 err = kstrtoint(buf, 10, &v); 790 if (err || v > SWAP_RA_ORDER_CEILING || v <= 0) 791 return -EINVAL; 792 793 swap_ra_max_order = v; 794 795 return count; 796 } 797 static struct kobj_attribute vma_ra_max_order_attr = 798 __ATTR(vma_ra_max_order, 0644, vma_ra_max_order_show, 799 vma_ra_max_order_store); 800 801 static struct attribute *swap_attrs[] = { 802 &vma_ra_enabled_attr.attr, 803 &vma_ra_max_order_attr.attr, 804 NULL, 805 }; 806 807 static struct attribute_group swap_attr_group = { 808 .attrs = swap_attrs, 809 }; 810 811 static int __init swap_init_sysfs(void) 812 { 813 int err; 814 struct kobject *swap_kobj; 815 816 swap_kobj = kobject_create_and_add("swap", mm_kobj); 817 if (!swap_kobj) { 818 pr_err("failed to create swap kobject\n"); 819 return -ENOMEM; 820 } 821 err = sysfs_create_group(swap_kobj, &swap_attr_group); 822 if (err) { 823 pr_err("failed to register swap group\n"); 824 goto delete_obj; 825 } 826 return 0; 827 828 delete_obj: 829 kobject_put(swap_kobj); 830 return err; 831 } 832 subsys_initcall(swap_init_sysfs); 833 #endif 834