1 // SPDX-License-Identifier: GPL-2.0 2 /* 3 * linux/mm/swap_state.c 4 * 5 * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds 6 * Swap reorganised 29.12.95, Stephen Tweedie 7 * 8 * Rewritten to use page cache, (C) 1998 Stephen Tweedie 9 */ 10 #include <linux/mm.h> 11 #include <linux/gfp.h> 12 #include <linux/kernel_stat.h> 13 #include <linux/swap.h> 14 #include <linux/swapops.h> 15 #include <linux/init.h> 16 #include <linux/pagemap.h> 17 #include <linux/backing-dev.h> 18 #include <linux/blkdev.h> 19 #include <linux/pagevec.h> 20 #include <linux/migrate.h> 21 #include <linux/vmalloc.h> 22 #include <linux/swap_slots.h> 23 #include <linux/huge_mm.h> 24 #include <linux/shmem_fs.h> 25 #include "internal.h" 26 #include "swap.h" 27 28 /* 29 * swapper_space is a fiction, retained to simplify the path through 30 * vmscan's shrink_page_list. 31 */ 32 static const struct address_space_operations swap_aops = { 33 .writepage = swap_writepage, 34 .dirty_folio = noop_dirty_folio, 35 #ifdef CONFIG_MIGRATION 36 .migrate_folio = migrate_folio, 37 #endif 38 }; 39 40 struct address_space *swapper_spaces[MAX_SWAPFILES] __read_mostly; 41 static unsigned int nr_swapper_spaces[MAX_SWAPFILES] __read_mostly; 42 static bool enable_vma_readahead __read_mostly = true; 43 44 #define SWAP_RA_WIN_SHIFT (PAGE_SHIFT / 2) 45 #define SWAP_RA_HITS_MASK ((1UL << SWAP_RA_WIN_SHIFT) - 1) 46 #define SWAP_RA_HITS_MAX SWAP_RA_HITS_MASK 47 #define SWAP_RA_WIN_MASK (~PAGE_MASK & ~SWAP_RA_HITS_MASK) 48 49 #define SWAP_RA_HITS(v) ((v) & SWAP_RA_HITS_MASK) 50 #define SWAP_RA_WIN(v) (((v) & SWAP_RA_WIN_MASK) >> SWAP_RA_WIN_SHIFT) 51 #define SWAP_RA_ADDR(v) ((v) & PAGE_MASK) 52 53 #define SWAP_RA_VAL(addr, win, hits) \ 54 (((addr) & PAGE_MASK) | \ 55 (((win) << SWAP_RA_WIN_SHIFT) & SWAP_RA_WIN_MASK) | \ 56 ((hits) & SWAP_RA_HITS_MASK)) 57 58 /* Initial readahead hits is 4 to start up with a small window */ 59 #define GET_SWAP_RA_VAL(vma) \ 60 (atomic_long_read(&(vma)->swap_readahead_info) ? : 4) 61 62 static atomic_t swapin_readahead_hits = ATOMIC_INIT(4); 63 64 void show_swap_cache_info(void) 65 { 66 printk("%lu pages in swap cache\n", total_swapcache_pages()); 67 printk("Free swap = %ldkB\n", 68 get_nr_swap_pages() << (PAGE_SHIFT - 10)); 69 printk("Total swap = %lukB\n", total_swap_pages << (PAGE_SHIFT - 10)); 70 } 71 72 void *get_shadow_from_swap_cache(swp_entry_t entry) 73 { 74 struct address_space *address_space = swap_address_space(entry); 75 pgoff_t idx = swp_offset(entry); 76 struct page *page; 77 78 page = xa_load(&address_space->i_pages, idx); 79 if (xa_is_value(page)) 80 return page; 81 return NULL; 82 } 83 84 /* 85 * add_to_swap_cache resembles filemap_add_folio on swapper_space, 86 * but sets SwapCache flag and private instead of mapping and index. 87 */ 88 int add_to_swap_cache(struct folio *folio, swp_entry_t entry, 89 gfp_t gfp, void **shadowp) 90 { 91 struct address_space *address_space = swap_address_space(entry); 92 pgoff_t idx = swp_offset(entry); 93 XA_STATE_ORDER(xas, &address_space->i_pages, idx, folio_order(folio)); 94 unsigned long i, nr = folio_nr_pages(folio); 95 void *old; 96 97 VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); 98 VM_BUG_ON_FOLIO(folio_test_swapcache(folio), folio); 99 VM_BUG_ON_FOLIO(!folio_test_swapbacked(folio), folio); 100 101 folio_ref_add(folio, nr); 102 folio_set_swapcache(folio); 103 104 do { 105 xas_lock_irq(&xas); 106 xas_create_range(&xas); 107 if (xas_error(&xas)) 108 goto unlock; 109 for (i = 0; i < nr; i++) { 110 VM_BUG_ON_FOLIO(xas.xa_index != idx + i, folio); 111 old = xas_load(&xas); 112 if (xa_is_value(old)) { 113 if (shadowp) 114 *shadowp = old; 115 } 116 set_page_private(folio_page(folio, i), entry.val + i); 117 xas_store(&xas, folio); 118 xas_next(&xas); 119 } 120 address_space->nrpages += nr; 121 __node_stat_mod_folio(folio, NR_FILE_PAGES, nr); 122 __lruvec_stat_mod_folio(folio, NR_SWAPCACHE, nr); 123 unlock: 124 xas_unlock_irq(&xas); 125 } while (xas_nomem(&xas, gfp)); 126 127 if (!xas_error(&xas)) 128 return 0; 129 130 folio_clear_swapcache(folio); 131 folio_ref_sub(folio, nr); 132 return xas_error(&xas); 133 } 134 135 /* 136 * This must be called only on folios that have 137 * been verified to be in the swap cache. 138 */ 139 void __delete_from_swap_cache(struct folio *folio, 140 swp_entry_t entry, void *shadow) 141 { 142 struct address_space *address_space = swap_address_space(entry); 143 int i; 144 long nr = folio_nr_pages(folio); 145 pgoff_t idx = swp_offset(entry); 146 XA_STATE(xas, &address_space->i_pages, idx); 147 148 VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); 149 VM_BUG_ON_FOLIO(!folio_test_swapcache(folio), folio); 150 VM_BUG_ON_FOLIO(folio_test_writeback(folio), folio); 151 152 for (i = 0; i < nr; i++) { 153 void *entry = xas_store(&xas, shadow); 154 VM_BUG_ON_PAGE(entry != folio, entry); 155 set_page_private(folio_page(folio, i), 0); 156 xas_next(&xas); 157 } 158 folio_clear_swapcache(folio); 159 address_space->nrpages -= nr; 160 __node_stat_mod_folio(folio, NR_FILE_PAGES, -nr); 161 __lruvec_stat_mod_folio(folio, NR_SWAPCACHE, -nr); 162 } 163 164 /** 165 * add_to_swap - allocate swap space for a folio 166 * @folio: folio we want to move to swap 167 * 168 * Allocate swap space for the folio and add the folio to the 169 * swap cache. 170 * 171 * Context: Caller needs to hold the folio lock. 172 * Return: Whether the folio was added to the swap cache. 173 */ 174 bool add_to_swap(struct folio *folio) 175 { 176 swp_entry_t entry; 177 int err; 178 179 VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); 180 VM_BUG_ON_FOLIO(!folio_test_uptodate(folio), folio); 181 182 entry = folio_alloc_swap(folio); 183 if (!entry.val) 184 return false; 185 186 /* 187 * XArray node allocations from PF_MEMALLOC contexts could 188 * completely exhaust the page allocator. __GFP_NOMEMALLOC 189 * stops emergency reserves from being allocated. 190 * 191 * TODO: this could cause a theoretical memory reclaim 192 * deadlock in the swap out path. 193 */ 194 /* 195 * Add it to the swap cache. 196 */ 197 err = add_to_swap_cache(folio, entry, 198 __GFP_HIGH|__GFP_NOMEMALLOC|__GFP_NOWARN, NULL); 199 if (err) 200 /* 201 * add_to_swap_cache() doesn't return -EEXIST, so we can safely 202 * clear SWAP_HAS_CACHE flag. 203 */ 204 goto fail; 205 /* 206 * Normally the folio will be dirtied in unmap because its 207 * pte should be dirty. A special case is MADV_FREE page. The 208 * page's pte could have dirty bit cleared but the folio's 209 * SwapBacked flag is still set because clearing the dirty bit 210 * and SwapBacked flag has no lock protected. For such folio, 211 * unmap will not set dirty bit for it, so folio reclaim will 212 * not write the folio out. This can cause data corruption when 213 * the folio is swapped in later. Always setting the dirty flag 214 * for the folio solves the problem. 215 */ 216 folio_mark_dirty(folio); 217 218 return true; 219 220 fail: 221 put_swap_folio(folio, entry); 222 return false; 223 } 224 225 /* 226 * This must be called only on folios that have 227 * been verified to be in the swap cache and locked. 228 * It will never put the folio into the free list, 229 * the caller has a reference on the folio. 230 */ 231 void delete_from_swap_cache(struct folio *folio) 232 { 233 swp_entry_t entry = folio_swap_entry(folio); 234 struct address_space *address_space = swap_address_space(entry); 235 236 xa_lock_irq(&address_space->i_pages); 237 __delete_from_swap_cache(folio, entry, NULL); 238 xa_unlock_irq(&address_space->i_pages); 239 240 put_swap_folio(folio, entry); 241 folio_ref_sub(folio, folio_nr_pages(folio)); 242 } 243 244 void clear_shadow_from_swap_cache(int type, unsigned long begin, 245 unsigned long end) 246 { 247 unsigned long curr = begin; 248 void *old; 249 250 for (;;) { 251 swp_entry_t entry = swp_entry(type, curr); 252 struct address_space *address_space = swap_address_space(entry); 253 XA_STATE(xas, &address_space->i_pages, curr); 254 255 xa_lock_irq(&address_space->i_pages); 256 xas_for_each(&xas, old, end) { 257 if (!xa_is_value(old)) 258 continue; 259 xas_store(&xas, NULL); 260 } 261 xa_unlock_irq(&address_space->i_pages); 262 263 /* search the next swapcache until we meet end */ 264 curr >>= SWAP_ADDRESS_SPACE_SHIFT; 265 curr++; 266 curr <<= SWAP_ADDRESS_SPACE_SHIFT; 267 if (curr > end) 268 break; 269 } 270 } 271 272 /* 273 * If we are the only user, then try to free up the swap cache. 274 * 275 * Its ok to check the swapcache flag without the folio lock 276 * here because we are going to recheck again inside 277 * folio_free_swap() _with_ the lock. 278 * - Marcelo 279 */ 280 void free_swap_cache(struct page *page) 281 { 282 struct folio *folio = page_folio(page); 283 284 if (folio_test_swapcache(folio) && !folio_mapped(folio) && 285 folio_trylock(folio)) { 286 folio_free_swap(folio); 287 folio_unlock(folio); 288 } 289 } 290 291 /* 292 * Perform a free_page(), also freeing any swap cache associated with 293 * this page if it is the last user of the page. 294 */ 295 void free_page_and_swap_cache(struct page *page) 296 { 297 free_swap_cache(page); 298 if (!is_huge_zero_page(page)) 299 put_page(page); 300 } 301 302 /* 303 * Passed an array of pages, drop them all from swapcache and then release 304 * them. They are removed from the LRU and freed if this is their last use. 305 */ 306 void free_pages_and_swap_cache(struct page **pages, int nr) 307 { 308 struct page **pagep = pages; 309 int i; 310 311 lru_add_drain(); 312 for (i = 0; i < nr; i++) 313 free_swap_cache(pagep[i]); 314 release_pages(pagep, nr); 315 } 316 317 static inline bool swap_use_vma_readahead(void) 318 { 319 return READ_ONCE(enable_vma_readahead) && !atomic_read(&nr_rotate_swap); 320 } 321 322 /* 323 * Lookup a swap entry in the swap cache. A found folio will be returned 324 * unlocked and with its refcount incremented - we rely on the kernel 325 * lock getting page table operations atomic even if we drop the folio 326 * lock before returning. 327 */ 328 struct folio *swap_cache_get_folio(swp_entry_t entry, 329 struct vm_area_struct *vma, unsigned long addr) 330 { 331 struct folio *folio; 332 struct swap_info_struct *si; 333 334 si = get_swap_device(entry); 335 if (!si) 336 return NULL; 337 folio = filemap_get_folio(swap_address_space(entry), swp_offset(entry)); 338 put_swap_device(si); 339 340 if (folio) { 341 bool vma_ra = swap_use_vma_readahead(); 342 bool readahead; 343 344 /* 345 * At the moment, we don't support PG_readahead for anon THP 346 * so let's bail out rather than confusing the readahead stat. 347 */ 348 if (unlikely(folio_test_large(folio))) 349 return folio; 350 351 readahead = folio_test_clear_readahead(folio); 352 if (vma && vma_ra) { 353 unsigned long ra_val; 354 int win, hits; 355 356 ra_val = GET_SWAP_RA_VAL(vma); 357 win = SWAP_RA_WIN(ra_val); 358 hits = SWAP_RA_HITS(ra_val); 359 if (readahead) 360 hits = min_t(int, hits + 1, SWAP_RA_HITS_MAX); 361 atomic_long_set(&vma->swap_readahead_info, 362 SWAP_RA_VAL(addr, win, hits)); 363 } 364 365 if (readahead) { 366 count_vm_event(SWAP_RA_HIT); 367 if (!vma || !vma_ra) 368 atomic_inc(&swapin_readahead_hits); 369 } 370 } 371 372 return folio; 373 } 374 375 /** 376 * find_get_incore_page - Find and get a page from the page or swap caches. 377 * @mapping: The address_space to search. 378 * @index: The page cache index. 379 * 380 * This differs from find_get_page() in that it will also look for the 381 * page in the swap cache. 382 * 383 * Return: The found page or %NULL. 384 */ 385 struct page *find_get_incore_page(struct address_space *mapping, pgoff_t index) 386 { 387 swp_entry_t swp; 388 struct swap_info_struct *si; 389 struct page *page = pagecache_get_page(mapping, index, 390 FGP_ENTRY | FGP_HEAD, 0); 391 392 if (!page) 393 return page; 394 if (!xa_is_value(page)) 395 return find_subpage(page, index); 396 if (!shmem_mapping(mapping)) 397 return NULL; 398 399 swp = radix_to_swp_entry(page); 400 /* There might be swapin error entries in shmem mapping. */ 401 if (non_swap_entry(swp)) 402 return NULL; 403 /* Prevent swapoff from happening to us */ 404 si = get_swap_device(swp); 405 if (!si) 406 return NULL; 407 page = find_get_page(swap_address_space(swp), swp_offset(swp)); 408 put_swap_device(si); 409 return page; 410 } 411 412 struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, 413 struct vm_area_struct *vma, unsigned long addr, 414 bool *new_page_allocated) 415 { 416 struct swap_info_struct *si; 417 struct folio *folio; 418 void *shadow = NULL; 419 420 *new_page_allocated = false; 421 422 for (;;) { 423 int err; 424 /* 425 * First check the swap cache. Since this is normally 426 * called after swap_cache_get_folio() failed, re-calling 427 * that would confuse statistics. 428 */ 429 si = get_swap_device(entry); 430 if (!si) 431 return NULL; 432 folio = filemap_get_folio(swap_address_space(entry), 433 swp_offset(entry)); 434 put_swap_device(si); 435 if (folio) 436 return folio_file_page(folio, swp_offset(entry)); 437 438 /* 439 * Just skip read ahead for unused swap slot. 440 * During swap_off when swap_slot_cache is disabled, 441 * we have to handle the race between putting 442 * swap entry in swap cache and marking swap slot 443 * as SWAP_HAS_CACHE. That's done in later part of code or 444 * else swap_off will be aborted if we return NULL. 445 */ 446 if (!__swp_swapcount(entry) && swap_slot_cache_enabled) 447 return NULL; 448 449 /* 450 * Get a new page to read into from swap. Allocate it now, 451 * before marking swap_map SWAP_HAS_CACHE, when -EEXIST will 452 * cause any racers to loop around until we add it to cache. 453 */ 454 folio = vma_alloc_folio(gfp_mask, 0, vma, addr, false); 455 if (!folio) 456 return NULL; 457 458 /* 459 * Swap entry may have been freed since our caller observed it. 460 */ 461 err = swapcache_prepare(entry); 462 if (!err) 463 break; 464 465 folio_put(folio); 466 if (err != -EEXIST) 467 return NULL; 468 469 /* 470 * We might race against __delete_from_swap_cache(), and 471 * stumble across a swap_map entry whose SWAP_HAS_CACHE 472 * has not yet been cleared. Or race against another 473 * __read_swap_cache_async(), which has set SWAP_HAS_CACHE 474 * in swap_map, but not yet added its page to swap cache. 475 */ 476 schedule_timeout_uninterruptible(1); 477 } 478 479 /* 480 * The swap entry is ours to swap in. Prepare the new page. 481 */ 482 483 __folio_set_locked(folio); 484 __folio_set_swapbacked(folio); 485 486 if (mem_cgroup_swapin_charge_folio(folio, NULL, gfp_mask, entry)) 487 goto fail_unlock; 488 489 /* May fail (-ENOMEM) if XArray node allocation failed. */ 490 if (add_to_swap_cache(folio, entry, gfp_mask & GFP_RECLAIM_MASK, &shadow)) 491 goto fail_unlock; 492 493 mem_cgroup_swapin_uncharge_swap(entry); 494 495 if (shadow) 496 workingset_refault(folio, shadow); 497 498 /* Caller will initiate read into locked folio */ 499 folio_add_lru(folio); 500 *new_page_allocated = true; 501 return &folio->page; 502 503 fail_unlock: 504 put_swap_folio(folio, entry); 505 folio_unlock(folio); 506 folio_put(folio); 507 return NULL; 508 } 509 510 /* 511 * Locate a page of swap in physical memory, reserving swap cache space 512 * and reading the disk if it is not already cached. 513 * A failure return means that either the page allocation failed or that 514 * the swap entry is no longer in use. 515 */ 516 struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, 517 struct vm_area_struct *vma, 518 unsigned long addr, bool do_poll, 519 struct swap_iocb **plug) 520 { 521 bool page_was_allocated; 522 struct page *retpage = __read_swap_cache_async(entry, gfp_mask, 523 vma, addr, &page_was_allocated); 524 525 if (page_was_allocated) 526 swap_readpage(retpage, do_poll, plug); 527 528 return retpage; 529 } 530 531 static unsigned int __swapin_nr_pages(unsigned long prev_offset, 532 unsigned long offset, 533 int hits, 534 int max_pages, 535 int prev_win) 536 { 537 unsigned int pages, last_ra; 538 539 /* 540 * This heuristic has been found to work well on both sequential and 541 * random loads, swapping to hard disk or to SSD: please don't ask 542 * what the "+ 2" means, it just happens to work well, that's all. 543 */ 544 pages = hits + 2; 545 if (pages == 2) { 546 /* 547 * We can have no readahead hits to judge by: but must not get 548 * stuck here forever, so check for an adjacent offset instead 549 * (and don't even bother to check whether swap type is same). 550 */ 551 if (offset != prev_offset + 1 && offset != prev_offset - 1) 552 pages = 1; 553 } else { 554 unsigned int roundup = 4; 555 while (roundup < pages) 556 roundup <<= 1; 557 pages = roundup; 558 } 559 560 if (pages > max_pages) 561 pages = max_pages; 562 563 /* Don't shrink readahead too fast */ 564 last_ra = prev_win / 2; 565 if (pages < last_ra) 566 pages = last_ra; 567 568 return pages; 569 } 570 571 static unsigned long swapin_nr_pages(unsigned long offset) 572 { 573 static unsigned long prev_offset; 574 unsigned int hits, pages, max_pages; 575 static atomic_t last_readahead_pages; 576 577 max_pages = 1 << READ_ONCE(page_cluster); 578 if (max_pages <= 1) 579 return 1; 580 581 hits = atomic_xchg(&swapin_readahead_hits, 0); 582 pages = __swapin_nr_pages(READ_ONCE(prev_offset), offset, hits, 583 max_pages, 584 atomic_read(&last_readahead_pages)); 585 if (!hits) 586 WRITE_ONCE(prev_offset, offset); 587 atomic_set(&last_readahead_pages, pages); 588 589 return pages; 590 } 591 592 /** 593 * swap_cluster_readahead - swap in pages in hope we need them soon 594 * @entry: swap entry of this memory 595 * @gfp_mask: memory allocation flags 596 * @vmf: fault information 597 * 598 * Returns the struct page for entry and addr, after queueing swapin. 599 * 600 * Primitive swap readahead code. We simply read an aligned block of 601 * (1 << page_cluster) entries in the swap area. This method is chosen 602 * because it doesn't cost us any seek time. We also make sure to queue 603 * the 'original' request together with the readahead ones... 604 * 605 * This has been extended to use the NUMA policies from the mm triggering 606 * the readahead. 607 * 608 * Caller must hold read mmap_lock if vmf->vma is not NULL. 609 */ 610 struct page *swap_cluster_readahead(swp_entry_t entry, gfp_t gfp_mask, 611 struct vm_fault *vmf) 612 { 613 struct page *page; 614 unsigned long entry_offset = swp_offset(entry); 615 unsigned long offset = entry_offset; 616 unsigned long start_offset, end_offset; 617 unsigned long mask; 618 struct swap_info_struct *si = swp_swap_info(entry); 619 struct blk_plug plug; 620 struct swap_iocb *splug = NULL; 621 bool do_poll = true, page_allocated; 622 struct vm_area_struct *vma = vmf->vma; 623 unsigned long addr = vmf->address; 624 625 mask = swapin_nr_pages(offset) - 1; 626 if (!mask) 627 goto skip; 628 629 do_poll = false; 630 /* Read a page_cluster sized and aligned cluster around offset. */ 631 start_offset = offset & ~mask; 632 end_offset = offset | mask; 633 if (!start_offset) /* First page is swap header. */ 634 start_offset++; 635 if (end_offset >= si->max) 636 end_offset = si->max - 1; 637 638 blk_start_plug(&plug); 639 for (offset = start_offset; offset <= end_offset ; offset++) { 640 /* Ok, do the async read-ahead now */ 641 page = __read_swap_cache_async( 642 swp_entry(swp_type(entry), offset), 643 gfp_mask, vma, addr, &page_allocated); 644 if (!page) 645 continue; 646 if (page_allocated) { 647 swap_readpage(page, false, &splug); 648 if (offset != entry_offset) { 649 SetPageReadahead(page); 650 count_vm_event(SWAP_RA); 651 } 652 } 653 put_page(page); 654 } 655 blk_finish_plug(&plug); 656 swap_read_unplug(splug); 657 658 lru_add_drain(); /* Push any new pages onto the LRU now */ 659 skip: 660 /* The page was likely read above, so no need for plugging here */ 661 return read_swap_cache_async(entry, gfp_mask, vma, addr, do_poll, NULL); 662 } 663 664 int init_swap_address_space(unsigned int type, unsigned long nr_pages) 665 { 666 struct address_space *spaces, *space; 667 unsigned int i, nr; 668 669 nr = DIV_ROUND_UP(nr_pages, SWAP_ADDRESS_SPACE_PAGES); 670 spaces = kvcalloc(nr, sizeof(struct address_space), GFP_KERNEL); 671 if (!spaces) 672 return -ENOMEM; 673 for (i = 0; i < nr; i++) { 674 space = spaces + i; 675 xa_init_flags(&space->i_pages, XA_FLAGS_LOCK_IRQ); 676 atomic_set(&space->i_mmap_writable, 0); 677 space->a_ops = &swap_aops; 678 /* swap cache doesn't use writeback related tags */ 679 mapping_set_no_writeback_tags(space); 680 } 681 nr_swapper_spaces[type] = nr; 682 swapper_spaces[type] = spaces; 683 684 return 0; 685 } 686 687 void exit_swap_address_space(unsigned int type) 688 { 689 int i; 690 struct address_space *spaces = swapper_spaces[type]; 691 692 for (i = 0; i < nr_swapper_spaces[type]; i++) 693 VM_WARN_ON_ONCE(!mapping_empty(&spaces[i])); 694 kvfree(spaces); 695 nr_swapper_spaces[type] = 0; 696 swapper_spaces[type] = NULL; 697 } 698 699 static inline void swap_ra_clamp_pfn(struct vm_area_struct *vma, 700 unsigned long faddr, 701 unsigned long lpfn, 702 unsigned long rpfn, 703 unsigned long *start, 704 unsigned long *end) 705 { 706 *start = max3(lpfn, PFN_DOWN(vma->vm_start), 707 PFN_DOWN(faddr & PMD_MASK)); 708 *end = min3(rpfn, PFN_DOWN(vma->vm_end), 709 PFN_DOWN((faddr & PMD_MASK) + PMD_SIZE)); 710 } 711 712 static void swap_ra_info(struct vm_fault *vmf, 713 struct vma_swap_readahead *ra_info) 714 { 715 struct vm_area_struct *vma = vmf->vma; 716 unsigned long ra_val; 717 unsigned long faddr, pfn, fpfn; 718 unsigned long start, end; 719 pte_t *pte, *orig_pte; 720 unsigned int max_win, hits, prev_win, win, left; 721 #ifndef CONFIG_64BIT 722 pte_t *tpte; 723 #endif 724 725 max_win = 1 << min_t(unsigned int, READ_ONCE(page_cluster), 726 SWAP_RA_ORDER_CEILING); 727 if (max_win == 1) { 728 ra_info->win = 1; 729 return; 730 } 731 732 faddr = vmf->address; 733 orig_pte = pte = pte_offset_map(vmf->pmd, faddr); 734 735 fpfn = PFN_DOWN(faddr); 736 ra_val = GET_SWAP_RA_VAL(vma); 737 pfn = PFN_DOWN(SWAP_RA_ADDR(ra_val)); 738 prev_win = SWAP_RA_WIN(ra_val); 739 hits = SWAP_RA_HITS(ra_val); 740 ra_info->win = win = __swapin_nr_pages(pfn, fpfn, hits, 741 max_win, prev_win); 742 atomic_long_set(&vma->swap_readahead_info, 743 SWAP_RA_VAL(faddr, win, 0)); 744 745 if (win == 1) { 746 pte_unmap(orig_pte); 747 return; 748 } 749 750 /* Copy the PTEs because the page table may be unmapped */ 751 if (fpfn == pfn + 1) 752 swap_ra_clamp_pfn(vma, faddr, fpfn, fpfn + win, &start, &end); 753 else if (pfn == fpfn + 1) 754 swap_ra_clamp_pfn(vma, faddr, fpfn - win + 1, fpfn + 1, 755 &start, &end); 756 else { 757 left = (win - 1) / 2; 758 swap_ra_clamp_pfn(vma, faddr, fpfn - left, fpfn + win - left, 759 &start, &end); 760 } 761 ra_info->nr_pte = end - start; 762 ra_info->offset = fpfn - start; 763 pte -= ra_info->offset; 764 #ifdef CONFIG_64BIT 765 ra_info->ptes = pte; 766 #else 767 tpte = ra_info->ptes; 768 for (pfn = start; pfn != end; pfn++) 769 *tpte++ = *pte++; 770 #endif 771 pte_unmap(orig_pte); 772 } 773 774 /** 775 * swap_vma_readahead - swap in pages in hope we need them soon 776 * @fentry: swap entry of this memory 777 * @gfp_mask: memory allocation flags 778 * @vmf: fault information 779 * 780 * Returns the struct page for entry and addr, after queueing swapin. 781 * 782 * Primitive swap readahead code. We simply read in a few pages whose 783 * virtual addresses are around the fault address in the same vma. 784 * 785 * Caller must hold read mmap_lock if vmf->vma is not NULL. 786 * 787 */ 788 static struct page *swap_vma_readahead(swp_entry_t fentry, gfp_t gfp_mask, 789 struct vm_fault *vmf) 790 { 791 struct blk_plug plug; 792 struct swap_iocb *splug = NULL; 793 struct vm_area_struct *vma = vmf->vma; 794 struct page *page; 795 pte_t *pte, pentry; 796 swp_entry_t entry; 797 unsigned int i; 798 bool page_allocated; 799 struct vma_swap_readahead ra_info = { 800 .win = 1, 801 }; 802 803 swap_ra_info(vmf, &ra_info); 804 if (ra_info.win == 1) 805 goto skip; 806 807 blk_start_plug(&plug); 808 for (i = 0, pte = ra_info.ptes; i < ra_info.nr_pte; 809 i++, pte++) { 810 pentry = *pte; 811 if (!is_swap_pte(pentry)) 812 continue; 813 entry = pte_to_swp_entry(pentry); 814 if (unlikely(non_swap_entry(entry))) 815 continue; 816 page = __read_swap_cache_async(entry, gfp_mask, vma, 817 vmf->address, &page_allocated); 818 if (!page) 819 continue; 820 if (page_allocated) { 821 swap_readpage(page, false, &splug); 822 if (i != ra_info.offset) { 823 SetPageReadahead(page); 824 count_vm_event(SWAP_RA); 825 } 826 } 827 put_page(page); 828 } 829 blk_finish_plug(&plug); 830 swap_read_unplug(splug); 831 lru_add_drain(); 832 skip: 833 /* The page was likely read above, so no need for plugging here */ 834 return read_swap_cache_async(fentry, gfp_mask, vma, vmf->address, 835 ra_info.win == 1, NULL); 836 } 837 838 /** 839 * swapin_readahead - swap in pages in hope we need them soon 840 * @entry: swap entry of this memory 841 * @gfp_mask: memory allocation flags 842 * @vmf: fault information 843 * 844 * Returns the struct page for entry and addr, after queueing swapin. 845 * 846 * It's a main entry function for swap readahead. By the configuration, 847 * it will read ahead blocks by cluster-based(ie, physical disk based) 848 * or vma-based(ie, virtual address based on faulty address) readahead. 849 */ 850 struct page *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask, 851 struct vm_fault *vmf) 852 { 853 return swap_use_vma_readahead() ? 854 swap_vma_readahead(entry, gfp_mask, vmf) : 855 swap_cluster_readahead(entry, gfp_mask, vmf); 856 } 857 858 #ifdef CONFIG_SYSFS 859 static ssize_t vma_ra_enabled_show(struct kobject *kobj, 860 struct kobj_attribute *attr, char *buf) 861 { 862 return sysfs_emit(buf, "%s\n", 863 enable_vma_readahead ? "true" : "false"); 864 } 865 static ssize_t vma_ra_enabled_store(struct kobject *kobj, 866 struct kobj_attribute *attr, 867 const char *buf, size_t count) 868 { 869 ssize_t ret; 870 871 ret = kstrtobool(buf, &enable_vma_readahead); 872 if (ret) 873 return ret; 874 875 return count; 876 } 877 static struct kobj_attribute vma_ra_enabled_attr = __ATTR_RW(vma_ra_enabled); 878 879 static struct attribute *swap_attrs[] = { 880 &vma_ra_enabled_attr.attr, 881 NULL, 882 }; 883 884 static const struct attribute_group swap_attr_group = { 885 .attrs = swap_attrs, 886 }; 887 888 static int __init swap_init_sysfs(void) 889 { 890 int err; 891 struct kobject *swap_kobj; 892 893 swap_kobj = kobject_create_and_add("swap", mm_kobj); 894 if (!swap_kobj) { 895 pr_err("failed to create swap kobject\n"); 896 return -ENOMEM; 897 } 898 err = sysfs_create_group(swap_kobj, &swap_attr_group); 899 if (err) { 900 pr_err("failed to register swap group\n"); 901 goto delete_obj; 902 } 903 return 0; 904 905 delete_obj: 906 kobject_put(swap_kobj); 907 return err; 908 } 909 subsys_initcall(swap_init_sysfs); 910 #endif 911