/*
 *  linux/mm/swap_state.c
 *
 *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
 *  Swap reorganised 29.12.95, Stephen Tweedie
 *
 *  Rewritten to use page cache, (C) 1998 Stephen Tweedie
 */
#include <linux/mm.h>
#include <linux/gfp.h>
#include <linux/kernel_stat.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/init.h>
#include <linux/pagemap.h>
#include <linux/backing-dev.h>
#include <linux/blkdev.h>
#include <linux/pagevec.h>
#include <linux/migrate.h>
#include <linux/vmalloc.h>
#include <linux/swap_slots.h>
#include <linux/huge_mm.h>

#include <asm/pgtable.h>

/*
 * swapper_space is a fiction, retained to simplify the path through
 * vmscan's shrink_page_list.
 */
static const struct address_space_operations swap_aops = {
	.writepage	= swap_writepage,
	.set_page_dirty	= swap_set_page_dirty,
#ifdef CONFIG_MIGRATION
	.migratepage	= migrate_page,
#endif
};

struct address_space *swapper_spaces[MAX_SWAPFILES];
static unsigned int nr_swapper_spaces[MAX_SWAPFILES];

#define INC_CACHE_INFO(x)	do { swap_cache_info.x++; } while (0)
#define ADD_CACHE_INFO(x, nr)	do { swap_cache_info.x += (nr); } while (0)

static struct {
	unsigned long add_total;
	unsigned long del_total;
	unsigned long find_success;
	unsigned long find_total;
} swap_cache_info;

unsigned long total_swapcache_pages(void)
{
	unsigned int i, j, nr;
	unsigned long ret = 0;
	struct address_space *spaces;

	rcu_read_lock();
	for (i = 0; i < MAX_SWAPFILES; i++) {
		/*
		 * The corresponding entries in nr_swapper_spaces and
		 * swapper_spaces will be reused only after at least
		 * one grace period, so they can never belong to
		 * different usages.
		 */
		nr = nr_swapper_spaces[i];
		spaces = rcu_dereference(swapper_spaces[i]);
		if (!nr || !spaces)
			continue;
		for (j = 0; j < nr; j++)
			ret += spaces[j].nrpages;
	}
	rcu_read_unlock();
	return ret;
}

static atomic_t swapin_readahead_hits = ATOMIC_INIT(4);

void show_swap_cache_info(void)
{
	printk("%lu pages in swap cache\n", total_swapcache_pages());
	printk("Swap cache stats: add %lu, delete %lu, find %lu/%lu\n",
		swap_cache_info.add_total, swap_cache_info.del_total,
		swap_cache_info.find_success, swap_cache_info.find_total);
	printk("Free swap  = %ldkB\n",
		get_nr_swap_pages() << (PAGE_SHIFT - 10));
	printk("Total swap = %lukB\n", total_swap_pages << (PAGE_SHIFT - 10));
}
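
/*
 * Illustrative sketch (not compiled): total_swapcache_pages() above walks an
 * array of address_spaces per swap type because the swap cache is sharded.
 * Assuming the usual swap_address_space() helper and
 * SWAP_ADDRESS_SPACE_SHIFT from <linux/swap.h>, a swap entry maps to its
 * shard roughly as follows.
 */
#if 0
static struct address_space *example_space_for(swp_entry_t entry)
{
	/* One shard per SWAP_ADDRESS_SPACE_PAGES worth of swap slots. */
	return &swapper_spaces[swp_type(entry)]
			      [swp_offset(entry) >> SWAP_ADDRESS_SPACE_SHIFT];
}
#endif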

/*
 * __add_to_swap_cache resembles add_to_page_cache_locked on swapper_space,
 * but sets SwapCache flag and private instead of mapping and index.
 */
int __add_to_swap_cache(struct page *page, swp_entry_t entry)
{
	int error, i, nr = hpage_nr_pages(page);
	struct address_space *address_space;
	pgoff_t idx = swp_offset(entry);

	VM_BUG_ON_PAGE(!PageLocked(page), page);
	VM_BUG_ON_PAGE(PageSwapCache(page), page);
	VM_BUG_ON_PAGE(!PageSwapBacked(page), page);

	page_ref_add(page, nr);
	SetPageSwapCache(page);

	address_space = swap_address_space(entry);
	spin_lock_irq(&address_space->tree_lock);
	for (i = 0; i < nr; i++) {
		set_page_private(page + i, entry.val + i);
		error = radix_tree_insert(&address_space->page_tree,
					  idx + i, page + i);
		if (unlikely(error))
			break;
	}
	if (likely(!error)) {
		address_space->nrpages += nr;
		__mod_node_page_state(page_pgdat(page), NR_FILE_PAGES, nr);
		ADD_CACHE_INFO(add_total, nr);
	} else {
		/*
		 * Only a context which has set the SWAP_HAS_CACHE flag
		 * calls add_to_swap_cache(), so add_to_swap_cache()
		 * never returns -EEXIST.
		 */
		VM_BUG_ON(error == -EEXIST);
		set_page_private(page + i, 0UL);
		while (i--) {
			radix_tree_delete(&address_space->page_tree, idx + i);
			set_page_private(page + i, 0UL);
		}
		ClearPageSwapCache(page);
		page_ref_sub(page, nr);
	}
	spin_unlock_irq(&address_space->tree_lock);

	return error;
}

int add_to_swap_cache(struct page *page, swp_entry_t entry, gfp_t gfp_mask)
{
	int error;

	error = radix_tree_maybe_preload_order(gfp_mask, compound_order(page));
	if (!error) {
		error = __add_to_swap_cache(page, entry);
		radix_tree_preload_end();
	}
	return error;
}

/*
 * This must be called only on pages that have
 * been verified to be in the swap cache.
 */
void __delete_from_swap_cache(struct page *page)
{
	struct address_space *address_space;
	int i, nr = hpage_nr_pages(page);
	swp_entry_t entry;
	pgoff_t idx;

	VM_BUG_ON_PAGE(!PageLocked(page), page);
	VM_BUG_ON_PAGE(!PageSwapCache(page), page);
	VM_BUG_ON_PAGE(PageWriteback(page), page);

	entry.val = page_private(page);
	address_space = swap_address_space(entry);
	idx = swp_offset(entry);
	for (i = 0; i < nr; i++) {
		radix_tree_delete(&address_space->page_tree, idx + i);
		set_page_private(page + i, 0);
	}
	ClearPageSwapCache(page);
	address_space->nrpages -= nr;
	__mod_node_page_state(page_pgdat(page), NR_FILE_PAGES, -nr);
	ADD_CACHE_INFO(del_total, nr);
}
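
/*
 * Worked example: both __add_to_swap_cache() and __delete_from_swap_cache()
 * operate on hpage_nr_pages(page) consecutive slots.  For a base page
 * nr == 1; for a 2MB transparent huge page built from 4K pages nr == 512,
 * so 512 adjacent radix-tree slots starting at swp_offset(entry) are
 * inserted or deleted and ->nrpages is adjusted by 512, all in one pass
 * under tree_lock.
 */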

/**
 * add_to_swap - allocate swap space for a page
 * @page: page we want to move to swap
 *
 * Allocate swap space for the page and add the page to the
 * swap cache.  Caller needs to hold the page lock.
 */
int add_to_swap(struct page *page)
{
	swp_entry_t entry;
	int err;

	VM_BUG_ON_PAGE(!PageLocked(page), page);
	VM_BUG_ON_PAGE(!PageUptodate(page), page);

	entry = get_swap_page(page);
	if (!entry.val)
		return 0;

	if (mem_cgroup_try_charge_swap(page, entry))
		goto fail;

	/*
	 * Radix-tree node allocations from PF_MEMALLOC contexts could
	 * completely exhaust the page allocator.  __GFP_NOMEMALLOC
	 * stops emergency reserves from being allocated.
	 *
	 * TODO: this could cause a theoretical memory reclaim
	 * deadlock in the swap out path.
	 */
	/*
	 * Add it to the swap cache.
	 */
	err = add_to_swap_cache(page, entry,
			__GFP_HIGH|__GFP_NOMEMALLOC|__GFP_NOWARN);
	/* -ENOMEM radix-tree allocation failure */
	if (err)
		/*
		 * add_to_swap_cache() doesn't return -EEXIST, so we can safely
		 * clear SWAP_HAS_CACHE flag.
		 */
		goto fail;

	return 1;

fail:
	put_swap_page(page, entry);
	return 0;
}

/*
 * This must be called only on pages that have
 * been verified to be in the swap cache and locked.
 * It will never put the page into the free list,
 * the caller has a reference on the page.
 */
void delete_from_swap_cache(struct page *page)
{
	swp_entry_t entry;
	struct address_space *address_space;

	entry.val = page_private(page);

	address_space = swap_address_space(entry);
	spin_lock_irq(&address_space->tree_lock);
	__delete_from_swap_cache(page);
	spin_unlock_irq(&address_space->tree_lock);

	put_swap_page(page, entry);
	page_ref_sub(page, hpage_nr_pages(page));
}

/*
 * If we are the only user, then try to free up the swap cache.
 *
 * It's ok to check for PageSwapCache without the page lock
 * here because we are going to recheck again inside
 * try_to_free_swap() _with_ the lock.
 *					- Marcelo
 */
static inline void free_swap_cache(struct page *page)
{
	if (PageSwapCache(page) && !page_mapped(page) && trylock_page(page)) {
		try_to_free_swap(page);
		unlock_page(page);
	}
}

/*
 * Perform a free_page(), also freeing any swap cache associated with
 * this page if it is the last user of the page.
 */
void free_page_and_swap_cache(struct page *page)
{
	free_swap_cache(page);
	if (!is_huge_zero_page(page))
		put_page(page);
}

/*
 * Passed an array of pages, drop them all from swapcache and then release
 * them.  They are removed from the LRU and freed if this is their last use.
 */
void free_pages_and_swap_cache(struct page **pages, int nr)
{
	struct page **pagep = pages;
	int i;

	lru_add_drain();
	for (i = 0; i < nr; i++)
		free_swap_cache(pagep[i]);
	release_pages(pagep, nr, false);
}

/*
 * Lookup a swap entry in the swap cache. A found page will be returned
 * unlocked and with its refcount incremented - we rely on the kernel
 * lock getting page table operations atomic even if we drop the page
 * lock before returning.
 */
struct page *lookup_swap_cache(swp_entry_t entry)
{
	struct page *page;

	page = find_get_page(swap_address_space(entry), swp_offset(entry));

	if (page && likely(!PageTransCompound(page))) {
		INC_CACHE_INFO(find_success);
		if (TestClearPageReadahead(page))
			atomic_inc(&swapin_readahead_hits);
	}

	INC_CACHE_INFO(find_total);
	return page;
}
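
/*
 * Illustrative sketch (not compiled): the usual pattern on the swap-in
 * fault path is to try lookup_swap_cache() first and only fall back to
 * readahead on a miss, roughly as below.  The caller, its locals and the
 * GFP mask are hypothetical; error handling is omitted.
 */
#if 0
static struct page *example_swapin(swp_entry_t entry,
				   struct vm_area_struct *vma,
				   unsigned long fault_address)
{
	struct page *page = lookup_swap_cache(entry);

	if (!page)
		page = swapin_readahead(entry, GFP_HIGHUSER_MOVABLE,
					vma, fault_address);
	return page;
}
#endif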

struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
			struct vm_area_struct *vma, unsigned long addr,
			bool *new_page_allocated)
{
	struct page *found_page, *new_page = NULL;
	struct address_space *swapper_space = swap_address_space(entry);
	int err;
	*new_page_allocated = false;

	do {
		/*
		 * First check the swap cache.  Since this is normally
		 * called after lookup_swap_cache() failed, re-calling
		 * that would confuse statistics.
		 */
		found_page = find_get_page(swapper_space, swp_offset(entry));
		if (found_page)
			break;

		/*
		 * Just skip readahead for an unused swap slot.  However,
		 * during swapoff, when swap_slot_cache is disabled, we
		 * have to handle the race between putting the swap entry
		 * into the swap cache and marking the swap slot as
		 * SWAP_HAS_CACHE.  That is handled later in this function;
		 * otherwise swapoff would be aborted if we returned NULL.
		 */
		if (!__swp_swapcount(entry) && swap_slot_cache_enabled)
			break;

		/*
		 * Get a new page to read into from swap.
		 */
		if (!new_page) {
			new_page = alloc_page_vma(gfp_mask, vma, addr);
			if (!new_page)
				break;		/* Out of memory */
		}

		/*
		 * Call radix_tree_preload() while we can wait.
		 */
		err = radix_tree_maybe_preload(gfp_mask & GFP_KERNEL);
		if (err)
			break;

		/*
		 * Swap entry may have been freed since our caller observed it.
		 */
		err = swapcache_prepare(entry);
		if (err == -EEXIST) {
			radix_tree_preload_end();
			/*
			 * We might race against get_swap_page() and stumble
			 * across a SWAP_HAS_CACHE swap_map entry whose page
			 * has not been brought into the swapcache yet.
			 */
			cond_resched();
			continue;
		}
		if (err) {		/* swp entry is obsolete ? */
			radix_tree_preload_end();
			break;
		}

		/* May fail (-ENOMEM) if radix-tree node allocation failed. */
		__SetPageLocked(new_page);
		__SetPageSwapBacked(new_page);
		err = __add_to_swap_cache(new_page, entry);
		if (likely(!err)) {
			radix_tree_preload_end();
			/*
			 * Initiate read into locked page and return.
			 */
			lru_cache_add_anon(new_page);
			*new_page_allocated = true;
			return new_page;
		}
		radix_tree_preload_end();
		__ClearPageLocked(new_page);
		/*
		 * add_to_swap_cache() doesn't return -EEXIST, so we can safely
		 * clear SWAP_HAS_CACHE flag.
		 */
		put_swap_page(new_page, entry);
	} while (err != -ENOMEM);

	if (new_page)
		put_page(new_page);
	return found_page;
}

/*
 * Locate a page of swap in physical memory, reserving swap cache space
 * and reading the disk if it is not already cached.
 * A failure return means that either the page allocation failed or that
 * the swap entry is no longer in use.
 */
struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
			struct vm_area_struct *vma, unsigned long addr,
			bool do_poll)
{
	bool page_was_allocated;
	struct page *retpage = __read_swap_cache_async(entry, gfp_mask,
			vma, addr, &page_was_allocated);

	if (page_was_allocated)
		swap_readpage(retpage, do_poll);

	return retpage;
}
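
/*
 * Illustrative sketch (not compiled): a caller that wants exactly one entry
 * brought in, with no vma context and no readahead window (swapoff is one
 * such case), can call read_swap_cache_async() directly.  The wrapper
 * function and its name are hypothetical.
 */
#if 0
static struct page *example_read_one(swp_entry_t entry)
{
	/* NULL vma and addr 0 fall back to the default memory policy. */
	return read_swap_cache_async(entry, GFP_HIGHUSER_MOVABLE,
				     NULL, 0, false);
}
#endif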

static unsigned long swapin_nr_pages(unsigned long offset)
{
	static unsigned long prev_offset;
	unsigned int pages, max_pages, last_ra;
	static atomic_t last_readahead_pages;

	max_pages = 1 << READ_ONCE(page_cluster);
	if (max_pages <= 1)
		return 1;

	/*
	 * This heuristic has been found to work well on both sequential and
	 * random loads, swapping to hard disk or to SSD: please don't ask
	 * what the "+ 2" means, it just happens to work well, that's all.
	 */
	pages = atomic_xchg(&swapin_readahead_hits, 0) + 2;
	if (pages == 2) {
		/*
		 * We can have no readahead hits to judge by: but must not get
		 * stuck here forever, so check for an adjacent offset instead
		 * (and don't even bother to check whether swap type is same).
		 */
		if (offset != prev_offset + 1 && offset != prev_offset - 1)
			pages = 1;
		prev_offset = offset;
	} else {
		unsigned int roundup = 4;
		while (roundup < pages)
			roundup <<= 1;
		pages = roundup;
	}

	if (pages > max_pages)
		pages = max_pages;

	/* Don't shrink readahead too fast */
	last_ra = atomic_read(&last_readahead_pages) / 2;
	if (pages < last_ra)
		pages = last_ra;
	atomic_set(&last_readahead_pages, pages);

	return pages;
}
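
/*
 * Worked example of the heuristic above: with the default page_cluster of 3,
 * max_pages is 8.  If three readahead hits were recorded since the last
 * fault, pages = 3 + 2 = 5, which is rounded up to the next power of two, 8,
 * and then clamped to max_pages.  With no hits at all, pages stays at 2 only
 * when this fault is adjacent to the previous offset; otherwise it drops to
 * 1 and readahead is effectively skipped.  The last_readahead_pages floor
 * then keeps the window from shrinking by more than half per fault.
 */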

/**
 * swapin_readahead - swap in pages in hope we need them soon
 * @entry: swap entry of this memory
 * @gfp_mask: memory allocation flags
 * @vma: user vma this address belongs to
 * @addr: target address for mempolicy
 *
 * Returns the struct page for entry and addr, after queueing swapin.
 *
 * Primitive swap readahead code. We simply read an aligned block of
 * (1 << page_cluster) entries in the swap area. This method is chosen
 * because it doesn't cost us any seek time.  We also make sure to queue
 * the 'original' request together with the readahead ones...
 *
 * This has been extended to use the NUMA policies from the mm triggering
 * the readahead.
 *
 * Caller must hold down_read on the vma->vm_mm if vma is not NULL.
 */
struct page *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask,
			struct vm_area_struct *vma, unsigned long addr)
{
	struct page *page;
	unsigned long entry_offset = swp_offset(entry);
	unsigned long offset = entry_offset;
	unsigned long start_offset, end_offset;
	unsigned long mask;
	struct blk_plug plug;
	bool do_poll = true;

	mask = swapin_nr_pages(offset) - 1;
	if (!mask)
		goto skip;

	do_poll = false;
	/* Read a page_cluster sized and aligned cluster around offset. */
	start_offset = offset & ~mask;
	end_offset = offset | mask;
	if (!start_offset)	/* First page is swap header. */
		start_offset++;

	blk_start_plug(&plug);
	for (offset = start_offset; offset <= end_offset; offset++) {
		/* Ok, do the async read-ahead now */
		page = read_swap_cache_async(swp_entry(swp_type(entry), offset),
						gfp_mask, vma, addr, false);
		if (!page)
			continue;
		if (offset != entry_offset && likely(!PageTransCompound(page)))
			SetPageReadahead(page);
		put_page(page);
	}
	blk_finish_plug(&plug);

	lru_add_drain();	/* Push any new pages onto the LRU now */
skip:
	return read_swap_cache_async(entry, gfp_mask, vma, addr, do_poll);
}

int init_swap_address_space(unsigned int type, unsigned long nr_pages)
{
	struct address_space *spaces, *space;
	unsigned int i, nr;

	nr = DIV_ROUND_UP(nr_pages, SWAP_ADDRESS_SPACE_PAGES);
	spaces = kvzalloc(sizeof(struct address_space) * nr, GFP_KERNEL);
	if (!spaces)
		return -ENOMEM;
	for (i = 0; i < nr; i++) {
		space = spaces + i;
		INIT_RADIX_TREE(&space->page_tree, GFP_ATOMIC|__GFP_NOWARN);
		atomic_set(&space->i_mmap_writable, 0);
		space->a_ops = &swap_aops;
		/* swap cache doesn't use writeback related tags */
		mapping_set_no_writeback_tags(space);
		spin_lock_init(&space->tree_lock);
	}
	nr_swapper_spaces[type] = nr;
	rcu_assign_pointer(swapper_spaces[type], spaces);

	return 0;
}

void exit_swap_address_space(unsigned int type)
{
	struct address_space *spaces;

	spaces = swapper_spaces[type];
	nr_swapper_spaces[type] = 0;
	rcu_assign_pointer(swapper_spaces[type], NULL);
	synchronize_rcu();
	kvfree(spaces);
}
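
/*
 * Sizing example for init_swap_address_space(): assuming the usual
 * SWAP_ADDRESS_SPACE_PAGES of 1 << 14 from <linux/swap.h>, each
 * address_space shard covers 16384 swap slots (64MB with 4K pages), so a
 * 1GB swap device (262144 pages) gets DIV_ROUND_UP(262144, 16384) == 16
 * shards for its swap type.
 */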