// SPDX-License-Identifier: GPL-2.0
/*
 *  linux/mm/vmscan.c
 *
 *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
 *
 *  Swap reorganised 29.12.95, Stephen Tweedie.
 *  kswapd added: 7.1.96  sct
 *  Removed kswapd_ctl limits, and swap out as many pages as needed
 *  to bring the system back to freepages.high: 2.4.97, Rik van Riel.
 *  Zone aware kswapd started 02/00, Kanoj Sarcar (kanoj@sgi.com).
 *  Multiqueue VM started 5.8.00, Rik van Riel.
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/mm.h>
#include <linux/sched/mm.h>
#include <linux/module.h>
#include <linux/gfp.h>
#include <linux/kernel_stat.h>
#include <linux/swap.h>
#include <linux/pagemap.h>
#include <linux/init.h>
#include <linux/highmem.h>
#include <linux/vmpressure.h>
#include <linux/vmstat.h>
#include <linux/file.h>
#include <linux/writeback.h>
#include <linux/blkdev.h>
#include <linux/buffer_head.h>	/* for try_to_release_page(),
					buffer_heads_over_limit */
#include <linux/mm_inline.h>
#include <linux/backing-dev.h>
#include <linux/rmap.h>
#include <linux/topology.h>
#include <linux/cpu.h>
#include <linux/cpuset.h>
#include <linux/compaction.h>
#include <linux/notifier.h>
#include <linux/rwsem.h>
#include <linux/delay.h>
#include <linux/kthread.h>
#include <linux/freezer.h>
#include <linux/memcontrol.h>
#include <linux/delayacct.h>
#include <linux/sysctl.h>
#include <linux/oom.h>
#include <linux/prefetch.h>
#include <linux/printk.h>
#include <linux/dax.h>

#include <asm/tlbflush.h>
#include <asm/div64.h>

#include <linux/swapops.h>
#include <linux/balloon_compaction.h>

#include "internal.h"

#define CREATE_TRACE_POINTS
#include <trace/events/vmscan.h>

struct scan_control {
	/* How many pages shrink_list() should reclaim */
	unsigned long nr_to_reclaim;

	/* This context's GFP mask */
	gfp_t gfp_mask;

	/* Allocation order */
	int order;

	/*
	 * Nodemask of nodes allowed by the caller. If NULL, all nodes
	 * are scanned.
	 */
	nodemask_t *nodemask;

	/*
	 * The memory cgroup that hit its limit and as a result is the
	 * primary target of this reclaim invocation.
	 */
	struct mem_cgroup *target_mem_cgroup;

	/* Scan (total_size >> priority) pages at once */
	int priority;

	/* The highest zone to isolate pages for reclaim from */
	enum zone_type reclaim_idx;

	/* Writepage batching in laptop mode; RECLAIM_WRITE */
	unsigned int may_writepage:1;

	/* Can mapped pages be reclaimed? */
	unsigned int may_unmap:1;

	/* Can pages be swapped as part of reclaim? */
	unsigned int may_swap:1;

	/*
	 * Cgroups are not reclaimed below their configured memory.low,
	 * unless we threaten to OOM. If any cgroups are skipped due to
	 * memory.low and nothing was reclaimed, go back for memory.low.
	 */
	unsigned int memcg_low_reclaim:1;
	unsigned int memcg_low_skipped:1;

	unsigned int hibernation_mode:1;

	/* One of the zones is ready for compaction */
	unsigned int compaction_ready:1;

	/* Incremented by the number of inactive pages that were scanned */
	unsigned long nr_scanned;

	/* Number of pages freed so far during a call to shrink_zones() */
	unsigned long nr_reclaimed;
};
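
/*
 * Illustrative sketch (not part of the kernel source): roughly how a
 * direct-reclaim caller might fill in scan_control. The concrete values
 * below are assumptions for illustration only; the real initializers live
 * in callers such as the try_to_free_pages() family.
 *
 *	struct scan_control sc = {
 *		.nr_to_reclaim	= SWAP_CLUSTER_MAX,
 *		.gfp_mask	= GFP_KERNEL,
 *		.order		= 0,
 *		.nodemask	= NULL,			// scan all nodes
 *		.priority	= DEF_PRIORITY,
 *		.reclaim_idx	= gfp_zone(GFP_KERNEL),
 *		.may_writepage	= 1,
 *		.may_unmap	= 1,
 *		.may_swap	= 1,
 *	};
 *
 * Lowering .priority on each unsuccessful pass makes every later pass scan
 * a larger fraction of the LRU (total_size >> priority).
 */
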
#ifdef ARCH_HAS_PREFETCH
#define prefetch_prev_lru_page(_page, _base, _field)			\
	do {								\
		if ((_page)->lru.prev != _base) {			\
			struct page *prev;				\
									\
			prev = lru_to_page(&(_page->lru));		\
			prefetch(&prev->_field);			\
		}							\
	} while (0)
#else
#define prefetch_prev_lru_page(_page, _base, _field) do { } while (0)
#endif

#ifdef ARCH_HAS_PREFETCHW
#define prefetchw_prev_lru_page(_page, _base, _field)			\
	do {								\
		if ((_page)->lru.prev != _base) {			\
			struct page *prev;				\
									\
			prev = lru_to_page(&(_page->lru));		\
			prefetchw(&prev->_field);			\
		}							\
	} while (0)
#else
#define prefetchw_prev_lru_page(_page, _base, _field) do { } while (0)
#endif

/*
 * From 0 .. 100.  Higher means more swappy.
 */
int vm_swappiness = 60;
/*
 * The total number of pages which are beyond the high watermark within all
 * zones.
 */
unsigned long vm_total_pages;

static LIST_HEAD(shrinker_list);
static DECLARE_RWSEM(shrinker_rwsem);

#ifdef CONFIG_MEMCG
static bool global_reclaim(struct scan_control *sc)
{
	return !sc->target_mem_cgroup;
}

/**
 * sane_reclaim - is the usual dirty throttling mechanism operational?
 * @sc: scan_control in question
 *
 * The normal page dirty throttling mechanism in balance_dirty_pages() is
 * completely broken with the legacy memcg and direct stalling in
 * shrink_page_list() is used for throttling instead, which lacks all the
 * niceties such as fairness, adaptive pausing, bandwidth proportional
 * allocation and configurability.
 *
 * This function tests whether the vmscan currently in progress can assume
 * that the normal dirty throttling mechanism is operational.
 */
static bool sane_reclaim(struct scan_control *sc)
{
	struct mem_cgroup *memcg = sc->target_mem_cgroup;

	if (!memcg)
		return true;
#ifdef CONFIG_CGROUP_WRITEBACK
	if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
		return true;
#endif
	return false;
}
#else
static bool global_reclaim(struct scan_control *sc)
{
	return true;
}

static bool sane_reclaim(struct scan_control *sc)
{
	return true;
}
#endif

/*
 * This misses isolated pages which are not accounted for to save counters.
 * As the data only determines if reclaim or compaction continues, it is
 * not expected that isolated pages will be a dominating factor.
 */
unsigned long zone_reclaimable_pages(struct zone *zone)
{
	unsigned long nr;

	nr = zone_page_state_snapshot(zone, NR_ZONE_INACTIVE_FILE) +
		zone_page_state_snapshot(zone, NR_ZONE_ACTIVE_FILE);
	if (get_nr_swap_pages() > 0)
		nr += zone_page_state_snapshot(zone, NR_ZONE_INACTIVE_ANON) +
			zone_page_state_snapshot(zone, NR_ZONE_ACTIVE_ANON);

	return nr;
}

unsigned long pgdat_reclaimable_pages(struct pglist_data *pgdat)
{
	unsigned long nr;

	nr = node_page_state_snapshot(pgdat, NR_ACTIVE_FILE) +
	     node_page_state_snapshot(pgdat, NR_INACTIVE_FILE) +
	     node_page_state_snapshot(pgdat, NR_ISOLATED_FILE);

	if (get_nr_swap_pages() > 0)
		nr += node_page_state_snapshot(pgdat, NR_ACTIVE_ANON) +
		      node_page_state_snapshot(pgdat, NR_INACTIVE_ANON) +
		      node_page_state_snapshot(pgdat, NR_ISOLATED_ANON);

	return nr;
}

/**
 * lruvec_lru_size - Returns the number of pages on the given LRU list.
 * @lruvec: lru vector
 * @lru: lru to use
 * @zone_idx: zones to consider (use MAX_NR_ZONES for the whole LRU list)
 */
unsigned long lruvec_lru_size(struct lruvec *lruvec, enum lru_list lru, int zone_idx)
{
	unsigned long lru_size;
	int zid;

	if (!mem_cgroup_disabled())
		lru_size = mem_cgroup_get_lru_size(lruvec, lru);
	else
		lru_size = node_page_state(lruvec_pgdat(lruvec), NR_LRU_BASE + lru);

	for (zid = zone_idx + 1; zid < MAX_NR_ZONES; zid++) {
		struct zone *zone = &lruvec_pgdat(lruvec)->node_zones[zid];
		unsigned long size;

		if (!managed_zone(zone))
			continue;

		if (!mem_cgroup_disabled())
			size = mem_cgroup_get_zone_lru_size(lruvec, lru, zid);
		else
			size = zone_page_state(&lruvec_pgdat(lruvec)->node_zones[zid],
					       NR_ZONE_LRU_BASE + lru);
		lru_size -= min(size, lru_size);
	}

	return lru_size;
}

/*
 * Add a shrinker callback to be called from the vm.
 */
int register_shrinker(struct shrinker *shrinker)
{
	size_t size = sizeof(*shrinker->nr_deferred);

	if (shrinker->flags & SHRINKER_NUMA_AWARE)
		size *= nr_node_ids;

	shrinker->nr_deferred = kzalloc(size, GFP_KERNEL);
	if (!shrinker->nr_deferred)
		return -ENOMEM;

	down_write(&shrinker_rwsem);
	list_add_tail(&shrinker->list, &shrinker_list);
	up_write(&shrinker_rwsem);
	return 0;
}
EXPORT_SYMBOL(register_shrinker);

/*
 * Remove one
 */
void unregister_shrinker(struct shrinker *shrinker)
{
	if (!shrinker->nr_deferred)
		return;
	down_write(&shrinker_rwsem);
	list_del(&shrinker->list);
	up_write(&shrinker_rwsem);
	kfree(shrinker->nr_deferred);
	shrinker->nr_deferred = NULL;
}
EXPORT_SYMBOL(unregister_shrinker);
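
/*
 * Illustrative sketch (assumption, not code used by this file): how a cache
 * owner typically hooks into the shrinker interface registered above. The
 * names my_cache_count()/my_cache_scan()/my_cache_free() are hypothetical.
 *
 *	static unsigned long my_cache_count(struct shrinker *s,
 *					    struct shrink_control *sc)
 *	{
 *		return my_cache_nr_objects();	// freeable objects, or 0
 *	}
 *
 *	static unsigned long my_cache_scan(struct shrinker *s,
 *					   struct shrink_control *sc)
 *	{
 *		return my_cache_free(sc->nr_to_scan); // nr freed, or SHRINK_STOP
 *	}
 *
 *	static struct shrinker my_shrinker = {
 *		.count_objects	= my_cache_count,
 *		.scan_objects	= my_cache_scan,
 *		.seeks		= DEFAULT_SEEKS,
 *	};
 *
 *	register_shrinker(&my_shrinker);	// on init
 *	unregister_shrinker(&my_shrinker);	// on teardown
 */
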
#define SHRINK_BATCH 128

static unsigned long do_shrink_slab(struct shrink_control *shrinkctl,
				    struct shrinker *shrinker,
				    unsigned long nr_scanned,
				    unsigned long nr_eligible)
{
	unsigned long freed = 0;
	unsigned long long delta;
	long total_scan;
	long freeable;
	long nr;
	long new_nr;
	int nid = shrinkctl->nid;
	long batch_size = shrinker->batch ? shrinker->batch
					  : SHRINK_BATCH;
	long scanned = 0, next_deferred;

	freeable = shrinker->count_objects(shrinker, shrinkctl);
	if (freeable == 0)
		return 0;

	/*
	 * copy the current shrinker scan count into a local variable
	 * and zero it so that other concurrent shrinker invocations
	 * don't also do this scanning work.
	 */
	nr = atomic_long_xchg(&shrinker->nr_deferred[nid], 0);

	total_scan = nr;
	delta = (4 * nr_scanned) / shrinker->seeks;
	delta *= freeable;
	do_div(delta, nr_eligible + 1);
	total_scan += delta;
	if (total_scan < 0) {
		pr_err("shrink_slab: %pF negative objects to delete nr=%ld\n",
		       shrinker->scan_objects, total_scan);
		total_scan = freeable;
		next_deferred = nr;
	} else
		next_deferred = total_scan;

	/*
	 * We need to avoid excessive windup on filesystem shrinkers
	 * due to large numbers of GFP_NOFS allocations causing the
	 * shrinkers to return -1 all the time. This results in a large
	 * nr being built up so when a shrink that can do some work
	 * comes along it empties the entire cache due to nr >>>
	 * freeable. This is bad for sustaining a working set in
	 * memory.
	 *
	 * Hence only allow the shrinker to scan the entire cache when
	 * a large delta change is calculated directly.
	 */
	if (delta < freeable / 4)
		total_scan = min(total_scan, freeable / 2);

	/*
	 * Avoid risking looping forever due to too large nr value:
	 * never try to free more than twice the estimated number of
	 * freeable entries.
	 */
	if (total_scan > freeable * 2)
		total_scan = freeable * 2;

	trace_mm_shrink_slab_start(shrinker, shrinkctl, nr,
				   nr_scanned, nr_eligible,
				   freeable, delta, total_scan);

	/*
	 * Normally, we should not scan less than batch_size objects in one
	 * pass to avoid too frequent shrinker calls, but if the slab has less
	 * than batch_size objects in total and we are really tight on memory,
	 * we will try to reclaim all available objects, otherwise we can end
	 * up failing allocations although there are plenty of reclaimable
	 * objects spread over several slabs with usage less than the
	 * batch_size.
	 *
	 * We detect the "tight on memory" situations by looking at the total
	 * number of objects we want to scan (total_scan). If it is greater
	 * than the total number of objects on slab (freeable), we must be
	 * scanning at high prio and therefore should try to reclaim as much as
	 * possible.
	 */
	while (total_scan >= batch_size ||
	       total_scan >= freeable) {
		unsigned long ret;
		unsigned long nr_to_scan = min(batch_size, total_scan);

		shrinkctl->nr_to_scan = nr_to_scan;
		shrinkctl->nr_scanned = nr_to_scan;
		ret = shrinker->scan_objects(shrinker, shrinkctl);
		if (ret == SHRINK_STOP)
			break;
		freed += ret;

		count_vm_events(SLABS_SCANNED, shrinkctl->nr_scanned);
		total_scan -= shrinkctl->nr_scanned;
		scanned += shrinkctl->nr_scanned;

		cond_resched();
	}

	if (next_deferred >= scanned)
		next_deferred -= scanned;
	else
		next_deferred = 0;
	/*
	 * move the unused scan count back into the shrinker in a
	 * manner that handles concurrent updates. If we exhausted the
	 * scan, there is no need to do an update.
	 */
	if (next_deferred > 0)
		new_nr = atomic_long_add_return(next_deferred,
						&shrinker->nr_deferred[nid]);
	else
		new_nr = atomic_long_read(&shrinker->nr_deferred[nid]);

	trace_mm_shrink_slab_end(shrinker, nid, freed, nr, new_nr, total_scan);
	return freed;
}
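
/*
 * Worked example of the pressure arithmetic above (illustrative numbers,
 * assuming seeks == DEFAULT_SEEKS == 2): with nr_scanned = 100 LRU pages
 * scanned out of nr_eligible = 1000, and freeable = 10000 objects:
 *
 *	delta = (4 * 100) / 2 = 200
 *	delta = 200 * 10000 / (1000 + 1) ~= 1998
 *
 * i.e. the cache is asked to scan roughly twice the 10% proportion at which
 * the LRU was scanned, the doubling coming from 4 / seeks. Work that could
 * not be done (GFP_NOFS callers, SHRINK_STOP) is carried in nr_deferred and
 * added back in as "nr" on the next invocation.
 */
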
/**
 * shrink_slab - shrink slab caches
 * @gfp_mask: allocation context
 * @nid: node whose slab caches to target
 * @memcg: memory cgroup whose slab caches to target
 * @nr_scanned: pressure numerator
 * @nr_eligible: pressure denominator
 *
 * Call the shrink functions to age shrinkable caches.
 *
 * @nid is passed along to shrinkers with SHRINKER_NUMA_AWARE set,
 * unaware shrinkers will receive a node id of 0 instead.
 *
 * @memcg specifies the memory cgroup to target. If it is not NULL,
 * only shrinkers with SHRINKER_MEMCG_AWARE set will be called to scan
 * objects from the memory cgroup specified. Otherwise, only unaware
 * shrinkers are called.
 *
 * @nr_scanned and @nr_eligible form a ratio that indicates how much of
 * the available objects should be scanned.  Page reclaim for example
 * passes the number of pages scanned and the number of pages on the
 * LRU lists that it considered on @nid, plus a bias in @nr_scanned
 * when it encountered mapped pages.  The ratio is further biased by
 * the ->seeks setting of the shrink function, which indicates the
 * cost to recreate an object relative to that of an LRU page.
 *
 * Returns the number of reclaimed slab objects.
 */
static unsigned long shrink_slab(gfp_t gfp_mask, int nid,
				 struct mem_cgroup *memcg,
				 unsigned long nr_scanned,
				 unsigned long nr_eligible)
{
	struct shrinker *shrinker;
	unsigned long freed = 0;

	if (memcg && (!memcg_kmem_enabled() || !mem_cgroup_online(memcg)))
		return 0;

	if (nr_scanned == 0)
		nr_scanned = SWAP_CLUSTER_MAX;

	if (!down_read_trylock(&shrinker_rwsem)) {
		/*
		 * If we would return 0, our callers would understand that we
		 * have nothing else to shrink and give up trying. By returning
		 * 1 we keep it going and assume we'll be able to shrink next
		 * time.
		 */
		freed = 1;
		goto out;
	}

	list_for_each_entry(shrinker, &shrinker_list, list) {
		struct shrink_control sc = {
			.gfp_mask = gfp_mask,
			.nid = nid,
			.memcg = memcg,
		};

		/*
		 * If kernel memory accounting is disabled, we ignore
		 * SHRINKER_MEMCG_AWARE flag and call all shrinkers
		 * passing NULL for memcg.
		 */
		if (memcg_kmem_enabled() &&
		    !!memcg != !!(shrinker->flags & SHRINKER_MEMCG_AWARE))
			continue;

		if (!(shrinker->flags & SHRINKER_NUMA_AWARE))
			sc.nid = 0;

		freed += do_shrink_slab(&sc, shrinker, nr_scanned, nr_eligible);
	}

	up_read(&shrinker_rwsem);
out:
	cond_resched();
	return freed;
}
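
/*
 * Usage note (sketch): the nr_scanned/nr_eligible ratio is how callers dial
 * pressure. drop_slab_node() below passes a ratio of 1,
 *
 *	shrink_slab(GFP_KERNEL, nid, memcg, 1000, 1000);
 *
 * which asks every shrinker to walk essentially its whole cache, while
 * regular reclaim passes only the fraction of the LRU it actually scanned.
 */
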
void drop_slab_node(int nid)
{
	unsigned long freed;

	do {
		struct mem_cgroup *memcg = NULL;

		freed = 0;
		do {
			freed += shrink_slab(GFP_KERNEL, nid, memcg,
					     1000, 1000);
		} while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)) != NULL);
	} while (freed > 10);
}

void drop_slab(void)
{
	int nid;

	for_each_online_node(nid)
		drop_slab_node(nid);
}

static inline int is_page_cache_freeable(struct page *page)
{
	/*
	 * A freeable page cache page is referenced only by the caller
	 * that isolated the page, the page cache radix tree and
	 * optional buffer heads at page->private.
	 */
	int radix_pins = PageTransHuge(page) && PageSwapCache(page) ?
		HPAGE_PMD_NR : 1;
	return page_count(page) - page_has_private(page) == 1 + radix_pins;
}
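
/*
 * Worked example for the check above (illustrative): a regular page cache
 * page that reclaim has isolated is pinned by the isolation reference and
 * by the radix tree, so page_count() == 2, page_has_private() == 0, and
 * 2 - 0 == 1 + 1 makes it freeable. The same page with buffer heads
 * attached has page_count() == 3 and page_has_private() == 1, which still
 * balances. Any additional reference (another mapping, a racing lookup)
 * breaks the equality and the page is treated as busy.
 */
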
static int may_write_to_inode(struct inode *inode, struct scan_control *sc)
{
	if (current->flags & PF_SWAPWRITE)
		return 1;
	if (!inode_write_congested(inode))
		return 1;
	if (inode_to_bdi(inode) == current->backing_dev_info)
		return 1;
	return 0;
}

/*
 * We detected a synchronous write error writing a page out.  Probably
 * -ENOSPC.  We need to propagate that into the address_space for a subsequent
 * fsync(), msync() or close().
 *
 * The tricky part is that after writepage we cannot touch the mapping: nothing
 * prevents it from being freed up.  But we have a ref on the page and once
 * that page is locked, the mapping is pinned.
 *
 * We're allowed to run sleeping lock_page() here because we know the caller has
 * __GFP_FS.
 */
static void handle_write_error(struct address_space *mapping,
				struct page *page, int error)
{
	lock_page(page);
	if (page_mapping(page) == mapping)
		mapping_set_error(mapping, error);
	unlock_page(page);
}

/* possible outcome of pageout() */
typedef enum {
	/* failed to write page out, page is locked */
	PAGE_KEEP,
	/* move page to the active list, page is locked */
	PAGE_ACTIVATE,
	/* page has been sent to the disk successfully, page is unlocked */
	PAGE_SUCCESS,
	/* page is clean and locked */
	PAGE_CLEAN,
} pageout_t;

/*
 * pageout is called by shrink_page_list() for each dirty page.
 * Calls ->writepage().
 */
static pageout_t pageout(struct page *page, struct address_space *mapping,
			 struct scan_control *sc)
{
	/*
	 * If the page is dirty, only perform writeback if that write
	 * will be non-blocking.  To prevent this allocation from being
	 * stalled by pagecache activity.  But note that there may be
	 * stalls if we need to run get_block().  We could test
	 * PagePrivate for that.
	 *
	 * If this process is currently in __generic_file_write_iter() against
	 * this page's queue, we can perform writeback even if that
	 * will block.
	 *
	 * If the page is swapcache, write it back even if that would
	 * block, for some throttling.  This happens by accident, because
	 * swap_backing_dev_info is bust: it doesn't reflect the
	 * congestion state of the swapdevs.  Easy to fix, if needed.
	 */
	if (!is_page_cache_freeable(page))
		return PAGE_KEEP;
	if (!mapping) {
		/*
		 * Some data journaling orphaned pages can have
		 * page->mapping == NULL while being dirty with clean buffers.
		 */
		if (page_has_private(page)) {
			if (try_to_free_buffers(page)) {
				ClearPageDirty(page);
				pr_info("%s: orphaned page\n", __func__);
				return PAGE_CLEAN;
			}
		}
		return PAGE_KEEP;
	}
	if (mapping->a_ops->writepage == NULL)
		return PAGE_ACTIVATE;
	if (!may_write_to_inode(mapping->host, sc))
		return PAGE_KEEP;

	if (clear_page_dirty_for_io(page)) {
		int res;
		struct writeback_control wbc = {
			.sync_mode = WB_SYNC_NONE,
			.nr_to_write = SWAP_CLUSTER_MAX,
			.range_start = 0,
			.range_end = LLONG_MAX,
			.for_reclaim = 1,
		};

		SetPageReclaim(page);
		res = mapping->a_ops->writepage(page, &wbc);
		if (res < 0)
			handle_write_error(mapping, page, res);
		if (res == AOP_WRITEPAGE_ACTIVATE) {
			ClearPageReclaim(page);
			return PAGE_ACTIVATE;
		}

		if (!PageWriteback(page)) {
			/* synchronous write or broken a_ops? */
			ClearPageReclaim(page);
		}
		trace_mm_vmscan_writepage(page);
		inc_node_page_state(page, NR_VMSCAN_WRITE);
		return PAGE_SUCCESS;
	}

	return PAGE_CLEAN;
}

/*
 * Same as remove_mapping, but if the page is removed from the mapping, it
 * gets returned with a refcount of 0.
 */
static int __remove_mapping(struct address_space *mapping, struct page *page,
			    bool reclaimed)
{
	unsigned long flags;
	int refcount;

	BUG_ON(!PageLocked(page));
	BUG_ON(mapping != page_mapping(page));

	spin_lock_irqsave(&mapping->tree_lock, flags);
	/*
	 * The non racy check for a busy page.
	 *
	 * Must be careful with the order of the tests. When someone has
	 * a ref to the page, it may be possible that they dirty it then
	 * drop the reference. So if PageDirty is tested before page_count
	 * here, then the following race may occur:
	 *
	 * get_user_pages(&page);
	 * [user mapping goes away]
	 * write_to(page);
	 *				!PageDirty(page)    [good]
	 * SetPageDirty(page);
	 * put_page(page);
	 *				!page_count(page)   [good, discard it]
	 *
	 * [oops, our write_to data is lost]
	 *
	 * Reversing the order of the tests ensures such a situation cannot
	 * escape unnoticed. The smp_rmb is needed to ensure the page->flags
	 * load is not satisfied before that of page->_refcount.
	 *
	 * Note that if SetPageDirty is always performed via set_page_dirty,
	 * and thus under tree_lock, then this ordering is not required.
	 */
	if (unlikely(PageTransHuge(page)) && PageSwapCache(page))
		refcount = 1 + HPAGE_PMD_NR;
	else
		refcount = 2;
	if (!page_ref_freeze(page, refcount))
		goto cannot_free;
	/* note: atomic_cmpxchg in page_freeze_refs provides the smp_rmb */
	if (unlikely(PageDirty(page))) {
		page_ref_unfreeze(page, refcount);
		goto cannot_free;
	}

	if (PageSwapCache(page)) {
		swp_entry_t swap = { .val = page_private(page) };
		mem_cgroup_swapout(page, swap);
		__delete_from_swap_cache(page);
		spin_unlock_irqrestore(&mapping->tree_lock, flags);
		put_swap_page(page, swap);
	} else {
		void (*freepage)(struct page *);
		void *shadow = NULL;

		freepage = mapping->a_ops->freepage;
		/*
		 * Remember a shadow entry for reclaimed file cache in
		 * order to detect refaults, thus thrashing, later on.
		 *
		 * But don't store shadows in an address space that is
		 * already exiting.  This is not just an optimization,
		 * inode reclaim needs to empty out the radix tree or
		 * the nodes are lost.  Don't plant shadows behind its
		 * back.
		 *
		 * We also don't store shadows for DAX mappings because the
		 * only page cache pages found in these are zero pages
		 * covering holes, and because we don't want to mix DAX
		 * exceptional entries and shadow exceptional entries in the
		 * same page_tree.
		 */
		if (reclaimed && page_is_file_cache(page) &&
		    !mapping_exiting(mapping) && !dax_mapping(mapping))
			shadow = workingset_eviction(mapping, page);
		__delete_from_page_cache(page, shadow);
		spin_unlock_irqrestore(&mapping->tree_lock, flags);

		if (freepage != NULL)
			freepage(page);
	}

	return 1;

cannot_free:
	spin_unlock_irqrestore(&mapping->tree_lock, flags);
	return 0;
}

/*
 * Attempt to detach a locked page from its ->mapping.  If it is dirty or if
 * someone else has a ref on the page, abort and return 0.  If it was
 * successfully detached, return 1.  Assumes the caller has a single ref on
 * this page.
 */
int remove_mapping(struct address_space *mapping, struct page *page)
{
	if (__remove_mapping(mapping, page, false)) {
		/*
		 * Unfreezing the refcount with 1 rather than 2 effectively
		 * drops the pagecache ref for us without requiring another
		 * atomic operation.
		 */
		page_ref_unfreeze(page, 1);
		return 1;
	}
	return 0;
}

/**
 * putback_lru_page - put previously isolated page onto appropriate LRU list
 * @page: page to be put back to appropriate lru list
 *
 * Add previously isolated @page to appropriate LRU list.
 * Page may still be unevictable for other reasons.
 *
 * lru_lock must not be held, interrupts must be enabled.
 */
void putback_lru_page(struct page *page)
{
	bool is_unevictable;
	int was_unevictable = PageUnevictable(page);

	VM_BUG_ON_PAGE(PageLRU(page), page);

redo:
	ClearPageUnevictable(page);

	if (page_evictable(page)) {
		/*
		 * For evictable pages, we can use the cache.
		 * In event of a race, worst case is we end up with an
		 * unevictable page on [in]active list.
		 * We know how to handle that.
		 */
		is_unevictable = false;
		lru_cache_add(page);
	} else {
		/*
		 * Put unevictable pages directly on zone's unevictable
		 * list.
		 */
		is_unevictable = true;
		add_page_to_unevictable_list(page);
		/*
		 * When racing with an mlock or AS_UNEVICTABLE clearing
		 * (page is unlocked) make sure that if the other thread
		 * does not observe our setting of PG_lru and fails
		 * isolation/check_move_unevictable_pages,
		 * we see PG_mlocked/AS_UNEVICTABLE cleared below and move
		 * the page back to the evictable list.
		 *
		 * The other side is TestClearPageMlocked() or shmem_lock().
		 */
		smp_mb();
	}

	/*
	 * page's status can change while we move it among lru. If an evictable
	 * page is on the unevictable list, it will never be freed. To avoid
	 * that, check again after we added it to the list.
	 */
	if (is_unevictable && page_evictable(page)) {
		if (!isolate_lru_page(page)) {
			put_page(page);
			goto redo;
		}
		/* This means someone else dropped this page from LRU
		 * So, it will be freed or putback to LRU again. There is
		 * nothing to do here.
		 */
	}

	if (was_unevictable && !is_unevictable)
		count_vm_event(UNEVICTABLE_PGRESCUED);
	else if (!was_unevictable && is_unevictable)
		count_vm_event(UNEVICTABLE_PGCULLED);

	put_page(page);		/* drop ref from isolate */
}

enum page_references {
	PAGEREF_RECLAIM,
	PAGEREF_RECLAIM_CLEAN,
	PAGEREF_KEEP,
	PAGEREF_ACTIVATE,
};

static enum page_references page_check_references(struct page *page,
						  struct scan_control *sc)
{
	int referenced_ptes, referenced_page;
	unsigned long vm_flags;

	referenced_ptes = page_referenced(page, 1, sc->target_mem_cgroup,
					  &vm_flags);
	referenced_page = TestClearPageReferenced(page);

	/*
	 * Mlock lost the isolation race with us.  Let try_to_unmap()
	 * move the page to the unevictable list.
	 */
	if (vm_flags & VM_LOCKED)
		return PAGEREF_RECLAIM;

	if (referenced_ptes) {
		if (PageSwapBacked(page))
			return PAGEREF_ACTIVATE;
		/*
		 * All mapped pages start out with page table
		 * references from the instantiating fault, so we need
		 * to look twice if a mapped file page is used more
		 * than once.
		 *
		 * Mark it and spare it for another trip around the
		 * inactive list. Another page table reference will
		 * lead to its activation.
		 *
		 * Note: the mark is set for activated pages as well
		 * so that recently deactivated but used pages are
		 * quickly recovered.
		 */
		SetPageReferenced(page);

		if (referenced_page || referenced_ptes > 1)
			return PAGEREF_ACTIVATE;

		/*
		 * Activate file-backed executable pages after first usage.
		 */
		if (vm_flags & VM_EXEC)
			return PAGEREF_ACTIVATE;

		return PAGEREF_KEEP;
	}

	/* Reclaim if clean, defer dirty pages to writeback */
	if (referenced_page && !PageSwapBacked(page))
		return PAGEREF_RECLAIM_CLEAN;

	return PAGEREF_RECLAIM;
}
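
/*
 * Illustrative walk-through of the decision above (an example, not extra
 * logic): a file page mapped by one process reaches the inactive tail with
 * the reference from its original fault still in the PTE. That first pass
 * sees referenced_ptes == 1 and referenced_page == 0, so the page is marked
 * with SetPageReferenced() and kept (PAGEREF_KEEP). If it is touched again
 * before the next scan, the second pass sees a referenced PTE plus the
 * PG_referenced mark and promotes it (PAGEREF_ACTIVATE); if not, it falls
 * through to PAGEREF_RECLAIM_CLEAN/PAGEREF_RECLAIM and can be evicted.
 */
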
/* Check if a page is dirty or under writeback */
static void page_check_dirty_writeback(struct page *page,
				       bool *dirty, bool *writeback)
{
	struct address_space *mapping;

	/*
	 * Anonymous pages are not handled by flushers and must be written
	 * from reclaim context. Do not stall reclaim based on them.
	 */
	if (!page_is_file_cache(page) ||
	    (PageAnon(page) && !PageSwapBacked(page))) {
		*dirty = false;
		*writeback = false;
		return;
	}

	/* By default assume that the page flags are accurate */
	*dirty = PageDirty(page);
	*writeback = PageWriteback(page);

	/* Verify dirty/writeback state if the filesystem supports it */
	if (!page_has_private(page))
		return;

	mapping = page_mapping(page);
	if (mapping && mapping->a_ops->is_dirty_writeback)
		mapping->a_ops->is_dirty_writeback(page, dirty, writeback);
}

struct reclaim_stat {
	unsigned nr_dirty;
	unsigned nr_unqueued_dirty;
	unsigned nr_congested;
	unsigned nr_writeback;
	unsigned nr_immediate;
	unsigned nr_activate;
	unsigned nr_ref_keep;
	unsigned nr_unmap_fail;
};

/*
 * shrink_page_list() returns the number of reclaimed pages
 */
static unsigned long shrink_page_list(struct list_head *page_list,
				      struct pglist_data *pgdat,
				      struct scan_control *sc,
				      enum ttu_flags ttu_flags,
				      struct reclaim_stat *stat,
				      bool force_reclaim)
{
	LIST_HEAD(ret_pages);
	LIST_HEAD(free_pages);
	int pgactivate = 0;
	unsigned nr_unqueued_dirty = 0;
	unsigned nr_dirty = 0;
	unsigned nr_congested = 0;
	unsigned nr_reclaimed = 0;
	unsigned nr_writeback = 0;
	unsigned nr_immediate = 0;
	unsigned nr_ref_keep = 0;
	unsigned nr_unmap_fail = 0;

	cond_resched();

	while (!list_empty(page_list)) {
		struct address_space *mapping;
		struct page *page;
		int may_enter_fs;
		enum page_references references = PAGEREF_RECLAIM_CLEAN;
		bool dirty, writeback;

		cond_resched();

		page = lru_to_page(page_list);
		list_del(&page->lru);

		if (!trylock_page(page))
			goto keep;

		VM_BUG_ON_PAGE(PageActive(page), page);

		sc->nr_scanned++;

		if (unlikely(!page_evictable(page)))
			goto activate_locked;

		if (!sc->may_unmap && page_mapped(page))
			goto keep_locked;

		/* Double the slab pressure for mapped and swapcache pages */
		if ((page_mapped(page) || PageSwapCache(page)) &&
		    !(PageAnon(page) && !PageSwapBacked(page)))
			sc->nr_scanned++;

		may_enter_fs = (sc->gfp_mask & __GFP_FS) ||
			(PageSwapCache(page) && (sc->gfp_mask & __GFP_IO));

		/*
		 * The number of dirty pages determines if a zone is marked
		 * reclaim_congested which affects wait_iff_congested. kswapd
		 * will stall and start writing pages if the tail of the LRU
		 * is all dirty unqueued pages.
		 */
		page_check_dirty_writeback(page, &dirty, &writeback);
		if (dirty || writeback)
			nr_dirty++;

		if (dirty && !writeback)
			nr_unqueued_dirty++;

		/*
		 * Treat this page as congested if the underlying BDI is or if
		 * pages are cycling through the LRU so quickly that the
		 * pages marked for immediate reclaim are making it to the
		 * end of the LRU a second time.
		 */
		mapping = page_mapping(page);
		if (((dirty || writeback) && mapping &&
		     inode_write_congested(mapping->host)) ||
		    (writeback && PageReclaim(page)))
			nr_congested++;

		/*
		 * If a page at the tail of the LRU is under writeback, there
		 * are three cases to consider.
		 *
		 * 1) If reclaim is encountering an excessive number of pages
		 *    under writeback and this page is both under writeback and
		 *    PageReclaim then it indicates that pages are being queued
		 *    for IO but are being recycled through the LRU before the
		 *    IO can complete. Waiting on the page itself risks an
		 *    indefinite stall if it is impossible to writeback the
		 *    page due to IO error or disconnected storage so instead
		 *    note that the LRU is being scanned too quickly and the
		 *    caller can stall after page list has been processed.
		 *
		 * 2) Global or new memcg reclaim encounters a page that is
		 *    not marked for immediate reclaim, or the caller does not
		 *    have __GFP_FS (or __GFP_IO if it's simply going to swap,
		 *    not to fs). In this case mark the page for immediate
		 *    reclaim and continue scanning.
		 *
		 *    Require may_enter_fs because we would wait on fs, which
		 *    may not have submitted IO yet. And the loop driver might
		 *    enter reclaim, and deadlock if it waits on a page for
		 *    which it is needed to do the write (loop masks off
		 *    __GFP_IO|__GFP_FS for this reason); but more thought
		 *    would probably show more reasons.
		 *
		 * 3) Legacy memcg encounters a page that is already marked
		 *    PageReclaim. memcg does not have any dirty pages
		 *    throttling so we could easily OOM just because too many
		 *    pages are in writeback and there is nothing else to
		 *    reclaim. Wait for the writeback to complete.
		 *
		 * In cases 1) and 2) we activate the pages to get them out of
		 * the way while we continue scanning for clean pages on the
		 * inactive list and refilling from the active list. The
		 * observation here is that waiting for disk writes is more
		 * expensive than potentially causing reloads down the line.
		 * Since they're marked for immediate reclaim, they won't put
		 * memory pressure on the cache working set any longer than it
		 * takes to write them to disk.
		 */
		if (PageWriteback(page)) {
			/* Case 1 above */
			if (current_is_kswapd() &&
			    PageReclaim(page) &&
			    test_bit(PGDAT_WRITEBACK, &pgdat->flags)) {
				nr_immediate++;
				goto activate_locked;

			/* Case 2 above */
			} else if (sane_reclaim(sc) ||
			    !PageReclaim(page) || !may_enter_fs) {
				/*
				 * This is slightly racy - end_page_writeback()
				 * might have just cleared PageReclaim, then
				 * setting PageReclaim here ends up interpreted
				 * as PageReadahead - but that does not matter
				 * enough to care.  What we do want is for this
				 * page to have PageReclaim set next time memcg
				 * reclaim reaches the tests above, so it will
				 * then wait_on_page_writeback() to avoid OOM;
				 * and it's also appropriate in global reclaim.
				 */
				SetPageReclaim(page);
				nr_writeback++;
				goto activate_locked;

			/* Case 3 above */
			} else {
				unlock_page(page);
				wait_on_page_writeback(page);
				/* then go back and try same page again */
				list_add_tail(&page->lru, page_list);
				continue;
			}
		}

		if (!force_reclaim)
			references = page_check_references(page, sc);

		switch (references) {
		case PAGEREF_ACTIVATE:
			goto activate_locked;
		case PAGEREF_KEEP:
			nr_ref_keep++;
			goto keep_locked;
		case PAGEREF_RECLAIM:
		case PAGEREF_RECLAIM_CLEAN:
			; /* try to reclaim the page below */
		}

		/*
		 * Anonymous process memory has backing store?
		 * Try to allocate it some swap space here.
		 * Lazyfree page could be freed directly
		 */
		if (PageAnon(page) && PageSwapBacked(page)) {
			if (!PageSwapCache(page)) {
				if (!(sc->gfp_mask & __GFP_IO))
					goto keep_locked;
				if (PageTransHuge(page)) {
					/* cannot split THP, skip it */
					if (!can_split_huge_page(page, NULL))
						goto activate_locked;
					/*
					 * Split pages without a PMD map right
					 * away. Chances are some or all of the
					 * tail pages can be freed without IO.
					 */
					if (!compound_mapcount(page) &&
					    split_huge_page_to_list(page,
								    page_list))
						goto activate_locked;
				}
				if (!add_to_swap(page)) {
					if (!PageTransHuge(page))
						goto activate_locked;
					/* Fallback to swap normal pages */
					if (split_huge_page_to_list(page,
								    page_list))
						goto activate_locked;
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
					count_vm_event(THP_SWPOUT_FALLBACK);
#endif
					if (!add_to_swap(page))
						goto activate_locked;
				}

				may_enter_fs = 1;

				/* Adding to swap updated mapping */
				mapping = page_mapping(page);
			}
		} else if (unlikely(PageTransHuge(page))) {
			/* Split file THP */
			if (split_huge_page_to_list(page, page_list))
				goto keep_locked;
		}

		/*
		 * The page is mapped into the page tables of one or more
		 * processes. Try to unmap it here.
		 */
		if (page_mapped(page)) {
			enum ttu_flags flags = ttu_flags | TTU_BATCH_FLUSH;

			if (unlikely(PageTransHuge(page)))
				flags |= TTU_SPLIT_HUGE_PMD;
			if (!try_to_unmap(page, flags)) {
				nr_unmap_fail++;
				goto activate_locked;
			}
		}

		if (PageDirty(page)) {
			/*
			 * Only kswapd can writeback filesystem pages
			 * to avoid risk of stack overflow. But avoid
			 * injecting inefficient single-page IO into
			 * flusher writeback as much as possible: only
			 * write pages when we've encountered many
			 * dirty pages, and when we've already scanned
			 * the rest of the LRU for clean pages and see
			 * the same dirty pages again (PageReclaim).
			 */
			if (page_is_file_cache(page) &&
			    (!current_is_kswapd() || !PageReclaim(page) ||
			     !test_bit(PGDAT_DIRTY, &pgdat->flags))) {
				/*
				 * Immediately reclaim when written back.
				 * Similar in principle to deactivate_page()
				 * except we already have the page isolated
				 * and know it's dirty
				 */
				inc_node_page_state(page, NR_VMSCAN_IMMEDIATE);
				SetPageReclaim(page);

				goto activate_locked;
			}

			if (references == PAGEREF_RECLAIM_CLEAN)
				goto keep_locked;
			if (!may_enter_fs)
				goto keep_locked;
			if (!sc->may_writepage)
				goto keep_locked;

			/*
			 * Page is dirty. Flush the TLB if a writable entry
			 * potentially exists to avoid CPU writes after IO
			 * starts and then write it out here.
			 */
			try_to_unmap_flush_dirty();
			switch (pageout(page, mapping, sc)) {
			case PAGE_KEEP:
				goto keep_locked;
			case PAGE_ACTIVATE:
				goto activate_locked;
			case PAGE_SUCCESS:
				if (PageWriteback(page))
					goto keep;
				if (PageDirty(page))
					goto keep;

				/*
				 * A synchronous write - probably a ramdisk.  Go
				 * ahead and try to reclaim the page.
				 */
				if (!trylock_page(page))
					goto keep;
				if (PageDirty(page) || PageWriteback(page))
					goto keep_locked;
				mapping = page_mapping(page);
			case PAGE_CLEAN:
				; /* try to free the page below */
			}
		}

		/*
		 * If the page has buffers, try to free the buffer mappings
		 * associated with this page. If we succeed we try to free
		 * the page as well.
		 *
		 * We do this even if the page is PageDirty().
		 * try_to_release_page() does not perform I/O, but it is
		 * possible for a page to have PageDirty set, but it is actually
		 * clean (all its buffers are clean).  This happens if the
		 * buffers were written out directly, with submit_bh(). ext3
		 * will do this, as well as the blockdev mapping.
		 * try_to_release_page() will discover that cleanness and will
		 * drop the buffers and mark the page clean - it can be freed.
		 *
		 * Rarely, pages can have buffers and no ->mapping.  These are
		 * the pages which were not successfully invalidated in
		 * truncate_complete_page().  We try to drop those buffers here
		 * and if that worked, and the page is no longer mapped into
		 * process address space (page_count == 1) it can be freed.
		 * Otherwise, leave the page on the LRU so it is swappable.
		 */
		if (page_has_private(page)) {
			if (!try_to_release_page(page, sc->gfp_mask))
				goto activate_locked;
			if (!mapping && page_count(page) == 1) {
				unlock_page(page);
				if (put_page_testzero(page))
					goto free_it;
				else {
					/*
					 * rare race with speculative reference.
					 * the speculative reference will free
					 * this page shortly, so we may
					 * increment nr_reclaimed here (and
					 * leave it off the LRU).
					 */
					nr_reclaimed++;
					continue;
				}
			}
		}

		if (PageAnon(page) && !PageSwapBacked(page)) {
			/* follow __remove_mapping for reference */
			if (!page_ref_freeze(page, 1))
				goto keep_locked;
			if (PageDirty(page)) {
				page_ref_unfreeze(page, 1);
				goto keep_locked;
			}

			count_vm_event(PGLAZYFREED);
			count_memcg_page_event(page, PGLAZYFREED);
		} else if (!mapping || !__remove_mapping(mapping, page, true))
			goto keep_locked;
		/*
		 * At this point, we have no other references and there is
		 * no way to pick any more up (removed from LRU, removed
		 * from pagecache). Can use non-atomic bitops now (and
		 * we obviously don't have to worry about waking up a process
		 * waiting on the page lock, because there are no references.
		 */
		__ClearPageLocked(page);
free_it:
		nr_reclaimed++;

		/*
		 * Is there need to periodically free_page_list? It would
		 * appear not as the counts should be low
		 */
		if (unlikely(PageTransHuge(page))) {
			mem_cgroup_uncharge(page);
			(*get_compound_page_dtor(page))(page);
		} else
			list_add(&page->lru, &free_pages);
		continue;

activate_locked:
		/* Not a candidate for swapping, so reclaim swap space.
		 */
		if (PageSwapCache(page) && (mem_cgroup_swap_full(page) ||
						PageMlocked(page)))
			try_to_free_swap(page);
		VM_BUG_ON_PAGE(PageActive(page), page);
		if (!PageMlocked(page)) {
			SetPageActive(page);
			pgactivate++;
			count_memcg_page_event(page, PGACTIVATE);
		}
keep_locked:
		unlock_page(page);
keep:
		list_add(&page->lru, &ret_pages);
		VM_BUG_ON_PAGE(PageLRU(page) || PageUnevictable(page), page);
	}

	mem_cgroup_uncharge_list(&free_pages);
	try_to_unmap_flush();
	free_unref_page_list(&free_pages);

	list_splice(&ret_pages, page_list);
	count_vm_events(PGACTIVATE, pgactivate);

	if (stat) {
		stat->nr_dirty = nr_dirty;
		stat->nr_congested = nr_congested;
		stat->nr_unqueued_dirty = nr_unqueued_dirty;
		stat->nr_writeback = nr_writeback;
		stat->nr_immediate = nr_immediate;
		stat->nr_activate = pgactivate;
		stat->nr_ref_keep = nr_ref_keep;
		stat->nr_unmap_fail = nr_unmap_fail;
	}
	return nr_reclaimed;
}

unsigned long reclaim_clean_pages_from_list(struct zone *zone,
					    struct list_head *page_list)
{
	struct scan_control sc = {
		.gfp_mask = GFP_KERNEL,
		.priority = DEF_PRIORITY,
		.may_unmap = 1,
	};
	unsigned long ret;
	struct page *page, *next;
	LIST_HEAD(clean_pages);

	list_for_each_entry_safe(page, next, page_list, lru) {
		if (page_is_file_cache(page) && !PageDirty(page) &&
		    !__PageMovable(page)) {
			ClearPageActive(page);
			list_move(&page->lru, &clean_pages);
		}
	}

	ret = shrink_page_list(&clean_pages, zone->zone_pgdat, &sc,
			TTU_IGNORE_ACCESS, NULL, true);
	list_splice(&clean_pages, page_list);
	mod_node_page_state(zone->zone_pgdat, NR_ISOLATED_FILE, -ret);
	return ret;
}
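
/*
 * Usage sketch (assumption about callers, which live outside this file):
 * contiguous allocators such as alloc_contig_range()/CMA isolate a range of
 * pages onto a private list and then call roughly
 *
 *	reclaim_clean_pages_from_list(zone, &migratepages);
 *
 * so that clean, unmapped file pages are simply dropped instead of being
 * migrated. Note that force_reclaim is passed as true above, so
 * page_check_references() is skipped for these pages.
 */
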
/*
 * Attempt to remove the specified page from its LRU.  Only take this page
 * if it is of the appropriate PageActive status.  Pages which are being
 * freed elsewhere are also ignored.
 *
 * page:	page to consider
 * mode:	one of the LRU isolation modes defined above
 *
 * returns 0 on success, -ve errno on failure.
 */
int __isolate_lru_page(struct page *page, isolate_mode_t mode)
{
	int ret = -EINVAL;

	/* Only take pages on the LRU. */
	if (!PageLRU(page))
		return ret;

	/* Compaction should not handle unevictable pages but CMA can do so */
	if (PageUnevictable(page) && !(mode & ISOLATE_UNEVICTABLE))
		return ret;

	ret = -EBUSY;

	/*
	 * To minimise LRU disruption, the caller can indicate that it only
	 * wants to isolate pages it will be able to operate on without
	 * blocking - clean pages for the most part.
	 *
	 * ISOLATE_ASYNC_MIGRATE is used to indicate that it only wants
	 * pages that can be migrated without blocking.
	 */
	if (mode & ISOLATE_ASYNC_MIGRATE) {
		/* All the caller can do on PageWriteback is block */
		if (PageWriteback(page))
			return ret;

		if (PageDirty(page)) {
			struct address_space *mapping;

			/*
			 * Only pages without mappings or that have a
			 * ->migratepage callback are possible to migrate
			 * without blocking
			 */
			mapping = page_mapping(page);
			if (mapping && !mapping->a_ops->migratepage)
				return ret;
		}
	}

	if ((mode & ISOLATE_UNMAPPED) && page_mapped(page))
		return ret;

	if (likely(get_page_unless_zero(page))) {
		/*
		 * Be careful not to clear PageLRU until after we're
		 * sure the page is not being freed elsewhere -- the
		 * page release code relies on it.
		 */
		ClearPageLRU(page);
		ret = 0;
	}

	return ret;
}

/*
 * Update LRU sizes after isolating pages. The LRU size updates must
 * be complete before mem_cgroup_update_lru_size due to a sanity check.
 */
static __always_inline void update_lru_sizes(struct lruvec *lruvec,
			enum lru_list lru, unsigned long *nr_zone_taken)
{
	int zid;

	for (zid = 0; zid < MAX_NR_ZONES; zid++) {
		if (!nr_zone_taken[zid])
			continue;

		__update_lru_size(lruvec, lru, zid, -nr_zone_taken[zid]);
#ifdef CONFIG_MEMCG
		mem_cgroup_update_lru_size(lruvec, lru, zid, -nr_zone_taken[zid]);
#endif
	}
}

/*
 * zone_lru_lock is heavily contended.  Some of the functions that
 * shrink the lists perform better by taking out a batch of pages
 * and working on them outside the LRU lock.
 *
 * For pagecache intensive workloads, this function is the hottest
 * spot in the kernel (apart from copy_*_user functions).
 *
 * Appropriate locks must be held before calling this function.
 *
 * @nr_to_scan:	The number of eligible pages to look through on the list.
 * @lruvec:	The LRU vector to pull pages from.
 * @dst:	The temp list to put pages on to.
 * @nr_scanned:	The number of pages that were scanned.
 * @sc:		The scan_control struct for this reclaim session
 * @mode:	One of the LRU isolation modes
 * @lru:	LRU list id for isolating
 *
 * returns how many pages were moved onto *@dst.
 */
static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
		struct lruvec *lruvec, struct list_head *dst,
		unsigned long *nr_scanned, struct scan_control *sc,
		isolate_mode_t mode, enum lru_list lru)
{
	struct list_head *src = &lruvec->lists[lru];
	unsigned long nr_taken = 0;
	unsigned long nr_zone_taken[MAX_NR_ZONES] = { 0 };
	unsigned long nr_skipped[MAX_NR_ZONES] = { 0, };
	unsigned long skipped = 0;
	unsigned long scan, total_scan, nr_pages;
	LIST_HEAD(pages_skipped);

	scan = 0;
	for (total_scan = 0;
	     scan < nr_to_scan && nr_taken < nr_to_scan && !list_empty(src);
	     total_scan++) {
		struct page *page;

		page = lru_to_page(src);
		prefetchw_prev_lru_page(page, src, flags);

		VM_BUG_ON_PAGE(!PageLRU(page), page);

		if (page_zonenum(page) > sc->reclaim_idx) {
			list_move(&page->lru, &pages_skipped);
			nr_skipped[page_zonenum(page)]++;
			continue;
		}

		/*
		 * Do not count skipped pages because that makes the function
		 * return with no isolated pages if the LRU mostly contains
		 * ineligible pages.  This causes the VM to not reclaim any
		 * pages, triggering a premature OOM.
		 */
		scan++;
		switch (__isolate_lru_page(page, mode)) {
		case 0:
			nr_pages = hpage_nr_pages(page);
			nr_taken += nr_pages;
			nr_zone_taken[page_zonenum(page)] += nr_pages;
			list_move(&page->lru, dst);
			break;

		case -EBUSY:
			/* else it is being freed elsewhere */
			list_move(&page->lru, src);
			continue;

		default:
			BUG();
		}
	}

	/*
	 * Splice any skipped pages to the start of the LRU list. Note that
	 * this disrupts the LRU order when reclaiming for lower zones but
	 * we cannot splice to the tail. If we did then the SWAP_CLUSTER_MAX
	 * scanning would soon rescan the same pages to skip and put the
	 * system at risk of premature OOM.
	 */
	if (!list_empty(&pages_skipped)) {
		int zid;

		list_splice(&pages_skipped, src);
		for (zid = 0; zid < MAX_NR_ZONES; zid++) {
			if (!nr_skipped[zid])
				continue;

			__count_zid_vm_events(PGSCAN_SKIP, zid, nr_skipped[zid]);
			skipped += nr_skipped[zid];
		}
	}
	*nr_scanned = total_scan;
	trace_mm_vmscan_lru_isolate(sc->reclaim_idx, sc->order, nr_to_scan,
				    total_scan, skipped, nr_taken, mode, lru);
	update_lru_sizes(lruvec, lru, nr_zone_taken);
	return nr_taken;
}

/**
 * isolate_lru_page - tries to isolate a page from its LRU list
 * @page: page to isolate from its LRU list
 *
 * Isolates a @page from an LRU list, clears PageLRU and adjusts the
 * vmstat statistic corresponding to whatever LRU list the page was on.
 *
 * Returns 0 if the page was removed from an LRU list.
 * Returns -EBUSY if the page was not on an LRU list.
 *
 * The returned page will have PageLRU() cleared.  If it was found on
 * the active list, it will have PageActive set.  If it was found on
 * the unevictable list, it will have the PageUnevictable bit set. That flag
 * may need to be cleared by the caller before letting the page go.
 *
 * The vmstat statistic corresponding to the list on which the page was
 * found will be decremented.
 *
 * Restrictions:
 * (1) Must be called with an elevated refcount on the page.
 *     This is a fundamental difference from isolate_lru_pages (which is
 *     called without a stable reference).
 * (2) the lru_lock must not be held.
 * (3) interrupts must be enabled.
 */
int isolate_lru_page(struct page *page)
{
	int ret = -EBUSY;

	VM_BUG_ON_PAGE(!page_count(page), page);
	WARN_RATELIMIT(PageTail(page), "trying to isolate tail page");

	if (PageLRU(page)) {
		struct zone *zone = page_zone(page);
		struct lruvec *lruvec;

		spin_lock_irq(zone_lru_lock(zone));
		lruvec = mem_cgroup_page_lruvec(page, zone->zone_pgdat);
		if (PageLRU(page)) {
			int lru = page_lru(page);
			get_page(page);
			ClearPageLRU(page);
			del_page_from_lru_list(page, lruvec, lru);
			ret = 0;
		}
		spin_unlock_irq(zone_lru_lock(zone));
	}
	return ret;
}
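
/*
 * Illustrative sketch (assumption, mirroring how migration-style callers
 * elsewhere in mm/ use this interface): take a stable page reference first,
 * isolate, operate, then hand the page back to the LRU:
 *
 *	get_page(page);				// stable reference first
 *	if (!isolate_lru_page(page)) {
 *		// page is off the LRU, PageLRU cleared, vmstat updated
 *		...do something with the page...
 *		putback_lru_page(page);		// re-adds and drops the isolate ref
 *	}
 *	put_page(page);				// drop our own reference
 */
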
/*
 * A direct reclaimer may isolate SWAP_CLUSTER_MAX pages from the LRU list and
 * then get rescheduled. When there is a massive number of tasks doing page
 * allocation, such sleeping direct reclaimers may keep piling up on each CPU,
 * the LRU list will go small and be scanned faster than necessary, leading to
 * unnecessary swapping, thrashing and OOM.
 */
static int too_many_isolated(struct pglist_data *pgdat, int file,
		struct scan_control *sc)
{
	unsigned long inactive, isolated;

	if (current_is_kswapd())
		return 0;

	if (!sane_reclaim(sc))
		return 0;

	if (file) {
		inactive = node_page_state(pgdat, NR_INACTIVE_FILE);
		isolated = node_page_state(pgdat, NR_ISOLATED_FILE);
	} else {
		inactive = node_page_state(pgdat, NR_INACTIVE_ANON);
		isolated = node_page_state(pgdat, NR_ISOLATED_ANON);
	}

	/*
	 * GFP_NOIO/GFP_NOFS callers are allowed to isolate more pages, so they
	 * won't get blocked by normal direct-reclaimers, forming a circular
	 * deadlock.
	 */
	if ((sc->gfp_mask & (__GFP_IO | __GFP_FS)) == (__GFP_IO | __GFP_FS))
		inactive >>= 3;

	return isolated > inactive;
}

static noinline_for_stack void
putback_inactive_pages(struct lruvec *lruvec, struct list_head *page_list)
{
	struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat;
	struct pglist_data *pgdat = lruvec_pgdat(lruvec);
	LIST_HEAD(pages_to_free);

	/*
	 * Put back any unfreeable pages.
	 */
	while (!list_empty(page_list)) {
		struct page *page = lru_to_page(page_list);
		int lru;

		VM_BUG_ON_PAGE(PageLRU(page), page);
		list_del(&page->lru);
		if (unlikely(!page_evictable(page))) {
			spin_unlock_irq(&pgdat->lru_lock);
			putback_lru_page(page);
			spin_lock_irq(&pgdat->lru_lock);
			continue;
		}

		lruvec = mem_cgroup_page_lruvec(page, pgdat);

		SetPageLRU(page);
		lru = page_lru(page);
		add_page_to_lru_list(page, lruvec, lru);

		if (is_active_lru(lru)) {
			int file = is_file_lru(lru);
			int numpages = hpage_nr_pages(page);
			reclaim_stat->recent_rotated[file] += numpages;
		}
		if (put_page_testzero(page)) {
			__ClearPageLRU(page);
			__ClearPageActive(page);
			del_page_from_lru_list(page, lruvec, lru);

			if (unlikely(PageCompound(page))) {
				spin_unlock_irq(&pgdat->lru_lock);
				mem_cgroup_uncharge(page);
				(*get_compound_page_dtor(page))(page);
				spin_lock_irq(&pgdat->lru_lock);
			} else
				list_add(&page->lru, &pages_to_free);
		}
	}

	/*
	 * To save our caller's stack, now use input list for pages to free.
	 */
	list_splice(&pages_to_free, page_list);
}

/*
 * If a kernel thread (such as nfsd for loop-back mounts) services
 * a backing device by writing to the page cache it sets PF_LESS_THROTTLE.
 * In that case we should only throttle if the backing device it is
 * writing to is congested.  In other cases it is safe to throttle.
 */
static int current_may_throttle(void)
{
	return !(current->flags & PF_LESS_THROTTLE) ||
		current->backing_dev_info == NULL ||
		bdi_write_congested(current->backing_dev_info);
}

/*
 * shrink_inactive_list() is a helper for shrink_node().  It returns the number
 * of reclaimed pages
 */
static noinline_for_stack unsigned long
shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
		     struct scan_control *sc, enum lru_list lru)
{
	LIST_HEAD(page_list);
	unsigned long nr_scanned;
	unsigned long nr_reclaimed = 0;
	unsigned long nr_taken;
	struct reclaim_stat stat = {};
	isolate_mode_t isolate_mode = 0;
	int file = is_file_lru(lru);
	struct pglist_data *pgdat = lruvec_pgdat(lruvec);
	struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat;
	bool stalled = false;

	while (unlikely(too_many_isolated(pgdat, file, sc))) {
		if (stalled)
			return 0;

		/* wait a bit for the reclaimer. */
		msleep(100);
		stalled = true;

		/* We are about to die and free our memory. Return now.
		 */
		if (fatal_signal_pending(current))
			return SWAP_CLUSTER_MAX;
	}

	lru_add_drain();

	if (!sc->may_unmap)
		isolate_mode |= ISOLATE_UNMAPPED;

	spin_lock_irq(&pgdat->lru_lock);

	nr_taken = isolate_lru_pages(nr_to_scan, lruvec, &page_list,
				     &nr_scanned, sc, isolate_mode, lru);

	__mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, nr_taken);
	reclaim_stat->recent_scanned[file] += nr_taken;

	if (current_is_kswapd()) {
		if (global_reclaim(sc))
			__count_vm_events(PGSCAN_KSWAPD, nr_scanned);
		count_memcg_events(lruvec_memcg(lruvec), PGSCAN_KSWAPD,
				   nr_scanned);
	} else {
		if (global_reclaim(sc))
			__count_vm_events(PGSCAN_DIRECT, nr_scanned);
		count_memcg_events(lruvec_memcg(lruvec), PGSCAN_DIRECT,
				   nr_scanned);
	}
	spin_unlock_irq(&pgdat->lru_lock);

	if (nr_taken == 0)
		return 0;

	nr_reclaimed = shrink_page_list(&page_list, pgdat, sc, 0,
				&stat, false);

	spin_lock_irq(&pgdat->lru_lock);

	if (current_is_kswapd()) {
		if (global_reclaim(sc))
			__count_vm_events(PGSTEAL_KSWAPD, nr_reclaimed);
		count_memcg_events(lruvec_memcg(lruvec), PGSTEAL_KSWAPD,
				   nr_reclaimed);
	} else {
		if (global_reclaim(sc))
			__count_vm_events(PGSTEAL_DIRECT, nr_reclaimed);
		count_memcg_events(lruvec_memcg(lruvec), PGSTEAL_DIRECT,
				   nr_reclaimed);
	}

	putback_inactive_pages(lruvec, &page_list);

	__mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, -nr_taken);

	spin_unlock_irq(&pgdat->lru_lock);

	mem_cgroup_uncharge_list(&page_list);
	free_unref_page_list(&page_list);

	/*
	 * If reclaim is isolating dirty pages under writeback, it implies
	 * that the long-lived page allocation rate is exceeding the page
	 * laundering rate. Either the global limits are not being effective
	 * at throttling processes due to the page distribution throughout
	 * zones or there is heavy usage of a slow backing device. The
	 * only option is to throttle from reclaim context which is not ideal
	 * as there is no guarantee the dirtying process is throttled in the
	 * same way balance_dirty_pages() manages.
	 *
	 * Once a zone is flagged ZONE_WRITEBACK, kswapd will count the number
	 * of pages under pages flagged for immediate reclaim and stall if any
	 * are encountered in the nr_immediate check below.
	 */
	if (stat.nr_writeback && stat.nr_writeback == nr_taken)
		set_bit(PGDAT_WRITEBACK, &pgdat->flags);

	/*
	 * Legacy memcg will stall in page writeback so avoid forcibly
	 * stalling here.
	 */
	if (sane_reclaim(sc)) {
		/*
		 * Tag a zone as congested if all the dirty pages scanned were
		 * backed by a congested BDI and wait_iff_congested will stall.
		 */
		if (stat.nr_dirty && stat.nr_dirty == stat.nr_congested)
			set_bit(PGDAT_CONGESTED, &pgdat->flags);

		/*
		 * If dirty pages are scanned that are not queued for IO, it
		 * implies that flushers are not doing their job. This can
		 * happen when memory pressure pushes dirty pages to the end of
		 * the LRU before the dirty limits are breached and the dirty
		 * data has expired. It can also happen when the proportion of
		 * dirty pages grows not through writes but through memory
		 * pressure reclaiming all the clean cache. And in some cases,
		 * the flushers simply cannot keep up with the allocation
		 * rate.
Nudge the flusher threads in case they are asleep, but 1871 * also allow kswapd to start writing pages during reclaim. 1872 */ 1873 if (stat.nr_unqueued_dirty == nr_taken) { 1874 wakeup_flusher_threads(WB_REASON_VMSCAN); 1875 set_bit(PGDAT_DIRTY, &pgdat->flags); 1876 } 1877 1878 /* 1879 * If kswapd scans pages marked marked for immediate 1880 * reclaim and under writeback (nr_immediate), it implies 1881 * that pages are cycling through the LRU faster than 1882 * they are written so also forcibly stall. 1883 */ 1884 if (stat.nr_immediate && current_may_throttle()) 1885 congestion_wait(BLK_RW_ASYNC, HZ/10); 1886 } 1887 1888 /* 1889 * Stall direct reclaim for IO completions if underlying BDIs or zone 1890 * is congested. Allow kswapd to continue until it starts encountering 1891 * unqueued dirty pages or cycling through the LRU too quickly. 1892 */ 1893 if (!sc->hibernation_mode && !current_is_kswapd() && 1894 current_may_throttle()) 1895 wait_iff_congested(pgdat, BLK_RW_ASYNC, HZ/10); 1896 1897 trace_mm_vmscan_lru_shrink_inactive(pgdat->node_id, 1898 nr_scanned, nr_reclaimed, 1899 stat.nr_dirty, stat.nr_writeback, 1900 stat.nr_congested, stat.nr_immediate, 1901 stat.nr_activate, stat.nr_ref_keep, 1902 stat.nr_unmap_fail, 1903 sc->priority, file); 1904 return nr_reclaimed; 1905 } 1906 1907 /* 1908 * This moves pages from the active list to the inactive list. 1909 * 1910 * We move them the other way if the page is referenced by one or more 1911 * processes, from rmap. 1912 * 1913 * If the pages are mostly unmapped, the processing is fast and it is 1914 * appropriate to hold zone_lru_lock across the whole operation. But if 1915 * the pages are mapped, the processing is slow (page_referenced()) so we 1916 * should drop zone_lru_lock around each page. It's impossible to balance 1917 * this, so instead we remove the pages from the LRU while processing them. 1918 * It is safe to rely on PG_active against the non-LRU pages in here because 1919 * nobody will play with that bit on a non-LRU page. 1920 * 1921 * The downside is that we have to touch page->_refcount against each page. 1922 * But we had to alter page->flags anyway. 1923 * 1924 * Returns the number of pages moved to the given lru. 
1925 */ 1926 1927 static unsigned move_active_pages_to_lru(struct lruvec *lruvec, 1928 struct list_head *list, 1929 struct list_head *pages_to_free, 1930 enum lru_list lru) 1931 { 1932 struct pglist_data *pgdat = lruvec_pgdat(lruvec); 1933 struct page *page; 1934 int nr_pages; 1935 int nr_moved = 0; 1936 1937 while (!list_empty(list)) { 1938 page = lru_to_page(list); 1939 lruvec = mem_cgroup_page_lruvec(page, pgdat); 1940 1941 VM_BUG_ON_PAGE(PageLRU(page), page); 1942 SetPageLRU(page); 1943 1944 nr_pages = hpage_nr_pages(page); 1945 update_lru_size(lruvec, lru, page_zonenum(page), nr_pages); 1946 list_move(&page->lru, &lruvec->lists[lru]); 1947 1948 if (put_page_testzero(page)) { 1949 __ClearPageLRU(page); 1950 __ClearPageActive(page); 1951 del_page_from_lru_list(page, lruvec, lru); 1952 1953 if (unlikely(PageCompound(page))) { 1954 spin_unlock_irq(&pgdat->lru_lock); 1955 mem_cgroup_uncharge(page); 1956 (*get_compound_page_dtor(page))(page); 1957 spin_lock_irq(&pgdat->lru_lock); 1958 } else 1959 list_add(&page->lru, pages_to_free); 1960 } else { 1961 nr_moved += nr_pages; 1962 } 1963 } 1964 1965 if (!is_active_lru(lru)) { 1966 __count_vm_events(PGDEACTIVATE, nr_moved); 1967 count_memcg_events(lruvec_memcg(lruvec), PGDEACTIVATE, 1968 nr_moved); 1969 } 1970 1971 return nr_moved; 1972 } 1973 1974 static void shrink_active_list(unsigned long nr_to_scan, 1975 struct lruvec *lruvec, 1976 struct scan_control *sc, 1977 enum lru_list lru) 1978 { 1979 unsigned long nr_taken; 1980 unsigned long nr_scanned; 1981 unsigned long vm_flags; 1982 LIST_HEAD(l_hold); /* The pages which were snipped off */ 1983 LIST_HEAD(l_active); 1984 LIST_HEAD(l_inactive); 1985 struct page *page; 1986 struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat; 1987 unsigned nr_deactivate, nr_activate; 1988 unsigned nr_rotated = 0; 1989 isolate_mode_t isolate_mode = 0; 1990 int file = is_file_lru(lru); 1991 struct pglist_data *pgdat = lruvec_pgdat(lruvec); 1992 1993 lru_add_drain(); 1994 1995 if (!sc->may_unmap) 1996 isolate_mode |= ISOLATE_UNMAPPED; 1997 1998 spin_lock_irq(&pgdat->lru_lock); 1999 2000 nr_taken = isolate_lru_pages(nr_to_scan, lruvec, &l_hold, 2001 &nr_scanned, sc, isolate_mode, lru); 2002 2003 __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, nr_taken); 2004 reclaim_stat->recent_scanned[file] += nr_taken; 2005 2006 __count_vm_events(PGREFILL, nr_scanned); 2007 count_memcg_events(lruvec_memcg(lruvec), PGREFILL, nr_scanned); 2008 2009 spin_unlock_irq(&pgdat->lru_lock); 2010 2011 while (!list_empty(&l_hold)) { 2012 cond_resched(); 2013 page = lru_to_page(&l_hold); 2014 list_del(&page->lru); 2015 2016 if (unlikely(!page_evictable(page))) { 2017 putback_lru_page(page); 2018 continue; 2019 } 2020 2021 if (unlikely(buffer_heads_over_limit)) { 2022 if (page_has_private(page) && trylock_page(page)) { 2023 if (page_has_private(page)) 2024 try_to_release_page(page, 0); 2025 unlock_page(page); 2026 } 2027 } 2028 2029 if (page_referenced(page, 0, sc->target_mem_cgroup, 2030 &vm_flags)) { 2031 nr_rotated += hpage_nr_pages(page); 2032 /* 2033 * Identify referenced, file-backed active pages and 2034 * give them one more trip around the active list. So 2035 * that executable code get better chances to stay in 2036 * memory under moderate memory pressure. Anon pages 2037 * are not likely to be evicted by use-once streaming 2038 * IO, plus JVM can create lots of anon VM_EXEC pages, 2039 * so we ignore them here. 
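 * (Concretely: a referenced text page of a mapped shared library takes the
 * VM_EXEC branch below and goes back to l_active, while a referenced anon
 * page, even in a VM_EXEC mapping, still falls through and is deactivated.)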
2040 */ 2041 if ((vm_flags & VM_EXEC) && page_is_file_cache(page)) { 2042 list_add(&page->lru, &l_active); 2043 continue; 2044 } 2045 } 2046 2047 ClearPageActive(page); /* we are de-activating */ 2048 list_add(&page->lru, &l_inactive); 2049 } 2050 2051 /* 2052 * Move pages back to the lru list. 2053 */ 2054 spin_lock_irq(&pgdat->lru_lock); 2055 /* 2056 * Count referenced pages from currently used mappings as rotated, 2057 * even though only some of them are actually re-activated. This 2058 * helps balance scan pressure between file and anonymous pages in 2059 * get_scan_count. 2060 */ 2061 reclaim_stat->recent_rotated[file] += nr_rotated; 2062 2063 nr_activate = move_active_pages_to_lru(lruvec, &l_active, &l_hold, lru); 2064 nr_deactivate = move_active_pages_to_lru(lruvec, &l_inactive, &l_hold, lru - LRU_ACTIVE); 2065 __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, -nr_taken); 2066 spin_unlock_irq(&pgdat->lru_lock); 2067 2068 mem_cgroup_uncharge_list(&l_hold); 2069 free_unref_page_list(&l_hold); 2070 trace_mm_vmscan_lru_shrink_active(pgdat->node_id, nr_taken, nr_activate, 2071 nr_deactivate, nr_rotated, sc->priority, file); 2072 } 2073 2074 /* 2075 * The inactive anon list should be small enough that the VM never has 2076 * to do too much work. 2077 * 2078 * The inactive file list should be small enough to leave most memory 2079 * to the established workingset on the scan-resistant active list, 2080 * but large enough to avoid thrashing the aggregate readahead window. 2081 * 2082 * Both inactive lists should also be large enough that each inactive 2083 * page has a chance to be referenced again before it is reclaimed. 2084 * 2085 * If that fails and refaulting is observed, the inactive list grows. 2086 * 2087 * The inactive_ratio is the target ratio of ACTIVE to INACTIVE pages 2088 * on this LRU, maintained by the pageout code. An inactive_ratio 2089 * of 3 means 3:1 or 25% of the pages are kept on the inactive list. 2090 * 2091 * total target max 2092 * memory ratio inactive 2093 * ------------------------------------- 2094 * 10MB 1 5MB 2095 * 100MB 1 50MB 2096 * 1GB 3 250MB 2097 * 10GB 10 0.9GB 2098 * 100GB 31 3GB 2099 * 1TB 101 10GB 2100 * 10TB 320 32GB 2101 */ 2102 static bool inactive_list_is_low(struct lruvec *lruvec, bool file, 2103 struct mem_cgroup *memcg, 2104 struct scan_control *sc, bool actual_reclaim) 2105 { 2106 enum lru_list active_lru = file * LRU_FILE + LRU_ACTIVE; 2107 struct pglist_data *pgdat = lruvec_pgdat(lruvec); 2108 enum lru_list inactive_lru = file * LRU_FILE; 2109 unsigned long inactive, active; 2110 unsigned long inactive_ratio; 2111 unsigned long refaults; 2112 unsigned long gb; 2113 2114 /* 2115 * If we don't have swap space, anonymous page deactivation 2116 * is pointless. 2117 */ 2118 if (!file && !total_swap_pages) 2119 return false; 2120 2121 inactive = lruvec_lru_size(lruvec, inactive_lru, sc->reclaim_idx); 2122 active = lruvec_lru_size(lruvec, active_lru, sc->reclaim_idx); 2123 2124 if (memcg) 2125 refaults = memcg_page_state(memcg, WORKINGSET_ACTIVATE); 2126 else 2127 refaults = node_page_state(pgdat, WORKINGSET_ACTIVATE); 2128 2129 /* 2130 * When refaults are being observed, it means a new workingset 2131 * is being established. Disable active list protection to get 2132 * rid of the stale workingset quickly. 
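 * (Illustrative arithmetic only: while protection stays enabled, the ratio
 * follows the table above, e.g. 1GB of pages on this LRU pair gives gb = 1
 * and inactive_ratio = int_sqrt(10 * 1) = 3, while 100GB gives
 * int_sqrt(1000) = 31.)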
2133 */ 2134 if (file && actual_reclaim && lruvec->refaults != refaults) { 2135 inactive_ratio = 0; 2136 } else { 2137 gb = (inactive + active) >> (30 - PAGE_SHIFT); 2138 if (gb) 2139 inactive_ratio = int_sqrt(10 * gb); 2140 else 2141 inactive_ratio = 1; 2142 } 2143 2144 if (actual_reclaim) 2145 trace_mm_vmscan_inactive_list_is_low(pgdat->node_id, sc->reclaim_idx, 2146 lruvec_lru_size(lruvec, inactive_lru, MAX_NR_ZONES), inactive, 2147 lruvec_lru_size(lruvec, active_lru, MAX_NR_ZONES), active, 2148 inactive_ratio, file); 2149 2150 return inactive * inactive_ratio < active; 2151 } 2152 2153 static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan, 2154 struct lruvec *lruvec, struct mem_cgroup *memcg, 2155 struct scan_control *sc) 2156 { 2157 if (is_active_lru(lru)) { 2158 if (inactive_list_is_low(lruvec, is_file_lru(lru), 2159 memcg, sc, true)) 2160 shrink_active_list(nr_to_scan, lruvec, sc, lru); 2161 return 0; 2162 } 2163 2164 return shrink_inactive_list(nr_to_scan, lruvec, sc, lru); 2165 } 2166 2167 enum scan_balance { 2168 SCAN_EQUAL, 2169 SCAN_FRACT, 2170 SCAN_ANON, 2171 SCAN_FILE, 2172 }; 2173 2174 /* 2175 * Determine how aggressively the anon and file LRU lists should be 2176 * scanned. The relative value of each set of LRU lists is determined 2177 * by looking at the fraction of the pages scanned we did rotate back 2178 * onto the active list instead of evict. 2179 * 2180 * nr[0] = anon inactive pages to scan; nr[1] = anon active pages to scan 2181 * nr[2] = file inactive pages to scan; nr[3] = file active pages to scan 2182 */ 2183 static void get_scan_count(struct lruvec *lruvec, struct mem_cgroup *memcg, 2184 struct scan_control *sc, unsigned long *nr, 2185 unsigned long *lru_pages) 2186 { 2187 int swappiness = mem_cgroup_swappiness(memcg); 2188 struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat; 2189 u64 fraction[2]; 2190 u64 denominator = 0; /* gcc */ 2191 struct pglist_data *pgdat = lruvec_pgdat(lruvec); 2192 unsigned long anon_prio, file_prio; 2193 enum scan_balance scan_balance; 2194 unsigned long anon, file; 2195 unsigned long ap, fp; 2196 enum lru_list lru; 2197 2198 /* If we have no swap space, do not bother scanning anon pages. */ 2199 if (!sc->may_swap || mem_cgroup_get_nr_swap_pages(memcg) <= 0) { 2200 scan_balance = SCAN_FILE; 2201 goto out; 2202 } 2203 2204 /* 2205 * Global reclaim will swap to prevent OOM even with no 2206 * swappiness, but memcg users want to use this knob to 2207 * disable swapping for individual groups completely when 2208 * using the memory controller's swap limit feature would be 2209 * too expensive. 2210 */ 2211 if (!global_reclaim(sc) && !swappiness) { 2212 scan_balance = SCAN_FILE; 2213 goto out; 2214 } 2215 2216 /* 2217 * Do not apply any pressure balancing cleverness when the 2218 * system is close to OOM, scan both anon and file equally 2219 * (unless the swappiness setting disagrees with swapping). 2220 */ 2221 if (!sc->priority && swappiness) { 2222 scan_balance = SCAN_EQUAL; 2223 goto out; 2224 } 2225 2226 /* 2227 * Prevent the reclaimer from falling into the cache trap: as 2228 * cache pages start out inactive, every cache fault will tip 2229 * the scan balance towards the file LRU. And as the file LRU 2230 * shrinks, so does the window for rotation from references. 2231 * This means we have a runaway feedback loop where a tiny 2232 * thrashing file LRU becomes infinitely more attractive than 2233 * anon pages. Try to detect this based on file LRU size. 
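 * (A hedged illustration of the check below, with made-up sizes: if a node
 * has 40MB free and 20MB of file LRU pages against a 64MB sum of zone high
 * watermarks, file + free no longer covers the watermarks, so reclaim is
 * forced over to SCAN_ANON provided enough inactive anon pages are
 * eligible.)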
2234 */ 2235 if (global_reclaim(sc)) { 2236 unsigned long pgdatfile; 2237 unsigned long pgdatfree; 2238 int z; 2239 unsigned long total_high_wmark = 0; 2240 2241 pgdatfree = sum_zone_node_page_state(pgdat->node_id, NR_FREE_PAGES); 2242 pgdatfile = node_page_state(pgdat, NR_ACTIVE_FILE) + 2243 node_page_state(pgdat, NR_INACTIVE_FILE); 2244 2245 for (z = 0; z < MAX_NR_ZONES; z++) { 2246 struct zone *zone = &pgdat->node_zones[z]; 2247 if (!managed_zone(zone)) 2248 continue; 2249 2250 total_high_wmark += high_wmark_pages(zone); 2251 } 2252 2253 if (unlikely(pgdatfile + pgdatfree <= total_high_wmark)) { 2254 /* 2255 * Force SCAN_ANON if there are enough inactive 2256 * anonymous pages on the LRU in eligible zones. 2257 * Otherwise, the small LRU gets thrashed. 2258 */ 2259 if (!inactive_list_is_low(lruvec, false, memcg, sc, false) && 2260 lruvec_lru_size(lruvec, LRU_INACTIVE_ANON, sc->reclaim_idx) 2261 >> sc->priority) { 2262 scan_balance = SCAN_ANON; 2263 goto out; 2264 } 2265 } 2266 } 2267 2268 /* 2269 * If there is enough inactive page cache, i.e. if the size of the 2270 * inactive list is greater than that of the active list *and* the 2271 * inactive list actually has some pages to scan on this priority, we 2272 * do not reclaim anything from the anonymous working set right now. 2273 * Without the second condition we could end up never scanning an 2274 * lruvec even if it has plenty of old anonymous pages unless the 2275 * system is under heavy pressure. 2276 */ 2277 if (!inactive_list_is_low(lruvec, true, memcg, sc, false) && 2278 lruvec_lru_size(lruvec, LRU_INACTIVE_FILE, sc->reclaim_idx) >> sc->priority) { 2279 scan_balance = SCAN_FILE; 2280 goto out; 2281 } 2282 2283 scan_balance = SCAN_FRACT; 2284 2285 /* 2286 * With swappiness at 100, anonymous and file have the same priority. 2287 * This scanning priority is essentially the inverse of IO cost. 2288 */ 2289 anon_prio = swappiness; 2290 file_prio = 200 - anon_prio; 2291 2292 /* 2293 * OK, so we have swap space and a fair amount of page cache 2294 * pages. We use the recently rotated / recently scanned 2295 * ratios to determine how valuable each cache is. 2296 * 2297 * Because workloads change over time (and to avoid overflow) 2298 * we keep these statistics as a floating average, which ends 2299 * up weighing recent references more than old ones. 2300 * 2301 * anon in [0], file in [1] 2302 */ 2303 2304 anon = lruvec_lru_size(lruvec, LRU_ACTIVE_ANON, MAX_NR_ZONES) + 2305 lruvec_lru_size(lruvec, LRU_INACTIVE_ANON, MAX_NR_ZONES); 2306 file = lruvec_lru_size(lruvec, LRU_ACTIVE_FILE, MAX_NR_ZONES) + 2307 lruvec_lru_size(lruvec, LRU_INACTIVE_FILE, MAX_NR_ZONES); 2308 2309 spin_lock_irq(&pgdat->lru_lock); 2310 if (unlikely(reclaim_stat->recent_scanned[0] > anon / 4)) { 2311 reclaim_stat->recent_scanned[0] /= 2; 2312 reclaim_stat->recent_rotated[0] /= 2; 2313 } 2314 2315 if (unlikely(reclaim_stat->recent_scanned[1] > file / 4)) { 2316 reclaim_stat->recent_scanned[1] /= 2; 2317 reclaim_stat->recent_rotated[1] /= 2; 2318 } 2319 2320 /* 2321 * The amount of pressure on anon vs file pages is inversely 2322 * proportional to the fraction of recently scanned pages on 2323 * each list that were recently referenced and in active use. 
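 * (Worked example with assumed figures: swappiness 60 gives anon_prio = 60
 * and file_prio = 140. With 1000 anon pages recently scanned and 500 of
 * them rotated, ap = 60 * 1001 / 501 ~= 119; with 1000 file pages scanned
 * and 100 rotated, fp = 140 * 1001 / 101 ~= 1387, so roughly 92% of the
 * scan pressure lands on the file LRUs.)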
2324 */ 2325 ap = anon_prio * (reclaim_stat->recent_scanned[0] + 1); 2326 ap /= reclaim_stat->recent_rotated[0] + 1; 2327 2328 fp = file_prio * (reclaim_stat->recent_scanned[1] + 1); 2329 fp /= reclaim_stat->recent_rotated[1] + 1; 2330 spin_unlock_irq(&pgdat->lru_lock); 2331 2332 fraction[0] = ap; 2333 fraction[1] = fp; 2334 denominator = ap + fp + 1; 2335 out: 2336 *lru_pages = 0; 2337 for_each_evictable_lru(lru) { 2338 int file = is_file_lru(lru); 2339 unsigned long size; 2340 unsigned long scan; 2341 2342 size = lruvec_lru_size(lruvec, lru, sc->reclaim_idx); 2343 scan = size >> sc->priority; 2344 /* 2345 * If the cgroup's already been deleted, make sure to 2346 * scrape out the remaining cache. 2347 */ 2348 if (!scan && !mem_cgroup_online(memcg)) 2349 scan = min(size, SWAP_CLUSTER_MAX); 2350 2351 switch (scan_balance) { 2352 case SCAN_EQUAL: 2353 /* Scan lists relative to size */ 2354 break; 2355 case SCAN_FRACT: 2356 /* 2357 * Scan types proportional to swappiness and 2358 * their relative recent reclaim efficiency. 2359 */ 2360 scan = div64_u64(scan * fraction[file], 2361 denominator); 2362 break; 2363 case SCAN_FILE: 2364 case SCAN_ANON: 2365 /* Scan one type exclusively */ 2366 if ((scan_balance == SCAN_FILE) != file) { 2367 size = 0; 2368 scan = 0; 2369 } 2370 break; 2371 default: 2372 /* Look ma, no brain */ 2373 BUG(); 2374 } 2375 2376 *lru_pages += size; 2377 nr[lru] = scan; 2378 } 2379 } 2380 2381 /* 2382 * This is a basic per-node page freer. Used by both kswapd and direct reclaim. 2383 */ 2384 static void shrink_node_memcg(struct pglist_data *pgdat, struct mem_cgroup *memcg, 2385 struct scan_control *sc, unsigned long *lru_pages) 2386 { 2387 struct lruvec *lruvec = mem_cgroup_lruvec(pgdat, memcg); 2388 unsigned long nr[NR_LRU_LISTS]; 2389 unsigned long targets[NR_LRU_LISTS]; 2390 unsigned long nr_to_scan; 2391 enum lru_list lru; 2392 unsigned long nr_reclaimed = 0; 2393 unsigned long nr_to_reclaim = sc->nr_to_reclaim; 2394 struct blk_plug plug; 2395 bool scan_adjusted; 2396 2397 get_scan_count(lruvec, memcg, sc, nr, lru_pages); 2398 2399 /* Record the original scan target for proportional adjustments later */ 2400 memcpy(targets, nr, sizeof(nr)); 2401 2402 /* 2403 * Global reclaiming within direct reclaim at DEF_PRIORITY is a normal 2404 * event that can occur when there is little memory pressure e.g. 2405 * multiple streaming readers/writers. Hence, we do not abort scanning 2406 * when the requested number of pages are reclaimed when scanning at 2407 * DEF_PRIORITY on the assumption that the fact we are direct 2408 * reclaiming implies that kswapd is not keeping up and it is best to 2409 * do a batch of work at once. For memcg reclaim one check is made to 2410 * abort proportional reclaim if either the file or anon lru has already 2411 * dropped to zero at the first pass. 
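 * (Sketch of the proportional cut applied further below, hypothetical
 * numbers: if nr_to_reclaim is met while the anon lists still have 75% of
 * their original scan target outstanding and the file lists hold more
 * remaining pages, anon scanning is stopped and the file targets are
 * trimmed so that file scanning also ends near the 25%-scanned mark.)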
2412 */ 2413 scan_adjusted = (global_reclaim(sc) && !current_is_kswapd() && 2414 sc->priority == DEF_PRIORITY); 2415 2416 blk_start_plug(&plug); 2417 while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] || 2418 nr[LRU_INACTIVE_FILE]) { 2419 unsigned long nr_anon, nr_file, percentage; 2420 unsigned long nr_scanned; 2421 2422 for_each_evictable_lru(lru) { 2423 if (nr[lru]) { 2424 nr_to_scan = min(nr[lru], SWAP_CLUSTER_MAX); 2425 nr[lru] -= nr_to_scan; 2426 2427 nr_reclaimed += shrink_list(lru, nr_to_scan, 2428 lruvec, memcg, sc); 2429 } 2430 } 2431 2432 cond_resched(); 2433 2434 if (nr_reclaimed < nr_to_reclaim || scan_adjusted) 2435 continue; 2436 2437 /* 2438 * For kswapd and memcg, reclaim at least the number of pages 2439 * requested. Ensure that the anon and file LRUs are scanned 2440 * proportionally what was requested by get_scan_count(). We 2441 * stop reclaiming one LRU and reduce the amount scanning 2442 * proportional to the original scan target. 2443 */ 2444 nr_file = nr[LRU_INACTIVE_FILE] + nr[LRU_ACTIVE_FILE]; 2445 nr_anon = nr[LRU_INACTIVE_ANON] + nr[LRU_ACTIVE_ANON]; 2446 2447 /* 2448 * It's just vindictive to attack the larger once the smaller 2449 * has gone to zero. And given the way we stop scanning the 2450 * smaller below, this makes sure that we only make one nudge 2451 * towards proportionality once we've got nr_to_reclaim. 2452 */ 2453 if (!nr_file || !nr_anon) 2454 break; 2455 2456 if (nr_file > nr_anon) { 2457 unsigned long scan_target = targets[LRU_INACTIVE_ANON] + 2458 targets[LRU_ACTIVE_ANON] + 1; 2459 lru = LRU_BASE; 2460 percentage = nr_anon * 100 / scan_target; 2461 } else { 2462 unsigned long scan_target = targets[LRU_INACTIVE_FILE] + 2463 targets[LRU_ACTIVE_FILE] + 1; 2464 lru = LRU_FILE; 2465 percentage = nr_file * 100 / scan_target; 2466 } 2467 2468 /* Stop scanning the smaller of the LRU */ 2469 nr[lru] = 0; 2470 nr[lru + LRU_ACTIVE] = 0; 2471 2472 /* 2473 * Recalculate the other LRU scan count based on its original 2474 * scan target and the percentage scanning already complete 2475 */ 2476 lru = (lru == LRU_FILE) ? LRU_BASE : LRU_FILE; 2477 nr_scanned = targets[lru] - nr[lru]; 2478 nr[lru] = targets[lru] * (100 - percentage) / 100; 2479 nr[lru] -= min(nr[lru], nr_scanned); 2480 2481 lru += LRU_ACTIVE; 2482 nr_scanned = targets[lru] - nr[lru]; 2483 nr[lru] = targets[lru] * (100 - percentage) / 100; 2484 nr[lru] -= min(nr[lru], nr_scanned); 2485 2486 scan_adjusted = true; 2487 } 2488 blk_finish_plug(&plug); 2489 sc->nr_reclaimed += nr_reclaimed; 2490 2491 /* 2492 * Even if we did not try to evict anon pages at all, we want to 2493 * rebalance the anon lru active/inactive ratio. 2494 */ 2495 if (inactive_list_is_low(lruvec, false, memcg, sc, true)) 2496 shrink_active_list(SWAP_CLUSTER_MAX, lruvec, 2497 sc, LRU_ACTIVE_ANON); 2498 } 2499 2500 /* Use reclaim/compaction for costly allocs or under memory pressure */ 2501 static bool in_reclaim_compaction(struct scan_control *sc) 2502 { 2503 if (IS_ENABLED(CONFIG_COMPACTION) && sc->order && 2504 (sc->order > PAGE_ALLOC_COSTLY_ORDER || 2505 sc->priority < DEF_PRIORITY - 2)) 2506 return true; 2507 2508 return false; 2509 } 2510 2511 /* 2512 * Reclaim/compaction is used for high-order allocation requests. It reclaims 2513 * order-0 pages before compacting the zone. should_continue_reclaim() returns 2514 * true if more pages should be reclaimed such that when the page allocator 2515 * calls try_to_compact_zone() that it will have enough free pages to succeed. 
2516 * It will give up earlier than that if there is difficulty reclaiming pages. 2517 */ 2518 static inline bool should_continue_reclaim(struct pglist_data *pgdat, 2519 unsigned long nr_reclaimed, 2520 unsigned long nr_scanned, 2521 struct scan_control *sc) 2522 { 2523 unsigned long pages_for_compaction; 2524 unsigned long inactive_lru_pages; 2525 int z; 2526 2527 /* If not in reclaim/compaction mode, stop */ 2528 if (!in_reclaim_compaction(sc)) 2529 return false; 2530 2531 /* Consider stopping depending on scan and reclaim activity */ 2532 if (sc->gfp_mask & __GFP_RETRY_MAYFAIL) { 2533 /* 2534 * For __GFP_RETRY_MAYFAIL allocations, stop reclaiming if the 2535 * full LRU list has been scanned and we are still failing 2536 * to reclaim pages. This full LRU scan is potentially 2537 * expensive but a __GFP_RETRY_MAYFAIL caller really wants to succeed 2538 */ 2539 if (!nr_reclaimed && !nr_scanned) 2540 return false; 2541 } else { 2542 /* 2543 * For non-__GFP_RETRY_MAYFAIL allocations which can presumably 2544 * fail without consequence, stop if we failed to reclaim 2545 * any pages from the last SWAP_CLUSTER_MAX number of 2546 * pages that were scanned. This will return to the 2547 * caller faster at the risk reclaim/compaction and 2548 * the resulting allocation attempt fails 2549 */ 2550 if (!nr_reclaimed) 2551 return false; 2552 } 2553 2554 /* 2555 * If we have not reclaimed enough pages for compaction and the 2556 * inactive lists are large enough, continue reclaiming 2557 */ 2558 pages_for_compaction = compact_gap(sc->order); 2559 inactive_lru_pages = node_page_state(pgdat, NR_INACTIVE_FILE); 2560 if (get_nr_swap_pages() > 0) 2561 inactive_lru_pages += node_page_state(pgdat, NR_INACTIVE_ANON); 2562 if (sc->nr_reclaimed < pages_for_compaction && 2563 inactive_lru_pages > pages_for_compaction) 2564 return true; 2565 2566 /* If compaction would go ahead or the allocation would succeed, stop */ 2567 for (z = 0; z <= sc->reclaim_idx; z++) { 2568 struct zone *zone = &pgdat->node_zones[z]; 2569 if (!managed_zone(zone)) 2570 continue; 2571 2572 switch (compaction_suitable(zone, sc->order, 0, sc->reclaim_idx)) { 2573 case COMPACT_SUCCESS: 2574 case COMPACT_CONTINUE: 2575 return false; 2576 default: 2577 /* check next zone */ 2578 ; 2579 } 2580 } 2581 return true; 2582 } 2583 2584 static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc) 2585 { 2586 struct reclaim_state *reclaim_state = current->reclaim_state; 2587 unsigned long nr_reclaimed, nr_scanned; 2588 bool reclaimable = false; 2589 2590 do { 2591 struct mem_cgroup *root = sc->target_mem_cgroup; 2592 struct mem_cgroup_reclaim_cookie reclaim = { 2593 .pgdat = pgdat, 2594 .priority = sc->priority, 2595 }; 2596 unsigned long node_lru_pages = 0; 2597 struct mem_cgroup *memcg; 2598 2599 nr_reclaimed = sc->nr_reclaimed; 2600 nr_scanned = sc->nr_scanned; 2601 2602 memcg = mem_cgroup_iter(root, NULL, &reclaim); 2603 do { 2604 unsigned long lru_pages; 2605 unsigned long reclaimed; 2606 unsigned long scanned; 2607 2608 if (mem_cgroup_low(root, memcg)) { 2609 if (!sc->memcg_low_reclaim) { 2610 sc->memcg_low_skipped = 1; 2611 continue; 2612 } 2613 mem_cgroup_event(memcg, MEMCG_LOW); 2614 } 2615 2616 reclaimed = sc->nr_reclaimed; 2617 scanned = sc->nr_scanned; 2618 2619 shrink_node_memcg(pgdat, memcg, sc, &lru_pages); 2620 node_lru_pages += lru_pages; 2621 2622 if (memcg) 2623 shrink_slab(sc->gfp_mask, pgdat->node_id, 2624 memcg, sc->nr_scanned - scanned, 2625 lru_pages); 2626 2627 /* Record the group's reclaim efficiency */ 2628 
vmpressure(sc->gfp_mask, memcg, false, 2629 sc->nr_scanned - scanned, 2630 sc->nr_reclaimed - reclaimed); 2631 2632 /* 2633 * Direct reclaim and kswapd have to scan all memory 2634 * cgroups to fulfill the overall scan target for the 2635 * node. 2636 * 2637 * Limit reclaim, on the other hand, only cares about 2638 * nr_to_reclaim pages to be reclaimed and it will 2639 * retry with decreasing priority if one round over the 2640 * whole hierarchy is not sufficient. 2641 */ 2642 if (!global_reclaim(sc) && 2643 sc->nr_reclaimed >= sc->nr_to_reclaim) { 2644 mem_cgroup_iter_break(root, memcg); 2645 break; 2646 } 2647 } while ((memcg = mem_cgroup_iter(root, memcg, &reclaim))); 2648 2649 /* 2650 * Shrink the slab caches in the same proportion that 2651 * the eligible LRU pages were scanned. 2652 */ 2653 if (global_reclaim(sc)) 2654 shrink_slab(sc->gfp_mask, pgdat->node_id, NULL, 2655 sc->nr_scanned - nr_scanned, 2656 node_lru_pages); 2657 2658 if (reclaim_state) { 2659 sc->nr_reclaimed += reclaim_state->reclaimed_slab; 2660 reclaim_state->reclaimed_slab = 0; 2661 } 2662 2663 /* Record the subtree's reclaim efficiency */ 2664 vmpressure(sc->gfp_mask, sc->target_mem_cgroup, true, 2665 sc->nr_scanned - nr_scanned, 2666 sc->nr_reclaimed - nr_reclaimed); 2667 2668 if (sc->nr_reclaimed - nr_reclaimed) 2669 reclaimable = true; 2670 2671 } while (should_continue_reclaim(pgdat, sc->nr_reclaimed - nr_reclaimed, 2672 sc->nr_scanned - nr_scanned, sc)); 2673 2674 /* 2675 * Kswapd gives up on balancing particular nodes after too 2676 * many failures to reclaim anything from them and goes to 2677 * sleep. On reclaim progress, reset the failure counter. A 2678 * successful direct reclaim run will revive a dormant kswapd. 2679 */ 2680 if (reclaimable) 2681 pgdat->kswapd_failures = 0; 2682 2683 return reclaimable; 2684 } 2685 2686 /* 2687 * Returns true if compaction should go ahead for a costly-order request, or 2688 * the allocation would already succeed without compaction. Return false if we 2689 * should reclaim first. 2690 */ 2691 static inline bool compaction_ready(struct zone *zone, struct scan_control *sc) 2692 { 2693 unsigned long watermark; 2694 enum compact_result suitable; 2695 2696 suitable = compaction_suitable(zone, sc->order, 0, sc->reclaim_idx); 2697 if (suitable == COMPACT_SUCCESS) 2698 /* Allocation should succeed already. Don't reclaim. */ 2699 return true; 2700 if (suitable == COMPACT_SKIPPED) 2701 /* Compaction cannot yet proceed. Do reclaim. */ 2702 return false; 2703 2704 /* 2705 * Compaction is already possible, but it takes time to run and there 2706 * are potentially other callers using the pages just freed. So proceed 2707 * with reclaim to make a buffer of free pages available to give 2708 * compaction a reasonable chance of completing and allocating the page. 2709 * Note that we won't actually reclaim the whole buffer in one attempt 2710 * as the target watermark in should_continue_reclaim() is lower. But if 2711 * we are already above the high+gap watermark, don't reclaim at all. 2712 */ 2713 watermark = high_wmark_pages(zone) + compact_gap(sc->order); 2714 2715 return zone_watermark_ok_safe(zone, 0, watermark, sc->reclaim_idx); 2716 } 2717 2718 /* 2719 * This is the direct reclaim path, for page-allocating processes. We only 2720 * try to reclaim pages from zones which will satisfy the caller's allocation 2721 * request. 2722 * 2723 * If a zone is deemed to be full of pinned pages then just give it a light 2724 * scan then give up on it. 
2725 */ 2726 static void shrink_zones(struct zonelist *zonelist, struct scan_control *sc) 2727 { 2728 struct zoneref *z; 2729 struct zone *zone; 2730 unsigned long nr_soft_reclaimed; 2731 unsigned long nr_soft_scanned; 2732 gfp_t orig_mask; 2733 pg_data_t *last_pgdat = NULL; 2734 2735 /* 2736 * If the number of buffer_heads in the machine exceeds the maximum 2737 * allowed level, force direct reclaim to scan the highmem zone as 2738 * highmem pages could be pinning lowmem pages storing buffer_heads 2739 */ 2740 orig_mask = sc->gfp_mask; 2741 if (buffer_heads_over_limit) { 2742 sc->gfp_mask |= __GFP_HIGHMEM; 2743 sc->reclaim_idx = gfp_zone(sc->gfp_mask); 2744 } 2745 2746 for_each_zone_zonelist_nodemask(zone, z, zonelist, 2747 sc->reclaim_idx, sc->nodemask) { 2748 /* 2749 * Take care memory controller reclaiming has small influence 2750 * to global LRU. 2751 */ 2752 if (global_reclaim(sc)) { 2753 if (!cpuset_zone_allowed(zone, 2754 GFP_KERNEL | __GFP_HARDWALL)) 2755 continue; 2756 2757 /* 2758 * If we already have plenty of memory free for 2759 * compaction in this zone, don't free any more. 2760 * Even though compaction is invoked for any 2761 * non-zero order, only frequent costly order 2762 * reclamation is disruptive enough to become a 2763 * noticeable problem, like transparent huge 2764 * page allocations. 2765 */ 2766 if (IS_ENABLED(CONFIG_COMPACTION) && 2767 sc->order > PAGE_ALLOC_COSTLY_ORDER && 2768 compaction_ready(zone, sc)) { 2769 sc->compaction_ready = true; 2770 continue; 2771 } 2772 2773 /* 2774 * Shrink each node in the zonelist once. If the 2775 * zonelist is ordered by zone (not the default) then a 2776 * node may be shrunk multiple times but in that case 2777 * the user prefers lower zones being preserved. 2778 */ 2779 if (zone->zone_pgdat == last_pgdat) 2780 continue; 2781 2782 /* 2783 * This steals pages from memory cgroups over softlimit 2784 * and returns the number of reclaimed pages and 2785 * scanned pages. This works for global memory pressure 2786 * and balancing, not for a memcg's limit. 2787 */ 2788 nr_soft_scanned = 0; 2789 nr_soft_reclaimed = mem_cgroup_soft_limit_reclaim(zone->zone_pgdat, 2790 sc->order, sc->gfp_mask, 2791 &nr_soft_scanned); 2792 sc->nr_reclaimed += nr_soft_reclaimed; 2793 sc->nr_scanned += nr_soft_scanned; 2794 /* need some check for avoid more shrink_zone() */ 2795 } 2796 2797 /* See comment about same check for global reclaim above */ 2798 if (zone->zone_pgdat == last_pgdat) 2799 continue; 2800 last_pgdat = zone->zone_pgdat; 2801 shrink_node(zone->zone_pgdat, sc); 2802 } 2803 2804 /* 2805 * Restore to original mask to avoid the impact on the caller if we 2806 * promoted it to __GFP_HIGHMEM. 2807 */ 2808 sc->gfp_mask = orig_mask; 2809 } 2810 2811 static void snapshot_refaults(struct mem_cgroup *root_memcg, pg_data_t *pgdat) 2812 { 2813 struct mem_cgroup *memcg; 2814 2815 memcg = mem_cgroup_iter(root_memcg, NULL, NULL); 2816 do { 2817 unsigned long refaults; 2818 struct lruvec *lruvec; 2819 2820 if (memcg) 2821 refaults = memcg_page_state(memcg, WORKINGSET_ACTIVATE); 2822 else 2823 refaults = node_page_state(pgdat, WORKINGSET_ACTIVATE); 2824 2825 lruvec = mem_cgroup_lruvec(pgdat, memcg); 2826 lruvec->refaults = refaults; 2827 } while ((memcg = mem_cgroup_iter(root_memcg, memcg, NULL))); 2828 } 2829 2830 /* 2831 * This is the main entry point to direct page reclaim. 2832 * 2833 * If a full scan of the inactive list fails to free enough memory then we 2834 * are "out of memory" and something needs to be killed. 
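 * (The priority loop below starts at DEF_PRIORITY and scans size >> priority
 * pages per LRU per pass, so each drop in priority roughly doubles the scan
 * window until priority 0 covers the whole list.)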
2835 * 2836 * If the caller is !__GFP_FS then the probability of a failure is reasonably 2837 * high - the zone may be full of dirty or under-writeback pages, which this 2838 * caller can't do much about. We kick the writeback threads and take explicit 2839 * naps in the hope that some of these pages can be written. But if the 2840 * allocating task holds filesystem locks which prevent writeout this might not 2841 * work, and the allocation attempt will fail. 2842 * 2843 * returns: 0, if no pages reclaimed 2844 * else, the number of pages reclaimed 2845 */ 2846 static unsigned long do_try_to_free_pages(struct zonelist *zonelist, 2847 struct scan_control *sc) 2848 { 2849 int initial_priority = sc->priority; 2850 pg_data_t *last_pgdat; 2851 struct zoneref *z; 2852 struct zone *zone; 2853 retry: 2854 delayacct_freepages_start(); 2855 2856 if (global_reclaim(sc)) 2857 __count_zid_vm_events(ALLOCSTALL, sc->reclaim_idx, 1); 2858 2859 do { 2860 vmpressure_prio(sc->gfp_mask, sc->target_mem_cgroup, 2861 sc->priority); 2862 sc->nr_scanned = 0; 2863 shrink_zones(zonelist, sc); 2864 2865 if (sc->nr_reclaimed >= sc->nr_to_reclaim) 2866 break; 2867 2868 if (sc->compaction_ready) 2869 break; 2870 2871 /* 2872 * If we're getting trouble reclaiming, start doing 2873 * writepage even in laptop mode. 2874 */ 2875 if (sc->priority < DEF_PRIORITY - 2) 2876 sc->may_writepage = 1; 2877 } while (--sc->priority >= 0); 2878 2879 last_pgdat = NULL; 2880 for_each_zone_zonelist_nodemask(zone, z, zonelist, sc->reclaim_idx, 2881 sc->nodemask) { 2882 if (zone->zone_pgdat == last_pgdat) 2883 continue; 2884 last_pgdat = zone->zone_pgdat; 2885 snapshot_refaults(sc->target_mem_cgroup, zone->zone_pgdat); 2886 } 2887 2888 delayacct_freepages_end(); 2889 2890 if (sc->nr_reclaimed) 2891 return sc->nr_reclaimed; 2892 2893 /* Aborted reclaim to try compaction? don't OOM, then */ 2894 if (sc->compaction_ready) 2895 return 1; 2896 2897 /* Untapped cgroup reserves? Don't OOM, retry. */ 2898 if (sc->memcg_low_skipped) { 2899 sc->priority = initial_priority; 2900 sc->memcg_low_reclaim = 1; 2901 sc->memcg_low_skipped = 0; 2902 goto retry; 2903 } 2904 2905 return 0; 2906 } 2907 2908 static bool allow_direct_reclaim(pg_data_t *pgdat) 2909 { 2910 struct zone *zone; 2911 unsigned long pfmemalloc_reserve = 0; 2912 unsigned long free_pages = 0; 2913 int i; 2914 bool wmark_ok; 2915 2916 if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES) 2917 return true; 2918 2919 for (i = 0; i <= ZONE_NORMAL; i++) { 2920 zone = &pgdat->node_zones[i]; 2921 if (!managed_zone(zone)) 2922 continue; 2923 2924 if (!zone_reclaimable_pages(zone)) 2925 continue; 2926 2927 pfmemalloc_reserve += min_wmark_pages(zone); 2928 free_pages += zone_page_state(zone, NR_FREE_PAGES); 2929 } 2930 2931 /* If there are no reserves (unexpected config) then do not throttle */ 2932 if (!pfmemalloc_reserve) 2933 return true; 2934 2935 wmark_ok = free_pages > pfmemalloc_reserve / 2; 2936 2937 /* kswapd must be awake if processes are being throttled */ 2938 if (!wmark_ok && waitqueue_active(&pgdat->kswapd_wait)) { 2939 pgdat->kswapd_classzone_idx = min(pgdat->kswapd_classzone_idx, 2940 (enum zone_type)ZONE_NORMAL); 2941 wake_up_interruptible(&pgdat->kswapd_wait); 2942 } 2943 2944 return wmark_ok; 2945 } 2946 2947 /* 2948 * Throttle direct reclaimers if backing storage is backed by the network 2949 * and the PFMEMALLOC reserve for the preferred node is getting dangerously 2950 * depleted. kswapd will continue to make progress and wake the processes 2951 * when the low watermark is reached. 
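 * (Illustration with assumed numbers: allow_direct_reclaim() above sums the
 * min watermarks of the usable zones up to ZONE_NORMAL; if those come to
 * 16MB, direct reclaimers are made to wait once free pages in those zones
 * drop below about 8MB, i.e. half the reserve.)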
2952 * 2953 * Returns true if a fatal signal was delivered during throttling. If this 2954 * happens, the page allocator should not consider triggering the OOM killer. 2955 */ 2956 static bool throttle_direct_reclaim(gfp_t gfp_mask, struct zonelist *zonelist, 2957 nodemask_t *nodemask) 2958 { 2959 struct zoneref *z; 2960 struct zone *zone; 2961 pg_data_t *pgdat = NULL; 2962 2963 /* 2964 * Kernel threads should not be throttled as they may be indirectly 2965 * responsible for cleaning pages necessary for reclaim to make forward 2966 * progress. kjournald for example may enter direct reclaim while 2967 * committing a transaction where throttling it could forcing other 2968 * processes to block on log_wait_commit(). 2969 */ 2970 if (current->flags & PF_KTHREAD) 2971 goto out; 2972 2973 /* 2974 * If a fatal signal is pending, this process should not throttle. 2975 * It should return quickly so it can exit and free its memory 2976 */ 2977 if (fatal_signal_pending(current)) 2978 goto out; 2979 2980 /* 2981 * Check if the pfmemalloc reserves are ok by finding the first node 2982 * with a usable ZONE_NORMAL or lower zone. The expectation is that 2983 * GFP_KERNEL will be required for allocating network buffers when 2984 * swapping over the network so ZONE_HIGHMEM is unusable. 2985 * 2986 * Throttling is based on the first usable node and throttled processes 2987 * wait on a queue until kswapd makes progress and wakes them. There 2988 * is an affinity then between processes waking up and where reclaim 2989 * progress has been made assuming the process wakes on the same node. 2990 * More importantly, processes running on remote nodes will not compete 2991 * for remote pfmemalloc reserves and processes on different nodes 2992 * should make reasonable progress. 2993 */ 2994 for_each_zone_zonelist_nodemask(zone, z, zonelist, 2995 gfp_zone(gfp_mask), nodemask) { 2996 if (zone_idx(zone) > ZONE_NORMAL) 2997 continue; 2998 2999 /* Throttle based on the first usable node */ 3000 pgdat = zone->zone_pgdat; 3001 if (allow_direct_reclaim(pgdat)) 3002 goto out; 3003 break; 3004 } 3005 3006 /* If no zone was usable by the allocation flags then do not throttle */ 3007 if (!pgdat) 3008 goto out; 3009 3010 /* Account for the throttling */ 3011 count_vm_event(PGSCAN_DIRECT_THROTTLE); 3012 3013 /* 3014 * If the caller cannot enter the filesystem, it's possible that it 3015 * is due to the caller holding an FS lock or performing a journal 3016 * transaction in the case of a filesystem like ext[3|4]. In this case, 3017 * it is not safe to block on pfmemalloc_wait as kswapd could be 3018 * blocked waiting on the same lock. Instead, throttle for up to a 3019 * second before continuing. 
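 * (The !__GFP_FS wait below uses a timeout of HZ jiffies, i.e. about one
 * second, whereas callers that may enter the filesystem sleep killably
 * until allow_direct_reclaim() becomes true.)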
3020 */ 3021 if (!(gfp_mask & __GFP_FS)) { 3022 wait_event_interruptible_timeout(pgdat->pfmemalloc_wait, 3023 allow_direct_reclaim(pgdat), HZ); 3024 3025 goto check_pending; 3026 } 3027 3028 /* Throttle until kswapd wakes the process */ 3029 wait_event_killable(zone->zone_pgdat->pfmemalloc_wait, 3030 allow_direct_reclaim(pgdat)); 3031 3032 check_pending: 3033 if (fatal_signal_pending(current)) 3034 return true; 3035 3036 out: 3037 return false; 3038 } 3039 3040 unsigned long try_to_free_pages(struct zonelist *zonelist, int order, 3041 gfp_t gfp_mask, nodemask_t *nodemask) 3042 { 3043 unsigned long nr_reclaimed; 3044 struct scan_control sc = { 3045 .nr_to_reclaim = SWAP_CLUSTER_MAX, 3046 .gfp_mask = current_gfp_context(gfp_mask), 3047 .reclaim_idx = gfp_zone(gfp_mask), 3048 .order = order, 3049 .nodemask = nodemask, 3050 .priority = DEF_PRIORITY, 3051 .may_writepage = !laptop_mode, 3052 .may_unmap = 1, 3053 .may_swap = 1, 3054 }; 3055 3056 /* 3057 * Do not enter reclaim if fatal signal was delivered while throttled. 3058 * 1 is returned so that the page allocator does not OOM kill at this 3059 * point. 3060 */ 3061 if (throttle_direct_reclaim(sc.gfp_mask, zonelist, nodemask)) 3062 return 1; 3063 3064 trace_mm_vmscan_direct_reclaim_begin(order, 3065 sc.may_writepage, 3066 sc.gfp_mask, 3067 sc.reclaim_idx); 3068 3069 nr_reclaimed = do_try_to_free_pages(zonelist, &sc); 3070 3071 trace_mm_vmscan_direct_reclaim_end(nr_reclaimed); 3072 3073 return nr_reclaimed; 3074 } 3075 3076 #ifdef CONFIG_MEMCG 3077 3078 unsigned long mem_cgroup_shrink_node(struct mem_cgroup *memcg, 3079 gfp_t gfp_mask, bool noswap, 3080 pg_data_t *pgdat, 3081 unsigned long *nr_scanned) 3082 { 3083 struct scan_control sc = { 3084 .nr_to_reclaim = SWAP_CLUSTER_MAX, 3085 .target_mem_cgroup = memcg, 3086 .may_writepage = !laptop_mode, 3087 .may_unmap = 1, 3088 .reclaim_idx = MAX_NR_ZONES - 1, 3089 .may_swap = !noswap, 3090 }; 3091 unsigned long lru_pages; 3092 3093 sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) | 3094 (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK); 3095 3096 trace_mm_vmscan_memcg_softlimit_reclaim_begin(sc.order, 3097 sc.may_writepage, 3098 sc.gfp_mask, 3099 sc.reclaim_idx); 3100 3101 /* 3102 * NOTE: Although we can get the priority field, using it 3103 * here is not a good idea, since it limits the pages we can scan. 3104 * if we don't reclaim here, the shrink_node from balance_pgdat 3105 * will pick up pages from other mem cgroup's as well. We hack 3106 * the priority and make it zero. 3107 */ 3108 shrink_node_memcg(pgdat, memcg, &sc, &lru_pages); 3109 3110 trace_mm_vmscan_memcg_softlimit_reclaim_end(sc.nr_reclaimed); 3111 3112 *nr_scanned = sc.nr_scanned; 3113 return sc.nr_reclaimed; 3114 } 3115 3116 unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg, 3117 unsigned long nr_pages, 3118 gfp_t gfp_mask, 3119 bool may_swap) 3120 { 3121 struct zonelist *zonelist; 3122 unsigned long nr_reclaimed; 3123 int nid; 3124 unsigned int noreclaim_flag; 3125 struct scan_control sc = { 3126 .nr_to_reclaim = max(nr_pages, SWAP_CLUSTER_MAX), 3127 .gfp_mask = (current_gfp_context(gfp_mask) & GFP_RECLAIM_MASK) | 3128 (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK), 3129 .reclaim_idx = MAX_NR_ZONES - 1, 3130 .target_mem_cgroup = memcg, 3131 .priority = DEF_PRIORITY, 3132 .may_writepage = !laptop_mode, 3133 .may_unmap = 1, 3134 .may_swap = may_swap, 3135 }; 3136 3137 /* 3138 * Unlike direct reclaim via alloc_pages(), memcg's reclaim doesn't 3139 * take care of from where we get pages. 
So the node where we start the 3140 * scan does not need to be the current node. 3141 */ 3142 nid = mem_cgroup_select_victim_node(memcg); 3143 3144 zonelist = &NODE_DATA(nid)->node_zonelists[ZONELIST_FALLBACK]; 3145 3146 trace_mm_vmscan_memcg_reclaim_begin(0, 3147 sc.may_writepage, 3148 sc.gfp_mask, 3149 sc.reclaim_idx); 3150 3151 noreclaim_flag = memalloc_noreclaim_save(); 3152 nr_reclaimed = do_try_to_free_pages(zonelist, &sc); 3153 memalloc_noreclaim_restore(noreclaim_flag); 3154 3155 trace_mm_vmscan_memcg_reclaim_end(nr_reclaimed); 3156 3157 return nr_reclaimed; 3158 } 3159 #endif 3160 3161 static void age_active_anon(struct pglist_data *pgdat, 3162 struct scan_control *sc) 3163 { 3164 struct mem_cgroup *memcg; 3165 3166 if (!total_swap_pages) 3167 return; 3168 3169 memcg = mem_cgroup_iter(NULL, NULL, NULL); 3170 do { 3171 struct lruvec *lruvec = mem_cgroup_lruvec(pgdat, memcg); 3172 3173 if (inactive_list_is_low(lruvec, false, memcg, sc, true)) 3174 shrink_active_list(SWAP_CLUSTER_MAX, lruvec, 3175 sc, LRU_ACTIVE_ANON); 3176 3177 memcg = mem_cgroup_iter(NULL, memcg, NULL); 3178 } while (memcg); 3179 } 3180 3181 /* 3182 * Returns true if there is an eligible zone balanced for the request order 3183 * and classzone_idx 3184 */ 3185 static bool pgdat_balanced(pg_data_t *pgdat, int order, int classzone_idx) 3186 { 3187 int i; 3188 unsigned long mark = -1; 3189 struct zone *zone; 3190 3191 for (i = 0; i <= classzone_idx; i++) { 3192 zone = pgdat->node_zones + i; 3193 3194 if (!managed_zone(zone)) 3195 continue; 3196 3197 mark = high_wmark_pages(zone); 3198 if (zone_watermark_ok_safe(zone, order, mark, classzone_idx)) 3199 return true; 3200 } 3201 3202 /* 3203 * If a node has no populated zone within classzone_idx, it does not 3204 * need balancing by definition. This can happen if a zone-restricted 3205 * allocation tries to wake a remote kswapd. 3206 */ 3207 if (mark == -1) 3208 return true; 3209 3210 return false; 3211 } 3212 3213 /* Clear pgdat state for congested, dirty or under writeback. */ 3214 static void clear_pgdat_congested(pg_data_t *pgdat) 3215 { 3216 clear_bit(PGDAT_CONGESTED, &pgdat->flags); 3217 clear_bit(PGDAT_DIRTY, &pgdat->flags); 3218 clear_bit(PGDAT_WRITEBACK, &pgdat->flags); 3219 } 3220 3221 /* 3222 * Prepare kswapd for sleeping. This verifies that there are no processes 3223 * waiting in throttle_direct_reclaim() and that watermarks have been met. 3224 * 3225 * Returns true if kswapd is ready to sleep 3226 */ 3227 static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, int classzone_idx) 3228 { 3229 /* 3230 * The throttled processes are normally woken up in balance_pgdat() as 3231 * soon as allow_direct_reclaim() is true. But there is a potential 3232 * race between when kswapd checks the watermarks and a process gets 3233 * throttled. There is also a potential race if processes get 3234 * throttled, kswapd wakes, a large process exits thereby balancing the 3235 * zones, which causes kswapd to exit balance_pgdat() before reaching 3236 * the wake up checks. If kswapd is going to sleep, no process should 3237 * be sleeping on pfmemalloc_wait, so wake them now if necessary. If 3238 * the wake up is premature, processes will wake kswapd and get 3239 * throttled again. The difference from wake ups in balance_pgdat() is 3240 * that here we are under prepare_to_wait(). 
3241 */ 3242 if (waitqueue_active(&pgdat->pfmemalloc_wait)) 3243 wake_up_all(&pgdat->pfmemalloc_wait); 3244 3245 /* Hopeless node, leave it to direct reclaim */ 3246 if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES) 3247 return true; 3248 3249 if (pgdat_balanced(pgdat, order, classzone_idx)) { 3250 clear_pgdat_congested(pgdat); 3251 return true; 3252 } 3253 3254 return false; 3255 } 3256 3257 /* 3258 * kswapd shrinks a node of pages that are at or below the highest usable 3259 * zone that is currently unbalanced. 3260 * 3261 * Returns true if kswapd scanned at least the requested number of pages to 3262 * reclaim or if the lack of progress was due to pages under writeback. 3263 * This is used to determine if the scanning priority needs to be raised. 3264 */ 3265 static bool kswapd_shrink_node(pg_data_t *pgdat, 3266 struct scan_control *sc) 3267 { 3268 struct zone *zone; 3269 int z; 3270 3271 /* Reclaim a number of pages proportional to the number of zones */ 3272 sc->nr_to_reclaim = 0; 3273 for (z = 0; z <= sc->reclaim_idx; z++) { 3274 zone = pgdat->node_zones + z; 3275 if (!managed_zone(zone)) 3276 continue; 3277 3278 sc->nr_to_reclaim += max(high_wmark_pages(zone), SWAP_CLUSTER_MAX); 3279 } 3280 3281 /* 3282 * Historically care was taken to put equal pressure on all zones but 3283 * now pressure is applied based on node LRU order. 3284 */ 3285 shrink_node(pgdat, sc); 3286 3287 /* 3288 * Fragmentation may mean that the system cannot be rebalanced for 3289 * high-order allocations. If twice the allocation size has been 3290 * reclaimed then recheck watermarks only at order-0 to prevent 3291 * excessive reclaim. Assume that a process requested a high-order 3292 * can direct reclaim/compact. 3293 */ 3294 if (sc->order && sc->nr_reclaimed >= compact_gap(sc->order)) 3295 sc->order = 0; 3296 3297 return sc->nr_scanned >= sc->nr_to_reclaim; 3298 } 3299 3300 /* 3301 * For kswapd, balance_pgdat() will reclaim pages across a node from zones 3302 * that are eligible for use by the caller until at least one zone is 3303 * balanced. 3304 * 3305 * Returns the order kswapd finished reclaiming at. 3306 * 3307 * kswapd scans the zones in the highmem->normal->dma direction. It skips 3308 * zones which have free_pages > high_wmark_pages(zone), but once a zone is 3309 * found to have free_pages <= high_wmark_pages(zone), any page is that zone 3310 * or lower is eligible for reclaim until at least one usable zone is 3311 * balanced. 3312 */ 3313 static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx) 3314 { 3315 int i; 3316 unsigned long nr_soft_reclaimed; 3317 unsigned long nr_soft_scanned; 3318 struct zone *zone; 3319 struct scan_control sc = { 3320 .gfp_mask = GFP_KERNEL, 3321 .order = order, 3322 .priority = DEF_PRIORITY, 3323 .may_writepage = !laptop_mode, 3324 .may_unmap = 1, 3325 .may_swap = 1, 3326 }; 3327 count_vm_event(PAGEOUTRUN); 3328 3329 do { 3330 unsigned long nr_reclaimed = sc.nr_reclaimed; 3331 bool raise_priority = true; 3332 3333 sc.reclaim_idx = classzone_idx; 3334 3335 /* 3336 * If the number of buffer_heads exceeds the maximum allowed 3337 * then consider reclaiming from all zones. This has a dual 3338 * purpose -- on 64-bit systems it is expected that 3339 * buffer_heads are stripped during active rotation. On 32-bit 3340 * systems, highmem pages can pin lowmem memory and shrinking 3341 * buffers can relieve lowmem pressure. 
Reclaim may still not
3342 * go ahead if all eligible zones for the original allocation
3343 * request are balanced to avoid excessive reclaim from kswapd.
3344 */
3345 if (buffer_heads_over_limit) {
3346 for (i = MAX_NR_ZONES - 1; i >= 0; i--) {
3347 zone = pgdat->node_zones + i;
3348 if (!managed_zone(zone))
3349 continue;
3350
3351 sc.reclaim_idx = i;
3352 break;
3353 }
3354 }
3355
3356 /*
3357 * Only reclaim if there are no eligible zones. Note that
3358 * sc.reclaim_idx is not used as buffer_heads_over_limit may
3359 * have adjusted it.
3360 */
3361 if (pgdat_balanced(pgdat, sc.order, classzone_idx))
3362 goto out;
3363
3364 /*
3365 * Do some background aging of the anon list, to give
3366 * pages a chance to be referenced before reclaiming. All
3367 * pages are rotated regardless of classzone as this is
3368 * about consistent aging.
3369 */
3370 age_active_anon(pgdat, &sc);
3371
3372 /*
3373 * If we're getting trouble reclaiming, start doing writepage
3374 * even in laptop mode.
3375 */
3376 if (sc.priority < DEF_PRIORITY - 2)
3377 sc.may_writepage = 1;
3378
3379 /* Call soft limit reclaim before calling shrink_node. */
3380 sc.nr_scanned = 0;
3381 nr_soft_scanned = 0;
3382 nr_soft_reclaimed = mem_cgroup_soft_limit_reclaim(pgdat, sc.order,
3383 sc.gfp_mask, &nr_soft_scanned);
3384 sc.nr_reclaimed += nr_soft_reclaimed;
3385
3386 /*
3387 * There should be no need to raise the scanning priority if
3388 * enough pages are already being scanned that the high
3389 * watermark would be met at 100% efficiency.
3390 */
3391 if (kswapd_shrink_node(pgdat, &sc))
3392 raise_priority = false;
3393
3394 /*
3395 * If the low watermark is met there is no need for processes
3396 * to be throttled on pfmemalloc_wait as they should now be
3397 * able to safely make forward progress. Wake them.
3398 */
3399 if (waitqueue_active(&pgdat->pfmemalloc_wait) &&
3400 allow_direct_reclaim(pgdat))
3401 wake_up_all(&pgdat->pfmemalloc_wait);
3402
3403 /* Check if kswapd should be suspending */
3404 if (try_to_freeze() || kthread_should_stop())
3405 break;
3406
3407 /*
3408 * Raise priority if scanning rate is too low or there was no
3409 * progress in reclaiming pages
3410 */
3411 nr_reclaimed = sc.nr_reclaimed - nr_reclaimed;
3412 if (raise_priority || !nr_reclaimed)
3413 sc.priority--;
3414 } while (sc.priority >= 1);
3415
3416 if (!sc.nr_reclaimed)
3417 pgdat->kswapd_failures++;
3418
3419 out:
3420 snapshot_refaults(NULL, pgdat);
3421 /*
3422 * Return the order kswapd stopped reclaiming at as
3423 * prepare_kswapd_sleep() takes it into account. If another caller
3424 * entered the allocator slow path while kswapd was awake, order will
3425 * remain at the higher level.
3426 */
3427 return sc.order;
3428 }
3429
3430 /*
3431 * pgdat->kswapd_classzone_idx is the highest zone index that a recent
3432 * allocation request woke kswapd for. When kswapd has not woken recently,
3433 * the value is MAX_NR_ZONES which is not a valid index. This compares a
3434 * given classzone and returns it or the highest classzone index kswapd
3435 * was recently woken for.
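 * (For example, if kswapd was last woken for a ZONE_NORMAL request and a new
 * request asks only for ZONE_DMA32, the larger index, ZONE_NORMAL, is
 * returned and the node keeps being balanced against it.)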
3436 */ 3437 static enum zone_type kswapd_classzone_idx(pg_data_t *pgdat, 3438 enum zone_type classzone_idx) 3439 { 3440 if (pgdat->kswapd_classzone_idx == MAX_NR_ZONES) 3441 return classzone_idx; 3442 3443 return max(pgdat->kswapd_classzone_idx, classzone_idx); 3444 } 3445 3446 static void kswapd_try_to_sleep(pg_data_t *pgdat, int alloc_order, int reclaim_order, 3447 unsigned int classzone_idx) 3448 { 3449 long remaining = 0; 3450 DEFINE_WAIT(wait); 3451 3452 if (freezing(current) || kthread_should_stop()) 3453 return; 3454 3455 prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE); 3456 3457 /* 3458 * Try to sleep for a short interval. Note that kcompactd will only be 3459 * woken if it is possible to sleep for a short interval. This is 3460 * deliberate on the assumption that if reclaim cannot keep an 3461 * eligible zone balanced that it's also unlikely that compaction will 3462 * succeed. 3463 */ 3464 if (prepare_kswapd_sleep(pgdat, reclaim_order, classzone_idx)) { 3465 /* 3466 * Compaction records what page blocks it recently failed to 3467 * isolate pages from and skips them in the future scanning. 3468 * When kswapd is going to sleep, it is reasonable to assume 3469 * that pages and compaction may succeed so reset the cache. 3470 */ 3471 reset_isolation_suitable(pgdat); 3472 3473 /* 3474 * We have freed the memory, now we should compact it to make 3475 * allocation of the requested order possible. 3476 */ 3477 wakeup_kcompactd(pgdat, alloc_order, classzone_idx); 3478 3479 remaining = schedule_timeout(HZ/10); 3480 3481 /* 3482 * If woken prematurely then reset kswapd_classzone_idx and 3483 * order. The values will either be from a wakeup request or 3484 * the previous request that slept prematurely. 3485 */ 3486 if (remaining) { 3487 pgdat->kswapd_classzone_idx = kswapd_classzone_idx(pgdat, classzone_idx); 3488 pgdat->kswapd_order = max(pgdat->kswapd_order, reclaim_order); 3489 } 3490 3491 finish_wait(&pgdat->kswapd_wait, &wait); 3492 prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE); 3493 } 3494 3495 /* 3496 * After a short sleep, check if it was a premature sleep. If not, then 3497 * go fully to sleep until explicitly woken up. 3498 */ 3499 if (!remaining && 3500 prepare_kswapd_sleep(pgdat, reclaim_order, classzone_idx)) { 3501 trace_mm_vmscan_kswapd_sleep(pgdat->node_id); 3502 3503 /* 3504 * vmstat counters are not perfectly accurate and the estimated 3505 * value for counters such as NR_FREE_PAGES can deviate from the 3506 * true value by nr_online_cpus * threshold. To avoid the zone 3507 * watermarks being breached while under pressure, we reduce the 3508 * per-cpu vmstat threshold while kswapd is awake and restore 3509 * them before going back to sleep. 3510 */ 3511 set_pgdat_percpu_threshold(pgdat, calculate_normal_threshold); 3512 3513 if (!kthread_should_stop()) 3514 schedule(); 3515 3516 set_pgdat_percpu_threshold(pgdat, calculate_pressure_threshold); 3517 } else { 3518 if (remaining) 3519 count_vm_event(KSWAPD_LOW_WMARK_HIT_QUICKLY); 3520 else 3521 count_vm_event(KSWAPD_HIGH_WMARK_HIT_QUICKLY); 3522 } 3523 finish_wait(&pgdat->kswapd_wait, &wait); 3524 } 3525 3526 /* 3527 * The background pageout daemon, started as a kernel thread 3528 * from the init process. 3529 * 3530 * This basically trickles out pages so that we have _some_ 3531 * free memory available even if there is no other activity 3532 * that frees anything up. 
This is needed for things like routing 3533 * etc, where we otherwise might have all activity going on in 3534 * asynchronous contexts that cannot page things out. 3535 * 3536 * If there are applications that are active memory-allocators 3537 * (most normal use), this basically shouldn't matter. 3538 */ 3539 static int kswapd(void *p) 3540 { 3541 unsigned int alloc_order, reclaim_order; 3542 unsigned int classzone_idx = MAX_NR_ZONES - 1; 3543 pg_data_t *pgdat = (pg_data_t*)p; 3544 struct task_struct *tsk = current; 3545 3546 struct reclaim_state reclaim_state = { 3547 .reclaimed_slab = 0, 3548 }; 3549 const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id); 3550 3551 if (!cpumask_empty(cpumask)) 3552 set_cpus_allowed_ptr(tsk, cpumask); 3553 current->reclaim_state = &reclaim_state; 3554 3555 /* 3556 * Tell the memory management that we're a "memory allocator", 3557 * and that if we need more memory we should get access to it 3558 * regardless (see "__alloc_pages()"). "kswapd" should 3559 * never get caught in the normal page freeing logic. 3560 * 3561 * (Kswapd normally doesn't need memory anyway, but sometimes 3562 * you need a small amount of memory in order to be able to 3563 * page out something else, and this flag essentially protects 3564 * us from recursively trying to free more memory as we're 3565 * trying to free the first piece of memory in the first place). 3566 */ 3567 tsk->flags |= PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD; 3568 set_freezable(); 3569 3570 pgdat->kswapd_order = 0; 3571 pgdat->kswapd_classzone_idx = MAX_NR_ZONES; 3572 for ( ; ; ) { 3573 bool ret; 3574 3575 alloc_order = reclaim_order = pgdat->kswapd_order; 3576 classzone_idx = kswapd_classzone_idx(pgdat, classzone_idx); 3577 3578 kswapd_try_sleep: 3579 kswapd_try_to_sleep(pgdat, alloc_order, reclaim_order, 3580 classzone_idx); 3581 3582 /* Read the new order and classzone_idx */ 3583 alloc_order = reclaim_order = pgdat->kswapd_order; 3584 classzone_idx = kswapd_classzone_idx(pgdat, 0); 3585 pgdat->kswapd_order = 0; 3586 pgdat->kswapd_classzone_idx = MAX_NR_ZONES; 3587 3588 ret = try_to_freeze(); 3589 if (kthread_should_stop()) 3590 break; 3591 3592 /* 3593 * We can speed up thawing tasks if we don't call balance_pgdat 3594 * after returning from the refrigerator 3595 */ 3596 if (ret) 3597 continue; 3598 3599 /* 3600 * Reclaim begins at the requested order but if a high-order 3601 * reclaim fails then kswapd falls back to reclaiming for 3602 * order-0. If that happens, kswapd will consider sleeping 3603 * for the order it finished reclaiming at (reclaim_order) 3604 * but kcompactd is woken to compact for the original 3605 * request (alloc_order). 3606 */ 3607 trace_mm_vmscan_kswapd_wake(pgdat->node_id, classzone_idx, 3608 alloc_order); 3609 fs_reclaim_acquire(GFP_KERNEL); 3610 reclaim_order = balance_pgdat(pgdat, alloc_order, classzone_idx); 3611 fs_reclaim_release(GFP_KERNEL); 3612 if (reclaim_order < alloc_order) 3613 goto kswapd_try_sleep; 3614 } 3615 3616 tsk->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD); 3617 current->reclaim_state = NULL; 3618 3619 return 0; 3620 } 3621 3622 /* 3623 * A zone is low on free memory, so wake its kswapd task to service it. 
/*
 * A zone is low on free memory, so wake its kswapd task to service it.
 */
void wakeup_kswapd(struct zone *zone, int order, enum zone_type classzone_idx)
{
	pg_data_t *pgdat;

	if (!managed_zone(zone))
		return;

	if (!cpuset_zone_allowed(zone, GFP_KERNEL | __GFP_HARDWALL))
		return;
	pgdat = zone->zone_pgdat;
	pgdat->kswapd_classzone_idx = kswapd_classzone_idx(pgdat,
							   classzone_idx);
	pgdat->kswapd_order = max(pgdat->kswapd_order, order);
	if (!waitqueue_active(&pgdat->kswapd_wait))
		return;

	/* Hopeless node, leave it to direct reclaim */
	if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES)
		return;

	if (pgdat_balanced(pgdat, order, classzone_idx))
		return;

	trace_mm_vmscan_wakeup_kswapd(pgdat->node_id, classzone_idx, order);
	wake_up_interruptible(&pgdat->kswapd_wait);
}

#ifdef CONFIG_HIBERNATION
/*
 * Try to free `nr_to_reclaim' of memory, system-wide, and return the number of
 * freed pages.
 *
 * Rather than trying to age LRUs the aim is to preserve the overall
 * LRU order by reclaiming preferentially
 * inactive > active > active referenced > active mapped
 */
unsigned long shrink_all_memory(unsigned long nr_to_reclaim)
{
	struct reclaim_state reclaim_state;
	struct scan_control sc = {
		.nr_to_reclaim = nr_to_reclaim,
		.gfp_mask = GFP_HIGHUSER_MOVABLE,
		.reclaim_idx = MAX_NR_ZONES - 1,
		.priority = DEF_PRIORITY,
		.may_writepage = 1,
		.may_unmap = 1,
		.may_swap = 1,
		.hibernation_mode = 1,
	};
	struct zonelist *zonelist = node_zonelist(numa_node_id(), sc.gfp_mask);
	struct task_struct *p = current;
	unsigned long nr_reclaimed;
	unsigned int noreclaim_flag;

	noreclaim_flag = memalloc_noreclaim_save();
	fs_reclaim_acquire(sc.gfp_mask);
	reclaim_state.reclaimed_slab = 0;
	p->reclaim_state = &reclaim_state;

	nr_reclaimed = do_try_to_free_pages(zonelist, &sc);

	p->reclaim_state = NULL;
	fs_reclaim_release(sc.gfp_mask);
	memalloc_noreclaim_restore(noreclaim_flag);

	return nr_reclaimed;
}
#endif /* CONFIG_HIBERNATION */

/* It's optimal to keep kswapds on the same CPUs as their memory, but
   not required for correctness.  So if the last cpu in a node goes
   away, we get changed to run anywhere: as the first one comes back,
   restore their cpu bindings. */
static int kswapd_cpu_online(unsigned int cpu)
{
	int nid;

	for_each_node_state(nid, N_MEMORY) {
		pg_data_t *pgdat = NODE_DATA(nid);
		const struct cpumask *mask;

		mask = cpumask_of_node(pgdat->node_id);

		if (cpumask_any_and(cpu_online_mask, mask) < nr_cpu_ids)
			/* One of our CPUs online: restore mask */
			set_cpus_allowed_ptr(pgdat->kswapd, mask);
	}
	return 0;
}
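/*
 * A minimal sketch of how wakeup_kswapd() above is typically driven
 * (assumed caller, not defined in this file): the page allocator slow
 * path wakes kswapd for every eligible zone before it resorts to direct
 * reclaim, roughly along the lines of
 *
 *	struct zoneref *z;
 *	struct zone *zone;
 *
 *	for_each_zone_zonelist_nodemask(zone, z, zonelist, high_zoneidx,
 *					nodemask)
 *		wakeup_kswapd(zone, order, high_zoneidx);
 *
 * Duplicate wakeups on the same node are harmless: wakeup_kswapd() only
 * merges the request into kswapd_order/kswapd_classzone_idx and returns
 * early if the node is already balanced or nothing is waiting on the
 * kswapd wait queue.  The loop variables here stand in for the
 * allocator's own context.
 */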
/*
 * This kswapd start function will be called by init and node-hot-add.
 * On node-hot-add, kswapd will be moved to the proper cpus if cpus are hot-added.
 */
int kswapd_run(int nid)
{
	pg_data_t *pgdat = NODE_DATA(nid);
	int ret = 0;

	if (pgdat->kswapd)
		return 0;

	pgdat->kswapd = kthread_run(kswapd, pgdat, "kswapd%d", nid);
	if (IS_ERR(pgdat->kswapd)) {
		/* failure at boot is fatal */
		BUG_ON(system_state < SYSTEM_RUNNING);
		pr_err("Failed to start kswapd on node %d\n", nid);
		ret = PTR_ERR(pgdat->kswapd);
		pgdat->kswapd = NULL;
	}
	return ret;
}

/*
 * Called by memory hotplug when all memory in a node is offlined.  Caller must
 * hold mem_hotplug_begin/end().
 */
void kswapd_stop(int nid)
{
	struct task_struct *kswapd = NODE_DATA(nid)->kswapd;

	if (kswapd) {
		kthread_stop(kswapd);
		NODE_DATA(nid)->kswapd = NULL;
	}
}

static int __init kswapd_init(void)
{
	int nid, ret;

	swap_setup();
	for_each_node_state(nid, N_MEMORY)
		kswapd_run(nid);
	ret = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN,
					"mm/vmscan:online", kswapd_cpu_online,
					NULL);
	WARN_ON(ret < 0);
	return 0;
}

module_init(kswapd_init)

#ifdef CONFIG_NUMA
/*
 * Node reclaim mode
 *
 * If non-zero call node_reclaim when the number of free pages falls below
 * the watermarks.
 */
int node_reclaim_mode __read_mostly;

#define RECLAIM_OFF 0
#define RECLAIM_ZONE (1<<0)	/* Run shrink_inactive_list on the zone */
#define RECLAIM_WRITE (1<<1)	/* Writeout pages during reclaim */
#define RECLAIM_UNMAP (1<<2)	/* Unmap pages during reclaim */

/*
 * Priority for NODE_RECLAIM. This determines the fraction of pages
 * of a node considered for each node_reclaim run. A priority of 4
 * scans 1/16th of the node.
 */
#define NODE_RECLAIM_PRIORITY 4

/*
 * Percentage of pages in a node that must be unmapped for node_reclaim to
 * occur.
 */
int sysctl_min_unmapped_ratio = 1;

/*
 * If the number of slab pages in a node grows beyond this percentage then
 * slab reclaim needs to occur.
 */
int sysctl_min_slab_ratio = 5;

static inline unsigned long node_unmapped_file_pages(struct pglist_data *pgdat)
{
	unsigned long file_mapped = node_page_state(pgdat, NR_FILE_MAPPED);
	unsigned long file_lru = node_page_state(pgdat, NR_INACTIVE_FILE) +
		node_page_state(pgdat, NR_ACTIVE_FILE);

	/*
	 * It's possible for there to be more file mapped pages than
	 * accounted for by the pages on the file LRU lists because
	 * tmpfs pages accounted for as ANON can also be FILE_MAPPED
	 */
	return (file_lru > file_mapped) ? (file_lru - file_mapped) : 0;
}
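/*
 * Worked example for the knobs above (values are illustrative):
 * node_reclaim_mode is a bitmask, so writing 5 to the corresponding
 * sysctl selects
 *
 *	RECLAIM_ZONE | RECLAIM_UNMAP	== (1 << 0) | (1 << 2) == 5
 *
 * i.e. reclaim is attempted and mapped pages may be unmapped, but dirty
 * page cache is not written back (RECLAIM_WRITE is clear).  Independent
 * of the mode, NODE_RECLAIM_PRIORITY == 4 means each reclaim pass
 * considers roughly node_size >> 4, i.e. 1/16th of the node's pages.
 */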
/* Work out how many page cache pages we can reclaim in this reclaim_mode */
static unsigned long node_pagecache_reclaimable(struct pglist_data *pgdat)
{
	unsigned long nr_pagecache_reclaimable;
	unsigned long delta = 0;

	/*
	 * If RECLAIM_UNMAP is set, then all file pages are considered
	 * potentially reclaimable. Otherwise, we have to worry about
	 * pages like swapcache, and node_unmapped_file_pages() provides
	 * a better estimate.
	 */
	if (node_reclaim_mode & RECLAIM_UNMAP)
		nr_pagecache_reclaimable = node_page_state(pgdat, NR_FILE_PAGES);
	else
		nr_pagecache_reclaimable = node_unmapped_file_pages(pgdat);

	/* If we can't clean pages, remove dirty pages from consideration */
	if (!(node_reclaim_mode & RECLAIM_WRITE))
		delta += node_page_state(pgdat, NR_FILE_DIRTY);

	/* Watch for any possible underflows due to delta */
	if (unlikely(delta > nr_pagecache_reclaimable))
		delta = nr_pagecache_reclaimable;

	return nr_pagecache_reclaimable - delta;
}

/*
 * Try to free up some pages from this node through reclaim.
 */
static int __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned int order)
{
	/* Minimum pages needed in order to stay on node */
	const unsigned long nr_pages = 1 << order;
	struct task_struct *p = current;
	struct reclaim_state reclaim_state;
	unsigned int noreclaim_flag;
	struct scan_control sc = {
		.nr_to_reclaim = max(nr_pages, SWAP_CLUSTER_MAX),
		.gfp_mask = current_gfp_context(gfp_mask),
		.order = order,
		.priority = NODE_RECLAIM_PRIORITY,
		.may_writepage = !!(node_reclaim_mode & RECLAIM_WRITE),
		.may_unmap = !!(node_reclaim_mode & RECLAIM_UNMAP),
		.may_swap = 1,
		.reclaim_idx = gfp_zone(gfp_mask),
	};

	cond_resched();
	/*
	 * We need to be able to allocate from the reserves for RECLAIM_UNMAP
	 * and we also need to be able to write out pages for RECLAIM_WRITE
	 * and RECLAIM_UNMAP.
	 */
	noreclaim_flag = memalloc_noreclaim_save();
	p->flags |= PF_SWAPWRITE;
	fs_reclaim_acquire(sc.gfp_mask);
	reclaim_state.reclaimed_slab = 0;
	p->reclaim_state = &reclaim_state;

	if (node_pagecache_reclaimable(pgdat) > pgdat->min_unmapped_pages) {
		/*
		 * Free memory by calling shrink_node() with increasing
		 * priorities until we have enough memory freed.
		 */
		do {
			shrink_node(pgdat, &sc);
		} while (sc.nr_reclaimed < nr_pages && --sc.priority >= 0);
	}

	p->reclaim_state = NULL;
	fs_reclaim_release(gfp_mask);
	current->flags &= ~PF_SWAPWRITE;
	memalloc_noreclaim_restore(noreclaim_flag);
	return sc.nr_reclaimed >= nr_pages;
}
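/*
 * A small worked example for node_pagecache_reclaimable() above, with
 * made-up counter values: assume the node has 10000 file-LRU pages, of
 * which 6000 are mapped and 500 are dirty, and node_reclaim_mode has
 * neither RECLAIM_UNMAP nor RECLAIM_WRITE set.  Then
 *
 *	node_unmapped_file_pages()   == 10000 - 6000 == 4000
 *	delta                        == 500	(dirty pages we may not write)
 *	node_pagecache_reclaimable() == 4000 - 500   == 3500
 *
 * and __node_reclaim() only runs its shrink_node() loop when this figure
 * exceeds pgdat->min_unmapped_pages, the per-node threshold derived from
 * sysctl_min_unmapped_ratio.
 */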
int node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned int order)
{
	int ret;

	/*
	 * Node reclaim reclaims unmapped file backed pages and
	 * slab pages if we are over the defined limits.
	 *
	 * A small portion of unmapped file backed pages is needed for
	 * file I/O otherwise pages read by file I/O will be immediately
	 * thrown out if the node is overallocated. So we do not reclaim
	 * if less than a specified percentage of the node is used by
	 * unmapped file backed pages.
	 */
	if (node_pagecache_reclaimable(pgdat) <= pgdat->min_unmapped_pages &&
	    node_page_state(pgdat, NR_SLAB_RECLAIMABLE) <= pgdat->min_slab_pages)
		return NODE_RECLAIM_FULL;

	/*
	 * Do not scan if the allocation should not be delayed.
	 */
	if (!gfpflags_allow_blocking(gfp_mask) || (current->flags & PF_MEMALLOC))
		return NODE_RECLAIM_NOSCAN;

	/*
	 * Only run node reclaim on the local node or on nodes that do not
	 * have associated processors. This will favor the local processor
	 * over remote processors and spread off node memory allocations
	 * as wide as possible.
	 */
	if (node_state(pgdat->node_id, N_CPU) && pgdat->node_id != numa_node_id())
		return NODE_RECLAIM_NOSCAN;

	if (test_and_set_bit(PGDAT_RECLAIM_LOCKED, &pgdat->flags))
		return NODE_RECLAIM_NOSCAN;

	ret = __node_reclaim(pgdat, gfp_mask, order);
	clear_bit(PGDAT_RECLAIM_LOCKED, &pgdat->flags);

	if (!ret)
		count_vm_event(PGSCAN_ZONE_RECLAIM_FAILED);

	return ret;
}
#endif

/*
 * page_evictable - test whether a page is evictable
 * @page: the page to test
 *
 * Test whether page is evictable--i.e., should be placed on active/inactive
 * lists vs unevictable list.
 *
 * Reasons page might not be evictable:
 * (1) page's mapping marked unevictable
 * (2) page is part of an mlocked VMA
 *
 */
int page_evictable(struct page *page)
{
	return !mapping_unevictable(page_mapping(page)) && !PageMlocked(page);
}

#ifdef CONFIG_SHMEM
/**
 * check_move_unevictable_pages - check pages for evictability and move to appropriate zone lru list
 * @pages:	array of pages to check
 * @nr_pages:	number of pages to check
 *
 * Checks pages for evictability and moves them to the appropriate lru list.
 *
 * This function is only used for SysV IPC SHM_UNLOCK.
 */
void check_move_unevictable_pages(struct page **pages, int nr_pages)
{
	struct lruvec *lruvec;
	struct pglist_data *pgdat = NULL;
	int pgscanned = 0;
	int pgrescued = 0;
	int i;

	for (i = 0; i < nr_pages; i++) {
		struct page *page = pages[i];
		struct pglist_data *pagepgdat = page_pgdat(page);

		pgscanned++;
		if (pagepgdat != pgdat) {
			if (pgdat)
				spin_unlock_irq(&pgdat->lru_lock);
			pgdat = pagepgdat;
			spin_lock_irq(&pgdat->lru_lock);
		}
		lruvec = mem_cgroup_page_lruvec(page, pgdat);

		if (!PageLRU(page) || !PageUnevictable(page))
			continue;

		if (page_evictable(page)) {
			enum lru_list lru = page_lru_base_type(page);

			VM_BUG_ON_PAGE(PageActive(page), page);
			ClearPageUnevictable(page);
			del_page_from_lru_list(page, lruvec, LRU_UNEVICTABLE);
			add_page_to_lru_list(page, lruvec, lru);
			pgrescued++;
		}
	}

	if (pgdat) {
		__count_vm_events(UNEVICTABLE_PGRESCUED, pgrescued);
		__count_vm_events(UNEVICTABLE_PGSCANNED, pgscanned);
		spin_unlock_irq(&pgdat->lru_lock);
	}
}
#endif /* CONFIG_SHMEM */
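/*
 * Usage sketch for check_move_unevictable_pages() (assumed caller, shown
 * only for illustration): the SysV IPC SHM_UNLOCK path walks the shmem
 * mapping in pagevec-sized batches and hands each batch to this
 * function, roughly
 *
 *	struct pagevec pvec;
 *	pgoff_t index = 0;
 *
 *	pagevec_init(&pvec);
 *	while (pagevec_lookup(&pvec, mapping, &index)) {
 *		check_move_unevictable_pages(pvec.pages, pvec.nr);
 *		pagevec_release(&pvec);
 *		cond_resched();
 *	}
 *
 * so pages that became evictable once the segment was unlocked drain
 * back onto the regular LRU lists.  The exact batching details live with
 * the caller (shmem), not in this file.
 */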