1 // SPDX-License-Identifier: GPL-2.0 2 /* 3 * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds 4 * 5 * Swap reorganised 29.12.95, Stephen Tweedie. 6 * kswapd added: 7.1.96 sct 7 * Removed kswapd_ctl limits, and swap out as many pages as needed 8 * to bring the system back to freepages.high: 2.4.97, Rik van Riel. 9 * Zone aware kswapd started 02/00, Kanoj Sarcar (kanoj@sgi.com). 10 * Multiqueue VM started 5.8.00, Rik van Riel. 11 */ 12 13 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt 14 15 #include <linux/mm.h> 16 #include <linux/sched/mm.h> 17 #include <linux/module.h> 18 #include <linux/gfp.h> 19 #include <linux/kernel_stat.h> 20 #include <linux/swap.h> 21 #include <linux/pagemap.h> 22 #include <linux/init.h> 23 #include <linux/highmem.h> 24 #include <linux/vmpressure.h> 25 #include <linux/vmstat.h> 26 #include <linux/file.h> 27 #include <linux/writeback.h> 28 #include <linux/blkdev.h> 29 #include <linux/buffer_head.h> /* for buffer_heads_over_limit */ 30 #include <linux/mm_inline.h> 31 #include <linux/backing-dev.h> 32 #include <linux/rmap.h> 33 #include <linux/topology.h> 34 #include <linux/cpu.h> 35 #include <linux/cpuset.h> 36 #include <linux/compaction.h> 37 #include <linux/notifier.h> 38 #include <linux/rwsem.h> 39 #include <linux/delay.h> 40 #include <linux/kthread.h> 41 #include <linux/freezer.h> 42 #include <linux/memcontrol.h> 43 #include <linux/migrate.h> 44 #include <linux/delayacct.h> 45 #include <linux/sysctl.h> 46 #include <linux/memory-tiers.h> 47 #include <linux/oom.h> 48 #include <linux/pagevec.h> 49 #include <linux/prefetch.h> 50 #include <linux/printk.h> 51 #include <linux/dax.h> 52 #include <linux/psi.h> 53 #include <linux/pagewalk.h> 54 #include <linux/shmem_fs.h> 55 #include <linux/ctype.h> 56 #include <linux/debugfs.h> 57 #include <linux/khugepaged.h> 58 #include <linux/rculist_nulls.h> 59 #include <linux/random.h> 60 61 #include <asm/tlbflush.h> 62 #include <asm/div64.h> 63 64 #include <linux/swapops.h> 65 #include <linux/balloon_compaction.h> 66 #include <linux/sched/sysctl.h> 67 68 #include "internal.h" 69 #include "swap.h" 70 71 #define CREATE_TRACE_POINTS 72 #include <trace/events/vmscan.h> 73 74 struct scan_control { 75 /* How many pages shrink_list() should reclaim */ 76 unsigned long nr_to_reclaim; 77 78 /* 79 * Nodemask of nodes allowed by the caller. If NULL, all nodes 80 * are scanned. 81 */ 82 nodemask_t *nodemask; 83 84 /* 85 * The memory cgroup that hit its limit and as a result is the 86 * primary target of this reclaim invocation. 87 */ 88 struct mem_cgroup *target_mem_cgroup; 89 90 /* 91 * Scan pressure balancing between anon and file LRUs 92 */ 93 unsigned long anon_cost; 94 unsigned long file_cost; 95 96 /* Can active folios be deactivated as part of reclaim? */ 97 #define DEACTIVATE_ANON 1 98 #define DEACTIVATE_FILE 2 99 unsigned int may_deactivate:2; 100 unsigned int force_deactivate:1; 101 unsigned int skipped_deactivate:1; 102 103 /* Writepage batching in laptop mode; RECLAIM_WRITE */ 104 unsigned int may_writepage:1; 105 106 /* Can mapped folios be reclaimed? */ 107 unsigned int may_unmap:1; 108 109 /* Can folios be swapped as part of reclaim? */ 110 unsigned int may_swap:1; 111 112 /* Proactive reclaim invoked by userspace through memory.reclaim */ 113 unsigned int proactive:1; 114 115 /* 116 * Cgroup memory below memory.low is protected as long as we 117 * don't threaten to OOM. If any cgroup is reclaimed at 118 * reduced force or passed over entirely due to its memory.low 119 * setting (memcg_low_skipped), and nothing is reclaimed as a 120 * result, then go back for one more cycle that reclaims the protected 121 * memory (memcg_low_reclaim) to avert OOM. 122 */ 123 unsigned int memcg_low_reclaim:1; 124 unsigned int memcg_low_skipped:1; 125 126 unsigned int hibernation_mode:1; 127 128 /* One of the zones is ready for compaction */ 129 unsigned int compaction_ready:1; 130 131 /* There is easily reclaimable cold cache in the current node */ 132 unsigned int cache_trim_mode:1; 133 134 /* The file folios on the current node are dangerously low */ 135 unsigned int file_is_tiny:1; 136 137 /* Always discard instead of demoting to lower tier memory */ 138 unsigned int no_demotion:1; 139 140 /* Allocation order */ 141 s8 order; 142 143 /* Scan (total_size >> priority) pages at once */ 144 s8 priority; 145 146 /* The highest zone to isolate folios for reclaim from */ 147 s8 reclaim_idx; 148 149 /* This context's GFP mask */ 150 gfp_t gfp_mask; 151 152 /* Incremented by the number of inactive pages that were scanned */ 153 unsigned long nr_scanned; 154 155 /* Number of pages freed so far during a call to shrink_zones() */ 156 unsigned long nr_reclaimed; 157 158 struct { 159 unsigned int dirty; 160 unsigned int unqueued_dirty; 161 unsigned int congested; 162 unsigned int writeback; 163 unsigned int immediate; 164 unsigned int file_taken; 165 unsigned int taken; 166 } nr; 167 168 /* for recording the reclaimed slab by now */ 169 struct reclaim_state reclaim_state; 170 }; 171 172 #ifdef ARCH_HAS_PREFETCHW 173 #define prefetchw_prev_lru_folio(_folio, _base, _field) \ 174 do { \ 175 if ((_folio)->lru.prev != _base) { \ 176 struct folio *prev; \ 177 \ 178 prev = lru_to_folio(&(_folio->lru)); \ 179 prefetchw(&prev->_field); \ 180 } \ 181 } while (0) 182 #else 183 #define prefetchw_prev_lru_folio(_folio, _base, _field) do { } while (0) 184 #endif 185 186 /* 187 * From 0 .. 200. Higher means more swappy. 188 */ 189 int vm_swappiness = 60; 190 191 LIST_HEAD(shrinker_list); 192 DECLARE_RWSEM(shrinker_rwsem); 193 194 #ifdef CONFIG_MEMCG 195 static int shrinker_nr_max; 196 197 /* The shrinker_info is expanded in a batch of BITS_PER_LONG */ 198 static inline int shrinker_map_size(int nr_items) 199 { 200 return (DIV_ROUND_UP(nr_items, BITS_PER_LONG) * sizeof(unsigned long)); 201 } 202 203 static inline int shrinker_defer_size(int nr_items) 204 { 205 return (round_up(nr_items, BITS_PER_LONG) * sizeof(atomic_long_t)); 206 } 207 208 static struct shrinker_info *shrinker_info_protected(struct mem_cgroup *memcg, 209 int nid) 210 { 211 return rcu_dereference_protected(memcg->nodeinfo[nid]->shrinker_info, 212 lockdep_is_held(&shrinker_rwsem)); 213 } 214 215 static int expand_one_shrinker_info(struct mem_cgroup *memcg, 216 int map_size, int defer_size, 217 int old_map_size, int old_defer_size, 218 int new_nr_max) 219 { 220 struct shrinker_info *new, *old; 221 struct mem_cgroup_per_node *pn; 222 int nid; 223 int size = map_size + defer_size; 224 225 for_each_node(nid) { 226 pn = memcg->nodeinfo[nid]; 227 old = shrinker_info_protected(memcg, nid); 228 /* Not yet online memcg */ 229 if (!old) 230 return 0; 231 232 /* Already expanded this shrinker_info */ 233 if (new_nr_max <= old->map_nr_max) 234 continue; 235 236 new = kvmalloc_node(sizeof(*new) + size, GFP_KERNEL, nid); 237 if (!new) 238 return -ENOMEM; 239 240 new->nr_deferred = (atomic_long_t *)(new + 1); 241 new->map = (void *)new->nr_deferred + defer_size; 242 new->map_nr_max = new_nr_max; 243 244 /* map: set all old bits, clear all new bits */ 245 memset(new->map, (int)0xff, old_map_size); 246 memset((void *)new->map + old_map_size, 0, map_size - old_map_size); 247 /* nr_deferred: copy old values, clear all new values */ 248 memcpy(new->nr_deferred, old->nr_deferred, old_defer_size); 249 memset((void *)new->nr_deferred + old_defer_size, 0, 250 defer_size - old_defer_size); 251 252 rcu_assign_pointer(pn->shrinker_info, new); 253 kvfree_rcu(old, rcu); 254 } 255 256 return 0; 257 } 258 259 void free_shrinker_info(struct mem_cgroup *memcg) 260 { 261 struct mem_cgroup_per_node *pn; 262 struct shrinker_info *info; 263 int nid; 264 265 for_each_node(nid) { 266 pn = memcg->nodeinfo[nid]; 267 info = rcu_dereference_protected(pn->shrinker_info, true); 268 kvfree(info); 269 rcu_assign_pointer(pn->shrinker_info, NULL); 270 } 271 } 272 273 int alloc_shrinker_info(struct mem_cgroup *memcg) 274 { 275 struct shrinker_info *info; 276 int nid, size, ret = 0; 277 int map_size, defer_size = 0; 278 279 down_write(&shrinker_rwsem); 280 map_size = shrinker_map_size(shrinker_nr_max); 281 defer_size = shrinker_defer_size(shrinker_nr_max); 282 size = map_size + defer_size; 283 for_each_node(nid) { 284 info = kvzalloc_node(sizeof(*info) + size, GFP_KERNEL, nid); 285 if (!info) { 286 free_shrinker_info(memcg); 287 ret = -ENOMEM; 288 break; 289 } 290 info->nr_deferred = (atomic_long_t *)(info + 1); 291 info->map = (void *)info->nr_deferred + defer_size; 292 info->map_nr_max = shrinker_nr_max; 293 rcu_assign_pointer(memcg->nodeinfo[nid]->shrinker_info, info); 294 } 295 up_write(&shrinker_rwsem); 296 297 return ret; 298 } 299 300 static int expand_shrinker_info(int new_id) 301 { 302 int ret = 0; 303 int new_nr_max = round_up(new_id + 1, BITS_PER_LONG); 304 int map_size, defer_size = 0; 305 int old_map_size, old_defer_size = 0; 306 struct mem_cgroup *memcg; 307 308 if (!root_mem_cgroup) 309 goto out; 310 311 lockdep_assert_held(&shrinker_rwsem); 312 313 map_size = shrinker_map_size(new_nr_max); 314 defer_size = shrinker_defer_size(new_nr_max); 315 old_map_size = shrinker_map_size(shrinker_nr_max); 316 old_defer_size = shrinker_defer_size(shrinker_nr_max); 317 318 memcg = mem_cgroup_iter(NULL, NULL, NULL); 319 do { 320 ret = expand_one_shrinker_info(memcg, map_size, defer_size, 321 old_map_size, old_defer_size, 322 new_nr_max); 323 if (ret) { 324 mem_cgroup_iter_break(NULL, memcg); 325 goto out; 326 } 327 } while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)) != NULL); 328 out: 329 if (!ret) 330 shrinker_nr_max = new_nr_max; 331 332 return ret; 333 } 334 335 void set_shrinker_bit(struct mem_cgroup *memcg, int nid, int shrinker_id) 336 { 337 if (shrinker_id >= 0 && memcg && !mem_cgroup_is_root(memcg)) { 338 struct shrinker_info *info; 339 340 rcu_read_lock(); 341 info = rcu_dereference(memcg->nodeinfo[nid]->shrinker_info); 342 if (!WARN_ON_ONCE(shrinker_id >= info->map_nr_max)) { 343 /* Pairs with smp mb in shrink_slab() */ 344 smp_mb__before_atomic(); 345 set_bit(shrinker_id, info->map); 346 } 347 rcu_read_unlock(); 348 } 349 } 350 351 static DEFINE_IDR(shrinker_idr); 352 353 static int prealloc_memcg_shrinker(struct shrinker *shrinker) 354 { 355 int id, ret = -ENOMEM; 356 357 if (mem_cgroup_disabled()) 358 return -ENOSYS; 359 360 down_write(&shrinker_rwsem); 361 /* This may call shrinker, so it must use down_read_trylock() */ 362 id = idr_alloc(&shrinker_idr, shrinker, 0, 0, GFP_KERNEL); 363 if (id < 0) 364 goto unlock; 365 366 if (id >= shrinker_nr_max) { 367 if (expand_shrinker_info(id)) { 368 idr_remove(&shrinker_idr, id); 369 goto unlock; 370 } 371 } 372 shrinker->id = id; 373 ret = 0; 374 unlock: 375 up_write(&shrinker_rwsem); 376 return ret; 377 } 378 379 static void unregister_memcg_shrinker(struct shrinker *shrinker) 380 { 381 int id = shrinker->id; 382 383 BUG_ON(id < 0); 384 385 lockdep_assert_held(&shrinker_rwsem); 386 387 idr_remove(&shrinker_idr, id); 388 } 389 390 static long xchg_nr_deferred_memcg(int nid, struct shrinker *shrinker, 391 struct mem_cgroup *memcg) 392 { 393 struct shrinker_info *info; 394 395 info = shrinker_info_protected(memcg, nid); 396 return atomic_long_xchg(&info->nr_deferred[shrinker->id], 0); 397 } 398 399 static long add_nr_deferred_memcg(long nr, int nid, struct shrinker *shrinker, 400 struct mem_cgroup *memcg) 401 { 402 struct shrinker_info *info; 403 404 info = shrinker_info_protected(memcg, nid); 405 return atomic_long_add_return(nr, &info->nr_deferred[shrinker->id]); 406 } 407 408 void reparent_shrinker_deferred(struct mem_cgroup *memcg) 409 { 410 int i, nid; 411 long nr; 412 struct mem_cgroup *parent; 413 struct shrinker_info *child_info, *parent_info; 414 415 parent = parent_mem_cgroup(memcg); 416 if (!parent) 417 parent = root_mem_cgroup; 418 419 /* Prevent from concurrent shrinker_info expand */ 420 down_read(&shrinker_rwsem); 421 for_each_node(nid) { 422 child_info = shrinker_info_protected(memcg, nid); 423 parent_info = shrinker_info_protected(parent, nid); 424 for (i = 0; i < child_info->map_nr_max; i++) { 425 nr = atomic_long_read(&child_info->nr_deferred[i]); 426 atomic_long_add(nr, &parent_info->nr_deferred[i]); 427 } 428 } 429 up_read(&shrinker_rwsem); 430 } 431 432 static bool cgroup_reclaim(struct scan_control *sc) 433 { 434 return sc->target_mem_cgroup; 435 } 436 437 static bool global_reclaim(struct scan_control *sc) 438 { 439 return !sc->target_mem_cgroup || mem_cgroup_is_root(sc->target_mem_cgroup); 440 } 441 442 /** 443 * writeback_throttling_sane - is the usual dirty throttling mechanism available? 444 * @sc: scan_control in question 445 * 446 * The normal page dirty throttling mechanism in balance_dirty_pages() is 447 * completely broken with the legacy memcg and direct stalling in 448 * shrink_folio_list() is used for throttling instead, which lacks all the 449 * niceties such as fairness, adaptive pausing, bandwidth proportional 450 * allocation and configurability. 451 * 452 * This function tests whether the vmscan currently in progress can assume 453 * that the normal dirty throttling mechanism is operational. 454 */ 455 static bool writeback_throttling_sane(struct scan_control *sc) 456 { 457 if (!cgroup_reclaim(sc)) 458 return true; 459 #ifdef CONFIG_CGROUP_WRITEBACK 460 if (cgroup_subsys_on_dfl(memory_cgrp_subsys)) 461 return true; 462 #endif 463 return false; 464 } 465 #else 466 static int prealloc_memcg_shrinker(struct shrinker *shrinker) 467 { 468 return -ENOSYS; 469 } 470 471 static void unregister_memcg_shrinker(struct shrinker *shrinker) 472 { 473 } 474 475 static long xchg_nr_deferred_memcg(int nid, struct shrinker *shrinker, 476 struct mem_cgroup *memcg) 477 { 478 return 0; 479 } 480 481 static long add_nr_deferred_memcg(long nr, int nid, struct shrinker *shrinker, 482 struct mem_cgroup *memcg) 483 { 484 return 0; 485 } 486 487 static bool cgroup_reclaim(struct scan_control *sc) 488 { 489 return false; 490 } 491 492 static bool global_reclaim(struct scan_control *sc) 493 { 494 return true; 495 } 496 497 static bool writeback_throttling_sane(struct scan_control *sc) 498 { 499 return true; 500 } 501 #endif 502 503 static void set_task_reclaim_state(struct task_struct *task, 504 struct reclaim_state *rs) 505 { 506 /* Check for an overwrite */ 507 WARN_ON_ONCE(rs && task->reclaim_state); 508 509 /* Check for the nulling of an already-nulled member */ 510 WARN_ON_ONCE(!rs && !task->reclaim_state); 511 512 task->reclaim_state = rs; 513 } 514 515 /* 516 * flush_reclaim_state(): add pages reclaimed outside of LRU-based reclaim to 517 * scan_control->nr_reclaimed. 518 */ 519 static void flush_reclaim_state(struct scan_control *sc) 520 { 521 /* 522 * Currently, reclaim_state->reclaimed includes three types of pages 523 * freed outside of vmscan: 524 * (1) Slab pages. 525 * (2) Clean file pages from pruned inodes (on highmem systems). 526 * (3) XFS freed buffer pages. 527 * 528 * For all of these cases, we cannot universally link the pages to a 529 * single memcg. For example, a memcg-aware shrinker can free one object 530 * charged to the target memcg, causing an entire page to be freed. 531 * If we count the entire page as reclaimed from the memcg, we end up 532 * overestimating the reclaimed amount (potentially under-reclaiming). 533 * 534 * Only count such pages for global reclaim to prevent under-reclaiming 535 * from the target memcg; preventing unnecessary retries during memcg 536 * charging and false positives from proactive reclaim. 537 * 538 * For uncommon cases where the freed pages were actually mostly 539 * charged to the target memcg, we end up underestimating the reclaimed 540 * amount. This should be fine. The freed pages will be uncharged 541 * anyway, even if they are not counted here properly, and we will be 542 * able to make forward progress in charging (which is usually in a 543 * retry loop). 544 * 545 * We can go one step further, and report the uncharged objcg pages in 546 * memcg reclaim, to make reporting more accurate and reduce 547 * underestimation, but it's probably not worth the complexity for now. 548 */ 549 if (current->reclaim_state && global_reclaim(sc)) { 550 sc->nr_reclaimed += current->reclaim_state->reclaimed; 551 current->reclaim_state->reclaimed = 0; 552 } 553 } 554 555 static long xchg_nr_deferred(struct shrinker *shrinker, 556 struct shrink_control *sc) 557 { 558 int nid = sc->nid; 559 560 if (!(shrinker->flags & SHRINKER_NUMA_AWARE)) 561 nid = 0; 562 563 if (sc->memcg && 564 (shrinker->flags & SHRINKER_MEMCG_AWARE)) 565 return xchg_nr_deferred_memcg(nid, shrinker, 566 sc->memcg); 567 568 return atomic_long_xchg(&shrinker->nr_deferred[nid], 0); 569 } 570 571 572 static long add_nr_deferred(long nr, struct shrinker *shrinker, 573 struct shrink_control *sc) 574 { 575 int nid = sc->nid; 576 577 if (!(shrinker->flags & SHRINKER_NUMA_AWARE)) 578 nid = 0; 579 580 if (sc->memcg && 581 (shrinker->flags & SHRINKER_MEMCG_AWARE)) 582 return add_nr_deferred_memcg(nr, nid, shrinker, 583 sc->memcg); 584 585 return atomic_long_add_return(nr, &shrinker->nr_deferred[nid]); 586 } 587 588 static bool can_demote(int nid, struct scan_control *sc) 589 { 590 if (!numa_demotion_enabled) 591 return false; 592 if (sc && sc->no_demotion) 593 return false; 594 if (next_demotion_node(nid) == NUMA_NO_NODE) 595 return false; 596 597 return true; 598 } 599 600 static inline bool can_reclaim_anon_pages(struct mem_cgroup *memcg, 601 int nid, 602 struct scan_control *sc) 603 { 604 if (memcg == NULL) { 605 /* 606 * For non-memcg reclaim, is there 607 * space in any swap device? 608 */ 609 if (get_nr_swap_pages() > 0) 610 return true; 611 } else { 612 /* Is the memcg below its swap limit? */ 613 if (mem_cgroup_get_nr_swap_pages(memcg) > 0) 614 return true; 615 } 616 617 /* 618 * The page can not be swapped. 619 * 620 * Can it be reclaimed from this node via demotion? 621 */ 622 return can_demote(nid, sc); 623 } 624 625 /* 626 * This misses isolated folios which are not accounted for to save counters. 627 * As the data only determines if reclaim or compaction continues, it is 628 * not expected that isolated folios will be a dominating factor. 629 */ 630 unsigned long zone_reclaimable_pages(struct zone *zone) 631 { 632 unsigned long nr; 633 634 nr = zone_page_state_snapshot(zone, NR_ZONE_INACTIVE_FILE) + 635 zone_page_state_snapshot(zone, NR_ZONE_ACTIVE_FILE); 636 if (can_reclaim_anon_pages(NULL, zone_to_nid(zone), NULL)) 637 nr += zone_page_state_snapshot(zone, NR_ZONE_INACTIVE_ANON) + 638 zone_page_state_snapshot(zone, NR_ZONE_ACTIVE_ANON); 639 640 return nr; 641 } 642 643 /** 644 * lruvec_lru_size - Returns the number of pages on the given LRU list. 645 * @lruvec: lru vector 646 * @lru: lru to use 647 * @zone_idx: zones to consider (use MAX_NR_ZONES - 1 for the whole LRU list) 648 */ 649 static unsigned long lruvec_lru_size(struct lruvec *lruvec, enum lru_list lru, 650 int zone_idx) 651 { 652 unsigned long size = 0; 653 int zid; 654 655 for (zid = 0; zid <= zone_idx; zid++) { 656 struct zone *zone = &lruvec_pgdat(lruvec)->node_zones[zid]; 657 658 if (!managed_zone(zone)) 659 continue; 660 661 if (!mem_cgroup_disabled()) 662 size += mem_cgroup_get_zone_lru_size(lruvec, lru, zid); 663 else 664 size += zone_page_state(zone, NR_ZONE_LRU_BASE + lru); 665 } 666 return size; 667 } 668 669 /* 670 * Add a shrinker callback to be called from the vm. 671 */ 672 static int __prealloc_shrinker(struct shrinker *shrinker) 673 { 674 unsigned int size; 675 int err; 676 677 if (shrinker->flags & SHRINKER_MEMCG_AWARE) { 678 err = prealloc_memcg_shrinker(shrinker); 679 if (err != -ENOSYS) 680 return err; 681 682 shrinker->flags &= ~SHRINKER_MEMCG_AWARE; 683 } 684 685 size = sizeof(*shrinker->nr_deferred); 686 if (shrinker->flags & SHRINKER_NUMA_AWARE) 687 size *= nr_node_ids; 688 689 shrinker->nr_deferred = kzalloc(size, GFP_KERNEL); 690 if (!shrinker->nr_deferred) 691 return -ENOMEM; 692 693 return 0; 694 } 695 696 #ifdef CONFIG_SHRINKER_DEBUG 697 int prealloc_shrinker(struct shrinker *shrinker, const char *fmt, ...) 698 { 699 va_list ap; 700 int err; 701 702 va_start(ap, fmt); 703 shrinker->name = kvasprintf_const(GFP_KERNEL, fmt, ap); 704 va_end(ap); 705 if (!shrinker->name) 706 return -ENOMEM; 707 708 err = __prealloc_shrinker(shrinker); 709 if (err) { 710 kfree_const(shrinker->name); 711 shrinker->name = NULL; 712 } 713 714 return err; 715 } 716 #else 717 int prealloc_shrinker(struct shrinker *shrinker, const char *fmt, ...) 718 { 719 return __prealloc_shrinker(shrinker); 720 } 721 #endif 722 723 void free_prealloced_shrinker(struct shrinker *shrinker) 724 { 725 #ifdef CONFIG_SHRINKER_DEBUG 726 kfree_const(shrinker->name); 727 shrinker->name = NULL; 728 #endif 729 if (shrinker->flags & SHRINKER_MEMCG_AWARE) { 730 down_write(&shrinker_rwsem); 731 unregister_memcg_shrinker(shrinker); 732 up_write(&shrinker_rwsem); 733 return; 734 } 735 736 kfree(shrinker->nr_deferred); 737 shrinker->nr_deferred = NULL; 738 } 739 740 void register_shrinker_prepared(struct shrinker *shrinker) 741 { 742 down_write(&shrinker_rwsem); 743 list_add_tail(&shrinker->list, &shrinker_list); 744 shrinker->flags |= SHRINKER_REGISTERED; 745 shrinker_debugfs_add(shrinker); 746 up_write(&shrinker_rwsem); 747 } 748 749 static int __register_shrinker(struct shrinker *shrinker) 750 { 751 int err = __prealloc_shrinker(shrinker); 752 753 if (err) 754 return err; 755 register_shrinker_prepared(shrinker); 756 return 0; 757 } 758 759 #ifdef CONFIG_SHRINKER_DEBUG 760 int register_shrinker(struct shrinker *shrinker, const char *fmt, ...) 761 { 762 va_list ap; 763 int err; 764 765 va_start(ap, fmt); 766 shrinker->name = kvasprintf_const(GFP_KERNEL, fmt, ap); 767 va_end(ap); 768 if (!shrinker->name) 769 return -ENOMEM; 770 771 err = __register_shrinker(shrinker); 772 if (err) { 773 kfree_const(shrinker->name); 774 shrinker->name = NULL; 775 } 776 return err; 777 } 778 #else 779 int register_shrinker(struct shrinker *shrinker, const char *fmt, ...) 780 { 781 return __register_shrinker(shrinker); 782 } 783 #endif 784 EXPORT_SYMBOL(register_shrinker); 785 786 /* 787 * Remove one 788 */ 789 void unregister_shrinker(struct shrinker *shrinker) 790 { 791 struct dentry *debugfs_entry; 792 int debugfs_id; 793 794 if (!(shrinker->flags & SHRINKER_REGISTERED)) 795 return; 796 797 down_write(&shrinker_rwsem); 798 list_del(&shrinker->list); 799 shrinker->flags &= ~SHRINKER_REGISTERED; 800 if (shrinker->flags & SHRINKER_MEMCG_AWARE) 801 unregister_memcg_shrinker(shrinker); 802 debugfs_entry = shrinker_debugfs_detach(shrinker, &debugfs_id); 803 up_write(&shrinker_rwsem); 804 805 shrinker_debugfs_remove(debugfs_entry, debugfs_id); 806 807 kfree(shrinker->nr_deferred); 808 shrinker->nr_deferred = NULL; 809 } 810 EXPORT_SYMBOL(unregister_shrinker); 811 812 /** 813 * synchronize_shrinkers - Wait for all running shrinkers to complete. 814 * 815 * This is equivalent to calling unregister_shrink() and register_shrinker(), 816 * but atomically and with less overhead. This is useful to guarantee that all 817 * shrinker invocations have seen an update, before freeing memory, similar to 818 * rcu. 819 */ 820 void synchronize_shrinkers(void) 821 { 822 down_write(&shrinker_rwsem); 823 up_write(&shrinker_rwsem); 824 } 825 EXPORT_SYMBOL(synchronize_shrinkers); 826 827 #define SHRINK_BATCH 128 828 829 static unsigned long do_shrink_slab(struct shrink_control *shrinkctl, 830 struct shrinker *shrinker, int priority) 831 { 832 unsigned long freed = 0; 833 unsigned long long delta; 834 long total_scan; 835 long freeable; 836 long nr; 837 long new_nr; 838 long batch_size = shrinker->batch ? shrinker->batch 839 : SHRINK_BATCH; 840 long scanned = 0, next_deferred; 841 842 freeable = shrinker->count_objects(shrinker, shrinkctl); 843 if (freeable == 0 || freeable == SHRINK_EMPTY) 844 return freeable; 845 846 /* 847 * copy the current shrinker scan count into a local variable 848 * and zero it so that other concurrent shrinker invocations 849 * don't also do this scanning work. 850 */ 851 nr = xchg_nr_deferred(shrinker, shrinkctl); 852 853 if (shrinker->seeks) { 854 delta = freeable >> priority; 855 delta *= 4; 856 do_div(delta, shrinker->seeks); 857 } else { 858 /* 859 * These objects don't require any IO to create. Trim 860 * them aggressively under memory pressure to keep 861 * them from causing refetches in the IO caches. 862 */ 863 delta = freeable / 2; 864 } 865 866 total_scan = nr >> priority; 867 total_scan += delta; 868 total_scan = min(total_scan, (2 * freeable)); 869 870 trace_mm_shrink_slab_start(shrinker, shrinkctl, nr, 871 freeable, delta, total_scan, priority); 872 873 /* 874 * Normally, we should not scan less than batch_size objects in one 875 * pass to avoid too frequent shrinker calls, but if the slab has less 876 * than batch_size objects in total and we are really tight on memory, 877 * we will try to reclaim all available objects, otherwise we can end 878 * up failing allocations although there are plenty of reclaimable 879 * objects spread over several slabs with usage less than the 880 * batch_size. 881 * 882 * We detect the "tight on memory" situations by looking at the total 883 * number of objects we want to scan (total_scan). If it is greater 884 * than the total number of objects on slab (freeable), we must be 885 * scanning at high prio and therefore should try to reclaim as much as 886 * possible. 887 */ 888 while (total_scan >= batch_size || 889 total_scan >= freeable) { 890 unsigned long ret; 891 unsigned long nr_to_scan = min(batch_size, total_scan); 892 893 shrinkctl->nr_to_scan = nr_to_scan; 894 shrinkctl->nr_scanned = nr_to_scan; 895 ret = shrinker->scan_objects(shrinker, shrinkctl); 896 if (ret == SHRINK_STOP) 897 break; 898 freed += ret; 899 900 count_vm_events(SLABS_SCANNED, shrinkctl->nr_scanned); 901 total_scan -= shrinkctl->nr_scanned; 902 scanned += shrinkctl->nr_scanned; 903 904 cond_resched(); 905 } 906 907 /* 908 * The deferred work is increased by any new work (delta) that wasn't 909 * done, decreased by old deferred work that was done now. 910 * 911 * And it is capped to two times of the freeable items. 912 */ 913 next_deferred = max_t(long, (nr + delta - scanned), 0); 914 next_deferred = min(next_deferred, (2 * freeable)); 915 916 /* 917 * move the unused scan count back into the shrinker in a 918 * manner that handles concurrent updates. 919 */ 920 new_nr = add_nr_deferred(next_deferred, shrinker, shrinkctl); 921 922 trace_mm_shrink_slab_end(shrinker, shrinkctl->nid, freed, nr, new_nr, total_scan); 923 return freed; 924 } 925 926 #ifdef CONFIG_MEMCG 927 static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid, 928 struct mem_cgroup *memcg, int priority) 929 { 930 struct shrinker_info *info; 931 unsigned long ret, freed = 0; 932 int i; 933 934 if (!mem_cgroup_online(memcg)) 935 return 0; 936 937 if (!down_read_trylock(&shrinker_rwsem)) 938 return 0; 939 940 info = shrinker_info_protected(memcg, nid); 941 if (unlikely(!info)) 942 goto unlock; 943 944 for_each_set_bit(i, info->map, info->map_nr_max) { 945 struct shrink_control sc = { 946 .gfp_mask = gfp_mask, 947 .nid = nid, 948 .memcg = memcg, 949 }; 950 struct shrinker *shrinker; 951 952 shrinker = idr_find(&shrinker_idr, i); 953 if (unlikely(!shrinker || !(shrinker->flags & SHRINKER_REGISTERED))) { 954 if (!shrinker) 955 clear_bit(i, info->map); 956 continue; 957 } 958 959 /* Call non-slab shrinkers even though kmem is disabled */ 960 if (!memcg_kmem_online() && 961 !(shrinker->flags & SHRINKER_NONSLAB)) 962 continue; 963 964 ret = do_shrink_slab(&sc, shrinker, priority); 965 if (ret == SHRINK_EMPTY) { 966 clear_bit(i, info->map); 967 /* 968 * After the shrinker reported that it had no objects to 969 * free, but before we cleared the corresponding bit in 970 * the memcg shrinker map, a new object might have been 971 * added. To make sure, we have the bit set in this 972 * case, we invoke the shrinker one more time and reset 973 * the bit if it reports that it is not empty anymore. 974 * The memory barrier here pairs with the barrier in 975 * set_shrinker_bit(): 976 * 977 * list_lru_add() shrink_slab_memcg() 978 * list_add_tail() clear_bit() 979 * <MB> <MB> 980 * set_bit() do_shrink_slab() 981 */ 982 smp_mb__after_atomic(); 983 ret = do_shrink_slab(&sc, shrinker, priority); 984 if (ret == SHRINK_EMPTY) 985 ret = 0; 986 else 987 set_shrinker_bit(memcg, nid, i); 988 } 989 freed += ret; 990 991 if (rwsem_is_contended(&shrinker_rwsem)) { 992 freed = freed ? : 1; 993 break; 994 } 995 } 996 unlock: 997 up_read(&shrinker_rwsem); 998 return freed; 999 } 1000 #else /* CONFIG_MEMCG */ 1001 static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid, 1002 struct mem_cgroup *memcg, int priority) 1003 { 1004 return 0; 1005 } 1006 #endif /* CONFIG_MEMCG */ 1007 1008 /** 1009 * shrink_slab - shrink slab caches 1010 * @gfp_mask: allocation context 1011 * @nid: node whose slab caches to target 1012 * @memcg: memory cgroup whose slab caches to target 1013 * @priority: the reclaim priority 1014 * 1015 * Call the shrink functions to age shrinkable caches. 1016 * 1017 * @nid is passed along to shrinkers with SHRINKER_NUMA_AWARE set, 1018 * unaware shrinkers will receive a node id of 0 instead. 1019 * 1020 * @memcg specifies the memory cgroup to target. Unaware shrinkers 1021 * are called only if it is the root cgroup. 1022 * 1023 * @priority is sc->priority, we take the number of objects and >> by priority 1024 * in order to get the scan target. 1025 * 1026 * Returns the number of reclaimed slab objects. 1027 */ 1028 static unsigned long shrink_slab(gfp_t gfp_mask, int nid, 1029 struct mem_cgroup *memcg, 1030 int priority) 1031 { 1032 unsigned long ret, freed = 0; 1033 struct shrinker *shrinker; 1034 1035 /* 1036 * The root memcg might be allocated even though memcg is disabled 1037 * via "cgroup_disable=memory" boot parameter. This could make 1038 * mem_cgroup_is_root() return false, then just run memcg slab 1039 * shrink, but skip global shrink. This may result in premature 1040 * oom. 1041 */ 1042 if (!mem_cgroup_disabled() && !mem_cgroup_is_root(memcg)) 1043 return shrink_slab_memcg(gfp_mask, nid, memcg, priority); 1044 1045 if (!down_read_trylock(&shrinker_rwsem)) 1046 goto out; 1047 1048 list_for_each_entry(shrinker, &shrinker_list, list) { 1049 struct shrink_control sc = { 1050 .gfp_mask = gfp_mask, 1051 .nid = nid, 1052 .memcg = memcg, 1053 }; 1054 1055 ret = do_shrink_slab(&sc, shrinker, priority); 1056 if (ret == SHRINK_EMPTY) 1057 ret = 0; 1058 freed += ret; 1059 /* 1060 * Bail out if someone want to register a new shrinker to 1061 * prevent the registration from being stalled for long periods 1062 * by parallel ongoing shrinking. 1063 */ 1064 if (rwsem_is_contended(&shrinker_rwsem)) { 1065 freed = freed ? : 1; 1066 break; 1067 } 1068 } 1069 1070 up_read(&shrinker_rwsem); 1071 out: 1072 cond_resched(); 1073 return freed; 1074 } 1075 1076 static unsigned long drop_slab_node(int nid) 1077 { 1078 unsigned long freed = 0; 1079 struct mem_cgroup *memcg = NULL; 1080 1081 memcg = mem_cgroup_iter(NULL, NULL, NULL); 1082 do { 1083 freed += shrink_slab(GFP_KERNEL, nid, memcg, 0); 1084 } while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)) != NULL); 1085 1086 return freed; 1087 } 1088 1089 void drop_slab(void) 1090 { 1091 int nid; 1092 int shift = 0; 1093 unsigned long freed; 1094 1095 do { 1096 freed = 0; 1097 for_each_online_node(nid) { 1098 if (fatal_signal_pending(current)) 1099 return; 1100 1101 freed += drop_slab_node(nid); 1102 } 1103 } while ((freed >> shift++) > 1); 1104 } 1105 1106 static int reclaimer_offset(void) 1107 { 1108 BUILD_BUG_ON(PGSTEAL_DIRECT - PGSTEAL_KSWAPD != 1109 PGDEMOTE_DIRECT - PGDEMOTE_KSWAPD); 1110 BUILD_BUG_ON(PGSTEAL_DIRECT - PGSTEAL_KSWAPD != 1111 PGSCAN_DIRECT - PGSCAN_KSWAPD); 1112 BUILD_BUG_ON(PGSTEAL_KHUGEPAGED - PGSTEAL_KSWAPD != 1113 PGDEMOTE_KHUGEPAGED - PGDEMOTE_KSWAPD); 1114 BUILD_BUG_ON(PGSTEAL_KHUGEPAGED - PGSTEAL_KSWAPD != 1115 PGSCAN_KHUGEPAGED - PGSCAN_KSWAPD); 1116 1117 if (current_is_kswapd()) 1118 return 0; 1119 if (current_is_khugepaged()) 1120 return PGSTEAL_KHUGEPAGED - PGSTEAL_KSWAPD; 1121 return PGSTEAL_DIRECT - PGSTEAL_KSWAPD; 1122 } 1123 1124 static inline int is_page_cache_freeable(struct folio *folio) 1125 { 1126 /* 1127 * A freeable page cache folio is referenced only by the caller 1128 * that isolated the folio, the page cache and optional filesystem 1129 * private data at folio->private. 1130 */ 1131 return folio_ref_count(folio) - folio_test_private(folio) == 1132 1 + folio_nr_pages(folio); 1133 } 1134 1135 /* 1136 * We detected a synchronous write error writing a folio out. Probably 1137 * -ENOSPC. We need to propagate that into the address_space for a subsequent 1138 * fsync(), msync() or close(). 1139 * 1140 * The tricky part is that after writepage we cannot touch the mapping: nothing 1141 * prevents it from being freed up. But we have a ref on the folio and once 1142 * that folio is locked, the mapping is pinned. 1143 * 1144 * We're allowed to run sleeping folio_lock() here because we know the caller has 1145 * __GFP_FS. 1146 */ 1147 static void handle_write_error(struct address_space *mapping, 1148 struct folio *folio, int error) 1149 { 1150 folio_lock(folio); 1151 if (folio_mapping(folio) == mapping) 1152 mapping_set_error(mapping, error); 1153 folio_unlock(folio); 1154 } 1155 1156 static bool skip_throttle_noprogress(pg_data_t *pgdat) 1157 { 1158 int reclaimable = 0, write_pending = 0; 1159 int i; 1160 1161 /* 1162 * If kswapd is disabled, reschedule if necessary but do not 1163 * throttle as the system is likely near OOM. 1164 */ 1165 if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES) 1166 return true; 1167 1168 /* 1169 * If there are a lot of dirty/writeback folios then do not 1170 * throttle as throttling will occur when the folios cycle 1171 * towards the end of the LRU if still under writeback. 1172 */ 1173 for (i = 0; i < MAX_NR_ZONES; i++) { 1174 struct zone *zone = pgdat->node_zones + i; 1175 1176 if (!managed_zone(zone)) 1177 continue; 1178 1179 reclaimable += zone_reclaimable_pages(zone); 1180 write_pending += zone_page_state_snapshot(zone, 1181 NR_ZONE_WRITE_PENDING); 1182 } 1183 if (2 * write_pending <= reclaimable) 1184 return true; 1185 1186 return false; 1187 } 1188 1189 void reclaim_throttle(pg_data_t *pgdat, enum vmscan_throttle_state reason) 1190 { 1191 wait_queue_head_t *wqh = &pgdat->reclaim_wait[reason]; 1192 long timeout, ret; 1193 DEFINE_WAIT(wait); 1194 1195 /* 1196 * Do not throttle user workers, kthreads other than kswapd or 1197 * workqueues. They may be required for reclaim to make 1198 * forward progress (e.g. journalling workqueues or kthreads). 1199 */ 1200 if (!current_is_kswapd() && 1201 current->flags & (PF_USER_WORKER|PF_KTHREAD)) { 1202 cond_resched(); 1203 return; 1204 } 1205 1206 /* 1207 * These figures are pulled out of thin air. 1208 * VMSCAN_THROTTLE_ISOLATED is a transient condition based on too many 1209 * parallel reclaimers which is a short-lived event so the timeout is 1210 * short. Failing to make progress or waiting on writeback are 1211 * potentially long-lived events so use a longer timeout. This is shaky 1212 * logic as a failure to make progress could be due to anything from 1213 * writeback to a slow device to excessive referenced folios at the tail 1214 * of the inactive LRU. 1215 */ 1216 switch(reason) { 1217 case VMSCAN_THROTTLE_WRITEBACK: 1218 timeout = HZ/10; 1219 1220 if (atomic_inc_return(&pgdat->nr_writeback_throttled) == 1) { 1221 WRITE_ONCE(pgdat->nr_reclaim_start, 1222 node_page_state(pgdat, NR_THROTTLED_WRITTEN)); 1223 } 1224 1225 break; 1226 case VMSCAN_THROTTLE_CONGESTED: 1227 fallthrough; 1228 case VMSCAN_THROTTLE_NOPROGRESS: 1229 if (skip_throttle_noprogress(pgdat)) { 1230 cond_resched(); 1231 return; 1232 } 1233 1234 timeout = 1; 1235 1236 break; 1237 case VMSCAN_THROTTLE_ISOLATED: 1238 timeout = HZ/50; 1239 break; 1240 default: 1241 WARN_ON_ONCE(1); 1242 timeout = HZ; 1243 break; 1244 } 1245 1246 prepare_to_wait(wqh, &wait, TASK_UNINTERRUPTIBLE); 1247 ret = schedule_timeout(timeout); 1248 finish_wait(wqh, &wait); 1249 1250 if (reason == VMSCAN_THROTTLE_WRITEBACK) 1251 atomic_dec(&pgdat->nr_writeback_throttled); 1252 1253 trace_mm_vmscan_throttled(pgdat->node_id, jiffies_to_usecs(timeout), 1254 jiffies_to_usecs(timeout - ret), 1255 reason); 1256 } 1257 1258 /* 1259 * Account for folios written if tasks are throttled waiting on dirty 1260 * folios to clean. If enough folios have been cleaned since throttling 1261 * started then wakeup the throttled tasks. 1262 */ 1263 void __acct_reclaim_writeback(pg_data_t *pgdat, struct folio *folio, 1264 int nr_throttled) 1265 { 1266 unsigned long nr_written; 1267 1268 node_stat_add_folio(folio, NR_THROTTLED_WRITTEN); 1269 1270 /* 1271 * This is an inaccurate read as the per-cpu deltas may not 1272 * be synchronised. However, given that the system is 1273 * writeback throttled, it is not worth taking the penalty 1274 * of getting an accurate count. At worst, the throttle 1275 * timeout guarantees forward progress. 1276 */ 1277 nr_written = node_page_state(pgdat, NR_THROTTLED_WRITTEN) - 1278 READ_ONCE(pgdat->nr_reclaim_start); 1279 1280 if (nr_written > SWAP_CLUSTER_MAX * nr_throttled) 1281 wake_up(&pgdat->reclaim_wait[VMSCAN_THROTTLE_WRITEBACK]); 1282 } 1283 1284 /* possible outcome of pageout() */ 1285 typedef enum { 1286 /* failed to write folio out, folio is locked */ 1287 PAGE_KEEP, 1288 /* move folio to the active list, folio is locked */ 1289 PAGE_ACTIVATE, 1290 /* folio has been sent to the disk successfully, folio is unlocked */ 1291 PAGE_SUCCESS, 1292 /* folio is clean and locked */ 1293 PAGE_CLEAN, 1294 } pageout_t; 1295 1296 /* 1297 * pageout is called by shrink_folio_list() for each dirty folio. 1298 * Calls ->writepage(). 1299 */ 1300 static pageout_t pageout(struct folio *folio, struct address_space *mapping, 1301 struct swap_iocb **plug) 1302 { 1303 /* 1304 * If the folio is dirty, only perform writeback if that write 1305 * will be non-blocking. To prevent this allocation from being 1306 * stalled by pagecache activity. But note that there may be 1307 * stalls if we need to run get_block(). We could test 1308 * PagePrivate for that. 1309 * 1310 * If this process is currently in __generic_file_write_iter() against 1311 * this folio's queue, we can perform writeback even if that 1312 * will block. 1313 * 1314 * If the folio is swapcache, write it back even if that would 1315 * block, for some throttling. This happens by accident, because 1316 * swap_backing_dev_info is bust: it doesn't reflect the 1317 * congestion state of the swapdevs. Easy to fix, if needed. 1318 */ 1319 if (!is_page_cache_freeable(folio)) 1320 return PAGE_KEEP; 1321 if (!mapping) { 1322 /* 1323 * Some data journaling orphaned folios can have 1324 * folio->mapping == NULL while being dirty with clean buffers. 1325 */ 1326 if (folio_test_private(folio)) { 1327 if (try_to_free_buffers(folio)) { 1328 folio_clear_dirty(folio); 1329 pr_info("%s: orphaned folio\n", __func__); 1330 return PAGE_CLEAN; 1331 } 1332 } 1333 return PAGE_KEEP; 1334 } 1335 if (mapping->a_ops->writepage == NULL) 1336 return PAGE_ACTIVATE; 1337 1338 if (folio_clear_dirty_for_io(folio)) { 1339 int res; 1340 struct writeback_control wbc = { 1341 .sync_mode = WB_SYNC_NONE, 1342 .nr_to_write = SWAP_CLUSTER_MAX, 1343 .range_start = 0, 1344 .range_end = LLONG_MAX, 1345 .for_reclaim = 1, 1346 .swap_plug = plug, 1347 }; 1348 1349 folio_set_reclaim(folio); 1350 res = mapping->a_ops->writepage(&folio->page, &wbc); 1351 if (res < 0) 1352 handle_write_error(mapping, folio, res); 1353 if (res == AOP_WRITEPAGE_ACTIVATE) { 1354 folio_clear_reclaim(folio); 1355 return PAGE_ACTIVATE; 1356 } 1357 1358 if (!folio_test_writeback(folio)) { 1359 /* synchronous write or broken a_ops? */ 1360 folio_clear_reclaim(folio); 1361 } 1362 trace_mm_vmscan_write_folio(folio); 1363 node_stat_add_folio(folio, NR_VMSCAN_WRITE); 1364 return PAGE_SUCCESS; 1365 } 1366 1367 return PAGE_CLEAN; 1368 } 1369 1370 /* 1371 * Same as remove_mapping, but if the folio is removed from the mapping, it 1372 * gets returned with a refcount of 0. 1373 */ 1374 static int __remove_mapping(struct address_space *mapping, struct folio *folio, 1375 bool reclaimed, struct mem_cgroup *target_memcg) 1376 { 1377 int refcount; 1378 void *shadow = NULL; 1379 1380 BUG_ON(!folio_test_locked(folio)); 1381 BUG_ON(mapping != folio_mapping(folio)); 1382 1383 if (!folio_test_swapcache(folio)) 1384 spin_lock(&mapping->host->i_lock); 1385 xa_lock_irq(&mapping->i_pages); 1386 /* 1387 * The non racy check for a busy folio. 1388 * 1389 * Must be careful with the order of the tests. When someone has 1390 * a ref to the folio, it may be possible that they dirty it then 1391 * drop the reference. So if the dirty flag is tested before the 1392 * refcount here, then the following race may occur: 1393 * 1394 * get_user_pages(&page); 1395 * [user mapping goes away] 1396 * write_to(page); 1397 * !folio_test_dirty(folio) [good] 1398 * folio_set_dirty(folio); 1399 * folio_put(folio); 1400 * !refcount(folio) [good, discard it] 1401 * 1402 * [oops, our write_to data is lost] 1403 * 1404 * Reversing the order of the tests ensures such a situation cannot 1405 * escape unnoticed. The smp_rmb is needed to ensure the folio->flags 1406 * load is not satisfied before that of folio->_refcount. 1407 * 1408 * Note that if the dirty flag is always set via folio_mark_dirty, 1409 * and thus under the i_pages lock, then this ordering is not required. 1410 */ 1411 refcount = 1 + folio_nr_pages(folio); 1412 if (!folio_ref_freeze(folio, refcount)) 1413 goto cannot_free; 1414 /* note: atomic_cmpxchg in folio_ref_freeze provides the smp_rmb */ 1415 if (unlikely(folio_test_dirty(folio))) { 1416 folio_ref_unfreeze(folio, refcount); 1417 goto cannot_free; 1418 } 1419 1420 if (folio_test_swapcache(folio)) { 1421 swp_entry_t swap = folio_swap_entry(folio); 1422 1423 if (reclaimed && !mapping_exiting(mapping)) 1424 shadow = workingset_eviction(folio, target_memcg); 1425 __delete_from_swap_cache(folio, swap, shadow); 1426 mem_cgroup_swapout(folio, swap); 1427 xa_unlock_irq(&mapping->i_pages); 1428 put_swap_folio(folio, swap); 1429 } else { 1430 void (*free_folio)(struct folio *); 1431 1432 free_folio = mapping->a_ops->free_folio; 1433 /* 1434 * Remember a shadow entry for reclaimed file cache in 1435 * order to detect refaults, thus thrashing, later on. 1436 * 1437 * But don't store shadows in an address space that is 1438 * already exiting. This is not just an optimization, 1439 * inode reclaim needs to empty out the radix tree or 1440 * the nodes are lost. Don't plant shadows behind its 1441 * back. 1442 * 1443 * We also don't store shadows for DAX mappings because the 1444 * only page cache folios found in these are zero pages 1445 * covering holes, and because we don't want to mix DAX 1446 * exceptional entries and shadow exceptional entries in the 1447 * same address_space. 1448 */ 1449 if (reclaimed && folio_is_file_lru(folio) && 1450 !mapping_exiting(mapping) && !dax_mapping(mapping)) 1451 shadow = workingset_eviction(folio, target_memcg); 1452 __filemap_remove_folio(folio, shadow); 1453 xa_unlock_irq(&mapping->i_pages); 1454 if (mapping_shrinkable(mapping)) 1455 inode_add_lru(mapping->host); 1456 spin_unlock(&mapping->host->i_lock); 1457 1458 if (free_folio) 1459 free_folio(folio); 1460 } 1461 1462 return 1; 1463 1464 cannot_free: 1465 xa_unlock_irq(&mapping->i_pages); 1466 if (!folio_test_swapcache(folio)) 1467 spin_unlock(&mapping->host->i_lock); 1468 return 0; 1469 } 1470 1471 /** 1472 * remove_mapping() - Attempt to remove a folio from its mapping. 1473 * @mapping: The address space. 1474 * @folio: The folio to remove. 1475 * 1476 * If the folio is dirty, under writeback or if someone else has a ref 1477 * on it, removal will fail. 1478 * Return: The number of pages removed from the mapping. 0 if the folio 1479 * could not be removed. 1480 * Context: The caller should have a single refcount on the folio and 1481 * hold its lock. 1482 */ 1483 long remove_mapping(struct address_space *mapping, struct folio *folio) 1484 { 1485 if (__remove_mapping(mapping, folio, false, NULL)) { 1486 /* 1487 * Unfreezing the refcount with 1 effectively 1488 * drops the pagecache ref for us without requiring another 1489 * atomic operation. 1490 */ 1491 folio_ref_unfreeze(folio, 1); 1492 return folio_nr_pages(folio); 1493 } 1494 return 0; 1495 } 1496 1497 /** 1498 * folio_putback_lru - Put previously isolated folio onto appropriate LRU list. 1499 * @folio: Folio to be returned to an LRU list. 1500 * 1501 * Add previously isolated @folio to appropriate LRU list. 1502 * The folio may still be unevictable for other reasons. 1503 * 1504 * Context: lru_lock must not be held, interrupts must be enabled. 1505 */ 1506 void folio_putback_lru(struct folio *folio) 1507 { 1508 folio_add_lru(folio); 1509 folio_put(folio); /* drop ref from isolate */ 1510 } 1511 1512 enum folio_references { 1513 FOLIOREF_RECLAIM, 1514 FOLIOREF_RECLAIM_CLEAN, 1515 FOLIOREF_KEEP, 1516 FOLIOREF_ACTIVATE, 1517 }; 1518 1519 static enum folio_references folio_check_references(struct folio *folio, 1520 struct scan_control *sc) 1521 { 1522 int referenced_ptes, referenced_folio; 1523 unsigned long vm_flags; 1524 1525 referenced_ptes = folio_referenced(folio, 1, sc->target_mem_cgroup, 1526 &vm_flags); 1527 referenced_folio = folio_test_clear_referenced(folio); 1528 1529 /* 1530 * The supposedly reclaimable folio was found to be in a VM_LOCKED vma. 1531 * Let the folio, now marked Mlocked, be moved to the unevictable list. 1532 */ 1533 if (vm_flags & VM_LOCKED) 1534 return FOLIOREF_ACTIVATE; 1535 1536 /* rmap lock contention: rotate */ 1537 if (referenced_ptes == -1) 1538 return FOLIOREF_KEEP; 1539 1540 if (referenced_ptes) { 1541 /* 1542 * All mapped folios start out with page table 1543 * references from the instantiating fault, so we need 1544 * to look twice if a mapped file/anon folio is used more 1545 * than once. 1546 * 1547 * Mark it and spare it for another trip around the 1548 * inactive list. Another page table reference will 1549 * lead to its activation. 1550 * 1551 * Note: the mark is set for activated folios as well 1552 * so that recently deactivated but used folios are 1553 * quickly recovered. 1554 */ 1555 folio_set_referenced(folio); 1556 1557 if (referenced_folio || referenced_ptes > 1) 1558 return FOLIOREF_ACTIVATE; 1559 1560 /* 1561 * Activate file-backed executable folios after first usage. 1562 */ 1563 if ((vm_flags & VM_EXEC) && folio_is_file_lru(folio)) 1564 return FOLIOREF_ACTIVATE; 1565 1566 return FOLIOREF_KEEP; 1567 } 1568 1569 /* Reclaim if clean, defer dirty folios to writeback */ 1570 if (referenced_folio && folio_is_file_lru(folio)) 1571 return FOLIOREF_RECLAIM_CLEAN; 1572 1573 return FOLIOREF_RECLAIM; 1574 } 1575 1576 /* Check if a folio is dirty or under writeback */ 1577 static void folio_check_dirty_writeback(struct folio *folio, 1578 bool *dirty, bool *writeback) 1579 { 1580 struct address_space *mapping; 1581 1582 /* 1583 * Anonymous folios are not handled by flushers and must be written 1584 * from reclaim context. Do not stall reclaim based on them. 1585 * MADV_FREE anonymous folios are put into inactive file list too. 1586 * They could be mistakenly treated as file lru. So further anon 1587 * test is needed. 1588 */ 1589 if (!folio_is_file_lru(folio) || 1590 (folio_test_anon(folio) && !folio_test_swapbacked(folio))) { 1591 *dirty = false; 1592 *writeback = false; 1593 return; 1594 } 1595 1596 /* By default assume that the folio flags are accurate */ 1597 *dirty = folio_test_dirty(folio); 1598 *writeback = folio_test_writeback(folio); 1599 1600 /* Verify dirty/writeback state if the filesystem supports it */ 1601 if (!folio_test_private(folio)) 1602 return; 1603 1604 mapping = folio_mapping(folio); 1605 if (mapping && mapping->a_ops->is_dirty_writeback) 1606 mapping->a_ops->is_dirty_writeback(folio, dirty, writeback); 1607 } 1608 1609 static struct folio *alloc_demote_folio(struct folio *src, 1610 unsigned long private) 1611 { 1612 struct folio *dst; 1613 nodemask_t *allowed_mask; 1614 struct migration_target_control *mtc; 1615 1616 mtc = (struct migration_target_control *)private; 1617 1618 allowed_mask = mtc->nmask; 1619 /* 1620 * make sure we allocate from the target node first also trying to 1621 * demote or reclaim pages from the target node via kswapd if we are 1622 * low on free memory on target node. If we don't do this and if 1623 * we have free memory on the slower(lower) memtier, we would start 1624 * allocating pages from slower(lower) memory tiers without even forcing 1625 * a demotion of cold pages from the target memtier. This can result 1626 * in the kernel placing hot pages in slower(lower) memory tiers. 1627 */ 1628 mtc->nmask = NULL; 1629 mtc->gfp_mask |= __GFP_THISNODE; 1630 dst = alloc_migration_target(src, (unsigned long)mtc); 1631 if (dst) 1632 return dst; 1633 1634 mtc->gfp_mask &= ~__GFP_THISNODE; 1635 mtc->nmask = allowed_mask; 1636 1637 return alloc_migration_target(src, (unsigned long)mtc); 1638 } 1639 1640 /* 1641 * Take folios on @demote_folios and attempt to demote them to another node. 1642 * Folios which are not demoted are left on @demote_folios. 1643 */ 1644 static unsigned int demote_folio_list(struct list_head *demote_folios, 1645 struct pglist_data *pgdat) 1646 { 1647 int target_nid = next_demotion_node(pgdat->node_id); 1648 unsigned int nr_succeeded; 1649 nodemask_t allowed_mask; 1650 1651 struct migration_target_control mtc = { 1652 /* 1653 * Allocate from 'node', or fail quickly and quietly. 1654 * When this happens, 'page' will likely just be discarded 1655 * instead of migrated. 1656 */ 1657 .gfp_mask = (GFP_HIGHUSER_MOVABLE & ~__GFP_RECLAIM) | __GFP_NOWARN | 1658 __GFP_NOMEMALLOC | GFP_NOWAIT, 1659 .nid = target_nid, 1660 .nmask = &allowed_mask 1661 }; 1662 1663 if (list_empty(demote_folios)) 1664 return 0; 1665 1666 if (target_nid == NUMA_NO_NODE) 1667 return 0; 1668 1669 node_get_allowed_targets(pgdat, &allowed_mask); 1670 1671 /* Demotion ignores all cpuset and mempolicy settings */ 1672 migrate_pages(demote_folios, alloc_demote_folio, NULL, 1673 (unsigned long)&mtc, MIGRATE_ASYNC, MR_DEMOTION, 1674 &nr_succeeded); 1675 1676 __count_vm_events(PGDEMOTE_KSWAPD + reclaimer_offset(), nr_succeeded); 1677 1678 return nr_succeeded; 1679 } 1680 1681 static bool may_enter_fs(struct folio *folio, gfp_t gfp_mask) 1682 { 1683 if (gfp_mask & __GFP_FS) 1684 return true; 1685 if (!folio_test_swapcache(folio) || !(gfp_mask & __GFP_IO)) 1686 return false; 1687 /* 1688 * We can "enter_fs" for swap-cache with only __GFP_IO 1689 * providing this isn't SWP_FS_OPS. 1690 * ->flags can be updated non-atomicially (scan_swap_map_slots), 1691 * but that will never affect SWP_FS_OPS, so the data_race 1692 * is safe. 1693 */ 1694 return !data_race(folio_swap_flags(folio) & SWP_FS_OPS); 1695 } 1696 1697 /* 1698 * shrink_folio_list() returns the number of reclaimed pages 1699 */ 1700 static unsigned int shrink_folio_list(struct list_head *folio_list, 1701 struct pglist_data *pgdat, struct scan_control *sc, 1702 struct reclaim_stat *stat, bool ignore_references) 1703 { 1704 LIST_HEAD(ret_folios); 1705 LIST_HEAD(free_folios); 1706 LIST_HEAD(demote_folios); 1707 unsigned int nr_reclaimed = 0; 1708 unsigned int pgactivate = 0; 1709 bool do_demote_pass; 1710 struct swap_iocb *plug = NULL; 1711 1712 memset(stat, 0, sizeof(*stat)); 1713 cond_resched(); 1714 do_demote_pass = can_demote(pgdat->node_id, sc); 1715 1716 retry: 1717 while (!list_empty(folio_list)) { 1718 struct address_space *mapping; 1719 struct folio *folio; 1720 enum folio_references references = FOLIOREF_RECLAIM; 1721 bool dirty, writeback; 1722 unsigned int nr_pages; 1723 1724 cond_resched(); 1725 1726 folio = lru_to_folio(folio_list); 1727 list_del(&folio->lru); 1728 1729 if (!folio_trylock(folio)) 1730 goto keep; 1731 1732 VM_BUG_ON_FOLIO(folio_test_active(folio), folio); 1733 1734 nr_pages = folio_nr_pages(folio); 1735 1736 /* Account the number of base pages */ 1737 sc->nr_scanned += nr_pages; 1738 1739 if (unlikely(!folio_evictable(folio))) 1740 goto activate_locked; 1741 1742 if (!sc->may_unmap && folio_mapped(folio)) 1743 goto keep_locked; 1744 1745 /* folio_update_gen() tried to promote this page? */ 1746 if (lru_gen_enabled() && !ignore_references && 1747 folio_mapped(folio) && folio_test_referenced(folio)) 1748 goto keep_locked; 1749 1750 /* 1751 * The number of dirty pages determines if a node is marked 1752 * reclaim_congested. kswapd will stall and start writing 1753 * folios if the tail of the LRU is all dirty unqueued folios. 1754 */ 1755 folio_check_dirty_writeback(folio, &dirty, &writeback); 1756 if (dirty || writeback) 1757 stat->nr_dirty += nr_pages; 1758 1759 if (dirty && !writeback) 1760 stat->nr_unqueued_dirty += nr_pages; 1761 1762 /* 1763 * Treat this folio as congested if folios are cycling 1764 * through the LRU so quickly that the folios marked 1765 * for immediate reclaim are making it to the end of 1766 * the LRU a second time. 1767 */ 1768 if (writeback && folio_test_reclaim(folio)) 1769 stat->nr_congested += nr_pages; 1770 1771 /* 1772 * If a folio at the tail of the LRU is under writeback, there 1773 * are three cases to consider. 1774 * 1775 * 1) If reclaim is encountering an excessive number 1776 * of folios under writeback and this folio has both 1777 * the writeback and reclaim flags set, then it 1778 * indicates that folios are being queued for I/O but 1779 * are being recycled through the LRU before the I/O 1780 * can complete. Waiting on the folio itself risks an 1781 * indefinite stall if it is impossible to writeback 1782 * the folio due to I/O error or disconnected storage 1783 * so instead note that the LRU is being scanned too 1784 * quickly and the caller can stall after the folio 1785 * list has been processed. 1786 * 1787 * 2) Global or new memcg reclaim encounters a folio that is 1788 * not marked for immediate reclaim, or the caller does not 1789 * have __GFP_FS (or __GFP_IO if it's simply going to swap, 1790 * not to fs). In this case mark the folio for immediate 1791 * reclaim and continue scanning. 1792 * 1793 * Require may_enter_fs() because we would wait on fs, which 1794 * may not have submitted I/O yet. And the loop driver might 1795 * enter reclaim, and deadlock if it waits on a folio for 1796 * which it is needed to do the write (loop masks off 1797 * __GFP_IO|__GFP_FS for this reason); but more thought 1798 * would probably show more reasons. 1799 * 1800 * 3) Legacy memcg encounters a folio that already has the 1801 * reclaim flag set. memcg does not have any dirty folio 1802 * throttling so we could easily OOM just because too many 1803 * folios are in writeback and there is nothing else to 1804 * reclaim. Wait for the writeback to complete. 1805 * 1806 * In cases 1) and 2) we activate the folios to get them out of 1807 * the way while we continue scanning for clean folios on the 1808 * inactive list and refilling from the active list. The 1809 * observation here is that waiting for disk writes is more 1810 * expensive than potentially causing reloads down the line. 1811 * Since they're marked for immediate reclaim, they won't put 1812 * memory pressure on the cache working set any longer than it 1813 * takes to write them to disk. 1814 */ 1815 if (folio_test_writeback(folio)) { 1816 /* Case 1 above */ 1817 if (current_is_kswapd() && 1818 folio_test_reclaim(folio) && 1819 test_bit(PGDAT_WRITEBACK, &pgdat->flags)) { 1820 stat->nr_immediate += nr_pages; 1821 goto activate_locked; 1822 1823 /* Case 2 above */ 1824 } else if (writeback_throttling_sane(sc) || 1825 !folio_test_reclaim(folio) || 1826 !may_enter_fs(folio, sc->gfp_mask)) { 1827 /* 1828 * This is slightly racy - 1829 * folio_end_writeback() might have 1830 * just cleared the reclaim flag, then 1831 * setting the reclaim flag here ends up 1832 * interpreted as the readahead flag - but 1833 * that does not matter enough to care. 1834 * What we do want is for this folio to 1835 * have the reclaim flag set next time 1836 * memcg reclaim reaches the tests above, 1837 * so it will then wait for writeback to 1838 * avoid OOM; and it's also appropriate 1839 * in global reclaim. 1840 */ 1841 folio_set_reclaim(folio); 1842 stat->nr_writeback += nr_pages; 1843 goto activate_locked; 1844 1845 /* Case 3 above */ 1846 } else { 1847 folio_unlock(folio); 1848 folio_wait_writeback(folio); 1849 /* then go back and try same folio again */ 1850 list_add_tail(&folio->lru, folio_list); 1851 continue; 1852 } 1853 } 1854 1855 if (!ignore_references) 1856 references = folio_check_references(folio, sc); 1857 1858 switch (references) { 1859 case FOLIOREF_ACTIVATE: 1860 goto activate_locked; 1861 case FOLIOREF_KEEP: 1862 stat->nr_ref_keep += nr_pages; 1863 goto keep_locked; 1864 case FOLIOREF_RECLAIM: 1865 case FOLIOREF_RECLAIM_CLEAN: 1866 ; /* try to reclaim the folio below */ 1867 } 1868 1869 /* 1870 * Before reclaiming the folio, try to relocate 1871 * its contents to another node. 1872 */ 1873 if (do_demote_pass && 1874 (thp_migration_supported() || !folio_test_large(folio))) { 1875 list_add(&folio->lru, &demote_folios); 1876 folio_unlock(folio); 1877 continue; 1878 } 1879 1880 /* 1881 * Anonymous process memory has backing store? 1882 * Try to allocate it some swap space here. 1883 * Lazyfree folio could be freed directly 1884 */ 1885 if (folio_test_anon(folio) && folio_test_swapbacked(folio)) { 1886 if (!folio_test_swapcache(folio)) { 1887 if (!(sc->gfp_mask & __GFP_IO)) 1888 goto keep_locked; 1889 if (folio_maybe_dma_pinned(folio)) 1890 goto keep_locked; 1891 if (folio_test_large(folio)) { 1892 /* cannot split folio, skip it */ 1893 if (!can_split_folio(folio, NULL)) 1894 goto activate_locked; 1895 /* 1896 * Split folios without a PMD map right 1897 * away. Chances are some or all of the 1898 * tail pages can be freed without IO. 1899 */ 1900 if (!folio_entire_mapcount(folio) && 1901 split_folio_to_list(folio, 1902 folio_list)) 1903 goto activate_locked; 1904 } 1905 if (!add_to_swap(folio)) { 1906 if (!folio_test_large(folio)) 1907 goto activate_locked_split; 1908 /* Fallback to swap normal pages */ 1909 if (split_folio_to_list(folio, 1910 folio_list)) 1911 goto activate_locked; 1912 #ifdef CONFIG_TRANSPARENT_HUGEPAGE 1913 count_vm_event(THP_SWPOUT_FALLBACK); 1914 #endif 1915 if (!add_to_swap(folio)) 1916 goto activate_locked_split; 1917 } 1918 } 1919 } else if (folio_test_swapbacked(folio) && 1920 folio_test_large(folio)) { 1921 /* Split shmem folio */ 1922 if (split_folio_to_list(folio, folio_list)) 1923 goto keep_locked; 1924 } 1925 1926 /* 1927 * If the folio was split above, the tail pages will make 1928 * their own pass through this function and be accounted 1929 * then. 1930 */ 1931 if ((nr_pages > 1) && !folio_test_large(folio)) { 1932 sc->nr_scanned -= (nr_pages - 1); 1933 nr_pages = 1; 1934 } 1935 1936 /* 1937 * The folio is mapped into the page tables of one or more 1938 * processes. Try to unmap it here. 1939 */ 1940 if (folio_mapped(folio)) { 1941 enum ttu_flags flags = TTU_BATCH_FLUSH; 1942 bool was_swapbacked = folio_test_swapbacked(folio); 1943 1944 if (folio_test_pmd_mappable(folio)) 1945 flags |= TTU_SPLIT_HUGE_PMD; 1946 1947 try_to_unmap(folio, flags); 1948 if (folio_mapped(folio)) { 1949 stat->nr_unmap_fail += nr_pages; 1950 if (!was_swapbacked && 1951 folio_test_swapbacked(folio)) 1952 stat->nr_lazyfree_fail += nr_pages; 1953 goto activate_locked; 1954 } 1955 } 1956 1957 /* 1958 * Folio is unmapped now so it cannot be newly pinned anymore. 1959 * No point in trying to reclaim folio if it is pinned. 1960 * Furthermore we don't want to reclaim underlying fs metadata 1961 * if the folio is pinned and thus potentially modified by the 1962 * pinning process as that may upset the filesystem. 1963 */ 1964 if (folio_maybe_dma_pinned(folio)) 1965 goto activate_locked; 1966 1967 mapping = folio_mapping(folio); 1968 if (folio_test_dirty(folio)) { 1969 /* 1970 * Only kswapd can writeback filesystem folios 1971 * to avoid risk of stack overflow. But avoid 1972 * injecting inefficient single-folio I/O into 1973 * flusher writeback as much as possible: only 1974 * write folios when we've encountered many 1975 * dirty folios, and when we've already scanned 1976 * the rest of the LRU for clean folios and see 1977 * the same dirty folios again (with the reclaim 1978 * flag set). 1979 */ 1980 if (folio_is_file_lru(folio) && 1981 (!current_is_kswapd() || 1982 !folio_test_reclaim(folio) || 1983 !test_bit(PGDAT_DIRTY, &pgdat->flags))) { 1984 /* 1985 * Immediately reclaim when written back. 1986 * Similar in principle to folio_deactivate() 1987 * except we already have the folio isolated 1988 * and know it's dirty 1989 */ 1990 node_stat_mod_folio(folio, NR_VMSCAN_IMMEDIATE, 1991 nr_pages); 1992 folio_set_reclaim(folio); 1993 1994 goto activate_locked; 1995 } 1996 1997 if (references == FOLIOREF_RECLAIM_CLEAN) 1998 goto keep_locked; 1999 if (!may_enter_fs(folio, sc->gfp_mask)) 2000 goto keep_locked; 2001 if (!sc->may_writepage) 2002 goto keep_locked; 2003 2004 /* 2005 * Folio is dirty. Flush the TLB if a writable entry 2006 * potentially exists to avoid CPU writes after I/O 2007 * starts and then write it out here. 2008 */ 2009 try_to_unmap_flush_dirty(); 2010 switch (pageout(folio, mapping, &plug)) { 2011 case PAGE_KEEP: 2012 goto keep_locked; 2013 case PAGE_ACTIVATE: 2014 goto activate_locked; 2015 case PAGE_SUCCESS: 2016 stat->nr_pageout += nr_pages; 2017 2018 if (folio_test_writeback(folio)) 2019 goto keep; 2020 if (folio_test_dirty(folio)) 2021 goto keep; 2022 2023 /* 2024 * A synchronous write - probably a ramdisk. Go 2025 * ahead and try to reclaim the folio. 2026 */ 2027 if (!folio_trylock(folio)) 2028 goto keep; 2029 if (folio_test_dirty(folio) || 2030 folio_test_writeback(folio)) 2031 goto keep_locked; 2032 mapping = folio_mapping(folio); 2033 fallthrough; 2034 case PAGE_CLEAN: 2035 ; /* try to free the folio below */ 2036 } 2037 } 2038 2039 /* 2040 * If the folio has buffers, try to free the buffer 2041 * mappings associated with this folio. If we succeed 2042 * we try to free the folio as well. 2043 * 2044 * We do this even if the folio is dirty. 2045 * filemap_release_folio() does not perform I/O, but it 2046 * is possible for a folio to have the dirty flag set, 2047 * but it is actually clean (all its buffers are clean). 2048 * This happens if the buffers were written out directly, 2049 * with submit_bh(). ext3 will do this, as well as 2050 * the blockdev mapping. filemap_release_folio() will 2051 * discover that cleanness and will drop the buffers 2052 * and mark the folio clean - it can be freed. 2053 * 2054 * Rarely, folios can have buffers and no ->mapping. 2055 * These are the folios which were not successfully 2056 * invalidated in truncate_cleanup_folio(). We try to 2057 * drop those buffers here and if that worked, and the 2058 * folio is no longer mapped into process address space 2059 * (refcount == 1) it can be freed. Otherwise, leave 2060 * the folio on the LRU so it is swappable. 2061 */ 2062 if (folio_has_private(folio)) { 2063 if (!filemap_release_folio(folio, sc->gfp_mask)) 2064 goto activate_locked; 2065 if (!mapping && folio_ref_count(folio) == 1) { 2066 folio_unlock(folio); 2067 if (folio_put_testzero(folio)) 2068 goto free_it; 2069 else { 2070 /* 2071 * rare race with speculative reference. 2072 * the speculative reference will free 2073 * this folio shortly, so we may 2074 * increment nr_reclaimed here (and 2075 * leave it off the LRU). 2076 */ 2077 nr_reclaimed += nr_pages; 2078 continue; 2079 } 2080 } 2081 } 2082 2083 if (folio_test_anon(folio) && !folio_test_swapbacked(folio)) { 2084 /* follow __remove_mapping for reference */ 2085 if (!folio_ref_freeze(folio, 1)) 2086 goto keep_locked; 2087 /* 2088 * The folio has only one reference left, which is 2089 * from the isolation. After the caller puts the 2090 * folio back on the lru and drops the reference, the 2091 * folio will be freed anyway. It doesn't matter 2092 * which lru it goes on. So we don't bother checking 2093 * the dirty flag here. 2094 */ 2095 count_vm_events(PGLAZYFREED, nr_pages); 2096 count_memcg_folio_events(folio, PGLAZYFREED, nr_pages); 2097 } else if (!mapping || !__remove_mapping(mapping, folio, true, 2098 sc->target_mem_cgroup)) 2099 goto keep_locked; 2100 2101 folio_unlock(folio); 2102 free_it: 2103 /* 2104 * Folio may get swapped out as a whole, need to account 2105 * all pages in it. 2106 */ 2107 nr_reclaimed += nr_pages; 2108 2109 /* 2110 * Is there need to periodically free_folio_list? It would 2111 * appear not as the counts should be low 2112 */ 2113 if (unlikely(folio_test_large(folio))) 2114 destroy_large_folio(folio); 2115 else 2116 list_add(&folio->lru, &free_folios); 2117 continue; 2118 2119 activate_locked_split: 2120 /* 2121 * The tail pages that are failed to add into swap cache 2122 * reach here. Fixup nr_scanned and nr_pages. 2123 */ 2124 if (nr_pages > 1) { 2125 sc->nr_scanned -= (nr_pages - 1); 2126 nr_pages = 1; 2127 } 2128 activate_locked: 2129 /* Not a candidate for swapping, so reclaim swap space. */ 2130 if (folio_test_swapcache(folio) && 2131 (mem_cgroup_swap_full(folio) || folio_test_mlocked(folio))) 2132 folio_free_swap(folio); 2133 VM_BUG_ON_FOLIO(folio_test_active(folio), folio); 2134 if (!folio_test_mlocked(folio)) { 2135 int type = folio_is_file_lru(folio); 2136 folio_set_active(folio); 2137 stat->nr_activate[type] += nr_pages; 2138 count_memcg_folio_events(folio, PGACTIVATE, nr_pages); 2139 } 2140 keep_locked: 2141 folio_unlock(folio); 2142 keep: 2143 list_add(&folio->lru, &ret_folios); 2144 VM_BUG_ON_FOLIO(folio_test_lru(folio) || 2145 folio_test_unevictable(folio), folio); 2146 } 2147 /* 'folio_list' is always empty here */ 2148 2149 /* Migrate folios selected for demotion */ 2150 nr_reclaimed += demote_folio_list(&demote_folios, pgdat); 2151 /* Folios that could not be demoted are still in @demote_folios */ 2152 if (!list_empty(&demote_folios)) { 2153 /* Folios which weren't demoted go back on @folio_list */ 2154 list_splice_init(&demote_folios, folio_list); 2155 2156 /* 2157 * goto retry to reclaim the undemoted folios in folio_list if 2158 * desired. 2159 * 2160 * Reclaiming directly from top tier nodes is not often desired 2161 * due to it breaking the LRU ordering: in general memory 2162 * should be reclaimed from lower tier nodes and demoted from 2163 * top tier nodes. 2164 * 2165 * However, disabling reclaim from top tier nodes entirely 2166 * would cause ooms in edge scenarios where lower tier memory 2167 * is unreclaimable for whatever reason, eg memory being 2168 * mlocked or too hot to reclaim. We can disable reclaim 2169 * from top tier nodes in proactive reclaim though as that is 2170 * not real memory pressure. 2171 */ 2172 if (!sc->proactive) { 2173 do_demote_pass = false; 2174 goto retry; 2175 } 2176 } 2177 2178 pgactivate = stat->nr_activate[0] + stat->nr_activate[1]; 2179 2180 mem_cgroup_uncharge_list(&free_folios); 2181 try_to_unmap_flush(); 2182 free_unref_page_list(&free_folios); 2183 2184 list_splice(&ret_folios, folio_list); 2185 count_vm_events(PGACTIVATE, pgactivate); 2186 2187 if (plug) 2188 swap_write_unplug(plug); 2189 return nr_reclaimed; 2190 } 2191 2192 unsigned int reclaim_clean_pages_from_list(struct zone *zone, 2193 struct list_head *folio_list) 2194 { 2195 struct scan_control sc = { 2196 .gfp_mask = GFP_KERNEL, 2197 .may_unmap = 1, 2198 }; 2199 struct reclaim_stat stat; 2200 unsigned int nr_reclaimed; 2201 struct folio *folio, *next; 2202 LIST_HEAD(clean_folios); 2203 unsigned int noreclaim_flag; 2204 2205 list_for_each_entry_safe(folio, next, folio_list, lru) { 2206 if (!folio_test_hugetlb(folio) && folio_is_file_lru(folio) && 2207 !folio_test_dirty(folio) && !__folio_test_movable(folio) && 2208 !folio_test_unevictable(folio)) { 2209 folio_clear_active(folio); 2210 list_move(&folio->lru, &clean_folios); 2211 } 2212 } 2213 2214 /* 2215 * We should be safe here since we are only dealing with file pages and 2216 * we are not kswapd and therefore cannot write dirty file pages. But 2217 * call memalloc_noreclaim_save() anyway, just in case these conditions 2218 * change in the future. 2219 */ 2220 noreclaim_flag = memalloc_noreclaim_save(); 2221 nr_reclaimed = shrink_folio_list(&clean_folios, zone->zone_pgdat, &sc, 2222 &stat, true); 2223 memalloc_noreclaim_restore(noreclaim_flag); 2224 2225 list_splice(&clean_folios, folio_list); 2226 mod_node_page_state(zone->zone_pgdat, NR_ISOLATED_FILE, 2227 -(long)nr_reclaimed); 2228 /* 2229 * Since lazyfree pages are isolated from file LRU from the beginning, 2230 * they will rotate back to anonymous LRU in the end if it failed to 2231 * discard so isolated count will be mismatched. 2232 * Compensate the isolated count for both LRU lists. 2233 */ 2234 mod_node_page_state(zone->zone_pgdat, NR_ISOLATED_ANON, 2235 stat.nr_lazyfree_fail); 2236 mod_node_page_state(zone->zone_pgdat, NR_ISOLATED_FILE, 2237 -(long)stat.nr_lazyfree_fail); 2238 return nr_reclaimed; 2239 } 2240 2241 /* 2242 * Update LRU sizes after isolating pages. The LRU size updates must 2243 * be complete before mem_cgroup_update_lru_size due to a sanity check. 2244 */ 2245 static __always_inline void update_lru_sizes(struct lruvec *lruvec, 2246 enum lru_list lru, unsigned long *nr_zone_taken) 2247 { 2248 int zid; 2249 2250 for (zid = 0; zid < MAX_NR_ZONES; zid++) { 2251 if (!nr_zone_taken[zid]) 2252 continue; 2253 2254 update_lru_size(lruvec, lru, zid, -nr_zone_taken[zid]); 2255 } 2256 2257 } 2258 2259 #ifdef CONFIG_CMA 2260 /* 2261 * It is waste of effort to scan and reclaim CMA pages if it is not available 2262 * for current allocation context. Kswapd can not be enrolled as it can not 2263 * distinguish this scenario by using sc->gfp_mask = GFP_KERNEL 2264 */ 2265 static bool skip_cma(struct folio *folio, struct scan_control *sc) 2266 { 2267 return !current_is_kswapd() && 2268 gfp_migratetype(sc->gfp_mask) != MIGRATE_MOVABLE && 2269 get_pageblock_migratetype(&folio->page) == MIGRATE_CMA; 2270 } 2271 #else 2272 static bool skip_cma(struct folio *folio, struct scan_control *sc) 2273 { 2274 return false; 2275 } 2276 #endif 2277 2278 /* 2279 * Isolating page from the lruvec to fill in @dst list by nr_to_scan times. 2280 * 2281 * lruvec->lru_lock is heavily contended. Some of the functions that 2282 * shrink the lists perform better by taking out a batch of pages 2283 * and working on them outside the LRU lock. 2284 * 2285 * For pagecache intensive workloads, this function is the hottest 2286 * spot in the kernel (apart from copy_*_user functions). 2287 * 2288 * Lru_lock must be held before calling this function. 2289 * 2290 * @nr_to_scan: The number of eligible pages to look through on the list. 2291 * @lruvec: The LRU vector to pull pages from. 2292 * @dst: The temp list to put pages on to. 2293 * @nr_scanned: The number of pages that were scanned. 2294 * @sc: The scan_control struct for this reclaim session 2295 * @lru: LRU list id for isolating 2296 * 2297 * returns how many pages were moved onto *@dst. 2298 */ 2299 static unsigned long isolate_lru_folios(unsigned long nr_to_scan, 2300 struct lruvec *lruvec, struct list_head *dst, 2301 unsigned long *nr_scanned, struct scan_control *sc, 2302 enum lru_list lru) 2303 { 2304 struct list_head *src = &lruvec->lists[lru]; 2305 unsigned long nr_taken = 0; 2306 unsigned long nr_zone_taken[MAX_NR_ZONES] = { 0 }; 2307 unsigned long nr_skipped[MAX_NR_ZONES] = { 0, }; 2308 unsigned long skipped = 0; 2309 unsigned long scan, total_scan, nr_pages; 2310 LIST_HEAD(folios_skipped); 2311 2312 total_scan = 0; 2313 scan = 0; 2314 while (scan < nr_to_scan && !list_empty(src)) { 2315 struct list_head *move_to = src; 2316 struct folio *folio; 2317 2318 folio = lru_to_folio(src); 2319 prefetchw_prev_lru_folio(folio, src, flags); 2320 2321 nr_pages = folio_nr_pages(folio); 2322 total_scan += nr_pages; 2323 2324 if (folio_zonenum(folio) > sc->reclaim_idx || 2325 skip_cma(folio, sc)) { 2326 nr_skipped[folio_zonenum(folio)] += nr_pages; 2327 move_to = &folios_skipped; 2328 goto move; 2329 } 2330 2331 /* 2332 * Do not count skipped folios because that makes the function 2333 * return with no isolated folios if the LRU mostly contains 2334 * ineligible folios. This causes the VM to not reclaim any 2335 * folios, triggering a premature OOM. 2336 * Account all pages in a folio. 2337 */ 2338 scan += nr_pages; 2339 2340 if (!folio_test_lru(folio)) 2341 goto move; 2342 if (!sc->may_unmap && folio_mapped(folio)) 2343 goto move; 2344 2345 /* 2346 * Be careful not to clear the lru flag until after we're 2347 * sure the folio is not being freed elsewhere -- the 2348 * folio release code relies on it. 2349 */ 2350 if (unlikely(!folio_try_get(folio))) 2351 goto move; 2352 2353 if (!folio_test_clear_lru(folio)) { 2354 /* Another thread is already isolating this folio */ 2355 folio_put(folio); 2356 goto move; 2357 } 2358 2359 nr_taken += nr_pages; 2360 nr_zone_taken[folio_zonenum(folio)] += nr_pages; 2361 move_to = dst; 2362 move: 2363 list_move(&folio->lru, move_to); 2364 } 2365 2366 /* 2367 * Splice any skipped folios to the start of the LRU list. Note that 2368 * this disrupts the LRU order when reclaiming for lower zones but 2369 * we cannot splice to the tail. If we did then the SWAP_CLUSTER_MAX 2370 * scanning would soon rescan the same folios to skip and waste lots 2371 * of cpu cycles. 2372 */ 2373 if (!list_empty(&folios_skipped)) { 2374 int zid; 2375 2376 list_splice(&folios_skipped, src); 2377 for (zid = 0; zid < MAX_NR_ZONES; zid++) { 2378 if (!nr_skipped[zid]) 2379 continue; 2380 2381 __count_zid_vm_events(PGSCAN_SKIP, zid, nr_skipped[zid]); 2382 skipped += nr_skipped[zid]; 2383 } 2384 } 2385 *nr_scanned = total_scan; 2386 trace_mm_vmscan_lru_isolate(sc->reclaim_idx, sc->order, nr_to_scan, 2387 total_scan, skipped, nr_taken, 2388 sc->may_unmap ? 0 : ISOLATE_UNMAPPED, lru); 2389 update_lru_sizes(lruvec, lru, nr_zone_taken); 2390 return nr_taken; 2391 } 2392 2393 /** 2394 * folio_isolate_lru() - Try to isolate a folio from its LRU list. 2395 * @folio: Folio to isolate from its LRU list. 2396 * 2397 * Isolate a @folio from an LRU list and adjust the vmstat statistic 2398 * corresponding to whatever LRU list the folio was on. 2399 * 2400 * The folio will have its LRU flag cleared. If it was found on the 2401 * active list, it will have the Active flag set. If it was found on the 2402 * unevictable list, it will have the Unevictable flag set. These flags 2403 * may need to be cleared by the caller before letting the page go. 2404 * 2405 * Context: 2406 * 2407 * (1) Must be called with an elevated refcount on the folio. This is a 2408 * fundamental difference from isolate_lru_folios() (which is called 2409 * without a stable reference). 2410 * (2) The lru_lock must not be held. 2411 * (3) Interrupts must be enabled. 2412 * 2413 * Return: true if the folio was removed from an LRU list. 2414 * false if the folio was not on an LRU list. 2415 */ 2416 bool folio_isolate_lru(struct folio *folio) 2417 { 2418 bool ret = false; 2419 2420 VM_BUG_ON_FOLIO(!folio_ref_count(folio), folio); 2421 2422 if (folio_test_clear_lru(folio)) { 2423 struct lruvec *lruvec; 2424 2425 folio_get(folio); 2426 lruvec = folio_lruvec_lock_irq(folio); 2427 lruvec_del_folio(lruvec, folio); 2428 unlock_page_lruvec_irq(lruvec); 2429 ret = true; 2430 } 2431 2432 return ret; 2433 } 2434 2435 /* 2436 * A direct reclaimer may isolate SWAP_CLUSTER_MAX pages from the LRU list and 2437 * then get rescheduled. When there are massive number of tasks doing page 2438 * allocation, such sleeping direct reclaimers may keep piling up on each CPU, 2439 * the LRU list will go small and be scanned faster than necessary, leading to 2440 * unnecessary swapping, thrashing and OOM. 2441 */ 2442 static int too_many_isolated(struct pglist_data *pgdat, int file, 2443 struct scan_control *sc) 2444 { 2445 unsigned long inactive, isolated; 2446 bool too_many; 2447 2448 if (current_is_kswapd()) 2449 return 0; 2450 2451 if (!writeback_throttling_sane(sc)) 2452 return 0; 2453 2454 if (file) { 2455 inactive = node_page_state(pgdat, NR_INACTIVE_FILE); 2456 isolated = node_page_state(pgdat, NR_ISOLATED_FILE); 2457 } else { 2458 inactive = node_page_state(pgdat, NR_INACTIVE_ANON); 2459 isolated = node_page_state(pgdat, NR_ISOLATED_ANON); 2460 } 2461 2462 /* 2463 * GFP_NOIO/GFP_NOFS callers are allowed to isolate more pages, so they 2464 * won't get blocked by normal direct-reclaimers, forming a circular 2465 * deadlock. 2466 */ 2467 if (gfp_has_io_fs(sc->gfp_mask)) 2468 inactive >>= 3; 2469 2470 too_many = isolated > inactive; 2471 2472 /* Wake up tasks throttled due to too_many_isolated. */ 2473 if (!too_many) 2474 wake_throttle_isolated(pgdat); 2475 2476 return too_many; 2477 } 2478 2479 /* 2480 * move_folios_to_lru() moves folios from private @list to appropriate LRU list. 2481 * On return, @list is reused as a list of folios to be freed by the caller. 2482 * 2483 * Returns the number of pages moved to the given lruvec. 2484 */ 2485 static unsigned int move_folios_to_lru(struct lruvec *lruvec, 2486 struct list_head *list) 2487 { 2488 int nr_pages, nr_moved = 0; 2489 LIST_HEAD(folios_to_free); 2490 2491 while (!list_empty(list)) { 2492 struct folio *folio = lru_to_folio(list); 2493 2494 VM_BUG_ON_FOLIO(folio_test_lru(folio), folio); 2495 list_del(&folio->lru); 2496 if (unlikely(!folio_evictable(folio))) { 2497 spin_unlock_irq(&lruvec->lru_lock); 2498 folio_putback_lru(folio); 2499 spin_lock_irq(&lruvec->lru_lock); 2500 continue; 2501 } 2502 2503 /* 2504 * The folio_set_lru needs to be kept here for list integrity. 2505 * Otherwise: 2506 * #0 move_folios_to_lru #1 release_pages 2507 * if (!folio_put_testzero()) 2508 * if (folio_put_testzero()) 2509 * !lru //skip lru_lock 2510 * folio_set_lru() 2511 * list_add(&folio->lru,) 2512 * list_add(&folio->lru,) 2513 */ 2514 folio_set_lru(folio); 2515 2516 if (unlikely(folio_put_testzero(folio))) { 2517 __folio_clear_lru_flags(folio); 2518 2519 if (unlikely(folio_test_large(folio))) { 2520 spin_unlock_irq(&lruvec->lru_lock); 2521 destroy_large_folio(folio); 2522 spin_lock_irq(&lruvec->lru_lock); 2523 } else 2524 list_add(&folio->lru, &folios_to_free); 2525 2526 continue; 2527 } 2528 2529 /* 2530 * All pages were isolated from the same lruvec (and isolation 2531 * inhibits memcg migration). 2532 */ 2533 VM_BUG_ON_FOLIO(!folio_matches_lruvec(folio, lruvec), folio); 2534 lruvec_add_folio(lruvec, folio); 2535 nr_pages = folio_nr_pages(folio); 2536 nr_moved += nr_pages; 2537 if (folio_test_active(folio)) 2538 workingset_age_nonresident(lruvec, nr_pages); 2539 } 2540 2541 /* 2542 * To save our caller's stack, now use input list for pages to free. 2543 */ 2544 list_splice(&folios_to_free, list); 2545 2546 return nr_moved; 2547 } 2548 2549 /* 2550 * If a kernel thread (such as nfsd for loop-back mounts) services a backing 2551 * device by writing to the page cache it sets PF_LOCAL_THROTTLE. In this case 2552 * we should not throttle. Otherwise it is safe to do so. 2553 */ 2554 static int current_may_throttle(void) 2555 { 2556 return !(current->flags & PF_LOCAL_THROTTLE); 2557 } 2558 2559 /* 2560 * shrink_inactive_list() is a helper for shrink_node(). It returns the number 2561 * of reclaimed pages 2562 */ 2563 static unsigned long shrink_inactive_list(unsigned long nr_to_scan, 2564 struct lruvec *lruvec, struct scan_control *sc, 2565 enum lru_list lru) 2566 { 2567 LIST_HEAD(folio_list); 2568 unsigned long nr_scanned; 2569 unsigned int nr_reclaimed = 0; 2570 unsigned long nr_taken; 2571 struct reclaim_stat stat; 2572 bool file = is_file_lru(lru); 2573 enum vm_event_item item; 2574 struct pglist_data *pgdat = lruvec_pgdat(lruvec); 2575 bool stalled = false; 2576 2577 while (unlikely(too_many_isolated(pgdat, file, sc))) { 2578 if (stalled) 2579 return 0; 2580 2581 /* wait a bit for the reclaimer. */ 2582 stalled = true; 2583 reclaim_throttle(pgdat, VMSCAN_THROTTLE_ISOLATED); 2584 2585 /* We are about to die and free our memory. Return now. */ 2586 if (fatal_signal_pending(current)) 2587 return SWAP_CLUSTER_MAX; 2588 } 2589 2590 lru_add_drain(); 2591 2592 spin_lock_irq(&lruvec->lru_lock); 2593 2594 nr_taken = isolate_lru_folios(nr_to_scan, lruvec, &folio_list, 2595 &nr_scanned, sc, lru); 2596 2597 __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, nr_taken); 2598 item = PGSCAN_KSWAPD + reclaimer_offset(); 2599 if (!cgroup_reclaim(sc)) 2600 __count_vm_events(item, nr_scanned); 2601 __count_memcg_events(lruvec_memcg(lruvec), item, nr_scanned); 2602 __count_vm_events(PGSCAN_ANON + file, nr_scanned); 2603 2604 spin_unlock_irq(&lruvec->lru_lock); 2605 2606 if (nr_taken == 0) 2607 return 0; 2608 2609 nr_reclaimed = shrink_folio_list(&folio_list, pgdat, sc, &stat, false); 2610 2611 spin_lock_irq(&lruvec->lru_lock); 2612 move_folios_to_lru(lruvec, &folio_list); 2613 2614 __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, -nr_taken); 2615 item = PGSTEAL_KSWAPD + reclaimer_offset(); 2616 if (!cgroup_reclaim(sc)) 2617 __count_vm_events(item, nr_reclaimed); 2618 __count_memcg_events(lruvec_memcg(lruvec), item, nr_reclaimed); 2619 __count_vm_events(PGSTEAL_ANON + file, nr_reclaimed); 2620 spin_unlock_irq(&lruvec->lru_lock); 2621 2622 lru_note_cost(lruvec, file, stat.nr_pageout, nr_scanned - nr_reclaimed); 2623 mem_cgroup_uncharge_list(&folio_list); 2624 free_unref_page_list(&folio_list); 2625 2626 /* 2627 * If dirty folios are scanned that are not queued for IO, it 2628 * implies that flushers are not doing their job. This can 2629 * happen when memory pressure pushes dirty folios to the end of 2630 * the LRU before the dirty limits are breached and the dirty 2631 * data has expired. It can also happen when the proportion of 2632 * dirty folios grows not through writes but through memory 2633 * pressure reclaiming all the clean cache. And in some cases, 2634 * the flushers simply cannot keep up with the allocation 2635 * rate. Nudge the flusher threads in case they are asleep. 2636 */ 2637 if (stat.nr_unqueued_dirty == nr_taken) { 2638 wakeup_flusher_threads(WB_REASON_VMSCAN); 2639 /* 2640 * For cgroupv1 dirty throttling is achieved by waking up 2641 * the kernel flusher here and later waiting on folios 2642 * which are in writeback to finish (see shrink_folio_list()). 2643 * 2644 * Flusher may not be able to issue writeback quickly 2645 * enough for cgroupv1 writeback throttling to work 2646 * on a large system. 2647 */ 2648 if (!writeback_throttling_sane(sc)) 2649 reclaim_throttle(pgdat, VMSCAN_THROTTLE_WRITEBACK); 2650 } 2651 2652 sc->nr.dirty += stat.nr_dirty; 2653 sc->nr.congested += stat.nr_congested; 2654 sc->nr.unqueued_dirty += stat.nr_unqueued_dirty; 2655 sc->nr.writeback += stat.nr_writeback; 2656 sc->nr.immediate += stat.nr_immediate; 2657 sc->nr.taken += nr_taken; 2658 if (file) 2659 sc->nr.file_taken += nr_taken; 2660 2661 trace_mm_vmscan_lru_shrink_inactive(pgdat->node_id, 2662 nr_scanned, nr_reclaimed, &stat, sc->priority, file); 2663 return nr_reclaimed; 2664 } 2665 2666 /* 2667 * shrink_active_list() moves folios from the active LRU to the inactive LRU. 2668 * 2669 * We move them the other way if the folio is referenced by one or more 2670 * processes. 2671 * 2672 * If the folios are mostly unmapped, the processing is fast and it is 2673 * appropriate to hold lru_lock across the whole operation. But if 2674 * the folios are mapped, the processing is slow (folio_referenced()), so 2675 * we should drop lru_lock around each folio. It's impossible to balance 2676 * this, so instead we remove the folios from the LRU while processing them. 2677 * It is safe to rely on the active flag against the non-LRU folios in here 2678 * because nobody will play with that bit on a non-LRU folio. 2679 * 2680 * The downside is that we have to touch folio->_refcount against each folio. 2681 * But we had to alter folio->flags anyway. 2682 */ 2683 static void shrink_active_list(unsigned long nr_to_scan, 2684 struct lruvec *lruvec, 2685 struct scan_control *sc, 2686 enum lru_list lru) 2687 { 2688 unsigned long nr_taken; 2689 unsigned long nr_scanned; 2690 unsigned long vm_flags; 2691 LIST_HEAD(l_hold); /* The folios which were snipped off */ 2692 LIST_HEAD(l_active); 2693 LIST_HEAD(l_inactive); 2694 unsigned nr_deactivate, nr_activate; 2695 unsigned nr_rotated = 0; 2696 int file = is_file_lru(lru); 2697 struct pglist_data *pgdat = lruvec_pgdat(lruvec); 2698 2699 lru_add_drain(); 2700 2701 spin_lock_irq(&lruvec->lru_lock); 2702 2703 nr_taken = isolate_lru_folios(nr_to_scan, lruvec, &l_hold, 2704 &nr_scanned, sc, lru); 2705 2706 __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, nr_taken); 2707 2708 if (!cgroup_reclaim(sc)) 2709 __count_vm_events(PGREFILL, nr_scanned); 2710 __count_memcg_events(lruvec_memcg(lruvec), PGREFILL, nr_scanned); 2711 2712 spin_unlock_irq(&lruvec->lru_lock); 2713 2714 while (!list_empty(&l_hold)) { 2715 struct folio *folio; 2716 2717 cond_resched(); 2718 folio = lru_to_folio(&l_hold); 2719 list_del(&folio->lru); 2720 2721 if (unlikely(!folio_evictable(folio))) { 2722 folio_putback_lru(folio); 2723 continue; 2724 } 2725 2726 if (unlikely(buffer_heads_over_limit)) { 2727 if (folio_test_private(folio) && folio_trylock(folio)) { 2728 if (folio_test_private(folio)) 2729 filemap_release_folio(folio, 0); 2730 folio_unlock(folio); 2731 } 2732 } 2733 2734 /* Referenced or rmap lock contention: rotate */ 2735 if (folio_referenced(folio, 0, sc->target_mem_cgroup, 2736 &vm_flags) != 0) { 2737 /* 2738 * Identify referenced, file-backed active folios and 2739 * give them one more trip around the active list. So 2740 * that executable code get better chances to stay in 2741 * memory under moderate memory pressure. Anon folios 2742 * are not likely to be evicted by use-once streaming 2743 * IO, plus JVM can create lots of anon VM_EXEC folios, 2744 * so we ignore them here. 2745 */ 2746 if ((vm_flags & VM_EXEC) && folio_is_file_lru(folio)) { 2747 nr_rotated += folio_nr_pages(folio); 2748 list_add(&folio->lru, &l_active); 2749 continue; 2750 } 2751 } 2752 2753 folio_clear_active(folio); /* we are de-activating */ 2754 folio_set_workingset(folio); 2755 list_add(&folio->lru, &l_inactive); 2756 } 2757 2758 /* 2759 * Move folios back to the lru list. 2760 */ 2761 spin_lock_irq(&lruvec->lru_lock); 2762 2763 nr_activate = move_folios_to_lru(lruvec, &l_active); 2764 nr_deactivate = move_folios_to_lru(lruvec, &l_inactive); 2765 /* Keep all free folios in l_active list */ 2766 list_splice(&l_inactive, &l_active); 2767 2768 __count_vm_events(PGDEACTIVATE, nr_deactivate); 2769 __count_memcg_events(lruvec_memcg(lruvec), PGDEACTIVATE, nr_deactivate); 2770 2771 __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, -nr_taken); 2772 spin_unlock_irq(&lruvec->lru_lock); 2773 2774 if (nr_rotated) 2775 lru_note_cost(lruvec, file, 0, nr_rotated); 2776 mem_cgroup_uncharge_list(&l_active); 2777 free_unref_page_list(&l_active); 2778 trace_mm_vmscan_lru_shrink_active(pgdat->node_id, nr_taken, nr_activate, 2779 nr_deactivate, nr_rotated, sc->priority, file); 2780 } 2781 2782 static unsigned int reclaim_folio_list(struct list_head *folio_list, 2783 struct pglist_data *pgdat) 2784 { 2785 struct reclaim_stat dummy_stat; 2786 unsigned int nr_reclaimed; 2787 struct folio *folio; 2788 struct scan_control sc = { 2789 .gfp_mask = GFP_KERNEL, 2790 .may_writepage = 1, 2791 .may_unmap = 1, 2792 .may_swap = 1, 2793 .no_demotion = 1, 2794 }; 2795 2796 nr_reclaimed = shrink_folio_list(folio_list, pgdat, &sc, &dummy_stat, false); 2797 while (!list_empty(folio_list)) { 2798 folio = lru_to_folio(folio_list); 2799 list_del(&folio->lru); 2800 folio_putback_lru(folio); 2801 } 2802 2803 return nr_reclaimed; 2804 } 2805 2806 unsigned long reclaim_pages(struct list_head *folio_list) 2807 { 2808 int nid; 2809 unsigned int nr_reclaimed = 0; 2810 LIST_HEAD(node_folio_list); 2811 unsigned int noreclaim_flag; 2812 2813 if (list_empty(folio_list)) 2814 return nr_reclaimed; 2815 2816 noreclaim_flag = memalloc_noreclaim_save(); 2817 2818 nid = folio_nid(lru_to_folio(folio_list)); 2819 do { 2820 struct folio *folio = lru_to_folio(folio_list); 2821 2822 if (nid == folio_nid(folio)) { 2823 folio_clear_active(folio); 2824 list_move(&folio->lru, &node_folio_list); 2825 continue; 2826 } 2827 2828 nr_reclaimed += reclaim_folio_list(&node_folio_list, NODE_DATA(nid)); 2829 nid = folio_nid(lru_to_folio(folio_list)); 2830 } while (!list_empty(folio_list)); 2831 2832 nr_reclaimed += reclaim_folio_list(&node_folio_list, NODE_DATA(nid)); 2833 2834 memalloc_noreclaim_restore(noreclaim_flag); 2835 2836 return nr_reclaimed; 2837 } 2838 2839 static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan, 2840 struct lruvec *lruvec, struct scan_control *sc) 2841 { 2842 if (is_active_lru(lru)) { 2843 if (sc->may_deactivate & (1 << is_file_lru(lru))) 2844 shrink_active_list(nr_to_scan, lruvec, sc, lru); 2845 else 2846 sc->skipped_deactivate = 1; 2847 return 0; 2848 } 2849 2850 return shrink_inactive_list(nr_to_scan, lruvec, sc, lru); 2851 } 2852 2853 /* 2854 * The inactive anon list should be small enough that the VM never has 2855 * to do too much work. 2856 * 2857 * The inactive file list should be small enough to leave most memory 2858 * to the established workingset on the scan-resistant active list, 2859 * but large enough to avoid thrashing the aggregate readahead window. 2860 * 2861 * Both inactive lists should also be large enough that each inactive 2862 * folio has a chance to be referenced again before it is reclaimed. 2863 * 2864 * If that fails and refaulting is observed, the inactive list grows. 2865 * 2866 * The inactive_ratio is the target ratio of ACTIVE to INACTIVE folios 2867 * on this LRU, maintained by the pageout code. An inactive_ratio 2868 * of 3 means 3:1 or 25% of the folios are kept on the inactive list. 2869 * 2870 * total target max 2871 * memory ratio inactive 2872 * ------------------------------------- 2873 * 10MB 1 5MB 2874 * 100MB 1 50MB 2875 * 1GB 3 250MB 2876 * 10GB 10 0.9GB 2877 * 100GB 31 3GB 2878 * 1TB 101 10GB 2879 * 10TB 320 32GB 2880 */ 2881 static bool inactive_is_low(struct lruvec *lruvec, enum lru_list inactive_lru) 2882 { 2883 enum lru_list active_lru = inactive_lru + LRU_ACTIVE; 2884 unsigned long inactive, active; 2885 unsigned long inactive_ratio; 2886 unsigned long gb; 2887 2888 inactive = lruvec_page_state(lruvec, NR_LRU_BASE + inactive_lru); 2889 active = lruvec_page_state(lruvec, NR_LRU_BASE + active_lru); 2890 2891 gb = (inactive + active) >> (30 - PAGE_SHIFT); 2892 if (gb) 2893 inactive_ratio = int_sqrt(10 * gb); 2894 else 2895 inactive_ratio = 1; 2896 2897 return inactive * inactive_ratio < active; 2898 } 2899 2900 enum scan_balance { 2901 SCAN_EQUAL, 2902 SCAN_FRACT, 2903 SCAN_ANON, 2904 SCAN_FILE, 2905 }; 2906 2907 static void prepare_scan_count(pg_data_t *pgdat, struct scan_control *sc) 2908 { 2909 unsigned long file; 2910 struct lruvec *target_lruvec; 2911 2912 if (lru_gen_enabled()) 2913 return; 2914 2915 target_lruvec = mem_cgroup_lruvec(sc->target_mem_cgroup, pgdat); 2916 2917 /* 2918 * Flush the memory cgroup stats, so that we read accurate per-memcg 2919 * lruvec stats for heuristics. 2920 */ 2921 mem_cgroup_flush_stats(); 2922 2923 /* 2924 * Determine the scan balance between anon and file LRUs. 2925 */ 2926 spin_lock_irq(&target_lruvec->lru_lock); 2927 sc->anon_cost = target_lruvec->anon_cost; 2928 sc->file_cost = target_lruvec->file_cost; 2929 spin_unlock_irq(&target_lruvec->lru_lock); 2930 2931 /* 2932 * Target desirable inactive:active list ratios for the anon 2933 * and file LRU lists. 2934 */ 2935 if (!sc->force_deactivate) { 2936 unsigned long refaults; 2937 2938 /* 2939 * When refaults are being observed, it means a new 2940 * workingset is being established. Deactivate to get 2941 * rid of any stale active pages quickly. 2942 */ 2943 refaults = lruvec_page_state(target_lruvec, 2944 WORKINGSET_ACTIVATE_ANON); 2945 if (refaults != target_lruvec->refaults[WORKINGSET_ANON] || 2946 inactive_is_low(target_lruvec, LRU_INACTIVE_ANON)) 2947 sc->may_deactivate |= DEACTIVATE_ANON; 2948 else 2949 sc->may_deactivate &= ~DEACTIVATE_ANON; 2950 2951 refaults = lruvec_page_state(target_lruvec, 2952 WORKINGSET_ACTIVATE_FILE); 2953 if (refaults != target_lruvec->refaults[WORKINGSET_FILE] || 2954 inactive_is_low(target_lruvec, LRU_INACTIVE_FILE)) 2955 sc->may_deactivate |= DEACTIVATE_FILE; 2956 else 2957 sc->may_deactivate &= ~DEACTIVATE_FILE; 2958 } else 2959 sc->may_deactivate = DEACTIVATE_ANON | DEACTIVATE_FILE; 2960 2961 /* 2962 * If we have plenty of inactive file pages that aren't 2963 * thrashing, try to reclaim those first before touching 2964 * anonymous pages. 2965 */ 2966 file = lruvec_page_state(target_lruvec, NR_INACTIVE_FILE); 2967 if (file >> sc->priority && !(sc->may_deactivate & DEACTIVATE_FILE)) 2968 sc->cache_trim_mode = 1; 2969 else 2970 sc->cache_trim_mode = 0; 2971 2972 /* 2973 * Prevent the reclaimer from falling into the cache trap: as 2974 * cache pages start out inactive, every cache fault will tip 2975 * the scan balance towards the file LRU. And as the file LRU 2976 * shrinks, so does the window for rotation from references. 2977 * This means we have a runaway feedback loop where a tiny 2978 * thrashing file LRU becomes infinitely more attractive than 2979 * anon pages. Try to detect this based on file LRU size. 2980 */ 2981 if (!cgroup_reclaim(sc)) { 2982 unsigned long total_high_wmark = 0; 2983 unsigned long free, anon; 2984 int z; 2985 2986 free = sum_zone_node_page_state(pgdat->node_id, NR_FREE_PAGES); 2987 file = node_page_state(pgdat, NR_ACTIVE_FILE) + 2988 node_page_state(pgdat, NR_INACTIVE_FILE); 2989 2990 for (z = 0; z < MAX_NR_ZONES; z++) { 2991 struct zone *zone = &pgdat->node_zones[z]; 2992 2993 if (!managed_zone(zone)) 2994 continue; 2995 2996 total_high_wmark += high_wmark_pages(zone); 2997 } 2998 2999 /* 3000 * Consider anon: if that's low too, this isn't a 3001 * runaway file reclaim problem, but rather just 3002 * extreme pressure. Reclaim as per usual then. 3003 */ 3004 anon = node_page_state(pgdat, NR_INACTIVE_ANON); 3005 3006 sc->file_is_tiny = 3007 file + free <= total_high_wmark && 3008 !(sc->may_deactivate & DEACTIVATE_ANON) && 3009 anon >> sc->priority; 3010 } 3011 } 3012 3013 /* 3014 * Determine how aggressively the anon and file LRU lists should be 3015 * scanned. 3016 * 3017 * nr[0] = anon inactive folios to scan; nr[1] = anon active folios to scan 3018 * nr[2] = file inactive folios to scan; nr[3] = file active folios to scan 3019 */ 3020 static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc, 3021 unsigned long *nr) 3022 { 3023 struct pglist_data *pgdat = lruvec_pgdat(lruvec); 3024 struct mem_cgroup *memcg = lruvec_memcg(lruvec); 3025 unsigned long anon_cost, file_cost, total_cost; 3026 int swappiness = mem_cgroup_swappiness(memcg); 3027 u64 fraction[ANON_AND_FILE]; 3028 u64 denominator = 0; /* gcc */ 3029 enum scan_balance scan_balance; 3030 unsigned long ap, fp; 3031 enum lru_list lru; 3032 3033 /* If we have no swap space, do not bother scanning anon folios. */ 3034 if (!sc->may_swap || !can_reclaim_anon_pages(memcg, pgdat->node_id, sc)) { 3035 scan_balance = SCAN_FILE; 3036 goto out; 3037 } 3038 3039 /* 3040 * Global reclaim will swap to prevent OOM even with no 3041 * swappiness, but memcg users want to use this knob to 3042 * disable swapping for individual groups completely when 3043 * using the memory controller's swap limit feature would be 3044 * too expensive. 3045 */ 3046 if (cgroup_reclaim(sc) && !swappiness) { 3047 scan_balance = SCAN_FILE; 3048 goto out; 3049 } 3050 3051 /* 3052 * Do not apply any pressure balancing cleverness when the 3053 * system is close to OOM, scan both anon and file equally 3054 * (unless the swappiness setting disagrees with swapping). 3055 */ 3056 if (!sc->priority && swappiness) { 3057 scan_balance = SCAN_EQUAL; 3058 goto out; 3059 } 3060 3061 /* 3062 * If the system is almost out of file pages, force-scan anon. 3063 */ 3064 if (sc->file_is_tiny) { 3065 scan_balance = SCAN_ANON; 3066 goto out; 3067 } 3068 3069 /* 3070 * If there is enough inactive page cache, we do not reclaim 3071 * anything from the anonymous working right now. 3072 */ 3073 if (sc->cache_trim_mode) { 3074 scan_balance = SCAN_FILE; 3075 goto out; 3076 } 3077 3078 scan_balance = SCAN_FRACT; 3079 /* 3080 * Calculate the pressure balance between anon and file pages. 3081 * 3082 * The amount of pressure we put on each LRU is inversely 3083 * proportional to the cost of reclaiming each list, as 3084 * determined by the share of pages that are refaulting, times 3085 * the relative IO cost of bringing back a swapped out 3086 * anonymous page vs reloading a filesystem page (swappiness). 3087 * 3088 * Although we limit that influence to ensure no list gets 3089 * left behind completely: at least a third of the pressure is 3090 * applied, before swappiness. 3091 * 3092 * With swappiness at 100, anon and file have equal IO cost. 3093 */ 3094 total_cost = sc->anon_cost + sc->file_cost; 3095 anon_cost = total_cost + sc->anon_cost; 3096 file_cost = total_cost + sc->file_cost; 3097 total_cost = anon_cost + file_cost; 3098 3099 ap = swappiness * (total_cost + 1); 3100 ap /= anon_cost + 1; 3101 3102 fp = (200 - swappiness) * (total_cost + 1); 3103 fp /= file_cost + 1; 3104 3105 fraction[0] = ap; 3106 fraction[1] = fp; 3107 denominator = ap + fp; 3108 out: 3109 for_each_evictable_lru(lru) { 3110 int file = is_file_lru(lru); 3111 unsigned long lruvec_size; 3112 unsigned long low, min; 3113 unsigned long scan; 3114 3115 lruvec_size = lruvec_lru_size(lruvec, lru, sc->reclaim_idx); 3116 mem_cgroup_protection(sc->target_mem_cgroup, memcg, 3117 &min, &low); 3118 3119 if (min || low) { 3120 /* 3121 * Scale a cgroup's reclaim pressure by proportioning 3122 * its current usage to its memory.low or memory.min 3123 * setting. 3124 * 3125 * This is important, as otherwise scanning aggression 3126 * becomes extremely binary -- from nothing as we 3127 * approach the memory protection threshold, to totally 3128 * nominal as we exceed it. This results in requiring 3129 * setting extremely liberal protection thresholds. It 3130 * also means we simply get no protection at all if we 3131 * set it too low, which is not ideal. 3132 * 3133 * If there is any protection in place, we reduce scan 3134 * pressure by how much of the total memory used is 3135 * within protection thresholds. 3136 * 3137 * There is one special case: in the first reclaim pass, 3138 * we skip over all groups that are within their low 3139 * protection. If that fails to reclaim enough pages to 3140 * satisfy the reclaim goal, we come back and override 3141 * the best-effort low protection. However, we still 3142 * ideally want to honor how well-behaved groups are in 3143 * that case instead of simply punishing them all 3144 * equally. As such, we reclaim them based on how much 3145 * memory they are using, reducing the scan pressure 3146 * again by how much of the total memory used is under 3147 * hard protection. 3148 */ 3149 unsigned long cgroup_size = mem_cgroup_size(memcg); 3150 unsigned long protection; 3151 3152 /* memory.low scaling, make sure we retry before OOM */ 3153 if (!sc->memcg_low_reclaim && low > min) { 3154 protection = low; 3155 sc->memcg_low_skipped = 1; 3156 } else { 3157 protection = min; 3158 } 3159 3160 /* Avoid TOCTOU with earlier protection check */ 3161 cgroup_size = max(cgroup_size, protection); 3162 3163 scan = lruvec_size - lruvec_size * protection / 3164 (cgroup_size + 1); 3165 3166 /* 3167 * Minimally target SWAP_CLUSTER_MAX pages to keep 3168 * reclaim moving forwards, avoiding decrementing 3169 * sc->priority further than desirable. 3170 */ 3171 scan = max(scan, SWAP_CLUSTER_MAX); 3172 } else { 3173 scan = lruvec_size; 3174 } 3175 3176 scan >>= sc->priority; 3177 3178 /* 3179 * If the cgroup's already been deleted, make sure to 3180 * scrape out the remaining cache. 3181 */ 3182 if (!scan && !mem_cgroup_online(memcg)) 3183 scan = min(lruvec_size, SWAP_CLUSTER_MAX); 3184 3185 switch (scan_balance) { 3186 case SCAN_EQUAL: 3187 /* Scan lists relative to size */ 3188 break; 3189 case SCAN_FRACT: 3190 /* 3191 * Scan types proportional to swappiness and 3192 * their relative recent reclaim efficiency. 3193 * Make sure we don't miss the last page on 3194 * the offlined memory cgroups because of a 3195 * round-off error. 3196 */ 3197 scan = mem_cgroup_online(memcg) ? 3198 div64_u64(scan * fraction[file], denominator) : 3199 DIV64_U64_ROUND_UP(scan * fraction[file], 3200 denominator); 3201 break; 3202 case SCAN_FILE: 3203 case SCAN_ANON: 3204 /* Scan one type exclusively */ 3205 if ((scan_balance == SCAN_FILE) != file) 3206 scan = 0; 3207 break; 3208 default: 3209 /* Look ma, no brain */ 3210 BUG(); 3211 } 3212 3213 nr[lru] = scan; 3214 } 3215 } 3216 3217 /* 3218 * Anonymous LRU management is a waste if there is 3219 * ultimately no way to reclaim the memory. 3220 */ 3221 static bool can_age_anon_pages(struct pglist_data *pgdat, 3222 struct scan_control *sc) 3223 { 3224 /* Aging the anon LRU is valuable if swap is present: */ 3225 if (total_swap_pages > 0) 3226 return true; 3227 3228 /* Also valuable if anon pages can be demoted: */ 3229 return can_demote(pgdat->node_id, sc); 3230 } 3231 3232 #ifdef CONFIG_LRU_GEN 3233 3234 #ifdef CONFIG_LRU_GEN_ENABLED 3235 DEFINE_STATIC_KEY_ARRAY_TRUE(lru_gen_caps, NR_LRU_GEN_CAPS); 3236 #define get_cap(cap) static_branch_likely(&lru_gen_caps[cap]) 3237 #else 3238 DEFINE_STATIC_KEY_ARRAY_FALSE(lru_gen_caps, NR_LRU_GEN_CAPS); 3239 #define get_cap(cap) static_branch_unlikely(&lru_gen_caps[cap]) 3240 #endif 3241 3242 static bool should_walk_mmu(void) 3243 { 3244 return arch_has_hw_pte_young() && get_cap(LRU_GEN_MM_WALK); 3245 } 3246 3247 static bool should_clear_pmd_young(void) 3248 { 3249 return arch_has_hw_nonleaf_pmd_young() && get_cap(LRU_GEN_NONLEAF_YOUNG); 3250 } 3251 3252 /****************************************************************************** 3253 * shorthand helpers 3254 ******************************************************************************/ 3255 3256 #define LRU_REFS_FLAGS (BIT(PG_referenced) | BIT(PG_workingset)) 3257 3258 #define DEFINE_MAX_SEQ(lruvec) \ 3259 unsigned long max_seq = READ_ONCE((lruvec)->lrugen.max_seq) 3260 3261 #define DEFINE_MIN_SEQ(lruvec) \ 3262 unsigned long min_seq[ANON_AND_FILE] = { \ 3263 READ_ONCE((lruvec)->lrugen.min_seq[LRU_GEN_ANON]), \ 3264 READ_ONCE((lruvec)->lrugen.min_seq[LRU_GEN_FILE]), \ 3265 } 3266 3267 #define for_each_gen_type_zone(gen, type, zone) \ 3268 for ((gen) = 0; (gen) < MAX_NR_GENS; (gen)++) \ 3269 for ((type) = 0; (type) < ANON_AND_FILE; (type)++) \ 3270 for ((zone) = 0; (zone) < MAX_NR_ZONES; (zone)++) 3271 3272 #define get_memcg_gen(seq) ((seq) % MEMCG_NR_GENS) 3273 #define get_memcg_bin(bin) ((bin) % MEMCG_NR_BINS) 3274 3275 static struct lruvec *get_lruvec(struct mem_cgroup *memcg, int nid) 3276 { 3277 struct pglist_data *pgdat = NODE_DATA(nid); 3278 3279 #ifdef CONFIG_MEMCG 3280 if (memcg) { 3281 struct lruvec *lruvec = &memcg->nodeinfo[nid]->lruvec; 3282 3283 /* see the comment in mem_cgroup_lruvec() */ 3284 if (!lruvec->pgdat) 3285 lruvec->pgdat = pgdat; 3286 3287 return lruvec; 3288 } 3289 #endif 3290 VM_WARN_ON_ONCE(!mem_cgroup_disabled()); 3291 3292 return &pgdat->__lruvec; 3293 } 3294 3295 static int get_swappiness(struct lruvec *lruvec, struct scan_control *sc) 3296 { 3297 struct mem_cgroup *memcg = lruvec_memcg(lruvec); 3298 struct pglist_data *pgdat = lruvec_pgdat(lruvec); 3299 3300 if (!sc->may_swap) 3301 return 0; 3302 3303 if (!can_demote(pgdat->node_id, sc) && 3304 mem_cgroup_get_nr_swap_pages(memcg) < MIN_LRU_BATCH) 3305 return 0; 3306 3307 return mem_cgroup_swappiness(memcg); 3308 } 3309 3310 static int get_nr_gens(struct lruvec *lruvec, int type) 3311 { 3312 return lruvec->lrugen.max_seq - lruvec->lrugen.min_seq[type] + 1; 3313 } 3314 3315 static bool __maybe_unused seq_is_valid(struct lruvec *lruvec) 3316 { 3317 /* see the comment on lru_gen_folio */ 3318 return get_nr_gens(lruvec, LRU_GEN_FILE) >= MIN_NR_GENS && 3319 get_nr_gens(lruvec, LRU_GEN_FILE) <= get_nr_gens(lruvec, LRU_GEN_ANON) && 3320 get_nr_gens(lruvec, LRU_GEN_ANON) <= MAX_NR_GENS; 3321 } 3322 3323 /****************************************************************************** 3324 * Bloom filters 3325 ******************************************************************************/ 3326 3327 /* 3328 * Bloom filters with m=1<<15, k=2 and the false positive rates of ~1/5 when 3329 * n=10,000 and ~1/2 when n=20,000, where, conventionally, m is the number of 3330 * bits in a bitmap, k is the number of hash functions and n is the number of 3331 * inserted items. 3332 * 3333 * Page table walkers use one of the two filters to reduce their search space. 3334 * To get rid of non-leaf entries that no longer have enough leaf entries, the 3335 * aging uses the double-buffering technique to flip to the other filter each 3336 * time it produces a new generation. For non-leaf entries that have enough 3337 * leaf entries, the aging carries them over to the next generation in 3338 * walk_pmd_range(); the eviction also report them when walking the rmap 3339 * in lru_gen_look_around(). 3340 * 3341 * For future optimizations: 3342 * 1. It's not necessary to keep both filters all the time. The spare one can be 3343 * freed after the RCU grace period and reallocated if needed again. 3344 * 2. And when reallocating, it's worth scaling its size according to the number 3345 * of inserted entries in the other filter, to reduce the memory overhead on 3346 * small systems and false positives on large systems. 3347 * 3. Jenkins' hash function is an alternative to Knuth's. 3348 */ 3349 #define BLOOM_FILTER_SHIFT 15 3350 3351 static inline int filter_gen_from_seq(unsigned long seq) 3352 { 3353 return seq % NR_BLOOM_FILTERS; 3354 } 3355 3356 static void get_item_key(void *item, int *key) 3357 { 3358 u32 hash = hash_ptr(item, BLOOM_FILTER_SHIFT * 2); 3359 3360 BUILD_BUG_ON(BLOOM_FILTER_SHIFT * 2 > BITS_PER_TYPE(u32)); 3361 3362 key[0] = hash & (BIT(BLOOM_FILTER_SHIFT) - 1); 3363 key[1] = hash >> BLOOM_FILTER_SHIFT; 3364 } 3365 3366 static bool test_bloom_filter(struct lruvec *lruvec, unsigned long seq, void *item) 3367 { 3368 int key[2]; 3369 unsigned long *filter; 3370 int gen = filter_gen_from_seq(seq); 3371 3372 filter = READ_ONCE(lruvec->mm_state.filters[gen]); 3373 if (!filter) 3374 return true; 3375 3376 get_item_key(item, key); 3377 3378 return test_bit(key[0], filter) && test_bit(key[1], filter); 3379 } 3380 3381 static void update_bloom_filter(struct lruvec *lruvec, unsigned long seq, void *item) 3382 { 3383 int key[2]; 3384 unsigned long *filter; 3385 int gen = filter_gen_from_seq(seq); 3386 3387 filter = READ_ONCE(lruvec->mm_state.filters[gen]); 3388 if (!filter) 3389 return; 3390 3391 get_item_key(item, key); 3392 3393 if (!test_bit(key[0], filter)) 3394 set_bit(key[0], filter); 3395 if (!test_bit(key[1], filter)) 3396 set_bit(key[1], filter); 3397 } 3398 3399 static void reset_bloom_filter(struct lruvec *lruvec, unsigned long seq) 3400 { 3401 unsigned long *filter; 3402 int gen = filter_gen_from_seq(seq); 3403 3404 filter = lruvec->mm_state.filters[gen]; 3405 if (filter) { 3406 bitmap_clear(filter, 0, BIT(BLOOM_FILTER_SHIFT)); 3407 return; 3408 } 3409 3410 filter = bitmap_zalloc(BIT(BLOOM_FILTER_SHIFT), 3411 __GFP_HIGH | __GFP_NOMEMALLOC | __GFP_NOWARN); 3412 WRITE_ONCE(lruvec->mm_state.filters[gen], filter); 3413 } 3414 3415 /****************************************************************************** 3416 * mm_struct list 3417 ******************************************************************************/ 3418 3419 static struct lru_gen_mm_list *get_mm_list(struct mem_cgroup *memcg) 3420 { 3421 static struct lru_gen_mm_list mm_list = { 3422 .fifo = LIST_HEAD_INIT(mm_list.fifo), 3423 .lock = __SPIN_LOCK_UNLOCKED(mm_list.lock), 3424 }; 3425 3426 #ifdef CONFIG_MEMCG 3427 if (memcg) 3428 return &memcg->mm_list; 3429 #endif 3430 VM_WARN_ON_ONCE(!mem_cgroup_disabled()); 3431 3432 return &mm_list; 3433 } 3434 3435 void lru_gen_add_mm(struct mm_struct *mm) 3436 { 3437 int nid; 3438 struct mem_cgroup *memcg = get_mem_cgroup_from_mm(mm); 3439 struct lru_gen_mm_list *mm_list = get_mm_list(memcg); 3440 3441 VM_WARN_ON_ONCE(!list_empty(&mm->lru_gen.list)); 3442 #ifdef CONFIG_MEMCG 3443 VM_WARN_ON_ONCE(mm->lru_gen.memcg); 3444 mm->lru_gen.memcg = memcg; 3445 #endif 3446 spin_lock(&mm_list->lock); 3447 3448 for_each_node_state(nid, N_MEMORY) { 3449 struct lruvec *lruvec = get_lruvec(memcg, nid); 3450 3451 /* the first addition since the last iteration */ 3452 if (lruvec->mm_state.tail == &mm_list->fifo) 3453 lruvec->mm_state.tail = &mm->lru_gen.list; 3454 } 3455 3456 list_add_tail(&mm->lru_gen.list, &mm_list->fifo); 3457 3458 spin_unlock(&mm_list->lock); 3459 } 3460 3461 void lru_gen_del_mm(struct mm_struct *mm) 3462 { 3463 int nid; 3464 struct lru_gen_mm_list *mm_list; 3465 struct mem_cgroup *memcg = NULL; 3466 3467 if (list_empty(&mm->lru_gen.list)) 3468 return; 3469 3470 #ifdef CONFIG_MEMCG 3471 memcg = mm->lru_gen.memcg; 3472 #endif 3473 mm_list = get_mm_list(memcg); 3474 3475 spin_lock(&mm_list->lock); 3476 3477 for_each_node(nid) { 3478 struct lruvec *lruvec = get_lruvec(memcg, nid); 3479 3480 /* where the current iteration continues after */ 3481 if (lruvec->mm_state.head == &mm->lru_gen.list) 3482 lruvec->mm_state.head = lruvec->mm_state.head->prev; 3483 3484 /* where the last iteration ended before */ 3485 if (lruvec->mm_state.tail == &mm->lru_gen.list) 3486 lruvec->mm_state.tail = lruvec->mm_state.tail->next; 3487 } 3488 3489 list_del_init(&mm->lru_gen.list); 3490 3491 spin_unlock(&mm_list->lock); 3492 3493 #ifdef CONFIG_MEMCG 3494 mem_cgroup_put(mm->lru_gen.memcg); 3495 mm->lru_gen.memcg = NULL; 3496 #endif 3497 } 3498 3499 #ifdef CONFIG_MEMCG 3500 void lru_gen_migrate_mm(struct mm_struct *mm) 3501 { 3502 struct mem_cgroup *memcg; 3503 struct task_struct *task = rcu_dereference_protected(mm->owner, true); 3504 3505 VM_WARN_ON_ONCE(task->mm != mm); 3506 lockdep_assert_held(&task->alloc_lock); 3507 3508 /* for mm_update_next_owner() */ 3509 if (mem_cgroup_disabled()) 3510 return; 3511 3512 /* migration can happen before addition */ 3513 if (!mm->lru_gen.memcg) 3514 return; 3515 3516 rcu_read_lock(); 3517 memcg = mem_cgroup_from_task(task); 3518 rcu_read_unlock(); 3519 if (memcg == mm->lru_gen.memcg) 3520 return; 3521 3522 VM_WARN_ON_ONCE(list_empty(&mm->lru_gen.list)); 3523 3524 lru_gen_del_mm(mm); 3525 lru_gen_add_mm(mm); 3526 } 3527 #endif 3528 3529 static void reset_mm_stats(struct lruvec *lruvec, struct lru_gen_mm_walk *walk, bool last) 3530 { 3531 int i; 3532 int hist; 3533 3534 lockdep_assert_held(&get_mm_list(lruvec_memcg(lruvec))->lock); 3535 3536 if (walk) { 3537 hist = lru_hist_from_seq(walk->max_seq); 3538 3539 for (i = 0; i < NR_MM_STATS; i++) { 3540 WRITE_ONCE(lruvec->mm_state.stats[hist][i], 3541 lruvec->mm_state.stats[hist][i] + walk->mm_stats[i]); 3542 walk->mm_stats[i] = 0; 3543 } 3544 } 3545 3546 if (NR_HIST_GENS > 1 && last) { 3547 hist = lru_hist_from_seq(lruvec->mm_state.seq + 1); 3548 3549 for (i = 0; i < NR_MM_STATS; i++) 3550 WRITE_ONCE(lruvec->mm_state.stats[hist][i], 0); 3551 } 3552 } 3553 3554 static bool should_skip_mm(struct mm_struct *mm, struct lru_gen_mm_walk *walk) 3555 { 3556 int type; 3557 unsigned long size = 0; 3558 struct pglist_data *pgdat = lruvec_pgdat(walk->lruvec); 3559 int key = pgdat->node_id % BITS_PER_TYPE(mm->lru_gen.bitmap); 3560 3561 if (!walk->force_scan && !test_bit(key, &mm->lru_gen.bitmap)) 3562 return true; 3563 3564 clear_bit(key, &mm->lru_gen.bitmap); 3565 3566 for (type = !walk->can_swap; type < ANON_AND_FILE; type++) { 3567 size += type ? get_mm_counter(mm, MM_FILEPAGES) : 3568 get_mm_counter(mm, MM_ANONPAGES) + 3569 get_mm_counter(mm, MM_SHMEMPAGES); 3570 } 3571 3572 if (size < MIN_LRU_BATCH) 3573 return true; 3574 3575 return !mmget_not_zero(mm); 3576 } 3577 3578 static bool iterate_mm_list(struct lruvec *lruvec, struct lru_gen_mm_walk *walk, 3579 struct mm_struct **iter) 3580 { 3581 bool first = false; 3582 bool last = false; 3583 struct mm_struct *mm = NULL; 3584 struct mem_cgroup *memcg = lruvec_memcg(lruvec); 3585 struct lru_gen_mm_list *mm_list = get_mm_list(memcg); 3586 struct lru_gen_mm_state *mm_state = &lruvec->mm_state; 3587 3588 /* 3589 * mm_state->seq is incremented after each iteration of mm_list. There 3590 * are three interesting cases for this page table walker: 3591 * 1. It tries to start a new iteration with a stale max_seq: there is 3592 * nothing left to do. 3593 * 2. It started the next iteration: it needs to reset the Bloom filter 3594 * so that a fresh set of PTE tables can be recorded. 3595 * 3. It ended the current iteration: it needs to reset the mm stats 3596 * counters and tell its caller to increment max_seq. 3597 */ 3598 spin_lock(&mm_list->lock); 3599 3600 VM_WARN_ON_ONCE(mm_state->seq + 1 < walk->max_seq); 3601 3602 if (walk->max_seq <= mm_state->seq) 3603 goto done; 3604 3605 if (!mm_state->head) 3606 mm_state->head = &mm_list->fifo; 3607 3608 if (mm_state->head == &mm_list->fifo) 3609 first = true; 3610 3611 do { 3612 mm_state->head = mm_state->head->next; 3613 if (mm_state->head == &mm_list->fifo) { 3614 WRITE_ONCE(mm_state->seq, mm_state->seq + 1); 3615 last = true; 3616 break; 3617 } 3618 3619 /* force scan for those added after the last iteration */ 3620 if (!mm_state->tail || mm_state->tail == mm_state->head) { 3621 mm_state->tail = mm_state->head->next; 3622 walk->force_scan = true; 3623 } 3624 3625 mm = list_entry(mm_state->head, struct mm_struct, lru_gen.list); 3626 if (should_skip_mm(mm, walk)) 3627 mm = NULL; 3628 } while (!mm); 3629 done: 3630 if (*iter || last) 3631 reset_mm_stats(lruvec, walk, last); 3632 3633 spin_unlock(&mm_list->lock); 3634 3635 if (mm && first) 3636 reset_bloom_filter(lruvec, walk->max_seq + 1); 3637 3638 if (*iter) 3639 mmput_async(*iter); 3640 3641 *iter = mm; 3642 3643 return last; 3644 } 3645 3646 static bool iterate_mm_list_nowalk(struct lruvec *lruvec, unsigned long max_seq) 3647 { 3648 bool success = false; 3649 struct mem_cgroup *memcg = lruvec_memcg(lruvec); 3650 struct lru_gen_mm_list *mm_list = get_mm_list(memcg); 3651 struct lru_gen_mm_state *mm_state = &lruvec->mm_state; 3652 3653 spin_lock(&mm_list->lock); 3654 3655 VM_WARN_ON_ONCE(mm_state->seq + 1 < max_seq); 3656 3657 if (max_seq > mm_state->seq) { 3658 mm_state->head = NULL; 3659 mm_state->tail = NULL; 3660 WRITE_ONCE(mm_state->seq, mm_state->seq + 1); 3661 reset_mm_stats(lruvec, NULL, true); 3662 success = true; 3663 } 3664 3665 spin_unlock(&mm_list->lock); 3666 3667 return success; 3668 } 3669 3670 /****************************************************************************** 3671 * PID controller 3672 ******************************************************************************/ 3673 3674 /* 3675 * A feedback loop based on Proportional-Integral-Derivative (PID) controller. 3676 * 3677 * The P term is refaulted/(evicted+protected) from a tier in the generation 3678 * currently being evicted; the I term is the exponential moving average of the 3679 * P term over the generations previously evicted, using the smoothing factor 3680 * 1/2; the D term isn't supported. 3681 * 3682 * The setpoint (SP) is always the first tier of one type; the process variable 3683 * (PV) is either any tier of the other type or any other tier of the same 3684 * type. 3685 * 3686 * The error is the difference between the SP and the PV; the correction is to 3687 * turn off protection when SP>PV or turn on protection when SP<PV. 3688 * 3689 * For future optimizations: 3690 * 1. The D term may discount the other two terms over time so that long-lived 3691 * generations can resist stale information. 3692 */ 3693 struct ctrl_pos { 3694 unsigned long refaulted; 3695 unsigned long total; 3696 int gain; 3697 }; 3698 3699 static void read_ctrl_pos(struct lruvec *lruvec, int type, int tier, int gain, 3700 struct ctrl_pos *pos) 3701 { 3702 struct lru_gen_folio *lrugen = &lruvec->lrugen; 3703 int hist = lru_hist_from_seq(lrugen->min_seq[type]); 3704 3705 pos->refaulted = lrugen->avg_refaulted[type][tier] + 3706 atomic_long_read(&lrugen->refaulted[hist][type][tier]); 3707 pos->total = lrugen->avg_total[type][tier] + 3708 atomic_long_read(&lrugen->evicted[hist][type][tier]); 3709 if (tier) 3710 pos->total += lrugen->protected[hist][type][tier - 1]; 3711 pos->gain = gain; 3712 } 3713 3714 static void reset_ctrl_pos(struct lruvec *lruvec, int type, bool carryover) 3715 { 3716 int hist, tier; 3717 struct lru_gen_folio *lrugen = &lruvec->lrugen; 3718 bool clear = carryover ? NR_HIST_GENS == 1 : NR_HIST_GENS > 1; 3719 unsigned long seq = carryover ? lrugen->min_seq[type] : lrugen->max_seq + 1; 3720 3721 lockdep_assert_held(&lruvec->lru_lock); 3722 3723 if (!carryover && !clear) 3724 return; 3725 3726 hist = lru_hist_from_seq(seq); 3727 3728 for (tier = 0; tier < MAX_NR_TIERS; tier++) { 3729 if (carryover) { 3730 unsigned long sum; 3731 3732 sum = lrugen->avg_refaulted[type][tier] + 3733 atomic_long_read(&lrugen->refaulted[hist][type][tier]); 3734 WRITE_ONCE(lrugen->avg_refaulted[type][tier], sum / 2); 3735 3736 sum = lrugen->avg_total[type][tier] + 3737 atomic_long_read(&lrugen->evicted[hist][type][tier]); 3738 if (tier) 3739 sum += lrugen->protected[hist][type][tier - 1]; 3740 WRITE_ONCE(lrugen->avg_total[type][tier], sum / 2); 3741 } 3742 3743 if (clear) { 3744 atomic_long_set(&lrugen->refaulted[hist][type][tier], 0); 3745 atomic_long_set(&lrugen->evicted[hist][type][tier], 0); 3746 if (tier) 3747 WRITE_ONCE(lrugen->protected[hist][type][tier - 1], 0); 3748 } 3749 } 3750 } 3751 3752 static bool positive_ctrl_err(struct ctrl_pos *sp, struct ctrl_pos *pv) 3753 { 3754 /* 3755 * Return true if the PV has a limited number of refaults or a lower 3756 * refaulted/total than the SP. 3757 */ 3758 return pv->refaulted < MIN_LRU_BATCH || 3759 pv->refaulted * (sp->total + MIN_LRU_BATCH) * sp->gain <= 3760 (sp->refaulted + 1) * pv->total * pv->gain; 3761 } 3762 3763 /****************************************************************************** 3764 * the aging 3765 ******************************************************************************/ 3766 3767 /* promote pages accessed through page tables */ 3768 static int folio_update_gen(struct folio *folio, int gen) 3769 { 3770 unsigned long new_flags, old_flags = READ_ONCE(folio->flags); 3771 3772 VM_WARN_ON_ONCE(gen >= MAX_NR_GENS); 3773 VM_WARN_ON_ONCE(!rcu_read_lock_held()); 3774 3775 do { 3776 /* lru_gen_del_folio() has isolated this page? */ 3777 if (!(old_flags & LRU_GEN_MASK)) { 3778 /* for shrink_folio_list() */ 3779 new_flags = old_flags | BIT(PG_referenced); 3780 continue; 3781 } 3782 3783 new_flags = old_flags & ~(LRU_GEN_MASK | LRU_REFS_MASK | LRU_REFS_FLAGS); 3784 new_flags |= (gen + 1UL) << LRU_GEN_PGOFF; 3785 } while (!try_cmpxchg(&folio->flags, &old_flags, new_flags)); 3786 3787 return ((old_flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1; 3788 } 3789 3790 /* protect pages accessed multiple times through file descriptors */ 3791 static int folio_inc_gen(struct lruvec *lruvec, struct folio *folio, bool reclaiming) 3792 { 3793 int type = folio_is_file_lru(folio); 3794 struct lru_gen_folio *lrugen = &lruvec->lrugen; 3795 int new_gen, old_gen = lru_gen_from_seq(lrugen->min_seq[type]); 3796 unsigned long new_flags, old_flags = READ_ONCE(folio->flags); 3797 3798 VM_WARN_ON_ONCE_FOLIO(!(old_flags & LRU_GEN_MASK), folio); 3799 3800 do { 3801 new_gen = ((old_flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1; 3802 /* folio_update_gen() has promoted this page? */ 3803 if (new_gen >= 0 && new_gen != old_gen) 3804 return new_gen; 3805 3806 new_gen = (old_gen + 1) % MAX_NR_GENS; 3807 3808 new_flags = old_flags & ~(LRU_GEN_MASK | LRU_REFS_MASK | LRU_REFS_FLAGS); 3809 new_flags |= (new_gen + 1UL) << LRU_GEN_PGOFF; 3810 /* for folio_end_writeback() */ 3811 if (reclaiming) 3812 new_flags |= BIT(PG_reclaim); 3813 } while (!try_cmpxchg(&folio->flags, &old_flags, new_flags)); 3814 3815 lru_gen_update_size(lruvec, folio, old_gen, new_gen); 3816 3817 return new_gen; 3818 } 3819 3820 static void update_batch_size(struct lru_gen_mm_walk *walk, struct folio *folio, 3821 int old_gen, int new_gen) 3822 { 3823 int type = folio_is_file_lru(folio); 3824 int zone = folio_zonenum(folio); 3825 int delta = folio_nr_pages(folio); 3826 3827 VM_WARN_ON_ONCE(old_gen >= MAX_NR_GENS); 3828 VM_WARN_ON_ONCE(new_gen >= MAX_NR_GENS); 3829 3830 walk->batched++; 3831 3832 walk->nr_pages[old_gen][type][zone] -= delta; 3833 walk->nr_pages[new_gen][type][zone] += delta; 3834 } 3835 3836 static void reset_batch_size(struct lruvec *lruvec, struct lru_gen_mm_walk *walk) 3837 { 3838 int gen, type, zone; 3839 struct lru_gen_folio *lrugen = &lruvec->lrugen; 3840 3841 walk->batched = 0; 3842 3843 for_each_gen_type_zone(gen, type, zone) { 3844 enum lru_list lru = type * LRU_INACTIVE_FILE; 3845 int delta = walk->nr_pages[gen][type][zone]; 3846 3847 if (!delta) 3848 continue; 3849 3850 walk->nr_pages[gen][type][zone] = 0; 3851 WRITE_ONCE(lrugen->nr_pages[gen][type][zone], 3852 lrugen->nr_pages[gen][type][zone] + delta); 3853 3854 if (lru_gen_is_active(lruvec, gen)) 3855 lru += LRU_ACTIVE; 3856 __update_lru_size(lruvec, lru, zone, delta); 3857 } 3858 } 3859 3860 static int should_skip_vma(unsigned long start, unsigned long end, struct mm_walk *args) 3861 { 3862 struct address_space *mapping; 3863 struct vm_area_struct *vma = args->vma; 3864 struct lru_gen_mm_walk *walk = args->private; 3865 3866 if (!vma_is_accessible(vma)) 3867 return true; 3868 3869 if (is_vm_hugetlb_page(vma)) 3870 return true; 3871 3872 if (!vma_has_recency(vma)) 3873 return true; 3874 3875 if (vma->vm_flags & (VM_LOCKED | VM_SPECIAL)) 3876 return true; 3877 3878 if (vma == get_gate_vma(vma->vm_mm)) 3879 return true; 3880 3881 if (vma_is_anonymous(vma)) 3882 return !walk->can_swap; 3883 3884 if (WARN_ON_ONCE(!vma->vm_file || !vma->vm_file->f_mapping)) 3885 return true; 3886 3887 mapping = vma->vm_file->f_mapping; 3888 if (mapping_unevictable(mapping)) 3889 return true; 3890 3891 if (shmem_mapping(mapping)) 3892 return !walk->can_swap; 3893 3894 /* to exclude special mappings like dax, etc. */ 3895 return !mapping->a_ops->read_folio; 3896 } 3897 3898 /* 3899 * Some userspace memory allocators map many single-page VMAs. Instead of 3900 * returning back to the PGD table for each of such VMAs, finish an entire PMD 3901 * table to reduce zigzags and improve cache performance. 3902 */ 3903 static bool get_next_vma(unsigned long mask, unsigned long size, struct mm_walk *args, 3904 unsigned long *vm_start, unsigned long *vm_end) 3905 { 3906 unsigned long start = round_up(*vm_end, size); 3907 unsigned long end = (start | ~mask) + 1; 3908 VMA_ITERATOR(vmi, args->mm, start); 3909 3910 VM_WARN_ON_ONCE(mask & size); 3911 VM_WARN_ON_ONCE((start & mask) != (*vm_start & mask)); 3912 3913 for_each_vma(vmi, args->vma) { 3914 if (end && end <= args->vma->vm_start) 3915 return false; 3916 3917 if (should_skip_vma(args->vma->vm_start, args->vma->vm_end, args)) 3918 continue; 3919 3920 *vm_start = max(start, args->vma->vm_start); 3921 *vm_end = min(end - 1, args->vma->vm_end - 1) + 1; 3922 3923 return true; 3924 } 3925 3926 return false; 3927 } 3928 3929 static unsigned long get_pte_pfn(pte_t pte, struct vm_area_struct *vma, unsigned long addr) 3930 { 3931 unsigned long pfn = pte_pfn(pte); 3932 3933 VM_WARN_ON_ONCE(addr < vma->vm_start || addr >= vma->vm_end); 3934 3935 if (!pte_present(pte) || is_zero_pfn(pfn)) 3936 return -1; 3937 3938 if (WARN_ON_ONCE(pte_devmap(pte) || pte_special(pte))) 3939 return -1; 3940 3941 if (WARN_ON_ONCE(!pfn_valid(pfn))) 3942 return -1; 3943 3944 return pfn; 3945 } 3946 3947 #if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG) 3948 static unsigned long get_pmd_pfn(pmd_t pmd, struct vm_area_struct *vma, unsigned long addr) 3949 { 3950 unsigned long pfn = pmd_pfn(pmd); 3951 3952 VM_WARN_ON_ONCE(addr < vma->vm_start || addr >= vma->vm_end); 3953 3954 if (!pmd_present(pmd) || is_huge_zero_pmd(pmd)) 3955 return -1; 3956 3957 if (WARN_ON_ONCE(pmd_devmap(pmd))) 3958 return -1; 3959 3960 if (WARN_ON_ONCE(!pfn_valid(pfn))) 3961 return -1; 3962 3963 return pfn; 3964 } 3965 #endif 3966 3967 static struct folio *get_pfn_folio(unsigned long pfn, struct mem_cgroup *memcg, 3968 struct pglist_data *pgdat, bool can_swap) 3969 { 3970 struct folio *folio; 3971 3972 /* try to avoid unnecessary memory loads */ 3973 if (pfn < pgdat->node_start_pfn || pfn >= pgdat_end_pfn(pgdat)) 3974 return NULL; 3975 3976 folio = pfn_folio(pfn); 3977 if (folio_nid(folio) != pgdat->node_id) 3978 return NULL; 3979 3980 if (folio_memcg_rcu(folio) != memcg) 3981 return NULL; 3982 3983 /* file VMAs can contain anon pages from COW */ 3984 if (!folio_is_file_lru(folio) && !can_swap) 3985 return NULL; 3986 3987 return folio; 3988 } 3989 3990 static bool suitable_to_scan(int total, int young) 3991 { 3992 int n = clamp_t(int, cache_line_size() / sizeof(pte_t), 2, 8); 3993 3994 /* suitable if the average number of young PTEs per cacheline is >=1 */ 3995 return young * n >= total; 3996 } 3997 3998 static bool walk_pte_range(pmd_t *pmd, unsigned long start, unsigned long end, 3999 struct mm_walk *args) 4000 { 4001 int i; 4002 pte_t *pte; 4003 spinlock_t *ptl; 4004 unsigned long addr; 4005 int total = 0; 4006 int young = 0; 4007 struct lru_gen_mm_walk *walk = args->private; 4008 struct mem_cgroup *memcg = lruvec_memcg(walk->lruvec); 4009 struct pglist_data *pgdat = lruvec_pgdat(walk->lruvec); 4010 int old_gen, new_gen = lru_gen_from_seq(walk->max_seq); 4011 4012 pte = pte_offset_map_nolock(args->mm, pmd, start & PMD_MASK, &ptl); 4013 if (!pte) 4014 return false; 4015 if (!spin_trylock(ptl)) { 4016 pte_unmap(pte); 4017 return false; 4018 } 4019 4020 arch_enter_lazy_mmu_mode(); 4021 restart: 4022 for (i = pte_index(start), addr = start; addr != end; i++, addr += PAGE_SIZE) { 4023 unsigned long pfn; 4024 struct folio *folio; 4025 pte_t ptent = ptep_get(pte + i); 4026 4027 total++; 4028 walk->mm_stats[MM_LEAF_TOTAL]++; 4029 4030 pfn = get_pte_pfn(ptent, args->vma, addr); 4031 if (pfn == -1) 4032 continue; 4033 4034 if (!pte_young(ptent)) { 4035 walk->mm_stats[MM_LEAF_OLD]++; 4036 continue; 4037 } 4038 4039 folio = get_pfn_folio(pfn, memcg, pgdat, walk->can_swap); 4040 if (!folio) 4041 continue; 4042 4043 if (!ptep_test_and_clear_young(args->vma, addr, pte + i)) 4044 VM_WARN_ON_ONCE(true); 4045 4046 young++; 4047 walk->mm_stats[MM_LEAF_YOUNG]++; 4048 4049 if (pte_dirty(ptent) && !folio_test_dirty(folio) && 4050 !(folio_test_anon(folio) && folio_test_swapbacked(folio) && 4051 !folio_test_swapcache(folio))) 4052 folio_mark_dirty(folio); 4053 4054 old_gen = folio_update_gen(folio, new_gen); 4055 if (old_gen >= 0 && old_gen != new_gen) 4056 update_batch_size(walk, folio, old_gen, new_gen); 4057 } 4058 4059 if (i < PTRS_PER_PTE && get_next_vma(PMD_MASK, PAGE_SIZE, args, &start, &end)) 4060 goto restart; 4061 4062 arch_leave_lazy_mmu_mode(); 4063 pte_unmap_unlock(pte, ptl); 4064 4065 return suitable_to_scan(total, young); 4066 } 4067 4068 #if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG) 4069 static void walk_pmd_range_locked(pud_t *pud, unsigned long addr, struct vm_area_struct *vma, 4070 struct mm_walk *args, unsigned long *bitmap, unsigned long *first) 4071 { 4072 int i; 4073 pmd_t *pmd; 4074 spinlock_t *ptl; 4075 struct lru_gen_mm_walk *walk = args->private; 4076 struct mem_cgroup *memcg = lruvec_memcg(walk->lruvec); 4077 struct pglist_data *pgdat = lruvec_pgdat(walk->lruvec); 4078 int old_gen, new_gen = lru_gen_from_seq(walk->max_seq); 4079 4080 VM_WARN_ON_ONCE(pud_leaf(*pud)); 4081 4082 /* try to batch at most 1+MIN_LRU_BATCH+1 entries */ 4083 if (*first == -1) { 4084 *first = addr; 4085 bitmap_zero(bitmap, MIN_LRU_BATCH); 4086 return; 4087 } 4088 4089 i = addr == -1 ? 0 : pmd_index(addr) - pmd_index(*first); 4090 if (i && i <= MIN_LRU_BATCH) { 4091 __set_bit(i - 1, bitmap); 4092 return; 4093 } 4094 4095 pmd = pmd_offset(pud, *first); 4096 4097 ptl = pmd_lockptr(args->mm, pmd); 4098 if (!spin_trylock(ptl)) 4099 goto done; 4100 4101 arch_enter_lazy_mmu_mode(); 4102 4103 do { 4104 unsigned long pfn; 4105 struct folio *folio; 4106 4107 /* don't round down the first address */ 4108 addr = i ? (*first & PMD_MASK) + i * PMD_SIZE : *first; 4109 4110 pfn = get_pmd_pfn(pmd[i], vma, addr); 4111 if (pfn == -1) 4112 goto next; 4113 4114 if (!pmd_trans_huge(pmd[i])) { 4115 if (should_clear_pmd_young()) 4116 pmdp_test_and_clear_young(vma, addr, pmd + i); 4117 goto next; 4118 } 4119 4120 folio = get_pfn_folio(pfn, memcg, pgdat, walk->can_swap); 4121 if (!folio) 4122 goto next; 4123 4124 if (!pmdp_test_and_clear_young(vma, addr, pmd + i)) 4125 goto next; 4126 4127 walk->mm_stats[MM_LEAF_YOUNG]++; 4128 4129 if (pmd_dirty(pmd[i]) && !folio_test_dirty(folio) && 4130 !(folio_test_anon(folio) && folio_test_swapbacked(folio) && 4131 !folio_test_swapcache(folio))) 4132 folio_mark_dirty(folio); 4133 4134 old_gen = folio_update_gen(folio, new_gen); 4135 if (old_gen >= 0 && old_gen != new_gen) 4136 update_batch_size(walk, folio, old_gen, new_gen); 4137 next: 4138 i = i > MIN_LRU_BATCH ? 0 : find_next_bit(bitmap, MIN_LRU_BATCH, i) + 1; 4139 } while (i <= MIN_LRU_BATCH); 4140 4141 arch_leave_lazy_mmu_mode(); 4142 spin_unlock(ptl); 4143 done: 4144 *first = -1; 4145 } 4146 #else 4147 static void walk_pmd_range_locked(pud_t *pud, unsigned long addr, struct vm_area_struct *vma, 4148 struct mm_walk *args, unsigned long *bitmap, unsigned long *first) 4149 { 4150 } 4151 #endif 4152 4153 static void walk_pmd_range(pud_t *pud, unsigned long start, unsigned long end, 4154 struct mm_walk *args) 4155 { 4156 int i; 4157 pmd_t *pmd; 4158 unsigned long next; 4159 unsigned long addr; 4160 struct vm_area_struct *vma; 4161 DECLARE_BITMAP(bitmap, MIN_LRU_BATCH); 4162 unsigned long first = -1; 4163 struct lru_gen_mm_walk *walk = args->private; 4164 4165 VM_WARN_ON_ONCE(pud_leaf(*pud)); 4166 4167 /* 4168 * Finish an entire PMD in two passes: the first only reaches to PTE 4169 * tables to avoid taking the PMD lock; the second, if necessary, takes 4170 * the PMD lock to clear the accessed bit in PMD entries. 4171 */ 4172 pmd = pmd_offset(pud, start & PUD_MASK); 4173 restart: 4174 /* walk_pte_range() may call get_next_vma() */ 4175 vma = args->vma; 4176 for (i = pmd_index(start), addr = start; addr != end; i++, addr = next) { 4177 pmd_t val = pmdp_get_lockless(pmd + i); 4178 4179 next = pmd_addr_end(addr, end); 4180 4181 if (!pmd_present(val) || is_huge_zero_pmd(val)) { 4182 walk->mm_stats[MM_LEAF_TOTAL]++; 4183 continue; 4184 } 4185 4186 #ifdef CONFIG_TRANSPARENT_HUGEPAGE 4187 if (pmd_trans_huge(val)) { 4188 unsigned long pfn = pmd_pfn(val); 4189 struct pglist_data *pgdat = lruvec_pgdat(walk->lruvec); 4190 4191 walk->mm_stats[MM_LEAF_TOTAL]++; 4192 4193 if (!pmd_young(val)) { 4194 walk->mm_stats[MM_LEAF_OLD]++; 4195 continue; 4196 } 4197 4198 /* try to avoid unnecessary memory loads */ 4199 if (pfn < pgdat->node_start_pfn || pfn >= pgdat_end_pfn(pgdat)) 4200 continue; 4201 4202 walk_pmd_range_locked(pud, addr, vma, args, bitmap, &first); 4203 continue; 4204 } 4205 #endif 4206 walk->mm_stats[MM_NONLEAF_TOTAL]++; 4207 4208 if (should_clear_pmd_young()) { 4209 if (!pmd_young(val)) 4210 continue; 4211 4212 walk_pmd_range_locked(pud, addr, vma, args, bitmap, &first); 4213 } 4214 4215 if (!walk->force_scan && !test_bloom_filter(walk->lruvec, walk->max_seq, pmd + i)) 4216 continue; 4217 4218 walk->mm_stats[MM_NONLEAF_FOUND]++; 4219 4220 if (!walk_pte_range(&val, addr, next, args)) 4221 continue; 4222 4223 walk->mm_stats[MM_NONLEAF_ADDED]++; 4224 4225 /* carry over to the next generation */ 4226 update_bloom_filter(walk->lruvec, walk->max_seq + 1, pmd + i); 4227 } 4228 4229 walk_pmd_range_locked(pud, -1, vma, args, bitmap, &first); 4230 4231 if (i < PTRS_PER_PMD && get_next_vma(PUD_MASK, PMD_SIZE, args, &start, &end)) 4232 goto restart; 4233 } 4234 4235 static int walk_pud_range(p4d_t *p4d, unsigned long start, unsigned long end, 4236 struct mm_walk *args) 4237 { 4238 int i; 4239 pud_t *pud; 4240 unsigned long addr; 4241 unsigned long next; 4242 struct lru_gen_mm_walk *walk = args->private; 4243 4244 VM_WARN_ON_ONCE(p4d_leaf(*p4d)); 4245 4246 pud = pud_offset(p4d, start & P4D_MASK); 4247 restart: 4248 for (i = pud_index(start), addr = start; addr != end; i++, addr = next) { 4249 pud_t val = READ_ONCE(pud[i]); 4250 4251 next = pud_addr_end(addr, end); 4252 4253 if (!pud_present(val) || WARN_ON_ONCE(pud_leaf(val))) 4254 continue; 4255 4256 walk_pmd_range(&val, addr, next, args); 4257 4258 if (need_resched() || walk->batched >= MAX_LRU_BATCH) { 4259 end = (addr | ~PUD_MASK) + 1; 4260 goto done; 4261 } 4262 } 4263 4264 if (i < PTRS_PER_PUD && get_next_vma(P4D_MASK, PUD_SIZE, args, &start, &end)) 4265 goto restart; 4266 4267 end = round_up(end, P4D_SIZE); 4268 done: 4269 if (!end || !args->vma) 4270 return 1; 4271 4272 walk->next_addr = max(end, args->vma->vm_start); 4273 4274 return -EAGAIN; 4275 } 4276 4277 static void walk_mm(struct lruvec *lruvec, struct mm_struct *mm, struct lru_gen_mm_walk *walk) 4278 { 4279 static const struct mm_walk_ops mm_walk_ops = { 4280 .test_walk = should_skip_vma, 4281 .p4d_entry = walk_pud_range, 4282 }; 4283 4284 int err; 4285 struct mem_cgroup *memcg = lruvec_memcg(lruvec); 4286 4287 walk->next_addr = FIRST_USER_ADDRESS; 4288 4289 do { 4290 DEFINE_MAX_SEQ(lruvec); 4291 4292 err = -EBUSY; 4293 4294 /* another thread might have called inc_max_seq() */ 4295 if (walk->max_seq != max_seq) 4296 break; 4297 4298 /* folio_update_gen() requires stable folio_memcg() */ 4299 if (!mem_cgroup_trylock_pages(memcg)) 4300 break; 4301 4302 /* the caller might be holding the lock for write */ 4303 if (mmap_read_trylock(mm)) { 4304 err = walk_page_range(mm, walk->next_addr, ULONG_MAX, &mm_walk_ops, walk); 4305 4306 mmap_read_unlock(mm); 4307 } 4308 4309 mem_cgroup_unlock_pages(); 4310 4311 if (walk->batched) { 4312 spin_lock_irq(&lruvec->lru_lock); 4313 reset_batch_size(lruvec, walk); 4314 spin_unlock_irq(&lruvec->lru_lock); 4315 } 4316 4317 cond_resched(); 4318 } while (err == -EAGAIN); 4319 } 4320 4321 static struct lru_gen_mm_walk *set_mm_walk(struct pglist_data *pgdat, bool force_alloc) 4322 { 4323 struct lru_gen_mm_walk *walk = current->reclaim_state->mm_walk; 4324 4325 if (pgdat && current_is_kswapd()) { 4326 VM_WARN_ON_ONCE(walk); 4327 4328 walk = &pgdat->mm_walk; 4329 } else if (!walk && force_alloc) { 4330 VM_WARN_ON_ONCE(current_is_kswapd()); 4331 4332 walk = kzalloc(sizeof(*walk), __GFP_HIGH | __GFP_NOMEMALLOC | __GFP_NOWARN); 4333 } 4334 4335 current->reclaim_state->mm_walk = walk; 4336 4337 return walk; 4338 } 4339 4340 static void clear_mm_walk(void) 4341 { 4342 struct lru_gen_mm_walk *walk = current->reclaim_state->mm_walk; 4343 4344 VM_WARN_ON_ONCE(walk && memchr_inv(walk->nr_pages, 0, sizeof(walk->nr_pages))); 4345 VM_WARN_ON_ONCE(walk && memchr_inv(walk->mm_stats, 0, sizeof(walk->mm_stats))); 4346 4347 current->reclaim_state->mm_walk = NULL; 4348 4349 if (!current_is_kswapd()) 4350 kfree(walk); 4351 } 4352 4353 static bool inc_min_seq(struct lruvec *lruvec, int type, bool can_swap) 4354 { 4355 int zone; 4356 int remaining = MAX_LRU_BATCH; 4357 struct lru_gen_folio *lrugen = &lruvec->lrugen; 4358 int new_gen, old_gen = lru_gen_from_seq(lrugen->min_seq[type]); 4359 4360 if (type == LRU_GEN_ANON && !can_swap) 4361 goto done; 4362 4363 /* prevent cold/hot inversion if force_scan is true */ 4364 for (zone = 0; zone < MAX_NR_ZONES; zone++) { 4365 struct list_head *head = &lrugen->folios[old_gen][type][zone]; 4366 4367 while (!list_empty(head)) { 4368 struct folio *folio = lru_to_folio(head); 4369 4370 VM_WARN_ON_ONCE_FOLIO(folio_test_unevictable(folio), folio); 4371 VM_WARN_ON_ONCE_FOLIO(folio_test_active(folio), folio); 4372 VM_WARN_ON_ONCE_FOLIO(folio_is_file_lru(folio) != type, folio); 4373 VM_WARN_ON_ONCE_FOLIO(folio_zonenum(folio) != zone, folio); 4374 4375 new_gen = folio_inc_gen(lruvec, folio, false); 4376 list_move_tail(&folio->lru, &lrugen->folios[new_gen][type][zone]); 4377 4378 if (!--remaining) 4379 return false; 4380 } 4381 } 4382 done: 4383 reset_ctrl_pos(lruvec, type, true); 4384 WRITE_ONCE(lrugen->min_seq[type], lrugen->min_seq[type] + 1); 4385 4386 return true; 4387 } 4388 4389 static bool try_to_inc_min_seq(struct lruvec *lruvec, bool can_swap) 4390 { 4391 int gen, type, zone; 4392 bool success = false; 4393 struct lru_gen_folio *lrugen = &lruvec->lrugen; 4394 DEFINE_MIN_SEQ(lruvec); 4395 4396 VM_WARN_ON_ONCE(!seq_is_valid(lruvec)); 4397 4398 /* find the oldest populated generation */ 4399 for (type = !can_swap; type < ANON_AND_FILE; type++) { 4400 while (min_seq[type] + MIN_NR_GENS <= lrugen->max_seq) { 4401 gen = lru_gen_from_seq(min_seq[type]); 4402 4403 for (zone = 0; zone < MAX_NR_ZONES; zone++) { 4404 if (!list_empty(&lrugen->folios[gen][type][zone])) 4405 goto next; 4406 } 4407 4408 min_seq[type]++; 4409 } 4410 next: 4411 ; 4412 } 4413 4414 /* see the comment on lru_gen_folio */ 4415 if (can_swap) { 4416 min_seq[LRU_GEN_ANON] = min(min_seq[LRU_GEN_ANON], min_seq[LRU_GEN_FILE]); 4417 min_seq[LRU_GEN_FILE] = max(min_seq[LRU_GEN_ANON], lrugen->min_seq[LRU_GEN_FILE]); 4418 } 4419 4420 for (type = !can_swap; type < ANON_AND_FILE; type++) { 4421 if (min_seq[type] == lrugen->min_seq[type]) 4422 continue; 4423 4424 reset_ctrl_pos(lruvec, type, true); 4425 WRITE_ONCE(lrugen->min_seq[type], min_seq[type]); 4426 success = true; 4427 } 4428 4429 return success; 4430 } 4431 4432 static void inc_max_seq(struct lruvec *lruvec, bool can_swap, bool force_scan) 4433 { 4434 int prev, next; 4435 int type, zone; 4436 struct lru_gen_folio *lrugen = &lruvec->lrugen; 4437 4438 spin_lock_irq(&lruvec->lru_lock); 4439 4440 VM_WARN_ON_ONCE(!seq_is_valid(lruvec)); 4441 4442 for (type = ANON_AND_FILE - 1; type >= 0; type--) { 4443 if (get_nr_gens(lruvec, type) != MAX_NR_GENS) 4444 continue; 4445 4446 VM_WARN_ON_ONCE(!force_scan && (type == LRU_GEN_FILE || can_swap)); 4447 4448 while (!inc_min_seq(lruvec, type, can_swap)) { 4449 spin_unlock_irq(&lruvec->lru_lock); 4450 cond_resched(); 4451 spin_lock_irq(&lruvec->lru_lock); 4452 } 4453 } 4454 4455 /* 4456 * Update the active/inactive LRU sizes for compatibility. Both sides of 4457 * the current max_seq need to be covered, since max_seq+1 can overlap 4458 * with min_seq[LRU_GEN_ANON] if swapping is constrained. And if they do 4459 * overlap, cold/hot inversion happens. 4460 */ 4461 prev = lru_gen_from_seq(lrugen->max_seq - 1); 4462 next = lru_gen_from_seq(lrugen->max_seq + 1); 4463 4464 for (type = 0; type < ANON_AND_FILE; type++) { 4465 for (zone = 0; zone < MAX_NR_ZONES; zone++) { 4466 enum lru_list lru = type * LRU_INACTIVE_FILE; 4467 long delta = lrugen->nr_pages[prev][type][zone] - 4468 lrugen->nr_pages[next][type][zone]; 4469 4470 if (!delta) 4471 continue; 4472 4473 __update_lru_size(lruvec, lru, zone, delta); 4474 __update_lru_size(lruvec, lru + LRU_ACTIVE, zone, -delta); 4475 } 4476 } 4477 4478 for (type = 0; type < ANON_AND_FILE; type++) 4479 reset_ctrl_pos(lruvec, type, false); 4480 4481 WRITE_ONCE(lrugen->timestamps[next], jiffies); 4482 /* make sure preceding modifications appear */ 4483 smp_store_release(&lrugen->max_seq, lrugen->max_seq + 1); 4484 4485 spin_unlock_irq(&lruvec->lru_lock); 4486 } 4487 4488 static bool try_to_inc_max_seq(struct lruvec *lruvec, unsigned long max_seq, 4489 struct scan_control *sc, bool can_swap, bool force_scan) 4490 { 4491 bool success; 4492 struct lru_gen_mm_walk *walk; 4493 struct mm_struct *mm = NULL; 4494 struct lru_gen_folio *lrugen = &lruvec->lrugen; 4495 4496 VM_WARN_ON_ONCE(max_seq > READ_ONCE(lrugen->max_seq)); 4497 4498 /* see the comment in iterate_mm_list() */ 4499 if (max_seq <= READ_ONCE(lruvec->mm_state.seq)) { 4500 success = false; 4501 goto done; 4502 } 4503 4504 /* 4505 * If the hardware doesn't automatically set the accessed bit, fallback 4506 * to lru_gen_look_around(), which only clears the accessed bit in a 4507 * handful of PTEs. Spreading the work out over a period of time usually 4508 * is less efficient, but it avoids bursty page faults. 4509 */ 4510 if (!should_walk_mmu()) { 4511 success = iterate_mm_list_nowalk(lruvec, max_seq); 4512 goto done; 4513 } 4514 4515 walk = set_mm_walk(NULL, true); 4516 if (!walk) { 4517 success = iterate_mm_list_nowalk(lruvec, max_seq); 4518 goto done; 4519 } 4520 4521 walk->lruvec = lruvec; 4522 walk->max_seq = max_seq; 4523 walk->can_swap = can_swap; 4524 walk->force_scan = force_scan; 4525 4526 do { 4527 success = iterate_mm_list(lruvec, walk, &mm); 4528 if (mm) 4529 walk_mm(lruvec, mm, walk); 4530 } while (mm); 4531 done: 4532 if (success) 4533 inc_max_seq(lruvec, can_swap, force_scan); 4534 4535 return success; 4536 } 4537 4538 /****************************************************************************** 4539 * working set protection 4540 ******************************************************************************/ 4541 4542 static bool lruvec_is_sizable(struct lruvec *lruvec, struct scan_control *sc) 4543 { 4544 int gen, type, zone; 4545 unsigned long total = 0; 4546 bool can_swap = get_swappiness(lruvec, sc); 4547 struct lru_gen_folio *lrugen = &lruvec->lrugen; 4548 struct mem_cgroup *memcg = lruvec_memcg(lruvec); 4549 DEFINE_MAX_SEQ(lruvec); 4550 DEFINE_MIN_SEQ(lruvec); 4551 4552 for (type = !can_swap; type < ANON_AND_FILE; type++) { 4553 unsigned long seq; 4554 4555 for (seq = min_seq[type]; seq <= max_seq; seq++) { 4556 gen = lru_gen_from_seq(seq); 4557 4558 for (zone = 0; zone < MAX_NR_ZONES; zone++) 4559 total += max(READ_ONCE(lrugen->nr_pages[gen][type][zone]), 0L); 4560 } 4561 } 4562 4563 /* whether the size is big enough to be helpful */ 4564 return mem_cgroup_online(memcg) ? (total >> sc->priority) : total; 4565 } 4566 4567 static bool lruvec_is_reclaimable(struct lruvec *lruvec, struct scan_control *sc, 4568 unsigned long min_ttl) 4569 { 4570 int gen; 4571 unsigned long birth; 4572 struct mem_cgroup *memcg = lruvec_memcg(lruvec); 4573 DEFINE_MIN_SEQ(lruvec); 4574 4575 /* see the comment on lru_gen_folio */ 4576 gen = lru_gen_from_seq(min_seq[LRU_GEN_FILE]); 4577 birth = READ_ONCE(lruvec->lrugen.timestamps[gen]); 4578 4579 if (time_is_after_jiffies(birth + min_ttl)) 4580 return false; 4581 4582 if (!lruvec_is_sizable(lruvec, sc)) 4583 return false; 4584 4585 mem_cgroup_calculate_protection(NULL, memcg); 4586 4587 return !mem_cgroup_below_min(NULL, memcg); 4588 } 4589 4590 /* to protect the working set of the last N jiffies */ 4591 static unsigned long lru_gen_min_ttl __read_mostly; 4592 4593 static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc) 4594 { 4595 struct mem_cgroup *memcg; 4596 unsigned long min_ttl = READ_ONCE(lru_gen_min_ttl); 4597 4598 VM_WARN_ON_ONCE(!current_is_kswapd()); 4599 4600 /* check the order to exclude compaction-induced reclaim */ 4601 if (!min_ttl || sc->order || sc->priority == DEF_PRIORITY) 4602 return; 4603 4604 memcg = mem_cgroup_iter(NULL, NULL, NULL); 4605 do { 4606 struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat); 4607 4608 if (lruvec_is_reclaimable(lruvec, sc, min_ttl)) { 4609 mem_cgroup_iter_break(NULL, memcg); 4610 return; 4611 } 4612 4613 cond_resched(); 4614 } while ((memcg = mem_cgroup_iter(NULL, memcg, NULL))); 4615 4616 /* 4617 * The main goal is to OOM kill if every generation from all memcgs is 4618 * younger than min_ttl. However, another possibility is all memcgs are 4619 * either too small or below min. 4620 */ 4621 if (mutex_trylock(&oom_lock)) { 4622 struct oom_control oc = { 4623 .gfp_mask = sc->gfp_mask, 4624 }; 4625 4626 out_of_memory(&oc); 4627 4628 mutex_unlock(&oom_lock); 4629 } 4630 } 4631 4632 /****************************************************************************** 4633 * rmap/PT walk feedback 4634 ******************************************************************************/ 4635 4636 /* 4637 * This function exploits spatial locality when shrink_folio_list() walks the 4638 * rmap. It scans the adjacent PTEs of a young PTE and promotes hot pages. If 4639 * the scan was done cacheline efficiently, it adds the PMD entry pointing to 4640 * the PTE table to the Bloom filter. This forms a feedback loop between the 4641 * eviction and the aging. 4642 */ 4643 void lru_gen_look_around(struct page_vma_mapped_walk *pvmw) 4644 { 4645 int i; 4646 unsigned long start; 4647 unsigned long end; 4648 struct lru_gen_mm_walk *walk; 4649 int young = 0; 4650 pte_t *pte = pvmw->pte; 4651 unsigned long addr = pvmw->address; 4652 struct folio *folio = pfn_folio(pvmw->pfn); 4653 struct mem_cgroup *memcg = folio_memcg(folio); 4654 struct pglist_data *pgdat = folio_pgdat(folio); 4655 struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat); 4656 DEFINE_MAX_SEQ(lruvec); 4657 int old_gen, new_gen = lru_gen_from_seq(max_seq); 4658 4659 lockdep_assert_held(pvmw->ptl); 4660 VM_WARN_ON_ONCE_FOLIO(folio_test_lru(folio), folio); 4661 4662 if (spin_is_contended(pvmw->ptl)) 4663 return; 4664 4665 /* avoid taking the LRU lock under the PTL when possible */ 4666 walk = current->reclaim_state ? current->reclaim_state->mm_walk : NULL; 4667 4668 start = max(addr & PMD_MASK, pvmw->vma->vm_start); 4669 end = min(addr | ~PMD_MASK, pvmw->vma->vm_end - 1) + 1; 4670 4671 if (end - start > MIN_LRU_BATCH * PAGE_SIZE) { 4672 if (addr - start < MIN_LRU_BATCH * PAGE_SIZE / 2) 4673 end = start + MIN_LRU_BATCH * PAGE_SIZE; 4674 else if (end - addr < MIN_LRU_BATCH * PAGE_SIZE / 2) 4675 start = end - MIN_LRU_BATCH * PAGE_SIZE; 4676 else { 4677 start = addr - MIN_LRU_BATCH * PAGE_SIZE / 2; 4678 end = addr + MIN_LRU_BATCH * PAGE_SIZE / 2; 4679 } 4680 } 4681 4682 /* folio_update_gen() requires stable folio_memcg() */ 4683 if (!mem_cgroup_trylock_pages(memcg)) 4684 return; 4685 4686 arch_enter_lazy_mmu_mode(); 4687 4688 pte -= (addr - start) / PAGE_SIZE; 4689 4690 for (i = 0, addr = start; addr != end; i++, addr += PAGE_SIZE) { 4691 unsigned long pfn; 4692 pte_t ptent = ptep_get(pte + i); 4693 4694 pfn = get_pte_pfn(ptent, pvmw->vma, addr); 4695 if (pfn == -1) 4696 continue; 4697 4698 if (!pte_young(ptent)) 4699 continue; 4700 4701 folio = get_pfn_folio(pfn, memcg, pgdat, !walk || walk->can_swap); 4702 if (!folio) 4703 continue; 4704 4705 if (!ptep_test_and_clear_young(pvmw->vma, addr, pte + i)) 4706 VM_WARN_ON_ONCE(true); 4707 4708 young++; 4709 4710 if (pte_dirty(ptent) && !folio_test_dirty(folio) && 4711 !(folio_test_anon(folio) && folio_test_swapbacked(folio) && 4712 !folio_test_swapcache(folio))) 4713 folio_mark_dirty(folio); 4714 4715 if (walk) { 4716 old_gen = folio_update_gen(folio, new_gen); 4717 if (old_gen >= 0 && old_gen != new_gen) 4718 update_batch_size(walk, folio, old_gen, new_gen); 4719 4720 continue; 4721 } 4722 4723 old_gen = folio_lru_gen(folio); 4724 if (old_gen < 0) 4725 folio_set_referenced(folio); 4726 else if (old_gen != new_gen) 4727 folio_activate(folio); 4728 } 4729 4730 arch_leave_lazy_mmu_mode(); 4731 mem_cgroup_unlock_pages(); 4732 4733 /* feedback from rmap walkers to page table walkers */ 4734 if (suitable_to_scan(i, young)) 4735 update_bloom_filter(lruvec, max_seq, pvmw->pmd); 4736 } 4737 4738 /****************************************************************************** 4739 * memcg LRU 4740 ******************************************************************************/ 4741 4742 /* see the comment on MEMCG_NR_GENS */ 4743 enum { 4744 MEMCG_LRU_NOP, 4745 MEMCG_LRU_HEAD, 4746 MEMCG_LRU_TAIL, 4747 MEMCG_LRU_OLD, 4748 MEMCG_LRU_YOUNG, 4749 }; 4750 4751 #ifdef CONFIG_MEMCG 4752 4753 static int lru_gen_memcg_seg(struct lruvec *lruvec) 4754 { 4755 return READ_ONCE(lruvec->lrugen.seg); 4756 } 4757 4758 static void lru_gen_rotate_memcg(struct lruvec *lruvec, int op) 4759 { 4760 int seg; 4761 int old, new; 4762 unsigned long flags; 4763 int bin = get_random_u32_below(MEMCG_NR_BINS); 4764 struct pglist_data *pgdat = lruvec_pgdat(lruvec); 4765 4766 spin_lock_irqsave(&pgdat->memcg_lru.lock, flags); 4767 4768 VM_WARN_ON_ONCE(hlist_nulls_unhashed(&lruvec->lrugen.list)); 4769 4770 seg = 0; 4771 new = old = lruvec->lrugen.gen; 4772 4773 /* see the comment on MEMCG_NR_GENS */ 4774 if (op == MEMCG_LRU_HEAD) 4775 seg = MEMCG_LRU_HEAD; 4776 else if (op == MEMCG_LRU_TAIL) 4777 seg = MEMCG_LRU_TAIL; 4778 else if (op == MEMCG_LRU_OLD) 4779 new = get_memcg_gen(pgdat->memcg_lru.seq); 4780 else if (op == MEMCG_LRU_YOUNG) 4781 new = get_memcg_gen(pgdat->memcg_lru.seq + 1); 4782 else 4783 VM_WARN_ON_ONCE(true); 4784 4785 hlist_nulls_del_rcu(&lruvec->lrugen.list); 4786 4787 if (op == MEMCG_LRU_HEAD || op == MEMCG_LRU_OLD) 4788 hlist_nulls_add_head_rcu(&lruvec->lrugen.list, &pgdat->memcg_lru.fifo[new][bin]); 4789 else 4790 hlist_nulls_add_tail_rcu(&lruvec->lrugen.list, &pgdat->memcg_lru.fifo[new][bin]); 4791 4792 pgdat->memcg_lru.nr_memcgs[old]--; 4793 pgdat->memcg_lru.nr_memcgs[new]++; 4794 4795 lruvec->lrugen.gen = new; 4796 WRITE_ONCE(lruvec->lrugen.seg, seg); 4797 4798 if (!pgdat->memcg_lru.nr_memcgs[old] && old == get_memcg_gen(pgdat->memcg_lru.seq)) 4799 WRITE_ONCE(pgdat->memcg_lru.seq, pgdat->memcg_lru.seq + 1); 4800 4801 spin_unlock_irqrestore(&pgdat->memcg_lru.lock, flags); 4802 } 4803 4804 void lru_gen_online_memcg(struct mem_cgroup *memcg) 4805 { 4806 int gen; 4807 int nid; 4808 int bin = get_random_u32_below(MEMCG_NR_BINS); 4809 4810 for_each_node(nid) { 4811 struct pglist_data *pgdat = NODE_DATA(nid); 4812 struct lruvec *lruvec = get_lruvec(memcg, nid); 4813 4814 spin_lock_irq(&pgdat->memcg_lru.lock); 4815 4816 VM_WARN_ON_ONCE(!hlist_nulls_unhashed(&lruvec->lrugen.list)); 4817 4818 gen = get_memcg_gen(pgdat->memcg_lru.seq); 4819 4820 hlist_nulls_add_tail_rcu(&lruvec->lrugen.list, &pgdat->memcg_lru.fifo[gen][bin]); 4821 pgdat->memcg_lru.nr_memcgs[gen]++; 4822 4823 lruvec->lrugen.gen = gen; 4824 4825 spin_unlock_irq(&pgdat->memcg_lru.lock); 4826 } 4827 } 4828 4829 void lru_gen_offline_memcg(struct mem_cgroup *memcg) 4830 { 4831 int nid; 4832 4833 for_each_node(nid) { 4834 struct lruvec *lruvec = get_lruvec(memcg, nid); 4835 4836 lru_gen_rotate_memcg(lruvec, MEMCG_LRU_OLD); 4837 } 4838 } 4839 4840 void lru_gen_release_memcg(struct mem_cgroup *memcg) 4841 { 4842 int gen; 4843 int nid; 4844 4845 for_each_node(nid) { 4846 struct pglist_data *pgdat = NODE_DATA(nid); 4847 struct lruvec *lruvec = get_lruvec(memcg, nid); 4848 4849 spin_lock_irq(&pgdat->memcg_lru.lock); 4850 4851 VM_WARN_ON_ONCE(hlist_nulls_unhashed(&lruvec->lrugen.list)); 4852 4853 gen = lruvec->lrugen.gen; 4854 4855 hlist_nulls_del_rcu(&lruvec->lrugen.list); 4856 pgdat->memcg_lru.nr_memcgs[gen]--; 4857 4858 if (!pgdat->memcg_lru.nr_memcgs[gen] && gen == get_memcg_gen(pgdat->memcg_lru.seq)) 4859 WRITE_ONCE(pgdat->memcg_lru.seq, pgdat->memcg_lru.seq + 1); 4860 4861 spin_unlock_irq(&pgdat->memcg_lru.lock); 4862 } 4863 } 4864 4865 void lru_gen_soft_reclaim(struct mem_cgroup *memcg, int nid) 4866 { 4867 struct lruvec *lruvec = get_lruvec(memcg, nid); 4868 4869 /* see the comment on MEMCG_NR_GENS */ 4870 if (lru_gen_memcg_seg(lruvec) != MEMCG_LRU_HEAD) 4871 lru_gen_rotate_memcg(lruvec, MEMCG_LRU_HEAD); 4872 } 4873 4874 #else /* !CONFIG_MEMCG */ 4875 4876 static int lru_gen_memcg_seg(struct lruvec *lruvec) 4877 { 4878 return 0; 4879 } 4880 4881 #endif 4882 4883 /****************************************************************************** 4884 * the eviction 4885 ******************************************************************************/ 4886 4887 static bool sort_folio(struct lruvec *lruvec, struct folio *folio, int tier_idx) 4888 { 4889 bool success; 4890 int gen = folio_lru_gen(folio); 4891 int type = folio_is_file_lru(folio); 4892 int zone = folio_zonenum(folio); 4893 int delta = folio_nr_pages(folio); 4894 int refs = folio_lru_refs(folio); 4895 int tier = lru_tier_from_refs(refs); 4896 struct lru_gen_folio *lrugen = &lruvec->lrugen; 4897 4898 VM_WARN_ON_ONCE_FOLIO(gen >= MAX_NR_GENS, folio); 4899 4900 /* unevictable */ 4901 if (!folio_evictable(folio)) { 4902 success = lru_gen_del_folio(lruvec, folio, true); 4903 VM_WARN_ON_ONCE_FOLIO(!success, folio); 4904 folio_set_unevictable(folio); 4905 lruvec_add_folio(lruvec, folio); 4906 __count_vm_events(UNEVICTABLE_PGCULLED, delta); 4907 return true; 4908 } 4909 4910 /* dirty lazyfree */ 4911 if (type == LRU_GEN_FILE && folio_test_anon(folio) && folio_test_dirty(folio)) { 4912 success = lru_gen_del_folio(lruvec, folio, true); 4913 VM_WARN_ON_ONCE_FOLIO(!success, folio); 4914 folio_set_swapbacked(folio); 4915 lruvec_add_folio_tail(lruvec, folio); 4916 return true; 4917 } 4918 4919 /* promoted */ 4920 if (gen != lru_gen_from_seq(lrugen->min_seq[type])) { 4921 list_move(&folio->lru, &lrugen->folios[gen][type][zone]); 4922 return true; 4923 } 4924 4925 /* protected */ 4926 if (tier > tier_idx) { 4927 int hist = lru_hist_from_seq(lrugen->min_seq[type]); 4928 4929 gen = folio_inc_gen(lruvec, folio, false); 4930 list_move_tail(&folio->lru, &lrugen->folios[gen][type][zone]); 4931 4932 WRITE_ONCE(lrugen->protected[hist][type][tier - 1], 4933 lrugen->protected[hist][type][tier - 1] + delta); 4934 return true; 4935 } 4936 4937 /* waiting for writeback */ 4938 if (folio_test_locked(folio) || folio_test_writeback(folio) || 4939 (type == LRU_GEN_FILE && folio_test_dirty(folio))) { 4940 gen = folio_inc_gen(lruvec, folio, true); 4941 list_move(&folio->lru, &lrugen->folios[gen][type][zone]); 4942 return true; 4943 } 4944 4945 return false; 4946 } 4947 4948 static bool isolate_folio(struct lruvec *lruvec, struct folio *folio, struct scan_control *sc) 4949 { 4950 bool success; 4951 4952 /* swapping inhibited */ 4953 if (!(sc->gfp_mask & __GFP_IO) && 4954 (folio_test_dirty(folio) || 4955 (folio_test_anon(folio) && !folio_test_swapcache(folio)))) 4956 return false; 4957 4958 /* raced with release_pages() */ 4959 if (!folio_try_get(folio)) 4960 return false; 4961 4962 /* raced with another isolation */ 4963 if (!folio_test_clear_lru(folio)) { 4964 folio_put(folio); 4965 return false; 4966 } 4967 4968 /* see the comment on MAX_NR_TIERS */ 4969 if (!folio_test_referenced(folio)) 4970 set_mask_bits(&folio->flags, LRU_REFS_MASK | LRU_REFS_FLAGS, 0); 4971 4972 /* for shrink_folio_list() */ 4973 folio_clear_reclaim(folio); 4974 folio_clear_referenced(folio); 4975 4976 success = lru_gen_del_folio(lruvec, folio, true); 4977 VM_WARN_ON_ONCE_FOLIO(!success, folio); 4978 4979 return true; 4980 } 4981 4982 static int scan_folios(struct lruvec *lruvec, struct scan_control *sc, 4983 int type, int tier, struct list_head *list) 4984 { 4985 int gen, zone; 4986 enum vm_event_item item; 4987 int sorted = 0; 4988 int scanned = 0; 4989 int isolated = 0; 4990 int remaining = MAX_LRU_BATCH; 4991 struct lru_gen_folio *lrugen = &lruvec->lrugen; 4992 struct mem_cgroup *memcg = lruvec_memcg(lruvec); 4993 4994 VM_WARN_ON_ONCE(!list_empty(list)); 4995 4996 if (get_nr_gens(lruvec, type) == MIN_NR_GENS) 4997 return 0; 4998 4999 gen = lru_gen_from_seq(lrugen->min_seq[type]); 5000 5001 for (zone = sc->reclaim_idx; zone >= 0; zone--) { 5002 LIST_HEAD(moved); 5003 int skipped = 0; 5004 struct list_head *head = &lrugen->folios[gen][type][zone]; 5005 5006 while (!list_empty(head)) { 5007 struct folio *folio = lru_to_folio(head); 5008 int delta = folio_nr_pages(folio); 5009 5010 VM_WARN_ON_ONCE_FOLIO(folio_test_unevictable(folio), folio); 5011 VM_WARN_ON_ONCE_FOLIO(folio_test_active(folio), folio); 5012 VM_WARN_ON_ONCE_FOLIO(folio_is_file_lru(folio) != type, folio); 5013 VM_WARN_ON_ONCE_FOLIO(folio_zonenum(folio) != zone, folio); 5014 5015 scanned += delta; 5016 5017 if (sort_folio(lruvec, folio, tier)) 5018 sorted += delta; 5019 else if (isolate_folio(lruvec, folio, sc)) { 5020 list_add(&folio->lru, list); 5021 isolated += delta; 5022 } else { 5023 list_move(&folio->lru, &moved); 5024 skipped += delta; 5025 } 5026 5027 if (!--remaining || max(isolated, skipped) >= MIN_LRU_BATCH) 5028 break; 5029 } 5030 5031 if (skipped) { 5032 list_splice(&moved, head); 5033 __count_zid_vm_events(PGSCAN_SKIP, zone, skipped); 5034 } 5035 5036 if (!remaining || isolated >= MIN_LRU_BATCH) 5037 break; 5038 } 5039 5040 item = PGSCAN_KSWAPD + reclaimer_offset(); 5041 if (!cgroup_reclaim(sc)) { 5042 __count_vm_events(item, isolated); 5043 __count_vm_events(PGREFILL, sorted); 5044 } 5045 __count_memcg_events(memcg, item, isolated); 5046 __count_memcg_events(memcg, PGREFILL, sorted); 5047 __count_vm_events(PGSCAN_ANON + type, isolated); 5048 5049 /* 5050 * There might not be eligible folios due to reclaim_idx. Check the 5051 * remaining to prevent livelock if it's not making progress. 5052 */ 5053 return isolated || !remaining ? scanned : 0; 5054 } 5055 5056 static int get_tier_idx(struct lruvec *lruvec, int type) 5057 { 5058 int tier; 5059 struct ctrl_pos sp, pv; 5060 5061 /* 5062 * To leave a margin for fluctuations, use a larger gain factor (1:2). 5063 * This value is chosen because any other tier would have at least twice 5064 * as many refaults as the first tier. 5065 */ 5066 read_ctrl_pos(lruvec, type, 0, 1, &sp); 5067 for (tier = 1; tier < MAX_NR_TIERS; tier++) { 5068 read_ctrl_pos(lruvec, type, tier, 2, &pv); 5069 if (!positive_ctrl_err(&sp, &pv)) 5070 break; 5071 } 5072 5073 return tier - 1; 5074 } 5075 5076 static int get_type_to_scan(struct lruvec *lruvec, int swappiness, int *tier_idx) 5077 { 5078 int type, tier; 5079 struct ctrl_pos sp, pv; 5080 int gain[ANON_AND_FILE] = { swappiness, 200 - swappiness }; 5081 5082 /* 5083 * Compare the first tier of anon with that of file to determine which 5084 * type to scan. Also need to compare other tiers of the selected type 5085 * with the first tier of the other type to determine the last tier (of 5086 * the selected type) to evict. 5087 */ 5088 read_ctrl_pos(lruvec, LRU_GEN_ANON, 0, gain[LRU_GEN_ANON], &sp); 5089 read_ctrl_pos(lruvec, LRU_GEN_FILE, 0, gain[LRU_GEN_FILE], &pv); 5090 type = positive_ctrl_err(&sp, &pv); 5091 5092 read_ctrl_pos(lruvec, !type, 0, gain[!type], &sp); 5093 for (tier = 1; tier < MAX_NR_TIERS; tier++) { 5094 read_ctrl_pos(lruvec, type, tier, gain[type], &pv); 5095 if (!positive_ctrl_err(&sp, &pv)) 5096 break; 5097 } 5098 5099 *tier_idx = tier - 1; 5100 5101 return type; 5102 } 5103 5104 static int isolate_folios(struct lruvec *lruvec, struct scan_control *sc, int swappiness, 5105 int *type_scanned, struct list_head *list) 5106 { 5107 int i; 5108 int type; 5109 int scanned; 5110 int tier = -1; 5111 DEFINE_MIN_SEQ(lruvec); 5112 5113 /* 5114 * Try to make the obvious choice first. When anon and file are both 5115 * available from the same generation, interpret swappiness 1 as file 5116 * first and 200 as anon first. 5117 */ 5118 if (!swappiness) 5119 type = LRU_GEN_FILE; 5120 else if (min_seq[LRU_GEN_ANON] < min_seq[LRU_GEN_FILE]) 5121 type = LRU_GEN_ANON; 5122 else if (swappiness == 1) 5123 type = LRU_GEN_FILE; 5124 else if (swappiness == 200) 5125 type = LRU_GEN_ANON; 5126 else 5127 type = get_type_to_scan(lruvec, swappiness, &tier); 5128 5129 for (i = !swappiness; i < ANON_AND_FILE; i++) { 5130 if (tier < 0) 5131 tier = get_tier_idx(lruvec, type); 5132 5133 scanned = scan_folios(lruvec, sc, type, tier, list); 5134 if (scanned) 5135 break; 5136 5137 type = !type; 5138 tier = -1; 5139 } 5140 5141 *type_scanned = type; 5142 5143 return scanned; 5144 } 5145 5146 static int evict_folios(struct lruvec *lruvec, struct scan_control *sc, int swappiness) 5147 { 5148 int type; 5149 int scanned; 5150 int reclaimed; 5151 LIST_HEAD(list); 5152 LIST_HEAD(clean); 5153 struct folio *folio; 5154 struct folio *next; 5155 enum vm_event_item item; 5156 struct reclaim_stat stat; 5157 struct lru_gen_mm_walk *walk; 5158 bool skip_retry = false; 5159 struct mem_cgroup *memcg = lruvec_memcg(lruvec); 5160 struct pglist_data *pgdat = lruvec_pgdat(lruvec); 5161 5162 spin_lock_irq(&lruvec->lru_lock); 5163 5164 scanned = isolate_folios(lruvec, sc, swappiness, &type, &list); 5165 5166 scanned += try_to_inc_min_seq(lruvec, swappiness); 5167 5168 if (get_nr_gens(lruvec, !swappiness) == MIN_NR_GENS) 5169 scanned = 0; 5170 5171 spin_unlock_irq(&lruvec->lru_lock); 5172 5173 if (list_empty(&list)) 5174 return scanned; 5175 retry: 5176 reclaimed = shrink_folio_list(&list, pgdat, sc, &stat, false); 5177 sc->nr_reclaimed += reclaimed; 5178 5179 list_for_each_entry_safe_reverse(folio, next, &list, lru) { 5180 if (!folio_evictable(folio)) { 5181 list_del(&folio->lru); 5182 folio_putback_lru(folio); 5183 continue; 5184 } 5185 5186 if (folio_test_reclaim(folio) && 5187 (folio_test_dirty(folio) || folio_test_writeback(folio))) { 5188 /* restore LRU_REFS_FLAGS cleared by isolate_folio() */ 5189 if (folio_test_workingset(folio)) 5190 folio_set_referenced(folio); 5191 continue; 5192 } 5193 5194 if (skip_retry || folio_test_active(folio) || folio_test_referenced(folio) || 5195 folio_mapped(folio) || folio_test_locked(folio) || 5196 folio_test_dirty(folio) || folio_test_writeback(folio)) { 5197 /* don't add rejected folios to the oldest generation */ 5198 set_mask_bits(&folio->flags, LRU_REFS_MASK | LRU_REFS_FLAGS, 5199 BIT(PG_active)); 5200 continue; 5201 } 5202 5203 /* retry folios that may have missed folio_rotate_reclaimable() */ 5204 list_move(&folio->lru, &clean); 5205 sc->nr_scanned -= folio_nr_pages(folio); 5206 } 5207 5208 spin_lock_irq(&lruvec->lru_lock); 5209 5210 move_folios_to_lru(lruvec, &list); 5211 5212 walk = current->reclaim_state->mm_walk; 5213 if (walk && walk->batched) 5214 reset_batch_size(lruvec, walk); 5215 5216 item = PGSTEAL_KSWAPD + reclaimer_offset(); 5217 if (!cgroup_reclaim(sc)) 5218 __count_vm_events(item, reclaimed); 5219 __count_memcg_events(memcg, item, reclaimed); 5220 __count_vm_events(PGSTEAL_ANON + type, reclaimed); 5221 5222 spin_unlock_irq(&lruvec->lru_lock); 5223 5224 mem_cgroup_uncharge_list(&list); 5225 free_unref_page_list(&list); 5226 5227 INIT_LIST_HEAD(&list); 5228 list_splice_init(&clean, &list); 5229 5230 if (!list_empty(&list)) { 5231 skip_retry = true; 5232 goto retry; 5233 } 5234 5235 return scanned; 5236 } 5237 5238 static bool should_run_aging(struct lruvec *lruvec, unsigned long max_seq, 5239 struct scan_control *sc, bool can_swap, unsigned long *nr_to_scan) 5240 { 5241 int gen, type, zone; 5242 unsigned long old = 0; 5243 unsigned long young = 0; 5244 unsigned long total = 0; 5245 struct lru_gen_folio *lrugen = &lruvec->lrugen; 5246 struct mem_cgroup *memcg = lruvec_memcg(lruvec); 5247 DEFINE_MIN_SEQ(lruvec); 5248 5249 /* whether this lruvec is completely out of cold folios */ 5250 if (min_seq[!can_swap] + MIN_NR_GENS > max_seq) { 5251 *nr_to_scan = 0; 5252 return true; 5253 } 5254 5255 for (type = !can_swap; type < ANON_AND_FILE; type++) { 5256 unsigned long seq; 5257 5258 for (seq = min_seq[type]; seq <= max_seq; seq++) { 5259 unsigned long size = 0; 5260 5261 gen = lru_gen_from_seq(seq); 5262 5263 for (zone = 0; zone < MAX_NR_ZONES; zone++) 5264 size += max(READ_ONCE(lrugen->nr_pages[gen][type][zone]), 0L); 5265 5266 total += size; 5267 if (seq == max_seq) 5268 young += size; 5269 else if (seq + MIN_NR_GENS == max_seq) 5270 old += size; 5271 } 5272 } 5273 5274 /* try to scrape all its memory if this memcg was deleted */ 5275 *nr_to_scan = mem_cgroup_online(memcg) ? (total >> sc->priority) : total; 5276 5277 /* 5278 * The aging tries to be lazy to reduce the overhead, while the eviction 5279 * stalls when the number of generations reaches MIN_NR_GENS. Hence, the 5280 * ideal number of generations is MIN_NR_GENS+1. 5281 */ 5282 if (min_seq[!can_swap] + MIN_NR_GENS < max_seq) 5283 return false; 5284 5285 /* 5286 * It's also ideal to spread pages out evenly, i.e., 1/(MIN_NR_GENS+1) 5287 * of the total number of pages for each generation. A reasonable range 5288 * for this average portion is [1/MIN_NR_GENS, 1/(MIN_NR_GENS+2)]. The 5289 * aging cares about the upper bound of hot pages, while the eviction 5290 * cares about the lower bound of cold pages. 5291 */ 5292 if (young * MIN_NR_GENS > total) 5293 return true; 5294 if (old * (MIN_NR_GENS + 2) < total) 5295 return true; 5296 5297 return false; 5298 } 5299 5300 /* 5301 * For future optimizations: 5302 * 1. Defer try_to_inc_max_seq() to workqueues to reduce latency for memcg 5303 * reclaim. 5304 */ 5305 static long get_nr_to_scan(struct lruvec *lruvec, struct scan_control *sc, bool can_swap) 5306 { 5307 unsigned long nr_to_scan; 5308 struct mem_cgroup *memcg = lruvec_memcg(lruvec); 5309 DEFINE_MAX_SEQ(lruvec); 5310 5311 if (mem_cgroup_below_min(sc->target_mem_cgroup, memcg)) 5312 return 0; 5313 5314 if (!should_run_aging(lruvec, max_seq, sc, can_swap, &nr_to_scan)) 5315 return nr_to_scan; 5316 5317 /* skip the aging path at the default priority */ 5318 if (sc->priority == DEF_PRIORITY) 5319 return nr_to_scan; 5320 5321 /* skip this lruvec as it's low on cold folios */ 5322 return try_to_inc_max_seq(lruvec, max_seq, sc, can_swap, false) ? -1 : 0; 5323 } 5324 5325 static unsigned long get_nr_to_reclaim(struct scan_control *sc) 5326 { 5327 /* don't abort memcg reclaim to ensure fairness */ 5328 if (!global_reclaim(sc)) 5329 return -1; 5330 5331 return max(sc->nr_to_reclaim, compact_gap(sc->order)); 5332 } 5333 5334 static bool try_to_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) 5335 { 5336 long nr_to_scan; 5337 unsigned long scanned = 0; 5338 unsigned long nr_to_reclaim = get_nr_to_reclaim(sc); 5339 int swappiness = get_swappiness(lruvec, sc); 5340 5341 /* clean file folios are more likely to exist */ 5342 if (swappiness && !(sc->gfp_mask & __GFP_IO)) 5343 swappiness = 1; 5344 5345 while (true) { 5346 int delta; 5347 5348 nr_to_scan = get_nr_to_scan(lruvec, sc, swappiness); 5349 if (nr_to_scan <= 0) 5350 break; 5351 5352 delta = evict_folios(lruvec, sc, swappiness); 5353 if (!delta) 5354 break; 5355 5356 scanned += delta; 5357 if (scanned >= nr_to_scan) 5358 break; 5359 5360 if (sc->nr_reclaimed >= nr_to_reclaim) 5361 break; 5362 5363 cond_resched(); 5364 } 5365 5366 /* whether try_to_inc_max_seq() was successful */ 5367 return nr_to_scan < 0; 5368 } 5369 5370 static int shrink_one(struct lruvec *lruvec, struct scan_control *sc) 5371 { 5372 bool success; 5373 unsigned long scanned = sc->nr_scanned; 5374 unsigned long reclaimed = sc->nr_reclaimed; 5375 int seg = lru_gen_memcg_seg(lruvec); 5376 struct mem_cgroup *memcg = lruvec_memcg(lruvec); 5377 struct pglist_data *pgdat = lruvec_pgdat(lruvec); 5378 5379 /* see the comment on MEMCG_NR_GENS */ 5380 if (!lruvec_is_sizable(lruvec, sc)) 5381 return seg != MEMCG_LRU_TAIL ? MEMCG_LRU_TAIL : MEMCG_LRU_YOUNG; 5382 5383 mem_cgroup_calculate_protection(NULL, memcg); 5384 5385 if (mem_cgroup_below_min(NULL, memcg)) 5386 return MEMCG_LRU_YOUNG; 5387 5388 if (mem_cgroup_below_low(NULL, memcg)) { 5389 /* see the comment on MEMCG_NR_GENS */ 5390 if (seg != MEMCG_LRU_TAIL) 5391 return MEMCG_LRU_TAIL; 5392 5393 memcg_memory_event(memcg, MEMCG_LOW); 5394 } 5395 5396 success = try_to_shrink_lruvec(lruvec, sc); 5397 5398 shrink_slab(sc->gfp_mask, pgdat->node_id, memcg, sc->priority); 5399 5400 if (!sc->proactive) 5401 vmpressure(sc->gfp_mask, memcg, false, sc->nr_scanned - scanned, 5402 sc->nr_reclaimed - reclaimed); 5403 5404 flush_reclaim_state(sc); 5405 5406 return success ? MEMCG_LRU_YOUNG : 0; 5407 } 5408 5409 #ifdef CONFIG_MEMCG 5410 5411 static void shrink_many(struct pglist_data *pgdat, struct scan_control *sc) 5412 { 5413 int op; 5414 int gen; 5415 int bin; 5416 int first_bin; 5417 struct lruvec *lruvec; 5418 struct lru_gen_folio *lrugen; 5419 struct mem_cgroup *memcg; 5420 const struct hlist_nulls_node *pos; 5421 unsigned long nr_to_reclaim = get_nr_to_reclaim(sc); 5422 5423 bin = first_bin = get_random_u32_below(MEMCG_NR_BINS); 5424 restart: 5425 op = 0; 5426 memcg = NULL; 5427 gen = get_memcg_gen(READ_ONCE(pgdat->memcg_lru.seq)); 5428 5429 rcu_read_lock(); 5430 5431 hlist_nulls_for_each_entry_rcu(lrugen, pos, &pgdat->memcg_lru.fifo[gen][bin], list) { 5432 if (op) 5433 lru_gen_rotate_memcg(lruvec, op); 5434 5435 mem_cgroup_put(memcg); 5436 5437 lruvec = container_of(lrugen, struct lruvec, lrugen); 5438 memcg = lruvec_memcg(lruvec); 5439 5440 if (!mem_cgroup_tryget(memcg)) { 5441 op = 0; 5442 memcg = NULL; 5443 continue; 5444 } 5445 5446 rcu_read_unlock(); 5447 5448 op = shrink_one(lruvec, sc); 5449 5450 rcu_read_lock(); 5451 5452 if (sc->nr_reclaimed >= nr_to_reclaim) 5453 break; 5454 } 5455 5456 rcu_read_unlock(); 5457 5458 if (op) 5459 lru_gen_rotate_memcg(lruvec, op); 5460 5461 mem_cgroup_put(memcg); 5462 5463 if (sc->nr_reclaimed >= nr_to_reclaim) 5464 return; 5465 5466 /* restart if raced with lru_gen_rotate_memcg() */ 5467 if (gen != get_nulls_value(pos)) 5468 goto restart; 5469 5470 /* try the rest of the bins of the current generation */ 5471 bin = get_memcg_bin(bin + 1); 5472 if (bin != first_bin) 5473 goto restart; 5474 } 5475 5476 static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) 5477 { 5478 struct blk_plug plug; 5479 5480 VM_WARN_ON_ONCE(global_reclaim(sc)); 5481 VM_WARN_ON_ONCE(!sc->may_writepage || !sc->may_unmap); 5482 5483 lru_add_drain(); 5484 5485 blk_start_plug(&plug); 5486 5487 set_mm_walk(NULL, sc->proactive); 5488 5489 if (try_to_shrink_lruvec(lruvec, sc)) 5490 lru_gen_rotate_memcg(lruvec, MEMCG_LRU_YOUNG); 5491 5492 clear_mm_walk(); 5493 5494 blk_finish_plug(&plug); 5495 } 5496 5497 #else /* !CONFIG_MEMCG */ 5498 5499 static void shrink_many(struct pglist_data *pgdat, struct scan_control *sc) 5500 { 5501 BUILD_BUG(); 5502 } 5503 5504 static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) 5505 { 5506 BUILD_BUG(); 5507 } 5508 5509 #endif 5510 5511 static void set_initial_priority(struct pglist_data *pgdat, struct scan_control *sc) 5512 { 5513 int priority; 5514 unsigned long reclaimable; 5515 struct lruvec *lruvec = mem_cgroup_lruvec(NULL, pgdat); 5516 5517 if (sc->priority != DEF_PRIORITY || sc->nr_to_reclaim < MIN_LRU_BATCH) 5518 return; 5519 /* 5520 * Determine the initial priority based on ((total / MEMCG_NR_GENS) >> 5521 * priority) * reclaimed_to_scanned_ratio = nr_to_reclaim, where the 5522 * estimated reclaimed_to_scanned_ratio = inactive / total. 5523 */ 5524 reclaimable = node_page_state(pgdat, NR_INACTIVE_FILE); 5525 if (get_swappiness(lruvec, sc)) 5526 reclaimable += node_page_state(pgdat, NR_INACTIVE_ANON); 5527 5528 reclaimable /= MEMCG_NR_GENS; 5529 5530 /* round down reclaimable and round up sc->nr_to_reclaim */ 5531 priority = fls_long(reclaimable) - 1 - fls_long(sc->nr_to_reclaim - 1); 5532 5533 sc->priority = clamp(priority, 0, DEF_PRIORITY); 5534 } 5535 5536 static void lru_gen_shrink_node(struct pglist_data *pgdat, struct scan_control *sc) 5537 { 5538 struct blk_plug plug; 5539 unsigned long reclaimed = sc->nr_reclaimed; 5540 5541 VM_WARN_ON_ONCE(!global_reclaim(sc)); 5542 5543 /* 5544 * Unmapped clean folios are already prioritized. Scanning for more of 5545 * them is likely futile and can cause high reclaim latency when there 5546 * is a large number of memcgs. 5547 */ 5548 if (!sc->may_writepage || !sc->may_unmap) 5549 goto done; 5550 5551 lru_add_drain(); 5552 5553 blk_start_plug(&plug); 5554 5555 set_mm_walk(pgdat, sc->proactive); 5556 5557 set_initial_priority(pgdat, sc); 5558 5559 if (current_is_kswapd()) 5560 sc->nr_reclaimed = 0; 5561 5562 if (mem_cgroup_disabled()) 5563 shrink_one(&pgdat->__lruvec, sc); 5564 else 5565 shrink_many(pgdat, sc); 5566 5567 if (current_is_kswapd()) 5568 sc->nr_reclaimed += reclaimed; 5569 5570 clear_mm_walk(); 5571 5572 blk_finish_plug(&plug); 5573 done: 5574 /* kswapd should never fail */ 5575 pgdat->kswapd_failures = 0; 5576 } 5577 5578 /****************************************************************************** 5579 * state change 5580 ******************************************************************************/ 5581 5582 static bool __maybe_unused state_is_valid(struct lruvec *lruvec) 5583 { 5584 struct lru_gen_folio *lrugen = &lruvec->lrugen; 5585 5586 if (lrugen->enabled) { 5587 enum lru_list lru; 5588 5589 for_each_evictable_lru(lru) { 5590 if (!list_empty(&lruvec->lists[lru])) 5591 return false; 5592 } 5593 } else { 5594 int gen, type, zone; 5595 5596 for_each_gen_type_zone(gen, type, zone) { 5597 if (!list_empty(&lrugen->folios[gen][type][zone])) 5598 return false; 5599 } 5600 } 5601 5602 return true; 5603 } 5604 5605 static bool fill_evictable(struct lruvec *lruvec) 5606 { 5607 enum lru_list lru; 5608 int remaining = MAX_LRU_BATCH; 5609 5610 for_each_evictable_lru(lru) { 5611 int type = is_file_lru(lru); 5612 bool active = is_active_lru(lru); 5613 struct list_head *head = &lruvec->lists[lru]; 5614 5615 while (!list_empty(head)) { 5616 bool success; 5617 struct folio *folio = lru_to_folio(head); 5618 5619 VM_WARN_ON_ONCE_FOLIO(folio_test_unevictable(folio), folio); 5620 VM_WARN_ON_ONCE_FOLIO(folio_test_active(folio) != active, folio); 5621 VM_WARN_ON_ONCE_FOLIO(folio_is_file_lru(folio) != type, folio); 5622 VM_WARN_ON_ONCE_FOLIO(folio_lru_gen(folio) != -1, folio); 5623 5624 lruvec_del_folio(lruvec, folio); 5625 success = lru_gen_add_folio(lruvec, folio, false); 5626 VM_WARN_ON_ONCE(!success); 5627 5628 if (!--remaining) 5629 return false; 5630 } 5631 } 5632 5633 return true; 5634 } 5635 5636 static bool drain_evictable(struct lruvec *lruvec) 5637 { 5638 int gen, type, zone; 5639 int remaining = MAX_LRU_BATCH; 5640 5641 for_each_gen_type_zone(gen, type, zone) { 5642 struct list_head *head = &lruvec->lrugen.folios[gen][type][zone]; 5643 5644 while (!list_empty(head)) { 5645 bool success; 5646 struct folio *folio = lru_to_folio(head); 5647 5648 VM_WARN_ON_ONCE_FOLIO(folio_test_unevictable(folio), folio); 5649 VM_WARN_ON_ONCE_FOLIO(folio_test_active(folio), folio); 5650 VM_WARN_ON_ONCE_FOLIO(folio_is_file_lru(folio) != type, folio); 5651 VM_WARN_ON_ONCE_FOLIO(folio_zonenum(folio) != zone, folio); 5652 5653 success = lru_gen_del_folio(lruvec, folio, false); 5654 VM_WARN_ON_ONCE(!success); 5655 lruvec_add_folio(lruvec, folio); 5656 5657 if (!--remaining) 5658 return false; 5659 } 5660 } 5661 5662 return true; 5663 } 5664 5665 static void lru_gen_change_state(bool enabled) 5666 { 5667 static DEFINE_MUTEX(state_mutex); 5668 5669 struct mem_cgroup *memcg; 5670 5671 cgroup_lock(); 5672 cpus_read_lock(); 5673 get_online_mems(); 5674 mutex_lock(&state_mutex); 5675 5676 if (enabled == lru_gen_enabled()) 5677 goto unlock; 5678 5679 if (enabled) 5680 static_branch_enable_cpuslocked(&lru_gen_caps[LRU_GEN_CORE]); 5681 else 5682 static_branch_disable_cpuslocked(&lru_gen_caps[LRU_GEN_CORE]); 5683 5684 memcg = mem_cgroup_iter(NULL, NULL, NULL); 5685 do { 5686 int nid; 5687 5688 for_each_node(nid) { 5689 struct lruvec *lruvec = get_lruvec(memcg, nid); 5690 5691 spin_lock_irq(&lruvec->lru_lock); 5692 5693 VM_WARN_ON_ONCE(!seq_is_valid(lruvec)); 5694 VM_WARN_ON_ONCE(!state_is_valid(lruvec)); 5695 5696 lruvec->lrugen.enabled = enabled; 5697 5698 while (!(enabled ? fill_evictable(lruvec) : drain_evictable(lruvec))) { 5699 spin_unlock_irq(&lruvec->lru_lock); 5700 cond_resched(); 5701 spin_lock_irq(&lruvec->lru_lock); 5702 } 5703 5704 spin_unlock_irq(&lruvec->lru_lock); 5705 } 5706 5707 cond_resched(); 5708 } while ((memcg = mem_cgroup_iter(NULL, memcg, NULL))); 5709 unlock: 5710 mutex_unlock(&state_mutex); 5711 put_online_mems(); 5712 cpus_read_unlock(); 5713 cgroup_unlock(); 5714 } 5715 5716 /****************************************************************************** 5717 * sysfs interface 5718 ******************************************************************************/ 5719 5720 static ssize_t min_ttl_ms_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) 5721 { 5722 return sysfs_emit(buf, "%u\n", jiffies_to_msecs(READ_ONCE(lru_gen_min_ttl))); 5723 } 5724 5725 /* see Documentation/admin-guide/mm/multigen_lru.rst for details */ 5726 static ssize_t min_ttl_ms_store(struct kobject *kobj, struct kobj_attribute *attr, 5727 const char *buf, size_t len) 5728 { 5729 unsigned int msecs; 5730 5731 if (kstrtouint(buf, 0, &msecs)) 5732 return -EINVAL; 5733 5734 WRITE_ONCE(lru_gen_min_ttl, msecs_to_jiffies(msecs)); 5735 5736 return len; 5737 } 5738 5739 static struct kobj_attribute lru_gen_min_ttl_attr = __ATTR_RW(min_ttl_ms); 5740 5741 static ssize_t enabled_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) 5742 { 5743 unsigned int caps = 0; 5744 5745 if (get_cap(LRU_GEN_CORE)) 5746 caps |= BIT(LRU_GEN_CORE); 5747 5748 if (should_walk_mmu()) 5749 caps |= BIT(LRU_GEN_MM_WALK); 5750 5751 if (should_clear_pmd_young()) 5752 caps |= BIT(LRU_GEN_NONLEAF_YOUNG); 5753 5754 return sysfs_emit(buf, "0x%04x\n", caps); 5755 } 5756 5757 /* see Documentation/admin-guide/mm/multigen_lru.rst for details */ 5758 static ssize_t enabled_store(struct kobject *kobj, struct kobj_attribute *attr, 5759 const char *buf, size_t len) 5760 { 5761 int i; 5762 unsigned int caps; 5763 5764 if (tolower(*buf) == 'n') 5765 caps = 0; 5766 else if (tolower(*buf) == 'y') 5767 caps = -1; 5768 else if (kstrtouint(buf, 0, &caps)) 5769 return -EINVAL; 5770 5771 for (i = 0; i < NR_LRU_GEN_CAPS; i++) { 5772 bool enabled = caps & BIT(i); 5773 5774 if (i == LRU_GEN_CORE) 5775 lru_gen_change_state(enabled); 5776 else if (enabled) 5777 static_branch_enable(&lru_gen_caps[i]); 5778 else 5779 static_branch_disable(&lru_gen_caps[i]); 5780 } 5781 5782 return len; 5783 } 5784 5785 static struct kobj_attribute lru_gen_enabled_attr = __ATTR_RW(enabled); 5786 5787 static struct attribute *lru_gen_attrs[] = { 5788 &lru_gen_min_ttl_attr.attr, 5789 &lru_gen_enabled_attr.attr, 5790 NULL 5791 }; 5792 5793 static const struct attribute_group lru_gen_attr_group = { 5794 .name = "lru_gen", 5795 .attrs = lru_gen_attrs, 5796 }; 5797 5798 /****************************************************************************** 5799 * debugfs interface 5800 ******************************************************************************/ 5801 5802 static void *lru_gen_seq_start(struct seq_file *m, loff_t *pos) 5803 { 5804 struct mem_cgroup *memcg; 5805 loff_t nr_to_skip = *pos; 5806 5807 m->private = kvmalloc(PATH_MAX, GFP_KERNEL); 5808 if (!m->private) 5809 return ERR_PTR(-ENOMEM); 5810 5811 memcg = mem_cgroup_iter(NULL, NULL, NULL); 5812 do { 5813 int nid; 5814 5815 for_each_node_state(nid, N_MEMORY) { 5816 if (!nr_to_skip--) 5817 return get_lruvec(memcg, nid); 5818 } 5819 } while ((memcg = mem_cgroup_iter(NULL, memcg, NULL))); 5820 5821 return NULL; 5822 } 5823 5824 static void lru_gen_seq_stop(struct seq_file *m, void *v) 5825 { 5826 if (!IS_ERR_OR_NULL(v)) 5827 mem_cgroup_iter_break(NULL, lruvec_memcg(v)); 5828 5829 kvfree(m->private); 5830 m->private = NULL; 5831 } 5832 5833 static void *lru_gen_seq_next(struct seq_file *m, void *v, loff_t *pos) 5834 { 5835 int nid = lruvec_pgdat(v)->node_id; 5836 struct mem_cgroup *memcg = lruvec_memcg(v); 5837 5838 ++*pos; 5839 5840 nid = next_memory_node(nid); 5841 if (nid == MAX_NUMNODES) { 5842 memcg = mem_cgroup_iter(NULL, memcg, NULL); 5843 if (!memcg) 5844 return NULL; 5845 5846 nid = first_memory_node; 5847 } 5848 5849 return get_lruvec(memcg, nid); 5850 } 5851 5852 static void lru_gen_seq_show_full(struct seq_file *m, struct lruvec *lruvec, 5853 unsigned long max_seq, unsigned long *min_seq, 5854 unsigned long seq) 5855 { 5856 int i; 5857 int type, tier; 5858 int hist = lru_hist_from_seq(seq); 5859 struct lru_gen_folio *lrugen = &lruvec->lrugen; 5860 5861 for (tier = 0; tier < MAX_NR_TIERS; tier++) { 5862 seq_printf(m, " %10d", tier); 5863 for (type = 0; type < ANON_AND_FILE; type++) { 5864 const char *s = " "; 5865 unsigned long n[3] = {}; 5866 5867 if (seq == max_seq) { 5868 s = "RT "; 5869 n[0] = READ_ONCE(lrugen->avg_refaulted[type][tier]); 5870 n[1] = READ_ONCE(lrugen->avg_total[type][tier]); 5871 } else if (seq == min_seq[type] || NR_HIST_GENS > 1) { 5872 s = "rep"; 5873 n[0] = atomic_long_read(&lrugen->refaulted[hist][type][tier]); 5874 n[1] = atomic_long_read(&lrugen->evicted[hist][type][tier]); 5875 if (tier) 5876 n[2] = READ_ONCE(lrugen->protected[hist][type][tier - 1]); 5877 } 5878 5879 for (i = 0; i < 3; i++) 5880 seq_printf(m, " %10lu%c", n[i], s[i]); 5881 } 5882 seq_putc(m, '\n'); 5883 } 5884 5885 seq_puts(m, " "); 5886 for (i = 0; i < NR_MM_STATS; i++) { 5887 const char *s = " "; 5888 unsigned long n = 0; 5889 5890 if (seq == max_seq && NR_HIST_GENS == 1) { 5891 s = "LOYNFA"; 5892 n = READ_ONCE(lruvec->mm_state.stats[hist][i]); 5893 } else if (seq != max_seq && NR_HIST_GENS > 1) { 5894 s = "loynfa"; 5895 n = READ_ONCE(lruvec->mm_state.stats[hist][i]); 5896 } 5897 5898 seq_printf(m, " %10lu%c", n, s[i]); 5899 } 5900 seq_putc(m, '\n'); 5901 } 5902 5903 /* see Documentation/admin-guide/mm/multigen_lru.rst for details */ 5904 static int lru_gen_seq_show(struct seq_file *m, void *v) 5905 { 5906 unsigned long seq; 5907 bool full = !debugfs_real_fops(m->file)->write; 5908 struct lruvec *lruvec = v; 5909 struct lru_gen_folio *lrugen = &lruvec->lrugen; 5910 int nid = lruvec_pgdat(lruvec)->node_id; 5911 struct mem_cgroup *memcg = lruvec_memcg(lruvec); 5912 DEFINE_MAX_SEQ(lruvec); 5913 DEFINE_MIN_SEQ(lruvec); 5914 5915 if (nid == first_memory_node) { 5916 const char *path = memcg ? m->private : ""; 5917 5918 #ifdef CONFIG_MEMCG 5919 if (memcg) 5920 cgroup_path(memcg->css.cgroup, m->private, PATH_MAX); 5921 #endif 5922 seq_printf(m, "memcg %5hu %s\n", mem_cgroup_id(memcg), path); 5923 } 5924 5925 seq_printf(m, " node %5d\n", nid); 5926 5927 if (!full) 5928 seq = min_seq[LRU_GEN_ANON]; 5929 else if (max_seq >= MAX_NR_GENS) 5930 seq = max_seq - MAX_NR_GENS + 1; 5931 else 5932 seq = 0; 5933 5934 for (; seq <= max_seq; seq++) { 5935 int type, zone; 5936 int gen = lru_gen_from_seq(seq); 5937 unsigned long birth = READ_ONCE(lruvec->lrugen.timestamps[gen]); 5938 5939 seq_printf(m, " %10lu %10u", seq, jiffies_to_msecs(jiffies - birth)); 5940 5941 for (type = 0; type < ANON_AND_FILE; type++) { 5942 unsigned long size = 0; 5943 char mark = full && seq < min_seq[type] ? 'x' : ' '; 5944 5945 for (zone = 0; zone < MAX_NR_ZONES; zone++) 5946 size += max(READ_ONCE(lrugen->nr_pages[gen][type][zone]), 0L); 5947 5948 seq_printf(m, " %10lu%c", size, mark); 5949 } 5950 5951 seq_putc(m, '\n'); 5952 5953 if (full) 5954 lru_gen_seq_show_full(m, lruvec, max_seq, min_seq, seq); 5955 } 5956 5957 return 0; 5958 } 5959 5960 static const struct seq_operations lru_gen_seq_ops = { 5961 .start = lru_gen_seq_start, 5962 .stop = lru_gen_seq_stop, 5963 .next = lru_gen_seq_next, 5964 .show = lru_gen_seq_show, 5965 }; 5966 5967 static int run_aging(struct lruvec *lruvec, unsigned long seq, struct scan_control *sc, 5968 bool can_swap, bool force_scan) 5969 { 5970 DEFINE_MAX_SEQ(lruvec); 5971 DEFINE_MIN_SEQ(lruvec); 5972 5973 if (seq < max_seq) 5974 return 0; 5975 5976 if (seq > max_seq) 5977 return -EINVAL; 5978 5979 if (!force_scan && min_seq[!can_swap] + MAX_NR_GENS - 1 <= max_seq) 5980 return -ERANGE; 5981 5982 try_to_inc_max_seq(lruvec, max_seq, sc, can_swap, force_scan); 5983 5984 return 0; 5985 } 5986 5987 static int run_eviction(struct lruvec *lruvec, unsigned long seq, struct scan_control *sc, 5988 int swappiness, unsigned long nr_to_reclaim) 5989 { 5990 DEFINE_MAX_SEQ(lruvec); 5991 5992 if (seq + MIN_NR_GENS > max_seq) 5993 return -EINVAL; 5994 5995 sc->nr_reclaimed = 0; 5996 5997 while (!signal_pending(current)) { 5998 DEFINE_MIN_SEQ(lruvec); 5999 6000 if (seq < min_seq[!swappiness]) 6001 return 0; 6002 6003 if (sc->nr_reclaimed >= nr_to_reclaim) 6004 return 0; 6005 6006 if (!evict_folios(lruvec, sc, swappiness)) 6007 return 0; 6008 6009 cond_resched(); 6010 } 6011 6012 return -EINTR; 6013 } 6014 6015 static int run_cmd(char cmd, int memcg_id, int nid, unsigned long seq, 6016 struct scan_control *sc, int swappiness, unsigned long opt) 6017 { 6018 struct lruvec *lruvec; 6019 int err = -EINVAL; 6020 struct mem_cgroup *memcg = NULL; 6021 6022 if (nid < 0 || nid >= MAX_NUMNODES || !node_state(nid, N_MEMORY)) 6023 return -EINVAL; 6024 6025 if (!mem_cgroup_disabled()) { 6026 rcu_read_lock(); 6027 6028 memcg = mem_cgroup_from_id(memcg_id); 6029 if (!mem_cgroup_tryget(memcg)) 6030 memcg = NULL; 6031 6032 rcu_read_unlock(); 6033 6034 if (!memcg) 6035 return -EINVAL; 6036 } 6037 6038 if (memcg_id != mem_cgroup_id(memcg)) 6039 goto done; 6040 6041 lruvec = get_lruvec(memcg, nid); 6042 6043 if (swappiness < 0) 6044 swappiness = get_swappiness(lruvec, sc); 6045 else if (swappiness > 200) 6046 goto done; 6047 6048 switch (cmd) { 6049 case '+': 6050 err = run_aging(lruvec, seq, sc, swappiness, opt); 6051 break; 6052 case '-': 6053 err = run_eviction(lruvec, seq, sc, swappiness, opt); 6054 break; 6055 } 6056 done: 6057 mem_cgroup_put(memcg); 6058 6059 return err; 6060 } 6061 6062 /* see Documentation/admin-guide/mm/multigen_lru.rst for details */ 6063 static ssize_t lru_gen_seq_write(struct file *file, const char __user *src, 6064 size_t len, loff_t *pos) 6065 { 6066 void *buf; 6067 char *cur, *next; 6068 unsigned int flags; 6069 struct blk_plug plug; 6070 int err = -EINVAL; 6071 struct scan_control sc = { 6072 .may_writepage = true, 6073 .may_unmap = true, 6074 .may_swap = true, 6075 .reclaim_idx = MAX_NR_ZONES - 1, 6076 .gfp_mask = GFP_KERNEL, 6077 }; 6078 6079 buf = kvmalloc(len + 1, GFP_KERNEL); 6080 if (!buf) 6081 return -ENOMEM; 6082 6083 if (copy_from_user(buf, src, len)) { 6084 kvfree(buf); 6085 return -EFAULT; 6086 } 6087 6088 set_task_reclaim_state(current, &sc.reclaim_state); 6089 flags = memalloc_noreclaim_save(); 6090 blk_start_plug(&plug); 6091 if (!set_mm_walk(NULL, true)) { 6092 err = -ENOMEM; 6093 goto done; 6094 } 6095 6096 next = buf; 6097 next[len] = '\0'; 6098 6099 while ((cur = strsep(&next, ",;\n"))) { 6100 int n; 6101 int end; 6102 char cmd; 6103 unsigned int memcg_id; 6104 unsigned int nid; 6105 unsigned long seq; 6106 unsigned int swappiness = -1; 6107 unsigned long opt = -1; 6108 6109 cur = skip_spaces(cur); 6110 if (!*cur) 6111 continue; 6112 6113 n = sscanf(cur, "%c %u %u %lu %n %u %n %lu %n", &cmd, &memcg_id, &nid, 6114 &seq, &end, &swappiness, &end, &opt, &end); 6115 if (n < 4 || cur[end]) { 6116 err = -EINVAL; 6117 break; 6118 } 6119 6120 err = run_cmd(cmd, memcg_id, nid, seq, &sc, swappiness, opt); 6121 if (err) 6122 break; 6123 } 6124 done: 6125 clear_mm_walk(); 6126 blk_finish_plug(&plug); 6127 memalloc_noreclaim_restore(flags); 6128 set_task_reclaim_state(current, NULL); 6129 6130 kvfree(buf); 6131 6132 return err ? : len; 6133 } 6134 6135 static int lru_gen_seq_open(struct inode *inode, struct file *file) 6136 { 6137 return seq_open(file, &lru_gen_seq_ops); 6138 } 6139 6140 static const struct file_operations lru_gen_rw_fops = { 6141 .open = lru_gen_seq_open, 6142 .read = seq_read, 6143 .write = lru_gen_seq_write, 6144 .llseek = seq_lseek, 6145 .release = seq_release, 6146 }; 6147 6148 static const struct file_operations lru_gen_ro_fops = { 6149 .open = lru_gen_seq_open, 6150 .read = seq_read, 6151 .llseek = seq_lseek, 6152 .release = seq_release, 6153 }; 6154 6155 /****************************************************************************** 6156 * initialization 6157 ******************************************************************************/ 6158 6159 void lru_gen_init_lruvec(struct lruvec *lruvec) 6160 { 6161 int i; 6162 int gen, type, zone; 6163 struct lru_gen_folio *lrugen = &lruvec->lrugen; 6164 6165 lrugen->max_seq = MIN_NR_GENS + 1; 6166 lrugen->enabled = lru_gen_enabled(); 6167 6168 for (i = 0; i <= MIN_NR_GENS + 1; i++) 6169 lrugen->timestamps[i] = jiffies; 6170 6171 for_each_gen_type_zone(gen, type, zone) 6172 INIT_LIST_HEAD(&lrugen->folios[gen][type][zone]); 6173 6174 lruvec->mm_state.seq = MIN_NR_GENS; 6175 } 6176 6177 #ifdef CONFIG_MEMCG 6178 6179 void lru_gen_init_pgdat(struct pglist_data *pgdat) 6180 { 6181 int i, j; 6182 6183 spin_lock_init(&pgdat->memcg_lru.lock); 6184 6185 for (i = 0; i < MEMCG_NR_GENS; i++) { 6186 for (j = 0; j < MEMCG_NR_BINS; j++) 6187 INIT_HLIST_NULLS_HEAD(&pgdat->memcg_lru.fifo[i][j], i); 6188 } 6189 } 6190 6191 void lru_gen_init_memcg(struct mem_cgroup *memcg) 6192 { 6193 INIT_LIST_HEAD(&memcg->mm_list.fifo); 6194 spin_lock_init(&memcg->mm_list.lock); 6195 } 6196 6197 void lru_gen_exit_memcg(struct mem_cgroup *memcg) 6198 { 6199 int i; 6200 int nid; 6201 6202 VM_WARN_ON_ONCE(!list_empty(&memcg->mm_list.fifo)); 6203 6204 for_each_node(nid) { 6205 struct lruvec *lruvec = get_lruvec(memcg, nid); 6206 6207 VM_WARN_ON_ONCE(memchr_inv(lruvec->lrugen.nr_pages, 0, 6208 sizeof(lruvec->lrugen.nr_pages))); 6209 6210 lruvec->lrugen.list.next = LIST_POISON1; 6211 6212 for (i = 0; i < NR_BLOOM_FILTERS; i++) { 6213 bitmap_free(lruvec->mm_state.filters[i]); 6214 lruvec->mm_state.filters[i] = NULL; 6215 } 6216 } 6217 } 6218 6219 #endif /* CONFIG_MEMCG */ 6220 6221 static int __init init_lru_gen(void) 6222 { 6223 BUILD_BUG_ON(MIN_NR_GENS + 1 >= MAX_NR_GENS); 6224 BUILD_BUG_ON(BIT(LRU_GEN_WIDTH) <= MAX_NR_GENS); 6225 6226 if (sysfs_create_group(mm_kobj, &lru_gen_attr_group)) 6227 pr_err("lru_gen: failed to create sysfs group\n"); 6228 6229 debugfs_create_file("lru_gen", 0644, NULL, NULL, &lru_gen_rw_fops); 6230 debugfs_create_file("lru_gen_full", 0444, NULL, NULL, &lru_gen_ro_fops); 6231 6232 return 0; 6233 }; 6234 late_initcall(init_lru_gen); 6235 6236 #else /* !CONFIG_LRU_GEN */ 6237 6238 static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc) 6239 { 6240 } 6241 6242 static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) 6243 { 6244 } 6245 6246 static void lru_gen_shrink_node(struct pglist_data *pgdat, struct scan_control *sc) 6247 { 6248 } 6249 6250 #endif /* CONFIG_LRU_GEN */ 6251 6252 static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) 6253 { 6254 unsigned long nr[NR_LRU_LISTS]; 6255 unsigned long targets[NR_LRU_LISTS]; 6256 unsigned long nr_to_scan; 6257 enum lru_list lru; 6258 unsigned long nr_reclaimed = 0; 6259 unsigned long nr_to_reclaim = sc->nr_to_reclaim; 6260 bool proportional_reclaim; 6261 struct blk_plug plug; 6262 6263 if (lru_gen_enabled() && !global_reclaim(sc)) { 6264 lru_gen_shrink_lruvec(lruvec, sc); 6265 return; 6266 } 6267 6268 get_scan_count(lruvec, sc, nr); 6269 6270 /* Record the original scan target for proportional adjustments later */ 6271 memcpy(targets, nr, sizeof(nr)); 6272 6273 /* 6274 * Global reclaiming within direct reclaim at DEF_PRIORITY is a normal 6275 * event that can occur when there is little memory pressure e.g. 6276 * multiple streaming readers/writers. Hence, we do not abort scanning 6277 * when the requested number of pages are reclaimed when scanning at 6278 * DEF_PRIORITY on the assumption that the fact we are direct 6279 * reclaiming implies that kswapd is not keeping up and it is best to 6280 * do a batch of work at once. For memcg reclaim one check is made to 6281 * abort proportional reclaim if either the file or anon lru has already 6282 * dropped to zero at the first pass. 6283 */ 6284 proportional_reclaim = (!cgroup_reclaim(sc) && !current_is_kswapd() && 6285 sc->priority == DEF_PRIORITY); 6286 6287 blk_start_plug(&plug); 6288 while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] || 6289 nr[LRU_INACTIVE_FILE]) { 6290 unsigned long nr_anon, nr_file, percentage; 6291 unsigned long nr_scanned; 6292 6293 for_each_evictable_lru(lru) { 6294 if (nr[lru]) { 6295 nr_to_scan = min(nr[lru], SWAP_CLUSTER_MAX); 6296 nr[lru] -= nr_to_scan; 6297 6298 nr_reclaimed += shrink_list(lru, nr_to_scan, 6299 lruvec, sc); 6300 } 6301 } 6302 6303 cond_resched(); 6304 6305 if (nr_reclaimed < nr_to_reclaim || proportional_reclaim) 6306 continue; 6307 6308 /* 6309 * For kswapd and memcg, reclaim at least the number of pages 6310 * requested. Ensure that the anon and file LRUs are scanned 6311 * proportionally what was requested by get_scan_count(). We 6312 * stop reclaiming one LRU and reduce the amount scanning 6313 * proportional to the original scan target. 6314 */ 6315 nr_file = nr[LRU_INACTIVE_FILE] + nr[LRU_ACTIVE_FILE]; 6316 nr_anon = nr[LRU_INACTIVE_ANON] + nr[LRU_ACTIVE_ANON]; 6317 6318 /* 6319 * It's just vindictive to attack the larger once the smaller 6320 * has gone to zero. And given the way we stop scanning the 6321 * smaller below, this makes sure that we only make one nudge 6322 * towards proportionality once we've got nr_to_reclaim. 6323 */ 6324 if (!nr_file || !nr_anon) 6325 break; 6326 6327 if (nr_file > nr_anon) { 6328 unsigned long scan_target = targets[LRU_INACTIVE_ANON] + 6329 targets[LRU_ACTIVE_ANON] + 1; 6330 lru = LRU_BASE; 6331 percentage = nr_anon * 100 / scan_target; 6332 } else { 6333 unsigned long scan_target = targets[LRU_INACTIVE_FILE] + 6334 targets[LRU_ACTIVE_FILE] + 1; 6335 lru = LRU_FILE; 6336 percentage = nr_file * 100 / scan_target; 6337 } 6338 6339 /* Stop scanning the smaller of the LRU */ 6340 nr[lru] = 0; 6341 nr[lru + LRU_ACTIVE] = 0; 6342 6343 /* 6344 * Recalculate the other LRU scan count based on its original 6345 * scan target and the percentage scanning already complete 6346 */ 6347 lru = (lru == LRU_FILE) ? LRU_BASE : LRU_FILE; 6348 nr_scanned = targets[lru] - nr[lru]; 6349 nr[lru] = targets[lru] * (100 - percentage) / 100; 6350 nr[lru] -= min(nr[lru], nr_scanned); 6351 6352 lru += LRU_ACTIVE; 6353 nr_scanned = targets[lru] - nr[lru]; 6354 nr[lru] = targets[lru] * (100 - percentage) / 100; 6355 nr[lru] -= min(nr[lru], nr_scanned); 6356 } 6357 blk_finish_plug(&plug); 6358 sc->nr_reclaimed += nr_reclaimed; 6359 6360 /* 6361 * Even if we did not try to evict anon pages at all, we want to 6362 * rebalance the anon lru active/inactive ratio. 6363 */ 6364 if (can_age_anon_pages(lruvec_pgdat(lruvec), sc) && 6365 inactive_is_low(lruvec, LRU_INACTIVE_ANON)) 6366 shrink_active_list(SWAP_CLUSTER_MAX, lruvec, 6367 sc, LRU_ACTIVE_ANON); 6368 } 6369 6370 /* Use reclaim/compaction for costly allocs or under memory pressure */ 6371 static bool in_reclaim_compaction(struct scan_control *sc) 6372 { 6373 if (IS_ENABLED(CONFIG_COMPACTION) && sc->order && 6374 (sc->order > PAGE_ALLOC_COSTLY_ORDER || 6375 sc->priority < DEF_PRIORITY - 2)) 6376 return true; 6377 6378 return false; 6379 } 6380 6381 /* 6382 * Reclaim/compaction is used for high-order allocation requests. It reclaims 6383 * order-0 pages before compacting the zone. should_continue_reclaim() returns 6384 * true if more pages should be reclaimed such that when the page allocator 6385 * calls try_to_compact_pages() that it will have enough free pages to succeed. 6386 * It will give up earlier than that if there is difficulty reclaiming pages. 6387 */ 6388 static inline bool should_continue_reclaim(struct pglist_data *pgdat, 6389 unsigned long nr_reclaimed, 6390 struct scan_control *sc) 6391 { 6392 unsigned long pages_for_compaction; 6393 unsigned long inactive_lru_pages; 6394 int z; 6395 6396 /* If not in reclaim/compaction mode, stop */ 6397 if (!in_reclaim_compaction(sc)) 6398 return false; 6399 6400 /* 6401 * Stop if we failed to reclaim any pages from the last SWAP_CLUSTER_MAX 6402 * number of pages that were scanned. This will return to the caller 6403 * with the risk reclaim/compaction and the resulting allocation attempt 6404 * fails. In the past we have tried harder for __GFP_RETRY_MAYFAIL 6405 * allocations through requiring that the full LRU list has been scanned 6406 * first, by assuming that zero delta of sc->nr_scanned means full LRU 6407 * scan, but that approximation was wrong, and there were corner cases 6408 * where always a non-zero amount of pages were scanned. 6409 */ 6410 if (!nr_reclaimed) 6411 return false; 6412 6413 /* If compaction would go ahead or the allocation would succeed, stop */ 6414 for (z = 0; z <= sc->reclaim_idx; z++) { 6415 struct zone *zone = &pgdat->node_zones[z]; 6416 if (!managed_zone(zone)) 6417 continue; 6418 6419 /* Allocation can already succeed, nothing to do */ 6420 if (zone_watermark_ok(zone, sc->order, min_wmark_pages(zone), 6421 sc->reclaim_idx, 0)) 6422 return false; 6423 6424 if (compaction_suitable(zone, sc->order, sc->reclaim_idx)) 6425 return false; 6426 } 6427 6428 /* 6429 * If we have not reclaimed enough pages for compaction and the 6430 * inactive lists are large enough, continue reclaiming 6431 */ 6432 pages_for_compaction = compact_gap(sc->order); 6433 inactive_lru_pages = node_page_state(pgdat, NR_INACTIVE_FILE); 6434 if (can_reclaim_anon_pages(NULL, pgdat->node_id, sc)) 6435 inactive_lru_pages += node_page_state(pgdat, NR_INACTIVE_ANON); 6436 6437 return inactive_lru_pages > pages_for_compaction; 6438 } 6439 6440 static void shrink_node_memcgs(pg_data_t *pgdat, struct scan_control *sc) 6441 { 6442 struct mem_cgroup *target_memcg = sc->target_mem_cgroup; 6443 struct mem_cgroup *memcg; 6444 6445 memcg = mem_cgroup_iter(target_memcg, NULL, NULL); 6446 do { 6447 struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat); 6448 unsigned long reclaimed; 6449 unsigned long scanned; 6450 6451 /* 6452 * This loop can become CPU-bound when target memcgs 6453 * aren't eligible for reclaim - either because they 6454 * don't have any reclaimable pages, or because their 6455 * memory is explicitly protected. Avoid soft lockups. 6456 */ 6457 cond_resched(); 6458 6459 mem_cgroup_calculate_protection(target_memcg, memcg); 6460 6461 if (mem_cgroup_below_min(target_memcg, memcg)) { 6462 /* 6463 * Hard protection. 6464 * If there is no reclaimable memory, OOM. 6465 */ 6466 continue; 6467 } else if (mem_cgroup_below_low(target_memcg, memcg)) { 6468 /* 6469 * Soft protection. 6470 * Respect the protection only as long as 6471 * there is an unprotected supply 6472 * of reclaimable memory from other cgroups. 6473 */ 6474 if (!sc->memcg_low_reclaim) { 6475 sc->memcg_low_skipped = 1; 6476 continue; 6477 } 6478 memcg_memory_event(memcg, MEMCG_LOW); 6479 } 6480 6481 reclaimed = sc->nr_reclaimed; 6482 scanned = sc->nr_scanned; 6483 6484 shrink_lruvec(lruvec, sc); 6485 6486 shrink_slab(sc->gfp_mask, pgdat->node_id, memcg, 6487 sc->priority); 6488 6489 /* Record the group's reclaim efficiency */ 6490 if (!sc->proactive) 6491 vmpressure(sc->gfp_mask, memcg, false, 6492 sc->nr_scanned - scanned, 6493 sc->nr_reclaimed - reclaimed); 6494 6495 } while ((memcg = mem_cgroup_iter(target_memcg, memcg, NULL))); 6496 } 6497 6498 static void shrink_node(pg_data_t *pgdat, struct scan_control *sc) 6499 { 6500 unsigned long nr_reclaimed, nr_scanned, nr_node_reclaimed; 6501 struct lruvec *target_lruvec; 6502 bool reclaimable = false; 6503 6504 if (lru_gen_enabled() && global_reclaim(sc)) { 6505 lru_gen_shrink_node(pgdat, sc); 6506 return; 6507 } 6508 6509 target_lruvec = mem_cgroup_lruvec(sc->target_mem_cgroup, pgdat); 6510 6511 again: 6512 memset(&sc->nr, 0, sizeof(sc->nr)); 6513 6514 nr_reclaimed = sc->nr_reclaimed; 6515 nr_scanned = sc->nr_scanned; 6516 6517 prepare_scan_count(pgdat, sc); 6518 6519 shrink_node_memcgs(pgdat, sc); 6520 6521 flush_reclaim_state(sc); 6522 6523 nr_node_reclaimed = sc->nr_reclaimed - nr_reclaimed; 6524 6525 /* Record the subtree's reclaim efficiency */ 6526 if (!sc->proactive) 6527 vmpressure(sc->gfp_mask, sc->target_mem_cgroup, true, 6528 sc->nr_scanned - nr_scanned, nr_node_reclaimed); 6529 6530 if (nr_node_reclaimed) 6531 reclaimable = true; 6532 6533 if (current_is_kswapd()) { 6534 /* 6535 * If reclaim is isolating dirty pages under writeback, 6536 * it implies that the long-lived page allocation rate 6537 * is exceeding the page laundering rate. Either the 6538 * global limits are not being effective at throttling 6539 * processes due to the page distribution throughout 6540 * zones or there is heavy usage of a slow backing 6541 * device. The only option is to throttle from reclaim 6542 * context which is not ideal as there is no guarantee 6543 * the dirtying process is throttled in the same way 6544 * balance_dirty_pages() manages. 6545 * 6546 * Once a node is flagged PGDAT_WRITEBACK, kswapd will 6547 * count the number of pages under pages flagged for 6548 * immediate reclaim and stall if any are encountered 6549 * in the nr_immediate check below. 6550 */ 6551 if (sc->nr.writeback && sc->nr.writeback == sc->nr.taken) 6552 set_bit(PGDAT_WRITEBACK, &pgdat->flags); 6553 6554 /* Allow kswapd to start writing pages during reclaim.*/ 6555 if (sc->nr.unqueued_dirty == sc->nr.file_taken) 6556 set_bit(PGDAT_DIRTY, &pgdat->flags); 6557 6558 /* 6559 * If kswapd scans pages marked for immediate 6560 * reclaim and under writeback (nr_immediate), it 6561 * implies that pages are cycling through the LRU 6562 * faster than they are written so forcibly stall 6563 * until some pages complete writeback. 6564 */ 6565 if (sc->nr.immediate) 6566 reclaim_throttle(pgdat, VMSCAN_THROTTLE_WRITEBACK); 6567 } 6568 6569 /* 6570 * Tag a node/memcg as congested if all the dirty pages were marked 6571 * for writeback and immediate reclaim (counted in nr.congested). 6572 * 6573 * Legacy memcg will stall in page writeback so avoid forcibly 6574 * stalling in reclaim_throttle(). 6575 */ 6576 if ((current_is_kswapd() || 6577 (cgroup_reclaim(sc) && writeback_throttling_sane(sc))) && 6578 sc->nr.dirty && sc->nr.dirty == sc->nr.congested) 6579 set_bit(LRUVEC_CONGESTED, &target_lruvec->flags); 6580 6581 /* 6582 * Stall direct reclaim for IO completions if the lruvec is 6583 * node is congested. Allow kswapd to continue until it 6584 * starts encountering unqueued dirty pages or cycling through 6585 * the LRU too quickly. 6586 */ 6587 if (!current_is_kswapd() && current_may_throttle() && 6588 !sc->hibernation_mode && 6589 test_bit(LRUVEC_CONGESTED, &target_lruvec->flags)) 6590 reclaim_throttle(pgdat, VMSCAN_THROTTLE_CONGESTED); 6591 6592 if (should_continue_reclaim(pgdat, nr_node_reclaimed, sc)) 6593 goto again; 6594 6595 /* 6596 * Kswapd gives up on balancing particular nodes after too 6597 * many failures to reclaim anything from them and goes to 6598 * sleep. On reclaim progress, reset the failure counter. A 6599 * successful direct reclaim run will revive a dormant kswapd. 6600 */ 6601 if (reclaimable) 6602 pgdat->kswapd_failures = 0; 6603 } 6604 6605 /* 6606 * Returns true if compaction should go ahead for a costly-order request, or 6607 * the allocation would already succeed without compaction. Return false if we 6608 * should reclaim first. 6609 */ 6610 static inline bool compaction_ready(struct zone *zone, struct scan_control *sc) 6611 { 6612 unsigned long watermark; 6613 6614 /* Allocation can already succeed, nothing to do */ 6615 if (zone_watermark_ok(zone, sc->order, min_wmark_pages(zone), 6616 sc->reclaim_idx, 0)) 6617 return true; 6618 6619 /* Compaction cannot yet proceed. Do reclaim. */ 6620 if (!compaction_suitable(zone, sc->order, sc->reclaim_idx)) 6621 return false; 6622 6623 /* 6624 * Compaction is already possible, but it takes time to run and there 6625 * are potentially other callers using the pages just freed. So proceed 6626 * with reclaim to make a buffer of free pages available to give 6627 * compaction a reasonable chance of completing and allocating the page. 6628 * Note that we won't actually reclaim the whole buffer in one attempt 6629 * as the target watermark in should_continue_reclaim() is lower. But if 6630 * we are already above the high+gap watermark, don't reclaim at all. 6631 */ 6632 watermark = high_wmark_pages(zone) + compact_gap(sc->order); 6633 6634 return zone_watermark_ok_safe(zone, 0, watermark, sc->reclaim_idx); 6635 } 6636 6637 static void consider_reclaim_throttle(pg_data_t *pgdat, struct scan_control *sc) 6638 { 6639 /* 6640 * If reclaim is making progress greater than 12% efficiency then 6641 * wake all the NOPROGRESS throttled tasks. 6642 */ 6643 if (sc->nr_reclaimed > (sc->nr_scanned >> 3)) { 6644 wait_queue_head_t *wqh; 6645 6646 wqh = &pgdat->reclaim_wait[VMSCAN_THROTTLE_NOPROGRESS]; 6647 if (waitqueue_active(wqh)) 6648 wake_up(wqh); 6649 6650 return; 6651 } 6652 6653 /* 6654 * Do not throttle kswapd or cgroup reclaim on NOPROGRESS as it will 6655 * throttle on VMSCAN_THROTTLE_WRITEBACK if there are too many pages 6656 * under writeback and marked for immediate reclaim at the tail of the 6657 * LRU. 6658 */ 6659 if (current_is_kswapd() || cgroup_reclaim(sc)) 6660 return; 6661 6662 /* Throttle if making no progress at high prioities. */ 6663 if (sc->priority == 1 && !sc->nr_reclaimed) 6664 reclaim_throttle(pgdat, VMSCAN_THROTTLE_NOPROGRESS); 6665 } 6666 6667 /* 6668 * This is the direct reclaim path, for page-allocating processes. We only 6669 * try to reclaim pages from zones which will satisfy the caller's allocation 6670 * request. 6671 * 6672 * If a zone is deemed to be full of pinned pages then just give it a light 6673 * scan then give up on it. 6674 */ 6675 static void shrink_zones(struct zonelist *zonelist, struct scan_control *sc) 6676 { 6677 struct zoneref *z; 6678 struct zone *zone; 6679 unsigned long nr_soft_reclaimed; 6680 unsigned long nr_soft_scanned; 6681 gfp_t orig_mask; 6682 pg_data_t *last_pgdat = NULL; 6683 pg_data_t *first_pgdat = NULL; 6684 6685 /* 6686 * If the number of buffer_heads in the machine exceeds the maximum 6687 * allowed level, force direct reclaim to scan the highmem zone as 6688 * highmem pages could be pinning lowmem pages storing buffer_heads 6689 */ 6690 orig_mask = sc->gfp_mask; 6691 if (buffer_heads_over_limit) { 6692 sc->gfp_mask |= __GFP_HIGHMEM; 6693 sc->reclaim_idx = gfp_zone(sc->gfp_mask); 6694 } 6695 6696 for_each_zone_zonelist_nodemask(zone, z, zonelist, 6697 sc->reclaim_idx, sc->nodemask) { 6698 /* 6699 * Take care memory controller reclaiming has small influence 6700 * to global LRU. 6701 */ 6702 if (!cgroup_reclaim(sc)) { 6703 if (!cpuset_zone_allowed(zone, 6704 GFP_KERNEL | __GFP_HARDWALL)) 6705 continue; 6706 6707 /* 6708 * If we already have plenty of memory free for 6709 * compaction in this zone, don't free any more. 6710 * Even though compaction is invoked for any 6711 * non-zero order, only frequent costly order 6712 * reclamation is disruptive enough to become a 6713 * noticeable problem, like transparent huge 6714 * page allocations. 6715 */ 6716 if (IS_ENABLED(CONFIG_COMPACTION) && 6717 sc->order > PAGE_ALLOC_COSTLY_ORDER && 6718 compaction_ready(zone, sc)) { 6719 sc->compaction_ready = true; 6720 continue; 6721 } 6722 6723 /* 6724 * Shrink each node in the zonelist once. If the 6725 * zonelist is ordered by zone (not the default) then a 6726 * node may be shrunk multiple times but in that case 6727 * the user prefers lower zones being preserved. 6728 */ 6729 if (zone->zone_pgdat == last_pgdat) 6730 continue; 6731 6732 /* 6733 * This steals pages from memory cgroups over softlimit 6734 * and returns the number of reclaimed pages and 6735 * scanned pages. This works for global memory pressure 6736 * and balancing, not for a memcg's limit. 6737 */ 6738 nr_soft_scanned = 0; 6739 nr_soft_reclaimed = mem_cgroup_soft_limit_reclaim(zone->zone_pgdat, 6740 sc->order, sc->gfp_mask, 6741 &nr_soft_scanned); 6742 sc->nr_reclaimed += nr_soft_reclaimed; 6743 sc->nr_scanned += nr_soft_scanned; 6744 /* need some check for avoid more shrink_zone() */ 6745 } 6746 6747 if (!first_pgdat) 6748 first_pgdat = zone->zone_pgdat; 6749 6750 /* See comment about same check for global reclaim above */ 6751 if (zone->zone_pgdat == last_pgdat) 6752 continue; 6753 last_pgdat = zone->zone_pgdat; 6754 shrink_node(zone->zone_pgdat, sc); 6755 } 6756 6757 if (first_pgdat) 6758 consider_reclaim_throttle(first_pgdat, sc); 6759 6760 /* 6761 * Restore to original mask to avoid the impact on the caller if we 6762 * promoted it to __GFP_HIGHMEM. 6763 */ 6764 sc->gfp_mask = orig_mask; 6765 } 6766 6767 static void snapshot_refaults(struct mem_cgroup *target_memcg, pg_data_t *pgdat) 6768 { 6769 struct lruvec *target_lruvec; 6770 unsigned long refaults; 6771 6772 if (lru_gen_enabled()) 6773 return; 6774 6775 target_lruvec = mem_cgroup_lruvec(target_memcg, pgdat); 6776 refaults = lruvec_page_state(target_lruvec, WORKINGSET_ACTIVATE_ANON); 6777 target_lruvec->refaults[WORKINGSET_ANON] = refaults; 6778 refaults = lruvec_page_state(target_lruvec, WORKINGSET_ACTIVATE_FILE); 6779 target_lruvec->refaults[WORKINGSET_FILE] = refaults; 6780 } 6781 6782 /* 6783 * This is the main entry point to direct page reclaim. 6784 * 6785 * If a full scan of the inactive list fails to free enough memory then we 6786 * are "out of memory" and something needs to be killed. 6787 * 6788 * If the caller is !__GFP_FS then the probability of a failure is reasonably 6789 * high - the zone may be full of dirty or under-writeback pages, which this 6790 * caller can't do much about. We kick the writeback threads and take explicit 6791 * naps in the hope that some of these pages can be written. But if the 6792 * allocating task holds filesystem locks which prevent writeout this might not 6793 * work, and the allocation attempt will fail. 6794 * 6795 * returns: 0, if no pages reclaimed 6796 * else, the number of pages reclaimed 6797 */ 6798 static unsigned long do_try_to_free_pages(struct zonelist *zonelist, 6799 struct scan_control *sc) 6800 { 6801 int initial_priority = sc->priority; 6802 pg_data_t *last_pgdat; 6803 struct zoneref *z; 6804 struct zone *zone; 6805 retry: 6806 delayacct_freepages_start(); 6807 6808 if (!cgroup_reclaim(sc)) 6809 __count_zid_vm_events(ALLOCSTALL, sc->reclaim_idx, 1); 6810 6811 do { 6812 if (!sc->proactive) 6813 vmpressure_prio(sc->gfp_mask, sc->target_mem_cgroup, 6814 sc->priority); 6815 sc->nr_scanned = 0; 6816 shrink_zones(zonelist, sc); 6817 6818 if (sc->nr_reclaimed >= sc->nr_to_reclaim) 6819 break; 6820 6821 if (sc->compaction_ready) 6822 break; 6823 6824 /* 6825 * If we're getting trouble reclaiming, start doing 6826 * writepage even in laptop mode. 6827 */ 6828 if (sc->priority < DEF_PRIORITY - 2) 6829 sc->may_writepage = 1; 6830 } while (--sc->priority >= 0); 6831 6832 last_pgdat = NULL; 6833 for_each_zone_zonelist_nodemask(zone, z, zonelist, sc->reclaim_idx, 6834 sc->nodemask) { 6835 if (zone->zone_pgdat == last_pgdat) 6836 continue; 6837 last_pgdat = zone->zone_pgdat; 6838 6839 snapshot_refaults(sc->target_mem_cgroup, zone->zone_pgdat); 6840 6841 if (cgroup_reclaim(sc)) { 6842 struct lruvec *lruvec; 6843 6844 lruvec = mem_cgroup_lruvec(sc->target_mem_cgroup, 6845 zone->zone_pgdat); 6846 clear_bit(LRUVEC_CONGESTED, &lruvec->flags); 6847 } 6848 } 6849 6850 delayacct_freepages_end(); 6851 6852 if (sc->nr_reclaimed) 6853 return sc->nr_reclaimed; 6854 6855 /* Aborted reclaim to try compaction? don't OOM, then */ 6856 if (sc->compaction_ready) 6857 return 1; 6858 6859 /* 6860 * We make inactive:active ratio decisions based on the node's 6861 * composition of memory, but a restrictive reclaim_idx or a 6862 * memory.low cgroup setting can exempt large amounts of 6863 * memory from reclaim. Neither of which are very common, so 6864 * instead of doing costly eligibility calculations of the 6865 * entire cgroup subtree up front, we assume the estimates are 6866 * good, and retry with forcible deactivation if that fails. 6867 */ 6868 if (sc->skipped_deactivate) { 6869 sc->priority = initial_priority; 6870 sc->force_deactivate = 1; 6871 sc->skipped_deactivate = 0; 6872 goto retry; 6873 } 6874 6875 /* Untapped cgroup reserves? Don't OOM, retry. */ 6876 if (sc->memcg_low_skipped) { 6877 sc->priority = initial_priority; 6878 sc->force_deactivate = 0; 6879 sc->memcg_low_reclaim = 1; 6880 sc->memcg_low_skipped = 0; 6881 goto retry; 6882 } 6883 6884 return 0; 6885 } 6886 6887 static bool allow_direct_reclaim(pg_data_t *pgdat) 6888 { 6889 struct zone *zone; 6890 unsigned long pfmemalloc_reserve = 0; 6891 unsigned long free_pages = 0; 6892 int i; 6893 bool wmark_ok; 6894 6895 if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES) 6896 return true; 6897 6898 for (i = 0; i <= ZONE_NORMAL; i++) { 6899 zone = &pgdat->node_zones[i]; 6900 if (!managed_zone(zone)) 6901 continue; 6902 6903 if (!zone_reclaimable_pages(zone)) 6904 continue; 6905 6906 pfmemalloc_reserve += min_wmark_pages(zone); 6907 free_pages += zone_page_state_snapshot(zone, NR_FREE_PAGES); 6908 } 6909 6910 /* If there are no reserves (unexpected config) then do not throttle */ 6911 if (!pfmemalloc_reserve) 6912 return true; 6913 6914 wmark_ok = free_pages > pfmemalloc_reserve / 2; 6915 6916 /* kswapd must be awake if processes are being throttled */ 6917 if (!wmark_ok && waitqueue_active(&pgdat->kswapd_wait)) { 6918 if (READ_ONCE(pgdat->kswapd_highest_zoneidx) > ZONE_NORMAL) 6919 WRITE_ONCE(pgdat->kswapd_highest_zoneidx, ZONE_NORMAL); 6920 6921 wake_up_interruptible(&pgdat->kswapd_wait); 6922 } 6923 6924 return wmark_ok; 6925 } 6926 6927 /* 6928 * Throttle direct reclaimers if backing storage is backed by the network 6929 * and the PFMEMALLOC reserve for the preferred node is getting dangerously 6930 * depleted. kswapd will continue to make progress and wake the processes 6931 * when the low watermark is reached. 6932 * 6933 * Returns true if a fatal signal was delivered during throttling. If this 6934 * happens, the page allocator should not consider triggering the OOM killer. 6935 */ 6936 static bool throttle_direct_reclaim(gfp_t gfp_mask, struct zonelist *zonelist, 6937 nodemask_t *nodemask) 6938 { 6939 struct zoneref *z; 6940 struct zone *zone; 6941 pg_data_t *pgdat = NULL; 6942 6943 /* 6944 * Kernel threads should not be throttled as they may be indirectly 6945 * responsible for cleaning pages necessary for reclaim to make forward 6946 * progress. kjournald for example may enter direct reclaim while 6947 * committing a transaction where throttling it could forcing other 6948 * processes to block on log_wait_commit(). 6949 */ 6950 if (current->flags & PF_KTHREAD) 6951 goto out; 6952 6953 /* 6954 * If a fatal signal is pending, this process should not throttle. 6955 * It should return quickly so it can exit and free its memory 6956 */ 6957 if (fatal_signal_pending(current)) 6958 goto out; 6959 6960 /* 6961 * Check if the pfmemalloc reserves are ok by finding the first node 6962 * with a usable ZONE_NORMAL or lower zone. The expectation is that 6963 * GFP_KERNEL will be required for allocating network buffers when 6964 * swapping over the network so ZONE_HIGHMEM is unusable. 6965 * 6966 * Throttling is based on the first usable node and throttled processes 6967 * wait on a queue until kswapd makes progress and wakes them. There 6968 * is an affinity then between processes waking up and where reclaim 6969 * progress has been made assuming the process wakes on the same node. 6970 * More importantly, processes running on remote nodes will not compete 6971 * for remote pfmemalloc reserves and processes on different nodes 6972 * should make reasonable progress. 6973 */ 6974 for_each_zone_zonelist_nodemask(zone, z, zonelist, 6975 gfp_zone(gfp_mask), nodemask) { 6976 if (zone_idx(zone) > ZONE_NORMAL) 6977 continue; 6978 6979 /* Throttle based on the first usable node */ 6980 pgdat = zone->zone_pgdat; 6981 if (allow_direct_reclaim(pgdat)) 6982 goto out; 6983 break; 6984 } 6985 6986 /* If no zone was usable by the allocation flags then do not throttle */ 6987 if (!pgdat) 6988 goto out; 6989 6990 /* Account for the throttling */ 6991 count_vm_event(PGSCAN_DIRECT_THROTTLE); 6992 6993 /* 6994 * If the caller cannot enter the filesystem, it's possible that it 6995 * is due to the caller holding an FS lock or performing a journal 6996 * transaction in the case of a filesystem like ext[3|4]. In this case, 6997 * it is not safe to block on pfmemalloc_wait as kswapd could be 6998 * blocked waiting on the same lock. Instead, throttle for up to a 6999 * second before continuing. 7000 */ 7001 if (!(gfp_mask & __GFP_FS)) 7002 wait_event_interruptible_timeout(pgdat->pfmemalloc_wait, 7003 allow_direct_reclaim(pgdat), HZ); 7004 else 7005 /* Throttle until kswapd wakes the process */ 7006 wait_event_killable(zone->zone_pgdat->pfmemalloc_wait, 7007 allow_direct_reclaim(pgdat)); 7008 7009 if (fatal_signal_pending(current)) 7010 return true; 7011 7012 out: 7013 return false; 7014 } 7015 7016 unsigned long try_to_free_pages(struct zonelist *zonelist, int order, 7017 gfp_t gfp_mask, nodemask_t *nodemask) 7018 { 7019 unsigned long nr_reclaimed; 7020 struct scan_control sc = { 7021 .nr_to_reclaim = SWAP_CLUSTER_MAX, 7022 .gfp_mask = current_gfp_context(gfp_mask), 7023 .reclaim_idx = gfp_zone(gfp_mask), 7024 .order = order, 7025 .nodemask = nodemask, 7026 .priority = DEF_PRIORITY, 7027 .may_writepage = !laptop_mode, 7028 .may_unmap = 1, 7029 .may_swap = 1, 7030 }; 7031 7032 /* 7033 * scan_control uses s8 fields for order, priority, and reclaim_idx. 7034 * Confirm they are large enough for max values. 7035 */ 7036 BUILD_BUG_ON(MAX_ORDER >= S8_MAX); 7037 BUILD_BUG_ON(DEF_PRIORITY > S8_MAX); 7038 BUILD_BUG_ON(MAX_NR_ZONES > S8_MAX); 7039 7040 /* 7041 * Do not enter reclaim if fatal signal was delivered while throttled. 7042 * 1 is returned so that the page allocator does not OOM kill at this 7043 * point. 7044 */ 7045 if (throttle_direct_reclaim(sc.gfp_mask, zonelist, nodemask)) 7046 return 1; 7047 7048 set_task_reclaim_state(current, &sc.reclaim_state); 7049 trace_mm_vmscan_direct_reclaim_begin(order, sc.gfp_mask); 7050 7051 nr_reclaimed = do_try_to_free_pages(zonelist, &sc); 7052 7053 trace_mm_vmscan_direct_reclaim_end(nr_reclaimed); 7054 set_task_reclaim_state(current, NULL); 7055 7056 return nr_reclaimed; 7057 } 7058 7059 #ifdef CONFIG_MEMCG 7060 7061 /* Only used by soft limit reclaim. Do not reuse for anything else. */ 7062 unsigned long mem_cgroup_shrink_node(struct mem_cgroup *memcg, 7063 gfp_t gfp_mask, bool noswap, 7064 pg_data_t *pgdat, 7065 unsigned long *nr_scanned) 7066 { 7067 struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat); 7068 struct scan_control sc = { 7069 .nr_to_reclaim = SWAP_CLUSTER_MAX, 7070 .target_mem_cgroup = memcg, 7071 .may_writepage = !laptop_mode, 7072 .may_unmap = 1, 7073 .reclaim_idx = MAX_NR_ZONES - 1, 7074 .may_swap = !noswap, 7075 }; 7076 7077 WARN_ON_ONCE(!current->reclaim_state); 7078 7079 sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) | 7080 (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK); 7081 7082 trace_mm_vmscan_memcg_softlimit_reclaim_begin(sc.order, 7083 sc.gfp_mask); 7084 7085 /* 7086 * NOTE: Although we can get the priority field, using it 7087 * here is not a good idea, since it limits the pages we can scan. 7088 * if we don't reclaim here, the shrink_node from balance_pgdat 7089 * will pick up pages from other mem cgroup's as well. We hack 7090 * the priority and make it zero. 7091 */ 7092 shrink_lruvec(lruvec, &sc); 7093 7094 trace_mm_vmscan_memcg_softlimit_reclaim_end(sc.nr_reclaimed); 7095 7096 *nr_scanned = sc.nr_scanned; 7097 7098 return sc.nr_reclaimed; 7099 } 7100 7101 unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg, 7102 unsigned long nr_pages, 7103 gfp_t gfp_mask, 7104 unsigned int reclaim_options) 7105 { 7106 unsigned long nr_reclaimed; 7107 unsigned int noreclaim_flag; 7108 struct scan_control sc = { 7109 .nr_to_reclaim = max(nr_pages, SWAP_CLUSTER_MAX), 7110 .gfp_mask = (current_gfp_context(gfp_mask) & GFP_RECLAIM_MASK) | 7111 (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK), 7112 .reclaim_idx = MAX_NR_ZONES - 1, 7113 .target_mem_cgroup = memcg, 7114 .priority = DEF_PRIORITY, 7115 .may_writepage = !laptop_mode, 7116 .may_unmap = 1, 7117 .may_swap = !!(reclaim_options & MEMCG_RECLAIM_MAY_SWAP), 7118 .proactive = !!(reclaim_options & MEMCG_RECLAIM_PROACTIVE), 7119 }; 7120 /* 7121 * Traverse the ZONELIST_FALLBACK zonelist of the current node to put 7122 * equal pressure on all the nodes. This is based on the assumption that 7123 * the reclaim does not bail out early. 7124 */ 7125 struct zonelist *zonelist = node_zonelist(numa_node_id(), sc.gfp_mask); 7126 7127 set_task_reclaim_state(current, &sc.reclaim_state); 7128 trace_mm_vmscan_memcg_reclaim_begin(0, sc.gfp_mask); 7129 noreclaim_flag = memalloc_noreclaim_save(); 7130 7131 nr_reclaimed = do_try_to_free_pages(zonelist, &sc); 7132 7133 memalloc_noreclaim_restore(noreclaim_flag); 7134 trace_mm_vmscan_memcg_reclaim_end(nr_reclaimed); 7135 set_task_reclaim_state(current, NULL); 7136 7137 return nr_reclaimed; 7138 } 7139 #endif 7140 7141 static void kswapd_age_node(struct pglist_data *pgdat, struct scan_control *sc) 7142 { 7143 struct mem_cgroup *memcg; 7144 struct lruvec *lruvec; 7145 7146 if (lru_gen_enabled()) { 7147 lru_gen_age_node(pgdat, sc); 7148 return; 7149 } 7150 7151 if (!can_age_anon_pages(pgdat, sc)) 7152 return; 7153 7154 lruvec = mem_cgroup_lruvec(NULL, pgdat); 7155 if (!inactive_is_low(lruvec, LRU_INACTIVE_ANON)) 7156 return; 7157 7158 memcg = mem_cgroup_iter(NULL, NULL, NULL); 7159 do { 7160 lruvec = mem_cgroup_lruvec(memcg, pgdat); 7161 shrink_active_list(SWAP_CLUSTER_MAX, lruvec, 7162 sc, LRU_ACTIVE_ANON); 7163 memcg = mem_cgroup_iter(NULL, memcg, NULL); 7164 } while (memcg); 7165 } 7166 7167 static bool pgdat_watermark_boosted(pg_data_t *pgdat, int highest_zoneidx) 7168 { 7169 int i; 7170 struct zone *zone; 7171 7172 /* 7173 * Check for watermark boosts top-down as the higher zones 7174 * are more likely to be boosted. Both watermarks and boosts 7175 * should not be checked at the same time as reclaim would 7176 * start prematurely when there is no boosting and a lower 7177 * zone is balanced. 7178 */ 7179 for (i = highest_zoneidx; i >= 0; i--) { 7180 zone = pgdat->node_zones + i; 7181 if (!managed_zone(zone)) 7182 continue; 7183 7184 if (zone->watermark_boost) 7185 return true; 7186 } 7187 7188 return false; 7189 } 7190 7191 /* 7192 * Returns true if there is an eligible zone balanced for the request order 7193 * and highest_zoneidx 7194 */ 7195 static bool pgdat_balanced(pg_data_t *pgdat, int order, int highest_zoneidx) 7196 { 7197 int i; 7198 unsigned long mark = -1; 7199 struct zone *zone; 7200 7201 /* 7202 * Check watermarks bottom-up as lower zones are more likely to 7203 * meet watermarks. 7204 */ 7205 for (i = 0; i <= highest_zoneidx; i++) { 7206 zone = pgdat->node_zones + i; 7207 7208 if (!managed_zone(zone)) 7209 continue; 7210 7211 if (sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING) 7212 mark = wmark_pages(zone, WMARK_PROMO); 7213 else 7214 mark = high_wmark_pages(zone); 7215 if (zone_watermark_ok_safe(zone, order, mark, highest_zoneidx)) 7216 return true; 7217 } 7218 7219 /* 7220 * If a node has no managed zone within highest_zoneidx, it does not 7221 * need balancing by definition. This can happen if a zone-restricted 7222 * allocation tries to wake a remote kswapd. 7223 */ 7224 if (mark == -1) 7225 return true; 7226 7227 return false; 7228 } 7229 7230 /* Clear pgdat state for congested, dirty or under writeback. */ 7231 static void clear_pgdat_congested(pg_data_t *pgdat) 7232 { 7233 struct lruvec *lruvec = mem_cgroup_lruvec(NULL, pgdat); 7234 7235 clear_bit(LRUVEC_CONGESTED, &lruvec->flags); 7236 clear_bit(PGDAT_DIRTY, &pgdat->flags); 7237 clear_bit(PGDAT_WRITEBACK, &pgdat->flags); 7238 } 7239 7240 /* 7241 * Prepare kswapd for sleeping. This verifies that there are no processes 7242 * waiting in throttle_direct_reclaim() and that watermarks have been met. 7243 * 7244 * Returns true if kswapd is ready to sleep 7245 */ 7246 static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, 7247 int highest_zoneidx) 7248 { 7249 /* 7250 * The throttled processes are normally woken up in balance_pgdat() as 7251 * soon as allow_direct_reclaim() is true. But there is a potential 7252 * race between when kswapd checks the watermarks and a process gets 7253 * throttled. There is also a potential race if processes get 7254 * throttled, kswapd wakes, a large process exits thereby balancing the 7255 * zones, which causes kswapd to exit balance_pgdat() before reaching 7256 * the wake up checks. If kswapd is going to sleep, no process should 7257 * be sleeping on pfmemalloc_wait, so wake them now if necessary. If 7258 * the wake up is premature, processes will wake kswapd and get 7259 * throttled again. The difference from wake ups in balance_pgdat() is 7260 * that here we are under prepare_to_wait(). 7261 */ 7262 if (waitqueue_active(&pgdat->pfmemalloc_wait)) 7263 wake_up_all(&pgdat->pfmemalloc_wait); 7264 7265 /* Hopeless node, leave it to direct reclaim */ 7266 if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES) 7267 return true; 7268 7269 if (pgdat_balanced(pgdat, order, highest_zoneidx)) { 7270 clear_pgdat_congested(pgdat); 7271 return true; 7272 } 7273 7274 return false; 7275 } 7276 7277 /* 7278 * kswapd shrinks a node of pages that are at or below the highest usable 7279 * zone that is currently unbalanced. 7280 * 7281 * Returns true if kswapd scanned at least the requested number of pages to 7282 * reclaim or if the lack of progress was due to pages under writeback. 7283 * This is used to determine if the scanning priority needs to be raised. 7284 */ 7285 static bool kswapd_shrink_node(pg_data_t *pgdat, 7286 struct scan_control *sc) 7287 { 7288 struct zone *zone; 7289 int z; 7290 7291 /* Reclaim a number of pages proportional to the number of zones */ 7292 sc->nr_to_reclaim = 0; 7293 for (z = 0; z <= sc->reclaim_idx; z++) { 7294 zone = pgdat->node_zones + z; 7295 if (!managed_zone(zone)) 7296 continue; 7297 7298 sc->nr_to_reclaim += max(high_wmark_pages(zone), SWAP_CLUSTER_MAX); 7299 } 7300 7301 /* 7302 * Historically care was taken to put equal pressure on all zones but 7303 * now pressure is applied based on node LRU order. 7304 */ 7305 shrink_node(pgdat, sc); 7306 7307 /* 7308 * Fragmentation may mean that the system cannot be rebalanced for 7309 * high-order allocations. If twice the allocation size has been 7310 * reclaimed then recheck watermarks only at order-0 to prevent 7311 * excessive reclaim. Assume that a process requested a high-order 7312 * can direct reclaim/compact. 7313 */ 7314 if (sc->order && sc->nr_reclaimed >= compact_gap(sc->order)) 7315 sc->order = 0; 7316 7317 return sc->nr_scanned >= sc->nr_to_reclaim; 7318 } 7319 7320 /* Page allocator PCP high watermark is lowered if reclaim is active. */ 7321 static inline void 7322 update_reclaim_active(pg_data_t *pgdat, int highest_zoneidx, bool active) 7323 { 7324 int i; 7325 struct zone *zone; 7326 7327 for (i = 0; i <= highest_zoneidx; i++) { 7328 zone = pgdat->node_zones + i; 7329 7330 if (!managed_zone(zone)) 7331 continue; 7332 7333 if (active) 7334 set_bit(ZONE_RECLAIM_ACTIVE, &zone->flags); 7335 else 7336 clear_bit(ZONE_RECLAIM_ACTIVE, &zone->flags); 7337 } 7338 } 7339 7340 static inline void 7341 set_reclaim_active(pg_data_t *pgdat, int highest_zoneidx) 7342 { 7343 update_reclaim_active(pgdat, highest_zoneidx, true); 7344 } 7345 7346 static inline void 7347 clear_reclaim_active(pg_data_t *pgdat, int highest_zoneidx) 7348 { 7349 update_reclaim_active(pgdat, highest_zoneidx, false); 7350 } 7351 7352 /* 7353 * For kswapd, balance_pgdat() will reclaim pages across a node from zones 7354 * that are eligible for use by the caller until at least one zone is 7355 * balanced. 7356 * 7357 * Returns the order kswapd finished reclaiming at. 7358 * 7359 * kswapd scans the zones in the highmem->normal->dma direction. It skips 7360 * zones which have free_pages > high_wmark_pages(zone), but once a zone is 7361 * found to have free_pages <= high_wmark_pages(zone), any page in that zone 7362 * or lower is eligible for reclaim until at least one usable zone is 7363 * balanced. 7364 */ 7365 static int balance_pgdat(pg_data_t *pgdat, int order, int highest_zoneidx) 7366 { 7367 int i; 7368 unsigned long nr_soft_reclaimed; 7369 unsigned long nr_soft_scanned; 7370 unsigned long pflags; 7371 unsigned long nr_boost_reclaim; 7372 unsigned long zone_boosts[MAX_NR_ZONES] = { 0, }; 7373 bool boosted; 7374 struct zone *zone; 7375 struct scan_control sc = { 7376 .gfp_mask = GFP_KERNEL, 7377 .order = order, 7378 .may_unmap = 1, 7379 }; 7380 7381 set_task_reclaim_state(current, &sc.reclaim_state); 7382 psi_memstall_enter(&pflags); 7383 __fs_reclaim_acquire(_THIS_IP_); 7384 7385 count_vm_event(PAGEOUTRUN); 7386 7387 /* 7388 * Account for the reclaim boost. Note that the zone boost is left in 7389 * place so that parallel allocations that are near the watermark will 7390 * stall or direct reclaim until kswapd is finished. 7391 */ 7392 nr_boost_reclaim = 0; 7393 for (i = 0; i <= highest_zoneidx; i++) { 7394 zone = pgdat->node_zones + i; 7395 if (!managed_zone(zone)) 7396 continue; 7397 7398 nr_boost_reclaim += zone->watermark_boost; 7399 zone_boosts[i] = zone->watermark_boost; 7400 } 7401 boosted = nr_boost_reclaim; 7402 7403 restart: 7404 set_reclaim_active(pgdat, highest_zoneidx); 7405 sc.priority = DEF_PRIORITY; 7406 do { 7407 unsigned long nr_reclaimed = sc.nr_reclaimed; 7408 bool raise_priority = true; 7409 bool balanced; 7410 bool ret; 7411 7412 sc.reclaim_idx = highest_zoneidx; 7413 7414 /* 7415 * If the number of buffer_heads exceeds the maximum allowed 7416 * then consider reclaiming from all zones. This has a dual 7417 * purpose -- on 64-bit systems it is expected that 7418 * buffer_heads are stripped during active rotation. On 32-bit 7419 * systems, highmem pages can pin lowmem memory and shrinking 7420 * buffers can relieve lowmem pressure. Reclaim may still not 7421 * go ahead if all eligible zones for the original allocation 7422 * request are balanced to avoid excessive reclaim from kswapd. 7423 */ 7424 if (buffer_heads_over_limit) { 7425 for (i = MAX_NR_ZONES - 1; i >= 0; i--) { 7426 zone = pgdat->node_zones + i; 7427 if (!managed_zone(zone)) 7428 continue; 7429 7430 sc.reclaim_idx = i; 7431 break; 7432 } 7433 } 7434 7435 /* 7436 * If the pgdat is imbalanced then ignore boosting and preserve 7437 * the watermarks for a later time and restart. Note that the 7438 * zone watermarks will be still reset at the end of balancing 7439 * on the grounds that the normal reclaim should be enough to 7440 * re-evaluate if boosting is required when kswapd next wakes. 7441 */ 7442 balanced = pgdat_balanced(pgdat, sc.order, highest_zoneidx); 7443 if (!balanced && nr_boost_reclaim) { 7444 nr_boost_reclaim = 0; 7445 goto restart; 7446 } 7447 7448 /* 7449 * If boosting is not active then only reclaim if there are no 7450 * eligible zones. Note that sc.reclaim_idx is not used as 7451 * buffer_heads_over_limit may have adjusted it. 7452 */ 7453 if (!nr_boost_reclaim && balanced) 7454 goto out; 7455 7456 /* Limit the priority of boosting to avoid reclaim writeback */ 7457 if (nr_boost_reclaim && sc.priority == DEF_PRIORITY - 2) 7458 raise_priority = false; 7459 7460 /* 7461 * Do not writeback or swap pages for boosted reclaim. The 7462 * intent is to relieve pressure not issue sub-optimal IO 7463 * from reclaim context. If no pages are reclaimed, the 7464 * reclaim will be aborted. 7465 */ 7466 sc.may_writepage = !laptop_mode && !nr_boost_reclaim; 7467 sc.may_swap = !nr_boost_reclaim; 7468 7469 /* 7470 * Do some background aging, to give pages a chance to be 7471 * referenced before reclaiming. All pages are rotated 7472 * regardless of classzone as this is about consistent aging. 7473 */ 7474 kswapd_age_node(pgdat, &sc); 7475 7476 /* 7477 * If we're getting trouble reclaiming, start doing writepage 7478 * even in laptop mode. 7479 */ 7480 if (sc.priority < DEF_PRIORITY - 2) 7481 sc.may_writepage = 1; 7482 7483 /* Call soft limit reclaim before calling shrink_node. */ 7484 sc.nr_scanned = 0; 7485 nr_soft_scanned = 0; 7486 nr_soft_reclaimed = mem_cgroup_soft_limit_reclaim(pgdat, sc.order, 7487 sc.gfp_mask, &nr_soft_scanned); 7488 sc.nr_reclaimed += nr_soft_reclaimed; 7489 7490 /* 7491 * There should be no need to raise the scanning priority if 7492 * enough pages are already being scanned that that high 7493 * watermark would be met at 100% efficiency. 7494 */ 7495 if (kswapd_shrink_node(pgdat, &sc)) 7496 raise_priority = false; 7497 7498 /* 7499 * If the low watermark is met there is no need for processes 7500 * to be throttled on pfmemalloc_wait as they should not be 7501 * able to safely make forward progress. Wake them 7502 */ 7503 if (waitqueue_active(&pgdat->pfmemalloc_wait) && 7504 allow_direct_reclaim(pgdat)) 7505 wake_up_all(&pgdat->pfmemalloc_wait); 7506 7507 /* Check if kswapd should be suspending */ 7508 __fs_reclaim_release(_THIS_IP_); 7509 ret = try_to_freeze(); 7510 __fs_reclaim_acquire(_THIS_IP_); 7511 if (ret || kthread_should_stop()) 7512 break; 7513 7514 /* 7515 * Raise priority if scanning rate is too low or there was no 7516 * progress in reclaiming pages 7517 */ 7518 nr_reclaimed = sc.nr_reclaimed - nr_reclaimed; 7519 nr_boost_reclaim -= min(nr_boost_reclaim, nr_reclaimed); 7520 7521 /* 7522 * If reclaim made no progress for a boost, stop reclaim as 7523 * IO cannot be queued and it could be an infinite loop in 7524 * extreme circumstances. 7525 */ 7526 if (nr_boost_reclaim && !nr_reclaimed) 7527 break; 7528 7529 if (raise_priority || !nr_reclaimed) 7530 sc.priority--; 7531 } while (sc.priority >= 1); 7532 7533 if (!sc.nr_reclaimed) 7534 pgdat->kswapd_failures++; 7535 7536 out: 7537 clear_reclaim_active(pgdat, highest_zoneidx); 7538 7539 /* If reclaim was boosted, account for the reclaim done in this pass */ 7540 if (boosted) { 7541 unsigned long flags; 7542 7543 for (i = 0; i <= highest_zoneidx; i++) { 7544 if (!zone_boosts[i]) 7545 continue; 7546 7547 /* Increments are under the zone lock */ 7548 zone = pgdat->node_zones + i; 7549 spin_lock_irqsave(&zone->lock, flags); 7550 zone->watermark_boost -= min(zone->watermark_boost, zone_boosts[i]); 7551 spin_unlock_irqrestore(&zone->lock, flags); 7552 } 7553 7554 /* 7555 * As there is now likely space, wakeup kcompact to defragment 7556 * pageblocks. 7557 */ 7558 wakeup_kcompactd(pgdat, pageblock_order, highest_zoneidx); 7559 } 7560 7561 snapshot_refaults(NULL, pgdat); 7562 __fs_reclaim_release(_THIS_IP_); 7563 psi_memstall_leave(&pflags); 7564 set_task_reclaim_state(current, NULL); 7565 7566 /* 7567 * Return the order kswapd stopped reclaiming at as 7568 * prepare_kswapd_sleep() takes it into account. If another caller 7569 * entered the allocator slow path while kswapd was awake, order will 7570 * remain at the higher level. 7571 */ 7572 return sc.order; 7573 } 7574 7575 /* 7576 * The pgdat->kswapd_highest_zoneidx is used to pass the highest zone index to 7577 * be reclaimed by kswapd from the waker. If the value is MAX_NR_ZONES which is 7578 * not a valid index then either kswapd runs for first time or kswapd couldn't 7579 * sleep after previous reclaim attempt (node is still unbalanced). In that 7580 * case return the zone index of the previous kswapd reclaim cycle. 7581 */ 7582 static enum zone_type kswapd_highest_zoneidx(pg_data_t *pgdat, 7583 enum zone_type prev_highest_zoneidx) 7584 { 7585 enum zone_type curr_idx = READ_ONCE(pgdat->kswapd_highest_zoneidx); 7586 7587 return curr_idx == MAX_NR_ZONES ? prev_highest_zoneidx : curr_idx; 7588 } 7589 7590 static void kswapd_try_to_sleep(pg_data_t *pgdat, int alloc_order, int reclaim_order, 7591 unsigned int highest_zoneidx) 7592 { 7593 long remaining = 0; 7594 DEFINE_WAIT(wait); 7595 7596 if (freezing(current) || kthread_should_stop()) 7597 return; 7598 7599 prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE); 7600 7601 /* 7602 * Try to sleep for a short interval. Note that kcompactd will only be 7603 * woken if it is possible to sleep for a short interval. This is 7604 * deliberate on the assumption that if reclaim cannot keep an 7605 * eligible zone balanced that it's also unlikely that compaction will 7606 * succeed. 7607 */ 7608 if (prepare_kswapd_sleep(pgdat, reclaim_order, highest_zoneidx)) { 7609 /* 7610 * Compaction records what page blocks it recently failed to 7611 * isolate pages from and skips them in the future scanning. 7612 * When kswapd is going to sleep, it is reasonable to assume 7613 * that pages and compaction may succeed so reset the cache. 7614 */ 7615 reset_isolation_suitable(pgdat); 7616 7617 /* 7618 * We have freed the memory, now we should compact it to make 7619 * allocation of the requested order possible. 7620 */ 7621 wakeup_kcompactd(pgdat, alloc_order, highest_zoneidx); 7622 7623 remaining = schedule_timeout(HZ/10); 7624 7625 /* 7626 * If woken prematurely then reset kswapd_highest_zoneidx and 7627 * order. The values will either be from a wakeup request or 7628 * the previous request that slept prematurely. 7629 */ 7630 if (remaining) { 7631 WRITE_ONCE(pgdat->kswapd_highest_zoneidx, 7632 kswapd_highest_zoneidx(pgdat, 7633 highest_zoneidx)); 7634 7635 if (READ_ONCE(pgdat->kswapd_order) < reclaim_order) 7636 WRITE_ONCE(pgdat->kswapd_order, reclaim_order); 7637 } 7638 7639 finish_wait(&pgdat->kswapd_wait, &wait); 7640 prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE); 7641 } 7642 7643 /* 7644 * After a short sleep, check if it was a premature sleep. If not, then 7645 * go fully to sleep until explicitly woken up. 7646 */ 7647 if (!remaining && 7648 prepare_kswapd_sleep(pgdat, reclaim_order, highest_zoneidx)) { 7649 trace_mm_vmscan_kswapd_sleep(pgdat->node_id); 7650 7651 /* 7652 * vmstat counters are not perfectly accurate and the estimated 7653 * value for counters such as NR_FREE_PAGES can deviate from the 7654 * true value by nr_online_cpus * threshold. To avoid the zone 7655 * watermarks being breached while under pressure, we reduce the 7656 * per-cpu vmstat threshold while kswapd is awake and restore 7657 * them before going back to sleep. 7658 */ 7659 set_pgdat_percpu_threshold(pgdat, calculate_normal_threshold); 7660 7661 if (!kthread_should_stop()) 7662 schedule(); 7663 7664 set_pgdat_percpu_threshold(pgdat, calculate_pressure_threshold); 7665 } else { 7666 if (remaining) 7667 count_vm_event(KSWAPD_LOW_WMARK_HIT_QUICKLY); 7668 else 7669 count_vm_event(KSWAPD_HIGH_WMARK_HIT_QUICKLY); 7670 } 7671 finish_wait(&pgdat->kswapd_wait, &wait); 7672 } 7673 7674 /* 7675 * The background pageout daemon, started as a kernel thread 7676 * from the init process. 7677 * 7678 * This basically trickles out pages so that we have _some_ 7679 * free memory available even if there is no other activity 7680 * that frees anything up. This is needed for things like routing 7681 * etc, where we otherwise might have all activity going on in 7682 * asynchronous contexts that cannot page things out. 7683 * 7684 * If there are applications that are active memory-allocators 7685 * (most normal use), this basically shouldn't matter. 7686 */ 7687 static int kswapd(void *p) 7688 { 7689 unsigned int alloc_order, reclaim_order; 7690 unsigned int highest_zoneidx = MAX_NR_ZONES - 1; 7691 pg_data_t *pgdat = (pg_data_t *)p; 7692 struct task_struct *tsk = current; 7693 const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id); 7694 7695 if (!cpumask_empty(cpumask)) 7696 set_cpus_allowed_ptr(tsk, cpumask); 7697 7698 /* 7699 * Tell the memory management that we're a "memory allocator", 7700 * and that if we need more memory we should get access to it 7701 * regardless (see "__alloc_pages()"). "kswapd" should 7702 * never get caught in the normal page freeing logic. 7703 * 7704 * (Kswapd normally doesn't need memory anyway, but sometimes 7705 * you need a small amount of memory in order to be able to 7706 * page out something else, and this flag essentially protects 7707 * us from recursively trying to free more memory as we're 7708 * trying to free the first piece of memory in the first place). 7709 */ 7710 tsk->flags |= PF_MEMALLOC | PF_KSWAPD; 7711 set_freezable(); 7712 7713 WRITE_ONCE(pgdat->kswapd_order, 0); 7714 WRITE_ONCE(pgdat->kswapd_highest_zoneidx, MAX_NR_ZONES); 7715 atomic_set(&pgdat->nr_writeback_throttled, 0); 7716 for ( ; ; ) { 7717 bool ret; 7718 7719 alloc_order = reclaim_order = READ_ONCE(pgdat->kswapd_order); 7720 highest_zoneidx = kswapd_highest_zoneidx(pgdat, 7721 highest_zoneidx); 7722 7723 kswapd_try_sleep: 7724 kswapd_try_to_sleep(pgdat, alloc_order, reclaim_order, 7725 highest_zoneidx); 7726 7727 /* Read the new order and highest_zoneidx */ 7728 alloc_order = READ_ONCE(pgdat->kswapd_order); 7729 highest_zoneidx = kswapd_highest_zoneidx(pgdat, 7730 highest_zoneidx); 7731 WRITE_ONCE(pgdat->kswapd_order, 0); 7732 WRITE_ONCE(pgdat->kswapd_highest_zoneidx, MAX_NR_ZONES); 7733 7734 ret = try_to_freeze(); 7735 if (kthread_should_stop()) 7736 break; 7737 7738 /* 7739 * We can speed up thawing tasks if we don't call balance_pgdat 7740 * after returning from the refrigerator 7741 */ 7742 if (ret) 7743 continue; 7744 7745 /* 7746 * Reclaim begins at the requested order but if a high-order 7747 * reclaim fails then kswapd falls back to reclaiming for 7748 * order-0. If that happens, kswapd will consider sleeping 7749 * for the order it finished reclaiming at (reclaim_order) 7750 * but kcompactd is woken to compact for the original 7751 * request (alloc_order). 7752 */ 7753 trace_mm_vmscan_kswapd_wake(pgdat->node_id, highest_zoneidx, 7754 alloc_order); 7755 reclaim_order = balance_pgdat(pgdat, alloc_order, 7756 highest_zoneidx); 7757 if (reclaim_order < alloc_order) 7758 goto kswapd_try_sleep; 7759 } 7760 7761 tsk->flags &= ~(PF_MEMALLOC | PF_KSWAPD); 7762 7763 return 0; 7764 } 7765 7766 /* 7767 * A zone is low on free memory or too fragmented for high-order memory. If 7768 * kswapd should reclaim (direct reclaim is deferred), wake it up for the zone's 7769 * pgdat. It will wake up kcompactd after reclaiming memory. If kswapd reclaim 7770 * has failed or is not needed, still wake up kcompactd if only compaction is 7771 * needed. 7772 */ 7773 void wakeup_kswapd(struct zone *zone, gfp_t gfp_flags, int order, 7774 enum zone_type highest_zoneidx) 7775 { 7776 pg_data_t *pgdat; 7777 enum zone_type curr_idx; 7778 7779 if (!managed_zone(zone)) 7780 return; 7781 7782 if (!cpuset_zone_allowed(zone, gfp_flags)) 7783 return; 7784 7785 pgdat = zone->zone_pgdat; 7786 curr_idx = READ_ONCE(pgdat->kswapd_highest_zoneidx); 7787 7788 if (curr_idx == MAX_NR_ZONES || curr_idx < highest_zoneidx) 7789 WRITE_ONCE(pgdat->kswapd_highest_zoneidx, highest_zoneidx); 7790 7791 if (READ_ONCE(pgdat->kswapd_order) < order) 7792 WRITE_ONCE(pgdat->kswapd_order, order); 7793 7794 if (!waitqueue_active(&pgdat->kswapd_wait)) 7795 return; 7796 7797 /* Hopeless node, leave it to direct reclaim if possible */ 7798 if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES || 7799 (pgdat_balanced(pgdat, order, highest_zoneidx) && 7800 !pgdat_watermark_boosted(pgdat, highest_zoneidx))) { 7801 /* 7802 * There may be plenty of free memory available, but it's too 7803 * fragmented for high-order allocations. Wake up kcompactd 7804 * and rely on compaction_suitable() to determine if it's 7805 * needed. If it fails, it will defer subsequent attempts to 7806 * ratelimit its work. 7807 */ 7808 if (!(gfp_flags & __GFP_DIRECT_RECLAIM)) 7809 wakeup_kcompactd(pgdat, order, highest_zoneidx); 7810 return; 7811 } 7812 7813 trace_mm_vmscan_wakeup_kswapd(pgdat->node_id, highest_zoneidx, order, 7814 gfp_flags); 7815 wake_up_interruptible(&pgdat->kswapd_wait); 7816 } 7817 7818 #ifdef CONFIG_HIBERNATION 7819 /* 7820 * Try to free `nr_to_reclaim' of memory, system-wide, and return the number of 7821 * freed pages. 7822 * 7823 * Rather than trying to age LRUs the aim is to preserve the overall 7824 * LRU order by reclaiming preferentially 7825 * inactive > active > active referenced > active mapped 7826 */ 7827 unsigned long shrink_all_memory(unsigned long nr_to_reclaim) 7828 { 7829 struct scan_control sc = { 7830 .nr_to_reclaim = nr_to_reclaim, 7831 .gfp_mask = GFP_HIGHUSER_MOVABLE, 7832 .reclaim_idx = MAX_NR_ZONES - 1, 7833 .priority = DEF_PRIORITY, 7834 .may_writepage = 1, 7835 .may_unmap = 1, 7836 .may_swap = 1, 7837 .hibernation_mode = 1, 7838 }; 7839 struct zonelist *zonelist = node_zonelist(numa_node_id(), sc.gfp_mask); 7840 unsigned long nr_reclaimed; 7841 unsigned int noreclaim_flag; 7842 7843 fs_reclaim_acquire(sc.gfp_mask); 7844 noreclaim_flag = memalloc_noreclaim_save(); 7845 set_task_reclaim_state(current, &sc.reclaim_state); 7846 7847 nr_reclaimed = do_try_to_free_pages(zonelist, &sc); 7848 7849 set_task_reclaim_state(current, NULL); 7850 memalloc_noreclaim_restore(noreclaim_flag); 7851 fs_reclaim_release(sc.gfp_mask); 7852 7853 return nr_reclaimed; 7854 } 7855 #endif /* CONFIG_HIBERNATION */ 7856 7857 /* 7858 * This kswapd start function will be called by init and node-hot-add. 7859 */ 7860 void __meminit kswapd_run(int nid) 7861 { 7862 pg_data_t *pgdat = NODE_DATA(nid); 7863 7864 pgdat_kswapd_lock(pgdat); 7865 if (!pgdat->kswapd) { 7866 pgdat->kswapd = kthread_run(kswapd, pgdat, "kswapd%d", nid); 7867 if (IS_ERR(pgdat->kswapd)) { 7868 /* failure at boot is fatal */ 7869 BUG_ON(system_state < SYSTEM_RUNNING); 7870 pr_err("Failed to start kswapd on node %d\n", nid); 7871 pgdat->kswapd = NULL; 7872 } 7873 } 7874 pgdat_kswapd_unlock(pgdat); 7875 } 7876 7877 /* 7878 * Called by memory hotplug when all memory in a node is offlined. Caller must 7879 * be holding mem_hotplug_begin/done(). 7880 */ 7881 void __meminit kswapd_stop(int nid) 7882 { 7883 pg_data_t *pgdat = NODE_DATA(nid); 7884 struct task_struct *kswapd; 7885 7886 pgdat_kswapd_lock(pgdat); 7887 kswapd = pgdat->kswapd; 7888 if (kswapd) { 7889 kthread_stop(kswapd); 7890 pgdat->kswapd = NULL; 7891 } 7892 pgdat_kswapd_unlock(pgdat); 7893 } 7894 7895 static int __init kswapd_init(void) 7896 { 7897 int nid; 7898 7899 swap_setup(); 7900 for_each_node_state(nid, N_MEMORY) 7901 kswapd_run(nid); 7902 return 0; 7903 } 7904 7905 module_init(kswapd_init) 7906 7907 #ifdef CONFIG_NUMA 7908 /* 7909 * Node reclaim mode 7910 * 7911 * If non-zero call node_reclaim when the number of free pages falls below 7912 * the watermarks. 7913 */ 7914 int node_reclaim_mode __read_mostly; 7915 7916 /* 7917 * Priority for NODE_RECLAIM. This determines the fraction of pages 7918 * of a node considered for each zone_reclaim. 4 scans 1/16th of 7919 * a zone. 7920 */ 7921 #define NODE_RECLAIM_PRIORITY 4 7922 7923 /* 7924 * Percentage of pages in a zone that must be unmapped for node_reclaim to 7925 * occur. 7926 */ 7927 int sysctl_min_unmapped_ratio = 1; 7928 7929 /* 7930 * If the number of slab pages in a zone grows beyond this percentage then 7931 * slab reclaim needs to occur. 7932 */ 7933 int sysctl_min_slab_ratio = 5; 7934 7935 static inline unsigned long node_unmapped_file_pages(struct pglist_data *pgdat) 7936 { 7937 unsigned long file_mapped = node_page_state(pgdat, NR_FILE_MAPPED); 7938 unsigned long file_lru = node_page_state(pgdat, NR_INACTIVE_FILE) + 7939 node_page_state(pgdat, NR_ACTIVE_FILE); 7940 7941 /* 7942 * It's possible for there to be more file mapped pages than 7943 * accounted for by the pages on the file LRU lists because 7944 * tmpfs pages accounted for as ANON can also be FILE_MAPPED 7945 */ 7946 return (file_lru > file_mapped) ? (file_lru - file_mapped) : 0; 7947 } 7948 7949 /* Work out how many page cache pages we can reclaim in this reclaim_mode */ 7950 static unsigned long node_pagecache_reclaimable(struct pglist_data *pgdat) 7951 { 7952 unsigned long nr_pagecache_reclaimable; 7953 unsigned long delta = 0; 7954 7955 /* 7956 * If RECLAIM_UNMAP is set, then all file pages are considered 7957 * potentially reclaimable. Otherwise, we have to worry about 7958 * pages like swapcache and node_unmapped_file_pages() provides 7959 * a better estimate 7960 */ 7961 if (node_reclaim_mode & RECLAIM_UNMAP) 7962 nr_pagecache_reclaimable = node_page_state(pgdat, NR_FILE_PAGES); 7963 else 7964 nr_pagecache_reclaimable = node_unmapped_file_pages(pgdat); 7965 7966 /* If we can't clean pages, remove dirty pages from consideration */ 7967 if (!(node_reclaim_mode & RECLAIM_WRITE)) 7968 delta += node_page_state(pgdat, NR_FILE_DIRTY); 7969 7970 /* Watch for any possible underflows due to delta */ 7971 if (unlikely(delta > nr_pagecache_reclaimable)) 7972 delta = nr_pagecache_reclaimable; 7973 7974 return nr_pagecache_reclaimable - delta; 7975 } 7976 7977 /* 7978 * Try to free up some pages from this node through reclaim. 7979 */ 7980 static int __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned int order) 7981 { 7982 /* Minimum pages needed in order to stay on node */ 7983 const unsigned long nr_pages = 1 << order; 7984 struct task_struct *p = current; 7985 unsigned int noreclaim_flag; 7986 struct scan_control sc = { 7987 .nr_to_reclaim = max(nr_pages, SWAP_CLUSTER_MAX), 7988 .gfp_mask = current_gfp_context(gfp_mask), 7989 .order = order, 7990 .priority = NODE_RECLAIM_PRIORITY, 7991 .may_writepage = !!(node_reclaim_mode & RECLAIM_WRITE), 7992 .may_unmap = !!(node_reclaim_mode & RECLAIM_UNMAP), 7993 .may_swap = 1, 7994 .reclaim_idx = gfp_zone(gfp_mask), 7995 }; 7996 unsigned long pflags; 7997 7998 trace_mm_vmscan_node_reclaim_begin(pgdat->node_id, order, 7999 sc.gfp_mask); 8000 8001 cond_resched(); 8002 psi_memstall_enter(&pflags); 8003 fs_reclaim_acquire(sc.gfp_mask); 8004 /* 8005 * We need to be able to allocate from the reserves for RECLAIM_UNMAP 8006 */ 8007 noreclaim_flag = memalloc_noreclaim_save(); 8008 set_task_reclaim_state(p, &sc.reclaim_state); 8009 8010 if (node_pagecache_reclaimable(pgdat) > pgdat->min_unmapped_pages || 8011 node_page_state_pages(pgdat, NR_SLAB_RECLAIMABLE_B) > pgdat->min_slab_pages) { 8012 /* 8013 * Free memory by calling shrink node with increasing 8014 * priorities until we have enough memory freed. 8015 */ 8016 do { 8017 shrink_node(pgdat, &sc); 8018 } while (sc.nr_reclaimed < nr_pages && --sc.priority >= 0); 8019 } 8020 8021 set_task_reclaim_state(p, NULL); 8022 memalloc_noreclaim_restore(noreclaim_flag); 8023 fs_reclaim_release(sc.gfp_mask); 8024 psi_memstall_leave(&pflags); 8025 8026 trace_mm_vmscan_node_reclaim_end(sc.nr_reclaimed); 8027 8028 return sc.nr_reclaimed >= nr_pages; 8029 } 8030 8031 int node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned int order) 8032 { 8033 int ret; 8034 8035 /* 8036 * Node reclaim reclaims unmapped file backed pages and 8037 * slab pages if we are over the defined limits. 8038 * 8039 * A small portion of unmapped file backed pages is needed for 8040 * file I/O otherwise pages read by file I/O will be immediately 8041 * thrown out if the node is overallocated. So we do not reclaim 8042 * if less than a specified percentage of the node is used by 8043 * unmapped file backed pages. 8044 */ 8045 if (node_pagecache_reclaimable(pgdat) <= pgdat->min_unmapped_pages && 8046 node_page_state_pages(pgdat, NR_SLAB_RECLAIMABLE_B) <= 8047 pgdat->min_slab_pages) 8048 return NODE_RECLAIM_FULL; 8049 8050 /* 8051 * Do not scan if the allocation should not be delayed. 8052 */ 8053 if (!gfpflags_allow_blocking(gfp_mask) || (current->flags & PF_MEMALLOC)) 8054 return NODE_RECLAIM_NOSCAN; 8055 8056 /* 8057 * Only run node reclaim on the local node or on nodes that do not 8058 * have associated processors. This will favor the local processor 8059 * over remote processors and spread off node memory allocations 8060 * as wide as possible. 8061 */ 8062 if (node_state(pgdat->node_id, N_CPU) && pgdat->node_id != numa_node_id()) 8063 return NODE_RECLAIM_NOSCAN; 8064 8065 if (test_and_set_bit(PGDAT_RECLAIM_LOCKED, &pgdat->flags)) 8066 return NODE_RECLAIM_NOSCAN; 8067 8068 ret = __node_reclaim(pgdat, gfp_mask, order); 8069 clear_bit(PGDAT_RECLAIM_LOCKED, &pgdat->flags); 8070 8071 if (!ret) 8072 count_vm_event(PGSCAN_ZONE_RECLAIM_FAILED); 8073 8074 return ret; 8075 } 8076 #endif 8077 8078 void check_move_unevictable_pages(struct pagevec *pvec) 8079 { 8080 struct folio_batch fbatch; 8081 unsigned i; 8082 8083 folio_batch_init(&fbatch); 8084 for (i = 0; i < pvec->nr; i++) { 8085 struct page *page = pvec->pages[i]; 8086 8087 if (PageTransTail(page)) 8088 continue; 8089 folio_batch_add(&fbatch, page_folio(page)); 8090 } 8091 check_move_unevictable_folios(&fbatch); 8092 } 8093 EXPORT_SYMBOL_GPL(check_move_unevictable_pages); 8094 8095 /** 8096 * check_move_unevictable_folios - Move evictable folios to appropriate zone 8097 * lru list 8098 * @fbatch: Batch of lru folios to check. 8099 * 8100 * Checks folios for evictability, if an evictable folio is in the unevictable 8101 * lru list, moves it to the appropriate evictable lru list. This function 8102 * should be only used for lru folios. 8103 */ 8104 void check_move_unevictable_folios(struct folio_batch *fbatch) 8105 { 8106 struct lruvec *lruvec = NULL; 8107 int pgscanned = 0; 8108 int pgrescued = 0; 8109 int i; 8110 8111 for (i = 0; i < fbatch->nr; i++) { 8112 struct folio *folio = fbatch->folios[i]; 8113 int nr_pages = folio_nr_pages(folio); 8114 8115 pgscanned += nr_pages; 8116 8117 /* block memcg migration while the folio moves between lrus */ 8118 if (!folio_test_clear_lru(folio)) 8119 continue; 8120 8121 lruvec = folio_lruvec_relock_irq(folio, lruvec); 8122 if (folio_evictable(folio) && folio_test_unevictable(folio)) { 8123 lruvec_del_folio(lruvec, folio); 8124 folio_clear_unevictable(folio); 8125 lruvec_add_folio(lruvec, folio); 8126 pgrescued += nr_pages; 8127 } 8128 folio_set_lru(folio); 8129 } 8130 8131 if (lruvec) { 8132 __count_vm_events(UNEVICTABLE_PGRESCUED, pgrescued); 8133 __count_vm_events(UNEVICTABLE_PGSCANNED, pgscanned); 8134 unlock_page_lruvec_irq(lruvec); 8135 } else if (pgscanned) { 8136 count_vm_events(UNEVICTABLE_PGSCANNED, pgscanned); 8137 } 8138 } 8139 EXPORT_SYMBOL_GPL(check_move_unevictable_folios); 8140