// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
 *
 * Swap reorganised 29.12.95, Stephen Tweedie.
 * kswapd added: 7.1.96  sct
 * Removed kswapd_ctl limits, and swap out as many pages as needed
 * to bring the system back to freepages.high: 2.4.97, Rik van Riel.
 * Zone aware kswapd started 02/00, Kanoj Sarcar (kanoj@sgi.com).
 * Multiqueue VM started 5.8.00, Rik van Riel.
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/mm.h>
#include <linux/sched/mm.h>
#include <linux/module.h>
#include <linux/gfp.h>
#include <linux/kernel_stat.h>
#include <linux/swap.h>
#include <linux/pagemap.h>
#include <linux/init.h>
#include <linux/highmem.h>
#include <linux/vmpressure.h>
#include <linux/vmstat.h>
#include <linux/file.h>
#include <linux/writeback.h>
#include <linux/blkdev.h>
#include <linux/buffer_head.h>	/* for buffer_heads_over_limit */
#include <linux/mm_inline.h>
#include <linux/backing-dev.h>
#include <linux/rmap.h>
#include <linux/topology.h>
#include <linux/cpu.h>
#include <linux/cpuset.h>
#include <linux/compaction.h>
#include <linux/notifier.h>
#include <linux/mutex.h>
#include <linux/delay.h>
#include <linux/kthread.h>
#include <linux/freezer.h>
#include <linux/memcontrol.h>
#include <linux/migrate.h>
#include <linux/delayacct.h>
#include <linux/sysctl.h>
#include <linux/memory-tiers.h>
#include <linux/oom.h>
#include <linux/pagevec.h>
#include <linux/prefetch.h>
#include <linux/printk.h>
#include <linux/dax.h>
#include <linux/psi.h>
#include <linux/pagewalk.h>
#include <linux/shmem_fs.h>
#include <linux/ctype.h>
#include <linux/debugfs.h>
#include <linux/khugepaged.h>
#include <linux/rculist_nulls.h>
#include <linux/random.h>
#include <linux/srcu.h>

#include <asm/tlbflush.h>
#include <asm/div64.h>

#include <linux/swapops.h>
#include <linux/balloon_compaction.h>
#include <linux/sched/sysctl.h>

#include "internal.h"
#include "swap.h"

#define CREATE_TRACE_POINTS
#include <trace/events/vmscan.h>

struct scan_control {
	/* How many pages shrink_list() should reclaim */
	unsigned long nr_to_reclaim;

	/*
	 * Nodemask of nodes allowed by the caller. If NULL, all nodes
	 * are scanned.
	 */
	nodemask_t	*nodemask;

	/*
	 * The memory cgroup that hit its limit and as a result is the
	 * primary target of this reclaim invocation.
	 */
	struct mem_cgroup *target_mem_cgroup;

	/*
	 * Scan pressure balancing between anon and file LRUs
	 */
	unsigned long	anon_cost;
	unsigned long	file_cost;

	/* Can active folios be deactivated as part of reclaim? */
#define DEACTIVATE_ANON 1
#define DEACTIVATE_FILE 2
	unsigned int may_deactivate:2;
	unsigned int force_deactivate:1;
	unsigned int skipped_deactivate:1;

	/* Writepage batching in laptop mode; RECLAIM_WRITE */
	unsigned int may_writepage:1;

	/* Can mapped folios be reclaimed? */
	unsigned int may_unmap:1;

	/* Can folios be swapped as part of reclaim? */
	unsigned int may_swap:1;

	/* Proactive reclaim invoked by userspace through memory.reclaim */
	unsigned int proactive:1;

	/*
	 * Cgroup memory below memory.low is protected as long as we
	 * don't threaten to OOM. If any cgroup is reclaimed at
	 * reduced force or passed over entirely due to its memory.low
	 * setting (memcg_low_skipped), and nothing is reclaimed as a
	 * result, then go back for one more cycle that reclaims the protected
	 * memory (memcg_low_reclaim) to avert OOM.
	 */
	unsigned int memcg_low_reclaim:1;
	unsigned int memcg_low_skipped:1;

	unsigned int hibernation_mode:1;

	/* One of the zones is ready for compaction */
	unsigned int compaction_ready:1;

	/* There is easily reclaimable cold cache in the current node */
	unsigned int cache_trim_mode:1;

	/* The file folios on the current node are dangerously low */
	unsigned int file_is_tiny:1;

	/* Always discard instead of demoting to lower tier memory */
	unsigned int no_demotion:1;

	/* Allocation order */
	s8 order;

	/* Scan (total_size >> priority) pages at once */
	s8 priority;

	/* The highest zone to isolate folios for reclaim from */
	s8 reclaim_idx;

	/* This context's GFP mask */
	gfp_t gfp_mask;

	/* Incremented by the number of inactive pages that were scanned */
	unsigned long nr_scanned;

	/* Number of pages freed so far during a call to shrink_zones() */
	unsigned long nr_reclaimed;

	struct {
		unsigned int dirty;
		unsigned int unqueued_dirty;
		unsigned int congested;
		unsigned int writeback;
		unsigned int immediate;
		unsigned int file_taken;
		unsigned int taken;
	} nr;

	/* for recording the reclaimed slab by now */
	struct reclaim_state reclaim_state;
};

#ifdef ARCH_HAS_PREFETCHW
#define prefetchw_prev_lru_folio(_folio, _base, _field)			\
	do {								\
		if ((_folio)->lru.prev != _base) {			\
			struct folio *prev;				\
									\
			prev = lru_to_folio(&(_folio->lru));		\
			prefetchw(&prev->_field);			\
		}							\
	} while (0)
#else
#define prefetchw_prev_lru_folio(_folio, _base, _field) do { } while (0)
#endif

/*
 * From 0 .. 200.  Higher means more swappy.
189 */ 190 int vm_swappiness = 60; 191 192 static void set_task_reclaim_state(struct task_struct *task, 193 struct reclaim_state *rs) 194 { 195 /* Check for an overwrite */ 196 WARN_ON_ONCE(rs && task->reclaim_state); 197 198 /* Check for the nulling of an already-nulled member */ 199 WARN_ON_ONCE(!rs && !task->reclaim_state); 200 201 task->reclaim_state = rs; 202 } 203 204 LIST_HEAD(shrinker_list); 205 DEFINE_MUTEX(shrinker_mutex); 206 DEFINE_SRCU(shrinker_srcu); 207 static atomic_t shrinker_srcu_generation = ATOMIC_INIT(0); 208 209 #ifdef CONFIG_MEMCG 210 static int shrinker_nr_max; 211 212 /* The shrinker_info is expanded in a batch of BITS_PER_LONG */ 213 static inline int shrinker_map_size(int nr_items) 214 { 215 return (DIV_ROUND_UP(nr_items, BITS_PER_LONG) * sizeof(unsigned long)); 216 } 217 218 static inline int shrinker_defer_size(int nr_items) 219 { 220 return (round_up(nr_items, BITS_PER_LONG) * sizeof(atomic_long_t)); 221 } 222 223 static struct shrinker_info *shrinker_info_protected(struct mem_cgroup *memcg, 224 int nid) 225 { 226 return srcu_dereference_check(memcg->nodeinfo[nid]->shrinker_info, 227 &shrinker_srcu, 228 lockdep_is_held(&shrinker_mutex)); 229 } 230 231 static struct shrinker_info *shrinker_info_srcu(struct mem_cgroup *memcg, 232 int nid) 233 { 234 return srcu_dereference(memcg->nodeinfo[nid]->shrinker_info, 235 &shrinker_srcu); 236 } 237 238 static void free_shrinker_info_rcu(struct rcu_head *head) 239 { 240 kvfree(container_of(head, struct shrinker_info, rcu)); 241 } 242 243 static int expand_one_shrinker_info(struct mem_cgroup *memcg, 244 int map_size, int defer_size, 245 int old_map_size, int old_defer_size, 246 int new_nr_max) 247 { 248 struct shrinker_info *new, *old; 249 struct mem_cgroup_per_node *pn; 250 int nid; 251 int size = map_size + defer_size; 252 253 for_each_node(nid) { 254 pn = memcg->nodeinfo[nid]; 255 old = shrinker_info_protected(memcg, nid); 256 /* Not yet online memcg */ 257 if (!old) 258 return 0; 259 260 /* Already expanded this shrinker_info */ 261 if (new_nr_max <= old->map_nr_max) 262 continue; 263 264 new = kvmalloc_node(sizeof(*new) + size, GFP_KERNEL, nid); 265 if (!new) 266 return -ENOMEM; 267 268 new->nr_deferred = (atomic_long_t *)(new + 1); 269 new->map = (void *)new->nr_deferred + defer_size; 270 new->map_nr_max = new_nr_max; 271 272 /* map: set all old bits, clear all new bits */ 273 memset(new->map, (int)0xff, old_map_size); 274 memset((void *)new->map + old_map_size, 0, map_size - old_map_size); 275 /* nr_deferred: copy old values, clear all new values */ 276 memcpy(new->nr_deferred, old->nr_deferred, old_defer_size); 277 memset((void *)new->nr_deferred + old_defer_size, 0, 278 defer_size - old_defer_size); 279 280 rcu_assign_pointer(pn->shrinker_info, new); 281 call_srcu(&shrinker_srcu, &old->rcu, free_shrinker_info_rcu); 282 } 283 284 return 0; 285 } 286 287 void free_shrinker_info(struct mem_cgroup *memcg) 288 { 289 struct mem_cgroup_per_node *pn; 290 struct shrinker_info *info; 291 int nid; 292 293 for_each_node(nid) { 294 pn = memcg->nodeinfo[nid]; 295 info = rcu_dereference_protected(pn->shrinker_info, true); 296 kvfree(info); 297 rcu_assign_pointer(pn->shrinker_info, NULL); 298 } 299 } 300 301 int alloc_shrinker_info(struct mem_cgroup *memcg) 302 { 303 struct shrinker_info *info; 304 int nid, size, ret = 0; 305 int map_size, defer_size = 0; 306 307 mutex_lock(&shrinker_mutex); 308 map_size = shrinker_map_size(shrinker_nr_max); 309 defer_size = shrinker_defer_size(shrinker_nr_max); 310 size = map_size + defer_size; 
311 for_each_node(nid) { 312 info = kvzalloc_node(sizeof(*info) + size, GFP_KERNEL, nid); 313 if (!info) { 314 free_shrinker_info(memcg); 315 ret = -ENOMEM; 316 break; 317 } 318 info->nr_deferred = (atomic_long_t *)(info + 1); 319 info->map = (void *)info->nr_deferred + defer_size; 320 info->map_nr_max = shrinker_nr_max; 321 rcu_assign_pointer(memcg->nodeinfo[nid]->shrinker_info, info); 322 } 323 mutex_unlock(&shrinker_mutex); 324 325 return ret; 326 } 327 328 static int expand_shrinker_info(int new_id) 329 { 330 int ret = 0; 331 int new_nr_max = round_up(new_id + 1, BITS_PER_LONG); 332 int map_size, defer_size = 0; 333 int old_map_size, old_defer_size = 0; 334 struct mem_cgroup *memcg; 335 336 if (!root_mem_cgroup) 337 goto out; 338 339 lockdep_assert_held(&shrinker_mutex); 340 341 map_size = shrinker_map_size(new_nr_max); 342 defer_size = shrinker_defer_size(new_nr_max); 343 old_map_size = shrinker_map_size(shrinker_nr_max); 344 old_defer_size = shrinker_defer_size(shrinker_nr_max); 345 346 memcg = mem_cgroup_iter(NULL, NULL, NULL); 347 do { 348 ret = expand_one_shrinker_info(memcg, map_size, defer_size, 349 old_map_size, old_defer_size, 350 new_nr_max); 351 if (ret) { 352 mem_cgroup_iter_break(NULL, memcg); 353 goto out; 354 } 355 } while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)) != NULL); 356 out: 357 if (!ret) 358 shrinker_nr_max = new_nr_max; 359 360 return ret; 361 } 362 363 void set_shrinker_bit(struct mem_cgroup *memcg, int nid, int shrinker_id) 364 { 365 if (shrinker_id >= 0 && memcg && !mem_cgroup_is_root(memcg)) { 366 struct shrinker_info *info; 367 int srcu_idx; 368 369 srcu_idx = srcu_read_lock(&shrinker_srcu); 370 info = shrinker_info_srcu(memcg, nid); 371 if (!WARN_ON_ONCE(shrinker_id >= info->map_nr_max)) { 372 /* Pairs with smp mb in shrink_slab() */ 373 smp_mb__before_atomic(); 374 set_bit(shrinker_id, info->map); 375 } 376 srcu_read_unlock(&shrinker_srcu, srcu_idx); 377 } 378 } 379 380 static DEFINE_IDR(shrinker_idr); 381 382 static int prealloc_memcg_shrinker(struct shrinker *shrinker) 383 { 384 int id, ret = -ENOMEM; 385 386 if (mem_cgroup_disabled()) 387 return -ENOSYS; 388 389 mutex_lock(&shrinker_mutex); 390 id = idr_alloc(&shrinker_idr, shrinker, 0, 0, GFP_KERNEL); 391 if (id < 0) 392 goto unlock; 393 394 if (id >= shrinker_nr_max) { 395 if (expand_shrinker_info(id)) { 396 idr_remove(&shrinker_idr, id); 397 goto unlock; 398 } 399 } 400 shrinker->id = id; 401 ret = 0; 402 unlock: 403 mutex_unlock(&shrinker_mutex); 404 return ret; 405 } 406 407 static void unregister_memcg_shrinker(struct shrinker *shrinker) 408 { 409 int id = shrinker->id; 410 411 BUG_ON(id < 0); 412 413 lockdep_assert_held(&shrinker_mutex); 414 415 idr_remove(&shrinker_idr, id); 416 } 417 418 static long xchg_nr_deferred_memcg(int nid, struct shrinker *shrinker, 419 struct mem_cgroup *memcg) 420 { 421 struct shrinker_info *info; 422 423 info = shrinker_info_srcu(memcg, nid); 424 return atomic_long_xchg(&info->nr_deferred[shrinker->id], 0); 425 } 426 427 static long add_nr_deferred_memcg(long nr, int nid, struct shrinker *shrinker, 428 struct mem_cgroup *memcg) 429 { 430 struct shrinker_info *info; 431 432 info = shrinker_info_srcu(memcg, nid); 433 return atomic_long_add_return(nr, &info->nr_deferred[shrinker->id]); 434 } 435 436 void reparent_shrinker_deferred(struct mem_cgroup *memcg) 437 { 438 int i, nid; 439 long nr; 440 struct mem_cgroup *parent; 441 struct shrinker_info *child_info, *parent_info; 442 443 parent = parent_mem_cgroup(memcg); 444 if (!parent) 445 parent = root_mem_cgroup; 446 
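	/*
	 * The loop below folds the child's per-node nr_deferred counts
	 * into the parent (presumably when the memcg is going offline),
	 * so outstanding deferred scan work is inherited by the parent
	 * rather than dropped along with the child's shrinker_info.
	 */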
447 /* Prevent from concurrent shrinker_info expand */ 448 mutex_lock(&shrinker_mutex); 449 for_each_node(nid) { 450 child_info = shrinker_info_protected(memcg, nid); 451 parent_info = shrinker_info_protected(parent, nid); 452 for (i = 0; i < child_info->map_nr_max; i++) { 453 nr = atomic_long_read(&child_info->nr_deferred[i]); 454 atomic_long_add(nr, &parent_info->nr_deferred[i]); 455 } 456 } 457 mutex_unlock(&shrinker_mutex); 458 } 459 460 static bool cgroup_reclaim(struct scan_control *sc) 461 { 462 return sc->target_mem_cgroup; 463 } 464 465 static bool global_reclaim(struct scan_control *sc) 466 { 467 return !sc->target_mem_cgroup || mem_cgroup_is_root(sc->target_mem_cgroup); 468 } 469 470 /** 471 * writeback_throttling_sane - is the usual dirty throttling mechanism available? 472 * @sc: scan_control in question 473 * 474 * The normal page dirty throttling mechanism in balance_dirty_pages() is 475 * completely broken with the legacy memcg and direct stalling in 476 * shrink_folio_list() is used for throttling instead, which lacks all the 477 * niceties such as fairness, adaptive pausing, bandwidth proportional 478 * allocation and configurability. 479 * 480 * This function tests whether the vmscan currently in progress can assume 481 * that the normal dirty throttling mechanism is operational. 482 */ 483 static bool writeback_throttling_sane(struct scan_control *sc) 484 { 485 if (!cgroup_reclaim(sc)) 486 return true; 487 #ifdef CONFIG_CGROUP_WRITEBACK 488 if (cgroup_subsys_on_dfl(memory_cgrp_subsys)) 489 return true; 490 #endif 491 return false; 492 } 493 #else 494 static int prealloc_memcg_shrinker(struct shrinker *shrinker) 495 { 496 return -ENOSYS; 497 } 498 499 static void unregister_memcg_shrinker(struct shrinker *shrinker) 500 { 501 } 502 503 static long xchg_nr_deferred_memcg(int nid, struct shrinker *shrinker, 504 struct mem_cgroup *memcg) 505 { 506 return 0; 507 } 508 509 static long add_nr_deferred_memcg(long nr, int nid, struct shrinker *shrinker, 510 struct mem_cgroup *memcg) 511 { 512 return 0; 513 } 514 515 static bool cgroup_reclaim(struct scan_control *sc) 516 { 517 return false; 518 } 519 520 static bool global_reclaim(struct scan_control *sc) 521 { 522 return true; 523 } 524 525 static bool writeback_throttling_sane(struct scan_control *sc) 526 { 527 return true; 528 } 529 #endif 530 531 static long xchg_nr_deferred(struct shrinker *shrinker, 532 struct shrink_control *sc) 533 { 534 int nid = sc->nid; 535 536 if (!(shrinker->flags & SHRINKER_NUMA_AWARE)) 537 nid = 0; 538 539 if (sc->memcg && 540 (shrinker->flags & SHRINKER_MEMCG_AWARE)) 541 return xchg_nr_deferred_memcg(nid, shrinker, 542 sc->memcg); 543 544 return atomic_long_xchg(&shrinker->nr_deferred[nid], 0); 545 } 546 547 548 static long add_nr_deferred(long nr, struct shrinker *shrinker, 549 struct shrink_control *sc) 550 { 551 int nid = sc->nid; 552 553 if (!(shrinker->flags & SHRINKER_NUMA_AWARE)) 554 nid = 0; 555 556 if (sc->memcg && 557 (shrinker->flags & SHRINKER_MEMCG_AWARE)) 558 return add_nr_deferred_memcg(nr, nid, shrinker, 559 sc->memcg); 560 561 return atomic_long_add_return(nr, &shrinker->nr_deferred[nid]); 562 } 563 564 static bool can_demote(int nid, struct scan_control *sc) 565 { 566 if (!numa_demotion_enabled) 567 return false; 568 if (sc && sc->no_demotion) 569 return false; 570 if (next_demotion_node(nid) == NUMA_NO_NODE) 571 return false; 572 573 return true; 574 } 575 576 static inline bool can_reclaim_anon_pages(struct mem_cgroup *memcg, 577 int nid, 578 struct scan_control *sc) 579 { 580 
if (memcg == NULL) { 581 /* 582 * For non-memcg reclaim, is there 583 * space in any swap device? 584 */ 585 if (get_nr_swap_pages() > 0) 586 return true; 587 } else { 588 /* Is the memcg below its swap limit? */ 589 if (mem_cgroup_get_nr_swap_pages(memcg) > 0) 590 return true; 591 } 592 593 /* 594 * The page can not be swapped. 595 * 596 * Can it be reclaimed from this node via demotion? 597 */ 598 return can_demote(nid, sc); 599 } 600 601 /* 602 * This misses isolated folios which are not accounted for to save counters. 603 * As the data only determines if reclaim or compaction continues, it is 604 * not expected that isolated folios will be a dominating factor. 605 */ 606 unsigned long zone_reclaimable_pages(struct zone *zone) 607 { 608 unsigned long nr; 609 610 nr = zone_page_state_snapshot(zone, NR_ZONE_INACTIVE_FILE) + 611 zone_page_state_snapshot(zone, NR_ZONE_ACTIVE_FILE); 612 if (can_reclaim_anon_pages(NULL, zone_to_nid(zone), NULL)) 613 nr += zone_page_state_snapshot(zone, NR_ZONE_INACTIVE_ANON) + 614 zone_page_state_snapshot(zone, NR_ZONE_ACTIVE_ANON); 615 616 return nr; 617 } 618 619 /** 620 * lruvec_lru_size - Returns the number of pages on the given LRU list. 621 * @lruvec: lru vector 622 * @lru: lru to use 623 * @zone_idx: zones to consider (use MAX_NR_ZONES - 1 for the whole LRU list) 624 */ 625 static unsigned long lruvec_lru_size(struct lruvec *lruvec, enum lru_list lru, 626 int zone_idx) 627 { 628 unsigned long size = 0; 629 int zid; 630 631 for (zid = 0; zid <= zone_idx; zid++) { 632 struct zone *zone = &lruvec_pgdat(lruvec)->node_zones[zid]; 633 634 if (!managed_zone(zone)) 635 continue; 636 637 if (!mem_cgroup_disabled()) 638 size += mem_cgroup_get_zone_lru_size(lruvec, lru, zid); 639 else 640 size += zone_page_state(zone, NR_ZONE_LRU_BASE + lru); 641 } 642 return size; 643 } 644 645 /* 646 * Add a shrinker callback to be called from the vm. 647 */ 648 static int __prealloc_shrinker(struct shrinker *shrinker) 649 { 650 unsigned int size; 651 int err; 652 653 if (shrinker->flags & SHRINKER_MEMCG_AWARE) { 654 err = prealloc_memcg_shrinker(shrinker); 655 if (err != -ENOSYS) 656 return err; 657 658 shrinker->flags &= ~SHRINKER_MEMCG_AWARE; 659 } 660 661 size = sizeof(*shrinker->nr_deferred); 662 if (shrinker->flags & SHRINKER_NUMA_AWARE) 663 size *= nr_node_ids; 664 665 shrinker->nr_deferred = kzalloc(size, GFP_KERNEL); 666 if (!shrinker->nr_deferred) 667 return -ENOMEM; 668 669 return 0; 670 } 671 672 #ifdef CONFIG_SHRINKER_DEBUG 673 int prealloc_shrinker(struct shrinker *shrinker, const char *fmt, ...) 674 { 675 va_list ap; 676 int err; 677 678 va_start(ap, fmt); 679 shrinker->name = kvasprintf_const(GFP_KERNEL, fmt, ap); 680 va_end(ap); 681 if (!shrinker->name) 682 return -ENOMEM; 683 684 err = __prealloc_shrinker(shrinker); 685 if (err) { 686 kfree_const(shrinker->name); 687 shrinker->name = NULL; 688 } 689 690 return err; 691 } 692 #else 693 int prealloc_shrinker(struct shrinker *shrinker, const char *fmt, ...) 
694 { 695 return __prealloc_shrinker(shrinker); 696 } 697 #endif 698 699 void free_prealloced_shrinker(struct shrinker *shrinker) 700 { 701 #ifdef CONFIG_SHRINKER_DEBUG 702 kfree_const(shrinker->name); 703 shrinker->name = NULL; 704 #endif 705 if (shrinker->flags & SHRINKER_MEMCG_AWARE) { 706 mutex_lock(&shrinker_mutex); 707 unregister_memcg_shrinker(shrinker); 708 mutex_unlock(&shrinker_mutex); 709 return; 710 } 711 712 kfree(shrinker->nr_deferred); 713 shrinker->nr_deferred = NULL; 714 } 715 716 void register_shrinker_prepared(struct shrinker *shrinker) 717 { 718 mutex_lock(&shrinker_mutex); 719 list_add_tail_rcu(&shrinker->list, &shrinker_list); 720 shrinker->flags |= SHRINKER_REGISTERED; 721 shrinker_debugfs_add(shrinker); 722 mutex_unlock(&shrinker_mutex); 723 } 724 725 static int __register_shrinker(struct shrinker *shrinker) 726 { 727 int err = __prealloc_shrinker(shrinker); 728 729 if (err) 730 return err; 731 register_shrinker_prepared(shrinker); 732 return 0; 733 } 734 735 #ifdef CONFIG_SHRINKER_DEBUG 736 int register_shrinker(struct shrinker *shrinker, const char *fmt, ...) 737 { 738 va_list ap; 739 int err; 740 741 va_start(ap, fmt); 742 shrinker->name = kvasprintf_const(GFP_KERNEL, fmt, ap); 743 va_end(ap); 744 if (!shrinker->name) 745 return -ENOMEM; 746 747 err = __register_shrinker(shrinker); 748 if (err) { 749 kfree_const(shrinker->name); 750 shrinker->name = NULL; 751 } 752 return err; 753 } 754 #else 755 int register_shrinker(struct shrinker *shrinker, const char *fmt, ...) 756 { 757 return __register_shrinker(shrinker); 758 } 759 #endif 760 EXPORT_SYMBOL(register_shrinker); 761 762 /* 763 * Remove one 764 */ 765 void unregister_shrinker(struct shrinker *shrinker) 766 { 767 struct dentry *debugfs_entry; 768 769 if (!(shrinker->flags & SHRINKER_REGISTERED)) 770 return; 771 772 mutex_lock(&shrinker_mutex); 773 list_del_rcu(&shrinker->list); 774 shrinker->flags &= ~SHRINKER_REGISTERED; 775 if (shrinker->flags & SHRINKER_MEMCG_AWARE) 776 unregister_memcg_shrinker(shrinker); 777 debugfs_entry = shrinker_debugfs_remove(shrinker); 778 mutex_unlock(&shrinker_mutex); 779 780 atomic_inc(&shrinker_srcu_generation); 781 synchronize_srcu(&shrinker_srcu); 782 783 debugfs_remove_recursive(debugfs_entry); 784 785 kfree(shrinker->nr_deferred); 786 shrinker->nr_deferred = NULL; 787 } 788 EXPORT_SYMBOL(unregister_shrinker); 789 790 /** 791 * synchronize_shrinkers - Wait for all running shrinkers to complete. 792 * 793 * This is useful to guarantee that all shrinker invocations have seen an 794 * update, before freeing memory. 795 */ 796 void synchronize_shrinkers(void) 797 { 798 atomic_inc(&shrinker_srcu_generation); 799 synchronize_srcu(&shrinker_srcu); 800 } 801 EXPORT_SYMBOL(synchronize_shrinkers); 802 803 #define SHRINK_BATCH 128 804 805 static unsigned long do_shrink_slab(struct shrink_control *shrinkctl, 806 struct shrinker *shrinker, int priority) 807 { 808 unsigned long freed = 0; 809 unsigned long long delta; 810 long total_scan; 811 long freeable; 812 long nr; 813 long new_nr; 814 long batch_size = shrinker->batch ? shrinker->batch 815 : SHRINK_BATCH; 816 long scanned = 0, next_deferred; 817 818 freeable = shrinker->count_objects(shrinker, shrinkctl); 819 if (freeable == 0 || freeable == SHRINK_EMPTY) 820 return freeable; 821 822 /* 823 * copy the current shrinker scan count into a local variable 824 * and zero it so that other concurrent shrinker invocations 825 * don't also do this scanning work. 
	 */
	nr = xchg_nr_deferred(shrinker, shrinkctl);

	if (shrinker->seeks) {
		delta = freeable >> priority;
		delta *= 4;
		do_div(delta, shrinker->seeks);
	} else {
		/*
		 * These objects don't require any IO to create. Trim
		 * them aggressively under memory pressure to keep
		 * them from causing refetches in the IO caches.
		 */
		delta = freeable / 2;
	}

	total_scan = nr >> priority;
	total_scan += delta;
	total_scan = min(total_scan, (2 * freeable));

	trace_mm_shrink_slab_start(shrinker, shrinkctl, nr,
				   freeable, delta, total_scan, priority);

	/*
	 * Normally, we should not scan less than batch_size objects in one
	 * pass to avoid too frequent shrinker calls, but if the slab has less
	 * than batch_size objects in total and we are really tight on memory,
	 * we will try to reclaim all available objects, otherwise we can end
	 * up failing allocations although there are plenty of reclaimable
	 * objects spread over several slabs with usage less than the
	 * batch_size.
	 *
	 * We detect the "tight on memory" situations by looking at the total
	 * number of objects we want to scan (total_scan). If it is greater
	 * than the total number of objects on slab (freeable), we must be
	 * scanning at high prio and therefore should try to reclaim as much as
	 * possible.
	 */
	while (total_scan >= batch_size ||
	       total_scan >= freeable) {
		unsigned long ret;
		unsigned long nr_to_scan = min(batch_size, total_scan);

		shrinkctl->nr_to_scan = nr_to_scan;
		shrinkctl->nr_scanned = nr_to_scan;
		ret = shrinker->scan_objects(shrinker, shrinkctl);
		if (ret == SHRINK_STOP)
			break;
		freed += ret;

		count_vm_events(SLABS_SCANNED, shrinkctl->nr_scanned);
		total_scan -= shrinkctl->nr_scanned;
		scanned += shrinkctl->nr_scanned;

		cond_resched();
	}

	/*
	 * The deferred work is increased by any new work (delta) that wasn't
	 * done, decreased by old deferred work that was done now.
	 *
	 * And it is capped to two times of the freeable items.
	 */
	next_deferred = max_t(long, (nr + delta - scanned), 0);
	next_deferred = min(next_deferred, (2 * freeable));

	/*
	 * move the unused scan count back into the shrinker in a
	 * manner that handles concurrent updates.
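	 *
	 * Illustrative example (hypothetical numbers): with freeable = 10000,
	 * priority = 4 and shrinker->seeks = 2, the new work is
	 * delta = (10000 >> 4) * 4 / 2 = 1250. If nr = 500 objects were
	 * carried over from earlier calls, total_scan starts at
	 * (500 >> 4) + 1250 = 1281; if the scan loop stopped early
	 * (SHRINK_STOP) after eight 128-object batches, scanned = 1024 and
	 * next_deferred = 500 + 1250 - 1024 = 726 is handed back via
	 * add_nr_deferred() below.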
895 */ 896 new_nr = add_nr_deferred(next_deferred, shrinker, shrinkctl); 897 898 trace_mm_shrink_slab_end(shrinker, shrinkctl->nid, freed, nr, new_nr, total_scan); 899 return freed; 900 } 901 902 #ifdef CONFIG_MEMCG 903 static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid, 904 struct mem_cgroup *memcg, int priority) 905 { 906 struct shrinker_info *info; 907 unsigned long ret, freed = 0; 908 int srcu_idx, generation; 909 int i = 0; 910 911 if (!mem_cgroup_online(memcg)) 912 return 0; 913 914 again: 915 srcu_idx = srcu_read_lock(&shrinker_srcu); 916 info = shrinker_info_srcu(memcg, nid); 917 if (unlikely(!info)) 918 goto unlock; 919 920 generation = atomic_read(&shrinker_srcu_generation); 921 for_each_set_bit_from(i, info->map, info->map_nr_max) { 922 struct shrink_control sc = { 923 .gfp_mask = gfp_mask, 924 .nid = nid, 925 .memcg = memcg, 926 }; 927 struct shrinker *shrinker; 928 929 shrinker = idr_find(&shrinker_idr, i); 930 if (unlikely(!shrinker || !(shrinker->flags & SHRINKER_REGISTERED))) { 931 if (!shrinker) 932 clear_bit(i, info->map); 933 continue; 934 } 935 936 /* Call non-slab shrinkers even though kmem is disabled */ 937 if (!memcg_kmem_online() && 938 !(shrinker->flags & SHRINKER_NONSLAB)) 939 continue; 940 941 ret = do_shrink_slab(&sc, shrinker, priority); 942 if (ret == SHRINK_EMPTY) { 943 clear_bit(i, info->map); 944 /* 945 * After the shrinker reported that it had no objects to 946 * free, but before we cleared the corresponding bit in 947 * the memcg shrinker map, a new object might have been 948 * added. To make sure, we have the bit set in this 949 * case, we invoke the shrinker one more time and reset 950 * the bit if it reports that it is not empty anymore. 951 * The memory barrier here pairs with the barrier in 952 * set_shrinker_bit(): 953 * 954 * list_lru_add() shrink_slab_memcg() 955 * list_add_tail() clear_bit() 956 * <MB> <MB> 957 * set_bit() do_shrink_slab() 958 */ 959 smp_mb__after_atomic(); 960 ret = do_shrink_slab(&sc, shrinker, priority); 961 if (ret == SHRINK_EMPTY) 962 ret = 0; 963 else 964 set_shrinker_bit(memcg, nid, i); 965 } 966 freed += ret; 967 if (atomic_read(&shrinker_srcu_generation) != generation) { 968 srcu_read_unlock(&shrinker_srcu, srcu_idx); 969 i++; 970 goto again; 971 } 972 } 973 unlock: 974 srcu_read_unlock(&shrinker_srcu, srcu_idx); 975 return freed; 976 } 977 #else /* CONFIG_MEMCG */ 978 static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid, 979 struct mem_cgroup *memcg, int priority) 980 { 981 return 0; 982 } 983 #endif /* CONFIG_MEMCG */ 984 985 /** 986 * shrink_slab - shrink slab caches 987 * @gfp_mask: allocation context 988 * @nid: node whose slab caches to target 989 * @memcg: memory cgroup whose slab caches to target 990 * @priority: the reclaim priority 991 * 992 * Call the shrink functions to age shrinkable caches. 993 * 994 * @nid is passed along to shrinkers with SHRINKER_NUMA_AWARE set, 995 * unaware shrinkers will receive a node id of 0 instead. 996 * 997 * @memcg specifies the memory cgroup to target. Unaware shrinkers 998 * are called only if it is the root cgroup. 999 * 1000 * @priority is sc->priority, we take the number of objects and >> by priority 1001 * in order to get the scan target. 1002 * 1003 * Returns the number of reclaimed slab objects. 
1004 */ 1005 static unsigned long shrink_slab(gfp_t gfp_mask, int nid, 1006 struct mem_cgroup *memcg, 1007 int priority) 1008 { 1009 unsigned long ret, freed = 0; 1010 struct shrinker *shrinker; 1011 int srcu_idx, generation; 1012 1013 /* 1014 * The root memcg might be allocated even though memcg is disabled 1015 * via "cgroup_disable=memory" boot parameter. This could make 1016 * mem_cgroup_is_root() return false, then just run memcg slab 1017 * shrink, but skip global shrink. This may result in premature 1018 * oom. 1019 */ 1020 if (!mem_cgroup_disabled() && !mem_cgroup_is_root(memcg)) 1021 return shrink_slab_memcg(gfp_mask, nid, memcg, priority); 1022 1023 srcu_idx = srcu_read_lock(&shrinker_srcu); 1024 1025 generation = atomic_read(&shrinker_srcu_generation); 1026 list_for_each_entry_srcu(shrinker, &shrinker_list, list, 1027 srcu_read_lock_held(&shrinker_srcu)) { 1028 struct shrink_control sc = { 1029 .gfp_mask = gfp_mask, 1030 .nid = nid, 1031 .memcg = memcg, 1032 }; 1033 1034 ret = do_shrink_slab(&sc, shrinker, priority); 1035 if (ret == SHRINK_EMPTY) 1036 ret = 0; 1037 freed += ret; 1038 1039 if (atomic_read(&shrinker_srcu_generation) != generation) { 1040 freed = freed ? : 1; 1041 break; 1042 } 1043 } 1044 1045 srcu_read_unlock(&shrinker_srcu, srcu_idx); 1046 cond_resched(); 1047 return freed; 1048 } 1049 1050 static unsigned long drop_slab_node(int nid) 1051 { 1052 unsigned long freed = 0; 1053 struct mem_cgroup *memcg = NULL; 1054 1055 memcg = mem_cgroup_iter(NULL, NULL, NULL); 1056 do { 1057 freed += shrink_slab(GFP_KERNEL, nid, memcg, 0); 1058 } while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)) != NULL); 1059 1060 return freed; 1061 } 1062 1063 void drop_slab(void) 1064 { 1065 int nid; 1066 int shift = 0; 1067 unsigned long freed; 1068 1069 do { 1070 freed = 0; 1071 for_each_online_node(nid) { 1072 if (fatal_signal_pending(current)) 1073 return; 1074 1075 freed += drop_slab_node(nid); 1076 } 1077 } while ((freed >> shift++) > 1); 1078 } 1079 1080 static int reclaimer_offset(void) 1081 { 1082 BUILD_BUG_ON(PGSTEAL_DIRECT - PGSTEAL_KSWAPD != 1083 PGDEMOTE_DIRECT - PGDEMOTE_KSWAPD); 1084 BUILD_BUG_ON(PGSTEAL_DIRECT - PGSTEAL_KSWAPD != 1085 PGSCAN_DIRECT - PGSCAN_KSWAPD); 1086 BUILD_BUG_ON(PGSTEAL_KHUGEPAGED - PGSTEAL_KSWAPD != 1087 PGDEMOTE_KHUGEPAGED - PGDEMOTE_KSWAPD); 1088 BUILD_BUG_ON(PGSTEAL_KHUGEPAGED - PGSTEAL_KSWAPD != 1089 PGSCAN_KHUGEPAGED - PGSCAN_KSWAPD); 1090 1091 if (current_is_kswapd()) 1092 return 0; 1093 if (current_is_khugepaged()) 1094 return PGSTEAL_KHUGEPAGED - PGSTEAL_KSWAPD; 1095 return PGSTEAL_DIRECT - PGSTEAL_KSWAPD; 1096 } 1097 1098 static inline int is_page_cache_freeable(struct folio *folio) 1099 { 1100 /* 1101 * A freeable page cache folio is referenced only by the caller 1102 * that isolated the folio, the page cache and optional filesystem 1103 * private data at folio->private. 1104 */ 1105 return folio_ref_count(folio) - folio_test_private(folio) == 1106 1 + folio_nr_pages(folio); 1107 } 1108 1109 /* 1110 * We detected a synchronous write error writing a folio out. Probably 1111 * -ENOSPC. We need to propagate that into the address_space for a subsequent 1112 * fsync(), msync() or close(). 1113 * 1114 * The tricky part is that after writepage we cannot touch the mapping: nothing 1115 * prevents it from being freed up. But we have a ref on the folio and once 1116 * that folio is locked, the mapping is pinned. 1117 * 1118 * We're allowed to run sleeping folio_lock() here because we know the caller has 1119 * __GFP_FS. 
1120 */ 1121 static void handle_write_error(struct address_space *mapping, 1122 struct folio *folio, int error) 1123 { 1124 folio_lock(folio); 1125 if (folio_mapping(folio) == mapping) 1126 mapping_set_error(mapping, error); 1127 folio_unlock(folio); 1128 } 1129 1130 static bool skip_throttle_noprogress(pg_data_t *pgdat) 1131 { 1132 int reclaimable = 0, write_pending = 0; 1133 int i; 1134 1135 /* 1136 * If kswapd is disabled, reschedule if necessary but do not 1137 * throttle as the system is likely near OOM. 1138 */ 1139 if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES) 1140 return true; 1141 1142 /* 1143 * If there are a lot of dirty/writeback folios then do not 1144 * throttle as throttling will occur when the folios cycle 1145 * towards the end of the LRU if still under writeback. 1146 */ 1147 for (i = 0; i < MAX_NR_ZONES; i++) { 1148 struct zone *zone = pgdat->node_zones + i; 1149 1150 if (!managed_zone(zone)) 1151 continue; 1152 1153 reclaimable += zone_reclaimable_pages(zone); 1154 write_pending += zone_page_state_snapshot(zone, 1155 NR_ZONE_WRITE_PENDING); 1156 } 1157 if (2 * write_pending <= reclaimable) 1158 return true; 1159 1160 return false; 1161 } 1162 1163 void reclaim_throttle(pg_data_t *pgdat, enum vmscan_throttle_state reason) 1164 { 1165 wait_queue_head_t *wqh = &pgdat->reclaim_wait[reason]; 1166 long timeout, ret; 1167 DEFINE_WAIT(wait); 1168 1169 /* 1170 * Do not throttle IO workers, kthreads other than kswapd or 1171 * workqueues. They may be required for reclaim to make 1172 * forward progress (e.g. journalling workqueues or kthreads). 1173 */ 1174 if (!current_is_kswapd() && 1175 current->flags & (PF_IO_WORKER|PF_KTHREAD)) { 1176 cond_resched(); 1177 return; 1178 } 1179 1180 /* 1181 * These figures are pulled out of thin air. 1182 * VMSCAN_THROTTLE_ISOLATED is a transient condition based on too many 1183 * parallel reclaimers which is a short-lived event so the timeout is 1184 * short. Failing to make progress or waiting on writeback are 1185 * potentially long-lived events so use a longer timeout. This is shaky 1186 * logic as a failure to make progress could be due to anything from 1187 * writeback to a slow device to excessive referenced folios at the tail 1188 * of the inactive LRU. 1189 */ 1190 switch(reason) { 1191 case VMSCAN_THROTTLE_WRITEBACK: 1192 timeout = HZ/10; 1193 1194 if (atomic_inc_return(&pgdat->nr_writeback_throttled) == 1) { 1195 WRITE_ONCE(pgdat->nr_reclaim_start, 1196 node_page_state(pgdat, NR_THROTTLED_WRITTEN)); 1197 } 1198 1199 break; 1200 case VMSCAN_THROTTLE_CONGESTED: 1201 fallthrough; 1202 case VMSCAN_THROTTLE_NOPROGRESS: 1203 if (skip_throttle_noprogress(pgdat)) { 1204 cond_resched(); 1205 return; 1206 } 1207 1208 timeout = 1; 1209 1210 break; 1211 case VMSCAN_THROTTLE_ISOLATED: 1212 timeout = HZ/50; 1213 break; 1214 default: 1215 WARN_ON_ONCE(1); 1216 timeout = HZ; 1217 break; 1218 } 1219 1220 prepare_to_wait(wqh, &wait, TASK_UNINTERRUPTIBLE); 1221 ret = schedule_timeout(timeout); 1222 finish_wait(wqh, &wait); 1223 1224 if (reason == VMSCAN_THROTTLE_WRITEBACK) 1225 atomic_dec(&pgdat->nr_writeback_throttled); 1226 1227 trace_mm_vmscan_throttled(pgdat->node_id, jiffies_to_usecs(timeout), 1228 jiffies_to_usecs(timeout - ret), 1229 reason); 1230 } 1231 1232 /* 1233 * Account for folios written if tasks are throttled waiting on dirty 1234 * folios to clean. If enough folios have been cleaned since throttling 1235 * started then wakeup the throttled tasks. 
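 *
 * Worked example (hypothetical numbers, assuming the usual SWAP_CLUSTER_MAX
 * of 32): with nr_throttled == 4 waiters, the check below wakes them up once
 * more than 32 * 4 = 128 folios have completed writeback since
 * pgdat->nr_reclaim_start was recorded by reclaim_throttle().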
1236 */ 1237 void __acct_reclaim_writeback(pg_data_t *pgdat, struct folio *folio, 1238 int nr_throttled) 1239 { 1240 unsigned long nr_written; 1241 1242 node_stat_add_folio(folio, NR_THROTTLED_WRITTEN); 1243 1244 /* 1245 * This is an inaccurate read as the per-cpu deltas may not 1246 * be synchronised. However, given that the system is 1247 * writeback throttled, it is not worth taking the penalty 1248 * of getting an accurate count. At worst, the throttle 1249 * timeout guarantees forward progress. 1250 */ 1251 nr_written = node_page_state(pgdat, NR_THROTTLED_WRITTEN) - 1252 READ_ONCE(pgdat->nr_reclaim_start); 1253 1254 if (nr_written > SWAP_CLUSTER_MAX * nr_throttled) 1255 wake_up(&pgdat->reclaim_wait[VMSCAN_THROTTLE_WRITEBACK]); 1256 } 1257 1258 /* possible outcome of pageout() */ 1259 typedef enum { 1260 /* failed to write folio out, folio is locked */ 1261 PAGE_KEEP, 1262 /* move folio to the active list, folio is locked */ 1263 PAGE_ACTIVATE, 1264 /* folio has been sent to the disk successfully, folio is unlocked */ 1265 PAGE_SUCCESS, 1266 /* folio is clean and locked */ 1267 PAGE_CLEAN, 1268 } pageout_t; 1269 1270 /* 1271 * pageout is called by shrink_folio_list() for each dirty folio. 1272 * Calls ->writepage(). 1273 */ 1274 static pageout_t pageout(struct folio *folio, struct address_space *mapping, 1275 struct swap_iocb **plug) 1276 { 1277 /* 1278 * If the folio is dirty, only perform writeback if that write 1279 * will be non-blocking. To prevent this allocation from being 1280 * stalled by pagecache activity. But note that there may be 1281 * stalls if we need to run get_block(). We could test 1282 * PagePrivate for that. 1283 * 1284 * If this process is currently in __generic_file_write_iter() against 1285 * this folio's queue, we can perform writeback even if that 1286 * will block. 1287 * 1288 * If the folio is swapcache, write it back even if that would 1289 * block, for some throttling. This happens by accident, because 1290 * swap_backing_dev_info is bust: it doesn't reflect the 1291 * congestion state of the swapdevs. Easy to fix, if needed. 1292 */ 1293 if (!is_page_cache_freeable(folio)) 1294 return PAGE_KEEP; 1295 if (!mapping) { 1296 /* 1297 * Some data journaling orphaned folios can have 1298 * folio->mapping == NULL while being dirty with clean buffers. 1299 */ 1300 if (folio_test_private(folio)) { 1301 if (try_to_free_buffers(folio)) { 1302 folio_clear_dirty(folio); 1303 pr_info("%s: orphaned folio\n", __func__); 1304 return PAGE_CLEAN; 1305 } 1306 } 1307 return PAGE_KEEP; 1308 } 1309 if (mapping->a_ops->writepage == NULL) 1310 return PAGE_ACTIVATE; 1311 1312 if (folio_clear_dirty_for_io(folio)) { 1313 int res; 1314 struct writeback_control wbc = { 1315 .sync_mode = WB_SYNC_NONE, 1316 .nr_to_write = SWAP_CLUSTER_MAX, 1317 .range_start = 0, 1318 .range_end = LLONG_MAX, 1319 .for_reclaim = 1, 1320 .swap_plug = plug, 1321 }; 1322 1323 folio_set_reclaim(folio); 1324 res = mapping->a_ops->writepage(&folio->page, &wbc); 1325 if (res < 0) 1326 handle_write_error(mapping, folio, res); 1327 if (res == AOP_WRITEPAGE_ACTIVATE) { 1328 folio_clear_reclaim(folio); 1329 return PAGE_ACTIVATE; 1330 } 1331 1332 if (!folio_test_writeback(folio)) { 1333 /* synchronous write or broken a_ops? 
*/ 1334 folio_clear_reclaim(folio); 1335 } 1336 trace_mm_vmscan_write_folio(folio); 1337 node_stat_add_folio(folio, NR_VMSCAN_WRITE); 1338 return PAGE_SUCCESS; 1339 } 1340 1341 return PAGE_CLEAN; 1342 } 1343 1344 /* 1345 * Same as remove_mapping, but if the folio is removed from the mapping, it 1346 * gets returned with a refcount of 0. 1347 */ 1348 static int __remove_mapping(struct address_space *mapping, struct folio *folio, 1349 bool reclaimed, struct mem_cgroup *target_memcg) 1350 { 1351 int refcount; 1352 void *shadow = NULL; 1353 1354 BUG_ON(!folio_test_locked(folio)); 1355 BUG_ON(mapping != folio_mapping(folio)); 1356 1357 if (!folio_test_swapcache(folio)) 1358 spin_lock(&mapping->host->i_lock); 1359 xa_lock_irq(&mapping->i_pages); 1360 /* 1361 * The non racy check for a busy folio. 1362 * 1363 * Must be careful with the order of the tests. When someone has 1364 * a ref to the folio, it may be possible that they dirty it then 1365 * drop the reference. So if the dirty flag is tested before the 1366 * refcount here, then the following race may occur: 1367 * 1368 * get_user_pages(&page); 1369 * [user mapping goes away] 1370 * write_to(page); 1371 * !folio_test_dirty(folio) [good] 1372 * folio_set_dirty(folio); 1373 * folio_put(folio); 1374 * !refcount(folio) [good, discard it] 1375 * 1376 * [oops, our write_to data is lost] 1377 * 1378 * Reversing the order of the tests ensures such a situation cannot 1379 * escape unnoticed. The smp_rmb is needed to ensure the folio->flags 1380 * load is not satisfied before that of folio->_refcount. 1381 * 1382 * Note that if the dirty flag is always set via folio_mark_dirty, 1383 * and thus under the i_pages lock, then this ordering is not required. 1384 */ 1385 refcount = 1 + folio_nr_pages(folio); 1386 if (!folio_ref_freeze(folio, refcount)) 1387 goto cannot_free; 1388 /* note: atomic_cmpxchg in folio_ref_freeze provides the smp_rmb */ 1389 if (unlikely(folio_test_dirty(folio))) { 1390 folio_ref_unfreeze(folio, refcount); 1391 goto cannot_free; 1392 } 1393 1394 if (folio_test_swapcache(folio)) { 1395 swp_entry_t swap = folio_swap_entry(folio); 1396 1397 if (reclaimed && !mapping_exiting(mapping)) 1398 shadow = workingset_eviction(folio, target_memcg); 1399 __delete_from_swap_cache(folio, swap, shadow); 1400 mem_cgroup_swapout(folio, swap); 1401 xa_unlock_irq(&mapping->i_pages); 1402 put_swap_folio(folio, swap); 1403 } else { 1404 void (*free_folio)(struct folio *); 1405 1406 free_folio = mapping->a_ops->free_folio; 1407 /* 1408 * Remember a shadow entry for reclaimed file cache in 1409 * order to detect refaults, thus thrashing, later on. 1410 * 1411 * But don't store shadows in an address space that is 1412 * already exiting. This is not just an optimization, 1413 * inode reclaim needs to empty out the radix tree or 1414 * the nodes are lost. Don't plant shadows behind its 1415 * back. 1416 * 1417 * We also don't store shadows for DAX mappings because the 1418 * only page cache folios found in these are zero pages 1419 * covering holes, and because we don't want to mix DAX 1420 * exceptional entries and shadow exceptional entries in the 1421 * same address_space. 
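		 *
		 * (Roughly, the shadow value returned by
		 * workingset_eviction() encodes an eviction "timestamp"
		 * that workingset_refault() later compares against to
		 * estimate the refault distance.)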
1422 */ 1423 if (reclaimed && folio_is_file_lru(folio) && 1424 !mapping_exiting(mapping) && !dax_mapping(mapping)) 1425 shadow = workingset_eviction(folio, target_memcg); 1426 __filemap_remove_folio(folio, shadow); 1427 xa_unlock_irq(&mapping->i_pages); 1428 if (mapping_shrinkable(mapping)) 1429 inode_add_lru(mapping->host); 1430 spin_unlock(&mapping->host->i_lock); 1431 1432 if (free_folio) 1433 free_folio(folio); 1434 } 1435 1436 return 1; 1437 1438 cannot_free: 1439 xa_unlock_irq(&mapping->i_pages); 1440 if (!folio_test_swapcache(folio)) 1441 spin_unlock(&mapping->host->i_lock); 1442 return 0; 1443 } 1444 1445 /** 1446 * remove_mapping() - Attempt to remove a folio from its mapping. 1447 * @mapping: The address space. 1448 * @folio: The folio to remove. 1449 * 1450 * If the folio is dirty, under writeback or if someone else has a ref 1451 * on it, removal will fail. 1452 * Return: The number of pages removed from the mapping. 0 if the folio 1453 * could not be removed. 1454 * Context: The caller should have a single refcount on the folio and 1455 * hold its lock. 1456 */ 1457 long remove_mapping(struct address_space *mapping, struct folio *folio) 1458 { 1459 if (__remove_mapping(mapping, folio, false, NULL)) { 1460 /* 1461 * Unfreezing the refcount with 1 effectively 1462 * drops the pagecache ref for us without requiring another 1463 * atomic operation. 1464 */ 1465 folio_ref_unfreeze(folio, 1); 1466 return folio_nr_pages(folio); 1467 } 1468 return 0; 1469 } 1470 1471 /** 1472 * folio_putback_lru - Put previously isolated folio onto appropriate LRU list. 1473 * @folio: Folio to be returned to an LRU list. 1474 * 1475 * Add previously isolated @folio to appropriate LRU list. 1476 * The folio may still be unevictable for other reasons. 1477 * 1478 * Context: lru_lock must not be held, interrupts must be enabled. 1479 */ 1480 void folio_putback_lru(struct folio *folio) 1481 { 1482 folio_add_lru(folio); 1483 folio_put(folio); /* drop ref from isolate */ 1484 } 1485 1486 enum folio_references { 1487 FOLIOREF_RECLAIM, 1488 FOLIOREF_RECLAIM_CLEAN, 1489 FOLIOREF_KEEP, 1490 FOLIOREF_ACTIVATE, 1491 }; 1492 1493 static enum folio_references folio_check_references(struct folio *folio, 1494 struct scan_control *sc) 1495 { 1496 int referenced_ptes, referenced_folio; 1497 unsigned long vm_flags; 1498 1499 referenced_ptes = folio_referenced(folio, 1, sc->target_mem_cgroup, 1500 &vm_flags); 1501 referenced_folio = folio_test_clear_referenced(folio); 1502 1503 /* 1504 * The supposedly reclaimable folio was found to be in a VM_LOCKED vma. 1505 * Let the folio, now marked Mlocked, be moved to the unevictable list. 1506 */ 1507 if (vm_flags & VM_LOCKED) 1508 return FOLIOREF_ACTIVATE; 1509 1510 /* rmap lock contention: rotate */ 1511 if (referenced_ptes == -1) 1512 return FOLIOREF_KEEP; 1513 1514 if (referenced_ptes) { 1515 /* 1516 * All mapped folios start out with page table 1517 * references from the instantiating fault, so we need 1518 * to look twice if a mapped file/anon folio is used more 1519 * than once. 1520 * 1521 * Mark it and spare it for another trip around the 1522 * inactive list. Another page table reference will 1523 * lead to its activation. 1524 * 1525 * Note: the mark is set for activated folios as well 1526 * so that recently deactivated but used folios are 1527 * quickly recovered. 1528 */ 1529 folio_set_referenced(folio); 1530 1531 if (referenced_folio || referenced_ptes > 1) 1532 return FOLIOREF_ACTIVATE; 1533 1534 /* 1535 * Activate file-backed executable folios after first usage. 
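		 * (The usual rationale: this keeps the text of running
		 * binaries and shared libraries resident, since refaulting
		 * executable pages directly stalls the workload.)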
1536 */ 1537 if ((vm_flags & VM_EXEC) && folio_is_file_lru(folio)) 1538 return FOLIOREF_ACTIVATE; 1539 1540 return FOLIOREF_KEEP; 1541 } 1542 1543 /* Reclaim if clean, defer dirty folios to writeback */ 1544 if (referenced_folio && folio_is_file_lru(folio)) 1545 return FOLIOREF_RECLAIM_CLEAN; 1546 1547 return FOLIOREF_RECLAIM; 1548 } 1549 1550 /* Check if a folio is dirty or under writeback */ 1551 static void folio_check_dirty_writeback(struct folio *folio, 1552 bool *dirty, bool *writeback) 1553 { 1554 struct address_space *mapping; 1555 1556 /* 1557 * Anonymous folios are not handled by flushers and must be written 1558 * from reclaim context. Do not stall reclaim based on them. 1559 * MADV_FREE anonymous folios are put into inactive file list too. 1560 * They could be mistakenly treated as file lru. So further anon 1561 * test is needed. 1562 */ 1563 if (!folio_is_file_lru(folio) || 1564 (folio_test_anon(folio) && !folio_test_swapbacked(folio))) { 1565 *dirty = false; 1566 *writeback = false; 1567 return; 1568 } 1569 1570 /* By default assume that the folio flags are accurate */ 1571 *dirty = folio_test_dirty(folio); 1572 *writeback = folio_test_writeback(folio); 1573 1574 /* Verify dirty/writeback state if the filesystem supports it */ 1575 if (!folio_test_private(folio)) 1576 return; 1577 1578 mapping = folio_mapping(folio); 1579 if (mapping && mapping->a_ops->is_dirty_writeback) 1580 mapping->a_ops->is_dirty_writeback(folio, dirty, writeback); 1581 } 1582 1583 static struct page *alloc_demote_page(struct page *page, unsigned long private) 1584 { 1585 struct page *target_page; 1586 nodemask_t *allowed_mask; 1587 struct migration_target_control *mtc; 1588 1589 mtc = (struct migration_target_control *)private; 1590 1591 allowed_mask = mtc->nmask; 1592 /* 1593 * make sure we allocate from the target node first also trying to 1594 * demote or reclaim pages from the target node via kswapd if we are 1595 * low on free memory on target node. If we don't do this and if 1596 * we have free memory on the slower(lower) memtier, we would start 1597 * allocating pages from slower(lower) memory tiers without even forcing 1598 * a demotion of cold pages from the target memtier. This can result 1599 * in the kernel placing hot pages in slower(lower) memory tiers. 1600 */ 1601 mtc->nmask = NULL; 1602 mtc->gfp_mask |= __GFP_THISNODE; 1603 target_page = alloc_migration_target(page, (unsigned long)mtc); 1604 if (target_page) 1605 return target_page; 1606 1607 mtc->gfp_mask &= ~__GFP_THISNODE; 1608 mtc->nmask = allowed_mask; 1609 1610 return alloc_migration_target(page, (unsigned long)mtc); 1611 } 1612 1613 /* 1614 * Take folios on @demote_folios and attempt to demote them to another node. 1615 * Folios which are not demoted are left on @demote_folios. 1616 */ 1617 static unsigned int demote_folio_list(struct list_head *demote_folios, 1618 struct pglist_data *pgdat) 1619 { 1620 int target_nid = next_demotion_node(pgdat->node_id); 1621 unsigned int nr_succeeded; 1622 nodemask_t allowed_mask; 1623 1624 struct migration_target_control mtc = { 1625 /* 1626 * Allocate from 'node', or fail quickly and quietly. 1627 * When this happens, 'page' will likely just be discarded 1628 * instead of migrated. 
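		 *
		 * The mask below strips __GFP_RECLAIM and adds GFP_NOWAIT |
		 * __GFP_NOMEMALLOC, so the demotion allocation neither enters
		 * direct reclaim nor dips into reserves; alloc_demote_page()
		 * above additionally tries the target node alone first by
		 * adding __GFP_THISNODE before retrying with the allowed mask.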
1629 */ 1630 .gfp_mask = (GFP_HIGHUSER_MOVABLE & ~__GFP_RECLAIM) | __GFP_NOWARN | 1631 __GFP_NOMEMALLOC | GFP_NOWAIT, 1632 .nid = target_nid, 1633 .nmask = &allowed_mask 1634 }; 1635 1636 if (list_empty(demote_folios)) 1637 return 0; 1638 1639 if (target_nid == NUMA_NO_NODE) 1640 return 0; 1641 1642 node_get_allowed_targets(pgdat, &allowed_mask); 1643 1644 /* Demotion ignores all cpuset and mempolicy settings */ 1645 migrate_pages(demote_folios, alloc_demote_page, NULL, 1646 (unsigned long)&mtc, MIGRATE_ASYNC, MR_DEMOTION, 1647 &nr_succeeded); 1648 1649 __count_vm_events(PGDEMOTE_KSWAPD + reclaimer_offset(), nr_succeeded); 1650 1651 return nr_succeeded; 1652 } 1653 1654 static bool may_enter_fs(struct folio *folio, gfp_t gfp_mask) 1655 { 1656 if (gfp_mask & __GFP_FS) 1657 return true; 1658 if (!folio_test_swapcache(folio) || !(gfp_mask & __GFP_IO)) 1659 return false; 1660 /* 1661 * We can "enter_fs" for swap-cache with only __GFP_IO 1662 * providing this isn't SWP_FS_OPS. 1663 * ->flags can be updated non-atomicially (scan_swap_map_slots), 1664 * but that will never affect SWP_FS_OPS, so the data_race 1665 * is safe. 1666 */ 1667 return !data_race(folio_swap_flags(folio) & SWP_FS_OPS); 1668 } 1669 1670 /* 1671 * shrink_folio_list() returns the number of reclaimed pages 1672 */ 1673 static unsigned int shrink_folio_list(struct list_head *folio_list, 1674 struct pglist_data *pgdat, struct scan_control *sc, 1675 struct reclaim_stat *stat, bool ignore_references) 1676 { 1677 LIST_HEAD(ret_folios); 1678 LIST_HEAD(free_folios); 1679 LIST_HEAD(demote_folios); 1680 unsigned int nr_reclaimed = 0; 1681 unsigned int pgactivate = 0; 1682 bool do_demote_pass; 1683 struct swap_iocb *plug = NULL; 1684 1685 memset(stat, 0, sizeof(*stat)); 1686 cond_resched(); 1687 do_demote_pass = can_demote(pgdat->node_id, sc); 1688 1689 retry: 1690 while (!list_empty(folio_list)) { 1691 struct address_space *mapping; 1692 struct folio *folio; 1693 enum folio_references references = FOLIOREF_RECLAIM; 1694 bool dirty, writeback; 1695 unsigned int nr_pages; 1696 1697 cond_resched(); 1698 1699 folio = lru_to_folio(folio_list); 1700 list_del(&folio->lru); 1701 1702 if (!folio_trylock(folio)) 1703 goto keep; 1704 1705 VM_BUG_ON_FOLIO(folio_test_active(folio), folio); 1706 1707 nr_pages = folio_nr_pages(folio); 1708 1709 /* Account the number of base pages */ 1710 sc->nr_scanned += nr_pages; 1711 1712 if (unlikely(!folio_evictable(folio))) 1713 goto activate_locked; 1714 1715 if (!sc->may_unmap && folio_mapped(folio)) 1716 goto keep_locked; 1717 1718 /* folio_update_gen() tried to promote this page? */ 1719 if (lru_gen_enabled() && !ignore_references && 1720 folio_mapped(folio) && folio_test_referenced(folio)) 1721 goto keep_locked; 1722 1723 /* 1724 * The number of dirty pages determines if a node is marked 1725 * reclaim_congested. kswapd will stall and start writing 1726 * folios if the tail of the LRU is all dirty unqueued folios. 1727 */ 1728 folio_check_dirty_writeback(folio, &dirty, &writeback); 1729 if (dirty || writeback) 1730 stat->nr_dirty += nr_pages; 1731 1732 if (dirty && !writeback) 1733 stat->nr_unqueued_dirty += nr_pages; 1734 1735 /* 1736 * Treat this folio as congested if folios are cycling 1737 * through the LRU so quickly that the folios marked 1738 * for immediate reclaim are making it to the end of 1739 * the LRU a second time. 
		 */
		if (writeback && folio_test_reclaim(folio))
			stat->nr_congested += nr_pages;

		/*
		 * If a folio at the tail of the LRU is under writeback, there
		 * are three cases to consider.
		 *
		 * 1) If reclaim is encountering an excessive number
		 *    of folios under writeback and this folio has both
		 *    the writeback and reclaim flags set, then it
		 *    indicates that folios are being queued for I/O but
		 *    are being recycled through the LRU before the I/O
		 *    can complete. Waiting on the folio itself risks an
		 *    indefinite stall if it is impossible to writeback
		 *    the folio due to I/O error or disconnected storage
		 *    so instead note that the LRU is being scanned too
		 *    quickly and the caller can stall after the folio
		 *    list has been processed.
		 *
		 * 2) Global or new memcg reclaim encounters a folio that is
		 *    not marked for immediate reclaim, or the caller does not
		 *    have __GFP_FS (or __GFP_IO if it's simply going to swap,
		 *    not to fs). In this case mark the folio for immediate
		 *    reclaim and continue scanning.
		 *
		 *    Require may_enter_fs() because we would wait on fs, which
		 *    may not have submitted I/O yet. And the loop driver might
		 *    enter reclaim, and deadlock if it waits on a folio for
		 *    which it is needed to do the write (loop masks off
		 *    __GFP_IO|__GFP_FS for this reason); but more thought
		 *    would probably show more reasons.
		 *
		 * 3) Legacy memcg encounters a folio that already has the
		 *    reclaim flag set. memcg does not have any dirty folio
		 *    throttling so we could easily OOM just because too many
		 *    folios are in writeback and there is nothing else to
		 *    reclaim. Wait for the writeback to complete.
		 *
		 * In cases 1) and 2) we activate the folios to get them out of
		 * the way while we continue scanning for clean folios on the
		 * inactive list and refilling from the active list. The
		 * observation here is that waiting for disk writes is more
		 * expensive than potentially causing reloads down the line.
		 * Since they're marked for immediate reclaim, they won't put
		 * memory pressure on the cache working set any longer than it
		 * takes to write them to disk.
		 */
		if (folio_test_writeback(folio)) {
			/* Case 1 above */
			if (current_is_kswapd() &&
			    folio_test_reclaim(folio) &&
			    test_bit(PGDAT_WRITEBACK, &pgdat->flags)) {
				stat->nr_immediate += nr_pages;
				goto activate_locked;

			/* Case 2 above */
			} else if (writeback_throttling_sane(sc) ||
			    !folio_test_reclaim(folio) ||
			    !may_enter_fs(folio, sc->gfp_mask)) {
				/*
				 * This is slightly racy -
				 * folio_end_writeback() might have
				 * just cleared the reclaim flag, then
				 * setting the reclaim flag here ends up
				 * interpreted as the readahead flag - but
				 * that does not matter enough to care.
				 * What we do want is for this folio to
				 * have the reclaim flag set next time
				 * memcg reclaim reaches the tests above,
				 * so it will then wait for writeback to
				 * avoid OOM; and it's also appropriate
				 * in global reclaim.
1813 */ 1814 folio_set_reclaim(folio); 1815 stat->nr_writeback += nr_pages; 1816 goto activate_locked; 1817 1818 /* Case 3 above */ 1819 } else { 1820 folio_unlock(folio); 1821 folio_wait_writeback(folio); 1822 /* then go back and try same folio again */ 1823 list_add_tail(&folio->lru, folio_list); 1824 continue; 1825 } 1826 } 1827 1828 if (!ignore_references) 1829 references = folio_check_references(folio, sc); 1830 1831 switch (references) { 1832 case FOLIOREF_ACTIVATE: 1833 goto activate_locked; 1834 case FOLIOREF_KEEP: 1835 stat->nr_ref_keep += nr_pages; 1836 goto keep_locked; 1837 case FOLIOREF_RECLAIM: 1838 case FOLIOREF_RECLAIM_CLEAN: 1839 ; /* try to reclaim the folio below */ 1840 } 1841 1842 /* 1843 * Before reclaiming the folio, try to relocate 1844 * its contents to another node. 1845 */ 1846 if (do_demote_pass && 1847 (thp_migration_supported() || !folio_test_large(folio))) { 1848 list_add(&folio->lru, &demote_folios); 1849 folio_unlock(folio); 1850 continue; 1851 } 1852 1853 /* 1854 * Anonymous process memory has backing store? 1855 * Try to allocate it some swap space here. 1856 * Lazyfree folio could be freed directly 1857 */ 1858 if (folio_test_anon(folio) && folio_test_swapbacked(folio)) { 1859 if (!folio_test_swapcache(folio)) { 1860 if (!(sc->gfp_mask & __GFP_IO)) 1861 goto keep_locked; 1862 if (folio_maybe_dma_pinned(folio)) 1863 goto keep_locked; 1864 if (folio_test_large(folio)) { 1865 /* cannot split folio, skip it */ 1866 if (!can_split_folio(folio, NULL)) 1867 goto activate_locked; 1868 /* 1869 * Split folios without a PMD map right 1870 * away. Chances are some or all of the 1871 * tail pages can be freed without IO. 1872 */ 1873 if (!folio_entire_mapcount(folio) && 1874 split_folio_to_list(folio, 1875 folio_list)) 1876 goto activate_locked; 1877 } 1878 if (!add_to_swap(folio)) { 1879 if (!folio_test_large(folio)) 1880 goto activate_locked_split; 1881 /* Fallback to swap normal pages */ 1882 if (split_folio_to_list(folio, 1883 folio_list)) 1884 goto activate_locked; 1885 #ifdef CONFIG_TRANSPARENT_HUGEPAGE 1886 count_vm_event(THP_SWPOUT_FALLBACK); 1887 #endif 1888 if (!add_to_swap(folio)) 1889 goto activate_locked_split; 1890 } 1891 } 1892 } else if (folio_test_swapbacked(folio) && 1893 folio_test_large(folio)) { 1894 /* Split shmem folio */ 1895 if (split_folio_to_list(folio, folio_list)) 1896 goto keep_locked; 1897 } 1898 1899 /* 1900 * If the folio was split above, the tail pages will make 1901 * their own pass through this function and be accounted 1902 * then. 1903 */ 1904 if ((nr_pages > 1) && !folio_test_large(folio)) { 1905 sc->nr_scanned -= (nr_pages - 1); 1906 nr_pages = 1; 1907 } 1908 1909 /* 1910 * The folio is mapped into the page tables of one or more 1911 * processes. Try to unmap it here. 1912 */ 1913 if (folio_mapped(folio)) { 1914 enum ttu_flags flags = TTU_BATCH_FLUSH; 1915 bool was_swapbacked = folio_test_swapbacked(folio); 1916 1917 if (folio_test_pmd_mappable(folio)) 1918 flags |= TTU_SPLIT_HUGE_PMD; 1919 1920 try_to_unmap(folio, flags); 1921 if (folio_mapped(folio)) { 1922 stat->nr_unmap_fail += nr_pages; 1923 if (!was_swapbacked && 1924 folio_test_swapbacked(folio)) 1925 stat->nr_lazyfree_fail += nr_pages; 1926 goto activate_locked; 1927 } 1928 } 1929 1930 mapping = folio_mapping(folio); 1931 if (folio_test_dirty(folio)) { 1932 /* 1933 * Only kswapd can writeback filesystem folios 1934 * to avoid risk of stack overflow. 
But avoid 1935 * injecting inefficient single-folio I/O into 1936 * flusher writeback as much as possible: only 1937 * write folios when we've encountered many 1938 * dirty folios, and when we've already scanned 1939 * the rest of the LRU for clean folios and see 1940 * the same dirty folios again (with the reclaim 1941 * flag set). 1942 */ 1943 if (folio_is_file_lru(folio) && 1944 (!current_is_kswapd() || 1945 !folio_test_reclaim(folio) || 1946 !test_bit(PGDAT_DIRTY, &pgdat->flags))) { 1947 /* 1948 * Immediately reclaim when written back. 1949 * Similar in principle to folio_deactivate() 1950 * except we already have the folio isolated 1951 * and know it's dirty 1952 */ 1953 node_stat_mod_folio(folio, NR_VMSCAN_IMMEDIATE, 1954 nr_pages); 1955 folio_set_reclaim(folio); 1956 1957 goto activate_locked; 1958 } 1959 1960 if (references == FOLIOREF_RECLAIM_CLEAN) 1961 goto keep_locked; 1962 if (!may_enter_fs(folio, sc->gfp_mask)) 1963 goto keep_locked; 1964 if (!sc->may_writepage) 1965 goto keep_locked; 1966 1967 /* 1968 * Folio is dirty. Flush the TLB if a writable entry 1969 * potentially exists to avoid CPU writes after I/O 1970 * starts and then write it out here. 1971 */ 1972 try_to_unmap_flush_dirty(); 1973 switch (pageout(folio, mapping, &plug)) { 1974 case PAGE_KEEP: 1975 goto keep_locked; 1976 case PAGE_ACTIVATE: 1977 goto activate_locked; 1978 case PAGE_SUCCESS: 1979 stat->nr_pageout += nr_pages; 1980 1981 if (folio_test_writeback(folio)) 1982 goto keep; 1983 if (folio_test_dirty(folio)) 1984 goto keep; 1985 1986 /* 1987 * A synchronous write - probably a ramdisk. Go 1988 * ahead and try to reclaim the folio. 1989 */ 1990 if (!folio_trylock(folio)) 1991 goto keep; 1992 if (folio_test_dirty(folio) || 1993 folio_test_writeback(folio)) 1994 goto keep_locked; 1995 mapping = folio_mapping(folio); 1996 fallthrough; 1997 case PAGE_CLEAN: 1998 ; /* try to free the folio below */ 1999 } 2000 } 2001 2002 /* 2003 * If the folio has buffers, try to free the buffer 2004 * mappings associated with this folio. If we succeed 2005 * we try to free the folio as well. 2006 * 2007 * We do this even if the folio is dirty. 2008 * filemap_release_folio() does not perform I/O, but it 2009 * is possible for a folio to have the dirty flag set, 2010 * but it is actually clean (all its buffers are clean). 2011 * This happens if the buffers were written out directly, 2012 * with submit_bh(). ext3 will do this, as well as 2013 * the blockdev mapping. filemap_release_folio() will 2014 * discover that cleanness and will drop the buffers 2015 * and mark the folio clean - it can be freed. 2016 * 2017 * Rarely, folios can have buffers and no ->mapping. 2018 * These are the folios which were not successfully 2019 * invalidated in truncate_cleanup_folio(). We try to 2020 * drop those buffers here and if that worked, and the 2021 * folio is no longer mapped into process address space 2022 * (refcount == 1) it can be freed. Otherwise, leave 2023 * the folio on the LRU so it is swappable. 2024 */ 2025 if (folio_has_private(folio)) { 2026 if (!filemap_release_folio(folio, sc->gfp_mask)) 2027 goto activate_locked; 2028 if (!mapping && folio_ref_count(folio) == 1) { 2029 folio_unlock(folio); 2030 if (folio_put_testzero(folio)) 2031 goto free_it; 2032 else { 2033 /* 2034 * rare race with speculative reference. 2035 * the speculative reference will free 2036 * this folio shortly, so we may 2037 * increment nr_reclaimed here (and 2038 * leave it off the LRU). 
2039 */ 2040 nr_reclaimed += nr_pages; 2041 continue; 2042 } 2043 } 2044 } 2045 2046 if (folio_test_anon(folio) && !folio_test_swapbacked(folio)) { 2047 /* follow __remove_mapping for reference */ 2048 if (!folio_ref_freeze(folio, 1)) 2049 goto keep_locked; 2050 /* 2051 * The folio has only one reference left, which is 2052 * from the isolation. After the caller puts the 2053 * folio back on the lru and drops the reference, the 2054 * folio will be freed anyway. It doesn't matter 2055 * which lru it goes on. So we don't bother checking 2056 * the dirty flag here. 2057 */ 2058 count_vm_events(PGLAZYFREED, nr_pages); 2059 count_memcg_folio_events(folio, PGLAZYFREED, nr_pages); 2060 } else if (!mapping || !__remove_mapping(mapping, folio, true, 2061 sc->target_mem_cgroup)) 2062 goto keep_locked; 2063 2064 folio_unlock(folio); 2065 free_it: 2066 /* 2067 * Folio may get swapped out as a whole, need to account 2068 * all pages in it. 2069 */ 2070 nr_reclaimed += nr_pages; 2071 2072 /* 2073 * Is there need to periodically free_folio_list? It would 2074 * appear not as the counts should be low 2075 */ 2076 if (unlikely(folio_test_large(folio))) 2077 destroy_large_folio(folio); 2078 else 2079 list_add(&folio->lru, &free_folios); 2080 continue; 2081 2082 activate_locked_split: 2083 /* 2084 * The tail pages that are failed to add into swap cache 2085 * reach here. Fixup nr_scanned and nr_pages. 2086 */ 2087 if (nr_pages > 1) { 2088 sc->nr_scanned -= (nr_pages - 1); 2089 nr_pages = 1; 2090 } 2091 activate_locked: 2092 /* Not a candidate for swapping, so reclaim swap space. */ 2093 if (folio_test_swapcache(folio) && 2094 (mem_cgroup_swap_full(folio) || folio_test_mlocked(folio))) 2095 folio_free_swap(folio); 2096 VM_BUG_ON_FOLIO(folio_test_active(folio), folio); 2097 if (!folio_test_mlocked(folio)) { 2098 int type = folio_is_file_lru(folio); 2099 folio_set_active(folio); 2100 stat->nr_activate[type] += nr_pages; 2101 count_memcg_folio_events(folio, PGACTIVATE, nr_pages); 2102 } 2103 keep_locked: 2104 folio_unlock(folio); 2105 keep: 2106 list_add(&folio->lru, &ret_folios); 2107 VM_BUG_ON_FOLIO(folio_test_lru(folio) || 2108 folio_test_unevictable(folio), folio); 2109 } 2110 /* 'folio_list' is always empty here */ 2111 2112 /* Migrate folios selected for demotion */ 2113 nr_reclaimed += demote_folio_list(&demote_folios, pgdat); 2114 /* Folios that could not be demoted are still in @demote_folios */ 2115 if (!list_empty(&demote_folios)) { 2116 /* Folios which weren't demoted go back on @folio_list */ 2117 list_splice_init(&demote_folios, folio_list); 2118 2119 /* 2120 * goto retry to reclaim the undemoted folios in folio_list if 2121 * desired. 2122 * 2123 * Reclaiming directly from top tier nodes is not often desired 2124 * due to it breaking the LRU ordering: in general memory 2125 * should be reclaimed from lower tier nodes and demoted from 2126 * top tier nodes. 2127 * 2128 * However, disabling reclaim from top tier nodes entirely 2129 * would cause ooms in edge scenarios where lower tier memory 2130 * is unreclaimable for whatever reason, eg memory being 2131 * mlocked or too hot to reclaim. We can disable reclaim 2132 * from top tier nodes in proactive reclaim though as that is 2133 * not real memory pressure. 
2134 */ 2135 if (!sc->proactive) { 2136 do_demote_pass = false; 2137 goto retry; 2138 } 2139 } 2140 2141 pgactivate = stat->nr_activate[0] + stat->nr_activate[1]; 2142 2143 mem_cgroup_uncharge_list(&free_folios); 2144 try_to_unmap_flush(); 2145 free_unref_page_list(&free_folios); 2146 2147 list_splice(&ret_folios, folio_list); 2148 count_vm_events(PGACTIVATE, pgactivate); 2149 2150 if (plug) 2151 swap_write_unplug(plug); 2152 return nr_reclaimed; 2153 } 2154 2155 unsigned int reclaim_clean_pages_from_list(struct zone *zone, 2156 struct list_head *folio_list) 2157 { 2158 struct scan_control sc = { 2159 .gfp_mask = GFP_KERNEL, 2160 .may_unmap = 1, 2161 }; 2162 struct reclaim_stat stat; 2163 unsigned int nr_reclaimed; 2164 struct folio *folio, *next; 2165 LIST_HEAD(clean_folios); 2166 unsigned int noreclaim_flag; 2167 2168 list_for_each_entry_safe(folio, next, folio_list, lru) { 2169 if (!folio_test_hugetlb(folio) && folio_is_file_lru(folio) && 2170 !folio_test_dirty(folio) && !__folio_test_movable(folio) && 2171 !folio_test_unevictable(folio)) { 2172 folio_clear_active(folio); 2173 list_move(&folio->lru, &clean_folios); 2174 } 2175 } 2176 2177 /* 2178 * We should be safe here since we are only dealing with file pages and 2179 * we are not kswapd and therefore cannot write dirty file pages. But 2180 * call memalloc_noreclaim_save() anyway, just in case these conditions 2181 * change in the future. 2182 */ 2183 noreclaim_flag = memalloc_noreclaim_save(); 2184 nr_reclaimed = shrink_folio_list(&clean_folios, zone->zone_pgdat, &sc, 2185 &stat, true); 2186 memalloc_noreclaim_restore(noreclaim_flag); 2187 2188 list_splice(&clean_folios, folio_list); 2189 mod_node_page_state(zone->zone_pgdat, NR_ISOLATED_FILE, 2190 -(long)nr_reclaimed); 2191 /* 2192 * Since lazyfree pages are isolated from file LRU from the beginning, 2193 * they will rotate back to anonymous LRU in the end if it failed to 2194 * discard so isolated count will be mismatched. 2195 * Compensate the isolated count for both LRU lists. 2196 */ 2197 mod_node_page_state(zone->zone_pgdat, NR_ISOLATED_ANON, 2198 stat.nr_lazyfree_fail); 2199 mod_node_page_state(zone->zone_pgdat, NR_ISOLATED_FILE, 2200 -(long)stat.nr_lazyfree_fail); 2201 return nr_reclaimed; 2202 } 2203 2204 /* 2205 * Update LRU sizes after isolating pages. The LRU size updates must 2206 * be complete before mem_cgroup_update_lru_size due to a sanity check. 2207 */ 2208 static __always_inline void update_lru_sizes(struct lruvec *lruvec, 2209 enum lru_list lru, unsigned long *nr_zone_taken) 2210 { 2211 int zid; 2212 2213 for (zid = 0; zid < MAX_NR_ZONES; zid++) { 2214 if (!nr_zone_taken[zid]) 2215 continue; 2216 2217 update_lru_size(lruvec, lru, zid, -nr_zone_taken[zid]); 2218 } 2219 2220 } 2221 2222 /* 2223 * Isolating page from the lruvec to fill in @dst list by nr_to_scan times. 2224 * 2225 * lruvec->lru_lock is heavily contended. Some of the functions that 2226 * shrink the lists perform better by taking out a batch of pages 2227 * and working on them outside the LRU lock. 2228 * 2229 * For pagecache intensive workloads, this function is the hottest 2230 * spot in the kernel (apart from copy_*_user functions). 2231 * 2232 * Lru_lock must be held before calling this function. 2233 * 2234 * @nr_to_scan: The number of eligible pages to look through on the list. 2235 * @lruvec: The LRU vector to pull pages from. 2236 * @dst: The temp list to put pages on to. 2237 * @nr_scanned: The number of pages that were scanned. 
2238 * @sc: The scan_control struct for this reclaim session 2239 * @lru: LRU list id for isolating 2240 * 2241 * returns how many pages were moved onto *@dst. 2242 */ 2243 static unsigned long isolate_lru_folios(unsigned long nr_to_scan, 2244 struct lruvec *lruvec, struct list_head *dst, 2245 unsigned long *nr_scanned, struct scan_control *sc, 2246 enum lru_list lru) 2247 { 2248 struct list_head *src = &lruvec->lists[lru]; 2249 unsigned long nr_taken = 0; 2250 unsigned long nr_zone_taken[MAX_NR_ZONES] = { 0 }; 2251 unsigned long nr_skipped[MAX_NR_ZONES] = { 0, }; 2252 unsigned long skipped = 0; 2253 unsigned long scan, total_scan, nr_pages; 2254 LIST_HEAD(folios_skipped); 2255 2256 total_scan = 0; 2257 scan = 0; 2258 while (scan < nr_to_scan && !list_empty(src)) { 2259 struct list_head *move_to = src; 2260 struct folio *folio; 2261 2262 folio = lru_to_folio(src); 2263 prefetchw_prev_lru_folio(folio, src, flags); 2264 2265 nr_pages = folio_nr_pages(folio); 2266 total_scan += nr_pages; 2267 2268 if (folio_zonenum(folio) > sc->reclaim_idx) { 2269 nr_skipped[folio_zonenum(folio)] += nr_pages; 2270 move_to = &folios_skipped; 2271 goto move; 2272 } 2273 2274 /* 2275 * Do not count skipped folios because that makes the function 2276 * return with no isolated folios if the LRU mostly contains 2277 * ineligible folios. This causes the VM to not reclaim any 2278 * folios, triggering a premature OOM. 2279 * Account all pages in a folio. 2280 */ 2281 scan += nr_pages; 2282 2283 if (!folio_test_lru(folio)) 2284 goto move; 2285 if (!sc->may_unmap && folio_mapped(folio)) 2286 goto move; 2287 2288 /* 2289 * Be careful not to clear the lru flag until after we're 2290 * sure the folio is not being freed elsewhere -- the 2291 * folio release code relies on it. 2292 */ 2293 if (unlikely(!folio_try_get(folio))) 2294 goto move; 2295 2296 if (!folio_test_clear_lru(folio)) { 2297 /* Another thread is already isolating this folio */ 2298 folio_put(folio); 2299 goto move; 2300 } 2301 2302 nr_taken += nr_pages; 2303 nr_zone_taken[folio_zonenum(folio)] += nr_pages; 2304 move_to = dst; 2305 move: 2306 list_move(&folio->lru, move_to); 2307 } 2308 2309 /* 2310 * Splice any skipped folios to the start of the LRU list. Note that 2311 * this disrupts the LRU order when reclaiming for lower zones but 2312 * we cannot splice to the tail. If we did then the SWAP_CLUSTER_MAX 2313 * scanning would soon rescan the same folios to skip and waste lots 2314 * of cpu cycles. 2315 */ 2316 if (!list_empty(&folios_skipped)) { 2317 int zid; 2318 2319 list_splice(&folios_skipped, src); 2320 for (zid = 0; zid < MAX_NR_ZONES; zid++) { 2321 if (!nr_skipped[zid]) 2322 continue; 2323 2324 __count_zid_vm_events(PGSCAN_SKIP, zid, nr_skipped[zid]); 2325 skipped += nr_skipped[zid]; 2326 } 2327 } 2328 *nr_scanned = total_scan; 2329 trace_mm_vmscan_lru_isolate(sc->reclaim_idx, sc->order, nr_to_scan, 2330 total_scan, skipped, nr_taken, 2331 sc->may_unmap ? 0 : ISOLATE_UNMAPPED, lru); 2332 update_lru_sizes(lruvec, lru, nr_zone_taken); 2333 return nr_taken; 2334 } 2335 2336 /** 2337 * folio_isolate_lru() - Try to isolate a folio from its LRU list. 2338 * @folio: Folio to isolate from its LRU list. 2339 * 2340 * Isolate a @folio from an LRU list and adjust the vmstat statistic 2341 * corresponding to whatever LRU list the folio was on. 2342 * 2343 * The folio will have its LRU flag cleared. If it was found on the 2344 * active list, it will have the Active flag set. If it was found on the 2345 * unevictable list, it will have the Unevictable flag set. 
These flags 2346 * may need to be cleared by the caller before letting the page go. 2347 * 2348 * Context: 2349 * 2350 * (1) Must be called with an elevated refcount on the folio. This is a 2351 * fundamental difference from isolate_lru_folios() (which is called 2352 * without a stable reference). 2353 * (2) The lru_lock must not be held. 2354 * (3) Interrupts must be enabled. 2355 * 2356 * Return: true if the folio was removed from an LRU list. 2357 * false if the folio was not on an LRU list. 2358 */ 2359 bool folio_isolate_lru(struct folio *folio) 2360 { 2361 bool ret = false; 2362 2363 VM_BUG_ON_FOLIO(!folio_ref_count(folio), folio); 2364 2365 if (folio_test_clear_lru(folio)) { 2366 struct lruvec *lruvec; 2367 2368 folio_get(folio); 2369 lruvec = folio_lruvec_lock_irq(folio); 2370 lruvec_del_folio(lruvec, folio); 2371 unlock_page_lruvec_irq(lruvec); 2372 ret = true; 2373 } 2374 2375 return ret; 2376 } 2377 2378 /* 2379 * A direct reclaimer may isolate SWAP_CLUSTER_MAX pages from the LRU list and 2380 * then get rescheduled. When there are massive number of tasks doing page 2381 * allocation, such sleeping direct reclaimers may keep piling up on each CPU, 2382 * the LRU list will go small and be scanned faster than necessary, leading to 2383 * unnecessary swapping, thrashing and OOM. 2384 */ 2385 static int too_many_isolated(struct pglist_data *pgdat, int file, 2386 struct scan_control *sc) 2387 { 2388 unsigned long inactive, isolated; 2389 bool too_many; 2390 2391 if (current_is_kswapd()) 2392 return 0; 2393 2394 if (!writeback_throttling_sane(sc)) 2395 return 0; 2396 2397 if (file) { 2398 inactive = node_page_state(pgdat, NR_INACTIVE_FILE); 2399 isolated = node_page_state(pgdat, NR_ISOLATED_FILE); 2400 } else { 2401 inactive = node_page_state(pgdat, NR_INACTIVE_ANON); 2402 isolated = node_page_state(pgdat, NR_ISOLATED_ANON); 2403 } 2404 2405 /* 2406 * GFP_NOIO/GFP_NOFS callers are allowed to isolate more pages, so they 2407 * won't get blocked by normal direct-reclaimers, forming a circular 2408 * deadlock. 2409 */ 2410 if ((sc->gfp_mask & (__GFP_IO | __GFP_FS)) == (__GFP_IO | __GFP_FS)) 2411 inactive >>= 3; 2412 2413 too_many = isolated > inactive; 2414 2415 /* Wake up tasks throttled due to too_many_isolated. */ 2416 if (!too_many) 2417 wake_throttle_isolated(pgdat); 2418 2419 return too_many; 2420 } 2421 2422 /* 2423 * move_folios_to_lru() moves folios from private @list to appropriate LRU list. 2424 * On return, @list is reused as a list of folios to be freed by the caller. 2425 * 2426 * Returns the number of pages moved to the given lruvec. 2427 */ 2428 static unsigned int move_folios_to_lru(struct lruvec *lruvec, 2429 struct list_head *list) 2430 { 2431 int nr_pages, nr_moved = 0; 2432 LIST_HEAD(folios_to_free); 2433 2434 while (!list_empty(list)) { 2435 struct folio *folio = lru_to_folio(list); 2436 2437 VM_BUG_ON_FOLIO(folio_test_lru(folio), folio); 2438 list_del(&folio->lru); 2439 if (unlikely(!folio_evictable(folio))) { 2440 spin_unlock_irq(&lruvec->lru_lock); 2441 folio_putback_lru(folio); 2442 spin_lock_irq(&lruvec->lru_lock); 2443 continue; 2444 } 2445 2446 /* 2447 * The folio_set_lru needs to be kept here for list integrity. 
2448 * Otherwise: 2449 * #0 move_folios_to_lru #1 release_pages 2450 * if (!folio_put_testzero()) 2451 * if (folio_put_testzero()) 2452 * !lru //skip lru_lock 2453 * folio_set_lru() 2454 * list_add(&folio->lru,) 2455 * list_add(&folio->lru,) 2456 */ 2457 folio_set_lru(folio); 2458 2459 if (unlikely(folio_put_testzero(folio))) { 2460 __folio_clear_lru_flags(folio); 2461 2462 if (unlikely(folio_test_large(folio))) { 2463 spin_unlock_irq(&lruvec->lru_lock); 2464 destroy_large_folio(folio); 2465 spin_lock_irq(&lruvec->lru_lock); 2466 } else 2467 list_add(&folio->lru, &folios_to_free); 2468 2469 continue; 2470 } 2471 2472 /* 2473 * All pages were isolated from the same lruvec (and isolation 2474 * inhibits memcg migration). 2475 */ 2476 VM_BUG_ON_FOLIO(!folio_matches_lruvec(folio, lruvec), folio); 2477 lruvec_add_folio(lruvec, folio); 2478 nr_pages = folio_nr_pages(folio); 2479 nr_moved += nr_pages; 2480 if (folio_test_active(folio)) 2481 workingset_age_nonresident(lruvec, nr_pages); 2482 } 2483 2484 /* 2485 * To save our caller's stack, now use input list for pages to free. 2486 */ 2487 list_splice(&folios_to_free, list); 2488 2489 return nr_moved; 2490 } 2491 2492 /* 2493 * If a kernel thread (such as nfsd for loop-back mounts) services a backing 2494 * device by writing to the page cache it sets PF_LOCAL_THROTTLE. In this case 2495 * we should not throttle. Otherwise it is safe to do so. 2496 */ 2497 static int current_may_throttle(void) 2498 { 2499 return !(current->flags & PF_LOCAL_THROTTLE); 2500 } 2501 2502 /* 2503 * shrink_inactive_list() is a helper for shrink_node(). It returns the number 2504 * of reclaimed pages 2505 */ 2506 static unsigned long shrink_inactive_list(unsigned long nr_to_scan, 2507 struct lruvec *lruvec, struct scan_control *sc, 2508 enum lru_list lru) 2509 { 2510 LIST_HEAD(folio_list); 2511 unsigned long nr_scanned; 2512 unsigned int nr_reclaimed = 0; 2513 unsigned long nr_taken; 2514 struct reclaim_stat stat; 2515 bool file = is_file_lru(lru); 2516 enum vm_event_item item; 2517 struct pglist_data *pgdat = lruvec_pgdat(lruvec); 2518 bool stalled = false; 2519 2520 while (unlikely(too_many_isolated(pgdat, file, sc))) { 2521 if (stalled) 2522 return 0; 2523 2524 /* wait a bit for the reclaimer. */ 2525 stalled = true; 2526 reclaim_throttle(pgdat, VMSCAN_THROTTLE_ISOLATED); 2527 2528 /* We are about to die and free our memory. Return now. 
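 * (returning SWAP_CLUSTER_MAX rather than 0 means this early exit still
 * counts as progress to the caller)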
*/ 2529 if (fatal_signal_pending(current)) 2530 return SWAP_CLUSTER_MAX; 2531 } 2532 2533 lru_add_drain(); 2534 2535 spin_lock_irq(&lruvec->lru_lock); 2536 2537 nr_taken = isolate_lru_folios(nr_to_scan, lruvec, &folio_list, 2538 &nr_scanned, sc, lru); 2539 2540 __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, nr_taken); 2541 item = PGSCAN_KSWAPD + reclaimer_offset(); 2542 if (!cgroup_reclaim(sc)) 2543 __count_vm_events(item, nr_scanned); 2544 __count_memcg_events(lruvec_memcg(lruvec), item, nr_scanned); 2545 __count_vm_events(PGSCAN_ANON + file, nr_scanned); 2546 2547 spin_unlock_irq(&lruvec->lru_lock); 2548 2549 if (nr_taken == 0) 2550 return 0; 2551 2552 nr_reclaimed = shrink_folio_list(&folio_list, pgdat, sc, &stat, false); 2553 2554 spin_lock_irq(&lruvec->lru_lock); 2555 move_folios_to_lru(lruvec, &folio_list); 2556 2557 __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, -nr_taken); 2558 item = PGSTEAL_KSWAPD + reclaimer_offset(); 2559 if (!cgroup_reclaim(sc)) 2560 __count_vm_events(item, nr_reclaimed); 2561 __count_memcg_events(lruvec_memcg(lruvec), item, nr_reclaimed); 2562 __count_vm_events(PGSTEAL_ANON + file, nr_reclaimed); 2563 spin_unlock_irq(&lruvec->lru_lock); 2564 2565 lru_note_cost(lruvec, file, stat.nr_pageout, nr_scanned - nr_reclaimed); 2566 mem_cgroup_uncharge_list(&folio_list); 2567 free_unref_page_list(&folio_list); 2568 2569 /* 2570 * If dirty folios are scanned that are not queued for IO, it 2571 * implies that flushers are not doing their job. This can 2572 * happen when memory pressure pushes dirty folios to the end of 2573 * the LRU before the dirty limits are breached and the dirty 2574 * data has expired. It can also happen when the proportion of 2575 * dirty folios grows not through writes but through memory 2576 * pressure reclaiming all the clean cache. And in some cases, 2577 * the flushers simply cannot keep up with the allocation 2578 * rate. Nudge the flusher threads in case they are asleep. 2579 */ 2580 if (stat.nr_unqueued_dirty == nr_taken) { 2581 wakeup_flusher_threads(WB_REASON_VMSCAN); 2582 /* 2583 * For cgroupv1 dirty throttling is achieved by waking up 2584 * the kernel flusher here and later waiting on folios 2585 * which are in writeback to finish (see shrink_folio_list()). 2586 * 2587 * Flusher may not be able to issue writeback quickly 2588 * enough for cgroupv1 writeback throttling to work 2589 * on a large system. 2590 */ 2591 if (!writeback_throttling_sane(sc)) 2592 reclaim_throttle(pgdat, VMSCAN_THROTTLE_WRITEBACK); 2593 } 2594 2595 sc->nr.dirty += stat.nr_dirty; 2596 sc->nr.congested += stat.nr_congested; 2597 sc->nr.unqueued_dirty += stat.nr_unqueued_dirty; 2598 sc->nr.writeback += stat.nr_writeback; 2599 sc->nr.immediate += stat.nr_immediate; 2600 sc->nr.taken += nr_taken; 2601 if (file) 2602 sc->nr.file_taken += nr_taken; 2603 2604 trace_mm_vmscan_lru_shrink_inactive(pgdat->node_id, 2605 nr_scanned, nr_reclaimed, &stat, sc->priority, file); 2606 return nr_reclaimed; 2607 } 2608 2609 /* 2610 * shrink_active_list() moves folios from the active LRU to the inactive LRU. 2611 * 2612 * We move them the other way if the folio is referenced by one or more 2613 * processes. 2614 * 2615 * If the folios are mostly unmapped, the processing is fast and it is 2616 * appropriate to hold lru_lock across the whole operation. But if 2617 * the folios are mapped, the processing is slow (folio_referenced()), so 2618 * we should drop lru_lock around each folio. 
It's impossible to balance 2619 * this, so instead we remove the folios from the LRU while processing them. 2620 * It is safe to rely on the active flag against the non-LRU folios in here 2621 * because nobody will play with that bit on a non-LRU folio. 2622 * 2623 * The downside is that we have to touch folio->_refcount against each folio. 2624 * But we had to alter folio->flags anyway. 2625 */ 2626 static void shrink_active_list(unsigned long nr_to_scan, 2627 struct lruvec *lruvec, 2628 struct scan_control *sc, 2629 enum lru_list lru) 2630 { 2631 unsigned long nr_taken; 2632 unsigned long nr_scanned; 2633 unsigned long vm_flags; 2634 LIST_HEAD(l_hold); /* The folios which were snipped off */ 2635 LIST_HEAD(l_active); 2636 LIST_HEAD(l_inactive); 2637 unsigned nr_deactivate, nr_activate; 2638 unsigned nr_rotated = 0; 2639 int file = is_file_lru(lru); 2640 struct pglist_data *pgdat = lruvec_pgdat(lruvec); 2641 2642 lru_add_drain(); 2643 2644 spin_lock_irq(&lruvec->lru_lock); 2645 2646 nr_taken = isolate_lru_folios(nr_to_scan, lruvec, &l_hold, 2647 &nr_scanned, sc, lru); 2648 2649 __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, nr_taken); 2650 2651 if (!cgroup_reclaim(sc)) 2652 __count_vm_events(PGREFILL, nr_scanned); 2653 __count_memcg_events(lruvec_memcg(lruvec), PGREFILL, nr_scanned); 2654 2655 spin_unlock_irq(&lruvec->lru_lock); 2656 2657 while (!list_empty(&l_hold)) { 2658 struct folio *folio; 2659 2660 cond_resched(); 2661 folio = lru_to_folio(&l_hold); 2662 list_del(&folio->lru); 2663 2664 if (unlikely(!folio_evictable(folio))) { 2665 folio_putback_lru(folio); 2666 continue; 2667 } 2668 2669 if (unlikely(buffer_heads_over_limit)) { 2670 if (folio_test_private(folio) && folio_trylock(folio)) { 2671 if (folio_test_private(folio)) 2672 filemap_release_folio(folio, 0); 2673 folio_unlock(folio); 2674 } 2675 } 2676 2677 /* Referenced or rmap lock contention: rotate */ 2678 if (folio_referenced(folio, 0, sc->target_mem_cgroup, 2679 &vm_flags) != 0) { 2680 /* 2681 * Identify referenced, file-backed active folios and 2682 * give them one more trip around the active list. So 2683 * that executable code get better chances to stay in 2684 * memory under moderate memory pressure. Anon folios 2685 * are not likely to be evicted by use-once streaming 2686 * IO, plus JVM can create lots of anon VM_EXEC folios, 2687 * so we ignore them here. 2688 */ 2689 if ((vm_flags & VM_EXEC) && folio_is_file_lru(folio)) { 2690 nr_rotated += folio_nr_pages(folio); 2691 list_add(&folio->lru, &l_active); 2692 continue; 2693 } 2694 } 2695 2696 folio_clear_active(folio); /* we are de-activating */ 2697 folio_set_workingset(folio); 2698 list_add(&folio->lru, &l_inactive); 2699 } 2700 2701 /* 2702 * Move folios back to the lru list. 
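 * move_folios_to_lru() reuses each input list for the folios that dropped
 * to a zero refcount while being moved; those are collected on l_active
 * below and then uncharged and freed.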
2703 */ 2704 spin_lock_irq(&lruvec->lru_lock); 2705 2706 nr_activate = move_folios_to_lru(lruvec, &l_active); 2707 nr_deactivate = move_folios_to_lru(lruvec, &l_inactive); 2708 /* Keep all free folios in l_active list */ 2709 list_splice(&l_inactive, &l_active); 2710 2711 __count_vm_events(PGDEACTIVATE, nr_deactivate); 2712 __count_memcg_events(lruvec_memcg(lruvec), PGDEACTIVATE, nr_deactivate); 2713 2714 __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, -nr_taken); 2715 spin_unlock_irq(&lruvec->lru_lock); 2716 2717 if (nr_rotated) 2718 lru_note_cost(lruvec, file, 0, nr_rotated); 2719 mem_cgroup_uncharge_list(&l_active); 2720 free_unref_page_list(&l_active); 2721 trace_mm_vmscan_lru_shrink_active(pgdat->node_id, nr_taken, nr_activate, 2722 nr_deactivate, nr_rotated, sc->priority, file); 2723 } 2724 2725 static unsigned int reclaim_folio_list(struct list_head *folio_list, 2726 struct pglist_data *pgdat) 2727 { 2728 struct reclaim_stat dummy_stat; 2729 unsigned int nr_reclaimed; 2730 struct folio *folio; 2731 struct scan_control sc = { 2732 .gfp_mask = GFP_KERNEL, 2733 .may_writepage = 1, 2734 .may_unmap = 1, 2735 .may_swap = 1, 2736 .no_demotion = 1, 2737 }; 2738 2739 nr_reclaimed = shrink_folio_list(folio_list, pgdat, &sc, &dummy_stat, false); 2740 while (!list_empty(folio_list)) { 2741 folio = lru_to_folio(folio_list); 2742 list_del(&folio->lru); 2743 folio_putback_lru(folio); 2744 } 2745 2746 return nr_reclaimed; 2747 } 2748 2749 unsigned long reclaim_pages(struct list_head *folio_list) 2750 { 2751 int nid; 2752 unsigned int nr_reclaimed = 0; 2753 LIST_HEAD(node_folio_list); 2754 unsigned int noreclaim_flag; 2755 2756 if (list_empty(folio_list)) 2757 return nr_reclaimed; 2758 2759 noreclaim_flag = memalloc_noreclaim_save(); 2760 2761 nid = folio_nid(lru_to_folio(folio_list)); 2762 do { 2763 struct folio *folio = lru_to_folio(folio_list); 2764 2765 if (nid == folio_nid(folio)) { 2766 folio_clear_active(folio); 2767 list_move(&folio->lru, &node_folio_list); 2768 continue; 2769 } 2770 2771 nr_reclaimed += reclaim_folio_list(&node_folio_list, NODE_DATA(nid)); 2772 nid = folio_nid(lru_to_folio(folio_list)); 2773 } while (!list_empty(folio_list)); 2774 2775 nr_reclaimed += reclaim_folio_list(&node_folio_list, NODE_DATA(nid)); 2776 2777 memalloc_noreclaim_restore(noreclaim_flag); 2778 2779 return nr_reclaimed; 2780 } 2781 2782 static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan, 2783 struct lruvec *lruvec, struct scan_control *sc) 2784 { 2785 if (is_active_lru(lru)) { 2786 if (sc->may_deactivate & (1 << is_file_lru(lru))) 2787 shrink_active_list(nr_to_scan, lruvec, sc, lru); 2788 else 2789 sc->skipped_deactivate = 1; 2790 return 0; 2791 } 2792 2793 return shrink_inactive_list(nr_to_scan, lruvec, sc, lru); 2794 } 2795 2796 /* 2797 * The inactive anon list should be small enough that the VM never has 2798 * to do too much work. 2799 * 2800 * The inactive file list should be small enough to leave most memory 2801 * to the established workingset on the scan-resistant active list, 2802 * but large enough to avoid thrashing the aggregate readahead window. 2803 * 2804 * Both inactive lists should also be large enough that each inactive 2805 * folio has a chance to be referenced again before it is reclaimed. 2806 * 2807 * If that fails and refaulting is observed, the inactive list grows. 2808 * 2809 * The inactive_ratio is the target ratio of ACTIVE to INACTIVE folios 2810 * on this LRU, maintained by the pageout code. 
An inactive_ratio 2811 * of 3 means 3:1 or 25% of the folios are kept on the inactive list. 2812 * 2813 * total target max 2814 * memory ratio inactive 2815 * ------------------------------------- 2816 * 10MB 1 5MB 2817 * 100MB 1 50MB 2818 * 1GB 3 250MB 2819 * 10GB 10 0.9GB 2820 * 100GB 31 3GB 2821 * 1TB 101 10GB 2822 * 10TB 320 32GB 2823 */ 2824 static bool inactive_is_low(struct lruvec *lruvec, enum lru_list inactive_lru) 2825 { 2826 enum lru_list active_lru = inactive_lru + LRU_ACTIVE; 2827 unsigned long inactive, active; 2828 unsigned long inactive_ratio; 2829 unsigned long gb; 2830 2831 inactive = lruvec_page_state(lruvec, NR_LRU_BASE + inactive_lru); 2832 active = lruvec_page_state(lruvec, NR_LRU_BASE + active_lru); 2833 2834 gb = (inactive + active) >> (30 - PAGE_SHIFT); 2835 if (gb) 2836 inactive_ratio = int_sqrt(10 * gb); 2837 else 2838 inactive_ratio = 1; 2839 2840 return inactive * inactive_ratio < active; 2841 } 2842 2843 enum scan_balance { 2844 SCAN_EQUAL, 2845 SCAN_FRACT, 2846 SCAN_ANON, 2847 SCAN_FILE, 2848 }; 2849 2850 static void prepare_scan_count(pg_data_t *pgdat, struct scan_control *sc) 2851 { 2852 unsigned long file; 2853 struct lruvec *target_lruvec; 2854 2855 if (lru_gen_enabled()) 2856 return; 2857 2858 target_lruvec = mem_cgroup_lruvec(sc->target_mem_cgroup, pgdat); 2859 2860 /* 2861 * Flush the memory cgroup stats, so that we read accurate per-memcg 2862 * lruvec stats for heuristics. 2863 */ 2864 mem_cgroup_flush_stats(); 2865 2866 /* 2867 * Determine the scan balance between anon and file LRUs. 2868 */ 2869 spin_lock_irq(&target_lruvec->lru_lock); 2870 sc->anon_cost = target_lruvec->anon_cost; 2871 sc->file_cost = target_lruvec->file_cost; 2872 spin_unlock_irq(&target_lruvec->lru_lock); 2873 2874 /* 2875 * Target desirable inactive:active list ratios for the anon 2876 * and file LRU lists. 2877 */ 2878 if (!sc->force_deactivate) { 2879 unsigned long refaults; 2880 2881 /* 2882 * When refaults are being observed, it means a new 2883 * workingset is being established. Deactivate to get 2884 * rid of any stale active pages quickly. 2885 */ 2886 refaults = lruvec_page_state(target_lruvec, 2887 WORKINGSET_ACTIVATE_ANON); 2888 if (refaults != target_lruvec->refaults[WORKINGSET_ANON] || 2889 inactive_is_low(target_lruvec, LRU_INACTIVE_ANON)) 2890 sc->may_deactivate |= DEACTIVATE_ANON; 2891 else 2892 sc->may_deactivate &= ~DEACTIVATE_ANON; 2893 2894 refaults = lruvec_page_state(target_lruvec, 2895 WORKINGSET_ACTIVATE_FILE); 2896 if (refaults != target_lruvec->refaults[WORKINGSET_FILE] || 2897 inactive_is_low(target_lruvec, LRU_INACTIVE_FILE)) 2898 sc->may_deactivate |= DEACTIVATE_FILE; 2899 else 2900 sc->may_deactivate &= ~DEACTIVATE_FILE; 2901 } else 2902 sc->may_deactivate = DEACTIVATE_ANON | DEACTIVATE_FILE; 2903 2904 /* 2905 * If we have plenty of inactive file pages that aren't 2906 * thrashing, try to reclaim those first before touching 2907 * anonymous pages. 2908 */ 2909 file = lruvec_page_state(target_lruvec, NR_INACTIVE_FILE); 2910 if (file >> sc->priority && !(sc->may_deactivate & DEACTIVATE_FILE)) 2911 sc->cache_trim_mode = 1; 2912 else 2913 sc->cache_trim_mode = 0; 2914 2915 /* 2916 * Prevent the reclaimer from falling into the cache trap: as 2917 * cache pages start out inactive, every cache fault will tip 2918 * the scan balance towards the file LRU. And as the file LRU 2919 * shrinks, so does the window for rotation from references. 
2920 * This means we have a runaway feedback loop where a tiny 2921 * thrashing file LRU becomes infinitely more attractive than 2922 * anon pages. Try to detect this based on file LRU size. 2923 */ 2924 if (!cgroup_reclaim(sc)) { 2925 unsigned long total_high_wmark = 0; 2926 unsigned long free, anon; 2927 int z; 2928 2929 free = sum_zone_node_page_state(pgdat->node_id, NR_FREE_PAGES); 2930 file = node_page_state(pgdat, NR_ACTIVE_FILE) + 2931 node_page_state(pgdat, NR_INACTIVE_FILE); 2932 2933 for (z = 0; z < MAX_NR_ZONES; z++) { 2934 struct zone *zone = &pgdat->node_zones[z]; 2935 2936 if (!managed_zone(zone)) 2937 continue; 2938 2939 total_high_wmark += high_wmark_pages(zone); 2940 } 2941 2942 /* 2943 * Consider anon: if that's low too, this isn't a 2944 * runaway file reclaim problem, but rather just 2945 * extreme pressure. Reclaim as per usual then. 2946 */ 2947 anon = node_page_state(pgdat, NR_INACTIVE_ANON); 2948 2949 sc->file_is_tiny = 2950 file + free <= total_high_wmark && 2951 !(sc->may_deactivate & DEACTIVATE_ANON) && 2952 anon >> sc->priority; 2953 } 2954 } 2955 2956 /* 2957 * Determine how aggressively the anon and file LRU lists should be 2958 * scanned. 2959 * 2960 * nr[0] = anon inactive folios to scan; nr[1] = anon active folios to scan 2961 * nr[2] = file inactive folios to scan; nr[3] = file active folios to scan 2962 */ 2963 static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc, 2964 unsigned long *nr) 2965 { 2966 struct pglist_data *pgdat = lruvec_pgdat(lruvec); 2967 struct mem_cgroup *memcg = lruvec_memcg(lruvec); 2968 unsigned long anon_cost, file_cost, total_cost; 2969 int swappiness = mem_cgroup_swappiness(memcg); 2970 u64 fraction[ANON_AND_FILE]; 2971 u64 denominator = 0; /* gcc */ 2972 enum scan_balance scan_balance; 2973 unsigned long ap, fp; 2974 enum lru_list lru; 2975 2976 /* If we have no swap space, do not bother scanning anon folios. */ 2977 if (!sc->may_swap || !can_reclaim_anon_pages(memcg, pgdat->node_id, sc)) { 2978 scan_balance = SCAN_FILE; 2979 goto out; 2980 } 2981 2982 /* 2983 * Global reclaim will swap to prevent OOM even with no 2984 * swappiness, but memcg users want to use this knob to 2985 * disable swapping for individual groups completely when 2986 * using the memory controller's swap limit feature would be 2987 * too expensive. 2988 */ 2989 if (cgroup_reclaim(sc) && !swappiness) { 2990 scan_balance = SCAN_FILE; 2991 goto out; 2992 } 2993 2994 /* 2995 * Do not apply any pressure balancing cleverness when the 2996 * system is close to OOM, scan both anon and file equally 2997 * (unless the swappiness setting disagrees with swapping). 2998 */ 2999 if (!sc->priority && swappiness) { 3000 scan_balance = SCAN_EQUAL; 3001 goto out; 3002 } 3003 3004 /* 3005 * If the system is almost out of file pages, force-scan anon. 3006 */ 3007 if (sc->file_is_tiny) { 3008 scan_balance = SCAN_ANON; 3009 goto out; 3010 } 3011 3012 /* 3013 * If there is enough inactive page cache, we do not reclaim 3014 * anything from the anonymous working right now. 3015 */ 3016 if (sc->cache_trim_mode) { 3017 scan_balance = SCAN_FILE; 3018 goto out; 3019 } 3020 3021 scan_balance = SCAN_FRACT; 3022 /* 3023 * Calculate the pressure balance between anon and file pages. 
3024 * 3025 * The amount of pressure we put on each LRU is inversely 3026 * proportional to the cost of reclaiming each list, as 3027 * determined by the share of pages that are refaulting, times 3028 * the relative IO cost of bringing back a swapped out 3029 * anonymous page vs reloading a filesystem page (swappiness). 3030 * 3031 * Although we limit that influence to ensure no list gets 3032 * left behind completely: at least a third of the pressure is 3033 * applied, before swappiness. 3034 * 3035 * With swappiness at 100, anon and file have equal IO cost. 3036 */ 3037 total_cost = sc->anon_cost + sc->file_cost; 3038 anon_cost = total_cost + sc->anon_cost; 3039 file_cost = total_cost + sc->file_cost; 3040 total_cost = anon_cost + file_cost; 3041 3042 ap = swappiness * (total_cost + 1); 3043 ap /= anon_cost + 1; 3044 3045 fp = (200 - swappiness) * (total_cost + 1); 3046 fp /= file_cost + 1; 3047 3048 fraction[0] = ap; 3049 fraction[1] = fp; 3050 denominator = ap + fp; 3051 out: 3052 for_each_evictable_lru(lru) { 3053 int file = is_file_lru(lru); 3054 unsigned long lruvec_size; 3055 unsigned long low, min; 3056 unsigned long scan; 3057 3058 lruvec_size = lruvec_lru_size(lruvec, lru, sc->reclaim_idx); 3059 mem_cgroup_protection(sc->target_mem_cgroup, memcg, 3060 &min, &low); 3061 3062 if (min || low) { 3063 /* 3064 * Scale a cgroup's reclaim pressure by proportioning 3065 * its current usage to its memory.low or memory.min 3066 * setting. 3067 * 3068 * This is important, as otherwise scanning aggression 3069 * becomes extremely binary -- from nothing as we 3070 * approach the memory protection threshold, to totally 3071 * nominal as we exceed it. This results in requiring 3072 * setting extremely liberal protection thresholds. It 3073 * also means we simply get no protection at all if we 3074 * set it too low, which is not ideal. 3075 * 3076 * If there is any protection in place, we reduce scan 3077 * pressure by how much of the total memory used is 3078 * within protection thresholds. 3079 * 3080 * There is one special case: in the first reclaim pass, 3081 * we skip over all groups that are within their low 3082 * protection. If that fails to reclaim enough pages to 3083 * satisfy the reclaim goal, we come back and override 3084 * the best-effort low protection. However, we still 3085 * ideally want to honor how well-behaved groups are in 3086 * that case instead of simply punishing them all 3087 * equally. As such, we reclaim them based on how much 3088 * memory they are using, reducing the scan pressure 3089 * again by how much of the total memory used is under 3090 * hard protection. 3091 */ 3092 unsigned long cgroup_size = mem_cgroup_size(memcg); 3093 unsigned long protection; 3094 3095 /* memory.low scaling, make sure we retry before OOM */ 3096 if (!sc->memcg_low_reclaim && low > min) { 3097 protection = low; 3098 sc->memcg_low_skipped = 1; 3099 } else { 3100 protection = min; 3101 } 3102 3103 /* Avoid TOCTOU with earlier protection check */ 3104 cgroup_size = max(cgroup_size, protection); 3105 3106 scan = lruvec_size - lruvec_size * protection / 3107 (cgroup_size + 1); 3108 3109 /* 3110 * Minimally target SWAP_CLUSTER_MAX pages to keep 3111 * reclaim moving forwards, avoiding decrementing 3112 * sc->priority further than desirable. 3113 */ 3114 scan = max(scan, SWAP_CLUSTER_MAX); 3115 } else { 3116 scan = lruvec_size; 3117 } 3118 3119 scan >>= sc->priority; 3120 3121 /* 3122 * If the cgroup's already been deleted, make sure to 3123 * scrape out the remaining cache. 
3124 */ 3125 if (!scan && !mem_cgroup_online(memcg)) 3126 scan = min(lruvec_size, SWAP_CLUSTER_MAX); 3127 3128 switch (scan_balance) { 3129 case SCAN_EQUAL: 3130 /* Scan lists relative to size */ 3131 break; 3132 case SCAN_FRACT: 3133 /* 3134 * Scan types proportional to swappiness and 3135 * their relative recent reclaim efficiency. 3136 * Make sure we don't miss the last page on 3137 * the offlined memory cgroups because of a 3138 * round-off error. 3139 */ 3140 scan = mem_cgroup_online(memcg) ? 3141 div64_u64(scan * fraction[file], denominator) : 3142 DIV64_U64_ROUND_UP(scan * fraction[file], 3143 denominator); 3144 break; 3145 case SCAN_FILE: 3146 case SCAN_ANON: 3147 /* Scan one type exclusively */ 3148 if ((scan_balance == SCAN_FILE) != file) 3149 scan = 0; 3150 break; 3151 default: 3152 /* Look ma, no brain */ 3153 BUG(); 3154 } 3155 3156 nr[lru] = scan; 3157 } 3158 } 3159 3160 /* 3161 * Anonymous LRU management is a waste if there is 3162 * ultimately no way to reclaim the memory. 3163 */ 3164 static bool can_age_anon_pages(struct pglist_data *pgdat, 3165 struct scan_control *sc) 3166 { 3167 /* Aging the anon LRU is valuable if swap is present: */ 3168 if (total_swap_pages > 0) 3169 return true; 3170 3171 /* Also valuable if anon pages can be demoted: */ 3172 return can_demote(pgdat->node_id, sc); 3173 } 3174 3175 #ifdef CONFIG_LRU_GEN 3176 3177 #ifdef CONFIG_LRU_GEN_ENABLED 3178 DEFINE_STATIC_KEY_ARRAY_TRUE(lru_gen_caps, NR_LRU_GEN_CAPS); 3179 #define get_cap(cap) static_branch_likely(&lru_gen_caps[cap]) 3180 #else 3181 DEFINE_STATIC_KEY_ARRAY_FALSE(lru_gen_caps, NR_LRU_GEN_CAPS); 3182 #define get_cap(cap) static_branch_unlikely(&lru_gen_caps[cap]) 3183 #endif 3184 3185 /****************************************************************************** 3186 * shorthand helpers 3187 ******************************************************************************/ 3188 3189 #define LRU_REFS_FLAGS (BIT(PG_referenced) | BIT(PG_workingset)) 3190 3191 #define DEFINE_MAX_SEQ(lruvec) \ 3192 unsigned long max_seq = READ_ONCE((lruvec)->lrugen.max_seq) 3193 3194 #define DEFINE_MIN_SEQ(lruvec) \ 3195 unsigned long min_seq[ANON_AND_FILE] = { \ 3196 READ_ONCE((lruvec)->lrugen.min_seq[LRU_GEN_ANON]), \ 3197 READ_ONCE((lruvec)->lrugen.min_seq[LRU_GEN_FILE]), \ 3198 } 3199 3200 #define for_each_gen_type_zone(gen, type, zone) \ 3201 for ((gen) = 0; (gen) < MAX_NR_GENS; (gen)++) \ 3202 for ((type) = 0; (type) < ANON_AND_FILE; (type)++) \ 3203 for ((zone) = 0; (zone) < MAX_NR_ZONES; (zone)++) 3204 3205 #define get_memcg_gen(seq) ((seq) % MEMCG_NR_GENS) 3206 #define get_memcg_bin(bin) ((bin) % MEMCG_NR_BINS) 3207 3208 static struct lruvec *get_lruvec(struct mem_cgroup *memcg, int nid) 3209 { 3210 struct pglist_data *pgdat = NODE_DATA(nid); 3211 3212 #ifdef CONFIG_MEMCG 3213 if (memcg) { 3214 struct lruvec *lruvec = &memcg->nodeinfo[nid]->lruvec; 3215 3216 /* see the comment in mem_cgroup_lruvec() */ 3217 if (!lruvec->pgdat) 3218 lruvec->pgdat = pgdat; 3219 3220 return lruvec; 3221 } 3222 #endif 3223 VM_WARN_ON_ONCE(!mem_cgroup_disabled()); 3224 3225 return &pgdat->__lruvec; 3226 } 3227 3228 static int get_swappiness(struct lruvec *lruvec, struct scan_control *sc) 3229 { 3230 struct mem_cgroup *memcg = lruvec_memcg(lruvec); 3231 struct pglist_data *pgdat = lruvec_pgdat(lruvec); 3232 3233 if (!sc->may_swap) 3234 return 0; 3235 3236 if (!can_demote(pgdat->node_id, sc) && 3237 mem_cgroup_get_nr_swap_pages(memcg) < MIN_LRU_BATCH) 3238 return 0; 3239 3240 return mem_cgroup_swappiness(memcg); 3241 } 3242 3243 
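/*
 * Worked example (illustrative numbers only): get_nr_gens() below derives the
 * generation count of a type straight from the sequence counters, e.g.
 * max_seq == 7 and min_seq[type] == 4 give 7 - 4 + 1 = 4 generations;
 * seq_is_valid() then sanity-checks the counts against MIN_NR_GENS and
 * MAX_NR_GENS.
 */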
static int get_nr_gens(struct lruvec *lruvec, int type) 3244 { 3245 return lruvec->lrugen.max_seq - lruvec->lrugen.min_seq[type] + 1; 3246 } 3247 3248 static bool __maybe_unused seq_is_valid(struct lruvec *lruvec) 3249 { 3250 /* see the comment on lru_gen_folio */ 3251 return get_nr_gens(lruvec, LRU_GEN_FILE) >= MIN_NR_GENS && 3252 get_nr_gens(lruvec, LRU_GEN_FILE) <= get_nr_gens(lruvec, LRU_GEN_ANON) && 3253 get_nr_gens(lruvec, LRU_GEN_ANON) <= MAX_NR_GENS; 3254 } 3255 3256 /****************************************************************************** 3257 * Bloom filters 3258 ******************************************************************************/ 3259 3260 /* 3261 * Bloom filters with m=1<<15, k=2 and the false positive rates of ~1/5 when 3262 * n=10,000 and ~1/2 when n=20,000, where, conventionally, m is the number of 3263 * bits in a bitmap, k is the number of hash functions and n is the number of 3264 * inserted items. 3265 * 3266 * Page table walkers use one of the two filters to reduce their search space. 3267 * To get rid of non-leaf entries that no longer have enough leaf entries, the 3268 * aging uses the double-buffering technique to flip to the other filter each 3269 * time it produces a new generation. For non-leaf entries that have enough 3270 * leaf entries, the aging carries them over to the next generation in 3271 * walk_pmd_range(); the eviction also report them when walking the rmap 3272 * in lru_gen_look_around(). 3273 * 3274 * For future optimizations: 3275 * 1. It's not necessary to keep both filters all the time. The spare one can be 3276 * freed after the RCU grace period and reallocated if needed again. 3277 * 2. And when reallocating, it's worth scaling its size according to the number 3278 * of inserted entries in the other filter, to reduce the memory overhead on 3279 * small systems and false positives on large systems. 3280 * 3. Jenkins' hash function is an alternative to Knuth's. 
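 *
 * As a rough check of the rates quoted above, using the textbook
 * approximation (1 - e^(-k*n/m))^k (assumed here; nothing in this code
 * computes it): with m = 1 << 15 = 32768 and k = 2, n = 10,000 gives
 * (1 - e^(-0.61))^2 ~= 0.21, about 1/5, and n = 20,000 gives
 * (1 - e^(-1.22))^2 ~= 0.50, about 1/2.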
3281 */ 3282 #define BLOOM_FILTER_SHIFT 15 3283 3284 static inline int filter_gen_from_seq(unsigned long seq) 3285 { 3286 return seq % NR_BLOOM_FILTERS; 3287 } 3288 3289 static void get_item_key(void *item, int *key) 3290 { 3291 u32 hash = hash_ptr(item, BLOOM_FILTER_SHIFT * 2); 3292 3293 BUILD_BUG_ON(BLOOM_FILTER_SHIFT * 2 > BITS_PER_TYPE(u32)); 3294 3295 key[0] = hash & (BIT(BLOOM_FILTER_SHIFT) - 1); 3296 key[1] = hash >> BLOOM_FILTER_SHIFT; 3297 } 3298 3299 static bool test_bloom_filter(struct lruvec *lruvec, unsigned long seq, void *item) 3300 { 3301 int key[2]; 3302 unsigned long *filter; 3303 int gen = filter_gen_from_seq(seq); 3304 3305 filter = READ_ONCE(lruvec->mm_state.filters[gen]); 3306 if (!filter) 3307 return true; 3308 3309 get_item_key(item, key); 3310 3311 return test_bit(key[0], filter) && test_bit(key[1], filter); 3312 } 3313 3314 static void update_bloom_filter(struct lruvec *lruvec, unsigned long seq, void *item) 3315 { 3316 int key[2]; 3317 unsigned long *filter; 3318 int gen = filter_gen_from_seq(seq); 3319 3320 filter = READ_ONCE(lruvec->mm_state.filters[gen]); 3321 if (!filter) 3322 return; 3323 3324 get_item_key(item, key); 3325 3326 if (!test_bit(key[0], filter)) 3327 set_bit(key[0], filter); 3328 if (!test_bit(key[1], filter)) 3329 set_bit(key[1], filter); 3330 } 3331 3332 static void reset_bloom_filter(struct lruvec *lruvec, unsigned long seq) 3333 { 3334 unsigned long *filter; 3335 int gen = filter_gen_from_seq(seq); 3336 3337 filter = lruvec->mm_state.filters[gen]; 3338 if (filter) { 3339 bitmap_clear(filter, 0, BIT(BLOOM_FILTER_SHIFT)); 3340 return; 3341 } 3342 3343 filter = bitmap_zalloc(BIT(BLOOM_FILTER_SHIFT), 3344 __GFP_HIGH | __GFP_NOMEMALLOC | __GFP_NOWARN); 3345 WRITE_ONCE(lruvec->mm_state.filters[gen], filter); 3346 } 3347 3348 /****************************************************************************** 3349 * mm_struct list 3350 ******************************************************************************/ 3351 3352 static struct lru_gen_mm_list *get_mm_list(struct mem_cgroup *memcg) 3353 { 3354 static struct lru_gen_mm_list mm_list = { 3355 .fifo = LIST_HEAD_INIT(mm_list.fifo), 3356 .lock = __SPIN_LOCK_UNLOCKED(mm_list.lock), 3357 }; 3358 3359 #ifdef CONFIG_MEMCG 3360 if (memcg) 3361 return &memcg->mm_list; 3362 #endif 3363 VM_WARN_ON_ONCE(!mem_cgroup_disabled()); 3364 3365 return &mm_list; 3366 } 3367 3368 void lru_gen_add_mm(struct mm_struct *mm) 3369 { 3370 int nid; 3371 struct mem_cgroup *memcg = get_mem_cgroup_from_mm(mm); 3372 struct lru_gen_mm_list *mm_list = get_mm_list(memcg); 3373 3374 VM_WARN_ON_ONCE(!list_empty(&mm->lru_gen.list)); 3375 #ifdef CONFIG_MEMCG 3376 VM_WARN_ON_ONCE(mm->lru_gen.memcg); 3377 mm->lru_gen.memcg = memcg; 3378 #endif 3379 spin_lock(&mm_list->lock); 3380 3381 for_each_node_state(nid, N_MEMORY) { 3382 struct lruvec *lruvec = get_lruvec(memcg, nid); 3383 3384 /* the first addition since the last iteration */ 3385 if (lruvec->mm_state.tail == &mm_list->fifo) 3386 lruvec->mm_state.tail = &mm->lru_gen.list; 3387 } 3388 3389 list_add_tail(&mm->lru_gen.list, &mm_list->fifo); 3390 3391 spin_unlock(&mm_list->lock); 3392 } 3393 3394 void lru_gen_del_mm(struct mm_struct *mm) 3395 { 3396 int nid; 3397 struct lru_gen_mm_list *mm_list; 3398 struct mem_cgroup *memcg = NULL; 3399 3400 if (list_empty(&mm->lru_gen.list)) 3401 return; 3402 3403 #ifdef CONFIG_MEMCG 3404 memcg = mm->lru_gen.memcg; 3405 #endif 3406 mm_list = get_mm_list(memcg); 3407 3408 spin_lock(&mm_list->lock); 3409 3410 for_each_node(nid) { 3411 struct lruvec 
*lruvec = get_lruvec(memcg, nid); 3412 3413 /* where the last iteration ended (exclusive) */ 3414 if (lruvec->mm_state.tail == &mm->lru_gen.list) 3415 lruvec->mm_state.tail = lruvec->mm_state.tail->next; 3416 3417 /* where the current iteration continues (inclusive) */ 3418 if (lruvec->mm_state.head != &mm->lru_gen.list) 3419 continue; 3420 3421 lruvec->mm_state.head = lruvec->mm_state.head->next; 3422 /* the deletion ends the current iteration */ 3423 if (lruvec->mm_state.head == &mm_list->fifo) 3424 WRITE_ONCE(lruvec->mm_state.seq, lruvec->mm_state.seq + 1); 3425 } 3426 3427 list_del_init(&mm->lru_gen.list); 3428 3429 spin_unlock(&mm_list->lock); 3430 3431 #ifdef CONFIG_MEMCG 3432 mem_cgroup_put(mm->lru_gen.memcg); 3433 mm->lru_gen.memcg = NULL; 3434 #endif 3435 } 3436 3437 #ifdef CONFIG_MEMCG 3438 void lru_gen_migrate_mm(struct mm_struct *mm) 3439 { 3440 struct mem_cgroup *memcg; 3441 struct task_struct *task = rcu_dereference_protected(mm->owner, true); 3442 3443 VM_WARN_ON_ONCE(task->mm != mm); 3444 lockdep_assert_held(&task->alloc_lock); 3445 3446 /* for mm_update_next_owner() */ 3447 if (mem_cgroup_disabled()) 3448 return; 3449 3450 /* migration can happen before addition */ 3451 if (!mm->lru_gen.memcg) 3452 return; 3453 3454 rcu_read_lock(); 3455 memcg = mem_cgroup_from_task(task); 3456 rcu_read_unlock(); 3457 if (memcg == mm->lru_gen.memcg) 3458 return; 3459 3460 VM_WARN_ON_ONCE(list_empty(&mm->lru_gen.list)); 3461 3462 lru_gen_del_mm(mm); 3463 lru_gen_add_mm(mm); 3464 } 3465 #endif 3466 3467 static void reset_mm_stats(struct lruvec *lruvec, struct lru_gen_mm_walk *walk, bool last) 3468 { 3469 int i; 3470 int hist; 3471 3472 lockdep_assert_held(&get_mm_list(lruvec_memcg(lruvec))->lock); 3473 3474 if (walk) { 3475 hist = lru_hist_from_seq(walk->max_seq); 3476 3477 for (i = 0; i < NR_MM_STATS; i++) { 3478 WRITE_ONCE(lruvec->mm_state.stats[hist][i], 3479 lruvec->mm_state.stats[hist][i] + walk->mm_stats[i]); 3480 walk->mm_stats[i] = 0; 3481 } 3482 } 3483 3484 if (NR_HIST_GENS > 1 && last) { 3485 hist = lru_hist_from_seq(lruvec->mm_state.seq + 1); 3486 3487 for (i = 0; i < NR_MM_STATS; i++) 3488 WRITE_ONCE(lruvec->mm_state.stats[hist][i], 0); 3489 } 3490 } 3491 3492 static bool should_skip_mm(struct mm_struct *mm, struct lru_gen_mm_walk *walk) 3493 { 3494 int type; 3495 unsigned long size = 0; 3496 struct pglist_data *pgdat = lruvec_pgdat(walk->lruvec); 3497 int key = pgdat->node_id % BITS_PER_TYPE(mm->lru_gen.bitmap); 3498 3499 if (!walk->force_scan && !test_bit(key, &mm->lru_gen.bitmap)) 3500 return true; 3501 3502 clear_bit(key, &mm->lru_gen.bitmap); 3503 3504 for (type = !walk->can_swap; type < ANON_AND_FILE; type++) { 3505 size += type ? get_mm_counter(mm, MM_FILEPAGES) : 3506 get_mm_counter(mm, MM_ANONPAGES) + 3507 get_mm_counter(mm, MM_SHMEMPAGES); 3508 } 3509 3510 if (size < MIN_LRU_BATCH) 3511 return true; 3512 3513 return !mmget_not_zero(mm); 3514 } 3515 3516 static bool iterate_mm_list(struct lruvec *lruvec, struct lru_gen_mm_walk *walk, 3517 struct mm_struct **iter) 3518 { 3519 bool first = false; 3520 bool last = true; 3521 struct mm_struct *mm = NULL; 3522 struct mem_cgroup *memcg = lruvec_memcg(lruvec); 3523 struct lru_gen_mm_list *mm_list = get_mm_list(memcg); 3524 struct lru_gen_mm_state *mm_state = &lruvec->mm_state; 3525 3526 /* 3527 * There are four interesting cases for this page table walker: 3528 * 1. It tries to start a new iteration of mm_list with a stale max_seq; 3529 * there is nothing left to do. 3530 * 2. 
It's the first of the current generation, and it needs to reset 3531 * the Bloom filter for the next generation. 3532 * 3. It reaches the end of mm_list, and it needs to increment 3533 * mm_state->seq; the iteration is done. 3534 * 4. It's the last of the current generation, and it needs to reset the 3535 * mm stats counters for the next generation. 3536 */ 3537 spin_lock(&mm_list->lock); 3538 3539 VM_WARN_ON_ONCE(mm_state->seq + 1 < walk->max_seq); 3540 VM_WARN_ON_ONCE(*iter && mm_state->seq > walk->max_seq); 3541 VM_WARN_ON_ONCE(*iter && !mm_state->nr_walkers); 3542 3543 if (walk->max_seq <= mm_state->seq) { 3544 if (!*iter) 3545 last = false; 3546 goto done; 3547 } 3548 3549 if (!mm_state->nr_walkers) { 3550 VM_WARN_ON_ONCE(mm_state->head && mm_state->head != &mm_list->fifo); 3551 3552 mm_state->head = mm_list->fifo.next; 3553 first = true; 3554 } 3555 3556 while (!mm && mm_state->head != &mm_list->fifo) { 3557 mm = list_entry(mm_state->head, struct mm_struct, lru_gen.list); 3558 3559 mm_state->head = mm_state->head->next; 3560 3561 /* force scan for those added after the last iteration */ 3562 if (!mm_state->tail || mm_state->tail == &mm->lru_gen.list) { 3563 mm_state->tail = mm_state->head; 3564 walk->force_scan = true; 3565 } 3566 3567 if (should_skip_mm(mm, walk)) 3568 mm = NULL; 3569 } 3570 3571 if (mm_state->head == &mm_list->fifo) 3572 WRITE_ONCE(mm_state->seq, mm_state->seq + 1); 3573 done: 3574 if (*iter && !mm) 3575 mm_state->nr_walkers--; 3576 if (!*iter && mm) 3577 mm_state->nr_walkers++; 3578 3579 if (mm_state->nr_walkers) 3580 last = false; 3581 3582 if (*iter || last) 3583 reset_mm_stats(lruvec, walk, last); 3584 3585 spin_unlock(&mm_list->lock); 3586 3587 if (mm && first) 3588 reset_bloom_filter(lruvec, walk->max_seq + 1); 3589 3590 if (*iter) 3591 mmput_async(*iter); 3592 3593 *iter = mm; 3594 3595 return last; 3596 } 3597 3598 static bool iterate_mm_list_nowalk(struct lruvec *lruvec, unsigned long max_seq) 3599 { 3600 bool success = false; 3601 struct mem_cgroup *memcg = lruvec_memcg(lruvec); 3602 struct lru_gen_mm_list *mm_list = get_mm_list(memcg); 3603 struct lru_gen_mm_state *mm_state = &lruvec->mm_state; 3604 3605 spin_lock(&mm_list->lock); 3606 3607 VM_WARN_ON_ONCE(mm_state->seq + 1 < max_seq); 3608 3609 if (max_seq > mm_state->seq && !mm_state->nr_walkers) { 3610 VM_WARN_ON_ONCE(mm_state->head && mm_state->head != &mm_list->fifo); 3611 3612 WRITE_ONCE(mm_state->seq, mm_state->seq + 1); 3613 reset_mm_stats(lruvec, NULL, true); 3614 success = true; 3615 } 3616 3617 spin_unlock(&mm_list->lock); 3618 3619 return success; 3620 } 3621 3622 /****************************************************************************** 3623 * PID controller 3624 ******************************************************************************/ 3625 3626 /* 3627 * A feedback loop based on Proportional-Integral-Derivative (PID) controller. 3628 * 3629 * The P term is refaulted/(evicted+protected) from a tier in the generation 3630 * currently being evicted; the I term is the exponential moving average of the 3631 * P term over the generations previously evicted, using the smoothing factor 3632 * 1/2; the D term isn't supported. 3633 * 3634 * The setpoint (SP) is always the first tier of one type; the process variable 3635 * (PV) is either any tier of the other type or any other tier of the same 3636 * type. 3637 * 3638 * The error is the difference between the SP and the PV; the correction is to 3639 * turn off protection when SP>PV or turn on protection when SP<PV. 
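 *
 * For illustration only (made-up numbers): if the SP tier shows
 * refaulted/total = 10/100 while a PV tier shows 30/100, then SP<PV and that
 * PV tier keeps its protection; if the PV tier instead shows 5/100, SP>PV and
 * its protection is turned off.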
3640 * 3641 * For future optimizations: 3642 * 1. The D term may discount the other two terms over time so that long-lived 3643 * generations can resist stale information. 3644 */ 3645 struct ctrl_pos { 3646 unsigned long refaulted; 3647 unsigned long total; 3648 int gain; 3649 }; 3650 3651 static void read_ctrl_pos(struct lruvec *lruvec, int type, int tier, int gain, 3652 struct ctrl_pos *pos) 3653 { 3654 struct lru_gen_folio *lrugen = &lruvec->lrugen; 3655 int hist = lru_hist_from_seq(lrugen->min_seq[type]); 3656 3657 pos->refaulted = lrugen->avg_refaulted[type][tier] + 3658 atomic_long_read(&lrugen->refaulted[hist][type][tier]); 3659 pos->total = lrugen->avg_total[type][tier] + 3660 atomic_long_read(&lrugen->evicted[hist][type][tier]); 3661 if (tier) 3662 pos->total += lrugen->protected[hist][type][tier - 1]; 3663 pos->gain = gain; 3664 } 3665 3666 static void reset_ctrl_pos(struct lruvec *lruvec, int type, bool carryover) 3667 { 3668 int hist, tier; 3669 struct lru_gen_folio *lrugen = &lruvec->lrugen; 3670 bool clear = carryover ? NR_HIST_GENS == 1 : NR_HIST_GENS > 1; 3671 unsigned long seq = carryover ? lrugen->min_seq[type] : lrugen->max_seq + 1; 3672 3673 lockdep_assert_held(&lruvec->lru_lock); 3674 3675 if (!carryover && !clear) 3676 return; 3677 3678 hist = lru_hist_from_seq(seq); 3679 3680 for (tier = 0; tier < MAX_NR_TIERS; tier++) { 3681 if (carryover) { 3682 unsigned long sum; 3683 3684 sum = lrugen->avg_refaulted[type][tier] + 3685 atomic_long_read(&lrugen->refaulted[hist][type][tier]); 3686 WRITE_ONCE(lrugen->avg_refaulted[type][tier], sum / 2); 3687 3688 sum = lrugen->avg_total[type][tier] + 3689 atomic_long_read(&lrugen->evicted[hist][type][tier]); 3690 if (tier) 3691 sum += lrugen->protected[hist][type][tier - 1]; 3692 WRITE_ONCE(lrugen->avg_total[type][tier], sum / 2); 3693 } 3694 3695 if (clear) { 3696 atomic_long_set(&lrugen->refaulted[hist][type][tier], 0); 3697 atomic_long_set(&lrugen->evicted[hist][type][tier], 0); 3698 if (tier) 3699 WRITE_ONCE(lrugen->protected[hist][type][tier - 1], 0); 3700 } 3701 } 3702 } 3703 3704 static bool positive_ctrl_err(struct ctrl_pos *sp, struct ctrl_pos *pv) 3705 { 3706 /* 3707 * Return true if the PV has a limited number of refaults or a lower 3708 * refaulted/total than the SP. 3709 */ 3710 return pv->refaulted < MIN_LRU_BATCH || 3711 pv->refaulted * (sp->total + MIN_LRU_BATCH) * sp->gain <= 3712 (sp->refaulted + 1) * pv->total * pv->gain; 3713 } 3714 3715 /****************************************************************************** 3716 * the aging 3717 ******************************************************************************/ 3718 3719 /* promote pages accessed through page tables */ 3720 static int folio_update_gen(struct folio *folio, int gen) 3721 { 3722 unsigned long new_flags, old_flags = READ_ONCE(folio->flags); 3723 3724 VM_WARN_ON_ONCE(gen >= MAX_NR_GENS); 3725 VM_WARN_ON_ONCE(!rcu_read_lock_held()); 3726 3727 do { 3728 /* lru_gen_del_folio() has isolated this page? 
*/ 3729 if (!(old_flags & LRU_GEN_MASK)) { 3730 /* for shrink_folio_list() */ 3731 new_flags = old_flags | BIT(PG_referenced); 3732 continue; 3733 } 3734 3735 new_flags = old_flags & ~(LRU_GEN_MASK | LRU_REFS_MASK | LRU_REFS_FLAGS); 3736 new_flags |= (gen + 1UL) << LRU_GEN_PGOFF; 3737 } while (!try_cmpxchg(&folio->flags, &old_flags, new_flags)); 3738 3739 return ((old_flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1; 3740 } 3741 3742 /* protect pages accessed multiple times through file descriptors */ 3743 static int folio_inc_gen(struct lruvec *lruvec, struct folio *folio, bool reclaiming) 3744 { 3745 int type = folio_is_file_lru(folio); 3746 struct lru_gen_folio *lrugen = &lruvec->lrugen; 3747 int new_gen, old_gen = lru_gen_from_seq(lrugen->min_seq[type]); 3748 unsigned long new_flags, old_flags = READ_ONCE(folio->flags); 3749 3750 VM_WARN_ON_ONCE_FOLIO(!(old_flags & LRU_GEN_MASK), folio); 3751 3752 do { 3753 new_gen = ((old_flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1; 3754 /* folio_update_gen() has promoted this page? */ 3755 if (new_gen >= 0 && new_gen != old_gen) 3756 return new_gen; 3757 3758 new_gen = (old_gen + 1) % MAX_NR_GENS; 3759 3760 new_flags = old_flags & ~(LRU_GEN_MASK | LRU_REFS_MASK | LRU_REFS_FLAGS); 3761 new_flags |= (new_gen + 1UL) << LRU_GEN_PGOFF; 3762 /* for folio_end_writeback() */ 3763 if (reclaiming) 3764 new_flags |= BIT(PG_reclaim); 3765 } while (!try_cmpxchg(&folio->flags, &old_flags, new_flags)); 3766 3767 lru_gen_update_size(lruvec, folio, old_gen, new_gen); 3768 3769 return new_gen; 3770 } 3771 3772 static void update_batch_size(struct lru_gen_mm_walk *walk, struct folio *folio, 3773 int old_gen, int new_gen) 3774 { 3775 int type = folio_is_file_lru(folio); 3776 int zone = folio_zonenum(folio); 3777 int delta = folio_nr_pages(folio); 3778 3779 VM_WARN_ON_ONCE(old_gen >= MAX_NR_GENS); 3780 VM_WARN_ON_ONCE(new_gen >= MAX_NR_GENS); 3781 3782 walk->batched++; 3783 3784 walk->nr_pages[old_gen][type][zone] -= delta; 3785 walk->nr_pages[new_gen][type][zone] += delta; 3786 } 3787 3788 static void reset_batch_size(struct lruvec *lruvec, struct lru_gen_mm_walk *walk) 3789 { 3790 int gen, type, zone; 3791 struct lru_gen_folio *lrugen = &lruvec->lrugen; 3792 3793 walk->batched = 0; 3794 3795 for_each_gen_type_zone(gen, type, zone) { 3796 enum lru_list lru = type * LRU_INACTIVE_FILE; 3797 int delta = walk->nr_pages[gen][type][zone]; 3798 3799 if (!delta) 3800 continue; 3801 3802 walk->nr_pages[gen][type][zone] = 0; 3803 WRITE_ONCE(lrugen->nr_pages[gen][type][zone], 3804 lrugen->nr_pages[gen][type][zone] + delta); 3805 3806 if (lru_gen_is_active(lruvec, gen)) 3807 lru += LRU_ACTIVE; 3808 __update_lru_size(lruvec, lru, zone, delta); 3809 } 3810 } 3811 3812 static int should_skip_vma(unsigned long start, unsigned long end, struct mm_walk *args) 3813 { 3814 struct address_space *mapping; 3815 struct vm_area_struct *vma = args->vma; 3816 struct lru_gen_mm_walk *walk = args->private; 3817 3818 if (!vma_is_accessible(vma)) 3819 return true; 3820 3821 if (is_vm_hugetlb_page(vma)) 3822 return true; 3823 3824 if (!vma_has_recency(vma)) 3825 return true; 3826 3827 if (vma->vm_flags & (VM_LOCKED | VM_SPECIAL)) 3828 return true; 3829 3830 if (vma == get_gate_vma(vma->vm_mm)) 3831 return true; 3832 3833 if (vma_is_anonymous(vma)) 3834 return !walk->can_swap; 3835 3836 if (WARN_ON_ONCE(!vma->vm_file || !vma->vm_file->f_mapping)) 3837 return true; 3838 3839 mapping = vma->vm_file->f_mapping; 3840 if (mapping_unevictable(mapping)) 3841 return true; 3842 3843 if (shmem_mapping(mapping)) 3844 
return !walk->can_swap; 3845 3846 /* to exclude special mappings like dax, etc. */ 3847 return !mapping->a_ops->read_folio; 3848 } 3849 3850 /* 3851 * Some userspace memory allocators map many single-page VMAs. Instead of 3852 * returning back to the PGD table for each of such VMAs, finish an entire PMD 3853 * table to reduce zigzags and improve cache performance. 3854 */ 3855 static bool get_next_vma(unsigned long mask, unsigned long size, struct mm_walk *args, 3856 unsigned long *vm_start, unsigned long *vm_end) 3857 { 3858 unsigned long start = round_up(*vm_end, size); 3859 unsigned long end = (start | ~mask) + 1; 3860 VMA_ITERATOR(vmi, args->mm, start); 3861 3862 VM_WARN_ON_ONCE(mask & size); 3863 VM_WARN_ON_ONCE((start & mask) != (*vm_start & mask)); 3864 3865 for_each_vma(vmi, args->vma) { 3866 if (end && end <= args->vma->vm_start) 3867 return false; 3868 3869 if (should_skip_vma(args->vma->vm_start, args->vma->vm_end, args)) 3870 continue; 3871 3872 *vm_start = max(start, args->vma->vm_start); 3873 *vm_end = min(end - 1, args->vma->vm_end - 1) + 1; 3874 3875 return true; 3876 } 3877 3878 return false; 3879 } 3880 3881 static unsigned long get_pte_pfn(pte_t pte, struct vm_area_struct *vma, unsigned long addr) 3882 { 3883 unsigned long pfn = pte_pfn(pte); 3884 3885 VM_WARN_ON_ONCE(addr < vma->vm_start || addr >= vma->vm_end); 3886 3887 if (!pte_present(pte) || is_zero_pfn(pfn)) 3888 return -1; 3889 3890 if (WARN_ON_ONCE(pte_devmap(pte) || pte_special(pte))) 3891 return -1; 3892 3893 if (WARN_ON_ONCE(!pfn_valid(pfn))) 3894 return -1; 3895 3896 return pfn; 3897 } 3898 3899 #if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG) 3900 static unsigned long get_pmd_pfn(pmd_t pmd, struct vm_area_struct *vma, unsigned long addr) 3901 { 3902 unsigned long pfn = pmd_pfn(pmd); 3903 3904 VM_WARN_ON_ONCE(addr < vma->vm_start || addr >= vma->vm_end); 3905 3906 if (!pmd_present(pmd) || is_huge_zero_pmd(pmd)) 3907 return -1; 3908 3909 if (WARN_ON_ONCE(pmd_devmap(pmd))) 3910 return -1; 3911 3912 if (WARN_ON_ONCE(!pfn_valid(pfn))) 3913 return -1; 3914 3915 return pfn; 3916 } 3917 #endif 3918 3919 static struct folio *get_pfn_folio(unsigned long pfn, struct mem_cgroup *memcg, 3920 struct pglist_data *pgdat, bool can_swap) 3921 { 3922 struct folio *folio; 3923 3924 /* try to avoid unnecessary memory loads */ 3925 if (pfn < pgdat->node_start_pfn || pfn >= pgdat_end_pfn(pgdat)) 3926 return NULL; 3927 3928 folio = pfn_folio(pfn); 3929 if (folio_nid(folio) != pgdat->node_id) 3930 return NULL; 3931 3932 if (folio_memcg_rcu(folio) != memcg) 3933 return NULL; 3934 3935 /* file VMAs can contain anon pages from COW */ 3936 if (!folio_is_file_lru(folio) && !can_swap) 3937 return NULL; 3938 3939 return folio; 3940 } 3941 3942 static bool suitable_to_scan(int total, int young) 3943 { 3944 int n = clamp_t(int, cache_line_size() / sizeof(pte_t), 2, 8); 3945 3946 /* suitable if the average number of young PTEs per cacheline is >=1 */ 3947 return young * n >= total; 3948 } 3949 3950 static bool walk_pte_range(pmd_t *pmd, unsigned long start, unsigned long end, 3951 struct mm_walk *args) 3952 { 3953 int i; 3954 pte_t *pte; 3955 spinlock_t *ptl; 3956 unsigned long addr; 3957 int total = 0; 3958 int young = 0; 3959 struct lru_gen_mm_walk *walk = args->private; 3960 struct mem_cgroup *memcg = lruvec_memcg(walk->lruvec); 3961 struct pglist_data *pgdat = lruvec_pgdat(walk->lruvec); 3962 int old_gen, new_gen = lru_gen_from_seq(walk->max_seq); 3963 3964 VM_WARN_ON_ONCE(pmd_leaf(*pmd)); 3965 3966 
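/*
 * Note: the PTE lock is only trylocked below; if it is contended, this
 * PTE table is skipped rather than stalling the walk, and the return
 * value tells walk_pmd_range() not to add it to the Bloom filter for
 * the next generation.
 */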
ptl = pte_lockptr(args->mm, pmd); 3967 if (!spin_trylock(ptl)) 3968 return false; 3969 3970 arch_enter_lazy_mmu_mode(); 3971 3972 pte = pte_offset_map(pmd, start & PMD_MASK); 3973 restart: 3974 for (i = pte_index(start), addr = start; addr != end; i++, addr += PAGE_SIZE) { 3975 unsigned long pfn; 3976 struct folio *folio; 3977 3978 total++; 3979 walk->mm_stats[MM_LEAF_TOTAL]++; 3980 3981 pfn = get_pte_pfn(pte[i], args->vma, addr); 3982 if (pfn == -1) 3983 continue; 3984 3985 if (!pte_young(pte[i])) { 3986 walk->mm_stats[MM_LEAF_OLD]++; 3987 continue; 3988 } 3989 3990 folio = get_pfn_folio(pfn, memcg, pgdat, walk->can_swap); 3991 if (!folio) 3992 continue; 3993 3994 if (!ptep_test_and_clear_young(args->vma, addr, pte + i)) 3995 VM_WARN_ON_ONCE(true); 3996 3997 young++; 3998 walk->mm_stats[MM_LEAF_YOUNG]++; 3999 4000 if (pte_dirty(pte[i]) && !folio_test_dirty(folio) && 4001 !(folio_test_anon(folio) && folio_test_swapbacked(folio) && 4002 !folio_test_swapcache(folio))) 4003 folio_mark_dirty(folio); 4004 4005 old_gen = folio_update_gen(folio, new_gen); 4006 if (old_gen >= 0 && old_gen != new_gen) 4007 update_batch_size(walk, folio, old_gen, new_gen); 4008 } 4009 4010 if (i < PTRS_PER_PTE && get_next_vma(PMD_MASK, PAGE_SIZE, args, &start, &end)) 4011 goto restart; 4012 4013 pte_unmap(pte); 4014 4015 arch_leave_lazy_mmu_mode(); 4016 spin_unlock(ptl); 4017 4018 return suitable_to_scan(total, young); 4019 } 4020 4021 #if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG) 4022 static void walk_pmd_range_locked(pud_t *pud, unsigned long addr, struct vm_area_struct *vma, 4023 struct mm_walk *args, unsigned long *bitmap, unsigned long *first) 4024 { 4025 int i; 4026 pmd_t *pmd; 4027 spinlock_t *ptl; 4028 struct lru_gen_mm_walk *walk = args->private; 4029 struct mem_cgroup *memcg = lruvec_memcg(walk->lruvec); 4030 struct pglist_data *pgdat = lruvec_pgdat(walk->lruvec); 4031 int old_gen, new_gen = lru_gen_from_seq(walk->max_seq); 4032 4033 VM_WARN_ON_ONCE(pud_leaf(*pud)); 4034 4035 /* try to batch at most 1+MIN_LRU_BATCH+1 entries */ 4036 if (*first == -1) { 4037 *first = addr; 4038 bitmap_zero(bitmap, MIN_LRU_BATCH); 4039 return; 4040 } 4041 4042 i = addr == -1 ? 0 : pmd_index(addr) - pmd_index(*first); 4043 if (i && i <= MIN_LRU_BATCH) { 4044 __set_bit(i - 1, bitmap); 4045 return; 4046 } 4047 4048 pmd = pmd_offset(pud, *first); 4049 4050 ptl = pmd_lockptr(args->mm, pmd); 4051 if (!spin_trylock(ptl)) 4052 goto done; 4053 4054 arch_enter_lazy_mmu_mode(); 4055 4056 do { 4057 unsigned long pfn; 4058 struct folio *folio; 4059 4060 /* don't round down the first address */ 4061 addr = i ? 
(*first & PMD_MASK) + i * PMD_SIZE : *first; 4062 4063 pfn = get_pmd_pfn(pmd[i], vma, addr); 4064 if (pfn == -1) 4065 goto next; 4066 4067 if (!pmd_trans_huge(pmd[i])) { 4068 if (arch_has_hw_nonleaf_pmd_young() && get_cap(LRU_GEN_NONLEAF_YOUNG)) 4069 pmdp_test_and_clear_young(vma, addr, pmd + i); 4070 goto next; 4071 } 4072 4073 folio = get_pfn_folio(pfn, memcg, pgdat, walk->can_swap); 4074 if (!folio) 4075 goto next; 4076 4077 if (!pmdp_test_and_clear_young(vma, addr, pmd + i)) 4078 goto next; 4079 4080 walk->mm_stats[MM_LEAF_YOUNG]++; 4081 4082 if (pmd_dirty(pmd[i]) && !folio_test_dirty(folio) && 4083 !(folio_test_anon(folio) && folio_test_swapbacked(folio) && 4084 !folio_test_swapcache(folio))) 4085 folio_mark_dirty(folio); 4086 4087 old_gen = folio_update_gen(folio, new_gen); 4088 if (old_gen >= 0 && old_gen != new_gen) 4089 update_batch_size(walk, folio, old_gen, new_gen); 4090 next: 4091 i = i > MIN_LRU_BATCH ? 0 : find_next_bit(bitmap, MIN_LRU_BATCH, i) + 1; 4092 } while (i <= MIN_LRU_BATCH); 4093 4094 arch_leave_lazy_mmu_mode(); 4095 spin_unlock(ptl); 4096 done: 4097 *first = -1; 4098 } 4099 #else 4100 static void walk_pmd_range_locked(pud_t *pud, unsigned long addr, struct vm_area_struct *vma, 4101 struct mm_walk *args, unsigned long *bitmap, unsigned long *first) 4102 { 4103 } 4104 #endif 4105 4106 static void walk_pmd_range(pud_t *pud, unsigned long start, unsigned long end, 4107 struct mm_walk *args) 4108 { 4109 int i; 4110 pmd_t *pmd; 4111 unsigned long next; 4112 unsigned long addr; 4113 struct vm_area_struct *vma; 4114 unsigned long bitmap[BITS_TO_LONGS(MIN_LRU_BATCH)]; 4115 unsigned long first = -1; 4116 struct lru_gen_mm_walk *walk = args->private; 4117 4118 VM_WARN_ON_ONCE(pud_leaf(*pud)); 4119 4120 /* 4121 * Finish an entire PMD in two passes: the first only reaches to PTE 4122 * tables to avoid taking the PMD lock; the second, if necessary, takes 4123 * the PMD lock to clear the accessed bit in PMD entries. 
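 *
 * Concretely, the first pass only records candidate PMD entries in the
 * bitmap keyed off *first (at most 1+MIN_LRU_BATCH+1, see
 * walk_pmd_range_locked()); once that batch is full, or at the final
 * flush call below with addr == -1, walk_pmd_range_locked() takes the
 * PMD lock, clears the accessed bits and, for transparent huge pages,
 * promotes the mapped folios.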
4124 */ 4125 pmd = pmd_offset(pud, start & PUD_MASK); 4126 restart: 4127 /* walk_pte_range() may call get_next_vma() */ 4128 vma = args->vma; 4129 for (i = pmd_index(start), addr = start; addr != end; i++, addr = next) { 4130 pmd_t val = pmdp_get_lockless(pmd + i); 4131 4132 next = pmd_addr_end(addr, end); 4133 4134 if (!pmd_present(val) || is_huge_zero_pmd(val)) { 4135 walk->mm_stats[MM_LEAF_TOTAL]++; 4136 continue; 4137 } 4138 4139 #ifdef CONFIG_TRANSPARENT_HUGEPAGE 4140 if (pmd_trans_huge(val)) { 4141 unsigned long pfn = pmd_pfn(val); 4142 struct pglist_data *pgdat = lruvec_pgdat(walk->lruvec); 4143 4144 walk->mm_stats[MM_LEAF_TOTAL]++; 4145 4146 if (!pmd_young(val)) { 4147 walk->mm_stats[MM_LEAF_OLD]++; 4148 continue; 4149 } 4150 4151 /* try to avoid unnecessary memory loads */ 4152 if (pfn < pgdat->node_start_pfn || pfn >= pgdat_end_pfn(pgdat)) 4153 continue; 4154 4155 walk_pmd_range_locked(pud, addr, vma, args, bitmap, &first); 4156 continue; 4157 } 4158 #endif 4159 walk->mm_stats[MM_NONLEAF_TOTAL]++; 4160 4161 if (arch_has_hw_nonleaf_pmd_young() && get_cap(LRU_GEN_NONLEAF_YOUNG)) { 4162 if (!pmd_young(val)) 4163 continue; 4164 4165 walk_pmd_range_locked(pud, addr, vma, args, bitmap, &first); 4166 } 4167 4168 if (!walk->force_scan && !test_bloom_filter(walk->lruvec, walk->max_seq, pmd + i)) 4169 continue; 4170 4171 walk->mm_stats[MM_NONLEAF_FOUND]++; 4172 4173 if (!walk_pte_range(&val, addr, next, args)) 4174 continue; 4175 4176 walk->mm_stats[MM_NONLEAF_ADDED]++; 4177 4178 /* carry over to the next generation */ 4179 update_bloom_filter(walk->lruvec, walk->max_seq + 1, pmd + i); 4180 } 4181 4182 walk_pmd_range_locked(pud, -1, vma, args, bitmap, &first); 4183 4184 if (i < PTRS_PER_PMD && get_next_vma(PUD_MASK, PMD_SIZE, args, &start, &end)) 4185 goto restart; 4186 } 4187 4188 static int walk_pud_range(p4d_t *p4d, unsigned long start, unsigned long end, 4189 struct mm_walk *args) 4190 { 4191 int i; 4192 pud_t *pud; 4193 unsigned long addr; 4194 unsigned long next; 4195 struct lru_gen_mm_walk *walk = args->private; 4196 4197 VM_WARN_ON_ONCE(p4d_leaf(*p4d)); 4198 4199 pud = pud_offset(p4d, start & P4D_MASK); 4200 restart: 4201 for (i = pud_index(start), addr = start; addr != end; i++, addr = next) { 4202 pud_t val = READ_ONCE(pud[i]); 4203 4204 next = pud_addr_end(addr, end); 4205 4206 if (!pud_present(val) || WARN_ON_ONCE(pud_leaf(val))) 4207 continue; 4208 4209 walk_pmd_range(&val, addr, next, args); 4210 4211 /* a racy check to curtail the waiting time */ 4212 if (wq_has_sleeper(&walk->lruvec->mm_state.wait)) 4213 return 1; 4214 4215 if (need_resched() || walk->batched >= MAX_LRU_BATCH) { 4216 end = (addr | ~PUD_MASK) + 1; 4217 goto done; 4218 } 4219 } 4220 4221 if (i < PTRS_PER_PUD && get_next_vma(P4D_MASK, PUD_SIZE, args, &start, &end)) 4222 goto restart; 4223 4224 end = round_up(end, P4D_SIZE); 4225 done: 4226 if (!end || !args->vma) 4227 return 1; 4228 4229 walk->next_addr = max(end, args->vma->vm_start); 4230 4231 return -EAGAIN; 4232 } 4233 4234 static void walk_mm(struct lruvec *lruvec, struct mm_struct *mm, struct lru_gen_mm_walk *walk) 4235 { 4236 static const struct mm_walk_ops mm_walk_ops = { 4237 .test_walk = should_skip_vma, 4238 .p4d_entry = walk_pud_range, 4239 }; 4240 4241 int err; 4242 struct mem_cgroup *memcg = lruvec_memcg(lruvec); 4243 4244 walk->next_addr = FIRST_USER_ADDRESS; 4245 4246 do { 4247 err = -EBUSY; 4248 4249 /* folio_update_gen() requires stable folio_memcg() */ 4250 if (!mem_cgroup_trylock_pages(memcg)) 4251 break; 4252 4253 /* the caller might be 
holding the lock for write */ 4254 if (mmap_read_trylock(mm)) { 4255 err = walk_page_range(mm, walk->next_addr, ULONG_MAX, &mm_walk_ops, walk); 4256 4257 mmap_read_unlock(mm); 4258 } 4259 4260 mem_cgroup_unlock_pages(); 4261 4262 if (walk->batched) { 4263 spin_lock_irq(&lruvec->lru_lock); 4264 reset_batch_size(lruvec, walk); 4265 spin_unlock_irq(&lruvec->lru_lock); 4266 } 4267 4268 cond_resched(); 4269 } while (err == -EAGAIN); 4270 } 4271 4272 static struct lru_gen_mm_walk *set_mm_walk(struct pglist_data *pgdat, bool force_alloc) 4273 { 4274 struct lru_gen_mm_walk *walk = current->reclaim_state->mm_walk; 4275 4276 if (pgdat && current_is_kswapd()) { 4277 VM_WARN_ON_ONCE(walk); 4278 4279 walk = &pgdat->mm_walk; 4280 } else if (!walk && force_alloc) { 4281 VM_WARN_ON_ONCE(current_is_kswapd()); 4282 4283 walk = kzalloc(sizeof(*walk), __GFP_HIGH | __GFP_NOMEMALLOC | __GFP_NOWARN); 4284 } 4285 4286 current->reclaim_state->mm_walk = walk; 4287 4288 return walk; 4289 } 4290 4291 static void clear_mm_walk(void) 4292 { 4293 struct lru_gen_mm_walk *walk = current->reclaim_state->mm_walk; 4294 4295 VM_WARN_ON_ONCE(walk && memchr_inv(walk->nr_pages, 0, sizeof(walk->nr_pages))); 4296 VM_WARN_ON_ONCE(walk && memchr_inv(walk->mm_stats, 0, sizeof(walk->mm_stats))); 4297 4298 current->reclaim_state->mm_walk = NULL; 4299 4300 if (!current_is_kswapd()) 4301 kfree(walk); 4302 } 4303 4304 static bool inc_min_seq(struct lruvec *lruvec, int type, bool can_swap) 4305 { 4306 int zone; 4307 int remaining = MAX_LRU_BATCH; 4308 struct lru_gen_folio *lrugen = &lruvec->lrugen; 4309 int new_gen, old_gen = lru_gen_from_seq(lrugen->min_seq[type]); 4310 4311 if (type == LRU_GEN_ANON && !can_swap) 4312 goto done; 4313 4314 /* prevent cold/hot inversion if force_scan is true */ 4315 for (zone = 0; zone < MAX_NR_ZONES; zone++) { 4316 struct list_head *head = &lrugen->folios[old_gen][type][zone]; 4317 4318 while (!list_empty(head)) { 4319 struct folio *folio = lru_to_folio(head); 4320 4321 VM_WARN_ON_ONCE_FOLIO(folio_test_unevictable(folio), folio); 4322 VM_WARN_ON_ONCE_FOLIO(folio_test_active(folio), folio); 4323 VM_WARN_ON_ONCE_FOLIO(folio_is_file_lru(folio) != type, folio); 4324 VM_WARN_ON_ONCE_FOLIO(folio_zonenum(folio) != zone, folio); 4325 4326 new_gen = folio_inc_gen(lruvec, folio, false); 4327 list_move_tail(&folio->lru, &lrugen->folios[new_gen][type][zone]); 4328 4329 if (!--remaining) 4330 return false; 4331 } 4332 } 4333 done: 4334 reset_ctrl_pos(lruvec, type, true); 4335 WRITE_ONCE(lrugen->min_seq[type], lrugen->min_seq[type] + 1); 4336 4337 return true; 4338 } 4339 4340 static bool try_to_inc_min_seq(struct lruvec *lruvec, bool can_swap) 4341 { 4342 int gen, type, zone; 4343 bool success = false; 4344 struct lru_gen_folio *lrugen = &lruvec->lrugen; 4345 DEFINE_MIN_SEQ(lruvec); 4346 4347 VM_WARN_ON_ONCE(!seq_is_valid(lruvec)); 4348 4349 /* find the oldest populated generation */ 4350 for (type = !can_swap; type < ANON_AND_FILE; type++) { 4351 while (min_seq[type] + MIN_NR_GENS <= lrugen->max_seq) { 4352 gen = lru_gen_from_seq(min_seq[type]); 4353 4354 for (zone = 0; zone < MAX_NR_ZONES; zone++) { 4355 if (!list_empty(&lrugen->folios[gen][type][zone])) 4356 goto next; 4357 } 4358 4359 min_seq[type]++; 4360 } 4361 next: 4362 ; 4363 } 4364 4365 /* see the comment on lru_gen_folio */ 4366 if (can_swap) { 4367 min_seq[LRU_GEN_ANON] = min(min_seq[LRU_GEN_ANON], min_seq[LRU_GEN_FILE]); 4368 min_seq[LRU_GEN_FILE] = max(min_seq[LRU_GEN_ANON], lrugen->min_seq[LRU_GEN_FILE]); 4369 } 4370 4371 for (type = !can_swap; type < 
ANON_AND_FILE; type++) { 4372 if (min_seq[type] == lrugen->min_seq[type]) 4373 continue; 4374 4375 reset_ctrl_pos(lruvec, type, true); 4376 WRITE_ONCE(lrugen->min_seq[type], min_seq[type]); 4377 success = true; 4378 } 4379 4380 return success; 4381 } 4382 4383 static void inc_max_seq(struct lruvec *lruvec, bool can_swap, bool force_scan) 4384 { 4385 int prev, next; 4386 int type, zone; 4387 struct lru_gen_folio *lrugen = &lruvec->lrugen; 4388 4389 spin_lock_irq(&lruvec->lru_lock); 4390 4391 VM_WARN_ON_ONCE(!seq_is_valid(lruvec)); 4392 4393 for (type = ANON_AND_FILE - 1; type >= 0; type--) { 4394 if (get_nr_gens(lruvec, type) != MAX_NR_GENS) 4395 continue; 4396 4397 VM_WARN_ON_ONCE(!force_scan && (type == LRU_GEN_FILE || can_swap)); 4398 4399 while (!inc_min_seq(lruvec, type, can_swap)) { 4400 spin_unlock_irq(&lruvec->lru_lock); 4401 cond_resched(); 4402 spin_lock_irq(&lruvec->lru_lock); 4403 } 4404 } 4405 4406 /* 4407 * Update the active/inactive LRU sizes for compatibility. Both sides of 4408 * the current max_seq need to be covered, since max_seq+1 can overlap 4409 * with min_seq[LRU_GEN_ANON] if swapping is constrained. And if they do 4410 * overlap, cold/hot inversion happens. 4411 */ 4412 prev = lru_gen_from_seq(lrugen->max_seq - 1); 4413 next = lru_gen_from_seq(lrugen->max_seq + 1); 4414 4415 for (type = 0; type < ANON_AND_FILE; type++) { 4416 for (zone = 0; zone < MAX_NR_ZONES; zone++) { 4417 enum lru_list lru = type * LRU_INACTIVE_FILE; 4418 long delta = lrugen->nr_pages[prev][type][zone] - 4419 lrugen->nr_pages[next][type][zone]; 4420 4421 if (!delta) 4422 continue; 4423 4424 __update_lru_size(lruvec, lru, zone, delta); 4425 __update_lru_size(lruvec, lru + LRU_ACTIVE, zone, -delta); 4426 } 4427 } 4428 4429 for (type = 0; type < ANON_AND_FILE; type++) 4430 reset_ctrl_pos(lruvec, type, false); 4431 4432 WRITE_ONCE(lrugen->timestamps[next], jiffies); 4433 /* make sure preceding modifications appear */ 4434 smp_store_release(&lrugen->max_seq, lrugen->max_seq + 1); 4435 4436 spin_unlock_irq(&lruvec->lru_lock); 4437 } 4438 4439 static bool try_to_inc_max_seq(struct lruvec *lruvec, unsigned long max_seq, 4440 struct scan_control *sc, bool can_swap, bool force_scan) 4441 { 4442 bool success; 4443 struct lru_gen_mm_walk *walk; 4444 struct mm_struct *mm = NULL; 4445 struct lru_gen_folio *lrugen = &lruvec->lrugen; 4446 4447 VM_WARN_ON_ONCE(max_seq > READ_ONCE(lrugen->max_seq)); 4448 4449 /* see the comment in iterate_mm_list() */ 4450 if (max_seq <= READ_ONCE(lruvec->mm_state.seq)) { 4451 success = false; 4452 goto done; 4453 } 4454 4455 /* 4456 * If the hardware doesn't automatically set the accessed bit, fallback 4457 * to lru_gen_look_around(), which only clears the accessed bit in a 4458 * handful of PTEs. Spreading the work out over a period of time usually 4459 * is less efficient, but it avoids bursty page faults. 
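 *
 * In that case the code below falls through to iterate_mm_list_nowalk(),
 * which advances mm_state->seq without walking any page tables; the
 * accessed bits are then sampled incrementally by lru_gen_look_around()
 * when shrink_folio_list() walks the rmap and finds a young PTE.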
4460 */ 4461 if (!arch_has_hw_pte_young() || !get_cap(LRU_GEN_MM_WALK)) { 4462 success = iterate_mm_list_nowalk(lruvec, max_seq); 4463 goto done; 4464 } 4465 4466 walk = set_mm_walk(NULL, true); 4467 if (!walk) { 4468 success = iterate_mm_list_nowalk(lruvec, max_seq); 4469 goto done; 4470 } 4471 4472 walk->lruvec = lruvec; 4473 walk->max_seq = max_seq; 4474 walk->can_swap = can_swap; 4475 walk->force_scan = force_scan; 4476 4477 do { 4478 success = iterate_mm_list(lruvec, walk, &mm); 4479 if (mm) 4480 walk_mm(lruvec, mm, walk); 4481 4482 cond_resched(); 4483 } while (mm); 4484 done: 4485 if (!success) { 4486 if (sc->priority <= DEF_PRIORITY - 2) 4487 wait_event_killable(lruvec->mm_state.wait, 4488 max_seq < READ_ONCE(lrugen->max_seq)); 4489 return false; 4490 } 4491 4492 VM_WARN_ON_ONCE(max_seq != READ_ONCE(lrugen->max_seq)); 4493 4494 inc_max_seq(lruvec, can_swap, force_scan); 4495 /* either this sees any waiters or they will see updated max_seq */ 4496 if (wq_has_sleeper(&lruvec->mm_state.wait)) 4497 wake_up_all(&lruvec->mm_state.wait); 4498 4499 return true; 4500 } 4501 4502 /****************************************************************************** 4503 * working set protection 4504 ******************************************************************************/ 4505 4506 static bool lruvec_is_sizable(struct lruvec *lruvec, struct scan_control *sc) 4507 { 4508 int gen, type, zone; 4509 unsigned long total = 0; 4510 bool can_swap = get_swappiness(lruvec, sc); 4511 struct lru_gen_folio *lrugen = &lruvec->lrugen; 4512 struct mem_cgroup *memcg = lruvec_memcg(lruvec); 4513 DEFINE_MAX_SEQ(lruvec); 4514 DEFINE_MIN_SEQ(lruvec); 4515 4516 for (type = !can_swap; type < ANON_AND_FILE; type++) { 4517 unsigned long seq; 4518 4519 for (seq = min_seq[type]; seq <= max_seq; seq++) { 4520 gen = lru_gen_from_seq(seq); 4521 4522 for (zone = 0; zone < MAX_NR_ZONES; zone++) 4523 total += max(READ_ONCE(lrugen->nr_pages[gen][type][zone]), 0L); 4524 } 4525 } 4526 4527 /* whether the size is big enough to be helpful */ 4528 return mem_cgroup_online(memcg) ? 
(total >> sc->priority) : total; 4529 } 4530 4531 static bool lruvec_is_reclaimable(struct lruvec *lruvec, struct scan_control *sc, 4532 unsigned long min_ttl) 4533 { 4534 int gen; 4535 unsigned long birth; 4536 struct mem_cgroup *memcg = lruvec_memcg(lruvec); 4537 DEFINE_MIN_SEQ(lruvec); 4538 4539 /* see the comment on lru_gen_folio */ 4540 gen = lru_gen_from_seq(min_seq[LRU_GEN_FILE]); 4541 birth = READ_ONCE(lruvec->lrugen.timestamps[gen]); 4542 4543 if (time_is_after_jiffies(birth + min_ttl)) 4544 return false; 4545 4546 if (!lruvec_is_sizable(lruvec, sc)) 4547 return false; 4548 4549 mem_cgroup_calculate_protection(NULL, memcg); 4550 4551 return !mem_cgroup_below_min(NULL, memcg); 4552 } 4553 4554 /* to protect the working set of the last N jiffies */ 4555 static unsigned long lru_gen_min_ttl __read_mostly; 4556 4557 static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc) 4558 { 4559 struct mem_cgroup *memcg; 4560 unsigned long min_ttl = READ_ONCE(lru_gen_min_ttl); 4561 4562 VM_WARN_ON_ONCE(!current_is_kswapd()); 4563 4564 /* check the order to exclude compaction-induced reclaim */ 4565 if (!min_ttl || sc->order || sc->priority == DEF_PRIORITY) 4566 return; 4567 4568 memcg = mem_cgroup_iter(NULL, NULL, NULL); 4569 do { 4570 struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat); 4571 4572 if (lruvec_is_reclaimable(lruvec, sc, min_ttl)) { 4573 mem_cgroup_iter_break(NULL, memcg); 4574 return; 4575 } 4576 4577 cond_resched(); 4578 } while ((memcg = mem_cgroup_iter(NULL, memcg, NULL))); 4579 4580 /* 4581 * The main goal is to OOM kill if every generation from all memcgs is 4582 * younger than min_ttl. However, another possibility is all memcgs are 4583 * either too small or below min. 4584 */ 4585 if (mutex_trylock(&oom_lock)) { 4586 struct oom_control oc = { 4587 .gfp_mask = sc->gfp_mask, 4588 }; 4589 4590 out_of_memory(&oc); 4591 4592 mutex_unlock(&oom_lock); 4593 } 4594 } 4595 4596 /****************************************************************************** 4597 * rmap/PT walk feedback 4598 ******************************************************************************/ 4599 4600 /* 4601 * This function exploits spatial locality when shrink_folio_list() walks the 4602 * rmap. It scans the adjacent PTEs of a young PTE and promotes hot pages. If 4603 * the scan was done cacheline efficiently, it adds the PMD entry pointing to 4604 * the PTE table to the Bloom filter. This forms a feedback loop between the 4605 * eviction and the aging. 4606 */ 4607 void lru_gen_look_around(struct page_vma_mapped_walk *pvmw) 4608 { 4609 int i; 4610 unsigned long start; 4611 unsigned long end; 4612 struct lru_gen_mm_walk *walk; 4613 int young = 0; 4614 pte_t *pte = pvmw->pte; 4615 unsigned long addr = pvmw->address; 4616 struct folio *folio = pfn_folio(pvmw->pfn); 4617 struct mem_cgroup *memcg = folio_memcg(folio); 4618 struct pglist_data *pgdat = folio_pgdat(folio); 4619 struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat); 4620 DEFINE_MAX_SEQ(lruvec); 4621 int old_gen, new_gen = lru_gen_from_seq(max_seq); 4622 4623 lockdep_assert_held(pvmw->ptl); 4624 VM_WARN_ON_ONCE_FOLIO(folio_test_lru(folio), folio); 4625 4626 if (spin_is_contended(pvmw->ptl)) 4627 return; 4628 4629 /* avoid taking the LRU lock under the PTL when possible */ 4630 walk = current->reclaim_state ? 
current->reclaim_state->mm_walk : NULL; 4631 4632 start = max(addr & PMD_MASK, pvmw->vma->vm_start); 4633 end = min(addr | ~PMD_MASK, pvmw->vma->vm_end - 1) + 1; 4634 4635 if (end - start > MIN_LRU_BATCH * PAGE_SIZE) { 4636 if (addr - start < MIN_LRU_BATCH * PAGE_SIZE / 2) 4637 end = start + MIN_LRU_BATCH * PAGE_SIZE; 4638 else if (end - addr < MIN_LRU_BATCH * PAGE_SIZE / 2) 4639 start = end - MIN_LRU_BATCH * PAGE_SIZE; 4640 else { 4641 start = addr - MIN_LRU_BATCH * PAGE_SIZE / 2; 4642 end = addr + MIN_LRU_BATCH * PAGE_SIZE / 2; 4643 } 4644 } 4645 4646 /* folio_update_gen() requires stable folio_memcg() */ 4647 if (!mem_cgroup_trylock_pages(memcg)) 4648 return; 4649 4650 arch_enter_lazy_mmu_mode(); 4651 4652 pte -= (addr - start) / PAGE_SIZE; 4653 4654 for (i = 0, addr = start; addr != end; i++, addr += PAGE_SIZE) { 4655 unsigned long pfn; 4656 4657 pfn = get_pte_pfn(pte[i], pvmw->vma, addr); 4658 if (pfn == -1) 4659 continue; 4660 4661 if (!pte_young(pte[i])) 4662 continue; 4663 4664 folio = get_pfn_folio(pfn, memcg, pgdat, !walk || walk->can_swap); 4665 if (!folio) 4666 continue; 4667 4668 if (!ptep_test_and_clear_young(pvmw->vma, addr, pte + i)) 4669 VM_WARN_ON_ONCE(true); 4670 4671 young++; 4672 4673 if (pte_dirty(pte[i]) && !folio_test_dirty(folio) && 4674 !(folio_test_anon(folio) && folio_test_swapbacked(folio) && 4675 !folio_test_swapcache(folio))) 4676 folio_mark_dirty(folio); 4677 4678 if (walk) { 4679 old_gen = folio_update_gen(folio, new_gen); 4680 if (old_gen >= 0 && old_gen != new_gen) 4681 update_batch_size(walk, folio, old_gen, new_gen); 4682 4683 continue; 4684 } 4685 4686 old_gen = folio_lru_gen(folio); 4687 if (old_gen < 0) 4688 folio_set_referenced(folio); 4689 else if (old_gen != new_gen) 4690 folio_activate(folio); 4691 } 4692 4693 arch_leave_lazy_mmu_mode(); 4694 mem_cgroup_unlock_pages(); 4695 4696 /* feedback from rmap walkers to page table walkers */ 4697 if (suitable_to_scan(i, young)) 4698 update_bloom_filter(lruvec, max_seq, pvmw->pmd); 4699 } 4700 4701 /****************************************************************************** 4702 * memcg LRU 4703 ******************************************************************************/ 4704 4705 /* see the comment on MEMCG_NR_GENS */ 4706 enum { 4707 MEMCG_LRU_NOP, 4708 MEMCG_LRU_HEAD, 4709 MEMCG_LRU_TAIL, 4710 MEMCG_LRU_OLD, 4711 MEMCG_LRU_YOUNG, 4712 }; 4713 4714 #ifdef CONFIG_MEMCG 4715 4716 static int lru_gen_memcg_seg(struct lruvec *lruvec) 4717 { 4718 return READ_ONCE(lruvec->lrugen.seg); 4719 } 4720 4721 static void lru_gen_rotate_memcg(struct lruvec *lruvec, int op) 4722 { 4723 int seg; 4724 int old, new; 4725 int bin = get_random_u32_below(MEMCG_NR_BINS); 4726 struct pglist_data *pgdat = lruvec_pgdat(lruvec); 4727 4728 spin_lock(&pgdat->memcg_lru.lock); 4729 4730 VM_WARN_ON_ONCE(hlist_nulls_unhashed(&lruvec->lrugen.list)); 4731 4732 seg = 0; 4733 new = old = lruvec->lrugen.gen; 4734 4735 /* see the comment on MEMCG_NR_GENS */ 4736 if (op == MEMCG_LRU_HEAD) 4737 seg = MEMCG_LRU_HEAD; 4738 else if (op == MEMCG_LRU_TAIL) 4739 seg = MEMCG_LRU_TAIL; 4740 else if (op == MEMCG_LRU_OLD) 4741 new = get_memcg_gen(pgdat->memcg_lru.seq); 4742 else if (op == MEMCG_LRU_YOUNG) 4743 new = get_memcg_gen(pgdat->memcg_lru.seq + 1); 4744 else 4745 VM_WARN_ON_ONCE(true); 4746 4747 hlist_nulls_del_rcu(&lruvec->lrugen.list); 4748 4749 if (op == MEMCG_LRU_HEAD || op == MEMCG_LRU_OLD) 4750 hlist_nulls_add_head_rcu(&lruvec->lrugen.list, &pgdat->memcg_lru.fifo[new][bin]); 4751 else 4752 hlist_nulls_add_tail_rcu(&lruvec->lrugen.list, 
&pgdat->memcg_lru.fifo[new][bin]); 4753 4754 pgdat->memcg_lru.nr_memcgs[old]--; 4755 pgdat->memcg_lru.nr_memcgs[new]++; 4756 4757 lruvec->lrugen.gen = new; 4758 WRITE_ONCE(lruvec->lrugen.seg, seg); 4759 4760 if (!pgdat->memcg_lru.nr_memcgs[old] && old == get_memcg_gen(pgdat->memcg_lru.seq)) 4761 WRITE_ONCE(pgdat->memcg_lru.seq, pgdat->memcg_lru.seq + 1); 4762 4763 spin_unlock(&pgdat->memcg_lru.lock); 4764 } 4765 4766 void lru_gen_online_memcg(struct mem_cgroup *memcg) 4767 { 4768 int gen; 4769 int nid; 4770 int bin = get_random_u32_below(MEMCG_NR_BINS); 4771 4772 for_each_node(nid) { 4773 struct pglist_data *pgdat = NODE_DATA(nid); 4774 struct lruvec *lruvec = get_lruvec(memcg, nid); 4775 4776 spin_lock(&pgdat->memcg_lru.lock); 4777 4778 VM_WARN_ON_ONCE(!hlist_nulls_unhashed(&lruvec->lrugen.list)); 4779 4780 gen = get_memcg_gen(pgdat->memcg_lru.seq); 4781 4782 hlist_nulls_add_tail_rcu(&lruvec->lrugen.list, &pgdat->memcg_lru.fifo[gen][bin]); 4783 pgdat->memcg_lru.nr_memcgs[gen]++; 4784 4785 lruvec->lrugen.gen = gen; 4786 4787 spin_unlock(&pgdat->memcg_lru.lock); 4788 } 4789 } 4790 4791 void lru_gen_offline_memcg(struct mem_cgroup *memcg) 4792 { 4793 int nid; 4794 4795 for_each_node(nid) { 4796 struct lruvec *lruvec = get_lruvec(memcg, nid); 4797 4798 lru_gen_rotate_memcg(lruvec, MEMCG_LRU_OLD); 4799 } 4800 } 4801 4802 void lru_gen_release_memcg(struct mem_cgroup *memcg) 4803 { 4804 int gen; 4805 int nid; 4806 4807 for_each_node(nid) { 4808 struct pglist_data *pgdat = NODE_DATA(nid); 4809 struct lruvec *lruvec = get_lruvec(memcg, nid); 4810 4811 spin_lock(&pgdat->memcg_lru.lock); 4812 4813 VM_WARN_ON_ONCE(hlist_nulls_unhashed(&lruvec->lrugen.list)); 4814 4815 gen = lruvec->lrugen.gen; 4816 4817 hlist_nulls_del_rcu(&lruvec->lrugen.list); 4818 pgdat->memcg_lru.nr_memcgs[gen]--; 4819 4820 if (!pgdat->memcg_lru.nr_memcgs[gen] && gen == get_memcg_gen(pgdat->memcg_lru.seq)) 4821 WRITE_ONCE(pgdat->memcg_lru.seq, pgdat->memcg_lru.seq + 1); 4822 4823 spin_unlock(&pgdat->memcg_lru.lock); 4824 } 4825 } 4826 4827 void lru_gen_soft_reclaim(struct lruvec *lruvec) 4828 { 4829 /* see the comment on MEMCG_NR_GENS */ 4830 if (lru_gen_memcg_seg(lruvec) != MEMCG_LRU_HEAD) 4831 lru_gen_rotate_memcg(lruvec, MEMCG_LRU_HEAD); 4832 } 4833 4834 #else /* !CONFIG_MEMCG */ 4835 4836 static int lru_gen_memcg_seg(struct lruvec *lruvec) 4837 { 4838 return 0; 4839 } 4840 4841 #endif 4842 4843 /****************************************************************************** 4844 * the eviction 4845 ******************************************************************************/ 4846 4847 static bool sort_folio(struct lruvec *lruvec, struct folio *folio, int tier_idx) 4848 { 4849 bool success; 4850 int gen = folio_lru_gen(folio); 4851 int type = folio_is_file_lru(folio); 4852 int zone = folio_zonenum(folio); 4853 int delta = folio_nr_pages(folio); 4854 int refs = folio_lru_refs(folio); 4855 int tier = lru_tier_from_refs(refs); 4856 struct lru_gen_folio *lrugen = &lruvec->lrugen; 4857 4858 VM_WARN_ON_ONCE_FOLIO(gen >= MAX_NR_GENS, folio); 4859 4860 /* unevictable */ 4861 if (!folio_evictable(folio)) { 4862 success = lru_gen_del_folio(lruvec, folio, true); 4863 VM_WARN_ON_ONCE_FOLIO(!success, folio); 4864 folio_set_unevictable(folio); 4865 lruvec_add_folio(lruvec, folio); 4866 __count_vm_events(UNEVICTABLE_PGCULLED, delta); 4867 return true; 4868 } 4869 4870 /* dirty lazyfree */ 4871 if (type == LRU_GEN_FILE && folio_test_anon(folio) && folio_test_dirty(folio)) { 4872 success = lru_gen_del_folio(lruvec, folio, true); 4873 
VM_WARN_ON_ONCE_FOLIO(!success, folio); 4874 folio_set_swapbacked(folio); 4875 lruvec_add_folio_tail(lruvec, folio); 4876 return true; 4877 } 4878 4879 /* promoted */ 4880 if (gen != lru_gen_from_seq(lrugen->min_seq[type])) { 4881 list_move(&folio->lru, &lrugen->folios[gen][type][zone]); 4882 return true; 4883 } 4884 4885 /* protected */ 4886 if (tier > tier_idx) { 4887 int hist = lru_hist_from_seq(lrugen->min_seq[type]); 4888 4889 gen = folio_inc_gen(lruvec, folio, false); 4890 list_move_tail(&folio->lru, &lrugen->folios[gen][type][zone]); 4891 4892 WRITE_ONCE(lrugen->protected[hist][type][tier - 1], 4893 lrugen->protected[hist][type][tier - 1] + delta); 4894 __mod_lruvec_state(lruvec, WORKINGSET_ACTIVATE_BASE + type, delta); 4895 return true; 4896 } 4897 4898 /* waiting for writeback */ 4899 if (folio_test_locked(folio) || folio_test_writeback(folio) || 4900 (type == LRU_GEN_FILE && folio_test_dirty(folio))) { 4901 gen = folio_inc_gen(lruvec, folio, true); 4902 list_move(&folio->lru, &lrugen->folios[gen][type][zone]); 4903 return true; 4904 } 4905 4906 return false; 4907 } 4908 4909 static bool isolate_folio(struct lruvec *lruvec, struct folio *folio, struct scan_control *sc) 4910 { 4911 bool success; 4912 4913 /* swapping inhibited */ 4914 if (!(sc->gfp_mask & __GFP_IO) && 4915 (folio_test_dirty(folio) || 4916 (folio_test_anon(folio) && !folio_test_swapcache(folio)))) 4917 return false; 4918 4919 /* raced with release_pages() */ 4920 if (!folio_try_get(folio)) 4921 return false; 4922 4923 /* raced with another isolation */ 4924 if (!folio_test_clear_lru(folio)) { 4925 folio_put(folio); 4926 return false; 4927 } 4928 4929 /* see the comment on MAX_NR_TIERS */ 4930 if (!folio_test_referenced(folio)) 4931 set_mask_bits(&folio->flags, LRU_REFS_MASK | LRU_REFS_FLAGS, 0); 4932 4933 /* for shrink_folio_list() */ 4934 folio_clear_reclaim(folio); 4935 folio_clear_referenced(folio); 4936 4937 success = lru_gen_del_folio(lruvec, folio, true); 4938 VM_WARN_ON_ONCE_FOLIO(!success, folio); 4939 4940 return true; 4941 } 4942 4943 static int scan_folios(struct lruvec *lruvec, struct scan_control *sc, 4944 int type, int tier, struct list_head *list) 4945 { 4946 int gen, zone; 4947 enum vm_event_item item; 4948 int sorted = 0; 4949 int scanned = 0; 4950 int isolated = 0; 4951 int remaining = MAX_LRU_BATCH; 4952 struct lru_gen_folio *lrugen = &lruvec->lrugen; 4953 struct mem_cgroup *memcg = lruvec_memcg(lruvec); 4954 4955 VM_WARN_ON_ONCE(!list_empty(list)); 4956 4957 if (get_nr_gens(lruvec, type) == MIN_NR_GENS) 4958 return 0; 4959 4960 gen = lru_gen_from_seq(lrugen->min_seq[type]); 4961 4962 for (zone = sc->reclaim_idx; zone >= 0; zone--) { 4963 LIST_HEAD(moved); 4964 int skipped = 0; 4965 struct list_head *head = &lrugen->folios[gen][type][zone]; 4966 4967 while (!list_empty(head)) { 4968 struct folio *folio = lru_to_folio(head); 4969 int delta = folio_nr_pages(folio); 4970 4971 VM_WARN_ON_ONCE_FOLIO(folio_test_unevictable(folio), folio); 4972 VM_WARN_ON_ONCE_FOLIO(folio_test_active(folio), folio); 4973 VM_WARN_ON_ONCE_FOLIO(folio_is_file_lru(folio) != type, folio); 4974 VM_WARN_ON_ONCE_FOLIO(folio_zonenum(folio) != zone, folio); 4975 4976 scanned += delta; 4977 4978 if (sort_folio(lruvec, folio, tier)) 4979 sorted += delta; 4980 else if (isolate_folio(lruvec, folio, sc)) { 4981 list_add(&folio->lru, list); 4982 isolated += delta; 4983 } else { 4984 list_move(&folio->lru, &moved); 4985 skipped += delta; 4986 } 4987 4988 if (!--remaining || max(isolated, skipped) >= MIN_LRU_BATCH) 4989 break; 4990 } 4991 
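/*
 * Folios that were neither sorted in place nor isolated above are
 * spliced back below and accounted as PGSCAN_SKIP for this zone.
 */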
4992 if (skipped) { 4993 list_splice(&moved, head); 4994 __count_zid_vm_events(PGSCAN_SKIP, zone, skipped); 4995 } 4996 4997 if (!remaining || isolated >= MIN_LRU_BATCH) 4998 break; 4999 } 5000 5001 item = PGSCAN_KSWAPD + reclaimer_offset(); 5002 if (!cgroup_reclaim(sc)) { 5003 __count_vm_events(item, isolated); 5004 __count_vm_events(PGREFILL, sorted); 5005 } 5006 __count_memcg_events(memcg, item, isolated); 5007 __count_memcg_events(memcg, PGREFILL, sorted); 5008 __count_vm_events(PGSCAN_ANON + type, isolated); 5009 5010 /* 5011 * There might not be eligible folios due to reclaim_idx. Check the 5012 * remaining to prevent livelock if it's not making progress. 5013 */ 5014 return isolated || !remaining ? scanned : 0; 5015 } 5016 5017 static int get_tier_idx(struct lruvec *lruvec, int type) 5018 { 5019 int tier; 5020 struct ctrl_pos sp, pv; 5021 5022 /* 5023 * To leave a margin for fluctuations, use a larger gain factor (1:2). 5024 * This value is chosen because any other tier would have at least twice 5025 * as many refaults as the first tier. 5026 */ 5027 read_ctrl_pos(lruvec, type, 0, 1, &sp); 5028 for (tier = 1; tier < MAX_NR_TIERS; tier++) { 5029 read_ctrl_pos(lruvec, type, tier, 2, &pv); 5030 if (!positive_ctrl_err(&sp, &pv)) 5031 break; 5032 } 5033 5034 return tier - 1; 5035 } 5036 5037 static int get_type_to_scan(struct lruvec *lruvec, int swappiness, int *tier_idx) 5038 { 5039 int type, tier; 5040 struct ctrl_pos sp, pv; 5041 int gain[ANON_AND_FILE] = { swappiness, 200 - swappiness }; 5042 5043 /* 5044 * Compare the first tier of anon with that of file to determine which 5045 * type to scan. Also need to compare other tiers of the selected type 5046 * with the first tier of the other type to determine the last tier (of 5047 * the selected type) to evict. 5048 */ 5049 read_ctrl_pos(lruvec, LRU_GEN_ANON, 0, gain[LRU_GEN_ANON], &sp); 5050 read_ctrl_pos(lruvec, LRU_GEN_FILE, 0, gain[LRU_GEN_FILE], &pv); 5051 type = positive_ctrl_err(&sp, &pv); 5052 5053 read_ctrl_pos(lruvec, !type, 0, gain[!type], &sp); 5054 for (tier = 1; tier < MAX_NR_TIERS; tier++) { 5055 read_ctrl_pos(lruvec, type, tier, gain[type], &pv); 5056 if (!positive_ctrl_err(&sp, &pv)) 5057 break; 5058 } 5059 5060 *tier_idx = tier - 1; 5061 5062 return type; 5063 } 5064 5065 static int isolate_folios(struct lruvec *lruvec, struct scan_control *sc, int swappiness, 5066 int *type_scanned, struct list_head *list) 5067 { 5068 int i; 5069 int type; 5070 int scanned; 5071 int tier = -1; 5072 DEFINE_MIN_SEQ(lruvec); 5073 5074 /* 5075 * Try to make the obvious choice first. When anon and file are both 5076 * available from the same generation, interpret swappiness 1 as file 5077 * first and 200 as anon first. 
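 *
 * For example, with swappiness == 1 and both min_seq equal, file folios
 * are scanned first, but the loop below still falls back to anon if
 * scan_folios() finds nothing eligible among them; with swappiness == 0,
 * anon is never considered.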
5078 */ 5079 if (!swappiness) 5080 type = LRU_GEN_FILE; 5081 else if (min_seq[LRU_GEN_ANON] < min_seq[LRU_GEN_FILE]) 5082 type = LRU_GEN_ANON; 5083 else if (swappiness == 1) 5084 type = LRU_GEN_FILE; 5085 else if (swappiness == 200) 5086 type = LRU_GEN_ANON; 5087 else 5088 type = get_type_to_scan(lruvec, swappiness, &tier); 5089 5090 for (i = !swappiness; i < ANON_AND_FILE; i++) { 5091 if (tier < 0) 5092 tier = get_tier_idx(lruvec, type); 5093 5094 scanned = scan_folios(lruvec, sc, type, tier, list); 5095 if (scanned) 5096 break; 5097 5098 type = !type; 5099 tier = -1; 5100 } 5101 5102 *type_scanned = type; 5103 5104 return scanned; 5105 } 5106 5107 static int evict_folios(struct lruvec *lruvec, struct scan_control *sc, int swappiness) 5108 { 5109 int type; 5110 int scanned; 5111 int reclaimed; 5112 LIST_HEAD(list); 5113 LIST_HEAD(clean); 5114 struct folio *folio; 5115 struct folio *next; 5116 enum vm_event_item item; 5117 struct reclaim_stat stat; 5118 struct lru_gen_mm_walk *walk; 5119 bool skip_retry = false; 5120 struct mem_cgroup *memcg = lruvec_memcg(lruvec); 5121 struct pglist_data *pgdat = lruvec_pgdat(lruvec); 5122 5123 spin_lock_irq(&lruvec->lru_lock); 5124 5125 scanned = isolate_folios(lruvec, sc, swappiness, &type, &list); 5126 5127 scanned += try_to_inc_min_seq(lruvec, swappiness); 5128 5129 if (get_nr_gens(lruvec, !swappiness) == MIN_NR_GENS) 5130 scanned = 0; 5131 5132 spin_unlock_irq(&lruvec->lru_lock); 5133 5134 if (list_empty(&list)) 5135 return scanned; 5136 retry: 5137 reclaimed = shrink_folio_list(&list, pgdat, sc, &stat, false); 5138 sc->nr_reclaimed += reclaimed; 5139 5140 list_for_each_entry_safe_reverse(folio, next, &list, lru) { 5141 if (!folio_evictable(folio)) { 5142 list_del(&folio->lru); 5143 folio_putback_lru(folio); 5144 continue; 5145 } 5146 5147 if (folio_test_reclaim(folio) && 5148 (folio_test_dirty(folio) || folio_test_writeback(folio))) { 5149 /* restore LRU_REFS_FLAGS cleared by isolate_folio() */ 5150 if (folio_test_workingset(folio)) 5151 folio_set_referenced(folio); 5152 continue; 5153 } 5154 5155 if (skip_retry || folio_test_active(folio) || folio_test_referenced(folio) || 5156 folio_mapped(folio) || folio_test_locked(folio) || 5157 folio_test_dirty(folio) || folio_test_writeback(folio)) { 5158 /* don't add rejected folios to the oldest generation */ 5159 set_mask_bits(&folio->flags, LRU_REFS_MASK | LRU_REFS_FLAGS, 5160 BIT(PG_active)); 5161 continue; 5162 } 5163 5164 /* retry folios that may have missed folio_rotate_reclaimable() */ 5165 list_move(&folio->lru, &clean); 5166 sc->nr_scanned -= folio_nr_pages(folio); 5167 } 5168 5169 spin_lock_irq(&lruvec->lru_lock); 5170 5171 move_folios_to_lru(lruvec, &list); 5172 5173 walk = current->reclaim_state->mm_walk; 5174 if (walk && walk->batched) 5175 reset_batch_size(lruvec, walk); 5176 5177 item = PGSTEAL_KSWAPD + reclaimer_offset(); 5178 if (!cgroup_reclaim(sc)) 5179 __count_vm_events(item, reclaimed); 5180 __count_memcg_events(memcg, item, reclaimed); 5181 __count_vm_events(PGSTEAL_ANON + type, reclaimed); 5182 5183 spin_unlock_irq(&lruvec->lru_lock); 5184 5185 mem_cgroup_uncharge_list(&list); 5186 free_unref_page_list(&list); 5187 5188 INIT_LIST_HEAD(&list); 5189 list_splice_init(&clean, &list); 5190 5191 if (!list_empty(&list)) { 5192 skip_retry = true; 5193 goto retry; 5194 } 5195 5196 return scanned; 5197 } 5198 5199 static bool should_run_aging(struct lruvec *lruvec, unsigned long max_seq, 5200 struct scan_control *sc, bool can_swap, unsigned long *nr_to_scan) 5201 { 5202 int gen, type, zone; 
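/*
 * For reference: "young" below accumulates the size of the generation at
 * max_seq, "old" that of the generation at max_seq - MIN_NR_GENS, and
 * "total" the sum over all generations from min_seq to max_seq.
 */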
5203 unsigned long old = 0; 5204 unsigned long young = 0; 5205 unsigned long total = 0; 5206 struct lru_gen_folio *lrugen = &lruvec->lrugen; 5207 struct mem_cgroup *memcg = lruvec_memcg(lruvec); 5208 DEFINE_MIN_SEQ(lruvec); 5209 5210 /* whether this lruvec is completely out of cold folios */ 5211 if (min_seq[!can_swap] + MIN_NR_GENS > max_seq) { 5212 *nr_to_scan = 0; 5213 return true; 5214 } 5215 5216 for (type = !can_swap; type < ANON_AND_FILE; type++) { 5217 unsigned long seq; 5218 5219 for (seq = min_seq[type]; seq <= max_seq; seq++) { 5220 unsigned long size = 0; 5221 5222 gen = lru_gen_from_seq(seq); 5223 5224 for (zone = 0; zone < MAX_NR_ZONES; zone++) 5225 size += max(READ_ONCE(lrugen->nr_pages[gen][type][zone]), 0L); 5226 5227 total += size; 5228 if (seq == max_seq) 5229 young += size; 5230 else if (seq + MIN_NR_GENS == max_seq) 5231 old += size; 5232 } 5233 } 5234 5235 /* try to scrape all its memory if this memcg was deleted */ 5236 *nr_to_scan = mem_cgroup_online(memcg) ? (total >> sc->priority) : total; 5237 5238 /* 5239 * The aging tries to be lazy to reduce the overhead, while the eviction 5240 * stalls when the number of generations reaches MIN_NR_GENS. Hence, the 5241 * ideal number of generations is MIN_NR_GENS+1. 5242 */ 5243 if (min_seq[!can_swap] + MIN_NR_GENS < max_seq) 5244 return false; 5245 5246 /* 5247 * It's also ideal to spread pages out evenly, i.e., 1/(MIN_NR_GENS+1) 5248 * of the total number of pages for each generation. A reasonable range 5249 * for this average portion is [1/MIN_NR_GENS, 1/(MIN_NR_GENS+2)]. The 5250 * aging cares about the upper bound of hot pages, while the eviction 5251 * cares about the lower bound of cold pages. 5252 */ 5253 if (young * MIN_NR_GENS > total) 5254 return true; 5255 if (old * (MIN_NR_GENS + 2) < total) 5256 return true; 5257 5258 return false; 5259 } 5260 5261 /* 5262 * For future optimizations: 5263 * 1. Defer try_to_inc_max_seq() to workqueues to reduce latency for memcg 5264 * reclaim. 5265 */ 5266 static long get_nr_to_scan(struct lruvec *lruvec, struct scan_control *sc, bool can_swap) 5267 { 5268 unsigned long nr_to_scan; 5269 struct mem_cgroup *memcg = lruvec_memcg(lruvec); 5270 DEFINE_MAX_SEQ(lruvec); 5271 5272 if (mem_cgroup_below_min(sc->target_mem_cgroup, memcg)) 5273 return 0; 5274 5275 if (!should_run_aging(lruvec, max_seq, sc, can_swap, &nr_to_scan)) 5276 return nr_to_scan; 5277 5278 /* skip the aging path at the default priority */ 5279 if (sc->priority == DEF_PRIORITY) 5280 return nr_to_scan; 5281 5282 /* skip this lruvec as it's low on cold folios */ 5283 return try_to_inc_max_seq(lruvec, max_seq, sc, can_swap, false) ? 
-1 : 0; 5284 } 5285 5286 static unsigned long get_nr_to_reclaim(struct scan_control *sc) 5287 { 5288 /* don't abort memcg reclaim to ensure fairness */ 5289 if (!global_reclaim(sc)) 5290 return -1; 5291 5292 return max(sc->nr_to_reclaim, compact_gap(sc->order)); 5293 } 5294 5295 static bool try_to_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) 5296 { 5297 long nr_to_scan; 5298 unsigned long scanned = 0; 5299 unsigned long nr_to_reclaim = get_nr_to_reclaim(sc); 5300 int swappiness = get_swappiness(lruvec, sc); 5301 5302 /* clean file folios are more likely to exist */ 5303 if (swappiness && !(sc->gfp_mask & __GFP_IO)) 5304 swappiness = 1; 5305 5306 while (true) { 5307 int delta; 5308 5309 nr_to_scan = get_nr_to_scan(lruvec, sc, swappiness); 5310 if (nr_to_scan <= 0) 5311 break; 5312 5313 delta = evict_folios(lruvec, sc, swappiness); 5314 if (!delta) 5315 break; 5316 5317 scanned += delta; 5318 if (scanned >= nr_to_scan) 5319 break; 5320 5321 if (sc->nr_reclaimed >= nr_to_reclaim) 5322 break; 5323 5324 cond_resched(); 5325 } 5326 5327 /* whether try_to_inc_max_seq() was successful */ 5328 return nr_to_scan < 0; 5329 } 5330 5331 static int shrink_one(struct lruvec *lruvec, struct scan_control *sc) 5332 { 5333 bool success; 5334 unsigned long scanned = sc->nr_scanned; 5335 unsigned long reclaimed = sc->nr_reclaimed; 5336 int seg = lru_gen_memcg_seg(lruvec); 5337 struct mem_cgroup *memcg = lruvec_memcg(lruvec); 5338 struct pglist_data *pgdat = lruvec_pgdat(lruvec); 5339 5340 /* see the comment on MEMCG_NR_GENS */ 5341 if (!lruvec_is_sizable(lruvec, sc)) 5342 return seg != MEMCG_LRU_TAIL ? MEMCG_LRU_TAIL : MEMCG_LRU_YOUNG; 5343 5344 mem_cgroup_calculate_protection(NULL, memcg); 5345 5346 if (mem_cgroup_below_min(NULL, memcg)) 5347 return MEMCG_LRU_YOUNG; 5348 5349 if (mem_cgroup_below_low(NULL, memcg)) { 5350 /* see the comment on MEMCG_NR_GENS */ 5351 if (seg != MEMCG_LRU_TAIL) 5352 return MEMCG_LRU_TAIL; 5353 5354 memcg_memory_event(memcg, MEMCG_LOW); 5355 } 5356 5357 success = try_to_shrink_lruvec(lruvec, sc); 5358 5359 shrink_slab(sc->gfp_mask, pgdat->node_id, memcg, sc->priority); 5360 5361 if (!sc->proactive) 5362 vmpressure(sc->gfp_mask, memcg, false, sc->nr_scanned - scanned, 5363 sc->nr_reclaimed - reclaimed); 5364 5365 sc->nr_reclaimed += current->reclaim_state->reclaimed_slab; 5366 current->reclaim_state->reclaimed_slab = 0; 5367 5368 return success ? 
MEMCG_LRU_YOUNG : 0; 5369 } 5370 5371 #ifdef CONFIG_MEMCG 5372 5373 static void shrink_many(struct pglist_data *pgdat, struct scan_control *sc) 5374 { 5375 int op; 5376 int gen; 5377 int bin; 5378 int first_bin; 5379 struct lruvec *lruvec; 5380 struct lru_gen_folio *lrugen; 5381 struct mem_cgroup *memcg; 5382 const struct hlist_nulls_node *pos; 5383 unsigned long nr_to_reclaim = get_nr_to_reclaim(sc); 5384 5385 bin = first_bin = get_random_u32_below(MEMCG_NR_BINS); 5386 restart: 5387 op = 0; 5388 memcg = NULL; 5389 gen = get_memcg_gen(READ_ONCE(pgdat->memcg_lru.seq)); 5390 5391 rcu_read_lock(); 5392 5393 hlist_nulls_for_each_entry_rcu(lrugen, pos, &pgdat->memcg_lru.fifo[gen][bin], list) { 5394 if (op) 5395 lru_gen_rotate_memcg(lruvec, op); 5396 5397 mem_cgroup_put(memcg); 5398 5399 lruvec = container_of(lrugen, struct lruvec, lrugen); 5400 memcg = lruvec_memcg(lruvec); 5401 5402 if (!mem_cgroup_tryget(memcg)) { 5403 op = 0; 5404 memcg = NULL; 5405 continue; 5406 } 5407 5408 rcu_read_unlock(); 5409 5410 op = shrink_one(lruvec, sc); 5411 5412 rcu_read_lock(); 5413 5414 if (sc->nr_reclaimed >= nr_to_reclaim) 5415 break; 5416 } 5417 5418 rcu_read_unlock(); 5419 5420 if (op) 5421 lru_gen_rotate_memcg(lruvec, op); 5422 5423 mem_cgroup_put(memcg); 5424 5425 if (sc->nr_reclaimed >= nr_to_reclaim) 5426 return; 5427 5428 /* restart if raced with lru_gen_rotate_memcg() */ 5429 if (gen != get_nulls_value(pos)) 5430 goto restart; 5431 5432 /* try the rest of the bins of the current generation */ 5433 bin = get_memcg_bin(bin + 1); 5434 if (bin != first_bin) 5435 goto restart; 5436 } 5437 5438 static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) 5439 { 5440 struct blk_plug plug; 5441 5442 VM_WARN_ON_ONCE(global_reclaim(sc)); 5443 VM_WARN_ON_ONCE(!sc->may_writepage || !sc->may_unmap); 5444 5445 lru_add_drain(); 5446 5447 blk_start_plug(&plug); 5448 5449 set_mm_walk(NULL, sc->proactive); 5450 5451 if (try_to_shrink_lruvec(lruvec, sc)) 5452 lru_gen_rotate_memcg(lruvec, MEMCG_LRU_YOUNG); 5453 5454 clear_mm_walk(); 5455 5456 blk_finish_plug(&plug); 5457 } 5458 5459 #else /* !CONFIG_MEMCG */ 5460 5461 static void shrink_many(struct pglist_data *pgdat, struct scan_control *sc) 5462 { 5463 BUILD_BUG(); 5464 } 5465 5466 static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) 5467 { 5468 BUILD_BUG(); 5469 } 5470 5471 #endif 5472 5473 static void set_initial_priority(struct pglist_data *pgdat, struct scan_control *sc) 5474 { 5475 int priority; 5476 unsigned long reclaimable; 5477 struct lruvec *lruvec = mem_cgroup_lruvec(NULL, pgdat); 5478 5479 if (sc->priority != DEF_PRIORITY || sc->nr_to_reclaim < MIN_LRU_BATCH) 5480 return; 5481 /* 5482 * Determine the initial priority based on ((total / MEMCG_NR_GENS) >> 5483 * priority) * reclaimed_to_scanned_ratio = nr_to_reclaim, where the 5484 * estimated reclaimed_to_scanned_ratio = inactive / total. 
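 *
 * A worked example with hypothetical numbers: if reclaimable ends up at
 * about 2^17 pages after the division by MEMCG_NR_GENS below and
 * sc->nr_to_reclaim is 32, then priority = (18 - 1) - 5 = 12, and indeed
 * (2^17 >> 12) == 32; the result is finally clamped to [0, DEF_PRIORITY].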
5485 */ 5486 reclaimable = node_page_state(pgdat, NR_INACTIVE_FILE); 5487 if (get_swappiness(lruvec, sc)) 5488 reclaimable += node_page_state(pgdat, NR_INACTIVE_ANON); 5489 5490 reclaimable /= MEMCG_NR_GENS; 5491 5492 /* round down reclaimable and round up sc->nr_to_reclaim */ 5493 priority = fls_long(reclaimable) - 1 - fls_long(sc->nr_to_reclaim - 1); 5494 5495 sc->priority = clamp(priority, 0, DEF_PRIORITY); 5496 } 5497 5498 static void lru_gen_shrink_node(struct pglist_data *pgdat, struct scan_control *sc) 5499 { 5500 struct blk_plug plug; 5501 unsigned long reclaimed = sc->nr_reclaimed; 5502 5503 VM_WARN_ON_ONCE(!global_reclaim(sc)); 5504 5505 /* 5506 * Unmapped clean folios are already prioritized. Scanning for more of 5507 * them is likely futile and can cause high reclaim latency when there 5508 * is a large number of memcgs. 5509 */ 5510 if (!sc->may_writepage || !sc->may_unmap) 5511 goto done; 5512 5513 lru_add_drain(); 5514 5515 blk_start_plug(&plug); 5516 5517 set_mm_walk(pgdat, sc->proactive); 5518 5519 set_initial_priority(pgdat, sc); 5520 5521 if (current_is_kswapd()) 5522 sc->nr_reclaimed = 0; 5523 5524 if (mem_cgroup_disabled()) 5525 shrink_one(&pgdat->__lruvec, sc); 5526 else 5527 shrink_many(pgdat, sc); 5528 5529 if (current_is_kswapd()) 5530 sc->nr_reclaimed += reclaimed; 5531 5532 clear_mm_walk(); 5533 5534 blk_finish_plug(&plug); 5535 done: 5536 /* kswapd should never fail */ 5537 pgdat->kswapd_failures = 0; 5538 } 5539 5540 /****************************************************************************** 5541 * state change 5542 ******************************************************************************/ 5543 5544 static bool __maybe_unused state_is_valid(struct lruvec *lruvec) 5545 { 5546 struct lru_gen_folio *lrugen = &lruvec->lrugen; 5547 5548 if (lrugen->enabled) { 5549 enum lru_list lru; 5550 5551 for_each_evictable_lru(lru) { 5552 if (!list_empty(&lruvec->lists[lru])) 5553 return false; 5554 } 5555 } else { 5556 int gen, type, zone; 5557 5558 for_each_gen_type_zone(gen, type, zone) { 5559 if (!list_empty(&lrugen->folios[gen][type][zone])) 5560 return false; 5561 } 5562 } 5563 5564 return true; 5565 } 5566 5567 static bool fill_evictable(struct lruvec *lruvec) 5568 { 5569 enum lru_list lru; 5570 int remaining = MAX_LRU_BATCH; 5571 5572 for_each_evictable_lru(lru) { 5573 int type = is_file_lru(lru); 5574 bool active = is_active_lru(lru); 5575 struct list_head *head = &lruvec->lists[lru]; 5576 5577 while (!list_empty(head)) { 5578 bool success; 5579 struct folio *folio = lru_to_folio(head); 5580 5581 VM_WARN_ON_ONCE_FOLIO(folio_test_unevictable(folio), folio); 5582 VM_WARN_ON_ONCE_FOLIO(folio_test_active(folio) != active, folio); 5583 VM_WARN_ON_ONCE_FOLIO(folio_is_file_lru(folio) != type, folio); 5584 VM_WARN_ON_ONCE_FOLIO(folio_lru_gen(folio) != -1, folio); 5585 5586 lruvec_del_folio(lruvec, folio); 5587 success = lru_gen_add_folio(lruvec, folio, false); 5588 VM_WARN_ON_ONCE(!success); 5589 5590 if (!--remaining) 5591 return false; 5592 } 5593 } 5594 5595 return true; 5596 } 5597 5598 static bool drain_evictable(struct lruvec *lruvec) 5599 { 5600 int gen, type, zone; 5601 int remaining = MAX_LRU_BATCH; 5602 5603 for_each_gen_type_zone(gen, type, zone) { 5604 struct list_head *head = &lruvec->lrugen.folios[gen][type][zone]; 5605 5606 while (!list_empty(head)) { 5607 bool success; 5608 struct folio *folio = lru_to_folio(head); 5609 5610 VM_WARN_ON_ONCE_FOLIO(folio_test_unevictable(folio), folio); 5611 VM_WARN_ON_ONCE_FOLIO(folio_test_active(folio), folio); 5612 
VM_WARN_ON_ONCE_FOLIO(folio_is_file_lru(folio) != type, folio); 5613 VM_WARN_ON_ONCE_FOLIO(folio_zonenum(folio) != zone, folio); 5614 5615 success = lru_gen_del_folio(lruvec, folio, false); 5616 VM_WARN_ON_ONCE(!success); 5617 lruvec_add_folio(lruvec, folio); 5618 5619 if (!--remaining) 5620 return false; 5621 } 5622 } 5623 5624 return true; 5625 } 5626 5627 static void lru_gen_change_state(bool enabled) 5628 { 5629 static DEFINE_MUTEX(state_mutex); 5630 5631 struct mem_cgroup *memcg; 5632 5633 cgroup_lock(); 5634 cpus_read_lock(); 5635 get_online_mems(); 5636 mutex_lock(&state_mutex); 5637 5638 if (enabled == lru_gen_enabled()) 5639 goto unlock; 5640 5641 if (enabled) 5642 static_branch_enable_cpuslocked(&lru_gen_caps[LRU_GEN_CORE]); 5643 else 5644 static_branch_disable_cpuslocked(&lru_gen_caps[LRU_GEN_CORE]); 5645 5646 memcg = mem_cgroup_iter(NULL, NULL, NULL); 5647 do { 5648 int nid; 5649 5650 for_each_node(nid) { 5651 struct lruvec *lruvec = get_lruvec(memcg, nid); 5652 5653 spin_lock_irq(&lruvec->lru_lock); 5654 5655 VM_WARN_ON_ONCE(!seq_is_valid(lruvec)); 5656 VM_WARN_ON_ONCE(!state_is_valid(lruvec)); 5657 5658 lruvec->lrugen.enabled = enabled; 5659 5660 while (!(enabled ? fill_evictable(lruvec) : drain_evictable(lruvec))) { 5661 spin_unlock_irq(&lruvec->lru_lock); 5662 cond_resched(); 5663 spin_lock_irq(&lruvec->lru_lock); 5664 } 5665 5666 spin_unlock_irq(&lruvec->lru_lock); 5667 } 5668 5669 cond_resched(); 5670 } while ((memcg = mem_cgroup_iter(NULL, memcg, NULL))); 5671 unlock: 5672 mutex_unlock(&state_mutex); 5673 put_online_mems(); 5674 cpus_read_unlock(); 5675 cgroup_unlock(); 5676 } 5677 5678 /****************************************************************************** 5679 * sysfs interface 5680 ******************************************************************************/ 5681 5682 static ssize_t min_ttl_ms_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) 5683 { 5684 return sysfs_emit(buf, "%u\n", jiffies_to_msecs(READ_ONCE(lru_gen_min_ttl))); 5685 } 5686 5687 /* see Documentation/admin-guide/mm/multigen_lru.rst for details */ 5688 static ssize_t min_ttl_ms_store(struct kobject *kobj, struct kobj_attribute *attr, 5689 const char *buf, size_t len) 5690 { 5691 unsigned int msecs; 5692 5693 if (kstrtouint(buf, 0, &msecs)) 5694 return -EINVAL; 5695 5696 WRITE_ONCE(lru_gen_min_ttl, msecs_to_jiffies(msecs)); 5697 5698 return len; 5699 } 5700 5701 static struct kobj_attribute lru_gen_min_ttl_attr = __ATTR_RW(min_ttl_ms); 5702 5703 static ssize_t enabled_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) 5704 { 5705 unsigned int caps = 0; 5706 5707 if (get_cap(LRU_GEN_CORE)) 5708 caps |= BIT(LRU_GEN_CORE); 5709 5710 if (arch_has_hw_pte_young() && get_cap(LRU_GEN_MM_WALK)) 5711 caps |= BIT(LRU_GEN_MM_WALK); 5712 5713 if (arch_has_hw_nonleaf_pmd_young() && get_cap(LRU_GEN_NONLEAF_YOUNG)) 5714 caps |= BIT(LRU_GEN_NONLEAF_YOUNG); 5715 5716 return sysfs_emit(buf, "0x%04x\n", caps); 5717 } 5718 5719 /* see Documentation/admin-guide/mm/multigen_lru.rst for details */ 5720 static ssize_t enabled_store(struct kobject *kobj, struct kobj_attribute *attr, 5721 const char *buf, size_t len) 5722 { 5723 int i; 5724 unsigned int caps; 5725 5726 if (tolower(*buf) == 'n') 5727 caps = 0; 5728 else if (tolower(*buf) == 'y') 5729 caps = -1; 5730 else if (kstrtouint(buf, 0, &caps)) 5731 return -EINVAL; 5732 5733 for (i = 0; i < NR_LRU_GEN_CAPS; i++) { 5734 bool enabled = caps & BIT(i); 5735 5736 if (i == LRU_GEN_CORE) 5737 lru_gen_change_state(enabled); 5738 else if 
(enabled) 5739 static_branch_enable(&lru_gen_caps[i]); 5740 else 5741 static_branch_disable(&lru_gen_caps[i]); 5742 } 5743 5744 return len; 5745 } 5746 5747 static struct kobj_attribute lru_gen_enabled_attr = __ATTR_RW(enabled); 5748 5749 static struct attribute *lru_gen_attrs[] = { 5750 &lru_gen_min_ttl_attr.attr, 5751 &lru_gen_enabled_attr.attr, 5752 NULL 5753 }; 5754 5755 static const struct attribute_group lru_gen_attr_group = { 5756 .name = "lru_gen", 5757 .attrs = lru_gen_attrs, 5758 }; 5759 5760 /****************************************************************************** 5761 * debugfs interface 5762 ******************************************************************************/ 5763 5764 static void *lru_gen_seq_start(struct seq_file *m, loff_t *pos) 5765 { 5766 struct mem_cgroup *memcg; 5767 loff_t nr_to_skip = *pos; 5768 5769 m->private = kvmalloc(PATH_MAX, GFP_KERNEL); 5770 if (!m->private) 5771 return ERR_PTR(-ENOMEM); 5772 5773 memcg = mem_cgroup_iter(NULL, NULL, NULL); 5774 do { 5775 int nid; 5776 5777 for_each_node_state(nid, N_MEMORY) { 5778 if (!nr_to_skip--) 5779 return get_lruvec(memcg, nid); 5780 } 5781 } while ((memcg = mem_cgroup_iter(NULL, memcg, NULL))); 5782 5783 return NULL; 5784 } 5785 5786 static void lru_gen_seq_stop(struct seq_file *m, void *v) 5787 { 5788 if (!IS_ERR_OR_NULL(v)) 5789 mem_cgroup_iter_break(NULL, lruvec_memcg(v)); 5790 5791 kvfree(m->private); 5792 m->private = NULL; 5793 } 5794 5795 static void *lru_gen_seq_next(struct seq_file *m, void *v, loff_t *pos) 5796 { 5797 int nid = lruvec_pgdat(v)->node_id; 5798 struct mem_cgroup *memcg = lruvec_memcg(v); 5799 5800 ++*pos; 5801 5802 nid = next_memory_node(nid); 5803 if (nid == MAX_NUMNODES) { 5804 memcg = mem_cgroup_iter(NULL, memcg, NULL); 5805 if (!memcg) 5806 return NULL; 5807 5808 nid = first_memory_node; 5809 } 5810 5811 return get_lruvec(memcg, nid); 5812 } 5813 5814 static void lru_gen_seq_show_full(struct seq_file *m, struct lruvec *lruvec, 5815 unsigned long max_seq, unsigned long *min_seq, 5816 unsigned long seq) 5817 { 5818 int i; 5819 int type, tier; 5820 int hist = lru_hist_from_seq(seq); 5821 struct lru_gen_folio *lrugen = &lruvec->lrugen; 5822 5823 for (tier = 0; tier < MAX_NR_TIERS; tier++) { 5824 seq_printf(m, " %10d", tier); 5825 for (type = 0; type < ANON_AND_FILE; type++) { 5826 const char *s = " "; 5827 unsigned long n[3] = {}; 5828 5829 if (seq == max_seq) { 5830 s = "RT "; 5831 n[0] = READ_ONCE(lrugen->avg_refaulted[type][tier]); 5832 n[1] = READ_ONCE(lrugen->avg_total[type][tier]); 5833 } else if (seq == min_seq[type] || NR_HIST_GENS > 1) { 5834 s = "rep"; 5835 n[0] = atomic_long_read(&lrugen->refaulted[hist][type][tier]); 5836 n[1] = atomic_long_read(&lrugen->evicted[hist][type][tier]); 5837 if (tier) 5838 n[2] = READ_ONCE(lrugen->protected[hist][type][tier - 1]); 5839 } 5840 5841 for (i = 0; i < 3; i++) 5842 seq_printf(m, " %10lu%c", n[i], s[i]); 5843 } 5844 seq_putc(m, '\n'); 5845 } 5846 5847 seq_puts(m, " "); 5848 for (i = 0; i < NR_MM_STATS; i++) { 5849 const char *s = " "; 5850 unsigned long n = 0; 5851 5852 if (seq == max_seq && NR_HIST_GENS == 1) { 5853 s = "LOYNFA"; 5854 n = READ_ONCE(lruvec->mm_state.stats[hist][i]); 5855 } else if (seq != max_seq && NR_HIST_GENS > 1) { 5856 s = "loynfa"; 5857 n = READ_ONCE(lruvec->mm_state.stats[hist][i]); 5858 } 5859 5860 seq_printf(m, " %10lu%c", n, s[i]); 5861 } 5862 seq_putc(m, '\n'); 5863 } 5864 5865 /* see Documentation/admin-guide/mm/multigen_lru.rst for details */ 5866 static int lru_gen_seq_show(struct seq_file *m, void 
*v) 5867 { 5868 unsigned long seq; 5869 bool full = !debugfs_real_fops(m->file)->write; 5870 struct lruvec *lruvec = v; 5871 struct lru_gen_folio *lrugen = &lruvec->lrugen; 5872 int nid = lruvec_pgdat(lruvec)->node_id; 5873 struct mem_cgroup *memcg = lruvec_memcg(lruvec); 5874 DEFINE_MAX_SEQ(lruvec); 5875 DEFINE_MIN_SEQ(lruvec); 5876 5877 if (nid == first_memory_node) { 5878 const char *path = memcg ? m->private : ""; 5879 5880 #ifdef CONFIG_MEMCG 5881 if (memcg) 5882 cgroup_path(memcg->css.cgroup, m->private, PATH_MAX); 5883 #endif 5884 seq_printf(m, "memcg %5hu %s\n", mem_cgroup_id(memcg), path); 5885 } 5886 5887 seq_printf(m, " node %5d\n", nid); 5888 5889 if (!full) 5890 seq = min_seq[LRU_GEN_ANON]; 5891 else if (max_seq >= MAX_NR_GENS) 5892 seq = max_seq - MAX_NR_GENS + 1; 5893 else 5894 seq = 0; 5895 5896 for (; seq <= max_seq; seq++) { 5897 int type, zone; 5898 int gen = lru_gen_from_seq(seq); 5899 unsigned long birth = READ_ONCE(lruvec->lrugen.timestamps[gen]); 5900 5901 seq_printf(m, " %10lu %10u", seq, jiffies_to_msecs(jiffies - birth)); 5902 5903 for (type = 0; type < ANON_AND_FILE; type++) { 5904 unsigned long size = 0; 5905 char mark = full && seq < min_seq[type] ? 'x' : ' '; 5906 5907 for (zone = 0; zone < MAX_NR_ZONES; zone++) 5908 size += max(READ_ONCE(lrugen->nr_pages[gen][type][zone]), 0L); 5909 5910 seq_printf(m, " %10lu%c", size, mark); 5911 } 5912 5913 seq_putc(m, '\n'); 5914 5915 if (full) 5916 lru_gen_seq_show_full(m, lruvec, max_seq, min_seq, seq); 5917 } 5918 5919 return 0; 5920 } 5921 5922 static const struct seq_operations lru_gen_seq_ops = { 5923 .start = lru_gen_seq_start, 5924 .stop = lru_gen_seq_stop, 5925 .next = lru_gen_seq_next, 5926 .show = lru_gen_seq_show, 5927 }; 5928 5929 static int run_aging(struct lruvec *lruvec, unsigned long seq, struct scan_control *sc, 5930 bool can_swap, bool force_scan) 5931 { 5932 DEFINE_MAX_SEQ(lruvec); 5933 DEFINE_MIN_SEQ(lruvec); 5934 5935 if (seq < max_seq) 5936 return 0; 5937 5938 if (seq > max_seq) 5939 return -EINVAL; 5940 5941 if (!force_scan && min_seq[!can_swap] + MAX_NR_GENS - 1 <= max_seq) 5942 return -ERANGE; 5943 5944 try_to_inc_max_seq(lruvec, max_seq, sc, can_swap, force_scan); 5945 5946 return 0; 5947 } 5948 5949 static int run_eviction(struct lruvec *lruvec, unsigned long seq, struct scan_control *sc, 5950 int swappiness, unsigned long nr_to_reclaim) 5951 { 5952 DEFINE_MAX_SEQ(lruvec); 5953 5954 if (seq + MIN_NR_GENS > max_seq) 5955 return -EINVAL; 5956 5957 sc->nr_reclaimed = 0; 5958 5959 while (!signal_pending(current)) { 5960 DEFINE_MIN_SEQ(lruvec); 5961 5962 if (seq < min_seq[!swappiness]) 5963 return 0; 5964 5965 if (sc->nr_reclaimed >= nr_to_reclaim) 5966 return 0; 5967 5968 if (!evict_folios(lruvec, sc, swappiness)) 5969 return 0; 5970 5971 cond_resched(); 5972 } 5973 5974 return -EINTR; 5975 } 5976 5977 static int run_cmd(char cmd, int memcg_id, int nid, unsigned long seq, 5978 struct scan_control *sc, int swappiness, unsigned long opt) 5979 { 5980 struct lruvec *lruvec; 5981 int err = -EINVAL; 5982 struct mem_cgroup *memcg = NULL; 5983 5984 if (nid < 0 || nid >= MAX_NUMNODES || !node_state(nid, N_MEMORY)) 5985 return -EINVAL; 5986 5987 if (!mem_cgroup_disabled()) { 5988 rcu_read_lock(); 5989 5990 memcg = mem_cgroup_from_id(memcg_id); 5991 if (!mem_cgroup_tryget(memcg)) 5992 memcg = NULL; 5993 5994 rcu_read_unlock(); 5995 5996 if (!memcg) 5997 return -EINVAL; 5998 } 5999 6000 if (memcg_id != mem_cgroup_id(memcg)) 6001 goto done; 6002 6003 lruvec = get_lruvec(memcg, nid); 6004 6005 if (swappiness < 0) 
6006 swappiness = get_swappiness(lruvec, sc); 6007 else if (swappiness > 200) 6008 goto done; 6009 6010 switch (cmd) { 6011 case '+': 6012 err = run_aging(lruvec, seq, sc, swappiness, opt); 6013 break; 6014 case '-': 6015 err = run_eviction(lruvec, seq, sc, swappiness, opt); 6016 break; 6017 } 6018 done: 6019 mem_cgroup_put(memcg); 6020 6021 return err; 6022 } 6023 6024 /* see Documentation/admin-guide/mm/multigen_lru.rst for details */ 6025 static ssize_t lru_gen_seq_write(struct file *file, const char __user *src, 6026 size_t len, loff_t *pos) 6027 { 6028 void *buf; 6029 char *cur, *next; 6030 unsigned int flags; 6031 struct blk_plug plug; 6032 int err = -EINVAL; 6033 struct scan_control sc = { 6034 .may_writepage = true, 6035 .may_unmap = true, 6036 .may_swap = true, 6037 .reclaim_idx = MAX_NR_ZONES - 1, 6038 .gfp_mask = GFP_KERNEL, 6039 }; 6040 6041 buf = kvmalloc(len + 1, GFP_KERNEL); 6042 if (!buf) 6043 return -ENOMEM; 6044 6045 if (copy_from_user(buf, src, len)) { 6046 kvfree(buf); 6047 return -EFAULT; 6048 } 6049 6050 set_task_reclaim_state(current, &sc.reclaim_state); 6051 flags = memalloc_noreclaim_save(); 6052 blk_start_plug(&plug); 6053 if (!set_mm_walk(NULL, true)) { 6054 err = -ENOMEM; 6055 goto done; 6056 } 6057 6058 next = buf; 6059 next[len] = '\0'; 6060 6061 while ((cur = strsep(&next, ",;\n"))) { 6062 int n; 6063 int end; 6064 char cmd; 6065 unsigned int memcg_id; 6066 unsigned int nid; 6067 unsigned long seq; 6068 unsigned int swappiness = -1; 6069 unsigned long opt = -1; 6070 6071 cur = skip_spaces(cur); 6072 if (!*cur) 6073 continue; 6074 6075 n = sscanf(cur, "%c %u %u %lu %n %u %n %lu %n", &cmd, &memcg_id, &nid, 6076 &seq, &end, &swappiness, &end, &opt, &end); 6077 if (n < 4 || cur[end]) { 6078 err = -EINVAL; 6079 break; 6080 } 6081 6082 err = run_cmd(cmd, memcg_id, nid, seq, &sc, swappiness, opt); 6083 if (err) 6084 break; 6085 } 6086 done: 6087 clear_mm_walk(); 6088 blk_finish_plug(&plug); 6089 memalloc_noreclaim_restore(flags); 6090 set_task_reclaim_state(current, NULL); 6091 6092 kvfree(buf); 6093 6094 return err ? 
: len; 6095 } 6096 6097 static int lru_gen_seq_open(struct inode *inode, struct file *file) 6098 { 6099 return seq_open(file, &lru_gen_seq_ops); 6100 } 6101 6102 static const struct file_operations lru_gen_rw_fops = { 6103 .open = lru_gen_seq_open, 6104 .read = seq_read, 6105 .write = lru_gen_seq_write, 6106 .llseek = seq_lseek, 6107 .release = seq_release, 6108 }; 6109 6110 static const struct file_operations lru_gen_ro_fops = { 6111 .open = lru_gen_seq_open, 6112 .read = seq_read, 6113 .llseek = seq_lseek, 6114 .release = seq_release, 6115 }; 6116 6117 /****************************************************************************** 6118 * initialization 6119 ******************************************************************************/ 6120 6121 void lru_gen_init_lruvec(struct lruvec *lruvec) 6122 { 6123 int i; 6124 int gen, type, zone; 6125 struct lru_gen_folio *lrugen = &lruvec->lrugen; 6126 6127 lrugen->max_seq = MIN_NR_GENS + 1; 6128 lrugen->enabled = lru_gen_enabled(); 6129 6130 for (i = 0; i <= MIN_NR_GENS + 1; i++) 6131 lrugen->timestamps[i] = jiffies; 6132 6133 for_each_gen_type_zone(gen, type, zone) 6134 INIT_LIST_HEAD(&lrugen->folios[gen][type][zone]); 6135 6136 lruvec->mm_state.seq = MIN_NR_GENS; 6137 init_waitqueue_head(&lruvec->mm_state.wait); 6138 } 6139 6140 #ifdef CONFIG_MEMCG 6141 6142 void lru_gen_init_pgdat(struct pglist_data *pgdat) 6143 { 6144 int i, j; 6145 6146 spin_lock_init(&pgdat->memcg_lru.lock); 6147 6148 for (i = 0; i < MEMCG_NR_GENS; i++) { 6149 for (j = 0; j < MEMCG_NR_BINS; j++) 6150 INIT_HLIST_NULLS_HEAD(&pgdat->memcg_lru.fifo[i][j], i); 6151 } 6152 } 6153 6154 void lru_gen_init_memcg(struct mem_cgroup *memcg) 6155 { 6156 INIT_LIST_HEAD(&memcg->mm_list.fifo); 6157 spin_lock_init(&memcg->mm_list.lock); 6158 } 6159 6160 void lru_gen_exit_memcg(struct mem_cgroup *memcg) 6161 { 6162 int i; 6163 int nid; 6164 6165 VM_WARN_ON_ONCE(!list_empty(&memcg->mm_list.fifo)); 6166 6167 for_each_node(nid) { 6168 struct lruvec *lruvec = get_lruvec(memcg, nid); 6169 6170 VM_WARN_ON_ONCE(lruvec->mm_state.nr_walkers); 6171 VM_WARN_ON_ONCE(memchr_inv(lruvec->lrugen.nr_pages, 0, 6172 sizeof(lruvec->lrugen.nr_pages))); 6173 6174 lruvec->lrugen.list.next = LIST_POISON1; 6175 6176 for (i = 0; i < NR_BLOOM_FILTERS; i++) { 6177 bitmap_free(lruvec->mm_state.filters[i]); 6178 lruvec->mm_state.filters[i] = NULL; 6179 } 6180 } 6181 } 6182 6183 #endif /* CONFIG_MEMCG */ 6184 6185 static int __init init_lru_gen(void) 6186 { 6187 BUILD_BUG_ON(MIN_NR_GENS + 1 >= MAX_NR_GENS); 6188 BUILD_BUG_ON(BIT(LRU_GEN_WIDTH) <= MAX_NR_GENS); 6189 6190 if (sysfs_create_group(mm_kobj, &lru_gen_attr_group)) 6191 pr_err("lru_gen: failed to create sysfs group\n"); 6192 6193 debugfs_create_file("lru_gen", 0644, NULL, NULL, &lru_gen_rw_fops); 6194 debugfs_create_file("lru_gen_full", 0444, NULL, NULL, &lru_gen_ro_fops); 6195 6196 return 0; 6197 }; 6198 late_initcall(init_lru_gen); 6199 6200 #else /* !CONFIG_LRU_GEN */ 6201 6202 static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc) 6203 { 6204 } 6205 6206 static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) 6207 { 6208 } 6209 6210 static void lru_gen_shrink_node(struct pglist_data *pgdat, struct scan_control *sc) 6211 { 6212 } 6213 6214 #endif /* CONFIG_LRU_GEN */ 6215 6216 static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) 6217 { 6218 unsigned long nr[NR_LRU_LISTS]; 6219 unsigned long targets[NR_LRU_LISTS]; 6220 unsigned long nr_to_scan; 6221 enum lru_list lru; 6222 unsigned long 
nr_reclaimed = 0; 6223 unsigned long nr_to_reclaim = sc->nr_to_reclaim; 6224 bool proportional_reclaim; 6225 struct blk_plug plug; 6226 6227 if (lru_gen_enabled() && !global_reclaim(sc)) { 6228 lru_gen_shrink_lruvec(lruvec, sc); 6229 return; 6230 } 6231 6232 get_scan_count(lruvec, sc, nr); 6233 6234 /* Record the original scan target for proportional adjustments later */ 6235 memcpy(targets, nr, sizeof(nr)); 6236 6237 /* 6238 * Global reclaiming within direct reclaim at DEF_PRIORITY is a normal 6239 * event that can occur when there is little memory pressure e.g. 6240 * multiple streaming readers/writers. Hence, we do not abort scanning 6241 * when the requested number of pages are reclaimed when scanning at 6242 * DEF_PRIORITY on the assumption that the fact we are direct 6243 * reclaiming implies that kswapd is not keeping up and it is best to 6244 * do a batch of work at once. For memcg reclaim one check is made to 6245 * abort proportional reclaim if either the file or anon lru has already 6246 * dropped to zero at the first pass. 6247 */ 6248 proportional_reclaim = (!cgroup_reclaim(sc) && !current_is_kswapd() && 6249 sc->priority == DEF_PRIORITY); 6250 6251 blk_start_plug(&plug); 6252 while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] || 6253 nr[LRU_INACTIVE_FILE]) { 6254 unsigned long nr_anon, nr_file, percentage; 6255 unsigned long nr_scanned; 6256 6257 for_each_evictable_lru(lru) { 6258 if (nr[lru]) { 6259 nr_to_scan = min(nr[lru], SWAP_CLUSTER_MAX); 6260 nr[lru] -= nr_to_scan; 6261 6262 nr_reclaimed += shrink_list(lru, nr_to_scan, 6263 lruvec, sc); 6264 } 6265 } 6266 6267 cond_resched(); 6268 6269 if (nr_reclaimed < nr_to_reclaim || proportional_reclaim) 6270 continue; 6271 6272 /* 6273 * For kswapd and memcg, reclaim at least the number of pages 6274 * requested. Ensure that the anon and file LRUs are scanned 6275 * proportionally what was requested by get_scan_count(). We 6276 * stop reclaiming one LRU and reduce the amount scanning 6277 * proportional to the original scan target. 6278 */ 6279 nr_file = nr[LRU_INACTIVE_FILE] + nr[LRU_ACTIVE_FILE]; 6280 nr_anon = nr[LRU_INACTIVE_ANON] + nr[LRU_ACTIVE_ANON]; 6281 6282 /* 6283 * It's just vindictive to attack the larger once the smaller 6284 * has gone to zero. And given the way we stop scanning the 6285 * smaller below, this makes sure that we only make one nudge 6286 * towards proportionality once we've got nr_to_reclaim. 6287 */ 6288 if (!nr_file || !nr_anon) 6289 break; 6290 6291 if (nr_file > nr_anon) { 6292 unsigned long scan_target = targets[LRU_INACTIVE_ANON] + 6293 targets[LRU_ACTIVE_ANON] + 1; 6294 lru = LRU_BASE; 6295 percentage = nr_anon * 100 / scan_target; 6296 } else { 6297 unsigned long scan_target = targets[LRU_INACTIVE_FILE] + 6298 targets[LRU_ACTIVE_FILE] + 1; 6299 lru = LRU_FILE; 6300 percentage = nr_file * 100 / scan_target; 6301 } 6302 6303 /* Stop scanning the smaller of the LRU */ 6304 nr[lru] = 0; 6305 nr[lru + LRU_ACTIVE] = 0; 6306 6307 /* 6308 * Recalculate the other LRU scan count based on its original 6309 * scan target and the percentage scanning already complete 6310 */ 6311 lru = (lru == LRU_FILE) ? 
LRU_BASE : LRU_FILE; 6312 nr_scanned = targets[lru] - nr[lru]; 6313 nr[lru] = targets[lru] * (100 - percentage) / 100; 6314 nr[lru] -= min(nr[lru], nr_scanned); 6315 6316 lru += LRU_ACTIVE; 6317 nr_scanned = targets[lru] - nr[lru]; 6318 nr[lru] = targets[lru] * (100 - percentage) / 100; 6319 nr[lru] -= min(nr[lru], nr_scanned); 6320 } 6321 blk_finish_plug(&plug); 6322 sc->nr_reclaimed += nr_reclaimed; 6323 6324 /* 6325 * Even if we did not try to evict anon pages at all, we want to 6326 * rebalance the anon lru active/inactive ratio. 6327 */ 6328 if (can_age_anon_pages(lruvec_pgdat(lruvec), sc) && 6329 inactive_is_low(lruvec, LRU_INACTIVE_ANON)) 6330 shrink_active_list(SWAP_CLUSTER_MAX, lruvec, 6331 sc, LRU_ACTIVE_ANON); 6332 } 6333 6334 /* Use reclaim/compaction for costly allocs or under memory pressure */ 6335 static bool in_reclaim_compaction(struct scan_control *sc) 6336 { 6337 if (IS_ENABLED(CONFIG_COMPACTION) && sc->order && 6338 (sc->order > PAGE_ALLOC_COSTLY_ORDER || 6339 sc->priority < DEF_PRIORITY - 2)) 6340 return true; 6341 6342 return false; 6343 } 6344 6345 /* 6346 * Reclaim/compaction is used for high-order allocation requests. It reclaims 6347 * order-0 pages before compacting the zone. should_continue_reclaim() returns 6348 * true if more pages should be reclaimed such that when the page allocator 6349 * calls try_to_compact_pages() that it will have enough free pages to succeed. 6350 * It will give up earlier than that if there is difficulty reclaiming pages. 6351 */ 6352 static inline bool should_continue_reclaim(struct pglist_data *pgdat, 6353 unsigned long nr_reclaimed, 6354 struct scan_control *sc) 6355 { 6356 unsigned long pages_for_compaction; 6357 unsigned long inactive_lru_pages; 6358 int z; 6359 6360 /* If not in reclaim/compaction mode, stop */ 6361 if (!in_reclaim_compaction(sc)) 6362 return false; 6363 6364 /* 6365 * Stop if we failed to reclaim any pages from the last SWAP_CLUSTER_MAX 6366 * number of pages that were scanned. This will return to the caller 6367 * with the risk reclaim/compaction and the resulting allocation attempt 6368 * fails. In the past we have tried harder for __GFP_RETRY_MAYFAIL 6369 * allocations through requiring that the full LRU list has been scanned 6370 * first, by assuming that zero delta of sc->nr_scanned means full LRU 6371 * scan, but that approximation was wrong, and there were corner cases 6372 * where always a non-zero amount of pages were scanned. 
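 *
 * In short, reclaim continues only while pages are still being reclaimed,
 * no eligible zone is already suitable for compaction, and the inactive
 * lists still hold more than compact_gap(sc->order) pages; assuming
 * compact_gap() is still 2UL << order, that is about 1024 pages (4MB with
 * 4K page size) of slack for an order-9 THP request.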
6373 */ 6374 if (!nr_reclaimed) 6375 return false; 6376 6377 /* If compaction would go ahead or the allocation would succeed, stop */ 6378 for (z = 0; z <= sc->reclaim_idx; z++) { 6379 struct zone *zone = &pgdat->node_zones[z]; 6380 if (!managed_zone(zone)) 6381 continue; 6382 6383 switch (compaction_suitable(zone, sc->order, 0, sc->reclaim_idx)) { 6384 case COMPACT_SUCCESS: 6385 case COMPACT_CONTINUE: 6386 return false; 6387 default: 6388 /* check next zone */ 6389 ; 6390 } 6391 } 6392 6393 /* 6394 * If we have not reclaimed enough pages for compaction and the 6395 * inactive lists are large enough, continue reclaiming 6396 */ 6397 pages_for_compaction = compact_gap(sc->order); 6398 inactive_lru_pages = node_page_state(pgdat, NR_INACTIVE_FILE); 6399 if (can_reclaim_anon_pages(NULL, pgdat->node_id, sc)) 6400 inactive_lru_pages += node_page_state(pgdat, NR_INACTIVE_ANON); 6401 6402 return inactive_lru_pages > pages_for_compaction; 6403 } 6404 6405 static void shrink_node_memcgs(pg_data_t *pgdat, struct scan_control *sc) 6406 { 6407 struct mem_cgroup *target_memcg = sc->target_mem_cgroup; 6408 struct mem_cgroup *memcg; 6409 6410 memcg = mem_cgroup_iter(target_memcg, NULL, NULL); 6411 do { 6412 struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat); 6413 unsigned long reclaimed; 6414 unsigned long scanned; 6415 6416 /* 6417 * This loop can become CPU-bound when target memcgs 6418 * aren't eligible for reclaim - either because they 6419 * don't have any reclaimable pages, or because their 6420 * memory is explicitly protected. Avoid soft lockups. 6421 */ 6422 cond_resched(); 6423 6424 mem_cgroup_calculate_protection(target_memcg, memcg); 6425 6426 if (mem_cgroup_below_min(target_memcg, memcg)) { 6427 /* 6428 * Hard protection. 6429 * If there is no reclaimable memory, OOM. 6430 */ 6431 continue; 6432 } else if (mem_cgroup_below_low(target_memcg, memcg)) { 6433 /* 6434 * Soft protection. 6435 * Respect the protection only as long as 6436 * there is an unprotected supply 6437 * of reclaimable memory from other cgroups. 
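 *
 * For example (illustrative numbers): a cgroup with memory.low = 1G and
 * current usage of 800M is skipped here on the first pass and
 * sc->memcg_low_skipped is recorded; only if the whole run then reclaims
 * nothing does do_try_to_free_pages() retry with sc->memcg_low_reclaim
 * set, at which point this cgroup is reclaimed and a MEMCG_LOW event is
 * counted.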
6438 */ 6439 if (!sc->memcg_low_reclaim) { 6440 sc->memcg_low_skipped = 1; 6441 continue; 6442 } 6443 memcg_memory_event(memcg, MEMCG_LOW); 6444 } 6445 6446 reclaimed = sc->nr_reclaimed; 6447 scanned = sc->nr_scanned; 6448 6449 shrink_lruvec(lruvec, sc); 6450 6451 shrink_slab(sc->gfp_mask, pgdat->node_id, memcg, 6452 sc->priority); 6453 6454 /* Record the group's reclaim efficiency */ 6455 if (!sc->proactive) 6456 vmpressure(sc->gfp_mask, memcg, false, 6457 sc->nr_scanned - scanned, 6458 sc->nr_reclaimed - reclaimed); 6459 6460 } while ((memcg = mem_cgroup_iter(target_memcg, memcg, NULL))); 6461 } 6462 6463 static void shrink_node(pg_data_t *pgdat, struct scan_control *sc) 6464 { 6465 struct reclaim_state *reclaim_state = current->reclaim_state; 6466 unsigned long nr_reclaimed, nr_scanned; 6467 struct lruvec *target_lruvec; 6468 bool reclaimable = false; 6469 6470 if (lru_gen_enabled() && global_reclaim(sc)) { 6471 lru_gen_shrink_node(pgdat, sc); 6472 return; 6473 } 6474 6475 target_lruvec = mem_cgroup_lruvec(sc->target_mem_cgroup, pgdat); 6476 6477 again: 6478 memset(&sc->nr, 0, sizeof(sc->nr)); 6479 6480 nr_reclaimed = sc->nr_reclaimed; 6481 nr_scanned = sc->nr_scanned; 6482 6483 prepare_scan_count(pgdat, sc); 6484 6485 shrink_node_memcgs(pgdat, sc); 6486 6487 if (reclaim_state) { 6488 sc->nr_reclaimed += reclaim_state->reclaimed_slab; 6489 reclaim_state->reclaimed_slab = 0; 6490 } 6491 6492 /* Record the subtree's reclaim efficiency */ 6493 if (!sc->proactive) 6494 vmpressure(sc->gfp_mask, sc->target_mem_cgroup, true, 6495 sc->nr_scanned - nr_scanned, 6496 sc->nr_reclaimed - nr_reclaimed); 6497 6498 if (sc->nr_reclaimed - nr_reclaimed) 6499 reclaimable = true; 6500 6501 if (current_is_kswapd()) { 6502 /* 6503 * If reclaim is isolating dirty pages under writeback, 6504 * it implies that the long-lived page allocation rate 6505 * is exceeding the page laundering rate. Either the 6506 * global limits are not being effective at throttling 6507 * processes due to the page distribution throughout 6508 * zones or there is heavy usage of a slow backing 6509 * device. The only option is to throttle from reclaim 6510 * context which is not ideal as there is no guarantee 6511 * the dirtying process is throttled in the same way 6512 * balance_dirty_pages() manages. 6513 * 6514 * Once a node is flagged PGDAT_WRITEBACK, kswapd will 6515 * count the number of pages under pages flagged for 6516 * immediate reclaim and stall if any are encountered 6517 * in the nr_immediate check below. 6518 */ 6519 if (sc->nr.writeback && sc->nr.writeback == sc->nr.taken) 6520 set_bit(PGDAT_WRITEBACK, &pgdat->flags); 6521 6522 /* Allow kswapd to start writing pages during reclaim.*/ 6523 if (sc->nr.unqueued_dirty == sc->nr.file_taken) 6524 set_bit(PGDAT_DIRTY, &pgdat->flags); 6525 6526 /* 6527 * If kswapd scans pages marked for immediate 6528 * reclaim and under writeback (nr_immediate), it 6529 * implies that pages are cycling through the LRU 6530 * faster than they are written so forcibly stall 6531 * until some pages complete writeback. 6532 */ 6533 if (sc->nr.immediate) 6534 reclaim_throttle(pgdat, VMSCAN_THROTTLE_WRITEBACK); 6535 } 6536 6537 /* 6538 * Tag a node/memcg as congested if all the dirty pages were marked 6539 * for writeback and immediate reclaim (counted in nr.congested). 6540 * 6541 * Legacy memcg will stall in page writeback so avoid forcibly 6542 * stalling in reclaim_throttle(). 
	 */
	if ((current_is_kswapd() ||
	     (cgroup_reclaim(sc) && writeback_throttling_sane(sc))) &&
	    sc->nr.dirty && sc->nr.dirty == sc->nr.congested)
		set_bit(LRUVEC_CONGESTED, &target_lruvec->flags);

	/*
	 * Stall direct reclaim for IO completions if the lruvec node is
	 * congested. Allow kswapd to continue until it starts encountering
	 * unqueued dirty pages or cycling through the LRU too quickly.
	 */
	if (!current_is_kswapd() && current_may_throttle() &&
	    !sc->hibernation_mode &&
	    test_bit(LRUVEC_CONGESTED, &target_lruvec->flags))
		reclaim_throttle(pgdat, VMSCAN_THROTTLE_CONGESTED);

	if (should_continue_reclaim(pgdat, sc->nr_reclaimed - nr_reclaimed,
				    sc))
		goto again;

	/*
	 * Kswapd gives up on balancing particular nodes after too
	 * many failures to reclaim anything from them and goes to
	 * sleep. On reclaim progress, reset the failure counter. A
	 * successful direct reclaim run will revive a dormant kswapd.
	 */
	if (reclaimable)
		pgdat->kswapd_failures = 0;
}

/*
 * Returns true if compaction should go ahead for a costly-order request, or
 * the allocation would already succeed without compaction. Return false if we
 * should reclaim first.
 */
static inline bool compaction_ready(struct zone *zone, struct scan_control *sc)
{
	unsigned long watermark;
	enum compact_result suitable;

	suitable = compaction_suitable(zone, sc->order, 0, sc->reclaim_idx);
	if (suitable == COMPACT_SUCCESS)
		/* Allocation should succeed already. Don't reclaim. */
		return true;
	if (suitable == COMPACT_SKIPPED)
		/* Compaction cannot yet proceed. Do reclaim. */
		return false;

	/*
	 * Compaction is already possible, but it takes time to run and there
	 * are potentially other callers using the pages just freed. So proceed
	 * with reclaim to make a buffer of free pages available to give
	 * compaction a reasonable chance of completing and allocating the page.
	 * Note that we won't actually reclaim the whole buffer in one attempt
	 * as the target watermark in should_continue_reclaim() is lower. But if
	 * we are already above the high+gap watermark, don't reclaim at all.
	 */
	watermark = high_wmark_pages(zone) + compact_gap(sc->order);

	return zone_watermark_ok_safe(zone, 0, watermark, sc->reclaim_idx);
}

static void consider_reclaim_throttle(pg_data_t *pgdat, struct scan_control *sc)
{
	/*
	 * If reclaim is making progress greater than 12% efficiency then
	 * wake all the NOPROGRESS throttled tasks.
	 */
	if (sc->nr_reclaimed > (sc->nr_scanned >> 3)) {
		wait_queue_head_t *wqh;

		wqh = &pgdat->reclaim_wait[VMSCAN_THROTTLE_NOPROGRESS];
		if (waitqueue_active(wqh))
			wake_up(wqh);

		return;
	}

	/*
	 * Do not throttle kswapd or cgroup reclaim on NOPROGRESS as it will
	 * throttle on VMSCAN_THROTTLE_WRITEBACK if there are too many pages
	 * under writeback and marked for immediate reclaim at the tail of the
	 * LRU.
	 */
	if (current_is_kswapd() || cgroup_reclaim(sc))
		return;

	/* Throttle if making no progress at high priorities. */
	if (sc->priority == 1 && !sc->nr_reclaimed)
		reclaim_throttle(pgdat, VMSCAN_THROTTLE_NOPROGRESS);
}

/*
 * This is the direct reclaim path, for page-allocating processes.
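 *
 * Roughly how this path is reached (assuming the usual page allocator slow
 * path; the callers outside this file are named only for orientation):
 *
 *	__alloc_pages() slow path
 *	  -> __alloc_pages_direct_reclaim()
 *	    -> try_to_free_pages()
 *	      -> do_try_to_free_pages()
 *	        -> shrink_zones()
 *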
We only 6638 * try to reclaim pages from zones which will satisfy the caller's allocation 6639 * request. 6640 * 6641 * If a zone is deemed to be full of pinned pages then just give it a light 6642 * scan then give up on it. 6643 */ 6644 static void shrink_zones(struct zonelist *zonelist, struct scan_control *sc) 6645 { 6646 struct zoneref *z; 6647 struct zone *zone; 6648 unsigned long nr_soft_reclaimed; 6649 unsigned long nr_soft_scanned; 6650 gfp_t orig_mask; 6651 pg_data_t *last_pgdat = NULL; 6652 pg_data_t *first_pgdat = NULL; 6653 6654 /* 6655 * If the number of buffer_heads in the machine exceeds the maximum 6656 * allowed level, force direct reclaim to scan the highmem zone as 6657 * highmem pages could be pinning lowmem pages storing buffer_heads 6658 */ 6659 orig_mask = sc->gfp_mask; 6660 if (buffer_heads_over_limit) { 6661 sc->gfp_mask |= __GFP_HIGHMEM; 6662 sc->reclaim_idx = gfp_zone(sc->gfp_mask); 6663 } 6664 6665 for_each_zone_zonelist_nodemask(zone, z, zonelist, 6666 sc->reclaim_idx, sc->nodemask) { 6667 /* 6668 * Take care memory controller reclaiming has small influence 6669 * to global LRU. 6670 */ 6671 if (!cgroup_reclaim(sc)) { 6672 if (!cpuset_zone_allowed(zone, 6673 GFP_KERNEL | __GFP_HARDWALL)) 6674 continue; 6675 6676 /* 6677 * If we already have plenty of memory free for 6678 * compaction in this zone, don't free any more. 6679 * Even though compaction is invoked for any 6680 * non-zero order, only frequent costly order 6681 * reclamation is disruptive enough to become a 6682 * noticeable problem, like transparent huge 6683 * page allocations. 6684 */ 6685 if (IS_ENABLED(CONFIG_COMPACTION) && 6686 sc->order > PAGE_ALLOC_COSTLY_ORDER && 6687 compaction_ready(zone, sc)) { 6688 sc->compaction_ready = true; 6689 continue; 6690 } 6691 6692 /* 6693 * Shrink each node in the zonelist once. If the 6694 * zonelist is ordered by zone (not the default) then a 6695 * node may be shrunk multiple times but in that case 6696 * the user prefers lower zones being preserved. 6697 */ 6698 if (zone->zone_pgdat == last_pgdat) 6699 continue; 6700 6701 /* 6702 * This steals pages from memory cgroups over softlimit 6703 * and returns the number of reclaimed pages and 6704 * scanned pages. This works for global memory pressure 6705 * and balancing, not for a memcg's limit. 6706 */ 6707 nr_soft_scanned = 0; 6708 nr_soft_reclaimed = mem_cgroup_soft_limit_reclaim(zone->zone_pgdat, 6709 sc->order, sc->gfp_mask, 6710 &nr_soft_scanned); 6711 sc->nr_reclaimed += nr_soft_reclaimed; 6712 sc->nr_scanned += nr_soft_scanned; 6713 /* need some check for avoid more shrink_zone() */ 6714 } 6715 6716 if (!first_pgdat) 6717 first_pgdat = zone->zone_pgdat; 6718 6719 /* See comment about same check for global reclaim above */ 6720 if (zone->zone_pgdat == last_pgdat) 6721 continue; 6722 last_pgdat = zone->zone_pgdat; 6723 shrink_node(zone->zone_pgdat, sc); 6724 } 6725 6726 if (first_pgdat) 6727 consider_reclaim_throttle(first_pgdat, sc); 6728 6729 /* 6730 * Restore to original mask to avoid the impact on the caller if we 6731 * promoted it to __GFP_HIGHMEM. 
6732 */ 6733 sc->gfp_mask = orig_mask; 6734 } 6735 6736 static void snapshot_refaults(struct mem_cgroup *target_memcg, pg_data_t *pgdat) 6737 { 6738 struct lruvec *target_lruvec; 6739 unsigned long refaults; 6740 6741 if (lru_gen_enabled()) 6742 return; 6743 6744 target_lruvec = mem_cgroup_lruvec(target_memcg, pgdat); 6745 refaults = lruvec_page_state(target_lruvec, WORKINGSET_ACTIVATE_ANON); 6746 target_lruvec->refaults[WORKINGSET_ANON] = refaults; 6747 refaults = lruvec_page_state(target_lruvec, WORKINGSET_ACTIVATE_FILE); 6748 target_lruvec->refaults[WORKINGSET_FILE] = refaults; 6749 } 6750 6751 /* 6752 * This is the main entry point to direct page reclaim. 6753 * 6754 * If a full scan of the inactive list fails to free enough memory then we 6755 * are "out of memory" and something needs to be killed. 6756 * 6757 * If the caller is !__GFP_FS then the probability of a failure is reasonably 6758 * high - the zone may be full of dirty or under-writeback pages, which this 6759 * caller can't do much about. We kick the writeback threads and take explicit 6760 * naps in the hope that some of these pages can be written. But if the 6761 * allocating task holds filesystem locks which prevent writeout this might not 6762 * work, and the allocation attempt will fail. 6763 * 6764 * returns: 0, if no pages reclaimed 6765 * else, the number of pages reclaimed 6766 */ 6767 static unsigned long do_try_to_free_pages(struct zonelist *zonelist, 6768 struct scan_control *sc) 6769 { 6770 int initial_priority = sc->priority; 6771 pg_data_t *last_pgdat; 6772 struct zoneref *z; 6773 struct zone *zone; 6774 retry: 6775 delayacct_freepages_start(); 6776 6777 if (!cgroup_reclaim(sc)) 6778 __count_zid_vm_events(ALLOCSTALL, sc->reclaim_idx, 1); 6779 6780 do { 6781 if (!sc->proactive) 6782 vmpressure_prio(sc->gfp_mask, sc->target_mem_cgroup, 6783 sc->priority); 6784 sc->nr_scanned = 0; 6785 shrink_zones(zonelist, sc); 6786 6787 if (sc->nr_reclaimed >= sc->nr_to_reclaim) 6788 break; 6789 6790 if (sc->compaction_ready) 6791 break; 6792 6793 /* 6794 * If we're getting trouble reclaiming, start doing 6795 * writepage even in laptop mode. 6796 */ 6797 if (sc->priority < DEF_PRIORITY - 2) 6798 sc->may_writepage = 1; 6799 } while (--sc->priority >= 0); 6800 6801 last_pgdat = NULL; 6802 for_each_zone_zonelist_nodemask(zone, z, zonelist, sc->reclaim_idx, 6803 sc->nodemask) { 6804 if (zone->zone_pgdat == last_pgdat) 6805 continue; 6806 last_pgdat = zone->zone_pgdat; 6807 6808 snapshot_refaults(sc->target_mem_cgroup, zone->zone_pgdat); 6809 6810 if (cgroup_reclaim(sc)) { 6811 struct lruvec *lruvec; 6812 6813 lruvec = mem_cgroup_lruvec(sc->target_mem_cgroup, 6814 zone->zone_pgdat); 6815 clear_bit(LRUVEC_CONGESTED, &lruvec->flags); 6816 } 6817 } 6818 6819 delayacct_freepages_end(); 6820 6821 if (sc->nr_reclaimed) 6822 return sc->nr_reclaimed; 6823 6824 /* Aborted reclaim to try compaction? don't OOM, then */ 6825 if (sc->compaction_ready) 6826 return 1; 6827 6828 /* 6829 * We make inactive:active ratio decisions based on the node's 6830 * composition of memory, but a restrictive reclaim_idx or a 6831 * memory.low cgroup setting can exempt large amounts of 6832 * memory from reclaim. Neither of which are very common, so 6833 * instead of doing costly eligibility calculations of the 6834 * entire cgroup subtree up front, we assume the estimates are 6835 * good, and retry with forcible deactivation if that fails. 
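 *
 * In short, the retry ladder below is:
 *
 *	1. a normal pass, stepping sc->priority from DEF_PRIORITY down to 0;
 *	2. if nothing was reclaimed but deactivation had been skipped,
 *	   retry from the initial priority with sc->force_deactivate set;
 *	3. if nothing was reclaimed but memory.low protected cgroups had
 *	   been skipped, retry once more with sc->memcg_low_reclaim set.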
	 */
	if (sc->skipped_deactivate) {
		sc->priority = initial_priority;
		sc->force_deactivate = 1;
		sc->skipped_deactivate = 0;
		goto retry;
	}

	/* Untapped cgroup reserves? Don't OOM, retry. */
	if (sc->memcg_low_skipped) {
		sc->priority = initial_priority;
		sc->force_deactivate = 0;
		sc->memcg_low_reclaim = 1;
		sc->memcg_low_skipped = 0;
		goto retry;
	}

	return 0;
}

static bool allow_direct_reclaim(pg_data_t *pgdat)
{
	struct zone *zone;
	unsigned long pfmemalloc_reserve = 0;
	unsigned long free_pages = 0;
	int i;
	bool wmark_ok;

	if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES)
		return true;

	for (i = 0; i <= ZONE_NORMAL; i++) {
		zone = &pgdat->node_zones[i];
		if (!managed_zone(zone))
			continue;

		if (!zone_reclaimable_pages(zone))
			continue;

		pfmemalloc_reserve += min_wmark_pages(zone);
		free_pages += zone_page_state(zone, NR_FREE_PAGES);
	}

	/* If there are no reserves (unexpected config) then do not throttle */
	if (!pfmemalloc_reserve)
		return true;

	wmark_ok = free_pages > pfmemalloc_reserve / 2;

	/* kswapd must be awake if processes are being throttled */
	if (!wmark_ok && waitqueue_active(&pgdat->kswapd_wait)) {
		if (READ_ONCE(pgdat->kswapd_highest_zoneidx) > ZONE_NORMAL)
			WRITE_ONCE(pgdat->kswapd_highest_zoneidx, ZONE_NORMAL);

		wake_up_interruptible(&pgdat->kswapd_wait);
	}

	return wmark_ok;
}

/*
 * Throttle direct reclaimers if backing storage is backed by the network
 * and the PFMEMALLOC reserve for the preferred node is getting dangerously
 * depleted. kswapd will continue to make progress and wake the processes
 * when the low watermark is reached.
 *
 * Returns true if a fatal signal was delivered during throttling. If this
 * happens, the page allocator should not consider triggering the OOM killer.
 */
static bool throttle_direct_reclaim(gfp_t gfp_mask, struct zonelist *zonelist,
					nodemask_t *nodemask)
{
	struct zoneref *z;
	struct zone *zone;
	pg_data_t *pgdat = NULL;

	/*
	 * Kernel threads should not be throttled as they may be indirectly
	 * responsible for cleaning pages necessary for reclaim to make forward
	 * progress. kjournald for example may enter direct reclaim while
	 * committing a transaction where throttling it could force other
	 * processes to block on log_wait_commit().
	 */
	if (current->flags & PF_KTHREAD)
		goto out;

	/*
	 * If a fatal signal is pending, this process should not throttle.
	 * It should return quickly so it can exit and free its memory
	 */
	if (fatal_signal_pending(current))
		goto out;

	/*
	 * Check if the pfmemalloc reserves are ok by finding the first node
	 * with a usable ZONE_NORMAL or lower zone. The expectation is that
	 * GFP_KERNEL will be required for allocating network buffers when
	 * swapping over the network so ZONE_HIGHMEM is unusable.
	 *
	 * Throttling is based on the first usable node and throttled processes
	 * wait on a queue until kswapd makes progress and wakes them. There
	 * is an affinity then between processes waking up and where reclaim
	 * progress has been made assuming the process wakes on the same node.
6939 * More importantly, processes running on remote nodes will not compete 6940 * for remote pfmemalloc reserves and processes on different nodes 6941 * should make reasonable progress. 6942 */ 6943 for_each_zone_zonelist_nodemask(zone, z, zonelist, 6944 gfp_zone(gfp_mask), nodemask) { 6945 if (zone_idx(zone) > ZONE_NORMAL) 6946 continue; 6947 6948 /* Throttle based on the first usable node */ 6949 pgdat = zone->zone_pgdat; 6950 if (allow_direct_reclaim(pgdat)) 6951 goto out; 6952 break; 6953 } 6954 6955 /* If no zone was usable by the allocation flags then do not throttle */ 6956 if (!pgdat) 6957 goto out; 6958 6959 /* Account for the throttling */ 6960 count_vm_event(PGSCAN_DIRECT_THROTTLE); 6961 6962 /* 6963 * If the caller cannot enter the filesystem, it's possible that it 6964 * is due to the caller holding an FS lock or performing a journal 6965 * transaction in the case of a filesystem like ext[3|4]. In this case, 6966 * it is not safe to block on pfmemalloc_wait as kswapd could be 6967 * blocked waiting on the same lock. Instead, throttle for up to a 6968 * second before continuing. 6969 */ 6970 if (!(gfp_mask & __GFP_FS)) 6971 wait_event_interruptible_timeout(pgdat->pfmemalloc_wait, 6972 allow_direct_reclaim(pgdat), HZ); 6973 else 6974 /* Throttle until kswapd wakes the process */ 6975 wait_event_killable(zone->zone_pgdat->pfmemalloc_wait, 6976 allow_direct_reclaim(pgdat)); 6977 6978 if (fatal_signal_pending(current)) 6979 return true; 6980 6981 out: 6982 return false; 6983 } 6984 6985 unsigned long try_to_free_pages(struct zonelist *zonelist, int order, 6986 gfp_t gfp_mask, nodemask_t *nodemask) 6987 { 6988 unsigned long nr_reclaimed; 6989 struct scan_control sc = { 6990 .nr_to_reclaim = SWAP_CLUSTER_MAX, 6991 .gfp_mask = current_gfp_context(gfp_mask), 6992 .reclaim_idx = gfp_zone(gfp_mask), 6993 .order = order, 6994 .nodemask = nodemask, 6995 .priority = DEF_PRIORITY, 6996 .may_writepage = !laptop_mode, 6997 .may_unmap = 1, 6998 .may_swap = 1, 6999 }; 7000 7001 /* 7002 * scan_control uses s8 fields for order, priority, and reclaim_idx. 7003 * Confirm they are large enough for max values. 7004 */ 7005 BUILD_BUG_ON(MAX_ORDER >= S8_MAX); 7006 BUILD_BUG_ON(DEF_PRIORITY > S8_MAX); 7007 BUILD_BUG_ON(MAX_NR_ZONES > S8_MAX); 7008 7009 /* 7010 * Do not enter reclaim if fatal signal was delivered while throttled. 7011 * 1 is returned so that the page allocator does not OOM kill at this 7012 * point. 7013 */ 7014 if (throttle_direct_reclaim(sc.gfp_mask, zonelist, nodemask)) 7015 return 1; 7016 7017 set_task_reclaim_state(current, &sc.reclaim_state); 7018 trace_mm_vmscan_direct_reclaim_begin(order, sc.gfp_mask); 7019 7020 nr_reclaimed = do_try_to_free_pages(zonelist, &sc); 7021 7022 trace_mm_vmscan_direct_reclaim_end(nr_reclaimed); 7023 set_task_reclaim_state(current, NULL); 7024 7025 return nr_reclaimed; 7026 } 7027 7028 #ifdef CONFIG_MEMCG 7029 7030 /* Only used by soft limit reclaim. Do not reuse for anything else. 
 */
unsigned long mem_cgroup_shrink_node(struct mem_cgroup *memcg,
					gfp_t gfp_mask, bool noswap,
					pg_data_t *pgdat,
					unsigned long *nr_scanned)
{
	struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat);
	struct scan_control sc = {
		.nr_to_reclaim = SWAP_CLUSTER_MAX,
		.target_mem_cgroup = memcg,
		.may_writepage = !laptop_mode,
		.may_unmap = 1,
		.reclaim_idx = MAX_NR_ZONES - 1,
		.may_swap = !noswap,
	};

	WARN_ON_ONCE(!current->reclaim_state);

	sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) |
			(GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK);

	trace_mm_vmscan_memcg_softlimit_reclaim_begin(sc.order,
						      sc.gfp_mask);

	/*
	 * NOTE: Although we can get the priority field, using it
	 * here is not a good idea, since it limits the pages we can scan.
	 * If we don't reclaim here, the shrink_node from balance_pgdat
	 * will pick up pages from other mem cgroups as well. We hack
	 * the priority and make it zero.
	 */
	shrink_lruvec(lruvec, &sc);

	trace_mm_vmscan_memcg_softlimit_reclaim_end(sc.nr_reclaimed);

	*nr_scanned = sc.nr_scanned;

	return sc.nr_reclaimed;
}

unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
					   unsigned long nr_pages,
					   gfp_t gfp_mask,
					   unsigned int reclaim_options)
{
	unsigned long nr_reclaimed;
	unsigned int noreclaim_flag;
	struct scan_control sc = {
		.nr_to_reclaim = max(nr_pages, SWAP_CLUSTER_MAX),
		.gfp_mask = (current_gfp_context(gfp_mask) & GFP_RECLAIM_MASK) |
				(GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK),
		.reclaim_idx = MAX_NR_ZONES - 1,
		.target_mem_cgroup = memcg,
		.priority = DEF_PRIORITY,
		.may_writepage = !laptop_mode,
		.may_unmap = 1,
		.may_swap = !!(reclaim_options & MEMCG_RECLAIM_MAY_SWAP),
		.proactive = !!(reclaim_options & MEMCG_RECLAIM_PROACTIVE),
	};
	/*
	 * Traverse the ZONELIST_FALLBACK zonelist of the current node to put
	 * equal pressure on all the nodes. This is based on the assumption that
	 * the reclaim does not bail out early.
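	 *
	 * Typical callers live in memcontrol.c: the charge path when a memcg
	 * hits its limit, and the memory.reclaim write handler. A proactive
	 * call looks roughly like (illustrative, not a verbatim call site):
	 *
	 *	reclaimed = try_to_free_mem_cgroup_pages(memcg, nr_pages,
	 *				GFP_KERNEL,
	 *				MEMCG_RECLAIM_MAY_SWAP |
	 *				MEMCG_RECLAIM_PROACTIVE);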
7093 */ 7094 struct zonelist *zonelist = node_zonelist(numa_node_id(), sc.gfp_mask); 7095 7096 set_task_reclaim_state(current, &sc.reclaim_state); 7097 trace_mm_vmscan_memcg_reclaim_begin(0, sc.gfp_mask); 7098 noreclaim_flag = memalloc_noreclaim_save(); 7099 7100 nr_reclaimed = do_try_to_free_pages(zonelist, &sc); 7101 7102 memalloc_noreclaim_restore(noreclaim_flag); 7103 trace_mm_vmscan_memcg_reclaim_end(nr_reclaimed); 7104 set_task_reclaim_state(current, NULL); 7105 7106 return nr_reclaimed; 7107 } 7108 #endif 7109 7110 static void kswapd_age_node(struct pglist_data *pgdat, struct scan_control *sc) 7111 { 7112 struct mem_cgroup *memcg; 7113 struct lruvec *lruvec; 7114 7115 if (lru_gen_enabled()) { 7116 lru_gen_age_node(pgdat, sc); 7117 return; 7118 } 7119 7120 if (!can_age_anon_pages(pgdat, sc)) 7121 return; 7122 7123 lruvec = mem_cgroup_lruvec(NULL, pgdat); 7124 if (!inactive_is_low(lruvec, LRU_INACTIVE_ANON)) 7125 return; 7126 7127 memcg = mem_cgroup_iter(NULL, NULL, NULL); 7128 do { 7129 lruvec = mem_cgroup_lruvec(memcg, pgdat); 7130 shrink_active_list(SWAP_CLUSTER_MAX, lruvec, 7131 sc, LRU_ACTIVE_ANON); 7132 memcg = mem_cgroup_iter(NULL, memcg, NULL); 7133 } while (memcg); 7134 } 7135 7136 static bool pgdat_watermark_boosted(pg_data_t *pgdat, int highest_zoneidx) 7137 { 7138 int i; 7139 struct zone *zone; 7140 7141 /* 7142 * Check for watermark boosts top-down as the higher zones 7143 * are more likely to be boosted. Both watermarks and boosts 7144 * should not be checked at the same time as reclaim would 7145 * start prematurely when there is no boosting and a lower 7146 * zone is balanced. 7147 */ 7148 for (i = highest_zoneidx; i >= 0; i--) { 7149 zone = pgdat->node_zones + i; 7150 if (!managed_zone(zone)) 7151 continue; 7152 7153 if (zone->watermark_boost) 7154 return true; 7155 } 7156 7157 return false; 7158 } 7159 7160 /* 7161 * Returns true if there is an eligible zone balanced for the request order 7162 * and highest_zoneidx 7163 */ 7164 static bool pgdat_balanced(pg_data_t *pgdat, int order, int highest_zoneidx) 7165 { 7166 int i; 7167 unsigned long mark = -1; 7168 struct zone *zone; 7169 7170 /* 7171 * Check watermarks bottom-up as lower zones are more likely to 7172 * meet watermarks. 7173 */ 7174 for (i = 0; i <= highest_zoneidx; i++) { 7175 zone = pgdat->node_zones + i; 7176 7177 if (!managed_zone(zone)) 7178 continue; 7179 7180 if (sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING) 7181 mark = wmark_pages(zone, WMARK_PROMO); 7182 else 7183 mark = high_wmark_pages(zone); 7184 if (zone_watermark_ok_safe(zone, order, mark, highest_zoneidx)) 7185 return true; 7186 } 7187 7188 /* 7189 * If a node has no managed zone within highest_zoneidx, it does not 7190 * need balancing by definition. This can happen if a zone-restricted 7191 * allocation tries to wake a remote kswapd. 7192 */ 7193 if (mark == -1) 7194 return true; 7195 7196 return false; 7197 } 7198 7199 /* Clear pgdat state for congested, dirty or under writeback. */ 7200 static void clear_pgdat_congested(pg_data_t *pgdat) 7201 { 7202 struct lruvec *lruvec = mem_cgroup_lruvec(NULL, pgdat); 7203 7204 clear_bit(LRUVEC_CONGESTED, &lruvec->flags); 7205 clear_bit(PGDAT_DIRTY, &pgdat->flags); 7206 clear_bit(PGDAT_WRITEBACK, &pgdat->flags); 7207 } 7208 7209 /* 7210 * Prepare kswapd for sleeping. This verifies that there are no processes 7211 * waiting in throttle_direct_reclaim() and that watermarks have been met. 
 *
 * Returns true if kswapd is ready to sleep
 */
static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order,
				 int highest_zoneidx)
{
	/*
	 * The throttled processes are normally woken up in balance_pgdat() as
	 * soon as allow_direct_reclaim() is true. But there is a potential
	 * race between when kswapd checks the watermarks and a process gets
	 * throttled. There is also a potential race if processes get
	 * throttled, kswapd wakes, a large process exits thereby balancing the
	 * zones, which causes kswapd to exit balance_pgdat() before reaching
	 * the wake up checks. If kswapd is going to sleep, no process should
	 * be sleeping on pfmemalloc_wait, so wake them now if necessary. If
	 * the wake up is premature, processes will wake kswapd and get
	 * throttled again. The difference from wake ups in balance_pgdat() is
	 * that here we are under prepare_to_wait().
	 */
	if (waitqueue_active(&pgdat->pfmemalloc_wait))
		wake_up_all(&pgdat->pfmemalloc_wait);

	/* Hopeless node, leave it to direct reclaim */
	if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES)
		return true;

	if (pgdat_balanced(pgdat, order, highest_zoneidx)) {
		clear_pgdat_congested(pgdat);
		return true;
	}

	return false;
}

/*
 * kswapd shrinks a node of pages that are at or below the highest usable
 * zone that is currently unbalanced.
 *
 * Returns true if kswapd scanned at least the requested number of pages to
 * reclaim or if the lack of progress was due to pages under writeback.
 * This is used to determine if the scanning priority needs to be raised.
 */
static bool kswapd_shrink_node(pg_data_t *pgdat,
			       struct scan_control *sc)
{
	struct zone *zone;
	int z;

	/* Reclaim a number of pages proportional to the number of zones */
	sc->nr_to_reclaim = 0;
	for (z = 0; z <= sc->reclaim_idx; z++) {
		zone = pgdat->node_zones + z;
		if (!managed_zone(zone))
			continue;

		sc->nr_to_reclaim += max(high_wmark_pages(zone), SWAP_CLUSTER_MAX);
	}

	/*
	 * Historically care was taken to put equal pressure on all zones but
	 * now pressure is applied based on node LRU order.
	 */
	shrink_node(pgdat, sc);

	/*
	 * Fragmentation may mean that the system cannot be rebalanced for
	 * high-order allocations. If twice the allocation size has been
	 * reclaimed then recheck watermarks only at order-0 to prevent
	 * excessive reclaim. Assume that a process that requested a
	 * high-order allocation can direct reclaim/compact.
	 */
	if (sc->order && sc->nr_reclaimed >= compact_gap(sc->order))
		sc->order = 0;

	return sc->nr_scanned >= sc->nr_to_reclaim;
}

/* Page allocator PCP high watermark is lowered if reclaim is active.
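 *
 * While ZONE_RECLAIM_ACTIVE is set, the page allocator caps the per-cpu
 * list high mark (see nr_pcp_high() in page_alloc.c, assuming the current
 * layout), so pages freed by reclaim drain back to the buddy lists and
 * become visible to the watermark checks sooner.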
*/ 7290 static inline void 7291 update_reclaim_active(pg_data_t *pgdat, int highest_zoneidx, bool active) 7292 { 7293 int i; 7294 struct zone *zone; 7295 7296 for (i = 0; i <= highest_zoneidx; i++) { 7297 zone = pgdat->node_zones + i; 7298 7299 if (!managed_zone(zone)) 7300 continue; 7301 7302 if (active) 7303 set_bit(ZONE_RECLAIM_ACTIVE, &zone->flags); 7304 else 7305 clear_bit(ZONE_RECLAIM_ACTIVE, &zone->flags); 7306 } 7307 } 7308 7309 static inline void 7310 set_reclaim_active(pg_data_t *pgdat, int highest_zoneidx) 7311 { 7312 update_reclaim_active(pgdat, highest_zoneidx, true); 7313 } 7314 7315 static inline void 7316 clear_reclaim_active(pg_data_t *pgdat, int highest_zoneidx) 7317 { 7318 update_reclaim_active(pgdat, highest_zoneidx, false); 7319 } 7320 7321 /* 7322 * For kswapd, balance_pgdat() will reclaim pages across a node from zones 7323 * that are eligible for use by the caller until at least one zone is 7324 * balanced. 7325 * 7326 * Returns the order kswapd finished reclaiming at. 7327 * 7328 * kswapd scans the zones in the highmem->normal->dma direction. It skips 7329 * zones which have free_pages > high_wmark_pages(zone), but once a zone is 7330 * found to have free_pages <= high_wmark_pages(zone), any page in that zone 7331 * or lower is eligible for reclaim until at least one usable zone is 7332 * balanced. 7333 */ 7334 static int balance_pgdat(pg_data_t *pgdat, int order, int highest_zoneidx) 7335 { 7336 int i; 7337 unsigned long nr_soft_reclaimed; 7338 unsigned long nr_soft_scanned; 7339 unsigned long pflags; 7340 unsigned long nr_boost_reclaim; 7341 unsigned long zone_boosts[MAX_NR_ZONES] = { 0, }; 7342 bool boosted; 7343 struct zone *zone; 7344 struct scan_control sc = { 7345 .gfp_mask = GFP_KERNEL, 7346 .order = order, 7347 .may_unmap = 1, 7348 }; 7349 7350 set_task_reclaim_state(current, &sc.reclaim_state); 7351 psi_memstall_enter(&pflags); 7352 __fs_reclaim_acquire(_THIS_IP_); 7353 7354 count_vm_event(PAGEOUTRUN); 7355 7356 /* 7357 * Account for the reclaim boost. Note that the zone boost is left in 7358 * place so that parallel allocations that are near the watermark will 7359 * stall or direct reclaim until kswapd is finished. 7360 */ 7361 nr_boost_reclaim = 0; 7362 for (i = 0; i <= highest_zoneidx; i++) { 7363 zone = pgdat->node_zones + i; 7364 if (!managed_zone(zone)) 7365 continue; 7366 7367 nr_boost_reclaim += zone->watermark_boost; 7368 zone_boosts[i] = zone->watermark_boost; 7369 } 7370 boosted = nr_boost_reclaim; 7371 7372 restart: 7373 set_reclaim_active(pgdat, highest_zoneidx); 7374 sc.priority = DEF_PRIORITY; 7375 do { 7376 unsigned long nr_reclaimed = sc.nr_reclaimed; 7377 bool raise_priority = true; 7378 bool balanced; 7379 bool ret; 7380 7381 sc.reclaim_idx = highest_zoneidx; 7382 7383 /* 7384 * If the number of buffer_heads exceeds the maximum allowed 7385 * then consider reclaiming from all zones. This has a dual 7386 * purpose -- on 64-bit systems it is expected that 7387 * buffer_heads are stripped during active rotation. On 32-bit 7388 * systems, highmem pages can pin lowmem memory and shrinking 7389 * buffers can relieve lowmem pressure. Reclaim may still not 7390 * go ahead if all eligible zones for the original allocation 7391 * request are balanced to avoid excessive reclaim from kswapd. 
7392 */ 7393 if (buffer_heads_over_limit) { 7394 for (i = MAX_NR_ZONES - 1; i >= 0; i--) { 7395 zone = pgdat->node_zones + i; 7396 if (!managed_zone(zone)) 7397 continue; 7398 7399 sc.reclaim_idx = i; 7400 break; 7401 } 7402 } 7403 7404 /* 7405 * If the pgdat is imbalanced then ignore boosting and preserve 7406 * the watermarks for a later time and restart. Note that the 7407 * zone watermarks will be still reset at the end of balancing 7408 * on the grounds that the normal reclaim should be enough to 7409 * re-evaluate if boosting is required when kswapd next wakes. 7410 */ 7411 balanced = pgdat_balanced(pgdat, sc.order, highest_zoneidx); 7412 if (!balanced && nr_boost_reclaim) { 7413 nr_boost_reclaim = 0; 7414 goto restart; 7415 } 7416 7417 /* 7418 * If boosting is not active then only reclaim if there are no 7419 * eligible zones. Note that sc.reclaim_idx is not used as 7420 * buffer_heads_over_limit may have adjusted it. 7421 */ 7422 if (!nr_boost_reclaim && balanced) 7423 goto out; 7424 7425 /* Limit the priority of boosting to avoid reclaim writeback */ 7426 if (nr_boost_reclaim && sc.priority == DEF_PRIORITY - 2) 7427 raise_priority = false; 7428 7429 /* 7430 * Do not writeback or swap pages for boosted reclaim. The 7431 * intent is to relieve pressure not issue sub-optimal IO 7432 * from reclaim context. If no pages are reclaimed, the 7433 * reclaim will be aborted. 7434 */ 7435 sc.may_writepage = !laptop_mode && !nr_boost_reclaim; 7436 sc.may_swap = !nr_boost_reclaim; 7437 7438 /* 7439 * Do some background aging, to give pages a chance to be 7440 * referenced before reclaiming. All pages are rotated 7441 * regardless of classzone as this is about consistent aging. 7442 */ 7443 kswapd_age_node(pgdat, &sc); 7444 7445 /* 7446 * If we're getting trouble reclaiming, start doing writepage 7447 * even in laptop mode. 7448 */ 7449 if (sc.priority < DEF_PRIORITY - 2) 7450 sc.may_writepage = 1; 7451 7452 /* Call soft limit reclaim before calling shrink_node. */ 7453 sc.nr_scanned = 0; 7454 nr_soft_scanned = 0; 7455 nr_soft_reclaimed = mem_cgroup_soft_limit_reclaim(pgdat, sc.order, 7456 sc.gfp_mask, &nr_soft_scanned); 7457 sc.nr_reclaimed += nr_soft_reclaimed; 7458 7459 /* 7460 * There should be no need to raise the scanning priority if 7461 * enough pages are already being scanned that that high 7462 * watermark would be met at 100% efficiency. 7463 */ 7464 if (kswapd_shrink_node(pgdat, &sc)) 7465 raise_priority = false; 7466 7467 /* 7468 * If the low watermark is met there is no need for processes 7469 * to be throttled on pfmemalloc_wait as they should not be 7470 * able to safely make forward progress. Wake them 7471 */ 7472 if (waitqueue_active(&pgdat->pfmemalloc_wait) && 7473 allow_direct_reclaim(pgdat)) 7474 wake_up_all(&pgdat->pfmemalloc_wait); 7475 7476 /* Check if kswapd should be suspending */ 7477 __fs_reclaim_release(_THIS_IP_); 7478 ret = try_to_freeze(); 7479 __fs_reclaim_acquire(_THIS_IP_); 7480 if (ret || kthread_should_stop()) 7481 break; 7482 7483 /* 7484 * Raise priority if scanning rate is too low or there was no 7485 * progress in reclaiming pages 7486 */ 7487 nr_reclaimed = sc.nr_reclaimed - nr_reclaimed; 7488 nr_boost_reclaim -= min(nr_boost_reclaim, nr_reclaimed); 7489 7490 /* 7491 * If reclaim made no progress for a boost, stop reclaim as 7492 * IO cannot be queued and it could be an infinite loop in 7493 * extreme circumstances. 
		 */
		if (nr_boost_reclaim && !nr_reclaimed)
			break;

		if (raise_priority || !nr_reclaimed)
			sc.priority--;
	} while (sc.priority >= 1);

	if (!sc.nr_reclaimed)
		pgdat->kswapd_failures++;

out:
	clear_reclaim_active(pgdat, highest_zoneidx);

	/* If reclaim was boosted, account for the reclaim done in this pass */
	if (boosted) {
		unsigned long flags;

		for (i = 0; i <= highest_zoneidx; i++) {
			if (!zone_boosts[i])
				continue;

			/* Increments are under the zone lock */
			zone = pgdat->node_zones + i;
			spin_lock_irqsave(&zone->lock, flags);
			zone->watermark_boost -= min(zone->watermark_boost, zone_boosts[i]);
			spin_unlock_irqrestore(&zone->lock, flags);
		}

		/*
		 * As there is now likely space, wake up kcompactd to
		 * defragment pageblocks.
		 */
		wakeup_kcompactd(pgdat, pageblock_order, highest_zoneidx);
	}

	snapshot_refaults(NULL, pgdat);
	__fs_reclaim_release(_THIS_IP_);
	psi_memstall_leave(&pflags);
	set_task_reclaim_state(current, NULL);

	/*
	 * Return the order kswapd stopped reclaiming at as
	 * prepare_kswapd_sleep() takes it into account. If another caller
	 * entered the allocator slow path while kswapd was awake, order will
	 * remain at the higher level.
	 */
	return sc.order;
}

/*
 * The pgdat->kswapd_highest_zoneidx is used to pass the highest zone index to
 * be reclaimed by kswapd from the waker. If the value is MAX_NR_ZONES which is
 * not a valid index then either kswapd runs for the first time or kswapd
 * couldn't sleep after the previous reclaim attempt (node is still
 * unbalanced). In that case return the zone index of the previous kswapd
 * reclaim cycle.
 */
static enum zone_type kswapd_highest_zoneidx(pg_data_t *pgdat,
					     enum zone_type prev_highest_zoneidx)
{
	enum zone_type curr_idx = READ_ONCE(pgdat->kswapd_highest_zoneidx);

	return curr_idx == MAX_NR_ZONES ? prev_highest_zoneidx : curr_idx;
}

static void kswapd_try_to_sleep(pg_data_t *pgdat, int alloc_order, int reclaim_order,
				unsigned int highest_zoneidx)
{
	long remaining = 0;
	DEFINE_WAIT(wait);

	if (freezing(current) || kthread_should_stop())
		return;

	prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE);

	/*
	 * Try to sleep for a short interval. Note that kcompactd will only be
	 * woken if it is possible to sleep for a short interval. This is
	 * deliberate on the assumption that if reclaim cannot keep an
	 * eligible zone balanced that it's also unlikely that compaction will
	 * succeed.
	 */
	if (prepare_kswapd_sleep(pgdat, reclaim_order, highest_zoneidx)) {
		/*
		 * Compaction records what page blocks it recently failed to
		 * isolate pages from and skips them in the future scanning.
		 * When kswapd is going to sleep, it is reasonable to assume
		 * that pages and compaction may succeed so reset the cache.
		 */
		reset_isolation_suitable(pgdat);

		/*
		 * We have freed the memory, now we should compact it to make
		 * allocation of the requested order possible.
		 */
		wakeup_kcompactd(pgdat, alloc_order, highest_zoneidx);

		remaining = schedule_timeout(HZ/10);

		/*
		 * If woken prematurely then reset kswapd_highest_zoneidx and
		 * order.
		 * The values will either be from a wakeup request or
		 * the previous request that slept prematurely.
		 */
		if (remaining) {
			WRITE_ONCE(pgdat->kswapd_highest_zoneidx,
					kswapd_highest_zoneidx(pgdat,
							highest_zoneidx));

			if (READ_ONCE(pgdat->kswapd_order) < reclaim_order)
				WRITE_ONCE(pgdat->kswapd_order, reclaim_order);
		}

		finish_wait(&pgdat->kswapd_wait, &wait);
		prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE);
	}

	/*
	 * After a short sleep, check if it was a premature sleep. If not, then
	 * go fully to sleep until explicitly woken up.
	 */
	if (!remaining &&
	    prepare_kswapd_sleep(pgdat, reclaim_order, highest_zoneidx)) {
		trace_mm_vmscan_kswapd_sleep(pgdat->node_id);

		/*
		 * vmstat counters are not perfectly accurate and the estimated
		 * value for counters such as NR_FREE_PAGES can deviate from the
		 * true value by nr_online_cpus * threshold. To avoid the zone
		 * watermarks being breached while under pressure, we reduce the
		 * per-cpu vmstat threshold while kswapd is awake and restore
		 * them before going back to sleep.
		 */
		set_pgdat_percpu_threshold(pgdat, calculate_normal_threshold);

		if (!kthread_should_stop())
			schedule();

		set_pgdat_percpu_threshold(pgdat, calculate_pressure_threshold);
	} else {
		if (remaining)
			count_vm_event(KSWAPD_LOW_WMARK_HIT_QUICKLY);
		else
			count_vm_event(KSWAPD_HIGH_WMARK_HIT_QUICKLY);
	}
	finish_wait(&pgdat->kswapd_wait, &wait);
}

/*
 * The background pageout daemon, started as a kernel thread
 * from the init process.
 *
 * This basically trickles out pages so that we have _some_
 * free memory available even if there is no other activity
 * that frees anything up. This is needed for things like routing
 * etc., where we otherwise might have all activity going on in
 * asynchronous contexts that cannot page things out.
 *
 * If there are applications that are active memory-allocators
 * (most normal use), this basically shouldn't matter.
 */
static int kswapd(void *p)
{
	unsigned int alloc_order, reclaim_order;
	unsigned int highest_zoneidx = MAX_NR_ZONES - 1;
	pg_data_t *pgdat = (pg_data_t *)p;
	struct task_struct *tsk = current;
	const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id);

	if (!cpumask_empty(cpumask))
		set_cpus_allowed_ptr(tsk, cpumask);

	/*
	 * Tell the memory management subsystem that we're a "memory
	 * allocator", and that if we need more memory we should get access
	 * to it regardless (see "__alloc_pages()"). "kswapd" should never
	 * get caught in the normal page freeing logic.
	 *
	 * (Kswapd normally doesn't need memory anyway, but sometimes
	 * you need a small amount of memory in order to be able to
	 * page out something else, and this flag essentially protects
	 * us from recursively trying to free more memory as we're
	 * trying to free the first piece of memory in the first place.)
	 */
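	/*
	 * PF_KSWAPD additionally identifies this task to the rest of the MM
	 * (see current_is_kswapd()), so reclaim and writeback heuristics can
	 * treat kswapd differently from direct reclaimers.
	 */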
	tsk->flags |= PF_MEMALLOC | PF_KSWAPD;
	set_freezable();

	WRITE_ONCE(pgdat->kswapd_order, 0);
	WRITE_ONCE(pgdat->kswapd_highest_zoneidx, MAX_NR_ZONES);
	atomic_set(&pgdat->nr_writeback_throttled, 0);
	for ( ; ; ) {
		bool ret;

		alloc_order = reclaim_order = READ_ONCE(pgdat->kswapd_order);
		highest_zoneidx = kswapd_highest_zoneidx(pgdat,
							 highest_zoneidx);

kswapd_try_sleep:
		kswapd_try_to_sleep(pgdat, alloc_order, reclaim_order,
				    highest_zoneidx);

		/* Read the new order and highest_zoneidx */
		alloc_order = READ_ONCE(pgdat->kswapd_order);
		highest_zoneidx = kswapd_highest_zoneidx(pgdat,
							 highest_zoneidx);
		WRITE_ONCE(pgdat->kswapd_order, 0);
		WRITE_ONCE(pgdat->kswapd_highest_zoneidx, MAX_NR_ZONES);

		ret = try_to_freeze();
		if (kthread_should_stop())
			break;

		/*
		 * We can speed up thawing tasks if we don't call balance_pgdat
		 * after returning from the refrigerator
		 */
		if (ret)
			continue;

		/*
		 * Reclaim begins at the requested order but if a high-order
		 * reclaim fails then kswapd falls back to reclaiming for
		 * order-0. If that happens, kswapd will consider sleeping
		 * for the order it finished reclaiming at (reclaim_order)
		 * but kcompactd is woken to compact for the original
		 * request (alloc_order).
		 */
		trace_mm_vmscan_kswapd_wake(pgdat->node_id, highest_zoneidx,
					    alloc_order);
		reclaim_order = balance_pgdat(pgdat, alloc_order,
					      highest_zoneidx);
		if (reclaim_order < alloc_order)
			goto kswapd_try_sleep;
	}

	tsk->flags &= ~(PF_MEMALLOC | PF_KSWAPD);

	return 0;
}

/*
 * A zone is low on free memory or too fragmented for high-order memory. If
 * kswapd should reclaim (direct reclaim is deferred), wake it up for the zone's
 * pgdat. It will wake up kcompactd after reclaiming memory. If kswapd reclaim
 * has failed or is not needed, still wake up kcompactd if only compaction is
 * needed.
 */
void wakeup_kswapd(struct zone *zone, gfp_t gfp_flags, int order,
		   enum zone_type highest_zoneidx)
{
	pg_data_t *pgdat;
	enum zone_type curr_idx;

	if (!managed_zone(zone))
		return;

	if (!cpuset_zone_allowed(zone, gfp_flags))
		return;

	pgdat = zone->zone_pgdat;
	curr_idx = READ_ONCE(pgdat->kswapd_highest_zoneidx);

	if (curr_idx == MAX_NR_ZONES || curr_idx < highest_zoneidx)
		WRITE_ONCE(pgdat->kswapd_highest_zoneidx, highest_zoneidx);

	if (READ_ONCE(pgdat->kswapd_order) < order)
		WRITE_ONCE(pgdat->kswapd_order, order);

	if (!waitqueue_active(&pgdat->kswapd_wait))
		return;

	/* Hopeless node, leave it to direct reclaim if possible */
	if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES ||
	    (pgdat_balanced(pgdat, order, highest_zoneidx) &&
	     !pgdat_watermark_boosted(pgdat, highest_zoneidx))) {
		/*
		 * There may be plenty of free memory available, but it's too
		 * fragmented for high-order allocations. Wake up kcompactd
		 * and rely on compaction_suitable() to determine if it's
		 * needed. If it fails, it will defer subsequent attempts to
		 * ratelimit its work.
		 */
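		/*
		 * Note: kcompactd is woken here only on behalf of callers
		 * that cannot enter direct reclaim (and therefore cannot try
		 * direct compaction) themselves; blocking callers are
		 * expected to fall back to compaction in the allocator slow
		 * path instead.
		 */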
		if (!(gfp_flags & __GFP_DIRECT_RECLAIM))
			wakeup_kcompactd(pgdat, order, highest_zoneidx);
		return;
	}

	trace_mm_vmscan_wakeup_kswapd(pgdat->node_id, highest_zoneidx, order,
				      gfp_flags);
	wake_up_interruptible(&pgdat->kswapd_wait);
}

#ifdef CONFIG_HIBERNATION
/*
 * Try to free `nr_to_reclaim' pages of memory, system-wide, and return the
 * number of pages freed.
 *
 * Rather than trying to age LRUs the aim is to preserve the overall
 * LRU order by reclaiming preferentially
 * inactive > active > active referenced > active mapped
 */
unsigned long shrink_all_memory(unsigned long nr_to_reclaim)
{
	struct scan_control sc = {
		.nr_to_reclaim = nr_to_reclaim,
		.gfp_mask = GFP_HIGHUSER_MOVABLE,
		.reclaim_idx = MAX_NR_ZONES - 1,
		.priority = DEF_PRIORITY,
		.may_writepage = 1,
		.may_unmap = 1,
		.may_swap = 1,
		.hibernation_mode = 1,
	};
	struct zonelist *zonelist = node_zonelist(numa_node_id(), sc.gfp_mask);
	unsigned long nr_reclaimed;
	unsigned int noreclaim_flag;

	fs_reclaim_acquire(sc.gfp_mask);
	noreclaim_flag = memalloc_noreclaim_save();
	set_task_reclaim_state(current, &sc.reclaim_state);

	nr_reclaimed = do_try_to_free_pages(zonelist, &sc);

	set_task_reclaim_state(current, NULL);
	memalloc_noreclaim_restore(noreclaim_flag);
	fs_reclaim_release(sc.gfp_mask);

	return nr_reclaimed;
}
#endif /* CONFIG_HIBERNATION */

/*
 * This kswapd start function will be called by init and by node hot-add.
 */
void kswapd_run(int nid)
{
	pg_data_t *pgdat = NODE_DATA(nid);

	pgdat_kswapd_lock(pgdat);
	if (!pgdat->kswapd) {
		pgdat->kswapd = kthread_run(kswapd, pgdat, "kswapd%d", nid);
		if (IS_ERR(pgdat->kswapd)) {
			/* failure at boot is fatal */
			BUG_ON(system_state < SYSTEM_RUNNING);
			pr_err("Failed to start kswapd on node %d\n", nid);
			pgdat->kswapd = NULL;
		}
	}
	pgdat_kswapd_unlock(pgdat);
}

/*
 * Called by memory hotplug when all memory in a node is offlined. Caller must
 * be holding mem_hotplug_begin/done().
 */
void kswapd_stop(int nid)
{
	pg_data_t *pgdat = NODE_DATA(nid);
	struct task_struct *kswapd;

	pgdat_kswapd_lock(pgdat);
	kswapd = pgdat->kswapd;
	if (kswapd) {
		kthread_stop(kswapd);
		pgdat->kswapd = NULL;
	}
	pgdat_kswapd_unlock(pgdat);
}

static int __init kswapd_init(void)
{
	int nid;

	swap_setup();
	for_each_node_state(nid, N_MEMORY)
		kswapd_run(nid);
	return 0;
}

module_init(kswapd_init)

#ifdef CONFIG_NUMA
/*
 * Node reclaim mode
 *
 * If non-zero, call node_reclaim() when the number of free pages falls below
 * the watermarks.
 */
int node_reclaim_mode __read_mostly;

/*
 * Priority for NODE_RECLAIM. This determines the fraction of the node's pages
 * scanned in each reclaim pass. A priority of 4 scans 1/16th of the node.
 */
#define NODE_RECLAIM_PRIORITY 4

/*
 * Percentage of a node's pages that must be unmapped for node_reclaim to
 * occur.
 */
int sysctl_min_unmapped_ratio = 1;

/*
 * If the number of slab pages in a node grows beyond this percentage then
 * slab reclaim needs to occur.
 */
int sysctl_min_slab_ratio = 5;

static inline unsigned long node_unmapped_file_pages(struct pglist_data *pgdat)
{
	unsigned long file_mapped = node_page_state(pgdat, NR_FILE_MAPPED);
	unsigned long file_lru = node_page_state(pgdat, NR_INACTIVE_FILE) +
		node_page_state(pgdat, NR_ACTIVE_FILE);

	/*
	 * It's possible for there to be more file mapped pages than
	 * accounted for by the pages on the file LRU lists because
	 * tmpfs pages accounted for as ANON can also be FILE_MAPPED.
	 */
	return (file_lru > file_mapped) ? (file_lru - file_mapped) : 0;
}

/* Work out how many page cache pages we can reclaim in this reclaim_mode */
static unsigned long node_pagecache_reclaimable(struct pglist_data *pgdat)
{
	unsigned long nr_pagecache_reclaimable;
	unsigned long delta = 0;

	/*
	 * If RECLAIM_UNMAP is set, then all file pages are considered
	 * potentially reclaimable. Otherwise, we have to worry about
	 * pages like swapcache and node_unmapped_file_pages() provides
	 * a better estimate.
	 */
	if (node_reclaim_mode & RECLAIM_UNMAP)
		nr_pagecache_reclaimable = node_page_state(pgdat, NR_FILE_PAGES);
	else
		nr_pagecache_reclaimable = node_unmapped_file_pages(pgdat);

	/* If we can't clean pages, remove dirty pages from consideration */
	if (!(node_reclaim_mode & RECLAIM_WRITE))
		delta += node_page_state(pgdat, NR_FILE_DIRTY);

	/* Watch for any possible underflows due to delta */
	if (unlikely(delta > nr_pagecache_reclaimable))
		delta = nr_pagecache_reclaimable;

	return nr_pagecache_reclaimable - delta;
}

/*
 * Try to free up some pages from this node through reclaim.
 */
static int __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned int order)
{
	/* Minimum pages needed in order to stay on node */
	const unsigned long nr_pages = 1 << order;
	struct task_struct *p = current;
	unsigned int noreclaim_flag;
	struct scan_control sc = {
		.nr_to_reclaim = max(nr_pages, SWAP_CLUSTER_MAX),
		.gfp_mask = current_gfp_context(gfp_mask),
		.order = order,
		.priority = NODE_RECLAIM_PRIORITY,
		.may_writepage = !!(node_reclaim_mode & RECLAIM_WRITE),
		.may_unmap = !!(node_reclaim_mode & RECLAIM_UNMAP),
		.may_swap = 1,
		.reclaim_idx = gfp_zone(gfp_mask),
	};
	unsigned long pflags;

	trace_mm_vmscan_node_reclaim_begin(pgdat->node_id, order,
					   sc.gfp_mask);

	cond_resched();
	psi_memstall_enter(&pflags);
	fs_reclaim_acquire(sc.gfp_mask);
	/*
	 * We need to be able to allocate from the reserves for RECLAIM_UNMAP.
	 */
	noreclaim_flag = memalloc_noreclaim_save();
	set_task_reclaim_state(p, &sc.reclaim_state);

	if (node_pagecache_reclaimable(pgdat) > pgdat->min_unmapped_pages ||
	    node_page_state_pages(pgdat, NR_SLAB_RECLAIMABLE_B) > pgdat->min_slab_pages) {
		/*
		 * Free memory by calling shrink_node() with increasing
		 * reclaim pressure until we have freed enough memory.
		 */
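		/*
		 * Each decrement of sc.priority roughly doubles the number of
		 * pages scanned per pass (the scan target is approximately
		 * LRU size >> sc.priority), so the early passes starting at
		 * NODE_RECLAIM_PRIORITY only touch a small fraction of the
		 * node.
		 */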
		do {
			shrink_node(pgdat, &sc);
		} while (sc.nr_reclaimed < nr_pages && --sc.priority >= 0);
	}

	set_task_reclaim_state(p, NULL);
	memalloc_noreclaim_restore(noreclaim_flag);
	fs_reclaim_release(sc.gfp_mask);
	psi_memstall_leave(&pflags);

	trace_mm_vmscan_node_reclaim_end(sc.nr_reclaimed);

	return sc.nr_reclaimed >= nr_pages;
}

int node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned int order)
{
	int ret;

	/*
	 * Node reclaim reclaims unmapped file backed pages and
	 * slab pages if we are over the defined limits.
	 *
	 * A small portion of unmapped file backed pages is needed for
	 * file I/O, otherwise pages read by file I/O will be immediately
	 * thrown out if the node is overallocated. So we do not reclaim
	 * if less than a specified percentage of the node is used by
	 * unmapped file backed pages.
	 */
	if (node_pagecache_reclaimable(pgdat) <= pgdat->min_unmapped_pages &&
	    node_page_state_pages(pgdat, NR_SLAB_RECLAIMABLE_B) <=
	    pgdat->min_slab_pages)
		return NODE_RECLAIM_FULL;

	/*
	 * Do not scan if the allocation should not be delayed.
	 */
	if (!gfpflags_allow_blocking(gfp_mask) || (current->flags & PF_MEMALLOC))
		return NODE_RECLAIM_NOSCAN;

	/*
	 * Only run node reclaim on the local node or on nodes that do not
	 * have associated processors. This will favor the local processor
	 * over remote processors and spread off-node memory allocations
	 * as widely as possible.
	 */
	if (node_state(pgdat->node_id, N_CPU) && pgdat->node_id != numa_node_id())
		return NODE_RECLAIM_NOSCAN;

	if (test_and_set_bit(PGDAT_RECLAIM_LOCKED, &pgdat->flags))
		return NODE_RECLAIM_NOSCAN;

	ret = __node_reclaim(pgdat, gfp_mask, order);
	clear_bit(PGDAT_RECLAIM_LOCKED, &pgdat->flags);

	if (!ret)
		count_vm_event(PGSCAN_ZONE_RECLAIM_FAILED);

	return ret;
}
#endif /* CONFIG_NUMA */

/*
 * Pagevec-based wrapper: skip tail pages and hand the folios to
 * check_move_unevictable_folios().
 */
void check_move_unevictable_pages(struct pagevec *pvec)
{
	struct folio_batch fbatch;
	unsigned i;

	folio_batch_init(&fbatch);
	for (i = 0; i < pvec->nr; i++) {
		struct page *page = pvec->pages[i];

		if (PageTransTail(page))
			continue;
		folio_batch_add(&fbatch, page_folio(page));
	}
	check_move_unevictable_folios(&fbatch);
}
EXPORT_SYMBOL_GPL(check_move_unevictable_pages);

/**
 * check_move_unevictable_folios - Move evictable folios to appropriate zone
 * lru list
 * @fbatch: Batch of lru folios to check.
 *
 * Checks folios for evictability; if an evictable folio is on the unevictable
 * LRU list, it is moved to the appropriate evictable LRU list. This function
 * should only be used for LRU folios.
 */
void check_move_unevictable_folios(struct folio_batch *fbatch)
{
	struct lruvec *lruvec = NULL;
	int pgscanned = 0;
	int pgrescued = 0;
	int i;

	for (i = 0; i < fbatch->nr; i++) {
		struct folio *folio = fbatch->folios[i];
		int nr_pages = folio_nr_pages(folio);

		pgscanned += nr_pages;

		/* block memcg migration while the folio moves between lrus */
		if (!folio_test_clear_lru(folio))
			continue;

		lruvec = folio_lruvec_relock_irq(folio, lruvec);
		if (folio_evictable(folio) && folio_test_unevictable(folio)) {
			lruvec_del_folio(lruvec, folio);
			folio_clear_unevictable(folio);
			lruvec_add_folio(lruvec, folio);
			pgrescued += nr_pages;
		}
		folio_set_lru(folio);
	}

	if (lruvec) {
		__count_vm_events(UNEVICTABLE_PGRESCUED, pgrescued);
		__count_vm_events(UNEVICTABLE_PGSCANNED, pgscanned);
		unlock_page_lruvec_irq(lruvec);
	} else if (pgscanned) {
		count_vm_events(UNEVICTABLE_PGSCANNED, pgscanned);
	}
}
EXPORT_SYMBOL_GPL(check_move_unevictable_folios);