1 // SPDX-License-Identifier: GPL-2.0 2 /* 3 * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds 4 * 5 * Swap reorganised 29.12.95, Stephen Tweedie. 6 * kswapd added: 7.1.96 sct 7 * Removed kswapd_ctl limits, and swap out as many pages as needed 8 * to bring the system back to freepages.high: 2.4.97, Rik van Riel. 9 * Zone aware kswapd started 02/00, Kanoj Sarcar (kanoj@sgi.com). 10 * Multiqueue VM started 5.8.00, Rik van Riel. 11 */ 12 13 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt 14 15 #include <linux/mm.h> 16 #include <linux/sched/mm.h> 17 #include <linux/module.h> 18 #include <linux/gfp.h> 19 #include <linux/kernel_stat.h> 20 #include <linux/swap.h> 21 #include <linux/pagemap.h> 22 #include <linux/init.h> 23 #include <linux/highmem.h> 24 #include <linux/vmpressure.h> 25 #include <linux/vmstat.h> 26 #include <linux/file.h> 27 #include <linux/writeback.h> 28 #include <linux/blkdev.h> 29 #include <linux/buffer_head.h> /* for buffer_heads_over_limit */ 30 #include <linux/mm_inline.h> 31 #include <linux/backing-dev.h> 32 #include <linux/rmap.h> 33 #include <linux/topology.h> 34 #include <linux/cpu.h> 35 #include <linux/cpuset.h> 36 #include <linux/compaction.h> 37 #include <linux/notifier.h> 38 #include <linux/rwsem.h> 39 #include <linux/delay.h> 40 #include <linux/kthread.h> 41 #include <linux/freezer.h> 42 #include <linux/memcontrol.h> 43 #include <linux/migrate.h> 44 #include <linux/delayacct.h> 45 #include <linux/sysctl.h> 46 #include <linux/memory-tiers.h> 47 #include <linux/oom.h> 48 #include <linux/pagevec.h> 49 #include <linux/prefetch.h> 50 #include <linux/printk.h> 51 #include <linux/dax.h> 52 #include <linux/psi.h> 53 #include <linux/pagewalk.h> 54 #include <linux/shmem_fs.h> 55 #include <linux/ctype.h> 56 #include <linux/debugfs.h> 57 #include <linux/khugepaged.h> 58 #include <linux/rculist_nulls.h> 59 #include <linux/random.h> 60 61 #include <asm/tlbflush.h> 62 #include <asm/div64.h> 63 64 #include <linux/swapops.h> 65 #include <linux/balloon_compaction.h> 66 #include <linux/sched/sysctl.h> 67 68 #include "internal.h" 69 #include "swap.h" 70 71 #define CREATE_TRACE_POINTS 72 #include <trace/events/vmscan.h> 73 74 struct scan_control { 75 /* How many pages shrink_list() should reclaim */ 76 unsigned long nr_to_reclaim; 77 78 /* 79 * Nodemask of nodes allowed by the caller. If NULL, all nodes 80 * are scanned. 81 */ 82 nodemask_t *nodemask; 83 84 /* 85 * The memory cgroup that hit its limit and as a result is the 86 * primary target of this reclaim invocation. 87 */ 88 struct mem_cgroup *target_mem_cgroup; 89 90 /* 91 * Scan pressure balancing between anon and file LRUs 92 */ 93 unsigned long anon_cost; 94 unsigned long file_cost; 95 96 /* Can active folios be deactivated as part of reclaim? */ 97 #define DEACTIVATE_ANON 1 98 #define DEACTIVATE_FILE 2 99 unsigned int may_deactivate:2; 100 unsigned int force_deactivate:1; 101 unsigned int skipped_deactivate:1; 102 103 /* Writepage batching in laptop mode; RECLAIM_WRITE */ 104 unsigned int may_writepage:1; 105 106 /* Can mapped folios be reclaimed? */ 107 unsigned int may_unmap:1; 108 109 /* Can folios be swapped as part of reclaim? */ 110 unsigned int may_swap:1; 111 112 /* Proactive reclaim invoked by userspace through memory.reclaim */ 113 unsigned int proactive:1; 114 115 /* 116 * Cgroup memory below memory.low is protected as long as we 117 * don't threaten to OOM. If any cgroup is reclaimed at 118 * reduced force or passed over entirely due to its memory.low 119 * setting (memcg_low_skipped), and nothing is reclaimed as a 120 * result, then go back for one more cycle that reclaims the protected 121 * memory (memcg_low_reclaim) to avert OOM. 122 */ 123 unsigned int memcg_low_reclaim:1; 124 unsigned int memcg_low_skipped:1; 125 126 unsigned int hibernation_mode:1; 127 128 /* One of the zones is ready for compaction */ 129 unsigned int compaction_ready:1; 130 131 /* There is easily reclaimable cold cache in the current node */ 132 unsigned int cache_trim_mode:1; 133 134 /* The file folios on the current node are dangerously low */ 135 unsigned int file_is_tiny:1; 136 137 /* Always discard instead of demoting to lower tier memory */ 138 unsigned int no_demotion:1; 139 140 /* Allocation order */ 141 s8 order; 142 143 /* Scan (total_size >> priority) pages at once */ 144 s8 priority; 145 146 /* The highest zone to isolate folios for reclaim from */ 147 s8 reclaim_idx; 148 149 /* This context's GFP mask */ 150 gfp_t gfp_mask; 151 152 /* Incremented by the number of inactive pages that were scanned */ 153 unsigned long nr_scanned; 154 155 /* Number of pages freed so far during a call to shrink_zones() */ 156 unsigned long nr_reclaimed; 157 158 struct { 159 unsigned int dirty; 160 unsigned int unqueued_dirty; 161 unsigned int congested; 162 unsigned int writeback; 163 unsigned int immediate; 164 unsigned int file_taken; 165 unsigned int taken; 166 } nr; 167 168 /* for recording the reclaimed slab by now */ 169 struct reclaim_state reclaim_state; 170 }; 171 172 #ifdef ARCH_HAS_PREFETCHW 173 #define prefetchw_prev_lru_folio(_folio, _base, _field) \ 174 do { \ 175 if ((_folio)->lru.prev != _base) { \ 176 struct folio *prev; \ 177 \ 178 prev = lru_to_folio(&(_folio->lru)); \ 179 prefetchw(&prev->_field); \ 180 } \ 181 } while (0) 182 #else 183 #define prefetchw_prev_lru_folio(_folio, _base, _field) do { } while (0) 184 #endif 185 186 /* 187 * From 0 .. 200. Higher means more swappy. 188 */ 189 int vm_swappiness = 60; 190 191 static void set_task_reclaim_state(struct task_struct *task, 192 struct reclaim_state *rs) 193 { 194 /* Check for an overwrite */ 195 WARN_ON_ONCE(rs && task->reclaim_state); 196 197 /* Check for the nulling of an already-nulled member */ 198 WARN_ON_ONCE(!rs && !task->reclaim_state); 199 200 task->reclaim_state = rs; 201 } 202 203 LIST_HEAD(shrinker_list); 204 DECLARE_RWSEM(shrinker_rwsem); 205 206 #ifdef CONFIG_MEMCG 207 static int shrinker_nr_max; 208 209 /* The shrinker_info is expanded in a batch of BITS_PER_LONG */ 210 static inline int shrinker_map_size(int nr_items) 211 { 212 return (DIV_ROUND_UP(nr_items, BITS_PER_LONG) * sizeof(unsigned long)); 213 } 214 215 static inline int shrinker_defer_size(int nr_items) 216 { 217 return (round_up(nr_items, BITS_PER_LONG) * sizeof(atomic_long_t)); 218 } 219 220 static struct shrinker_info *shrinker_info_protected(struct mem_cgroup *memcg, 221 int nid) 222 { 223 return rcu_dereference_protected(memcg->nodeinfo[nid]->shrinker_info, 224 lockdep_is_held(&shrinker_rwsem)); 225 } 226 227 static int expand_one_shrinker_info(struct mem_cgroup *memcg, 228 int map_size, int defer_size, 229 int old_map_size, int old_defer_size) 230 { 231 struct shrinker_info *new, *old; 232 struct mem_cgroup_per_node *pn; 233 int nid; 234 int size = map_size + defer_size; 235 236 for_each_node(nid) { 237 pn = memcg->nodeinfo[nid]; 238 old = shrinker_info_protected(memcg, nid); 239 /* Not yet online memcg */ 240 if (!old) 241 return 0; 242 243 new = kvmalloc_node(sizeof(*new) + size, GFP_KERNEL, nid); 244 if (!new) 245 return -ENOMEM; 246 247 new->nr_deferred = (atomic_long_t *)(new + 1); 248 new->map = (void *)new->nr_deferred + defer_size; 249 250 /* map: set all old bits, clear all new bits */ 251 memset(new->map, (int)0xff, old_map_size); 252 memset((void *)new->map + old_map_size, 0, map_size - old_map_size); 253 /* nr_deferred: copy old values, clear all new values */ 254 memcpy(new->nr_deferred, old->nr_deferred, old_defer_size); 255 memset((void *)new->nr_deferred + old_defer_size, 0, 256 defer_size - old_defer_size); 257 258 rcu_assign_pointer(pn->shrinker_info, new); 259 kvfree_rcu(old, rcu); 260 } 261 262 return 0; 263 } 264 265 void free_shrinker_info(struct mem_cgroup *memcg) 266 { 267 struct mem_cgroup_per_node *pn; 268 struct shrinker_info *info; 269 int nid; 270 271 for_each_node(nid) { 272 pn = memcg->nodeinfo[nid]; 273 info = rcu_dereference_protected(pn->shrinker_info, true); 274 kvfree(info); 275 rcu_assign_pointer(pn->shrinker_info, NULL); 276 } 277 } 278 279 int alloc_shrinker_info(struct mem_cgroup *memcg) 280 { 281 struct shrinker_info *info; 282 int nid, size, ret = 0; 283 int map_size, defer_size = 0; 284 285 down_write(&shrinker_rwsem); 286 map_size = shrinker_map_size(shrinker_nr_max); 287 defer_size = shrinker_defer_size(shrinker_nr_max); 288 size = map_size + defer_size; 289 for_each_node(nid) { 290 info = kvzalloc_node(sizeof(*info) + size, GFP_KERNEL, nid); 291 if (!info) { 292 free_shrinker_info(memcg); 293 ret = -ENOMEM; 294 break; 295 } 296 info->nr_deferred = (atomic_long_t *)(info + 1); 297 info->map = (void *)info->nr_deferred + defer_size; 298 rcu_assign_pointer(memcg->nodeinfo[nid]->shrinker_info, info); 299 } 300 up_write(&shrinker_rwsem); 301 302 return ret; 303 } 304 305 static inline bool need_expand(int nr_max) 306 { 307 return round_up(nr_max, BITS_PER_LONG) > 308 round_up(shrinker_nr_max, BITS_PER_LONG); 309 } 310 311 static int expand_shrinker_info(int new_id) 312 { 313 int ret = 0; 314 int new_nr_max = new_id + 1; 315 int map_size, defer_size = 0; 316 int old_map_size, old_defer_size = 0; 317 struct mem_cgroup *memcg; 318 319 if (!need_expand(new_nr_max)) 320 goto out; 321 322 if (!root_mem_cgroup) 323 goto out; 324 325 lockdep_assert_held(&shrinker_rwsem); 326 327 map_size = shrinker_map_size(new_nr_max); 328 defer_size = shrinker_defer_size(new_nr_max); 329 old_map_size = shrinker_map_size(shrinker_nr_max); 330 old_defer_size = shrinker_defer_size(shrinker_nr_max); 331 332 memcg = mem_cgroup_iter(NULL, NULL, NULL); 333 do { 334 ret = expand_one_shrinker_info(memcg, map_size, defer_size, 335 old_map_size, old_defer_size); 336 if (ret) { 337 mem_cgroup_iter_break(NULL, memcg); 338 goto out; 339 } 340 } while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)) != NULL); 341 out: 342 if (!ret) 343 shrinker_nr_max = new_nr_max; 344 345 return ret; 346 } 347 348 void set_shrinker_bit(struct mem_cgroup *memcg, int nid, int shrinker_id) 349 { 350 if (shrinker_id >= 0 && memcg && !mem_cgroup_is_root(memcg)) { 351 struct shrinker_info *info; 352 353 rcu_read_lock(); 354 info = rcu_dereference(memcg->nodeinfo[nid]->shrinker_info); 355 /* Pairs with smp mb in shrink_slab() */ 356 smp_mb__before_atomic(); 357 set_bit(shrinker_id, info->map); 358 rcu_read_unlock(); 359 } 360 } 361 362 static DEFINE_IDR(shrinker_idr); 363 364 static int prealloc_memcg_shrinker(struct shrinker *shrinker) 365 { 366 int id, ret = -ENOMEM; 367 368 if (mem_cgroup_disabled()) 369 return -ENOSYS; 370 371 down_write(&shrinker_rwsem); 372 /* This may call shrinker, so it must use down_read_trylock() */ 373 id = idr_alloc(&shrinker_idr, shrinker, 0, 0, GFP_KERNEL); 374 if (id < 0) 375 goto unlock; 376 377 if (id >= shrinker_nr_max) { 378 if (expand_shrinker_info(id)) { 379 idr_remove(&shrinker_idr, id); 380 goto unlock; 381 } 382 } 383 shrinker->id = id; 384 ret = 0; 385 unlock: 386 up_write(&shrinker_rwsem); 387 return ret; 388 } 389 390 static void unregister_memcg_shrinker(struct shrinker *shrinker) 391 { 392 int id = shrinker->id; 393 394 BUG_ON(id < 0); 395 396 lockdep_assert_held(&shrinker_rwsem); 397 398 idr_remove(&shrinker_idr, id); 399 } 400 401 static long xchg_nr_deferred_memcg(int nid, struct shrinker *shrinker, 402 struct mem_cgroup *memcg) 403 { 404 struct shrinker_info *info; 405 406 info = shrinker_info_protected(memcg, nid); 407 return atomic_long_xchg(&info->nr_deferred[shrinker->id], 0); 408 } 409 410 static long add_nr_deferred_memcg(long nr, int nid, struct shrinker *shrinker, 411 struct mem_cgroup *memcg) 412 { 413 struct shrinker_info *info; 414 415 info = shrinker_info_protected(memcg, nid); 416 return atomic_long_add_return(nr, &info->nr_deferred[shrinker->id]); 417 } 418 419 void reparent_shrinker_deferred(struct mem_cgroup *memcg) 420 { 421 int i, nid; 422 long nr; 423 struct mem_cgroup *parent; 424 struct shrinker_info *child_info, *parent_info; 425 426 parent = parent_mem_cgroup(memcg); 427 if (!parent) 428 parent = root_mem_cgroup; 429 430 /* Prevent from concurrent shrinker_info expand */ 431 down_read(&shrinker_rwsem); 432 for_each_node(nid) { 433 child_info = shrinker_info_protected(memcg, nid); 434 parent_info = shrinker_info_protected(parent, nid); 435 for (i = 0; i < shrinker_nr_max; i++) { 436 nr = atomic_long_read(&child_info->nr_deferred[i]); 437 atomic_long_add(nr, &parent_info->nr_deferred[i]); 438 } 439 } 440 up_read(&shrinker_rwsem); 441 } 442 443 static bool cgroup_reclaim(struct scan_control *sc) 444 { 445 return sc->target_mem_cgroup; 446 } 447 448 static bool global_reclaim(struct scan_control *sc) 449 { 450 return !sc->target_mem_cgroup || mem_cgroup_is_root(sc->target_mem_cgroup); 451 } 452 453 /** 454 * writeback_throttling_sane - is the usual dirty throttling mechanism available? 455 * @sc: scan_control in question 456 * 457 * The normal page dirty throttling mechanism in balance_dirty_pages() is 458 * completely broken with the legacy memcg and direct stalling in 459 * shrink_folio_list() is used for throttling instead, which lacks all the 460 * niceties such as fairness, adaptive pausing, bandwidth proportional 461 * allocation and configurability. 462 * 463 * This function tests whether the vmscan currently in progress can assume 464 * that the normal dirty throttling mechanism is operational. 465 */ 466 static bool writeback_throttling_sane(struct scan_control *sc) 467 { 468 if (!cgroup_reclaim(sc)) 469 return true; 470 #ifdef CONFIG_CGROUP_WRITEBACK 471 if (cgroup_subsys_on_dfl(memory_cgrp_subsys)) 472 return true; 473 #endif 474 return false; 475 } 476 #else 477 static int prealloc_memcg_shrinker(struct shrinker *shrinker) 478 { 479 return -ENOSYS; 480 } 481 482 static void unregister_memcg_shrinker(struct shrinker *shrinker) 483 { 484 } 485 486 static long xchg_nr_deferred_memcg(int nid, struct shrinker *shrinker, 487 struct mem_cgroup *memcg) 488 { 489 return 0; 490 } 491 492 static long add_nr_deferred_memcg(long nr, int nid, struct shrinker *shrinker, 493 struct mem_cgroup *memcg) 494 { 495 return 0; 496 } 497 498 static bool cgroup_reclaim(struct scan_control *sc) 499 { 500 return false; 501 } 502 503 static bool global_reclaim(struct scan_control *sc) 504 { 505 return true; 506 } 507 508 static bool writeback_throttling_sane(struct scan_control *sc) 509 { 510 return true; 511 } 512 #endif 513 514 static long xchg_nr_deferred(struct shrinker *shrinker, 515 struct shrink_control *sc) 516 { 517 int nid = sc->nid; 518 519 if (!(shrinker->flags & SHRINKER_NUMA_AWARE)) 520 nid = 0; 521 522 if (sc->memcg && 523 (shrinker->flags & SHRINKER_MEMCG_AWARE)) 524 return xchg_nr_deferred_memcg(nid, shrinker, 525 sc->memcg); 526 527 return atomic_long_xchg(&shrinker->nr_deferred[nid], 0); 528 } 529 530 531 static long add_nr_deferred(long nr, struct shrinker *shrinker, 532 struct shrink_control *sc) 533 { 534 int nid = sc->nid; 535 536 if (!(shrinker->flags & SHRINKER_NUMA_AWARE)) 537 nid = 0; 538 539 if (sc->memcg && 540 (shrinker->flags & SHRINKER_MEMCG_AWARE)) 541 return add_nr_deferred_memcg(nr, nid, shrinker, 542 sc->memcg); 543 544 return atomic_long_add_return(nr, &shrinker->nr_deferred[nid]); 545 } 546 547 static bool can_demote(int nid, struct scan_control *sc) 548 { 549 if (!numa_demotion_enabled) 550 return false; 551 if (sc && sc->no_demotion) 552 return false; 553 if (next_demotion_node(nid) == NUMA_NO_NODE) 554 return false; 555 556 return true; 557 } 558 559 static inline bool can_reclaim_anon_pages(struct mem_cgroup *memcg, 560 int nid, 561 struct scan_control *sc) 562 { 563 if (memcg == NULL) { 564 /* 565 * For non-memcg reclaim, is there 566 * space in any swap device? 567 */ 568 if (get_nr_swap_pages() > 0) 569 return true; 570 } else { 571 /* Is the memcg below its swap limit? */ 572 if (mem_cgroup_get_nr_swap_pages(memcg) > 0) 573 return true; 574 } 575 576 /* 577 * The page can not be swapped. 578 * 579 * Can it be reclaimed from this node via demotion? 580 */ 581 return can_demote(nid, sc); 582 } 583 584 /* 585 * This misses isolated folios which are not accounted for to save counters. 586 * As the data only determines if reclaim or compaction continues, it is 587 * not expected that isolated folios will be a dominating factor. 588 */ 589 unsigned long zone_reclaimable_pages(struct zone *zone) 590 { 591 unsigned long nr; 592 593 nr = zone_page_state_snapshot(zone, NR_ZONE_INACTIVE_FILE) + 594 zone_page_state_snapshot(zone, NR_ZONE_ACTIVE_FILE); 595 if (can_reclaim_anon_pages(NULL, zone_to_nid(zone), NULL)) 596 nr += zone_page_state_snapshot(zone, NR_ZONE_INACTIVE_ANON) + 597 zone_page_state_snapshot(zone, NR_ZONE_ACTIVE_ANON); 598 599 return nr; 600 } 601 602 /** 603 * lruvec_lru_size - Returns the number of pages on the given LRU list. 604 * @lruvec: lru vector 605 * @lru: lru to use 606 * @zone_idx: zones to consider (use MAX_NR_ZONES - 1 for the whole LRU list) 607 */ 608 static unsigned long lruvec_lru_size(struct lruvec *lruvec, enum lru_list lru, 609 int zone_idx) 610 { 611 unsigned long size = 0; 612 int zid; 613 614 for (zid = 0; zid <= zone_idx; zid++) { 615 struct zone *zone = &lruvec_pgdat(lruvec)->node_zones[zid]; 616 617 if (!managed_zone(zone)) 618 continue; 619 620 if (!mem_cgroup_disabled()) 621 size += mem_cgroup_get_zone_lru_size(lruvec, lru, zid); 622 else 623 size += zone_page_state(zone, NR_ZONE_LRU_BASE + lru); 624 } 625 return size; 626 } 627 628 /* 629 * Add a shrinker callback to be called from the vm. 630 */ 631 static int __prealloc_shrinker(struct shrinker *shrinker) 632 { 633 unsigned int size; 634 int err; 635 636 if (shrinker->flags & SHRINKER_MEMCG_AWARE) { 637 err = prealloc_memcg_shrinker(shrinker); 638 if (err != -ENOSYS) 639 return err; 640 641 shrinker->flags &= ~SHRINKER_MEMCG_AWARE; 642 } 643 644 size = sizeof(*shrinker->nr_deferred); 645 if (shrinker->flags & SHRINKER_NUMA_AWARE) 646 size *= nr_node_ids; 647 648 shrinker->nr_deferred = kzalloc(size, GFP_KERNEL); 649 if (!shrinker->nr_deferred) 650 return -ENOMEM; 651 652 return 0; 653 } 654 655 #ifdef CONFIG_SHRINKER_DEBUG 656 int prealloc_shrinker(struct shrinker *shrinker, const char *fmt, ...) 657 { 658 va_list ap; 659 int err; 660 661 va_start(ap, fmt); 662 shrinker->name = kvasprintf_const(GFP_KERNEL, fmt, ap); 663 va_end(ap); 664 if (!shrinker->name) 665 return -ENOMEM; 666 667 err = __prealloc_shrinker(shrinker); 668 if (err) { 669 kfree_const(shrinker->name); 670 shrinker->name = NULL; 671 } 672 673 return err; 674 } 675 #else 676 int prealloc_shrinker(struct shrinker *shrinker, const char *fmt, ...) 677 { 678 return __prealloc_shrinker(shrinker); 679 } 680 #endif 681 682 void free_prealloced_shrinker(struct shrinker *shrinker) 683 { 684 #ifdef CONFIG_SHRINKER_DEBUG 685 kfree_const(shrinker->name); 686 shrinker->name = NULL; 687 #endif 688 if (shrinker->flags & SHRINKER_MEMCG_AWARE) { 689 down_write(&shrinker_rwsem); 690 unregister_memcg_shrinker(shrinker); 691 up_write(&shrinker_rwsem); 692 return; 693 } 694 695 kfree(shrinker->nr_deferred); 696 shrinker->nr_deferred = NULL; 697 } 698 699 void register_shrinker_prepared(struct shrinker *shrinker) 700 { 701 down_write(&shrinker_rwsem); 702 list_add_tail(&shrinker->list, &shrinker_list); 703 shrinker->flags |= SHRINKER_REGISTERED; 704 shrinker_debugfs_add(shrinker); 705 up_write(&shrinker_rwsem); 706 } 707 708 static int __register_shrinker(struct shrinker *shrinker) 709 { 710 int err = __prealloc_shrinker(shrinker); 711 712 if (err) 713 return err; 714 register_shrinker_prepared(shrinker); 715 return 0; 716 } 717 718 #ifdef CONFIG_SHRINKER_DEBUG 719 int register_shrinker(struct shrinker *shrinker, const char *fmt, ...) 720 { 721 va_list ap; 722 int err; 723 724 va_start(ap, fmt); 725 shrinker->name = kvasprintf_const(GFP_KERNEL, fmt, ap); 726 va_end(ap); 727 if (!shrinker->name) 728 return -ENOMEM; 729 730 err = __register_shrinker(shrinker); 731 if (err) { 732 kfree_const(shrinker->name); 733 shrinker->name = NULL; 734 } 735 return err; 736 } 737 #else 738 int register_shrinker(struct shrinker *shrinker, const char *fmt, ...) 739 { 740 return __register_shrinker(shrinker); 741 } 742 #endif 743 EXPORT_SYMBOL(register_shrinker); 744 745 /* 746 * Remove one 747 */ 748 void unregister_shrinker(struct shrinker *shrinker) 749 { 750 if (!(shrinker->flags & SHRINKER_REGISTERED)) 751 return; 752 753 down_write(&shrinker_rwsem); 754 list_del(&shrinker->list); 755 shrinker->flags &= ~SHRINKER_REGISTERED; 756 if (shrinker->flags & SHRINKER_MEMCG_AWARE) 757 unregister_memcg_shrinker(shrinker); 758 shrinker_debugfs_remove(shrinker); 759 up_write(&shrinker_rwsem); 760 761 kfree(shrinker->nr_deferred); 762 shrinker->nr_deferred = NULL; 763 } 764 EXPORT_SYMBOL(unregister_shrinker); 765 766 /** 767 * synchronize_shrinkers - Wait for all running shrinkers to complete. 768 * 769 * This is equivalent to calling unregister_shrink() and register_shrinker(), 770 * but atomically and with less overhead. This is useful to guarantee that all 771 * shrinker invocations have seen an update, before freeing memory, similar to 772 * rcu. 773 */ 774 void synchronize_shrinkers(void) 775 { 776 down_write(&shrinker_rwsem); 777 up_write(&shrinker_rwsem); 778 } 779 EXPORT_SYMBOL(synchronize_shrinkers); 780 781 #define SHRINK_BATCH 128 782 783 static unsigned long do_shrink_slab(struct shrink_control *shrinkctl, 784 struct shrinker *shrinker, int priority) 785 { 786 unsigned long freed = 0; 787 unsigned long long delta; 788 long total_scan; 789 long freeable; 790 long nr; 791 long new_nr; 792 long batch_size = shrinker->batch ? shrinker->batch 793 : SHRINK_BATCH; 794 long scanned = 0, next_deferred; 795 796 freeable = shrinker->count_objects(shrinker, shrinkctl); 797 if (freeable == 0 || freeable == SHRINK_EMPTY) 798 return freeable; 799 800 /* 801 * copy the current shrinker scan count into a local variable 802 * and zero it so that other concurrent shrinker invocations 803 * don't also do this scanning work. 804 */ 805 nr = xchg_nr_deferred(shrinker, shrinkctl); 806 807 if (shrinker->seeks) { 808 delta = freeable >> priority; 809 delta *= 4; 810 do_div(delta, shrinker->seeks); 811 } else { 812 /* 813 * These objects don't require any IO to create. Trim 814 * them aggressively under memory pressure to keep 815 * them from causing refetches in the IO caches. 816 */ 817 delta = freeable / 2; 818 } 819 820 total_scan = nr >> priority; 821 total_scan += delta; 822 total_scan = min(total_scan, (2 * freeable)); 823 824 trace_mm_shrink_slab_start(shrinker, shrinkctl, nr, 825 freeable, delta, total_scan, priority); 826 827 /* 828 * Normally, we should not scan less than batch_size objects in one 829 * pass to avoid too frequent shrinker calls, but if the slab has less 830 * than batch_size objects in total and we are really tight on memory, 831 * we will try to reclaim all available objects, otherwise we can end 832 * up failing allocations although there are plenty of reclaimable 833 * objects spread over several slabs with usage less than the 834 * batch_size. 835 * 836 * We detect the "tight on memory" situations by looking at the total 837 * number of objects we want to scan (total_scan). If it is greater 838 * than the total number of objects on slab (freeable), we must be 839 * scanning at high prio and therefore should try to reclaim as much as 840 * possible. 841 */ 842 while (total_scan >= batch_size || 843 total_scan >= freeable) { 844 unsigned long ret; 845 unsigned long nr_to_scan = min(batch_size, total_scan); 846 847 shrinkctl->nr_to_scan = nr_to_scan; 848 shrinkctl->nr_scanned = nr_to_scan; 849 ret = shrinker->scan_objects(shrinker, shrinkctl); 850 if (ret == SHRINK_STOP) 851 break; 852 freed += ret; 853 854 count_vm_events(SLABS_SCANNED, shrinkctl->nr_scanned); 855 total_scan -= shrinkctl->nr_scanned; 856 scanned += shrinkctl->nr_scanned; 857 858 cond_resched(); 859 } 860 861 /* 862 * The deferred work is increased by any new work (delta) that wasn't 863 * done, decreased by old deferred work that was done now. 864 * 865 * And it is capped to two times of the freeable items. 866 */ 867 next_deferred = max_t(long, (nr + delta - scanned), 0); 868 next_deferred = min(next_deferred, (2 * freeable)); 869 870 /* 871 * move the unused scan count back into the shrinker in a 872 * manner that handles concurrent updates. 873 */ 874 new_nr = add_nr_deferred(next_deferred, shrinker, shrinkctl); 875 876 trace_mm_shrink_slab_end(shrinker, shrinkctl->nid, freed, nr, new_nr, total_scan); 877 return freed; 878 } 879 880 #ifdef CONFIG_MEMCG 881 static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid, 882 struct mem_cgroup *memcg, int priority) 883 { 884 struct shrinker_info *info; 885 unsigned long ret, freed = 0; 886 int i; 887 888 if (!mem_cgroup_online(memcg)) 889 return 0; 890 891 if (!down_read_trylock(&shrinker_rwsem)) 892 return 0; 893 894 info = shrinker_info_protected(memcg, nid); 895 if (unlikely(!info)) 896 goto unlock; 897 898 for_each_set_bit(i, info->map, shrinker_nr_max) { 899 struct shrink_control sc = { 900 .gfp_mask = gfp_mask, 901 .nid = nid, 902 .memcg = memcg, 903 }; 904 struct shrinker *shrinker; 905 906 shrinker = idr_find(&shrinker_idr, i); 907 if (unlikely(!shrinker || !(shrinker->flags & SHRINKER_REGISTERED))) { 908 if (!shrinker) 909 clear_bit(i, info->map); 910 continue; 911 } 912 913 /* Call non-slab shrinkers even though kmem is disabled */ 914 if (!memcg_kmem_enabled() && 915 !(shrinker->flags & SHRINKER_NONSLAB)) 916 continue; 917 918 ret = do_shrink_slab(&sc, shrinker, priority); 919 if (ret == SHRINK_EMPTY) { 920 clear_bit(i, info->map); 921 /* 922 * After the shrinker reported that it had no objects to 923 * free, but before we cleared the corresponding bit in 924 * the memcg shrinker map, a new object might have been 925 * added. To make sure, we have the bit set in this 926 * case, we invoke the shrinker one more time and reset 927 * the bit if it reports that it is not empty anymore. 928 * The memory barrier here pairs with the barrier in 929 * set_shrinker_bit(): 930 * 931 * list_lru_add() shrink_slab_memcg() 932 * list_add_tail() clear_bit() 933 * <MB> <MB> 934 * set_bit() do_shrink_slab() 935 */ 936 smp_mb__after_atomic(); 937 ret = do_shrink_slab(&sc, shrinker, priority); 938 if (ret == SHRINK_EMPTY) 939 ret = 0; 940 else 941 set_shrinker_bit(memcg, nid, i); 942 } 943 freed += ret; 944 945 if (rwsem_is_contended(&shrinker_rwsem)) { 946 freed = freed ? : 1; 947 break; 948 } 949 } 950 unlock: 951 up_read(&shrinker_rwsem); 952 return freed; 953 } 954 #else /* CONFIG_MEMCG */ 955 static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid, 956 struct mem_cgroup *memcg, int priority) 957 { 958 return 0; 959 } 960 #endif /* CONFIG_MEMCG */ 961 962 /** 963 * shrink_slab - shrink slab caches 964 * @gfp_mask: allocation context 965 * @nid: node whose slab caches to target 966 * @memcg: memory cgroup whose slab caches to target 967 * @priority: the reclaim priority 968 * 969 * Call the shrink functions to age shrinkable caches. 970 * 971 * @nid is passed along to shrinkers with SHRINKER_NUMA_AWARE set, 972 * unaware shrinkers will receive a node id of 0 instead. 973 * 974 * @memcg specifies the memory cgroup to target. Unaware shrinkers 975 * are called only if it is the root cgroup. 976 * 977 * @priority is sc->priority, we take the number of objects and >> by priority 978 * in order to get the scan target. 979 * 980 * Returns the number of reclaimed slab objects. 981 */ 982 static unsigned long shrink_slab(gfp_t gfp_mask, int nid, 983 struct mem_cgroup *memcg, 984 int priority) 985 { 986 unsigned long ret, freed = 0; 987 struct shrinker *shrinker; 988 989 /* 990 * The root memcg might be allocated even though memcg is disabled 991 * via "cgroup_disable=memory" boot parameter. This could make 992 * mem_cgroup_is_root() return false, then just run memcg slab 993 * shrink, but skip global shrink. This may result in premature 994 * oom. 995 */ 996 if (!mem_cgroup_disabled() && !mem_cgroup_is_root(memcg)) 997 return shrink_slab_memcg(gfp_mask, nid, memcg, priority); 998 999 if (!down_read_trylock(&shrinker_rwsem)) 1000 goto out; 1001 1002 list_for_each_entry(shrinker, &shrinker_list, list) { 1003 struct shrink_control sc = { 1004 .gfp_mask = gfp_mask, 1005 .nid = nid, 1006 .memcg = memcg, 1007 }; 1008 1009 ret = do_shrink_slab(&sc, shrinker, priority); 1010 if (ret == SHRINK_EMPTY) 1011 ret = 0; 1012 freed += ret; 1013 /* 1014 * Bail out if someone want to register a new shrinker to 1015 * prevent the registration from being stalled for long periods 1016 * by parallel ongoing shrinking. 1017 */ 1018 if (rwsem_is_contended(&shrinker_rwsem)) { 1019 freed = freed ? : 1; 1020 break; 1021 } 1022 } 1023 1024 up_read(&shrinker_rwsem); 1025 out: 1026 cond_resched(); 1027 return freed; 1028 } 1029 1030 static unsigned long drop_slab_node(int nid) 1031 { 1032 unsigned long freed = 0; 1033 struct mem_cgroup *memcg = NULL; 1034 1035 memcg = mem_cgroup_iter(NULL, NULL, NULL); 1036 do { 1037 freed += shrink_slab(GFP_KERNEL, nid, memcg, 0); 1038 } while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)) != NULL); 1039 1040 return freed; 1041 } 1042 1043 void drop_slab(void) 1044 { 1045 int nid; 1046 int shift = 0; 1047 unsigned long freed; 1048 1049 do { 1050 freed = 0; 1051 for_each_online_node(nid) { 1052 if (fatal_signal_pending(current)) 1053 return; 1054 1055 freed += drop_slab_node(nid); 1056 } 1057 } while ((freed >> shift++) > 1); 1058 } 1059 1060 static int reclaimer_offset(void) 1061 { 1062 BUILD_BUG_ON(PGSTEAL_DIRECT - PGSTEAL_KSWAPD != 1063 PGDEMOTE_DIRECT - PGDEMOTE_KSWAPD); 1064 BUILD_BUG_ON(PGSTEAL_DIRECT - PGSTEAL_KSWAPD != 1065 PGSCAN_DIRECT - PGSCAN_KSWAPD); 1066 BUILD_BUG_ON(PGSTEAL_KHUGEPAGED - PGSTEAL_KSWAPD != 1067 PGDEMOTE_KHUGEPAGED - PGDEMOTE_KSWAPD); 1068 BUILD_BUG_ON(PGSTEAL_KHUGEPAGED - PGSTEAL_KSWAPD != 1069 PGSCAN_KHUGEPAGED - PGSCAN_KSWAPD); 1070 1071 if (current_is_kswapd()) 1072 return 0; 1073 if (current_is_khugepaged()) 1074 return PGSTEAL_KHUGEPAGED - PGSTEAL_KSWAPD; 1075 return PGSTEAL_DIRECT - PGSTEAL_KSWAPD; 1076 } 1077 1078 static inline int is_page_cache_freeable(struct folio *folio) 1079 { 1080 /* 1081 * A freeable page cache folio is referenced only by the caller 1082 * that isolated the folio, the page cache and optional filesystem 1083 * private data at folio->private. 1084 */ 1085 return folio_ref_count(folio) - folio_test_private(folio) == 1086 1 + folio_nr_pages(folio); 1087 } 1088 1089 /* 1090 * We detected a synchronous write error writing a folio out. Probably 1091 * -ENOSPC. We need to propagate that into the address_space for a subsequent 1092 * fsync(), msync() or close(). 1093 * 1094 * The tricky part is that after writepage we cannot touch the mapping: nothing 1095 * prevents it from being freed up. But we have a ref on the folio and once 1096 * that folio is locked, the mapping is pinned. 1097 * 1098 * We're allowed to run sleeping folio_lock() here because we know the caller has 1099 * __GFP_FS. 1100 */ 1101 static void handle_write_error(struct address_space *mapping, 1102 struct folio *folio, int error) 1103 { 1104 folio_lock(folio); 1105 if (folio_mapping(folio) == mapping) 1106 mapping_set_error(mapping, error); 1107 folio_unlock(folio); 1108 } 1109 1110 static bool skip_throttle_noprogress(pg_data_t *pgdat) 1111 { 1112 int reclaimable = 0, write_pending = 0; 1113 int i; 1114 1115 /* 1116 * If kswapd is disabled, reschedule if necessary but do not 1117 * throttle as the system is likely near OOM. 1118 */ 1119 if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES) 1120 return true; 1121 1122 /* 1123 * If there are a lot of dirty/writeback folios then do not 1124 * throttle as throttling will occur when the folios cycle 1125 * towards the end of the LRU if still under writeback. 1126 */ 1127 for (i = 0; i < MAX_NR_ZONES; i++) { 1128 struct zone *zone = pgdat->node_zones + i; 1129 1130 if (!managed_zone(zone)) 1131 continue; 1132 1133 reclaimable += zone_reclaimable_pages(zone); 1134 write_pending += zone_page_state_snapshot(zone, 1135 NR_ZONE_WRITE_PENDING); 1136 } 1137 if (2 * write_pending <= reclaimable) 1138 return true; 1139 1140 return false; 1141 } 1142 1143 void reclaim_throttle(pg_data_t *pgdat, enum vmscan_throttle_state reason) 1144 { 1145 wait_queue_head_t *wqh = &pgdat->reclaim_wait[reason]; 1146 long timeout, ret; 1147 DEFINE_WAIT(wait); 1148 1149 /* 1150 * Do not throttle IO workers, kthreads other than kswapd or 1151 * workqueues. They may be required for reclaim to make 1152 * forward progress (e.g. journalling workqueues or kthreads). 1153 */ 1154 if (!current_is_kswapd() && 1155 current->flags & (PF_IO_WORKER|PF_KTHREAD)) { 1156 cond_resched(); 1157 return; 1158 } 1159 1160 /* 1161 * These figures are pulled out of thin air. 1162 * VMSCAN_THROTTLE_ISOLATED is a transient condition based on too many 1163 * parallel reclaimers which is a short-lived event so the timeout is 1164 * short. Failing to make progress or waiting on writeback are 1165 * potentially long-lived events so use a longer timeout. This is shaky 1166 * logic as a failure to make progress could be due to anything from 1167 * writeback to a slow device to excessive referenced folios at the tail 1168 * of the inactive LRU. 1169 */ 1170 switch(reason) { 1171 case VMSCAN_THROTTLE_WRITEBACK: 1172 timeout = HZ/10; 1173 1174 if (atomic_inc_return(&pgdat->nr_writeback_throttled) == 1) { 1175 WRITE_ONCE(pgdat->nr_reclaim_start, 1176 node_page_state(pgdat, NR_THROTTLED_WRITTEN)); 1177 } 1178 1179 break; 1180 case VMSCAN_THROTTLE_CONGESTED: 1181 fallthrough; 1182 case VMSCAN_THROTTLE_NOPROGRESS: 1183 if (skip_throttle_noprogress(pgdat)) { 1184 cond_resched(); 1185 return; 1186 } 1187 1188 timeout = 1; 1189 1190 break; 1191 case VMSCAN_THROTTLE_ISOLATED: 1192 timeout = HZ/50; 1193 break; 1194 default: 1195 WARN_ON_ONCE(1); 1196 timeout = HZ; 1197 break; 1198 } 1199 1200 prepare_to_wait(wqh, &wait, TASK_UNINTERRUPTIBLE); 1201 ret = schedule_timeout(timeout); 1202 finish_wait(wqh, &wait); 1203 1204 if (reason == VMSCAN_THROTTLE_WRITEBACK) 1205 atomic_dec(&pgdat->nr_writeback_throttled); 1206 1207 trace_mm_vmscan_throttled(pgdat->node_id, jiffies_to_usecs(timeout), 1208 jiffies_to_usecs(timeout - ret), 1209 reason); 1210 } 1211 1212 /* 1213 * Account for folios written if tasks are throttled waiting on dirty 1214 * folios to clean. If enough folios have been cleaned since throttling 1215 * started then wakeup the throttled tasks. 1216 */ 1217 void __acct_reclaim_writeback(pg_data_t *pgdat, struct folio *folio, 1218 int nr_throttled) 1219 { 1220 unsigned long nr_written; 1221 1222 node_stat_add_folio(folio, NR_THROTTLED_WRITTEN); 1223 1224 /* 1225 * This is an inaccurate read as the per-cpu deltas may not 1226 * be synchronised. However, given that the system is 1227 * writeback throttled, it is not worth taking the penalty 1228 * of getting an accurate count. At worst, the throttle 1229 * timeout guarantees forward progress. 1230 */ 1231 nr_written = node_page_state(pgdat, NR_THROTTLED_WRITTEN) - 1232 READ_ONCE(pgdat->nr_reclaim_start); 1233 1234 if (nr_written > SWAP_CLUSTER_MAX * nr_throttled) 1235 wake_up(&pgdat->reclaim_wait[VMSCAN_THROTTLE_WRITEBACK]); 1236 } 1237 1238 /* possible outcome of pageout() */ 1239 typedef enum { 1240 /* failed to write folio out, folio is locked */ 1241 PAGE_KEEP, 1242 /* move folio to the active list, folio is locked */ 1243 PAGE_ACTIVATE, 1244 /* folio has been sent to the disk successfully, folio is unlocked */ 1245 PAGE_SUCCESS, 1246 /* folio is clean and locked */ 1247 PAGE_CLEAN, 1248 } pageout_t; 1249 1250 /* 1251 * pageout is called by shrink_folio_list() for each dirty folio. 1252 * Calls ->writepage(). 1253 */ 1254 static pageout_t pageout(struct folio *folio, struct address_space *mapping, 1255 struct swap_iocb **plug) 1256 { 1257 /* 1258 * If the folio is dirty, only perform writeback if that write 1259 * will be non-blocking. To prevent this allocation from being 1260 * stalled by pagecache activity. But note that there may be 1261 * stalls if we need to run get_block(). We could test 1262 * PagePrivate for that. 1263 * 1264 * If this process is currently in __generic_file_write_iter() against 1265 * this folio's queue, we can perform writeback even if that 1266 * will block. 1267 * 1268 * If the folio is swapcache, write it back even if that would 1269 * block, for some throttling. This happens by accident, because 1270 * swap_backing_dev_info is bust: it doesn't reflect the 1271 * congestion state of the swapdevs. Easy to fix, if needed. 1272 */ 1273 if (!is_page_cache_freeable(folio)) 1274 return PAGE_KEEP; 1275 if (!mapping) { 1276 /* 1277 * Some data journaling orphaned folios can have 1278 * folio->mapping == NULL while being dirty with clean buffers. 1279 */ 1280 if (folio_test_private(folio)) { 1281 if (try_to_free_buffers(folio)) { 1282 folio_clear_dirty(folio); 1283 pr_info("%s: orphaned folio\n", __func__); 1284 return PAGE_CLEAN; 1285 } 1286 } 1287 return PAGE_KEEP; 1288 } 1289 if (mapping->a_ops->writepage == NULL) 1290 return PAGE_ACTIVATE; 1291 1292 if (folio_clear_dirty_for_io(folio)) { 1293 int res; 1294 struct writeback_control wbc = { 1295 .sync_mode = WB_SYNC_NONE, 1296 .nr_to_write = SWAP_CLUSTER_MAX, 1297 .range_start = 0, 1298 .range_end = LLONG_MAX, 1299 .for_reclaim = 1, 1300 .swap_plug = plug, 1301 }; 1302 1303 folio_set_reclaim(folio); 1304 res = mapping->a_ops->writepage(&folio->page, &wbc); 1305 if (res < 0) 1306 handle_write_error(mapping, folio, res); 1307 if (res == AOP_WRITEPAGE_ACTIVATE) { 1308 folio_clear_reclaim(folio); 1309 return PAGE_ACTIVATE; 1310 } 1311 1312 if (!folio_test_writeback(folio)) { 1313 /* synchronous write or broken a_ops? */ 1314 folio_clear_reclaim(folio); 1315 } 1316 trace_mm_vmscan_write_folio(folio); 1317 node_stat_add_folio(folio, NR_VMSCAN_WRITE); 1318 return PAGE_SUCCESS; 1319 } 1320 1321 return PAGE_CLEAN; 1322 } 1323 1324 /* 1325 * Same as remove_mapping, but if the folio is removed from the mapping, it 1326 * gets returned with a refcount of 0. 1327 */ 1328 static int __remove_mapping(struct address_space *mapping, struct folio *folio, 1329 bool reclaimed, struct mem_cgroup *target_memcg) 1330 { 1331 int refcount; 1332 void *shadow = NULL; 1333 1334 BUG_ON(!folio_test_locked(folio)); 1335 BUG_ON(mapping != folio_mapping(folio)); 1336 1337 if (!folio_test_swapcache(folio)) 1338 spin_lock(&mapping->host->i_lock); 1339 xa_lock_irq(&mapping->i_pages); 1340 /* 1341 * The non racy check for a busy folio. 1342 * 1343 * Must be careful with the order of the tests. When someone has 1344 * a ref to the folio, it may be possible that they dirty it then 1345 * drop the reference. So if the dirty flag is tested before the 1346 * refcount here, then the following race may occur: 1347 * 1348 * get_user_pages(&page); 1349 * [user mapping goes away] 1350 * write_to(page); 1351 * !folio_test_dirty(folio) [good] 1352 * folio_set_dirty(folio); 1353 * folio_put(folio); 1354 * !refcount(folio) [good, discard it] 1355 * 1356 * [oops, our write_to data is lost] 1357 * 1358 * Reversing the order of the tests ensures such a situation cannot 1359 * escape unnoticed. The smp_rmb is needed to ensure the folio->flags 1360 * load is not satisfied before that of folio->_refcount. 1361 * 1362 * Note that if the dirty flag is always set via folio_mark_dirty, 1363 * and thus under the i_pages lock, then this ordering is not required. 1364 */ 1365 refcount = 1 + folio_nr_pages(folio); 1366 if (!folio_ref_freeze(folio, refcount)) 1367 goto cannot_free; 1368 /* note: atomic_cmpxchg in folio_ref_freeze provides the smp_rmb */ 1369 if (unlikely(folio_test_dirty(folio))) { 1370 folio_ref_unfreeze(folio, refcount); 1371 goto cannot_free; 1372 } 1373 1374 if (folio_test_swapcache(folio)) { 1375 swp_entry_t swap = folio_swap_entry(folio); 1376 1377 if (reclaimed && !mapping_exiting(mapping)) 1378 shadow = workingset_eviction(folio, target_memcg); 1379 __delete_from_swap_cache(folio, swap, shadow); 1380 mem_cgroup_swapout(folio, swap); 1381 xa_unlock_irq(&mapping->i_pages); 1382 put_swap_folio(folio, swap); 1383 } else { 1384 void (*free_folio)(struct folio *); 1385 1386 free_folio = mapping->a_ops->free_folio; 1387 /* 1388 * Remember a shadow entry for reclaimed file cache in 1389 * order to detect refaults, thus thrashing, later on. 1390 * 1391 * But don't store shadows in an address space that is 1392 * already exiting. This is not just an optimization, 1393 * inode reclaim needs to empty out the radix tree or 1394 * the nodes are lost. Don't plant shadows behind its 1395 * back. 1396 * 1397 * We also don't store shadows for DAX mappings because the 1398 * only page cache folios found in these are zero pages 1399 * covering holes, and because we don't want to mix DAX 1400 * exceptional entries and shadow exceptional entries in the 1401 * same address_space. 1402 */ 1403 if (reclaimed && folio_is_file_lru(folio) && 1404 !mapping_exiting(mapping) && !dax_mapping(mapping)) 1405 shadow = workingset_eviction(folio, target_memcg); 1406 __filemap_remove_folio(folio, shadow); 1407 xa_unlock_irq(&mapping->i_pages); 1408 if (mapping_shrinkable(mapping)) 1409 inode_add_lru(mapping->host); 1410 spin_unlock(&mapping->host->i_lock); 1411 1412 if (free_folio) 1413 free_folio(folio); 1414 } 1415 1416 return 1; 1417 1418 cannot_free: 1419 xa_unlock_irq(&mapping->i_pages); 1420 if (!folio_test_swapcache(folio)) 1421 spin_unlock(&mapping->host->i_lock); 1422 return 0; 1423 } 1424 1425 /** 1426 * remove_mapping() - Attempt to remove a folio from its mapping. 1427 * @mapping: The address space. 1428 * @folio: The folio to remove. 1429 * 1430 * If the folio is dirty, under writeback or if someone else has a ref 1431 * on it, removal will fail. 1432 * Return: The number of pages removed from the mapping. 0 if the folio 1433 * could not be removed. 1434 * Context: The caller should have a single refcount on the folio and 1435 * hold its lock. 1436 */ 1437 long remove_mapping(struct address_space *mapping, struct folio *folio) 1438 { 1439 if (__remove_mapping(mapping, folio, false, NULL)) { 1440 /* 1441 * Unfreezing the refcount with 1 effectively 1442 * drops the pagecache ref for us without requiring another 1443 * atomic operation. 1444 */ 1445 folio_ref_unfreeze(folio, 1); 1446 return folio_nr_pages(folio); 1447 } 1448 return 0; 1449 } 1450 1451 /** 1452 * folio_putback_lru - Put previously isolated folio onto appropriate LRU list. 1453 * @folio: Folio to be returned to an LRU list. 1454 * 1455 * Add previously isolated @folio to appropriate LRU list. 1456 * The folio may still be unevictable for other reasons. 1457 * 1458 * Context: lru_lock must not be held, interrupts must be enabled. 1459 */ 1460 void folio_putback_lru(struct folio *folio) 1461 { 1462 folio_add_lru(folio); 1463 folio_put(folio); /* drop ref from isolate */ 1464 } 1465 1466 enum folio_references { 1467 FOLIOREF_RECLAIM, 1468 FOLIOREF_RECLAIM_CLEAN, 1469 FOLIOREF_KEEP, 1470 FOLIOREF_ACTIVATE, 1471 }; 1472 1473 static enum folio_references folio_check_references(struct folio *folio, 1474 struct scan_control *sc) 1475 { 1476 int referenced_ptes, referenced_folio; 1477 unsigned long vm_flags; 1478 1479 referenced_ptes = folio_referenced(folio, 1, sc->target_mem_cgroup, 1480 &vm_flags); 1481 referenced_folio = folio_test_clear_referenced(folio); 1482 1483 /* 1484 * The supposedly reclaimable folio was found to be in a VM_LOCKED vma. 1485 * Let the folio, now marked Mlocked, be moved to the unevictable list. 1486 */ 1487 if (vm_flags & VM_LOCKED) 1488 return FOLIOREF_ACTIVATE; 1489 1490 /* rmap lock contention: rotate */ 1491 if (referenced_ptes == -1) 1492 return FOLIOREF_KEEP; 1493 1494 if (referenced_ptes) { 1495 /* 1496 * All mapped folios start out with page table 1497 * references from the instantiating fault, so we need 1498 * to look twice if a mapped file/anon folio is used more 1499 * than once. 1500 * 1501 * Mark it and spare it for another trip around the 1502 * inactive list. Another page table reference will 1503 * lead to its activation. 1504 * 1505 * Note: the mark is set for activated folios as well 1506 * so that recently deactivated but used folios are 1507 * quickly recovered. 1508 */ 1509 folio_set_referenced(folio); 1510 1511 if (referenced_folio || referenced_ptes > 1) 1512 return FOLIOREF_ACTIVATE; 1513 1514 /* 1515 * Activate file-backed executable folios after first usage. 1516 */ 1517 if ((vm_flags & VM_EXEC) && folio_is_file_lru(folio)) 1518 return FOLIOREF_ACTIVATE; 1519 1520 return FOLIOREF_KEEP; 1521 } 1522 1523 /* Reclaim if clean, defer dirty folios to writeback */ 1524 if (referenced_folio && folio_is_file_lru(folio)) 1525 return FOLIOREF_RECLAIM_CLEAN; 1526 1527 return FOLIOREF_RECLAIM; 1528 } 1529 1530 /* Check if a folio is dirty or under writeback */ 1531 static void folio_check_dirty_writeback(struct folio *folio, 1532 bool *dirty, bool *writeback) 1533 { 1534 struct address_space *mapping; 1535 1536 /* 1537 * Anonymous folios are not handled by flushers and must be written 1538 * from reclaim context. Do not stall reclaim based on them. 1539 * MADV_FREE anonymous folios are put into inactive file list too. 1540 * They could be mistakenly treated as file lru. So further anon 1541 * test is needed. 1542 */ 1543 if (!folio_is_file_lru(folio) || 1544 (folio_test_anon(folio) && !folio_test_swapbacked(folio))) { 1545 *dirty = false; 1546 *writeback = false; 1547 return; 1548 } 1549 1550 /* By default assume that the folio flags are accurate */ 1551 *dirty = folio_test_dirty(folio); 1552 *writeback = folio_test_writeback(folio); 1553 1554 /* Verify dirty/writeback state if the filesystem supports it */ 1555 if (!folio_test_private(folio)) 1556 return; 1557 1558 mapping = folio_mapping(folio); 1559 if (mapping && mapping->a_ops->is_dirty_writeback) 1560 mapping->a_ops->is_dirty_writeback(folio, dirty, writeback); 1561 } 1562 1563 static struct page *alloc_demote_page(struct page *page, unsigned long private) 1564 { 1565 struct page *target_page; 1566 nodemask_t *allowed_mask; 1567 struct migration_target_control *mtc; 1568 1569 mtc = (struct migration_target_control *)private; 1570 1571 allowed_mask = mtc->nmask; 1572 /* 1573 * make sure we allocate from the target node first also trying to 1574 * demote or reclaim pages from the target node via kswapd if we are 1575 * low on free memory on target node. If we don't do this and if 1576 * we have free memory on the slower(lower) memtier, we would start 1577 * allocating pages from slower(lower) memory tiers without even forcing 1578 * a demotion of cold pages from the target memtier. This can result 1579 * in the kernel placing hot pages in slower(lower) memory tiers. 1580 */ 1581 mtc->nmask = NULL; 1582 mtc->gfp_mask |= __GFP_THISNODE; 1583 target_page = alloc_migration_target(page, (unsigned long)mtc); 1584 if (target_page) 1585 return target_page; 1586 1587 mtc->gfp_mask &= ~__GFP_THISNODE; 1588 mtc->nmask = allowed_mask; 1589 1590 return alloc_migration_target(page, (unsigned long)mtc); 1591 } 1592 1593 /* 1594 * Take folios on @demote_folios and attempt to demote them to another node. 1595 * Folios which are not demoted are left on @demote_folios. 1596 */ 1597 static unsigned int demote_folio_list(struct list_head *demote_folios, 1598 struct pglist_data *pgdat) 1599 { 1600 int target_nid = next_demotion_node(pgdat->node_id); 1601 unsigned int nr_succeeded; 1602 nodemask_t allowed_mask; 1603 1604 struct migration_target_control mtc = { 1605 /* 1606 * Allocate from 'node', or fail quickly and quietly. 1607 * When this happens, 'page' will likely just be discarded 1608 * instead of migrated. 1609 */ 1610 .gfp_mask = (GFP_HIGHUSER_MOVABLE & ~__GFP_RECLAIM) | __GFP_NOWARN | 1611 __GFP_NOMEMALLOC | GFP_NOWAIT, 1612 .nid = target_nid, 1613 .nmask = &allowed_mask 1614 }; 1615 1616 if (list_empty(demote_folios)) 1617 return 0; 1618 1619 if (target_nid == NUMA_NO_NODE) 1620 return 0; 1621 1622 node_get_allowed_targets(pgdat, &allowed_mask); 1623 1624 /* Demotion ignores all cpuset and mempolicy settings */ 1625 migrate_pages(demote_folios, alloc_demote_page, NULL, 1626 (unsigned long)&mtc, MIGRATE_ASYNC, MR_DEMOTION, 1627 &nr_succeeded); 1628 1629 __count_vm_events(PGDEMOTE_KSWAPD + reclaimer_offset(), nr_succeeded); 1630 1631 return nr_succeeded; 1632 } 1633 1634 static bool may_enter_fs(struct folio *folio, gfp_t gfp_mask) 1635 { 1636 if (gfp_mask & __GFP_FS) 1637 return true; 1638 if (!folio_test_swapcache(folio) || !(gfp_mask & __GFP_IO)) 1639 return false; 1640 /* 1641 * We can "enter_fs" for swap-cache with only __GFP_IO 1642 * providing this isn't SWP_FS_OPS. 1643 * ->flags can be updated non-atomicially (scan_swap_map_slots), 1644 * but that will never affect SWP_FS_OPS, so the data_race 1645 * is safe. 1646 */ 1647 return !data_race(folio_swap_flags(folio) & SWP_FS_OPS); 1648 } 1649 1650 /* 1651 * shrink_folio_list() returns the number of reclaimed pages 1652 */ 1653 static unsigned int shrink_folio_list(struct list_head *folio_list, 1654 struct pglist_data *pgdat, struct scan_control *sc, 1655 struct reclaim_stat *stat, bool ignore_references) 1656 { 1657 LIST_HEAD(ret_folios); 1658 LIST_HEAD(free_folios); 1659 LIST_HEAD(demote_folios); 1660 unsigned int nr_reclaimed = 0; 1661 unsigned int pgactivate = 0; 1662 bool do_demote_pass; 1663 struct swap_iocb *plug = NULL; 1664 1665 memset(stat, 0, sizeof(*stat)); 1666 cond_resched(); 1667 do_demote_pass = can_demote(pgdat->node_id, sc); 1668 1669 retry: 1670 while (!list_empty(folio_list)) { 1671 struct address_space *mapping; 1672 struct folio *folio; 1673 enum folio_references references = FOLIOREF_RECLAIM; 1674 bool dirty, writeback; 1675 unsigned int nr_pages; 1676 1677 cond_resched(); 1678 1679 folio = lru_to_folio(folio_list); 1680 list_del(&folio->lru); 1681 1682 if (!folio_trylock(folio)) 1683 goto keep; 1684 1685 VM_BUG_ON_FOLIO(folio_test_active(folio), folio); 1686 1687 nr_pages = folio_nr_pages(folio); 1688 1689 /* Account the number of base pages */ 1690 sc->nr_scanned += nr_pages; 1691 1692 if (unlikely(!folio_evictable(folio))) 1693 goto activate_locked; 1694 1695 if (!sc->may_unmap && folio_mapped(folio)) 1696 goto keep_locked; 1697 1698 /* folio_update_gen() tried to promote this page? */ 1699 if (lru_gen_enabled() && !ignore_references && 1700 folio_mapped(folio) && folio_test_referenced(folio)) 1701 goto keep_locked; 1702 1703 /* 1704 * The number of dirty pages determines if a node is marked 1705 * reclaim_congested. kswapd will stall and start writing 1706 * folios if the tail of the LRU is all dirty unqueued folios. 1707 */ 1708 folio_check_dirty_writeback(folio, &dirty, &writeback); 1709 if (dirty || writeback) 1710 stat->nr_dirty += nr_pages; 1711 1712 if (dirty && !writeback) 1713 stat->nr_unqueued_dirty += nr_pages; 1714 1715 /* 1716 * Treat this folio as congested if folios are cycling 1717 * through the LRU so quickly that the folios marked 1718 * for immediate reclaim are making it to the end of 1719 * the LRU a second time. 1720 */ 1721 if (writeback && folio_test_reclaim(folio)) 1722 stat->nr_congested += nr_pages; 1723 1724 /* 1725 * If a folio at the tail of the LRU is under writeback, there 1726 * are three cases to consider. 1727 * 1728 * 1) If reclaim is encountering an excessive number 1729 * of folios under writeback and this folio has both 1730 * the writeback and reclaim flags set, then it 1731 * indicates that folios are being queued for I/O but 1732 * are being recycled through the LRU before the I/O 1733 * can complete. Waiting on the folio itself risks an 1734 * indefinite stall if it is impossible to writeback 1735 * the folio due to I/O error or disconnected storage 1736 * so instead note that the LRU is being scanned too 1737 * quickly and the caller can stall after the folio 1738 * list has been processed. 1739 * 1740 * 2) Global or new memcg reclaim encounters a folio that is 1741 * not marked for immediate reclaim, or the caller does not 1742 * have __GFP_FS (or __GFP_IO if it's simply going to swap, 1743 * not to fs). In this case mark the folio for immediate 1744 * reclaim and continue scanning. 1745 * 1746 * Require may_enter_fs() because we would wait on fs, which 1747 * may not have submitted I/O yet. And the loop driver might 1748 * enter reclaim, and deadlock if it waits on a folio for 1749 * which it is needed to do the write (loop masks off 1750 * __GFP_IO|__GFP_FS for this reason); but more thought 1751 * would probably show more reasons. 1752 * 1753 * 3) Legacy memcg encounters a folio that already has the 1754 * reclaim flag set. memcg does not have any dirty folio 1755 * throttling so we could easily OOM just because too many 1756 * folios are in writeback and there is nothing else to 1757 * reclaim. Wait for the writeback to complete. 1758 * 1759 * In cases 1) and 2) we activate the folios to get them out of 1760 * the way while we continue scanning for clean folios on the 1761 * inactive list and refilling from the active list. The 1762 * observation here is that waiting for disk writes is more 1763 * expensive than potentially causing reloads down the line. 1764 * Since they're marked for immediate reclaim, they won't put 1765 * memory pressure on the cache working set any longer than it 1766 * takes to write them to disk. 1767 */ 1768 if (folio_test_writeback(folio)) { 1769 /* Case 1 above */ 1770 if (current_is_kswapd() && 1771 folio_test_reclaim(folio) && 1772 test_bit(PGDAT_WRITEBACK, &pgdat->flags)) { 1773 stat->nr_immediate += nr_pages; 1774 goto activate_locked; 1775 1776 /* Case 2 above */ 1777 } else if (writeback_throttling_sane(sc) || 1778 !folio_test_reclaim(folio) || 1779 !may_enter_fs(folio, sc->gfp_mask)) { 1780 /* 1781 * This is slightly racy - 1782 * folio_end_writeback() might have 1783 * just cleared the reclaim flag, then 1784 * setting the reclaim flag here ends up 1785 * interpreted as the readahead flag - but 1786 * that does not matter enough to care. 1787 * What we do want is for this folio to 1788 * have the reclaim flag set next time 1789 * memcg reclaim reaches the tests above, 1790 * so it will then wait for writeback to 1791 * avoid OOM; and it's also appropriate 1792 * in global reclaim. 1793 */ 1794 folio_set_reclaim(folio); 1795 stat->nr_writeback += nr_pages; 1796 goto activate_locked; 1797 1798 /* Case 3 above */ 1799 } else { 1800 folio_unlock(folio); 1801 folio_wait_writeback(folio); 1802 /* then go back and try same folio again */ 1803 list_add_tail(&folio->lru, folio_list); 1804 continue; 1805 } 1806 } 1807 1808 if (!ignore_references) 1809 references = folio_check_references(folio, sc); 1810 1811 switch (references) { 1812 case FOLIOREF_ACTIVATE: 1813 goto activate_locked; 1814 case FOLIOREF_KEEP: 1815 stat->nr_ref_keep += nr_pages; 1816 goto keep_locked; 1817 case FOLIOREF_RECLAIM: 1818 case FOLIOREF_RECLAIM_CLEAN: 1819 ; /* try to reclaim the folio below */ 1820 } 1821 1822 /* 1823 * Before reclaiming the folio, try to relocate 1824 * its contents to another node. 1825 */ 1826 if (do_demote_pass && 1827 (thp_migration_supported() || !folio_test_large(folio))) { 1828 list_add(&folio->lru, &demote_folios); 1829 folio_unlock(folio); 1830 continue; 1831 } 1832 1833 /* 1834 * Anonymous process memory has backing store? 1835 * Try to allocate it some swap space here. 1836 * Lazyfree folio could be freed directly 1837 */ 1838 if (folio_test_anon(folio) && folio_test_swapbacked(folio)) { 1839 if (!folio_test_swapcache(folio)) { 1840 if (!(sc->gfp_mask & __GFP_IO)) 1841 goto keep_locked; 1842 if (folio_maybe_dma_pinned(folio)) 1843 goto keep_locked; 1844 if (folio_test_large(folio)) { 1845 /* cannot split folio, skip it */ 1846 if (!can_split_folio(folio, NULL)) 1847 goto activate_locked; 1848 /* 1849 * Split folios without a PMD map right 1850 * away. Chances are some or all of the 1851 * tail pages can be freed without IO. 1852 */ 1853 if (!folio_entire_mapcount(folio) && 1854 split_folio_to_list(folio, 1855 folio_list)) 1856 goto activate_locked; 1857 } 1858 if (!add_to_swap(folio)) { 1859 if (!folio_test_large(folio)) 1860 goto activate_locked_split; 1861 /* Fallback to swap normal pages */ 1862 if (split_folio_to_list(folio, 1863 folio_list)) 1864 goto activate_locked; 1865 #ifdef CONFIG_TRANSPARENT_HUGEPAGE 1866 count_vm_event(THP_SWPOUT_FALLBACK); 1867 #endif 1868 if (!add_to_swap(folio)) 1869 goto activate_locked_split; 1870 } 1871 } 1872 } else if (folio_test_swapbacked(folio) && 1873 folio_test_large(folio)) { 1874 /* Split shmem folio */ 1875 if (split_folio_to_list(folio, folio_list)) 1876 goto keep_locked; 1877 } 1878 1879 /* 1880 * If the folio was split above, the tail pages will make 1881 * their own pass through this function and be accounted 1882 * then. 1883 */ 1884 if ((nr_pages > 1) && !folio_test_large(folio)) { 1885 sc->nr_scanned -= (nr_pages - 1); 1886 nr_pages = 1; 1887 } 1888 1889 /* 1890 * The folio is mapped into the page tables of one or more 1891 * processes. Try to unmap it here. 1892 */ 1893 if (folio_mapped(folio)) { 1894 enum ttu_flags flags = TTU_BATCH_FLUSH; 1895 bool was_swapbacked = folio_test_swapbacked(folio); 1896 1897 if (folio_test_pmd_mappable(folio)) 1898 flags |= TTU_SPLIT_HUGE_PMD; 1899 1900 try_to_unmap(folio, flags); 1901 if (folio_mapped(folio)) { 1902 stat->nr_unmap_fail += nr_pages; 1903 if (!was_swapbacked && 1904 folio_test_swapbacked(folio)) 1905 stat->nr_lazyfree_fail += nr_pages; 1906 goto activate_locked; 1907 } 1908 } 1909 1910 mapping = folio_mapping(folio); 1911 if (folio_test_dirty(folio)) { 1912 /* 1913 * Only kswapd can writeback filesystem folios 1914 * to avoid risk of stack overflow. But avoid 1915 * injecting inefficient single-folio I/O into 1916 * flusher writeback as much as possible: only 1917 * write folios when we've encountered many 1918 * dirty folios, and when we've already scanned 1919 * the rest of the LRU for clean folios and see 1920 * the same dirty folios again (with the reclaim 1921 * flag set). 1922 */ 1923 if (folio_is_file_lru(folio) && 1924 (!current_is_kswapd() || 1925 !folio_test_reclaim(folio) || 1926 !test_bit(PGDAT_DIRTY, &pgdat->flags))) { 1927 /* 1928 * Immediately reclaim when written back. 1929 * Similar in principle to folio_deactivate() 1930 * except we already have the folio isolated 1931 * and know it's dirty 1932 */ 1933 node_stat_mod_folio(folio, NR_VMSCAN_IMMEDIATE, 1934 nr_pages); 1935 folio_set_reclaim(folio); 1936 1937 goto activate_locked; 1938 } 1939 1940 if (references == FOLIOREF_RECLAIM_CLEAN) 1941 goto keep_locked; 1942 if (!may_enter_fs(folio, sc->gfp_mask)) 1943 goto keep_locked; 1944 if (!sc->may_writepage) 1945 goto keep_locked; 1946 1947 /* 1948 * Folio is dirty. Flush the TLB if a writable entry 1949 * potentially exists to avoid CPU writes after I/O 1950 * starts and then write it out here. 1951 */ 1952 try_to_unmap_flush_dirty(); 1953 switch (pageout(folio, mapping, &plug)) { 1954 case PAGE_KEEP: 1955 goto keep_locked; 1956 case PAGE_ACTIVATE: 1957 goto activate_locked; 1958 case PAGE_SUCCESS: 1959 stat->nr_pageout += nr_pages; 1960 1961 if (folio_test_writeback(folio)) 1962 goto keep; 1963 if (folio_test_dirty(folio)) 1964 goto keep; 1965 1966 /* 1967 * A synchronous write - probably a ramdisk. Go 1968 * ahead and try to reclaim the folio. 1969 */ 1970 if (!folio_trylock(folio)) 1971 goto keep; 1972 if (folio_test_dirty(folio) || 1973 folio_test_writeback(folio)) 1974 goto keep_locked; 1975 mapping = folio_mapping(folio); 1976 fallthrough; 1977 case PAGE_CLEAN: 1978 ; /* try to free the folio below */ 1979 } 1980 } 1981 1982 /* 1983 * If the folio has buffers, try to free the buffer 1984 * mappings associated with this folio. If we succeed 1985 * we try to free the folio as well. 1986 * 1987 * We do this even if the folio is dirty. 1988 * filemap_release_folio() does not perform I/O, but it 1989 * is possible for a folio to have the dirty flag set, 1990 * but it is actually clean (all its buffers are clean). 1991 * This happens if the buffers were written out directly, 1992 * with submit_bh(). ext3 will do this, as well as 1993 * the blockdev mapping. filemap_release_folio() will 1994 * discover that cleanness and will drop the buffers 1995 * and mark the folio clean - it can be freed. 1996 * 1997 * Rarely, folios can have buffers and no ->mapping. 1998 * These are the folios which were not successfully 1999 * invalidated in truncate_cleanup_folio(). We try to 2000 * drop those buffers here and if that worked, and the 2001 * folio is no longer mapped into process address space 2002 * (refcount == 1) it can be freed. Otherwise, leave 2003 * the folio on the LRU so it is swappable. 2004 */ 2005 if (folio_has_private(folio)) { 2006 if (!filemap_release_folio(folio, sc->gfp_mask)) 2007 goto activate_locked; 2008 if (!mapping && folio_ref_count(folio) == 1) { 2009 folio_unlock(folio); 2010 if (folio_put_testzero(folio)) 2011 goto free_it; 2012 else { 2013 /* 2014 * rare race with speculative reference. 2015 * the speculative reference will free 2016 * this folio shortly, so we may 2017 * increment nr_reclaimed here (and 2018 * leave it off the LRU). 2019 */ 2020 nr_reclaimed += nr_pages; 2021 continue; 2022 } 2023 } 2024 } 2025 2026 if (folio_test_anon(folio) && !folio_test_swapbacked(folio)) { 2027 /* follow __remove_mapping for reference */ 2028 if (!folio_ref_freeze(folio, 1)) 2029 goto keep_locked; 2030 /* 2031 * The folio has only one reference left, which is 2032 * from the isolation. After the caller puts the 2033 * folio back on the lru and drops the reference, the 2034 * folio will be freed anyway. It doesn't matter 2035 * which lru it goes on. So we don't bother checking 2036 * the dirty flag here. 2037 */ 2038 count_vm_events(PGLAZYFREED, nr_pages); 2039 count_memcg_folio_events(folio, PGLAZYFREED, nr_pages); 2040 } else if (!mapping || !__remove_mapping(mapping, folio, true, 2041 sc->target_mem_cgroup)) 2042 goto keep_locked; 2043 2044 folio_unlock(folio); 2045 free_it: 2046 /* 2047 * Folio may get swapped out as a whole, need to account 2048 * all pages in it. 2049 */ 2050 nr_reclaimed += nr_pages; 2051 2052 /* 2053 * Is there need to periodically free_folio_list? It would 2054 * appear not as the counts should be low 2055 */ 2056 if (unlikely(folio_test_large(folio))) 2057 destroy_large_folio(folio); 2058 else 2059 list_add(&folio->lru, &free_folios); 2060 continue; 2061 2062 activate_locked_split: 2063 /* 2064 * The tail pages that are failed to add into swap cache 2065 * reach here. Fixup nr_scanned and nr_pages. 2066 */ 2067 if (nr_pages > 1) { 2068 sc->nr_scanned -= (nr_pages - 1); 2069 nr_pages = 1; 2070 } 2071 activate_locked: 2072 /* Not a candidate for swapping, so reclaim swap space. */ 2073 if (folio_test_swapcache(folio) && 2074 (mem_cgroup_swap_full(folio) || folio_test_mlocked(folio))) 2075 folio_free_swap(folio); 2076 VM_BUG_ON_FOLIO(folio_test_active(folio), folio); 2077 if (!folio_test_mlocked(folio)) { 2078 int type = folio_is_file_lru(folio); 2079 folio_set_active(folio); 2080 stat->nr_activate[type] += nr_pages; 2081 count_memcg_folio_events(folio, PGACTIVATE, nr_pages); 2082 } 2083 keep_locked: 2084 folio_unlock(folio); 2085 keep: 2086 list_add(&folio->lru, &ret_folios); 2087 VM_BUG_ON_FOLIO(folio_test_lru(folio) || 2088 folio_test_unevictable(folio), folio); 2089 } 2090 /* 'folio_list' is always empty here */ 2091 2092 /* Migrate folios selected for demotion */ 2093 nr_reclaimed += demote_folio_list(&demote_folios, pgdat); 2094 /* Folios that could not be demoted are still in @demote_folios */ 2095 if (!list_empty(&demote_folios)) { 2096 /* Folios which weren't demoted go back on @folio_list */ 2097 list_splice_init(&demote_folios, folio_list); 2098 2099 /* 2100 * goto retry to reclaim the undemoted folios in folio_list if 2101 * desired. 2102 * 2103 * Reclaiming directly from top tier nodes is not often desired 2104 * due to it breaking the LRU ordering: in general memory 2105 * should be reclaimed from lower tier nodes and demoted from 2106 * top tier nodes. 2107 * 2108 * However, disabling reclaim from top tier nodes entirely 2109 * would cause ooms in edge scenarios where lower tier memory 2110 * is unreclaimable for whatever reason, eg memory being 2111 * mlocked or too hot to reclaim. We can disable reclaim 2112 * from top tier nodes in proactive reclaim though as that is 2113 * not real memory pressure. 2114 */ 2115 if (!sc->proactive) { 2116 do_demote_pass = false; 2117 goto retry; 2118 } 2119 } 2120 2121 pgactivate = stat->nr_activate[0] + stat->nr_activate[1]; 2122 2123 mem_cgroup_uncharge_list(&free_folios); 2124 try_to_unmap_flush(); 2125 free_unref_page_list(&free_folios); 2126 2127 list_splice(&ret_folios, folio_list); 2128 count_vm_events(PGACTIVATE, pgactivate); 2129 2130 if (plug) 2131 swap_write_unplug(plug); 2132 return nr_reclaimed; 2133 } 2134 2135 unsigned int reclaim_clean_pages_from_list(struct zone *zone, 2136 struct list_head *folio_list) 2137 { 2138 struct scan_control sc = { 2139 .gfp_mask = GFP_KERNEL, 2140 .may_unmap = 1, 2141 }; 2142 struct reclaim_stat stat; 2143 unsigned int nr_reclaimed; 2144 struct folio *folio, *next; 2145 LIST_HEAD(clean_folios); 2146 unsigned int noreclaim_flag; 2147 2148 list_for_each_entry_safe(folio, next, folio_list, lru) { 2149 if (!folio_test_hugetlb(folio) && folio_is_file_lru(folio) && 2150 !folio_test_dirty(folio) && !__folio_test_movable(folio) && 2151 !folio_test_unevictable(folio)) { 2152 folio_clear_active(folio); 2153 list_move(&folio->lru, &clean_folios); 2154 } 2155 } 2156 2157 /* 2158 * We should be safe here since we are only dealing with file pages and 2159 * we are not kswapd and therefore cannot write dirty file pages. But 2160 * call memalloc_noreclaim_save() anyway, just in case these conditions 2161 * change in the future. 2162 */ 2163 noreclaim_flag = memalloc_noreclaim_save(); 2164 nr_reclaimed = shrink_folio_list(&clean_folios, zone->zone_pgdat, &sc, 2165 &stat, true); 2166 memalloc_noreclaim_restore(noreclaim_flag); 2167 2168 list_splice(&clean_folios, folio_list); 2169 mod_node_page_state(zone->zone_pgdat, NR_ISOLATED_FILE, 2170 -(long)nr_reclaimed); 2171 /* 2172 * Since lazyfree pages are isolated from file LRU from the beginning, 2173 * they will rotate back to anonymous LRU in the end if it failed to 2174 * discard so isolated count will be mismatched. 2175 * Compensate the isolated count for both LRU lists. 2176 */ 2177 mod_node_page_state(zone->zone_pgdat, NR_ISOLATED_ANON, 2178 stat.nr_lazyfree_fail); 2179 mod_node_page_state(zone->zone_pgdat, NR_ISOLATED_FILE, 2180 -(long)stat.nr_lazyfree_fail); 2181 return nr_reclaimed; 2182 } 2183 2184 /* 2185 * Update LRU sizes after isolating pages. The LRU size updates must 2186 * be complete before mem_cgroup_update_lru_size due to a sanity check. 2187 */ 2188 static __always_inline void update_lru_sizes(struct lruvec *lruvec, 2189 enum lru_list lru, unsigned long *nr_zone_taken) 2190 { 2191 int zid; 2192 2193 for (zid = 0; zid < MAX_NR_ZONES; zid++) { 2194 if (!nr_zone_taken[zid]) 2195 continue; 2196 2197 update_lru_size(lruvec, lru, zid, -nr_zone_taken[zid]); 2198 } 2199 2200 } 2201 2202 /* 2203 * Isolating page from the lruvec to fill in @dst list by nr_to_scan times. 2204 * 2205 * lruvec->lru_lock is heavily contended. Some of the functions that 2206 * shrink the lists perform better by taking out a batch of pages 2207 * and working on them outside the LRU lock. 2208 * 2209 * For pagecache intensive workloads, this function is the hottest 2210 * spot in the kernel (apart from copy_*_user functions). 2211 * 2212 * Lru_lock must be held before calling this function. 2213 * 2214 * @nr_to_scan: The number of eligible pages to look through on the list. 2215 * @lruvec: The LRU vector to pull pages from. 2216 * @dst: The temp list to put pages on to. 2217 * @nr_scanned: The number of pages that were scanned. 2218 * @sc: The scan_control struct for this reclaim session 2219 * @lru: LRU list id for isolating 2220 * 2221 * returns how many pages were moved onto *@dst. 2222 */ 2223 static unsigned long isolate_lru_folios(unsigned long nr_to_scan, 2224 struct lruvec *lruvec, struct list_head *dst, 2225 unsigned long *nr_scanned, struct scan_control *sc, 2226 enum lru_list lru) 2227 { 2228 struct list_head *src = &lruvec->lists[lru]; 2229 unsigned long nr_taken = 0; 2230 unsigned long nr_zone_taken[MAX_NR_ZONES] = { 0 }; 2231 unsigned long nr_skipped[MAX_NR_ZONES] = { 0, }; 2232 unsigned long skipped = 0; 2233 unsigned long scan, total_scan, nr_pages; 2234 LIST_HEAD(folios_skipped); 2235 2236 total_scan = 0; 2237 scan = 0; 2238 while (scan < nr_to_scan && !list_empty(src)) { 2239 struct list_head *move_to = src; 2240 struct folio *folio; 2241 2242 folio = lru_to_folio(src); 2243 prefetchw_prev_lru_folio(folio, src, flags); 2244 2245 nr_pages = folio_nr_pages(folio); 2246 total_scan += nr_pages; 2247 2248 if (folio_zonenum(folio) > sc->reclaim_idx) { 2249 nr_skipped[folio_zonenum(folio)] += nr_pages; 2250 move_to = &folios_skipped; 2251 goto move; 2252 } 2253 2254 /* 2255 * Do not count skipped folios because that makes the function 2256 * return with no isolated folios if the LRU mostly contains 2257 * ineligible folios. This causes the VM to not reclaim any 2258 * folios, triggering a premature OOM. 2259 * Account all pages in a folio. 2260 */ 2261 scan += nr_pages; 2262 2263 if (!folio_test_lru(folio)) 2264 goto move; 2265 if (!sc->may_unmap && folio_mapped(folio)) 2266 goto move; 2267 2268 /* 2269 * Be careful not to clear the lru flag until after we're 2270 * sure the folio is not being freed elsewhere -- the 2271 * folio release code relies on it. 2272 */ 2273 if (unlikely(!folio_try_get(folio))) 2274 goto move; 2275 2276 if (!folio_test_clear_lru(folio)) { 2277 /* Another thread is already isolating this folio */ 2278 folio_put(folio); 2279 goto move; 2280 } 2281 2282 nr_taken += nr_pages; 2283 nr_zone_taken[folio_zonenum(folio)] += nr_pages; 2284 move_to = dst; 2285 move: 2286 list_move(&folio->lru, move_to); 2287 } 2288 2289 /* 2290 * Splice any skipped folios to the start of the LRU list. Note that 2291 * this disrupts the LRU order when reclaiming for lower zones but 2292 * we cannot splice to the tail. If we did then the SWAP_CLUSTER_MAX 2293 * scanning would soon rescan the same folios to skip and waste lots 2294 * of cpu cycles. 2295 */ 2296 if (!list_empty(&folios_skipped)) { 2297 int zid; 2298 2299 list_splice(&folios_skipped, src); 2300 for (zid = 0; zid < MAX_NR_ZONES; zid++) { 2301 if (!nr_skipped[zid]) 2302 continue; 2303 2304 __count_zid_vm_events(PGSCAN_SKIP, zid, nr_skipped[zid]); 2305 skipped += nr_skipped[zid]; 2306 } 2307 } 2308 *nr_scanned = total_scan; 2309 trace_mm_vmscan_lru_isolate(sc->reclaim_idx, sc->order, nr_to_scan, 2310 total_scan, skipped, nr_taken, 2311 sc->may_unmap ? 0 : ISOLATE_UNMAPPED, lru); 2312 update_lru_sizes(lruvec, lru, nr_zone_taken); 2313 return nr_taken; 2314 } 2315 2316 /** 2317 * folio_isolate_lru() - Try to isolate a folio from its LRU list. 2318 * @folio: Folio to isolate from its LRU list. 2319 * 2320 * Isolate a @folio from an LRU list and adjust the vmstat statistic 2321 * corresponding to whatever LRU list the folio was on. 2322 * 2323 * The folio will have its LRU flag cleared. If it was found on the 2324 * active list, it will have the Active flag set. If it was found on the 2325 * unevictable list, it will have the Unevictable flag set. These flags 2326 * may need to be cleared by the caller before letting the page go. 2327 * 2328 * Context: 2329 * 2330 * (1) Must be called with an elevated refcount on the folio. This is a 2331 * fundamental difference from isolate_lru_folios() (which is called 2332 * without a stable reference). 2333 * (2) The lru_lock must not be held. 2334 * (3) Interrupts must be enabled. 2335 * 2336 * Return: 0 if the folio was removed from an LRU list. 2337 * -EBUSY if the folio was not on an LRU list. 2338 */ 2339 int folio_isolate_lru(struct folio *folio) 2340 { 2341 int ret = -EBUSY; 2342 2343 VM_BUG_ON_FOLIO(!folio_ref_count(folio), folio); 2344 2345 if (folio_test_clear_lru(folio)) { 2346 struct lruvec *lruvec; 2347 2348 folio_get(folio); 2349 lruvec = folio_lruvec_lock_irq(folio); 2350 lruvec_del_folio(lruvec, folio); 2351 unlock_page_lruvec_irq(lruvec); 2352 ret = 0; 2353 } 2354 2355 return ret; 2356 } 2357 2358 /* 2359 * A direct reclaimer may isolate SWAP_CLUSTER_MAX pages from the LRU list and 2360 * then get rescheduled. When there are massive number of tasks doing page 2361 * allocation, such sleeping direct reclaimers may keep piling up on each CPU, 2362 * the LRU list will go small and be scanned faster than necessary, leading to 2363 * unnecessary swapping, thrashing and OOM. 2364 */ 2365 static int too_many_isolated(struct pglist_data *pgdat, int file, 2366 struct scan_control *sc) 2367 { 2368 unsigned long inactive, isolated; 2369 bool too_many; 2370 2371 if (current_is_kswapd()) 2372 return 0; 2373 2374 if (!writeback_throttling_sane(sc)) 2375 return 0; 2376 2377 if (file) { 2378 inactive = node_page_state(pgdat, NR_INACTIVE_FILE); 2379 isolated = node_page_state(pgdat, NR_ISOLATED_FILE); 2380 } else { 2381 inactive = node_page_state(pgdat, NR_INACTIVE_ANON); 2382 isolated = node_page_state(pgdat, NR_ISOLATED_ANON); 2383 } 2384 2385 /* 2386 * GFP_NOIO/GFP_NOFS callers are allowed to isolate more pages, so they 2387 * won't get blocked by normal direct-reclaimers, forming a circular 2388 * deadlock. 2389 */ 2390 if ((sc->gfp_mask & (__GFP_IO | __GFP_FS)) == (__GFP_IO | __GFP_FS)) 2391 inactive >>= 3; 2392 2393 too_many = isolated > inactive; 2394 2395 /* Wake up tasks throttled due to too_many_isolated. */ 2396 if (!too_many) 2397 wake_throttle_isolated(pgdat); 2398 2399 return too_many; 2400 } 2401 2402 /* 2403 * move_folios_to_lru() moves folios from private @list to appropriate LRU list. 2404 * On return, @list is reused as a list of folios to be freed by the caller. 2405 * 2406 * Returns the number of pages moved to the given lruvec. 2407 */ 2408 static unsigned int move_folios_to_lru(struct lruvec *lruvec, 2409 struct list_head *list) 2410 { 2411 int nr_pages, nr_moved = 0; 2412 LIST_HEAD(folios_to_free); 2413 2414 while (!list_empty(list)) { 2415 struct folio *folio = lru_to_folio(list); 2416 2417 VM_BUG_ON_FOLIO(folio_test_lru(folio), folio); 2418 list_del(&folio->lru); 2419 if (unlikely(!folio_evictable(folio))) { 2420 spin_unlock_irq(&lruvec->lru_lock); 2421 folio_putback_lru(folio); 2422 spin_lock_irq(&lruvec->lru_lock); 2423 continue; 2424 } 2425 2426 /* 2427 * The folio_set_lru needs to be kept here for list integrity. 2428 * Otherwise: 2429 * #0 move_folios_to_lru #1 release_pages 2430 * if (!folio_put_testzero()) 2431 * if (folio_put_testzero()) 2432 * !lru //skip lru_lock 2433 * folio_set_lru() 2434 * list_add(&folio->lru,) 2435 * list_add(&folio->lru,) 2436 */ 2437 folio_set_lru(folio); 2438 2439 if (unlikely(folio_put_testzero(folio))) { 2440 __folio_clear_lru_flags(folio); 2441 2442 if (unlikely(folio_test_large(folio))) { 2443 spin_unlock_irq(&lruvec->lru_lock); 2444 destroy_large_folio(folio); 2445 spin_lock_irq(&lruvec->lru_lock); 2446 } else 2447 list_add(&folio->lru, &folios_to_free); 2448 2449 continue; 2450 } 2451 2452 /* 2453 * All pages were isolated from the same lruvec (and isolation 2454 * inhibits memcg migration). 2455 */ 2456 VM_BUG_ON_FOLIO(!folio_matches_lruvec(folio, lruvec), folio); 2457 lruvec_add_folio(lruvec, folio); 2458 nr_pages = folio_nr_pages(folio); 2459 nr_moved += nr_pages; 2460 if (folio_test_active(folio)) 2461 workingset_age_nonresident(lruvec, nr_pages); 2462 } 2463 2464 /* 2465 * To save our caller's stack, now use input list for pages to free. 2466 */ 2467 list_splice(&folios_to_free, list); 2468 2469 return nr_moved; 2470 } 2471 2472 /* 2473 * If a kernel thread (such as nfsd for loop-back mounts) services a backing 2474 * device by writing to the page cache it sets PF_LOCAL_THROTTLE. In this case 2475 * we should not throttle. Otherwise it is safe to do so. 2476 */ 2477 static int current_may_throttle(void) 2478 { 2479 return !(current->flags & PF_LOCAL_THROTTLE); 2480 } 2481 2482 /* 2483 * shrink_inactive_list() is a helper for shrink_node(). It returns the number 2484 * of reclaimed pages 2485 */ 2486 static unsigned long shrink_inactive_list(unsigned long nr_to_scan, 2487 struct lruvec *lruvec, struct scan_control *sc, 2488 enum lru_list lru) 2489 { 2490 LIST_HEAD(folio_list); 2491 unsigned long nr_scanned; 2492 unsigned int nr_reclaimed = 0; 2493 unsigned long nr_taken; 2494 struct reclaim_stat stat; 2495 bool file = is_file_lru(lru); 2496 enum vm_event_item item; 2497 struct pglist_data *pgdat = lruvec_pgdat(lruvec); 2498 bool stalled = false; 2499 2500 while (unlikely(too_many_isolated(pgdat, file, sc))) { 2501 if (stalled) 2502 return 0; 2503 2504 /* wait a bit for the reclaimer. */ 2505 stalled = true; 2506 reclaim_throttle(pgdat, VMSCAN_THROTTLE_ISOLATED); 2507 2508 /* We are about to die and free our memory. Return now. */ 2509 if (fatal_signal_pending(current)) 2510 return SWAP_CLUSTER_MAX; 2511 } 2512 2513 lru_add_drain(); 2514 2515 spin_lock_irq(&lruvec->lru_lock); 2516 2517 nr_taken = isolate_lru_folios(nr_to_scan, lruvec, &folio_list, 2518 &nr_scanned, sc, lru); 2519 2520 __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, nr_taken); 2521 item = PGSCAN_KSWAPD + reclaimer_offset(); 2522 if (!cgroup_reclaim(sc)) 2523 __count_vm_events(item, nr_scanned); 2524 __count_memcg_events(lruvec_memcg(lruvec), item, nr_scanned); 2525 __count_vm_events(PGSCAN_ANON + file, nr_scanned); 2526 2527 spin_unlock_irq(&lruvec->lru_lock); 2528 2529 if (nr_taken == 0) 2530 return 0; 2531 2532 nr_reclaimed = shrink_folio_list(&folio_list, pgdat, sc, &stat, false); 2533 2534 spin_lock_irq(&lruvec->lru_lock); 2535 move_folios_to_lru(lruvec, &folio_list); 2536 2537 __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, -nr_taken); 2538 item = PGSTEAL_KSWAPD + reclaimer_offset(); 2539 if (!cgroup_reclaim(sc)) 2540 __count_vm_events(item, nr_reclaimed); 2541 __count_memcg_events(lruvec_memcg(lruvec), item, nr_reclaimed); 2542 __count_vm_events(PGSTEAL_ANON + file, nr_reclaimed); 2543 spin_unlock_irq(&lruvec->lru_lock); 2544 2545 lru_note_cost(lruvec, file, stat.nr_pageout, nr_scanned - nr_reclaimed); 2546 mem_cgroup_uncharge_list(&folio_list); 2547 free_unref_page_list(&folio_list); 2548 2549 /* 2550 * If dirty folios are scanned that are not queued for IO, it 2551 * implies that flushers are not doing their job. This can 2552 * happen when memory pressure pushes dirty folios to the end of 2553 * the LRU before the dirty limits are breached and the dirty 2554 * data has expired. It can also happen when the proportion of 2555 * dirty folios grows not through writes but through memory 2556 * pressure reclaiming all the clean cache. And in some cases, 2557 * the flushers simply cannot keep up with the allocation 2558 * rate. Nudge the flusher threads in case they are asleep. 2559 */ 2560 if (stat.nr_unqueued_dirty == nr_taken) { 2561 wakeup_flusher_threads(WB_REASON_VMSCAN); 2562 /* 2563 * For cgroupv1 dirty throttling is achieved by waking up 2564 * the kernel flusher here and later waiting on folios 2565 * which are in writeback to finish (see shrink_folio_list()). 2566 * 2567 * Flusher may not be able to issue writeback quickly 2568 * enough for cgroupv1 writeback throttling to work 2569 * on a large system. 2570 */ 2571 if (!writeback_throttling_sane(sc)) 2572 reclaim_throttle(pgdat, VMSCAN_THROTTLE_WRITEBACK); 2573 } 2574 2575 sc->nr.dirty += stat.nr_dirty; 2576 sc->nr.congested += stat.nr_congested; 2577 sc->nr.unqueued_dirty += stat.nr_unqueued_dirty; 2578 sc->nr.writeback += stat.nr_writeback; 2579 sc->nr.immediate += stat.nr_immediate; 2580 sc->nr.taken += nr_taken; 2581 if (file) 2582 sc->nr.file_taken += nr_taken; 2583 2584 trace_mm_vmscan_lru_shrink_inactive(pgdat->node_id, 2585 nr_scanned, nr_reclaimed, &stat, sc->priority, file); 2586 return nr_reclaimed; 2587 } 2588 2589 /* 2590 * shrink_active_list() moves folios from the active LRU to the inactive LRU. 2591 * 2592 * We move them the other way if the folio is referenced by one or more 2593 * processes. 2594 * 2595 * If the folios are mostly unmapped, the processing is fast and it is 2596 * appropriate to hold lru_lock across the whole operation. But if 2597 * the folios are mapped, the processing is slow (folio_referenced()), so 2598 * we should drop lru_lock around each folio. It's impossible to balance 2599 * this, so instead we remove the folios from the LRU while processing them. 2600 * It is safe to rely on the active flag against the non-LRU folios in here 2601 * because nobody will play with that bit on a non-LRU folio. 2602 * 2603 * The downside is that we have to touch folio->_refcount against each folio. 2604 * But we had to alter folio->flags anyway. 2605 */ 2606 static void shrink_active_list(unsigned long nr_to_scan, 2607 struct lruvec *lruvec, 2608 struct scan_control *sc, 2609 enum lru_list lru) 2610 { 2611 unsigned long nr_taken; 2612 unsigned long nr_scanned; 2613 unsigned long vm_flags; 2614 LIST_HEAD(l_hold); /* The folios which were snipped off */ 2615 LIST_HEAD(l_active); 2616 LIST_HEAD(l_inactive); 2617 unsigned nr_deactivate, nr_activate; 2618 unsigned nr_rotated = 0; 2619 int file = is_file_lru(lru); 2620 struct pglist_data *pgdat = lruvec_pgdat(lruvec); 2621 2622 lru_add_drain(); 2623 2624 spin_lock_irq(&lruvec->lru_lock); 2625 2626 nr_taken = isolate_lru_folios(nr_to_scan, lruvec, &l_hold, 2627 &nr_scanned, sc, lru); 2628 2629 __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, nr_taken); 2630 2631 if (!cgroup_reclaim(sc)) 2632 __count_vm_events(PGREFILL, nr_scanned); 2633 __count_memcg_events(lruvec_memcg(lruvec), PGREFILL, nr_scanned); 2634 2635 spin_unlock_irq(&lruvec->lru_lock); 2636 2637 while (!list_empty(&l_hold)) { 2638 struct folio *folio; 2639 2640 cond_resched(); 2641 folio = lru_to_folio(&l_hold); 2642 list_del(&folio->lru); 2643 2644 if (unlikely(!folio_evictable(folio))) { 2645 folio_putback_lru(folio); 2646 continue; 2647 } 2648 2649 if (unlikely(buffer_heads_over_limit)) { 2650 if (folio_test_private(folio) && folio_trylock(folio)) { 2651 if (folio_test_private(folio)) 2652 filemap_release_folio(folio, 0); 2653 folio_unlock(folio); 2654 } 2655 } 2656 2657 /* Referenced or rmap lock contention: rotate */ 2658 if (folio_referenced(folio, 0, sc->target_mem_cgroup, 2659 &vm_flags) != 0) { 2660 /* 2661 * Identify referenced, file-backed active folios and 2662 * give them one more trip around the active list. So 2663 * that executable code get better chances to stay in 2664 * memory under moderate memory pressure. Anon folios 2665 * are not likely to be evicted by use-once streaming 2666 * IO, plus JVM can create lots of anon VM_EXEC folios, 2667 * so we ignore them here. 2668 */ 2669 if ((vm_flags & VM_EXEC) && folio_is_file_lru(folio)) { 2670 nr_rotated += folio_nr_pages(folio); 2671 list_add(&folio->lru, &l_active); 2672 continue; 2673 } 2674 } 2675 2676 folio_clear_active(folio); /* we are de-activating */ 2677 folio_set_workingset(folio); 2678 list_add(&folio->lru, &l_inactive); 2679 } 2680 2681 /* 2682 * Move folios back to the lru list. 2683 */ 2684 spin_lock_irq(&lruvec->lru_lock); 2685 2686 nr_activate = move_folios_to_lru(lruvec, &l_active); 2687 nr_deactivate = move_folios_to_lru(lruvec, &l_inactive); 2688 /* Keep all free folios in l_active list */ 2689 list_splice(&l_inactive, &l_active); 2690 2691 __count_vm_events(PGDEACTIVATE, nr_deactivate); 2692 __count_memcg_events(lruvec_memcg(lruvec), PGDEACTIVATE, nr_deactivate); 2693 2694 __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, -nr_taken); 2695 spin_unlock_irq(&lruvec->lru_lock); 2696 2697 if (nr_rotated) 2698 lru_note_cost(lruvec, file, 0, nr_rotated); 2699 mem_cgroup_uncharge_list(&l_active); 2700 free_unref_page_list(&l_active); 2701 trace_mm_vmscan_lru_shrink_active(pgdat->node_id, nr_taken, nr_activate, 2702 nr_deactivate, nr_rotated, sc->priority, file); 2703 } 2704 2705 static unsigned int reclaim_folio_list(struct list_head *folio_list, 2706 struct pglist_data *pgdat) 2707 { 2708 struct reclaim_stat dummy_stat; 2709 unsigned int nr_reclaimed; 2710 struct folio *folio; 2711 struct scan_control sc = { 2712 .gfp_mask = GFP_KERNEL, 2713 .may_writepage = 1, 2714 .may_unmap = 1, 2715 .may_swap = 1, 2716 .no_demotion = 1, 2717 }; 2718 2719 nr_reclaimed = shrink_folio_list(folio_list, pgdat, &sc, &dummy_stat, false); 2720 while (!list_empty(folio_list)) { 2721 folio = lru_to_folio(folio_list); 2722 list_del(&folio->lru); 2723 folio_putback_lru(folio); 2724 } 2725 2726 return nr_reclaimed; 2727 } 2728 2729 unsigned long reclaim_pages(struct list_head *folio_list) 2730 { 2731 int nid; 2732 unsigned int nr_reclaimed = 0; 2733 LIST_HEAD(node_folio_list); 2734 unsigned int noreclaim_flag; 2735 2736 if (list_empty(folio_list)) 2737 return nr_reclaimed; 2738 2739 noreclaim_flag = memalloc_noreclaim_save(); 2740 2741 nid = folio_nid(lru_to_folio(folio_list)); 2742 do { 2743 struct folio *folio = lru_to_folio(folio_list); 2744 2745 if (nid == folio_nid(folio)) { 2746 folio_clear_active(folio); 2747 list_move(&folio->lru, &node_folio_list); 2748 continue; 2749 } 2750 2751 nr_reclaimed += reclaim_folio_list(&node_folio_list, NODE_DATA(nid)); 2752 nid = folio_nid(lru_to_folio(folio_list)); 2753 } while (!list_empty(folio_list)); 2754 2755 nr_reclaimed += reclaim_folio_list(&node_folio_list, NODE_DATA(nid)); 2756 2757 memalloc_noreclaim_restore(noreclaim_flag); 2758 2759 return nr_reclaimed; 2760 } 2761 2762 static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan, 2763 struct lruvec *lruvec, struct scan_control *sc) 2764 { 2765 if (is_active_lru(lru)) { 2766 if (sc->may_deactivate & (1 << is_file_lru(lru))) 2767 shrink_active_list(nr_to_scan, lruvec, sc, lru); 2768 else 2769 sc->skipped_deactivate = 1; 2770 return 0; 2771 } 2772 2773 return shrink_inactive_list(nr_to_scan, lruvec, sc, lru); 2774 } 2775 2776 /* 2777 * The inactive anon list should be small enough that the VM never has 2778 * to do too much work. 2779 * 2780 * The inactive file list should be small enough to leave most memory 2781 * to the established workingset on the scan-resistant active list, 2782 * but large enough to avoid thrashing the aggregate readahead window. 2783 * 2784 * Both inactive lists should also be large enough that each inactive 2785 * folio has a chance to be referenced again before it is reclaimed. 2786 * 2787 * If that fails and refaulting is observed, the inactive list grows. 2788 * 2789 * The inactive_ratio is the target ratio of ACTIVE to INACTIVE folios 2790 * on this LRU, maintained by the pageout code. An inactive_ratio 2791 * of 3 means 3:1 or 25% of the folios are kept on the inactive list. 2792 * 2793 * total target max 2794 * memory ratio inactive 2795 * ------------------------------------- 2796 * 10MB 1 5MB 2797 * 100MB 1 50MB 2798 * 1GB 3 250MB 2799 * 10GB 10 0.9GB 2800 * 100GB 31 3GB 2801 * 1TB 101 10GB 2802 * 10TB 320 32GB 2803 */ 2804 static bool inactive_is_low(struct lruvec *lruvec, enum lru_list inactive_lru) 2805 { 2806 enum lru_list active_lru = inactive_lru + LRU_ACTIVE; 2807 unsigned long inactive, active; 2808 unsigned long inactive_ratio; 2809 unsigned long gb; 2810 2811 inactive = lruvec_page_state(lruvec, NR_LRU_BASE + inactive_lru); 2812 active = lruvec_page_state(lruvec, NR_LRU_BASE + active_lru); 2813 2814 gb = (inactive + active) >> (30 - PAGE_SHIFT); 2815 if (gb) 2816 inactive_ratio = int_sqrt(10 * gb); 2817 else 2818 inactive_ratio = 1; 2819 2820 return inactive * inactive_ratio < active; 2821 } 2822 2823 enum scan_balance { 2824 SCAN_EQUAL, 2825 SCAN_FRACT, 2826 SCAN_ANON, 2827 SCAN_FILE, 2828 }; 2829 2830 static void prepare_scan_count(pg_data_t *pgdat, struct scan_control *sc) 2831 { 2832 unsigned long file; 2833 struct lruvec *target_lruvec; 2834 2835 if (lru_gen_enabled()) 2836 return; 2837 2838 target_lruvec = mem_cgroup_lruvec(sc->target_mem_cgroup, pgdat); 2839 2840 /* 2841 * Flush the memory cgroup stats, so that we read accurate per-memcg 2842 * lruvec stats for heuristics. 2843 */ 2844 mem_cgroup_flush_stats(); 2845 2846 /* 2847 * Determine the scan balance between anon and file LRUs. 2848 */ 2849 spin_lock_irq(&target_lruvec->lru_lock); 2850 sc->anon_cost = target_lruvec->anon_cost; 2851 sc->file_cost = target_lruvec->file_cost; 2852 spin_unlock_irq(&target_lruvec->lru_lock); 2853 2854 /* 2855 * Target desirable inactive:active list ratios for the anon 2856 * and file LRU lists. 2857 */ 2858 if (!sc->force_deactivate) { 2859 unsigned long refaults; 2860 2861 /* 2862 * When refaults are being observed, it means a new 2863 * workingset is being established. Deactivate to get 2864 * rid of any stale active pages quickly. 2865 */ 2866 refaults = lruvec_page_state(target_lruvec, 2867 WORKINGSET_ACTIVATE_ANON); 2868 if (refaults != target_lruvec->refaults[WORKINGSET_ANON] || 2869 inactive_is_low(target_lruvec, LRU_INACTIVE_ANON)) 2870 sc->may_deactivate |= DEACTIVATE_ANON; 2871 else 2872 sc->may_deactivate &= ~DEACTIVATE_ANON; 2873 2874 refaults = lruvec_page_state(target_lruvec, 2875 WORKINGSET_ACTIVATE_FILE); 2876 if (refaults != target_lruvec->refaults[WORKINGSET_FILE] || 2877 inactive_is_low(target_lruvec, LRU_INACTIVE_FILE)) 2878 sc->may_deactivate |= DEACTIVATE_FILE; 2879 else 2880 sc->may_deactivate &= ~DEACTIVATE_FILE; 2881 } else 2882 sc->may_deactivate = DEACTIVATE_ANON | DEACTIVATE_FILE; 2883 2884 /* 2885 * If we have plenty of inactive file pages that aren't 2886 * thrashing, try to reclaim those first before touching 2887 * anonymous pages. 2888 */ 2889 file = lruvec_page_state(target_lruvec, NR_INACTIVE_FILE); 2890 if (file >> sc->priority && !(sc->may_deactivate & DEACTIVATE_FILE)) 2891 sc->cache_trim_mode = 1; 2892 else 2893 sc->cache_trim_mode = 0; 2894 2895 /* 2896 * Prevent the reclaimer from falling into the cache trap: as 2897 * cache pages start out inactive, every cache fault will tip 2898 * the scan balance towards the file LRU. And as the file LRU 2899 * shrinks, so does the window for rotation from references. 2900 * This means we have a runaway feedback loop where a tiny 2901 * thrashing file LRU becomes infinitely more attractive than 2902 * anon pages. Try to detect this based on file LRU size. 2903 */ 2904 if (!cgroup_reclaim(sc)) { 2905 unsigned long total_high_wmark = 0; 2906 unsigned long free, anon; 2907 int z; 2908 2909 free = sum_zone_node_page_state(pgdat->node_id, NR_FREE_PAGES); 2910 file = node_page_state(pgdat, NR_ACTIVE_FILE) + 2911 node_page_state(pgdat, NR_INACTIVE_FILE); 2912 2913 for (z = 0; z < MAX_NR_ZONES; z++) { 2914 struct zone *zone = &pgdat->node_zones[z]; 2915 2916 if (!managed_zone(zone)) 2917 continue; 2918 2919 total_high_wmark += high_wmark_pages(zone); 2920 } 2921 2922 /* 2923 * Consider anon: if that's low too, this isn't a 2924 * runaway file reclaim problem, but rather just 2925 * extreme pressure. Reclaim as per usual then. 2926 */ 2927 anon = node_page_state(pgdat, NR_INACTIVE_ANON); 2928 2929 sc->file_is_tiny = 2930 file + free <= total_high_wmark && 2931 !(sc->may_deactivate & DEACTIVATE_ANON) && 2932 anon >> sc->priority; 2933 } 2934 } 2935 2936 /* 2937 * Determine how aggressively the anon and file LRU lists should be 2938 * scanned. 2939 * 2940 * nr[0] = anon inactive folios to scan; nr[1] = anon active folios to scan 2941 * nr[2] = file inactive folios to scan; nr[3] = file active folios to scan 2942 */ 2943 static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc, 2944 unsigned long *nr) 2945 { 2946 struct pglist_data *pgdat = lruvec_pgdat(lruvec); 2947 struct mem_cgroup *memcg = lruvec_memcg(lruvec); 2948 unsigned long anon_cost, file_cost, total_cost; 2949 int swappiness = mem_cgroup_swappiness(memcg); 2950 u64 fraction[ANON_AND_FILE]; 2951 u64 denominator = 0; /* gcc */ 2952 enum scan_balance scan_balance; 2953 unsigned long ap, fp; 2954 enum lru_list lru; 2955 2956 /* If we have no swap space, do not bother scanning anon folios. */ 2957 if (!sc->may_swap || !can_reclaim_anon_pages(memcg, pgdat->node_id, sc)) { 2958 scan_balance = SCAN_FILE; 2959 goto out; 2960 } 2961 2962 /* 2963 * Global reclaim will swap to prevent OOM even with no 2964 * swappiness, but memcg users want to use this knob to 2965 * disable swapping for individual groups completely when 2966 * using the memory controller's swap limit feature would be 2967 * too expensive. 2968 */ 2969 if (cgroup_reclaim(sc) && !swappiness) { 2970 scan_balance = SCAN_FILE; 2971 goto out; 2972 } 2973 2974 /* 2975 * Do not apply any pressure balancing cleverness when the 2976 * system is close to OOM, scan both anon and file equally 2977 * (unless the swappiness setting disagrees with swapping). 2978 */ 2979 if (!sc->priority && swappiness) { 2980 scan_balance = SCAN_EQUAL; 2981 goto out; 2982 } 2983 2984 /* 2985 * If the system is almost out of file pages, force-scan anon. 2986 */ 2987 if (sc->file_is_tiny) { 2988 scan_balance = SCAN_ANON; 2989 goto out; 2990 } 2991 2992 /* 2993 * If there is enough inactive page cache, we do not reclaim 2994 * anything from the anonymous working right now. 2995 */ 2996 if (sc->cache_trim_mode) { 2997 scan_balance = SCAN_FILE; 2998 goto out; 2999 } 3000 3001 scan_balance = SCAN_FRACT; 3002 /* 3003 * Calculate the pressure balance between anon and file pages. 3004 * 3005 * The amount of pressure we put on each LRU is inversely 3006 * proportional to the cost of reclaiming each list, as 3007 * determined by the share of pages that are refaulting, times 3008 * the relative IO cost of bringing back a swapped out 3009 * anonymous page vs reloading a filesystem page (swappiness). 3010 * 3011 * Although we limit that influence to ensure no list gets 3012 * left behind completely: at least a third of the pressure is 3013 * applied, before swappiness. 3014 * 3015 * With swappiness at 100, anon and file have equal IO cost. 3016 */ 3017 total_cost = sc->anon_cost + sc->file_cost; 3018 anon_cost = total_cost + sc->anon_cost; 3019 file_cost = total_cost + sc->file_cost; 3020 total_cost = anon_cost + file_cost; 3021 3022 ap = swappiness * (total_cost + 1); 3023 ap /= anon_cost + 1; 3024 3025 fp = (200 - swappiness) * (total_cost + 1); 3026 fp /= file_cost + 1; 3027 3028 fraction[0] = ap; 3029 fraction[1] = fp; 3030 denominator = ap + fp; 3031 out: 3032 for_each_evictable_lru(lru) { 3033 int file = is_file_lru(lru); 3034 unsigned long lruvec_size; 3035 unsigned long low, min; 3036 unsigned long scan; 3037 3038 lruvec_size = lruvec_lru_size(lruvec, lru, sc->reclaim_idx); 3039 mem_cgroup_protection(sc->target_mem_cgroup, memcg, 3040 &min, &low); 3041 3042 if (min || low) { 3043 /* 3044 * Scale a cgroup's reclaim pressure by proportioning 3045 * its current usage to its memory.low or memory.min 3046 * setting. 3047 * 3048 * This is important, as otherwise scanning aggression 3049 * becomes extremely binary -- from nothing as we 3050 * approach the memory protection threshold, to totally 3051 * nominal as we exceed it. This results in requiring 3052 * setting extremely liberal protection thresholds. It 3053 * also means we simply get no protection at all if we 3054 * set it too low, which is not ideal. 3055 * 3056 * If there is any protection in place, we reduce scan 3057 * pressure by how much of the total memory used is 3058 * within protection thresholds. 3059 * 3060 * There is one special case: in the first reclaim pass, 3061 * we skip over all groups that are within their low 3062 * protection. If that fails to reclaim enough pages to 3063 * satisfy the reclaim goal, we come back and override 3064 * the best-effort low protection. However, we still 3065 * ideally want to honor how well-behaved groups are in 3066 * that case instead of simply punishing them all 3067 * equally. As such, we reclaim them based on how much 3068 * memory they are using, reducing the scan pressure 3069 * again by how much of the total memory used is under 3070 * hard protection. 3071 */ 3072 unsigned long cgroup_size = mem_cgroup_size(memcg); 3073 unsigned long protection; 3074 3075 /* memory.low scaling, make sure we retry before OOM */ 3076 if (!sc->memcg_low_reclaim && low > min) { 3077 protection = low; 3078 sc->memcg_low_skipped = 1; 3079 } else { 3080 protection = min; 3081 } 3082 3083 /* Avoid TOCTOU with earlier protection check */ 3084 cgroup_size = max(cgroup_size, protection); 3085 3086 scan = lruvec_size - lruvec_size * protection / 3087 (cgroup_size + 1); 3088 3089 /* 3090 * Minimally target SWAP_CLUSTER_MAX pages to keep 3091 * reclaim moving forwards, avoiding decrementing 3092 * sc->priority further than desirable. 3093 */ 3094 scan = max(scan, SWAP_CLUSTER_MAX); 3095 } else { 3096 scan = lruvec_size; 3097 } 3098 3099 scan >>= sc->priority; 3100 3101 /* 3102 * If the cgroup's already been deleted, make sure to 3103 * scrape out the remaining cache. 3104 */ 3105 if (!scan && !mem_cgroup_online(memcg)) 3106 scan = min(lruvec_size, SWAP_CLUSTER_MAX); 3107 3108 switch (scan_balance) { 3109 case SCAN_EQUAL: 3110 /* Scan lists relative to size */ 3111 break; 3112 case SCAN_FRACT: 3113 /* 3114 * Scan types proportional to swappiness and 3115 * their relative recent reclaim efficiency. 3116 * Make sure we don't miss the last page on 3117 * the offlined memory cgroups because of a 3118 * round-off error. 3119 */ 3120 scan = mem_cgroup_online(memcg) ? 3121 div64_u64(scan * fraction[file], denominator) : 3122 DIV64_U64_ROUND_UP(scan * fraction[file], 3123 denominator); 3124 break; 3125 case SCAN_FILE: 3126 case SCAN_ANON: 3127 /* Scan one type exclusively */ 3128 if ((scan_balance == SCAN_FILE) != file) 3129 scan = 0; 3130 break; 3131 default: 3132 /* Look ma, no brain */ 3133 BUG(); 3134 } 3135 3136 nr[lru] = scan; 3137 } 3138 } 3139 3140 /* 3141 * Anonymous LRU management is a waste if there is 3142 * ultimately no way to reclaim the memory. 3143 */ 3144 static bool can_age_anon_pages(struct pglist_data *pgdat, 3145 struct scan_control *sc) 3146 { 3147 /* Aging the anon LRU is valuable if swap is present: */ 3148 if (total_swap_pages > 0) 3149 return true; 3150 3151 /* Also valuable if anon pages can be demoted: */ 3152 return can_demote(pgdat->node_id, sc); 3153 } 3154 3155 #ifdef CONFIG_LRU_GEN 3156 3157 #ifdef CONFIG_LRU_GEN_ENABLED 3158 DEFINE_STATIC_KEY_ARRAY_TRUE(lru_gen_caps, NR_LRU_GEN_CAPS); 3159 #define get_cap(cap) static_branch_likely(&lru_gen_caps[cap]) 3160 #else 3161 DEFINE_STATIC_KEY_ARRAY_FALSE(lru_gen_caps, NR_LRU_GEN_CAPS); 3162 #define get_cap(cap) static_branch_unlikely(&lru_gen_caps[cap]) 3163 #endif 3164 3165 /****************************************************************************** 3166 * shorthand helpers 3167 ******************************************************************************/ 3168 3169 #define LRU_REFS_FLAGS (BIT(PG_referenced) | BIT(PG_workingset)) 3170 3171 #define DEFINE_MAX_SEQ(lruvec) \ 3172 unsigned long max_seq = READ_ONCE((lruvec)->lrugen.max_seq) 3173 3174 #define DEFINE_MIN_SEQ(lruvec) \ 3175 unsigned long min_seq[ANON_AND_FILE] = { \ 3176 READ_ONCE((lruvec)->lrugen.min_seq[LRU_GEN_ANON]), \ 3177 READ_ONCE((lruvec)->lrugen.min_seq[LRU_GEN_FILE]), \ 3178 } 3179 3180 #define for_each_gen_type_zone(gen, type, zone) \ 3181 for ((gen) = 0; (gen) < MAX_NR_GENS; (gen)++) \ 3182 for ((type) = 0; (type) < ANON_AND_FILE; (type)++) \ 3183 for ((zone) = 0; (zone) < MAX_NR_ZONES; (zone)++) 3184 3185 #define get_memcg_gen(seq) ((seq) % MEMCG_NR_GENS) 3186 #define get_memcg_bin(bin) ((bin) % MEMCG_NR_BINS) 3187 3188 static struct lruvec *get_lruvec(struct mem_cgroup *memcg, int nid) 3189 { 3190 struct pglist_data *pgdat = NODE_DATA(nid); 3191 3192 #ifdef CONFIG_MEMCG 3193 if (memcg) { 3194 struct lruvec *lruvec = &memcg->nodeinfo[nid]->lruvec; 3195 3196 /* see the comment in mem_cgroup_lruvec() */ 3197 if (!lruvec->pgdat) 3198 lruvec->pgdat = pgdat; 3199 3200 return lruvec; 3201 } 3202 #endif 3203 VM_WARN_ON_ONCE(!mem_cgroup_disabled()); 3204 3205 return &pgdat->__lruvec; 3206 } 3207 3208 static int get_swappiness(struct lruvec *lruvec, struct scan_control *sc) 3209 { 3210 struct mem_cgroup *memcg = lruvec_memcg(lruvec); 3211 struct pglist_data *pgdat = lruvec_pgdat(lruvec); 3212 3213 if (!sc->may_swap) 3214 return 0; 3215 3216 if (!can_demote(pgdat->node_id, sc) && 3217 mem_cgroup_get_nr_swap_pages(memcg) < MIN_LRU_BATCH) 3218 return 0; 3219 3220 return mem_cgroup_swappiness(memcg); 3221 } 3222 3223 static int get_nr_gens(struct lruvec *lruvec, int type) 3224 { 3225 return lruvec->lrugen.max_seq - lruvec->lrugen.min_seq[type] + 1; 3226 } 3227 3228 static bool __maybe_unused seq_is_valid(struct lruvec *lruvec) 3229 { 3230 /* see the comment on lru_gen_folio */ 3231 return get_nr_gens(lruvec, LRU_GEN_FILE) >= MIN_NR_GENS && 3232 get_nr_gens(lruvec, LRU_GEN_FILE) <= get_nr_gens(lruvec, LRU_GEN_ANON) && 3233 get_nr_gens(lruvec, LRU_GEN_ANON) <= MAX_NR_GENS; 3234 } 3235 3236 /****************************************************************************** 3237 * mm_struct list 3238 ******************************************************************************/ 3239 3240 static struct lru_gen_mm_list *get_mm_list(struct mem_cgroup *memcg) 3241 { 3242 static struct lru_gen_mm_list mm_list = { 3243 .fifo = LIST_HEAD_INIT(mm_list.fifo), 3244 .lock = __SPIN_LOCK_UNLOCKED(mm_list.lock), 3245 }; 3246 3247 #ifdef CONFIG_MEMCG 3248 if (memcg) 3249 return &memcg->mm_list; 3250 #endif 3251 VM_WARN_ON_ONCE(!mem_cgroup_disabled()); 3252 3253 return &mm_list; 3254 } 3255 3256 void lru_gen_add_mm(struct mm_struct *mm) 3257 { 3258 int nid; 3259 struct mem_cgroup *memcg = get_mem_cgroup_from_mm(mm); 3260 struct lru_gen_mm_list *mm_list = get_mm_list(memcg); 3261 3262 VM_WARN_ON_ONCE(!list_empty(&mm->lru_gen.list)); 3263 #ifdef CONFIG_MEMCG 3264 VM_WARN_ON_ONCE(mm->lru_gen.memcg); 3265 mm->lru_gen.memcg = memcg; 3266 #endif 3267 spin_lock(&mm_list->lock); 3268 3269 for_each_node_state(nid, N_MEMORY) { 3270 struct lruvec *lruvec = get_lruvec(memcg, nid); 3271 3272 /* the first addition since the last iteration */ 3273 if (lruvec->mm_state.tail == &mm_list->fifo) 3274 lruvec->mm_state.tail = &mm->lru_gen.list; 3275 } 3276 3277 list_add_tail(&mm->lru_gen.list, &mm_list->fifo); 3278 3279 spin_unlock(&mm_list->lock); 3280 } 3281 3282 void lru_gen_del_mm(struct mm_struct *mm) 3283 { 3284 int nid; 3285 struct lru_gen_mm_list *mm_list; 3286 struct mem_cgroup *memcg = NULL; 3287 3288 if (list_empty(&mm->lru_gen.list)) 3289 return; 3290 3291 #ifdef CONFIG_MEMCG 3292 memcg = mm->lru_gen.memcg; 3293 #endif 3294 mm_list = get_mm_list(memcg); 3295 3296 spin_lock(&mm_list->lock); 3297 3298 for_each_node(nid) { 3299 struct lruvec *lruvec = get_lruvec(memcg, nid); 3300 3301 /* where the last iteration ended (exclusive) */ 3302 if (lruvec->mm_state.tail == &mm->lru_gen.list) 3303 lruvec->mm_state.tail = lruvec->mm_state.tail->next; 3304 3305 /* where the current iteration continues (inclusive) */ 3306 if (lruvec->mm_state.head != &mm->lru_gen.list) 3307 continue; 3308 3309 lruvec->mm_state.head = lruvec->mm_state.head->next; 3310 /* the deletion ends the current iteration */ 3311 if (lruvec->mm_state.head == &mm_list->fifo) 3312 WRITE_ONCE(lruvec->mm_state.seq, lruvec->mm_state.seq + 1); 3313 } 3314 3315 list_del_init(&mm->lru_gen.list); 3316 3317 spin_unlock(&mm_list->lock); 3318 3319 #ifdef CONFIG_MEMCG 3320 mem_cgroup_put(mm->lru_gen.memcg); 3321 mm->lru_gen.memcg = NULL; 3322 #endif 3323 } 3324 3325 #ifdef CONFIG_MEMCG 3326 void lru_gen_migrate_mm(struct mm_struct *mm) 3327 { 3328 struct mem_cgroup *memcg; 3329 struct task_struct *task = rcu_dereference_protected(mm->owner, true); 3330 3331 VM_WARN_ON_ONCE(task->mm != mm); 3332 lockdep_assert_held(&task->alloc_lock); 3333 3334 /* for mm_update_next_owner() */ 3335 if (mem_cgroup_disabled()) 3336 return; 3337 3338 /* migration can happen before addition */ 3339 if (!mm->lru_gen.memcg) 3340 return; 3341 3342 rcu_read_lock(); 3343 memcg = mem_cgroup_from_task(task); 3344 rcu_read_unlock(); 3345 if (memcg == mm->lru_gen.memcg) 3346 return; 3347 3348 VM_WARN_ON_ONCE(list_empty(&mm->lru_gen.list)); 3349 3350 lru_gen_del_mm(mm); 3351 lru_gen_add_mm(mm); 3352 } 3353 #endif 3354 3355 /* 3356 * Bloom filters with m=1<<15, k=2 and the false positive rates of ~1/5 when 3357 * n=10,000 and ~1/2 when n=20,000, where, conventionally, m is the number of 3358 * bits in a bitmap, k is the number of hash functions and n is the number of 3359 * inserted items. 3360 * 3361 * Page table walkers use one of the two filters to reduce their search space. 3362 * To get rid of non-leaf entries that no longer have enough leaf entries, the 3363 * aging uses the double-buffering technique to flip to the other filter each 3364 * time it produces a new generation. For non-leaf entries that have enough 3365 * leaf entries, the aging carries them over to the next generation in 3366 * walk_pmd_range(); the eviction also report them when walking the rmap 3367 * in lru_gen_look_around(). 3368 * 3369 * For future optimizations: 3370 * 1. It's not necessary to keep both filters all the time. The spare one can be 3371 * freed after the RCU grace period and reallocated if needed again. 3372 * 2. And when reallocating, it's worth scaling its size according to the number 3373 * of inserted entries in the other filter, to reduce the memory overhead on 3374 * small systems and false positives on large systems. 3375 * 3. Jenkins' hash function is an alternative to Knuth's. 3376 */ 3377 #define BLOOM_FILTER_SHIFT 15 3378 3379 static inline int filter_gen_from_seq(unsigned long seq) 3380 { 3381 return seq % NR_BLOOM_FILTERS; 3382 } 3383 3384 static void get_item_key(void *item, int *key) 3385 { 3386 u32 hash = hash_ptr(item, BLOOM_FILTER_SHIFT * 2); 3387 3388 BUILD_BUG_ON(BLOOM_FILTER_SHIFT * 2 > BITS_PER_TYPE(u32)); 3389 3390 key[0] = hash & (BIT(BLOOM_FILTER_SHIFT) - 1); 3391 key[1] = hash >> BLOOM_FILTER_SHIFT; 3392 } 3393 3394 static void reset_bloom_filter(struct lruvec *lruvec, unsigned long seq) 3395 { 3396 unsigned long *filter; 3397 int gen = filter_gen_from_seq(seq); 3398 3399 filter = lruvec->mm_state.filters[gen]; 3400 if (filter) { 3401 bitmap_clear(filter, 0, BIT(BLOOM_FILTER_SHIFT)); 3402 return; 3403 } 3404 3405 filter = bitmap_zalloc(BIT(BLOOM_FILTER_SHIFT), 3406 __GFP_HIGH | __GFP_NOMEMALLOC | __GFP_NOWARN); 3407 WRITE_ONCE(lruvec->mm_state.filters[gen], filter); 3408 } 3409 3410 static void update_bloom_filter(struct lruvec *lruvec, unsigned long seq, void *item) 3411 { 3412 int key[2]; 3413 unsigned long *filter; 3414 int gen = filter_gen_from_seq(seq); 3415 3416 filter = READ_ONCE(lruvec->mm_state.filters[gen]); 3417 if (!filter) 3418 return; 3419 3420 get_item_key(item, key); 3421 3422 if (!test_bit(key[0], filter)) 3423 set_bit(key[0], filter); 3424 if (!test_bit(key[1], filter)) 3425 set_bit(key[1], filter); 3426 } 3427 3428 static bool test_bloom_filter(struct lruvec *lruvec, unsigned long seq, void *item) 3429 { 3430 int key[2]; 3431 unsigned long *filter; 3432 int gen = filter_gen_from_seq(seq); 3433 3434 filter = READ_ONCE(lruvec->mm_state.filters[gen]); 3435 if (!filter) 3436 return true; 3437 3438 get_item_key(item, key); 3439 3440 return test_bit(key[0], filter) && test_bit(key[1], filter); 3441 } 3442 3443 static void reset_mm_stats(struct lruvec *lruvec, struct lru_gen_mm_walk *walk, bool last) 3444 { 3445 int i; 3446 int hist; 3447 3448 lockdep_assert_held(&get_mm_list(lruvec_memcg(lruvec))->lock); 3449 3450 if (walk) { 3451 hist = lru_hist_from_seq(walk->max_seq); 3452 3453 for (i = 0; i < NR_MM_STATS; i++) { 3454 WRITE_ONCE(lruvec->mm_state.stats[hist][i], 3455 lruvec->mm_state.stats[hist][i] + walk->mm_stats[i]); 3456 walk->mm_stats[i] = 0; 3457 } 3458 } 3459 3460 if (NR_HIST_GENS > 1 && last) { 3461 hist = lru_hist_from_seq(lruvec->mm_state.seq + 1); 3462 3463 for (i = 0; i < NR_MM_STATS; i++) 3464 WRITE_ONCE(lruvec->mm_state.stats[hist][i], 0); 3465 } 3466 } 3467 3468 static bool should_skip_mm(struct mm_struct *mm, struct lru_gen_mm_walk *walk) 3469 { 3470 int type; 3471 unsigned long size = 0; 3472 struct pglist_data *pgdat = lruvec_pgdat(walk->lruvec); 3473 int key = pgdat->node_id % BITS_PER_TYPE(mm->lru_gen.bitmap); 3474 3475 if (!walk->force_scan && !test_bit(key, &mm->lru_gen.bitmap)) 3476 return true; 3477 3478 clear_bit(key, &mm->lru_gen.bitmap); 3479 3480 for (type = !walk->can_swap; type < ANON_AND_FILE; type++) { 3481 size += type ? get_mm_counter(mm, MM_FILEPAGES) : 3482 get_mm_counter(mm, MM_ANONPAGES) + 3483 get_mm_counter(mm, MM_SHMEMPAGES); 3484 } 3485 3486 if (size < MIN_LRU_BATCH) 3487 return true; 3488 3489 return !mmget_not_zero(mm); 3490 } 3491 3492 static bool iterate_mm_list(struct lruvec *lruvec, struct lru_gen_mm_walk *walk, 3493 struct mm_struct **iter) 3494 { 3495 bool first = false; 3496 bool last = true; 3497 struct mm_struct *mm = NULL; 3498 struct mem_cgroup *memcg = lruvec_memcg(lruvec); 3499 struct lru_gen_mm_list *mm_list = get_mm_list(memcg); 3500 struct lru_gen_mm_state *mm_state = &lruvec->mm_state; 3501 3502 /* 3503 * There are four interesting cases for this page table walker: 3504 * 1. It tries to start a new iteration of mm_list with a stale max_seq; 3505 * there is nothing left to do. 3506 * 2. It's the first of the current generation, and it needs to reset 3507 * the Bloom filter for the next generation. 3508 * 3. It reaches the end of mm_list, and it needs to increment 3509 * mm_state->seq; the iteration is done. 3510 * 4. It's the last of the current generation, and it needs to reset the 3511 * mm stats counters for the next generation. 3512 */ 3513 spin_lock(&mm_list->lock); 3514 3515 VM_WARN_ON_ONCE(mm_state->seq + 1 < walk->max_seq); 3516 VM_WARN_ON_ONCE(*iter && mm_state->seq > walk->max_seq); 3517 VM_WARN_ON_ONCE(*iter && !mm_state->nr_walkers); 3518 3519 if (walk->max_seq <= mm_state->seq) { 3520 if (!*iter) 3521 last = false; 3522 goto done; 3523 } 3524 3525 if (!mm_state->nr_walkers) { 3526 VM_WARN_ON_ONCE(mm_state->head && mm_state->head != &mm_list->fifo); 3527 3528 mm_state->head = mm_list->fifo.next; 3529 first = true; 3530 } 3531 3532 while (!mm && mm_state->head != &mm_list->fifo) { 3533 mm = list_entry(mm_state->head, struct mm_struct, lru_gen.list); 3534 3535 mm_state->head = mm_state->head->next; 3536 3537 /* force scan for those added after the last iteration */ 3538 if (!mm_state->tail || mm_state->tail == &mm->lru_gen.list) { 3539 mm_state->tail = mm_state->head; 3540 walk->force_scan = true; 3541 } 3542 3543 if (should_skip_mm(mm, walk)) 3544 mm = NULL; 3545 } 3546 3547 if (mm_state->head == &mm_list->fifo) 3548 WRITE_ONCE(mm_state->seq, mm_state->seq + 1); 3549 done: 3550 if (*iter && !mm) 3551 mm_state->nr_walkers--; 3552 if (!*iter && mm) 3553 mm_state->nr_walkers++; 3554 3555 if (mm_state->nr_walkers) 3556 last = false; 3557 3558 if (*iter || last) 3559 reset_mm_stats(lruvec, walk, last); 3560 3561 spin_unlock(&mm_list->lock); 3562 3563 if (mm && first) 3564 reset_bloom_filter(lruvec, walk->max_seq + 1); 3565 3566 if (*iter) 3567 mmput_async(*iter); 3568 3569 *iter = mm; 3570 3571 return last; 3572 } 3573 3574 static bool iterate_mm_list_nowalk(struct lruvec *lruvec, unsigned long max_seq) 3575 { 3576 bool success = false; 3577 struct mem_cgroup *memcg = lruvec_memcg(lruvec); 3578 struct lru_gen_mm_list *mm_list = get_mm_list(memcg); 3579 struct lru_gen_mm_state *mm_state = &lruvec->mm_state; 3580 3581 spin_lock(&mm_list->lock); 3582 3583 VM_WARN_ON_ONCE(mm_state->seq + 1 < max_seq); 3584 3585 if (max_seq > mm_state->seq && !mm_state->nr_walkers) { 3586 VM_WARN_ON_ONCE(mm_state->head && mm_state->head != &mm_list->fifo); 3587 3588 WRITE_ONCE(mm_state->seq, mm_state->seq + 1); 3589 reset_mm_stats(lruvec, NULL, true); 3590 success = true; 3591 } 3592 3593 spin_unlock(&mm_list->lock); 3594 3595 return success; 3596 } 3597 3598 /****************************************************************************** 3599 * refault feedback loop 3600 ******************************************************************************/ 3601 3602 /* 3603 * A feedback loop based on Proportional-Integral-Derivative (PID) controller. 3604 * 3605 * The P term is refaulted/(evicted+protected) from a tier in the generation 3606 * currently being evicted; the I term is the exponential moving average of the 3607 * P term over the generations previously evicted, using the smoothing factor 3608 * 1/2; the D term isn't supported. 3609 * 3610 * The setpoint (SP) is always the first tier of one type; the process variable 3611 * (PV) is either any tier of the other type or any other tier of the same 3612 * type. 3613 * 3614 * The error is the difference between the SP and the PV; the correction is to 3615 * turn off protection when SP>PV or turn on protection when SP<PV. 3616 * 3617 * For future optimizations: 3618 * 1. The D term may discount the other two terms over time so that long-lived 3619 * generations can resist stale information. 3620 */ 3621 struct ctrl_pos { 3622 unsigned long refaulted; 3623 unsigned long total; 3624 int gain; 3625 }; 3626 3627 static void read_ctrl_pos(struct lruvec *lruvec, int type, int tier, int gain, 3628 struct ctrl_pos *pos) 3629 { 3630 struct lru_gen_folio *lrugen = &lruvec->lrugen; 3631 int hist = lru_hist_from_seq(lrugen->min_seq[type]); 3632 3633 pos->refaulted = lrugen->avg_refaulted[type][tier] + 3634 atomic_long_read(&lrugen->refaulted[hist][type][tier]); 3635 pos->total = lrugen->avg_total[type][tier] + 3636 atomic_long_read(&lrugen->evicted[hist][type][tier]); 3637 if (tier) 3638 pos->total += lrugen->protected[hist][type][tier - 1]; 3639 pos->gain = gain; 3640 } 3641 3642 static void reset_ctrl_pos(struct lruvec *lruvec, int type, bool carryover) 3643 { 3644 int hist, tier; 3645 struct lru_gen_folio *lrugen = &lruvec->lrugen; 3646 bool clear = carryover ? NR_HIST_GENS == 1 : NR_HIST_GENS > 1; 3647 unsigned long seq = carryover ? lrugen->min_seq[type] : lrugen->max_seq + 1; 3648 3649 lockdep_assert_held(&lruvec->lru_lock); 3650 3651 if (!carryover && !clear) 3652 return; 3653 3654 hist = lru_hist_from_seq(seq); 3655 3656 for (tier = 0; tier < MAX_NR_TIERS; tier++) { 3657 if (carryover) { 3658 unsigned long sum; 3659 3660 sum = lrugen->avg_refaulted[type][tier] + 3661 atomic_long_read(&lrugen->refaulted[hist][type][tier]); 3662 WRITE_ONCE(lrugen->avg_refaulted[type][tier], sum / 2); 3663 3664 sum = lrugen->avg_total[type][tier] + 3665 atomic_long_read(&lrugen->evicted[hist][type][tier]); 3666 if (tier) 3667 sum += lrugen->protected[hist][type][tier - 1]; 3668 WRITE_ONCE(lrugen->avg_total[type][tier], sum / 2); 3669 } 3670 3671 if (clear) { 3672 atomic_long_set(&lrugen->refaulted[hist][type][tier], 0); 3673 atomic_long_set(&lrugen->evicted[hist][type][tier], 0); 3674 if (tier) 3675 WRITE_ONCE(lrugen->protected[hist][type][tier - 1], 0); 3676 } 3677 } 3678 } 3679 3680 static bool positive_ctrl_err(struct ctrl_pos *sp, struct ctrl_pos *pv) 3681 { 3682 /* 3683 * Return true if the PV has a limited number of refaults or a lower 3684 * refaulted/total than the SP. 3685 */ 3686 return pv->refaulted < MIN_LRU_BATCH || 3687 pv->refaulted * (sp->total + MIN_LRU_BATCH) * sp->gain <= 3688 (sp->refaulted + 1) * pv->total * pv->gain; 3689 } 3690 3691 /****************************************************************************** 3692 * the aging 3693 ******************************************************************************/ 3694 3695 /* promote pages accessed through page tables */ 3696 static int folio_update_gen(struct folio *folio, int gen) 3697 { 3698 unsigned long new_flags, old_flags = READ_ONCE(folio->flags); 3699 3700 VM_WARN_ON_ONCE(gen >= MAX_NR_GENS); 3701 VM_WARN_ON_ONCE(!rcu_read_lock_held()); 3702 3703 do { 3704 /* lru_gen_del_folio() has isolated this page? */ 3705 if (!(old_flags & LRU_GEN_MASK)) { 3706 /* for shrink_folio_list() */ 3707 new_flags = old_flags | BIT(PG_referenced); 3708 continue; 3709 } 3710 3711 new_flags = old_flags & ~(LRU_GEN_MASK | LRU_REFS_MASK | LRU_REFS_FLAGS); 3712 new_flags |= (gen + 1UL) << LRU_GEN_PGOFF; 3713 } while (!try_cmpxchg(&folio->flags, &old_flags, new_flags)); 3714 3715 return ((old_flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1; 3716 } 3717 3718 /* protect pages accessed multiple times through file descriptors */ 3719 static int folio_inc_gen(struct lruvec *lruvec, struct folio *folio, bool reclaiming) 3720 { 3721 int type = folio_is_file_lru(folio); 3722 struct lru_gen_folio *lrugen = &lruvec->lrugen; 3723 int new_gen, old_gen = lru_gen_from_seq(lrugen->min_seq[type]); 3724 unsigned long new_flags, old_flags = READ_ONCE(folio->flags); 3725 3726 VM_WARN_ON_ONCE_FOLIO(!(old_flags & LRU_GEN_MASK), folio); 3727 3728 do { 3729 new_gen = ((old_flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1; 3730 /* folio_update_gen() has promoted this page? */ 3731 if (new_gen >= 0 && new_gen != old_gen) 3732 return new_gen; 3733 3734 new_gen = (old_gen + 1) % MAX_NR_GENS; 3735 3736 new_flags = old_flags & ~(LRU_GEN_MASK | LRU_REFS_MASK | LRU_REFS_FLAGS); 3737 new_flags |= (new_gen + 1UL) << LRU_GEN_PGOFF; 3738 /* for folio_end_writeback() */ 3739 if (reclaiming) 3740 new_flags |= BIT(PG_reclaim); 3741 } while (!try_cmpxchg(&folio->flags, &old_flags, new_flags)); 3742 3743 lru_gen_update_size(lruvec, folio, old_gen, new_gen); 3744 3745 return new_gen; 3746 } 3747 3748 static void update_batch_size(struct lru_gen_mm_walk *walk, struct folio *folio, 3749 int old_gen, int new_gen) 3750 { 3751 int type = folio_is_file_lru(folio); 3752 int zone = folio_zonenum(folio); 3753 int delta = folio_nr_pages(folio); 3754 3755 VM_WARN_ON_ONCE(old_gen >= MAX_NR_GENS); 3756 VM_WARN_ON_ONCE(new_gen >= MAX_NR_GENS); 3757 3758 walk->batched++; 3759 3760 walk->nr_pages[old_gen][type][zone] -= delta; 3761 walk->nr_pages[new_gen][type][zone] += delta; 3762 } 3763 3764 static void reset_batch_size(struct lruvec *lruvec, struct lru_gen_mm_walk *walk) 3765 { 3766 int gen, type, zone; 3767 struct lru_gen_folio *lrugen = &lruvec->lrugen; 3768 3769 walk->batched = 0; 3770 3771 for_each_gen_type_zone(gen, type, zone) { 3772 enum lru_list lru = type * LRU_INACTIVE_FILE; 3773 int delta = walk->nr_pages[gen][type][zone]; 3774 3775 if (!delta) 3776 continue; 3777 3778 walk->nr_pages[gen][type][zone] = 0; 3779 WRITE_ONCE(lrugen->nr_pages[gen][type][zone], 3780 lrugen->nr_pages[gen][type][zone] + delta); 3781 3782 if (lru_gen_is_active(lruvec, gen)) 3783 lru += LRU_ACTIVE; 3784 __update_lru_size(lruvec, lru, zone, delta); 3785 } 3786 } 3787 3788 static int should_skip_vma(unsigned long start, unsigned long end, struct mm_walk *args) 3789 { 3790 struct address_space *mapping; 3791 struct vm_area_struct *vma = args->vma; 3792 struct lru_gen_mm_walk *walk = args->private; 3793 3794 if (!vma_is_accessible(vma)) 3795 return true; 3796 3797 if (is_vm_hugetlb_page(vma)) 3798 return true; 3799 3800 if (!vma_has_recency(vma)) 3801 return true; 3802 3803 if (vma->vm_flags & (VM_LOCKED | VM_SPECIAL)) 3804 return true; 3805 3806 if (vma == get_gate_vma(vma->vm_mm)) 3807 return true; 3808 3809 if (vma_is_anonymous(vma)) 3810 return !walk->can_swap; 3811 3812 if (WARN_ON_ONCE(!vma->vm_file || !vma->vm_file->f_mapping)) 3813 return true; 3814 3815 mapping = vma->vm_file->f_mapping; 3816 if (mapping_unevictable(mapping)) 3817 return true; 3818 3819 if (shmem_mapping(mapping)) 3820 return !walk->can_swap; 3821 3822 /* to exclude special mappings like dax, etc. */ 3823 return !mapping->a_ops->read_folio; 3824 } 3825 3826 /* 3827 * Some userspace memory allocators map many single-page VMAs. Instead of 3828 * returning back to the PGD table for each of such VMAs, finish an entire PMD 3829 * table to reduce zigzags and improve cache performance. 3830 */ 3831 static bool get_next_vma(unsigned long mask, unsigned long size, struct mm_walk *args, 3832 unsigned long *vm_start, unsigned long *vm_end) 3833 { 3834 unsigned long start = round_up(*vm_end, size); 3835 unsigned long end = (start | ~mask) + 1; 3836 VMA_ITERATOR(vmi, args->mm, start); 3837 3838 VM_WARN_ON_ONCE(mask & size); 3839 VM_WARN_ON_ONCE((start & mask) != (*vm_start & mask)); 3840 3841 for_each_vma(vmi, args->vma) { 3842 if (end && end <= args->vma->vm_start) 3843 return false; 3844 3845 if (should_skip_vma(args->vma->vm_start, args->vma->vm_end, args)) 3846 continue; 3847 3848 *vm_start = max(start, args->vma->vm_start); 3849 *vm_end = min(end - 1, args->vma->vm_end - 1) + 1; 3850 3851 return true; 3852 } 3853 3854 return false; 3855 } 3856 3857 static unsigned long get_pte_pfn(pte_t pte, struct vm_area_struct *vma, unsigned long addr) 3858 { 3859 unsigned long pfn = pte_pfn(pte); 3860 3861 VM_WARN_ON_ONCE(addr < vma->vm_start || addr >= vma->vm_end); 3862 3863 if (!pte_present(pte) || is_zero_pfn(pfn)) 3864 return -1; 3865 3866 if (WARN_ON_ONCE(pte_devmap(pte) || pte_special(pte))) 3867 return -1; 3868 3869 if (WARN_ON_ONCE(!pfn_valid(pfn))) 3870 return -1; 3871 3872 return pfn; 3873 } 3874 3875 #if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG) 3876 static unsigned long get_pmd_pfn(pmd_t pmd, struct vm_area_struct *vma, unsigned long addr) 3877 { 3878 unsigned long pfn = pmd_pfn(pmd); 3879 3880 VM_WARN_ON_ONCE(addr < vma->vm_start || addr >= vma->vm_end); 3881 3882 if (!pmd_present(pmd) || is_huge_zero_pmd(pmd)) 3883 return -1; 3884 3885 if (WARN_ON_ONCE(pmd_devmap(pmd))) 3886 return -1; 3887 3888 if (WARN_ON_ONCE(!pfn_valid(pfn))) 3889 return -1; 3890 3891 return pfn; 3892 } 3893 #endif 3894 3895 static struct folio *get_pfn_folio(unsigned long pfn, struct mem_cgroup *memcg, 3896 struct pglist_data *pgdat, bool can_swap) 3897 { 3898 struct folio *folio; 3899 3900 /* try to avoid unnecessary memory loads */ 3901 if (pfn < pgdat->node_start_pfn || pfn >= pgdat_end_pfn(pgdat)) 3902 return NULL; 3903 3904 folio = pfn_folio(pfn); 3905 if (folio_nid(folio) != pgdat->node_id) 3906 return NULL; 3907 3908 if (folio_memcg_rcu(folio) != memcg) 3909 return NULL; 3910 3911 /* file VMAs can contain anon pages from COW */ 3912 if (!folio_is_file_lru(folio) && !can_swap) 3913 return NULL; 3914 3915 return folio; 3916 } 3917 3918 static bool suitable_to_scan(int total, int young) 3919 { 3920 int n = clamp_t(int, cache_line_size() / sizeof(pte_t), 2, 8); 3921 3922 /* suitable if the average number of young PTEs per cacheline is >=1 */ 3923 return young * n >= total; 3924 } 3925 3926 static bool walk_pte_range(pmd_t *pmd, unsigned long start, unsigned long end, 3927 struct mm_walk *args) 3928 { 3929 int i; 3930 pte_t *pte; 3931 spinlock_t *ptl; 3932 unsigned long addr; 3933 int total = 0; 3934 int young = 0; 3935 struct lru_gen_mm_walk *walk = args->private; 3936 struct mem_cgroup *memcg = lruvec_memcg(walk->lruvec); 3937 struct pglist_data *pgdat = lruvec_pgdat(walk->lruvec); 3938 int old_gen, new_gen = lru_gen_from_seq(walk->max_seq); 3939 3940 VM_WARN_ON_ONCE(pmd_leaf(*pmd)); 3941 3942 ptl = pte_lockptr(args->mm, pmd); 3943 if (!spin_trylock(ptl)) 3944 return false; 3945 3946 arch_enter_lazy_mmu_mode(); 3947 3948 pte = pte_offset_map(pmd, start & PMD_MASK); 3949 restart: 3950 for (i = pte_index(start), addr = start; addr != end; i++, addr += PAGE_SIZE) { 3951 unsigned long pfn; 3952 struct folio *folio; 3953 3954 total++; 3955 walk->mm_stats[MM_LEAF_TOTAL]++; 3956 3957 pfn = get_pte_pfn(pte[i], args->vma, addr); 3958 if (pfn == -1) 3959 continue; 3960 3961 if (!pte_young(pte[i])) { 3962 walk->mm_stats[MM_LEAF_OLD]++; 3963 continue; 3964 } 3965 3966 folio = get_pfn_folio(pfn, memcg, pgdat, walk->can_swap); 3967 if (!folio) 3968 continue; 3969 3970 if (!ptep_test_and_clear_young(args->vma, addr, pte + i)) 3971 VM_WARN_ON_ONCE(true); 3972 3973 young++; 3974 walk->mm_stats[MM_LEAF_YOUNG]++; 3975 3976 if (pte_dirty(pte[i]) && !folio_test_dirty(folio) && 3977 !(folio_test_anon(folio) && folio_test_swapbacked(folio) && 3978 !folio_test_swapcache(folio))) 3979 folio_mark_dirty(folio); 3980 3981 old_gen = folio_update_gen(folio, new_gen); 3982 if (old_gen >= 0 && old_gen != new_gen) 3983 update_batch_size(walk, folio, old_gen, new_gen); 3984 } 3985 3986 if (i < PTRS_PER_PTE && get_next_vma(PMD_MASK, PAGE_SIZE, args, &start, &end)) 3987 goto restart; 3988 3989 pte_unmap(pte); 3990 3991 arch_leave_lazy_mmu_mode(); 3992 spin_unlock(ptl); 3993 3994 return suitable_to_scan(total, young); 3995 } 3996 3997 #if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG) 3998 static void walk_pmd_range_locked(pud_t *pud, unsigned long next, struct vm_area_struct *vma, 3999 struct mm_walk *args, unsigned long *bitmap, unsigned long *start) 4000 { 4001 int i; 4002 pmd_t *pmd; 4003 spinlock_t *ptl; 4004 struct lru_gen_mm_walk *walk = args->private; 4005 struct mem_cgroup *memcg = lruvec_memcg(walk->lruvec); 4006 struct pglist_data *pgdat = lruvec_pgdat(walk->lruvec); 4007 int old_gen, new_gen = lru_gen_from_seq(walk->max_seq); 4008 4009 VM_WARN_ON_ONCE(pud_leaf(*pud)); 4010 4011 /* try to batch at most 1+MIN_LRU_BATCH+1 entries */ 4012 if (*start == -1) { 4013 *start = next; 4014 return; 4015 } 4016 4017 i = next == -1 ? 0 : pmd_index(next) - pmd_index(*start); 4018 if (i && i <= MIN_LRU_BATCH) { 4019 __set_bit(i - 1, bitmap); 4020 return; 4021 } 4022 4023 pmd = pmd_offset(pud, *start); 4024 4025 ptl = pmd_lockptr(args->mm, pmd); 4026 if (!spin_trylock(ptl)) 4027 goto done; 4028 4029 arch_enter_lazy_mmu_mode(); 4030 4031 do { 4032 unsigned long pfn; 4033 struct folio *folio; 4034 unsigned long addr = i ? (*start & PMD_MASK) + i * PMD_SIZE : *start; 4035 4036 pfn = get_pmd_pfn(pmd[i], vma, addr); 4037 if (pfn == -1) 4038 goto next; 4039 4040 if (!pmd_trans_huge(pmd[i])) { 4041 if (arch_has_hw_nonleaf_pmd_young() && 4042 get_cap(LRU_GEN_NONLEAF_YOUNG)) 4043 pmdp_test_and_clear_young(vma, addr, pmd + i); 4044 goto next; 4045 } 4046 4047 folio = get_pfn_folio(pfn, memcg, pgdat, walk->can_swap); 4048 if (!folio) 4049 goto next; 4050 4051 if (!pmdp_test_and_clear_young(vma, addr, pmd + i)) 4052 goto next; 4053 4054 walk->mm_stats[MM_LEAF_YOUNG]++; 4055 4056 if (pmd_dirty(pmd[i]) && !folio_test_dirty(folio) && 4057 !(folio_test_anon(folio) && folio_test_swapbacked(folio) && 4058 !folio_test_swapcache(folio))) 4059 folio_mark_dirty(folio); 4060 4061 old_gen = folio_update_gen(folio, new_gen); 4062 if (old_gen >= 0 && old_gen != new_gen) 4063 update_batch_size(walk, folio, old_gen, new_gen); 4064 next: 4065 i = i > MIN_LRU_BATCH ? 0 : find_next_bit(bitmap, MIN_LRU_BATCH, i) + 1; 4066 } while (i <= MIN_LRU_BATCH); 4067 4068 arch_leave_lazy_mmu_mode(); 4069 spin_unlock(ptl); 4070 done: 4071 *start = -1; 4072 bitmap_zero(bitmap, MIN_LRU_BATCH); 4073 } 4074 #else 4075 static void walk_pmd_range_locked(pud_t *pud, unsigned long next, struct vm_area_struct *vma, 4076 struct mm_walk *args, unsigned long *bitmap, unsigned long *start) 4077 { 4078 } 4079 #endif 4080 4081 static void walk_pmd_range(pud_t *pud, unsigned long start, unsigned long end, 4082 struct mm_walk *args) 4083 { 4084 int i; 4085 pmd_t *pmd; 4086 unsigned long next; 4087 unsigned long addr; 4088 struct vm_area_struct *vma; 4089 unsigned long pos = -1; 4090 struct lru_gen_mm_walk *walk = args->private; 4091 unsigned long bitmap[BITS_TO_LONGS(MIN_LRU_BATCH)] = {}; 4092 4093 VM_WARN_ON_ONCE(pud_leaf(*pud)); 4094 4095 /* 4096 * Finish an entire PMD in two passes: the first only reaches to PTE 4097 * tables to avoid taking the PMD lock; the second, if necessary, takes 4098 * the PMD lock to clear the accessed bit in PMD entries. 4099 */ 4100 pmd = pmd_offset(pud, start & PUD_MASK); 4101 restart: 4102 /* walk_pte_range() may call get_next_vma() */ 4103 vma = args->vma; 4104 for (i = pmd_index(start), addr = start; addr != end; i++, addr = next) { 4105 pmd_t val = pmdp_get_lockless(pmd + i); 4106 4107 next = pmd_addr_end(addr, end); 4108 4109 if (!pmd_present(val) || is_huge_zero_pmd(val)) { 4110 walk->mm_stats[MM_LEAF_TOTAL]++; 4111 continue; 4112 } 4113 4114 #ifdef CONFIG_TRANSPARENT_HUGEPAGE 4115 if (pmd_trans_huge(val)) { 4116 unsigned long pfn = pmd_pfn(val); 4117 struct pglist_data *pgdat = lruvec_pgdat(walk->lruvec); 4118 4119 walk->mm_stats[MM_LEAF_TOTAL]++; 4120 4121 if (!pmd_young(val)) { 4122 walk->mm_stats[MM_LEAF_OLD]++; 4123 continue; 4124 } 4125 4126 /* try to avoid unnecessary memory loads */ 4127 if (pfn < pgdat->node_start_pfn || pfn >= pgdat_end_pfn(pgdat)) 4128 continue; 4129 4130 walk_pmd_range_locked(pud, addr, vma, args, bitmap, &pos); 4131 continue; 4132 } 4133 #endif 4134 walk->mm_stats[MM_NONLEAF_TOTAL]++; 4135 4136 if (arch_has_hw_nonleaf_pmd_young() && 4137 get_cap(LRU_GEN_NONLEAF_YOUNG)) { 4138 if (!pmd_young(val)) 4139 continue; 4140 4141 walk_pmd_range_locked(pud, addr, vma, args, bitmap, &pos); 4142 } 4143 4144 if (!walk->force_scan && !test_bloom_filter(walk->lruvec, walk->max_seq, pmd + i)) 4145 continue; 4146 4147 walk->mm_stats[MM_NONLEAF_FOUND]++; 4148 4149 if (!walk_pte_range(&val, addr, next, args)) 4150 continue; 4151 4152 walk->mm_stats[MM_NONLEAF_ADDED]++; 4153 4154 /* carry over to the next generation */ 4155 update_bloom_filter(walk->lruvec, walk->max_seq + 1, pmd + i); 4156 } 4157 4158 walk_pmd_range_locked(pud, -1, vma, args, bitmap, &pos); 4159 4160 if (i < PTRS_PER_PMD && get_next_vma(PUD_MASK, PMD_SIZE, args, &start, &end)) 4161 goto restart; 4162 } 4163 4164 static int walk_pud_range(p4d_t *p4d, unsigned long start, unsigned long end, 4165 struct mm_walk *args) 4166 { 4167 int i; 4168 pud_t *pud; 4169 unsigned long addr; 4170 unsigned long next; 4171 struct lru_gen_mm_walk *walk = args->private; 4172 4173 VM_WARN_ON_ONCE(p4d_leaf(*p4d)); 4174 4175 pud = pud_offset(p4d, start & P4D_MASK); 4176 restart: 4177 for (i = pud_index(start), addr = start; addr != end; i++, addr = next) { 4178 pud_t val = READ_ONCE(pud[i]); 4179 4180 next = pud_addr_end(addr, end); 4181 4182 if (!pud_present(val) || WARN_ON_ONCE(pud_leaf(val))) 4183 continue; 4184 4185 walk_pmd_range(&val, addr, next, args); 4186 4187 /* a racy check to curtail the waiting time */ 4188 if (wq_has_sleeper(&walk->lruvec->mm_state.wait)) 4189 return 1; 4190 4191 if (need_resched() || walk->batched >= MAX_LRU_BATCH) { 4192 end = (addr | ~PUD_MASK) + 1; 4193 goto done; 4194 } 4195 } 4196 4197 if (i < PTRS_PER_PUD && get_next_vma(P4D_MASK, PUD_SIZE, args, &start, &end)) 4198 goto restart; 4199 4200 end = round_up(end, P4D_SIZE); 4201 done: 4202 if (!end || !args->vma) 4203 return 1; 4204 4205 walk->next_addr = max(end, args->vma->vm_start); 4206 4207 return -EAGAIN; 4208 } 4209 4210 static void walk_mm(struct lruvec *lruvec, struct mm_struct *mm, struct lru_gen_mm_walk *walk) 4211 { 4212 static const struct mm_walk_ops mm_walk_ops = { 4213 .test_walk = should_skip_vma, 4214 .p4d_entry = walk_pud_range, 4215 }; 4216 4217 int err; 4218 struct mem_cgroup *memcg = lruvec_memcg(lruvec); 4219 4220 walk->next_addr = FIRST_USER_ADDRESS; 4221 4222 do { 4223 err = -EBUSY; 4224 4225 /* folio_update_gen() requires stable folio_memcg() */ 4226 if (!mem_cgroup_trylock_pages(memcg)) 4227 break; 4228 4229 /* the caller might be holding the lock for write */ 4230 if (mmap_read_trylock(mm)) { 4231 err = walk_page_range(mm, walk->next_addr, ULONG_MAX, &mm_walk_ops, walk); 4232 4233 mmap_read_unlock(mm); 4234 } 4235 4236 mem_cgroup_unlock_pages(); 4237 4238 if (walk->batched) { 4239 spin_lock_irq(&lruvec->lru_lock); 4240 reset_batch_size(lruvec, walk); 4241 spin_unlock_irq(&lruvec->lru_lock); 4242 } 4243 4244 cond_resched(); 4245 } while (err == -EAGAIN); 4246 } 4247 4248 static struct lru_gen_mm_walk *set_mm_walk(struct pglist_data *pgdat, bool force_alloc) 4249 { 4250 struct lru_gen_mm_walk *walk = current->reclaim_state->mm_walk; 4251 4252 if (pgdat && current_is_kswapd()) { 4253 VM_WARN_ON_ONCE(walk); 4254 4255 walk = &pgdat->mm_walk; 4256 } else if (!walk && force_alloc) { 4257 VM_WARN_ON_ONCE(current_is_kswapd()); 4258 4259 walk = kzalloc(sizeof(*walk), __GFP_HIGH | __GFP_NOMEMALLOC | __GFP_NOWARN); 4260 } 4261 4262 current->reclaim_state->mm_walk = walk; 4263 4264 return walk; 4265 } 4266 4267 static void clear_mm_walk(void) 4268 { 4269 struct lru_gen_mm_walk *walk = current->reclaim_state->mm_walk; 4270 4271 VM_WARN_ON_ONCE(walk && memchr_inv(walk->nr_pages, 0, sizeof(walk->nr_pages))); 4272 VM_WARN_ON_ONCE(walk && memchr_inv(walk->mm_stats, 0, sizeof(walk->mm_stats))); 4273 4274 current->reclaim_state->mm_walk = NULL; 4275 4276 if (!current_is_kswapd()) 4277 kfree(walk); 4278 } 4279 4280 static bool inc_min_seq(struct lruvec *lruvec, int type, bool can_swap) 4281 { 4282 int zone; 4283 int remaining = MAX_LRU_BATCH; 4284 struct lru_gen_folio *lrugen = &lruvec->lrugen; 4285 int new_gen, old_gen = lru_gen_from_seq(lrugen->min_seq[type]); 4286 4287 if (type == LRU_GEN_ANON && !can_swap) 4288 goto done; 4289 4290 /* prevent cold/hot inversion if force_scan is true */ 4291 for (zone = 0; zone < MAX_NR_ZONES; zone++) { 4292 struct list_head *head = &lrugen->folios[old_gen][type][zone]; 4293 4294 while (!list_empty(head)) { 4295 struct folio *folio = lru_to_folio(head); 4296 4297 VM_WARN_ON_ONCE_FOLIO(folio_test_unevictable(folio), folio); 4298 VM_WARN_ON_ONCE_FOLIO(folio_test_active(folio), folio); 4299 VM_WARN_ON_ONCE_FOLIO(folio_is_file_lru(folio) != type, folio); 4300 VM_WARN_ON_ONCE_FOLIO(folio_zonenum(folio) != zone, folio); 4301 4302 new_gen = folio_inc_gen(lruvec, folio, false); 4303 list_move_tail(&folio->lru, &lrugen->folios[new_gen][type][zone]); 4304 4305 if (!--remaining) 4306 return false; 4307 } 4308 } 4309 done: 4310 reset_ctrl_pos(lruvec, type, true); 4311 WRITE_ONCE(lrugen->min_seq[type], lrugen->min_seq[type] + 1); 4312 4313 return true; 4314 } 4315 4316 static bool try_to_inc_min_seq(struct lruvec *lruvec, bool can_swap) 4317 { 4318 int gen, type, zone; 4319 bool success = false; 4320 struct lru_gen_folio *lrugen = &lruvec->lrugen; 4321 DEFINE_MIN_SEQ(lruvec); 4322 4323 VM_WARN_ON_ONCE(!seq_is_valid(lruvec)); 4324 4325 /* find the oldest populated generation */ 4326 for (type = !can_swap; type < ANON_AND_FILE; type++) { 4327 while (min_seq[type] + MIN_NR_GENS <= lrugen->max_seq) { 4328 gen = lru_gen_from_seq(min_seq[type]); 4329 4330 for (zone = 0; zone < MAX_NR_ZONES; zone++) { 4331 if (!list_empty(&lrugen->folios[gen][type][zone])) 4332 goto next; 4333 } 4334 4335 min_seq[type]++; 4336 } 4337 next: 4338 ; 4339 } 4340 4341 /* see the comment on lru_gen_folio */ 4342 if (can_swap) { 4343 min_seq[LRU_GEN_ANON] = min(min_seq[LRU_GEN_ANON], min_seq[LRU_GEN_FILE]); 4344 min_seq[LRU_GEN_FILE] = max(min_seq[LRU_GEN_ANON], lrugen->min_seq[LRU_GEN_FILE]); 4345 } 4346 4347 for (type = !can_swap; type < ANON_AND_FILE; type++) { 4348 if (min_seq[type] == lrugen->min_seq[type]) 4349 continue; 4350 4351 reset_ctrl_pos(lruvec, type, true); 4352 WRITE_ONCE(lrugen->min_seq[type], min_seq[type]); 4353 success = true; 4354 } 4355 4356 return success; 4357 } 4358 4359 static void inc_max_seq(struct lruvec *lruvec, bool can_swap, bool force_scan) 4360 { 4361 int prev, next; 4362 int type, zone; 4363 struct lru_gen_folio *lrugen = &lruvec->lrugen; 4364 4365 spin_lock_irq(&lruvec->lru_lock); 4366 4367 VM_WARN_ON_ONCE(!seq_is_valid(lruvec)); 4368 4369 for (type = ANON_AND_FILE - 1; type >= 0; type--) { 4370 if (get_nr_gens(lruvec, type) != MAX_NR_GENS) 4371 continue; 4372 4373 VM_WARN_ON_ONCE(!force_scan && (type == LRU_GEN_FILE || can_swap)); 4374 4375 while (!inc_min_seq(lruvec, type, can_swap)) { 4376 spin_unlock_irq(&lruvec->lru_lock); 4377 cond_resched(); 4378 spin_lock_irq(&lruvec->lru_lock); 4379 } 4380 } 4381 4382 /* 4383 * Update the active/inactive LRU sizes for compatibility. Both sides of 4384 * the current max_seq need to be covered, since max_seq+1 can overlap 4385 * with min_seq[LRU_GEN_ANON] if swapping is constrained. And if they do 4386 * overlap, cold/hot inversion happens. 4387 */ 4388 prev = lru_gen_from_seq(lrugen->max_seq - 1); 4389 next = lru_gen_from_seq(lrugen->max_seq + 1); 4390 4391 for (type = 0; type < ANON_AND_FILE; type++) { 4392 for (zone = 0; zone < MAX_NR_ZONES; zone++) { 4393 enum lru_list lru = type * LRU_INACTIVE_FILE; 4394 long delta = lrugen->nr_pages[prev][type][zone] - 4395 lrugen->nr_pages[next][type][zone]; 4396 4397 if (!delta) 4398 continue; 4399 4400 __update_lru_size(lruvec, lru, zone, delta); 4401 __update_lru_size(lruvec, lru + LRU_ACTIVE, zone, -delta); 4402 } 4403 } 4404 4405 for (type = 0; type < ANON_AND_FILE; type++) 4406 reset_ctrl_pos(lruvec, type, false); 4407 4408 WRITE_ONCE(lrugen->timestamps[next], jiffies); 4409 /* make sure preceding modifications appear */ 4410 smp_store_release(&lrugen->max_seq, lrugen->max_seq + 1); 4411 4412 spin_unlock_irq(&lruvec->lru_lock); 4413 } 4414 4415 static bool try_to_inc_max_seq(struct lruvec *lruvec, unsigned long max_seq, 4416 struct scan_control *sc, bool can_swap, bool force_scan) 4417 { 4418 bool success; 4419 struct lru_gen_mm_walk *walk; 4420 struct mm_struct *mm = NULL; 4421 struct lru_gen_folio *lrugen = &lruvec->lrugen; 4422 4423 VM_WARN_ON_ONCE(max_seq > READ_ONCE(lrugen->max_seq)); 4424 4425 /* see the comment in iterate_mm_list() */ 4426 if (max_seq <= READ_ONCE(lruvec->mm_state.seq)) { 4427 success = false; 4428 goto done; 4429 } 4430 4431 /* 4432 * If the hardware doesn't automatically set the accessed bit, fallback 4433 * to lru_gen_look_around(), which only clears the accessed bit in a 4434 * handful of PTEs. Spreading the work out over a period of time usually 4435 * is less efficient, but it avoids bursty page faults. 4436 */ 4437 if (!arch_has_hw_pte_young() || !get_cap(LRU_GEN_MM_WALK)) { 4438 success = iterate_mm_list_nowalk(lruvec, max_seq); 4439 goto done; 4440 } 4441 4442 walk = set_mm_walk(NULL, true); 4443 if (!walk) { 4444 success = iterate_mm_list_nowalk(lruvec, max_seq); 4445 goto done; 4446 } 4447 4448 walk->lruvec = lruvec; 4449 walk->max_seq = max_seq; 4450 walk->can_swap = can_swap; 4451 walk->force_scan = force_scan; 4452 4453 do { 4454 success = iterate_mm_list(lruvec, walk, &mm); 4455 if (mm) 4456 walk_mm(lruvec, mm, walk); 4457 4458 cond_resched(); 4459 } while (mm); 4460 done: 4461 if (!success) { 4462 if (sc->priority <= DEF_PRIORITY - 2) 4463 wait_event_killable(lruvec->mm_state.wait, 4464 max_seq < READ_ONCE(lrugen->max_seq)); 4465 return false; 4466 } 4467 4468 VM_WARN_ON_ONCE(max_seq != READ_ONCE(lrugen->max_seq)); 4469 4470 inc_max_seq(lruvec, can_swap, force_scan); 4471 /* either this sees any waiters or they will see updated max_seq */ 4472 if (wq_has_sleeper(&lruvec->mm_state.wait)) 4473 wake_up_all(&lruvec->mm_state.wait); 4474 4475 return true; 4476 } 4477 4478 static bool lruvec_is_sizable(struct lruvec *lruvec, struct scan_control *sc) 4479 { 4480 int gen, type, zone; 4481 unsigned long total = 0; 4482 bool can_swap = get_swappiness(lruvec, sc); 4483 struct lru_gen_folio *lrugen = &lruvec->lrugen; 4484 struct mem_cgroup *memcg = lruvec_memcg(lruvec); 4485 DEFINE_MAX_SEQ(lruvec); 4486 DEFINE_MIN_SEQ(lruvec); 4487 4488 for (type = !can_swap; type < ANON_AND_FILE; type++) { 4489 unsigned long seq; 4490 4491 for (seq = min_seq[type]; seq <= max_seq; seq++) { 4492 gen = lru_gen_from_seq(seq); 4493 4494 for (zone = 0; zone < MAX_NR_ZONES; zone++) 4495 total += max(READ_ONCE(lrugen->nr_pages[gen][type][zone]), 0L); 4496 } 4497 } 4498 4499 /* whether the size is big enough to be helpful */ 4500 return mem_cgroup_online(memcg) ? (total >> sc->priority) : total; 4501 } 4502 4503 static bool lruvec_is_reclaimable(struct lruvec *lruvec, struct scan_control *sc, 4504 unsigned long min_ttl) 4505 { 4506 int gen; 4507 unsigned long birth; 4508 struct mem_cgroup *memcg = lruvec_memcg(lruvec); 4509 DEFINE_MIN_SEQ(lruvec); 4510 4511 /* see the comment on lru_gen_folio */ 4512 gen = lru_gen_from_seq(min_seq[LRU_GEN_FILE]); 4513 birth = READ_ONCE(lruvec->lrugen.timestamps[gen]); 4514 4515 if (time_is_after_jiffies(birth + min_ttl)) 4516 return false; 4517 4518 if (!lruvec_is_sizable(lruvec, sc)) 4519 return false; 4520 4521 mem_cgroup_calculate_protection(NULL, memcg); 4522 4523 return !mem_cgroup_below_min(NULL, memcg); 4524 } 4525 4526 /* to protect the working set of the last N jiffies */ 4527 static unsigned long lru_gen_min_ttl __read_mostly; 4528 4529 static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc) 4530 { 4531 struct mem_cgroup *memcg; 4532 unsigned long min_ttl = READ_ONCE(lru_gen_min_ttl); 4533 4534 VM_WARN_ON_ONCE(!current_is_kswapd()); 4535 4536 /* check the order to exclude compaction-induced reclaim */ 4537 if (!min_ttl || sc->order || sc->priority == DEF_PRIORITY) 4538 return; 4539 4540 memcg = mem_cgroup_iter(NULL, NULL, NULL); 4541 do { 4542 struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat); 4543 4544 if (lruvec_is_reclaimable(lruvec, sc, min_ttl)) { 4545 mem_cgroup_iter_break(NULL, memcg); 4546 return; 4547 } 4548 4549 cond_resched(); 4550 } while ((memcg = mem_cgroup_iter(NULL, memcg, NULL))); 4551 4552 /* 4553 * The main goal is to OOM kill if every generation from all memcgs is 4554 * younger than min_ttl. However, another possibility is all memcgs are 4555 * either too small or below min. 4556 */ 4557 if (mutex_trylock(&oom_lock)) { 4558 struct oom_control oc = { 4559 .gfp_mask = sc->gfp_mask, 4560 }; 4561 4562 out_of_memory(&oc); 4563 4564 mutex_unlock(&oom_lock); 4565 } 4566 } 4567 4568 /* 4569 * This function exploits spatial locality when shrink_folio_list() walks the 4570 * rmap. It scans the adjacent PTEs of a young PTE and promotes hot pages. If 4571 * the scan was done cacheline efficiently, it adds the PMD entry pointing to 4572 * the PTE table to the Bloom filter. This forms a feedback loop between the 4573 * eviction and the aging. 4574 */ 4575 void lru_gen_look_around(struct page_vma_mapped_walk *pvmw) 4576 { 4577 int i; 4578 pte_t *pte; 4579 unsigned long start; 4580 unsigned long end; 4581 unsigned long addr; 4582 struct lru_gen_mm_walk *walk; 4583 int young = 0; 4584 unsigned long bitmap[BITS_TO_LONGS(MIN_LRU_BATCH)] = {}; 4585 struct folio *folio = pfn_folio(pvmw->pfn); 4586 struct mem_cgroup *memcg = folio_memcg(folio); 4587 struct pglist_data *pgdat = folio_pgdat(folio); 4588 struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat); 4589 DEFINE_MAX_SEQ(lruvec); 4590 int old_gen, new_gen = lru_gen_from_seq(max_seq); 4591 4592 lockdep_assert_held(pvmw->ptl); 4593 VM_WARN_ON_ONCE_FOLIO(folio_test_lru(folio), folio); 4594 4595 if (spin_is_contended(pvmw->ptl)) 4596 return; 4597 4598 /* avoid taking the LRU lock under the PTL when possible */ 4599 walk = current->reclaim_state ? current->reclaim_state->mm_walk : NULL; 4600 4601 start = max(pvmw->address & PMD_MASK, pvmw->vma->vm_start); 4602 end = min(pvmw->address | ~PMD_MASK, pvmw->vma->vm_end - 1) + 1; 4603 4604 if (end - start > MIN_LRU_BATCH * PAGE_SIZE) { 4605 if (pvmw->address - start < MIN_LRU_BATCH * PAGE_SIZE / 2) 4606 end = start + MIN_LRU_BATCH * PAGE_SIZE; 4607 else if (end - pvmw->address < MIN_LRU_BATCH * PAGE_SIZE / 2) 4608 start = end - MIN_LRU_BATCH * PAGE_SIZE; 4609 else { 4610 start = pvmw->address - MIN_LRU_BATCH * PAGE_SIZE / 2; 4611 end = pvmw->address + MIN_LRU_BATCH * PAGE_SIZE / 2; 4612 } 4613 } 4614 4615 pte = pvmw->pte - (pvmw->address - start) / PAGE_SIZE; 4616 4617 rcu_read_lock(); 4618 arch_enter_lazy_mmu_mode(); 4619 4620 for (i = 0, addr = start; addr != end; i++, addr += PAGE_SIZE) { 4621 unsigned long pfn; 4622 4623 pfn = get_pte_pfn(pte[i], pvmw->vma, addr); 4624 if (pfn == -1) 4625 continue; 4626 4627 if (!pte_young(pte[i])) 4628 continue; 4629 4630 folio = get_pfn_folio(pfn, memcg, pgdat, !walk || walk->can_swap); 4631 if (!folio) 4632 continue; 4633 4634 if (!ptep_test_and_clear_young(pvmw->vma, addr, pte + i)) 4635 VM_WARN_ON_ONCE(true); 4636 4637 young++; 4638 4639 if (pte_dirty(pte[i]) && !folio_test_dirty(folio) && 4640 !(folio_test_anon(folio) && folio_test_swapbacked(folio) && 4641 !folio_test_swapcache(folio))) 4642 folio_mark_dirty(folio); 4643 4644 old_gen = folio_lru_gen(folio); 4645 if (old_gen < 0) 4646 folio_set_referenced(folio); 4647 else if (old_gen != new_gen) 4648 __set_bit(i, bitmap); 4649 } 4650 4651 arch_leave_lazy_mmu_mode(); 4652 rcu_read_unlock(); 4653 4654 /* feedback from rmap walkers to page table walkers */ 4655 if (suitable_to_scan(i, young)) 4656 update_bloom_filter(lruvec, max_seq, pvmw->pmd); 4657 4658 if (!walk && bitmap_weight(bitmap, MIN_LRU_BATCH) < PAGEVEC_SIZE) { 4659 for_each_set_bit(i, bitmap, MIN_LRU_BATCH) { 4660 folio = pfn_folio(pte_pfn(pte[i])); 4661 folio_activate(folio); 4662 } 4663 return; 4664 } 4665 4666 /* folio_update_gen() requires stable folio_memcg() */ 4667 if (!mem_cgroup_trylock_pages(memcg)) 4668 return; 4669 4670 if (!walk) { 4671 spin_lock_irq(&lruvec->lru_lock); 4672 new_gen = lru_gen_from_seq(lruvec->lrugen.max_seq); 4673 } 4674 4675 for_each_set_bit(i, bitmap, MIN_LRU_BATCH) { 4676 folio = pfn_folio(pte_pfn(pte[i])); 4677 if (folio_memcg_rcu(folio) != memcg) 4678 continue; 4679 4680 old_gen = folio_update_gen(folio, new_gen); 4681 if (old_gen < 0 || old_gen == new_gen) 4682 continue; 4683 4684 if (walk) 4685 update_batch_size(walk, folio, old_gen, new_gen); 4686 else 4687 lru_gen_update_size(lruvec, folio, old_gen, new_gen); 4688 } 4689 4690 if (!walk) 4691 spin_unlock_irq(&lruvec->lru_lock); 4692 4693 mem_cgroup_unlock_pages(); 4694 } 4695 4696 /****************************************************************************** 4697 * the eviction 4698 ******************************************************************************/ 4699 4700 static bool sort_folio(struct lruvec *lruvec, struct folio *folio, int tier_idx) 4701 { 4702 bool success; 4703 int gen = folio_lru_gen(folio); 4704 int type = folio_is_file_lru(folio); 4705 int zone = folio_zonenum(folio); 4706 int delta = folio_nr_pages(folio); 4707 int refs = folio_lru_refs(folio); 4708 int tier = lru_tier_from_refs(refs); 4709 struct lru_gen_folio *lrugen = &lruvec->lrugen; 4710 4711 VM_WARN_ON_ONCE_FOLIO(gen >= MAX_NR_GENS, folio); 4712 4713 /* unevictable */ 4714 if (!folio_evictable(folio)) { 4715 success = lru_gen_del_folio(lruvec, folio, true); 4716 VM_WARN_ON_ONCE_FOLIO(!success, folio); 4717 folio_set_unevictable(folio); 4718 lruvec_add_folio(lruvec, folio); 4719 __count_vm_events(UNEVICTABLE_PGCULLED, delta); 4720 return true; 4721 } 4722 4723 /* dirty lazyfree */ 4724 if (type == LRU_GEN_FILE && folio_test_anon(folio) && folio_test_dirty(folio)) { 4725 success = lru_gen_del_folio(lruvec, folio, true); 4726 VM_WARN_ON_ONCE_FOLIO(!success, folio); 4727 folio_set_swapbacked(folio); 4728 lruvec_add_folio_tail(lruvec, folio); 4729 return true; 4730 } 4731 4732 /* promoted */ 4733 if (gen != lru_gen_from_seq(lrugen->min_seq[type])) { 4734 list_move(&folio->lru, &lrugen->folios[gen][type][zone]); 4735 return true; 4736 } 4737 4738 /* protected */ 4739 if (tier > tier_idx) { 4740 int hist = lru_hist_from_seq(lrugen->min_seq[type]); 4741 4742 gen = folio_inc_gen(lruvec, folio, false); 4743 list_move_tail(&folio->lru, &lrugen->folios[gen][type][zone]); 4744 4745 WRITE_ONCE(lrugen->protected[hist][type][tier - 1], 4746 lrugen->protected[hist][type][tier - 1] + delta); 4747 __mod_lruvec_state(lruvec, WORKINGSET_ACTIVATE_BASE + type, delta); 4748 return true; 4749 } 4750 4751 /* waiting for writeback */ 4752 if (folio_test_locked(folio) || folio_test_writeback(folio) || 4753 (type == LRU_GEN_FILE && folio_test_dirty(folio))) { 4754 gen = folio_inc_gen(lruvec, folio, true); 4755 list_move(&folio->lru, &lrugen->folios[gen][type][zone]); 4756 return true; 4757 } 4758 4759 return false; 4760 } 4761 4762 static bool isolate_folio(struct lruvec *lruvec, struct folio *folio, struct scan_control *sc) 4763 { 4764 bool success; 4765 4766 /* swapping inhibited */ 4767 if (!(sc->gfp_mask & __GFP_IO) && 4768 (folio_test_dirty(folio) || 4769 (folio_test_anon(folio) && !folio_test_swapcache(folio)))) 4770 return false; 4771 4772 /* raced with release_pages() */ 4773 if (!folio_try_get(folio)) 4774 return false; 4775 4776 /* raced with another isolation */ 4777 if (!folio_test_clear_lru(folio)) { 4778 folio_put(folio); 4779 return false; 4780 } 4781 4782 /* see the comment on MAX_NR_TIERS */ 4783 if (!folio_test_referenced(folio)) 4784 set_mask_bits(&folio->flags, LRU_REFS_MASK | LRU_REFS_FLAGS, 0); 4785 4786 /* for shrink_folio_list() */ 4787 folio_clear_reclaim(folio); 4788 folio_clear_referenced(folio); 4789 4790 success = lru_gen_del_folio(lruvec, folio, true); 4791 VM_WARN_ON_ONCE_FOLIO(!success, folio); 4792 4793 return true; 4794 } 4795 4796 static int scan_folios(struct lruvec *lruvec, struct scan_control *sc, 4797 int type, int tier, struct list_head *list) 4798 { 4799 int gen, zone; 4800 enum vm_event_item item; 4801 int sorted = 0; 4802 int scanned = 0; 4803 int isolated = 0; 4804 int remaining = MAX_LRU_BATCH; 4805 struct lru_gen_folio *lrugen = &lruvec->lrugen; 4806 struct mem_cgroup *memcg = lruvec_memcg(lruvec); 4807 4808 VM_WARN_ON_ONCE(!list_empty(list)); 4809 4810 if (get_nr_gens(lruvec, type) == MIN_NR_GENS) 4811 return 0; 4812 4813 gen = lru_gen_from_seq(lrugen->min_seq[type]); 4814 4815 for (zone = sc->reclaim_idx; zone >= 0; zone--) { 4816 LIST_HEAD(moved); 4817 int skipped = 0; 4818 struct list_head *head = &lrugen->folios[gen][type][zone]; 4819 4820 while (!list_empty(head)) { 4821 struct folio *folio = lru_to_folio(head); 4822 int delta = folio_nr_pages(folio); 4823 4824 VM_WARN_ON_ONCE_FOLIO(folio_test_unevictable(folio), folio); 4825 VM_WARN_ON_ONCE_FOLIO(folio_test_active(folio), folio); 4826 VM_WARN_ON_ONCE_FOLIO(folio_is_file_lru(folio) != type, folio); 4827 VM_WARN_ON_ONCE_FOLIO(folio_zonenum(folio) != zone, folio); 4828 4829 scanned += delta; 4830 4831 if (sort_folio(lruvec, folio, tier)) 4832 sorted += delta; 4833 else if (isolate_folio(lruvec, folio, sc)) { 4834 list_add(&folio->lru, list); 4835 isolated += delta; 4836 } else { 4837 list_move(&folio->lru, &moved); 4838 skipped += delta; 4839 } 4840 4841 if (!--remaining || max(isolated, skipped) >= MIN_LRU_BATCH) 4842 break; 4843 } 4844 4845 if (skipped) { 4846 list_splice(&moved, head); 4847 __count_zid_vm_events(PGSCAN_SKIP, zone, skipped); 4848 } 4849 4850 if (!remaining || isolated >= MIN_LRU_BATCH) 4851 break; 4852 } 4853 4854 item = PGSCAN_KSWAPD + reclaimer_offset(); 4855 if (!cgroup_reclaim(sc)) { 4856 __count_vm_events(item, isolated); 4857 __count_vm_events(PGREFILL, sorted); 4858 } 4859 __count_memcg_events(memcg, item, isolated); 4860 __count_memcg_events(memcg, PGREFILL, sorted); 4861 __count_vm_events(PGSCAN_ANON + type, isolated); 4862 4863 /* 4864 * There might not be eligible folios due to reclaim_idx. Check the 4865 * remaining to prevent livelock if it's not making progress. 4866 */ 4867 return isolated || !remaining ? scanned : 0; 4868 } 4869 4870 static int get_tier_idx(struct lruvec *lruvec, int type) 4871 { 4872 int tier; 4873 struct ctrl_pos sp, pv; 4874 4875 /* 4876 * To leave a margin for fluctuations, use a larger gain factor (1:2). 4877 * This value is chosen because any other tier would have at least twice 4878 * as many refaults as the first tier. 4879 */ 4880 read_ctrl_pos(lruvec, type, 0, 1, &sp); 4881 for (tier = 1; tier < MAX_NR_TIERS; tier++) { 4882 read_ctrl_pos(lruvec, type, tier, 2, &pv); 4883 if (!positive_ctrl_err(&sp, &pv)) 4884 break; 4885 } 4886 4887 return tier - 1; 4888 } 4889 4890 static int get_type_to_scan(struct lruvec *lruvec, int swappiness, int *tier_idx) 4891 { 4892 int type, tier; 4893 struct ctrl_pos sp, pv; 4894 int gain[ANON_AND_FILE] = { swappiness, 200 - swappiness }; 4895 4896 /* 4897 * Compare the first tier of anon with that of file to determine which 4898 * type to scan. Also need to compare other tiers of the selected type 4899 * with the first tier of the other type to determine the last tier (of 4900 * the selected type) to evict. 4901 */ 4902 read_ctrl_pos(lruvec, LRU_GEN_ANON, 0, gain[LRU_GEN_ANON], &sp); 4903 read_ctrl_pos(lruvec, LRU_GEN_FILE, 0, gain[LRU_GEN_FILE], &pv); 4904 type = positive_ctrl_err(&sp, &pv); 4905 4906 read_ctrl_pos(lruvec, !type, 0, gain[!type], &sp); 4907 for (tier = 1; tier < MAX_NR_TIERS; tier++) { 4908 read_ctrl_pos(lruvec, type, tier, gain[type], &pv); 4909 if (!positive_ctrl_err(&sp, &pv)) 4910 break; 4911 } 4912 4913 *tier_idx = tier - 1; 4914 4915 return type; 4916 } 4917 4918 static int isolate_folios(struct lruvec *lruvec, struct scan_control *sc, int swappiness, 4919 int *type_scanned, struct list_head *list) 4920 { 4921 int i; 4922 int type; 4923 int scanned; 4924 int tier = -1; 4925 DEFINE_MIN_SEQ(lruvec); 4926 4927 /* 4928 * Try to make the obvious choice first. When anon and file are both 4929 * available from the same generation, interpret swappiness 1 as file 4930 * first and 200 as anon first. 4931 */ 4932 if (!swappiness) 4933 type = LRU_GEN_FILE; 4934 else if (min_seq[LRU_GEN_ANON] < min_seq[LRU_GEN_FILE]) 4935 type = LRU_GEN_ANON; 4936 else if (swappiness == 1) 4937 type = LRU_GEN_FILE; 4938 else if (swappiness == 200) 4939 type = LRU_GEN_ANON; 4940 else 4941 type = get_type_to_scan(lruvec, swappiness, &tier); 4942 4943 for (i = !swappiness; i < ANON_AND_FILE; i++) { 4944 if (tier < 0) 4945 tier = get_tier_idx(lruvec, type); 4946 4947 scanned = scan_folios(lruvec, sc, type, tier, list); 4948 if (scanned) 4949 break; 4950 4951 type = !type; 4952 tier = -1; 4953 } 4954 4955 *type_scanned = type; 4956 4957 return scanned; 4958 } 4959 4960 static int evict_folios(struct lruvec *lruvec, struct scan_control *sc, int swappiness) 4961 { 4962 int type; 4963 int scanned; 4964 int reclaimed; 4965 LIST_HEAD(list); 4966 LIST_HEAD(clean); 4967 struct folio *folio; 4968 struct folio *next; 4969 enum vm_event_item item; 4970 struct reclaim_stat stat; 4971 struct lru_gen_mm_walk *walk; 4972 bool skip_retry = false; 4973 struct mem_cgroup *memcg = lruvec_memcg(lruvec); 4974 struct pglist_data *pgdat = lruvec_pgdat(lruvec); 4975 4976 spin_lock_irq(&lruvec->lru_lock); 4977 4978 scanned = isolate_folios(lruvec, sc, swappiness, &type, &list); 4979 4980 scanned += try_to_inc_min_seq(lruvec, swappiness); 4981 4982 if (get_nr_gens(lruvec, !swappiness) == MIN_NR_GENS) 4983 scanned = 0; 4984 4985 spin_unlock_irq(&lruvec->lru_lock); 4986 4987 if (list_empty(&list)) 4988 return scanned; 4989 retry: 4990 reclaimed = shrink_folio_list(&list, pgdat, sc, &stat, false); 4991 sc->nr_reclaimed += reclaimed; 4992 4993 list_for_each_entry_safe_reverse(folio, next, &list, lru) { 4994 if (!folio_evictable(folio)) { 4995 list_del(&folio->lru); 4996 folio_putback_lru(folio); 4997 continue; 4998 } 4999 5000 if (folio_test_reclaim(folio) && 5001 (folio_test_dirty(folio) || folio_test_writeback(folio))) { 5002 /* restore LRU_REFS_FLAGS cleared by isolate_folio() */ 5003 if (folio_test_workingset(folio)) 5004 folio_set_referenced(folio); 5005 continue; 5006 } 5007 5008 if (skip_retry || folio_test_active(folio) || folio_test_referenced(folio) || 5009 folio_mapped(folio) || folio_test_locked(folio) || 5010 folio_test_dirty(folio) || folio_test_writeback(folio)) { 5011 /* don't add rejected folios to the oldest generation */ 5012 set_mask_bits(&folio->flags, LRU_REFS_MASK | LRU_REFS_FLAGS, 5013 BIT(PG_active)); 5014 continue; 5015 } 5016 5017 /* retry folios that may have missed folio_rotate_reclaimable() */ 5018 list_move(&folio->lru, &clean); 5019 sc->nr_scanned -= folio_nr_pages(folio); 5020 } 5021 5022 spin_lock_irq(&lruvec->lru_lock); 5023 5024 move_folios_to_lru(lruvec, &list); 5025 5026 walk = current->reclaim_state->mm_walk; 5027 if (walk && walk->batched) 5028 reset_batch_size(lruvec, walk); 5029 5030 item = PGSTEAL_KSWAPD + reclaimer_offset(); 5031 if (!cgroup_reclaim(sc)) 5032 __count_vm_events(item, reclaimed); 5033 __count_memcg_events(memcg, item, reclaimed); 5034 __count_vm_events(PGSTEAL_ANON + type, reclaimed); 5035 5036 spin_unlock_irq(&lruvec->lru_lock); 5037 5038 mem_cgroup_uncharge_list(&list); 5039 free_unref_page_list(&list); 5040 5041 INIT_LIST_HEAD(&list); 5042 list_splice_init(&clean, &list); 5043 5044 if (!list_empty(&list)) { 5045 skip_retry = true; 5046 goto retry; 5047 } 5048 5049 return scanned; 5050 } 5051 5052 static bool should_run_aging(struct lruvec *lruvec, unsigned long max_seq, 5053 struct scan_control *sc, bool can_swap, unsigned long *nr_to_scan) 5054 { 5055 int gen, type, zone; 5056 unsigned long old = 0; 5057 unsigned long young = 0; 5058 unsigned long total = 0; 5059 struct lru_gen_folio *lrugen = &lruvec->lrugen; 5060 struct mem_cgroup *memcg = lruvec_memcg(lruvec); 5061 DEFINE_MIN_SEQ(lruvec); 5062 5063 /* whether this lruvec is completely out of cold folios */ 5064 if (min_seq[!can_swap] + MIN_NR_GENS > max_seq) { 5065 *nr_to_scan = 0; 5066 return true; 5067 } 5068 5069 for (type = !can_swap; type < ANON_AND_FILE; type++) { 5070 unsigned long seq; 5071 5072 for (seq = min_seq[type]; seq <= max_seq; seq++) { 5073 unsigned long size = 0; 5074 5075 gen = lru_gen_from_seq(seq); 5076 5077 for (zone = 0; zone < MAX_NR_ZONES; zone++) 5078 size += max(READ_ONCE(lrugen->nr_pages[gen][type][zone]), 0L); 5079 5080 total += size; 5081 if (seq == max_seq) 5082 young += size; 5083 else if (seq + MIN_NR_GENS == max_seq) 5084 old += size; 5085 } 5086 } 5087 5088 /* try to scrape all its memory if this memcg was deleted */ 5089 *nr_to_scan = mem_cgroup_online(memcg) ? (total >> sc->priority) : total; 5090 5091 /* 5092 * The aging tries to be lazy to reduce the overhead, while the eviction 5093 * stalls when the number of generations reaches MIN_NR_GENS. Hence, the 5094 * ideal number of generations is MIN_NR_GENS+1. 5095 */ 5096 if (min_seq[!can_swap] + MIN_NR_GENS < max_seq) 5097 return false; 5098 5099 /* 5100 * It's also ideal to spread pages out evenly, i.e., 1/(MIN_NR_GENS+1) 5101 * of the total number of pages for each generation. A reasonable range 5102 * for this average portion is [1/MIN_NR_GENS, 1/(MIN_NR_GENS+2)]. The 5103 * aging cares about the upper bound of hot pages, while the eviction 5104 * cares about the lower bound of cold pages. 5105 */ 5106 if (young * MIN_NR_GENS > total) 5107 return true; 5108 if (old * (MIN_NR_GENS + 2) < total) 5109 return true; 5110 5111 return false; 5112 } 5113 5114 /* 5115 * For future optimizations: 5116 * 1. Defer try_to_inc_max_seq() to workqueues to reduce latency for memcg 5117 * reclaim. 5118 */ 5119 static long get_nr_to_scan(struct lruvec *lruvec, struct scan_control *sc, bool can_swap) 5120 { 5121 unsigned long nr_to_scan; 5122 struct mem_cgroup *memcg = lruvec_memcg(lruvec); 5123 DEFINE_MAX_SEQ(lruvec); 5124 5125 if (mem_cgroup_below_min(sc->target_mem_cgroup, memcg)) 5126 return 0; 5127 5128 if (!should_run_aging(lruvec, max_seq, sc, can_swap, &nr_to_scan)) 5129 return nr_to_scan; 5130 5131 /* skip the aging path at the default priority */ 5132 if (sc->priority == DEF_PRIORITY) 5133 return nr_to_scan; 5134 5135 /* skip this lruvec as it's low on cold folios */ 5136 return try_to_inc_max_seq(lruvec, max_seq, sc, can_swap, false) ? -1 : 0; 5137 } 5138 5139 static unsigned long get_nr_to_reclaim(struct scan_control *sc) 5140 { 5141 /* don't abort memcg reclaim to ensure fairness */ 5142 if (!global_reclaim(sc)) 5143 return -1; 5144 5145 return max(sc->nr_to_reclaim, compact_gap(sc->order)); 5146 } 5147 5148 static bool try_to_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) 5149 { 5150 long nr_to_scan; 5151 unsigned long scanned = 0; 5152 unsigned long nr_to_reclaim = get_nr_to_reclaim(sc); 5153 int swappiness = get_swappiness(lruvec, sc); 5154 5155 /* clean file folios are more likely to exist */ 5156 if (swappiness && !(sc->gfp_mask & __GFP_IO)) 5157 swappiness = 1; 5158 5159 while (true) { 5160 int delta; 5161 5162 nr_to_scan = get_nr_to_scan(lruvec, sc, swappiness); 5163 if (nr_to_scan <= 0) 5164 break; 5165 5166 delta = evict_folios(lruvec, sc, swappiness); 5167 if (!delta) 5168 break; 5169 5170 scanned += delta; 5171 if (scanned >= nr_to_scan) 5172 break; 5173 5174 if (sc->nr_reclaimed >= nr_to_reclaim) 5175 break; 5176 5177 cond_resched(); 5178 } 5179 5180 /* whether try_to_inc_max_seq() was successful */ 5181 return nr_to_scan < 0; 5182 } 5183 5184 static int shrink_one(struct lruvec *lruvec, struct scan_control *sc) 5185 { 5186 bool success; 5187 unsigned long scanned = sc->nr_scanned; 5188 unsigned long reclaimed = sc->nr_reclaimed; 5189 int seg = lru_gen_memcg_seg(lruvec); 5190 struct mem_cgroup *memcg = lruvec_memcg(lruvec); 5191 struct pglist_data *pgdat = lruvec_pgdat(lruvec); 5192 5193 /* see the comment on MEMCG_NR_GENS */ 5194 if (!lruvec_is_sizable(lruvec, sc)) 5195 return seg != MEMCG_LRU_TAIL ? MEMCG_LRU_TAIL : MEMCG_LRU_YOUNG; 5196 5197 mem_cgroup_calculate_protection(NULL, memcg); 5198 5199 if (mem_cgroup_below_min(NULL, memcg)) 5200 return MEMCG_LRU_YOUNG; 5201 5202 if (mem_cgroup_below_low(NULL, memcg)) { 5203 /* see the comment on MEMCG_NR_GENS */ 5204 if (seg != MEMCG_LRU_TAIL) 5205 return MEMCG_LRU_TAIL; 5206 5207 memcg_memory_event(memcg, MEMCG_LOW); 5208 } 5209 5210 success = try_to_shrink_lruvec(lruvec, sc); 5211 5212 shrink_slab(sc->gfp_mask, pgdat->node_id, memcg, sc->priority); 5213 5214 if (!sc->proactive) 5215 vmpressure(sc->gfp_mask, memcg, false, sc->nr_scanned - scanned, 5216 sc->nr_reclaimed - reclaimed); 5217 5218 sc->nr_reclaimed += current->reclaim_state->reclaimed_slab; 5219 current->reclaim_state->reclaimed_slab = 0; 5220 5221 return success ? MEMCG_LRU_YOUNG : 0; 5222 } 5223 5224 #ifdef CONFIG_MEMCG 5225 5226 static void shrink_many(struct pglist_data *pgdat, struct scan_control *sc) 5227 { 5228 int gen; 5229 int bin; 5230 int first_bin; 5231 struct lruvec *lruvec; 5232 struct lru_gen_folio *lrugen; 5233 const struct hlist_nulls_node *pos; 5234 int op = 0; 5235 struct mem_cgroup *memcg = NULL; 5236 unsigned long nr_to_reclaim = get_nr_to_reclaim(sc); 5237 5238 bin = first_bin = get_random_u32_below(MEMCG_NR_BINS); 5239 restart: 5240 gen = get_memcg_gen(READ_ONCE(pgdat->memcg_lru.seq)); 5241 5242 rcu_read_lock(); 5243 5244 hlist_nulls_for_each_entry_rcu(lrugen, pos, &pgdat->memcg_lru.fifo[gen][bin], list) { 5245 if (op) 5246 lru_gen_rotate_memcg(lruvec, op); 5247 5248 mem_cgroup_put(memcg); 5249 5250 lruvec = container_of(lrugen, struct lruvec, lrugen); 5251 memcg = lruvec_memcg(lruvec); 5252 5253 if (!mem_cgroup_tryget(memcg)) { 5254 op = 0; 5255 memcg = NULL; 5256 continue; 5257 } 5258 5259 rcu_read_unlock(); 5260 5261 op = shrink_one(lruvec, sc); 5262 5263 if (sc->nr_reclaimed >= nr_to_reclaim) 5264 goto success; 5265 5266 rcu_read_lock(); 5267 } 5268 5269 rcu_read_unlock(); 5270 5271 /* restart if raced with lru_gen_rotate_memcg() */ 5272 if (gen != get_nulls_value(pos)) 5273 goto restart; 5274 5275 /* try the rest of the bins of the current generation */ 5276 bin = get_memcg_bin(bin + 1); 5277 if (bin != first_bin) 5278 goto restart; 5279 success: 5280 if (op) 5281 lru_gen_rotate_memcg(lruvec, op); 5282 5283 mem_cgroup_put(memcg); 5284 } 5285 5286 static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) 5287 { 5288 struct blk_plug plug; 5289 5290 VM_WARN_ON_ONCE(global_reclaim(sc)); 5291 VM_WARN_ON_ONCE(!sc->may_writepage || !sc->may_unmap); 5292 5293 lru_add_drain(); 5294 5295 blk_start_plug(&plug); 5296 5297 set_mm_walk(NULL, sc->proactive); 5298 5299 if (try_to_shrink_lruvec(lruvec, sc)) 5300 lru_gen_rotate_memcg(lruvec, MEMCG_LRU_YOUNG); 5301 5302 clear_mm_walk(); 5303 5304 blk_finish_plug(&plug); 5305 } 5306 5307 #else /* !CONFIG_MEMCG */ 5308 5309 static void shrink_many(struct pglist_data *pgdat, struct scan_control *sc) 5310 { 5311 BUILD_BUG(); 5312 } 5313 5314 static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) 5315 { 5316 BUILD_BUG(); 5317 } 5318 5319 #endif 5320 5321 static void set_initial_priority(struct pglist_data *pgdat, struct scan_control *sc) 5322 { 5323 int priority; 5324 unsigned long reclaimable; 5325 struct lruvec *lruvec = mem_cgroup_lruvec(NULL, pgdat); 5326 5327 if (sc->priority != DEF_PRIORITY || sc->nr_to_reclaim < MIN_LRU_BATCH) 5328 return; 5329 /* 5330 * Determine the initial priority based on ((total / MEMCG_NR_GENS) >> 5331 * priority) * reclaimed_to_scanned_ratio = nr_to_reclaim, where the 5332 * estimated reclaimed_to_scanned_ratio = inactive / total. 5333 */ 5334 reclaimable = node_page_state(pgdat, NR_INACTIVE_FILE); 5335 if (get_swappiness(lruvec, sc)) 5336 reclaimable += node_page_state(pgdat, NR_INACTIVE_ANON); 5337 5338 reclaimable /= MEMCG_NR_GENS; 5339 5340 /* round down reclaimable and round up sc->nr_to_reclaim */ 5341 priority = fls_long(reclaimable) - 1 - fls_long(sc->nr_to_reclaim - 1); 5342 5343 sc->priority = clamp(priority, 0, DEF_PRIORITY); 5344 } 5345 5346 static void lru_gen_shrink_node(struct pglist_data *pgdat, struct scan_control *sc) 5347 { 5348 struct blk_plug plug; 5349 unsigned long reclaimed = sc->nr_reclaimed; 5350 5351 VM_WARN_ON_ONCE(!global_reclaim(sc)); 5352 5353 /* 5354 * Unmapped clean folios are already prioritized. Scanning for more of 5355 * them is likely futile and can cause high reclaim latency when there 5356 * is a large number of memcgs. 5357 */ 5358 if (!sc->may_writepage || !sc->may_unmap) 5359 goto done; 5360 5361 lru_add_drain(); 5362 5363 blk_start_plug(&plug); 5364 5365 set_mm_walk(pgdat, sc->proactive); 5366 5367 set_initial_priority(pgdat, sc); 5368 5369 if (current_is_kswapd()) 5370 sc->nr_reclaimed = 0; 5371 5372 if (mem_cgroup_disabled()) 5373 shrink_one(&pgdat->__lruvec, sc); 5374 else 5375 shrink_many(pgdat, sc); 5376 5377 if (current_is_kswapd()) 5378 sc->nr_reclaimed += reclaimed; 5379 5380 clear_mm_walk(); 5381 5382 blk_finish_plug(&plug); 5383 done: 5384 /* kswapd should never fail */ 5385 pgdat->kswapd_failures = 0; 5386 } 5387 5388 #ifdef CONFIG_MEMCG 5389 void lru_gen_rotate_memcg(struct lruvec *lruvec, int op) 5390 { 5391 int seg; 5392 int old, new; 5393 int bin = get_random_u32_below(MEMCG_NR_BINS); 5394 struct pglist_data *pgdat = lruvec_pgdat(lruvec); 5395 5396 spin_lock(&pgdat->memcg_lru.lock); 5397 5398 VM_WARN_ON_ONCE(hlist_nulls_unhashed(&lruvec->lrugen.list)); 5399 5400 seg = 0; 5401 new = old = lruvec->lrugen.gen; 5402 5403 /* see the comment on MEMCG_NR_GENS */ 5404 if (op == MEMCG_LRU_HEAD) 5405 seg = MEMCG_LRU_HEAD; 5406 else if (op == MEMCG_LRU_TAIL) 5407 seg = MEMCG_LRU_TAIL; 5408 else if (op == MEMCG_LRU_OLD) 5409 new = get_memcg_gen(pgdat->memcg_lru.seq); 5410 else if (op == MEMCG_LRU_YOUNG) 5411 new = get_memcg_gen(pgdat->memcg_lru.seq + 1); 5412 else 5413 VM_WARN_ON_ONCE(true); 5414 5415 hlist_nulls_del_rcu(&lruvec->lrugen.list); 5416 5417 if (op == MEMCG_LRU_HEAD || op == MEMCG_LRU_OLD) 5418 hlist_nulls_add_head_rcu(&lruvec->lrugen.list, &pgdat->memcg_lru.fifo[new][bin]); 5419 else 5420 hlist_nulls_add_tail_rcu(&lruvec->lrugen.list, &pgdat->memcg_lru.fifo[new][bin]); 5421 5422 pgdat->memcg_lru.nr_memcgs[old]--; 5423 pgdat->memcg_lru.nr_memcgs[new]++; 5424 5425 lruvec->lrugen.gen = new; 5426 WRITE_ONCE(lruvec->lrugen.seg, seg); 5427 5428 if (!pgdat->memcg_lru.nr_memcgs[old] && old == get_memcg_gen(pgdat->memcg_lru.seq)) 5429 WRITE_ONCE(pgdat->memcg_lru.seq, pgdat->memcg_lru.seq + 1); 5430 5431 spin_unlock(&pgdat->memcg_lru.lock); 5432 } 5433 #endif 5434 5435 /****************************************************************************** 5436 * state change 5437 ******************************************************************************/ 5438 5439 static bool __maybe_unused state_is_valid(struct lruvec *lruvec) 5440 { 5441 struct lru_gen_folio *lrugen = &lruvec->lrugen; 5442 5443 if (lrugen->enabled) { 5444 enum lru_list lru; 5445 5446 for_each_evictable_lru(lru) { 5447 if (!list_empty(&lruvec->lists[lru])) 5448 return false; 5449 } 5450 } else { 5451 int gen, type, zone; 5452 5453 for_each_gen_type_zone(gen, type, zone) { 5454 if (!list_empty(&lrugen->folios[gen][type][zone])) 5455 return false; 5456 } 5457 } 5458 5459 return true; 5460 } 5461 5462 static bool fill_evictable(struct lruvec *lruvec) 5463 { 5464 enum lru_list lru; 5465 int remaining = MAX_LRU_BATCH; 5466 5467 for_each_evictable_lru(lru) { 5468 int type = is_file_lru(lru); 5469 bool active = is_active_lru(lru); 5470 struct list_head *head = &lruvec->lists[lru]; 5471 5472 while (!list_empty(head)) { 5473 bool success; 5474 struct folio *folio = lru_to_folio(head); 5475 5476 VM_WARN_ON_ONCE_FOLIO(folio_test_unevictable(folio), folio); 5477 VM_WARN_ON_ONCE_FOLIO(folio_test_active(folio) != active, folio); 5478 VM_WARN_ON_ONCE_FOLIO(folio_is_file_lru(folio) != type, folio); 5479 VM_WARN_ON_ONCE_FOLIO(folio_lru_gen(folio) != -1, folio); 5480 5481 lruvec_del_folio(lruvec, folio); 5482 success = lru_gen_add_folio(lruvec, folio, false); 5483 VM_WARN_ON_ONCE(!success); 5484 5485 if (!--remaining) 5486 return false; 5487 } 5488 } 5489 5490 return true; 5491 } 5492 5493 static bool drain_evictable(struct lruvec *lruvec) 5494 { 5495 int gen, type, zone; 5496 int remaining = MAX_LRU_BATCH; 5497 5498 for_each_gen_type_zone(gen, type, zone) { 5499 struct list_head *head = &lruvec->lrugen.folios[gen][type][zone]; 5500 5501 while (!list_empty(head)) { 5502 bool success; 5503 struct folio *folio = lru_to_folio(head); 5504 5505 VM_WARN_ON_ONCE_FOLIO(folio_test_unevictable(folio), folio); 5506 VM_WARN_ON_ONCE_FOLIO(folio_test_active(folio), folio); 5507 VM_WARN_ON_ONCE_FOLIO(folio_is_file_lru(folio) != type, folio); 5508 VM_WARN_ON_ONCE_FOLIO(folio_zonenum(folio) != zone, folio); 5509 5510 success = lru_gen_del_folio(lruvec, folio, false); 5511 VM_WARN_ON_ONCE(!success); 5512 lruvec_add_folio(lruvec, folio); 5513 5514 if (!--remaining) 5515 return false; 5516 } 5517 } 5518 5519 return true; 5520 } 5521 5522 static void lru_gen_change_state(bool enabled) 5523 { 5524 static DEFINE_MUTEX(state_mutex); 5525 5526 struct mem_cgroup *memcg; 5527 5528 cgroup_lock(); 5529 cpus_read_lock(); 5530 get_online_mems(); 5531 mutex_lock(&state_mutex); 5532 5533 if (enabled == lru_gen_enabled()) 5534 goto unlock; 5535 5536 if (enabled) 5537 static_branch_enable_cpuslocked(&lru_gen_caps[LRU_GEN_CORE]); 5538 else 5539 static_branch_disable_cpuslocked(&lru_gen_caps[LRU_GEN_CORE]); 5540 5541 memcg = mem_cgroup_iter(NULL, NULL, NULL); 5542 do { 5543 int nid; 5544 5545 for_each_node(nid) { 5546 struct lruvec *lruvec = get_lruvec(memcg, nid); 5547 5548 spin_lock_irq(&lruvec->lru_lock); 5549 5550 VM_WARN_ON_ONCE(!seq_is_valid(lruvec)); 5551 VM_WARN_ON_ONCE(!state_is_valid(lruvec)); 5552 5553 lruvec->lrugen.enabled = enabled; 5554 5555 while (!(enabled ? fill_evictable(lruvec) : drain_evictable(lruvec))) { 5556 spin_unlock_irq(&lruvec->lru_lock); 5557 cond_resched(); 5558 spin_lock_irq(&lruvec->lru_lock); 5559 } 5560 5561 spin_unlock_irq(&lruvec->lru_lock); 5562 } 5563 5564 cond_resched(); 5565 } while ((memcg = mem_cgroup_iter(NULL, memcg, NULL))); 5566 unlock: 5567 mutex_unlock(&state_mutex); 5568 put_online_mems(); 5569 cpus_read_unlock(); 5570 cgroup_unlock(); 5571 } 5572 5573 /****************************************************************************** 5574 * sysfs interface 5575 ******************************************************************************/ 5576 5577 static ssize_t show_min_ttl(struct kobject *kobj, struct kobj_attribute *attr, char *buf) 5578 { 5579 return sprintf(buf, "%u\n", jiffies_to_msecs(READ_ONCE(lru_gen_min_ttl))); 5580 } 5581 5582 /* see Documentation/admin-guide/mm/multigen_lru.rst for details */ 5583 static ssize_t store_min_ttl(struct kobject *kobj, struct kobj_attribute *attr, 5584 const char *buf, size_t len) 5585 { 5586 unsigned int msecs; 5587 5588 if (kstrtouint(buf, 0, &msecs)) 5589 return -EINVAL; 5590 5591 WRITE_ONCE(lru_gen_min_ttl, msecs_to_jiffies(msecs)); 5592 5593 return len; 5594 } 5595 5596 static struct kobj_attribute lru_gen_min_ttl_attr = __ATTR( 5597 min_ttl_ms, 0644, show_min_ttl, store_min_ttl 5598 ); 5599 5600 static ssize_t show_enabled(struct kobject *kobj, struct kobj_attribute *attr, char *buf) 5601 { 5602 unsigned int caps = 0; 5603 5604 if (get_cap(LRU_GEN_CORE)) 5605 caps |= BIT(LRU_GEN_CORE); 5606 5607 if (arch_has_hw_pte_young() && get_cap(LRU_GEN_MM_WALK)) 5608 caps |= BIT(LRU_GEN_MM_WALK); 5609 5610 if (arch_has_hw_nonleaf_pmd_young() && get_cap(LRU_GEN_NONLEAF_YOUNG)) 5611 caps |= BIT(LRU_GEN_NONLEAF_YOUNG); 5612 5613 return sysfs_emit(buf, "0x%04x\n", caps); 5614 } 5615 5616 /* see Documentation/admin-guide/mm/multigen_lru.rst for details */ 5617 static ssize_t store_enabled(struct kobject *kobj, struct kobj_attribute *attr, 5618 const char *buf, size_t len) 5619 { 5620 int i; 5621 unsigned int caps; 5622 5623 if (tolower(*buf) == 'n') 5624 caps = 0; 5625 else if (tolower(*buf) == 'y') 5626 caps = -1; 5627 else if (kstrtouint(buf, 0, &caps)) 5628 return -EINVAL; 5629 5630 for (i = 0; i < NR_LRU_GEN_CAPS; i++) { 5631 bool enabled = caps & BIT(i); 5632 5633 if (i == LRU_GEN_CORE) 5634 lru_gen_change_state(enabled); 5635 else if (enabled) 5636 static_branch_enable(&lru_gen_caps[i]); 5637 else 5638 static_branch_disable(&lru_gen_caps[i]); 5639 } 5640 5641 return len; 5642 } 5643 5644 static struct kobj_attribute lru_gen_enabled_attr = __ATTR( 5645 enabled, 0644, show_enabled, store_enabled 5646 ); 5647 5648 static struct attribute *lru_gen_attrs[] = { 5649 &lru_gen_min_ttl_attr.attr, 5650 &lru_gen_enabled_attr.attr, 5651 NULL 5652 }; 5653 5654 static struct attribute_group lru_gen_attr_group = { 5655 .name = "lru_gen", 5656 .attrs = lru_gen_attrs, 5657 }; 5658 5659 /****************************************************************************** 5660 * debugfs interface 5661 ******************************************************************************/ 5662 5663 static void *lru_gen_seq_start(struct seq_file *m, loff_t *pos) 5664 { 5665 struct mem_cgroup *memcg; 5666 loff_t nr_to_skip = *pos; 5667 5668 m->private = kvmalloc(PATH_MAX, GFP_KERNEL); 5669 if (!m->private) 5670 return ERR_PTR(-ENOMEM); 5671 5672 memcg = mem_cgroup_iter(NULL, NULL, NULL); 5673 do { 5674 int nid; 5675 5676 for_each_node_state(nid, N_MEMORY) { 5677 if (!nr_to_skip--) 5678 return get_lruvec(memcg, nid); 5679 } 5680 } while ((memcg = mem_cgroup_iter(NULL, memcg, NULL))); 5681 5682 return NULL; 5683 } 5684 5685 static void lru_gen_seq_stop(struct seq_file *m, void *v) 5686 { 5687 if (!IS_ERR_OR_NULL(v)) 5688 mem_cgroup_iter_break(NULL, lruvec_memcg(v)); 5689 5690 kvfree(m->private); 5691 m->private = NULL; 5692 } 5693 5694 static void *lru_gen_seq_next(struct seq_file *m, void *v, loff_t *pos) 5695 { 5696 int nid = lruvec_pgdat(v)->node_id; 5697 struct mem_cgroup *memcg = lruvec_memcg(v); 5698 5699 ++*pos; 5700 5701 nid = next_memory_node(nid); 5702 if (nid == MAX_NUMNODES) { 5703 memcg = mem_cgroup_iter(NULL, memcg, NULL); 5704 if (!memcg) 5705 return NULL; 5706 5707 nid = first_memory_node; 5708 } 5709 5710 return get_lruvec(memcg, nid); 5711 } 5712 5713 static void lru_gen_seq_show_full(struct seq_file *m, struct lruvec *lruvec, 5714 unsigned long max_seq, unsigned long *min_seq, 5715 unsigned long seq) 5716 { 5717 int i; 5718 int type, tier; 5719 int hist = lru_hist_from_seq(seq); 5720 struct lru_gen_folio *lrugen = &lruvec->lrugen; 5721 5722 for (tier = 0; tier < MAX_NR_TIERS; tier++) { 5723 seq_printf(m, " %10d", tier); 5724 for (type = 0; type < ANON_AND_FILE; type++) { 5725 const char *s = " "; 5726 unsigned long n[3] = {}; 5727 5728 if (seq == max_seq) { 5729 s = "RT "; 5730 n[0] = READ_ONCE(lrugen->avg_refaulted[type][tier]); 5731 n[1] = READ_ONCE(lrugen->avg_total[type][tier]); 5732 } else if (seq == min_seq[type] || NR_HIST_GENS > 1) { 5733 s = "rep"; 5734 n[0] = atomic_long_read(&lrugen->refaulted[hist][type][tier]); 5735 n[1] = atomic_long_read(&lrugen->evicted[hist][type][tier]); 5736 if (tier) 5737 n[2] = READ_ONCE(lrugen->protected[hist][type][tier - 1]); 5738 } 5739 5740 for (i = 0; i < 3; i++) 5741 seq_printf(m, " %10lu%c", n[i], s[i]); 5742 } 5743 seq_putc(m, '\n'); 5744 } 5745 5746 seq_puts(m, " "); 5747 for (i = 0; i < NR_MM_STATS; i++) { 5748 const char *s = " "; 5749 unsigned long n = 0; 5750 5751 if (seq == max_seq && NR_HIST_GENS == 1) { 5752 s = "LOYNFA"; 5753 n = READ_ONCE(lruvec->mm_state.stats[hist][i]); 5754 } else if (seq != max_seq && NR_HIST_GENS > 1) { 5755 s = "loynfa"; 5756 n = READ_ONCE(lruvec->mm_state.stats[hist][i]); 5757 } 5758 5759 seq_printf(m, " %10lu%c", n, s[i]); 5760 } 5761 seq_putc(m, '\n'); 5762 } 5763 5764 /* see Documentation/admin-guide/mm/multigen_lru.rst for details */ 5765 static int lru_gen_seq_show(struct seq_file *m, void *v) 5766 { 5767 unsigned long seq; 5768 bool full = !debugfs_real_fops(m->file)->write; 5769 struct lruvec *lruvec = v; 5770 struct lru_gen_folio *lrugen = &lruvec->lrugen; 5771 int nid = lruvec_pgdat(lruvec)->node_id; 5772 struct mem_cgroup *memcg = lruvec_memcg(lruvec); 5773 DEFINE_MAX_SEQ(lruvec); 5774 DEFINE_MIN_SEQ(lruvec); 5775 5776 if (nid == first_memory_node) { 5777 const char *path = memcg ? m->private : ""; 5778 5779 #ifdef CONFIG_MEMCG 5780 if (memcg) 5781 cgroup_path(memcg->css.cgroup, m->private, PATH_MAX); 5782 #endif 5783 seq_printf(m, "memcg %5hu %s\n", mem_cgroup_id(memcg), path); 5784 } 5785 5786 seq_printf(m, " node %5d\n", nid); 5787 5788 if (!full) 5789 seq = min_seq[LRU_GEN_ANON]; 5790 else if (max_seq >= MAX_NR_GENS) 5791 seq = max_seq - MAX_NR_GENS + 1; 5792 else 5793 seq = 0; 5794 5795 for (; seq <= max_seq; seq++) { 5796 int type, zone; 5797 int gen = lru_gen_from_seq(seq); 5798 unsigned long birth = READ_ONCE(lruvec->lrugen.timestamps[gen]); 5799 5800 seq_printf(m, " %10lu %10u", seq, jiffies_to_msecs(jiffies - birth)); 5801 5802 for (type = 0; type < ANON_AND_FILE; type++) { 5803 unsigned long size = 0; 5804 char mark = full && seq < min_seq[type] ? 'x' : ' '; 5805 5806 for (zone = 0; zone < MAX_NR_ZONES; zone++) 5807 size += max(READ_ONCE(lrugen->nr_pages[gen][type][zone]), 0L); 5808 5809 seq_printf(m, " %10lu%c", size, mark); 5810 } 5811 5812 seq_putc(m, '\n'); 5813 5814 if (full) 5815 lru_gen_seq_show_full(m, lruvec, max_seq, min_seq, seq); 5816 } 5817 5818 return 0; 5819 } 5820 5821 static const struct seq_operations lru_gen_seq_ops = { 5822 .start = lru_gen_seq_start, 5823 .stop = lru_gen_seq_stop, 5824 .next = lru_gen_seq_next, 5825 .show = lru_gen_seq_show, 5826 }; 5827 5828 static int run_aging(struct lruvec *lruvec, unsigned long seq, struct scan_control *sc, 5829 bool can_swap, bool force_scan) 5830 { 5831 DEFINE_MAX_SEQ(lruvec); 5832 DEFINE_MIN_SEQ(lruvec); 5833 5834 if (seq < max_seq) 5835 return 0; 5836 5837 if (seq > max_seq) 5838 return -EINVAL; 5839 5840 if (!force_scan && min_seq[!can_swap] + MAX_NR_GENS - 1 <= max_seq) 5841 return -ERANGE; 5842 5843 try_to_inc_max_seq(lruvec, max_seq, sc, can_swap, force_scan); 5844 5845 return 0; 5846 } 5847 5848 static int run_eviction(struct lruvec *lruvec, unsigned long seq, struct scan_control *sc, 5849 int swappiness, unsigned long nr_to_reclaim) 5850 { 5851 DEFINE_MAX_SEQ(lruvec); 5852 5853 if (seq + MIN_NR_GENS > max_seq) 5854 return -EINVAL; 5855 5856 sc->nr_reclaimed = 0; 5857 5858 while (!signal_pending(current)) { 5859 DEFINE_MIN_SEQ(lruvec); 5860 5861 if (seq < min_seq[!swappiness]) 5862 return 0; 5863 5864 if (sc->nr_reclaimed >= nr_to_reclaim) 5865 return 0; 5866 5867 if (!evict_folios(lruvec, sc, swappiness)) 5868 return 0; 5869 5870 cond_resched(); 5871 } 5872 5873 return -EINTR; 5874 } 5875 5876 static int run_cmd(char cmd, int memcg_id, int nid, unsigned long seq, 5877 struct scan_control *sc, int swappiness, unsigned long opt) 5878 { 5879 struct lruvec *lruvec; 5880 int err = -EINVAL; 5881 struct mem_cgroup *memcg = NULL; 5882 5883 if (nid < 0 || nid >= MAX_NUMNODES || !node_state(nid, N_MEMORY)) 5884 return -EINVAL; 5885 5886 if (!mem_cgroup_disabled()) { 5887 rcu_read_lock(); 5888 5889 memcg = mem_cgroup_from_id(memcg_id); 5890 if (!mem_cgroup_tryget(memcg)) 5891 memcg = NULL; 5892 5893 rcu_read_unlock(); 5894 5895 if (!memcg) 5896 return -EINVAL; 5897 } 5898 5899 if (memcg_id != mem_cgroup_id(memcg)) 5900 goto done; 5901 5902 lruvec = get_lruvec(memcg, nid); 5903 5904 if (swappiness < 0) 5905 swappiness = get_swappiness(lruvec, sc); 5906 else if (swappiness > 200) 5907 goto done; 5908 5909 switch (cmd) { 5910 case '+': 5911 err = run_aging(lruvec, seq, sc, swappiness, opt); 5912 break; 5913 case '-': 5914 err = run_eviction(lruvec, seq, sc, swappiness, opt); 5915 break; 5916 } 5917 done: 5918 mem_cgroup_put(memcg); 5919 5920 return err; 5921 } 5922 5923 /* see Documentation/admin-guide/mm/multigen_lru.rst for details */ 5924 static ssize_t lru_gen_seq_write(struct file *file, const char __user *src, 5925 size_t len, loff_t *pos) 5926 { 5927 void *buf; 5928 char *cur, *next; 5929 unsigned int flags; 5930 struct blk_plug plug; 5931 int err = -EINVAL; 5932 struct scan_control sc = { 5933 .may_writepage = true, 5934 .may_unmap = true, 5935 .may_swap = true, 5936 .reclaim_idx = MAX_NR_ZONES - 1, 5937 .gfp_mask = GFP_KERNEL, 5938 }; 5939 5940 buf = kvmalloc(len + 1, GFP_KERNEL); 5941 if (!buf) 5942 return -ENOMEM; 5943 5944 if (copy_from_user(buf, src, len)) { 5945 kvfree(buf); 5946 return -EFAULT; 5947 } 5948 5949 set_task_reclaim_state(current, &sc.reclaim_state); 5950 flags = memalloc_noreclaim_save(); 5951 blk_start_plug(&plug); 5952 if (!set_mm_walk(NULL, true)) { 5953 err = -ENOMEM; 5954 goto done; 5955 } 5956 5957 next = buf; 5958 next[len] = '\0'; 5959 5960 while ((cur = strsep(&next, ",;\n"))) { 5961 int n; 5962 int end; 5963 char cmd; 5964 unsigned int memcg_id; 5965 unsigned int nid; 5966 unsigned long seq; 5967 unsigned int swappiness = -1; 5968 unsigned long opt = -1; 5969 5970 cur = skip_spaces(cur); 5971 if (!*cur) 5972 continue; 5973 5974 n = sscanf(cur, "%c %u %u %lu %n %u %n %lu %n", &cmd, &memcg_id, &nid, 5975 &seq, &end, &swappiness, &end, &opt, &end); 5976 if (n < 4 || cur[end]) { 5977 err = -EINVAL; 5978 break; 5979 } 5980 5981 err = run_cmd(cmd, memcg_id, nid, seq, &sc, swappiness, opt); 5982 if (err) 5983 break; 5984 } 5985 done: 5986 clear_mm_walk(); 5987 blk_finish_plug(&plug); 5988 memalloc_noreclaim_restore(flags); 5989 set_task_reclaim_state(current, NULL); 5990 5991 kvfree(buf); 5992 5993 return err ? : len; 5994 } 5995 5996 static int lru_gen_seq_open(struct inode *inode, struct file *file) 5997 { 5998 return seq_open(file, &lru_gen_seq_ops); 5999 } 6000 6001 static const struct file_operations lru_gen_rw_fops = { 6002 .open = lru_gen_seq_open, 6003 .read = seq_read, 6004 .write = lru_gen_seq_write, 6005 .llseek = seq_lseek, 6006 .release = seq_release, 6007 }; 6008 6009 static const struct file_operations lru_gen_ro_fops = { 6010 .open = lru_gen_seq_open, 6011 .read = seq_read, 6012 .llseek = seq_lseek, 6013 .release = seq_release, 6014 }; 6015 6016 /****************************************************************************** 6017 * initialization 6018 ******************************************************************************/ 6019 6020 void lru_gen_init_lruvec(struct lruvec *lruvec) 6021 { 6022 int i; 6023 int gen, type, zone; 6024 struct lru_gen_folio *lrugen = &lruvec->lrugen; 6025 6026 lrugen->max_seq = MIN_NR_GENS + 1; 6027 lrugen->enabled = lru_gen_enabled(); 6028 6029 for (i = 0; i <= MIN_NR_GENS + 1; i++) 6030 lrugen->timestamps[i] = jiffies; 6031 6032 for_each_gen_type_zone(gen, type, zone) 6033 INIT_LIST_HEAD(&lrugen->folios[gen][type][zone]); 6034 6035 lruvec->mm_state.seq = MIN_NR_GENS; 6036 init_waitqueue_head(&lruvec->mm_state.wait); 6037 } 6038 6039 #ifdef CONFIG_MEMCG 6040 6041 void lru_gen_init_pgdat(struct pglist_data *pgdat) 6042 { 6043 int i, j; 6044 6045 spin_lock_init(&pgdat->memcg_lru.lock); 6046 6047 for (i = 0; i < MEMCG_NR_GENS; i++) { 6048 for (j = 0; j < MEMCG_NR_BINS; j++) 6049 INIT_HLIST_NULLS_HEAD(&pgdat->memcg_lru.fifo[i][j], i); 6050 } 6051 } 6052 6053 void lru_gen_init_memcg(struct mem_cgroup *memcg) 6054 { 6055 INIT_LIST_HEAD(&memcg->mm_list.fifo); 6056 spin_lock_init(&memcg->mm_list.lock); 6057 } 6058 6059 void lru_gen_exit_memcg(struct mem_cgroup *memcg) 6060 { 6061 int i; 6062 int nid; 6063 6064 for_each_node(nid) { 6065 struct lruvec *lruvec = get_lruvec(memcg, nid); 6066 6067 VM_WARN_ON_ONCE(memchr_inv(lruvec->lrugen.nr_pages, 0, 6068 sizeof(lruvec->lrugen.nr_pages))); 6069 6070 for (i = 0; i < NR_BLOOM_FILTERS; i++) { 6071 bitmap_free(lruvec->mm_state.filters[i]); 6072 lruvec->mm_state.filters[i] = NULL; 6073 } 6074 } 6075 } 6076 6077 void lru_gen_online_memcg(struct mem_cgroup *memcg) 6078 { 6079 int gen; 6080 int nid; 6081 int bin = get_random_u32_below(MEMCG_NR_BINS); 6082 6083 for_each_node(nid) { 6084 struct pglist_data *pgdat = NODE_DATA(nid); 6085 struct lruvec *lruvec = get_lruvec(memcg, nid); 6086 6087 spin_lock(&pgdat->memcg_lru.lock); 6088 6089 VM_WARN_ON_ONCE(!hlist_nulls_unhashed(&lruvec->lrugen.list)); 6090 6091 gen = get_memcg_gen(pgdat->memcg_lru.seq); 6092 6093 hlist_nulls_add_tail_rcu(&lruvec->lrugen.list, &pgdat->memcg_lru.fifo[gen][bin]); 6094 pgdat->memcg_lru.nr_memcgs[gen]++; 6095 6096 lruvec->lrugen.gen = gen; 6097 6098 spin_unlock(&pgdat->memcg_lru.lock); 6099 } 6100 } 6101 6102 void lru_gen_offline_memcg(struct mem_cgroup *memcg) 6103 { 6104 int nid; 6105 6106 for_each_node(nid) { 6107 struct lruvec *lruvec = get_lruvec(memcg, nid); 6108 6109 lru_gen_rotate_memcg(lruvec, MEMCG_LRU_OLD); 6110 } 6111 } 6112 6113 void lru_gen_release_memcg(struct mem_cgroup *memcg) 6114 { 6115 int gen; 6116 int nid; 6117 6118 for_each_node(nid) { 6119 struct pglist_data *pgdat = NODE_DATA(nid); 6120 struct lruvec *lruvec = get_lruvec(memcg, nid); 6121 6122 spin_lock(&pgdat->memcg_lru.lock); 6123 6124 VM_WARN_ON_ONCE(hlist_nulls_unhashed(&lruvec->lrugen.list)); 6125 6126 gen = lruvec->lrugen.gen; 6127 6128 hlist_nulls_del_rcu(&lruvec->lrugen.list); 6129 pgdat->memcg_lru.nr_memcgs[gen]--; 6130 6131 if (!pgdat->memcg_lru.nr_memcgs[gen] && gen == get_memcg_gen(pgdat->memcg_lru.seq)) 6132 WRITE_ONCE(pgdat->memcg_lru.seq, pgdat->memcg_lru.seq + 1); 6133 6134 spin_unlock(&pgdat->memcg_lru.lock); 6135 } 6136 } 6137 6138 #endif /* CONFIG_MEMCG */ 6139 6140 static int __init init_lru_gen(void) 6141 { 6142 BUILD_BUG_ON(MIN_NR_GENS + 1 >= MAX_NR_GENS); 6143 BUILD_BUG_ON(BIT(LRU_GEN_WIDTH) <= MAX_NR_GENS); 6144 6145 if (sysfs_create_group(mm_kobj, &lru_gen_attr_group)) 6146 pr_err("lru_gen: failed to create sysfs group\n"); 6147 6148 debugfs_create_file("lru_gen", 0644, NULL, NULL, &lru_gen_rw_fops); 6149 debugfs_create_file("lru_gen_full", 0444, NULL, NULL, &lru_gen_ro_fops); 6150 6151 return 0; 6152 }; 6153 late_initcall(init_lru_gen); 6154 6155 #else /* !CONFIG_LRU_GEN */ 6156 6157 static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc) 6158 { 6159 } 6160 6161 static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) 6162 { 6163 } 6164 6165 static void lru_gen_shrink_node(struct pglist_data *pgdat, struct scan_control *sc) 6166 { 6167 } 6168 6169 #endif /* CONFIG_LRU_GEN */ 6170 6171 static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) 6172 { 6173 unsigned long nr[NR_LRU_LISTS]; 6174 unsigned long targets[NR_LRU_LISTS]; 6175 unsigned long nr_to_scan; 6176 enum lru_list lru; 6177 unsigned long nr_reclaimed = 0; 6178 unsigned long nr_to_reclaim = sc->nr_to_reclaim; 6179 bool proportional_reclaim; 6180 struct blk_plug plug; 6181 6182 if (lru_gen_enabled() && !global_reclaim(sc)) { 6183 lru_gen_shrink_lruvec(lruvec, sc); 6184 return; 6185 } 6186 6187 get_scan_count(lruvec, sc, nr); 6188 6189 /* Record the original scan target for proportional adjustments later */ 6190 memcpy(targets, nr, sizeof(nr)); 6191 6192 /* 6193 * Global reclaiming within direct reclaim at DEF_PRIORITY is a normal 6194 * event that can occur when there is little memory pressure e.g. 6195 * multiple streaming readers/writers. Hence, we do not abort scanning 6196 * when the requested number of pages are reclaimed when scanning at 6197 * DEF_PRIORITY on the assumption that the fact we are direct 6198 * reclaiming implies that kswapd is not keeping up and it is best to 6199 * do a batch of work at once. For memcg reclaim one check is made to 6200 * abort proportional reclaim if either the file or anon lru has already 6201 * dropped to zero at the first pass. 6202 */ 6203 proportional_reclaim = (!cgroup_reclaim(sc) && !current_is_kswapd() && 6204 sc->priority == DEF_PRIORITY); 6205 6206 blk_start_plug(&plug); 6207 while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] || 6208 nr[LRU_INACTIVE_FILE]) { 6209 unsigned long nr_anon, nr_file, percentage; 6210 unsigned long nr_scanned; 6211 6212 for_each_evictable_lru(lru) { 6213 if (nr[lru]) { 6214 nr_to_scan = min(nr[lru], SWAP_CLUSTER_MAX); 6215 nr[lru] -= nr_to_scan; 6216 6217 nr_reclaimed += shrink_list(lru, nr_to_scan, 6218 lruvec, sc); 6219 } 6220 } 6221 6222 cond_resched(); 6223 6224 if (nr_reclaimed < nr_to_reclaim || proportional_reclaim) 6225 continue; 6226 6227 /* 6228 * For kswapd and memcg, reclaim at least the number of pages 6229 * requested. Ensure that the anon and file LRUs are scanned 6230 * proportionally what was requested by get_scan_count(). We 6231 * stop reclaiming one LRU and reduce the amount scanning 6232 * proportional to the original scan target. 6233 */ 6234 nr_file = nr[LRU_INACTIVE_FILE] + nr[LRU_ACTIVE_FILE]; 6235 nr_anon = nr[LRU_INACTIVE_ANON] + nr[LRU_ACTIVE_ANON]; 6236 6237 /* 6238 * It's just vindictive to attack the larger once the smaller 6239 * has gone to zero. And given the way we stop scanning the 6240 * smaller below, this makes sure that we only make one nudge 6241 * towards proportionality once we've got nr_to_reclaim. 6242 */ 6243 if (!nr_file || !nr_anon) 6244 break; 6245 6246 if (nr_file > nr_anon) { 6247 unsigned long scan_target = targets[LRU_INACTIVE_ANON] + 6248 targets[LRU_ACTIVE_ANON] + 1; 6249 lru = LRU_BASE; 6250 percentage = nr_anon * 100 / scan_target; 6251 } else { 6252 unsigned long scan_target = targets[LRU_INACTIVE_FILE] + 6253 targets[LRU_ACTIVE_FILE] + 1; 6254 lru = LRU_FILE; 6255 percentage = nr_file * 100 / scan_target; 6256 } 6257 6258 /* Stop scanning the smaller of the LRU */ 6259 nr[lru] = 0; 6260 nr[lru + LRU_ACTIVE] = 0; 6261 6262 /* 6263 * Recalculate the other LRU scan count based on its original 6264 * scan target and the percentage scanning already complete 6265 */ 6266 lru = (lru == LRU_FILE) ? LRU_BASE : LRU_FILE; 6267 nr_scanned = targets[lru] - nr[lru]; 6268 nr[lru] = targets[lru] * (100 - percentage) / 100; 6269 nr[lru] -= min(nr[lru], nr_scanned); 6270 6271 lru += LRU_ACTIVE; 6272 nr_scanned = targets[lru] - nr[lru]; 6273 nr[lru] = targets[lru] * (100 - percentage) / 100; 6274 nr[lru] -= min(nr[lru], nr_scanned); 6275 } 6276 blk_finish_plug(&plug); 6277 sc->nr_reclaimed += nr_reclaimed; 6278 6279 /* 6280 * Even if we did not try to evict anon pages at all, we want to 6281 * rebalance the anon lru active/inactive ratio. 6282 */ 6283 if (can_age_anon_pages(lruvec_pgdat(lruvec), sc) && 6284 inactive_is_low(lruvec, LRU_INACTIVE_ANON)) 6285 shrink_active_list(SWAP_CLUSTER_MAX, lruvec, 6286 sc, LRU_ACTIVE_ANON); 6287 } 6288 6289 /* Use reclaim/compaction for costly allocs or under memory pressure */ 6290 static bool in_reclaim_compaction(struct scan_control *sc) 6291 { 6292 if (IS_ENABLED(CONFIG_COMPACTION) && sc->order && 6293 (sc->order > PAGE_ALLOC_COSTLY_ORDER || 6294 sc->priority < DEF_PRIORITY - 2)) 6295 return true; 6296 6297 return false; 6298 } 6299 6300 /* 6301 * Reclaim/compaction is used for high-order allocation requests. It reclaims 6302 * order-0 pages before compacting the zone. should_continue_reclaim() returns 6303 * true if more pages should be reclaimed such that when the page allocator 6304 * calls try_to_compact_pages() that it will have enough free pages to succeed. 6305 * It will give up earlier than that if there is difficulty reclaiming pages. 6306 */ 6307 static inline bool should_continue_reclaim(struct pglist_data *pgdat, 6308 unsigned long nr_reclaimed, 6309 struct scan_control *sc) 6310 { 6311 unsigned long pages_for_compaction; 6312 unsigned long inactive_lru_pages; 6313 int z; 6314 6315 /* If not in reclaim/compaction mode, stop */ 6316 if (!in_reclaim_compaction(sc)) 6317 return false; 6318 6319 /* 6320 * Stop if we failed to reclaim any pages from the last SWAP_CLUSTER_MAX 6321 * number of pages that were scanned. This will return to the caller 6322 * with the risk reclaim/compaction and the resulting allocation attempt 6323 * fails. In the past we have tried harder for __GFP_RETRY_MAYFAIL 6324 * allocations through requiring that the full LRU list has been scanned 6325 * first, by assuming that zero delta of sc->nr_scanned means full LRU 6326 * scan, but that approximation was wrong, and there were corner cases 6327 * where always a non-zero amount of pages were scanned. 6328 */ 6329 if (!nr_reclaimed) 6330 return false; 6331 6332 /* If compaction would go ahead or the allocation would succeed, stop */ 6333 for (z = 0; z <= sc->reclaim_idx; z++) { 6334 struct zone *zone = &pgdat->node_zones[z]; 6335 if (!managed_zone(zone)) 6336 continue; 6337 6338 switch (compaction_suitable(zone, sc->order, 0, sc->reclaim_idx)) { 6339 case COMPACT_SUCCESS: 6340 case COMPACT_CONTINUE: 6341 return false; 6342 default: 6343 /* check next zone */ 6344 ; 6345 } 6346 } 6347 6348 /* 6349 * If we have not reclaimed enough pages for compaction and the 6350 * inactive lists are large enough, continue reclaiming 6351 */ 6352 pages_for_compaction = compact_gap(sc->order); 6353 inactive_lru_pages = node_page_state(pgdat, NR_INACTIVE_FILE); 6354 if (can_reclaim_anon_pages(NULL, pgdat->node_id, sc)) 6355 inactive_lru_pages += node_page_state(pgdat, NR_INACTIVE_ANON); 6356 6357 return inactive_lru_pages > pages_for_compaction; 6358 } 6359 6360 static void shrink_node_memcgs(pg_data_t *pgdat, struct scan_control *sc) 6361 { 6362 struct mem_cgroup *target_memcg = sc->target_mem_cgroup; 6363 struct mem_cgroup *memcg; 6364 6365 memcg = mem_cgroup_iter(target_memcg, NULL, NULL); 6366 do { 6367 struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat); 6368 unsigned long reclaimed; 6369 unsigned long scanned; 6370 6371 /* 6372 * This loop can become CPU-bound when target memcgs 6373 * aren't eligible for reclaim - either because they 6374 * don't have any reclaimable pages, or because their 6375 * memory is explicitly protected. Avoid soft lockups. 6376 */ 6377 cond_resched(); 6378 6379 mem_cgroup_calculate_protection(target_memcg, memcg); 6380 6381 if (mem_cgroup_below_min(target_memcg, memcg)) { 6382 /* 6383 * Hard protection. 6384 * If there is no reclaimable memory, OOM. 6385 */ 6386 continue; 6387 } else if (mem_cgroup_below_low(target_memcg, memcg)) { 6388 /* 6389 * Soft protection. 6390 * Respect the protection only as long as 6391 * there is an unprotected supply 6392 * of reclaimable memory from other cgroups. 6393 */ 6394 if (!sc->memcg_low_reclaim) { 6395 sc->memcg_low_skipped = 1; 6396 continue; 6397 } 6398 memcg_memory_event(memcg, MEMCG_LOW); 6399 } 6400 6401 reclaimed = sc->nr_reclaimed; 6402 scanned = sc->nr_scanned; 6403 6404 shrink_lruvec(lruvec, sc); 6405 6406 shrink_slab(sc->gfp_mask, pgdat->node_id, memcg, 6407 sc->priority); 6408 6409 /* Record the group's reclaim efficiency */ 6410 if (!sc->proactive) 6411 vmpressure(sc->gfp_mask, memcg, false, 6412 sc->nr_scanned - scanned, 6413 sc->nr_reclaimed - reclaimed); 6414 6415 } while ((memcg = mem_cgroup_iter(target_memcg, memcg, NULL))); 6416 } 6417 6418 static void shrink_node(pg_data_t *pgdat, struct scan_control *sc) 6419 { 6420 struct reclaim_state *reclaim_state = current->reclaim_state; 6421 unsigned long nr_reclaimed, nr_scanned; 6422 struct lruvec *target_lruvec; 6423 bool reclaimable = false; 6424 6425 if (lru_gen_enabled() && global_reclaim(sc)) { 6426 lru_gen_shrink_node(pgdat, sc); 6427 return; 6428 } 6429 6430 target_lruvec = mem_cgroup_lruvec(sc->target_mem_cgroup, pgdat); 6431 6432 again: 6433 memset(&sc->nr, 0, sizeof(sc->nr)); 6434 6435 nr_reclaimed = sc->nr_reclaimed; 6436 nr_scanned = sc->nr_scanned; 6437 6438 prepare_scan_count(pgdat, sc); 6439 6440 shrink_node_memcgs(pgdat, sc); 6441 6442 if (reclaim_state) { 6443 sc->nr_reclaimed += reclaim_state->reclaimed_slab; 6444 reclaim_state->reclaimed_slab = 0; 6445 } 6446 6447 /* Record the subtree's reclaim efficiency */ 6448 if (!sc->proactive) 6449 vmpressure(sc->gfp_mask, sc->target_mem_cgroup, true, 6450 sc->nr_scanned - nr_scanned, 6451 sc->nr_reclaimed - nr_reclaimed); 6452 6453 if (sc->nr_reclaimed - nr_reclaimed) 6454 reclaimable = true; 6455 6456 if (current_is_kswapd()) { 6457 /* 6458 * If reclaim is isolating dirty pages under writeback, 6459 * it implies that the long-lived page allocation rate 6460 * is exceeding the page laundering rate. Either the 6461 * global limits are not being effective at throttling 6462 * processes due to the page distribution throughout 6463 * zones or there is heavy usage of a slow backing 6464 * device. The only option is to throttle from reclaim 6465 * context which is not ideal as there is no guarantee 6466 * the dirtying process is throttled in the same way 6467 * balance_dirty_pages() manages. 6468 * 6469 * Once a node is flagged PGDAT_WRITEBACK, kswapd will 6470 * count the number of pages under pages flagged for 6471 * immediate reclaim and stall if any are encountered 6472 * in the nr_immediate check below. 6473 */ 6474 if (sc->nr.writeback && sc->nr.writeback == sc->nr.taken) 6475 set_bit(PGDAT_WRITEBACK, &pgdat->flags); 6476 6477 /* Allow kswapd to start writing pages during reclaim.*/ 6478 if (sc->nr.unqueued_dirty == sc->nr.file_taken) 6479 set_bit(PGDAT_DIRTY, &pgdat->flags); 6480 6481 /* 6482 * If kswapd scans pages marked for immediate 6483 * reclaim and under writeback (nr_immediate), it 6484 * implies that pages are cycling through the LRU 6485 * faster than they are written so forcibly stall 6486 * until some pages complete writeback. 6487 */ 6488 if (sc->nr.immediate) 6489 reclaim_throttle(pgdat, VMSCAN_THROTTLE_WRITEBACK); 6490 } 6491 6492 /* 6493 * Tag a node/memcg as congested if all the dirty pages were marked 6494 * for writeback and immediate reclaim (counted in nr.congested). 6495 * 6496 * Legacy memcg will stall in page writeback so avoid forcibly 6497 * stalling in reclaim_throttle(). 6498 */ 6499 if ((current_is_kswapd() || 6500 (cgroup_reclaim(sc) && writeback_throttling_sane(sc))) && 6501 sc->nr.dirty && sc->nr.dirty == sc->nr.congested) 6502 set_bit(LRUVEC_CONGESTED, &target_lruvec->flags); 6503 6504 /* 6505 * Stall direct reclaim for IO completions if the lruvec is 6506 * node is congested. Allow kswapd to continue until it 6507 * starts encountering unqueued dirty pages or cycling through 6508 * the LRU too quickly. 6509 */ 6510 if (!current_is_kswapd() && current_may_throttle() && 6511 !sc->hibernation_mode && 6512 test_bit(LRUVEC_CONGESTED, &target_lruvec->flags)) 6513 reclaim_throttle(pgdat, VMSCAN_THROTTLE_CONGESTED); 6514 6515 if (should_continue_reclaim(pgdat, sc->nr_reclaimed - nr_reclaimed, 6516 sc)) 6517 goto again; 6518 6519 /* 6520 * Kswapd gives up on balancing particular nodes after too 6521 * many failures to reclaim anything from them and goes to 6522 * sleep. On reclaim progress, reset the failure counter. A 6523 * successful direct reclaim run will revive a dormant kswapd. 6524 */ 6525 if (reclaimable) 6526 pgdat->kswapd_failures = 0; 6527 } 6528 6529 /* 6530 * Returns true if compaction should go ahead for a costly-order request, or 6531 * the allocation would already succeed without compaction. Return false if we 6532 * should reclaim first. 6533 */ 6534 static inline bool compaction_ready(struct zone *zone, struct scan_control *sc) 6535 { 6536 unsigned long watermark; 6537 enum compact_result suitable; 6538 6539 suitable = compaction_suitable(zone, sc->order, 0, sc->reclaim_idx); 6540 if (suitable == COMPACT_SUCCESS) 6541 /* Allocation should succeed already. Don't reclaim. */ 6542 return true; 6543 if (suitable == COMPACT_SKIPPED) 6544 /* Compaction cannot yet proceed. Do reclaim. */ 6545 return false; 6546 6547 /* 6548 * Compaction is already possible, but it takes time to run and there 6549 * are potentially other callers using the pages just freed. So proceed 6550 * with reclaim to make a buffer of free pages available to give 6551 * compaction a reasonable chance of completing and allocating the page. 6552 * Note that we won't actually reclaim the whole buffer in one attempt 6553 * as the target watermark in should_continue_reclaim() is lower. But if 6554 * we are already above the high+gap watermark, don't reclaim at all. 6555 */ 6556 watermark = high_wmark_pages(zone) + compact_gap(sc->order); 6557 6558 return zone_watermark_ok_safe(zone, 0, watermark, sc->reclaim_idx); 6559 } 6560 6561 static void consider_reclaim_throttle(pg_data_t *pgdat, struct scan_control *sc) 6562 { 6563 /* 6564 * If reclaim is making progress greater than 12% efficiency then 6565 * wake all the NOPROGRESS throttled tasks. 6566 */ 6567 if (sc->nr_reclaimed > (sc->nr_scanned >> 3)) { 6568 wait_queue_head_t *wqh; 6569 6570 wqh = &pgdat->reclaim_wait[VMSCAN_THROTTLE_NOPROGRESS]; 6571 if (waitqueue_active(wqh)) 6572 wake_up(wqh); 6573 6574 return; 6575 } 6576 6577 /* 6578 * Do not throttle kswapd or cgroup reclaim on NOPROGRESS as it will 6579 * throttle on VMSCAN_THROTTLE_WRITEBACK if there are too many pages 6580 * under writeback and marked for immediate reclaim at the tail of the 6581 * LRU. 6582 */ 6583 if (current_is_kswapd() || cgroup_reclaim(sc)) 6584 return; 6585 6586 /* Throttle if making no progress at high prioities. */ 6587 if (sc->priority == 1 && !sc->nr_reclaimed) 6588 reclaim_throttle(pgdat, VMSCAN_THROTTLE_NOPROGRESS); 6589 } 6590 6591 /* 6592 * This is the direct reclaim path, for page-allocating processes. We only 6593 * try to reclaim pages from zones which will satisfy the caller's allocation 6594 * request. 6595 * 6596 * If a zone is deemed to be full of pinned pages then just give it a light 6597 * scan then give up on it. 6598 */ 6599 static void shrink_zones(struct zonelist *zonelist, struct scan_control *sc) 6600 { 6601 struct zoneref *z; 6602 struct zone *zone; 6603 unsigned long nr_soft_reclaimed; 6604 unsigned long nr_soft_scanned; 6605 gfp_t orig_mask; 6606 pg_data_t *last_pgdat = NULL; 6607 pg_data_t *first_pgdat = NULL; 6608 6609 /* 6610 * If the number of buffer_heads in the machine exceeds the maximum 6611 * allowed level, force direct reclaim to scan the highmem zone as 6612 * highmem pages could be pinning lowmem pages storing buffer_heads 6613 */ 6614 orig_mask = sc->gfp_mask; 6615 if (buffer_heads_over_limit) { 6616 sc->gfp_mask |= __GFP_HIGHMEM; 6617 sc->reclaim_idx = gfp_zone(sc->gfp_mask); 6618 } 6619 6620 for_each_zone_zonelist_nodemask(zone, z, zonelist, 6621 sc->reclaim_idx, sc->nodemask) { 6622 /* 6623 * Take care memory controller reclaiming has small influence 6624 * to global LRU. 6625 */ 6626 if (!cgroup_reclaim(sc)) { 6627 if (!cpuset_zone_allowed(zone, 6628 GFP_KERNEL | __GFP_HARDWALL)) 6629 continue; 6630 6631 /* 6632 * If we already have plenty of memory free for 6633 * compaction in this zone, don't free any more. 6634 * Even though compaction is invoked for any 6635 * non-zero order, only frequent costly order 6636 * reclamation is disruptive enough to become a 6637 * noticeable problem, like transparent huge 6638 * page allocations. 6639 */ 6640 if (IS_ENABLED(CONFIG_COMPACTION) && 6641 sc->order > PAGE_ALLOC_COSTLY_ORDER && 6642 compaction_ready(zone, sc)) { 6643 sc->compaction_ready = true; 6644 continue; 6645 } 6646 6647 /* 6648 * Shrink each node in the zonelist once. If the 6649 * zonelist is ordered by zone (not the default) then a 6650 * node may be shrunk multiple times but in that case 6651 * the user prefers lower zones being preserved. 6652 */ 6653 if (zone->zone_pgdat == last_pgdat) 6654 continue; 6655 6656 /* 6657 * This steals pages from memory cgroups over softlimit 6658 * and returns the number of reclaimed pages and 6659 * scanned pages. This works for global memory pressure 6660 * and balancing, not for a memcg's limit. 6661 */ 6662 nr_soft_scanned = 0; 6663 nr_soft_reclaimed = mem_cgroup_soft_limit_reclaim(zone->zone_pgdat, 6664 sc->order, sc->gfp_mask, 6665 &nr_soft_scanned); 6666 sc->nr_reclaimed += nr_soft_reclaimed; 6667 sc->nr_scanned += nr_soft_scanned; 6668 /* need some check for avoid more shrink_zone() */ 6669 } 6670 6671 if (!first_pgdat) 6672 first_pgdat = zone->zone_pgdat; 6673 6674 /* See comment about same check for global reclaim above */ 6675 if (zone->zone_pgdat == last_pgdat) 6676 continue; 6677 last_pgdat = zone->zone_pgdat; 6678 shrink_node(zone->zone_pgdat, sc); 6679 } 6680 6681 if (first_pgdat) 6682 consider_reclaim_throttle(first_pgdat, sc); 6683 6684 /* 6685 * Restore to original mask to avoid the impact on the caller if we 6686 * promoted it to __GFP_HIGHMEM. 6687 */ 6688 sc->gfp_mask = orig_mask; 6689 } 6690 6691 static void snapshot_refaults(struct mem_cgroup *target_memcg, pg_data_t *pgdat) 6692 { 6693 struct lruvec *target_lruvec; 6694 unsigned long refaults; 6695 6696 if (lru_gen_enabled()) 6697 return; 6698 6699 target_lruvec = mem_cgroup_lruvec(target_memcg, pgdat); 6700 refaults = lruvec_page_state(target_lruvec, WORKINGSET_ACTIVATE_ANON); 6701 target_lruvec->refaults[WORKINGSET_ANON] = refaults; 6702 refaults = lruvec_page_state(target_lruvec, WORKINGSET_ACTIVATE_FILE); 6703 target_lruvec->refaults[WORKINGSET_FILE] = refaults; 6704 } 6705 6706 /* 6707 * This is the main entry point to direct page reclaim. 6708 * 6709 * If a full scan of the inactive list fails to free enough memory then we 6710 * are "out of memory" and something needs to be killed. 6711 * 6712 * If the caller is !__GFP_FS then the probability of a failure is reasonably 6713 * high - the zone may be full of dirty or under-writeback pages, which this 6714 * caller can't do much about. We kick the writeback threads and take explicit 6715 * naps in the hope that some of these pages can be written. But if the 6716 * allocating task holds filesystem locks which prevent writeout this might not 6717 * work, and the allocation attempt will fail. 6718 * 6719 * returns: 0, if no pages reclaimed 6720 * else, the number of pages reclaimed 6721 */ 6722 static unsigned long do_try_to_free_pages(struct zonelist *zonelist, 6723 struct scan_control *sc) 6724 { 6725 int initial_priority = sc->priority; 6726 pg_data_t *last_pgdat; 6727 struct zoneref *z; 6728 struct zone *zone; 6729 retry: 6730 delayacct_freepages_start(); 6731 6732 if (!cgroup_reclaim(sc)) 6733 __count_zid_vm_events(ALLOCSTALL, sc->reclaim_idx, 1); 6734 6735 do { 6736 if (!sc->proactive) 6737 vmpressure_prio(sc->gfp_mask, sc->target_mem_cgroup, 6738 sc->priority); 6739 sc->nr_scanned = 0; 6740 shrink_zones(zonelist, sc); 6741 6742 if (sc->nr_reclaimed >= sc->nr_to_reclaim) 6743 break; 6744 6745 if (sc->compaction_ready) 6746 break; 6747 6748 /* 6749 * If we're getting trouble reclaiming, start doing 6750 * writepage even in laptop mode. 6751 */ 6752 if (sc->priority < DEF_PRIORITY - 2) 6753 sc->may_writepage = 1; 6754 } while (--sc->priority >= 0); 6755 6756 last_pgdat = NULL; 6757 for_each_zone_zonelist_nodemask(zone, z, zonelist, sc->reclaim_idx, 6758 sc->nodemask) { 6759 if (zone->zone_pgdat == last_pgdat) 6760 continue; 6761 last_pgdat = zone->zone_pgdat; 6762 6763 snapshot_refaults(sc->target_mem_cgroup, zone->zone_pgdat); 6764 6765 if (cgroup_reclaim(sc)) { 6766 struct lruvec *lruvec; 6767 6768 lruvec = mem_cgroup_lruvec(sc->target_mem_cgroup, 6769 zone->zone_pgdat); 6770 clear_bit(LRUVEC_CONGESTED, &lruvec->flags); 6771 } 6772 } 6773 6774 delayacct_freepages_end(); 6775 6776 if (sc->nr_reclaimed) 6777 return sc->nr_reclaimed; 6778 6779 /* Aborted reclaim to try compaction? don't OOM, then */ 6780 if (sc->compaction_ready) 6781 return 1; 6782 6783 /* 6784 * We make inactive:active ratio decisions based on the node's 6785 * composition of memory, but a restrictive reclaim_idx or a 6786 * memory.low cgroup setting can exempt large amounts of 6787 * memory from reclaim. Neither of which are very common, so 6788 * instead of doing costly eligibility calculations of the 6789 * entire cgroup subtree up front, we assume the estimates are 6790 * good, and retry with forcible deactivation if that fails. 6791 */ 6792 if (sc->skipped_deactivate) { 6793 sc->priority = initial_priority; 6794 sc->force_deactivate = 1; 6795 sc->skipped_deactivate = 0; 6796 goto retry; 6797 } 6798 6799 /* Untapped cgroup reserves? Don't OOM, retry. */ 6800 if (sc->memcg_low_skipped) { 6801 sc->priority = initial_priority; 6802 sc->force_deactivate = 0; 6803 sc->memcg_low_reclaim = 1; 6804 sc->memcg_low_skipped = 0; 6805 goto retry; 6806 } 6807 6808 return 0; 6809 } 6810 6811 static bool allow_direct_reclaim(pg_data_t *pgdat) 6812 { 6813 struct zone *zone; 6814 unsigned long pfmemalloc_reserve = 0; 6815 unsigned long free_pages = 0; 6816 int i; 6817 bool wmark_ok; 6818 6819 if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES) 6820 return true; 6821 6822 for (i = 0; i <= ZONE_NORMAL; i++) { 6823 zone = &pgdat->node_zones[i]; 6824 if (!managed_zone(zone)) 6825 continue; 6826 6827 if (!zone_reclaimable_pages(zone)) 6828 continue; 6829 6830 pfmemalloc_reserve += min_wmark_pages(zone); 6831 free_pages += zone_page_state(zone, NR_FREE_PAGES); 6832 } 6833 6834 /* If there are no reserves (unexpected config) then do not throttle */ 6835 if (!pfmemalloc_reserve) 6836 return true; 6837 6838 wmark_ok = free_pages > pfmemalloc_reserve / 2; 6839 6840 /* kswapd must be awake if processes are being throttled */ 6841 if (!wmark_ok && waitqueue_active(&pgdat->kswapd_wait)) { 6842 if (READ_ONCE(pgdat->kswapd_highest_zoneidx) > ZONE_NORMAL) 6843 WRITE_ONCE(pgdat->kswapd_highest_zoneidx, ZONE_NORMAL); 6844 6845 wake_up_interruptible(&pgdat->kswapd_wait); 6846 } 6847 6848 return wmark_ok; 6849 } 6850 6851 /* 6852 * Throttle direct reclaimers if backing storage is backed by the network 6853 * and the PFMEMALLOC reserve for the preferred node is getting dangerously 6854 * depleted. kswapd will continue to make progress and wake the processes 6855 * when the low watermark is reached. 6856 * 6857 * Returns true if a fatal signal was delivered during throttling. If this 6858 * happens, the page allocator should not consider triggering the OOM killer. 6859 */ 6860 static bool throttle_direct_reclaim(gfp_t gfp_mask, struct zonelist *zonelist, 6861 nodemask_t *nodemask) 6862 { 6863 struct zoneref *z; 6864 struct zone *zone; 6865 pg_data_t *pgdat = NULL; 6866 6867 /* 6868 * Kernel threads should not be throttled as they may be indirectly 6869 * responsible for cleaning pages necessary for reclaim to make forward 6870 * progress. kjournald for example may enter direct reclaim while 6871 * committing a transaction where throttling it could forcing other 6872 * processes to block on log_wait_commit(). 6873 */ 6874 if (current->flags & PF_KTHREAD) 6875 goto out; 6876 6877 /* 6878 * If a fatal signal is pending, this process should not throttle. 6879 * It should return quickly so it can exit and free its memory 6880 */ 6881 if (fatal_signal_pending(current)) 6882 goto out; 6883 6884 /* 6885 * Check if the pfmemalloc reserves are ok by finding the first node 6886 * with a usable ZONE_NORMAL or lower zone. The expectation is that 6887 * GFP_KERNEL will be required for allocating network buffers when 6888 * swapping over the network so ZONE_HIGHMEM is unusable. 6889 * 6890 * Throttling is based on the first usable node and throttled processes 6891 * wait on a queue until kswapd makes progress and wakes them. There 6892 * is an affinity then between processes waking up and where reclaim 6893 * progress has been made assuming the process wakes on the same node. 6894 * More importantly, processes running on remote nodes will not compete 6895 * for remote pfmemalloc reserves and processes on different nodes 6896 * should make reasonable progress. 6897 */ 6898 for_each_zone_zonelist_nodemask(zone, z, zonelist, 6899 gfp_zone(gfp_mask), nodemask) { 6900 if (zone_idx(zone) > ZONE_NORMAL) 6901 continue; 6902 6903 /* Throttle based on the first usable node */ 6904 pgdat = zone->zone_pgdat; 6905 if (allow_direct_reclaim(pgdat)) 6906 goto out; 6907 break; 6908 } 6909 6910 /* If no zone was usable by the allocation flags then do not throttle */ 6911 if (!pgdat) 6912 goto out; 6913 6914 /* Account for the throttling */ 6915 count_vm_event(PGSCAN_DIRECT_THROTTLE); 6916 6917 /* 6918 * If the caller cannot enter the filesystem, it's possible that it 6919 * is due to the caller holding an FS lock or performing a journal 6920 * transaction in the case of a filesystem like ext[3|4]. In this case, 6921 * it is not safe to block on pfmemalloc_wait as kswapd could be 6922 * blocked waiting on the same lock. Instead, throttle for up to a 6923 * second before continuing. 6924 */ 6925 if (!(gfp_mask & __GFP_FS)) 6926 wait_event_interruptible_timeout(pgdat->pfmemalloc_wait, 6927 allow_direct_reclaim(pgdat), HZ); 6928 else 6929 /* Throttle until kswapd wakes the process */ 6930 wait_event_killable(zone->zone_pgdat->pfmemalloc_wait, 6931 allow_direct_reclaim(pgdat)); 6932 6933 if (fatal_signal_pending(current)) 6934 return true; 6935 6936 out: 6937 return false; 6938 } 6939 6940 unsigned long try_to_free_pages(struct zonelist *zonelist, int order, 6941 gfp_t gfp_mask, nodemask_t *nodemask) 6942 { 6943 unsigned long nr_reclaimed; 6944 struct scan_control sc = { 6945 .nr_to_reclaim = SWAP_CLUSTER_MAX, 6946 .gfp_mask = current_gfp_context(gfp_mask), 6947 .reclaim_idx = gfp_zone(gfp_mask), 6948 .order = order, 6949 .nodemask = nodemask, 6950 .priority = DEF_PRIORITY, 6951 .may_writepage = !laptop_mode, 6952 .may_unmap = 1, 6953 .may_swap = 1, 6954 }; 6955 6956 /* 6957 * scan_control uses s8 fields for order, priority, and reclaim_idx. 6958 * Confirm they are large enough for max values. 6959 */ 6960 BUILD_BUG_ON(MAX_ORDER > S8_MAX); 6961 BUILD_BUG_ON(DEF_PRIORITY > S8_MAX); 6962 BUILD_BUG_ON(MAX_NR_ZONES > S8_MAX); 6963 6964 /* 6965 * Do not enter reclaim if fatal signal was delivered while throttled. 6966 * 1 is returned so that the page allocator does not OOM kill at this 6967 * point. 6968 */ 6969 if (throttle_direct_reclaim(sc.gfp_mask, zonelist, nodemask)) 6970 return 1; 6971 6972 set_task_reclaim_state(current, &sc.reclaim_state); 6973 trace_mm_vmscan_direct_reclaim_begin(order, sc.gfp_mask); 6974 6975 nr_reclaimed = do_try_to_free_pages(zonelist, &sc); 6976 6977 trace_mm_vmscan_direct_reclaim_end(nr_reclaimed); 6978 set_task_reclaim_state(current, NULL); 6979 6980 return nr_reclaimed; 6981 } 6982 6983 #ifdef CONFIG_MEMCG 6984 6985 /* Only used by soft limit reclaim. Do not reuse for anything else. */ 6986 unsigned long mem_cgroup_shrink_node(struct mem_cgroup *memcg, 6987 gfp_t gfp_mask, bool noswap, 6988 pg_data_t *pgdat, 6989 unsigned long *nr_scanned) 6990 { 6991 struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat); 6992 struct scan_control sc = { 6993 .nr_to_reclaim = SWAP_CLUSTER_MAX, 6994 .target_mem_cgroup = memcg, 6995 .may_writepage = !laptop_mode, 6996 .may_unmap = 1, 6997 .reclaim_idx = MAX_NR_ZONES - 1, 6998 .may_swap = !noswap, 6999 }; 7000 7001 WARN_ON_ONCE(!current->reclaim_state); 7002 7003 sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) | 7004 (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK); 7005 7006 trace_mm_vmscan_memcg_softlimit_reclaim_begin(sc.order, 7007 sc.gfp_mask); 7008 7009 /* 7010 * NOTE: Although we can get the priority field, using it 7011 * here is not a good idea, since it limits the pages we can scan. 7012 * if we don't reclaim here, the shrink_node from balance_pgdat 7013 * will pick up pages from other mem cgroup's as well. We hack 7014 * the priority and make it zero. 7015 */ 7016 shrink_lruvec(lruvec, &sc); 7017 7018 trace_mm_vmscan_memcg_softlimit_reclaim_end(sc.nr_reclaimed); 7019 7020 *nr_scanned = sc.nr_scanned; 7021 7022 return sc.nr_reclaimed; 7023 } 7024 7025 unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg, 7026 unsigned long nr_pages, 7027 gfp_t gfp_mask, 7028 unsigned int reclaim_options) 7029 { 7030 unsigned long nr_reclaimed; 7031 unsigned int noreclaim_flag; 7032 struct scan_control sc = { 7033 .nr_to_reclaim = max(nr_pages, SWAP_CLUSTER_MAX), 7034 .gfp_mask = (current_gfp_context(gfp_mask) & GFP_RECLAIM_MASK) | 7035 (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK), 7036 .reclaim_idx = MAX_NR_ZONES - 1, 7037 .target_mem_cgroup = memcg, 7038 .priority = DEF_PRIORITY, 7039 .may_writepage = !laptop_mode, 7040 .may_unmap = 1, 7041 .may_swap = !!(reclaim_options & MEMCG_RECLAIM_MAY_SWAP), 7042 .proactive = !!(reclaim_options & MEMCG_RECLAIM_PROACTIVE), 7043 }; 7044 /* 7045 * Traverse the ZONELIST_FALLBACK zonelist of the current node to put 7046 * equal pressure on all the nodes. This is based on the assumption that 7047 * the reclaim does not bail out early. 7048 */ 7049 struct zonelist *zonelist = node_zonelist(numa_node_id(), sc.gfp_mask); 7050 7051 set_task_reclaim_state(current, &sc.reclaim_state); 7052 trace_mm_vmscan_memcg_reclaim_begin(0, sc.gfp_mask); 7053 noreclaim_flag = memalloc_noreclaim_save(); 7054 7055 nr_reclaimed = do_try_to_free_pages(zonelist, &sc); 7056 7057 memalloc_noreclaim_restore(noreclaim_flag); 7058 trace_mm_vmscan_memcg_reclaim_end(nr_reclaimed); 7059 set_task_reclaim_state(current, NULL); 7060 7061 return nr_reclaimed; 7062 } 7063 #endif 7064 7065 static void kswapd_age_node(struct pglist_data *pgdat, struct scan_control *sc) 7066 { 7067 struct mem_cgroup *memcg; 7068 struct lruvec *lruvec; 7069 7070 if (lru_gen_enabled()) { 7071 lru_gen_age_node(pgdat, sc); 7072 return; 7073 } 7074 7075 if (!can_age_anon_pages(pgdat, sc)) 7076 return; 7077 7078 lruvec = mem_cgroup_lruvec(NULL, pgdat); 7079 if (!inactive_is_low(lruvec, LRU_INACTIVE_ANON)) 7080 return; 7081 7082 memcg = mem_cgroup_iter(NULL, NULL, NULL); 7083 do { 7084 lruvec = mem_cgroup_lruvec(memcg, pgdat); 7085 shrink_active_list(SWAP_CLUSTER_MAX, lruvec, 7086 sc, LRU_ACTIVE_ANON); 7087 memcg = mem_cgroup_iter(NULL, memcg, NULL); 7088 } while (memcg); 7089 } 7090 7091 static bool pgdat_watermark_boosted(pg_data_t *pgdat, int highest_zoneidx) 7092 { 7093 int i; 7094 struct zone *zone; 7095 7096 /* 7097 * Check for watermark boosts top-down as the higher zones 7098 * are more likely to be boosted. Both watermarks and boosts 7099 * should not be checked at the same time as reclaim would 7100 * start prematurely when there is no boosting and a lower 7101 * zone is balanced. 7102 */ 7103 for (i = highest_zoneidx; i >= 0; i--) { 7104 zone = pgdat->node_zones + i; 7105 if (!managed_zone(zone)) 7106 continue; 7107 7108 if (zone->watermark_boost) 7109 return true; 7110 } 7111 7112 return false; 7113 } 7114 7115 /* 7116 * Returns true if there is an eligible zone balanced for the request order 7117 * and highest_zoneidx 7118 */ 7119 static bool pgdat_balanced(pg_data_t *pgdat, int order, int highest_zoneidx) 7120 { 7121 int i; 7122 unsigned long mark = -1; 7123 struct zone *zone; 7124 7125 /* 7126 * Check watermarks bottom-up as lower zones are more likely to 7127 * meet watermarks. 7128 */ 7129 for (i = 0; i <= highest_zoneidx; i++) { 7130 zone = pgdat->node_zones + i; 7131 7132 if (!managed_zone(zone)) 7133 continue; 7134 7135 if (sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING) 7136 mark = wmark_pages(zone, WMARK_PROMO); 7137 else 7138 mark = high_wmark_pages(zone); 7139 if (zone_watermark_ok_safe(zone, order, mark, highest_zoneidx)) 7140 return true; 7141 } 7142 7143 /* 7144 * If a node has no managed zone within highest_zoneidx, it does not 7145 * need balancing by definition. This can happen if a zone-restricted 7146 * allocation tries to wake a remote kswapd. 7147 */ 7148 if (mark == -1) 7149 return true; 7150 7151 return false; 7152 } 7153 7154 /* Clear pgdat state for congested, dirty or under writeback. */ 7155 static void clear_pgdat_congested(pg_data_t *pgdat) 7156 { 7157 struct lruvec *lruvec = mem_cgroup_lruvec(NULL, pgdat); 7158 7159 clear_bit(LRUVEC_CONGESTED, &lruvec->flags); 7160 clear_bit(PGDAT_DIRTY, &pgdat->flags); 7161 clear_bit(PGDAT_WRITEBACK, &pgdat->flags); 7162 } 7163 7164 /* 7165 * Prepare kswapd for sleeping. This verifies that there are no processes 7166 * waiting in throttle_direct_reclaim() and that watermarks have been met. 7167 * 7168 * Returns true if kswapd is ready to sleep 7169 */ 7170 static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, 7171 int highest_zoneidx) 7172 { 7173 /* 7174 * The throttled processes are normally woken up in balance_pgdat() as 7175 * soon as allow_direct_reclaim() is true. But there is a potential 7176 * race between when kswapd checks the watermarks and a process gets 7177 * throttled. There is also a potential race if processes get 7178 * throttled, kswapd wakes, a large process exits thereby balancing the 7179 * zones, which causes kswapd to exit balance_pgdat() before reaching 7180 * the wake up checks. If kswapd is going to sleep, no process should 7181 * be sleeping on pfmemalloc_wait, so wake them now if necessary. If 7182 * the wake up is premature, processes will wake kswapd and get 7183 * throttled again. The difference from wake ups in balance_pgdat() is 7184 * that here we are under prepare_to_wait(). 7185 */ 7186 if (waitqueue_active(&pgdat->pfmemalloc_wait)) 7187 wake_up_all(&pgdat->pfmemalloc_wait); 7188 7189 /* Hopeless node, leave it to direct reclaim */ 7190 if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES) 7191 return true; 7192 7193 if (pgdat_balanced(pgdat, order, highest_zoneidx)) { 7194 clear_pgdat_congested(pgdat); 7195 return true; 7196 } 7197 7198 return false; 7199 } 7200 7201 /* 7202 * kswapd shrinks a node of pages that are at or below the highest usable 7203 * zone that is currently unbalanced. 7204 * 7205 * Returns true if kswapd scanned at least the requested number of pages to 7206 * reclaim or if the lack of progress was due to pages under writeback. 7207 * This is used to determine if the scanning priority needs to be raised. 7208 */ 7209 static bool kswapd_shrink_node(pg_data_t *pgdat, 7210 struct scan_control *sc) 7211 { 7212 struct zone *zone; 7213 int z; 7214 7215 /* Reclaim a number of pages proportional to the number of zones */ 7216 sc->nr_to_reclaim = 0; 7217 for (z = 0; z <= sc->reclaim_idx; z++) { 7218 zone = pgdat->node_zones + z; 7219 if (!managed_zone(zone)) 7220 continue; 7221 7222 sc->nr_to_reclaim += max(high_wmark_pages(zone), SWAP_CLUSTER_MAX); 7223 } 7224 7225 /* 7226 * Historically care was taken to put equal pressure on all zones but 7227 * now pressure is applied based on node LRU order. 7228 */ 7229 shrink_node(pgdat, sc); 7230 7231 /* 7232 * Fragmentation may mean that the system cannot be rebalanced for 7233 * high-order allocations. If twice the allocation size has been 7234 * reclaimed then recheck watermarks only at order-0 to prevent 7235 * excessive reclaim. Assume that a process requested a high-order 7236 * can direct reclaim/compact. 7237 */ 7238 if (sc->order && sc->nr_reclaimed >= compact_gap(sc->order)) 7239 sc->order = 0; 7240 7241 return sc->nr_scanned >= sc->nr_to_reclaim; 7242 } 7243 7244 /* Page allocator PCP high watermark is lowered if reclaim is active. */ 7245 static inline void 7246 update_reclaim_active(pg_data_t *pgdat, int highest_zoneidx, bool active) 7247 { 7248 int i; 7249 struct zone *zone; 7250 7251 for (i = 0; i <= highest_zoneidx; i++) { 7252 zone = pgdat->node_zones + i; 7253 7254 if (!managed_zone(zone)) 7255 continue; 7256 7257 if (active) 7258 set_bit(ZONE_RECLAIM_ACTIVE, &zone->flags); 7259 else 7260 clear_bit(ZONE_RECLAIM_ACTIVE, &zone->flags); 7261 } 7262 } 7263 7264 static inline void 7265 set_reclaim_active(pg_data_t *pgdat, int highest_zoneidx) 7266 { 7267 update_reclaim_active(pgdat, highest_zoneidx, true); 7268 } 7269 7270 static inline void 7271 clear_reclaim_active(pg_data_t *pgdat, int highest_zoneidx) 7272 { 7273 update_reclaim_active(pgdat, highest_zoneidx, false); 7274 } 7275 7276 /* 7277 * For kswapd, balance_pgdat() will reclaim pages across a node from zones 7278 * that are eligible for use by the caller until at least one zone is 7279 * balanced. 7280 * 7281 * Returns the order kswapd finished reclaiming at. 7282 * 7283 * kswapd scans the zones in the highmem->normal->dma direction. It skips 7284 * zones which have free_pages > high_wmark_pages(zone), but once a zone is 7285 * found to have free_pages <= high_wmark_pages(zone), any page in that zone 7286 * or lower is eligible for reclaim until at least one usable zone is 7287 * balanced. 7288 */ 7289 static int balance_pgdat(pg_data_t *pgdat, int order, int highest_zoneidx) 7290 { 7291 int i; 7292 unsigned long nr_soft_reclaimed; 7293 unsigned long nr_soft_scanned; 7294 unsigned long pflags; 7295 unsigned long nr_boost_reclaim; 7296 unsigned long zone_boosts[MAX_NR_ZONES] = { 0, }; 7297 bool boosted; 7298 struct zone *zone; 7299 struct scan_control sc = { 7300 .gfp_mask = GFP_KERNEL, 7301 .order = order, 7302 .may_unmap = 1, 7303 }; 7304 7305 set_task_reclaim_state(current, &sc.reclaim_state); 7306 psi_memstall_enter(&pflags); 7307 __fs_reclaim_acquire(_THIS_IP_); 7308 7309 count_vm_event(PAGEOUTRUN); 7310 7311 /* 7312 * Account for the reclaim boost. Note that the zone boost is left in 7313 * place so that parallel allocations that are near the watermark will 7314 * stall or direct reclaim until kswapd is finished. 7315 */ 7316 nr_boost_reclaim = 0; 7317 for (i = 0; i <= highest_zoneidx; i++) { 7318 zone = pgdat->node_zones + i; 7319 if (!managed_zone(zone)) 7320 continue; 7321 7322 nr_boost_reclaim += zone->watermark_boost; 7323 zone_boosts[i] = zone->watermark_boost; 7324 } 7325 boosted = nr_boost_reclaim; 7326 7327 restart: 7328 set_reclaim_active(pgdat, highest_zoneidx); 7329 sc.priority = DEF_PRIORITY; 7330 do { 7331 unsigned long nr_reclaimed = sc.nr_reclaimed; 7332 bool raise_priority = true; 7333 bool balanced; 7334 bool ret; 7335 7336 sc.reclaim_idx = highest_zoneidx; 7337 7338 /* 7339 * If the number of buffer_heads exceeds the maximum allowed 7340 * then consider reclaiming from all zones. This has a dual 7341 * purpose -- on 64-bit systems it is expected that 7342 * buffer_heads are stripped during active rotation. On 32-bit 7343 * systems, highmem pages can pin lowmem memory and shrinking 7344 * buffers can relieve lowmem pressure. Reclaim may still not 7345 * go ahead if all eligible zones for the original allocation 7346 * request are balanced to avoid excessive reclaim from kswapd. 7347 */ 7348 if (buffer_heads_over_limit) { 7349 for (i = MAX_NR_ZONES - 1; i >= 0; i--) { 7350 zone = pgdat->node_zones + i; 7351 if (!managed_zone(zone)) 7352 continue; 7353 7354 sc.reclaim_idx = i; 7355 break; 7356 } 7357 } 7358 7359 /* 7360 * If the pgdat is imbalanced then ignore boosting and preserve 7361 * the watermarks for a later time and restart. Note that the 7362 * zone watermarks will be still reset at the end of balancing 7363 * on the grounds that the normal reclaim should be enough to 7364 * re-evaluate if boosting is required when kswapd next wakes. 7365 */ 7366 balanced = pgdat_balanced(pgdat, sc.order, highest_zoneidx); 7367 if (!balanced && nr_boost_reclaim) { 7368 nr_boost_reclaim = 0; 7369 goto restart; 7370 } 7371 7372 /* 7373 * If boosting is not active then only reclaim if there are no 7374 * eligible zones. Note that sc.reclaim_idx is not used as 7375 * buffer_heads_over_limit may have adjusted it. 7376 */ 7377 if (!nr_boost_reclaim && balanced) 7378 goto out; 7379 7380 /* Limit the priority of boosting to avoid reclaim writeback */ 7381 if (nr_boost_reclaim && sc.priority == DEF_PRIORITY - 2) 7382 raise_priority = false; 7383 7384 /* 7385 * Do not writeback or swap pages for boosted reclaim. The 7386 * intent is to relieve pressure not issue sub-optimal IO 7387 * from reclaim context. If no pages are reclaimed, the 7388 * reclaim will be aborted. 7389 */ 7390 sc.may_writepage = !laptop_mode && !nr_boost_reclaim; 7391 sc.may_swap = !nr_boost_reclaim; 7392 7393 /* 7394 * Do some background aging, to give pages a chance to be 7395 * referenced before reclaiming. All pages are rotated 7396 * regardless of classzone as this is about consistent aging. 7397 */ 7398 kswapd_age_node(pgdat, &sc); 7399 7400 /* 7401 * If we're getting trouble reclaiming, start doing writepage 7402 * even in laptop mode. 7403 */ 7404 if (sc.priority < DEF_PRIORITY - 2) 7405 sc.may_writepage = 1; 7406 7407 /* Call soft limit reclaim before calling shrink_node. */ 7408 sc.nr_scanned = 0; 7409 nr_soft_scanned = 0; 7410 nr_soft_reclaimed = mem_cgroup_soft_limit_reclaim(pgdat, sc.order, 7411 sc.gfp_mask, &nr_soft_scanned); 7412 sc.nr_reclaimed += nr_soft_reclaimed; 7413 7414 /* 7415 * There should be no need to raise the scanning priority if 7416 * enough pages are already being scanned that that high 7417 * watermark would be met at 100% efficiency. 7418 */ 7419 if (kswapd_shrink_node(pgdat, &sc)) 7420 raise_priority = false; 7421 7422 /* 7423 * If the low watermark is met there is no need for processes 7424 * to be throttled on pfmemalloc_wait as they should not be 7425 * able to safely make forward progress. Wake them 7426 */ 7427 if (waitqueue_active(&pgdat->pfmemalloc_wait) && 7428 allow_direct_reclaim(pgdat)) 7429 wake_up_all(&pgdat->pfmemalloc_wait); 7430 7431 /* Check if kswapd should be suspending */ 7432 __fs_reclaim_release(_THIS_IP_); 7433 ret = try_to_freeze(); 7434 __fs_reclaim_acquire(_THIS_IP_); 7435 if (ret || kthread_should_stop()) 7436 break; 7437 7438 /* 7439 * Raise priority if scanning rate is too low or there was no 7440 * progress in reclaiming pages 7441 */ 7442 nr_reclaimed = sc.nr_reclaimed - nr_reclaimed; 7443 nr_boost_reclaim -= min(nr_boost_reclaim, nr_reclaimed); 7444 7445 /* 7446 * If reclaim made no progress for a boost, stop reclaim as 7447 * IO cannot be queued and it could be an infinite loop in 7448 * extreme circumstances. 7449 */ 7450 if (nr_boost_reclaim && !nr_reclaimed) 7451 break; 7452 7453 if (raise_priority || !nr_reclaimed) 7454 sc.priority--; 7455 } while (sc.priority >= 1); 7456 7457 if (!sc.nr_reclaimed) 7458 pgdat->kswapd_failures++; 7459 7460 out: 7461 clear_reclaim_active(pgdat, highest_zoneidx); 7462 7463 /* If reclaim was boosted, account for the reclaim done in this pass */ 7464 if (boosted) { 7465 unsigned long flags; 7466 7467 for (i = 0; i <= highest_zoneidx; i++) { 7468 if (!zone_boosts[i]) 7469 continue; 7470 7471 /* Increments are under the zone lock */ 7472 zone = pgdat->node_zones + i; 7473 spin_lock_irqsave(&zone->lock, flags); 7474 zone->watermark_boost -= min(zone->watermark_boost, zone_boosts[i]); 7475 spin_unlock_irqrestore(&zone->lock, flags); 7476 } 7477 7478 /* 7479 * As there is now likely space, wakeup kcompact to defragment 7480 * pageblocks. 7481 */ 7482 wakeup_kcompactd(pgdat, pageblock_order, highest_zoneidx); 7483 } 7484 7485 snapshot_refaults(NULL, pgdat); 7486 __fs_reclaim_release(_THIS_IP_); 7487 psi_memstall_leave(&pflags); 7488 set_task_reclaim_state(current, NULL); 7489 7490 /* 7491 * Return the order kswapd stopped reclaiming at as 7492 * prepare_kswapd_sleep() takes it into account. If another caller 7493 * entered the allocator slow path while kswapd was awake, order will 7494 * remain at the higher level. 7495 */ 7496 return sc.order; 7497 } 7498 7499 /* 7500 * The pgdat->kswapd_highest_zoneidx is used to pass the highest zone index to 7501 * be reclaimed by kswapd from the waker. If the value is MAX_NR_ZONES which is 7502 * not a valid index then either kswapd runs for first time or kswapd couldn't 7503 * sleep after previous reclaim attempt (node is still unbalanced). In that 7504 * case return the zone index of the previous kswapd reclaim cycle. 7505 */ 7506 static enum zone_type kswapd_highest_zoneidx(pg_data_t *pgdat, 7507 enum zone_type prev_highest_zoneidx) 7508 { 7509 enum zone_type curr_idx = READ_ONCE(pgdat->kswapd_highest_zoneidx); 7510 7511 return curr_idx == MAX_NR_ZONES ? prev_highest_zoneidx : curr_idx; 7512 } 7513 7514 static void kswapd_try_to_sleep(pg_data_t *pgdat, int alloc_order, int reclaim_order, 7515 unsigned int highest_zoneidx) 7516 { 7517 long remaining = 0; 7518 DEFINE_WAIT(wait); 7519 7520 if (freezing(current) || kthread_should_stop()) 7521 return; 7522 7523 prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE); 7524 7525 /* 7526 * Try to sleep for a short interval. Note that kcompactd will only be 7527 * woken if it is possible to sleep for a short interval. This is 7528 * deliberate on the assumption that if reclaim cannot keep an 7529 * eligible zone balanced that it's also unlikely that compaction will 7530 * succeed. 7531 */ 7532 if (prepare_kswapd_sleep(pgdat, reclaim_order, highest_zoneidx)) { 7533 /* 7534 * Compaction records what page blocks it recently failed to 7535 * isolate pages from and skips them in the future scanning. 7536 * When kswapd is going to sleep, it is reasonable to assume 7537 * that pages and compaction may succeed so reset the cache. 7538 */ 7539 reset_isolation_suitable(pgdat); 7540 7541 /* 7542 * We have freed the memory, now we should compact it to make 7543 * allocation of the requested order possible. 7544 */ 7545 wakeup_kcompactd(pgdat, alloc_order, highest_zoneidx); 7546 7547 remaining = schedule_timeout(HZ/10); 7548 7549 /* 7550 * If woken prematurely then reset kswapd_highest_zoneidx and 7551 * order. The values will either be from a wakeup request or 7552 * the previous request that slept prematurely. 7553 */ 7554 if (remaining) { 7555 WRITE_ONCE(pgdat->kswapd_highest_zoneidx, 7556 kswapd_highest_zoneidx(pgdat, 7557 highest_zoneidx)); 7558 7559 if (READ_ONCE(pgdat->kswapd_order) < reclaim_order) 7560 WRITE_ONCE(pgdat->kswapd_order, reclaim_order); 7561 } 7562 7563 finish_wait(&pgdat->kswapd_wait, &wait); 7564 prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE); 7565 } 7566 7567 /* 7568 * After a short sleep, check if it was a premature sleep. If not, then 7569 * go fully to sleep until explicitly woken up. 7570 */ 7571 if (!remaining && 7572 prepare_kswapd_sleep(pgdat, reclaim_order, highest_zoneidx)) { 7573 trace_mm_vmscan_kswapd_sleep(pgdat->node_id); 7574 7575 /* 7576 * vmstat counters are not perfectly accurate and the estimated 7577 * value for counters such as NR_FREE_PAGES can deviate from the 7578 * true value by nr_online_cpus * threshold. To avoid the zone 7579 * watermarks being breached while under pressure, we reduce the 7580 * per-cpu vmstat threshold while kswapd is awake and restore 7581 * them before going back to sleep. 7582 */ 7583 set_pgdat_percpu_threshold(pgdat, calculate_normal_threshold); 7584 7585 if (!kthread_should_stop()) 7586 schedule(); 7587 7588 set_pgdat_percpu_threshold(pgdat, calculate_pressure_threshold); 7589 } else { 7590 if (remaining) 7591 count_vm_event(KSWAPD_LOW_WMARK_HIT_QUICKLY); 7592 else 7593 count_vm_event(KSWAPD_HIGH_WMARK_HIT_QUICKLY); 7594 } 7595 finish_wait(&pgdat->kswapd_wait, &wait); 7596 } 7597 7598 /* 7599 * The background pageout daemon, started as a kernel thread 7600 * from the init process. 7601 * 7602 * This basically trickles out pages so that we have _some_ 7603 * free memory available even if there is no other activity 7604 * that frees anything up. This is needed for things like routing 7605 * etc, where we otherwise might have all activity going on in 7606 * asynchronous contexts that cannot page things out. 7607 * 7608 * If there are applications that are active memory-allocators 7609 * (most normal use), this basically shouldn't matter. 7610 */ 7611 static int kswapd(void *p) 7612 { 7613 unsigned int alloc_order, reclaim_order; 7614 unsigned int highest_zoneidx = MAX_NR_ZONES - 1; 7615 pg_data_t *pgdat = (pg_data_t *)p; 7616 struct task_struct *tsk = current; 7617 const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id); 7618 7619 if (!cpumask_empty(cpumask)) 7620 set_cpus_allowed_ptr(tsk, cpumask); 7621 7622 /* 7623 * Tell the memory management that we're a "memory allocator", 7624 * and that if we need more memory we should get access to it 7625 * regardless (see "__alloc_pages()"). "kswapd" should 7626 * never get caught in the normal page freeing logic. 7627 * 7628 * (Kswapd normally doesn't need memory anyway, but sometimes 7629 * you need a small amount of memory in order to be able to 7630 * page out something else, and this flag essentially protects 7631 * us from recursively trying to free more memory as we're 7632 * trying to free the first piece of memory in the first place). 7633 */ 7634 tsk->flags |= PF_MEMALLOC | PF_KSWAPD; 7635 set_freezable(); 7636 7637 WRITE_ONCE(pgdat->kswapd_order, 0); 7638 WRITE_ONCE(pgdat->kswapd_highest_zoneidx, MAX_NR_ZONES); 7639 atomic_set(&pgdat->nr_writeback_throttled, 0); 7640 for ( ; ; ) { 7641 bool ret; 7642 7643 alloc_order = reclaim_order = READ_ONCE(pgdat->kswapd_order); 7644 highest_zoneidx = kswapd_highest_zoneidx(pgdat, 7645 highest_zoneidx); 7646 7647 kswapd_try_sleep: 7648 kswapd_try_to_sleep(pgdat, alloc_order, reclaim_order, 7649 highest_zoneidx); 7650 7651 /* Read the new order and highest_zoneidx */ 7652 alloc_order = READ_ONCE(pgdat->kswapd_order); 7653 highest_zoneidx = kswapd_highest_zoneidx(pgdat, 7654 highest_zoneidx); 7655 WRITE_ONCE(pgdat->kswapd_order, 0); 7656 WRITE_ONCE(pgdat->kswapd_highest_zoneidx, MAX_NR_ZONES); 7657 7658 ret = try_to_freeze(); 7659 if (kthread_should_stop()) 7660 break; 7661 7662 /* 7663 * We can speed up thawing tasks if we don't call balance_pgdat 7664 * after returning from the refrigerator 7665 */ 7666 if (ret) 7667 continue; 7668 7669 /* 7670 * Reclaim begins at the requested order but if a high-order 7671 * reclaim fails then kswapd falls back to reclaiming for 7672 * order-0. If that happens, kswapd will consider sleeping 7673 * for the order it finished reclaiming at (reclaim_order) 7674 * but kcompactd is woken to compact for the original 7675 * request (alloc_order). 7676 */ 7677 trace_mm_vmscan_kswapd_wake(pgdat->node_id, highest_zoneidx, 7678 alloc_order); 7679 reclaim_order = balance_pgdat(pgdat, alloc_order, 7680 highest_zoneidx); 7681 if (reclaim_order < alloc_order) 7682 goto kswapd_try_sleep; 7683 } 7684 7685 tsk->flags &= ~(PF_MEMALLOC | PF_KSWAPD); 7686 7687 return 0; 7688 } 7689 7690 /* 7691 * A zone is low on free memory or too fragmented for high-order memory. If 7692 * kswapd should reclaim (direct reclaim is deferred), wake it up for the zone's 7693 * pgdat. It will wake up kcompactd after reclaiming memory. If kswapd reclaim 7694 * has failed or is not needed, still wake up kcompactd if only compaction is 7695 * needed. 7696 */ 7697 void wakeup_kswapd(struct zone *zone, gfp_t gfp_flags, int order, 7698 enum zone_type highest_zoneidx) 7699 { 7700 pg_data_t *pgdat; 7701 enum zone_type curr_idx; 7702 7703 if (!managed_zone(zone)) 7704 return; 7705 7706 if (!cpuset_zone_allowed(zone, gfp_flags)) 7707 return; 7708 7709 pgdat = zone->zone_pgdat; 7710 curr_idx = READ_ONCE(pgdat->kswapd_highest_zoneidx); 7711 7712 if (curr_idx == MAX_NR_ZONES || curr_idx < highest_zoneidx) 7713 WRITE_ONCE(pgdat->kswapd_highest_zoneidx, highest_zoneidx); 7714 7715 if (READ_ONCE(pgdat->kswapd_order) < order) 7716 WRITE_ONCE(pgdat->kswapd_order, order); 7717 7718 if (!waitqueue_active(&pgdat->kswapd_wait)) 7719 return; 7720 7721 /* Hopeless node, leave it to direct reclaim if possible */ 7722 if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES || 7723 (pgdat_balanced(pgdat, order, highest_zoneidx) && 7724 !pgdat_watermark_boosted(pgdat, highest_zoneidx))) { 7725 /* 7726 * There may be plenty of free memory available, but it's too 7727 * fragmented for high-order allocations. Wake up kcompactd 7728 * and rely on compaction_suitable() to determine if it's 7729 * needed. If it fails, it will defer subsequent attempts to 7730 * ratelimit its work. 7731 */ 7732 if (!(gfp_flags & __GFP_DIRECT_RECLAIM)) 7733 wakeup_kcompactd(pgdat, order, highest_zoneidx); 7734 return; 7735 } 7736 7737 trace_mm_vmscan_wakeup_kswapd(pgdat->node_id, highest_zoneidx, order, 7738 gfp_flags); 7739 wake_up_interruptible(&pgdat->kswapd_wait); 7740 } 7741 7742 #ifdef CONFIG_HIBERNATION 7743 /* 7744 * Try to free `nr_to_reclaim' of memory, system-wide, and return the number of 7745 * freed pages. 7746 * 7747 * Rather than trying to age LRUs the aim is to preserve the overall 7748 * LRU order by reclaiming preferentially 7749 * inactive > active > active referenced > active mapped 7750 */ 7751 unsigned long shrink_all_memory(unsigned long nr_to_reclaim) 7752 { 7753 struct scan_control sc = { 7754 .nr_to_reclaim = nr_to_reclaim, 7755 .gfp_mask = GFP_HIGHUSER_MOVABLE, 7756 .reclaim_idx = MAX_NR_ZONES - 1, 7757 .priority = DEF_PRIORITY, 7758 .may_writepage = 1, 7759 .may_unmap = 1, 7760 .may_swap = 1, 7761 .hibernation_mode = 1, 7762 }; 7763 struct zonelist *zonelist = node_zonelist(numa_node_id(), sc.gfp_mask); 7764 unsigned long nr_reclaimed; 7765 unsigned int noreclaim_flag; 7766 7767 fs_reclaim_acquire(sc.gfp_mask); 7768 noreclaim_flag = memalloc_noreclaim_save(); 7769 set_task_reclaim_state(current, &sc.reclaim_state); 7770 7771 nr_reclaimed = do_try_to_free_pages(zonelist, &sc); 7772 7773 set_task_reclaim_state(current, NULL); 7774 memalloc_noreclaim_restore(noreclaim_flag); 7775 fs_reclaim_release(sc.gfp_mask); 7776 7777 return nr_reclaimed; 7778 } 7779 #endif /* CONFIG_HIBERNATION */ 7780 7781 /* 7782 * This kswapd start function will be called by init and node-hot-add. 7783 */ 7784 void kswapd_run(int nid) 7785 { 7786 pg_data_t *pgdat = NODE_DATA(nid); 7787 7788 pgdat_kswapd_lock(pgdat); 7789 if (!pgdat->kswapd) { 7790 pgdat->kswapd = kthread_run(kswapd, pgdat, "kswapd%d", nid); 7791 if (IS_ERR(pgdat->kswapd)) { 7792 /* failure at boot is fatal */ 7793 BUG_ON(system_state < SYSTEM_RUNNING); 7794 pr_err("Failed to start kswapd on node %d\n", nid); 7795 pgdat->kswapd = NULL; 7796 } 7797 } 7798 pgdat_kswapd_unlock(pgdat); 7799 } 7800 7801 /* 7802 * Called by memory hotplug when all memory in a node is offlined. Caller must 7803 * be holding mem_hotplug_begin/done(). 7804 */ 7805 void kswapd_stop(int nid) 7806 { 7807 pg_data_t *pgdat = NODE_DATA(nid); 7808 struct task_struct *kswapd; 7809 7810 pgdat_kswapd_lock(pgdat); 7811 kswapd = pgdat->kswapd; 7812 if (kswapd) { 7813 kthread_stop(kswapd); 7814 pgdat->kswapd = NULL; 7815 } 7816 pgdat_kswapd_unlock(pgdat); 7817 } 7818 7819 static int __init kswapd_init(void) 7820 { 7821 int nid; 7822 7823 swap_setup(); 7824 for_each_node_state(nid, N_MEMORY) 7825 kswapd_run(nid); 7826 return 0; 7827 } 7828 7829 module_init(kswapd_init) 7830 7831 #ifdef CONFIG_NUMA 7832 /* 7833 * Node reclaim mode 7834 * 7835 * If non-zero call node_reclaim when the number of free pages falls below 7836 * the watermarks. 7837 */ 7838 int node_reclaim_mode __read_mostly; 7839 7840 /* 7841 * Priority for NODE_RECLAIM. This determines the fraction of pages 7842 * of a node considered for each zone_reclaim. 4 scans 1/16th of 7843 * a zone. 7844 */ 7845 #define NODE_RECLAIM_PRIORITY 4 7846 7847 /* 7848 * Percentage of pages in a zone that must be unmapped for node_reclaim to 7849 * occur. 7850 */ 7851 int sysctl_min_unmapped_ratio = 1; 7852 7853 /* 7854 * If the number of slab pages in a zone grows beyond this percentage then 7855 * slab reclaim needs to occur. 7856 */ 7857 int sysctl_min_slab_ratio = 5; 7858 7859 static inline unsigned long node_unmapped_file_pages(struct pglist_data *pgdat) 7860 { 7861 unsigned long file_mapped = node_page_state(pgdat, NR_FILE_MAPPED); 7862 unsigned long file_lru = node_page_state(pgdat, NR_INACTIVE_FILE) + 7863 node_page_state(pgdat, NR_ACTIVE_FILE); 7864 7865 /* 7866 * It's possible for there to be more file mapped pages than 7867 * accounted for by the pages on the file LRU lists because 7868 * tmpfs pages accounted for as ANON can also be FILE_MAPPED 7869 */ 7870 return (file_lru > file_mapped) ? (file_lru - file_mapped) : 0; 7871 } 7872 7873 /* Work out how many page cache pages we can reclaim in this reclaim_mode */ 7874 static unsigned long node_pagecache_reclaimable(struct pglist_data *pgdat) 7875 { 7876 unsigned long nr_pagecache_reclaimable; 7877 unsigned long delta = 0; 7878 7879 /* 7880 * If RECLAIM_UNMAP is set, then all file pages are considered 7881 * potentially reclaimable. Otherwise, we have to worry about 7882 * pages like swapcache and node_unmapped_file_pages() provides 7883 * a better estimate 7884 */ 7885 if (node_reclaim_mode & RECLAIM_UNMAP) 7886 nr_pagecache_reclaimable = node_page_state(pgdat, NR_FILE_PAGES); 7887 else 7888 nr_pagecache_reclaimable = node_unmapped_file_pages(pgdat); 7889 7890 /* If we can't clean pages, remove dirty pages from consideration */ 7891 if (!(node_reclaim_mode & RECLAIM_WRITE)) 7892 delta += node_page_state(pgdat, NR_FILE_DIRTY); 7893 7894 /* Watch for any possible underflows due to delta */ 7895 if (unlikely(delta > nr_pagecache_reclaimable)) 7896 delta = nr_pagecache_reclaimable; 7897 7898 return nr_pagecache_reclaimable - delta; 7899 } 7900 7901 /* 7902 * Try to free up some pages from this node through reclaim. 7903 */ 7904 static int __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned int order) 7905 { 7906 /* Minimum pages needed in order to stay on node */ 7907 const unsigned long nr_pages = 1 << order; 7908 struct task_struct *p = current; 7909 unsigned int noreclaim_flag; 7910 struct scan_control sc = { 7911 .nr_to_reclaim = max(nr_pages, SWAP_CLUSTER_MAX), 7912 .gfp_mask = current_gfp_context(gfp_mask), 7913 .order = order, 7914 .priority = NODE_RECLAIM_PRIORITY, 7915 .may_writepage = !!(node_reclaim_mode & RECLAIM_WRITE), 7916 .may_unmap = !!(node_reclaim_mode & RECLAIM_UNMAP), 7917 .may_swap = 1, 7918 .reclaim_idx = gfp_zone(gfp_mask), 7919 }; 7920 unsigned long pflags; 7921 7922 trace_mm_vmscan_node_reclaim_begin(pgdat->node_id, order, 7923 sc.gfp_mask); 7924 7925 cond_resched(); 7926 psi_memstall_enter(&pflags); 7927 fs_reclaim_acquire(sc.gfp_mask); 7928 /* 7929 * We need to be able to allocate from the reserves for RECLAIM_UNMAP 7930 */ 7931 noreclaim_flag = memalloc_noreclaim_save(); 7932 set_task_reclaim_state(p, &sc.reclaim_state); 7933 7934 if (node_pagecache_reclaimable(pgdat) > pgdat->min_unmapped_pages || 7935 node_page_state_pages(pgdat, NR_SLAB_RECLAIMABLE_B) > pgdat->min_slab_pages) { 7936 /* 7937 * Free memory by calling shrink node with increasing 7938 * priorities until we have enough memory freed. 7939 */ 7940 do { 7941 shrink_node(pgdat, &sc); 7942 } while (sc.nr_reclaimed < nr_pages && --sc.priority >= 0); 7943 } 7944 7945 set_task_reclaim_state(p, NULL); 7946 memalloc_noreclaim_restore(noreclaim_flag); 7947 fs_reclaim_release(sc.gfp_mask); 7948 psi_memstall_leave(&pflags); 7949 7950 trace_mm_vmscan_node_reclaim_end(sc.nr_reclaimed); 7951 7952 return sc.nr_reclaimed >= nr_pages; 7953 } 7954 7955 int node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned int order) 7956 { 7957 int ret; 7958 7959 /* 7960 * Node reclaim reclaims unmapped file backed pages and 7961 * slab pages if we are over the defined limits. 7962 * 7963 * A small portion of unmapped file backed pages is needed for 7964 * file I/O otherwise pages read by file I/O will be immediately 7965 * thrown out if the node is overallocated. So we do not reclaim 7966 * if less than a specified percentage of the node is used by 7967 * unmapped file backed pages. 7968 */ 7969 if (node_pagecache_reclaimable(pgdat) <= pgdat->min_unmapped_pages && 7970 node_page_state_pages(pgdat, NR_SLAB_RECLAIMABLE_B) <= 7971 pgdat->min_slab_pages) 7972 return NODE_RECLAIM_FULL; 7973 7974 /* 7975 * Do not scan if the allocation should not be delayed. 7976 */ 7977 if (!gfpflags_allow_blocking(gfp_mask) || (current->flags & PF_MEMALLOC)) 7978 return NODE_RECLAIM_NOSCAN; 7979 7980 /* 7981 * Only run node reclaim on the local node or on nodes that do not 7982 * have associated processors. This will favor the local processor 7983 * over remote processors and spread off node memory allocations 7984 * as wide as possible. 7985 */ 7986 if (node_state(pgdat->node_id, N_CPU) && pgdat->node_id != numa_node_id()) 7987 return NODE_RECLAIM_NOSCAN; 7988 7989 if (test_and_set_bit(PGDAT_RECLAIM_LOCKED, &pgdat->flags)) 7990 return NODE_RECLAIM_NOSCAN; 7991 7992 ret = __node_reclaim(pgdat, gfp_mask, order); 7993 clear_bit(PGDAT_RECLAIM_LOCKED, &pgdat->flags); 7994 7995 if (!ret) 7996 count_vm_event(PGSCAN_ZONE_RECLAIM_FAILED); 7997 7998 return ret; 7999 } 8000 #endif 8001 8002 void check_move_unevictable_pages(struct pagevec *pvec) 8003 { 8004 struct folio_batch fbatch; 8005 unsigned i; 8006 8007 folio_batch_init(&fbatch); 8008 for (i = 0; i < pvec->nr; i++) { 8009 struct page *page = pvec->pages[i]; 8010 8011 if (PageTransTail(page)) 8012 continue; 8013 folio_batch_add(&fbatch, page_folio(page)); 8014 } 8015 check_move_unevictable_folios(&fbatch); 8016 } 8017 EXPORT_SYMBOL_GPL(check_move_unevictable_pages); 8018 8019 /** 8020 * check_move_unevictable_folios - Move evictable folios to appropriate zone 8021 * lru list 8022 * @fbatch: Batch of lru folios to check. 8023 * 8024 * Checks folios for evictability, if an evictable folio is in the unevictable 8025 * lru list, moves it to the appropriate evictable lru list. This function 8026 * should be only used for lru folios. 8027 */ 8028 void check_move_unevictable_folios(struct folio_batch *fbatch) 8029 { 8030 struct lruvec *lruvec = NULL; 8031 int pgscanned = 0; 8032 int pgrescued = 0; 8033 int i; 8034 8035 for (i = 0; i < fbatch->nr; i++) { 8036 struct folio *folio = fbatch->folios[i]; 8037 int nr_pages = folio_nr_pages(folio); 8038 8039 pgscanned += nr_pages; 8040 8041 /* block memcg migration while the folio moves between lrus */ 8042 if (!folio_test_clear_lru(folio)) 8043 continue; 8044 8045 lruvec = folio_lruvec_relock_irq(folio, lruvec); 8046 if (folio_evictable(folio) && folio_test_unevictable(folio)) { 8047 lruvec_del_folio(lruvec, folio); 8048 folio_clear_unevictable(folio); 8049 lruvec_add_folio(lruvec, folio); 8050 pgrescued += nr_pages; 8051 } 8052 folio_set_lru(folio); 8053 } 8054 8055 if (lruvec) { 8056 __count_vm_events(UNEVICTABLE_PGRESCUED, pgrescued); 8057 __count_vm_events(UNEVICTABLE_PGSCANNED, pgscanned); 8058 unlock_page_lruvec_irq(lruvec); 8059 } else if (pgscanned) { 8060 count_vm_events(UNEVICTABLE_PGSCANNED, pgscanned); 8061 } 8062 } 8063 EXPORT_SYMBOL_GPL(check_move_unevictable_folios); 8064