--- vmscan.c (a579086c99ed70cc4bfc104348dbe3dd8f2787e6)
+++ vmscan.c (7348cc91821b0cb24dfb00e578047f68299a50ab)
 // SPDX-License-Identifier: GPL-2.0
 /*
  * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds
  *
  * Swap reorganised 29.12.95, Stephen Tweedie.
  * kswapd added: 7.1.96 sct
  * Removed kswapd_ctl limits, and swap out as many pages as needed
  * to bring the system back to freepages.high: 2.4.97, Rik van Riel.

--- 123 unchanged lines hidden ---

 	/* The file folios on the current node are dangerously low */
 	unsigned int file_is_tiny:1;

 	/* Always discard instead of demoting to lower tier memory */
 	unsigned int no_demotion:1;

 #ifdef CONFIG_LRU_GEN
 	/* help kswapd make better choices among multiple memcgs */
-	unsigned int memcgs_need_aging:1;
 	unsigned long last_reclaimed;
 #endif

 	/* Allocation order */
 	s8 order;

 	/* Scan (total_size >> priority) pages at once */
 	s8 priority;

--- 4314 unchanged lines hidden ---

 	inc_max_seq(lruvec, can_swap, force_scan);
 	/* either this sees any waiters or they will see updated max_seq */
 	if (wq_has_sleeper(&lruvec->mm_state.wait))
 		wake_up_all(&lruvec->mm_state.wait);

 	return true;
 }

-static bool should_run_aging(struct lruvec *lruvec, unsigned long max_seq, unsigned long *min_seq,
+static bool should_run_aging(struct lruvec *lruvec, unsigned long max_seq,
 			     struct scan_control *sc, bool can_swap, unsigned long *nr_to_scan)
 {
 	int gen, type, zone;
 	unsigned long old = 0;
 	unsigned long young = 0;
 	unsigned long total = 0;
 	struct lru_gen_folio *lrugen = &lruvec->lrugen;
 	struct mem_cgroup *memcg = lruvec_memcg(lruvec);
+	DEFINE_MIN_SEQ(lruvec);

+	/* whether this lruvec is completely out of cold folios */
+	if (min_seq[!can_swap] + MIN_NR_GENS > max_seq) {
+		*nr_to_scan = 0;
+		return true;
+	}
+
 	for (type = !can_swap; type < ANON_AND_FILE; type++) {
 		unsigned long seq;

 		for (seq = min_seq[type]; seq <= max_seq; seq++) {
 			unsigned long size = 0;

 			gen = lru_gen_from_seq(seq);

--- 11 unchanged lines hidden ---

 	/* try to scrape all its memory if this memcg was deleted */
 	*nr_to_scan = mem_cgroup_online(memcg) ? (total >> sc->priority) : total;

 	/*
 	 * The aging tries to be lazy to reduce the overhead, while the eviction
 	 * stalls when the number of generations reaches MIN_NR_GENS. Hence, the
 	 * ideal number of generations is MIN_NR_GENS+1.
 	 */
-	if (min_seq[!can_swap] + MIN_NR_GENS > max_seq)
-		return true;
 	if (min_seq[!can_swap] + MIN_NR_GENS < max_seq)
 		return false;

 	/*
 	 * It's also ideal to spread pages out evenly, i.e., 1/(MIN_NR_GENS+1)
 	 * of the total number of pages for each generation. A reasonable range
 	 * for this average portion is [1/MIN_NR_GENS, 1/(MIN_NR_GENS+2)]. The
 	 * aging cares about the upper bound of hot pages, while the eviction
 	 * cares about the lower bound of cold pages.
 	 */
 	if (young * MIN_NR_GENS > total)
 		return true;
 	if (old * (MIN_NR_GENS + 2) < total)
 		return true;

 	return false;
 }

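The balance rule in should_run_aging() above is plain arithmetic: with exactly MIN_NR_GENS+1 generations present, aging is requested when the youngest generation holds more than 1/MIN_NR_GENS of the pages, or the oldest holds less than 1/(MIN_NR_GENS+2). Below is a minimal userspace sketch of that decision, assuming MIN_NR_GENS == 2 (its value in this tree); the function name and the sample numbers are illustrative, not kernel code.

#include <stdbool.h>
#include <stdio.h>

#define MIN_NR_GENS 2UL	/* assumed value, matching the kernel's constant */

/* Decide whether aging should run, given one lruvec's generation counts:
 * young/old are the pages accumulated for the youngest/oldest generations
 * in the loop elided above, total is all pages across generations. */
static bool aging_needed(unsigned long min_seq, unsigned long max_seq,
			 unsigned long young, unsigned long old,
			 unsigned long total)
{
	/* fewer than MIN_NR_GENS+1 generations: out of cold folios, must age */
	if (min_seq + MIN_NR_GENS > max_seq)
		return true;
	/* more than MIN_NR_GENS+1 generations: eviction alone is enough */
	if (min_seq + MIN_NR_GENS < max_seq)
		return false;
	/* exactly MIN_NR_GENS+1: age when the spread falls out of range */
	if (young * MIN_NR_GENS > total)	/* hot pages above 1/MIN_NR_GENS */
		return true;
	if (old * (MIN_NR_GENS + 2) < total)	/* cold pages below 1/(MIN_NR_GENS+2) */
		return true;
	return false;
}

int main(void)
{
	/* 3 generations, 4000 pages: 2500 young, 500 old -> too hot, age */
	printf("%d\n", aging_needed(5, 7, 2500, 500, 4000));	/* prints 1 */
	/* 1500 young, 1200 old -> spread is acceptable, no aging */
	printf("%d\n", aging_needed(5, 7, 1500, 1200, 4000));	/* prints 0 */
	return 0;
}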
-static bool age_lruvec(struct lruvec *lruvec, struct scan_control *sc, unsigned long min_ttl)
-{
-	bool need_aging;
-	unsigned long nr_to_scan;
-	int swappiness = get_swappiness(lruvec, sc);
-	struct mem_cgroup *memcg = lruvec_memcg(lruvec);
-	DEFINE_MAX_SEQ(lruvec);
-	DEFINE_MIN_SEQ(lruvec);
-
-	VM_WARN_ON_ONCE(sc->memcg_low_reclaim);
-
-	mem_cgroup_calculate_protection(NULL, memcg);
-
-	if (mem_cgroup_below_min(NULL, memcg))
-		return false;
-
-	need_aging = should_run_aging(lruvec, max_seq, min_seq, sc, swappiness, &nr_to_scan);
-
-	if (min_ttl) {
-		int gen = lru_gen_from_seq(min_seq[LRU_GEN_FILE]);
-		unsigned long birth = READ_ONCE(lruvec->lrugen.timestamps[gen]);
-
-		if (time_is_after_jiffies(birth + min_ttl))
-			return false;
-
-		/* the size is likely too small to be helpful */
-		if (!nr_to_scan && sc->priority != DEF_PRIORITY)
-			return false;
-	}
-
-	if (need_aging)
-		try_to_inc_max_seq(lruvec, max_seq, sc, swappiness, false);
-
-	return true;
-}
+static bool lruvec_is_sizable(struct lruvec *lruvec, struct scan_control *sc)
+{
+	int gen, type, zone;
+	unsigned long total = 0;
+	bool can_swap = get_swappiness(lruvec, sc);
+	struct lru_gen_folio *lrugen = &lruvec->lrugen;
+	struct mem_cgroup *memcg = lruvec_memcg(lruvec);
+	DEFINE_MAX_SEQ(lruvec);
+	DEFINE_MIN_SEQ(lruvec);
+
+	for (type = !can_swap; type < ANON_AND_FILE; type++) {
+		unsigned long seq;
+
+		for (seq = min_seq[type]; seq <= max_seq; seq++) {
+			gen = lru_gen_from_seq(seq);
+
+			for (zone = 0; zone < MAX_NR_ZONES; zone++)
+				total += max(READ_ONCE(lrugen->nr_pages[gen][type][zone]), 0L);
+		}
+	}
+
+	/* whether the size is big enough to be helpful */
+	return mem_cgroup_online(memcg) ? (total >> sc->priority) : total;
+}
+
+static bool lruvec_is_reclaimable(struct lruvec *lruvec, struct scan_control *sc,
+				  unsigned long min_ttl)
+{
+	int gen;
+	unsigned long birth;
+	struct mem_cgroup *memcg = lruvec_memcg(lruvec);
+	DEFINE_MIN_SEQ(lruvec);
+
+	VM_WARN_ON_ONCE(sc->memcg_low_reclaim);
+
+	/* see the comment on lru_gen_folio */
+	gen = lru_gen_from_seq(min_seq[LRU_GEN_FILE]);
+	birth = READ_ONCE(lruvec->lrugen.timestamps[gen]);
+
+	if (time_is_after_jiffies(birth + min_ttl))
+		return false;
+
+	if (!lruvec_is_sizable(lruvec, sc))
+		return false;
+
+	mem_cgroup_calculate_protection(NULL, memcg);
+
+	return !mem_cgroup_below_min(NULL, memcg);
+}

 /* to protect the working set of the last N jiffies */
 static unsigned long lru_gen_min_ttl __read_mostly;

 static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc)
 {
 	struct mem_cgroup *memcg;
-	bool success = false;
 	unsigned long min_ttl = READ_ONCE(lru_gen_min_ttl);

 	VM_WARN_ON_ONCE(!current_is_kswapd());

 	sc->last_reclaimed = sc->nr_reclaimed;

-	/*
-	 * To reduce the chance of going into the aging path, which can be
-	 * costly, optimistically skip it if the flag below was cleared in the
-	 * eviction path. This improves the overall performance when multiple
-	 * memcgs are available.
-	 */
-	if (!sc->memcgs_need_aging) {
-		sc->memcgs_need_aging = true;
+	/* check the order to exclude compaction-induced reclaim */
+	if (!min_ttl || sc->order || sc->priority == DEF_PRIORITY)
 		return;
-	}
-
-	set_mm_walk(pgdat);

 	memcg = mem_cgroup_iter(NULL, NULL, NULL);
 	do {
 		struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat);

-		if (age_lruvec(lruvec, sc, min_ttl))
-			success = true;
+		if (lruvec_is_reclaimable(lruvec, sc, min_ttl)) {
+			mem_cgroup_iter_break(NULL, memcg);
+			return;
+		}

 		cond_resched();
 	} while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)));

-	clear_mm_walk();
-
-	/* check the order to exclude compaction-induced reclaim */
-	if (success || !min_ttl || sc->order)
-		return;
-
 	/*
 	 * The main goal is to OOM kill if every generation from all memcgs is
 	 * younger than min_ttl. However, another possibility is all memcgs are
-	 * either below min or empty.
+	 * either too small or below min.
 	 */
 	if (mutex_trylock(&oom_lock)) {
 		struct oom_control oc = {
 			.gfp_mask = sc->gfp_mask,
 		};

 		out_of_memory(&oc);

--- 491 unchanged lines hidden ---
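In the hunk above, kswapd's per-lruvec checks move into lruvec_is_reclaimable(), gated on lru_gen_min_ttl. The test time_is_after_jiffies(birth + min_ttl) reads as "the oldest file generation, created at birth, is still younger than min_ttl, so leave this lruvec alone". A small wrap-safe userspace sketch of that comparison follows; the helper names are local to the sketch, and time_after() mirrors the semantics of the kernel macro of the same name (time_is_after_jiffies(a) is time_after(a, jiffies)).

#include <stdbool.h>
#include <stdio.h>

/* wrap-safe "is a later than b?", as the kernel's time_after() computes it */
static bool time_after(unsigned long a, unsigned long b)
{
	return (long)(b - a) < 0;
}

/* the oldest file generation was created at @birth (in jiffies); it is
 * still protected if it is younger than @min_ttl */
static bool working_set_protected(unsigned long jiffies, unsigned long birth,
				  unsigned long min_ttl)
{
	return time_after(birth + min_ttl, jiffies);
}

int main(void)
{
	/* birth 1000, min_ttl 250: protected until jiffies passes 1250 */
	printf("%d %d\n", working_set_protected(1200, 1000, 250),
	       working_set_protected(1300, 1000, 250));	/* prints "1 0" */
	return 0;
}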

 }

 /*
  * For future optimizations:
  * 1. Defer try_to_inc_max_seq() to workqueues to reduce latency for memcg
  *    reclaim.
  */
 static unsigned long get_nr_to_scan(struct lruvec *lruvec, struct scan_control *sc,
-				    bool can_swap, bool *need_aging)
+				    bool can_swap)
 {
 	unsigned long nr_to_scan;
 	struct mem_cgroup *memcg = lruvec_memcg(lruvec);
 	DEFINE_MAX_SEQ(lruvec);
-	DEFINE_MIN_SEQ(lruvec);

 	if (mem_cgroup_below_min(sc->target_mem_cgroup, memcg) ||
 	    (mem_cgroup_below_low(sc->target_mem_cgroup, memcg) &&
 	     !sc->memcg_low_reclaim))
 		return 0;

-	*need_aging = should_run_aging(lruvec, max_seq, min_seq, sc, can_swap, &nr_to_scan);
-	if (!*need_aging)
+	if (!should_run_aging(lruvec, max_seq, sc, can_swap, &nr_to_scan))
 		return nr_to_scan;

 	/* skip the aging path at the default priority */
 	if (sc->priority == DEF_PRIORITY)
-		goto done;
+		return nr_to_scan;

-	/* leave the work to lru_gen_age_node() */
-	if (current_is_kswapd())
-		return 0;
+	try_to_inc_max_seq(lruvec, max_seq, sc, can_swap, false);

-	if (try_to_inc_max_seq(lruvec, max_seq, sc, can_swap, false))
-		return nr_to_scan;
-done:
-	return min_seq[!can_swap] + MIN_NR_GENS <= max_seq ? nr_to_scan : 0;
+	/* skip this lruvec as it's low on cold folios */
+	return 0;
 }

 static unsigned long get_nr_to_reclaim(struct scan_control *sc)
 {
 	/* don't abort memcg reclaim to ensure fairness */
 	if (!global_reclaim(sc))
 		return -1;

 	/* discount the previous progress for kswapd */
 	if (current_is_kswapd())
 		return sc->nr_to_reclaim + sc->last_reclaimed;

 	return max(sc->nr_to_reclaim, compact_gap(sc->order));
 }

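get_nr_to_reclaim() above relies on unsigned wrap-around: returning -1 from a function of type unsigned long yields ULONG_MAX, so the "sc->nr_reclaimed >= nr_to_reclaim" cutoff in the eviction loop below effectively never fires for memcg (non-global) reclaim. A standalone sketch with illustrative names and numbers; the compact_gap() term for direct reclaim is left out.

#include <limits.h>
#include <stdio.h>

static unsigned long nr_to_reclaim_demo(int global_reclaim, int is_kswapd,
					unsigned long nr_to_reclaim,
					unsigned long last_reclaimed)
{
	if (!global_reclaim)
		return -1;	/* == ULONG_MAX: memcg reclaim is never cut short */
	if (is_kswapd)
		return nr_to_reclaim + last_reclaimed;	/* discount prior progress */
	return nr_to_reclaim;
}

int main(void)
{
	/* memcg reclaim: the target degenerates to "no limit" */
	printf("%d\n", nr_to_reclaim_demo(0, 0, 32, 0) == ULONG_MAX);	/* prints 1 */
	/* kswapd already reclaimed 96 pages before this pass: aim for 32 more */
	printf("%lu\n", nr_to_reclaim_demo(1, 1, 32, 96));		/* prints 128 */
	return 0;
}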
 static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
 {
 	struct blk_plug plug;
-	bool need_aging = false;
 	unsigned long scanned = 0;
-	unsigned long reclaimed = sc->nr_reclaimed;
 	unsigned long nr_to_reclaim = get_nr_to_reclaim(sc);

 	lru_add_drain();

 	blk_start_plug(&plug);

 	set_mm_walk(lruvec_pgdat(lruvec));

--- 4 unchanged lines hidden ---

 		if (sc->may_swap)
 			swappiness = get_swappiness(lruvec, sc);
 		else if (!cgroup_reclaim(sc) && get_swappiness(lruvec, sc))
 			swappiness = 1;
 		else
 			swappiness = 0;

-		nr_to_scan = get_nr_to_scan(lruvec, sc, swappiness, &need_aging);
+		nr_to_scan = get_nr_to_scan(lruvec, sc, swappiness);
 		if (!nr_to_scan)
-			goto done;
+			break;

 		delta = evict_folios(lruvec, sc, swappiness);
 		if (!delta)
-			goto done;
+			break;

 		scanned += delta;
 		if (scanned >= nr_to_scan)
 			break;

 		if (sc->nr_reclaimed >= nr_to_reclaim)
 			break;

 		cond_resched();
 	}

-	/* see the comment in lru_gen_age_node() */
-	if (sc->nr_reclaimed - reclaimed >= MIN_LRU_BATCH && !need_aging)
-		sc->memcgs_need_aging = false;
-done:
 	clear_mm_walk();

 	blk_finish_plug(&plug);
 }

 /******************************************************************************
  *                          state change
  ******************************************************************************/

--- 2544 unchanged lines hidden ---