vmscan.c: comparing 018ee47f14893d500131dfca2ff9f3ff8ebd4ed2 (old) with bd74fdaea146029e4fa12c6de89adbe0779348a9 (new)
1// SPDX-License-Identifier: GPL-2.0
2/*
3 * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds
4 *
5 * Swap reorganised 29.12.95, Stephen Tweedie.
6 * kswapd added: 7.1.96 sct
7 * Removed kswapd_ctl limits, and swap out as many pages as needed
8 * to bring the system back to freepages.high: 2.4.97, Rik van Riel.

--- 35 unchanged lines hidden (view full) ---

44#include <linux/delayacct.h>
45#include <linux/sysctl.h>
46#include <linux/oom.h>
47#include <linux/pagevec.h>
48#include <linux/prefetch.h>
49#include <linux/printk.h>
50#include <linux/dax.h>
51#include <linux/psi.h>
52#include <linux/pagewalk.h>
53#include <linux/shmem_fs.h>
54
55#include <asm/tlbflush.h>
56#include <asm/div64.h>
57
58#include <linux/swapops.h>
59#include <linux/balloon_compaction.h>
60#include <linux/sched/sysctl.h>
61

--- 3017 unchanged lines hidden (view full) ---

3079 READ_ONCE((lruvec)->lrugen.min_seq[LRU_GEN_FILE]), \
3080 }
3081
3082#define for_each_gen_type_zone(gen, type, zone) \
3083 for ((gen) = 0; (gen) < MAX_NR_GENS; (gen)++) \
3084 for ((type) = 0; (type) < ANON_AND_FILE; (type)++) \
3085 for ((zone) = 0; (zone) < MAX_NR_ZONES; (zone)++)
3086
3085static struct lruvec __maybe_unused *get_lruvec(struct mem_cgroup *memcg, int nid)
3087static struct lruvec *get_lruvec(struct mem_cgroup *memcg, int nid)
3088{
3089 struct pglist_data *pgdat = NODE_DATA(nid);
3090
3091#ifdef CONFIG_MEMCG
3092 if (memcg) {
3093 struct lruvec *lruvec = &memcg->nodeinfo[nid]->lruvec;
3094
3095 /* for hotadd_new_pgdat() */

--- 29 unchanged lines hidden (view full) ---

3125{
3126 /* see the comment on lru_gen_struct */
3127 return get_nr_gens(lruvec, LRU_GEN_FILE) >= MIN_NR_GENS &&
3128 get_nr_gens(lruvec, LRU_GEN_FILE) <= get_nr_gens(lruvec, LRU_GEN_ANON) &&
3129 get_nr_gens(lruvec, LRU_GEN_ANON) <= MAX_NR_GENS;
3130}
3131
3132/******************************************************************************
3133 * mm_struct list
3134 ******************************************************************************/
3135
3136static struct lru_gen_mm_list *get_mm_list(struct mem_cgroup *memcg)
3137{
3138 static struct lru_gen_mm_list mm_list = {
3139 .fifo = LIST_HEAD_INIT(mm_list.fifo),
3140 .lock = __SPIN_LOCK_UNLOCKED(mm_list.lock),
3141 };
3142
3143#ifdef CONFIG_MEMCG
3144 if (memcg)
3145 return &memcg->mm_list;
3146#endif
3147 VM_WARN_ON_ONCE(!mem_cgroup_disabled());
3148
3149 return &mm_list;
3150}
3151
3152void lru_gen_add_mm(struct mm_struct *mm)
3153{
3154 int nid;
3155 struct mem_cgroup *memcg = get_mem_cgroup_from_mm(mm);
3156 struct lru_gen_mm_list *mm_list = get_mm_list(memcg);
3157
3158 VM_WARN_ON_ONCE(!list_empty(&mm->lru_gen.list));
3159#ifdef CONFIG_MEMCG
3160 VM_WARN_ON_ONCE(mm->lru_gen.memcg);
3161 mm->lru_gen.memcg = memcg;
3162#endif
3163 spin_lock(&mm_list->lock);
3164
3165 for_each_node_state(nid, N_MEMORY) {
3166 struct lruvec *lruvec = get_lruvec(memcg, nid);
3167
3168 if (!lruvec)
3169 continue;
3170
3171 /* the first addition since the last iteration */
3172 if (lruvec->mm_state.tail == &mm_list->fifo)
3173 lruvec->mm_state.tail = &mm->lru_gen.list;
3174 }
3175
3176 list_add_tail(&mm->lru_gen.list, &mm_list->fifo);
3177
3178 spin_unlock(&mm_list->lock);
3179}
3180
3181void lru_gen_del_mm(struct mm_struct *mm)
3182{
3183 int nid;
3184 struct lru_gen_mm_list *mm_list;
3185 struct mem_cgroup *memcg = NULL;
3186
3187 if (list_empty(&mm->lru_gen.list))
3188 return;
3189
3190#ifdef CONFIG_MEMCG
3191 memcg = mm->lru_gen.memcg;
3192#endif
3193 mm_list = get_mm_list(memcg);
3194
3195 spin_lock(&mm_list->lock);
3196
3197 for_each_node(nid) {
3198 struct lruvec *lruvec = get_lruvec(memcg, nid);
3199
3200 if (!lruvec)
3201 continue;
3202
3203 /* where the last iteration ended (exclusive) */
3204 if (lruvec->mm_state.tail == &mm->lru_gen.list)
3205 lruvec->mm_state.tail = lruvec->mm_state.tail->next;
3206
3207 /* where the current iteration continues (inclusive) */
3208 if (lruvec->mm_state.head != &mm->lru_gen.list)
3209 continue;
3210
3211 lruvec->mm_state.head = lruvec->mm_state.head->next;
3212 /* the deletion ends the current iteration */
3213 if (lruvec->mm_state.head == &mm_list->fifo)
3214 WRITE_ONCE(lruvec->mm_state.seq, lruvec->mm_state.seq + 1);
3215 }
3216
3217 list_del_init(&mm->lru_gen.list);
3218
3219 spin_unlock(&mm_list->lock);
3220
3221#ifdef CONFIG_MEMCG
3222 mem_cgroup_put(mm->lru_gen.memcg);
3223 mm->lru_gen.memcg = NULL;
3224#endif
3225}
3226
3227#ifdef CONFIG_MEMCG
3228void lru_gen_migrate_mm(struct mm_struct *mm)
3229{
3230 struct mem_cgroup *memcg;
3231 struct task_struct *task = rcu_dereference_protected(mm->owner, true);
3232
3233 VM_WARN_ON_ONCE(task->mm != mm);
3234 lockdep_assert_held(&task->alloc_lock);
3235
3236 /* for mm_update_next_owner() */
3237 if (mem_cgroup_disabled())
3238 return;
3239
3240 rcu_read_lock();
3241 memcg = mem_cgroup_from_task(task);
3242 rcu_read_unlock();
3243 if (memcg == mm->lru_gen.memcg)
3244 return;
3245
3246 VM_WARN_ON_ONCE(!mm->lru_gen.memcg);
3247 VM_WARN_ON_ONCE(list_empty(&mm->lru_gen.list));
3248
3249 lru_gen_del_mm(mm);
3250 lru_gen_add_mm(mm);
3251}
3252#endif
3253
3254/*
3255 * Bloom filters with m=1<<15, k=2 and false positive rates of ~1/5 when
3256 * n=10,000 and ~1/2 when n=20,000, where, conventionally, m is the number of
3257 * bits in a bitmap, k is the number of hash functions and n is the number of
3258 * inserted items.
3259 *
3260 * Page table walkers use one of the two filters to reduce their search space.
3261 * To get rid of non-leaf entries that no longer have enough leaf entries, the
3262 * aging uses the double-buffering technique to flip to the other filter each
3263 * time it produces a new generation. For non-leaf entries that have enough
3264 * leaf entries, the aging carries them over to the next generation in
3265 * walk_pmd_range(); the eviction also reports them when walking the rmap
3266 * in lru_gen_look_around().
3267 *
3268 * For future optimizations:
3269 * 1. It's not necessary to keep both filters all the time. The spare one can be
3270 * freed after the RCU grace period and reallocated if needed again.
3271 * 2. And when reallocating, it's worth scaling its size according to the number
3272 * of inserted entries in the other filter, to reduce the memory overhead on
3273 * small systems and false positives on large systems.
3274 * 3. Jenkins' hash function is an alternative to Knuth's.
3275 */
3276#define BLOOM_FILTER_SHIFT 15
3277
3278static inline int filter_gen_from_seq(unsigned long seq)
3279{
3280 return seq % NR_BLOOM_FILTERS;
3281}
3282
3283static void get_item_key(void *item, int *key)
3284{
3285 u32 hash = hash_ptr(item, BLOOM_FILTER_SHIFT * 2);
3286
3287 BUILD_BUG_ON(BLOOM_FILTER_SHIFT * 2 > BITS_PER_TYPE(u32));
3288
3289 key[0] = hash & (BIT(BLOOM_FILTER_SHIFT) - 1);
3290 key[1] = hash >> BLOOM_FILTER_SHIFT;
3291}
3292
3293static void reset_bloom_filter(struct lruvec *lruvec, unsigned long seq)
3294{
3295 unsigned long *filter;
3296 int gen = filter_gen_from_seq(seq);
3297
3298 filter = lruvec->mm_state.filters[gen];
3299 if (filter) {
3300 bitmap_clear(filter, 0, BIT(BLOOM_FILTER_SHIFT));
3301 return;
3302 }
3303
3304 filter = bitmap_zalloc(BIT(BLOOM_FILTER_SHIFT),
3305 __GFP_HIGH | __GFP_NOMEMALLOC | __GFP_NOWARN);
3306 WRITE_ONCE(lruvec->mm_state.filters[gen], filter);
3307}
3308
3309static void update_bloom_filter(struct lruvec *lruvec, unsigned long seq, void *item)
3310{
3311 int key[2];
3312 unsigned long *filter;
3313 int gen = filter_gen_from_seq(seq);
3314
3315 filter = READ_ONCE(lruvec->mm_state.filters[gen]);
3316 if (!filter)
3317 return;
3318
3319 get_item_key(item, key);
3320
3321 if (!test_bit(key[0], filter))
3322 set_bit(key[0], filter);
3323 if (!test_bit(key[1], filter))
3324 set_bit(key[1], filter);
3325}
3326
3327static bool test_bloom_filter(struct lruvec *lruvec, unsigned long seq, void *item)
3328{
3329 int key[2];
3330 unsigned long *filter;
3331 int gen = filter_gen_from_seq(seq);
3332
3333 filter = READ_ONCE(lruvec->mm_state.filters[gen]);
3334 if (!filter)
3335 return true;
3336
3337 get_item_key(item, key);
3338
3339 return test_bit(key[0], filter) && test_bit(key[1], filter);
3340}
3341
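Illustrative sketch (not part of vmscan.c). The false-positive figures quoted in the Bloom filter comment above agree with the standard estimate p ~ (1 - e^(-kn/m))^k: with m = 2^15 and k = 2, n = 10,000 gives (1 - e^(-0.61))^2 ~ 0.21, about 1/5, and n = 20,000 gives (1 - e^(-1.22))^2 ~ 0.50, about 1/2. The userspace sketch below mirrors the scheme used by get_item_key(), update_bloom_filter() and test_bloom_filter(): one hash is split into two 15-bit keys, so a lookup can return false positives but never false negatives. All sketch_* names are made up for illustration, and the multiplicative hash is only a stand-in for the kernel's hash_ptr().

#include <stdbool.h>
#include <stdint.h>

#define SKETCH_SHIFT	15			/* m = 1 << 15 bits, as in the comment above */
#define SKETCH_BITS	(1u << SKETCH_SHIFT)

static uint8_t sketch_filter[SKETCH_BITS / 8];

/* split one hash into k = 2 keys, mirroring get_item_key() */
static void sketch_keys(const void *item, uint32_t key[2])
{
	uint32_t hash = (uint32_t)((uintptr_t)item * 2654435761u);

	key[0] = hash & (SKETCH_BITS - 1);
	key[1] = (hash >> SKETCH_SHIFT) & (SKETCH_BITS - 1);
}

static void sketch_update(const void *item)
{
	uint32_t key[2];

	sketch_keys(item, key);
	sketch_filter[key[0] >> 3] |= 1u << (key[0] & 7);
	sketch_filter[key[1] >> 3] |= 1u << (key[1] & 7);
}

/* may return a false positive, never a false negative */
static bool sketch_test(const void *item)
{
	uint32_t key[2];

	sketch_keys(item, key);
	return (sketch_filter[key[0] >> 3] & (1u << (key[0] & 7))) &&
	       (sketch_filter[key[1] >> 3] & (1u << (key[1] & 7)));
}

Zeroing the bit array, as reset_bloom_filter() does once per generation, is what implements the double-buffering flip described in the comment above.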
3342static void reset_mm_stats(struct lruvec *lruvec, struct lru_gen_mm_walk *walk, bool last)
3343{
3344 int i;
3345 int hist;
3346
3347 lockdep_assert_held(&get_mm_list(lruvec_memcg(lruvec))->lock);
3348
3349 if (walk) {
3350 hist = lru_hist_from_seq(walk->max_seq);
3351
3352 for (i = 0; i < NR_MM_STATS; i++) {
3353 WRITE_ONCE(lruvec->mm_state.stats[hist][i],
3354 lruvec->mm_state.stats[hist][i] + walk->mm_stats[i]);
3355 walk->mm_stats[i] = 0;
3356 }
3357 }
3358
3359 if (NR_HIST_GENS > 1 && last) {
3360 hist = lru_hist_from_seq(lruvec->mm_state.seq + 1);
3361
3362 for (i = 0; i < NR_MM_STATS; i++)
3363 WRITE_ONCE(lruvec->mm_state.stats[hist][i], 0);
3364 }
3365}
3366
3367static bool should_skip_mm(struct mm_struct *mm, struct lru_gen_mm_walk *walk)
3368{
3369 int type;
3370 unsigned long size = 0;
3371 struct pglist_data *pgdat = lruvec_pgdat(walk->lruvec);
3372 int key = pgdat->node_id % BITS_PER_TYPE(mm->lru_gen.bitmap);
3373
3374 if (!walk->force_scan && !test_bit(key, &mm->lru_gen.bitmap))
3375 return true;
3376
3377 clear_bit(key, &mm->lru_gen.bitmap);
3378
3379 for (type = !walk->can_swap; type < ANON_AND_FILE; type++) {
3380 size += type ? get_mm_counter(mm, MM_FILEPAGES) :
3381 get_mm_counter(mm, MM_ANONPAGES) +
3382 get_mm_counter(mm, MM_SHMEMPAGES);
3383 }
3384
3385 if (size < MIN_LRU_BATCH)
3386 return true;
3387
3388 return !mmget_not_zero(mm);
3389}
3390
3391static bool iterate_mm_list(struct lruvec *lruvec, struct lru_gen_mm_walk *walk,
3392 struct mm_struct **iter)
3393{
3394 bool first = false;
3395 bool last = true;
3396 struct mm_struct *mm = NULL;
3397 struct mem_cgroup *memcg = lruvec_memcg(lruvec);
3398 struct lru_gen_mm_list *mm_list = get_mm_list(memcg);
3399 struct lru_gen_mm_state *mm_state = &lruvec->mm_state;
3400
3401 /*
3402 * There are four interesting cases for this page table walker:
3403 * 1. It tries to start a new iteration of mm_list with a stale max_seq;
3404 * there is nothing left to do.
3405 * 2. It's the first of the current generation, and it needs to reset
3406 * the Bloom filter for the next generation.
3407 * 3. It reaches the end of mm_list, and it needs to increment
3408 * mm_state->seq; the iteration is done.
3409 * 4. It's the last of the current generation, and it needs to reset the
3410 * mm stats counters for the next generation.
3411 */
3412 spin_lock(&mm_list->lock);
3413
3414 VM_WARN_ON_ONCE(mm_state->seq + 1 < walk->max_seq);
3415 VM_WARN_ON_ONCE(*iter && mm_state->seq > walk->max_seq);
3416 VM_WARN_ON_ONCE(*iter && !mm_state->nr_walkers);
3417
3418 if (walk->max_seq <= mm_state->seq) {
3419 if (!*iter)
3420 last = false;
3421 goto done;
3422 }
3423
3424 if (!mm_state->nr_walkers) {
3425 VM_WARN_ON_ONCE(mm_state->head && mm_state->head != &mm_list->fifo);
3426
3427 mm_state->head = mm_list->fifo.next;
3428 first = true;
3429 }
3430
3431 while (!mm && mm_state->head != &mm_list->fifo) {
3432 mm = list_entry(mm_state->head, struct mm_struct, lru_gen.list);
3433
3434 mm_state->head = mm_state->head->next;
3435
3436 /* force scan for those added after the last iteration */
3437 if (!mm_state->tail || mm_state->tail == &mm->lru_gen.list) {
3438 mm_state->tail = mm_state->head;
3439 walk->force_scan = true;
3440 }
3441
3442 if (should_skip_mm(mm, walk))
3443 mm = NULL;
3444 }
3445
3446 if (mm_state->head == &mm_list->fifo)
3447 WRITE_ONCE(mm_state->seq, mm_state->seq + 1);
3448done:
3449 if (*iter && !mm)
3450 mm_state->nr_walkers--;
3451 if (!*iter && mm)
3452 mm_state->nr_walkers++;
3453
3454 if (mm_state->nr_walkers)
3455 last = false;
3456
3457 if (*iter || last)
3458 reset_mm_stats(lruvec, walk, last);
3459
3460 spin_unlock(&mm_list->lock);
3461
3462 if (mm && first)
3463 reset_bloom_filter(lruvec, walk->max_seq + 1);
3464
3465 if (*iter)
3466 mmput_async(*iter);
3467
3468 *iter = mm;
3469
3470 return last;
3471}
3472
3473static bool iterate_mm_list_nowalk(struct lruvec *lruvec, unsigned long max_seq)
3474{
3475 bool success = false;
3476 struct mem_cgroup *memcg = lruvec_memcg(lruvec);
3477 struct lru_gen_mm_list *mm_list = get_mm_list(memcg);
3478 struct lru_gen_mm_state *mm_state = &lruvec->mm_state;
3479
3480 spin_lock(&mm_list->lock);
3481
3482 VM_WARN_ON_ONCE(mm_state->seq + 1 < max_seq);
3483
3484 if (max_seq > mm_state->seq && !mm_state->nr_walkers) {
3485 VM_WARN_ON_ONCE(mm_state->head && mm_state->head != &mm_list->fifo);
3486
3487 WRITE_ONCE(mm_state->seq, mm_state->seq + 1);
3488 reset_mm_stats(lruvec, NULL, true);
3489 success = true;
3490 }
3491
3492 spin_unlock(&mm_list->lock);
3493
3494 return success;
3495}
3496
3497/******************************************************************************
3498 * refault feedback loop
3499 ******************************************************************************/
3500
3501/*
3502 * A feedback loop based on a Proportional-Integral-Derivative (PID) controller.
3503 *
3504 * The P term is refaulted/(evicted+protected) from a tier in the generation
3505 * currently being evicted; the I term is the exponential moving average of the

--- 133 unchanged lines hidden (view full) ---

3639 new_flags |= BIT(PG_reclaim);
3640 } while (!try_cmpxchg(&folio->flags, &old_flags, new_flags));
3641
3642 lru_gen_update_size(lruvec, folio, old_gen, new_gen);
3643
3644 return new_gen;
3645}
3646
3647static void update_batch_size(struct lru_gen_mm_walk *walk, struct folio *folio,
3648 int old_gen, int new_gen)
3649{
3650 int type = folio_is_file_lru(folio);
3651 int zone = folio_zonenum(folio);
3652 int delta = folio_nr_pages(folio);
3653
3654 VM_WARN_ON_ONCE(old_gen >= MAX_NR_GENS);
3655 VM_WARN_ON_ONCE(new_gen >= MAX_NR_GENS);
3656
3657 walk->batched++;
3658
3659 walk->nr_pages[old_gen][type][zone] -= delta;
3660 walk->nr_pages[new_gen][type][zone] += delta;
3661}
3662
3663static void reset_batch_size(struct lruvec *lruvec, struct lru_gen_mm_walk *walk)
3664{
3665 int gen, type, zone;
3666 struct lru_gen_struct *lrugen = &lruvec->lrugen;
3667
3668 walk->batched = 0;
3669
3670 for_each_gen_type_zone(gen, type, zone) {
3671 enum lru_list lru = type * LRU_INACTIVE_FILE;
3672 int delta = walk->nr_pages[gen][type][zone];
3673
3674 if (!delta)
3675 continue;
3676
3677 walk->nr_pages[gen][type][zone] = 0;
3678 WRITE_ONCE(lrugen->nr_pages[gen][type][zone],
3679 lrugen->nr_pages[gen][type][zone] + delta);
3680
3681 if (lru_gen_is_active(lruvec, gen))
3682 lru += LRU_ACTIVE;
3683 __update_lru_size(lruvec, lru, zone, delta);
3684 }
3685}
3686
3687static int should_skip_vma(unsigned long start, unsigned long end, struct mm_walk *args)
3688{
3689 struct address_space *mapping;
3690 struct vm_area_struct *vma = args->vma;
3691 struct lru_gen_mm_walk *walk = args->private;
3692
3693 if (!vma_is_accessible(vma))
3694 return true;
3695
3696 if (is_vm_hugetlb_page(vma))
3697 return true;
3698
3699 if (vma->vm_flags & (VM_LOCKED | VM_SPECIAL | VM_SEQ_READ | VM_RAND_READ))
3700 return true;
3701
3702 if (vma == get_gate_vma(vma->vm_mm))
3703 return true;
3704
3705 if (vma_is_anonymous(vma))
3706 return !walk->can_swap;
3707
3708 if (WARN_ON_ONCE(!vma->vm_file || !vma->vm_file->f_mapping))
3709 return true;
3710
3711 mapping = vma->vm_file->f_mapping;
3712 if (mapping_unevictable(mapping))
3713 return true;
3714
3715 if (shmem_mapping(mapping))
3716 return !walk->can_swap;
3717
3718 /* to exclude special mappings like dax, etc. */
3719 return !mapping->a_ops->read_folio;
3720}
3721
3722/*
3723 * Some userspace memory allocators map many single-page VMAs. Instead of
3724 * returning to the PGD table for each such VMA, finish an entire PMD
3725 * table to reduce zigzags and improve cache performance.
3726 */
3727static bool get_next_vma(unsigned long mask, unsigned long size, struct mm_walk *args,
3728 unsigned long *vm_start, unsigned long *vm_end)
3729{
3730 unsigned long start = round_up(*vm_end, size);
3731 unsigned long end = (start | ~mask) + 1;
3732
3733 VM_WARN_ON_ONCE(mask & size);
3734 VM_WARN_ON_ONCE((start & mask) != (*vm_start & mask));
3735
3736 while (args->vma) {
3737 if (start >= args->vma->vm_end) {
3738 args->vma = args->vma->vm_next;
3739 continue;
3740 }
3741
3742 if (end && end <= args->vma->vm_start)
3743 return false;
3744
3745 if (should_skip_vma(args->vma->vm_start, args->vma->vm_end, args)) {
3746 args->vma = args->vma->vm_next;
3747 continue;
3748 }
3749
3750 *vm_start = max(start, args->vma->vm_start);
3751 *vm_end = min(end - 1, args->vma->vm_end - 1) + 1;
3752
3753 return true;
3754 }
3755
3756 return false;
3757}
3758
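Illustrative sketch (not kernel code) of the mask arithmetic in get_next_vma() above, assuming the common x86-64 geometry of 4 KiB pages and 2 MiB PMD coverage; the SKETCH_* constants are stand-ins, not the kernel's. round_up(*vm_end, PAGE_SIZE) picks the next page boundary to resume from, and (start | ~PMD_MASK) + 1 is the end of the PMD range containing it, so one pass never crosses into the next PTE table.

#include <stdio.h>

/* assumed geometry for the example: 4 KiB pages, 2 MiB per PMD entry */
#define SKETCH_PAGE_SIZE	0x1000UL
#define SKETCH_PMD_SIZE		0x200000UL
#define SKETCH_PMD_MASK		(~(SKETCH_PMD_SIZE - 1))

int main(void)
{
	unsigned long vm_end = 0x7f1234567000UL;	/* where the previous VMA ended */
	unsigned long start = (vm_end + SKETCH_PAGE_SIZE - 1) & ~(SKETCH_PAGE_SIZE - 1);
	unsigned long end = (start | ~SKETCH_PMD_MASK) + 1;

	/* prints start 0x7f1234567000 end 0x7f1234600000: the rest of one PMD range */
	printf("start %#lx end %#lx\n", start, end);
	return 0;
}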
3759static unsigned long get_pte_pfn(pte_t pte, struct vm_area_struct *vma, unsigned long addr)
3760{
3761 unsigned long pfn = pte_pfn(pte);
3762
3763 VM_WARN_ON_ONCE(addr < vma->vm_start || addr >= vma->vm_end);
3764
3765 if (!pte_present(pte) || is_zero_pfn(pfn))
3766 return -1;
3767
3768 if (WARN_ON_ONCE(pte_devmap(pte) || pte_special(pte)))
3769 return -1;
3770
3771 if (WARN_ON_ONCE(!pfn_valid(pfn)))
3772 return -1;
3773
3774 return pfn;
3775}
3776
3777#if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG)
3778static unsigned long get_pmd_pfn(pmd_t pmd, struct vm_area_struct *vma, unsigned long addr)
3779{
3780 unsigned long pfn = pmd_pfn(pmd);
3781
3782 VM_WARN_ON_ONCE(addr < vma->vm_start || addr >= vma->vm_end);
3783
3784 if (!pmd_present(pmd) || is_huge_zero_pmd(pmd))
3785 return -1;
3786
3787 if (WARN_ON_ONCE(pmd_devmap(pmd)))
3788 return -1;
3789
3790 if (WARN_ON_ONCE(!pfn_valid(pfn)))
3791 return -1;
3792
3793 return pfn;
3794}
3795#endif
3796
3298static struct folio *get_pfn_folio(unsigned long pfn, struct mem_cgroup *memcg,
3797static struct folio *get_pfn_folio(unsigned long pfn, struct mem_cgroup *memcg,
3299 struct pglist_data *pgdat)
3798 struct pglist_data *pgdat, bool can_swap)
3799{
3800 struct folio *folio;
3801
3802 /* try to avoid unnecessary memory loads */
3803 if (pfn < pgdat->node_start_pfn || pfn >= pgdat_end_pfn(pgdat))
3804 return NULL;
3805
3806 folio = pfn_folio(pfn);
3807 if (folio_nid(folio) != pgdat->node_id)
3808 return NULL;
3809
3810 if (folio_memcg_rcu(folio) != memcg)
3811 return NULL;
3812
3813 /* file VMAs can contain anon pages from COW */
3814 if (!folio_is_file_lru(folio) && !can_swap)
3815 return NULL;
3816
3817 return folio;
3818}
3819
3820static bool suitable_to_scan(int total, int young)
3821{
3822 int n = clamp_t(int, cache_line_size() / sizeof(pte_t), 2, 8);
3823
3824 /* suitable if the average number of young PTEs per cacheline is >=1 */
3825 return young * n >= total;
3826}
3827
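Worked example for suitable_to_scan() (an editor's illustration, assuming 64-byte cache lines and 8-byte PTEs, typical for x86-64): n = clamp(64 / 8, 2, 8) = 8, so a fully scanned 512-entry PTE table counts as suitable only when young * 8 >= 512, i.e. when at least 64 of the PTEs it walked had the accessed bit set, which is one young PTE per cache line's worth of PTEs on average.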
3828static bool walk_pte_range(pmd_t *pmd, unsigned long start, unsigned long end,
3829 struct mm_walk *args)
3830{
3831 int i;
3832 pte_t *pte;
3833 spinlock_t *ptl;
3834 unsigned long addr;
3835 int total = 0;
3836 int young = 0;
3837 struct lru_gen_mm_walk *walk = args->private;
3838 struct mem_cgroup *memcg = lruvec_memcg(walk->lruvec);
3839 struct pglist_data *pgdat = lruvec_pgdat(walk->lruvec);
3840 int old_gen, new_gen = lru_gen_from_seq(walk->max_seq);
3841
3842 VM_WARN_ON_ONCE(pmd_leaf(*pmd));
3843
3844 ptl = pte_lockptr(args->mm, pmd);
3845 if (!spin_trylock(ptl))
3846 return false;
3847
3848 arch_enter_lazy_mmu_mode();
3849
3850 pte = pte_offset_map(pmd, start & PMD_MASK);
3851restart:
3852 for (i = pte_index(start), addr = start; addr != end; i++, addr += PAGE_SIZE) {
3853 unsigned long pfn;
3854 struct folio *folio;
3855
3856 total++;
3857 walk->mm_stats[MM_LEAF_TOTAL]++;
3858
3859 pfn = get_pte_pfn(pte[i], args->vma, addr);
3860 if (pfn == -1)
3861 continue;
3862
3863 if (!pte_young(pte[i])) {
3864 walk->mm_stats[MM_LEAF_OLD]++;
3865 continue;
3866 }
3867
3868 folio = get_pfn_folio(pfn, memcg, pgdat, walk->can_swap);
3869 if (!folio)
3870 continue;
3871
3872 if (!ptep_test_and_clear_young(args->vma, addr, pte + i))
3873 VM_WARN_ON_ONCE(true);
3874
3875 young++;
3876 walk->mm_stats[MM_LEAF_YOUNG]++;
3877
3878 if (pte_dirty(pte[i]) && !folio_test_dirty(folio) &&
3879 !(folio_test_anon(folio) && folio_test_swapbacked(folio) &&
3880 !folio_test_swapcache(folio)))
3881 folio_mark_dirty(folio);
3882
3883 old_gen = folio_update_gen(folio, new_gen);
3884 if (old_gen >= 0 && old_gen != new_gen)
3885 update_batch_size(walk, folio, old_gen, new_gen);
3886 }
3887
3888 if (i < PTRS_PER_PTE && get_next_vma(PMD_MASK, PAGE_SIZE, args, &start, &end))
3889 goto restart;
3890
3891 pte_unmap(pte);
3892
3893 arch_leave_lazy_mmu_mode();
3894 spin_unlock(ptl);
3895
3896 return suitable_to_scan(total, young);
3897}
3898
3899#if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG)
3900static void walk_pmd_range_locked(pud_t *pud, unsigned long next, struct vm_area_struct *vma,
3901 struct mm_walk *args, unsigned long *bitmap, unsigned long *start)
3902{
3903 int i;
3904 pmd_t *pmd;
3905 spinlock_t *ptl;
3906 struct lru_gen_mm_walk *walk = args->private;
3907 struct mem_cgroup *memcg = lruvec_memcg(walk->lruvec);
3908 struct pglist_data *pgdat = lruvec_pgdat(walk->lruvec);
3909 int old_gen, new_gen = lru_gen_from_seq(walk->max_seq);
3910
3911 VM_WARN_ON_ONCE(pud_leaf(*pud));
3912
3913 /* try to batch at most 1+MIN_LRU_BATCH+1 entries */
3914 if (*start == -1) {
3915 *start = next;
3916 return;
3917 }
3918
3919 i = next == -1 ? 0 : pmd_index(next) - pmd_index(*start);
3920 if (i && i <= MIN_LRU_BATCH) {
3921 __set_bit(i - 1, bitmap);
3922 return;
3923 }
3924
3925 pmd = pmd_offset(pud, *start);
3926
3927 ptl = pmd_lockptr(args->mm, pmd);
3928 if (!spin_trylock(ptl))
3929 goto done;
3930
3931 arch_enter_lazy_mmu_mode();
3932
3933 do {
3934 unsigned long pfn;
3935 struct folio *folio;
3936 unsigned long addr = i ? (*start & PMD_MASK) + i * PMD_SIZE : *start;
3937
3938 pfn = get_pmd_pfn(pmd[i], vma, addr);
3939 if (pfn == -1)
3940 goto next;
3941
3942 if (!pmd_trans_huge(pmd[i])) {
3943 if (IS_ENABLED(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG))
3944 pmdp_test_and_clear_young(vma, addr, pmd + i);
3945 goto next;
3946 }
3947
3948 folio = get_pfn_folio(pfn, memcg, pgdat, walk->can_swap);
3949 if (!folio)
3950 goto next;
3951
3952 if (!pmdp_test_and_clear_young(vma, addr, pmd + i))
3953 goto next;
3954
3955 walk->mm_stats[MM_LEAF_YOUNG]++;
3956
3957 if (pmd_dirty(pmd[i]) && !folio_test_dirty(folio) &&
3958 !(folio_test_anon(folio) && folio_test_swapbacked(folio) &&
3959 !folio_test_swapcache(folio)))
3960 folio_mark_dirty(folio);
3961
3962 old_gen = folio_update_gen(folio, new_gen);
3963 if (old_gen >= 0 && old_gen != new_gen)
3964 update_batch_size(walk, folio, old_gen, new_gen);
3965next:
3966 i = i > MIN_LRU_BATCH ? 0 : find_next_bit(bitmap, MIN_LRU_BATCH, i) + 1;
3967 } while (i <= MIN_LRU_BATCH);
3968
3969 arch_leave_lazy_mmu_mode();
3970 spin_unlock(ptl);
3971done:
3972 *start = -1;
3973 bitmap_zero(bitmap, MIN_LRU_BATCH);
3974}
3975#else
3976static void walk_pmd_range_locked(pud_t *pud, unsigned long next, struct vm_area_struct *vma,
3977 struct mm_walk *args, unsigned long *bitmap, unsigned long *start)
3978{
3979}
3980#endif
3981
3982static void walk_pmd_range(pud_t *pud, unsigned long start, unsigned long end,
3983 struct mm_walk *args)
3984{
3985 int i;
3986 pmd_t *pmd;
3987 unsigned long next;
3988 unsigned long addr;
3989 struct vm_area_struct *vma;
3990 unsigned long pos = -1;
3991 struct lru_gen_mm_walk *walk = args->private;
3992 unsigned long bitmap[BITS_TO_LONGS(MIN_LRU_BATCH)] = {};
3993
3994 VM_WARN_ON_ONCE(pud_leaf(*pud));
3995
3996 /*
3997 * Finish an entire PMD in two passes: the first only reaches the PTE
3998 * tables to avoid taking the PMD lock; the second, if necessary, takes
3999 * the PMD lock to clear the accessed bit in PMD entries.
4000 */
4001 pmd = pmd_offset(pud, start & PUD_MASK);
4002restart:
4003 /* walk_pte_range() may call get_next_vma() */
4004 vma = args->vma;
4005 for (i = pmd_index(start), addr = start; addr != end; i++, addr = next) {
4006 pmd_t val = pmd_read_atomic(pmd + i);
4007
4008 /* for pmd_read_atomic() */
4009 barrier();
4010
4011 next = pmd_addr_end(addr, end);
4012
4013 if (!pmd_present(val) || is_huge_zero_pmd(val)) {
4014 walk->mm_stats[MM_LEAF_TOTAL]++;
4015 continue;
4016 }
4017
4018#ifdef CONFIG_TRANSPARENT_HUGEPAGE
4019 if (pmd_trans_huge(val)) {
4020 unsigned long pfn = pmd_pfn(val);
4021 struct pglist_data *pgdat = lruvec_pgdat(walk->lruvec);
4022
4023 walk->mm_stats[MM_LEAF_TOTAL]++;
4024
4025 if (!pmd_young(val)) {
4026 walk->mm_stats[MM_LEAF_OLD]++;
4027 continue;
4028 }
4029
4030 /* try to avoid unnecessary memory loads */
4031 if (pfn < pgdat->node_start_pfn || pfn >= pgdat_end_pfn(pgdat))
4032 continue;
4033
4034 walk_pmd_range_locked(pud, addr, vma, args, bitmap, &pos);
4035 continue;
4036 }
4037#endif
4038 walk->mm_stats[MM_NONLEAF_TOTAL]++;
4039
4040#ifdef CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG
4041 if (!pmd_young(val))
4042 continue;
4043
4044 walk_pmd_range_locked(pud, addr, vma, args, bitmap, &pos);
4045#endif
4046 if (!walk->force_scan && !test_bloom_filter(walk->lruvec, walk->max_seq, pmd + i))
4047 continue;
4048
4049 walk->mm_stats[MM_NONLEAF_FOUND]++;
4050
4051 if (!walk_pte_range(&val, addr, next, args))
4052 continue;
4053
4054 walk->mm_stats[MM_NONLEAF_ADDED]++;
4055
4056 /* carry over to the next generation */
4057 update_bloom_filter(walk->lruvec, walk->max_seq + 1, pmd + i);
4058 }
4059
4060 walk_pmd_range_locked(pud, -1, vma, args, bitmap, &pos);
4061
4062 if (i < PTRS_PER_PMD && get_next_vma(PUD_MASK, PMD_SIZE, args, &start, &end))
4063 goto restart;
4064}
4065
4066static int walk_pud_range(p4d_t *p4d, unsigned long start, unsigned long end,
4067 struct mm_walk *args)
4068{
4069 int i;
4070 pud_t *pud;
4071 unsigned long addr;
4072 unsigned long next;
4073 struct lru_gen_mm_walk *walk = args->private;
4074
4075 VM_WARN_ON_ONCE(p4d_leaf(*p4d));
4076
4077 pud = pud_offset(p4d, start & P4D_MASK);
4078restart:
4079 for (i = pud_index(start), addr = start; addr != end; i++, addr = next) {
4080 pud_t val = READ_ONCE(pud[i]);
4081
4082 next = pud_addr_end(addr, end);
4083
4084 if (!pud_present(val) || WARN_ON_ONCE(pud_leaf(val)))
4085 continue;
4086
4087 walk_pmd_range(&val, addr, next, args);
4088
4089 /* a racy check to curtail the waiting time */
4090 if (wq_has_sleeper(&walk->lruvec->mm_state.wait))
4091 return 1;
4092
4093 if (need_resched() || walk->batched >= MAX_LRU_BATCH) {
4094 end = (addr | ~PUD_MASK) + 1;
4095 goto done;
4096 }
4097 }
4098
4099 if (i < PTRS_PER_PUD && get_next_vma(P4D_MASK, PUD_SIZE, args, &start, &end))
4100 goto restart;
4101
4102 end = round_up(end, P4D_SIZE);
4103done:
4104 if (!end || !args->vma)
4105 return 1;
4106
4107 walk->next_addr = max(end, args->vma->vm_start);
4108
4109 return -EAGAIN;
4110}
4111
4112static void walk_mm(struct lruvec *lruvec, struct mm_struct *mm, struct lru_gen_mm_walk *walk)
4113{
4114 static const struct mm_walk_ops mm_walk_ops = {
4115 .test_walk = should_skip_vma,
4116 .p4d_entry = walk_pud_range,
4117 };
4118
4119 int err;
4120 struct mem_cgroup *memcg = lruvec_memcg(lruvec);
4121
4122 walk->next_addr = FIRST_USER_ADDRESS;
4123
4124 do {
4125 err = -EBUSY;
4126
4127 /* folio_update_gen() requires stable folio_memcg() */
4128 if (!mem_cgroup_trylock_pages(memcg))
4129 break;
4130
4131 /* the caller might be holding the lock for write */
4132 if (mmap_read_trylock(mm)) {
4133 err = walk_page_range(mm, walk->next_addr, ULONG_MAX, &mm_walk_ops, walk);
4134
4135 mmap_read_unlock(mm);
4136 }
4137
4138 mem_cgroup_unlock_pages();
4139
4140 if (walk->batched) {
4141 spin_lock_irq(&lruvec->lru_lock);
4142 reset_batch_size(lruvec, walk);
4143 spin_unlock_irq(&lruvec->lru_lock);
4144 }
4145
4146 cond_resched();
4147 } while (err == -EAGAIN);
4148}
4149
4150static struct lru_gen_mm_walk *set_mm_walk(struct pglist_data *pgdat)
4151{
4152 struct lru_gen_mm_walk *walk = current->reclaim_state->mm_walk;
4153
4154 if (pgdat && current_is_kswapd()) {
4155 VM_WARN_ON_ONCE(walk);
4156
4157 walk = &pgdat->mm_walk;
4158 } else if (!pgdat && !walk) {
4159 VM_WARN_ON_ONCE(current_is_kswapd());
4160
4161 walk = kzalloc(sizeof(*walk), __GFP_HIGH | __GFP_NOMEMALLOC | __GFP_NOWARN);
4162 }
4163
4164 current->reclaim_state->mm_walk = walk;
4165
4166 return walk;
4167}
4168
4169static void clear_mm_walk(void)
4170{
4171 struct lru_gen_mm_walk *walk = current->reclaim_state->mm_walk;
4172
4173 VM_WARN_ON_ONCE(walk && memchr_inv(walk->nr_pages, 0, sizeof(walk->nr_pages)));
4174 VM_WARN_ON_ONCE(walk && memchr_inv(walk->mm_stats, 0, sizeof(walk->mm_stats)));
4175
4176 current->reclaim_state->mm_walk = NULL;
4177
4178 if (!current_is_kswapd())
4179 kfree(walk);
4180}
4181
4182static void inc_min_seq(struct lruvec *lruvec, int type)
4183{
4184 struct lru_gen_struct *lrugen = &lruvec->lrugen;
4185
4186 reset_ctrl_pos(lruvec, type, true);
4187 WRITE_ONCE(lrugen->min_seq[type], lrugen->min_seq[type] + 1);
4188}
4189

--- 35 unchanged lines hidden (view full) ---

4225 reset_ctrl_pos(lruvec, type, true);
4226 WRITE_ONCE(lrugen->min_seq[type], min_seq[type]);
4227 success = true;
4228 }
4229
4230 return success;
4231}
4232
3368static void inc_max_seq(struct lruvec *lruvec, unsigned long max_seq, bool can_swap)
4233static void inc_max_seq(struct lruvec *lruvec, bool can_swap)
4234{
4235 int prev, next;
4236 int type, zone;
4237 struct lru_gen_struct *lrugen = &lruvec->lrugen;
4238
4239 spin_lock_irq(&lruvec->lru_lock);
4240
4241 VM_WARN_ON_ONCE(!seq_is_valid(lruvec));
4242
3378 if (max_seq != lrugen->max_seq)
3379 goto unlock;
3380
4243 for (type = ANON_AND_FILE - 1; type >= 0; type--) {
4244 if (get_nr_gens(lruvec, type) != MAX_NR_GENS)
4245 continue;
4246
4247 VM_WARN_ON_ONCE(type == LRU_GEN_FILE || can_swap);
4248
4249 inc_min_seq(lruvec, type);
4250 }

--- 21 unchanged lines hidden (view full) ---

4272 }
4273 }
4274
4275 for (type = 0; type < ANON_AND_FILE; type++)
4276 reset_ctrl_pos(lruvec, type, false);
4277
4278 /* make sure preceding modifications appear */
4279 smp_store_release(&lrugen->max_seq, lrugen->max_seq + 1);
3418unlock:
4280
4281 spin_unlock_irq(&lruvec->lru_lock);
4282}
4283
4284static bool try_to_inc_max_seq(struct lruvec *lruvec, unsigned long max_seq,
4285 struct scan_control *sc, bool can_swap)
4286{
4287 bool success;
4288 struct lru_gen_mm_walk *walk;
4289 struct mm_struct *mm = NULL;
4290 struct lru_gen_struct *lrugen = &lruvec->lrugen;
4291
4292 VM_WARN_ON_ONCE(max_seq > READ_ONCE(lrugen->max_seq));
4293
4294 /* see the comment in iterate_mm_list() */
4295 if (max_seq <= READ_ONCE(lruvec->mm_state.seq)) {
4296 success = false;
4297 goto done;
4298 }
4299
4300 /*
4301 * If the hardware doesn't automatically set the accessed bit, fall back
4302 * to lru_gen_look_around(), which only clears the accessed bit in a
4303 * handful of PTEs. Spreading the work out over a period of time usually
4304 * is less efficient, but it avoids bursty page faults.
4305 */
4306 if (!arch_has_hw_pte_young()) {
4307 success = iterate_mm_list_nowalk(lruvec, max_seq);
4308 goto done;
4309 }
4310
4311 walk = set_mm_walk(NULL);
4312 if (!walk) {
4313 success = iterate_mm_list_nowalk(lruvec, max_seq);
4314 goto done;
4315 }
4316
4317 walk->lruvec = lruvec;
4318 walk->max_seq = max_seq;
4319 walk->can_swap = can_swap;
4320 walk->force_scan = false;
4321
4322 do {
4323 success = iterate_mm_list(lruvec, walk, &mm);
4324 if (mm)
4325 walk_mm(lruvec, mm, walk);
4326
4327 cond_resched();
4328 } while (mm);
4329done:
4330 if (!success) {
4331 if (sc->priority <= DEF_PRIORITY - 2)
4332 wait_event_killable(lruvec->mm_state.wait,
4333 max_seq < READ_ONCE(lrugen->max_seq));
4334
4335 return max_seq < READ_ONCE(lrugen->max_seq);
4336 }
4337
4338 VM_WARN_ON_ONCE(max_seq != READ_ONCE(lrugen->max_seq));
4339
4340 inc_max_seq(lruvec, can_swap);
4341 /* either this sees any waiters or they will see updated max_seq */
4342 if (wq_has_sleeper(&lruvec->mm_state.wait))
4343 wake_up_all(&lruvec->mm_state.wait);
4344
4345 wakeup_flusher_threads(WB_REASON_VMSCAN);
4346
4347 return true;
4348}
4349
4350static bool should_run_aging(struct lruvec *lruvec, unsigned long max_seq, unsigned long *min_seq,
4351 struct scan_control *sc, bool can_swap, unsigned long *nr_to_scan)
4352{
4353 int gen, type, zone;
4354 unsigned long old = 0;
4355 unsigned long young = 0;
4356 unsigned long total = 0;
4357 struct lru_gen_struct *lrugen = &lruvec->lrugen;

--- 59 unchanged lines hidden (view full) ---

4417
4418 mem_cgroup_calculate_protection(NULL, memcg);
4419
4420 if (mem_cgroup_below_min(memcg))
4421 return;
4422
4423 need_aging = should_run_aging(lruvec, max_seq, min_seq, sc, swappiness, &nr_to_scan);
4424 if (need_aging)
3497 inc_max_seq(lruvec, max_seq, swappiness);
4425 try_to_inc_max_seq(lruvec, max_seq, sc, swappiness);
4426}
4427
4428static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc)
4429{
4430 struct mem_cgroup *memcg;
4431
4432 VM_WARN_ON_ONCE(!current_is_kswapd());
4433
4434 set_mm_walk(pgdat);
4435
4436 memcg = mem_cgroup_iter(NULL, NULL, NULL);
4437 do {
4438 struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat);
4439
4440 age_lruvec(lruvec, sc);
4441
4442 cond_resched();
4443 } while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)));
4444
4445 clear_mm_walk();
4446}
4447
4448/*
4449 * This function exploits spatial locality when shrink_page_list() walks the
3518 * rmap. It scans the adjacent PTEs of a young PTE and promotes hot pages.
4450 * rmap. It scans the adjacent PTEs of a young PTE and promotes hot pages. If
4451 * the scan was done cacheline efficiently, it adds the PMD entry pointing to
4452 * the PTE table to the Bloom filter. This forms a feedback loop between the
4453 * eviction and the aging.
4454 */
4455void lru_gen_look_around(struct page_vma_mapped_walk *pvmw)
4456{
4457 int i;
4458 pte_t *pte;
4459 unsigned long start;
4460 unsigned long end;
4461 unsigned long addr;
4462 struct lru_gen_mm_walk *walk;
4463 int young = 0;
4464 unsigned long bitmap[BITS_TO_LONGS(MIN_LRU_BATCH)] = {};
4465 struct folio *folio = pfn_folio(pvmw->pfn);
4466 struct mem_cgroup *memcg = folio_memcg(folio);
4467 struct pglist_data *pgdat = folio_pgdat(folio);
4468 struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat);
4469 DEFINE_MAX_SEQ(lruvec);
4470 int old_gen, new_gen = lru_gen_from_seq(max_seq);
4471
4472 lockdep_assert_held(pvmw->ptl);
4473 VM_WARN_ON_ONCE_FOLIO(folio_test_lru(folio), folio);
4474
4475 if (spin_is_contended(pvmw->ptl))
4476 return;
4477
4478 /* avoid taking the LRU lock under the PTL when possible */
4479 walk = current->reclaim_state ? current->reclaim_state->mm_walk : NULL;
4480
4481 start = max(pvmw->address & PMD_MASK, pvmw->vma->vm_start);
4482 end = min(pvmw->address | ~PMD_MASK, pvmw->vma->vm_end - 1) + 1;
4483
4484 if (end - start > MIN_LRU_BATCH * PAGE_SIZE) {
4485 if (pvmw->address - start < MIN_LRU_BATCH * PAGE_SIZE / 2)
4486 end = start + MIN_LRU_BATCH * PAGE_SIZE;
4487 else if (end - pvmw->address < MIN_LRU_BATCH * PAGE_SIZE / 2)
4488 start = end - MIN_LRU_BATCH * PAGE_SIZE;

--- 13 unchanged lines hidden (view full) ---

4502
4503 pfn = get_pte_pfn(pte[i], pvmw->vma, addr);
4504 if (pfn == -1)
4505 continue;
4506
4507 if (!pte_young(pte[i]))
4508 continue;
4509
3570 folio = get_pfn_folio(pfn, memcg, pgdat);
4510 folio = get_pfn_folio(pfn, memcg, pgdat, !walk || walk->can_swap);
4511 if (!folio)
4512 continue;
4513
4514 if (!ptep_test_and_clear_young(pvmw->vma, addr, pte + i))
4515 VM_WARN_ON_ONCE(true);
4516
4517 young++;
4518
4519 if (pte_dirty(pte[i]) && !folio_test_dirty(folio) &&
4520 !(folio_test_anon(folio) && folio_test_swapbacked(folio) &&
4521 !folio_test_swapcache(folio)))
4522 folio_mark_dirty(folio);
4523
4524 old_gen = folio_lru_gen(folio);
4525 if (old_gen < 0)
4526 folio_set_referenced(folio);
4527 else if (old_gen != new_gen)
4528 __set_bit(i, bitmap);
4529 }
4530
4531 arch_leave_lazy_mmu_mode();
4532 rcu_read_unlock();
4533
3592 if (bitmap_weight(bitmap, MIN_LRU_BATCH) < PAGEVEC_SIZE) {
4534 /* feedback from rmap walkers to page table walkers */
4535 if (suitable_to_scan(i, young))
4536 update_bloom_filter(lruvec, max_seq, pvmw->pmd);
4537
4538 if (!walk && bitmap_weight(bitmap, MIN_LRU_BATCH) < PAGEVEC_SIZE) {
4539 for_each_set_bit(i, bitmap, MIN_LRU_BATCH) {
4540 folio = pfn_folio(pte_pfn(pte[i]));
4541 folio_activate(folio);
4542 }
4543 return;
4544 }
4545
4546 /* folio_update_gen() requires stable folio_memcg() */
4547 if (!mem_cgroup_trylock_pages(memcg))
4548 return;
4549
3604 spin_lock_irq(&lruvec->lru_lock);
3605 new_gen = lru_gen_from_seq(lruvec->lrugen.max_seq);
4550 if (!walk) {
4551 spin_lock_irq(&lruvec->lru_lock);
4552 new_gen = lru_gen_from_seq(lruvec->lrugen.max_seq);
4553 }
4554
4555 for_each_set_bit(i, bitmap, MIN_LRU_BATCH) {
4556 folio = pfn_folio(pte_pfn(pte[i]));
4557 if (folio_memcg_rcu(folio) != memcg)
4558 continue;
4559
4560 old_gen = folio_update_gen(folio, new_gen);
4561 if (old_gen < 0 || old_gen == new_gen)
4562 continue;
4563
3616 lru_gen_update_size(lruvec, folio, old_gen, new_gen);
4564 if (walk)
4565 update_batch_size(walk, folio, old_gen, new_gen);
4566 else
4567 lru_gen_update_size(lruvec, folio, old_gen, new_gen);
4568 }
4569
3619 spin_unlock_irq(&lruvec->lru_lock);
4570 if (!walk)
4571 spin_unlock_irq(&lruvec->lru_lock);
4572
4573 mem_cgroup_unlock_pages();
4574}
4575
4576/******************************************************************************
4577 * the eviction
4578 ******************************************************************************/
4579

--- 266 unchanged lines hidden (view full) ---

4846{
4847 int type;
4848 int scanned;
4849 int reclaimed;
4850 LIST_HEAD(list);
4851 struct folio *folio;
4852 enum vm_event_item item;
4853 struct reclaim_stat stat;
4854 struct lru_gen_mm_walk *walk;
4855 struct mem_cgroup *memcg = lruvec_memcg(lruvec);
4856 struct pglist_data *pgdat = lruvec_pgdat(lruvec);
4857
4858 spin_lock_irq(&lruvec->lru_lock);
4859
4860 scanned = isolate_folios(lruvec, sc, swappiness, &type, &list);
4861
4862 scanned += try_to_inc_min_seq(lruvec, swappiness);

--- 20 unchanged lines hidden (view full) ---

4883 else
4884 folio_set_active(folio);
4885 }
4886
4887 spin_lock_irq(&lruvec->lru_lock);
4888
4889 move_pages_to_lru(lruvec, &list);
4890
4891 walk = current->reclaim_state->mm_walk;
4892 if (walk && walk->batched)
4893 reset_batch_size(lruvec, walk);
4894
4895 item = current_is_kswapd() ? PGSTEAL_KSWAPD : PGSTEAL_DIRECT;
4896 if (!cgroup_reclaim(sc))
4897 __count_vm_events(item, reclaimed);
4898 __count_memcg_events(memcg, item, reclaimed);
4899 __count_vm_events(PGSTEAL_ANON + type, reclaimed);
4900
4901 spin_unlock_irq(&lruvec->lru_lock);
4902
4903 mem_cgroup_uncharge_list(&list);
4904 free_unref_page_list(&list);
4905
4906 sc->nr_reclaimed += reclaimed;
4907
4908 return scanned;
4909}
4910
4911/*
4912 * For future optimizations:
4913 * 1. Defer try_to_inc_max_seq() to workqueues to reduce latency for memcg
4914 * reclaim.
4915 */
4916static unsigned long get_nr_to_scan(struct lruvec *lruvec, struct scan_control *sc,
4917 bool can_swap)
4918{
4919 bool need_aging;
4920 unsigned long nr_to_scan;
4921 struct mem_cgroup *memcg = lruvec_memcg(lruvec);
4922 DEFINE_MAX_SEQ(lruvec);
4923 DEFINE_MIN_SEQ(lruvec);

--- 9 unchanged lines hidden (view full) ---

4933 /* skip the aging path at the default priority */
4934 if (sc->priority == DEF_PRIORITY)
4935 goto done;
4936
4937 /* leave the work to lru_gen_age_node() */
4938 if (current_is_kswapd())
4939 return 0;
4940
3979 inc_max_seq(lruvec, max_seq, can_swap);
4941 if (try_to_inc_max_seq(lruvec, max_seq, sc, can_swap))
4942 return nr_to_scan;
4943done:
4944 return min_seq[!can_swap] + MIN_NR_GENS <= max_seq ? nr_to_scan : 0;
4945}
4946
4947static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
4948{
4949 struct blk_plug plug;
4950 unsigned long scanned = 0;
4951
4952 lru_add_drain();
4953
4954 blk_start_plug(&plug);
4955
4956 set_mm_walk(lruvec_pgdat(lruvec));
4957
4958 while (true) {
4959 int delta;
4960 int swappiness;
4961 unsigned long nr_to_scan;
4962
4963 if (sc->may_swap)
4964 swappiness = get_swappiness(lruvec, sc);
4965 else if (!cgroup_reclaim(sc) && get_swappiness(lruvec, sc))

--- 11 unchanged lines hidden (view full) ---

4977
4978 scanned += delta;
4979 if (scanned >= nr_to_scan)
4980 break;
4981
4982 cond_resched();
4983 }
4984
4985 clear_mm_walk();
4986
4987 blk_finish_plug(&plug);
4988}
4989
4990/******************************************************************************
4991 * initialization
4992 ******************************************************************************/
4993
4994void lru_gen_init_lruvec(struct lruvec *lruvec)
4995{
4996 int gen, type, zone;
4997 struct lru_gen_struct *lrugen = &lruvec->lrugen;
4998
4999 lrugen->max_seq = MIN_NR_GENS + 1;
5000
5001 for_each_gen_type_zone(gen, type, zone)
5002 INIT_LIST_HEAD(&lrugen->lists[gen][type][zone]);
5003
5004 lruvec->mm_state.seq = MIN_NR_GENS;
5005 init_waitqueue_head(&lruvec->mm_state.wait);
5006}
5007
5008#ifdef CONFIG_MEMCG
5009void lru_gen_init_memcg(struct mem_cgroup *memcg)
5010{
5011 INIT_LIST_HEAD(&memcg->mm_list.fifo);
5012 spin_lock_init(&memcg->mm_list.lock);
5013}
5014
5015void lru_gen_exit_memcg(struct mem_cgroup *memcg)
5016{
5017 int i;
5018 int nid;
5019
5020 for_each_node(nid) {
5021 struct lruvec *lruvec = get_lruvec(memcg, nid);
5022
5023 VM_WARN_ON_ONCE(memchr_inv(lruvec->lrugen.nr_pages, 0,
5024 sizeof(lruvec->lrugen.nr_pages)));
5025
5026 for (i = 0; i < NR_BLOOM_FILTERS; i++) {
5027 bitmap_free(lruvec->mm_state.filters[i]);
5028 lruvec->mm_state.filters[i] = NULL;
5029 }
5030 }
5031}
5032#endif
5033
5034static int __init init_lru_gen(void)
5035{
5036 BUILD_BUG_ON(MIN_NR_GENS + 1 >= MAX_NR_GENS);
5037 BUILD_BUG_ON(BIT(LRU_GEN_WIDTH) <= MAX_NR_GENS);

--- 1907 unchanged lines hidden ---