vmscan.c (old: 018ee47f14893d500131dfca2ff9f3ff8ebd4ed2) | vmscan.c (new: bd74fdaea146029e4fa12c6de89adbe0779348a9) |
---|---|
1// SPDX-License-Identifier: GPL-2.0 2/* 3 * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds 4 * 5 * Swap reorganised 29.12.95, Stephen Tweedie. 6 * kswapd added: 7.1.96 sct 7 * Removed kswapd_ctl limits, and swap out as many pages as needed 8 * to bring the system back to freepages.high: 2.4.97, Rik van Riel. --- 35 unchanged lines hidden (view full) --- 44#include <linux/delayacct.h> 45#include <linux/sysctl.h> 46#include <linux/oom.h> 47#include <linux/pagevec.h> 48#include <linux/prefetch.h> 49#include <linux/printk.h> 50#include <linux/dax.h> 51#include <linux/psi.h> | 1// SPDX-License-Identifier: GPL-2.0 2/* 3 * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds 4 * 5 * Swap reorganised 29.12.95, Stephen Tweedie. 6 * kswapd added: 7.1.96 sct 7 * Removed kswapd_ctl limits, and swap out as many pages as needed 8 * to bring the system back to freepages.high: 2.4.97, Rik van Riel. --- 35 unchanged lines hidden (view full) --- 44#include <linux/delayacct.h> 45#include <linux/sysctl.h> 46#include <linux/oom.h> 47#include <linux/pagevec.h> 48#include <linux/prefetch.h> 49#include <linux/printk.h> 50#include <linux/dax.h> 51#include <linux/psi.h> |
52#include <linux/pagewalk.h> 53#include <linux/shmem_fs.h> |
|
52 53#include <asm/tlbflush.h> 54#include <asm/div64.h> 55 56#include <linux/swapops.h> 57#include <linux/balloon_compaction.h> 58#include <linux/sched/sysctl.h> 59 --- 3017 unchanged lines hidden (view full) --- 3077 READ_ONCE((lruvec)->lrugen.min_seq[LRU_GEN_FILE]), \ 3078 } 3079 3080#define for_each_gen_type_zone(gen, type, zone) \ 3081 for ((gen) = 0; (gen) < MAX_NR_GENS; (gen)++) \ 3082 for ((type) = 0; (type) < ANON_AND_FILE; (type)++) \ 3083 for ((zone) = 0; (zone) < MAX_NR_ZONES; (zone)++) 3084 | 54 55#include <asm/tlbflush.h> 56#include <asm/div64.h> 57 58#include <linux/swapops.h> 59#include <linux/balloon_compaction.h> 60#include <linux/sched/sysctl.h> 61 --- 3017 unchanged lines hidden (view full) --- 3079 READ_ONCE((lruvec)->lrugen.min_seq[LRU_GEN_FILE]), \ 3080 } 3081 3082#define for_each_gen_type_zone(gen, type, zone) \ 3083 for ((gen) = 0; (gen) < MAX_NR_GENS; (gen)++) \ 3084 for ((type) = 0; (type) < ANON_AND_FILE; (type)++) \ 3085 for ((zone) = 0; (zone) < MAX_NR_ZONES; (zone)++) 3086 |
3085static struct lruvec __maybe_unused *get_lruvec(struct mem_cgroup *memcg, int nid) | 3087static struct lruvec *get_lruvec(struct mem_cgroup *memcg, int nid) |
3086{ 3087 struct pglist_data *pgdat = NODE_DATA(nid); 3088 3089#ifdef CONFIG_MEMCG 3090 if (memcg) { 3091 struct lruvec *lruvec = &memcg->nodeinfo[nid]->lruvec; 3092 3093 /* for hotadd_new_pgdat() */ --- 29 unchanged lines hidden (view full) --- 3123{ 3124 /* see the comment on lru_gen_struct */ 3125 return get_nr_gens(lruvec, LRU_GEN_FILE) >= MIN_NR_GENS && 3126 get_nr_gens(lruvec, LRU_GEN_FILE) <= get_nr_gens(lruvec, LRU_GEN_ANON) && 3127 get_nr_gens(lruvec, LRU_GEN_ANON) <= MAX_NR_GENS; 3128} 3129 3130/****************************************************************************** | 3088{ 3089 struct pglist_data *pgdat = NODE_DATA(nid); 3090 3091#ifdef CONFIG_MEMCG 3092 if (memcg) { 3093 struct lruvec *lruvec = &memcg->nodeinfo[nid]->lruvec; 3094 3095 /* for hotadd_new_pgdat() */ --- 29 unchanged lines hidden (view full) --- 3125{ 3126 /* see the comment on lru_gen_struct */ 3127 return get_nr_gens(lruvec, LRU_GEN_FILE) >= MIN_NR_GENS && 3128 get_nr_gens(lruvec, LRU_GEN_FILE) <= get_nr_gens(lruvec, LRU_GEN_ANON) && 3129 get_nr_gens(lruvec, LRU_GEN_ANON) <= MAX_NR_GENS; 3130} 3131 3132/****************************************************************************** |
3133 * mm_struct list 3134 ******************************************************************************/ 3135 3136static struct lru_gen_mm_list *get_mm_list(struct mem_cgroup *memcg) 3137{ 3138 static struct lru_gen_mm_list mm_list = { 3139 .fifo = LIST_HEAD_INIT(mm_list.fifo), 3140 .lock = __SPIN_LOCK_UNLOCKED(mm_list.lock), 3141 }; 3142 3143#ifdef CONFIG_MEMCG 3144 if (memcg) 3145 return &memcg->mm_list; 3146#endif 3147 VM_WARN_ON_ONCE(!mem_cgroup_disabled()); 3148 3149 return &mm_list; 3150} 3151 3152void lru_gen_add_mm(struct mm_struct *mm) 3153{ 3154 int nid; 3155 struct mem_cgroup *memcg = get_mem_cgroup_from_mm(mm); 3156 struct lru_gen_mm_list *mm_list = get_mm_list(memcg); 3157 3158 VM_WARN_ON_ONCE(!list_empty(&mm->lru_gen.list)); 3159#ifdef CONFIG_MEMCG 3160 VM_WARN_ON_ONCE(mm->lru_gen.memcg); 3161 mm->lru_gen.memcg = memcg; 3162#endif 3163 spin_lock(&mm_list->lock); 3164 3165 for_each_node_state(nid, N_MEMORY) { 3166 struct lruvec *lruvec = get_lruvec(memcg, nid); 3167 3168 if (!lruvec) 3169 continue; 3170 3171 /* the first addition since the last iteration */ 3172 if (lruvec->mm_state.tail == &mm_list->fifo) 3173 lruvec->mm_state.tail = &mm->lru_gen.list; 3174 } 3175 3176 list_add_tail(&mm->lru_gen.list, &mm_list->fifo); 3177 3178 spin_unlock(&mm_list->lock); 3179} 3180 3181void lru_gen_del_mm(struct mm_struct *mm) 3182{ 3183 int nid; 3184 struct lru_gen_mm_list *mm_list; 3185 struct mem_cgroup *memcg = NULL; 3186 3187 if (list_empty(&mm->lru_gen.list)) 3188 return; 3189 3190#ifdef CONFIG_MEMCG 3191 memcg = mm->lru_gen.memcg; 3192#endif 3193 mm_list = get_mm_list(memcg); 3194 3195 spin_lock(&mm_list->lock); 3196 3197 for_each_node(nid) { 3198 struct lruvec *lruvec = get_lruvec(memcg, nid); 3199 3200 if (!lruvec) 3201 continue; 3202 3203 /* where the last iteration ended (exclusive) */ 3204 if (lruvec->mm_state.tail == &mm->lru_gen.list) 3205 lruvec->mm_state.tail = lruvec->mm_state.tail->next; 3206 3207 /* where the current iteration continues (inclusive) */ 3208 if (lruvec->mm_state.head != &mm->lru_gen.list) 3209 continue; 3210 3211 lruvec->mm_state.head = lruvec->mm_state.head->next; 3212 /* the deletion ends the current iteration */ 3213 if (lruvec->mm_state.head == &mm_list->fifo) 3214 WRITE_ONCE(lruvec->mm_state.seq, lruvec->mm_state.seq + 1); 3215 } 3216 3217 list_del_init(&mm->lru_gen.list); 3218 3219 spin_unlock(&mm_list->lock); 3220 3221#ifdef CONFIG_MEMCG 3222 mem_cgroup_put(mm->lru_gen.memcg); 3223 mm->lru_gen.memcg = NULL; 3224#endif 3225} 3226 3227#ifdef CONFIG_MEMCG 3228void lru_gen_migrate_mm(struct mm_struct *mm) 3229{ 3230 struct mem_cgroup *memcg; 3231 struct task_struct *task = rcu_dereference_protected(mm->owner, true); 3232 3233 VM_WARN_ON_ONCE(task->mm != mm); 3234 lockdep_assert_held(&task->alloc_lock); 3235 3236 /* for mm_update_next_owner() */ 3237 if (mem_cgroup_disabled()) 3238 return; 3239 3240 rcu_read_lock(); 3241 memcg = mem_cgroup_from_task(task); 3242 rcu_read_unlock(); 3243 if (memcg == mm->lru_gen.memcg) 3244 return; 3245 3246 VM_WARN_ON_ONCE(!mm->lru_gen.memcg); 3247 VM_WARN_ON_ONCE(list_empty(&mm->lru_gen.list)); 3248 3249 lru_gen_del_mm(mm); 3250 lru_gen_add_mm(mm); 3251} 3252#endif 3253 3254/* 3255 * Bloom filters with m=1<<15, k=2 and the false positive rates of ~1/5 when 3256 * n=10,000 and ~1/2 when n=20,000, where, conventionally, m is the number of 3257 * bits in a bitmap, k is the number of hash functions and n is the number of 3258 * inserted items. 
3259 * 3260 * Page table walkers use one of the two filters to reduce their search space. 3261 * To get rid of non-leaf entries that no longer have enough leaf entries, the 3262 * aging uses the double-buffering technique to flip to the other filter each 3263 * time it produces a new generation. For non-leaf entries that have enough 3264 * leaf entries, the aging carries them over to the next generation in 3265 * walk_pmd_range(); the eviction also report them when walking the rmap 3266 * in lru_gen_look_around(). 3267 * 3268 * For future optimizations: 3269 * 1. It's not necessary to keep both filters all the time. The spare one can be 3270 * freed after the RCU grace period and reallocated if needed again. 3271 * 2. And when reallocating, it's worth scaling its size according to the number 3272 * of inserted entries in the other filter, to reduce the memory overhead on 3273 * small systems and false positives on large systems. 3274 * 3. Jenkins' hash function is an alternative to Knuth's. 3275 */ 3276#define BLOOM_FILTER_SHIFT 15 3277 3278static inline int filter_gen_from_seq(unsigned long seq) 3279{ 3280 return seq % NR_BLOOM_FILTERS; 3281} 3282 3283static void get_item_key(void *item, int *key) 3284{ 3285 u32 hash = hash_ptr(item, BLOOM_FILTER_SHIFT * 2); 3286 3287 BUILD_BUG_ON(BLOOM_FILTER_SHIFT * 2 > BITS_PER_TYPE(u32)); 3288 3289 key[0] = hash & (BIT(BLOOM_FILTER_SHIFT) - 1); 3290 key[1] = hash >> BLOOM_FILTER_SHIFT; 3291} 3292 3293static void reset_bloom_filter(struct lruvec *lruvec, unsigned long seq) 3294{ 3295 unsigned long *filter; 3296 int gen = filter_gen_from_seq(seq); 3297 3298 filter = lruvec->mm_state.filters[gen]; 3299 if (filter) { 3300 bitmap_clear(filter, 0, BIT(BLOOM_FILTER_SHIFT)); 3301 return; 3302 } 3303 3304 filter = bitmap_zalloc(BIT(BLOOM_FILTER_SHIFT), 3305 __GFP_HIGH | __GFP_NOMEMALLOC | __GFP_NOWARN); 3306 WRITE_ONCE(lruvec->mm_state.filters[gen], filter); 3307} 3308 3309static void update_bloom_filter(struct lruvec *lruvec, unsigned long seq, void *item) 3310{ 3311 int key[2]; 3312 unsigned long *filter; 3313 int gen = filter_gen_from_seq(seq); 3314 3315 filter = READ_ONCE(lruvec->mm_state.filters[gen]); 3316 if (!filter) 3317 return; 3318 3319 get_item_key(item, key); 3320 3321 if (!test_bit(key[0], filter)) 3322 set_bit(key[0], filter); 3323 if (!test_bit(key[1], filter)) 3324 set_bit(key[1], filter); 3325} 3326 3327static bool test_bloom_filter(struct lruvec *lruvec, unsigned long seq, void *item) 3328{ 3329 int key[2]; 3330 unsigned long *filter; 3331 int gen = filter_gen_from_seq(seq); 3332 3333 filter = READ_ONCE(lruvec->mm_state.filters[gen]); 3334 if (!filter) 3335 return true; 3336 3337 get_item_key(item, key); 3338 3339 return test_bit(key[0], filter) && test_bit(key[1], filter); 3340} 3341 3342static void reset_mm_stats(struct lruvec *lruvec, struct lru_gen_mm_walk *walk, bool last) 3343{ 3344 int i; 3345 int hist; 3346 3347 lockdep_assert_held(&get_mm_list(lruvec_memcg(lruvec))->lock); 3348 3349 if (walk) { 3350 hist = lru_hist_from_seq(walk->max_seq); 3351 3352 for (i = 0; i < NR_MM_STATS; i++) { 3353 WRITE_ONCE(lruvec->mm_state.stats[hist][i], 3354 lruvec->mm_state.stats[hist][i] + walk->mm_stats[i]); 3355 walk->mm_stats[i] = 0; 3356 } 3357 } 3358 3359 if (NR_HIST_GENS > 1 && last) { 3360 hist = lru_hist_from_seq(lruvec->mm_state.seq + 1); 3361 3362 for (i = 0; i < NR_MM_STATS; i++) 3363 WRITE_ONCE(lruvec->mm_state.stats[hist][i], 0); 3364 } 3365} 3366 3367static bool should_skip_mm(struct mm_struct *mm, struct lru_gen_mm_walk *walk) 3368{ 
3369 int type; 3370 unsigned long size = 0; 3371 struct pglist_data *pgdat = lruvec_pgdat(walk->lruvec); 3372 int key = pgdat->node_id % BITS_PER_TYPE(mm->lru_gen.bitmap); 3373 3374 if (!walk->force_scan && !test_bit(key, &mm->lru_gen.bitmap)) 3375 return true; 3376 3377 clear_bit(key, &mm->lru_gen.bitmap); 3378 3379 for (type = !walk->can_swap; type < ANON_AND_FILE; type++) { 3380 size += type ? get_mm_counter(mm, MM_FILEPAGES) : 3381 get_mm_counter(mm, MM_ANONPAGES) + 3382 get_mm_counter(mm, MM_SHMEMPAGES); 3383 } 3384 3385 if (size < MIN_LRU_BATCH) 3386 return true; 3387 3388 return !mmget_not_zero(mm); 3389} 3390 3391static bool iterate_mm_list(struct lruvec *lruvec, struct lru_gen_mm_walk *walk, 3392 struct mm_struct **iter) 3393{ 3394 bool first = false; 3395 bool last = true; 3396 struct mm_struct *mm = NULL; 3397 struct mem_cgroup *memcg = lruvec_memcg(lruvec); 3398 struct lru_gen_mm_list *mm_list = get_mm_list(memcg); 3399 struct lru_gen_mm_state *mm_state = &lruvec->mm_state; 3400 3401 /* 3402 * There are four interesting cases for this page table walker: 3403 * 1. It tries to start a new iteration of mm_list with a stale max_seq; 3404 * there is nothing left to do. 3405 * 2. It's the first of the current generation, and it needs to reset 3406 * the Bloom filter for the next generation. 3407 * 3. It reaches the end of mm_list, and it needs to increment 3408 * mm_state->seq; the iteration is done. 3409 * 4. It's the last of the current generation, and it needs to reset the 3410 * mm stats counters for the next generation. 3411 */ 3412 spin_lock(&mm_list->lock); 3413 3414 VM_WARN_ON_ONCE(mm_state->seq + 1 < walk->max_seq); 3415 VM_WARN_ON_ONCE(*iter && mm_state->seq > walk->max_seq); 3416 VM_WARN_ON_ONCE(*iter && !mm_state->nr_walkers); 3417 3418 if (walk->max_seq <= mm_state->seq) { 3419 if (!*iter) 3420 last = false; 3421 goto done; 3422 } 3423 3424 if (!mm_state->nr_walkers) { 3425 VM_WARN_ON_ONCE(mm_state->head && mm_state->head != &mm_list->fifo); 3426 3427 mm_state->head = mm_list->fifo.next; 3428 first = true; 3429 } 3430 3431 while (!mm && mm_state->head != &mm_list->fifo) { 3432 mm = list_entry(mm_state->head, struct mm_struct, lru_gen.list); 3433 3434 mm_state->head = mm_state->head->next; 3435 3436 /* force scan for those added after the last iteration */ 3437 if (!mm_state->tail || mm_state->tail == &mm->lru_gen.list) { 3438 mm_state->tail = mm_state->head; 3439 walk->force_scan = true; 3440 } 3441 3442 if (should_skip_mm(mm, walk)) 3443 mm = NULL; 3444 } 3445 3446 if (mm_state->head == &mm_list->fifo) 3447 WRITE_ONCE(mm_state->seq, mm_state->seq + 1); 3448done: 3449 if (*iter && !mm) 3450 mm_state->nr_walkers--; 3451 if (!*iter && mm) 3452 mm_state->nr_walkers++; 3453 3454 if (mm_state->nr_walkers) 3455 last = false; 3456 3457 if (*iter || last) 3458 reset_mm_stats(lruvec, walk, last); 3459 3460 spin_unlock(&mm_list->lock); 3461 3462 if (mm && first) 3463 reset_bloom_filter(lruvec, walk->max_seq + 1); 3464 3465 if (*iter) 3466 mmput_async(*iter); 3467 3468 *iter = mm; 3469 3470 return last; 3471} 3472 3473static bool iterate_mm_list_nowalk(struct lruvec *lruvec, unsigned long max_seq) 3474{ 3475 bool success = false; 3476 struct mem_cgroup *memcg = lruvec_memcg(lruvec); 3477 struct lru_gen_mm_list *mm_list = get_mm_list(memcg); 3478 struct lru_gen_mm_state *mm_state = &lruvec->mm_state; 3479 3480 spin_lock(&mm_list->lock); 3481 3482 VM_WARN_ON_ONCE(mm_state->seq + 1 < max_seq); 3483 3484 if (max_seq > mm_state->seq && !mm_state->nr_walkers) { 3485 
VM_WARN_ON_ONCE(mm_state->head && mm_state->head != &mm_list->fifo); 3486 3487 WRITE_ONCE(mm_state->seq, mm_state->seq + 1); 3488 reset_mm_stats(lruvec, NULL, true); 3489 success = true; 3490 } 3491 3492 spin_unlock(&mm_list->lock); 3493 3494 return success; 3495} 3496 3497/****************************************************************************** |
|
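The comment above gives the parameters of the two Bloom filters (m = 1<<15 bits, k = 2 hashes derived from one hash value, double-buffered per generation). Below is a small standalone C sketch of that idea, not the kernel's implementation: the hash is a stand-in for hash_ptr(), and the byte-array bit helpers replace set_bit()/test_bit().

```c
/*
 * Standalone sketch of the scheme described above: m = 1 << 15 bits,
 * k = 2 hash functions derived from one hash value, and two filters
 * flipped ("double buffered") per generation.
 */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define BLOOM_SHIFT	15
#define BLOOM_BITS	(1UL << BLOOM_SHIFT)
#define NR_FILTERS	2

static uint8_t filters[NR_FILTERS][BLOOM_BITS / 8];

static void get_keys(const void *item, uint32_t key[2])
{
	uint64_t x = (uintptr_t)item;

	/* cheap mix standing in for hash_ptr(item, 2 * BLOOM_SHIFT) */
	x ^= x >> 33;
	x *= 0xff51afd7ed558ccdULL;
	x ^= x >> 29;

	key[0] = x & (BLOOM_BITS - 1);
	key[1] = (x >> BLOOM_SHIFT) & (BLOOM_BITS - 1);
}

static void set_bit_u8(uint8_t *bm, uint32_t bit)
{
	bm[bit / 8] |= 1u << (bit % 8);
}

static int test_bit_u8(const uint8_t *bm, uint32_t bit)
{
	return (bm[bit / 8] >> (bit % 8)) & 1;
}

/* the filter in use for a given generation, as in filter_gen_from_seq() */
static uint8_t *filter_for(unsigned long seq)
{
	return filters[seq % NR_FILTERS];
}

static void bloom_add(unsigned long seq, const void *item)
{
	uint32_t key[2];

	get_keys(item, key);
	set_bit_u8(filter_for(seq), key[0]);
	set_bit_u8(filter_for(seq), key[1]);
}

static int bloom_maybe_contains(unsigned long seq, const void *item)
{
	uint32_t key[2];

	get_keys(item, key);
	return test_bit_u8(filter_for(seq), key[0]) &&
	       test_bit_u8(filter_for(seq), key[1]);
}

/* a new generation clears and reuses the filter not currently in use */
static void bloom_reset(unsigned long new_seq)
{
	memset(filter_for(new_seq), 0, sizeof(filters[0]));
}

int main(void)
{
	int x;
	unsigned long seq = 0;

	bloom_add(seq, &x);
	printf("gen 0 sees item: %d\n", bloom_maybe_contains(seq, &x));     /* 1 */
	bloom_reset(seq + 1);                          /* the aging flips filters */
	printf("gen 1 sees item: %d\n", bloom_maybe_contains(seq + 1, &x)); /* 0 */
	return 0;
}
```

Flipping to, and clearing, the other filter on each new generation is what lets non-leaf entries that no longer have enough leaf entries age out after one generation instead of accumulating false positives forever.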
3131 * refault feedback loop 3132 ******************************************************************************/ 3133 3134/* 3135 * A feedback loop based on Proportional-Integral-Derivative (PID) controller. 3136 * 3137 * The P term is refaulted/(evicted+protected) from a tier in the generation 3138 * currently being evicted; the I term is the exponential moving average of the --- 133 unchanged lines hidden (view full) --- 3272 new_flags |= BIT(PG_reclaim); 3273 } while (!try_cmpxchg(&folio->flags, &old_flags, new_flags)); 3274 3275 lru_gen_update_size(lruvec, folio, old_gen, new_gen); 3276 3277 return new_gen; 3278} 3279 | 3498 * refault feedback loop 3499 ******************************************************************************/ 3500 3501/* 3502 * A feedback loop based on Proportional-Integral-Derivative (PID) controller. 3503 * 3504 * The P term is refaulted/(evicted+protected) from a tier in the generation 3505 * currently being evicted; the I term is the exponential moving average of the --- 133 unchanged lines hidden (view full) --- 3639 new_flags |= BIT(PG_reclaim); 3640 } while (!try_cmpxchg(&folio->flags, &old_flags, new_flags)); 3641 3642 lru_gen_update_size(lruvec, folio, old_gen, new_gen); 3643 3644 return new_gen; 3645} 3646 |
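The PID-controller comment above defines the P term as refaulted/(evicted+protected) for the tier being evicted and an exponential moving average for the smoothed term (the rest of that comment is elided by the diff viewer). The toy below only illustrates those two ingredients; the struct name, the 7/8 weighting, and the +1 guards against empty tiers are placeholders, not the kernel's actual control logic.

```c
/*
 * Toy illustration only: the proportional term, refaulted / (evicted +
 * protected), compared between two positions by cross-multiplication
 * (no division), plus a simple exponential moving average.
 */
#include <stdio.h>

struct ctrl_pos_demo {
	unsigned long refaulted;
	unsigned long total;		/* evicted + protected */
};

/* true if a refaults at a higher rate than b: a.r/a.t > b.r/b.t */
static int refaults_more(const struct ctrl_pos_demo *a,
			 const struct ctrl_pos_demo *b)
{
	return a->refaulted * (b->total + 1) > b->refaulted * (a->total + 1);
}

/* fold a new sample into a running average: avg = (avg * 7 + sample) / 8 */
static unsigned long ema(unsigned long avg, unsigned long sample)
{
	return (avg * 7 + sample) / 8;
}

int main(void)
{
	struct ctrl_pos_demo hot  = { .refaulted = 50, .total = 200 };	/* 25% */
	struct ctrl_pos_demo cold = { .refaulted = 10, .total = 400 };	/* 2.5% */
	unsigned long avg = 0;
	int i;

	printf("hot refaults more than cold: %d\n", refaults_more(&hot, &cold));

	for (i = 0; i < 5; i++) {
		avg = ema(avg, hot.refaulted);
		printf("smoothed refaults after sample %d: %lu\n", i, avg);
	}
	return 0;
}
```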
3647static void update_batch_size(struct lru_gen_mm_walk *walk, struct folio *folio, 3648 int old_gen, int new_gen) 3649{ 3650 int type = folio_is_file_lru(folio); 3651 int zone = folio_zonenum(folio); 3652 int delta = folio_nr_pages(folio); 3653 3654 VM_WARN_ON_ONCE(old_gen >= MAX_NR_GENS); 3655 VM_WARN_ON_ONCE(new_gen >= MAX_NR_GENS); 3656 3657 walk->batched++; 3658 3659 walk->nr_pages[old_gen][type][zone] -= delta; 3660 walk->nr_pages[new_gen][type][zone] += delta; 3661} 3662 3663static void reset_batch_size(struct lruvec *lruvec, struct lru_gen_mm_walk *walk) 3664{ 3665 int gen, type, zone; 3666 struct lru_gen_struct *lrugen = &lruvec->lrugen; 3667 3668 walk->batched = 0; 3669 3670 for_each_gen_type_zone(gen, type, zone) { 3671 enum lru_list lru = type * LRU_INACTIVE_FILE; 3672 int delta = walk->nr_pages[gen][type][zone]; 3673 3674 if (!delta) 3675 continue; 3676 3677 walk->nr_pages[gen][type][zone] = 0; 3678 WRITE_ONCE(lrugen->nr_pages[gen][type][zone], 3679 lrugen->nr_pages[gen][type][zone] + delta); 3680 3681 if (lru_gen_is_active(lruvec, gen)) 3682 lru += LRU_ACTIVE; 3683 __update_lru_size(lruvec, lru, zone, delta); 3684 } 3685} 3686 3687static int should_skip_vma(unsigned long start, unsigned long end, struct mm_walk *args) 3688{ 3689 struct address_space *mapping; 3690 struct vm_area_struct *vma = args->vma; 3691 struct lru_gen_mm_walk *walk = args->private; 3692 3693 if (!vma_is_accessible(vma)) 3694 return true; 3695 3696 if (is_vm_hugetlb_page(vma)) 3697 return true; 3698 3699 if (vma->vm_flags & (VM_LOCKED | VM_SPECIAL | VM_SEQ_READ | VM_RAND_READ)) 3700 return true; 3701 3702 if (vma == get_gate_vma(vma->vm_mm)) 3703 return true; 3704 3705 if (vma_is_anonymous(vma)) 3706 return !walk->can_swap; 3707 3708 if (WARN_ON_ONCE(!vma->vm_file || !vma->vm_file->f_mapping)) 3709 return true; 3710 3711 mapping = vma->vm_file->f_mapping; 3712 if (mapping_unevictable(mapping)) 3713 return true; 3714 3715 if (shmem_mapping(mapping)) 3716 return !walk->can_swap; 3717 3718 /* to exclude special mappings like dax, etc. */ 3719 return !mapping->a_ops->read_folio; 3720} 3721 3722/* 3723 * Some userspace memory allocators map many single-page VMAs. Instead of 3724 * returning back to the PGD table for each of such VMAs, finish an entire PMD 3725 * table to reduce zigzags and improve cache performance. 3726 */ 3727static bool get_next_vma(unsigned long mask, unsigned long size, struct mm_walk *args, 3728 unsigned long *vm_start, unsigned long *vm_end) 3729{ 3730 unsigned long start = round_up(*vm_end, size); 3731 unsigned long end = (start | ~mask) + 1; 3732 3733 VM_WARN_ON_ONCE(mask & size); 3734 VM_WARN_ON_ONCE((start & mask) != (*vm_start & mask)); 3735 3736 while (args->vma) { 3737 if (start >= args->vma->vm_end) { 3738 args->vma = args->vma->vm_next; 3739 continue; 3740 } 3741 3742 if (end && end <= args->vma->vm_start) 3743 return false; 3744 3745 if (should_skip_vma(args->vma->vm_start, args->vma->vm_end, args)) { 3746 args->vma = args->vma->vm_next; 3747 continue; 3748 } 3749 3750 *vm_start = max(start, args->vma->vm_start); 3751 *vm_end = min(end - 1, args->vma->vm_end - 1) + 1; 3752 3753 return true; 3754 } 3755 3756 return false; 3757} 3758 |
|
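get_next_vma() above relies on a small piece of address arithmetic: resume at the next size-aligned address after the previous chunk and stop at the end of the enclosing block described by mask (a PMD for the PTE walk, a PUD for the PMD walk), which is how many single-page VMAs get finished in one pass without bouncing back up the page tables. A userspace demo of that arithmetic only, with 4 KiB pages and 2 MiB PMDs assumed for illustration:

```c
/*
 * Userspace demo; round_up_to() mirrors the kernel's round_up() for
 * power-of-two sizes.
 */
#include <stdio.h>

#define DEMO_PAGE_SIZE	0x1000UL
#define DEMO_PMD_SIZE	0x200000UL
#define DEMO_PMD_MASK	(~(DEMO_PMD_SIZE - 1))

static unsigned long round_up_to(unsigned long x, unsigned long size)
{
	return (x + size - 1) & ~(size - 1);
}

int main(void)
{
	unsigned long vm_end = 0x12345678UL;	/* where the previous chunk ended */
	unsigned long start = round_up_to(vm_end, DEMO_PAGE_SIZE);
	unsigned long end = (start | ~DEMO_PMD_MASK) + 1;  /* end of this PMD block */

	printf("resume at %#lx, stop at %#lx\n", start, end);
	/* resume at 0x12346000, stop at 0x12400000 (the next PMD boundary) */
	return 0;
}
```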
3280static unsigned long get_pte_pfn(pte_t pte, struct vm_area_struct *vma, unsigned long addr) 3281{ 3282 unsigned long pfn = pte_pfn(pte); 3283 3284 VM_WARN_ON_ONCE(addr < vma->vm_start || addr >= vma->vm_end); 3285 3286 if (!pte_present(pte) || is_zero_pfn(pfn)) 3287 return -1; 3288 3289 if (WARN_ON_ONCE(pte_devmap(pte) || pte_special(pte))) 3290 return -1; 3291 3292 if (WARN_ON_ONCE(!pfn_valid(pfn))) 3293 return -1; 3294 3295 return pfn; 3296} 3297 | 3759static unsigned long get_pte_pfn(pte_t pte, struct vm_area_struct *vma, unsigned long addr) 3760{ 3761 unsigned long pfn = pte_pfn(pte); 3762 3763 VM_WARN_ON_ONCE(addr < vma->vm_start || addr >= vma->vm_end); 3764 3765 if (!pte_present(pte) || is_zero_pfn(pfn)) 3766 return -1; 3767 3768 if (WARN_ON_ONCE(pte_devmap(pte) || pte_special(pte))) 3769 return -1; 3770 3771 if (WARN_ON_ONCE(!pfn_valid(pfn))) 3772 return -1; 3773 3774 return pfn; 3775} 3776 |
3777#if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG) 3778static unsigned long get_pmd_pfn(pmd_t pmd, struct vm_area_struct *vma, unsigned long addr) 3779{ 3780 unsigned long pfn = pmd_pfn(pmd); 3781 3782 VM_WARN_ON_ONCE(addr < vma->vm_start || addr >= vma->vm_end); 3783 3784 if (!pmd_present(pmd) || is_huge_zero_pmd(pmd)) 3785 return -1; 3786 3787 if (WARN_ON_ONCE(pmd_devmap(pmd))) 3788 return -1; 3789 3790 if (WARN_ON_ONCE(!pfn_valid(pfn))) 3791 return -1; 3792 3793 return pfn; 3794} 3795#endif 3796 |
|
3298static struct folio *get_pfn_folio(unsigned long pfn, struct mem_cgroup *memcg, | 3797static struct folio *get_pfn_folio(unsigned long pfn, struct mem_cgroup *memcg, |
3299 struct pglist_data *pgdat) | 3798 struct pglist_data *pgdat, bool can_swap) |
3300{ 3301 struct folio *folio; 3302 3303 /* try to avoid unnecessary memory loads */ 3304 if (pfn < pgdat->node_start_pfn || pfn >= pgdat_end_pfn(pgdat)) 3305 return NULL; 3306 3307 folio = pfn_folio(pfn); 3308 if (folio_nid(folio) != pgdat->node_id) 3309 return NULL; 3310 3311 if (folio_memcg_rcu(folio) != memcg) 3312 return NULL; 3313 | 3799{ 3800 struct folio *folio; 3801 3802 /* try to avoid unnecessary memory loads */ 3803 if (pfn < pgdat->node_start_pfn || pfn >= pgdat_end_pfn(pgdat)) 3804 return NULL; 3805 3806 folio = pfn_folio(pfn); 3807 if (folio_nid(folio) != pgdat->node_id) 3808 return NULL; 3809 3810 if (folio_memcg_rcu(folio) != memcg) 3811 return NULL; 3812 |
3813 /* file VMAs can contain anon pages from COW */ 3814 if (!folio_is_file_lru(folio) && !can_swap) 3815 return NULL; 3816 |
|
3314 return folio; 3315} 3316 | 3817 return folio; 3818} 3819 |
3820static bool suitable_to_scan(int total, int young) 3821{ 3822 int n = clamp_t(int, cache_line_size() / sizeof(pte_t), 2, 8); 3823 3824 /* suitable if the average number of young PTEs per cacheline is >=1 */ 3825 return young * n >= total; 3826} 3827 3828static bool walk_pte_range(pmd_t *pmd, unsigned long start, unsigned long end, 3829 struct mm_walk *args) 3830{ 3831 int i; 3832 pte_t *pte; 3833 spinlock_t *ptl; 3834 unsigned long addr; 3835 int total = 0; 3836 int young = 0; 3837 struct lru_gen_mm_walk *walk = args->private; 3838 struct mem_cgroup *memcg = lruvec_memcg(walk->lruvec); 3839 struct pglist_data *pgdat = lruvec_pgdat(walk->lruvec); 3840 int old_gen, new_gen = lru_gen_from_seq(walk->max_seq); 3841 3842 VM_WARN_ON_ONCE(pmd_leaf(*pmd)); 3843 3844 ptl = pte_lockptr(args->mm, pmd); 3845 if (!spin_trylock(ptl)) 3846 return false; 3847 3848 arch_enter_lazy_mmu_mode(); 3849 3850 pte = pte_offset_map(pmd, start & PMD_MASK); 3851restart: 3852 for (i = pte_index(start), addr = start; addr != end; i++, addr += PAGE_SIZE) { 3853 unsigned long pfn; 3854 struct folio *folio; 3855 3856 total++; 3857 walk->mm_stats[MM_LEAF_TOTAL]++; 3858 3859 pfn = get_pte_pfn(pte[i], args->vma, addr); 3860 if (pfn == -1) 3861 continue; 3862 3863 if (!pte_young(pte[i])) { 3864 walk->mm_stats[MM_LEAF_OLD]++; 3865 continue; 3866 } 3867 3868 folio = get_pfn_folio(pfn, memcg, pgdat, walk->can_swap); 3869 if (!folio) 3870 continue; 3871 3872 if (!ptep_test_and_clear_young(args->vma, addr, pte + i)) 3873 VM_WARN_ON_ONCE(true); 3874 3875 young++; 3876 walk->mm_stats[MM_LEAF_YOUNG]++; 3877 3878 if (pte_dirty(pte[i]) && !folio_test_dirty(folio) && 3879 !(folio_test_anon(folio) && folio_test_swapbacked(folio) && 3880 !folio_test_swapcache(folio))) 3881 folio_mark_dirty(folio); 3882 3883 old_gen = folio_update_gen(folio, new_gen); 3884 if (old_gen >= 0 && old_gen != new_gen) 3885 update_batch_size(walk, folio, old_gen, new_gen); 3886 } 3887 3888 if (i < PTRS_PER_PTE && get_next_vma(PMD_MASK, PAGE_SIZE, args, &start, &end)) 3889 goto restart; 3890 3891 pte_unmap(pte); 3892 3893 arch_leave_lazy_mmu_mode(); 3894 spin_unlock(ptl); 3895 3896 return suitable_to_scan(total, young); 3897} 3898 3899#if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG) 3900static void walk_pmd_range_locked(pud_t *pud, unsigned long next, struct vm_area_struct *vma, 3901 struct mm_walk *args, unsigned long *bitmap, unsigned long *start) 3902{ 3903 int i; 3904 pmd_t *pmd; 3905 spinlock_t *ptl; 3906 struct lru_gen_mm_walk *walk = args->private; 3907 struct mem_cgroup *memcg = lruvec_memcg(walk->lruvec); 3908 struct pglist_data *pgdat = lruvec_pgdat(walk->lruvec); 3909 int old_gen, new_gen = lru_gen_from_seq(walk->max_seq); 3910 3911 VM_WARN_ON_ONCE(pud_leaf(*pud)); 3912 3913 /* try to batch at most 1+MIN_LRU_BATCH+1 entries */ 3914 if (*start == -1) { 3915 *start = next; 3916 return; 3917 } 3918 3919 i = next == -1 ? 0 : pmd_index(next) - pmd_index(*start); 3920 if (i && i <= MIN_LRU_BATCH) { 3921 __set_bit(i - 1, bitmap); 3922 return; 3923 } 3924 3925 pmd = pmd_offset(pud, *start); 3926 3927 ptl = pmd_lockptr(args->mm, pmd); 3928 if (!spin_trylock(ptl)) 3929 goto done; 3930 3931 arch_enter_lazy_mmu_mode(); 3932 3933 do { 3934 unsigned long pfn; 3935 struct folio *folio; 3936 unsigned long addr = i ? 
(*start & PMD_MASK) + i * PMD_SIZE : *start; 3937 3938 pfn = get_pmd_pfn(pmd[i], vma, addr); 3939 if (pfn == -1) 3940 goto next; 3941 3942 if (!pmd_trans_huge(pmd[i])) { 3943 if (IS_ENABLED(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG)) 3944 pmdp_test_and_clear_young(vma, addr, pmd + i); 3945 goto next; 3946 } 3947 3948 folio = get_pfn_folio(pfn, memcg, pgdat, walk->can_swap); 3949 if (!folio) 3950 goto next; 3951 3952 if (!pmdp_test_and_clear_young(vma, addr, pmd + i)) 3953 goto next; 3954 3955 walk->mm_stats[MM_LEAF_YOUNG]++; 3956 3957 if (pmd_dirty(pmd[i]) && !folio_test_dirty(folio) && 3958 !(folio_test_anon(folio) && folio_test_swapbacked(folio) && 3959 !folio_test_swapcache(folio))) 3960 folio_mark_dirty(folio); 3961 3962 old_gen = folio_update_gen(folio, new_gen); 3963 if (old_gen >= 0 && old_gen != new_gen) 3964 update_batch_size(walk, folio, old_gen, new_gen); 3965next: 3966 i = i > MIN_LRU_BATCH ? 0 : find_next_bit(bitmap, MIN_LRU_BATCH, i) + 1; 3967 } while (i <= MIN_LRU_BATCH); 3968 3969 arch_leave_lazy_mmu_mode(); 3970 spin_unlock(ptl); 3971done: 3972 *start = -1; 3973 bitmap_zero(bitmap, MIN_LRU_BATCH); 3974} 3975#else 3976static void walk_pmd_range_locked(pud_t *pud, unsigned long next, struct vm_area_struct *vma, 3977 struct mm_walk *args, unsigned long *bitmap, unsigned long *start) 3978{ 3979} 3980#endif 3981 3982static void walk_pmd_range(pud_t *pud, unsigned long start, unsigned long end, 3983 struct mm_walk *args) 3984{ 3985 int i; 3986 pmd_t *pmd; 3987 unsigned long next; 3988 unsigned long addr; 3989 struct vm_area_struct *vma; 3990 unsigned long pos = -1; 3991 struct lru_gen_mm_walk *walk = args->private; 3992 unsigned long bitmap[BITS_TO_LONGS(MIN_LRU_BATCH)] = {}; 3993 3994 VM_WARN_ON_ONCE(pud_leaf(*pud)); 3995 3996 /* 3997 * Finish an entire PMD in two passes: the first only reaches to PTE 3998 * tables to avoid taking the PMD lock; the second, if necessary, takes 3999 * the PMD lock to clear the accessed bit in PMD entries. 
4000 */ 4001 pmd = pmd_offset(pud, start & PUD_MASK); 4002restart: 4003 /* walk_pte_range() may call get_next_vma() */ 4004 vma = args->vma; 4005 for (i = pmd_index(start), addr = start; addr != end; i++, addr = next) { 4006 pmd_t val = pmd_read_atomic(pmd + i); 4007 4008 /* for pmd_read_atomic() */ 4009 barrier(); 4010 4011 next = pmd_addr_end(addr, end); 4012 4013 if (!pmd_present(val) || is_huge_zero_pmd(val)) { 4014 walk->mm_stats[MM_LEAF_TOTAL]++; 4015 continue; 4016 } 4017 4018#ifdef CONFIG_TRANSPARENT_HUGEPAGE 4019 if (pmd_trans_huge(val)) { 4020 unsigned long pfn = pmd_pfn(val); 4021 struct pglist_data *pgdat = lruvec_pgdat(walk->lruvec); 4022 4023 walk->mm_stats[MM_LEAF_TOTAL]++; 4024 4025 if (!pmd_young(val)) { 4026 walk->mm_stats[MM_LEAF_OLD]++; 4027 continue; 4028 } 4029 4030 /* try to avoid unnecessary memory loads */ 4031 if (pfn < pgdat->node_start_pfn || pfn >= pgdat_end_pfn(pgdat)) 4032 continue; 4033 4034 walk_pmd_range_locked(pud, addr, vma, args, bitmap, &pos); 4035 continue; 4036 } 4037#endif 4038 walk->mm_stats[MM_NONLEAF_TOTAL]++; 4039 4040#ifdef CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG 4041 if (!pmd_young(val)) 4042 continue; 4043 4044 walk_pmd_range_locked(pud, addr, vma, args, bitmap, &pos); 4045#endif 4046 if (!walk->force_scan && !test_bloom_filter(walk->lruvec, walk->max_seq, pmd + i)) 4047 continue; 4048 4049 walk->mm_stats[MM_NONLEAF_FOUND]++; 4050 4051 if (!walk_pte_range(&val, addr, next, args)) 4052 continue; 4053 4054 walk->mm_stats[MM_NONLEAF_ADDED]++; 4055 4056 /* carry over to the next generation */ 4057 update_bloom_filter(walk->lruvec, walk->max_seq + 1, pmd + i); 4058 } 4059 4060 walk_pmd_range_locked(pud, -1, vma, args, bitmap, &pos); 4061 4062 if (i < PTRS_PER_PMD && get_next_vma(PUD_MASK, PMD_SIZE, args, &start, &end)) 4063 goto restart; 4064} 4065 4066static int walk_pud_range(p4d_t *p4d, unsigned long start, unsigned long end, 4067 struct mm_walk *args) 4068{ 4069 int i; 4070 pud_t *pud; 4071 unsigned long addr; 4072 unsigned long next; 4073 struct lru_gen_mm_walk *walk = args->private; 4074 4075 VM_WARN_ON_ONCE(p4d_leaf(*p4d)); 4076 4077 pud = pud_offset(p4d, start & P4D_MASK); 4078restart: 4079 for (i = pud_index(start), addr = start; addr != end; i++, addr = next) { 4080 pud_t val = READ_ONCE(pud[i]); 4081 4082 next = pud_addr_end(addr, end); 4083 4084 if (!pud_present(val) || WARN_ON_ONCE(pud_leaf(val))) 4085 continue; 4086 4087 walk_pmd_range(&val, addr, next, args); 4088 4089 /* a racy check to curtail the waiting time */ 4090 if (wq_has_sleeper(&walk->lruvec->mm_state.wait)) 4091 return 1; 4092 4093 if (need_resched() || walk->batched >= MAX_LRU_BATCH) { 4094 end = (addr | ~PUD_MASK) + 1; 4095 goto done; 4096 } 4097 } 4098 4099 if (i < PTRS_PER_PUD && get_next_vma(P4D_MASK, PUD_SIZE, args, &start, &end)) 4100 goto restart; 4101 4102 end = round_up(end, P4D_SIZE); 4103done: 4104 if (!end || !args->vma) 4105 return 1; 4106 4107 walk->next_addr = max(end, args->vma->vm_start); 4108 4109 return -EAGAIN; 4110} 4111 4112static void walk_mm(struct lruvec *lruvec, struct mm_struct *mm, struct lru_gen_mm_walk *walk) 4113{ 4114 static const struct mm_walk_ops mm_walk_ops = { 4115 .test_walk = should_skip_vma, 4116 .p4d_entry = walk_pud_range, 4117 }; 4118 4119 int err; 4120 struct mem_cgroup *memcg = lruvec_memcg(lruvec); 4121 4122 walk->next_addr = FIRST_USER_ADDRESS; 4123 4124 do { 4125 err = -EBUSY; 4126 4127 /* folio_update_gen() requires stable folio_memcg() */ 4128 if (!mem_cgroup_trylock_pages(memcg)) 4129 break; 4130 4131 /* the caller might be 
holding the lock for write */ 4132 if (mmap_read_trylock(mm)) { 4133 err = walk_page_range(mm, walk->next_addr, ULONG_MAX, &mm_walk_ops, walk); 4134 4135 mmap_read_unlock(mm); 4136 } 4137 4138 mem_cgroup_unlock_pages(); 4139 4140 if (walk->batched) { 4141 spin_lock_irq(&lruvec->lru_lock); 4142 reset_batch_size(lruvec, walk); 4143 spin_unlock_irq(&lruvec->lru_lock); 4144 } 4145 4146 cond_resched(); 4147 } while (err == -EAGAIN); 4148} 4149 4150static struct lru_gen_mm_walk *set_mm_walk(struct pglist_data *pgdat) 4151{ 4152 struct lru_gen_mm_walk *walk = current->reclaim_state->mm_walk; 4153 4154 if (pgdat && current_is_kswapd()) { 4155 VM_WARN_ON_ONCE(walk); 4156 4157 walk = &pgdat->mm_walk; 4158 } else if (!pgdat && !walk) { 4159 VM_WARN_ON_ONCE(current_is_kswapd()); 4160 4161 walk = kzalloc(sizeof(*walk), __GFP_HIGH | __GFP_NOMEMALLOC | __GFP_NOWARN); 4162 } 4163 4164 current->reclaim_state->mm_walk = walk; 4165 4166 return walk; 4167} 4168 4169static void clear_mm_walk(void) 4170{ 4171 struct lru_gen_mm_walk *walk = current->reclaim_state->mm_walk; 4172 4173 VM_WARN_ON_ONCE(walk && memchr_inv(walk->nr_pages, 0, sizeof(walk->nr_pages))); 4174 VM_WARN_ON_ONCE(walk && memchr_inv(walk->mm_stats, 0, sizeof(walk->mm_stats))); 4175 4176 current->reclaim_state->mm_walk = NULL; 4177 4178 if (!current_is_kswapd()) 4179 kfree(walk); 4180} 4181 |
|
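suitable_to_scan() above keys its decision to cache-line granularity: n is cache_line_size()/sizeof(pte_t) clamped to [2, 8], and a PTE table is only worth adding to the Bloom filter if, on average, at least one PTE per cache line was found young. A worked example under the common assumption of 64-byte cache lines and 8-byte PTEs (those values are assumptions for illustration, not taken from the source):

```c
/* Worked example; 64-byte cache lines and 8-byte PTEs are assumed here. */
#include <stdio.h>

static int clamp_int(int v, int lo, int hi)
{
	return v < lo ? lo : v > hi ? hi : v;
}

static int suitable_to_scan_demo(int total, int young)
{
	int n = clamp_int(64 / 8, 2, 8);	/* PTEs per cache line */

	/* suitable if the average number of young PTEs per cache line is >= 1 */
	return young * n >= total;
}

int main(void)
{
	printf("512 PTEs, 64 young: %d\n", suitable_to_scan_demo(512, 64));	/* 1 */
	printf("512 PTEs, 63 young: %d\n", suitable_to_scan_demo(512, 63));	/* 0 */
	return 0;
}
```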
3317static void inc_min_seq(struct lruvec *lruvec, int type) 3318{ 3319 struct lru_gen_struct *lrugen = &lruvec->lrugen; 3320 3321 reset_ctrl_pos(lruvec, type, true); 3322 WRITE_ONCE(lrugen->min_seq[type], lrugen->min_seq[type] + 1); 3323} 3324 --- 35 unchanged lines hidden (view full) --- 3360 reset_ctrl_pos(lruvec, type, true); 3361 WRITE_ONCE(lrugen->min_seq[type], min_seq[type]); 3362 success = true; 3363 } 3364 3365 return success; 3366} 3367 | 4182static void inc_min_seq(struct lruvec *lruvec, int type) 4183{ 4184 struct lru_gen_struct *lrugen = &lruvec->lrugen; 4185 4186 reset_ctrl_pos(lruvec, type, true); 4187 WRITE_ONCE(lrugen->min_seq[type], lrugen->min_seq[type] + 1); 4188} 4189 --- 35 unchanged lines hidden (view full) --- 4225 reset_ctrl_pos(lruvec, type, true); 4226 WRITE_ONCE(lrugen->min_seq[type], min_seq[type]); 4227 success = true; 4228 } 4229 4230 return success; 4231} 4232 |
3368static void inc_max_seq(struct lruvec *lruvec, unsigned long max_seq, bool can_swap) | 4233static void inc_max_seq(struct lruvec *lruvec, bool can_swap) |
3369{ 3370 int prev, next; 3371 int type, zone; 3372 struct lru_gen_struct *lrugen = &lruvec->lrugen; 3373 3374 spin_lock_irq(&lruvec->lru_lock); 3375 3376 VM_WARN_ON_ONCE(!seq_is_valid(lruvec)); 3377 | 4234{ 4235 int prev, next; 4236 int type, zone; 4237 struct lru_gen_struct *lrugen = &lruvec->lrugen; 4238 4239 spin_lock_irq(&lruvec->lru_lock); 4240 4241 VM_WARN_ON_ONCE(!seq_is_valid(lruvec)); 4242 |
3378 if (max_seq != lrugen->max_seq) 3379 goto unlock; 3380 | |
3381 for (type = ANON_AND_FILE - 1; type >= 0; type--) { 3382 if (get_nr_gens(lruvec, type) != MAX_NR_GENS) 3383 continue; 3384 3385 VM_WARN_ON_ONCE(type == LRU_GEN_FILE || can_swap); 3386 3387 inc_min_seq(lruvec, type); 3388 } --- 21 unchanged lines hidden (view full) --- 3410 } 3411 } 3412 3413 for (type = 0; type < ANON_AND_FILE; type++) 3414 reset_ctrl_pos(lruvec, type, false); 3415 3416 /* make sure preceding modifications appear */ 3417 smp_store_release(&lrugen->max_seq, lrugen->max_seq + 1); | 4243 for (type = ANON_AND_FILE - 1; type >= 0; type--) { 4244 if (get_nr_gens(lruvec, type) != MAX_NR_GENS) 4245 continue; 4246 4247 VM_WARN_ON_ONCE(type == LRU_GEN_FILE || can_swap); 4248 4249 inc_min_seq(lruvec, type); 4250 } --- 21 unchanged lines hidden (view full) --- 4272 } 4273 } 4274 4275 for (type = 0; type < ANON_AND_FILE; type++) 4276 reset_ctrl_pos(lruvec, type, false); 4277 4278 /* make sure preceding modifications appear */ 4279 smp_store_release(&lrugen->max_seq, lrugen->max_seq + 1); |
3418unlock: | 4280 |
3419 spin_unlock_irq(&lruvec->lru_lock); 3420} 3421 | 4281 spin_unlock_irq(&lruvec->lru_lock); 4282} 4283 |
4284static bool try_to_inc_max_seq(struct lruvec *lruvec, unsigned long max_seq, 4285 struct scan_control *sc, bool can_swap) 4286{ 4287 bool success; 4288 struct lru_gen_mm_walk *walk; 4289 struct mm_struct *mm = NULL; 4290 struct lru_gen_struct *lrugen = &lruvec->lrugen; 4291 4292 VM_WARN_ON_ONCE(max_seq > READ_ONCE(lrugen->max_seq)); 4293 4294 /* see the comment in iterate_mm_list() */ 4295 if (max_seq <= READ_ONCE(lruvec->mm_state.seq)) { 4296 success = false; 4297 goto done; 4298 } 4299 4300 /* 4301 * If the hardware doesn't automatically set the accessed bit, fallback 4302 * to lru_gen_look_around(), which only clears the accessed bit in a 4303 * handful of PTEs. Spreading the work out over a period of time usually 4304 * is less efficient, but it avoids bursty page faults. 4305 */ 4306 if (!arch_has_hw_pte_young()) { 4307 success = iterate_mm_list_nowalk(lruvec, max_seq); 4308 goto done; 4309 } 4310 4311 walk = set_mm_walk(NULL); 4312 if (!walk) { 4313 success = iterate_mm_list_nowalk(lruvec, max_seq); 4314 goto done; 4315 } 4316 4317 walk->lruvec = lruvec; 4318 walk->max_seq = max_seq; 4319 walk->can_swap = can_swap; 4320 walk->force_scan = false; 4321 4322 do { 4323 success = iterate_mm_list(lruvec, walk, &mm); 4324 if (mm) 4325 walk_mm(lruvec, mm, walk); 4326 4327 cond_resched(); 4328 } while (mm); 4329done: 4330 if (!success) { 4331 if (sc->priority <= DEF_PRIORITY - 2) 4332 wait_event_killable(lruvec->mm_state.wait, 4333 max_seq < READ_ONCE(lrugen->max_seq)); 4334 4335 return max_seq < READ_ONCE(lrugen->max_seq); 4336 } 4337 4338 VM_WARN_ON_ONCE(max_seq != READ_ONCE(lrugen->max_seq)); 4339 4340 inc_max_seq(lruvec, can_swap); 4341 /* either this sees any waiters or they will see updated max_seq */ 4342 if (wq_has_sleeper(&lruvec->mm_state.wait)) 4343 wake_up_all(&lruvec->mm_state.wait); 4344 4345 wakeup_flusher_threads(WB_REASON_VMSCAN); 4346 4347 return true; 4348} 4349 |
|
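inc_max_seq() above publishes a new generation with smp_store_release() so that the "preceding modifications appear" before the bumped max_seq. A minimal C11 sketch of that publication pattern, with hypothetical names (compile with -pthread); the reader shown here is the generic acquire pairing, not the kernel's exact read path:

```c
/* Pattern sketch only: publish state, then release-store the sequence. */
#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

static int per_gen_state;		/* stands in for the per-generation counters */
static atomic_ulong max_seq = 4;	/* stands in for lrugen->max_seq */

static void *writer(void *arg)
{
	per_gen_state = 42;		/* "preceding modifications" */
	/* release store: everything written above is ordered before the bump */
	atomic_store_explicit(&max_seq, 5, memory_order_release);
	return arg;
}

static void *reader(void *arg)
{
	/* an acquire load that observes 5 is guaranteed to observe 42 too */
	if (atomic_load_explicit(&max_seq, memory_order_acquire) == 5)
		printf("per_gen_state = %d\n", per_gen_state);
	return arg;
}

int main(void)
{
	pthread_t w, r;

	pthread_create(&w, NULL, writer, NULL);
	pthread_create(&r, NULL, reader, NULL);
	pthread_join(w, NULL);
	pthread_join(r, NULL);
	return 0;
}
```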
3422static bool should_run_aging(struct lruvec *lruvec, unsigned long max_seq, unsigned long *min_seq, 3423 struct scan_control *sc, bool can_swap, unsigned long *nr_to_scan) 3424{ 3425 int gen, type, zone; 3426 unsigned long old = 0; 3427 unsigned long young = 0; 3428 unsigned long total = 0; 3429 struct lru_gen_struct *lrugen = &lruvec->lrugen; --- 59 unchanged lines hidden (view full) --- 3489 3490 mem_cgroup_calculate_protection(NULL, memcg); 3491 3492 if (mem_cgroup_below_min(memcg)) 3493 return; 3494 3495 need_aging = should_run_aging(lruvec, max_seq, min_seq, sc, swappiness, &nr_to_scan); 3496 if (need_aging) | 4350static bool should_run_aging(struct lruvec *lruvec, unsigned long max_seq, unsigned long *min_seq, 4351 struct scan_control *sc, bool can_swap, unsigned long *nr_to_scan) 4352{ 4353 int gen, type, zone; 4354 unsigned long old = 0; 4355 unsigned long young = 0; 4356 unsigned long total = 0; 4357 struct lru_gen_struct *lrugen = &lruvec->lrugen; --- 59 unchanged lines hidden (view full) --- 4417 4418 mem_cgroup_calculate_protection(NULL, memcg); 4419 4420 if (mem_cgroup_below_min(memcg)) 4421 return; 4422 4423 need_aging = should_run_aging(lruvec, max_seq, min_seq, sc, swappiness, &nr_to_scan); 4424 if (need_aging) |
3497 inc_max_seq(lruvec, max_seq, swappiness); | 4425 try_to_inc_max_seq(lruvec, max_seq, sc, swappiness); |
3498} 3499 3500static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc) 3501{ 3502 struct mem_cgroup *memcg; 3503 3504 VM_WARN_ON_ONCE(!current_is_kswapd()); 3505 | 4426} 4427 4428static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc) 4429{ 4430 struct mem_cgroup *memcg; 4431 4432 VM_WARN_ON_ONCE(!current_is_kswapd()); 4433 |
4434 set_mm_walk(pgdat); 4435 |
|
3506 memcg = mem_cgroup_iter(NULL, NULL, NULL); 3507 do { 3508 struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat); 3509 3510 age_lruvec(lruvec, sc); 3511 3512 cond_resched(); 3513 } while ((memcg = mem_cgroup_iter(NULL, memcg, NULL))); | 4436 memcg = mem_cgroup_iter(NULL, NULL, NULL); 4437 do { 4438 struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat); 4439 4440 age_lruvec(lruvec, sc); 4441 4442 cond_resched(); 4443 } while ((memcg = mem_cgroup_iter(NULL, memcg, NULL))); |
4444 4445 clear_mm_walk(); |
|
3514} 3515 3516/* 3517 * This function exploits spatial locality when shrink_page_list() walks the | 4446} 4447 4448/* 4449 * This function exploits spatial locality when shrink_page_list() walks the |
3518 * rmap. It scans the adjacent PTEs of a young PTE and promotes hot pages. | 4450 * rmap. It scans the adjacent PTEs of a young PTE and promotes hot pages. If 4451 * the scan was done cacheline efficiently, it adds the PMD entry pointing to 4452 * the PTE table to the Bloom filter. This forms a feedback loop between the 4453 * eviction and the aging. |
3519 */ 3520void lru_gen_look_around(struct page_vma_mapped_walk *pvmw) 3521{ 3522 int i; 3523 pte_t *pte; 3524 unsigned long start; 3525 unsigned long end; 3526 unsigned long addr; | 4454 */ 4455void lru_gen_look_around(struct page_vma_mapped_walk *pvmw) 4456{ 4457 int i; 4458 pte_t *pte; 4459 unsigned long start; 4460 unsigned long end; 4461 unsigned long addr; |
4462 struct lru_gen_mm_walk *walk; 4463 int young = 0; |
|
3527 unsigned long bitmap[BITS_TO_LONGS(MIN_LRU_BATCH)] = {}; 3528 struct folio *folio = pfn_folio(pvmw->pfn); 3529 struct mem_cgroup *memcg = folio_memcg(folio); 3530 struct pglist_data *pgdat = folio_pgdat(folio); 3531 struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat); 3532 DEFINE_MAX_SEQ(lruvec); 3533 int old_gen, new_gen = lru_gen_from_seq(max_seq); 3534 3535 lockdep_assert_held(pvmw->ptl); 3536 VM_WARN_ON_ONCE_FOLIO(folio_test_lru(folio), folio); 3537 3538 if (spin_is_contended(pvmw->ptl)) 3539 return; 3540 | 4464 unsigned long bitmap[BITS_TO_LONGS(MIN_LRU_BATCH)] = {}; 4465 struct folio *folio = pfn_folio(pvmw->pfn); 4466 struct mem_cgroup *memcg = folio_memcg(folio); 4467 struct pglist_data *pgdat = folio_pgdat(folio); 4468 struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat); 4469 DEFINE_MAX_SEQ(lruvec); 4470 int old_gen, new_gen = lru_gen_from_seq(max_seq); 4471 4472 lockdep_assert_held(pvmw->ptl); 4473 VM_WARN_ON_ONCE_FOLIO(folio_test_lru(folio), folio); 4474 4475 if (spin_is_contended(pvmw->ptl)) 4476 return; 4477 |
4478 /* avoid taking the LRU lock under the PTL when possible */ 4479 walk = current->reclaim_state ? current->reclaim_state->mm_walk : NULL; 4480 |
|
3541 start = max(pvmw->address & PMD_MASK, pvmw->vma->vm_start); 3542 end = min(pvmw->address | ~PMD_MASK, pvmw->vma->vm_end - 1) + 1; 3543 3544 if (end - start > MIN_LRU_BATCH * PAGE_SIZE) { 3545 if (pvmw->address - start < MIN_LRU_BATCH * PAGE_SIZE / 2) 3546 end = start + MIN_LRU_BATCH * PAGE_SIZE; 3547 else if (end - pvmw->address < MIN_LRU_BATCH * PAGE_SIZE / 2) 3548 start = end - MIN_LRU_BATCH * PAGE_SIZE; --- 13 unchanged lines hidden (view full) --- 3562 3563 pfn = get_pte_pfn(pte[i], pvmw->vma, addr); 3564 if (pfn == -1) 3565 continue; 3566 3567 if (!pte_young(pte[i])) 3568 continue; 3569 | 4481 start = max(pvmw->address & PMD_MASK, pvmw->vma->vm_start); 4482 end = min(pvmw->address | ~PMD_MASK, pvmw->vma->vm_end - 1) + 1; 4483 4484 if (end - start > MIN_LRU_BATCH * PAGE_SIZE) { 4485 if (pvmw->address - start < MIN_LRU_BATCH * PAGE_SIZE / 2) 4486 end = start + MIN_LRU_BATCH * PAGE_SIZE; 4487 else if (end - pvmw->address < MIN_LRU_BATCH * PAGE_SIZE / 2) 4488 start = end - MIN_LRU_BATCH * PAGE_SIZE; --- 13 unchanged lines hidden (view full) --- 4502 4503 pfn = get_pte_pfn(pte[i], pvmw->vma, addr); 4504 if (pfn == -1) 4505 continue; 4506 4507 if (!pte_young(pte[i])) 4508 continue; 4509 |
3570 folio = get_pfn_folio(pfn, memcg, pgdat); | 4510 folio = get_pfn_folio(pfn, memcg, pgdat, !walk || walk->can_swap); |
3571 if (!folio) 3572 continue; 3573 3574 if (!ptep_test_and_clear_young(pvmw->vma, addr, pte + i)) 3575 VM_WARN_ON_ONCE(true); 3576 | 4511 if (!folio) 4512 continue; 4513 4514 if (!ptep_test_and_clear_young(pvmw->vma, addr, pte + i)) 4515 VM_WARN_ON_ONCE(true); 4516 |
4517 young++; 4518 |
|
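The loop above operates on a window computed at the top of lru_gen_look_around(): clamp to the current PMD and to the VMA, then, if that range is still larger than one batch, keep a batch-sized window biased toward the faulting address. A toy demo of that clamping; MIN_LRU_BATCH = 64 and the centering fallback are assumptions made for the sake of the example:

```c
/* Toy demo of the look-around window arithmetic; constants are assumed. */
#include <stdio.h>

#define DEMO_PAGE_SIZE	0x1000UL
#define DEMO_PMD_SIZE	0x200000UL
#define DEMO_PMD_MASK	(~(DEMO_PMD_SIZE - 1))
#define DEMO_BATCH	(64UL * DEMO_PAGE_SIZE)		/* MIN_LRU_BATCH pages */

static unsigned long max_ul(unsigned long a, unsigned long b) { return a > b ? a : b; }
static unsigned long min_ul(unsigned long a, unsigned long b) { return a < b ? a : b; }

int main(void)
{
	unsigned long addr = 0x12484000UL;		/* the PTE found young */
	unsigned long vm_start = 0x12300000UL, vm_end = 0x12600000UL;

	/* clamp to the current PMD and to the VMA */
	unsigned long start = max_ul(addr & DEMO_PMD_MASK, vm_start);
	unsigned long end = min_ul(addr | ~DEMO_PMD_MASK, vm_end - 1) + 1;

	/* then keep at most one batch, biased toward the faulting address */
	if (end - start > DEMO_BATCH) {
		if (addr - start < DEMO_BATCH / 2)
			end = start + DEMO_BATCH;
		else if (end - addr < DEMO_BATCH / 2)
			start = end - DEMO_BATCH;
		else {
			start = addr - DEMO_BATCH / 2;
			end = addr + DEMO_BATCH / 2;
		}
	}

	printf("look around [%#lx, %#lx): %lu PTEs\n",
	       start, end, (end - start) / DEMO_PAGE_SIZE);
	/* look around [0x12464000, 0x124a4000): 64 PTEs */
	return 0;
}
```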
3577 if (pte_dirty(pte[i]) && !folio_test_dirty(folio) && 3578 !(folio_test_anon(folio) && folio_test_swapbacked(folio) && 3579 !folio_test_swapcache(folio))) 3580 folio_mark_dirty(folio); 3581 3582 old_gen = folio_lru_gen(folio); 3583 if (old_gen < 0) 3584 folio_set_referenced(folio); 3585 else if (old_gen != new_gen) 3586 __set_bit(i, bitmap); 3587 } 3588 3589 arch_leave_lazy_mmu_mode(); 3590 rcu_read_unlock(); 3591 | 4519 if (pte_dirty(pte[i]) && !folio_test_dirty(folio) && 4520 !(folio_test_anon(folio) && folio_test_swapbacked(folio) && 4521 !folio_test_swapcache(folio))) 4522 folio_mark_dirty(folio); 4523 4524 old_gen = folio_lru_gen(folio); 4525 if (old_gen < 0) 4526 folio_set_referenced(folio); 4527 else if (old_gen != new_gen) 4528 __set_bit(i, bitmap); 4529 } 4530 4531 arch_leave_lazy_mmu_mode(); 4532 rcu_read_unlock(); 4533 |
3592 if (bitmap_weight(bitmap, MIN_LRU_BATCH) < PAGEVEC_SIZE) { | 4534 /* feedback from rmap walkers to page table walkers */ 4535 if (suitable_to_scan(i, young)) 4536 update_bloom_filter(lruvec, max_seq, pvmw->pmd); 4537 4538 if (!walk && bitmap_weight(bitmap, MIN_LRU_BATCH) < PAGEVEC_SIZE) { |
3593 for_each_set_bit(i, bitmap, MIN_LRU_BATCH) { 3594 folio = pfn_folio(pte_pfn(pte[i])); 3595 folio_activate(folio); 3596 } 3597 return; 3598 } 3599 3600 /* folio_update_gen() requires stable folio_memcg() */ 3601 if (!mem_cgroup_trylock_pages(memcg)) 3602 return; 3603 | 4539 for_each_set_bit(i, bitmap, MIN_LRU_BATCH) { 4540 folio = pfn_folio(pte_pfn(pte[i])); 4541 folio_activate(folio); 4542 } 4543 return; 4544 } 4545 4546 /* folio_update_gen() requires stable folio_memcg() */ 4547 if (!mem_cgroup_trylock_pages(memcg)) 4548 return; 4549 |
3604 spin_lock_irq(&lruvec->lru_lock); 3605 new_gen = lru_gen_from_seq(lruvec->lrugen.max_seq); | 4550 if (!walk) { 4551 spin_lock_irq(&lruvec->lru_lock); 4552 new_gen = lru_gen_from_seq(lruvec->lrugen.max_seq); 4553 } |
3606 3607 for_each_set_bit(i, bitmap, MIN_LRU_BATCH) { 3608 folio = pfn_folio(pte_pfn(pte[i])); 3609 if (folio_memcg_rcu(folio) != memcg) 3610 continue; 3611 3612 old_gen = folio_update_gen(folio, new_gen); 3613 if (old_gen < 0 || old_gen == new_gen) 3614 continue; 3615 | 4554 4555 for_each_set_bit(i, bitmap, MIN_LRU_BATCH) { 4556 folio = pfn_folio(pte_pfn(pte[i])); 4557 if (folio_memcg_rcu(folio) != memcg) 4558 continue; 4559 4560 old_gen = folio_update_gen(folio, new_gen); 4561 if (old_gen < 0 || old_gen == new_gen) 4562 continue; 4563 |
3616 lru_gen_update_size(lruvec, folio, old_gen, new_gen); | 4564 if (walk) 4565 update_batch_size(walk, folio, old_gen, new_gen); 4566 else 4567 lru_gen_update_size(lruvec, folio, old_gen, new_gen); |
3617 } 3618 | 4568 } 4569 |
3619 spin_unlock_irq(&lruvec->lru_lock); | 4570 if (!walk) 4571 spin_unlock_irq(&lruvec->lru_lock); |
3620 3621 mem_cgroup_unlock_pages(); 3622} 3623 3624/****************************************************************************** 3625 * the eviction 3626 ******************************************************************************/ 3627 --- 266 unchanged lines hidden (view full) --- 3894{ 3895 int type; 3896 int scanned; 3897 int reclaimed; 3898 LIST_HEAD(list); 3899 struct folio *folio; 3900 enum vm_event_item item; 3901 struct reclaim_stat stat; | 4572 4573 mem_cgroup_unlock_pages(); 4574} 4575 4576/****************************************************************************** 4577 * the eviction 4578 ******************************************************************************/ 4579 --- 266 unchanged lines hidden (view full) --- 4846{ 4847 int type; 4848 int scanned; 4849 int reclaimed; 4850 LIST_HEAD(list); 4851 struct folio *folio; 4852 enum vm_event_item item; 4853 struct reclaim_stat stat; |
4854 struct lru_gen_mm_walk *walk; |
|
3902 struct mem_cgroup *memcg = lruvec_memcg(lruvec); 3903 struct pglist_data *pgdat = lruvec_pgdat(lruvec); 3904 3905 spin_lock_irq(&lruvec->lru_lock); 3906 3907 scanned = isolate_folios(lruvec, sc, swappiness, &type, &list); 3908 3909 scanned += try_to_inc_min_seq(lruvec, swappiness); --- 20 unchanged lines hidden (view full) --- 3930 else 3931 folio_set_active(folio); 3932 } 3933 3934 spin_lock_irq(&lruvec->lru_lock); 3935 3936 move_pages_to_lru(lruvec, &list); 3937 | 4855 struct mem_cgroup *memcg = lruvec_memcg(lruvec); 4856 struct pglist_data *pgdat = lruvec_pgdat(lruvec); 4857 4858 spin_lock_irq(&lruvec->lru_lock); 4859 4860 scanned = isolate_folios(lruvec, sc, swappiness, &type, &list); 4861 4862 scanned += try_to_inc_min_seq(lruvec, swappiness); --- 20 unchanged lines hidden (view full) --- 4883 else 4884 folio_set_active(folio); 4885 } 4886 4887 spin_lock_irq(&lruvec->lru_lock); 4888 4889 move_pages_to_lru(lruvec, &list); 4890 |
4891 walk = current->reclaim_state->mm_walk; 4892 if (walk && walk->batched) 4893 reset_batch_size(lruvec, walk); 4894 |
|
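The walk->batched flush above is the consumer side of update_batch_size()/reset_batch_size(): each walker accumulates signed per-generation page deltas privately and writes them back to the shared counters in one short critical section under the LRU lock, instead of taking that lock for every folio. A minimal userspace sketch of the accumulate-then-flush pattern, with hypothetical names and a mutex standing in for lru_lock:

```c
/* Accumulate-then-flush sketch; names and dimensions are illustrative. */
#include <pthread.h>
#include <stdio.h>

#define NR_GENS		4
#define NR_TYPES	2		/* anon, file */

static long shared_nr_pages[NR_GENS][NR_TYPES];
static pthread_mutex_t lru_lock = PTHREAD_MUTEX_INITIALIZER;

struct walk_batch {
	long nr_pages[NR_GENS][NR_TYPES];	/* private deltas, no lock needed */
	int batched;
};

static void update_batch(struct walk_batch *b, int old_gen, int new_gen,
			 int type, int nr)
{
	b->nr_pages[old_gen][type] -= nr;
	b->nr_pages[new_gen][type] += nr;
	b->batched++;
}

static void reset_batch(struct walk_batch *b)
{
	int gen, type;

	pthread_mutex_lock(&lru_lock);
	for (gen = 0; gen < NR_GENS; gen++) {
		for (type = 0; type < NR_TYPES; type++) {
			shared_nr_pages[gen][type] += b->nr_pages[gen][type];
			b->nr_pages[gen][type] = 0;
		}
	}
	b->batched = 0;
	pthread_mutex_unlock(&lru_lock);
}

int main(void)
{
	struct walk_batch b = { { { 0 } }, 0 };

	shared_nr_pages[1][0] = 100;	/* 100 anon pages counted in gen 1 */
	update_batch(&b, 1, 3, 0, 25);	/* promote 25 of them to gen 3 */
	reset_batch(&b);		/* one locked flush for the whole walk */

	printf("gen1=%ld gen3=%ld\n", shared_nr_pages[1][0], shared_nr_pages[3][0]);
	return 0;
}
```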
3938 item = current_is_kswapd() ? PGSTEAL_KSWAPD : PGSTEAL_DIRECT; 3939 if (!cgroup_reclaim(sc)) 3940 __count_vm_events(item, reclaimed); 3941 __count_memcg_events(memcg, item, reclaimed); 3942 __count_vm_events(PGSTEAL_ANON + type, reclaimed); 3943 3944 spin_unlock_irq(&lruvec->lru_lock); 3945 3946 mem_cgroup_uncharge_list(&list); 3947 free_unref_page_list(&list); 3948 3949 sc->nr_reclaimed += reclaimed; 3950 3951 return scanned; 3952} 3953 | 4895 item = current_is_kswapd() ? PGSTEAL_KSWAPD : PGSTEAL_DIRECT; 4896 if (!cgroup_reclaim(sc)) 4897 __count_vm_events(item, reclaimed); 4898 __count_memcg_events(memcg, item, reclaimed); 4899 __count_vm_events(PGSTEAL_ANON + type, reclaimed); 4900 4901 spin_unlock_irq(&lruvec->lru_lock); 4902 4903 mem_cgroup_uncharge_list(&list); 4904 free_unref_page_list(&list); 4905 4906 sc->nr_reclaimed += reclaimed; 4907 4908 return scanned; 4909} 4910 |
4911/* 4912 * For future optimizations: 4913 * 1. Defer try_to_inc_max_seq() to workqueues to reduce latency for memcg 4914 * reclaim. 4915 */ |
|
3954static unsigned long get_nr_to_scan(struct lruvec *lruvec, struct scan_control *sc, 3955 bool can_swap) 3956{ 3957 bool need_aging; 3958 unsigned long nr_to_scan; 3959 struct mem_cgroup *memcg = lruvec_memcg(lruvec); 3960 DEFINE_MAX_SEQ(lruvec); 3961 DEFINE_MIN_SEQ(lruvec); --- 9 unchanged lines hidden (view full) --- 3971 /* skip the aging path at the default priority */ 3972 if (sc->priority == DEF_PRIORITY) 3973 goto done; 3974 3975 /* leave the work to lru_gen_age_node() */ 3976 if (current_is_kswapd()) 3977 return 0; 3978 | 4916static unsigned long get_nr_to_scan(struct lruvec *lruvec, struct scan_control *sc, 4917 bool can_swap) 4918{ 4919 bool need_aging; 4920 unsigned long nr_to_scan; 4921 struct mem_cgroup *memcg = lruvec_memcg(lruvec); 4922 DEFINE_MAX_SEQ(lruvec); 4923 DEFINE_MIN_SEQ(lruvec); --- 9 unchanged lines hidden (view full) --- 4933 /* skip the aging path at the default priority */ 4934 if (sc->priority == DEF_PRIORITY) 4935 goto done; 4936 4937 /* leave the work to lru_gen_age_node() */ 4938 if (current_is_kswapd()) 4939 return 0; 4940 |
3979 inc_max_seq(lruvec, max_seq, can_swap); | 4941 if (try_to_inc_max_seq(lruvec, max_seq, sc, can_swap)) 4942 return nr_to_scan; |
3980done: 3981 return min_seq[!can_swap] + MIN_NR_GENS <= max_seq ? nr_to_scan : 0; 3982} 3983 3984static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) 3985{ 3986 struct blk_plug plug; 3987 unsigned long scanned = 0; 3988 3989 lru_add_drain(); 3990 3991 blk_start_plug(&plug); 3992 | 4943done: 4944 return min_seq[!can_swap] + MIN_NR_GENS <= max_seq ? nr_to_scan : 0; 4945} 4946 4947static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) 4948{ 4949 struct blk_plug plug; 4950 unsigned long scanned = 0; 4951 4952 lru_add_drain(); 4953 4954 blk_start_plug(&plug); 4955 |
4956 set_mm_walk(lruvec_pgdat(lruvec)); 4957 |
|
3993 while (true) { 3994 int delta; 3995 int swappiness; 3996 unsigned long nr_to_scan; 3997 3998 if (sc->may_swap) 3999 swappiness = get_swappiness(lruvec, sc); 4000 else if (!cgroup_reclaim(sc) && get_swappiness(lruvec, sc)) --- 11 unchanged lines hidden (view full) --- 4012 4013 scanned += delta; 4014 if (scanned >= nr_to_scan) 4015 break; 4016 4017 cond_resched(); 4018 } 4019 | 4958 while (true) { 4959 int delta; 4960 int swappiness; 4961 unsigned long nr_to_scan; 4962 4963 if (sc->may_swap) 4964 swappiness = get_swappiness(lruvec, sc); 4965 else if (!cgroup_reclaim(sc) && get_swappiness(lruvec, sc)) --- 11 unchanged lines hidden (view full) --- 4977 4978 scanned += delta; 4979 if (scanned >= nr_to_scan) 4980 break; 4981 4982 cond_resched(); 4983 } 4984 |
4985 clear_mm_walk(); 4986 |
|
4020 blk_finish_plug(&plug); 4021} 4022 4023/****************************************************************************** 4024 * initialization 4025 ******************************************************************************/ 4026 4027void lru_gen_init_lruvec(struct lruvec *lruvec) 4028{ 4029 int gen, type, zone; 4030 struct lru_gen_struct *lrugen = &lruvec->lrugen; 4031 4032 lrugen->max_seq = MIN_NR_GENS + 1; 4033 4034 for_each_gen_type_zone(gen, type, zone) 4035 INIT_LIST_HEAD(&lrugen->lists[gen][type][zone]); | 4987 blk_finish_plug(&plug); 4988} 4989 4990/****************************************************************************** 4991 * initialization 4992 ******************************************************************************/ 4993 4994void lru_gen_init_lruvec(struct lruvec *lruvec) 4995{ 4996 int gen, type, zone; 4997 struct lru_gen_struct *lrugen = &lruvec->lrugen; 4998 4999 lrugen->max_seq = MIN_NR_GENS + 1; 5000 5001 for_each_gen_type_zone(gen, type, zone) 5002 INIT_LIST_HEAD(&lrugen->lists[gen][type][zone]); |
5003 5004 lruvec->mm_state.seq = MIN_NR_GENS; 5005 init_waitqueue_head(&lruvec->mm_state.wait); |
|
4036} 4037 4038#ifdef CONFIG_MEMCG 4039void lru_gen_init_memcg(struct mem_cgroup *memcg) 4040{ | 5006} 5007 5008#ifdef CONFIG_MEMCG 5009void lru_gen_init_memcg(struct mem_cgroup *memcg) 5010{ |
5011 INIT_LIST_HEAD(&memcg->mm_list.fifo); 5012 spin_lock_init(&memcg->mm_list.lock); |
|
4041} 4042 4043void lru_gen_exit_memcg(struct mem_cgroup *memcg) 4044{ | 5013} 5014 5015void lru_gen_exit_memcg(struct mem_cgroup *memcg) 5016{ |
5017 int i; |
|
4045 int nid; 4046 4047 for_each_node(nid) { 4048 struct lruvec *lruvec = get_lruvec(memcg, nid); 4049 4050 VM_WARN_ON_ONCE(memchr_inv(lruvec->lrugen.nr_pages, 0, 4051 sizeof(lruvec->lrugen.nr_pages))); | 5018 int nid; 5019 5020 for_each_node(nid) { 5021 struct lruvec *lruvec = get_lruvec(memcg, nid); 5022 5023 VM_WARN_ON_ONCE(memchr_inv(lruvec->lrugen.nr_pages, 0, 5024 sizeof(lruvec->lrugen.nr_pages))); |
5025 5026 for (i = 0; i < NR_BLOOM_FILTERS; i++) { 5027 bitmap_free(lruvec->mm_state.filters[i]); 5028 lruvec->mm_state.filters[i] = NULL; 5029 } |
|
4052 } 4053} 4054#endif 4055 4056static int __init init_lru_gen(void) 4057{ 4058 BUILD_BUG_ON(MIN_NR_GENS + 1 >= MAX_NR_GENS); 4059 BUILD_BUG_ON(BIT(LRU_GEN_WIDTH) <= MAX_NR_GENS); --- 1907 unchanged lines hidden --- | 5030 } 5031} 5032#endif 5033 5034static int __init init_lru_gen(void) 5035{ 5036 BUILD_BUG_ON(MIN_NR_GENS + 1 >= MAX_NR_GENS); 5037 BUILD_BUG_ON(BIT(LRU_GEN_WIDTH) <= MAX_NR_GENS); --- 1907 unchanged lines hidden --- |