xref: /openbmc/linux/mm/damon/vaddr.c (revision 603c09f2)
// SPDX-License-Identifier: GPL-2.0
/*
 * DAMON Primitives for Virtual Address Spaces
 *
 * Author: SeongJae Park <sjpark@amazon.de>
 */

#define pr_fmt(fmt) "damon-va: " fmt

#include <asm-generic/mman-common.h>
#include <linux/highmem.h>
#include <linux/hugetlb.h>
#include <linux/mmu_notifier.h>
#include <linux/page_idle.h>
#include <linux/pagewalk.h>
#include <linux/sched/mm.h>

#include "ops-common.h"

#ifdef CONFIG_DAMON_VADDR_KUNIT_TEST
#undef DAMON_MIN_REGION
#define DAMON_MIN_REGION 1
#endif
/*
 * 't->pid' should be a pointer to the relevant 'struct pid', with a reference
 * held on it.  The caller must put the returned task_struct, unless it is
 * NULL.
 */
static inline struct task_struct *damon_get_task_struct(struct damon_target *t)
{
	return get_pid_task(t->pid, PIDTYPE_PID);
}

/*
 * Get the mm_struct of the given target
 *
 * Caller _must_ put the mm_struct after use, unless it is NULL.
 *
 * Returns the mm_struct of the target on success, NULL on failure
 */
static struct mm_struct *damon_get_mm(struct damon_target *t)
{
	struct task_struct *task;
	struct mm_struct *mm;

	task = damon_get_task_struct(t);
	if (!task)
		return NULL;

	mm = get_task_mm(task);
	put_task_struct(task);
	return mm;
}
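
/*
 * A minimal usage sketch of the get/put pairing above (illustrative only):
 * the reference returned by damon_get_mm() must be dropped with mmput() once
 * the caller is done with the address space, e.g.:
 *
 *	struct mm_struct *mm = damon_get_mm(t);
 *
 *	if (!mm)
 *		return;
 *	mmap_read_lock(mm);
 *	... inspect the VMAs or page tables of 't' ...
 *	mmap_read_unlock(mm);
 *	mmput(mm);
 */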

/*
 * Functions for the initial monitoring target regions construction
 */

/*
 * Size-evenly split a region into 'nr_pieces' small regions
 *
 * Returns 0 on success, or negative error code otherwise.
 */
static int damon_va_evenly_split_region(struct damon_target *t,
		struct damon_region *r, unsigned int nr_pieces)
{
	unsigned long sz_orig, sz_piece, orig_end;
	struct damon_region *n = NULL, *next;
	unsigned long start;

	if (!r || !nr_pieces)
		return -EINVAL;

	orig_end = r->ar.end;
	sz_orig = r->ar.end - r->ar.start;
	sz_piece = ALIGN_DOWN(sz_orig / nr_pieces, DAMON_MIN_REGION);

	if (!sz_piece)
		return -EINVAL;

	r->ar.end = r->ar.start + sz_piece;
	next = damon_next_region(r);
	for (start = r->ar.end; start + sz_piece <= orig_end;
			start += sz_piece) {
		n = damon_new_region(start, start + sz_piece);
		if (!n)
			return -ENOMEM;
		damon_insert_region(n, r, next, t);
		r = n;
	}
	/* complement last region for possible rounding error */
	if (n)
		n->ar.end = orig_end;

	return 0;
}
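
/*
 * A worked example of the split arithmetic above (illustrative numbers,
 * assuming DAMON_MIN_REGION == PAGE_SIZE == 4096): splitting a 10 MiB region
 * [0, 10485760) into nr_pieces == 3 gives
 * sz_piece = ALIGN_DOWN(10485760 / 3, 4096) = 3493888, so the resulting
 * regions are [0, 3493888), [3493888, 6987776), and [6987776, 10485760),
 * with the last region absorbing the rounding remainder.
 */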

static unsigned long sz_range(struct damon_addr_range *r)
{
	return r->end - r->start;
}

/*
 * Find three regions separated by the two biggest unmapped regions
 *
 * vma		the head vma of the target address space
 * regions	an array of three address ranges in which the results are saved
 *
 * This function receives an address space and finds three regions in it that
 * are separated by the two biggest unmapped regions in the space.  Please
 * refer to the comments of '__damon_va_init_regions()' below for why this is
 * necessary.
 *
 * Returns 0 on success, or negative error code otherwise.
 */
static int __damon_va_three_regions(struct vm_area_struct *vma,
				       struct damon_addr_range regions[3])
{
	struct damon_addr_range gap = {0}, first_gap = {0}, second_gap = {0};
	struct vm_area_struct *last_vma = NULL;
	unsigned long start = 0;
	struct rb_root rbroot;

	/* Find the two biggest gaps so that first_gap > second_gap > others */
	for (; vma; vma = vma->vm_next) {
		if (!last_vma) {
			start = vma->vm_start;
			goto next;
		}

		if (vma->rb_subtree_gap <= sz_range(&second_gap)) {
			rbroot.rb_node = &vma->vm_rb;
			vma = rb_entry(rb_last(&rbroot),
					struct vm_area_struct, vm_rb);
			goto next;
		}

		gap.start = last_vma->vm_end;
		gap.end = vma->vm_start;
		if (sz_range(&gap) > sz_range(&second_gap)) {
			swap(gap, second_gap);
			if (sz_range(&second_gap) > sz_range(&first_gap))
				swap(second_gap, first_gap);
		}
next:
		last_vma = vma;
	}

	if (!sz_range(&second_gap) || !sz_range(&first_gap))
		return -EINVAL;

	/* Sort the two biggest gaps by address */
	if (first_gap.start > second_gap.start)
		swap(first_gap, second_gap);

	/* Store the result */
	regions[0].start = ALIGN(start, DAMON_MIN_REGION);
	regions[0].end = ALIGN(first_gap.start, DAMON_MIN_REGION);
	regions[1].start = ALIGN(first_gap.end, DAMON_MIN_REGION);
	regions[1].end = ALIGN(second_gap.start, DAMON_MIN_REGION);
	regions[2].start = ALIGN(second_gap.end, DAMON_MIN_REGION);
	regions[2].end = ALIGN(last_vma->vm_end, DAMON_MIN_REGION);

	return 0;
}
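
/*
 * An illustrative example of the gap selection above: for a mapping layout of
 * three VMAs [10, 20), [50, 55), and [200, 210), the unmapped gaps are
 * [20, 50) and [55, 200).  Both are kept as the two biggest gaps, and the
 * resulting three regions (before the DAMON_MIN_REGION alignment) are
 * [10, 20), [50, 55), and [200, 210).
 */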

/*
 * Get the three regions in the given target (task)
 *
 * Returns 0 on success, negative error code otherwise.
 */
static int damon_va_three_regions(struct damon_target *t,
				struct damon_addr_range regions[3])
{
	struct mm_struct *mm;
	int rc;

	mm = damon_get_mm(t);
	if (!mm)
		return -EINVAL;

	mmap_read_lock(mm);
	rc = __damon_va_three_regions(mm->mmap, regions);
	mmap_read_unlock(mm);

	mmput(mm);
	return rc;
}

/*
 * Initialize the monitoring target regions for the given target (task)
 *
 * t	the given target
 *
 * Because only a small number of portions of the entire address space are
 * actually mapped to memory and accessed, monitoring the unmapped regions is
 * wasteful.  On the other hand, because we can tolerate small amounts of
 * noise, tracking every mapping is not strictly required; it could even incur
 * a high overhead if the mappings change frequently or the number of mappings
 * is high.  The adaptive regions adjustment mechanism further helps to deal
 * with the noise by simply identifying the unmapped areas as regions that
 * receive no access.  Moreover, using the real mappings, which would contain
 * many unmapped areas, would make the adaptive mechanism quite complex.  Too
 * huge unmapped areas inside the monitoring target should therefore be
 * removed so that the adaptive mechanism does not waste time on them.
 *
 * For this reason, we convert the complex mappings to three distinct regions
 * that together cover every mapped area of the address space, while the two
 * gaps between the three regions are the two biggest unmapped areas in the
 * given address space.  In detail, this function first identifies the start
 * and the end of the mappings and the two biggest unmapped areas of the
 * address space.  Then, it constructs the three regions as below:
 *
 *     [mappings[0]->start, big_two_unmapped_areas[0]->start)
 *     [big_two_unmapped_areas[0]->end, big_two_unmapped_areas[1]->start)
 *     [big_two_unmapped_areas[1]->end, mappings[nr_mappings - 1]->end)
 *
 * Because the usual memory map of a process looks like the layout below, the
 * gap between the heap and the uppermost mmap()-ed region, and the gap
 * between the lowermost mmap()-ed region and the stack, are the two biggest
 * unmapped regions.  Because these gaps are exceptionally huge in a usual
 * address space, excluding only these two is a sufficient trade-off.
 *
 *   <heap>
 *   <BIG UNMAPPED REGION 1>
 *   <uppermost mmap()-ed region>
 *   (other mmap()-ed regions and small unmapped regions)
 *   <lowermost mmap()-ed region>
 *   <BIG UNMAPPED REGION 2>
 *   <stack>
 */
static void __damon_va_init_regions(struct damon_ctx *ctx,
				     struct damon_target *t)
{
	struct damon_target *ti;
	struct damon_region *r;
	struct damon_addr_range regions[3];
	unsigned long sz = 0, nr_pieces;
	int i, tidx = 0;

	if (damon_va_three_regions(t, regions)) {
		damon_for_each_target(ti, ctx) {
			if (ti == t)
				break;
			tidx++;
		}
		pr_debug("Failed to get three regions of %dth target\n", tidx);
		return;
	}

	for (i = 0; i < 3; i++)
		sz += regions[i].end - regions[i].start;
	if (ctx->min_nr_regions)
		sz /= ctx->min_nr_regions;
	if (sz < DAMON_MIN_REGION)
		sz = DAMON_MIN_REGION;

	/* Set the initial three regions of the target */
	for (i = 0; i < 3; i++) {
		r = damon_new_region(regions[i].start, regions[i].end);
		if (!r) {
			pr_err("%d'th init region creation failed\n", i);
			return;
		}
		damon_add_region(r, t);

		nr_pieces = (regions[i].end - regions[i].start) / sz;
		damon_va_evenly_split_region(t, r, nr_pieces);
	}
}
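
/*
 * A worked example of the region sizing above (illustrative numbers, assuming
 * DAMON_MIN_REGION == 4096): if the three regions cover 40 MiB, 10 MiB, and
 * 30 MiB of mapped area and ctx->min_nr_regions is 10, the target piece size
 * is sz = (40 + 10 + 30) MiB / 10 = 8 MiB, so the three regions are evenly
 * split into 5, 1, and 3 pieces respectively, giving nine initial regions in
 * total due to the per-region rounding down.
 */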

/* Initialize '->regions_list' of every target (task) */
static void damon_va_init(struct damon_ctx *ctx)
{
	struct damon_target *t;

	damon_for_each_target(t, ctx) {
		/* the user may set the target regions as they want */
		if (!damon_nr_regions(t))
			__damon_va_init_regions(ctx, t);
	}
}

/*
 * Update regions for current memory mappings
 */
static void damon_va_update(struct damon_ctx *ctx)
{
	struct damon_addr_range three_regions[3];
	struct damon_target *t;

	damon_for_each_target(t, ctx) {
		if (damon_va_three_regions(t, three_regions))
			continue;
		damon_set_regions(t, three_regions, 3);
	}
}

static int damon_mkold_pmd_entry(pmd_t *pmd, unsigned long addr,
		unsigned long next, struct mm_walk *walk)
{
	pte_t *pte;
	spinlock_t *ptl;

	if (pmd_huge(*pmd)) {
		ptl = pmd_lock(walk->mm, pmd);
		if (pmd_huge(*pmd)) {
			damon_pmdp_mkold(pmd, walk->mm, addr);
			spin_unlock(ptl);
			return 0;
		}
		spin_unlock(ptl);
	}

	if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd)))
		return 0;
	pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
	if (!pte_present(*pte))
		goto out;
	damon_ptep_mkold(pte, walk->mm, addr);
out:
	pte_unmap_unlock(pte, ptl);
	return 0;
}
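
/*
 * Note on the check-lock-recheck pattern above: the first pmd_huge() check is
 * done without the PMD lock, so the huge entry could be split into a regular
 * page table before the lock is taken.  The huge case is therefore re-checked
 * under pmd_lock(), and if it no longer holds, the code falls through to the
 * regular PTE path below.
 */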

#ifdef CONFIG_HUGETLB_PAGE
static void damon_hugetlb_mkold(pte_t *pte, struct mm_struct *mm,
				struct vm_area_struct *vma, unsigned long addr)
{
	bool referenced = false;
	pte_t entry = huge_ptep_get(pte);
	struct page *page = pte_page(entry);

	get_page(page);

	if (pte_young(entry)) {
		referenced = true;
		entry = pte_mkold(entry);
		set_huge_pte_at(mm, addr, pte, entry);
	}

#ifdef CONFIG_MMU_NOTIFIER
	if (mmu_notifier_clear_young(mm, addr,
				     addr + huge_page_size(hstate_vma(vma))))
		referenced = true;
#endif /* CONFIG_MMU_NOTIFIER */

	if (referenced)
		set_page_young(page);

	set_page_idle(page);
	put_page(page);
}

static int damon_mkold_hugetlb_entry(pte_t *pte, unsigned long hmask,
				     unsigned long addr, unsigned long end,
				     struct mm_walk *walk)
{
	struct hstate *h = hstate_vma(walk->vma);
	spinlock_t *ptl;
	pte_t entry;

	ptl = huge_pte_lock(h, walk->mm, pte);
	entry = huge_ptep_get(pte);
	if (!pte_present(entry))
		goto out;

	damon_hugetlb_mkold(pte, walk->mm, walk->vma, addr);

out:
	spin_unlock(ptl);
	return 0;
}
#else
#define damon_mkold_hugetlb_entry NULL
#endif /* CONFIG_HUGETLB_PAGE */

static const struct mm_walk_ops damon_mkold_ops = {
	.pmd_entry = damon_mkold_pmd_entry,
	.hugetlb_entry = damon_mkold_hugetlb_entry,
};

static void damon_va_mkold(struct mm_struct *mm, unsigned long addr)
{
	mmap_read_lock(mm);
	walk_page_range(mm, addr, addr + 1, &damon_mkold_ops, NULL);
	mmap_read_unlock(mm);
}

/*
 * Functions for the access checking of the regions
 */

static void __damon_va_prepare_access_check(struct damon_ctx *ctx,
			struct mm_struct *mm, struct damon_region *r)
{
	r->sampling_addr = damon_rand(r->ar.start, r->ar.end);

	damon_va_mkold(mm, r->sampling_addr);
}
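
/*
 * A sketch of the sampling scheme implemented by the function above and the
 * *_young() helpers below: for each sampling interval, one random address in
 * every region is picked and the Accessed bit of the page mapping it is
 * cleared via damon_va_mkold().  At the next sampling, damon_va_young()
 * checks whether the page was referenced since then (via the young bit, the
 * page idle flag, or mmu notifiers), and that single sample stands in for the
 * whole region.
 */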

static void damon_va_prepare_access_checks(struct damon_ctx *ctx)
{
	struct damon_target *t;
	struct mm_struct *mm;
	struct damon_region *r;

	damon_for_each_target(t, ctx) {
		mm = damon_get_mm(t);
		if (!mm)
			continue;
		damon_for_each_region(r, t)
			__damon_va_prepare_access_check(ctx, mm, r);
		mmput(mm);
	}
}

struct damon_young_walk_private {
	unsigned long *page_sz;
	bool young;
};

static int damon_young_pmd_entry(pmd_t *pmd, unsigned long addr,
		unsigned long next, struct mm_walk *walk)
{
	pte_t *pte;
	spinlock_t *ptl;
	struct page *page;
	struct damon_young_walk_private *priv = walk->private;

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
	if (pmd_huge(*pmd)) {
		ptl = pmd_lock(walk->mm, pmd);
		if (!pmd_huge(*pmd)) {
			spin_unlock(ptl);
			goto regular_page;
		}
		page = damon_get_page(pmd_pfn(*pmd));
		if (!page)
			goto huge_out;
		if (pmd_young(*pmd) || !page_is_idle(page) ||
					mmu_notifier_test_young(walk->mm,
						addr)) {
			*priv->page_sz = HPAGE_PMD_SIZE;
			priv->young = true;
		}
		put_page(page);
huge_out:
		spin_unlock(ptl);
		return 0;
	}

regular_page:
#endif	/* CONFIG_TRANSPARENT_HUGEPAGE */

	if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd)))
		return -EINVAL;
	pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
	if (!pte_present(*pte))
		goto out;
	page = damon_get_page(pte_pfn(*pte));
	if (!page)
		goto out;
	if (pte_young(*pte) || !page_is_idle(page) ||
			mmu_notifier_test_young(walk->mm, addr)) {
		*priv->page_sz = PAGE_SIZE;
		priv->young = true;
	}
	put_page(page);
out:
	pte_unmap_unlock(pte, ptl);
	return 0;
}

#ifdef CONFIG_HUGETLB_PAGE
static int damon_young_hugetlb_entry(pte_t *pte, unsigned long hmask,
				     unsigned long addr, unsigned long end,
				     struct mm_walk *walk)
{
	struct damon_young_walk_private *priv = walk->private;
	struct hstate *h = hstate_vma(walk->vma);
	struct page *page;
	spinlock_t *ptl;
	pte_t entry;

	ptl = huge_pte_lock(h, walk->mm, pte);
	entry = huge_ptep_get(pte);
	if (!pte_present(entry))
		goto out;

	page = pte_page(entry);
	get_page(page);

	if (pte_young(entry) || !page_is_idle(page) ||
	    mmu_notifier_test_young(walk->mm, addr)) {
		*priv->page_sz = huge_page_size(h);
		priv->young = true;
	}

	put_page(page);

out:
	spin_unlock(ptl);
	return 0;
}
#else
#define damon_young_hugetlb_entry NULL
#endif /* CONFIG_HUGETLB_PAGE */

static const struct mm_walk_ops damon_young_ops = {
	.pmd_entry = damon_young_pmd_entry,
	.hugetlb_entry = damon_young_hugetlb_entry,
};

static bool damon_va_young(struct mm_struct *mm, unsigned long addr,
		unsigned long *page_sz)
{
	struct damon_young_walk_private arg = {
		.page_sz = page_sz,
		.young = false,
	};

	mmap_read_lock(mm);
	walk_page_range(mm, addr, addr + 1, &damon_young_ops, &arg);
	mmap_read_unlock(mm);
	return arg.young;
}

/*
 * Check whether the region was accessed after the last preparation
 *
 * mm	'mm_struct' for the given virtual address space
 * r	the region to be checked
 */
static void __damon_va_check_access(struct damon_ctx *ctx,
			       struct mm_struct *mm, struct damon_region *r)
{
	static struct mm_struct *last_mm;
	static unsigned long last_addr;
	static unsigned long last_page_sz = PAGE_SIZE;
	static bool last_accessed;

	/* If the region is in the last checked page, reuse the result */
	if (mm == last_mm && (ALIGN_DOWN(last_addr, last_page_sz) ==
				ALIGN_DOWN(r->sampling_addr, last_page_sz))) {
		if (last_accessed)
			r->nr_accesses++;
		return;
	}

	last_accessed = damon_va_young(mm, r->sampling_addr, &last_page_sz);
	if (last_accessed)
		r->nr_accesses++;

	last_mm = mm;
	last_addr = r->sampling_addr;
}
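
/*
 * An illustrative example of the result caching above: if the previous check
 * sampled an address backed by a 2 MiB THP (so last_page_sz was set to
 * HPAGE_PMD_SIZE) and the current region's sampling address in the same mm
 * falls into that same 2 MiB page, the previous result is reused instead of
 * walking the page table again, since both samples can only observe the same
 * page's access state.
 */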

static unsigned int damon_va_check_accesses(struct damon_ctx *ctx)
{
	struct damon_target *t;
	struct mm_struct *mm;
	struct damon_region *r;
	unsigned int max_nr_accesses = 0;

	damon_for_each_target(t, ctx) {
		mm = damon_get_mm(t);
		if (!mm)
			continue;
		damon_for_each_region(r, t) {
			__damon_va_check_access(ctx, mm, r);
			max_nr_accesses = max(r->nr_accesses, max_nr_accesses);
		}
		mmput(mm);
	}

	return max_nr_accesses;
}

/*
 * Functions for the target validity check and cleanup
 */

static bool damon_va_target_valid(void *target)
{
	struct damon_target *t = target;
	struct task_struct *task;

	task = damon_get_task_struct(t);
	if (task) {
		put_task_struct(task);
		return true;
	}

	return false;
}

#ifndef CONFIG_ADVISE_SYSCALLS
static unsigned long damos_madvise(struct damon_target *target,
		struct damon_region *r, int behavior)
{
	return 0;
}
#else
static unsigned long damos_madvise(struct damon_target *target,
		struct damon_region *r, int behavior)
{
	struct mm_struct *mm;
	unsigned long start = PAGE_ALIGN(r->ar.start);
	unsigned long len = PAGE_ALIGN(r->ar.end - r->ar.start);
	unsigned long applied;

	mm = damon_get_mm(target);
	if (!mm)
		return 0;

	applied = do_madvise(mm, start, len, behavior) ? 0 : len;
	mmput(mm);

	return applied;
}
#endif	/* CONFIG_ADVISE_SYSCALLS */

static unsigned long damon_va_apply_scheme(struct damon_ctx *ctx,
		struct damon_target *t, struct damon_region *r,
		struct damos *scheme)
{
	int madv_action;

	switch (scheme->action) {
	case DAMOS_WILLNEED:
		madv_action = MADV_WILLNEED;
		break;
	case DAMOS_COLD:
		madv_action = MADV_COLD;
		break;
	case DAMOS_PAGEOUT:
		madv_action = MADV_PAGEOUT;
		break;
	case DAMOS_HUGEPAGE:
		madv_action = MADV_HUGEPAGE;
		break;
	case DAMOS_NOHUGEPAGE:
		madv_action = MADV_NOHUGEPAGE;
		break;
	case DAMOS_STAT:
		return 0;
	default:
		return 0;
	}

	return damos_madvise(t, r, madv_action);
}
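
/*
 * A worked example of the action translation above: applying a DAMOS_PAGEOUT
 * scheme to a 16 KiB region translates to
 * do_madvise(mm, PAGE_ALIGN(r->ar.start), 16384, MADV_PAGEOUT), and the
 * reported number of applied bytes is 16384 on success, or 0 on failure or
 * when CONFIG_ADVISE_SYSCALLS is not set.
 */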

static int damon_va_scheme_score(struct damon_ctx *context,
		struct damon_target *t, struct damon_region *r,
		struct damos *scheme)
{
	switch (scheme->action) {
	case DAMOS_PAGEOUT:
		return damon_pageout_score(context, r, scheme);
	default:
		break;
	}

	return DAMOS_MAX_SCORE;
}

static int __init damon_va_initcall(void)
{
	struct damon_operations ops = {
		.id = DAMON_OPS_VADDR,
		.init = damon_va_init,
		.update = damon_va_update,
		.prepare_access_checks = damon_va_prepare_access_checks,
		.check_accesses = damon_va_check_accesses,
		.reset_aggregated = NULL,
		.target_valid = damon_va_target_valid,
		.cleanup = NULL,
		.apply_scheme = damon_va_apply_scheme,
		.get_scheme_score = damon_va_scheme_score,
	};
	/* ops for fixed virtual address ranges */
	struct damon_operations ops_fvaddr = ops;
	int err;

	/* Don't set the monitoring target regions for the entire mapping */
	ops_fvaddr.id = DAMON_OPS_FVADDR;
	ops_fvaddr.init = NULL;
	ops_fvaddr.update = NULL;

	err = damon_register_ops(&ops);
	if (err)
		return err;
	return damon_register_ops(&ops_fvaddr);
}

subsys_initcall(damon_va_initcall);
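
/*
 * A minimal sketch of how a monitoring context could pick the operations
 * registered above, assuming the damon_select_ops() helper declared in
 * include/linux/damon.h:
 *
 *	struct damon_ctx *ctx = damon_new_ctx();
 *
 *	if (!ctx)
 *		return -ENOMEM;
 *	if (damon_select_ops(ctx, DAMON_OPS_VADDR))
 *		return -EINVAL;
 */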

#include "vaddr-test.h"