// SPDX-License-Identifier: GPL-2.0
/*
 * DAMON Primitives for Virtual Address Spaces
 *
 * Author: SeongJae Park <sjpark@amazon.de>
 */

#define pr_fmt(fmt) "damon-va: " fmt

#include <linux/damon.h>
#include <linux/hugetlb.h>
#include <linux/mm.h>
#include <linux/mmu_notifier.h>
#include <linux/highmem.h>
#include <linux/page_idle.h>
#include <linux/pagewalk.h>
#include <linux/random.h>
#include <linux/sched/mm.h>
#include <linux/slab.h>

#ifdef CONFIG_DAMON_VADDR_KUNIT_TEST
#undef DAMON_MIN_REGION
#define DAMON_MIN_REGION 1
#endif

/* Get a random number in [l, r) */
#define damon_rand(l, r) (l + prandom_u32_max(r - l))

/*
 * 't->id' should be the pointer to the relevant 'struct pid' having reference
 * count.  Caller must put the returned task, unless it is NULL.
 */
#define damon_get_task_struct(t) \
	(get_pid_task((struct pid *)t->id, PIDTYPE_PID))

/*
 * Get the mm_struct of the given target
 *
 * Caller _must_ put the mm_struct after use, unless it is NULL.
 *
 * Returns the mm_struct of the target on success, NULL on failure
 */
static struct mm_struct *damon_get_mm(struct damon_target *t)
{
	struct task_struct *task;
	struct mm_struct *mm;

	task = damon_get_task_struct(t);
	if (!task)
		return NULL;

	mm = get_task_mm(task);
	put_task_struct(task);
	return mm;
}

/*
 * Functions for the initial monitoring target regions construction
 */

/*
 * Size-evenly split a region into 'nr_pieces' small regions
 *
 * Returns 0 on success, or negative error code otherwise.
 */
static int damon_va_evenly_split_region(struct damon_target *t,
		struct damon_region *r, unsigned int nr_pieces)
{
	unsigned long sz_orig, sz_piece, orig_end;
	struct damon_region *n = NULL, *next;
	unsigned long start;

	if (!r || !nr_pieces)
		return -EINVAL;

	orig_end = r->ar.end;
	sz_orig = r->ar.end - r->ar.start;
	sz_piece = ALIGN_DOWN(sz_orig / nr_pieces, DAMON_MIN_REGION);

	if (!sz_piece)
		return -EINVAL;

	r->ar.end = r->ar.start + sz_piece;
	next = damon_next_region(r);
	for (start = r->ar.end; start + sz_piece <= orig_end;
			start += sz_piece) {
		n = damon_new_region(start, start + sz_piece);
		if (!n)
			return -ENOMEM;
		damon_insert_region(n, r, next, t);
		r = n;
	}
	/* complement last region for possible rounding error */
	if (n)
		n->ar.end = orig_end;

	return 0;
}
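/*
 * Worked example (illustrative only, assuming DAMON_MIN_REGION == PAGE_SIZE
 * == 4096): splitting r = [0, 40960) into nr_pieces == 3 gives
 * sz_piece = ALIGN_DOWN(40960 / 3, 4096) = 12288.  'r' is shrunk to
 * [0, 12288), the loop adds [12288, 24576) and [24576, 36864), and the last
 * added region is then extended to the original end, leaving
 * [0, 12288), [12288, 24576) and [24576, 40960).
 */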
static unsigned long sz_range(struct damon_addr_range *r)
{
	return r->end - r->start;
}

static void swap_ranges(struct damon_addr_range *r1,
			struct damon_addr_range *r2)
{
	struct damon_addr_range tmp;

	tmp = *r1;
	*r1 = *r2;
	*r2 = tmp;
}

/*
 * Find three regions separated by the two biggest unmapped regions
 *
 * vma		the head vma of the target address space
 * regions	an array of three address ranges in which the results are saved
 *
 * This function receives an address space and finds three regions in it that
 * are separated by the two biggest unmapped regions in the space.  Please
 * refer to the comments of the '__damon_va_init_regions()' function below to
 * see why this is necessary.
 *
 * Returns 0 on success, or negative error code otherwise.
 */
static int __damon_va_three_regions(struct vm_area_struct *vma,
				    struct damon_addr_range regions[3])
{
	struct damon_addr_range gap = {0}, first_gap = {0}, second_gap = {0};
	struct vm_area_struct *last_vma = NULL;
	unsigned long start = 0;
	struct rb_root rbroot;

	/* Find two biggest gaps so that first_gap > second_gap > others */
	for (; vma; vma = vma->vm_next) {
		if (!last_vma) {
			start = vma->vm_start;
			goto next;
		}

		if (vma->rb_subtree_gap <= sz_range(&second_gap)) {
			rbroot.rb_node = &vma->vm_rb;
			vma = rb_entry(rb_last(&rbroot),
					struct vm_area_struct, vm_rb);
			goto next;
		}

		gap.start = last_vma->vm_end;
		gap.end = vma->vm_start;
		if (sz_range(&gap) > sz_range(&second_gap)) {
			swap_ranges(&gap, &second_gap);
			if (sz_range(&second_gap) > sz_range(&first_gap))
				swap_ranges(&second_gap, &first_gap);
		}
next:
		last_vma = vma;
	}

	if (!sz_range(&second_gap) || !sz_range(&first_gap))
		return -EINVAL;

	/* Sort the two biggest gaps by address */
	if (first_gap.start > second_gap.start)
		swap_ranges(&first_gap, &second_gap);

	/* Store the result */
	regions[0].start = ALIGN(start, DAMON_MIN_REGION);
	regions[0].end = ALIGN(first_gap.start, DAMON_MIN_REGION);
	regions[1].start = ALIGN(first_gap.end, DAMON_MIN_REGION);
	regions[1].end = ALIGN(second_gap.start, DAMON_MIN_REGION);
	regions[2].start = ALIGN(second_gap.end, DAMON_MIN_REGION);
	regions[2].end = ALIGN(last_vma->vm_end, DAMON_MIN_REGION);

	return 0;
}

/*
 * Get the three regions in the given target (task)
 *
 * Returns 0 on success, negative error code otherwise.
 */
static int damon_va_three_regions(struct damon_target *t,
				  struct damon_addr_range regions[3])
{
	struct mm_struct *mm;
	int rc;

	mm = damon_get_mm(t);
	if (!mm)
		return -EINVAL;

	mmap_read_lock(mm);
	rc = __damon_va_three_regions(mm->mmap, regions);
	mmap_read_unlock(mm);

	mmput(mm);
	return rc;
}
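/*
 * Worked example for __damon_va_three_regions() (illustrative only,
 * page-aligned addresses, DAMON_MIN_REGION == 0x1000): for a target mapping
 * [0x1000, 0x3000), [0x10000, 0x12000), [0x12000, 0x15000) and
 * [0x30000, 0x32000), the two biggest gaps are [0x3000, 0x10000) and
 * [0x15000, 0x30000), so the function returns:
 *
 *	regions[0] = [0x1000,  0x3000)
 *	regions[1] = [0x10000, 0x15000)
 *	regions[2] = [0x30000, 0x32000)
 */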
/*
 * Initialize the monitoring target regions for the given target (task)
 *
 * t	the given target
 *
 * Because only small portions of the entire address space are actually
 * mapped to memory and accessed, monitoring the unmapped regions is
 * wasteful.  On the other hand, because the monitoring can tolerate some
 * level of noise, tracking every single mapping is not strictly required,
 * and doing so could even incur a high overhead if the mappings change
 * frequently or the number of mappings is high.  The adaptive regions
 * adjustment mechanism further helps to deal with the noise by simply
 * identifying unmapped areas as regions that have no access.  Moreover,
 * applying the real mappings, which would contain many unmapped areas,
 * would make the adaptive mechanism quite complex.  Nevertheless,
 * excessively huge unmapped areas inside the monitoring target should be
 * removed so that the adaptive mechanism does not waste time on them.
 *
 * For this reason, we convert the complex mappings to three distinct regions
 * that cover every mapped area of the address space.  The two gaps between
 * the three regions are the two biggest unmapped areas in the given address
 * space.  In detail, this function first identifies the start and the end of
 * the mappings and the two biggest unmapped areas of the address space.
 * Then, it constructs the three regions as below:
 *
 *     [mappings[0]->start, big_two_unmapped_areas[0]->start)
 *     [big_two_unmapped_areas[0]->end, big_two_unmapped_areas[1]->start)
 *     [big_two_unmapped_areas[1]->end, mappings[nr_mappings - 1]->end)
 *
 * As the usual memory map of processes looks like below, the gap between the
 * heap and the uppermost mmap()-ed region, and the gap between the lowermost
 * mmap()-ed region and the stack are normally the two biggest unmapped
 * regions.  Because these gaps are exceptionally huge in usual address
 * spaces, excluding just these two is a sufficient trade-off.
 *
 *   <heap>
 *   <BIG UNMAPPED REGION 1>
 *   <uppermost mmap()-ed region>
 *   (other mmap()-ed regions and small unmapped regions)
 *   <lowermost mmap()-ed region>
 *   <BIG UNMAPPED REGION 2>
 *   <stack>
 */
static void __damon_va_init_regions(struct damon_ctx *ctx,
				    struct damon_target *t)
{
	struct damon_region *r;
	struct damon_addr_range regions[3];
	unsigned long sz = 0, nr_pieces;
	int i;

	if (damon_va_three_regions(t, regions)) {
		pr_err("Failed to get three regions of target %lu\n", t->id);
		return;
	}

	for (i = 0; i < 3; i++)
		sz += regions[i].end - regions[i].start;
	if (ctx->min_nr_regions)
		sz /= ctx->min_nr_regions;
	if (sz < DAMON_MIN_REGION)
		sz = DAMON_MIN_REGION;

	/* Set the initial three regions of the target */
	for (i = 0; i < 3; i++) {
		r = damon_new_region(regions[i].start, regions[i].end);
		if (!r) {
			pr_err("%d'th init region creation failed\n", i);
			return;
		}
		damon_add_region(r, t);

		nr_pieces = (regions[i].end - regions[i].start) / sz;
		damon_va_evenly_split_region(t, r, nr_pieces);
	}
}

/* Initialize '->regions_list' of every target (task) */
void damon_va_init(struct damon_ctx *ctx)
{
	struct damon_target *t;

	damon_for_each_target(t, ctx) {
		/* the user may set the target regions as they want */
		if (!damon_nr_regions(t))
			__damon_va_init_regions(ctx, t);
	}
}
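/*
 * To illustrate the region granularity chosen by __damon_va_init_regions()
 * (hypothetical numbers): if the three initial regions together span 100 MiB
 * and ctx->min_nr_regions is 10, 'sz' becomes 10 MiB, so a 40 MiB region is
 * split into 4 pieces, a 35 MiB region into 3 pieces, and a 25 MiB region
 * into 2 pieces, giving roughly 'min_nr_regions' initial regions in total.
 */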
/*
 * Functions for the dynamic monitoring target regions update
 */

/*
 * Check whether a region is intersecting an address range
 *
 * Returns true if it is.
 */
static bool damon_intersect(struct damon_region *r, struct damon_addr_range *re)
{
	return !(r->ar.end <= re->start || re->end <= r->ar.start);
}

/*
 * Update damon regions for the three big regions of the given target
 *
 * t		the given target
 * bregions	the three big regions of the target
 */
static void damon_va_apply_three_regions(struct damon_target *t,
					 struct damon_addr_range bregions[3])
{
	struct damon_region *r, *next;
	unsigned int i = 0;

	/* Remove regions which are not in the three big regions now */
	damon_for_each_region_safe(r, next, t) {
		for (i = 0; i < 3; i++) {
			if (damon_intersect(r, &bregions[i]))
				break;
		}
		if (i == 3)
			damon_destroy_region(r, t);
	}

	/* Adjust intersecting regions to fit with the three big regions */
	for (i = 0; i < 3; i++) {
		struct damon_region *first = NULL, *last;
		struct damon_region *newr;
		struct damon_addr_range *br;

		br = &bregions[i];
		/* Get the first and last regions which intersect with br */
		damon_for_each_region(r, t) {
			if (damon_intersect(r, br)) {
				if (!first)
					first = r;
				last = r;
			}
			if (r->ar.start >= br->end)
				break;
		}
		if (!first) {
			/* no damon_region intersects with this big region */
			newr = damon_new_region(
					ALIGN_DOWN(br->start,
						DAMON_MIN_REGION),
					ALIGN(br->end, DAMON_MIN_REGION));
			if (!newr)
				continue;
			damon_insert_region(newr, damon_prev_region(r), r, t);
		} else {
			first->ar.start = ALIGN_DOWN(br->start,
					DAMON_MIN_REGION);
			last->ar.end = ALIGN(br->end, DAMON_MIN_REGION);
		}
	}
}

/*
 * Update regions for current memory mappings
 */
void damon_va_update(struct damon_ctx *ctx)
{
	struct damon_addr_range three_regions[3];
	struct damon_target *t;

	damon_for_each_target(t, ctx) {
		if (damon_va_three_regions(t, three_regions))
			continue;
		damon_va_apply_three_regions(t, three_regions);
	}
}
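/*
 * Worked example for damon_va_apply_three_regions() (illustrative only,
 * byte granularity as in the kunit tests where DAMON_MIN_REGION is 1): if a
 * target currently has the regions {[10, 20), [20, 30), [50, 55), [55, 57)}
 * and the updated big regions are {[5, 27), [45, 55), [73, 104)}, then
 * [55, 57) is destroyed because it intersects none of the big regions, the
 * survivors are stretched to {[5, 20), [20, 27), [45, 55)}, and a new region
 * [73, 104) is inserted for the third big region.
 */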
/*
 * Get an online page for a pfn if it's in the LRU list.  Otherwise, returns
 * NULL.
 *
 * The body of this function is stolen from the 'page_idle_get_page()'.  We
 * steal rather than reuse it because the code is quite simple.
 */
static struct page *damon_get_page(unsigned long pfn)
{
	struct page *page = pfn_to_online_page(pfn);

	if (!page || !PageLRU(page) || !get_page_unless_zero(page))
		return NULL;

	if (unlikely(!PageLRU(page))) {
		put_page(page);
		page = NULL;
	}
	return page;
}

static void damon_ptep_mkold(pte_t *pte, struct mm_struct *mm,
			     unsigned long addr)
{
	bool referenced = false;
	struct page *page = damon_get_page(pte_pfn(*pte));

	if (!page)
		return;

	if (pte_young(*pte)) {
		referenced = true;
		*pte = pte_mkold(*pte);
	}

#ifdef CONFIG_MMU_NOTIFIER
	if (mmu_notifier_clear_young(mm, addr, addr + PAGE_SIZE))
		referenced = true;
#endif /* CONFIG_MMU_NOTIFIER */

	if (referenced)
		set_page_young(page);

	set_page_idle(page);
	put_page(page);
}

static void damon_pmdp_mkold(pmd_t *pmd, struct mm_struct *mm,
			     unsigned long addr)
{
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
	bool referenced = false;
	struct page *page = damon_get_page(pmd_pfn(*pmd));

	if (!page)
		return;

	if (pmd_young(*pmd)) {
		referenced = true;
		*pmd = pmd_mkold(*pmd);
	}

#ifdef CONFIG_MMU_NOTIFIER
	if (mmu_notifier_clear_young(mm, addr,
				addr + ((1UL) << HPAGE_PMD_SHIFT)))
		referenced = true;
#endif /* CONFIG_MMU_NOTIFIER */

	if (referenced)
		set_page_young(page);

	set_page_idle(page);
	put_page(page);
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
}

static int damon_mkold_pmd_entry(pmd_t *pmd, unsigned long addr,
		unsigned long next, struct mm_walk *walk)
{
	pte_t *pte;
	spinlock_t *ptl;

	if (pmd_huge(*pmd)) {
		ptl = pmd_lock(walk->mm, pmd);
		if (pmd_huge(*pmd)) {
			damon_pmdp_mkold(pmd, walk->mm, addr);
			spin_unlock(ptl);
			return 0;
		}
		spin_unlock(ptl);
	}

	if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd)))
		return 0;
	pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
	if (!pte_present(*pte))
		goto out;
	damon_ptep_mkold(pte, walk->mm, addr);
out:
	pte_unmap_unlock(pte, ptl);
	return 0;
}

static struct mm_walk_ops damon_mkold_ops = {
	.pmd_entry = damon_mkold_pmd_entry,
};

static void damon_va_mkold(struct mm_struct *mm, unsigned long addr)
{
	mmap_read_lock(mm);
	walk_page_range(mm, addr, addr + 1, &damon_mkold_ops, NULL);
	mmap_read_unlock(mm);
}

/*
 * Functions for the access checking of the regions
 */

static void damon_va_prepare_access_check(struct damon_ctx *ctx,
			struct mm_struct *mm, struct damon_region *r)
{
	r->sampling_addr = damon_rand(r->ar.start, r->ar.end);

	damon_va_mkold(mm, r->sampling_addr);
}

void damon_va_prepare_access_checks(struct damon_ctx *ctx)
{
	struct damon_target *t;
	struct mm_struct *mm;
	struct damon_region *r;

	damon_for_each_target(t, ctx) {
		mm = damon_get_mm(t);
		if (!mm)
			continue;
		damon_for_each_region(r, t)
			damon_va_prepare_access_check(ctx, mm, r);
		mmput(mm);
	}
}
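/*
 * A simplified sketch of how the two-step sampling above is meant to be
 * driven (the sampling interval handling lives in the DAMON core, not here):
 *
 *	damon_va_prepare_access_checks(ctx);
 *		- picks one random address per region and clears the
 *		  accessed bit / page idle state of the page mapping it
 *	... one sampling interval passes ...
 *	damon_va_check_accesses(ctx);
 *		- increments r->nr_accesses for every region whose sampled
 *		  page has been referenced again in the meantime
 */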
struct damon_young_walk_private {
	unsigned long *page_sz;
	bool young;
};

static int damon_young_pmd_entry(pmd_t *pmd, unsigned long addr,
		unsigned long next, struct mm_walk *walk)
{
	pte_t *pte;
	spinlock_t *ptl;
	struct page *page;
	struct damon_young_walk_private *priv = walk->private;

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
	if (pmd_huge(*pmd)) {
		ptl = pmd_lock(walk->mm, pmd);
		if (!pmd_huge(*pmd)) {
			spin_unlock(ptl);
			goto regular_page;
		}
		page = damon_get_page(pmd_pfn(*pmd));
		if (!page)
			goto huge_out;
		if (pmd_young(*pmd) || !page_is_idle(page) ||
					mmu_notifier_test_young(walk->mm,
						addr)) {
			*priv->page_sz = ((1UL) << HPAGE_PMD_SHIFT);
			priv->young = true;
		}
		put_page(page);
huge_out:
		spin_unlock(ptl);
		return 0;
	}

regular_page:
#endif	/* CONFIG_TRANSPARENT_HUGEPAGE */

	if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd)))
		return -EINVAL;
	pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
	if (!pte_present(*pte))
		goto out;
	page = damon_get_page(pte_pfn(*pte));
	if (!page)
		goto out;
	if (pte_young(*pte) || !page_is_idle(page) ||
			mmu_notifier_test_young(walk->mm, addr)) {
		*priv->page_sz = PAGE_SIZE;
		priv->young = true;
	}
	put_page(page);
out:
	pte_unmap_unlock(pte, ptl);
	return 0;
}

static struct mm_walk_ops damon_young_ops = {
	.pmd_entry = damon_young_pmd_entry,
};

static bool damon_va_young(struct mm_struct *mm, unsigned long addr,
			   unsigned long *page_sz)
{
	struct damon_young_walk_private arg = {
		.page_sz = page_sz,
		.young = false,
	};

	mmap_read_lock(mm);
	walk_page_range(mm, addr, addr + 1, &damon_young_ops, &arg);
	mmap_read_unlock(mm);
	return arg.young;
}

/*
 * Check whether the region was accessed after the last preparation
 *
 * mm	'mm_struct' for the given virtual address space
 * r	the region to be checked
 */
static void damon_va_check_access(struct damon_ctx *ctx,
				  struct mm_struct *mm,
				  struct damon_region *r)
{
	static struct mm_struct *last_mm;
	static unsigned long last_addr;
	static unsigned long last_page_sz = PAGE_SIZE;
	static bool last_accessed;

	/* If the region is in the last checked page, reuse the result */
	if (mm == last_mm && (ALIGN_DOWN(last_addr, last_page_sz) ==
				ALIGN_DOWN(r->sampling_addr, last_page_sz))) {
		if (last_accessed)
			r->nr_accesses++;
		return;
	}

	last_accessed = damon_va_young(mm, r->sampling_addr, &last_page_sz);
	if (last_accessed)
		r->nr_accesses++;

	last_mm = mm;
	last_addr = r->sampling_addr;
}

unsigned int damon_va_check_accesses(struct damon_ctx *ctx)
{
	struct damon_target *t;
	struct mm_struct *mm;
	struct damon_region *r;
	unsigned int max_nr_accesses = 0;

	damon_for_each_target(t, ctx) {
		mm = damon_get_mm(t);
		if (!mm)
			continue;
		damon_for_each_region(r, t) {
			damon_va_check_access(ctx, mm, r);
			max_nr_accesses = max(r->nr_accesses, max_nr_accesses);
		}
		mmput(mm);
	}

	return max_nr_accesses;
}
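/*
 * Note on the static result cache in damon_va_check_access() above
 * (illustrative example): if two regions of the same target get sampling
 * addresses within the same transparent huge page (e.g., 2 MiB on x86_64),
 * the first check walks the page table and records the huge page size in
 * 'last_page_sz', while the second check reuses 'last_accessed' without
 * another page table walk, because both addresses align down to the same
 * huge page boundary.
 */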
/*
 * Functions for the target validity check and cleanup
 */

bool damon_va_target_valid(void *target)
{
	struct damon_target *t = target;
	struct task_struct *task;

	task = damon_get_task_struct(t);
	if (task) {
		put_task_struct(task);
		return true;
	}

	return false;
}

void damon_va_set_primitives(struct damon_ctx *ctx)
{
	ctx->primitive.init = damon_va_init;
	ctx->primitive.update = damon_va_update;
	ctx->primitive.prepare_access_checks = damon_va_prepare_access_checks;
	ctx->primitive.check_accesses = damon_va_check_accesses;
	ctx->primitive.reset_aggregated = NULL;
	ctx->primitive.target_valid = damon_va_target_valid;
	ctx->primitive.cleanup = NULL;
}

#include "vaddr-test.h"
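/*
 * A rough sketch of how the DAMON core is expected to drive the primitives
 * set above (simplified; see the core's kdamond logic for the real flow,
 * including aggregation and adaptive regions adjustment):
 *
 *	ctx->primitive.init(ctx);
 *	while (monitoring) {
 *		ctx->primitive.prepare_access_checks(ctx);
 *		... sleep for the sampling interval ...
 *		ctx->primitive.check_accesses(ctx);
 *		... occasionally ...
 *		ctx->primitive.update(ctx);
 *	}
 */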