// SPDX-License-Identifier: GPL-2.0
#include <linux/debugfs.h>
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/uaccess.h>
#include <linux/memblock.h>
#include <linux/stacktrace.h>
#include <linux/page_owner.h>
#include <linux/jump_label.h>
#include <linux/migrate.h>
#include <linux/stackdepot.h>
#include <linux/seq_file.h>
#include <linux/memcontrol.h>
#include <linux/sched/clock.h>

#include "internal.h"

/*
 * TODO: teach PAGE_OWNER_STACK_DEPTH (__dump_page_owner and save_stack)
 * to use off-stack temporary storage
 */
#define PAGE_OWNER_STACK_DEPTH (16)

struct page_owner {
	unsigned short order;
	short last_migrate_reason;
	gfp_t gfp_mask;
	depot_stack_handle_t handle;
	depot_stack_handle_t free_handle;
	u64 ts_nsec;
	u64 free_ts_nsec;
	char comm[TASK_COMM_LEN];
	pid_t pid;
	pid_t tgid;
};

static bool page_owner_enabled __initdata;
DEFINE_STATIC_KEY_FALSE(page_owner_inited);

static depot_stack_handle_t dummy_handle;
static depot_stack_handle_t failure_handle;
static depot_stack_handle_t early_handle;

static void init_early_allocated_pages(void);

static int __init early_page_owner_param(char *buf)
{
	int ret = kstrtobool(buf, &page_owner_enabled);

	if (page_owner_enabled)
		stack_depot_request_early_init();

	return ret;
}
early_param("page_owner", early_page_owner_param);

static __init bool need_page_owner(void)
{
	return page_owner_enabled;
}

static __always_inline depot_stack_handle_t create_dummy_stack(void)
{
	unsigned long entries[4];
	unsigned int nr_entries;

	nr_entries = stack_trace_save(entries, ARRAY_SIZE(entries), 0);
	return stack_depot_save(entries, nr_entries, GFP_KERNEL);
}

static noinline void register_dummy_stack(void)
{
	dummy_handle = create_dummy_stack();
}

static noinline void register_failure_stack(void)
{
	failure_handle = create_dummy_stack();
}

static noinline void register_early_stack(void)
{
	early_handle = create_dummy_stack();
}

static __init void init_page_owner(void)
{
	if (!page_owner_enabled)
		return;

	register_dummy_stack();
	register_failure_stack();
	register_early_stack();
	static_branch_enable(&page_owner_inited);
	init_early_allocated_pages();
}

struct page_ext_operations page_owner_ops = {
	.size = sizeof(struct page_owner),
	.need = need_page_owner,
	.init = init_page_owner,
	.need_shared_flags = true,
};

static inline struct page_owner *get_page_owner(struct page_ext *page_ext)
{
	return (void *)page_ext + page_owner_ops.offset;
}

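/*
 * Capture the current call stack and intern it in the stack depot. Returns
 * dummy_handle when re-entered (i.e. tracking the metadata allocation itself
 * needed more memory) and failure_handle when the depot could not record the
 * trace.
 */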
static noinline depot_stack_handle_t save_stack(gfp_t flags)
{
	unsigned long entries[PAGE_OWNER_STACK_DEPTH];
	depot_stack_handle_t handle;
	unsigned int nr_entries;

	/*
	 * Avoid recursion.
	 *
	 * Sometimes page metadata allocation tracking requires more
	 * memory to be allocated:
	 * - when new stack trace is saved to stack depot
	 * - when backtrace itself is calculated (ia64)
	 */
	if (current->in_page_owner)
		return dummy_handle;
	current->in_page_owner = 1;

	nr_entries = stack_trace_save(entries, ARRAY_SIZE(entries), 2);
	handle = stack_depot_save(entries, nr_entries, flags);
	if (!handle)
		handle = failure_handle;

	current->in_page_owner = 0;
	return handle;
}

void __reset_page_owner(struct page *page, unsigned short order)
{
	int i;
	struct page_ext *page_ext;
	depot_stack_handle_t handle;
	struct page_owner *page_owner;
	u64 free_ts_nsec = local_clock();

	page_ext = page_ext_get(page);
	if (unlikely(!page_ext))
		return;

	handle = save_stack(GFP_NOWAIT | __GFP_NOWARN);
	for (i = 0; i < (1 << order); i++) {
		__clear_bit(PAGE_EXT_OWNER_ALLOCATED, &page_ext->flags);
		page_owner = get_page_owner(page_ext);
		page_owner->free_handle = handle;
		page_owner->free_ts_nsec = free_ts_nsec;
		page_ext = page_ext_next(page_ext);
	}
	page_ext_put(page_ext);
}

static inline void __set_page_owner_handle(struct page_ext *page_ext,
					depot_stack_handle_t handle,
					unsigned short order, gfp_t gfp_mask)
{
	struct page_owner *page_owner;
	int i;
	u64 ts_nsec = local_clock();

	for (i = 0; i < (1 << order); i++) {
		page_owner = get_page_owner(page_ext);
		page_owner->handle = handle;
		page_owner->order = order;
		page_owner->gfp_mask = gfp_mask;
		page_owner->last_migrate_reason = -1;
		page_owner->pid = current->pid;
		page_owner->tgid = current->tgid;
		page_owner->ts_nsec = ts_nsec;
		strscpy(page_owner->comm, current->comm,
			sizeof(page_owner->comm));
		__set_bit(PAGE_EXT_OWNER, &page_ext->flags);
		__set_bit(PAGE_EXT_OWNER_ALLOCATED, &page_ext->flags);

		page_ext = page_ext_next(page_ext);
	}
}

noinline void __set_page_owner(struct page *page, unsigned short order,
					gfp_t gfp_mask)
{
	struct page_ext *page_ext;
	depot_stack_handle_t handle;

	handle = save_stack(gfp_mask);

	page_ext = page_ext_get(page);
	if (unlikely(!page_ext))
		return;
	__set_page_owner_handle(page_ext, handle, order, gfp_mask);
	page_ext_put(page_ext);
}

void __set_page_owner_migrate_reason(struct page *page, int reason)
{
	struct page_ext *page_ext = page_ext_get(page);
	struct page_owner *page_owner;

	if (unlikely(!page_ext))
		return;

	page_owner = get_page_owner(page_ext);
	page_owner->last_migrate_reason = reason;
	page_ext_put(page_ext);
}

void __split_page_owner(struct page *page, unsigned int nr)
{
	int i;
	struct page_ext *page_ext = page_ext_get(page);
	struct page_owner *page_owner;

	if (unlikely(!page_ext))
		return;

	for (i = 0; i < nr; i++) {
		page_owner = get_page_owner(page_ext);
		page_owner->order = 0;
		page_ext = page_ext_next(page_ext);
	}
	page_ext_put(page_ext);
}

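/*
 * Called when a folio is migrated: copy the owner record (allocation stack
 * handle, pid, timestamps, comm) from the old folio to its replacement so
 * the new folio still reports the original allocation site.
 */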
void __folio_copy_owner(struct folio *newfolio, struct folio *old)
{
	struct page_ext *old_ext;
	struct page_ext *new_ext;
	struct page_owner *old_page_owner, *new_page_owner;

	old_ext = page_ext_get(&old->page);
	if (unlikely(!old_ext))
		return;

	new_ext = page_ext_get(&newfolio->page);
	if (unlikely(!new_ext)) {
		page_ext_put(old_ext);
		return;
	}

	old_page_owner = get_page_owner(old_ext);
	new_page_owner = get_page_owner(new_ext);
	new_page_owner->order = old_page_owner->order;
	new_page_owner->gfp_mask = old_page_owner->gfp_mask;
	new_page_owner->last_migrate_reason =
		old_page_owner->last_migrate_reason;
	new_page_owner->handle = old_page_owner->handle;
	new_page_owner->pid = old_page_owner->pid;
	new_page_owner->tgid = old_page_owner->tgid;
	new_page_owner->ts_nsec = old_page_owner->ts_nsec;
	new_page_owner->free_ts_nsec = old_page_owner->ts_nsec;
	strcpy(new_page_owner->comm, old_page_owner->comm);

	/*
	 * We don't clear the bit on the old folio as it's going to be freed
	 * after migration. Until then, the info can be useful in case of
	 * a bug, and the overall stats will be off a bit only temporarily.
	 * Also, migrate_misplaced_transhuge_page() can still fail the
	 * migration and then we want the old folio to retain the info. But
	 * in that case we also don't need to explicitly clear the info from
	 * the new page, which will be freed.
	 */
	__set_bit(PAGE_EXT_OWNER, &new_ext->flags);
	__set_bit(PAGE_EXT_OWNER_ALLOCATED, &new_ext->flags);
	page_ext_put(new_ext);
	page_ext_put(old_ext);
}

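/*
 * Back-end for the mixed-pageblock accounting shown in /proc/pagetypeinfo:
 * for each pageblock migratetype, count the pageblocks that contain at least
 * one allocated page whose gfp migratetype differs from the block's own.
 */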
void pagetypeinfo_showmixedcount_print(struct seq_file *m,
				       pg_data_t *pgdat, struct zone *zone)
{
	struct page *page;
	struct page_ext *page_ext;
	struct page_owner *page_owner;
	unsigned long pfn, block_end_pfn;
	unsigned long end_pfn = zone_end_pfn(zone);
	unsigned long count[MIGRATE_TYPES] = { 0, };
	int pageblock_mt, page_mt;
	int i;

	/* Scan block by block. First and last block may be incomplete */
	pfn = zone->zone_start_pfn;

	/*
	 * Walk the zone in pageblock_nr_pages steps. If a page block spans
	 * a zone boundary, it will be double counted between zones. This does
	 * not matter as the mixed block count will still be correct
	 */
	for (; pfn < end_pfn; ) {
		page = pfn_to_online_page(pfn);
		if (!page) {
			pfn = ALIGN(pfn + 1, MAX_ORDER_NR_PAGES);
			continue;
		}

		block_end_pfn = pageblock_end_pfn(pfn);
		block_end_pfn = min(block_end_pfn, end_pfn);

		pageblock_mt = get_pageblock_migratetype(page);

		for (; pfn < block_end_pfn; pfn++) {
			/* The pageblock is online, no need to recheck. */
			page = pfn_to_page(pfn);

			if (page_zone(page) != zone)
				continue;

			if (PageBuddy(page)) {
				unsigned long freepage_order;

				freepage_order = buddy_order_unsafe(page);
				if (freepage_order <= MAX_ORDER)
					pfn += (1UL << freepage_order) - 1;
				continue;
			}

			if (PageReserved(page))
				continue;

			page_ext = page_ext_get(page);
			if (unlikely(!page_ext))
				continue;

			if (!test_bit(PAGE_EXT_OWNER_ALLOCATED, &page_ext->flags))
				goto ext_put_continue;

			page_owner = get_page_owner(page_ext);
			page_mt = gfp_migratetype(page_owner->gfp_mask);
			if (pageblock_mt != page_mt) {
				if (is_migrate_cma(pageblock_mt))
					count[MIGRATE_MOVABLE]++;
				else
					count[pageblock_mt]++;

				pfn = block_end_pfn;
				page_ext_put(page_ext);
				break;
			}
			pfn += (1UL << page_owner->order) - 1;
ext_put_continue:
			page_ext_put(page_ext);
		}
	}

	/* Print counts */
	seq_printf(m, "Node %d, zone %8s ", pgdat->node_id, zone->name);
	for (i = 0; i < MIGRATE_TYPES; i++)
		seq_printf(m, "%12lu ", count[i]);
	seq_putc(m, '\n');
}

/*
 * Look up the memcg information and print it out
 */
static inline int print_page_owner_memcg(char *kbuf, size_t count, int ret,
					 struct page *page)
{
#ifdef CONFIG_MEMCG
	unsigned long memcg_data;
	struct mem_cgroup *memcg;
	bool online;
	char name[80];

	rcu_read_lock();
	memcg_data = READ_ONCE(page->memcg_data);
	if (!memcg_data)
		goto out_unlock;

	if (memcg_data & MEMCG_DATA_OBJCGS)
		ret += scnprintf(kbuf + ret, count - ret,
				"Slab cache page\n");

	memcg = page_memcg_check(page);
	if (!memcg)
		goto out_unlock;

	online = (memcg->css.flags & CSS_ONLINE);
	cgroup_name(memcg->css.cgroup, name, sizeof(name));
	ret += scnprintf(kbuf + ret, count - ret,
			"Charged %sto %smemcg %s\n",
			PageMemcgKmem(page) ? "(via objcg) " : "",
			online ? "" : "offline ",
			name);
out_unlock:
	rcu_read_unlock();
#endif /* CONFIG_MEMCG */

	return ret;
}

static ssize_t
print_page_owner(char __user *buf, size_t count, unsigned long pfn,
		struct page *page, struct page_owner *page_owner,
		depot_stack_handle_t handle)
{
	int ret, pageblock_mt, page_mt;
	char *kbuf;

	count = min_t(size_t, count, PAGE_SIZE);
	kbuf = kmalloc(count, GFP_KERNEL);
	if (!kbuf)
		return -ENOMEM;

	ret = scnprintf(kbuf, count,
			"Page allocated via order %u, mask %#x(%pGg), pid %d, tgid %d (%s), ts %llu ns, free_ts %llu ns\n",
			page_owner->order, page_owner->gfp_mask,
			&page_owner->gfp_mask, page_owner->pid,
			page_owner->tgid, page_owner->comm,
			page_owner->ts_nsec, page_owner->free_ts_nsec);

	/* Print information relevant to grouping pages by mobility */
	pageblock_mt = get_pageblock_migratetype(page);
	page_mt = gfp_migratetype(page_owner->gfp_mask);
	ret += scnprintf(kbuf + ret, count - ret,
			"PFN %lu type %s Block %lu type %s Flags %pGp\n",
			pfn,
			migratetype_names[page_mt],
			pfn >> pageblock_order,
			migratetype_names[pageblock_mt],
			&page->flags);

	ret += stack_depot_snprint(handle, kbuf + ret, count - ret, 0);
	if (ret >= count)
		goto err;

	if (page_owner->last_migrate_reason != -1) {
		ret += scnprintf(kbuf + ret, count - ret,
			"Page has been migrated, last migrate reason: %s\n",
			migrate_reason_names[page_owner->last_migrate_reason]);
	}

	ret = print_page_owner_memcg(kbuf, count, ret, page);

	ret += snprintf(kbuf + ret, count - ret, "\n");
	if (ret >= count)
		goto err;

	if (copy_to_user(buf, kbuf, ret))
		ret = -EFAULT;

	kfree(kbuf);
	return ret;

err:
	kfree(kbuf);
	return -ENOMEM;
}

void __dump_page_owner(const struct page *page)
{
	struct page_ext *page_ext = page_ext_get((void *)page);
	struct page_owner *page_owner;
	depot_stack_handle_t handle;
	gfp_t gfp_mask;
	int mt;

	if (unlikely(!page_ext)) {
		pr_alert("There is not page extension available.\n");
		return;
	}

	page_owner = get_page_owner(page_ext);
	gfp_mask = page_owner->gfp_mask;
	mt = gfp_migratetype(gfp_mask);

	if (!test_bit(PAGE_EXT_OWNER, &page_ext->flags)) {
		pr_alert("page_owner info is not present (never set?)\n");
		page_ext_put(page_ext);
		return;
	}

	if (test_bit(PAGE_EXT_OWNER_ALLOCATED, &page_ext->flags))
		pr_alert("page_owner tracks the page as allocated\n");
	else
		pr_alert("page_owner tracks the page as freed\n");

	pr_alert("page last allocated via order %u, migratetype %s, gfp_mask %#x(%pGg), pid %d, tgid %d (%s), ts %llu, free_ts %llu\n",
		 page_owner->order, migratetype_names[mt], gfp_mask, &gfp_mask,
		 page_owner->pid, page_owner->tgid, page_owner->comm,
		 page_owner->ts_nsec, page_owner->free_ts_nsec);

	handle = READ_ONCE(page_owner->handle);
	if (!handle)
		pr_alert("page_owner allocation stack trace missing\n");
	else
		stack_depot_print(handle);

	handle = READ_ONCE(page_owner->free_handle);
	if (!handle) {
		pr_alert("page_owner free stack trace missing\n");
	} else {
		pr_alert("page last free stack trace:\n");
		stack_depot_print(handle);
	}

	if (page_owner->last_migrate_reason != -1)
		pr_alert("page has been migrated, last migrate reason: %s\n",
			migrate_reason_names[page_owner->last_migrate_reason]);
	page_ext_put(page_ext);
}

static ssize_t
read_page_owner(struct file *file, char __user *buf, size_t count, loff_t *ppos)
{
	unsigned long pfn;
	struct page *page;
	struct page_ext *page_ext;
	struct page_owner *page_owner;
	depot_stack_handle_t handle;

	if (!static_branch_unlikely(&page_owner_inited))
		return -EINVAL;

	page = NULL;
	if (*ppos == 0)
		pfn = min_low_pfn;
	else
		pfn = *ppos;
	/* Find a valid PFN or the start of a MAX_ORDER_NR_PAGES area */
	while (!pfn_valid(pfn) && (pfn & (MAX_ORDER_NR_PAGES - 1)) != 0)
		pfn++;

	/* Find an allocated page */
	for (; pfn < max_pfn; pfn++) {
		/*
		 * This temporary page_owner is required so
		 * that we can avoid the context switches while holding
		 * the rcu lock and copying the page owner information to
		 * user through copy_to_user() or GFP_KERNEL allocations.
		 */
		struct page_owner page_owner_tmp;

		/*
		 * If the new page is in a new MAX_ORDER_NR_PAGES area,
		 * validate the area as existing, skip it if not
		 */
		if ((pfn & (MAX_ORDER_NR_PAGES - 1)) == 0 && !pfn_valid(pfn)) {
			pfn += MAX_ORDER_NR_PAGES - 1;
			continue;
		}

		page = pfn_to_page(pfn);
		if (PageBuddy(page)) {
			unsigned long freepage_order = buddy_order_unsafe(page);

			if (freepage_order <= MAX_ORDER)
				pfn += (1UL << freepage_order) - 1;
			continue;
		}

		page_ext = page_ext_get(page);
		if (unlikely(!page_ext))
			continue;

		/*
		 * Some pages could be missed by concurrent allocation or free,
		 * because we don't hold the zone lock.
		 */
		if (!test_bit(PAGE_EXT_OWNER, &page_ext->flags))
			goto ext_put_continue;

		/*
		 * Although we do have the info about past allocation of free
		 * pages, it's not relevant for current memory usage.
		 */
		if (!test_bit(PAGE_EXT_OWNER_ALLOCATED, &page_ext->flags))
			goto ext_put_continue;

		page_owner = get_page_owner(page_ext);

		/*
		 * Don't print "tail" pages of high-order allocations as that
		 * would inflate the stats.
		 */
		if (!IS_ALIGNED(pfn, 1 << page_owner->order))
			goto ext_put_continue;

		/*
		 * Access to page_ext->handle isn't synchronous so we should
		 * be careful to access it.
		 */
		handle = READ_ONCE(page_owner->handle);
		if (!handle)
			goto ext_put_continue;

		/* Record the next PFN to read in the file offset */
		*ppos = pfn + 1;

		page_owner_tmp = *page_owner;
		page_ext_put(page_ext);
		return print_page_owner(buf, count, pfn, page,
				&page_owner_tmp, handle);
ext_put_continue:
		page_ext_put(page_ext);
	}

	return 0;
}

static loff_t lseek_page_owner(struct file *file, loff_t offset, int orig)
{
	switch (orig) {
	case SEEK_SET:
		file->f_pos = offset;
		break;
	case SEEK_CUR:
		file->f_pos += offset;
		break;
	default:
		return -EINVAL;
	}
	return file->f_pos;
}

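/*
 * read_page_owner() uses the file position as a PFN cursor (*ppos is set to
 * the next PFN to scan), so the llseek handler above lets userspace start a
 * dump at an arbitrary PFN or skip ahead from the current one.
 */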
static void init_pages_in_zone(pg_data_t *pgdat, struct zone *zone)
{
	unsigned long pfn = zone->zone_start_pfn;
	unsigned long end_pfn = zone_end_pfn(zone);
	unsigned long count = 0;

	/*
	 * Walk the zone in pageblock_nr_pages steps. If a page block spans
	 * a zone boundary, it will be double counted between zones. This does
	 * not matter as the mixed block count will still be correct
	 */
	for (; pfn < end_pfn; ) {
		unsigned long block_end_pfn;

		if (!pfn_valid(pfn)) {
			pfn = ALIGN(pfn + 1, MAX_ORDER_NR_PAGES);
			continue;
		}

		block_end_pfn = pageblock_end_pfn(pfn);
		block_end_pfn = min(block_end_pfn, end_pfn);

		for (; pfn < block_end_pfn; pfn++) {
			struct page *page = pfn_to_page(pfn);
			struct page_ext *page_ext;

			if (page_zone(page) != zone)
				continue;

			/*
			 * To avoid having to grab zone->lock, be a little
			 * careful when reading buddy page order. The only
			 * danger is that we skip too much and potentially miss
			 * some early allocated pages, which is better than
			 * heavy lock contention.
			 */
			if (PageBuddy(page)) {
				unsigned long order = buddy_order_unsafe(page);

				if (order > 0 && order <= MAX_ORDER)
					pfn += (1UL << order) - 1;
				continue;
			}

			if (PageReserved(page))
				continue;

			page_ext = page_ext_get(page);
			if (unlikely(!page_ext))
				continue;

			/* Maybe overlapping zone */
			if (test_bit(PAGE_EXT_OWNER, &page_ext->flags))
				goto ext_put_continue;

			/* Found early allocated page */
			__set_page_owner_handle(page_ext, early_handle,
						0, 0);
			count++;
ext_put_continue:
			page_ext_put(page_ext);
		}
		cond_resched();
	}

	pr_info("Node %d, zone %8s: page owner found early allocated %lu pages\n",
		pgdat->node_id, zone->name, count);
}

static void init_zones_in_node(pg_data_t *pgdat)
{
	struct zone *zone;
	struct zone *node_zones = pgdat->node_zones;

	for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; ++zone) {
		if (!populated_zone(zone))
			continue;

		init_pages_in_zone(pgdat, zone);
	}
}

static void init_early_allocated_pages(void)
{
	pg_data_t *pgdat;

	for_each_online_pgdat(pgdat)
		init_zones_in_node(pgdat);
}

static const struct file_operations proc_page_owner_operations = {
	.read = read_page_owner,
	.llseek = lseek_page_owner,
};

static int __init pageowner_init(void)
{
	if (!static_branch_unlikely(&page_owner_inited)) {
		pr_info("page_owner is disabled\n");
		return 0;
	}

	debugfs_create_file("page_owner", 0400, NULL, NULL,
			    &proc_page_owner_operations);

	return 0;
}
late_initcall(pageowner_init)
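/*
 * Usage sketch, assuming CONFIG_PAGE_OWNER=y, the page_owner=on boot
 * parameter, and debugfs mounted at its usual /sys/kernel/debug location:
 *
 *	cat /sys/kernel/debug/page_owner > page_owner_full.txt
 *
 * Each record in the dump is produced by print_page_owner() above: the
 * allocation header, the migratetype/flags line, the saved allocation stack
 * and, with CONFIG_MEMCG, the memcg charge information.
 */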