// SPDX-License-Identifier: GPL-2.0
#include <linux/debugfs.h>
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/uaccess.h>
#include <linux/memblock.h>
#include <linux/stacktrace.h>
#include <linux/page_owner.h>
#include <linux/jump_label.h>
#include <linux/migrate.h>
#include <linux/stackdepot.h>
#include <linux/seq_file.h>
#include <linux/memcontrol.h>
#include <linux/sched/clock.h>

#include "internal.h"

/*
 * TODO: teach PAGE_OWNER_STACK_DEPTH (__dump_page_owner and save_stack)
 * to use off-stack temporary storage
 */
#define PAGE_OWNER_STACK_DEPTH (16)

struct page_owner {
	unsigned short order;			/* allocation order */
	short last_migrate_reason;		/* -1 if never migrated */
	gfp_t gfp_mask;				/* gfp mask of the allocation */
	depot_stack_handle_t handle;		/* allocation stack trace */
	depot_stack_handle_t free_handle;	/* last free stack trace */
	u64 ts_nsec;				/* local_clock() at allocation */
	u64 free_ts_nsec;			/* local_clock() at last free */
	char comm[TASK_COMM_LEN];		/* allocating task */
	pid_t pid;
	pid_t tgid;
};

static bool page_owner_enabled __initdata;
DEFINE_STATIC_KEY_FALSE(page_owner_inited);

static depot_stack_handle_t dummy_handle;
static depot_stack_handle_t failure_handle;
static depot_stack_handle_t early_handle;

static void init_early_allocated_pages(void);

static int __init early_page_owner_param(char *buf)
{
	int ret = kstrtobool(buf, &page_owner_enabled);

	if (page_owner_enabled)
		stack_depot_want_early_init();

	return ret;
}
early_param("page_owner", early_page_owner_param);

static __init bool need_page_owner(void)
{
	return page_owner_enabled;
}

static __always_inline depot_stack_handle_t create_dummy_stack(void)
{
	unsigned long entries[4];
	unsigned int nr_entries;

	nr_entries = stack_trace_save(entries, ARRAY_SIZE(entries), 0);
	return stack_depot_save(entries, nr_entries, GFP_KERNEL);
}

static noinline void register_dummy_stack(void)
{
	dummy_handle = create_dummy_stack();
}

static noinline void register_failure_stack(void)
{
	failure_handle = create_dummy_stack();
}

static noinline void register_early_stack(void)
{
	early_handle = create_dummy_stack();
}

static __init void init_page_owner(void)
{
	if (!page_owner_enabled)
		return;

	register_dummy_stack();
	register_failure_stack();
	register_early_stack();
	static_branch_enable(&page_owner_inited);
	init_early_allocated_pages();
}

struct page_ext_operations page_owner_ops = {
	.size = sizeof(struct page_owner),
	.need = need_page_owner,
	.init = init_page_owner,
};

static inline struct page_owner *get_page_owner(struct page_ext *page_ext)
{
	return (void *)page_ext + page_owner_ops.offset;
}

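/*
 * Layout note: the page_ext core reserves page_owner_ops.size bytes for
 * every base page and records where that area lives in
 * page_owner_ops.offset; get_page_owner() above simply adds that offset to
 * the page_ext pointer. A high-order allocation therefore has 1 << order
 * consecutive page_owner entries, and __set_page_owner_handle() below fills
 * all of them via page_ext_next().
 */
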
static noinline depot_stack_handle_t save_stack(gfp_t flags)
{
	unsigned long entries[PAGE_OWNER_STACK_DEPTH];
	depot_stack_handle_t handle;
	unsigned int nr_entries;

	/*
	 * Avoid recursion.
	 *
	 * Sometimes page metadata allocation tracking requires more
	 * memory to be allocated:
	 * - when a new stack trace is saved to stack depot
	 * - when the backtrace itself is calculated (ia64)
	 */
	if (current->in_page_owner)
		return dummy_handle;
	current->in_page_owner = 1;

	nr_entries = stack_trace_save(entries, ARRAY_SIZE(entries), 2);
	handle = stack_depot_save(entries, nr_entries, flags);
	if (!handle)
		handle = failure_handle;

	current->in_page_owner = 0;
	return handle;
}

void __reset_page_owner(struct page *page, unsigned short order)
{
	int i;
	struct page_ext *page_ext;
	depot_stack_handle_t handle;
	struct page_owner *page_owner;
	u64 free_ts_nsec = local_clock();

	page_ext = lookup_page_ext(page);
	if (unlikely(!page_ext))
		return;

	handle = save_stack(GFP_NOWAIT | __GFP_NOWARN);
	for (i = 0; i < (1 << order); i++) {
		__clear_bit(PAGE_EXT_OWNER_ALLOCATED, &page_ext->flags);
		page_owner = get_page_owner(page_ext);
		page_owner->free_handle = handle;
		page_owner->free_ts_nsec = free_ts_nsec;
		page_ext = page_ext_next(page_ext);
	}
}

static inline void __set_page_owner_handle(struct page_ext *page_ext,
					depot_stack_handle_t handle,
					unsigned short order, gfp_t gfp_mask)
{
	struct page_owner *page_owner;
	int i;

	for (i = 0; i < (1 << order); i++) {
		page_owner = get_page_owner(page_ext);
		page_owner->handle = handle;
		page_owner->order = order;
		page_owner->gfp_mask = gfp_mask;
		page_owner->last_migrate_reason = -1;
		page_owner->pid = current->pid;
		page_owner->tgid = current->tgid;
		page_owner->ts_nsec = local_clock();
		strscpy(page_owner->comm, current->comm,
			sizeof(page_owner->comm));
		__set_bit(PAGE_EXT_OWNER, &page_ext->flags);
		__set_bit(PAGE_EXT_OWNER_ALLOCATED, &page_ext->flags);

		page_ext = page_ext_next(page_ext);
	}
}

noinline void __set_page_owner(struct page *page, unsigned short order,
			       gfp_t gfp_mask)
{
	struct page_ext *page_ext = lookup_page_ext(page);
	depot_stack_handle_t handle;

	if (unlikely(!page_ext))
		return;

	handle = save_stack(gfp_mask);
	__set_page_owner_handle(page_ext, handle, order, gfp_mask);
}

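/*
 * Note: __set_page_owner() and __reset_page_owner() above are the hot-path
 * hooks. They are reached through the static-key-guarded wrappers declared
 * in <linux/page_owner.h> (set_page_owner()/reset_page_owner()), which the
 * page allocator invokes on its allocation and free paths, so the cost when
 * page_owner is disabled is a patched-out branch.
 */
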
void __set_page_owner_migrate_reason(struct page *page, int reason)
{
	struct page_ext *page_ext = lookup_page_ext(page);
	struct page_owner *page_owner;

	if (unlikely(!page_ext))
		return;

	page_owner = get_page_owner(page_ext);
	page_owner->last_migrate_reason = reason;
}

void __split_page_owner(struct page *page, unsigned int nr)
{
	int i;
	struct page_ext *page_ext = lookup_page_ext(page);
	struct page_owner *page_owner;

	if (unlikely(!page_ext))
		return;

	for (i = 0; i < nr; i++) {
		page_owner = get_page_owner(page_ext);
		page_owner->order = 0;
		page_ext = page_ext_next(page_ext);
	}
}

void __folio_copy_owner(struct folio *newfolio, struct folio *old)
{
	struct page_ext *old_ext = lookup_page_ext(&old->page);
	struct page_ext *new_ext = lookup_page_ext(&newfolio->page);
	struct page_owner *old_page_owner, *new_page_owner;

	if (unlikely(!old_ext || !new_ext))
		return;

	old_page_owner = get_page_owner(old_ext);
	new_page_owner = get_page_owner(new_ext);
	new_page_owner->order = old_page_owner->order;
	new_page_owner->gfp_mask = old_page_owner->gfp_mask;
	new_page_owner->last_migrate_reason =
		old_page_owner->last_migrate_reason;
	new_page_owner->handle = old_page_owner->handle;
	new_page_owner->pid = old_page_owner->pid;
	new_page_owner->tgid = old_page_owner->tgid;
	new_page_owner->ts_nsec = old_page_owner->ts_nsec;
	new_page_owner->free_ts_nsec = old_page_owner->ts_nsec;
	strcpy(new_page_owner->comm, old_page_owner->comm);

	/*
	 * We don't clear the bit on the old folio as it's going to be freed
	 * after migration. Until then, the info can be useful in case of
	 * a bug, and the overall stats will be off a bit only temporarily.
	 * Also, migrate_misplaced_transhuge_page() can still fail the
	 * migration and then we want the old folio to retain the info. But
	 * in that case we also don't need to explicitly clear the info from
	 * the new page, which will be freed.
	 */
	__set_bit(PAGE_EXT_OWNER, &new_ext->flags);
	__set_bit(PAGE_EXT_OWNER_ALLOCATED, &new_ext->flags);
}

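/*
 * pagetypeinfo_showmixedcount_print() below backs the "mixed" pageblock
 * counts reported through /proc/pagetypeinfo (its caller lives in
 * mm/vmstat.c): for each pageblock it looks for an allocated page whose gfp
 * migratetype differs from the pageblock's migratetype and, on the first
 * mismatch, counts the whole block as mixed and moves on to the next block.
 */
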
void pagetypeinfo_showmixedcount_print(struct seq_file *m,
				       pg_data_t *pgdat, struct zone *zone)
{
	struct page *page;
	struct page_ext *page_ext;
	struct page_owner *page_owner;
	unsigned long pfn, block_end_pfn;
	unsigned long end_pfn = zone_end_pfn(zone);
	unsigned long count[MIGRATE_TYPES] = { 0, };
	int pageblock_mt, page_mt;
	int i;

	/* Scan block by block. First and last block may be incomplete */
	pfn = zone->zone_start_pfn;

	/*
	 * Walk the zone in pageblock_nr_pages steps. If a page block spans
	 * a zone boundary, it will be double counted between zones. This does
	 * not matter as the mixed block count will still be correct
	 */
	for (; pfn < end_pfn; ) {
		page = pfn_to_online_page(pfn);
		if (!page) {
			pfn = ALIGN(pfn + 1, MAX_ORDER_NR_PAGES);
			continue;
		}

		block_end_pfn = ALIGN(pfn + 1, pageblock_nr_pages);
		block_end_pfn = min(block_end_pfn, end_pfn);

		pageblock_mt = get_pageblock_migratetype(page);

		for (; pfn < block_end_pfn; pfn++) {
			/* The pageblock is online, no need to recheck. */
			page = pfn_to_page(pfn);

			if (page_zone(page) != zone)
				continue;

			if (PageBuddy(page)) {
				unsigned long freepage_order;

				freepage_order = buddy_order_unsafe(page);
				if (freepage_order < MAX_ORDER)
					pfn += (1UL << freepage_order) - 1;
				continue;
			}

			if (PageReserved(page))
				continue;

			page_ext = lookup_page_ext(page);
			if (unlikely(!page_ext))
				continue;

			if (!test_bit(PAGE_EXT_OWNER_ALLOCATED, &page_ext->flags))
				continue;

			page_owner = get_page_owner(page_ext);
			page_mt = gfp_migratetype(page_owner->gfp_mask);
			if (pageblock_mt != page_mt) {
				if (is_migrate_cma(pageblock_mt))
					count[MIGRATE_MOVABLE]++;
				else
					count[pageblock_mt]++;

				pfn = block_end_pfn;
				break;
			}
			pfn += (1UL << page_owner->order) - 1;
		}
	}

	/* Print counts */
	seq_printf(m, "Node %d, zone %8s ", pgdat->node_id, zone->name);
	for (i = 0; i < MIGRATE_TYPES; i++)
		seq_printf(m, "%12lu ", count[i]);
	seq_putc(m, '\n');
}

/*
 * Look up memcg information and print it out
 */
static inline int print_page_owner_memcg(char *kbuf, size_t count, int ret,
					 struct page *page)
{
#ifdef CONFIG_MEMCG
	unsigned long memcg_data;
	struct mem_cgroup *memcg;
	bool online;
	char name[80];

	rcu_read_lock();
	memcg_data = READ_ONCE(page->memcg_data);
	if (!memcg_data)
		goto out_unlock;

	if (memcg_data & MEMCG_DATA_OBJCGS)
		ret += scnprintf(kbuf + ret, count - ret,
				"Slab cache page\n");

	memcg = page_memcg_check(page);
	if (!memcg)
		goto out_unlock;

	online = (memcg->css.flags & CSS_ONLINE);
	cgroup_name(memcg->css.cgroup, name, sizeof(name));
	ret += scnprintf(kbuf + ret, count - ret,
			"Charged %sto %smemcg %s\n",
			PageMemcgKmem(page) ? "(via objcg) " : "",
			online ? "" : "offline ",
			name);
out_unlock:
	rcu_read_unlock();
#endif /* CONFIG_MEMCG */

	return ret;
}

static ssize_t
print_page_owner(char __user *buf, size_t count, unsigned long pfn,
		struct page *page, struct page_owner *page_owner,
		depot_stack_handle_t handle)
{
	int ret, pageblock_mt, page_mt;
	char *kbuf;

	count = min_t(size_t, count, PAGE_SIZE);
	kbuf = kmalloc(count, GFP_KERNEL);
	if (!kbuf)
		return -ENOMEM;

	ret = scnprintf(kbuf, count,
			"Page allocated via order %u, mask %#x(%pGg), pid %d, tgid %d (%s), ts %llu ns, free_ts %llu ns\n",
			page_owner->order, page_owner->gfp_mask,
			&page_owner->gfp_mask, page_owner->pid,
			page_owner->tgid, page_owner->comm,
			page_owner->ts_nsec, page_owner->free_ts_nsec);

	/* Print information relevant to grouping pages by mobility */
	pageblock_mt = get_pageblock_migratetype(page);
	page_mt = gfp_migratetype(page_owner->gfp_mask);
	ret += scnprintf(kbuf + ret, count - ret,
			"PFN %lu type %s Block %lu type %s Flags %pGp\n",
			pfn,
			migratetype_names[page_mt],
			pfn >> pageblock_order,
			migratetype_names[pageblock_mt],
			&page->flags);

	ret += stack_depot_snprint(handle, kbuf + ret, count - ret, 0);
	if (ret >= count)
		goto err;

	if (page_owner->last_migrate_reason != -1) {
		ret += scnprintf(kbuf + ret, count - ret,
			"Page has been migrated, last migrate reason: %s\n",
			migrate_reason_names[page_owner->last_migrate_reason]);
	}

	ret = print_page_owner_memcg(kbuf, count, ret, page);

	ret += snprintf(kbuf + ret, count - ret, "\n");
	if (ret >= count)
		goto err;

	if (copy_to_user(buf, kbuf, ret))
		ret = -EFAULT;

	kfree(kbuf);
	return ret;

err:
	kfree(kbuf);
	return -ENOMEM;
}

void __dump_page_owner(const struct page *page)
{
	struct page_ext *page_ext = lookup_page_ext(page);
	struct page_owner *page_owner;
	depot_stack_handle_t handle;
	gfp_t gfp_mask;
	int mt;

	if (unlikely(!page_ext)) {
		pr_alert("There is no page extension available.\n");
		return;
	}

	page_owner = get_page_owner(page_ext);
	gfp_mask = page_owner->gfp_mask;
	mt = gfp_migratetype(gfp_mask);

	if (!test_bit(PAGE_EXT_OWNER, &page_ext->flags)) {
		pr_alert("page_owner info is not present (never set?)\n");
		return;
	}

	if (test_bit(PAGE_EXT_OWNER_ALLOCATED, &page_ext->flags))
		pr_alert("page_owner tracks the page as allocated\n");
	else
		pr_alert("page_owner tracks the page as freed\n");

	pr_alert("page last allocated via order %u, migratetype %s, gfp_mask %#x(%pGg), pid %d, tgid %d (%s), ts %llu, free_ts %llu\n",
		 page_owner->order, migratetype_names[mt], gfp_mask, &gfp_mask,
		 page_owner->pid, page_owner->tgid, page_owner->comm,
		 page_owner->ts_nsec, page_owner->free_ts_nsec);

	handle = READ_ONCE(page_owner->handle);
	if (!handle)
		pr_alert("page_owner allocation stack trace missing\n");
	else
		stack_depot_print(handle);

	handle = READ_ONCE(page_owner->free_handle);
	if (!handle) {
		pr_alert("page_owner free stack trace missing\n");
	} else {
		pr_alert("page last free stack trace:\n");
		stack_depot_print(handle);
	}

	if (page_owner->last_migrate_reason != -1)
		pr_alert("page has been migrated, last migrate reason: %s\n",
			migrate_reason_names[page_owner->last_migrate_reason]);
}

static ssize_t
read_page_owner(struct file *file, char __user *buf, size_t count, loff_t *ppos)
{
	unsigned long pfn;
	struct page *page;
	struct page_ext *page_ext;
	struct page_owner *page_owner;
	depot_stack_handle_t handle;

	if (!static_branch_unlikely(&page_owner_inited))
		return -EINVAL;

	page = NULL;
	pfn = min_low_pfn + *ppos;

	/* Find a valid PFN or the start of a MAX_ORDER_NR_PAGES area */
	while (!pfn_valid(pfn) && (pfn & (MAX_ORDER_NR_PAGES - 1)) != 0)
		pfn++;

	drain_all_pages(NULL);

	/* Find an allocated page */
	for (; pfn < max_pfn; pfn++) {
		/*
		 * If the new page is in a new MAX_ORDER_NR_PAGES area,
		 * check that the area actually exists; skip it if not.
		 */
		if ((pfn & (MAX_ORDER_NR_PAGES - 1)) == 0 && !pfn_valid(pfn)) {
			pfn += MAX_ORDER_NR_PAGES - 1;
			continue;
		}

		page = pfn_to_page(pfn);
		if (PageBuddy(page)) {
			unsigned long freepage_order = buddy_order_unsafe(page);

			if (freepage_order < MAX_ORDER)
				pfn += (1UL << freepage_order) - 1;
			continue;
		}

		page_ext = lookup_page_ext(page);
		if (unlikely(!page_ext))
			continue;

		/*
		 * Some pages could be missed by concurrent allocation or free,
		 * because we don't hold the zone lock.
		 */
		if (!test_bit(PAGE_EXT_OWNER, &page_ext->flags))
			continue;

		/*
		 * Although we do have the info about past allocation of free
		 * pages, it's not relevant for current memory usage.
		 */
		if (!test_bit(PAGE_EXT_OWNER_ALLOCATED, &page_ext->flags))
			continue;

		page_owner = get_page_owner(page_ext);

		/*
		 * Don't print "tail" pages of high-order allocations as that
		 * would inflate the stats.
		 */
		if (!IS_ALIGNED(pfn, 1 << page_owner->order))
			continue;

		/*
		 * Access to page_owner->handle isn't synchronized, so be
		 * careful when reading it.
		 */
		handle = READ_ONCE(page_owner->handle);
		if (!handle)
			continue;

		/* Record the next PFN to read in the file offset */
		*ppos = (pfn - min_low_pfn) + 1;

		return print_page_owner(buf, count, pfn, page,
				page_owner, handle);
	}

	return 0;
}

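/*
 * The init_* helpers below run once from init_page_owner(): they walk every
 * populated zone and stamp pages that were allocated before page_owner was
 * up (early boot allocations) with the dummy early_handle stack, so those
 * pages still appear as allocated in the dump instead of being skipped for
 * lacking owner info.
 */
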
static void init_pages_in_zone(pg_data_t *pgdat, struct zone *zone)
{
	unsigned long pfn = zone->zone_start_pfn;
	unsigned long end_pfn = zone_end_pfn(zone);
	unsigned long count = 0;

	/*
	 * Walk the zone in pageblock_nr_pages steps. If a page block spans
	 * a zone boundary, it will be double counted between zones. This does
	 * not matter as the mixed block count will still be correct
	 */
	for (; pfn < end_pfn; ) {
		unsigned long block_end_pfn;

		if (!pfn_valid(pfn)) {
			pfn = ALIGN(pfn + 1, MAX_ORDER_NR_PAGES);
			continue;
		}

		block_end_pfn = ALIGN(pfn + 1, pageblock_nr_pages);
		block_end_pfn = min(block_end_pfn, end_pfn);

		for (; pfn < block_end_pfn; pfn++) {
			struct page *page = pfn_to_page(pfn);
			struct page_ext *page_ext;

			if (page_zone(page) != zone)
				continue;

			/*
			 * To avoid having to grab zone->lock, be a little
			 * careful when reading buddy page order. The only
			 * danger is that we skip too much and potentially miss
			 * some early allocated pages, which is better than
			 * heavy lock contention.
			 */
			if (PageBuddy(page)) {
				unsigned long order = buddy_order_unsafe(page);

				if (order > 0 && order < MAX_ORDER)
					pfn += (1UL << order) - 1;
				continue;
			}

			if (PageReserved(page))
				continue;

			page_ext = lookup_page_ext(page);
			if (unlikely(!page_ext))
				continue;

			/* Maybe overlapping zone */
			if (test_bit(PAGE_EXT_OWNER, &page_ext->flags))
				continue;

			/* Found early allocated page */
			__set_page_owner_handle(page_ext, early_handle,
						0, 0);
			count++;
		}
		cond_resched();
	}

	pr_info("Node %d, zone %8s: page owner found early allocated %lu pages\n",
		pgdat->node_id, zone->name, count);
}

static void init_zones_in_node(pg_data_t *pgdat)
{
	struct zone *zone;
	struct zone *node_zones = pgdat->node_zones;

	for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; ++zone) {
		if (!populated_zone(zone))
			continue;

		init_pages_in_zone(pgdat, zone);
	}
}

static void init_early_allocated_pages(void)
{
	pg_data_t *pgdat;

	for_each_online_pgdat(pgdat)
		init_zones_in_node(pgdat);
}

static const struct file_operations proc_page_owner_operations = {
	.read = read_page_owner,
};

static int __init pageowner_init(void)
{
	if (!static_branch_unlikely(&page_owner_inited)) {
		pr_info("page_owner is disabled\n");
		return 0;
	}

	debugfs_create_file("page_owner", 0400, NULL, NULL,
			    &proc_page_owner_operations);

	return 0;
}
late_initcall(pageowner_init)
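
/*
 * Usage note: with CONFIG_PAGE_OWNER=y and "page_owner=on" on the kernel
 * command line, the live allocation records can be dumped with
 *
 *	cat /sys/kernel/debug/page_owner > page_owner_full.txt
 *
 * (one record per read; see read_page_owner() above). The raw dump is
 * typically post-processed with the page_owner_sort helper shipped under
 * tools/ in the kernel tree, which groups identical stack traces; see the
 * page_owner documentation for details.
 */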