xref: /openbmc/linux/mm/page_owner.c (revision 28efb0046512e8a13ed9f9bdf0d68d10bbfbe9cf)
1 #include <linux/debugfs.h>
2 #include <linux/mm.h>
3 #include <linux/slab.h>
4 #include <linux/uaccess.h>
5 #include <linux/bootmem.h>
6 #include <linux/stacktrace.h>
7 #include <linux/page_owner.h>
8 #include <linux/jump_label.h>
9 #include <linux/migrate.h>
10 #include <linux/stackdepot.h>
11 #include <linux/seq_file.h>
12 
13 #include "internal.h"
14 
/*
 * Maximum number of stack entries captured per record; used as the size of
 * the on-stack entries[] arrays in this file.
 *
 * TODO: teach the PAGE_OWNER_STACK_DEPTH users (__dump_page_owner and
 * save_stack) to use off-stack temporary storage
 */
#define PAGE_OWNER_STACK_DEPTH (16)
20 
/*
 * Per-page record stored inside the page_ext area, located through
 * get_page_owner().
 */
struct page_owner {
	/* order passed in when the record was set; 0 after a split */
	unsigned int order;
	/* gfp mask passed in when the record was set */
	gfp_t gfp_mask;
	/* index into migrate_reason_names[], or -1 if none was recorded */
	int last_migrate_reason;
	/* stack depot handle of the stack saved for this record */
	depot_stack_handle_t handle;
};
27 
/* Cleared only when "page_owner=on" is parsed by early_page_owner_param(). */
static bool page_owner_disabled = true;
/* Switched on by init_page_owner() once the placeholder handles exist. */
DEFINE_STATIC_KEY_FALSE(page_owner_inited);

/* Returned by save_stack() when it detects a recursive allocation. */
static depot_stack_handle_t dummy_handle;
/* Substituted by save_stack() when depot_save_stack() returns 0. */
static depot_stack_handle_t failure_handle;
/* Assigned by init_pages_in_zone() to pages that have no owner record yet. */
static depot_stack_handle_t early_handle;

/* Defined near the end of this file; called from init_page_owner(). */
static void init_early_allocated_pages(void);
36 
37 static int early_page_owner_param(char *buf)
38 {
39 	if (!buf)
40 		return -EINVAL;
41 
42 	if (strcmp(buf, "on") == 0)
43 		page_owner_disabled = false;
44 
45 	return 0;
46 }
47 early_param("page_owner", early_page_owner_param);
48 
49 static bool need_page_owner(void)
50 {
51 	if (page_owner_disabled)
52 		return false;
53 
54 	return true;
55 }
56 
57 static __always_inline depot_stack_handle_t create_dummy_stack(void)
58 {
59 	unsigned long entries[4];
60 	struct stack_trace dummy;
61 
62 	dummy.nr_entries = 0;
63 	dummy.max_entries = ARRAY_SIZE(entries);
64 	dummy.entries = &entries[0];
65 	dummy.skip = 0;
66 
67 	save_stack_trace(&dummy);
68 	return depot_save_stack(&dummy, GFP_KERNEL);
69 }
70 
/*
 * Save a placeholder stack and remember its handle in dummy_handle, which
 * save_stack() returns when it detects a recursive allocation.
 */
static noinline void register_dummy_stack(void)
{
	dummy_handle = create_dummy_stack();
}
75 
/*
 * Save a placeholder stack and remember its handle in failure_handle, which
 * save_stack() substitutes when depot_save_stack() returns 0.
 */
static noinline void register_failure_stack(void)
{
	failure_handle = create_dummy_stack();
}
80 
/*
 * Save a placeholder stack and remember its handle in early_handle, which
 * init_pages_in_zone() assigns to pages that have no owner record yet.
 */
static noinline void register_early_stack(void)
{
	early_handle = create_dummy_stack();
}
85 
86 static void init_page_owner(void)
87 {
88 	if (page_owner_disabled)
89 		return;
90 
91 	register_dummy_stack();
92 	register_failure_stack();
93 	register_early_stack();
94 	static_branch_enable(&page_owner_inited);
95 	init_early_allocated_pages();
96 }
97 
/*
 * page_ext hooks for page owner tracking: how much per-page space is needed,
 * whether the feature is wanted at all, and how to initialise it.
 * The ->offset member read by get_page_owner() is not set here; presumably
 * it is filled in by the page_ext core - TODO confirm.
 */
struct page_ext_operations page_owner_ops = {
	.size = sizeof(struct page_owner),
	.need = need_page_owner,
	.init = init_page_owner,
};
103 
104 static inline struct page_owner *get_page_owner(struct page_ext *page_ext)
105 {
106 	return (void *)page_ext + page_owner_ops.offset;
107 }
108 
109 void __reset_page_owner(struct page *page, unsigned int order)
110 {
111 	int i;
112 	struct page_ext *page_ext;
113 
114 	for (i = 0; i < (1 << order); i++) {
115 		page_ext = lookup_page_ext(page + i);
116 		if (unlikely(!page_ext))
117 			continue;
118 		__clear_bit(PAGE_EXT_OWNER, &page_ext->flags);
119 	}
120 }
121 
122 static inline bool check_recursive_alloc(struct stack_trace *trace,
123 					unsigned long ip)
124 {
125 	int i, count;
126 
127 	if (!trace->nr_entries)
128 		return false;
129 
130 	for (i = 0, count = 0; i < trace->nr_entries; i++) {
131 		if (trace->entries[i] == ip && ++count == 2)
132 			return true;
133 	}
134 
135 	return false;
136 }
137 
138 static noinline depot_stack_handle_t save_stack(gfp_t flags)
139 {
140 	unsigned long entries[PAGE_OWNER_STACK_DEPTH];
141 	struct stack_trace trace = {
142 		.nr_entries = 0,
143 		.entries = entries,
144 		.max_entries = PAGE_OWNER_STACK_DEPTH,
145 		.skip = 2
146 	};
147 	depot_stack_handle_t handle;
148 
149 	save_stack_trace(&trace);
150 	if (trace.nr_entries != 0 &&
151 	    trace.entries[trace.nr_entries-1] == ULONG_MAX)
152 		trace.nr_entries--;
153 
154 	/*
155 	 * We need to check recursion here because our request to stackdepot
156 	 * could trigger memory allocation to save new entry. New memory
157 	 * allocation would reach here and call depot_save_stack() again
158 	 * if we don't catch it. There is still not enough memory in stackdepot
159 	 * so it would try to allocate memory again and loop forever.
160 	 */
161 	if (check_recursive_alloc(&trace, _RET_IP_))
162 		return dummy_handle;
163 
164 	handle = depot_save_stack(&trace, flags);
165 	if (!handle)
166 		handle = failure_handle;
167 
168 	return handle;
169 }
170 
171 static inline void __set_page_owner_handle(struct page_ext *page_ext,
172 	depot_stack_handle_t handle, unsigned int order, gfp_t gfp_mask)
173 {
174 	struct page_owner *page_owner;
175 
176 	page_owner = get_page_owner(page_ext);
177 	page_owner->handle = handle;
178 	page_owner->order = order;
179 	page_owner->gfp_mask = gfp_mask;
180 	page_owner->last_migrate_reason = -1;
181 
182 	__set_bit(PAGE_EXT_OWNER, &page_ext->flags);
183 }
184 
185 noinline void __set_page_owner(struct page *page, unsigned int order,
186 					gfp_t gfp_mask)
187 {
188 	struct page_ext *page_ext = lookup_page_ext(page);
189 	depot_stack_handle_t handle;
190 
191 	if (unlikely(!page_ext))
192 		return;
193 
194 	handle = save_stack(gfp_mask);
195 	__set_page_owner_handle(page_ext, handle, order, gfp_mask);
196 }
197 
198 void __set_page_owner_migrate_reason(struct page *page, int reason)
199 {
200 	struct page_ext *page_ext = lookup_page_ext(page);
201 	struct page_owner *page_owner;
202 
203 	if (unlikely(!page_ext))
204 		return;
205 
206 	page_owner = get_page_owner(page_ext);
207 	page_owner->last_migrate_reason = reason;
208 }
209 
210 void __split_page_owner(struct page *page, unsigned int order)
211 {
212 	int i;
213 	struct page_ext *page_ext = lookup_page_ext(page);
214 	struct page_owner *page_owner;
215 
216 	if (unlikely(!page_ext))
217 		return;
218 
219 	page_owner = get_page_owner(page_ext);
220 	page_owner->order = 0;
221 	for (i = 1; i < (1 << order); i++)
222 		__copy_page_owner(page, page + i);
223 }
224 
225 void __copy_page_owner(struct page *oldpage, struct page *newpage)
226 {
227 	struct page_ext *old_ext = lookup_page_ext(oldpage);
228 	struct page_ext *new_ext = lookup_page_ext(newpage);
229 	struct page_owner *old_page_owner, *new_page_owner;
230 
231 	if (unlikely(!old_ext || !new_ext))
232 		return;
233 
234 	old_page_owner = get_page_owner(old_ext);
235 	new_page_owner = get_page_owner(new_ext);
236 	new_page_owner->order = old_page_owner->order;
237 	new_page_owner->gfp_mask = old_page_owner->gfp_mask;
238 	new_page_owner->last_migrate_reason =
239 		old_page_owner->last_migrate_reason;
240 	new_page_owner->handle = old_page_owner->handle;
241 
242 	/*
243 	 * We don't clear the bit on the oldpage as it's going to be freed
244 	 * after migration. Until then, the info can be useful in case of
245 	 * a bug, and the overal stats will be off a bit only temporarily.
246 	 * Also, migrate_misplaced_transhuge_page() can still fail the
247 	 * migration and then we want the oldpage to retain the info. But
248 	 * in that case we also don't need to explicitly clear the info from
249 	 * the new page, which will be freed.
250 	 */
251 	__set_bit(PAGE_EXT_OWNER, &new_ext->flags);
252 }
253 
254 void pagetypeinfo_showmixedcount_print(struct seq_file *m,
255 				       pg_data_t *pgdat, struct zone *zone)
256 {
257 	struct page *page;
258 	struct page_ext *page_ext;
259 	struct page_owner *page_owner;
260 	unsigned long pfn = zone->zone_start_pfn, block_end_pfn;
261 	unsigned long end_pfn = pfn + zone->spanned_pages;
262 	unsigned long count[MIGRATE_TYPES] = { 0, };
263 	int pageblock_mt, page_mt;
264 	int i;
265 
266 	/* Scan block by block. First and last block may be incomplete */
267 	pfn = zone->zone_start_pfn;
268 
269 	/*
270 	 * Walk the zone in pageblock_nr_pages steps. If a page block spans
271 	 * a zone boundary, it will be double counted between zones. This does
272 	 * not matter as the mixed block count will still be correct
273 	 */
274 	for (; pfn < end_pfn; ) {
275 		if (!pfn_valid(pfn)) {
276 			pfn = ALIGN(pfn + 1, MAX_ORDER_NR_PAGES);
277 			continue;
278 		}
279 
280 		block_end_pfn = ALIGN(pfn + 1, pageblock_nr_pages);
281 		block_end_pfn = min(block_end_pfn, end_pfn);
282 
283 		page = pfn_to_page(pfn);
284 		pageblock_mt = get_pageblock_migratetype(page);
285 
286 		for (; pfn < block_end_pfn; pfn++) {
287 			if (!pfn_valid_within(pfn))
288 				continue;
289 
290 			page = pfn_to_page(pfn);
291 
292 			if (page_zone(page) != zone)
293 				continue;
294 
295 			if (PageBuddy(page)) {
296 				unsigned long freepage_order;
297 
298 				freepage_order = page_order_unsafe(page);
299 				if (freepage_order < MAX_ORDER)
300 					pfn += (1UL << freepage_order) - 1;
301 				continue;
302 			}
303 
304 			if (PageReserved(page))
305 				continue;
306 
307 			page_ext = lookup_page_ext(page);
308 			if (unlikely(!page_ext))
309 				continue;
310 
311 			if (!test_bit(PAGE_EXT_OWNER, &page_ext->flags))
312 				continue;
313 
314 			page_owner = get_page_owner(page_ext);
315 			page_mt = gfpflags_to_migratetype(
316 					page_owner->gfp_mask);
317 			if (pageblock_mt != page_mt) {
318 				if (is_migrate_cma(pageblock_mt))
319 					count[MIGRATE_MOVABLE]++;
320 				else
321 					count[pageblock_mt]++;
322 
323 				pfn = block_end_pfn;
324 				break;
325 			}
326 			pfn += (1UL << page_owner->order) - 1;
327 		}
328 	}
329 
330 	/* Print counts */
331 	seq_printf(m, "Node %d, zone %8s ", pgdat->node_id, zone->name);
332 	for (i = 0; i < MIGRATE_TYPES; i++)
333 		seq_printf(m, "%12lu ", count[i]);
334 	seq_putc(m, '\n');
335 }
336 
337 static ssize_t
338 print_page_owner(char __user *buf, size_t count, unsigned long pfn,
339 		struct page *page, struct page_owner *page_owner,
340 		depot_stack_handle_t handle)
341 {
342 	int ret;
343 	int pageblock_mt, page_mt;
344 	char *kbuf;
345 	unsigned long entries[PAGE_OWNER_STACK_DEPTH];
346 	struct stack_trace trace = {
347 		.nr_entries = 0,
348 		.entries = entries,
349 		.max_entries = PAGE_OWNER_STACK_DEPTH,
350 		.skip = 0
351 	};
352 
353 	kbuf = kmalloc(count, GFP_KERNEL);
354 	if (!kbuf)
355 		return -ENOMEM;
356 
357 	ret = snprintf(kbuf, count,
358 			"Page allocated via order %u, mask %#x(%pGg)\n",
359 			page_owner->order, page_owner->gfp_mask,
360 			&page_owner->gfp_mask);
361 
362 	if (ret >= count)
363 		goto err;
364 
365 	/* Print information relevant to grouping pages by mobility */
366 	pageblock_mt = get_pageblock_migratetype(page);
367 	page_mt  = gfpflags_to_migratetype(page_owner->gfp_mask);
368 	ret += snprintf(kbuf + ret, count - ret,
369 			"PFN %lu type %s Block %lu type %s Flags %#lx(%pGp)\n",
370 			pfn,
371 			migratetype_names[page_mt],
372 			pfn >> pageblock_order,
373 			migratetype_names[pageblock_mt],
374 			page->flags, &page->flags);
375 
376 	if (ret >= count)
377 		goto err;
378 
379 	depot_fetch_stack(handle, &trace);
380 	ret += snprint_stack_trace(kbuf + ret, count - ret, &trace, 0);
381 	if (ret >= count)
382 		goto err;
383 
384 	if (page_owner->last_migrate_reason != -1) {
385 		ret += snprintf(kbuf + ret, count - ret,
386 			"Page has been migrated, last migrate reason: %s\n",
387 			migrate_reason_names[page_owner->last_migrate_reason]);
388 		if (ret >= count)
389 			goto err;
390 	}
391 
392 	ret += snprintf(kbuf + ret, count - ret, "\n");
393 	if (ret >= count)
394 		goto err;
395 
396 	if (copy_to_user(buf, kbuf, ret))
397 		ret = -EFAULT;
398 
399 	kfree(kbuf);
400 	return ret;
401 
402 err:
403 	kfree(kbuf);
404 	return -ENOMEM;
405 }
406 
407 void __dump_page_owner(struct page *page)
408 {
409 	struct page_ext *page_ext = lookup_page_ext(page);
410 	struct page_owner *page_owner;
411 	unsigned long entries[PAGE_OWNER_STACK_DEPTH];
412 	struct stack_trace trace = {
413 		.nr_entries = 0,
414 		.entries = entries,
415 		.max_entries = PAGE_OWNER_STACK_DEPTH,
416 		.skip = 0
417 	};
418 	depot_stack_handle_t handle;
419 	gfp_t gfp_mask;
420 	int mt;
421 
422 	if (unlikely(!page_ext)) {
423 		pr_alert("There is not page extension available.\n");
424 		return;
425 	}
426 
427 	page_owner = get_page_owner(page_ext);
428 	gfp_mask = page_owner->gfp_mask;
429 	mt = gfpflags_to_migratetype(gfp_mask);
430 
431 	if (!test_bit(PAGE_EXT_OWNER, &page_ext->flags)) {
432 		pr_alert("page_owner info is not active (free page?)\n");
433 		return;
434 	}
435 
436 	handle = READ_ONCE(page_owner->handle);
437 	if (!handle) {
438 		pr_alert("page_owner info is not active (free page?)\n");
439 		return;
440 	}
441 
442 	depot_fetch_stack(handle, &trace);
443 	pr_alert("page allocated via order %u, migratetype %s, gfp_mask %#x(%pGg)\n",
444 		 page_owner->order, migratetype_names[mt], gfp_mask, &gfp_mask);
445 	print_stack_trace(&trace, 0);
446 
447 	if (page_owner->last_migrate_reason != -1)
448 		pr_alert("page has been migrated, last migrate reason: %s\n",
449 			migrate_reason_names[page_owner->last_migrate_reason]);
450 }
451 
452 static ssize_t
453 read_page_owner(struct file *file, char __user *buf, size_t count, loff_t *ppos)
454 {
455 	unsigned long pfn;
456 	struct page *page;
457 	struct page_ext *page_ext;
458 	struct page_owner *page_owner;
459 	depot_stack_handle_t handle;
460 
461 	if (!static_branch_unlikely(&page_owner_inited))
462 		return -EINVAL;
463 
464 	page = NULL;
465 	pfn = min_low_pfn + *ppos;
466 
467 	/* Find a valid PFN or the start of a MAX_ORDER_NR_PAGES area */
468 	while (!pfn_valid(pfn) && (pfn & (MAX_ORDER_NR_PAGES - 1)) != 0)
469 		pfn++;
470 
471 	drain_all_pages(NULL);
472 
473 	/* Find an allocated page */
474 	for (; pfn < max_pfn; pfn++) {
475 		/*
476 		 * If the new page is in a new MAX_ORDER_NR_PAGES area,
477 		 * validate the area as existing, skip it if not
478 		 */
479 		if ((pfn & (MAX_ORDER_NR_PAGES - 1)) == 0 && !pfn_valid(pfn)) {
480 			pfn += MAX_ORDER_NR_PAGES - 1;
481 			continue;
482 		}
483 
484 		/* Check for holes within a MAX_ORDER area */
485 		if (!pfn_valid_within(pfn))
486 			continue;
487 
488 		page = pfn_to_page(pfn);
489 		if (PageBuddy(page)) {
490 			unsigned long freepage_order = page_order_unsafe(page);
491 
492 			if (freepage_order < MAX_ORDER)
493 				pfn += (1UL << freepage_order) - 1;
494 			continue;
495 		}
496 
497 		page_ext = lookup_page_ext(page);
498 		if (unlikely(!page_ext))
499 			continue;
500 
501 		/*
502 		 * Some pages could be missed by concurrent allocation or free,
503 		 * because we don't hold the zone lock.
504 		 */
505 		if (!test_bit(PAGE_EXT_OWNER, &page_ext->flags))
506 			continue;
507 
508 		page_owner = get_page_owner(page_ext);
509 
510 		/*
511 		 * Access to page_ext->handle isn't synchronous so we should
512 		 * be careful to access it.
513 		 */
514 		handle = READ_ONCE(page_owner->handle);
515 		if (!handle)
516 			continue;
517 
518 		/* Record the next PFN to read in the file offset */
519 		*ppos = (pfn - min_low_pfn) + 1;
520 
521 		return print_page_owner(buf, count, pfn, page,
522 				page_owner, handle);
523 	}
524 
525 	return 0;
526 }
527 
528 static void init_pages_in_zone(pg_data_t *pgdat, struct zone *zone)
529 {
530 	struct page *page;
531 	struct page_ext *page_ext;
532 	unsigned long pfn = zone->zone_start_pfn, block_end_pfn;
533 	unsigned long end_pfn = pfn + zone->spanned_pages;
534 	unsigned long count = 0;
535 
536 	/* Scan block by block. First and last block may be incomplete */
537 	pfn = zone->zone_start_pfn;
538 
539 	/*
540 	 * Walk the zone in pageblock_nr_pages steps. If a page block spans
541 	 * a zone boundary, it will be double counted between zones. This does
542 	 * not matter as the mixed block count will still be correct
543 	 */
544 	for (; pfn < end_pfn; ) {
545 		if (!pfn_valid(pfn)) {
546 			pfn = ALIGN(pfn + 1, MAX_ORDER_NR_PAGES);
547 			continue;
548 		}
549 
550 		block_end_pfn = ALIGN(pfn + 1, pageblock_nr_pages);
551 		block_end_pfn = min(block_end_pfn, end_pfn);
552 
553 		page = pfn_to_page(pfn);
554 
555 		for (; pfn < block_end_pfn; pfn++) {
556 			if (!pfn_valid_within(pfn))
557 				continue;
558 
559 			page = pfn_to_page(pfn);
560 
561 			if (page_zone(page) != zone)
562 				continue;
563 
564 			/*
565 			 * To avoid having to grab zone->lock, be a little
566 			 * careful when reading buddy page order. The only
567 			 * danger is that we skip too much and potentially miss
568 			 * some early allocated pages, which is better than
569 			 * heavy lock contention.
570 			 */
571 			if (PageBuddy(page)) {
572 				unsigned long order = page_order_unsafe(page);
573 
574 				if (order > 0 && order < MAX_ORDER)
575 					pfn += (1UL << order) - 1;
576 				continue;
577 			}
578 
579 			if (PageReserved(page))
580 				continue;
581 
582 			page_ext = lookup_page_ext(page);
583 			if (unlikely(!page_ext))
584 				continue;
585 
586 			/* Maybe overlapping zone */
587 			if (test_bit(PAGE_EXT_OWNER, &page_ext->flags))
588 				continue;
589 
590 			/* Found early allocated page */
591 			__set_page_owner_handle(page_ext, early_handle, 0, 0);
592 			count++;
593 		}
594 		cond_resched();
595 	}
596 
597 	pr_info("Node %d, zone %8s: page owner found early allocated %lu pages\n",
598 		pgdat->node_id, zone->name, count);
599 }
600 
601 static void init_zones_in_node(pg_data_t *pgdat)
602 {
603 	struct zone *zone;
604 	struct zone *node_zones = pgdat->node_zones;
605 
606 	for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; ++zone) {
607 		if (!populated_zone(zone))
608 			continue;
609 
610 		init_pages_in_zone(pgdat, zone);
611 	}
612 }
613 
614 static void init_early_allocated_pages(void)
615 {
616 	pg_data_t *pgdat;
617 
618 	drain_all_pages(NULL);
619 	for_each_online_pgdat(pgdat)
620 		init_zones_in_node(pgdat);
621 }
622 
/*
 * File operations of the "page_owner" debugfs file created in
 * pageowner_init(); only reading is implemented.
 */
static const struct file_operations proc_page_owner_operations = {
	.read		= read_page_owner,
};
626 
627 static int __init pageowner_init(void)
628 {
629 	struct dentry *dentry;
630 
631 	if (!static_branch_unlikely(&page_owner_inited)) {
632 		pr_info("page_owner is disabled\n");
633 		return 0;
634 	}
635 
636 	dentry = debugfs_create_file("page_owner", S_IRUSR, NULL,
637 			NULL, &proc_page_owner_operations);
638 	if (IS_ERR(dentry))
639 		return PTR_ERR(dentry);
640 
641 	return 0;
642 }
643 late_initcall(pageowner_init)
644