/* xref: /openbmc/linux/tools/perf/builtin-kmem.c (revision 99b7e93c) */
#include "builtin.h"
#include "perf.h"

#include "util/evlist.h"
#include "util/evsel.h"
#include "util/util.h"
#include "util/cache.h"
#include "util/symbol.h"
#include "util/thread.h"
#include "util/header.h"
#include "util/session.h"
#include "util/tool.h"

#include "util/parse-options.h"
#include "util/trace-event.h"
#include "util/data.h"
#include "util/cpumap.h"

#include "util/debug.h"

#include <linux/rbtree.h>
#include <linux/string.h>
#include <locale.h>

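/*
 * Selected analysis modes.  parse_slab_opt()/parse_page_opt() set each
 * flag to one more than the other, so a non-zero value also records
 * which option appeared last on the command line (the same trick is
 * used for caller_flag/alloc_flag below).
 */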
static int	kmem_slab;
static int	kmem_page;

static long	kmem_page_size;

struct alloc_stat;
typedef int (*sort_fn_t)(struct alloc_stat *, struct alloc_stat *);

static int			alloc_flag;
static int			caller_flag;

static int			alloc_lines = -1;
static int			caller_lines = -1;

static bool			raw_ip;

struct alloc_stat {
	u64	call_site;
	u64	ptr;
	u64	bytes_req;
	u64	bytes_alloc;
	u32	hit;
	u32	pingpong;

	short	alloc_cpu;

	struct rb_node node;
};

static struct rb_root root_alloc_stat;
static struct rb_root root_alloc_sorted;
static struct rb_root root_caller_stat;
static struct rb_root root_caller_sorted;

static unsigned long total_requested, total_allocated;
static unsigned long nr_allocs, nr_cross_allocs;

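/*
 * Account one slab allocation in the rb-tree keyed by pointer.  Repeated
 * allocations returning the same pointer (alloc/free/alloc cycles)
 * accumulate into the existing node; otherwise a new node is linked at
 * the position found during the descent.  The call site and CPU are
 * always updated to those of the most recent allocation.
 */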
static int insert_alloc_stat(unsigned long call_site, unsigned long ptr,
			     int bytes_req, int bytes_alloc, int cpu)
{
	struct rb_node **node = &root_alloc_stat.rb_node;
	struct rb_node *parent = NULL;
	struct alloc_stat *data = NULL;

	while (*node) {
		parent = *node;
		data = rb_entry(*node, struct alloc_stat, node);

		if (ptr > data->ptr)
			node = &(*node)->rb_right;
		else if (ptr < data->ptr)
			node = &(*node)->rb_left;
		else
			break;
	}

	if (data && data->ptr == ptr) {
		data->hit++;
		data->bytes_req += bytes_req;
		data->bytes_alloc += bytes_alloc;
	} else {
		data = malloc(sizeof(*data));
		if (!data) {
			pr_err("%s: malloc failed\n", __func__);
			return -1;
		}
		data->ptr = ptr;
		data->pingpong = 0;
		data->hit = 1;
		data->bytes_req = bytes_req;
		data->bytes_alloc = bytes_alloc;

		rb_link_node(&data->node, parent, node);
		rb_insert_color(&data->node, &root_alloc_stat);
	}
	data->call_site = call_site;
	data->alloc_cpu = cpu;
	return 0;
}

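/*
 * Same accounting as insert_alloc_stat(), but aggregated per call site
 * instead of per returned pointer.
 */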
static int insert_caller_stat(unsigned long call_site,
			      int bytes_req, int bytes_alloc)
{
	struct rb_node **node = &root_caller_stat.rb_node;
	struct rb_node *parent = NULL;
	struct alloc_stat *data = NULL;

	while (*node) {
		parent = *node;
		data = rb_entry(*node, struct alloc_stat, node);

		if (call_site > data->call_site)
			node = &(*node)->rb_right;
		else if (call_site < data->call_site)
			node = &(*node)->rb_left;
		else
			break;
	}

	if (data && data->call_site == call_site) {
		data->hit++;
		data->bytes_req += bytes_req;
		data->bytes_alloc += bytes_alloc;
	} else {
		data = malloc(sizeof(*data));
		if (!data) {
			pr_err("%s: malloc failed\n", __func__);
			return -1;
		}
		data->call_site = call_site;
		data->pingpong = 0;
		data->hit = 1;
		data->bytes_req = bytes_req;
		data->bytes_alloc = bytes_alloc;

		rb_link_node(&data->node, parent, node);
		rb_insert_color(&data->node, &root_caller_stat);
	}

	return 0;
}

static int perf_evsel__process_alloc_event(struct perf_evsel *evsel,
					   struct perf_sample *sample)
{
	unsigned long ptr = perf_evsel__intval(evsel, sample, "ptr"),
		      call_site = perf_evsel__intval(evsel, sample, "call_site");
	int bytes_req = perf_evsel__intval(evsel, sample, "bytes_req"),
	    bytes_alloc = perf_evsel__intval(evsel, sample, "bytes_alloc");

	if (insert_alloc_stat(call_site, ptr, bytes_req, bytes_alloc, sample->cpu) ||
	    insert_caller_stat(call_site, bytes_req, bytes_alloc))
		return -1;

	total_requested += bytes_req;
	total_allocated += bytes_alloc;

	nr_allocs++;
	return 0;
}

static int perf_evsel__process_alloc_node_event(struct perf_evsel *evsel,
						struct perf_sample *sample)
{
	int ret = perf_evsel__process_alloc_event(evsel, sample);

	if (!ret) {
		int node1 = cpu__get_node(sample->cpu),
		    node2 = perf_evsel__intval(evsel, sample, "node");

		if (node1 != node2)
			nr_cross_allocs++;
	}

	return ret;
}

static int ptr_cmp(struct alloc_stat *, struct alloc_stat *);
static int callsite_cmp(struct alloc_stat *, struct alloc_stat *);

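/*
 * Generic lookup in either stat tree: the caller passes a comparator
 * (ptr_cmp or callsite_cmp) matching the key the tree is sorted on;
 * the unused field of the on-stack key is simply ignored.
 */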
static struct alloc_stat *search_alloc_stat(unsigned long ptr,
					    unsigned long call_site,
					    struct rb_root *root,
					    sort_fn_t sort_fn)
{
	struct rb_node *node = root->rb_node;
	struct alloc_stat key = { .ptr = ptr, .call_site = call_site };

	while (node) {
		struct alloc_stat *data;
		int cmp;

		data = rb_entry(node, struct alloc_stat, node);

		cmp = sort_fn(&key, data);
		if (cmp < 0)
			node = node->rb_left;
		else if (cmp > 0)
			node = node->rb_right;
		else
			return data;
	}
	return NULL;
}

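/*
 * A free on a different CPU than the allocation is counted as a
 * "ping-pong", both for the pointer and for its original call site.
 * Freeing an untracked pointer is ignored, since the allocation may
 * predate the trace.
 */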
static int perf_evsel__process_free_event(struct perf_evsel *evsel,
					  struct perf_sample *sample)
{
	unsigned long ptr = perf_evsel__intval(evsel, sample, "ptr");
	struct alloc_stat *s_alloc, *s_caller;

	s_alloc = search_alloc_stat(ptr, 0, &root_alloc_stat, ptr_cmp);
	if (!s_alloc)
		return 0;

	if ((short)sample->cpu != s_alloc->alloc_cpu) {
		s_alloc->pingpong++;

		s_caller = search_alloc_stat(0, s_alloc->call_site,
					     &root_caller_stat, callsite_cmp);
		if (!s_caller)
			return -1;
		s_caller->pingpong++;
	}
	s_alloc->alloc_cpu = -1;

	return 0;
}

static u64 total_page_alloc_bytes;
static u64 total_page_free_bytes;
static u64 total_page_nomatch_bytes;
static u64 total_page_fail_bytes;
static unsigned long nr_page_allocs;
static unsigned long nr_page_frees;
static unsigned long nr_page_fails;
static unsigned long nr_page_nomatch;

static bool use_pfn;

#define MAX_MIGRATE_TYPES  6
#define MAX_PAGE_ORDER     11

static int order_stats[MAX_PAGE_ORDER][MAX_MIGRATE_TYPES];

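/*
 * Per-page statistics for the page allocator.  Two trees are kept:
 * page_tree tracks currently-allocated pages (keyed by page or pfn) so
 * a later free event can recover the gfp flags and migrate type, while
 * page_alloc_tree aggregates alloc/free counts per unique
 * (page, order, gfp_flags, migrate_type) tuple.
 */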
struct page_stat {
	struct rb_node	node;
	u64		page;
	int		order;
	unsigned	gfp_flags;
	unsigned	migrate_type;
	u64		alloc_bytes;
	u64		free_bytes;
	int		nr_alloc;
	int		nr_free;
};

static struct rb_root page_tree;
static struct rb_root page_alloc_tree;
static struct rb_root page_alloc_sorted;

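/*
 * Find the live-page node for 'page'; optionally create and link a new
 * zero-initialized node at the insertion point found during the walk.
 */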
static struct page_stat *search_page(unsigned long page, bool create)
{
	struct rb_node **node = &page_tree.rb_node;
	struct rb_node *parent = NULL;
	struct page_stat *data;

	while (*node) {
		s64 cmp;

		parent = *node;
		data = rb_entry(*node, struct page_stat, node);

		cmp = data->page - page;
		if (cmp < 0)
			node = &parent->rb_left;
		else if (cmp > 0)
			node = &parent->rb_right;
		else
			return data;
	}

	if (!create)
		return NULL;

	data = zalloc(sizeof(*data));
	if (data != NULL) {
		data->page = page;

		rb_link_node(&data->node, parent, node);
		rb_insert_color(&data->node, &page_tree);
	}

	return data;
}

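/*
 * Composite key for page_alloc_tree.  Note the inverted sign convention
 * (-1 when a > b); this is harmless as long as insertion and lookup use
 * the same comparator, which they do via search_page_alloc_stat().
 */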
static int page_stat_cmp(struct page_stat *a, struct page_stat *b)
{
	if (a->page > b->page)
		return -1;
	if (a->page < b->page)
		return 1;
	if (a->order > b->order)
		return -1;
	if (a->order < b->order)
		return 1;
	if (a->migrate_type > b->migrate_type)
		return -1;
	if (a->migrate_type < b->migrate_type)
		return 1;
	if (a->gfp_flags > b->gfp_flags)
		return -1;
	if (a->gfp_flags < b->gfp_flags)
		return 1;
	return 0;
}

static struct page_stat *search_page_alloc_stat(struct page_stat *pstat, bool create)
{
	struct rb_node **node = &page_alloc_tree.rb_node;
	struct rb_node *parent = NULL;
	struct page_stat *data;

	while (*node) {
		s64 cmp;

		parent = *node;
		data = rb_entry(*node, struct page_stat, node);

		cmp = page_stat_cmp(data, pstat);
		if (cmp < 0)
			node = &parent->rb_left;
		else if (cmp > 0)
			node = &parent->rb_right;
		else
			return data;
	}

	if (!create)
		return NULL;

	data = zalloc(sizeof(*data));
	if (data != NULL) {
		data->page = pstat->page;
		data->order = pstat->order;
		data->gfp_flags = pstat->gfp_flags;
		data->migrate_type = pstat->migrate_type;

		rb_link_node(&data->node, parent, node);
		rb_insert_color(&data->node, &page_alloc_tree);
	}

	return data;
}

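/*
 * A failed allocation shows up as a pfn of (u64)-1, or as a NULL page
 * pointer when the tracepoint only exposes the struct page.
 */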
static bool valid_page(u64 pfn_or_page)
{
	if (use_pfn && pfn_or_page == -1UL)
		return false;
	if (!use_pfn && pfn_or_page == 0)
		return false;
	return true;
}

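/*
 * Handle kmem:mm_page_alloc.  'bytes' is the full size of the
 * allocation: page size shifted by the order, e.g. order 3 on 4KB
 * pages accounts 32KB.  Failed allocations are tallied in the summary
 * counters but not added to the trees.
 */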
static int perf_evsel__process_page_alloc_event(struct perf_evsel *evsel,
						struct perf_sample *sample)
{
	u64 page;
	unsigned int order = perf_evsel__intval(evsel, sample, "order");
	unsigned int gfp_flags = perf_evsel__intval(evsel, sample, "gfp_flags");
	unsigned int migrate_type = perf_evsel__intval(evsel, sample,
						       "migratetype");
	u64 bytes = kmem_page_size << order;
	struct page_stat *pstat;
	struct page_stat this = {
		.order = order,
		.gfp_flags = gfp_flags,
		.migrate_type = migrate_type,
	};

	if (use_pfn)
		page = perf_evsel__intval(evsel, sample, "pfn");
	else
		page = perf_evsel__intval(evsel, sample, "page");

	nr_page_allocs++;
	total_page_alloc_bytes += bytes;

	if (!valid_page(page)) {
		nr_page_fails++;
		total_page_fail_bytes += bytes;

		return 0;
	}

	/*
	 * Remember this page's gfp flags and migrate type so that the
	 * matching free event can look them up later.
	 */
	pstat = search_page(page, true);
	if (pstat == NULL)
		return -ENOMEM;

	pstat->order = order;
	pstat->gfp_flags = gfp_flags;
	pstat->migrate_type = migrate_type;

	this.page = page;
	pstat = search_page_alloc_stat(&this, true);
	if (pstat == NULL)
		return -ENOMEM;

	pstat->nr_alloc++;
	pstat->alloc_bytes += bytes;

	order_stats[order][migrate_type]++;

	return 0;
}

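/*
 * Handle kmem:mm_page_free.  Frees without a matching alloc (typically
 * pages allocated before tracing started) are counted as "nomatch" and
 * otherwise ignored; matched pages are dropped from page_tree and their
 * free stats added to the aggregate entry.
 */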
static int perf_evsel__process_page_free_event(struct perf_evsel *evsel,
						struct perf_sample *sample)
{
	u64 page;
	unsigned int order = perf_evsel__intval(evsel, sample, "order");
	u64 bytes = kmem_page_size << order;
	struct page_stat *pstat;
	struct page_stat this = {
		.order = order,
	};

	if (use_pfn)
		page = perf_evsel__intval(evsel, sample, "pfn");
	else
		page = perf_evsel__intval(evsel, sample, "page");

	nr_page_frees++;
	total_page_free_bytes += bytes;

	pstat = search_page(page, false);
	if (pstat == NULL) {
		pr_debug2("free without matching alloc: page %"PRIx64" (order: %d)\n",
			  page, order);

		nr_page_nomatch++;
		total_page_nomatch_bytes += bytes;

		return 0;
	}

	this.page = page;
	this.gfp_flags = pstat->gfp_flags;
	this.migrate_type = pstat->migrate_type;

	rb_erase(&pstat->node, &page_tree);
	free(pstat);

	pstat = search_page_alloc_stat(&this, false);
	if (pstat == NULL)
		return -ENOENT;

	pstat->nr_free++;
	pstat->free_bytes += bytes;

	return 0;
}

typedef int (*tracepoint_handler)(struct perf_evsel *evsel,
				  struct perf_sample *sample);

static int process_sample_event(struct perf_tool *tool __maybe_unused,
				union perf_event *event,
				struct perf_sample *sample,
				struct perf_evsel *evsel,
				struct machine *machine)
{
	struct thread *thread = machine__findnew_thread(machine, sample->pid,
							sample->tid);

	if (thread == NULL) {
		pr_debug("problem processing %d event, skipping it.\n",
			 event->header.type);
		return -1;
	}

	dump_printf(" ... thread: %s:%d\n", thread__comm_str(thread), thread->tid);

	if (evsel->handler != NULL) {
		tracepoint_handler f = evsel->handler;
		return f(evsel, sample);
	}

	return 0;
}

static struct perf_tool perf_kmem = {
	.sample		 = process_sample_event,
	.comm		 = perf_event__process_comm,
	.mmap		 = perf_event__process_mmap,
	.mmap2		 = perf_event__process_mmap2,
	.ordered_events	 = true,
};

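/*
 * Internal fragmentation as a percentage of the allocated size:
 *
 *	frag = 100 * (n_alloc - n_req) / n_alloc
 *
 * e.g. 100 bytes requested but 128 allocated gives 21.875%.
 */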
static double fragmentation(unsigned long n_req, unsigned long n_alloc)
{
	if (n_alloc == 0)
		return 0.0;
	else
		return 100.0 - (100.0 * n_req / n_alloc);
}

static void __print_slab_result(struct rb_root *root,
				struct perf_session *session,
				int n_lines, int is_caller)
{
	struct rb_node *next;
	struct machine *machine = &session->machines.host;

	printf("%.105s\n", graph_dotted_line);
	printf(" %-34s |",  is_caller ? "Callsite": "Alloc Ptr");
	printf(" Total_alloc/Per | Total_req/Per   | Hit      | Ping-pong | Frag\n");
	printf("%.105s\n", graph_dotted_line);

	next = rb_first(root);

	while (next && n_lines--) {
		struct alloc_stat *data = rb_entry(next, struct alloc_stat,
						   node);
		struct symbol *sym = NULL;
		struct map *map;
		char buf[BUFSIZ];
		u64 addr;

		if (is_caller) {
			addr = data->call_site;
			if (!raw_ip)
				sym = machine__find_kernel_function(machine, addr, &map, NULL);
		} else
			addr = data->ptr;

		if (sym != NULL)
			snprintf(buf, sizeof(buf), "%s+%" PRIx64 "", sym->name,
				 addr - map->unmap_ip(map, sym->start));
		else
			snprintf(buf, sizeof(buf), "%#" PRIx64 "", addr);
		printf(" %-34s |", buf);

		printf(" %9llu/%-5lu | %9llu/%-5lu | %8lu | %9lu | %6.3f%%\n",
		       (unsigned long long)data->bytes_alloc,
		       (unsigned long)data->bytes_alloc / data->hit,
		       (unsigned long long)data->bytes_req,
		       (unsigned long)data->bytes_req / data->hit,
		       (unsigned long)data->hit,
		       (unsigned long)data->pingpong,
		       fragmentation(data->bytes_req, data->bytes_alloc));

		next = rb_next(next);
	}

	if (n_lines == -1)
		printf(" ...                                | ...             | ...             | ...      | ...       | ...   \n");

	printf("%.105s\n", graph_dotted_line);
}

static const char * const migrate_type_str[] = {
	"UNMOVABL",
	"RECLAIM",
	"MOVABLE",
	"RESERVED",
	"CMA/ISLT",
	"UNKNOWN",
};

static void __print_page_result(struct rb_root *root,
				struct perf_session *session __maybe_unused,
				int n_lines)
{
	struct rb_node *next = rb_first(root);
	const char *format;

	printf("\n%.80s\n", graph_dotted_line);
	printf(" %-16s | Total alloc (KB) | Hits      | Order | Mig.type | GFP flags\n",
	       use_pfn ? "PFN" : "Page");
	printf("%.80s\n", graph_dotted_line);

	if (use_pfn)
		format = " %16llu | %'16llu | %'9d | %5d | %8s |  %08lx\n";
	else
		format = " %016llx | %'16llu | %'9d | %5d | %8s |  %08lx\n";

	while (next && n_lines--) {
		struct page_stat *data;

		data = rb_entry(next, struct page_stat, node);

		printf(format, (unsigned long long)data->page,
		       (unsigned long long)data->alloc_bytes / 1024,
		       data->nr_alloc, data->order,
		       migrate_type_str[data->migrate_type],
		       (unsigned long)data->gfp_flags);

		next = rb_next(next);
	}

	if (n_lines == -1)
		printf(" ...              | ...              | ...       | ...   | ...      | ...     \n");

	printf("%.80s\n", graph_dotted_line);
}

static void print_slab_summary(void)
{
	printf("\nSUMMARY (SLAB allocator)");
	printf("\n========================\n");
	printf("Total bytes requested: %'lu\n", total_requested);
	printf("Total bytes allocated: %'lu\n", total_allocated);
	printf("Total bytes wasted on internal fragmentation: %'lu\n",
	       total_allocated - total_requested);
	printf("Internal fragmentation: %f%%\n",
	       fragmentation(total_requested, total_allocated));
	printf("Cross CPU allocations: %'lu/%'lu\n", nr_cross_allocs, nr_allocs);
}

static void print_page_summary(void)
{
	int o, m;
	u64 nr_alloc_freed = nr_page_frees - nr_page_nomatch;
	u64 total_alloc_freed_bytes = total_page_free_bytes - total_page_nomatch_bytes;

	printf("\nSUMMARY (page allocator)");
	printf("\n========================\n");
	printf("%-30s: %'16lu   [ %'16"PRIu64" KB ]\n", "Total allocation requests",
	       nr_page_allocs, total_page_alloc_bytes / 1024);
	printf("%-30s: %'16lu   [ %'16"PRIu64" KB ]\n", "Total free requests",
	       nr_page_frees, total_page_free_bytes / 1024);
	printf("\n");

	printf("%-30s: %'16"PRIu64"   [ %'16"PRIu64" KB ]\n", "Total alloc+freed requests",
	       nr_alloc_freed, (total_alloc_freed_bytes) / 1024);
	printf("%-30s: %'16"PRIu64"   [ %'16"PRIu64" KB ]\n", "Total alloc-only requests",
	       nr_page_allocs - nr_alloc_freed,
	       (total_page_alloc_bytes - total_alloc_freed_bytes) / 1024);
	printf("%-30s: %'16lu   [ %'16"PRIu64" KB ]\n", "Total free-only requests",
	       nr_page_nomatch, total_page_nomatch_bytes / 1024);
	printf("\n");

	printf("%-30s: %'16lu   [ %'16"PRIu64" KB ]\n", "Total allocation failures",
	       nr_page_fails, total_page_fail_bytes / 1024);
	printf("\n");

	printf("%5s  %12s  %12s  %12s  %12s  %12s\n", "Order",  "Unmovable",
	       "Reclaimable", "Movable", "Reserved", "CMA/Isolated");
	printf("%.5s  %.12s  %.12s  %.12s  %.12s  %.12s\n", graph_dotted_line,
	       graph_dotted_line, graph_dotted_line, graph_dotted_line,
	       graph_dotted_line, graph_dotted_line);

	for (o = 0; o < MAX_PAGE_ORDER; o++) {
		printf("%5d", o);
		for (m = 0; m < MAX_MIGRATE_TYPES - 1; m++) {
			if (order_stats[o][m])
				printf("  %'12d", order_stats[o][m]);
			else
				printf("  %12c", '.');
		}
		printf("\n");
	}
}

static void print_slab_result(struct perf_session *session)
{
	if (caller_flag)
		__print_slab_result(&root_caller_sorted, session, caller_lines, 1);
	if (alloc_flag)
		__print_slab_result(&root_alloc_sorted, session, alloc_lines, 0);
	print_slab_summary();
}

static void print_page_result(struct perf_session *session)
{
	if (alloc_flag)
		__print_page_result(&page_alloc_sorted, session, alloc_lines);
	print_page_summary();
}

static void print_result(struct perf_session *session)
{
	if (kmem_slab)
		print_slab_result(session);
	if (kmem_page)
		print_page_result(session);
}

struct sort_dimension {
	const char		name[20];
	sort_fn_t		cmp;
	struct list_head	list;
};

static LIST_HEAD(caller_sort);
static LIST_HEAD(alloc_sort);

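/*
 * Re-insert 'data' into the sorted tree, comparing with each configured
 * sort key in turn until one differs.  Entries that compare greater go
 * left, so an in-order walk yields descending order.
 */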
static void sort_slab_insert(struct rb_root *root, struct alloc_stat *data,
			     struct list_head *sort_list)
{
	struct rb_node **new = &(root->rb_node);
	struct rb_node *parent = NULL;
	struct sort_dimension *sort;

	while (*new) {
		struct alloc_stat *this;
		int cmp = 0;

		this = rb_entry(*new, struct alloc_stat, node);
		parent = *new;

		list_for_each_entry(sort, sort_list, list) {
			cmp = sort->cmp(data, this);
			if (cmp)
				break;
		}

		if (cmp > 0)
			new = &((*new)->rb_left);
		else
			new = &((*new)->rb_right);
	}

	rb_link_node(&data->node, parent, new);
	rb_insert_color(&data->node, root);
}

static void __sort_slab_result(struct rb_root *root, struct rb_root *root_sorted,
			       struct list_head *sort_list)
{
	struct rb_node *node;
	struct alloc_stat *data;

	for (;;) {
		node = rb_first(root);
		if (!node)
			break;

		rb_erase(node, root);
		data = rb_entry(node, struct alloc_stat, node);
		sort_slab_insert(root_sorted, data, sort_list);
	}
}

static void sort_page_insert(struct rb_root *root, struct page_stat *data)
{
	struct rb_node **new = &root->rb_node;
	struct rb_node *parent = NULL;

	while (*new) {
		struct page_stat *this;

		this = rb_entry(*new, struct page_stat, node);
		parent = *new;

		/*
		 * TODO: support more sort keys.  Compare alloc_bytes
		 * directly instead of computing an int difference, which
		 * would truncate the u64 values and could mis-sort large
		 * entries.
		 */
		if (data->alloc_bytes > this->alloc_bytes)
			new = &parent->rb_left;
		else
			new = &parent->rb_right;
	}

	rb_link_node(&data->node, parent, new);
	rb_insert_color(&data->node, root);
}

static void __sort_page_result(struct rb_root *root, struct rb_root *root_sorted)
{
	struct rb_node *node;
	struct page_stat *data;

	for (;;) {
		node = rb_first(root);
		if (!node)
			break;

		rb_erase(node, root);
		data = rb_entry(node, struct page_stat, node);
		sort_page_insert(root_sorted, data);
	}
}

static void sort_result(void)
{
	if (kmem_slab) {
		__sort_slab_result(&root_alloc_stat, &root_alloc_sorted,
				   &alloc_sort);
		__sort_slab_result(&root_caller_stat, &root_caller_sorted,
				   &caller_sort);
	}
	if (kmem_page) {
		__sort_page_result(&page_alloc_tree, &page_alloc_sorted);
	}
}

static int __cmd_kmem(struct perf_session *session)
{
	int err = -EINVAL;
	struct perf_evsel *evsel;
	const struct perf_evsel_str_handler kmem_tracepoints[] = {
		/* slab allocator */
		{ "kmem:kmalloc",		perf_evsel__process_alloc_event, },
		{ "kmem:kmem_cache_alloc",	perf_evsel__process_alloc_event, },
		{ "kmem:kmalloc_node",		perf_evsel__process_alloc_node_event, },
		{ "kmem:kmem_cache_alloc_node", perf_evsel__process_alloc_node_event, },
		{ "kmem:kfree",			perf_evsel__process_free_event, },
		{ "kmem:kmem_cache_free",	perf_evsel__process_free_event, },
		/* page allocator */
		{ "kmem:mm_page_alloc",		perf_evsel__process_page_alloc_event, },
		{ "kmem:mm_page_free",		perf_evsel__process_page_free_event, },
	};

	if (!perf_session__has_traces(session, "kmem record"))
		goto out;

	if (perf_session__set_tracepoints_handlers(session, kmem_tracepoints)) {
		pr_err("Initializing perf session tracepoint handlers failed\n");
		goto out;
	}

	evlist__for_each(session->evlist, evsel) {
		if (!strcmp(perf_evsel__name(evsel), "kmem:mm_page_alloc") &&
		    perf_evsel__field(evsel, "pfn")) {
			use_pfn = true;
			break;
		}
	}

	setup_pager();
	err = perf_session__process_events(session);
	if (err != 0) {
		pr_err("error during process events: %d\n", err);
		goto out;
	}
	sort_result();
	print_result(session);
out:
	return err;
}

static int ptr_cmp(struct alloc_stat *l, struct alloc_stat *r)
{
	if (l->ptr < r->ptr)
		return -1;
	else if (l->ptr > r->ptr)
		return 1;
	return 0;
}

static struct sort_dimension ptr_sort_dimension = {
	.name	= "ptr",
	.cmp	= ptr_cmp,
};

static int callsite_cmp(struct alloc_stat *l, struct alloc_stat *r)
{
	if (l->call_site < r->call_site)
		return -1;
	else if (l->call_site > r->call_site)
		return 1;
	return 0;
}

static struct sort_dimension callsite_sort_dimension = {
	.name	= "callsite",
	.cmp	= callsite_cmp,
};

static int hit_cmp(struct alloc_stat *l, struct alloc_stat *r)
{
	if (l->hit < r->hit)
		return -1;
	else if (l->hit > r->hit)
		return 1;
	return 0;
}

static struct sort_dimension hit_sort_dimension = {
	.name	= "hit",
	.cmp	= hit_cmp,
};

static int bytes_cmp(struct alloc_stat *l, struct alloc_stat *r)
{
	if (l->bytes_alloc < r->bytes_alloc)
		return -1;
	else if (l->bytes_alloc > r->bytes_alloc)
		return 1;
	return 0;
}

static struct sort_dimension bytes_sort_dimension = {
	.name	= "bytes",
	.cmp	= bytes_cmp,
};

static int frag_cmp(struct alloc_stat *l, struct alloc_stat *r)
{
	double x, y;

	x = fragmentation(l->bytes_req, l->bytes_alloc);
	y = fragmentation(r->bytes_req, r->bytes_alloc);

	if (x < y)
		return -1;
	else if (x > y)
		return 1;
	return 0;
}

static struct sort_dimension frag_sort_dimension = {
	.name	= "frag",
	.cmp	= frag_cmp,
};

static int pingpong_cmp(struct alloc_stat *l, struct alloc_stat *r)
{
	if (l->pingpong < r->pingpong)
		return -1;
	else if (l->pingpong > r->pingpong)
		return 1;
	return 0;
}

static struct sort_dimension pingpong_sort_dimension = {
	.name	= "pingpong",
	.cmp	= pingpong_cmp,
};

static struct sort_dimension *avail_sorts[] = {
	&ptr_sort_dimension,
	&callsite_sort_dimension,
	&hit_sort_dimension,
	&bytes_sort_dimension,
	&frag_sort_dimension,
	&pingpong_sort_dimension,
};

#define NUM_AVAIL_SORTS	((int)ARRAY_SIZE(avail_sorts))

static int sort_dimension__add(const char *tok, struct list_head *list)
{
	struct sort_dimension *sort;
	int i;

	for (i = 0; i < NUM_AVAIL_SORTS; i++) {
		if (!strcmp(avail_sorts[i]->name, tok)) {
			sort = memdup(avail_sorts[i], sizeof(*avail_sorts[i]));
			if (!sort) {
				pr_err("%s: memdup failed\n", __func__);
				return -1;
			}
			list_add_tail(&sort->list, list);
			return 0;
		}
	}

	return -1;
}

static int setup_sorting(struct list_head *sort_list, const char *arg)
{
	char *tok;
	char *str = strdup(arg);
	char *pos = str;

	if (!str) {
		pr_err("%s: strdup failed\n", __func__);
		return -1;
	}

	while (true) {
		tok = strsep(&pos, ",");
		if (!tok)
			break;
		if (sort_dimension__add(tok, sort_list) < 0) {
			error("Unknown --sort key: '%s'", tok);
			free(str);
			return -1;
		}
	}

	free(str);
	return 0;
}

static int parse_sort_opt(const struct option *opt __maybe_unused,
			  const char *arg, int unset __maybe_unused)
{
	if (!arg)
		return -1;

	if (caller_flag > alloc_flag)
		return setup_sorting(&caller_sort, arg);
	else
		return setup_sorting(&alloc_sort, arg);
}

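/*
 * --caller and --alloc set their flag to one more than the other, so
 * later options that depend on which table they apply to (-s and -l)
 * can tell which of the two appeared last.  For example, in
 * "perf kmem stat --alloc -l 20" the line limit applies to the
 * per-allocation table.
 */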
static int parse_caller_opt(const struct option *opt __maybe_unused,
			    const char *arg __maybe_unused,
			    int unset __maybe_unused)
{
	caller_flag = (alloc_flag + 1);
	return 0;
}

static int parse_alloc_opt(const struct option *opt __maybe_unused,
			   const char *arg __maybe_unused,
			   int unset __maybe_unused)
{
	alloc_flag = (caller_flag + 1);
	return 0;
}

static int parse_slab_opt(const struct option *opt __maybe_unused,
			  const char *arg __maybe_unused,
			  int unset __maybe_unused)
{
	kmem_slab = (kmem_page + 1);
	return 0;
}

static int parse_page_opt(const struct option *opt __maybe_unused,
			  const char *arg __maybe_unused,
			  int unset __maybe_unused)
{
	kmem_page = (kmem_slab + 1);
	return 0;
}

static int parse_line_opt(const struct option *opt __maybe_unused,
			  const char *arg, int unset __maybe_unused)
{
	int lines;

	if (!arg)
		return -1;

	lines = strtoul(arg, NULL, 10);

	if (caller_flag > alloc_flag)
		caller_lines = lines;
	else
		alloc_lines = lines;

	return 0;
}

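/*
 * "perf kmem record ..." forwards to the generic record command with
 * the relevant tracepoints added, so e.g. "perf kmem --page record ls"
 * becomes roughly:
 *
 *	perf record -a -R -c 1 -e kmem:mm_page_alloc \
 *		-e kmem:mm_page_free ls
 */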
static int __cmd_record(int argc, const char **argv)
{
	const char * const record_args[] = {
	"record", "-a", "-R", "-c", "1",
	};
	const char * const slab_events[] = {
	"-e", "kmem:kmalloc",
	"-e", "kmem:kmalloc_node",
	"-e", "kmem:kfree",
	"-e", "kmem:kmem_cache_alloc",
	"-e", "kmem:kmem_cache_alloc_node",
	"-e", "kmem:kmem_cache_free",
	};
	const char * const page_events[] = {
	"-e", "kmem:mm_page_alloc",
	"-e", "kmem:mm_page_free",
	};
	unsigned int rec_argc, i, j;
	const char **rec_argv;

	rec_argc = ARRAY_SIZE(record_args) + argc - 1;
	if (kmem_slab)
		rec_argc += ARRAY_SIZE(slab_events);
	if (kmem_page)
		rec_argc += ARRAY_SIZE(page_events);

	rec_argv = calloc(rec_argc + 1, sizeof(char *));

	if (rec_argv == NULL)
		return -ENOMEM;

	for (i = 0; i < ARRAY_SIZE(record_args); i++)
		rec_argv[i] = strdup(record_args[i]);

	if (kmem_slab) {
		for (j = 0; j < ARRAY_SIZE(slab_events); j++, i++)
			rec_argv[i] = strdup(slab_events[j]);
	}
	if (kmem_page) {
		for (j = 0; j < ARRAY_SIZE(page_events); j++, i++)
			rec_argv[i] = strdup(page_events[j]);
	}

	for (j = 1; j < (unsigned int)argc; j++, i++)
		rec_argv[i] = argv[j];

	return cmd_record(i, rec_argv, NULL);
}

int cmd_kmem(int argc, const char **argv, const char *prefix __maybe_unused)
{
	const char * const default_sort_order = "frag,hit,bytes";
	struct perf_data_file file = {
		.mode = PERF_DATA_MODE_READ,
	};
	const struct option kmem_options[] = {
	OPT_STRING('i', "input", &input_name, "file", "input file name"),
	OPT_INCR('v', "verbose", &verbose,
		    "be more verbose (show symbol address, etc)"),
	OPT_CALLBACK_NOOPT(0, "caller", NULL, NULL,
			   "show per-callsite statistics", parse_caller_opt),
	OPT_CALLBACK_NOOPT(0, "alloc", NULL, NULL,
			   "show per-allocation statistics", parse_alloc_opt),
	OPT_CALLBACK('s', "sort", NULL, "key[,key2...]",
		     "sort by keys: ptr, callsite, bytes, hit, pingpong, frag",
		     parse_sort_opt),
	OPT_CALLBACK('l', "line", NULL, "num", "show n lines", parse_line_opt),
	OPT_BOOLEAN(0, "raw-ip", &raw_ip, "show raw ip instead of symbol"),
	OPT_BOOLEAN('f', "force", &file.force, "don't complain, do it"),
	OPT_CALLBACK_NOOPT(0, "slab", NULL, NULL, "Analyze slab allocator",
			   parse_slab_opt),
	OPT_CALLBACK_NOOPT(0, "page", NULL, NULL, "Analyze page allocator",
			   parse_page_opt),
	OPT_END()
	};
	const char *const kmem_subcommands[] = { "record", "stat", NULL };
	const char *kmem_usage[] = {
		NULL,
		NULL
	};
	struct perf_session *session;
	int ret = -1;

	argc = parse_options_subcommand(argc, argv, kmem_options,
					kmem_subcommands, kmem_usage, 0);

	if (!argc)
		usage_with_options(kmem_usage, kmem_options);

	if (kmem_slab == 0 && kmem_page == 0)
		kmem_slab = 1;  /* for backward compatibility */

	if (!strncmp(argv[0], "rec", 3)) {
		symbol__init(NULL);
		return __cmd_record(argc, argv);
	}

	file.path = input_name;

	session = perf_session__new(&file, false, &perf_kmem);
	if (session == NULL)
		return -1;

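	/*
	 * The page size matters for page-allocator accounting and is
	 * taken from the recorded tracing data rather than assumed to
	 * match the analyzing host, since the two systems may differ.
	 */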
	if (kmem_page) {
		struct perf_evsel *evsel = perf_evlist__first(session->evlist);

		if (evsel == NULL || evsel->tp_format == NULL) {
			pr_err("invalid event found, aborting\n");
			return -1;
		}

		kmem_page_size = pevent_get_page_size(evsel->tp_format->pevent);
	}

	symbol__init(&session->header.env);

	if (!strcmp(argv[0], "stat")) {
		setlocale(LC_ALL, "");

		if (cpu__setup_cpunode_map())
			goto out_delete;

		if (list_empty(&caller_sort))
			setup_sorting(&caller_sort, default_sort_order);
		if (list_empty(&alloc_sort))
			setup_sorting(&alloc_sort, default_sort_order);

		ret = __cmd_kmem(session);
	} else
		usage_with_options(kmem_usage, kmem_options);

out_delete:
	perf_session__delete(session);

	return ret;
}