xref: /openbmc/linux/tools/perf/util/stat-shadow.c (revision b830f94f)
// SPDX-License-Identifier: GPL-2.0
#include <stdio.h>
#include "evsel.h"
#include "stat.h"
#include "color.h"
#include "pmu.h"
#include "rblist.h"
#include "evlist.h"
#include "expr.h"
#include "metricgroup.h"
#include <linux/zalloc.h>

/*
 * AGGR_GLOBAL: Use CPU 0
 * AGGR_SOCKET: Use first CPU of socket
 * AGGR_DIE: Use first CPU of die
 * AGGR_CORE: Use first CPU of core
 * AGGR_NONE: Use matching CPU
 * AGGR_THREAD: Not supported?
 */
static bool have_frontend_stalled;

struct runtime_stat rt_stat;
struct stats walltime_nsecs_stats;

struct saved_value {
	struct rb_node rb_node;
	struct perf_evsel *evsel;
	enum stat_type type;
	int ctx;
	int cpu;
	struct runtime_stat *stat;
	struct stats stats;
};

static int saved_value_cmp(struct rb_node *rb_node, const void *entry)
{
	struct saved_value *a = container_of(rb_node,
					     struct saved_value,
					     rb_node);
	const struct saved_value *b = entry;

	if (a->cpu != b->cpu)
		return a->cpu - b->cpu;

	/*
	 * Previously the rbtree was used to link generic metrics.
	 * The keys were evsel/cpu. Now the rbtree is extended to support
	 * per-thread shadow stats. For shadow stats case, the keys
	 * are cpu/type/ctx/stat (evsel is NULL). For generic metrics
	 * case, the keys are still evsel/cpu (type/ctx/stat are 0 or NULL).
	 */
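	/*
	 * Illustrative key shapes, matching the call sites in this file
	 * (the concrete values below are examples only):
	 *
	 * per-cpu/per-thread shadow stat, see update_runtime_stat():
	 *   { .evsel = NULL, .cpu = 2, .type = STAT_CYCLES, .ctx = 0, .stat = st }
	 *
	 * generic metric, see generic_metric() and perf_stat__update_shadow_stats():
	 *   { .evsel = counter, .cpu = 2, .type = STAT_NONE, .ctx = 0, .stat = st }
	 */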
	if (a->type != b->type)
		return a->type - b->type;

	if (a->ctx != b->ctx)
		return a->ctx - b->ctx;

	if (a->evsel == NULL && b->evsel == NULL) {
		if (a->stat == b->stat)
			return 0;

		if ((char *)a->stat < (char *)b->stat)
			return -1;

		return 1;
	}

	if (a->evsel == b->evsel)
		return 0;
	if ((char *)a->evsel < (char *)b->evsel)
		return -1;
	return +1;
}

static struct rb_node *saved_value_new(struct rblist *rblist __maybe_unused,
				     const void *entry)
{
	struct saved_value *nd = malloc(sizeof(struct saved_value));

	if (!nd)
		return NULL;
	memcpy(nd, entry, sizeof(struct saved_value));
	return &nd->rb_node;
}

static void saved_value_delete(struct rblist *rblist __maybe_unused,
			       struct rb_node *rb_node)
{
	struct saved_value *v;

	BUG_ON(!rb_node);
	v = container_of(rb_node, struct saved_value, rb_node);
	free(v);
}

static struct saved_value *saved_value_lookup(struct perf_evsel *evsel,
					      int cpu,
					      bool create,
					      enum stat_type type,
					      int ctx,
					      struct runtime_stat *st)
{
	struct rblist *rblist;
	struct rb_node *nd;
	struct saved_value dm = {
		.cpu = cpu,
		.evsel = evsel,
		.type = type,
		.ctx = ctx,
		.stat = st,
	};

	rblist = &st->value_list;

	nd = rblist__find(rblist, &dm);
	if (nd)
		return container_of(nd, struct saved_value, rb_node);
	if (create) {
		rblist__add_node(rblist, &dm);
		nd = rblist__find(rblist, &dm);
		if (nd)
			return container_of(nd, struct saved_value, rb_node);
	}
	return NULL;
}

void runtime_stat__init(struct runtime_stat *st)
{
	struct rblist *rblist = &st->value_list;

	rblist__init(rblist);
	rblist->node_cmp = saved_value_cmp;
	rblist->node_new = saved_value_new;
	rblist->node_delete = saved_value_delete;
}

void runtime_stat__exit(struct runtime_stat *st)
{
	rblist__exit(&st->value_list);
}

void perf_stat__init_shadow_stats(void)
{
	have_frontend_stalled = pmu_have_event("cpu", "stalled-cycles-frontend");
	runtime_stat__init(&rt_stat);
}

static int evsel_context(struct perf_evsel *evsel)
{
	int ctx = 0;

	if (evsel->attr.exclude_kernel)
		ctx |= CTX_BIT_KERNEL;
	if (evsel->attr.exclude_user)
		ctx |= CTX_BIT_USER;
	if (evsel->attr.exclude_hv)
		ctx |= CTX_BIT_HV;
	if (evsel->attr.exclude_host)
		ctx |= CTX_BIT_HOST;
	if (evsel->attr.exclude_idle)
		ctx |= CTX_BIT_IDLE;

	return ctx;
}

static void reset_stat(struct runtime_stat *st)
{
	struct rblist *rblist;
	struct rb_node *pos, *next;

	rblist = &st->value_list;
	next = rb_first_cached(&rblist->entries);
	while (next) {
		pos = next;
		next = rb_next(pos);
		memset(&container_of(pos, struct saved_value, rb_node)->stats,
		       0,
		       sizeof(struct stats));
	}
}

void perf_stat__reset_shadow_stats(void)
{
	reset_stat(&rt_stat);
	memset(&walltime_nsecs_stats, 0, sizeof(walltime_nsecs_stats));
}

void perf_stat__reset_shadow_per_stat(struct runtime_stat *st)
{
	reset_stat(st);
}

static void update_runtime_stat(struct runtime_stat *st,
				enum stat_type type,
				int ctx, int cpu, u64 count)
{
	struct saved_value *v = saved_value_lookup(NULL, cpu, true,
						   type, ctx, st);

	if (v)
		update_stats(&v->stats, count);
}

/*
 * Update various tracking values we maintain to print
 * more semantic information such as miss/hit ratios,
 * instruction rates, etc:
 */
void perf_stat__update_shadow_stats(struct perf_evsel *counter, u64 count,
				    int cpu, struct runtime_stat *st)
{
	int ctx = evsel_context(counter);
	u64 count_ns = count;

	count *= counter->scale;

	if (perf_evsel__is_clock(counter))
		update_runtime_stat(st, STAT_NSECS, 0, cpu, count_ns);
	else if (perf_evsel__match(counter, HARDWARE, HW_CPU_CYCLES))
		update_runtime_stat(st, STAT_CYCLES, ctx, cpu, count);
	else if (perf_stat_evsel__is(counter, CYCLES_IN_TX))
		update_runtime_stat(st, STAT_CYCLES_IN_TX, ctx, cpu, count);
	else if (perf_stat_evsel__is(counter, TRANSACTION_START))
		update_runtime_stat(st, STAT_TRANSACTION, ctx, cpu, count);
	else if (perf_stat_evsel__is(counter, ELISION_START))
		update_runtime_stat(st, STAT_ELISION, ctx, cpu, count);
	else if (perf_stat_evsel__is(counter, TOPDOWN_TOTAL_SLOTS))
		update_runtime_stat(st, STAT_TOPDOWN_TOTAL_SLOTS,
				    ctx, cpu, count);
	else if (perf_stat_evsel__is(counter, TOPDOWN_SLOTS_ISSUED))
		update_runtime_stat(st, STAT_TOPDOWN_SLOTS_ISSUED,
				    ctx, cpu, count);
	else if (perf_stat_evsel__is(counter, TOPDOWN_SLOTS_RETIRED))
		update_runtime_stat(st, STAT_TOPDOWN_SLOTS_RETIRED,
				    ctx, cpu, count);
	else if (perf_stat_evsel__is(counter, TOPDOWN_FETCH_BUBBLES))
		update_runtime_stat(st, STAT_TOPDOWN_FETCH_BUBBLES,
				    ctx, cpu, count);
	else if (perf_stat_evsel__is(counter, TOPDOWN_RECOVERY_BUBBLES))
		update_runtime_stat(st, STAT_TOPDOWN_RECOVERY_BUBBLES,
				    ctx, cpu, count);
	else if (perf_evsel__match(counter, HARDWARE, HW_STALLED_CYCLES_FRONTEND))
		update_runtime_stat(st, STAT_STALLED_CYCLES_FRONT,
				    ctx, cpu, count);
	else if (perf_evsel__match(counter, HARDWARE, HW_STALLED_CYCLES_BACKEND))
		update_runtime_stat(st, STAT_STALLED_CYCLES_BACK,
				    ctx, cpu, count);
	else if (perf_evsel__match(counter, HARDWARE, HW_BRANCH_INSTRUCTIONS))
		update_runtime_stat(st, STAT_BRANCHES, ctx, cpu, count);
	else if (perf_evsel__match(counter, HARDWARE, HW_CACHE_REFERENCES))
		update_runtime_stat(st, STAT_CACHEREFS, ctx, cpu, count);
	else if (perf_evsel__match(counter, HW_CACHE, HW_CACHE_L1D))
		update_runtime_stat(st, STAT_L1_DCACHE, ctx, cpu, count);
	else if (perf_evsel__match(counter, HW_CACHE, HW_CACHE_L1I))
		update_runtime_stat(st, STAT_L1_ICACHE, ctx, cpu, count);
	else if (perf_evsel__match(counter, HW_CACHE, HW_CACHE_LL))
		update_runtime_stat(st, STAT_LL_CACHE, ctx, cpu, count);
	else if (perf_evsel__match(counter, HW_CACHE, HW_CACHE_DTLB))
		update_runtime_stat(st, STAT_DTLB_CACHE, ctx, cpu, count);
	else if (perf_evsel__match(counter, HW_CACHE, HW_CACHE_ITLB))
		update_runtime_stat(st, STAT_ITLB_CACHE, ctx, cpu, count);
	else if (perf_stat_evsel__is(counter, SMI_NUM))
		update_runtime_stat(st, STAT_SMI_NUM, ctx, cpu, count);
	else if (perf_stat_evsel__is(counter, APERF))
		update_runtime_stat(st, STAT_APERF, ctx, cpu, count);

	if (counter->collect_stat) {
		struct saved_value *v = saved_value_lookup(counter, cpu, true,
							   STAT_NONE, 0, st);
		update_stats(&v->stats, count);
	}
}
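
/*
 * Typical call sequence from a perf-stat style caller (illustrative
 * sketch only; iteration over events/cpus and error handling elided):
 *
 *	perf_stat__init_shadow_stats();
 *	...
 *	perf_stat__update_shadow_stats(counter, count, cpu, &rt_stat);
 *	...
 *	perf_stat__print_shadow_stats(config, counter, avg, cpu,
 *				      &out, metric_events, &rt_stat);
 *	...
 *	perf_stat__reset_shadow_stats();
 */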

/* used for get_ratio_color() */
enum grc_type {
	GRC_STALLED_CYCLES_FE,
	GRC_STALLED_CYCLES_BE,
	GRC_CACHE_MISSES,
	GRC_MAX_NR
};

static const char *get_ratio_color(enum grc_type type, double ratio)
{
	static const double grc_table[GRC_MAX_NR][3] = {
		[GRC_STALLED_CYCLES_FE] = { 50.0, 30.0, 10.0 },
		[GRC_STALLED_CYCLES_BE] = { 75.0, 50.0, 20.0 },
		[GRC_CACHE_MISSES] 	= { 20.0, 10.0, 5.0 },
	};
	const char *color = PERF_COLOR_NORMAL;

	if (ratio > grc_table[type][0])
		color = PERF_COLOR_RED;
	else if (ratio > grc_table[type][1])
		color = PERF_COLOR_MAGENTA;
	else if (ratio > grc_table[type][2])
		color = PERF_COLOR_YELLOW;

	return color;
}

static struct perf_evsel *perf_stat__find_event(struct perf_evlist *evsel_list,
						const char *name)
{
	struct perf_evsel *c2;

	evlist__for_each_entry (evsel_list, c2) {
		if (!strcasecmp(c2->name, name) && !c2->collect_stat)
			return c2;
	}
	return NULL;
}

/*
 * Mark MetricExpr target events and link the events that use them
 * to those targets.
 */
void perf_stat__collect_metric_expr(struct perf_evlist *evsel_list)
{
	struct perf_evsel *counter, *leader, **metric_events, *oc;
	bool found;
	const char **metric_names;
	int i;
	int num_metric_names;

	evlist__for_each_entry(evsel_list, counter) {
		bool invalid = false;

		leader = counter->leader;
		if (!counter->metric_expr)
			continue;
		metric_events = counter->metric_events;
		if (!metric_events) {
			if (expr__find_other(counter->metric_expr, counter->name,
						&metric_names, &num_metric_names) < 0)
				continue;

			metric_events = calloc(sizeof(struct perf_evsel *),
					       num_metric_names + 1);
			if (!metric_events)
				return;
			counter->metric_events = metric_events;
		}

		for (i = 0; i < num_metric_names; i++) {
			found = false;
			if (leader) {
				/* Search in group */
				for_each_group_member (oc, leader) {
					if (!strcasecmp(oc->name, metric_names[i]) &&
						!oc->collect_stat) {
						found = true;
						break;
					}
				}
			}
			if (!found) {
				/* Search ignoring groups */
				oc = perf_stat__find_event(evsel_list, metric_names[i]);
			}
			if (!oc) {
				/* Deduping one is good enough to handle duplicated PMUs. */
				static char *printed;

				/*
				 * Adding events automatically would be difficult, because
				 * it would risk creating groups that are not schedulable.
				 * perf stat doesn't understand all the scheduling constraints
				 * of events. So we ask the user instead to add the missing
				 * events.
				 */
				if (!printed || strcasecmp(printed, metric_names[i])) {
					fprintf(stderr,
						"Add %s event to groups to get metric expression for %s\n",
						metric_names[i],
						counter->name);
					printed = strdup(metric_names[i]);
				}
				invalid = true;
				continue;
			}
			metric_events[i] = oc;
			oc->collect_stat = true;
		}
		metric_events[i] = NULL;
		free(metric_names);
		if (invalid) {
			free(metric_events);
			counter->metric_events = NULL;
			counter->metric_expr = NULL;
		}
	}
}

static double runtime_stat_avg(struct runtime_stat *st,
			       enum stat_type type, int ctx, int cpu)
{
	struct saved_value *v;

	v = saved_value_lookup(NULL, cpu, false, type, ctx, st);
	if (!v)
		return 0.0;

	return avg_stats(&v->stats);
}

static double runtime_stat_n(struct runtime_stat *st,
			     enum stat_type type, int ctx, int cpu)
{
	struct saved_value *v;

	v = saved_value_lookup(NULL, cpu, false, type, ctx, st);
	if (!v)
		return 0.0;

	return v->stats.n;
}

static void print_stalled_cycles_frontend(struct perf_stat_config *config,
					  int cpu,
					  struct perf_evsel *evsel, double avg,
					  struct perf_stat_output_ctx *out,
					  struct runtime_stat *st)
{
	double total, ratio = 0.0;
	const char *color;
	int ctx = evsel_context(evsel);

	total = runtime_stat_avg(st, STAT_CYCLES, ctx, cpu);

	if (total)
		ratio = avg / total * 100.0;

	color = get_ratio_color(GRC_STALLED_CYCLES_FE, ratio);

	if (ratio)
		out->print_metric(config, out->ctx, color, "%7.2f%%", "frontend cycles idle",
				  ratio);
	else
		out->print_metric(config, out->ctx, NULL, NULL, "frontend cycles idle", 0);
}

static void print_stalled_cycles_backend(struct perf_stat_config *config,
					 int cpu,
					 struct perf_evsel *evsel, double avg,
					 struct perf_stat_output_ctx *out,
					 struct runtime_stat *st)
{
	double total, ratio = 0.0;
	const char *color;
	int ctx = evsel_context(evsel);

	total = runtime_stat_avg(st, STAT_CYCLES, ctx, cpu);

	if (total)
		ratio = avg / total * 100.0;

	color = get_ratio_color(GRC_STALLED_CYCLES_BE, ratio);

	out->print_metric(config, out->ctx, color, "%7.2f%%", "backend cycles idle", ratio);
}

static void print_branch_misses(struct perf_stat_config *config,
				int cpu,
				struct perf_evsel *evsel,
				double avg,
				struct perf_stat_output_ctx *out,
				struct runtime_stat *st)
{
	double total, ratio = 0.0;
	const char *color;
	int ctx = evsel_context(evsel);

	total = runtime_stat_avg(st, STAT_BRANCHES, ctx, cpu);

	if (total)
		ratio = avg / total * 100.0;

	color = get_ratio_color(GRC_CACHE_MISSES, ratio);

	out->print_metric(config, out->ctx, color, "%7.2f%%", "of all branches", ratio);
}

static void print_l1_dcache_misses(struct perf_stat_config *config,
				   int cpu,
				   struct perf_evsel *evsel,
				   double avg,
				   struct perf_stat_output_ctx *out,
				   struct runtime_stat *st)

{
	double total, ratio = 0.0;
	const char *color;
	int ctx = evsel_context(evsel);

	total = runtime_stat_avg(st, STAT_L1_DCACHE, ctx, cpu);

	if (total)
		ratio = avg / total * 100.0;

	color = get_ratio_color(GRC_CACHE_MISSES, ratio);

	out->print_metric(config, out->ctx, color, "%7.2f%%", "of all L1-dcache hits", ratio);
}

static void print_l1_icache_misses(struct perf_stat_config *config,
				   int cpu,
				   struct perf_evsel *evsel,
				   double avg,
				   struct perf_stat_output_ctx *out,
				   struct runtime_stat *st)

{
	double total, ratio = 0.0;
	const char *color;
	int ctx = evsel_context(evsel);

	total = runtime_stat_avg(st, STAT_L1_ICACHE, ctx, cpu);

	if (total)
		ratio = avg / total * 100.0;

	color = get_ratio_color(GRC_CACHE_MISSES, ratio);
	out->print_metric(config, out->ctx, color, "%7.2f%%", "of all L1-icache hits", ratio);
}

static void print_dtlb_cache_misses(struct perf_stat_config *config,
				    int cpu,
				    struct perf_evsel *evsel,
				    double avg,
				    struct perf_stat_output_ctx *out,
				    struct runtime_stat *st)
{
	double total, ratio = 0.0;
	const char *color;
	int ctx = evsel_context(evsel);

	total = runtime_stat_avg(st, STAT_DTLB_CACHE, ctx, cpu);

	if (total)
		ratio = avg / total * 100.0;

	color = get_ratio_color(GRC_CACHE_MISSES, ratio);
	out->print_metric(config, out->ctx, color, "%7.2f%%", "of all dTLB cache hits", ratio);
}

static void print_itlb_cache_misses(struct perf_stat_config *config,
				    int cpu,
				    struct perf_evsel *evsel,
				    double avg,
				    struct perf_stat_output_ctx *out,
				    struct runtime_stat *st)
{
	double total, ratio = 0.0;
	const char *color;
	int ctx = evsel_context(evsel);

	total = runtime_stat_avg(st, STAT_ITLB_CACHE, ctx, cpu);

	if (total)
		ratio = avg / total * 100.0;

	color = get_ratio_color(GRC_CACHE_MISSES, ratio);
	out->print_metric(config, out->ctx, color, "%7.2f%%", "of all iTLB cache hits", ratio);
}

static void print_ll_cache_misses(struct perf_stat_config *config,
				  int cpu,
				  struct perf_evsel *evsel,
				  double avg,
				  struct perf_stat_output_ctx *out,
				  struct runtime_stat *st)
{
	double total, ratio = 0.0;
	const char *color;
	int ctx = evsel_context(evsel);

	total = runtime_stat_avg(st, STAT_LL_CACHE, ctx, cpu);

	if (total)
		ratio = avg / total * 100.0;

	color = get_ratio_color(GRC_CACHE_MISSES, ratio);
	out->print_metric(config, out->ctx, color, "%7.2f%%", "of all LL-cache hits", ratio);
}

/*
 * High level "TopDown" CPU core pipeline bottleneck breakdown.
 *
 * Basic concept following
 * Yasin, A Top Down Method for Performance analysis and Counter architecture
 * ISPASS14
 *
 * The CPU pipeline is divided into 4 areas that can be bottlenecks:
 *
 * Frontend -> Backend -> Retiring
 * BadSpeculation in addition means out of order execution that is thrown away
 * (for example due to branch mispredictions).
 * Frontend is instruction decoding.
 * Backend is execution, like computation and accessing data in memory.
 * Retiring is good execution that is not directly bottlenecked.
 *
 * The formulas are computed in slots.
 * A slot is an entry in the pipeline, one for each unit of pipeline width
 * (for example a 4-wide pipeline has 4 slots for each cycle).
 *
 * Formulas:
 * BadSpeculation = ((SlotsIssued - SlotsRetired) + RecoveryBubbles) /
 *			TotalSlots
 * Retiring = SlotsRetired / TotalSlots
 * FrontendBound = FetchBubbles / TotalSlots
 * BackendBound = 1.0 - BadSpeculation - Retiring - FrontendBound
 *
 * The kernel provides the mapping to the low level CPU events and any scaling
 * needed for the CPU pipeline width, for example:
 *
 * TotalSlots = Cycles * 4
 *
 * The scaling factor is communicated in the sysfs unit.
 *
 * In some cases the CPU may not be able to measure all the formulas due to
 * missing events. In that case multiple formulas are combined, as far as
 * possible.
 *
 * Full TopDown supports more levels to sub-divide each area: for example
 * BackendBound into computing bound and memory bound. For now we only
 * support Level 1 TopDown.
 */
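
/*
 * Worked example with made-up numbers (purely illustrative), assuming a
 * 4-wide pipeline and 1e9 cycles:
 *
 *	TotalSlots	= 4 * 1e9			= 4.0e9
 *	SlotsIssued	= 2.2e9
 *	SlotsRetired	= 2.0e9
 *	RecoveryBubbles	= 0.2e9
 *	FetchBubbles	= 0.6e9
 *
 *	BadSpeculation	= ((2.2e9 - 2.0e9) + 0.2e9) / 4.0e9	= 0.10
 *	Retiring	= 2.0e9 / 4.0e9				= 0.50
 *	FrontendBound	= 0.6e9 / 4.0e9				= 0.15
 *	BackendBound	= 1.0 - 0.10 - 0.50 - 0.15		= 0.25
 */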

static double sanitize_val(double x)
{
	if (x < 0 && x >= -0.02)
		return 0.0;
	return x;
}

static double td_total_slots(int ctx, int cpu, struct runtime_stat *st)
{
	return runtime_stat_avg(st, STAT_TOPDOWN_TOTAL_SLOTS, ctx, cpu);
}

static double td_bad_spec(int ctx, int cpu, struct runtime_stat *st)
{
	double bad_spec = 0;
	double total_slots;
	double total;

	total = runtime_stat_avg(st, STAT_TOPDOWN_SLOTS_ISSUED, ctx, cpu) -
		runtime_stat_avg(st, STAT_TOPDOWN_SLOTS_RETIRED, ctx, cpu) +
		runtime_stat_avg(st, STAT_TOPDOWN_RECOVERY_BUBBLES, ctx, cpu);

	total_slots = td_total_slots(ctx, cpu, st);
	if (total_slots)
		bad_spec = total / total_slots;
	return sanitize_val(bad_spec);
}

static double td_retiring(int ctx, int cpu, struct runtime_stat *st)
{
	double retiring = 0;
	double total_slots = td_total_slots(ctx, cpu, st);
	double ret_slots = runtime_stat_avg(st, STAT_TOPDOWN_SLOTS_RETIRED,
					    ctx, cpu);

	if (total_slots)
		retiring = ret_slots / total_slots;
	return retiring;
}

static double td_fe_bound(int ctx, int cpu, struct runtime_stat *st)
{
	double fe_bound = 0;
	double total_slots = td_total_slots(ctx, cpu, st);
	double fetch_bub = runtime_stat_avg(st, STAT_TOPDOWN_FETCH_BUBBLES,
					    ctx, cpu);

	if (total_slots)
		fe_bound = fetch_bub / total_slots;
	return fe_bound;
}

static double td_be_bound(int ctx, int cpu, struct runtime_stat *st)
{
	double sum = (td_fe_bound(ctx, cpu, st) +
		      td_bad_spec(ctx, cpu, st) +
		      td_retiring(ctx, cpu, st));
	if (sum == 0)
		return 0;
	return sanitize_val(1.0 - sum);
}

static void print_smi_cost(struct perf_stat_config *config,
			   int cpu, struct perf_evsel *evsel,
			   struct perf_stat_output_ctx *out,
			   struct runtime_stat *st)
{
	double smi_num, aperf, cycles, cost = 0.0;
	int ctx = evsel_context(evsel);
	const char *color = NULL;

	smi_num = runtime_stat_avg(st, STAT_SMI_NUM, ctx, cpu);
	aperf = runtime_stat_avg(st, STAT_APERF, ctx, cpu);
	cycles = runtime_stat_avg(st, STAT_CYCLES, ctx, cpu);

	if ((cycles == 0) || (aperf == 0))
		return;

	if (smi_num)
		cost = (aperf - cycles) / aperf * 100.00;

	if (cost > 10)
		color = PERF_COLOR_RED;
	out->print_metric(config, out->ctx, color, "%8.1f%%", "SMI cycles%", cost);
	out->print_metric(config, out->ctx, NULL, "%4.0f", "SMI#", smi_num);
}

static void generic_metric(struct perf_stat_config *config,
			   const char *metric_expr,
			   struct perf_evsel **metric_events,
			   char *name,
			   const char *metric_name,
			   double avg,
			   int cpu,
			   struct perf_stat_output_ctx *out,
			   struct runtime_stat *st)
{
	print_metric_t print_metric = out->print_metric;
	struct parse_ctx pctx;
	double ratio;
	int i;
	void *ctxp = out->ctx;
	char *n, *pn;

	expr__ctx_init(&pctx);
	expr__add_id(&pctx, name, avg);
	for (i = 0; metric_events[i]; i++) {
		struct saved_value *v;
		struct stats *stats;
		double scale;

		if (!strcmp(metric_events[i]->name, "duration_time")) {
			stats = &walltime_nsecs_stats;
			scale = 1e-9;
		} else {
			v = saved_value_lookup(metric_events[i], cpu, false,
					       STAT_NONE, 0, st);
			if (!v)
				break;
			stats = &v->stats;
			scale = 1.0;
		}

		n = strdup(metric_events[i]->name);
		if (!n)
			return;
		/*
		 * This display code with --no-merge adds [cpu] postfixes.
		 * These are not supported by the parser. Remove everything
		 * after the space.
		 */
		pn = strchr(n, ' ');
		if (pn)
			*pn = 0;
		expr__add_id(&pctx, n, avg_stats(stats)*scale);
	}
	if (!metric_events[i]) {
		const char *p = metric_expr;

		if (expr__parse(&ratio, &pctx, &p) == 0)
			print_metric(config, ctxp, NULL, "%8.1f",
				metric_name ?
				metric_name :
				out->force_header ?  name : "",
				ratio);
		else
			print_metric(config, ctxp, NULL, NULL,
				     out->force_header ?
				     (metric_name ? metric_name : name) : "", 0);
	} else
		print_metric(config, ctxp, NULL, NULL, "", 0);

	for (i = 1; i < pctx.num_ids; i++)
		zfree(&pctx.ids[i].name);
}

void perf_stat__print_shadow_stats(struct perf_stat_config *config,
				   struct perf_evsel *evsel,
				   double avg, int cpu,
				   struct perf_stat_output_ctx *out,
				   struct rblist *metric_events,
				   struct runtime_stat *st)
{
	void *ctxp = out->ctx;
	print_metric_t print_metric = out->print_metric;
	double total, ratio = 0.0, total2;
	const char *color = NULL;
	int ctx = evsel_context(evsel);
	struct metric_event *me;
	int num = 1;

	if (perf_evsel__match(evsel, HARDWARE, HW_INSTRUCTIONS)) {
		total = runtime_stat_avg(st, STAT_CYCLES, ctx, cpu);

		if (total) {
			ratio = avg / total;
			print_metric(config, ctxp, NULL, "%7.2f ",
					"insn per cycle", ratio);
		} else {
			print_metric(config, ctxp, NULL, NULL, "insn per cycle", 0);
		}

		total = runtime_stat_avg(st, STAT_STALLED_CYCLES_FRONT,
					 ctx, cpu);

		total = max(total, runtime_stat_avg(st,
						    STAT_STALLED_CYCLES_BACK,
						    ctx, cpu));

		if (total && avg) {
			out->new_line(config, ctxp);
			ratio = total / avg;
			print_metric(config, ctxp, NULL, "%7.2f ",
					"stalled cycles per insn",
					ratio);
		} else if (have_frontend_stalled) {
			out->new_line(config, ctxp);
			print_metric(config, ctxp, NULL, "%7.2f ",
				     "stalled cycles per insn", 0);
		}
	} else if (perf_evsel__match(evsel, HARDWARE, HW_BRANCH_MISSES)) {
		if (runtime_stat_n(st, STAT_BRANCHES, ctx, cpu) != 0)
			print_branch_misses(config, cpu, evsel, avg, out, st);
		else
			print_metric(config, ctxp, NULL, NULL, "of all branches", 0);
	} else if (
		evsel->attr.type == PERF_TYPE_HW_CACHE &&
		evsel->attr.config ==  ( PERF_COUNT_HW_CACHE_L1D |
					((PERF_COUNT_HW_CACHE_OP_READ) << 8) |
					 ((PERF_COUNT_HW_CACHE_RESULT_MISS) << 16))) {

		if (runtime_stat_n(st, STAT_L1_DCACHE, ctx, cpu) != 0)
			print_l1_dcache_misses(config, cpu, evsel, avg, out, st);
		else
			print_metric(config, ctxp, NULL, NULL, "of all L1-dcache hits", 0);
	} else if (
		evsel->attr.type == PERF_TYPE_HW_CACHE &&
		evsel->attr.config ==  ( PERF_COUNT_HW_CACHE_L1I |
					((PERF_COUNT_HW_CACHE_OP_READ) << 8) |
					 ((PERF_COUNT_HW_CACHE_RESULT_MISS) << 16))) {

		if (runtime_stat_n(st, STAT_L1_ICACHE, ctx, cpu) != 0)
			print_l1_icache_misses(config, cpu, evsel, avg, out, st);
		else
			print_metric(config, ctxp, NULL, NULL, "of all L1-icache hits", 0);
	} else if (
		evsel->attr.type == PERF_TYPE_HW_CACHE &&
		evsel->attr.config ==  ( PERF_COUNT_HW_CACHE_DTLB |
					((PERF_COUNT_HW_CACHE_OP_READ) << 8) |
					 ((PERF_COUNT_HW_CACHE_RESULT_MISS) << 16))) {

		if (runtime_stat_n(st, STAT_DTLB_CACHE, ctx, cpu) != 0)
			print_dtlb_cache_misses(config, cpu, evsel, avg, out, st);
		else
			print_metric(config, ctxp, NULL, NULL, "of all dTLB cache hits", 0);
	} else if (
		evsel->attr.type == PERF_TYPE_HW_CACHE &&
		evsel->attr.config ==  ( PERF_COUNT_HW_CACHE_ITLB |
					((PERF_COUNT_HW_CACHE_OP_READ) << 8) |
					 ((PERF_COUNT_HW_CACHE_RESULT_MISS) << 16))) {

		if (runtime_stat_n(st, STAT_ITLB_CACHE, ctx, cpu) != 0)
			print_itlb_cache_misses(config, cpu, evsel, avg, out, st);
		else
			print_metric(config, ctxp, NULL, NULL, "of all iTLB cache hits", 0);
	} else if (
		evsel->attr.type == PERF_TYPE_HW_CACHE &&
		evsel->attr.config ==  ( PERF_COUNT_HW_CACHE_LL |
					((PERF_COUNT_HW_CACHE_OP_READ) << 8) |
					 ((PERF_COUNT_HW_CACHE_RESULT_MISS) << 16))) {

		if (runtime_stat_n(st, STAT_LL_CACHE, ctx, cpu) != 0)
			print_ll_cache_misses(config, cpu, evsel, avg, out, st);
		else
			print_metric(config, ctxp, NULL, NULL, "of all LL-cache hits", 0);
	} else if (perf_evsel__match(evsel, HARDWARE, HW_CACHE_MISSES)) {
		total = runtime_stat_avg(st, STAT_CACHEREFS, ctx, cpu);

		if (total)
			ratio = avg * 100 / total;

		if (runtime_stat_n(st, STAT_CACHEREFS, ctx, cpu) != 0)
			print_metric(config, ctxp, NULL, "%8.3f %%",
				     "of all cache refs", ratio);
		else
			print_metric(config, ctxp, NULL, NULL, "of all cache refs", 0);
	} else if (perf_evsel__match(evsel, HARDWARE, HW_STALLED_CYCLES_FRONTEND)) {
		print_stalled_cycles_frontend(config, cpu, evsel, avg, out, st);
	} else if (perf_evsel__match(evsel, HARDWARE, HW_STALLED_CYCLES_BACKEND)) {
		print_stalled_cycles_backend(config, cpu, evsel, avg, out, st);
	} else if (perf_evsel__match(evsel, HARDWARE, HW_CPU_CYCLES)) {
		total = runtime_stat_avg(st, STAT_NSECS, 0, cpu);

		if (total) {
			ratio = avg / total;
			print_metric(config, ctxp, NULL, "%8.3f", "GHz", ratio);
		} else {
			print_metric(config, ctxp, NULL, NULL, "GHz", 0);
		}
	} else if (perf_stat_evsel__is(evsel, CYCLES_IN_TX)) {
		total = runtime_stat_avg(st, STAT_CYCLES, ctx, cpu);

		if (total)
			print_metric(config, ctxp, NULL,
					"%7.2f%%", "transactional cycles",
					100.0 * (avg / total));
		else
			print_metric(config, ctxp, NULL, NULL, "transactional cycles",
				     0);
	} else if (perf_stat_evsel__is(evsel, CYCLES_IN_TX_CP)) {
		total = runtime_stat_avg(st, STAT_CYCLES, ctx, cpu);
		total2 = runtime_stat_avg(st, STAT_CYCLES_IN_TX, ctx, cpu);

		if (total2 < avg)
			total2 = avg;
		if (total)
			print_metric(config, ctxp, NULL, "%7.2f%%", "aborted cycles",
				100.0 * ((total2-avg) / total));
		else
			print_metric(config, ctxp, NULL, NULL, "aborted cycles", 0);
	} else if (perf_stat_evsel__is(evsel, TRANSACTION_START)) {
		total = runtime_stat_avg(st, STAT_CYCLES_IN_TX,
					 ctx, cpu);

		if (avg)
			ratio = total / avg;

		if (runtime_stat_n(st, STAT_CYCLES_IN_TX, ctx, cpu) != 0)
			print_metric(config, ctxp, NULL, "%8.0f",
				     "cycles / transaction", ratio);
		else
			print_metric(config, ctxp, NULL, NULL, "cycles / transaction",
				      0);
	} else if (perf_stat_evsel__is(evsel, ELISION_START)) {
		total = runtime_stat_avg(st, STAT_CYCLES_IN_TX,
					 ctx, cpu);

		if (avg)
			ratio = total / avg;

		print_metric(config, ctxp, NULL, "%8.0f", "cycles / elision", ratio);
	} else if (perf_evsel__is_clock(evsel)) {
		if ((ratio = avg_stats(&walltime_nsecs_stats)) != 0)
			print_metric(config, ctxp, NULL, "%8.3f", "CPUs utilized",
				     avg / (ratio * evsel->scale));
		else
			print_metric(config, ctxp, NULL, NULL, "CPUs utilized", 0);
	} else if (perf_stat_evsel__is(evsel, TOPDOWN_FETCH_BUBBLES)) {
		double fe_bound = td_fe_bound(ctx, cpu, st);

		if (fe_bound > 0.2)
			color = PERF_COLOR_RED;
		print_metric(config, ctxp, color, "%8.1f%%", "frontend bound",
				fe_bound * 100.);
	} else if (perf_stat_evsel__is(evsel, TOPDOWN_SLOTS_RETIRED)) {
		double retiring = td_retiring(ctx, cpu, st);

		if (retiring > 0.7)
			color = PERF_COLOR_GREEN;
		print_metric(config, ctxp, color, "%8.1f%%", "retiring",
				retiring * 100.);
	} else if (perf_stat_evsel__is(evsel, TOPDOWN_RECOVERY_BUBBLES)) {
		double bad_spec = td_bad_spec(ctx, cpu, st);

		if (bad_spec > 0.1)
			color = PERF_COLOR_RED;
		print_metric(config, ctxp, color, "%8.1f%%", "bad speculation",
				bad_spec * 100.);
	} else if (perf_stat_evsel__is(evsel, TOPDOWN_SLOTS_ISSUED)) {
		double be_bound = td_be_bound(ctx, cpu, st);
		const char *name = "backend bound";
		static int have_recovery_bubbles = -1;

		/* In case the CPU does not support topdown-recovery-bubbles */
		if (have_recovery_bubbles < 0)
			have_recovery_bubbles = pmu_have_event("cpu",
					"topdown-recovery-bubbles");
		if (!have_recovery_bubbles)
			name = "backend bound/bad spec";

		if (be_bound > 0.2)
			color = PERF_COLOR_RED;
		if (td_total_slots(ctx, cpu, st) > 0)
			print_metric(config, ctxp, color, "%8.1f%%", name,
					be_bound * 100.);
		else
			print_metric(config, ctxp, NULL, NULL, name, 0);
	} else if (evsel->metric_expr) {
		generic_metric(config, evsel->metric_expr, evsel->metric_events, evsel->name,
				evsel->metric_name, avg, cpu, out, st);
	} else if (runtime_stat_n(st, STAT_NSECS, 0, cpu) != 0) {
		char unit = 'M';
		char unit_buf[10];

		total = runtime_stat_avg(st, STAT_NSECS, 0, cpu);

		if (total)
			ratio = 1000.0 * avg / total;
		if (ratio < 0.001) {
			ratio *= 1000;
			unit = 'K';
		}
		snprintf(unit_buf, sizeof(unit_buf), "%c/sec", unit);
		print_metric(config, ctxp, NULL, "%8.3f", unit_buf, ratio);
	} else if (perf_stat_evsel__is(evsel, SMI_NUM)) {
		print_smi_cost(config, cpu, evsel, out, st);
	} else {
		num = 0;
	}

	if ((me = metricgroup__lookup(metric_events, evsel, false)) != NULL) {
		struct metric_expr *mexp;

		list_for_each_entry (mexp, &me->head, nd) {
			if (num++ > 0)
				out->new_line(config, ctxp);
			generic_metric(config, mexp->metric_expr, mexp->metric_events,
					evsel->name, mexp->metric_name,
					avg, cpu, out, st);
		}
	}
	if (num == 0)
		print_metric(config, ctxp, NULL, NULL, NULL, 0);
}
1030