1 // SPDX-License-Identifier: GPL-2.0 2 #include <stdio.h> 3 #include "evsel.h" 4 #include "stat.h" 5 #include "color.h" 6 #include "pmu.h" 7 #include "rblist.h" 8 #include "evlist.h" 9 #include "expr.h" 10 #include "metricgroup.h" 11 #include "cgroup.h" 12 #include <linux/zalloc.h> 13 14 /* 15 * AGGR_GLOBAL: Use CPU 0 16 * AGGR_SOCKET: Use first CPU of socket 17 * AGGR_DIE: Use first CPU of die 18 * AGGR_CORE: Use first CPU of core 19 * AGGR_NONE: Use matching CPU 20 * AGGR_THREAD: Not supported? 21 */ 22 23 struct runtime_stat rt_stat; 24 struct stats walltime_nsecs_stats; 25 26 struct saved_value { 27 struct rb_node rb_node; 28 struct evsel *evsel; 29 enum stat_type type; 30 int ctx; 31 int cpu; 32 struct cgroup *cgrp; 33 struct runtime_stat *stat; 34 struct stats stats; 35 u64 metric_total; 36 int metric_other; 37 }; 38 39 static int saved_value_cmp(struct rb_node *rb_node, const void *entry) 40 { 41 struct saved_value *a = container_of(rb_node, 42 struct saved_value, 43 rb_node); 44 const struct saved_value *b = entry; 45 46 if (a->cpu != b->cpu) 47 return a->cpu - b->cpu; 48 49 /* 50 * Previously the rbtree was used to link generic metrics. 51 * The keys were evsel/cpu. Now the rbtree is extended to support 52 * per-thread shadow stats. For shadow stats case, the keys 53 * are cpu/type/ctx/stat (evsel is NULL). For generic metrics 54 * case, the keys are still evsel/cpu (type/ctx/stat are 0 or NULL). 55 */ 56 if (a->type != b->type) 57 return a->type - b->type; 58 59 if (a->ctx != b->ctx) 60 return a->ctx - b->ctx; 61 62 if (a->cgrp != b->cgrp) 63 return (char *)a->cgrp < (char *)b->cgrp ? -1 : +1; 64 65 if (a->evsel == NULL && b->evsel == NULL) { 66 if (a->stat == b->stat) 67 return 0; 68 69 if ((char *)a->stat < (char *)b->stat) 70 return -1; 71 72 return 1; 73 } 74 75 if (a->evsel == b->evsel) 76 return 0; 77 if ((char *)a->evsel < (char *)b->evsel) 78 return -1; 79 return +1; 80 } 81 82 static struct rb_node *saved_value_new(struct rblist *rblist __maybe_unused, 83 const void *entry) 84 { 85 struct saved_value *nd = malloc(sizeof(struct saved_value)); 86 87 if (!nd) 88 return NULL; 89 memcpy(nd, entry, sizeof(struct saved_value)); 90 return &nd->rb_node; 91 } 92 93 static void saved_value_delete(struct rblist *rblist __maybe_unused, 94 struct rb_node *rb_node) 95 { 96 struct saved_value *v; 97 98 BUG_ON(!rb_node); 99 v = container_of(rb_node, struct saved_value, rb_node); 100 free(v); 101 } 102 103 static struct saved_value *saved_value_lookup(struct evsel *evsel, 104 int cpu, 105 bool create, 106 enum stat_type type, 107 int ctx, 108 struct runtime_stat *st, 109 struct cgroup *cgrp) 110 { 111 struct rblist *rblist; 112 struct rb_node *nd; 113 struct saved_value dm = { 114 .cpu = cpu, 115 .evsel = evsel, 116 .type = type, 117 .ctx = ctx, 118 .stat = st, 119 .cgrp = cgrp, 120 }; 121 122 rblist = &st->value_list; 123 124 /* don't use context info for clock events */ 125 if (type == STAT_NSECS) 126 dm.ctx = 0; 127 128 nd = rblist__find(rblist, &dm); 129 if (nd) 130 return container_of(nd, struct saved_value, rb_node); 131 if (create) { 132 rblist__add_node(rblist, &dm); 133 nd = rblist__find(rblist, &dm); 134 if (nd) 135 return container_of(nd, struct saved_value, rb_node); 136 } 137 return NULL; 138 } 139 140 void runtime_stat__init(struct runtime_stat *st) 141 { 142 struct rblist *rblist = &st->value_list; 143 144 rblist__init(rblist); 145 rblist->node_cmp = saved_value_cmp; 146 rblist->node_new = saved_value_new; 147 rblist->node_delete = saved_value_delete; 148 } 149 150 void runtime_stat__exit(struct runtime_stat *st) 151 { 152 rblist__exit(&st->value_list); 153 } 154 155 void perf_stat__init_shadow_stats(void) 156 { 157 runtime_stat__init(&rt_stat); 158 } 159 160 static int evsel_context(struct evsel *evsel) 161 { 162 int ctx = 0; 163 164 if (evsel->core.attr.exclude_kernel) 165 ctx |= CTX_BIT_KERNEL; 166 if (evsel->core.attr.exclude_user) 167 ctx |= CTX_BIT_USER; 168 if (evsel->core.attr.exclude_hv) 169 ctx |= CTX_BIT_HV; 170 if (evsel->core.attr.exclude_host) 171 ctx |= CTX_BIT_HOST; 172 if (evsel->core.attr.exclude_idle) 173 ctx |= CTX_BIT_IDLE; 174 175 return ctx; 176 } 177 178 static void reset_stat(struct runtime_stat *st) 179 { 180 struct rblist *rblist; 181 struct rb_node *pos, *next; 182 183 rblist = &st->value_list; 184 next = rb_first_cached(&rblist->entries); 185 while (next) { 186 pos = next; 187 next = rb_next(pos); 188 memset(&container_of(pos, struct saved_value, rb_node)->stats, 189 0, 190 sizeof(struct stats)); 191 } 192 } 193 194 void perf_stat__reset_shadow_stats(void) 195 { 196 reset_stat(&rt_stat); 197 memset(&walltime_nsecs_stats, 0, sizeof(walltime_nsecs_stats)); 198 } 199 200 void perf_stat__reset_shadow_per_stat(struct runtime_stat *st) 201 { 202 reset_stat(st); 203 } 204 205 struct runtime_stat_data { 206 int ctx; 207 struct cgroup *cgrp; 208 }; 209 210 static void update_runtime_stat(struct runtime_stat *st, 211 enum stat_type type, 212 int cpu, u64 count, 213 struct runtime_stat_data *rsd) 214 { 215 struct saved_value *v = saved_value_lookup(NULL, cpu, true, type, 216 rsd->ctx, st, rsd->cgrp); 217 218 if (v) 219 update_stats(&v->stats, count); 220 } 221 222 /* 223 * Update various tracking values we maintain to print 224 * more semantic information such as miss/hit ratios, 225 * instruction rates, etc: 226 */ 227 void perf_stat__update_shadow_stats(struct evsel *counter, u64 count, 228 int cpu, struct runtime_stat *st) 229 { 230 u64 count_ns = count; 231 struct saved_value *v; 232 struct runtime_stat_data rsd = { 233 .ctx = evsel_context(counter), 234 .cgrp = counter->cgrp, 235 }; 236 237 count *= counter->scale; 238 239 if (evsel__is_clock(counter)) 240 update_runtime_stat(st, STAT_NSECS, cpu, count_ns, &rsd); 241 else if (evsel__match(counter, HARDWARE, HW_CPU_CYCLES)) 242 update_runtime_stat(st, STAT_CYCLES, cpu, count, &rsd); 243 else if (perf_stat_evsel__is(counter, CYCLES_IN_TX)) 244 update_runtime_stat(st, STAT_CYCLES_IN_TX, cpu, count, &rsd); 245 else if (perf_stat_evsel__is(counter, TRANSACTION_START)) 246 update_runtime_stat(st, STAT_TRANSACTION, cpu, count, &rsd); 247 else if (perf_stat_evsel__is(counter, ELISION_START)) 248 update_runtime_stat(st, STAT_ELISION, cpu, count, &rsd); 249 else if (perf_stat_evsel__is(counter, TOPDOWN_TOTAL_SLOTS)) 250 update_runtime_stat(st, STAT_TOPDOWN_TOTAL_SLOTS, 251 cpu, count, &rsd); 252 else if (perf_stat_evsel__is(counter, TOPDOWN_SLOTS_ISSUED)) 253 update_runtime_stat(st, STAT_TOPDOWN_SLOTS_ISSUED, 254 cpu, count, &rsd); 255 else if (perf_stat_evsel__is(counter, TOPDOWN_SLOTS_RETIRED)) 256 update_runtime_stat(st, STAT_TOPDOWN_SLOTS_RETIRED, 257 cpu, count, &rsd); 258 else if (perf_stat_evsel__is(counter, TOPDOWN_FETCH_BUBBLES)) 259 update_runtime_stat(st, STAT_TOPDOWN_FETCH_BUBBLES, 260 cpu, count, &rsd); 261 else if (perf_stat_evsel__is(counter, TOPDOWN_RECOVERY_BUBBLES)) 262 update_runtime_stat(st, STAT_TOPDOWN_RECOVERY_BUBBLES, 263 cpu, count, &rsd); 264 else if (perf_stat_evsel__is(counter, TOPDOWN_RETIRING)) 265 update_runtime_stat(st, STAT_TOPDOWN_RETIRING, 266 cpu, count, &rsd); 267 else if (perf_stat_evsel__is(counter, TOPDOWN_BAD_SPEC)) 268 update_runtime_stat(st, STAT_TOPDOWN_BAD_SPEC, 269 cpu, count, &rsd); 270 else if (perf_stat_evsel__is(counter, TOPDOWN_FE_BOUND)) 271 update_runtime_stat(st, STAT_TOPDOWN_FE_BOUND, 272 cpu, count, &rsd); 273 else if (perf_stat_evsel__is(counter, TOPDOWN_BE_BOUND)) 274 update_runtime_stat(st, STAT_TOPDOWN_BE_BOUND, 275 cpu, count, &rsd); 276 else if (evsel__match(counter, HARDWARE, HW_STALLED_CYCLES_FRONTEND)) 277 update_runtime_stat(st, STAT_STALLED_CYCLES_FRONT, 278 cpu, count, &rsd); 279 else if (evsel__match(counter, HARDWARE, HW_STALLED_CYCLES_BACKEND)) 280 update_runtime_stat(st, STAT_STALLED_CYCLES_BACK, 281 cpu, count, &rsd); 282 else if (evsel__match(counter, HARDWARE, HW_BRANCH_INSTRUCTIONS)) 283 update_runtime_stat(st, STAT_BRANCHES, cpu, count, &rsd); 284 else if (evsel__match(counter, HARDWARE, HW_CACHE_REFERENCES)) 285 update_runtime_stat(st, STAT_CACHEREFS, cpu, count, &rsd); 286 else if (evsel__match(counter, HW_CACHE, HW_CACHE_L1D)) 287 update_runtime_stat(st, STAT_L1_DCACHE, cpu, count, &rsd); 288 else if (evsel__match(counter, HW_CACHE, HW_CACHE_L1I)) 289 update_runtime_stat(st, STAT_L1_ICACHE, cpu, count, &rsd); 290 else if (evsel__match(counter, HW_CACHE, HW_CACHE_LL)) 291 update_runtime_stat(st, STAT_LL_CACHE, cpu, count, &rsd); 292 else if (evsel__match(counter, HW_CACHE, HW_CACHE_DTLB)) 293 update_runtime_stat(st, STAT_DTLB_CACHE, cpu, count, &rsd); 294 else if (evsel__match(counter, HW_CACHE, HW_CACHE_ITLB)) 295 update_runtime_stat(st, STAT_ITLB_CACHE, cpu, count, &rsd); 296 else if (perf_stat_evsel__is(counter, SMI_NUM)) 297 update_runtime_stat(st, STAT_SMI_NUM, cpu, count, &rsd); 298 else if (perf_stat_evsel__is(counter, APERF)) 299 update_runtime_stat(st, STAT_APERF, cpu, count, &rsd); 300 301 if (counter->collect_stat) { 302 v = saved_value_lookup(counter, cpu, true, STAT_NONE, 0, st, 303 rsd.cgrp); 304 update_stats(&v->stats, count); 305 if (counter->metric_leader) 306 v->metric_total += count; 307 } else if (counter->metric_leader) { 308 v = saved_value_lookup(counter->metric_leader, 309 cpu, true, STAT_NONE, 0, st, rsd.cgrp); 310 v->metric_total += count; 311 v->metric_other++; 312 } 313 } 314 315 /* used for get_ratio_color() */ 316 enum grc_type { 317 GRC_STALLED_CYCLES_FE, 318 GRC_STALLED_CYCLES_BE, 319 GRC_CACHE_MISSES, 320 GRC_MAX_NR 321 }; 322 323 static const char *get_ratio_color(enum grc_type type, double ratio) 324 { 325 static const double grc_table[GRC_MAX_NR][3] = { 326 [GRC_STALLED_CYCLES_FE] = { 50.0, 30.0, 10.0 }, 327 [GRC_STALLED_CYCLES_BE] = { 75.0, 50.0, 20.0 }, 328 [GRC_CACHE_MISSES] = { 20.0, 10.0, 5.0 }, 329 }; 330 const char *color = PERF_COLOR_NORMAL; 331 332 if (ratio > grc_table[type][0]) 333 color = PERF_COLOR_RED; 334 else if (ratio > grc_table[type][1]) 335 color = PERF_COLOR_MAGENTA; 336 else if (ratio > grc_table[type][2]) 337 color = PERF_COLOR_YELLOW; 338 339 return color; 340 } 341 342 static struct evsel *perf_stat__find_event(struct evlist *evsel_list, 343 const char *name) 344 { 345 struct evsel *c2; 346 347 evlist__for_each_entry (evsel_list, c2) { 348 if (!strcasecmp(c2->name, name) && !c2->collect_stat) 349 return c2; 350 } 351 return NULL; 352 } 353 354 /* Mark MetricExpr target events and link events using them to them. */ 355 void perf_stat__collect_metric_expr(struct evlist *evsel_list) 356 { 357 struct evsel *counter, *leader, **metric_events, *oc; 358 bool found; 359 struct expr_parse_ctx ctx; 360 struct hashmap_entry *cur; 361 size_t bkt; 362 int i; 363 364 expr__ctx_init(&ctx); 365 evlist__for_each_entry(evsel_list, counter) { 366 bool invalid = false; 367 368 leader = counter->leader; 369 if (!counter->metric_expr) 370 continue; 371 372 expr__ctx_clear(&ctx); 373 metric_events = counter->metric_events; 374 if (!metric_events) { 375 if (expr__find_other(counter->metric_expr, 376 counter->name, 377 &ctx, 1) < 0) 378 continue; 379 380 metric_events = calloc(sizeof(struct evsel *), 381 hashmap__size(&ctx.ids) + 1); 382 if (!metric_events) { 383 expr__ctx_clear(&ctx); 384 return; 385 } 386 counter->metric_events = metric_events; 387 } 388 389 i = 0; 390 hashmap__for_each_entry((&ctx.ids), cur, bkt) { 391 const char *metric_name = (const char *)cur->key; 392 393 found = false; 394 if (leader) { 395 /* Search in group */ 396 for_each_group_member (oc, leader) { 397 if (!strcasecmp(oc->name, 398 metric_name) && 399 !oc->collect_stat) { 400 found = true; 401 break; 402 } 403 } 404 } 405 if (!found) { 406 /* Search ignoring groups */ 407 oc = perf_stat__find_event(evsel_list, 408 metric_name); 409 } 410 if (!oc) { 411 /* Deduping one is good enough to handle duplicated PMUs. */ 412 static char *printed; 413 414 /* 415 * Adding events automatically would be difficult, because 416 * it would risk creating groups that are not schedulable. 417 * perf stat doesn't understand all the scheduling constraints 418 * of events. So we ask the user instead to add the missing 419 * events. 420 */ 421 if (!printed || 422 strcasecmp(printed, metric_name)) { 423 fprintf(stderr, 424 "Add %s event to groups to get metric expression for %s\n", 425 metric_name, 426 counter->name); 427 printed = strdup(metric_name); 428 } 429 invalid = true; 430 continue; 431 } 432 metric_events[i++] = oc; 433 oc->collect_stat = true; 434 } 435 metric_events[i] = NULL; 436 if (invalid) { 437 free(metric_events); 438 counter->metric_events = NULL; 439 counter->metric_expr = NULL; 440 } 441 } 442 expr__ctx_clear(&ctx); 443 } 444 445 static double runtime_stat_avg(struct runtime_stat *st, 446 enum stat_type type, int cpu, 447 struct runtime_stat_data *rsd) 448 { 449 struct saved_value *v; 450 451 v = saved_value_lookup(NULL, cpu, false, type, rsd->ctx, st, rsd->cgrp); 452 if (!v) 453 return 0.0; 454 455 return avg_stats(&v->stats); 456 } 457 458 static double runtime_stat_n(struct runtime_stat *st, 459 enum stat_type type, int cpu, 460 struct runtime_stat_data *rsd) 461 { 462 struct saved_value *v; 463 464 v = saved_value_lookup(NULL, cpu, false, type, rsd->ctx, st, rsd->cgrp); 465 if (!v) 466 return 0.0; 467 468 return v->stats.n; 469 } 470 471 static void print_stalled_cycles_frontend(struct perf_stat_config *config, 472 int cpu, double avg, 473 struct perf_stat_output_ctx *out, 474 struct runtime_stat *st, 475 struct runtime_stat_data *rsd) 476 { 477 double total, ratio = 0.0; 478 const char *color; 479 480 total = runtime_stat_avg(st, STAT_CYCLES, cpu, rsd); 481 482 if (total) 483 ratio = avg / total * 100.0; 484 485 color = get_ratio_color(GRC_STALLED_CYCLES_FE, ratio); 486 487 if (ratio) 488 out->print_metric(config, out->ctx, color, "%7.2f%%", "frontend cycles idle", 489 ratio); 490 else 491 out->print_metric(config, out->ctx, NULL, NULL, "frontend cycles idle", 0); 492 } 493 494 static void print_stalled_cycles_backend(struct perf_stat_config *config, 495 int cpu, double avg, 496 struct perf_stat_output_ctx *out, 497 struct runtime_stat *st, 498 struct runtime_stat_data *rsd) 499 { 500 double total, ratio = 0.0; 501 const char *color; 502 503 total = runtime_stat_avg(st, STAT_CYCLES, cpu, rsd); 504 505 if (total) 506 ratio = avg / total * 100.0; 507 508 color = get_ratio_color(GRC_STALLED_CYCLES_BE, ratio); 509 510 out->print_metric(config, out->ctx, color, "%7.2f%%", "backend cycles idle", ratio); 511 } 512 513 static void print_branch_misses(struct perf_stat_config *config, 514 int cpu, double avg, 515 struct perf_stat_output_ctx *out, 516 struct runtime_stat *st, 517 struct runtime_stat_data *rsd) 518 { 519 double total, ratio = 0.0; 520 const char *color; 521 522 total = runtime_stat_avg(st, STAT_BRANCHES, cpu, rsd); 523 524 if (total) 525 ratio = avg / total * 100.0; 526 527 color = get_ratio_color(GRC_CACHE_MISSES, ratio); 528 529 out->print_metric(config, out->ctx, color, "%7.2f%%", "of all branches", ratio); 530 } 531 532 static void print_l1_dcache_misses(struct perf_stat_config *config, 533 int cpu, double avg, 534 struct perf_stat_output_ctx *out, 535 struct runtime_stat *st, 536 struct runtime_stat_data *rsd) 537 { 538 double total, ratio = 0.0; 539 const char *color; 540 541 total = runtime_stat_avg(st, STAT_L1_DCACHE, cpu, rsd); 542 543 if (total) 544 ratio = avg / total * 100.0; 545 546 color = get_ratio_color(GRC_CACHE_MISSES, ratio); 547 548 out->print_metric(config, out->ctx, color, "%7.2f%%", "of all L1-dcache accesses", ratio); 549 } 550 551 static void print_l1_icache_misses(struct perf_stat_config *config, 552 int cpu, double avg, 553 struct perf_stat_output_ctx *out, 554 struct runtime_stat *st, 555 struct runtime_stat_data *rsd) 556 { 557 double total, ratio = 0.0; 558 const char *color; 559 560 total = runtime_stat_avg(st, STAT_L1_ICACHE, cpu, rsd); 561 562 if (total) 563 ratio = avg / total * 100.0; 564 565 color = get_ratio_color(GRC_CACHE_MISSES, ratio); 566 out->print_metric(config, out->ctx, color, "%7.2f%%", "of all L1-icache accesses", ratio); 567 } 568 569 static void print_dtlb_cache_misses(struct perf_stat_config *config, 570 int cpu, double avg, 571 struct perf_stat_output_ctx *out, 572 struct runtime_stat *st, 573 struct runtime_stat_data *rsd) 574 { 575 double total, ratio = 0.0; 576 const char *color; 577 578 total = runtime_stat_avg(st, STAT_DTLB_CACHE, cpu, rsd); 579 580 if (total) 581 ratio = avg / total * 100.0; 582 583 color = get_ratio_color(GRC_CACHE_MISSES, ratio); 584 out->print_metric(config, out->ctx, color, "%7.2f%%", "of all dTLB cache accesses", ratio); 585 } 586 587 static void print_itlb_cache_misses(struct perf_stat_config *config, 588 int cpu, double avg, 589 struct perf_stat_output_ctx *out, 590 struct runtime_stat *st, 591 struct runtime_stat_data *rsd) 592 { 593 double total, ratio = 0.0; 594 const char *color; 595 596 total = runtime_stat_avg(st, STAT_ITLB_CACHE, cpu, rsd); 597 598 if (total) 599 ratio = avg / total * 100.0; 600 601 color = get_ratio_color(GRC_CACHE_MISSES, ratio); 602 out->print_metric(config, out->ctx, color, "%7.2f%%", "of all iTLB cache accesses", ratio); 603 } 604 605 static void print_ll_cache_misses(struct perf_stat_config *config, 606 int cpu, double avg, 607 struct perf_stat_output_ctx *out, 608 struct runtime_stat *st, 609 struct runtime_stat_data *rsd) 610 { 611 double total, ratio = 0.0; 612 const char *color; 613 614 total = runtime_stat_avg(st, STAT_LL_CACHE, cpu, rsd); 615 616 if (total) 617 ratio = avg / total * 100.0; 618 619 color = get_ratio_color(GRC_CACHE_MISSES, ratio); 620 out->print_metric(config, out->ctx, color, "%7.2f%%", "of all LL-cache accesses", ratio); 621 } 622 623 /* 624 * High level "TopDown" CPU core pipe line bottleneck break down. 625 * 626 * Basic concept following 627 * Yasin, A Top Down Method for Performance analysis and Counter architecture 628 * ISPASS14 629 * 630 * The CPU pipeline is divided into 4 areas that can be bottlenecks: 631 * 632 * Frontend -> Backend -> Retiring 633 * BadSpeculation in addition means out of order execution that is thrown away 634 * (for example branch mispredictions) 635 * Frontend is instruction decoding. 636 * Backend is execution, like computation and accessing data in memory 637 * Retiring is good execution that is not directly bottlenecked 638 * 639 * The formulas are computed in slots. 640 * A slot is an entry in the pipeline each for the pipeline width 641 * (for example a 4-wide pipeline has 4 slots for each cycle) 642 * 643 * Formulas: 644 * BadSpeculation = ((SlotsIssued - SlotsRetired) + RecoveryBubbles) / 645 * TotalSlots 646 * Retiring = SlotsRetired / TotalSlots 647 * FrontendBound = FetchBubbles / TotalSlots 648 * BackendBound = 1.0 - BadSpeculation - Retiring - FrontendBound 649 * 650 * The kernel provides the mapping to the low level CPU events and any scaling 651 * needed for the CPU pipeline width, for example: 652 * 653 * TotalSlots = Cycles * 4 654 * 655 * The scaling factor is communicated in the sysfs unit. 656 * 657 * In some cases the CPU may not be able to measure all the formulas due to 658 * missing events. In this case multiple formulas are combined, as possible. 659 * 660 * Full TopDown supports more levels to sub-divide each area: for example 661 * BackendBound into computing bound and memory bound. For now we only 662 * support Level 1 TopDown. 663 */ 664 665 static double sanitize_val(double x) 666 { 667 if (x < 0 && x >= -0.02) 668 return 0.0; 669 return x; 670 } 671 672 static double td_total_slots(int cpu, struct runtime_stat *st, 673 struct runtime_stat_data *rsd) 674 { 675 return runtime_stat_avg(st, STAT_TOPDOWN_TOTAL_SLOTS, cpu, rsd); 676 } 677 678 static double td_bad_spec(int cpu, struct runtime_stat *st, 679 struct runtime_stat_data *rsd) 680 { 681 double bad_spec = 0; 682 double total_slots; 683 double total; 684 685 total = runtime_stat_avg(st, STAT_TOPDOWN_SLOTS_ISSUED, cpu, rsd) - 686 runtime_stat_avg(st, STAT_TOPDOWN_SLOTS_RETIRED, cpu, rsd) + 687 runtime_stat_avg(st, STAT_TOPDOWN_RECOVERY_BUBBLES, cpu, rsd); 688 689 total_slots = td_total_slots(cpu, st, rsd); 690 if (total_slots) 691 bad_spec = total / total_slots; 692 return sanitize_val(bad_spec); 693 } 694 695 static double td_retiring(int cpu, struct runtime_stat *st, 696 struct runtime_stat_data *rsd) 697 { 698 double retiring = 0; 699 double total_slots = td_total_slots(cpu, st, rsd); 700 double ret_slots = runtime_stat_avg(st, STAT_TOPDOWN_SLOTS_RETIRED, 701 cpu, rsd); 702 703 if (total_slots) 704 retiring = ret_slots / total_slots; 705 return retiring; 706 } 707 708 static double td_fe_bound(int cpu, struct runtime_stat *st, 709 struct runtime_stat_data *rsd) 710 { 711 double fe_bound = 0; 712 double total_slots = td_total_slots(cpu, st, rsd); 713 double fetch_bub = runtime_stat_avg(st, STAT_TOPDOWN_FETCH_BUBBLES, 714 cpu, rsd); 715 716 if (total_slots) 717 fe_bound = fetch_bub / total_slots; 718 return fe_bound; 719 } 720 721 static double td_be_bound(int cpu, struct runtime_stat *st, 722 struct runtime_stat_data *rsd) 723 { 724 double sum = (td_fe_bound(cpu, st, rsd) + 725 td_bad_spec(cpu, st, rsd) + 726 td_retiring(cpu, st, rsd)); 727 if (sum == 0) 728 return 0; 729 return sanitize_val(1.0 - sum); 730 } 731 732 /* 733 * Kernel reports metrics multiplied with slots. To get back 734 * the ratios we need to recreate the sum. 735 */ 736 737 static double td_metric_ratio(int cpu, enum stat_type type, 738 struct runtime_stat *stat, 739 struct runtime_stat_data *rsd) 740 { 741 double sum = runtime_stat_avg(stat, STAT_TOPDOWN_RETIRING, cpu, rsd) + 742 runtime_stat_avg(stat, STAT_TOPDOWN_FE_BOUND, cpu, rsd) + 743 runtime_stat_avg(stat, STAT_TOPDOWN_BE_BOUND, cpu, rsd) + 744 runtime_stat_avg(stat, STAT_TOPDOWN_BAD_SPEC, cpu, rsd); 745 double d = runtime_stat_avg(stat, type, cpu, rsd); 746 747 if (sum) 748 return d / sum; 749 return 0; 750 } 751 752 /* 753 * ... but only if most of the values are actually available. 754 * We allow two missing. 755 */ 756 757 static bool full_td(int cpu, struct runtime_stat *stat, 758 struct runtime_stat_data *rsd) 759 { 760 int c = 0; 761 762 if (runtime_stat_avg(stat, STAT_TOPDOWN_RETIRING, cpu, rsd) > 0) 763 c++; 764 if (runtime_stat_avg(stat, STAT_TOPDOWN_BE_BOUND, cpu, rsd) > 0) 765 c++; 766 if (runtime_stat_avg(stat, STAT_TOPDOWN_FE_BOUND, cpu, rsd) > 0) 767 c++; 768 if (runtime_stat_avg(stat, STAT_TOPDOWN_BAD_SPEC, cpu, rsd) > 0) 769 c++; 770 return c >= 2; 771 } 772 773 static void print_smi_cost(struct perf_stat_config *config, int cpu, 774 struct perf_stat_output_ctx *out, 775 struct runtime_stat *st, 776 struct runtime_stat_data *rsd) 777 { 778 double smi_num, aperf, cycles, cost = 0.0; 779 const char *color = NULL; 780 781 smi_num = runtime_stat_avg(st, STAT_SMI_NUM, cpu, rsd); 782 aperf = runtime_stat_avg(st, STAT_APERF, cpu, rsd); 783 cycles = runtime_stat_avg(st, STAT_CYCLES, cpu, rsd); 784 785 if ((cycles == 0) || (aperf == 0)) 786 return; 787 788 if (smi_num) 789 cost = (aperf - cycles) / aperf * 100.00; 790 791 if (cost > 10) 792 color = PERF_COLOR_RED; 793 out->print_metric(config, out->ctx, color, "%8.1f%%", "SMI cycles%", cost); 794 out->print_metric(config, out->ctx, NULL, "%4.0f", "SMI#", smi_num); 795 } 796 797 static int prepare_metric(struct evsel **metric_events, 798 struct metric_ref *metric_refs, 799 struct expr_parse_ctx *pctx, 800 int cpu, 801 struct runtime_stat *st) 802 { 803 double scale; 804 char *n, *pn; 805 int i, j, ret; 806 807 expr__ctx_init(pctx); 808 for (i = 0; metric_events[i]; i++) { 809 struct saved_value *v; 810 struct stats *stats; 811 u64 metric_total = 0; 812 813 if (!strcmp(metric_events[i]->name, "duration_time")) { 814 stats = &walltime_nsecs_stats; 815 scale = 1e-9; 816 } else { 817 v = saved_value_lookup(metric_events[i], cpu, false, 818 STAT_NONE, 0, st, 819 metric_events[i]->cgrp); 820 if (!v) 821 break; 822 stats = &v->stats; 823 scale = 1.0; 824 825 if (v->metric_other) 826 metric_total = v->metric_total; 827 } 828 829 n = strdup(metric_events[i]->name); 830 if (!n) 831 return -ENOMEM; 832 /* 833 * This display code with --no-merge adds [cpu] postfixes. 834 * These are not supported by the parser. Remove everything 835 * after the space. 836 */ 837 pn = strchr(n, ' '); 838 if (pn) 839 *pn = 0; 840 841 if (metric_total) 842 expr__add_id_val(pctx, n, metric_total); 843 else 844 expr__add_id_val(pctx, n, avg_stats(stats)*scale); 845 } 846 847 for (j = 0; metric_refs && metric_refs[j].metric_name; j++) { 848 ret = expr__add_ref(pctx, &metric_refs[j]); 849 if (ret) 850 return ret; 851 } 852 853 return i; 854 } 855 856 static void generic_metric(struct perf_stat_config *config, 857 const char *metric_expr, 858 struct evsel **metric_events, 859 struct metric_ref *metric_refs, 860 char *name, 861 const char *metric_name, 862 const char *metric_unit, 863 int runtime, 864 int cpu, 865 struct perf_stat_output_ctx *out, 866 struct runtime_stat *st) 867 { 868 print_metric_t print_metric = out->print_metric; 869 struct expr_parse_ctx pctx; 870 double ratio, scale; 871 int i; 872 void *ctxp = out->ctx; 873 874 i = prepare_metric(metric_events, metric_refs, &pctx, cpu, st); 875 if (i < 0) 876 return; 877 878 if (!metric_events[i]) { 879 if (expr__parse(&ratio, &pctx, metric_expr, runtime) == 0) { 880 char *unit; 881 char metric_bf[64]; 882 883 if (metric_unit && metric_name) { 884 if (perf_pmu__convert_scale(metric_unit, 885 &unit, &scale) >= 0) { 886 ratio *= scale; 887 } 888 if (strstr(metric_expr, "?")) 889 scnprintf(metric_bf, sizeof(metric_bf), 890 "%s %s_%d", unit, metric_name, runtime); 891 else 892 scnprintf(metric_bf, sizeof(metric_bf), 893 "%s %s", unit, metric_name); 894 895 print_metric(config, ctxp, NULL, "%8.1f", 896 metric_bf, ratio); 897 } else { 898 print_metric(config, ctxp, NULL, "%8.2f", 899 metric_name ? 900 metric_name : 901 out->force_header ? name : "", 902 ratio); 903 } 904 } else { 905 print_metric(config, ctxp, NULL, NULL, 906 out->force_header ? 907 (metric_name ? metric_name : name) : "", 0); 908 } 909 } else { 910 print_metric(config, ctxp, NULL, NULL, 911 out->force_header ? 912 (metric_name ? metric_name : name) : "", 0); 913 } 914 915 expr__ctx_clear(&pctx); 916 } 917 918 double test_generic_metric(struct metric_expr *mexp, int cpu, struct runtime_stat *st) 919 { 920 struct expr_parse_ctx pctx; 921 double ratio = 0.0; 922 923 if (prepare_metric(mexp->metric_events, mexp->metric_refs, &pctx, cpu, st) < 0) 924 goto out; 925 926 if (expr__parse(&ratio, &pctx, mexp->metric_expr, 1)) 927 ratio = 0.0; 928 929 out: 930 expr__ctx_clear(&pctx); 931 return ratio; 932 } 933 934 void perf_stat__print_shadow_stats(struct perf_stat_config *config, 935 struct evsel *evsel, 936 double avg, int cpu, 937 struct perf_stat_output_ctx *out, 938 struct rblist *metric_events, 939 struct runtime_stat *st) 940 { 941 void *ctxp = out->ctx; 942 print_metric_t print_metric = out->print_metric; 943 double total, ratio = 0.0, total2; 944 const char *color = NULL; 945 struct runtime_stat_data rsd = { 946 .ctx = evsel_context(evsel), 947 .cgrp = evsel->cgrp, 948 }; 949 struct metric_event *me; 950 int num = 1; 951 952 if (evsel__match(evsel, HARDWARE, HW_INSTRUCTIONS)) { 953 total = runtime_stat_avg(st, STAT_CYCLES, cpu, &rsd); 954 955 if (total) { 956 ratio = avg / total; 957 print_metric(config, ctxp, NULL, "%7.2f ", 958 "insn per cycle", ratio); 959 } else { 960 print_metric(config, ctxp, NULL, NULL, "insn per cycle", 0); 961 } 962 963 total = runtime_stat_avg(st, STAT_STALLED_CYCLES_FRONT, cpu, &rsd); 964 965 total = max(total, runtime_stat_avg(st, 966 STAT_STALLED_CYCLES_BACK, 967 cpu, &rsd)); 968 969 if (total && avg) { 970 out->new_line(config, ctxp); 971 ratio = total / avg; 972 print_metric(config, ctxp, NULL, "%7.2f ", 973 "stalled cycles per insn", 974 ratio); 975 } 976 } else if (evsel__match(evsel, HARDWARE, HW_BRANCH_MISSES)) { 977 if (runtime_stat_n(st, STAT_BRANCHES, cpu, &rsd) != 0) 978 print_branch_misses(config, cpu, avg, out, st, &rsd); 979 else 980 print_metric(config, ctxp, NULL, NULL, "of all branches", 0); 981 } else if ( 982 evsel->core.attr.type == PERF_TYPE_HW_CACHE && 983 evsel->core.attr.config == ( PERF_COUNT_HW_CACHE_L1D | 984 ((PERF_COUNT_HW_CACHE_OP_READ) << 8) | 985 ((PERF_COUNT_HW_CACHE_RESULT_MISS) << 16))) { 986 987 if (runtime_stat_n(st, STAT_L1_DCACHE, cpu, &rsd) != 0) 988 print_l1_dcache_misses(config, cpu, avg, out, st, &rsd); 989 else 990 print_metric(config, ctxp, NULL, NULL, "of all L1-dcache accesses", 0); 991 } else if ( 992 evsel->core.attr.type == PERF_TYPE_HW_CACHE && 993 evsel->core.attr.config == ( PERF_COUNT_HW_CACHE_L1I | 994 ((PERF_COUNT_HW_CACHE_OP_READ) << 8) | 995 ((PERF_COUNT_HW_CACHE_RESULT_MISS) << 16))) { 996 997 if (runtime_stat_n(st, STAT_L1_ICACHE, cpu, &rsd) != 0) 998 print_l1_icache_misses(config, cpu, avg, out, st, &rsd); 999 else 1000 print_metric(config, ctxp, NULL, NULL, "of all L1-icache accesses", 0); 1001 } else if ( 1002 evsel->core.attr.type == PERF_TYPE_HW_CACHE && 1003 evsel->core.attr.config == ( PERF_COUNT_HW_CACHE_DTLB | 1004 ((PERF_COUNT_HW_CACHE_OP_READ) << 8) | 1005 ((PERF_COUNT_HW_CACHE_RESULT_MISS) << 16))) { 1006 1007 if (runtime_stat_n(st, STAT_DTLB_CACHE, cpu, &rsd) != 0) 1008 print_dtlb_cache_misses(config, cpu, avg, out, st, &rsd); 1009 else 1010 print_metric(config, ctxp, NULL, NULL, "of all dTLB cache accesses", 0); 1011 } else if ( 1012 evsel->core.attr.type == PERF_TYPE_HW_CACHE && 1013 evsel->core.attr.config == ( PERF_COUNT_HW_CACHE_ITLB | 1014 ((PERF_COUNT_HW_CACHE_OP_READ) << 8) | 1015 ((PERF_COUNT_HW_CACHE_RESULT_MISS) << 16))) { 1016 1017 if (runtime_stat_n(st, STAT_ITLB_CACHE, cpu, &rsd) != 0) 1018 print_itlb_cache_misses(config, cpu, avg, out, st, &rsd); 1019 else 1020 print_metric(config, ctxp, NULL, NULL, "of all iTLB cache accesses", 0); 1021 } else if ( 1022 evsel->core.attr.type == PERF_TYPE_HW_CACHE && 1023 evsel->core.attr.config == ( PERF_COUNT_HW_CACHE_LL | 1024 ((PERF_COUNT_HW_CACHE_OP_READ) << 8) | 1025 ((PERF_COUNT_HW_CACHE_RESULT_MISS) << 16))) { 1026 1027 if (runtime_stat_n(st, STAT_LL_CACHE, cpu, &rsd) != 0) 1028 print_ll_cache_misses(config, cpu, avg, out, st, &rsd); 1029 else 1030 print_metric(config, ctxp, NULL, NULL, "of all LL-cache accesses", 0); 1031 } else if (evsel__match(evsel, HARDWARE, HW_CACHE_MISSES)) { 1032 total = runtime_stat_avg(st, STAT_CACHEREFS, cpu, &rsd); 1033 1034 if (total) 1035 ratio = avg * 100 / total; 1036 1037 if (runtime_stat_n(st, STAT_CACHEREFS, cpu, &rsd) != 0) 1038 print_metric(config, ctxp, NULL, "%8.3f %%", 1039 "of all cache refs", ratio); 1040 else 1041 print_metric(config, ctxp, NULL, NULL, "of all cache refs", 0); 1042 } else if (evsel__match(evsel, HARDWARE, HW_STALLED_CYCLES_FRONTEND)) { 1043 print_stalled_cycles_frontend(config, cpu, avg, out, st, &rsd); 1044 } else if (evsel__match(evsel, HARDWARE, HW_STALLED_CYCLES_BACKEND)) { 1045 print_stalled_cycles_backend(config, cpu, avg, out, st, &rsd); 1046 } else if (evsel__match(evsel, HARDWARE, HW_CPU_CYCLES)) { 1047 total = runtime_stat_avg(st, STAT_NSECS, cpu, &rsd); 1048 1049 if (total) { 1050 ratio = avg / total; 1051 print_metric(config, ctxp, NULL, "%8.3f", "GHz", ratio); 1052 } else { 1053 print_metric(config, ctxp, NULL, NULL, "Ghz", 0); 1054 } 1055 } else if (perf_stat_evsel__is(evsel, CYCLES_IN_TX)) { 1056 total = runtime_stat_avg(st, STAT_CYCLES, cpu, &rsd); 1057 1058 if (total) 1059 print_metric(config, ctxp, NULL, 1060 "%7.2f%%", "transactional cycles", 1061 100.0 * (avg / total)); 1062 else 1063 print_metric(config, ctxp, NULL, NULL, "transactional cycles", 1064 0); 1065 } else if (perf_stat_evsel__is(evsel, CYCLES_IN_TX_CP)) { 1066 total = runtime_stat_avg(st, STAT_CYCLES, cpu, &rsd); 1067 total2 = runtime_stat_avg(st, STAT_CYCLES_IN_TX, cpu, &rsd); 1068 1069 if (total2 < avg) 1070 total2 = avg; 1071 if (total) 1072 print_metric(config, ctxp, NULL, "%7.2f%%", "aborted cycles", 1073 100.0 * ((total2-avg) / total)); 1074 else 1075 print_metric(config, ctxp, NULL, NULL, "aborted cycles", 0); 1076 } else if (perf_stat_evsel__is(evsel, TRANSACTION_START)) { 1077 total = runtime_stat_avg(st, STAT_CYCLES_IN_TX, cpu, &rsd); 1078 1079 if (avg) 1080 ratio = total / avg; 1081 1082 if (runtime_stat_n(st, STAT_CYCLES_IN_TX, cpu, &rsd) != 0) 1083 print_metric(config, ctxp, NULL, "%8.0f", 1084 "cycles / transaction", ratio); 1085 else 1086 print_metric(config, ctxp, NULL, NULL, "cycles / transaction", 1087 0); 1088 } else if (perf_stat_evsel__is(evsel, ELISION_START)) { 1089 total = runtime_stat_avg(st, STAT_CYCLES_IN_TX, cpu, &rsd); 1090 1091 if (avg) 1092 ratio = total / avg; 1093 1094 print_metric(config, ctxp, NULL, "%8.0f", "cycles / elision", ratio); 1095 } else if (evsel__is_clock(evsel)) { 1096 if ((ratio = avg_stats(&walltime_nsecs_stats)) != 0) 1097 print_metric(config, ctxp, NULL, "%8.3f", "CPUs utilized", 1098 avg / (ratio * evsel->scale)); 1099 else 1100 print_metric(config, ctxp, NULL, NULL, "CPUs utilized", 0); 1101 } else if (perf_stat_evsel__is(evsel, TOPDOWN_FETCH_BUBBLES)) { 1102 double fe_bound = td_fe_bound(cpu, st, &rsd); 1103 1104 if (fe_bound > 0.2) 1105 color = PERF_COLOR_RED; 1106 print_metric(config, ctxp, color, "%8.1f%%", "frontend bound", 1107 fe_bound * 100.); 1108 } else if (perf_stat_evsel__is(evsel, TOPDOWN_SLOTS_RETIRED)) { 1109 double retiring = td_retiring(cpu, st, &rsd); 1110 1111 if (retiring > 0.7) 1112 color = PERF_COLOR_GREEN; 1113 print_metric(config, ctxp, color, "%8.1f%%", "retiring", 1114 retiring * 100.); 1115 } else if (perf_stat_evsel__is(evsel, TOPDOWN_RECOVERY_BUBBLES)) { 1116 double bad_spec = td_bad_spec(cpu, st, &rsd); 1117 1118 if (bad_spec > 0.1) 1119 color = PERF_COLOR_RED; 1120 print_metric(config, ctxp, color, "%8.1f%%", "bad speculation", 1121 bad_spec * 100.); 1122 } else if (perf_stat_evsel__is(evsel, TOPDOWN_SLOTS_ISSUED)) { 1123 double be_bound = td_be_bound(cpu, st, &rsd); 1124 const char *name = "backend bound"; 1125 static int have_recovery_bubbles = -1; 1126 1127 /* In case the CPU does not support topdown-recovery-bubbles */ 1128 if (have_recovery_bubbles < 0) 1129 have_recovery_bubbles = pmu_have_event("cpu", 1130 "topdown-recovery-bubbles"); 1131 if (!have_recovery_bubbles) 1132 name = "backend bound/bad spec"; 1133 1134 if (be_bound > 0.2) 1135 color = PERF_COLOR_RED; 1136 if (td_total_slots(cpu, st, &rsd) > 0) 1137 print_metric(config, ctxp, color, "%8.1f%%", name, 1138 be_bound * 100.); 1139 else 1140 print_metric(config, ctxp, NULL, NULL, name, 0); 1141 } else if (perf_stat_evsel__is(evsel, TOPDOWN_RETIRING) && 1142 full_td(cpu, st, &rsd)) { 1143 double retiring = td_metric_ratio(cpu, 1144 STAT_TOPDOWN_RETIRING, st, 1145 &rsd); 1146 if (retiring > 0.7) 1147 color = PERF_COLOR_GREEN; 1148 print_metric(config, ctxp, color, "%8.1f%%", "retiring", 1149 retiring * 100.); 1150 } else if (perf_stat_evsel__is(evsel, TOPDOWN_FE_BOUND) && 1151 full_td(cpu, st, &rsd)) { 1152 double fe_bound = td_metric_ratio(cpu, 1153 STAT_TOPDOWN_FE_BOUND, st, 1154 &rsd); 1155 if (fe_bound > 0.2) 1156 color = PERF_COLOR_RED; 1157 print_metric(config, ctxp, color, "%8.1f%%", "frontend bound", 1158 fe_bound * 100.); 1159 } else if (perf_stat_evsel__is(evsel, TOPDOWN_BE_BOUND) && 1160 full_td(cpu, st, &rsd)) { 1161 double be_bound = td_metric_ratio(cpu, 1162 STAT_TOPDOWN_BE_BOUND, st, 1163 &rsd); 1164 if (be_bound > 0.2) 1165 color = PERF_COLOR_RED; 1166 print_metric(config, ctxp, color, "%8.1f%%", "backend bound", 1167 be_bound * 100.); 1168 } else if (perf_stat_evsel__is(evsel, TOPDOWN_BAD_SPEC) && 1169 full_td(cpu, st, &rsd)) { 1170 double bad_spec = td_metric_ratio(cpu, 1171 STAT_TOPDOWN_BAD_SPEC, st, 1172 &rsd); 1173 if (bad_spec > 0.1) 1174 color = PERF_COLOR_RED; 1175 print_metric(config, ctxp, color, "%8.1f%%", "bad speculation", 1176 bad_spec * 100.); 1177 } else if (evsel->metric_expr) { 1178 generic_metric(config, evsel->metric_expr, evsel->metric_events, NULL, 1179 evsel->name, evsel->metric_name, NULL, 1, cpu, out, st); 1180 } else if (runtime_stat_n(st, STAT_NSECS, cpu, &rsd) != 0) { 1181 char unit = 'M'; 1182 char unit_buf[10]; 1183 1184 total = runtime_stat_avg(st, STAT_NSECS, cpu, &rsd); 1185 1186 if (total) 1187 ratio = 1000.0 * avg / total; 1188 if (ratio < 0.001) { 1189 ratio *= 1000; 1190 unit = 'K'; 1191 } 1192 snprintf(unit_buf, sizeof(unit_buf), "%c/sec", unit); 1193 print_metric(config, ctxp, NULL, "%8.3f", unit_buf, ratio); 1194 } else if (perf_stat_evsel__is(evsel, SMI_NUM)) { 1195 print_smi_cost(config, cpu, out, st, &rsd); 1196 } else { 1197 num = 0; 1198 } 1199 1200 if ((me = metricgroup__lookup(metric_events, evsel, false)) != NULL) { 1201 struct metric_expr *mexp; 1202 1203 list_for_each_entry (mexp, &me->head, nd) { 1204 if (num++ > 0) 1205 out->new_line(config, ctxp); 1206 generic_metric(config, mexp->metric_expr, mexp->metric_events, 1207 mexp->metric_refs, evsel->name, mexp->metric_name, 1208 mexp->metric_unit, mexp->runtime, cpu, out, st); 1209 } 1210 } 1211 if (num == 0) 1212 print_metric(config, ctxp, NULL, NULL, NULL, 0); 1213 } 1214