// SPDX-License-Identifier: GPL-2.0
#include <stdio.h>
#include "evsel.h"
#include "stat.h"
#include "color.h"
#include "pmu.h"
#include "rblist.h"
#include "evlist.h"
#include "expr.h"
#include "metricgroup.h"
#include <linux/zalloc.h>

/*
 * AGGR_GLOBAL: Use CPU 0
 * AGGR_SOCKET: Use first CPU of socket
 * AGGR_DIE: Use first CPU of die
 * AGGR_CORE: Use first CPU of core
 * AGGR_NONE: Use matching CPU
 * AGGR_THREAD: Not supported?
 */

struct runtime_stat rt_stat;
struct stats walltime_nsecs_stats;

struct saved_value {
	struct rb_node rb_node;
	struct evsel *evsel;
	enum stat_type type;
	int ctx;
	int cpu;
	struct runtime_stat *stat;
	struct stats stats;
	u64 metric_total;
	int metric_other;
};

static int saved_value_cmp(struct rb_node *rb_node, const void *entry)
{
	struct saved_value *a = container_of(rb_node,
					     struct saved_value,
					     rb_node);
	const struct saved_value *b = entry;

	if (a->cpu != b->cpu)
		return a->cpu - b->cpu;

	/*
	 * Previously the rbtree was used to link generic metrics.
	 * The keys were evsel/cpu. Now the rbtree is extended to support
	 * per-thread shadow stats. For shadow stats case, the keys
	 * are cpu/type/ctx/stat (evsel is NULL). For generic metrics
	 * case, the keys are still evsel/cpu (type/ctx/stat are 0 or NULL).
	 */
	if (a->type != b->type)
		return a->type - b->type;

	if (a->ctx != b->ctx)
		return a->ctx - b->ctx;

	if (a->evsel == NULL && b->evsel == NULL) {
		if (a->stat == b->stat)
			return 0;

		if ((char *)a->stat < (char *)b->stat)
			return -1;

		return 1;
	}

	if (a->evsel == b->evsel)
		return 0;
	if ((char *)a->evsel < (char *)b->evsel)
		return -1;
	return +1;
}

static struct rb_node *saved_value_new(struct rblist *rblist __maybe_unused,
				       const void *entry)
{
	struct saved_value *nd = malloc(sizeof(struct saved_value));

	if (!nd)
		return NULL;
	memcpy(nd, entry, sizeof(struct saved_value));
	return &nd->rb_node;
}

static void saved_value_delete(struct rblist *rblist __maybe_unused,
			       struct rb_node *rb_node)
{
	struct saved_value *v;

	BUG_ON(!rb_node);
	v = container_of(rb_node, struct saved_value, rb_node);
	free(v);
}

static struct saved_value *saved_value_lookup(struct evsel *evsel,
					      int cpu,
					      bool create,
					      enum stat_type type,
					      int ctx,
					      struct runtime_stat *st)
{
	struct rblist *rblist;
	struct rb_node *nd;
	struct saved_value dm = {
		.cpu = cpu,
		.evsel = evsel,
		.type = type,
		.ctx = ctx,
		.stat = st,
	};

	rblist = &st->value_list;

	nd = rblist__find(rblist, &dm);
	if (nd)
		return container_of(nd, struct saved_value, rb_node);
	if (create) {
		rblist__add_node(rblist, &dm);
		nd = rblist__find(rblist, &dm);
		if (nd)
			return container_of(nd, struct saved_value, rb_node);
	}
	return NULL;
}

void runtime_stat__init(struct runtime_stat *st)
{
	struct rblist *rblist = &st->value_list;

	rblist__init(rblist);
	rblist->node_cmp = saved_value_cmp;
	rblist->node_new = saved_value_new;
	rblist->node_delete = saved_value_delete;
}

void runtime_stat__exit(struct runtime_stat *st)
{
	rblist__exit(&st->value_list);
}

void perf_stat__init_shadow_stats(void)
{
	runtime_stat__init(&rt_stat);
}

static int evsel_context(struct evsel *evsel)
{
	int ctx = 0;

	if (evsel->core.attr.exclude_kernel)
		ctx |= CTX_BIT_KERNEL;
	if (evsel->core.attr.exclude_user)
		ctx |= CTX_BIT_USER;
	if (evsel->core.attr.exclude_hv)
		ctx |= CTX_BIT_HV;
	if (evsel->core.attr.exclude_host)
		ctx |= CTX_BIT_HOST;
	if (evsel->core.attr.exclude_idle)
		ctx |= CTX_BIT_IDLE;

	return ctx;
}

static void reset_stat(struct runtime_stat *st)
{
	struct rblist *rblist;
	struct rb_node *pos, *next;

	rblist = &st->value_list;
	next = rb_first_cached(&rblist->entries);
	while (next) {
		pos = next;
		next = rb_next(pos);
		memset(&container_of(pos, struct saved_value, rb_node)->stats,
		       0,
		       sizeof(struct stats));
	}
}

void perf_stat__reset_shadow_stats(void)
{
	reset_stat(&rt_stat);
	memset(&walltime_nsecs_stats, 0, sizeof(walltime_nsecs_stats));
}

void perf_stat__reset_shadow_per_stat(struct runtime_stat *st)
{
	reset_stat(st);
}

static void update_runtime_stat(struct runtime_stat *st,
				enum stat_type type,
				int ctx, int cpu, u64 count)
{
	struct saved_value *v = saved_value_lookup(NULL, cpu, true,
						   type, ctx, st);

	if (v)
		update_stats(&v->stats, count);
}

/*
 * Update various tracking values we maintain to print
 * more semantic information such as miss/hit ratios,
 * instruction rates, etc:
 */
void perf_stat__update_shadow_stats(struct evsel *counter, u64 count,
				    int cpu, struct runtime_stat *st)
{
	int ctx = evsel_context(counter);
	u64 count_ns = count;
	struct saved_value *v;

	count *= counter->scale;

	if (evsel__is_clock(counter))
		update_runtime_stat(st, STAT_NSECS, 0, cpu, count_ns);
	else if (evsel__match(counter, HARDWARE, HW_CPU_CYCLES))
		update_runtime_stat(st, STAT_CYCLES, ctx, cpu, count);
	else if (perf_stat_evsel__is(counter, CYCLES_IN_TX))
		update_runtime_stat(st, STAT_CYCLES_IN_TX, ctx, cpu, count);
	else if (perf_stat_evsel__is(counter, TRANSACTION_START))
		update_runtime_stat(st, STAT_TRANSACTION, ctx, cpu, count);
	else if (perf_stat_evsel__is(counter, ELISION_START))
		update_runtime_stat(st, STAT_ELISION, ctx, cpu, count);
	else if (perf_stat_evsel__is(counter, TOPDOWN_TOTAL_SLOTS))
		update_runtime_stat(st, STAT_TOPDOWN_TOTAL_SLOTS,
				    ctx, cpu, count);
	else if (perf_stat_evsel__is(counter, TOPDOWN_SLOTS_ISSUED))
		update_runtime_stat(st, STAT_TOPDOWN_SLOTS_ISSUED,
				    ctx, cpu, count);
	else if (perf_stat_evsel__is(counter, TOPDOWN_SLOTS_RETIRED))
		update_runtime_stat(st, STAT_TOPDOWN_SLOTS_RETIRED,
				    ctx, cpu, count);
	else if (perf_stat_evsel__is(counter, TOPDOWN_FETCH_BUBBLES))
		update_runtime_stat(st, STAT_TOPDOWN_FETCH_BUBBLES,
				    ctx, cpu, count);
	else if (perf_stat_evsel__is(counter, TOPDOWN_RECOVERY_BUBBLES))
		update_runtime_stat(st, STAT_TOPDOWN_RECOVERY_BUBBLES,
				    ctx, cpu, count);
	else if (perf_stat_evsel__is(counter, TOPDOWN_RETIRING))
		update_runtime_stat(st, STAT_TOPDOWN_RETIRING,
				    ctx, cpu, count);
	else if (perf_stat_evsel__is(counter, TOPDOWN_BAD_SPEC))
		update_runtime_stat(st, STAT_TOPDOWN_BAD_SPEC,
				    ctx, cpu, count);
	else if (perf_stat_evsel__is(counter, TOPDOWN_FE_BOUND))
		update_runtime_stat(st, STAT_TOPDOWN_FE_BOUND,
				    ctx, cpu, count);
	else if (perf_stat_evsel__is(counter, TOPDOWN_BE_BOUND))
		update_runtime_stat(st, STAT_TOPDOWN_BE_BOUND,
				    ctx, cpu, count);
	else if (evsel__match(counter, HARDWARE, HW_STALLED_CYCLES_FRONTEND))
		update_runtime_stat(st, STAT_STALLED_CYCLES_FRONT,
				    ctx, cpu, count);
	else if (evsel__match(counter, HARDWARE, HW_STALLED_CYCLES_BACKEND))
		update_runtime_stat(st, STAT_STALLED_CYCLES_BACK,
				    ctx, cpu, count);
	else if (evsel__match(counter, HARDWARE, HW_BRANCH_INSTRUCTIONS))
		update_runtime_stat(st, STAT_BRANCHES, ctx, cpu, count);
	else if (evsel__match(counter, HARDWARE, HW_CACHE_REFERENCES))
		update_runtime_stat(st, STAT_CACHEREFS, ctx, cpu, count);
	else if (evsel__match(counter, HW_CACHE, HW_CACHE_L1D))
		update_runtime_stat(st, STAT_L1_DCACHE, ctx, cpu, count);
	else if (evsel__match(counter, HW_CACHE, HW_CACHE_L1I))
		update_runtime_stat(st, STAT_L1_ICACHE, ctx, cpu, count);
	else if (evsel__match(counter, HW_CACHE, HW_CACHE_LL))
		update_runtime_stat(st, STAT_LL_CACHE, ctx, cpu, count);
	else if (evsel__match(counter, HW_CACHE, HW_CACHE_DTLB))
		update_runtime_stat(st, STAT_DTLB_CACHE, ctx, cpu, count);
	else if (evsel__match(counter, HW_CACHE, HW_CACHE_ITLB))
		update_runtime_stat(st, STAT_ITLB_CACHE, ctx, cpu, count);
	else if (perf_stat_evsel__is(counter, SMI_NUM))
		update_runtime_stat(st, STAT_SMI_NUM, ctx, cpu, count);
	else if (perf_stat_evsel__is(counter, APERF))
		update_runtime_stat(st, STAT_APERF, ctx, cpu, count);

	if (counter->collect_stat) {
		v = saved_value_lookup(counter, cpu, true, STAT_NONE, 0, st);
		update_stats(&v->stats, count);
		if (counter->metric_leader)
			v->metric_total += count;
	} else if (counter->metric_leader) {
		v = saved_value_lookup(counter->metric_leader,
				       cpu, true, STAT_NONE, 0, st);
		v->metric_total += count;
		v->metric_other++;
	}
}

/* used for get_ratio_color() */
enum grc_type {
	GRC_STALLED_CYCLES_FE,
	GRC_STALLED_CYCLES_BE,
	GRC_CACHE_MISSES,
	GRC_MAX_NR
};

static const char *get_ratio_color(enum grc_type type, double ratio)
{
	static const double grc_table[GRC_MAX_NR][3] = {
		[GRC_STALLED_CYCLES_FE] = { 50.0, 30.0, 10.0 },
		[GRC_STALLED_CYCLES_BE] = { 75.0, 50.0, 20.0 },
		[GRC_CACHE_MISSES]	= { 20.0, 10.0, 5.0 },
	};
	const char *color = PERF_COLOR_NORMAL;

	if (ratio > grc_table[type][0])
		color = PERF_COLOR_RED;
	else if (ratio > grc_table[type][1])
		color = PERF_COLOR_MAGENTA;
	else if (ratio > grc_table[type][2])
		color = PERF_COLOR_YELLOW;

	return color;
}

static struct evsel *perf_stat__find_event(struct evlist *evsel_list,
					   const char *name)
{
	struct evsel *c2;

	evlist__for_each_entry (evsel_list, c2) {
		if (!strcasecmp(c2->name, name) && !c2->collect_stat)
			return c2;
	}
	return NULL;
}

/* Mark MetricExpr target events and link events using them to them. */
void perf_stat__collect_metric_expr(struct evlist *evsel_list)
{
	struct evsel *counter, *leader, **metric_events, *oc;
	bool found;
	struct expr_parse_ctx ctx;
	struct hashmap_entry *cur;
	size_t bkt;
	int i;

	expr__ctx_init(&ctx);
	evlist__for_each_entry(evsel_list, counter) {
		bool invalid = false;

		leader = counter->leader;
		if (!counter->metric_expr)
			continue;

		expr__ctx_clear(&ctx);
		metric_events = counter->metric_events;
		if (!metric_events) {
			if (expr__find_other(counter->metric_expr,
					     counter->name,
					     &ctx, 1) < 0)
				continue;

			metric_events = calloc(hashmap__size(&ctx.ids) + 1,
					       sizeof(struct evsel *));
			if (!metric_events) {
				expr__ctx_clear(&ctx);
				return;
			}
			counter->metric_events = metric_events;
		}

		i = 0;
		hashmap__for_each_entry((&ctx.ids), cur, bkt) {
			const char *metric_name = (const char *)cur->key;

			found = false;
			if (leader) {
				/* Search in group */
				for_each_group_member (oc, leader) {
					if (!strcasecmp(oc->name,
							metric_name) &&
					    !oc->collect_stat) {
						found = true;
						break;
					}
				}
			}
			if (!found) {
				/* Search ignoring groups */
				oc = perf_stat__find_event(evsel_list,
							   metric_name);
			}
			if (!oc) {
				/* Deduping one is good enough to handle duplicated PMUs. */
				static char *printed;

				/*
				 * Adding events automatically would be difficult, because
				 * it would risk creating groups that are not schedulable.
				 * perf stat doesn't understand all the scheduling constraints
				 * of events. So we ask the user instead to add the missing
				 * events.
				 */
				if (!printed ||
				    strcasecmp(printed, metric_name)) {
					fprintf(stderr,
						"Add %s event to groups to get metric expression for %s\n",
						metric_name,
						counter->name);
					printed = strdup(metric_name);
				}
				invalid = true;
				continue;
			}
			metric_events[i++] = oc;
			oc->collect_stat = true;
		}
		metric_events[i] = NULL;
		if (invalid) {
			free(metric_events);
			counter->metric_events = NULL;
			counter->metric_expr = NULL;
		}
	}
	expr__ctx_clear(&ctx);
}

static double runtime_stat_avg(struct runtime_stat *st,
			       enum stat_type type, int ctx, int cpu)
{
	struct saved_value *v;

	v = saved_value_lookup(NULL, cpu, false, type, ctx, st);
	if (!v)
		return 0.0;

	return avg_stats(&v->stats);
}

static double runtime_stat_n(struct runtime_stat *st,
			     enum stat_type type, int ctx, int cpu)
{
	struct saved_value *v;

	v = saved_value_lookup(NULL, cpu, false, type, ctx, st);
	if (!v)
		return 0.0;

	return v->stats.n;
}

static void print_stalled_cycles_frontend(struct perf_stat_config *config,
					  int cpu,
					  struct evsel *evsel, double avg,
					  struct perf_stat_output_ctx *out,
					  struct runtime_stat *st)
{
	double total, ratio = 0.0;
	const char *color;
	int ctx = evsel_context(evsel);

	total = runtime_stat_avg(st, STAT_CYCLES, ctx, cpu);

	if (total)
		ratio = avg / total * 100.0;

	color = get_ratio_color(GRC_STALLED_CYCLES_FE, ratio);

	if (ratio)
		out->print_metric(config, out->ctx, color, "%7.2f%%", "frontend cycles idle",
				  ratio);
	else
		out->print_metric(config, out->ctx, NULL, NULL, "frontend cycles idle", 0);
}

static void print_stalled_cycles_backend(struct perf_stat_config *config,
					 int cpu,
					 struct evsel *evsel, double avg,
					 struct perf_stat_output_ctx *out,
					 struct runtime_stat *st)
{
	double total, ratio = 0.0;
	const char *color;
	int ctx = evsel_context(evsel);

	total = runtime_stat_avg(st, STAT_CYCLES, ctx, cpu);

	if (total)
		ratio = avg / total * 100.0;

	color = get_ratio_color(GRC_STALLED_CYCLES_BE, ratio);

	out->print_metric(config, out->ctx, color, "%7.2f%%", "backend cycles idle", ratio);
}

static void print_branch_misses(struct perf_stat_config *config,
				int cpu,
				struct evsel *evsel,
				double avg,
				struct perf_stat_output_ctx *out,
				struct runtime_stat *st)
{
	double total, ratio = 0.0;
	const char *color;
	int ctx = evsel_context(evsel);

	total = runtime_stat_avg(st, STAT_BRANCHES, ctx, cpu);

	if (total)
		ratio = avg / total * 100.0;

	color = get_ratio_color(GRC_CACHE_MISSES, ratio);

	out->print_metric(config, out->ctx, color, "%7.2f%%", "of all branches", ratio);
}

static void print_l1_dcache_misses(struct perf_stat_config *config,
				   int cpu,
				   struct evsel *evsel,
				   double avg,
				   struct perf_stat_output_ctx *out,
				   struct runtime_stat *st)
{
	double total, ratio = 0.0;
	const char *color;
	int ctx = evsel_context(evsel);

	total = runtime_stat_avg(st, STAT_L1_DCACHE, ctx, cpu);

	if (total)
		ratio = avg / total * 100.0;

	color = get_ratio_color(GRC_CACHE_MISSES, ratio);

	out->print_metric(config, out->ctx, color, "%7.2f%%", "of all L1-dcache accesses", ratio);
}

static void print_l1_icache_misses(struct perf_stat_config *config,
				   int cpu,
				   struct evsel *evsel,
				   double avg,
				   struct perf_stat_output_ctx *out,
				   struct runtime_stat *st)
{
	double total, ratio = 0.0;
	const char *color;
	int ctx = evsel_context(evsel);

	total = runtime_stat_avg(st, STAT_L1_ICACHE, ctx, cpu);

	if (total)
		ratio = avg / total * 100.0;

	color = get_ratio_color(GRC_CACHE_MISSES, ratio);
	out->print_metric(config, out->ctx, color, "%7.2f%%", "of all L1-icache accesses", ratio);
}

static void print_dtlb_cache_misses(struct perf_stat_config *config,
				    int cpu,
				    struct evsel *evsel,
				    double avg,
				    struct perf_stat_output_ctx *out,
				    struct runtime_stat *st)
{
	double total, ratio = 0.0;
	const char *color;
	int ctx = evsel_context(evsel);

	total = runtime_stat_avg(st, STAT_DTLB_CACHE, ctx, cpu);

	if (total)
		ratio = avg / total * 100.0;

	color = get_ratio_color(GRC_CACHE_MISSES, ratio);
	out->print_metric(config, out->ctx, color, "%7.2f%%", "of all dTLB cache accesses", ratio);
}

static void print_itlb_cache_misses(struct perf_stat_config *config,
				    int cpu,
				    struct evsel *evsel,
				    double avg,
				    struct perf_stat_output_ctx *out,
				    struct runtime_stat *st)
{
	double total, ratio = 0.0;
	const char *color;
	int ctx = evsel_context(evsel);

	total = runtime_stat_avg(st, STAT_ITLB_CACHE, ctx, cpu);

	if (total)
		ratio = avg / total * 100.0;

	color = get_ratio_color(GRC_CACHE_MISSES, ratio);
	out->print_metric(config, out->ctx, color, "%7.2f%%", "of all iTLB cache accesses", ratio);
}

static void print_ll_cache_misses(struct perf_stat_config *config,
				  int cpu,
				  struct evsel *evsel,
				  double avg,
				  struct perf_stat_output_ctx *out,
				  struct runtime_stat *st)
{
	double total, ratio = 0.0;
	const char *color;
	int ctx = evsel_context(evsel);

	total = runtime_stat_avg(st, STAT_LL_CACHE, ctx, cpu);

	if (total)
		ratio = avg / total * 100.0;

	color = get_ratio_color(GRC_CACHE_MISSES, ratio);
	out->print_metric(config, out->ctx, color, "%7.2f%%", "of all LL-cache accesses", ratio);
}

/*
 * High level "TopDown" CPU core pipeline bottleneck breakdown.
 *
 * Basic concept following
 * Yasin, A Top Down Method for Performance analysis and Counter architecture
 * ISPASS14
 *
 * The CPU pipeline is divided into 4 areas that can be bottlenecks:
 *
 * Frontend -> Backend -> Retiring
 * BadSpeculation in addition means out of order execution that is thrown away
 * (for example branch mispredictions)
 * Frontend is instruction decoding.
 * Backend is execution, like computation and accessing data in memory
 * Retiring is good execution that is not directly bottlenecked
 *
 * The formulas are computed in slots.
 * A slot is an entry in the pipeline for each cycle of the pipeline width
 * (for example a 4-wide pipeline has 4 slots for each cycle)
 *
 * Formulas:
 * BadSpeculation = ((SlotsIssued - SlotsRetired) + RecoveryBubbles) /
 *			TotalSlots
 * Retiring = SlotsRetired / TotalSlots
 * FrontendBound = FetchBubbles / TotalSlots
 * BackendBound = 1.0 - BadSpeculation - Retiring - FrontendBound
 *
 * The kernel provides the mapping to the low level CPU events and any scaling
 * needed for the CPU pipeline width, for example:
 *
 * TotalSlots = Cycles * 4
 *
 * The scaling factor is communicated in the sysfs unit.
 *
 * In some cases the CPU may not be able to measure all the formulas due to
 * missing events. In this case multiple formulas are combined, as possible.
 *
 * Full TopDown supports more levels to sub-divide each area: for example
 * BackendBound into computing bound and memory bound. For now we only
 * support Level 1 TopDown.
 */
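
/*
 * Illustrative worked example (hypothetical numbers, not from any
 * particular CPU): on a 4-wide core sampled over 1000 cycles, with
 *
 *	TotalSlots	= 1000 * 4			= 4000
 *	SlotsIssued	= 2600, SlotsRetired = 2000, RecoveryBubbles = 200
 *	FetchBubbles	= 800
 *
 * the Level 1 formulas above evaluate to
 *
 *	BadSpeculation	= ((2600 - 2000) + 200) / 4000	= 0.20
 *	Retiring	= 2000 / 4000			= 0.50
 *	FrontendBound	=  800 / 4000			= 0.20
 *	BackendBound	= 1.0 - 0.20 - 0.50 - 0.20	= 0.10
 */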

static double sanitize_val(double x)
{
	if (x < 0 && x >= -0.02)
		return 0.0;
	return x;
}

static double td_total_slots(int ctx, int cpu, struct runtime_stat *st)
{
	return runtime_stat_avg(st, STAT_TOPDOWN_TOTAL_SLOTS, ctx, cpu);
}

static double td_bad_spec(int ctx, int cpu, struct runtime_stat *st)
{
	double bad_spec = 0;
	double total_slots;
	double total;

	total = runtime_stat_avg(st, STAT_TOPDOWN_SLOTS_ISSUED, ctx, cpu) -
		runtime_stat_avg(st, STAT_TOPDOWN_SLOTS_RETIRED, ctx, cpu) +
		runtime_stat_avg(st, STAT_TOPDOWN_RECOVERY_BUBBLES, ctx, cpu);

	total_slots = td_total_slots(ctx, cpu, st);
	if (total_slots)
		bad_spec = total / total_slots;
	return sanitize_val(bad_spec);
}

static double td_retiring(int ctx, int cpu, struct runtime_stat *st)
{
	double retiring = 0;
	double total_slots = td_total_slots(ctx, cpu, st);
	double ret_slots = runtime_stat_avg(st, STAT_TOPDOWN_SLOTS_RETIRED,
					    ctx, cpu);

	if (total_slots)
		retiring = ret_slots / total_slots;
	return retiring;
}

static double td_fe_bound(int ctx, int cpu, struct runtime_stat *st)
{
	double fe_bound = 0;
	double total_slots = td_total_slots(ctx, cpu, st);
	double fetch_bub = runtime_stat_avg(st, STAT_TOPDOWN_FETCH_BUBBLES,
					    ctx, cpu);

	if (total_slots)
		fe_bound = fetch_bub / total_slots;
	return fe_bound;
}

static double td_be_bound(int ctx, int cpu, struct runtime_stat *st)
{
	double sum = (td_fe_bound(ctx, cpu, st) +
		      td_bad_spec(ctx, cpu, st) +
		      td_retiring(ctx, cpu, st));
	if (sum == 0)
		return 0;
	return sanitize_val(1.0 - sum);
}

/*
 * Kernel reports metrics multiplied with slots. To get back
 * the ratios we need to recreate the sum.
 */

static double td_metric_ratio(int ctx, int cpu,
			      enum stat_type type,
			      struct runtime_stat *stat)
{
	double sum = runtime_stat_avg(stat, STAT_TOPDOWN_RETIRING, ctx, cpu) +
		runtime_stat_avg(stat, STAT_TOPDOWN_FE_BOUND, ctx, cpu) +
		runtime_stat_avg(stat, STAT_TOPDOWN_BE_BOUND, ctx, cpu) +
		runtime_stat_avg(stat, STAT_TOPDOWN_BAD_SPEC, ctx, cpu);
	double d = runtime_stat_avg(stat, type, ctx, cpu);

	if (sum)
		return d / sum;
	return 0;
}

/*
 * ... but only if most of the values are actually available.
 * We allow two missing.
 */

static bool full_td(int ctx, int cpu,
		    struct runtime_stat *stat)
{
	int c = 0;

	if (runtime_stat_avg(stat, STAT_TOPDOWN_RETIRING, ctx, cpu) > 0)
		c++;
	if (runtime_stat_avg(stat, STAT_TOPDOWN_BE_BOUND, ctx, cpu) > 0)
		c++;
	if (runtime_stat_avg(stat, STAT_TOPDOWN_FE_BOUND, ctx, cpu) > 0)
		c++;
	if (runtime_stat_avg(stat, STAT_TOPDOWN_BAD_SPEC, ctx, cpu) > 0)
		c++;
	return c >= 2;
}

static void print_smi_cost(struct perf_stat_config *config,
			   int cpu, struct evsel *evsel,
			   struct perf_stat_output_ctx *out,
			   struct runtime_stat *st)
{
	double smi_num, aperf, cycles, cost = 0.0;
	int ctx = evsel_context(evsel);
	const char *color = NULL;

	smi_num = runtime_stat_avg(st, STAT_SMI_NUM, ctx, cpu);
	aperf = runtime_stat_avg(st, STAT_APERF, ctx, cpu);
	cycles = runtime_stat_avg(st, STAT_CYCLES, ctx, cpu);

	if ((cycles == 0) || (aperf == 0))
		return;

	if (smi_num)
		cost = (aperf - cycles) / aperf * 100.00;

	if (cost > 10)
		color = PERF_COLOR_RED;
	out->print_metric(config, out->ctx, color, "%8.1f%%", "SMI cycles%", cost);
	out->print_metric(config, out->ctx, NULL, "%4.0f", "SMI#", smi_num);
}
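
/*
 * Collect the event counts that a metric expression needs into @pctx.
 * For example (hypothetical metric), for "instructions / cycles" this adds
 * the ids "instructions" and "cycles" with their averaged counts so that
 * expr__parse() can evaluate the expression later in generic_metric().
 * Returns the number of events resolved (callers check that
 * metric_events[ret] is NULL to see whether all of them were found),
 * or a negative error code on allocation failure.
 */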
static int prepare_metric(struct evsel **metric_events,
			  struct metric_ref *metric_refs,
			  struct expr_parse_ctx *pctx,
			  int cpu,
			  struct runtime_stat *st)
{
	double scale;
	char *n, *pn;
	int i, j, ret;

	expr__ctx_init(pctx);
	for (i = 0; metric_events[i]; i++) {
		struct saved_value *v;
		struct stats *stats;
		u64 metric_total = 0;

		if (!strcmp(metric_events[i]->name, "duration_time")) {
			stats = &walltime_nsecs_stats;
			scale = 1e-9;
		} else {
			v = saved_value_lookup(metric_events[i], cpu, false,
					       STAT_NONE, 0, st);
			if (!v)
				break;
			stats = &v->stats;
			scale = 1.0;

			if (v->metric_other)
				metric_total = v->metric_total;
		}

		n = strdup(metric_events[i]->name);
		if (!n)
			return -ENOMEM;
		/*
		 * This display code with --no-merge adds [cpu] postfixes.
		 * These are not supported by the parser. Remove everything
		 * after the space.
		 */
		pn = strchr(n, ' ');
		if (pn)
			*pn = 0;

		if (metric_total)
			expr__add_id_val(pctx, n, metric_total);
		else
			expr__add_id_val(pctx, n, avg_stats(stats) * scale);
	}

	for (j = 0; metric_refs && metric_refs[j].metric_name; j++) {
		ret = expr__add_ref(pctx, &metric_refs[j]);
		if (ret)
			return ret;
	}

	return i;
}

static void generic_metric(struct perf_stat_config *config,
			   const char *metric_expr,
			   struct evsel **metric_events,
			   struct metric_ref *metric_refs,
			   char *name,
			   const char *metric_name,
			   const char *metric_unit,
			   int runtime,
			   int cpu,
			   struct perf_stat_output_ctx *out,
			   struct runtime_stat *st)
{
	print_metric_t print_metric = out->print_metric;
	struct expr_parse_ctx pctx;
	double ratio, scale;
	int i;
	void *ctxp = out->ctx;

	i = prepare_metric(metric_events, metric_refs, &pctx, cpu, st);
	if (i < 0)
		return;

	if (!metric_events[i]) {
		if (expr__parse(&ratio, &pctx, metric_expr, runtime) == 0) {
			char *unit;
			char metric_bf[64];

			if (metric_unit && metric_name) {
				if (perf_pmu__convert_scale(metric_unit,
							    &unit, &scale) >= 0) {
					ratio *= scale;
				}
				if (strstr(metric_expr, "?"))
					scnprintf(metric_bf, sizeof(metric_bf),
						  "%s %s_%d", unit, metric_name, runtime);
				else
					scnprintf(metric_bf, sizeof(metric_bf),
						  "%s %s", unit, metric_name);

				print_metric(config, ctxp, NULL, "%8.1f",
					     metric_bf, ratio);
			} else {
				print_metric(config, ctxp, NULL, "%8.2f",
					     metric_name ?
					     metric_name :
					     out->force_header ? name : "",
					     ratio);
			}
		} else {
			print_metric(config, ctxp, NULL, NULL,
				     out->force_header ?
				     (metric_name ? metric_name : name) : "", 0);
		}
	} else {
		print_metric(config, ctxp, NULL, NULL,
			     out->force_header ?
			     (metric_name ? metric_name : name) : "", 0);
	}

	expr__ctx_clear(&pctx);
}

double test_generic_metric(struct metric_expr *mexp, int cpu, struct runtime_stat *st)
{
	struct expr_parse_ctx pctx;
	double ratio = 0.0;

	if (prepare_metric(mexp->metric_events, mexp->metric_refs, &pctx, cpu, st) < 0)
		goto out;

	if (expr__parse(&ratio, &pctx, mexp->metric_expr, 1))
		ratio = 0.0;

out:
	expr__ctx_clear(&pctx);
	return ratio;
}

void perf_stat__print_shadow_stats(struct perf_stat_config *config,
				   struct evsel *evsel,
				   double avg, int cpu,
				   struct perf_stat_output_ctx *out,
				   struct rblist *metric_events,
				   struct runtime_stat *st)
{
	void *ctxp = out->ctx;
	print_metric_t print_metric = out->print_metric;
	double total, ratio = 0.0, total2;
	const char *color = NULL;
	int ctx = evsel_context(evsel);
	struct metric_event *me;
	int num = 1;

	if (evsel__match(evsel, HARDWARE, HW_INSTRUCTIONS)) {
		total = runtime_stat_avg(st, STAT_CYCLES, ctx, cpu);

		if (total) {
			ratio = avg / total;
			print_metric(config, ctxp, NULL, "%7.2f ",
				     "insn per cycle", ratio);
		} else {
			print_metric(config, ctxp, NULL, NULL, "insn per cycle", 0);
		}

		total = runtime_stat_avg(st, STAT_STALLED_CYCLES_FRONT,
					 ctx, cpu);

		total = max(total, runtime_stat_avg(st,
						    STAT_STALLED_CYCLES_BACK,
						    ctx, cpu));

		if (total && avg) {
			out->new_line(config, ctxp);
			ratio = total / avg;
			print_metric(config, ctxp, NULL, "%7.2f ",
				     "stalled cycles per insn",
				     ratio);
		}
	} else if (evsel__match(evsel, HARDWARE, HW_BRANCH_MISSES)) {
		if (runtime_stat_n(st, STAT_BRANCHES, ctx, cpu) != 0)
			print_branch_misses(config, cpu, evsel, avg, out, st);
		else
			print_metric(config, ctxp, NULL, NULL, "of all branches", 0);
	} else if (
		evsel->core.attr.type == PERF_TYPE_HW_CACHE &&
		evsel->core.attr.config == (PERF_COUNT_HW_CACHE_L1D |
					((PERF_COUNT_HW_CACHE_OP_READ) << 8) |
					((PERF_COUNT_HW_CACHE_RESULT_MISS) << 16))) {

		if (runtime_stat_n(st, STAT_L1_DCACHE, ctx, cpu) != 0)
			print_l1_dcache_misses(config, cpu, evsel, avg, out, st);
		else
			print_metric(config, ctxp, NULL, NULL, "of all L1-dcache accesses", 0);
	} else if (
		evsel->core.attr.type == PERF_TYPE_HW_CACHE &&
		evsel->core.attr.config == (PERF_COUNT_HW_CACHE_L1I |
					((PERF_COUNT_HW_CACHE_OP_READ) << 8) |
					((PERF_COUNT_HW_CACHE_RESULT_MISS) << 16))) {

		if (runtime_stat_n(st, STAT_L1_ICACHE, ctx, cpu) != 0)
			print_l1_icache_misses(config, cpu, evsel, avg, out, st);
		else
			print_metric(config, ctxp, NULL, NULL, "of all L1-icache accesses", 0);
	} else if (
		evsel->core.attr.type == PERF_TYPE_HW_CACHE &&
		evsel->core.attr.config == (PERF_COUNT_HW_CACHE_DTLB |
					((PERF_COUNT_HW_CACHE_OP_READ) << 8) |
					((PERF_COUNT_HW_CACHE_RESULT_MISS) << 16))) {

		if (runtime_stat_n(st, STAT_DTLB_CACHE, ctx, cpu) != 0)
			print_dtlb_cache_misses(config, cpu, evsel, avg, out, st);
		else
			print_metric(config, ctxp, NULL, NULL, "of all dTLB cache accesses", 0);
	} else if (
		evsel->core.attr.type == PERF_TYPE_HW_CACHE &&
		evsel->core.attr.config == (PERF_COUNT_HW_CACHE_ITLB |
					((PERF_COUNT_HW_CACHE_OP_READ) << 8) |
					((PERF_COUNT_HW_CACHE_RESULT_MISS) << 16))) {

		if (runtime_stat_n(st, STAT_ITLB_CACHE, ctx, cpu) != 0)
			print_itlb_cache_misses(config, cpu, evsel, avg, out, st);
		else
			print_metric(config, ctxp, NULL, NULL, "of all iTLB cache accesses", 0);
	} else if (
		evsel->core.attr.type == PERF_TYPE_HW_CACHE &&
		evsel->core.attr.config == (PERF_COUNT_HW_CACHE_LL |
					((PERF_COUNT_HW_CACHE_OP_READ) << 8) |
					((PERF_COUNT_HW_CACHE_RESULT_MISS) << 16))) {

		if (runtime_stat_n(st, STAT_LL_CACHE, ctx, cpu) != 0)
			print_ll_cache_misses(config, cpu, evsel, avg, out, st);
		else
			print_metric(config, ctxp, NULL, NULL, "of all LL-cache accesses", 0);
	} else if (evsel__match(evsel, HARDWARE, HW_CACHE_MISSES)) {
		total = runtime_stat_avg(st, STAT_CACHEREFS, ctx, cpu);

		if (total)
			ratio = avg * 100 / total;

		if (runtime_stat_n(st, STAT_CACHEREFS, ctx, cpu) != 0)
			print_metric(config, ctxp, NULL, "%8.3f %%",
				     "of all cache refs", ratio);
		else
			print_metric(config, ctxp, NULL, NULL, "of all cache refs", 0);
	} else if (evsel__match(evsel, HARDWARE, HW_STALLED_CYCLES_FRONTEND)) {
		print_stalled_cycles_frontend(config, cpu, evsel, avg, out, st);
	} else if (evsel__match(evsel, HARDWARE, HW_STALLED_CYCLES_BACKEND)) {
		print_stalled_cycles_backend(config, cpu, evsel, avg, out, st);
	} else if (evsel__match(evsel, HARDWARE, HW_CPU_CYCLES)) {
		total = runtime_stat_avg(st, STAT_NSECS, 0, cpu);

		if (total) {
			ratio = avg / total;
			print_metric(config, ctxp, NULL, "%8.3f", "GHz", ratio);
		} else {
			print_metric(config, ctxp, NULL, NULL, "GHz", 0);
		}
	} else if (perf_stat_evsel__is(evsel, CYCLES_IN_TX)) {
		total = runtime_stat_avg(st, STAT_CYCLES, ctx, cpu);

		if (total)
			print_metric(config, ctxp, NULL,
				     "%7.2f%%", "transactional cycles",
				     100.0 * (avg / total));
		else
			print_metric(config, ctxp, NULL, NULL, "transactional cycles",
				     0);
	} else if (perf_stat_evsel__is(evsel, CYCLES_IN_TX_CP)) {
		total = runtime_stat_avg(st, STAT_CYCLES, ctx, cpu);
		total2 = runtime_stat_avg(st, STAT_CYCLES_IN_TX, ctx, cpu);

		if (total2 < avg)
			total2 = avg;
		if (total)
			print_metric(config, ctxp, NULL, "%7.2f%%", "aborted cycles",
				     100.0 * ((total2 - avg) / total));
		else
			print_metric(config, ctxp, NULL, NULL, "aborted cycles", 0);
	} else if (perf_stat_evsel__is(evsel, TRANSACTION_START)) {
		total = runtime_stat_avg(st, STAT_CYCLES_IN_TX,
					 ctx, cpu);

		if (avg)
			ratio = total / avg;

		if (runtime_stat_n(st, STAT_CYCLES_IN_TX, ctx, cpu) != 0)
			print_metric(config, ctxp, NULL, "%8.0f",
				     "cycles / transaction", ratio);
		else
			print_metric(config, ctxp, NULL, NULL, "cycles / transaction",
				     0);
	} else if (perf_stat_evsel__is(evsel, ELISION_START)) {
		total = runtime_stat_avg(st, STAT_CYCLES_IN_TX,
					 ctx, cpu);

		if (avg)
			ratio = total / avg;

		print_metric(config, ctxp, NULL, "%8.0f", "cycles / elision", ratio);
	} else if (evsel__is_clock(evsel)) {
		if ((ratio = avg_stats(&walltime_nsecs_stats)) != 0)
			print_metric(config, ctxp, NULL, "%8.3f", "CPUs utilized",
				     avg / (ratio * evsel->scale));
		else
			print_metric(config, ctxp, NULL, NULL, "CPUs utilized", 0);
	} else if (perf_stat_evsel__is(evsel, TOPDOWN_FETCH_BUBBLES)) {
		double fe_bound = td_fe_bound(ctx, cpu, st);

		if (fe_bound > 0.2)
			color = PERF_COLOR_RED;
		print_metric(config, ctxp, color, "%8.1f%%", "frontend bound",
			     fe_bound * 100.);
	} else if (perf_stat_evsel__is(evsel, TOPDOWN_SLOTS_RETIRED)) {
		double retiring = td_retiring(ctx, cpu, st);

		if (retiring > 0.7)
			color = PERF_COLOR_GREEN;
		print_metric(config, ctxp, color, "%8.1f%%", "retiring",
			     retiring * 100.);
	} else if (perf_stat_evsel__is(evsel, TOPDOWN_RECOVERY_BUBBLES)) {
		double bad_spec = td_bad_spec(ctx, cpu, st);

		if (bad_spec > 0.1)
			color = PERF_COLOR_RED;
		print_metric(config, ctxp, color, "%8.1f%%", "bad speculation",
			     bad_spec * 100.);
	} else if (perf_stat_evsel__is(evsel, TOPDOWN_SLOTS_ISSUED)) {
		double be_bound = td_be_bound(ctx, cpu, st);
		const char *name = "backend bound";
		static int have_recovery_bubbles = -1;

		/* In case the CPU does not support topdown-recovery-bubbles */
		if (have_recovery_bubbles < 0)
			have_recovery_bubbles = pmu_have_event("cpu",
					"topdown-recovery-bubbles");
		if (!have_recovery_bubbles)
			name = "backend bound/bad spec";

		if (be_bound > 0.2)
			color = PERF_COLOR_RED;
		if (td_total_slots(ctx, cpu, st) > 0)
			print_metric(config, ctxp, color, "%8.1f%%", name,
				     be_bound * 100.);
		else
			print_metric(config, ctxp, NULL, NULL, name, 0);
	} else if (perf_stat_evsel__is(evsel, TOPDOWN_RETIRING) &&
		   full_td(ctx, cpu, st)) {
		double retiring = td_metric_ratio(ctx, cpu,
						  STAT_TOPDOWN_RETIRING, st);

		if (retiring > 0.7)
			color = PERF_COLOR_GREEN;
		print_metric(config, ctxp, color, "%8.1f%%", "retiring",
			     retiring * 100.);
	} else if (perf_stat_evsel__is(evsel, TOPDOWN_FE_BOUND) &&
		   full_td(ctx, cpu, st)) {
		double fe_bound = td_metric_ratio(ctx, cpu,
						  STAT_TOPDOWN_FE_BOUND, st);

		if (fe_bound > 0.2)
			color = PERF_COLOR_RED;
		print_metric(config, ctxp, color, "%8.1f%%", "frontend bound",
			     fe_bound * 100.);
	} else if (perf_stat_evsel__is(evsel, TOPDOWN_BE_BOUND) &&
		   full_td(ctx, cpu, st)) {
		double be_bound = td_metric_ratio(ctx, cpu,
						  STAT_TOPDOWN_BE_BOUND, st);

		if (be_bound > 0.2)
			color = PERF_COLOR_RED;
		print_metric(config, ctxp, color, "%8.1f%%", "backend bound",
			     be_bound * 100.);
	} else if (perf_stat_evsel__is(evsel, TOPDOWN_BAD_SPEC) &&
		   full_td(ctx, cpu, st)) {
		double bad_spec = td_metric_ratio(ctx, cpu,
						  STAT_TOPDOWN_BAD_SPEC, st);

		if (bad_spec > 0.1)
			color = PERF_COLOR_RED;
		print_metric(config, ctxp, color, "%8.1f%%", "bad speculation",
			     bad_spec * 100.);
	} else if (evsel->metric_expr) {
		generic_metric(config, evsel->metric_expr, evsel->metric_events, NULL,
			       evsel->name, evsel->metric_name, NULL, 1, cpu, out, st);
	} else if (runtime_stat_n(st, STAT_NSECS, 0, cpu) != 0) {
		char unit = 'M';
		char unit_buf[10];

		total = runtime_stat_avg(st, STAT_NSECS, 0, cpu);

		if (total)
			ratio = 1000.0 * avg / total;
		if (ratio < 0.001) {
			ratio *= 1000;
			unit = 'K';
		}
		snprintf(unit_buf, sizeof(unit_buf), "%c/sec", unit);
		print_metric(config, ctxp, NULL, "%8.3f", unit_buf, ratio);
	} else if (perf_stat_evsel__is(evsel, SMI_NUM)) {
		print_smi_cost(config, cpu, evsel, out, st);
	} else {
		num = 0;
	}

	if ((me = metricgroup__lookup(metric_events, evsel, false)) != NULL) {
		struct metric_expr *mexp;

		list_for_each_entry (mexp, &me->head, nd) {
			if (num++ > 0)
				out->new_line(config, ctxp);
			generic_metric(config, mexp->metric_expr, mexp->metric_events,
				       mexp->metric_refs, evsel->name, mexp->metric_name,
				       mexp->metric_unit, mexp->runtime, cpu, out, st);
		}
	}
	if (num == 0)
		print_metric(config, ctxp, NULL, NULL, NULL, 0);
}