// SPDX-License-Identifier: GPL-2.0
#include <stdio.h>
#include "evsel.h"
#include "stat.h"
#include "color.h"
#include "pmu.h"
#include "rblist.h"
#include "evlist.h"
#include "expr.h"

enum {
	CTX_BIT_USER	= 1 << 0,
	CTX_BIT_KERNEL	= 1 << 1,
	CTX_BIT_HV	= 1 << 2,
	CTX_BIT_HOST	= 1 << 3,
	CTX_BIT_IDLE	= 1 << 4,
	CTX_BIT_MAX	= 1 << 5,
};

#define NUM_CTX CTX_BIT_MAX

/*
 * AGGR_GLOBAL: Use CPU 0
 * AGGR_SOCKET: Use first CPU of socket
 * AGGR_CORE: Use first CPU of core
 * AGGR_NONE: Use matching CPU
 * AGGR_THREAD: Not supported?
 */
static struct stats runtime_nsecs_stats[MAX_NR_CPUS];
static struct stats runtime_cycles_stats[NUM_CTX][MAX_NR_CPUS];
static struct stats runtime_stalled_cycles_front_stats[NUM_CTX][MAX_NR_CPUS];
static struct stats runtime_stalled_cycles_back_stats[NUM_CTX][MAX_NR_CPUS];
static struct stats runtime_branches_stats[NUM_CTX][MAX_NR_CPUS];
static struct stats runtime_cacherefs_stats[NUM_CTX][MAX_NR_CPUS];
static struct stats runtime_l1_dcache_stats[NUM_CTX][MAX_NR_CPUS];
static struct stats runtime_l1_icache_stats[NUM_CTX][MAX_NR_CPUS];
static struct stats runtime_ll_cache_stats[NUM_CTX][MAX_NR_CPUS];
static struct stats runtime_itlb_cache_stats[NUM_CTX][MAX_NR_CPUS];
static struct stats runtime_dtlb_cache_stats[NUM_CTX][MAX_NR_CPUS];
static struct stats runtime_cycles_in_tx_stats[NUM_CTX][MAX_NR_CPUS];
static struct stats runtime_transaction_stats[NUM_CTX][MAX_NR_CPUS];
static struct stats runtime_elision_stats[NUM_CTX][MAX_NR_CPUS];
static struct stats runtime_topdown_total_slots[NUM_CTX][MAX_NR_CPUS];
static struct stats runtime_topdown_slots_issued[NUM_CTX][MAX_NR_CPUS];
static struct stats runtime_topdown_slots_retired[NUM_CTX][MAX_NR_CPUS];
static struct stats runtime_topdown_fetch_bubbles[NUM_CTX][MAX_NR_CPUS];
static struct stats runtime_topdown_recovery_bubbles[NUM_CTX][MAX_NR_CPUS];
static struct stats runtime_smi_num_stats[NUM_CTX][MAX_NR_CPUS];
static struct stats runtime_aperf_stats[NUM_CTX][MAX_NR_CPUS];
static struct rblist runtime_saved_values;
static bool have_frontend_stalled;

struct stats walltime_nsecs_stats;

struct saved_value {
	struct rb_node rb_node;
	struct perf_evsel *evsel;
	int cpu;
	int ctx;
	struct stats stats;
};

static int saved_value_cmp(struct rb_node *rb_node, const void *entry)
{
	struct saved_value *a = container_of(rb_node,
					     struct saved_value,
					     rb_node);
	const struct saved_value *b = entry;

	if (a->ctx != b->ctx)
		return a->ctx - b->ctx;
	if (a->cpu != b->cpu)
		return a->cpu - b->cpu;
	if (a->evsel == b->evsel)
		return 0;
	if ((char *)a->evsel < (char *)b->evsel)
		return -1;
	return +1;
}

static struct rb_node *saved_value_new(struct rblist *rblist __maybe_unused,
				       const void *entry)
{
	struct saved_value *nd = malloc(sizeof(struct saved_value));

	if (!nd)
		return NULL;
	memcpy(nd, entry, sizeof(struct saved_value));
	return &nd->rb_node;
}

static struct saved_value *saved_value_lookup(struct perf_evsel *evsel,
					      int cpu, int ctx,
					      bool create)
{
	struct rb_node *nd;
	struct saved_value dm = {
		.cpu = cpu,
		.ctx = ctx,
		.evsel = evsel,
	};
	nd = rblist__find(&runtime_saved_values, &dm);
	if (nd)
		return container_of(nd, struct saved_value, rb_node);
	if (create) {
		rblist__add_node(&runtime_saved_values, &dm);
		nd = rblist__find(&runtime_saved_values, &dm);
		if (nd)
			return container_of(nd, struct saved_value, rb_node);
	}
	return NULL;
}

void perf_stat__init_shadow_stats(void)
{
	have_frontend_stalled = pmu_have_event("cpu", "stalled-cycles-frontend");
	rblist__init(&runtime_saved_values);
	runtime_saved_values.node_cmp = saved_value_cmp;
	runtime_saved_values.node_new = saved_value_new;
	/* No delete for now */
}

static int evsel_context(struct perf_evsel *evsel)
{
	int ctx = 0;

	if (evsel->attr.exclude_kernel)
		ctx |= CTX_BIT_KERNEL;
	if (evsel->attr.exclude_user)
		ctx |= CTX_BIT_USER;
	if (evsel->attr.exclude_hv)
		ctx |= CTX_BIT_HV;
	if (evsel->attr.exclude_host)
		ctx |= CTX_BIT_HOST;
	if (evsel->attr.exclude_idle)
		ctx |= CTX_BIT_IDLE;

	return ctx;
}

void perf_stat__reset_shadow_stats(void)
{
	struct rb_node *pos, *next;

	memset(runtime_nsecs_stats, 0, sizeof(runtime_nsecs_stats));
	memset(runtime_cycles_stats, 0, sizeof(runtime_cycles_stats));
	memset(runtime_stalled_cycles_front_stats, 0, sizeof(runtime_stalled_cycles_front_stats));
	memset(runtime_stalled_cycles_back_stats, 0, sizeof(runtime_stalled_cycles_back_stats));
	memset(runtime_branches_stats, 0, sizeof(runtime_branches_stats));
	memset(runtime_cacherefs_stats, 0, sizeof(runtime_cacherefs_stats));
	memset(runtime_l1_dcache_stats, 0, sizeof(runtime_l1_dcache_stats));
	memset(runtime_l1_icache_stats, 0, sizeof(runtime_l1_icache_stats));
	memset(runtime_ll_cache_stats, 0, sizeof(runtime_ll_cache_stats));
	memset(runtime_itlb_cache_stats, 0, sizeof(runtime_itlb_cache_stats));
	memset(runtime_dtlb_cache_stats, 0, sizeof(runtime_dtlb_cache_stats));
	memset(runtime_cycles_in_tx_stats, 0,
	       sizeof(runtime_cycles_in_tx_stats));
	memset(runtime_transaction_stats, 0,
	       sizeof(runtime_transaction_stats));
	memset(runtime_elision_stats, 0, sizeof(runtime_elision_stats));
	memset(&walltime_nsecs_stats, 0, sizeof(walltime_nsecs_stats));
	memset(runtime_topdown_total_slots, 0, sizeof(runtime_topdown_total_slots));
	memset(runtime_topdown_slots_retired, 0, sizeof(runtime_topdown_slots_retired));
	memset(runtime_topdown_slots_issued, 0, sizeof(runtime_topdown_slots_issued));
	memset(runtime_topdown_fetch_bubbles, 0, sizeof(runtime_topdown_fetch_bubbles));
	memset(runtime_topdown_recovery_bubbles, 0, sizeof(runtime_topdown_recovery_bubbles));
	memset(runtime_smi_num_stats, 0, sizeof(runtime_smi_num_stats));
	memset(runtime_aperf_stats, 0, sizeof(runtime_aperf_stats));

	next = rb_first(&runtime_saved_values.entries);
	while (next) {
		pos = next;
		next = rb_next(pos);
		memset(&container_of(pos, struct saved_value, rb_node)->stats,
		       0,
		       sizeof(struct stats));
	}
}

/*
 * Update various tracking values we maintain to print
 * more semantic information such as miss/hit ratios,
 * instruction rates, etc:
 */
void perf_stat__update_shadow_stats(struct perf_evsel *counter, u64 *count,
				    int cpu)
{
	int ctx = evsel_context(counter);

	if (perf_evsel__match(counter, SOFTWARE, SW_TASK_CLOCK) ||
	    perf_evsel__match(counter, SOFTWARE, SW_CPU_CLOCK))
		update_stats(&runtime_nsecs_stats[cpu], count[0]);
	else if (perf_evsel__match(counter, HARDWARE, HW_CPU_CYCLES))
		update_stats(&runtime_cycles_stats[ctx][cpu], count[0]);
	else if (perf_stat_evsel__is(counter, CYCLES_IN_TX))
		update_stats(&runtime_cycles_in_tx_stats[ctx][cpu], count[0]);
	else if (perf_stat_evsel__is(counter, TRANSACTION_START))
		update_stats(&runtime_transaction_stats[ctx][cpu], count[0]);
	else if (perf_stat_evsel__is(counter, ELISION_START))
		update_stats(&runtime_elision_stats[ctx][cpu], count[0]);
	else if (perf_stat_evsel__is(counter, TOPDOWN_TOTAL_SLOTS))
		update_stats(&runtime_topdown_total_slots[ctx][cpu], count[0]);
	else if (perf_stat_evsel__is(counter, TOPDOWN_SLOTS_ISSUED))
		update_stats(&runtime_topdown_slots_issued[ctx][cpu], count[0]);
	else if (perf_stat_evsel__is(counter, TOPDOWN_SLOTS_RETIRED))
		update_stats(&runtime_topdown_slots_retired[ctx][cpu], count[0]);
	else if (perf_stat_evsel__is(counter, TOPDOWN_FETCH_BUBBLES))
		update_stats(&runtime_topdown_fetch_bubbles[ctx][cpu], count[0]);
	else if (perf_stat_evsel__is(counter, TOPDOWN_RECOVERY_BUBBLES))
		update_stats(&runtime_topdown_recovery_bubbles[ctx][cpu], count[0]);
	else if (perf_evsel__match(counter, HARDWARE, HW_STALLED_CYCLES_FRONTEND))
		update_stats(&runtime_stalled_cycles_front_stats[ctx][cpu], count[0]);
	else if (perf_evsel__match(counter, HARDWARE, HW_STALLED_CYCLES_BACKEND))
		update_stats(&runtime_stalled_cycles_back_stats[ctx][cpu], count[0]);
	else if (perf_evsel__match(counter, HARDWARE, HW_BRANCH_INSTRUCTIONS))
		update_stats(&runtime_branches_stats[ctx][cpu], count[0]);
	else if (perf_evsel__match(counter, HARDWARE, HW_CACHE_REFERENCES))
		update_stats(&runtime_cacherefs_stats[ctx][cpu], count[0]);
	else if (perf_evsel__match(counter, HW_CACHE, HW_CACHE_L1D))
		update_stats(&runtime_l1_dcache_stats[ctx][cpu], count[0]);
	else if (perf_evsel__match(counter, HW_CACHE, HW_CACHE_L1I))
		update_stats(&runtime_l1_icache_stats[ctx][cpu], count[0]);
	else if (perf_evsel__match(counter, HW_CACHE, HW_CACHE_LL))
		update_stats(&runtime_ll_cache_stats[ctx][cpu], count[0]);
	else if (perf_evsel__match(counter, HW_CACHE, HW_CACHE_DTLB))
		update_stats(&runtime_dtlb_cache_stats[ctx][cpu], count[0]);
	else if (perf_evsel__match(counter, HW_CACHE, HW_CACHE_ITLB))
		update_stats(&runtime_itlb_cache_stats[ctx][cpu], count[0]);
	else if (perf_stat_evsel__is(counter, SMI_NUM))
		update_stats(&runtime_smi_num_stats[ctx][cpu], count[0]);
	else if (perf_stat_evsel__is(counter, APERF))
		update_stats(&runtime_aperf_stats[ctx][cpu], count[0]);

	if (counter->collect_stat) {
		struct saved_value *v = saved_value_lookup(counter, cpu, ctx,
							   true);
		update_stats(&v->stats, count[0]);
	}
}

/* used for get_ratio_color() */
enum grc_type {
	GRC_STALLED_CYCLES_FE,
	GRC_STALLED_CYCLES_BE,
	GRC_CACHE_MISSES,
	GRC_MAX_NR
};

static const char *get_ratio_color(enum grc_type type, double ratio)
{
	static const double grc_table[GRC_MAX_NR][3] = {
		[GRC_STALLED_CYCLES_FE] = { 50.0, 30.0, 10.0 },
		[GRC_STALLED_CYCLES_BE] = { 75.0, 50.0, 20.0 },
		[GRC_CACHE_MISSES]	= { 20.0, 10.0, 5.0 },
	};
	const char *color = PERF_COLOR_NORMAL;

	if (ratio > grc_table[type][0])
		color = PERF_COLOR_RED;
	else if (ratio > grc_table[type][1])
		color = PERF_COLOR_MAGENTA;
	else if (ratio > grc_table[type][2])
		color = PERF_COLOR_YELLOW;

	return color;
}

static struct perf_evsel *perf_stat__find_event(struct perf_evlist *evsel_list,
						const char *name)
{
	struct perf_evsel *c2;

	evlist__for_each_entry (evsel_list, c2) {
		if (!strcasecmp(c2->name, name))
			return c2;
	}
	return NULL;
}
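
/*
 * Illustrative example of how MetricExpr linking works (the event and
 * expression names here are hypothetical, not taken from any event list):
 * an evsel named "inst_retired.any" carrying
 * MetricExpr = "inst_retired.any / cpu_clk_unhalted.thread" makes
 * perf_stat__collect_metric_expr() below look up the
 * "cpu_clk_unhalted.thread" evsel, store it in the metric_events[] array
 * of "inst_retired.any" and set its collect_stat flag.
 * perf_stat__update_shadow_stats() then saves that counter's values in a
 * struct saved_value, and perf_stat__print_shadow_stats() hands both
 * values to expr__parse() to print the computed ratio.
 */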

/* Mark the target events of MetricExprs and link the events that use them to those targets. */
void perf_stat__collect_metric_expr(struct perf_evlist *evsel_list)
{
	struct perf_evsel *counter, *leader, **metric_events, *oc;
	bool found;
	const char **metric_names;
	int i;
	int num_metric_names;

	evlist__for_each_entry(evsel_list, counter) {
		bool invalid = false;

		leader = counter->leader;
		if (!counter->metric_expr)
			continue;
		metric_events = counter->metric_events;
		if (!metric_events) {
			if (expr__find_other(counter->metric_expr, counter->name,
					     &metric_names, &num_metric_names) < 0)
				continue;

			metric_events = calloc(sizeof(struct perf_evsel *),
					       num_metric_names + 1);
			if (!metric_events)
				return;
			counter->metric_events = metric_events;
		}

		for (i = 0; i < num_metric_names; i++) {
			found = false;
			if (leader) {
				/* Search in group */
				for_each_group_member (oc, leader) {
					if (!strcasecmp(oc->name, metric_names[i])) {
						found = true;
						break;
					}
				}
			}
			if (!found) {
				/* Search ignoring groups */
				oc = perf_stat__find_event(evsel_list, metric_names[i]);
			}
			if (!oc) {
				/* Deduping one is good enough to handle duplicated PMUs. */
				static char *printed;

				/*
				 * Adding events automatically would be difficult, because
				 * it would risk creating groups that are not schedulable.
				 * perf stat doesn't understand all the scheduling constraints
				 * of events. So we ask the user instead to add the missing
				 * events.
				 */
				if (!printed || strcasecmp(printed, metric_names[i])) {
					fprintf(stderr,
						"Add %s event to groups to get metric expression for %s\n",
						metric_names[i],
						counter->name);
					printed = strdup(metric_names[i]);
				}
				invalid = true;
				continue;
			}
			metric_events[i] = oc;
			oc->collect_stat = true;
		}
		metric_events[i] = NULL;
		free(metric_names);
		if (invalid) {
			free(metric_events);
			counter->metric_events = NULL;
			counter->metric_expr = NULL;
		}
	}
}

static void print_stalled_cycles_frontend(int cpu,
					  struct perf_evsel *evsel, double avg,
					  struct perf_stat_output_ctx *out)
{
	double total, ratio = 0.0;
	const char *color;
	int ctx = evsel_context(evsel);

	total = avg_stats(&runtime_cycles_stats[ctx][cpu]);

	if (total)
		ratio = avg / total * 100.0;

	color = get_ratio_color(GRC_STALLED_CYCLES_FE, ratio);

	if (ratio)
		out->print_metric(out->ctx, color, "%7.2f%%", "frontend cycles idle",
				  ratio);
	else
		out->print_metric(out->ctx, NULL, NULL, "frontend cycles idle", 0);
}

static void print_stalled_cycles_backend(int cpu,
					 struct perf_evsel *evsel, double avg,
					 struct perf_stat_output_ctx *out)
{
	double total, ratio = 0.0;
	const char *color;
	int ctx = evsel_context(evsel);

	total = avg_stats(&runtime_cycles_stats[ctx][cpu]);

	if (total)
		ratio = avg / total * 100.0;

	color = get_ratio_color(GRC_STALLED_CYCLES_BE, ratio);

	out->print_metric(out->ctx, color, "%7.2f%%", "backend cycles idle", ratio);
}

static void print_branch_misses(int cpu,
				struct perf_evsel *evsel,
				double avg,
				struct perf_stat_output_ctx *out)
{
	double total, ratio = 0.0;
	const char *color;
	int ctx = evsel_context(evsel);

	total = avg_stats(&runtime_branches_stats[ctx][cpu]);

	if (total)
		ratio = avg / total * 100.0;

	color = get_ratio_color(GRC_CACHE_MISSES, ratio);

	out->print_metric(out->ctx, color, "%7.2f%%", "of all branches", ratio);
}

static void print_l1_dcache_misses(int cpu,
				   struct perf_evsel *evsel,
				   double avg,
				   struct perf_stat_output_ctx *out)
{
	double total, ratio = 0.0;
	const char *color;
	int ctx = evsel_context(evsel);

	total = avg_stats(&runtime_l1_dcache_stats[ctx][cpu]);

	if (total)
		ratio = avg / total * 100.0;

	color = get_ratio_color(GRC_CACHE_MISSES, ratio);

	out->print_metric(out->ctx, color, "%7.2f%%", "of all L1-dcache hits", ratio);
}

static void print_l1_icache_misses(int cpu,
				   struct perf_evsel *evsel,
				   double avg,
				   struct perf_stat_output_ctx *out)
{
	double total, ratio = 0.0;
	const char *color;
	int ctx = evsel_context(evsel);

	total = avg_stats(&runtime_l1_icache_stats[ctx][cpu]);

	if (total)
		ratio = avg / total * 100.0;

	color = get_ratio_color(GRC_CACHE_MISSES, ratio);
	out->print_metric(out->ctx, color, "%7.2f%%", "of all L1-icache hits", ratio);
}

static void print_dtlb_cache_misses(int cpu,
				    struct perf_evsel *evsel,
				    double avg,
				    struct perf_stat_output_ctx *out)
{
	double total, ratio = 0.0;
	const char *color;
	int ctx = evsel_context(evsel);

	total = avg_stats(&runtime_dtlb_cache_stats[ctx][cpu]);

	if (total)
		ratio = avg / total * 100.0;

	color = get_ratio_color(GRC_CACHE_MISSES, ratio);
	out->print_metric(out->ctx, color, "%7.2f%%", "of all dTLB cache hits", ratio);
}

static void print_itlb_cache_misses(int cpu,
				    struct perf_evsel *evsel,
				    double avg,
				    struct perf_stat_output_ctx *out)
{
	double total, ratio = 0.0;
	const char *color;
	int ctx = evsel_context(evsel);

	total = avg_stats(&runtime_itlb_cache_stats[ctx][cpu]);

	if (total)
		ratio = avg / total * 100.0;

	color = get_ratio_color(GRC_CACHE_MISSES, ratio);
	out->print_metric(out->ctx, color, "%7.2f%%", "of all iTLB cache hits", ratio);
}

static void print_ll_cache_misses(int cpu,
				  struct perf_evsel *evsel,
				  double avg,
				  struct perf_stat_output_ctx *out)
{
	double total, ratio = 0.0;
	const char *color;
	int ctx = evsel_context(evsel);

	total = avg_stats(&runtime_ll_cache_stats[ctx][cpu]);

	if (total)
		ratio = avg / total * 100.0;

	color = get_ratio_color(GRC_CACHE_MISSES, ratio);
	out->print_metric(out->ctx, color, "%7.2f%%", "of all LL-cache hits", ratio);
}

/*
 * High level "TopDown" CPU core pipeline bottleneck breakdown.
 *
 * Basic concept following
 * Yasin, A Top Down Method for Performance analysis and Counter architecture
 * ISPASS14
 *
 * The CPU pipeline is divided into 4 areas that can be bottlenecks:
 *
 * Frontend -> Backend -> Retiring
 * BadSpeculation in addition means out of order execution that is thrown away
 * (for example branch mispredictions).
 * Frontend is instruction decoding.
 * Backend is execution, like computation and accessing data in memory.
 * Retiring is good execution that is not directly bottlenecked.
 *
 * The formulas are computed in slots.
 * A slot is an entry in the pipeline, one per unit of the pipeline width
 * (for example a 4-wide pipeline has 4 slots for each cycle).
 *
 * Formulas:
 * BadSpeculation = ((SlotsIssued - SlotsRetired) + RecoveryBubbles) /
 *			TotalSlots
 * Retiring = SlotsRetired / TotalSlots
 * FrontendBound = FetchBubbles / TotalSlots
 * BackendBound = 1.0 - BadSpeculation - Retiring - FrontendBound
 *
 * The kernel provides the mapping to the low level CPU events and any scaling
 * needed for the CPU pipeline width, for example:
 *
 * TotalSlots = Cycles * 4
 *
 * The scaling factor is communicated in the sysfs unit.
 *
 * In some cases the CPU may not be able to measure all the formulas due to
 * missing events. In this case multiple formulas are combined where possible.
 *
 * Full TopDown supports more levels to sub-divide each area: for example
 * BackendBound into computing bound and memory bound. For now we only
 * support Level 1 TopDown.
 *
 * A worked numeric example of these formulas is given at the end of this
 * file.
 */

static double sanitize_val(double x)
{
	if (x < 0 && x >= -0.02)
		return 0.0;
	return x;
}

static double td_total_slots(int ctx, int cpu)
{
	return avg_stats(&runtime_topdown_total_slots[ctx][cpu]);
}

static double td_bad_spec(int ctx, int cpu)
{
	double bad_spec = 0;
	double total_slots;
	double total;

	total = avg_stats(&runtime_topdown_slots_issued[ctx][cpu]) -
		avg_stats(&runtime_topdown_slots_retired[ctx][cpu]) +
		avg_stats(&runtime_topdown_recovery_bubbles[ctx][cpu]);
	total_slots = td_total_slots(ctx, cpu);
	if (total_slots)
		bad_spec = total / total_slots;
	return sanitize_val(bad_spec);
}

static double td_retiring(int ctx, int cpu)
{
	double retiring = 0;
	double total_slots = td_total_slots(ctx, cpu);
	double ret_slots = avg_stats(&runtime_topdown_slots_retired[ctx][cpu]);

	if (total_slots)
		retiring = ret_slots / total_slots;
	return retiring;
}

static double td_fe_bound(int ctx, int cpu)
{
	double fe_bound = 0;
	double total_slots = td_total_slots(ctx, cpu);
	double fetch_bub = avg_stats(&runtime_topdown_fetch_bubbles[ctx][cpu]);

	if (total_slots)
		fe_bound = fetch_bub / total_slots;
	return fe_bound;
}

static double td_be_bound(int ctx, int cpu)
{
	double sum = (td_fe_bound(ctx, cpu) +
		      td_bad_spec(ctx, cpu) +
		      td_retiring(ctx, cpu));
	if (sum == 0)
		return 0;
	return sanitize_val(1.0 - sum);
}

static void print_smi_cost(int cpu, struct perf_evsel *evsel,
			   struct perf_stat_output_ctx *out)
{
	double smi_num, aperf, cycles, cost = 0.0;
	int ctx = evsel_context(evsel);
	const char *color = NULL;

	smi_num = avg_stats(&runtime_smi_num_stats[ctx][cpu]);
	aperf = avg_stats(&runtime_aperf_stats[ctx][cpu]);
	cycles = avg_stats(&runtime_cycles_stats[ctx][cpu]);

	if ((cycles == 0) || (aperf == 0))
		return;

	if (smi_num)
		cost = (aperf - cycles) / aperf * 100.00;

	if (cost > 10)
		color = PERF_COLOR_RED;
	out->print_metric(out->ctx, color, "%8.1f%%", "SMI cycles%", cost);
	out->print_metric(out->ctx, NULL, "%4.0f", "SMI#", smi_num);
}

void perf_stat__print_shadow_stats(struct perf_evsel *evsel,
				   double avg, int cpu,
				   struct perf_stat_output_ctx *out)
{
	void *ctxp = out->ctx;
	print_metric_t print_metric = out->print_metric;
	double total, ratio = 0.0, total2;
	const char *color = NULL;
	int ctx = evsel_context(evsel);

	if (perf_evsel__match(evsel, HARDWARE, HW_INSTRUCTIONS)) {
		total = avg_stats(&runtime_cycles_stats[ctx][cpu]);
		if (total) {
			ratio = avg / total;
			print_metric(ctxp, NULL, "%7.2f ",
					"insn per cycle", ratio);
		} else {
			print_metric(ctxp, NULL, NULL, "insn per cycle", 0);
		}
		total = avg_stats(&runtime_stalled_cycles_front_stats[ctx][cpu]);
		total = max(total, avg_stats(&runtime_stalled_cycles_back_stats[ctx][cpu]));

		if (total && avg) {
			out->new_line(ctxp);
			ratio = total / avg;
			print_metric(ctxp, NULL, "%7.2f ",
					"stalled cycles per insn",
					ratio);
		} else if (have_frontend_stalled) {
			print_metric(ctxp, NULL, NULL,
				     "stalled cycles per insn", 0);
		}
	} else if (perf_evsel__match(evsel, HARDWARE, HW_BRANCH_MISSES)) {
		if (runtime_branches_stats[ctx][cpu].n != 0)
			print_branch_misses(cpu, evsel, avg, out);
		else
			print_metric(ctxp, NULL, NULL, "of all branches", 0);
	} else if (
		evsel->attr.type == PERF_TYPE_HW_CACHE &&
		evsel->attr.config ==  ( PERF_COUNT_HW_CACHE_L1D |
					((PERF_COUNT_HW_CACHE_OP_READ) << 8) |
					((PERF_COUNT_HW_CACHE_RESULT_MISS) << 16))) {
		if (runtime_l1_dcache_stats[ctx][cpu].n != 0)
			print_l1_dcache_misses(cpu, evsel, avg, out);
		else
			print_metric(ctxp, NULL, NULL, "of all L1-dcache hits", 0);
	} else if (
		evsel->attr.type == PERF_TYPE_HW_CACHE &&
		evsel->attr.config ==  ( PERF_COUNT_HW_CACHE_L1I |
					((PERF_COUNT_HW_CACHE_OP_READ) << 8) |
					((PERF_COUNT_HW_CACHE_RESULT_MISS) << 16))) {
		if (runtime_l1_icache_stats[ctx][cpu].n != 0)
			print_l1_icache_misses(cpu, evsel, avg, out);
		else
			print_metric(ctxp, NULL, NULL, "of all L1-icache hits", 0);
	} else if (
		evsel->attr.type == PERF_TYPE_HW_CACHE &&
		evsel->attr.config ==  ( PERF_COUNT_HW_CACHE_DTLB |
					((PERF_COUNT_HW_CACHE_OP_READ) << 8) |
					((PERF_COUNT_HW_CACHE_RESULT_MISS) << 16))) {
		if (runtime_dtlb_cache_stats[ctx][cpu].n != 0)
			print_dtlb_cache_misses(cpu, evsel, avg, out);
		else
			print_metric(ctxp, NULL, NULL, "of all dTLB cache hits", 0);
	} else if (
		evsel->attr.type == PERF_TYPE_HW_CACHE &&
		evsel->attr.config ==  ( PERF_COUNT_HW_CACHE_ITLB |
					((PERF_COUNT_HW_CACHE_OP_READ) << 8) |
					((PERF_COUNT_HW_CACHE_RESULT_MISS) << 16))) {
		if (runtime_itlb_cache_stats[ctx][cpu].n != 0)
			print_itlb_cache_misses(cpu, evsel, avg, out);
		else
			print_metric(ctxp, NULL, NULL, "of all iTLB cache hits", 0);
	} else if (
		evsel->attr.type == PERF_TYPE_HW_CACHE &&
		evsel->attr.config ==  ( PERF_COUNT_HW_CACHE_LL |
					((PERF_COUNT_HW_CACHE_OP_READ) << 8) |
					((PERF_COUNT_HW_CACHE_RESULT_MISS) << 16))) {
		if (runtime_ll_cache_stats[ctx][cpu].n != 0)
			print_ll_cache_misses(cpu, evsel, avg, out);
		else
			print_metric(ctxp, NULL, NULL, "of all LL-cache hits", 0);
	} else if (perf_evsel__match(evsel, HARDWARE, HW_CACHE_MISSES)) {
		total = avg_stats(&runtime_cacherefs_stats[ctx][cpu]);

		if (total)
			ratio = avg * 100 / total;

		if (runtime_cacherefs_stats[ctx][cpu].n != 0)
			print_metric(ctxp, NULL, "%8.3f %%",
				     "of all cache refs", ratio);
		else
			print_metric(ctxp, NULL, NULL, "of all cache refs", 0);
	} else if (perf_evsel__match(evsel, HARDWARE, HW_STALLED_CYCLES_FRONTEND)) {
		print_stalled_cycles_frontend(cpu, evsel, avg, out);
	} else if (perf_evsel__match(evsel, HARDWARE, HW_STALLED_CYCLES_BACKEND)) {
		print_stalled_cycles_backend(cpu, evsel, avg, out);
	} else if (perf_evsel__match(evsel, HARDWARE, HW_CPU_CYCLES)) {
		total = avg_stats(&runtime_nsecs_stats[cpu]);

		if (total) {
			ratio = avg / total;
			print_metric(ctxp, NULL, "%8.3f", "GHz", ratio);
		} else {
			print_metric(ctxp, NULL, NULL, "GHz", 0);
		}
	} else if (perf_stat_evsel__is(evsel, CYCLES_IN_TX)) {
		total = avg_stats(&runtime_cycles_stats[ctx][cpu]);
		if (total)
			print_metric(ctxp, NULL,
					"%7.2f%%", "transactional cycles",
					100.0 * (avg / total));
		else
			print_metric(ctxp, NULL, NULL, "transactional cycles",
				     0);
	} else if (perf_stat_evsel__is(evsel, CYCLES_IN_TX_CP)) {
		total = avg_stats(&runtime_cycles_stats[ctx][cpu]);
		total2 = avg_stats(&runtime_cycles_in_tx_stats[ctx][cpu]);
		if (total2 < avg)
			total2 = avg;
		if (total)
			print_metric(ctxp, NULL, "%7.2f%%", "aborted cycles",
				100.0 * ((total2-avg) / total));
		else
			print_metric(ctxp, NULL, NULL, "aborted cycles", 0);
	} else if (perf_stat_evsel__is(evsel, TRANSACTION_START)) {
		total = avg_stats(&runtime_cycles_in_tx_stats[ctx][cpu]);

		if (avg)
			ratio = total / avg;

		if (runtime_cycles_in_tx_stats[ctx][cpu].n != 0)
			print_metric(ctxp, NULL, "%8.0f",
				     "cycles / transaction", ratio);
		else
			print_metric(ctxp, NULL, NULL, "cycles / transaction",
				     0);
	} else if (perf_stat_evsel__is(evsel, ELISION_START)) {
		total = avg_stats(&runtime_cycles_in_tx_stats[ctx][cpu]);

		if (avg)
			ratio = total / avg;

		print_metric(ctxp, NULL, "%8.0f", "cycles / elision", ratio);
	} else if (perf_evsel__match(evsel, SOFTWARE, SW_TASK_CLOCK) ||
		   perf_evsel__match(evsel, SOFTWARE, SW_CPU_CLOCK)) {
		if ((ratio = avg_stats(&walltime_nsecs_stats)) != 0)
			print_metric(ctxp, NULL, "%8.3f", "CPUs utilized",
				     avg / ratio);
		else
			print_metric(ctxp, NULL, NULL, "CPUs utilized", 0);
	} else if (perf_stat_evsel__is(evsel, TOPDOWN_FETCH_BUBBLES)) {
		double fe_bound = td_fe_bound(ctx, cpu);

		if (fe_bound > 0.2)
			color = PERF_COLOR_RED;
		print_metric(ctxp, color, "%8.1f%%", "frontend bound",
				fe_bound * 100.);
	} else if (perf_stat_evsel__is(evsel, TOPDOWN_SLOTS_RETIRED)) {
		double retiring = td_retiring(ctx, cpu);

		if (retiring > 0.7)
			color = PERF_COLOR_GREEN;
		print_metric(ctxp, color, "%8.1f%%", "retiring",
				retiring * 100.);
	} else if (perf_stat_evsel__is(evsel, TOPDOWN_RECOVERY_BUBBLES)) {
		double bad_spec = td_bad_spec(ctx, cpu);

		if (bad_spec > 0.1)
			color = PERF_COLOR_RED;
		print_metric(ctxp, color, "%8.1f%%", "bad speculation",
				bad_spec * 100.);
	} else if (perf_stat_evsel__is(evsel, TOPDOWN_SLOTS_ISSUED)) {
		double be_bound = td_be_bound(ctx, cpu);
		const char *name = "backend bound";
		static int have_recovery_bubbles = -1;

		/* In case the CPU does not support topdown-recovery-bubbles */
		if (have_recovery_bubbles < 0)
			have_recovery_bubbles = pmu_have_event("cpu",
					"topdown-recovery-bubbles");
		if (!have_recovery_bubbles)
			name = "backend bound/bad spec";

		if (be_bound > 0.2)
			color = PERF_COLOR_RED;
		if (td_total_slots(ctx, cpu) > 0)
			print_metric(ctxp, color, "%8.1f%%", name,
					be_bound * 100.);
		else
			print_metric(ctxp, NULL, NULL, name, 0);
	} else if (evsel->metric_expr) {
		struct parse_ctx pctx;
		int i;

		expr__ctx_init(&pctx);
		expr__add_id(&pctx, evsel->name, avg);
		for (i = 0; evsel->metric_events[i]; i++) {
			struct saved_value *v;

			v = saved_value_lookup(evsel->metric_events[i], cpu, ctx, false);
			if (!v)
				break;
			expr__add_id(&pctx, evsel->metric_events[i]->name,
				     avg_stats(&v->stats));
		}
		if (!evsel->metric_events[i]) {
			const char *p = evsel->metric_expr;

			if (expr__parse(&ratio, &pctx, &p) == 0)
				print_metric(ctxp, NULL, "%8.1f",
					evsel->metric_name ?
					evsel->metric_name :
					out->force_header ? evsel->name : "",
					ratio);
			else
				print_metric(ctxp, NULL, NULL, "", 0);
		} else
			print_metric(ctxp, NULL, NULL, "", 0);
	} else if (runtime_nsecs_stats[cpu].n != 0) {
		char unit = 'M';
		char unit_buf[10];

		total = avg_stats(&runtime_nsecs_stats[cpu]);

		if (total)
			ratio = 1000.0 * avg / total;
		if (ratio < 0.001) {
			ratio *= 1000;
			unit = 'K';
		}
		snprintf(unit_buf, sizeof(unit_buf), "%c/sec", unit);
		print_metric(ctxp, NULL, "%8.3f", unit_buf, ratio);
	} else if (perf_stat_evsel__is(evsel, SMI_NUM)) {
		print_smi_cost(cpu, evsel, out);
	} else {
		print_metric(ctxp, NULL, NULL, NULL, 0);
	}
}
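
/*
 * Worked example for the Level 1 TopDown formulas documented above the
 * td_*() helpers (the numbers are purely illustrative): on a 4-wide CPU
 * measured over 1,000,000 cycles, TotalSlots = 4 * 1,000,000 = 4,000,000.
 * With SlotsIssued = 3,000,000, SlotsRetired = 2,400,000,
 * RecoveryBubbles = 200,000 and FetchBubbles = 400,000:
 *
 *   Retiring       = 2,400,000 / 4,000,000                       = 0.60
 *   BadSpeculation = ((3,000,000 - 2,400,000) + 200,000) / 4,000,000
 *                                                                = 0.20
 *   FrontendBound  = 400,000 / 4,000,000                         = 0.10
 *   BackendBound   = 1.0 - 0.20 - 0.60 - 0.10                    = 0.10
 *
 * which perf_stat__print_shadow_stats() above would report as 60% retiring,
 * 20% bad speculation, 10% frontend bound and 10% backend bound.
 */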