#include <stdio.h>
#include "evsel.h"
#include "stat.h"
#include "color.h"
#include "pmu.h"
#include "rblist.h"
#include "evlist.h"
#include "expr.h"

enum {
	CTX_BIT_USER	= 1 << 0,
	CTX_BIT_KERNEL	= 1 << 1,
	CTX_BIT_HV	= 1 << 2,
	CTX_BIT_HOST	= 1 << 3,
	CTX_BIT_IDLE	= 1 << 4,
	CTX_BIT_MAX	= 1 << 5,
};

#define NUM_CTX CTX_BIT_MAX

/*
 * AGGR_GLOBAL: Use CPU 0
 * AGGR_SOCKET: Use first CPU of socket
 * AGGR_CORE: Use first CPU of core
 * AGGR_NONE: Use matching CPU
 * AGGR_THREAD: Not supported?
 */
static struct stats runtime_nsecs_stats[MAX_NR_CPUS];
static struct stats runtime_cycles_stats[NUM_CTX][MAX_NR_CPUS];
static struct stats runtime_stalled_cycles_front_stats[NUM_CTX][MAX_NR_CPUS];
static struct stats runtime_stalled_cycles_back_stats[NUM_CTX][MAX_NR_CPUS];
static struct stats runtime_branches_stats[NUM_CTX][MAX_NR_CPUS];
static struct stats runtime_cacherefs_stats[NUM_CTX][MAX_NR_CPUS];
static struct stats runtime_l1_dcache_stats[NUM_CTX][MAX_NR_CPUS];
static struct stats runtime_l1_icache_stats[NUM_CTX][MAX_NR_CPUS];
static struct stats runtime_ll_cache_stats[NUM_CTX][MAX_NR_CPUS];
static struct stats runtime_itlb_cache_stats[NUM_CTX][MAX_NR_CPUS];
static struct stats runtime_dtlb_cache_stats[NUM_CTX][MAX_NR_CPUS];
static struct stats runtime_cycles_in_tx_stats[NUM_CTX][MAX_NR_CPUS];
static struct stats runtime_transaction_stats[NUM_CTX][MAX_NR_CPUS];
static struct stats runtime_elision_stats[NUM_CTX][MAX_NR_CPUS];
static struct stats runtime_topdown_total_slots[NUM_CTX][MAX_NR_CPUS];
static struct stats runtime_topdown_slots_issued[NUM_CTX][MAX_NR_CPUS];
static struct stats runtime_topdown_slots_retired[NUM_CTX][MAX_NR_CPUS];
static struct stats runtime_topdown_fetch_bubbles[NUM_CTX][MAX_NR_CPUS];
static struct stats runtime_topdown_recovery_bubbles[NUM_CTX][MAX_NR_CPUS];
static struct stats runtime_smi_num_stats[NUM_CTX][MAX_NR_CPUS];
static struct stats runtime_aperf_stats[NUM_CTX][MAX_NR_CPUS];
static struct rblist runtime_saved_values;
static bool have_frontend_stalled;

struct stats walltime_nsecs_stats;

struct saved_value {
	struct rb_node rb_node;
	struct perf_evsel *evsel;
	int cpu;
	int ctx;
	struct stats stats;
};

static int saved_value_cmp(struct rb_node *rb_node, const void *entry)
{
	struct saved_value *a = container_of(rb_node,
					     struct saved_value,
					     rb_node);
	const struct saved_value *b = entry;

	if (a->ctx != b->ctx)
		return a->ctx - b->ctx;
	if (a->cpu != b->cpu)
		return a->cpu - b->cpu;
	return a->evsel - b->evsel;
}

static struct rb_node *saved_value_new(struct rblist *rblist __maybe_unused,
				       const void *entry)
{
	struct saved_value *nd = malloc(sizeof(struct saved_value));

	if (!nd)
		return NULL;
	memcpy(nd, entry, sizeof(struct saved_value));
	return &nd->rb_node;
}

static struct saved_value *saved_value_lookup(struct perf_evsel *evsel,
					      int cpu, int ctx,
					      bool create)
{
	struct rb_node *nd;
	struct saved_value dm = {
		.cpu = cpu,
		.ctx = ctx,
		.evsel = evsel,
	};
	nd = rblist__find(&runtime_saved_values, &dm);
	if (nd)
		return container_of(nd, struct saved_value, rb_node);
	if (create) {
		rblist__add_node(&runtime_saved_values, &dm);
		nd = rblist__find(&runtime_saved_values, &dm);
		if (nd)
			return container_of(nd, struct saved_value, rb_node);
	}
	return NULL;
}
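/*
 * Illustrative note: perf_stat__update_shadow_stats() below calls
 * saved_value_lookup() with create == true to allocate and update the
 * per event/cpu/ctx running statistics, while the metric-expression
 * path in perf_stat__print_shadow_stats() calls it with create == false
 * and treats a NULL result as "this event was never collected".
 */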
void
perf_stat__init_shadow_stats(void)
{
	have_frontend_stalled = pmu_have_event("cpu", "stalled-cycles-frontend");
	rblist__init(&runtime_saved_values);
	runtime_saved_values.node_cmp = saved_value_cmp;
	runtime_saved_values.node_new = saved_value_new;
	/* No delete for now */
}

static int evsel_context(struct perf_evsel *evsel)
{
	int ctx = 0;

	if (evsel->attr.exclude_kernel)
		ctx |= CTX_BIT_KERNEL;
	if (evsel->attr.exclude_user)
		ctx |= CTX_BIT_USER;
	if (evsel->attr.exclude_hv)
		ctx |= CTX_BIT_HV;
	if (evsel->attr.exclude_host)
		ctx |= CTX_BIT_HOST;
	if (evsel->attr.exclude_idle)
		ctx |= CTX_BIT_IDLE;

	return ctx;
}

void perf_stat__reset_shadow_stats(void)
{
	struct rb_node *pos, *next;

	memset(runtime_nsecs_stats, 0, sizeof(runtime_nsecs_stats));
	memset(runtime_cycles_stats, 0, sizeof(runtime_cycles_stats));
	memset(runtime_stalled_cycles_front_stats, 0, sizeof(runtime_stalled_cycles_front_stats));
	memset(runtime_stalled_cycles_back_stats, 0, sizeof(runtime_stalled_cycles_back_stats));
	memset(runtime_branches_stats, 0, sizeof(runtime_branches_stats));
	memset(runtime_cacherefs_stats, 0, sizeof(runtime_cacherefs_stats));
	memset(runtime_l1_dcache_stats, 0, sizeof(runtime_l1_dcache_stats));
	memset(runtime_l1_icache_stats, 0, sizeof(runtime_l1_icache_stats));
	memset(runtime_ll_cache_stats, 0, sizeof(runtime_ll_cache_stats));
	memset(runtime_itlb_cache_stats, 0, sizeof(runtime_itlb_cache_stats));
	memset(runtime_dtlb_cache_stats, 0, sizeof(runtime_dtlb_cache_stats));
	memset(runtime_cycles_in_tx_stats, 0,
	       sizeof(runtime_cycles_in_tx_stats));
	memset(runtime_transaction_stats, 0,
	       sizeof(runtime_transaction_stats));
	memset(runtime_elision_stats, 0, sizeof(runtime_elision_stats));
	memset(&walltime_nsecs_stats, 0, sizeof(walltime_nsecs_stats));
	memset(runtime_topdown_total_slots, 0, sizeof(runtime_topdown_total_slots));
	memset(runtime_topdown_slots_retired, 0, sizeof(runtime_topdown_slots_retired));
	memset(runtime_topdown_slots_issued, 0, sizeof(runtime_topdown_slots_issued));
	memset(runtime_topdown_fetch_bubbles, 0, sizeof(runtime_topdown_fetch_bubbles));
	memset(runtime_topdown_recovery_bubbles, 0, sizeof(runtime_topdown_recovery_bubbles));
	memset(runtime_smi_num_stats, 0, sizeof(runtime_smi_num_stats));
	memset(runtime_aperf_stats, 0, sizeof(runtime_aperf_stats));

	next = rb_first(&runtime_saved_values.entries);
	while (next) {
		pos = next;
		next = rb_next(pos);
		memset(&container_of(pos, struct saved_value, rb_node)->stats,
		       0,
		       sizeof(struct stats));
	}
}
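/*
 * Illustrative example of the context encoding from evsel_context()
 * above: an event opened with attr.exclude_kernel and attr.exclude_hv
 * set is tracked at index CTX_BIT_KERNEL | CTX_BIT_HV (= 6), so counts
 * from differently-filtered instances of the same event never mix in
 * the runtime_*_stats arrays.
 */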
/*
 * Update various tracking values we maintain to print
 * more semantic information such as miss/hit ratios,
 * instruction rates, etc:
 */
void perf_stat__update_shadow_stats(struct perf_evsel *counter, u64 *count,
				    int cpu)
{
	int ctx = evsel_context(counter);

	if (perf_evsel__match(counter, SOFTWARE, SW_TASK_CLOCK) ||
	    perf_evsel__match(counter, SOFTWARE, SW_CPU_CLOCK))
		update_stats(&runtime_nsecs_stats[cpu], count[0]);
	else if (perf_evsel__match(counter, HARDWARE, HW_CPU_CYCLES))
		update_stats(&runtime_cycles_stats[ctx][cpu], count[0]);
	else if (perf_stat_evsel__is(counter, CYCLES_IN_TX))
		update_stats(&runtime_cycles_in_tx_stats[ctx][cpu], count[0]);
	else if (perf_stat_evsel__is(counter, TRANSACTION_START))
		update_stats(&runtime_transaction_stats[ctx][cpu], count[0]);
	else if (perf_stat_evsel__is(counter, ELISION_START))
		update_stats(&runtime_elision_stats[ctx][cpu], count[0]);
	else if (perf_stat_evsel__is(counter, TOPDOWN_TOTAL_SLOTS))
		update_stats(&runtime_topdown_total_slots[ctx][cpu], count[0]);
	else if (perf_stat_evsel__is(counter, TOPDOWN_SLOTS_ISSUED))
		update_stats(&runtime_topdown_slots_issued[ctx][cpu], count[0]);
	else if (perf_stat_evsel__is(counter, TOPDOWN_SLOTS_RETIRED))
		update_stats(&runtime_topdown_slots_retired[ctx][cpu], count[0]);
	else if (perf_stat_evsel__is(counter, TOPDOWN_FETCH_BUBBLES))
		update_stats(&runtime_topdown_fetch_bubbles[ctx][cpu], count[0]);
	else if (perf_stat_evsel__is(counter, TOPDOWN_RECOVERY_BUBBLES))
		update_stats(&runtime_topdown_recovery_bubbles[ctx][cpu], count[0]);
	else if (perf_evsel__match(counter, HARDWARE, HW_STALLED_CYCLES_FRONTEND))
		update_stats(&runtime_stalled_cycles_front_stats[ctx][cpu], count[0]);
	else if (perf_evsel__match(counter, HARDWARE, HW_STALLED_CYCLES_BACKEND))
		update_stats(&runtime_stalled_cycles_back_stats[ctx][cpu], count[0]);
	else if (perf_evsel__match(counter, HARDWARE, HW_BRANCH_INSTRUCTIONS))
		update_stats(&runtime_branches_stats[ctx][cpu], count[0]);
	else if (perf_evsel__match(counter, HARDWARE, HW_CACHE_REFERENCES))
		update_stats(&runtime_cacherefs_stats[ctx][cpu], count[0]);
	else if (perf_evsel__match(counter, HW_CACHE, HW_CACHE_L1D))
		update_stats(&runtime_l1_dcache_stats[ctx][cpu], count[0]);
	else if (perf_evsel__match(counter, HW_CACHE, HW_CACHE_L1I))
		update_stats(&runtime_l1_icache_stats[ctx][cpu], count[0]);
	else if (perf_evsel__match(counter, HW_CACHE, HW_CACHE_LL))
		update_stats(&runtime_ll_cache_stats[ctx][cpu], count[0]);
	else if (perf_evsel__match(counter, HW_CACHE, HW_CACHE_DTLB))
		update_stats(&runtime_dtlb_cache_stats[ctx][cpu], count[0]);
	else if (perf_evsel__match(counter, HW_CACHE, HW_CACHE_ITLB))
		update_stats(&runtime_itlb_cache_stats[ctx][cpu], count[0]);
	else if (perf_stat_evsel__is(counter, SMI_NUM))
		update_stats(&runtime_smi_num_stats[ctx][cpu], count[0]);
	else if (perf_stat_evsel__is(counter, APERF))
		update_stats(&runtime_aperf_stats[ctx][cpu], count[0]);

	if (counter->collect_stat) {
		struct saved_value *v = saved_value_lookup(counter, cpu, ctx,
							   true);
		update_stats(&v->stats, count[0]);
	}
}

/* used for get_ratio_color() */
enum grc_type {
	GRC_STALLED_CYCLES_FE,
	GRC_STALLED_CYCLES_BE,
	GRC_CACHE_MISSES,
	GRC_MAX_NR
};

static const char *get_ratio_color(enum grc_type type, double ratio)
{
	static const double grc_table[GRC_MAX_NR][3] = {
		[GRC_STALLED_CYCLES_FE] = { 50.0, 30.0, 10.0 },
		[GRC_STALLED_CYCLES_BE] = { 75.0, 50.0, 20.0 },
		[GRC_CACHE_MISSES]	= { 20.0, 10.0, 5.0 },
	};
	const char *color = PERF_COLOR_NORMAL;

	if (ratio > grc_table[type][0])
		color = PERF_COLOR_RED;
	else if (ratio > grc_table[type][1])
		color = PERF_COLOR_MAGENTA;
	else if (ratio > grc_table[type][2])
		color = PERF_COLOR_YELLOW;

	return color;
}
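/*
 * Worked example for get_ratio_color(): for GRC_CACHE_MISSES a ratio of
 * 12.5 (percent) exceeds the 10.0 threshold but not 20.0, so it prints
 * in magenta; ratios at or below 5.0 keep the normal color.
 */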
static struct perf_evsel *perf_stat__find_event(struct perf_evlist *evsel_list,
						const char *name)
{
	struct perf_evsel *c2;

	evlist__for_each_entry (evsel_list, c2) {
		if (!strcasecmp(c2->name, name))
			return c2;
	}
	return NULL;
}

/*
 * Find the other events referenced by each counter's MetricExpr, mark
 * them for stat collection and link them to the referencing counter.
 */
void perf_stat__collect_metric_expr(struct perf_evlist *evsel_list)
{
	struct perf_evsel *counter, *leader, **metric_events, *oc;
	bool found;
	const char **metric_names;
	int i;
	int num_metric_names;

	evlist__for_each_entry(evsel_list, counter) {
		bool invalid = false;

		leader = counter->leader;
		if (!counter->metric_expr)
			continue;
		metric_events = counter->metric_events;
		if (!metric_events) {
			if (expr__find_other(counter->metric_expr, counter->name,
					     &metric_names, &num_metric_names) < 0)
				continue;

			metric_events = calloc(num_metric_names + 1,
					       sizeof(struct perf_evsel *));
			if (!metric_events)
				return;
			counter->metric_events = metric_events;
		}

		for (i = 0; i < num_metric_names; i++) {
			found = false;
			if (leader) {
				/* Search in group */
				for_each_group_member (oc, leader) {
					if (!strcasecmp(oc->name, metric_names[i])) {
						found = true;
						break;
					}
				}
			}
			if (!found) {
				/* Search ignoring groups */
				oc = perf_stat__find_event(evsel_list, metric_names[i]);
			}
			if (!oc) {
				/* Deduping one is good enough to handle duplicated PMUs. */
				static char *printed;

				/*
				 * Adding events automatically would be difficult, because
				 * it would risk creating groups that are not schedulable.
				 * perf stat doesn't understand all the scheduling constraints
				 * of events. So we ask the user instead to add the missing
				 * events.
				 */
				if (!printed || strcasecmp(printed, metric_names[i])) {
					fprintf(stderr,
						"Add %s event to groups to get metric expression for %s\n",
						metric_names[i],
						counter->name);
					printed = strdup(metric_names[i]);
				}
				invalid = true;
				continue;
			}
			metric_events[i] = oc;
			oc->collect_stat = true;
		}
		metric_events[i] = NULL;
		free(metric_names);
		if (invalid) {
			free(metric_events);
			counter->metric_events = NULL;
			counter->metric_expr = NULL;
		}
	}
}
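/*
 * Note: the counter->metric_events array built above is NULL-terminated,
 * with one slot per name reported by expr__find_other() and in the same
 * order, which is why num_metric_names + 1 pointers are allocated.
 */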
static void print_stalled_cycles_frontend(int cpu,
					  struct perf_evsel *evsel, double avg,
					  struct perf_stat_output_ctx *out)
{
	double total, ratio = 0.0;
	const char *color;
	int ctx = evsel_context(evsel);

	total = avg_stats(&runtime_cycles_stats[ctx][cpu]);

	if (total)
		ratio = avg / total * 100.0;

	color = get_ratio_color(GRC_STALLED_CYCLES_FE, ratio);

	if (ratio)
		out->print_metric(out->ctx, color, "%7.2f%%", "frontend cycles idle",
				  ratio);
	else
		out->print_metric(out->ctx, NULL, NULL, "frontend cycles idle", 0);
}

static void print_stalled_cycles_backend(int cpu,
					 struct perf_evsel *evsel, double avg,
					 struct perf_stat_output_ctx *out)
{
	double total, ratio = 0.0;
	const char *color;
	int ctx = evsel_context(evsel);

	total = avg_stats(&runtime_cycles_stats[ctx][cpu]);

	if (total)
		ratio = avg / total * 100.0;

	color = get_ratio_color(GRC_STALLED_CYCLES_BE, ratio);

	out->print_metric(out->ctx, color, "%7.2f%%", "backend cycles idle", ratio);
}

static void print_branch_misses(int cpu,
				struct perf_evsel *evsel,
				double avg,
				struct perf_stat_output_ctx *out)
{
	double total, ratio = 0.0;
	const char *color;
	int ctx = evsel_context(evsel);

	total = avg_stats(&runtime_branches_stats[ctx][cpu]);

	if (total)
		ratio = avg / total * 100.0;

	color = get_ratio_color(GRC_CACHE_MISSES, ratio);

	out->print_metric(out->ctx, color, "%7.2f%%", "of all branches", ratio);
}

static void print_l1_dcache_misses(int cpu,
				   struct perf_evsel *evsel,
				   double avg,
				   struct perf_stat_output_ctx *out)
{
	double total, ratio = 0.0;
	const char *color;
	int ctx = evsel_context(evsel);

	total = avg_stats(&runtime_l1_dcache_stats[ctx][cpu]);

	if (total)
		ratio = avg / total * 100.0;

	color = get_ratio_color(GRC_CACHE_MISSES, ratio);

	out->print_metric(out->ctx, color, "%7.2f%%", "of all L1-dcache hits", ratio);
}

static void print_l1_icache_misses(int cpu,
				   struct perf_evsel *evsel,
				   double avg,
				   struct perf_stat_output_ctx *out)
{
	double total, ratio = 0.0;
	const char *color;
	int ctx = evsel_context(evsel);

	total = avg_stats(&runtime_l1_icache_stats[ctx][cpu]);

	if (total)
		ratio = avg / total * 100.0;

	color = get_ratio_color(GRC_CACHE_MISSES, ratio);
	out->print_metric(out->ctx, color, "%7.2f%%", "of all L1-icache hits", ratio);
}

static void print_dtlb_cache_misses(int cpu,
				    struct perf_evsel *evsel,
				    double avg,
				    struct perf_stat_output_ctx *out)
{
	double total, ratio = 0.0;
	const char *color;
	int ctx = evsel_context(evsel);

	total = avg_stats(&runtime_dtlb_cache_stats[ctx][cpu]);

	if (total)
		ratio = avg / total * 100.0;

	color = get_ratio_color(GRC_CACHE_MISSES, ratio);
	out->print_metric(out->ctx, color, "%7.2f%%", "of all dTLB cache hits", ratio);
}

static void print_itlb_cache_misses(int cpu,
				    struct perf_evsel *evsel,
				    double avg,
				    struct perf_stat_output_ctx *out)
{
	double total, ratio = 0.0;
	const char *color;
	int ctx = evsel_context(evsel);

	total = avg_stats(&runtime_itlb_cache_stats[ctx][cpu]);

	if (total)
		ratio = avg / total * 100.0;

	color = get_ratio_color(GRC_CACHE_MISSES, ratio);
	out->print_metric(out->ctx, color, "%7.2f%%", "of all iTLB cache hits", ratio);
}

static void print_ll_cache_misses(int cpu,
				  struct perf_evsel *evsel,
				  double avg,
				  struct perf_stat_output_ctx *out)
{
	double total, ratio = 0.0;
	const char *color;
	int ctx = evsel_context(evsel);

	total = avg_stats(&runtime_ll_cache_stats[ctx][cpu]);

	if (total)
		ratio = avg / total * 100.0;

	color = get_ratio_color(GRC_CACHE_MISSES, ratio);
	out->print_metric(out->ctx, color, "%7.2f%%", "of all LL-cache hits", ratio);
}

/*
 * High level "TopDown" CPU core pipeline bottleneck breakdown.
 *
 * Basic concept following
 * Yasin, "A Top-Down Method for Performance Analysis and Counters
 * Architecture", ISPASS 2014.
 *
 * The CPU pipeline is divided into 4 areas that can be bottlenecks:
 *
 * Frontend -> Backend -> Retiring
 * BadSpeculation in addition means out of order execution that is thrown away
 * (for example on branch mispredictions).
 * Frontend is instruction decoding.
 * Backend is execution, like computation and accessing data in memory.
 * Retiring is good execution that is not directly bottlenecked.
 *
 * The formulas are computed in slots.
 * A slot is an entry in the pipeline, one per unit of the pipeline width
 * (for example a 4-wide pipeline has 4 slots for each cycle).
 *
 * Formulas:
 * BadSpeculation = ((SlotsIssued - SlotsRetired) + RecoveryBubbles) /
 *			TotalSlots
 * Retiring = SlotsRetired / TotalSlots
 * FrontendBound = FetchBubbles / TotalSlots
 * BackendBound = 1.0 - BadSpeculation - Retiring - FrontendBound
 *
 * The kernel provides the mapping to the low level CPU events and any scaling
 * needed for the CPU pipeline width, for example:
 *
 * TotalSlots = Cycles * 4
 *
 * The scaling factor is communicated in the sysfs unit.
 *
 * In some cases the CPU may not be able to measure all the formulas due to
 * missing events. In this case multiple formulas are combined where possible.
 *
 * Full TopDown supports more levels to sub-divide each area: for example
 * BackendBound into computing bound and memory bound. For now we only
 * support Level 1 TopDown.
 */
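/*
 * Worked Level 1 example (illustrative numbers): with TotalSlots = 4000,
 * SlotsIssued = 2600, SlotsRetired = 2000, RecoveryBubbles = 200 and
 * FetchBubbles = 800:
 *
 *   BadSpeculation = (2600 - 2000 + 200) / 4000  = 0.20
 *   Retiring       = 2000 / 4000                 = 0.50
 *   FrontendBound  = 800 / 4000                  = 0.20
 *   BackendBound   = 1.0 - 0.20 - 0.50 - 0.20    = 0.10
 */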
static double sanitize_val(double x)
{
	if (x < 0 && x >= -0.02)
		return 0.0;
	return x;
}

static double td_total_slots(int ctx, int cpu)
{
	return avg_stats(&runtime_topdown_total_slots[ctx][cpu]);
}

static double td_bad_spec(int ctx, int cpu)
{
	double bad_spec = 0;
	double total_slots;
	double total;

	total = avg_stats(&runtime_topdown_slots_issued[ctx][cpu]) -
		avg_stats(&runtime_topdown_slots_retired[ctx][cpu]) +
		avg_stats(&runtime_topdown_recovery_bubbles[ctx][cpu]);
	total_slots = td_total_slots(ctx, cpu);
	if (total_slots)
		bad_spec = total / total_slots;
	return sanitize_val(bad_spec);
}

static double td_retiring(int ctx, int cpu)
{
	double retiring = 0;
	double total_slots = td_total_slots(ctx, cpu);
	double ret_slots = avg_stats(&runtime_topdown_slots_retired[ctx][cpu]);

	if (total_slots)
		retiring = ret_slots / total_slots;
	return retiring;
}

static double td_fe_bound(int ctx, int cpu)
{
	double fe_bound = 0;
	double total_slots = td_total_slots(ctx, cpu);
	double fetch_bub = avg_stats(&runtime_topdown_fetch_bubbles[ctx][cpu]);

	if (total_slots)
		fe_bound = fetch_bub / total_slots;
	return fe_bound;
}

static double td_be_bound(int ctx, int cpu)
{
	double sum = (td_fe_bound(ctx, cpu) +
		      td_bad_spec(ctx, cpu) +
		      td_retiring(ctx, cpu));
	if (sum == 0)
		return 0;
	return sanitize_val(1.0 - sum);
}

static void print_smi_cost(int cpu, struct perf_evsel *evsel,
			   struct perf_stat_output_ctx *out)
{
	double smi_num, aperf, cycles, cost = 0.0;
	int ctx = evsel_context(evsel);
	const char *color = NULL;

	smi_num = avg_stats(&runtime_smi_num_stats[ctx][cpu]);
	aperf = avg_stats(&runtime_aperf_stats[ctx][cpu]);
	cycles = avg_stats(&runtime_cycles_stats[ctx][cpu]);

	if ((cycles == 0) || (aperf == 0))
		return;

	if (smi_num)
		cost = (aperf - cycles) / aperf * 100.00;

	if (cost > 10)
		color = PERF_COLOR_RED;
	out->print_metric(out->ctx, color, "%8.1f%%", "SMI cycles%", cost);
	out->print_metric(out->ctx, NULL, "%4.0f", "SMI#", smi_num);
}
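/*
 * Illustrative example for print_smi_cost() above, assuming APERF keeps
 * counting while the cycles counter is frozen during SMM: if APERF saw
 * 1000 cycles but the cycles event only 900, the reported cost is
 * (1000 - 900) / 1000 * 100 = 10.0% SMI cycles, and anything above 10%
 * is flagged in red.
 */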
void perf_stat__print_shadow_stats(struct perf_evsel *evsel,
				   double avg, int cpu,
				   struct perf_stat_output_ctx *out)
{
	void *ctxp = out->ctx;
	print_metric_t print_metric = out->print_metric;
	double total, ratio = 0.0, total2;
	const char *color = NULL;
	int ctx = evsel_context(evsel);

	if (perf_evsel__match(evsel, HARDWARE, HW_INSTRUCTIONS)) {
		total = avg_stats(&runtime_cycles_stats[ctx][cpu]);
		if (total) {
			ratio = avg / total;
			print_metric(ctxp, NULL, "%7.2f ",
				     "insn per cycle", ratio);
		} else {
			print_metric(ctxp, NULL, NULL, "insn per cycle", 0);
		}
		total = avg_stats(&runtime_stalled_cycles_front_stats[ctx][cpu]);
		total = max(total, avg_stats(&runtime_stalled_cycles_back_stats[ctx][cpu]));

		if (total && avg) {
			out->new_line(ctxp);
			ratio = total / avg;
			print_metric(ctxp, NULL, "%7.2f ",
				     "stalled cycles per insn",
				     ratio);
		} else if (have_frontend_stalled) {
			print_metric(ctxp, NULL, NULL,
				     "stalled cycles per insn", 0);
		}
	} else if (perf_evsel__match(evsel, HARDWARE, HW_BRANCH_MISSES)) {
		if (runtime_branches_stats[ctx][cpu].n != 0)
			print_branch_misses(cpu, evsel, avg, out);
		else
			print_metric(ctxp, NULL, NULL, "of all branches", 0);
	} else if (
		evsel->attr.type == PERF_TYPE_HW_CACHE &&
		evsel->attr.config == (PERF_COUNT_HW_CACHE_L1D |
				       ((PERF_COUNT_HW_CACHE_OP_READ) << 8) |
				       ((PERF_COUNT_HW_CACHE_RESULT_MISS) << 16))) {
		if (runtime_l1_dcache_stats[ctx][cpu].n != 0)
			print_l1_dcache_misses(cpu, evsel, avg, out);
		else
			print_metric(ctxp, NULL, NULL, "of all L1-dcache hits", 0);
	} else if (
		evsel->attr.type == PERF_TYPE_HW_CACHE &&
		evsel->attr.config == (PERF_COUNT_HW_CACHE_L1I |
				       ((PERF_COUNT_HW_CACHE_OP_READ) << 8) |
				       ((PERF_COUNT_HW_CACHE_RESULT_MISS) << 16))) {
		if (runtime_l1_icache_stats[ctx][cpu].n != 0)
			print_l1_icache_misses(cpu, evsel, avg, out);
		else
			print_metric(ctxp, NULL, NULL, "of all L1-icache hits", 0);
	} else if (
		evsel->attr.type == PERF_TYPE_HW_CACHE &&
		evsel->attr.config == (PERF_COUNT_HW_CACHE_DTLB |
				       ((PERF_COUNT_HW_CACHE_OP_READ) << 8) |
				       ((PERF_COUNT_HW_CACHE_RESULT_MISS) << 16))) {
		if (runtime_dtlb_cache_stats[ctx][cpu].n != 0)
			print_dtlb_cache_misses(cpu, evsel, avg, out);
		else
			print_metric(ctxp, NULL, NULL, "of all dTLB cache hits", 0);
	} else if (
		evsel->attr.type == PERF_TYPE_HW_CACHE &&
		evsel->attr.config == (PERF_COUNT_HW_CACHE_ITLB |
				       ((PERF_COUNT_HW_CACHE_OP_READ) << 8) |
				       ((PERF_COUNT_HW_CACHE_RESULT_MISS) << 16))) {
		if (runtime_itlb_cache_stats[ctx][cpu].n != 0)
			print_itlb_cache_misses(cpu, evsel, avg, out);
		else
			print_metric(ctxp, NULL, NULL, "of all iTLB cache hits", 0);
	} else if (
		evsel->attr.type == PERF_TYPE_HW_CACHE &&
		evsel->attr.config == (PERF_COUNT_HW_CACHE_LL |
				       ((PERF_COUNT_HW_CACHE_OP_READ) << 8) |
				       ((PERF_COUNT_HW_CACHE_RESULT_MISS) << 16))) {
		if (runtime_ll_cache_stats[ctx][cpu].n != 0)
			print_ll_cache_misses(cpu, evsel, avg, out);
		else
			print_metric(ctxp, NULL, NULL, "of all LL-cache hits", 0);
	} else if (perf_evsel__match(evsel, HARDWARE, HW_CACHE_MISSES)) {
		total = avg_stats(&runtime_cacherefs_stats[ctx][cpu]);

		if (total)
			ratio = avg * 100 / total;

		if (runtime_cacherefs_stats[ctx][cpu].n != 0)
			print_metric(ctxp, NULL, "%8.3f %%",
				     "of all cache refs", ratio);
		else
			print_metric(ctxp, NULL, NULL, "of all cache refs", 0);
	} else if (perf_evsel__match(evsel, HARDWARE, HW_STALLED_CYCLES_FRONTEND)) {
		print_stalled_cycles_frontend(cpu, evsel, avg, out);
	} else if (perf_evsel__match(evsel, HARDWARE, HW_STALLED_CYCLES_BACKEND)) {
		print_stalled_cycles_backend(cpu, evsel, avg, out);
	} else if (perf_evsel__match(evsel, HARDWARE, HW_CPU_CYCLES)) {
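		/*
		 * runtime_nsecs_stats is filled from the task-clock or
		 * cpu-clock counts, which are in nanoseconds, so
		 * cycles/nsec is GHz directly (illustrative: 3.2e9
		 * cycles over 1e9 ns prints as "3.200 GHz").
		 */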
		total = avg_stats(&runtime_nsecs_stats[cpu]);

		if (total) {
			ratio = avg / total;
			print_metric(ctxp, NULL, "%8.3f", "GHz", ratio);
		} else {
			print_metric(ctxp, NULL, NULL, "GHz", 0);
		}
	} else if (perf_stat_evsel__is(evsel, CYCLES_IN_TX)) {
		total = avg_stats(&runtime_cycles_stats[ctx][cpu]);
		if (total)
			print_metric(ctxp, NULL,
				     "%7.2f%%", "transactional cycles",
				     100.0 * (avg / total));
		else
			print_metric(ctxp, NULL, NULL, "transactional cycles",
				     0);
	} else if (perf_stat_evsel__is(evsel, CYCLES_IN_TX_CP)) {
		total = avg_stats(&runtime_cycles_stats[ctx][cpu]);
		total2 = avg_stats(&runtime_cycles_in_tx_stats[ctx][cpu]);
		if (total2 < avg)
			total2 = avg;
		if (total)
			print_metric(ctxp, NULL, "%7.2f%%", "aborted cycles",
				     100.0 * ((total2 - avg) / total));
		else
			print_metric(ctxp, NULL, NULL, "aborted cycles", 0);
	} else if (perf_stat_evsel__is(evsel, TRANSACTION_START)) {
		total = avg_stats(&runtime_cycles_in_tx_stats[ctx][cpu]);

		if (avg)
			ratio = total / avg;

		if (runtime_cycles_in_tx_stats[ctx][cpu].n != 0)
			print_metric(ctxp, NULL, "%8.0f",
				     "cycles / transaction", ratio);
		else
			print_metric(ctxp, NULL, NULL, "cycles / transaction",
				     0);
	} else if (perf_stat_evsel__is(evsel, ELISION_START)) {
		total = avg_stats(&runtime_cycles_in_tx_stats[ctx][cpu]);

		if (avg)
			ratio = total / avg;

		print_metric(ctxp, NULL, "%8.0f", "cycles / elision", ratio);
	} else if (perf_evsel__match(evsel, SOFTWARE, SW_TASK_CLOCK) ||
		   perf_evsel__match(evsel, SOFTWARE, SW_CPU_CLOCK)) {
		if ((ratio = avg_stats(&walltime_nsecs_stats)) != 0)
			print_metric(ctxp, NULL, "%8.3f", "CPUs utilized",
				     avg / ratio);
		else
			print_metric(ctxp, NULL, NULL, "CPUs utilized", 0);
	} else if (perf_stat_evsel__is(evsel, TOPDOWN_FETCH_BUBBLES)) {
		double fe_bound = td_fe_bound(ctx, cpu);

		if (fe_bound > 0.2)
			color = PERF_COLOR_RED;
		print_metric(ctxp, color, "%8.1f%%", "frontend bound",
			     fe_bound * 100.);
	} else if (perf_stat_evsel__is(evsel, TOPDOWN_SLOTS_RETIRED)) {
		double retiring = td_retiring(ctx, cpu);

		if (retiring > 0.7)
			color = PERF_COLOR_GREEN;
		print_metric(ctxp, color, "%8.1f%%", "retiring",
			     retiring * 100.);
	} else if (perf_stat_evsel__is(evsel, TOPDOWN_RECOVERY_BUBBLES)) {
		double bad_spec = td_bad_spec(ctx, cpu);

		if (bad_spec > 0.1)
			color = PERF_COLOR_RED;
		print_metric(ctxp, color, "%8.1f%%", "bad speculation",
			     bad_spec * 100.);
	} else if (perf_stat_evsel__is(evsel, TOPDOWN_SLOTS_ISSUED)) {
		double be_bound = td_be_bound(ctx, cpu);
		const char *name = "backend bound";
		static int have_recovery_bubbles = -1;

		/* In case the CPU does not support topdown-recovery-bubbles */
		if (have_recovery_bubbles < 0)
			have_recovery_bubbles = pmu_have_event("cpu",
					"topdown-recovery-bubbles");
		if (!have_recovery_bubbles)
			name = "backend bound/bad spec";

		if (be_bound > 0.2)
			color = PERF_COLOR_RED;
		if (td_total_slots(ctx, cpu) > 0)
			print_metric(ctxp, color, "%8.1f%%", name,
				     be_bound * 100.);
		else
			print_metric(ctxp, NULL, NULL, name, 0);
	} else if (evsel->metric_expr) {
		struct parse_ctx pctx;
		int i;

		expr__ctx_init(&pctx);
		expr__add_id(&pctx, evsel->name, avg);
		for (i = 0; evsel->metric_events[i]; i++) {
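			/*
			 * Resolve each referenced event's saved value;
			 * bail out if one was never collected so the
			 * expression is not evaluated with missing
			 * inputs (the !metric_events[i] check below
			 * distinguishes the two cases).
			 */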
			struct saved_value *v;

			v = saved_value_lookup(evsel->metric_events[i], cpu, ctx, false);
			if (!v)
				break;
			expr__add_id(&pctx, evsel->metric_events[i]->name,
				     avg_stats(&v->stats));
		}
		if (!evsel->metric_events[i]) {
			const char *p = evsel->metric_expr;

			if (expr__parse(&ratio, &pctx, &p) == 0)
				print_metric(ctxp, NULL, "%8.1f",
					     evsel->metric_name ?
					     evsel->metric_name :
					     out->force_header ? evsel->name : "",
					     ratio);
			else
				print_metric(ctxp, NULL, NULL, "", 0);
		} else
			print_metric(ctxp, NULL, NULL, "", 0);
	} else if (runtime_nsecs_stats[cpu].n != 0) {
		char unit = 'M';
		char unit_buf[10];

		total = avg_stats(&runtime_nsecs_stats[cpu]);

		if (total)
			ratio = 1000.0 * avg / total;
		if (ratio < 0.001) {
			ratio *= 1000;
			unit = 'K';
		}
		snprintf(unit_buf, sizeof(unit_buf), "%c/sec", unit);
		print_metric(ctxp, NULL, "%8.3f", unit_buf, ratio);
	} else if (perf_stat_evsel__is(evsel, SMI_NUM)) {
		print_smi_cost(cpu, evsel, out);
	} else {
		print_metric(ctxp, NULL, NULL, NULL, 0);
	}
}