#include <stdio.h>
#include "evsel.h"
#include "stat.h"
#include "color.h"
#include "pmu.h"
#include "rblist.h"
#include "evlist.h"
#include "expr.h"
#include "metricgroup.h"

enum {
	CTX_BIT_USER	= 1 << 0,
	CTX_BIT_KERNEL	= 1 << 1,
	CTX_BIT_HV	= 1 << 2,
	CTX_BIT_HOST	= 1 << 3,
	CTX_BIT_IDLE	= 1 << 4,
	CTX_BIT_MAX	= 1 << 5,
};

#define NUM_CTX CTX_BIT_MAX

/*
 * AGGR_GLOBAL: Use CPU 0
 * AGGR_SOCKET: Use first CPU of socket
 * AGGR_CORE: Use first CPU of core
 * AGGR_NONE: Use matching CPU
 * AGGR_THREAD: Not supported?
 */
static struct stats runtime_nsecs_stats[MAX_NR_CPUS];
static struct stats runtime_cycles_stats[NUM_CTX][MAX_NR_CPUS];
static struct stats runtime_stalled_cycles_front_stats[NUM_CTX][MAX_NR_CPUS];
static struct stats runtime_stalled_cycles_back_stats[NUM_CTX][MAX_NR_CPUS];
static struct stats runtime_branches_stats[NUM_CTX][MAX_NR_CPUS];
static struct stats runtime_cacherefs_stats[NUM_CTX][MAX_NR_CPUS];
static struct stats runtime_l1_dcache_stats[NUM_CTX][MAX_NR_CPUS];
static struct stats runtime_l1_icache_stats[NUM_CTX][MAX_NR_CPUS];
static struct stats runtime_ll_cache_stats[NUM_CTX][MAX_NR_CPUS];
static struct stats runtime_itlb_cache_stats[NUM_CTX][MAX_NR_CPUS];
static struct stats runtime_dtlb_cache_stats[NUM_CTX][MAX_NR_CPUS];
static struct stats runtime_cycles_in_tx_stats[NUM_CTX][MAX_NR_CPUS];
static struct stats runtime_transaction_stats[NUM_CTX][MAX_NR_CPUS];
static struct stats runtime_elision_stats[NUM_CTX][MAX_NR_CPUS];
static struct stats runtime_topdown_total_slots[NUM_CTX][MAX_NR_CPUS];
static struct stats runtime_topdown_slots_issued[NUM_CTX][MAX_NR_CPUS];
static struct stats runtime_topdown_slots_retired[NUM_CTX][MAX_NR_CPUS];
static struct stats runtime_topdown_fetch_bubbles[NUM_CTX][MAX_NR_CPUS];
static struct stats runtime_topdown_recovery_bubbles[NUM_CTX][MAX_NR_CPUS];
static struct stats runtime_smi_num_stats[NUM_CTX][MAX_NR_CPUS];
static struct stats runtime_aperf_stats[NUM_CTX][MAX_NR_CPUS];
static struct rblist runtime_saved_values;
static bool have_frontend_stalled;

struct stats walltime_nsecs_stats;

struct saved_value {
	struct rb_node rb_node;
	struct perf_evsel *evsel;
	int cpu;
	struct stats stats;
};

static int saved_value_cmp(struct rb_node *rb_node, const void *entry)
{
	struct saved_value *a = container_of(rb_node,
					     struct saved_value,
					     rb_node);
	const struct saved_value *b = entry;

	if (a->cpu != b->cpu)
		return a->cpu - b->cpu;
	if (a->evsel == b->evsel)
		return 0;
	if ((char *)a->evsel < (char *)b->evsel)
		return -1;
	return +1;
}

static struct rb_node *saved_value_new(struct rblist *rblist __maybe_unused,
				       const void *entry)
{
	struct saved_value *nd = malloc(sizeof(struct saved_value));

	if (!nd)
		return NULL;
	memcpy(nd, entry, sizeof(struct saved_value));
	return &nd->rb_node;
}

static struct saved_value *saved_value_lookup(struct perf_evsel *evsel,
					      int cpu,
					      bool create)
{
	struct rb_node *nd;
	struct saved_value dm = {
		.cpu = cpu,
		.evsel = evsel,
	};
	nd = rblist__find(&runtime_saved_values, &dm);
	if (nd)
		return container_of(nd, struct saved_value, rb_node);
	if (create) {
		rblist__add_node(&runtime_saved_values, &dm);
		nd = rblist__find(&runtime_saved_values, &dm);
		if (nd)
			return container_of(nd, struct saved_value, rb_node);
	}
	return NULL;
}
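
/*
 * The saved_value helpers above keep one struct stats per (evsel, cpu) pair
 * in runtime_saved_values.  perf_stat__update_shadow_stats() below creates
 * entries on demand via saved_value_lookup(counter, cpu, true), while
 * generic_metric() only reads them back with create == false.
 */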

void perf_stat__init_shadow_stats(void)
{
	have_frontend_stalled = pmu_have_event("cpu", "stalled-cycles-frontend");
	rblist__init(&runtime_saved_values);
	runtime_saved_values.node_cmp = saved_value_cmp;
	runtime_saved_values.node_new = saved_value_new;
	/* No delete for now */
}

static int evsel_context(struct perf_evsel *evsel)
{
	int ctx = 0;

	if (evsel->attr.exclude_kernel)
		ctx |= CTX_BIT_KERNEL;
	if (evsel->attr.exclude_user)
		ctx |= CTX_BIT_USER;
	if (evsel->attr.exclude_hv)
		ctx |= CTX_BIT_HV;
	if (evsel->attr.exclude_host)
		ctx |= CTX_BIT_HOST;
	if (evsel->attr.exclude_idle)
		ctx |= CTX_BIT_IDLE;

	return ctx;
}

void perf_stat__reset_shadow_stats(void)
{
	struct rb_node *pos, *next;

	memset(runtime_nsecs_stats, 0, sizeof(runtime_nsecs_stats));
	memset(runtime_cycles_stats, 0, sizeof(runtime_cycles_stats));
	memset(runtime_stalled_cycles_front_stats, 0, sizeof(runtime_stalled_cycles_front_stats));
	memset(runtime_stalled_cycles_back_stats, 0, sizeof(runtime_stalled_cycles_back_stats));
	memset(runtime_branches_stats, 0, sizeof(runtime_branches_stats));
	memset(runtime_cacherefs_stats, 0, sizeof(runtime_cacherefs_stats));
	memset(runtime_l1_dcache_stats, 0, sizeof(runtime_l1_dcache_stats));
	memset(runtime_l1_icache_stats, 0, sizeof(runtime_l1_icache_stats));
	memset(runtime_ll_cache_stats, 0, sizeof(runtime_ll_cache_stats));
	memset(runtime_itlb_cache_stats, 0, sizeof(runtime_itlb_cache_stats));
	memset(runtime_dtlb_cache_stats, 0, sizeof(runtime_dtlb_cache_stats));
	memset(runtime_cycles_in_tx_stats, 0,
	       sizeof(runtime_cycles_in_tx_stats));
	memset(runtime_transaction_stats, 0,
	       sizeof(runtime_transaction_stats));
	memset(runtime_elision_stats, 0, sizeof(runtime_elision_stats));
	memset(&walltime_nsecs_stats, 0, sizeof(walltime_nsecs_stats));
	memset(runtime_topdown_total_slots, 0, sizeof(runtime_topdown_total_slots));
	memset(runtime_topdown_slots_retired, 0, sizeof(runtime_topdown_slots_retired));
	memset(runtime_topdown_slots_issued, 0, sizeof(runtime_topdown_slots_issued));
	memset(runtime_topdown_fetch_bubbles, 0, sizeof(runtime_topdown_fetch_bubbles));
	memset(runtime_topdown_recovery_bubbles, 0, sizeof(runtime_topdown_recovery_bubbles));
	memset(runtime_smi_num_stats, 0, sizeof(runtime_smi_num_stats));
	memset(runtime_aperf_stats, 0, sizeof(runtime_aperf_stats));

	next = rb_first(&runtime_saved_values.entries);
	while (next) {
		pos = next;
		next = rb_next(pos);
		memset(&container_of(pos, struct saved_value, rb_node)->stats,
		       0,
		       sizeof(struct stats));
	}
}

/*
 * Update various tracking values we maintain to print
 * more semantic information such as miss/hit ratios,
 * instruction rates, etc:
 */
void perf_stat__update_shadow_stats(struct perf_evsel *counter, u64 *count,
				    int cpu)
{
	int ctx = evsel_context(counter);

	if (perf_evsel__match(counter, SOFTWARE, SW_TASK_CLOCK) ||
	    perf_evsel__match(counter, SOFTWARE, SW_CPU_CLOCK))
		update_stats(&runtime_nsecs_stats[cpu], count[0]);
	else if (perf_evsel__match(counter, HARDWARE, HW_CPU_CYCLES))
		update_stats(&runtime_cycles_stats[ctx][cpu], count[0]);
	else if (perf_stat_evsel__is(counter, CYCLES_IN_TX))
		update_stats(&runtime_cycles_in_tx_stats[ctx][cpu], count[0]);
	else if (perf_stat_evsel__is(counter, TRANSACTION_START))
		update_stats(&runtime_transaction_stats[ctx][cpu], count[0]);
	else if (perf_stat_evsel__is(counter, ELISION_START))
		update_stats(&runtime_elision_stats[ctx][cpu], count[0]);
	else if (perf_stat_evsel__is(counter, TOPDOWN_TOTAL_SLOTS))
		update_stats(&runtime_topdown_total_slots[ctx][cpu], count[0]);
	else if (perf_stat_evsel__is(counter, TOPDOWN_SLOTS_ISSUED))
		update_stats(&runtime_topdown_slots_issued[ctx][cpu], count[0]);
	else if (perf_stat_evsel__is(counter, TOPDOWN_SLOTS_RETIRED))
		update_stats(&runtime_topdown_slots_retired[ctx][cpu], count[0]);
	else if (perf_stat_evsel__is(counter, TOPDOWN_FETCH_BUBBLES))
		update_stats(&runtime_topdown_fetch_bubbles[ctx][cpu], count[0]);
	else if (perf_stat_evsel__is(counter, TOPDOWN_RECOVERY_BUBBLES))
		update_stats(&runtime_topdown_recovery_bubbles[ctx][cpu], count[0]);
	else if (perf_evsel__match(counter, HARDWARE, HW_STALLED_CYCLES_FRONTEND))
		update_stats(&runtime_stalled_cycles_front_stats[ctx][cpu], count[0]);
	else if (perf_evsel__match(counter, HARDWARE, HW_STALLED_CYCLES_BACKEND))
		update_stats(&runtime_stalled_cycles_back_stats[ctx][cpu], count[0]);
	else if (perf_evsel__match(counter, HARDWARE, HW_BRANCH_INSTRUCTIONS))
		update_stats(&runtime_branches_stats[ctx][cpu], count[0]);
	else if (perf_evsel__match(counter, HARDWARE, HW_CACHE_REFERENCES))
		update_stats(&runtime_cacherefs_stats[ctx][cpu], count[0]);
	else if (perf_evsel__match(counter, HW_CACHE, HW_CACHE_L1D))
		update_stats(&runtime_l1_dcache_stats[ctx][cpu], count[0]);
	else if (perf_evsel__match(counter, HW_CACHE, HW_CACHE_L1I))
		update_stats(&runtime_l1_icache_stats[ctx][cpu], count[0]);
	else if (perf_evsel__match(counter, HW_CACHE, HW_CACHE_LL))
		update_stats(&runtime_ll_cache_stats[ctx][cpu], count[0]);
	else if (perf_evsel__match(counter, HW_CACHE, HW_CACHE_DTLB))
		update_stats(&runtime_dtlb_cache_stats[ctx][cpu], count[0]);
	else if (perf_evsel__match(counter, HW_CACHE, HW_CACHE_ITLB))
		update_stats(&runtime_itlb_cache_stats[ctx][cpu], count[0]);
	else if (perf_stat_evsel__is(counter, SMI_NUM))
		update_stats(&runtime_smi_num_stats[ctx][cpu], count[0]);
	else if (perf_stat_evsel__is(counter, APERF))
		update_stats(&runtime_aperf_stats[ctx][cpu], count[0]);

	if (counter->collect_stat) {
		struct saved_value *v = saved_value_lookup(counter, cpu, true);
		update_stats(&v->stats, count[0]);
	}
}

/* used for get_ratio_color() */
enum grc_type {
	GRC_STALLED_CYCLES_FE,
	GRC_STALLED_CYCLES_BE,
	GRC_CACHE_MISSES,
	GRC_MAX_NR
};

static const char *get_ratio_color(enum grc_type type, double ratio)
{
	static const double grc_table[GRC_MAX_NR][3] = {
		[GRC_STALLED_CYCLES_FE] = { 50.0, 30.0, 10.0 },
		[GRC_STALLED_CYCLES_BE] = { 75.0, 50.0, 20.0 },
		[GRC_CACHE_MISSES]	= { 20.0, 10.0, 5.0 },
	};
	const char *color = PERF_COLOR_NORMAL;

	if (ratio > grc_table[type][0])
		color = PERF_COLOR_RED;
	else if (ratio > grc_table[type][1])
		color = PERF_COLOR_MAGENTA;
	else if (ratio > grc_table[type][2])
		color = PERF_COLOR_YELLOW;

	return color;
}

static struct perf_evsel *perf_stat__find_event(struct perf_evlist *evsel_list,
						const char *name)
{
	struct perf_evsel *c2;

	evlist__for_each_entry (evsel_list, c2) {
		if (!strcasecmp(c2->name, name))
			return c2;
	}
	return NULL;
}

/*
 * Mark the events that a MetricExpr refers to and link the events that use
 * them back to those target events.
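 *
 * For instance, with a (hypothetical) MetricExpr of
 * "inst_retired.any / cpu_clk_unhalted.thread" attached to the
 * inst_retired.any event, the one other referenced event is
 * cpu_clk_unhalted.thread; it has to be counted as well (in the same group
 * or elsewhere in the evlist) so that its saved value can later be fed to
 * expr__parse() by generic_metric().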
 */
void perf_stat__collect_metric_expr(struct perf_evlist *evsel_list)
{
	struct perf_evsel *counter, *leader, **metric_events, *oc;
	bool found;
	const char **metric_names;
	int i;
	int num_metric_names;

	evlist__for_each_entry(evsel_list, counter) {
		bool invalid = false;

		leader = counter->leader;
		if (!counter->metric_expr)
			continue;
		metric_events = counter->metric_events;
		if (!metric_events) {
			if (expr__find_other(counter->metric_expr, counter->name,
					     &metric_names, &num_metric_names) < 0)
				continue;

			metric_events = calloc(sizeof(struct perf_evsel *),
					       num_metric_names + 1);
			if (!metric_events)
				return;
			counter->metric_events = metric_events;
		}

		for (i = 0; i < num_metric_names; i++) {
			found = false;
			if (leader) {
				/* Search in group */
				for_each_group_member (oc, leader) {
					if (!strcasecmp(oc->name, metric_names[i])) {
						found = true;
						break;
					}
				}
			}
			if (!found) {
				/* Search ignoring groups */
				oc = perf_stat__find_event(evsel_list, metric_names[i]);
			}
			if (!oc) {
				/* Deduping one is good enough to handle duplicated PMUs. */
				static char *printed;

				/*
				 * Adding events automatically would be difficult, because
				 * it would risk creating groups that are not schedulable.
				 * perf stat doesn't understand all the scheduling constraints
				 * of events. So we ask the user instead to add the missing
				 * events.
				 */
				if (!printed || strcasecmp(printed, metric_names[i])) {
					fprintf(stderr,
						"Add %s event to groups to get metric expression for %s\n",
						metric_names[i],
						counter->name);
					printed = strdup(metric_names[i]);
				}
				invalid = true;
				continue;
			}
			metric_events[i] = oc;
			oc->collect_stat = true;
		}
		metric_events[i] = NULL;
		free(metric_names);
		if (invalid) {
			free(metric_events);
			counter->metric_events = NULL;
			counter->metric_expr = NULL;
		}
	}
}

static void print_stalled_cycles_frontend(int cpu,
					  struct perf_evsel *evsel, double avg,
					  struct perf_stat_output_ctx *out)
{
	double total, ratio = 0.0;
	const char *color;
	int ctx = evsel_context(evsel);

	total = avg_stats(&runtime_cycles_stats[ctx][cpu]);

	if (total)
		ratio = avg / total * 100.0;

	color = get_ratio_color(GRC_STALLED_CYCLES_FE, ratio);

	if (ratio)
		out->print_metric(out->ctx, color, "%7.2f%%", "frontend cycles idle",
				  ratio);
	else
		out->print_metric(out->ctx, NULL, NULL, "frontend cycles idle", 0);
}

static void print_stalled_cycles_backend(int cpu,
					 struct perf_evsel *evsel, double avg,
					 struct perf_stat_output_ctx *out)
{
	double total, ratio = 0.0;
	const char *color;
	int ctx = evsel_context(evsel);

	total = avg_stats(&runtime_cycles_stats[ctx][cpu]);

	if (total)
		ratio = avg / total * 100.0;

	color = get_ratio_color(GRC_STALLED_CYCLES_BE, ratio);

	out->print_metric(out->ctx, color, "%7.2f%%", "backend cycles idle", ratio);
}

static void print_branch_misses(int cpu,
				struct perf_evsel *evsel,
				double avg,
				struct perf_stat_output_ctx *out)
{
	double total, ratio = 0.0;
	const char *color;
	int ctx = evsel_context(evsel);

	total = avg_stats(&runtime_branches_stats[ctx][cpu]);

	if (total)
		ratio = avg / total * 100.0;

	color = get_ratio_color(GRC_CACHE_MISSES, ratio);

	out->print_metric(out->ctx, color, "%7.2f%%", "of all branches", ratio);
}

static void print_l1_dcache_misses(int cpu,
				   struct perf_evsel *evsel,
				   double avg,
				   struct perf_stat_output_ctx *out)
{
	double total, ratio = 0.0;
	const char *color;
	int ctx = evsel_context(evsel);

	total = avg_stats(&runtime_l1_dcache_stats[ctx][cpu]);

	if (total)
		ratio = avg / total * 100.0;

	color = get_ratio_color(GRC_CACHE_MISSES, ratio);

	out->print_metric(out->ctx, color, "%7.2f%%", "of all L1-dcache hits", ratio);
}

static void print_l1_icache_misses(int cpu,
				   struct perf_evsel *evsel,
				   double avg,
				   struct perf_stat_output_ctx *out)
{
	double total, ratio = 0.0;
	const char *color;
	int ctx = evsel_context(evsel);

	total = avg_stats(&runtime_l1_icache_stats[ctx][cpu]);

	if (total)
		ratio = avg / total * 100.0;

	color = get_ratio_color(GRC_CACHE_MISSES, ratio);
	out->print_metric(out->ctx, color, "%7.2f%%", "of all L1-icache hits", ratio);
}

static void print_dtlb_cache_misses(int cpu,
				    struct perf_evsel *evsel,
				    double avg,
				    struct perf_stat_output_ctx *out)
{
	double total, ratio = 0.0;
	const char *color;
	int ctx = evsel_context(evsel);

	total = avg_stats(&runtime_dtlb_cache_stats[ctx][cpu]);

	if (total)
		ratio = avg / total * 100.0;

	color = get_ratio_color(GRC_CACHE_MISSES, ratio);
	out->print_metric(out->ctx, color, "%7.2f%%", "of all dTLB cache hits", ratio);
}

static void print_itlb_cache_misses(int cpu,
				    struct perf_evsel *evsel,
				    double avg,
				    struct perf_stat_output_ctx *out)
{
	double total, ratio = 0.0;
	const char *color;
	int ctx = evsel_context(evsel);

	total = avg_stats(&runtime_itlb_cache_stats[ctx][cpu]);

	if (total)
		ratio = avg / total * 100.0;

	color = get_ratio_color(GRC_CACHE_MISSES, ratio);
	out->print_metric(out->ctx, color, "%7.2f%%", "of all iTLB cache hits", ratio);
}

static void print_ll_cache_misses(int cpu,
				  struct perf_evsel *evsel,
				  double avg,
				  struct perf_stat_output_ctx *out)
{
	double total, ratio = 0.0;
	const char *color;
	int ctx = evsel_context(evsel);

	total = avg_stats(&runtime_ll_cache_stats[ctx][cpu]);

	if (total)
		ratio = avg / total * 100.0;

	color = get_ratio_color(GRC_CACHE_MISSES, ratio);
	out->print_metric(out->ctx, color, "%7.2f%%", "of all LL-cache hits", ratio);
}

/*
 * High level "TopDown" CPU core pipeline bottleneck breakdown.
 *
 * Basic concept following
 * Yasin, A Top Down Method for Performance analysis and Counter architecture
 * ISPASS14
 *
 * The CPU pipeline is divided into 4 areas that can be bottlenecks:
 *
 * Frontend -> Backend -> Retiring
 * BadSpeculation in addition means out of order execution that is thrown away
 * (for example branch mispredictions)
 * Frontend is instruction decoding.
 * Backend is execution, like computation and accessing data in memory
 * Retiring is good execution that is not directly bottlenecked
 *
 * The formulas are computed in slots.
 * A slot is an entry in the pipeline, one per unit of the pipeline width
 * (for example a 4-wide pipeline has 4 slots for each cycle)
 *
 * Formulas:
 * BadSpeculation = ((SlotsIssued - SlotsRetired) + RecoveryBubbles) /
 *			TotalSlots
 * Retiring = SlotsRetired / TotalSlots
 * FrontendBound = FetchBubbles / TotalSlots
 * BackendBound = 1.0 - BadSpeculation - Retiring - FrontendBound
 *
 * The kernel provides the mapping to the low level CPU events and any scaling
 * needed for the CPU pipeline width, for example:
 *
 * TotalSlots = Cycles * 4
 *
 * The scaling factor is communicated in the sysfs unit.
 *
 * In some cases the CPU may not be able to measure all the formulas due to
 * missing events. In that case multiple formulas are combined where possible.
 *
 * Full TopDown supports more levels to sub-divide each area: for example
 * BackendBound into computing bound and memory bound. For now we only
 * support Level 1 TopDown.
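 *
 * Worked example with illustrative numbers (a 4-wide core, 1000 cycles):
 *
 *   TotalSlots      = 1000 * 4 = 4000
 *   SlotsIssued     = 2600, SlotsRetired = 2000
 *   RecoveryBubbles = 400,  FetchBubbles = 800
 *
 *   BadSpeculation  = ((2600 - 2000) + 400) / 4000 = 0.25
 *   Retiring        = 2000 / 4000                  = 0.50
 *   FrontendBound   = 800 / 4000                   = 0.20
 *   BackendBound    = 1.0 - 0.25 - 0.50 - 0.20     = 0.05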
 */

static double sanitize_val(double x)
{
	if (x < 0 && x >= -0.02)
		return 0.0;
	return x;
}

static double td_total_slots(int ctx, int cpu)
{
	return avg_stats(&runtime_topdown_total_slots[ctx][cpu]);
}

static double td_bad_spec(int ctx, int cpu)
{
	double bad_spec = 0;
	double total_slots;
	double total;

	total = avg_stats(&runtime_topdown_slots_issued[ctx][cpu]) -
		avg_stats(&runtime_topdown_slots_retired[ctx][cpu]) +
		avg_stats(&runtime_topdown_recovery_bubbles[ctx][cpu]);
	total_slots = td_total_slots(ctx, cpu);
	if (total_slots)
		bad_spec = total / total_slots;
	return sanitize_val(bad_spec);
}

static double td_retiring(int ctx, int cpu)
{
	double retiring = 0;
	double total_slots = td_total_slots(ctx, cpu);
	double ret_slots = avg_stats(&runtime_topdown_slots_retired[ctx][cpu]);

	if (total_slots)
		retiring = ret_slots / total_slots;
	return retiring;
}

static double td_fe_bound(int ctx, int cpu)
{
	double fe_bound = 0;
	double total_slots = td_total_slots(ctx, cpu);
	double fetch_bub = avg_stats(&runtime_topdown_fetch_bubbles[ctx][cpu]);

	if (total_slots)
		fe_bound = fetch_bub / total_slots;
	return fe_bound;
}

static double td_be_bound(int ctx, int cpu)
{
	double sum = (td_fe_bound(ctx, cpu) +
		      td_bad_spec(ctx, cpu) +
		      td_retiring(ctx, cpu));
	if (sum == 0)
		return 0;
	return sanitize_val(1.0 - sum);
}

static void print_smi_cost(int cpu, struct perf_evsel *evsel,
			   struct perf_stat_output_ctx *out)
{
	double smi_num, aperf, cycles, cost = 0.0;
	int ctx = evsel_context(evsel);
	const char *color = NULL;

	smi_num = avg_stats(&runtime_smi_num_stats[ctx][cpu]);
	aperf = avg_stats(&runtime_aperf_stats[ctx][cpu]);
	cycles = avg_stats(&runtime_cycles_stats[ctx][cpu]);

	if ((cycles == 0) || (aperf == 0))
		return;

	if (smi_num)
		cost = (aperf - cycles) / aperf * 100.00;

	if (cost > 10)
		color = PERF_COLOR_RED;
	out->print_metric(out->ctx, color, "%8.1f%%", "SMI cycles%", cost);
	out->print_metric(out->ctx, NULL, "%4.0f", "SMI#", smi_num);
}
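
/*
 * Evaluate a MetricExpr for one event: seed the expression parser with the
 * current event's average and with the saved averages of every referenced
 * event (collected above), then print the resulting value.  "duration_time"
 * is special-cased to wall clock time, scaled to seconds.
 */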
static void generic_metric(const char *metric_expr,
			   struct perf_evsel **metric_events,
			   char *name,
			   const char *metric_name,
			   double avg,
			   int cpu,
			   struct perf_stat_output_ctx *out)
{
	print_metric_t print_metric = out->print_metric;
	struct parse_ctx pctx;
	double ratio;
	int i;
	void *ctxp = out->ctx;

	expr__ctx_init(&pctx);
	expr__add_id(&pctx, name, avg);
	for (i = 0; metric_events[i]; i++) {
		struct saved_value *v;
		struct stats *stats;
		double scale;

		if (!strcmp(metric_events[i]->name, "duration_time")) {
			stats = &walltime_nsecs_stats;
			scale = 1e-9;
		} else {
			v = saved_value_lookup(metric_events[i], cpu, false);
			if (!v)
				break;
			stats = &v->stats;
			scale = 1.0;
		}
		expr__add_id(&pctx, metric_events[i]->name, avg_stats(stats)*scale);
	}
	if (!metric_events[i]) {
		const char *p = metric_expr;

		if (expr__parse(&ratio, &pctx, &p) == 0)
			print_metric(ctxp, NULL, "%8.1f",
				metric_name ?
				metric_name :
				out->force_header ?  name : "",
				ratio);
		else
			print_metric(ctxp, NULL, NULL,
				     out->force_header ?
				     (metric_name ? metric_name : name) : "", 0);
	} else
		print_metric(ctxp, NULL, NULL, "", 0);
}

void perf_stat__print_shadow_stats(struct perf_evsel *evsel,
				   double avg, int cpu,
				   struct perf_stat_output_ctx *out,
				   struct rblist *metric_events)
{
	void *ctxp = out->ctx;
	print_metric_t print_metric = out->print_metric;
	double total, ratio = 0.0, total2;
	const char *color = NULL;
	int ctx = evsel_context(evsel);
	struct metric_event *me;
	int num = 1;

	if (perf_evsel__match(evsel, HARDWARE, HW_INSTRUCTIONS)) {
		total = avg_stats(&runtime_cycles_stats[ctx][cpu]);
		if (total) {
			ratio = avg / total;
			print_metric(ctxp, NULL, "%7.2f ",
					"insn per cycle", ratio);
		} else {
			print_metric(ctxp, NULL, NULL, "insn per cycle", 0);
		}
		total = avg_stats(&runtime_stalled_cycles_front_stats[ctx][cpu]);
		total = max(total, avg_stats(&runtime_stalled_cycles_back_stats[ctx][cpu]));

		if (total && avg) {
			out->new_line(ctxp);
			ratio = total / avg;
			print_metric(ctxp, NULL, "%7.2f ",
					"stalled cycles per insn",
					ratio);
		} else if (have_frontend_stalled) {
			print_metric(ctxp, NULL, NULL,
				     "stalled cycles per insn", 0);
		}
	} else if (perf_evsel__match(evsel, HARDWARE, HW_BRANCH_MISSES)) {
		if (runtime_branches_stats[ctx][cpu].n != 0)
			print_branch_misses(cpu, evsel, avg, out);
		else
			print_metric(ctxp, NULL, NULL, "of all branches", 0);
	} else if (
		evsel->attr.type == PERF_TYPE_HW_CACHE &&
		evsel->attr.config ==  ( PERF_COUNT_HW_CACHE_L1D |
					((PERF_COUNT_HW_CACHE_OP_READ) << 8) |
					((PERF_COUNT_HW_CACHE_RESULT_MISS) << 16))) {
		if (runtime_l1_dcache_stats[ctx][cpu].n != 0)
			print_l1_dcache_misses(cpu, evsel, avg, out);
		else
			print_metric(ctxp, NULL, NULL, "of all L1-dcache hits", 0);
	} else if (
		evsel->attr.type == PERF_TYPE_HW_CACHE &&
		evsel->attr.config ==  ( PERF_COUNT_HW_CACHE_L1I |
					((PERF_COUNT_HW_CACHE_OP_READ) << 8) |
					((PERF_COUNT_HW_CACHE_RESULT_MISS) << 16))) {
		if (runtime_l1_icache_stats[ctx][cpu].n != 0)
			print_l1_icache_misses(cpu, evsel, avg, out);
		else
			print_metric(ctxp, NULL, NULL, "of all L1-icache hits", 0);
	} else if (
		evsel->attr.type == PERF_TYPE_HW_CACHE &&
		evsel->attr.config ==  ( PERF_COUNT_HW_CACHE_DTLB |
					((PERF_COUNT_HW_CACHE_OP_READ) << 8) |
					((PERF_COUNT_HW_CACHE_RESULT_MISS) << 16))) {
		if (runtime_dtlb_cache_stats[ctx][cpu].n != 0)
			print_dtlb_cache_misses(cpu, evsel, avg, out);
		else
			print_metric(ctxp, NULL, NULL, "of all dTLB cache hits", 0);
	} else if (
		evsel->attr.type == PERF_TYPE_HW_CACHE &&
		evsel->attr.config ==  ( PERF_COUNT_HW_CACHE_ITLB |
					((PERF_COUNT_HW_CACHE_OP_READ) << 8) |
					((PERF_COUNT_HW_CACHE_RESULT_MISS) << 16))) {
		if (runtime_itlb_cache_stats[ctx][cpu].n != 0)
			print_itlb_cache_misses(cpu, evsel, avg, out);
		else
			print_metric(ctxp, NULL, NULL, "of all iTLB cache hits", 0);
	} else if (
		evsel->attr.type == PERF_TYPE_HW_CACHE &&
		evsel->attr.config ==  ( PERF_COUNT_HW_CACHE_LL |
					((PERF_COUNT_HW_CACHE_OP_READ) << 8) |
					((PERF_COUNT_HW_CACHE_RESULT_MISS) << 16))) {
		if (runtime_ll_cache_stats[ctx][cpu].n != 0)
			print_ll_cache_misses(cpu, evsel, avg, out);
		else
			print_metric(ctxp, NULL, NULL, "of all LL-cache hits", 0);
	} else if (perf_evsel__match(evsel, HARDWARE, HW_CACHE_MISSES)) {
		total = avg_stats(&runtime_cacherefs_stats[ctx][cpu]);

		if (total)
			ratio = avg * 100 / total;

		if (runtime_cacherefs_stats[ctx][cpu].n != 0)
			print_metric(ctxp, NULL, "%8.3f %%",
				     "of all cache refs", ratio);
		else
			print_metric(ctxp, NULL, NULL, "of all cache refs", 0);
	} else if (perf_evsel__match(evsel, HARDWARE, HW_STALLED_CYCLES_FRONTEND)) {
		print_stalled_cycles_frontend(cpu, evsel, avg, out);
	} else if (perf_evsel__match(evsel, HARDWARE, HW_STALLED_CYCLES_BACKEND)) {
		print_stalled_cycles_backend(cpu, evsel, avg, out);
	} else if (perf_evsel__match(evsel, HARDWARE, HW_CPU_CYCLES)) {
		total = avg_stats(&runtime_nsecs_stats[cpu]);

		if (total) {
			ratio = avg / total;
			print_metric(ctxp, NULL, "%8.3f", "GHz", ratio);
		} else {
			print_metric(ctxp, NULL, NULL, "GHz", 0);
		}
	} else if (perf_stat_evsel__is(evsel, CYCLES_IN_TX)) {
		total = avg_stats(&runtime_cycles_stats[ctx][cpu]);
		if (total)
			print_metric(ctxp, NULL,
				"%7.2f%%", "transactional cycles",
				100.0 * (avg / total));
		else
			print_metric(ctxp, NULL, NULL, "transactional cycles",
				     0);
	} else if (perf_stat_evsel__is(evsel, CYCLES_IN_TX_CP)) {
		total = avg_stats(&runtime_cycles_stats[ctx][cpu]);
		total2 = avg_stats(&runtime_cycles_in_tx_stats[ctx][cpu]);
		if (total2 < avg)
			total2 = avg;
		if (total)
			print_metric(ctxp, NULL, "%7.2f%%", "aborted cycles",
				100.0 * ((total2-avg) / total));
		else
			print_metric(ctxp, NULL, NULL, "aborted cycles", 0);
	} else if (perf_stat_evsel__is(evsel, TRANSACTION_START)) {
		total = avg_stats(&runtime_cycles_in_tx_stats[ctx][cpu]);

		if (avg)
			ratio = total / avg;

		if (runtime_cycles_in_tx_stats[ctx][cpu].n != 0)
			print_metric(ctxp, NULL, "%8.0f",
				     "cycles / transaction", ratio);
		else
			print_metric(ctxp, NULL, NULL, "cycles / transaction",
				      0);
	} else if (perf_stat_evsel__is(evsel, ELISION_START)) {
		total = avg_stats(&runtime_cycles_in_tx_stats[ctx][cpu]);

		if (avg)
			ratio = total / avg;

		print_metric(ctxp, NULL, "%8.0f", "cycles / elision", ratio);
	} else if (perf_evsel__match(evsel, SOFTWARE, SW_TASK_CLOCK) ||
		   perf_evsel__match(evsel, SOFTWARE, SW_CPU_CLOCK)) {
		if ((ratio = avg_stats(&walltime_nsecs_stats)) != 0)
			print_metric(ctxp, NULL, "%8.3f", "CPUs utilized",
				     avg / ratio);
		else
			print_metric(ctxp, NULL, NULL, "CPUs utilized", 0);
	} else if (perf_stat_evsel__is(evsel, TOPDOWN_FETCH_BUBBLES)) {
		double fe_bound = td_fe_bound(ctx, cpu);

		if (fe_bound > 0.2)
			color = PERF_COLOR_RED;
		print_metric(ctxp, color, "%8.1f%%", "frontend bound",
				fe_bound * 100.);
	} else if (perf_stat_evsel__is(evsel, TOPDOWN_SLOTS_RETIRED)) {
		double retiring = td_retiring(ctx, cpu);

		if (retiring > 0.7)
			color = PERF_COLOR_GREEN;
		print_metric(ctxp, color, "%8.1f%%", "retiring",
				retiring * 100.);
	} else if (perf_stat_evsel__is(evsel, TOPDOWN_RECOVERY_BUBBLES)) {
		double bad_spec = td_bad_spec(ctx, cpu);

		if (bad_spec > 0.1)
			color = PERF_COLOR_RED;
		print_metric(ctxp, color, "%8.1f%%", "bad speculation",
				bad_spec * 100.);
	} else if (perf_stat_evsel__is(evsel, TOPDOWN_SLOTS_ISSUED)) {
		double be_bound = td_be_bound(ctx, cpu);
		const char *name = "backend bound";
		static int have_recovery_bubbles = -1;

		/* In case the CPU does not support topdown-recovery-bubbles */
		if (have_recovery_bubbles < 0)
			have_recovery_bubbles = pmu_have_event("cpu",
					"topdown-recovery-bubbles");
		if (!have_recovery_bubbles)
			name = "backend bound/bad spec";

		if (be_bound > 0.2)
			color = PERF_COLOR_RED;
		if (td_total_slots(ctx, cpu) > 0)
			print_metric(ctxp, color, "%8.1f%%", name,
					be_bound * 100.);
		else
			print_metric(ctxp, NULL, NULL, name, 0);
	} else if (evsel->metric_expr) {
		generic_metric(evsel->metric_expr, evsel->metric_events, evsel->name,
				evsel->metric_name, avg, cpu, out);
	} else if (runtime_nsecs_stats[cpu].n != 0) {
		char unit = 'M';
		char unit_buf[10];

		total = avg_stats(&runtime_nsecs_stats[cpu]);

		if (total)
			ratio = 1000.0 * avg / total;
		if (ratio < 0.001) {
			ratio *= 1000;
			unit = 'K';
		}
		snprintf(unit_buf, sizeof(unit_buf), "%c/sec", unit);
		print_metric(ctxp, NULL, "%8.3f", unit_buf, ratio);
	} else if (perf_stat_evsel__is(evsel, SMI_NUM)) {
		print_smi_cost(cpu, evsel, out);
	} else {
		num = 0;
	}

	if ((me = metricgroup__lookup(metric_events, evsel, false)) != NULL) {
		struct metric_expr *mexp;

		list_for_each_entry (mexp, &me->head, nd) {
			if (num++ > 0)
				out->new_line(ctxp);
			generic_metric(mexp->metric_expr, mexp->metric_events,
					evsel->name, mexp->metric_name,
					avg, cpu, out);
		}
	}
	if (num == 0)
		print_metric(ctxp, NULL, NULL, NULL, 0);
}