1 /* 2 * builtin-stat.c 3 * 4 * Builtin stat command: Give a precise performance counters summary 5 * overview about any workload, CPU or specific PID. 6 * 7 * Sample output: 8 9 $ perf stat ./hackbench 10 10 11 Time: 0.118 12 13 Performance counter stats for './hackbench 10': 14 15 1708.761321 task-clock # 11.037 CPUs utilized 16 41,190 context-switches # 0.024 M/sec 17 6,735 CPU-migrations # 0.004 M/sec 18 17,318 page-faults # 0.010 M/sec 19 5,205,202,243 cycles # 3.046 GHz 20 3,856,436,920 stalled-cycles-frontend # 74.09% frontend cycles idle 21 1,600,790,871 stalled-cycles-backend # 30.75% backend cycles idle 22 2,603,501,247 instructions # 0.50 insns per cycle 23 # 1.48 stalled cycles per insn 24 484,357,498 branches # 283.455 M/sec 25 6,388,934 branch-misses # 1.32% of all branches 26 27 0.154822978 seconds time elapsed 28 29 * 30 * Copyright (C) 2008-2011, Red Hat Inc, Ingo Molnar <mingo@redhat.com> 31 * 32 * Improvements and fixes by: 33 * 34 * Arjan van de Ven <arjan@linux.intel.com> 35 * Yanmin Zhang <yanmin.zhang@intel.com> 36 * Wu Fengguang <fengguang.wu@intel.com> 37 * Mike Galbraith <efault@gmx.de> 38 * Paul Mackerras <paulus@samba.org> 39 * Jaswinder Singh Rajput <jaswinder@kernel.org> 40 * 41 * Released under the GPL v2. (and only v2, not any later version) 42 */ 43 44 #include "perf.h" 45 #include "builtin.h" 46 #include "util/util.h" 47 #include "util/parse-options.h" 48 #include "util/parse-events.h" 49 #include "util/event.h" 50 #include "util/evlist.h" 51 #include "util/evsel.h" 52 #include "util/debug.h" 53 #include "util/color.h" 54 #include "util/stat.h" 55 #include "util/header.h" 56 #include "util/cpumap.h" 57 #include "util/thread.h" 58 #include "util/thread_map.h" 59 60 #include <sys/prctl.h> 61 #include <locale.h> 62 63 #define DEFAULT_SEPARATOR " " 64 #define CNTR_NOT_SUPPORTED "<not supported>" 65 #define CNTR_NOT_COUNTED "<not counted>" 66 67 static struct perf_evlist *evsel_list; 68 69 static struct perf_target target = { 70 .uid = UINT_MAX, 71 }; 72 73 static int run_count = 1; 74 static bool no_inherit = false; 75 static bool scale = true; 76 static bool no_aggr = false; 77 static pid_t child_pid = -1; 78 static bool null_run = false; 79 static int detailed_run = 0; 80 static bool big_num = true; 81 static int big_num_opt = -1; 82 static const char *csv_sep = NULL; 83 static bool csv_output = false; 84 static bool group = false; 85 static FILE *output = NULL; 86 87 static volatile int done = 0; 88 89 struct perf_stat { 90 struct stats res_stats[3]; 91 }; 92 93 static int perf_evsel__alloc_stat_priv(struct perf_evsel *evsel) 94 { 95 evsel->priv = zalloc(sizeof(struct perf_stat)); 96 return evsel->priv == NULL ? -ENOMEM : 0; 97 } 98 99 static void perf_evsel__free_stat_priv(struct perf_evsel *evsel) 100 { 101 free(evsel->priv); 102 evsel->priv = NULL; 103 } 104 105 static inline struct cpu_map *perf_evsel__cpus(struct perf_evsel *evsel) 106 { 107 return (evsel->cpus && !target.cpu_list) ? evsel->cpus : evsel_list->cpus; 108 } 109 110 static inline int perf_evsel__nr_cpus(struct perf_evsel *evsel) 111 { 112 return perf_evsel__cpus(evsel)->nr; 113 } 114 115 static struct stats runtime_nsecs_stats[MAX_NR_CPUS]; 116 static struct stats runtime_cycles_stats[MAX_NR_CPUS]; 117 static struct stats runtime_stalled_cycles_front_stats[MAX_NR_CPUS]; 118 static struct stats runtime_stalled_cycles_back_stats[MAX_NR_CPUS]; 119 static struct stats runtime_branches_stats[MAX_NR_CPUS]; 120 static struct stats runtime_cacherefs_stats[MAX_NR_CPUS]; 121 static struct stats runtime_l1_dcache_stats[MAX_NR_CPUS]; 122 static struct stats runtime_l1_icache_stats[MAX_NR_CPUS]; 123 static struct stats runtime_ll_cache_stats[MAX_NR_CPUS]; 124 static struct stats runtime_itlb_cache_stats[MAX_NR_CPUS]; 125 static struct stats runtime_dtlb_cache_stats[MAX_NR_CPUS]; 126 static struct stats walltime_nsecs_stats; 127 128 static int create_perf_stat_counter(struct perf_evsel *evsel, 129 struct perf_evsel *first) 130 { 131 struct perf_event_attr *attr = &evsel->attr; 132 bool exclude_guest_missing = false; 133 int ret; 134 135 if (scale) 136 attr->read_format = PERF_FORMAT_TOTAL_TIME_ENABLED | 137 PERF_FORMAT_TOTAL_TIME_RUNNING; 138 139 attr->inherit = !no_inherit; 140 141 retry: 142 if (exclude_guest_missing) 143 evsel->attr.exclude_guest = evsel->attr.exclude_host = 0; 144 145 if (perf_target__has_cpu(&target)) { 146 ret = perf_evsel__open_per_cpu(evsel, perf_evsel__cpus(evsel)); 147 if (ret) 148 goto check_ret; 149 return 0; 150 } 151 152 if (!perf_target__has_task(&target) && (!group || evsel == first)) { 153 attr->disabled = 1; 154 attr->enable_on_exec = 1; 155 } 156 157 ret = perf_evsel__open_per_thread(evsel, evsel_list->threads); 158 if (!ret) 159 return 0; 160 /* fall through */ 161 check_ret: 162 if (ret && errno == EINVAL) { 163 if (!exclude_guest_missing && 164 (evsel->attr.exclude_guest || evsel->attr.exclude_host)) { 165 pr_debug("Old kernel, cannot exclude " 166 "guest or host samples.\n"); 167 exclude_guest_missing = true; 168 goto retry; 169 } 170 } 171 return ret; 172 } 173 174 /* 175 * Does the counter have nsecs as a unit? 176 */ 177 static inline int nsec_counter(struct perf_evsel *evsel) 178 { 179 if (perf_evsel__match(evsel, SOFTWARE, SW_CPU_CLOCK) || 180 perf_evsel__match(evsel, SOFTWARE, SW_TASK_CLOCK)) 181 return 1; 182 183 return 0; 184 } 185 186 /* 187 * Update various tracking values we maintain to print 188 * more semantic information such as miss/hit ratios, 189 * instruction rates, etc: 190 */ 191 static void update_shadow_stats(struct perf_evsel *counter, u64 *count) 192 { 193 if (perf_evsel__match(counter, SOFTWARE, SW_TASK_CLOCK)) 194 update_stats(&runtime_nsecs_stats[0], count[0]); 195 else if (perf_evsel__match(counter, HARDWARE, HW_CPU_CYCLES)) 196 update_stats(&runtime_cycles_stats[0], count[0]); 197 else if (perf_evsel__match(counter, HARDWARE, HW_STALLED_CYCLES_FRONTEND)) 198 update_stats(&runtime_stalled_cycles_front_stats[0], count[0]); 199 else if (perf_evsel__match(counter, HARDWARE, HW_STALLED_CYCLES_BACKEND)) 200 update_stats(&runtime_stalled_cycles_back_stats[0], count[0]); 201 else if (perf_evsel__match(counter, HARDWARE, HW_BRANCH_INSTRUCTIONS)) 202 update_stats(&runtime_branches_stats[0], count[0]); 203 else if (perf_evsel__match(counter, HARDWARE, HW_CACHE_REFERENCES)) 204 update_stats(&runtime_cacherefs_stats[0], count[0]); 205 else if (perf_evsel__match(counter, HW_CACHE, HW_CACHE_L1D)) 206 update_stats(&runtime_l1_dcache_stats[0], count[0]); 207 else if (perf_evsel__match(counter, HW_CACHE, HW_CACHE_L1I)) 208 update_stats(&runtime_l1_icache_stats[0], count[0]); 209 else if (perf_evsel__match(counter, HW_CACHE, HW_CACHE_LL)) 210 update_stats(&runtime_ll_cache_stats[0], count[0]); 211 else if (perf_evsel__match(counter, HW_CACHE, HW_CACHE_DTLB)) 212 update_stats(&runtime_dtlb_cache_stats[0], count[0]); 213 else if (perf_evsel__match(counter, HW_CACHE, HW_CACHE_ITLB)) 214 update_stats(&runtime_itlb_cache_stats[0], count[0]); 215 } 216 217 /* 218 * Read out the results of a single counter: 219 * aggregate counts across CPUs in system-wide mode 220 */ 221 static int read_counter_aggr(struct perf_evsel *counter) 222 { 223 struct perf_stat *ps = counter->priv; 224 u64 *count = counter->counts->aggr.values; 225 int i; 226 227 if (__perf_evsel__read(counter, perf_evsel__nr_cpus(counter), 228 evsel_list->threads->nr, scale) < 0) 229 return -1; 230 231 for (i = 0; i < 3; i++) 232 update_stats(&ps->res_stats[i], count[i]); 233 234 if (verbose) { 235 fprintf(output, "%s: %" PRIu64 " %" PRIu64 " %" PRIu64 "\n", 236 perf_evsel__name(counter), count[0], count[1], count[2]); 237 } 238 239 /* 240 * Save the full runtime - to allow normalization during printout: 241 */ 242 update_shadow_stats(counter, count); 243 244 return 0; 245 } 246 247 /* 248 * Read out the results of a single counter: 249 * do not aggregate counts across CPUs in system-wide mode 250 */ 251 static int read_counter(struct perf_evsel *counter) 252 { 253 u64 *count; 254 int cpu; 255 256 for (cpu = 0; cpu < perf_evsel__nr_cpus(counter); cpu++) { 257 if (__perf_evsel__read_on_cpu(counter, cpu, 0, scale) < 0) 258 return -1; 259 260 count = counter->counts->cpu[cpu].values; 261 262 update_shadow_stats(counter, count); 263 } 264 265 return 0; 266 } 267 268 static int run_perf_stat(int argc __maybe_unused, const char **argv) 269 { 270 unsigned long long t0, t1; 271 struct perf_evsel *counter, *first; 272 int status = 0; 273 int child_ready_pipe[2], go_pipe[2]; 274 const bool forks = (argc > 0); 275 char buf; 276 277 if (forks && (pipe(child_ready_pipe) < 0 || pipe(go_pipe) < 0)) { 278 perror("failed to create pipes"); 279 return -1; 280 } 281 282 if (forks) { 283 if ((child_pid = fork()) < 0) 284 perror("failed to fork"); 285 286 if (!child_pid) { 287 close(child_ready_pipe[0]); 288 close(go_pipe[1]); 289 fcntl(go_pipe[0], F_SETFD, FD_CLOEXEC); 290 291 /* 292 * Do a dummy execvp to get the PLT entry resolved, 293 * so we avoid the resolver overhead on the real 294 * execvp call. 295 */ 296 execvp("", (char **)argv); 297 298 /* 299 * Tell the parent we're ready to go 300 */ 301 close(child_ready_pipe[1]); 302 303 /* 304 * Wait until the parent tells us to go. 305 */ 306 if (read(go_pipe[0], &buf, 1) == -1) 307 perror("unable to read pipe"); 308 309 execvp(argv[0], (char **)argv); 310 311 perror(argv[0]); 312 exit(-1); 313 } 314 315 if (perf_target__none(&target)) 316 evsel_list->threads->map[0] = child_pid; 317 318 /* 319 * Wait for the child to be ready to exec. 320 */ 321 close(child_ready_pipe[1]); 322 close(go_pipe[0]); 323 if (read(child_ready_pipe[0], &buf, 1) == -1) 324 perror("unable to read pipe"); 325 close(child_ready_pipe[0]); 326 } 327 328 if (group) 329 perf_evlist__set_leader(evsel_list); 330 331 first = perf_evlist__first(evsel_list); 332 333 list_for_each_entry(counter, &evsel_list->entries, node) { 334 if (create_perf_stat_counter(counter, first) < 0) { 335 /* 336 * PPC returns ENXIO for HW counters until 2.6.37 337 * (behavior changed with commit b0a873e). 338 */ 339 if (errno == EINVAL || errno == ENOSYS || 340 errno == ENOENT || errno == EOPNOTSUPP || 341 errno == ENXIO) { 342 if (verbose) 343 ui__warning("%s event is not supported by the kernel.\n", 344 perf_evsel__name(counter)); 345 counter->supported = false; 346 continue; 347 } 348 349 if (errno == EPERM || errno == EACCES) { 350 error("You may not have permission to collect %sstats.\n" 351 "\t Consider tweaking" 352 " /proc/sys/kernel/perf_event_paranoid or running as root.", 353 target.system_wide ? "system-wide " : ""); 354 } else { 355 error("open_counter returned with %d (%s). " 356 "/bin/dmesg may provide additional information.\n", 357 errno, strerror(errno)); 358 } 359 if (child_pid != -1) 360 kill(child_pid, SIGTERM); 361 362 pr_err("Not all events could be opened.\n"); 363 return -1; 364 } 365 counter->supported = true; 366 } 367 368 if (perf_evlist__apply_filters(evsel_list)) { 369 error("failed to set filter with %d (%s)\n", errno, 370 strerror(errno)); 371 return -1; 372 } 373 374 /* 375 * Enable counters and exec the command: 376 */ 377 t0 = rdclock(); 378 379 if (forks) { 380 close(go_pipe[1]); 381 wait(&status); 382 if (WIFSIGNALED(status)) 383 psignal(WTERMSIG(status), argv[0]); 384 } else { 385 while(!done) sleep(1); 386 } 387 388 t1 = rdclock(); 389 390 update_stats(&walltime_nsecs_stats, t1 - t0); 391 392 if (no_aggr) { 393 list_for_each_entry(counter, &evsel_list->entries, node) { 394 read_counter(counter); 395 perf_evsel__close_fd(counter, perf_evsel__nr_cpus(counter), 1); 396 } 397 } else { 398 list_for_each_entry(counter, &evsel_list->entries, node) { 399 read_counter_aggr(counter); 400 perf_evsel__close_fd(counter, perf_evsel__nr_cpus(counter), 401 evsel_list->threads->nr); 402 } 403 } 404 405 return WEXITSTATUS(status); 406 } 407 408 static void print_noise_pct(double total, double avg) 409 { 410 double pct = rel_stddev_stats(total, avg); 411 412 if (csv_output) 413 fprintf(output, "%s%.2f%%", csv_sep, pct); 414 else if (pct) 415 fprintf(output, " ( +-%6.2f%% )", pct); 416 } 417 418 static void print_noise(struct perf_evsel *evsel, double avg) 419 { 420 struct perf_stat *ps; 421 422 if (run_count == 1) 423 return; 424 425 ps = evsel->priv; 426 print_noise_pct(stddev_stats(&ps->res_stats[0]), avg); 427 } 428 429 static void nsec_printout(int cpu, struct perf_evsel *evsel, double avg) 430 { 431 double msecs = avg / 1e6; 432 char cpustr[16] = { '\0', }; 433 const char *fmt = csv_output ? "%s%.6f%s%s" : "%s%18.6f%s%-25s"; 434 435 if (no_aggr) 436 sprintf(cpustr, "CPU%*d%s", 437 csv_output ? 0 : -4, 438 perf_evsel__cpus(evsel)->map[cpu], csv_sep); 439 440 fprintf(output, fmt, cpustr, msecs, csv_sep, perf_evsel__name(evsel)); 441 442 if (evsel->cgrp) 443 fprintf(output, "%s%s", csv_sep, evsel->cgrp->name); 444 445 if (csv_output) 446 return; 447 448 if (perf_evsel__match(evsel, SOFTWARE, SW_TASK_CLOCK)) 449 fprintf(output, " # %8.3f CPUs utilized ", 450 avg / avg_stats(&walltime_nsecs_stats)); 451 else 452 fprintf(output, " "); 453 } 454 455 /* used for get_ratio_color() */ 456 enum grc_type { 457 GRC_STALLED_CYCLES_FE, 458 GRC_STALLED_CYCLES_BE, 459 GRC_CACHE_MISSES, 460 GRC_MAX_NR 461 }; 462 463 static const char *get_ratio_color(enum grc_type type, double ratio) 464 { 465 static const double grc_table[GRC_MAX_NR][3] = { 466 [GRC_STALLED_CYCLES_FE] = { 50.0, 30.0, 10.0 }, 467 [GRC_STALLED_CYCLES_BE] = { 75.0, 50.0, 20.0 }, 468 [GRC_CACHE_MISSES] = { 20.0, 10.0, 5.0 }, 469 }; 470 const char *color = PERF_COLOR_NORMAL; 471 472 if (ratio > grc_table[type][0]) 473 color = PERF_COLOR_RED; 474 else if (ratio > grc_table[type][1]) 475 color = PERF_COLOR_MAGENTA; 476 else if (ratio > grc_table[type][2]) 477 color = PERF_COLOR_YELLOW; 478 479 return color; 480 } 481 482 static void print_stalled_cycles_frontend(int cpu, 483 struct perf_evsel *evsel 484 __maybe_unused, double avg) 485 { 486 double total, ratio = 0.0; 487 const char *color; 488 489 total = avg_stats(&runtime_cycles_stats[cpu]); 490 491 if (total) 492 ratio = avg / total * 100.0; 493 494 color = get_ratio_color(GRC_STALLED_CYCLES_FE, ratio); 495 496 fprintf(output, " # "); 497 color_fprintf(output, color, "%6.2f%%", ratio); 498 fprintf(output, " frontend cycles idle "); 499 } 500 501 static void print_stalled_cycles_backend(int cpu, 502 struct perf_evsel *evsel 503 __maybe_unused, double avg) 504 { 505 double total, ratio = 0.0; 506 const char *color; 507 508 total = avg_stats(&runtime_cycles_stats[cpu]); 509 510 if (total) 511 ratio = avg / total * 100.0; 512 513 color = get_ratio_color(GRC_STALLED_CYCLES_BE, ratio); 514 515 fprintf(output, " # "); 516 color_fprintf(output, color, "%6.2f%%", ratio); 517 fprintf(output, " backend cycles idle "); 518 } 519 520 static void print_branch_misses(int cpu, 521 struct perf_evsel *evsel __maybe_unused, 522 double avg) 523 { 524 double total, ratio = 0.0; 525 const char *color; 526 527 total = avg_stats(&runtime_branches_stats[cpu]); 528 529 if (total) 530 ratio = avg / total * 100.0; 531 532 color = get_ratio_color(GRC_CACHE_MISSES, ratio); 533 534 fprintf(output, " # "); 535 color_fprintf(output, color, "%6.2f%%", ratio); 536 fprintf(output, " of all branches "); 537 } 538 539 static void print_l1_dcache_misses(int cpu, 540 struct perf_evsel *evsel __maybe_unused, 541 double avg) 542 { 543 double total, ratio = 0.0; 544 const char *color; 545 546 total = avg_stats(&runtime_l1_dcache_stats[cpu]); 547 548 if (total) 549 ratio = avg / total * 100.0; 550 551 color = get_ratio_color(GRC_CACHE_MISSES, ratio); 552 553 fprintf(output, " # "); 554 color_fprintf(output, color, "%6.2f%%", ratio); 555 fprintf(output, " of all L1-dcache hits "); 556 } 557 558 static void print_l1_icache_misses(int cpu, 559 struct perf_evsel *evsel __maybe_unused, 560 double avg) 561 { 562 double total, ratio = 0.0; 563 const char *color; 564 565 total = avg_stats(&runtime_l1_icache_stats[cpu]); 566 567 if (total) 568 ratio = avg / total * 100.0; 569 570 color = get_ratio_color(GRC_CACHE_MISSES, ratio); 571 572 fprintf(output, " # "); 573 color_fprintf(output, color, "%6.2f%%", ratio); 574 fprintf(output, " of all L1-icache hits "); 575 } 576 577 static void print_dtlb_cache_misses(int cpu, 578 struct perf_evsel *evsel __maybe_unused, 579 double avg) 580 { 581 double total, ratio = 0.0; 582 const char *color; 583 584 total = avg_stats(&runtime_dtlb_cache_stats[cpu]); 585 586 if (total) 587 ratio = avg / total * 100.0; 588 589 color = get_ratio_color(GRC_CACHE_MISSES, ratio); 590 591 fprintf(output, " # "); 592 color_fprintf(output, color, "%6.2f%%", ratio); 593 fprintf(output, " of all dTLB cache hits "); 594 } 595 596 static void print_itlb_cache_misses(int cpu, 597 struct perf_evsel *evsel __maybe_unused, 598 double avg) 599 { 600 double total, ratio = 0.0; 601 const char *color; 602 603 total = avg_stats(&runtime_itlb_cache_stats[cpu]); 604 605 if (total) 606 ratio = avg / total * 100.0; 607 608 color = get_ratio_color(GRC_CACHE_MISSES, ratio); 609 610 fprintf(output, " # "); 611 color_fprintf(output, color, "%6.2f%%", ratio); 612 fprintf(output, " of all iTLB cache hits "); 613 } 614 615 static void print_ll_cache_misses(int cpu, 616 struct perf_evsel *evsel __maybe_unused, 617 double avg) 618 { 619 double total, ratio = 0.0; 620 const char *color; 621 622 total = avg_stats(&runtime_ll_cache_stats[cpu]); 623 624 if (total) 625 ratio = avg / total * 100.0; 626 627 color = get_ratio_color(GRC_CACHE_MISSES, ratio); 628 629 fprintf(output, " # "); 630 color_fprintf(output, color, "%6.2f%%", ratio); 631 fprintf(output, " of all LL-cache hits "); 632 } 633 634 static void abs_printout(int cpu, struct perf_evsel *evsel, double avg) 635 { 636 double total, ratio = 0.0; 637 char cpustr[16] = { '\0', }; 638 const char *fmt; 639 640 if (csv_output) 641 fmt = "%s%.0f%s%s"; 642 else if (big_num) 643 fmt = "%s%'18.0f%s%-25s"; 644 else 645 fmt = "%s%18.0f%s%-25s"; 646 647 if (no_aggr) 648 sprintf(cpustr, "CPU%*d%s", 649 csv_output ? 0 : -4, 650 perf_evsel__cpus(evsel)->map[cpu], csv_sep); 651 else 652 cpu = 0; 653 654 fprintf(output, fmt, cpustr, avg, csv_sep, perf_evsel__name(evsel)); 655 656 if (evsel->cgrp) 657 fprintf(output, "%s%s", csv_sep, evsel->cgrp->name); 658 659 if (csv_output) 660 return; 661 662 if (perf_evsel__match(evsel, HARDWARE, HW_INSTRUCTIONS)) { 663 total = avg_stats(&runtime_cycles_stats[cpu]); 664 665 if (total) 666 ratio = avg / total; 667 668 fprintf(output, " # %5.2f insns per cycle ", ratio); 669 670 total = avg_stats(&runtime_stalled_cycles_front_stats[cpu]); 671 total = max(total, avg_stats(&runtime_stalled_cycles_back_stats[cpu])); 672 673 if (total && avg) { 674 ratio = total / avg; 675 fprintf(output, "\n # %5.2f stalled cycles per insn", ratio); 676 } 677 678 } else if (perf_evsel__match(evsel, HARDWARE, HW_BRANCH_MISSES) && 679 runtime_branches_stats[cpu].n != 0) { 680 print_branch_misses(cpu, evsel, avg); 681 } else if ( 682 evsel->attr.type == PERF_TYPE_HW_CACHE && 683 evsel->attr.config == ( PERF_COUNT_HW_CACHE_L1D | 684 ((PERF_COUNT_HW_CACHE_OP_READ) << 8) | 685 ((PERF_COUNT_HW_CACHE_RESULT_MISS) << 16)) && 686 runtime_l1_dcache_stats[cpu].n != 0) { 687 print_l1_dcache_misses(cpu, evsel, avg); 688 } else if ( 689 evsel->attr.type == PERF_TYPE_HW_CACHE && 690 evsel->attr.config == ( PERF_COUNT_HW_CACHE_L1I | 691 ((PERF_COUNT_HW_CACHE_OP_READ) << 8) | 692 ((PERF_COUNT_HW_CACHE_RESULT_MISS) << 16)) && 693 runtime_l1_icache_stats[cpu].n != 0) { 694 print_l1_icache_misses(cpu, evsel, avg); 695 } else if ( 696 evsel->attr.type == PERF_TYPE_HW_CACHE && 697 evsel->attr.config == ( PERF_COUNT_HW_CACHE_DTLB | 698 ((PERF_COUNT_HW_CACHE_OP_READ) << 8) | 699 ((PERF_COUNT_HW_CACHE_RESULT_MISS) << 16)) && 700 runtime_dtlb_cache_stats[cpu].n != 0) { 701 print_dtlb_cache_misses(cpu, evsel, avg); 702 } else if ( 703 evsel->attr.type == PERF_TYPE_HW_CACHE && 704 evsel->attr.config == ( PERF_COUNT_HW_CACHE_ITLB | 705 ((PERF_COUNT_HW_CACHE_OP_READ) << 8) | 706 ((PERF_COUNT_HW_CACHE_RESULT_MISS) << 16)) && 707 runtime_itlb_cache_stats[cpu].n != 0) { 708 print_itlb_cache_misses(cpu, evsel, avg); 709 } else if ( 710 evsel->attr.type == PERF_TYPE_HW_CACHE && 711 evsel->attr.config == ( PERF_COUNT_HW_CACHE_LL | 712 ((PERF_COUNT_HW_CACHE_OP_READ) << 8) | 713 ((PERF_COUNT_HW_CACHE_RESULT_MISS) << 16)) && 714 runtime_ll_cache_stats[cpu].n != 0) { 715 print_ll_cache_misses(cpu, evsel, avg); 716 } else if (perf_evsel__match(evsel, HARDWARE, HW_CACHE_MISSES) && 717 runtime_cacherefs_stats[cpu].n != 0) { 718 total = avg_stats(&runtime_cacherefs_stats[cpu]); 719 720 if (total) 721 ratio = avg * 100 / total; 722 723 fprintf(output, " # %8.3f %% of all cache refs ", ratio); 724 725 } else if (perf_evsel__match(evsel, HARDWARE, HW_STALLED_CYCLES_FRONTEND)) { 726 print_stalled_cycles_frontend(cpu, evsel, avg); 727 } else if (perf_evsel__match(evsel, HARDWARE, HW_STALLED_CYCLES_BACKEND)) { 728 print_stalled_cycles_backend(cpu, evsel, avg); 729 } else if (perf_evsel__match(evsel, HARDWARE, HW_CPU_CYCLES)) { 730 total = avg_stats(&runtime_nsecs_stats[cpu]); 731 732 if (total) 733 ratio = 1.0 * avg / total; 734 735 fprintf(output, " # %8.3f GHz ", ratio); 736 } else if (runtime_nsecs_stats[cpu].n != 0) { 737 char unit = 'M'; 738 739 total = avg_stats(&runtime_nsecs_stats[cpu]); 740 741 if (total) 742 ratio = 1000.0 * avg / total; 743 if (ratio < 0.001) { 744 ratio *= 1000; 745 unit = 'K'; 746 } 747 748 fprintf(output, " # %8.3f %c/sec ", ratio, unit); 749 } else { 750 fprintf(output, " "); 751 } 752 } 753 754 /* 755 * Print out the results of a single counter: 756 * aggregated counts in system-wide mode 757 */ 758 static void print_counter_aggr(struct perf_evsel *counter) 759 { 760 struct perf_stat *ps = counter->priv; 761 double avg = avg_stats(&ps->res_stats[0]); 762 int scaled = counter->counts->scaled; 763 764 if (scaled == -1) { 765 fprintf(output, "%*s%s%*s", 766 csv_output ? 0 : 18, 767 counter->supported ? CNTR_NOT_COUNTED : CNTR_NOT_SUPPORTED, 768 csv_sep, 769 csv_output ? 0 : -24, 770 perf_evsel__name(counter)); 771 772 if (counter->cgrp) 773 fprintf(output, "%s%s", csv_sep, counter->cgrp->name); 774 775 fputc('\n', output); 776 return; 777 } 778 779 if (nsec_counter(counter)) 780 nsec_printout(-1, counter, avg); 781 else 782 abs_printout(-1, counter, avg); 783 784 print_noise(counter, avg); 785 786 if (csv_output) { 787 fputc('\n', output); 788 return; 789 } 790 791 if (scaled) { 792 double avg_enabled, avg_running; 793 794 avg_enabled = avg_stats(&ps->res_stats[1]); 795 avg_running = avg_stats(&ps->res_stats[2]); 796 797 fprintf(output, " [%5.2f%%]", 100 * avg_running / avg_enabled); 798 } 799 fprintf(output, "\n"); 800 } 801 802 /* 803 * Print out the results of a single counter: 804 * does not use aggregated count in system-wide 805 */ 806 static void print_counter(struct perf_evsel *counter) 807 { 808 u64 ena, run, val; 809 int cpu; 810 811 for (cpu = 0; cpu < perf_evsel__nr_cpus(counter); cpu++) { 812 val = counter->counts->cpu[cpu].val; 813 ena = counter->counts->cpu[cpu].ena; 814 run = counter->counts->cpu[cpu].run; 815 if (run == 0 || ena == 0) { 816 fprintf(output, "CPU%*d%s%*s%s%*s", 817 csv_output ? 0 : -4, 818 perf_evsel__cpus(counter)->map[cpu], csv_sep, 819 csv_output ? 0 : 18, 820 counter->supported ? CNTR_NOT_COUNTED : CNTR_NOT_SUPPORTED, 821 csv_sep, 822 csv_output ? 0 : -24, 823 perf_evsel__name(counter)); 824 825 if (counter->cgrp) 826 fprintf(output, "%s%s", 827 csv_sep, counter->cgrp->name); 828 829 fputc('\n', output); 830 continue; 831 } 832 833 if (nsec_counter(counter)) 834 nsec_printout(cpu, counter, val); 835 else 836 abs_printout(cpu, counter, val); 837 838 if (!csv_output) { 839 print_noise(counter, 1.0); 840 841 if (run != ena) 842 fprintf(output, " (%.2f%%)", 843 100.0 * run / ena); 844 } 845 fputc('\n', output); 846 } 847 } 848 849 static void print_stat(int argc, const char **argv) 850 { 851 struct perf_evsel *counter; 852 int i; 853 854 fflush(stdout); 855 856 if (!csv_output) { 857 fprintf(output, "\n"); 858 fprintf(output, " Performance counter stats for "); 859 if (!perf_target__has_task(&target)) { 860 fprintf(output, "\'%s", argv[0]); 861 for (i = 1; i < argc; i++) 862 fprintf(output, " %s", argv[i]); 863 } else if (target.pid) 864 fprintf(output, "process id \'%s", target.pid); 865 else 866 fprintf(output, "thread id \'%s", target.tid); 867 868 fprintf(output, "\'"); 869 if (run_count > 1) 870 fprintf(output, " (%d runs)", run_count); 871 fprintf(output, ":\n\n"); 872 } 873 874 if (no_aggr) { 875 list_for_each_entry(counter, &evsel_list->entries, node) 876 print_counter(counter); 877 } else { 878 list_for_each_entry(counter, &evsel_list->entries, node) 879 print_counter_aggr(counter); 880 } 881 882 if (!csv_output) { 883 if (!null_run) 884 fprintf(output, "\n"); 885 fprintf(output, " %17.9f seconds time elapsed", 886 avg_stats(&walltime_nsecs_stats)/1e9); 887 if (run_count > 1) { 888 fprintf(output, " "); 889 print_noise_pct(stddev_stats(&walltime_nsecs_stats), 890 avg_stats(&walltime_nsecs_stats)); 891 } 892 fprintf(output, "\n\n"); 893 } 894 } 895 896 static volatile int signr = -1; 897 898 static void skip_signal(int signo) 899 { 900 if(child_pid == -1) 901 done = 1; 902 903 signr = signo; 904 } 905 906 static void sig_atexit(void) 907 { 908 if (child_pid != -1) 909 kill(child_pid, SIGTERM); 910 911 if (signr == -1) 912 return; 913 914 signal(signr, SIG_DFL); 915 kill(getpid(), signr); 916 } 917 918 static int stat__set_big_num(const struct option *opt __maybe_unused, 919 const char *s __maybe_unused, int unset) 920 { 921 big_num_opt = unset ? 0 : 1; 922 return 0; 923 } 924 925 /* 926 * Add default attributes, if there were no attributes specified or 927 * if -d/--detailed, -d -d or -d -d -d is used: 928 */ 929 static int add_default_attributes(void) 930 { 931 struct perf_event_attr default_attrs[] = { 932 933 { .type = PERF_TYPE_SOFTWARE, .config = PERF_COUNT_SW_TASK_CLOCK }, 934 { .type = PERF_TYPE_SOFTWARE, .config = PERF_COUNT_SW_CONTEXT_SWITCHES }, 935 { .type = PERF_TYPE_SOFTWARE, .config = PERF_COUNT_SW_CPU_MIGRATIONS }, 936 { .type = PERF_TYPE_SOFTWARE, .config = PERF_COUNT_SW_PAGE_FAULTS }, 937 938 { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_CPU_CYCLES }, 939 { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_STALLED_CYCLES_FRONTEND }, 940 { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_STALLED_CYCLES_BACKEND }, 941 { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_INSTRUCTIONS }, 942 { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_BRANCH_INSTRUCTIONS }, 943 { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_BRANCH_MISSES }, 944 945 }; 946 947 /* 948 * Detailed stats (-d), covering the L1 and last level data caches: 949 */ 950 struct perf_event_attr detailed_attrs[] = { 951 952 { .type = PERF_TYPE_HW_CACHE, 953 .config = 954 PERF_COUNT_HW_CACHE_L1D << 0 | 955 (PERF_COUNT_HW_CACHE_OP_READ << 8) | 956 (PERF_COUNT_HW_CACHE_RESULT_ACCESS << 16) }, 957 958 { .type = PERF_TYPE_HW_CACHE, 959 .config = 960 PERF_COUNT_HW_CACHE_L1D << 0 | 961 (PERF_COUNT_HW_CACHE_OP_READ << 8) | 962 (PERF_COUNT_HW_CACHE_RESULT_MISS << 16) }, 963 964 { .type = PERF_TYPE_HW_CACHE, 965 .config = 966 PERF_COUNT_HW_CACHE_LL << 0 | 967 (PERF_COUNT_HW_CACHE_OP_READ << 8) | 968 (PERF_COUNT_HW_CACHE_RESULT_ACCESS << 16) }, 969 970 { .type = PERF_TYPE_HW_CACHE, 971 .config = 972 PERF_COUNT_HW_CACHE_LL << 0 | 973 (PERF_COUNT_HW_CACHE_OP_READ << 8) | 974 (PERF_COUNT_HW_CACHE_RESULT_MISS << 16) }, 975 }; 976 977 /* 978 * Very detailed stats (-d -d), covering the instruction cache and the TLB caches: 979 */ 980 struct perf_event_attr very_detailed_attrs[] = { 981 982 { .type = PERF_TYPE_HW_CACHE, 983 .config = 984 PERF_COUNT_HW_CACHE_L1I << 0 | 985 (PERF_COUNT_HW_CACHE_OP_READ << 8) | 986 (PERF_COUNT_HW_CACHE_RESULT_ACCESS << 16) }, 987 988 { .type = PERF_TYPE_HW_CACHE, 989 .config = 990 PERF_COUNT_HW_CACHE_L1I << 0 | 991 (PERF_COUNT_HW_CACHE_OP_READ << 8) | 992 (PERF_COUNT_HW_CACHE_RESULT_MISS << 16) }, 993 994 { .type = PERF_TYPE_HW_CACHE, 995 .config = 996 PERF_COUNT_HW_CACHE_DTLB << 0 | 997 (PERF_COUNT_HW_CACHE_OP_READ << 8) | 998 (PERF_COUNT_HW_CACHE_RESULT_ACCESS << 16) }, 999 1000 { .type = PERF_TYPE_HW_CACHE, 1001 .config = 1002 PERF_COUNT_HW_CACHE_DTLB << 0 | 1003 (PERF_COUNT_HW_CACHE_OP_READ << 8) | 1004 (PERF_COUNT_HW_CACHE_RESULT_MISS << 16) }, 1005 1006 { .type = PERF_TYPE_HW_CACHE, 1007 .config = 1008 PERF_COUNT_HW_CACHE_ITLB << 0 | 1009 (PERF_COUNT_HW_CACHE_OP_READ << 8) | 1010 (PERF_COUNT_HW_CACHE_RESULT_ACCESS << 16) }, 1011 1012 { .type = PERF_TYPE_HW_CACHE, 1013 .config = 1014 PERF_COUNT_HW_CACHE_ITLB << 0 | 1015 (PERF_COUNT_HW_CACHE_OP_READ << 8) | 1016 (PERF_COUNT_HW_CACHE_RESULT_MISS << 16) }, 1017 1018 }; 1019 1020 /* 1021 * Very, very detailed stats (-d -d -d), adding prefetch events: 1022 */ 1023 struct perf_event_attr very_very_detailed_attrs[] = { 1024 1025 { .type = PERF_TYPE_HW_CACHE, 1026 .config = 1027 PERF_COUNT_HW_CACHE_L1D << 0 | 1028 (PERF_COUNT_HW_CACHE_OP_PREFETCH << 8) | 1029 (PERF_COUNT_HW_CACHE_RESULT_ACCESS << 16) }, 1030 1031 { .type = PERF_TYPE_HW_CACHE, 1032 .config = 1033 PERF_COUNT_HW_CACHE_L1D << 0 | 1034 (PERF_COUNT_HW_CACHE_OP_PREFETCH << 8) | 1035 (PERF_COUNT_HW_CACHE_RESULT_MISS << 16) }, 1036 }; 1037 1038 /* Set attrs if no event is selected and !null_run: */ 1039 if (null_run) 1040 return 0; 1041 1042 if (!evsel_list->nr_entries) { 1043 if (perf_evlist__add_default_attrs(evsel_list, default_attrs) < 0) 1044 return -1; 1045 } 1046 1047 /* Detailed events get appended to the event list: */ 1048 1049 if (detailed_run < 1) 1050 return 0; 1051 1052 /* Append detailed run extra attributes: */ 1053 if (perf_evlist__add_default_attrs(evsel_list, detailed_attrs) < 0) 1054 return -1; 1055 1056 if (detailed_run < 2) 1057 return 0; 1058 1059 /* Append very detailed run extra attributes: */ 1060 if (perf_evlist__add_default_attrs(evsel_list, very_detailed_attrs) < 0) 1061 return -1; 1062 1063 if (detailed_run < 3) 1064 return 0; 1065 1066 /* Append very, very detailed run extra attributes: */ 1067 return perf_evlist__add_default_attrs(evsel_list, very_very_detailed_attrs); 1068 } 1069 1070 int cmd_stat(int argc, const char **argv, const char *prefix __maybe_unused) 1071 { 1072 bool append_file = false, 1073 sync_run = false; 1074 int output_fd = 0; 1075 const char *output_name = NULL; 1076 const struct option options[] = { 1077 OPT_CALLBACK('e', "event", &evsel_list, "event", 1078 "event selector. use 'perf list' to list available events", 1079 parse_events_option), 1080 OPT_CALLBACK(0, "filter", &evsel_list, "filter", 1081 "event filter", parse_filter), 1082 OPT_BOOLEAN('i', "no-inherit", &no_inherit, 1083 "child tasks do not inherit counters"), 1084 OPT_STRING('p', "pid", &target.pid, "pid", 1085 "stat events on existing process id"), 1086 OPT_STRING('t', "tid", &target.tid, "tid", 1087 "stat events on existing thread id"), 1088 OPT_BOOLEAN('a', "all-cpus", &target.system_wide, 1089 "system-wide collection from all CPUs"), 1090 OPT_BOOLEAN('g', "group", &group, 1091 "put the counters into a counter group"), 1092 OPT_BOOLEAN('c', "scale", &scale, "scale/normalize counters"), 1093 OPT_INCR('v', "verbose", &verbose, 1094 "be more verbose (show counter open errors, etc)"), 1095 OPT_INTEGER('r', "repeat", &run_count, 1096 "repeat command and print average + stddev (max: 100)"), 1097 OPT_BOOLEAN('n', "null", &null_run, 1098 "null run - dont start any counters"), 1099 OPT_INCR('d', "detailed", &detailed_run, 1100 "detailed run - start a lot of events"), 1101 OPT_BOOLEAN('S', "sync", &sync_run, 1102 "call sync() before starting a run"), 1103 OPT_CALLBACK_NOOPT('B', "big-num", NULL, NULL, 1104 "print large numbers with thousands\' separators", 1105 stat__set_big_num), 1106 OPT_STRING('C', "cpu", &target.cpu_list, "cpu", 1107 "list of cpus to monitor in system-wide"), 1108 OPT_BOOLEAN('A', "no-aggr", &no_aggr, "disable CPU count aggregation"), 1109 OPT_STRING('x', "field-separator", &csv_sep, "separator", 1110 "print counts with custom separator"), 1111 OPT_CALLBACK('G', "cgroup", &evsel_list, "name", 1112 "monitor event in cgroup name only", parse_cgroups), 1113 OPT_STRING('o', "output", &output_name, "file", "output file name"), 1114 OPT_BOOLEAN(0, "append", &append_file, "append to the output file"), 1115 OPT_INTEGER(0, "log-fd", &output_fd, 1116 "log output to fd, instead of stderr"), 1117 OPT_END() 1118 }; 1119 const char * const stat_usage[] = { 1120 "perf stat [<options>] [<command>]", 1121 NULL 1122 }; 1123 struct perf_evsel *pos; 1124 int status = -ENOMEM, run_idx; 1125 const char *mode; 1126 1127 setlocale(LC_ALL, ""); 1128 1129 evsel_list = perf_evlist__new(NULL, NULL); 1130 if (evsel_list == NULL) 1131 return -ENOMEM; 1132 1133 argc = parse_options(argc, argv, options, stat_usage, 1134 PARSE_OPT_STOP_AT_NON_OPTION); 1135 1136 output = stderr; 1137 if (output_name && strcmp(output_name, "-")) 1138 output = NULL; 1139 1140 if (output_name && output_fd) { 1141 fprintf(stderr, "cannot use both --output and --log-fd\n"); 1142 usage_with_options(stat_usage, options); 1143 } 1144 1145 if (output_fd < 0) { 1146 fprintf(stderr, "argument to --log-fd must be a > 0\n"); 1147 usage_with_options(stat_usage, options); 1148 } 1149 1150 if (!output) { 1151 struct timespec tm; 1152 mode = append_file ? "a" : "w"; 1153 1154 output = fopen(output_name, mode); 1155 if (!output) { 1156 perror("failed to create output file"); 1157 return -1; 1158 } 1159 clock_gettime(CLOCK_REALTIME, &tm); 1160 fprintf(output, "# started on %s\n", ctime(&tm.tv_sec)); 1161 } else if (output_fd > 0) { 1162 mode = append_file ? "a" : "w"; 1163 output = fdopen(output_fd, mode); 1164 if (!output) { 1165 perror("Failed opening logfd"); 1166 return -errno; 1167 } 1168 } 1169 1170 if (csv_sep) { 1171 csv_output = true; 1172 if (!strcmp(csv_sep, "\\t")) 1173 csv_sep = "\t"; 1174 } else 1175 csv_sep = DEFAULT_SEPARATOR; 1176 1177 /* 1178 * let the spreadsheet do the pretty-printing 1179 */ 1180 if (csv_output) { 1181 /* User explicitly passed -B? */ 1182 if (big_num_opt == 1) { 1183 fprintf(stderr, "-B option not supported with -x\n"); 1184 usage_with_options(stat_usage, options); 1185 } else /* Nope, so disable big number formatting */ 1186 big_num = false; 1187 } else if (big_num_opt == 0) /* User passed --no-big-num */ 1188 big_num = false; 1189 1190 if (!argc && !perf_target__has_task(&target)) 1191 usage_with_options(stat_usage, options); 1192 if (run_count <= 0) 1193 usage_with_options(stat_usage, options); 1194 1195 /* no_aggr, cgroup are for system-wide only */ 1196 if ((no_aggr || nr_cgroups) && !perf_target__has_cpu(&target)) { 1197 fprintf(stderr, "both cgroup and no-aggregation " 1198 "modes only available in system-wide mode\n"); 1199 1200 usage_with_options(stat_usage, options); 1201 } 1202 1203 if (add_default_attributes()) 1204 goto out; 1205 1206 perf_target__validate(&target); 1207 1208 if (perf_evlist__create_maps(evsel_list, &target) < 0) { 1209 if (perf_target__has_task(&target)) 1210 pr_err("Problems finding threads of monitor\n"); 1211 if (perf_target__has_cpu(&target)) 1212 perror("failed to parse CPUs map"); 1213 1214 usage_with_options(stat_usage, options); 1215 return -1; 1216 } 1217 1218 list_for_each_entry(pos, &evsel_list->entries, node) { 1219 if (perf_evsel__alloc_stat_priv(pos) < 0 || 1220 perf_evsel__alloc_counts(pos, perf_evsel__nr_cpus(pos)) < 0) 1221 goto out_free_fd; 1222 } 1223 1224 /* 1225 * We dont want to block the signals - that would cause 1226 * child tasks to inherit that and Ctrl-C would not work. 1227 * What we want is for Ctrl-C to work in the exec()-ed 1228 * task, but being ignored by perf stat itself: 1229 */ 1230 atexit(sig_atexit); 1231 signal(SIGINT, skip_signal); 1232 signal(SIGALRM, skip_signal); 1233 signal(SIGABRT, skip_signal); 1234 1235 status = 0; 1236 for (run_idx = 0; run_idx < run_count; run_idx++) { 1237 if (run_count != 1 && verbose) 1238 fprintf(output, "[ perf stat: executing run #%d ... ]\n", 1239 run_idx + 1); 1240 1241 if (sync_run) 1242 sync(); 1243 1244 status = run_perf_stat(argc, argv); 1245 } 1246 1247 if (status != -1) 1248 print_stat(argc, argv); 1249 out_free_fd: 1250 list_for_each_entry(pos, &evsel_list->entries, node) 1251 perf_evsel__free_stat_priv(pos); 1252 perf_evlist__delete_maps(evsel_list); 1253 out: 1254 perf_evlist__delete(evsel_list); 1255 return status; 1256 } 1257