1 /* 2 * builtin-stat.c 3 * 4 * Builtin stat command: Give a precise performance counters summary 5 * overview about any workload, CPU or specific PID. 6 * 7 * Sample output: 8 9 $ perf stat ./hackbench 10 10 11 Time: 0.118 12 13 Performance counter stats for './hackbench 10': 14 15 1708.761321 task-clock # 11.037 CPUs utilized 16 41,190 context-switches # 0.024 M/sec 17 6,735 CPU-migrations # 0.004 M/sec 18 17,318 page-faults # 0.010 M/sec 19 5,205,202,243 cycles # 3.046 GHz 20 3,856,436,920 stalled-cycles-frontend # 74.09% frontend cycles idle 21 1,600,790,871 stalled-cycles-backend # 30.75% backend cycles idle 22 2,603,501,247 instructions # 0.50 insns per cycle 23 # 1.48 stalled cycles per insn 24 484,357,498 branches # 283.455 M/sec 25 6,388,934 branch-misses # 1.32% of all branches 26 27 0.154822978 seconds time elapsed 28 29 * 30 * Copyright (C) 2008-2011, Red Hat Inc, Ingo Molnar <mingo@redhat.com> 31 * 32 * Improvements and fixes by: 33 * 34 * Arjan van de Ven <arjan@linux.intel.com> 35 * Yanmin Zhang <yanmin.zhang@intel.com> 36 * Wu Fengguang <fengguang.wu@intel.com> 37 * Mike Galbraith <efault@gmx.de> 38 * Paul Mackerras <paulus@samba.org> 39 * Jaswinder Singh Rajput <jaswinder@kernel.org> 40 * 41 * Released under the GPL v2. (and only v2, not any later version) 42 */ 43 44 #include "perf.h" 45 #include "builtin.h" 46 #include "util/util.h" 47 #include "util/parse-options.h" 48 #include "util/parse-events.h" 49 #include "util/pmu.h" 50 #include "util/event.h" 51 #include "util/evlist.h" 52 #include "util/evsel.h" 53 #include "util/debug.h" 54 #include "util/color.h" 55 #include "util/stat.h" 56 #include "util/header.h" 57 #include "util/cpumap.h" 58 #include "util/thread.h" 59 #include "util/thread_map.h" 60 61 #include <stdlib.h> 62 #include <sys/prctl.h> 63 #include <locale.h> 64 65 #define DEFAULT_SEPARATOR " " 66 #define CNTR_NOT_SUPPORTED "<not supported>" 67 #define CNTR_NOT_COUNTED "<not counted>" 68 69 static void print_stat(int argc, const char **argv); 70 static void print_counter_aggr(struct perf_evsel *counter, char *prefix); 71 static void print_counter(struct perf_evsel *counter, char *prefix); 72 static void print_aggr(char *prefix); 73 74 /* Default events used for perf stat -T */ 75 static const char * const transaction_attrs[] = { 76 "task-clock", 77 "{" 78 "instructions," 79 "cycles," 80 "cpu/cycles-t/," 81 "cpu/tx-start/," 82 "cpu/el-start/," 83 "cpu/cycles-ct/" 84 "}" 85 }; 86 87 /* More limited version when the CPU does not have all events. */ 88 static const char * const transaction_limited_attrs[] = { 89 "task-clock", 90 "{" 91 "instructions," 92 "cycles," 93 "cpu/cycles-t/," 94 "cpu/tx-start/" 95 "}" 96 }; 97 98 /* must match transaction_attrs and the beginning limited_attrs */ 99 enum { 100 T_TASK_CLOCK, 101 T_INSTRUCTIONS, 102 T_CYCLES, 103 T_CYCLES_IN_TX, 104 T_TRANSACTION_START, 105 T_ELISION_START, 106 T_CYCLES_IN_TX_CP, 107 }; 108 109 static struct perf_evlist *evsel_list; 110 111 static struct target target = { 112 .uid = UINT_MAX, 113 }; 114 115 enum aggr_mode { 116 AGGR_NONE, 117 AGGR_GLOBAL, 118 AGGR_SOCKET, 119 AGGR_CORE, 120 }; 121 122 static int run_count = 1; 123 static bool no_inherit = false; 124 static bool scale = true; 125 static enum aggr_mode aggr_mode = AGGR_GLOBAL; 126 static volatile pid_t child_pid = -1; 127 static bool null_run = false; 128 static int detailed_run = 0; 129 static bool transaction_run; 130 static bool big_num = true; 131 static int big_num_opt = -1; 132 static const char *csv_sep = NULL; 133 static bool csv_output = false; 134 static bool group = false; 135 static FILE *output = NULL; 136 static const char *pre_cmd = NULL; 137 static const char *post_cmd = NULL; 138 static bool sync_run = false; 139 static unsigned int interval = 0; 140 static unsigned int initial_delay = 0; 141 static unsigned int unit_width = 4; /* strlen("unit") */ 142 static bool forever = false; 143 static struct timespec ref_time; 144 static struct cpu_map *aggr_map; 145 static int (*aggr_get_id)(struct cpu_map *m, int cpu); 146 147 static volatile int done = 0; 148 149 struct perf_stat { 150 struct stats res_stats[3]; 151 }; 152 153 static inline void diff_timespec(struct timespec *r, struct timespec *a, 154 struct timespec *b) 155 { 156 r->tv_sec = a->tv_sec - b->tv_sec; 157 if (a->tv_nsec < b->tv_nsec) { 158 r->tv_nsec = a->tv_nsec + 1000000000L - b->tv_nsec; 159 r->tv_sec--; 160 } else { 161 r->tv_nsec = a->tv_nsec - b->tv_nsec ; 162 } 163 } 164 165 static inline struct cpu_map *perf_evsel__cpus(struct perf_evsel *evsel) 166 { 167 return (evsel->cpus && !target.cpu_list) ? evsel->cpus : evsel_list->cpus; 168 } 169 170 static inline int perf_evsel__nr_cpus(struct perf_evsel *evsel) 171 { 172 return perf_evsel__cpus(evsel)->nr; 173 } 174 175 static void perf_evsel__reset_stat_priv(struct perf_evsel *evsel) 176 { 177 memset(evsel->priv, 0, sizeof(struct perf_stat)); 178 } 179 180 static int perf_evsel__alloc_stat_priv(struct perf_evsel *evsel) 181 { 182 evsel->priv = zalloc(sizeof(struct perf_stat)); 183 return evsel->priv == NULL ? -ENOMEM : 0; 184 } 185 186 static void perf_evsel__free_stat_priv(struct perf_evsel *evsel) 187 { 188 zfree(&evsel->priv); 189 } 190 191 static int perf_evsel__alloc_prev_raw_counts(struct perf_evsel *evsel) 192 { 193 void *addr; 194 size_t sz; 195 196 sz = sizeof(*evsel->counts) + 197 (perf_evsel__nr_cpus(evsel) * sizeof(struct perf_counts_values)); 198 199 addr = zalloc(sz); 200 if (!addr) 201 return -ENOMEM; 202 203 evsel->prev_raw_counts = addr; 204 205 return 0; 206 } 207 208 static void perf_evsel__free_prev_raw_counts(struct perf_evsel *evsel) 209 { 210 zfree(&evsel->prev_raw_counts); 211 } 212 213 static void perf_evlist__free_stats(struct perf_evlist *evlist) 214 { 215 struct perf_evsel *evsel; 216 217 evlist__for_each(evlist, evsel) { 218 perf_evsel__free_stat_priv(evsel); 219 perf_evsel__free_counts(evsel); 220 perf_evsel__free_prev_raw_counts(evsel); 221 } 222 } 223 224 static int perf_evlist__alloc_stats(struct perf_evlist *evlist, bool alloc_raw) 225 { 226 struct perf_evsel *evsel; 227 228 evlist__for_each(evlist, evsel) { 229 if (perf_evsel__alloc_stat_priv(evsel) < 0 || 230 perf_evsel__alloc_counts(evsel, perf_evsel__nr_cpus(evsel)) < 0 || 231 (alloc_raw && perf_evsel__alloc_prev_raw_counts(evsel) < 0)) 232 goto out_free; 233 } 234 235 return 0; 236 237 out_free: 238 perf_evlist__free_stats(evlist); 239 return -1; 240 } 241 242 static struct stats runtime_nsecs_stats[MAX_NR_CPUS]; 243 static struct stats runtime_cycles_stats[MAX_NR_CPUS]; 244 static struct stats runtime_stalled_cycles_front_stats[MAX_NR_CPUS]; 245 static struct stats runtime_stalled_cycles_back_stats[MAX_NR_CPUS]; 246 static struct stats runtime_branches_stats[MAX_NR_CPUS]; 247 static struct stats runtime_cacherefs_stats[MAX_NR_CPUS]; 248 static struct stats runtime_l1_dcache_stats[MAX_NR_CPUS]; 249 static struct stats runtime_l1_icache_stats[MAX_NR_CPUS]; 250 static struct stats runtime_ll_cache_stats[MAX_NR_CPUS]; 251 static struct stats runtime_itlb_cache_stats[MAX_NR_CPUS]; 252 static struct stats runtime_dtlb_cache_stats[MAX_NR_CPUS]; 253 static struct stats runtime_cycles_in_tx_stats[MAX_NR_CPUS]; 254 static struct stats walltime_nsecs_stats; 255 static struct stats runtime_transaction_stats[MAX_NR_CPUS]; 256 static struct stats runtime_elision_stats[MAX_NR_CPUS]; 257 258 static void perf_stat__reset_stats(struct perf_evlist *evlist) 259 { 260 struct perf_evsel *evsel; 261 262 evlist__for_each(evlist, evsel) { 263 perf_evsel__reset_stat_priv(evsel); 264 perf_evsel__reset_counts(evsel, perf_evsel__nr_cpus(evsel)); 265 } 266 267 memset(runtime_nsecs_stats, 0, sizeof(runtime_nsecs_stats)); 268 memset(runtime_cycles_stats, 0, sizeof(runtime_cycles_stats)); 269 memset(runtime_stalled_cycles_front_stats, 0, sizeof(runtime_stalled_cycles_front_stats)); 270 memset(runtime_stalled_cycles_back_stats, 0, sizeof(runtime_stalled_cycles_back_stats)); 271 memset(runtime_branches_stats, 0, sizeof(runtime_branches_stats)); 272 memset(runtime_cacherefs_stats, 0, sizeof(runtime_cacherefs_stats)); 273 memset(runtime_l1_dcache_stats, 0, sizeof(runtime_l1_dcache_stats)); 274 memset(runtime_l1_icache_stats, 0, sizeof(runtime_l1_icache_stats)); 275 memset(runtime_ll_cache_stats, 0, sizeof(runtime_ll_cache_stats)); 276 memset(runtime_itlb_cache_stats, 0, sizeof(runtime_itlb_cache_stats)); 277 memset(runtime_dtlb_cache_stats, 0, sizeof(runtime_dtlb_cache_stats)); 278 memset(runtime_cycles_in_tx_stats, 0, 279 sizeof(runtime_cycles_in_tx_stats)); 280 memset(runtime_transaction_stats, 0, 281 sizeof(runtime_transaction_stats)); 282 memset(runtime_elision_stats, 0, sizeof(runtime_elision_stats)); 283 memset(&walltime_nsecs_stats, 0, sizeof(walltime_nsecs_stats)); 284 } 285 286 static int create_perf_stat_counter(struct perf_evsel *evsel) 287 { 288 struct perf_event_attr *attr = &evsel->attr; 289 290 if (scale) 291 attr->read_format = PERF_FORMAT_TOTAL_TIME_ENABLED | 292 PERF_FORMAT_TOTAL_TIME_RUNNING; 293 294 attr->inherit = !no_inherit; 295 296 if (target__has_cpu(&target)) 297 return perf_evsel__open_per_cpu(evsel, perf_evsel__cpus(evsel)); 298 299 if (!target__has_task(&target) && perf_evsel__is_group_leader(evsel)) { 300 attr->disabled = 1; 301 if (!initial_delay) 302 attr->enable_on_exec = 1; 303 } 304 305 return perf_evsel__open_per_thread(evsel, evsel_list->threads); 306 } 307 308 /* 309 * Does the counter have nsecs as a unit? 310 */ 311 static inline int nsec_counter(struct perf_evsel *evsel) 312 { 313 if (perf_evsel__match(evsel, SOFTWARE, SW_CPU_CLOCK) || 314 perf_evsel__match(evsel, SOFTWARE, SW_TASK_CLOCK)) 315 return 1; 316 317 return 0; 318 } 319 320 static struct perf_evsel *nth_evsel(int n) 321 { 322 static struct perf_evsel **array; 323 static int array_len; 324 struct perf_evsel *ev; 325 int j; 326 327 /* Assumes this only called when evsel_list does not change anymore. */ 328 if (!array) { 329 evlist__for_each(evsel_list, ev) 330 array_len++; 331 array = malloc(array_len * sizeof(void *)); 332 if (!array) 333 exit(ENOMEM); 334 j = 0; 335 evlist__for_each(evsel_list, ev) 336 array[j++] = ev; 337 } 338 if (n < array_len) 339 return array[n]; 340 return NULL; 341 } 342 343 /* 344 * Update various tracking values we maintain to print 345 * more semantic information such as miss/hit ratios, 346 * instruction rates, etc: 347 */ 348 static void update_shadow_stats(struct perf_evsel *counter, u64 *count) 349 { 350 if (perf_evsel__match(counter, SOFTWARE, SW_TASK_CLOCK)) 351 update_stats(&runtime_nsecs_stats[0], count[0]); 352 else if (perf_evsel__match(counter, HARDWARE, HW_CPU_CYCLES)) 353 update_stats(&runtime_cycles_stats[0], count[0]); 354 else if (transaction_run && 355 perf_evsel__cmp(counter, nth_evsel(T_CYCLES_IN_TX))) 356 update_stats(&runtime_cycles_in_tx_stats[0], count[0]); 357 else if (transaction_run && 358 perf_evsel__cmp(counter, nth_evsel(T_TRANSACTION_START))) 359 update_stats(&runtime_transaction_stats[0], count[0]); 360 else if (transaction_run && 361 perf_evsel__cmp(counter, nth_evsel(T_ELISION_START))) 362 update_stats(&runtime_elision_stats[0], count[0]); 363 else if (perf_evsel__match(counter, HARDWARE, HW_STALLED_CYCLES_FRONTEND)) 364 update_stats(&runtime_stalled_cycles_front_stats[0], count[0]); 365 else if (perf_evsel__match(counter, HARDWARE, HW_STALLED_CYCLES_BACKEND)) 366 update_stats(&runtime_stalled_cycles_back_stats[0], count[0]); 367 else if (perf_evsel__match(counter, HARDWARE, HW_BRANCH_INSTRUCTIONS)) 368 update_stats(&runtime_branches_stats[0], count[0]); 369 else if (perf_evsel__match(counter, HARDWARE, HW_CACHE_REFERENCES)) 370 update_stats(&runtime_cacherefs_stats[0], count[0]); 371 else if (perf_evsel__match(counter, HW_CACHE, HW_CACHE_L1D)) 372 update_stats(&runtime_l1_dcache_stats[0], count[0]); 373 else if (perf_evsel__match(counter, HW_CACHE, HW_CACHE_L1I)) 374 update_stats(&runtime_l1_icache_stats[0], count[0]); 375 else if (perf_evsel__match(counter, HW_CACHE, HW_CACHE_LL)) 376 update_stats(&runtime_ll_cache_stats[0], count[0]); 377 else if (perf_evsel__match(counter, HW_CACHE, HW_CACHE_DTLB)) 378 update_stats(&runtime_dtlb_cache_stats[0], count[0]); 379 else if (perf_evsel__match(counter, HW_CACHE, HW_CACHE_ITLB)) 380 update_stats(&runtime_itlb_cache_stats[0], count[0]); 381 } 382 383 /* 384 * Read out the results of a single counter: 385 * aggregate counts across CPUs in system-wide mode 386 */ 387 static int read_counter_aggr(struct perf_evsel *counter) 388 { 389 struct perf_stat *ps = counter->priv; 390 u64 *count = counter->counts->aggr.values; 391 int i; 392 393 if (__perf_evsel__read(counter, perf_evsel__nr_cpus(counter), 394 thread_map__nr(evsel_list->threads), scale) < 0) 395 return -1; 396 397 for (i = 0; i < 3; i++) 398 update_stats(&ps->res_stats[i], count[i]); 399 400 if (verbose) { 401 fprintf(output, "%s: %" PRIu64 " %" PRIu64 " %" PRIu64 "\n", 402 perf_evsel__name(counter), count[0], count[1], count[2]); 403 } 404 405 /* 406 * Save the full runtime - to allow normalization during printout: 407 */ 408 update_shadow_stats(counter, count); 409 410 return 0; 411 } 412 413 /* 414 * Read out the results of a single counter: 415 * do not aggregate counts across CPUs in system-wide mode 416 */ 417 static int read_counter(struct perf_evsel *counter) 418 { 419 u64 *count; 420 int cpu; 421 422 for (cpu = 0; cpu < perf_evsel__nr_cpus(counter); cpu++) { 423 if (__perf_evsel__read_on_cpu(counter, cpu, 0, scale) < 0) 424 return -1; 425 426 count = counter->counts->cpu[cpu].values; 427 428 update_shadow_stats(counter, count); 429 } 430 431 return 0; 432 } 433 434 static void print_interval(void) 435 { 436 static int num_print_interval; 437 struct perf_evsel *counter; 438 struct perf_stat *ps; 439 struct timespec ts, rs; 440 char prefix[64]; 441 442 if (aggr_mode == AGGR_GLOBAL) { 443 evlist__for_each(evsel_list, counter) { 444 ps = counter->priv; 445 memset(ps->res_stats, 0, sizeof(ps->res_stats)); 446 read_counter_aggr(counter); 447 } 448 } else { 449 evlist__for_each(evsel_list, counter) { 450 ps = counter->priv; 451 memset(ps->res_stats, 0, sizeof(ps->res_stats)); 452 read_counter(counter); 453 } 454 } 455 456 clock_gettime(CLOCK_MONOTONIC, &ts); 457 diff_timespec(&rs, &ts, &ref_time); 458 sprintf(prefix, "%6lu.%09lu%s", rs.tv_sec, rs.tv_nsec, csv_sep); 459 460 if (num_print_interval == 0 && !csv_output) { 461 switch (aggr_mode) { 462 case AGGR_SOCKET: 463 fprintf(output, "# time socket cpus counts %*s events\n", unit_width, "unit"); 464 break; 465 case AGGR_CORE: 466 fprintf(output, "# time core cpus counts %*s events\n", unit_width, "unit"); 467 break; 468 case AGGR_NONE: 469 fprintf(output, "# time CPU counts %*s events\n", unit_width, "unit"); 470 break; 471 case AGGR_GLOBAL: 472 default: 473 fprintf(output, "# time counts %*s events\n", unit_width, "unit"); 474 } 475 } 476 477 if (++num_print_interval == 25) 478 num_print_interval = 0; 479 480 switch (aggr_mode) { 481 case AGGR_CORE: 482 case AGGR_SOCKET: 483 print_aggr(prefix); 484 break; 485 case AGGR_NONE: 486 evlist__for_each(evsel_list, counter) 487 print_counter(counter, prefix); 488 break; 489 case AGGR_GLOBAL: 490 default: 491 evlist__for_each(evsel_list, counter) 492 print_counter_aggr(counter, prefix); 493 } 494 495 fflush(output); 496 } 497 498 static void handle_initial_delay(void) 499 { 500 struct perf_evsel *counter; 501 502 if (initial_delay) { 503 const int ncpus = cpu_map__nr(evsel_list->cpus), 504 nthreads = thread_map__nr(evsel_list->threads); 505 506 usleep(initial_delay * 1000); 507 evlist__for_each(evsel_list, counter) 508 perf_evsel__enable(counter, ncpus, nthreads); 509 } 510 } 511 512 static volatile int workload_exec_errno; 513 514 /* 515 * perf_evlist__prepare_workload will send a SIGUSR1 516 * if the fork fails, since we asked by setting its 517 * want_signal to true. 518 */ 519 static void workload_exec_failed_signal(int signo __maybe_unused, siginfo_t *info, 520 void *ucontext __maybe_unused) 521 { 522 workload_exec_errno = info->si_value.sival_int; 523 } 524 525 static int __run_perf_stat(int argc, const char **argv) 526 { 527 char msg[512]; 528 unsigned long long t0, t1; 529 struct perf_evsel *counter; 530 struct timespec ts; 531 size_t l; 532 int status = 0; 533 const bool forks = (argc > 0); 534 535 if (interval) { 536 ts.tv_sec = interval / 1000; 537 ts.tv_nsec = (interval % 1000) * 1000000; 538 } else { 539 ts.tv_sec = 1; 540 ts.tv_nsec = 0; 541 } 542 543 if (forks) { 544 if (perf_evlist__prepare_workload(evsel_list, &target, argv, false, 545 workload_exec_failed_signal) < 0) { 546 perror("failed to prepare workload"); 547 return -1; 548 } 549 child_pid = evsel_list->workload.pid; 550 } 551 552 if (group) 553 perf_evlist__set_leader(evsel_list); 554 555 evlist__for_each(evsel_list, counter) { 556 if (create_perf_stat_counter(counter) < 0) { 557 /* 558 * PPC returns ENXIO for HW counters until 2.6.37 559 * (behavior changed with commit b0a873e). 560 */ 561 if (errno == EINVAL || errno == ENOSYS || 562 errno == ENOENT || errno == EOPNOTSUPP || 563 errno == ENXIO) { 564 if (verbose) 565 ui__warning("%s event is not supported by the kernel.\n", 566 perf_evsel__name(counter)); 567 counter->supported = false; 568 continue; 569 } 570 571 perf_evsel__open_strerror(counter, &target, 572 errno, msg, sizeof(msg)); 573 ui__error("%s\n", msg); 574 575 if (child_pid != -1) 576 kill(child_pid, SIGTERM); 577 578 return -1; 579 } 580 counter->supported = true; 581 582 l = strlen(counter->unit); 583 if (l > unit_width) 584 unit_width = l; 585 } 586 587 if (perf_evlist__apply_filters(evsel_list)) { 588 error("failed to set filter with %d (%s)\n", errno, 589 strerror(errno)); 590 return -1; 591 } 592 593 /* 594 * Enable counters and exec the command: 595 */ 596 t0 = rdclock(); 597 clock_gettime(CLOCK_MONOTONIC, &ref_time); 598 599 if (forks) { 600 perf_evlist__start_workload(evsel_list); 601 handle_initial_delay(); 602 603 if (interval) { 604 while (!waitpid(child_pid, &status, WNOHANG)) { 605 nanosleep(&ts, NULL); 606 print_interval(); 607 } 608 } 609 wait(&status); 610 611 if (workload_exec_errno) { 612 const char *emsg = strerror_r(workload_exec_errno, msg, sizeof(msg)); 613 pr_err("Workload failed: %s\n", emsg); 614 return -1; 615 } 616 617 if (WIFSIGNALED(status)) 618 psignal(WTERMSIG(status), argv[0]); 619 } else { 620 handle_initial_delay(); 621 while (!done) { 622 nanosleep(&ts, NULL); 623 if (interval) 624 print_interval(); 625 } 626 } 627 628 t1 = rdclock(); 629 630 update_stats(&walltime_nsecs_stats, t1 - t0); 631 632 if (aggr_mode == AGGR_GLOBAL) { 633 evlist__for_each(evsel_list, counter) { 634 read_counter_aggr(counter); 635 perf_evsel__close_fd(counter, perf_evsel__nr_cpus(counter), 636 thread_map__nr(evsel_list->threads)); 637 } 638 } else { 639 evlist__for_each(evsel_list, counter) { 640 read_counter(counter); 641 perf_evsel__close_fd(counter, perf_evsel__nr_cpus(counter), 1); 642 } 643 } 644 645 return WEXITSTATUS(status); 646 } 647 648 static int run_perf_stat(int argc, const char **argv) 649 { 650 int ret; 651 652 if (pre_cmd) { 653 ret = system(pre_cmd); 654 if (ret) 655 return ret; 656 } 657 658 if (sync_run) 659 sync(); 660 661 ret = __run_perf_stat(argc, argv); 662 if (ret) 663 return ret; 664 665 if (post_cmd) { 666 ret = system(post_cmd); 667 if (ret) 668 return ret; 669 } 670 671 return ret; 672 } 673 674 static void print_noise_pct(double total, double avg) 675 { 676 double pct = rel_stddev_stats(total, avg); 677 678 if (csv_output) 679 fprintf(output, "%s%.2f%%", csv_sep, pct); 680 else if (pct) 681 fprintf(output, " ( +-%6.2f%% )", pct); 682 } 683 684 static void print_noise(struct perf_evsel *evsel, double avg) 685 { 686 struct perf_stat *ps; 687 688 if (run_count == 1) 689 return; 690 691 ps = evsel->priv; 692 print_noise_pct(stddev_stats(&ps->res_stats[0]), avg); 693 } 694 695 static void aggr_printout(struct perf_evsel *evsel, int id, int nr) 696 { 697 switch (aggr_mode) { 698 case AGGR_CORE: 699 fprintf(output, "S%d-C%*d%s%*d%s", 700 cpu_map__id_to_socket(id), 701 csv_output ? 0 : -8, 702 cpu_map__id_to_cpu(id), 703 csv_sep, 704 csv_output ? 0 : 4, 705 nr, 706 csv_sep); 707 break; 708 case AGGR_SOCKET: 709 fprintf(output, "S%*d%s%*d%s", 710 csv_output ? 0 : -5, 711 id, 712 csv_sep, 713 csv_output ? 0 : 4, 714 nr, 715 csv_sep); 716 break; 717 case AGGR_NONE: 718 fprintf(output, "CPU%*d%s", 719 csv_output ? 0 : -4, 720 perf_evsel__cpus(evsel)->map[id], csv_sep); 721 break; 722 case AGGR_GLOBAL: 723 default: 724 break; 725 } 726 } 727 728 static void nsec_printout(int cpu, int nr, struct perf_evsel *evsel, double avg) 729 { 730 double msecs = avg / 1e6; 731 const char *fmt_v, *fmt_n; 732 char name[25]; 733 734 fmt_v = csv_output ? "%.6f%s" : "%18.6f%s"; 735 fmt_n = csv_output ? "%s" : "%-25s"; 736 737 aggr_printout(evsel, cpu, nr); 738 739 scnprintf(name, sizeof(name), "%s%s", 740 perf_evsel__name(evsel), csv_output ? "" : " (msec)"); 741 742 fprintf(output, fmt_v, msecs, csv_sep); 743 744 if (csv_output) 745 fprintf(output, "%s%s", evsel->unit, csv_sep); 746 else 747 fprintf(output, "%-*s%s", unit_width, evsel->unit, csv_sep); 748 749 fprintf(output, fmt_n, name); 750 751 if (evsel->cgrp) 752 fprintf(output, "%s%s", csv_sep, evsel->cgrp->name); 753 754 if (csv_output || interval) 755 return; 756 757 if (perf_evsel__match(evsel, SOFTWARE, SW_TASK_CLOCK)) 758 fprintf(output, " # %8.3f CPUs utilized ", 759 avg / avg_stats(&walltime_nsecs_stats)); 760 else 761 fprintf(output, " "); 762 } 763 764 /* used for get_ratio_color() */ 765 enum grc_type { 766 GRC_STALLED_CYCLES_FE, 767 GRC_STALLED_CYCLES_BE, 768 GRC_CACHE_MISSES, 769 GRC_MAX_NR 770 }; 771 772 static const char *get_ratio_color(enum grc_type type, double ratio) 773 { 774 static const double grc_table[GRC_MAX_NR][3] = { 775 [GRC_STALLED_CYCLES_FE] = { 50.0, 30.0, 10.0 }, 776 [GRC_STALLED_CYCLES_BE] = { 75.0, 50.0, 20.0 }, 777 [GRC_CACHE_MISSES] = { 20.0, 10.0, 5.0 }, 778 }; 779 const char *color = PERF_COLOR_NORMAL; 780 781 if (ratio > grc_table[type][0]) 782 color = PERF_COLOR_RED; 783 else if (ratio > grc_table[type][1]) 784 color = PERF_COLOR_MAGENTA; 785 else if (ratio > grc_table[type][2]) 786 color = PERF_COLOR_YELLOW; 787 788 return color; 789 } 790 791 static void print_stalled_cycles_frontend(int cpu, 792 struct perf_evsel *evsel 793 __maybe_unused, double avg) 794 { 795 double total, ratio = 0.0; 796 const char *color; 797 798 total = avg_stats(&runtime_cycles_stats[cpu]); 799 800 if (total) 801 ratio = avg / total * 100.0; 802 803 color = get_ratio_color(GRC_STALLED_CYCLES_FE, ratio); 804 805 fprintf(output, " # "); 806 color_fprintf(output, color, "%6.2f%%", ratio); 807 fprintf(output, " frontend cycles idle "); 808 } 809 810 static void print_stalled_cycles_backend(int cpu, 811 struct perf_evsel *evsel 812 __maybe_unused, double avg) 813 { 814 double total, ratio = 0.0; 815 const char *color; 816 817 total = avg_stats(&runtime_cycles_stats[cpu]); 818 819 if (total) 820 ratio = avg / total * 100.0; 821 822 color = get_ratio_color(GRC_STALLED_CYCLES_BE, ratio); 823 824 fprintf(output, " # "); 825 color_fprintf(output, color, "%6.2f%%", ratio); 826 fprintf(output, " backend cycles idle "); 827 } 828 829 static void print_branch_misses(int cpu, 830 struct perf_evsel *evsel __maybe_unused, 831 double avg) 832 { 833 double total, ratio = 0.0; 834 const char *color; 835 836 total = avg_stats(&runtime_branches_stats[cpu]); 837 838 if (total) 839 ratio = avg / total * 100.0; 840 841 color = get_ratio_color(GRC_CACHE_MISSES, ratio); 842 843 fprintf(output, " # "); 844 color_fprintf(output, color, "%6.2f%%", ratio); 845 fprintf(output, " of all branches "); 846 } 847 848 static void print_l1_dcache_misses(int cpu, 849 struct perf_evsel *evsel __maybe_unused, 850 double avg) 851 { 852 double total, ratio = 0.0; 853 const char *color; 854 855 total = avg_stats(&runtime_l1_dcache_stats[cpu]); 856 857 if (total) 858 ratio = avg / total * 100.0; 859 860 color = get_ratio_color(GRC_CACHE_MISSES, ratio); 861 862 fprintf(output, " # "); 863 color_fprintf(output, color, "%6.2f%%", ratio); 864 fprintf(output, " of all L1-dcache hits "); 865 } 866 867 static void print_l1_icache_misses(int cpu, 868 struct perf_evsel *evsel __maybe_unused, 869 double avg) 870 { 871 double total, ratio = 0.0; 872 const char *color; 873 874 total = avg_stats(&runtime_l1_icache_stats[cpu]); 875 876 if (total) 877 ratio = avg / total * 100.0; 878 879 color = get_ratio_color(GRC_CACHE_MISSES, ratio); 880 881 fprintf(output, " # "); 882 color_fprintf(output, color, "%6.2f%%", ratio); 883 fprintf(output, " of all L1-icache hits "); 884 } 885 886 static void print_dtlb_cache_misses(int cpu, 887 struct perf_evsel *evsel __maybe_unused, 888 double avg) 889 { 890 double total, ratio = 0.0; 891 const char *color; 892 893 total = avg_stats(&runtime_dtlb_cache_stats[cpu]); 894 895 if (total) 896 ratio = avg / total * 100.0; 897 898 color = get_ratio_color(GRC_CACHE_MISSES, ratio); 899 900 fprintf(output, " # "); 901 color_fprintf(output, color, "%6.2f%%", ratio); 902 fprintf(output, " of all dTLB cache hits "); 903 } 904 905 static void print_itlb_cache_misses(int cpu, 906 struct perf_evsel *evsel __maybe_unused, 907 double avg) 908 { 909 double total, ratio = 0.0; 910 const char *color; 911 912 total = avg_stats(&runtime_itlb_cache_stats[cpu]); 913 914 if (total) 915 ratio = avg / total * 100.0; 916 917 color = get_ratio_color(GRC_CACHE_MISSES, ratio); 918 919 fprintf(output, " # "); 920 color_fprintf(output, color, "%6.2f%%", ratio); 921 fprintf(output, " of all iTLB cache hits "); 922 } 923 924 static void print_ll_cache_misses(int cpu, 925 struct perf_evsel *evsel __maybe_unused, 926 double avg) 927 { 928 double total, ratio = 0.0; 929 const char *color; 930 931 total = avg_stats(&runtime_ll_cache_stats[cpu]); 932 933 if (total) 934 ratio = avg / total * 100.0; 935 936 color = get_ratio_color(GRC_CACHE_MISSES, ratio); 937 938 fprintf(output, " # "); 939 color_fprintf(output, color, "%6.2f%%", ratio); 940 fprintf(output, " of all LL-cache hits "); 941 } 942 943 static void abs_printout(int cpu, int nr, struct perf_evsel *evsel, double avg) 944 { 945 double total, ratio = 0.0, total2; 946 double sc = evsel->scale; 947 const char *fmt; 948 949 if (csv_output) { 950 fmt = sc != 1.0 ? "%.2f%s" : "%.0f%s"; 951 } else { 952 if (big_num) 953 fmt = sc != 1.0 ? "%'18.2f%s" : "%'18.0f%s"; 954 else 955 fmt = sc != 1.0 ? "%18.2f%s" : "%18.0f%s"; 956 } 957 958 aggr_printout(evsel, cpu, nr); 959 960 if (aggr_mode == AGGR_GLOBAL) 961 cpu = 0; 962 963 fprintf(output, fmt, avg, csv_sep); 964 965 if (evsel->unit) 966 fprintf(output, "%-*s%s", 967 csv_output ? 0 : unit_width, 968 evsel->unit, csv_sep); 969 970 fprintf(output, "%-*s", csv_output ? 0 : 25, perf_evsel__name(evsel)); 971 972 if (evsel->cgrp) 973 fprintf(output, "%s%s", csv_sep, evsel->cgrp->name); 974 975 if (csv_output || interval) 976 return; 977 978 if (perf_evsel__match(evsel, HARDWARE, HW_INSTRUCTIONS)) { 979 total = avg_stats(&runtime_cycles_stats[cpu]); 980 if (total) { 981 ratio = avg / total; 982 fprintf(output, " # %5.2f insns per cycle ", ratio); 983 } 984 total = avg_stats(&runtime_stalled_cycles_front_stats[cpu]); 985 total = max(total, avg_stats(&runtime_stalled_cycles_back_stats[cpu])); 986 987 if (total && avg) { 988 ratio = total / avg; 989 fprintf(output, "\n"); 990 if (aggr_mode == AGGR_NONE) 991 fprintf(output, " "); 992 fprintf(output, " # %5.2f stalled cycles per insn", ratio); 993 } 994 995 } else if (perf_evsel__match(evsel, HARDWARE, HW_BRANCH_MISSES) && 996 runtime_branches_stats[cpu].n != 0) { 997 print_branch_misses(cpu, evsel, avg); 998 } else if ( 999 evsel->attr.type == PERF_TYPE_HW_CACHE && 1000 evsel->attr.config == ( PERF_COUNT_HW_CACHE_L1D | 1001 ((PERF_COUNT_HW_CACHE_OP_READ) << 8) | 1002 ((PERF_COUNT_HW_CACHE_RESULT_MISS) << 16)) && 1003 runtime_l1_dcache_stats[cpu].n != 0) { 1004 print_l1_dcache_misses(cpu, evsel, avg); 1005 } else if ( 1006 evsel->attr.type == PERF_TYPE_HW_CACHE && 1007 evsel->attr.config == ( PERF_COUNT_HW_CACHE_L1I | 1008 ((PERF_COUNT_HW_CACHE_OP_READ) << 8) | 1009 ((PERF_COUNT_HW_CACHE_RESULT_MISS) << 16)) && 1010 runtime_l1_icache_stats[cpu].n != 0) { 1011 print_l1_icache_misses(cpu, evsel, avg); 1012 } else if ( 1013 evsel->attr.type == PERF_TYPE_HW_CACHE && 1014 evsel->attr.config == ( PERF_COUNT_HW_CACHE_DTLB | 1015 ((PERF_COUNT_HW_CACHE_OP_READ) << 8) | 1016 ((PERF_COUNT_HW_CACHE_RESULT_MISS) << 16)) && 1017 runtime_dtlb_cache_stats[cpu].n != 0) { 1018 print_dtlb_cache_misses(cpu, evsel, avg); 1019 } else if ( 1020 evsel->attr.type == PERF_TYPE_HW_CACHE && 1021 evsel->attr.config == ( PERF_COUNT_HW_CACHE_ITLB | 1022 ((PERF_COUNT_HW_CACHE_OP_READ) << 8) | 1023 ((PERF_COUNT_HW_CACHE_RESULT_MISS) << 16)) && 1024 runtime_itlb_cache_stats[cpu].n != 0) { 1025 print_itlb_cache_misses(cpu, evsel, avg); 1026 } else if ( 1027 evsel->attr.type == PERF_TYPE_HW_CACHE && 1028 evsel->attr.config == ( PERF_COUNT_HW_CACHE_LL | 1029 ((PERF_COUNT_HW_CACHE_OP_READ) << 8) | 1030 ((PERF_COUNT_HW_CACHE_RESULT_MISS) << 16)) && 1031 runtime_ll_cache_stats[cpu].n != 0) { 1032 print_ll_cache_misses(cpu, evsel, avg); 1033 } else if (perf_evsel__match(evsel, HARDWARE, HW_CACHE_MISSES) && 1034 runtime_cacherefs_stats[cpu].n != 0) { 1035 total = avg_stats(&runtime_cacherefs_stats[cpu]); 1036 1037 if (total) 1038 ratio = avg * 100 / total; 1039 1040 fprintf(output, " # %8.3f %% of all cache refs ", ratio); 1041 1042 } else if (perf_evsel__match(evsel, HARDWARE, HW_STALLED_CYCLES_FRONTEND)) { 1043 print_stalled_cycles_frontend(cpu, evsel, avg); 1044 } else if (perf_evsel__match(evsel, HARDWARE, HW_STALLED_CYCLES_BACKEND)) { 1045 print_stalled_cycles_backend(cpu, evsel, avg); 1046 } else if (perf_evsel__match(evsel, HARDWARE, HW_CPU_CYCLES)) { 1047 total = avg_stats(&runtime_nsecs_stats[cpu]); 1048 1049 if (total) { 1050 ratio = avg / total; 1051 fprintf(output, " # %8.3f GHz ", ratio); 1052 } 1053 } else if (transaction_run && 1054 perf_evsel__cmp(evsel, nth_evsel(T_CYCLES_IN_TX))) { 1055 total = avg_stats(&runtime_cycles_stats[cpu]); 1056 if (total) 1057 fprintf(output, 1058 " # %5.2f%% transactional cycles ", 1059 100.0 * (avg / total)); 1060 } else if (transaction_run && 1061 perf_evsel__cmp(evsel, nth_evsel(T_CYCLES_IN_TX_CP))) { 1062 total = avg_stats(&runtime_cycles_stats[cpu]); 1063 total2 = avg_stats(&runtime_cycles_in_tx_stats[cpu]); 1064 if (total2 < avg) 1065 total2 = avg; 1066 if (total) 1067 fprintf(output, 1068 " # %5.2f%% aborted cycles ", 1069 100.0 * ((total2-avg) / total)); 1070 } else if (transaction_run && 1071 perf_evsel__cmp(evsel, nth_evsel(T_TRANSACTION_START)) && 1072 avg > 0 && 1073 runtime_cycles_in_tx_stats[cpu].n != 0) { 1074 total = avg_stats(&runtime_cycles_in_tx_stats[cpu]); 1075 1076 if (total) 1077 ratio = total / avg; 1078 1079 fprintf(output, " # %8.0f cycles / transaction ", ratio); 1080 } else if (transaction_run && 1081 perf_evsel__cmp(evsel, nth_evsel(T_ELISION_START)) && 1082 avg > 0 && 1083 runtime_cycles_in_tx_stats[cpu].n != 0) { 1084 total = avg_stats(&runtime_cycles_in_tx_stats[cpu]); 1085 1086 if (total) 1087 ratio = total / avg; 1088 1089 fprintf(output, " # %8.0f cycles / elision ", ratio); 1090 } else if (runtime_nsecs_stats[cpu].n != 0) { 1091 char unit = 'M'; 1092 1093 total = avg_stats(&runtime_nsecs_stats[cpu]); 1094 1095 if (total) 1096 ratio = 1000.0 * avg / total; 1097 if (ratio < 0.001) { 1098 ratio *= 1000; 1099 unit = 'K'; 1100 } 1101 1102 fprintf(output, " # %8.3f %c/sec ", ratio, unit); 1103 } else { 1104 fprintf(output, " "); 1105 } 1106 } 1107 1108 static void print_aggr(char *prefix) 1109 { 1110 struct perf_evsel *counter; 1111 int cpu, cpu2, s, s2, id, nr; 1112 double uval; 1113 u64 ena, run, val; 1114 1115 if (!(aggr_map || aggr_get_id)) 1116 return; 1117 1118 for (s = 0; s < aggr_map->nr; s++) { 1119 id = aggr_map->map[s]; 1120 evlist__for_each(evsel_list, counter) { 1121 val = ena = run = 0; 1122 nr = 0; 1123 for (cpu = 0; cpu < perf_evsel__nr_cpus(counter); cpu++) { 1124 cpu2 = perf_evsel__cpus(counter)->map[cpu]; 1125 s2 = aggr_get_id(evsel_list->cpus, cpu2); 1126 if (s2 != id) 1127 continue; 1128 val += counter->counts->cpu[cpu].val; 1129 ena += counter->counts->cpu[cpu].ena; 1130 run += counter->counts->cpu[cpu].run; 1131 nr++; 1132 } 1133 if (prefix) 1134 fprintf(output, "%s", prefix); 1135 1136 if (run == 0 || ena == 0) { 1137 aggr_printout(counter, id, nr); 1138 1139 fprintf(output, "%*s%s", 1140 csv_output ? 0 : 18, 1141 counter->supported ? CNTR_NOT_COUNTED : CNTR_NOT_SUPPORTED, 1142 csv_sep); 1143 1144 fprintf(output, "%-*s%s", 1145 csv_output ? 0 : unit_width, 1146 counter->unit, csv_sep); 1147 1148 fprintf(output, "%*s", 1149 csv_output ? 0 : -25, 1150 perf_evsel__name(counter)); 1151 1152 if (counter->cgrp) 1153 fprintf(output, "%s%s", 1154 csv_sep, counter->cgrp->name); 1155 1156 fputc('\n', output); 1157 continue; 1158 } 1159 uval = val * counter->scale; 1160 1161 if (nsec_counter(counter)) 1162 nsec_printout(id, nr, counter, uval); 1163 else 1164 abs_printout(id, nr, counter, uval); 1165 1166 if (!csv_output) { 1167 print_noise(counter, 1.0); 1168 1169 if (run != ena) 1170 fprintf(output, " (%.2f%%)", 1171 100.0 * run / ena); 1172 } 1173 fputc('\n', output); 1174 } 1175 } 1176 } 1177 1178 /* 1179 * Print out the results of a single counter: 1180 * aggregated counts in system-wide mode 1181 */ 1182 static void print_counter_aggr(struct perf_evsel *counter, char *prefix) 1183 { 1184 struct perf_stat *ps = counter->priv; 1185 double avg = avg_stats(&ps->res_stats[0]); 1186 int scaled = counter->counts->scaled; 1187 double uval; 1188 1189 if (prefix) 1190 fprintf(output, "%s", prefix); 1191 1192 if (scaled == -1) { 1193 fprintf(output, "%*s%s", 1194 csv_output ? 0 : 18, 1195 counter->supported ? CNTR_NOT_COUNTED : CNTR_NOT_SUPPORTED, 1196 csv_sep); 1197 fprintf(output, "%-*s%s", 1198 csv_output ? 0 : unit_width, 1199 counter->unit, csv_sep); 1200 fprintf(output, "%*s", 1201 csv_output ? 0 : -25, 1202 perf_evsel__name(counter)); 1203 1204 if (counter->cgrp) 1205 fprintf(output, "%s%s", csv_sep, counter->cgrp->name); 1206 1207 fputc('\n', output); 1208 return; 1209 } 1210 1211 uval = avg * counter->scale; 1212 1213 if (nsec_counter(counter)) 1214 nsec_printout(-1, 0, counter, uval); 1215 else 1216 abs_printout(-1, 0, counter, uval); 1217 1218 print_noise(counter, avg); 1219 1220 if (csv_output) { 1221 fputc('\n', output); 1222 return; 1223 } 1224 1225 if (scaled) { 1226 double avg_enabled, avg_running; 1227 1228 avg_enabled = avg_stats(&ps->res_stats[1]); 1229 avg_running = avg_stats(&ps->res_stats[2]); 1230 1231 fprintf(output, " [%5.2f%%]", 100 * avg_running / avg_enabled); 1232 } 1233 fprintf(output, "\n"); 1234 } 1235 1236 /* 1237 * Print out the results of a single counter: 1238 * does not use aggregated count in system-wide 1239 */ 1240 static void print_counter(struct perf_evsel *counter, char *prefix) 1241 { 1242 u64 ena, run, val; 1243 double uval; 1244 int cpu; 1245 1246 for (cpu = 0; cpu < perf_evsel__nr_cpus(counter); cpu++) { 1247 val = counter->counts->cpu[cpu].val; 1248 ena = counter->counts->cpu[cpu].ena; 1249 run = counter->counts->cpu[cpu].run; 1250 1251 if (prefix) 1252 fprintf(output, "%s", prefix); 1253 1254 if (run == 0 || ena == 0) { 1255 fprintf(output, "CPU%*d%s%*s%s", 1256 csv_output ? 0 : -4, 1257 perf_evsel__cpus(counter)->map[cpu], csv_sep, 1258 csv_output ? 0 : 18, 1259 counter->supported ? CNTR_NOT_COUNTED : CNTR_NOT_SUPPORTED, 1260 csv_sep); 1261 1262 fprintf(output, "%-*s%s", 1263 csv_output ? 0 : unit_width, 1264 counter->unit, csv_sep); 1265 1266 fprintf(output, "%*s", 1267 csv_output ? 0 : -25, 1268 perf_evsel__name(counter)); 1269 1270 if (counter->cgrp) 1271 fprintf(output, "%s%s", 1272 csv_sep, counter->cgrp->name); 1273 1274 fputc('\n', output); 1275 continue; 1276 } 1277 1278 uval = val * counter->scale; 1279 1280 if (nsec_counter(counter)) 1281 nsec_printout(cpu, 0, counter, uval); 1282 else 1283 abs_printout(cpu, 0, counter, uval); 1284 1285 if (!csv_output) { 1286 print_noise(counter, 1.0); 1287 1288 if (run != ena) 1289 fprintf(output, " (%.2f%%)", 1290 100.0 * run / ena); 1291 } 1292 fputc('\n', output); 1293 } 1294 } 1295 1296 static void print_stat(int argc, const char **argv) 1297 { 1298 struct perf_evsel *counter; 1299 int i; 1300 1301 fflush(stdout); 1302 1303 if (!csv_output) { 1304 fprintf(output, "\n"); 1305 fprintf(output, " Performance counter stats for "); 1306 if (target.system_wide) 1307 fprintf(output, "\'system wide"); 1308 else if (target.cpu_list) 1309 fprintf(output, "\'CPU(s) %s", target.cpu_list); 1310 else if (!target__has_task(&target)) { 1311 fprintf(output, "\'%s", argv[0]); 1312 for (i = 1; i < argc; i++) 1313 fprintf(output, " %s", argv[i]); 1314 } else if (target.pid) 1315 fprintf(output, "process id \'%s", target.pid); 1316 else 1317 fprintf(output, "thread id \'%s", target.tid); 1318 1319 fprintf(output, "\'"); 1320 if (run_count > 1) 1321 fprintf(output, " (%d runs)", run_count); 1322 fprintf(output, ":\n\n"); 1323 } 1324 1325 switch (aggr_mode) { 1326 case AGGR_CORE: 1327 case AGGR_SOCKET: 1328 print_aggr(NULL); 1329 break; 1330 case AGGR_GLOBAL: 1331 evlist__for_each(evsel_list, counter) 1332 print_counter_aggr(counter, NULL); 1333 break; 1334 case AGGR_NONE: 1335 evlist__for_each(evsel_list, counter) 1336 print_counter(counter, NULL); 1337 break; 1338 default: 1339 break; 1340 } 1341 1342 if (!csv_output) { 1343 if (!null_run) 1344 fprintf(output, "\n"); 1345 fprintf(output, " %17.9f seconds time elapsed", 1346 avg_stats(&walltime_nsecs_stats)/1e9); 1347 if (run_count > 1) { 1348 fprintf(output, " "); 1349 print_noise_pct(stddev_stats(&walltime_nsecs_stats), 1350 avg_stats(&walltime_nsecs_stats)); 1351 } 1352 fprintf(output, "\n\n"); 1353 } 1354 } 1355 1356 static volatile int signr = -1; 1357 1358 static void skip_signal(int signo) 1359 { 1360 if ((child_pid == -1) || interval) 1361 done = 1; 1362 1363 signr = signo; 1364 /* 1365 * render child_pid harmless 1366 * won't send SIGTERM to a random 1367 * process in case of race condition 1368 * and fast PID recycling 1369 */ 1370 child_pid = -1; 1371 } 1372 1373 static void sig_atexit(void) 1374 { 1375 sigset_t set, oset; 1376 1377 /* 1378 * avoid race condition with SIGCHLD handler 1379 * in skip_signal() which is modifying child_pid 1380 * goal is to avoid send SIGTERM to a random 1381 * process 1382 */ 1383 sigemptyset(&set); 1384 sigaddset(&set, SIGCHLD); 1385 sigprocmask(SIG_BLOCK, &set, &oset); 1386 1387 if (child_pid != -1) 1388 kill(child_pid, SIGTERM); 1389 1390 sigprocmask(SIG_SETMASK, &oset, NULL); 1391 1392 if (signr == -1) 1393 return; 1394 1395 signal(signr, SIG_DFL); 1396 kill(getpid(), signr); 1397 } 1398 1399 static int stat__set_big_num(const struct option *opt __maybe_unused, 1400 const char *s __maybe_unused, int unset) 1401 { 1402 big_num_opt = unset ? 0 : 1; 1403 return 0; 1404 } 1405 1406 static int perf_stat_init_aggr_mode(void) 1407 { 1408 switch (aggr_mode) { 1409 case AGGR_SOCKET: 1410 if (cpu_map__build_socket_map(evsel_list->cpus, &aggr_map)) { 1411 perror("cannot build socket map"); 1412 return -1; 1413 } 1414 aggr_get_id = cpu_map__get_socket; 1415 break; 1416 case AGGR_CORE: 1417 if (cpu_map__build_core_map(evsel_list->cpus, &aggr_map)) { 1418 perror("cannot build core map"); 1419 return -1; 1420 } 1421 aggr_get_id = cpu_map__get_core; 1422 break; 1423 case AGGR_NONE: 1424 case AGGR_GLOBAL: 1425 default: 1426 break; 1427 } 1428 return 0; 1429 } 1430 1431 static int setup_events(const char * const *attrs, unsigned len) 1432 { 1433 unsigned i; 1434 1435 for (i = 0; i < len; i++) { 1436 if (parse_events(evsel_list, attrs[i])) 1437 return -1; 1438 } 1439 return 0; 1440 } 1441 1442 /* 1443 * Add default attributes, if there were no attributes specified or 1444 * if -d/--detailed, -d -d or -d -d -d is used: 1445 */ 1446 static int add_default_attributes(void) 1447 { 1448 struct perf_event_attr default_attrs[] = { 1449 1450 { .type = PERF_TYPE_SOFTWARE, .config = PERF_COUNT_SW_TASK_CLOCK }, 1451 { .type = PERF_TYPE_SOFTWARE, .config = PERF_COUNT_SW_CONTEXT_SWITCHES }, 1452 { .type = PERF_TYPE_SOFTWARE, .config = PERF_COUNT_SW_CPU_MIGRATIONS }, 1453 { .type = PERF_TYPE_SOFTWARE, .config = PERF_COUNT_SW_PAGE_FAULTS }, 1454 1455 { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_CPU_CYCLES }, 1456 { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_STALLED_CYCLES_FRONTEND }, 1457 { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_STALLED_CYCLES_BACKEND }, 1458 { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_INSTRUCTIONS }, 1459 { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_BRANCH_INSTRUCTIONS }, 1460 { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_BRANCH_MISSES }, 1461 1462 }; 1463 1464 /* 1465 * Detailed stats (-d), covering the L1 and last level data caches: 1466 */ 1467 struct perf_event_attr detailed_attrs[] = { 1468 1469 { .type = PERF_TYPE_HW_CACHE, 1470 .config = 1471 PERF_COUNT_HW_CACHE_L1D << 0 | 1472 (PERF_COUNT_HW_CACHE_OP_READ << 8) | 1473 (PERF_COUNT_HW_CACHE_RESULT_ACCESS << 16) }, 1474 1475 { .type = PERF_TYPE_HW_CACHE, 1476 .config = 1477 PERF_COUNT_HW_CACHE_L1D << 0 | 1478 (PERF_COUNT_HW_CACHE_OP_READ << 8) | 1479 (PERF_COUNT_HW_CACHE_RESULT_MISS << 16) }, 1480 1481 { .type = PERF_TYPE_HW_CACHE, 1482 .config = 1483 PERF_COUNT_HW_CACHE_LL << 0 | 1484 (PERF_COUNT_HW_CACHE_OP_READ << 8) | 1485 (PERF_COUNT_HW_CACHE_RESULT_ACCESS << 16) }, 1486 1487 { .type = PERF_TYPE_HW_CACHE, 1488 .config = 1489 PERF_COUNT_HW_CACHE_LL << 0 | 1490 (PERF_COUNT_HW_CACHE_OP_READ << 8) | 1491 (PERF_COUNT_HW_CACHE_RESULT_MISS << 16) }, 1492 }; 1493 1494 /* 1495 * Very detailed stats (-d -d), covering the instruction cache and the TLB caches: 1496 */ 1497 struct perf_event_attr very_detailed_attrs[] = { 1498 1499 { .type = PERF_TYPE_HW_CACHE, 1500 .config = 1501 PERF_COUNT_HW_CACHE_L1I << 0 | 1502 (PERF_COUNT_HW_CACHE_OP_READ << 8) | 1503 (PERF_COUNT_HW_CACHE_RESULT_ACCESS << 16) }, 1504 1505 { .type = PERF_TYPE_HW_CACHE, 1506 .config = 1507 PERF_COUNT_HW_CACHE_L1I << 0 | 1508 (PERF_COUNT_HW_CACHE_OP_READ << 8) | 1509 (PERF_COUNT_HW_CACHE_RESULT_MISS << 16) }, 1510 1511 { .type = PERF_TYPE_HW_CACHE, 1512 .config = 1513 PERF_COUNT_HW_CACHE_DTLB << 0 | 1514 (PERF_COUNT_HW_CACHE_OP_READ << 8) | 1515 (PERF_COUNT_HW_CACHE_RESULT_ACCESS << 16) }, 1516 1517 { .type = PERF_TYPE_HW_CACHE, 1518 .config = 1519 PERF_COUNT_HW_CACHE_DTLB << 0 | 1520 (PERF_COUNT_HW_CACHE_OP_READ << 8) | 1521 (PERF_COUNT_HW_CACHE_RESULT_MISS << 16) }, 1522 1523 { .type = PERF_TYPE_HW_CACHE, 1524 .config = 1525 PERF_COUNT_HW_CACHE_ITLB << 0 | 1526 (PERF_COUNT_HW_CACHE_OP_READ << 8) | 1527 (PERF_COUNT_HW_CACHE_RESULT_ACCESS << 16) }, 1528 1529 { .type = PERF_TYPE_HW_CACHE, 1530 .config = 1531 PERF_COUNT_HW_CACHE_ITLB << 0 | 1532 (PERF_COUNT_HW_CACHE_OP_READ << 8) | 1533 (PERF_COUNT_HW_CACHE_RESULT_MISS << 16) }, 1534 1535 }; 1536 1537 /* 1538 * Very, very detailed stats (-d -d -d), adding prefetch events: 1539 */ 1540 struct perf_event_attr very_very_detailed_attrs[] = { 1541 1542 { .type = PERF_TYPE_HW_CACHE, 1543 .config = 1544 PERF_COUNT_HW_CACHE_L1D << 0 | 1545 (PERF_COUNT_HW_CACHE_OP_PREFETCH << 8) | 1546 (PERF_COUNT_HW_CACHE_RESULT_ACCESS << 16) }, 1547 1548 { .type = PERF_TYPE_HW_CACHE, 1549 .config = 1550 PERF_COUNT_HW_CACHE_L1D << 0 | 1551 (PERF_COUNT_HW_CACHE_OP_PREFETCH << 8) | 1552 (PERF_COUNT_HW_CACHE_RESULT_MISS << 16) }, 1553 }; 1554 1555 /* Set attrs if no event is selected and !null_run: */ 1556 if (null_run) 1557 return 0; 1558 1559 if (transaction_run) { 1560 int err; 1561 if (pmu_have_event("cpu", "cycles-ct") && 1562 pmu_have_event("cpu", "el-start")) 1563 err = setup_events(transaction_attrs, 1564 ARRAY_SIZE(transaction_attrs)); 1565 else 1566 err = setup_events(transaction_limited_attrs, 1567 ARRAY_SIZE(transaction_limited_attrs)); 1568 if (err < 0) { 1569 fprintf(stderr, "Cannot set up transaction events\n"); 1570 return -1; 1571 } 1572 return 0; 1573 } 1574 1575 if (!evsel_list->nr_entries) { 1576 if (perf_evlist__add_default_attrs(evsel_list, default_attrs) < 0) 1577 return -1; 1578 } 1579 1580 /* Detailed events get appended to the event list: */ 1581 1582 if (detailed_run < 1) 1583 return 0; 1584 1585 /* Append detailed run extra attributes: */ 1586 if (perf_evlist__add_default_attrs(evsel_list, detailed_attrs) < 0) 1587 return -1; 1588 1589 if (detailed_run < 2) 1590 return 0; 1591 1592 /* Append very detailed run extra attributes: */ 1593 if (perf_evlist__add_default_attrs(evsel_list, very_detailed_attrs) < 0) 1594 return -1; 1595 1596 if (detailed_run < 3) 1597 return 0; 1598 1599 /* Append very, very detailed run extra attributes: */ 1600 return perf_evlist__add_default_attrs(evsel_list, very_very_detailed_attrs); 1601 } 1602 1603 int cmd_stat(int argc, const char **argv, const char *prefix __maybe_unused) 1604 { 1605 bool append_file = false; 1606 int output_fd = 0; 1607 const char *output_name = NULL; 1608 const struct option options[] = { 1609 OPT_BOOLEAN('T', "transaction", &transaction_run, 1610 "hardware transaction statistics"), 1611 OPT_CALLBACK('e', "event", &evsel_list, "event", 1612 "event selector. use 'perf list' to list available events", 1613 parse_events_option), 1614 OPT_CALLBACK(0, "filter", &evsel_list, "filter", 1615 "event filter", parse_filter), 1616 OPT_BOOLEAN('i', "no-inherit", &no_inherit, 1617 "child tasks do not inherit counters"), 1618 OPT_STRING('p', "pid", &target.pid, "pid", 1619 "stat events on existing process id"), 1620 OPT_STRING('t', "tid", &target.tid, "tid", 1621 "stat events on existing thread id"), 1622 OPT_BOOLEAN('a', "all-cpus", &target.system_wide, 1623 "system-wide collection from all CPUs"), 1624 OPT_BOOLEAN('g', "group", &group, 1625 "put the counters into a counter group"), 1626 OPT_BOOLEAN('c', "scale", &scale, "scale/normalize counters"), 1627 OPT_INCR('v', "verbose", &verbose, 1628 "be more verbose (show counter open errors, etc)"), 1629 OPT_INTEGER('r', "repeat", &run_count, 1630 "repeat command and print average + stddev (max: 100, forever: 0)"), 1631 OPT_BOOLEAN('n', "null", &null_run, 1632 "null run - dont start any counters"), 1633 OPT_INCR('d', "detailed", &detailed_run, 1634 "detailed run - start a lot of events"), 1635 OPT_BOOLEAN('S', "sync", &sync_run, 1636 "call sync() before starting a run"), 1637 OPT_CALLBACK_NOOPT('B', "big-num", NULL, NULL, 1638 "print large numbers with thousands\' separators", 1639 stat__set_big_num), 1640 OPT_STRING('C', "cpu", &target.cpu_list, "cpu", 1641 "list of cpus to monitor in system-wide"), 1642 OPT_SET_UINT('A', "no-aggr", &aggr_mode, 1643 "disable CPU count aggregation", AGGR_NONE), 1644 OPT_STRING('x', "field-separator", &csv_sep, "separator", 1645 "print counts with custom separator"), 1646 OPT_CALLBACK('G', "cgroup", &evsel_list, "name", 1647 "monitor event in cgroup name only", parse_cgroups), 1648 OPT_STRING('o', "output", &output_name, "file", "output file name"), 1649 OPT_BOOLEAN(0, "append", &append_file, "append to the output file"), 1650 OPT_INTEGER(0, "log-fd", &output_fd, 1651 "log output to fd, instead of stderr"), 1652 OPT_STRING(0, "pre", &pre_cmd, "command", 1653 "command to run prior to the measured command"), 1654 OPT_STRING(0, "post", &post_cmd, "command", 1655 "command to run after to the measured command"), 1656 OPT_UINTEGER('I', "interval-print", &interval, 1657 "print counts at regular interval in ms (>= 100)"), 1658 OPT_SET_UINT(0, "per-socket", &aggr_mode, 1659 "aggregate counts per processor socket", AGGR_SOCKET), 1660 OPT_SET_UINT(0, "per-core", &aggr_mode, 1661 "aggregate counts per physical processor core", AGGR_CORE), 1662 OPT_UINTEGER('D', "delay", &initial_delay, 1663 "ms to wait before starting measurement after program start"), 1664 OPT_END() 1665 }; 1666 const char * const stat_usage[] = { 1667 "perf stat [<options>] [<command>]", 1668 NULL 1669 }; 1670 int status = -EINVAL, run_idx; 1671 const char *mode; 1672 1673 setlocale(LC_ALL, ""); 1674 1675 evsel_list = perf_evlist__new(); 1676 if (evsel_list == NULL) 1677 return -ENOMEM; 1678 1679 argc = parse_options(argc, argv, options, stat_usage, 1680 PARSE_OPT_STOP_AT_NON_OPTION); 1681 1682 output = stderr; 1683 if (output_name && strcmp(output_name, "-")) 1684 output = NULL; 1685 1686 if (output_name && output_fd) { 1687 fprintf(stderr, "cannot use both --output and --log-fd\n"); 1688 parse_options_usage(stat_usage, options, "o", 1); 1689 parse_options_usage(NULL, options, "log-fd", 0); 1690 goto out; 1691 } 1692 1693 if (output_fd < 0) { 1694 fprintf(stderr, "argument to --log-fd must be a > 0\n"); 1695 parse_options_usage(stat_usage, options, "log-fd", 0); 1696 goto out; 1697 } 1698 1699 if (!output) { 1700 struct timespec tm; 1701 mode = append_file ? "a" : "w"; 1702 1703 output = fopen(output_name, mode); 1704 if (!output) { 1705 perror("failed to create output file"); 1706 return -1; 1707 } 1708 clock_gettime(CLOCK_REALTIME, &tm); 1709 fprintf(output, "# started on %s\n", ctime(&tm.tv_sec)); 1710 } else if (output_fd > 0) { 1711 mode = append_file ? "a" : "w"; 1712 output = fdopen(output_fd, mode); 1713 if (!output) { 1714 perror("Failed opening logfd"); 1715 return -errno; 1716 } 1717 } 1718 1719 if (csv_sep) { 1720 csv_output = true; 1721 if (!strcmp(csv_sep, "\\t")) 1722 csv_sep = "\t"; 1723 } else 1724 csv_sep = DEFAULT_SEPARATOR; 1725 1726 /* 1727 * let the spreadsheet do the pretty-printing 1728 */ 1729 if (csv_output) { 1730 /* User explicitly passed -B? */ 1731 if (big_num_opt == 1) { 1732 fprintf(stderr, "-B option not supported with -x\n"); 1733 parse_options_usage(stat_usage, options, "B", 1); 1734 parse_options_usage(NULL, options, "x", 1); 1735 goto out; 1736 } else /* Nope, so disable big number formatting */ 1737 big_num = false; 1738 } else if (big_num_opt == 0) /* User passed --no-big-num */ 1739 big_num = false; 1740 1741 if (!argc && target__none(&target)) 1742 usage_with_options(stat_usage, options); 1743 1744 if (run_count < 0) { 1745 pr_err("Run count must be a positive number\n"); 1746 parse_options_usage(stat_usage, options, "r", 1); 1747 goto out; 1748 } else if (run_count == 0) { 1749 forever = true; 1750 run_count = 1; 1751 } 1752 1753 /* no_aggr, cgroup are for system-wide only */ 1754 if ((aggr_mode != AGGR_GLOBAL || nr_cgroups) && 1755 !target__has_cpu(&target)) { 1756 fprintf(stderr, "both cgroup and no-aggregation " 1757 "modes only available in system-wide mode\n"); 1758 1759 parse_options_usage(stat_usage, options, "G", 1); 1760 parse_options_usage(NULL, options, "A", 1); 1761 parse_options_usage(NULL, options, "a", 1); 1762 goto out; 1763 } 1764 1765 if (add_default_attributes()) 1766 goto out; 1767 1768 target__validate(&target); 1769 1770 if (perf_evlist__create_maps(evsel_list, &target) < 0) { 1771 if (target__has_task(&target)) { 1772 pr_err("Problems finding threads of monitor\n"); 1773 parse_options_usage(stat_usage, options, "p", 1); 1774 parse_options_usage(NULL, options, "t", 1); 1775 } else if (target__has_cpu(&target)) { 1776 perror("failed to parse CPUs map"); 1777 parse_options_usage(stat_usage, options, "C", 1); 1778 parse_options_usage(NULL, options, "a", 1); 1779 } 1780 goto out; 1781 } 1782 if (interval && interval < 100) { 1783 pr_err("print interval must be >= 100ms\n"); 1784 parse_options_usage(stat_usage, options, "I", 1); 1785 goto out; 1786 } 1787 1788 if (perf_evlist__alloc_stats(evsel_list, interval)) 1789 goto out; 1790 1791 if (perf_stat_init_aggr_mode()) 1792 goto out; 1793 1794 /* 1795 * We dont want to block the signals - that would cause 1796 * child tasks to inherit that and Ctrl-C would not work. 1797 * What we want is for Ctrl-C to work in the exec()-ed 1798 * task, but being ignored by perf stat itself: 1799 */ 1800 atexit(sig_atexit); 1801 if (!forever) 1802 signal(SIGINT, skip_signal); 1803 signal(SIGCHLD, skip_signal); 1804 signal(SIGALRM, skip_signal); 1805 signal(SIGABRT, skip_signal); 1806 1807 status = 0; 1808 for (run_idx = 0; forever || run_idx < run_count; run_idx++) { 1809 if (run_count != 1 && verbose) 1810 fprintf(output, "[ perf stat: executing run #%d ... ]\n", 1811 run_idx + 1); 1812 1813 status = run_perf_stat(argc, argv); 1814 if (forever && status != -1) { 1815 print_stat(argc, argv); 1816 perf_stat__reset_stats(evsel_list); 1817 } 1818 } 1819 1820 if (!forever && status != -1 && !interval) 1821 print_stat(argc, argv); 1822 1823 perf_evlist__free_stats(evsel_list); 1824 out: 1825 perf_evlist__delete(evsel_list); 1826 return status; 1827 } 1828