1 /* 2 * builtin-stat.c 3 * 4 * Builtin stat command: Give a precise performance counters summary 5 * overview about any workload, CPU or specific PID. 6 * 7 * Sample output: 8 9 $ perf stat ./hackbench 10 10 11 Time: 0.118 12 13 Performance counter stats for './hackbench 10': 14 15 1708.761321 task-clock # 11.037 CPUs utilized 16 41,190 context-switches # 0.024 M/sec 17 6,735 CPU-migrations # 0.004 M/sec 18 17,318 page-faults # 0.010 M/sec 19 5,205,202,243 cycles # 3.046 GHz 20 3,856,436,920 stalled-cycles-frontend # 74.09% frontend cycles idle 21 1,600,790,871 stalled-cycles-backend # 30.75% backend cycles idle 22 2,603,501,247 instructions # 0.50 insns per cycle 23 # 1.48 stalled cycles per insn 24 484,357,498 branches # 283.455 M/sec 25 6,388,934 branch-misses # 1.32% of all branches 26 27 0.154822978 seconds time elapsed 28 29 * 30 * Copyright (C) 2008-2011, Red Hat Inc, Ingo Molnar <mingo@redhat.com> 31 * 32 * Improvements and fixes by: 33 * 34 * Arjan van de Ven <arjan@linux.intel.com> 35 * Yanmin Zhang <yanmin.zhang@intel.com> 36 * Wu Fengguang <fengguang.wu@intel.com> 37 * Mike Galbraith <efault@gmx.de> 38 * Paul Mackerras <paulus@samba.org> 39 * Jaswinder Singh Rajput <jaswinder@kernel.org> 40 * 41 * Released under the GPL v2. 
(and only v2, not any later version) 42 */ 43 44 #include "perf.h" 45 #include "builtin.h" 46 #include "util/cgroup.h" 47 #include "util/util.h" 48 #include "util/parse-options.h" 49 #include "util/parse-events.h" 50 #include "util/pmu.h" 51 #include "util/event.h" 52 #include "util/evlist.h" 53 #include "util/evsel.h" 54 #include "util/debug.h" 55 #include "util/color.h" 56 #include "util/stat.h" 57 #include "util/header.h" 58 #include "util/cpumap.h" 59 #include "util/thread.h" 60 #include "util/thread_map.h" 61 62 #include <stdlib.h> 63 #include <sys/prctl.h> 64 #include <locale.h> 65 66 #define DEFAULT_SEPARATOR " " 67 #define CNTR_NOT_SUPPORTED "<not supported>" 68 #define CNTR_NOT_COUNTED "<not counted>" 69 70 static void print_stat(int argc, const char **argv); 71 static void print_counter_aggr(struct perf_evsel *counter, char *prefix); 72 static void print_counter(struct perf_evsel *counter, char *prefix); 73 static void print_aggr(char *prefix); 74 75 /* Default events used for perf stat -T */ 76 static const char * const transaction_attrs[] = { 77 "task-clock", 78 "{" 79 "instructions," 80 "cycles," 81 "cpu/cycles-t/," 82 "cpu/tx-start/," 83 "cpu/el-start/," 84 "cpu/cycles-ct/" 85 "}" 86 }; 87 88 /* More limited version when the CPU does not have all events. 
*/ 89 static const char * const transaction_limited_attrs[] = { 90 "task-clock", 91 "{" 92 "instructions," 93 "cycles," 94 "cpu/cycles-t/," 95 "cpu/tx-start/" 96 "}" 97 }; 98 99 /* must match transaction_attrs and the beginning limited_attrs */ 100 enum { 101 T_TASK_CLOCK, 102 T_INSTRUCTIONS, 103 T_CYCLES, 104 T_CYCLES_IN_TX, 105 T_TRANSACTION_START, 106 T_ELISION_START, 107 T_CYCLES_IN_TX_CP, 108 }; 109 110 static struct perf_evlist *evsel_list; 111 112 static struct target target = { 113 .uid = UINT_MAX, 114 }; 115 116 enum aggr_mode { 117 AGGR_NONE, 118 AGGR_GLOBAL, 119 AGGR_SOCKET, 120 AGGR_CORE, 121 }; 122 123 static int run_count = 1; 124 static bool no_inherit = false; 125 static bool scale = true; 126 static enum aggr_mode aggr_mode = AGGR_GLOBAL; 127 static volatile pid_t child_pid = -1; 128 static bool null_run = false; 129 static int detailed_run = 0; 130 static bool transaction_run; 131 static bool big_num = true; 132 static int big_num_opt = -1; 133 static const char *csv_sep = NULL; 134 static bool csv_output = false; 135 static bool group = false; 136 static FILE *output = NULL; 137 static const char *pre_cmd = NULL; 138 static const char *post_cmd = NULL; 139 static bool sync_run = false; 140 static unsigned int interval = 0; 141 static unsigned int initial_delay = 0; 142 static unsigned int unit_width = 4; /* strlen("unit") */ 143 static bool forever = false; 144 static struct timespec ref_time; 145 static struct cpu_map *aggr_map; 146 static int (*aggr_get_id)(struct cpu_map *m, int cpu); 147 148 static volatile int done = 0; 149 150 struct perf_stat { 151 struct stats res_stats[3]; 152 }; 153 154 static inline void diff_timespec(struct timespec *r, struct timespec *a, 155 struct timespec *b) 156 { 157 r->tv_sec = a->tv_sec - b->tv_sec; 158 if (a->tv_nsec < b->tv_nsec) { 159 r->tv_nsec = a->tv_nsec + 1000000000L - b->tv_nsec; 160 r->tv_sec--; 161 } else { 162 r->tv_nsec = a->tv_nsec - b->tv_nsec ; 163 } 164 } 165 166 static inline struct cpu_map 
*perf_evsel__cpus(struct perf_evsel *evsel) 167 { 168 return (evsel->cpus && !target.cpu_list) ? evsel->cpus : evsel_list->cpus; 169 } 170 171 static inline int perf_evsel__nr_cpus(struct perf_evsel *evsel) 172 { 173 return perf_evsel__cpus(evsel)->nr; 174 } 175 176 static void perf_evsel__reset_stat_priv(struct perf_evsel *evsel) 177 { 178 int i; 179 struct perf_stat *ps = evsel->priv; 180 181 for (i = 0; i < 3; i++) 182 init_stats(&ps->res_stats[i]); 183 } 184 185 static int perf_evsel__alloc_stat_priv(struct perf_evsel *evsel) 186 { 187 evsel->priv = zalloc(sizeof(struct perf_stat)); 188 if (evsel->priv == NULL) 189 return -ENOMEM; 190 perf_evsel__reset_stat_priv(evsel); 191 return 0; 192 } 193 194 static void perf_evsel__free_stat_priv(struct perf_evsel *evsel) 195 { 196 zfree(&evsel->priv); 197 } 198 199 static int perf_evsel__alloc_prev_raw_counts(struct perf_evsel *evsel) 200 { 201 void *addr; 202 size_t sz; 203 204 sz = sizeof(*evsel->counts) + 205 (perf_evsel__nr_cpus(evsel) * sizeof(struct perf_counts_values)); 206 207 addr = zalloc(sz); 208 if (!addr) 209 return -ENOMEM; 210 211 evsel->prev_raw_counts = addr; 212 213 return 0; 214 } 215 216 static void perf_evsel__free_prev_raw_counts(struct perf_evsel *evsel) 217 { 218 zfree(&evsel->prev_raw_counts); 219 } 220 221 static void perf_evlist__free_stats(struct perf_evlist *evlist) 222 { 223 struct perf_evsel *evsel; 224 225 evlist__for_each(evlist, evsel) { 226 perf_evsel__free_stat_priv(evsel); 227 perf_evsel__free_counts(evsel); 228 perf_evsel__free_prev_raw_counts(evsel); 229 } 230 } 231 232 static int perf_evlist__alloc_stats(struct perf_evlist *evlist, bool alloc_raw) 233 { 234 struct perf_evsel *evsel; 235 236 evlist__for_each(evlist, evsel) { 237 if (perf_evsel__alloc_stat_priv(evsel) < 0 || 238 perf_evsel__alloc_counts(evsel, perf_evsel__nr_cpus(evsel)) < 0 || 239 (alloc_raw && perf_evsel__alloc_prev_raw_counts(evsel) < 0)) 240 goto out_free; 241 } 242 243 return 0; 244 245 out_free: 246 
perf_evlist__free_stats(evlist); 247 return -1; 248 } 249 250 static struct stats runtime_nsecs_stats[MAX_NR_CPUS]; 251 static struct stats runtime_cycles_stats[MAX_NR_CPUS]; 252 static struct stats runtime_stalled_cycles_front_stats[MAX_NR_CPUS]; 253 static struct stats runtime_stalled_cycles_back_stats[MAX_NR_CPUS]; 254 static struct stats runtime_branches_stats[MAX_NR_CPUS]; 255 static struct stats runtime_cacherefs_stats[MAX_NR_CPUS]; 256 static struct stats runtime_l1_dcache_stats[MAX_NR_CPUS]; 257 static struct stats runtime_l1_icache_stats[MAX_NR_CPUS]; 258 static struct stats runtime_ll_cache_stats[MAX_NR_CPUS]; 259 static struct stats runtime_itlb_cache_stats[MAX_NR_CPUS]; 260 static struct stats runtime_dtlb_cache_stats[MAX_NR_CPUS]; 261 static struct stats runtime_cycles_in_tx_stats[MAX_NR_CPUS]; 262 static struct stats walltime_nsecs_stats; 263 static struct stats runtime_transaction_stats[MAX_NR_CPUS]; 264 static struct stats runtime_elision_stats[MAX_NR_CPUS]; 265 266 static void perf_stat__reset_stats(struct perf_evlist *evlist) 267 { 268 struct perf_evsel *evsel; 269 270 evlist__for_each(evlist, evsel) { 271 perf_evsel__reset_stat_priv(evsel); 272 perf_evsel__reset_counts(evsel, perf_evsel__nr_cpus(evsel)); 273 } 274 275 memset(runtime_nsecs_stats, 0, sizeof(runtime_nsecs_stats)); 276 memset(runtime_cycles_stats, 0, sizeof(runtime_cycles_stats)); 277 memset(runtime_stalled_cycles_front_stats, 0, sizeof(runtime_stalled_cycles_front_stats)); 278 memset(runtime_stalled_cycles_back_stats, 0, sizeof(runtime_stalled_cycles_back_stats)); 279 memset(runtime_branches_stats, 0, sizeof(runtime_branches_stats)); 280 memset(runtime_cacherefs_stats, 0, sizeof(runtime_cacherefs_stats)); 281 memset(runtime_l1_dcache_stats, 0, sizeof(runtime_l1_dcache_stats)); 282 memset(runtime_l1_icache_stats, 0, sizeof(runtime_l1_icache_stats)); 283 memset(runtime_ll_cache_stats, 0, sizeof(runtime_ll_cache_stats)); 284 memset(runtime_itlb_cache_stats, 0, 
sizeof(runtime_itlb_cache_stats)); 285 memset(runtime_dtlb_cache_stats, 0, sizeof(runtime_dtlb_cache_stats)); 286 memset(runtime_cycles_in_tx_stats, 0, 287 sizeof(runtime_cycles_in_tx_stats)); 288 memset(runtime_transaction_stats, 0, 289 sizeof(runtime_transaction_stats)); 290 memset(runtime_elision_stats, 0, sizeof(runtime_elision_stats)); 291 memset(&walltime_nsecs_stats, 0, sizeof(walltime_nsecs_stats)); 292 } 293 294 static int create_perf_stat_counter(struct perf_evsel *evsel) 295 { 296 struct perf_event_attr *attr = &evsel->attr; 297 298 if (scale) 299 attr->read_format = PERF_FORMAT_TOTAL_TIME_ENABLED | 300 PERF_FORMAT_TOTAL_TIME_RUNNING; 301 302 attr->inherit = !no_inherit; 303 304 if (target__has_cpu(&target)) 305 return perf_evsel__open_per_cpu(evsel, perf_evsel__cpus(evsel)); 306 307 if (!target__has_task(&target) && perf_evsel__is_group_leader(evsel)) { 308 attr->disabled = 1; 309 if (!initial_delay) 310 attr->enable_on_exec = 1; 311 } 312 313 return perf_evsel__open_per_thread(evsel, evsel_list->threads); 314 } 315 316 /* 317 * Does the counter have nsecs as a unit? 318 */ 319 static inline int nsec_counter(struct perf_evsel *evsel) 320 { 321 if (perf_evsel__match(evsel, SOFTWARE, SW_CPU_CLOCK) || 322 perf_evsel__match(evsel, SOFTWARE, SW_TASK_CLOCK)) 323 return 1; 324 325 return 0; 326 } 327 328 static struct perf_evsel *nth_evsel(int n) 329 { 330 static struct perf_evsel **array; 331 static int array_len; 332 struct perf_evsel *ev; 333 int j; 334 335 /* Assumes this only called when evsel_list does not change anymore. 
*/ 336 if (!array) { 337 evlist__for_each(evsel_list, ev) 338 array_len++; 339 array = malloc(array_len * sizeof(void *)); 340 if (!array) 341 exit(ENOMEM); 342 j = 0; 343 evlist__for_each(evsel_list, ev) 344 array[j++] = ev; 345 } 346 if (n < array_len) 347 return array[n]; 348 return NULL; 349 } 350 351 /* 352 * Update various tracking values we maintain to print 353 * more semantic information such as miss/hit ratios, 354 * instruction rates, etc: 355 */ 356 static void update_shadow_stats(struct perf_evsel *counter, u64 *count, 357 int cpu) 358 { 359 if (perf_evsel__match(counter, SOFTWARE, SW_TASK_CLOCK)) 360 update_stats(&runtime_nsecs_stats[cpu], count[0]); 361 else if (perf_evsel__match(counter, HARDWARE, HW_CPU_CYCLES)) 362 update_stats(&runtime_cycles_stats[cpu], count[0]); 363 else if (transaction_run && 364 perf_evsel__cmp(counter, nth_evsel(T_CYCLES_IN_TX))) 365 update_stats(&runtime_cycles_in_tx_stats[cpu], count[0]); 366 else if (transaction_run && 367 perf_evsel__cmp(counter, nth_evsel(T_TRANSACTION_START))) 368 update_stats(&runtime_transaction_stats[cpu], count[0]); 369 else if (transaction_run && 370 perf_evsel__cmp(counter, nth_evsel(T_ELISION_START))) 371 update_stats(&runtime_elision_stats[cpu], count[0]); 372 else if (perf_evsel__match(counter, HARDWARE, HW_STALLED_CYCLES_FRONTEND)) 373 update_stats(&runtime_stalled_cycles_front_stats[cpu], count[0]); 374 else if (perf_evsel__match(counter, HARDWARE, HW_STALLED_CYCLES_BACKEND)) 375 update_stats(&runtime_stalled_cycles_back_stats[cpu], count[0]); 376 else if (perf_evsel__match(counter, HARDWARE, HW_BRANCH_INSTRUCTIONS)) 377 update_stats(&runtime_branches_stats[cpu], count[0]); 378 else if (perf_evsel__match(counter, HARDWARE, HW_CACHE_REFERENCES)) 379 update_stats(&runtime_cacherefs_stats[cpu], count[0]); 380 else if (perf_evsel__match(counter, HW_CACHE, HW_CACHE_L1D)) 381 update_stats(&runtime_l1_dcache_stats[cpu], count[0]); 382 else if (perf_evsel__match(counter, HW_CACHE, HW_CACHE_L1I)) 
383 update_stats(&runtime_l1_icache_stats[cpu], count[0]); 384 else if (perf_evsel__match(counter, HW_CACHE, HW_CACHE_LL)) 385 update_stats(&runtime_ll_cache_stats[cpu], count[0]); 386 else if (perf_evsel__match(counter, HW_CACHE, HW_CACHE_DTLB)) 387 update_stats(&runtime_dtlb_cache_stats[cpu], count[0]); 388 else if (perf_evsel__match(counter, HW_CACHE, HW_CACHE_ITLB)) 389 update_stats(&runtime_itlb_cache_stats[cpu], count[0]); 390 } 391 392 static void zero_per_pkg(struct perf_evsel *counter) 393 { 394 if (counter->per_pkg_mask) 395 memset(counter->per_pkg_mask, 0, MAX_NR_CPUS); 396 } 397 398 static int check_per_pkg(struct perf_evsel *counter, int cpu, bool *skip) 399 { 400 unsigned long *mask = counter->per_pkg_mask; 401 struct cpu_map *cpus = perf_evsel__cpus(counter); 402 int s; 403 404 *skip = false; 405 406 if (!counter->per_pkg) 407 return 0; 408 409 if (cpu_map__empty(cpus)) 410 return 0; 411 412 if (!mask) { 413 mask = zalloc(MAX_NR_CPUS); 414 if (!mask) 415 return -ENOMEM; 416 417 counter->per_pkg_mask = mask; 418 } 419 420 s = cpu_map__get_socket(cpus, cpu); 421 if (s < 0) 422 return -1; 423 424 *skip = test_and_set_bit(s, mask) == 1; 425 return 0; 426 } 427 428 static int read_cb(struct perf_evsel *evsel, int cpu, int thread __maybe_unused, 429 struct perf_counts_values *count) 430 { 431 struct perf_counts_values *aggr = &evsel->counts->aggr; 432 static struct perf_counts_values zero; 433 bool skip = false; 434 435 if (check_per_pkg(evsel, cpu, &skip)) { 436 pr_err("failed to read per-pkg counter\n"); 437 return -1; 438 } 439 440 if (skip) 441 count = &zero; 442 443 switch (aggr_mode) { 444 case AGGR_CORE: 445 case AGGR_SOCKET: 446 case AGGR_NONE: 447 if (!evsel->snapshot) 448 perf_evsel__compute_deltas(evsel, cpu, count); 449 perf_counts_values__scale(count, scale, NULL); 450 evsel->counts->cpu[cpu] = *count; 451 if (aggr_mode == AGGR_NONE) 452 update_shadow_stats(evsel, count->values, cpu); 453 break; 454 case AGGR_GLOBAL: 455 aggr->val += 
count->val; 456 if (scale) { 457 aggr->ena += count->ena; 458 aggr->run += count->run; 459 } 460 default: 461 break; 462 } 463 464 return 0; 465 } 466 467 static int read_counter(struct perf_evsel *counter); 468 469 /* 470 * Read out the results of a single counter: 471 * aggregate counts across CPUs in system-wide mode 472 */ 473 static int read_counter_aggr(struct perf_evsel *counter) 474 { 475 struct perf_counts_values *aggr = &counter->counts->aggr; 476 struct perf_stat *ps = counter->priv; 477 u64 *count = counter->counts->aggr.values; 478 int i; 479 480 aggr->val = aggr->ena = aggr->run = 0; 481 482 if (read_counter(counter)) 483 return -1; 484 485 if (!counter->snapshot) 486 perf_evsel__compute_deltas(counter, -1, aggr); 487 perf_counts_values__scale(aggr, scale, &counter->counts->scaled); 488 489 for (i = 0; i < 3; i++) 490 update_stats(&ps->res_stats[i], count[i]); 491 492 if (verbose) { 493 fprintf(output, "%s: %" PRIu64 " %" PRIu64 " %" PRIu64 "\n", 494 perf_evsel__name(counter), count[0], count[1], count[2]); 495 } 496 497 /* 498 * Save the full runtime - to allow normalization during printout: 499 */ 500 update_shadow_stats(counter, count, 0); 501 502 return 0; 503 } 504 505 /* 506 * Read out the results of a single counter: 507 * do not aggregate counts across CPUs in system-wide mode 508 */ 509 static int read_counter(struct perf_evsel *counter) 510 { 511 int nthreads = thread_map__nr(evsel_list->threads); 512 int ncpus = perf_evsel__nr_cpus(counter); 513 int cpu, thread; 514 515 if (!counter->supported) 516 return -ENOENT; 517 518 if (counter->system_wide) 519 nthreads = 1; 520 521 if (counter->per_pkg) 522 zero_per_pkg(counter); 523 524 for (thread = 0; thread < nthreads; thread++) { 525 for (cpu = 0; cpu < ncpus; cpu++) { 526 if (perf_evsel__read_cb(counter, cpu, thread, read_cb)) 527 return -1; 528 } 529 } 530 531 return 0; 532 } 533 534 static void print_interval(void) 535 { 536 static int num_print_interval; 537 struct perf_evsel *counter; 538 
struct perf_stat *ps; 539 struct timespec ts, rs; 540 char prefix[64]; 541 542 if (aggr_mode == AGGR_GLOBAL) { 543 evlist__for_each(evsel_list, counter) { 544 ps = counter->priv; 545 memset(ps->res_stats, 0, sizeof(ps->res_stats)); 546 read_counter_aggr(counter); 547 } 548 } else { 549 evlist__for_each(evsel_list, counter) { 550 ps = counter->priv; 551 memset(ps->res_stats, 0, sizeof(ps->res_stats)); 552 read_counter(counter); 553 } 554 } 555 556 clock_gettime(CLOCK_MONOTONIC, &ts); 557 diff_timespec(&rs, &ts, &ref_time); 558 sprintf(prefix, "%6lu.%09lu%s", rs.tv_sec, rs.tv_nsec, csv_sep); 559 560 if (num_print_interval == 0 && !csv_output) { 561 switch (aggr_mode) { 562 case AGGR_SOCKET: 563 fprintf(output, "# time socket cpus counts %*s events\n", unit_width, "unit"); 564 break; 565 case AGGR_CORE: 566 fprintf(output, "# time core cpus counts %*s events\n", unit_width, "unit"); 567 break; 568 case AGGR_NONE: 569 fprintf(output, "# time CPU counts %*s events\n", unit_width, "unit"); 570 break; 571 case AGGR_GLOBAL: 572 default: 573 fprintf(output, "# time counts %*s events\n", unit_width, "unit"); 574 } 575 } 576 577 if (++num_print_interval == 25) 578 num_print_interval = 0; 579 580 switch (aggr_mode) { 581 case AGGR_CORE: 582 case AGGR_SOCKET: 583 print_aggr(prefix); 584 break; 585 case AGGR_NONE: 586 evlist__for_each(evsel_list, counter) 587 print_counter(counter, prefix); 588 break; 589 case AGGR_GLOBAL: 590 default: 591 evlist__for_each(evsel_list, counter) 592 print_counter_aggr(counter, prefix); 593 } 594 595 fflush(output); 596 } 597 598 static void handle_initial_delay(void) 599 { 600 struct perf_evsel *counter; 601 602 if (initial_delay) { 603 const int ncpus = cpu_map__nr(evsel_list->cpus), 604 nthreads = thread_map__nr(evsel_list->threads); 605 606 usleep(initial_delay * 1000); 607 evlist__for_each(evsel_list, counter) 608 perf_evsel__enable(counter, ncpus, nthreads); 609 } 610 } 611 612 static volatile int workload_exec_errno; 613 614 /* 615 * 
/*
 * perf_evlist__prepare_workload will send a SIGUSR1
 * if the fork fails, since we asked by setting its
 * want_signal to true.
 *
 * The handler only records the integer carried in the signal's value in
 * workload_exec_errno; __run_perf_stat() reports it after the wait.
 */
static void workload_exec_failed_signal(int signo __maybe_unused, siginfo_t *info,
					void *ucontext __maybe_unused)
{
	workload_exec_errno = info->si_value.sival_int;
}

/*
 * One measurement run.
 *
 * Prepares the workload when a command was given (argc > 0), opens every
 * counter of evsel_list, applies event filters, starts the workload (or
 * waits until 'done' is set when there is none), printing interval output
 * along the way if requested, and finally reads and closes all counters.
 *
 * Returns -1 on setup failure or when the workload could not be started,
 * otherwise WEXITSTATUS() of the collected wait status (0 when nothing was
 * forked, since status is initialised to 0).
 */
static int __run_perf_stat(int argc, const char **argv)
{
	char msg[512];
	unsigned long long t0, t1;
	struct perf_evsel *counter;
	struct timespec ts;
	size_t l;
	int status = 0;
	const bool forks = (argc > 0);

	/* Sleep granularity: the interval (milliseconds) or one second. */
	if (interval) {
		ts.tv_sec  = interval / 1000;
		ts.tv_nsec = (interval % 1000) * 1000000;
	} else {
		ts.tv_sec  = 1;
		ts.tv_nsec = 0;
	}

	if (forks) {
		if (perf_evlist__prepare_workload(evsel_list, &target, argv, false,
						  workload_exec_failed_signal) < 0) {
			perror("failed to prepare workload");
			return -1;
		}
		child_pid = evsel_list->workload.pid;
	}

	if (group)
		perf_evlist__set_leader(evsel_list);

	evlist__for_each(evsel_list, counter) {
		if (create_perf_stat_counter(counter) < 0) {
			/*
			 * PPC returns ENXIO for HW counters until 2.6.37
			 * (behavior changed with commit b0a873e).
			 */
			if (errno == EINVAL || errno == ENOSYS  ||
			    errno == ENOENT || errno == EOPNOTSUPP ||
			    errno == ENXIO) {
				/* Not fatal: mark unsupported and move on. */
				if (verbose)
					ui__warning("%s event is not supported by the kernel.\n",
						    perf_evsel__name(counter));
				counter->supported = false;
				continue;
			}

			/* Any other errno aborts the run. */
			perf_evsel__open_strerror(counter, &target,
						  errno, msg, sizeof(msg));
			ui__error("%s\n", msg);

			if (child_pid != -1)
				kill(child_pid, SIGTERM);

			return -1;
		}
		counter->supported = true;

		/* Widen the unit column to fit the longest unit string. */
		l = strlen(counter->unit);
		if (l > unit_width)
			unit_width = l;
	}

	/* On failure 'counter' is set to the event whose filter was rejected. */
	if (perf_evlist__apply_filters(evsel_list, &counter)) {
		error("failed to set filter \"%s\" on event %s with %d (%s)\n",
			counter->filter, perf_evsel__name(counter), errno,
			strerror_r(errno, msg, sizeof(msg)));
		return -1;
	}

	/*
	 * Enable counters and exec the command:
	 */
	t0 = rdclock();
	clock_gettime(CLOCK_MONOTONIC, &ref_time);

	if (forks) {
		perf_evlist__start_workload(evsel_list);
		handle_initial_delay();

		if (interval) {
			/* Poll the child without blocking between printouts. */
			while (!waitpid(child_pid, &status, WNOHANG)) {
				nanosleep(&ts, NULL);
				print_interval();
			}
		}
		wait(&status);

		/* Set by workload_exec_failed_signal(). */
		if (workload_exec_errno) {
			const char *emsg = strerror_r(workload_exec_errno, msg, sizeof(msg));
			pr_err("Workload failed: %s\n", emsg);
			return -1;
		}

		if (WIFSIGNALED(status))
			psignal(WTERMSIG(status), argv[0]);
	} else {
		/* No workload: count until 'done' is set. */
		handle_initial_delay();
		while (!done) {
			nanosleep(&ts, NULL);
			if (interval)
				print_interval();
		}
	}

	t1 = rdclock();

	/* Elapsed time of this run, used later for normalisation. */
	update_stats(&walltime_nsecs_stats, t1 - t0);

	/* Final read-out, then release the counters' file descriptors. */
	if (aggr_mode == AGGR_GLOBAL) {
		evlist__for_each(evsel_list, counter) {
			read_counter_aggr(counter);
			perf_evsel__close_fd(counter, perf_evsel__nr_cpus(counter),
					     thread_map__nr(evsel_list->threads));
		}
	} else {
		evlist__for_each(evsel_list, counter) {
			read_counter(counter);
			perf_evsel__close_fd(counter, perf_evsel__nr_cpus(counter), 1);
		}
	}

	return WEXITSTATUS(status);
}
742 perf_evsel__close_fd(counter, perf_evsel__nr_cpus(counter), 1); 743 } 744 } 745 746 return WEXITSTATUS(status); 747 } 748 749 static int run_perf_stat(int argc, const char **argv) 750 { 751 int ret; 752 753 if (pre_cmd) { 754 ret = system(pre_cmd); 755 if (ret) 756 return ret; 757 } 758 759 if (sync_run) 760 sync(); 761 762 ret = __run_perf_stat(argc, argv); 763 if (ret) 764 return ret; 765 766 if (post_cmd) { 767 ret = system(post_cmd); 768 if (ret) 769 return ret; 770 } 771 772 return ret; 773 } 774 775 static void print_running(u64 run, u64 ena) 776 { 777 if (csv_output) { 778 fprintf(output, "%s%" PRIu64 "%s%.2f", 779 csv_sep, 780 run, 781 csv_sep, 782 ena ? 100.0 * run / ena : 100.0); 783 } else if (run != ena) { 784 fprintf(output, " (%.2f%%)", 100.0 * run / ena); 785 } 786 } 787 788 static void print_noise_pct(double total, double avg) 789 { 790 double pct = rel_stddev_stats(total, avg); 791 792 if (csv_output) 793 fprintf(output, "%s%.2f%%", csv_sep, pct); 794 else if (pct) 795 fprintf(output, " ( +-%6.2f%% )", pct); 796 } 797 798 static void print_noise(struct perf_evsel *evsel, double avg) 799 { 800 struct perf_stat *ps; 801 802 if (run_count == 1) 803 return; 804 805 ps = evsel->priv; 806 print_noise_pct(stddev_stats(&ps->res_stats[0]), avg); 807 } 808 809 static void aggr_printout(struct perf_evsel *evsel, int id, int nr) 810 { 811 switch (aggr_mode) { 812 case AGGR_CORE: 813 fprintf(output, "S%d-C%*d%s%*d%s", 814 cpu_map__id_to_socket(id), 815 csv_output ? 0 : -8, 816 cpu_map__id_to_cpu(id), 817 csv_sep, 818 csv_output ? 0 : 4, 819 nr, 820 csv_sep); 821 break; 822 case AGGR_SOCKET: 823 fprintf(output, "S%*d%s%*d%s", 824 csv_output ? 0 : -5, 825 id, 826 csv_sep, 827 csv_output ? 0 : 4, 828 nr, 829 csv_sep); 830 break; 831 case AGGR_NONE: 832 fprintf(output, "CPU%*d%s", 833 csv_output ? 
0 : -4, 834 perf_evsel__cpus(evsel)->map[id], csv_sep); 835 break; 836 case AGGR_GLOBAL: 837 default: 838 break; 839 } 840 } 841 842 static void nsec_printout(int id, int nr, struct perf_evsel *evsel, double avg) 843 { 844 double msecs = avg / 1e6; 845 const char *fmt_v, *fmt_n; 846 char name[25]; 847 848 fmt_v = csv_output ? "%.6f%s" : "%18.6f%s"; 849 fmt_n = csv_output ? "%s" : "%-25s"; 850 851 aggr_printout(evsel, id, nr); 852 853 scnprintf(name, sizeof(name), "%s%s", 854 perf_evsel__name(evsel), csv_output ? "" : " (msec)"); 855 856 fprintf(output, fmt_v, msecs, csv_sep); 857 858 if (csv_output) 859 fprintf(output, "%s%s", evsel->unit, csv_sep); 860 else 861 fprintf(output, "%-*s%s", unit_width, evsel->unit, csv_sep); 862 863 fprintf(output, fmt_n, name); 864 865 if (evsel->cgrp) 866 fprintf(output, "%s%s", csv_sep, evsel->cgrp->name); 867 868 if (csv_output || interval) 869 return; 870 871 if (perf_evsel__match(evsel, SOFTWARE, SW_TASK_CLOCK)) 872 fprintf(output, " # %8.3f CPUs utilized ", 873 avg / avg_stats(&walltime_nsecs_stats)); 874 else 875 fprintf(output, " "); 876 } 877 878 /* used for get_ratio_color() */ 879 enum grc_type { 880 GRC_STALLED_CYCLES_FE, 881 GRC_STALLED_CYCLES_BE, 882 GRC_CACHE_MISSES, 883 GRC_MAX_NR 884 }; 885 886 static const char *get_ratio_color(enum grc_type type, double ratio) 887 { 888 static const double grc_table[GRC_MAX_NR][3] = { 889 [GRC_STALLED_CYCLES_FE] = { 50.0, 30.0, 10.0 }, 890 [GRC_STALLED_CYCLES_BE] = { 75.0, 50.0, 20.0 }, 891 [GRC_CACHE_MISSES] = { 20.0, 10.0, 5.0 }, 892 }; 893 const char *color = PERF_COLOR_NORMAL; 894 895 if (ratio > grc_table[type][0]) 896 color = PERF_COLOR_RED; 897 else if (ratio > grc_table[type][1]) 898 color = PERF_COLOR_MAGENTA; 899 else if (ratio > grc_table[type][2]) 900 color = PERF_COLOR_YELLOW; 901 902 return color; 903 } 904 905 static void print_stalled_cycles_frontend(int cpu, 906 struct perf_evsel *evsel 907 __maybe_unused, double avg) 908 { 909 double total, ratio = 0.0; 910 const 
char *color; 911 912 total = avg_stats(&runtime_cycles_stats[cpu]); 913 914 if (total) 915 ratio = avg / total * 100.0; 916 917 color = get_ratio_color(GRC_STALLED_CYCLES_FE, ratio); 918 919 fprintf(output, " # "); 920 color_fprintf(output, color, "%6.2f%%", ratio); 921 fprintf(output, " frontend cycles idle "); 922 } 923 924 static void print_stalled_cycles_backend(int cpu, 925 struct perf_evsel *evsel 926 __maybe_unused, double avg) 927 { 928 double total, ratio = 0.0; 929 const char *color; 930 931 total = avg_stats(&runtime_cycles_stats[cpu]); 932 933 if (total) 934 ratio = avg / total * 100.0; 935 936 color = get_ratio_color(GRC_STALLED_CYCLES_BE, ratio); 937 938 fprintf(output, " # "); 939 color_fprintf(output, color, "%6.2f%%", ratio); 940 fprintf(output, " backend cycles idle "); 941 } 942 943 static void print_branch_misses(int cpu, 944 struct perf_evsel *evsel __maybe_unused, 945 double avg) 946 { 947 double total, ratio = 0.0; 948 const char *color; 949 950 total = avg_stats(&runtime_branches_stats[cpu]); 951 952 if (total) 953 ratio = avg / total * 100.0; 954 955 color = get_ratio_color(GRC_CACHE_MISSES, ratio); 956 957 fprintf(output, " # "); 958 color_fprintf(output, color, "%6.2f%%", ratio); 959 fprintf(output, " of all branches "); 960 } 961 962 static void print_l1_dcache_misses(int cpu, 963 struct perf_evsel *evsel __maybe_unused, 964 double avg) 965 { 966 double total, ratio = 0.0; 967 const char *color; 968 969 total = avg_stats(&runtime_l1_dcache_stats[cpu]); 970 971 if (total) 972 ratio = avg / total * 100.0; 973 974 color = get_ratio_color(GRC_CACHE_MISSES, ratio); 975 976 fprintf(output, " # "); 977 color_fprintf(output, color, "%6.2f%%", ratio); 978 fprintf(output, " of all L1-dcache hits "); 979 } 980 981 static void print_l1_icache_misses(int cpu, 982 struct perf_evsel *evsel __maybe_unused, 983 double avg) 984 { 985 double total, ratio = 0.0; 986 const char *color; 987 988 total = avg_stats(&runtime_l1_icache_stats[cpu]); 989 990 if 
/*
 * Print @avg as a colour-coded percentage of the dTLB shadow count
 * recorded for @cpu (0% when that count is zero).
 */
static void print_dtlb_cache_misses(int cpu,
				    struct perf_evsel *evsel __maybe_unused,
				    double avg)
{
	double total, ratio = 0.0;
	const char *color;

	total = avg_stats(&runtime_dtlb_cache_stats[cpu]);

	if (total)
		ratio = avg / total * 100.0;

	color = get_ratio_color(GRC_CACHE_MISSES, ratio);

	fprintf(output, " # ");
	color_fprintf(output, color, "%6.2f%%", ratio);
	fprintf(output, " of all dTLB cache hits ");
}

/*
 * Print @avg as a colour-coded percentage of the iTLB shadow count
 * recorded for @cpu (0% when that count is zero).
 */
static void print_itlb_cache_misses(int cpu,
				    struct perf_evsel *evsel __maybe_unused,
				    double avg)
{
	double total, ratio = 0.0;
	const char *color;

	total = avg_stats(&runtime_itlb_cache_stats[cpu]);

	if (total)
		ratio = avg / total * 100.0;

	color = get_ratio_color(GRC_CACHE_MISSES, ratio);

	fprintf(output, " # ");
	color_fprintf(output, color, "%6.2f%%", ratio);
	fprintf(output, " of all iTLB cache hits ");
}

/*
 * Print @avg as a colour-coded percentage of the last-level-cache shadow
 * count recorded for @cpu (0% when that count is zero).
 */
static void print_ll_cache_misses(int cpu,
				  struct perf_evsel *evsel __maybe_unused,
				  double avg)
{
	double total, ratio = 0.0;
	const char *color;

	total = avg_stats(&runtime_ll_cache_stats[cpu]);

	if (total)
		ratio = avg / total * 100.0;

	color = get_ratio_color(GRC_CACHE_MISSES, ratio);

	fprintf(output, " # ");
	color_fprintf(output, color, "%6.2f%%", ratio);
	fprintf(output, " of all LL-cache hits ");
}

/*
 * Print one line for a counter that is not measured in nanoseconds:
 * aggregation prefix, value, unit, event name and cgroup.  Outside CSV and
 * interval mode a derived metric is appended, chosen by the first branch
 * of the chain below that matches the event (IPC, miss ratios, GHz,
 * transaction metrics, or a generic per-second rate).
 *
 * @id:  aggregation id handed to aggr_printout(); also the source of the
 *       shadow-stat index
 * @nr:  number of CPUs folded into this line
 * @avg: the value to print
 */
static void abs_printout(int id, int nr, struct perf_evsel *evsel, double avg)
{
	double total, ratio = 0.0, total2;
	double sc =  evsel->scale;
	const char *fmt;
	int cpu = cpu_map__id_to_cpu(id);

	/* Two decimals only when the event carries a scale other than 1. */
	if (csv_output) {
		fmt = sc != 1.0 ?  "%.2f%s" : "%.0f%s";
	} else {
		if (big_num)
			fmt = sc != 1.0 ? "%'18.2f%s" : "%'18.0f%s";
		else
			fmt = sc != 1.0 ? "%18.2f%s" : "%18.0f%s";
	}

	aggr_printout(evsel, id, nr);

	/* Global aggregation keeps its shadow stats in slot 0. */
	if (aggr_mode == AGGR_GLOBAL)
		cpu = 0;

	fprintf(output, fmt, avg, csv_sep);

	if (evsel->unit)
		fprintf(output, "%-*s%s",
			csv_output ? 0 : unit_width,
			evsel->unit, csv_sep);

	fprintf(output, "%-*s", csv_output ? 0 : 25, perf_evsel__name(evsel));

	if (evsel->cgrp)
		fprintf(output, "%s%s", csv_sep, evsel->cgrp->name);

	/* Derived metrics are only shown in the plain end-of-run summary. */
	if (csv_output || interval)
		return;

	if (perf_evsel__match(evsel, HARDWARE, HW_INSTRUCTIONS)) {
		total = avg_stats(&runtime_cycles_stats[cpu]);
		if (total) {
			ratio = avg / total;
			fprintf(output, " # %5.2f insns per cycle ", ratio);
		} else {
			fprintf(output, " ");
		}
		/* Use the larger of the front-end and back-end stall counts. */
		total = avg_stats(&runtime_stalled_cycles_front_stats[cpu]);
		total = max(total, avg_stats(&runtime_stalled_cycles_back_stats[cpu]));

		if (total && avg) {
			ratio = total / avg;
			fprintf(output, "\n");
			if (aggr_mode == AGGR_NONE)
				fprintf(output, " ");
			fprintf(output, " # %5.2f stalled cycles per insn", ratio);
		}

	} else if (perf_evsel__match(evsel, HARDWARE, HW_BRANCH_MISSES) &&
			runtime_branches_stats[cpu].n != 0) {
		print_branch_misses(cpu, evsel, avg);
	} else if (
		/* L1D read-miss event, and L1D shadow stats were collected. */
		evsel->attr.type == PERF_TYPE_HW_CACHE &&
		evsel->attr.config ==  ( PERF_COUNT_HW_CACHE_L1D |
					((PERF_COUNT_HW_CACHE_OP_READ) << 8) |
					((PERF_COUNT_HW_CACHE_RESULT_MISS) << 16)) &&
			runtime_l1_dcache_stats[cpu].n != 0) {
		print_l1_dcache_misses(cpu, evsel, avg);
	} else if (
		/* L1I read-miss event. */
		evsel->attr.type == PERF_TYPE_HW_CACHE &&
		evsel->attr.config ==  ( PERF_COUNT_HW_CACHE_L1I |
					((PERF_COUNT_HW_CACHE_OP_READ) << 8) |
					((PERF_COUNT_HW_CACHE_RESULT_MISS) << 16)) &&
			runtime_l1_icache_stats[cpu].n != 0) {
		print_l1_icache_misses(cpu, evsel, avg);
	} else if (
		/* dTLB read-miss event. */
		evsel->attr.type == PERF_TYPE_HW_CACHE &&
		evsel->attr.config ==  ( PERF_COUNT_HW_CACHE_DTLB |
					((PERF_COUNT_HW_CACHE_OP_READ) << 8) |
					((PERF_COUNT_HW_CACHE_RESULT_MISS) << 16)) &&
			runtime_dtlb_cache_stats[cpu].n != 0) {
		print_dtlb_cache_misses(cpu, evsel, avg);
	} else if (
		/* iTLB read-miss event. */
		evsel->attr.type == PERF_TYPE_HW_CACHE &&
		evsel->attr.config ==  ( PERF_COUNT_HW_CACHE_ITLB |
					((PERF_COUNT_HW_CACHE_OP_READ) << 8) |
					((PERF_COUNT_HW_CACHE_RESULT_MISS) << 16)) &&
			runtime_itlb_cache_stats[cpu].n != 0) {
		print_itlb_cache_misses(cpu, evsel, avg);
	} else if (
		/* Last-level-cache read-miss event. */
		evsel->attr.type == PERF_TYPE_HW_CACHE &&
		evsel->attr.config ==  ( PERF_COUNT_HW_CACHE_LL |
					((PERF_COUNT_HW_CACHE_OP_READ) << 8) |
					((PERF_COUNT_HW_CACHE_RESULT_MISS) << 16)) &&
			runtime_ll_cache_stats[cpu].n != 0) {
		print_ll_cache_misses(cpu, evsel, avg);
	} else if (perf_evsel__match(evsel, HARDWARE, HW_CACHE_MISSES) &&
			runtime_cacherefs_stats[cpu].n != 0) {
		total = avg_stats(&runtime_cacherefs_stats[cpu]);

		if (total)
			ratio = avg * 100 / total;

		fprintf(output, " # %8.3f %% of all cache refs ", ratio);

	} else if (perf_evsel__match(evsel, HARDWARE, HW_STALLED_CYCLES_FRONTEND)) {
		print_stalled_cycles_frontend(cpu, evsel, avg);
	} else if (perf_evsel__match(evsel, HARDWARE, HW_STALLED_CYCLES_BACKEND)) {
		print_stalled_cycles_backend(cpu, evsel, avg);
	} else if (perf_evsel__match(evsel, HARDWARE, HW_CPU_CYCLES)) {
		/* Cycles per nanosecond of task-clock, i.e. GHz. */
		total = avg_stats(&runtime_nsecs_stats[cpu]);

		if (total) {
			ratio = avg / total;
			fprintf(output, " # %8.3f GHz ", ratio);
		} else {
			fprintf(output, " ");
		}
	} else if (transaction_run &&
		   perf_evsel__cmp(evsel, nth_evsel(T_CYCLES_IN_TX))) {
		total = avg_stats(&runtime_cycles_stats[cpu]);
		if (total)
			fprintf(output,
				" # %5.2f%% transactional cycles ",
				100.0 * (avg / total));
	} else if (transaction_run &&
		   perf_evsel__cmp(evsel, nth_evsel(T_CYCLES_IN_TX_CP))) {
		total = avg_stats(&runtime_cycles_stats[cpu]);
		total2 = avg_stats(&runtime_cycles_in_tx_stats[cpu]);
		/* Clamp so the difference below cannot go negative. */
		if (total2 < avg)
			total2 = avg;
		if (total)
			fprintf(output,
				" # %5.2f%% aborted cycles ",
				100.0 * ((total2-avg) / total));
	} else if (transaction_run &&
		   perf_evsel__cmp(evsel, nth_evsel(T_TRANSACTION_START)) &&
		   avg > 0 &&
		   runtime_cycles_in_tx_stats[cpu].n != 0) {
		total = avg_stats(&runtime_cycles_in_tx_stats[cpu]);

		if (total)
			ratio = total / avg;

		fprintf(output, " # %8.0f cycles / transaction ", ratio);
	} else if (transaction_run &&
		   perf_evsel__cmp(evsel, nth_evsel(T_ELISION_START)) &&
		   avg > 0 &&
		   runtime_cycles_in_tx_stats[cpu].n != 0) {
		total = avg_stats(&runtime_cycles_in_tx_stats[cpu]);

		if (total)
			ratio = total / avg;

		fprintf(output, " # %8.0f cycles / elision ", ratio);
	} else if (runtime_nsecs_stats[cpu].n != 0) {
		/* Generic fallback: events per second of task-clock time. */
		char unit = 'M';

		total = avg_stats(&runtime_nsecs_stats[cpu]);

		if (total)
			ratio = 1000.0 * avg / total;
		/* Switch from M/sec to K/sec for very small rates. */
		if (ratio < 0.001) {
			ratio *= 1000;
			unit = 'K';
		}

		fprintf(output, " # %8.3f %c/sec ", ratio, unit);
	} else {
		fprintf(output, " ");
	}
}
fprintf(output, 1177 " # %5.2f%% transactional cycles ", 1178 100.0 * (avg / total)); 1179 } else if (transaction_run && 1180 perf_evsel__cmp(evsel, nth_evsel(T_CYCLES_IN_TX_CP))) { 1181 total = avg_stats(&runtime_cycles_stats[cpu]); 1182 total2 = avg_stats(&runtime_cycles_in_tx_stats[cpu]); 1183 if (total2 < avg) 1184 total2 = avg; 1185 if (total) 1186 fprintf(output, 1187 " # %5.2f%% aborted cycles ", 1188 100.0 * ((total2-avg) / total)); 1189 } else if (transaction_run && 1190 perf_evsel__cmp(evsel, nth_evsel(T_TRANSACTION_START)) && 1191 avg > 0 && 1192 runtime_cycles_in_tx_stats[cpu].n != 0) { 1193 total = avg_stats(&runtime_cycles_in_tx_stats[cpu]); 1194 1195 if (total) 1196 ratio = total / avg; 1197 1198 fprintf(output, " # %8.0f cycles / transaction ", ratio); 1199 } else if (transaction_run && 1200 perf_evsel__cmp(evsel, nth_evsel(T_ELISION_START)) && 1201 avg > 0 && 1202 runtime_cycles_in_tx_stats[cpu].n != 0) { 1203 total = avg_stats(&runtime_cycles_in_tx_stats[cpu]); 1204 1205 if (total) 1206 ratio = total / avg; 1207 1208 fprintf(output, " # %8.0f cycles / elision ", ratio); 1209 } else if (runtime_nsecs_stats[cpu].n != 0) { 1210 char unit = 'M'; 1211 1212 total = avg_stats(&runtime_nsecs_stats[cpu]); 1213 1214 if (total) 1215 ratio = 1000.0 * avg / total; 1216 if (ratio < 0.001) { 1217 ratio *= 1000; 1218 unit = 'K'; 1219 } 1220 1221 fprintf(output, " # %8.3f %c/sec ", ratio, unit); 1222 } else { 1223 fprintf(output, " "); 1224 } 1225 } 1226 1227 static void print_aggr(char *prefix) 1228 { 1229 struct perf_evsel *counter; 1230 int cpu, cpu2, s, s2, id, nr; 1231 double uval; 1232 u64 ena, run, val; 1233 1234 if (!(aggr_map || aggr_get_id)) 1235 return; 1236 1237 for (s = 0; s < aggr_map->nr; s++) { 1238 id = aggr_map->map[s]; 1239 evlist__for_each(evsel_list, counter) { 1240 val = ena = run = 0; 1241 nr = 0; 1242 for (cpu = 0; cpu < perf_evsel__nr_cpus(counter); cpu++) { 1243 cpu2 = perf_evsel__cpus(counter)->map[cpu]; 1244 s2 = 
aggr_get_id(evsel_list->cpus, cpu2); 1245 if (s2 != id) 1246 continue; 1247 val += counter->counts->cpu[cpu].val; 1248 ena += counter->counts->cpu[cpu].ena; 1249 run += counter->counts->cpu[cpu].run; 1250 nr++; 1251 } 1252 if (prefix) 1253 fprintf(output, "%s", prefix); 1254 1255 if (run == 0 || ena == 0) { 1256 aggr_printout(counter, id, nr); 1257 1258 fprintf(output, "%*s%s", 1259 csv_output ? 0 : 18, 1260 counter->supported ? CNTR_NOT_COUNTED : CNTR_NOT_SUPPORTED, 1261 csv_sep); 1262 1263 fprintf(output, "%-*s%s", 1264 csv_output ? 0 : unit_width, 1265 counter->unit, csv_sep); 1266 1267 fprintf(output, "%*s", 1268 csv_output ? 0 : -25, 1269 perf_evsel__name(counter)); 1270 1271 if (counter->cgrp) 1272 fprintf(output, "%s%s", 1273 csv_sep, counter->cgrp->name); 1274 1275 print_running(run, ena); 1276 fputc('\n', output); 1277 continue; 1278 } 1279 uval = val * counter->scale; 1280 1281 if (nsec_counter(counter)) 1282 nsec_printout(id, nr, counter, uval); 1283 else 1284 abs_printout(id, nr, counter, uval); 1285 1286 if (!csv_output) 1287 print_noise(counter, 1.0); 1288 1289 print_running(run, ena); 1290 fputc('\n', output); 1291 } 1292 } 1293 } 1294 1295 /* 1296 * Print out the results of a single counter: 1297 * aggregated counts in system-wide mode 1298 */ 1299 static void print_counter_aggr(struct perf_evsel *counter, char *prefix) 1300 { 1301 struct perf_stat *ps = counter->priv; 1302 double avg = avg_stats(&ps->res_stats[0]); 1303 int scaled = counter->counts->scaled; 1304 double uval; 1305 double avg_enabled, avg_running; 1306 1307 avg_enabled = avg_stats(&ps->res_stats[1]); 1308 avg_running = avg_stats(&ps->res_stats[2]); 1309 1310 if (prefix) 1311 fprintf(output, "%s", prefix); 1312 1313 if (scaled == -1 || !counter->supported) { 1314 fprintf(output, "%*s%s", 1315 csv_output ? 0 : 18, 1316 counter->supported ? CNTR_NOT_COUNTED : CNTR_NOT_SUPPORTED, 1317 csv_sep); 1318 fprintf(output, "%-*s%s", 1319 csv_output ? 
0 : unit_width, 1320 counter->unit, csv_sep); 1321 fprintf(output, "%*s", 1322 csv_output ? 0 : -25, 1323 perf_evsel__name(counter)); 1324 1325 if (counter->cgrp) 1326 fprintf(output, "%s%s", csv_sep, counter->cgrp->name); 1327 1328 print_running(avg_running, avg_enabled); 1329 fputc('\n', output); 1330 return; 1331 } 1332 1333 uval = avg * counter->scale; 1334 1335 if (nsec_counter(counter)) 1336 nsec_printout(-1, 0, counter, uval); 1337 else 1338 abs_printout(-1, 0, counter, uval); 1339 1340 print_noise(counter, avg); 1341 1342 print_running(avg_running, avg_enabled); 1343 fprintf(output, "\n"); 1344 } 1345 1346 /* 1347 * Print out the results of a single counter: 1348 * does not use aggregated count in system-wide 1349 */ 1350 static void print_counter(struct perf_evsel *counter, char *prefix) 1351 { 1352 u64 ena, run, val; 1353 double uval; 1354 int cpu; 1355 1356 for (cpu = 0; cpu < perf_evsel__nr_cpus(counter); cpu++) { 1357 val = counter->counts->cpu[cpu].val; 1358 ena = counter->counts->cpu[cpu].ena; 1359 run = counter->counts->cpu[cpu].run; 1360 1361 if (prefix) 1362 fprintf(output, "%s", prefix); 1363 1364 if (run == 0 || ena == 0) { 1365 fprintf(output, "CPU%*d%s%*s%s", 1366 csv_output ? 0 : -4, 1367 perf_evsel__cpus(counter)->map[cpu], csv_sep, 1368 csv_output ? 0 : 18, 1369 counter->supported ? CNTR_NOT_COUNTED : CNTR_NOT_SUPPORTED, 1370 csv_sep); 1371 1372 fprintf(output, "%-*s%s", 1373 csv_output ? 0 : unit_width, 1374 counter->unit, csv_sep); 1375 1376 fprintf(output, "%*s", 1377 csv_output ? 
0 : -25, 1378 perf_evsel__name(counter)); 1379 1380 if (counter->cgrp) 1381 fprintf(output, "%s%s", 1382 csv_sep, counter->cgrp->name); 1383 1384 print_running(run, ena); 1385 fputc('\n', output); 1386 continue; 1387 } 1388 1389 uval = val * counter->scale; 1390 1391 if (nsec_counter(counter)) 1392 nsec_printout(cpu, 0, counter, uval); 1393 else 1394 abs_printout(cpu, 0, counter, uval); 1395 1396 if (!csv_output) 1397 print_noise(counter, 1.0); 1398 print_running(run, ena); 1399 1400 fputc('\n', output); 1401 } 1402 } 1403 1404 static void print_stat(int argc, const char **argv) 1405 { 1406 struct perf_evsel *counter; 1407 int i; 1408 1409 fflush(stdout); 1410 1411 if (!csv_output) { 1412 fprintf(output, "\n"); 1413 fprintf(output, " Performance counter stats for "); 1414 if (target.system_wide) 1415 fprintf(output, "\'system wide"); 1416 else if (target.cpu_list) 1417 fprintf(output, "\'CPU(s) %s", target.cpu_list); 1418 else if (!target__has_task(&target)) { 1419 fprintf(output, "\'%s", argv[0]); 1420 for (i = 1; i < argc; i++) 1421 fprintf(output, " %s", argv[i]); 1422 } else if (target.pid) 1423 fprintf(output, "process id \'%s", target.pid); 1424 else 1425 fprintf(output, "thread id \'%s", target.tid); 1426 1427 fprintf(output, "\'"); 1428 if (run_count > 1) 1429 fprintf(output, " (%d runs)", run_count); 1430 fprintf(output, ":\n\n"); 1431 } 1432 1433 switch (aggr_mode) { 1434 case AGGR_CORE: 1435 case AGGR_SOCKET: 1436 print_aggr(NULL); 1437 break; 1438 case AGGR_GLOBAL: 1439 evlist__for_each(evsel_list, counter) 1440 print_counter_aggr(counter, NULL); 1441 break; 1442 case AGGR_NONE: 1443 evlist__for_each(evsel_list, counter) 1444 print_counter(counter, NULL); 1445 break; 1446 default: 1447 break; 1448 } 1449 1450 if (!csv_output) { 1451 if (!null_run) 1452 fprintf(output, "\n"); 1453 fprintf(output, " %17.9f seconds time elapsed", 1454 avg_stats(&walltime_nsecs_stats)/1e9); 1455 if (run_count > 1) { 1456 fprintf(output, " "); 1457 
print_noise_pct(stddev_stats(&walltime_nsecs_stats), 1458 avg_stats(&walltime_nsecs_stats)); 1459 } 1460 fprintf(output, "\n\n"); 1461 } 1462 } 1463 1464 static volatile int signr = -1; 1465 1466 static void skip_signal(int signo) 1467 { 1468 if ((child_pid == -1) || interval) 1469 done = 1; 1470 1471 signr = signo; 1472 /* 1473 * render child_pid harmless 1474 * won't send SIGTERM to a random 1475 * process in case of race condition 1476 * and fast PID recycling 1477 */ 1478 child_pid = -1; 1479 } 1480 1481 static void sig_atexit(void) 1482 { 1483 sigset_t set, oset; 1484 1485 /* 1486 * avoid race condition with SIGCHLD handler 1487 * in skip_signal() which is modifying child_pid 1488 * goal is to avoid send SIGTERM to a random 1489 * process 1490 */ 1491 sigemptyset(&set); 1492 sigaddset(&set, SIGCHLD); 1493 sigprocmask(SIG_BLOCK, &set, &oset); 1494 1495 if (child_pid != -1) 1496 kill(child_pid, SIGTERM); 1497 1498 sigprocmask(SIG_SETMASK, &oset, NULL); 1499 1500 if (signr == -1) 1501 return; 1502 1503 signal(signr, SIG_DFL); 1504 kill(getpid(), signr); 1505 } 1506 1507 static int stat__set_big_num(const struct option *opt __maybe_unused, 1508 const char *s __maybe_unused, int unset) 1509 { 1510 big_num_opt = unset ? 
0 : 1; 1511 return 0; 1512 } 1513 1514 static int perf_stat_init_aggr_mode(void) 1515 { 1516 switch (aggr_mode) { 1517 case AGGR_SOCKET: 1518 if (cpu_map__build_socket_map(evsel_list->cpus, &aggr_map)) { 1519 perror("cannot build socket map"); 1520 return -1; 1521 } 1522 aggr_get_id = cpu_map__get_socket; 1523 break; 1524 case AGGR_CORE: 1525 if (cpu_map__build_core_map(evsel_list->cpus, &aggr_map)) { 1526 perror("cannot build core map"); 1527 return -1; 1528 } 1529 aggr_get_id = cpu_map__get_core; 1530 break; 1531 case AGGR_NONE: 1532 case AGGR_GLOBAL: 1533 default: 1534 break; 1535 } 1536 return 0; 1537 } 1538 1539 static int setup_events(const char * const *attrs, unsigned len) 1540 { 1541 unsigned i; 1542 1543 for (i = 0; i < len; i++) { 1544 if (parse_events(evsel_list, attrs[i])) 1545 return -1; 1546 } 1547 return 0; 1548 } 1549 1550 /* 1551 * Add default attributes, if there were no attributes specified or 1552 * if -d/--detailed, -d -d or -d -d -d is used: 1553 */ 1554 static int add_default_attributes(void) 1555 { 1556 struct perf_event_attr default_attrs[] = { 1557 1558 { .type = PERF_TYPE_SOFTWARE, .config = PERF_COUNT_SW_TASK_CLOCK }, 1559 { .type = PERF_TYPE_SOFTWARE, .config = PERF_COUNT_SW_CONTEXT_SWITCHES }, 1560 { .type = PERF_TYPE_SOFTWARE, .config = PERF_COUNT_SW_CPU_MIGRATIONS }, 1561 { .type = PERF_TYPE_SOFTWARE, .config = PERF_COUNT_SW_PAGE_FAULTS }, 1562 1563 { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_CPU_CYCLES }, 1564 { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_STALLED_CYCLES_FRONTEND }, 1565 { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_STALLED_CYCLES_BACKEND }, 1566 { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_INSTRUCTIONS }, 1567 { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_BRANCH_INSTRUCTIONS }, 1568 { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_BRANCH_MISSES }, 1569 1570 }; 1571 1572 /* 1573 * Detailed stats (-d), covering the L1 and last level data caches: 1574 */ 1575 struct 
perf_event_attr detailed_attrs[] = { 1576 1577 { .type = PERF_TYPE_HW_CACHE, 1578 .config = 1579 PERF_COUNT_HW_CACHE_L1D << 0 | 1580 (PERF_COUNT_HW_CACHE_OP_READ << 8) | 1581 (PERF_COUNT_HW_CACHE_RESULT_ACCESS << 16) }, 1582 1583 { .type = PERF_TYPE_HW_CACHE, 1584 .config = 1585 PERF_COUNT_HW_CACHE_L1D << 0 | 1586 (PERF_COUNT_HW_CACHE_OP_READ << 8) | 1587 (PERF_COUNT_HW_CACHE_RESULT_MISS << 16) }, 1588 1589 { .type = PERF_TYPE_HW_CACHE, 1590 .config = 1591 PERF_COUNT_HW_CACHE_LL << 0 | 1592 (PERF_COUNT_HW_CACHE_OP_READ << 8) | 1593 (PERF_COUNT_HW_CACHE_RESULT_ACCESS << 16) }, 1594 1595 { .type = PERF_TYPE_HW_CACHE, 1596 .config = 1597 PERF_COUNT_HW_CACHE_LL << 0 | 1598 (PERF_COUNT_HW_CACHE_OP_READ << 8) | 1599 (PERF_COUNT_HW_CACHE_RESULT_MISS << 16) }, 1600 }; 1601 1602 /* 1603 * Very detailed stats (-d -d), covering the instruction cache and the TLB caches: 1604 */ 1605 struct perf_event_attr very_detailed_attrs[] = { 1606 1607 { .type = PERF_TYPE_HW_CACHE, 1608 .config = 1609 PERF_COUNT_HW_CACHE_L1I << 0 | 1610 (PERF_COUNT_HW_CACHE_OP_READ << 8) | 1611 (PERF_COUNT_HW_CACHE_RESULT_ACCESS << 16) }, 1612 1613 { .type = PERF_TYPE_HW_CACHE, 1614 .config = 1615 PERF_COUNT_HW_CACHE_L1I << 0 | 1616 (PERF_COUNT_HW_CACHE_OP_READ << 8) | 1617 (PERF_COUNT_HW_CACHE_RESULT_MISS << 16) }, 1618 1619 { .type = PERF_TYPE_HW_CACHE, 1620 .config = 1621 PERF_COUNT_HW_CACHE_DTLB << 0 | 1622 (PERF_COUNT_HW_CACHE_OP_READ << 8) | 1623 (PERF_COUNT_HW_CACHE_RESULT_ACCESS << 16) }, 1624 1625 { .type = PERF_TYPE_HW_CACHE, 1626 .config = 1627 PERF_COUNT_HW_CACHE_DTLB << 0 | 1628 (PERF_COUNT_HW_CACHE_OP_READ << 8) | 1629 (PERF_COUNT_HW_CACHE_RESULT_MISS << 16) }, 1630 1631 { .type = PERF_TYPE_HW_CACHE, 1632 .config = 1633 PERF_COUNT_HW_CACHE_ITLB << 0 | 1634 (PERF_COUNT_HW_CACHE_OP_READ << 8) | 1635 (PERF_COUNT_HW_CACHE_RESULT_ACCESS << 16) }, 1636 1637 { .type = PERF_TYPE_HW_CACHE, 1638 .config = 1639 PERF_COUNT_HW_CACHE_ITLB << 0 | 1640 (PERF_COUNT_HW_CACHE_OP_READ << 8) | 1641 
(PERF_COUNT_HW_CACHE_RESULT_MISS << 16) }, 1642 1643 }; 1644 1645 /* 1646 * Very, very detailed stats (-d -d -d), adding prefetch events: 1647 */ 1648 struct perf_event_attr very_very_detailed_attrs[] = { 1649 1650 { .type = PERF_TYPE_HW_CACHE, 1651 .config = 1652 PERF_COUNT_HW_CACHE_L1D << 0 | 1653 (PERF_COUNT_HW_CACHE_OP_PREFETCH << 8) | 1654 (PERF_COUNT_HW_CACHE_RESULT_ACCESS << 16) }, 1655 1656 { .type = PERF_TYPE_HW_CACHE, 1657 .config = 1658 PERF_COUNT_HW_CACHE_L1D << 0 | 1659 (PERF_COUNT_HW_CACHE_OP_PREFETCH << 8) | 1660 (PERF_COUNT_HW_CACHE_RESULT_MISS << 16) }, 1661 }; 1662 1663 /* Set attrs if no event is selected and !null_run: */ 1664 if (null_run) 1665 return 0; 1666 1667 if (transaction_run) { 1668 int err; 1669 if (pmu_have_event("cpu", "cycles-ct") && 1670 pmu_have_event("cpu", "el-start")) 1671 err = setup_events(transaction_attrs, 1672 ARRAY_SIZE(transaction_attrs)); 1673 else 1674 err = setup_events(transaction_limited_attrs, 1675 ARRAY_SIZE(transaction_limited_attrs)); 1676 if (err < 0) { 1677 fprintf(stderr, "Cannot set up transaction events\n"); 1678 return -1; 1679 } 1680 return 0; 1681 } 1682 1683 if (!evsel_list->nr_entries) { 1684 if (perf_evlist__add_default_attrs(evsel_list, default_attrs) < 0) 1685 return -1; 1686 } 1687 1688 /* Detailed events get appended to the event list: */ 1689 1690 if (detailed_run < 1) 1691 return 0; 1692 1693 /* Append detailed run extra attributes: */ 1694 if (perf_evlist__add_default_attrs(evsel_list, detailed_attrs) < 0) 1695 return -1; 1696 1697 if (detailed_run < 2) 1698 return 0; 1699 1700 /* Append very detailed run extra attributes: */ 1701 if (perf_evlist__add_default_attrs(evsel_list, very_detailed_attrs) < 0) 1702 return -1; 1703 1704 if (detailed_run < 3) 1705 return 0; 1706 1707 /* Append very, very detailed run extra attributes: */ 1708 return perf_evlist__add_default_attrs(evsel_list, very_very_detailed_attrs); 1709 } 1710 1711 int cmd_stat(int argc, const char **argv, const char *prefix 
__maybe_unused) 1712 { 1713 bool append_file = false; 1714 int output_fd = 0; 1715 const char *output_name = NULL; 1716 const struct option options[] = { 1717 OPT_BOOLEAN('T', "transaction", &transaction_run, 1718 "hardware transaction statistics"), 1719 OPT_CALLBACK('e', "event", &evsel_list, "event", 1720 "event selector. use 'perf list' to list available events", 1721 parse_events_option), 1722 OPT_CALLBACK(0, "filter", &evsel_list, "filter", 1723 "event filter", parse_filter), 1724 OPT_BOOLEAN('i', "no-inherit", &no_inherit, 1725 "child tasks do not inherit counters"), 1726 OPT_STRING('p', "pid", &target.pid, "pid", 1727 "stat events on existing process id"), 1728 OPT_STRING('t', "tid", &target.tid, "tid", 1729 "stat events on existing thread id"), 1730 OPT_BOOLEAN('a', "all-cpus", &target.system_wide, 1731 "system-wide collection from all CPUs"), 1732 OPT_BOOLEAN('g', "group", &group, 1733 "put the counters into a counter group"), 1734 OPT_BOOLEAN('c', "scale", &scale, "scale/normalize counters"), 1735 OPT_INCR('v', "verbose", &verbose, 1736 "be more verbose (show counter open errors, etc)"), 1737 OPT_INTEGER('r', "repeat", &run_count, 1738 "repeat command and print average + stddev (max: 100, forever: 0)"), 1739 OPT_BOOLEAN('n', "null", &null_run, 1740 "null run - dont start any counters"), 1741 OPT_INCR('d', "detailed", &detailed_run, 1742 "detailed run - start a lot of events"), 1743 OPT_BOOLEAN('S', "sync", &sync_run, 1744 "call sync() before starting a run"), 1745 OPT_CALLBACK_NOOPT('B', "big-num", NULL, NULL, 1746 "print large numbers with thousands\' separators", 1747 stat__set_big_num), 1748 OPT_STRING('C', "cpu", &target.cpu_list, "cpu", 1749 "list of cpus to monitor in system-wide"), 1750 OPT_SET_UINT('A', "no-aggr", &aggr_mode, 1751 "disable CPU count aggregation", AGGR_NONE), 1752 OPT_STRING('x', "field-separator", &csv_sep, "separator", 1753 "print counts with custom separator"), 1754 OPT_CALLBACK('G', "cgroup", &evsel_list, "name", 1755 "monitor 
event in cgroup name only", parse_cgroups), 1756 OPT_STRING('o', "output", &output_name, "file", "output file name"), 1757 OPT_BOOLEAN(0, "append", &append_file, "append to the output file"), 1758 OPT_INTEGER(0, "log-fd", &output_fd, 1759 "log output to fd, instead of stderr"), 1760 OPT_STRING(0, "pre", &pre_cmd, "command", 1761 "command to run prior to the measured command"), 1762 OPT_STRING(0, "post", &post_cmd, "command", 1763 "command to run after to the measured command"), 1764 OPT_UINTEGER('I', "interval-print", &interval, 1765 "print counts at regular interval in ms (>= 100)"), 1766 OPT_SET_UINT(0, "per-socket", &aggr_mode, 1767 "aggregate counts per processor socket", AGGR_SOCKET), 1768 OPT_SET_UINT(0, "per-core", &aggr_mode, 1769 "aggregate counts per physical processor core", AGGR_CORE), 1770 OPT_UINTEGER('D', "delay", &initial_delay, 1771 "ms to wait before starting measurement after program start"), 1772 OPT_END() 1773 }; 1774 const char * const stat_usage[] = { 1775 "perf stat [<options>] [<command>]", 1776 NULL 1777 }; 1778 int status = -EINVAL, run_idx; 1779 const char *mode; 1780 1781 setlocale(LC_ALL, ""); 1782 1783 evsel_list = perf_evlist__new(); 1784 if (evsel_list == NULL) 1785 return -ENOMEM; 1786 1787 argc = parse_options(argc, argv, options, stat_usage, 1788 PARSE_OPT_STOP_AT_NON_OPTION); 1789 1790 output = stderr; 1791 if (output_name && strcmp(output_name, "-")) 1792 output = NULL; 1793 1794 if (output_name && output_fd) { 1795 fprintf(stderr, "cannot use both --output and --log-fd\n"); 1796 parse_options_usage(stat_usage, options, "o", 1); 1797 parse_options_usage(NULL, options, "log-fd", 0); 1798 goto out; 1799 } 1800 1801 if (output_fd < 0) { 1802 fprintf(stderr, "argument to --log-fd must be a > 0\n"); 1803 parse_options_usage(stat_usage, options, "log-fd", 0); 1804 goto out; 1805 } 1806 1807 if (!output) { 1808 struct timespec tm; 1809 mode = append_file ? 
"a" : "w"; 1810 1811 output = fopen(output_name, mode); 1812 if (!output) { 1813 perror("failed to create output file"); 1814 return -1; 1815 } 1816 clock_gettime(CLOCK_REALTIME, &tm); 1817 fprintf(output, "# started on %s\n", ctime(&tm.tv_sec)); 1818 } else if (output_fd > 0) { 1819 mode = append_file ? "a" : "w"; 1820 output = fdopen(output_fd, mode); 1821 if (!output) { 1822 perror("Failed opening logfd"); 1823 return -errno; 1824 } 1825 } 1826 1827 if (csv_sep) { 1828 csv_output = true; 1829 if (!strcmp(csv_sep, "\\t")) 1830 csv_sep = "\t"; 1831 } else 1832 csv_sep = DEFAULT_SEPARATOR; 1833 1834 /* 1835 * let the spreadsheet do the pretty-printing 1836 */ 1837 if (csv_output) { 1838 /* User explicitly passed -B? */ 1839 if (big_num_opt == 1) { 1840 fprintf(stderr, "-B option not supported with -x\n"); 1841 parse_options_usage(stat_usage, options, "B", 1); 1842 parse_options_usage(NULL, options, "x", 1); 1843 goto out; 1844 } else /* Nope, so disable big number formatting */ 1845 big_num = false; 1846 } else if (big_num_opt == 0) /* User passed --no-big-num */ 1847 big_num = false; 1848 1849 if (!argc && target__none(&target)) 1850 usage_with_options(stat_usage, options); 1851 1852 if (run_count < 0) { 1853 pr_err("Run count must be a positive number\n"); 1854 parse_options_usage(stat_usage, options, "r", 1); 1855 goto out; 1856 } else if (run_count == 0) { 1857 forever = true; 1858 run_count = 1; 1859 } 1860 1861 /* no_aggr, cgroup are for system-wide only */ 1862 if ((aggr_mode != AGGR_GLOBAL || nr_cgroups) && 1863 !target__has_cpu(&target)) { 1864 fprintf(stderr, "both cgroup and no-aggregation " 1865 "modes only available in system-wide mode\n"); 1866 1867 parse_options_usage(stat_usage, options, "G", 1); 1868 parse_options_usage(NULL, options, "A", 1); 1869 parse_options_usage(NULL, options, "a", 1); 1870 goto out; 1871 } 1872 1873 if (add_default_attributes()) 1874 goto out; 1875 1876 target__validate(&target); 1877 1878 if 
(perf_evlist__create_maps(evsel_list, &target) < 0) { 1879 if (target__has_task(&target)) { 1880 pr_err("Problems finding threads of monitor\n"); 1881 parse_options_usage(stat_usage, options, "p", 1); 1882 parse_options_usage(NULL, options, "t", 1); 1883 } else if (target__has_cpu(&target)) { 1884 perror("failed to parse CPUs map"); 1885 parse_options_usage(stat_usage, options, "C", 1); 1886 parse_options_usage(NULL, options, "a", 1); 1887 } 1888 goto out; 1889 } 1890 if (interval && interval < 100) { 1891 pr_err("print interval must be >= 100ms\n"); 1892 parse_options_usage(stat_usage, options, "I", 1); 1893 goto out; 1894 } 1895 1896 if (perf_evlist__alloc_stats(evsel_list, interval)) 1897 goto out; 1898 1899 if (perf_stat_init_aggr_mode()) 1900 goto out; 1901 1902 /* 1903 * We dont want to block the signals - that would cause 1904 * child tasks to inherit that and Ctrl-C would not work. 1905 * What we want is for Ctrl-C to work in the exec()-ed 1906 * task, but being ignored by perf stat itself: 1907 */ 1908 atexit(sig_atexit); 1909 if (!forever) 1910 signal(SIGINT, skip_signal); 1911 signal(SIGCHLD, skip_signal); 1912 signal(SIGALRM, skip_signal); 1913 signal(SIGABRT, skip_signal); 1914 1915 status = 0; 1916 for (run_idx = 0; forever || run_idx < run_count; run_idx++) { 1917 if (run_count != 1 && verbose) 1918 fprintf(output, "[ perf stat: executing run #%d ... ]\n", 1919 run_idx + 1); 1920 1921 status = run_perf_stat(argc, argv); 1922 if (forever && status != -1) { 1923 print_stat(argc, argv); 1924 perf_stat__reset_stats(evsel_list); 1925 } 1926 } 1927 1928 if (!forever && status != -1 && !interval) 1929 print_stat(argc, argv); 1930 1931 perf_evlist__free_stats(evsel_list); 1932 out: 1933 perf_evlist__delete(evsel_list); 1934 return status; 1935 } 1936