1 /* 2 * builtin-stat.c 3 * 4 * Builtin stat command: Give a precise performance counters summary 5 * overview about any workload, CPU or specific PID. 6 * 7 * Sample output: 8 9 $ perf stat ./hackbench 10 10 11 Time: 0.118 12 13 Performance counter stats for './hackbench 10': 14 15 1708.761321 task-clock # 11.037 CPUs utilized 16 41,190 context-switches # 0.024 M/sec 17 6,735 CPU-migrations # 0.004 M/sec 18 17,318 page-faults # 0.010 M/sec 19 5,205,202,243 cycles # 3.046 GHz 20 3,856,436,920 stalled-cycles-frontend # 74.09% frontend cycles idle 21 1,600,790,871 stalled-cycles-backend # 30.75% backend cycles idle 22 2,603,501,247 instructions # 0.50 insns per cycle 23 # 1.48 stalled cycles per insn 24 484,357,498 branches # 283.455 M/sec 25 6,388,934 branch-misses # 1.32% of all branches 26 27 0.154822978 seconds time elapsed 28 29 * 30 * Copyright (C) 2008-2011, Red Hat Inc, Ingo Molnar <mingo@redhat.com> 31 * 32 * Improvements and fixes by: 33 * 34 * Arjan van de Ven <arjan@linux.intel.com> 35 * Yanmin Zhang <yanmin.zhang@intel.com> 36 * Wu Fengguang <fengguang.wu@intel.com> 37 * Mike Galbraith <efault@gmx.de> 38 * Paul Mackerras <paulus@samba.org> 39 * Jaswinder Singh Rajput <jaswinder@kernel.org> 40 * 41 * Released under the GPL v2. (and only v2, not any later version) 42 */ 43 44 #include "perf.h" 45 #include "builtin.h" 46 #include "util/cgroup.h" 47 #include "util/util.h" 48 #include "util/parse-options.h" 49 #include "util/parse-events.h" 50 #include "util/pmu.h" 51 #include "util/event.h" 52 #include "util/evlist.h" 53 #include "util/evsel.h" 54 #include "util/debug.h" 55 #include "util/color.h" 56 #include "util/stat.h" 57 #include "util/header.h" 58 #include "util/cpumap.h" 59 #include "util/thread.h" 60 #include "util/thread_map.h" 61 62 #include <stdlib.h> 63 #include <sys/prctl.h> 64 #include <locale.h> 65 66 #define DEFAULT_SEPARATOR " " 67 #define CNTR_NOT_SUPPORTED "<not supported>" 68 #define CNTR_NOT_COUNTED "<not counted>" 69 70 static void print_stat(int argc, const char **argv); 71 static void print_counter_aggr(struct perf_evsel *counter, char *prefix); 72 static void print_counter(struct perf_evsel *counter, char *prefix); 73 static void print_aggr(char *prefix); 74 75 /* Default events used for perf stat -T */ 76 static const char * const transaction_attrs[] = { 77 "task-clock", 78 "{" 79 "instructions," 80 "cycles," 81 "cpu/cycles-t/," 82 "cpu/tx-start/," 83 "cpu/el-start/," 84 "cpu/cycles-ct/" 85 "}" 86 }; 87 88 /* More limited version when the CPU does not have all events. */ 89 static const char * const transaction_limited_attrs[] = { 90 "task-clock", 91 "{" 92 "instructions," 93 "cycles," 94 "cpu/cycles-t/," 95 "cpu/tx-start/" 96 "}" 97 }; 98 99 /* must match transaction_attrs and the beginning limited_attrs */ 100 enum { 101 T_TASK_CLOCK, 102 T_INSTRUCTIONS, 103 T_CYCLES, 104 T_CYCLES_IN_TX, 105 T_TRANSACTION_START, 106 T_ELISION_START, 107 T_CYCLES_IN_TX_CP, 108 }; 109 110 static struct perf_evlist *evsel_list; 111 112 static struct target target = { 113 .uid = UINT_MAX, 114 }; 115 116 enum aggr_mode { 117 AGGR_NONE, 118 AGGR_GLOBAL, 119 AGGR_SOCKET, 120 AGGR_CORE, 121 }; 122 123 static int run_count = 1; 124 static bool no_inherit = false; 125 static bool scale = true; 126 static enum aggr_mode aggr_mode = AGGR_GLOBAL; 127 static volatile pid_t child_pid = -1; 128 static bool null_run = false; 129 static int detailed_run = 0; 130 static bool transaction_run; 131 static bool big_num = true; 132 static int big_num_opt = -1; 133 static const char *csv_sep = NULL; 134 static bool csv_output = false; 135 static bool group = false; 136 static FILE *output = NULL; 137 static const char *pre_cmd = NULL; 138 static const char *post_cmd = NULL; 139 static bool sync_run = false; 140 static unsigned int interval = 0; 141 static unsigned int initial_delay = 0; 142 static unsigned int unit_width = 4; /* strlen("unit") */ 143 static bool forever = false; 144 static struct timespec ref_time; 145 static struct cpu_map *aggr_map; 146 static int (*aggr_get_id)(struct cpu_map *m, int cpu); 147 148 static volatile int done = 0; 149 150 struct perf_stat { 151 struct stats res_stats[3]; 152 }; 153 154 static inline void diff_timespec(struct timespec *r, struct timespec *a, 155 struct timespec *b) 156 { 157 r->tv_sec = a->tv_sec - b->tv_sec; 158 if (a->tv_nsec < b->tv_nsec) { 159 r->tv_nsec = a->tv_nsec + 1000000000L - b->tv_nsec; 160 r->tv_sec--; 161 } else { 162 r->tv_nsec = a->tv_nsec - b->tv_nsec ; 163 } 164 } 165 166 static inline struct cpu_map *perf_evsel__cpus(struct perf_evsel *evsel) 167 { 168 return (evsel->cpus && !target.cpu_list) ? evsel->cpus : evsel_list->cpus; 169 } 170 171 static inline int perf_evsel__nr_cpus(struct perf_evsel *evsel) 172 { 173 return perf_evsel__cpus(evsel)->nr; 174 } 175 176 static void perf_evsel__reset_stat_priv(struct perf_evsel *evsel) 177 { 178 int i; 179 struct perf_stat *ps = evsel->priv; 180 181 for (i = 0; i < 3; i++) 182 init_stats(&ps->res_stats[i]); 183 } 184 185 static int perf_evsel__alloc_stat_priv(struct perf_evsel *evsel) 186 { 187 evsel->priv = zalloc(sizeof(struct perf_stat)); 188 if (evsel->priv == NULL) 189 return -ENOMEM; 190 perf_evsel__reset_stat_priv(evsel); 191 return 0; 192 } 193 194 static void perf_evsel__free_stat_priv(struct perf_evsel *evsel) 195 { 196 zfree(&evsel->priv); 197 } 198 199 static int perf_evsel__alloc_prev_raw_counts(struct perf_evsel *evsel) 200 { 201 void *addr; 202 size_t sz; 203 204 sz = sizeof(*evsel->counts) + 205 (perf_evsel__nr_cpus(evsel) * sizeof(struct perf_counts_values)); 206 207 addr = zalloc(sz); 208 if (!addr) 209 return -ENOMEM; 210 211 evsel->prev_raw_counts = addr; 212 213 return 0; 214 } 215 216 static void perf_evsel__free_prev_raw_counts(struct perf_evsel *evsel) 217 { 218 zfree(&evsel->prev_raw_counts); 219 } 220 221 static void perf_evlist__free_stats(struct perf_evlist *evlist) 222 { 223 struct perf_evsel *evsel; 224 225 evlist__for_each(evlist, evsel) { 226 perf_evsel__free_stat_priv(evsel); 227 perf_evsel__free_counts(evsel); 228 perf_evsel__free_prev_raw_counts(evsel); 229 } 230 } 231 232 static int perf_evlist__alloc_stats(struct perf_evlist *evlist, bool alloc_raw) 233 { 234 struct perf_evsel *evsel; 235 236 evlist__for_each(evlist, evsel) { 237 if (perf_evsel__alloc_stat_priv(evsel) < 0 || 238 perf_evsel__alloc_counts(evsel, perf_evsel__nr_cpus(evsel)) < 0 || 239 (alloc_raw && perf_evsel__alloc_prev_raw_counts(evsel) < 0)) 240 goto out_free; 241 } 242 243 return 0; 244 245 out_free: 246 perf_evlist__free_stats(evlist); 247 return -1; 248 } 249 250 static struct stats runtime_nsecs_stats[MAX_NR_CPUS]; 251 static struct stats runtime_cycles_stats[MAX_NR_CPUS]; 252 static struct stats runtime_stalled_cycles_front_stats[MAX_NR_CPUS]; 253 static struct stats runtime_stalled_cycles_back_stats[MAX_NR_CPUS]; 254 static struct stats runtime_branches_stats[MAX_NR_CPUS]; 255 static struct stats runtime_cacherefs_stats[MAX_NR_CPUS]; 256 static struct stats runtime_l1_dcache_stats[MAX_NR_CPUS]; 257 static struct stats runtime_l1_icache_stats[MAX_NR_CPUS]; 258 static struct stats runtime_ll_cache_stats[MAX_NR_CPUS]; 259 static struct stats runtime_itlb_cache_stats[MAX_NR_CPUS]; 260 static struct stats runtime_dtlb_cache_stats[MAX_NR_CPUS]; 261 static struct stats runtime_cycles_in_tx_stats[MAX_NR_CPUS]; 262 static struct stats walltime_nsecs_stats; 263 static struct stats runtime_transaction_stats[MAX_NR_CPUS]; 264 static struct stats runtime_elision_stats[MAX_NR_CPUS]; 265 266 static void perf_stat__reset_stats(struct perf_evlist *evlist) 267 { 268 struct perf_evsel *evsel; 269 270 evlist__for_each(evlist, evsel) { 271 perf_evsel__reset_stat_priv(evsel); 272 perf_evsel__reset_counts(evsel, perf_evsel__nr_cpus(evsel)); 273 } 274 275 memset(runtime_nsecs_stats, 0, sizeof(runtime_nsecs_stats)); 276 memset(runtime_cycles_stats, 0, sizeof(runtime_cycles_stats)); 277 memset(runtime_stalled_cycles_front_stats, 0, sizeof(runtime_stalled_cycles_front_stats)); 278 memset(runtime_stalled_cycles_back_stats, 0, sizeof(runtime_stalled_cycles_back_stats)); 279 memset(runtime_branches_stats, 0, sizeof(runtime_branches_stats)); 280 memset(runtime_cacherefs_stats, 0, sizeof(runtime_cacherefs_stats)); 281 memset(runtime_l1_dcache_stats, 0, sizeof(runtime_l1_dcache_stats)); 282 memset(runtime_l1_icache_stats, 0, sizeof(runtime_l1_icache_stats)); 283 memset(runtime_ll_cache_stats, 0, sizeof(runtime_ll_cache_stats)); 284 memset(runtime_itlb_cache_stats, 0, sizeof(runtime_itlb_cache_stats)); 285 memset(runtime_dtlb_cache_stats, 0, sizeof(runtime_dtlb_cache_stats)); 286 memset(runtime_cycles_in_tx_stats, 0, 287 sizeof(runtime_cycles_in_tx_stats)); 288 memset(runtime_transaction_stats, 0, 289 sizeof(runtime_transaction_stats)); 290 memset(runtime_elision_stats, 0, sizeof(runtime_elision_stats)); 291 memset(&walltime_nsecs_stats, 0, sizeof(walltime_nsecs_stats)); 292 } 293 294 static int create_perf_stat_counter(struct perf_evsel *evsel) 295 { 296 struct perf_event_attr *attr = &evsel->attr; 297 298 if (scale) 299 attr->read_format = PERF_FORMAT_TOTAL_TIME_ENABLED | 300 PERF_FORMAT_TOTAL_TIME_RUNNING; 301 302 attr->inherit = !no_inherit; 303 304 if (target__has_cpu(&target)) 305 return perf_evsel__open_per_cpu(evsel, perf_evsel__cpus(evsel)); 306 307 if (!target__has_task(&target) && perf_evsel__is_group_leader(evsel)) { 308 attr->disabled = 1; 309 if (!initial_delay) 310 attr->enable_on_exec = 1; 311 } 312 313 return perf_evsel__open_per_thread(evsel, evsel_list->threads); 314 } 315 316 /* 317 * Does the counter have nsecs as a unit? 318 */ 319 static inline int nsec_counter(struct perf_evsel *evsel) 320 { 321 if (perf_evsel__match(evsel, SOFTWARE, SW_CPU_CLOCK) || 322 perf_evsel__match(evsel, SOFTWARE, SW_TASK_CLOCK)) 323 return 1; 324 325 return 0; 326 } 327 328 static struct perf_evsel *nth_evsel(int n) 329 { 330 static struct perf_evsel **array; 331 static int array_len; 332 struct perf_evsel *ev; 333 int j; 334 335 /* Assumes this only called when evsel_list does not change anymore. */ 336 if (!array) { 337 evlist__for_each(evsel_list, ev) 338 array_len++; 339 array = malloc(array_len * sizeof(void *)); 340 if (!array) 341 exit(ENOMEM); 342 j = 0; 343 evlist__for_each(evsel_list, ev) 344 array[j++] = ev; 345 } 346 if (n < array_len) 347 return array[n]; 348 return NULL; 349 } 350 351 /* 352 * Update various tracking values we maintain to print 353 * more semantic information such as miss/hit ratios, 354 * instruction rates, etc: 355 */ 356 static void update_shadow_stats(struct perf_evsel *counter, u64 *count) 357 { 358 if (perf_evsel__match(counter, SOFTWARE, SW_TASK_CLOCK)) 359 update_stats(&runtime_nsecs_stats[0], count[0]); 360 else if (perf_evsel__match(counter, HARDWARE, HW_CPU_CYCLES)) 361 update_stats(&runtime_cycles_stats[0], count[0]); 362 else if (transaction_run && 363 perf_evsel__cmp(counter, nth_evsel(T_CYCLES_IN_TX))) 364 update_stats(&runtime_cycles_in_tx_stats[0], count[0]); 365 else if (transaction_run && 366 perf_evsel__cmp(counter, nth_evsel(T_TRANSACTION_START))) 367 update_stats(&runtime_transaction_stats[0], count[0]); 368 else if (transaction_run && 369 perf_evsel__cmp(counter, nth_evsel(T_ELISION_START))) 370 update_stats(&runtime_elision_stats[0], count[0]); 371 else if (perf_evsel__match(counter, HARDWARE, HW_STALLED_CYCLES_FRONTEND)) 372 update_stats(&runtime_stalled_cycles_front_stats[0], count[0]); 373 else if (perf_evsel__match(counter, HARDWARE, HW_STALLED_CYCLES_BACKEND)) 374 update_stats(&runtime_stalled_cycles_back_stats[0], count[0]); 375 else if (perf_evsel__match(counter, HARDWARE, HW_BRANCH_INSTRUCTIONS)) 376 update_stats(&runtime_branches_stats[0], count[0]); 377 else if (perf_evsel__match(counter, HARDWARE, HW_CACHE_REFERENCES)) 378 update_stats(&runtime_cacherefs_stats[0], count[0]); 379 else if (perf_evsel__match(counter, HW_CACHE, HW_CACHE_L1D)) 380 update_stats(&runtime_l1_dcache_stats[0], count[0]); 381 else if (perf_evsel__match(counter, HW_CACHE, HW_CACHE_L1I)) 382 update_stats(&runtime_l1_icache_stats[0], count[0]); 383 else if (perf_evsel__match(counter, HW_CACHE, HW_CACHE_LL)) 384 update_stats(&runtime_ll_cache_stats[0], count[0]); 385 else if (perf_evsel__match(counter, HW_CACHE, HW_CACHE_DTLB)) 386 update_stats(&runtime_dtlb_cache_stats[0], count[0]); 387 else if (perf_evsel__match(counter, HW_CACHE, HW_CACHE_ITLB)) 388 update_stats(&runtime_itlb_cache_stats[0], count[0]); 389 } 390 391 static void zero_per_pkg(struct perf_evsel *counter) 392 { 393 if (counter->per_pkg_mask) 394 memset(counter->per_pkg_mask, 0, MAX_NR_CPUS); 395 } 396 397 static int check_per_pkg(struct perf_evsel *counter, int cpu, bool *skip) 398 { 399 unsigned long *mask = counter->per_pkg_mask; 400 struct cpu_map *cpus = perf_evsel__cpus(counter); 401 int s; 402 403 *skip = false; 404 405 if (!counter->per_pkg) 406 return 0; 407 408 if (cpu_map__empty(cpus)) 409 return 0; 410 411 if (!mask) { 412 mask = zalloc(MAX_NR_CPUS); 413 if (!mask) 414 return -ENOMEM; 415 416 counter->per_pkg_mask = mask; 417 } 418 419 s = cpu_map__get_socket(cpus, cpu); 420 if (s < 0) 421 return -1; 422 423 *skip = test_and_set_bit(s, mask) == 1; 424 return 0; 425 } 426 427 static int read_cb(struct perf_evsel *evsel, int cpu, int thread __maybe_unused, 428 struct perf_counts_values *count) 429 { 430 struct perf_counts_values *aggr = &evsel->counts->aggr; 431 static struct perf_counts_values zero; 432 bool skip = false; 433 434 if (check_per_pkg(evsel, cpu, &skip)) { 435 pr_err("failed to read per-pkg counter\n"); 436 return -1; 437 } 438 439 if (skip) 440 count = &zero; 441 442 switch (aggr_mode) { 443 case AGGR_CORE: 444 case AGGR_SOCKET: 445 case AGGR_NONE: 446 if (!evsel->snapshot) 447 perf_evsel__compute_deltas(evsel, cpu, count); 448 perf_counts_values__scale(count, scale, NULL); 449 evsel->counts->cpu[cpu] = *count; 450 update_shadow_stats(evsel, count->values); 451 break; 452 case AGGR_GLOBAL: 453 aggr->val += count->val; 454 if (scale) { 455 aggr->ena += count->ena; 456 aggr->run += count->run; 457 } 458 default: 459 break; 460 } 461 462 return 0; 463 } 464 465 static int read_counter(struct perf_evsel *counter); 466 467 /* 468 * Read out the results of a single counter: 469 * aggregate counts across CPUs in system-wide mode 470 */ 471 static int read_counter_aggr(struct perf_evsel *counter) 472 { 473 struct perf_counts_values *aggr = &counter->counts->aggr; 474 struct perf_stat *ps = counter->priv; 475 u64 *count = counter->counts->aggr.values; 476 int i; 477 478 aggr->val = aggr->ena = aggr->run = 0; 479 480 if (read_counter(counter)) 481 return -1; 482 483 if (!counter->snapshot) 484 perf_evsel__compute_deltas(counter, -1, aggr); 485 perf_counts_values__scale(aggr, scale, &counter->counts->scaled); 486 487 for (i = 0; i < 3; i++) 488 update_stats(&ps->res_stats[i], count[i]); 489 490 if (verbose) { 491 fprintf(output, "%s: %" PRIu64 " %" PRIu64 " %" PRIu64 "\n", 492 perf_evsel__name(counter), count[0], count[1], count[2]); 493 } 494 495 /* 496 * Save the full runtime - to allow normalization during printout: 497 */ 498 update_shadow_stats(counter, count); 499 500 return 0; 501 } 502 503 /* 504 * Read out the results of a single counter: 505 * do not aggregate counts across CPUs in system-wide mode 506 */ 507 static int read_counter(struct perf_evsel *counter) 508 { 509 int nthreads = thread_map__nr(evsel_list->threads); 510 int ncpus = perf_evsel__nr_cpus(counter); 511 int cpu, thread; 512 513 if (counter->system_wide) 514 nthreads = 1; 515 516 if (counter->per_pkg) 517 zero_per_pkg(counter); 518 519 for (thread = 0; thread < nthreads; thread++) { 520 for (cpu = 0; cpu < ncpus; cpu++) { 521 if (perf_evsel__read_cb(counter, cpu, thread, read_cb)) 522 return -1; 523 } 524 } 525 526 return 0; 527 } 528 529 static void print_interval(void) 530 { 531 static int num_print_interval; 532 struct perf_evsel *counter; 533 struct perf_stat *ps; 534 struct timespec ts, rs; 535 char prefix[64]; 536 537 if (aggr_mode == AGGR_GLOBAL) { 538 evlist__for_each(evsel_list, counter) { 539 ps = counter->priv; 540 memset(ps->res_stats, 0, sizeof(ps->res_stats)); 541 read_counter_aggr(counter); 542 } 543 } else { 544 evlist__for_each(evsel_list, counter) { 545 ps = counter->priv; 546 memset(ps->res_stats, 0, sizeof(ps->res_stats)); 547 read_counter(counter); 548 } 549 } 550 551 clock_gettime(CLOCK_MONOTONIC, &ts); 552 diff_timespec(&rs, &ts, &ref_time); 553 sprintf(prefix, "%6lu.%09lu%s", rs.tv_sec, rs.tv_nsec, csv_sep); 554 555 if (num_print_interval == 0 && !csv_output) { 556 switch (aggr_mode) { 557 case AGGR_SOCKET: 558 fprintf(output, "# time socket cpus counts %*s events\n", unit_width, "unit"); 559 break; 560 case AGGR_CORE: 561 fprintf(output, "# time core cpus counts %*s events\n", unit_width, "unit"); 562 break; 563 case AGGR_NONE: 564 fprintf(output, "# time CPU counts %*s events\n", unit_width, "unit"); 565 break; 566 case AGGR_GLOBAL: 567 default: 568 fprintf(output, "# time counts %*s events\n", unit_width, "unit"); 569 } 570 } 571 572 if (++num_print_interval == 25) 573 num_print_interval = 0; 574 575 switch (aggr_mode) { 576 case AGGR_CORE: 577 case AGGR_SOCKET: 578 print_aggr(prefix); 579 break; 580 case AGGR_NONE: 581 evlist__for_each(evsel_list, counter) 582 print_counter(counter, prefix); 583 break; 584 case AGGR_GLOBAL: 585 default: 586 evlist__for_each(evsel_list, counter) 587 print_counter_aggr(counter, prefix); 588 } 589 590 fflush(output); 591 } 592 593 static void handle_initial_delay(void) 594 { 595 struct perf_evsel *counter; 596 597 if (initial_delay) { 598 const int ncpus = cpu_map__nr(evsel_list->cpus), 599 nthreads = thread_map__nr(evsel_list->threads); 600 601 usleep(initial_delay * 1000); 602 evlist__for_each(evsel_list, counter) 603 perf_evsel__enable(counter, ncpus, nthreads); 604 } 605 } 606 607 static volatile int workload_exec_errno; 608 609 /* 610 * perf_evlist__prepare_workload will send a SIGUSR1 611 * if the fork fails, since we asked by setting its 612 * want_signal to true. 613 */ 614 static void workload_exec_failed_signal(int signo __maybe_unused, siginfo_t *info, 615 void *ucontext __maybe_unused) 616 { 617 workload_exec_errno = info->si_value.sival_int; 618 } 619 620 static int __run_perf_stat(int argc, const char **argv) 621 { 622 char msg[512]; 623 unsigned long long t0, t1; 624 struct perf_evsel *counter; 625 struct timespec ts; 626 size_t l; 627 int status = 0; 628 const bool forks = (argc > 0); 629 630 if (interval) { 631 ts.tv_sec = interval / 1000; 632 ts.tv_nsec = (interval % 1000) * 1000000; 633 } else { 634 ts.tv_sec = 1; 635 ts.tv_nsec = 0; 636 } 637 638 if (forks) { 639 if (perf_evlist__prepare_workload(evsel_list, &target, argv, false, 640 workload_exec_failed_signal) < 0) { 641 perror("failed to prepare workload"); 642 return -1; 643 } 644 child_pid = evsel_list->workload.pid; 645 } 646 647 if (group) 648 perf_evlist__set_leader(evsel_list); 649 650 evlist__for_each(evsel_list, counter) { 651 if (create_perf_stat_counter(counter) < 0) { 652 /* 653 * PPC returns ENXIO for HW counters until 2.6.37 654 * (behavior changed with commit b0a873e). 655 */ 656 if (errno == EINVAL || errno == ENOSYS || 657 errno == ENOENT || errno == EOPNOTSUPP || 658 errno == ENXIO) { 659 if (verbose) 660 ui__warning("%s event is not supported by the kernel.\n", 661 perf_evsel__name(counter)); 662 counter->supported = false; 663 continue; 664 } 665 666 perf_evsel__open_strerror(counter, &target, 667 errno, msg, sizeof(msg)); 668 ui__error("%s\n", msg); 669 670 if (child_pid != -1) 671 kill(child_pid, SIGTERM); 672 673 return -1; 674 } 675 counter->supported = true; 676 677 l = strlen(counter->unit); 678 if (l > unit_width) 679 unit_width = l; 680 } 681 682 if (perf_evlist__apply_filters(evsel_list)) { 683 error("failed to set filter with %d (%s)\n", errno, 684 strerror_r(errno, msg, sizeof(msg))); 685 return -1; 686 } 687 688 /* 689 * Enable counters and exec the command: 690 */ 691 t0 = rdclock(); 692 clock_gettime(CLOCK_MONOTONIC, &ref_time); 693 694 if (forks) { 695 perf_evlist__start_workload(evsel_list); 696 handle_initial_delay(); 697 698 if (interval) { 699 while (!waitpid(child_pid, &status, WNOHANG)) { 700 nanosleep(&ts, NULL); 701 print_interval(); 702 } 703 } 704 wait(&status); 705 706 if (workload_exec_errno) { 707 const char *emsg = strerror_r(workload_exec_errno, msg, sizeof(msg)); 708 pr_err("Workload failed: %s\n", emsg); 709 return -1; 710 } 711 712 if (WIFSIGNALED(status)) 713 psignal(WTERMSIG(status), argv[0]); 714 } else { 715 handle_initial_delay(); 716 while (!done) { 717 nanosleep(&ts, NULL); 718 if (interval) 719 print_interval(); 720 } 721 } 722 723 t1 = rdclock(); 724 725 update_stats(&walltime_nsecs_stats, t1 - t0); 726 727 if (aggr_mode == AGGR_GLOBAL) { 728 evlist__for_each(evsel_list, counter) { 729 read_counter_aggr(counter); 730 perf_evsel__close_fd(counter, perf_evsel__nr_cpus(counter), 731 thread_map__nr(evsel_list->threads)); 732 } 733 } else { 734 evlist__for_each(evsel_list, counter) { 735 read_counter(counter); 736 perf_evsel__close_fd(counter, perf_evsel__nr_cpus(counter), 1); 737 } 738 } 739 740 return WEXITSTATUS(status); 741 } 742 743 static int run_perf_stat(int argc, const char **argv) 744 { 745 int ret; 746 747 if (pre_cmd) { 748 ret = system(pre_cmd); 749 if (ret) 750 return ret; 751 } 752 753 if (sync_run) 754 sync(); 755 756 ret = __run_perf_stat(argc, argv); 757 if (ret) 758 return ret; 759 760 if (post_cmd) { 761 ret = system(post_cmd); 762 if (ret) 763 return ret; 764 } 765 766 return ret; 767 } 768 769 static void print_noise_pct(double total, double avg) 770 { 771 double pct = rel_stddev_stats(total, avg); 772 773 if (csv_output) 774 fprintf(output, "%s%.2f%%", csv_sep, pct); 775 else if (pct) 776 fprintf(output, " ( +-%6.2f%% )", pct); 777 } 778 779 static void print_noise(struct perf_evsel *evsel, double avg) 780 { 781 struct perf_stat *ps; 782 783 if (run_count == 1) 784 return; 785 786 ps = evsel->priv; 787 print_noise_pct(stddev_stats(&ps->res_stats[0]), avg); 788 } 789 790 static void aggr_printout(struct perf_evsel *evsel, int id, int nr) 791 { 792 switch (aggr_mode) { 793 case AGGR_CORE: 794 fprintf(output, "S%d-C%*d%s%*d%s", 795 cpu_map__id_to_socket(id), 796 csv_output ? 0 : -8, 797 cpu_map__id_to_cpu(id), 798 csv_sep, 799 csv_output ? 0 : 4, 800 nr, 801 csv_sep); 802 break; 803 case AGGR_SOCKET: 804 fprintf(output, "S%*d%s%*d%s", 805 csv_output ? 0 : -5, 806 id, 807 csv_sep, 808 csv_output ? 0 : 4, 809 nr, 810 csv_sep); 811 break; 812 case AGGR_NONE: 813 fprintf(output, "CPU%*d%s", 814 csv_output ? 0 : -4, 815 perf_evsel__cpus(evsel)->map[id], csv_sep); 816 break; 817 case AGGR_GLOBAL: 818 default: 819 break; 820 } 821 } 822 823 static void nsec_printout(int id, int nr, struct perf_evsel *evsel, double avg) 824 { 825 double msecs = avg / 1e6; 826 const char *fmt_v, *fmt_n; 827 char name[25]; 828 829 fmt_v = csv_output ? "%.6f%s" : "%18.6f%s"; 830 fmt_n = csv_output ? "%s" : "%-25s"; 831 832 aggr_printout(evsel, id, nr); 833 834 scnprintf(name, sizeof(name), "%s%s", 835 perf_evsel__name(evsel), csv_output ? "" : " (msec)"); 836 837 fprintf(output, fmt_v, msecs, csv_sep); 838 839 if (csv_output) 840 fprintf(output, "%s%s", evsel->unit, csv_sep); 841 else 842 fprintf(output, "%-*s%s", unit_width, evsel->unit, csv_sep); 843 844 fprintf(output, fmt_n, name); 845 846 if (evsel->cgrp) 847 fprintf(output, "%s%s", csv_sep, evsel->cgrp->name); 848 849 if (csv_output || interval) 850 return; 851 852 if (perf_evsel__match(evsel, SOFTWARE, SW_TASK_CLOCK)) 853 fprintf(output, " # %8.3f CPUs utilized ", 854 avg / avg_stats(&walltime_nsecs_stats)); 855 else 856 fprintf(output, " "); 857 } 858 859 /* used for get_ratio_color() */ 860 enum grc_type { 861 GRC_STALLED_CYCLES_FE, 862 GRC_STALLED_CYCLES_BE, 863 GRC_CACHE_MISSES, 864 GRC_MAX_NR 865 }; 866 867 static const char *get_ratio_color(enum grc_type type, double ratio) 868 { 869 static const double grc_table[GRC_MAX_NR][3] = { 870 [GRC_STALLED_CYCLES_FE] = { 50.0, 30.0, 10.0 }, 871 [GRC_STALLED_CYCLES_BE] = { 75.0, 50.0, 20.0 }, 872 [GRC_CACHE_MISSES] = { 20.0, 10.0, 5.0 }, 873 }; 874 const char *color = PERF_COLOR_NORMAL; 875 876 if (ratio > grc_table[type][0]) 877 color = PERF_COLOR_RED; 878 else if (ratio > grc_table[type][1]) 879 color = PERF_COLOR_MAGENTA; 880 else if (ratio > grc_table[type][2]) 881 color = PERF_COLOR_YELLOW; 882 883 return color; 884 } 885 886 static void print_stalled_cycles_frontend(int cpu, 887 struct perf_evsel *evsel 888 __maybe_unused, double avg) 889 { 890 double total, ratio = 0.0; 891 const char *color; 892 893 total = avg_stats(&runtime_cycles_stats[cpu]); 894 895 if (total) 896 ratio = avg / total * 100.0; 897 898 color = get_ratio_color(GRC_STALLED_CYCLES_FE, ratio); 899 900 fprintf(output, " # "); 901 color_fprintf(output, color, "%6.2f%%", ratio); 902 fprintf(output, " frontend cycles idle "); 903 } 904 905 static void print_stalled_cycles_backend(int cpu, 906 struct perf_evsel *evsel 907 __maybe_unused, double avg) 908 { 909 double total, ratio = 0.0; 910 const char *color; 911 912 total = avg_stats(&runtime_cycles_stats[cpu]); 913 914 if (total) 915 ratio = avg / total * 100.0; 916 917 color = get_ratio_color(GRC_STALLED_CYCLES_BE, ratio); 918 919 fprintf(output, " # "); 920 color_fprintf(output, color, "%6.2f%%", ratio); 921 fprintf(output, " backend cycles idle "); 922 } 923 924 static void print_branch_misses(int cpu, 925 struct perf_evsel *evsel __maybe_unused, 926 double avg) 927 { 928 double total, ratio = 0.0; 929 const char *color; 930 931 total = avg_stats(&runtime_branches_stats[cpu]); 932 933 if (total) 934 ratio = avg / total * 100.0; 935 936 color = get_ratio_color(GRC_CACHE_MISSES, ratio); 937 938 fprintf(output, " # "); 939 color_fprintf(output, color, "%6.2f%%", ratio); 940 fprintf(output, " of all branches "); 941 } 942 943 static void print_l1_dcache_misses(int cpu, 944 struct perf_evsel *evsel __maybe_unused, 945 double avg) 946 { 947 double total, ratio = 0.0; 948 const char *color; 949 950 total = avg_stats(&runtime_l1_dcache_stats[cpu]); 951 952 if (total) 953 ratio = avg / total * 100.0; 954 955 color = get_ratio_color(GRC_CACHE_MISSES, ratio); 956 957 fprintf(output, " # "); 958 color_fprintf(output, color, "%6.2f%%", ratio); 959 fprintf(output, " of all L1-dcache hits "); 960 } 961 962 static void print_l1_icache_misses(int cpu, 963 struct perf_evsel *evsel __maybe_unused, 964 double avg) 965 { 966 double total, ratio = 0.0; 967 const char *color; 968 969 total = avg_stats(&runtime_l1_icache_stats[cpu]); 970 971 if (total) 972 ratio = avg / total * 100.0; 973 974 color = get_ratio_color(GRC_CACHE_MISSES, ratio); 975 976 fprintf(output, " # "); 977 color_fprintf(output, color, "%6.2f%%", ratio); 978 fprintf(output, " of all L1-icache hits "); 979 } 980 981 static void print_dtlb_cache_misses(int cpu, 982 struct perf_evsel *evsel __maybe_unused, 983 double avg) 984 { 985 double total, ratio = 0.0; 986 const char *color; 987 988 total = avg_stats(&runtime_dtlb_cache_stats[cpu]); 989 990 if (total) 991 ratio = avg / total * 100.0; 992 993 color = get_ratio_color(GRC_CACHE_MISSES, ratio); 994 995 fprintf(output, " # "); 996 color_fprintf(output, color, "%6.2f%%", ratio); 997 fprintf(output, " of all dTLB cache hits "); 998 } 999 1000 static void print_itlb_cache_misses(int cpu, 1001 struct perf_evsel *evsel __maybe_unused, 1002 double avg) 1003 { 1004 double total, ratio = 0.0; 1005 const char *color; 1006 1007 total = avg_stats(&runtime_itlb_cache_stats[cpu]); 1008 1009 if (total) 1010 ratio = avg / total * 100.0; 1011 1012 color = get_ratio_color(GRC_CACHE_MISSES, ratio); 1013 1014 fprintf(output, " # "); 1015 color_fprintf(output, color, "%6.2f%%", ratio); 1016 fprintf(output, " of all iTLB cache hits "); 1017 } 1018 1019 static void print_ll_cache_misses(int cpu, 1020 struct perf_evsel *evsel __maybe_unused, 1021 double avg) 1022 { 1023 double total, ratio = 0.0; 1024 const char *color; 1025 1026 total = avg_stats(&runtime_ll_cache_stats[cpu]); 1027 1028 if (total) 1029 ratio = avg / total * 100.0; 1030 1031 color = get_ratio_color(GRC_CACHE_MISSES, ratio); 1032 1033 fprintf(output, " # "); 1034 color_fprintf(output, color, "%6.2f%%", ratio); 1035 fprintf(output, " of all LL-cache hits "); 1036 } 1037 1038 static void abs_printout(int id, int nr, struct perf_evsel *evsel, double avg) 1039 { 1040 double total, ratio = 0.0, total2; 1041 double sc = evsel->scale; 1042 const char *fmt; 1043 int cpu = cpu_map__id_to_cpu(id); 1044 1045 if (csv_output) { 1046 fmt = sc != 1.0 ? "%.2f%s" : "%.0f%s"; 1047 } else { 1048 if (big_num) 1049 fmt = sc != 1.0 ? "%'18.2f%s" : "%'18.0f%s"; 1050 else 1051 fmt = sc != 1.0 ? "%18.2f%s" : "%18.0f%s"; 1052 } 1053 1054 aggr_printout(evsel, id, nr); 1055 1056 if (aggr_mode == AGGR_GLOBAL) 1057 cpu = 0; 1058 1059 fprintf(output, fmt, avg, csv_sep); 1060 1061 if (evsel->unit) 1062 fprintf(output, "%-*s%s", 1063 csv_output ? 0 : unit_width, 1064 evsel->unit, csv_sep); 1065 1066 fprintf(output, "%-*s", csv_output ? 0 : 25, perf_evsel__name(evsel)); 1067 1068 if (evsel->cgrp) 1069 fprintf(output, "%s%s", csv_sep, evsel->cgrp->name); 1070 1071 if (csv_output || interval) 1072 return; 1073 1074 if (perf_evsel__match(evsel, HARDWARE, HW_INSTRUCTIONS)) { 1075 total = avg_stats(&runtime_cycles_stats[cpu]); 1076 if (total) { 1077 ratio = avg / total; 1078 fprintf(output, " # %5.2f insns per cycle ", ratio); 1079 } 1080 total = avg_stats(&runtime_stalled_cycles_front_stats[cpu]); 1081 total = max(total, avg_stats(&runtime_stalled_cycles_back_stats[cpu])); 1082 1083 if (total && avg) { 1084 ratio = total / avg; 1085 fprintf(output, "\n"); 1086 if (aggr_mode == AGGR_NONE) 1087 fprintf(output, " "); 1088 fprintf(output, " # %5.2f stalled cycles per insn", ratio); 1089 } 1090 1091 } else if (perf_evsel__match(evsel, HARDWARE, HW_BRANCH_MISSES) && 1092 runtime_branches_stats[cpu].n != 0) { 1093 print_branch_misses(cpu, evsel, avg); 1094 } else if ( 1095 evsel->attr.type == PERF_TYPE_HW_CACHE && 1096 evsel->attr.config == ( PERF_COUNT_HW_CACHE_L1D | 1097 ((PERF_COUNT_HW_CACHE_OP_READ) << 8) | 1098 ((PERF_COUNT_HW_CACHE_RESULT_MISS) << 16)) && 1099 runtime_l1_dcache_stats[cpu].n != 0) { 1100 print_l1_dcache_misses(cpu, evsel, avg); 1101 } else if ( 1102 evsel->attr.type == PERF_TYPE_HW_CACHE && 1103 evsel->attr.config == ( PERF_COUNT_HW_CACHE_L1I | 1104 ((PERF_COUNT_HW_CACHE_OP_READ) << 8) | 1105 ((PERF_COUNT_HW_CACHE_RESULT_MISS) << 16)) && 1106 runtime_l1_icache_stats[cpu].n != 0) { 1107 print_l1_icache_misses(cpu, evsel, avg); 1108 } else if ( 1109 evsel->attr.type == PERF_TYPE_HW_CACHE && 1110 evsel->attr.config == ( PERF_COUNT_HW_CACHE_DTLB | 1111 ((PERF_COUNT_HW_CACHE_OP_READ) << 8) | 1112 ((PERF_COUNT_HW_CACHE_RESULT_MISS) << 16)) && 1113 runtime_dtlb_cache_stats[cpu].n != 0) { 1114 print_dtlb_cache_misses(cpu, evsel, avg); 1115 } else if ( 1116 evsel->attr.type == PERF_TYPE_HW_CACHE && 1117 evsel->attr.config == ( PERF_COUNT_HW_CACHE_ITLB | 1118 ((PERF_COUNT_HW_CACHE_OP_READ) << 8) | 1119 ((PERF_COUNT_HW_CACHE_RESULT_MISS) << 16)) && 1120 runtime_itlb_cache_stats[cpu].n != 0) { 1121 print_itlb_cache_misses(cpu, evsel, avg); 1122 } else if ( 1123 evsel->attr.type == PERF_TYPE_HW_CACHE && 1124 evsel->attr.config == ( PERF_COUNT_HW_CACHE_LL | 1125 ((PERF_COUNT_HW_CACHE_OP_READ) << 8) | 1126 ((PERF_COUNT_HW_CACHE_RESULT_MISS) << 16)) && 1127 runtime_ll_cache_stats[cpu].n != 0) { 1128 print_ll_cache_misses(cpu, evsel, avg); 1129 } else if (perf_evsel__match(evsel, HARDWARE, HW_CACHE_MISSES) && 1130 runtime_cacherefs_stats[cpu].n != 0) { 1131 total = avg_stats(&runtime_cacherefs_stats[cpu]); 1132 1133 if (total) 1134 ratio = avg * 100 / total; 1135 1136 fprintf(output, " # %8.3f %% of all cache refs ", ratio); 1137 1138 } else if (perf_evsel__match(evsel, HARDWARE, HW_STALLED_CYCLES_FRONTEND)) { 1139 print_stalled_cycles_frontend(cpu, evsel, avg); 1140 } else if (perf_evsel__match(evsel, HARDWARE, HW_STALLED_CYCLES_BACKEND)) { 1141 print_stalled_cycles_backend(cpu, evsel, avg); 1142 } else if (perf_evsel__match(evsel, HARDWARE, HW_CPU_CYCLES)) { 1143 total = avg_stats(&runtime_nsecs_stats[cpu]); 1144 1145 if (total) { 1146 ratio = avg / total; 1147 fprintf(output, " # %8.3f GHz ", ratio); 1148 } 1149 } else if (transaction_run && 1150 perf_evsel__cmp(evsel, nth_evsel(T_CYCLES_IN_TX))) { 1151 total = avg_stats(&runtime_cycles_stats[cpu]); 1152 if (total) 1153 fprintf(output, 1154 " # %5.2f%% transactional cycles ", 1155 100.0 * (avg / total)); 1156 } else if (transaction_run && 1157 perf_evsel__cmp(evsel, nth_evsel(T_CYCLES_IN_TX_CP))) { 1158 total = avg_stats(&runtime_cycles_stats[cpu]); 1159 total2 = avg_stats(&runtime_cycles_in_tx_stats[cpu]); 1160 if (total2 < avg) 1161 total2 = avg; 1162 if (total) 1163 fprintf(output, 1164 " # %5.2f%% aborted cycles ", 1165 100.0 * ((total2-avg) / total)); 1166 } else if (transaction_run && 1167 perf_evsel__cmp(evsel, nth_evsel(T_TRANSACTION_START)) && 1168 avg > 0 && 1169 runtime_cycles_in_tx_stats[cpu].n != 0) { 1170 total = avg_stats(&runtime_cycles_in_tx_stats[cpu]); 1171 1172 if (total) 1173 ratio = total / avg; 1174 1175 fprintf(output, " # %8.0f cycles / transaction ", ratio); 1176 } else if (transaction_run && 1177 perf_evsel__cmp(evsel, nth_evsel(T_ELISION_START)) && 1178 avg > 0 && 1179 runtime_cycles_in_tx_stats[cpu].n != 0) { 1180 total = avg_stats(&runtime_cycles_in_tx_stats[cpu]); 1181 1182 if (total) 1183 ratio = total / avg; 1184 1185 fprintf(output, " # %8.0f cycles / elision ", ratio); 1186 } else if (runtime_nsecs_stats[cpu].n != 0) { 1187 char unit = 'M'; 1188 1189 total = avg_stats(&runtime_nsecs_stats[cpu]); 1190 1191 if (total) 1192 ratio = 1000.0 * avg / total; 1193 if (ratio < 0.001) { 1194 ratio *= 1000; 1195 unit = 'K'; 1196 } 1197 1198 fprintf(output, " # %8.3f %c/sec ", ratio, unit); 1199 } else { 1200 fprintf(output, " "); 1201 } 1202 } 1203 1204 static void print_aggr(char *prefix) 1205 { 1206 struct perf_evsel *counter; 1207 int cpu, cpu2, s, s2, id, nr; 1208 double uval; 1209 u64 ena, run, val; 1210 1211 if (!(aggr_map || aggr_get_id)) 1212 return; 1213 1214 for (s = 0; s < aggr_map->nr; s++) { 1215 id = aggr_map->map[s]; 1216 evlist__for_each(evsel_list, counter) { 1217 val = ena = run = 0; 1218 nr = 0; 1219 for (cpu = 0; cpu < perf_evsel__nr_cpus(counter); cpu++) { 1220 cpu2 = perf_evsel__cpus(counter)->map[cpu]; 1221 s2 = aggr_get_id(evsel_list->cpus, cpu2); 1222 if (s2 != id) 1223 continue; 1224 val += counter->counts->cpu[cpu].val; 1225 ena += counter->counts->cpu[cpu].ena; 1226 run += counter->counts->cpu[cpu].run; 1227 nr++; 1228 } 1229 if (prefix) 1230 fprintf(output, "%s", prefix); 1231 1232 if (run == 0 || ena == 0) { 1233 aggr_printout(counter, id, nr); 1234 1235 fprintf(output, "%*s%s", 1236 csv_output ? 0 : 18, 1237 counter->supported ? CNTR_NOT_COUNTED : CNTR_NOT_SUPPORTED, 1238 csv_sep); 1239 1240 fprintf(output, "%-*s%s", 1241 csv_output ? 0 : unit_width, 1242 counter->unit, csv_sep); 1243 1244 fprintf(output, "%*s", 1245 csv_output ? 0 : -25, 1246 perf_evsel__name(counter)); 1247 1248 if (counter->cgrp) 1249 fprintf(output, "%s%s", 1250 csv_sep, counter->cgrp->name); 1251 1252 fputc('\n', output); 1253 continue; 1254 } 1255 uval = val * counter->scale; 1256 1257 if (nsec_counter(counter)) 1258 nsec_printout(id, nr, counter, uval); 1259 else 1260 abs_printout(id, nr, counter, uval); 1261 1262 if (!csv_output) { 1263 print_noise(counter, 1.0); 1264 1265 if (run != ena) 1266 fprintf(output, " (%.2f%%)", 1267 100.0 * run / ena); 1268 } 1269 fputc('\n', output); 1270 } 1271 } 1272 } 1273 1274 /* 1275 * Print out the results of a single counter: 1276 * aggregated counts in system-wide mode 1277 */ 1278 static void print_counter_aggr(struct perf_evsel *counter, char *prefix) 1279 { 1280 struct perf_stat *ps = counter->priv; 1281 double avg = avg_stats(&ps->res_stats[0]); 1282 int scaled = counter->counts->scaled; 1283 double uval; 1284 1285 if (prefix) 1286 fprintf(output, "%s", prefix); 1287 1288 if (scaled == -1) { 1289 fprintf(output, "%*s%s", 1290 csv_output ? 0 : 18, 1291 counter->supported ? CNTR_NOT_COUNTED : CNTR_NOT_SUPPORTED, 1292 csv_sep); 1293 fprintf(output, "%-*s%s", 1294 csv_output ? 0 : unit_width, 1295 counter->unit, csv_sep); 1296 fprintf(output, "%*s", 1297 csv_output ? 0 : -25, 1298 perf_evsel__name(counter)); 1299 1300 if (counter->cgrp) 1301 fprintf(output, "%s%s", csv_sep, counter->cgrp->name); 1302 1303 fputc('\n', output); 1304 return; 1305 } 1306 1307 uval = avg * counter->scale; 1308 1309 if (nsec_counter(counter)) 1310 nsec_printout(-1, 0, counter, uval); 1311 else 1312 abs_printout(-1, 0, counter, uval); 1313 1314 print_noise(counter, avg); 1315 1316 if (csv_output) { 1317 fputc('\n', output); 1318 return; 1319 } 1320 1321 if (scaled) { 1322 double avg_enabled, avg_running; 1323 1324 avg_enabled = avg_stats(&ps->res_stats[1]); 1325 avg_running = avg_stats(&ps->res_stats[2]); 1326 1327 fprintf(output, " [%5.2f%%]", 100 * avg_running / avg_enabled); 1328 } 1329 fprintf(output, "\n"); 1330 } 1331 1332 /* 1333 * Print out the results of a single counter: 1334 * does not use aggregated count in system-wide 1335 */ 1336 static void print_counter(struct perf_evsel *counter, char *prefix) 1337 { 1338 u64 ena, run, val; 1339 double uval; 1340 int cpu; 1341 1342 for (cpu = 0; cpu < perf_evsel__nr_cpus(counter); cpu++) { 1343 val = counter->counts->cpu[cpu].val; 1344 ena = counter->counts->cpu[cpu].ena; 1345 run = counter->counts->cpu[cpu].run; 1346 1347 if (prefix) 1348 fprintf(output, "%s", prefix); 1349 1350 if (run == 0 || ena == 0) { 1351 fprintf(output, "CPU%*d%s%*s%s", 1352 csv_output ? 0 : -4, 1353 perf_evsel__cpus(counter)->map[cpu], csv_sep, 1354 csv_output ? 0 : 18, 1355 counter->supported ? CNTR_NOT_COUNTED : CNTR_NOT_SUPPORTED, 1356 csv_sep); 1357 1358 fprintf(output, "%-*s%s", 1359 csv_output ? 0 : unit_width, 1360 counter->unit, csv_sep); 1361 1362 fprintf(output, "%*s", 1363 csv_output ? 0 : -25, 1364 perf_evsel__name(counter)); 1365 1366 if (counter->cgrp) 1367 fprintf(output, "%s%s", 1368 csv_sep, counter->cgrp->name); 1369 1370 fputc('\n', output); 1371 continue; 1372 } 1373 1374 uval = val * counter->scale; 1375 1376 if (nsec_counter(counter)) 1377 nsec_printout(cpu, 0, counter, uval); 1378 else 1379 abs_printout(cpu, 0, counter, uval); 1380 1381 if (!csv_output) { 1382 print_noise(counter, 1.0); 1383 1384 if (run != ena) 1385 fprintf(output, " (%.2f%%)", 1386 100.0 * run / ena); 1387 } 1388 fputc('\n', output); 1389 } 1390 } 1391 1392 static void print_stat(int argc, const char **argv) 1393 { 1394 struct perf_evsel *counter; 1395 int i; 1396 1397 fflush(stdout); 1398 1399 if (!csv_output) { 1400 fprintf(output, "\n"); 1401 fprintf(output, " Performance counter stats for "); 1402 if (target.system_wide) 1403 fprintf(output, "\'system wide"); 1404 else if (target.cpu_list) 1405 fprintf(output, "\'CPU(s) %s", target.cpu_list); 1406 else if (!target__has_task(&target)) { 1407 fprintf(output, "\'%s", argv[0]); 1408 for (i = 1; i < argc; i++) 1409 fprintf(output, " %s", argv[i]); 1410 } else if (target.pid) 1411 fprintf(output, "process id \'%s", target.pid); 1412 else 1413 fprintf(output, "thread id \'%s", target.tid); 1414 1415 fprintf(output, "\'"); 1416 if (run_count > 1) 1417 fprintf(output, " (%d runs)", run_count); 1418 fprintf(output, ":\n\n"); 1419 } 1420 1421 switch (aggr_mode) { 1422 case AGGR_CORE: 1423 case AGGR_SOCKET: 1424 print_aggr(NULL); 1425 break; 1426 case AGGR_GLOBAL: 1427 evlist__for_each(evsel_list, counter) 1428 print_counter_aggr(counter, NULL); 1429 break; 1430 case AGGR_NONE: 1431 evlist__for_each(evsel_list, counter) 1432 print_counter(counter, NULL); 1433 break; 1434 default: 1435 break; 1436 } 1437 1438 if (!csv_output) { 1439 if (!null_run) 1440 fprintf(output, "\n"); 1441 fprintf(output, " %17.9f seconds time elapsed", 1442 avg_stats(&walltime_nsecs_stats)/1e9); 1443 if (run_count > 1) { 1444 fprintf(output, " "); 1445 print_noise_pct(stddev_stats(&walltime_nsecs_stats), 1446 avg_stats(&walltime_nsecs_stats)); 1447 } 1448 fprintf(output, "\n\n"); 1449 } 1450 } 1451 1452 static volatile int signr = -1; 1453 1454 static void skip_signal(int signo) 1455 { 1456 if ((child_pid == -1) || interval) 1457 done = 1; 1458 1459 signr = signo; 1460 /* 1461 * render child_pid harmless 1462 * won't send SIGTERM to a random 1463 * process in case of race condition 1464 * and fast PID recycling 1465 */ 1466 child_pid = -1; 1467 } 1468 1469 static void sig_atexit(void) 1470 { 1471 sigset_t set, oset; 1472 1473 /* 1474 * avoid race condition with SIGCHLD handler 1475 * in skip_signal() which is modifying child_pid 1476 * goal is to avoid send SIGTERM to a random 1477 * process 1478 */ 1479 sigemptyset(&set); 1480 sigaddset(&set, SIGCHLD); 1481 sigprocmask(SIG_BLOCK, &set, &oset); 1482 1483 if (child_pid != -1) 1484 kill(child_pid, SIGTERM); 1485 1486 sigprocmask(SIG_SETMASK, &oset, NULL); 1487 1488 if (signr == -1) 1489 return; 1490 1491 signal(signr, SIG_DFL); 1492 kill(getpid(), signr); 1493 } 1494 1495 static int stat__set_big_num(const struct option *opt __maybe_unused, 1496 const char *s __maybe_unused, int unset) 1497 { 1498 big_num_opt = unset ? 0 : 1; 1499 return 0; 1500 } 1501 1502 static int perf_stat_init_aggr_mode(void) 1503 { 1504 switch (aggr_mode) { 1505 case AGGR_SOCKET: 1506 if (cpu_map__build_socket_map(evsel_list->cpus, &aggr_map)) { 1507 perror("cannot build socket map"); 1508 return -1; 1509 } 1510 aggr_get_id = cpu_map__get_socket; 1511 break; 1512 case AGGR_CORE: 1513 if (cpu_map__build_core_map(evsel_list->cpus, &aggr_map)) { 1514 perror("cannot build core map"); 1515 return -1; 1516 } 1517 aggr_get_id = cpu_map__get_core; 1518 break; 1519 case AGGR_NONE: 1520 case AGGR_GLOBAL: 1521 default: 1522 break; 1523 } 1524 return 0; 1525 } 1526 1527 static int setup_events(const char * const *attrs, unsigned len) 1528 { 1529 unsigned i; 1530 1531 for (i = 0; i < len; i++) { 1532 if (parse_events(evsel_list, attrs[i])) 1533 return -1; 1534 } 1535 return 0; 1536 } 1537 1538 /* 1539 * Add default attributes, if there were no attributes specified or 1540 * if -d/--detailed, -d -d or -d -d -d is used: 1541 */ 1542 static int add_default_attributes(void) 1543 { 1544 struct perf_event_attr default_attrs[] = { 1545 1546 { .type = PERF_TYPE_SOFTWARE, .config = PERF_COUNT_SW_TASK_CLOCK }, 1547 { .type = PERF_TYPE_SOFTWARE, .config = PERF_COUNT_SW_CONTEXT_SWITCHES }, 1548 { .type = PERF_TYPE_SOFTWARE, .config = PERF_COUNT_SW_CPU_MIGRATIONS }, 1549 { .type = PERF_TYPE_SOFTWARE, .config = PERF_COUNT_SW_PAGE_FAULTS }, 1550 1551 { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_CPU_CYCLES }, 1552 { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_STALLED_CYCLES_FRONTEND }, 1553 { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_STALLED_CYCLES_BACKEND }, 1554 { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_INSTRUCTIONS }, 1555 { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_BRANCH_INSTRUCTIONS }, 1556 { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_BRANCH_MISSES }, 1557 1558 }; 1559 1560 /* 1561 * Detailed stats (-d), covering the L1 and last level data caches: 1562 */ 1563 struct perf_event_attr detailed_attrs[] = { 1564 1565 { .type = PERF_TYPE_HW_CACHE, 1566 .config = 1567 PERF_COUNT_HW_CACHE_L1D << 0 | 1568 (PERF_COUNT_HW_CACHE_OP_READ << 8) | 1569 (PERF_COUNT_HW_CACHE_RESULT_ACCESS << 16) }, 1570 1571 { .type = PERF_TYPE_HW_CACHE, 1572 .config = 1573 PERF_COUNT_HW_CACHE_L1D << 0 | 1574 (PERF_COUNT_HW_CACHE_OP_READ << 8) | 1575 (PERF_COUNT_HW_CACHE_RESULT_MISS << 16) }, 1576 1577 { .type = PERF_TYPE_HW_CACHE, 1578 .config = 1579 PERF_COUNT_HW_CACHE_LL << 0 | 1580 (PERF_COUNT_HW_CACHE_OP_READ << 8) | 1581 (PERF_COUNT_HW_CACHE_RESULT_ACCESS << 16) }, 1582 1583 { .type = PERF_TYPE_HW_CACHE, 1584 .config = 1585 PERF_COUNT_HW_CACHE_LL << 0 | 1586 (PERF_COUNT_HW_CACHE_OP_READ << 8) | 1587 (PERF_COUNT_HW_CACHE_RESULT_MISS << 16) }, 1588 }; 1589 1590 /* 1591 * Very detailed stats (-d -d), covering the instruction cache and the TLB caches: 1592 */ 1593 struct perf_event_attr very_detailed_attrs[] = { 1594 1595 { .type = PERF_TYPE_HW_CACHE, 1596 .config = 1597 PERF_COUNT_HW_CACHE_L1I << 0 | 1598 (PERF_COUNT_HW_CACHE_OP_READ << 8) | 1599 (PERF_COUNT_HW_CACHE_RESULT_ACCESS << 16) }, 1600 1601 { .type = PERF_TYPE_HW_CACHE, 1602 .config = 1603 PERF_COUNT_HW_CACHE_L1I << 0 | 1604 (PERF_COUNT_HW_CACHE_OP_READ << 8) | 1605 (PERF_COUNT_HW_CACHE_RESULT_MISS << 16) }, 1606 1607 { .type = PERF_TYPE_HW_CACHE, 1608 .config = 1609 PERF_COUNT_HW_CACHE_DTLB << 0 | 1610 (PERF_COUNT_HW_CACHE_OP_READ << 8) | 1611 (PERF_COUNT_HW_CACHE_RESULT_ACCESS << 16) }, 1612 1613 { .type = PERF_TYPE_HW_CACHE, 1614 .config = 1615 PERF_COUNT_HW_CACHE_DTLB << 0 | 1616 (PERF_COUNT_HW_CACHE_OP_READ << 8) | 1617 (PERF_COUNT_HW_CACHE_RESULT_MISS << 16) }, 1618 1619 { .type = PERF_TYPE_HW_CACHE, 1620 .config = 1621 PERF_COUNT_HW_CACHE_ITLB << 0 | 1622 (PERF_COUNT_HW_CACHE_OP_READ << 8) | 1623 (PERF_COUNT_HW_CACHE_RESULT_ACCESS << 16) }, 1624 1625 { .type = PERF_TYPE_HW_CACHE, 1626 .config = 1627 PERF_COUNT_HW_CACHE_ITLB << 0 | 1628 (PERF_COUNT_HW_CACHE_OP_READ << 8) | 1629 (PERF_COUNT_HW_CACHE_RESULT_MISS << 16) }, 1630 1631 }; 1632 1633 /* 1634 * Very, very detailed stats (-d -d -d), adding prefetch events: 1635 */ 1636 struct perf_event_attr very_very_detailed_attrs[] = { 1637 1638 { .type = PERF_TYPE_HW_CACHE, 1639 .config = 1640 PERF_COUNT_HW_CACHE_L1D << 0 | 1641 (PERF_COUNT_HW_CACHE_OP_PREFETCH << 8) | 1642 (PERF_COUNT_HW_CACHE_RESULT_ACCESS << 16) }, 1643 1644 { .type = PERF_TYPE_HW_CACHE, 1645 .config = 1646 PERF_COUNT_HW_CACHE_L1D << 0 | 1647 (PERF_COUNT_HW_CACHE_OP_PREFETCH << 8) | 1648 (PERF_COUNT_HW_CACHE_RESULT_MISS << 16) }, 1649 }; 1650 1651 /* Set attrs if no event is selected and !null_run: */ 1652 if (null_run) 1653 return 0; 1654 1655 if (transaction_run) { 1656 int err; 1657 if (pmu_have_event("cpu", "cycles-ct") && 1658 pmu_have_event("cpu", "el-start")) 1659 err = setup_events(transaction_attrs, 1660 ARRAY_SIZE(transaction_attrs)); 1661 else 1662 err = setup_events(transaction_limited_attrs, 1663 ARRAY_SIZE(transaction_limited_attrs)); 1664 if (err < 0) { 1665 fprintf(stderr, "Cannot set up transaction events\n"); 1666 return -1; 1667 } 1668 return 0; 1669 } 1670 1671 if (!evsel_list->nr_entries) { 1672 if (perf_evlist__add_default_attrs(evsel_list, default_attrs) < 0) 1673 return -1; 1674 } 1675 1676 /* Detailed events get appended to the event list: */ 1677 1678 if (detailed_run < 1) 1679 return 0; 1680 1681 /* Append detailed run extra attributes: */ 1682 if (perf_evlist__add_default_attrs(evsel_list, detailed_attrs) < 0) 1683 return -1; 1684 1685 if (detailed_run < 2) 1686 return 0; 1687 1688 /* Append very detailed run extra attributes: */ 1689 if (perf_evlist__add_default_attrs(evsel_list, very_detailed_attrs) < 0) 1690 return -1; 1691 1692 if (detailed_run < 3) 1693 return 0; 1694 1695 /* Append very, very detailed run extra attributes: */ 1696 return perf_evlist__add_default_attrs(evsel_list, very_very_detailed_attrs); 1697 } 1698 1699 int cmd_stat(int argc, const char **argv, const char *prefix __maybe_unused) 1700 { 1701 bool append_file = false; 1702 int output_fd = 0; 1703 const char *output_name = NULL; 1704 const struct option options[] = { 1705 OPT_BOOLEAN('T', "transaction", &transaction_run, 1706 "hardware transaction statistics"), 1707 OPT_CALLBACK('e', "event", &evsel_list, "event", 1708 "event selector. use 'perf list' to list available events", 1709 parse_events_option), 1710 OPT_CALLBACK(0, "filter", &evsel_list, "filter", 1711 "event filter", parse_filter), 1712 OPT_BOOLEAN('i', "no-inherit", &no_inherit, 1713 "child tasks do not inherit counters"), 1714 OPT_STRING('p', "pid", &target.pid, "pid", 1715 "stat events on existing process id"), 1716 OPT_STRING('t', "tid", &target.tid, "tid", 1717 "stat events on existing thread id"), 1718 OPT_BOOLEAN('a', "all-cpus", &target.system_wide, 1719 "system-wide collection from all CPUs"), 1720 OPT_BOOLEAN('g', "group", &group, 1721 "put the counters into a counter group"), 1722 OPT_BOOLEAN('c', "scale", &scale, "scale/normalize counters"), 1723 OPT_INCR('v', "verbose", &verbose, 1724 "be more verbose (show counter open errors, etc)"), 1725 OPT_INTEGER('r', "repeat", &run_count, 1726 "repeat command and print average + stddev (max: 100, forever: 0)"), 1727 OPT_BOOLEAN('n', "null", &null_run, 1728 "null run - dont start any counters"), 1729 OPT_INCR('d', "detailed", &detailed_run, 1730 "detailed run - start a lot of events"), 1731 OPT_BOOLEAN('S', "sync", &sync_run, 1732 "call sync() before starting a run"), 1733 OPT_CALLBACK_NOOPT('B', "big-num", NULL, NULL, 1734 "print large numbers with thousands\' separators", 1735 stat__set_big_num), 1736 OPT_STRING('C', "cpu", &target.cpu_list, "cpu", 1737 "list of cpus to monitor in system-wide"), 1738 OPT_SET_UINT('A', "no-aggr", &aggr_mode, 1739 "disable CPU count aggregation", AGGR_NONE), 1740 OPT_STRING('x', "field-separator", &csv_sep, "separator", 1741 "print counts with custom separator"), 1742 OPT_CALLBACK('G', "cgroup", &evsel_list, "name", 1743 "monitor event in cgroup name only", parse_cgroups), 1744 OPT_STRING('o', "output", &output_name, "file", "output file name"), 1745 OPT_BOOLEAN(0, "append", &append_file, "append to the output file"), 1746 OPT_INTEGER(0, "log-fd", &output_fd, 1747 "log output to fd, instead of stderr"), 1748 OPT_STRING(0, "pre", &pre_cmd, "command", 1749 "command to run prior to the measured command"), 1750 OPT_STRING(0, "post", &post_cmd, "command", 1751 "command to run after to the measured command"), 1752 OPT_UINTEGER('I', "interval-print", &interval, 1753 "print counts at regular interval in ms (>= 100)"), 1754 OPT_SET_UINT(0, "per-socket", &aggr_mode, 1755 "aggregate counts per processor socket", AGGR_SOCKET), 1756 OPT_SET_UINT(0, "per-core", &aggr_mode, 1757 "aggregate counts per physical processor core", AGGR_CORE), 1758 OPT_UINTEGER('D', "delay", &initial_delay, 1759 "ms to wait before starting measurement after program start"), 1760 OPT_END() 1761 }; 1762 const char * const stat_usage[] = { 1763 "perf stat [<options>] [<command>]", 1764 NULL 1765 }; 1766 int status = -EINVAL, run_idx; 1767 const char *mode; 1768 1769 setlocale(LC_ALL, ""); 1770 1771 evsel_list = perf_evlist__new(); 1772 if (evsel_list == NULL) 1773 return -ENOMEM; 1774 1775 argc = parse_options(argc, argv, options, stat_usage, 1776 PARSE_OPT_STOP_AT_NON_OPTION); 1777 1778 output = stderr; 1779 if (output_name && strcmp(output_name, "-")) 1780 output = NULL; 1781 1782 if (output_name && output_fd) { 1783 fprintf(stderr, "cannot use both --output and --log-fd\n"); 1784 parse_options_usage(stat_usage, options, "o", 1); 1785 parse_options_usage(NULL, options, "log-fd", 0); 1786 goto out; 1787 } 1788 1789 if (output_fd < 0) { 1790 fprintf(stderr, "argument to --log-fd must be a > 0\n"); 1791 parse_options_usage(stat_usage, options, "log-fd", 0); 1792 goto out; 1793 } 1794 1795 if (!output) { 1796 struct timespec tm; 1797 mode = append_file ? "a" : "w"; 1798 1799 output = fopen(output_name, mode); 1800 if (!output) { 1801 perror("failed to create output file"); 1802 return -1; 1803 } 1804 clock_gettime(CLOCK_REALTIME, &tm); 1805 fprintf(output, "# started on %s\n", ctime(&tm.tv_sec)); 1806 } else if (output_fd > 0) { 1807 mode = append_file ? "a" : "w"; 1808 output = fdopen(output_fd, mode); 1809 if (!output) { 1810 perror("Failed opening logfd"); 1811 return -errno; 1812 } 1813 } 1814 1815 if (csv_sep) { 1816 csv_output = true; 1817 if (!strcmp(csv_sep, "\\t")) 1818 csv_sep = "\t"; 1819 } else 1820 csv_sep = DEFAULT_SEPARATOR; 1821 1822 /* 1823 * let the spreadsheet do the pretty-printing 1824 */ 1825 if (csv_output) { 1826 /* User explicitly passed -B? */ 1827 if (big_num_opt == 1) { 1828 fprintf(stderr, "-B option not supported with -x\n"); 1829 parse_options_usage(stat_usage, options, "B", 1); 1830 parse_options_usage(NULL, options, "x", 1); 1831 goto out; 1832 } else /* Nope, so disable big number formatting */ 1833 big_num = false; 1834 } else if (big_num_opt == 0) /* User passed --no-big-num */ 1835 big_num = false; 1836 1837 if (!argc && target__none(&target)) 1838 usage_with_options(stat_usage, options); 1839 1840 if (run_count < 0) { 1841 pr_err("Run count must be a positive number\n"); 1842 parse_options_usage(stat_usage, options, "r", 1); 1843 goto out; 1844 } else if (run_count == 0) { 1845 forever = true; 1846 run_count = 1; 1847 } 1848 1849 /* no_aggr, cgroup are for system-wide only */ 1850 if ((aggr_mode != AGGR_GLOBAL || nr_cgroups) && 1851 !target__has_cpu(&target)) { 1852 fprintf(stderr, "both cgroup and no-aggregation " 1853 "modes only available in system-wide mode\n"); 1854 1855 parse_options_usage(stat_usage, options, "G", 1); 1856 parse_options_usage(NULL, options, "A", 1); 1857 parse_options_usage(NULL, options, "a", 1); 1858 goto out; 1859 } 1860 1861 if (add_default_attributes()) 1862 goto out; 1863 1864 target__validate(&target); 1865 1866 if (perf_evlist__create_maps(evsel_list, &target) < 0) { 1867 if (target__has_task(&target)) { 1868 pr_err("Problems finding threads of monitor\n"); 1869 parse_options_usage(stat_usage, options, "p", 1); 1870 parse_options_usage(NULL, options, "t", 1); 1871 } else if (target__has_cpu(&target)) { 1872 perror("failed to parse CPUs map"); 1873 parse_options_usage(stat_usage, options, "C", 1); 1874 parse_options_usage(NULL, options, "a", 1); 1875 } 1876 goto out; 1877 } 1878 if (interval && interval < 100) { 1879 pr_err("print interval must be >= 100ms\n"); 1880 parse_options_usage(stat_usage, options, "I", 1); 1881 goto out; 1882 } 1883 1884 if (perf_evlist__alloc_stats(evsel_list, interval)) 1885 goto out; 1886 1887 if (perf_stat_init_aggr_mode()) 1888 goto out; 1889 1890 /* 1891 * We dont want to block the signals - that would cause 1892 * child tasks to inherit that and Ctrl-C would not work. 1893 * What we want is for Ctrl-C to work in the exec()-ed 1894 * task, but being ignored by perf stat itself: 1895 */ 1896 atexit(sig_atexit); 1897 if (!forever) 1898 signal(SIGINT, skip_signal); 1899 signal(SIGCHLD, skip_signal); 1900 signal(SIGALRM, skip_signal); 1901 signal(SIGABRT, skip_signal); 1902 1903 status = 0; 1904 for (run_idx = 0; forever || run_idx < run_count; run_idx++) { 1905 if (run_count != 1 && verbose) 1906 fprintf(output, "[ perf stat: executing run #%d ... ]\n", 1907 run_idx + 1); 1908 1909 status = run_perf_stat(argc, argv); 1910 if (forever && status != -1) { 1911 print_stat(argc, argv); 1912 perf_stat__reset_stats(evsel_list); 1913 } 1914 } 1915 1916 if (!forever && status != -1 && !interval) 1917 print_stat(argc, argv); 1918 1919 perf_evlist__free_stats(evsel_list); 1920 out: 1921 perf_evlist__delete(evsel_list); 1922 return status; 1923 } 1924