// SPDX-License-Identifier: GPL-2.0
/*
 * builtin-record.c
 *
 * Builtin record command: Record the profile of a workload
 * (or a CPU, or a PID) into the perf.data output file - for
 * later analysis via perf report.
 */
#include "builtin.h"

#include "util/build-id.h"
#include <subcmd/parse-options.h>
#include "util/parse-events.h"
#include "util/config.h"

#include "util/callchain.h"
#include "util/cgroup.h"
#include "util/header.h"
#include "util/event.h"
#include "util/evlist.h"
#include "util/evsel.h"
#include "util/debug.h"
#include "util/mmap.h"
#include "util/target.h"
#include "util/session.h"
#include "util/tool.h"
#include "util/symbol.h"
#include "util/record.h"
#include "util/cpumap.h"
#include "util/thread_map.h"
#include "util/data.h"
#include "util/perf_regs.h"
#include "util/auxtrace.h"
#include "util/tsc.h"
#include "util/parse-branch-options.h"
#include "util/parse-regs-options.h"
#include "util/llvm-utils.h"
#include "util/bpf-loader.h"
#include "util/trigger.h"
#include "util/perf-hooks.h"
#include "util/cpu-set-sched.h"
#include "util/synthetic-events.h"
#include "util/time-utils.h"
#include "util/units.h"
#include "util/bpf-event.h"
#include "asm/bug.h"
#include "perf.h"

#include <errno.h>
#include <inttypes.h>
#include <locale.h>
#include <poll.h>
#include <unistd.h>
#include <sched.h>
#include <signal.h>
#include <sys/mman.h>
#include <sys/wait.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <linux/err.h>
#include <linux/string.h>
#include <linux/time64.h>
#include <linux/zalloc.h>
#include <linux/bitmap.h>

struct switch_output {
	bool		enabled;
	bool		signal;
	unsigned long	size;
	unsigned long	time;
	const char	*str;
	bool		set;
	char		**filenames;
	int		num_files;
	int		cur_file;
};

struct record {
	struct perf_tool	tool;
	struct record_opts	opts;
	u64			bytes_written;
	struct perf_data	data;
	struct auxtrace_record	*itr;
	struct evlist		*evlist;
	struct perf_session	*session;
	int			realtime_prio;
	bool			no_buildid;
	bool			no_buildid_set;
	bool			no_buildid_cache;
	bool			no_buildid_cache_set;
	bool			buildid_all;
	bool			timestamp_filename;
	bool			timestamp_boundary;
	struct switch_output	switch_output;
	unsigned long long	samples;
	struct mmap_cpu_mask	affinity_mask;
	unsigned long		output_max_size;	/* = 0: unlimited */
};

static volatile int done;

static volatile int auxtrace_record__snapshot_started;
static DEFINE_TRIGGER(auxtrace_snapshot_trigger);
static DEFINE_TRIGGER(switch_output_trigger);

static const char *affinity_tags[PERF_AFFINITY_MAX] = {
	"SYS", "NODE", "CPU"
};

static bool switch_output_signal(struct record *rec)
{
	return rec->switch_output.signal &&
	       trigger_is_ready(&switch_output_trigger);
}

static bool switch_output_size(struct record *rec)
{
	return rec->switch_output.size &&
	       trigger_is_ready(&switch_output_trigger) &&
	       (rec->bytes_written >= rec->switch_output.size);
}

static bool switch_output_time(struct record *rec)
{
	return rec->switch_output.time &&
	       trigger_is_ready(&switch_output_trigger);
}

static bool record__output_max_size_exceeded(struct record *rec)
{
	return rec->output_max_size &&
	       (rec->bytes_written >= rec->output_max_size);
}
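
/*
 * Note on the two size limits checked below: every byte that reaches the
 * output file is accounted in rec->bytes_written by record__write(). The
 * switch_output_size() helper compares that counter against the rotation
 * threshold requested via --switch-output (e.g. --switch-output=100M),
 * while record__output_max_size_exceeded() ends the whole session once
 * output_max_size (see parse_output_max_size()) is exceeded.
 */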

static int record__write(struct record *rec, struct mmap *map __maybe_unused,
			 void *bf, size_t size)
{
	struct perf_data_file *file = &rec->session->data->file;

	if (perf_data_file__write(file, bf, size) < 0) {
		pr_err("failed to write perf data, error: %m\n");
		return -1;
	}

	rec->bytes_written += size;

	if (record__output_max_size_exceeded(rec) && !done) {
		fprintf(stderr, "[ perf record: perf size limit reached (%" PRIu64 " KB),"
				" stopping session ]\n",
				rec->bytes_written >> 10);
		done = 1;
	}

	if (switch_output_size(rec))
		trigger_hit(&switch_output_trigger);

	return 0;
}

static int record__aio_enabled(struct record *rec);
static int record__comp_enabled(struct record *rec);
static size_t zstd_compress(struct perf_session *session, void *dst, size_t dst_size,
			    void *src, size_t src_size);

#ifdef HAVE_AIO_SUPPORT
static int record__aio_write(struct aiocb *cblock, int trace_fd,
			     void *buf, size_t size, off_t off)
{
	int rc;

	cblock->aio_fildes = trace_fd;
	cblock->aio_buf = buf;
	cblock->aio_nbytes = size;
	cblock->aio_offset = off;
	cblock->aio_sigevent.sigev_notify = SIGEV_NONE;

	do {
		rc = aio_write(cblock);
		if (rc == 0) {
			break;
		} else if (errno != EAGAIN) {
			cblock->aio_fildes = -1;
			pr_err("failed to queue perf data, error: %m\n");
			break;
		}
	} while (1);

	return rc;
}

static int record__aio_complete(struct mmap *md, struct aiocb *cblock)
{
	void *rem_buf;
	off_t rem_off;
	size_t rem_size;
	int rc, aio_errno;
	ssize_t aio_ret, written;

	aio_errno = aio_error(cblock);
	if (aio_errno == EINPROGRESS)
		return 0;

	written = aio_ret = aio_return(cblock);
	if (aio_ret < 0) {
		if (aio_errno != EINTR)
			pr_err("failed to write perf data, error: %m\n");
		written = 0;
	}

	rem_size = cblock->aio_nbytes - written;

	if (rem_size == 0) {
		cblock->aio_fildes = -1;
		/*
		 * md->refcount is incremented in record__aio_pushfn() for
		 * every aio write request started in record__aio_push() so
		 * decrement it because the request is now complete.
		 */
		perf_mmap__put(&md->core);
		rc = 1;
	} else {
		/*
		 * aio write request may require restart with the
		 * remainder if the kernel didn't write the whole
		 * chunk at once.
		 */
		rem_off = cblock->aio_offset + written;
		rem_buf = (void *)(cblock->aio_buf + written);
		record__aio_write(cblock, cblock->aio_fildes,
				  rem_buf, rem_size, rem_off);
		rc = 0;
	}

	return rc;
}

static int record__aio_sync(struct mmap *md, bool sync_all)
{
	struct aiocb **aiocb = md->aio.aiocb;
	struct aiocb *cblocks = md->aio.cblocks;
	struct timespec timeout = { 0, 1000 * 1000 * 1 }; /* 1ms */
	int i, do_suspend;

	do {
		do_suspend = 0;
		for (i = 0; i < md->aio.nr_cblocks; ++i) {
			if (cblocks[i].aio_fildes == -1 || record__aio_complete(md, &cblocks[i])) {
				if (sync_all)
					aiocb[i] = NULL;
				else
					return i;
			} else {
				/*
				 * Started aio write is not complete yet
				 * so it has to be waited for before the
				 * next allocation.
				 */
				aiocb[i] = &cblocks[i];
				do_suspend = 1;
			}
		}
		if (!do_suspend)
			return -1;

		while (aio_suspend((const struct aiocb **)aiocb, md->aio.nr_cblocks, &timeout)) {
			if (!(errno == EAGAIN || errno == EINTR))
				pr_err("failed to sync perf data, error: %m\n");
		}
	} while (1);
}

struct record_aio {
	struct record	*rec;
	void		*data;
	size_t		size;
};

static int record__aio_pushfn(struct mmap *map, void *to, void *buf, size_t size)
{
	struct record_aio *aio = to;

	/*
	 * map->core.base data pointed to by buf is copied into a free
	 * map->aio.data[] buffer to release space in the kernel buffer as
	 * fast as possible, calling perf_mmap__consume() from the
	 * perf_mmap__push() function.
	 *
	 * That lets the kernel proceed with storing more profiling data into
	 * the kernel buffer earlier than other per-cpu kernel buffers are handled.
	 *
	 * Copying can be done in two steps in case the chunk of profiling data
	 * crosses the upper bound of the kernel buffer. In this case we first move
	 * part of data from map->start till the upper bound and then the remainder
	 * from the beginning of the kernel buffer till the end of the data chunk.
	 */

	if (record__comp_enabled(aio->rec)) {
		size = zstd_compress(aio->rec->session, aio->data + aio->size,
				     mmap__mmap_len(map) - aio->size,
				     buf, size);
	} else {
		memcpy(aio->data + aio->size, buf, size);
	}

	if (!aio->size) {
		/*
		 * Increment map->refcount to guard map->aio.data[] buffer
		 * from premature deallocation because map object can be
		 * released earlier than aio write request started on
		 * map->aio.data[] buffer is complete.
		 *
		 * perf_mmap__put() is done at record__aio_complete()
		 * after started aio request completion or at record__aio_push()
		 * if the request failed to start.
		 */
		perf_mmap__get(&map->core);
	}

	aio->size += size;

	return size;
}

static int record__aio_push(struct record *rec, struct mmap *map, off_t *off)
{
	int ret, idx;
	int trace_fd = rec->session->data->file.fd;
	struct record_aio aio = { .rec = rec, .size = 0 };

	/*
	 * Call record__aio_sync() to wait till map->aio.data[] buffer
	 * becomes available after previous aio write operation.
	 */

	idx = record__aio_sync(map, false);
	aio.data = map->aio.data[idx];
	ret = perf_mmap__push(map, &aio, record__aio_pushfn);
	if (ret != 0) /* ret > 0 - no data, ret < 0 - error */
		return ret;

	rec->samples++;
	ret = record__aio_write(&(map->aio.cblocks[idx]), trace_fd, aio.data, aio.size, *off);
	if (!ret) {
		*off += aio.size;
		rec->bytes_written += aio.size;
		if (switch_output_size(rec))
			trigger_hit(&switch_output_trigger);
	} else {
		/*
		 * Decrement map->refcount incremented in record__aio_pushfn()
		 * back if record__aio_write() operation failed to start, otherwise
		 * map->refcount is decremented in record__aio_complete() after
		 * aio write operation finishes successfully.
		 */
		perf_mmap__put(&map->core);
	}

	return ret;
}

static off_t record__aio_get_pos(int trace_fd)
{
	return lseek(trace_fd, 0, SEEK_CUR);
}

static void record__aio_set_pos(int trace_fd, off_t pos)
{
	lseek(trace_fd, pos, SEEK_SET);
}

static void record__aio_mmap_read_sync(struct record *rec)
{
	int i;
	struct evlist *evlist = rec->evlist;
	struct mmap *maps = evlist->mmap;

	if (!record__aio_enabled(rec))
		return;

	for (i = 0; i < evlist->core.nr_mmaps; i++) {
		struct mmap *map = &maps[i];

		if (map->core.base)
			record__aio_sync(map, true);
	}
}

static int nr_cblocks_default = 1;
static int nr_cblocks_max = 4;

static int record__aio_parse(const struct option *opt,
			     const char *str,
			     int unset)
{
	struct record_opts *opts = (struct record_opts *)opt->value;

	if (unset) {
		opts->nr_cblocks = 0;
	} else {
		if (str)
			opts->nr_cblocks = strtol(str, NULL, 0);
		if (!opts->nr_cblocks)
			opts->nr_cblocks = nr_cblocks_default;
	}

	return 0;
}
#else /* HAVE_AIO_SUPPORT */
static int nr_cblocks_max = 0;

static int record__aio_push(struct record *rec __maybe_unused, struct mmap *map __maybe_unused,
			    off_t *off __maybe_unused)
{
	return -1;
}

static off_t record__aio_get_pos(int trace_fd __maybe_unused)
{
	return -1;
}

static void record__aio_set_pos(int trace_fd __maybe_unused, off_t pos __maybe_unused)
{
}

static void record__aio_mmap_read_sync(struct record *rec __maybe_unused)
{
}
#endif

static int record__aio_enabled(struct record *rec)
{
	return rec->opts.nr_cblocks > 0;
}

#define MMAP_FLUSH_DEFAULT 1
static int record__mmap_flush_parse(const struct option *opt,
				    const char *str,
				    int unset)
{
	int flush_max;
	struct record_opts *opts = (struct record_opts *)opt->value;
	static struct parse_tag tags[] = {
			{ .tag = 'B', .mult = 1 },
			{ .tag = 'K', .mult = 1 << 10 },
			{ .tag = 'M', .mult = 1 << 20 },
			{ .tag = 'G', .mult = 1 << 30 },
			{ .tag = 0 },
	};

	if (unset)
		return 0;

	if (str) {
		opts->mmap_flush = parse_tag_value(str, tags);
		if (opts->mmap_flush == (int)-1)
			opts->mmap_flush = strtol(str, NULL, 0);
	}

	if (!opts->mmap_flush)
		opts->mmap_flush = MMAP_FLUSH_DEFAULT;

	flush_max = evlist__mmap_size(opts->mmap_pages);
	flush_max /= 4;
	if (opts->mmap_flush > flush_max)
		opts->mmap_flush = flush_max;

	return 0;
}

#ifdef HAVE_ZSTD_SUPPORT
static unsigned int comp_level_default = 1;

static int record__parse_comp_level(const struct option *opt, const char *str, int unset)
{
	struct record_opts *opts = opt->value;

	if (unset) {
		opts->comp_level = 0;
	} else {
		if (str)
			opts->comp_level = strtol(str, NULL, 0);
		if (!opts->comp_level)
			opts->comp_level = comp_level_default;
	}

	return 0;
}
#endif
static unsigned int comp_level_max = 22;

static int record__comp_enabled(struct record *rec)
{
	return rec->opts.comp_level > 0;
}

static int process_synthesized_event(struct perf_tool *tool,
				     union perf_event *event,
				     struct perf_sample *sample __maybe_unused,
				     struct machine *machine __maybe_unused)
{
	struct record *rec = container_of(tool, struct record, tool);
	return record__write(rec, NULL, event, event->header.size);
}

static int record__pushfn(struct mmap *map, void *to, void *bf, size_t size)
{
	struct record *rec = to;

	if (record__comp_enabled(rec)) {
		size = zstd_compress(rec->session, map->data, mmap__mmap_len(map), bf, size);
		bf = map->data;
	}

	rec->samples++;
	return record__write(rec, map, bf, size);
}

static volatile int signr = -1;
static volatile int child_finished;

static void sig_handler(int sig)
{
	if (sig == SIGCHLD)
		child_finished = 1;
	else
		signr = sig;

	done = 1;
}

static void sigsegv_handler(int sig)
{
	perf_hooks__recover();
	sighandler_dump_stack(sig);
}

static void record__sig_exit(void)
{
	if (signr == -1)
		return;

	signal(signr, SIG_DFL);
	raise(signr);
}

#ifdef HAVE_AUXTRACE_SUPPORT

static int record__process_auxtrace(struct perf_tool *tool,
				    struct mmap *map,
				    union perf_event *event, void *data1,
				    size_t len1, void *data2, size_t len2)
{
	struct record *rec = container_of(tool, struct record, tool);
	struct perf_data *data = &rec->data;
	size_t padding;
	u8 pad[8] = {0};

	if (!perf_data__is_pipe(data) && perf_data__is_single_file(data)) {
		off_t file_offset;
		int fd = perf_data__fd(data);
		int err;

		file_offset = lseek(fd, 0, SEEK_CUR);
		if (file_offset == -1)
			return -1;
		err = auxtrace_index__auxtrace_event(&rec->session->auxtrace_index,
						     event, file_offset);
		if (err)
			return err;
	}

	/* event.auxtrace.size includes padding, see __auxtrace_mmap__read() */
	padding = (len1 + len2) & 7;
	if (padding)
		padding = 8 - padding;

	record__write(rec, map, event, event->header.size);
	record__write(rec, map, data1, len1);
	if (len2)
		record__write(rec, map, data2, len2);
	record__write(rec, map, &pad, padding);

	return 0;
}

static int record__auxtrace_mmap_read(struct record *rec,
				      struct mmap *map)
{
	int ret;

	ret = auxtrace_mmap__read(map, rec->itr, &rec->tool,
				  record__process_auxtrace);
	if (ret < 0)
		return ret;

	if (ret)
		rec->samples++;

	return 0;
}

static int record__auxtrace_mmap_read_snapshot(struct record *rec,
					       struct mmap *map)
{
	int ret;

	ret = auxtrace_mmap__read_snapshot(map, rec->itr, &rec->tool,
					   record__process_auxtrace,
					   rec->opts.auxtrace_snapshot_size);
	if (ret < 0)
		return ret;

	if (ret)
		rec->samples++;

	return 0;
}

static int record__auxtrace_read_snapshot_all(struct record *rec)
{
	int i;
	int rc = 0;

	for (i = 0; i < rec->evlist->core.nr_mmaps; i++) {
		struct mmap *map = &rec->evlist->mmap[i];

		if (!map->auxtrace_mmap.base)
			continue;

		if (record__auxtrace_mmap_read_snapshot(rec, map) != 0) {
			rc = -1;
			goto out;
		}
	}
out:
	return rc;
}

static void record__read_auxtrace_snapshot(struct record *rec, bool on_exit)
{
	pr_debug("Recording AUX area tracing snapshot\n");
	if (record__auxtrace_read_snapshot_all(rec) < 0) {
		trigger_error(&auxtrace_snapshot_trigger);
	} else {
		if (auxtrace_record__snapshot_finish(rec->itr, on_exit))
			trigger_error(&auxtrace_snapshot_trigger);
		else
			trigger_ready(&auxtrace_snapshot_trigger);
	}
}

static int record__auxtrace_snapshot_exit(struct record *rec)
{
	if (trigger_is_error(&auxtrace_snapshot_trigger))
		return 0;

	if (!auxtrace_record__snapshot_started &&
	    auxtrace_record__snapshot_start(rec->itr))
		return -1;

	record__read_auxtrace_snapshot(rec, true);
	if (trigger_is_error(&auxtrace_snapshot_trigger))
		return -1;

	return 0;
}

static int record__auxtrace_init(struct record *rec)
{
	int err;

	if (!rec->itr) {
		rec->itr = auxtrace_record__init(rec->evlist, &err);
		if (err)
			return err;
	}

	err = auxtrace_parse_snapshot_options(rec->itr, &rec->opts,
					      rec->opts.auxtrace_snapshot_opts);
	if (err)
		return err;

	err = auxtrace_parse_sample_options(rec->itr, rec->evlist, &rec->opts,
					    rec->opts.auxtrace_sample_opts);
	if (err)
		return err;

	return auxtrace_parse_filters(rec->evlist);
}

#else

static inline
int record__auxtrace_mmap_read(struct record *rec __maybe_unused,
			       struct mmap *map __maybe_unused)
{
	return 0;
}

static inline
void record__read_auxtrace_snapshot(struct record *rec __maybe_unused,
				    bool on_exit __maybe_unused)
{
}

static inline
int auxtrace_record__snapshot_start(struct auxtrace_record *itr __maybe_unused)
{
	return 0;
}

static inline
int record__auxtrace_snapshot_exit(struct record *rec __maybe_unused)
{
	return 0;
}

static int record__auxtrace_init(struct record *rec __maybe_unused)
{
	return 0;
}

#endif

static bool record__kcore_readable(struct machine *machine)
{
	char kcore[PATH_MAX];
	int fd;

	scnprintf(kcore, sizeof(kcore), "%s/proc/kcore", machine->root_dir);

	fd = open(kcore, O_RDONLY);
	if (fd < 0)
		return false;

	close(fd);

	return true;
}

static int record__kcore_copy(struct machine *machine, struct perf_data *data)
{
	char from_dir[PATH_MAX];
	char kcore_dir[PATH_MAX];
	int ret;

	snprintf(from_dir, sizeof(from_dir), "%s/proc", machine->root_dir);

	ret = perf_data__make_kcore_dir(data, kcore_dir, sizeof(kcore_dir));
	if (ret)
		return ret;

	return kcore_copy(from_dir, kcore_dir);
}

static int record__mmap_evlist(struct record *rec,
			       struct evlist *evlist)
{
	struct record_opts *opts = &rec->opts;
	bool auxtrace_overwrite = opts->auxtrace_snapshot_mode ||
				  opts->auxtrace_sample_mode;
	char msg[512];

	if (opts->affinity != PERF_AFFINITY_SYS)
		cpu__setup_cpunode_map();

	if (evlist__mmap_ex(evlist, opts->mmap_pages,
			    opts->auxtrace_mmap_pages,
			    auxtrace_overwrite,
			    opts->nr_cblocks, opts->affinity,
			    opts->mmap_flush, opts->comp_level) < 0) {
		if (errno == EPERM) {
			pr_err("Permission error mapping pages.\n"
			       "Consider increasing "
			       "/proc/sys/kernel/perf_event_mlock_kb,\n"
			       "or try again with a smaller value of -m/--mmap_pages.\n"
			       "(current value: %u,%u)\n",
			       opts->mmap_pages, opts->auxtrace_mmap_pages);
			return -errno;
		} else {
			pr_err("failed to mmap with %d (%s)\n", errno,
				str_error_r(errno, msg, sizeof(msg)));
			if (errno)
				return -errno;
			else
				return -EINVAL;
		}
	}
	return 0;
}

static int record__mmap(struct record *rec)
{
	return record__mmap_evlist(rec, rec->evlist);
}

static int record__open(struct record *rec)
{
	char msg[BUFSIZ];
	struct evsel *pos;
	struct evlist *evlist = rec->evlist;
	struct perf_session *session = rec->session;
	struct record_opts *opts = &rec->opts;
	int rc = 0;

	/*
	 * For initial_delay we need to add a dummy event so that we can track
	 * PERF_RECORD_MMAP while we wait for the initial delay to enable the
	 * real events, the ones asked for by the user.
	 */
	if (opts->initial_delay) {
		if (perf_evlist__add_dummy(evlist))
			return -ENOMEM;

		pos = evlist__first(evlist);
		pos->tracking = 0;
		pos = evlist__last(evlist);
		pos->tracking = 1;
		pos->core.attr.enable_on_exec = 1;
	}

	perf_evlist__config(evlist, opts, &callchain_param);

	evlist__for_each_entry(evlist, pos) {
try_again:
		if (evsel__open(pos, pos->core.cpus, pos->core.threads) < 0) {
			if (perf_evsel__fallback(pos, errno, msg, sizeof(msg))) {
				if (verbose > 0)
					ui__warning("%s\n", msg);
				goto try_again;
			}
			if ((errno == EINVAL || errno == EBADF) &&
			    pos->leader != pos &&
			    pos->weak_group) {
				pos = perf_evlist__reset_weak_group(evlist, pos, true);
				goto try_again;
			}
			rc = -errno;
			perf_evsel__open_strerror(pos, &opts->target,
						  errno, msg, sizeof(msg));
			ui__error("%s\n", msg);
			goto out;
		}

		pos->supported = true;
	}

	if (symbol_conf.kptr_restrict && !perf_evlist__exclude_kernel(evlist)) {
		pr_warning(
"WARNING: Kernel address maps (/proc/{kallsyms,modules}) are restricted,\n"
"check /proc/sys/kernel/kptr_restrict and /proc/sys/kernel/perf_event_paranoid.\n\n"
"Samples in kernel functions may not be resolved if a suitable vmlinux\n"
"file is not found in the buildid cache or in the vmlinux path.\n\n"
"Samples in kernel modules won't be resolved at all.\n\n"
"If some relocation was applied (e.g. kexec) symbols may be misresolved\n"
"even with a suitable vmlinux or kallsyms file.\n\n");
	}

	if (perf_evlist__apply_filters(evlist, &pos)) {
		pr_err("failed to set filter \"%s\" on event %s with %d (%s)\n",
			pos->filter, perf_evsel__name(pos), errno,
			str_error_r(errno, msg, sizeof(msg)));
		rc = -1;
		goto out;
	}

	rc = record__mmap(rec);
	if (rc)
		goto out;

	session->evlist = evlist;
	perf_session__set_id_hdr_size(session);
out:
	return rc;
}

static int process_sample_event(struct perf_tool *tool,
				union perf_event *event,
				struct perf_sample *sample,
				struct evsel *evsel,
				struct machine *machine)
{
	struct record *rec = container_of(tool, struct record, tool);

	if (rec->evlist->first_sample_time == 0)
		rec->evlist->first_sample_time = sample->time;

	rec->evlist->last_sample_time = sample->time;

	if (rec->buildid_all)
		return 0;

	rec->samples++;
	return build_id__mark_dso_hit(tool, event, sample, evsel, machine);
}

static int process_buildids(struct record *rec)
{
	struct perf_session *session = rec->session;

	if (perf_data__size(&rec->data) == 0)
		return 0;

	/*
	 * During this process, it'll load kernel map and replace the
	 * dso->long_name to a real pathname it found. In this case
	 * we prefer the vmlinux path like
	 *   /lib/modules/3.16.4/build/vmlinux
	 *
	 * rather than build-id path (in debug directory).
	 *   $HOME/.debug/.build-id/f0/6e17aa50adf4d00b88925e03775de107611551
	 */
	symbol_conf.ignore_vmlinux_buildid = true;

	/*
	 * If --buildid-all is given, it marks all DSOs regardless of hits,
	 * so no need to process samples. But if timestamp_boundary is enabled,
	 * it still needs to walk through all samples to get the timestamps of
	 * the first/last samples.
	 */
	if (rec->buildid_all && !rec->timestamp_boundary)
		rec->tool.sample = NULL;

	return perf_session__process_events(session);
}

static void perf_event__synthesize_guest_os(struct machine *machine, void *data)
{
	int err;
	struct perf_tool *tool = data;
	/*
	 * As for the guest kernel when processing the record & report
	 * subcommands, we arrange module mmap prior to the guest kernel mmap
	 * and trigger a preload dso because default guest module symbols are
	 * loaded from guest kallsyms instead of /lib/modules/XXX/XXX. This
	 * method is used to avoid missing symbols when the first addr is
	 * in a module instead of in the guest kernel.
	 */
	err = perf_event__synthesize_modules(tool, process_synthesized_event,
					     machine);
	if (err < 0)
		pr_err("Couldn't record guest kernel [%d]'s reference"
		       " relocation symbol.\n", machine->pid);

	/*
	 * We use _stext for guest kernel because guest kernel's /proc/kallsyms
	 * have no _text sometimes.
	 */
	err = perf_event__synthesize_kernel_mmap(tool, process_synthesized_event,
						 machine);
	if (err < 0)
		pr_err("Couldn't record guest kernel [%d]'s reference"
		       " relocation symbol.\n", machine->pid);
}

static struct perf_event_header finished_round_event = {
	.size = sizeof(struct perf_event_header),
	.type = PERF_RECORD_FINISHED_ROUND,
};

static void record__adjust_affinity(struct record *rec, struct mmap *map)
{
	if (rec->opts.affinity != PERF_AFFINITY_SYS &&
	    !bitmap_equal(rec->affinity_mask.bits, map->affinity_mask.bits,
			  rec->affinity_mask.nbits)) {
		bitmap_zero(rec->affinity_mask.bits, rec->affinity_mask.nbits);
		bitmap_or(rec->affinity_mask.bits, rec->affinity_mask.bits,
			  map->affinity_mask.bits, rec->affinity_mask.nbits);
		sched_setaffinity(0, MMAP_CPU_MASK_BYTES(&rec->affinity_mask),
				  (cpu_set_t *)rec->affinity_mask.bits);
		if (verbose == 2)
			mmap_cpu_mask__scnprintf(&rec->affinity_mask, "thread");
	}
}

static size_t process_comp_header(void *record, size_t increment)
{
	struct perf_record_compressed *event = record;
	size_t size = sizeof(*event);

	if (increment) {
		event->header.size += increment;
		return increment;
	}

	event->header.type = PERF_RECORD_COMPRESSED;
	event->header.size = size;

	return size;
}

static size_t zstd_compress(struct perf_session *session, void *dst, size_t dst_size,
			    void *src, size_t src_size)
{
	size_t compressed;
	size_t max_record_size = PERF_SAMPLE_MAX_SIZE - sizeof(struct perf_record_compressed) - 1;

	compressed = zstd_compress_stream_to_records(&session->zstd_data, dst, dst_size, src, src_size,
						     max_record_size, process_comp_header);

	session->bytes_transferred += src_size;
	session->bytes_compressed += compressed;

	return compressed;
}

static int record__mmap_read_evlist(struct record *rec, struct evlist *evlist,
				    bool overwrite, bool synch)
{
	u64 bytes_written = rec->bytes_written;
	int i;
	int rc = 0;
	struct mmap *maps;
	int trace_fd = rec->data.file.fd;
	off_t off = 0;

	if (!evlist)
		return 0;

	maps = overwrite ? evlist->overwrite_mmap : evlist->mmap;
	if (!maps)
		return 0;

	if (overwrite && evlist->bkw_mmap_state != BKW_MMAP_DATA_PENDING)
		return 0;

	if (record__aio_enabled(rec))
		off = record__aio_get_pos(trace_fd);

	for (i = 0; i < evlist->core.nr_mmaps; i++) {
		u64 flush = 0;
		struct mmap *map = &maps[i];

		if (map->core.base) {
			record__adjust_affinity(rec, map);
			if (synch) {
				flush = map->core.flush;
				map->core.flush = 1;
			}
			if (!record__aio_enabled(rec)) {
				if (perf_mmap__push(map, rec, record__pushfn) < 0) {
					if (synch)
						map->core.flush = flush;
					rc = -1;
					goto out;
				}
			} else {
				if (record__aio_push(rec, map, &off) < 0) {
					record__aio_set_pos(trace_fd, off);
					if (synch)
						map->core.flush = flush;
					rc = -1;
					goto out;
				}
			}
			if (synch)
				map->core.flush = flush;
		}

		if (map->auxtrace_mmap.base && !rec->opts.auxtrace_snapshot_mode &&
		    !rec->opts.auxtrace_sample_mode &&
		    record__auxtrace_mmap_read(rec, map) != 0) {
			rc = -1;
			goto out;
		}
	}

	if (record__aio_enabled(rec))
		record__aio_set_pos(trace_fd, off);

	/*
	 * Mark the round finished in case we wrote
	 * at least one event.
	 */
	if (bytes_written != rec->bytes_written)
		rc = record__write(rec, NULL, &finished_round_event, sizeof(finished_round_event));

	if (overwrite)
		perf_evlist__toggle_bkw_mmap(evlist, BKW_MMAP_EMPTY);
out:
	return rc;
}

static int record__mmap_read_all(struct record *rec, bool synch)
{
	int err;

	err = record__mmap_read_evlist(rec, rec->evlist, false, synch);
	if (err)
		return err;

	return record__mmap_read_evlist(rec, rec->evlist, true, synch);
}

static void record__init_features(struct record *rec)
{
	struct perf_session *session = rec->session;
	int feat;

	for (feat = HEADER_FIRST_FEATURE; feat < HEADER_LAST_FEATURE; feat++)
		perf_header__set_feat(&session->header, feat);

	if (rec->no_buildid)
		perf_header__clear_feat(&session->header, HEADER_BUILD_ID);

	if (!have_tracepoints(&rec->evlist->core.entries))
		perf_header__clear_feat(&session->header, HEADER_TRACING_DATA);

	if (!rec->opts.branch_stack)
		perf_header__clear_feat(&session->header, HEADER_BRANCH_STACK);

	if (!rec->opts.full_auxtrace)
		perf_header__clear_feat(&session->header, HEADER_AUXTRACE);

	if (!(rec->opts.use_clockid && rec->opts.clockid_res_ns))
		perf_header__clear_feat(&session->header, HEADER_CLOCKID);

	perf_header__clear_feat(&session->header, HEADER_DIR_FORMAT);
	if (!record__comp_enabled(rec))
		perf_header__clear_feat(&session->header, HEADER_COMPRESSED);

	perf_header__clear_feat(&session->header, HEADER_STAT);
}

static void
record__finish_output(struct record *rec)
{
	struct perf_data *data = &rec->data;
	int fd = perf_data__fd(data);

	if (data->is_pipe)
		return;

	rec->session->header.data_size += rec->bytes_written;
	data->file.size = lseek(perf_data__fd(data), 0, SEEK_CUR);

	if (!rec->no_buildid) {
		process_buildids(rec);

		if (rec->buildid_all)
			dsos__hit_all(rec->session);
	}
	perf_session__write_header(rec->session, rec->evlist, fd, true);

	return;
}

static int record__synthesize_workload(struct record *rec, bool tail)
{
	int err;
	struct perf_thread_map *thread_map;

	if (rec->opts.tail_synthesize != tail)
		return 0;

	thread_map = thread_map__new_by_tid(rec->evlist->workload.pid);
	if (thread_map == NULL)
		return -1;

	err = perf_event__synthesize_thread_map(&rec->tool, thread_map,
						process_synthesized_event,
						&rec->session->machines.host,
						rec->opts.sample_address);
	perf_thread_map__put(thread_map);
	return err;
}

static int record__synthesize(struct record *rec, bool tail);

static int
record__switch_output(struct record *rec, bool at_exit)
{
	struct perf_data *data = &rec->data;
	int fd, err;
	char *new_filename;

	/* Same Size: "2015122520103046" */
	char timestamp[] = "InvalidTimestamp";

	record__aio_mmap_read_sync(rec);

	record__synthesize(rec, true);
	if (target__none(&rec->opts.target))
		record__synthesize_workload(rec, true);

	rec->samples = 0;
	record__finish_output(rec);
	err = fetch_current_timestamp(timestamp, sizeof(timestamp));
	if (err) {
		pr_err("Failed to get current timestamp\n");
		return -EINVAL;
	}

	fd = perf_data__switch(data, timestamp,
			       rec->session->header.data_offset,
			       at_exit, &new_filename);
	if (fd >= 0 && !at_exit) {
		rec->bytes_written = 0;
		rec->session->header.data_size = 0;
	}

	if (!quiet)
		fprintf(stderr, "[ perf record: Dump %s.%s ]\n",
			data->path, timestamp);

	if (rec->switch_output.num_files) {
		int n = rec->switch_output.cur_file + 1;

		if (n >= rec->switch_output.num_files)
			n = 0;
		rec->switch_output.cur_file = n;
		if (rec->switch_output.filenames[n]) {
			remove(rec->switch_output.filenames[n]);
			zfree(&rec->switch_output.filenames[n]);
		}
		rec->switch_output.filenames[n] = new_filename;
	} else {
		free(new_filename);
	}

	/* Output tracking events */
	if (!at_exit) {
		record__synthesize(rec, false);

		/*
		 * In 'perf record --switch-output' without -a,
		 * record__synthesize() in record__switch_output() won't
		 * generate tracking events because there's no thread_map
		 * in evlist, which causes the newly created perf.data to
		 * not contain map and comm information.
		 * Create a fake thread_map and directly call
		 * perf_event__synthesize_thread_map() for those events.
		 */
		if (target__none(&rec->opts.target))
			record__synthesize_workload(rec, false);
	}
	return fd;
}

static volatile int workload_exec_errno;

/*
 * perf_evlist__prepare_workload will send a SIGUSR1
 * if the fork fails, since we asked for it by setting its
 * want_signal to true.
 */
static void workload_exec_failed_signal(int signo __maybe_unused,
					siginfo_t *info,
					void *ucontext __maybe_unused)
{
	workload_exec_errno = info->si_value.sival_int;
	done = 1;
	child_finished = 1;
}

static void snapshot_sig_handler(int sig);
static void alarm_sig_handler(int sig);

static const struct perf_event_mmap_page *
perf_evlist__pick_pc(struct evlist *evlist)
{
	if (evlist) {
		if (evlist->mmap && evlist->mmap[0].core.base)
			return evlist->mmap[0].core.base;
		if (evlist->overwrite_mmap && evlist->overwrite_mmap[0].core.base)
			return evlist->overwrite_mmap[0].core.base;
	}
	return NULL;
}

static const struct perf_event_mmap_page *record__pick_pc(struct record *rec)
{
	const struct perf_event_mmap_page *pc;

	pc = perf_evlist__pick_pc(rec->evlist);
	if (pc)
		return pc;
	return NULL;
}

static int record__synthesize(struct record *rec, bool tail)
{
	struct perf_session *session = rec->session;
	struct machine *machine = &session->machines.host;
	struct perf_data *data = &rec->data;
	struct record_opts *opts = &rec->opts;
	struct perf_tool *tool = &rec->tool;
	int fd = perf_data__fd(data);
	int err = 0;

	if (rec->opts.tail_synthesize != tail)
		return 0;

	if (data->is_pipe) {
		/*
		 * We need to synthesize events first, because some
		 * features work on top of them (on the report side).
		 */
		err = perf_event__synthesize_attrs(tool, rec->evlist,
						   process_synthesized_event);
		if (err < 0) {
			pr_err("Couldn't synthesize attrs.\n");
			goto out;
		}

		err = perf_event__synthesize_features(tool, session, rec->evlist,
						      process_synthesized_event);
		if (err < 0) {
			pr_err("Couldn't synthesize features.\n");
			return err;
		}

		if (have_tracepoints(&rec->evlist->core.entries)) {
			/*
			 * FIXME err <= 0 here actually means that
			 * there were no tracepoints so it's not really
			 * an error, just that we don't need to
			 * synthesize anything. We really have to
			 * return this more properly and also
			 * propagate errors that now are calling die()
			 */
			err = perf_event__synthesize_tracing_data(tool, fd, rec->evlist,
								  process_synthesized_event);
			if (err <= 0) {
				pr_err("Couldn't record tracing data.\n");
				goto out;
			}
			rec->bytes_written += err;
		}
	}

	err = perf_event__synth_time_conv(record__pick_pc(rec), tool,
					  process_synthesized_event, machine);
	if (err)
		goto out;

	/* Synthesize id_index before auxtrace_info */
	if (rec->opts.auxtrace_sample_mode) {
		err = perf_event__synthesize_id_index(tool,
						      process_synthesized_event,
						      session->evlist, machine);
		if (err)
			goto out;
	}

	if (rec->opts.full_auxtrace) {
		err = perf_event__synthesize_auxtrace_info(rec->itr, tool,
							   session, process_synthesized_event);
		if (err)
			goto out;
	}

	if (!perf_evlist__exclude_kernel(rec->evlist)) {
		err = perf_event__synthesize_kernel_mmap(tool, process_synthesized_event,
							 machine);
		WARN_ONCE(err < 0, "Couldn't record kernel reference relocation symbol\n"
				   "Symbol resolution may be skewed if relocation was used (e.g. kexec).\n"
				   "Check /proc/kallsyms permission or run as root.\n");

		err = perf_event__synthesize_modules(tool, process_synthesized_event,
						     machine);
		WARN_ONCE(err < 0, "Couldn't record kernel module information.\n"
				   "Symbol resolution may be skewed if relocation was used (e.g. kexec).\n"
				   "Check /proc/modules permission or run as root.\n");
	}

	if (perf_guest) {
		machines__process_guests(&session->machines,
					 perf_event__synthesize_guest_os, tool);
	}

	err = perf_event__synthesize_extra_attr(&rec->tool,
						rec->evlist,
						process_synthesized_event,
						data->is_pipe);
	if (err)
		goto out;

	err = perf_event__synthesize_thread_map2(&rec->tool, rec->evlist->core.threads,
						 process_synthesized_event,
						 NULL);
	if (err < 0) {
		pr_err("Couldn't synthesize thread map.\n");
		return err;
	}

	err = perf_event__synthesize_cpu_map(&rec->tool, rec->evlist->core.cpus,
					     process_synthesized_event, NULL);
	if (err < 0) {
		pr_err("Couldn't synthesize cpu map.\n");
		return err;
	}

	err = perf_event__synthesize_bpf_events(session, process_synthesized_event,
						machine, opts);
	if (err < 0)
		pr_warning("Couldn't synthesize bpf events.\n");

	err = __machine__synthesize_threads(machine, tool, &opts->target, rec->evlist->core.threads,
					    process_synthesized_event, opts->sample_address,
					    1);
out:
	return err;
}

static int __cmd_record(struct record *rec, int argc, const char **argv)
{
	int err;
	int status = 0;
	unsigned long waking = 0;
	const bool forks = argc > 0;
	struct perf_tool *tool = &rec->tool;
	struct record_opts *opts = &rec->opts;
	struct perf_data *data = &rec->data;
	struct perf_session *session;
	bool disabled = false, draining = false;
	struct evlist *sb_evlist = NULL;
	int fd;
	float ratio = 0;

	atexit(record__sig_exit);
	signal(SIGCHLD, sig_handler);
	signal(SIGINT, sig_handler);
	signal(SIGTERM, sig_handler);
	signal(SIGSEGV, sigsegv_handler);

	if (rec->opts.record_namespaces)
		tool->namespace_events = true;

	if (rec->opts.auxtrace_snapshot_mode || rec->switch_output.enabled) {
		signal(SIGUSR2, snapshot_sig_handler);
		if (rec->opts.auxtrace_snapshot_mode)
			trigger_on(&auxtrace_snapshot_trigger);
		if (rec->switch_output.enabled)
			trigger_on(&switch_output_trigger);
	} else {
		signal(SIGUSR2, SIG_IGN);
	}

	session = perf_session__new(data, false, tool);
	if (IS_ERR(session)) {
		pr_err("Perf session creation failed.\n");
		return PTR_ERR(session);
	}

	fd = perf_data__fd(data);
	rec->session = session;

	if (zstd_init(&session->zstd_data, rec->opts.comp_level) < 0) {
		pr_err("Compression initialization failed.\n");
		return -1;
	}

	session->header.env.comp_type = PERF_COMP_ZSTD;
	session->header.env.comp_level = rec->opts.comp_level;

	if (rec->opts.kcore &&
	    !record__kcore_readable(&session->machines.host)) {
		pr_err("ERROR: kcore is not readable.\n");
		return -1;
	}

	record__init_features(rec);

	if (rec->opts.use_clockid && rec->opts.clockid_res_ns)
		session->header.env.clockid_res_ns = rec->opts.clockid_res_ns;

	if (forks) {
		err = perf_evlist__prepare_workload(rec->evlist, &opts->target,
						    argv, data->is_pipe,
						    workload_exec_failed_signal);
		if (err < 0) {
			pr_err("Couldn't run the workload!\n");
			status = err;
			goto out_delete_session;
		}
	}

	/*
	 * If we have just a single event and are sending data
	 * through a pipe, we need to force the ids allocation,
	 * because we synthesize the event name through the pipe
	 * and need the id for that.
	 */
	if (data->is_pipe && rec->evlist->core.nr_entries == 1)
		rec->opts.sample_id = true;

	if (record__open(rec) != 0) {
		err = -1;
		goto out_child;
	}
	session->header.env.comp_mmap_len = session->evlist->core.mmap_len;

	if (rec->opts.kcore) {
		err = record__kcore_copy(&session->machines.host, data);
		if (err) {
			pr_err("ERROR: Failed to copy kcore\n");
			goto out_child;
		}
	}

	err = bpf__apply_obj_config();
	if (err) {
		char errbuf[BUFSIZ];

		bpf__strerror_apply_obj_config(err, errbuf, sizeof(errbuf));
		pr_err("ERROR: Apply config to BPF failed: %s\n",
			 errbuf);
		goto out_child;
	}

	/*
	 * Normally perf_session__new would do this, but it doesn't have the
	 * evlist.
	 */
	if (rec->tool.ordered_events && !perf_evlist__sample_id_all(rec->evlist)) {
		pr_warning("WARNING: No sample_id_all support, falling back to unordered processing\n");
		rec->tool.ordered_events = false;
	}

	if (!rec->evlist->nr_groups)
		perf_header__clear_feat(&session->header, HEADER_GROUP_DESC);

	if (data->is_pipe) {
		err = perf_header__write_pipe(fd);
		if (err < 0)
			goto out_child;
	} else {
		err = perf_session__write_header(session, rec->evlist, fd, false);
		if (err < 0)
			goto out_child;
	}

	if (!rec->no_buildid
	    && !perf_header__has_feat(&session->header, HEADER_BUILD_ID)) {
		pr_err("Couldn't generate buildids. "
		       "Use --no-buildid to profile anyway.\n");
		err = -1;
		goto out_child;
	}

	if (!opts->no_bpf_event)
		bpf_event__add_sb_event(&sb_evlist, &session->header.env);

	if (perf_evlist__start_sb_thread(sb_evlist, &rec->opts.target)) {
		pr_debug("Couldn't start the BPF side band thread:\nBPF programs starting from now on won't be annotatable\n");
		opts->no_bpf_event = true;
	}

	err = record__synthesize(rec, false);
	if (err < 0)
		goto out_child;

	if (rec->realtime_prio) {
		struct sched_param param;

		param.sched_priority = rec->realtime_prio;
		if (sched_setscheduler(0, SCHED_FIFO, &param)) {
			pr_err("Could not set realtime priority.\n");
			err = -1;
			goto out_child;
		}
	}

	/*
	 * When perf is starting the traced process, all the events
	 * (apart from group members) have enable_on_exec=1 set,
	 * so don't spoil it by prematurely enabling them.
	 */
	if (!target__none(&opts->target) && !opts->initial_delay)
		evlist__enable(rec->evlist);

	/*
	 * Let the child rip
	 */
	if (forks) {
		struct machine *machine = &session->machines.host;
		union perf_event *event;
		pid_t tgid;

		event = malloc(sizeof(event->comm) + machine->id_hdr_size);
		if (event == NULL) {
			err = -ENOMEM;
			goto out_child;
		}

		/*
		 * Some H/W events are generated before COMM event
		 * which is emitted during exec(), so perf script
		 * cannot see a correct process name for those events.
		 * Synthesize COMM event to prevent it.
		 */
		tgid = perf_event__synthesize_comm(tool, event,
						   rec->evlist->workload.pid,
						   process_synthesized_event,
						   machine);
		free(event);

		if (tgid == -1)
			goto out_child;

		event = malloc(sizeof(event->namespaces) +
			       (NR_NAMESPACES * sizeof(struct perf_ns_link_info)) +
			       machine->id_hdr_size);
		if (event == NULL) {
			err = -ENOMEM;
			goto out_child;
		}

		/*
		 * Synthesize NAMESPACES event for the command specified.
		 */
		perf_event__synthesize_namespaces(tool, event,
						  rec->evlist->workload.pid,
						  tgid, process_synthesized_event,
						  machine);
		free(event);

		perf_evlist__start_workload(rec->evlist);
	}

	if (opts->initial_delay) {
		usleep(opts->initial_delay * USEC_PER_MSEC);
		evlist__enable(rec->evlist);
	}

	trigger_ready(&auxtrace_snapshot_trigger);
	trigger_ready(&switch_output_trigger);
	perf_hooks__invoke_record_start();
	for (;;) {
		unsigned long long hits = rec->samples;

		/*
		 * rec->evlist->bkw_mmap_state may be BKW_MMAP_EMPTY here:
		 * when done == true and hits != rec->samples in the
		 * previous round.
		 *
		 * perf_evlist__toggle_bkw_mmap ensures we never
		 * convert BKW_MMAP_EMPTY to BKW_MMAP_DATA_PENDING.
		 */
		if (trigger_is_hit(&switch_output_trigger) || done || draining)
			perf_evlist__toggle_bkw_mmap(rec->evlist, BKW_MMAP_DATA_PENDING);

		if (record__mmap_read_all(rec, false) < 0) {
			trigger_error(&auxtrace_snapshot_trigger);
			trigger_error(&switch_output_trigger);
			err = -1;
			goto out_child;
		}

		if (auxtrace_record__snapshot_started) {
			auxtrace_record__snapshot_started = 0;
			if (!trigger_is_error(&auxtrace_snapshot_trigger))
				record__read_auxtrace_snapshot(rec, false);
			if (trigger_is_error(&auxtrace_snapshot_trigger)) {
				pr_err("AUX area tracing snapshot failed\n");
				err = -1;
				goto out_child;
			}
		}

		if (trigger_is_hit(&switch_output_trigger)) {
			/*
			 * If switch_output_trigger is hit, the data in
			 * overwritable ring buffer should have been collected,
			 * so bkw_mmap_state should be set to BKW_MMAP_EMPTY.
			 *
			 * If SIGUSR2 is raised after or during record__mmap_read_all(),
			 * record__mmap_read_all() didn't collect data from
			 * overwritable ring buffer. Read again.
			 */
			if (rec->evlist->bkw_mmap_state == BKW_MMAP_RUNNING)
				continue;
			trigger_ready(&switch_output_trigger);

			/*
			 * Reenable events in overwrite ring buffer after
			 * record__mmap_read_all(): we should have collected
			 * data from it.
			 */
			perf_evlist__toggle_bkw_mmap(rec->evlist, BKW_MMAP_RUNNING);

			if (!quiet)
				fprintf(stderr, "[ perf record: dump data: Woken up %ld times ]\n",
					waking);
			waking = 0;
			fd = record__switch_output(rec, false);
			if (fd < 0) {
				pr_err("Failed to switch to new file\n");
				trigger_error(&switch_output_trigger);
				err = fd;
				goto out_child;
			}

			/* re-arm the alarm */
			if (rec->switch_output.time)
				alarm(rec->switch_output.time);
		}

		if (hits == rec->samples) {
			if (done || draining)
				break;
			err = evlist__poll(rec->evlist, -1);
			/*
			 * Propagate error, only if there's any. Ignore positive
			 * number of returned events and interrupt error.
			 */
			if (err > 0 || (err < 0 && errno == EINTR))
				err = 0;
			waking++;

			if (evlist__filter_pollfd(rec->evlist, POLLERR | POLLHUP) == 0)
				draining = true;
		}

		/*
		 * When perf is starting the traced process, at the end events
		 * die with the process and we wait for that. Thus no need to
		 * disable events in this case.
		 */
		if (done && !disabled && !target__none(&opts->target)) {
			trigger_off(&auxtrace_snapshot_trigger);
			evlist__disable(rec->evlist);
			disabled = true;
		}
	}

	trigger_off(&auxtrace_snapshot_trigger);
	trigger_off(&switch_output_trigger);

	if (opts->auxtrace_snapshot_on_exit)
		record__auxtrace_snapshot_exit(rec);

	if (forks && workload_exec_errno) {
		char msg[STRERR_BUFSIZE];
		const char *emsg = str_error_r(workload_exec_errno, msg, sizeof(msg));
		pr_err("Workload failed: %s\n", emsg);
		err = -1;
		goto out_child;
	}

	if (!quiet)
		fprintf(stderr, "[ perf record: Woken up %ld times to write data ]\n", waking);

	if (target__none(&rec->opts.target))
		record__synthesize_workload(rec, true);

out_child:
	record__mmap_read_all(rec, true);
	record__aio_mmap_read_sync(rec);

	if (rec->session->bytes_transferred && rec->session->bytes_compressed) {
		ratio = (float)rec->session->bytes_transferred/(float)rec->session->bytes_compressed;
		session->header.env.comp_ratio = ratio + 0.5;
	}

	if (forks) {
		int exit_status;

		if (!child_finished)
			kill(rec->evlist->workload.pid, SIGTERM);

		wait(&exit_status);

		if (err < 0)
			status = err;
		else if (WIFEXITED(exit_status))
			status = WEXITSTATUS(exit_status);
		else if (WIFSIGNALED(exit_status))
			signr = WTERMSIG(exit_status);
	} else
		status = err;

	record__synthesize(rec, true);
	/* this will be recalculated during process_buildids() */
	rec->samples = 0;

	if (!err) {
		if (!rec->timestamp_filename) {
			record__finish_output(rec);
		} else {
			fd = record__switch_output(rec, true);
			if (fd < 0) {
				status = fd;
				goto out_delete_session;
			}
		}
	}

	perf_hooks__invoke_record_end();

	if (!err && !quiet) {
		char samples[128];
		const char *postfix = rec->timestamp_filename ?
					".<timestamp>" : "";

		if (rec->samples && !rec->opts.full_auxtrace)
			scnprintf(samples, sizeof(samples),
				  " (%" PRIu64 " samples)", rec->samples);
		else
			samples[0] = '\0';

		fprintf(stderr, "[ perf record: Captured and wrote %.3f MB %s%s%s",
			perf_data__size(data) / 1024.0 / 1024.0,
			data->path, postfix, samples);
		if (ratio) {
			fprintf(stderr, ", compressed (original %.3f MB, ratio is %.3f)",
					rec->session->bytes_transferred / 1024.0 / 1024.0,
					ratio);
		}
		fprintf(stderr, " ]\n");
	}

out_delete_session:
	zstd_fini(&session->zstd_data);
	perf_session__delete(session);

	if (!opts->no_bpf_event)
		perf_evlist__stop_sb_thread(sb_evlist);
	return status;
}

static void callchain_debug(struct callchain_param *callchain)
{
	static const char *str[CALLCHAIN_MAX] = { "NONE", "FP", "DWARF", "LBR" };

	pr_debug("callchain: type %s\n", str[callchain->record_mode]);

	if (callchain->record_mode == CALLCHAIN_DWARF)
		pr_debug("callchain: stack dump size %d\n",
			 callchain->dump_size);
}

int record_opts__parse_callchain(struct record_opts *record,
				 struct callchain_param *callchain,
				 const char *arg, bool unset)
{
	int ret;
	callchain->enabled = !unset;

	/* --no-call-graph */
	if (unset) {
		callchain->record_mode = CALLCHAIN_NONE;
		pr_debug("callchain: disabled\n");
		return 0;
	}

	ret = parse_callchain_record_opt(arg, callchain);
	if (!ret) {
		/* Enable data address sampling for DWARF unwind. */
		if (callchain->record_mode == CALLCHAIN_DWARF)
			record->sample_address = true;
		callchain_debug(callchain);
	}

	return ret;
}

int record_parse_callchain_opt(const struct option *opt,
			       const char *arg,
			       int unset)
{
	return record_opts__parse_callchain(opt->value, &callchain_param, arg, unset);
}

int record_callchain_opt(const struct option *opt,
			 const char *arg __maybe_unused,
			 int unset __maybe_unused)
{
	struct callchain_param *callchain = opt->value;

	callchain->enabled = true;

	if (callchain->record_mode == CALLCHAIN_NONE)
		callchain->record_mode = CALLCHAIN_FP;

	callchain_debug(callchain);
	return 0;
}

static int perf_record_config(const char *var, const char *value, void *cb)
{
	struct record *rec = cb;

	if (!strcmp(var, "record.build-id")) {
		if (!strcmp(value, "cache"))
			rec->no_buildid_cache = false;
		else if (!strcmp(value, "no-cache"))
			rec->no_buildid_cache = true;
		else if (!strcmp(value, "skip"))
			rec->no_buildid = true;
		else
			return -1;
		return 0;
	}
	if (!strcmp(var, "record.call-graph")) {
		var = "call-graph.record-mode";
		return perf_default_config(var, value, cb);
	}
#ifdef HAVE_AIO_SUPPORT
	if (!strcmp(var, "record.aio")) {
		rec->opts.nr_cblocks = strtol(value, NULL, 0);
		if (!rec->opts.nr_cblocks)
			rec->opts.nr_cblocks = nr_cblocks_default;
	}
#endif

	return 0;
}

struct clockid_map {
	const char *name;
	int clockid;
};

#define CLOCKID_MAP(n, c)	\
	{ .name = n, .clockid = (c), }

#define CLOCKID_END	{ .name = NULL, }


/*
 * Add the missing ones, we need to build on many distros...
 */
#ifndef CLOCK_MONOTONIC_RAW
#define CLOCK_MONOTONIC_RAW 4
#endif
#ifndef CLOCK_BOOTTIME
#define CLOCK_BOOTTIME 7
#endif
#ifndef CLOCK_TAI
#define CLOCK_TAI 11
#endif

static const struct clockid_map clockids[] = {
	/* available for all events, NMI safe */
	CLOCKID_MAP("monotonic", CLOCK_MONOTONIC),
	CLOCKID_MAP("monotonic_raw", CLOCK_MONOTONIC_RAW),

	/* available for some events */
	CLOCKID_MAP("realtime", CLOCK_REALTIME),
	CLOCKID_MAP("boottime", CLOCK_BOOTTIME),
	CLOCKID_MAP("tai", CLOCK_TAI),

	/* available for the lazy */
	CLOCKID_MAP("mono", CLOCK_MONOTONIC),
	CLOCKID_MAP("raw", CLOCK_MONOTONIC_RAW),
	CLOCKID_MAP("real", CLOCK_REALTIME),
	CLOCKID_MAP("boot", CLOCK_BOOTTIME),

	CLOCKID_END,
};

static int get_clockid_res(clockid_t clk_id, u64 *res_ns)
{
	struct timespec res;

	*res_ns = 0;
	if (!clock_getres(clk_id, &res))
		*res_ns = res.tv_nsec + res.tv_sec * NSEC_PER_SEC;
	else
		pr_warning("WARNING: Failed to determine specified clock resolution.\n");

	return 0;
}

static int parse_clockid(const struct option *opt, const char *str, int unset)
{
	struct record_opts *opts = (struct record_opts *)opt->value;
	const struct clockid_map *cm;
	const char *ostr = str;

	if (unset) {
		opts->use_clockid = 0;
		return 0;
	}

	/* no arg passed */
	if (!str)
		return 0;

	/* no setting it twice */
	if (opts->use_clockid)
		return -1;

	opts->use_clockid = true;

	/* if it's a number, we're done */
	if (sscanf(str, "%d", &opts->clockid) == 1)
		return get_clockid_res(opts->clockid, &opts->clockid_res_ns);

	/* allow a "CLOCK_" prefix to the name */
	if (!strncasecmp(str, "CLOCK_", 6))
		str += 6;

	for (cm = clockids; cm->name; cm++) {
		if (!strcasecmp(str, cm->name)) {
			opts->clockid = cm->clockid;
			return get_clockid_res(opts->clockid,
					       &opts->clockid_res_ns);
		}
	}

	opts->use_clockid = false;
	ui__warning("unknown clockid %s, check man page\n", ostr);
	return -1;
}

static int record__parse_affinity(const struct option *opt, const char *str, int unset)
{
	struct record_opts *opts = (struct record_opts *)opt->value;

	if (unset || !str)
		return 0;

	if (!strcasecmp(str, "node"))
		opts->affinity = PERF_AFFINITY_NODE;
	else if (!strcasecmp(str, "cpu"))
		opts->affinity = PERF_AFFINITY_CPU;

	return 0;
}

static int parse_output_max_size(const struct option *opt,
				 const char *str, int unset)
{
	unsigned long *s = (unsigned long *)opt->value;
	static struct parse_tag tags_size[] = {
		{ .tag = 'B', .mult = 1 },
		{ .tag = 'K', .mult = 1 << 10 },
		{ .tag = 'M', .mult = 1 << 20 },
		{ .tag = 'G', .mult = 1 << 30 },
		{ .tag = 0 },
	};
	unsigned long val;

	if (unset) {
		*s = 0;
		return 0;
	}

	val = parse_tag_value(str, tags_size);
	if (val != (unsigned long) -1) {
		*s = val;
		return 0;
	}

	return -1;
}

static int record__parse_mmap_pages(const struct option *opt,
				    const char *str,
				    int unset __maybe_unused)
{
	struct record_opts *opts = opt->value;
	char *s, *p;
	unsigned int mmap_pages;
	int ret;

	if (!str)
		return -EINVAL;

	s = strdup(str);
	if (!s)
		return -ENOMEM;

	p = strchr(s, ',');
	if (p)
		*p = '\0';

	if (*s) {
		ret = __perf_evlist__parse_mmap_pages(&mmap_pages, s);
		if (ret)
			goto out_free;
		opts->mmap_pages = mmap_pages;
	}

	if (!p) {
		ret = 0;
		goto out_free;
	}

	ret = __perf_evlist__parse_mmap_pages(&mmap_pages, p + 1);
	if (ret)
		goto out_free;

	opts->auxtrace_mmap_pages = mmap_pages;

out_free:
	free(s);
	return ret;
}

static void switch_output_size_warn(struct record *rec)
{
	u64 wakeup_size = evlist__mmap_size(rec->opts.mmap_pages);
	struct switch_output *s = &rec->switch_output;

	wakeup_size /= 2;

	if (s->size < wakeup_size) {
		char buf[100];

		unit_number__scnprintf(buf, sizeof(buf), wakeup_size);
		pr_warning("WARNING: switch-output data size lower than "
			   "wakeup kernel buffer size (%s), "
			   "expect bigger perf.data sizes\n", buf);
	}
}

static int switch_output_setup(struct record *rec)
{
	struct switch_output *s = &rec->switch_output;
	static struct parse_tag tags_size[] = {
		{ .tag = 'B', .mult = 1       },
		{ .tag = 'K', .mult = 1 << 10 },
		{ .tag = 'M', .mult = 1 << 20 },
		{ .tag = 'G', .mult = 1 << 30 },
		{ .tag = 0 },
	};
	static struct parse_tag tags_time[] = {
		{ .tag = 's', .mult = 1        },
		{ .tag = 'm', .mult = 60       },
		{ .tag = 'h', .mult = 60*60    },
		{ .tag = 'd', .mult = 60*60*24 },
		{ .tag = 0 },
	};
	unsigned long val;

	if (!s->set)
		return 0;

	if (!strcmp(s->str, "signal")) {
		s->signal = true;
		pr_debug("switch-output with SIGUSR2 signal\n");
		goto enabled;
	}

	val = parse_tag_value(s->str, tags_size);
	if (val != (unsigned long) -1) {
		s->size = val;
		pr_debug("switch-output with %s size threshold\n", s->str);
		goto enabled;
	}

	val = parse_tag_value(s->str, tags_time);
	if (val != (unsigned long) -1) {
		s->time = val;
		pr_debug("switch-output with %s time threshold (%lu seconds)\n",
			 s->str, s->time);
		goto enabled;
	}

	return -1;

enabled:
	rec->timestamp_filename = true;
	s->enabled = true;

	if (s->size && !rec->opts.no_buffering)
		switch_output_size_warn(rec);

	return 0;
}

static const char * const __record_usage[] = {
	"perf record [<options>] [<command>]",
	"perf record [<options>] -- <command> [<options>]",
	NULL
};
const char * const *record_usage = __record_usage;

static int build_id__process_mmap(struct perf_tool *tool, union perf_event *event,
				  struct perf_sample *sample, struct machine *machine)
{
	/*
	 * We already have the kernel maps, put in place via perf_session__create_kernel_maps(),
	 * no need to add them twice.
	 */
	if (!(event->header.misc & PERF_RECORD_MISC_USER))
		return 0;
	return perf_event__process_mmap(tool, event, sample, machine);
}

static int build_id__process_mmap2(struct perf_tool *tool, union perf_event *event,
				   struct perf_sample *sample, struct machine *machine)
{
	/*
	 * We already have the kernel maps, put in place via perf_session__create_kernel_maps(),
	 * no need to add them twice.
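	 *
	 * Only mmap2 events carrying PERF_RECORD_MISC_USER in header.misc are
	 * forwarded below; kernel-space mappings are dropped here.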
	 */
	if (!(event->header.misc & PERF_RECORD_MISC_USER))
		return 0;

	return perf_event__process_mmap2(tool, event, sample, machine);
}

/*
 * XXX Ideally would be local to cmd_record() and passed to a record__new
 * because we need to have access to it in record__exit, that is called
 * after cmd_record() exits, but since record_options need to be accessible to
 * builtin-script, leave it here.
 *
 * At least we don't ouch it in all the other functions here directly.
 *
 * Just say no to tons of global variables, sigh.
 */
static struct record record = {
	.opts = {
		.sample_time	     = true,
		.mmap_pages	     = UINT_MAX,
		.user_freq	     = UINT_MAX,
		.user_interval	     = ULLONG_MAX,
		.freq		     = 4000,
		.target		     = {
			.uses_mmap	 = true,
			.default_per_cpu = true,
		},
		.mmap_flush	     = MMAP_FLUSH_DEFAULT,
	},
	.tool = {
		.sample		= process_sample_event,
		.fork		= perf_event__process_fork,
		.exit		= perf_event__process_exit,
		.comm		= perf_event__process_comm,
		.namespaces	= perf_event__process_namespaces,
		.mmap		= build_id__process_mmap,
		.mmap2		= build_id__process_mmap2,
		.ordered_events	= true,
	},
};

const char record_callchain_help[] = CALLCHAIN_RECORD_HELP
	"\n\t\t\t\tDefault: fp";

static bool dry_run;

/*
 * XXX Will stay a global variable till we fix builtin-script.c to stop messing
 * with it and switch to use the library functions in perf_evlist that came
 * from builtin-record.c, i.e. use record_opts,
 * perf_evlist__prepare_workload, etc instead of fork+exec'in 'perf record',
 * using pipes, etc.
 */
static struct option __record_options[] = {
	OPT_CALLBACK('e', "event", &record.evlist, "event",
		     "event selector. "
		     "use 'perf list' to list available events",
		     parse_events_option),
	OPT_CALLBACK(0, "filter", &record.evlist, "filter",
		     "event filter", parse_filter),
	OPT_CALLBACK_NOOPT(0, "exclude-perf", &record.evlist,
			   NULL, "don't record events from perf itself",
			   exclude_perf),
	OPT_STRING('p', "pid", &record.opts.target.pid, "pid",
		   "record events on existing process id"),
	OPT_STRING('t', "tid", &record.opts.target.tid, "tid",
		   "record events on existing thread id"),
	OPT_INTEGER('r', "realtime", &record.realtime_prio,
		    "collect data with this RT SCHED_FIFO priority"),
	OPT_BOOLEAN(0, "no-buffering", &record.opts.no_buffering,
		    "collect data without buffering"),
	OPT_BOOLEAN('R', "raw-samples", &record.opts.raw_samples,
		    "collect raw sample records from all opened counters"),
	OPT_BOOLEAN('a', "all-cpus", &record.opts.target.system_wide,
		    "system-wide collection from all CPUs"),
	OPT_STRING('C', "cpu", &record.opts.target.cpu_list, "cpu",
		   "list of cpus to monitor"),
	OPT_U64('c', "count", &record.opts.user_interval, "event period to sample"),
	OPT_STRING('o', "output", &record.data.path, "file",
		   "output file name"),
	OPT_BOOLEAN_SET('i', "no-inherit", &record.opts.no_inherit,
			&record.opts.no_inherit_set,
			"child tasks do not inherit counters"),
	OPT_BOOLEAN(0, "tail-synthesize", &record.opts.tail_synthesize,
		    "synthesize non-sample events at the end of output"),
	OPT_BOOLEAN(0, "overwrite", &record.opts.overwrite, "use overwrite mode"),
	OPT_BOOLEAN(0, "no-bpf-event", &record.opts.no_bpf_event, "do not record bpf events"),
	OPT_BOOLEAN(0, "strict-freq", &record.opts.strict_freq,
		    "Fail if the specified frequency can't be used"),
	OPT_CALLBACK('F', "freq", &record.opts, "freq or 'max'",
		     "profile at this frequency",
		     record__parse_freq),
	OPT_CALLBACK('m', "mmap-pages", &record.opts, "pages[,pages]",
		     "number of mmap data pages and AUX area tracing mmap pages",
		     record__parse_mmap_pages),
	OPT_CALLBACK(0, "mmap-flush", &record.opts, "number",
		     "Minimal number of bytes that is extracted from mmap data pages (default: 1)",
		     record__mmap_flush_parse),
	OPT_BOOLEAN(0, "group", &record.opts.group,
		    "put the counters into a counter group"),
	OPT_CALLBACK_NOOPT('g', NULL, &callchain_param,
			   NULL, "enables call-graph recording",
			   &record_callchain_opt),
	OPT_CALLBACK(0, "call-graph", &record.opts,
		     "record_mode[,record_size]", record_callchain_help,
		     &record_parse_callchain_opt),
	OPT_INCR('v', "verbose", &verbose,
		 "be more verbose (show counter open errors, etc)"),
	OPT_BOOLEAN('q', "quiet", &quiet, "don't print any message"),
	OPT_BOOLEAN('s', "stat", &record.opts.inherit_stat,
		    "per thread counts"),
	OPT_BOOLEAN('d', "data", &record.opts.sample_address, "Record the sample addresses"),
	OPT_BOOLEAN(0, "phys-data", &record.opts.sample_phys_addr,
		    "Record the sample physical addresses"),
	OPT_BOOLEAN(0, "sample-cpu", &record.opts.sample_cpu, "Record the sample cpu"),
	OPT_BOOLEAN_SET('T', "timestamp", &record.opts.sample_time,
			&record.opts.sample_time_set,
			"Record the sample timestamps"),
	OPT_BOOLEAN_SET('P', "period", &record.opts.period, &record.opts.period_set,
			"Record the sample period"),
	OPT_BOOLEAN('n', "no-samples", &record.opts.no_samples,
		    "don't sample"),
	OPT_BOOLEAN_SET('N', "no-buildid-cache", &record.no_buildid_cache,
			&record.no_buildid_cache_set,
			"do not update the buildid cache"),
	OPT_BOOLEAN_SET('B', "no-buildid", &record.no_buildid,
			&record.no_buildid_set,
			"do not collect buildids in perf.data"),
	OPT_CALLBACK('G', "cgroup", &record.evlist, "name",
		     "monitor event in cgroup name only",
		     parse_cgroups),
	OPT_UINTEGER('D', "delay", &record.opts.initial_delay,
		     "ms to wait before starting measurement after program start"),
	OPT_BOOLEAN(0, "kcore", &record.opts.kcore, "copy /proc/kcore"),
	OPT_STRING('u', "uid", &record.opts.target.uid_str, "user",
		   "user to profile"),

	OPT_CALLBACK_NOOPT('b', "branch-any", &record.opts.branch_stack,
			   "branch any", "sample any taken branches",
			   parse_branch_stack),

	OPT_CALLBACK('j', "branch-filter", &record.opts.branch_stack,
		     "branch filter mask", "branch stack filter modes",
		     parse_branch_stack),
	OPT_BOOLEAN('W', "weight", &record.opts.sample_weight,
		    "sample by weight (on special events only)"),
	OPT_BOOLEAN(0, "transaction", &record.opts.sample_transaction,
		    "sample transaction flags (special events only)"),
	OPT_BOOLEAN(0, "per-thread", &record.opts.target.per_thread,
		    "use per-thread mmaps"),
	OPT_CALLBACK_OPTARG('I', "intr-regs", &record.opts.sample_intr_regs, NULL, "any register",
			    "sample selected machine registers on interrupt,"
			    " use '-I?' to list register names", parse_intr_regs),
	OPT_CALLBACK_OPTARG(0, "user-regs", &record.opts.sample_user_regs, NULL, "any register",
			    "sample selected machine registers on interrupt,"
			    " use '--user-regs=?' to list register names", parse_user_regs),
	OPT_BOOLEAN(0, "running-time", &record.opts.running_time,
		    "Record running/enabled time of read (:S) events"),
	OPT_CALLBACK('k', "clockid", &record.opts,
		     "clockid", "clockid to use for events, see clock_gettime()",
		     parse_clockid),
	OPT_STRING_OPTARG('S', "snapshot", &record.opts.auxtrace_snapshot_opts,
			  "opts", "AUX area tracing Snapshot Mode", ""),
	OPT_STRING_OPTARG(0, "aux-sample", &record.opts.auxtrace_sample_opts,
			  "opts", "sample AUX area", ""),
	OPT_UINTEGER(0, "proc-map-timeout", &proc_map_timeout,
		     "per thread proc mmap processing timeout in ms"),
	OPT_BOOLEAN(0, "namespaces", &record.opts.record_namespaces,
		    "Record namespaces events"),
	OPT_BOOLEAN(0, "switch-events", &record.opts.record_switch_events,
		    "Record context switch events"),
	OPT_BOOLEAN_FLAG(0, "all-kernel", &record.opts.all_kernel,
			 "Configure all used events to run in kernel space.",
			 PARSE_OPT_EXCLUSIVE),
	OPT_BOOLEAN_FLAG(0, "all-user", &record.opts.all_user,
			 "Configure all used events to run in user space.",
			 PARSE_OPT_EXCLUSIVE),
	OPT_BOOLEAN(0, "kernel-callchains", &record.opts.kernel_callchains,
		    "collect kernel callchains"),
	OPT_BOOLEAN(0, "user-callchains", &record.opts.user_callchains,
		    "collect user callchains"),
	OPT_STRING(0, "clang-path", &llvm_param.clang_path, "clang path",
		   "clang binary to use for compiling BPF scriptlets"),
	OPT_STRING(0, "clang-opt", &llvm_param.clang_opt, "clang options",
		   "options passed to clang when compiling BPF scriptlets"),
	OPT_STRING(0, "vmlinux", &symbol_conf.vmlinux_name,
		   "file", "vmlinux pathname"),
	OPT_BOOLEAN(0, "buildid-all", &record.buildid_all,
		    "Record build-id of all DSOs regardless of hits"),
	OPT_BOOLEAN(0, "timestamp-filename", &record.timestamp_filename,
		    "append "
		    "timestamp to output filename"),
	OPT_BOOLEAN(0, "timestamp-boundary", &record.timestamp_boundary,
		    "Record timestamp boundary (time of first/last samples)"),
	OPT_STRING_OPTARG_SET(0, "switch-output", &record.switch_output.str,
			      &record.switch_output.set, "signal or size[BKMG] or time[smhd]",
			      "Switch output when receiving SIGUSR2 (signal) or cross a size or time threshold",
			      "signal"),
	OPT_INTEGER(0, "switch-max-files", &record.switch_output.num_files,
		    "Limit number of switch output generated files"),
	OPT_BOOLEAN(0, "dry-run", &dry_run,
		    "Parse options then exit"),
#ifdef HAVE_AIO_SUPPORT
	OPT_CALLBACK_OPTARG(0, "aio", &record.opts,
			    &nr_cblocks_default, "n", "Use <n> control blocks in asynchronous trace writing mode (default: 1, max: 4)",
			    record__aio_parse),
#endif
	OPT_CALLBACK(0, "affinity", &record.opts, "node|cpu",
		     "Set affinity mask of trace reading thread to NUMA node cpu mask or cpu of processed mmap buffer",
		     record__parse_affinity),
#ifdef HAVE_ZSTD_SUPPORT
	OPT_CALLBACK_OPTARG('z', "compression-level", &record.opts, &comp_level_default,
			    "n", "Compress records using the specified level (default: 1 - fastest compression, 22 - greatest compression)",
			    record__parse_comp_level),
#endif
	OPT_CALLBACK(0, "max-size", &record.output_max_size,
		     "size", "Limit the maximum size of the output file", parse_output_max_size),
	OPT_END()
};

struct option *record_options = __record_options;

int cmd_record(int argc, const char **argv)
{
	int err;
	struct record *rec = &record;
	char errbuf[BUFSIZ];

	setlocale(LC_ALL, "");

#ifndef HAVE_LIBBPF_SUPPORT
# define set_nobuild(s, l, c) set_option_nobuild(record_options, s, l, "NO_LIBBPF=1", c)
	set_nobuild('\0', "clang-path", true);
	set_nobuild('\0', "clang-opt", true);
# undef set_nobuild
#endif

#ifndef HAVE_BPF_PROLOGUE
# if !defined (HAVE_DWARF_SUPPORT)
#  define REASON "NO_DWARF=1"
# elif !defined (HAVE_LIBBPF_SUPPORT)
#  define REASON "NO_LIBBPF=1"
# else
#  define REASON "this architecture doesn't support BPF prologue"
# endif
# define set_nobuild(s, l, c) set_option_nobuild(record_options, s, l, REASON, c)
	set_nobuild('\0', "vmlinux", true);
# undef set_nobuild
# undef REASON
#endif

	rec->opts.affinity = PERF_AFFINITY_SYS;

	rec->evlist = evlist__new();
	if (rec->evlist == NULL)
		return -ENOMEM;

	err = perf_config(perf_record_config, rec);
	if (err)
		return err;

	argc = parse_options(argc, argv, record_options, record_usage,
			     PARSE_OPT_STOP_AT_NON_OPTION);
	if (quiet)
		perf_quiet_option();

	/* Make system wide (-a) the default target.
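	 * That is, "perf record" with no workload and no --pid/--tid/--cpu
	 * target behaves like "perf record -a".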
	 */
	if (!argc && target__none(&rec->opts.target))
		rec->opts.target.system_wide = true;

	if (nr_cgroups && !rec->opts.target.system_wide) {
		usage_with_options_msg(record_usage, record_options,
			"cgroup monitoring only available in system-wide mode");
	}

	if (rec->opts.kcore)
		rec->data.is_dir = true;

	if (rec->opts.comp_level != 0) {
		pr_debug("Compression enabled, disabling build id collection at the end of the session.\n");
		rec->no_buildid = true;
	}

	if (rec->opts.record_switch_events &&
	    !perf_can_record_switch_events()) {
		ui__error("kernel does not support recording context switch events\n");
		parse_options_usage(record_usage, record_options, "switch-events", 0);
		return -EINVAL;
	}

	if (switch_output_setup(rec)) {
		parse_options_usage(record_usage, record_options, "switch-output", 0);
		return -EINVAL;
	}

	if (rec->switch_output.time) {
		signal(SIGALRM, alarm_sig_handler);
		alarm(rec->switch_output.time);
	}

	if (rec->switch_output.num_files) {
		rec->switch_output.filenames = calloc(rec->switch_output.num_files,
						      sizeof(char *));
		if (!rec->switch_output.filenames)
			return -ENOMEM;
	}

	/*
	 * Allow aliases to facilitate the lookup of symbols for address
	 * filters. Refer to auxtrace_parse_filters().
	 */
	symbol_conf.allow_aliases = true;

	symbol__init(NULL);

	if (rec->opts.affinity != PERF_AFFINITY_SYS) {
		rec->affinity_mask.nbits = cpu__max_cpu();
		rec->affinity_mask.bits = bitmap_alloc(rec->affinity_mask.nbits);
		if (!rec->affinity_mask.bits) {
			pr_err("Failed to allocate thread mask for %zd cpus\n", rec->affinity_mask.nbits);
			return -ENOMEM;
		}
		pr_debug2("thread mask[%zd]: empty\n", rec->affinity_mask.nbits);
	}

	err = record__auxtrace_init(rec);
	if (err)
		goto out;

	if (dry_run)
		goto out;

	err = bpf__setup_stdout(rec->evlist);
	if (err) {
		bpf__strerror_setup_stdout(rec->evlist, err, errbuf, sizeof(errbuf));
		pr_err("ERROR: Setup BPF stdout failed: %s\n",
		       errbuf);
		goto out;
	}

	err = -ENOMEM;

	if (rec->no_buildid_cache || rec->no_buildid) {
		disable_buildid_cache();
	} else if (rec->switch_output.enabled) {
		/*
		 * In 'perf record --switch-output', disable buildid
		 * generation by default to reduce data file switching
		 * overhead.
		 * Still generate buildids if they are explicitly
		 * requested using
		 *
		 *  perf record --switch-output --no-no-buildid \
		 *              --no-no-buildid-cache
		 *
		 * The following code is equivalent to:
		 *
		 * if ((rec->no_buildid || !rec->no_buildid_set) &&
		 *     (rec->no_buildid_cache || !rec->no_buildid_cache_set))
		 *	disable_buildid_cache();
		 */
		bool disable = true;

		if (rec->no_buildid_set && !rec->no_buildid)
			disable = false;
		if (rec->no_buildid_cache_set && !rec->no_buildid_cache)
			disable = false;
		if (disable) {
			rec->no_buildid = true;
			rec->no_buildid_cache = true;
			disable_buildid_cache();
		}
	}

	if (record.opts.overwrite)
		record.opts.tail_synthesize = true;

	if (rec->evlist->core.nr_entries == 0 &&
	    __perf_evlist__add_default(rec->evlist, !record.opts.no_samples) < 0) {
		pr_err("Not enough memory for event selector list\n");
		goto out;
	}

	if (rec->opts.target.tid && !rec->opts.no_inherit_set)
		rec->opts.no_inherit = true;

	err = target__validate(&rec->opts.target);
	if (err) {
		target__strerror(&rec->opts.target, err, errbuf, BUFSIZ);
		ui__warning("%s\n", errbuf);
	}

	err = target__parse_uid(&rec->opts.target);
	if (err) {
		int saved_errno = errno;

		target__strerror(&rec->opts.target, err, errbuf, BUFSIZ);
		ui__error("%s", errbuf);

		err = -saved_errno;
		goto out;
	}

	/* Enable ignoring missing threads when -u/-p option is defined. */
	rec->opts.ignore_missing_thread = rec->opts.target.uid != UINT_MAX || rec->opts.target.pid;

	err = -ENOMEM;
	if (perf_evlist__create_maps(rec->evlist, &rec->opts.target) < 0)
		usage_with_options(record_usage, record_options);

	err = auxtrace_record__options(rec->itr, rec->evlist, &rec->opts);
	if (err)
		goto out;

	/*
	 * We take all buildids when the file contains
	 * AUX area tracing data because we do not decode the
	 * trace, as that would take too long.
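	 * Decoding would be needed to know which DSOs were actually hit,
	 * so record the build-id of every DSO instead (rec->buildid_all
	 * below).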
	 */
	if (rec->opts.full_auxtrace)
		rec->buildid_all = true;

	if (record_opts__config(&rec->opts)) {
		err = -EINVAL;
		goto out;
	}

	if (rec->opts.nr_cblocks > nr_cblocks_max)
		rec->opts.nr_cblocks = nr_cblocks_max;
	pr_debug("nr_cblocks: %d\n", rec->opts.nr_cblocks);

	pr_debug("affinity: %s\n", affinity_tags[rec->opts.affinity]);
	pr_debug("mmap flush: %d\n", rec->opts.mmap_flush);

	if (rec->opts.comp_level > comp_level_max)
		rec->opts.comp_level = comp_level_max;
	pr_debug("comp level: %d\n", rec->opts.comp_level);

	err = __cmd_record(&record, argc, argv);
out:
	bitmap_free(rec->affinity_mask.bits);
	evlist__delete(rec->evlist);
	symbol__exit();
	auxtrace_record__free(rec->itr);
	return err;
}

static void snapshot_sig_handler(int sig __maybe_unused)
{
	struct record *rec = &record;

	if (trigger_is_ready(&auxtrace_snapshot_trigger)) {
		trigger_hit(&auxtrace_snapshot_trigger);
		auxtrace_record__snapshot_started = 1;
		if (auxtrace_record__snapshot_start(record.itr))
			trigger_error(&auxtrace_snapshot_trigger);
	}

	if (switch_output_signal(rec))
		trigger_hit(&switch_output_trigger);
}

static void alarm_sig_handler(int sig __maybe_unused)
{
	struct record *rec = &record;

	if (switch_output_time(rec))
		trigger_hit(&switch_output_trigger);
}
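
/*
 * Illustrative invocations (not part of the build; shown only to tie the
 * option handling above together, actual output file names depend on the
 * configuration and on --timestamp-filename):
 *
 *   perf record -a --switch-output=1G --switch-max-files=8
 *	rotate the output whenever 1GB has been written, keeping at most
 *	eight timestamped perf.data files.
 *
 *   perf record -k monotonic_raw --max-size=500M -- ./workload
 *	profile the (hypothetical) './workload' using CLOCK_MONOTONIC_RAW
 *	timestamps and stop the session once 500MB of data has been written.
 */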