1 // SPDX-License-Identifier: GPL-2.0 2 /* 3 * builtin-record.c 4 * 5 * Builtin record command: Record the profile of a workload 6 * (or a CPU, or a PID) into the perf.data output file - for 7 * later analysis via perf report. 8 */ 9 #include "builtin.h" 10 11 #include "util/build-id.h" 12 #include <subcmd/parse-options.h> 13 #include "util/parse-events.h" 14 #include "util/config.h" 15 16 #include "util/callchain.h" 17 #include "util/cgroup.h" 18 #include "util/header.h" 19 #include "util/event.h" 20 #include "util/evlist.h" 21 #include "util/evsel.h" 22 #include "util/debug.h" 23 #include "util/mmap.h" 24 #include "util/target.h" 25 #include "util/session.h" 26 #include "util/tool.h" 27 #include "util/symbol.h" 28 #include "util/record.h" 29 #include "util/cpumap.h" 30 #include "util/thread_map.h" 31 #include "util/data.h" 32 #include "util/perf_regs.h" 33 #include "util/auxtrace.h" 34 #include "util/tsc.h" 35 #include "util/parse-branch-options.h" 36 #include "util/parse-regs-options.h" 37 #include "util/llvm-utils.h" 38 #include "util/bpf-loader.h" 39 #include "util/trigger.h" 40 #include "util/perf-hooks.h" 41 #include "util/cpu-set-sched.h" 42 #include "util/synthetic-events.h" 43 #include "util/time-utils.h" 44 #include "util/units.h" 45 #include "util/bpf-event.h" 46 #include "util/util.h" 47 #include "asm/bug.h" 48 #include "perf.h" 49 50 #include <errno.h> 51 #include <inttypes.h> 52 #include <locale.h> 53 #include <poll.h> 54 #include <pthread.h> 55 #include <unistd.h> 56 #include <sched.h> 57 #include <signal.h> 58 #include <sys/mman.h> 59 #include <sys/wait.h> 60 #include <sys/types.h> 61 #include <sys/stat.h> 62 #include <fcntl.h> 63 #include <linux/err.h> 64 #include <linux/string.h> 65 #include <linux/time64.h> 66 #include <linux/zalloc.h> 67 #include <linux/bitmap.h> 68 69 struct switch_output { 70 bool enabled; 71 bool signal; 72 unsigned long size; 73 unsigned long time; 74 const char *str; 75 bool set; 76 char **filenames; 77 int num_files; 78 int cur_file; 79 }; 80 81 struct record { 82 struct perf_tool tool; 83 struct record_opts opts; 84 u64 bytes_written; 85 struct perf_data data; 86 struct auxtrace_record *itr; 87 struct evlist *evlist; 88 struct perf_session *session; 89 int realtime_prio; 90 bool no_buildid; 91 bool no_buildid_set; 92 bool no_buildid_cache; 93 bool no_buildid_cache_set; 94 bool buildid_all; 95 bool timestamp_filename; 96 bool timestamp_boundary; 97 struct switch_output switch_output; 98 unsigned long long samples; 99 struct mmap_cpu_mask affinity_mask; 100 unsigned long output_max_size; /* = 0: unlimited */ 101 }; 102 103 static volatile int done; 104 105 static volatile int auxtrace_record__snapshot_started; 106 static DEFINE_TRIGGER(auxtrace_snapshot_trigger); 107 static DEFINE_TRIGGER(switch_output_trigger); 108 109 static const char *affinity_tags[PERF_AFFINITY_MAX] = { 110 "SYS", "NODE", "CPU" 111 }; 112 113 static bool switch_output_signal(struct record *rec) 114 { 115 return rec->switch_output.signal && 116 trigger_is_ready(&switch_output_trigger); 117 } 118 119 static bool switch_output_size(struct record *rec) 120 { 121 return rec->switch_output.size && 122 trigger_is_ready(&switch_output_trigger) && 123 (rec->bytes_written >= rec->switch_output.size); 124 } 125 126 static bool switch_output_time(struct record *rec) 127 { 128 return rec->switch_output.time && 129 trigger_is_ready(&switch_output_trigger); 130 } 131 132 static bool record__output_max_size_exceeded(struct record *rec) 133 { 134 return rec->output_max_size && 135 
(rec->bytes_written >= rec->output_max_size);
}

static int record__write(struct record *rec, struct mmap *map __maybe_unused,
			 void *bf, size_t size)
{
	struct perf_data_file *file = &rec->session->data->file;

	if (perf_data_file__write(file, bf, size) < 0) {
		pr_err("failed to write perf data, error: %m\n");
		return -1;
	}

	rec->bytes_written += size;

	if (record__output_max_size_exceeded(rec) && !done) {
		fprintf(stderr, "[ perf record: perf size limit reached (%" PRIu64 " KB),"
				" stopping session ]\n",
				rec->bytes_written >> 10);
		done = 1;
	}

	if (switch_output_size(rec))
		trigger_hit(&switch_output_trigger);

	return 0;
}

static int record__aio_enabled(struct record *rec);
static int record__comp_enabled(struct record *rec);
static size_t zstd_compress(struct perf_session *session, void *dst, size_t dst_size,
			    void *src, size_t src_size);

#ifdef HAVE_AIO_SUPPORT
static int record__aio_write(struct aiocb *cblock, int trace_fd,
			     void *buf, size_t size, off_t off)
{
	int rc;

	cblock->aio_fildes = trace_fd;
	cblock->aio_buf = buf;
	cblock->aio_nbytes = size;
	cblock->aio_offset = off;
	cblock->aio_sigevent.sigev_notify = SIGEV_NONE;

	do {
		rc = aio_write(cblock);
		if (rc == 0) {
			break;
		} else if (errno != EAGAIN) {
			cblock->aio_fildes = -1;
			pr_err("failed to queue perf data, error: %m\n");
			break;
		}
	} while (1);

	return rc;
}

static int record__aio_complete(struct mmap *md, struct aiocb *cblock)
{
	void *rem_buf;
	off_t rem_off;
	size_t rem_size;
	int rc, aio_errno;
	ssize_t aio_ret, written;

	aio_errno = aio_error(cblock);
	if (aio_errno == EINPROGRESS)
		return 0;

	written = aio_ret = aio_return(cblock);
	if (aio_ret < 0) {
		if (aio_errno != EINTR)
			pr_err("failed to write perf data, error: %m\n");
		written = 0;
	}

	rem_size = cblock->aio_nbytes - written;

	if (rem_size == 0) {
		cblock->aio_fildes = -1;
		/*
		 * md->refcount is incremented in record__aio_pushfn() for
		 * every aio write request started in record__aio_push(), so
		 * decrement it because the request is now complete.
		 */
		perf_mmap__put(&md->core);
		rc = 1;
	} else {
		/*
		 * The aio write request may need to be restarted with the
		 * remainder if the kernel didn't write the whole chunk
		 * at once.
		 */
		rem_off = cblock->aio_offset + written;
		rem_buf = (void *)(cblock->aio_buf + written);
		record__aio_write(cblock, cblock->aio_fildes,
				  rem_buf, rem_size, rem_off);
		rc = 0;
	}

	return rc;
}
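/*
 * Summary of the AIO write path above: each mmap'ed ring buffer gets up to
 * nr_cblocks in-flight aio write slots.  Data is copied out of the kernel
 * buffer into map->aio.data[] and queued with aio_write(); requests the
 * kernel completed only partially are restarted with the remainder in
 * record__aio_complete().
 *
 * Illustrative usage (only available when perf is built with aio support):
 *
 *	perf record --aio=4 -o perf.data -- <workload>
 */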
static int record__aio_sync(struct mmap *md, bool sync_all)
{
	struct aiocb **aiocb = md->aio.aiocb;
	struct aiocb *cblocks = md->aio.cblocks;
	struct timespec timeout = { 0, 1000 * 1000 * 1 };	/* 1ms */
	int i, do_suspend;

	do {
		do_suspend = 0;
		for (i = 0; i < md->aio.nr_cblocks; ++i) {
			if (cblocks[i].aio_fildes == -1 || record__aio_complete(md, &cblocks[i])) {
				if (sync_all)
					aiocb[i] = NULL;
				else
					return i;
			} else {
				/*
				 * The started aio write is not complete yet,
				 * so it has to be waited for before the
				 * next allocation.
				 */
				aiocb[i] = &cblocks[i];
				do_suspend = 1;
			}
		}
		if (!do_suspend)
			return -1;

		while (aio_suspend((const struct aiocb **)aiocb, md->aio.nr_cblocks, &timeout)) {
			if (!(errno == EAGAIN || errno == EINTR))
				pr_err("failed to sync perf data, error: %m\n");
		}
	} while (1);
}

struct record_aio {
	struct record *rec;
	void *data;
	size_t size;
};

static int record__aio_pushfn(struct mmap *map, void *to, void *buf, size_t size)
{
	struct record_aio *aio = to;

	/*
	 * map->core.base data pointed to by buf is copied into a free
	 * map->aio.data[] buffer to release space in the kernel buffer as
	 * fast as possible, calling perf_mmap__consume() from the
	 * perf_mmap__push() function.
	 *
	 * That lets the kernel proceed with storing more profiling data into
	 * the kernel buffer earlier than other per-cpu kernel buffers are handled.
	 *
	 * Copying can be done in two steps in case the chunk of profiling data
	 * crosses the upper bound of the kernel buffer.  In this case we first
	 * move part of the data from map->start till the upper bound and then
	 * the remainder from the beginning of the kernel buffer till the end
	 * of the data chunk.
	 */

	if (record__comp_enabled(aio->rec)) {
		size = zstd_compress(aio->rec->session, aio->data + aio->size,
				     mmap__mmap_len(map) - aio->size,
				     buf, size);
	} else {
		memcpy(aio->data + aio->size, buf, size);
	}

	if (!aio->size) {
		/*
		 * Increment map->refcount to guard the map->aio.data[] buffer
		 * from premature deallocation, because the map object can be
		 * released before the aio write request started on the
		 * map->aio.data[] buffer is complete.
		 *
		 * perf_mmap__put() is done at record__aio_complete() after
		 * the started aio request completes, or at record__aio_push()
		 * if the request failed to start.
		 */
		perf_mmap__get(&map->core);
	}

	aio->size += size;

	return size;
}

static int record__aio_push(struct record *rec, struct mmap *map, off_t *off)
{
	int ret, idx;
	int trace_fd = rec->session->data->file.fd;
	struct record_aio aio = { .rec = rec, .size = 0 };

	/*
	 * Call record__aio_sync() to wait until a map->aio.data[] buffer
	 * becomes available after the previous aio write operation.
	 */

	idx = record__aio_sync(map, false);
	aio.data = map->aio.data[idx];
	ret = perf_mmap__push(map, &aio, record__aio_pushfn);
	if (ret != 0) /* ret > 0 - no data, ret < 0 - error */
		return ret;

	rec->samples++;
	ret = record__aio_write(&(map->aio.cblocks[idx]), trace_fd, aio.data, aio.size, *off);
	if (!ret) {
		*off += aio.size;
		rec->bytes_written += aio.size;
		if (switch_output_size(rec))
			trigger_hit(&switch_output_trigger);
	} else {
		/*
		 * Decrement map->refcount incremented in record__aio_pushfn()
		 * back if the record__aio_write() operation failed to start,
		 * otherwise map->refcount is decremented in record__aio_complete()
		 * after the aio write operation finishes successfully.
356 */ 357 perf_mmap__put(&map->core); 358 } 359 360 return ret; 361 } 362 363 static off_t record__aio_get_pos(int trace_fd) 364 { 365 return lseek(trace_fd, 0, SEEK_CUR); 366 } 367 368 static void record__aio_set_pos(int trace_fd, off_t pos) 369 { 370 lseek(trace_fd, pos, SEEK_SET); 371 } 372 373 static void record__aio_mmap_read_sync(struct record *rec) 374 { 375 int i; 376 struct evlist *evlist = rec->evlist; 377 struct mmap *maps = evlist->mmap; 378 379 if (!record__aio_enabled(rec)) 380 return; 381 382 for (i = 0; i < evlist->core.nr_mmaps; i++) { 383 struct mmap *map = &maps[i]; 384 385 if (map->core.base) 386 record__aio_sync(map, true); 387 } 388 } 389 390 static int nr_cblocks_default = 1; 391 static int nr_cblocks_max = 4; 392 393 static int record__aio_parse(const struct option *opt, 394 const char *str, 395 int unset) 396 { 397 struct record_opts *opts = (struct record_opts *)opt->value; 398 399 if (unset) { 400 opts->nr_cblocks = 0; 401 } else { 402 if (str) 403 opts->nr_cblocks = strtol(str, NULL, 0); 404 if (!opts->nr_cblocks) 405 opts->nr_cblocks = nr_cblocks_default; 406 } 407 408 return 0; 409 } 410 #else /* HAVE_AIO_SUPPORT */ 411 static int nr_cblocks_max = 0; 412 413 static int record__aio_push(struct record *rec __maybe_unused, struct mmap *map __maybe_unused, 414 off_t *off __maybe_unused) 415 { 416 return -1; 417 } 418 419 static off_t record__aio_get_pos(int trace_fd __maybe_unused) 420 { 421 return -1; 422 } 423 424 static void record__aio_set_pos(int trace_fd __maybe_unused, off_t pos __maybe_unused) 425 { 426 } 427 428 static void record__aio_mmap_read_sync(struct record *rec __maybe_unused) 429 { 430 } 431 #endif 432 433 static int record__aio_enabled(struct record *rec) 434 { 435 return rec->opts.nr_cblocks > 0; 436 } 437 438 #define MMAP_FLUSH_DEFAULT 1 439 static int record__mmap_flush_parse(const struct option *opt, 440 const char *str, 441 int unset) 442 { 443 int flush_max; 444 struct record_opts *opts = (struct record_opts *)opt->value; 445 static struct parse_tag tags[] = { 446 { .tag = 'B', .mult = 1 }, 447 { .tag = 'K', .mult = 1 << 10 }, 448 { .tag = 'M', .mult = 1 << 20 }, 449 { .tag = 'G', .mult = 1 << 30 }, 450 { .tag = 0 }, 451 }; 452 453 if (unset) 454 return 0; 455 456 if (str) { 457 opts->mmap_flush = parse_tag_value(str, tags); 458 if (opts->mmap_flush == (int)-1) 459 opts->mmap_flush = strtol(str, NULL, 0); 460 } 461 462 if (!opts->mmap_flush) 463 opts->mmap_flush = MMAP_FLUSH_DEFAULT; 464 465 flush_max = evlist__mmap_size(opts->mmap_pages); 466 flush_max /= 4; 467 if (opts->mmap_flush > flush_max) 468 opts->mmap_flush = flush_max; 469 470 return 0; 471 } 472 473 #ifdef HAVE_ZSTD_SUPPORT 474 static unsigned int comp_level_default = 1; 475 476 static int record__parse_comp_level(const struct option *opt, const char *str, int unset) 477 { 478 struct record_opts *opts = opt->value; 479 480 if (unset) { 481 opts->comp_level = 0; 482 } else { 483 if (str) 484 opts->comp_level = strtol(str, NULL, 0); 485 if (!opts->comp_level) 486 opts->comp_level = comp_level_default; 487 } 488 489 return 0; 490 } 491 #endif 492 static unsigned int comp_level_max = 22; 493 494 static int record__comp_enabled(struct record *rec) 495 { 496 return rec->opts.comp_level > 0; 497 } 498 499 static int process_synthesized_event(struct perf_tool *tool, 500 union perf_event *event, 501 struct perf_sample *sample __maybe_unused, 502 struct machine *machine __maybe_unused) 503 { 504 struct record *rec = container_of(tool, struct record, tool); 505 return record__write(rec, 
NULL, event, event->header.size); 506 } 507 508 static int process_locked_synthesized_event(struct perf_tool *tool, 509 union perf_event *event, 510 struct perf_sample *sample __maybe_unused, 511 struct machine *machine __maybe_unused) 512 { 513 static pthread_mutex_t synth_lock = PTHREAD_MUTEX_INITIALIZER; 514 int ret; 515 516 pthread_mutex_lock(&synth_lock); 517 ret = process_synthesized_event(tool, event, sample, machine); 518 pthread_mutex_unlock(&synth_lock); 519 return ret; 520 } 521 522 static int record__pushfn(struct mmap *map, void *to, void *bf, size_t size) 523 { 524 struct record *rec = to; 525 526 if (record__comp_enabled(rec)) { 527 size = zstd_compress(rec->session, map->data, mmap__mmap_len(map), bf, size); 528 bf = map->data; 529 } 530 531 rec->samples++; 532 return record__write(rec, map, bf, size); 533 } 534 535 static volatile int signr = -1; 536 static volatile int child_finished; 537 538 static void sig_handler(int sig) 539 { 540 if (sig == SIGCHLD) 541 child_finished = 1; 542 else 543 signr = sig; 544 545 done = 1; 546 } 547 548 static void sigsegv_handler(int sig) 549 { 550 perf_hooks__recover(); 551 sighandler_dump_stack(sig); 552 } 553 554 static void record__sig_exit(void) 555 { 556 if (signr == -1) 557 return; 558 559 signal(signr, SIG_DFL); 560 raise(signr); 561 } 562 563 #ifdef HAVE_AUXTRACE_SUPPORT 564 565 static int record__process_auxtrace(struct perf_tool *tool, 566 struct mmap *map, 567 union perf_event *event, void *data1, 568 size_t len1, void *data2, size_t len2) 569 { 570 struct record *rec = container_of(tool, struct record, tool); 571 struct perf_data *data = &rec->data; 572 size_t padding; 573 u8 pad[8] = {0}; 574 575 if (!perf_data__is_pipe(data) && perf_data__is_single_file(data)) { 576 off_t file_offset; 577 int fd = perf_data__fd(data); 578 int err; 579 580 file_offset = lseek(fd, 0, SEEK_CUR); 581 if (file_offset == -1) 582 return -1; 583 err = auxtrace_index__auxtrace_event(&rec->session->auxtrace_index, 584 event, file_offset); 585 if (err) 586 return err; 587 } 588 589 /* event.auxtrace.size includes padding, see __auxtrace_mmap__read() */ 590 padding = (len1 + len2) & 7; 591 if (padding) 592 padding = 8 - padding; 593 594 record__write(rec, map, event, event->header.size); 595 record__write(rec, map, data1, len1); 596 if (len2) 597 record__write(rec, map, data2, len2); 598 record__write(rec, map, &pad, padding); 599 600 return 0; 601 } 602 603 static int record__auxtrace_mmap_read(struct record *rec, 604 struct mmap *map) 605 { 606 int ret; 607 608 ret = auxtrace_mmap__read(map, rec->itr, &rec->tool, 609 record__process_auxtrace); 610 if (ret < 0) 611 return ret; 612 613 if (ret) 614 rec->samples++; 615 616 return 0; 617 } 618 619 static int record__auxtrace_mmap_read_snapshot(struct record *rec, 620 struct mmap *map) 621 { 622 int ret; 623 624 ret = auxtrace_mmap__read_snapshot(map, rec->itr, &rec->tool, 625 record__process_auxtrace, 626 rec->opts.auxtrace_snapshot_size); 627 if (ret < 0) 628 return ret; 629 630 if (ret) 631 rec->samples++; 632 633 return 0; 634 } 635 636 static int record__auxtrace_read_snapshot_all(struct record *rec) 637 { 638 int i; 639 int rc = 0; 640 641 for (i = 0; i < rec->evlist->core.nr_mmaps; i++) { 642 struct mmap *map = &rec->evlist->mmap[i]; 643 644 if (!map->auxtrace_mmap.base) 645 continue; 646 647 if (record__auxtrace_mmap_read_snapshot(rec, map) != 0) { 648 rc = -1; 649 goto out; 650 } 651 } 652 out: 653 return rc; 654 } 655 656 static void record__read_auxtrace_snapshot(struct record *rec, bool 
on_exit) 657 { 658 pr_debug("Recording AUX area tracing snapshot\n"); 659 if (record__auxtrace_read_snapshot_all(rec) < 0) { 660 trigger_error(&auxtrace_snapshot_trigger); 661 } else { 662 if (auxtrace_record__snapshot_finish(rec->itr, on_exit)) 663 trigger_error(&auxtrace_snapshot_trigger); 664 else 665 trigger_ready(&auxtrace_snapshot_trigger); 666 } 667 } 668 669 static int record__auxtrace_snapshot_exit(struct record *rec) 670 { 671 if (trigger_is_error(&auxtrace_snapshot_trigger)) 672 return 0; 673 674 if (!auxtrace_record__snapshot_started && 675 auxtrace_record__snapshot_start(rec->itr)) 676 return -1; 677 678 record__read_auxtrace_snapshot(rec, true); 679 if (trigger_is_error(&auxtrace_snapshot_trigger)) 680 return -1; 681 682 return 0; 683 } 684 685 static int record__auxtrace_init(struct record *rec) 686 { 687 int err; 688 689 if (!rec->itr) { 690 rec->itr = auxtrace_record__init(rec->evlist, &err); 691 if (err) 692 return err; 693 } 694 695 err = auxtrace_parse_snapshot_options(rec->itr, &rec->opts, 696 rec->opts.auxtrace_snapshot_opts); 697 if (err) 698 return err; 699 700 err = auxtrace_parse_sample_options(rec->itr, rec->evlist, &rec->opts, 701 rec->opts.auxtrace_sample_opts); 702 if (err) 703 return err; 704 705 return auxtrace_parse_filters(rec->evlist); 706 } 707 708 #else 709 710 static inline 711 int record__auxtrace_mmap_read(struct record *rec __maybe_unused, 712 struct mmap *map __maybe_unused) 713 { 714 return 0; 715 } 716 717 static inline 718 void record__read_auxtrace_snapshot(struct record *rec __maybe_unused, 719 bool on_exit __maybe_unused) 720 { 721 } 722 723 static inline 724 int auxtrace_record__snapshot_start(struct auxtrace_record *itr __maybe_unused) 725 { 726 return 0; 727 } 728 729 static inline 730 int record__auxtrace_snapshot_exit(struct record *rec __maybe_unused) 731 { 732 return 0; 733 } 734 735 static int record__auxtrace_init(struct record *rec __maybe_unused) 736 { 737 return 0; 738 } 739 740 #endif 741 742 static bool record__kcore_readable(struct machine *machine) 743 { 744 char kcore[PATH_MAX]; 745 int fd; 746 747 scnprintf(kcore, sizeof(kcore), "%s/proc/kcore", machine->root_dir); 748 749 fd = open(kcore, O_RDONLY); 750 if (fd < 0) 751 return false; 752 753 close(fd); 754 755 return true; 756 } 757 758 static int record__kcore_copy(struct machine *machine, struct perf_data *data) 759 { 760 char from_dir[PATH_MAX]; 761 char kcore_dir[PATH_MAX]; 762 int ret; 763 764 snprintf(from_dir, sizeof(from_dir), "%s/proc", machine->root_dir); 765 766 ret = perf_data__make_kcore_dir(data, kcore_dir, sizeof(kcore_dir)); 767 if (ret) 768 return ret; 769 770 return kcore_copy(from_dir, kcore_dir); 771 } 772 773 static int record__mmap_evlist(struct record *rec, 774 struct evlist *evlist) 775 { 776 struct record_opts *opts = &rec->opts; 777 bool auxtrace_overwrite = opts->auxtrace_snapshot_mode || 778 opts->auxtrace_sample_mode; 779 char msg[512]; 780 781 if (opts->affinity != PERF_AFFINITY_SYS) 782 cpu__setup_cpunode_map(); 783 784 if (evlist__mmap_ex(evlist, opts->mmap_pages, 785 opts->auxtrace_mmap_pages, 786 auxtrace_overwrite, 787 opts->nr_cblocks, opts->affinity, 788 opts->mmap_flush, opts->comp_level) < 0) { 789 if (errno == EPERM) { 790 pr_err("Permission error mapping pages.\n" 791 "Consider increasing " 792 "/proc/sys/kernel/perf_event_mlock_kb,\n" 793 "or try again with a smaller value of -m/--mmap_pages.\n" 794 "(current value: %u,%u)\n", 795 opts->mmap_pages, opts->auxtrace_mmap_pages); 796 return -errno; 797 } else { 798 pr_err("failed to mmap 
with %d (%s)\n", errno, 799 str_error_r(errno, msg, sizeof(msg))); 800 if (errno) 801 return -errno; 802 else 803 return -EINVAL; 804 } 805 } 806 return 0; 807 } 808 809 static int record__mmap(struct record *rec) 810 { 811 return record__mmap_evlist(rec, rec->evlist); 812 } 813 814 static int record__open(struct record *rec) 815 { 816 char msg[BUFSIZ]; 817 struct evsel *pos; 818 struct evlist *evlist = rec->evlist; 819 struct perf_session *session = rec->session; 820 struct record_opts *opts = &rec->opts; 821 int rc = 0; 822 823 /* 824 * For initial_delay we need to add a dummy event so that we can track 825 * PERF_RECORD_MMAP while we wait for the initial delay to enable the 826 * real events, the ones asked by the user. 827 */ 828 if (opts->initial_delay) { 829 if (perf_evlist__add_dummy(evlist)) 830 return -ENOMEM; 831 832 pos = evlist__first(evlist); 833 pos->tracking = 0; 834 pos = evlist__last(evlist); 835 pos->tracking = 1; 836 pos->core.attr.enable_on_exec = 1; 837 } 838 839 perf_evlist__config(evlist, opts, &callchain_param); 840 841 evlist__for_each_entry(evlist, pos) { 842 try_again: 843 if (evsel__open(pos, pos->core.cpus, pos->core.threads) < 0) { 844 if (perf_evsel__fallback(pos, errno, msg, sizeof(msg))) { 845 if (verbose > 0) 846 ui__warning("%s\n", msg); 847 goto try_again; 848 } 849 if ((errno == EINVAL || errno == EBADF) && 850 pos->leader != pos && 851 pos->weak_group) { 852 pos = perf_evlist__reset_weak_group(evlist, pos, true); 853 goto try_again; 854 } 855 rc = -errno; 856 perf_evsel__open_strerror(pos, &opts->target, 857 errno, msg, sizeof(msg)); 858 ui__error("%s\n", msg); 859 goto out; 860 } 861 862 pos->supported = true; 863 } 864 865 if (symbol_conf.kptr_restrict && !perf_evlist__exclude_kernel(evlist)) { 866 pr_warning( 867 "WARNING: Kernel address maps (/proc/{kallsyms,modules}) are restricted,\n" 868 "check /proc/sys/kernel/kptr_restrict and /proc/sys/kernel/perf_event_paranoid.\n\n" 869 "Samples in kernel functions may not be resolved if a suitable vmlinux\n" 870 "file is not found in the buildid cache or in the vmlinux path.\n\n" 871 "Samples in kernel modules won't be resolved at all.\n\n" 872 "If some relocation was applied (e.g. 
kexec) symbols may be misresolved\n"
"even with a suitable vmlinux or kallsyms file.\n\n");
	}

	if (perf_evlist__apply_filters(evlist, &pos)) {
		pr_err("failed to set filter \"%s\" on event %s with %d (%s)\n",
			pos->filter, perf_evsel__name(pos), errno,
			str_error_r(errno, msg, sizeof(msg)));
		rc = -1;
		goto out;
	}

	rc = record__mmap(rec);
	if (rc)
		goto out;

	session->evlist = evlist;
	perf_session__set_id_hdr_size(session);
out:
	return rc;
}

static int process_sample_event(struct perf_tool *tool,
				union perf_event *event,
				struct perf_sample *sample,
				struct evsel *evsel,
				struct machine *machine)
{
	struct record *rec = container_of(tool, struct record, tool);

	if (rec->evlist->first_sample_time == 0)
		rec->evlist->first_sample_time = sample->time;

	rec->evlist->last_sample_time = sample->time;

	if (rec->buildid_all)
		return 0;

	rec->samples++;
	return build_id__mark_dso_hit(tool, event, sample, evsel, machine);
}

static int process_buildids(struct record *rec)
{
	struct perf_session *session = rec->session;

	if (perf_data__size(&rec->data) == 0)
		return 0;

	/*
	 * During this process, it'll load the kernel map and replace the
	 * dso->long_name with a real pathname it found.  In this case
	 * we prefer the vmlinux path like
	 *   /lib/modules/3.16.4/build/vmlinux
	 * rather than the build-id path (in the debug directory):
	 *   $HOME/.debug/.build-id/f0/6e17aa50adf4d00b88925e03775de107611551
	 */
	symbol_conf.ignore_vmlinux_buildid = true;

	/*
	 * If --buildid-all is given, it marks all DSOs regardless of hits,
	 * so there is no need to process samples.  But if timestamp_boundary
	 * is enabled, it still needs to walk all samples to get the
	 * timestamps of the first/last samples.
	 */
	if (rec->buildid_all && !rec->timestamp_boundary)
		rec->tool.sample = NULL;

	return perf_session__process_events(session);
}

static void perf_event__synthesize_guest_os(struct machine *machine, void *data)
{
	int err;
	struct perf_tool *tool = data;
	/*
	 * As for the guest kernel when processing the record & report
	 * subcommands, we arrange the module mmaps prior to the guest kernel
	 * mmap and trigger a DSO preload, because by default guest module
	 * symbols are loaded from guest kallsyms instead of
	 * /lib/modules/XXX/XXX.  This avoids missing symbols when the first
	 * address is in a module instead of in the guest kernel.
	 */
	err = perf_event__synthesize_modules(tool, process_synthesized_event,
					     machine);
	if (err < 0)
		pr_err("Couldn't record guest kernel [%d]'s reference"
		       " relocation symbol.\n", machine->pid);

	/*
	 * We use _stext for the guest kernel because the guest kernel's
	 * /proc/kallsyms sometimes has no _text.
	 */
	err = perf_event__synthesize_kernel_mmap(tool, process_synthesized_event,
						 machine);
	if (err < 0)
		pr_err("Couldn't record guest kernel [%d]'s reference"
		       " relocation symbol.\n", machine->pid);
}

static struct perf_event_header finished_round_event = {
	.size = sizeof(struct perf_event_header),
	.type = PERF_RECORD_FINISHED_ROUND,
};
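/*
 * A PERF_RECORD_FINISHED_ROUND marker is written after each pass over the
 * ring buffers that produced data (see record__mmap_read_evlist() below).
 * On the processing side it bounds event reordering: everything queued
 * before the marker can be sorted by timestamp and flushed.
 */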
static void record__adjust_affinity(struct record *rec, struct mmap *map)
{
	if (rec->opts.affinity != PERF_AFFINITY_SYS &&
	    !bitmap_equal(rec->affinity_mask.bits, map->affinity_mask.bits,
			  rec->affinity_mask.nbits)) {
		bitmap_zero(rec->affinity_mask.bits, rec->affinity_mask.nbits);
		bitmap_or(rec->affinity_mask.bits, rec->affinity_mask.bits,
			  map->affinity_mask.bits, rec->affinity_mask.nbits);
		sched_setaffinity(0, MMAP_CPU_MASK_BYTES(&rec->affinity_mask),
				  (cpu_set_t *)rec->affinity_mask.bits);
		if (verbose == 2)
			mmap_cpu_mask__scnprintf(&rec->affinity_mask, "thread");
	}
}

static size_t process_comp_header(void *record, size_t increment)
{
	struct perf_record_compressed *event = record;
	size_t size = sizeof(*event);

	if (increment) {
		event->header.size += increment;
		return increment;
	}

	event->header.type = PERF_RECORD_COMPRESSED;
	event->header.size = size;

	return size;
}

static size_t zstd_compress(struct perf_session *session, void *dst, size_t dst_size,
			    void *src, size_t src_size)
{
	size_t compressed;
	size_t max_record_size = PERF_SAMPLE_MAX_SIZE - sizeof(struct perf_record_compressed) - 1;

	compressed = zstd_compress_stream_to_records(&session->zstd_data, dst, dst_size, src, src_size,
						     max_record_size, process_comp_header);

	session->bytes_transferred += src_size;
	session->bytes_compressed += compressed;

	return compressed;
}

static int record__mmap_read_evlist(struct record *rec, struct evlist *evlist,
				    bool overwrite, bool synch)
{
	u64 bytes_written = rec->bytes_written;
	int i;
	int rc = 0;
	struct mmap *maps;
	int trace_fd = rec->data.file.fd;
	off_t off = 0;

	if (!evlist)
		return 0;

	maps = overwrite ?
evlist->overwrite_mmap : evlist->mmap; 1038 if (!maps) 1039 return 0; 1040 1041 if (overwrite && evlist->bkw_mmap_state != BKW_MMAP_DATA_PENDING) 1042 return 0; 1043 1044 if (record__aio_enabled(rec)) 1045 off = record__aio_get_pos(trace_fd); 1046 1047 for (i = 0; i < evlist->core.nr_mmaps; i++) { 1048 u64 flush = 0; 1049 struct mmap *map = &maps[i]; 1050 1051 if (map->core.base) { 1052 record__adjust_affinity(rec, map); 1053 if (synch) { 1054 flush = map->core.flush; 1055 map->core.flush = 1; 1056 } 1057 if (!record__aio_enabled(rec)) { 1058 if (perf_mmap__push(map, rec, record__pushfn) < 0) { 1059 if (synch) 1060 map->core.flush = flush; 1061 rc = -1; 1062 goto out; 1063 } 1064 } else { 1065 if (record__aio_push(rec, map, &off) < 0) { 1066 record__aio_set_pos(trace_fd, off); 1067 if (synch) 1068 map->core.flush = flush; 1069 rc = -1; 1070 goto out; 1071 } 1072 } 1073 if (synch) 1074 map->core.flush = flush; 1075 } 1076 1077 if (map->auxtrace_mmap.base && !rec->opts.auxtrace_snapshot_mode && 1078 !rec->opts.auxtrace_sample_mode && 1079 record__auxtrace_mmap_read(rec, map) != 0) { 1080 rc = -1; 1081 goto out; 1082 } 1083 } 1084 1085 if (record__aio_enabled(rec)) 1086 record__aio_set_pos(trace_fd, off); 1087 1088 /* 1089 * Mark the round finished in case we wrote 1090 * at least one event. 1091 */ 1092 if (bytes_written != rec->bytes_written) 1093 rc = record__write(rec, NULL, &finished_round_event, sizeof(finished_round_event)); 1094 1095 if (overwrite) 1096 perf_evlist__toggle_bkw_mmap(evlist, BKW_MMAP_EMPTY); 1097 out: 1098 return rc; 1099 } 1100 1101 static int record__mmap_read_all(struct record *rec, bool synch) 1102 { 1103 int err; 1104 1105 err = record__mmap_read_evlist(rec, rec->evlist, false, synch); 1106 if (err) 1107 return err; 1108 1109 return record__mmap_read_evlist(rec, rec->evlist, true, synch); 1110 } 1111 1112 static void record__init_features(struct record *rec) 1113 { 1114 struct perf_session *session = rec->session; 1115 int feat; 1116 1117 for (feat = HEADER_FIRST_FEATURE; feat < HEADER_LAST_FEATURE; feat++) 1118 perf_header__set_feat(&session->header, feat); 1119 1120 if (rec->no_buildid) 1121 perf_header__clear_feat(&session->header, HEADER_BUILD_ID); 1122 1123 if (!have_tracepoints(&rec->evlist->core.entries)) 1124 perf_header__clear_feat(&session->header, HEADER_TRACING_DATA); 1125 1126 if (!rec->opts.branch_stack) 1127 perf_header__clear_feat(&session->header, HEADER_BRANCH_STACK); 1128 1129 if (!rec->opts.full_auxtrace) 1130 perf_header__clear_feat(&session->header, HEADER_AUXTRACE); 1131 1132 if (!(rec->opts.use_clockid && rec->opts.clockid_res_ns)) 1133 perf_header__clear_feat(&session->header, HEADER_CLOCKID); 1134 1135 perf_header__clear_feat(&session->header, HEADER_DIR_FORMAT); 1136 if (!record__comp_enabled(rec)) 1137 perf_header__clear_feat(&session->header, HEADER_COMPRESSED); 1138 1139 perf_header__clear_feat(&session->header, HEADER_STAT); 1140 } 1141 1142 static void 1143 record__finish_output(struct record *rec) 1144 { 1145 struct perf_data *data = &rec->data; 1146 int fd = perf_data__fd(data); 1147 1148 if (data->is_pipe) 1149 return; 1150 1151 rec->session->header.data_size += rec->bytes_written; 1152 data->file.size = lseek(perf_data__fd(data), 0, SEEK_CUR); 1153 1154 if (!rec->no_buildid) { 1155 process_buildids(rec); 1156 1157 if (rec->buildid_all) 1158 dsos__hit_all(rec->session); 1159 } 1160 perf_session__write_header(rec->session, rec->evlist, fd, true); 1161 1162 return; 1163 } 1164 1165 static int record__synthesize_workload(struct 
record *rec, bool tail)
{
	int err;
	struct perf_thread_map *thread_map;

	if (rec->opts.tail_synthesize != tail)
		return 0;

	thread_map = thread_map__new_by_tid(rec->evlist->workload.pid);
	if (thread_map == NULL)
		return -1;

	err = perf_event__synthesize_thread_map(&rec->tool, thread_map,
						process_synthesized_event,
						&rec->session->machines.host,
						rec->opts.sample_address);
	perf_thread_map__put(thread_map);
	return err;
}

static int record__synthesize(struct record *rec, bool tail);

static int
record__switch_output(struct record *rec, bool at_exit)
{
	struct perf_data *data = &rec->data;
	int fd, err;
	char *new_filename;

	/* Same size: "2015122520103046" */
	char timestamp[] = "InvalidTimestamp";

	record__aio_mmap_read_sync(rec);

	record__synthesize(rec, true);
	if (target__none(&rec->opts.target))
		record__synthesize_workload(rec, true);

	rec->samples = 0;
	record__finish_output(rec);
	err = fetch_current_timestamp(timestamp, sizeof(timestamp));
	if (err) {
		pr_err("Failed to get current timestamp\n");
		return -EINVAL;
	}

	fd = perf_data__switch(data, timestamp,
			       rec->session->header.data_offset,
			       at_exit, &new_filename);
	if (fd >= 0 && !at_exit) {
		rec->bytes_written = 0;
		rec->session->header.data_size = 0;
	}

	if (!quiet)
		fprintf(stderr, "[ perf record: Dump %s.%s ]\n",
			data->path, timestamp);

	if (rec->switch_output.num_files) {
		int n = rec->switch_output.cur_file + 1;

		if (n >= rec->switch_output.num_files)
			n = 0;
		rec->switch_output.cur_file = n;
		if (rec->switch_output.filenames[n]) {
			remove(rec->switch_output.filenames[n]);
			zfree(&rec->switch_output.filenames[n]);
		}
		rec->switch_output.filenames[n] = new_filename;
	} else {
		free(new_filename);
	}

	/* Output tracking events */
	if (!at_exit) {
		record__synthesize(rec, false);

		/*
		 * In 'perf record --switch-output' without -a,
		 * record__synthesize() in record__switch_output() won't
		 * generate tracking events because there's no thread_map
		 * in the evlist, which means the newly created perf.data
		 * doesn't contain map and comm information.
		 * Create a fake thread_map and directly call
		 * perf_event__synthesize_thread_map() for those events.
		 */
		if (target__none(&rec->opts.target))
			record__synthesize_workload(rec, false);
	}
	return fd;
}

static volatile int workload_exec_errno;

/*
 * perf_evlist__prepare_workload will send a SIGUSR1
 * if the fork fails, since we asked for it by setting its
 * want_signal to true.
1263 */ 1264 static void workload_exec_failed_signal(int signo __maybe_unused, 1265 siginfo_t *info, 1266 void *ucontext __maybe_unused) 1267 { 1268 workload_exec_errno = info->si_value.sival_int; 1269 done = 1; 1270 child_finished = 1; 1271 } 1272 1273 static void snapshot_sig_handler(int sig); 1274 static void alarm_sig_handler(int sig); 1275 1276 static const struct perf_event_mmap_page * 1277 perf_evlist__pick_pc(struct evlist *evlist) 1278 { 1279 if (evlist) { 1280 if (evlist->mmap && evlist->mmap[0].core.base) 1281 return evlist->mmap[0].core.base; 1282 if (evlist->overwrite_mmap && evlist->overwrite_mmap[0].core.base) 1283 return evlist->overwrite_mmap[0].core.base; 1284 } 1285 return NULL; 1286 } 1287 1288 static const struct perf_event_mmap_page *record__pick_pc(struct record *rec) 1289 { 1290 const struct perf_event_mmap_page *pc; 1291 1292 pc = perf_evlist__pick_pc(rec->evlist); 1293 if (pc) 1294 return pc; 1295 return NULL; 1296 } 1297 1298 static int record__synthesize(struct record *rec, bool tail) 1299 { 1300 struct perf_session *session = rec->session; 1301 struct machine *machine = &session->machines.host; 1302 struct perf_data *data = &rec->data; 1303 struct record_opts *opts = &rec->opts; 1304 struct perf_tool *tool = &rec->tool; 1305 int fd = perf_data__fd(data); 1306 int err = 0; 1307 event_op f = process_synthesized_event; 1308 1309 if (rec->opts.tail_synthesize != tail) 1310 return 0; 1311 1312 if (data->is_pipe) { 1313 /* 1314 * We need to synthesize events first, because some 1315 * features works on top of them (on report side). 1316 */ 1317 err = perf_event__synthesize_attrs(tool, rec->evlist, 1318 process_synthesized_event); 1319 if (err < 0) { 1320 pr_err("Couldn't synthesize attrs.\n"); 1321 goto out; 1322 } 1323 1324 err = perf_event__synthesize_features(tool, session, rec->evlist, 1325 process_synthesized_event); 1326 if (err < 0) { 1327 pr_err("Couldn't synthesize features.\n"); 1328 return err; 1329 } 1330 1331 if (have_tracepoints(&rec->evlist->core.entries)) { 1332 /* 1333 * FIXME err <= 0 here actually means that 1334 * there were no tracepoints so its not really 1335 * an error, just that we don't need to 1336 * synthesize anything. We really have to 1337 * return this more properly and also 1338 * propagate errors that now are calling die() 1339 */ 1340 err = perf_event__synthesize_tracing_data(tool, fd, rec->evlist, 1341 process_synthesized_event); 1342 if (err <= 0) { 1343 pr_err("Couldn't record tracing data.\n"); 1344 goto out; 1345 } 1346 rec->bytes_written += err; 1347 } 1348 } 1349 1350 err = perf_event__synth_time_conv(record__pick_pc(rec), tool, 1351 process_synthesized_event, machine); 1352 if (err) 1353 goto out; 1354 1355 /* Synthesize id_index before auxtrace_info */ 1356 if (rec->opts.auxtrace_sample_mode) { 1357 err = perf_event__synthesize_id_index(tool, 1358 process_synthesized_event, 1359 session->evlist, machine); 1360 if (err) 1361 goto out; 1362 } 1363 1364 if (rec->opts.full_auxtrace) { 1365 err = perf_event__synthesize_auxtrace_info(rec->itr, tool, 1366 session, process_synthesized_event); 1367 if (err) 1368 goto out; 1369 } 1370 1371 if (!perf_evlist__exclude_kernel(rec->evlist)) { 1372 err = perf_event__synthesize_kernel_mmap(tool, process_synthesized_event, 1373 machine); 1374 WARN_ONCE(err < 0, "Couldn't record kernel reference relocation symbol\n" 1375 "Symbol resolution may be skewed if relocation was used (e.g. 
kexec).\n" 1376 "Check /proc/kallsyms permission or run as root.\n"); 1377 1378 err = perf_event__synthesize_modules(tool, process_synthesized_event, 1379 machine); 1380 WARN_ONCE(err < 0, "Couldn't record kernel module information.\n" 1381 "Symbol resolution may be skewed if relocation was used (e.g. kexec).\n" 1382 "Check /proc/modules permission or run as root.\n"); 1383 } 1384 1385 if (perf_guest) { 1386 machines__process_guests(&session->machines, 1387 perf_event__synthesize_guest_os, tool); 1388 } 1389 1390 err = perf_event__synthesize_extra_attr(&rec->tool, 1391 rec->evlist, 1392 process_synthesized_event, 1393 data->is_pipe); 1394 if (err) 1395 goto out; 1396 1397 err = perf_event__synthesize_thread_map2(&rec->tool, rec->evlist->core.threads, 1398 process_synthesized_event, 1399 NULL); 1400 if (err < 0) { 1401 pr_err("Couldn't synthesize thread map.\n"); 1402 return err; 1403 } 1404 1405 err = perf_event__synthesize_cpu_map(&rec->tool, rec->evlist->core.cpus, 1406 process_synthesized_event, NULL); 1407 if (err < 0) { 1408 pr_err("Couldn't synthesize cpu map.\n"); 1409 return err; 1410 } 1411 1412 err = perf_event__synthesize_bpf_events(session, process_synthesized_event, 1413 machine, opts); 1414 if (err < 0) 1415 pr_warning("Couldn't synthesize bpf events.\n"); 1416 1417 err = perf_event__synthesize_cgroups(tool, process_synthesized_event, 1418 machine); 1419 if (err < 0) 1420 pr_warning("Couldn't synthesize cgroup events.\n"); 1421 1422 if (rec->opts.nr_threads_synthesize > 1) { 1423 perf_set_multithreaded(); 1424 f = process_locked_synthesized_event; 1425 } 1426 1427 err = __machine__synthesize_threads(machine, tool, &opts->target, rec->evlist->core.threads, 1428 f, opts->sample_address, 1429 rec->opts.nr_threads_synthesize); 1430 1431 if (rec->opts.nr_threads_synthesize > 1) 1432 perf_set_singlethreaded(); 1433 1434 out: 1435 return err; 1436 } 1437 1438 static int __cmd_record(struct record *rec, int argc, const char **argv) 1439 { 1440 int err; 1441 int status = 0; 1442 unsigned long waking = 0; 1443 const bool forks = argc > 0; 1444 struct perf_tool *tool = &rec->tool; 1445 struct record_opts *opts = &rec->opts; 1446 struct perf_data *data = &rec->data; 1447 struct perf_session *session; 1448 bool disabled = false, draining = false; 1449 struct evlist *sb_evlist = NULL; 1450 int fd; 1451 float ratio = 0; 1452 1453 atexit(record__sig_exit); 1454 signal(SIGCHLD, sig_handler); 1455 signal(SIGINT, sig_handler); 1456 signal(SIGTERM, sig_handler); 1457 signal(SIGSEGV, sigsegv_handler); 1458 1459 if (rec->opts.record_namespaces) 1460 tool->namespace_events = true; 1461 1462 if (rec->opts.record_cgroup) { 1463 #ifdef HAVE_FILE_HANDLE 1464 tool->cgroup_events = true; 1465 #else 1466 pr_err("cgroup tracking is not supported\n"); 1467 return -1; 1468 #endif 1469 } 1470 1471 if (rec->opts.auxtrace_snapshot_mode || rec->switch_output.enabled) { 1472 signal(SIGUSR2, snapshot_sig_handler); 1473 if (rec->opts.auxtrace_snapshot_mode) 1474 trigger_on(&auxtrace_snapshot_trigger); 1475 if (rec->switch_output.enabled) 1476 trigger_on(&switch_output_trigger); 1477 } else { 1478 signal(SIGUSR2, SIG_IGN); 1479 } 1480 1481 session = perf_session__new(data, false, tool); 1482 if (IS_ERR(session)) { 1483 pr_err("Perf session creation failed.\n"); 1484 return PTR_ERR(session); 1485 } 1486 1487 fd = perf_data__fd(data); 1488 rec->session = session; 1489 1490 if (zstd_init(&session->zstd_data, rec->opts.comp_level) < 0) { 1491 pr_err("Compression initialization failed.\n"); 1492 return -1; 1493 } 1494 
1495 session->header.env.comp_type = PERF_COMP_ZSTD; 1496 session->header.env.comp_level = rec->opts.comp_level; 1497 1498 if (rec->opts.kcore && 1499 !record__kcore_readable(&session->machines.host)) { 1500 pr_err("ERROR: kcore is not readable.\n"); 1501 return -1; 1502 } 1503 1504 record__init_features(rec); 1505 1506 if (rec->opts.use_clockid && rec->opts.clockid_res_ns) 1507 session->header.env.clockid_res_ns = rec->opts.clockid_res_ns; 1508 1509 if (forks) { 1510 err = perf_evlist__prepare_workload(rec->evlist, &opts->target, 1511 argv, data->is_pipe, 1512 workload_exec_failed_signal); 1513 if (err < 0) { 1514 pr_err("Couldn't run the workload!\n"); 1515 status = err; 1516 goto out_delete_session; 1517 } 1518 } 1519 1520 /* 1521 * If we have just single event and are sending data 1522 * through pipe, we need to force the ids allocation, 1523 * because we synthesize event name through the pipe 1524 * and need the id for that. 1525 */ 1526 if (data->is_pipe && rec->evlist->core.nr_entries == 1) 1527 rec->opts.sample_id = true; 1528 1529 if (record__open(rec) != 0) { 1530 err = -1; 1531 goto out_child; 1532 } 1533 session->header.env.comp_mmap_len = session->evlist->core.mmap_len; 1534 1535 if (rec->opts.kcore) { 1536 err = record__kcore_copy(&session->machines.host, data); 1537 if (err) { 1538 pr_err("ERROR: Failed to copy kcore\n"); 1539 goto out_child; 1540 } 1541 } 1542 1543 err = bpf__apply_obj_config(); 1544 if (err) { 1545 char errbuf[BUFSIZ]; 1546 1547 bpf__strerror_apply_obj_config(err, errbuf, sizeof(errbuf)); 1548 pr_err("ERROR: Apply config to BPF failed: %s\n", 1549 errbuf); 1550 goto out_child; 1551 } 1552 1553 /* 1554 * Normally perf_session__new would do this, but it doesn't have the 1555 * evlist. 1556 */ 1557 if (rec->tool.ordered_events && !perf_evlist__sample_id_all(rec->evlist)) { 1558 pr_warning("WARNING: No sample_id_all support, falling back to unordered processing\n"); 1559 rec->tool.ordered_events = false; 1560 } 1561 1562 if (!rec->evlist->nr_groups) 1563 perf_header__clear_feat(&session->header, HEADER_GROUP_DESC); 1564 1565 if (data->is_pipe) { 1566 err = perf_header__write_pipe(fd); 1567 if (err < 0) 1568 goto out_child; 1569 } else { 1570 err = perf_session__write_header(session, rec->evlist, fd, false); 1571 if (err < 0) 1572 goto out_child; 1573 } 1574 1575 if (!rec->no_buildid 1576 && !perf_header__has_feat(&session->header, HEADER_BUILD_ID)) { 1577 pr_err("Couldn't generate buildids. " 1578 "Use --no-buildid to profile anyway.\n"); 1579 err = -1; 1580 goto out_child; 1581 } 1582 1583 if (!opts->no_bpf_event) 1584 bpf_event__add_sb_event(&sb_evlist, &session->header.env); 1585 1586 if (perf_evlist__start_sb_thread(sb_evlist, &rec->opts.target)) { 1587 pr_debug("Couldn't start the BPF side band thread:\nBPF programs starting from now on won't be annotatable\n"); 1588 opts->no_bpf_event = true; 1589 } 1590 1591 err = record__synthesize(rec, false); 1592 if (err < 0) 1593 goto out_child; 1594 1595 if (rec->realtime_prio) { 1596 struct sched_param param; 1597 1598 param.sched_priority = rec->realtime_prio; 1599 if (sched_setscheduler(0, SCHED_FIFO, ¶m)) { 1600 pr_err("Could not set realtime priority.\n"); 1601 err = -1; 1602 goto out_child; 1603 } 1604 } 1605 1606 /* 1607 * When perf is starting the traced process, all the events 1608 * (apart from group members) have enable_on_exec=1 set, 1609 * so don't spoil it by prematurely enabling them. 
1610 */ 1611 if (!target__none(&opts->target) && !opts->initial_delay) 1612 evlist__enable(rec->evlist); 1613 1614 /* 1615 * Let the child rip 1616 */ 1617 if (forks) { 1618 struct machine *machine = &session->machines.host; 1619 union perf_event *event; 1620 pid_t tgid; 1621 1622 event = malloc(sizeof(event->comm) + machine->id_hdr_size); 1623 if (event == NULL) { 1624 err = -ENOMEM; 1625 goto out_child; 1626 } 1627 1628 /* 1629 * Some H/W events are generated before COMM event 1630 * which is emitted during exec(), so perf script 1631 * cannot see a correct process name for those events. 1632 * Synthesize COMM event to prevent it. 1633 */ 1634 tgid = perf_event__synthesize_comm(tool, event, 1635 rec->evlist->workload.pid, 1636 process_synthesized_event, 1637 machine); 1638 free(event); 1639 1640 if (tgid == -1) 1641 goto out_child; 1642 1643 event = malloc(sizeof(event->namespaces) + 1644 (NR_NAMESPACES * sizeof(struct perf_ns_link_info)) + 1645 machine->id_hdr_size); 1646 if (event == NULL) { 1647 err = -ENOMEM; 1648 goto out_child; 1649 } 1650 1651 /* 1652 * Synthesize NAMESPACES event for the command specified. 1653 */ 1654 perf_event__synthesize_namespaces(tool, event, 1655 rec->evlist->workload.pid, 1656 tgid, process_synthesized_event, 1657 machine); 1658 free(event); 1659 1660 perf_evlist__start_workload(rec->evlist); 1661 } 1662 1663 if (opts->initial_delay) { 1664 usleep(opts->initial_delay * USEC_PER_MSEC); 1665 evlist__enable(rec->evlist); 1666 } 1667 1668 trigger_ready(&auxtrace_snapshot_trigger); 1669 trigger_ready(&switch_output_trigger); 1670 perf_hooks__invoke_record_start(); 1671 for (;;) { 1672 unsigned long long hits = rec->samples; 1673 1674 /* 1675 * rec->evlist->bkw_mmap_state is possible to be 1676 * BKW_MMAP_EMPTY here: when done == true and 1677 * hits != rec->samples in previous round. 1678 * 1679 * perf_evlist__toggle_bkw_mmap ensure we never 1680 * convert BKW_MMAP_EMPTY to BKW_MMAP_DATA_PENDING. 1681 */ 1682 if (trigger_is_hit(&switch_output_trigger) || done || draining) 1683 perf_evlist__toggle_bkw_mmap(rec->evlist, BKW_MMAP_DATA_PENDING); 1684 1685 if (record__mmap_read_all(rec, false) < 0) { 1686 trigger_error(&auxtrace_snapshot_trigger); 1687 trigger_error(&switch_output_trigger); 1688 err = -1; 1689 goto out_child; 1690 } 1691 1692 if (auxtrace_record__snapshot_started) { 1693 auxtrace_record__snapshot_started = 0; 1694 if (!trigger_is_error(&auxtrace_snapshot_trigger)) 1695 record__read_auxtrace_snapshot(rec, false); 1696 if (trigger_is_error(&auxtrace_snapshot_trigger)) { 1697 pr_err("AUX area tracing snapshot failed\n"); 1698 err = -1; 1699 goto out_child; 1700 } 1701 } 1702 1703 if (trigger_is_hit(&switch_output_trigger)) { 1704 /* 1705 * If switch_output_trigger is hit, the data in 1706 * overwritable ring buffer should have been collected, 1707 * so bkw_mmap_state should be set to BKW_MMAP_EMPTY. 1708 * 1709 * If SIGUSR2 raise after or during record__mmap_read_all(), 1710 * record__mmap_read_all() didn't collect data from 1711 * overwritable ring buffer. Read again. 1712 */ 1713 if (rec->evlist->bkw_mmap_state == BKW_MMAP_RUNNING) 1714 continue; 1715 trigger_ready(&switch_output_trigger); 1716 1717 /* 1718 * Reenable events in overwrite ring buffer after 1719 * record__mmap_read_all(): we should have collected 1720 * data from it. 
1721 */ 1722 perf_evlist__toggle_bkw_mmap(rec->evlist, BKW_MMAP_RUNNING); 1723 1724 if (!quiet) 1725 fprintf(stderr, "[ perf record: dump data: Woken up %ld times ]\n", 1726 waking); 1727 waking = 0; 1728 fd = record__switch_output(rec, false); 1729 if (fd < 0) { 1730 pr_err("Failed to switch to new file\n"); 1731 trigger_error(&switch_output_trigger); 1732 err = fd; 1733 goto out_child; 1734 } 1735 1736 /* re-arm the alarm */ 1737 if (rec->switch_output.time) 1738 alarm(rec->switch_output.time); 1739 } 1740 1741 if (hits == rec->samples) { 1742 if (done || draining) 1743 break; 1744 err = evlist__poll(rec->evlist, -1); 1745 /* 1746 * Propagate error, only if there's any. Ignore positive 1747 * number of returned events and interrupt error. 1748 */ 1749 if (err > 0 || (err < 0 && errno == EINTR)) 1750 err = 0; 1751 waking++; 1752 1753 if (evlist__filter_pollfd(rec->evlist, POLLERR | POLLHUP) == 0) 1754 draining = true; 1755 } 1756 1757 /* 1758 * When perf is starting the traced process, at the end events 1759 * die with the process and we wait for that. Thus no need to 1760 * disable events in this case. 1761 */ 1762 if (done && !disabled && !target__none(&opts->target)) { 1763 trigger_off(&auxtrace_snapshot_trigger); 1764 evlist__disable(rec->evlist); 1765 disabled = true; 1766 } 1767 } 1768 1769 trigger_off(&auxtrace_snapshot_trigger); 1770 trigger_off(&switch_output_trigger); 1771 1772 if (opts->auxtrace_snapshot_on_exit) 1773 record__auxtrace_snapshot_exit(rec); 1774 1775 if (forks && workload_exec_errno) { 1776 char msg[STRERR_BUFSIZE]; 1777 const char *emsg = str_error_r(workload_exec_errno, msg, sizeof(msg)); 1778 pr_err("Workload failed: %s\n", emsg); 1779 err = -1; 1780 goto out_child; 1781 } 1782 1783 if (!quiet) 1784 fprintf(stderr, "[ perf record: Woken up %ld times to write data ]\n", waking); 1785 1786 if (target__none(&rec->opts.target)) 1787 record__synthesize_workload(rec, true); 1788 1789 out_child: 1790 record__mmap_read_all(rec, true); 1791 record__aio_mmap_read_sync(rec); 1792 1793 if (rec->session->bytes_transferred && rec->session->bytes_compressed) { 1794 ratio = (float)rec->session->bytes_transferred/(float)rec->session->bytes_compressed; 1795 session->header.env.comp_ratio = ratio + 0.5; 1796 } 1797 1798 if (forks) { 1799 int exit_status; 1800 1801 if (!child_finished) 1802 kill(rec->evlist->workload.pid, SIGTERM); 1803 1804 wait(&exit_status); 1805 1806 if (err < 0) 1807 status = err; 1808 else if (WIFEXITED(exit_status)) 1809 status = WEXITSTATUS(exit_status); 1810 else if (WIFSIGNALED(exit_status)) 1811 signr = WTERMSIG(exit_status); 1812 } else 1813 status = err; 1814 1815 record__synthesize(rec, true); 1816 /* this will be recalculated during process_buildids() */ 1817 rec->samples = 0; 1818 1819 if (!err) { 1820 if (!rec->timestamp_filename) { 1821 record__finish_output(rec); 1822 } else { 1823 fd = record__switch_output(rec, true); 1824 if (fd < 0) { 1825 status = fd; 1826 goto out_delete_session; 1827 } 1828 } 1829 } 1830 1831 perf_hooks__invoke_record_end(); 1832 1833 if (!err && !quiet) { 1834 char samples[128]; 1835 const char *postfix = rec->timestamp_filename ? 
1836 ".<timestamp>" : ""; 1837 1838 if (rec->samples && !rec->opts.full_auxtrace) 1839 scnprintf(samples, sizeof(samples), 1840 " (%" PRIu64 " samples)", rec->samples); 1841 else 1842 samples[0] = '\0'; 1843 1844 fprintf(stderr, "[ perf record: Captured and wrote %.3f MB %s%s%s", 1845 perf_data__size(data) / 1024.0 / 1024.0, 1846 data->path, postfix, samples); 1847 if (ratio) { 1848 fprintf(stderr, ", compressed (original %.3f MB, ratio is %.3f)", 1849 rec->session->bytes_transferred / 1024.0 / 1024.0, 1850 ratio); 1851 } 1852 fprintf(stderr, " ]\n"); 1853 } 1854 1855 out_delete_session: 1856 zstd_fini(&session->zstd_data); 1857 perf_session__delete(session); 1858 1859 if (!opts->no_bpf_event) 1860 perf_evlist__stop_sb_thread(sb_evlist); 1861 return status; 1862 } 1863 1864 static void callchain_debug(struct callchain_param *callchain) 1865 { 1866 static const char *str[CALLCHAIN_MAX] = { "NONE", "FP", "DWARF", "LBR" }; 1867 1868 pr_debug("callchain: type %s\n", str[callchain->record_mode]); 1869 1870 if (callchain->record_mode == CALLCHAIN_DWARF) 1871 pr_debug("callchain: stack dump size %d\n", 1872 callchain->dump_size); 1873 } 1874 1875 int record_opts__parse_callchain(struct record_opts *record, 1876 struct callchain_param *callchain, 1877 const char *arg, bool unset) 1878 { 1879 int ret; 1880 callchain->enabled = !unset; 1881 1882 /* --no-call-graph */ 1883 if (unset) { 1884 callchain->record_mode = CALLCHAIN_NONE; 1885 pr_debug("callchain: disabled\n"); 1886 return 0; 1887 } 1888 1889 ret = parse_callchain_record_opt(arg, callchain); 1890 if (!ret) { 1891 /* Enable data address sampling for DWARF unwind. */ 1892 if (callchain->record_mode == CALLCHAIN_DWARF) 1893 record->sample_address = true; 1894 callchain_debug(callchain); 1895 } 1896 1897 return ret; 1898 } 1899 1900 int record_parse_callchain_opt(const struct option *opt, 1901 const char *arg, 1902 int unset) 1903 { 1904 return record_opts__parse_callchain(opt->value, &callchain_param, arg, unset); 1905 } 1906 1907 int record_callchain_opt(const struct option *opt, 1908 const char *arg __maybe_unused, 1909 int unset __maybe_unused) 1910 { 1911 struct callchain_param *callchain = opt->value; 1912 1913 callchain->enabled = true; 1914 1915 if (callchain->record_mode == CALLCHAIN_NONE) 1916 callchain->record_mode = CALLCHAIN_FP; 1917 1918 callchain_debug(callchain); 1919 return 0; 1920 } 1921 1922 static int perf_record_config(const char *var, const char *value, void *cb) 1923 { 1924 struct record *rec = cb; 1925 1926 if (!strcmp(var, "record.build-id")) { 1927 if (!strcmp(value, "cache")) 1928 rec->no_buildid_cache = false; 1929 else if (!strcmp(value, "no-cache")) 1930 rec->no_buildid_cache = true; 1931 else if (!strcmp(value, "skip")) 1932 rec->no_buildid = true; 1933 else 1934 return -1; 1935 return 0; 1936 } 1937 if (!strcmp(var, "record.call-graph")) { 1938 var = "call-graph.record-mode"; 1939 return perf_default_config(var, value, cb); 1940 } 1941 #ifdef HAVE_AIO_SUPPORT 1942 if (!strcmp(var, "record.aio")) { 1943 rec->opts.nr_cblocks = strtol(value, NULL, 0); 1944 if (!rec->opts.nr_cblocks) 1945 rec->opts.nr_cblocks = nr_cblocks_default; 1946 } 1947 #endif 1948 1949 return 0; 1950 } 1951 1952 struct clockid_map { 1953 const char *name; 1954 int clockid; 1955 }; 1956 1957 #define CLOCKID_MAP(n, c) \ 1958 { .name = n, .clockid = (c), } 1959 1960 #define CLOCKID_END { .name = NULL, } 1961 1962 1963 /* 1964 * Add the missing ones, we need to build on many distros... 
1965 */ 1966 #ifndef CLOCK_MONOTONIC_RAW 1967 #define CLOCK_MONOTONIC_RAW 4 1968 #endif 1969 #ifndef CLOCK_BOOTTIME 1970 #define CLOCK_BOOTTIME 7 1971 #endif 1972 #ifndef CLOCK_TAI 1973 #define CLOCK_TAI 11 1974 #endif 1975 1976 static const struct clockid_map clockids[] = { 1977 /* available for all events, NMI safe */ 1978 CLOCKID_MAP("monotonic", CLOCK_MONOTONIC), 1979 CLOCKID_MAP("monotonic_raw", CLOCK_MONOTONIC_RAW), 1980 1981 /* available for some events */ 1982 CLOCKID_MAP("realtime", CLOCK_REALTIME), 1983 CLOCKID_MAP("boottime", CLOCK_BOOTTIME), 1984 CLOCKID_MAP("tai", CLOCK_TAI), 1985 1986 /* available for the lazy */ 1987 CLOCKID_MAP("mono", CLOCK_MONOTONIC), 1988 CLOCKID_MAP("raw", CLOCK_MONOTONIC_RAW), 1989 CLOCKID_MAP("real", CLOCK_REALTIME), 1990 CLOCKID_MAP("boot", CLOCK_BOOTTIME), 1991 1992 CLOCKID_END, 1993 }; 1994 1995 static int get_clockid_res(clockid_t clk_id, u64 *res_ns) 1996 { 1997 struct timespec res; 1998 1999 *res_ns = 0; 2000 if (!clock_getres(clk_id, &res)) 2001 *res_ns = res.tv_nsec + res.tv_sec * NSEC_PER_SEC; 2002 else 2003 pr_warning("WARNING: Failed to determine specified clock resolution.\n"); 2004 2005 return 0; 2006 } 2007 2008 static int parse_clockid(const struct option *opt, const char *str, int unset) 2009 { 2010 struct record_opts *opts = (struct record_opts *)opt->value; 2011 const struct clockid_map *cm; 2012 const char *ostr = str; 2013 2014 if (unset) { 2015 opts->use_clockid = 0; 2016 return 0; 2017 } 2018 2019 /* no arg passed */ 2020 if (!str) 2021 return 0; 2022 2023 /* no setting it twice */ 2024 if (opts->use_clockid) 2025 return -1; 2026 2027 opts->use_clockid = true; 2028 2029 /* if its a number, we're done */ 2030 if (sscanf(str, "%d", &opts->clockid) == 1) 2031 return get_clockid_res(opts->clockid, &opts->clockid_res_ns); 2032 2033 /* allow a "CLOCK_" prefix to the name */ 2034 if (!strncasecmp(str, "CLOCK_", 6)) 2035 str += 6; 2036 2037 for (cm = clockids; cm->name; cm++) { 2038 if (!strcasecmp(str, cm->name)) { 2039 opts->clockid = cm->clockid; 2040 return get_clockid_res(opts->clockid, 2041 &opts->clockid_res_ns); 2042 } 2043 } 2044 2045 opts->use_clockid = false; 2046 ui__warning("unknown clockid %s, check man page\n", ostr); 2047 return -1; 2048 } 2049 2050 static int record__parse_affinity(const struct option *opt, const char *str, int unset) 2051 { 2052 struct record_opts *opts = (struct record_opts *)opt->value; 2053 2054 if (unset || !str) 2055 return 0; 2056 2057 if (!strcasecmp(str, "node")) 2058 opts->affinity = PERF_AFFINITY_NODE; 2059 else if (!strcasecmp(str, "cpu")) 2060 opts->affinity = PERF_AFFINITY_CPU; 2061 2062 return 0; 2063 } 2064 2065 static int parse_output_max_size(const struct option *opt, 2066 const char *str, int unset) 2067 { 2068 unsigned long *s = (unsigned long *)opt->value; 2069 static struct parse_tag tags_size[] = { 2070 { .tag = 'B', .mult = 1 }, 2071 { .tag = 'K', .mult = 1 << 10 }, 2072 { .tag = 'M', .mult = 1 << 20 }, 2073 { .tag = 'G', .mult = 1 << 30 }, 2074 { .tag = 0 }, 2075 }; 2076 unsigned long val; 2077 2078 if (unset) { 2079 *s = 0; 2080 return 0; 2081 } 2082 2083 val = parse_tag_value(str, tags_size); 2084 if (val != (unsigned long) -1) { 2085 *s = val; 2086 return 0; 2087 } 2088 2089 return -1; 2090 } 2091 2092 static int record__parse_mmap_pages(const struct option *opt, 2093 const char *str, 2094 int unset __maybe_unused) 2095 { 2096 struct record_opts *opts = opt->value; 2097 char *s, *p; 2098 unsigned int mmap_pages; 2099 int ret; 2100 2101 if (!str) 2102 return -EINVAL; 2103 2104 
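	/*
	 * The -m/--mmap-pages value has the form <pages>[,<aux pages>]: the
	 * part before the comma sizes the regular ring buffers, the optional
	 * part after it sizes the AUX area buffers.  An illustrative
	 * invocation:
	 *
	 *	perf record -m 512,64 -e intel_pt// -- <workload>
	 */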
static int record__parse_affinity(const struct option *opt, const char *str, int unset)
{
	struct record_opts *opts = (struct record_opts *)opt->value;

	if (unset || !str)
		return 0;

	if (!strcasecmp(str, "node"))
		opts->affinity = PERF_AFFINITY_NODE;
	else if (!strcasecmp(str, "cpu"))
		opts->affinity = PERF_AFFINITY_CPU;

	return 0;
}

static int parse_output_max_size(const struct option *opt,
				 const char *str, int unset)
{
	unsigned long *s = (unsigned long *)opt->value;
	static struct parse_tag tags_size[] = {
		{ .tag  = 'B', .mult = 1       },
		{ .tag  = 'K', .mult = 1 << 10 },
		{ .tag  = 'M', .mult = 1 << 20 },
		{ .tag  = 'G', .mult = 1 << 30 },
		{ .tag  = 0 },
	};
	unsigned long val;

	if (unset) {
		*s = 0;
		return 0;
	}

	val = parse_tag_value(str, tags_size);
	if (val != (unsigned long) -1) {
		*s = val;
		return 0;
	}

	return -1;
}

static int record__parse_mmap_pages(const struct option *opt,
				    const char *str,
				    int unset __maybe_unused)
{
	struct record_opts *opts = opt->value;
	char *s, *p;
	unsigned int mmap_pages;
	int ret;

	if (!str)
		return -EINVAL;

	s = strdup(str);
	if (!s)
		return -ENOMEM;

	p = strchr(s, ',');
	if (p)
		*p = '\0';

	if (*s) {
		ret = __perf_evlist__parse_mmap_pages(&mmap_pages, s);
		if (ret)
			goto out_free;
		opts->mmap_pages = mmap_pages;
	}

	if (!p) {
		ret = 0;
		goto out_free;
	}

	ret = __perf_evlist__parse_mmap_pages(&mmap_pages, p + 1);
	if (ret)
		goto out_free;

	opts->auxtrace_mmap_pages = mmap_pages;

out_free:
	free(s);
	return ret;
}

static void switch_output_size_warn(struct record *rec)
{
	u64 wakeup_size = evlist__mmap_size(rec->opts.mmap_pages);
	struct switch_output *s = &rec->switch_output;

	wakeup_size /= 2;

	if (s->size < wakeup_size) {
		char buf[100];

		unit_number__scnprintf(buf, sizeof(buf), wakeup_size);
		pr_warning("WARNING: switch-output data size lower than "
			   "wakeup kernel buffer size (%s), "
			   "expect bigger perf.data sizes\n", buf);
	}
}

static int switch_output_setup(struct record *rec)
{
	struct switch_output *s = &rec->switch_output;
	static struct parse_tag tags_size[] = {
		{ .tag  = 'B', .mult = 1       },
		{ .tag  = 'K', .mult = 1 << 10 },
		{ .tag  = 'M', .mult = 1 << 20 },
		{ .tag  = 'G', .mult = 1 << 30 },
		{ .tag  = 0 },
	};
	static struct parse_tag tags_time[] = {
		{ .tag  = 's', .mult = 1        },
		{ .tag  = 'm', .mult = 60       },
		{ .tag  = 'h', .mult = 60*60    },
		{ .tag  = 'd', .mult = 60*60*24 },
		{ .tag  = 0 },
	};
	unsigned long val;

	if (!s->set)
		return 0;

	if (!strcmp(s->str, "signal")) {
		s->signal = true;
		pr_debug("switch-output with SIGUSR2 signal\n");
		goto enabled;
	}

	val = parse_tag_value(s->str, tags_size);
	if (val != (unsigned long) -1) {
		s->size = val;
		pr_debug("switch-output with %s size threshold\n", s->str);
		goto enabled;
	}

	val = parse_tag_value(s->str, tags_time);
	if (val != (unsigned long) -1) {
		s->time = val;
		pr_debug("switch-output with %s time threshold (%lu seconds)\n",
			 s->str, s->time);
		goto enabled;
	}

	return -1;

enabled:
	rec->timestamp_filename = true;
	s->enabled = true;

	if (s->size && !rec->opts.no_buffering)
		switch_output_size_warn(rec);

	return 0;
}
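
/*
 * Example arguments handled by the parsers above (illustrative only):
 *
 *   --max-size=200M          parse_output_max_size(): stop writing once the
 *                            output reaches the given size (B/K/M/G suffixes)
 *   --switch-output=signal   switch_output_setup(): rotate the output on SIGUSR2
 *   --switch-output=1G       rotate once 1G of data has been written
 *   --switch-output=30s      rotate when the 30 second time threshold is crossed
 *   -m 512,128               record__parse_mmap_pages(): 512 mmap data pages plus
 *                            128 AUX area tracing pages (sizes such as 16M also work)
 */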
static const char * const __record_usage[] = {
	"perf record [<options>] [<command>]",
	"perf record [<options>] -- <command> [<options>]",
	NULL
};
const char * const *record_usage = __record_usage;

static int build_id__process_mmap(struct perf_tool *tool, union perf_event *event,
				  struct perf_sample *sample, struct machine *machine)
{
	/*
	 * We already have the kernel maps, put in place via
	 * perf_session__create_kernel_maps(), so no need to add them twice.
	 */
	if (!(event->header.misc & PERF_RECORD_MISC_USER))
		return 0;
	return perf_event__process_mmap(tool, event, sample, machine);
}

static int build_id__process_mmap2(struct perf_tool *tool, union perf_event *event,
				   struct perf_sample *sample, struct machine *machine)
{
	/*
	 * We already have the kernel maps, put in place via
	 * perf_session__create_kernel_maps(), so no need to add them twice.
	 */
	if (!(event->header.misc & PERF_RECORD_MISC_USER))
		return 0;

	return perf_event__process_mmap2(tool, event, sample, machine);
}

/*
 * XXX Ideally this would be local to cmd_record() and passed to a record__new(),
 * because we need to have access to it in record__exit(), which is called
 * after cmd_record() exits, but since record_options needs to be accessible to
 * builtin-script, leave it here.
 *
 * At least we don't ouch it in all the other functions here directly.
 *
 * Just say no to tons of global variables, sigh.
 */
static struct record record = {
	.opts = {
		.sample_time	     = true,
		.mmap_pages	     = UINT_MAX,
		.user_freq	     = UINT_MAX,
		.user_interval	     = ULLONG_MAX,
		.freq		     = 4000,
		.target		     = {
			.uses_mmap   = true,
			.default_per_cpu = true,
		},
		.mmap_flush          = MMAP_FLUSH_DEFAULT,
		.nr_threads_synthesize = 1,
	},
	.tool = {
		.sample		= process_sample_event,
		.fork		= perf_event__process_fork,
		.exit		= perf_event__process_exit,
		.comm		= perf_event__process_comm,
		.namespaces	= perf_event__process_namespaces,
		.mmap		= build_id__process_mmap,
		.mmap2		= build_id__process_mmap2,
		.ordered_events	= true,
	},
};

const char record_callchain_help[] = CALLCHAIN_RECORD_HELP
	"\n\t\t\t\tDefault: fp";

static bool dry_run;

/*
 * XXX Will stay a global variable till we fix builtin-script.c to stop messing
 * with it and switch to use the library functions in perf_evlist that came
 * from builtin-record.c, i.e. use record_opts,
 * perf_evlist__prepare_workload, etc instead of fork+exec'ing 'perf record',
 * using pipes, etc.
 */
static struct option __record_options[] = {
	OPT_CALLBACK('e', "event", &record.evlist, "event",
		     "event selector. use 'perf list' to list available events",
		     parse_events_option),
	OPT_CALLBACK(0, "filter", &record.evlist, "filter",
		     "event filter", parse_filter),
	OPT_CALLBACK_NOOPT(0, "exclude-perf", &record.evlist,
			   NULL, "don't record events from perf itself",
			   exclude_perf),
	OPT_STRING('p', "pid", &record.opts.target.pid, "pid",
		   "record events on existing process id"),
	OPT_STRING('t', "tid", &record.opts.target.tid, "tid",
		   "record events on existing thread id"),
	OPT_INTEGER('r', "realtime", &record.realtime_prio,
		    "collect data with this RT SCHED_FIFO priority"),
	OPT_BOOLEAN(0, "no-buffering", &record.opts.no_buffering,
		    "collect data without buffering"),
	OPT_BOOLEAN('R', "raw-samples", &record.opts.raw_samples,
		    "collect raw sample records from all opened counters"),
	OPT_BOOLEAN('a', "all-cpus", &record.opts.target.system_wide,
		    "system-wide collection from all CPUs"),
	OPT_STRING('C', "cpu", &record.opts.target.cpu_list, "cpu",
		   "list of cpus to monitor"),
	OPT_U64('c', "count", &record.opts.user_interval, "event period to sample"),
	OPT_STRING('o', "output", &record.data.path, "file",
		   "output file name"),
	OPT_BOOLEAN_SET('i', "no-inherit", &record.opts.no_inherit,
			&record.opts.no_inherit_set,
			"child tasks do not inherit counters"),
	OPT_BOOLEAN(0, "tail-synthesize", &record.opts.tail_synthesize,
		    "synthesize non-sample events at the end of output"),
	OPT_BOOLEAN(0, "overwrite", &record.opts.overwrite, "use overwrite mode"),
	OPT_BOOLEAN(0, "no-bpf-event", &record.opts.no_bpf_event,
		    "do not record bpf events"),
	OPT_BOOLEAN(0, "strict-freq", &record.opts.strict_freq,
		    "Fail if the specified frequency can't be used"),
	OPT_CALLBACK('F', "freq", &record.opts, "freq or 'max'",
		     "profile at this frequency",
		     record__parse_freq),
	OPT_CALLBACK('m', "mmap-pages", &record.opts, "pages[,pages]",
		     "number of mmap data pages and AUX area tracing mmap pages",
		     record__parse_mmap_pages),
	OPT_CALLBACK(0, "mmap-flush", &record.opts, "number",
		     "Minimal number of bytes that is extracted from mmap data pages (default: 1)",
		     record__mmap_flush_parse),
	OPT_BOOLEAN(0, "group", &record.opts.group,
		    "put the counters into a counter group"),
	OPT_CALLBACK_NOOPT('g', NULL, &callchain_param,
			   NULL, "enables call-graph recording",
			   &record_callchain_opt),
	OPT_CALLBACK(0, "call-graph", &record.opts,
		     "record_mode[,record_size]", record_callchain_help,
		     &record_parse_callchain_opt),
	OPT_INCR('v', "verbose", &verbose,
		 "be more verbose (show counter open errors, etc)"),
	OPT_BOOLEAN('q', "quiet", &quiet, "don't print any message"),
	OPT_BOOLEAN('s', "stat", &record.opts.inherit_stat,
		    "per thread counts"),
	OPT_BOOLEAN('d', "data", &record.opts.sample_address, "Record the sample addresses"),
	OPT_BOOLEAN(0, "phys-data", &record.opts.sample_phys_addr,
		    "Record the sample physical addresses"),
	OPT_BOOLEAN(0, "sample-cpu", &record.opts.sample_cpu, "Record the sample cpu"),
	OPT_BOOLEAN_SET('T', "timestamp", &record.opts.sample_time,
			&record.opts.sample_time_set,
			"Record the sample timestamps"),
	OPT_BOOLEAN_SET('P', "period", &record.opts.period, &record.opts.period_set,
			"Record the sample period"),
	OPT_BOOLEAN('n', "no-samples", &record.opts.no_samples,
		    "don't sample"),
	OPT_BOOLEAN_SET('N', "no-buildid-cache", &record.no_buildid_cache,
			&record.no_buildid_cache_set,
			"do not update the buildid cache"),
	OPT_BOOLEAN_SET('B', "no-buildid", &record.no_buildid,
			&record.no_buildid_set,
			"do not collect buildids in perf.data"),
	OPT_CALLBACK('G', "cgroup", &record.evlist, "name",
		     "monitor event in cgroup name only",
		     parse_cgroups),
	OPT_UINTEGER('D', "delay", &record.opts.initial_delay,
		     "ms to wait before starting measurement after program start"),
	OPT_BOOLEAN(0, "kcore", &record.opts.kcore, "copy /proc/kcore"),
	OPT_STRING('u', "uid", &record.opts.target.uid_str, "user",
		   "user to profile"),

	OPT_CALLBACK_NOOPT('b', "branch-any", &record.opts.branch_stack,
			   "branch any", "sample any taken branches",
			   parse_branch_stack),

	OPT_CALLBACK('j', "branch-filter", &record.opts.branch_stack,
		     "branch filter mask", "branch stack filter modes",
		     parse_branch_stack),
	OPT_BOOLEAN('W', "weight", &record.opts.sample_weight,
		    "sample by weight (on special events only)"),
	OPT_BOOLEAN(0, "transaction", &record.opts.sample_transaction,
		    "sample transaction flags (special events only)"),
	OPT_BOOLEAN(0, "per-thread", &record.opts.target.per_thread,
		    "use per-thread mmaps"),
	OPT_CALLBACK_OPTARG('I', "intr-regs", &record.opts.sample_intr_regs, NULL, "any register",
			    "sample selected machine registers on interrupt,"
			    " use '-I?' to list register names", parse_intr_regs),
	OPT_CALLBACK_OPTARG(0, "user-regs", &record.opts.sample_user_regs, NULL, "any register",
			    "sample selected machine registers on interrupt,"
			    " use '--user-regs=?' to list register names", parse_user_regs),
	OPT_BOOLEAN(0, "running-time", &record.opts.running_time,
		    "Record running/enabled time of read (:S) events"),
	OPT_CALLBACK('k', "clockid", &record.opts,
		     "clockid", "clockid to use for events, see clock_gettime()",
		     parse_clockid),
	OPT_STRING_OPTARG('S', "snapshot", &record.opts.auxtrace_snapshot_opts,
			  "opts", "AUX area tracing Snapshot Mode", ""),
	OPT_STRING_OPTARG(0, "aux-sample", &record.opts.auxtrace_sample_opts,
			  "opts", "sample AUX area", ""),
	OPT_UINTEGER(0, "proc-map-timeout", &proc_map_timeout,
		     "per thread proc mmap processing timeout in ms"),
	OPT_BOOLEAN(0, "namespaces", &record.opts.record_namespaces,
		    "Record namespaces events"),
	OPT_BOOLEAN(0, "all-cgroups", &record.opts.record_cgroup,
		    "Record cgroup events"),
	OPT_BOOLEAN(0, "switch-events", &record.opts.record_switch_events,
		    "Record context switch events"),
	OPT_BOOLEAN_FLAG(0, "all-kernel", &record.opts.all_kernel,
			 "Configure all used events to run in kernel space.",
			 PARSE_OPT_EXCLUSIVE),
	OPT_BOOLEAN_FLAG(0, "all-user", &record.opts.all_user,
			 "Configure all used events to run in user space.",
			 PARSE_OPT_EXCLUSIVE),
	OPT_BOOLEAN(0, "kernel-callchains", &record.opts.kernel_callchains,
		    "collect kernel callchains"),
	OPT_BOOLEAN(0, "user-callchains", &record.opts.user_callchains,
		    "collect user callchains"),
	OPT_STRING(0, "clang-path", &llvm_param.clang_path, "clang path",
		   "clang binary to use for compiling BPF scriptlets"),
	OPT_STRING(0, "clang-opt", &llvm_param.clang_opt, "clang options",
		   "options passed to clang when compiling BPF scriptlets"),
	OPT_STRING(0, "vmlinux", &symbol_conf.vmlinux_name,
		   "file", "vmlinux pathname"),
	OPT_BOOLEAN(0, "buildid-all", &record.buildid_all,
		    "Record build-id of all DSOs regardless of hits"),
	OPT_BOOLEAN(0, "timestamp-filename", &record.timestamp_filename,
		    "append timestamp to output filename"),
	OPT_BOOLEAN(0, "timestamp-boundary", &record.timestamp_boundary,
		    "Record timestamp boundary (time of first/last samples)"),
	OPT_STRING_OPTARG_SET(0, "switch-output", &record.switch_output.str,
			      &record.switch_output.set, "signal or size[BKMG] or time[smhd]",
			      "Switch output when receiving SIGUSR2 (signal) or crossing a size or time threshold",
			      "signal"),
	OPT_INTEGER(0, "switch-max-files", &record.switch_output.num_files,
		    "Limit number of switch output generated files"),
	OPT_BOOLEAN(0, "dry-run", &dry_run,
		    "Parse options then exit"),
#ifdef HAVE_AIO_SUPPORT
	OPT_CALLBACK_OPTARG(0, "aio", &record.opts,
			    &nr_cblocks_default, "n", "Use <n> control blocks in asynchronous trace writing mode (default: 1, max: 4)",
			    record__aio_parse),
#endif
	OPT_CALLBACK(0, "affinity", &record.opts, "node|cpu",
		     "Set affinity mask of trace reading thread to NUMA node cpu mask or cpu of processed mmap buffer",
		     record__parse_affinity),
#ifdef HAVE_ZSTD_SUPPORT
	OPT_CALLBACK_OPTARG('z', "compression-level", &record.opts, &comp_level_default,
			    "n", "Compress records using specified level (default: 1 - fastest compression, 22 - greatest compression)",
			    record__parse_comp_level),
#endif
	OPT_CALLBACK(0, "max-size", &record.output_max_size,
		     "size", "Limit the maximum size of the output file", parse_output_max_size),
	OPT_UINTEGER(0, "num-thread-synthesize",
		     &record.opts.nr_threads_synthesize,
		     "number of threads to run for event synthesis"),
	OPT_END()
};

struct option *record_options = __record_options;

int cmd_record(int argc, const char **argv)
{
	int err;
	struct record *rec = &record;
	char errbuf[BUFSIZ];

	setlocale(LC_ALL, "");

#ifndef HAVE_LIBBPF_SUPPORT
# define set_nobuild(s, l, c) set_option_nobuild(record_options, s, l, "NO_LIBBPF=1", c)
	set_nobuild('\0', "clang-path", true);
	set_nobuild('\0', "clang-opt", true);
# undef set_nobuild
#endif

#ifndef HAVE_BPF_PROLOGUE
# if !defined (HAVE_DWARF_SUPPORT)
#  define REASON  "NO_DWARF=1"
# elif !defined (HAVE_LIBBPF_SUPPORT)
#  define REASON  "NO_LIBBPF=1"
# else
#  define REASON  "this architecture doesn't support BPF prologue"
# endif
# define set_nobuild(s, l, c) set_option_nobuild(record_options, s, l, REASON, c)
	set_nobuild('\0', "vmlinux", true);
# undef set_nobuild
# undef REASON
#endif

	rec->opts.affinity = PERF_AFFINITY_SYS;

	rec->evlist = evlist__new();
	if (rec->evlist == NULL)
		return -ENOMEM;

	err = perf_config(perf_record_config, rec);
	if (err)
		return err;

	argc = parse_options(argc, argv, record_options, record_usage,
			     PARSE_OPT_STOP_AT_NON_OPTION);
	if (quiet)
		perf_quiet_option();

	/* Make system wide (-a) the default target. */
	if (!argc && target__none(&rec->opts.target))
		rec->opts.target.system_wide = true;

	if (nr_cgroups && !rec->opts.target.system_wide) {
		usage_with_options_msg(record_usage, record_options,
			"cgroup monitoring only available in system-wide mode");

	}

	if (rec->opts.kcore)
		rec->data.is_dir = true;

	if (rec->opts.comp_level != 0) {
		pr_debug("Compression enabled, disabling build id collection at the end of the session.\n");
		rec->no_buildid = true;
	}

	if (rec->opts.record_switch_events &&
	    !perf_can_record_switch_events()) {
		ui__error("kernel does not support recording context switch events\n");
		parse_options_usage(record_usage, record_options, "switch-events", 0);
		return -EINVAL;
	}

	if (switch_output_setup(rec)) {
		parse_options_usage(record_usage, record_options, "switch-output", 0);
		return -EINVAL;
	}

	if (rec->switch_output.time) {
		signal(SIGALRM, alarm_sig_handler);
		alarm(rec->switch_output.time);
	}

	if (rec->switch_output.num_files) {
		rec->switch_output.filenames = calloc(rec->switch_output.num_files,
						      sizeof(char *));
		if (!rec->switch_output.filenames)
			return -EINVAL;
	}

	/*
	 * Allow aliases to facilitate the lookup of symbols for address
	 * filters. Refer to auxtrace_parse_filters().
	 */
	symbol_conf.allow_aliases = true;

	symbol__init(NULL);

	if (rec->opts.affinity != PERF_AFFINITY_SYS) {
		rec->affinity_mask.nbits = cpu__max_cpu();
		rec->affinity_mask.bits = bitmap_alloc(rec->affinity_mask.nbits);
		if (!rec->affinity_mask.bits) {
			pr_err("Failed to allocate thread mask for %zd cpus\n", rec->affinity_mask.nbits);
			return -ENOMEM;
		}
		pr_debug2("thread mask[%zd]: empty\n", rec->affinity_mask.nbits);
	}

	err = record__auxtrace_init(rec);
	if (err)
		goto out;

	if (dry_run)
		goto out;

	err = bpf__setup_stdout(rec->evlist);
	if (err) {
		bpf__strerror_setup_stdout(rec->evlist, err, errbuf, sizeof(errbuf));
		pr_err("ERROR: Setup BPF stdout failed: %s\n",
		       errbuf);
		goto out;
	}

	err = -ENOMEM;

	if (rec->no_buildid_cache || rec->no_buildid) {
		disable_buildid_cache();
	} else if (rec->switch_output.enabled) {
		/*
		 * In 'perf record --switch-output', disable buildid
		 * generation by default to reduce data file switching
		 * overhead. Still generate buildids if they are required
		 * explicitly using
		 *
		 *  perf record --switch-output --no-no-buildid \
		 *              --no-no-buildid-cache
		 *
		 * The following code is equivalent to:
		 *
		 *  if ((rec->no_buildid || !rec->no_buildid_set) &&
		 *      (rec->no_buildid_cache || !rec->no_buildid_cache_set))
		 *          disable_buildid_cache();
		 */
		bool disable = true;

		if (rec->no_buildid_set && !rec->no_buildid)
			disable = false;
		if (rec->no_buildid_cache_set && !rec->no_buildid_cache)
			disable = false;
		if (disable) {
			rec->no_buildid = true;
			rec->no_buildid_cache = true;
			disable_buildid_cache();
		}
	}

	if (record.opts.overwrite)
		record.opts.tail_synthesize = true;

	if (rec->evlist->core.nr_entries == 0 &&
	    __perf_evlist__add_default(rec->evlist, !record.opts.no_samples) < 0) {
		pr_err("Not enough memory for event selector list\n");
		goto out;
	}

	if (rec->opts.target.tid && !rec->opts.no_inherit_set)
		rec->opts.no_inherit = true;

	err = target__validate(&rec->opts.target);
	if (err) {
		target__strerror(&rec->opts.target, err, errbuf, BUFSIZ);
		ui__warning("%s\n", errbuf);
	}

	err = target__parse_uid(&rec->opts.target);
	if (err) {
		int saved_errno = errno;

		target__strerror(&rec->opts.target, err, errbuf, BUFSIZ);
		ui__error("%s", errbuf);

		err = -saved_errno;
		goto out;
	}

	/* Enable ignoring missing threads when -u/-p option is defined. */
	rec->opts.ignore_missing_thread = rec->opts.target.uid != UINT_MAX || rec->opts.target.pid;

	err = -ENOMEM;
	if (perf_evlist__create_maps(rec->evlist, &rec->opts.target) < 0)
		usage_with_options(record_usage, record_options);

	err = auxtrace_record__options(rec->itr, rec->evlist, &rec->opts);
	if (err)
		goto out;

	/*
	 * We take all buildids when the file contains AUX area tracing data
	 * because we do not decode the trace, as that would take too long.
	 */
	if (rec->opts.full_auxtrace)
		rec->buildid_all = true;

	if (record_opts__config(&rec->opts)) {
		err = -EINVAL;
		goto out;
	}

	if (rec->opts.nr_cblocks > nr_cblocks_max)
		rec->opts.nr_cblocks = nr_cblocks_max;
	pr_debug("nr_cblocks: %d\n", rec->opts.nr_cblocks);

	pr_debug("affinity: %s\n", affinity_tags[rec->opts.affinity]);
	pr_debug("mmap flush: %d\n", rec->opts.mmap_flush);

	if (rec->opts.comp_level > comp_level_max)
		rec->opts.comp_level = comp_level_max;
	pr_debug("comp level: %d\n", rec->opts.comp_level);

	err = __cmd_record(&record, argc, argv);
out:
	bitmap_free(rec->affinity_mask.bits);
	evlist__delete(rec->evlist);
	symbol__exit();
	auxtrace_record__free(rec->itr);
	return err;
}

static void snapshot_sig_handler(int sig __maybe_unused)
{
	struct record *rec = &record;

	if (trigger_is_ready(&auxtrace_snapshot_trigger)) {
		trigger_hit(&auxtrace_snapshot_trigger);
		auxtrace_record__snapshot_started = 1;
		if (auxtrace_record__snapshot_start(record.itr))
			trigger_error(&auxtrace_snapshot_trigger);
	}

	if (switch_output_signal(rec))
		trigger_hit(&switch_output_trigger);
}

static void alarm_sig_handler(int sig __maybe_unused)
{
	struct record *rec = &record;

	if (switch_output_time(rec))
		trigger_hit(&switch_output_trigger);
}
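
/*
 * Usage note (illustrative only): the two handlers above are the asynchronous
 * controls behind options defined earlier in this file:
 *
 *   - SIGUSR2 triggers an AUX area snapshot in snapshot mode (-S) and, with
 *     --switch-output=signal, an on-demand output switch, e.g.
 *
 *         perf record -a --switch-output=signal -- sleep 600 &
 *         kill -USR2 %1
 *
 *   - SIGALRM drives time-based switching: cmd_record() arms it with
 *     alarm(rec->switch_output.time) when --switch-output=<time> is given.
 */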