// SPDX-License-Identifier: GPL-2.0
/*
 * builtin-record.c
 *
 * Builtin record command: Record the profile of a workload
 * (or a CPU, or a PID) into the perf.data output file - for
 * later analysis via perf report.
 */
#include "builtin.h"

#include "util/build-id.h"
#include <subcmd/parse-options.h>
#include "util/parse-events.h"
#include "util/config.h"

#include "util/callchain.h"
#include "util/cgroup.h"
#include "util/header.h"
#include "util/event.h"
#include "util/evlist.h"
#include "util/evsel.h"
#include "util/debug.h"
#include "util/mmap.h"
#include "util/target.h"
#include "util/session.h"
#include "util/tool.h"
#include "util/symbol.h"
#include "util/record.h"
#include "util/cpumap.h"
#include "util/thread_map.h"
#include "util/data.h"
#include "util/perf_regs.h"
#include "util/auxtrace.h"
#include "util/tsc.h"
#include "util/parse-branch-options.h"
#include "util/parse-regs-options.h"
#include "util/llvm-utils.h"
#include "util/bpf-loader.h"
#include "util/trigger.h"
#include "util/perf-hooks.h"
#include "util/cpu-set-sched.h"
#include "util/synthetic-events.h"
#include "util/time-utils.h"
#include "util/units.h"
#include "util/bpf-event.h"
#include "asm/bug.h"
#include "perf.h"

#include <errno.h>
#include <inttypes.h>
#include <locale.h>
#include <poll.h>
#include <unistd.h>
#include <sched.h>
#include <signal.h>
#include <sys/mman.h>
#include <sys/wait.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <linux/err.h>
#include <linux/string.h>
#include <linux/time64.h>
#include <linux/zalloc.h>

struct switch_output {
	bool		enabled;
	bool		signal;
	unsigned long	size;
	unsigned long	time;
	const char	*str;
	bool		set;
	char		**filenames;
	int		num_files;
	int		cur_file;
};

struct record {
	struct perf_tool	tool;
	struct record_opts	opts;
	u64			bytes_written;
	struct perf_data	data;
	struct auxtrace_record	*itr;
	struct evlist		*evlist;
	struct perf_session	*session;
	int			realtime_prio;
	bool			no_buildid;
	bool			no_buildid_set;
	bool			no_buildid_cache;
	bool			no_buildid_cache_set;
	bool			buildid_all;
	bool			timestamp_filename;
	bool			timestamp_boundary;
	struct switch_output	switch_output;
	unsigned long long	samples;
	cpu_set_t		affinity_mask;
	unsigned long		output_max_size;	/* = 0: unlimited */
};

static volatile int done;

static volatile int auxtrace_record__snapshot_started;
static DEFINE_TRIGGER(auxtrace_snapshot_trigger);
static DEFINE_TRIGGER(switch_output_trigger);

static const char *affinity_tags[PERF_AFFINITY_MAX] = {
	"SYS", "NODE", "CPU"
};

static bool switch_output_signal(struct record *rec)
{
	return rec->switch_output.signal &&
	       trigger_is_ready(&switch_output_trigger);
}

static bool switch_output_size(struct record *rec)
{
	return rec->switch_output.size &&
	       trigger_is_ready(&switch_output_trigger) &&
	       (rec->bytes_written >= rec->switch_output.size);
}

static bool switch_output_time(struct record *rec)
{
	return rec->switch_output.time &&
	       trigger_is_ready(&switch_output_trigger);
}

static bool record__output_max_size_exceeded(struct record *rec)
{
	return rec->output_max_size &&
	       (rec->bytes_written >= rec->output_max_size);
}

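/*
 * Write a block of data to the output file and account for it in
 * bytes_written.  Recording is stopped once the configured output size
 * limit is exceeded, and the switch-output trigger is hit when its size
 * threshold is reached.
 */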
static int record__write(struct record *rec, struct mmap *map __maybe_unused,
			 void *bf, size_t size)
{
	struct perf_data_file *file = &rec->session->data->file;

	if (perf_data_file__write(file, bf, size) < 0) {
		pr_err("failed to write perf data, error: %m\n");
		return -1;
	}

	rec->bytes_written += size;

	if (record__output_max_size_exceeded(rec) && !done) {
		fprintf(stderr, "[ perf record: perf size limit reached (%" PRIu64 " KB),"
				" stopping session ]\n",
				rec->bytes_written >> 10);
		done = 1;
	}

	if (switch_output_size(rec))
		trigger_hit(&switch_output_trigger);

	return 0;
}

static int record__aio_enabled(struct record *rec);
static int record__comp_enabled(struct record *rec);
static size_t zstd_compress(struct perf_session *session, void *dst, size_t dst_size,
			    void *src, size_t src_size);

#ifdef HAVE_AIO_SUPPORT
static int record__aio_write(struct aiocb *cblock, int trace_fd,
			     void *buf, size_t size, off_t off)
{
	int rc;

	cblock->aio_fildes = trace_fd;
	cblock->aio_buf = buf;
	cblock->aio_nbytes = size;
	cblock->aio_offset = off;
	cblock->aio_sigevent.sigev_notify = SIGEV_NONE;

	do {
		rc = aio_write(cblock);
		if (rc == 0) {
			break;
		} else if (errno != EAGAIN) {
			cblock->aio_fildes = -1;
			pr_err("failed to queue perf data, error: %m\n");
			break;
		}
	} while (1);

	return rc;
}

static int record__aio_complete(struct mmap *md, struct aiocb *cblock)
{
	void *rem_buf;
	off_t rem_off;
	size_t rem_size;
	int rc, aio_errno;
	ssize_t aio_ret, written;

	aio_errno = aio_error(cblock);
	if (aio_errno == EINPROGRESS)
		return 0;

	written = aio_ret = aio_return(cblock);
	if (aio_ret < 0) {
		if (aio_errno != EINTR)
			pr_err("failed to write perf data, error: %m\n");
		written = 0;
	}

	rem_size = cblock->aio_nbytes - written;

	if (rem_size == 0) {
		cblock->aio_fildes = -1;
		/*
		 * md->refcount is incremented in record__aio_pushfn() for
		 * every aio write request started in record__aio_push() so
		 * decrement it because the request is now complete.
		 */
		perf_mmap__put(&md->core);
		rc = 1;
	} else {
		/*
		 * The aio write request may need to be restarted with the
		 * remainder if the kernel didn't write the whole chunk at
		 * once.
		 */
		rem_off = cblock->aio_offset + written;
		rem_buf = (void *)(cblock->aio_buf + written);
		record__aio_write(cblock, cblock->aio_fildes,
				  rem_buf, rem_size, rem_off);
		rc = 0;
	}

	return rc;
}

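/*
 * Wait for in-flight aio write requests on this mmap.  With sync_all the
 * function blocks until every request has completed; otherwise it returns
 * the index of the first free aio control block so it can be reused for
 * the next write.
 */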
static int record__aio_sync(struct mmap *md, bool sync_all)
{
	struct aiocb **aiocb = md->aio.aiocb;
	struct aiocb *cblocks = md->aio.cblocks;
	struct timespec timeout = { 0, 1000 * 1000 * 1 }; /* 1ms */
	int i, do_suspend;

	do {
		do_suspend = 0;
		for (i = 0; i < md->aio.nr_cblocks; ++i) {
			if (cblocks[i].aio_fildes == -1 || record__aio_complete(md, &cblocks[i])) {
				if (sync_all)
					aiocb[i] = NULL;
				else
					return i;
			} else {
				/*
				 * Started aio write is not complete yet,
				 * so it has to be waited on before the
				 * next allocation.
				 */
				aiocb[i] = &cblocks[i];
				do_suspend = 1;
			}
		}
		if (!do_suspend)
			return -1;

		while (aio_suspend((const struct aiocb **)aiocb, md->aio.nr_cblocks, &timeout)) {
			if (!(errno == EAGAIN || errno == EINTR))
				pr_err("failed to sync perf data, error: %m\n");
		}
	} while (1);
}

struct record_aio {
	struct record	*rec;
	void		*data;
	size_t		size;
};

static int record__aio_pushfn(struct mmap *map, void *to, void *buf, size_t size)
{
	struct record_aio *aio = to;

	/*
	 * map->core.base data pointed to by buf is copied into a free
	 * map->aio.data[] buffer to release space in the kernel buffer as
	 * fast as possible, calling perf_mmap__consume() from the
	 * perf_mmap__push() function.
	 *
	 * That lets the kernel proceed with storing more profiling data into
	 * the kernel buffer earlier than other per-cpu kernel buffers are handled.
	 *
	 * Copying can be done in two steps in case the chunk of profiling data
	 * crosses the upper bound of the kernel buffer. In this case we first move
	 * the part of data from map->start till the upper bound and then the
	 * remainder from the beginning of the kernel buffer till the end of
	 * the data chunk.
	 */

	if (record__comp_enabled(aio->rec)) {
		size = zstd_compress(aio->rec->session, aio->data + aio->size,
				     mmap__mmap_len(map) - aio->size,
				     buf, size);
	} else {
		memcpy(aio->data + aio->size, buf, size);
	}

	if (!aio->size) {
		/*
		 * Increment map->refcount to guard the map->aio.data[] buffer
		 * from premature deallocation, because the map object can be
		 * released before the aio write request started on the
		 * map->aio.data[] buffer completes.
		 *
		 * perf_mmap__put() is done at record__aio_complete() once the
		 * started aio request completes, or at record__aio_push()
		 * if the request failed to start.
		 */
		perf_mmap__get(&map->core);
	}

	aio->size += size;

	return size;
}

static int record__aio_push(struct record *rec, struct mmap *map, off_t *off)
{
	int ret, idx;
	int trace_fd = rec->session->data->file.fd;
	struct record_aio aio = { .rec = rec, .size = 0 };

	/*
	 * Call record__aio_sync() to wait till a map->aio.data[] buffer
	 * becomes available after the previous aio write operation.
	 */

	idx = record__aio_sync(map, false);
	aio.data = map->aio.data[idx];
	ret = perf_mmap__push(map, &aio, record__aio_pushfn);
	if (ret != 0) /* ret > 0 - no data, ret < 0 - error */
		return ret;

	rec->samples++;
	ret = record__aio_write(&(map->aio.cblocks[idx]), trace_fd, aio.data, aio.size, *off);
	if (!ret) {
		*off += aio.size;
		rec->bytes_written += aio.size;
		if (switch_output_size(rec))
			trigger_hit(&switch_output_trigger);
	} else {
		/*
		 * Decrement map->refcount incremented in record__aio_pushfn()
		 * back if the record__aio_write() operation failed to start,
		 * otherwise map->refcount is decremented in record__aio_complete()
		 * after the aio write operation finishes successfully.
		 */
		perf_mmap__put(&map->core);
	}

	return ret;
}

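/*
 * aio writes land at explicit file offsets, so the output position is
 * tracked by hand: record__aio_get_pos() reads the current offset and
 * record__aio_set_pos() restores it once all aio pushes are queued.
 */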
static off_t record__aio_get_pos(int trace_fd)
{
	return lseek(trace_fd, 0, SEEK_CUR);
}

static void record__aio_set_pos(int trace_fd, off_t pos)
{
	lseek(trace_fd, pos, SEEK_SET);
}

static void record__aio_mmap_read_sync(struct record *rec)
{
	int i;
	struct evlist *evlist = rec->evlist;
	struct mmap *maps = evlist->mmap;

	if (!record__aio_enabled(rec))
		return;

	for (i = 0; i < evlist->core.nr_mmaps; i++) {
		struct mmap *map = &maps[i];

		if (map->core.base)
			record__aio_sync(map, true);
	}
}

static int nr_cblocks_default = 1;
static int nr_cblocks_max = 4;

static int record__aio_parse(const struct option *opt,
			     const char *str,
			     int unset)
{
	struct record_opts *opts = (struct record_opts *)opt->value;

	if (unset) {
		opts->nr_cblocks = 0;
	} else {
		if (str)
			opts->nr_cblocks = strtol(str, NULL, 0);
		if (!opts->nr_cblocks)
			opts->nr_cblocks = nr_cblocks_default;
	}

	return 0;
}
#else /* HAVE_AIO_SUPPORT */
static int nr_cblocks_max = 0;

static int record__aio_push(struct record *rec __maybe_unused, struct mmap *map __maybe_unused,
			    off_t *off __maybe_unused)
{
	return -1;
}

static off_t record__aio_get_pos(int trace_fd __maybe_unused)
{
	return -1;
}

static void record__aio_set_pos(int trace_fd __maybe_unused, off_t pos __maybe_unused)
{
}

static void record__aio_mmap_read_sync(struct record *rec __maybe_unused)
{
}
#endif

static int record__aio_enabled(struct record *rec)
{
	return rec->opts.nr_cblocks > 0;
}

#define MMAP_FLUSH_DEFAULT 1
static int record__mmap_flush_parse(const struct option *opt,
				    const char *str,
				    int unset)
{
	int flush_max;
	struct record_opts *opts = (struct record_opts *)opt->value;
	static struct parse_tag tags[] = {
		{ .tag = 'B', .mult = 1 },
		{ .tag = 'K', .mult = 1 << 10 },
		{ .tag = 'M', .mult = 1 << 20 },
		{ .tag = 'G', .mult = 1 << 30 },
		{ .tag = 0 },
	};

	if (unset)
		return 0;

	if (str) {
		opts->mmap_flush = parse_tag_value(str, tags);
		if (opts->mmap_flush == (int)-1)
			opts->mmap_flush = strtol(str, NULL, 0);
	}

	if (!opts->mmap_flush)
		opts->mmap_flush = MMAP_FLUSH_DEFAULT;

	flush_max = evlist__mmap_size(opts->mmap_pages);
	flush_max /= 4;
	if (opts->mmap_flush > flush_max)
		opts->mmap_flush = flush_max;

	return 0;
}

#ifdef HAVE_ZSTD_SUPPORT
static unsigned int comp_level_default = 1;

static int record__parse_comp_level(const struct option *opt, const char *str, int unset)
{
	struct record_opts *opts = opt->value;

	if (unset) {
		opts->comp_level = 0;
	} else {
		if (str)
			opts->comp_level = strtol(str, NULL, 0);
		if (!opts->comp_level)
			opts->comp_level = comp_level_default;
	}

	return 0;
}
#endif
static unsigned int comp_level_max = 22;

static int record__comp_enabled(struct record *rec)
{
	return rec->opts.comp_level > 0;
}

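/*
 * Callback handed to the perf_event__synthesize_*() helpers: synthesized
 * events are not read from a ring buffer, they are written straight to
 * the output file via record__write().
 */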
static int process_synthesized_event(struct perf_tool *tool,
				     union perf_event *event,
				     struct perf_sample *sample __maybe_unused,
				     struct machine *machine __maybe_unused)
{
	struct record *rec = container_of(tool, struct record, tool);
	return record__write(rec, NULL, event, event->header.size);
}

static int record__pushfn(struct mmap *map, void *to, void *bf, size_t size)
{
	struct record *rec = to;

	if (record__comp_enabled(rec)) {
		size = zstd_compress(rec->session, map->data, mmap__mmap_len(map), bf, size);
		bf = map->data;
	}

	rec->samples++;
	return record__write(rec, map, bf, size);
}

static volatile int signr = -1;
static volatile int child_finished;

static void sig_handler(int sig)
{
	if (sig == SIGCHLD)
		child_finished = 1;
	else
		signr = sig;

	done = 1;
}

static void sigsegv_handler(int sig)
{
	perf_hooks__recover();
	sighandler_dump_stack(sig);
}

static void record__sig_exit(void)
{
	if (signr == -1)
		return;

	signal(signr, SIG_DFL);
	raise(signr);
}

#ifdef HAVE_AUXTRACE_SUPPORT

static int record__process_auxtrace(struct perf_tool *tool,
				    struct mmap *map,
				    union perf_event *event, void *data1,
				    size_t len1, void *data2, size_t len2)
{
	struct record *rec = container_of(tool, struct record, tool);
	struct perf_data *data = &rec->data;
	size_t padding;
	u8 pad[8] = {0};

	if (!perf_data__is_pipe(data) && perf_data__is_single_file(data)) {
		off_t file_offset;
		int fd = perf_data__fd(data);
		int err;

		file_offset = lseek(fd, 0, SEEK_CUR);
		if (file_offset == -1)
			return -1;
		err = auxtrace_index__auxtrace_event(&rec->session->auxtrace_index,
						     event, file_offset);
		if (err)
			return err;
	}

	/* event.auxtrace.size includes padding, see __auxtrace_mmap__read() */
	padding = (len1 + len2) & 7;
	if (padding)
		padding = 8 - padding;

	record__write(rec, map, event, event->header.size);
	record__write(rec, map, data1, len1);
	if (len2)
		record__write(rec, map, data2, len2);
	record__write(rec, map, &pad, padding);

	return 0;
}

static int record__auxtrace_mmap_read(struct record *rec,
				      struct mmap *map)
{
	int ret;

	ret = auxtrace_mmap__read(map, rec->itr, &rec->tool,
				  record__process_auxtrace);
	if (ret < 0)
		return ret;

	if (ret)
		rec->samples++;

	return 0;
}

static int record__auxtrace_mmap_read_snapshot(struct record *rec,
					       struct mmap *map)
{
	int ret;

	ret = auxtrace_mmap__read_snapshot(map, rec->itr, &rec->tool,
					   record__process_auxtrace,
					   rec->opts.auxtrace_snapshot_size);
	if (ret < 0)
		return ret;

	if (ret)
		rec->samples++;

	return 0;
}

static int record__auxtrace_read_snapshot_all(struct record *rec)
{
	int i;
	int rc = 0;

	for (i = 0; i < rec->evlist->core.nr_mmaps; i++) {
		struct mmap *map = &rec->evlist->mmap[i];

		if (!map->auxtrace_mmap.base)
			continue;

		if (record__auxtrace_mmap_read_snapshot(rec, map) != 0) {
			rc = -1;
			goto out;
		}
	}
out:
	return rc;
}

static void record__read_auxtrace_snapshot(struct record *rec, bool on_exit)
{
	pr_debug("Recording AUX area tracing snapshot\n");
	if (record__auxtrace_read_snapshot_all(rec) < 0) {
		trigger_error(&auxtrace_snapshot_trigger);
	} else {
		if (auxtrace_record__snapshot_finish(rec->itr, on_exit))
			trigger_error(&auxtrace_snapshot_trigger);
		else
			trigger_ready(&auxtrace_snapshot_trigger);
	}
}

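/*
 * When AUX area data is captured in snapshot mode, grab one last snapshot
 * at exit time: start a snapshot if none is pending, read it, and report
 * an error if the snapshot trigger ended up in error state.
 */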
static int record__auxtrace_snapshot_exit(struct record *rec)
{
	if (trigger_is_error(&auxtrace_snapshot_trigger))
		return 0;

	if (!auxtrace_record__snapshot_started &&
	    auxtrace_record__snapshot_start(rec->itr))
		return -1;

	record__read_auxtrace_snapshot(rec, true);
	if (trigger_is_error(&auxtrace_snapshot_trigger))
		return -1;

	return 0;
}

static int record__auxtrace_init(struct record *rec)
{
	int err;

	if (!rec->itr) {
		rec->itr = auxtrace_record__init(rec->evlist, &err);
		if (err)
			return err;
	}

	err = auxtrace_parse_snapshot_options(rec->itr, &rec->opts,
					      rec->opts.auxtrace_snapshot_opts);
	if (err)
		return err;

	err = auxtrace_parse_sample_options(rec->itr, rec->evlist, &rec->opts,
					    rec->opts.auxtrace_sample_opts);
	if (err)
		return err;

	return auxtrace_parse_filters(rec->evlist);
}

#else

static inline
int record__auxtrace_mmap_read(struct record *rec __maybe_unused,
			       struct mmap *map __maybe_unused)
{
	return 0;
}

static inline
void record__read_auxtrace_snapshot(struct record *rec __maybe_unused,
				    bool on_exit __maybe_unused)
{
}

static inline
int auxtrace_record__snapshot_start(struct auxtrace_record *itr __maybe_unused)
{
	return 0;
}

static inline
int record__auxtrace_snapshot_exit(struct record *rec __maybe_unused)
{
	return 0;
}

static int record__auxtrace_init(struct record *rec __maybe_unused)
{
	return 0;
}

#endif

static bool record__kcore_readable(struct machine *machine)
{
	char kcore[PATH_MAX];
	int fd;

	scnprintf(kcore, sizeof(kcore), "%s/proc/kcore", machine->root_dir);

	fd = open(kcore, O_RDONLY);
	if (fd < 0)
		return false;

	close(fd);

	return true;
}

static int record__kcore_copy(struct machine *machine, struct perf_data *data)
{
	char from_dir[PATH_MAX];
	char kcore_dir[PATH_MAX];
	int ret;

	snprintf(from_dir, sizeof(from_dir), "%s/proc", machine->root_dir);

	ret = perf_data__make_kcore_dir(data, kcore_dir, sizeof(kcore_dir));
	if (ret)
		return ret;

	return kcore_copy(from_dir, kcore_dir);
}

static int record__mmap_evlist(struct record *rec,
			       struct evlist *evlist)
{
	struct record_opts *opts = &rec->opts;
	bool auxtrace_overwrite = opts->auxtrace_snapshot_mode ||
				  opts->auxtrace_sample_mode;
	char msg[512];

	if (opts->affinity != PERF_AFFINITY_SYS)
		cpu__setup_cpunode_map();

	if (evlist__mmap_ex(evlist, opts->mmap_pages,
			    opts->auxtrace_mmap_pages,
			    auxtrace_overwrite,
			    opts->nr_cblocks, opts->affinity,
			    opts->mmap_flush, opts->comp_level) < 0) {
		if (errno == EPERM) {
			pr_err("Permission error mapping pages.\n"
			       "Consider increasing "
			       "/proc/sys/kernel/perf_event_mlock_kb,\n"
			       "or try again with a smaller value of -m/--mmap_pages.\n"
			       "(current value: %u,%u)\n",
			       opts->mmap_pages, opts->auxtrace_mmap_pages);
			return -errno;
		} else {
			pr_err("failed to mmap with %d (%s)\n", errno,
			       str_error_r(errno, msg, sizeof(msg)));
			if (errno)
				return -errno;
			else
				return -EINVAL;
		}
	}
	return 0;
}

static int record__mmap(struct record *rec)
{
	return record__mmap_evlist(rec, rec->evlist);
}

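/*
 * Open all events in the evlist, falling back to softer configurations
 * (and dropping weak group members) when the exact request cannot be
 * satisfied, then apply event filters and mmap the ring buffers.
 */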
static int record__open(struct record *rec)
{
	char msg[BUFSIZ];
	struct evsel *pos;
	struct evlist *evlist = rec->evlist;
	struct perf_session *session = rec->session;
	struct record_opts *opts = &rec->opts;
	int rc = 0;

	/*
	 * For initial_delay we need to add a dummy event so that we can track
	 * PERF_RECORD_MMAP while we wait for the initial delay to enable the
	 * real events, the ones asked for by the user.
	 */
	if (opts->initial_delay) {
		if (perf_evlist__add_dummy(evlist))
			return -ENOMEM;

		pos = evlist__first(evlist);
		pos->tracking = 0;
		pos = evlist__last(evlist);
		pos->tracking = 1;
		pos->core.attr.enable_on_exec = 1;
	}

	perf_evlist__config(evlist, opts, &callchain_param);

	evlist__for_each_entry(evlist, pos) {
try_again:
		if (evsel__open(pos, pos->core.cpus, pos->core.threads) < 0) {
			if (perf_evsel__fallback(pos, errno, msg, sizeof(msg))) {
				if (verbose > 0)
					ui__warning("%s\n", msg);
				goto try_again;
			}
			if ((errno == EINVAL || errno == EBADF) &&
			    pos->leader != pos &&
			    pos->weak_group) {
				pos = perf_evlist__reset_weak_group(evlist, pos);
				goto try_again;
			}
			rc = -errno;
			perf_evsel__open_strerror(pos, &opts->target,
						  errno, msg, sizeof(msg));
			ui__error("%s\n", msg);
			goto out;
		}

		pos->supported = true;
	}

	if (symbol_conf.kptr_restrict && !perf_evlist__exclude_kernel(evlist)) {
		pr_warning(
"WARNING: Kernel address maps (/proc/{kallsyms,modules}) are restricted,\n"
"check /proc/sys/kernel/kptr_restrict and /proc/sys/kernel/perf_event_paranoid.\n\n"
"Samples in kernel functions may not be resolved if a suitable vmlinux\n"
"file is not found in the buildid cache or in the vmlinux path.\n\n"
"Samples in kernel modules won't be resolved at all.\n\n"
"If some relocation was applied (e.g. kexec) symbols may be misresolved\n"
"even with a suitable vmlinux or kallsyms file.\n\n");
	}

	if (perf_evlist__apply_filters(evlist, &pos)) {
		pr_err("failed to set filter \"%s\" on event %s with %d (%s)\n",
			pos->filter, perf_evsel__name(pos), errno,
			str_error_r(errno, msg, sizeof(msg)));
		rc = -1;
		goto out;
	}

	rc = record__mmap(rec);
	if (rc)
		goto out;

	session->evlist = evlist;
	perf_session__set_id_hdr_size(session);
out:
	return rc;
}

static int process_sample_event(struct perf_tool *tool,
				union perf_event *event,
				struct perf_sample *sample,
				struct evsel *evsel,
				struct machine *machine)
{
	struct record *rec = container_of(tool, struct record, tool);

	if (rec->evlist->first_sample_time == 0)
		rec->evlist->first_sample_time = sample->time;

	rec->evlist->last_sample_time = sample->time;

	if (rec->buildid_all)
		return 0;

	rec->samples++;
	return build_id__mark_dso_hit(tool, event, sample, evsel, machine);
}

static int process_buildids(struct record *rec)
{
	struct perf_session *session = rec->session;

	if (perf_data__size(&rec->data) == 0)
		return 0;

	/*
	 * During this process, it'll load the kernel map and replace
	 * dso->long_name with a real pathname it found.  In this case
	 * we prefer the vmlinux path like
	 *   /lib/modules/3.16.4/build/vmlinux
	 *
	 * rather than the build-id path (in the debug directory):
	 *   $HOME/.debug/.build-id/f0/6e17aa50adf4d00b88925e03775de107611551
	 */
	symbol_conf.ignore_vmlinux_buildid = true;

	/*
	 * If --buildid-all is given, it marks all DSOs regardless of hits,
	 * so there is no need to process samples. But if timestamp_boundary
	 * is enabled, it still needs to walk all samples to get the
	 * timestamps of the first/last samples.
	 */
	if (rec->buildid_all && !rec->timestamp_boundary)
		rec->tool.sample = NULL;

	return perf_session__process_events(session);
}

static void perf_event__synthesize_guest_os(struct machine *machine, void *data)
{
	int err;
	struct perf_tool *tool = data;

	/*
	 * For the guest kernel, when processing the record & report
	 * subcommands, we arrange the module mmaps prior to the guest kernel
	 * mmap and trigger a dso preload, because by default guest module
	 * symbols are loaded from guest kallsyms instead of
	 * /lib/modules/XXX/XXX. This is done to avoid missing symbols when
	 * the first address is in a module instead of in the guest kernel.
	 */
	err = perf_event__synthesize_modules(tool, process_synthesized_event,
					     machine);
	if (err < 0)
		pr_err("Couldn't record guest kernel [%d]'s reference"
		       " relocation symbol.\n", machine->pid);

	/*
	 * We use _stext for the guest kernel because the guest kernel's
	 * /proc/kallsyms sometimes has no _text.
	 */
	err = perf_event__synthesize_kernel_mmap(tool, process_synthesized_event,
						 machine);
	if (err < 0)
		pr_err("Couldn't record guest kernel [%d]'s reference"
		       " relocation symbol.\n", machine->pid);
}

static struct perf_event_header finished_round_event = {
	.size = sizeof(struct perf_event_header),
	.type = PERF_RECORD_FINISHED_ROUND,
};

static void record__adjust_affinity(struct record *rec, struct mmap *map)
{
	if (rec->opts.affinity != PERF_AFFINITY_SYS &&
	    !CPU_EQUAL(&rec->affinity_mask, &map->affinity_mask)) {
		CPU_ZERO(&rec->affinity_mask);
		CPU_OR(&rec->affinity_mask, &rec->affinity_mask, &map->affinity_mask);
		sched_setaffinity(0, sizeof(rec->affinity_mask), &rec->affinity_mask);
	}
}

static size_t process_comp_header(void *record, size_t increment)
{
	struct perf_record_compressed *event = record;
	size_t size = sizeof(*event);

	if (increment) {
		event->header.size += increment;
		return increment;
	}

	event->header.type = PERF_RECORD_COMPRESSED;
	event->header.size = size;

	return size;
}

static size_t zstd_compress(struct perf_session *session, void *dst, size_t dst_size,
			    void *src, size_t src_size)
{
	size_t compressed;
	size_t max_record_size = PERF_SAMPLE_MAX_SIZE - sizeof(struct perf_record_compressed) - 1;

	compressed = zstd_compress_stream_to_records(&session->zstd_data, dst, dst_size, src, src_size,
						     max_record_size, process_comp_header);

	session->bytes_transferred += src_size;
	session->bytes_compressed += compressed;

	return compressed;
}

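/*
 * Drain every mmap of the given kind (regular or overwritable): push the
 * data out either synchronously or via aio, read any AUX area data, and
 * append a PERF_RECORD_FINISHED_ROUND event if something was written.
 */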
static int record__mmap_read_evlist(struct record *rec, struct evlist *evlist,
				    bool overwrite, bool synch)
{
	u64 bytes_written = rec->bytes_written;
	int i;
	int rc = 0;
	struct mmap *maps;
	int trace_fd = rec->data.file.fd;
	off_t off = 0;

	if (!evlist)
		return 0;

	maps = overwrite ? evlist->overwrite_mmap : evlist->mmap;
	if (!maps)
		return 0;

	if (overwrite && evlist->bkw_mmap_state != BKW_MMAP_DATA_PENDING)
		return 0;

	if (record__aio_enabled(rec))
		off = record__aio_get_pos(trace_fd);

	for (i = 0; i < evlist->core.nr_mmaps; i++) {
		u64 flush = 0;
		struct mmap *map = &maps[i];

		if (map->core.base) {
			record__adjust_affinity(rec, map);
			if (synch) {
				flush = map->core.flush;
				map->core.flush = 1;
			}
			if (!record__aio_enabled(rec)) {
				if (perf_mmap__push(map, rec, record__pushfn) < 0) {
					if (synch)
						map->core.flush = flush;
					rc = -1;
					goto out;
				}
			} else {
				if (record__aio_push(rec, map, &off) < 0) {
					record__aio_set_pos(trace_fd, off);
					if (synch)
						map->core.flush = flush;
					rc = -1;
					goto out;
				}
			}
			if (synch)
				map->core.flush = flush;
		}

		if (map->auxtrace_mmap.base && !rec->opts.auxtrace_snapshot_mode &&
		    !rec->opts.auxtrace_sample_mode &&
		    record__auxtrace_mmap_read(rec, map) != 0) {
			rc = -1;
			goto out;
		}
	}

	if (record__aio_enabled(rec))
		record__aio_set_pos(trace_fd, off);

	/*
	 * Mark the round finished in case we wrote
	 * at least one event.
	 */
	if (bytes_written != rec->bytes_written)
		rc = record__write(rec, NULL, &finished_round_event, sizeof(finished_round_event));

	if (overwrite)
		perf_evlist__toggle_bkw_mmap(evlist, BKW_MMAP_EMPTY);
out:
	return rc;
}

static int record__mmap_read_all(struct record *rec, bool synch)
{
	int err;

	err = record__mmap_read_evlist(rec, rec->evlist, false, synch);
	if (err)
		return err;

	return record__mmap_read_evlist(rec, rec->evlist, true, synch);
}

static void record__init_features(struct record *rec)
{
	struct perf_session *session = rec->session;
	int feat;

	for (feat = HEADER_FIRST_FEATURE; feat < HEADER_LAST_FEATURE; feat++)
		perf_header__set_feat(&session->header, feat);

	if (rec->no_buildid)
		perf_header__clear_feat(&session->header, HEADER_BUILD_ID);

	if (!have_tracepoints(&rec->evlist->core.entries))
		perf_header__clear_feat(&session->header, HEADER_TRACING_DATA);

	if (!rec->opts.branch_stack)
		perf_header__clear_feat(&session->header, HEADER_BRANCH_STACK);

	if (!rec->opts.full_auxtrace)
		perf_header__clear_feat(&session->header, HEADER_AUXTRACE);

	if (!(rec->opts.use_clockid && rec->opts.clockid_res_ns))
		perf_header__clear_feat(&session->header, HEADER_CLOCKID);

	perf_header__clear_feat(&session->header, HEADER_DIR_FORMAT);
	if (!record__comp_enabled(rec))
		perf_header__clear_feat(&session->header, HEADER_COMPRESSED);

	perf_header__clear_feat(&session->header, HEADER_STAT);
}

static void
record__finish_output(struct record *rec)
{
	struct perf_data *data = &rec->data;
	int fd = perf_data__fd(data);

	if (data->is_pipe)
		return;

	rec->session->header.data_size += rec->bytes_written;
	data->file.size = lseek(perf_data__fd(data), 0, SEEK_CUR);

	if (!rec->no_buildid) {
		process_buildids(rec);

		if (rec->buildid_all)
			dsos__hit_all(rec->session);
	}
	perf_session__write_header(rec->session, rec->evlist, fd, true);

	return;
}

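/*
 * Synthesize a thread map covering the workload forked by perf itself so
 * its comm/mmap entries end up in the output; whether this runs at the
 * start or at the end of the session is controlled by tail_synthesize.
 */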
static int record__synthesize_workload(struct record *rec, bool tail)
{
	int err;
	struct perf_thread_map *thread_map;

	if (rec->opts.tail_synthesize != tail)
		return 0;

	thread_map = thread_map__new_by_tid(rec->evlist->workload.pid);
	if (thread_map == NULL)
		return -1;

	err = perf_event__synthesize_thread_map(&rec->tool, thread_map,
						process_synthesized_event,
						&rec->session->machines.host,
						rec->opts.sample_address);
	perf_thread_map__put(thread_map);
	return err;
}

static int record__synthesize(struct record *rec, bool tail);

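/*
 * Rotate the output: finish the current perf.data, switch to a new file
 * named with the current timestamp and, when only a limited number of
 * files is kept, delete the oldest one.  Tracking events are
 * re-synthesized into the new file so it can be processed on its own.
 */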
static int
record__switch_output(struct record *rec, bool at_exit)
{
	struct perf_data *data = &rec->data;
	int fd, err;
	char *new_filename;

	/* Same size: "2015122520103046" */
	char timestamp[] = "InvalidTimestamp";

	record__aio_mmap_read_sync(rec);

	record__synthesize(rec, true);
	if (target__none(&rec->opts.target))
		record__synthesize_workload(rec, true);

	rec->samples = 0;
	record__finish_output(rec);
	err = fetch_current_timestamp(timestamp, sizeof(timestamp));
	if (err) {
		pr_err("Failed to get current timestamp\n");
		return -EINVAL;
	}

	fd = perf_data__switch(data, timestamp,
			       rec->session->header.data_offset,
			       at_exit, &new_filename);
	if (fd >= 0 && !at_exit) {
		rec->bytes_written = 0;
		rec->session->header.data_size = 0;
	}

	if (!quiet)
		fprintf(stderr, "[ perf record: Dump %s.%s ]\n",
			data->path, timestamp);

	if (rec->switch_output.num_files) {
		int n = rec->switch_output.cur_file + 1;

		if (n >= rec->switch_output.num_files)
			n = 0;
		rec->switch_output.cur_file = n;
		if (rec->switch_output.filenames[n]) {
			remove(rec->switch_output.filenames[n]);
			zfree(&rec->switch_output.filenames[n]);
		}
		rec->switch_output.filenames[n] = new_filename;
	} else {
		free(new_filename);
	}

	/* Output tracking events */
	if (!at_exit) {
		record__synthesize(rec, false);

		/*
		 * In 'perf record --switch-output' without -a,
		 * record__synthesize() in record__switch_output() won't
		 * generate tracking events because there's no thread_map
		 * in the evlist, which causes the newly created perf.data
		 * to not contain map and comm information.
		 * Create a fake thread_map and directly call
		 * perf_event__synthesize_thread_map() for those events.
		 */
		if (target__none(&rec->opts.target))
			record__synthesize_workload(rec, false);
	}
	return fd;
}

static volatile int workload_exec_errno;

/*
 * perf_evlist__prepare_workload will send a SIGUSR1
 * if the fork fails, since we asked for it by setting its
 * want_signal to true.
 */
static void workload_exec_failed_signal(int signo __maybe_unused,
					siginfo_t *info,
					void *ucontext __maybe_unused)
{
	workload_exec_errno = info->si_value.sival_int;
	done = 1;
	child_finished = 1;
}

static void snapshot_sig_handler(int sig);
static void alarm_sig_handler(int sig);

static const struct perf_event_mmap_page *
perf_evlist__pick_pc(struct evlist *evlist)
{
	if (evlist) {
		if (evlist->mmap && evlist->mmap[0].core.base)
			return evlist->mmap[0].core.base;
		if (evlist->overwrite_mmap && evlist->overwrite_mmap[0].core.base)
			return evlist->overwrite_mmap[0].core.base;
	}
	return NULL;
}

static const struct perf_event_mmap_page *record__pick_pc(struct record *rec)
{
	const struct perf_event_mmap_page *pc;

	pc = perf_evlist__pick_pc(rec->evlist);
	if (pc)
		return pc;
	return NULL;
}

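/*
 * Emit the synthetic (non-sample) events the report side needs: attrs,
 * features and tracing data for pipe mode, time conversion info, auxtrace
 * info, kernel and module maps, thread and cpu maps, BPF events and the
 * existing threads of the target.
 */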
static int record__synthesize(struct record *rec, bool tail)
{
	struct perf_session *session = rec->session;
	struct machine *machine = &session->machines.host;
	struct perf_data *data = &rec->data;
	struct record_opts *opts = &rec->opts;
	struct perf_tool *tool = &rec->tool;
	int fd = perf_data__fd(data);
	int err = 0;

	if (rec->opts.tail_synthesize != tail)
		return 0;

	if (data->is_pipe) {
		/*
		 * We need to synthesize events first, because some
		 * features work on top of them (on the report side).
		 */
		err = perf_event__synthesize_attrs(tool, rec->evlist,
						   process_synthesized_event);
		if (err < 0) {
			pr_err("Couldn't synthesize attrs.\n");
			goto out;
		}

		err = perf_event__synthesize_features(tool, session, rec->evlist,
						      process_synthesized_event);
		if (err < 0) {
			pr_err("Couldn't synthesize features.\n");
			return err;
		}

		if (have_tracepoints(&rec->evlist->core.entries)) {
			/*
			 * FIXME err <= 0 here actually means that
			 * there were no tracepoints so it's not really
			 * an error, just that we don't need to
			 * synthesize anything.  We really have to
			 * return this more properly and also
			 * propagate errors that now are calling die()
			 */
			err = perf_event__synthesize_tracing_data(tool, fd, rec->evlist,
								  process_synthesized_event);
			if (err <= 0) {
				pr_err("Couldn't record tracing data.\n");
				goto out;
			}
			rec->bytes_written += err;
		}
	}

	err = perf_event__synth_time_conv(record__pick_pc(rec), tool,
					  process_synthesized_event, machine);
	if (err)
		goto out;

	/* Synthesize id_index before auxtrace_info */
	if (rec->opts.auxtrace_sample_mode) {
		err = perf_event__synthesize_id_index(tool,
						      process_synthesized_event,
						      session->evlist, machine);
		if (err)
			goto out;
	}

	if (rec->opts.full_auxtrace) {
		err = perf_event__synthesize_auxtrace_info(rec->itr, tool,
					session, process_synthesized_event);
		if (err)
			goto out;
	}

	if (!perf_evlist__exclude_kernel(rec->evlist)) {
		err = perf_event__synthesize_kernel_mmap(tool, process_synthesized_event,
							 machine);
		WARN_ONCE(err < 0, "Couldn't record kernel reference relocation symbol\n"
			  "Symbol resolution may be skewed if relocation was used (e.g. kexec).\n"
			  "Check /proc/kallsyms permission or run as root.\n");

		err = perf_event__synthesize_modules(tool, process_synthesized_event,
						     machine);
		WARN_ONCE(err < 0, "Couldn't record kernel module information.\n"
			  "Symbol resolution may be skewed if relocation was used (e.g. kexec).\n"
			  "Check /proc/modules permission or run as root.\n");
	}

	if (perf_guest) {
		machines__process_guests(&session->machines,
					 perf_event__synthesize_guest_os, tool);
	}

	err = perf_event__synthesize_extra_attr(&rec->tool,
						rec->evlist,
						process_synthesized_event,
						data->is_pipe);
	if (err)
		goto out;

	err = perf_event__synthesize_thread_map2(&rec->tool, rec->evlist->core.threads,
						 process_synthesized_event,
						 NULL);
	if (err < 0) {
		pr_err("Couldn't synthesize thread map.\n");
		return err;
	}

	err = perf_event__synthesize_cpu_map(&rec->tool, rec->evlist->core.cpus,
					     process_synthesized_event, NULL);
	if (err < 0) {
		pr_err("Couldn't synthesize cpu map.\n");
		return err;
	}

	err = perf_event__synthesize_bpf_events(session, process_synthesized_event,
						machine, opts);
	if (err < 0)
		pr_warning("Couldn't synthesize bpf events.\n");

	err = __machine__synthesize_threads(machine, tool, &opts->target, rec->evlist->core.threads,
					    process_synthesized_event, opts->sample_address,
					    1);
out:
	return err;
}

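/*
 * The body of 'perf record': set up signals and the session, optionally
 * fork the workload, open and mmap the events, write the header, then run
 * the capture loop until the workload exits or recording is interrupted,
 * and finally write out sizes, features and build-ids.
 */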
static int __cmd_record(struct record *rec, int argc, const char **argv)
{
	int err;
	int status = 0;
	unsigned long waking = 0;
	const bool forks = argc > 0;
	struct perf_tool *tool = &rec->tool;
	struct record_opts *opts = &rec->opts;
	struct perf_data *data = &rec->data;
	struct perf_session *session;
	bool disabled = false, draining = false;
	struct evlist *sb_evlist = NULL;
	int fd;
	float ratio = 0;

	atexit(record__sig_exit);
	signal(SIGCHLD, sig_handler);
	signal(SIGINT, sig_handler);
	signal(SIGTERM, sig_handler);
	signal(SIGSEGV, sigsegv_handler);

	if (rec->opts.record_namespaces)
		tool->namespace_events = true;

	if (rec->opts.auxtrace_snapshot_mode || rec->switch_output.enabled) {
		signal(SIGUSR2, snapshot_sig_handler);
		if (rec->opts.auxtrace_snapshot_mode)
			trigger_on(&auxtrace_snapshot_trigger);
		if (rec->switch_output.enabled)
			trigger_on(&switch_output_trigger);
	} else {
		signal(SIGUSR2, SIG_IGN);
	}

	session = perf_session__new(data, false, tool);
	if (IS_ERR(session)) {
		pr_err("Perf session creation failed.\n");
		return PTR_ERR(session);
	}

	fd = perf_data__fd(data);
	rec->session = session;

	if (zstd_init(&session->zstd_data, rec->opts.comp_level) < 0) {
		pr_err("Compression initialization failed.\n");
		return -1;
	}

	session->header.env.comp_type = PERF_COMP_ZSTD;
	session->header.env.comp_level = rec->opts.comp_level;

	if (rec->opts.kcore &&
	    !record__kcore_readable(&session->machines.host)) {
		pr_err("ERROR: kcore is not readable.\n");
		return -1;
	}

	record__init_features(rec);

	if (rec->opts.use_clockid && rec->opts.clockid_res_ns)
		session->header.env.clockid_res_ns = rec->opts.clockid_res_ns;

	if (forks) {
		err = perf_evlist__prepare_workload(rec->evlist, &opts->target,
						    argv, data->is_pipe,
						    workload_exec_failed_signal);
		if (err < 0) {
			pr_err("Couldn't run the workload!\n");
			status = err;
			goto out_delete_session;
		}
	}

	/*
	 * If we have just a single event and are sending data
	 * through a pipe, we need to force the id allocation,
	 * because we synthesize the event name through the pipe
	 * and need the id for that.
	 */
	if (data->is_pipe && rec->evlist->core.nr_entries == 1)
		rec->opts.sample_id = true;

	if (record__open(rec) != 0) {
		err = -1;
		goto out_child;
	}
	session->header.env.comp_mmap_len = session->evlist->core.mmap_len;

	if (rec->opts.kcore) {
		err = record__kcore_copy(&session->machines.host, data);
		if (err) {
			pr_err("ERROR: Failed to copy kcore\n");
			goto out_child;
		}
	}

	err = bpf__apply_obj_config();
	if (err) {
		char errbuf[BUFSIZ];

		bpf__strerror_apply_obj_config(err, errbuf, sizeof(errbuf));
		pr_err("ERROR: Apply config to BPF failed: %s\n",
			 errbuf);
		goto out_child;
	}

	/*
	 * Normally perf_session__new would do this, but it doesn't have the
	 * evlist.
	 */
	if (rec->tool.ordered_events && !perf_evlist__sample_id_all(rec->evlist)) {
		pr_warning("WARNING: No sample_id_all support, falling back to unordered processing\n");
		rec->tool.ordered_events = false;
	}

	if (!rec->evlist->nr_groups)
		perf_header__clear_feat(&session->header, HEADER_GROUP_DESC);

	if (data->is_pipe) {
		err = perf_header__write_pipe(fd);
		if (err < 0)
			goto out_child;
	} else {
		err = perf_session__write_header(session, rec->evlist, fd, false);
		if (err < 0)
			goto out_child;
	}

	if (!rec->no_buildid
	    && !perf_header__has_feat(&session->header, HEADER_BUILD_ID)) {
		pr_err("Couldn't generate buildids. "
		       "Use --no-buildid to profile anyway.\n");
		err = -1;
		goto out_child;
	}

	if (!opts->no_bpf_event)
		bpf_event__add_sb_event(&sb_evlist, &session->header.env);

	if (perf_evlist__start_sb_thread(sb_evlist, &rec->opts.target)) {
		pr_debug("Couldn't start the BPF side band thread:\nBPF programs starting from now on won't be annotatable\n");
		opts->no_bpf_event = true;
	}

	err = record__synthesize(rec, false);
	if (err < 0)
		goto out_child;

	if (rec->realtime_prio) {
		struct sched_param param;

		param.sched_priority = rec->realtime_prio;
		if (sched_setscheduler(0, SCHED_FIFO, &param)) {
			pr_err("Could not set realtime priority.\n");
			err = -1;
			goto out_child;
		}
	}

	/*
	 * When perf is starting the traced process, all the events
	 * (apart from group members) have enable_on_exec=1 set,
	 * so don't spoil it by prematurely enabling them.
	 */
	if (!target__none(&opts->target) && !opts->initial_delay)
		evlist__enable(rec->evlist);

	/*
	 * Let the child rip
	 */
	if (forks) {
		struct machine *machine = &session->machines.host;
		union perf_event *event;
		pid_t tgid;

		event = malloc(sizeof(event->comm) + machine->id_hdr_size);
		if (event == NULL) {
			err = -ENOMEM;
			goto out_child;
		}

		/*
		 * Some H/W events are generated before the COMM event,
		 * which is emitted during exec(), so perf script
		 * cannot see a correct process name for those events.
		 * Synthesize a COMM event to prevent it.
		 */
		tgid = perf_event__synthesize_comm(tool, event,
						   rec->evlist->workload.pid,
						   process_synthesized_event,
						   machine);
		free(event);

		if (tgid == -1)
			goto out_child;

		event = malloc(sizeof(event->namespaces) +
			       (NR_NAMESPACES * sizeof(struct perf_ns_link_info)) +
			       machine->id_hdr_size);
		if (event == NULL) {
			err = -ENOMEM;
			goto out_child;
		}

		/*
		 * Synthesize a NAMESPACES event for the command specified.
		 */
		perf_event__synthesize_namespaces(tool, event,
						  rec->evlist->workload.pid,
						  tgid, process_synthesized_event,
						  machine);
		free(event);

		perf_evlist__start_workload(rec->evlist);
	}

	if (opts->initial_delay) {
		usleep(opts->initial_delay * USEC_PER_MSEC);
		evlist__enable(rec->evlist);
	}

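	/*
	 * Main capture loop: drain the mmaps, service the AUX snapshot and
	 * switch-output triggers, and poll for new data when a pass did not
	 * produce any samples.
	 */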
	trigger_ready(&auxtrace_snapshot_trigger);
	trigger_ready(&switch_output_trigger);
	perf_hooks__invoke_record_start();
	for (;;) {
		unsigned long long hits = rec->samples;

		/*
		 * rec->evlist->bkw_mmap_state may be BKW_MMAP_EMPTY here:
		 * when done == true and hits != rec->samples in the
		 * previous round.
		 *
		 * perf_evlist__toggle_bkw_mmap ensures we never
		 * convert BKW_MMAP_EMPTY to BKW_MMAP_DATA_PENDING.
		 */
		if (trigger_is_hit(&switch_output_trigger) || done || draining)
			perf_evlist__toggle_bkw_mmap(rec->evlist, BKW_MMAP_DATA_PENDING);

		if (record__mmap_read_all(rec, false) < 0) {
			trigger_error(&auxtrace_snapshot_trigger);
			trigger_error(&switch_output_trigger);
			err = -1;
			goto out_child;
		}

		if (auxtrace_record__snapshot_started) {
			auxtrace_record__snapshot_started = 0;
			if (!trigger_is_error(&auxtrace_snapshot_trigger))
				record__read_auxtrace_snapshot(rec, false);
			if (trigger_is_error(&auxtrace_snapshot_trigger)) {
				pr_err("AUX area tracing snapshot failed\n");
				err = -1;
				goto out_child;
			}
		}

		if (trigger_is_hit(&switch_output_trigger)) {
			/*
			 * If switch_output_trigger is hit, the data in the
			 * overwritable ring buffer should have been collected,
			 * so bkw_mmap_state should be set to BKW_MMAP_EMPTY.
			 *
			 * If SIGUSR2 is raised after or during record__mmap_read_all(),
			 * record__mmap_read_all() didn't collect data from
			 * the overwritable ring buffer. Read again.
			 */
			if (rec->evlist->bkw_mmap_state == BKW_MMAP_RUNNING)
				continue;
			trigger_ready(&switch_output_trigger);

			/*
			 * Reenable events in the overwrite ring buffer after
			 * record__mmap_read_all(): we should have collected
			 * data from it.
			 */
			perf_evlist__toggle_bkw_mmap(rec->evlist, BKW_MMAP_RUNNING);

			if (!quiet)
				fprintf(stderr, "[ perf record: dump data: Woken up %ld times ]\n",
					waking);
			waking = 0;
			fd = record__switch_output(rec, false);
			if (fd < 0) {
				pr_err("Failed to switch to new file\n");
				trigger_error(&switch_output_trigger);
				err = fd;
				goto out_child;
			}

			/* re-arm the alarm */
			if (rec->switch_output.time)
				alarm(rec->switch_output.time);
		}

		if (hits == rec->samples) {
			if (done || draining)
				break;
			err = evlist__poll(rec->evlist, -1);
			/*
			 * Propagate the error only if there is any. Ignore a
			 * positive number of returned events and interrupt error.
			 */
			if (err > 0 || (err < 0 && errno == EINTR))
				err = 0;
			waking++;

			if (evlist__filter_pollfd(rec->evlist, POLLERR | POLLHUP) == 0)
				draining = true;
		}

		/*
		 * When perf is starting the traced process, at the end events
		 * die with the process and we wait for that. Thus no need to
		 * disable events in this case.
		 */
		if (done && !disabled && !target__none(&opts->target)) {
			trigger_off(&auxtrace_snapshot_trigger);
			evlist__disable(rec->evlist);
			disabled = true;
		}
	}

	trigger_off(&auxtrace_snapshot_trigger);
	trigger_off(&switch_output_trigger);

	if (opts->auxtrace_snapshot_on_exit)
		record__auxtrace_snapshot_exit(rec);

	if (forks && workload_exec_errno) {
		char msg[STRERR_BUFSIZE];
		const char *emsg = str_error_r(workload_exec_errno, msg, sizeof(msg));
		pr_err("Workload failed: %s\n", emsg);
		err = -1;
		goto out_child;
	}

	if (!quiet)
		fprintf(stderr, "[ perf record: Woken up %ld times to write data ]\n", waking);

	if (target__none(&rec->opts.target))
		record__synthesize_workload(rec, true);

out_child:
	record__mmap_read_all(rec, true);
	record__aio_mmap_read_sync(rec);

	if (rec->session->bytes_transferred && rec->session->bytes_compressed) {
		ratio = (float)rec->session->bytes_transferred/(float)rec->session->bytes_compressed;
		session->header.env.comp_ratio = ratio + 0.5;
	}

	if (forks) {
		int exit_status;

		if (!child_finished)
			kill(rec->evlist->workload.pid, SIGTERM);

		wait(&exit_status);

		if (err < 0)
			status = err;
		else if (WIFEXITED(exit_status))
			status = WEXITSTATUS(exit_status);
		else if (WIFSIGNALED(exit_status))
			signr = WTERMSIG(exit_status);
	} else
		status = err;

	record__synthesize(rec, true);
	/* this will be recalculated during process_buildids() */
	rec->samples = 0;

	if (!err) {
		if (!rec->timestamp_filename) {
			record__finish_output(rec);
		} else {
			fd = record__switch_output(rec, true);
			if (fd < 0) {
				status = fd;
				goto out_delete_session;
			}
		}
	}

	perf_hooks__invoke_record_end();

	if (!err && !quiet) {
		char samples[128];
		const char *postfix = rec->timestamp_filename ?
					".<timestamp>" : "";

		if (rec->samples && !rec->opts.full_auxtrace)
			scnprintf(samples, sizeof(samples),
				  " (%" PRIu64 " samples)", rec->samples);
		else
			samples[0] = '\0';

		fprintf(stderr, "[ perf record: Captured and wrote %.3f MB %s%s%s",
			perf_data__size(data) / 1024.0 / 1024.0,
			data->path, postfix, samples);
		if (ratio) {
			fprintf(stderr, ", compressed (original %.3f MB, ratio is %.3f)",
					rec->session->bytes_transferred / 1024.0 / 1024.0,
					ratio);
		}
		fprintf(stderr, " ]\n");
	}

out_delete_session:
	zstd_fini(&session->zstd_data);
	perf_session__delete(session);

	if (!opts->no_bpf_event)
		perf_evlist__stop_sb_thread(sb_evlist);
	return status;
}

static void callchain_debug(struct callchain_param *callchain)
{
	static const char *str[CALLCHAIN_MAX] = { "NONE", "FP", "DWARF", "LBR" };

	pr_debug("callchain: type %s\n", str[callchain->record_mode]);

	if (callchain->record_mode == CALLCHAIN_DWARF)
		pr_debug("callchain: stack dump size %d\n",
			 callchain->dump_size);
}

int record_opts__parse_callchain(struct record_opts *record,
				 struct callchain_param *callchain,
				 const char *arg, bool unset)
{
	int ret;
	callchain->enabled = !unset;

	/* --no-call-graph */
	if (unset) {
		callchain->record_mode = CALLCHAIN_NONE;
		pr_debug("callchain: disabled\n");
		return 0;
	}

	ret = parse_callchain_record_opt(arg, callchain);
	if (!ret) {
		/* Enable data address sampling for DWARF unwind. */
		if (callchain->record_mode == CALLCHAIN_DWARF)
			record->sample_address = true;
		callchain_debug(callchain);
	}

	return ret;
}

int record_parse_callchain_opt(const struct option *opt,
			       const char *arg,
			       int unset)
{
	return record_opts__parse_callchain(opt->value, &callchain_param, arg, unset);
}

int record_callchain_opt(const struct option *opt,
			 const char *arg __maybe_unused,
			 int unset __maybe_unused)
{
	struct callchain_param *callchain = opt->value;

	callchain->enabled = true;

	if (callchain->record_mode == CALLCHAIN_NONE)
		callchain->record_mode = CALLCHAIN_FP;

	callchain_debug(callchain);
	return 0;
}

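/*
 * Handle 'perf config' keys relevant to record:
 *
 *	record.build-id = cache | no-cache | skip
 *	record.call-graph           (forwarded to call-graph.record-mode)
 *	record.aio = <nr of control blocks>   (only with aio support)
 */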
static int perf_record_config(const char *var, const char *value, void *cb)
{
	struct record *rec = cb;

	if (!strcmp(var, "record.build-id")) {
		if (!strcmp(value, "cache"))
			rec->no_buildid_cache = false;
		else if (!strcmp(value, "no-cache"))
			rec->no_buildid_cache = true;
		else if (!strcmp(value, "skip"))
			rec->no_buildid = true;
		else
			return -1;
		return 0;
	}
	if (!strcmp(var, "record.call-graph")) {
		var = "call-graph.record-mode";
		return perf_default_config(var, value, cb);
	}
#ifdef HAVE_AIO_SUPPORT
	if (!strcmp(var, "record.aio")) {
		rec->opts.nr_cblocks = strtol(value, NULL, 0);
		if (!rec->opts.nr_cblocks)
			rec->opts.nr_cblocks = nr_cblocks_default;
	}
#endif

	return 0;
}

struct clockid_map {
	const char *name;
	int clockid;
};

#define CLOCKID_MAP(n, c)	\
	{ .name = n, .clockid = (c), }

#define CLOCKID_END	{ .name = NULL, }


/*
 * Add the missing ones, we need to build on many distros...
 */
#ifndef CLOCK_MONOTONIC_RAW
#define CLOCK_MONOTONIC_RAW 4
#endif
#ifndef CLOCK_BOOTTIME
#define CLOCK_BOOTTIME 7
#endif
#ifndef CLOCK_TAI
#define CLOCK_TAI 11
#endif

static const struct clockid_map clockids[] = {
	/* available for all events, NMI safe */
	CLOCKID_MAP("monotonic", CLOCK_MONOTONIC),
	CLOCKID_MAP("monotonic_raw", CLOCK_MONOTONIC_RAW),

	/* available for some events */
	CLOCKID_MAP("realtime", CLOCK_REALTIME),
	CLOCKID_MAP("boottime", CLOCK_BOOTTIME),
	CLOCKID_MAP("tai", CLOCK_TAI),

	/* available for the lazy */
	CLOCKID_MAP("mono", CLOCK_MONOTONIC),
	CLOCKID_MAP("raw", CLOCK_MONOTONIC_RAW),
	CLOCKID_MAP("real", CLOCK_REALTIME),
	CLOCKID_MAP("boot", CLOCK_BOOTTIME),

	CLOCKID_END,
};

static int get_clockid_res(clockid_t clk_id, u64 *res_ns)
{
	struct timespec res;

	*res_ns = 0;
	if (!clock_getres(clk_id, &res))
		*res_ns = res.tv_nsec + res.tv_sec * NSEC_PER_SEC;
	else
		pr_warning("WARNING: Failed to determine specified clock resolution.\n");

	return 0;
}

static int parse_clockid(const struct option *opt, const char *str, int unset)
{
	struct record_opts *opts = (struct record_opts *)opt->value;
	const struct clockid_map *cm;
	const char *ostr = str;

	if (unset) {
		opts->use_clockid = 0;
		return 0;
	}

	/* no arg passed */
	if (!str)
		return 0;

	/* no setting it twice */
	if (opts->use_clockid)
		return -1;

	opts->use_clockid = true;

	/* if it's a number, we're done */
	if (sscanf(str, "%d", &opts->clockid) == 1)
		return get_clockid_res(opts->clockid, &opts->clockid_res_ns);

	/* allow a "CLOCK_" prefix to the name */
	if (!strncasecmp(str, "CLOCK_", 6))
		str += 6;

	for (cm = clockids; cm->name; cm++) {
		if (!strcasecmp(str, cm->name)) {
			opts->clockid = cm->clockid;
			return get_clockid_res(opts->clockid,
					       &opts->clockid_res_ns);
		}
	}

	opts->use_clockid = false;
	ui__warning("unknown clockid %s, check man page\n", ostr);
	return -1;
}

static int record__parse_affinity(const struct option *opt, const char *str, int unset)
{
	struct record_opts *opts = (struct record_opts *)opt->value;

	if (unset || !str)
		return 0;

	if (!strcasecmp(str, "node"))
		opts->affinity = PERF_AFFINITY_NODE;
	else if (!strcasecmp(str, "cpu"))
		opts->affinity = PERF_AFFINITY_CPU;

	return 0;
}

static int parse_output_max_size(const struct option *opt,
				 const char *str, int unset)
{
	unsigned long *s = (unsigned long *)opt->value;
	static struct parse_tag tags_size[] = {
		{ .tag = 'B', .mult = 1 },
		{ .tag = 'K', .mult = 1 << 10 },
		{ .tag = 'M', .mult = 1 << 20 },
		{ .tag = 'G', .mult = 1 << 30 },
		{ .tag = 0 },
	};
	unsigned long val;

	if (unset) {
		*s = 0;
		return 0;
	}

	val = parse_tag_value(str, tags_size);
	if (val != (unsigned long) -1) {
		*s = val;
		return 0;
	}

	return -1;
}

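/*
 * Parse "<pages>[,<auxtrace pages>]": the part before the comma sizes the
 * regular ring buffers, the optional part after it sizes the AUX area
 * buffers.
 */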
static int record__parse_mmap_pages(const struct option *opt,
				    const char *str,
				    int unset __maybe_unused)
{
	struct record_opts *opts = opt->value;
	char *s, *p;
	unsigned int mmap_pages;
	int ret;

	if (!str)
		return -EINVAL;

	s = strdup(str);
	if (!s)
		return -ENOMEM;

	p = strchr(s, ',');
	if (p)
		*p = '\0';

	if (*s) {
		ret = __perf_evlist__parse_mmap_pages(&mmap_pages, s);
		if (ret)
			goto out_free;
		opts->mmap_pages = mmap_pages;
	}

	if (!p) {
		ret = 0;
		goto out_free;
	}

	ret = __perf_evlist__parse_mmap_pages(&mmap_pages, p + 1);
	if (ret)
		goto out_free;

	opts->auxtrace_mmap_pages = mmap_pages;

out_free:
	free(s);
	return ret;
}

static void switch_output_size_warn(struct record *rec)
{
	u64 wakeup_size = evlist__mmap_size(rec->opts.mmap_pages);
	struct switch_output *s = &rec->switch_output;

	wakeup_size /= 2;

	if (s->size < wakeup_size) {
		char buf[100];

		unit_number__scnprintf(buf, sizeof(buf), wakeup_size);
		pr_warning("WARNING: switch-output data size lower than "
			   "wakeup kernel buffer size (%s), "
			   "expect bigger perf.data sizes\n", buf);
	}
}

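/*
 * --switch-output accepts "signal" (rotate output on SIGUSR2), a size such
 * as "100M", or a time such as "30s"; the two tag tables below translate the
 * size and time suffixes into bytes and seconds respectively.
 */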
static int switch_output_setup(struct record *rec)
{
	struct switch_output *s = &rec->switch_output;
	static struct parse_tag tags_size[] = {
		{ .tag = 'B', .mult = 1 },
		{ .tag = 'K', .mult = 1 << 10 },
		{ .tag = 'M', .mult = 1 << 20 },
		{ .tag = 'G', .mult = 1 << 30 },
		{ .tag = 0 },
	};
	static struct parse_tag tags_time[] = {
		{ .tag = 's', .mult = 1 },
		{ .tag = 'm', .mult = 60 },
		{ .tag = 'h', .mult = 60*60 },
		{ .tag = 'd', .mult = 60*60*24 },
		{ .tag = 0 },
	};
	unsigned long val;

	if (!s->set)
		return 0;

	if (!strcmp(s->str, "signal")) {
		s->signal = true;
		pr_debug("switch-output with SIGUSR2 signal\n");
		goto enabled;
	}

	val = parse_tag_value(s->str, tags_size);
	if (val != (unsigned long) -1) {
		s->size = val;
		pr_debug("switch-output with %s size threshold\n", s->str);
		goto enabled;
	}

	val = parse_tag_value(s->str, tags_time);
	if (val != (unsigned long) -1) {
		s->time = val;
		pr_debug("switch-output with %s time threshold (%lu seconds)\n",
			 s->str, s->time);
		goto enabled;
	}

	return -1;

enabled:
	rec->timestamp_filename = true;
	s->enabled = true;

	if (s->size && !rec->opts.no_buffering)
		switch_output_size_warn(rec);

	return 0;
}

static const char * const __record_usage[] = {
	"perf record [<options>] [<command>]",
	"perf record [<options>] -- <command> [<options>]",
	NULL
};
const char * const *record_usage = __record_usage;

static int build_id__process_mmap(struct perf_tool *tool, union perf_event *event,
				  struct perf_sample *sample, struct machine *machine)
{
	/*
	 * We already have the kernel maps, put in place via perf_session__create_kernel_maps(),
	 * so no need to add them twice.
	 */
	if (!(event->header.misc & PERF_RECORD_MISC_USER))
		return 0;
	return perf_event__process_mmap(tool, event, sample, machine);
}

static int build_id__process_mmap2(struct perf_tool *tool, union perf_event *event,
				   struct perf_sample *sample, struct machine *machine)
{
	/*
	 * We already have the kernel maps, put in place via perf_session__create_kernel_maps(),
	 * so no need to add them twice.
	 */
	if (!(event->header.misc & PERF_RECORD_MISC_USER))
		return 0;

	return perf_event__process_mmap2(tool, event, sample, machine);
}

/*
 * XXX Ideally this would be local to cmd_record() and passed to a record__new(),
 * but we need access to it in record__exit(), which is called after cmd_record()
 * exits, and record_options needs to be accessible to builtin-script, so leave
 * it here.
 *
 * At least we don't touch it in all the other functions here directly.
 *
 * Just say no to tons of global variables, sigh.
 */
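/*
 * Defaults: sample timestamps are on, sampling frequency is 4000 Hz and the
 * target uses per-CPU mmaps; mmap_pages, user_freq and user_interval are left
 * at UINT_MAX/ULLONG_MAX, which act as "not specified by the user" sentinels
 * that later setup code (record_opts__config() and the mmap code) resolves.
 */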
static struct record record = {
	.opts = {
		.sample_time = true,
		.mmap_pages = UINT_MAX,
		.user_freq = UINT_MAX,
		.user_interval = ULLONG_MAX,
		.freq = 4000,
		.target = {
			.uses_mmap = true,
			.default_per_cpu = true,
		},
		.mmap_flush = MMAP_FLUSH_DEFAULT,
	},
	.tool = {
		.sample = process_sample_event,
		.fork = perf_event__process_fork,
		.exit = perf_event__process_exit,
		.comm = perf_event__process_comm,
		.namespaces = perf_event__process_namespaces,
		.mmap = build_id__process_mmap,
		.mmap2 = build_id__process_mmap2,
		.ordered_events = true,
	},
};

const char record_callchain_help[] = CALLCHAIN_RECORD_HELP
	"\n\t\t\t\tDefault: fp";

static bool dry_run;

/*
 * XXX Will stay a global variable till we fix builtin-script.c to stop messing
 * with it and switch to use the library functions in perf_evlist that came
 * from builtin-record.c, i.e. use record_opts,
 * perf_evlist__prepare_workload, etc instead of fork+exec'ing 'perf record',
 * using pipes, etc.
 */
static struct option __record_options[] = {
	OPT_CALLBACK('e', "event", &record.evlist, "event",
		     "event selector. use 'perf list' to list available events",
		     parse_events_option),
	OPT_CALLBACK(0, "filter", &record.evlist, "filter",
		     "event filter", parse_filter),
	OPT_CALLBACK_NOOPT(0, "exclude-perf", &record.evlist,
			   NULL, "don't record events from perf itself",
			   exclude_perf),
	OPT_STRING('p', "pid", &record.opts.target.pid, "pid",
		   "record events on existing process id"),
	OPT_STRING('t', "tid", &record.opts.target.tid, "tid",
		   "record events on existing thread id"),
	OPT_INTEGER('r', "realtime", &record.realtime_prio,
		    "collect data with this RT SCHED_FIFO priority"),
	OPT_BOOLEAN(0, "no-buffering", &record.opts.no_buffering,
		    "collect data without buffering"),
	OPT_BOOLEAN('R', "raw-samples", &record.opts.raw_samples,
		    "collect raw sample records from all opened counters"),
	OPT_BOOLEAN('a', "all-cpus", &record.opts.target.system_wide,
		    "system-wide collection from all CPUs"),
	OPT_STRING('C', "cpu", &record.opts.target.cpu_list, "cpu",
		   "list of cpus to monitor"),
	OPT_U64('c', "count", &record.opts.user_interval, "event period to sample"),
	OPT_STRING('o', "output", &record.data.path, "file",
		   "output file name"),
	OPT_BOOLEAN_SET('i', "no-inherit", &record.opts.no_inherit,
			&record.opts.no_inherit_set,
			"child tasks do not inherit counters"),
	OPT_BOOLEAN(0, "tail-synthesize", &record.opts.tail_synthesize,
		    "synthesize non-sample events at the end of output"),
	OPT_BOOLEAN(0, "overwrite", &record.opts.overwrite, "use overwrite mode"),
	OPT_BOOLEAN(0, "no-bpf-event", &record.opts.no_bpf_event, "do not record bpf events"),
	OPT_BOOLEAN(0, "strict-freq", &record.opts.strict_freq,
		    "Fail if the specified frequency can't be used"),
	OPT_CALLBACK('F', "freq", &record.opts, "freq or 'max'",
		     "profile at this frequency",
		     record__parse_freq),
	OPT_CALLBACK('m', "mmap-pages", &record.opts, "pages[,pages]",
		     "number of mmap data pages and AUX area tracing mmap pages",
		     record__parse_mmap_pages),
	OPT_CALLBACK(0, "mmap-flush", &record.opts, "number",
		     "Minimal number of bytes that is extracted from mmap data pages (default: 1)",
		     record__mmap_flush_parse),
	OPT_BOOLEAN(0, "group", &record.opts.group,
		    "put the counters into a counter group"),
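	/*
	 * Call-graph options: -g turns on recording with the default method
	 * (fp, per record_callchain_help above), while --call-graph selects
	 * an explicit method such as fp, dwarf or lbr.
	 */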
	OPT_CALLBACK_NOOPT('g', NULL, &callchain_param,
			   NULL, "enables call-graph recording",
			   &record_callchain_opt),
	OPT_CALLBACK(0, "call-graph", &record.opts,
		     "record_mode[,record_size]", record_callchain_help,
		     &record_parse_callchain_opt),
	OPT_INCR('v', "verbose", &verbose,
		 "be more verbose (show counter open errors, etc)"),
	OPT_BOOLEAN('q', "quiet", &quiet, "don't print any message"),
	OPT_BOOLEAN('s', "stat", &record.opts.inherit_stat,
		    "per thread counts"),
	OPT_BOOLEAN('d', "data", &record.opts.sample_address, "Record the sample addresses"),
	OPT_BOOLEAN(0, "phys-data", &record.opts.sample_phys_addr,
		    "Record the sample physical addresses"),
	OPT_BOOLEAN(0, "sample-cpu", &record.opts.sample_cpu, "Record the sample cpu"),
	OPT_BOOLEAN_SET('T', "timestamp", &record.opts.sample_time,
			&record.opts.sample_time_set,
			"Record the sample timestamps"),
	OPT_BOOLEAN_SET('P', "period", &record.opts.period, &record.opts.period_set,
			"Record the sample period"),
	OPT_BOOLEAN('n', "no-samples", &record.opts.no_samples,
		    "don't sample"),
	OPT_BOOLEAN_SET('N', "no-buildid-cache", &record.no_buildid_cache,
			&record.no_buildid_cache_set,
			"do not update the buildid cache"),
	OPT_BOOLEAN_SET('B', "no-buildid", &record.no_buildid,
			&record.no_buildid_set,
			"do not collect buildids in perf.data"),
	OPT_CALLBACK('G', "cgroup", &record.evlist, "name",
		     "monitor event in cgroup name only",
		     parse_cgroups),
	OPT_UINTEGER('D', "delay", &record.opts.initial_delay,
		     "ms to wait before starting measurement after program start"),
	OPT_BOOLEAN(0, "kcore", &record.opts.kcore, "copy /proc/kcore"),
	OPT_STRING('u', "uid", &record.opts.target.uid_str, "user",
		   "user to profile"),

	OPT_CALLBACK_NOOPT('b', "branch-any", &record.opts.branch_stack,
			   "branch any", "sample any taken branches",
			   parse_branch_stack),

	OPT_CALLBACK('j', "branch-filter", &record.opts.branch_stack,
		     "branch filter mask", "branch stack filter modes",
		     parse_branch_stack),
	OPT_BOOLEAN('W', "weight", &record.opts.sample_weight,
		    "sample by weight (on special events only)"),
	OPT_BOOLEAN(0, "transaction", &record.opts.sample_transaction,
		    "sample transaction flags (special events only)"),
	OPT_BOOLEAN(0, "per-thread", &record.opts.target.per_thread,
		    "use per-thread mmaps"),
	OPT_CALLBACK_OPTARG('I', "intr-regs", &record.opts.sample_intr_regs, NULL, "any register",
			    "sample selected machine registers on interrupt,"
			    " use '-I?' to list register names", parse_intr_regs),
	OPT_CALLBACK_OPTARG(0, "user-regs", &record.opts.sample_user_regs, NULL, "any register",
			    "sample selected machine registers on interrupt,"
			    " use '--user-regs=?' to list register names", parse_user_regs),
	OPT_BOOLEAN(0, "running-time", &record.opts.running_time,
		    "Record running/enabled time of read (:S) events"),
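	/*
	 * -k/--clockid accepts either a numeric clockid or one of the names
	 * from the clockids[] table defined earlier in this file, optionally
	 * prefixed with "CLOCK_"; see parse_clockid().
	 */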
	OPT_CALLBACK('k', "clockid", &record.opts,
		     "clockid", "clockid to use for events, see clock_gettime()",
		     parse_clockid),
	OPT_STRING_OPTARG('S', "snapshot", &record.opts.auxtrace_snapshot_opts,
			  "opts", "AUX area tracing Snapshot Mode", ""),
	OPT_STRING_OPTARG(0, "aux-sample", &record.opts.auxtrace_sample_opts,
			  "opts", "sample AUX area", ""),
	OPT_UINTEGER(0, "proc-map-timeout", &proc_map_timeout,
		     "per thread proc mmap processing timeout in ms"),
	OPT_BOOLEAN(0, "namespaces", &record.opts.record_namespaces,
		    "Record namespaces events"),
	OPT_BOOLEAN(0, "switch-events", &record.opts.record_switch_events,
		    "Record context switch events"),
	OPT_BOOLEAN_FLAG(0, "all-kernel", &record.opts.all_kernel,
			 "Configure all used events to run in kernel space.",
			 PARSE_OPT_EXCLUSIVE),
	OPT_BOOLEAN_FLAG(0, "all-user", &record.opts.all_user,
			 "Configure all used events to run in user space.",
			 PARSE_OPT_EXCLUSIVE),
	OPT_BOOLEAN(0, "kernel-callchains", &record.opts.kernel_callchains,
		    "collect kernel callchains"),
	OPT_BOOLEAN(0, "user-callchains", &record.opts.user_callchains,
		    "collect user callchains"),
	OPT_STRING(0, "clang-path", &llvm_param.clang_path, "clang path",
		   "clang binary to use for compiling BPF scriptlets"),
	OPT_STRING(0, "clang-opt", &llvm_param.clang_opt, "clang options",
		   "options passed to clang when compiling BPF scriptlets"),
	OPT_STRING(0, "vmlinux", &symbol_conf.vmlinux_name,
		   "file", "vmlinux pathname"),
	OPT_BOOLEAN(0, "buildid-all", &record.buildid_all,
		    "Record build-id of all DSOs regardless of hits"),
	OPT_BOOLEAN(0, "timestamp-filename", &record.timestamp_filename,
		    "append timestamp to output filename"),
	OPT_BOOLEAN(0, "timestamp-boundary", &record.timestamp_boundary,
		    "Record timestamp boundary (time of first/last samples)"),
	OPT_STRING_OPTARG_SET(0, "switch-output", &record.switch_output.str,
			      &record.switch_output.set, "signal or size[BKMG] or time[smhd]",
			      "Switch output when receiving SIGUSR2 (signal) or cross a size or time threshold",
			      "signal"),
	OPT_INTEGER(0, "switch-max-files", &record.switch_output.num_files,
		    "Limit number of switch output generated files"),
	OPT_BOOLEAN(0, "dry-run", &dry_run,
		    "Parse options then exit"),
#ifdef HAVE_AIO_SUPPORT
	OPT_CALLBACK_OPTARG(0, "aio", &record.opts,
			    &nr_cblocks_default, "n", "Use <n> control blocks in asynchronous trace writing mode (default: 1, max: 4)",
			    record__aio_parse),
#endif
	OPT_CALLBACK(0, "affinity", &record.opts, "node|cpu",
		     "Set affinity mask of trace reading thread to NUMA node cpu mask or cpu of processed mmap buffer",
		     record__parse_affinity),
#ifdef HAVE_ZSTD_SUPPORT
	OPT_CALLBACK_OPTARG('z', "compression-level", &record.opts, &comp_level_default,
			    "n", "Compress records using specified level (default: 1 - fastest compression, 22 - greatest compression)",
			    record__parse_comp_level),
#endif
	OPT_CALLBACK(0, "max-size", &record.output_max_size,
		     "size", "Limit the maximum size of the output file", parse_output_max_size),
	OPT_END()
};

struct option *record_options = __record_options;

int cmd_record(int argc, const char **argv)
{
	int err;
	struct record *rec = &record;
	char errbuf[BUFSIZ];

	setlocale(LC_ALL, "");

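	/*
	 * If perf was built without libbpf (or without DWARF/BPF prologue
	 * support), mark the options that depend on those features as not
	 * built in, so option parsing can report them as unavailable instead
	 * of silently accepting them.
	 */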
#ifndef HAVE_LIBBPF_SUPPORT
# define set_nobuild(s, l, c) set_option_nobuild(record_options, s, l, "NO_LIBBPF=1", c)
	set_nobuild('\0', "clang-path", true);
	set_nobuild('\0', "clang-opt", true);
# undef set_nobuild
#endif

#ifndef HAVE_BPF_PROLOGUE
# if !defined (HAVE_DWARF_SUPPORT)
#  define REASON "NO_DWARF=1"
# elif !defined (HAVE_LIBBPF_SUPPORT)
#  define REASON "NO_LIBBPF=1"
# else
#  define REASON "this architecture doesn't support BPF prologue"
# endif
# define set_nobuild(s, l, c) set_option_nobuild(record_options, s, l, REASON, c)
	set_nobuild('\0', "vmlinux", true);
# undef set_nobuild
# undef REASON
#endif

	CPU_ZERO(&rec->affinity_mask);
	rec->opts.affinity = PERF_AFFINITY_SYS;

	rec->evlist = evlist__new();
	if (rec->evlist == NULL)
		return -ENOMEM;

	err = perf_config(perf_record_config, rec);
	if (err)
		return err;

	argc = parse_options(argc, argv, record_options, record_usage,
			     PARSE_OPT_STOP_AT_NON_OPTION);
	if (quiet)
		perf_quiet_option();

	/* Make system wide (-a) the default target. */
	if (!argc && target__none(&rec->opts.target))
		rec->opts.target.system_wide = true;

	if (nr_cgroups && !rec->opts.target.system_wide) {
		usage_with_options_msg(record_usage, record_options,
			"cgroup monitoring only available in system-wide mode");

	}

	if (rec->opts.kcore)
		rec->data.is_dir = true;

	if (rec->opts.comp_level != 0) {
		pr_debug("Compression enabled, disabling build id collection at the end of the session.\n");
		rec->no_buildid = true;
	}

	if (rec->opts.record_switch_events &&
	    !perf_can_record_switch_events()) {
		ui__error("kernel does not support recording context switch events\n");
		parse_options_usage(record_usage, record_options, "switch-events", 0);
		return -EINVAL;
	}

	if (switch_output_setup(rec)) {
		parse_options_usage(record_usage, record_options, "switch-output", 0);
		return -EINVAL;
	}

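	/*
	 * With a time based --switch-output threshold, arm a SIGALRM timer so
	 * that alarm_sig_handler() can fire the switch-output trigger when
	 * the interval expires.
	 */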
	if (rec->switch_output.time) {
		signal(SIGALRM, alarm_sig_handler);
		alarm(rec->switch_output.time);
	}

	if (rec->switch_output.num_files) {
		rec->switch_output.filenames = calloc(sizeof(char *),
						      rec->switch_output.num_files);
		if (!rec->switch_output.filenames)
			return -EINVAL;
	}

	/*
	 * Allow aliases to facilitate the lookup of symbols for address
	 * filters. Refer to auxtrace_parse_filters().
	 */
	symbol_conf.allow_aliases = true;

	symbol__init(NULL);

	err = record__auxtrace_init(rec);
	if (err)
		goto out;

	if (dry_run)
		goto out;

	err = bpf__setup_stdout(rec->evlist);
	if (err) {
		bpf__strerror_setup_stdout(rec->evlist, err, errbuf, sizeof(errbuf));
		pr_err("ERROR: Setup BPF stdout failed: %s\n",
		       errbuf);
		goto out;
	}

	err = -ENOMEM;

	if (rec->no_buildid_cache || rec->no_buildid) {
		disable_buildid_cache();
	} else if (rec->switch_output.enabled) {
		/*
		 * In 'perf record --switch-output', disable buildid
		 * generation by default to reduce data file switching
		 * overhead. Still generate buildids if they are required
		 * explicitly using
		 *
		 *  perf record --switch-output --no-no-buildid \
		 *              --no-no-buildid-cache
		 *
		 * The following code is equivalent to:
		 *
		 * if ((rec->no_buildid || !rec->no_buildid_set) &&
		 *     (rec->no_buildid_cache || !rec->no_buildid_cache_set))
		 *         disable_buildid_cache();
		 */
		bool disable = true;

		if (rec->no_buildid_set && !rec->no_buildid)
			disable = false;
		if (rec->no_buildid_cache_set && !rec->no_buildid_cache)
			disable = false;
		if (disable) {
			rec->no_buildid = true;
			rec->no_buildid_cache = true;
			disable_buildid_cache();
		}
	}

	if (record.opts.overwrite)
		record.opts.tail_synthesize = true;

	if (rec->evlist->core.nr_entries == 0 &&
	    __perf_evlist__add_default(rec->evlist, !record.opts.no_samples) < 0) {
		pr_err("Not enough memory for event selector list\n");
		goto out;
	}

	if (rec->opts.target.tid && !rec->opts.no_inherit_set)
		rec->opts.no_inherit = true;

	err = target__validate(&rec->opts.target);
	if (err) {
		target__strerror(&rec->opts.target, err, errbuf, BUFSIZ);
		ui__warning("%s\n", errbuf);
	}

	err = target__parse_uid(&rec->opts.target);
	if (err) {
		int saved_errno = errno;

		target__strerror(&rec->opts.target, err, errbuf, BUFSIZ);
		ui__error("%s", errbuf);

		err = -saved_errno;
		goto out;
	}

	/* Enable ignoring missing threads when -u/-p option is defined. */
	rec->opts.ignore_missing_thread = rec->opts.target.uid != UINT_MAX || rec->opts.target.pid;

	err = -ENOMEM;
	if (perf_evlist__create_maps(rec->evlist, &rec->opts.target) < 0)
		usage_with_options(record_usage, record_options);

	err = auxtrace_record__options(rec->itr, rec->evlist, &rec->opts);
	if (err)
		goto out;

	/*
	 * We take all buildids when the file contains AUX area tracing data
	 * because we do not decode the trace, as that would take too long.
	 */
	if (rec->opts.full_auxtrace)
		rec->buildid_all = true;

	if (record_opts__config(&rec->opts)) {
		err = -EINVAL;
		goto out;
	}

	if (rec->opts.nr_cblocks > nr_cblocks_max)
		rec->opts.nr_cblocks = nr_cblocks_max;
	pr_debug("nr_cblocks: %d\n", rec->opts.nr_cblocks);

	pr_debug("affinity: %s\n", affinity_tags[rec->opts.affinity]);
	pr_debug("mmap flush: %d\n", rec->opts.mmap_flush);

	if (rec->opts.comp_level > comp_level_max)
		rec->opts.comp_level = comp_level_max;
	pr_debug("comp level: %d\n", rec->opts.comp_level);

	err = __cmd_record(&record, argc, argv);
out:
	evlist__delete(rec->evlist);
	symbol__exit();
	auxtrace_record__free(rec->itr);
	return err;
}

static void snapshot_sig_handler(int sig __maybe_unused)
{
	struct record *rec = &record;

	if (trigger_is_ready(&auxtrace_snapshot_trigger)) {
		trigger_hit(&auxtrace_snapshot_trigger);
		auxtrace_record__snapshot_started = 1;
		if (auxtrace_record__snapshot_start(record.itr))
			trigger_error(&auxtrace_snapshot_trigger);
	}

	if (switch_output_signal(rec))
		trigger_hit(&switch_output_trigger);
}

static void alarm_sig_handler(int sig __maybe_unused)
{
	struct record *rec = &record;

	if (switch_output_time(rec))
		trigger_hit(&switch_output_trigger);
}