// SPDX-License-Identifier: GPL-2.0
/*
 * builtin-record.c
 *
 * Builtin record command: Record the profile of a workload
 * (or a CPU, or a PID) into the perf.data output file - for
 * later analysis via perf report.
 */
#include "builtin.h"

#include "util/build-id.h"
#include <subcmd/parse-options.h>
#include "util/parse-events.h"
#include "util/config.h"

#include "util/callchain.h"
#include "util/cgroup.h"
#include "util/header.h"
#include "util/event.h"
#include "util/evlist.h"
#include "util/evsel.h"
#include "util/debug.h"
#include "util/mmap.h"
#include "util/target.h"
#include "util/session.h"
#include "util/tool.h"
#include "util/symbol.h"
#include "util/record.h"
#include "util/cpumap.h"
#include "util/thread_map.h"
#include "util/data.h"
#include "util/perf_regs.h"
#include "util/auxtrace.h"
#include "util/tsc.h"
#include "util/parse-branch-options.h"
#include "util/parse-regs-options.h"
#include "util/perf_api_probe.h"
#include "util/llvm-utils.h"
#include "util/bpf-loader.h"
#include "util/trigger.h"
#include "util/perf-hooks.h"
#include "util/cpu-set-sched.h"
#include "util/synthetic-events.h"
#include "util/time-utils.h"
#include "util/units.h"
#include "util/bpf-event.h"
#include "util/util.h"
#include "asm/bug.h"
#include "perf.h"

#include <errno.h>
#include <inttypes.h>
#include <locale.h>
#include <poll.h>
#include <pthread.h>
#include <unistd.h>
#include <sched.h>
#include <signal.h>
#include <sys/mman.h>
#include <sys/wait.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <linux/err.h>
#include <linux/string.h>
#include <linux/time64.h>
#include <linux/zalloc.h>
#include <linux/bitmap.h>

struct switch_output {
	bool		 enabled;
	bool		 signal;
	unsigned long	 size;
	unsigned long	 time;
	const char	*str;
	bool		 set;
	char		 **filenames;
	int		 num_files;
	int		 cur_file;
};

struct record {
	struct perf_tool	tool;
	struct record_opts	opts;
	u64			bytes_written;
	struct perf_data	data;
	struct auxtrace_record	*itr;
	struct evlist		*evlist;
	struct perf_session	*session;
	struct evlist		*sb_evlist;
	pthread_t		thread_id;
	int			realtime_prio;
	bool			switch_output_event_set;
	bool			no_buildid;
	bool			no_buildid_set;
	bool			no_buildid_cache;
	bool			no_buildid_cache_set;
	bool			buildid_all;
	bool			timestamp_filename;
	bool			timestamp_boundary;
	struct switch_output	switch_output;
	unsigned long long	samples;
	struct mmap_cpu_mask	affinity_mask;
	unsigned long		output_max_size;	/* = 0: unlimited */
};

static volatile int done;

static volatile int auxtrace_record__snapshot_started;
static DEFINE_TRIGGER(auxtrace_snapshot_trigger);
static DEFINE_TRIGGER(switch_output_trigger);

static const char *affinity_tags[PERF_AFFINITY_MAX] = {
	"SYS", "NODE", "CPU"
};

static bool switch_output_signal(struct record *rec)
{
	return rec->switch_output.signal &&
	       trigger_is_ready(&switch_output_trigger);
}

static bool switch_output_size(struct record *rec)
{
	return rec->switch_output.size &&
	       trigger_is_ready(&switch_output_trigger) &&
	       (rec->bytes_written >= rec->switch_output.size);
}

static bool switch_output_time(struct record *rec)
{
	return rec->switch_output.time &&
	       trigger_is_ready(&switch_output_trigger);
}
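/*
 * Illustrative usage note, not part of the original file: the three helpers
 * above back the --switch-output option, which accepts "signal" (rotate the
 * output file on SIGUSR2), a size tag such as "100M", or a time tag such as
 * "30s"; see switch_output_setup() further below for the actual parsing.
 */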
static bool record__output_max_size_exceeded(struct record *rec)
{
	return rec->output_max_size &&
	       (rec->bytes_written >= rec->output_max_size);
}

static int record__write(struct record *rec, struct mmap *map __maybe_unused,
			 void *bf, size_t size)
{
	struct perf_data_file *file = &rec->session->data->file;

	if (perf_data_file__write(file, bf, size) < 0) {
		pr_err("failed to write perf data, error: %m\n");
		return -1;
	}

	rec->bytes_written += size;

	if (record__output_max_size_exceeded(rec) && !done) {
		fprintf(stderr, "[ perf record: perf size limit reached (%" PRIu64 " KB),"
				" stopping session ]\n",
				rec->bytes_written >> 10);
		done = 1;
	}

	if (switch_output_size(rec))
		trigger_hit(&switch_output_trigger);

	return 0;
}

static int record__aio_enabled(struct record *rec);
static int record__comp_enabled(struct record *rec);
static size_t zstd_compress(struct perf_session *session, void *dst, size_t dst_size,
			    void *src, size_t src_size);

#ifdef HAVE_AIO_SUPPORT
static int record__aio_write(struct aiocb *cblock, int trace_fd,
			     void *buf, size_t size, off_t off)
{
	int rc;

	cblock->aio_fildes = trace_fd;
	cblock->aio_buf    = buf;
	cblock->aio_nbytes = size;
	cblock->aio_offset = off;
	cblock->aio_sigevent.sigev_notify = SIGEV_NONE;

	do {
		rc = aio_write(cblock);
		if (rc == 0) {
			break;
		} else if (errno != EAGAIN) {
			cblock->aio_fildes = -1;
			pr_err("failed to queue perf data, error: %m\n");
			break;
		}
	} while (1);

	return rc;
}

static int record__aio_complete(struct mmap *md, struct aiocb *cblock)
{
	void *rem_buf;
	off_t rem_off;
	size_t rem_size;
	int rc, aio_errno;
	ssize_t aio_ret, written;

	aio_errno = aio_error(cblock);
	if (aio_errno == EINPROGRESS)
		return 0;

	written = aio_ret = aio_return(cblock);
	if (aio_ret < 0) {
		if (aio_errno != EINTR)
			pr_err("failed to write perf data, error: %m\n");
		written = 0;
	}

	rem_size = cblock->aio_nbytes - written;

	if (rem_size == 0) {
		cblock->aio_fildes = -1;
		/*
		 * md->refcount is incremented in record__aio_pushfn() for
		 * every aio write request started in record__aio_push() so
		 * decrement it because the request is now complete.
		 */
		perf_mmap__put(&md->core);
		rc = 1;
	} else {
		/*
		 * aio write request may require restart with the
		 * remainder if the kernel didn't write the whole
		 * chunk at once.
		 */
		rem_off = cblock->aio_offset + written;
		rem_buf = (void *)(cblock->aio_buf + written);
		record__aio_write(cblock, cblock->aio_fildes,
				  rem_buf, rem_size, rem_off);
		rc = 0;
	}

	return rc;
}

static int record__aio_sync(struct mmap *md, bool sync_all)
{
	struct aiocb **aiocb = md->aio.aiocb;
	struct aiocb *cblocks = md->aio.cblocks;
	struct timespec timeout = { 0, 1000 * 1000 * 1 }; /* 1ms */
	int i, do_suspend;

	do {
		do_suspend = 0;
		for (i = 0; i < md->aio.nr_cblocks; ++i) {
			if (cblocks[i].aio_fildes == -1 || record__aio_complete(md, &cblocks[i])) {
				if (sync_all)
					aiocb[i] = NULL;
				else
					return i;
			} else {
				/*
				 * Started aio write is not complete yet
				 * so it has to be waited before the
				 * next allocation.
				 */
				aiocb[i] = &cblocks[i];
				do_suspend = 1;
			}
		}
		if (!do_suspend)
			return -1;

		while (aio_suspend((const struct aiocb **)aiocb, md->aio.nr_cblocks, &timeout)) {
			if (!(errno == EAGAIN || errno == EINTR))
				pr_err("failed to sync perf data, error: %m\n");
		}
	} while (1);
}

struct record_aio {
	struct record	*rec;
	void		*data;
	size_t		size;
};

static int record__aio_pushfn(struct mmap *map, void *to, void *buf, size_t size)
{
	struct record_aio *aio = to;

	/*
	 * map->core.base data pointed by buf is copied into free map->aio.data[] buffer
	 * to release space in the kernel buffer as fast as possible, calling
	 * perf_mmap__consume() from perf_mmap__push() function.
	 *
	 * That lets the kernel proceed with storing more profiling data into
	 * the kernel buffer earlier than other per-cpu kernel buffers are handled.
	 *
	 * Copying can be done in two steps in case the chunk of profiling data
	 * crosses the upper bound of the kernel buffer. In this case we first move
	 * part of data from map->start till the upper bound and then the remainder
	 * from the beginning of the kernel buffer till the end of the data chunk.
	 */

	if (record__comp_enabled(aio->rec)) {
		size = zstd_compress(aio->rec->session, aio->data + aio->size,
				     mmap__mmap_len(map) - aio->size,
				     buf, size);
	} else {
		memcpy(aio->data + aio->size, buf, size);
	}

	if (!aio->size) {
		/*
		 * Increment map->refcount to guard map->aio.data[] buffer
		 * from premature deallocation because map object can be
		 * released earlier than aio write request started on
		 * map->aio.data[] buffer is complete.
		 *
		 * perf_mmap__put() is done at record__aio_complete()
		 * after started aio request completion or at record__aio_push()
		 * if the request failed to start.
		 */
		perf_mmap__get(&map->core);
	}

	aio->size += size;

	return size;
}

static int record__aio_push(struct record *rec, struct mmap *map, off_t *off)
{
	int ret, idx;
	int trace_fd = rec->session->data->file.fd;
	struct record_aio aio = { .rec = rec, .size = 0 };

	/*
	 * Call record__aio_sync() to wait till map->aio.data[] buffer
	 * becomes available after previous aio write operation.
	 */

	idx = record__aio_sync(map, false);
	aio.data = map->aio.data[idx];
	ret = perf_mmap__push(map, &aio, record__aio_pushfn);
	if (ret != 0) /* ret > 0 - no data, ret < 0 - error */
		return ret;

	rec->samples++;
	ret = record__aio_write(&(map->aio.cblocks[idx]), trace_fd, aio.data, aio.size, *off);
	if (!ret) {
		*off += aio.size;
		rec->bytes_written += aio.size;
		if (switch_output_size(rec))
			trigger_hit(&switch_output_trigger);
	} else {
		/*
		 * Decrement map->refcount incremented in record__aio_pushfn()
		 * back if record__aio_write() operation failed to start, otherwise
		 * map->refcount is decremented in record__aio_complete() after
		 * aio write operation finishes successfully.
		 */
		perf_mmap__put(&map->core);
	}

	return ret;
}
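/*
 * Illustrative summary, not part of the original file: with --aio[=n] the
 * data of each mmap is first copied (and optionally compressed) into one of
 * n map->aio.data[] buffers by record__aio_pushfn(), queued to the output
 * file with aio_write() in record__aio_push(), and reaped later in
 * record__aio_complete(); n is capped by nr_cblocks_max below.
 */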
static off_t record__aio_get_pos(int trace_fd)
{
	return lseek(trace_fd, 0, SEEK_CUR);
}

static void record__aio_set_pos(int trace_fd, off_t pos)
{
	lseek(trace_fd, pos, SEEK_SET);
}

static void record__aio_mmap_read_sync(struct record *rec)
{
	int i;
	struct evlist *evlist = rec->evlist;
	struct mmap *maps = evlist->mmap;

	if (!record__aio_enabled(rec))
		return;

	for (i = 0; i < evlist->core.nr_mmaps; i++) {
		struct mmap *map = &maps[i];

		if (map->core.base)
			record__aio_sync(map, true);
	}
}

static int nr_cblocks_default = 1;
static int nr_cblocks_max = 4;

static int record__aio_parse(const struct option *opt,
			     const char *str,
			     int unset)
{
	struct record_opts *opts = (struct record_opts *)opt->value;

	if (unset) {
		opts->nr_cblocks = 0;
	} else {
		if (str)
			opts->nr_cblocks = strtol(str, NULL, 0);
		if (!opts->nr_cblocks)
			opts->nr_cblocks = nr_cblocks_default;
	}

	return 0;
}
#else /* HAVE_AIO_SUPPORT */
static int nr_cblocks_max = 0;

static int record__aio_push(struct record *rec __maybe_unused, struct mmap *map __maybe_unused,
			    off_t *off __maybe_unused)
{
	return -1;
}

static off_t record__aio_get_pos(int trace_fd __maybe_unused)
{
	return -1;
}

static void record__aio_set_pos(int trace_fd __maybe_unused, off_t pos __maybe_unused)
{
}

static void record__aio_mmap_read_sync(struct record *rec __maybe_unused)
{
}
#endif

static int record__aio_enabled(struct record *rec)
{
	return rec->opts.nr_cblocks > 0;
}

#define MMAP_FLUSH_DEFAULT 1
static int record__mmap_flush_parse(const struct option *opt,
				    const char *str,
				    int unset)
{
	int flush_max;
	struct record_opts *opts = (struct record_opts *)opt->value;
	static struct parse_tag tags[] = {
			{ .tag  = 'B', .mult = 1       },
			{ .tag  = 'K', .mult = 1 << 10 },
			{ .tag  = 'M', .mult = 1 << 20 },
			{ .tag  = 'G', .mult = 1 << 30 },
			{ .tag  = 0 },
	};

	if (unset)
		return 0;

	if (str) {
		opts->mmap_flush = parse_tag_value(str, tags);
		if (opts->mmap_flush == (int)-1)
			opts->mmap_flush = strtol(str, NULL, 0);
	}

	if (!opts->mmap_flush)
		opts->mmap_flush = MMAP_FLUSH_DEFAULT;

	flush_max = evlist__mmap_size(opts->mmap_pages);
	flush_max /= 4;
	if (opts->mmap_flush > flush_max)
		opts->mmap_flush = flush_max;

	return 0;
}

#ifdef HAVE_ZSTD_SUPPORT
static unsigned int comp_level_default = 1;

static int record__parse_comp_level(const struct option *opt, const char *str, int unset)
{
	struct record_opts *opts = opt->value;

	if (unset) {
		opts->comp_level = 0;
	} else {
		if (str)
			opts->comp_level = strtol(str, NULL, 0);
		if (!opts->comp_level)
			opts->comp_level = comp_level_default;
	}

	return 0;
}
#endif
static unsigned int comp_level_max = 22;

static int record__comp_enabled(struct record *rec)
{
	return rec->opts.comp_level > 0;
}

static int process_synthesized_event(struct perf_tool *tool,
				     union perf_event *event,
				     struct perf_sample *sample __maybe_unused,
				     struct machine *machine __maybe_unused)
{
	struct record *rec = container_of(tool, struct record, tool);
	return record__write(rec,
			     NULL, event, event->header.size);
}

static int process_locked_synthesized_event(struct perf_tool *tool,
				     union perf_event *event,
				     struct perf_sample *sample __maybe_unused,
				     struct machine *machine __maybe_unused)
{
	static pthread_mutex_t synth_lock = PTHREAD_MUTEX_INITIALIZER;
	int ret;

	pthread_mutex_lock(&synth_lock);
	ret = process_synthesized_event(tool, event, sample, machine);
	pthread_mutex_unlock(&synth_lock);
	return ret;
}

static int record__pushfn(struct mmap *map, void *to, void *bf, size_t size)
{
	struct record *rec = to;

	if (record__comp_enabled(rec)) {
		size = zstd_compress(rec->session, map->data, mmap__mmap_len(map), bf, size);
		bf   = map->data;
	}

	rec->samples++;
	return record__write(rec, map, bf, size);
}

static volatile int signr = -1;
static volatile int child_finished;

static void sig_handler(int sig)
{
	if (sig == SIGCHLD)
		child_finished = 1;
	else
		signr = sig;

	done = 1;
}

static void sigsegv_handler(int sig)
{
	perf_hooks__recover();
	sighandler_dump_stack(sig);
}

static void record__sig_exit(void)
{
	if (signr == -1)
		return;

	signal(signr, SIG_DFL);
	raise(signr);
}

#ifdef HAVE_AUXTRACE_SUPPORT

static int record__process_auxtrace(struct perf_tool *tool,
				    struct mmap *map,
				    union perf_event *event, void *data1,
				    size_t len1, void *data2, size_t len2)
{
	struct record *rec = container_of(tool, struct record, tool);
	struct perf_data *data = &rec->data;
	size_t padding;
	u8 pad[8] = {0};

	if (!perf_data__is_pipe(data) && perf_data__is_single_file(data)) {
		off_t file_offset;
		int fd = perf_data__fd(data);
		int err;

		file_offset = lseek(fd, 0, SEEK_CUR);
		if (file_offset == -1)
			return -1;
		err = auxtrace_index__auxtrace_event(&rec->session->auxtrace_index,
						     event, file_offset);
		if (err)
			return err;
	}

	/* event.auxtrace.size includes padding, see __auxtrace_mmap__read() */
	padding = (len1 + len2) & 7;
	if (padding)
		padding = 8 - padding;

	record__write(rec, map, event, event->header.size);
	record__write(rec, map, data1, len1);
	if (len2)
		record__write(rec, map, data2, len2);
	record__write(rec, map, &pad, padding);

	return 0;
}

static int record__auxtrace_mmap_read(struct record *rec,
				      struct mmap *map)
{
	int ret;

	ret = auxtrace_mmap__read(map, rec->itr, &rec->tool,
				  record__process_auxtrace);
	if (ret < 0)
		return ret;

	if (ret)
		rec->samples++;

	return 0;
}

static int record__auxtrace_mmap_read_snapshot(struct record *rec,
					       struct mmap *map)
{
	int ret;

	ret = auxtrace_mmap__read_snapshot(map, rec->itr, &rec->tool,
					   record__process_auxtrace,
					   rec->opts.auxtrace_snapshot_size);
	if (ret < 0)
		return ret;

	if (ret)
		rec->samples++;

	return 0;
}

static int record__auxtrace_read_snapshot_all(struct record *rec)
{
	int i;
	int rc = 0;

	for (i = 0; i < rec->evlist->core.nr_mmaps; i++) {
		struct mmap *map = &rec->evlist->mmap[i];

		if (!map->auxtrace_mmap.base)
			continue;

		if (record__auxtrace_mmap_read_snapshot(rec, map) != 0) {
			rc = -1;
			goto out;
		}
	}
out:
	return rc;
}
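/*
 * Illustrative note, not part of the original file: in AUX area snapshot
 * mode (e.g. "perf record -e intel_pt// --snapshot" on hardware with an AUX
 * area), trace data is not written continuously; it is captured on demand,
 * typically when SIGUSR2 arms auxtrace_snapshot_trigger, by the functions
 * below.
 */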
static void record__read_auxtrace_snapshot(struct record *rec, bool on_exit)
{
	pr_debug("Recording AUX area tracing snapshot\n");
	if (record__auxtrace_read_snapshot_all(rec) < 0) {
		trigger_error(&auxtrace_snapshot_trigger);
	} else {
		if (auxtrace_record__snapshot_finish(rec->itr, on_exit))
			trigger_error(&auxtrace_snapshot_trigger);
		else
			trigger_ready(&auxtrace_snapshot_trigger);
	}
}

static int record__auxtrace_snapshot_exit(struct record *rec)
{
	if (trigger_is_error(&auxtrace_snapshot_trigger))
		return 0;

	if (!auxtrace_record__snapshot_started &&
	    auxtrace_record__snapshot_start(rec->itr))
		return -1;

	record__read_auxtrace_snapshot(rec, true);
	if (trigger_is_error(&auxtrace_snapshot_trigger))
		return -1;

	return 0;
}

static int record__auxtrace_init(struct record *rec)
{
	int err;

	if (!rec->itr) {
		rec->itr = auxtrace_record__init(rec->evlist, &err);
		if (err)
			return err;
	}

	err = auxtrace_parse_snapshot_options(rec->itr, &rec->opts,
					      rec->opts.auxtrace_snapshot_opts);
	if (err)
		return err;

	err = auxtrace_parse_sample_options(rec->itr, rec->evlist, &rec->opts,
					    rec->opts.auxtrace_sample_opts);
	if (err)
		return err;

	return auxtrace_parse_filters(rec->evlist);
}

#else

static inline
int record__auxtrace_mmap_read(struct record *rec __maybe_unused,
			       struct mmap *map __maybe_unused)
{
	return 0;
}

static inline
void record__read_auxtrace_snapshot(struct record *rec __maybe_unused,
				    bool on_exit __maybe_unused)
{
}

static inline
int auxtrace_record__snapshot_start(struct auxtrace_record *itr __maybe_unused)
{
	return 0;
}

static inline
int record__auxtrace_snapshot_exit(struct record *rec __maybe_unused)
{
	return 0;
}

static int record__auxtrace_init(struct record *rec __maybe_unused)
{
	return 0;
}

#endif

static bool record__kcore_readable(struct machine *machine)
{
	char kcore[PATH_MAX];
	int fd;

	scnprintf(kcore, sizeof(kcore), "%s/proc/kcore", machine->root_dir);

	fd = open(kcore, O_RDONLY);
	if (fd < 0)
		return false;

	close(fd);

	return true;
}

static int record__kcore_copy(struct machine *machine, struct perf_data *data)
{
	char from_dir[PATH_MAX];
	char kcore_dir[PATH_MAX];
	int ret;

	snprintf(from_dir, sizeof(from_dir), "%s/proc", machine->root_dir);

	ret = perf_data__make_kcore_dir(data, kcore_dir, sizeof(kcore_dir));
	if (ret)
		return ret;

	return kcore_copy(from_dir, kcore_dir);
}
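/*
 * Illustrative note, not part of the original file: the two kcore helpers
 * above back "perf record --kcore", which copies kernel image data from
 * /proc into a kcore_dir inside the perf.data directory so later analysis
 * can disassemble the kernel text that was actually running.
 */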
static int record__mmap_evlist(struct record *rec,
			       struct evlist *evlist)
{
	struct record_opts *opts = &rec->opts;
	bool auxtrace_overwrite = opts->auxtrace_snapshot_mode ||
				  opts->auxtrace_sample_mode;
	char msg[512];

	if (opts->affinity != PERF_AFFINITY_SYS)
		cpu__setup_cpunode_map();

	if (evlist__mmap_ex(evlist, opts->mmap_pages,
				 opts->auxtrace_mmap_pages,
				 auxtrace_overwrite,
				 opts->nr_cblocks, opts->affinity,
				 opts->mmap_flush, opts->comp_level) < 0) {
		if (errno == EPERM) {
			pr_err("Permission error mapping pages.\n"
			       "Consider increasing "
			       "/proc/sys/kernel/perf_event_mlock_kb,\n"
			       "or try again with a smaller value of -m/--mmap_pages.\n"
			       "(current value: %u,%u)\n",
			       opts->mmap_pages, opts->auxtrace_mmap_pages);
			return -errno;
		} else {
			pr_err("failed to mmap with %d (%s)\n", errno,
				str_error_r(errno, msg, sizeof(msg)));
			if (errno)
				return -errno;
			else
				return -EINVAL;
		}
	}
	return 0;
}

static int record__mmap(struct record *rec)
{
	return record__mmap_evlist(rec, rec->evlist);
}

static int record__open(struct record *rec)
{
	char msg[BUFSIZ];
	struct evsel *pos;
	struct evlist *evlist = rec->evlist;
	struct perf_session *session = rec->session;
	struct record_opts *opts = &rec->opts;
	int rc = 0;

	/*
	 * For initial_delay we need to add a dummy event so that we can track
	 * PERF_RECORD_MMAP while we wait for the initial delay to enable the
	 * real events, the ones asked by the user.
	 */
	if (opts->initial_delay) {
		if (perf_evlist__add_dummy(evlist))
			return -ENOMEM;

		pos = evlist__first(evlist);
		pos->tracking = 0;
		pos = evlist__last(evlist);
		pos->tracking = 1;
		pos->core.attr.enable_on_exec = 1;
	}

	perf_evlist__config(evlist, opts, &callchain_param);

	evlist__for_each_entry(evlist, pos) {
try_again:
		if (evsel__open(pos, pos->core.cpus, pos->core.threads) < 0) {
			if (evsel__fallback(pos, errno, msg, sizeof(msg))) {
				if (verbose > 0)
					ui__warning("%s\n", msg);
				goto try_again;
			}
			if ((errno == EINVAL || errno == EBADF) &&
			    pos->leader != pos &&
			    pos->weak_group) {
				pos = perf_evlist__reset_weak_group(evlist, pos, true);
				goto try_again;
			}
			rc = -errno;
			evsel__open_strerror(pos, &opts->target, errno, msg, sizeof(msg));
			ui__error("%s\n", msg);
			goto out;
		}

		pos->supported = true;
	}

	if (symbol_conf.kptr_restrict && !perf_evlist__exclude_kernel(evlist)) {
		pr_warning(
"WARNING: Kernel address maps (/proc/{kallsyms,modules}) are restricted,\n"
"check /proc/sys/kernel/kptr_restrict and /proc/sys/kernel/perf_event_paranoid.\n\n"
"Samples in kernel functions may not be resolved if a suitable vmlinux\n"
"file is not found in the buildid cache or in the vmlinux path.\n\n"
"Samples in kernel modules won't be resolved at all.\n\n"
"If some relocation was applied (e.g. kexec) symbols may be misresolved\n"
"even with a suitable vmlinux or kallsyms file.\n\n");
	}

	if (perf_evlist__apply_filters(evlist, &pos)) {
		pr_err("failed to set filter \"%s\" on event %s with %d (%s)\n",
			pos->filter, evsel__name(pos), errno,
			str_error_r(errno, msg, sizeof(msg)));
		rc = -1;
		goto out;
	}

	rc = record__mmap(rec);
	if (rc)
		goto out;

	session->evlist = evlist;
	perf_session__set_id_hdr_size(session);
out:
	return rc;
}

static int process_sample_event(struct perf_tool *tool,
				union perf_event *event,
				struct perf_sample *sample,
				struct evsel *evsel,
				struct machine *machine)
{
	struct record *rec = container_of(tool, struct record, tool);

	if (rec->evlist->first_sample_time == 0)
		rec->evlist->first_sample_time = sample->time;

	rec->evlist->last_sample_time = sample->time;

	if (rec->buildid_all)
		return 0;

	rec->samples++;
	return build_id__mark_dso_hit(tool, event, sample, evsel, machine);
}
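/*
 * Illustrative note, not part of the original file: with --buildid-all every
 * DSO is marked regardless of whether it was hit, so process_sample_event()
 * above skips build_id__mark_dso_hit(), and process_buildids() below only
 * walks the samples when --timestamp-boundary also needs the first/last
 * sample times.
 */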
static int process_buildids(struct record *rec)
{
	struct perf_session *session = rec->session;

	if (perf_data__size(&rec->data) == 0)
		return 0;

	/*
	 * During this process, it'll load kernel map and replace the
	 * dso->long_name to a real pathname it found.  In this case
	 * we prefer the vmlinux path like
	 *   /lib/modules/3.16.4/build/vmlinux
	 *
	 * rather than build-id path (in debug directory).
	 *   $HOME/.debug/.build-id/f0/6e17aa50adf4d00b88925e03775de107611551
	 */
	symbol_conf.ignore_vmlinux_buildid = true;

	/*
	 * If --buildid-all is given, it marks all DSO regardless of hits,
	 * so no need to process samples. But if timestamp_boundary is enabled,
	 * it still needs to walk on all samples to get the timestamps of
	 * first/last samples.
	 */
	if (rec->buildid_all && !rec->timestamp_boundary)
		rec->tool.sample = NULL;

	return perf_session__process_events(session);
}

static void perf_event__synthesize_guest_os(struct machine *machine, void *data)
{
	int err;
	struct perf_tool *tool = data;
	/*
	 * As for guest kernel when processing subcommand record&report,
	 * we arrange module mmap prior to guest kernel mmap and trigger
	 * a preload dso because default guest module symbols are loaded
	 * from guest kallsyms instead of /lib/modules/XXX/XXX. This
	 * method is used to avoid symbol missing when the first addr is
	 * in module instead of in guest kernel.
	 */
	err = perf_event__synthesize_modules(tool, process_synthesized_event,
					     machine);
	if (err < 0)
		pr_err("Couldn't record guest kernel [%d]'s reference"
		       " relocation symbol.\n", machine->pid);

	/*
	 * We use _stext for guest kernel because guest kernel's /proc/kallsyms
	 * have no _text sometimes.
	 */
	err = perf_event__synthesize_kernel_mmap(tool, process_synthesized_event,
						 machine);
	if (err < 0)
		pr_err("Couldn't record guest kernel [%d]'s reference"
		       " relocation symbol.\n", machine->pid);
}

static struct perf_event_header finished_round_event = {
	.size = sizeof(struct perf_event_header),
	.type = PERF_RECORD_FINISHED_ROUND,
};

static void record__adjust_affinity(struct record *rec, struct mmap *map)
{
	if (rec->opts.affinity != PERF_AFFINITY_SYS &&
	    !bitmap_equal(rec->affinity_mask.bits, map->affinity_mask.bits,
			  rec->affinity_mask.nbits)) {
		bitmap_zero(rec->affinity_mask.bits, rec->affinity_mask.nbits);
		bitmap_or(rec->affinity_mask.bits, rec->affinity_mask.bits,
			  map->affinity_mask.bits, rec->affinity_mask.nbits);
		sched_setaffinity(0, MMAP_CPU_MASK_BYTES(&rec->affinity_mask),
				  (cpu_set_t *)rec->affinity_mask.bits);
		if (verbose == 2)
			mmap_cpu_mask__scnprintf(&rec->affinity_mask, "thread");
	}
}

static size_t process_comp_header(void *record, size_t increment)
{
	struct perf_record_compressed *event = record;
	size_t size = sizeof(*event);

	if (increment) {
		event->header.size += increment;
		return increment;
	}

	event->header.type = PERF_RECORD_COMPRESSED;
	event->header.size = size;

	return size;
}

static size_t zstd_compress(struct perf_session *session, void *dst, size_t dst_size,
			    void *src, size_t src_size)
{
	size_t compressed;
	size_t max_record_size = PERF_SAMPLE_MAX_SIZE - sizeof(struct perf_record_compressed) - 1;

	compressed = zstd_compress_stream_to_records(&session->zstd_data, dst, dst_size, src, src_size,
						     max_record_size, process_comp_header);

	session->bytes_transferred += src_size;
	session->bytes_compressed  += compressed;

	return compressed;
}
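/*
 * Illustrative note, not part of the original file: -z/--compression-level
 * wraps the ring-buffer payload in PERF_RECORD_COMPRESSED records via
 * zstd_compress() above; levels run from the default of 1 up to
 * comp_level_max (22), and the achieved ratio is reported when the session
 * ends.
 */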
static int record__mmap_read_evlist(struct record *rec, struct evlist *evlist,
				    bool overwrite, bool synch)
{
	u64 bytes_written = rec->bytes_written;
	int i;
	int rc = 0;
	struct mmap *maps;
	int trace_fd = rec->data.file.fd;
	off_t off = 0;

	if (!evlist)
		return 0;

	maps = overwrite ? evlist->overwrite_mmap : evlist->mmap;
	if (!maps)
		return 0;

	if (overwrite && evlist->bkw_mmap_state != BKW_MMAP_DATA_PENDING)
		return 0;

	if (record__aio_enabled(rec))
		off = record__aio_get_pos(trace_fd);

	for (i = 0; i < evlist->core.nr_mmaps; i++) {
		u64 flush = 0;
		struct mmap *map = &maps[i];

		if (map->core.base) {
			record__adjust_affinity(rec, map);
			if (synch) {
				flush = map->core.flush;
				map->core.flush = 1;
			}
			if (!record__aio_enabled(rec)) {
				if (perf_mmap__push(map, rec, record__pushfn) < 0) {
					if (synch)
						map->core.flush = flush;
					rc = -1;
					goto out;
				}
			} else {
				if (record__aio_push(rec, map, &off) < 0) {
					record__aio_set_pos(trace_fd, off);
					if (synch)
						map->core.flush = flush;
					rc = -1;
					goto out;
				}
			}
			if (synch)
				map->core.flush = flush;
		}

		if (map->auxtrace_mmap.base && !rec->opts.auxtrace_snapshot_mode &&
		    !rec->opts.auxtrace_sample_mode &&
		    record__auxtrace_mmap_read(rec, map) != 0) {
			rc = -1;
			goto out;
		}
	}

	if (record__aio_enabled(rec))
		record__aio_set_pos(trace_fd, off);

	/*
	 * Mark the round finished in case we wrote
	 * at least one event.
	 */
	if (bytes_written != rec->bytes_written)
		rc = record__write(rec, NULL, &finished_round_event, sizeof(finished_round_event));

	if (overwrite)
		perf_evlist__toggle_bkw_mmap(evlist, BKW_MMAP_EMPTY);
out:
	return rc;
}

static int record__mmap_read_all(struct record *rec, bool synch)
{
	int err;

	err = record__mmap_read_evlist(rec, rec->evlist, false, synch);
	if (err)
		return err;

	return record__mmap_read_evlist(rec, rec->evlist, true, synch);
}

static void record__init_features(struct record *rec)
{
	struct perf_session *session = rec->session;
	int feat;

	for (feat = HEADER_FIRST_FEATURE; feat < HEADER_LAST_FEATURE; feat++)
		perf_header__set_feat(&session->header, feat);

	if (rec->no_buildid)
		perf_header__clear_feat(&session->header, HEADER_BUILD_ID);

	if (!have_tracepoints(&rec->evlist->core.entries))
		perf_header__clear_feat(&session->header, HEADER_TRACING_DATA);

	if (!rec->opts.branch_stack)
		perf_header__clear_feat(&session->header, HEADER_BRANCH_STACK);

	if (!rec->opts.full_auxtrace)
		perf_header__clear_feat(&session->header, HEADER_AUXTRACE);

	if (!(rec->opts.use_clockid && rec->opts.clockid_res_ns))
		perf_header__clear_feat(&session->header, HEADER_CLOCKID);

	perf_header__clear_feat(&session->header, HEADER_DIR_FORMAT);
	if (!record__comp_enabled(rec))
		perf_header__clear_feat(&session->header, HEADER_COMPRESSED);

	perf_header__clear_feat(&session->header, HEADER_STAT);
}

static void
record__finish_output(struct record *rec)
{
	struct perf_data *data = &rec->data;
	int fd = perf_data__fd(data);

	if (data->is_pipe)
		return;

	rec->session->header.data_size += rec->bytes_written;
	data->file.size = lseek(perf_data__fd(data), 0, SEEK_CUR);

	if (!rec->no_buildid) {
		process_buildids(rec);

		if (rec->buildid_all)
			dsos__hit_all(rec->session);
	}
	perf_session__write_header(rec->session, rec->evlist, fd, true);

	return;
}

static int record__synthesize_workload(struct record *rec, bool tail)
{
	int err;
	struct perf_thread_map *thread_map;

	if (rec->opts.tail_synthesize != tail)
		return 0;

	thread_map = thread_map__new_by_tid(rec->evlist->workload.pid);
	if (thread_map == NULL)
		return -1;

	err = perf_event__synthesize_thread_map(&rec->tool, thread_map,
						 process_synthesized_event,
						 &rec->session->machines.host,
						 rec->opts.sample_address);
	perf_thread_map__put(thread_map);
	return err;
}

static int record__synthesize(struct record *rec, bool tail);

static int
record__switch_output(struct record *rec, bool at_exit)
{
	struct perf_data *data = &rec->data;
	int fd, err;
	char *new_filename;

	/* Same Size: "2015122520103046"*/
	char timestamp[] = "InvalidTimestamp";

	record__aio_mmap_read_sync(rec);

	record__synthesize(rec, true);
	if (target__none(&rec->opts.target))
		record__synthesize_workload(rec, true);

	rec->samples = 0;
	record__finish_output(rec);
	err = fetch_current_timestamp(timestamp, sizeof(timestamp));
	if (err) {
		pr_err("Failed to get current timestamp\n");
		return -EINVAL;
	}

	fd = perf_data__switch(data, timestamp,
				    rec->session->header.data_offset,
				    at_exit, &new_filename);
	if (fd >= 0 && !at_exit) {
		rec->bytes_written = 0;
		rec->session->header.data_size = 0;
	}

	if (!quiet)
		fprintf(stderr, "[ perf record: Dump %s.%s ]\n",
			data->path, timestamp);

	if (rec->switch_output.num_files) {
		int n = rec->switch_output.cur_file + 1;

		if (n >= rec->switch_output.num_files)
			n = 0;
		rec->switch_output.cur_file = n;
		if (rec->switch_output.filenames[n]) {
			remove(rec->switch_output.filenames[n]);
			zfree(&rec->switch_output.filenames[n]);
		}
		rec->switch_output.filenames[n] = new_filename;
	} else {
		free(new_filename);
	}

	/* Output tracking events */
	if (!at_exit) {
		record__synthesize(rec, false);

		/*
		 * In 'perf record --switch-output' without -a,
		 * record__synthesize() in record__switch_output() won't
		 * generate tracking events because there's no thread_map
		 * in evlist, which causes the newly created perf.data to
		 * lack map and comm information.
		 * Create a fake thread_map and directly call
		 * perf_event__synthesize_thread_map() for those events.
		 */
		if (target__none(&rec->opts.target))
			record__synthesize_workload(rec, false);
	}
	return fd;
}
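/*
 * Illustrative note, not part of the original file: when a limit on the
 * number of rotated files is set (via the --switch-max-files option, as far
 * as we can tell), switch_output.filenames acts as a ring above: the oldest
 * dump is removed before its slot is reused for the new filename.
 */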
static volatile int workload_exec_errno;

/*
 * perf_evlist__prepare_workload will send a SIGUSR1
 * if the fork fails, since we asked by setting its
 * want_signal to true.
 */
static void workload_exec_failed_signal(int signo __maybe_unused,
					siginfo_t *info,
					void *ucontext __maybe_unused)
{
	workload_exec_errno = info->si_value.sival_int;
	done = 1;
	child_finished = 1;
}

static void snapshot_sig_handler(int sig);
static void alarm_sig_handler(int sig);

static const struct perf_event_mmap_page *
perf_evlist__pick_pc(struct evlist *evlist)
{
	if (evlist) {
		if (evlist->mmap && evlist->mmap[0].core.base)
			return evlist->mmap[0].core.base;
		if (evlist->overwrite_mmap && evlist->overwrite_mmap[0].core.base)
			return evlist->overwrite_mmap[0].core.base;
	}
	return NULL;
}

static const struct perf_event_mmap_page *record__pick_pc(struct record *rec)
{
	const struct perf_event_mmap_page *pc;

	pc = perf_evlist__pick_pc(rec->evlist);
	if (pc)
		return pc;
	return NULL;
}
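/*
 * Illustrative summary, not part of the original file, of what
 * record__synthesize() below emits: for pipe output it first synthesizes
 * attrs, features and tracing data; then the time conversion record,
 * id_index (for AUX sampling), auxtrace info, kernel and module mmaps,
 * guest machines, extra attrs, thread and CPU maps, BPF and cgroup events,
 * and finally the existing threads of the target.
 */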
static int record__synthesize(struct record *rec, bool tail)
{
	struct perf_session *session = rec->session;
	struct machine *machine = &session->machines.host;
	struct perf_data *data = &rec->data;
	struct record_opts *opts = &rec->opts;
	struct perf_tool *tool = &rec->tool;
	int fd = perf_data__fd(data);
	int err = 0;
	event_op f = process_synthesized_event;

	if (rec->opts.tail_synthesize != tail)
		return 0;

	if (data->is_pipe) {
		/*
		 * We need to synthesize events first, because some
		 * features work on top of them (on report side).
		 */
		err = perf_event__synthesize_attrs(tool, rec->evlist,
						   process_synthesized_event);
		if (err < 0) {
			pr_err("Couldn't synthesize attrs.\n");
			goto out;
		}

		err = perf_event__synthesize_features(tool, session, rec->evlist,
						      process_synthesized_event);
		if (err < 0) {
			pr_err("Couldn't synthesize features.\n");
			return err;
		}

		if (have_tracepoints(&rec->evlist->core.entries)) {
			/*
			 * FIXME err <= 0 here actually means that
			 * there were no tracepoints so it's not really
			 * an error, just that we don't need to
			 * synthesize anything.  We really have to
			 * return this more properly and also
			 * propagate errors that now are calling die()
			 */
			err = perf_event__synthesize_tracing_data(tool, fd, rec->evlist,
								  process_synthesized_event);
			if (err <= 0) {
				pr_err("Couldn't record tracing data.\n");
				goto out;
			}
			rec->bytes_written += err;
		}
	}

	err = perf_event__synth_time_conv(record__pick_pc(rec), tool,
					  process_synthesized_event, machine);
	if (err)
		goto out;

	/* Synthesize id_index before auxtrace_info */
	if (rec->opts.auxtrace_sample_mode) {
		err = perf_event__synthesize_id_index(tool,
						      process_synthesized_event,
						      session->evlist, machine);
		if (err)
			goto out;
	}

	if (rec->opts.full_auxtrace) {
		err = perf_event__synthesize_auxtrace_info(rec->itr, tool,
					session, process_synthesized_event);
		if (err)
			goto out;
	}

	if (!perf_evlist__exclude_kernel(rec->evlist)) {
		err = perf_event__synthesize_kernel_mmap(tool, process_synthesized_event,
							 machine);
		WARN_ONCE(err < 0, "Couldn't record kernel reference relocation symbol\n"
				   "Symbol resolution may be skewed if relocation was used (e.g. kexec).\n"
				   "Check /proc/kallsyms permission or run as root.\n");

		err = perf_event__synthesize_modules(tool, process_synthesized_event,
						     machine);
		WARN_ONCE(err < 0, "Couldn't record kernel module information.\n"
				   "Symbol resolution may be skewed if relocation was used (e.g. kexec).\n"
				   "Check /proc/modules permission or run as root.\n");
	}

	if (perf_guest) {
		machines__process_guests(&session->machines,
					 perf_event__synthesize_guest_os, tool);
	}

	err = perf_event__synthesize_extra_attr(&rec->tool,
						rec->evlist,
						process_synthesized_event,
						data->is_pipe);
	if (err)
		goto out;

	err = perf_event__synthesize_thread_map2(&rec->tool, rec->evlist->core.threads,
						 process_synthesized_event,
						 NULL);
	if (err < 0) {
		pr_err("Couldn't synthesize thread map.\n");
		return err;
	}

	err = perf_event__synthesize_cpu_map(&rec->tool, rec->evlist->core.cpus,
					     process_synthesized_event, NULL);
	if (err < 0) {
		pr_err("Couldn't synthesize cpu map.\n");
		return err;
	}

	err = perf_event__synthesize_bpf_events(session, process_synthesized_event,
						machine, opts);
	if (err < 0)
		pr_warning("Couldn't synthesize bpf events.\n");

	err = perf_event__synthesize_cgroups(tool, process_synthesized_event,
					     machine);
	if (err < 0)
		pr_warning("Couldn't synthesize cgroup events.\n");

	if (rec->opts.nr_threads_synthesize > 1) {
		perf_set_multithreaded();
		f = process_locked_synthesized_event;
	}

	err = __machine__synthesize_threads(machine, tool, &opts->target, rec->evlist->core.threads,
					    f, opts->sample_address,
					    rec->opts.nr_threads_synthesize);

	if (rec->opts.nr_threads_synthesize > 1)
		perf_set_singlethreaded();

out:
	return err;
}

static int record__process_signal_event(union perf_event *event __maybe_unused, void *data)
{
	struct record *rec = data;
	pthread_kill(rec->thread_id, SIGUSR2);
	return 0;
}
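/*
 * Illustrative note, not part of the original file:
 * record__process_signal_event() above is the callback installed by
 * record__setup_sb_evlist() when --switch-output-event is used; when a
 * matching event is seen on the side-band evlist, the side-band thread
 * sends SIGUSR2 to the main record thread, which then rotates the output
 * file.
 */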
static int record__setup_sb_evlist(struct record *rec)
{
	struct record_opts *opts = &rec->opts;

	if (rec->sb_evlist != NULL) {
		/*
		 * We get here if --switch-output-event populated the
		 * sb_evlist, so associate a callback that will send a SIGUSR2
		 * to the main thread.
		 */
		evlist__set_cb(rec->sb_evlist, record__process_signal_event, rec);
		rec->thread_id = pthread_self();
	}

	if (!opts->no_bpf_event) {
		if (rec->sb_evlist == NULL) {
			rec->sb_evlist = evlist__new();

			if (rec->sb_evlist == NULL) {
				pr_err("Couldn't create side band evlist.\n");
				return -1;
			}
		}

		if (evlist__add_bpf_sb_event(rec->sb_evlist, &rec->session->header.env)) {
			pr_err("Couldn't ask for PERF_RECORD_BPF_EVENT side band events.\n");
			return -1;
		}
	}

	if (perf_evlist__start_sb_thread(rec->sb_evlist, &rec->opts.target)) {
		pr_debug("Couldn't start the BPF side band thread:\nBPF programs starting from now on won't be annotatable\n");
		opts->no_bpf_event = true;
	}

	return 0;
}

static int __cmd_record(struct record *rec, int argc, const char **argv)
{
	int err;
	int status = 0;
	unsigned long waking = 0;
	const bool forks = argc > 0;
	struct perf_tool *tool = &rec->tool;
	struct record_opts *opts = &rec->opts;
	struct perf_data *data = &rec->data;
	struct perf_session *session;
	bool disabled = false, draining = false;
	int fd;
	float ratio = 0;

	atexit(record__sig_exit);
	signal(SIGCHLD, sig_handler);
	signal(SIGINT, sig_handler);
	signal(SIGTERM, sig_handler);
	signal(SIGSEGV, sigsegv_handler);

	if (rec->opts.record_namespaces)
		tool->namespace_events = true;

	if (rec->opts.record_cgroup) {
#ifdef HAVE_FILE_HANDLE
		tool->cgroup_events = true;
#else
		pr_err("cgroup tracking is not supported\n");
		return -1;
#endif
	}

	if (rec->opts.auxtrace_snapshot_mode || rec->switch_output.enabled) {
		signal(SIGUSR2, snapshot_sig_handler);
		if (rec->opts.auxtrace_snapshot_mode)
			trigger_on(&auxtrace_snapshot_trigger);
		if (rec->switch_output.enabled)
			trigger_on(&switch_output_trigger);
	} else {
		signal(SIGUSR2, SIG_IGN);
	}

	session = perf_session__new(data, false, tool);
	if (IS_ERR(session)) {
		pr_err("Perf session creation failed.\n");
		return PTR_ERR(session);
	}

	fd = perf_data__fd(data);
	rec->session = session;

	if (zstd_init(&session->zstd_data, rec->opts.comp_level) < 0) {
		pr_err("Compression initialization failed.\n");
		return -1;
	}

	session->header.env.comp_type  = PERF_COMP_ZSTD;
	session->header.env.comp_level = rec->opts.comp_level;

	if (rec->opts.kcore &&
	    !record__kcore_readable(&session->machines.host)) {
		pr_err("ERROR: kcore is not readable.\n");
		return -1;
	}

	record__init_features(rec);

	if (rec->opts.use_clockid && rec->opts.clockid_res_ns)
		session->header.env.clockid_res_ns = rec->opts.clockid_res_ns;

	if (forks) {
		err = perf_evlist__prepare_workload(rec->evlist, &opts->target,
						    argv, data->is_pipe,
						    workload_exec_failed_signal);
		if (err < 0) {
			pr_err("Couldn't run the workload!\n");
			status = err;
			goto out_delete_session;
		}
	}

	/*
	 * If we have just a single event and are sending data
	 * through a pipe, we need to force the ids allocation,
	 * because we synthesize the event name through the pipe
	 * and need the id for that.
	 */
	if (data->is_pipe && rec->evlist->core.nr_entries == 1)
		rec->opts.sample_id = true;

	if (record__open(rec) != 0) {
		err = -1;
		goto out_child;
	}
	session->header.env.comp_mmap_len = session->evlist->core.mmap_len;

	if (rec->opts.kcore) {
		err = record__kcore_copy(&session->machines.host, data);
		if (err) {
			pr_err("ERROR: Failed to copy kcore\n");
			goto out_child;
		}
	}

	err = bpf__apply_obj_config();
	if (err) {
		char errbuf[BUFSIZ];

		bpf__strerror_apply_obj_config(err, errbuf, sizeof(errbuf));
		pr_err("ERROR: Apply config to BPF failed: %s\n",
			 errbuf);
		goto out_child;
	}

	/*
	 * Normally perf_session__new would do this, but it doesn't have the
	 * evlist.
	 */
	if (rec->tool.ordered_events && !perf_evlist__sample_id_all(rec->evlist)) {
		pr_warning("WARNING: No sample_id_all support, falling back to unordered processing\n");
		rec->tool.ordered_events = false;
	}

	if (!rec->evlist->nr_groups)
		perf_header__clear_feat(&session->header, HEADER_GROUP_DESC);

	if (data->is_pipe) {
		err = perf_header__write_pipe(fd);
		if (err < 0)
			goto out_child;
	} else {
		err = perf_session__write_header(session, rec->evlist, fd, false);
		if (err < 0)
			goto out_child;
	}

	err = -1;
	if (!rec->no_buildid
	    && !perf_header__has_feat(&session->header, HEADER_BUILD_ID)) {
		pr_err("Couldn't generate buildids. "
		       "Use --no-buildid to profile anyway.\n");
		goto out_child;
	}

	err = record__setup_sb_evlist(rec);
	if (err)
		goto out_child;

	err = record__synthesize(rec, false);
	if (err < 0)
		goto out_child;

	if (rec->realtime_prio) {
		struct sched_param param;

		param.sched_priority = rec->realtime_prio;
		if (sched_setscheduler(0, SCHED_FIFO, &param)) {
			pr_err("Could not set realtime priority.\n");
			err = -1;
			goto out_child;
		}
	}

	/*
	 * When perf is starting the traced process, all the events
	 * (apart from group members) have enable_on_exec=1 set,
	 * so don't spoil it by prematurely enabling them.
	 */
	if (!target__none(&opts->target) && !opts->initial_delay)
		evlist__enable(rec->evlist);

	/*
	 * Let the child rip
	 */
	if (forks) {
		struct machine *machine = &session->machines.host;
		union perf_event *event;
		pid_t tgid;

		event = malloc(sizeof(event->comm) + machine->id_hdr_size);
		if (event == NULL) {
			err = -ENOMEM;
			goto out_child;
		}

		/*
		 * Some H/W events are generated before COMM event
		 * which is emitted during exec(), so perf script
		 * cannot see a correct process name for those events.
		 * Synthesize COMM event to prevent it.
		 */
		tgid = perf_event__synthesize_comm(tool, event,
						   rec->evlist->workload.pid,
						   process_synthesized_event,
						   machine);
		free(event);

		if (tgid == -1)
			goto out_child;

		event = malloc(sizeof(event->namespaces) +
			       (NR_NAMESPACES * sizeof(struct perf_ns_link_info)) +
			       machine->id_hdr_size);
		if (event == NULL) {
			err = -ENOMEM;
			goto out_child;
		}

		/*
		 * Synthesize NAMESPACES event for the command specified.
		 */
		perf_event__synthesize_namespaces(tool, event,
						  rec->evlist->workload.pid,
						  tgid, process_synthesized_event,
						  machine);
		free(event);

		perf_evlist__start_workload(rec->evlist);
	}

	if (opts->initial_delay) {
		usleep(opts->initial_delay * USEC_PER_MSEC);
		evlist__enable(rec->evlist);
	}

	trigger_ready(&auxtrace_snapshot_trigger);
	trigger_ready(&switch_output_trigger);
	perf_hooks__invoke_record_start();
	for (;;) {
		unsigned long long hits = rec->samples;

		/*
		 * rec->evlist->bkw_mmap_state can be BKW_MMAP_EMPTY here:
		 * when done == true and hits != rec->samples in the
		 * previous round.
		 *
		 * perf_evlist__toggle_bkw_mmap ensures we never
		 * convert BKW_MMAP_EMPTY to BKW_MMAP_DATA_PENDING.
		 */
		if (trigger_is_hit(&switch_output_trigger) || done || draining)
			perf_evlist__toggle_bkw_mmap(rec->evlist, BKW_MMAP_DATA_PENDING);

		if (record__mmap_read_all(rec, false) < 0) {
			trigger_error(&auxtrace_snapshot_trigger);
			trigger_error(&switch_output_trigger);
			err = -1;
			goto out_child;
		}

		if (auxtrace_record__snapshot_started) {
			auxtrace_record__snapshot_started = 0;
			if (!trigger_is_error(&auxtrace_snapshot_trigger))
				record__read_auxtrace_snapshot(rec, false);
			if (trigger_is_error(&auxtrace_snapshot_trigger)) {
				pr_err("AUX area tracing snapshot failed\n");
				err = -1;
				goto out_child;
			}
		}

		if (trigger_is_hit(&switch_output_trigger)) {
			/*
			 * If switch_output_trigger is hit, the data in
			 * overwritable ring buffer should have been collected,
			 * so bkw_mmap_state should be set to BKW_MMAP_EMPTY.
			 *
			 * If SIGUSR2 is raised after or during record__mmap_read_all(),
			 * record__mmap_read_all() didn't collect data from the
			 * overwritable ring buffer. Read again.
			 */
			if (rec->evlist->bkw_mmap_state == BKW_MMAP_RUNNING)
				continue;
			trigger_ready(&switch_output_trigger);

			/*
			 * Reenable events in overwrite ring buffer after
			 * record__mmap_read_all(): we should have collected
			 * data from it.
			 */
			perf_evlist__toggle_bkw_mmap(rec->evlist, BKW_MMAP_RUNNING);

			if (!quiet)
				fprintf(stderr, "[ perf record: dump data: Woken up %ld times ]\n",
					waking);
			waking = 0;
			fd = record__switch_output(rec, false);
			if (fd < 0) {
				pr_err("Failed to switch to new file\n");
				trigger_error(&switch_output_trigger);
				err = fd;
				goto out_child;
			}

			/* re-arm the alarm */
			if (rec->switch_output.time)
				alarm(rec->switch_output.time);
		}

		if (hits == rec->samples) {
			if (done || draining)
				break;
			err = evlist__poll(rec->evlist, -1);
			/*
			 * Propagate error, only if there's any. Ignore positive
			 * number of returned events and interrupt error.
			 */
			if (err > 0 || (err < 0 && errno == EINTR))
				err = 0;
			waking++;

			if (evlist__filter_pollfd(rec->evlist, POLLERR | POLLHUP) == 0)
				draining = true;
		}

		/*
		 * When perf is starting the traced process, at the end events
		 * die with the process and we wait for that. Thus no need to
		 * disable events in this case.
		 */
		if (done && !disabled && !target__none(&opts->target)) {
			trigger_off(&auxtrace_snapshot_trigger);
			evlist__disable(rec->evlist);
			disabled = true;
		}
	}

	trigger_off(&auxtrace_snapshot_trigger);
	trigger_off(&switch_output_trigger);

	if (opts->auxtrace_snapshot_on_exit)
		record__auxtrace_snapshot_exit(rec);

	if (forks && workload_exec_errno) {
		char msg[STRERR_BUFSIZE];
		const char *emsg = str_error_r(workload_exec_errno, msg, sizeof(msg));
		pr_err("Workload failed: %s\n", emsg);
		err = -1;
		goto out_child;
	}

	if (!quiet)
		fprintf(stderr, "[ perf record: Woken up %ld times to write data ]\n", waking);

	if (target__none(&rec->opts.target))
		record__synthesize_workload(rec, true);

out_child:
	record__mmap_read_all(rec, true);
	record__aio_mmap_read_sync(rec);

	if (rec->session->bytes_transferred && rec->session->bytes_compressed) {
		ratio = (float)rec->session->bytes_transferred/(float)rec->session->bytes_compressed;
		session->header.env.comp_ratio = ratio + 0.5;
	}

	if (forks) {
		int exit_status;

		if (!child_finished)
			kill(rec->evlist->workload.pid, SIGTERM);

		wait(&exit_status);

		if (err < 0)
			status = err;
		else if (WIFEXITED(exit_status))
			status = WEXITSTATUS(exit_status);
		else if (WIFSIGNALED(exit_status))
			signr = WTERMSIG(exit_status);
	} else
		status = err;

	record__synthesize(rec, true);
	/* this will be recalculated during process_buildids() */
	rec->samples = 0;

	if (!err) {
		if (!rec->timestamp_filename) {
			record__finish_output(rec);
		} else {
			fd = record__switch_output(rec, true);
			if (fd < 0) {
				status = fd;
				goto out_delete_session;
			}
		}
	}

	perf_hooks__invoke_record_end();

	if (!err && !quiet) {
		char samples[128];
		const char *postfix = rec->timestamp_filename ?
1879 ".<timestamp>" : ""; 1880 1881 if (rec->samples && !rec->opts.full_auxtrace) 1882 scnprintf(samples, sizeof(samples), 1883 " (%" PRIu64 " samples)", rec->samples); 1884 else 1885 samples[0] = '\0'; 1886 1887 fprintf(stderr, "[ perf record: Captured and wrote %.3f MB %s%s%s", 1888 perf_data__size(data) / 1024.0 / 1024.0, 1889 data->path, postfix, samples); 1890 if (ratio) { 1891 fprintf(stderr, ", compressed (original %.3f MB, ratio is %.3f)", 1892 rec->session->bytes_transferred / 1024.0 / 1024.0, 1893 ratio); 1894 } 1895 fprintf(stderr, " ]\n"); 1896 } 1897 1898 out_delete_session: 1899 zstd_fini(&session->zstd_data); 1900 perf_session__delete(session); 1901 1902 if (!opts->no_bpf_event) 1903 perf_evlist__stop_sb_thread(rec->sb_evlist); 1904 return status; 1905 } 1906 1907 static void callchain_debug(struct callchain_param *callchain) 1908 { 1909 static const char *str[CALLCHAIN_MAX] = { "NONE", "FP", "DWARF", "LBR" }; 1910 1911 pr_debug("callchain: type %s\n", str[callchain->record_mode]); 1912 1913 if (callchain->record_mode == CALLCHAIN_DWARF) 1914 pr_debug("callchain: stack dump size %d\n", 1915 callchain->dump_size); 1916 } 1917 1918 int record_opts__parse_callchain(struct record_opts *record, 1919 struct callchain_param *callchain, 1920 const char *arg, bool unset) 1921 { 1922 int ret; 1923 callchain->enabled = !unset; 1924 1925 /* --no-call-graph */ 1926 if (unset) { 1927 callchain->record_mode = CALLCHAIN_NONE; 1928 pr_debug("callchain: disabled\n"); 1929 return 0; 1930 } 1931 1932 ret = parse_callchain_record_opt(arg, callchain); 1933 if (!ret) { 1934 /* Enable data address sampling for DWARF unwind. */ 1935 if (callchain->record_mode == CALLCHAIN_DWARF) 1936 record->sample_address = true; 1937 callchain_debug(callchain); 1938 } 1939 1940 return ret; 1941 } 1942 1943 int record_parse_callchain_opt(const struct option *opt, 1944 const char *arg, 1945 int unset) 1946 { 1947 return record_opts__parse_callchain(opt->value, &callchain_param, arg, unset); 1948 } 1949 1950 int record_callchain_opt(const struct option *opt, 1951 const char *arg __maybe_unused, 1952 int unset __maybe_unused) 1953 { 1954 struct callchain_param *callchain = opt->value; 1955 1956 callchain->enabled = true; 1957 1958 if (callchain->record_mode == CALLCHAIN_NONE) 1959 callchain->record_mode = CALLCHAIN_FP; 1960 1961 callchain_debug(callchain); 1962 return 0; 1963 } 1964 1965 static int perf_record_config(const char *var, const char *value, void *cb) 1966 { 1967 struct record *rec = cb; 1968 1969 if (!strcmp(var, "record.build-id")) { 1970 if (!strcmp(value, "cache")) 1971 rec->no_buildid_cache = false; 1972 else if (!strcmp(value, "no-cache")) 1973 rec->no_buildid_cache = true; 1974 else if (!strcmp(value, "skip")) 1975 rec->no_buildid = true; 1976 else 1977 return -1; 1978 return 0; 1979 } 1980 if (!strcmp(var, "record.call-graph")) { 1981 var = "call-graph.record-mode"; 1982 return perf_default_config(var, value, cb); 1983 } 1984 #ifdef HAVE_AIO_SUPPORT 1985 if (!strcmp(var, "record.aio")) { 1986 rec->opts.nr_cblocks = strtol(value, NULL, 0); 1987 if (!rec->opts.nr_cblocks) 1988 rec->opts.nr_cblocks = nr_cblocks_default; 1989 } 1990 #endif 1991 1992 return 0; 1993 } 1994 1995 struct clockid_map { 1996 const char *name; 1997 int clockid; 1998 }; 1999 2000 #define CLOCKID_MAP(n, c) \ 2001 { .name = n, .clockid = (c), } 2002 2003 #define CLOCKID_END { .name = NULL, } 2004 2005 2006 /* 2007 * Add the missing ones, we need to build on many distros... 
/*
 * Add the missing ones, we need to build on many distros...
 */
#ifndef CLOCK_MONOTONIC_RAW
#define CLOCK_MONOTONIC_RAW 4
#endif
#ifndef CLOCK_BOOTTIME
#define CLOCK_BOOTTIME 7
#endif
#ifndef CLOCK_TAI
#define CLOCK_TAI 11
#endif

static const struct clockid_map clockids[] = {
	/* available for all events, NMI safe */
	CLOCKID_MAP("monotonic", CLOCK_MONOTONIC),
	CLOCKID_MAP("monotonic_raw", CLOCK_MONOTONIC_RAW),

	/* available for some events */
	CLOCKID_MAP("realtime", CLOCK_REALTIME),
	CLOCKID_MAP("boottime", CLOCK_BOOTTIME),
	CLOCKID_MAP("tai", CLOCK_TAI),

	/* available for the lazy */
	CLOCKID_MAP("mono", CLOCK_MONOTONIC),
	CLOCKID_MAP("raw", CLOCK_MONOTONIC_RAW),
	CLOCKID_MAP("real", CLOCK_REALTIME),
	CLOCKID_MAP("boot", CLOCK_BOOTTIME),

	CLOCKID_END,
};

static int get_clockid_res(clockid_t clk_id, u64 *res_ns)
{
	struct timespec res;

	*res_ns = 0;
	if (!clock_getres(clk_id, &res))
		*res_ns = res.tv_nsec + res.tv_sec * NSEC_PER_SEC;
	else
		pr_warning("WARNING: Failed to determine specified clock resolution.\n");

	return 0;
}

static int parse_clockid(const struct option *opt, const char *str, int unset)
{
	struct record_opts *opts = (struct record_opts *)opt->value;
	const struct clockid_map *cm;
	const char *ostr = str;

	if (unset) {
		opts->use_clockid = 0;
		return 0;
	}

	/* no arg passed */
	if (!str)
		return 0;

	/* no setting it twice */
	if (opts->use_clockid)
		return -1;

	opts->use_clockid = true;

	/* if it's a number, we're done */
	if (sscanf(str, "%d", &opts->clockid) == 1)
		return get_clockid_res(opts->clockid, &opts->clockid_res_ns);

	/* allow a "CLOCK_" prefix to the name */
	if (!strncasecmp(str, "CLOCK_", 6))
		str += 6;

	for (cm = clockids; cm->name; cm++) {
		if (!strcasecmp(str, cm->name)) {
			opts->clockid = cm->clockid;
			return get_clockid_res(opts->clockid,
					       &opts->clockid_res_ns);
		}
	}

	opts->use_clockid = false;
	ui__warning("unknown clockid %s, check man page\n", ostr);
	return -1;
}

static int record__parse_affinity(const struct option *opt, const char *str, int unset)
{
	struct record_opts *opts = (struct record_opts *)opt->value;

	if (unset || !str)
		return 0;

	if (!strcasecmp(str, "node"))
		opts->affinity = PERF_AFFINITY_NODE;
	else if (!strcasecmp(str, "cpu"))
		opts->affinity = PERF_AFFINITY_CPU;

	return 0;
}

static int parse_output_max_size(const struct option *opt,
				 const char *str, int unset)
{
	unsigned long *s = (unsigned long *)opt->value;
	static struct parse_tag tags_size[] = {
		{ .tag  = 'B', .mult = 1       },
		{ .tag  = 'K', .mult = 1 << 10 },
		{ .tag  = 'M', .mult = 1 << 20 },
		{ .tag  = 'G', .mult = 1 << 30 },
		{ .tag  = 0 },
	};
	unsigned long val;

	if (unset) {
		*s = 0;
		return 0;
	}

	val = parse_tag_value(str, tags_size);
	if (val != (unsigned long) -1) {
		*s = val;
		return 0;
	}

	return -1;
}
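/*
 * Illustrative note, not part of the original file: record__parse_mmap_pages()
 * below handles -m/--mmap-pages, which takes either a single value for the
 * data mmap ("-m 512") or a "data,aux" pair ("-m 512,64") where the second
 * value sizes the AUX area mmap; each value is a page count or a size with a
 * B/K/M/G suffix, as parsed by __perf_evlist__parse_mmap_pages().
 */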
2147 	s = strdup(str);
2148 	if (!s)
2149 		return -ENOMEM;
2150
2151 	p = strchr(s, ',');
2152 	if (p)
2153 		*p = '\0';
2154
2155 	if (*s) {
2156 		ret = __perf_evlist__parse_mmap_pages(&mmap_pages, s);
2157 		if (ret)
2158 			goto out_free;
2159 		opts->mmap_pages = mmap_pages;
2160 	}
2161
2162 	if (!p) {
2163 		ret = 0;
2164 		goto out_free;
2165 	}
2166
2167 	ret = __perf_evlist__parse_mmap_pages(&mmap_pages, p + 1);
2168 	if (ret)
2169 		goto out_free;
2170
2171 	opts->auxtrace_mmap_pages = mmap_pages;
2172
2173 out_free:
2174 	free(s);
2175 	return ret;
2176 }
2177
2178 static void switch_output_size_warn(struct record *rec)
2179 {
2180 	u64 wakeup_size = evlist__mmap_size(rec->opts.mmap_pages);
2181 	struct switch_output *s = &rec->switch_output;
2182
2183 	wakeup_size /= 2;
2184
2185 	if (s->size < wakeup_size) {
2186 		char buf[100];
2187
2188 		unit_number__scnprintf(buf, sizeof(buf), wakeup_size);
2189 		pr_warning("WARNING: switch-output data size is lower than "
2190 			   "the wakeup kernel buffer size (%s), "
2191 			   "expect bigger perf.data sizes\n", buf);
2192 	}
2193 }
2194
2195 static int switch_output_setup(struct record *rec)
2196 {
2197 	struct switch_output *s = &rec->switch_output;
2198 	static struct parse_tag tags_size[] = {
2199 		{ .tag = 'B', .mult = 1 },
2200 		{ .tag = 'K', .mult = 1 << 10 },
2201 		{ .tag = 'M', .mult = 1 << 20 },
2202 		{ .tag = 'G', .mult = 1 << 30 },
2203 		{ .tag = 0 },
2204 	};
2205 	static struct parse_tag tags_time[] = {
2206 		{ .tag = 's', .mult = 1 },
2207 		{ .tag = 'm', .mult = 60 },
2208 		{ .tag = 'h', .mult = 60*60 },
2209 		{ .tag = 'd', .mult = 60*60*24 },
2210 		{ .tag = 0 },
2211 	};
2212 	unsigned long val;
2213
2214 	/*
2215 	 * If we're using --switch-output-events, then we imply
2216 	 * --switch-output=signal, as we'll send a SIGUSR2 from the side band
2217 	 * thread to its parent.
2218 	 */
2219 	if (rec->switch_output_event_set)
2220 		goto do_signal;
2221
2222 	if (!s->set)
2223 		return 0;
2224
2225 	if (!strcmp(s->str, "signal")) {
2226 do_signal:
2227 		s->signal = true;
2228 		pr_debug("switch-output with SIGUSR2 signal\n");
2229 		goto enabled;
2230 	}
2231
2232 	val = parse_tag_value(s->str, tags_size);
2233 	if (val != (unsigned long) -1) {
2234 		s->size = val;
2235 		pr_debug("switch-output with %s size threshold\n", s->str);
2236 		goto enabled;
2237 	}
2238
2239 	val = parse_tag_value(s->str, tags_time);
2240 	if (val != (unsigned long) -1) {
2241 		s->time = val;
2242 		pr_debug("switch-output with %s time threshold (%lu seconds)\n",
2243 			 s->str, s->time);
2244 		goto enabled;
2245 	}
2246
2247 	return -1;
2248
2249 enabled:
2250 	rec->timestamp_filename = true;
2251 	s->enabled = true;
2252
2253 	if (s->size && !rec->opts.no_buffering)
2254 		switch_output_size_warn(rec);
2255
2256 	return 0;
2257 }
2258
2259 static const char * const __record_usage[] = {
2260 	"perf record [<options>] [<command>]",
2261 	"perf record [<options>] -- <command> [<options>]",
2262 	NULL
2263 };
2264 const char * const *record_usage = __record_usage;
2265
2266 static int build_id__process_mmap(struct perf_tool *tool, union perf_event *event,
2267 				  struct perf_sample *sample, struct machine *machine)
2268 {
2269 	/*
2270 	 * We already have the kernel maps, put in place via perf_session__create_kernel_maps(),
2271 	 * so there is no need to add them twice.
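	 * Only mmap events for user-space mappings are forwarded below;
	 * kernel mappings are dropped here.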
2272 	 */
2273 	if (!(event->header.misc & PERF_RECORD_MISC_USER))
2274 		return 0;
2275 	return perf_event__process_mmap(tool, event, sample, machine);
2276 }
2277
2278 static int build_id__process_mmap2(struct perf_tool *tool, union perf_event *event,
2279 				   struct perf_sample *sample, struct machine *machine)
2280 {
2281 	/*
2282 	 * We already have the kernel maps, put in place via perf_session__create_kernel_maps(),
2283 	 * so there is no need to add them twice.
2284 	 */
2285 	if (!(event->header.misc & PERF_RECORD_MISC_USER))
2286 		return 0;
2287
2288 	return perf_event__process_mmap2(tool, event, sample, machine);
2289 }
2290
2291 /*
2292  * XXX Ideally this would be local to cmd_record() and passed to a record__new(),
2293  * because we need to have access to it in record__exit(), which is called
2294  * after cmd_record() exits; but since record_options needs to be accessible to
2295  * builtin-script, leave it here.
2296  *
2297  * At least we don't touch it in all the other functions here directly.
2298  *
2299  * Just say no to tons of global variables, sigh.
2300  */
2301 static struct record record = {
2302 	.opts = {
2303 		.sample_time = true,
2304 		.mmap_pages = UINT_MAX,
2305 		.user_freq = UINT_MAX,
2306 		.user_interval = ULLONG_MAX,
2307 		.freq = 4000,
2308 		.target = {
2309 			.uses_mmap = true,
2310 			.default_per_cpu = true,
2311 		},
2312 		.mmap_flush = MMAP_FLUSH_DEFAULT,
2313 		.nr_threads_synthesize = 1,
2314 	},
2315 	.tool = {
2316 		.sample = process_sample_event,
2317 		.fork = perf_event__process_fork,
2318 		.exit = perf_event__process_exit,
2319 		.comm = perf_event__process_comm,
2320 		.namespaces = perf_event__process_namespaces,
2321 		.mmap = build_id__process_mmap,
2322 		.mmap2 = build_id__process_mmap2,
2323 		.ordered_events = true,
2324 	},
2325 };
2326
2327 const char record_callchain_help[] = CALLCHAIN_RECORD_HELP
2328 	"\n\t\t\t\tDefault: fp";
2329
2330 static bool dry_run;
2331
2332 /*
2333  * XXX Will stay a global variable till we fix builtin-script.c to stop messing
2334  * with it and switch to using the library functions in perf_evlist that came
2335  * from builtin-record.c, i.e. use record_opts,
2336  * perf_evlist__prepare_workload, etc. instead of fork+exec'ing 'perf record',
2337  * using pipes, etc.
2338  */
2339 static struct option __record_options[] = {
2340 	OPT_CALLBACK('e', "event", &record.evlist, "event",
2341 		     "event selector. use 'perf list' to list available events",
2342 		     parse_events_option),
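	/*
	 * Purely illustrative invocations of the event selector above
	 * (the workload name is a placeholder):
	 *
	 *   perf record -e cycles -- ./workload
	 *   perf record -e sched:sched_switch -a -- sleep 1
	 *
	 * Event names are resolved by parse_events_option().
	 */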
2343 	OPT_CALLBACK(0, "filter", &record.evlist, "filter",
2344 		     "event filter", parse_filter),
2345 	OPT_CALLBACK_NOOPT(0, "exclude-perf", &record.evlist,
2346 			   NULL, "don't record events from perf itself",
2347 			   exclude_perf),
2348 	OPT_STRING('p', "pid", &record.opts.target.pid, "pid",
2349 		   "record events on existing process id"),
2350 	OPT_STRING('t', "tid", &record.opts.target.tid, "tid",
2351 		   "record events on existing thread id"),
2352 	OPT_INTEGER('r', "realtime", &record.realtime_prio,
2353 		    "collect data with this RT SCHED_FIFO priority"),
2354 	OPT_BOOLEAN(0, "no-buffering", &record.opts.no_buffering,
2355 		    "collect data without buffering"),
2356 	OPT_BOOLEAN('R', "raw-samples", &record.opts.raw_samples,
2357 		    "collect raw sample records from all opened counters"),
2358 	OPT_BOOLEAN('a', "all-cpus", &record.opts.target.system_wide,
2359 		    "system-wide collection from all CPUs"),
2360 	OPT_STRING('C', "cpu", &record.opts.target.cpu_list, "cpu",
2361 		   "list of cpus to monitor"),
2362 	OPT_U64('c', "count", &record.opts.user_interval, "event period to sample"),
2363 	OPT_STRING('o', "output", &record.data.path, "file",
2364 		   "output file name"),
2365 	OPT_BOOLEAN_SET('i', "no-inherit", &record.opts.no_inherit,
2366 			&record.opts.no_inherit_set,
2367 			"child tasks do not inherit counters"),
2368 	OPT_BOOLEAN(0, "tail-synthesize", &record.opts.tail_synthesize,
2369 		    "synthesize non-sample events at the end of output"),
2370 	OPT_BOOLEAN(0, "overwrite", &record.opts.overwrite, "use overwrite mode"),
2371 	OPT_BOOLEAN(0, "no-bpf-event", &record.opts.no_bpf_event, "do not record bpf events"),
2372 	OPT_BOOLEAN(0, "strict-freq", &record.opts.strict_freq,
2373 		    "Fail if the specified frequency can't be used"),
2374 	OPT_CALLBACK('F', "freq", &record.opts, "freq or 'max'",
2375 		     "profile at this frequency",
2376 		     record__parse_freq),
2377 	OPT_CALLBACK('m', "mmap-pages", &record.opts, "pages[,pages]",
2378 		     "number of mmap data pages and AUX area tracing mmap pages",
2379 		     record__parse_mmap_pages),
2380 	OPT_CALLBACK(0, "mmap-flush", &record.opts, "number",
2381 		     "Minimum number of bytes that is extracted from mmap data pages (default: 1)",
2382 		     record__mmap_flush_parse),
2383 	OPT_BOOLEAN(0, "group", &record.opts.group,
2384 		    "put the counters into a counter group"),
2385 	OPT_CALLBACK_NOOPT('g', NULL, &callchain_param,
2386 			   NULL, "enables call-graph recording",
2387 			   &record_callchain_opt),
2388 	OPT_CALLBACK(0, "call-graph", &record.opts,
2389 		     "record_mode[,record_size]", record_callchain_help,
2390 		     &record_parse_callchain_opt),
2391 	OPT_INCR('v', "verbose", &verbose,
2392 		 "be more verbose (show counter open errors, etc)"),
2393 	OPT_BOOLEAN('q', "quiet", &quiet, "don't print any message"),
2394 	OPT_BOOLEAN('s', "stat", &record.opts.inherit_stat,
2395 		    "per thread counts"),
2396 	OPT_BOOLEAN('d', "data", &record.opts.sample_address, "Record the sample addresses"),
2397 	OPT_BOOLEAN(0, "phys-data", &record.opts.sample_phys_addr,
2398 		    "Record the sample physical addresses"),
2399 	OPT_BOOLEAN(0, "sample-cpu", &record.opts.sample_cpu, "Record the sample cpu"),
2400 	OPT_BOOLEAN_SET('T', "timestamp", &record.opts.sample_time,
2401 			&record.opts.sample_time_set,
2402 			"Record the sample timestamps"),
2403 	OPT_BOOLEAN_SET('P', "period", &record.opts.period, &record.opts.period_set,
2404 			"Record the sample period"),
2405 	OPT_BOOLEAN('n', "no-samples", &record.opts.no_samples,
2406 		    "don't sample"),
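	/*
	 * Roughly: -N/--no-buildid-cache still records build-ids into
	 * perf.data but skips updating the on-disk build-id cache, while
	 * -B/--no-buildid skips collecting them altogether (faster, but
	 * 'perf report' may then fail to match samples to the right
	 * binaries if they change later).
	 */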
2407 	OPT_BOOLEAN_SET('N', "no-buildid-cache", &record.no_buildid_cache,
2408 			&record.no_buildid_cache_set,
2409 			"do not update the buildid cache"),
2410 	OPT_BOOLEAN_SET('B', "no-buildid", &record.no_buildid,
2411 			&record.no_buildid_set,
2412 			"do not collect buildids in perf.data"),
2413 	OPT_CALLBACK('G', "cgroup", &record.evlist, "name",
2414 		     "monitor event in cgroup name only",
2415 		     parse_cgroups),
2416 	OPT_UINTEGER('D', "delay", &record.opts.initial_delay,
2417 		     "ms to wait before starting measurement after program start"),
2418 	OPT_BOOLEAN(0, "kcore", &record.opts.kcore, "copy /proc/kcore"),
2419 	OPT_STRING('u', "uid", &record.opts.target.uid_str, "user",
2420 		   "user to profile"),
2421
2422 	OPT_CALLBACK_NOOPT('b', "branch-any", &record.opts.branch_stack,
2423 			   "branch any", "sample any taken branches",
2424 			   parse_branch_stack),
2425
2426 	OPT_CALLBACK('j', "branch-filter", &record.opts.branch_stack,
2427 		     "branch filter mask", "branch stack filter modes",
2428 		     parse_branch_stack),
2429 	OPT_BOOLEAN('W', "weight", &record.opts.sample_weight,
2430 		    "sample by weight (on special events only)"),
2431 	OPT_BOOLEAN(0, "transaction", &record.opts.sample_transaction,
2432 		    "sample transaction flags (special events only)"),
2433 	OPT_BOOLEAN(0, "per-thread", &record.opts.target.per_thread,
2434 		    "use per-thread mmaps"),
2435 	OPT_CALLBACK_OPTARG('I', "intr-regs", &record.opts.sample_intr_regs, NULL, "any register",
2436 			    "sample selected machine registers on interrupt,"
2437 			    " use '-I?' to list register names", parse_intr_regs),
2438 	OPT_CALLBACK_OPTARG(0, "user-regs", &record.opts.sample_user_regs, NULL, "any register",
2439 			    "sample selected machine registers in user space,"
2440 			    " use '--user-regs=?' to list register names", parse_user_regs),
2441 	OPT_BOOLEAN(0, "running-time", &record.opts.running_time,
2442 		    "Record running/enabled time of read (:S) events"),
2443 	OPT_CALLBACK('k', "clockid", &record.opts,
2444 		     "clockid", "clockid to use for events, see clock_gettime()",
2445 		     parse_clockid),
2446 	OPT_STRING_OPTARG('S', "snapshot", &record.opts.auxtrace_snapshot_opts,
2447 			  "opts", "AUX area tracing Snapshot Mode", ""),
2448 	OPT_STRING_OPTARG(0, "aux-sample", &record.opts.auxtrace_sample_opts,
2449 			  "opts", "sample AUX area", ""),
2450 	OPT_UINTEGER(0, "proc-map-timeout", &proc_map_timeout,
2451 		     "per thread proc mmap processing timeout in ms"),
2452 	OPT_BOOLEAN(0, "namespaces", &record.opts.record_namespaces,
2453 		    "Record namespaces events"),
2454 	OPT_BOOLEAN(0, "all-cgroups", &record.opts.record_cgroup,
2455 		    "Record cgroup events"),
2456 	OPT_BOOLEAN(0, "switch-events", &record.opts.record_switch_events,
2457 		    "Record context switch events"),
2458 	OPT_BOOLEAN_FLAG(0, "all-kernel", &record.opts.all_kernel,
2459 			 "Configure all used events to run in kernel space.",
2460 			 PARSE_OPT_EXCLUSIVE),
2461 	OPT_BOOLEAN_FLAG(0, "all-user", &record.opts.all_user,
2462 			 "Configure all used events to run in user space.",
2463 			 PARSE_OPT_EXCLUSIVE),
2464 	OPT_BOOLEAN(0, "kernel-callchains", &record.opts.kernel_callchains,
2465 		    "collect kernel callchains"),
2466 	OPT_BOOLEAN(0, "user-callchains", &record.opts.user_callchains,
2467 		    "collect user callchains"),
2468 	OPT_STRING(0, "clang-path", &llvm_param.clang_path, "clang path",
2469 		   "clang binary to use for compiling BPF scriptlets"),
2470 	OPT_STRING(0, "clang-opt", &llvm_param.clang_opt, "clang options",
2471 		   "options passed to clang when compiling BPF scriptlets"),
2472 	OPT_STRING(0, "vmlinux", &symbol_conf.vmlinux_name,
2473 		   "file", "vmlinux pathname"),
2474 	OPT_BOOLEAN(0, "buildid-all", &record.buildid_all,
2475 		    "Record build-id of all DSOs regardless of hits"),
hits"), 2476 OPT_BOOLEAN(0, "timestamp-filename", &record.timestamp_filename, 2477 "append timestamp to output filename"), 2478 OPT_BOOLEAN(0, "timestamp-boundary", &record.timestamp_boundary, 2479 "Record timestamp boundary (time of first/last samples)"), 2480 OPT_STRING_OPTARG_SET(0, "switch-output", &record.switch_output.str, 2481 &record.switch_output.set, "signal or size[BKMG] or time[smhd]", 2482 "Switch output when receiving SIGUSR2 (signal) or cross a size or time threshold", 2483 "signal"), 2484 OPT_CALLBACK_SET(0, "switch-output-event", &record.sb_evlist, &record.switch_output_event_set, "switch output event", 2485 "switch output event selector. use 'perf list' to list available events", 2486 parse_events_option_new_evlist), 2487 OPT_INTEGER(0, "switch-max-files", &record.switch_output.num_files, 2488 "Limit number of switch output generated files"), 2489 OPT_BOOLEAN(0, "dry-run", &dry_run, 2490 "Parse options then exit"), 2491 #ifdef HAVE_AIO_SUPPORT 2492 OPT_CALLBACK_OPTARG(0, "aio", &record.opts, 2493 &nr_cblocks_default, "n", "Use <n> control blocks in asynchronous trace writing mode (default: 1, max: 4)", 2494 record__aio_parse), 2495 #endif 2496 OPT_CALLBACK(0, "affinity", &record.opts, "node|cpu", 2497 "Set affinity mask of trace reading thread to NUMA node cpu mask or cpu of processed mmap buffer", 2498 record__parse_affinity), 2499 #ifdef HAVE_ZSTD_SUPPORT 2500 OPT_CALLBACK_OPTARG('z', "compression-level", &record.opts, &comp_level_default, 2501 "n", "Compressed records using specified level (default: 1 - fastest compression, 22 - greatest compression)", 2502 record__parse_comp_level), 2503 #endif 2504 OPT_CALLBACK(0, "max-size", &record.output_max_size, 2505 "size", "Limit the maximum size of the output file", parse_output_max_size), 2506 OPT_UINTEGER(0, "num-thread-synthesize", 2507 &record.opts.nr_threads_synthesize, 2508 "number of threads to run for event synthesis"), 2509 OPT_END() 2510 }; 2511 2512 struct option *record_options = __record_options; 2513 2514 int cmd_record(int argc, const char **argv) 2515 { 2516 int err; 2517 struct record *rec = &record; 2518 char errbuf[BUFSIZ]; 2519 2520 setlocale(LC_ALL, ""); 2521 2522 #ifndef HAVE_LIBBPF_SUPPORT 2523 # define set_nobuild(s, l, c) set_option_nobuild(record_options, s, l, "NO_LIBBPF=1", c) 2524 set_nobuild('\0', "clang-path", true); 2525 set_nobuild('\0', "clang-opt", true); 2526 # undef set_nobuild 2527 #endif 2528 2529 #ifndef HAVE_BPF_PROLOGUE 2530 # if !defined (HAVE_DWARF_SUPPORT) 2531 # define REASON "NO_DWARF=1" 2532 # elif !defined (HAVE_LIBBPF_SUPPORT) 2533 # define REASON "NO_LIBBPF=1" 2534 # else 2535 # define REASON "this architecture doesn't support BPF prologue" 2536 # endif 2537 # define set_nobuild(s, l, c) set_option_nobuild(record_options, s, l, REASON, c) 2538 set_nobuild('\0', "vmlinux", true); 2539 # undef set_nobuild 2540 # undef REASON 2541 #endif 2542 2543 rec->opts.affinity = PERF_AFFINITY_SYS; 2544 2545 rec->evlist = evlist__new(); 2546 if (rec->evlist == NULL) 2547 return -ENOMEM; 2548 2549 err = perf_config(perf_record_config, rec); 2550 if (err) 2551 return err; 2552 2553 argc = parse_options(argc, argv, record_options, record_usage, 2554 PARSE_OPT_STOP_AT_NON_OPTION); 2555 if (quiet) 2556 perf_quiet_option(); 2557 2558 /* Make system wide (-a) the default target. 
2559 	if (!argc && target__none(&rec->opts.target))
2560 		rec->opts.target.system_wide = true;
2561
2562 	if (nr_cgroups && !rec->opts.target.system_wide) {
2563 		usage_with_options_msg(record_usage, record_options,
2564 			"cgroup monitoring only available in system-wide mode");
2565
2566 	}
2567
2568 	if (rec->opts.kcore)
2569 		rec->data.is_dir = true;
2570
2571 	if (rec->opts.comp_level != 0) {
2572 		pr_debug("Compression enabled, disabling build id collection at the end of the session.\n");
2573 		rec->no_buildid = true;
2574 	}
2575
2576 	if (rec->opts.record_switch_events &&
2577 	    !perf_can_record_switch_events()) {
2578 		ui__error("kernel does not support recording context switch events\n");
2579 		parse_options_usage(record_usage, record_options, "switch-events", 0);
2580 		return -EINVAL;
2581 	}
2582
2583 	if (switch_output_setup(rec)) {
2584 		parse_options_usage(record_usage, record_options, "switch-output", 0);
2585 		return -EINVAL;
2586 	}
2587
2588 	if (rec->switch_output.time) {
2589 		signal(SIGALRM, alarm_sig_handler);
2590 		alarm(rec->switch_output.time);
2591 	}
2592
2593 	if (rec->switch_output.num_files) {
2594 		rec->switch_output.filenames = calloc(rec->switch_output.num_files,
2595 						      sizeof(char *));
2596 		if (!rec->switch_output.filenames)
2597 			return -ENOMEM;
2598 	}
2599
2600 	/*
2601 	 * Allow aliases to facilitate the lookup of symbols for address
2602 	 * filters. Refer to auxtrace_parse_filters().
2603 	 */
2604 	symbol_conf.allow_aliases = true;
2605
2606 	symbol__init(NULL);
2607
2608 	if (rec->opts.affinity != PERF_AFFINITY_SYS) {
2609 		rec->affinity_mask.nbits = cpu__max_cpu();
2610 		rec->affinity_mask.bits = bitmap_alloc(rec->affinity_mask.nbits);
2611 		if (!rec->affinity_mask.bits) {
2612 			pr_err("Failed to allocate thread mask for %zd cpus\n", rec->affinity_mask.nbits);
2613 			return -ENOMEM;
2614 		}
2615 		pr_debug2("thread mask[%zd]: empty\n", rec->affinity_mask.nbits);
2616 	}
2617
2618 	err = record__auxtrace_init(rec);
2619 	if (err)
2620 		goto out;
2621
2622 	if (dry_run)
2623 		goto out;
2624
2625 	err = bpf__setup_stdout(rec->evlist);
2626 	if (err) {
2627 		bpf__strerror_setup_stdout(rec->evlist, err, errbuf, sizeof(errbuf));
2628 		pr_err("ERROR: Setup BPF stdout failed: %s\n",
2629 		       errbuf);
2630 		goto out;
2631 	}
2632
2633 	err = -ENOMEM;
2634
2635 	if (rec->no_buildid_cache || rec->no_buildid) {
2636 		disable_buildid_cache();
2637 	} else if (rec->switch_output.enabled) {
2638 		/*
2639 		 * In 'perf record --switch-output', disable buildid
2640 		 * generation by default to reduce data file switching
2641 		 * overhead. Still generate buildids if they are explicitly
2642 		 * required, using
2643 		 *
2644 		 *  perf record --switch-output --no-no-buildid \
2645 		 *		--no-no-buildid-cache
2646 		 *
2647 		 * The code below is equivalent to:
2648 		 *
2649 		 * if ((rec->no_buildid || !rec->no_buildid_set) &&
2650 		 *     (rec->no_buildid_cache || !rec->no_buildid_cache_set))
2651 		 *         disable_buildid_cache();
2652 		 */
2653 		bool disable = true;
2654
2655 		if (rec->no_buildid_set && !rec->no_buildid)
2656 			disable = false;
2657 		if (rec->no_buildid_cache_set && !rec->no_buildid_cache)
2658 			disable = false;
2659 		if (disable) {
2660 			rec->no_buildid = true;
2661 			rec->no_buildid_cache = true;
2662 			disable_buildid_cache();
2663 		}
2664 	}
2665
2666 	if (record.opts.overwrite)
2667 		record.opts.tail_synthesize = true;
2668
2669 	if (rec->evlist->core.nr_entries == 0 &&
2670 	    __perf_evlist__add_default(rec->evlist, !record.opts.no_samples) < 0) {
2671 		pr_err("Not enough memory for event selector list\n");
2672 		goto out;
2673 	}
2674
2675 	if (rec->opts.target.tid && !rec->opts.no_inherit_set)
2676 		rec->opts.no_inherit = true;
2677
2678 	err = target__validate(&rec->opts.target);
2679 	if (err) {
2680 		target__strerror(&rec->opts.target, err, errbuf, BUFSIZ);
2681 		ui__warning("%s\n", errbuf);
2682 	}
2683
2684 	err = target__parse_uid(&rec->opts.target);
2685 	if (err) {
2686 		int saved_errno = errno;
2687
2688 		target__strerror(&rec->opts.target, err, errbuf, BUFSIZ);
2689 		ui__error("%s", errbuf);
2690
2691 		err = -saved_errno;
2692 		goto out;
2693 	}
2694
2695 	/* Enable ignoring missing threads when -u/-p option is defined. */
2696 	rec->opts.ignore_missing_thread = rec->opts.target.uid != UINT_MAX || rec->opts.target.pid;
2697
2698 	err = -ENOMEM;
2699 	if (perf_evlist__create_maps(rec->evlist, &rec->opts.target) < 0)
2700 		usage_with_options(record_usage, record_options);
2701
2702 	err = auxtrace_record__options(rec->itr, rec->evlist, &rec->opts);
2703 	if (err)
2704 		goto out;
2705
2706 	/*
2707 	 * We take all buildids when the file contains
2708 	 * AUX area tracing data, because we do not decode the
2709 	 * trace: that would take too long.
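	 * (The AUX trace, e.g. Intel PT, is only decoded later by the
	 * reporting tools, so at record time we cannot tell which DSOs
	 * were actually hit.)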
2710 	 */
2711 	if (rec->opts.full_auxtrace)
2712 		rec->buildid_all = true;
2713
2714 	if (record_opts__config(&rec->opts)) {
2715 		err = -EINVAL;
2716 		goto out;
2717 	}
2718
2719 	if (rec->opts.nr_cblocks > nr_cblocks_max)
2720 		rec->opts.nr_cblocks = nr_cblocks_max;
2721 	pr_debug("nr_cblocks: %d\n", rec->opts.nr_cblocks);
2722
2723 	pr_debug("affinity: %s\n", affinity_tags[rec->opts.affinity]);
2724 	pr_debug("mmap flush: %d\n", rec->opts.mmap_flush);
2725
2726 	if (rec->opts.comp_level > comp_level_max)
2727 		rec->opts.comp_level = comp_level_max;
2728 	pr_debug("comp level: %d\n", rec->opts.comp_level);
2729
2730 	err = __cmd_record(&record, argc, argv);
2731 out:
2732 	bitmap_free(rec->affinity_mask.bits);
2733 	evlist__delete(rec->evlist);
2734 	symbol__exit();
2735 	auxtrace_record__free(rec->itr);
2736 	return err;
2737 }
2738
2739 static void snapshot_sig_handler(int sig __maybe_unused)
2740 {
2741 	struct record *rec = &record;
2742
2743 	if (trigger_is_ready(&auxtrace_snapshot_trigger)) {
2744 		trigger_hit(&auxtrace_snapshot_trigger);
2745 		auxtrace_record__snapshot_started = 1;
2746 		if (auxtrace_record__snapshot_start(record.itr))
2747 			trigger_error(&auxtrace_snapshot_trigger);
2748 	}
2749
2750 	if (switch_output_signal(rec))
2751 		trigger_hit(&switch_output_trigger);
2752 }
2753
2754 static void alarm_sig_handler(int sig __maybe_unused)
2755 {
2756 	struct record *rec = &record;
2757
2758 	if (switch_output_time(rec))
2759 		trigger_hit(&switch_output_trigger);
2760 }
2761
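/*
 * Illustrative note (not part of the original code): with a
 * '--switch-output[=signal]' session running, sending SIGUSR2, e.g.
 *
 *   kill -USR2 $(pgrep -f 'perf record')
 *
 * makes snapshot_sig_handler() fire the switch-output trigger, so the
 * current perf.data file is closed and recording continues into a new,
 * timestamp-suffixed file. In an AUX area snapshot session
 * ('perf record -S ...'), the same signal also requests an auxtrace
 * snapshot via auxtrace_record__snapshot_start().
 */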