xref: /openbmc/linux/tools/perf/builtin-record.c (revision 68198dca)
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * builtin-record.c
4  *
5  * Builtin record command: Record the profile of a workload
6  * (or a CPU, or a PID) into the perf.data output file - for
7  * later analysis via perf report.
8  */
9 #include "builtin.h"
10 
11 #include "perf.h"
12 
13 #include "util/build-id.h"
14 #include "util/util.h"
15 #include <subcmd/parse-options.h>
16 #include "util/parse-events.h"
17 #include "util/config.h"
18 
19 #include "util/callchain.h"
20 #include "util/cgroup.h"
21 #include "util/header.h"
22 #include "util/event.h"
23 #include "util/evlist.h"
24 #include "util/evsel.h"
25 #include "util/debug.h"
26 #include "util/drv_configs.h"
27 #include "util/session.h"
28 #include "util/tool.h"
29 #include "util/symbol.h"
30 #include "util/cpumap.h"
31 #include "util/thread_map.h"
32 #include "util/data.h"
33 #include "util/perf_regs.h"
34 #include "util/auxtrace.h"
35 #include "util/tsc.h"
36 #include "util/parse-branch-options.h"
37 #include "util/parse-regs-options.h"
38 #include "util/llvm-utils.h"
39 #include "util/bpf-loader.h"
40 #include "util/trigger.h"
41 #include "util/perf-hooks.h"
42 #include "util/time-utils.h"
43 #include "util/units.h"
44 #include "asm/bug.h"
45 
46 #include <errno.h>
47 #include <inttypes.h>
48 #include <poll.h>
49 #include <unistd.h>
50 #include <sched.h>
51 #include <signal.h>
52 #include <sys/mman.h>
53 #include <sys/wait.h>
54 #include <asm/bug.h>
55 #include <linux/time64.h>
56 
/*
 * Parsed --switch-output configuration: the output file can be rotated
 * on SIGUSR2 (signal mode), once a byte threshold is crossed (size
 * mode) or periodically via alarm() (time mode).
 */
struct switch_output {
	bool		 enabled;	/* some rotation mode was requested */
	bool		 signal;	/* rotate on SIGUSR2 */
	unsigned long	 size;		/* rotate after this many bytes, 0 = off */
	unsigned long	 time;		/* rotate every this many seconds (alarm), 0 = off */
	const char	*str;		/* raw option argument as given by the user */
	bool		 set;		/* option was present on the command line */
};
65 
/*
 * Per-invocation state of 'perf record': processing callbacks, parsed
 * options, the output file and the session/evlist being recorded.
 */
struct record {
	struct perf_tool	tool;		/* event-processing callbacks */
	struct record_opts	opts;		/* parsed command-line options */
	u64			bytes_written;	/* payload written so far; drives --switch-output size */
	struct perf_data	data;		/* the perf.data output */
	struct auxtrace_record	*itr;		/* AUX area tracing state, NULL if unused */
	struct perf_evlist	*evlist;	/* events being recorded */
	struct perf_session	*session;	/* session owning the output header */
	const char		*progname;	/* argv[0], kept for diagnostics */
	int			realtime_prio;	/* SCHED_FIFO priority, 0 = don't change */
	bool			no_buildid;
	bool			no_buildid_set;
	bool			no_buildid_cache;
	bool			no_buildid_cache_set;
	bool			buildid_all;	/* mark all DSOs hit, skip per-sample processing */
	bool			timestamp_filename;
	struct switch_output	switch_output;	/* --switch-output configuration */
	unsigned long long	samples;	/* samples seen this round; poll heuristic in main loop */
};
85 
/* Set from the SIGUSR2 handler when an AUX area snapshot was requested. */
static volatile int auxtrace_record__snapshot_started;
/* Trigger driving AUX area snapshot collection in the main loop. */
static DEFINE_TRIGGER(auxtrace_snapshot_trigger);
/* Trigger driving --switch-output file rotation in the main loop. */
static DEFINE_TRIGGER(switch_output_trigger);
89 
90 static bool switch_output_signal(struct record *rec)
91 {
92 	return rec->switch_output.signal &&
93 	       trigger_is_ready(&switch_output_trigger);
94 }
95 
96 static bool switch_output_size(struct record *rec)
97 {
98 	return rec->switch_output.size &&
99 	       trigger_is_ready(&switch_output_trigger) &&
100 	       (rec->bytes_written >= rec->switch_output.size);
101 }
102 
103 static bool switch_output_time(struct record *rec)
104 {
105 	return rec->switch_output.time &&
106 	       trigger_is_ready(&switch_output_trigger);
107 }
108 
/*
 * Append @size bytes at @bf to the session output and account them.
 * Fires the switch-output trigger once the --switch-output=<size>
 * threshold is crossed.  Returns 0 on success, -1 on write failure.
 */
static int record__write(struct record *rec, void *bf, size_t size)
{
	if (perf_data__write(rec->session->data, bf, size) < 0) {
		pr_err("failed to write perf data, error: %m\n");
		return -1;
	}

	rec->bytes_written += size;

	/* size-based rotation is detected here, on the write path */
	if (switch_output_size(rec))
		trigger_hit(&switch_output_trigger);

	return 0;
}
123 
124 static int process_synthesized_event(struct perf_tool *tool,
125 				     union perf_event *event,
126 				     struct perf_sample *sample __maybe_unused,
127 				     struct machine *machine __maybe_unused)
128 {
129 	struct record *rec = container_of(tool, struct record, tool);
130 	return record__write(rec, event, event->header.size);
131 }
132 
133 static int record__pushfn(void *to, void *bf, size_t size)
134 {
135 	struct record *rec = to;
136 
137 	rec->samples++;
138 	return record__write(rec, bf, size);
139 }
140 
/* Flags set from signal handlers and polled by the main record loop. */
static volatile int done;		/* request termination of the record loop */
static volatile int signr = -1;		/* fatal signal to re-raise at exit, -1 = none */
static volatile int child_finished;	/* workload child exited (SIGCHLD seen) */
144 
145 static void sig_handler(int sig)
146 {
147 	if (sig == SIGCHLD)
148 		child_finished = 1;
149 	else
150 		signr = sig;
151 
152 	done = 1;
153 }
154 
/*
 * SIGSEGV handler: give registered perf hooks a chance to recover
 * their state, then dump a stack trace via the common helper.
 */
static void sigsegv_handler(int sig)
{
	perf_hooks__recover();
	sighandler_dump_stack(sig);
}
160 
161 static void record__sig_exit(void)
162 {
163 	if (signr == -1)
164 		return;
165 
166 	signal(signr, SIG_DFL);
167 	raise(signr);
168 }
169 
170 #ifdef HAVE_AUXTRACE_SUPPORT
171 
172 static int record__process_auxtrace(struct perf_tool *tool,
173 				    union perf_event *event, void *data1,
174 				    size_t len1, void *data2, size_t len2)
175 {
176 	struct record *rec = container_of(tool, struct record, tool);
177 	struct perf_data *data = &rec->data;
178 	size_t padding;
179 	u8 pad[8] = {0};
180 
181 	if (!perf_data__is_pipe(data)) {
182 		off_t file_offset;
183 		int fd = perf_data__fd(data);
184 		int err;
185 
186 		file_offset = lseek(fd, 0, SEEK_CUR);
187 		if (file_offset == -1)
188 			return -1;
189 		err = auxtrace_index__auxtrace_event(&rec->session->auxtrace_index,
190 						     event, file_offset);
191 		if (err)
192 			return err;
193 	}
194 
195 	/* event.auxtrace.size includes padding, see __auxtrace_mmap__read() */
196 	padding = (len1 + len2) & 7;
197 	if (padding)
198 		padding = 8 - padding;
199 
200 	record__write(rec, event, event->header.size);
201 	record__write(rec, data1, len1);
202 	if (len2)
203 		record__write(rec, data2, len2);
204 	record__write(rec, &pad, padding);
205 
206 	return 0;
207 }
208 
209 static int record__auxtrace_mmap_read(struct record *rec,
210 				      struct auxtrace_mmap *mm)
211 {
212 	int ret;
213 
214 	ret = auxtrace_mmap__read(mm, rec->itr, &rec->tool,
215 				  record__process_auxtrace);
216 	if (ret < 0)
217 		return ret;
218 
219 	if (ret)
220 		rec->samples++;
221 
222 	return 0;
223 }
224 
225 static int record__auxtrace_mmap_read_snapshot(struct record *rec,
226 					       struct auxtrace_mmap *mm)
227 {
228 	int ret;
229 
230 	ret = auxtrace_mmap__read_snapshot(mm, rec->itr, &rec->tool,
231 					   record__process_auxtrace,
232 					   rec->opts.auxtrace_snapshot_size);
233 	if (ret < 0)
234 		return ret;
235 
236 	if (ret)
237 		rec->samples++;
238 
239 	return 0;
240 }
241 
242 static int record__auxtrace_read_snapshot_all(struct record *rec)
243 {
244 	int i;
245 	int rc = 0;
246 
247 	for (i = 0; i < rec->evlist->nr_mmaps; i++) {
248 		struct auxtrace_mmap *mm =
249 				&rec->evlist->mmap[i].auxtrace_mmap;
250 
251 		if (!mm->base)
252 			continue;
253 
254 		if (record__auxtrace_mmap_read_snapshot(rec, mm) != 0) {
255 			rc = -1;
256 			goto out;
257 		}
258 	}
259 out:
260 	return rc;
261 }
262 
263 static void record__read_auxtrace_snapshot(struct record *rec)
264 {
265 	pr_debug("Recording AUX area tracing snapshot\n");
266 	if (record__auxtrace_read_snapshot_all(rec) < 0) {
267 		trigger_error(&auxtrace_snapshot_trigger);
268 	} else {
269 		if (auxtrace_record__snapshot_finish(rec->itr))
270 			trigger_error(&auxtrace_snapshot_trigger);
271 		else
272 			trigger_ready(&auxtrace_snapshot_trigger);
273 	}
274 }
275 
276 #else
277 
/* Stub for builds without AUX area tracing: nothing to read. */
static inline
int record__auxtrace_mmap_read(struct record *rec __maybe_unused,
			       struct auxtrace_mmap *mm __maybe_unused)
{
	return 0;
}
284 
/* Stub for builds without AUX area tracing: no snapshot to take. */
static inline
void record__read_auxtrace_snapshot(struct record *rec __maybe_unused)
{
}
289 
/* Stub for builds without AUX area tracing: starting always succeeds. */
static inline
int auxtrace_record__snapshot_start(struct auxtrace_record *itr __maybe_unused)
{
	return 0;
}
295 
296 #endif
297 
298 static int record__mmap_evlist(struct record *rec,
299 			       struct perf_evlist *evlist)
300 {
301 	struct record_opts *opts = &rec->opts;
302 	char msg[512];
303 
304 	if (perf_evlist__mmap_ex(evlist, opts->mmap_pages, false,
305 				 opts->auxtrace_mmap_pages,
306 				 opts->auxtrace_snapshot_mode) < 0) {
307 		if (errno == EPERM) {
308 			pr_err("Permission error mapping pages.\n"
309 			       "Consider increasing "
310 			       "/proc/sys/kernel/perf_event_mlock_kb,\n"
311 			       "or try again with a smaller value of -m/--mmap_pages.\n"
312 			       "(current value: %u,%u)\n",
313 			       opts->mmap_pages, opts->auxtrace_mmap_pages);
314 			return -errno;
315 		} else {
316 			pr_err("failed to mmap with %d (%s)\n", errno,
317 				str_error_r(errno, msg, sizeof(msg)));
318 			if (errno)
319 				return -errno;
320 			else
321 				return -EINVAL;
322 		}
323 	}
324 	return 0;
325 }
326 
/* Convenience wrapper: mmap the record's own evlist. */
static int record__mmap(struct record *rec)
{
	return record__mmap_evlist(rec, rec->evlist);
}
331 
/*
 * Open all counters in the evlist (with per-event fallback on open
 * failure), apply tracepoint filters and driver configs, then mmap the
 * ring buffers and attach the evlist to the session.  With --delay a
 * dummy tracking event is added first so sideband records arrive while
 * the real events are still disabled.  Returns 0 or a negative error.
 */
static int record__open(struct record *rec)
{
	char msg[BUFSIZ];
	struct perf_evsel *pos;
	struct perf_evlist *evlist = rec->evlist;
	struct perf_session *session = rec->session;
	struct record_opts *opts = &rec->opts;
	struct perf_evsel_config_term *err_term;
	int rc = 0;

	/*
	 * For initial_delay we need to add a dummy event so that we can track
	 * PERF_RECORD_MMAP while we wait for the initial delay to enable the
	 * real events, the ones asked by the user.
	 */
	if (opts->initial_delay) {
		if (perf_evlist__add_dummy(evlist))
			return -ENOMEM;

		/* the dummy is appended last; make it the tracking event */
		pos = perf_evlist__first(evlist);
		pos->tracking = 0;
		pos = perf_evlist__last(evlist);
		pos->tracking = 1;
		pos->attr.enable_on_exec = 1;
	}

	perf_evlist__config(evlist, opts, &callchain_param);

	evlist__for_each_entry(evlist, pos) {
try_again:
		if (perf_evsel__open(pos, pos->cpus, pos->threads) < 0) {
			/* retry with a downgraded event if a fallback exists */
			if (perf_evsel__fallback(pos, errno, msg, sizeof(msg))) {
				if (verbose > 0)
					ui__warning("%s\n", msg);
				goto try_again;
			}

			rc = -errno;
			perf_evsel__open_strerror(pos, &opts->target,
						  errno, msg, sizeof(msg));
			ui__error("%s\n", msg);
			goto out;
		}
	}

	/* on failure @pos points at the event whose filter was rejected */
	if (perf_evlist__apply_filters(evlist, &pos)) {
		pr_err("failed to set filter \"%s\" on event %s with %d (%s)\n",
			pos->filter, perf_evsel__name(pos), errno,
			str_error_r(errno, msg, sizeof(msg)));
		rc = -1;
		goto out;
	}

	if (perf_evlist__apply_drv_configs(evlist, &pos, &err_term)) {
		pr_err("failed to set config \"%s\" on event %s with %d (%s)\n",
		      err_term->val.drv_cfg, perf_evsel__name(pos), errno,
		      str_error_r(errno, msg, sizeof(msg)));
		rc = -1;
		goto out;
	}

	rc = record__mmap(rec);
	if (rc)
		goto out;

	session->evlist = evlist;
	perf_session__set_id_hdr_size(session);
out:
	return rc;
}
402 
403 static int process_sample_event(struct perf_tool *tool,
404 				union perf_event *event,
405 				struct perf_sample *sample,
406 				struct perf_evsel *evsel,
407 				struct machine *machine)
408 {
409 	struct record *rec = container_of(tool, struct record, tool);
410 
411 	rec->samples++;
412 
413 	return build_id__mark_dso_hit(tool, event, sample, evsel, machine);
414 }
415 
/*
 * Re-read the freshly written perf.data to mark which DSOs were hit
 * by samples, so their build-ids can be written into the header.
 * Returns 0 when the file is empty or processing succeeds.
 */
static int process_buildids(struct record *rec)
{
	struct perf_data *data = &rec->data;
	struct perf_session *session = rec->session;

	if (data->size == 0)
		return 0;

	/*
	 * During this process, it'll load kernel map and replace the
	 * dso->long_name to a real pathname it found.  In this case
	 * we prefer the vmlinux path like
	 *   /lib/modules/3.16.4/build/vmlinux
	 *
	 * rather than build-id path (in debug directory).
	 *   $HOME/.debug/.build-id/f0/6e17aa50adf4d00b88925e03775de107611551
	 */
	symbol_conf.ignore_vmlinux_buildid = true;

	/*
	 * If --buildid-all is given, it marks all DSO regardless of hits,
	 * so no need to process samples.
	 */
	if (rec->buildid_all)
		rec->tool.sample = NULL;

	return perf_session__process_events(session);
}
444 
/*
 * machines__process_guests() callback: synthesize module and kernel
 * mmap events for one guest machine so its samples can be resolved.
 * Failures are reported but not propagated (best effort per guest).
 */
static void perf_event__synthesize_guest_os(struct machine *machine, void *data)
{
	int err;
	struct perf_tool *tool = data;
	/*
	 * As for guest kernel when processing subcommand record&report,
	 * we arrange module mmap prior to guest kernel mmap and trigger
	 * a preload dso because default guest module symbols are loaded
	 * from guest kallsyms instead of /lib/modules/XXX/XXX. This
	 * method is used to avoid symbol missing when the first addr is
	 * in module instead of in guest kernel.
	 */
	err = perf_event__synthesize_modules(tool, process_synthesized_event,
					     machine);
	if (err < 0)
		pr_err("Couldn't record guest kernel [%d]'s reference"
		       " relocation symbol.\n", machine->pid);

	/*
	 * We use _stext for guest kernel because guest kernel's /proc/kallsyms
	 * have no _text sometimes.
	 */
	err = perf_event__synthesize_kernel_mmap(tool, process_synthesized_event,
						 machine);
	if (err < 0)
		pr_err("Couldn't record guest kernel [%d]'s reference"
		       " relocation symbol.\n", machine->pid);
}
473 
/*
 * Header-only PERF_RECORD_FINISHED_ROUND marker, written after each
 * pass over the mmap buffers that produced at least one event.
 */
static struct perf_event_header finished_round_event = {
	.size = sizeof(struct perf_event_header),
	.type = PERF_RECORD_FINISHED_ROUND,
};
478 
/*
 * Drain one direction of the evlist's mmap buffers into the output;
 * @backward selects the overwritable (backward) buffers.  Non-snapshot
 * AUX area data is drained alongside.  A FINISHED_ROUND marker is
 * written if anything was produced.  Returns 0 on success, -1 on a
 * push/read failure.
 */
static int record__mmap_read_evlist(struct record *rec, struct perf_evlist *evlist,
				    bool backward)
{
	u64 bytes_written = rec->bytes_written;
	int i;
	int rc = 0;
	struct perf_mmap *maps;

	if (!evlist)
		return 0;

	maps = backward ? evlist->backward_mmap : evlist->mmap;
	if (!maps)
		return 0;

	/* backward buffers are only drained when a collection is pending */
	if (backward && evlist->bkw_mmap_state != BKW_MMAP_DATA_PENDING)
		return 0;

	for (i = 0; i < evlist->nr_mmaps; i++) {
		struct auxtrace_mmap *mm = &maps[i].auxtrace_mmap;

		if (maps[i].base) {
			if (perf_mmap__push(&maps[i], evlist->overwrite, backward, rec, record__pushfn) != 0) {
				rc = -1;
				goto out;
			}
		}

		/* snapshot-mode AUX data is read on SIGUSR2, not here */
		if (mm->base && !rec->opts.auxtrace_snapshot_mode &&
		    record__auxtrace_mmap_read(rec, mm) != 0) {
			rc = -1;
			goto out;
		}
	}

	/*
	 * Mark the round finished in case we wrote
	 * at least one event.
	 */
	if (bytes_written != rec->bytes_written)
		rc = record__write(rec, &finished_round_event, sizeof(finished_round_event));

	if (backward)
		perf_evlist__toggle_bkw_mmap(evlist, BKW_MMAP_EMPTY);
out:
	return rc;
}
526 
527 static int record__mmap_read_all(struct record *rec)
528 {
529 	int err;
530 
531 	err = record__mmap_read_evlist(rec, rec->evlist, false);
532 	if (err)
533 		return err;
534 
535 	return record__mmap_read_evlist(rec, rec->evlist, true);
536 }
537 
/*
 * Start from "all header features enabled" and clear the ones this
 * session cannot or should not provide.
 */
static void record__init_features(struct record *rec)
{
	struct perf_session *session = rec->session;
	int feat;

	for (feat = HEADER_FIRST_FEATURE; feat < HEADER_LAST_FEATURE; feat++)
		perf_header__set_feat(&session->header, feat);

	if (rec->no_buildid)
		perf_header__clear_feat(&session->header, HEADER_BUILD_ID);

	if (!have_tracepoints(&rec->evlist->entries))
		perf_header__clear_feat(&session->header, HEADER_TRACING_DATA);

	if (!rec->opts.branch_stack)
		perf_header__clear_feat(&session->header, HEADER_BRANCH_STACK);

	if (!rec->opts.full_auxtrace)
		perf_header__clear_feat(&session->header, HEADER_AUXTRACE);

	/* 'perf record' output never carries stat data */
	perf_header__clear_feat(&session->header, HEADER_STAT);
}
560 
561 static void
562 record__finish_output(struct record *rec)
563 {
564 	struct perf_data *data = &rec->data;
565 	int fd = perf_data__fd(data);
566 
567 	if (data->is_pipe)
568 		return;
569 
570 	rec->session->header.data_size += rec->bytes_written;
571 	data->size = lseek(perf_data__fd(data), 0, SEEK_CUR);
572 
573 	if (!rec->no_buildid) {
574 		process_buildids(rec);
575 
576 		if (rec->buildid_all)
577 			dsos__hit_all(rec->session);
578 	}
579 	perf_session__write_header(rec->session, rec->evlist, fd, true);
580 
581 	return;
582 }
583 
584 static int record__synthesize_workload(struct record *rec, bool tail)
585 {
586 	int err;
587 	struct thread_map *thread_map;
588 
589 	if (rec->opts.tail_synthesize != tail)
590 		return 0;
591 
592 	thread_map = thread_map__new_by_tid(rec->evlist->workload.pid);
593 	if (thread_map == NULL)
594 		return -1;
595 
596 	err = perf_event__synthesize_thread_map(&rec->tool, thread_map,
597 						 process_synthesized_event,
598 						 &rec->session->machines.host,
599 						 rec->opts.sample_address,
600 						 rec->opts.proc_map_timeout);
601 	thread_map__put(thread_map);
602 	return err;
603 }
604 
605 static int record__synthesize(struct record *rec, bool tail);
606 
/*
 * Rotate the output file: finalize the current perf.data, switch to a
 * timestamped successor and (unless @at_exit) re-synthesize tracking
 * events into the new file.  Synthesis failures here are deliberately
 * best-effort — their return values are ignored.  Returns the new
 * output fd, or a negative error.
 */
static int
record__switch_output(struct record *rec, bool at_exit)
{
	struct perf_data *data = &rec->data;
	int fd, err;

	/* Same Size:      "2015122520103046"*/
	char timestamp[] = "InvalidTimestamp";

	record__synthesize(rec, true);
	if (target__none(&rec->opts.target))
		record__synthesize_workload(rec, true);

	rec->samples = 0;
	record__finish_output(rec);
	err = fetch_current_timestamp(timestamp, sizeof(timestamp));
	if (err) {
		pr_err("Failed to get current timestamp\n");
		return -EINVAL;
	}

	fd = perf_data__switch(data, timestamp,
				    rec->session->header.data_offset,
				    at_exit);
	/* byte accounting restarts for the new file (not at final exit) */
	if (fd >= 0 && !at_exit) {
		rec->bytes_written = 0;
		rec->session->header.data_size = 0;
	}

	if (!quiet)
		fprintf(stderr, "[ perf record: Dump %s.%s ]\n",
			data->file.path, timestamp);

	/* Output tracking events */
	if (!at_exit) {
		record__synthesize(rec, false);

		/*
		 * In 'perf record --switch-output' without -a,
		 * record__synthesize() in record__switch_output() won't
		 * generate tracking events because there's no thread_map
		 * in evlist. Which causes newly created perf.data doesn't
		 * contain map and comm information.
		 * Create a fake thread_map and directly call
		 * perf_event__synthesize_thread_map() for those events.
		 */
		if (target__none(&rec->opts.target))
			record__synthesize_workload(rec, false);
	}
	return fd;
}
658 
/* errno reported by a failed workload exec, 0 when none (set via SIGUSR1). */
static volatile int workload_exec_errno;

/*
 * perf_evlist__prepare_workload will send a SIGUSR1
 * if the fork fails, since we asked by setting its
 * want_signal to true.
 */
static void workload_exec_failed_signal(int signo __maybe_unused,
					siginfo_t *info,
					void *ucontext __maybe_unused)
{
	/* the child passes its errno in the signal's value payload */
	workload_exec_errno = info->si_value.sival_int;
	done = 1;
	child_finished = 1;
}
674 
675 static void snapshot_sig_handler(int sig);
676 static void alarm_sig_handler(int sig);
677 
/*
 * Weak default for synthesizing a time-conversion event; architecture
 * code may override it (this default emits nothing and succeeds).
 */
int __weak
perf_event__synth_time_conv(const struct perf_event_mmap_page *pc __maybe_unused,
			    struct perf_tool *tool __maybe_unused,
			    perf_event__handler_t process __maybe_unused,
			    struct machine *machine __maybe_unused)
{
	return 0;
}
686 
687 static const struct perf_event_mmap_page *
688 perf_evlist__pick_pc(struct perf_evlist *evlist)
689 {
690 	if (evlist) {
691 		if (evlist->mmap && evlist->mmap[0].base)
692 			return evlist->mmap[0].base;
693 		if (evlist->backward_mmap && evlist->backward_mmap[0].base)
694 			return evlist->backward_mmap[0].base;
695 	}
696 	return NULL;
697 }
698 
699 static const struct perf_event_mmap_page *record__pick_pc(struct record *rec)
700 {
701 	const struct perf_event_mmap_page *pc;
702 
703 	pc = perf_evlist__pick_pc(rec->evlist);
704 	if (pc)
705 		return pc;
706 	return NULL;
707 }
708 
/*
 * Synthesize the bootstrap events the report side needs: for pipe
 * output, features/attrs/tracing data first; then time-conversion,
 * auxtrace info, kernel and module mmaps, guest machines and finally
 * the existing threads of the target.  Runs at most once per phase,
 * gated by --tail-synthesize via @tail.  Returns 0 or negative error.
 */
static int record__synthesize(struct record *rec, bool tail)
{
	struct perf_session *session = rec->session;
	struct machine *machine = &session->machines.host;
	struct perf_data *data = &rec->data;
	struct record_opts *opts = &rec->opts;
	struct perf_tool *tool = &rec->tool;
	int fd = perf_data__fd(data);
	int err = 0;

	if (rec->opts.tail_synthesize != tail)
		return 0;

	if (data->is_pipe) {
		/* pipe consumers get no file header, so send its content as events */
		err = perf_event__synthesize_features(
			tool, session, rec->evlist, process_synthesized_event);
		if (err < 0) {
			pr_err("Couldn't synthesize features.\n");
			return err;
		}

		err = perf_event__synthesize_attrs(tool, session,
						   process_synthesized_event);
		if (err < 0) {
			pr_err("Couldn't synthesize attrs.\n");
			goto out;
		}

		if (have_tracepoints(&rec->evlist->entries)) {
			/*
			 * FIXME err <= 0 here actually means that
			 * there were no tracepoints so its not really
			 * an error, just that we don't need to
			 * synthesize anything.  We really have to
			 * return this more properly and also
			 * propagate errors that now are calling die()
			 */
			err = perf_event__synthesize_tracing_data(tool,	fd, rec->evlist,
								  process_synthesized_event);
			if (err <= 0) {
				pr_err("Couldn't record tracing data.\n");
				goto out;
			}
			/* tracing data bypasses record__write(); account it here */
			rec->bytes_written += err;
		}
	}

	err = perf_event__synth_time_conv(record__pick_pc(rec), tool,
					  process_synthesized_event, machine);
	if (err)
		goto out;

	if (rec->opts.full_auxtrace) {
		err = perf_event__synthesize_auxtrace_info(rec->itr, tool,
					session, process_synthesized_event);
		if (err)
			goto out;
	}

	if (!perf_evlist__exclude_kernel(rec->evlist)) {
		err = perf_event__synthesize_kernel_mmap(tool, process_synthesized_event,
							 machine);
		WARN_ONCE(err < 0, "Couldn't record kernel reference relocation symbol\n"
				   "Symbol resolution may be skewed if relocation was used (e.g. kexec).\n"
				   "Check /proc/kallsyms permission or run as root.\n");

		err = perf_event__synthesize_modules(tool, process_synthesized_event,
						     machine);
		WARN_ONCE(err < 0, "Couldn't record kernel module information.\n"
				   "Symbol resolution may be skewed if relocation was used (e.g. kexec).\n"
				   "Check /proc/modules permission or run as root.\n");
	}

	if (perf_guest) {
		machines__process_guests(&session->machines,
					 perf_event__synthesize_guest_os, tool);
	}

	err = __machine__synthesize_threads(machine, tool, &opts->target, rec->evlist->threads,
					    process_synthesized_event, opts->sample_address,
					    opts->proc_map_timeout, 1);
out:
	return err;
}
793 
/*
 * Main body of 'perf record': set up signals, session and counters,
 * optionally fork the workload, then loop draining the mmap buffers
 * until done, handling AUX snapshots and --switch-output rotations
 * along the way, and finally finalize the output file.
 * Returns the command's exit status.
 */
static int __cmd_record(struct record *rec, int argc, const char **argv)
{
	int err;
	int status = 0;
	unsigned long waking = 0;
	const bool forks = argc > 0;
	struct machine *machine;
	struct perf_tool *tool = &rec->tool;
	struct record_opts *opts = &rec->opts;
	struct perf_data *data = &rec->data;
	struct perf_session *session;
	bool disabled = false, draining = false;
	int fd;

	rec->progname = argv[0];

	atexit(record__sig_exit);
	signal(SIGCHLD, sig_handler);
	signal(SIGINT, sig_handler);
	signal(SIGTERM, sig_handler);
	signal(SIGSEGV, sigsegv_handler);

	if (rec->opts.record_namespaces)
		tool->namespace_events = true;

	/* SIGUSR2 drives AUX snapshots and/or output switching */
	if (rec->opts.auxtrace_snapshot_mode || rec->switch_output.enabled) {
		signal(SIGUSR2, snapshot_sig_handler);
		if (rec->opts.auxtrace_snapshot_mode)
			trigger_on(&auxtrace_snapshot_trigger);
		if (rec->switch_output.enabled)
			trigger_on(&switch_output_trigger);
	} else {
		signal(SIGUSR2, SIG_IGN);
	}

	session = perf_session__new(data, false, tool);
	if (session == NULL) {
		pr_err("Perf session creation failed.\n");
		return -1;
	}

	fd = perf_data__fd(data);
	rec->session = session;

	record__init_features(rec);

	/* fork the workload now; it stays blocked until start_workload() */
	if (forks) {
		err = perf_evlist__prepare_workload(rec->evlist, &opts->target,
						    argv, data->is_pipe,
						    workload_exec_failed_signal);
		if (err < 0) {
			pr_err("Couldn't run the workload!\n");
			status = err;
			goto out_delete_session;
		}
	}

	if (record__open(rec) != 0) {
		err = -1;
		goto out_child;
	}

	err = bpf__apply_obj_config();
	if (err) {
		char errbuf[BUFSIZ];

		bpf__strerror_apply_obj_config(err, errbuf, sizeof(errbuf));
		pr_err("ERROR: Apply config to BPF failed: %s\n",
			 errbuf);
		goto out_child;
	}

	/*
	 * Normally perf_session__new would do this, but it doesn't have the
	 * evlist.
	 */
	if (rec->tool.ordered_events && !perf_evlist__sample_id_all(rec->evlist)) {
		pr_warning("WARNING: No sample_id_all support, falling back to unordered processing\n");
		rec->tool.ordered_events = false;
	}

	if (!rec->evlist->nr_groups)
		perf_header__clear_feat(&session->header, HEADER_GROUP_DESC);

	/* write a preliminary header now; it is finalized at the end */
	if (data->is_pipe) {
		err = perf_header__write_pipe(fd);
		if (err < 0)
			goto out_child;
	} else {
		err = perf_session__write_header(session, rec->evlist, fd, false);
		if (err < 0)
			goto out_child;
	}

	if (!rec->no_buildid
	    && !perf_header__has_feat(&session->header, HEADER_BUILD_ID)) {
		pr_err("Couldn't generate buildids. "
		       "Use --no-buildid to profile anyway.\n");
		err = -1;
		goto out_child;
	}

	machine = &session->machines.host;

	err = record__synthesize(rec, false);
	if (err < 0)
		goto out_child;

	if (rec->realtime_prio) {
		struct sched_param param;

		param.sched_priority = rec->realtime_prio;
		if (sched_setscheduler(0, SCHED_FIFO, &param)) {
			pr_err("Could not set realtime priority.\n");
			err = -1;
			goto out_child;
		}
	}

	/*
	 * When perf is starting the traced process, all the events
	 * (apart from group members) have enable_on_exec=1 set,
	 * so don't spoil it by prematurely enabling them.
	 */
	if (!target__none(&opts->target) && !opts->initial_delay)
		perf_evlist__enable(rec->evlist);

	/*
	 * Let the child rip
	 */
	if (forks) {
		union perf_event *event;
		pid_t tgid;

		event = malloc(sizeof(event->comm) + machine->id_hdr_size);
		if (event == NULL) {
			err = -ENOMEM;
			goto out_child;
		}

		/*
		 * Some H/W events are generated before COMM event
		 * which is emitted during exec(), so perf script
		 * cannot see a correct process name for those events.
		 * Synthesize COMM event to prevent it.
		 */
		tgid = perf_event__synthesize_comm(tool, event,
						   rec->evlist->workload.pid,
						   process_synthesized_event,
						   machine);
		free(event);

		if (tgid == -1)
			goto out_child;

		event = malloc(sizeof(event->namespaces) +
			       (NR_NAMESPACES * sizeof(struct perf_ns_link_info)) +
			       machine->id_hdr_size);
		if (event == NULL) {
			err = -ENOMEM;
			goto out_child;
		}

		/*
		 * Synthesize NAMESPACES event for the command specified.
		 */
		perf_event__synthesize_namespaces(tool, event,
						  rec->evlist->workload.pid,
						  tgid, process_synthesized_event,
						  machine);
		free(event);

		perf_evlist__start_workload(rec->evlist);
	}

	if (opts->initial_delay) {
		usleep(opts->initial_delay * USEC_PER_MSEC);
		perf_evlist__enable(rec->evlist);
	}

	trigger_ready(&auxtrace_snapshot_trigger);
	trigger_ready(&switch_output_trigger);
	perf_hooks__invoke_record_start();
	for (;;) {
		unsigned long long hits = rec->samples;

		/*
		 * rec->evlist->bkw_mmap_state is possible to be
		 * BKW_MMAP_EMPTY here: when done == true and
		 * hits != rec->samples in previous round.
		 *
		 * perf_evlist__toggle_bkw_mmap ensure we never
		 * convert BKW_MMAP_EMPTY to BKW_MMAP_DATA_PENDING.
		 */
		if (trigger_is_hit(&switch_output_trigger) || done || draining)
			perf_evlist__toggle_bkw_mmap(rec->evlist, BKW_MMAP_DATA_PENDING);

		if (record__mmap_read_all(rec) < 0) {
			trigger_error(&auxtrace_snapshot_trigger);
			trigger_error(&switch_output_trigger);
			err = -1;
			goto out_child;
		}

		/* SIGUSR2 asked for an AUX snapshot; collect it now */
		if (auxtrace_record__snapshot_started) {
			auxtrace_record__snapshot_started = 0;
			if (!trigger_is_error(&auxtrace_snapshot_trigger))
				record__read_auxtrace_snapshot(rec);
			if (trigger_is_error(&auxtrace_snapshot_trigger)) {
				pr_err("AUX area tracing snapshot failed\n");
				err = -1;
				goto out_child;
			}
		}

		if (trigger_is_hit(&switch_output_trigger)) {
			/*
			 * If switch_output_trigger is hit, the data in
			 * overwritable ring buffer should have been collected,
			 * so bkw_mmap_state should be set to BKW_MMAP_EMPTY.
			 *
			 * If SIGUSR2 raise after or during record__mmap_read_all(),
			 * record__mmap_read_all() didn't collect data from
			 * overwritable ring buffer. Read again.
			 */
			if (rec->evlist->bkw_mmap_state == BKW_MMAP_RUNNING)
				continue;
			trigger_ready(&switch_output_trigger);

			/*
			 * Reenable events in overwrite ring buffer after
			 * record__mmap_read_all(): we should have collected
			 * data from it.
			 */
			perf_evlist__toggle_bkw_mmap(rec->evlist, BKW_MMAP_RUNNING);

			if (!quiet)
				fprintf(stderr, "[ perf record: dump data: Woken up %ld times ]\n",
					waking);
			waking = 0;
			fd = record__switch_output(rec, false);
			if (fd < 0) {
				pr_err("Failed to switch to new file\n");
				trigger_error(&switch_output_trigger);
				err = fd;
				goto out_child;
			}

			/* re-arm the alarm */
			if (rec->switch_output.time)
				alarm(rec->switch_output.time);
		}

		/* nothing new arrived this round: wait for events or finish */
		if (hits == rec->samples) {
			if (done || draining)
				break;
			err = perf_evlist__poll(rec->evlist, -1);
			/*
			 * Propagate error, only if there's any. Ignore positive
			 * number of returned events and interrupt error.
			 */
			if (err > 0 || (err < 0 && errno == EINTR))
				err = 0;
			waking++;

			if (perf_evlist__filter_pollfd(rec->evlist, POLLERR | POLLHUP) == 0)
				draining = true;
		}

		/*
		 * When perf is starting the traced process, at the end events
		 * die with the process and we wait for that. Thus no need to
		 * disable events in this case.
		 */
		if (done && !disabled && !target__none(&opts->target)) {
			trigger_off(&auxtrace_snapshot_trigger);
			perf_evlist__disable(rec->evlist);
			disabled = true;
		}
	}
	trigger_off(&auxtrace_snapshot_trigger);
	trigger_off(&switch_output_trigger);

	if (forks && workload_exec_errno) {
		char msg[STRERR_BUFSIZE];
		const char *emsg = str_error_r(workload_exec_errno, msg, sizeof(msg));
		pr_err("Workload failed: %s\n", emsg);
		err = -1;
		goto out_child;
	}

	if (!quiet)
		fprintf(stderr, "[ perf record: Woken up %ld times to write data ]\n", waking);

	if (target__none(&rec->opts.target))
		record__synthesize_workload(rec, true);

out_child:
	if (forks) {
		int exit_status;

		if (!child_finished)
			kill(rec->evlist->workload.pid, SIGTERM);

		wait(&exit_status);

		if (err < 0)
			status = err;
		else if (WIFEXITED(exit_status))
			status = WEXITSTATUS(exit_status);
		else if (WIFSIGNALED(exit_status))
			signr = WTERMSIG(exit_status);
	} else
		status = err;

	record__synthesize(rec, true);
	/* this will be recalculated during process_buildids() */
	rec->samples = 0;

	if (!err) {
		if (!rec->timestamp_filename) {
			record__finish_output(rec);
		} else {
			fd = record__switch_output(rec, true);
			if (fd < 0) {
				status = fd;
				goto out_delete_session;
			}
		}
	}

	perf_hooks__invoke_record_end();

	if (!err && !quiet) {
		char samples[128];
		const char *postfix = rec->timestamp_filename ?
					".<timestamp>" : "";

		if (rec->samples && !rec->opts.full_auxtrace)
			scnprintf(samples, sizeof(samples),
				  " (%" PRIu64 " samples)", rec->samples);
		else
			samples[0] = '\0';

		fprintf(stderr,	"[ perf record: Captured and wrote %.3f MB %s%s%s ]\n",
			perf_data__size(data) / 1024.0 / 1024.0,
			data->file.path, postfix, samples);
	}

out_delete_session:
	perf_session__delete(session);
	return status;
}
1147 
1148 static void callchain_debug(struct callchain_param *callchain)
1149 {
1150 	static const char *str[CALLCHAIN_MAX] = { "NONE", "FP", "DWARF", "LBR" };
1151 
1152 	pr_debug("callchain: type %s\n", str[callchain->record_mode]);
1153 
1154 	if (callchain->record_mode == CALLCHAIN_DWARF)
1155 		pr_debug("callchain: stack dump size %d\n",
1156 			 callchain->dump_size);
1157 }
1158 
1159 int record_opts__parse_callchain(struct record_opts *record,
1160 				 struct callchain_param *callchain,
1161 				 const char *arg, bool unset)
1162 {
1163 	int ret;
1164 	callchain->enabled = !unset;
1165 
1166 	/* --no-call-graph */
1167 	if (unset) {
1168 		callchain->record_mode = CALLCHAIN_NONE;
1169 		pr_debug("callchain: disabled\n");
1170 		return 0;
1171 	}
1172 
1173 	ret = parse_callchain_record_opt(arg, callchain);
1174 	if (!ret) {
1175 		/* Enable data address sampling for DWARF unwind. */
1176 		if (callchain->record_mode == CALLCHAIN_DWARF)
1177 			record->sample_address = true;
1178 		callchain_debug(callchain);
1179 	}
1180 
1181 	return ret;
1182 }
1183 
1184 int record_parse_callchain_opt(const struct option *opt,
1185 			       const char *arg,
1186 			       int unset)
1187 {
1188 	return record_opts__parse_callchain(opt->value, &callchain_param, arg, unset);
1189 }
1190 
1191 int record_callchain_opt(const struct option *opt,
1192 			 const char *arg __maybe_unused,
1193 			 int unset __maybe_unused)
1194 {
1195 	struct callchain_param *callchain = opt->value;
1196 
1197 	callchain->enabled = true;
1198 
1199 	if (callchain->record_mode == CALLCHAIN_NONE)
1200 		callchain->record_mode = CALLCHAIN_FP;
1201 
1202 	callchain_debug(callchain);
1203 	return 0;
1204 }
1205 
1206 static int perf_record_config(const char *var, const char *value, void *cb)
1207 {
1208 	struct record *rec = cb;
1209 
1210 	if (!strcmp(var, "record.build-id")) {
1211 		if (!strcmp(value, "cache"))
1212 			rec->no_buildid_cache = false;
1213 		else if (!strcmp(value, "no-cache"))
1214 			rec->no_buildid_cache = true;
1215 		else if (!strcmp(value, "skip"))
1216 			rec->no_buildid = true;
1217 		else
1218 			return -1;
1219 		return 0;
1220 	}
1221 	if (!strcmp(var, "record.call-graph"))
1222 		var = "call-graph.record-mode"; /* fall-through */
1223 
1224 	return perf_default_config(var, value, cb);
1225 }
1226 
/* Maps a user-visible clock name onto the clockid passed to the kernel. */
struct clockid_map {
	const char *name;
	int clockid;
};

#define CLOCKID_MAP(n, c)	\
	{ .name = n, .clockid = (c), }

/* Terminator entry for the clockids[] table below. */
#define CLOCKID_END	{ .name = NULL, }


/*
 * Add the missing ones, we need to build on many distros...
 * (Fallback numeric values for libcs whose headers predate these clocks.)
 */
#ifndef CLOCK_MONOTONIC_RAW
#define CLOCK_MONOTONIC_RAW 4
#endif
#ifndef CLOCK_BOOTTIME
#define CLOCK_BOOTTIME 7
#endif
#ifndef CLOCK_TAI
#define CLOCK_TAI 11
#endif
1250 
/* Name -> clockid lookup table consumed by parse_clockid(); matching is case-insensitive. */
static const struct clockid_map clockids[] = {
	/* available for all events, NMI safe */
	CLOCKID_MAP("monotonic", CLOCK_MONOTONIC),
	CLOCKID_MAP("monotonic_raw", CLOCK_MONOTONIC_RAW),

	/* available for some events */
	CLOCKID_MAP("realtime", CLOCK_REALTIME),
	CLOCKID_MAP("boottime", CLOCK_BOOTTIME),
	CLOCKID_MAP("tai", CLOCK_TAI),

	/* available for the lazy (shorthand aliases for the names above) */
	CLOCKID_MAP("mono", CLOCK_MONOTONIC),
	CLOCKID_MAP("raw", CLOCK_MONOTONIC_RAW),
	CLOCKID_MAP("real", CLOCK_REALTIME),
	CLOCKID_MAP("boot", CLOCK_BOOTTIME),

	CLOCKID_END,
};
1269 
1270 static int parse_clockid(const struct option *opt, const char *str, int unset)
1271 {
1272 	struct record_opts *opts = (struct record_opts *)opt->value;
1273 	const struct clockid_map *cm;
1274 	const char *ostr = str;
1275 
1276 	if (unset) {
1277 		opts->use_clockid = 0;
1278 		return 0;
1279 	}
1280 
1281 	/* no arg passed */
1282 	if (!str)
1283 		return 0;
1284 
1285 	/* no setting it twice */
1286 	if (opts->use_clockid)
1287 		return -1;
1288 
1289 	opts->use_clockid = true;
1290 
1291 	/* if its a number, we're done */
1292 	if (sscanf(str, "%d", &opts->clockid) == 1)
1293 		return 0;
1294 
1295 	/* allow a "CLOCK_" prefix to the name */
1296 	if (!strncasecmp(str, "CLOCK_", 6))
1297 		str += 6;
1298 
1299 	for (cm = clockids; cm->name; cm++) {
1300 		if (!strcasecmp(str, cm->name)) {
1301 			opts->clockid = cm->clockid;
1302 			return 0;
1303 		}
1304 	}
1305 
1306 	opts->use_clockid = false;
1307 	ui__warning("unknown clockid %s, check man page\n", ostr);
1308 	return -1;
1309 }
1310 
1311 static int record__parse_mmap_pages(const struct option *opt,
1312 				    const char *str,
1313 				    int unset __maybe_unused)
1314 {
1315 	struct record_opts *opts = opt->value;
1316 	char *s, *p;
1317 	unsigned int mmap_pages;
1318 	int ret;
1319 
1320 	if (!str)
1321 		return -EINVAL;
1322 
1323 	s = strdup(str);
1324 	if (!s)
1325 		return -ENOMEM;
1326 
1327 	p = strchr(s, ',');
1328 	if (p)
1329 		*p = '\0';
1330 
1331 	if (*s) {
1332 		ret = __perf_evlist__parse_mmap_pages(&mmap_pages, s);
1333 		if (ret)
1334 			goto out_free;
1335 		opts->mmap_pages = mmap_pages;
1336 	}
1337 
1338 	if (!p) {
1339 		ret = 0;
1340 		goto out_free;
1341 	}
1342 
1343 	ret = __perf_evlist__parse_mmap_pages(&mmap_pages, p + 1);
1344 	if (ret)
1345 		goto out_free;
1346 
1347 	opts->auxtrace_mmap_pages = mmap_pages;
1348 
1349 out_free:
1350 	free(s);
1351 	return ret;
1352 }
1353 
1354 static void switch_output_size_warn(struct record *rec)
1355 {
1356 	u64 wakeup_size = perf_evlist__mmap_size(rec->opts.mmap_pages);
1357 	struct switch_output *s = &rec->switch_output;
1358 
1359 	wakeup_size /= 2;
1360 
1361 	if (s->size < wakeup_size) {
1362 		char buf[100];
1363 
1364 		unit_number__scnprintf(buf, sizeof(buf), wakeup_size);
1365 		pr_warning("WARNING: switch-output data size lower than "
1366 			   "wakeup kernel buffer size (%s) "
1367 			   "expect bigger perf.data sizes\n", buf);
1368 	}
1369 }
1370 
1371 static int switch_output_setup(struct record *rec)
1372 {
1373 	struct switch_output *s = &rec->switch_output;
1374 	static struct parse_tag tags_size[] = {
1375 		{ .tag  = 'B', .mult = 1       },
1376 		{ .tag  = 'K', .mult = 1 << 10 },
1377 		{ .tag  = 'M', .mult = 1 << 20 },
1378 		{ .tag  = 'G', .mult = 1 << 30 },
1379 		{ .tag  = 0 },
1380 	};
1381 	static struct parse_tag tags_time[] = {
1382 		{ .tag  = 's', .mult = 1        },
1383 		{ .tag  = 'm', .mult = 60       },
1384 		{ .tag  = 'h', .mult = 60*60    },
1385 		{ .tag  = 'd', .mult = 60*60*24 },
1386 		{ .tag  = 0 },
1387 	};
1388 	unsigned long val;
1389 
1390 	if (!s->set)
1391 		return 0;
1392 
1393 	if (!strcmp(s->str, "signal")) {
1394 		s->signal = true;
1395 		pr_debug("switch-output with SIGUSR2 signal\n");
1396 		goto enabled;
1397 	}
1398 
1399 	val = parse_tag_value(s->str, tags_size);
1400 	if (val != (unsigned long) -1) {
1401 		s->size = val;
1402 		pr_debug("switch-output with %s size threshold\n", s->str);
1403 		goto enabled;
1404 	}
1405 
1406 	val = parse_tag_value(s->str, tags_time);
1407 	if (val != (unsigned long) -1) {
1408 		s->time = val;
1409 		pr_debug("switch-output with %s time threshold (%lu seconds)\n",
1410 			 s->str, s->time);
1411 		goto enabled;
1412 	}
1413 
1414 	return -1;
1415 
1416 enabled:
1417 	rec->timestamp_filename = true;
1418 	s->enabled              = true;
1419 
1420 	if (s->size && !rec->opts.no_buffering)
1421 		switch_output_size_warn(rec);
1422 
1423 	return 0;
1424 }
1425 
/* Usage strings shown by parse-options; exported via record_usage for builtin-script.c. */
static const char * const __record_usage[] = {
	"perf record [<options>] [<command>]",
	"perf record [<options>] -- <command> [<options>]",
	NULL
};
const char * const *record_usage = __record_usage;
1432 
1433 /*
1434  * XXX Ideally would be local to cmd_record() and passed to a record__new
1435  * because we need to have access to it in record__exit, that is called
1436  * after cmd_record() exits, but since record_options need to be accessible to
1437  * builtin-script, leave it here.
1438  *
1439  * At least we don't ouch it in all the other functions here directly.
1440  *
1441  * Just say no to tons of global variables, sigh.
1442  */
static struct record record = {
	.opts = {
		.sample_time	     = true,
		.mmap_pages	     = UINT_MAX,	/* UINT_MAX == "not set by user" */
		.user_freq	     = UINT_MAX,	/* ditto for -F */
		.user_interval	     = ULLONG_MAX,	/* ditto for -c */
		.freq		     = 4000,		/* default sampling frequency (Hz) */
		.target		     = {
			.uses_mmap   = true,
			.default_per_cpu = true,
		},
		.proc_map_timeout     = 500,	/* ms, see --proc-map-timeout */
	},
	/* Callbacks used when processing synthesized/recorded events. */
	.tool = {
		.sample		= process_sample_event,
		.fork		= perf_event__process_fork,
		.exit		= perf_event__process_exit,
		.comm		= perf_event__process_comm,
		.namespaces	= perf_event__process_namespaces,
		.mmap		= perf_event__process_mmap,
		.mmap2		= perf_event__process_mmap2,
		.ordered_events	= true,
	},
};
1467 
/* Help text for --call-graph; shared with builtin-script.c, hence non-static. */
const char record_callchain_help[] = CALLCHAIN_RECORD_HELP
	"\n\t\t\t\tDefault: fp";

/* Set by --dry-run: parse options, then exit without recording. */
static bool dry_run;
1472 
1473 /*
1474  * XXX Will stay a global variable till we fix builtin-script.c to stop messing
1475  * with it and switch to use the library functions in perf_evlist that came
1476  * from builtin-record.c, i.e. use record_opts,
1477  * perf_evlist__prepare_workload, etc instead of fork+exec'in 'perf record',
1478  * using pipes, etc.
1479  */
1480 static struct option __record_options[] = {
1481 	OPT_CALLBACK('e', "event", &record.evlist, "event",
1482 		     "event selector. use 'perf list' to list available events",
1483 		     parse_events_option),
1484 	OPT_CALLBACK(0, "filter", &record.evlist, "filter",
1485 		     "event filter", parse_filter),
1486 	OPT_CALLBACK_NOOPT(0, "exclude-perf", &record.evlist,
1487 			   NULL, "don't record events from perf itself",
1488 			   exclude_perf),
1489 	OPT_STRING('p', "pid", &record.opts.target.pid, "pid",
1490 		    "record events on existing process id"),
1491 	OPT_STRING('t', "tid", &record.opts.target.tid, "tid",
1492 		    "record events on existing thread id"),
1493 	OPT_INTEGER('r', "realtime", &record.realtime_prio,
1494 		    "collect data with this RT SCHED_FIFO priority"),
1495 	OPT_BOOLEAN(0, "no-buffering", &record.opts.no_buffering,
1496 		    "collect data without buffering"),
1497 	OPT_BOOLEAN('R', "raw-samples", &record.opts.raw_samples,
1498 		    "collect raw sample records from all opened counters"),
1499 	OPT_BOOLEAN('a', "all-cpus", &record.opts.target.system_wide,
1500 			    "system-wide collection from all CPUs"),
1501 	OPT_STRING('C', "cpu", &record.opts.target.cpu_list, "cpu",
1502 		    "list of cpus to monitor"),
1503 	OPT_U64('c', "count", &record.opts.user_interval, "event period to sample"),
1504 	OPT_STRING('o', "output", &record.data.file.path, "file",
1505 		    "output file name"),
1506 	OPT_BOOLEAN_SET('i', "no-inherit", &record.opts.no_inherit,
1507 			&record.opts.no_inherit_set,
1508 			"child tasks do not inherit counters"),
1509 	OPT_BOOLEAN(0, "tail-synthesize", &record.opts.tail_synthesize,
1510 		    "synthesize non-sample events at the end of output"),
1511 	OPT_BOOLEAN(0, "overwrite", &record.opts.overwrite, "use overwrite mode"),
1512 	OPT_UINTEGER('F', "freq", &record.opts.user_freq, "profile at this frequency"),
1513 	OPT_CALLBACK('m', "mmap-pages", &record.opts, "pages[,pages]",
1514 		     "number of mmap data pages and AUX area tracing mmap pages",
1515 		     record__parse_mmap_pages),
1516 	OPT_BOOLEAN(0, "group", &record.opts.group,
1517 		    "put the counters into a counter group"),
1518 	OPT_CALLBACK_NOOPT('g', NULL, &callchain_param,
1519 			   NULL, "enables call-graph recording" ,
1520 			   &record_callchain_opt),
1521 	OPT_CALLBACK(0, "call-graph", &record.opts,
1522 		     "record_mode[,record_size]", record_callchain_help,
1523 		     &record_parse_callchain_opt),
1524 	OPT_INCR('v', "verbose", &verbose,
1525 		    "be more verbose (show counter open errors, etc)"),
1526 	OPT_BOOLEAN('q', "quiet", &quiet, "don't print any message"),
1527 	OPT_BOOLEAN('s', "stat", &record.opts.inherit_stat,
1528 		    "per thread counts"),
1529 	OPT_BOOLEAN('d', "data", &record.opts.sample_address, "Record the sample addresses"),
1530 	OPT_BOOLEAN(0, "phys-data", &record.opts.sample_phys_addr,
1531 		    "Record the sample physical addresses"),
1532 	OPT_BOOLEAN(0, "sample-cpu", &record.opts.sample_cpu, "Record the sample cpu"),
1533 	OPT_BOOLEAN_SET('T', "timestamp", &record.opts.sample_time,
1534 			&record.opts.sample_time_set,
1535 			"Record the sample timestamps"),
1536 	OPT_BOOLEAN('P', "period", &record.opts.period, "Record the sample period"),
1537 	OPT_BOOLEAN('n', "no-samples", &record.opts.no_samples,
1538 		    "don't sample"),
1539 	OPT_BOOLEAN_SET('N', "no-buildid-cache", &record.no_buildid_cache,
1540 			&record.no_buildid_cache_set,
1541 			"do not update the buildid cache"),
1542 	OPT_BOOLEAN_SET('B', "no-buildid", &record.no_buildid,
1543 			&record.no_buildid_set,
1544 			"do not collect buildids in perf.data"),
1545 	OPT_CALLBACK('G', "cgroup", &record.evlist, "name",
1546 		     "monitor event in cgroup name only",
1547 		     parse_cgroups),
1548 	OPT_UINTEGER('D', "delay", &record.opts.initial_delay,
1549 		  "ms to wait before starting measurement after program start"),
1550 	OPT_STRING('u', "uid", &record.opts.target.uid_str, "user",
1551 		   "user to profile"),
1552 
1553 	OPT_CALLBACK_NOOPT('b', "branch-any", &record.opts.branch_stack,
1554 		     "branch any", "sample any taken branches",
1555 		     parse_branch_stack),
1556 
1557 	OPT_CALLBACK('j', "branch-filter", &record.opts.branch_stack,
1558 		     "branch filter mask", "branch stack filter modes",
1559 		     parse_branch_stack),
1560 	OPT_BOOLEAN('W', "weight", &record.opts.sample_weight,
1561 		    "sample by weight (on special events only)"),
1562 	OPT_BOOLEAN(0, "transaction", &record.opts.sample_transaction,
1563 		    "sample transaction flags (special events only)"),
1564 	OPT_BOOLEAN(0, "per-thread", &record.opts.target.per_thread,
1565 		    "use per-thread mmaps"),
1566 	OPT_CALLBACK_OPTARG('I', "intr-regs", &record.opts.sample_intr_regs, NULL, "any register",
1567 		    "sample selected machine registers on interrupt,"
1568 		    " use -I ? to list register names", parse_regs),
1569 	OPT_CALLBACK_OPTARG(0, "user-regs", &record.opts.sample_user_regs, NULL, "any register",
1570 		    "sample selected machine registers on interrupt,"
1571 		    " use -I ? to list register names", parse_regs),
1572 	OPT_BOOLEAN(0, "running-time", &record.opts.running_time,
1573 		    "Record running/enabled time of read (:S) events"),
1574 	OPT_CALLBACK('k', "clockid", &record.opts,
1575 	"clockid", "clockid to use for events, see clock_gettime()",
1576 	parse_clockid),
1577 	OPT_STRING_OPTARG('S', "snapshot", &record.opts.auxtrace_snapshot_opts,
1578 			  "opts", "AUX area tracing Snapshot Mode", ""),
1579 	OPT_UINTEGER(0, "proc-map-timeout", &record.opts.proc_map_timeout,
1580 			"per thread proc mmap processing timeout in ms"),
1581 	OPT_BOOLEAN(0, "namespaces", &record.opts.record_namespaces,
1582 		    "Record namespaces events"),
1583 	OPT_BOOLEAN(0, "switch-events", &record.opts.record_switch_events,
1584 		    "Record context switch events"),
1585 	OPT_BOOLEAN_FLAG(0, "all-kernel", &record.opts.all_kernel,
1586 			 "Configure all used events to run in kernel space.",
1587 			 PARSE_OPT_EXCLUSIVE),
1588 	OPT_BOOLEAN_FLAG(0, "all-user", &record.opts.all_user,
1589 			 "Configure all used events to run in user space.",
1590 			 PARSE_OPT_EXCLUSIVE),
1591 	OPT_STRING(0, "clang-path", &llvm_param.clang_path, "clang path",
1592 		   "clang binary to use for compiling BPF scriptlets"),
1593 	OPT_STRING(0, "clang-opt", &llvm_param.clang_opt, "clang options",
1594 		   "options passed to clang when compiling BPF scriptlets"),
1595 	OPT_STRING(0, "vmlinux", &symbol_conf.vmlinux_name,
1596 		   "file", "vmlinux pathname"),
1597 	OPT_BOOLEAN(0, "buildid-all", &record.buildid_all,
1598 		    "Record build-id of all DSOs regardless of hits"),
1599 	OPT_BOOLEAN(0, "timestamp-filename", &record.timestamp_filename,
1600 		    "append timestamp to output filename"),
1601 	OPT_STRING_OPTARG_SET(0, "switch-output", &record.switch_output.str,
1602 			  &record.switch_output.set, "signal,size,time",
1603 			  "Switch output when receive SIGUSR2 or cross size,time threshold",
1604 			  "signal"),
1605 	OPT_BOOLEAN(0, "dry-run", &dry_run,
1606 		    "Parse options then exit"),
1607 	OPT_END()
1608 };
1609 
/* Exported so builtin-script.c can reuse perf record's option table. */
struct option *record_options = __record_options;
1611 
/*
 * Entry point for 'perf record': parse and validate options, set up the
 * event list, target, auxtrace and build-id state, then hand off to
 * __cmd_record(). Returns 0 on success or a negative error code.
 */
int cmd_record(int argc, const char **argv)
{
	int err;
	struct record *rec = &record;
	char errbuf[BUFSIZ];

	/* Disable BPF-related options when the required features were not built in. */
#ifndef HAVE_LIBBPF_SUPPORT
# define set_nobuild(s, l, c) set_option_nobuild(record_options, s, l, "NO_LIBBPF=1", c)
	set_nobuild('\0', "clang-path", true);
	set_nobuild('\0', "clang-opt", true);
# undef set_nobuild
#endif

#ifndef HAVE_BPF_PROLOGUE
# if !defined (HAVE_DWARF_SUPPORT)
#  define REASON  "NO_DWARF=1"
# elif !defined (HAVE_LIBBPF_SUPPORT)
#  define REASON  "NO_LIBBPF=1"
# else
#  define REASON  "this architecture doesn't support BPF prologue"
# endif
# define set_nobuild(s, l, c) set_option_nobuild(record_options, s, l, REASON, c)
	set_nobuild('\0', "vmlinux", true);
# undef set_nobuild
# undef REASON
#endif

	rec->evlist = perf_evlist__new();
	if (rec->evlist == NULL)
		return -ENOMEM;

	/* Apply perfconfig settings (may set build-id / call-graph defaults). */
	err = perf_config(perf_record_config, rec);
	if (err)
		return err;

	argc = parse_options(argc, argv, record_options, record_usage,
			    PARSE_OPT_STOP_AT_NON_OPTION);
	if (quiet)
		perf_quiet_option();

	/* Make system wide (-a) the default target. */
	if (!argc && target__none(&rec->opts.target))
		rec->opts.target.system_wide = true;

	if (nr_cgroups && !rec->opts.target.system_wide) {
		usage_with_options_msg(record_usage, record_options,
			"cgroup monitoring only available in system-wide mode");

	}
	if (rec->opts.record_switch_events &&
	    !perf_can_record_switch_events()) {
		ui__error("kernel does not support recording context switch events\n");
		parse_options_usage(record_usage, record_options, "switch-events", 0);
		return -EINVAL;
	}

	if (switch_output_setup(rec)) {
		parse_options_usage(record_usage, record_options, "switch-output", 0);
		return -EINVAL;
	}

	/* Time-based output switching is driven by SIGALRM. */
	if (rec->switch_output.time) {
		signal(SIGALRM, alarm_sig_handler);
		alarm(rec->switch_output.time);
	}

	if (!rec->itr) {
		rec->itr = auxtrace_record__init(rec->evlist, &err);
		if (err)
			goto out;
	}

	err = auxtrace_parse_snapshot_options(rec->itr, &rec->opts,
					      rec->opts.auxtrace_snapshot_opts);
	if (err)
		goto out;

	/*
	 * Allow aliases to facilitate the lookup of symbols for address
	 * filters. Refer to auxtrace_parse_filters().
	 */
	symbol_conf.allow_aliases = true;

	symbol__init(NULL);

	err = auxtrace_parse_filters(rec->evlist);
	if (err)
		goto out;

	/* --dry-run: options parsed cleanly, nothing else to do */
	if (dry_run)
		goto out;

	err = bpf__setup_stdout(rec->evlist);
	if (err) {
		bpf__strerror_setup_stdout(rec->evlist, err, errbuf, sizeof(errbuf));
		pr_err("ERROR: Setup BPF stdout failed: %s\n",
			 errbuf);
		goto out;
	}

	err = -ENOMEM;

	if (symbol_conf.kptr_restrict && !perf_evlist__exclude_kernel(rec->evlist))
		pr_warning(
"WARNING: Kernel address maps (/proc/{kallsyms,modules}) are restricted,\n"
"check /proc/sys/kernel/kptr_restrict.\n\n"
"Samples in kernel functions may not be resolved if a suitable vmlinux\n"
"file is not found in the buildid cache or in the vmlinux path.\n\n"
"Samples in kernel modules won't be resolved at all.\n\n"
"If some relocation was applied (e.g. kexec) symbols may be misresolved\n"
"even with a suitable vmlinux or kallsyms file.\n\n");

	if (rec->no_buildid_cache || rec->no_buildid) {
		disable_buildid_cache();
	} else if (rec->switch_output.enabled) {
		/*
		 * In 'perf record --switch-output', disable buildid
		 * generation by default to reduce data file switching
		 * overhead. Still generate buildid if they are required
		 * explicitly using
		 *
		 *  perf record --switch-output --no-no-buildid \
		 *              --no-no-buildid-cache
		 *
		 * Following code equals to:
		 *
		 * if ((rec->no_buildid || !rec->no_buildid_set) &&
		 *     (rec->no_buildid_cache || !rec->no_buildid_cache_set))
		 *         disable_buildid_cache();
		 */
		bool disable = true;

		if (rec->no_buildid_set && !rec->no_buildid)
			disable = false;
		if (rec->no_buildid_cache_set && !rec->no_buildid_cache)
			disable = false;
		if (disable) {
			rec->no_buildid = true;
			rec->no_buildid_cache = true;
			disable_buildid_cache();
		}
	}

	/* Overwrite mode implies synthesizing non-sample events at the end. */
	if (record.opts.overwrite)
		record.opts.tail_synthesize = true;

	if (rec->evlist->nr_entries == 0 &&
	    __perf_evlist__add_default(rec->evlist, !record.opts.no_samples) < 0) {
		pr_err("Not enough memory for event selector list\n");
		goto out;
	}

	if (rec->opts.target.tid && !rec->opts.no_inherit_set)
		rec->opts.no_inherit = true;

	err = target__validate(&rec->opts.target);
	if (err) {
		target__strerror(&rec->opts.target, err, errbuf, BUFSIZ);
		ui__warning("%s", errbuf);
	}

	err = target__parse_uid(&rec->opts.target);
	if (err) {
		int saved_errno = errno;

		target__strerror(&rec->opts.target, err, errbuf, BUFSIZ);
		ui__error("%s", errbuf);

		err = -saved_errno;
		goto out;
	}

	/* Enable ignoring missing threads when -u option is defined. */
	rec->opts.ignore_missing_thread = rec->opts.target.uid != UINT_MAX;

	err = -ENOMEM;
	if (perf_evlist__create_maps(rec->evlist, &rec->opts.target) < 0)
		usage_with_options(record_usage, record_options);

	err = auxtrace_record__options(rec->itr, rec->evlist, &rec->opts);
	if (err)
		goto out;

	/*
	 * We take all buildids when the file contains
	 * AUX area tracing data because we do not decode the
	 * trace because it would take too long.
	 */
	if (rec->opts.full_auxtrace)
		rec->buildid_all = true;

	if (record_opts__config(&rec->opts)) {
		err = -EINVAL;
		goto out;
	}

	err = __cmd_record(&record, argc, argv);
out:
	perf_evlist__delete(rec->evlist);
	symbol__exit();
	auxtrace_record__free(rec->itr);
	return err;
}
1815 
1816 static void snapshot_sig_handler(int sig __maybe_unused)
1817 {
1818 	struct record *rec = &record;
1819 
1820 	if (trigger_is_ready(&auxtrace_snapshot_trigger)) {
1821 		trigger_hit(&auxtrace_snapshot_trigger);
1822 		auxtrace_record__snapshot_started = 1;
1823 		if (auxtrace_record__snapshot_start(record.itr))
1824 			trigger_error(&auxtrace_snapshot_trigger);
1825 	}
1826 
1827 	if (switch_output_signal(rec))
1828 		trigger_hit(&switch_output_trigger);
1829 }
1830 
1831 static void alarm_sig_handler(int sig __maybe_unused)
1832 {
1833 	struct record *rec = &record;
1834 
1835 	if (switch_output_time(rec))
1836 		trigger_hit(&switch_output_trigger);
1837 }
1838