xref: /openbmc/linux/tools/perf/builtin-record.c (revision 2c64e9cb)
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * builtin-record.c
4  *
5  * Builtin record command: Record the profile of a workload
6  * (or a CPU, or a PID) into the perf.data output file - for
7  * later analysis via perf report.
8  */
9 #include "builtin.h"
10 
11 #include "perf.h"
12 
13 #include "util/build-id.h"
14 #include "util/util.h"
15 #include <subcmd/parse-options.h>
16 #include "util/parse-events.h"
17 #include "util/config.h"
18 
19 #include "util/callchain.h"
20 #include "util/cgroup.h"
21 #include "util/header.h"
22 #include "util/event.h"
23 #include "util/evlist.h"
24 #include "util/evsel.h"
25 #include "util/debug.h"
26 #include "util/session.h"
27 #include "util/tool.h"
28 #include "util/symbol.h"
29 #include "util/cpumap.h"
30 #include "util/thread_map.h"
31 #include "util/data.h"
32 #include "util/perf_regs.h"
33 #include "util/auxtrace.h"
34 #include "util/tsc.h"
35 #include "util/parse-branch-options.h"
36 #include "util/parse-regs-options.h"
37 #include "util/llvm-utils.h"
38 #include "util/bpf-loader.h"
39 #include "util/trigger.h"
40 #include "util/perf-hooks.h"
41 #include "util/cpu-set-sched.h"
42 #include "util/time-utils.h"
43 #include "util/units.h"
44 #include "util/bpf-event.h"
45 #include "asm/bug.h"
46 
47 #include <errno.h>
48 #include <inttypes.h>
49 #include <locale.h>
50 #include <poll.h>
51 #include <unistd.h>
52 #include <sched.h>
53 #include <signal.h>
54 #include <sys/mman.h>
55 #include <sys/wait.h>
56 #include <linux/time64.h>
57 
58 struct switch_output {
59 	bool		 enabled;
60 	bool		 signal;
61 	unsigned long	 size;
62 	unsigned long	 time;
63 	const char	*str;
64 	bool		 set;
65 	char		 **filenames;
66 	int		 num_files;
67 	int		 cur_file;
68 };
69 
70 struct record {
71 	struct perf_tool	tool;
72 	struct record_opts	opts;
73 	u64			bytes_written;
74 	struct perf_data	data;
75 	struct auxtrace_record	*itr;
76 	struct perf_evlist	*evlist;
77 	struct perf_session	*session;
78 	int			realtime_prio;
79 	bool			no_buildid;
80 	bool			no_buildid_set;
81 	bool			no_buildid_cache;
82 	bool			no_buildid_cache_set;
83 	bool			buildid_all;
84 	bool			timestamp_filename;
85 	bool			timestamp_boundary;
86 	struct switch_output	switch_output;
87 	unsigned long long	samples;
88 	cpu_set_t		affinity_mask;
89 };
90 
91 static volatile int auxtrace_record__snapshot_started;
92 static DEFINE_TRIGGER(auxtrace_snapshot_trigger);
93 static DEFINE_TRIGGER(switch_output_trigger);
94 
95 static const char *affinity_tags[PERF_AFFINITY_MAX] = {
96 	"SYS", "NODE", "CPU"
97 };
98 
99 static bool switch_output_signal(struct record *rec)
100 {
101 	return rec->switch_output.signal &&
102 	       trigger_is_ready(&switch_output_trigger);
103 }
104 
105 static bool switch_output_size(struct record *rec)
106 {
107 	return rec->switch_output.size &&
108 	       trigger_is_ready(&switch_output_trigger) &&
109 	       (rec->bytes_written >= rec->switch_output.size);
110 }
111 
112 static bool switch_output_time(struct record *rec)
113 {
114 	return rec->switch_output.time &&
115 	       trigger_is_ready(&switch_output_trigger);
116 }
117 
118 static int record__write(struct record *rec, struct perf_mmap *map __maybe_unused,
119 			 void *bf, size_t size)
120 {
121 	struct perf_data_file *file = &rec->session->data->file;
122 
123 	if (perf_data_file__write(file, bf, size) < 0) {
124 		pr_err("failed to write perf data, error: %m\n");
125 		return -1;
126 	}
127 
128 	rec->bytes_written += size;
129 
130 	if (switch_output_size(rec))
131 		trigger_hit(&switch_output_trigger);
132 
133 	return 0;
134 }
135 
136 #ifdef HAVE_AIO_SUPPORT
137 static int record__aio_write(struct aiocb *cblock, int trace_fd,
138 		void *buf, size_t size, off_t off)
139 {
140 	int rc;
141 
142 	cblock->aio_fildes = trace_fd;
143 	cblock->aio_buf    = buf;
144 	cblock->aio_nbytes = size;
145 	cblock->aio_offset = off;
146 	cblock->aio_sigevent.sigev_notify = SIGEV_NONE;
147 
148 	do {
149 		rc = aio_write(cblock);
150 		if (rc == 0) {
151 			break;
152 		} else if (errno != EAGAIN) {
153 			cblock->aio_fildes = -1;
154 			pr_err("failed to queue perf data, error: %m\n");
155 			break;
156 		}
157 	} while (1);
158 
159 	return rc;
160 }
161 
162 static int record__aio_complete(struct perf_mmap *md, struct aiocb *cblock)
163 {
164 	void *rem_buf;
165 	off_t rem_off;
166 	size_t rem_size;
167 	int rc, aio_errno;
168 	ssize_t aio_ret, written;
169 
170 	aio_errno = aio_error(cblock);
171 	if (aio_errno == EINPROGRESS)
172 		return 0;
173 
174 	written = aio_ret = aio_return(cblock);
175 	if (aio_ret < 0) {
176 		if (aio_errno != EINTR)
177 			pr_err("failed to write perf data, error: %m\n");
178 		written = 0;
179 	}
180 
181 	rem_size = cblock->aio_nbytes - written;
182 
183 	if (rem_size == 0) {
184 		cblock->aio_fildes = -1;
185 		/*
186 		 * md->refcount is incremented in perf_mmap__push() for
187 		 * every enqueued aio write request, so decrement it because
188 		 * the request is now complete.
189 		 */
190 		perf_mmap__put(md);
191 		rc = 1;
192 	} else {
193 		/*
194 		 * An aio write request may need to be restarted with the
195 		 * remainder if the kernel didn't write the whole
196 		 * chunk at once.
197 		 */
198 		rem_off = cblock->aio_offset + written;
199 		rem_buf = (void *)(cblock->aio_buf + written);
200 		record__aio_write(cblock, cblock->aio_fildes,
201 				rem_buf, rem_size, rem_off);
202 		rc = 0;
203 	}
204 
205 	return rc;
206 }
207 
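/*
 * Wait for outstanding aio write requests: with sync_all, block until every
 * request has completed and return -1; otherwise return the index of the
 * first control block that is free for reuse.
 */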
208 static int record__aio_sync(struct perf_mmap *md, bool sync_all)
209 {
210 	struct aiocb **aiocb = md->aio.aiocb;
211 	struct aiocb *cblocks = md->aio.cblocks;
212 	struct timespec timeout = { 0, 1000 * 1000  * 1 }; /* 1ms */
213 	int i, do_suspend;
214 
215 	do {
216 		do_suspend = 0;
217 		for (i = 0; i < md->aio.nr_cblocks; ++i) {
218 			if (cblocks[i].aio_fildes == -1 || record__aio_complete(md, &cblocks[i])) {
219 				if (sync_all)
220 					aiocb[i] = NULL;
221 				else
222 					return i;
223 			} else {
224 				/*
225 				 * A started aio write is not complete yet,
226 				 * so it has to be waited on before the
227 				 * next allocation.
228 				 */
229 				aiocb[i] = &cblocks[i];
230 				do_suspend = 1;
231 			}
232 		}
233 		if (!do_suspend)
234 			return -1;
235 
236 		while (aio_suspend((const struct aiocb **)aiocb, md->aio.nr_cblocks, &timeout)) {
237 			if (!(errno == EAGAIN || errno == EINTR))
238 				pr_err("failed to sync perf data, error: %m\n");
239 		}
240 	} while (1);
241 }
242 
243 static int record__aio_pushfn(void *to, struct aiocb *cblock, void *bf, size_t size, off_t off)
244 {
245 	struct record *rec = to;
246 	int ret, trace_fd = rec->session->data->file.fd;
247 
248 	rec->samples++;
249 
250 	ret = record__aio_write(cblock, trace_fd, bf, size, off);
251 	if (!ret) {
252 		rec->bytes_written += size;
253 		if (switch_output_size(rec))
254 			trigger_hit(&switch_output_trigger);
255 	}
256 
257 	return ret;
258 }
259 
260 static off_t record__aio_get_pos(int trace_fd)
261 {
262 	return lseek(trace_fd, 0, SEEK_CUR);
263 }
264 
265 static void record__aio_set_pos(int trace_fd, off_t pos)
266 {
267 	lseek(trace_fd, pos, SEEK_SET);
268 }
269 
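/*
 * Wait for all queued aio writes on every mmap'ed buffer of the evlist to
 * complete before the output file is finalized or switched.
 */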
270 static void record__aio_mmap_read_sync(struct record *rec)
271 {
272 	int i;
273 	struct perf_evlist *evlist = rec->evlist;
274 	struct perf_mmap *maps = evlist->mmap;
275 
276 	if (!rec->opts.nr_cblocks)
277 		return;
278 
279 	for (i = 0; i < evlist->nr_mmaps; i++) {
280 		struct perf_mmap *map = &maps[i];
281 
282 		if (map->base)
283 			record__aio_sync(map, true);
284 	}
285 }
286 
287 static int nr_cblocks_default = 1;
288 static int nr_cblocks_max = 4;
289 
290 static int record__aio_parse(const struct option *opt,
291 			     const char *str,
292 			     int unset)
293 {
294 	struct record_opts *opts = (struct record_opts *)opt->value;
295 
296 	if (unset) {
297 		opts->nr_cblocks = 0;
298 	} else {
299 		if (str)
300 			opts->nr_cblocks = strtol(str, NULL, 0);
301 		if (!opts->nr_cblocks)
302 			opts->nr_cblocks = nr_cblocks_default;
303 	}
304 
305 	return 0;
306 }
307 #else /* HAVE_AIO_SUPPORT */
308 static int nr_cblocks_max = 0;
309 
310 static int record__aio_sync(struct perf_mmap *md __maybe_unused, bool sync_all __maybe_unused)
311 {
312 	return -1;
313 }
314 
315 static int record__aio_pushfn(void *to __maybe_unused, struct aiocb *cblock __maybe_unused,
316 		void *bf __maybe_unused, size_t size __maybe_unused, off_t off __maybe_unused)
317 {
318 	return -1;
319 }
320 
321 static off_t record__aio_get_pos(int trace_fd __maybe_unused)
322 {
323 	return -1;
324 }
325 
326 static void record__aio_set_pos(int trace_fd __maybe_unused, off_t pos __maybe_unused)
327 {
328 }
329 
330 static void record__aio_mmap_read_sync(struct record *rec __maybe_unused)
331 {
332 }
333 #endif
334 
335 static int record__aio_enabled(struct record *rec)
336 {
337 	return rec->opts.nr_cblocks > 0;
338 }
339 
340 #define MMAP_FLUSH_DEFAULT 1
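/*
 * Parse the --mmap-flush option: accept a plain number of bytes or a value
 * with a B/K/M/G suffix, fall back to MMAP_FLUSH_DEFAULT when unset, and cap
 * the result at a quarter of the mmap buffer size.
 */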
341 static int record__mmap_flush_parse(const struct option *opt,
342 				    const char *str,
343 				    int unset)
344 {
345 	int flush_max;
346 	struct record_opts *opts = (struct record_opts *)opt->value;
347 	static struct parse_tag tags[] = {
348 			{ .tag  = 'B', .mult = 1       },
349 			{ .tag  = 'K', .mult = 1 << 10 },
350 			{ .tag  = 'M', .mult = 1 << 20 },
351 			{ .tag  = 'G', .mult = 1 << 30 },
352 			{ .tag  = 0 },
353 	};
354 
355 	if (unset)
356 		return 0;
357 
358 	if (str) {
359 		opts->mmap_flush = parse_tag_value(str, tags);
360 		if (opts->mmap_flush == (int)-1)
361 			opts->mmap_flush = strtol(str, NULL, 0);
362 	}
363 
364 	if (!opts->mmap_flush)
365 		opts->mmap_flush = MMAP_FLUSH_DEFAULT;
366 
367 	flush_max = perf_evlist__mmap_size(opts->mmap_pages);
368 	flush_max /= 4;
369 	if (opts->mmap_flush > flush_max)
370 		opts->mmap_flush = flush_max;
371 
372 	return 0;
373 }
374 
375 static int process_synthesized_event(struct perf_tool *tool,
376 				     union perf_event *event,
377 				     struct perf_sample *sample __maybe_unused,
378 				     struct machine *machine __maybe_unused)
379 {
380 	struct record *rec = container_of(tool, struct record, tool);
381 	return record__write(rec, NULL, event, event->header.size);
382 }
383 
384 static int record__pushfn(struct perf_mmap *map, void *to, void *bf, size_t size)
385 {
386 	struct record *rec = to;
387 
388 	rec->samples++;
389 	return record__write(rec, map, bf, size);
390 }
391 
392 static volatile int done;
393 static volatile int signr = -1;
394 static volatile int child_finished;
395 
396 static void sig_handler(int sig)
397 {
398 	if (sig == SIGCHLD)
399 		child_finished = 1;
400 	else
401 		signr = sig;
402 
403 	done = 1;
404 }
405 
406 static void sigsegv_handler(int sig)
407 {
408 	perf_hooks__recover();
409 	sighandler_dump_stack(sig);
410 }
411 
412 static void record__sig_exit(void)
413 {
414 	if (signr == -1)
415 		return;
416 
417 	signal(signr, SIG_DFL);
418 	raise(signr);
419 }
420 
421 #ifdef HAVE_AUXTRACE_SUPPORT
422 
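/*
 * Write out an AUX area tracing event followed by its data payload (split
 * across up to two buffers) and pad the total to an 8-byte boundary; for
 * regular (non-pipe, non-directory) output the event's file offset is also
 * added to the auxtrace index.
 */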
423 static int record__process_auxtrace(struct perf_tool *tool,
424 				    struct perf_mmap *map,
425 				    union perf_event *event, void *data1,
426 				    size_t len1, void *data2, size_t len2)
427 {
428 	struct record *rec = container_of(tool, struct record, tool);
429 	struct perf_data *data = &rec->data;
430 	size_t padding;
431 	u8 pad[8] = {0};
432 
433 	if (!perf_data__is_pipe(data) && !perf_data__is_dir(data)) {
434 		off_t file_offset;
435 		int fd = perf_data__fd(data);
436 		int err;
437 
438 		file_offset = lseek(fd, 0, SEEK_CUR);
439 		if (file_offset == -1)
440 			return -1;
441 		err = auxtrace_index__auxtrace_event(&rec->session->auxtrace_index,
442 						     event, file_offset);
443 		if (err)
444 			return err;
445 	}
446 
447 	/* event.auxtrace.size includes padding, see __auxtrace_mmap__read() */
448 	padding = (len1 + len2) & 7;
449 	if (padding)
450 		padding = 8 - padding;
451 
452 	record__write(rec, map, event, event->header.size);
453 	record__write(rec, map, data1, len1);
454 	if (len2)
455 		record__write(rec, map, data2, len2);
456 	record__write(rec, map, &pad, padding);
457 
458 	return 0;
459 }
460 
461 static int record__auxtrace_mmap_read(struct record *rec,
462 				      struct perf_mmap *map)
463 {
464 	int ret;
465 
466 	ret = auxtrace_mmap__read(map, rec->itr, &rec->tool,
467 				  record__process_auxtrace);
468 	if (ret < 0)
469 		return ret;
470 
471 	if (ret)
472 		rec->samples++;
473 
474 	return 0;
475 }
476 
477 static int record__auxtrace_mmap_read_snapshot(struct record *rec,
478 					       struct perf_mmap *map)
479 {
480 	int ret;
481 
482 	ret = auxtrace_mmap__read_snapshot(map, rec->itr, &rec->tool,
483 					   record__process_auxtrace,
484 					   rec->opts.auxtrace_snapshot_size);
485 	if (ret < 0)
486 		return ret;
487 
488 	if (ret)
489 		rec->samples++;
490 
491 	return 0;
492 }
493 
494 static int record__auxtrace_read_snapshot_all(struct record *rec)
495 {
496 	int i;
497 	int rc = 0;
498 
499 	for (i = 0; i < rec->evlist->nr_mmaps; i++) {
500 		struct perf_mmap *map = &rec->evlist->mmap[i];
501 
502 		if (!map->auxtrace_mmap.base)
503 			continue;
504 
505 		if (record__auxtrace_mmap_read_snapshot(rec, map) != 0) {
506 			rc = -1;
507 			goto out;
508 		}
509 	}
510 out:
511 	return rc;
512 }
513 
514 static void record__read_auxtrace_snapshot(struct record *rec)
515 {
516 	pr_debug("Recording AUX area tracing snapshot\n");
517 	if (record__auxtrace_read_snapshot_all(rec) < 0) {
518 		trigger_error(&auxtrace_snapshot_trigger);
519 	} else {
520 		if (auxtrace_record__snapshot_finish(rec->itr))
521 			trigger_error(&auxtrace_snapshot_trigger);
522 		else
523 			trigger_ready(&auxtrace_snapshot_trigger);
524 	}
525 }
526 
527 static int record__auxtrace_init(struct record *rec)
528 {
529 	int err;
530 
531 	if (!rec->itr) {
532 		rec->itr = auxtrace_record__init(rec->evlist, &err);
533 		if (err)
534 			return err;
535 	}
536 
537 	err = auxtrace_parse_snapshot_options(rec->itr, &rec->opts,
538 					      rec->opts.auxtrace_snapshot_opts);
539 	if (err)
540 		return err;
541 
542 	return auxtrace_parse_filters(rec->evlist);
543 }
544 
545 #else
546 
547 static inline
548 int record__auxtrace_mmap_read(struct record *rec __maybe_unused,
549 			       struct perf_mmap *map __maybe_unused)
550 {
551 	return 0;
552 }
553 
554 static inline
555 void record__read_auxtrace_snapshot(struct record *rec __maybe_unused)
556 {
557 }
558 
559 static inline
560 int auxtrace_record__snapshot_start(struct auxtrace_record *itr __maybe_unused)
561 {
562 	return 0;
563 }
564 
565 static int record__auxtrace_init(struct record *rec __maybe_unused)
566 {
567 	return 0;
568 }
569 
570 #endif
571 
572 static int record__mmap_evlist(struct record *rec,
573 			       struct perf_evlist *evlist)
574 {
575 	struct record_opts *opts = &rec->opts;
576 	char msg[512];
577 
578 	if (opts->affinity != PERF_AFFINITY_SYS)
579 		cpu__setup_cpunode_map();
580 
581 	if (perf_evlist__mmap_ex(evlist, opts->mmap_pages,
582 				 opts->auxtrace_mmap_pages,
583 				 opts->auxtrace_snapshot_mode,
584 				 opts->nr_cblocks, opts->affinity,
585 				 opts->mmap_flush) < 0) {
586 		if (errno == EPERM) {
587 			pr_err("Permission error mapping pages.\n"
588 			       "Consider increasing "
589 			       "/proc/sys/kernel/perf_event_mlock_kb,\n"
590 			       "or try again with a smaller value of -m/--mmap_pages.\n"
591 			       "(current value: %u,%u)\n",
592 			       opts->mmap_pages, opts->auxtrace_mmap_pages);
593 			return -errno;
594 		} else {
595 			pr_err("failed to mmap with %d (%s)\n", errno,
596 				str_error_r(errno, msg, sizeof(msg)));
597 			if (errno)
598 				return -errno;
599 			else
600 				return -EINVAL;
601 		}
602 	}
603 	return 0;
604 }
605 
606 static int record__mmap(struct record *rec)
607 {
608 	return record__mmap_evlist(rec, rec->evlist);
609 }
610 
611 static int record__open(struct record *rec)
612 {
613 	char msg[BUFSIZ];
614 	struct perf_evsel *pos;
615 	struct perf_evlist *evlist = rec->evlist;
616 	struct perf_session *session = rec->session;
617 	struct record_opts *opts = &rec->opts;
618 	int rc = 0;
619 
620 	/*
621 	 * For initial_delay we need to add a dummy event so that we can track
622 	 * PERF_RECORD_MMAP while we wait for the initial delay to enable the
623 	 * real events, the ones asked for by the user.
624 	 */
625 	if (opts->initial_delay) {
626 		if (perf_evlist__add_dummy(evlist))
627 			return -ENOMEM;
628 
629 		pos = perf_evlist__first(evlist);
630 		pos->tracking = 0;
631 		pos = perf_evlist__last(evlist);
632 		pos->tracking = 1;
633 		pos->attr.enable_on_exec = 1;
634 	}
635 
636 	perf_evlist__config(evlist, opts, &callchain_param);
637 
638 	evlist__for_each_entry(evlist, pos) {
639 try_again:
640 		if (perf_evsel__open(pos, pos->cpus, pos->threads) < 0) {
641 			if (perf_evsel__fallback(pos, errno, msg, sizeof(msg))) {
642 				if (verbose > 0)
643 					ui__warning("%s\n", msg);
644 				goto try_again;
645 			}
646 			if ((errno == EINVAL || errno == EBADF) &&
647 			    pos->leader != pos &&
648 			    pos->weak_group) {
649 				pos = perf_evlist__reset_weak_group(evlist, pos);
650 				goto try_again;
651 			}
652 			rc = -errno;
653 			perf_evsel__open_strerror(pos, &opts->target,
654 						  errno, msg, sizeof(msg));
655 			ui__error("%s\n", msg);
656 			goto out;
657 		}
658 
659 		pos->supported = true;
660 	}
661 
662 	if (perf_evlist__apply_filters(evlist, &pos)) {
663 		pr_err("failed to set filter \"%s\" on event %s with %d (%s)\n",
664 			pos->filter, perf_evsel__name(pos), errno,
665 			str_error_r(errno, msg, sizeof(msg)));
666 		rc = -1;
667 		goto out;
668 	}
669 
670 	rc = record__mmap(rec);
671 	if (rc)
672 		goto out;
673 
674 	session->evlist = evlist;
675 	perf_session__set_id_hdr_size(session);
676 out:
677 	return rc;
678 }
679 
680 static int process_sample_event(struct perf_tool *tool,
681 				union perf_event *event,
682 				struct perf_sample *sample,
683 				struct perf_evsel *evsel,
684 				struct machine *machine)
685 {
686 	struct record *rec = container_of(tool, struct record, tool);
687 
688 	if (rec->evlist->first_sample_time == 0)
689 		rec->evlist->first_sample_time = sample->time;
690 
691 	rec->evlist->last_sample_time = sample->time;
692 
693 	if (rec->buildid_all)
694 		return 0;
695 
696 	rec->samples++;
697 	return build_id__mark_dso_hit(tool, event, sample, evsel, machine);
698 }
699 
700 static int process_buildids(struct record *rec)
701 {
702 	struct perf_session *session = rec->session;
703 
704 	if (perf_data__size(&rec->data) == 0)
705 		return 0;
706 
707 	/*
708 	 * During this process, it'll load the kernel map and replace
709 	 * dso->long_name with the real pathname it found.  In this case
710 	 * we prefer the vmlinux path like
711 	 *   /lib/modules/3.16.4/build/vmlinux
712 	 *
713 	 * rather than the build-id path (in the debug directory).
714 	 *   $HOME/.debug/.build-id/f0/6e17aa50adf4d00b88925e03775de107611551
715 	 */
716 	symbol_conf.ignore_vmlinux_buildid = true;
717 
718 	/*
719 	 * If --buildid-all is given, it marks all DSOs regardless of hits,
720 	 * so there is no need to process samples. But if timestamp_boundary
721 	 * is enabled, it still needs to walk all samples to get the
722 	 * timestamps of the first/last samples.
723 	 */
724 	if (rec->buildid_all && !rec->timestamp_boundary)
725 		rec->tool.sample = NULL;
726 
727 	return perf_session__process_events(session);
728 }
729 
730 static void perf_event__synthesize_guest_os(struct machine *machine, void *data)
731 {
732 	int err;
733 	struct perf_tool *tool = data;
734 	/*
735 	 * For the guest kernel, when processing the record & report
736 	 * subcommands we arrange the module mmaps prior to the guest
737 	 * kernel mmap and trigger a dso preload, because guest module
738 	 * symbols are by default loaded from guest kallsyms instead of
739 	 * /lib/modules/XXX/XXX. This avoids missing symbols when the
740 	 * first address is in a module instead of in the guest kernel.
741 	 */
742 	err = perf_event__synthesize_modules(tool, process_synthesized_event,
743 					     machine);
744 	if (err < 0)
745 		pr_err("Couldn't record guest kernel [%d]'s reference"
746 		       " relocation symbol.\n", machine->pid);
747 
748 	/*
749 	 * We use _stext for the guest kernel because the guest kernel's
750 	 * /proc/kallsyms sometimes has no _text.
751 	 */
752 	err = perf_event__synthesize_kernel_mmap(tool, process_synthesized_event,
753 						 machine);
754 	if (err < 0)
755 		pr_err("Couldn't record guest kernel [%d]'s reference"
756 		       " relocation symbol.\n", machine->pid);
757 }
758 
759 static struct perf_event_header finished_round_event = {
760 	.size = sizeof(struct perf_event_header),
761 	.type = PERF_RECORD_FINISHED_ROUND,
762 };
763 
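/*
 * With --affinity=node or --affinity=cpu, migrate the recording thread to
 * the CPU mask associated with the mmap buffer before draining it.
 */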
764 static void record__adjust_affinity(struct record *rec, struct perf_mmap *map)
765 {
766 	if (rec->opts.affinity != PERF_AFFINITY_SYS &&
767 	    !CPU_EQUAL(&rec->affinity_mask, &map->affinity_mask)) {
768 		CPU_ZERO(&rec->affinity_mask);
769 		CPU_OR(&rec->affinity_mask, &rec->affinity_mask, &map->affinity_mask);
770 		sched_setaffinity(0, sizeof(rec->affinity_mask), &rec->affinity_mask);
771 	}
772 }
773 
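/*
 * Drain every mmap'ed ring buffer of the evlist into the output file, either
 * synchronously via perf_mmap__push() or through queued aio writes, and emit
 * a PERF_RECORD_FINISHED_ROUND event if anything was written. With synch,
 * each map temporarily uses flush == 1 so that all pending data is flushed.
 */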
774 static int record__mmap_read_evlist(struct record *rec, struct perf_evlist *evlist,
775 				    bool overwrite, bool synch)
776 {
777 	u64 bytes_written = rec->bytes_written;
778 	int i;
779 	int rc = 0;
780 	struct perf_mmap *maps;
781 	int trace_fd = rec->data.file.fd;
782 	off_t off;
783 
784 	if (!evlist)
785 		return 0;
786 
787 	maps = overwrite ? evlist->overwrite_mmap : evlist->mmap;
788 	if (!maps)
789 		return 0;
790 
791 	if (overwrite && evlist->bkw_mmap_state != BKW_MMAP_DATA_PENDING)
792 		return 0;
793 
794 	if (record__aio_enabled(rec))
795 		off = record__aio_get_pos(trace_fd);
796 
797 	for (i = 0; i < evlist->nr_mmaps; i++) {
798 		u64 flush = 0;
799 		struct perf_mmap *map = &maps[i];
800 
801 		if (map->base) {
802 			record__adjust_affinity(rec, map);
803 			if (synch) {
804 				flush = map->flush;
805 				map->flush = 1;
806 			}
807 			if (!record__aio_enabled(rec)) {
808 				if (perf_mmap__push(map, rec, record__pushfn) != 0) {
809 					if (synch)
810 						map->flush = flush;
811 					rc = -1;
812 					goto out;
813 				}
814 			} else {
815 				int idx;
816 				/*
817 				 * Call record__aio_sync() to wait until the map->data buffer
818 				 * becomes available after the previous aio write request.
819 				 */
820 				idx = record__aio_sync(map, false);
821 				if (perf_mmap__aio_push(map, rec, idx, record__aio_pushfn, &off) != 0) {
822 					record__aio_set_pos(trace_fd, off);
823 					if (synch)
824 						map->flush = flush;
825 					rc = -1;
826 					goto out;
827 				}
828 			}
829 			if (synch)
830 				map->flush = flush;
831 		}
832 
833 		if (map->auxtrace_mmap.base && !rec->opts.auxtrace_snapshot_mode &&
834 		    record__auxtrace_mmap_read(rec, map) != 0) {
835 			rc = -1;
836 			goto out;
837 		}
838 	}
839 
840 	if (record__aio_enabled(rec))
841 		record__aio_set_pos(trace_fd, off);
842 
843 	/*
844 	 * Mark the round finished in case we wrote
845 	 * at least one event.
846 	 */
847 	if (bytes_written != rec->bytes_written)
848 		rc = record__write(rec, NULL, &finished_round_event, sizeof(finished_round_event));
849 
850 	if (overwrite)
851 		perf_evlist__toggle_bkw_mmap(evlist, BKW_MMAP_EMPTY);
852 out:
853 	return rc;
854 }
855 
856 static int record__mmap_read_all(struct record *rec, bool synch)
857 {
858 	int err;
859 
860 	err = record__mmap_read_evlist(rec, rec->evlist, false, synch);
861 	if (err)
862 		return err;
863 
864 	return record__mmap_read_evlist(rec, rec->evlist, true, synch);
865 }
866 
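/*
 * Enable all perf.data header features by default, then clear the ones that
 * do not apply to this recording session.
 */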
867 static void record__init_features(struct record *rec)
868 {
869 	struct perf_session *session = rec->session;
870 	int feat;
871 
872 	for (feat = HEADER_FIRST_FEATURE; feat < HEADER_LAST_FEATURE; feat++)
873 		perf_header__set_feat(&session->header, feat);
874 
875 	if (rec->no_buildid)
876 		perf_header__clear_feat(&session->header, HEADER_BUILD_ID);
877 
878 	if (!have_tracepoints(&rec->evlist->entries))
879 		perf_header__clear_feat(&session->header, HEADER_TRACING_DATA);
880 
881 	if (!rec->opts.branch_stack)
882 		perf_header__clear_feat(&session->header, HEADER_BRANCH_STACK);
883 
884 	if (!rec->opts.full_auxtrace)
885 		perf_header__clear_feat(&session->header, HEADER_AUXTRACE);
886 
887 	if (!(rec->opts.use_clockid && rec->opts.clockid_res_ns))
888 		perf_header__clear_feat(&session->header, HEADER_CLOCKID);
889 
890 	perf_header__clear_feat(&session->header, HEADER_DIR_FORMAT);
891 
892 	perf_header__clear_feat(&session->header, HEADER_STAT);
893 }
894 
895 static void
896 record__finish_output(struct record *rec)
897 {
898 	struct perf_data *data = &rec->data;
899 	int fd = perf_data__fd(data);
900 
901 	if (data->is_pipe)
902 		return;
903 
904 	rec->session->header.data_size += rec->bytes_written;
905 	data->file.size = lseek(perf_data__fd(data), 0, SEEK_CUR);
906 
907 	if (!rec->no_buildid) {
908 		process_buildids(rec);
909 
910 		if (rec->buildid_all)
911 			dsos__hit_all(rec->session);
912 	}
913 	perf_session__write_header(rec->session, rec->evlist, fd, true);
914 
915 	return;
916 }
917 
918 static int record__synthesize_workload(struct record *rec, bool tail)
919 {
920 	int err;
921 	struct thread_map *thread_map;
922 
923 	if (rec->opts.tail_synthesize != tail)
924 		return 0;
925 
926 	thread_map = thread_map__new_by_tid(rec->evlist->workload.pid);
927 	if (thread_map == NULL)
928 		return -1;
929 
930 	err = perf_event__synthesize_thread_map(&rec->tool, thread_map,
931 						 process_synthesized_event,
932 						 &rec->session->machines.host,
933 						 rec->opts.sample_address);
934 	thread_map__put(thread_map);
935 	return err;
936 }
937 
938 static int record__synthesize(struct record *rec, bool tail);
939 
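/*
 * Finish the current output file and switch to a new, timestamp-suffixed one
 * via perf_data__switch(); returns its fd or a negative error code. Unless
 * called at exit, the bytes-written and data-size counters are reset for the
 * new file and tracking events are synthesized again.
 */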
940 static int
941 record__switch_output(struct record *rec, bool at_exit)
942 {
943 	struct perf_data *data = &rec->data;
944 	int fd, err;
945 	char *new_filename;
946 
947 	/* Same size as a timestamp like "2015122520103046" */
948 	char timestamp[] = "InvalidTimestamp";
949 
950 	record__aio_mmap_read_sync(rec);
951 
952 	record__synthesize(rec, true);
953 	if (target__none(&rec->opts.target))
954 		record__synthesize_workload(rec, true);
955 
956 	rec->samples = 0;
957 	record__finish_output(rec);
958 	err = fetch_current_timestamp(timestamp, sizeof(timestamp));
959 	if (err) {
960 		pr_err("Failed to get current timestamp\n");
961 		return -EINVAL;
962 	}
963 
964 	fd = perf_data__switch(data, timestamp,
965 				    rec->session->header.data_offset,
966 				    at_exit, &new_filename);
967 	if (fd >= 0 && !at_exit) {
968 		rec->bytes_written = 0;
969 		rec->session->header.data_size = 0;
970 	}
971 
972 	if (!quiet)
973 		fprintf(stderr, "[ perf record: Dump %s.%s ]\n",
974 			data->path, timestamp);
975 
976 	if (rec->switch_output.num_files) {
977 		int n = rec->switch_output.cur_file + 1;
978 
979 		if (n >= rec->switch_output.num_files)
980 			n = 0;
981 		rec->switch_output.cur_file = n;
982 		if (rec->switch_output.filenames[n]) {
983 			remove(rec->switch_output.filenames[n]);
984 			free(rec->switch_output.filenames[n]);
985 		}
986 		rec->switch_output.filenames[n] = new_filename;
987 	} else {
988 		free(new_filename);
989 	}
990 
991 	/* Output tracking events */
992 	if (!at_exit) {
993 		record__synthesize(rec, false);
994 
995 		/*
996 		 * In 'perf record --switch-output' without -a,
997 		 * record__synthesize() in record__switch_output() won't
998 		 * generate tracking events because there's no thread_map
999 		 * in evlist, which causes the newly created perf.data to
1000 		 * lack map and comm information.
1001 		 * Create a fake thread_map and directly call
1002 		 * perf_event__synthesize_thread_map() for those events.
1003 		 */
1004 		if (target__none(&rec->opts.target))
1005 			record__synthesize_workload(rec, false);
1006 	}
1007 	return fd;
1008 }
1009 
1010 static volatile int workload_exec_errno;
1011 
1012 /*
1013  * perf_evlist__prepare_workload will send a SIGUSR1
1014  * if the fork fails, since we asked for it by setting its
1015  * want_signal to true.
1016  */
1017 static void workload_exec_failed_signal(int signo __maybe_unused,
1018 					siginfo_t *info,
1019 					void *ucontext __maybe_unused)
1020 {
1021 	workload_exec_errno = info->si_value.sival_int;
1022 	done = 1;
1023 	child_finished = 1;
1024 }
1025 
1026 static void snapshot_sig_handler(int sig);
1027 static void alarm_sig_handler(int sig);
1028 
1029 int __weak
1030 perf_event__synth_time_conv(const struct perf_event_mmap_page *pc __maybe_unused,
1031 			    struct perf_tool *tool __maybe_unused,
1032 			    perf_event__handler_t process __maybe_unused,
1033 			    struct machine *machine __maybe_unused)
1034 {
1035 	return 0;
1036 }
1037 
1038 static const struct perf_event_mmap_page *
1039 perf_evlist__pick_pc(struct perf_evlist *evlist)
1040 {
1041 	if (evlist) {
1042 		if (evlist->mmap && evlist->mmap[0].base)
1043 			return evlist->mmap[0].base;
1044 		if (evlist->overwrite_mmap && evlist->overwrite_mmap[0].base)
1045 			return evlist->overwrite_mmap[0].base;
1046 	}
1047 	return NULL;
1048 }
1049 
1050 static const struct perf_event_mmap_page *record__pick_pc(struct record *rec)
1051 {
1052 	const struct perf_event_mmap_page *pc;
1053 
1054 	pc = perf_evlist__pick_pc(rec->evlist);
1055 	if (pc)
1056 		return pc;
1057 	return NULL;
1058 }
1059 
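/*
 * Emit the non-sample events that describe the recording environment:
 * attrs/features/tracing data for pipe output, time conversion and auxtrace
 * info, kernel and module mmaps, guest machines, extra attrs, thread and cpu
 * maps, bpf events and the already existing threads.
 */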
1060 static int record__synthesize(struct record *rec, bool tail)
1061 {
1062 	struct perf_session *session = rec->session;
1063 	struct machine *machine = &session->machines.host;
1064 	struct perf_data *data = &rec->data;
1065 	struct record_opts *opts = &rec->opts;
1066 	struct perf_tool *tool = &rec->tool;
1067 	int fd = perf_data__fd(data);
1068 	int err = 0;
1069 
1070 	if (rec->opts.tail_synthesize != tail)
1071 		return 0;
1072 
1073 	if (data->is_pipe) {
1074 		/*
1075 		 * We need to synthesize events first, because some
1076 		 * features work on top of them (on the report side).
1077 		 */
1078 		err = perf_event__synthesize_attrs(tool, rec->evlist,
1079 						   process_synthesized_event);
1080 		if (err < 0) {
1081 			pr_err("Couldn't synthesize attrs.\n");
1082 			goto out;
1083 		}
1084 
1085 		err = perf_event__synthesize_features(tool, session, rec->evlist,
1086 						      process_synthesized_event);
1087 		if (err < 0) {
1088 			pr_err("Couldn't synthesize features.\n");
1089 			return err;
1090 		}
1091 
1092 		if (have_tracepoints(&rec->evlist->entries)) {
1093 			/*
1094 			 * FIXME err <= 0 here actually means that
1095 			 * there were no tracepoints, so it's not really
1096 			 * an error, just that we don't need to
1097 			 * synthesize anything.  We really have to
1098 			 * return this more properly and also
1099 			 * propagate errors that currently call die().
1100 			 */
1101 			err = perf_event__synthesize_tracing_data(tool,	fd, rec->evlist,
1102 								  process_synthesized_event);
1103 			if (err <= 0) {
1104 				pr_err("Couldn't record tracing data.\n");
1105 				goto out;
1106 			}
1107 			rec->bytes_written += err;
1108 		}
1109 	}
1110 
1111 	err = perf_event__synth_time_conv(record__pick_pc(rec), tool,
1112 					  process_synthesized_event, machine);
1113 	if (err)
1114 		goto out;
1115 
1116 	if (rec->opts.full_auxtrace) {
1117 		err = perf_event__synthesize_auxtrace_info(rec->itr, tool,
1118 					session, process_synthesized_event);
1119 		if (err)
1120 			goto out;
1121 	}
1122 
1123 	if (!perf_evlist__exclude_kernel(rec->evlist)) {
1124 		err = perf_event__synthesize_kernel_mmap(tool, process_synthesized_event,
1125 							 machine);
1126 		WARN_ONCE(err < 0, "Couldn't record kernel reference relocation symbol\n"
1127 				   "Symbol resolution may be skewed if relocation was used (e.g. kexec).\n"
1128 				   "Check /proc/kallsyms permission or run as root.\n");
1129 
1130 		err = perf_event__synthesize_modules(tool, process_synthesized_event,
1131 						     machine);
1132 		WARN_ONCE(err < 0, "Couldn't record kernel module information.\n"
1133 				   "Symbol resolution may be skewed if relocation was used (e.g. kexec).\n"
1134 				   "Check /proc/modules permission or run as root.\n");
1135 	}
1136 
1137 	if (perf_guest) {
1138 		machines__process_guests(&session->machines,
1139 					 perf_event__synthesize_guest_os, tool);
1140 	}
1141 
1142 	err = perf_event__synthesize_extra_attr(&rec->tool,
1143 						rec->evlist,
1144 						process_synthesized_event,
1145 						data->is_pipe);
1146 	if (err)
1147 		goto out;
1148 
1149 	err = perf_event__synthesize_thread_map2(&rec->tool, rec->evlist->threads,
1150 						 process_synthesized_event,
1151 						NULL);
1152 	if (err < 0) {
1153 		pr_err("Couldn't synthesize thread map.\n");
1154 		return err;
1155 	}
1156 
1157 	err = perf_event__synthesize_cpu_map(&rec->tool, rec->evlist->cpus,
1158 					     process_synthesized_event, NULL);
1159 	if (err < 0) {
1160 		pr_err("Couldn't synthesize cpu map.\n");
1161 		return err;
1162 	}
1163 
1164 	err = perf_event__synthesize_bpf_events(session, process_synthesized_event,
1165 						machine, opts);
1166 	if (err < 0)
1167 		pr_warning("Couldn't synthesize bpf events.\n");
1168 
1169 	err = __machine__synthesize_threads(machine, tool, &opts->target, rec->evlist->threads,
1170 					    process_synthesized_event, opts->sample_address,
1171 					    1);
1172 out:
1173 	return err;
1174 }
1175 
1176 static int __cmd_record(struct record *rec, int argc, const char **argv)
1177 {
1178 	int err;
1179 	int status = 0;
1180 	unsigned long waking = 0;
1181 	const bool forks = argc > 0;
1182 	struct perf_tool *tool = &rec->tool;
1183 	struct record_opts *opts = &rec->opts;
1184 	struct perf_data *data = &rec->data;
1185 	struct perf_session *session;
1186 	bool disabled = false, draining = false;
1187 	struct perf_evlist *sb_evlist = NULL;
1188 	int fd;
1189 
1190 	atexit(record__sig_exit);
1191 	signal(SIGCHLD, sig_handler);
1192 	signal(SIGINT, sig_handler);
1193 	signal(SIGTERM, sig_handler);
1194 	signal(SIGSEGV, sigsegv_handler);
1195 
1196 	if (rec->opts.record_namespaces)
1197 		tool->namespace_events = true;
1198 
1199 	if (rec->opts.auxtrace_snapshot_mode || rec->switch_output.enabled) {
1200 		signal(SIGUSR2, snapshot_sig_handler);
1201 		if (rec->opts.auxtrace_snapshot_mode)
1202 			trigger_on(&auxtrace_snapshot_trigger);
1203 		if (rec->switch_output.enabled)
1204 			trigger_on(&switch_output_trigger);
1205 	} else {
1206 		signal(SIGUSR2, SIG_IGN);
1207 	}
1208 
1209 	session = perf_session__new(data, false, tool);
1210 	if (session == NULL) {
1211 		pr_err("Perf session creation failed.\n");
1212 		return -1;
1213 	}
1214 
1215 	fd = perf_data__fd(data);
1216 	rec->session = session;
1217 
1218 	record__init_features(rec);
1219 
1220 	if (rec->opts.use_clockid && rec->opts.clockid_res_ns)
1221 		session->header.env.clockid_res_ns = rec->opts.clockid_res_ns;
1222 
1223 	if (forks) {
1224 		err = perf_evlist__prepare_workload(rec->evlist, &opts->target,
1225 						    argv, data->is_pipe,
1226 						    workload_exec_failed_signal);
1227 		if (err < 0) {
1228 			pr_err("Couldn't run the workload!\n");
1229 			status = err;
1230 			goto out_delete_session;
1231 		}
1232 	}
1233 
1234 	/*
1235 	 * If we have just a single event and are sending data
1236 	 * through a pipe, we need to force id allocation,
1237 	 * because we synthesize the event name through the pipe
1238 	 * and need the id for that.
1239 	 */
1240 	if (data->is_pipe && rec->evlist->nr_entries == 1)
1241 		rec->opts.sample_id = true;
1242 
1243 	if (record__open(rec) != 0) {
1244 		err = -1;
1245 		goto out_child;
1246 	}
1247 
1248 	err = bpf__apply_obj_config();
1249 	if (err) {
1250 		char errbuf[BUFSIZ];
1251 
1252 		bpf__strerror_apply_obj_config(err, errbuf, sizeof(errbuf));
1253 		pr_err("ERROR: Apply config to BPF failed: %s\n",
1254 			 errbuf);
1255 		goto out_child;
1256 	}
1257 
1258 	/*
1259 	 * Normally perf_session__new would do this, but it doesn't have the
1260 	 * evlist.
1261 	 */
1262 	if (rec->tool.ordered_events && !perf_evlist__sample_id_all(rec->evlist)) {
1263 		pr_warning("WARNING: No sample_id_all support, falling back to unordered processing\n");
1264 		rec->tool.ordered_events = false;
1265 	}
1266 
1267 	if (!rec->evlist->nr_groups)
1268 		perf_header__clear_feat(&session->header, HEADER_GROUP_DESC);
1269 
1270 	if (data->is_pipe) {
1271 		err = perf_header__write_pipe(fd);
1272 		if (err < 0)
1273 			goto out_child;
1274 	} else {
1275 		err = perf_session__write_header(session, rec->evlist, fd, false);
1276 		if (err < 0)
1277 			goto out_child;
1278 	}
1279 
1280 	if (!rec->no_buildid
1281 	    && !perf_header__has_feat(&session->header, HEADER_BUILD_ID)) {
1282 		pr_err("Couldn't generate buildids. "
1283 		       "Use --no-buildid to profile anyway.\n");
1284 		err = -1;
1285 		goto out_child;
1286 	}
1287 
1288 	if (!opts->no_bpf_event)
1289 		bpf_event__add_sb_event(&sb_evlist, &session->header.env);
1290 
1291 	if (perf_evlist__start_sb_thread(sb_evlist, &rec->opts.target)) {
1292 		pr_debug("Couldn't start the BPF side band thread:\nBPF programs starting from now on won't be annotatable\n");
1293 		opts->no_bpf_event = true;
1294 	}
1295 
1296 	err = record__synthesize(rec, false);
1297 	if (err < 0)
1298 		goto out_child;
1299 
1300 	if (rec->realtime_prio) {
1301 		struct sched_param param;
1302 
1303 		param.sched_priority = rec->realtime_prio;
1304 		if (sched_setscheduler(0, SCHED_FIFO, &param)) {
1305 			pr_err("Could not set realtime priority.\n");
1306 			err = -1;
1307 			goto out_child;
1308 		}
1309 	}
1310 
1311 	/*
1312 	 * When perf is starting the traced process, all the events
1313 	 * (apart from group members) have enable_on_exec=1 set,
1314 	 * so don't spoil it by prematurely enabling them.
1315 	 */
1316 	if (!target__none(&opts->target) && !opts->initial_delay)
1317 		perf_evlist__enable(rec->evlist);
1318 
1319 	/*
1320 	 * Let the child rip
1321 	 */
1322 	if (forks) {
1323 		struct machine *machine = &session->machines.host;
1324 		union perf_event *event;
1325 		pid_t tgid;
1326 
1327 		event = malloc(sizeof(event->comm) + machine->id_hdr_size);
1328 		if (event == NULL) {
1329 			err = -ENOMEM;
1330 			goto out_child;
1331 		}
1332 
1333 		/*
1334 		 * Some H/W events are generated before the COMM event,
1335 		 * which is emitted during exec(), so perf script
1336 		 * cannot see a correct process name for those events.
1337 		 * Synthesize a COMM event to prevent that.
1338 		 */
1339 		tgid = perf_event__synthesize_comm(tool, event,
1340 						   rec->evlist->workload.pid,
1341 						   process_synthesized_event,
1342 						   machine);
1343 		free(event);
1344 
1345 		if (tgid == -1)
1346 			goto out_child;
1347 
1348 		event = malloc(sizeof(event->namespaces) +
1349 			       (NR_NAMESPACES * sizeof(struct perf_ns_link_info)) +
1350 			       machine->id_hdr_size);
1351 		if (event == NULL) {
1352 			err = -ENOMEM;
1353 			goto out_child;
1354 		}
1355 
1356 		/*
1357 		 * Synthesize NAMESPACES event for the command specified.
1358 		 */
1359 		perf_event__synthesize_namespaces(tool, event,
1360 						  rec->evlist->workload.pid,
1361 						  tgid, process_synthesized_event,
1362 						  machine);
1363 		free(event);
1364 
1365 		perf_evlist__start_workload(rec->evlist);
1366 	}
1367 
1368 	if (opts->initial_delay) {
1369 		usleep(opts->initial_delay * USEC_PER_MSEC);
1370 		perf_evlist__enable(rec->evlist);
1371 	}
1372 
1373 	trigger_ready(&auxtrace_snapshot_trigger);
1374 	trigger_ready(&switch_output_trigger);
1375 	perf_hooks__invoke_record_start();
1376 	for (;;) {
1377 		unsigned long long hits = rec->samples;
1378 
1379 		/*
1380 		 * rec->evlist->bkw_mmap_state may be BKW_MMAP_EMPTY
1381 		 * here: when done == true and hits != rec->samples in
1382 		 * the previous round.
1383 		 *
1384 		 * perf_evlist__toggle_bkw_mmap ensures we never
1385 		 * convert BKW_MMAP_EMPTY to BKW_MMAP_DATA_PENDING.
1386 		 */
1387 		if (trigger_is_hit(&switch_output_trigger) || done || draining)
1388 			perf_evlist__toggle_bkw_mmap(rec->evlist, BKW_MMAP_DATA_PENDING);
1389 
1390 		if (record__mmap_read_all(rec, false) < 0) {
1391 			trigger_error(&auxtrace_snapshot_trigger);
1392 			trigger_error(&switch_output_trigger);
1393 			err = -1;
1394 			goto out_child;
1395 		}
1396 
1397 		if (auxtrace_record__snapshot_started) {
1398 			auxtrace_record__snapshot_started = 0;
1399 			if (!trigger_is_error(&auxtrace_snapshot_trigger))
1400 				record__read_auxtrace_snapshot(rec);
1401 			if (trigger_is_error(&auxtrace_snapshot_trigger)) {
1402 				pr_err("AUX area tracing snapshot failed\n");
1403 				err = -1;
1404 				goto out_child;
1405 			}
1406 		}
1407 
1408 		if (trigger_is_hit(&switch_output_trigger)) {
1409 			/*
1410 			 * If switch_output_trigger is hit, the data in the
1411 			 * overwritable ring buffer should have been collected,
1412 			 * so bkw_mmap_state should be set to BKW_MMAP_EMPTY.
1413 			 *
1414 			 * If SIGUSR2 is raised after or during record__mmap_read_all(),
1415 			 * record__mmap_read_all() didn't collect data from the
1416 			 * overwritable ring buffer. Read again.
1417 			 */
1418 			if (rec->evlist->bkw_mmap_state == BKW_MMAP_RUNNING)
1419 				continue;
1420 			trigger_ready(&switch_output_trigger);
1421 
1422 			/*
1423 			 * Reenable events in overwrite ring buffer after
1424 			 * record__mmap_read_all(): we should have collected
1425 			 * data from it.
1426 			 */
1427 			perf_evlist__toggle_bkw_mmap(rec->evlist, BKW_MMAP_RUNNING);
1428 
1429 			if (!quiet)
1430 				fprintf(stderr, "[ perf record: dump data: Woken up %ld times ]\n",
1431 					waking);
1432 			waking = 0;
1433 			fd = record__switch_output(rec, false);
1434 			if (fd < 0) {
1435 				pr_err("Failed to switch to new file\n");
1436 				trigger_error(&switch_output_trigger);
1437 				err = fd;
1438 				goto out_child;
1439 			}
1440 
1441 			/* re-arm the alarm */
1442 			if (rec->switch_output.time)
1443 				alarm(rec->switch_output.time);
1444 		}
1445 
1446 		if (hits == rec->samples) {
1447 			if (done || draining)
1448 				break;
1449 			err = perf_evlist__poll(rec->evlist, -1);
1450 			/*
1451 			 * Propagate the error only if there is one. Ignore a positive
1452 			 * number of returned events and interrupt errors.
1453 			 */
1454 			if (err > 0 || (err < 0 && errno == EINTR))
1455 				err = 0;
1456 			waking++;
1457 
1458 			if (perf_evlist__filter_pollfd(rec->evlist, POLLERR | POLLHUP) == 0)
1459 				draining = true;
1460 		}
1461 
1462 		/*
1463 		 * When perf is starting the traced process, the events die with
1464 		 * the process at the end and we wait for that. Thus there is no
1465 		 * need to disable events in this case.
1466 		 */
1467 		if (done && !disabled && !target__none(&opts->target)) {
1468 			trigger_off(&auxtrace_snapshot_trigger);
1469 			perf_evlist__disable(rec->evlist);
1470 			disabled = true;
1471 		}
1472 	}
1473 	trigger_off(&auxtrace_snapshot_trigger);
1474 	trigger_off(&switch_output_trigger);
1475 
1476 	if (forks && workload_exec_errno) {
1477 		char msg[STRERR_BUFSIZE];
1478 		const char *emsg = str_error_r(workload_exec_errno, msg, sizeof(msg));
1479 		pr_err("Workload failed: %s\n", emsg);
1480 		err = -1;
1481 		goto out_child;
1482 	}
1483 
1484 	if (!quiet)
1485 		fprintf(stderr, "[ perf record: Woken up %ld times to write data ]\n", waking);
1486 
1487 	if (target__none(&rec->opts.target))
1488 		record__synthesize_workload(rec, true);
1489 
1490 out_child:
1491 	record__mmap_read_all(rec, true);
1492 	record__aio_mmap_read_sync(rec);
1493 
1494 	if (forks) {
1495 		int exit_status;
1496 
1497 		if (!child_finished)
1498 			kill(rec->evlist->workload.pid, SIGTERM);
1499 
1500 		wait(&exit_status);
1501 
1502 		if (err < 0)
1503 			status = err;
1504 		else if (WIFEXITED(exit_status))
1505 			status = WEXITSTATUS(exit_status);
1506 		else if (WIFSIGNALED(exit_status))
1507 			signr = WTERMSIG(exit_status);
1508 	} else
1509 		status = err;
1510 
1511 	record__synthesize(rec, true);
1512 	/* this will be recalculated during process_buildids() */
1513 	rec->samples = 0;
1514 
1515 	if (!err) {
1516 		if (!rec->timestamp_filename) {
1517 			record__finish_output(rec);
1518 		} else {
1519 			fd = record__switch_output(rec, true);
1520 			if (fd < 0) {
1521 				status = fd;
1522 				goto out_delete_session;
1523 			}
1524 		}
1525 	}
1526 
1527 	perf_hooks__invoke_record_end();
1528 
1529 	if (!err && !quiet) {
1530 		char samples[128];
1531 		const char *postfix = rec->timestamp_filename ?
1532 					".<timestamp>" : "";
1533 
1534 		if (rec->samples && !rec->opts.full_auxtrace)
1535 			scnprintf(samples, sizeof(samples),
1536 				  " (%" PRIu64 " samples)", rec->samples);
1537 		else
1538 			samples[0] = '\0';
1539 
1540 		fprintf(stderr,	"[ perf record: Captured and wrote %.3f MB %s%s%s ]\n",
1541 			perf_data__size(data) / 1024.0 / 1024.0,
1542 			data->path, postfix, samples);
1543 	}
1544 
1545 out_delete_session:
1546 	perf_session__delete(session);
1547 
1548 	if (!opts->no_bpf_event)
1549 		perf_evlist__stop_sb_thread(sb_evlist);
1550 	return status;
1551 }
1552 
1553 static void callchain_debug(struct callchain_param *callchain)
1554 {
1555 	static const char *str[CALLCHAIN_MAX] = { "NONE", "FP", "DWARF", "LBR" };
1556 
1557 	pr_debug("callchain: type %s\n", str[callchain->record_mode]);
1558 
1559 	if (callchain->record_mode == CALLCHAIN_DWARF)
1560 		pr_debug("callchain: stack dump size %d\n",
1561 			 callchain->dump_size);
1562 }
1563 
1564 int record_opts__parse_callchain(struct record_opts *record,
1565 				 struct callchain_param *callchain,
1566 				 const char *arg, bool unset)
1567 {
1568 	int ret;
1569 	callchain->enabled = !unset;
1570 
1571 	/* --no-call-graph */
1572 	if (unset) {
1573 		callchain->record_mode = CALLCHAIN_NONE;
1574 		pr_debug("callchain: disabled\n");
1575 		return 0;
1576 	}
1577 
1578 	ret = parse_callchain_record_opt(arg, callchain);
1579 	if (!ret) {
1580 		/* Enable data address sampling for DWARF unwind. */
1581 		if (callchain->record_mode == CALLCHAIN_DWARF)
1582 			record->sample_address = true;
1583 		callchain_debug(callchain);
1584 	}
1585 
1586 	return ret;
1587 }
1588 
1589 int record_parse_callchain_opt(const struct option *opt,
1590 			       const char *arg,
1591 			       int unset)
1592 {
1593 	return record_opts__parse_callchain(opt->value, &callchain_param, arg, unset);
1594 }
1595 
1596 int record_callchain_opt(const struct option *opt,
1597 			 const char *arg __maybe_unused,
1598 			 int unset __maybe_unused)
1599 {
1600 	struct callchain_param *callchain = opt->value;
1601 
1602 	callchain->enabled = true;
1603 
1604 	if (callchain->record_mode == CALLCHAIN_NONE)
1605 		callchain->record_mode = CALLCHAIN_FP;
1606 
1607 	callchain_debug(callchain);
1608 	return 0;
1609 }
1610 
1611 static int perf_record_config(const char *var, const char *value, void *cb)
1612 {
1613 	struct record *rec = cb;
1614 
1615 	if (!strcmp(var, "record.build-id")) {
1616 		if (!strcmp(value, "cache"))
1617 			rec->no_buildid_cache = false;
1618 		else if (!strcmp(value, "no-cache"))
1619 			rec->no_buildid_cache = true;
1620 		else if (!strcmp(value, "skip"))
1621 			rec->no_buildid = true;
1622 		else
1623 			return -1;
1624 		return 0;
1625 	}
1626 	if (!strcmp(var, "record.call-graph")) {
1627 		var = "call-graph.record-mode";
1628 		return perf_default_config(var, value, cb);
1629 	}
1630 #ifdef HAVE_AIO_SUPPORT
1631 	if (!strcmp(var, "record.aio")) {
1632 		rec->opts.nr_cblocks = strtol(value, NULL, 0);
1633 		if (!rec->opts.nr_cblocks)
1634 			rec->opts.nr_cblocks = nr_cblocks_default;
1635 	}
1636 #endif
1637 
1638 	return 0;
1639 }
1640 
1641 struct clockid_map {
1642 	const char *name;
1643 	int clockid;
1644 };
1645 
1646 #define CLOCKID_MAP(n, c)	\
1647 	{ .name = n, .clockid = (c), }
1648 
1649 #define CLOCKID_END	{ .name = NULL, }
1650 
1651 
1652 /*
1653  * Add the missing ones, we need to build on many distros...
1654  */
1655 #ifndef CLOCK_MONOTONIC_RAW
1656 #define CLOCK_MONOTONIC_RAW 4
1657 #endif
1658 #ifndef CLOCK_BOOTTIME
1659 #define CLOCK_BOOTTIME 7
1660 #endif
1661 #ifndef CLOCK_TAI
1662 #define CLOCK_TAI 11
1663 #endif
1664 
1665 static const struct clockid_map clockids[] = {
1666 	/* available for all events, NMI safe */
1667 	CLOCKID_MAP("monotonic", CLOCK_MONOTONIC),
1668 	CLOCKID_MAP("monotonic_raw", CLOCK_MONOTONIC_RAW),
1669 
1670 	/* available for some events */
1671 	CLOCKID_MAP("realtime", CLOCK_REALTIME),
1672 	CLOCKID_MAP("boottime", CLOCK_BOOTTIME),
1673 	CLOCKID_MAP("tai", CLOCK_TAI),
1674 
1675 	/* available for the lazy */
1676 	CLOCKID_MAP("mono", CLOCK_MONOTONIC),
1677 	CLOCKID_MAP("raw", CLOCK_MONOTONIC_RAW),
1678 	CLOCKID_MAP("real", CLOCK_REALTIME),
1679 	CLOCKID_MAP("boot", CLOCK_BOOTTIME),
1680 
1681 	CLOCKID_END,
1682 };
1683 
1684 static int get_clockid_res(clockid_t clk_id, u64 *res_ns)
1685 {
1686 	struct timespec res;
1687 
1688 	*res_ns = 0;
1689 	if (!clock_getres(clk_id, &res))
1690 		*res_ns = res.tv_nsec + res.tv_sec * NSEC_PER_SEC;
1691 	else
1692 		pr_warning("WARNING: Failed to determine specified clock resolution.\n");
1693 
1694 	return 0;
1695 }
1696 
1697 static int parse_clockid(const struct option *opt, const char *str, int unset)
1698 {
1699 	struct record_opts *opts = (struct record_opts *)opt->value;
1700 	const struct clockid_map *cm;
1701 	const char *ostr = str;
1702 
1703 	if (unset) {
1704 		opts->use_clockid = 0;
1705 		return 0;
1706 	}
1707 
1708 	/* no arg passed */
1709 	if (!str)
1710 		return 0;
1711 
1712 	/* no setting it twice */
1713 	if (opts->use_clockid)
1714 		return -1;
1715 
1716 	opts->use_clockid = true;
1717 
1718 	/* if it's a number, we're done */
1719 	if (sscanf(str, "%d", &opts->clockid) == 1)
1720 		return get_clockid_res(opts->clockid, &opts->clockid_res_ns);
1721 
1722 	/* allow a "CLOCK_" prefix to the name */
1723 	if (!strncasecmp(str, "CLOCK_", 6))
1724 		str += 6;
1725 
1726 	for (cm = clockids; cm->name; cm++) {
1727 		if (!strcasecmp(str, cm->name)) {
1728 			opts->clockid = cm->clockid;
1729 			return get_clockid_res(opts->clockid,
1730 					       &opts->clockid_res_ns);
1731 		}
1732 	}
1733 
1734 	opts->use_clockid = false;
1735 	ui__warning("unknown clockid %s, check man page\n", ostr);
1736 	return -1;
1737 }
1738 
1739 static int record__parse_affinity(const struct option *opt, const char *str, int unset)
1740 {
1741 	struct record_opts *opts = (struct record_opts *)opt->value;
1742 
1743 	if (unset || !str)
1744 		return 0;
1745 
1746 	if (!strcasecmp(str, "node"))
1747 		opts->affinity = PERF_AFFINITY_NODE;
1748 	else if (!strcasecmp(str, "cpu"))
1749 		opts->affinity = PERF_AFFINITY_CPU;
1750 
1751 	return 0;
1752 }
1753 
1754 static int record__parse_mmap_pages(const struct option *opt,
1755 				    const char *str,
1756 				    int unset __maybe_unused)
1757 {
1758 	struct record_opts *opts = opt->value;
1759 	char *s, *p;
1760 	unsigned int mmap_pages;
1761 	int ret;
1762 
1763 	if (!str)
1764 		return -EINVAL;
1765 
1766 	s = strdup(str);
1767 	if (!s)
1768 		return -ENOMEM;
1769 
1770 	p = strchr(s, ',');
1771 	if (p)
1772 		*p = '\0';
1773 
1774 	if (*s) {
1775 		ret = __perf_evlist__parse_mmap_pages(&mmap_pages, s);
1776 		if (ret)
1777 			goto out_free;
1778 		opts->mmap_pages = mmap_pages;
1779 	}
1780 
1781 	if (!p) {
1782 		ret = 0;
1783 		goto out_free;
1784 	}
1785 
1786 	ret = __perf_evlist__parse_mmap_pages(&mmap_pages, p + 1);
1787 	if (ret)
1788 		goto out_free;
1789 
1790 	opts->auxtrace_mmap_pages = mmap_pages;
1791 
1792 out_free:
1793 	free(s);
1794 	return ret;
1795 }
1796 
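/*
 * Warn when the --switch-output size threshold is smaller than half of the
 * mmap buffer (the wakeup size): data is only flushed on wakeup, so the
 * resulting files can end up noticeably larger than requested.
 */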
1797 static void switch_output_size_warn(struct record *rec)
1798 {
1799 	u64 wakeup_size = perf_evlist__mmap_size(rec->opts.mmap_pages);
1800 	struct switch_output *s = &rec->switch_output;
1801 
1802 	wakeup_size /= 2;
1803 
1804 	if (s->size < wakeup_size) {
1805 		char buf[100];
1806 
1807 		unit_number__scnprintf(buf, sizeof(buf), wakeup_size);
1808 		pr_warning("WARNING: switch-output data size is lower than the "
1809 			   "wakeup kernel buffer size (%s); "
1810 			   "expect bigger perf.data sizes\n", buf);
1811 	}
1812 }
1813 
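/*
 * Parse the --switch-output argument: "signal" switches output on SIGUSR2,
 * a size with a B/K/M/G suffix switches once that much data has been
 * written, and a time with a s/m/h/d suffix switches periodically. Any of
 * these also enables timestamped output file names.
 */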
1814 static int switch_output_setup(struct record *rec)
1815 {
1816 	struct switch_output *s = &rec->switch_output;
1817 	static struct parse_tag tags_size[] = {
1818 		{ .tag  = 'B', .mult = 1       },
1819 		{ .tag  = 'K', .mult = 1 << 10 },
1820 		{ .tag  = 'M', .mult = 1 << 20 },
1821 		{ .tag  = 'G', .mult = 1 << 30 },
1822 		{ .tag  = 0 },
1823 	};
1824 	static struct parse_tag tags_time[] = {
1825 		{ .tag  = 's', .mult = 1        },
1826 		{ .tag  = 'm', .mult = 60       },
1827 		{ .tag  = 'h', .mult = 60*60    },
1828 		{ .tag  = 'd', .mult = 60*60*24 },
1829 		{ .tag  = 0 },
1830 	};
1831 	unsigned long val;
1832 
1833 	if (!s->set)
1834 		return 0;
1835 
1836 	if (!strcmp(s->str, "signal")) {
1837 		s->signal = true;
1838 		pr_debug("switch-output with SIGUSR2 signal\n");
1839 		goto enabled;
1840 	}
1841 
1842 	val = parse_tag_value(s->str, tags_size);
1843 	if (val != (unsigned long) -1) {
1844 		s->size = val;
1845 		pr_debug("switch-output with %s size threshold\n", s->str);
1846 		goto enabled;
1847 	}
1848 
1849 	val = parse_tag_value(s->str, tags_time);
1850 	if (val != (unsigned long) -1) {
1851 		s->time = val;
1852 		pr_debug("switch-output with %s time threshold (%lu seconds)\n",
1853 			 s->str, s->time);
1854 		goto enabled;
1855 	}
1856 
1857 	return -1;
1858 
1859 enabled:
1860 	rec->timestamp_filename = true;
1861 	s->enabled              = true;
1862 
1863 	if (s->size && !rec->opts.no_buffering)
1864 		switch_output_size_warn(rec);
1865 
1866 	return 0;
1867 }
1868 
1869 static const char * const __record_usage[] = {
1870 	"perf record [<options>] [<command>]",
1871 	"perf record [<options>] -- <command> [<options>]",
1872 	NULL
1873 };
1874 const char * const *record_usage = __record_usage;
1875 
1876 /*
1877  * XXX Ideally would be local to cmd_record() and passed to a record__new
1878  * because we need to have access to it in record__exit, that is called
1879  * after cmd_record() exits, but since record_options need to be accessible to
1880  * builtin-script, leave it here.
1881  *
1882  * At least we don't ouch it in all the other functions here directly.
1883  * At least we don't touch it in all the other functions here directly.
1884  * Just say no to tons of global variables, sigh.
1885  */
1886 static struct record record = {
1887 	.opts = {
1888 		.sample_time	     = true,
1889 		.mmap_pages	     = UINT_MAX,
1890 		.user_freq	     = UINT_MAX,
1891 		.user_interval	     = ULLONG_MAX,
1892 		.freq		     = 4000,
1893 		.target		     = {
1894 			.uses_mmap   = true,
1895 			.default_per_cpu = true,
1896 		},
1897 		.mmap_flush          = MMAP_FLUSH_DEFAULT,
1898 	},
1899 	.tool = {
1900 		.sample		= process_sample_event,
1901 		.fork		= perf_event__process_fork,
1902 		.exit		= perf_event__process_exit,
1903 		.comm		= perf_event__process_comm,
1904 		.namespaces	= perf_event__process_namespaces,
1905 		.mmap		= perf_event__process_mmap,
1906 		.mmap2		= perf_event__process_mmap2,
1907 		.ordered_events	= true,
1908 	},
1909 };
1910 
1911 const char record_callchain_help[] = CALLCHAIN_RECORD_HELP
1912 	"\n\t\t\t\tDefault: fp";
1913 
1914 static bool dry_run;
1915 
1916 /*
1917  * XXX Will stay a global variable till we fix builtin-script.c to stop messing
1918  * with it and switch to use the library functions in perf_evlist that came
1919  * from builtin-record.c, i.e. use record_opts,
1920  * perf_evlist__prepare_workload, etc instead of fork+exec'in 'perf record',
1921  * using pipes, etc.
1922  */
1923 static struct option __record_options[] = {
1924 	OPT_CALLBACK('e', "event", &record.evlist, "event",
1925 		     "event selector. use 'perf list' to list available events",
1926 		     parse_events_option),
1927 	OPT_CALLBACK(0, "filter", &record.evlist, "filter",
1928 		     "event filter", parse_filter),
1929 	OPT_CALLBACK_NOOPT(0, "exclude-perf", &record.evlist,
1930 			   NULL, "don't record events from perf itself",
1931 			   exclude_perf),
1932 	OPT_STRING('p', "pid", &record.opts.target.pid, "pid",
1933 		    "record events on existing process id"),
1934 	OPT_STRING('t', "tid", &record.opts.target.tid, "tid",
1935 		    "record events on existing thread id"),
1936 	OPT_INTEGER('r', "realtime", &record.realtime_prio,
1937 		    "collect data with this RT SCHED_FIFO priority"),
1938 	OPT_BOOLEAN(0, "no-buffering", &record.opts.no_buffering,
1939 		    "collect data without buffering"),
1940 	OPT_BOOLEAN('R', "raw-samples", &record.opts.raw_samples,
1941 		    "collect raw sample records from all opened counters"),
1942 	OPT_BOOLEAN('a', "all-cpus", &record.opts.target.system_wide,
1943 			    "system-wide collection from all CPUs"),
1944 	OPT_STRING('C', "cpu", &record.opts.target.cpu_list, "cpu",
1945 		    "list of cpus to monitor"),
1946 	OPT_U64('c', "count", &record.opts.user_interval, "event period to sample"),
1947 	OPT_STRING('o', "output", &record.data.path, "file",
1948 		    "output file name"),
1949 	OPT_BOOLEAN_SET('i', "no-inherit", &record.opts.no_inherit,
1950 			&record.opts.no_inherit_set,
1951 			"child tasks do not inherit counters"),
1952 	OPT_BOOLEAN(0, "tail-synthesize", &record.opts.tail_synthesize,
1953 		    "synthesize non-sample events at the end of output"),
1954 	OPT_BOOLEAN(0, "overwrite", &record.opts.overwrite, "use overwrite mode"),
1955 	OPT_BOOLEAN(0, "no-bpf-event", &record.opts.no_bpf_event, "do not record bpf events"),
1956 	OPT_BOOLEAN(0, "strict-freq", &record.opts.strict_freq,
1957 		    "Fail if the specified frequency can't be used"),
1958 	OPT_CALLBACK('F', "freq", &record.opts, "freq or 'max'",
1959 		     "profile at this frequency",
1960 		      record__parse_freq),
1961 	OPT_CALLBACK('m', "mmap-pages", &record.opts, "pages[,pages]",
1962 		     "number of mmap data pages and AUX area tracing mmap pages",
1963 		     record__parse_mmap_pages),
1964 	OPT_CALLBACK(0, "mmap-flush", &record.opts, "number",
1965 		     "Minimum number of bytes extracted from mmap data pages (default: 1)",
1966 		     record__mmap_flush_parse),
1967 	OPT_BOOLEAN(0, "group", &record.opts.group,
1968 		    "put the counters into a counter group"),
1969 	OPT_CALLBACK_NOOPT('g', NULL, &callchain_param,
1970 			   NULL, "enables call-graph recording",
1971 			   &record_callchain_opt),
1972 	OPT_CALLBACK(0, "call-graph", &record.opts,
1973 		     "record_mode[,record_size]", record_callchain_help,
1974 		     &record_parse_callchain_opt),
1975 	OPT_INCR('v', "verbose", &verbose,
1976 		    "be more verbose (show counter open errors, etc)"),
1977 	OPT_BOOLEAN('q', "quiet", &quiet, "don't print any message"),
1978 	OPT_BOOLEAN('s', "stat", &record.opts.inherit_stat,
1979 		    "per thread counts"),
1980 	OPT_BOOLEAN('d', "data", &record.opts.sample_address, "Record the sample addresses"),
1981 	OPT_BOOLEAN(0, "phys-data", &record.opts.sample_phys_addr,
1982 		    "Record the sample physical addresses"),
1983 	OPT_BOOLEAN(0, "sample-cpu", &record.opts.sample_cpu, "Record the sample cpu"),
1984 	OPT_BOOLEAN_SET('T', "timestamp", &record.opts.sample_time,
1985 			&record.opts.sample_time_set,
1986 			"Record the sample timestamps"),
1987 	OPT_BOOLEAN_SET('P', "period", &record.opts.period, &record.opts.period_set,
1988 			"Record the sample period"),
1989 	OPT_BOOLEAN('n', "no-samples", &record.opts.no_samples,
1990 		    "don't sample"),
1991 	OPT_BOOLEAN_SET('N', "no-buildid-cache", &record.no_buildid_cache,
1992 			&record.no_buildid_cache_set,
1993 			"do not update the buildid cache"),
1994 	OPT_BOOLEAN_SET('B', "no-buildid", &record.no_buildid,
1995 			&record.no_buildid_set,
1996 			"do not collect buildids in perf.data"),
1997 	OPT_CALLBACK('G', "cgroup", &record.evlist, "name",
1998 		     "monitor event in cgroup name only",
1999 		     parse_cgroups),
2000 	OPT_UINTEGER('D', "delay", &record.opts.initial_delay,
2001 		  "ms to wait before starting measurement after program start"),
2002 	OPT_STRING('u', "uid", &record.opts.target.uid_str, "user",
2003 		   "user to profile"),
2004 
2005 	OPT_CALLBACK_NOOPT('b', "branch-any", &record.opts.branch_stack,
2006 		     "branch any", "sample any taken branches",
2007 		     parse_branch_stack),
2008 
2009 	OPT_CALLBACK('j', "branch-filter", &record.opts.branch_stack,
2010 		     "branch filter mask", "branch stack filter modes",
2011 		     parse_branch_stack),
2012 	OPT_BOOLEAN('W', "weight", &record.opts.sample_weight,
2013 		    "sample by weight (on special events only)"),
2014 	OPT_BOOLEAN(0, "transaction", &record.opts.sample_transaction,
2015 		    "sample transaction flags (special events only)"),
2016 	OPT_BOOLEAN(0, "per-thread", &record.opts.target.per_thread,
2017 		    "use per-thread mmaps"),
2018 	OPT_CALLBACK_OPTARG('I', "intr-regs", &record.opts.sample_intr_regs, NULL, "any register",
2019 		    "sample selected machine registers on interrupt,"
2020 		    " use -I ? to list register names", parse_regs),
2021 	OPT_CALLBACK_OPTARG(0, "user-regs", &record.opts.sample_user_regs, NULL, "any register",
2022 		    "sample selected machine registers on interrupt,"
2023 		    " use --user-regs=? to list register names", parse_regs),
2024 	OPT_BOOLEAN(0, "running-time", &record.opts.running_time,
2025 		    "Record running/enabled time of read (:S) events"),
2026 	OPT_CALLBACK('k', "clockid", &record.opts,
2027 		     "clockid", "clockid to use for events, see clock_gettime()",
2028 		     parse_clockid),
2029 	OPT_STRING_OPTARG('S', "snapshot", &record.opts.auxtrace_snapshot_opts,
2030 			  "opts", "AUX area tracing Snapshot Mode", ""),
2031 	OPT_UINTEGER(0, "proc-map-timeout", &proc_map_timeout,
2032 			"per thread proc mmap processing timeout in ms"),
2033 	OPT_BOOLEAN(0, "namespaces", &record.opts.record_namespaces,
2034 		    "Record namespaces events"),
2035 	OPT_BOOLEAN(0, "switch-events", &record.opts.record_switch_events,
2036 		    "Record context switch events"),
2037 	OPT_BOOLEAN_FLAG(0, "all-kernel", &record.opts.all_kernel,
2038 			 "Configure all used events to run in kernel space.",
2039 			 PARSE_OPT_EXCLUSIVE),
2040 	OPT_BOOLEAN_FLAG(0, "all-user", &record.opts.all_user,
2041 			 "Configure all used events to run in user space.",
2042 			 PARSE_OPT_EXCLUSIVE),
2043 	OPT_STRING(0, "clang-path", &llvm_param.clang_path, "clang path",
2044 		   "clang binary to use for compiling BPF scriptlets"),
2045 	OPT_STRING(0, "clang-opt", &llvm_param.clang_opt, "clang options",
2046 		   "options passed to clang when compiling BPF scriptlets"),
2047 	OPT_STRING(0, "vmlinux", &symbol_conf.vmlinux_name,
2048 		   "file", "vmlinux pathname"),
2049 	OPT_BOOLEAN(0, "buildid-all", &record.buildid_all,
2050 		    "Record build-id of all DSOs regardless of hits"),
2051 	OPT_BOOLEAN(0, "timestamp-filename", &record.timestamp_filename,
2052 		    "append timestamp to output filename"),
2053 	OPT_BOOLEAN(0, "timestamp-boundary", &record.timestamp_boundary,
2054 		    "Record timestamp boundary (time of first/last samples)"),
2055 	OPT_STRING_OPTARG_SET(0, "switch-output", &record.switch_output.str,
2056 			  &record.switch_output.set, "signal or size[BKMG] or time[smhd]",
2057 			  "Switch output when receiving SIGUSR2 (signal) or when crossing a size or time threshold",
2058 			  "signal"),
2059 	OPT_INTEGER(0, "switch-max-files", &record.switch_output.num_files,
2060 		   "Limit number of switch output generated files"),
2061 	OPT_BOOLEAN(0, "dry-run", &dry_run,
2062 		    "Parse options then exit"),
2063 #ifdef HAVE_AIO_SUPPORT
2064 	OPT_CALLBACK_OPTARG(0, "aio", &record.opts,
2065 		     &nr_cblocks_default, "n", "Use <n> control blocks in asynchronous trace writing mode (default: 1, max: 4)",
2066 		     record__aio_parse),
2067 #endif
2068 	OPT_CALLBACK(0, "affinity", &record.opts, "node|cpu",
2069 		     "Set affinity mask of trace reading thread to NUMA node cpu mask or cpu of processed mmap buffer",
2070 		     record__parse_affinity),
2071 	OPT_END()
2072 };
2073 
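/*
 * A few illustrative invocations combining the options above (a sketch, not an
 * exhaustive list):
 *
 *   perf record -F 99 -a -g -- sleep 10     99 Hz, all CPUs, call graphs
 *   perf record -e cycles -p 1234           attach to an existing pid
 *   perf record --switch-output=1m -a       rotate the output every minute
 */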
2074 struct option *record_options = __record_options;
2075 
2076 int cmd_record(int argc, const char **argv)
2077 {
2078 	int err;
2079 	struct record *rec = &record;
2080 	char errbuf[BUFSIZ];
2081 
2082 	setlocale(LC_ALL, "");
2083 
2084 #ifndef HAVE_LIBBPF_SUPPORT
2085 # define set_nobuild(s, l, c) set_option_nobuild(record_options, s, l, "NO_LIBBPF=1", c)
2086 	set_nobuild('\0', "clang-path", true);
2087 	set_nobuild('\0', "clang-opt", true);
2088 # undef set_nobuild
2089 #endif
2090 
2091 #ifndef HAVE_BPF_PROLOGUE
2092 # if !defined (HAVE_DWARF_SUPPORT)
2093 #  define REASON  "NO_DWARF=1"
2094 # elif !defined (HAVE_LIBBPF_SUPPORT)
2095 #  define REASON  "NO_LIBBPF=1"
2096 # else
2097 #  define REASON  "this architecture doesn't support BPF prologue"
2098 # endif
2099 # define set_nobuild(s, l, c) set_option_nobuild(record_options, s, l, REASON, c)
2100 	set_nobuild('\0', "vmlinux", true);
2101 # undef set_nobuild
2102 # undef REASON
2103 #endif
2104 
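	/*
	 * Default: "SYS" affinity, i.e. leave the trace reading thread's CPU
	 * mask untouched; --affinity=node|cpu narrows it per mmap buffer.
	 */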
2105 	CPU_ZERO(&rec->affinity_mask);
2106 	rec->opts.affinity = PERF_AFFINITY_SYS;
2107 
2108 	rec->evlist = perf_evlist__new();
2109 	if (rec->evlist == NULL)
2110 		return -ENOMEM;
2111 
2112 	err = perf_config(perf_record_config, rec);
2113 	if (err)
2114 		return err;
2115 
2116 	argc = parse_options(argc, argv, record_options, record_usage,
2117 			    PARSE_OPT_STOP_AT_NON_OPTION);
2118 	if (quiet)
2119 		perf_quiet_option();
2120 
2121 	/* Make system wide (-a) the default target. */
2122 	if (!argc && target__none(&rec->opts.target))
2123 		rec->opts.target.system_wide = true;
2124 
2125 	if (nr_cgroups && !rec->opts.target.system_wide) {
2126 		usage_with_options_msg(record_usage, record_options,
2127 			"cgroup monitoring only available in system-wide mode");
2128 
2129 	}
2130 	if (rec->opts.record_switch_events &&
2131 	    !perf_can_record_switch_events()) {
2132 		ui__error("kernel does not support recording context switch events\n");
2133 		parse_options_usage(record_usage, record_options, "switch-events", 0);
2134 		return -EINVAL;
2135 	}
2136 
2137 	if (switch_output_setup(rec)) {
2138 		parse_options_usage(record_usage, record_options, "switch-output", 0);
2139 		return -EINVAL;
2140 	}
2141 
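	/*
	 * Arm a SIGALRM timer for time based switch-output; the handler
	 * (alarm_sig_handler() below) fires the switch_output trigger.
	 */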
2142 	if (rec->switch_output.time) {
2143 		signal(SIGALRM, alarm_sig_handler);
2144 		alarm(rec->switch_output.time);
2145 	}
2146 
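	/*
	 * Pre-allocate storage for the rotated output file names so the
	 * oldest ones can be removed once --switch-max-files is exceeded.
	 */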
2147 	if (rec->switch_output.num_files) {
2148 		rec->switch_output.filenames = calloc(rec->switch_output.num_files,
2149 						      sizeof(char *));
2150 		if (!rec->switch_output.filenames)
2151 			return -ENOMEM;
2152 	}
2153 
2154 	/*
2155 	 * Allow aliases to facilitate the lookup of symbols for address
2156 	 * filters. Refer to auxtrace_parse_filters().
2157 	 */
2158 	symbol_conf.allow_aliases = true;
2159 
2160 	symbol__init(NULL);
2161 
2162 	err = record__auxtrace_init(rec);
2163 	if (err)
2164 		goto out;
2165 
2166 	if (dry_run)
2167 		goto out;
2168 
2169 	err = bpf__setup_stdout(rec->evlist);
2170 	if (err) {
2171 		bpf__strerror_setup_stdout(rec->evlist, err, errbuf, sizeof(errbuf));
2172 		pr_err("ERROR: Setup BPF stdout failed: %s\n",
2173 			 errbuf);
2174 		goto out;
2175 	}
2176 
2177 	err = -ENOMEM;
2178 
2179 	if (symbol_conf.kptr_restrict && !perf_evlist__exclude_kernel(rec->evlist))
2180 		pr_warning(
2181 "WARNING: Kernel address maps (/proc/{kallsyms,modules}) are restricted,\n"
2182 "check /proc/sys/kernel/kptr_restrict.\n\n"
2183 "Samples in kernel functions may not be resolved if a suitable vmlinux\n"
2184 "file is not found in the buildid cache or in the vmlinux path.\n\n"
2185 "Samples in kernel modules won't be resolved at all.\n\n"
2186 "If some relocation was applied (e.g. kexec) symbols may be misresolved\n"
2187 "even with a suitable vmlinux or kallsyms file.\n\n");
2188 
2189 	if (rec->no_buildid_cache || rec->no_buildid) {
2190 		disable_buildid_cache();
2191 	} else if (rec->switch_output.enabled) {
2192 		/*
2193 		 * In 'perf record --switch-output', disable buildid
2194 		 * generation by default to reduce data file switching
2195 		 * overhead. Still generate buildids if they are explicitly
2196 		 * required, using
2197 		 *
2198 		 *  perf record --switch-output --no-no-buildid \
2199 		 *              --no-no-buildid-cache
2200 		 *
2201 		 * The following code is equivalent to:
2202 		 *
2203 		 * if ((rec->no_buildid || !rec->no_buildid_set) &&
2204 		 *     (rec->no_buildid_cache || !rec->no_buildid_cache_set))
2205 		 *         disable_buildid_cache();
2206 		 */
2207 		bool disable = true;
2208 
2209 		if (rec->no_buildid_set && !rec->no_buildid)
2210 			disable = false;
2211 		if (rec->no_buildid_cache_set && !rec->no_buildid_cache)
2212 			disable = false;
2213 		if (disable) {
2214 			rec->no_buildid = true;
2215 			rec->no_buildid_cache = true;
2216 			disable_buildid_cache();
2217 		}
2218 	}
2219 
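	/*
	 * In overwrite mode the ring buffer keeps only the newest data, so the
	 * non-sample (side-band) events must be synthesized at the end of the
	 * session instead of at the start.
	 */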
2220 	if (record.opts.overwrite)
2221 		record.opts.tail_synthesize = true;
2222 
2223 	if (rec->evlist->nr_entries == 0 &&
2224 	    __perf_evlist__add_default(rec->evlist, !record.opts.no_samples) < 0) {
2225 		pr_err("Not enough memory for event selector list\n");
2226 		goto out;
2227 	}
2228 
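	/*
	 * When attaching to an existing thread (-t), do not inherit counters
	 * into its children unless the user explicitly configured inheritance.
	 */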
2229 	if (rec->opts.target.tid && !rec->opts.no_inherit_set)
2230 		rec->opts.no_inherit = true;
2231 
2232 	err = target__validate(&rec->opts.target);
2233 	if (err) {
2234 		target__strerror(&rec->opts.target, err, errbuf, BUFSIZ);
2235 		ui__warning("%s\n", errbuf);
2236 	}
2237 
2238 	err = target__parse_uid(&rec->opts.target);
2239 	if (err) {
2240 		int saved_errno = errno;
2241 
2242 		target__strerror(&rec->opts.target, err, errbuf, BUFSIZ);
2243 		ui__error("%s", errbuf);
2244 
2245 		err = -saved_errno;
2246 		goto out;
2247 	}
2248 
2249 	/* Enable ignoring missing threads when -u/-p option is defined. */
2250 	rec->opts.ignore_missing_thread = rec->opts.target.uid != UINT_MAX || rec->opts.target.pid;
2251 
2252 	err = -ENOMEM;
2253 	if (perf_evlist__create_maps(rec->evlist, &rec->opts.target) < 0)
2254 		usage_with_options(record_usage, record_options);
2255 
2256 	err = auxtrace_record__options(rec->itr, rec->evlist, &rec->opts);
2257 	if (err)
2258 		goto out;
2259 
2260 	/*
2261 	 * We take all buildids when the file contains AUX area
2262 	 * tracing data, because we do not decode the trace, as
2263 	 * doing so would take too long.
2264 	 */
2265 	if (rec->opts.full_auxtrace)
2266 		rec->buildid_all = true;
2267 
2268 	if (record_opts__config(&rec->opts)) {
2269 		err = -EINVAL;
2270 		goto out;
2271 	}
2272 
2273 	if (rec->opts.nr_cblocks > nr_cblocks_max)
2274 		rec->opts.nr_cblocks = nr_cblocks_max;
2275 	if (verbose > 0)
2276 		pr_info("nr_cblocks: %d\n", rec->opts.nr_cblocks);
2277 
2278 	pr_debug("affinity: %s\n", affinity_tags[rec->opts.affinity]);
2279 	pr_debug("mmap flush: %d\n", rec->opts.mmap_flush);
2280 
2281 	err = __cmd_record(&record, argc, argv);
2282 out:
2283 	perf_evlist__delete(rec->evlist);
2284 	symbol__exit();
2285 	auxtrace_record__free(rec->itr);
2286 	return err;
2287 }
2288 
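/*
 * SIGUSR2 handler: take an AUX area tracing snapshot and/or request an
 * output file switch, depending on what was enabled.
 */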
2289 static void snapshot_sig_handler(int sig __maybe_unused)
2290 {
2291 	struct record *rec = &record;
2292 
2293 	if (trigger_is_ready(&auxtrace_snapshot_trigger)) {
2294 		trigger_hit(&auxtrace_snapshot_trigger);
2295 		auxtrace_record__snapshot_started = 1;
2296 		if (auxtrace_record__snapshot_start(record.itr))
2297 			trigger_error(&auxtrace_snapshot_trigger);
2298 	}
2299 
2300 	if (switch_output_signal(rec))
2301 		trigger_hit(&switch_output_trigger);
2302 }
2303 
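/*
 * SIGALRM handler: fires the switch_output trigger for time based
 * --switch-output (the timer is armed in cmd_record()).
 */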
2304 static void alarm_sig_handler(int sig __maybe_unused)
2305 {
2306 	struct record *rec = &record;
2307 
2308 	if (switch_output_time(rec))
2309 		trigger_hit(&switch_output_trigger);
2310 }
2311