xref: /openbmc/linux/tools/perf/builtin-record.c (revision e620a1e0)
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * builtin-record.c
4  *
5  * Builtin record command: Record the profile of a workload
6  * (or a CPU, or a PID) into the perf.data output file - for
7  * later analysis via perf report.
8  */
9 #include "builtin.h"
10 
11 #include "util/build-id.h"
12 #include <subcmd/parse-options.h>
13 #include "util/parse-events.h"
14 #include "util/config.h"
15 
16 #include "util/callchain.h"
17 #include "util/cgroup.h"
18 #include "util/header.h"
19 #include "util/event.h"
20 #include "util/evlist.h"
21 #include "util/evsel.h"
22 #include "util/debug.h"
23 #include "util/mmap.h"
24 #include "util/target.h"
25 #include "util/session.h"
26 #include "util/tool.h"
27 #include "util/symbol.h"
28 #include "util/record.h"
29 #include "util/cpumap.h"
30 #include "util/thread_map.h"
31 #include "util/data.h"
32 #include "util/perf_regs.h"
33 #include "util/auxtrace.h"
34 #include "util/tsc.h"
35 #include "util/parse-branch-options.h"
36 #include "util/parse-regs-options.h"
37 #include "util/llvm-utils.h"
38 #include "util/bpf-loader.h"
39 #include "util/trigger.h"
40 #include "util/perf-hooks.h"
41 #include "util/cpu-set-sched.h"
42 #include "util/synthetic-events.h"
43 #include "util/time-utils.h"
44 #include "util/units.h"
45 #include "util/bpf-event.h"
46 #include "asm/bug.h"
47 #include "perf.h"
48 
49 #include <errno.h>
50 #include <inttypes.h>
51 #include <locale.h>
52 #include <poll.h>
53 #include <unistd.h>
54 #include <sched.h>
55 #include <signal.h>
56 #include <sys/mman.h>
57 #include <sys/wait.h>
58 #include <linux/err.h>
59 #include <linux/string.h>
60 #include <linux/time64.h>
61 #include <linux/zalloc.h>
62 
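/*
 * State for rotating the output file: rotation can be requested by a
 * SIGUSR2 signal, a written-bytes threshold or a time interval
 * (see switch_output_setup()).
 */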
63 struct switch_output {
64 	bool		 enabled;
65 	bool		 signal;
66 	unsigned long	 size;
67 	unsigned long	 time;
68 	const char	*str;
69 	bool		 set;
70 	char		 **filenames;
71 	int		 num_files;
72 	int		 cur_file;
73 };
74 
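/*
 * Per-invocation state of 'perf record': the output data file, the event
 * list, the session being written and bookkeeping such as bytes written
 * and the switch-output settings.
 */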
75 struct record {
76 	struct perf_tool	tool;
77 	struct record_opts	opts;
78 	u64			bytes_written;
79 	struct perf_data	data;
80 	struct auxtrace_record	*itr;
81 	struct evlist	*evlist;
82 	struct perf_session	*session;
83 	int			realtime_prio;
84 	bool			no_buildid;
85 	bool			no_buildid_set;
86 	bool			no_buildid_cache;
87 	bool			no_buildid_cache_set;
88 	bool			buildid_all;
89 	bool			timestamp_filename;
90 	bool			timestamp_boundary;
91 	struct switch_output	switch_output;
92 	unsigned long long	samples;
93 	cpu_set_t		affinity_mask;
94 };
95 
96 static volatile int auxtrace_record__snapshot_started;
97 static DEFINE_TRIGGER(auxtrace_snapshot_trigger);
98 static DEFINE_TRIGGER(switch_output_trigger);
99 
100 static const char *affinity_tags[PERF_AFFINITY_MAX] = {
101 	"SYS", "NODE", "CPU"
102 };
103 
104 static bool switch_output_signal(struct record *rec)
105 {
106 	return rec->switch_output.signal &&
107 	       trigger_is_ready(&switch_output_trigger);
108 }
109 
110 static bool switch_output_size(struct record *rec)
111 {
112 	return rec->switch_output.size &&
113 	       trigger_is_ready(&switch_output_trigger) &&
114 	       (rec->bytes_written >= rec->switch_output.size);
115 }
116 
117 static bool switch_output_time(struct record *rec)
118 {
119 	return rec->switch_output.time &&
120 	       trigger_is_ready(&switch_output_trigger);
121 }
122 
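/*
 * Write a block of bytes to the perf.data file, account it in
 * bytes_written and fire the switch-output trigger once the size
 * threshold is crossed.
 */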
123 static int record__write(struct record *rec, struct mmap *map __maybe_unused,
124 			 void *bf, size_t size)
125 {
126 	struct perf_data_file *file = &rec->session->data->file;
127 
128 	if (perf_data_file__write(file, bf, size) < 0) {
129 		pr_err("failed to write perf data, error: %m\n");
130 		return -1;
131 	}
132 
133 	rec->bytes_written += size;
134 
135 	if (switch_output_size(rec))
136 		trigger_hit(&switch_output_trigger);
137 
138 	return 0;
139 }
140 
141 static int record__aio_enabled(struct record *rec);
142 static int record__comp_enabled(struct record *rec);
143 static size_t zstd_compress(struct perf_session *session, void *dst, size_t dst_size,
144 			    void *src, size_t src_size);
145 
146 #ifdef HAVE_AIO_SUPPORT
147 static int record__aio_write(struct aiocb *cblock, int trace_fd,
148 		void *buf, size_t size, off_t off)
149 {
150 	int rc;
151 
152 	cblock->aio_fildes = trace_fd;
153 	cblock->aio_buf    = buf;
154 	cblock->aio_nbytes = size;
155 	cblock->aio_offset = off;
156 	cblock->aio_sigevent.sigev_notify = SIGEV_NONE;
157 
158 	do {
159 		rc = aio_write(cblock);
160 		if (rc == 0) {
161 			break;
162 		} else if (errno != EAGAIN) {
163 			cblock->aio_fildes = -1;
164 			pr_err("failed to queue perf data, error: %m\n");
165 			break;
166 		}
167 	} while (1);
168 
169 	return rc;
170 }
171 
172 static int record__aio_complete(struct mmap *md, struct aiocb *cblock)
173 {
174 	void *rem_buf;
175 	off_t rem_off;
176 	size_t rem_size;
177 	int rc, aio_errno;
178 	ssize_t aio_ret, written;
179 
180 	aio_errno = aio_error(cblock);
181 	if (aio_errno == EINPROGRESS)
182 		return 0;
183 
184 	written = aio_ret = aio_return(cblock);
185 	if (aio_ret < 0) {
186 		if (aio_errno != EINTR)
187 			pr_err("failed to write perf data, error: %m\n");
188 		written = 0;
189 	}
190 
191 	rem_size = cblock->aio_nbytes - written;
192 
193 	if (rem_size == 0) {
194 		cblock->aio_fildes = -1;
195 		/*
196 		 * md->refcount is incremented in record__aio_pushfn() for
197 		 * every aio write request started in record__aio_push(), so
198 		 * decrement it because the request is now complete.
199 		 */
200 		perf_mmap__put(md);
201 		rc = 1;
202 	} else {
203 		/*
204 		 * The aio write request may need to be restarted with the
205 		 * remainder if the kernel didn't write the whole
206 		 * chunk at once.
207 		 */
208 		rem_off = cblock->aio_offset + written;
209 		rem_buf = (void *)(cblock->aio_buf + written);
210 		record__aio_write(cblock, cblock->aio_fildes,
211 				rem_buf, rem_size, rem_off);
212 		rc = 0;
213 	}
214 
215 	return rc;
216 }
217 
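/*
 * Wait for in-flight aio writes: with sync_all == false return the index
 * of the first free control block; with sync_all == true keep suspending
 * until every outstanding write has completed and return -1.
 */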
218 static int record__aio_sync(struct mmap *md, bool sync_all)
219 {
220 	struct aiocb **aiocb = md->aio.aiocb;
221 	struct aiocb *cblocks = md->aio.cblocks;
222 	struct timespec timeout = { 0, 1000 * 1000  * 1 }; /* 1ms */
223 	int i, do_suspend;
224 
225 	do {
226 		do_suspend = 0;
227 		for (i = 0; i < md->aio.nr_cblocks; ++i) {
228 			if (cblocks[i].aio_fildes == -1 || record__aio_complete(md, &cblocks[i])) {
229 				if (sync_all)
230 					aiocb[i] = NULL;
231 				else
232 					return i;
233 			} else {
234 				/*
235 				 * The started aio write is not complete yet,
236 				 * so it has to be waited on before the
237 				 * next allocation.
238 				 */
239 				aiocb[i] = &cblocks[i];
240 				do_suspend = 1;
241 			}
242 		}
243 		if (!do_suspend)
244 			return -1;
245 
246 		while (aio_suspend((const struct aiocb **)aiocb, md->aio.nr_cblocks, &timeout)) {
247 			if (!(errno == EAGAIN || errno == EINTR))
248 				pr_err("failed to sync perf data, error: %m\n");
249 		}
250 	} while (1);
251 }
252 
253 struct record_aio {
254 	struct record	*rec;
255 	void		*data;
256 	size_t		size;
257 };
258 
259 static int record__aio_pushfn(struct mmap *map, void *to, void *buf, size_t size)
260 {
261 	struct record_aio *aio = to;
262 
263 	/*
264 	 * map->core.base data pointed to by buf is copied into a free map->aio.data[] buffer
265 	 * to release space in the kernel buffer as fast as possible, by calling
266 	 * perf_mmap__consume() from the perf_mmap__push() function.
267 	 *
268 	 * That lets the kernel proceed with storing more profiling data into
269 	 * the kernel buffer earlier than other per-cpu kernel buffers are handled.
270 	 *
271 	 * Copying can be done in two steps in case the chunk of profiling data
272 	 * crosses the upper bound of the kernel buffer. In this case we first move
273 	 * part of the data from map->start till the upper bound and then the remainder
274 	 * from the beginning of the kernel buffer till the end of the data chunk.
275 	 */
276 
277 	if (record__comp_enabled(aio->rec)) {
278 		size = zstd_compress(aio->rec->session, aio->data + aio->size,
279 				     perf_mmap__mmap_len(map) - aio->size,
280 				     buf, size);
281 	} else {
282 		memcpy(aio->data + aio->size, buf, size);
283 	}
284 
285 	if (!aio->size) {
286 		/*
287 		 * Increment map->refcount to guard map->aio.data[] buffer
288 		 * from premature deallocation because map object can be
289 		 * released earlier than aio write request started on
290 		 * map->aio.data[] buffer is complete.
291 		 *
292 		 * perf_mmap__put() is done at record__aio_complete()
293 		 * after started aio request completion or at record__aio_push()
294 		 * if the request failed to start.
295 		 */
296 		perf_mmap__get(map);
297 	}
298 
299 	aio->size += size;
300 
301 	return size;
302 }
303 
304 static int record__aio_push(struct record *rec, struct mmap *map, off_t *off)
305 {
306 	int ret, idx;
307 	int trace_fd = rec->session->data->file.fd;
308 	struct record_aio aio = { .rec = rec, .size = 0 };
309 
310 	/*
311 	 * Call record__aio_sync() to wait until a map->aio.data[] buffer
312 	 * becomes available after the previous aio write operation.
313 	 */
314 
315 	idx = record__aio_sync(map, false);
316 	aio.data = map->aio.data[idx];
317 	ret = perf_mmap__push(map, &aio, record__aio_pushfn);
318 	if (ret != 0) /* ret > 0 - no data, ret < 0 - error */
319 		return ret;
320 
321 	rec->samples++;
322 	ret = record__aio_write(&(map->aio.cblocks[idx]), trace_fd, aio.data, aio.size, *off);
323 	if (!ret) {
324 		*off += aio.size;
325 		rec->bytes_written += aio.size;
326 		if (switch_output_size(rec))
327 			trigger_hit(&switch_output_trigger);
328 	} else {
329 		/*
330 		 * Decrement the map->refcount taken in record__aio_pushfn()
331 		 * if the record__aio_write() operation failed to start; otherwise
332 		 * map->refcount is decremented in record__aio_complete() after
333 		 * the aio write operation finishes successfully.
334 		 */
335 		perf_mmap__put(map);
336 	}
337 
338 	return ret;
339 }
340 
341 static off_t record__aio_get_pos(int trace_fd)
342 {
343 	return lseek(trace_fd, 0, SEEK_CUR);
344 }
345 
346 static void record__aio_set_pos(int trace_fd, off_t pos)
347 {
348 	lseek(trace_fd, pos, SEEK_SET);
349 }
350 
351 static void record__aio_mmap_read_sync(struct record *rec)
352 {
353 	int i;
354 	struct evlist *evlist = rec->evlist;
355 	struct mmap *maps = evlist->mmap;
356 
357 	if (!record__aio_enabled(rec))
358 		return;
359 
360 	for (i = 0; i < evlist->core.nr_mmaps; i++) {
361 		struct mmap *map = &maps[i];
362 
363 		if (map->core.base)
364 			record__aio_sync(map, true);
365 	}
366 }
367 
368 static int nr_cblocks_default = 1;
369 static int nr_cblocks_max = 4;
370 
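/*
 * Parse the number of aio control blocks used for asynchronous trace
 * writing; an absent or zero value falls back to nr_cblocks_default.
 */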
371 static int record__aio_parse(const struct option *opt,
372 			     const char *str,
373 			     int unset)
374 {
375 	struct record_opts *opts = (struct record_opts *)opt->value;
376 
377 	if (unset) {
378 		opts->nr_cblocks = 0;
379 	} else {
380 		if (str)
381 			opts->nr_cblocks = strtol(str, NULL, 0);
382 		if (!opts->nr_cblocks)
383 			opts->nr_cblocks = nr_cblocks_default;
384 	}
385 
386 	return 0;
387 }
388 #else /* HAVE_AIO_SUPPORT */
389 static int nr_cblocks_max = 0;
390 
391 static int record__aio_push(struct record *rec __maybe_unused, struct mmap *map __maybe_unused,
392 			    off_t *off __maybe_unused)
393 {
394 	return -1;
395 }
396 
397 static off_t record__aio_get_pos(int trace_fd __maybe_unused)
398 {
399 	return -1;
400 }
401 
402 static void record__aio_set_pos(int trace_fd __maybe_unused, off_t pos __maybe_unused)
403 {
404 }
405 
406 static void record__aio_mmap_read_sync(struct record *rec __maybe_unused)
407 {
408 }
409 #endif
410 
411 static int record__aio_enabled(struct record *rec)
412 {
413 	return rec->opts.nr_cblocks > 0;
414 }
415 
416 #define MMAP_FLUSH_DEFAULT 1
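/*
 * Parse the mmap flush threshold: accepts a plain number or a B/K/M/G
 * suffixed size, defaults to MMAP_FLUSH_DEFAULT and is capped at a quarter
 * of the mmap buffer size.
 */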
417 static int record__mmap_flush_parse(const struct option *opt,
418 				    const char *str,
419 				    int unset)
420 {
421 	int flush_max;
422 	struct record_opts *opts = (struct record_opts *)opt->value;
423 	static struct parse_tag tags[] = {
424 			{ .tag  = 'B', .mult = 1       },
425 			{ .tag  = 'K', .mult = 1 << 10 },
426 			{ .tag  = 'M', .mult = 1 << 20 },
427 			{ .tag  = 'G', .mult = 1 << 30 },
428 			{ .tag  = 0 },
429 	};
430 
431 	if (unset)
432 		return 0;
433 
434 	if (str) {
435 		opts->mmap_flush = parse_tag_value(str, tags);
436 		if (opts->mmap_flush == (int)-1)
437 			opts->mmap_flush = strtol(str, NULL, 0);
438 	}
439 
440 	if (!opts->mmap_flush)
441 		opts->mmap_flush = MMAP_FLUSH_DEFAULT;
442 
443 	flush_max = evlist__mmap_size(opts->mmap_pages);
444 	flush_max /= 4;
445 	if (opts->mmap_flush > flush_max)
446 		opts->mmap_flush = flush_max;
447 
448 	return 0;
449 }
450 
451 #ifdef HAVE_ZSTD_SUPPORT
452 static unsigned int comp_level_default = 1;
453 
454 static int record__parse_comp_level(const struct option *opt, const char *str, int unset)
455 {
456 	struct record_opts *opts = opt->value;
457 
458 	if (unset) {
459 		opts->comp_level = 0;
460 	} else {
461 		if (str)
462 			opts->comp_level = strtol(str, NULL, 0);
463 		if (!opts->comp_level)
464 			opts->comp_level = comp_level_default;
465 	}
466 
467 	return 0;
468 }
469 #endif
470 static unsigned int comp_level_max = 22;
471 
472 static int record__comp_enabled(struct record *rec)
473 {
474 	return rec->opts.comp_level > 0;
475 }
476 
477 static int process_synthesized_event(struct perf_tool *tool,
478 				     union perf_event *event,
479 				     struct perf_sample *sample __maybe_unused,
480 				     struct machine *machine __maybe_unused)
481 {
482 	struct record *rec = container_of(tool, struct record, tool);
483 	return record__write(rec, NULL, event, event->header.size);
484 }
485 
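/*
 * perf_mmap__push() callback: optionally compress the chunk taken off the
 * ring buffer, then hand it to record__write().
 */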
486 static int record__pushfn(struct mmap *map, void *to, void *bf, size_t size)
487 {
488 	struct record *rec = to;
489 
490 	if (record__comp_enabled(rec)) {
491 		size = zstd_compress(rec->session, map->data, perf_mmap__mmap_len(map), bf, size);
492 		bf   = map->data;
493 	}
494 
495 	rec->samples++;
496 	return record__write(rec, map, bf, size);
497 }
498 
499 static volatile int done;
500 static volatile int signr = -1;
501 static volatile int child_finished;
502 
503 static void sig_handler(int sig)
504 {
505 	if (sig == SIGCHLD)
506 		child_finished = 1;
507 	else
508 		signr = sig;
509 
510 	done = 1;
511 }
512 
513 static void sigsegv_handler(int sig)
514 {
515 	perf_hooks__recover();
516 	sighandler_dump_stack(sig);
517 }
518 
519 static void record__sig_exit(void)
520 {
521 	if (signr == -1)
522 		return;
523 
524 	signal(signr, SIG_DFL);
525 	raise(signr);
526 }
527 
528 #ifdef HAVE_AUXTRACE_SUPPORT
529 
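/*
 * Write an AUX area trace event plus its data (possibly in two pieces when
 * it wraps the ring buffer), padding the payload to an 8-byte boundary and
 * recording an index entry for non-pipe, non-directory output.
 */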
530 static int record__process_auxtrace(struct perf_tool *tool,
531 				    struct mmap *map,
532 				    union perf_event *event, void *data1,
533 				    size_t len1, void *data2, size_t len2)
534 {
535 	struct record *rec = container_of(tool, struct record, tool);
536 	struct perf_data *data = &rec->data;
537 	size_t padding;
538 	u8 pad[8] = {0};
539 
540 	if (!perf_data__is_pipe(data) && !perf_data__is_dir(data)) {
541 		off_t file_offset;
542 		int fd = perf_data__fd(data);
543 		int err;
544 
545 		file_offset = lseek(fd, 0, SEEK_CUR);
546 		if (file_offset == -1)
547 			return -1;
548 		err = auxtrace_index__auxtrace_event(&rec->session->auxtrace_index,
549 						     event, file_offset);
550 		if (err)
551 			return err;
552 	}
553 
554 	/* event.auxtrace.size includes padding, see __auxtrace_mmap__read() */
555 	padding = (len1 + len2) & 7;
556 	if (padding)
557 		padding = 8 - padding;
558 
559 	record__write(rec, map, event, event->header.size);
560 	record__write(rec, map, data1, len1);
561 	if (len2)
562 		record__write(rec, map, data2, len2);
563 	record__write(rec, map, &pad, padding);
564 
565 	return 0;
566 }
567 
568 static int record__auxtrace_mmap_read(struct record *rec,
569 				      struct mmap *map)
570 {
571 	int ret;
572 
573 	ret = auxtrace_mmap__read(map, rec->itr, &rec->tool,
574 				  record__process_auxtrace);
575 	if (ret < 0)
576 		return ret;
577 
578 	if (ret)
579 		rec->samples++;
580 
581 	return 0;
582 }
583 
584 static int record__auxtrace_mmap_read_snapshot(struct record *rec,
585 					       struct mmap *map)
586 {
587 	int ret;
588 
589 	ret = auxtrace_mmap__read_snapshot(map, rec->itr, &rec->tool,
590 					   record__process_auxtrace,
591 					   rec->opts.auxtrace_snapshot_size);
592 	if (ret < 0)
593 		return ret;
594 
595 	if (ret)
596 		rec->samples++;
597 
598 	return 0;
599 }
600 
601 static int record__auxtrace_read_snapshot_all(struct record *rec)
602 {
603 	int i;
604 	int rc = 0;
605 
606 	for (i = 0; i < rec->evlist->core.nr_mmaps; i++) {
607 		struct mmap *map = &rec->evlist->mmap[i];
608 
609 		if (!map->auxtrace_mmap.base)
610 			continue;
611 
612 		if (record__auxtrace_mmap_read_snapshot(rec, map) != 0) {
613 			rc = -1;
614 			goto out;
615 		}
616 	}
617 out:
618 	return rc;
619 }
620 
621 static void record__read_auxtrace_snapshot(struct record *rec, bool on_exit)
622 {
623 	pr_debug("Recording AUX area tracing snapshot\n");
624 	if (record__auxtrace_read_snapshot_all(rec) < 0) {
625 		trigger_error(&auxtrace_snapshot_trigger);
626 	} else {
627 		if (auxtrace_record__snapshot_finish(rec->itr, on_exit))
628 			trigger_error(&auxtrace_snapshot_trigger);
629 		else
630 			trigger_ready(&auxtrace_snapshot_trigger);
631 	}
632 }
633 
634 static int record__auxtrace_snapshot_exit(struct record *rec)
635 {
636 	if (trigger_is_error(&auxtrace_snapshot_trigger))
637 		return 0;
638 
639 	if (!auxtrace_record__snapshot_started &&
640 	    auxtrace_record__snapshot_start(rec->itr))
641 		return -1;
642 
643 	record__read_auxtrace_snapshot(rec, true);
644 	if (trigger_is_error(&auxtrace_snapshot_trigger))
645 		return -1;
646 
647 	return 0;
648 }
649 
650 static int record__auxtrace_init(struct record *rec)
651 {
652 	int err;
653 
654 	if (!rec->itr) {
655 		rec->itr = auxtrace_record__init(rec->evlist, &err);
656 		if (err)
657 			return err;
658 	}
659 
660 	err = auxtrace_parse_snapshot_options(rec->itr, &rec->opts,
661 					      rec->opts.auxtrace_snapshot_opts);
662 	if (err)
663 		return err;
664 
665 	return auxtrace_parse_filters(rec->evlist);
666 }
667 
668 #else
669 
670 static inline
671 int record__auxtrace_mmap_read(struct record *rec __maybe_unused,
672 			       struct mmap *map __maybe_unused)
673 {
674 	return 0;
675 }
676 
677 static inline
678 void record__read_auxtrace_snapshot(struct record *rec __maybe_unused,
679 				    bool on_exit __maybe_unused)
680 {
681 }
682 
683 static inline
684 int auxtrace_record__snapshot_start(struct auxtrace_record *itr __maybe_unused)
685 {
686 	return 0;
687 }
688 
689 static inline
690 int record__auxtrace_snapshot_exit(struct record *rec __maybe_unused)
691 {
692 	return 0;
693 }
694 
695 static int record__auxtrace_init(struct record *rec __maybe_unused)
696 {
697 	return 0;
698 }
699 
700 #endif
701 
702 static int record__mmap_evlist(struct record *rec,
703 			       struct evlist *evlist)
704 {
705 	struct record_opts *opts = &rec->opts;
706 	char msg[512];
707 
708 	if (opts->affinity != PERF_AFFINITY_SYS)
709 		cpu__setup_cpunode_map();
710 
711 	if (evlist__mmap_ex(evlist, opts->mmap_pages,
712 				 opts->auxtrace_mmap_pages,
713 				 opts->auxtrace_snapshot_mode,
714 				 opts->nr_cblocks, opts->affinity,
715 				 opts->mmap_flush, opts->comp_level) < 0) {
716 		if (errno == EPERM) {
717 			pr_err("Permission error mapping pages.\n"
718 			       "Consider increasing "
719 			       "/proc/sys/kernel/perf_event_mlock_kb,\n"
720 			       "or try again with a smaller value of -m/--mmap_pages.\n"
721 			       "(current value: %u,%u)\n",
722 			       opts->mmap_pages, opts->auxtrace_mmap_pages);
723 			return -errno;
724 		} else {
725 			pr_err("failed to mmap with %d (%s)\n", errno,
726 				str_error_r(errno, msg, sizeof(msg)));
727 			if (errno)
728 				return -errno;
729 			else
730 				return -EINVAL;
731 		}
732 	}
733 	return 0;
734 }
735 
736 static int record__mmap(struct record *rec)
737 {
738 	return record__mmap_evlist(rec, rec->evlist);
739 }
740 
741 static int record__open(struct record *rec)
742 {
743 	char msg[BUFSIZ];
744 	struct evsel *pos;
745 	struct evlist *evlist = rec->evlist;
746 	struct perf_session *session = rec->session;
747 	struct record_opts *opts = &rec->opts;
748 	int rc = 0;
749 
750 	/*
751 	 * For initial_delay we need to add a dummy event so that we can track
752 	 * PERF_RECORD_MMAP while we wait for the initial delay to enable the
753 	 * real events, the ones requested by the user.
754 	 */
755 	if (opts->initial_delay) {
756 		if (perf_evlist__add_dummy(evlist))
757 			return -ENOMEM;
758 
759 		pos = evlist__first(evlist);
760 		pos->tracking = 0;
761 		pos = evlist__last(evlist);
762 		pos->tracking = 1;
763 		pos->core.attr.enable_on_exec = 1;
764 	}
765 
766 	perf_evlist__config(evlist, opts, &callchain_param);
767 
768 	evlist__for_each_entry(evlist, pos) {
769 try_again:
770 		if (evsel__open(pos, pos->core.cpus, pos->core.threads) < 0) {
771 			if (perf_evsel__fallback(pos, errno, msg, sizeof(msg))) {
772 				if (verbose > 0)
773 					ui__warning("%s\n", msg);
774 				goto try_again;
775 			}
776 			if ((errno == EINVAL || errno == EBADF) &&
777 			    pos->leader != pos &&
778 			    pos->weak_group) {
779 			        pos = perf_evlist__reset_weak_group(evlist, pos);
780 				goto try_again;
781 			}
782 			rc = -errno;
783 			perf_evsel__open_strerror(pos, &opts->target,
784 						  errno, msg, sizeof(msg));
785 			ui__error("%s\n", msg);
786 			goto out;
787 		}
788 
789 		pos->supported = true;
790 	}
791 
792 	if (symbol_conf.kptr_restrict && !perf_evlist__exclude_kernel(evlist)) {
793 		pr_warning(
794 "WARNING: Kernel address maps (/proc/{kallsyms,modules}) are restricted,\n"
795 "check /proc/sys/kernel/kptr_restrict and /proc/sys/kernel/perf_event_paranoid.\n\n"
796 "Samples in kernel functions may not be resolved if a suitable vmlinux\n"
797 "file is not found in the buildid cache or in the vmlinux path.\n\n"
798 "Samples in kernel modules won't be resolved at all.\n\n"
799 "If some relocation was applied (e.g. kexec) symbols may be misresolved\n"
800 "even with a suitable vmlinux or kallsyms file.\n\n");
801 	}
802 
803 	if (perf_evlist__apply_filters(evlist, &pos)) {
804 		pr_err("failed to set filter \"%s\" on event %s with %d (%s)\n",
805 			pos->filter, perf_evsel__name(pos), errno,
806 			str_error_r(errno, msg, sizeof(msg)));
807 		rc = -1;
808 		goto out;
809 	}
810 
811 	rc = record__mmap(rec);
812 	if (rc)
813 		goto out;
814 
815 	session->evlist = evlist;
816 	perf_session__set_id_hdr_size(session);
817 out:
818 	return rc;
819 }
820 
821 static int process_sample_event(struct perf_tool *tool,
822 				union perf_event *event,
823 				struct perf_sample *sample,
824 				struct evsel *evsel,
825 				struct machine *machine)
826 {
827 	struct record *rec = container_of(tool, struct record, tool);
828 
829 	if (rec->evlist->first_sample_time == 0)
830 		rec->evlist->first_sample_time = sample->time;
831 
832 	rec->evlist->last_sample_time = sample->time;
833 
834 	if (rec->buildid_all)
835 		return 0;
836 
837 	rec->samples++;
838 	return build_id__mark_dso_hit(tool, event, sample, evsel, machine);
839 }
840 
841 static int process_buildids(struct record *rec)
842 {
843 	struct perf_session *session = rec->session;
844 
845 	if (perf_data__size(&rec->data) == 0)
846 		return 0;
847 
848 	/*
849 	 * During this process, it'll load the kernel map and replace
850 	 * dso->long_name with a real pathname it found.  In this case
851 	 * we prefer the vmlinux path like
852 	 *   /lib/modules/3.16.4/build/vmlinux
853 	 *
854 	 * rather than the build-id path (in the debug directory).
855 	 *   $HOME/.debug/.build-id/f0/6e17aa50adf4d00b88925e03775de107611551
856 	 */
857 	symbol_conf.ignore_vmlinux_buildid = true;
858 
859 	/*
860 	 * If --buildid-all is given, it marks all DSOs regardless of hits,
861 	 * so there is no need to process samples. But if timestamp_boundary is
862 	 * enabled, it still needs to walk all samples to get the timestamps of
863 	 * the first/last samples.
864 	 */
865 	if (rec->buildid_all && !rec->timestamp_boundary)
866 		rec->tool.sample = NULL;
867 
868 	return perf_session__process_events(session);
869 }
870 
871 static void perf_event__synthesize_guest_os(struct machine *machine, void *data)
872 {
873 	int err;
874 	struct perf_tool *tool = data;
875 	/*
876 	 * As for the guest kernel, when processing the record&report subcommands
877 	 * we arrange the module mmap prior to the guest kernel mmap and trigger
878 	 * a preload dso because default guest module symbols are loaded
879 	 * from guest kallsyms instead of /lib/modules/XXX/XXX. This
880 	 * method is used to avoid missing symbols when the first addr is
881 	 * in a module instead of in the guest kernel.
882 	 */
883 	err = perf_event__synthesize_modules(tool, process_synthesized_event,
884 					     machine);
885 	if (err < 0)
886 		pr_err("Couldn't record guest kernel [%d]'s reference"
887 		       " relocation symbol.\n", machine->pid);
888 
889 	/*
890 	 * We use _stext for the guest kernel because the guest kernel's
891 	 * /proc/kallsyms sometimes has no _text.
892 	 */
893 	err = perf_event__synthesize_kernel_mmap(tool, process_synthesized_event,
894 						 machine);
895 	if (err < 0)
896 		pr_err("Couldn't record guest kernel [%d]'s reference"
897 		       " relocation symbol.\n", machine->pid);
898 }
899 
900 static struct perf_event_header finished_round_event = {
901 	.size = sizeof(struct perf_event_header),
902 	.type = PERF_RECORD_FINISHED_ROUND,
903 };
904 
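/*
 * When a non-default affinity mode is selected, move the recording thread
 * onto the CPU mask associated with the map it is about to read.
 */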
905 static void record__adjust_affinity(struct record *rec, struct mmap *map)
906 {
907 	if (rec->opts.affinity != PERF_AFFINITY_SYS &&
908 	    !CPU_EQUAL(&rec->affinity_mask, &map->affinity_mask)) {
909 		CPU_ZERO(&rec->affinity_mask);
910 		CPU_OR(&rec->affinity_mask, &rec->affinity_mask, &map->affinity_mask);
911 		sched_setaffinity(0, sizeof(rec->affinity_mask), &rec->affinity_mask);
912 	}
913 }
914 
915 static size_t process_comp_header(void *record, size_t increment)
916 {
917 	struct perf_record_compressed *event = record;
918 	size_t size = sizeof(*event);
919 
920 	if (increment) {
921 		event->header.size += increment;
922 		return increment;
923 	}
924 
925 	event->header.type = PERF_RECORD_COMPRESSED;
926 	event->header.size = size;
927 
928 	return size;
929 }
930 
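/*
 * Compress a chunk of trace data into one or more PERF_RECORD_COMPRESSED
 * records and account the raw vs. compressed byte counts on the session.
 */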
931 static size_t zstd_compress(struct perf_session *session, void *dst, size_t dst_size,
932 			    void *src, size_t src_size)
933 {
934 	size_t compressed;
935 	size_t max_record_size = PERF_SAMPLE_MAX_SIZE - sizeof(struct perf_record_compressed) - 1;
936 
937 	compressed = zstd_compress_stream_to_records(&session->zstd_data, dst, dst_size, src, src_size,
938 						     max_record_size, process_comp_header);
939 
940 	session->bytes_transferred += src_size;
941 	session->bytes_compressed  += compressed;
942 
943 	return compressed;
944 }
945 
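/*
 * Drain every mmap of the evlist, either directly or through aio, and emit
 * a PERF_RECORD_FINISHED_ROUND event if at least one event was written.
 */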
946 static int record__mmap_read_evlist(struct record *rec, struct evlist *evlist,
947 				    bool overwrite, bool synch)
948 {
949 	u64 bytes_written = rec->bytes_written;
950 	int i;
951 	int rc = 0;
952 	struct mmap *maps;
953 	int trace_fd = rec->data.file.fd;
954 	off_t off = 0;
955 
956 	if (!evlist)
957 		return 0;
958 
959 	maps = overwrite ? evlist->overwrite_mmap : evlist->mmap;
960 	if (!maps)
961 		return 0;
962 
963 	if (overwrite && evlist->bkw_mmap_state != BKW_MMAP_DATA_PENDING)
964 		return 0;
965 
966 	if (record__aio_enabled(rec))
967 		off = record__aio_get_pos(trace_fd);
968 
969 	for (i = 0; i < evlist->core.nr_mmaps; i++) {
970 		u64 flush = 0;
971 		struct mmap *map = &maps[i];
972 
973 		if (map->core.base) {
974 			record__adjust_affinity(rec, map);
975 			if (synch) {
976 				flush = map->core.flush;
977 				map->core.flush = 1;
978 			}
979 			if (!record__aio_enabled(rec)) {
980 				if (perf_mmap__push(map, rec, record__pushfn) < 0) {
981 					if (synch)
982 						map->core.flush = flush;
983 					rc = -1;
984 					goto out;
985 				}
986 			} else {
987 				if (record__aio_push(rec, map, &off) < 0) {
988 					record__aio_set_pos(trace_fd, off);
989 					if (synch)
990 						map->core.flush = flush;
991 					rc = -1;
992 					goto out;
993 				}
994 			}
995 			if (synch)
996 				map->core.flush = flush;
997 		}
998 
999 		if (map->auxtrace_mmap.base && !rec->opts.auxtrace_snapshot_mode &&
1000 		    record__auxtrace_mmap_read(rec, map) != 0) {
1001 			rc = -1;
1002 			goto out;
1003 		}
1004 	}
1005 
1006 	if (record__aio_enabled(rec))
1007 		record__aio_set_pos(trace_fd, off);
1008 
1009 	/*
1010 	 * Mark the round finished in case we wrote
1011 	 * at least one event.
1012 	 */
1013 	if (bytes_written != rec->bytes_written)
1014 		rc = record__write(rec, NULL, &finished_round_event, sizeof(finished_round_event));
1015 
1016 	if (overwrite)
1017 		perf_evlist__toggle_bkw_mmap(evlist, BKW_MMAP_EMPTY);
1018 out:
1019 	return rc;
1020 }
1021 
1022 static int record__mmap_read_all(struct record *rec, bool synch)
1023 {
1024 	int err;
1025 
1026 	err = record__mmap_read_evlist(rec, rec->evlist, false, synch);
1027 	if (err)
1028 		return err;
1029 
1030 	return record__mmap_read_evlist(rec, rec->evlist, true, synch);
1031 }
1032 
1033 static void record__init_features(struct record *rec)
1034 {
1035 	struct perf_session *session = rec->session;
1036 	int feat;
1037 
1038 	for (feat = HEADER_FIRST_FEATURE; feat < HEADER_LAST_FEATURE; feat++)
1039 		perf_header__set_feat(&session->header, feat);
1040 
1041 	if (rec->no_buildid)
1042 		perf_header__clear_feat(&session->header, HEADER_BUILD_ID);
1043 
1044 	if (!have_tracepoints(&rec->evlist->core.entries))
1045 		perf_header__clear_feat(&session->header, HEADER_TRACING_DATA);
1046 
1047 	if (!rec->opts.branch_stack)
1048 		perf_header__clear_feat(&session->header, HEADER_BRANCH_STACK);
1049 
1050 	if (!rec->opts.full_auxtrace)
1051 		perf_header__clear_feat(&session->header, HEADER_AUXTRACE);
1052 
1053 	if (!(rec->opts.use_clockid && rec->opts.clockid_res_ns))
1054 		perf_header__clear_feat(&session->header, HEADER_CLOCKID);
1055 
1056 	perf_header__clear_feat(&session->header, HEADER_DIR_FORMAT);
1057 	if (!record__comp_enabled(rec))
1058 		perf_header__clear_feat(&session->header, HEADER_COMPRESSED);
1059 
1060 	perf_header__clear_feat(&session->header, HEADER_STAT);
1061 }
1062 
1063 static void
1064 record__finish_output(struct record *rec)
1065 {
1066 	struct perf_data *data = &rec->data;
1067 	int fd = perf_data__fd(data);
1068 
1069 	if (data->is_pipe)
1070 		return;
1071 
1072 	rec->session->header.data_size += rec->bytes_written;
1073 	data->file.size = lseek(perf_data__fd(data), 0, SEEK_CUR);
1074 
1075 	if (!rec->no_buildid) {
1076 		process_buildids(rec);
1077 
1078 		if (rec->buildid_all)
1079 			dsos__hit_all(rec->session);
1080 	}
1081 	perf_session__write_header(rec->session, rec->evlist, fd, true);
1082 
1083 	return;
1084 }
1085 
1086 static int record__synthesize_workload(struct record *rec, bool tail)
1087 {
1088 	int err;
1089 	struct perf_thread_map *thread_map;
1090 
1091 	if (rec->opts.tail_synthesize != tail)
1092 		return 0;
1093 
1094 	thread_map = thread_map__new_by_tid(rec->evlist->workload.pid);
1095 	if (thread_map == NULL)
1096 		return -1;
1097 
1098 	err = perf_event__synthesize_thread_map(&rec->tool, thread_map,
1099 						 process_synthesized_event,
1100 						 &rec->session->machines.host,
1101 						 rec->opts.sample_address);
1102 	perf_thread_map__put(thread_map);
1103 	return err;
1104 }
1105 
1106 static int record__synthesize(struct record *rec, bool tail);
1107 
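/*
 * Finish the current output file and switch to a new timestamped one; when
 * switch_output.num_files is set, the oldest rotated file is removed.
 */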
1108 static int
1109 record__switch_output(struct record *rec, bool at_exit)
1110 {
1111 	struct perf_data *data = &rec->data;
1112 	int fd, err;
1113 	char *new_filename;
1114 
1115 	/* Same size as "2015122520103046" */
1116 	char timestamp[] = "InvalidTimestamp";
1117 
1118 	record__aio_mmap_read_sync(rec);
1119 
1120 	record__synthesize(rec, true);
1121 	if (target__none(&rec->opts.target))
1122 		record__synthesize_workload(rec, true);
1123 
1124 	rec->samples = 0;
1125 	record__finish_output(rec);
1126 	err = fetch_current_timestamp(timestamp, sizeof(timestamp));
1127 	if (err) {
1128 		pr_err("Failed to get current timestamp\n");
1129 		return -EINVAL;
1130 	}
1131 
1132 	fd = perf_data__switch(data, timestamp,
1133 				    rec->session->header.data_offset,
1134 				    at_exit, &new_filename);
1135 	if (fd >= 0 && !at_exit) {
1136 		rec->bytes_written = 0;
1137 		rec->session->header.data_size = 0;
1138 	}
1139 
1140 	if (!quiet)
1141 		fprintf(stderr, "[ perf record: Dump %s.%s ]\n",
1142 			data->path, timestamp);
1143 
1144 	if (rec->switch_output.num_files) {
1145 		int n = rec->switch_output.cur_file + 1;
1146 
1147 		if (n >= rec->switch_output.num_files)
1148 			n = 0;
1149 		rec->switch_output.cur_file = n;
1150 		if (rec->switch_output.filenames[n]) {
1151 			remove(rec->switch_output.filenames[n]);
1152 			zfree(&rec->switch_output.filenames[n]);
1153 		}
1154 		rec->switch_output.filenames[n] = new_filename;
1155 	} else {
1156 		free(new_filename);
1157 	}
1158 
1159 	/* Output tracking events */
1160 	if (!at_exit) {
1161 		record__synthesize(rec, false);
1162 
1163 		/*
1164 		 * In 'perf record --switch-output' without -a,
1165 		 * record__synthesize() in record__switch_output() won't
1166 		 * generate tracking events because there's no thread_map
1167 		 * in evlist, which causes the newly created perf.data to
1168 		 * lack map and comm information.
1169 		 * Create a fake thread_map and directly call
1170 		 * perf_event__synthesize_thread_map() for those events.
1171 		 */
1172 		if (target__none(&rec->opts.target))
1173 			record__synthesize_workload(rec, false);
1174 	}
1175 	return fd;
1176 }
1177 
1178 static volatile int workload_exec_errno;
1179 
1180 /*
1181  * perf_evlist__prepare_workload will send a SIGUSR1
1182  * if the fork fails, since we asked for it by setting its
1183  * want_signal to true.
1184  */
1185 static void workload_exec_failed_signal(int signo __maybe_unused,
1186 					siginfo_t *info,
1187 					void *ucontext __maybe_unused)
1188 {
1189 	workload_exec_errno = info->si_value.sival_int;
1190 	done = 1;
1191 	child_finished = 1;
1192 }
1193 
1194 static void snapshot_sig_handler(int sig);
1195 static void alarm_sig_handler(int sig);
1196 
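/*
 * Pick the control page of any mapped ring buffer; record__synthesize()
 * passes it to perf_event__synth_time_conv() via record__pick_pc().
 */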
1197 static const struct perf_event_mmap_page *
1198 perf_evlist__pick_pc(struct evlist *evlist)
1199 {
1200 	if (evlist) {
1201 		if (evlist->mmap && evlist->mmap[0].core.base)
1202 			return evlist->mmap[0].core.base;
1203 		if (evlist->overwrite_mmap && evlist->overwrite_mmap[0].core.base)
1204 			return evlist->overwrite_mmap[0].core.base;
1205 	}
1206 	return NULL;
1207 }
1208 
1209 static const struct perf_event_mmap_page *record__pick_pc(struct record *rec)
1210 {
1211 	const struct perf_event_mmap_page *pc;
1212 
1213 	pc = perf_evlist__pick_pc(rec->evlist);
1214 	if (pc)
1215 		return pc;
1216 	return NULL;
1217 }
1218 
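/*
 * Emit the synthetic (non-sample) events that describe the session: attrs
 * and features for pipe mode, time conversion info, auxtrace info, kernel
 * and module mmaps, thread/cpu maps, bpf events and existing threads.
 */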
1219 static int record__synthesize(struct record *rec, bool tail)
1220 {
1221 	struct perf_session *session = rec->session;
1222 	struct machine *machine = &session->machines.host;
1223 	struct perf_data *data = &rec->data;
1224 	struct record_opts *opts = &rec->opts;
1225 	struct perf_tool *tool = &rec->tool;
1226 	int fd = perf_data__fd(data);
1227 	int err = 0;
1228 
1229 	if (rec->opts.tail_synthesize != tail)
1230 		return 0;
1231 
1232 	if (data->is_pipe) {
1233 		/*
1234 		 * We need to synthesize events first, because some
1235 		 * features work on top of them (on the report side).
1236 		 */
1237 		err = perf_event__synthesize_attrs(tool, rec->evlist,
1238 						   process_synthesized_event);
1239 		if (err < 0) {
1240 			pr_err("Couldn't synthesize attrs.\n");
1241 			goto out;
1242 		}
1243 
1244 		err = perf_event__synthesize_features(tool, session, rec->evlist,
1245 						      process_synthesized_event);
1246 		if (err < 0) {
1247 			pr_err("Couldn't synthesize features.\n");
1248 			return err;
1249 		}
1250 
1251 		if (have_tracepoints(&rec->evlist->core.entries)) {
1252 			/*
1253 			 * FIXME err <= 0 here actually means that
1254 			 * there were no tracepoints, so it's not really
1255 			 * an error, just that we don't need to
1256 			 * synthesize anything.  We really have to
1257 			 * return this more properly and also
1258 			 * propagate errors that now call die()
1259 			 */
1260 			err = perf_event__synthesize_tracing_data(tool,	fd, rec->evlist,
1261 								  process_synthesized_event);
1262 			if (err <= 0) {
1263 				pr_err("Couldn't record tracing data.\n");
1264 				goto out;
1265 			}
1266 			rec->bytes_written += err;
1267 		}
1268 	}
1269 
1270 	err = perf_event__synth_time_conv(record__pick_pc(rec), tool,
1271 					  process_synthesized_event, machine);
1272 	if (err)
1273 		goto out;
1274 
1275 	if (rec->opts.full_auxtrace) {
1276 		err = perf_event__synthesize_auxtrace_info(rec->itr, tool,
1277 					session, process_synthesized_event);
1278 		if (err)
1279 			goto out;
1280 	}
1281 
1282 	if (!perf_evlist__exclude_kernel(rec->evlist)) {
1283 		err = perf_event__synthesize_kernel_mmap(tool, process_synthesized_event,
1284 							 machine);
1285 		WARN_ONCE(err < 0, "Couldn't record kernel reference relocation symbol\n"
1286 				   "Symbol resolution may be skewed if relocation was used (e.g. kexec).\n"
1287 				   "Check /proc/kallsyms permission or run as root.\n");
1288 
1289 		err = perf_event__synthesize_modules(tool, process_synthesized_event,
1290 						     machine);
1291 		WARN_ONCE(err < 0, "Couldn't record kernel module information.\n"
1292 				   "Symbol resolution may be skewed if relocation was used (e.g. kexec).\n"
1293 				   "Check /proc/modules permission or run as root.\n");
1294 	}
1295 
1296 	if (perf_guest) {
1297 		machines__process_guests(&session->machines,
1298 					 perf_event__synthesize_guest_os, tool);
1299 	}
1300 
1301 	err = perf_event__synthesize_extra_attr(&rec->tool,
1302 						rec->evlist,
1303 						process_synthesized_event,
1304 						data->is_pipe);
1305 	if (err)
1306 		goto out;
1307 
1308 	err = perf_event__synthesize_thread_map2(&rec->tool, rec->evlist->core.threads,
1309 						 process_synthesized_event,
1310 						NULL);
1311 	if (err < 0) {
1312 		pr_err("Couldn't synthesize thread map.\n");
1313 		return err;
1314 	}
1315 
1316 	err = perf_event__synthesize_cpu_map(&rec->tool, rec->evlist->core.cpus,
1317 					     process_synthesized_event, NULL);
1318 	if (err < 0) {
1319 		pr_err("Couldn't synthesize cpu map.\n");
1320 		return err;
1321 	}
1322 
1323 	err = perf_event__synthesize_bpf_events(session, process_synthesized_event,
1324 						machine, opts);
1325 	if (err < 0)
1326 		pr_warning("Couldn't synthesize bpf events.\n");
1327 
1328 	err = __machine__synthesize_threads(machine, tool, &opts->target, rec->evlist->core.threads,
1329 					    process_synthesized_event, opts->sample_address,
1330 					    1);
1331 out:
1332 	return err;
1333 }
1334 
1335 static int __cmd_record(struct record *rec, int argc, const char **argv)
1336 {
1337 	int err;
1338 	int status = 0;
1339 	unsigned long waking = 0;
1340 	const bool forks = argc > 0;
1341 	struct perf_tool *tool = &rec->tool;
1342 	struct record_opts *opts = &rec->opts;
1343 	struct perf_data *data = &rec->data;
1344 	struct perf_session *session;
1345 	bool disabled = false, draining = false;
1346 	struct evlist *sb_evlist = NULL;
1347 	int fd;
1348 	float ratio = 0;
1349 
1350 	atexit(record__sig_exit);
1351 	signal(SIGCHLD, sig_handler);
1352 	signal(SIGINT, sig_handler);
1353 	signal(SIGTERM, sig_handler);
1354 	signal(SIGSEGV, sigsegv_handler);
1355 
1356 	if (rec->opts.record_namespaces)
1357 		tool->namespace_events = true;
1358 
1359 	if (rec->opts.auxtrace_snapshot_mode || rec->switch_output.enabled) {
1360 		signal(SIGUSR2, snapshot_sig_handler);
1361 		if (rec->opts.auxtrace_snapshot_mode)
1362 			trigger_on(&auxtrace_snapshot_trigger);
1363 		if (rec->switch_output.enabled)
1364 			trigger_on(&switch_output_trigger);
1365 	} else {
1366 		signal(SIGUSR2, SIG_IGN);
1367 	}
1368 
1369 	session = perf_session__new(data, false, tool);
1370 	if (IS_ERR(session)) {
1371 		pr_err("Perf session creation failed.\n");
1372 		return PTR_ERR(session);
1373 	}
1374 
1375 	fd = perf_data__fd(data);
1376 	rec->session = session;
1377 
1378 	if (zstd_init(&session->zstd_data, rec->opts.comp_level) < 0) {
1379 		pr_err("Compression initialization failed.\n");
1380 		return -1;
1381 	}
1382 
1383 	session->header.env.comp_type  = PERF_COMP_ZSTD;
1384 	session->header.env.comp_level = rec->opts.comp_level;
1385 
1386 	record__init_features(rec);
1387 
1388 	if (rec->opts.use_clockid && rec->opts.clockid_res_ns)
1389 		session->header.env.clockid_res_ns = rec->opts.clockid_res_ns;
1390 
1391 	if (forks) {
1392 		err = perf_evlist__prepare_workload(rec->evlist, &opts->target,
1393 						    argv, data->is_pipe,
1394 						    workload_exec_failed_signal);
1395 		if (err < 0) {
1396 			pr_err("Couldn't run the workload!\n");
1397 			status = err;
1398 			goto out_delete_session;
1399 		}
1400 	}
1401 
1402 	/*
1403 	 * If we have just a single event and are sending data
1404 	 * through a pipe, we need to force sample id allocation,
1405 	 * because we synthesize the event name through the pipe
1406 	 * and need the id for that.
1407 	 */
1408 	if (data->is_pipe && rec->evlist->core.nr_entries == 1)
1409 		rec->opts.sample_id = true;
1410 
1411 	if (record__open(rec) != 0) {
1412 		err = -1;
1413 		goto out_child;
1414 	}
1415 	session->header.env.comp_mmap_len = session->evlist->core.mmap_len;
1416 
1417 	err = bpf__apply_obj_config();
1418 	if (err) {
1419 		char errbuf[BUFSIZ];
1420 
1421 		bpf__strerror_apply_obj_config(err, errbuf, sizeof(errbuf));
1422 		pr_err("ERROR: Apply config to BPF failed: %s\n",
1423 			 errbuf);
1424 		goto out_child;
1425 	}
1426 
1427 	/*
1428 	 * Normally perf_session__new would do this, but it doesn't have the
1429 	 * evlist.
1430 	 */
1431 	if (rec->tool.ordered_events && !perf_evlist__sample_id_all(rec->evlist)) {
1432 		pr_warning("WARNING: No sample_id_all support, falling back to unordered processing\n");
1433 		rec->tool.ordered_events = false;
1434 	}
1435 
1436 	if (!rec->evlist->nr_groups)
1437 		perf_header__clear_feat(&session->header, HEADER_GROUP_DESC);
1438 
1439 	if (data->is_pipe) {
1440 		err = perf_header__write_pipe(fd);
1441 		if (err < 0)
1442 			goto out_child;
1443 	} else {
1444 		err = perf_session__write_header(session, rec->evlist, fd, false);
1445 		if (err < 0)
1446 			goto out_child;
1447 	}
1448 
1449 	if (!rec->no_buildid
1450 	    && !perf_header__has_feat(&session->header, HEADER_BUILD_ID)) {
1451 		pr_err("Couldn't generate buildids. "
1452 		       "Use --no-buildid to profile anyway.\n");
1453 		err = -1;
1454 		goto out_child;
1455 	}
1456 
1457 	if (!opts->no_bpf_event)
1458 		bpf_event__add_sb_event(&sb_evlist, &session->header.env);
1459 
1460 	if (perf_evlist__start_sb_thread(sb_evlist, &rec->opts.target)) {
1461 		pr_debug("Couldn't start the BPF side band thread:\nBPF programs starting from now on won't be annotatable\n");
1462 		opts->no_bpf_event = true;
1463 	}
1464 
1465 	err = record__synthesize(rec, false);
1466 	if (err < 0)
1467 		goto out_child;
1468 
1469 	if (rec->realtime_prio) {
1470 		struct sched_param param;
1471 
1472 		param.sched_priority = rec->realtime_prio;
1473 		if (sched_setscheduler(0, SCHED_FIFO, &param)) {
1474 			pr_err("Could not set realtime priority.\n");
1475 			err = -1;
1476 			goto out_child;
1477 		}
1478 	}
1479 
1480 	/*
1481 	 * When perf is starting the traced process, all the events
1482 	 * (apart from group members) have enable_on_exec=1 set,
1483 	 * so don't spoil it by prematurely enabling them.
1484 	 */
1485 	if (!target__none(&opts->target) && !opts->initial_delay)
1486 		evlist__enable(rec->evlist);
1487 
1488 	/*
1489 	 * Let the child rip
1490 	 */
1491 	if (forks) {
1492 		struct machine *machine = &session->machines.host;
1493 		union perf_event *event;
1494 		pid_t tgid;
1495 
1496 		event = malloc(sizeof(event->comm) + machine->id_hdr_size);
1497 		if (event == NULL) {
1498 			err = -ENOMEM;
1499 			goto out_child;
1500 		}
1501 
1502 		/*
1503 		 * Some H/W events are generated before the COMM event,
1504 		 * which is emitted during exec(), so perf script
1505 		 * cannot see a correct process name for those events.
1506 		 * Synthesize a COMM event to prevent it.
1507 		 */
1508 		tgid = perf_event__synthesize_comm(tool, event,
1509 						   rec->evlist->workload.pid,
1510 						   process_synthesized_event,
1511 						   machine);
1512 		free(event);
1513 
1514 		if (tgid == -1)
1515 			goto out_child;
1516 
1517 		event = malloc(sizeof(event->namespaces) +
1518 			       (NR_NAMESPACES * sizeof(struct perf_ns_link_info)) +
1519 			       machine->id_hdr_size);
1520 		if (event == NULL) {
1521 			err = -ENOMEM;
1522 			goto out_child;
1523 		}
1524 
1525 		/*
1526 		 * Synthesize NAMESPACES event for the command specified.
1527 		 */
1528 		perf_event__synthesize_namespaces(tool, event,
1529 						  rec->evlist->workload.pid,
1530 						  tgid, process_synthesized_event,
1531 						  machine);
1532 		free(event);
1533 
1534 		perf_evlist__start_workload(rec->evlist);
1535 	}
1536 
1537 	if (opts->initial_delay) {
1538 		usleep(opts->initial_delay * USEC_PER_MSEC);
1539 		evlist__enable(rec->evlist);
1540 	}
1541 
1542 	trigger_ready(&auxtrace_snapshot_trigger);
1543 	trigger_ready(&switch_output_trigger);
1544 	perf_hooks__invoke_record_start();
1545 	for (;;) {
1546 		unsigned long long hits = rec->samples;
1547 
1548 		/*
1549 		 * rec->evlist->bkw_mmap_state can be
1550 		 * BKW_MMAP_EMPTY here: when done == true and
1551 		 * hits != rec->samples in the previous round.
1552 		 *
1553 		 * perf_evlist__toggle_bkw_mmap ensures we never
1554 		 * convert BKW_MMAP_EMPTY to BKW_MMAP_DATA_PENDING.
1555 		 */
1556 		if (trigger_is_hit(&switch_output_trigger) || done || draining)
1557 			perf_evlist__toggle_bkw_mmap(rec->evlist, BKW_MMAP_DATA_PENDING);
1558 
1559 		if (record__mmap_read_all(rec, false) < 0) {
1560 			trigger_error(&auxtrace_snapshot_trigger);
1561 			trigger_error(&switch_output_trigger);
1562 			err = -1;
1563 			goto out_child;
1564 		}
1565 
1566 		if (auxtrace_record__snapshot_started) {
1567 			auxtrace_record__snapshot_started = 0;
1568 			if (!trigger_is_error(&auxtrace_snapshot_trigger))
1569 				record__read_auxtrace_snapshot(rec, false);
1570 			if (trigger_is_error(&auxtrace_snapshot_trigger)) {
1571 				pr_err("AUX area tracing snapshot failed\n");
1572 				err = -1;
1573 				goto out_child;
1574 			}
1575 		}
1576 
1577 		if (trigger_is_hit(&switch_output_trigger)) {
1578 			/*
1579 			 * If switch_output_trigger is hit, the data in the
1580 			 * overwritable ring buffer should have been collected,
1581 			 * so bkw_mmap_state should be set to BKW_MMAP_EMPTY.
1582 			 *
1583 			 * If SIGUSR2 was raised after or during record__mmap_read_all(),
1584 			 * record__mmap_read_all() didn't collect data from the
1585 			 * overwritable ring buffer. Read again.
1586 			 */
1587 			if (rec->evlist->bkw_mmap_state == BKW_MMAP_RUNNING)
1588 				continue;
1589 			trigger_ready(&switch_output_trigger);
1590 
1591 			/*
1592 			 * Re-enable events in the overwrite ring buffer after
1593 			 * record__mmap_read_all(): we should have collected
1594 			 * data from it.
1595 			 */
1596 			perf_evlist__toggle_bkw_mmap(rec->evlist, BKW_MMAP_RUNNING);
1597 
1598 			if (!quiet)
1599 				fprintf(stderr, "[ perf record: dump data: Woken up %ld times ]\n",
1600 					waking);
1601 			waking = 0;
1602 			fd = record__switch_output(rec, false);
1603 			if (fd < 0) {
1604 				pr_err("Failed to switch to new file\n");
1605 				trigger_error(&switch_output_trigger);
1606 				err = fd;
1607 				goto out_child;
1608 			}
1609 
1610 			/* re-arm the alarm */
1611 			if (rec->switch_output.time)
1612 				alarm(rec->switch_output.time);
1613 		}
1614 
1615 		if (hits == rec->samples) {
1616 			if (done || draining)
1617 				break;
1618 			err = evlist__poll(rec->evlist, -1);
1619 			/*
1620 			 * Propagate the error only if there is one. Ignore a positive
1621 			 * number of returned events and interrupt errors.
1622 			 */
1623 			if (err > 0 || (err < 0 && errno == EINTR))
1624 				err = 0;
1625 			waking++;
1626 
1627 			if (evlist__filter_pollfd(rec->evlist, POLLERR | POLLHUP) == 0)
1628 				draining = true;
1629 		}
1630 
1631 		/*
1632 		 * When perf is starting the traced process, the events die with
1633 		 * the process at the end and we wait for that. Thus no need to
1634 		 * disable events in this case.
1635 		 */
1636 		if (done && !disabled && !target__none(&opts->target)) {
1637 			trigger_off(&auxtrace_snapshot_trigger);
1638 			evlist__disable(rec->evlist);
1639 			disabled = true;
1640 		}
1641 	}
1642 
1643 	trigger_off(&auxtrace_snapshot_trigger);
1644 	trigger_off(&switch_output_trigger);
1645 
1646 	if (opts->auxtrace_snapshot_on_exit)
1647 		record__auxtrace_snapshot_exit(rec);
1648 
1649 	if (forks && workload_exec_errno) {
1650 		char msg[STRERR_BUFSIZE];
1651 		const char *emsg = str_error_r(workload_exec_errno, msg, sizeof(msg));
1652 		pr_err("Workload failed: %s\n", emsg);
1653 		err = -1;
1654 		goto out_child;
1655 	}
1656 
1657 	if (!quiet)
1658 		fprintf(stderr, "[ perf record: Woken up %ld times to write data ]\n", waking);
1659 
1660 	if (target__none(&rec->opts.target))
1661 		record__synthesize_workload(rec, true);
1662 
1663 out_child:
1664 	record__mmap_read_all(rec, true);
1665 	record__aio_mmap_read_sync(rec);
1666 
1667 	if (rec->session->bytes_transferred && rec->session->bytes_compressed) {
1668 		ratio = (float)rec->session->bytes_transferred/(float)rec->session->bytes_compressed;
1669 		session->header.env.comp_ratio = ratio + 0.5;
1670 	}
1671 
1672 	if (forks) {
1673 		int exit_status;
1674 
1675 		if (!child_finished)
1676 			kill(rec->evlist->workload.pid, SIGTERM);
1677 
1678 		wait(&exit_status);
1679 
1680 		if (err < 0)
1681 			status = err;
1682 		else if (WIFEXITED(exit_status))
1683 			status = WEXITSTATUS(exit_status);
1684 		else if (WIFSIGNALED(exit_status))
1685 			signr = WTERMSIG(exit_status);
1686 	} else
1687 		status = err;
1688 
1689 	record__synthesize(rec, true);
1690 	/* this will be recalculated during process_buildids() */
1691 	rec->samples = 0;
1692 
1693 	if (!err) {
1694 		if (!rec->timestamp_filename) {
1695 			record__finish_output(rec);
1696 		} else {
1697 			fd = record__switch_output(rec, true);
1698 			if (fd < 0) {
1699 				status = fd;
1700 				goto out_delete_session;
1701 			}
1702 		}
1703 	}
1704 
1705 	perf_hooks__invoke_record_end();
1706 
1707 	if (!err && !quiet) {
1708 		char samples[128];
1709 		const char *postfix = rec->timestamp_filename ?
1710 					".<timestamp>" : "";
1711 
1712 		if (rec->samples && !rec->opts.full_auxtrace)
1713 			scnprintf(samples, sizeof(samples),
1714 				  " (%" PRIu64 " samples)", rec->samples);
1715 		else
1716 			samples[0] = '\0';
1717 
1718 		fprintf(stderr,	"[ perf record: Captured and wrote %.3f MB %s%s%s",
1719 			perf_data__size(data) / 1024.0 / 1024.0,
1720 			data->path, postfix, samples);
1721 		if (ratio) {
1722 			fprintf(stderr,	", compressed (original %.3f MB, ratio is %.3f)",
1723 					rec->session->bytes_transferred / 1024.0 / 1024.0,
1724 					ratio);
1725 		}
1726 		fprintf(stderr, " ]\n");
1727 	}
1728 
1729 out_delete_session:
1730 	zstd_fini(&session->zstd_data);
1731 	perf_session__delete(session);
1732 
1733 	if (!opts->no_bpf_event)
1734 		perf_evlist__stop_sb_thread(sb_evlist);
1735 	return status;
1736 }
1737 
1738 static void callchain_debug(struct callchain_param *callchain)
1739 {
1740 	static const char *str[CALLCHAIN_MAX] = { "NONE", "FP", "DWARF", "LBR" };
1741 
1742 	pr_debug("callchain: type %s\n", str[callchain->record_mode]);
1743 
1744 	if (callchain->record_mode == CALLCHAIN_DWARF)
1745 		pr_debug("callchain: stack dump size %d\n",
1746 			 callchain->dump_size);
1747 }
1748 
1749 int record_opts__parse_callchain(struct record_opts *record,
1750 				 struct callchain_param *callchain,
1751 				 const char *arg, bool unset)
1752 {
1753 	int ret;
1754 	callchain->enabled = !unset;
1755 
1756 	/* --no-call-graph */
1757 	if (unset) {
1758 		callchain->record_mode = CALLCHAIN_NONE;
1759 		pr_debug("callchain: disabled\n");
1760 		return 0;
1761 	}
1762 
1763 	ret = parse_callchain_record_opt(arg, callchain);
1764 	if (!ret) {
1765 		/* Enable data address sampling for DWARF unwind. */
1766 		if (callchain->record_mode == CALLCHAIN_DWARF)
1767 			record->sample_address = true;
1768 		callchain_debug(callchain);
1769 	}
1770 
1771 	return ret;
1772 }
1773 
1774 int record_parse_callchain_opt(const struct option *opt,
1775 			       const char *arg,
1776 			       int unset)
1777 {
1778 	return record_opts__parse_callchain(opt->value, &callchain_param, arg, unset);
1779 }
1780 
1781 int record_callchain_opt(const struct option *opt,
1782 			 const char *arg __maybe_unused,
1783 			 int unset __maybe_unused)
1784 {
1785 	struct callchain_param *callchain = opt->value;
1786 
1787 	callchain->enabled = true;
1788 
1789 	if (callchain->record_mode == CALLCHAIN_NONE)
1790 		callchain->record_mode = CALLCHAIN_FP;
1791 
1792 	callchain_debug(callchain);
1793 	return 0;
1794 }
1795 
1796 static int perf_record_config(const char *var, const char *value, void *cb)
1797 {
1798 	struct record *rec = cb;
1799 
1800 	if (!strcmp(var, "record.build-id")) {
1801 		if (!strcmp(value, "cache"))
1802 			rec->no_buildid_cache = false;
1803 		else if (!strcmp(value, "no-cache"))
1804 			rec->no_buildid_cache = true;
1805 		else if (!strcmp(value, "skip"))
1806 			rec->no_buildid = true;
1807 		else
1808 			return -1;
1809 		return 0;
1810 	}
1811 	if (!strcmp(var, "record.call-graph")) {
1812 		var = "call-graph.record-mode";
1813 		return perf_default_config(var, value, cb);
1814 	}
1815 #ifdef HAVE_AIO_SUPPORT
1816 	if (!strcmp(var, "record.aio")) {
1817 		rec->opts.nr_cblocks = strtol(value, NULL, 0);
1818 		if (!rec->opts.nr_cblocks)
1819 			rec->opts.nr_cblocks = nr_cblocks_default;
1820 	}
1821 #endif
1822 
1823 	return 0;
1824 }
1825 
1826 struct clockid_map {
1827 	const char *name;
1828 	int clockid;
1829 };
1830 
1831 #define CLOCKID_MAP(n, c)	\
1832 	{ .name = n, .clockid = (c), }
1833 
1834 #define CLOCKID_END	{ .name = NULL, }
1835 
1836 
1837 /*
1838  * Add the missing ones; we need to build on many distros...
1839  */
1840 #ifndef CLOCK_MONOTONIC_RAW
1841 #define CLOCK_MONOTONIC_RAW 4
1842 #endif
1843 #ifndef CLOCK_BOOTTIME
1844 #define CLOCK_BOOTTIME 7
1845 #endif
1846 #ifndef CLOCK_TAI
1847 #define CLOCK_TAI 11
1848 #endif
1849 
1850 static const struct clockid_map clockids[] = {
1851 	/* available for all events, NMI safe */
1852 	CLOCKID_MAP("monotonic", CLOCK_MONOTONIC),
1853 	CLOCKID_MAP("monotonic_raw", CLOCK_MONOTONIC_RAW),
1854 
1855 	/* available for some events */
1856 	CLOCKID_MAP("realtime", CLOCK_REALTIME),
1857 	CLOCKID_MAP("boottime", CLOCK_BOOTTIME),
1858 	CLOCKID_MAP("tai", CLOCK_TAI),
1859 
1860 	/* available for the lazy */
1861 	CLOCKID_MAP("mono", CLOCK_MONOTONIC),
1862 	CLOCKID_MAP("raw", CLOCK_MONOTONIC_RAW),
1863 	CLOCKID_MAP("real", CLOCK_REALTIME),
1864 	CLOCKID_MAP("boot", CLOCK_BOOTTIME),
1865 
1866 	CLOCKID_END,
1867 };
1868 
1869 static int get_clockid_res(clockid_t clk_id, u64 *res_ns)
1870 {
1871 	struct timespec res;
1872 
1873 	*res_ns = 0;
1874 	if (!clock_getres(clk_id, &res))
1875 		*res_ns = res.tv_nsec + res.tv_sec * NSEC_PER_SEC;
1876 	else
1877 		pr_warning("WARNING: Failed to determine specified clock resolution.\n");
1878 
1879 	return 0;
1880 }
1881 
1882 static int parse_clockid(const struct option *opt, const char *str, int unset)
1883 {
1884 	struct record_opts *opts = (struct record_opts *)opt->value;
1885 	const struct clockid_map *cm;
1886 	const char *ostr = str;
1887 
1888 	if (unset) {
1889 		opts->use_clockid = 0;
1890 		return 0;
1891 	}
1892 
1893 	/* no arg passed */
1894 	if (!str)
1895 		return 0;
1896 
1897 	/* no setting it twice */
1898 	if (opts->use_clockid)
1899 		return -1;
1900 
1901 	opts->use_clockid = true;
1902 
1903 	/* if its a number, we're done */
1904 	/* if it's a number, we're done */
1905 		return get_clockid_res(opts->clockid, &opts->clockid_res_ns);
1906 
1907 	/* allow a "CLOCK_" prefix to the name */
1908 	if (!strncasecmp(str, "CLOCK_", 6))
1909 		str += 6;
1910 
1911 	for (cm = clockids; cm->name; cm++) {
1912 		if (!strcasecmp(str, cm->name)) {
1913 			opts->clockid = cm->clockid;
1914 			return get_clockid_res(opts->clockid,
1915 					       &opts->clockid_res_ns);
1916 		}
1917 	}
1918 
1919 	opts->use_clockid = false;
1920 	ui__warning("unknown clockid %s, check man page\n", ostr);
1921 	return -1;
1922 }
1923 
1924 static int record__parse_affinity(const struct option *opt, const char *str, int unset)
1925 {
1926 	struct record_opts *opts = (struct record_opts *)opt->value;
1927 
1928 	if (unset || !str)
1929 		return 0;
1930 
1931 	if (!strcasecmp(str, "node"))
1932 		opts->affinity = PERF_AFFINITY_NODE;
1933 	else if (!strcasecmp(str, "cpu"))
1934 		opts->affinity = PERF_AFFINITY_CPU;
1935 
1936 	return 0;
1937 }
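/*
 * Example usage (illustrative): --affinity=node or --affinity=cpu.  Any other
 * string is silently ignored, keeping the PERF_AFFINITY_SYS default that
 * cmd_record() sets below.
 */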
1938 
1939 static int record__parse_mmap_pages(const struct option *opt,
1940 				    const char *str,
1941 				    int unset __maybe_unused)
1942 {
1943 	struct record_opts *opts = opt->value;
1944 	char *s, *p;
1945 	unsigned int mmap_pages;
1946 	int ret;
1947 
1948 	if (!str)
1949 		return -EINVAL;
1950 
1951 	s = strdup(str);
1952 	if (!s)
1953 		return -ENOMEM;
1954 
1955 	p = strchr(s, ',');
1956 	if (p)
1957 		*p = '\0';
1958 
1959 	if (*s) {
1960 		ret = __perf_evlist__parse_mmap_pages(&mmap_pages, s);
1961 		if (ret)
1962 			goto out_free;
1963 		opts->mmap_pages = mmap_pages;
1964 	}
1965 
1966 	if (!p) {
1967 		ret = 0;
1968 		goto out_free;
1969 	}
1970 
1971 	ret = __perf_evlist__parse_mmap_pages(&mmap_pages, p + 1);
1972 	if (ret)
1973 		goto out_free;
1974 
1975 	opts->auxtrace_mmap_pages = mmap_pages;
1976 
1977 out_free:
1978 	free(s);
1979 	return ret;
1980 }
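/*
 * Example (illustrative): "-m 512,64" asks for 512 data mmap pages plus 64
 * AUX area tracing mmap pages, "-m 512" sets only the data pages, and
 * "-m ,64" sets only the AUX area pages.
 */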
1981 
1982 static void switch_output_size_warn(struct record *rec)
1983 {
1984 	u64 wakeup_size = evlist__mmap_size(rec->opts.mmap_pages);
1985 	struct switch_output *s = &rec->switch_output;
1986 
1987 	wakeup_size /= 2;
1988 
1989 	if (s->size < wakeup_size) {
1990 		char buf[100];
1991 
1992 		unit_number__scnprintf(buf, sizeof(buf), wakeup_size);
1993 		pr_warning("WARNING: switch-output data size lower than "
1994 			   "wakeup kernel buffer size (%s) "
1995 			   "expect bigger perf.data sizes\n", buf);
1996 	}
1997 }
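/*
 * Illustrative numbers: if the mmap buffer computed above is 512KiB then
 * wakeup_size is 256KiB, so something like --switch-output=100K would
 * trigger this warning.
 */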
1998 
1999 static int switch_output_setup(struct record *rec)
2000 {
2001 	struct switch_output *s = &rec->switch_output;
2002 	static struct parse_tag tags_size[] = {
2003 		{ .tag  = 'B', .mult = 1       },
2004 		{ .tag  = 'K', .mult = 1 << 10 },
2005 		{ .tag  = 'M', .mult = 1 << 20 },
2006 		{ .tag  = 'G', .mult = 1 << 30 },
2007 		{ .tag  = 0 },
2008 	};
2009 	static struct parse_tag tags_time[] = {
2010 		{ .tag  = 's', .mult = 1        },
2011 		{ .tag  = 'm', .mult = 60       },
2012 		{ .tag  = 'h', .mult = 60*60    },
2013 		{ .tag  = 'd', .mult = 60*60*24 },
2014 		{ .tag  = 0 },
2015 	};
2016 	unsigned long val;
2017 
2018 	if (!s->set)
2019 		return 0;
2020 
2021 	if (!strcmp(s->str, "signal")) {
2022 		s->signal = true;
2023 		pr_debug("switch-output with SIGUSR2 signal\n");
2024 		goto enabled;
2025 	}
2026 
2027 	val = parse_tag_value(s->str, tags_size);
2028 	if (val != (unsigned long) -1) {
2029 		s->size = val;
2030 		pr_debug("switch-output with %s size threshold\n", s->str);
2031 		goto enabled;
2032 	}
2033 
2034 	val = parse_tag_value(s->str, tags_time);
2035 	if (val != (unsigned long) -1) {
2036 		s->time = val;
2037 		pr_debug("switch-output with %s time threshold (%lu seconds)\n",
2038 			 s->str, s->time);
2039 		goto enabled;
2040 	}
2041 
2042 	return -1;
2043 
2044 enabled:
2045 	rec->timestamp_filename = true;
2046 	s->enabled              = true;
2047 
2048 	if (s->size && !rec->opts.no_buffering)
2049 		switch_output_size_warn(rec);
2050 
2051 	return 0;
2052 }
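/*
 * Example --switch-output arguments (illustrative):
 *
 *	--switch-output			# same as --switch-output=signal (SIGUSR2)
 *	--switch-output=100M		# rotate after roughly 100MB of data
 *	--switch-output=30s		# rotate every 30 seconds (via SIGALRM)
 */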
2053 
2054 static const char * const __record_usage[] = {
2055 	"perf record [<options>] [<command>]",
2056 	"perf record [<options>] -- <command> [<options>]",
2057 	NULL
2058 };
2059 const char * const *record_usage = __record_usage;
2060 
2061 /*
2062  * XXX Ideally would be local to cmd_record() and passed to a record__new
2063  * because we need to have access to it in record__exit, that is called
2064  * after cmd_record() exits, but since record_options need to be accessible to
2065  * builtin-script, leave it here.
2066  *
2067  * At least we don't touch it in all the other functions here directly.
2068  *
2069  * Just say no to tons of global variables, sigh.
2070  */
2071 static struct record record = {
2072 	.opts = {
2073 		.sample_time	     = true,
2074 		.mmap_pages	     = UINT_MAX,
2075 		.user_freq	     = UINT_MAX,
2076 		.user_interval	     = ULLONG_MAX,
2077 		.freq		     = 4000,
2078 		.target		     = {
2079 			.uses_mmap   = true,
2080 			.default_per_cpu = true,
2081 		},
2082 		.mmap_flush          = MMAP_FLUSH_DEFAULT,
2083 	},
2084 	.tool = {
2085 		.sample		= process_sample_event,
2086 		.fork		= perf_event__process_fork,
2087 		.exit		= perf_event__process_exit,
2088 		.comm		= perf_event__process_comm,
2089 		.namespaces	= perf_event__process_namespaces,
2090 		.mmap		= perf_event__process_mmap,
2091 		.mmap2		= perf_event__process_mmap2,
2092 		.ordered_events	= true,
2093 	},
2094 };
2095 
2096 const char record_callchain_help[] = CALLCHAIN_RECORD_HELP
2097 	"\n\t\t\t\tDefault: fp";
2098 
2099 static bool dry_run;
2100 
2101 /*
2102  * XXX Will stay a global variable until we fix builtin-script.c to stop messing
2103  * with it and switch to using the library functions in perf_evlist that came
2104  * from builtin-record.c, i.e. use record_opts,
2105  * perf_evlist__prepare_workload, etc. instead of fork+exec'ing 'perf record'
2106  * using pipes, etc.
2107  */
2108 static struct option __record_options[] = {
2109 	OPT_CALLBACK('e', "event", &record.evlist, "event",
2110 		     "event selector. use 'perf list' to list available events",
2111 		     parse_events_option),
2112 	OPT_CALLBACK(0, "filter", &record.evlist, "filter",
2113 		     "event filter", parse_filter),
2114 	OPT_CALLBACK_NOOPT(0, "exclude-perf", &record.evlist,
2115 			   NULL, "don't record events from perf itself",
2116 			   exclude_perf),
2117 	OPT_STRING('p', "pid", &record.opts.target.pid, "pid",
2118 		    "record events on existing process id"),
2119 	OPT_STRING('t', "tid", &record.opts.target.tid, "tid",
2120 		    "record events on existing thread id"),
2121 	OPT_INTEGER('r', "realtime", &record.realtime_prio,
2122 		    "collect data with this RT SCHED_FIFO priority"),
2123 	OPT_BOOLEAN(0, "no-buffering", &record.opts.no_buffering,
2124 		    "collect data without buffering"),
2125 	OPT_BOOLEAN('R', "raw-samples", &record.opts.raw_samples,
2126 		    "collect raw sample records from all opened counters"),
2127 	OPT_BOOLEAN('a', "all-cpus", &record.opts.target.system_wide,
2128 			    "system-wide collection from all CPUs"),
2129 	OPT_STRING('C', "cpu", &record.opts.target.cpu_list, "cpu",
2130 		    "list of cpus to monitor"),
2131 	OPT_U64('c', "count", &record.opts.user_interval, "event period to sample"),
2132 	OPT_STRING('o', "output", &record.data.path, "file",
2133 		    "output file name"),
2134 	OPT_BOOLEAN_SET('i', "no-inherit", &record.opts.no_inherit,
2135 			&record.opts.no_inherit_set,
2136 			"child tasks do not inherit counters"),
2137 	OPT_BOOLEAN(0, "tail-synthesize", &record.opts.tail_synthesize,
2138 		    "synthesize non-sample events at the end of output"),
2139 	OPT_BOOLEAN(0, "overwrite", &record.opts.overwrite, "use overwrite mode"),
2140 	OPT_BOOLEAN(0, "no-bpf-event", &record.opts.no_bpf_event, "record bpf events"),
2141 	OPT_BOOLEAN(0, "strict-freq", &record.opts.strict_freq,
2142 		    "Fail if the specified frequency can't be used"),
2143 	OPT_CALLBACK('F', "freq", &record.opts, "freq or 'max'",
2144 		     "profile at this frequency",
2145 		      record__parse_freq),
2146 	OPT_CALLBACK('m', "mmap-pages", &record.opts, "pages[,pages]",
2147 		     "number of mmap data pages and AUX area tracing mmap pages",
2148 		     record__parse_mmap_pages),
2149 	OPT_CALLBACK(0, "mmap-flush", &record.opts, "number",
2150 		     "Minimal number of bytes that is extracted from mmap data pages (default: 1)",
2151 		     record__mmap_flush_parse),
2152 	OPT_BOOLEAN(0, "group", &record.opts.group,
2153 		    "put the counters into a counter group"),
2154 	OPT_CALLBACK_NOOPT('g', NULL, &callchain_param,
2155 			   NULL, "enables call-graph recording" ,
2156 			   &record_callchain_opt),
2157 	OPT_CALLBACK(0, "call-graph", &record.opts,
2158 		     "record_mode[,record_size]", record_callchain_help,
2159 		     &record_parse_callchain_opt),
2160 	OPT_INCR('v', "verbose", &verbose,
2161 		    "be more verbose (show counter open errors, etc)"),
2162 	OPT_BOOLEAN('q', "quiet", &quiet, "don't print any message"),
2163 	OPT_BOOLEAN('s', "stat", &record.opts.inherit_stat,
2164 		    "per thread counts"),
2165 	OPT_BOOLEAN('d', "data", &record.opts.sample_address, "Record the sample addresses"),
2166 	OPT_BOOLEAN(0, "phys-data", &record.opts.sample_phys_addr,
2167 		    "Record the sample physical addresses"),
2168 	OPT_BOOLEAN(0, "sample-cpu", &record.opts.sample_cpu, "Record the sample cpu"),
2169 	OPT_BOOLEAN_SET('T', "timestamp", &record.opts.sample_time,
2170 			&record.opts.sample_time_set,
2171 			"Record the sample timestamps"),
2172 	OPT_BOOLEAN_SET('P', "period", &record.opts.period, &record.opts.period_set,
2173 			"Record the sample period"),
2174 	OPT_BOOLEAN('n', "no-samples", &record.opts.no_samples,
2175 		    "don't sample"),
2176 	OPT_BOOLEAN_SET('N', "no-buildid-cache", &record.no_buildid_cache,
2177 			&record.no_buildid_cache_set,
2178 			"do not update the buildid cache"),
2179 	OPT_BOOLEAN_SET('B', "no-buildid", &record.no_buildid,
2180 			&record.no_buildid_set,
2181 			"do not collect buildids in perf.data"),
2182 	OPT_CALLBACK('G', "cgroup", &record.evlist, "name",
2183 		     "monitor event in cgroup name only",
2184 		     parse_cgroups),
2185 	OPT_UINTEGER('D', "delay", &record.opts.initial_delay,
2186 		  "ms to wait before starting measurement after program start"),
2187 	OPT_STRING('u', "uid", &record.opts.target.uid_str, "user",
2188 		   "user to profile"),
2189 
2190 	OPT_CALLBACK_NOOPT('b', "branch-any", &record.opts.branch_stack,
2191 		     "branch any", "sample any taken branches",
2192 		     parse_branch_stack),
2193 
2194 	OPT_CALLBACK('j', "branch-filter", &record.opts.branch_stack,
2195 		     "branch filter mask", "branch stack filter modes",
2196 		     parse_branch_stack),
2197 	OPT_BOOLEAN('W', "weight", &record.opts.sample_weight,
2198 		    "sample by weight (on special events only)"),
2199 	OPT_BOOLEAN(0, "transaction", &record.opts.sample_transaction,
2200 		    "sample transaction flags (special events only)"),
2201 	OPT_BOOLEAN(0, "per-thread", &record.opts.target.per_thread,
2202 		    "use per-thread mmaps"),
2203 	OPT_CALLBACK_OPTARG('I', "intr-regs", &record.opts.sample_intr_regs, NULL, "any register",
2204 		    "sample selected machine registers on interrupt,"
2205 		    " use '-I?' to list register names", parse_intr_regs),
2206 	OPT_CALLBACK_OPTARG(0, "user-regs", &record.opts.sample_user_regs, NULL, "any register",
2207 		    "sample selected machine registers on interrupt,"
2208 		    " use '--user-regs=?' to list register names", parse_user_regs),
2209 	OPT_BOOLEAN(0, "running-time", &record.opts.running_time,
2210 		    "Record running/enabled time of read (:S) events"),
2211 	OPT_CALLBACK('k', "clockid", &record.opts,
2212 	"clockid", "clockid to use for events, see clock_gettime()",
2213 	parse_clockid),
2214 	OPT_STRING_OPTARG('S', "snapshot", &record.opts.auxtrace_snapshot_opts,
2215 			  "opts", "AUX area tracing Snapshot Mode", ""),
2216 	OPT_UINTEGER(0, "proc-map-timeout", &proc_map_timeout,
2217 			"per thread proc mmap processing timeout in ms"),
2218 	OPT_BOOLEAN(0, "namespaces", &record.opts.record_namespaces,
2219 		    "Record namespaces events"),
2220 	OPT_BOOLEAN(0, "switch-events", &record.opts.record_switch_events,
2221 		    "Record context switch events"),
2222 	OPT_BOOLEAN_FLAG(0, "all-kernel", &record.opts.all_kernel,
2223 			 "Configure all used events to run in kernel space.",
2224 			 PARSE_OPT_EXCLUSIVE),
2225 	OPT_BOOLEAN_FLAG(0, "all-user", &record.opts.all_user,
2226 			 "Configure all used events to run in user space.",
2227 			 PARSE_OPT_EXCLUSIVE),
2228 	OPT_BOOLEAN(0, "kernel-callchains", &record.opts.kernel_callchains,
2229 		    "collect kernel callchains"),
2230 	OPT_BOOLEAN(0, "user-callchains", &record.opts.user_callchains,
2231 		    "collect user callchains"),
2232 	OPT_STRING(0, "clang-path", &llvm_param.clang_path, "clang path",
2233 		   "clang binary to use for compiling BPF scriptlets"),
2234 	OPT_STRING(0, "clang-opt", &llvm_param.clang_opt, "clang options",
2235 		   "options passed to clang when compiling BPF scriptlets"),
2236 	OPT_STRING(0, "vmlinux", &symbol_conf.vmlinux_name,
2237 		   "file", "vmlinux pathname"),
2238 	OPT_BOOLEAN(0, "buildid-all", &record.buildid_all,
2239 		    "Record build-id of all DSOs regardless of hits"),
2240 	OPT_BOOLEAN(0, "timestamp-filename", &record.timestamp_filename,
2241 		    "append timestamp to output filename"),
2242 	OPT_BOOLEAN(0, "timestamp-boundary", &record.timestamp_boundary,
2243 		    "Record timestamp boundary (time of first/last samples)"),
2244 	OPT_STRING_OPTARG_SET(0, "switch-output", &record.switch_output.str,
2245 			  &record.switch_output.set, "signal or size[BKMG] or time[smhd]",
2246 			  "Switch output when receiving SIGUSR2 (signal) or cross a size or time threshold",
2247 			  "signal"),
2248 	OPT_INTEGER(0, "switch-max-files", &record.switch_output.num_files,
2249 		   "Limit number of switch output generated files"),
2250 	OPT_BOOLEAN(0, "dry-run", &dry_run,
2251 		    "Parse options then exit"),
2252 #ifdef HAVE_AIO_SUPPORT
2253 	OPT_CALLBACK_OPTARG(0, "aio", &record.opts,
2254 		     &nr_cblocks_default, "n", "Use <n> control blocks in asynchronous trace writing mode (default: 1, max: 4)",
2255 		     record__aio_parse),
2256 #endif
2257 	OPT_CALLBACK(0, "affinity", &record.opts, "node|cpu",
2258 		     "Set affinity mask of trace reading thread to NUMA node cpu mask or cpu of processed mmap buffer",
2259 		     record__parse_affinity),
2260 #ifdef HAVE_ZSTD_SUPPORT
2261 	OPT_CALLBACK_OPTARG('z', "compression-level", &record.opts, &comp_level_default,
2262 			    "n", "Compressed records using specified level (default: 1 - fastest compression, 22 - greatest compression)",
2263 			    record__parse_comp_level),
2264 #endif
2265 	OPT_END()
2266 };
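/*
 * A command line combining several of the options above might look like
 * this (illustrative only):
 *
 *	perf record -F 999 -g --switch-output=1G --switch-max-files=5 \
 *		    -o mydata -- ./workload arg1
 */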
2267 
2268 struct option *record_options = __record_options;
2269 
2270 int cmd_record(int argc, const char **argv)
2271 {
2272 	int err;
2273 	struct record *rec = &record;
2274 	char errbuf[BUFSIZ];
2275 
2276 	setlocale(LC_ALL, "");
2277 
2278 #ifndef HAVE_LIBBPF_SUPPORT
2279 # define set_nobuild(s, l, c) set_option_nobuild(record_options, s, l, "NO_LIBBPF=1", c)
2280 	set_nobuild('\0', "clang-path", true);
2281 	set_nobuild('\0', "clang-opt", true);
2282 # undef set_nobuild
2283 #endif
2284 
2285 #ifndef HAVE_BPF_PROLOGUE
2286 # if !defined (HAVE_DWARF_SUPPORT)
2287 #  define REASON  "NO_DWARF=1"
2288 # elif !defined (HAVE_LIBBPF_SUPPORT)
2289 #  define REASON  "NO_LIBBPF=1"
2290 # else
2291 #  define REASON  "this architecture doesn't support BPF prologue"
2292 # endif
2293 # define set_nobuild(s, l, c) set_option_nobuild(record_options, s, l, REASON, c)
2294 	set_nobuild('\0', "vmlinux", true);
2295 # undef set_nobuild
2296 # undef REASON
2297 #endif
2298 
2299 	CPU_ZERO(&rec->affinity_mask);
2300 	rec->opts.affinity = PERF_AFFINITY_SYS;
2301 
2302 	rec->evlist = evlist__new();
2303 	if (rec->evlist == NULL)
2304 		return -ENOMEM;
2305 
2306 	err = perf_config(perf_record_config, rec);
2307 	if (err)
2308 		return err;
2309 
2310 	argc = parse_options(argc, argv, record_options, record_usage,
2311 			    PARSE_OPT_STOP_AT_NON_OPTION);
2312 	if (quiet)
2313 		perf_quiet_option();
2314 
2315 	/* Make system wide (-a) the default target. */
2316 	if (!argc && target__none(&rec->opts.target))
2317 		rec->opts.target.system_wide = true;
2318 
2319 	if (nr_cgroups && !rec->opts.target.system_wide) {
2320 		usage_with_options_msg(record_usage, record_options,
2321 			"cgroup monitoring only available in system-wide mode");
2322 
2323 	}
2324 
2325 	if (rec->opts.comp_level != 0) {
2326 		pr_debug("Compression enabled, disabling build id collection at the end of the session.\n");
2327 		rec->no_buildid = true;
2328 	}
2329 
2330 	if (rec->opts.record_switch_events &&
2331 	    !perf_can_record_switch_events()) {
2332 		ui__error("kernel does not support recording context switch events\n");
2333 		parse_options_usage(record_usage, record_options, "switch-events", 0);
2334 		return -EINVAL;
2335 	}
2336 
2337 	if (switch_output_setup(rec)) {
2338 		parse_options_usage(record_usage, record_options, "switch-output", 0);
2339 		return -EINVAL;
2340 	}
2341 
2342 	if (rec->switch_output.time) {
2343 		signal(SIGALRM, alarm_sig_handler);
2344 		alarm(rec->switch_output.time);
2345 	}
2346 
2347 	if (rec->switch_output.num_files) {
2348 		rec->switch_output.filenames = calloc(rec->switch_output.num_files,
2349 						      sizeof(char *));
2350 		if (!rec->switch_output.filenames)
2351 			return -ENOMEM;
2352 	}
2353 
2354 	/*
2355 	 * Allow aliases to facilitate the lookup of symbols for address
2356 	 * filters. Refer to auxtrace_parse_filters().
2357 	 */
2358 	symbol_conf.allow_aliases = true;
2359 
2360 	symbol__init(NULL);
2361 
2362 	err = record__auxtrace_init(rec);
2363 	if (err)
2364 		goto out;
2365 
2366 	if (dry_run)
2367 		goto out;
2368 
2369 	err = bpf__setup_stdout(rec->evlist);
2370 	if (err) {
2371 		bpf__strerror_setup_stdout(rec->evlist, err, errbuf, sizeof(errbuf));
2372 		pr_err("ERROR: Setup BPF stdout failed: %s\n",
2373 			 errbuf);
2374 		goto out;
2375 	}
2376 
2377 	err = -ENOMEM;
2378 
2379 	if (rec->no_buildid_cache || rec->no_buildid) {
2380 		disable_buildid_cache();
2381 	} else if (rec->switch_output.enabled) {
2382 		/*
2383 		 * In 'perf record --switch-output', disable buildid
2384 		 * generation by default to reduce data file switching
2385 		 * overhead. Still generate buildids if they are required
2386 		 * explicitly using
2387 		 *
2388 		 *  perf record --switch-output --no-no-buildid \
2389 		 *              --no-no-buildid-cache
2390 		 *
2391 		 * The following code is equivalent to:
2392 		 *
2393 		 * if ((rec->no_buildid || !rec->no_buildid_set) &&
2394 		 *     (rec->no_buildid_cache || !rec->no_buildid_cache_set))
2395 		 *         disable_buildid_cache();
2396 		 */
2397 		bool disable = true;
2398 
2399 		if (rec->no_buildid_set && !rec->no_buildid)
2400 			disable = false;
2401 		if (rec->no_buildid_cache_set && !rec->no_buildid_cache)
2402 			disable = false;
2403 		if (disable) {
2404 			rec->no_buildid = true;
2405 			rec->no_buildid_cache = true;
2406 			disable_buildid_cache();
2407 		}
2408 	}
2409 
2410 	if (record.opts.overwrite)
2411 		record.opts.tail_synthesize = true;
2412 
2413 	if (rec->evlist->core.nr_entries == 0 &&
2414 	    __perf_evlist__add_default(rec->evlist, !record.opts.no_samples) < 0) {
2415 		pr_err("Not enough memory for event selector list\n");
2416 		goto out;
2417 	}
2418 
2419 	if (rec->opts.target.tid && !rec->opts.no_inherit_set)
2420 		rec->opts.no_inherit = true;
2421 
2422 	err = target__validate(&rec->opts.target);
2423 	if (err) {
2424 		target__strerror(&rec->opts.target, err, errbuf, BUFSIZ);
2425 		ui__warning("%s\n", errbuf);
2426 	}
2427 
2428 	err = target__parse_uid(&rec->opts.target);
2429 	if (err) {
2430 		int saved_errno = errno;
2431 
2432 		target__strerror(&rec->opts.target, err, errbuf, BUFSIZ);
2433 		ui__error("%s", errbuf);
2434 
2435 		err = -saved_errno;
2436 		goto out;
2437 	}
2438 
2439 	/* Enable ignoring missing threads when -u/-p option is defined. */
2440 	rec->opts.ignore_missing_thread = rec->opts.target.uid != UINT_MAX || rec->opts.target.pid;
2441 
2442 	err = -ENOMEM;
2443 	if (perf_evlist__create_maps(rec->evlist, &rec->opts.target) < 0)
2444 		usage_with_options(record_usage, record_options);
2445 
2446 	err = auxtrace_record__options(rec->itr, rec->evlist, &rec->opts);
2447 	if (err)
2448 		goto out;
2449 
2450 	/*
2451 	 * We take all buildids when the file contains
2452 	 * AUX area tracing data, because we do not decode the
2453 	 * trace, as decoding it would take too long.
2454 	 */
2455 	if (rec->opts.full_auxtrace)
2456 		rec->buildid_all = true;
2457 
2458 	if (record_opts__config(&rec->opts)) {
2459 		err = -EINVAL;
2460 		goto out;
2461 	}
2462 
2463 	if (rec->opts.nr_cblocks > nr_cblocks_max)
2464 		rec->opts.nr_cblocks = nr_cblocks_max;
2465 	pr_debug("nr_cblocks: %d\n", rec->opts.nr_cblocks);
2466 
2467 	pr_debug("affinity: %s\n", affinity_tags[rec->opts.affinity]);
2468 	pr_debug("mmap flush: %d\n", rec->opts.mmap_flush);
2469 
2470 	if (rec->opts.comp_level > comp_level_max)
2471 		rec->opts.comp_level = comp_level_max;
2472 	pr_debug("comp level: %d\n", rec->opts.comp_level);
2473 
2474 	err = __cmd_record(&record, argc, argv);
2475 out:
2476 	evlist__delete(rec->evlist);
2477 	symbol__exit();
2478 	auxtrace_record__free(rec->itr);
2479 	return err;
2480 }
2481 
2482 static void snapshot_sig_handler(int sig __maybe_unused)
2483 {
2484 	struct record *rec = &record;
2485 
2486 	if (trigger_is_ready(&auxtrace_snapshot_trigger)) {
2487 		trigger_hit(&auxtrace_snapshot_trigger);
2488 		auxtrace_record__snapshot_started = 1;
2489 		if (auxtrace_record__snapshot_start(record.itr))
2490 			trigger_error(&auxtrace_snapshot_trigger);
2491 	}
2492 
2493 	if (switch_output_signal(rec))
2494 		trigger_hit(&switch_output_trigger);
2495 }
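/*
 * With --switch-output=signal, rotation is driven externally, e.g.
 * (illustrative) "kill -USR2 $(pgrep -x perf)", which ends up in
 * snapshot_sig_handler() above and hits switch_output_trigger.
 */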
2496 
2497 static void alarm_sig_handler(int sig __maybe_unused)
2498 {
2499 	struct record *rec = &record;
2500 
2501 	if (switch_output_time(rec))
2502 		trigger_hit(&switch_output_trigger);
2503 }
2504