xref: /openbmc/linux/tools/perf/builtin-record.c (revision ecd25094)
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * builtin-record.c
4  *
5  * Builtin record command: Record the profile of a workload
6  * (or a CPU, or a PID) into the perf.data output file - for
7  * later analysis via perf report.
8  */
9 #include "builtin.h"
10 
11 #include "util/build-id.h"
12 #include <subcmd/parse-options.h>
13 #include "util/parse-events.h"
14 #include "util/config.h"
15 
16 #include "util/callchain.h"
17 #include "util/cgroup.h"
18 #include "util/header.h"
19 #include "util/event.h"
20 #include "util/evlist.h"
21 #include "util/evsel.h"
22 #include "util/debug.h"
23 #include "util/target.h"
24 #include "util/session.h"
25 #include "util/tool.h"
26 #include "util/symbol.h"
27 #include "util/record.h"
28 #include "util/cpumap.h"
29 #include "util/thread_map.h"
30 #include "util/data.h"
31 #include "util/perf_regs.h"
32 #include "util/auxtrace.h"
33 #include "util/tsc.h"
34 #include "util/parse-branch-options.h"
35 #include "util/parse-regs-options.h"
36 #include "util/llvm-utils.h"
37 #include "util/bpf-loader.h"
38 #include "util/trigger.h"
39 #include "util/perf-hooks.h"
40 #include "util/cpu-set-sched.h"
41 #include "util/time-utils.h"
42 #include "util/units.h"
43 #include "util/bpf-event.h"
44 #include "asm/bug.h"
45 #include "perf.h"
46 
47 #include <errno.h>
48 #include <inttypes.h>
49 #include <locale.h>
50 #include <poll.h>
51 #include <unistd.h>
52 #include <sched.h>
53 #include <signal.h>
54 #include <sys/mman.h>
55 #include <sys/wait.h>
56 #include <linux/string.h>
57 #include <linux/time64.h>
58 #include <linux/zalloc.h>
59 
60 struct switch_output {
61 	bool		 enabled;
62 	bool		 signal;
63 	unsigned long	 size;
64 	unsigned long	 time;
65 	const char	*str;
66 	bool		 set;
67 	char		 **filenames;
68 	int		 num_files;
69 	int		 cur_file;
70 };
71 
72 struct record {
73 	struct perf_tool	tool;
74 	struct record_opts	opts;
75 	u64			bytes_written;
76 	struct perf_data	data;
77 	struct auxtrace_record	*itr;
78 	struct evlist	*evlist;
79 	struct perf_session	*session;
80 	int			realtime_prio;
81 	bool			no_buildid;
82 	bool			no_buildid_set;
83 	bool			no_buildid_cache;
84 	bool			no_buildid_cache_set;
85 	bool			buildid_all;
86 	bool			timestamp_filename;
87 	bool			timestamp_boundary;
88 	struct switch_output	switch_output;
89 	unsigned long long	samples;
90 	cpu_set_t		affinity_mask;
91 };
92 
93 static volatile int auxtrace_record__snapshot_started;
94 static DEFINE_TRIGGER(auxtrace_snapshot_trigger);
95 static DEFINE_TRIGGER(switch_output_trigger);
96 
97 static const char *affinity_tags[PERF_AFFINITY_MAX] = {
98 	"SYS", "NODE", "CPU"
99 };
100 
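/*
 * The switch_output_* helpers below gate output file rotation: a new
 * perf.data file is started only when the corresponding --switch-output
 * mode (signal, size or time) is configured and the trigger is ready.
 */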
101 static bool switch_output_signal(struct record *rec)
102 {
103 	return rec->switch_output.signal &&
104 	       trigger_is_ready(&switch_output_trigger);
105 }
106 
107 static bool switch_output_size(struct record *rec)
108 {
109 	return rec->switch_output.size &&
110 	       trigger_is_ready(&switch_output_trigger) &&
111 	       (rec->bytes_written >= rec->switch_output.size);
112 }
113 
114 static bool switch_output_time(struct record *rec)
115 {
116 	return rec->switch_output.time &&
117 	       trigger_is_ready(&switch_output_trigger);
118 }
119 
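/*
 * Synchronous write path: append @size bytes to the perf.data file, account
 * them in rec->bytes_written and hit the switch-output trigger once the
 * configured size threshold is crossed.
 */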
120 static int record__write(struct record *rec, struct perf_mmap *map __maybe_unused,
121 			 void *bf, size_t size)
122 {
123 	struct perf_data_file *file = &rec->session->data->file;
124 
125 	if (perf_data_file__write(file, bf, size) < 0) {
126 		pr_err("failed to write perf data, error: %m\n");
127 		return -1;
128 	}
129 
130 	rec->bytes_written += size;
131 
132 	if (switch_output_size(rec))
133 		trigger_hit(&switch_output_trigger);
134 
135 	return 0;
136 }
137 
138 static int record__aio_enabled(struct record *rec);
139 static int record__comp_enabled(struct record *rec);
140 static size_t zstd_compress(struct perf_session *session, void *dst, size_t dst_size,
141 			    void *src, size_t src_size);
142 
143 #ifdef HAVE_AIO_SUPPORT
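/*
 * Asynchronous trace writing: data copied out of the kernel ring buffers is
 * queued with POSIX AIO (aio_write()), so the mmap buffers can be released
 * without waiting for the file I/O to complete.
 */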
144 static int record__aio_write(struct aiocb *cblock, int trace_fd,
145 		void *buf, size_t size, off_t off)
146 {
147 	int rc;
148 
149 	cblock->aio_fildes = trace_fd;
150 	cblock->aio_buf    = buf;
151 	cblock->aio_nbytes = size;
152 	cblock->aio_offset = off;
153 	cblock->aio_sigevent.sigev_notify = SIGEV_NONE;
154 
155 	do {
156 		rc = aio_write(cblock);
157 		if (rc == 0) {
158 			break;
159 		} else if (errno != EAGAIN) {
160 			cblock->aio_fildes = -1;
161 			pr_err("failed to queue perf data, error: %m\n");
162 			break;
163 		}
164 	} while (1);
165 
166 	return rc;
167 }
168 
169 static int record__aio_complete(struct perf_mmap *md, struct aiocb *cblock)
170 {
171 	void *rem_buf;
172 	off_t rem_off;
173 	size_t rem_size;
174 	int rc, aio_errno;
175 	ssize_t aio_ret, written;
176 
177 	aio_errno = aio_error(cblock);
178 	if (aio_errno == EINPROGRESS)
179 		return 0;
180 
181 	written = aio_ret = aio_return(cblock);
182 	if (aio_ret < 0) {
183 		if (aio_errno != EINTR)
184 			pr_err("failed to write perf data, error: %m\n");
185 		written = 0;
186 	}
187 
188 	rem_size = cblock->aio_nbytes - written;
189 
190 	if (rem_size == 0) {
191 		cblock->aio_fildes = -1;
192 		/*
193 		 * md->refcount is incremented in record__aio_pushfn() for
194 		 * every aio write request started in record__aio_push() so
195 		 * decrement it because the request is now complete.
196 		 */
197 		perf_mmap__put(md);
198 		rc = 1;
199 	} else {
200 		/*
201 		 * The aio write request may need to be restarted with the
202 		 * remainder if the kernel didn't write the whole
203 		 * chunk at once.
204 		 */
205 		rem_off = cblock->aio_offset + written;
206 		rem_buf = (void *)(cblock->aio_buf + written);
207 		record__aio_write(cblock, cblock->aio_fildes,
208 				rem_buf, rem_size, rem_off);
209 		rc = 0;
210 	}
211 
212 	return rc;
213 }
214 
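/*
 * Reap completed aio control blocks for @md. With @sync_all the function
 * keeps calling aio_suspend() until every in-flight request has completed,
 * then returns -1; otherwise it returns the index of a free control block
 * that can be reused for the next write.
 */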
215 static int record__aio_sync(struct perf_mmap *md, bool sync_all)
216 {
217 	struct aiocb **aiocb = md->aio.aiocb;
218 	struct aiocb *cblocks = md->aio.cblocks;
219 	struct timespec timeout = { 0, 1000 * 1000  * 1 }; /* 1ms */
220 	int i, do_suspend;
221 
222 	do {
223 		do_suspend = 0;
224 		for (i = 0; i < md->aio.nr_cblocks; ++i) {
225 			if (cblocks[i].aio_fildes == -1 || record__aio_complete(md, &cblocks[i])) {
226 				if (sync_all)
227 					aiocb[i] = NULL;
228 				else
229 					return i;
230 			} else {
231 				/*
232 				 * The started aio write is not complete yet,
233 				 * so it has to be waited for before the
234 				 * next allocation.
235 				 */
236 				aiocb[i] = &cblocks[i];
237 				do_suspend = 1;
238 			}
239 		}
240 		if (!do_suspend)
241 			return -1;
242 
243 		while (aio_suspend((const struct aiocb **)aiocb, md->aio.nr_cblocks, &timeout)) {
244 			if (!(errno == EAGAIN || errno == EINTR))
245 				pr_err("failed to sync perf data, error: %m\n");
246 		}
247 	} while (1);
248 }
249 
250 struct record_aio {
251 	struct record	*rec;
252 	void		*data;
253 	size_t		size;
254 };
255 
256 static int record__aio_pushfn(struct perf_mmap *map, void *to, void *buf, size_t size)
257 {
258 	struct record_aio *aio = to;
259 
260 	/*
261 	 * The map->base data pointed to by buf is copied into a free map->aio.data[]
262 	 * buffer to release space in the kernel buffer as fast as possible, by calling
263 	 * perf_mmap__consume() from the perf_mmap__push() function.
264 	 *
265 	 * That lets the kernel proceed with storing more profiling data into
266 	 * the kernel buffer sooner, before the other per-cpu kernel buffers are handled.
267 	 *
268 	 * Copying can be done in two steps in case the chunk of profiling data
269 	 * crosses the upper bound of the kernel buffer. In this case we first move
270 	 * the part of the data from map->start to the upper bound and then the remainder
271 	 * from the beginning of the kernel buffer to the end of the data chunk.
272 	 */
273 
274 	if (record__comp_enabled(aio->rec)) {
275 		size = zstd_compress(aio->rec->session, aio->data + aio->size,
276 				     perf_mmap__mmap_len(map) - aio->size,
277 				     buf, size);
278 	} else {
279 		memcpy(aio->data + aio->size, buf, size);
280 	}
281 
282 	if (!aio->size) {
283 		/*
284 		 * Increment map->refcount to guard the map->aio.data[] buffer
285 		 * from premature deallocation, because the map object can be
286 		 * released before the aio write request started on the
287 		 * map->aio.data[] buffer completes.
288 		 *
289 		 * perf_mmap__put() is done at record__aio_complete()
290 		 * after started aio request completion or at record__aio_push()
291 		 * if the request failed to start.
292 		 */
293 		perf_mmap__get(map);
294 	}
295 
296 	aio->size += size;
297 
298 	return size;
299 }
300 
301 static int record__aio_push(struct record *rec, struct perf_mmap *map, off_t *off)
302 {
303 	int ret, idx;
304 	int trace_fd = rec->session->data->file.fd;
305 	struct record_aio aio = { .rec = rec, .size = 0 };
306 
307 	/*
308 	 * Call record__aio_sync() to wait until a map->aio.data[] buffer
309 	 * becomes available after the previous aio write operation completes.
310 	 */
311 
312 	idx = record__aio_sync(map, false);
313 	aio.data = map->aio.data[idx];
314 	ret = perf_mmap__push(map, &aio, record__aio_pushfn);
315 	if (ret != 0) /* ret > 0 - no data, ret < 0 - error */
316 		return ret;
317 
318 	rec->samples++;
319 	ret = record__aio_write(&(map->aio.cblocks[idx]), trace_fd, aio.data, aio.size, *off);
320 	if (!ret) {
321 		*off += aio.size;
322 		rec->bytes_written += aio.size;
323 		if (switch_output_size(rec))
324 			trigger_hit(&switch_output_trigger);
325 	} else {
326 		/*
327 		 * Decrement the map->refcount incremented in record__aio_pushfn()
328 		 * if the record__aio_write() operation failed to start; otherwise
329 		 * map->refcount is decremented in record__aio_complete() after
330 		 * the aio write operation finishes successfully.
331 		 */
332 		perf_mmap__put(map);
333 	}
334 
335 	return ret;
336 }
337 
338 static off_t record__aio_get_pos(int trace_fd)
339 {
340 	return lseek(trace_fd, 0, SEEK_CUR);
341 }
342 
343 static void record__aio_set_pos(int trace_fd, off_t pos)
344 {
345 	lseek(trace_fd, pos, SEEK_SET);
346 }
347 
348 static void record__aio_mmap_read_sync(struct record *rec)
349 {
350 	int i;
351 	struct evlist *evlist = rec->evlist;
352 	struct perf_mmap *maps = evlist->mmap;
353 
354 	if (!record__aio_enabled(rec))
355 		return;
356 
357 	for (i = 0; i < evlist->nr_mmaps; i++) {
358 		struct perf_mmap *map = &maps[i];
359 
360 		if (map->base)
361 			record__aio_sync(map, true);
362 	}
363 }
364 
365 static int nr_cblocks_default = 1;
366 static int nr_cblocks_max = 4;
367 
368 static int record__aio_parse(const struct option *opt,
369 			     const char *str,
370 			     int unset)
371 {
372 	struct record_opts *opts = (struct record_opts *)opt->value;
373 
374 	if (unset) {
375 		opts->nr_cblocks = 0;
376 	} else {
377 		if (str)
378 			opts->nr_cblocks = strtol(str, NULL, 0);
379 		if (!opts->nr_cblocks)
380 			opts->nr_cblocks = nr_cblocks_default;
381 	}
382 
383 	return 0;
384 }
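/*
 * Illustrative usage, assuming the usual --aio option of perf record:
 * a bare `--aio` selects nr_cblocks_default (one control block per mmap
 * buffer), while `--aio=4` asks for multiple in-flight writes per buffer,
 * with nr_cblocks_max as the intended upper bound.
 */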
385 #else /* HAVE_AIO_SUPPORT */
386 static int nr_cblocks_max = 0;
387 
388 static int record__aio_push(struct record *rec __maybe_unused, struct perf_mmap *map __maybe_unused,
389 			    off_t *off __maybe_unused)
390 {
391 	return -1;
392 }
393 
394 static off_t record__aio_get_pos(int trace_fd __maybe_unused)
395 {
396 	return -1;
397 }
398 
399 static void record__aio_set_pos(int trace_fd __maybe_unused, off_t pos __maybe_unused)
400 {
401 }
402 
403 static void record__aio_mmap_read_sync(struct record *rec __maybe_unused)
404 {
405 }
406 #endif
407 
408 static int record__aio_enabled(struct record *rec)
409 {
410 	return rec->opts.nr_cblocks > 0;
411 }
412 
413 #define MMAP_FLUSH_DEFAULT 1
414 static int record__mmap_flush_parse(const struct option *opt,
415 				    const char *str,
416 				    int unset)
417 {
418 	int flush_max;
419 	struct record_opts *opts = (struct record_opts *)opt->value;
420 	static struct parse_tag tags[] = {
421 			{ .tag  = 'B', .mult = 1       },
422 			{ .tag  = 'K', .mult = 1 << 10 },
423 			{ .tag  = 'M', .mult = 1 << 20 },
424 			{ .tag  = 'G', .mult = 1 << 30 },
425 			{ .tag  = 0 },
426 	};
427 
428 	if (unset)
429 		return 0;
430 
431 	if (str) {
432 		opts->mmap_flush = parse_tag_value(str, tags);
433 		if (opts->mmap_flush == (int)-1)
434 			opts->mmap_flush = strtol(str, NULL, 0);
435 	}
436 
437 	if (!opts->mmap_flush)
438 		opts->mmap_flush = MMAP_FLUSH_DEFAULT;
439 
440 	flush_max = perf_evlist__mmap_size(opts->mmap_pages);
441 	flush_max /= 4;
442 	if (opts->mmap_flush > flush_max)
443 		opts->mmap_flush = flush_max;
444 
445 	return 0;
446 }
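/*
 * Illustrative usage, assuming the usual --mmap-flush option of perf record:
 * `--mmap-flush=16M` (or a plain byte count) sets the minimum number of bytes
 * accumulated in a ring buffer before it is flushed; the parser above caps
 * the value at a quarter of the mmap buffer size.
 */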
447 
448 #ifdef HAVE_ZSTD_SUPPORT
449 static unsigned int comp_level_default = 1;
450 
451 static int record__parse_comp_level(const struct option *opt, const char *str, int unset)
452 {
453 	struct record_opts *opts = opt->value;
454 
455 	if (unset) {
456 		opts->comp_level = 0;
457 	} else {
458 		if (str)
459 			opts->comp_level = strtol(str, NULL, 0);
460 		if (!opts->comp_level)
461 			opts->comp_level = comp_level_default;
462 	}
463 
464 	return 0;
465 }
466 #endif
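/*
 * Illustrative usage, assuming the usual -z/--compression-level option of
 * perf record: a bare `-z` selects comp_level_default (zstd level 1), while
 * an explicit level is expected to stay within comp_level_max (22).
 */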
467 static unsigned int comp_level_max = 22;
468 
469 static int record__comp_enabled(struct record *rec)
470 {
471 	return rec->opts.comp_level > 0;
472 }
473 
474 static int process_synthesized_event(struct perf_tool *tool,
475 				     union perf_event *event,
476 				     struct perf_sample *sample __maybe_unused,
477 				     struct machine *machine __maybe_unused)
478 {
479 	struct record *rec = container_of(tool, struct record, tool);
480 	return record__write(rec, NULL, event, event->header.size);
481 }
482 
483 static int record__pushfn(struct perf_mmap *map, void *to, void *bf, size_t size)
484 {
485 	struct record *rec = to;
486 
487 	if (record__comp_enabled(rec)) {
488 		size = zstd_compress(rec->session, map->data, perf_mmap__mmap_len(map), bf, size);
489 		bf   = map->data;
490 	}
491 
492 	rec->samples++;
493 	return record__write(rec, map, bf, size);
494 }
495 
496 static volatile int done;
497 static volatile int signr = -1;
498 static volatile int child_finished;
499 
500 static void sig_handler(int sig)
501 {
502 	if (sig == SIGCHLD)
503 		child_finished = 1;
504 	else
505 		signr = sig;
506 
507 	done = 1;
508 }
509 
510 static void sigsegv_handler(int sig)
511 {
512 	perf_hooks__recover();
513 	sighandler_dump_stack(sig);
514 }
515 
516 static void record__sig_exit(void)
517 {
518 	if (signr == -1)
519 		return;
520 
521 	signal(signr, SIG_DFL);
522 	raise(signr);
523 }
524 
525 #ifdef HAVE_AUXTRACE_SUPPORT
526 
527 static int record__process_auxtrace(struct perf_tool *tool,
528 				    struct perf_mmap *map,
529 				    union perf_event *event, void *data1,
530 				    size_t len1, void *data2, size_t len2)
531 {
532 	struct record *rec = container_of(tool, struct record, tool);
533 	struct perf_data *data = &rec->data;
534 	size_t padding;
535 	u8 pad[8] = {0};
536 
537 	if (!perf_data__is_pipe(data) && !perf_data__is_dir(data)) {
538 		off_t file_offset;
539 		int fd = perf_data__fd(data);
540 		int err;
541 
542 		file_offset = lseek(fd, 0, SEEK_CUR);
543 		if (file_offset == -1)
544 			return -1;
545 		err = auxtrace_index__auxtrace_event(&rec->session->auxtrace_index,
546 						     event, file_offset);
547 		if (err)
548 			return err;
549 	}
550 
551 	/* event.auxtrace.size includes padding, see __auxtrace_mmap__read() */
552 	padding = (len1 + len2) & 7;
553 	if (padding)
554 		padding = 8 - padding;
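	/* e.g. len1 + len2 == 13: 13 & 7 == 5, so 3 bytes of zero padding keep the stream 8-byte aligned */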
555 
556 	record__write(rec, map, event, event->header.size);
557 	record__write(rec, map, data1, len1);
558 	if (len2)
559 		record__write(rec, map, data2, len2);
560 	record__write(rec, map, &pad, padding);
561 
562 	return 0;
563 }
564 
565 static int record__auxtrace_mmap_read(struct record *rec,
566 				      struct perf_mmap *map)
567 {
568 	int ret;
569 
570 	ret = auxtrace_mmap__read(map, rec->itr, &rec->tool,
571 				  record__process_auxtrace);
572 	if (ret < 0)
573 		return ret;
574 
575 	if (ret)
576 		rec->samples++;
577 
578 	return 0;
579 }
580 
581 static int record__auxtrace_mmap_read_snapshot(struct record *rec,
582 					       struct perf_mmap *map)
583 {
584 	int ret;
585 
586 	ret = auxtrace_mmap__read_snapshot(map, rec->itr, &rec->tool,
587 					   record__process_auxtrace,
588 					   rec->opts.auxtrace_snapshot_size);
589 	if (ret < 0)
590 		return ret;
591 
592 	if (ret)
593 		rec->samples++;
594 
595 	return 0;
596 }
597 
598 static int record__auxtrace_read_snapshot_all(struct record *rec)
599 {
600 	int i;
601 	int rc = 0;
602 
603 	for (i = 0; i < rec->evlist->nr_mmaps; i++) {
604 		struct perf_mmap *map = &rec->evlist->mmap[i];
605 
606 		if (!map->auxtrace_mmap.base)
607 			continue;
608 
609 		if (record__auxtrace_mmap_read_snapshot(rec, map) != 0) {
610 			rc = -1;
611 			goto out;
612 		}
613 	}
614 out:
615 	return rc;
616 }
617 
618 static void record__read_auxtrace_snapshot(struct record *rec, bool on_exit)
619 {
620 	pr_debug("Recording AUX area tracing snapshot\n");
621 	if (record__auxtrace_read_snapshot_all(rec) < 0) {
622 		trigger_error(&auxtrace_snapshot_trigger);
623 	} else {
624 		if (auxtrace_record__snapshot_finish(rec->itr, on_exit))
625 			trigger_error(&auxtrace_snapshot_trigger);
626 		else
627 			trigger_ready(&auxtrace_snapshot_trigger);
628 	}
629 }
630 
631 static int record__auxtrace_snapshot_exit(struct record *rec)
632 {
633 	if (trigger_is_error(&auxtrace_snapshot_trigger))
634 		return 0;
635 
636 	if (!auxtrace_record__snapshot_started &&
637 	    auxtrace_record__snapshot_start(rec->itr))
638 		return -1;
639 
640 	record__read_auxtrace_snapshot(rec, true);
641 	if (trigger_is_error(&auxtrace_snapshot_trigger))
642 		return -1;
643 
644 	return 0;
645 }
646 
647 static int record__auxtrace_init(struct record *rec)
648 {
649 	int err;
650 
651 	if (!rec->itr) {
652 		rec->itr = auxtrace_record__init(rec->evlist, &err);
653 		if (err)
654 			return err;
655 	}
656 
657 	err = auxtrace_parse_snapshot_options(rec->itr, &rec->opts,
658 					      rec->opts.auxtrace_snapshot_opts);
659 	if (err)
660 		return err;
661 
662 	return auxtrace_parse_filters(rec->evlist);
663 }
664 
665 #else
666 
667 static inline
668 int record__auxtrace_mmap_read(struct record *rec __maybe_unused,
669 			       struct perf_mmap *map __maybe_unused)
670 {
671 	return 0;
672 }
673 
674 static inline
675 void record__read_auxtrace_snapshot(struct record *rec __maybe_unused,
676 				    bool on_exit __maybe_unused)
677 {
678 }
679 
680 static inline
681 int auxtrace_record__snapshot_start(struct auxtrace_record *itr __maybe_unused)
682 {
683 	return 0;
684 }
685 
686 static inline
687 int record__auxtrace_snapshot_exit(struct record *rec __maybe_unused)
688 {
689 	return 0;
690 }
691 
692 static int record__auxtrace_init(struct record *rec __maybe_unused)
693 {
694 	return 0;
695 }
696 
697 #endif
698 
699 static int record__mmap_evlist(struct record *rec,
700 			       struct evlist *evlist)
701 {
702 	struct record_opts *opts = &rec->opts;
703 	char msg[512];
704 
705 	if (opts->affinity != PERF_AFFINITY_SYS)
706 		cpu__setup_cpunode_map();
707 
708 	if (perf_evlist__mmap_ex(evlist, opts->mmap_pages,
709 				 opts->auxtrace_mmap_pages,
710 				 opts->auxtrace_snapshot_mode,
711 				 opts->nr_cblocks, opts->affinity,
712 				 opts->mmap_flush, opts->comp_level) < 0) {
713 		if (errno == EPERM) {
714 			pr_err("Permission error mapping pages.\n"
715 			       "Consider increasing "
716 			       "/proc/sys/kernel/perf_event_mlock_kb,\n"
717 			       "or try again with a smaller value of -m/--mmap_pages.\n"
718 			       "(current value: %u,%u)\n",
719 			       opts->mmap_pages, opts->auxtrace_mmap_pages);
720 			return -errno;
721 		} else {
722 			pr_err("failed to mmap with %d (%s)\n", errno,
723 				str_error_r(errno, msg, sizeof(msg)));
724 			if (errno)
725 				return -errno;
726 			else
727 				return -EINVAL;
728 		}
729 	}
730 	return 0;
731 }
732 
733 static int record__mmap(struct record *rec)
734 {
735 	return record__mmap_evlist(rec, rec->evlist);
736 }
737 
738 static int record__open(struct record *rec)
739 {
740 	char msg[BUFSIZ];
741 	struct evsel *pos;
742 	struct evlist *evlist = rec->evlist;
743 	struct perf_session *session = rec->session;
744 	struct record_opts *opts = &rec->opts;
745 	int rc = 0;
746 
747 	/*
748 	 * For initial_delay we need to add a dummy event so that we can track
749 	 * PERF_RECORD_MMAP while we wait for the initial delay to enable the
750 	 * real events, the ones asked for by the user.
751 	 */
752 	if (opts->initial_delay) {
753 		if (perf_evlist__add_dummy(evlist))
754 			return -ENOMEM;
755 
756 		pos = perf_evlist__first(evlist);
757 		pos->tracking = 0;
758 		pos = perf_evlist__last(evlist);
759 		pos->tracking = 1;
760 		pos->core.attr.enable_on_exec = 1;
761 	}
762 
763 	perf_evlist__config(evlist, opts, &callchain_param);
764 
765 	evlist__for_each_entry(evlist, pos) {
766 try_again:
767 		if (evsel__open(pos, pos->core.cpus, pos->core.threads) < 0) {
768 			if (perf_evsel__fallback(pos, errno, msg, sizeof(msg))) {
769 				if (verbose > 0)
770 					ui__warning("%s\n", msg);
771 				goto try_again;
772 			}
773 			if ((errno == EINVAL || errno == EBADF) &&
774 			    pos->leader != pos &&
775 			    pos->weak_group) {
776 				pos = perf_evlist__reset_weak_group(evlist, pos);
777 				goto try_again;
778 			}
779 			rc = -errno;
780 			perf_evsel__open_strerror(pos, &opts->target,
781 						  errno, msg, sizeof(msg));
782 			ui__error("%s\n", msg);
783 			goto out;
784 		}
785 
786 		pos->supported = true;
787 	}
788 
789 	if (perf_evlist__apply_filters(evlist, &pos)) {
790 		pr_err("failed to set filter \"%s\" on event %s with %d (%s)\n",
791 			pos->filter, perf_evsel__name(pos), errno,
792 			str_error_r(errno, msg, sizeof(msg)));
793 		rc = -1;
794 		goto out;
795 	}
796 
797 	rc = record__mmap(rec);
798 	if (rc)
799 		goto out;
800 
801 	session->evlist = evlist;
802 	perf_session__set_id_hdr_size(session);
803 out:
804 	return rc;
805 }
806 
807 static int process_sample_event(struct perf_tool *tool,
808 				union perf_event *event,
809 				struct perf_sample *sample,
810 				struct evsel *evsel,
811 				struct machine *machine)
812 {
813 	struct record *rec = container_of(tool, struct record, tool);
814 
815 	if (rec->evlist->first_sample_time == 0)
816 		rec->evlist->first_sample_time = sample->time;
817 
818 	rec->evlist->last_sample_time = sample->time;
819 
820 	if (rec->buildid_all)
821 		return 0;
822 
823 	rec->samples++;
824 	return build_id__mark_dso_hit(tool, event, sample, evsel, machine);
825 }
826 
827 static int process_buildids(struct record *rec)
828 {
829 	struct perf_session *session = rec->session;
830 
831 	if (perf_data__size(&rec->data) == 0)
832 		return 0;
833 
834 	/*
835 	 * During this process, it'll load the kernel map and replace
836 	 * dso->long_name with the real pathname it found.  In this case
837 	 * we prefer the vmlinux path like
838 	 *   /lib/modules/3.16.4/build/vmlinux
839 	 *
840 	 * rather than build-id path (in debug directory).
841 	 *   $HOME/.debug/.build-id/f0/6e17aa50adf4d00b88925e03775de107611551
842 	 */
843 	symbol_conf.ignore_vmlinux_buildid = true;
844 
845 	/*
846 	 * If --buildid-all is given, it marks all DSO regardless of hits,
847 	 * so no need to process samples. But if timestamp_boundary is enabled,
848 	 * it still needs to walk on all samples to get the timestamps of
849 	 * first/last samples.
850 	 */
851 	if (rec->buildid_all && !rec->timestamp_boundary)
852 		rec->tool.sample = NULL;
853 
854 	return perf_session__process_events(session);
855 }
856 
857 static void perf_event__synthesize_guest_os(struct machine *machine, void *data)
858 {
859 	int err;
860 	struct perf_tool *tool = data;
861 	/*
862 	 * As for the guest kernel, when processing the record & report
863 	 * subcommands, we arrange the module mmaps prior to the guest kernel
864 	 * mmap and trigger a dso preload, because default guest module symbols
865 	 * are loaded from guest kallsyms instead of /lib/modules/XXX/XXX. This
866 	 * avoids missing symbols when the first address is in a module instead
867 	 * of in the guest kernel.
868 	 */
869 	err = perf_event__synthesize_modules(tool, process_synthesized_event,
870 					     machine);
871 	if (err < 0)
872 		pr_err("Couldn't record guest kernel [%d]'s reference"
873 		       " relocation symbol.\n", machine->pid);
874 
875 	/*
876 	 * We use _stext for the guest kernel because the guest kernel's
877 	 * /proc/kallsyms sometimes has no _text.
878 	 */
879 	err = perf_event__synthesize_kernel_mmap(tool, process_synthesized_event,
880 						 machine);
881 	if (err < 0)
882 		pr_err("Couldn't record guest kernel [%d]'s reference"
883 		       " relocation symbol.\n", machine->pid);
884 }
885 
886 static struct perf_event_header finished_round_event = {
887 	.size = sizeof(struct perf_event_header),
888 	.type = PERF_RECORD_FINISHED_ROUND,
889 };
890 
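/*
 * With --affinity=node or --affinity=cpu the recording thread is migrated
 * next to the ring buffer it is about to drain, keeping the copy NUMA and
 * cache local.
 */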
891 static void record__adjust_affinity(struct record *rec, struct perf_mmap *map)
892 {
893 	if (rec->opts.affinity != PERF_AFFINITY_SYS &&
894 	    !CPU_EQUAL(&rec->affinity_mask, &map->affinity_mask)) {
895 		CPU_ZERO(&rec->affinity_mask);
896 		CPU_OR(&rec->affinity_mask, &rec->affinity_mask, &map->affinity_mask);
897 		sched_setaffinity(0, sizeof(rec->affinity_mask), &rec->affinity_mask);
898 	}
899 }
900 
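/*
 * Callback used by zstd_compress_stream_to_records(): the first call
 * (increment == 0) initializes a PERF_RECORD_COMPRESSED header, subsequent
 * calls grow header.size by the size of each compressed frame.
 */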
901 static size_t process_comp_header(void *record, size_t increment)
902 {
903 	struct perf_record_compressed *event = record;
904 	size_t size = sizeof(*event);
905 
906 	if (increment) {
907 		event->header.size += increment;
908 		return increment;
909 	}
910 
911 	event->header.type = PERF_RECORD_COMPRESSED;
912 	event->header.size = size;
913 
914 	return size;
915 }
916 
917 static size_t zstd_compress(struct perf_session *session, void *dst, size_t dst_size,
918 			    void *src, size_t src_size)
919 {
920 	size_t compressed;
921 	size_t max_record_size = PERF_SAMPLE_MAX_SIZE - sizeof(struct perf_record_compressed) - 1;
922 
923 	compressed = zstd_compress_stream_to_records(&session->zstd_data, dst, dst_size, src, src_size,
924 						     max_record_size, process_comp_header);
925 
926 	session->bytes_transferred += src_size;
927 	session->bytes_compressed  += compressed;
928 
929 	return compressed;
930 }
931 
932 static int record__mmap_read_evlist(struct record *rec, struct evlist *evlist,
933 				    bool overwrite, bool synch)
934 {
935 	u64 bytes_written = rec->bytes_written;
936 	int i;
937 	int rc = 0;
938 	struct perf_mmap *maps;
939 	int trace_fd = rec->data.file.fd;
940 	off_t off = 0;
941 
942 	if (!evlist)
943 		return 0;
944 
945 	maps = overwrite ? evlist->overwrite_mmap : evlist->mmap;
946 	if (!maps)
947 		return 0;
948 
949 	if (overwrite && evlist->bkw_mmap_state != BKW_MMAP_DATA_PENDING)
950 		return 0;
951 
952 	if (record__aio_enabled(rec))
953 		off = record__aio_get_pos(trace_fd);
954 
955 	for (i = 0; i < evlist->nr_mmaps; i++) {
956 		u64 flush = 0;
957 		struct perf_mmap *map = &maps[i];
958 
959 		if (map->base) {
960 			record__adjust_affinity(rec, map);
961 			if (synch) {
962 				flush = map->flush;
963 				map->flush = 1;
964 			}
965 			if (!record__aio_enabled(rec)) {
966 				if (perf_mmap__push(map, rec, record__pushfn) < 0) {
967 					if (synch)
968 						map->flush = flush;
969 					rc = -1;
970 					goto out;
971 				}
972 			} else {
973 				if (record__aio_push(rec, map, &off) < 0) {
974 					record__aio_set_pos(trace_fd, off);
975 					if (synch)
976 						map->flush = flush;
977 					rc = -1;
978 					goto out;
979 				}
980 			}
981 			if (synch)
982 				map->flush = flush;
983 		}
984 
985 		if (map->auxtrace_mmap.base && !rec->opts.auxtrace_snapshot_mode &&
986 		    record__auxtrace_mmap_read(rec, map) != 0) {
987 			rc = -1;
988 			goto out;
989 		}
990 	}
991 
992 	if (record__aio_enabled(rec))
993 		record__aio_set_pos(trace_fd, off);
994 
995 	/*
996 	 * Mark the round finished in case we wrote
997 	 * at least one event.
998 	 */
999 	if (bytes_written != rec->bytes_written)
1000 		rc = record__write(rec, NULL, &finished_round_event, sizeof(finished_round_event));
1001 
1002 	if (overwrite)
1003 		perf_evlist__toggle_bkw_mmap(evlist, BKW_MMAP_EMPTY);
1004 out:
1005 	return rc;
1006 }
1007 
1008 static int record__mmap_read_all(struct record *rec, bool synch)
1009 {
1010 	int err;
1011 
1012 	err = record__mmap_read_evlist(rec, rec->evlist, false, synch);
1013 	if (err)
1014 		return err;
1015 
1016 	return record__mmap_read_evlist(rec, rec->evlist, true, synch);
1017 }
1018 
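/*
 * Start from "all header features set" and clear the ones that do not apply
 * to this session: build IDs, tracepoints, branch stack, auxtrace, clockid
 * resolution, directory format, compression and stat data.
 */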
1019 static void record__init_features(struct record *rec)
1020 {
1021 	struct perf_session *session = rec->session;
1022 	int feat;
1023 
1024 	for (feat = HEADER_FIRST_FEATURE; feat < HEADER_LAST_FEATURE; feat++)
1025 		perf_header__set_feat(&session->header, feat);
1026 
1027 	if (rec->no_buildid)
1028 		perf_header__clear_feat(&session->header, HEADER_BUILD_ID);
1029 
1030 	if (!have_tracepoints(&rec->evlist->core.entries))
1031 		perf_header__clear_feat(&session->header, HEADER_TRACING_DATA);
1032 
1033 	if (!rec->opts.branch_stack)
1034 		perf_header__clear_feat(&session->header, HEADER_BRANCH_STACK);
1035 
1036 	if (!rec->opts.full_auxtrace)
1037 		perf_header__clear_feat(&session->header, HEADER_AUXTRACE);
1038 
1039 	if (!(rec->opts.use_clockid && rec->opts.clockid_res_ns))
1040 		perf_header__clear_feat(&session->header, HEADER_CLOCKID);
1041 
1042 	perf_header__clear_feat(&session->header, HEADER_DIR_FORMAT);
1043 	if (!record__comp_enabled(rec))
1044 		perf_header__clear_feat(&session->header, HEADER_COMPRESSED);
1045 
1046 	perf_header__clear_feat(&session->header, HEADER_STAT);
1047 }
1048 
1049 static void
1050 record__finish_output(struct record *rec)
1051 {
1052 	struct perf_data *data = &rec->data;
1053 	int fd = perf_data__fd(data);
1054 
1055 	if (data->is_pipe)
1056 		return;
1057 
1058 	rec->session->header.data_size += rec->bytes_written;
1059 	data->file.size = lseek(perf_data__fd(data), 0, SEEK_CUR);
1060 
1061 	if (!rec->no_buildid) {
1062 		process_buildids(rec);
1063 
1064 		if (rec->buildid_all)
1065 			dsos__hit_all(rec->session);
1066 	}
1067 	perf_session__write_header(rec->session, rec->evlist, fd, true);
1068 
1069 	return;
1070 }
1071 
1072 static int record__synthesize_workload(struct record *rec, bool tail)
1073 {
1074 	int err;
1075 	struct perf_thread_map *thread_map;
1076 
1077 	if (rec->opts.tail_synthesize != tail)
1078 		return 0;
1079 
1080 	thread_map = thread_map__new_by_tid(rec->evlist->workload.pid);
1081 	if (thread_map == NULL)
1082 		return -1;
1083 
1084 	err = perf_event__synthesize_thread_map(&rec->tool, thread_map,
1085 						 process_synthesized_event,
1086 						 &rec->session->machines.host,
1087 						 rec->opts.sample_address);
1088 	perf_thread_map__put(thread_map);
1089 	return err;
1090 }
1091 
1092 static int record__synthesize(struct record *rec, bool tail);
1093 
1094 static int
1095 record__switch_output(struct record *rec, bool at_exit)
1096 {
1097 	struct perf_data *data = &rec->data;
1098 	int fd, err;
1099 	char *new_filename;
1100 
1101 	/* Same size as: "2015122520103046" */
1102 	char timestamp[] = "InvalidTimestamp";
1103 
1104 	record__aio_mmap_read_sync(rec);
1105 
1106 	record__synthesize(rec, true);
1107 	if (target__none(&rec->opts.target))
1108 		record__synthesize_workload(rec, true);
1109 
1110 	rec->samples = 0;
1111 	record__finish_output(rec);
1112 	err = fetch_current_timestamp(timestamp, sizeof(timestamp));
1113 	if (err) {
1114 		pr_err("Failed to get current timestamp\n");
1115 		return -EINVAL;
1116 	}
1117 
1118 	fd = perf_data__switch(data, timestamp,
1119 				    rec->session->header.data_offset,
1120 				    at_exit, &new_filename);
1121 	if (fd >= 0 && !at_exit) {
1122 		rec->bytes_written = 0;
1123 		rec->session->header.data_size = 0;
1124 	}
1125 
1126 	if (!quiet)
1127 		fprintf(stderr, "[ perf record: Dump %s.%s ]\n",
1128 			data->path, timestamp);
1129 
1130 	if (rec->switch_output.num_files) {
1131 		int n = rec->switch_output.cur_file + 1;
1132 
1133 		if (n >= rec->switch_output.num_files)
1134 			n = 0;
1135 		rec->switch_output.cur_file = n;
1136 		if (rec->switch_output.filenames[n]) {
1137 			remove(rec->switch_output.filenames[n]);
1138 			zfree(&rec->switch_output.filenames[n]);
1139 		}
1140 		rec->switch_output.filenames[n] = new_filename;
1141 	} else {
1142 		free(new_filename);
1143 	}
1144 
1145 	/* Output tracking events */
1146 	if (!at_exit) {
1147 		record__synthesize(rec, false);
1148 
1149 		/*
1150 		 * In 'perf record --switch-output' without -a,
1151 		 * record__synthesize() in record__switch_output() won't
1152 		 * generate tracking events because there's no thread_map
1153 		 * in evlist. As a result, the newly created perf.data doesn't
1154 		 * contain map and comm information.
1155 		 * Create a fake thread_map and directly call
1156 		 * perf_event__synthesize_thread_map() for those events.
1157 		 */
1158 		if (target__none(&rec->opts.target))
1159 			record__synthesize_workload(rec, false);
1160 	}
1161 	return fd;
1162 }
1163 
1164 static volatile int workload_exec_errno;
1165 
1166 /*
1167  * perf_evlist__prepare_workload will send a SIGUSR1
1168  * if the fork fails, since we asked for it by setting its
1169  * want_signal to true.
1170  */
1171 static void workload_exec_failed_signal(int signo __maybe_unused,
1172 					siginfo_t *info,
1173 					void *ucontext __maybe_unused)
1174 {
1175 	workload_exec_errno = info->si_value.sival_int;
1176 	done = 1;
1177 	child_finished = 1;
1178 }
1179 
1180 static void snapshot_sig_handler(int sig);
1181 static void alarm_sig_handler(int sig);
1182 
1183 int __weak
1184 perf_event__synth_time_conv(const struct perf_event_mmap_page *pc __maybe_unused,
1185 			    struct perf_tool *tool __maybe_unused,
1186 			    perf_event__handler_t process __maybe_unused,
1187 			    struct machine *machine __maybe_unused)
1188 {
1189 	return 0;
1190 }
1191 
1192 static const struct perf_event_mmap_page *
1193 perf_evlist__pick_pc(struct evlist *evlist)
1194 {
1195 	if (evlist) {
1196 		if (evlist->mmap && evlist->mmap[0].base)
1197 			return evlist->mmap[0].base;
1198 		if (evlist->overwrite_mmap && evlist->overwrite_mmap[0].base)
1199 			return evlist->overwrite_mmap[0].base;
1200 	}
1201 	return NULL;
1202 }
1203 
1204 static const struct perf_event_mmap_page *record__pick_pc(struct record *rec)
1205 {
1206 	const struct perf_event_mmap_page *pc;
1207 
1208 	pc = perf_evlist__pick_pc(rec->evlist);
1209 	if (pc)
1210 		return pc;
1211 	return NULL;
1212 }
1213 
1214 static int record__synthesize(struct record *rec, bool tail)
1215 {
1216 	struct perf_session *session = rec->session;
1217 	struct machine *machine = &session->machines.host;
1218 	struct perf_data *data = &rec->data;
1219 	struct record_opts *opts = &rec->opts;
1220 	struct perf_tool *tool = &rec->tool;
1221 	int fd = perf_data__fd(data);
1222 	int err = 0;
1223 
1224 	if (rec->opts.tail_synthesize != tail)
1225 		return 0;
1226 
1227 	if (data->is_pipe) {
1228 		/*
1229 		 * We need to synthesize events first, because some
1230 		 * features work on top of them (on the report side).
1231 		 */
1232 		err = perf_event__synthesize_attrs(tool, rec->evlist,
1233 						   process_synthesized_event);
1234 		if (err < 0) {
1235 			pr_err("Couldn't synthesize attrs.\n");
1236 			goto out;
1237 		}
1238 
1239 		err = perf_event__synthesize_features(tool, session, rec->evlist,
1240 						      process_synthesized_event);
1241 		if (err < 0) {
1242 			pr_err("Couldn't synthesize features.\n");
1243 			return err;
1244 		}
1245 
1246 		if (have_tracepoints(&rec->evlist->core.entries)) {
1247 			/*
1248 			 * FIXME err <= 0 here actually means that
1249 			 * there were no tracepoints, so it's not really
1250 			 * an error, just that we don't need to
1251 			 * synthesize anything.  We really have to
1252 			 * return this more properly and also
1253 			 * propagate the errors that currently end up calling die().
1254 			 */
1255 			err = perf_event__synthesize_tracing_data(tool, fd, rec->evlist,
1256 								  process_synthesized_event);
1257 			if (err <= 0) {
1258 				pr_err("Couldn't record tracing data.\n");
1259 				goto out;
1260 			}
1261 			rec->bytes_written += err;
1262 		}
1263 	}
1264 
1265 	err = perf_event__synth_time_conv(record__pick_pc(rec), tool,
1266 					  process_synthesized_event, machine);
1267 	if (err)
1268 		goto out;
1269 
1270 	if (rec->opts.full_auxtrace) {
1271 		err = perf_event__synthesize_auxtrace_info(rec->itr, tool,
1272 					session, process_synthesized_event);
1273 		if (err)
1274 			goto out;
1275 	}
1276 
1277 	if (!perf_evlist__exclude_kernel(rec->evlist)) {
1278 		err = perf_event__synthesize_kernel_mmap(tool, process_synthesized_event,
1279 							 machine);
1280 		WARN_ONCE(err < 0, "Couldn't record kernel reference relocation symbol\n"
1281 				   "Symbol resolution may be skewed if relocation was used (e.g. kexec).\n"
1282 				   "Check /proc/kallsyms permission or run as root.\n");
1283 
1284 		err = perf_event__synthesize_modules(tool, process_synthesized_event,
1285 						     machine);
1286 		WARN_ONCE(err < 0, "Couldn't record kernel module information.\n"
1287 				   "Symbol resolution may be skewed if relocation was used (e.g. kexec).\n"
1288 				   "Check /proc/modules permission or run as root.\n");
1289 	}
1290 
1291 	if (perf_guest) {
1292 		machines__process_guests(&session->machines,
1293 					 perf_event__synthesize_guest_os, tool);
1294 	}
1295 
1296 	err = perf_event__synthesize_extra_attr(&rec->tool,
1297 						rec->evlist,
1298 						process_synthesized_event,
1299 						data->is_pipe);
1300 	if (err)
1301 		goto out;
1302 
1303 	err = perf_event__synthesize_thread_map2(&rec->tool, rec->evlist->core.threads,
1304 						 process_synthesized_event,
1305 						NULL);
1306 	if (err < 0) {
1307 		pr_err("Couldn't synthesize thread map.\n");
1308 		return err;
1309 	}
1310 
1311 	err = perf_event__synthesize_cpu_map(&rec->tool, rec->evlist->core.cpus,
1312 					     process_synthesized_event, NULL);
1313 	if (err < 0) {
1314 		pr_err("Couldn't synthesize cpu map.\n");
1315 		return err;
1316 	}
1317 
1318 	err = perf_event__synthesize_bpf_events(session, process_synthesized_event,
1319 						machine, opts);
1320 	if (err < 0)
1321 		pr_warning("Couldn't synthesize bpf events.\n");
1322 
1323 	err = __machine__synthesize_threads(machine, tool, &opts->target, rec->evlist->core.threads,
1324 					    process_synthesized_event, opts->sample_address,
1325 					    1);
1326 out:
1327 	return err;
1328 }
1329 
1330 static int __cmd_record(struct record *rec, int argc, const char **argv)
1331 {
1332 	int err;
1333 	int status = 0;
1334 	unsigned long waking = 0;
1335 	const bool forks = argc > 0;
1336 	struct perf_tool *tool = &rec->tool;
1337 	struct record_opts *opts = &rec->opts;
1338 	struct perf_data *data = &rec->data;
1339 	struct perf_session *session;
1340 	bool disabled = false, draining = false;
1341 	struct evlist *sb_evlist = NULL;
1342 	int fd;
1343 	float ratio = 0;
1344 
1345 	atexit(record__sig_exit);
1346 	signal(SIGCHLD, sig_handler);
1347 	signal(SIGINT, sig_handler);
1348 	signal(SIGTERM, sig_handler);
1349 	signal(SIGSEGV, sigsegv_handler);
1350 
1351 	if (rec->opts.record_namespaces)
1352 		tool->namespace_events = true;
1353 
1354 	if (rec->opts.auxtrace_snapshot_mode || rec->switch_output.enabled) {
1355 		signal(SIGUSR2, snapshot_sig_handler);
1356 		if (rec->opts.auxtrace_snapshot_mode)
1357 			trigger_on(&auxtrace_snapshot_trigger);
1358 		if (rec->switch_output.enabled)
1359 			trigger_on(&switch_output_trigger);
1360 	} else {
1361 		signal(SIGUSR2, SIG_IGN);
1362 	}
1363 
1364 	session = perf_session__new(data, false, tool);
1365 	if (session == NULL) {
1366 		pr_err("Perf session creation failed.\n");
1367 		return -1;
1368 	}
1369 
1370 	fd = perf_data__fd(data);
1371 	rec->session = session;
1372 
1373 	if (zstd_init(&session->zstd_data, rec->opts.comp_level) < 0) {
1374 		pr_err("Compression initialization failed.\n");
1375 		return -1;
1376 	}
1377 
1378 	session->header.env.comp_type  = PERF_COMP_ZSTD;
1379 	session->header.env.comp_level = rec->opts.comp_level;
1380 
1381 	record__init_features(rec);
1382 
1383 	if (rec->opts.use_clockid && rec->opts.clockid_res_ns)
1384 		session->header.env.clockid_res_ns = rec->opts.clockid_res_ns;
1385 
1386 	if (forks) {
1387 		err = perf_evlist__prepare_workload(rec->evlist, &opts->target,
1388 						    argv, data->is_pipe,
1389 						    workload_exec_failed_signal);
1390 		if (err < 0) {
1391 			pr_err("Couldn't run the workload!\n");
1392 			status = err;
1393 			goto out_delete_session;
1394 		}
1395 	}
1396 
1397 	/*
1398 	 * If we have just a single event and are sending data
1399 	 * through a pipe, we need to force ID allocation,
1400 	 * because we synthesize the event name through the pipe
1401 	 * and need the ID for that.
1402 	 */
1403 	if (data->is_pipe && rec->evlist->core.nr_entries == 1)
1404 		rec->opts.sample_id = true;
1405 
1406 	if (record__open(rec) != 0) {
1407 		err = -1;
1408 		goto out_child;
1409 	}
1410 	session->header.env.comp_mmap_len = session->evlist->mmap_len;
1411 
1412 	err = bpf__apply_obj_config();
1413 	if (err) {
1414 		char errbuf[BUFSIZ];
1415 
1416 		bpf__strerror_apply_obj_config(err, errbuf, sizeof(errbuf));
1417 		pr_err("ERROR: Apply config to BPF failed: %s\n",
1418 			 errbuf);
1419 		goto out_child;
1420 	}
1421 
1422 	/*
1423 	 * Normally perf_session__new would do this, but it doesn't have the
1424 	 * evlist.
1425 	 */
1426 	if (rec->tool.ordered_events && !perf_evlist__sample_id_all(rec->evlist)) {
1427 		pr_warning("WARNING: No sample_id_all support, falling back to unordered processing\n");
1428 		rec->tool.ordered_events = false;
1429 	}
1430 
1431 	if (!rec->evlist->nr_groups)
1432 		perf_header__clear_feat(&session->header, HEADER_GROUP_DESC);
1433 
1434 	if (data->is_pipe) {
1435 		err = perf_header__write_pipe(fd);
1436 		if (err < 0)
1437 			goto out_child;
1438 	} else {
1439 		err = perf_session__write_header(session, rec->evlist, fd, false);
1440 		if (err < 0)
1441 			goto out_child;
1442 	}
1443 
1444 	if (!rec->no_buildid
1445 	    && !perf_header__has_feat(&session->header, HEADER_BUILD_ID)) {
1446 		pr_err("Couldn't generate buildids. "
1447 		       "Use --no-buildid to profile anyway.\n");
1448 		err = -1;
1449 		goto out_child;
1450 	}
1451 
1452 	if (!opts->no_bpf_event)
1453 		bpf_event__add_sb_event(&sb_evlist, &session->header.env);
1454 
1455 	if (perf_evlist__start_sb_thread(sb_evlist, &rec->opts.target)) {
1456 		pr_debug("Couldn't start the BPF side band thread:\nBPF programs starting from now on won't be annotatable\n");
1457 		opts->no_bpf_event = true;
1458 	}
1459 
1460 	err = record__synthesize(rec, false);
1461 	if (err < 0)
1462 		goto out_child;
1463 
1464 	if (rec->realtime_prio) {
1465 		struct sched_param param;
1466 
1467 		param.sched_priority = rec->realtime_prio;
1468 		if (sched_setscheduler(0, SCHED_FIFO, &param)) {
1469 			pr_err("Could not set realtime priority.\n");
1470 			err = -1;
1471 			goto out_child;
1472 		}
1473 	}
1474 
1475 	/*
1476 	 * When perf is starting the traced process, all the events
1477 	 * (apart from group members) have enable_on_exec=1 set,
1478 	 * so don't spoil it by prematurely enabling them.
1479 	 */
1480 	if (!target__none(&opts->target) && !opts->initial_delay)
1481 		evlist__enable(rec->evlist);
1482 
1483 	/*
1484 	 * Let the child rip
1485 	 */
1486 	if (forks) {
1487 		struct machine *machine = &session->machines.host;
1488 		union perf_event *event;
1489 		pid_t tgid;
1490 
1491 		event = malloc(sizeof(event->comm) + machine->id_hdr_size);
1492 		if (event == NULL) {
1493 			err = -ENOMEM;
1494 			goto out_child;
1495 		}
1496 
1497 		/*
1498 		 * Some H/W events are generated before the COMM event,
1499 		 * which is emitted during exec(), so perf script
1500 		 * cannot see a correct process name for those events.
1501 		 * Synthesize a COMM event to prevent it.
1502 		 */
1503 		tgid = perf_event__synthesize_comm(tool, event,
1504 						   rec->evlist->workload.pid,
1505 						   process_synthesized_event,
1506 						   machine);
1507 		free(event);
1508 
1509 		if (tgid == -1)
1510 			goto out_child;
1511 
1512 		event = malloc(sizeof(event->namespaces) +
1513 			       (NR_NAMESPACES * sizeof(struct perf_ns_link_info)) +
1514 			       machine->id_hdr_size);
1515 		if (event == NULL) {
1516 			err = -ENOMEM;
1517 			goto out_child;
1518 		}
1519 
1520 		/*
1521 		 * Synthesize NAMESPACES event for the command specified.
1522 		 */
1523 		perf_event__synthesize_namespaces(tool, event,
1524 						  rec->evlist->workload.pid,
1525 						  tgid, process_synthesized_event,
1526 						  machine);
1527 		free(event);
1528 
1529 		perf_evlist__start_workload(rec->evlist);
1530 	}
1531 
1532 	if (opts->initial_delay) {
1533 		usleep(opts->initial_delay * USEC_PER_MSEC);
1534 		evlist__enable(rec->evlist);
1535 	}
1536 
1537 	trigger_ready(&auxtrace_snapshot_trigger);
1538 	trigger_ready(&switch_output_trigger);
1539 	perf_hooks__invoke_record_start();
1540 	for (;;) {
1541 		unsigned long long hits = rec->samples;
1542 
1543 		/*
1544 		 * rec->evlist->bkw_mmap_state can be
1545 		 * BKW_MMAP_EMPTY here: when done == true and
1546 		 * hits != rec->samples in the previous round.
1547 		 *
1548 		 * perf_evlist__toggle_bkw_mmap ensures we never
1549 		 * convert BKW_MMAP_EMPTY to BKW_MMAP_DATA_PENDING.
1550 		 */
1551 		if (trigger_is_hit(&switch_output_trigger) || done || draining)
1552 			perf_evlist__toggle_bkw_mmap(rec->evlist, BKW_MMAP_DATA_PENDING);
1553 
1554 		if (record__mmap_read_all(rec, false) < 0) {
1555 			trigger_error(&auxtrace_snapshot_trigger);
1556 			trigger_error(&switch_output_trigger);
1557 			err = -1;
1558 			goto out_child;
1559 		}
1560 
1561 		if (auxtrace_record__snapshot_started) {
1562 			auxtrace_record__snapshot_started = 0;
1563 			if (!trigger_is_error(&auxtrace_snapshot_trigger))
1564 				record__read_auxtrace_snapshot(rec, false);
1565 			if (trigger_is_error(&auxtrace_snapshot_trigger)) {
1566 				pr_err("AUX area tracing snapshot failed\n");
1567 				err = -1;
1568 				goto out_child;
1569 			}
1570 		}
1571 
1572 		if (trigger_is_hit(&switch_output_trigger)) {
1573 			/*
1574 			 * If switch_output_trigger is hit, the data in
1575 			 * the overwritable ring buffer should have been collected,
1576 			 * so bkw_mmap_state should be set to BKW_MMAP_EMPTY.
1577 			 *
1578 			 * If SIGUSR2 was raised after or during record__mmap_read_all(),
1579 			 * record__mmap_read_all() didn't collect data from the
1580 			 * overwritable ring buffer. Read again.
1581 			 */
1582 			if (rec->evlist->bkw_mmap_state == BKW_MMAP_RUNNING)
1583 				continue;
1584 			trigger_ready(&switch_output_trigger);
1585 
1586 			/*
1587 			 * Re-enable events in the overwrite ring buffer after
1588 			 * record__mmap_read_all(): we should have collected
1589 			 * data from it.
1590 			 */
1591 			perf_evlist__toggle_bkw_mmap(rec->evlist, BKW_MMAP_RUNNING);
1592 
1593 			if (!quiet)
1594 				fprintf(stderr, "[ perf record: dump data: Woken up %ld times ]\n",
1595 					waking);
1596 			waking = 0;
1597 			fd = record__switch_output(rec, false);
1598 			if (fd < 0) {
1599 				pr_err("Failed to switch to new file\n");
1600 				trigger_error(&switch_output_trigger);
1601 				err = fd;
1602 				goto out_child;
1603 			}
1604 
1605 			/* re-arm the alarm */
1606 			if (rec->switch_output.time)
1607 				alarm(rec->switch_output.time);
1608 		}
1609 
1610 		if (hits == rec->samples) {
1611 			if (done || draining)
1612 				break;
1613 			err = perf_evlist__poll(rec->evlist, -1);
1614 			/*
1615 			 * Propagate the error only if there is one. Ignore a positive
1616 			 * number of returned events and interrupt errors (EINTR).
1617 			 */
1618 			if (err > 0 || (err < 0 && errno == EINTR))
1619 				err = 0;
1620 			waking++;
1621 
1622 			if (perf_evlist__filter_pollfd(rec->evlist, POLLERR | POLLHUP) == 0)
1623 				draining = true;
1624 		}
1625 
1626 		/*
1627 		 * When perf has started the traced process, the events die with it
1628 		 * in the end and we wait for that, so there is no need to
1629 		 * disable events in this case.
1630 		 */
1631 		if (done && !disabled && !target__none(&opts->target)) {
1632 			trigger_off(&auxtrace_snapshot_trigger);
1633 			evlist__disable(rec->evlist);
1634 			disabled = true;
1635 		}
1636 	}
1637 
1638 	trigger_off(&auxtrace_snapshot_trigger);
1639 	trigger_off(&switch_output_trigger);
1640 
1641 	if (opts->auxtrace_snapshot_on_exit)
1642 		record__auxtrace_snapshot_exit(rec);
1643 
1644 	if (forks && workload_exec_errno) {
1645 		char msg[STRERR_BUFSIZE];
1646 		const char *emsg = str_error_r(workload_exec_errno, msg, sizeof(msg));
1647 		pr_err("Workload failed: %s\n", emsg);
1648 		err = -1;
1649 		goto out_child;
1650 	}
1651 
1652 	if (!quiet)
1653 		fprintf(stderr, "[ perf record: Woken up %ld times to write data ]\n", waking);
1654 
1655 	if (target__none(&rec->opts.target))
1656 		record__synthesize_workload(rec, true);
1657 
1658 out_child:
1659 	record__mmap_read_all(rec, true);
1660 	record__aio_mmap_read_sync(rec);
1661 
1662 	if (rec->session->bytes_transferred && rec->session->bytes_compressed) {
1663 		ratio = (float)rec->session->bytes_transferred/(float)rec->session->bytes_compressed;
1664 		session->header.env.comp_ratio = ratio + 0.5;
1665 	}
1666 
1667 	if (forks) {
1668 		int exit_status;
1669 
1670 		if (!child_finished)
1671 			kill(rec->evlist->workload.pid, SIGTERM);
1672 
1673 		wait(&exit_status);
1674 
1675 		if (err < 0)
1676 			status = err;
1677 		else if (WIFEXITED(exit_status))
1678 			status = WEXITSTATUS(exit_status);
1679 		else if (WIFSIGNALED(exit_status))
1680 			signr = WTERMSIG(exit_status);
1681 	} else
1682 		status = err;
1683 
1684 	record__synthesize(rec, true);
1685 	/* this will be recalculated during process_buildids() */
1686 	rec->samples = 0;
1687 
1688 	if (!err) {
1689 		if (!rec->timestamp_filename) {
1690 			record__finish_output(rec);
1691 		} else {
1692 			fd = record__switch_output(rec, true);
1693 			if (fd < 0) {
1694 				status = fd;
1695 				goto out_delete_session;
1696 			}
1697 		}
1698 	}
1699 
1700 	perf_hooks__invoke_record_end();
1701 
1702 	if (!err && !quiet) {
1703 		char samples[128];
1704 		const char *postfix = rec->timestamp_filename ?
1705 					".<timestamp>" : "";
1706 
1707 		if (rec->samples && !rec->opts.full_auxtrace)
1708 			scnprintf(samples, sizeof(samples),
1709 				  " (%" PRIu64 " samples)", rec->samples);
1710 		else
1711 			samples[0] = '\0';
1712 
1713 		fprintf(stderr,	"[ perf record: Captured and wrote %.3f MB %s%s%s",
1714 			perf_data__size(data) / 1024.0 / 1024.0,
1715 			data->path, postfix, samples);
1716 		if (ratio) {
1717 			fprintf(stderr,	", compressed (original %.3f MB, ratio is %.3f)",
1718 					rec->session->bytes_transferred / 1024.0 / 1024.0,
1719 					ratio);
1720 		}
1721 		fprintf(stderr, " ]\n");
1722 	}
1723 
1724 out_delete_session:
1725 	zstd_fini(&session->zstd_data);
1726 	perf_session__delete(session);
1727 
1728 	if (!opts->no_bpf_event)
1729 		perf_evlist__stop_sb_thread(sb_evlist);
1730 	return status;
1731 }
1732 
1733 static void callchain_debug(struct callchain_param *callchain)
1734 {
1735 	static const char *str[CALLCHAIN_MAX] = { "NONE", "FP", "DWARF", "LBR" };
1736 
1737 	pr_debug("callchain: type %s\n", str[callchain->record_mode]);
1738 
1739 	if (callchain->record_mode == CALLCHAIN_DWARF)
1740 		pr_debug("callchain: stack dump size %d\n",
1741 			 callchain->dump_size);
1742 }
1743 
1744 int record_opts__parse_callchain(struct record_opts *record,
1745 				 struct callchain_param *callchain,
1746 				 const char *arg, bool unset)
1747 {
1748 	int ret;
1749 	callchain->enabled = !unset;
1750 
1751 	/* --no-call-graph */
1752 	if (unset) {
1753 		callchain->record_mode = CALLCHAIN_NONE;
1754 		pr_debug("callchain: disabled\n");
1755 		return 0;
1756 	}
1757 
1758 	ret = parse_callchain_record_opt(arg, callchain);
1759 	if (!ret) {
1760 		/* Enable data address sampling for DWARF unwind. */
1761 		if (callchain->record_mode == CALLCHAIN_DWARF)
1762 			record->sample_address = true;
1763 		callchain_debug(callchain);
1764 	}
1765 
1766 	return ret;
1767 }
1768 
1769 int record_parse_callchain_opt(const struct option *opt,
1770 			       const char *arg,
1771 			       int unset)
1772 {
1773 	return record_opts__parse_callchain(opt->value, &callchain_param, arg, unset);
1774 }
1775 
1776 int record_callchain_opt(const struct option *opt,
1777 			 const char *arg __maybe_unused,
1778 			 int unset __maybe_unused)
1779 {
1780 	struct callchain_param *callchain = opt->value;
1781 
1782 	callchain->enabled = true;
1783 
1784 	if (callchain->record_mode == CALLCHAIN_NONE)
1785 		callchain->record_mode = CALLCHAIN_FP;
1786 
1787 	callchain_debug(callchain);
1788 	return 0;
1789 }
1790 
1791 static int perf_record_config(const char *var, const char *value, void *cb)
1792 {
1793 	struct record *rec = cb;
1794 
1795 	if (!strcmp(var, "record.build-id")) {
1796 		if (!strcmp(value, "cache"))
1797 			rec->no_buildid_cache = false;
1798 		else if (!strcmp(value, "no-cache"))
1799 			rec->no_buildid_cache = true;
1800 		else if (!strcmp(value, "skip"))
1801 			rec->no_buildid = true;
1802 		else
1803 			return -1;
1804 		return 0;
1805 	}
1806 	if (!strcmp(var, "record.call-graph")) {
1807 		var = "call-graph.record-mode";
1808 		return perf_default_config(var, value, cb);
1809 	}
1810 #ifdef HAVE_AIO_SUPPORT
1811 	if (!strcmp(var, "record.aio")) {
1812 		rec->opts.nr_cblocks = strtol(value, NULL, 0);
1813 		if (!rec->opts.nr_cblocks)
1814 			rec->opts.nr_cblocks = nr_cblocks_default;
1815 	}
1816 #endif
1817 
1818 	return 0;
1819 }
1820 
1821 struct clockid_map {
1822 	const char *name;
1823 	int clockid;
1824 };
1825 
1826 #define CLOCKID_MAP(n, c)	\
1827 	{ .name = n, .clockid = (c), }
1828 
1829 #define CLOCKID_END	{ .name = NULL, }
1830 
1831 
1832 /*
1833  * Add the missing ones, we need to build on many distros...
1834  */
1835 #ifndef CLOCK_MONOTONIC_RAW
1836 #define CLOCK_MONOTONIC_RAW 4
1837 #endif
1838 #ifndef CLOCK_BOOTTIME
1839 #define CLOCK_BOOTTIME 7
1840 #endif
1841 #ifndef CLOCK_TAI
1842 #define CLOCK_TAI 11
1843 #endif
1844 
1845 static const struct clockid_map clockids[] = {
1846 	/* available for all events, NMI safe */
1847 	CLOCKID_MAP("monotonic", CLOCK_MONOTONIC),
1848 	CLOCKID_MAP("monotonic_raw", CLOCK_MONOTONIC_RAW),
1849 
1850 	/* available for some events */
1851 	CLOCKID_MAP("realtime", CLOCK_REALTIME),
1852 	CLOCKID_MAP("boottime", CLOCK_BOOTTIME),
1853 	CLOCKID_MAP("tai", CLOCK_TAI),
1854 
1855 	/* available for the lazy */
1856 	CLOCKID_MAP("mono", CLOCK_MONOTONIC),
1857 	CLOCKID_MAP("raw", CLOCK_MONOTONIC_RAW),
1858 	CLOCKID_MAP("real", CLOCK_REALTIME),
1859 	CLOCKID_MAP("boot", CLOCK_BOOTTIME),
1860 
1861 	CLOCKID_END,
1862 };
1863 
1864 static int get_clockid_res(clockid_t clk_id, u64 *res_ns)
1865 {
1866 	struct timespec res;
1867 
1868 	*res_ns = 0;
1869 	if (!clock_getres(clk_id, &res))
1870 		*res_ns = res.tv_nsec + res.tv_sec * NSEC_PER_SEC;
1871 	else
1872 		pr_warning("WARNING: Failed to determine specified clock resolution.\n");
1873 
1874 	return 0;
1875 }
1876 
1877 static int parse_clockid(const struct option *opt, const char *str, int unset)
1878 {
1879 	struct record_opts *opts = (struct record_opts *)opt->value;
1880 	const struct clockid_map *cm;
1881 	const char *ostr = str;
1882 
1883 	if (unset) {
1884 		opts->use_clockid = 0;
1885 		return 0;
1886 	}
1887 
1888 	/* no arg passed */
1889 	if (!str)
1890 		return 0;
1891 
1892 	/* no setting it twice */
1893 	if (opts->use_clockid)
1894 		return -1;
1895 
1896 	opts->use_clockid = true;
1897 
1898 	/* if it's a number, we're done */
1899 	if (sscanf(str, "%d", &opts->clockid) == 1)
1900 		return get_clockid_res(opts->clockid, &opts->clockid_res_ns);
1901 
1902 	/* allow a "CLOCK_" prefix to the name */
1903 	if (!strncasecmp(str, "CLOCK_", 6))
1904 		str += 6;
1905 
1906 	for (cm = clockids; cm->name; cm++) {
1907 		if (!strcasecmp(str, cm->name)) {
1908 			opts->clockid = cm->clockid;
1909 			return get_clockid_res(opts->clockid,
1910 					       &opts->clockid_res_ns);
1911 		}
1912 	}
1913 
1914 	opts->use_clockid = false;
1915 	ui__warning("unknown clockid %s, check man page\n", ostr);
1916 	return -1;
1917 }
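/*
 * Illustrative usage, assuming the usual -k/--clockid option of perf record:
 * `-k mono`, `-k CLOCK_MONOTONIC_RAW` or a raw numeric clockid are all
 * accepted by the parser above.
 */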
1918 
1919 static int record__parse_affinity(const struct option *opt, const char *str, int unset)
1920 {
1921 	struct record_opts *opts = (struct record_opts *)opt->value;
1922 
1923 	if (unset || !str)
1924 		return 0;
1925 
1926 	if (!strcasecmp(str, "node"))
1927 		opts->affinity = PERF_AFFINITY_NODE;
1928 	else if (!strcasecmp(str, "cpu"))
1929 		opts->affinity = PERF_AFFINITY_CPU;
1930 
1931 	return 0;
1932 }
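/*
 * Illustrative usage, assuming the usual --affinity option of perf record:
 * `--affinity=node` or `--affinity=cpu` select the corresponding mode;
 * anything else keeps the PERF_AFFINITY_SYS default.
 */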
1933 
1934 static int record__parse_mmap_pages(const struct option *opt,
1935 				    const char *str,
1936 				    int unset __maybe_unused)
1937 {
1938 	struct record_opts *opts = opt->value;
1939 	char *s, *p;
1940 	unsigned int mmap_pages;
1941 	int ret;
1942 
1943 	if (!str)
1944 		return -EINVAL;
1945 
1946 	s = strdup(str);
1947 	if (!s)
1948 		return -ENOMEM;
1949 
1950 	p = strchr(s, ',');
1951 	if (p)
1952 		*p = '\0';
1953 
1954 	if (*s) {
1955 		ret = __perf_evlist__parse_mmap_pages(&mmap_pages, s);
1956 		if (ret)
1957 			goto out_free;
1958 		opts->mmap_pages = mmap_pages;
1959 	}
1960 
1961 	if (!p) {
1962 		ret = 0;
1963 		goto out_free;
1964 	}
1965 
1966 	ret = __perf_evlist__parse_mmap_pages(&mmap_pages, p + 1);
1967 	if (ret)
1968 		goto out_free;
1969 
1970 	opts->auxtrace_mmap_pages = mmap_pages;
1971 
1972 out_free:
1973 	free(s);
1974 	return ret;
1975 }
1976 
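/*
 * Warn when the --switch-output size threshold is below half of the mmap
 * buffer size (the wakeup size): output is only switched after data has
 * been written out, so the resulting perf.data files can be noticeably
 * larger than the requested threshold.
 */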
1977 static void switch_output_size_warn(struct record *rec)
1978 {
1979 	u64 wakeup_size = perf_evlist__mmap_size(rec->opts.mmap_pages);
1980 	struct switch_output *s = &rec->switch_output;
1981 
1982 	wakeup_size /= 2;
1983 
1984 	if (s->size < wakeup_size) {
1985 		char buf[100];
1986 
1987 		unit_number__scnprintf(buf, sizeof(buf), wakeup_size);
1988 		pr_warning("WARNING: switch-output data size lower than "
1989 			   "wakeup kernel buffer size (%s), "
1990 			   "expect bigger perf.data sizes\n", buf);
1991 	}
1992 }
1993 
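/*
 * --switch-output accepts "signal" (rotate on SIGUSR2), a size with a
 * B/K/M/G suffix, or a time with an s/m/h/d suffix, e.g.
 * "--switch-output=100M" or "--switch-output=30s" (illustrative values
 * only).  Enabling it also turns on timestamped output filenames.
 */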
1994 static int switch_output_setup(struct record *rec)
1995 {
1996 	struct switch_output *s = &rec->switch_output;
1997 	static struct parse_tag tags_size[] = {
1998 		{ .tag  = 'B', .mult = 1       },
1999 		{ .tag  = 'K', .mult = 1 << 10 },
2000 		{ .tag  = 'M', .mult = 1 << 20 },
2001 		{ .tag  = 'G', .mult = 1 << 30 },
2002 		{ .tag  = 0 },
2003 	};
2004 	static struct parse_tag tags_time[] = {
2005 		{ .tag  = 's', .mult = 1        },
2006 		{ .tag  = 'm', .mult = 60       },
2007 		{ .tag  = 'h', .mult = 60*60    },
2008 		{ .tag  = 'd', .mult = 60*60*24 },
2009 		{ .tag  = 0 },
2010 	};
2011 	unsigned long val;
2012 
2013 	if (!s->set)
2014 		return 0;
2015 
2016 	if (!strcmp(s->str, "signal")) {
2017 		s->signal = true;
2018 		pr_debug("switch-output with SIGUSR2 signal\n");
2019 		goto enabled;
2020 	}
2021 
2022 	val = parse_tag_value(s->str, tags_size);
2023 	if (val != (unsigned long) -1) {
2024 		s->size = val;
2025 		pr_debug("switch-output with %s size threshold\n", s->str);
2026 		goto enabled;
2027 	}
2028 
2029 	val = parse_tag_value(s->str, tags_time);
2030 	if (val != (unsigned long) -1) {
2031 		s->time = val;
2032 		pr_debug("switch-output with %s time threshold (%lu seconds)\n",
2033 			 s->str, s->time);
2034 		goto enabled;
2035 	}
2036 
2037 	return -1;
2038 
2039 enabled:
2040 	rec->timestamp_filename = true;
2041 	s->enabled              = true;
2042 
2043 	if (s->size && !rec->opts.no_buffering)
2044 		switch_output_size_warn(rec);
2045 
2046 	return 0;
2047 }
2048 
2049 static const char * const __record_usage[] = {
2050 	"perf record [<options>] [<command>]",
2051 	"perf record [<options>] -- <command> [<options>]",
2052 	NULL
2053 };
2054 const char * const *record_usage = __record_usage;
2055 
2056 /*
2057  * XXX Ideally this would be local to cmd_record() and passed to a record__new(),
2058  * because we need access to it in record__exit(), which is called after
2059  * cmd_record() exits, but since record_options needs to be accessible to
2060  * builtin-script, leave it here.
2061  *
2062  * At least we don't touch it in all the other functions here directly.
2063  *
2064  * Just say no to tons of global variables, sigh.
2065  */
2066 static struct record record = {
2067 	.opts = {
2068 		.sample_time	     = true,
2069 		.mmap_pages	     = UINT_MAX,
2070 		.user_freq	     = UINT_MAX,
2071 		.user_interval	     = ULLONG_MAX,
2072 		.freq		     = 4000,
2073 		.target		     = {
2074 			.uses_mmap   = true,
2075 			.default_per_cpu = true,
2076 		},
2077 		.mmap_flush          = MMAP_FLUSH_DEFAULT,
2078 	},
2079 	.tool = {
2080 		.sample		= process_sample_event,
2081 		.fork		= perf_event__process_fork,
2082 		.exit		= perf_event__process_exit,
2083 		.comm		= perf_event__process_comm,
2084 		.namespaces	= perf_event__process_namespaces,
2085 		.mmap		= perf_event__process_mmap,
2086 		.mmap2		= perf_event__process_mmap2,
2087 		.ordered_events	= true,
2088 	},
2089 };
2090 
2091 const char record_callchain_help[] = CALLCHAIN_RECORD_HELP
2092 	"\n\t\t\t\tDefault: fp";
2093 
2094 static bool dry_run;
2095 
2096 /*
2097  * XXX Will stay a global variable till we fix builtin-script.c to stop messing
2098  * with it and switch to using the library functions in perf_evlist that came
2099  * from builtin-record.c, i.e. use record_opts,
2100  * perf_evlist__prepare_workload, etc. instead of fork+exec'ing 'perf record',
2101  * using pipes, etc.
2102  */
2103 static struct option __record_options[] = {
2104 	OPT_CALLBACK('e', "event", &record.evlist, "event",
2105 		     "event selector. use 'perf list' to list available events",
2106 		     parse_events_option),
2107 	OPT_CALLBACK(0, "filter", &record.evlist, "filter",
2108 		     "event filter", parse_filter),
2109 	OPT_CALLBACK_NOOPT(0, "exclude-perf", &record.evlist,
2110 			   NULL, "don't record events from perf itself",
2111 			   exclude_perf),
2112 	OPT_STRING('p', "pid", &record.opts.target.pid, "pid",
2113 		    "record events on existing process id"),
2114 	OPT_STRING('t', "tid", &record.opts.target.tid, "tid",
2115 		    "record events on existing thread id"),
2116 	OPT_INTEGER('r', "realtime", &record.realtime_prio,
2117 		    "collect data with this RT SCHED_FIFO priority"),
2118 	OPT_BOOLEAN(0, "no-buffering", &record.opts.no_buffering,
2119 		    "collect data without buffering"),
2120 	OPT_BOOLEAN('R', "raw-samples", &record.opts.raw_samples,
2121 		    "collect raw sample records from all opened counters"),
2122 	OPT_BOOLEAN('a', "all-cpus", &record.opts.target.system_wide,
2123 			    "system-wide collection from all CPUs"),
2124 	OPT_STRING('C', "cpu", &record.opts.target.cpu_list, "cpu",
2125 		    "list of cpus to monitor"),
2126 	OPT_U64('c', "count", &record.opts.user_interval, "event period to sample"),
2127 	OPT_STRING('o', "output", &record.data.path, "file",
2128 		    "output file name"),
2129 	OPT_BOOLEAN_SET('i', "no-inherit", &record.opts.no_inherit,
2130 			&record.opts.no_inherit_set,
2131 			"child tasks do not inherit counters"),
2132 	OPT_BOOLEAN(0, "tail-synthesize", &record.opts.tail_synthesize,
2133 		    "synthesize non-sample events at the end of output"),
2134 	OPT_BOOLEAN(0, "overwrite", &record.opts.overwrite, "use overwrite mode"),
2135 	OPT_BOOLEAN(0, "no-bpf-event", &record.opts.no_bpf_event, "do not record bpf events"),
2136 	OPT_BOOLEAN(0, "strict-freq", &record.opts.strict_freq,
2137 		    "Fail if the specified frequency can't be used"),
2138 	OPT_CALLBACK('F', "freq", &record.opts, "freq or 'max'",
2139 		     "profile at this frequency",
2140 		      record__parse_freq),
2141 	OPT_CALLBACK('m', "mmap-pages", &record.opts, "pages[,pages]",
2142 		     "number of mmap data pages and AUX area tracing mmap pages",
2143 		     record__parse_mmap_pages),
2144 	OPT_CALLBACK(0, "mmap-flush", &record.opts, "number",
2145 		     "Minimum number of bytes that is extracted from mmap data pages (default: 1)",
2146 		     record__mmap_flush_parse),
2147 	OPT_BOOLEAN(0, "group", &record.opts.group,
2148 		    "put the counters into a counter group"),
2149 	OPT_CALLBACK_NOOPT('g', NULL, &callchain_param,
2150 			   NULL, "enables call-graph recording",
2151 			   &record_callchain_opt),
2152 	OPT_CALLBACK(0, "call-graph", &record.opts,
2153 		     "record_mode[,record_size]", record_callchain_help,
2154 		     &record_parse_callchain_opt),
2155 	OPT_INCR('v', "verbose", &verbose,
2156 		    "be more verbose (show counter open errors, etc)"),
2157 	OPT_BOOLEAN('q', "quiet", &quiet, "don't print any message"),
2158 	OPT_BOOLEAN('s', "stat", &record.opts.inherit_stat,
2159 		    "per thread counts"),
2160 	OPT_BOOLEAN('d', "data", &record.opts.sample_address, "Record the sample addresses"),
2161 	OPT_BOOLEAN(0, "phys-data", &record.opts.sample_phys_addr,
2162 		    "Record the sample physical addresses"),
2163 	OPT_BOOLEAN(0, "sample-cpu", &record.opts.sample_cpu, "Record the sample cpu"),
2164 	OPT_BOOLEAN_SET('T', "timestamp", &record.opts.sample_time,
2165 			&record.opts.sample_time_set,
2166 			"Record the sample timestamps"),
2167 	OPT_BOOLEAN_SET('P', "period", &record.opts.period, &record.opts.period_set,
2168 			"Record the sample period"),
2169 	OPT_BOOLEAN('n', "no-samples", &record.opts.no_samples,
2170 		    "don't sample"),
2171 	OPT_BOOLEAN_SET('N', "no-buildid-cache", &record.no_buildid_cache,
2172 			&record.no_buildid_cache_set,
2173 			"do not update the buildid cache"),
2174 	OPT_BOOLEAN_SET('B', "no-buildid", &record.no_buildid,
2175 			&record.no_buildid_set,
2176 			"do not collect buildids in perf.data"),
2177 	OPT_CALLBACK('G', "cgroup", &record.evlist, "name",
2178 		     "monitor event in cgroup name only",
2179 		     parse_cgroups),
2180 	OPT_UINTEGER('D', "delay", &record.opts.initial_delay,
2181 		  "ms to wait before starting measurement after program start"),
2182 	OPT_STRING('u', "uid", &record.opts.target.uid_str, "user",
2183 		   "user to profile"),
2184 
2185 	OPT_CALLBACK_NOOPT('b', "branch-any", &record.opts.branch_stack,
2186 		     "branch any", "sample any taken branches",
2187 		     parse_branch_stack),
2188 
2189 	OPT_CALLBACK('j', "branch-filter", &record.opts.branch_stack,
2190 		     "branch filter mask", "branch stack filter modes",
2191 		     parse_branch_stack),
2192 	OPT_BOOLEAN('W', "weight", &record.opts.sample_weight,
2193 		    "sample by weight (on special events only)"),
2194 	OPT_BOOLEAN(0, "transaction", &record.opts.sample_transaction,
2195 		    "sample transaction flags (special events only)"),
2196 	OPT_BOOLEAN(0, "per-thread", &record.opts.target.per_thread,
2197 		    "use per-thread mmaps"),
2198 	OPT_CALLBACK_OPTARG('I', "intr-regs", &record.opts.sample_intr_regs, NULL, "any register",
2199 		    "sample selected machine registers on interrupt,"
2200 		    " use '-I?' to list register names", parse_intr_regs),
2201 	OPT_CALLBACK_OPTARG(0, "user-regs", &record.opts.sample_user_regs, NULL, "any register",
2202 		    "sample selected machine registers in user space,"
2203 		    " use '--user-regs=?' to list register names", parse_user_regs),
2204 	OPT_BOOLEAN(0, "running-time", &record.opts.running_time,
2205 		    "Record running/enabled time of read (:S) events"),
2206 	OPT_CALLBACK('k', "clockid", &record.opts,
2207 	"clockid", "clockid to use for events, see clock_gettime()",
2208 	parse_clockid),
2209 	OPT_STRING_OPTARG('S', "snapshot", &record.opts.auxtrace_snapshot_opts,
2210 			  "opts", "AUX area tracing Snapshot Mode", ""),
2211 	OPT_UINTEGER(0, "proc-map-timeout", &proc_map_timeout,
2212 			"per thread proc mmap processing timeout in ms"),
2213 	OPT_BOOLEAN(0, "namespaces", &record.opts.record_namespaces,
2214 		    "Record namespaces events"),
2215 	OPT_BOOLEAN(0, "switch-events", &record.opts.record_switch_events,
2216 		    "Record context switch events"),
2217 	OPT_BOOLEAN_FLAG(0, "all-kernel", &record.opts.all_kernel,
2218 			 "Configure all used events to run in kernel space.",
2219 			 PARSE_OPT_EXCLUSIVE),
2220 	OPT_BOOLEAN_FLAG(0, "all-user", &record.opts.all_user,
2221 			 "Configure all used events to run in user space.",
2222 			 PARSE_OPT_EXCLUSIVE),
2223 	OPT_BOOLEAN(0, "kernel-callchains", &record.opts.kernel_callchains,
2224 		    "collect kernel callchains"),
2225 	OPT_BOOLEAN(0, "user-callchains", &record.opts.user_callchains,
2226 		    "collect user callchains"),
2227 	OPT_STRING(0, "clang-path", &llvm_param.clang_path, "clang path",
2228 		   "clang binary to use for compiling BPF scriptlets"),
2229 	OPT_STRING(0, "clang-opt", &llvm_param.clang_opt, "clang options",
2230 		   "options passed to clang when compiling BPF scriptlets"),
2231 	OPT_STRING(0, "vmlinux", &symbol_conf.vmlinux_name,
2232 		   "file", "vmlinux pathname"),
2233 	OPT_BOOLEAN(0, "buildid-all", &record.buildid_all,
2234 		    "Record build-id of all DSOs regardless of hits"),
2235 	OPT_BOOLEAN(0, "timestamp-filename", &record.timestamp_filename,
2236 		    "append timestamp to output filename"),
2237 	OPT_BOOLEAN(0, "timestamp-boundary", &record.timestamp_boundary,
2238 		    "Record timestamp boundary (time of first/last samples)"),
2239 	OPT_STRING_OPTARG_SET(0, "switch-output", &record.switch_output.str,
2240 			  &record.switch_output.set, "signal or size[BKMG] or time[smhd]",
2241 			  "Switch output when receiving SIGUSR2 (signal) or crossing a size or time threshold",
2242 			  "signal"),
2243 	OPT_INTEGER(0, "switch-max-files", &record.switch_output.num_files,
2244 		   "Limit number of switch output generated files"),
2245 	OPT_BOOLEAN(0, "dry-run", &dry_run,
2246 		    "Parse options then exit"),
2247 #ifdef HAVE_AIO_SUPPORT
2248 	OPT_CALLBACK_OPTARG(0, "aio", &record.opts,
2249 		     &nr_cblocks_default, "n", "Use <n> control blocks in asynchronous trace writing mode (default: 1, max: 4)",
2250 		     record__aio_parse),
2251 #endif
2252 	OPT_CALLBACK(0, "affinity", &record.opts, "node|cpu",
2253 		     "Set affinity mask of trace reading thread to NUMA node cpu mask or cpu of processed mmap buffer",
2254 		     record__parse_affinity),
2255 #ifdef HAVE_ZSTD_SUPPORT
2256 	OPT_CALLBACK_OPTARG('z', "compression-level", &record.opts, &comp_level_default,
2257 			    "n", "Compress records using specified level (default: 1 - fastest compression, 22 - greatest compression)",
2258 			    record__parse_comp_level),
2259 #endif
2260 	OPT_END()
2261 };
2262 
2263 struct option *record_options = __record_options;
2264 
2265 int cmd_record(int argc, const char **argv)
2266 {
2267 	int err;
2268 	struct record *rec = &record;
2269 	char errbuf[BUFSIZ];
2270 
2271 	setlocale(LC_ALL, "");
2272 
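	/*
	 * Options that depend on features not compiled into this perf binary
	 * (BPF scriptlet support here, BPF prologue below) are marked via
	 * set_option_nobuild() with the build flag that explains why they are
	 * unavailable, e.g. a build done with NO_LIBBPF=1.
	 */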
2273 #ifndef HAVE_LIBBPF_SUPPORT
2274 # define set_nobuild(s, l, c) set_option_nobuild(record_options, s, l, "NO_LIBBPF=1", c)
2275 	set_nobuild('\0', "clang-path", true);
2276 	set_nobuild('\0', "clang-opt", true);
2277 # undef set_nobuild
2278 #endif
2279 
2280 #ifndef HAVE_BPF_PROLOGUE
2281 # if !defined (HAVE_DWARF_SUPPORT)
2282 #  define REASON  "NO_DWARF=1"
2283 # elif !defined (HAVE_LIBBPF_SUPPORT)
2284 #  define REASON  "NO_LIBBPF=1"
2285 # else
2286 #  define REASON  "this architecture doesn't support BPF prologue"
2287 # endif
2288 # define set_nobuild(s, l, c) set_option_nobuild(record_options, s, l, REASON, c)
2289 	set_nobuild('\0', "vmlinux", true);
2290 # undef set_nobuild
2291 # undef REASON
2292 #endif
2293 
2294 	CPU_ZERO(&rec->affinity_mask);
2295 	rec->opts.affinity = PERF_AFFINITY_SYS;
2296 
2297 	rec->evlist = evlist__new();
2298 	if (rec->evlist == NULL)
2299 		return -ENOMEM;
2300 
2301 	err = perf_config(perf_record_config, rec);
2302 	if (err)
2303 		return err;
2304 
2305 	argc = parse_options(argc, argv, record_options, record_usage,
2306 			    PARSE_OPT_STOP_AT_NON_OPTION);
2307 	if (quiet)
2308 		perf_quiet_option();
2309 
2310 	/* Make system wide (-a) the default target. */
2311 	if (!argc && target__none(&rec->opts.target))
2312 		rec->opts.target.system_wide = true;
2313 
2314 	if (nr_cgroups && !rec->opts.target.system_wide) {
2315 		usage_with_options_msg(record_usage, record_options,
2316 			"cgroup monitoring only available in system-wide mode");
2317 
2318 	}
2319 
2320 	if (rec->opts.comp_level != 0) {
2321 		pr_debug("Compression enabled, disabling build id collection at the end of the session.\n");
2322 		rec->no_buildid = true;
2323 	}
2324 
2325 	if (rec->opts.record_switch_events &&
2326 	    !perf_can_record_switch_events()) {
2327 		ui__error("kernel does not support recording context switch events\n");
2328 		parse_options_usage(record_usage, record_options, "switch-events", 0);
2329 		return -EINVAL;
2330 	}
2331 
2332 	if (switch_output_setup(rec)) {
2333 		parse_options_usage(record_usage, record_options, "switch-output", 0);
2334 		return -EINVAL;
2335 	}
2336 
2337 	if (rec->switch_output.time) {
2338 		signal(SIGALRM, alarm_sig_handler);
2339 		alarm(rec->switch_output.time);
2340 	}
2341 
2342 	if (rec->switch_output.num_files) {
2343 		rec->switch_output.filenames = calloc(rec->switch_output.num_files,
2344 						      sizeof(char *));
2345 		if (!rec->switch_output.filenames)
2346 			return -EINVAL;
2347 	}
2348 
2349 	/*
2350 	 * Allow aliases to facilitate the lookup of symbols for address
2351 	 * filters. Refer to auxtrace_parse_filters().
2352 	 */
2353 	symbol_conf.allow_aliases = true;
2354 
2355 	symbol__init(NULL);
2356 
2357 	err = record__auxtrace_init(rec);
2358 	if (err)
2359 		goto out;
2360 
2361 	if (dry_run)
2362 		goto out;
2363 
2364 	err = bpf__setup_stdout(rec->evlist);
2365 	if (err) {
2366 		bpf__strerror_setup_stdout(rec->evlist, err, errbuf, sizeof(errbuf));
2367 		pr_err("ERROR: Setup BPF stdout failed: %s\n",
2368 			 errbuf);
2369 		goto out;
2370 	}
2371 
2372 	err = -ENOMEM;
2373 
2374 	if (symbol_conf.kptr_restrict && !perf_evlist__exclude_kernel(rec->evlist))
2375 		pr_warning(
2376 "WARNING: Kernel address maps (/proc/{kallsyms,modules}) are restricted,\n"
2377 "check /proc/sys/kernel/kptr_restrict and /proc/sys/kernel/perf_event_paranoid.\n\n"
2378 "Samples in kernel functions may not be resolved if a suitable vmlinux\n"
2379 "file is not found in the buildid cache or in the vmlinux path.\n\n"
2380 "Samples in kernel modules won't be resolved at all.\n\n"
2381 "If some relocation was applied (e.g. kexec) symbols may be misresolved\n"
2382 "even with a suitable vmlinux or kallsyms file.\n\n");
2383 
2384 	if (rec->no_buildid_cache || rec->no_buildid) {
2385 		disable_buildid_cache();
2386 	} else if (rec->switch_output.enabled) {
2387 		/*
2388 		 * In 'perf record --switch-output', disable buildid
2389 		 * generation by default to reduce data file switching
2390 		 * overhead. Still generate buildids if they are required
2391 		 * explicitly using
2392 		 *
2393 		 *  perf record --switch-output --no-no-buildid \
2394 		 *              --no-no-buildid-cache
2395 		 *
2396 		 * The following code is equivalent to:
2397 		 *
2398 		 * if ((rec->no_buildid || !rec->no_buildid_set) &&
2399 		 *     (rec->no_buildid_cache || !rec->no_buildid_cache_set))
2400 		 *         disable_buildid_cache();
2401 		 */
2402 		bool disable = true;
2403 
2404 		if (rec->no_buildid_set && !rec->no_buildid)
2405 			disable = false;
2406 		if (rec->no_buildid_cache_set && !rec->no_buildid_cache)
2407 			disable = false;
2408 		if (disable) {
2409 			rec->no_buildid = true;
2410 			rec->no_buildid_cache = true;
2411 			disable_buildid_cache();
2412 		}
2413 	}
2414 
2415 	if (record.opts.overwrite)
2416 		record.opts.tail_synthesize = true;
2417 
2418 	if (rec->evlist->core.nr_entries == 0 &&
2419 	    __perf_evlist__add_default(rec->evlist, !record.opts.no_samples) < 0) {
2420 		pr_err("Not enough memory for event selector list\n");
2421 		goto out;
2422 	}
2423 
2424 	if (rec->opts.target.tid && !rec->opts.no_inherit_set)
2425 		rec->opts.no_inherit = true;
2426 
2427 	err = target__validate(&rec->opts.target);
2428 	if (err) {
2429 		target__strerror(&rec->opts.target, err, errbuf, BUFSIZ);
2430 		ui__warning("%s\n", errbuf);
2431 	}
2432 
2433 	err = target__parse_uid(&rec->opts.target);
2434 	if (err) {
2435 		int saved_errno = errno;
2436 
2437 		target__strerror(&rec->opts.target, err, errbuf, BUFSIZ);
2438 		ui__error("%s", errbuf);
2439 
2440 		err = -saved_errno;
2441 		goto out;
2442 	}
2443 
2444 	/* Enable ignoring missing threads when -u/-p option is defined. */
2445 	rec->opts.ignore_missing_thread = rec->opts.target.uid != UINT_MAX || rec->opts.target.pid;
2446 
2447 	err = -ENOMEM;
2448 	if (perf_evlist__create_maps(rec->evlist, &rec->opts.target) < 0)
2449 		usage_with_options(record_usage, record_options);
2450 
2451 	err = auxtrace_record__options(rec->itr, rec->evlist, &rec->opts);
2452 	if (err)
2453 		goto out;
2454 
2455 	/*
2456 	 * We take all buildids when the file contains AUX area
2457 	 * tracing data, because we do not decode the trace,
2458 	 * as doing so would take too long.
2459 	 */
2460 	if (rec->opts.full_auxtrace)
2461 		rec->buildid_all = true;
2462 
2463 	if (record_opts__config(&rec->opts)) {
2464 		err = -EINVAL;
2465 		goto out;
2466 	}
2467 
2468 	if (rec->opts.nr_cblocks > nr_cblocks_max)
2469 		rec->opts.nr_cblocks = nr_cblocks_max;
2470 	pr_debug("nr_cblocks: %d\n", rec->opts.nr_cblocks);
2471 
2472 	pr_debug("affinity: %s\n", affinity_tags[rec->opts.affinity]);
2473 	pr_debug("mmap flush: %d\n", rec->opts.mmap_flush);
2474 
2475 	if (rec->opts.comp_level > comp_level_max)
2476 		rec->opts.comp_level = comp_level_max;
2477 	pr_debug("comp level: %d\n", rec->opts.comp_level);
2478 
2479 	err = __cmd_record(&record, argc, argv);
2480 out:
2481 	evlist__delete(rec->evlist);
2482 	symbol__exit();
2483 	auxtrace_record__free(rec->itr);
2484 	return err;
2485 }
2486 
2487 static void snapshot_sig_handler(int sig __maybe_unused)
2488 {
2489 	struct record *rec = &record;
2490 
2491 	if (trigger_is_ready(&auxtrace_snapshot_trigger)) {
2492 		trigger_hit(&auxtrace_snapshot_trigger);
2493 		auxtrace_record__snapshot_started = 1;
2494 		if (auxtrace_record__snapshot_start(record.itr))
2495 			trigger_error(&auxtrace_snapshot_trigger);
2496 	}
2497 
2498 	if (switch_output_signal(rec))
2499 		trigger_hit(&switch_output_trigger);
2500 }
2501 
2502 static void alarm_sig_handler(int sig __maybe_unused)
2503 {
2504 	struct record *rec = &record;
2505 
2506 	if (switch_output_time(rec))
2507 		trigger_hit(&switch_output_trigger);
2508 }
2509