xref: /openbmc/linux/tools/perf/builtin-record.c (revision f21e49be)
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * builtin-record.c
4  *
5  * Builtin record command: Record the profile of a workload
6  * (or a CPU, or a PID) into the perf.data output file - for
7  * later analysis via perf report.
8  */
9 #include "builtin.h"
10 
11 #include "util/build-id.h"
12 #include <subcmd/parse-options.h>
13 #include "util/parse-events.h"
14 #include "util/config.h"
15 
16 #include "util/callchain.h"
17 #include "util/cgroup.h"
18 #include "util/header.h"
19 #include "util/event.h"
20 #include "util/evlist.h"
21 #include "util/evsel.h"
22 #include "util/debug.h"
23 #include "util/mmap.h"
24 #include "util/target.h"
25 #include "util/session.h"
26 #include "util/tool.h"
27 #include "util/symbol.h"
28 #include "util/record.h"
29 #include "util/cpumap.h"
30 #include "util/thread_map.h"
31 #include "util/data.h"
32 #include "util/perf_regs.h"
33 #include "util/auxtrace.h"
34 #include "util/tsc.h"
35 #include "util/parse-branch-options.h"
36 #include "util/parse-regs-options.h"
37 #include "util/perf_api_probe.h"
38 #include "util/llvm-utils.h"
39 #include "util/bpf-loader.h"
40 #include "util/trigger.h"
41 #include "util/perf-hooks.h"
42 #include "util/cpu-set-sched.h"
43 #include "util/synthetic-events.h"
44 #include "util/time-utils.h"
45 #include "util/units.h"
46 #include "util/bpf-event.h"
47 #include "util/util.h"
48 #include "util/pfm.h"
49 #include "util/clockid.h"
50 #include "util/pmu-hybrid.h"
51 #include "util/evlist-hybrid.h"
52 #include "asm/bug.h"
53 #include "perf.h"
54 
55 #include <errno.h>
56 #include <inttypes.h>
57 #include <locale.h>
58 #include <poll.h>
59 #include <pthread.h>
60 #include <unistd.h>
61 #include <sched.h>
62 #include <signal.h>
63 #ifdef HAVE_EVENTFD_SUPPORT
64 #include <sys/eventfd.h>
65 #endif
66 #include <sys/mman.h>
67 #include <sys/wait.h>
68 #include <sys/types.h>
69 #include <sys/stat.h>
70 #include <fcntl.h>
71 #include <linux/err.h>
72 #include <linux/string.h>
73 #include <linux/time64.h>
74 #include <linux/zalloc.h>
75 #include <linux/bitmap.h>
76 #include <sys/time.h>
77 
78 struct switch_output {
79 	bool		 enabled;
80 	bool		 signal;
81 	unsigned long	 size;
82 	unsigned long	 time;
83 	const char	*str;
84 	bool		 set;
85 	char		 **filenames;
86 	int		 num_files;
87 	int		 cur_file;
88 };
89 
90 struct record {
91 	struct perf_tool	tool;
92 	struct record_opts	opts;
93 	u64			bytes_written;
94 	struct perf_data	data;
95 	struct auxtrace_record	*itr;
96 	struct evlist	*evlist;
97 	struct perf_session	*session;
98 	struct evlist		*sb_evlist;
99 	pthread_t		thread_id;
100 	int			realtime_prio;
101 	bool			switch_output_event_set;
102 	bool			no_buildid;
103 	bool			no_buildid_set;
104 	bool			no_buildid_cache;
105 	bool			no_buildid_cache_set;
106 	bool			buildid_all;
107 	bool			buildid_mmap;
108 	bool			timestamp_filename;
109 	bool			timestamp_boundary;
110 	struct switch_output	switch_output;
111 	unsigned long long	samples;
112 	struct mmap_cpu_mask	affinity_mask;
113 	unsigned long		output_max_size;	/* = 0: unlimited */
114 };
115 
116 static volatile int done;
117 
118 static volatile int auxtrace_record__snapshot_started;
119 static DEFINE_TRIGGER(auxtrace_snapshot_trigger);
120 static DEFINE_TRIGGER(switch_output_trigger);
121 
122 static const char *affinity_tags[PERF_AFFINITY_MAX] = {
123 	"SYS", "NODE", "CPU"
124 };
125 
126 static bool switch_output_signal(struct record *rec)
127 {
128 	return rec->switch_output.signal &&
129 	       trigger_is_ready(&switch_output_trigger);
130 }
131 
132 static bool switch_output_size(struct record *rec)
133 {
134 	return rec->switch_output.size &&
135 	       trigger_is_ready(&switch_output_trigger) &&
136 	       (rec->bytes_written >= rec->switch_output.size);
137 }
138 
139 static bool switch_output_time(struct record *rec)
140 {
141 	return rec->switch_output.time &&
142 	       trigger_is_ready(&switch_output_trigger);
143 }
144 
145 static bool record__output_max_size_exceeded(struct record *rec)
146 {
147 	return rec->output_max_size &&
148 	       (rec->bytes_written >= rec->output_max_size);
149 }
150 
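/*
 * Write 'size' bytes from 'bf' to the output file and account them in
 * rec->bytes_written, so that the output size limit and the size based
 * output switching trigger can be checked after every write.
 */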
151 static int record__write(struct record *rec, struct mmap *map __maybe_unused,
152 			 void *bf, size_t size)
153 {
154 	struct perf_data_file *file = &rec->session->data->file;
155 
156 	if (perf_data_file__write(file, bf, size) < 0) {
157 		pr_err("failed to write perf data, error: %m\n");
158 		return -1;
159 	}
160 
161 	rec->bytes_written += size;
162 
163 	if (record__output_max_size_exceeded(rec) && !done) {
164 		fprintf(stderr, "[ perf record: perf size limit reached (%" PRIu64 " KB),"
165 				" stopping session ]\n",
166 				rec->bytes_written >> 10);
167 		done = 1;
168 	}
169 
170 	if (switch_output_size(rec))
171 		trigger_hit(&switch_output_trigger);
172 
173 	return 0;
174 }
175 
176 static int record__aio_enabled(struct record *rec);
177 static int record__comp_enabled(struct record *rec);
178 static size_t zstd_compress(struct perf_session *session, void *dst, size_t dst_size,
179 			    void *src, size_t src_size);
180 
181 #ifdef HAVE_AIO_SUPPORT
182 static int record__aio_write(struct aiocb *cblock, int trace_fd,
183 		void *buf, size_t size, off_t off)
184 {
185 	int rc;
186 
187 	cblock->aio_fildes = trace_fd;
188 	cblock->aio_buf    = buf;
189 	cblock->aio_nbytes = size;
190 	cblock->aio_offset = off;
191 	cblock->aio_sigevent.sigev_notify = SIGEV_NONE;
192 
193 	do {
194 		rc = aio_write(cblock);
195 		if (rc == 0) {
196 			break;
197 		} else if (errno != EAGAIN) {
198 			cblock->aio_fildes = -1;
199 			pr_err("failed to queue perf data, error: %m\n");
200 			break;
201 		}
202 	} while (1);
203 
204 	return rc;
205 }
206 
207 static int record__aio_complete(struct mmap *md, struct aiocb *cblock)
208 {
209 	void *rem_buf;
210 	off_t rem_off;
211 	size_t rem_size;
212 	int rc, aio_errno;
213 	ssize_t aio_ret, written;
214 
215 	aio_errno = aio_error(cblock);
216 	if (aio_errno == EINPROGRESS)
217 		return 0;
218 
219 	written = aio_ret = aio_return(cblock);
220 	if (aio_ret < 0) {
221 		if (aio_errno != EINTR)
222 			pr_err("failed to write perf data, error: %m\n");
223 		written = 0;
224 	}
225 
226 	rem_size = cblock->aio_nbytes - written;
227 
228 	if (rem_size == 0) {
229 		cblock->aio_fildes = -1;
230 		/*
231 		 * md->refcount is incremented in record__aio_pushfn() for
232 		 * every aio write request started in record__aio_push() so
233 		 * decrement it because the request is now complete.
234 		 */
235 		perf_mmap__put(&md->core);
236 		rc = 1;
237 	} else {
238 		/*
 239 		 * The aio write request may require a restart with the
 240 		 * remainder if the kernel didn't write the whole
 241 		 * chunk at once.
242 		 */
243 		rem_off = cblock->aio_offset + written;
244 		rem_buf = (void *)(cblock->aio_buf + written);
245 		record__aio_write(cblock, cblock->aio_fildes,
246 				rem_buf, rem_size, rem_off);
247 		rc = 0;
248 	}
249 
250 	return rc;
251 }
252 
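/*
 * Reap completed aio write requests for this mmap.  With sync_all == false,
 * return the index of the first free/completed control block so it can be
 * reused; with sync_all == true, wait (via aio_suspend()) until all
 * outstanding requests have completed and return -1.
 */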
253 static int record__aio_sync(struct mmap *md, bool sync_all)
254 {
255 	struct aiocb **aiocb = md->aio.aiocb;
256 	struct aiocb *cblocks = md->aio.cblocks;
257 	struct timespec timeout = { 0, 1000 * 1000  * 1 }; /* 1ms */
258 	int i, do_suspend;
259 
260 	do {
261 		do_suspend = 0;
262 		for (i = 0; i < md->aio.nr_cblocks; ++i) {
263 			if (cblocks[i].aio_fildes == -1 || record__aio_complete(md, &cblocks[i])) {
264 				if (sync_all)
265 					aiocb[i] = NULL;
266 				else
267 					return i;
268 			} else {
269 				/*
 270 				 * The started aio write is not complete yet,
 271 				 * so it has to be waited for before the
 272 				 * next allocation.
273 				 */
274 				aiocb[i] = &cblocks[i];
275 				do_suspend = 1;
276 			}
277 		}
278 		if (!do_suspend)
279 			return -1;
280 
281 		while (aio_suspend((const struct aiocb **)aiocb, md->aio.nr_cblocks, &timeout)) {
282 			if (!(errno == EAGAIN || errno == EINTR))
283 				pr_err("failed to sync perf data, error: %m\n");
284 		}
285 	} while (1);
286 }
287 
288 struct record_aio {
289 	struct record	*rec;
290 	void		*data;
291 	size_t		size;
292 };
293 
294 static int record__aio_pushfn(struct mmap *map, void *to, void *buf, size_t size)
295 {
296 	struct record_aio *aio = to;
297 
298 	/*
 299 	 * map->core.base data pointed to by buf is copied into a free map->aio.data[]
 300 	 * buffer to release space in the kernel buffer as fast as possible, by calling
 301 	 * perf_mmap__consume() from the perf_mmap__push() function.
 302 	 *
 303 	 * That lets the kernel proceed with storing more profiling data into
 304 	 * the kernel buffer earlier than other per-cpu kernel buffers are handled.
 305 	 *
 306 	 * Copying can be done in two steps in case the chunk of profiling data
 307 	 * crosses the upper bound of the kernel buffer. In this case we first move
 308 	 * the part of the data from map->start up to the upper bound and then the
 309 	 * remainder from the beginning of the kernel buffer up to the end of the data chunk.
310 	 */
311 
312 	if (record__comp_enabled(aio->rec)) {
313 		size = zstd_compress(aio->rec->session, aio->data + aio->size,
314 				     mmap__mmap_len(map) - aio->size,
315 				     buf, size);
316 	} else {
317 		memcpy(aio->data + aio->size, buf, size);
318 	}
319 
320 	if (!aio->size) {
321 		/*
 322 		 * Increment map->refcount to guard the map->aio.data[] buffer
 323 		 * from premature deallocation, because the map object can be
 324 		 * released before the aio write request started on the
 325 		 * map->aio.data[] buffer has completed.
 326 		 *
 327 		 * perf_mmap__put() is done in record__aio_complete()
 328 		 * once the started aio request completes, or in record__aio_push()
 329 		 * if the request failed to start.
330 		 */
331 		perf_mmap__get(&map->core);
332 	}
333 
334 	aio->size += size;
335 
336 	return size;
337 }
338 
339 static int record__aio_push(struct record *rec, struct mmap *map, off_t *off)
340 {
341 	int ret, idx;
342 	int trace_fd = rec->session->data->file.fd;
343 	struct record_aio aio = { .rec = rec, .size = 0 };
344 
345 	/*
 346 	 * Call record__aio_sync() to wait until a map->aio.data[] buffer
 347 	 * becomes available after the previous aio write operation.
348 	 */
349 
350 	idx = record__aio_sync(map, false);
351 	aio.data = map->aio.data[idx];
352 	ret = perf_mmap__push(map, &aio, record__aio_pushfn);
353 	if (ret != 0) /* ret > 0 - no data, ret < 0 - error */
354 		return ret;
355 
356 	rec->samples++;
357 	ret = record__aio_write(&(map->aio.cblocks[idx]), trace_fd, aio.data, aio.size, *off);
358 	if (!ret) {
359 		*off += aio.size;
360 		rec->bytes_written += aio.size;
361 		if (switch_output_size(rec))
362 			trigger_hit(&switch_output_trigger);
363 	} else {
364 		/*
 365 		 * Decrement the map->refcount incremented in record__aio_pushfn()
 366 		 * if the record__aio_write() operation failed to start; otherwise
 367 		 * map->refcount is decremented in record__aio_complete() after the
 368 		 * aio write operation finishes successfully.
369 		 */
370 		perf_mmap__put(&map->core);
371 	}
372 
373 	return ret;
374 }
375 
376 static off_t record__aio_get_pos(int trace_fd)
377 {
378 	return lseek(trace_fd, 0, SEEK_CUR);
379 }
380 
381 static void record__aio_set_pos(int trace_fd, off_t pos)
382 {
383 	lseek(trace_fd, pos, SEEK_SET);
384 }
385 
386 static void record__aio_mmap_read_sync(struct record *rec)
387 {
388 	int i;
389 	struct evlist *evlist = rec->evlist;
390 	struct mmap *maps = evlist->mmap;
391 
392 	if (!record__aio_enabled(rec))
393 		return;
394 
395 	for (i = 0; i < evlist->core.nr_mmaps; i++) {
396 		struct mmap *map = &maps[i];
397 
398 		if (map->core.base)
399 			record__aio_sync(map, true);
400 	}
401 }
402 
403 static int nr_cblocks_default = 1;
404 static int nr_cblocks_max = 4;
405 
406 static int record__aio_parse(const struct option *opt,
407 			     const char *str,
408 			     int unset)
409 {
410 	struct record_opts *opts = (struct record_opts *)opt->value;
411 
412 	if (unset) {
413 		opts->nr_cblocks = 0;
414 	} else {
415 		if (str)
416 			opts->nr_cblocks = strtol(str, NULL, 0);
417 		if (!opts->nr_cblocks)
418 			opts->nr_cblocks = nr_cblocks_default;
419 	}
420 
421 	return 0;
422 }
423 #else /* HAVE_AIO_SUPPORT */
424 static int nr_cblocks_max = 0;
425 
426 static int record__aio_push(struct record *rec __maybe_unused, struct mmap *map __maybe_unused,
427 			    off_t *off __maybe_unused)
428 {
429 	return -1;
430 }
431 
432 static off_t record__aio_get_pos(int trace_fd __maybe_unused)
433 {
434 	return -1;
435 }
436 
437 static void record__aio_set_pos(int trace_fd __maybe_unused, off_t pos __maybe_unused)
438 {
439 }
440 
441 static void record__aio_mmap_read_sync(struct record *rec __maybe_unused)
442 {
443 }
444 #endif
445 
446 static int record__aio_enabled(struct record *rec)
447 {
448 	return rec->opts.nr_cblocks > 0;
449 }
450 
451 #define MMAP_FLUSH_DEFAULT 1
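/*
 * Parse the argument of --mmap-flush: either a plain number of bytes or a
 * value with a B/K/M/G suffix.  The result defaults to MMAP_FLUSH_DEFAULT and
 * is capped at a quarter of the mmap buffer size.
 */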
452 static int record__mmap_flush_parse(const struct option *opt,
453 				    const char *str,
454 				    int unset)
455 {
456 	int flush_max;
457 	struct record_opts *opts = (struct record_opts *)opt->value;
458 	static struct parse_tag tags[] = {
459 			{ .tag  = 'B', .mult = 1       },
460 			{ .tag  = 'K', .mult = 1 << 10 },
461 			{ .tag  = 'M', .mult = 1 << 20 },
462 			{ .tag  = 'G', .mult = 1 << 30 },
463 			{ .tag  = 0 },
464 	};
465 
466 	if (unset)
467 		return 0;
468 
469 	if (str) {
470 		opts->mmap_flush = parse_tag_value(str, tags);
471 		if (opts->mmap_flush == (int)-1)
472 			opts->mmap_flush = strtol(str, NULL, 0);
473 	}
474 
475 	if (!opts->mmap_flush)
476 		opts->mmap_flush = MMAP_FLUSH_DEFAULT;
477 
478 	flush_max = evlist__mmap_size(opts->mmap_pages);
479 	flush_max /= 4;
480 	if (opts->mmap_flush > flush_max)
481 		opts->mmap_flush = flush_max;
482 
483 	return 0;
484 }
485 
486 #ifdef HAVE_ZSTD_SUPPORT
487 static unsigned int comp_level_default = 1;
488 
489 static int record__parse_comp_level(const struct option *opt, const char *str, int unset)
490 {
491 	struct record_opts *opts = opt->value;
492 
493 	if (unset) {
494 		opts->comp_level = 0;
495 	} else {
496 		if (str)
497 			opts->comp_level = strtol(str, NULL, 0);
498 		if (!opts->comp_level)
499 			opts->comp_level = comp_level_default;
500 	}
501 
502 	return 0;
503 }
504 #endif
505 static unsigned int comp_level_max = 22;
506 
507 static int record__comp_enabled(struct record *rec)
508 {
509 	return rec->opts.comp_level > 0;
510 }
511 
512 static int process_synthesized_event(struct perf_tool *tool,
513 				     union perf_event *event,
514 				     struct perf_sample *sample __maybe_unused,
515 				     struct machine *machine __maybe_unused)
516 {
517 	struct record *rec = container_of(tool, struct record, tool);
518 	return record__write(rec, NULL, event, event->header.size);
519 }
520 
521 static int process_locked_synthesized_event(struct perf_tool *tool,
522 				     union perf_event *event,
523 				     struct perf_sample *sample __maybe_unused,
524 				     struct machine *machine __maybe_unused)
525 {
526 	static pthread_mutex_t synth_lock = PTHREAD_MUTEX_INITIALIZER;
527 	int ret;
528 
529 	pthread_mutex_lock(&synth_lock);
530 	ret = process_synthesized_event(tool, event, sample, machine);
531 	pthread_mutex_unlock(&synth_lock);
532 	return ret;
533 }
534 
535 static int record__pushfn(struct mmap *map, void *to, void *bf, size_t size)
536 {
537 	struct record *rec = to;
538 
539 	if (record__comp_enabled(rec)) {
540 		size = zstd_compress(rec->session, map->data, mmap__mmap_len(map), bf, size);
541 		bf   = map->data;
542 	}
543 
544 	rec->samples++;
545 	return record__write(rec, map, bf, size);
546 }
547 
548 static volatile int signr = -1;
549 static volatile int child_finished;
550 #ifdef HAVE_EVENTFD_SUPPORT
551 static int done_fd = -1;
552 #endif
553 
554 static void sig_handler(int sig)
555 {
556 	if (sig == SIGCHLD)
557 		child_finished = 1;
558 	else
559 		signr = sig;
560 
561 	done = 1;
562 #ifdef HAVE_EVENTFD_SUPPORT
563 {
564 	u64 tmp = 1;
565 	/*
566 	 * It is possible for this signal handler to run after done is checked
567 	 * in the main loop, but before the perf counter fds are polled. If this
568 	 * happens, the poll() will continue to wait even though done is set,
569 	 * and will only break out if either another signal is received, or the
570 	 * counters are ready for read. To ensure the poll() doesn't sleep when
571 	 * done is set, use an eventfd (done_fd) to wake up the poll().
572 	 */
573 	if (write(done_fd, &tmp, sizeof(tmp)) < 0)
574 		pr_err("failed to signal wakeup fd, error: %m\n");
575 }
576 #endif // HAVE_EVENTFD_SUPPORT
577 }
578 
579 static void sigsegv_handler(int sig)
580 {
581 	perf_hooks__recover();
582 	sighandler_dump_stack(sig);
583 }
584 
585 static void record__sig_exit(void)
586 {
587 	if (signr == -1)
588 		return;
589 
590 	signal(signr, SIG_DFL);
591 	raise(signr);
592 }
593 
594 #ifdef HAVE_AUXTRACE_SUPPORT
595 
596 static int record__process_auxtrace(struct perf_tool *tool,
597 				    struct mmap *map,
598 				    union perf_event *event, void *data1,
599 				    size_t len1, void *data2, size_t len2)
600 {
601 	struct record *rec = container_of(tool, struct record, tool);
602 	struct perf_data *data = &rec->data;
603 	size_t padding;
604 	u8 pad[8] = {0};
605 
606 	if (!perf_data__is_pipe(data) && perf_data__is_single_file(data)) {
607 		off_t file_offset;
608 		int fd = perf_data__fd(data);
609 		int err;
610 
611 		file_offset = lseek(fd, 0, SEEK_CUR);
612 		if (file_offset == -1)
613 			return -1;
614 		err = auxtrace_index__auxtrace_event(&rec->session->auxtrace_index,
615 						     event, file_offset);
616 		if (err)
617 			return err;
618 	}
619 
620 	/* event.auxtrace.size includes padding, see __auxtrace_mmap__read() */
621 	padding = (len1 + len2) & 7;
622 	if (padding)
623 		padding = 8 - padding;
624 
625 	record__write(rec, map, event, event->header.size);
626 	record__write(rec, map, data1, len1);
627 	if (len2)
628 		record__write(rec, map, data2, len2);
629 	record__write(rec, map, &pad, padding);
630 
631 	return 0;
632 }
633 
634 static int record__auxtrace_mmap_read(struct record *rec,
635 				      struct mmap *map)
636 {
637 	int ret;
638 
639 	ret = auxtrace_mmap__read(map, rec->itr, &rec->tool,
640 				  record__process_auxtrace);
641 	if (ret < 0)
642 		return ret;
643 
644 	if (ret)
645 		rec->samples++;
646 
647 	return 0;
648 }
649 
650 static int record__auxtrace_mmap_read_snapshot(struct record *rec,
651 					       struct mmap *map)
652 {
653 	int ret;
654 
655 	ret = auxtrace_mmap__read_snapshot(map, rec->itr, &rec->tool,
656 					   record__process_auxtrace,
657 					   rec->opts.auxtrace_snapshot_size);
658 	if (ret < 0)
659 		return ret;
660 
661 	if (ret)
662 		rec->samples++;
663 
664 	return 0;
665 }
666 
667 static int record__auxtrace_read_snapshot_all(struct record *rec)
668 {
669 	int i;
670 	int rc = 0;
671 
672 	for (i = 0; i < rec->evlist->core.nr_mmaps; i++) {
673 		struct mmap *map = &rec->evlist->mmap[i];
674 
675 		if (!map->auxtrace_mmap.base)
676 			continue;
677 
678 		if (record__auxtrace_mmap_read_snapshot(rec, map) != 0) {
679 			rc = -1;
680 			goto out;
681 		}
682 	}
683 out:
684 	return rc;
685 }
686 
687 static void record__read_auxtrace_snapshot(struct record *rec, bool on_exit)
688 {
689 	pr_debug("Recording AUX area tracing snapshot\n");
690 	if (record__auxtrace_read_snapshot_all(rec) < 0) {
691 		trigger_error(&auxtrace_snapshot_trigger);
692 	} else {
693 		if (auxtrace_record__snapshot_finish(rec->itr, on_exit))
694 			trigger_error(&auxtrace_snapshot_trigger);
695 		else
696 			trigger_ready(&auxtrace_snapshot_trigger);
697 	}
698 }
699 
700 static int record__auxtrace_snapshot_exit(struct record *rec)
701 {
702 	if (trigger_is_error(&auxtrace_snapshot_trigger))
703 		return 0;
704 
705 	if (!auxtrace_record__snapshot_started &&
706 	    auxtrace_record__snapshot_start(rec->itr))
707 		return -1;
708 
709 	record__read_auxtrace_snapshot(rec, true);
710 	if (trigger_is_error(&auxtrace_snapshot_trigger))
711 		return -1;
712 
713 	return 0;
714 }
715 
716 static int record__auxtrace_init(struct record *rec)
717 {
718 	int err;
719 
720 	if (!rec->itr) {
721 		rec->itr = auxtrace_record__init(rec->evlist, &err);
722 		if (err)
723 			return err;
724 	}
725 
726 	err = auxtrace_parse_snapshot_options(rec->itr, &rec->opts,
727 					      rec->opts.auxtrace_snapshot_opts);
728 	if (err)
729 		return err;
730 
731 	err = auxtrace_parse_sample_options(rec->itr, rec->evlist, &rec->opts,
732 					    rec->opts.auxtrace_sample_opts);
733 	if (err)
734 		return err;
735 
736 	auxtrace_regroup_aux_output(rec->evlist);
737 
738 	return auxtrace_parse_filters(rec->evlist);
739 }
740 
741 #else
742 
743 static inline
744 int record__auxtrace_mmap_read(struct record *rec __maybe_unused,
745 			       struct mmap *map __maybe_unused)
746 {
747 	return 0;
748 }
749 
750 static inline
751 void record__read_auxtrace_snapshot(struct record *rec __maybe_unused,
752 				    bool on_exit __maybe_unused)
753 {
754 }
755 
756 static inline
757 int auxtrace_record__snapshot_start(struct auxtrace_record *itr __maybe_unused)
758 {
759 	return 0;
760 }
761 
762 static inline
763 int record__auxtrace_snapshot_exit(struct record *rec __maybe_unused)
764 {
765 	return 0;
766 }
767 
768 static int record__auxtrace_init(struct record *rec __maybe_unused)
769 {
770 	return 0;
771 }
772 
773 #endif
774 
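/*
 * Unless an event with attr.text_poke is already configured, add a "dummy:u"
 * event that collects text poke (kernel text modification) and ksymbol events
 * on all CPUs.
 */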
775 static int record__config_text_poke(struct evlist *evlist)
776 {
777 	struct evsel *evsel;
778 	int err;
779 
780 	/* Nothing to do if text poke is already configured */
781 	evlist__for_each_entry(evlist, evsel) {
782 		if (evsel->core.attr.text_poke)
783 			return 0;
784 	}
785 
786 	err = parse_events(evlist, "dummy:u", NULL);
787 	if (err)
788 		return err;
789 
790 	evsel = evlist__last(evlist);
791 
792 	evsel->core.attr.freq = 0;
793 	evsel->core.attr.sample_period = 1;
794 	evsel->core.attr.text_poke = 1;
795 	evsel->core.attr.ksymbol = 1;
796 
797 	evsel->core.system_wide = true;
798 	evsel->no_aux_samples = true;
799 	evsel->immediate = true;
800 
801 	/* Text poke must be collected on all CPUs */
802 	perf_cpu_map__put(evsel->core.own_cpus);
803 	evsel->core.own_cpus = perf_cpu_map__new(NULL);
804 	perf_cpu_map__put(evsel->core.cpus);
805 	evsel->core.cpus = perf_cpu_map__get(evsel->core.own_cpus);
806 
807 	evsel__set_sample_bit(evsel, TIME);
808 
809 	return 0;
810 }
811 
812 static bool record__kcore_readable(struct machine *machine)
813 {
814 	char kcore[PATH_MAX];
815 	int fd;
816 
817 	scnprintf(kcore, sizeof(kcore), "%s/proc/kcore", machine->root_dir);
818 
819 	fd = open(kcore, O_RDONLY);
820 	if (fd < 0)
821 		return false;
822 
823 	close(fd);
824 
825 	return true;
826 }
827 
828 static int record__kcore_copy(struct machine *machine, struct perf_data *data)
829 {
830 	char from_dir[PATH_MAX];
831 	char kcore_dir[PATH_MAX];
832 	int ret;
833 
834 	snprintf(from_dir, sizeof(from_dir), "%s/proc", machine->root_dir);
835 
836 	ret = perf_data__make_kcore_dir(data, kcore_dir, sizeof(kcore_dir));
837 	if (ret)
838 		return ret;
839 
840 	return kcore_copy(from_dir, kcore_dir);
841 }
842 
843 static int record__mmap_evlist(struct record *rec,
844 			       struct evlist *evlist)
845 {
846 	struct record_opts *opts = &rec->opts;
847 	bool auxtrace_overwrite = opts->auxtrace_snapshot_mode ||
848 				  opts->auxtrace_sample_mode;
849 	char msg[512];
850 
851 	if (opts->affinity != PERF_AFFINITY_SYS)
852 		cpu__setup_cpunode_map();
853 
854 	if (evlist__mmap_ex(evlist, opts->mmap_pages,
855 				 opts->auxtrace_mmap_pages,
856 				 auxtrace_overwrite,
857 				 opts->nr_cblocks, opts->affinity,
858 				 opts->mmap_flush, opts->comp_level) < 0) {
859 		if (errno == EPERM) {
860 			pr_err("Permission error mapping pages.\n"
861 			       "Consider increasing "
862 			       "/proc/sys/kernel/perf_event_mlock_kb,\n"
863 			       "or try again with a smaller value of -m/--mmap_pages.\n"
864 			       "(current value: %u,%u)\n",
865 			       opts->mmap_pages, opts->auxtrace_mmap_pages);
866 			return -errno;
867 		} else {
868 			pr_err("failed to mmap with %d (%s)\n", errno,
869 				str_error_r(errno, msg, sizeof(msg)));
870 			if (errno)
871 				return -errno;
872 			else
873 				return -EINVAL;
874 		}
875 	}
876 	return 0;
877 }
878 
879 static int record__mmap(struct record *rec)
880 {
881 	return record__mmap_evlist(rec, rec->evlist);
882 }
883 
884 static int record__open(struct record *rec)
885 {
886 	char msg[BUFSIZ];
887 	struct evsel *pos;
888 	struct evlist *evlist = rec->evlist;
889 	struct perf_session *session = rec->session;
890 	struct record_opts *opts = &rec->opts;
891 	int rc = 0;
892 
893 	/*
 894 	 * For initial_delay, system-wide, or a hybrid system, we need to add a
 895 	 * dummy event so that we can track PERF_RECORD_MMAP to cover the delay
 896 	 * of waiting or of event synthesis.
897 	 */
898 	if (opts->initial_delay || target__has_cpu(&opts->target) ||
899 	    perf_pmu__has_hybrid()) {
900 		pos = evlist__get_tracking_event(evlist);
901 		if (!evsel__is_dummy_event(pos)) {
902 			/* Set up dummy event. */
903 			if (evlist__add_dummy(evlist))
904 				return -ENOMEM;
905 			pos = evlist__last(evlist);
906 			evlist__set_tracking_event(evlist, pos);
907 		}
908 
909 		/*
 910 		 * Enable the dummy event via enable_on_exec when the process is
 911 		 * forked for initial_delay; enable it immediately for system wide.
912 		 */
913 		if (opts->initial_delay && !pos->immediate &&
914 		    !target__has_cpu(&opts->target))
915 			pos->core.attr.enable_on_exec = 1;
916 		else
917 			pos->immediate = 1;
918 	}
919 
920 	evlist__config(evlist, opts, &callchain_param);
921 
922 	evlist__for_each_entry(evlist, pos) {
923 try_again:
924 		if (evsel__open(pos, pos->core.cpus, pos->core.threads) < 0) {
925 			if (evsel__fallback(pos, errno, msg, sizeof(msg))) {
926 				if (verbose > 0)
927 					ui__warning("%s\n", msg);
928 				goto try_again;
929 			}
930 			if ((errno == EINVAL || errno == EBADF) &&
931 			    pos->core.leader != &pos->core &&
932 			    pos->weak_group) {
 933 				pos = evlist__reset_weak_group(evlist, pos, true);
934 				goto try_again;
935 			}
936 			rc = -errno;
937 			evsel__open_strerror(pos, &opts->target, errno, msg, sizeof(msg));
938 			ui__error("%s\n", msg);
939 			goto out;
940 		}
941 
942 		pos->supported = true;
943 	}
944 
945 	if (symbol_conf.kptr_restrict && !evlist__exclude_kernel(evlist)) {
946 		pr_warning(
947 "WARNING: Kernel address maps (/proc/{kallsyms,modules}) are restricted,\n"
948 "check /proc/sys/kernel/kptr_restrict and /proc/sys/kernel/perf_event_paranoid.\n\n"
949 "Samples in kernel functions may not be resolved if a suitable vmlinux\n"
950 "file is not found in the buildid cache or in the vmlinux path.\n\n"
951 "Samples in kernel modules won't be resolved at all.\n\n"
952 "If some relocation was applied (e.g. kexec) symbols may be misresolved\n"
953 "even with a suitable vmlinux or kallsyms file.\n\n");
954 	}
955 
956 	if (evlist__apply_filters(evlist, &pos)) {
957 		pr_err("failed to set filter \"%s\" on event %s with %d (%s)\n",
958 			pos->filter, evsel__name(pos), errno,
959 			str_error_r(errno, msg, sizeof(msg)));
960 		rc = -1;
961 		goto out;
962 	}
963 
964 	rc = record__mmap(rec);
965 	if (rc)
966 		goto out;
967 
968 	session->evlist = evlist;
969 	perf_session__set_id_hdr_size(session);
970 out:
971 	return rc;
972 }
973 
974 static void set_timestamp_boundary(struct record *rec, u64 sample_time)
975 {
976 	if (rec->evlist->first_sample_time == 0)
977 		rec->evlist->first_sample_time = sample_time;
978 
979 	if (sample_time)
980 		rec->evlist->last_sample_time = sample_time;
981 }
982 
983 static int process_sample_event(struct perf_tool *tool,
984 				union perf_event *event,
985 				struct perf_sample *sample,
986 				struct evsel *evsel,
987 				struct machine *machine)
988 {
989 	struct record *rec = container_of(tool, struct record, tool);
990 
991 	set_timestamp_boundary(rec, sample->time);
992 
993 	if (rec->buildid_all)
994 		return 0;
995 
996 	rec->samples++;
997 	return build_id__mark_dso_hit(tool, event, sample, evsel, machine);
998 }
999 
1000 static int process_buildids(struct record *rec)
1001 {
1002 	struct perf_session *session = rec->session;
1003 
1004 	if (perf_data__size(&rec->data) == 0)
1005 		return 0;
1006 
1007 	/*
1008 	 * During this process, it'll load the kernel map and replace
1009 	 * dso->long_name with the real pathname it found.  In this case
1010 	 * we prefer the vmlinux path like
1011 	 *   /lib/modules/3.16.4/build/vmlinux
1012 	 *
1013 	 * rather than build-id path (in debug directory).
1014 	 *   $HOME/.debug/.build-id/f0/6e17aa50adf4d00b88925e03775de107611551
1015 	 */
1016 	symbol_conf.ignore_vmlinux_buildid = true;
1017 
1018 	/*
1019 	 * If --buildid-all is given, it marks all DSOs regardless of hits,
1020 	 * so there is no need to process samples. But if timestamp_boundary is
1021 	 * enabled, it still needs to walk all samples to get the timestamps of
1022 	 * the first/last samples.
1023 	 */
1024 	if (rec->buildid_all && !rec->timestamp_boundary)
1025 		rec->tool.sample = NULL;
1026 
1027 	return perf_session__process_events(session);
1028 }
1029 
1030 static void perf_event__synthesize_guest_os(struct machine *machine, void *data)
1031 {
1032 	int err;
1033 	struct perf_tool *tool = data;
1034 	/*
1035 	 * As for the guest kernel, when processing the record & report
1036 	 * subcommands we arrange the module mmaps prior to the guest kernel
1037 	 * mmap and trigger a dso preload, because by default guest module
1038 	 * symbols are loaded from guest kallsyms instead of
1039 	 * /lib/modules/XXX/XXX. This avoids missing symbols when the first
1040 	 * address is in a module instead of in the guest kernel.
1041 	 */
1042 	err = perf_event__synthesize_modules(tool, process_synthesized_event,
1043 					     machine);
1044 	if (err < 0)
1045 		pr_err("Couldn't record guest kernel [%d]'s reference"
1046 		       " relocation symbol.\n", machine->pid);
1047 
1048 	/*
1049 	 * We use _stext for the guest kernel because the guest kernel's
1050 	 * /proc/kallsyms sometimes has no _text.
1051 	 */
1052 	err = perf_event__synthesize_kernel_mmap(tool, process_synthesized_event,
1053 						 machine);
1054 	if (err < 0)
1055 		pr_err("Couldn't record guest kernel [%d]'s reference"
1056 		       " relocation symbol.\n", machine->pid);
1057 }
1058 
1059 static struct perf_event_header finished_round_event = {
1060 	.size = sizeof(struct perf_event_header),
1061 	.type = PERF_RECORD_FINISHED_ROUND,
1062 };
1063 
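/*
 * With --affinity=node or --affinity=cpu, move the tool thread onto the CPU
 * mask of the mmap buffer that is about to be drained, if that mask differs
 * from the thread's current one.
 */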
1064 static void record__adjust_affinity(struct record *rec, struct mmap *map)
1065 {
1066 	if (rec->opts.affinity != PERF_AFFINITY_SYS &&
1067 	    !bitmap_equal(rec->affinity_mask.bits, map->affinity_mask.bits,
1068 			  rec->affinity_mask.nbits)) {
1069 		bitmap_zero(rec->affinity_mask.bits, rec->affinity_mask.nbits);
1070 		bitmap_or(rec->affinity_mask.bits, rec->affinity_mask.bits,
1071 			  map->affinity_mask.bits, rec->affinity_mask.nbits);
1072 		sched_setaffinity(0, MMAP_CPU_MASK_BYTES(&rec->affinity_mask),
1073 				  (cpu_set_t *)rec->affinity_mask.bits);
1074 		if (verbose == 2)
1075 			mmap_cpu_mask__scnprintf(&rec->affinity_mask, "thread");
1076 	}
1077 }
1078 
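/*
 * Header callback for zstd_compress_stream_to_records(): with increment == 0
 * it initializes a PERF_RECORD_COMPRESSED header and returns the header size,
 * otherwise it grows the existing header by 'increment' bytes and returns
 * 'increment'.
 */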
1079 static size_t process_comp_header(void *record, size_t increment)
1080 {
1081 	struct perf_record_compressed *event = record;
1082 	size_t size = sizeof(*event);
1083 
1084 	if (increment) {
1085 		event->header.size += increment;
1086 		return increment;
1087 	}
1088 
1089 	event->header.type = PERF_RECORD_COMPRESSED;
1090 	event->header.size = size;
1091 
1092 	return size;
1093 }
1094 
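/*
 * Compress one chunk of trace data into PERF_RECORD_COMPRESSED records and
 * update the per-session transferred/compressed byte counters that are later
 * used to compute the compression ratio.
 */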
1095 static size_t zstd_compress(struct perf_session *session, void *dst, size_t dst_size,
1096 			    void *src, size_t src_size)
1097 {
1098 	size_t compressed;
1099 	size_t max_record_size = PERF_SAMPLE_MAX_SIZE - sizeof(struct perf_record_compressed) - 1;
1100 
1101 	compressed = zstd_compress_stream_to_records(&session->zstd_data, dst, dst_size, src, src_size,
1102 						     max_record_size, process_comp_header);
1103 
1104 	session->bytes_transferred += src_size;
1105 	session->bytes_compressed  += compressed;
1106 
1107 	return compressed;
1108 }
1109 
1110 static int record__mmap_read_evlist(struct record *rec, struct evlist *evlist,
1111 				    bool overwrite, bool synch)
1112 {
1113 	u64 bytes_written = rec->bytes_written;
1114 	int i;
1115 	int rc = 0;
1116 	struct mmap *maps;
1117 	int trace_fd = rec->data.file.fd;
1118 	off_t off = 0;
1119 
1120 	if (!evlist)
1121 		return 0;
1122 
1123 	maps = overwrite ? evlist->overwrite_mmap : evlist->mmap;
1124 	if (!maps)
1125 		return 0;
1126 
1127 	if (overwrite && evlist->bkw_mmap_state != BKW_MMAP_DATA_PENDING)
1128 		return 0;
1129 
1130 	if (record__aio_enabled(rec))
1131 		off = record__aio_get_pos(trace_fd);
1132 
1133 	for (i = 0; i < evlist->core.nr_mmaps; i++) {
1134 		u64 flush = 0;
1135 		struct mmap *map = &maps[i];
1136 
1137 		if (map->core.base) {
1138 			record__adjust_affinity(rec, map);
1139 			if (synch) {
1140 				flush = map->core.flush;
1141 				map->core.flush = 1;
1142 			}
1143 			if (!record__aio_enabled(rec)) {
1144 				if (perf_mmap__push(map, rec, record__pushfn) < 0) {
1145 					if (synch)
1146 						map->core.flush = flush;
1147 					rc = -1;
1148 					goto out;
1149 				}
1150 			} else {
1151 				if (record__aio_push(rec, map, &off) < 0) {
1152 					record__aio_set_pos(trace_fd, off);
1153 					if (synch)
1154 						map->core.flush = flush;
1155 					rc = -1;
1156 					goto out;
1157 				}
1158 			}
1159 			if (synch)
1160 				map->core.flush = flush;
1161 		}
1162 
1163 		if (map->auxtrace_mmap.base && !rec->opts.auxtrace_snapshot_mode &&
1164 		    !rec->opts.auxtrace_sample_mode &&
1165 		    record__auxtrace_mmap_read(rec, map) != 0) {
1166 			rc = -1;
1167 			goto out;
1168 		}
1169 	}
1170 
1171 	if (record__aio_enabled(rec))
1172 		record__aio_set_pos(trace_fd, off);
1173 
1174 	/*
1175 	 * Mark the round finished in case we wrote
1176 	 * at least one event.
1177 	 */
1178 	if (bytes_written != rec->bytes_written)
1179 		rc = record__write(rec, NULL, &finished_round_event, sizeof(finished_round_event));
1180 
1181 	if (overwrite)
1182 		evlist__toggle_bkw_mmap(evlist, BKW_MMAP_EMPTY);
1183 out:
1184 	return rc;
1185 }
1186 
1187 static int record__mmap_read_all(struct record *rec, bool synch)
1188 {
1189 	int err;
1190 
1191 	err = record__mmap_read_evlist(rec, rec->evlist, false, synch);
1192 	if (err)
1193 		return err;
1194 
1195 	return record__mmap_read_evlist(rec, rec->evlist, true, synch);
1196 }
1197 
1198 static void record__init_features(struct record *rec)
1199 {
1200 	struct perf_session *session = rec->session;
1201 	int feat;
1202 
1203 	for (feat = HEADER_FIRST_FEATURE; feat < HEADER_LAST_FEATURE; feat++)
1204 		perf_header__set_feat(&session->header, feat);
1205 
1206 	if (rec->no_buildid)
1207 		perf_header__clear_feat(&session->header, HEADER_BUILD_ID);
1208 
1209 	if (!have_tracepoints(&rec->evlist->core.entries))
1210 		perf_header__clear_feat(&session->header, HEADER_TRACING_DATA);
1211 
1212 	if (!rec->opts.branch_stack)
1213 		perf_header__clear_feat(&session->header, HEADER_BRANCH_STACK);
1214 
1215 	if (!rec->opts.full_auxtrace)
1216 		perf_header__clear_feat(&session->header, HEADER_AUXTRACE);
1217 
1218 	if (!(rec->opts.use_clockid && rec->opts.clockid_res_ns))
1219 		perf_header__clear_feat(&session->header, HEADER_CLOCKID);
1220 
1221 	if (!rec->opts.use_clockid)
1222 		perf_header__clear_feat(&session->header, HEADER_CLOCK_DATA);
1223 
1224 	perf_header__clear_feat(&session->header, HEADER_DIR_FORMAT);
1225 	if (!record__comp_enabled(rec))
1226 		perf_header__clear_feat(&session->header, HEADER_COMPRESSED);
1227 
1228 	perf_header__clear_feat(&session->header, HEADER_STAT);
1229 }
1230 
1231 static void
1232 record__finish_output(struct record *rec)
1233 {
1234 	struct perf_data *data = &rec->data;
1235 	int fd = perf_data__fd(data);
1236 
1237 	if (data->is_pipe)
1238 		return;
1239 
1240 	rec->session->header.data_size += rec->bytes_written;
1241 	data->file.size = lseek(perf_data__fd(data), 0, SEEK_CUR);
1242 
1243 	if (!rec->no_buildid) {
1244 		process_buildids(rec);
1245 
1246 		if (rec->buildid_all)
1247 			dsos__hit_all(rec->session);
1248 	}
1249 	perf_session__write_header(rec->session, rec->evlist, fd, true);
1250 
1251 	return;
1252 }
1253 
1254 static int record__synthesize_workload(struct record *rec, bool tail)
1255 {
1256 	int err;
1257 	struct perf_thread_map *thread_map;
1258 	bool needs_mmap = rec->opts.synth & PERF_SYNTH_MMAP;
1259 
1260 	if (rec->opts.tail_synthesize != tail)
1261 		return 0;
1262 
1263 	thread_map = thread_map__new_by_tid(rec->evlist->workload.pid);
1264 	if (thread_map == NULL)
1265 		return -1;
1266 
1267 	err = perf_event__synthesize_thread_map(&rec->tool, thread_map,
1268 						 process_synthesized_event,
1269 						 &rec->session->machines.host,
1270 						 needs_mmap,
1271 						 rec->opts.sample_address);
1272 	perf_thread_map__put(thread_map);
1273 	return err;
1274 }
1275 
1276 static int record__synthesize(struct record *rec, bool tail);
1277 
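/*
 * Finish the current output file and switch to a new, timestamp-suffixed one
 * (the --switch-output family of options).  When a maximum number of output
 * files is configured, the oldest file is removed and its slot reused.
 */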
1278 static int
1279 record__switch_output(struct record *rec, bool at_exit)
1280 {
1281 	struct perf_data *data = &rec->data;
1282 	int fd, err;
1283 	char *new_filename;
1284 
1285 	/* Same Size:      "2015122520103046"*/
1286 	char timestamp[] = "InvalidTimestamp";
1287 
1288 	record__aio_mmap_read_sync(rec);
1289 
1290 	record__synthesize(rec, true);
1291 	if (target__none(&rec->opts.target))
1292 		record__synthesize_workload(rec, true);
1293 
1294 	rec->samples = 0;
1295 	record__finish_output(rec);
1296 	err = fetch_current_timestamp(timestamp, sizeof(timestamp));
1297 	if (err) {
1298 		pr_err("Failed to get current timestamp\n");
1299 		return -EINVAL;
1300 	}
1301 
1302 	fd = perf_data__switch(data, timestamp,
1303 				    rec->session->header.data_offset,
1304 				    at_exit, &new_filename);
1305 	if (fd >= 0 && !at_exit) {
1306 		rec->bytes_written = 0;
1307 		rec->session->header.data_size = 0;
1308 	}
1309 
1310 	if (!quiet)
1311 		fprintf(stderr, "[ perf record: Dump %s.%s ]\n",
1312 			data->path, timestamp);
1313 
1314 	if (rec->switch_output.num_files) {
1315 		int n = rec->switch_output.cur_file + 1;
1316 
1317 		if (n >= rec->switch_output.num_files)
1318 			n = 0;
1319 		rec->switch_output.cur_file = n;
1320 		if (rec->switch_output.filenames[n]) {
1321 			remove(rec->switch_output.filenames[n]);
1322 			zfree(&rec->switch_output.filenames[n]);
1323 		}
1324 		rec->switch_output.filenames[n] = new_filename;
1325 	} else {
1326 		free(new_filename);
1327 	}
1328 
1329 	/* Output tracking events */
1330 	if (!at_exit) {
1331 		record__synthesize(rec, false);
1332 
1333 		/*
1334 		 * In 'perf record --switch-output' without -a,
1335 		 * record__synthesize() in record__switch_output() won't
1336 		 * generate tracking events because there's no thread_map
1337 		 * in the evlist, so the newly created perf.data would not
1338 		 * contain map and comm information.
1339 		 * Create a fake thread_map and directly call
1340 		 * perf_event__synthesize_thread_map() for those events.
1341 		 */
1342 		if (target__none(&rec->opts.target))
1343 			record__synthesize_workload(rec, false);
1344 	}
1345 	return fd;
1346 }
1347 
1348 static volatile int workload_exec_errno;
1349 
1350 /*
1351  * evlist__prepare_workload will send a SIGUSR1
1352  * if the fork fails, since we asked by setting its
1353  * want_signal to true.
1354  */
1355 static void workload_exec_failed_signal(int signo __maybe_unused,
1356 					siginfo_t *info,
1357 					void *ucontext __maybe_unused)
1358 {
1359 	workload_exec_errno = info->si_value.sival_int;
1360 	done = 1;
1361 	child_finished = 1;
1362 }
1363 
1364 static void snapshot_sig_handler(int sig);
1365 static void alarm_sig_handler(int sig);
1366 
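/*
 * Pick the user page of any mmapped ring buffer; it is only used to read the
 * time conversion parameters for perf_event__synth_time_conv().
 */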
1367 static const struct perf_event_mmap_page *evlist__pick_pc(struct evlist *evlist)
1368 {
1369 	if (evlist) {
1370 		if (evlist->mmap && evlist->mmap[0].core.base)
1371 			return evlist->mmap[0].core.base;
1372 		if (evlist->overwrite_mmap && evlist->overwrite_mmap[0].core.base)
1373 			return evlist->overwrite_mmap[0].core.base;
1374 	}
1375 	return NULL;
1376 }
1377 
1378 static const struct perf_event_mmap_page *record__pick_pc(struct record *rec)
1379 {
1380 	const struct perf_event_mmap_page *pc = evlist__pick_pc(rec->evlist);
1381 	if (pc)
1382 		return pc;
1383 	return NULL;
1384 }
1385 
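/*
 * Synthesize the side-band/metadata events that describe the system state at
 * the start (or, for tail synthesis, the end) of the record session: time
 * conversion, id index, auxtrace info, kernel and module mmaps, guest
 * machines, extra attributes, thread and cpu maps, BPF and cgroup events, and
 * mmap/comm records for already running tasks.
 */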
1386 static int record__synthesize(struct record *rec, bool tail)
1387 {
1388 	struct perf_session *session = rec->session;
1389 	struct machine *machine = &session->machines.host;
1390 	struct perf_data *data = &rec->data;
1391 	struct record_opts *opts = &rec->opts;
1392 	struct perf_tool *tool = &rec->tool;
1393 	int err = 0;
1394 	event_op f = process_synthesized_event;
1395 
1396 	if (rec->opts.tail_synthesize != tail)
1397 		return 0;
1398 
1399 	if (data->is_pipe) {
1400 		err = perf_event__synthesize_for_pipe(tool, session, data,
1401 						      process_synthesized_event);
1402 		if (err < 0)
1403 			goto out;
1404 
1405 		rec->bytes_written += err;
1406 	}
1407 
1408 	err = perf_event__synth_time_conv(record__pick_pc(rec), tool,
1409 					  process_synthesized_event, machine);
1410 	if (err)
1411 		goto out;
1412 
1413 	/* Synthesize id_index before auxtrace_info */
1414 	if (rec->opts.auxtrace_sample_mode || rec->opts.full_auxtrace) {
1415 		err = perf_event__synthesize_id_index(tool,
1416 						      process_synthesized_event,
1417 						      session->evlist, machine);
1418 		if (err)
1419 			goto out;
1420 	}
1421 
1422 	if (rec->opts.full_auxtrace) {
1423 		err = perf_event__synthesize_auxtrace_info(rec->itr, tool,
1424 					session, process_synthesized_event);
1425 		if (err)
1426 			goto out;
1427 	}
1428 
1429 	if (!evlist__exclude_kernel(rec->evlist)) {
1430 		err = perf_event__synthesize_kernel_mmap(tool, process_synthesized_event,
1431 							 machine);
1432 		WARN_ONCE(err < 0, "Couldn't record kernel reference relocation symbol\n"
1433 				   "Symbol resolution may be skewed if relocation was used (e.g. kexec).\n"
1434 				   "Check /proc/kallsyms permission or run as root.\n");
1435 
1436 		err = perf_event__synthesize_modules(tool, process_synthesized_event,
1437 						     machine);
1438 		WARN_ONCE(err < 0, "Couldn't record kernel module information.\n"
1439 				   "Symbol resolution may be skewed if relocation was used (e.g. kexec).\n"
1440 				   "Check /proc/modules permission or run as root.\n");
1441 	}
1442 
1443 	if (perf_guest) {
1444 		machines__process_guests(&session->machines,
1445 					 perf_event__synthesize_guest_os, tool);
1446 	}
1447 
1448 	err = perf_event__synthesize_extra_attr(&rec->tool,
1449 						rec->evlist,
1450 						process_synthesized_event,
1451 						data->is_pipe);
1452 	if (err)
1453 		goto out;
1454 
1455 	err = perf_event__synthesize_thread_map2(&rec->tool, rec->evlist->core.threads,
1456 						 process_synthesized_event,
1457 						NULL);
1458 	if (err < 0) {
1459 		pr_err("Couldn't synthesize thread map.\n");
1460 		return err;
1461 	}
1462 
1463 	err = perf_event__synthesize_cpu_map(&rec->tool, rec->evlist->core.cpus,
1464 					     process_synthesized_event, NULL);
1465 	if (err < 0) {
1466 		pr_err("Couldn't synthesize cpu map.\n");
1467 		return err;
1468 	}
1469 
1470 	err = perf_event__synthesize_bpf_events(session, process_synthesized_event,
1471 						machine, opts);
1472 	if (err < 0)
1473 		pr_warning("Couldn't synthesize bpf events.\n");
1474 
1475 	if (rec->opts.synth & PERF_SYNTH_CGROUP) {
1476 		err = perf_event__synthesize_cgroups(tool, process_synthesized_event,
1477 						     machine);
1478 		if (err < 0)
1479 			pr_warning("Couldn't synthesize cgroup events.\n");
1480 	}
1481 
1482 	if (rec->opts.nr_threads_synthesize > 1) {
1483 		perf_set_multithreaded();
1484 		f = process_locked_synthesized_event;
1485 	}
1486 
1487 	if (rec->opts.synth & PERF_SYNTH_TASK) {
1488 		bool needs_mmap = rec->opts.synth & PERF_SYNTH_MMAP;
1489 
1490 		err = __machine__synthesize_threads(machine, tool, &opts->target,
1491 						    rec->evlist->core.threads,
1492 						    f, needs_mmap, opts->sample_address,
1493 						    rec->opts.nr_threads_synthesize);
1494 	}
1495 
1496 	if (rec->opts.nr_threads_synthesize > 1)
1497 		perf_set_singlethreaded();
1498 
1499 out:
1500 	return err;
1501 }
1502 
1503 static int record__process_signal_event(union perf_event *event __maybe_unused, void *data)
1504 {
1505 	struct record *rec = data;
1506 	pthread_kill(rec->thread_id, SIGUSR2);
1507 	return 0;
1508 }
1509 
1510 static int record__setup_sb_evlist(struct record *rec)
1511 {
1512 	struct record_opts *opts = &rec->opts;
1513 
1514 	if (rec->sb_evlist != NULL) {
1515 		/*
1516 		 * We get here if --switch-output-event populated the
1517 		 * sb_evlist, so associate a callback that will send a SIGUSR2
1518 		 * to the main thread.
1519 		 */
1520 		evlist__set_cb(rec->sb_evlist, record__process_signal_event, rec);
1521 		rec->thread_id = pthread_self();
1522 	}
1523 #ifdef HAVE_LIBBPF_SUPPORT
1524 	if (!opts->no_bpf_event) {
1525 		if (rec->sb_evlist == NULL) {
1526 			rec->sb_evlist = evlist__new();
1527 
1528 			if (rec->sb_evlist == NULL) {
1529 				pr_err("Couldn't create side band evlist.\n");
1530 				return -1;
1531 			}
1532 		}
1533 
1534 		if (evlist__add_bpf_sb_event(rec->sb_evlist, &rec->session->header.env)) {
1535 			pr_err("Couldn't ask for PERF_RECORD_BPF_EVENT side band events.\n");
1536 			return -1;
1537 		}
1538 	}
1539 #endif
1540 	if (evlist__start_sb_thread(rec->sb_evlist, &rec->opts.target)) {
1541 		pr_debug("Couldn't start the BPF side band thread:\nBPF programs starting from now on won't be annotatable\n");
1542 		opts->no_bpf_event = true;
1543 	}
1544 
1545 	return 0;
1546 }
1547 
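/*
 * When --clockid is used, store the selected clockid, its resolution and a
 * pair of reference timestamps (gettimeofday() and clock_gettime()) in the
 * session header for the HEADER_CLOCK_DATA feature, so that perf clock
 * timestamps can later be related to wall-clock time.
 */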
1548 static int record__init_clock(struct record *rec)
1549 {
1550 	struct perf_session *session = rec->session;
1551 	struct timespec ref_clockid;
1552 	struct timeval ref_tod;
1553 	u64 ref;
1554 
1555 	if (!rec->opts.use_clockid)
1556 		return 0;
1557 
1558 	if (rec->opts.use_clockid && rec->opts.clockid_res_ns)
1559 		session->header.env.clock.clockid_res_ns = rec->opts.clockid_res_ns;
1560 
1561 	session->header.env.clock.clockid = rec->opts.clockid;
1562 
1563 	if (gettimeofday(&ref_tod, NULL) != 0) {
1564 		pr_err("gettimeofday failed, cannot set reference time.\n");
1565 		return -1;
1566 	}
1567 
1568 	if (clock_gettime(rec->opts.clockid, &ref_clockid)) {
1569 		pr_err("clock_gettime failed, cannot set reference time.\n");
1570 		return -1;
1571 	}
1572 
1573 	ref = (u64) ref_tod.tv_sec * NSEC_PER_SEC +
1574 	      (u64) ref_tod.tv_usec * NSEC_PER_USEC;
1575 
1576 	session->header.env.clock.tod_ns = ref;
1577 
1578 	ref = (u64) ref_clockid.tv_sec * NSEC_PER_SEC +
1579 	      (u64) ref_clockid.tv_nsec;
1580 
1581 	session->header.env.clock.clockid_ns = ref;
1582 	return 0;
1583 }
1584 
1585 static void hit_auxtrace_snapshot_trigger(struct record *rec)
1586 {
1587 	if (trigger_is_ready(&auxtrace_snapshot_trigger)) {
1588 		trigger_hit(&auxtrace_snapshot_trigger);
1589 		auxtrace_record__snapshot_started = 1;
1590 		if (auxtrace_record__snapshot_start(rec->itr))
1591 			trigger_error(&auxtrace_snapshot_trigger);
1592 	}
1593 }
1594 
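/*
 * On hybrid systems, rewrite bare event names as "<pmu>/<event>/" so that the
 * same event opened on different core PMUs stays distinguishable in the
 * output.
 */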
1595 static void record__uniquify_name(struct record *rec)
1596 {
1597 	struct evsel *pos;
1598 	struct evlist *evlist = rec->evlist;
1599 	char *new_name;
1600 	int ret;
1601 
1602 	if (!perf_pmu__has_hybrid())
1603 		return;
1604 
1605 	evlist__for_each_entry(evlist, pos) {
1606 		if (!evsel__is_hybrid(pos))
1607 			continue;
1608 
1609 		if (strchr(pos->name, '/'))
1610 			continue;
1611 
1612 		ret = asprintf(&new_name, "%s/%s/",
1613 			       pos->pmu_name, pos->name);
1614 		if (ret >= 0) {
1615 			free(pos->name);
1616 			pos->name = new_name;
1617 		}
1618 	}
1619 }
1620 
1621 static int __cmd_record(struct record *rec, int argc, const char **argv)
1622 {
1623 	int err;
1624 	int status = 0;
1625 	unsigned long waking = 0;
1626 	const bool forks = argc > 0;
1627 	struct perf_tool *tool = &rec->tool;
1628 	struct record_opts *opts = &rec->opts;
1629 	struct perf_data *data = &rec->data;
1630 	struct perf_session *session;
1631 	bool disabled = false, draining = false;
1632 	int fd;
1633 	float ratio = 0;
1634 	enum evlist_ctl_cmd cmd = EVLIST_CTL_CMD_UNSUPPORTED;
1635 
1636 	atexit(record__sig_exit);
1637 	signal(SIGCHLD, sig_handler);
1638 	signal(SIGINT, sig_handler);
1639 	signal(SIGTERM, sig_handler);
1640 	signal(SIGSEGV, sigsegv_handler);
1641 
1642 	if (rec->opts.record_namespaces)
1643 		tool->namespace_events = true;
1644 
1645 	if (rec->opts.record_cgroup) {
1646 #ifdef HAVE_FILE_HANDLE
1647 		tool->cgroup_events = true;
1648 #else
1649 		pr_err("cgroup tracking is not supported\n");
1650 		return -1;
1651 #endif
1652 	}
1653 
1654 	if (rec->opts.auxtrace_snapshot_mode || rec->switch_output.enabled) {
1655 		signal(SIGUSR2, snapshot_sig_handler);
1656 		if (rec->opts.auxtrace_snapshot_mode)
1657 			trigger_on(&auxtrace_snapshot_trigger);
1658 		if (rec->switch_output.enabled)
1659 			trigger_on(&switch_output_trigger);
1660 	} else {
1661 		signal(SIGUSR2, SIG_IGN);
1662 	}
1663 
1664 	session = perf_session__new(data, tool);
1665 	if (IS_ERR(session)) {
1666 		pr_err("Perf session creation failed.\n");
1667 		return PTR_ERR(session);
1668 	}
1669 
1670 	fd = perf_data__fd(data);
1671 	rec->session = session;
1672 
1673 	if (zstd_init(&session->zstd_data, rec->opts.comp_level) < 0) {
1674 		pr_err("Compression initialization failed.\n");
1675 		return -1;
1676 	}
1677 #ifdef HAVE_EVENTFD_SUPPORT
1678 	done_fd = eventfd(0, EFD_NONBLOCK);
1679 	if (done_fd < 0) {
1680 		pr_err("Failed to create wakeup eventfd, error: %m\n");
1681 		status = -1;
1682 		goto out_delete_session;
1683 	}
1684 	err = evlist__add_wakeup_eventfd(rec->evlist, done_fd);
1685 	if (err < 0) {
1686 		pr_err("Failed to add wakeup eventfd to poll list\n");
1687 		status = err;
1688 		goto out_delete_session;
1689 	}
1690 #endif // HAVE_EVENTFD_SUPPORT
1691 
1692 	session->header.env.comp_type  = PERF_COMP_ZSTD;
1693 	session->header.env.comp_level = rec->opts.comp_level;
1694 
1695 	if (rec->opts.kcore &&
1696 	    !record__kcore_readable(&session->machines.host)) {
1697 		pr_err("ERROR: kcore is not readable.\n");
1698 		return -1;
1699 	}
1700 
1701 	if (record__init_clock(rec))
1702 		return -1;
1703 
1704 	record__init_features(rec);
1705 
1706 	if (forks) {
1707 		err = evlist__prepare_workload(rec->evlist, &opts->target, argv, data->is_pipe,
1708 					       workload_exec_failed_signal);
1709 		if (err < 0) {
1710 			pr_err("Couldn't run the workload!\n");
1711 			status = err;
1712 			goto out_delete_session;
1713 		}
1714 	}
1715 
1716 	/*
1717 	 * If we have just a single event and are sending data
1718 	 * through a pipe, we need to force sample id allocation,
1719 	 * because we synthesize the event name through the pipe
1720 	 * and need the id for that.
1721 	 */
1722 	if (data->is_pipe && rec->evlist->core.nr_entries == 1)
1723 		rec->opts.sample_id = true;
1724 
1725 	record__uniquify_name(rec);
1726 
1727 	if (record__open(rec) != 0) {
1728 		err = -1;
1729 		goto out_child;
1730 	}
1731 	session->header.env.comp_mmap_len = session->evlist->core.mmap_len;
1732 
1733 	if (rec->opts.kcore) {
1734 		err = record__kcore_copy(&session->machines.host, data);
1735 		if (err) {
1736 			pr_err("ERROR: Failed to copy kcore\n");
1737 			goto out_child;
1738 		}
1739 	}
1740 
1741 	err = bpf__apply_obj_config();
1742 	if (err) {
1743 		char errbuf[BUFSIZ];
1744 
1745 		bpf__strerror_apply_obj_config(err, errbuf, sizeof(errbuf));
1746 		pr_err("ERROR: Apply config to BPF failed: %s\n",
1747 			 errbuf);
1748 		goto out_child;
1749 	}
1750 
1751 	/*
1752 	 * Normally perf_session__new would do this, but it doesn't have the
1753 	 * evlist.
1754 	 */
1755 	if (rec->tool.ordered_events && !evlist__sample_id_all(rec->evlist)) {
1756 		pr_warning("WARNING: No sample_id_all support, falling back to unordered processing\n");
1757 		rec->tool.ordered_events = false;
1758 	}
1759 
1760 	if (!rec->evlist->core.nr_groups)
1761 		perf_header__clear_feat(&session->header, HEADER_GROUP_DESC);
1762 
1763 	if (data->is_pipe) {
1764 		err = perf_header__write_pipe(fd);
1765 		if (err < 0)
1766 			goto out_child;
1767 	} else {
1768 		err = perf_session__write_header(session, rec->evlist, fd, false);
1769 		if (err < 0)
1770 			goto out_child;
1771 	}
1772 
1773 	err = -1;
1774 	if (!rec->no_buildid
1775 	    && !perf_header__has_feat(&session->header, HEADER_BUILD_ID)) {
1776 		pr_err("Couldn't generate buildids. "
1777 		       "Use --no-buildid to profile anyway.\n");
1778 		goto out_child;
1779 	}
1780 
1781 	err = record__setup_sb_evlist(rec);
1782 	if (err)
1783 		goto out_child;
1784 
1785 	err = record__synthesize(rec, false);
1786 	if (err < 0)
1787 		goto out_child;
1788 
1789 	if (rec->realtime_prio) {
1790 		struct sched_param param;
1791 
1792 		param.sched_priority = rec->realtime_prio;
1793 		if (sched_setscheduler(0, SCHED_FIFO, &param)) {
1794 			pr_err("Could not set realtime priority.\n");
1795 			err = -1;
1796 			goto out_child;
1797 		}
1798 	}
1799 
1800 	/*
1801 	 * When perf is starting the traced process, all the events
1802 	 * (apart from group members) have enable_on_exec=1 set,
1803 	 * so don't spoil it by prematurely enabling them.
1804 	 */
1805 	if (!target__none(&opts->target) && !opts->initial_delay)
1806 		evlist__enable(rec->evlist);
1807 
1808 	/*
1809 	 * Let the child rip
1810 	 */
1811 	if (forks) {
1812 		struct machine *machine = &session->machines.host;
1813 		union perf_event *event;
1814 		pid_t tgid;
1815 
1816 		event = malloc(sizeof(event->comm) + machine->id_hdr_size);
1817 		if (event == NULL) {
1818 			err = -ENOMEM;
1819 			goto out_child;
1820 		}
1821 
1822 		/*
1823 		 * Some H/W events are generated before the COMM event,
1824 		 * which is emitted during exec(), so perf script
1825 		 * cannot see a correct process name for those events.
1826 		 * Synthesize a COMM event to prevent that.
1827 		 */
1828 		tgid = perf_event__synthesize_comm(tool, event,
1829 						   rec->evlist->workload.pid,
1830 						   process_synthesized_event,
1831 						   machine);
1832 		free(event);
1833 
1834 		if (tgid == -1)
1835 			goto out_child;
1836 
1837 		event = malloc(sizeof(event->namespaces) +
1838 			       (NR_NAMESPACES * sizeof(struct perf_ns_link_info)) +
1839 			       machine->id_hdr_size);
1840 		if (event == NULL) {
1841 			err = -ENOMEM;
1842 			goto out_child;
1843 		}
1844 
1845 		/*
1846 		 * Synthesize NAMESPACES event for the command specified.
1847 		 */
1848 		perf_event__synthesize_namespaces(tool, event,
1849 						  rec->evlist->workload.pid,
1850 						  tgid, process_synthesized_event,
1851 						  machine);
1852 		free(event);
1853 
1854 		evlist__start_workload(rec->evlist);
1855 	}
1856 
1857 	if (evlist__initialize_ctlfd(rec->evlist, opts->ctl_fd, opts->ctl_fd_ack))
1858 		goto out_child;
1859 
1860 	if (opts->initial_delay) {
1861 		pr_info(EVLIST_DISABLED_MSG);
1862 		if (opts->initial_delay > 0) {
1863 			usleep(opts->initial_delay * USEC_PER_MSEC);
1864 			evlist__enable(rec->evlist);
1865 			pr_info(EVLIST_ENABLED_MSG);
1866 		}
1867 	}
1868 
1869 	trigger_ready(&auxtrace_snapshot_trigger);
1870 	trigger_ready(&switch_output_trigger);
1871 	perf_hooks__invoke_record_start();
1872 	for (;;) {
1873 		unsigned long long hits = rec->samples;
1874 
1875 		/*
1876 		 * rec->evlist->bkw_mmap_state can be
1877 		 * BKW_MMAP_EMPTY here: when done == true and
1878 		 * hits != rec->samples in the previous round.
1879 		 *
1880 		 * evlist__toggle_bkw_mmap() ensures we never
1881 		 * convert BKW_MMAP_EMPTY to BKW_MMAP_DATA_PENDING.
1882 		 */
1883 		if (trigger_is_hit(&switch_output_trigger) || done || draining)
1884 			evlist__toggle_bkw_mmap(rec->evlist, BKW_MMAP_DATA_PENDING);
1885 
1886 		if (record__mmap_read_all(rec, false) < 0) {
1887 			trigger_error(&auxtrace_snapshot_trigger);
1888 			trigger_error(&switch_output_trigger);
1889 			err = -1;
1890 			goto out_child;
1891 		}
1892 
1893 		if (auxtrace_record__snapshot_started) {
1894 			auxtrace_record__snapshot_started = 0;
1895 			if (!trigger_is_error(&auxtrace_snapshot_trigger))
1896 				record__read_auxtrace_snapshot(rec, false);
1897 			if (trigger_is_error(&auxtrace_snapshot_trigger)) {
1898 				pr_err("AUX area tracing snapshot failed\n");
1899 				err = -1;
1900 				goto out_child;
1901 			}
1902 		}
1903 
1904 		if (trigger_is_hit(&switch_output_trigger)) {
1905 			/*
1906 			 * If switch_output_trigger is hit, the data in the
1907 			 * overwritable ring buffer should have been collected,
1908 			 * so bkw_mmap_state should be set to BKW_MMAP_EMPTY.
1909 			 *
1910 			 * If SIGUSR2 was raised after or during record__mmap_read_all(),
1911 			 * record__mmap_read_all() didn't collect data from the
1912 			 * overwritable ring buffer. Read again.
1913 			 */
1914 			if (rec->evlist->bkw_mmap_state == BKW_MMAP_RUNNING)
1915 				continue;
1916 			trigger_ready(&switch_output_trigger);
1917 
1918 			/*
1919 			 * Re-enable events in the overwrite ring buffer after
1920 			 * record__mmap_read_all(): we should have collected
1921 			 * data from it.
1922 			 */
1923 			evlist__toggle_bkw_mmap(rec->evlist, BKW_MMAP_RUNNING);
1924 
1925 			if (!quiet)
1926 				fprintf(stderr, "[ perf record: dump data: Woken up %ld times ]\n",
1927 					waking);
1928 			waking = 0;
1929 			fd = record__switch_output(rec, false);
1930 			if (fd < 0) {
1931 				pr_err("Failed to switch to new file\n");
1932 				trigger_error(&switch_output_trigger);
1933 				err = fd;
1934 				goto out_child;
1935 			}
1936 
1937 			/* re-arm the alarm */
1938 			if (rec->switch_output.time)
1939 				alarm(rec->switch_output.time);
1940 		}
1941 
1942 		if (hits == rec->samples) {
1943 			if (done || draining)
1944 				break;
1945 			err = evlist__poll(rec->evlist, -1);
1946 			/*
1947 			 * Propagate the error only if there is one. Ignore a positive
1948 			 * number of returned events and the interrupt error (EINTR).
1949 			 */
1950 			if (err > 0 || (err < 0 && errno == EINTR))
1951 				err = 0;
1952 			waking++;
1953 
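			/*
			 * No pollable fds left (all returned POLLERR/POLLHUP,
			 * e.g. the traced task exited): switch to draining so
			 * the remaining buffered data is flushed and the loop
			 * can terminate.
			 */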
1954 			if (evlist__filter_pollfd(rec->evlist, POLLERR | POLLHUP) == 0)
1955 				draining = true;
1956 		}
1957 
1958 		if (evlist__ctlfd_process(rec->evlist, &cmd) > 0) {
1959 			switch (cmd) {
1960 			case EVLIST_CTL_CMD_SNAPSHOT:
1961 				hit_auxtrace_snapshot_trigger(rec);
1962 				evlist__ctlfd_ack(rec->evlist);
1963 				break;
1964 			case EVLIST_CTL_CMD_STOP:
1965 				done = 1;
1966 				break;
1967 			case EVLIST_CTL_CMD_ACK:
1968 			case EVLIST_CTL_CMD_UNSUPPORTED:
1969 			case EVLIST_CTL_CMD_ENABLE:
1970 			case EVLIST_CTL_CMD_DISABLE:
1971 			case EVLIST_CTL_CMD_EVLIST:
1972 			case EVLIST_CTL_CMD_PING:
1973 			default:
1974 				break;
1975 			}
1976 		}
1977 
1978 		/*
1979 		 * When perf started the traced process itself, the events die
1980 		 * with the process at the end and we wait for that. Thus there
1981 		 * is no need to disable events in this case.
1982 		 */
1983 		if (done && !disabled && !target__none(&opts->target)) {
1984 			trigger_off(&auxtrace_snapshot_trigger);
1985 			evlist__disable(rec->evlist);
1986 			disabled = true;
1987 		}
1988 	}
1989 
1990 	trigger_off(&auxtrace_snapshot_trigger);
1991 	trigger_off(&switch_output_trigger);
1992 
1993 	if (opts->auxtrace_snapshot_on_exit)
1994 		record__auxtrace_snapshot_exit(rec);
1995 
1996 	if (forks && workload_exec_errno) {
1997 		char msg[STRERR_BUFSIZE], strevsels[2048];
1998 		const char *emsg = str_error_r(workload_exec_errno, msg, sizeof(msg));
1999 
2000 		evlist__scnprintf_evsels(rec->evlist, sizeof(strevsels), strevsels);
2001 
2002 		pr_err("Failed to collect '%s' for the '%s' workload: %s\n",
2003 			strevsels, argv[0], emsg);
2004 		err = -1;
2005 		goto out_child;
2006 	}
2007 
2008 	if (!quiet)
2009 		fprintf(stderr, "[ perf record: Woken up %ld times to write data ]\n", waking);
2010 
2011 	if (target__none(&rec->opts.target))
2012 		record__synthesize_workload(rec, true);
2013 
2014 out_child:
2015 	evlist__finalize_ctlfd(rec->evlist);
2016 	record__mmap_read_all(rec, true);
2017 	record__aio_mmap_read_sync(rec);
2018 
2019 	if (rec->session->bytes_transferred && rec->session->bytes_compressed) {
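	/*
	 * If compression was used, store the achieved ratio (rounded to the
	 * nearest integer by the +0.5) in the header environment so it can be
	 * reported later.
	 */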
2020 		ratio = (float)rec->session->bytes_transferred/(float)rec->session->bytes_compressed;
2021 		session->header.env.comp_ratio = ratio + 0.5;
2022 	}
2023 
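	/*
	 * When perf forked the workload itself, reap it here: a recording
	 * error takes precedence, otherwise propagate the child's exit status
	 * or remember the signal that terminated it.
	 */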
2024 	if (forks) {
2025 		int exit_status;
2026 
2027 		if (!child_finished)
2028 			kill(rec->evlist->workload.pid, SIGTERM);
2029 
2030 		wait(&exit_status);
2031 
2032 		if (err < 0)
2033 			status = err;
2034 		else if (WIFEXITED(exit_status))
2035 			status = WEXITSTATUS(exit_status);
2036 		else if (WIFSIGNALED(exit_status))
2037 			signr = WTERMSIG(exit_status);
2038 	} else
2039 		status = err;
2040 
2041 	record__synthesize(rec, true);
2042 	/* this will be recalculated during process_buildids() */
2043 	rec->samples = 0;
2044 
2045 	if (!err) {
2046 		if (!rec->timestamp_filename) {
2047 			record__finish_output(rec);
2048 		} else {
2049 			fd = record__switch_output(rec, true);
2050 			if (fd < 0) {
2051 				status = fd;
2052 				goto out_delete_session;
2053 			}
2054 		}
2055 	}
2056 
2057 	perf_hooks__invoke_record_end();
2058 
2059 	if (!err && !quiet) {
2060 		char samples[128];
2061 		const char *postfix = rec->timestamp_filename ?
2062 					".<timestamp>" : "";
2063 
2064 		if (rec->samples && !rec->opts.full_auxtrace)
2065 			scnprintf(samples, sizeof(samples),
2066 				  " (%" PRIu64 " samples)", rec->samples);
2067 		else
2068 			samples[0] = '\0';
2069 
2070 		fprintf(stderr,	"[ perf record: Captured and wrote %.3f MB %s%s%s",
2071 			perf_data__size(data) / 1024.0 / 1024.0,
2072 			data->path, postfix, samples);
2073 		if (ratio) {
2074 			fprintf(stderr,	", compressed (original %.3f MB, ratio is %.3f)",
2075 					rec->session->bytes_transferred / 1024.0 / 1024.0,
2076 					ratio);
2077 		}
2078 		fprintf(stderr, " ]\n");
2079 	}
2080 
2081 out_delete_session:
2082 #ifdef HAVE_EVENTFD_SUPPORT
2083 	if (done_fd >= 0)
2084 		close(done_fd);
2085 #endif
2086 	zstd_fini(&session->zstd_data);
2087 	perf_session__delete(session);
2088 
2089 	if (!opts->no_bpf_event)
2090 		evlist__stop_sb_thread(rec->sb_evlist);
2091 	return status;
2092 }
2093 
2094 static void callchain_debug(struct callchain_param *callchain)
2095 {
2096 	static const char *str[CALLCHAIN_MAX] = { "NONE", "FP", "DWARF", "LBR" };
2097 
2098 	pr_debug("callchain: type %s\n", str[callchain->record_mode]);
2099 
2100 	if (callchain->record_mode == CALLCHAIN_DWARF)
2101 		pr_debug("callchain: stack dump size %d\n",
2102 			 callchain->dump_size);
2103 }
2104 
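/*
 * Parse the --call-graph argument: "fp", "dwarf[,dump_size]" or "lbr", per
 * the "record_mode[,record_size]" option help below. DWARF unwinding
 * additionally enables data address sampling (see below).
 */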
2105 int record_opts__parse_callchain(struct record_opts *record,
2106 				 struct callchain_param *callchain,
2107 				 const char *arg, bool unset)
2108 {
2109 	int ret;
2110 	callchain->enabled = !unset;
2111 
2112 	/* --no-call-graph */
2113 	if (unset) {
2114 		callchain->record_mode = CALLCHAIN_NONE;
2115 		pr_debug("callchain: disabled\n");
2116 		return 0;
2117 	}
2118 
2119 	ret = parse_callchain_record_opt(arg, callchain);
2120 	if (!ret) {
2121 		/* Enable data address sampling for DWARF unwind. */
2122 		if (callchain->record_mode == CALLCHAIN_DWARF)
2123 			record->sample_address = true;
2124 		callchain_debug(callchain);
2125 	}
2126 
2127 	return ret;
2128 }
2129 
2130 int record_parse_callchain_opt(const struct option *opt,
2131 			       const char *arg,
2132 			       int unset)
2133 {
2134 	return record_opts__parse_callchain(opt->value, &callchain_param, arg, unset);
2135 }
2136 
2137 int record_callchain_opt(const struct option *opt,
2138 			 const char *arg __maybe_unused,
2139 			 int unset __maybe_unused)
2140 {
2141 	struct callchain_param *callchain = opt->value;
2142 
2143 	callchain->enabled = true;
2144 
2145 	if (callchain->record_mode == CALLCHAIN_NONE)
2146 		callchain->record_mode = CALLCHAIN_FP;
2147 
2148 	callchain_debug(callchain);
2149 	return 0;
2150 }
2151 
2152 static int perf_record_config(const char *var, const char *value, void *cb)
2153 {
2154 	struct record *rec = cb;
2155 
2156 	if (!strcmp(var, "record.build-id")) {
2157 		if (!strcmp(value, "cache"))
2158 			rec->no_buildid_cache = false;
2159 		else if (!strcmp(value, "no-cache"))
2160 			rec->no_buildid_cache = true;
2161 		else if (!strcmp(value, "skip"))
2162 			rec->no_buildid = true;
2163 		else if (!strcmp(value, "mmap"))
2164 			rec->buildid_mmap = true;
2165 		else
2166 			return -1;
2167 		return 0;
2168 	}
2169 	if (!strcmp(var, "record.call-graph")) {
2170 		var = "call-graph.record-mode";
2171 		return perf_default_config(var, value, cb);
2172 	}
2173 #ifdef HAVE_AIO_SUPPORT
2174 	if (!strcmp(var, "record.aio")) {
2175 		rec->opts.nr_cblocks = strtol(value, NULL, 0);
2176 		if (!rec->opts.nr_cblocks)
2177 			rec->opts.nr_cblocks = nr_cblocks_default;
2178 	}
2179 #endif
2180 
2181 	return 0;
2182 }
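/*
 * Illustrative ~/.perfconfig snippet handled by perf_record_config() above
 * (values are examples, not defaults):
 *
 *	[record]
 *		build-id = cache	; or no-cache, skip, mmap
 *		call-graph = dwarf	; forwarded as call-graph.record-mode
 *		aio = 4			; only in HAVE_AIO_SUPPORT builds
 */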
2183 
2184 
2185 static int record__parse_affinity(const struct option *opt, const char *str, int unset)
2186 {
2187 	struct record_opts *opts = (struct record_opts *)opt->value;
2188 
2189 	if (unset || !str)
2190 		return 0;
2191 
2192 	if (!strcasecmp(str, "node"))
2193 		opts->affinity = PERF_AFFINITY_NODE;
2194 	else if (!strcasecmp(str, "cpu"))
2195 		opts->affinity = PERF_AFFINITY_CPU;
2196 
2197 	return 0;
2198 }
2199 
2200 static int parse_output_max_size(const struct option *opt,
2201 				 const char *str, int unset)
2202 {
2203 	unsigned long *s = (unsigned long *)opt->value;
2204 	static struct parse_tag tags_size[] = {
2205 		{ .tag  = 'B', .mult = 1       },
2206 		{ .tag  = 'K', .mult = 1 << 10 },
2207 		{ .tag  = 'M', .mult = 1 << 20 },
2208 		{ .tag  = 'G', .mult = 1 << 30 },
2209 		{ .tag  = 0 },
2210 	};
2211 	unsigned long val;
2212 
2213 	if (unset) {
2214 		*s = 0;
2215 		return 0;
2216 	}
2217 
2218 	val = parse_tag_value(str, tags_size);
2219 	if (val != (unsigned long) -1) {
2220 		*s = val;
2221 		return 0;
2222 	}
2223 
2224 	return -1;
2225 }
2226 
2227 static int record__parse_mmap_pages(const struct option *opt,
2228 				    const char *str,
2229 				    int unset __maybe_unused)
2230 {
2231 	struct record_opts *opts = opt->value;
2232 	char *s, *p;
2233 	unsigned int mmap_pages;
2234 	int ret;
2235 
2236 	if (!str)
2237 		return -EINVAL;
2238 
2239 	s = strdup(str);
2240 	if (!s)
2241 		return -ENOMEM;
2242 
2243 	p = strchr(s, ',');
2244 	if (p)
2245 		*p = '\0';
2246 
2247 	if (*s) {
2248 		ret = __evlist__parse_mmap_pages(&mmap_pages, s);
2249 		if (ret)
2250 			goto out_free;
2251 		opts->mmap_pages = mmap_pages;
2252 	}
2253 
2254 	if (!p) {
2255 		ret = 0;
2256 		goto out_free;
2257 	}
2258 
2259 	ret = __evlist__parse_mmap_pages(&mmap_pages, p + 1);
2260 	if (ret)
2261 		goto out_free;
2262 
2263 	opts->auxtrace_mmap_pages = mmap_pages;
2264 
2265 out_free:
2266 	free(s);
2267 	return ret;
2268 }
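/*
 * Example of what the parser above accepts: "-m 512,2048" uses 512 mmap data
 * pages and 2048 AUX area tracing mmap pages; a single value sets only the
 * data pages.
 */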
2269 
2270 static int parse_control_option(const struct option *opt,
2271 				const char *str,
2272 				int unset __maybe_unused)
2273 {
2274 	struct record_opts *opts = opt->value;
2275 
2276 	return evlist__parse_control(str, &opts->ctl_fd, &opts->ctl_fd_ack, &opts->ctl_fd_close);
2277 }
2278 
2279 static void switch_output_size_warn(struct record *rec)
2280 {
2281 	u64 wakeup_size = evlist__mmap_size(rec->opts.mmap_pages);
2282 	struct switch_output *s = &rec->switch_output;
2283 
2284 	wakeup_size /= 2;
2285 
2286 	if (s->size < wakeup_size) {
2287 		char buf[100];
2288 
2289 		unit_number__scnprintf(buf, sizeof(buf), wakeup_size);
2290 		pr_warning("WARNING: switch-output data size lower than "
2291 			   "wakeup kernel buffer size (%s), "
2292 			   "expect bigger perf.data sizes\n", buf);
2293 	}
2294 }
2295 
2296 static int switch_output_setup(struct record *rec)
2297 {
2298 	struct switch_output *s = &rec->switch_output;
2299 	static struct parse_tag tags_size[] = {
2300 		{ .tag  = 'B', .mult = 1       },
2301 		{ .tag  = 'K', .mult = 1 << 10 },
2302 		{ .tag  = 'M', .mult = 1 << 20 },
2303 		{ .tag  = 'G', .mult = 1 << 30 },
2304 		{ .tag  = 0 },
2305 	};
2306 	static struct parse_tag tags_time[] = {
2307 		{ .tag  = 's', .mult = 1        },
2308 		{ .tag  = 'm', .mult = 60       },
2309 		{ .tag  = 'h', .mult = 60*60    },
2310 		{ .tag  = 'd', .mult = 60*60*24 },
2311 		{ .tag  = 0 },
2312 	};
2313 	unsigned long val;
2314 
2315 	/*
2316 	 * If we're using --switch-output-events, then we imply
2317 	 * --switch-output=signal, as we'll send a SIGUSR2 from the sideband
2318 	 * thread to its parent.
2319 	 */
2320 	if (rec->switch_output_event_set)
2321 		goto do_signal;
2322 
2323 	if (!s->set)
2324 		return 0;
2325 
2326 	if (!strcmp(s->str, "signal")) {
2327 do_signal:
2328 		s->signal = true;
2329 		pr_debug("switch-output with SIGUSR2 signal\n");
2330 		goto enabled;
2331 	}
2332 
2333 	val = parse_tag_value(s->str, tags_size);
2334 	if (val != (unsigned long) -1) {
2335 		s->size = val;
2336 		pr_debug("switch-output with %s size threshold\n", s->str);
2337 		goto enabled;
2338 	}
2339 
2340 	val = parse_tag_value(s->str, tags_time);
2341 	if (val != (unsigned long) -1) {
2342 		s->time = val;
2343 		pr_debug("switch-output with %s time threshold (%lu seconds)\n",
2344 			 s->str, s->time);
2345 		goto enabled;
2346 	}
2347 
2348 	return -1;
2349 
2350 enabled:
2351 	rec->timestamp_filename = true;
2352 	s->enabled              = true;
2353 
2354 	if (s->size && !rec->opts.no_buffering)
2355 		switch_output_size_warn(rec);
2356 
2357 	return 0;
2358 }
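/*
 * Accepted --switch-output arguments, per the parsing above: "signal"
 * (rotate on SIGUSR2), a size such as "100M" (B/K/M/G suffixes) or a time
 * such as "30s" (s/m/h/d suffixes). Any of them also implies
 * --timestamp-filename.
 */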
2359 
2360 static const char * const __record_usage[] = {
2361 	"perf record [<options>] [<command>]",
2362 	"perf record [<options>] -- <command> [<options>]",
2363 	NULL
2364 };
2365 const char * const *record_usage = __record_usage;
2366 
2367 static int build_id__process_mmap(struct perf_tool *tool, union perf_event *event,
2368 				  struct perf_sample *sample, struct machine *machine)
2369 {
2370 	/*
2371 	 * We already have the kernel maps, put in place via perf_session__create_kernel_maps(),
2372 	 * so there is no need to add them twice.
2373 	 */
2374 	if (!(event->header.misc & PERF_RECORD_MISC_USER))
2375 		return 0;
2376 	return perf_event__process_mmap(tool, event, sample, machine);
2377 }
2378 
2379 static int build_id__process_mmap2(struct perf_tool *tool, union perf_event *event,
2380 				   struct perf_sample *sample, struct machine *machine)
2381 {
2382 	/*
2383 	 * We already have the kernel maps, put in place via perf_session__create_kernel_maps(),
2384 	 * so there is no need to add them twice.
2385 	 */
2386 	if (!(event->header.misc & PERF_RECORD_MISC_USER))
2387 		return 0;
2388 
2389 	return perf_event__process_mmap2(tool, event, sample, machine);
2390 }
2391 
2392 static int process_timestamp_boundary(struct perf_tool *tool,
2393 				      union perf_event *event __maybe_unused,
2394 				      struct perf_sample *sample,
2395 				      struct machine *machine __maybe_unused)
2396 {
2397 	struct record *rec = container_of(tool, struct record, tool);
2398 
2399 	set_timestamp_boundary(rec, sample->time);
2400 	return 0;
2401 }
2402 
2403 static int parse_record_synth_option(const struct option *opt,
2404 				     const char *str,
2405 				     int unset __maybe_unused)
2406 {
2407 	struct record_opts *opts = opt->value;
2408 	char *p = strdup(str);
2409 
2410 	if (p == NULL)
2411 		return -1;
2412 
2413 	opts->synth = parse_synth_opt(p);
2414 	free(p);
2415 
2416 	if (opts->synth < 0) {
2417 		pr_err("Invalid synth option: %s\n", str);
2418 		return -1;
2419 	}
2420 	return 0;
2421 }
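/*
 * parse_synth_opt() accepts the --synth values advertised in the option
 * table below ("no", "all", "task", "mmap", "cgroup") and returns a negative
 * value on bad input, which the callback above reports.
 */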
2422 
2423 /*
2424  * XXX Ideally this would be local to cmd_record() and passed to a record__new(),
2425  * because we need to have access to it in record__exit(), which is called
2426  * after cmd_record() exits, but since record_options needs to be accessible to
2427  * builtin-script, leave it here.
2428  *
2429  * At least we don't touch it in all the other functions here directly.
2430  *
2431  * Just say no to tons of global variables, sigh.
2432  */
2433 static struct record record = {
2434 	.opts = {
2435 		.sample_time	     = true,
2436 		.mmap_pages	     = UINT_MAX,
2437 		.user_freq	     = UINT_MAX,
2438 		.user_interval	     = ULLONG_MAX,
2439 		.freq		     = 4000,
2440 		.target		     = {
2441 			.uses_mmap   = true,
2442 			.default_per_cpu = true,
2443 		},
2444 		.mmap_flush          = MMAP_FLUSH_DEFAULT,
2445 		.nr_threads_synthesize = 1,
2446 		.ctl_fd              = -1,
2447 		.ctl_fd_ack          = -1,
2448 		.synth               = PERF_SYNTH_ALL,
2449 	},
2450 	.tool = {
2451 		.sample		= process_sample_event,
2452 		.fork		= perf_event__process_fork,
2453 		.exit		= perf_event__process_exit,
2454 		.comm		= perf_event__process_comm,
2455 		.namespaces	= perf_event__process_namespaces,
2456 		.mmap		= build_id__process_mmap,
2457 		.mmap2		= build_id__process_mmap2,
2458 		.itrace_start	= process_timestamp_boundary,
2459 		.aux		= process_timestamp_boundary,
2460 		.ordered_events	= true,
2461 	},
2462 };
2463 
2464 const char record_callchain_help[] = CALLCHAIN_RECORD_HELP
2465 	"\n\t\t\t\tDefault: fp";
2466 
2467 static bool dry_run;
2468 
2469 /*
2470  * XXX This will stay a global variable until we fix builtin-script.c to stop
2471  * messing with it and switch to using the library functions in perf_evlist that
2472  * came from builtin-record.c, i.e. use record_opts,
2473  * evlist__prepare_workload, etc. instead of fork+exec'ing 'perf record'
2474  * using pipes, etc.
2475  */
2476 static struct option __record_options[] = {
2477 	OPT_CALLBACK('e', "event", &record.evlist, "event",
2478 		     "event selector. use 'perf list' to list available events",
2479 		     parse_events_option),
2480 	OPT_CALLBACK(0, "filter", &record.evlist, "filter",
2481 		     "event filter", parse_filter),
2482 	OPT_CALLBACK_NOOPT(0, "exclude-perf", &record.evlist,
2483 			   NULL, "don't record events from perf itself",
2484 			   exclude_perf),
2485 	OPT_STRING('p', "pid", &record.opts.target.pid, "pid",
2486 		    "record events on existing process id"),
2487 	OPT_STRING('t', "tid", &record.opts.target.tid, "tid",
2488 		    "record events on existing thread id"),
2489 	OPT_INTEGER('r', "realtime", &record.realtime_prio,
2490 		    "collect data with this RT SCHED_FIFO priority"),
2491 	OPT_BOOLEAN(0, "no-buffering", &record.opts.no_buffering,
2492 		    "collect data without buffering"),
2493 	OPT_BOOLEAN('R', "raw-samples", &record.opts.raw_samples,
2494 		    "collect raw sample records from all opened counters"),
2495 	OPT_BOOLEAN('a', "all-cpus", &record.opts.target.system_wide,
2496 			    "system-wide collection from all CPUs"),
2497 	OPT_STRING('C', "cpu", &record.opts.target.cpu_list, "cpu",
2498 		    "list of cpus to monitor"),
2499 	OPT_U64('c', "count", &record.opts.user_interval, "event period to sample"),
2500 	OPT_STRING('o', "output", &record.data.path, "file",
2501 		    "output file name"),
2502 	OPT_BOOLEAN_SET('i', "no-inherit", &record.opts.no_inherit,
2503 			&record.opts.no_inherit_set,
2504 			"child tasks do not inherit counters"),
2505 	OPT_BOOLEAN(0, "tail-synthesize", &record.opts.tail_synthesize,
2506 		    "synthesize non-sample events at the end of output"),
2507 	OPT_BOOLEAN(0, "overwrite", &record.opts.overwrite, "use overwrite mode"),
2508 	OPT_BOOLEAN(0, "no-bpf-event", &record.opts.no_bpf_event, "do not record bpf events"),
2509 	OPT_BOOLEAN(0, "strict-freq", &record.opts.strict_freq,
2510 		    "Fail if the specified frequency can't be used"),
2511 	OPT_CALLBACK('F', "freq", &record.opts, "freq or 'max'",
2512 		     "profile at this frequency",
2513 		      record__parse_freq),
2514 	OPT_CALLBACK('m', "mmap-pages", &record.opts, "pages[,pages]",
2515 		     "number of mmap data pages and AUX area tracing mmap pages",
2516 		     record__parse_mmap_pages),
2517 	OPT_CALLBACK(0, "mmap-flush", &record.opts, "number",
2518 		     "Minimal number of bytes that is extracted from mmap data pages (default: 1)",
2519 		     record__mmap_flush_parse),
2520 	OPT_BOOLEAN(0, "group", &record.opts.group,
2521 		    "put the counters into a counter group"),
2522 	OPT_CALLBACK_NOOPT('g', NULL, &callchain_param,
2523 			   NULL, "enables call-graph recording" ,
2524 			   &record_callchain_opt),
2525 	OPT_CALLBACK(0, "call-graph", &record.opts,
2526 		     "record_mode[,record_size]", record_callchain_help,
2527 		     &record_parse_callchain_opt),
2528 	OPT_INCR('v', "verbose", &verbose,
2529 		    "be more verbose (show counter open errors, etc)"),
2530 	OPT_BOOLEAN('q', "quiet", &quiet, "don't print any message"),
2531 	OPT_BOOLEAN('s', "stat", &record.opts.inherit_stat,
2532 		    "per thread counts"),
2533 	OPT_BOOLEAN('d', "data", &record.opts.sample_address, "Record the sample addresses"),
2534 	OPT_BOOLEAN(0, "phys-data", &record.opts.sample_phys_addr,
2535 		    "Record the sample physical addresses"),
2536 	OPT_BOOLEAN(0, "data-page-size", &record.opts.sample_data_page_size,
2537 		    "Record the sampled data address data page size"),
2538 	OPT_BOOLEAN(0, "code-page-size", &record.opts.sample_code_page_size,
2539 		    "Record the sampled code address (ip) page size"),
2540 	OPT_BOOLEAN(0, "sample-cpu", &record.opts.sample_cpu, "Record the sample cpu"),
2541 	OPT_BOOLEAN_SET('T', "timestamp", &record.opts.sample_time,
2542 			&record.opts.sample_time_set,
2543 			"Record the sample timestamps"),
2544 	OPT_BOOLEAN_SET('P', "period", &record.opts.period, &record.opts.period_set,
2545 			"Record the sample period"),
2546 	OPT_BOOLEAN('n', "no-samples", &record.opts.no_samples,
2547 		    "don't sample"),
2548 	OPT_BOOLEAN_SET('N', "no-buildid-cache", &record.no_buildid_cache,
2549 			&record.no_buildid_cache_set,
2550 			"do not update the buildid cache"),
2551 	OPT_BOOLEAN_SET('B', "no-buildid", &record.no_buildid,
2552 			&record.no_buildid_set,
2553 			"do not collect buildids in perf.data"),
2554 	OPT_CALLBACK('G', "cgroup", &record.evlist, "name",
2555 		     "monitor event in cgroup name only",
2556 		     parse_cgroups),
2557 	OPT_INTEGER('D', "delay", &record.opts.initial_delay,
2558 		  "ms to wait before starting measurement after program start (-1: start with events disabled)"),
2559 	OPT_BOOLEAN(0, "kcore", &record.opts.kcore, "copy /proc/kcore"),
2560 	OPT_STRING('u', "uid", &record.opts.target.uid_str, "user",
2561 		   "user to profile"),
2562 
2563 	OPT_CALLBACK_NOOPT('b', "branch-any", &record.opts.branch_stack,
2564 		     "branch any", "sample any taken branches",
2565 		     parse_branch_stack),
2566 
2567 	OPT_CALLBACK('j', "branch-filter", &record.opts.branch_stack,
2568 		     "branch filter mask", "branch stack filter modes",
2569 		     parse_branch_stack),
2570 	OPT_BOOLEAN('W', "weight", &record.opts.sample_weight,
2571 		    "sample by weight (on special events only)"),
2572 	OPT_BOOLEAN(0, "transaction", &record.opts.sample_transaction,
2573 		    "sample transaction flags (special events only)"),
2574 	OPT_BOOLEAN(0, "per-thread", &record.opts.target.per_thread,
2575 		    "use per-thread mmaps"),
2576 	OPT_CALLBACK_OPTARG('I', "intr-regs", &record.opts.sample_intr_regs, NULL, "any register",
2577 		    "sample selected machine registers on interrupt,"
2578 		    " use '-I?' to list register names", parse_intr_regs),
2579 	OPT_CALLBACK_OPTARG(0, "user-regs", &record.opts.sample_user_regs, NULL, "any register",
2580 		    "sample selected machine registers on interrupt,"
2581 		    " use '--user-regs=?' to list register names", parse_user_regs),
2582 	OPT_BOOLEAN(0, "running-time", &record.opts.running_time,
2583 		    "Record running/enabled time of read (:S) events"),
2584 	OPT_CALLBACK('k', "clockid", &record.opts,
2585 		     "clockid", "clockid to use for events, see clock_gettime()",
2586 		     parse_clockid),
2587 	OPT_STRING_OPTARG('S', "snapshot", &record.opts.auxtrace_snapshot_opts,
2588 			  "opts", "AUX area tracing Snapshot Mode", ""),
2589 	OPT_STRING_OPTARG(0, "aux-sample", &record.opts.auxtrace_sample_opts,
2590 			  "opts", "sample AUX area", ""),
2591 	OPT_UINTEGER(0, "proc-map-timeout", &proc_map_timeout,
2592 			"per thread proc mmap processing timeout in ms"),
2593 	OPT_BOOLEAN(0, "namespaces", &record.opts.record_namespaces,
2594 		    "Record namespaces events"),
2595 	OPT_BOOLEAN(0, "all-cgroups", &record.opts.record_cgroup,
2596 		    "Record cgroup events"),
2597 	OPT_BOOLEAN_SET(0, "switch-events", &record.opts.record_switch_events,
2598 			&record.opts.record_switch_events_set,
2599 			"Record context switch events"),
2600 	OPT_BOOLEAN_FLAG(0, "all-kernel", &record.opts.all_kernel,
2601 			 "Configure all used events to run in kernel space.",
2602 			 PARSE_OPT_EXCLUSIVE),
2603 	OPT_BOOLEAN_FLAG(0, "all-user", &record.opts.all_user,
2604 			 "Configure all used events to run in user space.",
2605 			 PARSE_OPT_EXCLUSIVE),
2606 	OPT_BOOLEAN(0, "kernel-callchains", &record.opts.kernel_callchains,
2607 		    "collect kernel callchains"),
2608 	OPT_BOOLEAN(0, "user-callchains", &record.opts.user_callchains,
2609 		    "collect user callchains"),
2610 	OPT_STRING(0, "clang-path", &llvm_param.clang_path, "clang path",
2611 		   "clang binary to use for compiling BPF scriptlets"),
2612 	OPT_STRING(0, "clang-opt", &llvm_param.clang_opt, "clang options",
2613 		   "options passed to clang when compiling BPF scriptlets"),
2614 	OPT_STRING(0, "vmlinux", &symbol_conf.vmlinux_name,
2615 		   "file", "vmlinux pathname"),
2616 	OPT_BOOLEAN(0, "buildid-all", &record.buildid_all,
2617 		    "Record build-id of all DSOs regardless of hits"),
2618 	OPT_BOOLEAN(0, "buildid-mmap", &record.buildid_mmap,
2619 		    "Record build-id in map events"),
2620 	OPT_BOOLEAN(0, "timestamp-filename", &record.timestamp_filename,
2621 		    "append timestamp to output filename"),
2622 	OPT_BOOLEAN(0, "timestamp-boundary", &record.timestamp_boundary,
2623 		    "Record timestamp boundary (time of first/last samples)"),
2624 	OPT_STRING_OPTARG_SET(0, "switch-output", &record.switch_output.str,
2625 			  &record.switch_output.set, "signal or size[BKMG] or time[smhd]",
2626 			  "Switch output when receiving SIGUSR2 (signal) or cross a size or time threshold",
2627 			  "signal"),
2628 	OPT_CALLBACK_SET(0, "switch-output-event", &record.sb_evlist, &record.switch_output_event_set, "switch output event",
2629 			 "switch output event selector. use 'perf list' to list available events",
2630 			 parse_events_option_new_evlist),
2631 	OPT_INTEGER(0, "switch-max-files", &record.switch_output.num_files,
2632 		   "Limit number of switch output generated files"),
2633 	OPT_BOOLEAN(0, "dry-run", &dry_run,
2634 		    "Parse options then exit"),
2635 #ifdef HAVE_AIO_SUPPORT
2636 	OPT_CALLBACK_OPTARG(0, "aio", &record.opts,
2637 		     &nr_cblocks_default, "n", "Use <n> control blocks in asynchronous trace writing mode (default: 1, max: 4)",
2638 		     record__aio_parse),
2639 #endif
2640 	OPT_CALLBACK(0, "affinity", &record.opts, "node|cpu",
2641 		     "Set affinity mask of trace reading thread to NUMA node cpu mask or cpu of processed mmap buffer",
2642 		     record__parse_affinity),
2643 #ifdef HAVE_ZSTD_SUPPORT
2644 	OPT_CALLBACK_OPTARG('z', "compression-level", &record.opts, &comp_level_default,
2645 			    "n", "Compress records using the specified level (default: 1 - fastest compression, 22 - greatest compression)",
2646 			    record__parse_comp_level),
2647 #endif
2648 	OPT_CALLBACK(0, "max-size", &record.output_max_size,
2649 		     "size", "Limit the maximum size of the output file", parse_output_max_size),
2650 	OPT_UINTEGER(0, "num-thread-synthesize",
2651 		     &record.opts.nr_threads_synthesize,
2652 		     "number of threads to run for event synthesis"),
2653 #ifdef HAVE_LIBPFM
2654 	OPT_CALLBACK(0, "pfm-events", &record.evlist, "event",
2655 		"libpfm4 event selector. use 'perf list' to list available events",
2656 		parse_libpfm_events_option),
2657 #endif
2658 	OPT_CALLBACK(0, "control", &record.opts, "fd:ctl-fd[,ack-fd] or fifo:ctl-fifo[,ack-fifo]",
2659 		     "Listen on ctl-fd descriptor for command to control measurement ('enable': enable events, 'disable': disable events,\n"
2660 		     "\t\t\t  'snapshot': AUX area tracing snapshot).\n"
2661 		     "\t\t\t  Optionally send control command completion ('ack\\n') to ack-fd descriptor.\n"
2662 		     "\t\t\t  Alternatively, ctl-fifo / ack-fifo will be opened and used as ctl-fd / ack-fd.",
2663 		      parse_control_option),
2664 	OPT_CALLBACK(0, "synth", &record.opts, "no|all|task|mmap|cgroup",
2665 		     "Fine-tune event synthesis: default=all", parse_record_synth_option),
2666 	OPT_END()
2667 };
2668 
2669 struct option *record_options = __record_options;
2670 
2671 int cmd_record(int argc, const char **argv)
2672 {
2673 	int err;
2674 	struct record *rec = &record;
2675 	char errbuf[BUFSIZ];
2676 
2677 	setlocale(LC_ALL, "");
2678 
2679 #ifndef HAVE_LIBBPF_SUPPORT
2680 # define set_nobuild(s, l, c) set_option_nobuild(record_options, s, l, "NO_LIBBPF=1", c)
2681 	set_nobuild('\0', "clang-path", true);
2682 	set_nobuild('\0', "clang-opt", true);
2683 # undef set_nobuild
2684 #endif
2685 
2686 #ifndef HAVE_BPF_PROLOGUE
2687 # if !defined (HAVE_DWARF_SUPPORT)
2688 #  define REASON  "NO_DWARF=1"
2689 # elif !defined (HAVE_LIBBPF_SUPPORT)
2690 #  define REASON  "NO_LIBBPF=1"
2691 # else
2692 #  define REASON  "this architecture doesn't support BPF prologue"
2693 # endif
2694 # define set_nobuild(s, l, c) set_option_nobuild(record_options, s, l, REASON, c)
2695 	set_nobuild('\0', "vmlinux", true);
2696 # undef set_nobuild
2697 # undef REASON
2698 #endif
2699 
2700 	rec->opts.affinity = PERF_AFFINITY_SYS;
2701 
2702 	rec->evlist = evlist__new();
2703 	if (rec->evlist == NULL)
2704 		return -ENOMEM;
2705 
2706 	err = perf_config(perf_record_config, rec);
2707 	if (err)
2708 		return err;
2709 
2710 	argc = parse_options(argc, argv, record_options, record_usage,
2711 			    PARSE_OPT_STOP_AT_NON_OPTION);
2712 	if (quiet)
2713 		perf_quiet_option();
2714 
2715 	err = symbol__validate_sym_arguments();
2716 	if (err)
2717 		return err;
2718 
2719 	/* Make system wide (-a) the default target. */
2720 	if (!argc && target__none(&rec->opts.target))
2721 		rec->opts.target.system_wide = true;
2722 
2723 	if (nr_cgroups && !rec->opts.target.system_wide) {
2724 		usage_with_options_msg(record_usage, record_options,
2725 			"cgroup monitoring only available in system-wide mode");
2726 
2727 	}
2728 
2729 	if (rec->buildid_mmap) {
2730 		if (!perf_can_record_build_id()) {
2731 			pr_err("Failed: no support for recording build id in mmap events, update your kernel.\n");
2732 			err = -EINVAL;
2733 			goto out_opts;
2734 		}
2735 		pr_debug("Enabling build id in mmap2 events.\n");
2736 		/* Enable mmap build id synthesizing. */
2737 		symbol_conf.buildid_mmap2 = true;
2738 		/* Enable perf_event_attr::build_id bit. */
2739 		rec->opts.build_id = true;
2740 		/* Disable build id cache. */
2741 		rec->no_buildid = true;
2742 	}
2743 
2744 	if (rec->opts.record_cgroup && !perf_can_record_cgroup()) {
2745 		pr_err("Kernel has no cgroup sampling support.\n");
2746 		err = -EINVAL;
2747 		goto out_opts;
2748 	}
2749 
2750 	if (rec->opts.kcore)
2751 		rec->data.is_dir = true;
2752 
2753 	if (rec->opts.comp_level != 0) {
2754 		pr_debug("Compression enabled, disabling build id collection at the end of the session.\n");
2755 		rec->no_buildid = true;
2756 	}
2757 
2758 	if (rec->opts.record_switch_events &&
2759 	    !perf_can_record_switch_events()) {
2760 		ui__error("kernel does not support recording context switch events\n");
2761 		parse_options_usage(record_usage, record_options, "switch-events", 0);
2762 		err = -EINVAL;
2763 		goto out_opts;
2764 	}
2765 
2766 	if (switch_output_setup(rec)) {
2767 		parse_options_usage(record_usage, record_options, "switch-output", 0);
2768 		err = -EINVAL;
2769 		goto out_opts;
2770 	}
2771 
2772 	if (rec->switch_output.time) {
2773 		signal(SIGALRM, alarm_sig_handler);
2774 		alarm(rec->switch_output.time);
2775 	}
2776 
2777 	if (rec->switch_output.num_files) {
2778 		rec->switch_output.filenames = calloc(rec->switch_output.num_files,
2779 						      sizeof(char *));
2780 		if (!rec->switch_output.filenames) {
2781 			err = -EINVAL;
2782 			goto out_opts;
2783 		}
2784 	}
2785 
2786 	/*
2787 	 * Allow aliases to facilitate the lookup of symbols for address
2788 	 * filters. Refer to auxtrace_parse_filters().
2789 	 */
2790 	symbol_conf.allow_aliases = true;
2791 
2792 	symbol__init(NULL);
2793 
2794 	if (rec->opts.affinity != PERF_AFFINITY_SYS) {
2795 		rec->affinity_mask.nbits = cpu__max_cpu();
2796 		rec->affinity_mask.bits = bitmap_zalloc(rec->affinity_mask.nbits);
2797 		if (!rec->affinity_mask.bits) {
2798 			pr_err("Failed to allocate thread mask for %zd cpus\n", rec->affinity_mask.nbits);
2799 			err = -ENOMEM;
2800 			goto out_opts;
2801 		}
2802 		pr_debug2("thread mask[%zd]: empty\n", rec->affinity_mask.nbits);
2803 	}
2804 
2805 	err = record__auxtrace_init(rec);
2806 	if (err)
2807 		goto out;
2808 
2809 	if (dry_run)
2810 		goto out;
2811 
2812 	err = bpf__setup_stdout(rec->evlist);
2813 	if (err) {
2814 		bpf__strerror_setup_stdout(rec->evlist, err, errbuf, sizeof(errbuf));
2815 		pr_err("ERROR: Setup BPF stdout failed: %s\n",
2816 			 errbuf);
2817 		goto out;
2818 	}
2819 
2820 	err = -ENOMEM;
2821 
2822 	if (rec->no_buildid_cache || rec->no_buildid) {
2823 		disable_buildid_cache();
2824 	} else if (rec->switch_output.enabled) {
2825 		/*
2826 		 * In 'perf record --switch-output', disable buildid
2827 		 * generation by default to reduce data file switching
2828 		 * overhead. Still generate buildids if they are explicitly
2829 		 * requested using
2830 		 *
2831 		 *  perf record --switch-output --no-no-buildid \
2832 		 *              --no-no-buildid-cache
2833 		 *
2834 		 * The following code is equivalent to:
2835 		 *
2836 		 * if ((rec->no_buildid || !rec->no_buildid_set) &&
2837 		 *     (rec->no_buildid_cache || !rec->no_buildid_cache_set))
2838 		 *         disable_buildid_cache();
2839 		 */
2840 		bool disable = true;
2841 
2842 		if (rec->no_buildid_set && !rec->no_buildid)
2843 			disable = false;
2844 		if (rec->no_buildid_cache_set && !rec->no_buildid_cache)
2845 			disable = false;
2846 		if (disable) {
2847 			rec->no_buildid = true;
2848 			rec->no_buildid_cache = true;
2849 			disable_buildid_cache();
2850 		}
2851 	}
2852 
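	/*
	 * In overwrite (flight-recorder) mode the samples describe the state
	 * near the end of the run, so also synthesize the non-sample events
	 * at the end of the output (--tail-synthesize) rather than only at
	 * startup.
	 */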
2853 	if (record.opts.overwrite)
2854 		record.opts.tail_synthesize = true;
2855 
2856 	if (rec->evlist->core.nr_entries == 0) {
2857 		if (perf_pmu__has_hybrid()) {
2858 			err = evlist__add_default_hybrid(rec->evlist,
2859 							 !record.opts.no_samples);
2860 		} else {
2861 			err = __evlist__add_default(rec->evlist,
2862 						    !record.opts.no_samples);
2863 		}
2864 
2865 		if (err < 0) {
2866 			pr_err("Not enough memory for event selector list\n");
2867 			goto out;
2868 		}
2869 	}
2870 
2871 	if (rec->opts.target.tid && !rec->opts.no_inherit_set)
2872 		rec->opts.no_inherit = true;
2873 
2874 	err = target__validate(&rec->opts.target);
2875 	if (err) {
2876 		target__strerror(&rec->opts.target, err, errbuf, BUFSIZ);
2877 		ui__warning("%s\n", errbuf);
2878 	}
2879 
2880 	err = target__parse_uid(&rec->opts.target);
2881 	if (err) {
2882 		int saved_errno = errno;
2883 
2884 		target__strerror(&rec->opts.target, err, errbuf, BUFSIZ);
2885 		ui__error("%s", errbuf);
2886 
2887 		err = -saved_errno;
2888 		goto out;
2889 	}
2890 
2891 	/* Enable ignoring missing threads when -u/-p option is defined. */
2892 	rec->opts.ignore_missing_thread = rec->opts.target.uid != UINT_MAX || rec->opts.target.pid;
2893 
2894 	if (evlist__fix_hybrid_cpus(rec->evlist, rec->opts.target.cpu_list)) {
2895 		pr_err("failed to use cpu list %s\n",
2896 		       rec->opts.target.cpu_list);
2897 		goto out;
2898 	}
2899 
2900 	rec->opts.target.hybrid = perf_pmu__has_hybrid();
2901 	err = -ENOMEM;
2902 	if (evlist__create_maps(rec->evlist, &rec->opts.target) < 0)
2903 		usage_with_options(record_usage, record_options);
2904 
2905 	err = auxtrace_record__options(rec->itr, rec->evlist, &rec->opts);
2906 	if (err)
2907 		goto out;
2908 
2909 	/*
2910 	 * We take all buildids when the file contains
2911 	 * AUX area tracing data, because we do not decode the
2912 	 * trace, as that would take too long.
2913 	 */
2914 	if (rec->opts.full_auxtrace)
2915 		rec->buildid_all = true;
2916 
2917 	if (rec->opts.text_poke) {
2918 		err = record__config_text_poke(rec->evlist);
2919 		if (err) {
2920 			pr_err("record__config_text_poke failed, error %d\n", err);
2921 			goto out;
2922 		}
2923 	}
2924 
2925 	if (record_opts__config(&rec->opts)) {
2926 		err = -EINVAL;
2927 		goto out;
2928 	}
2929 
2930 	if (rec->opts.nr_cblocks > nr_cblocks_max)
2931 		rec->opts.nr_cblocks = nr_cblocks_max;
2932 	pr_debug("nr_cblocks: %d\n", rec->opts.nr_cblocks);
2933 
2934 	pr_debug("affinity: %s\n", affinity_tags[rec->opts.affinity]);
2935 	pr_debug("mmap flush: %d\n", rec->opts.mmap_flush);
2936 
2937 	if (rec->opts.comp_level > comp_level_max)
2938 		rec->opts.comp_level = comp_level_max;
2939 	pr_debug("comp level: %d\n", rec->opts.comp_level);
2940 
2941 	err = __cmd_record(&record, argc, argv);
2942 out:
2943 	bitmap_free(rec->affinity_mask.bits);
2944 	evlist__delete(rec->evlist);
2945 	symbol__exit();
2946 	auxtrace_record__free(rec->itr);
2947 out_opts:
2948 	evlist__close_control(rec->opts.ctl_fd, rec->opts.ctl_fd_ack, &rec->opts.ctl_fd_close);
2949 	return err;
2950 }
2951 
2952 static void snapshot_sig_handler(int sig __maybe_unused)
2953 {
2954 	struct record *rec = &record;
2955 
2956 	hit_auxtrace_snapshot_trigger(rec);
2957 
2958 	if (switch_output_signal(rec))
2959 		trigger_hit(&switch_output_trigger);
2960 }
2961 
2962 static void alarm_sig_handler(int sig __maybe_unused)
2963 {
2964 	struct record *rec = &record;
2965 
2966 	if (switch_output_time(rec))
2967 		trigger_hit(&switch_output_trigger);
2968 }
2969