xref: /openbmc/linux/tools/perf/builtin-record.c (revision 64288aa9)
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * builtin-record.c
4  *
5  * Builtin record command: Record the profile of a workload
6  * (or a CPU, or a PID) into the perf.data output file - for
7  * later analysis via perf report.
8  */
9 #include "builtin.h"
10 
11 #include "util/build-id.h"
12 #include <subcmd/parse-options.h>
13 #include "util/parse-events.h"
14 #include "util/config.h"
15 
16 #include "util/callchain.h"
17 #include "util/cgroup.h"
18 #include "util/header.h"
19 #include "util/event.h"
20 #include "util/evlist.h"
21 #include "util/evsel.h"
22 #include "util/debug.h"
23 #include "util/mmap.h"
24 #include "util/target.h"
25 #include "util/session.h"
26 #include "util/tool.h"
27 #include "util/symbol.h"
28 #include "util/record.h"
29 #include "util/cpumap.h"
30 #include "util/thread_map.h"
31 #include "util/data.h"
32 #include "util/perf_regs.h"
33 #include "util/auxtrace.h"
34 #include "util/tsc.h"
35 #include "util/parse-branch-options.h"
36 #include "util/parse-regs-options.h"
37 #include "util/perf_api_probe.h"
38 #include "util/llvm-utils.h"
39 #include "util/bpf-loader.h"
40 #include "util/trigger.h"
41 #include "util/perf-hooks.h"
42 #include "util/cpu-set-sched.h"
43 #include "util/synthetic-events.h"
44 #include "util/time-utils.h"
45 #include "util/units.h"
46 #include "util/bpf-event.h"
47 #include "util/util.h"
48 #include "util/pfm.h"
49 #include "util/clockid.h"
50 #include "util/pmu-hybrid.h"
51 #include "util/evlist-hybrid.h"
52 #include "asm/bug.h"
53 #include "perf.h"
54 
55 #include <errno.h>
56 #include <inttypes.h>
57 #include <locale.h>
58 #include <poll.h>
59 #include <pthread.h>
60 #include <unistd.h>
61 #include <sched.h>
62 #include <signal.h>
63 #ifdef HAVE_EVENTFD_SUPPORT
64 #include <sys/eventfd.h>
65 #endif
66 #include <sys/mman.h>
67 #include <sys/wait.h>
68 #include <sys/types.h>
69 #include <sys/stat.h>
70 #include <fcntl.h>
71 #include <linux/err.h>
72 #include <linux/string.h>
73 #include <linux/time64.h>
74 #include <linux/zalloc.h>
75 #include <linux/bitmap.h>
76 #include <sys/time.h>
77 
78 struct switch_output {
79 	bool		 enabled;
80 	bool		 signal;
81 	unsigned long	 size;
82 	unsigned long	 time;
83 	const char	*str;
84 	bool		 set;
85 	char		 **filenames;
86 	int		 num_files;
87 	int		 cur_file;
88 };
89 
90 struct record {
91 	struct perf_tool	tool;
92 	struct record_opts	opts;
93 	u64			bytes_written;
94 	struct perf_data	data;
95 	struct auxtrace_record	*itr;
96 	struct evlist	*evlist;
97 	struct perf_session	*session;
98 	struct evlist		*sb_evlist;
99 	pthread_t		thread_id;
100 	int			realtime_prio;
101 	bool			switch_output_event_set;
102 	bool			no_buildid;
103 	bool			no_buildid_set;
104 	bool			no_buildid_cache;
105 	bool			no_buildid_cache_set;
106 	bool			buildid_all;
107 	bool			buildid_mmap;
108 	bool			timestamp_filename;
109 	bool			timestamp_boundary;
110 	struct switch_output	switch_output;
111 	unsigned long long	samples;
112 	struct mmap_cpu_mask	affinity_mask;
113 	unsigned long		output_max_size;	/* = 0: unlimited */
114 	struct perf_debuginfod	debuginfod;
115 };
116 
117 static volatile int done;
118 
119 static volatile int auxtrace_record__snapshot_started;
120 static DEFINE_TRIGGER(auxtrace_snapshot_trigger);
121 static DEFINE_TRIGGER(switch_output_trigger);
122 
123 static const char *affinity_tags[PERF_AFFINITY_MAX] = {
124 	"SYS", "NODE", "CPU"
125 };
126 
127 static bool switch_output_signal(struct record *rec)
128 {
129 	return rec->switch_output.signal &&
130 	       trigger_is_ready(&switch_output_trigger);
131 }
132 
133 static bool switch_output_size(struct record *rec)
134 {
135 	return rec->switch_output.size &&
136 	       trigger_is_ready(&switch_output_trigger) &&
137 	       (rec->bytes_written >= rec->switch_output.size);
138 }
139 
140 static bool switch_output_time(struct record *rec)
141 {
142 	return rec->switch_output.time &&
143 	       trigger_is_ready(&switch_output_trigger);
144 }
145 
146 static bool record__output_max_size_exceeded(struct record *rec)
147 {
148 	return rec->output_max_size &&
149 	       (rec->bytes_written >= rec->output_max_size);
150 }
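
/*
 * Typical command-line forms that drive the limits checked above (flag
 * names per the perf-record documentation; the sizes are illustrative):
 *
 *   perf record --switch-output=signal ...   # rotate output on SIGUSR2
 *   perf record --switch-output=100M ...     # rotate after ~100M written
 *   perf record --switch-output=30s ...      # rotate every 30 seconds
 *   perf record --max-size=1G ...            # stop once 1G has been written
 */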
151 
152 static int record__write(struct record *rec, struct mmap *map __maybe_unused,
153 			 void *bf, size_t size)
154 {
155 	struct perf_data_file *file = &rec->session->data->file;
156 
157 	if (perf_data_file__write(file, bf, size) < 0) {
158 		pr_err("failed to write perf data, error: %m\n");
159 		return -1;
160 	}
161 
162 	rec->bytes_written += size;
163 
164 	if (record__output_max_size_exceeded(rec) && !done) {
165 		fprintf(stderr, "[ perf record: perf size limit reached (%" PRIu64 " KB),"
166 				" stopping session ]\n",
167 				rec->bytes_written >> 10);
168 		done = 1;
169 	}
170 
171 	if (switch_output_size(rec))
172 		trigger_hit(&switch_output_trigger);
173 
174 	return 0;
175 }
176 
177 static int record__aio_enabled(struct record *rec);
178 static int record__comp_enabled(struct record *rec);
179 static size_t zstd_compress(struct perf_session *session, void *dst, size_t dst_size,
180 			    void *src, size_t src_size);
181 
182 #ifdef HAVE_AIO_SUPPORT
183 static int record__aio_write(struct aiocb *cblock, int trace_fd,
184 		void *buf, size_t size, off_t off)
185 {
186 	int rc;
187 
188 	cblock->aio_fildes = trace_fd;
189 	cblock->aio_buf    = buf;
190 	cblock->aio_nbytes = size;
191 	cblock->aio_offset = off;
192 	cblock->aio_sigevent.sigev_notify = SIGEV_NONE;
193 
194 	do {
195 		rc = aio_write(cblock);
196 		if (rc == 0) {
197 			break;
198 		} else if (errno != EAGAIN) {
199 			cblock->aio_fildes = -1;
200 			pr_err("failed to queue perf data, error: %m\n");
201 			break;
202 		}
203 	} while (1);
204 
205 	return rc;
206 }
207 
208 static int record__aio_complete(struct mmap *md, struct aiocb *cblock)
209 {
210 	void *rem_buf;
211 	off_t rem_off;
212 	size_t rem_size;
213 	int rc, aio_errno;
214 	ssize_t aio_ret, written;
215 
216 	aio_errno = aio_error(cblock);
217 	if (aio_errno == EINPROGRESS)
218 		return 0;
219 
220 	written = aio_ret = aio_return(cblock);
221 	if (aio_ret < 0) {
222 		if (aio_errno != EINTR)
223 			pr_err("failed to write perf data, error: %m\n");
224 		written = 0;
225 	}
226 
227 	rem_size = cblock->aio_nbytes - written;
228 
229 	if (rem_size == 0) {
230 		cblock->aio_fildes = -1;
231 		/*
232 		 * md->refcount is incremented in record__aio_pushfn() for
233 		 * every aio write request started in record__aio_push(), so
234 		 * decrement it now that the request is complete.
235 		 */
236 		perf_mmap__put(&md->core);
237 		rc = 1;
238 	} else {
239 		/*
240 		 * aio write request may require restart with the
241 		 * The aio write request may require a restart with the
242 		 * remainder if the kernel didn't write the whole
243 		 * chunk at once.
244 		rem_off = cblock->aio_offset + written;
245 		rem_buf = (void *)(cblock->aio_buf + written);
246 		record__aio_write(cblock, cblock->aio_fildes,
247 				rem_buf, rem_size, rem_off);
248 		rc = 0;
249 	}
250 
251 	return rc;
252 }
253 
254 static int record__aio_sync(struct mmap *md, bool sync_all)
255 {
256 	struct aiocb **aiocb = md->aio.aiocb;
257 	struct aiocb *cblocks = md->aio.cblocks;
258 	struct timespec timeout = { 0, 1000 * 1000  * 1 }; /* 1ms */
259 	int i, do_suspend;
260 
261 	do {
262 		do_suspend = 0;
263 		for (i = 0; i < md->aio.nr_cblocks; ++i) {
264 			if (cblocks[i].aio_fildes == -1 || record__aio_complete(md, &cblocks[i])) {
265 				if (sync_all)
266 					aiocb[i] = NULL;
267 				else
268 					return i;
269 			} else {
270 				/*
271 				 * A started aio write is not complete yet,
272 				 * so it has to be waited on before the
273 				 * next allocation.
274 				 */
275 				aiocb[i] = &cblocks[i];
276 				do_suspend = 1;
277 			}
278 		}
279 		if (!do_suspend)
280 			return -1;
281 
282 		while (aio_suspend((const struct aiocb **)aiocb, md->aio.nr_cblocks, &timeout)) {
283 			if (!(errno == EAGAIN || errno == EINTR))
284 				pr_err("failed to sync perf data, error: %m\n");
285 		}
286 	} while (1);
287 }
288 
289 struct record_aio {
290 	struct record	*rec;
291 	void		*data;
292 	size_t		size;
293 };
294 
295 static int record__aio_pushfn(struct mmap *map, void *to, void *buf, size_t size)
296 {
297 	struct record_aio *aio = to;
298 
299 	/*
300 	 * The map->core.base data pointed to by buf is copied into a free map->aio.data[]
301 	 * buffer to release space in the kernel buffer as fast as possible, by calling
302 	 * perf_mmap__consume() from the perf_mmap__push() function.
303 	 *
304 	 * That lets the kernel proceed with storing more profiling data into
305 	 * the kernel buffer earlier than other per-cpu kernel buffers are handled.
306 	 *
307 	 * Copying can be done in two steps in case the chunk of profiling data
308 	 * crosses the upper bound of the kernel buffer. In this case we first move
309 	 * the part of the data from map->start till the upper bound and then the remainder
310 	 * from the beginning of the kernel buffer till the end of the data chunk.
311 	 */
312 
313 	if (record__comp_enabled(aio->rec)) {
314 		size = zstd_compress(aio->rec->session, aio->data + aio->size,
315 				     mmap__mmap_len(map) - aio->size,
316 				     buf, size);
317 	} else {
318 		memcpy(aio->data + aio->size, buf, size);
319 	}
320 
321 	if (!aio->size) {
322 		/*
323 		 * Increment map->refcount to guard the map->aio.data[] buffer
324 		 * from premature deallocation because the map object can be
325 		 * released before the aio write request started on the
326 		 * map->aio.data[] buffer completes.
327 		 *
328 		 * perf_mmap__put() is done at record__aio_complete()
329 		 * after started aio request completion or at record__aio_push()
330 		 * if the request failed to start.
331 		 */
332 		perf_mmap__get(&map->core);
333 	}
334 
335 	aio->size += size;
336 
337 	return size;
338 }
339 
340 static int record__aio_push(struct record *rec, struct mmap *map, off_t *off)
341 {
342 	int ret, idx;
343 	int trace_fd = rec->session->data->file.fd;
344 	struct record_aio aio = { .rec = rec, .size = 0 };
345 
346 	/*
347 	 * Call record__aio_sync() to wait till the map->aio.data[] buffer
348 	 * becomes available after the previous aio write operation completes.
349 	 */
350 
351 	idx = record__aio_sync(map, false);
352 	aio.data = map->aio.data[idx];
353 	ret = perf_mmap__push(map, &aio, record__aio_pushfn);
354 	if (ret != 0) /* ret > 0 - no data, ret < 0 - error */
355 		return ret;
356 
357 	rec->samples++;
358 	ret = record__aio_write(&(map->aio.cblocks[idx]), trace_fd, aio.data, aio.size, *off);
359 	if (!ret) {
360 		*off += aio.size;
361 		rec->bytes_written += aio.size;
362 		if (switch_output_size(rec))
363 			trigger_hit(&switch_output_trigger);
364 	} else {
365 		/*
366 		 * Decrement the map->refcount incremented in record__aio_pushfn()
367 		 * if the record__aio_write() operation failed to start; otherwise
368 		 * map->refcount is decremented in record__aio_complete() after
369 		 * the aio write operation finishes successfully.
370 		 */
371 		perf_mmap__put(&map->core);
372 	}
373 
374 	return ret;
375 }
376 
377 static off_t record__aio_get_pos(int trace_fd)
378 {
379 	return lseek(trace_fd, 0, SEEK_CUR);
380 }
381 
382 static void record__aio_set_pos(int trace_fd, off_t pos)
383 {
384 	lseek(trace_fd, pos, SEEK_SET);
385 }
386 
387 static void record__aio_mmap_read_sync(struct record *rec)
388 {
389 	int i;
390 	struct evlist *evlist = rec->evlist;
391 	struct mmap *maps = evlist->mmap;
392 
393 	if (!record__aio_enabled(rec))
394 		return;
395 
396 	for (i = 0; i < evlist->core.nr_mmaps; i++) {
397 		struct mmap *map = &maps[i];
398 
399 		if (map->core.base)
400 			record__aio_sync(map, true);
401 	}
402 }
403 
404 static int nr_cblocks_default = 1;
405 static int nr_cblocks_max = 4;
406 
407 static int record__aio_parse(const struct option *opt,
408 			     const char *str,
409 			     int unset)
410 {
411 	struct record_opts *opts = (struct record_opts *)opt->value;
412 
413 	if (unset) {
414 		opts->nr_cblocks = 0;
415 	} else {
416 		if (str)
417 			opts->nr_cblocks = strtol(str, NULL, 0);
418 		if (!opts->nr_cblocks)
419 			opts->nr_cblocks = nr_cblocks_default;
420 	}
421 
422 	return 0;
423 }
424 #else /* HAVE_AIO_SUPPORT */
425 static int nr_cblocks_max = 0;
426 
427 static int record__aio_push(struct record *rec __maybe_unused, struct mmap *map __maybe_unused,
428 			    off_t *off __maybe_unused)
429 {
430 	return -1;
431 }
432 
433 static off_t record__aio_get_pos(int trace_fd __maybe_unused)
434 {
435 	return -1;
436 }
437 
438 static void record__aio_set_pos(int trace_fd __maybe_unused, off_t pos __maybe_unused)
439 {
440 }
441 
442 static void record__aio_mmap_read_sync(struct record *rec __maybe_unused)
443 {
444 }
445 #endif
446 
447 static int record__aio_enabled(struct record *rec)
448 {
449 	return rec->opts.nr_cblocks > 0;
450 }
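
/*
 * nr_cblocks is driven by the --aio option (POSIX AIO trace writing),
 * bounded by nr_cblocks_max above. Illustrative usage:
 *
 *   perf record --aio ...     # default of 1 control block per mmap
 *   perf record --aio=4 ...   # allow up to 4 in-flight aio writes per mmap
 */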
451 
452 #define MMAP_FLUSH_DEFAULT 1
453 static int record__mmap_flush_parse(const struct option *opt,
454 				    const char *str,
455 				    int unset)
456 {
457 	int flush_max;
458 	struct record_opts *opts = (struct record_opts *)opt->value;
459 	static struct parse_tag tags[] = {
460 			{ .tag  = 'B', .mult = 1       },
461 			{ .tag  = 'K', .mult = 1 << 10 },
462 			{ .tag  = 'M', .mult = 1 << 20 },
463 			{ .tag  = 'G', .mult = 1 << 30 },
464 			{ .tag  = 0 },
465 	};
466 
467 	if (unset)
468 		return 0;
469 
470 	if (str) {
471 		opts->mmap_flush = parse_tag_value(str, tags);
472 		if (opts->mmap_flush == (int)-1)
473 			opts->mmap_flush = strtol(str, NULL, 0);
474 	}
475 
476 	if (!opts->mmap_flush)
477 		opts->mmap_flush = MMAP_FLUSH_DEFAULT;
478 
479 	flush_max = evlist__mmap_size(opts->mmap_pages);
480 	flush_max /= 4;
481 	if (opts->mmap_flush > flush_max)
482 		opts->mmap_flush = flush_max;
483 
484 	return 0;
485 }
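
/*
 * Illustrative usage of the --mmap-flush threshold parsed above (plain
 * bytes or B/K/M/G suffixes, capped to 1/4 of the mmap data pages size):
 *
 *   perf record --mmap-flush=48 ...    # flush once 48 bytes are pending
 *   perf record --mmap-flush=16M ...   # batch up to 16M before flushing
 */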
486 
487 #ifdef HAVE_ZSTD_SUPPORT
488 static unsigned int comp_level_default = 1;
489 
490 static int record__parse_comp_level(const struct option *opt, const char *str, int unset)
491 {
492 	struct record_opts *opts = opt->value;
493 
494 	if (unset) {
495 		opts->comp_level = 0;
496 	} else {
497 		if (str)
498 			opts->comp_level = strtol(str, NULL, 0);
499 		if (!opts->comp_level)
500 			opts->comp_level = comp_level_default;
501 	}
502 
503 	return 0;
504 }
505 #endif
506 static unsigned int comp_level_max = 22;
507 
508 static int record__comp_enabled(struct record *rec)
509 {
510 	return rec->opts.comp_level > 0;
511 }
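
/*
 * Illustrative usage: the Zstd level comes from -z/--compression-level
 * (comp_level_default when no value is given, comp_level_max == 22):
 *
 *   perf record -z ...                      # compress with the default level
 *   perf record --compression-level=3 ...   # trade CPU for a smaller file
 */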
512 
513 static int process_synthesized_event(struct perf_tool *tool,
514 				     union perf_event *event,
515 				     struct perf_sample *sample __maybe_unused,
516 				     struct machine *machine __maybe_unused)
517 {
518 	struct record *rec = container_of(tool, struct record, tool);
519 	return record__write(rec, NULL, event, event->header.size);
520 }
521 
522 static int process_locked_synthesized_event(struct perf_tool *tool,
523 				     union perf_event *event,
524 				     struct perf_sample *sample __maybe_unused,
525 				     struct machine *machine __maybe_unused)
526 {
527 	static pthread_mutex_t synth_lock = PTHREAD_MUTEX_INITIALIZER;
528 	int ret;
529 
530 	pthread_mutex_lock(&synth_lock);
531 	ret = process_synthesized_event(tool, event, sample, machine);
532 	pthread_mutex_unlock(&synth_lock);
533 	return ret;
534 }
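
/*
 * The locked variant is only needed when synthesis runs multi-threaded,
 * which record__synthesize() selects for --num-thread-synthesize > 1, e.g.:
 *
 *   perf record --num-thread-synthesize 4 -p <existing-pid> ...
 */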
535 
536 static int record__pushfn(struct mmap *map, void *to, void *bf, size_t size)
537 {
538 	struct record *rec = to;
539 
540 	if (record__comp_enabled(rec)) {
541 		size = zstd_compress(rec->session, map->data, mmap__mmap_len(map), bf, size);
542 		bf   = map->data;
543 	}
544 
545 	rec->samples++;
546 	return record__write(rec, map, bf, size);
547 }
548 
549 static volatile int signr = -1;
550 static volatile int child_finished;
551 #ifdef HAVE_EVENTFD_SUPPORT
552 static int done_fd = -1;
553 #endif
554 
555 static void sig_handler(int sig)
556 {
557 	if (sig == SIGCHLD)
558 		child_finished = 1;
559 	else
560 		signr = sig;
561 
562 	done = 1;
563 #ifdef HAVE_EVENTFD_SUPPORT
564 {
565 	u64 tmp = 1;
566 	/*
567 	 * It is possible for this signal handler to run after done is checked
568 	 * in the main loop, but before the perf counter fds are polled. If this
569 	 * happens, the poll() will continue to wait even though done is set,
570 	 * and will only break out if either another signal is received, or the
571 	 * counters are ready for read. To ensure the poll() doesn't sleep when
572 	 * done is set, use an eventfd (done_fd) to wake up the poll().
573 	 */
574 	if (write(done_fd, &tmp, sizeof(tmp)) < 0)
575 		pr_err("failed to signal wakeup fd, error: %m\n");
576 }
577 #endif // HAVE_EVENTFD_SUPPORT
578 }
579 
580 static void sigsegv_handler(int sig)
581 {
582 	perf_hooks__recover();
583 	sighandler_dump_stack(sig);
584 }
585 
586 static void record__sig_exit(void)
587 {
588 	if (signr == -1)
589 		return;
590 
591 	signal(signr, SIG_DFL);
592 	raise(signr);
593 }
594 
595 #ifdef HAVE_AUXTRACE_SUPPORT
596 
597 static int record__process_auxtrace(struct perf_tool *tool,
598 				    struct mmap *map,
599 				    union perf_event *event, void *data1,
600 				    size_t len1, void *data2, size_t len2)
601 {
602 	struct record *rec = container_of(tool, struct record, tool);
603 	struct perf_data *data = &rec->data;
604 	size_t padding;
605 	u8 pad[8] = {0};
606 
607 	if (!perf_data__is_pipe(data) && perf_data__is_single_file(data)) {
608 		off_t file_offset;
609 		int fd = perf_data__fd(data);
610 		int err;
611 
612 		file_offset = lseek(fd, 0, SEEK_CUR);
613 		if (file_offset == -1)
614 			return -1;
615 		err = auxtrace_index__auxtrace_event(&rec->session->auxtrace_index,
616 						     event, file_offset);
617 		if (err)
618 			return err;
619 	}
620 
621 	/* event.auxtrace.size includes padding, see __auxtrace_mmap__read() */
622 	padding = (len1 + len2) & 7;
623 	if (padding)
624 		padding = 8 - padding;
625 
626 	record__write(rec, map, event, event->header.size);
627 	record__write(rec, map, data1, len1);
628 	if (len2)
629 		record__write(rec, map, data2, len2);
630 	record__write(rec, map, &pad, padding);
631 
632 	return 0;
633 }
634 
635 static int record__auxtrace_mmap_read(struct record *rec,
636 				      struct mmap *map)
637 {
638 	int ret;
639 
640 	ret = auxtrace_mmap__read(map, rec->itr, &rec->tool,
641 				  record__process_auxtrace);
642 	if (ret < 0)
643 		return ret;
644 
645 	if (ret)
646 		rec->samples++;
647 
648 	return 0;
649 }
650 
651 static int record__auxtrace_mmap_read_snapshot(struct record *rec,
652 					       struct mmap *map)
653 {
654 	int ret;
655 
656 	ret = auxtrace_mmap__read_snapshot(map, rec->itr, &rec->tool,
657 					   record__process_auxtrace,
658 					   rec->opts.auxtrace_snapshot_size);
659 	if (ret < 0)
660 		return ret;
661 
662 	if (ret)
663 		rec->samples++;
664 
665 	return 0;
666 }
667 
668 static int record__auxtrace_read_snapshot_all(struct record *rec)
669 {
670 	int i;
671 	int rc = 0;
672 
673 	for (i = 0; i < rec->evlist->core.nr_mmaps; i++) {
674 		struct mmap *map = &rec->evlist->mmap[i];
675 
676 		if (!map->auxtrace_mmap.base)
677 			continue;
678 
679 		if (record__auxtrace_mmap_read_snapshot(rec, map) != 0) {
680 			rc = -1;
681 			goto out;
682 		}
683 	}
684 out:
685 	return rc;
686 }
687 
688 static void record__read_auxtrace_snapshot(struct record *rec, bool on_exit)
689 {
690 	pr_debug("Recording AUX area tracing snapshot\n");
691 	if (record__auxtrace_read_snapshot_all(rec) < 0) {
692 		trigger_error(&auxtrace_snapshot_trigger);
693 	} else {
694 		if (auxtrace_record__snapshot_finish(rec->itr, on_exit))
695 			trigger_error(&auxtrace_snapshot_trigger);
696 		else
697 			trigger_ready(&auxtrace_snapshot_trigger);
698 	}
699 }
700 
701 static int record__auxtrace_snapshot_exit(struct record *rec)
702 {
703 	if (trigger_is_error(&auxtrace_snapshot_trigger))
704 		return 0;
705 
706 	if (!auxtrace_record__snapshot_started &&
707 	    auxtrace_record__snapshot_start(rec->itr))
708 		return -1;
709 
710 	record__read_auxtrace_snapshot(rec, true);
711 	if (trigger_is_error(&auxtrace_snapshot_trigger))
712 		return -1;
713 
714 	return 0;
715 }
716 
717 static int record__auxtrace_init(struct record *rec)
718 {
719 	int err;
720 
721 	if (!rec->itr) {
722 		rec->itr = auxtrace_record__init(rec->evlist, &err);
723 		if (err)
724 			return err;
725 	}
726 
727 	err = auxtrace_parse_snapshot_options(rec->itr, &rec->opts,
728 					      rec->opts.auxtrace_snapshot_opts);
729 	if (err)
730 		return err;
731 
732 	err = auxtrace_parse_sample_options(rec->itr, rec->evlist, &rec->opts,
733 					    rec->opts.auxtrace_sample_opts);
734 	if (err)
735 		return err;
736 
737 	auxtrace_regroup_aux_output(rec->evlist);
738 
739 	return auxtrace_parse_filters(rec->evlist);
740 }
741 
742 #else
743 
744 static inline
745 int record__auxtrace_mmap_read(struct record *rec __maybe_unused,
746 			       struct mmap *map __maybe_unused)
747 {
748 	return 0;
749 }
750 
751 static inline
752 void record__read_auxtrace_snapshot(struct record *rec __maybe_unused,
753 				    bool on_exit __maybe_unused)
754 {
755 }
756 
757 static inline
758 int auxtrace_record__snapshot_start(struct auxtrace_record *itr __maybe_unused)
759 {
760 	return 0;
761 }
762 
763 static inline
764 int record__auxtrace_snapshot_exit(struct record *rec __maybe_unused)
765 {
766 	return 0;
767 }
768 
769 static int record__auxtrace_init(struct record *rec __maybe_unused)
770 {
771 	return 0;
772 }
773 
774 #endif
775 
776 static int record__config_text_poke(struct evlist *evlist)
777 {
778 	struct evsel *evsel;
779 	int err;
780 
781 	/* Nothing to do if text poke is already configured */
782 	evlist__for_each_entry(evlist, evsel) {
783 		if (evsel->core.attr.text_poke)
784 			return 0;
785 	}
786 
787 	err = parse_events(evlist, "dummy:u", NULL);
788 	if (err)
789 		return err;
790 
791 	evsel = evlist__last(evlist);
792 
793 	evsel->core.attr.freq = 0;
794 	evsel->core.attr.sample_period = 1;
795 	evsel->core.attr.text_poke = 1;
796 	evsel->core.attr.ksymbol = 1;
797 
798 	evsel->core.system_wide = true;
799 	evsel->no_aux_samples = true;
800 	evsel->immediate = true;
801 
802 	/* Text poke must be collected on all CPUs */
803 	perf_cpu_map__put(evsel->core.own_cpus);
804 	evsel->core.own_cpus = perf_cpu_map__new(NULL);
805 	perf_cpu_map__put(evsel->core.cpus);
806 	evsel->core.cpus = perf_cpu_map__get(evsel->core.own_cpus);
807 
808 	evsel__set_sample_bit(evsel, TIME);
809 
810 	return 0;
811 }
812 
813 static bool record__kcore_readable(struct machine *machine)
814 {
815 	char kcore[PATH_MAX];
816 	int fd;
817 
818 	scnprintf(kcore, sizeof(kcore), "%s/proc/kcore", machine->root_dir);
819 
820 	fd = open(kcore, O_RDONLY);
821 	if (fd < 0)
822 		return false;
823 
824 	close(fd);
825 
826 	return true;
827 }
828 
829 static int record__kcore_copy(struct machine *machine, struct perf_data *data)
830 {
831 	char from_dir[PATH_MAX];
832 	char kcore_dir[PATH_MAX];
833 	int ret;
834 
835 	snprintf(from_dir, sizeof(from_dir), "%s/proc", machine->root_dir);
836 
837 	ret = perf_data__make_kcore_dir(data, kcore_dir, sizeof(kcore_dir));
838 	if (ret)
839 		return ret;
840 
841 	return kcore_copy(from_dir, kcore_dir);
842 }
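
/*
 * Illustrative usage: with --kcore a copy of /proc/kcore (plus kallsyms and
 * modules) is stored in the perf.data directory, so that kernel text can be
 * decoded later exactly as it was at record time:
 *
 *   perf record --kcore -a -- sleep 10
 */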
843 
844 static int record__mmap_evlist(struct record *rec,
845 			       struct evlist *evlist)
846 {
847 	struct record_opts *opts = &rec->opts;
848 	bool auxtrace_overwrite = opts->auxtrace_snapshot_mode ||
849 				  opts->auxtrace_sample_mode;
850 	char msg[512];
851 
852 	if (opts->affinity != PERF_AFFINITY_SYS)
853 		cpu__setup_cpunode_map();
854 
855 	if (evlist__mmap_ex(evlist, opts->mmap_pages,
856 				 opts->auxtrace_mmap_pages,
857 				 auxtrace_overwrite,
858 				 opts->nr_cblocks, opts->affinity,
859 				 opts->mmap_flush, opts->comp_level) < 0) {
860 		if (errno == EPERM) {
861 			pr_err("Permission error mapping pages.\n"
862 			       "Consider increasing "
863 			       "/proc/sys/kernel/perf_event_mlock_kb,\n"
864 			       "or try again with a smaller value of -m/--mmap_pages.\n"
865 			       "(current value: %u,%u)\n",
866 			       opts->mmap_pages, opts->auxtrace_mmap_pages);
867 			return -errno;
868 		} else {
869 			pr_err("failed to mmap with %d (%s)\n", errno,
870 				str_error_r(errno, msg, sizeof(msg)));
871 			if (errno)
872 				return -errno;
873 			else
874 				return -EINVAL;
875 		}
876 	}
877 	return 0;
878 }
879 
880 static int record__mmap(struct record *rec)
881 {
882 	return record__mmap_evlist(rec, rec->evlist);
883 }
884 
885 static int record__open(struct record *rec)
886 {
887 	char msg[BUFSIZ];
888 	struct evsel *pos;
889 	struct evlist *evlist = rec->evlist;
890 	struct perf_session *session = rec->session;
891 	struct record_opts *opts = &rec->opts;
892 	int rc = 0;
893 
894 	/*
895 	 * For initial_delay, system wide or a hybrid system, we need to add a
896 	 * dummy event so that we can track PERF_RECORD_MMAP events that occur
897 	 * while we are waiting or while events are being synthesized.
898 	 */
899 	if (opts->initial_delay || target__has_cpu(&opts->target) ||
900 	    perf_pmu__has_hybrid()) {
901 		pos = evlist__get_tracking_event(evlist);
902 		if (!evsel__is_dummy_event(pos)) {
903 			/* Set up dummy event. */
904 			if (evlist__add_dummy(evlist))
905 				return -ENOMEM;
906 			pos = evlist__last(evlist);
907 			evlist__set_tracking_event(evlist, pos);
908 		}
909 
910 		/*
911 		 * Enable the dummy event when the process is forked for
912 		 * initial_delay, immediately for system wide.
913 		 */
914 		if (opts->initial_delay && !pos->immediate &&
915 		    !target__has_cpu(&opts->target))
916 			pos->core.attr.enable_on_exec = 1;
917 		else
918 			pos->immediate = 1;
919 	}
920 
921 	evlist__config(evlist, opts, &callchain_param);
922 
923 	evlist__for_each_entry(evlist, pos) {
924 try_again:
925 		if (evsel__open(pos, pos->core.cpus, pos->core.threads) < 0) {
926 			if (evsel__fallback(pos, errno, msg, sizeof(msg))) {
927 				if (verbose > 0)
928 					ui__warning("%s\n", msg);
929 				goto try_again;
930 			}
931 			if ((errno == EINVAL || errno == EBADF) &&
932 			    pos->core.leader != &pos->core &&
933 			    pos->weak_group) {
934 				pos = evlist__reset_weak_group(evlist, pos, true);
935 				goto try_again;
936 			}
937 			rc = -errno;
938 			evsel__open_strerror(pos, &opts->target, errno, msg, sizeof(msg));
939 			ui__error("%s\n", msg);
940 			goto out;
941 		}
942 
943 		pos->supported = true;
944 	}
945 
946 	if (symbol_conf.kptr_restrict && !evlist__exclude_kernel(evlist)) {
947 		pr_warning(
948 "WARNING: Kernel address maps (/proc/{kallsyms,modules}) are restricted,\n"
949 "check /proc/sys/kernel/kptr_restrict and /proc/sys/kernel/perf_event_paranoid.\n\n"
950 "Samples in kernel functions may not be resolved if a suitable vmlinux\n"
951 "file is not found in the buildid cache or in the vmlinux path.\n\n"
952 "Samples in kernel modules won't be resolved at all.\n\n"
953 "If some relocation was applied (e.g. kexec) symbols may be misresolved\n"
954 "even with a suitable vmlinux or kallsyms file.\n\n");
955 	}
956 
957 	if (evlist__apply_filters(evlist, &pos)) {
958 		pr_err("failed to set filter \"%s\" on event %s with %d (%s)\n",
959 			pos->filter, evsel__name(pos), errno,
960 			str_error_r(errno, msg, sizeof(msg)));
961 		rc = -1;
962 		goto out;
963 	}
964 
965 	rc = record__mmap(rec);
966 	if (rc)
967 		goto out;
968 
969 	session->evlist = evlist;
970 	perf_session__set_id_hdr_size(session);
971 out:
972 	return rc;
973 }
974 
975 static void set_timestamp_boundary(struct record *rec, u64 sample_time)
976 {
977 	if (rec->evlist->first_sample_time == 0)
978 		rec->evlist->first_sample_time = sample_time;
979 
980 	if (sample_time)
981 		rec->evlist->last_sample_time = sample_time;
982 }
983 
984 static int process_sample_event(struct perf_tool *tool,
985 				union perf_event *event,
986 				struct perf_sample *sample,
987 				struct evsel *evsel,
988 				struct machine *machine)
989 {
990 	struct record *rec = container_of(tool, struct record, tool);
991 
992 	set_timestamp_boundary(rec, sample->time);
993 
994 	if (rec->buildid_all)
995 		return 0;
996 
997 	rec->samples++;
998 	return build_id__mark_dso_hit(tool, event, sample, evsel, machine);
999 }
1000 
1001 static int process_buildids(struct record *rec)
1002 {
1003 	struct perf_session *session = rec->session;
1004 
1005 	if (perf_data__size(&rec->data) == 0)
1006 		return 0;
1007 
1008 	/*
1009 	 * During this process, it'll load kernel map and replace the
1010 	 * dso->long_name to a real pathname it found.  In this case
1011 	 * we prefer the vmlinux path like
1012 	 *   /lib/modules/3.16.4/build/vmlinux
1013 	 *
1014 	 * rather than build-id path (in debug directory).
1015 	 *   $HOME/.debug/.build-id/f0/6e17aa50adf4d00b88925e03775de107611551
1016 	 */
1017 	symbol_conf.ignore_vmlinux_buildid = true;
1018 
1019 	/*
1020 	 * If --buildid-all is given, it marks all DSOs regardless of hits,
1021 	 * so no need to process samples. But if timestamp_boundary is enabled,
1022 	 * it still needs to walk on all samples to get the timestamps of
1023 	 * first/last samples.
1024 	 */
1025 	if (rec->buildid_all && !rec->timestamp_boundary)
1026 		rec->tool.sample = NULL;
1027 
1028 	return perf_session__process_events(session);
1029 }
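
/*
 * Illustrative usage of the two knobs consulted above:
 *
 *   perf record --buildid-all ...          # mark every DSO, skip samples
 *   perf record --timestamp-boundary ...   # still walk samples to get the
 *                                          # first/last sample times
 */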
1030 
1031 static void perf_event__synthesize_guest_os(struct machine *machine, void *data)
1032 {
1033 	int err;
1034 	struct perf_tool *tool = data;
1035 	/*
1036 	 * As for the guest kernel, when processing the record & report subcommands,
1037 	 * we arrange the module mmaps prior to the guest kernel mmap and trigger
1038 	 * a DSO preload, because by default guest module symbols are loaded
1039 	 * from guest kallsyms instead of /lib/modules/XXX/XXX. This
1040 	 * method avoids missing symbols when the first address falls
1041 	 * in a module instead of in the guest kernel.
1042 	 */
1043 	err = perf_event__synthesize_modules(tool, process_synthesized_event,
1044 					     machine);
1045 	if (err < 0)
1046 		pr_err("Couldn't record guest kernel [%d]'s reference"
1047 		       " relocation symbol.\n", machine->pid);
1048 
1049 	/*
1050 	 * We use _stext for the guest kernel because the guest kernel's
1051 	 * /proc/kallsyms sometimes has no _text.
1052 	 */
1053 	err = perf_event__synthesize_kernel_mmap(tool, process_synthesized_event,
1054 						 machine);
1055 	if (err < 0)
1056 		pr_err("Couldn't record guest kernel [%d]'s reference"
1057 		       " relocation symbol.\n", machine->pid);
1058 }
1059 
1060 static struct perf_event_header finished_round_event = {
1061 	.size = sizeof(struct perf_event_header),
1062 	.type = PERF_RECORD_FINISHED_ROUND,
1063 };
1064 
1065 static void record__adjust_affinity(struct record *rec, struct mmap *map)
1066 {
1067 	if (rec->opts.affinity != PERF_AFFINITY_SYS &&
1068 	    !bitmap_equal(rec->affinity_mask.bits, map->affinity_mask.bits,
1069 			  rec->affinity_mask.nbits)) {
1070 		bitmap_zero(rec->affinity_mask.bits, rec->affinity_mask.nbits);
1071 		bitmap_or(rec->affinity_mask.bits, rec->affinity_mask.bits,
1072 			  map->affinity_mask.bits, rec->affinity_mask.nbits);
1073 		sched_setaffinity(0, MMAP_CPU_MASK_BYTES(&rec->affinity_mask),
1074 				  (cpu_set_t *)rec->affinity_mask.bits);
1075 		if (verbose == 2)
1076 			mmap_cpu_mask__scnprintf(&rec->affinity_mask, "thread");
1077 	}
1078 }
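
/*
 * The affinity mask above only changes when --affinity=node or
 * --affinity=cpu was requested (PERF_AFFINITY_SYS, the default, leaves the
 * reading thread unbound), e.g.:
 *
 *   perf record --affinity=node ...   # follow the node of the current mmap
 */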
1079 
1080 static size_t process_comp_header(void *record, size_t increment)
1081 {
1082 	struct perf_record_compressed *event = record;
1083 	size_t size = sizeof(*event);
1084 
1085 	if (increment) {
1086 		event->header.size += increment;
1087 		return increment;
1088 	}
1089 
1090 	event->header.type = PERF_RECORD_COMPRESSED;
1091 	event->header.size = size;
1092 
1093 	return size;
1094 }
1095 
1096 static size_t zstd_compress(struct perf_session *session, void *dst, size_t dst_size,
1097 			    void *src, size_t src_size)
1098 {
1099 	size_t compressed;
1100 	size_t max_record_size = PERF_SAMPLE_MAX_SIZE - sizeof(struct perf_record_compressed) - 1;
1101 
1102 	compressed = zstd_compress_stream_to_records(&session->zstd_data, dst, dst_size, src, src_size,
1103 						     max_record_size, process_comp_header);
1104 
1105 	session->bytes_transferred += src_size;
1106 	session->bytes_compressed  += compressed;
1107 
1108 	return compressed;
1109 }
1110 
1111 static int record__mmap_read_evlist(struct record *rec, struct evlist *evlist,
1112 				    bool overwrite, bool synch)
1113 {
1114 	u64 bytes_written = rec->bytes_written;
1115 	int i;
1116 	int rc = 0;
1117 	struct mmap *maps;
1118 	int trace_fd = rec->data.file.fd;
1119 	off_t off = 0;
1120 
1121 	if (!evlist)
1122 		return 0;
1123 
1124 	maps = overwrite ? evlist->overwrite_mmap : evlist->mmap;
1125 	if (!maps)
1126 		return 0;
1127 
1128 	if (overwrite && evlist->bkw_mmap_state != BKW_MMAP_DATA_PENDING)
1129 		return 0;
1130 
1131 	if (record__aio_enabled(rec))
1132 		off = record__aio_get_pos(trace_fd);
1133 
1134 	for (i = 0; i < evlist->core.nr_mmaps; i++) {
1135 		u64 flush = 0;
1136 		struct mmap *map = &maps[i];
1137 
1138 		if (map->core.base) {
1139 			record__adjust_affinity(rec, map);
1140 			if (synch) {
1141 				flush = map->core.flush;
1142 				map->core.flush = 1;
1143 			}
1144 			if (!record__aio_enabled(rec)) {
1145 				if (perf_mmap__push(map, rec, record__pushfn) < 0) {
1146 					if (synch)
1147 						map->core.flush = flush;
1148 					rc = -1;
1149 					goto out;
1150 				}
1151 			} else {
1152 				if (record__aio_push(rec, map, &off) < 0) {
1153 					record__aio_set_pos(trace_fd, off);
1154 					if (synch)
1155 						map->core.flush = flush;
1156 					rc = -1;
1157 					goto out;
1158 				}
1159 			}
1160 			if (synch)
1161 				map->core.flush = flush;
1162 		}
1163 
1164 		if (map->auxtrace_mmap.base && !rec->opts.auxtrace_snapshot_mode &&
1165 		    !rec->opts.auxtrace_sample_mode &&
1166 		    record__auxtrace_mmap_read(rec, map) != 0) {
1167 			rc = -1;
1168 			goto out;
1169 		}
1170 	}
1171 
1172 	if (record__aio_enabled(rec))
1173 		record__aio_set_pos(trace_fd, off);
1174 
1175 	/*
1176 	 * Mark the round finished in case we wrote
1177 	 * at least one event.
1178 	 */
1179 	if (bytes_written != rec->bytes_written)
1180 		rc = record__write(rec, NULL, &finished_round_event, sizeof(finished_round_event));
1181 
1182 	if (overwrite)
1183 		evlist__toggle_bkw_mmap(evlist, BKW_MMAP_EMPTY);
1184 out:
1185 	return rc;
1186 }
1187 
1188 static int record__mmap_read_all(struct record *rec, bool synch)
1189 {
1190 	int err;
1191 
1192 	err = record__mmap_read_evlist(rec, rec->evlist, false, synch);
1193 	if (err)
1194 		return err;
1195 
1196 	return record__mmap_read_evlist(rec, rec->evlist, true, synch);
1197 }
1198 
1199 static void record__init_features(struct record *rec)
1200 {
1201 	struct perf_session *session = rec->session;
1202 	int feat;
1203 
1204 	for (feat = HEADER_FIRST_FEATURE; feat < HEADER_LAST_FEATURE; feat++)
1205 		perf_header__set_feat(&session->header, feat);
1206 
1207 	if (rec->no_buildid)
1208 		perf_header__clear_feat(&session->header, HEADER_BUILD_ID);
1209 
1210 	if (!have_tracepoints(&rec->evlist->core.entries))
1211 		perf_header__clear_feat(&session->header, HEADER_TRACING_DATA);
1212 
1213 	if (!rec->opts.branch_stack)
1214 		perf_header__clear_feat(&session->header, HEADER_BRANCH_STACK);
1215 
1216 	if (!rec->opts.full_auxtrace)
1217 		perf_header__clear_feat(&session->header, HEADER_AUXTRACE);
1218 
1219 	if (!(rec->opts.use_clockid && rec->opts.clockid_res_ns))
1220 		perf_header__clear_feat(&session->header, HEADER_CLOCKID);
1221 
1222 	if (!rec->opts.use_clockid)
1223 		perf_header__clear_feat(&session->header, HEADER_CLOCK_DATA);
1224 
1225 	perf_header__clear_feat(&session->header, HEADER_DIR_FORMAT);
1226 	if (!record__comp_enabled(rec))
1227 		perf_header__clear_feat(&session->header, HEADER_COMPRESSED);
1228 
1229 	perf_header__clear_feat(&session->header, HEADER_STAT);
1230 }
1231 
1232 static void
1233 record__finish_output(struct record *rec)
1234 {
1235 	struct perf_data *data = &rec->data;
1236 	int fd = perf_data__fd(data);
1237 
1238 	if (data->is_pipe)
1239 		return;
1240 
1241 	rec->session->header.data_size += rec->bytes_written;
1242 	data->file.size = lseek(perf_data__fd(data), 0, SEEK_CUR);
1243 
1244 	if (!rec->no_buildid) {
1245 		process_buildids(rec);
1246 
1247 		if (rec->buildid_all)
1248 			dsos__hit_all(rec->session);
1249 	}
1250 	perf_session__write_header(rec->session, rec->evlist, fd, true);
1251 
1252 	return;
1253 }
1254 
1255 static int record__synthesize_workload(struct record *rec, bool tail)
1256 {
1257 	int err;
1258 	struct perf_thread_map *thread_map;
1259 	bool needs_mmap = rec->opts.synth & PERF_SYNTH_MMAP;
1260 
1261 	if (rec->opts.tail_synthesize != tail)
1262 		return 0;
1263 
1264 	thread_map = thread_map__new_by_tid(rec->evlist->workload.pid);
1265 	if (thread_map == NULL)
1266 		return -1;
1267 
1268 	err = perf_event__synthesize_thread_map(&rec->tool, thread_map,
1269 						 process_synthesized_event,
1270 						 &rec->session->machines.host,
1271 						 needs_mmap,
1272 						 rec->opts.sample_address);
1273 	perf_thread_map__put(thread_map);
1274 	return err;
1275 }
1276 
1277 static int record__synthesize(struct record *rec, bool tail);
1278 
1279 static int
1280 record__switch_output(struct record *rec, bool at_exit)
1281 {
1282 	struct perf_data *data = &rec->data;
1283 	int fd, err;
1284 	char *new_filename;
1285 
1286 	/* Same size as a real timestamp, e.g. "2015122520103046" */
1287 	char timestamp[] = "InvalidTimestamp";
1288 
1289 	record__aio_mmap_read_sync(rec);
1290 
1291 	record__synthesize(rec, true);
1292 	if (target__none(&rec->opts.target))
1293 		record__synthesize_workload(rec, true);
1294 
1295 	rec->samples = 0;
1296 	record__finish_output(rec);
1297 	err = fetch_current_timestamp(timestamp, sizeof(timestamp));
1298 	if (err) {
1299 		pr_err("Failed to get current timestamp\n");
1300 		return -EINVAL;
1301 	}
1302 
1303 	fd = perf_data__switch(data, timestamp,
1304 				    rec->session->header.data_offset,
1305 				    at_exit, &new_filename);
1306 	if (fd >= 0 && !at_exit) {
1307 		rec->bytes_written = 0;
1308 		rec->session->header.data_size = 0;
1309 	}
1310 
1311 	if (!quiet)
1312 		fprintf(stderr, "[ perf record: Dump %s.%s ]\n",
1313 			data->path, timestamp);
1314 
1315 	if (rec->switch_output.num_files) {
1316 		int n = rec->switch_output.cur_file + 1;
1317 
1318 		if (n >= rec->switch_output.num_files)
1319 			n = 0;
1320 		rec->switch_output.cur_file = n;
1321 		if (rec->switch_output.filenames[n]) {
1322 			remove(rec->switch_output.filenames[n]);
1323 			zfree(&rec->switch_output.filenames[n]);
1324 		}
1325 		rec->switch_output.filenames[n] = new_filename;
1326 	} else {
1327 		free(new_filename);
1328 	}
1329 
1330 	/* Output tracking events */
1331 	if (!at_exit) {
1332 		record__synthesize(rec, false);
1333 
1334 		/*
1335 		 * In 'perf record --switch-output' without -a,
1336 		 * record__synthesize() in record__switch_output() won't
1337 		 * generate tracking events because there's no thread_map
1338 		 * in the evlist, so the newly created perf.data wouldn't
1339 		 * contain mmap and comm information.
1340 		 * Create a fake thread_map and directly call
1341 		 * perf_event__synthesize_thread_map() for those events.
1342 		 */
1343 		if (target__none(&rec->opts.target))
1344 			record__synthesize_workload(rec, false);
1345 	}
1346 	return fd;
1347 }
1348 
1349 static volatile int workload_exec_errno;
1350 
1351 /*
1352  * evlist__prepare_workload will send a SIGUSR1
1353  * if the fork fails, since we asked for it by setting its
1354  * want_signal to true.
1355  */
1356 static void workload_exec_failed_signal(int signo __maybe_unused,
1357 					siginfo_t *info,
1358 					void *ucontext __maybe_unused)
1359 {
1360 	workload_exec_errno = info->si_value.sival_int;
1361 	done = 1;
1362 	child_finished = 1;
1363 }
1364 
1365 static void snapshot_sig_handler(int sig);
1366 static void alarm_sig_handler(int sig);
1367 
1368 static const struct perf_event_mmap_page *evlist__pick_pc(struct evlist *evlist)
1369 {
1370 	if (evlist) {
1371 		if (evlist->mmap && evlist->mmap[0].core.base)
1372 			return evlist->mmap[0].core.base;
1373 		if (evlist->overwrite_mmap && evlist->overwrite_mmap[0].core.base)
1374 			return evlist->overwrite_mmap[0].core.base;
1375 	}
1376 	return NULL;
1377 }
1378 
1379 static const struct perf_event_mmap_page *record__pick_pc(struct record *rec)
1380 {
1381 	const struct perf_event_mmap_page *pc = evlist__pick_pc(rec->evlist);
1382 	if (pc)
1383 		return pc;
1384 	return NULL;
1385 }
1386 
1387 static int record__synthesize(struct record *rec, bool tail)
1388 {
1389 	struct perf_session *session = rec->session;
1390 	struct machine *machine = &session->machines.host;
1391 	struct perf_data *data = &rec->data;
1392 	struct record_opts *opts = &rec->opts;
1393 	struct perf_tool *tool = &rec->tool;
1394 	int err = 0;
1395 	event_op f = process_synthesized_event;
1396 
1397 	if (rec->opts.tail_synthesize != tail)
1398 		return 0;
1399 
1400 	if (data->is_pipe) {
1401 		err = perf_event__synthesize_for_pipe(tool, session, data,
1402 						      process_synthesized_event);
1403 		if (err < 0)
1404 			goto out;
1405 
1406 		rec->bytes_written += err;
1407 	}
1408 
1409 	err = perf_event__synth_time_conv(record__pick_pc(rec), tool,
1410 					  process_synthesized_event, machine);
1411 	if (err)
1412 		goto out;
1413 
1414 	/* Synthesize id_index before auxtrace_info */
1415 	if (rec->opts.auxtrace_sample_mode || rec->opts.full_auxtrace) {
1416 		err = perf_event__synthesize_id_index(tool,
1417 						      process_synthesized_event,
1418 						      session->evlist, machine);
1419 		if (err)
1420 			goto out;
1421 	}
1422 
1423 	if (rec->opts.full_auxtrace) {
1424 		err = perf_event__synthesize_auxtrace_info(rec->itr, tool,
1425 					session, process_synthesized_event);
1426 		if (err)
1427 			goto out;
1428 	}
1429 
1430 	if (!evlist__exclude_kernel(rec->evlist)) {
1431 		err = perf_event__synthesize_kernel_mmap(tool, process_synthesized_event,
1432 							 machine);
1433 		WARN_ONCE(err < 0, "Couldn't record kernel reference relocation symbol\n"
1434 				   "Symbol resolution may be skewed if relocation was used (e.g. kexec).\n"
1435 				   "Check /proc/kallsyms permission or run as root.\n");
1436 
1437 		err = perf_event__synthesize_modules(tool, process_synthesized_event,
1438 						     machine);
1439 		WARN_ONCE(err < 0, "Couldn't record kernel module information.\n"
1440 				   "Symbol resolution may be skewed if relocation was used (e.g. kexec).\n"
1441 				   "Check /proc/modules permission or run as root.\n");
1442 	}
1443 
1444 	if (perf_guest) {
1445 		machines__process_guests(&session->machines,
1446 					 perf_event__synthesize_guest_os, tool);
1447 	}
1448 
1449 	err = perf_event__synthesize_extra_attr(&rec->tool,
1450 						rec->evlist,
1451 						process_synthesized_event,
1452 						data->is_pipe);
1453 	if (err)
1454 		goto out;
1455 
1456 	err = perf_event__synthesize_thread_map2(&rec->tool, rec->evlist->core.threads,
1457 						 process_synthesized_event,
1458 						NULL);
1459 	if (err < 0) {
1460 		pr_err("Couldn't synthesize thread map.\n");
1461 		return err;
1462 	}
1463 
1464 	err = perf_event__synthesize_cpu_map(&rec->tool, rec->evlist->core.cpus,
1465 					     process_synthesized_event, NULL);
1466 	if (err < 0) {
1467 		pr_err("Couldn't synthesize cpu map.\n");
1468 		return err;
1469 	}
1470 
1471 	err = perf_event__synthesize_bpf_events(session, process_synthesized_event,
1472 						machine, opts);
1473 	if (err < 0)
1474 		pr_warning("Couldn't synthesize bpf events.\n");
1475 
1476 	if (rec->opts.synth & PERF_SYNTH_CGROUP) {
1477 		err = perf_event__synthesize_cgroups(tool, process_synthesized_event,
1478 						     machine);
1479 		if (err < 0)
1480 			pr_warning("Couldn't synthesize cgroup events.\n");
1481 	}
1482 
1483 	if (rec->opts.nr_threads_synthesize > 1) {
1484 		perf_set_multithreaded();
1485 		f = process_locked_synthesized_event;
1486 	}
1487 
1488 	if (rec->opts.synth & PERF_SYNTH_TASK) {
1489 		bool needs_mmap = rec->opts.synth & PERF_SYNTH_MMAP;
1490 
1491 		err = __machine__synthesize_threads(machine, tool, &opts->target,
1492 						    rec->evlist->core.threads,
1493 						    f, needs_mmap, opts->sample_address,
1494 						    rec->opts.nr_threads_synthesize);
1495 	}
1496 
1497 	if (rec->opts.nr_threads_synthesize > 1)
1498 		perf_set_singlethreaded();
1499 
1500 out:
1501 	return err;
1502 }
1503 
1504 static int record__process_signal_event(union perf_event *event __maybe_unused, void *data)
1505 {
1506 	struct record *rec = data;
1507 	pthread_kill(rec->thread_id, SIGUSR2);
1508 	return 0;
1509 }
1510 
1511 static int record__setup_sb_evlist(struct record *rec)
1512 {
1513 	struct record_opts *opts = &rec->opts;
1514 
1515 	if (rec->sb_evlist != NULL) {
1516 		/*
1517 		 * We get here if --switch-output-event populated the
1518 		 * sb_evlist, so associate a callback that will send a SIGUSR2
1519 		 * to the main thread.
1520 		 */
1521 		evlist__set_cb(rec->sb_evlist, record__process_signal_event, rec);
1522 		rec->thread_id = pthread_self();
1523 	}
1524 #ifdef HAVE_LIBBPF_SUPPORT
1525 	if (!opts->no_bpf_event) {
1526 		if (rec->sb_evlist == NULL) {
1527 			rec->sb_evlist = evlist__new();
1528 
1529 			if (rec->sb_evlist == NULL) {
1530 				pr_err("Couldn't create side band evlist.\n");
1531 				return -1;
1532 			}
1533 		}
1534 
1535 		if (evlist__add_bpf_sb_event(rec->sb_evlist, &rec->session->header.env)) {
1536 			pr_err("Couldn't ask for PERF_RECORD_BPF_EVENT side band events.\n");
1537 			return -1;
1538 		}
1539 	}
1540 #endif
1541 	if (evlist__start_sb_thread(rec->sb_evlist, &rec->opts.target)) {
1542 		pr_debug("Couldn't start the BPF side band thread:\nBPF programs starting from now on won't be annotatable\n");
1543 		opts->no_bpf_event = true;
1544 	}
1545 
1546 	return 0;
1547 }
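
/*
 * Illustrative usage: the side band evlist is populated implicitly for
 * PERF_RECORD_BPF_EVENT tracking, or explicitly via --switch-output-event,
 * e.g.:
 *
 *   perf record --switch-output-event=syscalls:sys_enter_execve ...
 */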
1548 
1549 static int record__init_clock(struct record *rec)
1550 {
1551 	struct perf_session *session = rec->session;
1552 	struct timespec ref_clockid;
1553 	struct timeval ref_tod;
1554 	u64 ref;
1555 
1556 	if (!rec->opts.use_clockid)
1557 		return 0;
1558 
1559 	if (rec->opts.use_clockid && rec->opts.clockid_res_ns)
1560 		session->header.env.clock.clockid_res_ns = rec->opts.clockid_res_ns;
1561 
1562 	session->header.env.clock.clockid = rec->opts.clockid;
1563 
1564 	if (gettimeofday(&ref_tod, NULL) != 0) {
1565 		pr_err("gettimeofday failed, cannot set reference time.\n");
1566 		return -1;
1567 	}
1568 
1569 	if (clock_gettime(rec->opts.clockid, &ref_clockid)) {
1570 		pr_err("clock_gettime failed, cannot set reference time.\n");
1571 		return -1;
1572 	}
1573 
1574 	ref = (u64) ref_tod.tv_sec * NSEC_PER_SEC +
1575 	      (u64) ref_tod.tv_usec * NSEC_PER_USEC;
1576 
1577 	session->header.env.clock.tod_ns = ref;
1578 
1579 	ref = (u64) ref_clockid.tv_sec * NSEC_PER_SEC +
1580 	      (u64) ref_clockid.tv_nsec;
1581 
1582 	session->header.env.clock.clockid_ns = ref;
1583 	return 0;
1584 }
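
/*
 * The TOD/clockid reference pair recorded above is what lets later tooling
 * relate sample timestamps to wall-clock time when -k/--clockid is used,
 * e.g.:
 *
 *   perf record -k CLOCK_MONOTONIC ...
 */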
1585 
1586 static void hit_auxtrace_snapshot_trigger(struct record *rec)
1587 {
1588 	if (trigger_is_ready(&auxtrace_snapshot_trigger)) {
1589 		trigger_hit(&auxtrace_snapshot_trigger);
1590 		auxtrace_record__snapshot_started = 1;
1591 		if (auxtrace_record__snapshot_start(rec->itr))
1592 			trigger_error(&auxtrace_snapshot_trigger);
1593 	}
1594 }
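
/*
 * Illustrative AUX area snapshot flow (Intel PT is just one example of an
 * auxtrace PMU): record in snapshot mode and send SIGUSR2, or the
 * 'snapshot' control command, whenever a dump of the most recent trace
 * data is wanted:
 *
 *   perf record -e intel_pt// --snapshot -a &
 *   kill -USR2 $!    # each signal captures one snapshot
 */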
1595 
1596 static void record__uniquify_name(struct record *rec)
1597 {
1598 	struct evsel *pos;
1599 	struct evlist *evlist = rec->evlist;
1600 	char *new_name;
1601 	int ret;
1602 
1603 	if (!perf_pmu__has_hybrid())
1604 		return;
1605 
1606 	evlist__for_each_entry(evlist, pos) {
1607 		if (!evsel__is_hybrid(pos))
1608 			continue;
1609 
1610 		if (strchr(pos->name, '/'))
1611 			continue;
1612 
1613 		ret = asprintf(&new_name, "%s/%s/",
1614 			       pos->pmu_name, pos->name);
1615 		if (ret) {
1616 			free(pos->name);
1617 			pos->name = new_name;
1618 		}
1619 	}
1620 }
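
/*
 * Illustrative effect on a hybrid system (the PMU names are examples): an
 * event opened as plain "cycles" on both hybrid PMUs is renamed so the two
 * instances remain distinguishable, e.g. "cpu_core/cycles/" and
 * "cpu_atom/cycles/".
 */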
1621 
1622 static int __cmd_record(struct record *rec, int argc, const char **argv)
1623 {
1624 	int err;
1625 	int status = 0;
1626 	unsigned long waking = 0;
1627 	const bool forks = argc > 0;
1628 	struct perf_tool *tool = &rec->tool;
1629 	struct record_opts *opts = &rec->opts;
1630 	struct perf_data *data = &rec->data;
1631 	struct perf_session *session;
1632 	bool disabled = false, draining = false;
1633 	int fd;
1634 	float ratio = 0;
1635 	enum evlist_ctl_cmd cmd = EVLIST_CTL_CMD_UNSUPPORTED;
1636 
1637 	atexit(record__sig_exit);
1638 	signal(SIGCHLD, sig_handler);
1639 	signal(SIGINT, sig_handler);
1640 	signal(SIGTERM, sig_handler);
1641 	signal(SIGSEGV, sigsegv_handler);
1642 
1643 	if (rec->opts.record_namespaces)
1644 		tool->namespace_events = true;
1645 
1646 	if (rec->opts.record_cgroup) {
1647 #ifdef HAVE_FILE_HANDLE
1648 		tool->cgroup_events = true;
1649 #else
1650 		pr_err("cgroup tracking is not supported\n");
1651 		return -1;
1652 #endif
1653 	}
1654 
1655 	if (rec->opts.auxtrace_snapshot_mode || rec->switch_output.enabled) {
1656 		signal(SIGUSR2, snapshot_sig_handler);
1657 		if (rec->opts.auxtrace_snapshot_mode)
1658 			trigger_on(&auxtrace_snapshot_trigger);
1659 		if (rec->switch_output.enabled)
1660 			trigger_on(&switch_output_trigger);
1661 	} else {
1662 		signal(SIGUSR2, SIG_IGN);
1663 	}
1664 
1665 	session = perf_session__new(data, tool);
1666 	if (IS_ERR(session)) {
1667 		pr_err("Perf session creation failed.\n");
1668 		return PTR_ERR(session);
1669 	}
1670 
1671 	fd = perf_data__fd(data);
1672 	rec->session = session;
1673 
1674 	if (zstd_init(&session->zstd_data, rec->opts.comp_level) < 0) {
1675 		pr_err("Compression initialization failed.\n");
1676 		return -1;
1677 	}
1678 #ifdef HAVE_EVENTFD_SUPPORT
1679 	done_fd = eventfd(0, EFD_NONBLOCK);
1680 	if (done_fd < 0) {
1681 		pr_err("Failed to create wakeup eventfd, error: %m\n");
1682 		status = -1;
1683 		goto out_delete_session;
1684 	}
1685 	err = evlist__add_wakeup_eventfd(rec->evlist, done_fd);
1686 	if (err < 0) {
1687 		pr_err("Failed to add wakeup eventfd to poll list\n");
1688 		status = err;
1689 		goto out_delete_session;
1690 	}
1691 #endif // HAVE_EVENTFD_SUPPORT
1692 
1693 	session->header.env.comp_type  = PERF_COMP_ZSTD;
1694 	session->header.env.comp_level = rec->opts.comp_level;
1695 
1696 	if (rec->opts.kcore &&
1697 	    !record__kcore_readable(&session->machines.host)) {
1698 		pr_err("ERROR: kcore is not readable.\n");
1699 		return -1;
1700 	}
1701 
1702 	if (record__init_clock(rec))
1703 		return -1;
1704 
1705 	record__init_features(rec);
1706 
1707 	if (forks) {
1708 		err = evlist__prepare_workload(rec->evlist, &opts->target, argv, data->is_pipe,
1709 					       workload_exec_failed_signal);
1710 		if (err < 0) {
1711 			pr_err("Couldn't run the workload!\n");
1712 			status = err;
1713 			goto out_delete_session;
1714 		}
1715 	}
1716 
1717 	/*
1718 	 * If we have just a single event and are sending data
1719 	 * through a pipe, we need to force sample id allocation,
1720 	 * because we synthesize the event name through the pipe
1721 	 * and need the id for that.
1722 	 */
1723 	if (data->is_pipe && rec->evlist->core.nr_entries == 1)
1724 		rec->opts.sample_id = true;
1725 
1726 	record__uniquify_name(rec);
1727 
1728 	if (record__open(rec) != 0) {
1729 		err = -1;
1730 		goto out_child;
1731 	}
1732 	session->header.env.comp_mmap_len = session->evlist->core.mmap_len;
1733 
1734 	if (rec->opts.kcore) {
1735 		err = record__kcore_copy(&session->machines.host, data);
1736 		if (err) {
1737 			pr_err("ERROR: Failed to copy kcore\n");
1738 			goto out_child;
1739 		}
1740 	}
1741 
1742 	err = bpf__apply_obj_config();
1743 	if (err) {
1744 		char errbuf[BUFSIZ];
1745 
1746 		bpf__strerror_apply_obj_config(err, errbuf, sizeof(errbuf));
1747 		pr_err("ERROR: Apply config to BPF failed: %s\n",
1748 			 errbuf);
1749 		goto out_child;
1750 	}
1751 
1752 	/*
1753 	 * Normally perf_session__new would do this, but it doesn't have the
1754 	 * evlist.
1755 	 */
1756 	if (rec->tool.ordered_events && !evlist__sample_id_all(rec->evlist)) {
1757 		pr_warning("WARNING: No sample_id_all support, falling back to unordered processing\n");
1758 		rec->tool.ordered_events = false;
1759 	}
1760 
1761 	if (!rec->evlist->core.nr_groups)
1762 		perf_header__clear_feat(&session->header, HEADER_GROUP_DESC);
1763 
1764 	if (data->is_pipe) {
1765 		err = perf_header__write_pipe(fd);
1766 		if (err < 0)
1767 			goto out_child;
1768 	} else {
1769 		err = perf_session__write_header(session, rec->evlist, fd, false);
1770 		if (err < 0)
1771 			goto out_child;
1772 	}
1773 
1774 	err = -1;
1775 	if (!rec->no_buildid
1776 	    && !perf_header__has_feat(&session->header, HEADER_BUILD_ID)) {
1777 		pr_err("Couldn't generate buildids. "
1778 		       "Use --no-buildid to profile anyway.\n");
1779 		goto out_child;
1780 	}
1781 
1782 	err = record__setup_sb_evlist(rec);
1783 	if (err)
1784 		goto out_child;
1785 
1786 	err = record__synthesize(rec, false);
1787 	if (err < 0)
1788 		goto out_child;
1789 
1790 	if (rec->realtime_prio) {
1791 		struct sched_param param;
1792 
1793 		param.sched_priority = rec->realtime_prio;
1794 		if (sched_setscheduler(0, SCHED_FIFO, &param)) {
1795 			pr_err("Could not set realtime priority.\n");
1796 			err = -1;
1797 			goto out_child;
1798 		}
1799 	}
1800 
1801 	/*
1802 	 * When perf is starting the traced process, all the events
1803 	 * (apart from group members) have enable_on_exec=1 set,
1804 	 * so don't spoil it by prematurely enabling them.
1805 	 */
1806 	if (!target__none(&opts->target) && !opts->initial_delay)
1807 		evlist__enable(rec->evlist);
1808 
1809 	/*
1810 	 * Let the child rip
1811 	 */
1812 	if (forks) {
1813 		struct machine *machine = &session->machines.host;
1814 		union perf_event *event;
1815 		pid_t tgid;
1816 
1817 		event = malloc(sizeof(event->comm) + machine->id_hdr_size);
1818 		if (event == NULL) {
1819 			err = -ENOMEM;
1820 			goto out_child;
1821 		}
1822 
1823 		/*
1824 		 * Some H/W events are generated before the COMM event,
1825 		 * which is emitted during exec(), so perf script
1826 		 * cannot see a correct process name for those events.
1827 		 * Synthesize a COMM event to prevent that.
1828 		 */
1829 		tgid = perf_event__synthesize_comm(tool, event,
1830 						   rec->evlist->workload.pid,
1831 						   process_synthesized_event,
1832 						   machine);
1833 		free(event);
1834 
1835 		if (tgid == -1)
1836 			goto out_child;
1837 
1838 		event = malloc(sizeof(event->namespaces) +
1839 			       (NR_NAMESPACES * sizeof(struct perf_ns_link_info)) +
1840 			       machine->id_hdr_size);
1841 		if (event == NULL) {
1842 			err = -ENOMEM;
1843 			goto out_child;
1844 		}
1845 
1846 		/*
1847 		 * Synthesize NAMESPACES event for the command specified.
1848 		 */
1849 		perf_event__synthesize_namespaces(tool, event,
1850 						  rec->evlist->workload.pid,
1851 						  tgid, process_synthesized_event,
1852 						  machine);
1853 		free(event);
1854 
1855 		evlist__start_workload(rec->evlist);
1856 	}
1857 
1858 	if (evlist__initialize_ctlfd(rec->evlist, opts->ctl_fd, opts->ctl_fd_ack))
1859 		goto out_child;
1860 
1861 	if (opts->initial_delay) {
1862 		pr_info(EVLIST_DISABLED_MSG);
1863 		if (opts->initial_delay > 0) {
1864 			usleep(opts->initial_delay * USEC_PER_MSEC);
1865 			evlist__enable(rec->evlist);
1866 			pr_info(EVLIST_ENABLED_MSG);
1867 		}
1868 	}
1869 
1870 	trigger_ready(&auxtrace_snapshot_trigger);
1871 	trigger_ready(&switch_output_trigger);
1872 	perf_hooks__invoke_record_start();
1873 	for (;;) {
1874 		unsigned long long hits = rec->samples;
1875 
1876 		/*
1877 		 * rec->evlist->bkw_mmap_state can be
1878 		 * BKW_MMAP_EMPTY here: when done == true and
1879 		 * hits != rec->samples in the previous round.
1880 		 *
1881 		 * evlist__toggle_bkw_mmap() ensures we never
1882 		 * convert BKW_MMAP_EMPTY to BKW_MMAP_DATA_PENDING.
1883 		 */
1884 		if (trigger_is_hit(&switch_output_trigger) || done || draining)
1885 			evlist__toggle_bkw_mmap(rec->evlist, BKW_MMAP_DATA_PENDING);
1886 
1887 		if (record__mmap_read_all(rec, false) < 0) {
1888 			trigger_error(&auxtrace_snapshot_trigger);
1889 			trigger_error(&switch_output_trigger);
1890 			err = -1;
1891 			goto out_child;
1892 		}
1893 
1894 		if (auxtrace_record__snapshot_started) {
1895 			auxtrace_record__snapshot_started = 0;
1896 			if (!trigger_is_error(&auxtrace_snapshot_trigger))
1897 				record__read_auxtrace_snapshot(rec, false);
1898 			if (trigger_is_error(&auxtrace_snapshot_trigger)) {
1899 				pr_err("AUX area tracing snapshot failed\n");
1900 				err = -1;
1901 				goto out_child;
1902 			}
1903 		}
1904 
1905 		if (trigger_is_hit(&switch_output_trigger)) {
1906 			/*
1907 			 * If switch_output_trigger is hit, the data in the
1908 			 * overwritable ring buffer should have been collected,
1909 			 * so bkw_mmap_state should be set to BKW_MMAP_EMPTY.
1910 			 *
1911 			 * If SIGUSR2 was raised after or during record__mmap_read_all(),
1912 			 * record__mmap_read_all() didn't collect data from the
1913 			 * overwritable ring buffer. Read again.
1914 			 */
1915 			if (rec->evlist->bkw_mmap_state == BKW_MMAP_RUNNING)
1916 				continue;
1917 			trigger_ready(&switch_output_trigger);
1918 
1919 			/*
1920 			 * Re-enable events in the overwrite ring buffer after
1921 			 * record__mmap_read_all(): we should have collected
1922 			 * data from it.
1923 			 */
1924 			evlist__toggle_bkw_mmap(rec->evlist, BKW_MMAP_RUNNING);
1925 
1926 			if (!quiet)
1927 				fprintf(stderr, "[ perf record: dump data: Woken up %ld times ]\n",
1928 					waking);
1929 			waking = 0;
1930 			fd = record__switch_output(rec, false);
1931 			if (fd < 0) {
1932 				pr_err("Failed to switch to new file\n");
1933 				trigger_error(&switch_output_trigger);
1934 				err = fd;
1935 				goto out_child;
1936 			}
1937 
1938 			/* re-arm the alarm */
1939 			if (rec->switch_output.time)
1940 				alarm(rec->switch_output.time);
1941 		}
1942 
1943 		if (hits == rec->samples) {
1944 			if (done || draining)
1945 				break;
1946 			err = evlist__poll(rec->evlist, -1);
1947 			/*
1948 			 * Propagate the error only if there is one. Ignore a
1949 			 * positive number of returned events and interrupt errors.
1950 			 */
1951 			if (err > 0 || (err < 0 && errno == EINTR))
1952 				err = 0;
1953 			waking++;
1954 
1955 			if (evlist__filter_pollfd(rec->evlist, POLLERR | POLLHUP) == 0)
1956 				draining = true;
1957 		}
1958 
1959 		if (evlist__ctlfd_process(rec->evlist, &cmd) > 0) {
1960 			switch (cmd) {
1961 			case EVLIST_CTL_CMD_SNAPSHOT:
1962 				hit_auxtrace_snapshot_trigger(rec);
1963 				evlist__ctlfd_ack(rec->evlist);
1964 				break;
1965 			case EVLIST_CTL_CMD_STOP:
1966 				done = 1;
1967 				break;
1968 			case EVLIST_CTL_CMD_ACK:
1969 			case EVLIST_CTL_CMD_UNSUPPORTED:
1970 			case EVLIST_CTL_CMD_ENABLE:
1971 			case EVLIST_CTL_CMD_DISABLE:
1972 			case EVLIST_CTL_CMD_EVLIST:
1973 			case EVLIST_CTL_CMD_PING:
1974 			default:
1975 				break;
1976 			}
1977 		}
1978 
1979 		/*
1980 		 * When perf is starting the traced process, the events die
1981 		 * with the process at the end and we wait for that, so there
1982 		 * is no need to disable them in this case.
1983 		 */
1984 		if (done && !disabled && !target__none(&opts->target)) {
1985 			trigger_off(&auxtrace_snapshot_trigger);
1986 			evlist__disable(rec->evlist);
1987 			disabled = true;
1988 		}
1989 	}
1990 
1991 	trigger_off(&auxtrace_snapshot_trigger);
1992 	trigger_off(&switch_output_trigger);
1993 
1994 	if (opts->auxtrace_snapshot_on_exit)
1995 		record__auxtrace_snapshot_exit(rec);
1996 
1997 	if (forks && workload_exec_errno) {
1998 		char msg[STRERR_BUFSIZE], strevsels[2048];
1999 		const char *emsg = str_error_r(workload_exec_errno, msg, sizeof(msg));
2000 
2001 		evlist__scnprintf_evsels(rec->evlist, sizeof(strevsels), strevsels);
2002 
2003 		pr_err("Failed to collect '%s' for the '%s' workload: %s\n",
2004 			strevsels, argv[0], emsg);
2005 		err = -1;
2006 		goto out_child;
2007 	}
2008 
2009 	if (!quiet)
2010 		fprintf(stderr, "[ perf record: Woken up %ld times to write data ]\n", waking);
2011 
2012 	if (target__none(&rec->opts.target))
2013 		record__synthesize_workload(rec, true);
2014 
2015 out_child:
2016 	evlist__finalize_ctlfd(rec->evlist);
2017 	record__mmap_read_all(rec, true);
2018 	record__aio_mmap_read_sync(rec);
2019 
2020 	if (rec->session->bytes_transferred && rec->session->bytes_compressed) {
2021 		ratio = (float)rec->session->bytes_transferred/(float)rec->session->bytes_compressed;
2022 		session->header.env.comp_ratio = ratio + 0.5;
2023 	}
2024 
2025 	if (forks) {
2026 		int exit_status;
2027 
2028 		if (!child_finished)
2029 			kill(rec->evlist->workload.pid, SIGTERM);
2030 
2031 		wait(&exit_status);
2032 
2033 		if (err < 0)
2034 			status = err;
2035 		else if (WIFEXITED(exit_status))
2036 			status = WEXITSTATUS(exit_status);
2037 		else if (WIFSIGNALED(exit_status))
2038 			signr = WTERMSIG(exit_status);
2039 	} else
2040 		status = err;
2041 
2042 	record__synthesize(rec, true);
2043 	/* this will be recalculated during process_buildids() */
2044 	rec->samples = 0;
2045 
2046 	if (!err) {
2047 		if (!rec->timestamp_filename) {
2048 			record__finish_output(rec);
2049 		} else {
2050 			fd = record__switch_output(rec, true);
2051 			if (fd < 0) {
2052 				status = fd;
2053 				goto out_delete_session;
2054 			}
2055 		}
2056 	}
2057 
2058 	perf_hooks__invoke_record_end();
2059 
2060 	if (!err && !quiet) {
2061 		char samples[128];
2062 		const char *postfix = rec->timestamp_filename ?
2063 					".<timestamp>" : "";
2064 
2065 		if (rec->samples && !rec->opts.full_auxtrace)
2066 			scnprintf(samples, sizeof(samples),
2067 				  " (%" PRIu64 " samples)", rec->samples);
2068 		else
2069 			samples[0] = '\0';
2070 
2071 		fprintf(stderr,	"[ perf record: Captured and wrote %.3f MB %s%s%s",
2072 			perf_data__size(data) / 1024.0 / 1024.0,
2073 			data->path, postfix, samples);
2074 		if (ratio) {
2075 			fprintf(stderr,	", compressed (original %.3f MB, ratio is %.3f)",
2076 					rec->session->bytes_transferred / 1024.0 / 1024.0,
2077 					ratio);
2078 		}
2079 		fprintf(stderr, " ]\n");
2080 	}
2081 
2082 out_delete_session:
2083 #ifdef HAVE_EVENTFD_SUPPORT
2084 	if (done_fd >= 0)
2085 		close(done_fd);
2086 #endif
2087 	zstd_fini(&session->zstd_data);
2088 	perf_session__delete(session);
2089 
2090 	if (!opts->no_bpf_event)
2091 		evlist__stop_sb_thread(rec->sb_evlist);
2092 	return status;
2093 }
2094 
2095 static void callchain_debug(struct callchain_param *callchain)
2096 {
2097 	static const char *str[CALLCHAIN_MAX] = { "NONE", "FP", "DWARF", "LBR" };
2098 
2099 	pr_debug("callchain: type %s\n", str[callchain->record_mode]);
2100 
2101 	if (callchain->record_mode == CALLCHAIN_DWARF)
2102 		pr_debug("callchain: stack dump size %d\n",
2103 			 callchain->dump_size);
2104 }
2105 
2106 int record_opts__parse_callchain(struct record_opts *record,
2107 				 struct callchain_param *callchain,
2108 				 const char *arg, bool unset)
2109 {
2110 	int ret;
2111 	callchain->enabled = !unset;
2112 
2113 	/* --no-call-graph */
2114 	if (unset) {
2115 		callchain->record_mode = CALLCHAIN_NONE;
2116 		pr_debug("callchain: disabled\n");
2117 		return 0;
2118 	}
2119 
2120 	ret = parse_callchain_record_opt(arg, callchain);
2121 	if (!ret) {
2122 		/* Enable data address sampling for DWARF unwind. */
2123 		if (callchain->record_mode == CALLCHAIN_DWARF)
2124 			record->sample_address = true;
2125 		callchain_debug(callchain);
2126 	}
2127 
2128 	return ret;
2129 }
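
/*
 * Illustrative example (assumed command line): 'perf record --call-graph dwarf,8192'
 * selects CALLCHAIN_DWARF with an 8kB stack dump size and, per the code above,
 * also turns on data address sampling (record->sample_address).
 */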
2130 
2131 int record_parse_callchain_opt(const struct option *opt,
2132 			       const char *arg,
2133 			       int unset)
2134 {
2135 	return record_opts__parse_callchain(opt->value, &callchain_param, arg, unset);
2136 }
2137 
2138 int record_callchain_opt(const struct option *opt,
2139 			 const char *arg __maybe_unused,
2140 			 int unset __maybe_unused)
2141 {
2142 	struct callchain_param *callchain = opt->value;
2143 
2144 	callchain->enabled = true;
2145 
2146 	if (callchain->record_mode == CALLCHAIN_NONE)
2147 		callchain->record_mode = CALLCHAIN_FP;
2148 
2149 	callchain_debug(callchain);
2150 	return 0;
2151 }
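
/*
 * Illustrative: a bare '-g' is handled by record_callchain_opt() and falls
 * back to frame-pointer (CALLCHAIN_FP) unwinding unless a record mode was
 * already configured, e.g. via the call-graph.record-mode config key.
 */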
2152 
2153 static int perf_record_config(const char *var, const char *value, void *cb)
2154 {
2155 	struct record *rec = cb;
2156 
2157 	if (!strcmp(var, "record.build-id")) {
2158 		if (!strcmp(value, "cache"))
2159 			rec->no_buildid_cache = false;
2160 		else if (!strcmp(value, "no-cache"))
2161 			rec->no_buildid_cache = true;
2162 		else if (!strcmp(value, "skip"))
2163 			rec->no_buildid = true;
2164 		else if (!strcmp(value, "mmap"))
2165 			rec->buildid_mmap = true;
2166 		else
2167 			return -1;
2168 		return 0;
2169 	}
2170 	if (!strcmp(var, "record.call-graph")) {
2171 		var = "call-graph.record-mode";
2172 		return perf_default_config(var, value, cb);
2173 	}
2174 #ifdef HAVE_AIO_SUPPORT
2175 	if (!strcmp(var, "record.aio")) {
2176 		rec->opts.nr_cblocks = strtol(value, NULL, 0);
2177 		if (!rec->opts.nr_cblocks)
2178 			rec->opts.nr_cblocks = nr_cblocks_default;
2179 	}
2180 #endif
2181 	if (!strcmp(var, "record.debuginfod")) {
2182 		rec->debuginfod.urls = strdup(value);
2183 		if (!rec->debuginfod.urls)
2184 			return -ENOMEM;
2185 		rec->debuginfod.set = true;
2186 	}
2187 
2188 	return 0;
2189 }
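
/*
 * Example ~/.perfconfig snippet handled by perf_record_config() above
 * (values are illustrative):
 *
 *	[record]
 *		build-id = cache
 *		call-graph = dwarf
 *		aio = 2
 *		debuginfod = https://debuginfod.example.org
 */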
2190 
2191 
2192 static int record__parse_affinity(const struct option *opt, const char *str, int unset)
2193 {
2194 	struct record_opts *opts = (struct record_opts *)opt->value;
2195 
2196 	if (unset || !str)
2197 		return 0;
2198 
2199 	if (!strcasecmp(str, "node"))
2200 		opts->affinity = PERF_AFFINITY_NODE;
2201 	else if (!strcasecmp(str, "cpu"))
2202 		opts->affinity = PERF_AFFINITY_CPU;
2203 
2204 	return 0;
2205 }
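
/*
 * Illustrative: 'perf record --affinity=node ...' selects PERF_AFFINITY_NODE,
 * i.e. the trace reading thread's affinity mask follows the NUMA node of the
 * mmap buffer being processed; the default remains PERF_AFFINITY_SYS (set in
 * cmd_record() below).
 */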
2206 
2207 static int parse_output_max_size(const struct option *opt,
2208 				 const char *str, int unset)
2209 {
2210 	unsigned long *s = (unsigned long *)opt->value;
2211 	static struct parse_tag tags_size[] = {
2212 		{ .tag  = 'B', .mult = 1       },
2213 		{ .tag  = 'K', .mult = 1 << 10 },
2214 		{ .tag  = 'M', .mult = 1 << 20 },
2215 		{ .tag  = 'G', .mult = 1 << 30 },
2216 		{ .tag  = 0 },
2217 	};
2218 	unsigned long val;
2219 
2220 	if (unset) {
2221 		*s = 0;
2222 		return 0;
2223 	}
2224 
2225 	val = parse_tag_value(str, tags_size);
2226 	if (val != (unsigned long) -1) {
2227 		*s = val;
2228 		return 0;
2229 	}
2230 
2231 	return -1;
2232 }
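
/*
 * Illustrative: 'perf record --max-size=200M ...' parses to 200 * 2^20 bytes
 * via the tags_size table above and stores the result in
 * record.output_max_size, limiting the maximum size of the output file.
 */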
2233 
2234 static int record__parse_mmap_pages(const struct option *opt,
2235 				    const char *str,
2236 				    int unset __maybe_unused)
2237 {
2238 	struct record_opts *opts = opt->value;
2239 	char *s, *p;
2240 	unsigned int mmap_pages;
2241 	int ret;
2242 
2243 	if (!str)
2244 		return -EINVAL;
2245 
2246 	s = strdup(str);
2247 	if (!s)
2248 		return -ENOMEM;
2249 
2250 	p = strchr(s, ',');
2251 	if (p)
2252 		*p = '\0';
2253 
2254 	if (*s) {
2255 		ret = __evlist__parse_mmap_pages(&mmap_pages, s);
2256 		if (ret)
2257 			goto out_free;
2258 		opts->mmap_pages = mmap_pages;
2259 	}
2260 
2261 	if (!p) {
2262 		ret = 0;
2263 		goto out_free;
2264 	}
2265 
2266 	ret = __evlist__parse_mmap_pages(&mmap_pages, p + 1);
2267 	if (ret)
2268 		goto out_free;
2269 
2270 	opts->auxtrace_mmap_pages = mmap_pages;
2271 
2272 out_free:
2273 	free(s);
2274 	return ret;
2275 }
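
/*
 * Illustrative: '-m 512,128' requests 512 mmap data pages and 128 AUX area
 * tracing mmap pages, while '-m 256' sets only the data pages and leaves
 * auxtrace_mmap_pages untouched.
 */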
2276 
2277 void __weak arch__add_leaf_frame_record_opts(struct record_opts *opts __maybe_unused)
2278 {
2279 }
2280 
2281 static int parse_control_option(const struct option *opt,
2282 				const char *str,
2283 				int unset __maybe_unused)
2284 {
2285 	struct record_opts *opts = opt->value;
2286 
2287 	return evlist__parse_control(str, &opts->ctl_fd, &opts->ctl_fd_ack, &opts->ctl_fd_close);
2288 }
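
/*
 * Illustrative: '--control fifo:ctl.fifo,ack.fifo' (or 'fd:10,11' for already
 * open descriptors) is parsed by evlist__parse_control() into ctl_fd/ctl_fd_ack;
 * commands such as 'enable', 'disable' and 'snapshot' are then handled in the
 * main loop via evlist__ctlfd_process().
 */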
2289 
2290 static void switch_output_size_warn(struct record *rec)
2291 {
2292 	u64 wakeup_size = evlist__mmap_size(rec->opts.mmap_pages);
2293 	struct switch_output *s = &rec->switch_output;
2294 
2295 	wakeup_size /= 2;
2296 
2297 	if (s->size < wakeup_size) {
2298 		char buf[100];
2299 
2300 		unit_number__scnprintf(buf, sizeof(buf), wakeup_size);
2301 		pr_warning("WARNING: switch-output data size lower than "
2302 			   "wakeup kernel buffer size (%s), "
2303 			   "expect bigger perf.data sizes\n", buf);
2304 	}
2305 }
2306 
2307 static int switch_output_setup(struct record *rec)
2308 {
2309 	struct switch_output *s = &rec->switch_output;
2310 	static struct parse_tag tags_size[] = {
2311 		{ .tag  = 'B', .mult = 1       },
2312 		{ .tag  = 'K', .mult = 1 << 10 },
2313 		{ .tag  = 'M', .mult = 1 << 20 },
2314 		{ .tag  = 'G', .mult = 1 << 30 },
2315 		{ .tag  = 0 },
2316 	};
2317 	static struct parse_tag tags_time[] = {
2318 		{ .tag  = 's', .mult = 1        },
2319 		{ .tag  = 'm', .mult = 60       },
2320 		{ .tag  = 'h', .mult = 60*60    },
2321 		{ .tag  = 'd', .mult = 60*60*24 },
2322 		{ .tag  = 0 },
2323 	};
2324 	unsigned long val;
2325 
2326 	/*
2327 	 * If we're using --switch-output-events, then we imply
2328 	 * --switch-output=signal, as we'll send a SIGUSR2 from the sideband
2329 	 * thread to its parent.
2330 	 */
2331 	if (rec->switch_output_event_set)
2332 		goto do_signal;
2333 
2334 	if (!s->set)
2335 		return 0;
2336 
2337 	if (!strcmp(s->str, "signal")) {
2338 do_signal:
2339 		s->signal = true;
2340 		pr_debug("switch-output with SIGUSR2 signal\n");
2341 		goto enabled;
2342 	}
2343 
2344 	val = parse_tag_value(s->str, tags_size);
2345 	if (val != (unsigned long) -1) {
2346 		s->size = val;
2347 		pr_debug("switch-output with %s size threshold\n", s->str);
2348 		goto enabled;
2349 	}
2350 
2351 	val = parse_tag_value(s->str, tags_time);
2352 	if (val != (unsigned long) -1) {
2353 		s->time = val;
2354 		pr_debug("switch-output with %s time threshold (%lu seconds)\n",
2355 			 s->str, s->time);
2356 		goto enabled;
2357 	}
2358 
2359 	return -1;
2360 
2361 enabled:
2362 	rec->timestamp_filename = true;
2363 	s->enabled              = true;
2364 
2365 	if (s->size && !rec->opts.no_buffering)
2366 		switch_output_size_warn(rec);
2367 
2368 	return 0;
2369 }
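
/*
 * Illustrative examples for the parsing above: '--switch-output=signal'
 * rotates the output on SIGUSR2, '--switch-output=10M' rotates after roughly
 * 10MB of data, and '--switch-output=30s' rotates every 30 seconds via the
 * alarm() re-armed in the main loop; all variants imply --timestamp-filename.
 */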
2370 
2371 static const char * const __record_usage[] = {
2372 	"perf record [<options>] [<command>]",
2373 	"perf record [<options>] -- <command> [<options>]",
2374 	NULL
2375 };
2376 const char * const *record_usage = __record_usage;
2377 
2378 static int build_id__process_mmap(struct perf_tool *tool, union perf_event *event,
2379 				  struct perf_sample *sample, struct machine *machine)
2380 {
2381 	/*
2382 	 * We already have the kernel maps, put in place via perf_session__create_kernel_maps(),
2383 	 * so there is no need to add them twice.
2384 	 */
2385 	if (!(event->header.misc & PERF_RECORD_MISC_USER))
2386 		return 0;
2387 	return perf_event__process_mmap(tool, event, sample, machine);
2388 }
2389 
2390 static int build_id__process_mmap2(struct perf_tool *tool, union perf_event *event,
2391 				   struct perf_sample *sample, struct machine *machine)
2392 {
2393 	/*
2394 	 * We already have the kernel maps, put in place via perf_session__create_kernel_maps(),
2395 	 * so there is no need to add them twice.
2396 	 */
2397 	if (!(event->header.misc & PERF_RECORD_MISC_USER))
2398 		return 0;
2399 
2400 	return perf_event__process_mmap2(tool, event, sample, machine);
2401 }
2402 
2403 static int process_timestamp_boundary(struct perf_tool *tool,
2404 				      union perf_event *event __maybe_unused,
2405 				      struct perf_sample *sample,
2406 				      struct machine *machine __maybe_unused)
2407 {
2408 	struct record *rec = container_of(tool, struct record, tool);
2409 
2410 	set_timestamp_boundary(rec, sample->time);
2411 	return 0;
2412 }
2413 
2414 static int parse_record_synth_option(const struct option *opt,
2415 				     const char *str,
2416 				     int unset __maybe_unused)
2417 {
2418 	struct record_opts *opts = opt->value;
2419 	char *p = strdup(str);
2420 
2421 	if (p == NULL)
2422 		return -1;
2423 
2424 	opts->synth = parse_synth_opt(p);
2425 	free(p);
2426 
2427 	if (opts->synth < 0) {
2428 		pr_err("Invalid synth option: %s\n", str);
2429 		return -1;
2430 	}
2431 	return 0;
2432 }
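
/*
 * Illustrative (semantics assumed from the --synth option help below):
 * '--synth=no' skips event synthesis entirely, while e.g. '--synth=task'
 * restricts it to task events; parse_synth_opt() turns the string into the
 * opts->synth mask (default: PERF_SYNTH_ALL).
 */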
2433 
2434 /*
2435  * XXX Ideally this would be local to cmd_record() and passed to a record__new(),
2436  * because we need access to it in record__exit(), which is called after
2437  * cmd_record() exits, but since record_options needs to be accessible to
2438  * builtin-script, leave it here.
2439  *
2440  * At least we don't touch it in all the other functions here directly.
2441  *
2442  * Just say no to tons of global variables, sigh.
2443  */
2444 static struct record record = {
2445 	.opts = {
2446 		.sample_time	     = true,
2447 		.mmap_pages	     = UINT_MAX,
2448 		.user_freq	     = UINT_MAX,
2449 		.user_interval	     = ULLONG_MAX,
2450 		.freq		     = 4000,
2451 		.target		     = {
2452 			.uses_mmap   = true,
2453 			.default_per_cpu = true,
2454 		},
2455 		.mmap_flush          = MMAP_FLUSH_DEFAULT,
2456 		.nr_threads_synthesize = 1,
2457 		.ctl_fd              = -1,
2458 		.ctl_fd_ack          = -1,
2459 		.synth               = PERF_SYNTH_ALL,
2460 	},
2461 	.tool = {
2462 		.sample		= process_sample_event,
2463 		.fork		= perf_event__process_fork,
2464 		.exit		= perf_event__process_exit,
2465 		.comm		= perf_event__process_comm,
2466 		.namespaces	= perf_event__process_namespaces,
2467 		.mmap		= build_id__process_mmap,
2468 		.mmap2		= build_id__process_mmap2,
2469 		.itrace_start	= process_timestamp_boundary,
2470 		.aux		= process_timestamp_boundary,
2471 		.ordered_events	= true,
2472 	},
2473 };
2474 
2475 const char record_callchain_help[] = CALLCHAIN_RECORD_HELP
2476 	"\n\t\t\t\tDefault: fp";
2477 
2478 static bool dry_run;
2479 
2480 /*
2481  * XXX Will stay a global variable until we fix builtin-script.c to stop messing
2482  * with it and switch to using the library functions in perf_evlist that came
2483  * from builtin-record.c, i.e. use record_opts,
2484  * evlist__prepare_workload, etc. instead of fork+exec'ing 'perf record'
2485  * using pipes, etc.
2486  */
2487 static struct option __record_options[] = {
2488 	OPT_CALLBACK('e', "event", &record.evlist, "event",
2489 		     "event selector. use 'perf list' to list available events",
2490 		     parse_events_option),
2491 	OPT_CALLBACK(0, "filter", &record.evlist, "filter",
2492 		     "event filter", parse_filter),
2493 	OPT_CALLBACK_NOOPT(0, "exclude-perf", &record.evlist,
2494 			   NULL, "don't record events from perf itself",
2495 			   exclude_perf),
2496 	OPT_STRING('p', "pid", &record.opts.target.pid, "pid",
2497 		    "record events on existing process id"),
2498 	OPT_STRING('t', "tid", &record.opts.target.tid, "tid",
2499 		    "record events on existing thread id"),
2500 	OPT_INTEGER('r', "realtime", &record.realtime_prio,
2501 		    "collect data with this RT SCHED_FIFO priority"),
2502 	OPT_BOOLEAN(0, "no-buffering", &record.opts.no_buffering,
2503 		    "collect data without buffering"),
2504 	OPT_BOOLEAN('R', "raw-samples", &record.opts.raw_samples,
2505 		    "collect raw sample records from all opened counters"),
2506 	OPT_BOOLEAN('a', "all-cpus", &record.opts.target.system_wide,
2507 			    "system-wide collection from all CPUs"),
2508 	OPT_STRING('C', "cpu", &record.opts.target.cpu_list, "cpu",
2509 		    "list of cpus to monitor"),
2510 	OPT_U64('c', "count", &record.opts.user_interval, "event period to sample"),
2511 	OPT_STRING('o', "output", &record.data.path, "file",
2512 		    "output file name"),
2513 	OPT_BOOLEAN_SET('i', "no-inherit", &record.opts.no_inherit,
2514 			&record.opts.no_inherit_set,
2515 			"child tasks do not inherit counters"),
2516 	OPT_BOOLEAN(0, "tail-synthesize", &record.opts.tail_synthesize,
2517 		    "synthesize non-sample events at the end of output"),
2518 	OPT_BOOLEAN(0, "overwrite", &record.opts.overwrite, "use overwrite mode"),
2519 	OPT_BOOLEAN(0, "no-bpf-event", &record.opts.no_bpf_event, "do not record bpf events"),
2520 	OPT_BOOLEAN(0, "strict-freq", &record.opts.strict_freq,
2521 		    "Fail if the specified frequency can't be used"),
2522 	OPT_CALLBACK('F', "freq", &record.opts, "freq or 'max'",
2523 		     "profile at this frequency",
2524 		      record__parse_freq),
2525 	OPT_CALLBACK('m', "mmap-pages", &record.opts, "pages[,pages]",
2526 		     "number of mmap data pages and AUX area tracing mmap pages",
2527 		     record__parse_mmap_pages),
2528 	OPT_CALLBACK(0, "mmap-flush", &record.opts, "number",
2529 		     "Minimum number of bytes extracted from mmap data pages (default: 1)",
2530 		     record__mmap_flush_parse),
2531 	OPT_BOOLEAN(0, "group", &record.opts.group,
2532 		    "put the counters into a counter group"),
2533 	OPT_CALLBACK_NOOPT('g', NULL, &callchain_param,
2534 			   NULL, "enables call-graph recording" ,
2535 			   &record_callchain_opt),
2536 	OPT_CALLBACK(0, "call-graph", &record.opts,
2537 		     "record_mode[,record_size]", record_callchain_help,
2538 		     &record_parse_callchain_opt),
2539 	OPT_INCR('v', "verbose", &verbose,
2540 		    "be more verbose (show counter open errors, etc)"),
2541 	OPT_BOOLEAN('q', "quiet", &quiet, "don't print any message"),
2542 	OPT_BOOLEAN('s', "stat", &record.opts.inherit_stat,
2543 		    "per thread counts"),
2544 	OPT_BOOLEAN('d', "data", &record.opts.sample_address, "Record the sample addresses"),
2545 	OPT_BOOLEAN(0, "phys-data", &record.opts.sample_phys_addr,
2546 		    "Record the sample physical addresses"),
2547 	OPT_BOOLEAN(0, "data-page-size", &record.opts.sample_data_page_size,
2548 		    "Record the sampled data address data page size"),
2549 	OPT_BOOLEAN(0, "code-page-size", &record.opts.sample_code_page_size,
2550 		    "Record the sampled code address (ip) page size"),
2551 	OPT_BOOLEAN(0, "sample-cpu", &record.opts.sample_cpu, "Record the sample cpu"),
2552 	OPT_BOOLEAN_SET('T', "timestamp", &record.opts.sample_time,
2553 			&record.opts.sample_time_set,
2554 			"Record the sample timestamps"),
2555 	OPT_BOOLEAN_SET('P', "period", &record.opts.period, &record.opts.period_set,
2556 			"Record the sample period"),
2557 	OPT_BOOLEAN('n', "no-samples", &record.opts.no_samples,
2558 		    "don't sample"),
2559 	OPT_BOOLEAN_SET('N', "no-buildid-cache", &record.no_buildid_cache,
2560 			&record.no_buildid_cache_set,
2561 			"do not update the buildid cache"),
2562 	OPT_BOOLEAN_SET('B', "no-buildid", &record.no_buildid,
2563 			&record.no_buildid_set,
2564 			"do not collect buildids in perf.data"),
2565 	OPT_CALLBACK('G', "cgroup", &record.evlist, "name",
2566 		     "monitor event in cgroup name only",
2567 		     parse_cgroups),
2568 	OPT_INTEGER('D', "delay", &record.opts.initial_delay,
2569 		  "ms to wait before starting measurement after program start (-1: start with events disabled)"),
2570 	OPT_BOOLEAN(0, "kcore", &record.opts.kcore, "copy /proc/kcore"),
2571 	OPT_STRING('u', "uid", &record.opts.target.uid_str, "user",
2572 		   "user to profile"),
2573 
2574 	OPT_CALLBACK_NOOPT('b', "branch-any", &record.opts.branch_stack,
2575 		     "branch any", "sample any taken branches",
2576 		     parse_branch_stack),
2577 
2578 	OPT_CALLBACK('j', "branch-filter", &record.opts.branch_stack,
2579 		     "branch filter mask", "branch stack filter modes",
2580 		     parse_branch_stack),
2581 	OPT_BOOLEAN('W', "weight", &record.opts.sample_weight,
2582 		    "sample by weight (on special events only)"),
2583 	OPT_BOOLEAN(0, "transaction", &record.opts.sample_transaction,
2584 		    "sample transaction flags (special events only)"),
2585 	OPT_BOOLEAN(0, "per-thread", &record.opts.target.per_thread,
2586 		    "use per-thread mmaps"),
2587 	OPT_CALLBACK_OPTARG('I', "intr-regs", &record.opts.sample_intr_regs, NULL, "any register",
2588 		    "sample selected machine registers on interrupt,"
2589 		    " use '-I?' to list register names", parse_intr_regs),
2590 	OPT_CALLBACK_OPTARG(0, "user-regs", &record.opts.sample_user_regs, NULL, "any register",
2591 		    "sample selected machine registers on interrupt,"
2592 		    " use '--user-regs=?' to list register names", parse_user_regs),
2593 	OPT_BOOLEAN(0, "running-time", &record.opts.running_time,
2594 		    "Record running/enabled time of read (:S) events"),
2595 	OPT_CALLBACK('k', "clockid", &record.opts,
2596 	"clockid", "clockid to use for events, see clock_gettime()",
2597 	parse_clockid),
2598 	OPT_STRING_OPTARG('S', "snapshot", &record.opts.auxtrace_snapshot_opts,
2599 			  "opts", "AUX area tracing Snapshot Mode", ""),
2600 	OPT_STRING_OPTARG(0, "aux-sample", &record.opts.auxtrace_sample_opts,
2601 			  "opts", "sample AUX area", ""),
2602 	OPT_UINTEGER(0, "proc-map-timeout", &proc_map_timeout,
2603 			"per thread proc mmap processing timeout in ms"),
2604 	OPT_BOOLEAN(0, "namespaces", &record.opts.record_namespaces,
2605 		    "Record namespaces events"),
2606 	OPT_BOOLEAN(0, "all-cgroups", &record.opts.record_cgroup,
2607 		    "Record cgroup events"),
2608 	OPT_BOOLEAN_SET(0, "switch-events", &record.opts.record_switch_events,
2609 			&record.opts.record_switch_events_set,
2610 			"Record context switch events"),
2611 	OPT_BOOLEAN_FLAG(0, "all-kernel", &record.opts.all_kernel,
2612 			 "Configure all used events to run in kernel space.",
2613 			 PARSE_OPT_EXCLUSIVE),
2614 	OPT_BOOLEAN_FLAG(0, "all-user", &record.opts.all_user,
2615 			 "Configure all used events to run in user space.",
2616 			 PARSE_OPT_EXCLUSIVE),
2617 	OPT_BOOLEAN(0, "kernel-callchains", &record.opts.kernel_callchains,
2618 		    "collect kernel callchains"),
2619 	OPT_BOOLEAN(0, "user-callchains", &record.opts.user_callchains,
2620 		    "collect user callchains"),
2621 	OPT_STRING(0, "clang-path", &llvm_param.clang_path, "clang path",
2622 		   "clang binary to use for compiling BPF scriptlets"),
2623 	OPT_STRING(0, "clang-opt", &llvm_param.clang_opt, "clang options",
2624 		   "options passed to clang when compiling BPF scriptlets"),
2625 	OPT_STRING(0, "vmlinux", &symbol_conf.vmlinux_name,
2626 		   "file", "vmlinux pathname"),
2627 	OPT_BOOLEAN(0, "buildid-all", &record.buildid_all,
2628 		    "Record build-id of all DSOs regardless of hits"),
2629 	OPT_BOOLEAN(0, "buildid-mmap", &record.buildid_mmap,
2630 		    "Record build-id in map events"),
2631 	OPT_BOOLEAN(0, "timestamp-filename", &record.timestamp_filename,
2632 		    "append timestamp to output filename"),
2633 	OPT_BOOLEAN(0, "timestamp-boundary", &record.timestamp_boundary,
2634 		    "Record timestamp boundary (time of first/last samples)"),
2635 	OPT_STRING_OPTARG_SET(0, "switch-output", &record.switch_output.str,
2636 			  &record.switch_output.set, "signal or size[BKMG] or time[smhd]",
2637 			  "Switch output when receiving SIGUSR2 (signal) or cross a size or time threshold",
2638 			  "signal"),
2639 	OPT_CALLBACK_SET(0, "switch-output-event", &record.sb_evlist, &record.switch_output_event_set, "switch output event",
2640 			 "switch output event selector. use 'perf list' to list available events",
2641 			 parse_events_option_new_evlist),
2642 	OPT_INTEGER(0, "switch-max-files", &record.switch_output.num_files,
2643 		   "Limit number of switch output generated files"),
2644 	OPT_BOOLEAN(0, "dry-run", &dry_run,
2645 		    "Parse options then exit"),
2646 #ifdef HAVE_AIO_SUPPORT
2647 	OPT_CALLBACK_OPTARG(0, "aio", &record.opts,
2648 		     &nr_cblocks_default, "n", "Use <n> control blocks in asynchronous trace writing mode (default: 1, max: 4)",
2649 		     record__aio_parse),
2650 #endif
2651 	OPT_CALLBACK(0, "affinity", &record.opts, "node|cpu",
2652 		     "Set affinity mask of trace reading thread to NUMA node cpu mask or cpu of processed mmap buffer",
2653 		     record__parse_affinity),
2654 #ifdef HAVE_ZSTD_SUPPORT
2655 	OPT_CALLBACK_OPTARG('z', "compression-level", &record.opts, &comp_level_default,
2656 			    "n", "Compress records using the specified level (default: 1 - fastest compression, 22 - greatest compression)",
2657 			    record__parse_comp_level),
2658 #endif
2659 	OPT_CALLBACK(0, "max-size", &record.output_max_size,
2660 		     "size", "Limit the maximum size of the output file", parse_output_max_size),
2661 	OPT_UINTEGER(0, "num-thread-synthesize",
2662 		     &record.opts.nr_threads_synthesize,
2663 		     "number of threads to run for event synthesis"),
2664 #ifdef HAVE_LIBPFM
2665 	OPT_CALLBACK(0, "pfm-events", &record.evlist, "event",
2666 		"libpfm4 event selector. use 'perf list' to list available events",
2667 		parse_libpfm_events_option),
2668 #endif
2669 	OPT_CALLBACK(0, "control", &record.opts, "fd:ctl-fd[,ack-fd] or fifo:ctl-fifo[,ack-fifo]",
2670 		     "Listen on ctl-fd descriptor for command to control measurement ('enable': enable events, 'disable': disable events,\n"
2671 		     "\t\t\t  'snapshot': AUX area tracing snapshot).\n"
2672 		     "\t\t\t  Optionally send control command completion ('ack\\n') to ack-fd descriptor.\n"
2673 		     "\t\t\t  Alternatively, ctl-fifo / ack-fifo will be opened and used as ctl-fd / ack-fd.",
2674 		      parse_control_option),
2675 	OPT_CALLBACK(0, "synth", &record.opts, "no|all|task|mmap|cgroup",
2676 		     "Fine-tune event synthesis: default=all", parse_record_synth_option),
2677 	OPT_STRING_OPTARG_SET(0, "debuginfod", &record.debuginfod.urls,
2678 			  &record.debuginfod.set, "debuginfod urls",
2679 			  "Enable debuginfod data retrieval from DEBUGINFOD_URLS or specified urls",
2680 			  "system"),
2681 	OPT_END()
2682 };
2683 
2684 struct option *record_options = __record_options;
2685 
2686 int cmd_record(int argc, const char **argv)
2687 {
2688 	int err;
2689 	struct record *rec = &record;
2690 	char errbuf[BUFSIZ];
2691 
2692 	setlocale(LC_ALL, "");
2693 
2694 #ifndef HAVE_LIBBPF_SUPPORT
2695 # define set_nobuild(s, l, c) set_option_nobuild(record_options, s, l, "NO_LIBBPF=1", c)
2696 	set_nobuild('\0', "clang-path", true);
2697 	set_nobuild('\0', "clang-opt", true);
2698 # undef set_nobuild
2699 #endif
2700 
2701 #ifndef HAVE_BPF_PROLOGUE
2702 # if !defined (HAVE_DWARF_SUPPORT)
2703 #  define REASON  "NO_DWARF=1"
2704 # elif !defined (HAVE_LIBBPF_SUPPORT)
2705 #  define REASON  "NO_LIBBPF=1"
2706 # else
2707 #  define REASON  "this architecture doesn't support BPF prologue"
2708 # endif
2709 # define set_nobuild(s, l, c) set_option_nobuild(record_options, s, l, REASON, c)
2710 	set_nobuild('\0', "vmlinux", true);
2711 # undef set_nobuild
2712 # undef REASON
2713 #endif
2714 
2715 	rec->opts.affinity = PERF_AFFINITY_SYS;
2716 
2717 	rec->evlist = evlist__new();
2718 	if (rec->evlist == NULL)
2719 		return -ENOMEM;
2720 
2721 	err = perf_config(perf_record_config, rec);
2722 	if (err)
2723 		return err;
2724 
2725 	argc = parse_options(argc, argv, record_options, record_usage,
2726 			    PARSE_OPT_STOP_AT_NON_OPTION);
2727 	if (quiet)
2728 		perf_quiet_option();
2729 
2730 	err = symbol__validate_sym_arguments();
2731 	if (err)
2732 		return err;
2733 
2734 	perf_debuginfod_setup(&record.debuginfod);
2735 
2736 	/* Make system wide (-a) the default target. */
2737 	if (!argc && target__none(&rec->opts.target))
2738 		rec->opts.target.system_wide = true;
2739 
2740 	if (nr_cgroups && !rec->opts.target.system_wide) {
2741 		usage_with_options_msg(record_usage, record_options,
2742 			"cgroup monitoring only available in system-wide mode");
2743 
2744 	}
2745 
2746 	if (rec->buildid_mmap) {
2747 		if (!perf_can_record_build_id()) {
2748 			pr_err("Failed: no support for recording build id in mmap events, update your kernel.\n");
2749 			err = -EINVAL;
2750 			goto out_opts;
2751 		}
2752 		pr_debug("Enabling build id in mmap2 events.\n");
2753 		/* Enable mmap build id synthesizing. */
2754 		symbol_conf.buildid_mmap2 = true;
2755 		/* Enable perf_event_attr::build_id bit. */
2756 		rec->opts.build_id = true;
2757 		/* Disable build id cache. */
2758 		rec->no_buildid = true;
2759 	}
2760 
2761 	if (rec->opts.record_cgroup && !perf_can_record_cgroup()) {
2762 		pr_err("Kernel has no cgroup sampling support.\n");
2763 		err = -EINVAL;
2764 		goto out_opts;
2765 	}
2766 
2767 	if (rec->opts.kcore)
2768 		rec->data.is_dir = true;
2769 
2770 	if (rec->opts.comp_level != 0) {
2771 		pr_debug("Compression enabled, disabling build id collection at the end of the session.\n");
2772 		rec->no_buildid = true;
2773 	}
2774 
2775 	if (rec->opts.record_switch_events &&
2776 	    !perf_can_record_switch_events()) {
2777 		ui__error("kernel does not support recording context switch events\n");
2778 		parse_options_usage(record_usage, record_options, "switch-events", 0);
2779 		err = -EINVAL;
2780 		goto out_opts;
2781 	}
2782 
2783 	if (switch_output_setup(rec)) {
2784 		parse_options_usage(record_usage, record_options, "switch-output", 0);
2785 		err = -EINVAL;
2786 		goto out_opts;
2787 	}
2788 
2789 	if (rec->switch_output.time) {
2790 		signal(SIGALRM, alarm_sig_handler);
2791 		alarm(rec->switch_output.time);
2792 	}
2793 
2794 	if (rec->switch_output.num_files) {
2795 		rec->switch_output.filenames = calloc(rec->switch_output.num_files,
2796 						      sizeof(char *));
2797 		if (!rec->switch_output.filenames) {
2798 			err = -EINVAL;
2799 			goto out_opts;
2800 		}
2801 	}
2802 
2803 	/*
2804 	 * Allow aliases to facilitate the lookup of symbols for address
2805 	 * filters. Refer to auxtrace_parse_filters().
2806 	 */
2807 	symbol_conf.allow_aliases = true;
2808 
2809 	symbol__init(NULL);
2810 
2811 	if (rec->opts.affinity != PERF_AFFINITY_SYS) {
2812 		rec->affinity_mask.nbits = cpu__max_cpu().cpu;
2813 		rec->affinity_mask.bits = bitmap_zalloc(rec->affinity_mask.nbits);
2814 		if (!rec->affinity_mask.bits) {
2815 			pr_err("Failed to allocate thread mask for %zd cpus\n", rec->affinity_mask.nbits);
2816 			err = -ENOMEM;
2817 			goto out_opts;
2818 		}
2819 		pr_debug2("thread mask[%zd]: empty\n", rec->affinity_mask.nbits);
2820 	}
2821 
2822 	err = record__auxtrace_init(rec);
2823 	if (err)
2824 		goto out;
2825 
2826 	if (dry_run)
2827 		goto out;
2828 
2829 	err = bpf__setup_stdout(rec->evlist);
2830 	if (err) {
2831 		bpf__strerror_setup_stdout(rec->evlist, err, errbuf, sizeof(errbuf));
2832 		pr_err("ERROR: Setup BPF stdout failed: %s\n",
2833 			 errbuf);
2834 		goto out;
2835 	}
2836 
2837 	err = -ENOMEM;
2838 
2839 	if (rec->no_buildid_cache || rec->no_buildid) {
2840 		disable_buildid_cache();
2841 	} else if (rec->switch_output.enabled) {
2842 		/*
2843 		 * In 'perf record --switch-output', disable buildid
2844 		 * generation by default to reduce data file switching
2845 		 * overhead. Still generate buildids if they are explicitly
2846 		 * required, using:
2847 		 *
2848 		 *  perf record --switch-output --no-no-buildid \
2849 		 *              --no-no-buildid-cache
2850 		 *
2851 		 * The following code is equivalent to:
2852 		 *
2853 		 * if ((rec->no_buildid || !rec->no_buildid_set) &&
2854 		 *     (rec->no_buildid_cache || !rec->no_buildid_cache_set))
2855 		 *         disable_buildid_cache();
2856 		 */
2857 		bool disable = true;
2858 
2859 		if (rec->no_buildid_set && !rec->no_buildid)
2860 			disable = false;
2861 		if (rec->no_buildid_cache_set && !rec->no_buildid_cache)
2862 			disable = false;
2863 		if (disable) {
2864 			rec->no_buildid = true;
2865 			rec->no_buildid_cache = true;
2866 			disable_buildid_cache();
2867 		}
2868 	}
2869 
2870 	if (record.opts.overwrite)
2871 		record.opts.tail_synthesize = true;
2872 
2873 	if (rec->evlist->core.nr_entries == 0) {
2874 		if (perf_pmu__has_hybrid()) {
2875 			err = evlist__add_default_hybrid(rec->evlist,
2876 							 !record.opts.no_samples);
2877 		} else {
2878 			err = __evlist__add_default(rec->evlist,
2879 						    !record.opts.no_samples);
2880 		}
2881 
2882 		if (err < 0) {
2883 			pr_err("Not enough memory for event selector list\n");
2884 			goto out;
2885 		}
2886 	}
2887 
2888 	if (rec->opts.target.tid && !rec->opts.no_inherit_set)
2889 		rec->opts.no_inherit = true;
2890 
2891 	err = target__validate(&rec->opts.target);
2892 	if (err) {
2893 		target__strerror(&rec->opts.target, err, errbuf, BUFSIZ);
2894 		ui__warning("%s\n", errbuf);
2895 	}
2896 
2897 	err = target__parse_uid(&rec->opts.target);
2898 	if (err) {
2899 		int saved_errno = errno;
2900 
2901 		target__strerror(&rec->opts.target, err, errbuf, BUFSIZ);
2902 		ui__error("%s", errbuf);
2903 
2904 		err = -saved_errno;
2905 		goto out;
2906 	}
2907 
2908 	/* Enable ignoring missing threads when -u/-p option is defined. */
2909 	rec->opts.ignore_missing_thread = rec->opts.target.uid != UINT_MAX || rec->opts.target.pid;
2910 
2911 	if (evlist__fix_hybrid_cpus(rec->evlist, rec->opts.target.cpu_list)) {
2912 		pr_err("failed to use cpu list %s\n", rec->opts.target.cpu_list);
2913 		err = -EINVAL;
2914 		goto out;
2915 	}
2916 
2917 	rec->opts.target.hybrid = perf_pmu__has_hybrid();
2918 
2919 	if (callchain_param.enabled && callchain_param.record_mode == CALLCHAIN_FP)
2920 		arch__add_leaf_frame_record_opts(&rec->opts);
2921 
2922 	err = -ENOMEM;
2923 	if (evlist__create_maps(rec->evlist, &rec->opts.target) < 0)
2924 		usage_with_options(record_usage, record_options);
2925 
2926 	err = auxtrace_record__options(rec->itr, rec->evlist, &rec->opts);
2927 	if (err)
2928 		goto out;
2929 
2930 	/*
2931 	 * We take all buildids when the file contains
2932 	 * AUX area tracing data because we do not decode the
2933 	 * trace, as that would take too long.
2934 	 */
2935 	if (rec->opts.full_auxtrace)
2936 		rec->buildid_all = true;
2937 
2938 	if (rec->opts.text_poke) {
2939 		err = record__config_text_poke(rec->evlist);
2940 		if (err) {
2941 			pr_err("record__config_text_poke failed, error %d\n", err);
2942 			goto out;
2943 		}
2944 	}
2945 
2946 	if (record_opts__config(&rec->opts)) {
2947 		err = -EINVAL;
2948 		goto out;
2949 	}
2950 
2951 	if (rec->opts.nr_cblocks > nr_cblocks_max)
2952 		rec->opts.nr_cblocks = nr_cblocks_max;
2953 	pr_debug("nr_cblocks: %d\n", rec->opts.nr_cblocks);
2954 
2955 	pr_debug("affinity: %s\n", affinity_tags[rec->opts.affinity]);
2956 	pr_debug("mmap flush: %d\n", rec->opts.mmap_flush);
2957 
2958 	if (rec->opts.comp_level > comp_level_max)
2959 		rec->opts.comp_level = comp_level_max;
2960 	pr_debug("comp level: %d\n", rec->opts.comp_level);
2961 
2962 	err = __cmd_record(&record, argc, argv);
2963 out:
2964 	bitmap_free(rec->affinity_mask.bits);
2965 	evlist__delete(rec->evlist);
2966 	symbol__exit();
2967 	auxtrace_record__free(rec->itr);
2968 out_opts:
2969 	evlist__close_control(rec->opts.ctl_fd, rec->opts.ctl_fd_ack, &rec->opts.ctl_fd_close);
2970 	return err;
2971 }
2972 
2973 static void snapshot_sig_handler(int sig __maybe_unused)
2974 {
2975 	struct record *rec = &record;
2976 
2977 	hit_auxtrace_snapshot_trigger(rec);
2978 
2979 	if (switch_output_signal(rec))
2980 		trigger_hit(&switch_output_trigger);
2981 }
2982 
2983 static void alarm_sig_handler(int sig __maybe_unused)
2984 {
2985 	struct record *rec = &record;
2986 
2987 	if (switch_output_time(rec))
2988 		trigger_hit(&switch_output_trigger);
2989 }
2990