xref: /openbmc/linux/tools/perf/builtin-record.c (revision 76a4f7cc)
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * builtin-record.c
4  *
5  * Builtin record command: Record the profile of a workload
6  * (or a CPU, or a PID) into the perf.data output file - for
7  * later analysis via perf report.
8  */
9 #include "builtin.h"
10 
11 #include "util/build-id.h"
12 #include <subcmd/parse-options.h>
13 #include "util/parse-events.h"
14 #include "util/config.h"
15 
16 #include "util/callchain.h"
17 #include "util/cgroup.h"
18 #include "util/header.h"
19 #include "util/event.h"
20 #include "util/evlist.h"
21 #include "util/evsel.h"
22 #include "util/debug.h"
23 #include "util/mmap.h"
24 #include "util/target.h"
25 #include "util/session.h"
26 #include "util/tool.h"
27 #include "util/symbol.h"
28 #include "util/record.h"
29 #include "util/cpumap.h"
30 #include "util/thread_map.h"
31 #include "util/data.h"
32 #include "util/perf_regs.h"
33 #include "util/auxtrace.h"
34 #include "util/tsc.h"
35 #include "util/parse-branch-options.h"
36 #include "util/parse-regs-options.h"
37 #include "util/perf_api_probe.h"
38 #include "util/llvm-utils.h"
39 #include "util/bpf-loader.h"
40 #include "util/trigger.h"
41 #include "util/perf-hooks.h"
42 #include "util/cpu-set-sched.h"
43 #include "util/synthetic-events.h"
44 #include "util/time-utils.h"
45 #include "util/units.h"
46 #include "util/bpf-event.h"
47 #include "util/util.h"
48 #include "util/pfm.h"
49 #include "util/clockid.h"
50 #include "util/pmu-hybrid.h"
51 #include "util/evlist-hybrid.h"
52 #include "asm/bug.h"
53 #include "perf.h"
54 
55 #include <errno.h>
56 #include <inttypes.h>
57 #include <locale.h>
58 #include <poll.h>
59 #include <pthread.h>
60 #include <unistd.h>
61 #include <sched.h>
62 #include <signal.h>
63 #ifdef HAVE_EVENTFD_SUPPORT
64 #include <sys/eventfd.h>
65 #endif
66 #include <sys/mman.h>
67 #include <sys/wait.h>
68 #include <sys/types.h>
69 #include <sys/stat.h>
70 #include <fcntl.h>
71 #include <linux/err.h>
72 #include <linux/string.h>
73 #include <linux/time64.h>
74 #include <linux/zalloc.h>
75 #include <linux/bitmap.h>
76 #include <sys/time.h>
77 
/*
 * Switch-output configuration and state, embedded in struct record as
 * rec->switch_output.
 */
struct switch_output {
	bool		 enabled;
	/* tested by switch_output_signal() */
	bool		 signal;
	/* byte threshold compared with rec->bytes_written; 0 disables the check */
	unsigned long	 size;
	/* non-zero makes switch_output_time() consult the trigger */
	unsigned long	 time;
	const char	*str;
	bool		 set;
	char		 **filenames;
	int		 num_files;
	int		 cur_file;
};
89 
/*
 * State of the record command shared by the helpers in this file.  The
 * tool callbacks recover it from their struct perf_tool argument through
 * container_of(), so 'tool' must stay a direct member.
 */
struct record {
	struct perf_tool	tool;
	struct record_opts	opts;
	/* running byte count, advanced by record__write() and record__aio_push() */
	u64			bytes_written;
	struct perf_data	data;
	struct auxtrace_record	*itr;
	struct evlist	*evlist;
	struct perf_session	*session;
	struct evlist		*sb_evlist;
	pthread_t		thread_id;
	int			realtime_prio;
	bool			switch_output_event_set;
	bool			no_buildid;
	bool			no_buildid_set;
	bool			no_buildid_cache;
	bool			no_buildid_cache_set;
	/* when set, process_sample_event() skips marking DSO hits */
	bool			buildid_all;
	bool			buildid_mmap;
	bool			timestamp_filename;
	bool			timestamp_boundary;
	struct switch_output	switch_output;
	/* incremented once per pushed chunk / processed sample */
	unsigned long long	samples;
	/* last mask applied by record__adjust_affinity() */
	struct mmap_cpu_mask	affinity_mask;
	unsigned long		output_max_size;	/* = 0: unlimited */
};
115 
/*
 * Set to 1 by sig_handler() and by record__write() once the output size
 * limit is reached.  It is written from a signal handler, hence
 * sig_atomic_t rather than plain int.
 */
static volatile sig_atomic_t done;
117 
/*
 * Non-zero while an AUX area snapshot has already been started; checked by
 * record__auxtrace_snapshot_exit() before starting another one.
 */
static volatile int auxtrace_record__snapshot_started;
/* Triggers driving AUX area snapshots and output file switching. */
static DEFINE_TRIGGER(auxtrace_snapshot_trigger);
static DEFINE_TRIGGER(switch_output_trigger);
121 
/* Textual tags for the affinity modes, one entry per PERF_AFFINITY_* value. */
static const char *affinity_tags[PERF_AFFINITY_MAX] = {
	"SYS", "NODE", "CPU"
};
125 
126 static bool switch_output_signal(struct record *rec)
127 {
128 	return rec->switch_output.signal &&
129 	       trigger_is_ready(&switch_output_trigger);
130 }
131 
132 static bool switch_output_size(struct record *rec)
133 {
134 	return rec->switch_output.size &&
135 	       trigger_is_ready(&switch_output_trigger) &&
136 	       (rec->bytes_written >= rec->switch_output.size);
137 }
138 
139 static bool switch_output_time(struct record *rec)
140 {
141 	return rec->switch_output.time &&
142 	       trigger_is_ready(&switch_output_trigger);
143 }
144 
145 static bool record__output_max_size_exceeded(struct record *rec)
146 {
147 	return rec->output_max_size &&
148 	       (rec->bytes_written >= rec->output_max_size);
149 }
150 
151 static int record__write(struct record *rec, struct mmap *map __maybe_unused,
152 			 void *bf, size_t size)
153 {
154 	struct perf_data_file *file = &rec->session->data->file;
155 
156 	if (perf_data_file__write(file, bf, size) < 0) {
157 		pr_err("failed to write perf data, error: %m\n");
158 		return -1;
159 	}
160 
161 	rec->bytes_written += size;
162 
163 	if (record__output_max_size_exceeded(rec) && !done) {
164 		fprintf(stderr, "[ perf record: perf size limit reached (%" PRIu64 " KB),"
165 				" stopping session ]\n",
166 				rec->bytes_written >> 10);
167 		done = 1;
168 	}
169 
170 	if (switch_output_size(rec))
171 		trigger_hit(&switch_output_trigger);
172 
173 	return 0;
174 }
175 
/*
 * Forward declarations: the AIO helpers below use these before their
 * definitions appear later in the file.
 */
static int record__aio_enabled(struct record *rec);
static int record__comp_enabled(struct record *rec);
static size_t zstd_compress(struct perf_session *session, void *dst, size_t dst_size,
			    void *src, size_t src_size);
180 
181 #ifdef HAVE_AIO_SUPPORT
/*
 * Fill in @cblock and queue an asynchronous write of @size bytes from @buf
 * to @trace_fd at offset @off.  Queueing is retried for as long as
 * aio_write() fails with EAGAIN.
 *
 * Returns the last aio_write() result: 0 when queued, non-zero on a hard
 * failure, in which case cblock->aio_fildes is reset to -1.
 */
static int record__aio_write(struct aiocb *cblock, int trace_fd,
		void *buf, size_t size, off_t off)
{
	int rc;

	cblock->aio_fildes = trace_fd;
	cblock->aio_buf    = buf;
	cblock->aio_nbytes = size;
	cblock->aio_offset = off;
	cblock->aio_sigevent.sigev_notify = SIGEV_NONE;

	for (;;) {
		rc = aio_write(cblock);
		if (rc == 0)
			break;

		if (errno == EAGAIN)
			continue;

		/* Hard failure: mark the control block as free again. */
		cblock->aio_fildes = -1;
		pr_err("failed to queue perf data, error: %m\n");
		break;
	}

	return rc;
}
206 
207 static int record__aio_complete(struct mmap *md, struct aiocb *cblock)
208 {
209 	void *rem_buf;
210 	off_t rem_off;
211 	size_t rem_size;
212 	int rc, aio_errno;
213 	ssize_t aio_ret, written;
214 
215 	aio_errno = aio_error(cblock);
216 	if (aio_errno == EINPROGRESS)
217 		return 0;
218 
219 	written = aio_ret = aio_return(cblock);
220 	if (aio_ret < 0) {
221 		if (aio_errno != EINTR)
222 			pr_err("failed to write perf data, error: %m\n");
223 		written = 0;
224 	}
225 
226 	rem_size = cblock->aio_nbytes - written;
227 
228 	if (rem_size == 0) {
229 		cblock->aio_fildes = -1;
230 		/*
231 		 * md->refcount is incremented in record__aio_pushfn() for
232 		 * every aio write request started in record__aio_push() so
233 		 * decrement it because the request is now complete.
234 		 */
235 		perf_mmap__put(&md->core);
236 		rc = 1;
237 	} else {
238 		/*
239 		 * aio write request may require restart with the
240 		 * reminder if the kernel didn't write whole
241 		 * chunk at once.
242 		 */
243 		rem_off = cblock->aio_offset + written;
244 		rem_buf = (void *)(cblock->aio_buf + written);
245 		record__aio_write(cblock, cblock->aio_fildes,
246 				rem_buf, rem_size, rem_off);
247 		rc = 0;
248 	}
249 
250 	return rc;
251 }
252 
253 static int record__aio_sync(struct mmap *md, bool sync_all)
254 {
255 	struct aiocb **aiocb = md->aio.aiocb;
256 	struct aiocb *cblocks = md->aio.cblocks;
257 	struct timespec timeout = { 0, 1000 * 1000  * 1 }; /* 1ms */
258 	int i, do_suspend;
259 
260 	do {
261 		do_suspend = 0;
262 		for (i = 0; i < md->aio.nr_cblocks; ++i) {
263 			if (cblocks[i].aio_fildes == -1 || record__aio_complete(md, &cblocks[i])) {
264 				if (sync_all)
265 					aiocb[i] = NULL;
266 				else
267 					return i;
268 			} else {
269 				/*
270 				 * Started aio write is not complete yet
271 				 * so it has to be waited before the
272 				 * next allocation.
273 				 */
274 				aiocb[i] = &cblocks[i];
275 				do_suspend = 1;
276 			}
277 		}
278 		if (!do_suspend)
279 			return -1;
280 
281 		while (aio_suspend((const struct aiocb **)aiocb, md->aio.nr_cblocks, &timeout)) {
282 			if (!(errno == EAGAIN || errno == EINTR))
283 				pr_err("failed to sync perf data, error: %m\n");
284 		}
285 	} while (1);
286 }
287 
/*
 * Context handed to record__aio_pushfn() through perf_mmap__push() by
 * record__aio_push().
 */
struct record_aio {
	struct record	*rec;
	/* destination buffer (one of map->aio.data[]) */
	void		*data;
	/* number of bytes already stored in @data */
	size_t		size;
};
293 
294 static int record__aio_pushfn(struct mmap *map, void *to, void *buf, size_t size)
295 {
296 	struct record_aio *aio = to;
297 
298 	/*
299 	 * map->core.base data pointed by buf is copied into free map->aio.data[] buffer
300 	 * to release space in the kernel buffer as fast as possible, calling
301 	 * perf_mmap__consume() from perf_mmap__push() function.
302 	 *
303 	 * That lets the kernel to proceed with storing more profiling data into
304 	 * the kernel buffer earlier than other per-cpu kernel buffers are handled.
305 	 *
306 	 * Coping can be done in two steps in case the chunk of profiling data
307 	 * crosses the upper bound of the kernel buffer. In this case we first move
308 	 * part of data from map->start till the upper bound and then the reminder
309 	 * from the beginning of the kernel buffer till the end of the data chunk.
310 	 */
311 
312 	if (record__comp_enabled(aio->rec)) {
313 		size = zstd_compress(aio->rec->session, aio->data + aio->size,
314 				     mmap__mmap_len(map) - aio->size,
315 				     buf, size);
316 	} else {
317 		memcpy(aio->data + aio->size, buf, size);
318 	}
319 
320 	if (!aio->size) {
321 		/*
322 		 * Increment map->refcount to guard map->aio.data[] buffer
323 		 * from premature deallocation because map object can be
324 		 * released earlier than aio write request started on
325 		 * map->aio.data[] buffer is complete.
326 		 *
327 		 * perf_mmap__put() is done at record__aio_complete()
328 		 * after started aio request completion or at record__aio_push()
329 		 * if the request failed to start.
330 		 */
331 		perf_mmap__get(&map->core);
332 	}
333 
334 	aio->size += size;
335 
336 	return size;
337 }
338 
339 static int record__aio_push(struct record *rec, struct mmap *map, off_t *off)
340 {
341 	int ret, idx;
342 	int trace_fd = rec->session->data->file.fd;
343 	struct record_aio aio = { .rec = rec, .size = 0 };
344 
345 	/*
346 	 * Call record__aio_sync() to wait till map->aio.data[] buffer
347 	 * becomes available after previous aio write operation.
348 	 */
349 
350 	idx = record__aio_sync(map, false);
351 	aio.data = map->aio.data[idx];
352 	ret = perf_mmap__push(map, &aio, record__aio_pushfn);
353 	if (ret != 0) /* ret > 0 - no data, ret < 0 - error */
354 		return ret;
355 
356 	rec->samples++;
357 	ret = record__aio_write(&(map->aio.cblocks[idx]), trace_fd, aio.data, aio.size, *off);
358 	if (!ret) {
359 		*off += aio.size;
360 		rec->bytes_written += aio.size;
361 		if (switch_output_size(rec))
362 			trigger_hit(&switch_output_trigger);
363 	} else {
364 		/*
365 		 * Decrement map->refcount incremented in record__aio_pushfn()
366 		 * back if record__aio_write() operation failed to start, otherwise
367 		 * map->refcount is decremented in record__aio_complete() after
368 		 * aio write operation finishes successfully.
369 		 */
370 		perf_mmap__put(&map->core);
371 	}
372 
373 	return ret;
374 }
375 
/* Return the current file offset of @trace_fd as reported by lseek(). */
static off_t record__aio_get_pos(int trace_fd)
{
	off_t cur = lseek(trace_fd, 0, SEEK_CUR);

	return cur;
}
380 
/* Move the file offset of @trace_fd to @pos; the lseek() result is not checked. */
static void record__aio_set_pos(int trace_fd, off_t pos)
{
	(void)lseek(trace_fd, pos, SEEK_SET);
}
385 
386 static void record__aio_mmap_read_sync(struct record *rec)
387 {
388 	int i;
389 	struct evlist *evlist = rec->evlist;
390 	struct mmap *maps = evlist->mmap;
391 
392 	if (!record__aio_enabled(rec))
393 		return;
394 
395 	for (i = 0; i < evlist->core.nr_mmaps; i++) {
396 		struct mmap *map = &maps[i];
397 
398 		if (map->core.base)
399 			record__aio_sync(map, true);
400 	}
401 }
402 
/* Number of aio control blocks used when the option gives no (or a zero) value. */
static int nr_cblocks_default = 1;
/* NOTE(review): presumably the upper bound enforced by the option handling code outside this chunk. */
static int nr_cblocks_max = 4;
405 
406 static int record__aio_parse(const struct option *opt,
407 			     const char *str,
408 			     int unset)
409 {
410 	struct record_opts *opts = (struct record_opts *)opt->value;
411 
412 	if (unset) {
413 		opts->nr_cblocks = 0;
414 	} else {
415 		if (str)
416 			opts->nr_cblocks = strtol(str, NULL, 0);
417 		if (!opts->nr_cblocks)
418 			opts->nr_cblocks = nr_cblocks_default;
419 	}
420 
421 	return 0;
422 }
423 #else /* HAVE_AIO_SUPPORT */
/* Built without HAVE_AIO_SUPPORT: no aio control blocks are available. */
static int nr_cblocks_max = 0;
425 
426 static int record__aio_push(struct record *rec __maybe_unused, struct mmap *map __maybe_unused,
427 			    off_t *off __maybe_unused)
428 {
429 	return -1;
430 }
431 
432 static off_t record__aio_get_pos(int trace_fd __maybe_unused)
433 {
434 	return -1;
435 }
436 
437 static void record__aio_set_pos(int trace_fd __maybe_unused, off_t pos __maybe_unused)
438 {
439 }
440 
441 static void record__aio_mmap_read_sync(struct record *rec __maybe_unused)
442 {
443 }
444 #endif
445 
446 static int record__aio_enabled(struct record *rec)
447 {
448 	return rec->opts.nr_cblocks > 0;
449 }
450 
451 #define MMAP_FLUSH_DEFAULT 1
452 static int record__mmap_flush_parse(const struct option *opt,
453 				    const char *str,
454 				    int unset)
455 {
456 	int flush_max;
457 	struct record_opts *opts = (struct record_opts *)opt->value;
458 	static struct parse_tag tags[] = {
459 			{ .tag  = 'B', .mult = 1       },
460 			{ .tag  = 'K', .mult = 1 << 10 },
461 			{ .tag  = 'M', .mult = 1 << 20 },
462 			{ .tag  = 'G', .mult = 1 << 30 },
463 			{ .tag  = 0 },
464 	};
465 
466 	if (unset)
467 		return 0;
468 
469 	if (str) {
470 		opts->mmap_flush = parse_tag_value(str, tags);
471 		if (opts->mmap_flush == (int)-1)
472 			opts->mmap_flush = strtol(str, NULL, 0);
473 	}
474 
475 	if (!opts->mmap_flush)
476 		opts->mmap_flush = MMAP_FLUSH_DEFAULT;
477 
478 	flush_max = evlist__mmap_size(opts->mmap_pages);
479 	flush_max /= 4;
480 	if (opts->mmap_flush > flush_max)
481 		opts->mmap_flush = flush_max;
482 
483 	return 0;
484 }
485 
486 #ifdef HAVE_ZSTD_SUPPORT
487 static unsigned int comp_level_default = 1;
488 
489 static int record__parse_comp_level(const struct option *opt, const char *str, int unset)
490 {
491 	struct record_opts *opts = opt->value;
492 
493 	if (unset) {
494 		opts->comp_level = 0;
495 	} else {
496 		if (str)
497 			opts->comp_level = strtol(str, NULL, 0);
498 		if (!opts->comp_level)
499 			opts->comp_level = comp_level_default;
500 	}
501 
502 	return 0;
503 }
504 #endif
/* NOTE(review): presumably the highest accepted compression level; its users are outside this chunk. */
static unsigned int comp_level_max = 22;
506 
507 static int record__comp_enabled(struct record *rec)
508 {
509 	return rec->opts.comp_level > 0;
510 }
511 
512 static int process_synthesized_event(struct perf_tool *tool,
513 				     union perf_event *event,
514 				     struct perf_sample *sample __maybe_unused,
515 				     struct machine *machine __maybe_unused)
516 {
517 	struct record *rec = container_of(tool, struct record, tool);
518 	return record__write(rec, NULL, event, event->header.size);
519 }
520 
521 static int process_locked_synthesized_event(struct perf_tool *tool,
522 				     union perf_event *event,
523 				     struct perf_sample *sample __maybe_unused,
524 				     struct machine *machine __maybe_unused)
525 {
526 	static pthread_mutex_t synth_lock = PTHREAD_MUTEX_INITIALIZER;
527 	int ret;
528 
529 	pthread_mutex_lock(&synth_lock);
530 	ret = process_synthesized_event(tool, event, sample, machine);
531 	pthread_mutex_unlock(&synth_lock);
532 	return ret;
533 }
534 
535 static int record__pushfn(struct mmap *map, void *to, void *bf, size_t size)
536 {
537 	struct record *rec = to;
538 
539 	if (record__comp_enabled(rec)) {
540 		size = zstd_compress(rec->session, map->data, mmap__mmap_len(map), bf, size);
541 		bf   = map->data;
542 	}
543 
544 	rec->samples++;
545 	return record__write(rec, map, bf, size);
546 }
547 
/*
 * Both flags are written by sig_handler(), so they use sig_atomic_t:
 *  signr          - last signal other than SIGCHLD that was received,
 *                   -1 while none has been; re-raised by record__sig_exit()
 *  child_finished - set once SIGCHLD has been received
 */
static volatile sig_atomic_t signr = -1;
static volatile sig_atomic_t child_finished;
#ifdef HAVE_EVENTFD_SUPPORT
/* fd written by sig_handler() to wake up a poll(); -1 until it is set up. */
static int done_fd = -1;
#endif
553 
554 static void sig_handler(int sig)
555 {
556 	if (sig == SIGCHLD)
557 		child_finished = 1;
558 	else
559 		signr = sig;
560 
561 	done = 1;
562 #ifdef HAVE_EVENTFD_SUPPORT
563 {
564 	u64 tmp = 1;
565 	/*
566 	 * It is possible for this signal handler to run after done is checked
567 	 * in the main loop, but before the perf counter fds are polled. If this
568 	 * happens, the poll() will continue to wait even though done is set,
569 	 * and will only break out if either another signal is received, or the
570 	 * counters are ready for read. To ensure the poll() doesn't sleep when
571 	 * done is set, use an eventfd (done_fd) to wake up the poll().
572 	 */
573 	if (write(done_fd, &tmp, sizeof(tmp)) < 0)
574 		pr_err("failed to signal wakeup fd, error: %m\n");
575 }
576 #endif // HAVE_EVENTFD_SUPPORT
577 }
578 
/*
 * SIGSEGV handler: call perf_hooks__recover() first, then hand the signal
 * to sighandler_dump_stack().
 */
static void
sigsegv_handler(int sig)
{
	perf_hooks__recover();

	sighandler_dump_stack(sig);
}
584 
585 static void record__sig_exit(void)
586 {
587 	if (signr == -1)
588 		return;
589 
590 	signal(signr, SIG_DFL);
591 	raise(signr);
592 }
593 
594 #ifdef HAVE_AUXTRACE_SUPPORT
595 
596 static int record__process_auxtrace(struct perf_tool *tool,
597 				    struct mmap *map,
598 				    union perf_event *event, void *data1,
599 				    size_t len1, void *data2, size_t len2)
600 {
601 	struct record *rec = container_of(tool, struct record, tool);
602 	struct perf_data *data = &rec->data;
603 	size_t padding;
604 	u8 pad[8] = {0};
605 
606 	if (!perf_data__is_pipe(data) && perf_data__is_single_file(data)) {
607 		off_t file_offset;
608 		int fd = perf_data__fd(data);
609 		int err;
610 
611 		file_offset = lseek(fd, 0, SEEK_CUR);
612 		if (file_offset == -1)
613 			return -1;
614 		err = auxtrace_index__auxtrace_event(&rec->session->auxtrace_index,
615 						     event, file_offset);
616 		if (err)
617 			return err;
618 	}
619 
620 	/* event.auxtrace.size includes padding, see __auxtrace_mmap__read() */
621 	padding = (len1 + len2) & 7;
622 	if (padding)
623 		padding = 8 - padding;
624 
625 	record__write(rec, map, event, event->header.size);
626 	record__write(rec, map, data1, len1);
627 	if (len2)
628 		record__write(rec, map, data2, len2);
629 	record__write(rec, map, &pad, padding);
630 
631 	return 0;
632 }
633 
634 static int record__auxtrace_mmap_read(struct record *rec,
635 				      struct mmap *map)
636 {
637 	int ret;
638 
639 	ret = auxtrace_mmap__read(map, rec->itr, &rec->tool,
640 				  record__process_auxtrace);
641 	if (ret < 0)
642 		return ret;
643 
644 	if (ret)
645 		rec->samples++;
646 
647 	return 0;
648 }
649 
650 static int record__auxtrace_mmap_read_snapshot(struct record *rec,
651 					       struct mmap *map)
652 {
653 	int ret;
654 
655 	ret = auxtrace_mmap__read_snapshot(map, rec->itr, &rec->tool,
656 					   record__process_auxtrace,
657 					   rec->opts.auxtrace_snapshot_size);
658 	if (ret < 0)
659 		return ret;
660 
661 	if (ret)
662 		rec->samples++;
663 
664 	return 0;
665 }
666 
667 static int record__auxtrace_read_snapshot_all(struct record *rec)
668 {
669 	int i;
670 	int rc = 0;
671 
672 	for (i = 0; i < rec->evlist->core.nr_mmaps; i++) {
673 		struct mmap *map = &rec->evlist->mmap[i];
674 
675 		if (!map->auxtrace_mmap.base)
676 			continue;
677 
678 		if (record__auxtrace_mmap_read_snapshot(rec, map) != 0) {
679 			rc = -1;
680 			goto out;
681 		}
682 	}
683 out:
684 	return rc;
685 }
686 
687 static void record__read_auxtrace_snapshot(struct record *rec, bool on_exit)
688 {
689 	pr_debug("Recording AUX area tracing snapshot\n");
690 	if (record__auxtrace_read_snapshot_all(rec) < 0) {
691 		trigger_error(&auxtrace_snapshot_trigger);
692 	} else {
693 		if (auxtrace_record__snapshot_finish(rec->itr, on_exit))
694 			trigger_error(&auxtrace_snapshot_trigger);
695 		else
696 			trigger_ready(&auxtrace_snapshot_trigger);
697 	}
698 }
699 
700 static int record__auxtrace_snapshot_exit(struct record *rec)
701 {
702 	if (trigger_is_error(&auxtrace_snapshot_trigger))
703 		return 0;
704 
705 	if (!auxtrace_record__snapshot_started &&
706 	    auxtrace_record__snapshot_start(rec->itr))
707 		return -1;
708 
709 	record__read_auxtrace_snapshot(rec, true);
710 	if (trigger_is_error(&auxtrace_snapshot_trigger))
711 		return -1;
712 
713 	return 0;
714 }
715 
716 static int record__auxtrace_init(struct record *rec)
717 {
718 	int err;
719 
720 	if (!rec->itr) {
721 		rec->itr = auxtrace_record__init(rec->evlist, &err);
722 		if (err)
723 			return err;
724 	}
725 
726 	err = auxtrace_parse_snapshot_options(rec->itr, &rec->opts,
727 					      rec->opts.auxtrace_snapshot_opts);
728 	if (err)
729 		return err;
730 
731 	err = auxtrace_parse_sample_options(rec->itr, rec->evlist, &rec->opts,
732 					    rec->opts.auxtrace_sample_opts);
733 	if (err)
734 		return err;
735 
736 	auxtrace_regroup_aux_output(rec->evlist);
737 
738 	return auxtrace_parse_filters(rec->evlist);
739 }
740 
741 #else
742 
743 static inline
744 int record__auxtrace_mmap_read(struct record *rec __maybe_unused,
745 			       struct mmap *map __maybe_unused)
746 {
747 	return 0;
748 }
749 
750 static inline
751 void record__read_auxtrace_snapshot(struct record *rec __maybe_unused,
752 				    bool on_exit __maybe_unused)
753 {
754 }
755 
756 static inline
757 int auxtrace_record__snapshot_start(struct auxtrace_record *itr __maybe_unused)
758 {
759 	return 0;
760 }
761 
762 static inline
763 int record__auxtrace_snapshot_exit(struct record *rec __maybe_unused)
764 {
765 	return 0;
766 }
767 
768 static int record__auxtrace_init(struct record *rec __maybe_unused)
769 {
770 	return 0;
771 }
772 
773 #endif
774 
775 static int record__config_text_poke(struct evlist *evlist)
776 {
777 	struct evsel *evsel;
778 	int err;
779 
780 	/* Nothing to do if text poke is already configured */
781 	evlist__for_each_entry(evlist, evsel) {
782 		if (evsel->core.attr.text_poke)
783 			return 0;
784 	}
785 
786 	err = parse_events(evlist, "dummy:u", NULL);
787 	if (err)
788 		return err;
789 
790 	evsel = evlist__last(evlist);
791 
792 	evsel->core.attr.freq = 0;
793 	evsel->core.attr.sample_period = 1;
794 	evsel->core.attr.text_poke = 1;
795 	evsel->core.attr.ksymbol = 1;
796 
797 	evsel->core.system_wide = true;
798 	evsel->no_aux_samples = true;
799 	evsel->immediate = true;
800 
801 	/* Text poke must be collected on all CPUs */
802 	perf_cpu_map__put(evsel->core.own_cpus);
803 	evsel->core.own_cpus = perf_cpu_map__new(NULL);
804 	perf_cpu_map__put(evsel->core.cpus);
805 	evsel->core.cpus = perf_cpu_map__get(evsel->core.own_cpus);
806 
807 	evsel__set_sample_bit(evsel, TIME);
808 
809 	return 0;
810 }
811 
812 static bool record__kcore_readable(struct machine *machine)
813 {
814 	char kcore[PATH_MAX];
815 	int fd;
816 
817 	scnprintf(kcore, sizeof(kcore), "%s/proc/kcore", machine->root_dir);
818 
819 	fd = open(kcore, O_RDONLY);
820 	if (fd < 0)
821 		return false;
822 
823 	close(fd);
824 
825 	return true;
826 }
827 
828 static int record__kcore_copy(struct machine *machine, struct perf_data *data)
829 {
830 	char from_dir[PATH_MAX];
831 	char kcore_dir[PATH_MAX];
832 	int ret;
833 
834 	snprintf(from_dir, sizeof(from_dir), "%s/proc", machine->root_dir);
835 
836 	ret = perf_data__make_kcore_dir(data, kcore_dir, sizeof(kcore_dir));
837 	if (ret)
838 		return ret;
839 
840 	return kcore_copy(from_dir, kcore_dir);
841 }
842 
843 static int record__mmap_evlist(struct record *rec,
844 			       struct evlist *evlist)
845 {
846 	struct record_opts *opts = &rec->opts;
847 	bool auxtrace_overwrite = opts->auxtrace_snapshot_mode ||
848 				  opts->auxtrace_sample_mode;
849 	char msg[512];
850 
851 	if (opts->affinity != PERF_AFFINITY_SYS)
852 		cpu__setup_cpunode_map();
853 
854 	if (evlist__mmap_ex(evlist, opts->mmap_pages,
855 				 opts->auxtrace_mmap_pages,
856 				 auxtrace_overwrite,
857 				 opts->nr_cblocks, opts->affinity,
858 				 opts->mmap_flush, opts->comp_level) < 0) {
859 		if (errno == EPERM) {
860 			pr_err("Permission error mapping pages.\n"
861 			       "Consider increasing "
862 			       "/proc/sys/kernel/perf_event_mlock_kb,\n"
863 			       "or try again with a smaller value of -m/--mmap_pages.\n"
864 			       "(current value: %u,%u)\n",
865 			       opts->mmap_pages, opts->auxtrace_mmap_pages);
866 			return -errno;
867 		} else {
868 			pr_err("failed to mmap with %d (%s)\n", errno,
869 				str_error_r(errno, msg, sizeof(msg)));
870 			if (errno)
871 				return -errno;
872 			else
873 				return -EINVAL;
874 		}
875 	}
876 	return 0;
877 }
878 
879 static int record__mmap(struct record *rec)
880 {
881 	return record__mmap_evlist(rec, rec->evlist);
882 }
883 
884 static int record__open(struct record *rec)
885 {
886 	char msg[BUFSIZ];
887 	struct evsel *pos;
888 	struct evlist *evlist = rec->evlist;
889 	struct perf_session *session = rec->session;
890 	struct record_opts *opts = &rec->opts;
891 	int rc = 0;
892 
893 	/*
894 	 * For initial_delay, system wide or a hybrid system, we need to add a
895 	 * dummy event so that we can track PERF_RECORD_MMAP to cover the delay
896 	 * of waiting or event synthesis.
897 	 */
898 	if (opts->initial_delay || target__has_cpu(&opts->target) ||
899 	    perf_pmu__has_hybrid()) {
900 		pos = evlist__get_tracking_event(evlist);
901 		if (!evsel__is_dummy_event(pos)) {
902 			/* Set up dummy event. */
903 			if (evlist__add_dummy(evlist))
904 				return -ENOMEM;
905 			pos = evlist__last(evlist);
906 			evlist__set_tracking_event(evlist, pos);
907 		}
908 
909 		/*
910 		 * Enable the dummy event when the process is forked for
911 		 * initial_delay, immediately for system wide.
912 		 */
913 		if (opts->initial_delay && !pos->immediate &&
914 		    !target__has_cpu(&opts->target))
915 			pos->core.attr.enable_on_exec = 1;
916 		else
917 			pos->immediate = 1;
918 	}
919 
920 	evlist__config(evlist, opts, &callchain_param);
921 
922 	evlist__for_each_entry(evlist, pos) {
923 try_again:
924 		if (evsel__open(pos, pos->core.cpus, pos->core.threads) < 0) {
925 			if (evsel__fallback(pos, errno, msg, sizeof(msg))) {
926 				if (verbose > 0)
927 					ui__warning("%s\n", msg);
928 				goto try_again;
929 			}
930 			if ((errno == EINVAL || errno == EBADF) &&
931 			    pos->core.leader != &pos->core &&
932 			    pos->weak_group) {
933 			        pos = evlist__reset_weak_group(evlist, pos, true);
934 				goto try_again;
935 			}
936 			rc = -errno;
937 			evsel__open_strerror(pos, &opts->target, errno, msg, sizeof(msg));
938 			ui__error("%s\n", msg);
939 			goto out;
940 		}
941 
942 		pos->supported = true;
943 	}
944 
945 	if (symbol_conf.kptr_restrict && !evlist__exclude_kernel(evlist)) {
946 		pr_warning(
947 "WARNING: Kernel address maps (/proc/{kallsyms,modules}) are restricted,\n"
948 "check /proc/sys/kernel/kptr_restrict and /proc/sys/kernel/perf_event_paranoid.\n\n"
949 "Samples in kernel functions may not be resolved if a suitable vmlinux\n"
950 "file is not found in the buildid cache or in the vmlinux path.\n\n"
951 "Samples in kernel modules won't be resolved at all.\n\n"
952 "If some relocation was applied (e.g. kexec) symbols may be misresolved\n"
953 "even with a suitable vmlinux or kallsyms file.\n\n");
954 	}
955 
956 	if (evlist__apply_filters(evlist, &pos)) {
957 		pr_err("failed to set filter \"%s\" on event %s with %d (%s)\n",
958 			pos->filter, evsel__name(pos), errno,
959 			str_error_r(errno, msg, sizeof(msg)));
960 		rc = -1;
961 		goto out;
962 	}
963 
964 	rc = record__mmap(rec);
965 	if (rc)
966 		goto out;
967 
968 	session->evlist = evlist;
969 	perf_session__set_id_hdr_size(session);
970 out:
971 	return rc;
972 }
973 
974 static void set_timestamp_boundary(struct record *rec, u64 sample_time)
975 {
976 	if (rec->evlist->first_sample_time == 0)
977 		rec->evlist->first_sample_time = sample_time;
978 
979 	if (sample_time)
980 		rec->evlist->last_sample_time = sample_time;
981 }
982 
983 static int process_sample_event(struct perf_tool *tool,
984 				union perf_event *event,
985 				struct perf_sample *sample,
986 				struct evsel *evsel,
987 				struct machine *machine)
988 {
989 	struct record *rec = container_of(tool, struct record, tool);
990 
991 	set_timestamp_boundary(rec, sample->time);
992 
993 	if (rec->buildid_all)
994 		return 0;
995 
996 	rec->samples++;
997 	return build_id__mark_dso_hit(tool, event, sample, evsel, machine);
998 }
999 
1000 static int process_buildids(struct record *rec)
1001 {
1002 	struct perf_session *session = rec->session;
1003 
1004 	if (perf_data__size(&rec->data) == 0)
1005 		return 0;
1006 
1007 	/*
1008 	 * During this process, it'll load kernel map and replace the
1009 	 * dso->long_name to a real pathname it found.  In this case
1010 	 * we prefer the vmlinux path like
1011 	 *   /lib/modules/3.16.4/build/vmlinux
1012 	 *
1013 	 * rather than build-id path (in debug directory).
1014 	 *   $HOME/.debug/.build-id/f0/6e17aa50adf4d00b88925e03775de107611551
1015 	 */
1016 	symbol_conf.ignore_vmlinux_buildid = true;
1017 
1018 	/*
1019 	 * If --buildid-all is given, it marks all DSO regardless of hits,
1020 	 * so no need to process samples. But if timestamp_boundary is enabled,
1021 	 * it still needs to walk on all samples to get the timestamps of
1022 	 * first/last samples.
1023 	 */
1024 	if (rec->buildid_all && !rec->timestamp_boundary)
1025 		rec->tool.sample = NULL;
1026 
1027 	return perf_session__process_events(session);
1028 }
1029 
1030 static void perf_event__synthesize_guest_os(struct machine *machine, void *data)
1031 {
1032 	int err;
1033 	struct perf_tool *tool = data;
1034 	/*
1035 	 *As for guest kernel when processing subcommand record&report,
1036 	 *we arrange module mmap prior to guest kernel mmap and trigger
1037 	 *a preload dso because default guest module symbols are loaded
1038 	 *from guest kallsyms instead of /lib/modules/XXX/XXX. This
1039 	 *method is used to avoid symbol missing when the first addr is
1040 	 *in module instead of in guest kernel.
1041 	 */
1042 	err = perf_event__synthesize_modules(tool, process_synthesized_event,
1043 					     machine);
1044 	if (err < 0)
1045 		pr_err("Couldn't record guest kernel [%d]'s reference"
1046 		       " relocation symbol.\n", machine->pid);
1047 
1048 	/*
1049 	 * We use _stext for guest kernel because guest kernel's /proc/kallsyms
1050 	 * have no _text sometimes.
1051 	 */
1052 	err = perf_event__synthesize_kernel_mmap(tool, process_synthesized_event,
1053 						 machine);
1054 	if (err < 0)
1055 		pr_err("Couldn't record guest kernel [%d]'s reference"
1056 		       " relocation symbol.\n", machine->pid);
1057 }
1058 
/* Header-only PERF_RECORD_FINISHED_ROUND event; its size is just the header. */
static struct perf_event_header finished_round_event = {
	.size = sizeof(struct perf_event_header),
	.type = PERF_RECORD_FINISHED_ROUND,
};
1063 
1064 static void record__adjust_affinity(struct record *rec, struct mmap *map)
1065 {
1066 	if (rec->opts.affinity != PERF_AFFINITY_SYS &&
1067 	    !bitmap_equal(rec->affinity_mask.bits, map->affinity_mask.bits,
1068 			  rec->affinity_mask.nbits)) {
1069 		bitmap_zero(rec->affinity_mask.bits, rec->affinity_mask.nbits);
1070 		bitmap_or(rec->affinity_mask.bits, rec->affinity_mask.bits,
1071 			  map->affinity_mask.bits, rec->affinity_mask.nbits);
1072 		sched_setaffinity(0, MMAP_CPU_MASK_BYTES(&rec->affinity_mask),
1073 				  (cpu_set_t *)rec->affinity_mask.bits);
1074 		if (verbose == 2)
1075 			mmap_cpu_mask__scnprintf(&rec->affinity_mask, "thread");
1076 	}
1077 }
1078 
1079 static size_t process_comp_header(void *record, size_t increment)
1080 {
1081 	struct perf_record_compressed *event = record;
1082 	size_t size = sizeof(*event);
1083 
1084 	if (increment) {
1085 		event->header.size += increment;
1086 		return increment;
1087 	}
1088 
1089 	event->header.type = PERF_RECORD_COMPRESSED;
1090 	event->header.size = size;
1091 
1092 	return size;
1093 }
1094 
1095 static size_t zstd_compress(struct perf_session *session, void *dst, size_t dst_size,
1096 			    void *src, size_t src_size)
1097 {
1098 	size_t compressed;
1099 	size_t max_record_size = PERF_SAMPLE_MAX_SIZE - sizeof(struct perf_record_compressed) - 1;
1100 
1101 	compressed = zstd_compress_stream_to_records(&session->zstd_data, dst, dst_size, src, src_size,
1102 						     max_record_size, process_comp_header);
1103 
1104 	session->bytes_transferred += src_size;
1105 	session->bytes_compressed  += compressed;
1106 
1107 	return compressed;
1108 }
1109 
1110 static int record__mmap_read_evlist(struct record *rec, struct evlist *evlist,
1111 				    bool overwrite, bool synch)
1112 {
1113 	u64 bytes_written = rec->bytes_written;
1114 	int i;
1115 	int rc = 0;
1116 	struct mmap *maps;
1117 	int trace_fd = rec->data.file.fd;
1118 	off_t off = 0;
1119 
1120 	if (!evlist)
1121 		return 0;
1122 
1123 	maps = overwrite ? evlist->overwrite_mmap : evlist->mmap;
1124 	if (!maps)
1125 		return 0;
1126 
1127 	if (overwrite && evlist->bkw_mmap_state != BKW_MMAP_DATA_PENDING)
1128 		return 0;
1129 
1130 	if (record__aio_enabled(rec))
1131 		off = record__aio_get_pos(trace_fd);
1132 
1133 	for (i = 0; i < evlist->core.nr_mmaps; i++) {
1134 		u64 flush = 0;
1135 		struct mmap *map = &maps[i];
1136 
1137 		if (map->core.base) {
1138 			record__adjust_affinity(rec, map);
1139 			if (synch) {
1140 				flush = map->core.flush;
1141 				map->core.flush = 1;
1142 			}
1143 			if (!record__aio_enabled(rec)) {
1144 				if (perf_mmap__push(map, rec, record__pushfn) < 0) {
1145 					if (synch)
1146 						map->core.flush = flush;
1147 					rc = -1;
1148 					goto out;
1149 				}
1150 			} else {
1151 				if (record__aio_push(rec, map, &off) < 0) {
1152 					record__aio_set_pos(trace_fd, off);
1153 					if (synch)
1154 						map->core.flush = flush;
1155 					rc = -1;
1156 					goto out;
1157 				}
1158 			}
1159 			if (synch)
1160 				map->core.flush = flush;
1161 		}
1162 
1163 		if (map->auxtrace_mmap.base && !rec->opts.auxtrace_snapshot_mode &&
1164 		    !rec->opts.auxtrace_sample_mode &&
1165 		    record__auxtrace_mmap_read(rec, map) != 0) {
1166 			rc = -1;
1167 			goto out;
1168 		}
1169 	}
1170 
1171 	if (record__aio_enabled(rec))
1172 		record__aio_set_pos(trace_fd, off);
1173 
1174 	/*
1175 	 * Mark the round finished in case we wrote
1176 	 * at least one event.
1177 	 */
1178 	if (bytes_written != rec->bytes_written)
1179 		rc = record__write(rec, NULL, &finished_round_event, sizeof(finished_round_event));
1180 
1181 	if (overwrite)
1182 		evlist__toggle_bkw_mmap(evlist, BKW_MMAP_EMPTY);
1183 out:
1184 	return rc;
1185 }
1186 
1187 static int record__mmap_read_all(struct record *rec, bool synch)
1188 {
1189 	int err;
1190 
1191 	err = record__mmap_read_evlist(rec, rec->evlist, false, synch);
1192 	if (err)
1193 		return err;
1194 
1195 	return record__mmap_read_evlist(rec, rec->evlist, true, synch);
1196 }
1197 
1198 static void record__init_features(struct record *rec)
1199 {
1200 	struct perf_session *session = rec->session;
1201 	int feat;
1202 
1203 	for (feat = HEADER_FIRST_FEATURE; feat < HEADER_LAST_FEATURE; feat++)
1204 		perf_header__set_feat(&session->header, feat);
1205 
1206 	if (rec->no_buildid)
1207 		perf_header__clear_feat(&session->header, HEADER_BUILD_ID);
1208 
1209 	if (!have_tracepoints(&rec->evlist->core.entries))
1210 		perf_header__clear_feat(&session->header, HEADER_TRACING_DATA);
1211 
1212 	if (!rec->opts.branch_stack)
1213 		perf_header__clear_feat(&session->header, HEADER_BRANCH_STACK);
1214 
1215 	if (!rec->opts.full_auxtrace)
1216 		perf_header__clear_feat(&session->header, HEADER_AUXTRACE);
1217 
1218 	if (!(rec->opts.use_clockid && rec->opts.clockid_res_ns))
1219 		perf_header__clear_feat(&session->header, HEADER_CLOCKID);
1220 
1221 	if (!rec->opts.use_clockid)
1222 		perf_header__clear_feat(&session->header, HEADER_CLOCK_DATA);
1223 
1224 	perf_header__clear_feat(&session->header, HEADER_DIR_FORMAT);
1225 	if (!record__comp_enabled(rec))
1226 		perf_header__clear_feat(&session->header, HEADER_COMPRESSED);
1227 
1228 	perf_header__clear_feat(&session->header, HEADER_STAT);
1229 }
1230 
1231 static void
1232 record__finish_output(struct record *rec)
1233 {
1234 	struct perf_data *data = &rec->data;
1235 	int fd = perf_data__fd(data);
1236 
1237 	if (data->is_pipe)
1238 		return;
1239 
1240 	rec->session->header.data_size += rec->bytes_written;
1241 	data->file.size = lseek(perf_data__fd(data), 0, SEEK_CUR);
1242 
1243 	if (!rec->no_buildid) {
1244 		process_buildids(rec);
1245 
1246 		if (rec->buildid_all)
1247 			dsos__hit_all(rec->session);
1248 	}
1249 	perf_session__write_header(rec->session, rec->evlist, fd, true);
1250 
1251 	return;
1252 }
1253 
1254 static int record__synthesize_workload(struct record *rec, bool tail)
1255 {
1256 	int err;
1257 	struct perf_thread_map *thread_map;
1258 
1259 	if (rec->opts.tail_synthesize != tail)
1260 		return 0;
1261 
1262 	thread_map = thread_map__new_by_tid(rec->evlist->workload.pid);
1263 	if (thread_map == NULL)
1264 		return -1;
1265 
1266 	err = perf_event__synthesize_thread_map(&rec->tool, thread_map,
1267 						 process_synthesized_event,
1268 						 &rec->session->machines.host,
1269 						 rec->opts.sample_address);
1270 	perf_thread_map__put(thread_map);
1271 	return err;
1272 }
1273 
1274 static int record__synthesize(struct record *rec, bool tail);
1275 
1276 static int
1277 record__switch_output(struct record *rec, bool at_exit)
1278 {
1279 	struct perf_data *data = &rec->data;
1280 	int fd, err;
1281 	char *new_filename;
1282 
1283 	/* Same Size:      "2015122520103046"*/
1284 	char timestamp[] = "InvalidTimestamp";
1285 
1286 	record__aio_mmap_read_sync(rec);
1287 
1288 	record__synthesize(rec, true);
1289 	if (target__none(&rec->opts.target))
1290 		record__synthesize_workload(rec, true);
1291 
1292 	rec->samples = 0;
1293 	record__finish_output(rec);
1294 	err = fetch_current_timestamp(timestamp, sizeof(timestamp));
1295 	if (err) {
1296 		pr_err("Failed to get current timestamp\n");
1297 		return -EINVAL;
1298 	}
1299 
1300 	fd = perf_data__switch(data, timestamp,
1301 				    rec->session->header.data_offset,
1302 				    at_exit, &new_filename);
1303 	if (fd >= 0 && !at_exit) {
1304 		rec->bytes_written = 0;
1305 		rec->session->header.data_size = 0;
1306 	}
1307 
1308 	if (!quiet)
1309 		fprintf(stderr, "[ perf record: Dump %s.%s ]\n",
1310 			data->path, timestamp);
1311 
1312 	if (rec->switch_output.num_files) {
1313 		int n = rec->switch_output.cur_file + 1;
1314 
1315 		if (n >= rec->switch_output.num_files)
1316 			n = 0;
1317 		rec->switch_output.cur_file = n;
1318 		if (rec->switch_output.filenames[n]) {
1319 			remove(rec->switch_output.filenames[n]);
1320 			zfree(&rec->switch_output.filenames[n]);
1321 		}
1322 		rec->switch_output.filenames[n] = new_filename;
1323 	} else {
1324 		free(new_filename);
1325 	}
1326 
1327 	/* Output tracking events */
1328 	if (!at_exit) {
1329 		record__synthesize(rec, false);
1330 
1331 		/*
1332 		 * In 'perf record --switch-output' without -a,
1333 		 * record__synthesize() in record__switch_output() won't
1334 		 * generate tracking events because there's no thread_map
1335 		 * in evlist. Which causes newly created perf.data doesn't
1336 		 * contain map and comm information.
1337 		 * Create a fake thread_map and directly call
1338 		 * perf_event__synthesize_thread_map() for those events.
1339 		 */
1340 		if (target__none(&rec->opts.target))
1341 			record__synthesize_workload(rec, false);
1342 	}
1343 	return fd;
1344 }
1345 
1346 static volatile int workload_exec_errno;
1347 
1348 /*
1349  * evlist__prepare_workload will send a SIGUSR1
1350  * if the fork fails, since we asked by setting its
1351  * want_signal to true.
1352  */
1353 static void workload_exec_failed_signal(int signo __maybe_unused,
1354 					siginfo_t *info,
1355 					void *ucontext __maybe_unused)
1356 {
1357 	workload_exec_errno = info->si_value.sival_int;
1358 	done = 1;
1359 	child_finished = 1;
1360 }
1361 
1362 static void snapshot_sig_handler(int sig);
1363 static void alarm_sig_handler(int sig);
1364 
1365 static const struct perf_event_mmap_page *evlist__pick_pc(struct evlist *evlist)
1366 {
1367 	if (evlist) {
1368 		if (evlist->mmap && evlist->mmap[0].core.base)
1369 			return evlist->mmap[0].core.base;
1370 		if (evlist->overwrite_mmap && evlist->overwrite_mmap[0].core.base)
1371 			return evlist->overwrite_mmap[0].core.base;
1372 	}
1373 	return NULL;
1374 }
1375 
1376 static const struct perf_event_mmap_page *record__pick_pc(struct record *rec)
1377 {
1378 	const struct perf_event_mmap_page *pc = evlist__pick_pc(rec->evlist);
1379 	if (pc)
1380 		return pc;
1381 	return NULL;
1382 }
1383 
1384 static int record__synthesize(struct record *rec, bool tail)
1385 {
1386 	struct perf_session *session = rec->session;
1387 	struct machine *machine = &session->machines.host;
1388 	struct perf_data *data = &rec->data;
1389 	struct record_opts *opts = &rec->opts;
1390 	struct perf_tool *tool = &rec->tool;
1391 	int err = 0;
1392 	event_op f = process_synthesized_event;
1393 
1394 	if (rec->opts.tail_synthesize != tail)
1395 		return 0;
1396 
1397 	if (data->is_pipe) {
1398 		err = perf_event__synthesize_for_pipe(tool, session, data,
1399 						      process_synthesized_event);
1400 		if (err < 0)
1401 			goto out;
1402 
1403 		rec->bytes_written += err;
1404 	}
1405 
1406 	err = perf_event__synth_time_conv(record__pick_pc(rec), tool,
1407 					  process_synthesized_event, machine);
1408 	if (err)
1409 		goto out;
1410 
1411 	/* Synthesize id_index before auxtrace_info */
1412 	if (rec->opts.auxtrace_sample_mode) {
1413 		err = perf_event__synthesize_id_index(tool,
1414 						      process_synthesized_event,
1415 						      session->evlist, machine);
1416 		if (err)
1417 			goto out;
1418 	}
1419 
1420 	if (rec->opts.full_auxtrace) {
1421 		err = perf_event__synthesize_auxtrace_info(rec->itr, tool,
1422 					session, process_synthesized_event);
1423 		if (err)
1424 			goto out;
1425 	}
1426 
1427 	if (!evlist__exclude_kernel(rec->evlist)) {
1428 		err = perf_event__synthesize_kernel_mmap(tool, process_synthesized_event,
1429 							 machine);
1430 		WARN_ONCE(err < 0, "Couldn't record kernel reference relocation symbol\n"
1431 				   "Symbol resolution may be skewed if relocation was used (e.g. kexec).\n"
1432 				   "Check /proc/kallsyms permission or run as root.\n");
1433 
1434 		err = perf_event__synthesize_modules(tool, process_synthesized_event,
1435 						     machine);
1436 		WARN_ONCE(err < 0, "Couldn't record kernel module information.\n"
1437 				   "Symbol resolution may be skewed if relocation was used (e.g. kexec).\n"
1438 				   "Check /proc/modules permission or run as root.\n");
1439 	}
1440 
1441 	if (perf_guest) {
1442 		machines__process_guests(&session->machines,
1443 					 perf_event__synthesize_guest_os, tool);
1444 	}
1445 
1446 	err = perf_event__synthesize_extra_attr(&rec->tool,
1447 						rec->evlist,
1448 						process_synthesized_event,
1449 						data->is_pipe);
1450 	if (err)
1451 		goto out;
1452 
1453 	err = perf_event__synthesize_thread_map2(&rec->tool, rec->evlist->core.threads,
1454 						 process_synthesized_event,
1455 						NULL);
1456 	if (err < 0) {
1457 		pr_err("Couldn't synthesize thread map.\n");
1458 		return err;
1459 	}
1460 
1461 	err = perf_event__synthesize_cpu_map(&rec->tool, rec->evlist->core.cpus,
1462 					     process_synthesized_event, NULL);
1463 	if (err < 0) {
1464 		pr_err("Couldn't synthesize cpu map.\n");
1465 		return err;
1466 	}
1467 
1468 	err = perf_event__synthesize_bpf_events(session, process_synthesized_event,
1469 						machine, opts);
1470 	if (err < 0)
1471 		pr_warning("Couldn't synthesize bpf events.\n");
1472 
1473 	err = perf_event__synthesize_cgroups(tool, process_synthesized_event,
1474 					     machine);
1475 	if (err < 0)
1476 		pr_warning("Couldn't synthesize cgroup events.\n");
1477 
1478 	if (rec->opts.nr_threads_synthesize > 1) {
1479 		perf_set_multithreaded();
1480 		f = process_locked_synthesized_event;
1481 	}
1482 
1483 	err = __machine__synthesize_threads(machine, tool, &opts->target, rec->evlist->core.threads,
1484 					    f, opts->sample_address,
1485 					    rec->opts.nr_threads_synthesize);
1486 
1487 	if (rec->opts.nr_threads_synthesize > 1)
1488 		perf_set_singlethreaded();
1489 
1490 out:
1491 	return err;
1492 }
1493 
1494 static int record__process_signal_event(union perf_event *event __maybe_unused, void *data)
1495 {
1496 	struct record *rec = data;
1497 	pthread_kill(rec->thread_id, SIGUSR2);
1498 	return 0;
1499 }
1500 
1501 static int record__setup_sb_evlist(struct record *rec)
1502 {
1503 	struct record_opts *opts = &rec->opts;
1504 
1505 	if (rec->sb_evlist != NULL) {
1506 		/*
1507 		 * We get here if --switch-output-event populated the
1508 		 * sb_evlist, so associate a callback that will send a SIGUSR2
1509 		 * to the main thread.
1510 		 */
1511 		evlist__set_cb(rec->sb_evlist, record__process_signal_event, rec);
1512 		rec->thread_id = pthread_self();
1513 	}
1514 #ifdef HAVE_LIBBPF_SUPPORT
1515 	if (!opts->no_bpf_event) {
1516 		if (rec->sb_evlist == NULL) {
1517 			rec->sb_evlist = evlist__new();
1518 
1519 			if (rec->sb_evlist == NULL) {
1520 				pr_err("Couldn't create side band evlist.\n.");
1521 				return -1;
1522 			}
1523 		}
1524 
1525 		if (evlist__add_bpf_sb_event(rec->sb_evlist, &rec->session->header.env)) {
1526 			pr_err("Couldn't ask for PERF_RECORD_BPF_EVENT side band events.\n.");
1527 			return -1;
1528 		}
1529 	}
1530 #endif
1531 	if (evlist__start_sb_thread(rec->sb_evlist, &rec->opts.target)) {
1532 		pr_debug("Couldn't start the BPF side band thread:\nBPF programs starting from now on won't be annotatable\n");
1533 		opts->no_bpf_event = true;
1534 	}
1535 
1536 	return 0;
1537 }
1538 
1539 static int record__init_clock(struct record *rec)
1540 {
1541 	struct perf_session *session = rec->session;
1542 	struct timespec ref_clockid;
1543 	struct timeval ref_tod;
1544 	u64 ref;
1545 
1546 	if (!rec->opts.use_clockid)
1547 		return 0;
1548 
1549 	if (rec->opts.use_clockid && rec->opts.clockid_res_ns)
1550 		session->header.env.clock.clockid_res_ns = rec->opts.clockid_res_ns;
1551 
1552 	session->header.env.clock.clockid = rec->opts.clockid;
1553 
1554 	if (gettimeofday(&ref_tod, NULL) != 0) {
1555 		pr_err("gettimeofday failed, cannot set reference time.\n");
1556 		return -1;
1557 	}
1558 
1559 	if (clock_gettime(rec->opts.clockid, &ref_clockid)) {
1560 		pr_err("clock_gettime failed, cannot set reference time.\n");
1561 		return -1;
1562 	}
1563 
1564 	ref = (u64) ref_tod.tv_sec * NSEC_PER_SEC +
1565 	      (u64) ref_tod.tv_usec * NSEC_PER_USEC;
1566 
1567 	session->header.env.clock.tod_ns = ref;
1568 
1569 	ref = (u64) ref_clockid.tv_sec * NSEC_PER_SEC +
1570 	      (u64) ref_clockid.tv_nsec;
1571 
1572 	session->header.env.clock.clockid_ns = ref;
1573 	return 0;
1574 }
1575 
1576 static void hit_auxtrace_snapshot_trigger(struct record *rec)
1577 {
1578 	if (trigger_is_ready(&auxtrace_snapshot_trigger)) {
1579 		trigger_hit(&auxtrace_snapshot_trigger);
1580 		auxtrace_record__snapshot_started = 1;
1581 		if (auxtrace_record__snapshot_start(rec->itr))
1582 			trigger_error(&auxtrace_snapshot_trigger);
1583 	}
1584 }
1585 
1586 static void record__uniquify_name(struct record *rec)
1587 {
1588 	struct evsel *pos;
1589 	struct evlist *evlist = rec->evlist;
1590 	char *new_name;
1591 	int ret;
1592 
1593 	if (!perf_pmu__has_hybrid())
1594 		return;
1595 
1596 	evlist__for_each_entry(evlist, pos) {
1597 		if (!evsel__is_hybrid(pos))
1598 			continue;
1599 
1600 		if (strchr(pos->name, '/'))
1601 			continue;
1602 
1603 		ret = asprintf(&new_name, "%s/%s/",
1604 			       pos->pmu_name, pos->name);
1605 		if (ret) {
1606 			free(pos->name);
1607 			pos->name = new_name;
1608 		}
1609 	}
1610 }
1611 
1612 static int __cmd_record(struct record *rec, int argc, const char **argv)
1613 {
1614 	int err;
1615 	int status = 0;
1616 	unsigned long waking = 0;
1617 	const bool forks = argc > 0;
1618 	struct perf_tool *tool = &rec->tool;
1619 	struct record_opts *opts = &rec->opts;
1620 	struct perf_data *data = &rec->data;
1621 	struct perf_session *session;
1622 	bool disabled = false, draining = false;
1623 	int fd;
1624 	float ratio = 0;
1625 	enum evlist_ctl_cmd cmd = EVLIST_CTL_CMD_UNSUPPORTED;
1626 
1627 	atexit(record__sig_exit);
1628 	signal(SIGCHLD, sig_handler);
1629 	signal(SIGINT, sig_handler);
1630 	signal(SIGTERM, sig_handler);
1631 	signal(SIGSEGV, sigsegv_handler);
1632 
1633 	if (rec->opts.record_namespaces)
1634 		tool->namespace_events = true;
1635 
1636 	if (rec->opts.record_cgroup) {
1637 #ifdef HAVE_FILE_HANDLE
1638 		tool->cgroup_events = true;
1639 #else
1640 		pr_err("cgroup tracking is not supported\n");
1641 		return -1;
1642 #endif
1643 	}
1644 
1645 	if (rec->opts.auxtrace_snapshot_mode || rec->switch_output.enabled) {
1646 		signal(SIGUSR2, snapshot_sig_handler);
1647 		if (rec->opts.auxtrace_snapshot_mode)
1648 			trigger_on(&auxtrace_snapshot_trigger);
1649 		if (rec->switch_output.enabled)
1650 			trigger_on(&switch_output_trigger);
1651 	} else {
1652 		signal(SIGUSR2, SIG_IGN);
1653 	}
1654 
1655 	session = perf_session__new(data, tool);
1656 	if (IS_ERR(session)) {
1657 		pr_err("Perf session creation failed.\n");
1658 		return PTR_ERR(session);
1659 	}
1660 
1661 	fd = perf_data__fd(data);
1662 	rec->session = session;
1663 
1664 	if (zstd_init(&session->zstd_data, rec->opts.comp_level) < 0) {
1665 		pr_err("Compression initialization failed.\n");
1666 		return -1;
1667 	}
1668 #ifdef HAVE_EVENTFD_SUPPORT
1669 	done_fd = eventfd(0, EFD_NONBLOCK);
1670 	if (done_fd < 0) {
1671 		pr_err("Failed to create wakeup eventfd, error: %m\n");
1672 		status = -1;
1673 		goto out_delete_session;
1674 	}
1675 	err = evlist__add_wakeup_eventfd(rec->evlist, done_fd);
1676 	if (err < 0) {
1677 		pr_err("Failed to add wakeup eventfd to poll list\n");
1678 		status = err;
1679 		goto out_delete_session;
1680 	}
1681 #endif // HAVE_EVENTFD_SUPPORT
1682 
1683 	session->header.env.comp_type  = PERF_COMP_ZSTD;
1684 	session->header.env.comp_level = rec->opts.comp_level;
1685 
1686 	if (rec->opts.kcore &&
1687 	    !record__kcore_readable(&session->machines.host)) {
1688 		pr_err("ERROR: kcore is not readable.\n");
1689 		return -1;
1690 	}
1691 
1692 	if (record__init_clock(rec))
1693 		return -1;
1694 
1695 	record__init_features(rec);
1696 
1697 	if (forks) {
1698 		err = evlist__prepare_workload(rec->evlist, &opts->target, argv, data->is_pipe,
1699 					       workload_exec_failed_signal);
1700 		if (err < 0) {
1701 			pr_err("Couldn't run the workload!\n");
1702 			status = err;
1703 			goto out_delete_session;
1704 		}
1705 	}
1706 
1707 	/*
1708 	 * If we have just single event and are sending data
1709 	 * through pipe, we need to force the ids allocation,
1710 	 * because we synthesize event name through the pipe
1711 	 * and need the id for that.
1712 	 */
1713 	if (data->is_pipe && rec->evlist->core.nr_entries == 1)
1714 		rec->opts.sample_id = true;
1715 
1716 	record__uniquify_name(rec);
1717 
1718 	if (record__open(rec) != 0) {
1719 		err = -1;
1720 		goto out_child;
1721 	}
1722 	session->header.env.comp_mmap_len = session->evlist->core.mmap_len;
1723 
1724 	if (rec->opts.kcore) {
1725 		err = record__kcore_copy(&session->machines.host, data);
1726 		if (err) {
1727 			pr_err("ERROR: Failed to copy kcore\n");
1728 			goto out_child;
1729 		}
1730 	}
1731 
1732 	err = bpf__apply_obj_config();
1733 	if (err) {
1734 		char errbuf[BUFSIZ];
1735 
1736 		bpf__strerror_apply_obj_config(err, errbuf, sizeof(errbuf));
1737 		pr_err("ERROR: Apply config to BPF failed: %s\n",
1738 			 errbuf);
1739 		goto out_child;
1740 	}
1741 
1742 	/*
1743 	 * Normally perf_session__new would do this, but it doesn't have the
1744 	 * evlist.
1745 	 */
1746 	if (rec->tool.ordered_events && !evlist__sample_id_all(rec->evlist)) {
1747 		pr_warning("WARNING: No sample_id_all support, falling back to unordered processing\n");
1748 		rec->tool.ordered_events = false;
1749 	}
1750 
1751 	if (!rec->evlist->core.nr_groups)
1752 		perf_header__clear_feat(&session->header, HEADER_GROUP_DESC);
1753 
1754 	if (data->is_pipe) {
1755 		err = perf_header__write_pipe(fd);
1756 		if (err < 0)
1757 			goto out_child;
1758 	} else {
1759 		err = perf_session__write_header(session, rec->evlist, fd, false);
1760 		if (err < 0)
1761 			goto out_child;
1762 	}
1763 
1764 	err = -1;
1765 	if (!rec->no_buildid
1766 	    && !perf_header__has_feat(&session->header, HEADER_BUILD_ID)) {
1767 		pr_err("Couldn't generate buildids. "
1768 		       "Use --no-buildid to profile anyway.\n");
1769 		goto out_child;
1770 	}
1771 
1772 	err = record__setup_sb_evlist(rec);
1773 	if (err)
1774 		goto out_child;
1775 
1776 	err = record__synthesize(rec, false);
1777 	if (err < 0)
1778 		goto out_child;
1779 
1780 	if (rec->realtime_prio) {
1781 		struct sched_param param;
1782 
1783 		param.sched_priority = rec->realtime_prio;
1784 		if (sched_setscheduler(0, SCHED_FIFO, &param)) {
1785 			pr_err("Could not set realtime priority.\n");
1786 			err = -1;
1787 			goto out_child;
1788 		}
1789 	}
1790 
1791 	/*
1792 	 * When perf is starting the traced process, all the events
1793 	 * (apart from group members) have enable_on_exec=1 set,
1794 	 * so don't spoil it by prematurely enabling them.
1795 	 */
1796 	if (!target__none(&opts->target) && !opts->initial_delay)
1797 		evlist__enable(rec->evlist);
1798 
1799 	/*
1800 	 * Let the child rip
1801 	 */
1802 	if (forks) {
1803 		struct machine *machine = &session->machines.host;
1804 		union perf_event *event;
1805 		pid_t tgid;
1806 
1807 		event = malloc(sizeof(event->comm) + machine->id_hdr_size);
1808 		if (event == NULL) {
1809 			err = -ENOMEM;
1810 			goto out_child;
1811 		}
1812 
1813 		/*
1814 		 * Some H/W events are generated before COMM event
1815 		 * which is emitted during exec(), so perf script
1816 		 * cannot see a correct process name for those events.
1817 		 * Synthesize COMM event to prevent it.
1818 		 */
1819 		tgid = perf_event__synthesize_comm(tool, event,
1820 						   rec->evlist->workload.pid,
1821 						   process_synthesized_event,
1822 						   machine);
1823 		free(event);
1824 
1825 		if (tgid == -1)
1826 			goto out_child;
1827 
1828 		event = malloc(sizeof(event->namespaces) +
1829 			       (NR_NAMESPACES * sizeof(struct perf_ns_link_info)) +
1830 			       machine->id_hdr_size);
1831 		if (event == NULL) {
1832 			err = -ENOMEM;
1833 			goto out_child;
1834 		}
1835 
1836 		/*
1837 		 * Synthesize NAMESPACES event for the command specified.
1838 		 */
1839 		perf_event__synthesize_namespaces(tool, event,
1840 						  rec->evlist->workload.pid,
1841 						  tgid, process_synthesized_event,
1842 						  machine);
1843 		free(event);
1844 
1845 		evlist__start_workload(rec->evlist);
1846 	}
1847 
1848 	if (evlist__initialize_ctlfd(rec->evlist, opts->ctl_fd, opts->ctl_fd_ack))
1849 		goto out_child;
1850 
1851 	if (opts->initial_delay) {
1852 		pr_info(EVLIST_DISABLED_MSG);
1853 		if (opts->initial_delay > 0) {
1854 			usleep(opts->initial_delay * USEC_PER_MSEC);
1855 			evlist__enable(rec->evlist);
1856 			pr_info(EVLIST_ENABLED_MSG);
1857 		}
1858 	}
1859 
1860 	trigger_ready(&auxtrace_snapshot_trigger);
1861 	trigger_ready(&switch_output_trigger);
1862 	perf_hooks__invoke_record_start();
1863 	for (;;) {
1864 		unsigned long long hits = rec->samples;
1865 
1866 		/*
1867 		 * rec->evlist->bkw_mmap_state is possible to be
1868 		 * BKW_MMAP_EMPTY here: when done == true and
1869 		 * hits != rec->samples in previous round.
1870 		 *
1871 		 * evlist__toggle_bkw_mmap ensure we never
1872 		 * convert BKW_MMAP_EMPTY to BKW_MMAP_DATA_PENDING.
1873 		 */
1874 		if (trigger_is_hit(&switch_output_trigger) || done || draining)
1875 			evlist__toggle_bkw_mmap(rec->evlist, BKW_MMAP_DATA_PENDING);
1876 
1877 		if (record__mmap_read_all(rec, false) < 0) {
1878 			trigger_error(&auxtrace_snapshot_trigger);
1879 			trigger_error(&switch_output_trigger);
1880 			err = -1;
1881 			goto out_child;
1882 		}
1883 
1884 		if (auxtrace_record__snapshot_started) {
1885 			auxtrace_record__snapshot_started = 0;
1886 			if (!trigger_is_error(&auxtrace_snapshot_trigger))
1887 				record__read_auxtrace_snapshot(rec, false);
1888 			if (trigger_is_error(&auxtrace_snapshot_trigger)) {
1889 				pr_err("AUX area tracing snapshot failed\n");
1890 				err = -1;
1891 				goto out_child;
1892 			}
1893 		}
1894 
1895 		if (trigger_is_hit(&switch_output_trigger)) {
1896 			/*
1897 			 * If switch_output_trigger is hit, the data in
1898 			 * overwritable ring buffer should have been collected,
1899 			 * so bkw_mmap_state should be set to BKW_MMAP_EMPTY.
1900 			 *
1901 			 * If SIGUSR2 raise after or during record__mmap_read_all(),
1902 			 * record__mmap_read_all() didn't collect data from
1903 			 * overwritable ring buffer. Read again.
1904 			 */
1905 			if (rec->evlist->bkw_mmap_state == BKW_MMAP_RUNNING)
1906 				continue;
1907 			trigger_ready(&switch_output_trigger);
1908 
1909 			/*
1910 			 * Reenable events in overwrite ring buffer after
1911 			 * record__mmap_read_all(): we should have collected
1912 			 * data from it.
1913 			 */
1914 			evlist__toggle_bkw_mmap(rec->evlist, BKW_MMAP_RUNNING);
1915 
1916 			if (!quiet)
1917 				fprintf(stderr, "[ perf record: dump data: Woken up %ld times ]\n",
1918 					waking);
1919 			waking = 0;
1920 			fd = record__switch_output(rec, false);
1921 			if (fd < 0) {
1922 				pr_err("Failed to switch to new file\n");
1923 				trigger_error(&switch_output_trigger);
1924 				err = fd;
1925 				goto out_child;
1926 			}
1927 
1928 			/* re-arm the alarm */
1929 			if (rec->switch_output.time)
1930 				alarm(rec->switch_output.time);
1931 		}
1932 
1933 		if (hits == rec->samples) {
1934 			if (done || draining)
1935 				break;
1936 			err = evlist__poll(rec->evlist, -1);
1937 			/*
1938 			 * Propagate error, only if there's any. Ignore positive
1939 			 * number of returned events and interrupt error.
1940 			 */
1941 			if (err > 0 || (err < 0 && errno == EINTR))
1942 				err = 0;
1943 			waking++;
1944 
1945 			if (evlist__filter_pollfd(rec->evlist, POLLERR | POLLHUP) == 0)
1946 				draining = true;
1947 		}
1948 
1949 		if (evlist__ctlfd_process(rec->evlist, &cmd) > 0) {
1950 			switch (cmd) {
1951 			case EVLIST_CTL_CMD_SNAPSHOT:
1952 				hit_auxtrace_snapshot_trigger(rec);
1953 				evlist__ctlfd_ack(rec->evlist);
1954 				break;
1955 			case EVLIST_CTL_CMD_STOP:
1956 				done = 1;
1957 				break;
1958 			case EVLIST_CTL_CMD_ACK:
1959 			case EVLIST_CTL_CMD_UNSUPPORTED:
1960 			case EVLIST_CTL_CMD_ENABLE:
1961 			case EVLIST_CTL_CMD_DISABLE:
1962 			case EVLIST_CTL_CMD_EVLIST:
1963 			case EVLIST_CTL_CMD_PING:
1964 			default:
1965 				break;
1966 			}
1967 		}
1968 
1969 		/*
1970 		 * When perf is starting the traced process, at the end events
1971 		 * die with the process and we wait for that. Thus no need to
1972 		 * disable events in this case.
1973 		 */
1974 		if (done && !disabled && !target__none(&opts->target)) {
1975 			trigger_off(&auxtrace_snapshot_trigger);
1976 			evlist__disable(rec->evlist);
1977 			disabled = true;
1978 		}
1979 	}
1980 
1981 	trigger_off(&auxtrace_snapshot_trigger);
1982 	trigger_off(&switch_output_trigger);
1983 
1984 	if (opts->auxtrace_snapshot_on_exit)
1985 		record__auxtrace_snapshot_exit(rec);
1986 
1987 	if (forks && workload_exec_errno) {
1988 		char msg[STRERR_BUFSIZE], strevsels[2048];
1989 		const char *emsg = str_error_r(workload_exec_errno, msg, sizeof(msg));
1990 
1991 		evlist__scnprintf_evsels(rec->evlist, sizeof(strevsels), strevsels);
1992 
1993 		pr_err("Failed to collect '%s' for the '%s' workload: %s\n",
1994 			strevsels, argv[0], emsg);
1995 		err = -1;
1996 		goto out_child;
1997 	}
1998 
1999 	if (!quiet)
2000 		fprintf(stderr, "[ perf record: Woken up %ld times to write data ]\n", waking);
2001 
2002 	if (target__none(&rec->opts.target))
2003 		record__synthesize_workload(rec, true);
2004 
2005 out_child:
2006 	evlist__finalize_ctlfd(rec->evlist);
2007 	record__mmap_read_all(rec, true);
2008 	record__aio_mmap_read_sync(rec);
2009 
2010 	if (rec->session->bytes_transferred && rec->session->bytes_compressed) {
2011 		ratio = (float)rec->session->bytes_transferred/(float)rec->session->bytes_compressed;
2012 		session->header.env.comp_ratio = ratio + 0.5;
2013 	}
2014 
2015 	if (forks) {
2016 		int exit_status;
2017 
2018 		if (!child_finished)
2019 			kill(rec->evlist->workload.pid, SIGTERM);
2020 
2021 		wait(&exit_status);
2022 
2023 		if (err < 0)
2024 			status = err;
2025 		else if (WIFEXITED(exit_status))
2026 			status = WEXITSTATUS(exit_status);
2027 		else if (WIFSIGNALED(exit_status))
2028 			signr = WTERMSIG(exit_status);
2029 	} else
2030 		status = err;
2031 
2032 	record__synthesize(rec, true);
2033 	/* this will be recalculated during process_buildids() */
2034 	rec->samples = 0;
2035 
2036 	if (!err) {
2037 		if (!rec->timestamp_filename) {
2038 			record__finish_output(rec);
2039 		} else {
2040 			fd = record__switch_output(rec, true);
2041 			if (fd < 0) {
2042 				status = fd;
2043 				goto out_delete_session;
2044 			}
2045 		}
2046 	}
2047 
2048 	perf_hooks__invoke_record_end();
2049 
2050 	if (!err && !quiet) {
2051 		char samples[128];
2052 		const char *postfix = rec->timestamp_filename ?
2053 					".<timestamp>" : "";
2054 
2055 		if (rec->samples && !rec->opts.full_auxtrace)
2056 			scnprintf(samples, sizeof(samples),
2057 				  " (%" PRIu64 " samples)", rec->samples);
2058 		else
2059 			samples[0] = '\0';
2060 
2061 		fprintf(stderr,	"[ perf record: Captured and wrote %.3f MB %s%s%s",
2062 			perf_data__size(data) / 1024.0 / 1024.0,
2063 			data->path, postfix, samples);
2064 		if (ratio) {
2065 			fprintf(stderr,	", compressed (original %.3f MB, ratio is %.3f)",
2066 					rec->session->bytes_transferred / 1024.0 / 1024.0,
2067 					ratio);
2068 		}
2069 		fprintf(stderr, " ]\n");
2070 	}
2071 
2072 out_delete_session:
2073 #ifdef HAVE_EVENTFD_SUPPORT
2074 	if (done_fd >= 0)
2075 		close(done_fd);
2076 #endif
2077 	zstd_fini(&session->zstd_data);
2078 	perf_session__delete(session);
2079 
2080 	if (!opts->no_bpf_event)
2081 		evlist__stop_sb_thread(rec->sb_evlist);
2082 	return status;
2083 }
2084 
2085 static void callchain_debug(struct callchain_param *callchain)
2086 {
2087 	static const char *str[CALLCHAIN_MAX] = { "NONE", "FP", "DWARF", "LBR" };
2088 
2089 	pr_debug("callchain: type %s\n", str[callchain->record_mode]);
2090 
2091 	if (callchain->record_mode == CALLCHAIN_DWARF)
2092 		pr_debug("callchain: stack dump size %d\n",
2093 			 callchain->dump_size);
2094 }
2095 
2096 int record_opts__parse_callchain(struct record_opts *record,
2097 				 struct callchain_param *callchain,
2098 				 const char *arg, bool unset)
2099 {
2100 	int ret;
2101 	callchain->enabled = !unset;
2102 
2103 	/* --no-call-graph */
2104 	if (unset) {
2105 		callchain->record_mode = CALLCHAIN_NONE;
2106 		pr_debug("callchain: disabled\n");
2107 		return 0;
2108 	}
2109 
2110 	ret = parse_callchain_record_opt(arg, callchain);
2111 	if (!ret) {
2112 		/* Enable data address sampling for DWARF unwind. */
2113 		if (callchain->record_mode == CALLCHAIN_DWARF)
2114 			record->sample_address = true;
2115 		callchain_debug(callchain);
2116 	}
2117 
2118 	return ret;
2119 }
2120 
2121 int record_parse_callchain_opt(const struct option *opt,
2122 			       const char *arg,
2123 			       int unset)
2124 {
2125 	return record_opts__parse_callchain(opt->value, &callchain_param, arg, unset);
2126 }
2127 
2128 int record_callchain_opt(const struct option *opt,
2129 			 const char *arg __maybe_unused,
2130 			 int unset __maybe_unused)
2131 {
2132 	struct callchain_param *callchain = opt->value;
2133 
2134 	callchain->enabled = true;
2135 
2136 	if (callchain->record_mode == CALLCHAIN_NONE)
2137 		callchain->record_mode = CALLCHAIN_FP;
2138 
2139 	callchain_debug(callchain);
2140 	return 0;
2141 }
2142 
2143 static int perf_record_config(const char *var, const char *value, void *cb)
2144 {
2145 	struct record *rec = cb;
2146 
2147 	if (!strcmp(var, "record.build-id")) {
2148 		if (!strcmp(value, "cache"))
2149 			rec->no_buildid_cache = false;
2150 		else if (!strcmp(value, "no-cache"))
2151 			rec->no_buildid_cache = true;
2152 		else if (!strcmp(value, "skip"))
2153 			rec->no_buildid = true;
2154 		else if (!strcmp(value, "mmap"))
2155 			rec->buildid_mmap = true;
2156 		else
2157 			return -1;
2158 		return 0;
2159 	}
2160 	if (!strcmp(var, "record.call-graph")) {
2161 		var = "call-graph.record-mode";
2162 		return perf_default_config(var, value, cb);
2163 	}
2164 #ifdef HAVE_AIO_SUPPORT
2165 	if (!strcmp(var, "record.aio")) {
2166 		rec->opts.nr_cblocks = strtol(value, NULL, 0);
2167 		if (!rec->opts.nr_cblocks)
2168 			rec->opts.nr_cblocks = nr_cblocks_default;
2169 	}
2170 #endif
2171 
2172 	return 0;
2173 }
2174 
2175 
2176 static int record__parse_affinity(const struct option *opt, const char *str, int unset)
2177 {
2178 	struct record_opts *opts = (struct record_opts *)opt->value;
2179 
2180 	if (unset || !str)
2181 		return 0;
2182 
2183 	if (!strcasecmp(str, "node"))
2184 		opts->affinity = PERF_AFFINITY_NODE;
2185 	else if (!strcasecmp(str, "cpu"))
2186 		opts->affinity = PERF_AFFINITY_CPU;
2187 
2188 	return 0;
2189 }
2190 
2191 static int parse_output_max_size(const struct option *opt,
2192 				 const char *str, int unset)
2193 {
2194 	unsigned long *s = (unsigned long *)opt->value;
2195 	static struct parse_tag tags_size[] = {
2196 		{ .tag  = 'B', .mult = 1       },
2197 		{ .tag  = 'K', .mult = 1 << 10 },
2198 		{ .tag  = 'M', .mult = 1 << 20 },
2199 		{ .tag  = 'G', .mult = 1 << 30 },
2200 		{ .tag  = 0 },
2201 	};
2202 	unsigned long val;
2203 
2204 	if (unset) {
2205 		*s = 0;
2206 		return 0;
2207 	}
2208 
2209 	val = parse_tag_value(str, tags_size);
2210 	if (val != (unsigned long) -1) {
2211 		*s = val;
2212 		return 0;
2213 	}
2214 
2215 	return -1;
2216 }
2217 
2218 static int record__parse_mmap_pages(const struct option *opt,
2219 				    const char *str,
2220 				    int unset __maybe_unused)
2221 {
2222 	struct record_opts *opts = opt->value;
2223 	char *s, *p;
2224 	unsigned int mmap_pages;
2225 	int ret;
2226 
2227 	if (!str)
2228 		return -EINVAL;
2229 
2230 	s = strdup(str);
2231 	if (!s)
2232 		return -ENOMEM;
2233 
2234 	p = strchr(s, ',');
2235 	if (p)
2236 		*p = '\0';
2237 
2238 	if (*s) {
2239 		ret = __evlist__parse_mmap_pages(&mmap_pages, s);
2240 		if (ret)
2241 			goto out_free;
2242 		opts->mmap_pages = mmap_pages;
2243 	}
2244 
2245 	if (!p) {
2246 		ret = 0;
2247 		goto out_free;
2248 	}
2249 
2250 	ret = __evlist__parse_mmap_pages(&mmap_pages, p + 1);
2251 	if (ret)
2252 		goto out_free;
2253 
2254 	opts->auxtrace_mmap_pages = mmap_pages;
2255 
2256 out_free:
2257 	free(s);
2258 	return ret;
2259 }
2260 
2261 static int parse_control_option(const struct option *opt,
2262 				const char *str,
2263 				int unset __maybe_unused)
2264 {
2265 	struct record_opts *opts = opt->value;
2266 
2267 	return evlist__parse_control(str, &opts->ctl_fd, &opts->ctl_fd_ack, &opts->ctl_fd_close);
2268 }
2269 
2270 static void switch_output_size_warn(struct record *rec)
2271 {
2272 	u64 wakeup_size = evlist__mmap_size(rec->opts.mmap_pages);
2273 	struct switch_output *s = &rec->switch_output;
2274 
2275 	wakeup_size /= 2;
2276 
2277 	if (s->size < wakeup_size) {
2278 		char buf[100];
2279 
2280 		unit_number__scnprintf(buf, sizeof(buf), wakeup_size);
2281 		pr_warning("WARNING: switch-output data size lower than "
2282 			   "wakeup kernel buffer size (%s) "
2283 			   "expect bigger perf.data sizes\n", buf);
2284 	}
2285 }
2286 
2287 static int switch_output_setup(struct record *rec)
2288 {
2289 	struct switch_output *s = &rec->switch_output;
2290 	static struct parse_tag tags_size[] = {
2291 		{ .tag  = 'B', .mult = 1       },
2292 		{ .tag  = 'K', .mult = 1 << 10 },
2293 		{ .tag  = 'M', .mult = 1 << 20 },
2294 		{ .tag  = 'G', .mult = 1 << 30 },
2295 		{ .tag  = 0 },
2296 	};
2297 	static struct parse_tag tags_time[] = {
2298 		{ .tag  = 's', .mult = 1        },
2299 		{ .tag  = 'm', .mult = 60       },
2300 		{ .tag  = 'h', .mult = 60*60    },
2301 		{ .tag  = 'd', .mult = 60*60*24 },
2302 		{ .tag  = 0 },
2303 	};
2304 	unsigned long val;
2305 
2306 	/*
2307 	 * If we're using --switch-output-events, then we imply its
2308 	 * --switch-output=signal, as we'll send a SIGUSR2 from the side band
2309 	 *  thread to its parent.
2310 	 */
2311 	if (rec->switch_output_event_set)
2312 		goto do_signal;
2313 
2314 	if (!s->set)
2315 		return 0;
2316 
2317 	if (!strcmp(s->str, "signal")) {
2318 do_signal:
2319 		s->signal = true;
2320 		pr_debug("switch-output with SIGUSR2 signal\n");
2321 		goto enabled;
2322 	}
2323 
2324 	val = parse_tag_value(s->str, tags_size);
2325 	if (val != (unsigned long) -1) {
2326 		s->size = val;
2327 		pr_debug("switch-output with %s size threshold\n", s->str);
2328 		goto enabled;
2329 	}
2330 
2331 	val = parse_tag_value(s->str, tags_time);
2332 	if (val != (unsigned long) -1) {
2333 		s->time = val;
2334 		pr_debug("switch-output with %s time threshold (%lu seconds)\n",
2335 			 s->str, s->time);
2336 		goto enabled;
2337 	}
2338 
2339 	return -1;
2340 
2341 enabled:
2342 	rec->timestamp_filename = true;
2343 	s->enabled              = true;
2344 
2345 	if (s->size && !rec->opts.no_buffering)
2346 		switch_output_size_warn(rec);
2347 
2348 	return 0;
2349 }
2350 
/* NULL-terminated usage strings for 'perf record'. */
static const char * const __record_usage[] = {
	"perf record [<options>] [<command>]",
	"perf record [<options>] -- <command> [<options>]",
	NULL,
};

/* Non-static pointer to the usage strings above. */
const char * const *record_usage = __record_usage;
2357 
2358 static int build_id__process_mmap(struct perf_tool *tool, union perf_event *event,
2359 				  struct perf_sample *sample, struct machine *machine)
2360 {
2361 	/*
2362 	 * We already have the kernel maps, put in place via perf_session__create_kernel_maps()
2363 	 * no need to add them twice.
2364 	 */
2365 	if (!(event->header.misc & PERF_RECORD_MISC_USER))
2366 		return 0;
2367 	return perf_event__process_mmap(tool, event, sample, machine);
2368 }
2369 
2370 static int build_id__process_mmap2(struct perf_tool *tool, union perf_event *event,
2371 				   struct perf_sample *sample, struct machine *machine)
2372 {
2373 	/*
2374 	 * We already have the kernel maps, put in place via perf_session__create_kernel_maps()
2375 	 * no need to add them twice.
2376 	 */
2377 	if (!(event->header.misc & PERF_RECORD_MISC_USER))
2378 		return 0;
2379 
2380 	return perf_event__process_mmap2(tool, event, sample, machine);
2381 }
2382 
2383 static int process_timestamp_boundary(struct perf_tool *tool,
2384 				      union perf_event *event __maybe_unused,
2385 				      struct perf_sample *sample,
2386 				      struct machine *machine __maybe_unused)
2387 {
2388 	struct record *rec = container_of(tool, struct record, tool);
2389 
2390 	set_timestamp_boundary(rec, sample->time);
2391 	return 0;
2392 }
2393 
2394 /*
2395  * XXX Ideally would be local to cmd_record() and passed to a record__new
2396  * because we need to have access to it in record__exit, that is called
2397  * after cmd_record() exits, but since record_options need to be accessible to
2398  * builtin-script, leave it here.
2399  *
2400  * At least we don't ouch it in all the other functions here directly.
2401  *
2402  * Just say no to tons of global variables, sigh.
2403  */
2404 static struct record record = {
2405 	.opts = {
2406 		.sample_time	     = true,
2407 		.mmap_pages	     = UINT_MAX,
2408 		.user_freq	     = UINT_MAX,
2409 		.user_interval	     = ULLONG_MAX,
2410 		.freq		     = 4000,
2411 		.target		     = {
2412 			.uses_mmap   = true,
2413 			.default_per_cpu = true,
2414 		},
2415 		.mmap_flush          = MMAP_FLUSH_DEFAULT,
2416 		.nr_threads_synthesize = 1,
2417 		.ctl_fd              = -1,
2418 		.ctl_fd_ack          = -1,
2419 	},
2420 	.tool = {
2421 		.sample		= process_sample_event,
2422 		.fork		= perf_event__process_fork,
2423 		.exit		= perf_event__process_exit,
2424 		.comm		= perf_event__process_comm,
2425 		.namespaces	= perf_event__process_namespaces,
2426 		.mmap		= build_id__process_mmap,
2427 		.mmap2		= build_id__process_mmap2,
2428 		.itrace_start	= process_timestamp_boundary,
2429 		.aux		= process_timestamp_boundary,
2430 		.ordered_events	= true,
2431 	},
2432 };
2433 
2434 const char record_callchain_help[] = CALLCHAIN_RECORD_HELP
2435 	"\n\t\t\t\tDefault: fp";
2436 
2437 static bool dry_run;
2438 
2439 /*
2440  * XXX Will stay a global variable till we fix builtin-script.c to stop messing
2441  * with it and switch to use the library functions in perf_evlist that came
2442  * from builtin-record.c, i.e. use record_opts,
2443  * evlist__prepare_workload, etc instead of fork+exec'in 'perf record',
2444  * using pipes, etc.
2445  */
2446 static struct option __record_options[] = {
2447 	OPT_CALLBACK('e', "event", &record.evlist, "event",
2448 		     "event selector. use 'perf list' to list available events",
2449 		     parse_events_option),
2450 	OPT_CALLBACK(0, "filter", &record.evlist, "filter",
2451 		     "event filter", parse_filter),
2452 	OPT_CALLBACK_NOOPT(0, "exclude-perf", &record.evlist,
2453 			   NULL, "don't record events from perf itself",
2454 			   exclude_perf),
2455 	OPT_STRING('p', "pid", &record.opts.target.pid, "pid",
2456 		    "record events on existing process id"),
2457 	OPT_STRING('t', "tid", &record.opts.target.tid, "tid",
2458 		    "record events on existing thread id"),
2459 	OPT_INTEGER('r', "realtime", &record.realtime_prio,
2460 		    "collect data with this RT SCHED_FIFO priority"),
2461 	OPT_BOOLEAN(0, "no-buffering", &record.opts.no_buffering,
2462 		    "collect data without buffering"),
2463 	OPT_BOOLEAN('R', "raw-samples", &record.opts.raw_samples,
2464 		    "collect raw sample records from all opened counters"),
2465 	OPT_BOOLEAN('a', "all-cpus", &record.opts.target.system_wide,
2466 			    "system-wide collection from all CPUs"),
2467 	OPT_STRING('C', "cpu", &record.opts.target.cpu_list, "cpu",
2468 		    "list of cpus to monitor"),
2469 	OPT_U64('c', "count", &record.opts.user_interval, "event period to sample"),
2470 	OPT_STRING('o', "output", &record.data.path, "file",
2471 		    "output file name"),
2472 	OPT_BOOLEAN_SET('i', "no-inherit", &record.opts.no_inherit,
2473 			&record.opts.no_inherit_set,
2474 			"child tasks do not inherit counters"),
2475 	OPT_BOOLEAN(0, "tail-synthesize", &record.opts.tail_synthesize,
2476 		    "synthesize non-sample events at the end of output"),
2477 	OPT_BOOLEAN(0, "overwrite", &record.opts.overwrite, "use overwrite mode"),
2478 	OPT_BOOLEAN(0, "no-bpf-event", &record.opts.no_bpf_event, "do not record bpf events"),
2479 	OPT_BOOLEAN(0, "strict-freq", &record.opts.strict_freq,
2480 		    "Fail if the specified frequency can't be used"),
2481 	OPT_CALLBACK('F', "freq", &record.opts, "freq or 'max'",
2482 		     "profile at this frequency",
2483 		      record__parse_freq),
2484 	OPT_CALLBACK('m', "mmap-pages", &record.opts, "pages[,pages]",
2485 		     "number of mmap data pages and AUX area tracing mmap pages",
2486 		     record__parse_mmap_pages),
2487 	OPT_CALLBACK(0, "mmap-flush", &record.opts, "number",
2488 		     "Minimal number of bytes that is extracted from mmap data pages (default: 1)",
2489 		     record__mmap_flush_parse),
2490 	OPT_BOOLEAN(0, "group", &record.opts.group,
2491 		    "put the counters into a counter group"),
2492 	OPT_CALLBACK_NOOPT('g', NULL, &callchain_param,
2493 			   NULL, "enables call-graph recording" ,
2494 			   &record_callchain_opt),
2495 	OPT_CALLBACK(0, "call-graph", &record.opts,
2496 		     "record_mode[,record_size]", record_callchain_help,
2497 		     &record_parse_callchain_opt),
2498 	OPT_INCR('v', "verbose", &verbose,
2499 		    "be more verbose (show counter open errors, etc)"),
2500 	OPT_BOOLEAN('q', "quiet", &quiet, "don't print any message"),
2501 	OPT_BOOLEAN('s', "stat", &record.opts.inherit_stat,
2502 		    "per thread counts"),
2503 	OPT_BOOLEAN('d', "data", &record.opts.sample_address, "Record the sample addresses"),
2504 	OPT_BOOLEAN(0, "phys-data", &record.opts.sample_phys_addr,
2505 		    "Record the sample physical addresses"),
2506 	OPT_BOOLEAN(0, "data-page-size", &record.opts.sample_data_page_size,
2507 		    "Record the sampled data address data page size"),
2508 	OPT_BOOLEAN(0, "code-page-size", &record.opts.sample_code_page_size,
2509 		    "Record the sampled code address (ip) page size"),
2510 	OPT_BOOLEAN(0, "sample-cpu", &record.opts.sample_cpu, "Record the sample cpu"),
2511 	OPT_BOOLEAN_SET('T', "timestamp", &record.opts.sample_time,
2512 			&record.opts.sample_time_set,
2513 			"Record the sample timestamps"),
2514 	OPT_BOOLEAN_SET('P', "period", &record.opts.period, &record.opts.period_set,
2515 			"Record the sample period"),
2516 	OPT_BOOLEAN('n', "no-samples", &record.opts.no_samples,
2517 		    "don't sample"),
2518 	OPT_BOOLEAN_SET('N', "no-buildid-cache", &record.no_buildid_cache,
2519 			&record.no_buildid_cache_set,
2520 			"do not update the buildid cache"),
2521 	OPT_BOOLEAN_SET('B', "no-buildid", &record.no_buildid,
2522 			&record.no_buildid_set,
2523 			"do not collect buildids in perf.data"),
2524 	OPT_CALLBACK('G', "cgroup", &record.evlist, "name",
2525 		     "monitor event in cgroup name only",
2526 		     parse_cgroups),
2527 	OPT_INTEGER('D', "delay", &record.opts.initial_delay,
2528 		  "ms to wait before starting measurement after program start (-1: start with events disabled)"),
2529 	OPT_BOOLEAN(0, "kcore", &record.opts.kcore, "copy /proc/kcore"),
2530 	OPT_STRING('u', "uid", &record.opts.target.uid_str, "user",
2531 		   "user to profile"),
2532 
2533 	OPT_CALLBACK_NOOPT('b', "branch-any", &record.opts.branch_stack,
2534 		     "branch any", "sample any taken branches",
2535 		     parse_branch_stack),
2536 
2537 	OPT_CALLBACK('j', "branch-filter", &record.opts.branch_stack,
2538 		     "branch filter mask", "branch stack filter modes",
2539 		     parse_branch_stack),
2540 	OPT_BOOLEAN('W', "weight", &record.opts.sample_weight,
2541 		    "sample by weight (on special events only)"),
2542 	OPT_BOOLEAN(0, "transaction", &record.opts.sample_transaction,
2543 		    "sample transaction flags (special events only)"),
2544 	OPT_BOOLEAN(0, "per-thread", &record.opts.target.per_thread,
2545 		    "use per-thread mmaps"),
2546 	OPT_CALLBACK_OPTARG('I', "intr-regs", &record.opts.sample_intr_regs, NULL, "any register",
2547 		    "sample selected machine registers on interrupt,"
2548 		    " use '-I?' to list register names", parse_intr_regs),
2549 	OPT_CALLBACK_OPTARG(0, "user-regs", &record.opts.sample_user_regs, NULL, "any register",
2550 		    "sample selected machine registers on interrupt,"
2551 		    " use '--user-regs=?' to list register names", parse_user_regs),
2552 	OPT_BOOLEAN(0, "running-time", &record.opts.running_time,
2553 		    "Record running/enabled time of read (:S) events"),
2554 	OPT_CALLBACK('k', "clockid", &record.opts,
2555 	"clockid", "clockid to use for events, see clock_gettime()",
2556 	parse_clockid),
2557 	OPT_STRING_OPTARG('S', "snapshot", &record.opts.auxtrace_snapshot_opts,
2558 			  "opts", "AUX area tracing Snapshot Mode", ""),
2559 	OPT_STRING_OPTARG(0, "aux-sample", &record.opts.auxtrace_sample_opts,
2560 			  "opts", "sample AUX area", ""),
2561 	OPT_UINTEGER(0, "proc-map-timeout", &proc_map_timeout,
2562 			"per thread proc mmap processing timeout in ms"),
2563 	OPT_BOOLEAN(0, "namespaces", &record.opts.record_namespaces,
2564 		    "Record namespaces events"),
2565 	OPT_BOOLEAN(0, "all-cgroups", &record.opts.record_cgroup,
2566 		    "Record cgroup events"),
2567 	OPT_BOOLEAN_SET(0, "switch-events", &record.opts.record_switch_events,
2568 			&record.opts.record_switch_events_set,
2569 			"Record context switch events"),
2570 	OPT_BOOLEAN_FLAG(0, "all-kernel", &record.opts.all_kernel,
2571 			 "Configure all used events to run in kernel space.",
2572 			 PARSE_OPT_EXCLUSIVE),
2573 	OPT_BOOLEAN_FLAG(0, "all-user", &record.opts.all_user,
2574 			 "Configure all used events to run in user space.",
2575 			 PARSE_OPT_EXCLUSIVE),
2576 	OPT_BOOLEAN(0, "kernel-callchains", &record.opts.kernel_callchains,
2577 		    "collect kernel callchains"),
2578 	OPT_BOOLEAN(0, "user-callchains", &record.opts.user_callchains,
2579 		    "collect user callchains"),
2580 	OPT_STRING(0, "clang-path", &llvm_param.clang_path, "clang path",
2581 		   "clang binary to use for compiling BPF scriptlets"),
2582 	OPT_STRING(0, "clang-opt", &llvm_param.clang_opt, "clang options",
2583 		   "options passed to clang when compiling BPF scriptlets"),
2584 	OPT_STRING(0, "vmlinux", &symbol_conf.vmlinux_name,
2585 		   "file", "vmlinux pathname"),
2586 	OPT_BOOLEAN(0, "buildid-all", &record.buildid_all,
2587 		    "Record build-id of all DSOs regardless of hits"),
2588 	OPT_BOOLEAN(0, "buildid-mmap", &record.buildid_mmap,
2589 		    "Record build-id in map events"),
2590 	OPT_BOOLEAN(0, "timestamp-filename", &record.timestamp_filename,
2591 		    "append timestamp to output filename"),
2592 	OPT_BOOLEAN(0, "timestamp-boundary", &record.timestamp_boundary,
2593 		    "Record timestamp boundary (time of first/last samples)"),
2594 	OPT_STRING_OPTARG_SET(0, "switch-output", &record.switch_output.str,
2595 			  &record.switch_output.set, "signal or size[BKMG] or time[smhd]",
2596 			  "Switch output when receiving SIGUSR2 (signal) or cross a size or time threshold",
2597 			  "signal"),
2598 	OPT_CALLBACK_SET(0, "switch-output-event", &record.sb_evlist, &record.switch_output_event_set, "switch output event",
2599 			 "switch output event selector. use 'perf list' to list available events",
2600 			 parse_events_option_new_evlist),
2601 	OPT_INTEGER(0, "switch-max-files", &record.switch_output.num_files,
2602 		   "Limit number of switch output generated files"),
2603 	OPT_BOOLEAN(0, "dry-run", &dry_run,
2604 		    "Parse options then exit"),
2605 #ifdef HAVE_AIO_SUPPORT
2606 	OPT_CALLBACK_OPTARG(0, "aio", &record.opts,
2607 		     &nr_cblocks_default, "n", "Use <n> control blocks in asynchronous trace writing mode (default: 1, max: 4)",
2608 		     record__aio_parse),
2609 #endif
2610 	OPT_CALLBACK(0, "affinity", &record.opts, "node|cpu",
2611 		     "Set affinity mask of trace reading thread to NUMA node cpu mask or cpu of processed mmap buffer",
2612 		     record__parse_affinity),
2613 #ifdef HAVE_ZSTD_SUPPORT
2614 	OPT_CALLBACK_OPTARG('z', "compression-level", &record.opts, &comp_level_default,
2615 			    "n", "Compressed records using specified level (default: 1 - fastest compression, 22 - greatest compression)",
2616 			    record__parse_comp_level),
2617 #endif
2618 	OPT_CALLBACK(0, "max-size", &record.output_max_size,
2619 		     "size", "Limit the maximum size of the output file", parse_output_max_size),
2620 	OPT_UINTEGER(0, "num-thread-synthesize",
2621 		     &record.opts.nr_threads_synthesize,
2622 		     "number of threads to run for event synthesis"),
2623 #ifdef HAVE_LIBPFM
2624 	OPT_CALLBACK(0, "pfm-events", &record.evlist, "event",
2625 		"libpfm4 event selector. use 'perf list' to list available events",
2626 		parse_libpfm_events_option),
2627 #endif
2628 	OPT_CALLBACK(0, "control", &record.opts, "fd:ctl-fd[,ack-fd] or fifo:ctl-fifo[,ack-fifo]",
2629 		     "Listen on ctl-fd descriptor for command to control measurement ('enable': enable events, 'disable': disable events,\n"
2630 		     "\t\t\t  'snapshot': AUX area tracing snapshot).\n"
2631 		     "\t\t\t  Optionally send control command completion ('ack\\n') to ack-fd descriptor.\n"
2632 		     "\t\t\t  Alternatively, ctl-fifo / ack-fifo will be opened and used as ctl-fd / ack-fd.",
2633 		      parse_control_option),
2634 	OPT_END()
2635 };
2636 
2637 struct option *record_options = __record_options;
2638 
2639 int cmd_record(int argc, const char **argv)
2640 {
2641 	int err;
2642 	struct record *rec = &record;
2643 	char errbuf[BUFSIZ];
2644 
2645 	setlocale(LC_ALL, "");
2646 
2647 #ifndef HAVE_LIBBPF_SUPPORT
2648 # define set_nobuild(s, l, c) set_option_nobuild(record_options, s, l, "NO_LIBBPF=1", c)
2649 	set_nobuild('\0', "clang-path", true);
2650 	set_nobuild('\0', "clang-opt", true);
2651 # undef set_nobuild
2652 #endif
2653 
2654 #ifndef HAVE_BPF_PROLOGUE
2655 # if !defined (HAVE_DWARF_SUPPORT)
2656 #  define REASON  "NO_DWARF=1"
2657 # elif !defined (HAVE_LIBBPF_SUPPORT)
2658 #  define REASON  "NO_LIBBPF=1"
2659 # else
2660 #  define REASON  "this architecture doesn't support BPF prologue"
2661 # endif
2662 # define set_nobuild(s, l, c) set_option_nobuild(record_options, s, l, REASON, c)
2663 	set_nobuild('\0', "vmlinux", true);
2664 # undef set_nobuild
2665 # undef REASON
2666 #endif
2667 
2668 	rec->opts.affinity = PERF_AFFINITY_SYS;
2669 
2670 	rec->evlist = evlist__new();
2671 	if (rec->evlist == NULL)
2672 		return -ENOMEM;
2673 
2674 	err = perf_config(perf_record_config, rec);
2675 	if (err)
2676 		return err;
2677 
2678 	argc = parse_options(argc, argv, record_options, record_usage,
2679 			    PARSE_OPT_STOP_AT_NON_OPTION);
2680 	if (quiet)
2681 		perf_quiet_option();
2682 
2683 	/* Make system wide (-a) the default target. */
2684 	if (!argc && target__none(&rec->opts.target))
2685 		rec->opts.target.system_wide = true;
2686 
2687 	if (nr_cgroups && !rec->opts.target.system_wide) {
2688 		usage_with_options_msg(record_usage, record_options,
2689 			"cgroup monitoring only available in system-wide mode");
2690 
2691 	}
2692 
2693 	if (rec->buildid_mmap) {
2694 		if (!perf_can_record_build_id()) {
2695 			pr_err("Failed: no support to record build id in mmap events, update your kernel.\n");
2696 			err = -EINVAL;
2697 			goto out_opts;
2698 		}
2699 		pr_debug("Enabling build id in mmap2 events.\n");
2700 		/* Enable mmap build id synthesizing. */
2701 		symbol_conf.buildid_mmap2 = true;
2702 		/* Enable perf_event_attr::build_id bit. */
2703 		rec->opts.build_id = true;
2704 		/* Disable build id cache. */
2705 		rec->no_buildid = true;
2706 	}
2707 
2708 	if (rec->opts.record_cgroup && !perf_can_record_cgroup()) {
2709 		pr_err("Kernel has no cgroup sampling support.\n");
2710 		err = -EINVAL;
2711 		goto out_opts;
2712 	}
2713 
2714 	if (rec->opts.kcore)
2715 		rec->data.is_dir = true;
2716 
2717 	if (rec->opts.comp_level != 0) {
2718 		pr_debug("Compression enabled, disabling build id collection at the end of the session.\n");
2719 		rec->no_buildid = true;
2720 	}
2721 
2722 	if (rec->opts.record_switch_events &&
2723 	    !perf_can_record_switch_events()) {
2724 		ui__error("kernel does not support recording context switch events\n");
2725 		parse_options_usage(record_usage, record_options, "switch-events", 0);
2726 		err = -EINVAL;
2727 		goto out_opts;
2728 	}
2729 
2730 	if (switch_output_setup(rec)) {
2731 		parse_options_usage(record_usage, record_options, "switch-output", 0);
2732 		err = -EINVAL;
2733 		goto out_opts;
2734 	}
2735 
2736 	if (rec->switch_output.time) {
2737 		signal(SIGALRM, alarm_sig_handler);
2738 		alarm(rec->switch_output.time);
2739 	}
2740 
2741 	if (rec->switch_output.num_files) {
2742 		rec->switch_output.filenames = calloc(sizeof(char *),
2743 						      rec->switch_output.num_files);
2744 		if (!rec->switch_output.filenames) {
2745 			err = -EINVAL;
2746 			goto out_opts;
2747 		}
2748 	}
2749 
2750 	/*
2751 	 * Allow aliases to facilitate the lookup of symbols for address
2752 	 * filters. Refer to auxtrace_parse_filters().
2753 	 */
2754 	symbol_conf.allow_aliases = true;
2755 
2756 	symbol__init(NULL);
2757 
2758 	if (rec->opts.affinity != PERF_AFFINITY_SYS) {
2759 		rec->affinity_mask.nbits = cpu__max_cpu();
2760 		rec->affinity_mask.bits = bitmap_zalloc(rec->affinity_mask.nbits);
2761 		if (!rec->affinity_mask.bits) {
2762 			pr_err("Failed to allocate thread mask for %zd cpus\n", rec->affinity_mask.nbits);
2763 			err = -ENOMEM;
2764 			goto out_opts;
2765 		}
2766 		pr_debug2("thread mask[%zd]: empty\n", rec->affinity_mask.nbits);
2767 	}
2768 
2769 	err = record__auxtrace_init(rec);
2770 	if (err)
2771 		goto out;
2772 
2773 	if (dry_run)
2774 		goto out;
2775 
2776 	err = bpf__setup_stdout(rec->evlist);
2777 	if (err) {
2778 		bpf__strerror_setup_stdout(rec->evlist, err, errbuf, sizeof(errbuf));
2779 		pr_err("ERROR: Setup BPF stdout failed: %s\n",
2780 			 errbuf);
2781 		goto out;
2782 	}
2783 
2784 	err = -ENOMEM;
2785 
2786 	if (rec->no_buildid_cache || rec->no_buildid) {
2787 		disable_buildid_cache();
2788 	} else if (rec->switch_output.enabled) {
2789 		/*
2790 		 * In 'perf record --switch-output', disable buildid
2791 		 * generation by default to reduce data file switching
2792 		 * overhead. Still generate buildid if they are required
2793 		 * explicitly using
2794 		 *
2795 		 *  perf record --switch-output --no-no-buildid \
2796 		 *              --no-no-buildid-cache
2797 		 *
2798 		 * Following code equals to:
2799 		 *
2800 		 * if ((rec->no_buildid || !rec->no_buildid_set) &&
2801 		 *     (rec->no_buildid_cache || !rec->no_buildid_cache_set))
2802 		 *         disable_buildid_cache();
2803 		 */
2804 		bool disable = true;
2805 
2806 		if (rec->no_buildid_set && !rec->no_buildid)
2807 			disable = false;
2808 		if (rec->no_buildid_cache_set && !rec->no_buildid_cache)
2809 			disable = false;
2810 		if (disable) {
2811 			rec->no_buildid = true;
2812 			rec->no_buildid_cache = true;
2813 			disable_buildid_cache();
2814 		}
2815 	}
2816 
2817 	if (record.opts.overwrite)
2818 		record.opts.tail_synthesize = true;
2819 
2820 	if (rec->evlist->core.nr_entries == 0) {
2821 		if (perf_pmu__has_hybrid()) {
2822 			err = evlist__add_default_hybrid(rec->evlist,
2823 							 !record.opts.no_samples);
2824 		} else {
2825 			err = __evlist__add_default(rec->evlist,
2826 						    !record.opts.no_samples);
2827 		}
2828 
2829 		if (err < 0) {
2830 			pr_err("Not enough memory for event selector list\n");
2831 			goto out;
2832 		}
2833 	}
2834 
2835 	if (rec->opts.target.tid && !rec->opts.no_inherit_set)
2836 		rec->opts.no_inherit = true;
2837 
2838 	err = target__validate(&rec->opts.target);
2839 	if (err) {
2840 		target__strerror(&rec->opts.target, err, errbuf, BUFSIZ);
2841 		ui__warning("%s\n", errbuf);
2842 	}
2843 
2844 	err = target__parse_uid(&rec->opts.target);
2845 	if (err) {
2846 		int saved_errno = errno;
2847 
2848 		target__strerror(&rec->opts.target, err, errbuf, BUFSIZ);
2849 		ui__error("%s", errbuf);
2850 
2851 		err = -saved_errno;
2852 		goto out;
2853 	}
2854 
2855 	/* Enable ignoring missing threads when -u/-p option is defined. */
2856 	rec->opts.ignore_missing_thread = rec->opts.target.uid != UINT_MAX || rec->opts.target.pid;
2857 
2858 	if (evlist__fix_hybrid_cpus(rec->evlist, rec->opts.target.cpu_list)) {
2859 		pr_err("failed to use cpu list %s\n",
2860 		       rec->opts.target.cpu_list);
2861 		goto out;
2862 	}
2863 
2864 	rec->opts.target.hybrid = perf_pmu__has_hybrid();
2865 	err = -ENOMEM;
2866 	if (evlist__create_maps(rec->evlist, &rec->opts.target) < 0)
2867 		usage_with_options(record_usage, record_options);
2868 
2869 	err = auxtrace_record__options(rec->itr, rec->evlist, &rec->opts);
2870 	if (err)
2871 		goto out;
2872 
2873 	/*
2874 	 * We take all buildids when the file contains
2875 	 * AUX area tracing data because we do not decode the
2876 	 * trace because it would take too long.
2877 	 */
2878 	if (rec->opts.full_auxtrace)
2879 		rec->buildid_all = true;
2880 
2881 	if (rec->opts.text_poke) {
2882 		err = record__config_text_poke(rec->evlist);
2883 		if (err) {
2884 			pr_err("record__config_text_poke failed, error %d\n", err);
2885 			goto out;
2886 		}
2887 	}
2888 
2889 	if (record_opts__config(&rec->opts)) {
2890 		err = -EINVAL;
2891 		goto out;
2892 	}
2893 
2894 	if (rec->opts.nr_cblocks > nr_cblocks_max)
2895 		rec->opts.nr_cblocks = nr_cblocks_max;
2896 	pr_debug("nr_cblocks: %d\n", rec->opts.nr_cblocks);
2897 
2898 	pr_debug("affinity: %s\n", affinity_tags[rec->opts.affinity]);
2899 	pr_debug("mmap flush: %d\n", rec->opts.mmap_flush);
2900 
2901 	if (rec->opts.comp_level > comp_level_max)
2902 		rec->opts.comp_level = comp_level_max;
2903 	pr_debug("comp level: %d\n", rec->opts.comp_level);
2904 
2905 	err = __cmd_record(&record, argc, argv);
2906 out:
2907 	bitmap_free(rec->affinity_mask.bits);
2908 	evlist__delete(rec->evlist);
2909 	symbol__exit();
2910 	auxtrace_record__free(rec->itr);
2911 out_opts:
2912 	evlist__close_control(rec->opts.ctl_fd, rec->opts.ctl_fd_ack, &rec->opts.ctl_fd_close);
2913 	return err;
2914 }
2915 
2916 static void snapshot_sig_handler(int sig __maybe_unused)
2917 {
2918 	struct record *rec = &record;
2919 
2920 	hit_auxtrace_snapshot_trigger(rec);
2921 
2922 	if (switch_output_signal(rec))
2923 		trigger_hit(&switch_output_trigger);
2924 }
2925 
2926 static void alarm_sig_handler(int sig __maybe_unused)
2927 {
2928 	struct record *rec = &record;
2929 
2930 	if (switch_output_time(rec))
2931 		trigger_hit(&switch_output_trigger);
2932 }
2933