xref: /openbmc/linux/tools/perf/builtin-record.c (revision c0891ac1)
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * builtin-record.c
4  *
5  * Builtin record command: Record the profile of a workload
6  * (or a CPU, or a PID) into the perf.data output file - for
7  * later analysis via perf report.
8  */
9 #include "builtin.h"
10 
11 #include "util/build-id.h"
12 #include <subcmd/parse-options.h>
13 #include "util/parse-events.h"
14 #include "util/config.h"
15 
16 #include "util/callchain.h"
17 #include "util/cgroup.h"
18 #include "util/header.h"
19 #include "util/event.h"
20 #include "util/evlist.h"
21 #include "util/evsel.h"
22 #include "util/debug.h"
23 #include "util/mmap.h"
24 #include "util/target.h"
25 #include "util/session.h"
26 #include "util/tool.h"
27 #include "util/symbol.h"
28 #include "util/record.h"
29 #include "util/cpumap.h"
30 #include "util/thread_map.h"
31 #include "util/data.h"
32 #include "util/perf_regs.h"
33 #include "util/auxtrace.h"
34 #include "util/tsc.h"
35 #include "util/parse-branch-options.h"
36 #include "util/parse-regs-options.h"
37 #include "util/perf_api_probe.h"
38 #include "util/llvm-utils.h"
39 #include "util/bpf-loader.h"
40 #include "util/trigger.h"
41 #include "util/perf-hooks.h"
42 #include "util/cpu-set-sched.h"
43 #include "util/synthetic-events.h"
44 #include "util/time-utils.h"
45 #include "util/units.h"
46 #include "util/bpf-event.h"
47 #include "util/util.h"
48 #include "util/pfm.h"
49 #include "util/clockid.h"
50 #include "util/pmu-hybrid.h"
51 #include "util/evlist-hybrid.h"
52 #include "asm/bug.h"
53 #include "perf.h"
54 
55 #include <errno.h>
56 #include <inttypes.h>
57 #include <locale.h>
58 #include <poll.h>
59 #include <pthread.h>
60 #include <unistd.h>
61 #include <sched.h>
62 #include <signal.h>
63 #ifdef HAVE_EVENTFD_SUPPORT
64 #include <sys/eventfd.h>
65 #endif
66 #include <sys/mman.h>
67 #include <sys/wait.h>
68 #include <sys/types.h>
69 #include <sys/stat.h>
70 #include <fcntl.h>
71 #include <linux/err.h>
72 #include <linux/string.h>
73 #include <linux/time64.h>
74 #include <linux/zalloc.h>
75 #include <linux/bitmap.h>
76 #include <sys/time.h>
77 
78 struct switch_output {
79 	bool		 enabled;
80 	bool		 signal;
81 	unsigned long	 size;
82 	unsigned long	 time;
83 	const char	*str;
84 	bool		 set;
85 	char		 **filenames;
86 	int		 num_files;
87 	int		 cur_file;
88 };
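/*
 * The fields above roughly mirror the output-rotation options of perf record:
 * switch_output.signal/size/time correspond to --switch-output=signal,
 * --switch-output=<size> (e.g. 1G) and --switch-output=<time> (e.g. 30s),
 * while num_files/filenames back --switch-max-files=N, which keeps only the
 * last N rotated files (option names as documented in
 * tools/perf/Documentation/perf-record.txt; values are examples).
 */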
89 
90 struct record {
91 	struct perf_tool	tool;
92 	struct record_opts	opts;
93 	u64			bytes_written;
94 	struct perf_data	data;
95 	struct auxtrace_record	*itr;
96 	struct evlist	*evlist;
97 	struct perf_session	*session;
98 	struct evlist		*sb_evlist;
99 	pthread_t		thread_id;
100 	int			realtime_prio;
101 	bool			switch_output_event_set;
102 	bool			no_buildid;
103 	bool			no_buildid_set;
104 	bool			no_buildid_cache;
105 	bool			no_buildid_cache_set;
106 	bool			buildid_all;
107 	bool			buildid_mmap;
108 	bool			timestamp_filename;
109 	bool			timestamp_boundary;
110 	struct switch_output	switch_output;
111 	unsigned long long	samples;
112 	struct mmap_cpu_mask	affinity_mask;
113 	unsigned long		output_max_size;	/* = 0: unlimited */
114 };
115 
116 static volatile int done;
117 
118 static volatile int auxtrace_record__snapshot_started;
119 static DEFINE_TRIGGER(auxtrace_snapshot_trigger);
120 static DEFINE_TRIGGER(switch_output_trigger);
121 
122 static const char *affinity_tags[PERF_AFFINITY_MAX] = {
123 	"SYS", "NODE", "CPU"
124 };
125 
126 static bool switch_output_signal(struct record *rec)
127 {
128 	return rec->switch_output.signal &&
129 	       trigger_is_ready(&switch_output_trigger);
130 }
131 
132 static bool switch_output_size(struct record *rec)
133 {
134 	return rec->switch_output.size &&
135 	       trigger_is_ready(&switch_output_trigger) &&
136 	       (rec->bytes_written >= rec->switch_output.size);
137 }
138 
139 static bool switch_output_time(struct record *rec)
140 {
141 	return rec->switch_output.time &&
142 	       trigger_is_ready(&switch_output_trigger);
143 }
144 
145 static bool record__output_max_size_exceeded(struct record *rec)
146 {
147 	return rec->output_max_size &&
148 	       (rec->bytes_written >= rec->output_max_size);
149 }
150 
151 static int record__write(struct record *rec, struct mmap *map __maybe_unused,
152 			 void *bf, size_t size)
153 {
154 	struct perf_data_file *file = &rec->session->data->file;
155 
156 	if (perf_data_file__write(file, bf, size) < 0) {
157 		pr_err("failed to write perf data, error: %m\n");
158 		return -1;
159 	}
160 
161 	rec->bytes_written += size;
162 
163 	if (record__output_max_size_exceeded(rec) && !done) {
164 		fprintf(stderr, "[ perf record: perf size limit reached (%" PRIu64 " KB),"
165 				" stopping session ]\n",
166 				rec->bytes_written >> 10);
167 		done = 1;
168 	}
169 
170 	if (switch_output_size(rec))
171 		trigger_hit(&switch_output_trigger);
172 
173 	return 0;
174 }
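/*
 * Both size checks above are driven by rec->bytes_written. As a rough
 * example, "perf record --max-size=100M ..." stops the session once about
 * 100 MB have been written, whereas "--switch-output=1G" rotates to a new
 * perf.data.<timestamp> file instead of stopping (sizes are illustrative).
 */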
175 
176 static int record__aio_enabled(struct record *rec);
177 static int record__comp_enabled(struct record *rec);
178 static size_t zstd_compress(struct perf_session *session, void *dst, size_t dst_size,
179 			    void *src, size_t src_size);
180 
181 #ifdef HAVE_AIO_SUPPORT
182 static int record__aio_write(struct aiocb *cblock, int trace_fd,
183 		void *buf, size_t size, off_t off)
184 {
185 	int rc;
186 
187 	cblock->aio_fildes = trace_fd;
188 	cblock->aio_buf    = buf;
189 	cblock->aio_nbytes = size;
190 	cblock->aio_offset = off;
191 	cblock->aio_sigevent.sigev_notify = SIGEV_NONE;
192 
193 	do {
194 		rc = aio_write(cblock);
195 		if (rc == 0) {
196 			break;
197 		} else if (errno != EAGAIN) {
198 			cblock->aio_fildes = -1;
199 			pr_err("failed to queue perf data, error: %m\n");
200 			break;
201 		}
202 	} while (1);
203 
204 	return rc;
205 }
206 
207 static int record__aio_complete(struct mmap *md, struct aiocb *cblock)
208 {
209 	void *rem_buf;
210 	off_t rem_off;
211 	size_t rem_size;
212 	int rc, aio_errno;
213 	ssize_t aio_ret, written;
214 
215 	aio_errno = aio_error(cblock);
216 	if (aio_errno == EINPROGRESS)
217 		return 0;
218 
219 	written = aio_ret = aio_return(cblock);
220 	if (aio_ret < 0) {
221 		if (aio_errno != EINTR)
222 			pr_err("failed to write perf data, error: %m\n");
223 		written = 0;
224 	}
225 
226 	rem_size = cblock->aio_nbytes - written;
227 
228 	if (rem_size == 0) {
229 		cblock->aio_fildes = -1;
230 		/*
231 		 * md->refcount is incremented in record__aio_pushfn() for
232 		 * every aio write request started in record__aio_push() so
233 		 * decrement it because the request is now complete.
234 		 */
235 		perf_mmap__put(&md->core);
236 		rc = 1;
237 	} else {
238 		/*
239 		 * The aio write request may need to be restarted with the
240 		 * remainder if the kernel didn't write the whole
241 		 * chunk at once.
242 		 */
243 		rem_off = cblock->aio_offset + written;
244 		rem_buf = (void *)(cblock->aio_buf + written);
245 		record__aio_write(cblock, cblock->aio_fildes,
246 				rem_buf, rem_size, rem_off);
247 		rc = 0;
248 	}
249 
250 	return rc;
251 }
252 
253 static int record__aio_sync(struct mmap *md, bool sync_all)
254 {
255 	struct aiocb **aiocb = md->aio.aiocb;
256 	struct aiocb *cblocks = md->aio.cblocks;
257 	struct timespec timeout = { 0, 1000 * 1000  * 1 }; /* 1ms */
258 	int i, do_suspend;
259 
260 	do {
261 		do_suspend = 0;
262 		for (i = 0; i < md->aio.nr_cblocks; ++i) {
263 			if (cblocks[i].aio_fildes == -1 || record__aio_complete(md, &cblocks[i])) {
264 				if (sync_all)
265 					aiocb[i] = NULL;
266 				else
267 					return i;
268 			} else {
269 				/*
270 				 * The started aio write is not complete yet,
271 				 * so it has to be waited for before the
272 				 * next allocation.
273 				 */
274 				aiocb[i] = &cblocks[i];
275 				do_suspend = 1;
276 			}
277 		}
278 		if (!do_suspend)
279 			return -1;
280 
281 		while (aio_suspend((const struct aiocb **)aiocb, md->aio.nr_cblocks, &timeout)) {
282 			if (!(errno == EAGAIN || errno == EINTR))
283 				pr_err("failed to sync perf data, error: %m\n");
284 		}
285 	} while (1);
286 }
287 
288 struct record_aio {
289 	struct record	*rec;
290 	void		*data;
291 	size_t		size;
292 };
293 
294 static int record__aio_pushfn(struct mmap *map, void *to, void *buf, size_t size)
295 {
296 	struct record_aio *aio = to;
297 
298 	/*
299 	 * The map->core.base data pointed to by buf is copied into a free map->aio.data[]
300 	 * buffer to release space in the kernel buffer as fast as possible, by calling
301 	 * perf_mmap__consume() from the perf_mmap__push() function.
302 	 *
303 	 * That lets the kernel proceed with storing more profiling data into
304 	 * the kernel buffer earlier than other per-cpu kernel buffers are handled.
305 	 *
306 	 * Copying can be done in two steps in case the chunk of profiling data
307 	 * crosses the upper bound of the kernel buffer. In this case we first move
308 	 * part of the data from map->start to the upper bound and then the remainder
309 	 * from the beginning of the kernel buffer to the end of the data chunk.
310 	 */
311 
312 	if (record__comp_enabled(aio->rec)) {
313 		size = zstd_compress(aio->rec->session, aio->data + aio->size,
314 				     mmap__mmap_len(map) - aio->size,
315 				     buf, size);
316 	} else {
317 		memcpy(aio->data + aio->size, buf, size);
318 	}
319 
320 	if (!aio->size) {
321 		/*
322 		 * Increment map->refcount to guard the map->aio.data[] buffer
323 		 * from premature deallocation, because the map object can be
324 		 * released before the aio write request started on the
325 		 * map->aio.data[] buffer has completed.
326 		 *
327 		 * perf_mmap__put() is done in record__aio_complete()
328 		 * once the started aio request completes, or in record__aio_push()
329 		 * if the request failed to start.
330 		 */
331 		perf_mmap__get(&map->core);
332 	}
333 
334 	aio->size += size;
335 
336 	return size;
337 }
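/*
 * Illustrative layout for the two-step copy described above: with a 64K
 * kernel ring buffer and a chunk starting 8K before the upper bound,
 * perf_mmap__push() invokes record__aio_pushfn() twice, e.g. first with
 * (buf = base + 56K, size = 8K) and then with (buf = base, size = rest),
 * so aio->size accumulates both pieces into one contiguous aio.data[]
 * buffer (the numbers are an example only).
 */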
338 
339 static int record__aio_push(struct record *rec, struct mmap *map, off_t *off)
340 {
341 	int ret, idx;
342 	int trace_fd = rec->session->data->file.fd;
343 	struct record_aio aio = { .rec = rec, .size = 0 };
344 
345 	/*
346 	 * Call record__aio_sync() to wait until a map->aio.data[] buffer
347 	 * becomes available after the previous aio write operation.
348 	 */
349 
350 	idx = record__aio_sync(map, false);
351 	aio.data = map->aio.data[idx];
352 	ret = perf_mmap__push(map, &aio, record__aio_pushfn);
353 	if (ret != 0) /* ret > 0 - no data, ret < 0 - error */
354 		return ret;
355 
356 	rec->samples++;
357 	ret = record__aio_write(&(map->aio.cblocks[idx]), trace_fd, aio.data, aio.size, *off);
358 	if (!ret) {
359 		*off += aio.size;
360 		rec->bytes_written += aio.size;
361 		if (switch_output_size(rec))
362 			trigger_hit(&switch_output_trigger);
363 	} else {
364 		/*
365 		 * Decrement the map->refcount incremented in record__aio_pushfn()
366 		 * if the record__aio_write() operation failed to start; otherwise
367 		 * map->refcount is decremented in record__aio_complete() after
368 		 * the aio write operation finishes successfully.
369 		 */
370 		perf_mmap__put(&map->core);
371 	}
372 
373 	return ret;
374 }
375 
376 static off_t record__aio_get_pos(int trace_fd)
377 {
378 	return lseek(trace_fd, 0, SEEK_CUR);
379 }
380 
381 static void record__aio_set_pos(int trace_fd, off_t pos)
382 {
383 	lseek(trace_fd, pos, SEEK_SET);
384 }
385 
386 static void record__aio_mmap_read_sync(struct record *rec)
387 {
388 	int i;
389 	struct evlist *evlist = rec->evlist;
390 	struct mmap *maps = evlist->mmap;
391 
392 	if (!record__aio_enabled(rec))
393 		return;
394 
395 	for (i = 0; i < evlist->core.nr_mmaps; i++) {
396 		struct mmap *map = &maps[i];
397 
398 		if (map->core.base)
399 			record__aio_sync(map, true);
400 	}
401 }
402 
403 static int nr_cblocks_default = 1;
404 static int nr_cblocks_max = 4;
405 
406 static int record__aio_parse(const struct option *opt,
407 			     const char *str,
408 			     int unset)
409 {
410 	struct record_opts *opts = (struct record_opts *)opt->value;
411 
412 	if (unset) {
413 		opts->nr_cblocks = 0;
414 	} else {
415 		if (str)
416 			opts->nr_cblocks = strtol(str, NULL, 0);
417 		if (!opts->nr_cblocks)
418 			opts->nr_cblocks = nr_cblocks_default;
419 	}
420 
421 	return 0;
422 }
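/*
 * In practice this backs the --aio option: "perf record --aio ..." uses
 * nr_cblocks_default (1) control block per mmap, and "--aio=4" requests
 * four; larger values are meant to be capped at nr_cblocks_max when the
 * option is applied (invocations are examples).
 */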
423 #else /* HAVE_AIO_SUPPORT */
424 static int nr_cblocks_max = 0;
425 
426 static int record__aio_push(struct record *rec __maybe_unused, struct mmap *map __maybe_unused,
427 			    off_t *off __maybe_unused)
428 {
429 	return -1;
430 }
431 
432 static off_t record__aio_get_pos(int trace_fd __maybe_unused)
433 {
434 	return -1;
435 }
436 
437 static void record__aio_set_pos(int trace_fd __maybe_unused, off_t pos __maybe_unused)
438 {
439 }
440 
441 static void record__aio_mmap_read_sync(struct record *rec __maybe_unused)
442 {
443 }
444 #endif
445 
446 static int record__aio_enabled(struct record *rec)
447 {
448 	return rec->opts.nr_cblocks > 0;
449 }
450 
451 #define MMAP_FLUSH_DEFAULT 1
452 static int record__mmap_flush_parse(const struct option *opt,
453 				    const char *str,
454 				    int unset)
455 {
456 	int flush_max;
457 	struct record_opts *opts = (struct record_opts *)opt->value;
458 	static struct parse_tag tags[] = {
459 			{ .tag  = 'B', .mult = 1       },
460 			{ .tag  = 'K', .mult = 1 << 10 },
461 			{ .tag  = 'M', .mult = 1 << 20 },
462 			{ .tag  = 'G', .mult = 1 << 30 },
463 			{ .tag  = 0 },
464 	};
465 
466 	if (unset)
467 		return 0;
468 
469 	if (str) {
470 		opts->mmap_flush = parse_tag_value(str, tags);
471 		if (opts->mmap_flush == (int)-1)
472 			opts->mmap_flush = strtol(str, NULL, 0);
473 	}
474 
475 	if (!opts->mmap_flush)
476 		opts->mmap_flush = MMAP_FLUSH_DEFAULT;
477 
478 	flush_max = evlist__mmap_size(opts->mmap_pages);
479 	flush_max /= 4;
480 	if (opts->mmap_flush > flush_max)
481 		opts->mmap_flush = flush_max;
482 
483 	return 0;
484 }
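/*
 * Example: "perf record --mmap-flush=48K ..." makes the tool drain an mmap
 * only once at least 48 KiB are pending, reducing write-out frequency; the
 * parsed value is clamped above to a quarter of the mmap buffer size
 * (flush_max), and the 48K figure is illustrative only.
 */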
485 
486 #ifdef HAVE_ZSTD_SUPPORT
487 static unsigned int comp_level_default = 1;
488 
489 static int record__parse_comp_level(const struct option *opt, const char *str, int unset)
490 {
491 	struct record_opts *opts = opt->value;
492 
493 	if (unset) {
494 		opts->comp_level = 0;
495 	} else {
496 		if (str)
497 			opts->comp_level = strtol(str, NULL, 0);
498 		if (!opts->comp_level)
499 			opts->comp_level = comp_level_default;
500 	}
501 
502 	return 0;
503 }
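/*
 * Example: "perf record -z ..." compresses with comp_level_default (1),
 * while "perf record -z 10 ..." selects a higher zstd level up to
 * comp_level_max; -z/--compression-level is only available when perf is
 * built with HAVE_ZSTD_SUPPORT (level values are illustrative).
 */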
504 #endif
505 static unsigned int comp_level_max = 22;
506 
507 static int record__comp_enabled(struct record *rec)
508 {
509 	return rec->opts.comp_level > 0;
510 }
511 
512 static int process_synthesized_event(struct perf_tool *tool,
513 				     union perf_event *event,
514 				     struct perf_sample *sample __maybe_unused,
515 				     struct machine *machine __maybe_unused)
516 {
517 	struct record *rec = container_of(tool, struct record, tool);
518 	return record__write(rec, NULL, event, event->header.size);
519 }
520 
521 static int process_locked_synthesized_event(struct perf_tool *tool,
522 				     union perf_event *event,
523 				     struct perf_sample *sample __maybe_unused,
524 				     struct machine *machine __maybe_unused)
525 {
526 	static pthread_mutex_t synth_lock = PTHREAD_MUTEX_INITIALIZER;
527 	int ret;
528 
529 	pthread_mutex_lock(&synth_lock);
530 	ret = process_synthesized_event(tool, event, sample, machine);
531 	pthread_mutex_unlock(&synth_lock);
532 	return ret;
533 }
534 
535 static int record__pushfn(struct mmap *map, void *to, void *bf, size_t size)
536 {
537 	struct record *rec = to;
538 
539 	if (record__comp_enabled(rec)) {
540 		size = zstd_compress(rec->session, map->data, mmap__mmap_len(map), bf, size);
541 		bf   = map->data;
542 	}
543 
544 	rec->samples++;
545 	return record__write(rec, map, bf, size);
546 }
547 
548 static volatile int signr = -1;
549 static volatile int child_finished;
550 #ifdef HAVE_EVENTFD_SUPPORT
551 static int done_fd = -1;
552 #endif
553 
554 static void sig_handler(int sig)
555 {
556 	if (sig == SIGCHLD)
557 		child_finished = 1;
558 	else
559 		signr = sig;
560 
561 	done = 1;
562 #ifdef HAVE_EVENTFD_SUPPORT
563 {
564 	u64 tmp = 1;
565 	/*
566 	 * It is possible for this signal handler to run after done is checked
567 	 * in the main loop, but before the perf counter fds are polled. If this
568 	 * happens, the poll() will continue to wait even though done is set,
569 	 * and will only break out if either another signal is received, or the
570 	 * counters are ready for read. To ensure the poll() doesn't sleep when
571 	 * done is set, use an eventfd (done_fd) to wake up the poll().
572 	 */
573 	if (write(done_fd, &tmp, sizeof(tmp)) < 0)
574 		pr_err("failed to signal wakeup fd, error: %m\n");
575 }
576 #endif // HAVE_EVENTFD_SUPPORT
577 }
578 
579 static void sigsegv_handler(int sig)
580 {
581 	perf_hooks__recover();
582 	sighandler_dump_stack(sig);
583 }
584 
585 static void record__sig_exit(void)
586 {
587 	if (signr == -1)
588 		return;
589 
590 	signal(signr, SIG_DFL);
591 	raise(signr);
592 }
593 
594 #ifdef HAVE_AUXTRACE_SUPPORT
595 
596 static int record__process_auxtrace(struct perf_tool *tool,
597 				    struct mmap *map,
598 				    union perf_event *event, void *data1,
599 				    size_t len1, void *data2, size_t len2)
600 {
601 	struct record *rec = container_of(tool, struct record, tool);
602 	struct perf_data *data = &rec->data;
603 	size_t padding;
604 	u8 pad[8] = {0};
605 
606 	if (!perf_data__is_pipe(data) && perf_data__is_single_file(data)) {
607 		off_t file_offset;
608 		int fd = perf_data__fd(data);
609 		int err;
610 
611 		file_offset = lseek(fd, 0, SEEK_CUR);
612 		if (file_offset == -1)
613 			return -1;
614 		err = auxtrace_index__auxtrace_event(&rec->session->auxtrace_index,
615 						     event, file_offset);
616 		if (err)
617 			return err;
618 	}
619 
620 	/* event.auxtrace.size includes padding, see __auxtrace_mmap__read() */
621 	padding = (len1 + len2) & 7;
622 	if (padding)
623 		padding = 8 - padding;
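	/* e.g. len1 + len2 == 13 -> padding == 3, keeping the record 8-byte aligned */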
624 
625 	record__write(rec, map, event, event->header.size);
626 	record__write(rec, map, data1, len1);
627 	if (len2)
628 		record__write(rec, map, data2, len2);
629 	record__write(rec, map, &pad, padding);
630 
631 	return 0;
632 }
633 
634 static int record__auxtrace_mmap_read(struct record *rec,
635 				      struct mmap *map)
636 {
637 	int ret;
638 
639 	ret = auxtrace_mmap__read(map, rec->itr, &rec->tool,
640 				  record__process_auxtrace);
641 	if (ret < 0)
642 		return ret;
643 
644 	if (ret)
645 		rec->samples++;
646 
647 	return 0;
648 }
649 
650 static int record__auxtrace_mmap_read_snapshot(struct record *rec,
651 					       struct mmap *map)
652 {
653 	int ret;
654 
655 	ret = auxtrace_mmap__read_snapshot(map, rec->itr, &rec->tool,
656 					   record__process_auxtrace,
657 					   rec->opts.auxtrace_snapshot_size);
658 	if (ret < 0)
659 		return ret;
660 
661 	if (ret)
662 		rec->samples++;
663 
664 	return 0;
665 }
666 
667 static int record__auxtrace_read_snapshot_all(struct record *rec)
668 {
669 	int i;
670 	int rc = 0;
671 
672 	for (i = 0; i < rec->evlist->core.nr_mmaps; i++) {
673 		struct mmap *map = &rec->evlist->mmap[i];
674 
675 		if (!map->auxtrace_mmap.base)
676 			continue;
677 
678 		if (record__auxtrace_mmap_read_snapshot(rec, map) != 0) {
679 			rc = -1;
680 			goto out;
681 		}
682 	}
683 out:
684 	return rc;
685 }
686 
687 static void record__read_auxtrace_snapshot(struct record *rec, bool on_exit)
688 {
689 	pr_debug("Recording AUX area tracing snapshot\n");
690 	if (record__auxtrace_read_snapshot_all(rec) < 0) {
691 		trigger_error(&auxtrace_snapshot_trigger);
692 	} else {
693 		if (auxtrace_record__snapshot_finish(rec->itr, on_exit))
694 			trigger_error(&auxtrace_snapshot_trigger);
695 		else
696 			trigger_ready(&auxtrace_snapshot_trigger);
697 	}
698 }
699 
700 static int record__auxtrace_snapshot_exit(struct record *rec)
701 {
702 	if (trigger_is_error(&auxtrace_snapshot_trigger))
703 		return 0;
704 
705 	if (!auxtrace_record__snapshot_started &&
706 	    auxtrace_record__snapshot_start(rec->itr))
707 		return -1;
708 
709 	record__read_auxtrace_snapshot(rec, true);
710 	if (trigger_is_error(&auxtrace_snapshot_trigger))
711 		return -1;
712 
713 	return 0;
714 }
715 
716 static int record__auxtrace_init(struct record *rec)
717 {
718 	int err;
719 
720 	if (!rec->itr) {
721 		rec->itr = auxtrace_record__init(rec->evlist, &err);
722 		if (err)
723 			return err;
724 	}
725 
726 	err = auxtrace_parse_snapshot_options(rec->itr, &rec->opts,
727 					      rec->opts.auxtrace_snapshot_opts);
728 	if (err)
729 		return err;
730 
731 	err = auxtrace_parse_sample_options(rec->itr, rec->evlist, &rec->opts,
732 					    rec->opts.auxtrace_sample_opts);
733 	if (err)
734 		return err;
735 
736 	auxtrace_regroup_aux_output(rec->evlist);
737 
738 	return auxtrace_parse_filters(rec->evlist);
739 }
740 
741 #else
742 
743 static inline
744 int record__auxtrace_mmap_read(struct record *rec __maybe_unused,
745 			       struct mmap *map __maybe_unused)
746 {
747 	return 0;
748 }
749 
750 static inline
751 void record__read_auxtrace_snapshot(struct record *rec __maybe_unused,
752 				    bool on_exit __maybe_unused)
753 {
754 }
755 
756 static inline
757 int auxtrace_record__snapshot_start(struct auxtrace_record *itr __maybe_unused)
758 {
759 	return 0;
760 }
761 
762 static inline
763 int record__auxtrace_snapshot_exit(struct record *rec __maybe_unused)
764 {
765 	return 0;
766 }
767 
768 static int record__auxtrace_init(struct record *rec __maybe_unused)
769 {
770 	return 0;
771 }
772 
773 #endif
774 
775 static int record__config_text_poke(struct evlist *evlist)
776 {
777 	struct evsel *evsel;
778 	int err;
779 
780 	/* Nothing to do if text poke is already configured */
781 	evlist__for_each_entry(evlist, evsel) {
782 		if (evsel->core.attr.text_poke)
783 			return 0;
784 	}
785 
786 	err = parse_events(evlist, "dummy:u", NULL);
787 	if (err)
788 		return err;
789 
790 	evsel = evlist__last(evlist);
791 
792 	evsel->core.attr.freq = 0;
793 	evsel->core.attr.sample_period = 1;
794 	evsel->core.attr.text_poke = 1;
795 	evsel->core.attr.ksymbol = 1;
796 
797 	evsel->core.system_wide = true;
798 	evsel->no_aux_samples = true;
799 	evsel->immediate = true;
800 
801 	/* Text poke must be collected on all CPUs */
802 	perf_cpu_map__put(evsel->core.own_cpus);
803 	evsel->core.own_cpus = perf_cpu_map__new(NULL);
804 	perf_cpu_map__put(evsel->core.cpus);
805 	evsel->core.cpus = perf_cpu_map__get(evsel->core.own_cpus);
806 
807 	evsel__set_sample_bit(evsel, TIME);
808 
809 	return 0;
810 }
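/*
 * The dummy:u event configured above records PERF_RECORD_TEXT_POKE and
 * PERF_RECORD_KSYMBOL system wide, so later decoding (typically together
 * with --kcore and AUX area tracing such as Intel PT) can follow kernel
 * text modifications, e.g. jump labels or ftrace patching (examples, not
 * an exhaustive list).
 */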
811 
812 static bool record__kcore_readable(struct machine *machine)
813 {
814 	char kcore[PATH_MAX];
815 	int fd;
816 
817 	scnprintf(kcore, sizeof(kcore), "%s/proc/kcore", machine->root_dir);
818 
819 	fd = open(kcore, O_RDONLY);
820 	if (fd < 0)
821 		return false;
822 
823 	close(fd);
824 
825 	return true;
826 }
827 
828 static int record__kcore_copy(struct machine *machine, struct perf_data *data)
829 {
830 	char from_dir[PATH_MAX];
831 	char kcore_dir[PATH_MAX];
832 	int ret;
833 
834 	snprintf(from_dir, sizeof(from_dir), "%s/proc", machine->root_dir);
835 
836 	ret = perf_data__make_kcore_dir(data, kcore_dir, sizeof(kcore_dir));
837 	if (ret)
838 		return ret;
839 
840 	return kcore_copy(from_dir, kcore_dir);
841 }
842 
843 static int record__mmap_evlist(struct record *rec,
844 			       struct evlist *evlist)
845 {
846 	struct record_opts *opts = &rec->opts;
847 	bool auxtrace_overwrite = opts->auxtrace_snapshot_mode ||
848 				  opts->auxtrace_sample_mode;
849 	char msg[512];
850 
851 	if (opts->affinity != PERF_AFFINITY_SYS)
852 		cpu__setup_cpunode_map();
853 
854 	if (evlist__mmap_ex(evlist, opts->mmap_pages,
855 				 opts->auxtrace_mmap_pages,
856 				 auxtrace_overwrite,
857 				 opts->nr_cblocks, opts->affinity,
858 				 opts->mmap_flush, opts->comp_level) < 0) {
859 		if (errno == EPERM) {
860 			pr_err("Permission error mapping pages.\n"
861 			       "Consider increasing "
862 			       "/proc/sys/kernel/perf_event_mlock_kb,\n"
863 			       "or try again with a smaller value of -m/--mmap_pages.\n"
864 			       "(current value: %u,%u)\n",
865 			       opts->mmap_pages, opts->auxtrace_mmap_pages);
866 			return -errno;
867 		} else {
868 			pr_err("failed to mmap with %d (%s)\n", errno,
869 				str_error_r(errno, msg, sizeof(msg)));
870 			if (errno)
871 				return -errno;
872 			else
873 				return -EINVAL;
874 		}
875 	}
876 	return 0;
877 }
878 
879 static int record__mmap(struct record *rec)
880 {
881 	return record__mmap_evlist(rec, rec->evlist);
882 }
883 
884 static int record__open(struct record *rec)
885 {
886 	char msg[BUFSIZ];
887 	struct evsel *pos;
888 	struct evlist *evlist = rec->evlist;
889 	struct perf_session *session = rec->session;
890 	struct record_opts *opts = &rec->opts;
891 	int rc = 0;
892 
893 	/*
894 	 * For initial_delay, system wide, or a hybrid system, we need to add a
895 	 * dummy event so that we can track PERF_RECORD_MMAP events during the
896 	 * initial delay or during event synthesis.
897 	 */
898 	if (opts->initial_delay || target__has_cpu(&opts->target) ||
899 	    perf_pmu__has_hybrid()) {
900 		pos = evlist__get_tracking_event(evlist);
901 		if (!evsel__is_dummy_event(pos)) {
902 			/* Set up dummy event. */
903 			if (evlist__add_dummy(evlist))
904 				return -ENOMEM;
905 			pos = evlist__last(evlist);
906 			evlist__set_tracking_event(evlist, pos);
907 		}
908 
909 		/*
910 		 * Enable the dummy event when the process is forked for
911 		 * initial_delay, or immediately for system wide.
912 		 */
913 		if (opts->initial_delay && !pos->immediate)
914 			pos->core.attr.enable_on_exec = 1;
915 		else
916 			pos->immediate = 1;
917 	}
918 
919 	evlist__config(evlist, opts, &callchain_param);
920 
921 	evlist__for_each_entry(evlist, pos) {
922 try_again:
923 		if (evsel__open(pos, pos->core.cpus, pos->core.threads) < 0) {
924 			if (evsel__fallback(pos, errno, msg, sizeof(msg))) {
925 				if (verbose > 0)
926 					ui__warning("%s\n", msg);
927 				goto try_again;
928 			}
929 			if ((errno == EINVAL || errno == EBADF) &&
930 			    pos->core.leader != &pos->core &&
931 			    pos->weak_group) {
932 				pos = evlist__reset_weak_group(evlist, pos, true);
933 				goto try_again;
934 			}
935 			rc = -errno;
936 			evsel__open_strerror(pos, &opts->target, errno, msg, sizeof(msg));
937 			ui__error("%s\n", msg);
938 			goto out;
939 		}
940 
941 		pos->supported = true;
942 	}
943 
944 	if (symbol_conf.kptr_restrict && !evlist__exclude_kernel(evlist)) {
945 		pr_warning(
946 "WARNING: Kernel address maps (/proc/{kallsyms,modules}) are restricted,\n"
947 "check /proc/sys/kernel/kptr_restrict and /proc/sys/kernel/perf_event_paranoid.\n\n"
948 "Samples in kernel functions may not be resolved if a suitable vmlinux\n"
949 "file is not found in the buildid cache or in the vmlinux path.\n\n"
950 "Samples in kernel modules won't be resolved at all.\n\n"
951 "If some relocation was applied (e.g. kexec) symbols may be misresolved\n"
952 "even with a suitable vmlinux or kallsyms file.\n\n");
953 	}
954 
955 	if (evlist__apply_filters(evlist, &pos)) {
956 		pr_err("failed to set filter \"%s\" on event %s with %d (%s)\n",
957 			pos->filter, evsel__name(pos), errno,
958 			str_error_r(errno, msg, sizeof(msg)));
959 		rc = -1;
960 		goto out;
961 	}
962 
963 	rc = record__mmap(rec);
964 	if (rc)
965 		goto out;
966 
967 	session->evlist = evlist;
968 	perf_session__set_id_hdr_size(session);
969 out:
970 	return rc;
971 }
972 
973 static void set_timestamp_boundary(struct record *rec, u64 sample_time)
974 {
975 	if (rec->evlist->first_sample_time == 0)
976 		rec->evlist->first_sample_time = sample_time;
977 
978 	if (sample_time)
979 		rec->evlist->last_sample_time = sample_time;
980 }
981 
982 static int process_sample_event(struct perf_tool *tool,
983 				union perf_event *event,
984 				struct perf_sample *sample,
985 				struct evsel *evsel,
986 				struct machine *machine)
987 {
988 	struct record *rec = container_of(tool, struct record, tool);
989 
990 	set_timestamp_boundary(rec, sample->time);
991 
992 	if (rec->buildid_all)
993 		return 0;
994 
995 	rec->samples++;
996 	return build_id__mark_dso_hit(tool, event, sample, evsel, machine);
997 }
998 
999 static int process_buildids(struct record *rec)
1000 {
1001 	struct perf_session *session = rec->session;
1002 
1003 	if (perf_data__size(&rec->data) == 0)
1004 		return 0;
1005 
1006 	/*
1007 	 * During this process, it'll load the kernel map and replace the
1008 	 * dso->long_name with the real pathname it found.  In this case
1009 	 * we prefer a vmlinux path like
1010 	 *   /lib/modules/3.16.4/build/vmlinux
1011 	 *
1012 	 * rather than a build-id path (in the debug directory):
1013 	 *   $HOME/.debug/.build-id/f0/6e17aa50adf4d00b88925e03775de107611551
1014 	 */
1015 	symbol_conf.ignore_vmlinux_buildid = true;
1016 
1017 	/*
1018 	 * If --buildid-all is given, it marks all DSOs regardless of hits,
1019 	 * so there is no need to process samples. But if timestamp_boundary is
1020 	 * enabled, it still needs to walk all samples to get the timestamps of
1021 	 * the first/last samples.
1022 	 */
1023 	if (rec->buildid_all && !rec->timestamp_boundary)
1024 		rec->tool.sample = NULL;
1025 
1026 	return perf_session__process_events(session);
1027 }
1028 
1029 static void perf_event__synthesize_guest_os(struct machine *machine, void *data)
1030 {
1031 	int err;
1032 	struct perf_tool *tool = data;
1033 	/*
1034 	 * For the guest kernel, when processing the record & report subcommands,
1035 	 * we arrange the module mmaps prior to the guest kernel mmap and trigger
1036 	 * a DSO preload, because by default guest module symbols are loaded
1037 	 * from guest kallsyms instead of /lib/modules/XXX/XXX. This avoids
1038 	 * missing symbols when the first address is in a module instead of
1039 	 * in the guest kernel.
1040 	 */
1041 	err = perf_event__synthesize_modules(tool, process_synthesized_event,
1042 					     machine);
1043 	if (err < 0)
1044 		pr_err("Couldn't record guest kernel [%d]'s reference"
1045 		       " relocation symbol.\n", machine->pid);
1046 
1047 	/*
1048 	 * We use _stext for the guest kernel because the guest kernel's
1049 	 * /proc/kallsyms sometimes has no _text.
1050 	 */
1051 	err = perf_event__synthesize_kernel_mmap(tool, process_synthesized_event,
1052 						 machine);
1053 	if (err < 0)
1054 		pr_err("Couldn't record guest kernel [%d]'s reference"
1055 		       " relocation symbol.\n", machine->pid);
1056 }
1057 
1058 static struct perf_event_header finished_round_event = {
1059 	.size = sizeof(struct perf_event_header),
1060 	.type = PERF_RECORD_FINISHED_ROUND,
1061 };
1062 
1063 static void record__adjust_affinity(struct record *rec, struct mmap *map)
1064 {
1065 	if (rec->opts.affinity != PERF_AFFINITY_SYS &&
1066 	    !bitmap_equal(rec->affinity_mask.bits, map->affinity_mask.bits,
1067 			  rec->affinity_mask.nbits)) {
1068 		bitmap_zero(rec->affinity_mask.bits, rec->affinity_mask.nbits);
1069 		bitmap_or(rec->affinity_mask.bits, rec->affinity_mask.bits,
1070 			  map->affinity_mask.bits, rec->affinity_mask.nbits);
1071 		sched_setaffinity(0, MMAP_CPU_MASK_BYTES(&rec->affinity_mask),
1072 				  (cpu_set_t *)rec->affinity_mask.bits);
1073 		if (verbose == 2)
1074 			mmap_cpu_mask__scnprintf(&rec->affinity_mask, "thread");
1075 	}
1076 }
1077 
1078 static size_t process_comp_header(void *record, size_t increment)
1079 {
1080 	struct perf_record_compressed *event = record;
1081 	size_t size = sizeof(*event);
1082 
1083 	if (increment) {
1084 		event->header.size += increment;
1085 		return increment;
1086 	}
1087 
1088 	event->header.type = PERF_RECORD_COMPRESSED;
1089 	event->header.size = size;
1090 
1091 	return size;
1092 }
1093 
1094 static size_t zstd_compress(struct perf_session *session, void *dst, size_t dst_size,
1095 			    void *src, size_t src_size)
1096 {
1097 	size_t compressed;
1098 	size_t max_record_size = PERF_SAMPLE_MAX_SIZE - sizeof(struct perf_record_compressed) - 1;
1099 
1100 	compressed = zstd_compress_stream_to_records(&session->zstd_data, dst, dst_size, src, src_size,
1101 						     max_record_size, process_comp_header);
1102 
1103 	session->bytes_transferred += src_size;
1104 	session->bytes_compressed  += compressed;
1105 
1106 	return compressed;
1107 }
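/*
 * The bytes_transferred/bytes_compressed counters updated above feed the
 * compression summary printed when the session ends, roughly of the form
 * "[ perf record: ... compressed (original X MB, ratio is N.nnn) ]"
 * (the exact wording is approximate).
 */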
1108 
1109 static int record__mmap_read_evlist(struct record *rec, struct evlist *evlist,
1110 				    bool overwrite, bool synch)
1111 {
1112 	u64 bytes_written = rec->bytes_written;
1113 	int i;
1114 	int rc = 0;
1115 	struct mmap *maps;
1116 	int trace_fd = rec->data.file.fd;
1117 	off_t off = 0;
1118 
1119 	if (!evlist)
1120 		return 0;
1121 
1122 	maps = overwrite ? evlist->overwrite_mmap : evlist->mmap;
1123 	if (!maps)
1124 		return 0;
1125 
1126 	if (overwrite && evlist->bkw_mmap_state != BKW_MMAP_DATA_PENDING)
1127 		return 0;
1128 
1129 	if (record__aio_enabled(rec))
1130 		off = record__aio_get_pos(trace_fd);
1131 
1132 	for (i = 0; i < evlist->core.nr_mmaps; i++) {
1133 		u64 flush = 0;
1134 		struct mmap *map = &maps[i];
1135 
1136 		if (map->core.base) {
1137 			record__adjust_affinity(rec, map);
1138 			if (synch) {
1139 				flush = map->core.flush;
1140 				map->core.flush = 1;
1141 			}
1142 			if (!record__aio_enabled(rec)) {
1143 				if (perf_mmap__push(map, rec, record__pushfn) < 0) {
1144 					if (synch)
1145 						map->core.flush = flush;
1146 					rc = -1;
1147 					goto out;
1148 				}
1149 			} else {
1150 				if (record__aio_push(rec, map, &off) < 0) {
1151 					record__aio_set_pos(trace_fd, off);
1152 					if (synch)
1153 						map->core.flush = flush;
1154 					rc = -1;
1155 					goto out;
1156 				}
1157 			}
1158 			if (synch)
1159 				map->core.flush = flush;
1160 		}
1161 
1162 		if (map->auxtrace_mmap.base && !rec->opts.auxtrace_snapshot_mode &&
1163 		    !rec->opts.auxtrace_sample_mode &&
1164 		    record__auxtrace_mmap_read(rec, map) != 0) {
1165 			rc = -1;
1166 			goto out;
1167 		}
1168 	}
1169 
1170 	if (record__aio_enabled(rec))
1171 		record__aio_set_pos(trace_fd, off);
1172 
1173 	/*
1174 	 * Mark the round finished in case we wrote
1175 	 * at least one event.
1176 	 */
1177 	if (bytes_written != rec->bytes_written)
1178 		rc = record__write(rec, NULL, &finished_round_event, sizeof(finished_round_event));
1179 
1180 	if (overwrite)
1181 		evlist__toggle_bkw_mmap(evlist, BKW_MMAP_EMPTY);
1182 out:
1183 	return rc;
1184 }
1185 
1186 static int record__mmap_read_all(struct record *rec, bool synch)
1187 {
1188 	int err;
1189 
1190 	err = record__mmap_read_evlist(rec, rec->evlist, false, synch);
1191 	if (err)
1192 		return err;
1193 
1194 	return record__mmap_read_evlist(rec, rec->evlist, true, synch);
1195 }
1196 
1197 static void record__init_features(struct record *rec)
1198 {
1199 	struct perf_session *session = rec->session;
1200 	int feat;
1201 
1202 	for (feat = HEADER_FIRST_FEATURE; feat < HEADER_LAST_FEATURE; feat++)
1203 		perf_header__set_feat(&session->header, feat);
1204 
1205 	if (rec->no_buildid)
1206 		perf_header__clear_feat(&session->header, HEADER_BUILD_ID);
1207 
1208 	if (!have_tracepoints(&rec->evlist->core.entries))
1209 		perf_header__clear_feat(&session->header, HEADER_TRACING_DATA);
1210 
1211 	if (!rec->opts.branch_stack)
1212 		perf_header__clear_feat(&session->header, HEADER_BRANCH_STACK);
1213 
1214 	if (!rec->opts.full_auxtrace)
1215 		perf_header__clear_feat(&session->header, HEADER_AUXTRACE);
1216 
1217 	if (!(rec->opts.use_clockid && rec->opts.clockid_res_ns))
1218 		perf_header__clear_feat(&session->header, HEADER_CLOCKID);
1219 
1220 	if (!rec->opts.use_clockid)
1221 		perf_header__clear_feat(&session->header, HEADER_CLOCK_DATA);
1222 
1223 	perf_header__clear_feat(&session->header, HEADER_DIR_FORMAT);
1224 	if (!record__comp_enabled(rec))
1225 		perf_header__clear_feat(&session->header, HEADER_COMPRESSED);
1226 
1227 	perf_header__clear_feat(&session->header, HEADER_STAT);
1228 }
1229 
1230 static void
1231 record__finish_output(struct record *rec)
1232 {
1233 	struct perf_data *data = &rec->data;
1234 	int fd = perf_data__fd(data);
1235 
1236 	if (data->is_pipe)
1237 		return;
1238 
1239 	rec->session->header.data_size += rec->bytes_written;
1240 	data->file.size = lseek(perf_data__fd(data), 0, SEEK_CUR);
1241 
1242 	if (!rec->no_buildid) {
1243 		process_buildids(rec);
1244 
1245 		if (rec->buildid_all)
1246 			dsos__hit_all(rec->session);
1247 	}
1248 	perf_session__write_header(rec->session, rec->evlist, fd, true);
1249 
1250 	return;
1251 }
1252 
1253 static int record__synthesize_workload(struct record *rec, bool tail)
1254 {
1255 	int err;
1256 	struct perf_thread_map *thread_map;
1257 
1258 	if (rec->opts.tail_synthesize != tail)
1259 		return 0;
1260 
1261 	thread_map = thread_map__new_by_tid(rec->evlist->workload.pid);
1262 	if (thread_map == NULL)
1263 		return -1;
1264 
1265 	err = perf_event__synthesize_thread_map(&rec->tool, thread_map,
1266 						 process_synthesized_event,
1267 						 &rec->session->machines.host,
1268 						 rec->opts.sample_address);
1269 	perf_thread_map__put(thread_map);
1270 	return err;
1271 }
1272 
1273 static int record__synthesize(struct record *rec, bool tail);
1274 
1275 static int
1276 record__switch_output(struct record *rec, bool at_exit)
1277 {
1278 	struct perf_data *data = &rec->data;
1279 	int fd, err;
1280 	char *new_filename;
1281 
1282 	/* Same size as a timestamp, e.g. "2015122520103046" */
1283 	char timestamp[] = "InvalidTimestamp";
1284 
1285 	record__aio_mmap_read_sync(rec);
1286 
1287 	record__synthesize(rec, true);
1288 	if (target__none(&rec->opts.target))
1289 		record__synthesize_workload(rec, true);
1290 
1291 	rec->samples = 0;
1292 	record__finish_output(rec);
1293 	err = fetch_current_timestamp(timestamp, sizeof(timestamp));
1294 	if (err) {
1295 		pr_err("Failed to get current timestamp\n");
1296 		return -EINVAL;
1297 	}
1298 
1299 	fd = perf_data__switch(data, timestamp,
1300 				    rec->session->header.data_offset,
1301 				    at_exit, &new_filename);
1302 	if (fd >= 0 && !at_exit) {
1303 		rec->bytes_written = 0;
1304 		rec->session->header.data_size = 0;
1305 	}
1306 
1307 	if (!quiet)
1308 		fprintf(stderr, "[ perf record: Dump %s.%s ]\n",
1309 			data->path, timestamp);
1310 
1311 	if (rec->switch_output.num_files) {
1312 		int n = rec->switch_output.cur_file + 1;
1313 
1314 		if (n >= rec->switch_output.num_files)
1315 			n = 0;
1316 		rec->switch_output.cur_file = n;
1317 		if (rec->switch_output.filenames[n]) {
1318 			remove(rec->switch_output.filenames[n]);
1319 			zfree(&rec->switch_output.filenames[n]);
1320 		}
1321 		rec->switch_output.filenames[n] = new_filename;
1322 	} else {
1323 		free(new_filename);
1324 	}
1325 
1326 	/* Output tracking events */
1327 	if (!at_exit) {
1328 		record__synthesize(rec, false);
1329 
1330 		/*
1331 		 * In 'perf record --switch-output' without -a,
1332 		 * record__synthesize() in record__switch_output() won't
1333 		 * generate tracking events because there's no thread_map
1334 		 * in the evlist, so the newly created perf.data wouldn't
1335 		 * contain map and comm information.
1336 		 * Create a fake thread_map and call
1337 		 * perf_event__synthesize_thread_map() directly for those events.
1338 		 */
1339 		if (target__none(&rec->opts.target))
1340 			record__synthesize_workload(rec, false);
1341 	}
1342 	return fd;
1343 }
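/*
 * Rotated outputs are named after the data file plus a timestamp, e.g.
 * perf.data.2015122520103046; with --switch-max-files=N only the most
 * recent N such files are kept, older ones being removed in the loop above
 * (filename and option name are examples per the perf-record documentation).
 */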
1344 
1345 static volatile int workload_exec_errno;
1346 
1347 /*
1348  * evlist__prepare_workload will send a SIGUSR1
1349  * if the fork fails, since we asked for it by setting its
1350  * want_signal to true.
1351  */
1352 static void workload_exec_failed_signal(int signo __maybe_unused,
1353 					siginfo_t *info,
1354 					void *ucontext __maybe_unused)
1355 {
1356 	workload_exec_errno = info->si_value.sival_int;
1357 	done = 1;
1358 	child_finished = 1;
1359 }
1360 
1361 static void snapshot_sig_handler(int sig);
1362 static void alarm_sig_handler(int sig);
1363 
1364 static const struct perf_event_mmap_page *evlist__pick_pc(struct evlist *evlist)
1365 {
1366 	if (evlist) {
1367 		if (evlist->mmap && evlist->mmap[0].core.base)
1368 			return evlist->mmap[0].core.base;
1369 		if (evlist->overwrite_mmap && evlist->overwrite_mmap[0].core.base)
1370 			return evlist->overwrite_mmap[0].core.base;
1371 	}
1372 	return NULL;
1373 }
1374 
1375 static const struct perf_event_mmap_page *record__pick_pc(struct record *rec)
1376 {
1377 	const struct perf_event_mmap_page *pc = evlist__pick_pc(rec->evlist);
1378 	if (pc)
1379 		return pc;
1380 	return NULL;
1381 }
1382 
1383 static int record__synthesize(struct record *rec, bool tail)
1384 {
1385 	struct perf_session *session = rec->session;
1386 	struct machine *machine = &session->machines.host;
1387 	struct perf_data *data = &rec->data;
1388 	struct record_opts *opts = &rec->opts;
1389 	struct perf_tool *tool = &rec->tool;
1390 	int fd = perf_data__fd(data);
1391 	int err = 0;
1392 	event_op f = process_synthesized_event;
1393 
1394 	if (rec->opts.tail_synthesize != tail)
1395 		return 0;
1396 
1397 	if (data->is_pipe) {
1398 		/*
1399 		 * We need to synthesize events first, because some
1400 		 * features work on top of them (on the report side).
1401 		 */
1402 		err = perf_event__synthesize_attrs(tool, rec->evlist,
1403 						   process_synthesized_event);
1404 		if (err < 0) {
1405 			pr_err("Couldn't synthesize attrs.\n");
1406 			goto out;
1407 		}
1408 
1409 		err = perf_event__synthesize_features(tool, session, rec->evlist,
1410 						      process_synthesized_event);
1411 		if (err < 0) {
1412 			pr_err("Couldn't synthesize features.\n");
1413 			return err;
1414 		}
1415 
1416 		if (have_tracepoints(&rec->evlist->core.entries)) {
1417 			/*
1418 			 * FIXME err <= 0 here actually means that
1419 			 * there were no tracepoints, so it's not really
1420 			 * an error, just that we don't need to
1421 			 * synthesize anything.  We really have to
1422 			 * return this more properly and also
1423 			 * propagate errors that currently call die()
1424 			 */
1425 			err = perf_event__synthesize_tracing_data(tool,	fd, rec->evlist,
1426 								  process_synthesized_event);
1427 			if (err <= 0) {
1428 				pr_err("Couldn't record tracing data.\n");
1429 				goto out;
1430 			}
1431 			rec->bytes_written += err;
1432 		}
1433 	}
1434 
1435 	err = perf_event__synth_time_conv(record__pick_pc(rec), tool,
1436 					  process_synthesized_event, machine);
1437 	if (err)
1438 		goto out;
1439 
1440 	/* Synthesize id_index before auxtrace_info */
1441 	if (rec->opts.auxtrace_sample_mode) {
1442 		err = perf_event__synthesize_id_index(tool,
1443 						      process_synthesized_event,
1444 						      session->evlist, machine);
1445 		if (err)
1446 			goto out;
1447 	}
1448 
1449 	if (rec->opts.full_auxtrace) {
1450 		err = perf_event__synthesize_auxtrace_info(rec->itr, tool,
1451 					session, process_synthesized_event);
1452 		if (err)
1453 			goto out;
1454 	}
1455 
1456 	if (!evlist__exclude_kernel(rec->evlist)) {
1457 		err = perf_event__synthesize_kernel_mmap(tool, process_synthesized_event,
1458 							 machine);
1459 		WARN_ONCE(err < 0, "Couldn't record kernel reference relocation symbol\n"
1460 				   "Symbol resolution may be skewed if relocation was used (e.g. kexec).\n"
1461 				   "Check /proc/kallsyms permission or run as root.\n");
1462 
1463 		err = perf_event__synthesize_modules(tool, process_synthesized_event,
1464 						     machine);
1465 		WARN_ONCE(err < 0, "Couldn't record kernel module information.\n"
1466 				   "Symbol resolution may be skewed if relocation was used (e.g. kexec).\n"
1467 				   "Check /proc/modules permission or run as root.\n");
1468 	}
1469 
1470 	if (perf_guest) {
1471 		machines__process_guests(&session->machines,
1472 					 perf_event__synthesize_guest_os, tool);
1473 	}
1474 
1475 	err = perf_event__synthesize_extra_attr(&rec->tool,
1476 						rec->evlist,
1477 						process_synthesized_event,
1478 						data->is_pipe);
1479 	if (err)
1480 		goto out;
1481 
1482 	err = perf_event__synthesize_thread_map2(&rec->tool, rec->evlist->core.threads,
1483 						 process_synthesized_event,
1484 						NULL);
1485 	if (err < 0) {
1486 		pr_err("Couldn't synthesize thread map.\n");
1487 		return err;
1488 	}
1489 
1490 	err = perf_event__synthesize_cpu_map(&rec->tool, rec->evlist->core.cpus,
1491 					     process_synthesized_event, NULL);
1492 	if (err < 0) {
1493 		pr_err("Couldn't synthesize cpu map.\n");
1494 		return err;
1495 	}
1496 
1497 	err = perf_event__synthesize_bpf_events(session, process_synthesized_event,
1498 						machine, opts);
1499 	if (err < 0)
1500 		pr_warning("Couldn't synthesize bpf events.\n");
1501 
1502 	err = perf_event__synthesize_cgroups(tool, process_synthesized_event,
1503 					     machine);
1504 	if (err < 0)
1505 		pr_warning("Couldn't synthesize cgroup events.\n");
1506 
1507 	if (rec->opts.nr_threads_synthesize > 1) {
1508 		perf_set_multithreaded();
1509 		f = process_locked_synthesized_event;
1510 	}
1511 
1512 	err = __machine__synthesize_threads(machine, tool, &opts->target, rec->evlist->core.threads,
1513 					    f, opts->sample_address,
1514 					    rec->opts.nr_threads_synthesize);
1515 
1516 	if (rec->opts.nr_threads_synthesize > 1)
1517 		perf_set_singlethreaded();
1518 
1519 out:
1520 	return err;
1521 }
1522 
1523 static int record__process_signal_event(union perf_event *event __maybe_unused, void *data)
1524 {
1525 	struct record *rec = data;
1526 	pthread_kill(rec->thread_id, SIGUSR2);
1527 	return 0;
1528 }
1529 
1530 static int record__setup_sb_evlist(struct record *rec)
1531 {
1532 	struct record_opts *opts = &rec->opts;
1533 
1534 	if (rec->sb_evlist != NULL) {
1535 		/*
1536 		 * We get here if --switch-output-event populated the
1537 		 * sb_evlist, so associate a callback that will send a SIGUSR2
1538 		 * to the main thread.
1539 		 */
1540 		evlist__set_cb(rec->sb_evlist, record__process_signal_event, rec);
1541 		rec->thread_id = pthread_self();
1542 	}
1543 #ifdef HAVE_LIBBPF_SUPPORT
1544 	if (!opts->no_bpf_event) {
1545 		if (rec->sb_evlist == NULL) {
1546 			rec->sb_evlist = evlist__new();
1547 
1548 			if (rec->sb_evlist == NULL) {
1549 				pr_err("Couldn't create side band evlist.\n");
1550 				return -1;
1551 			}
1552 		}
1553 
1554 		if (evlist__add_bpf_sb_event(rec->sb_evlist, &rec->session->header.env)) {
1555 			pr_err("Couldn't ask for PERF_RECORD_BPF_EVENT side band events.\n");
1556 			return -1;
1557 		}
1558 	}
1559 #endif
1560 	if (evlist__start_sb_thread(rec->sb_evlist, &rec->opts.target)) {
1561 		pr_debug("Couldn't start the BPF side band thread:\nBPF programs starting from now on won't be annotatable\n");
1562 		opts->no_bpf_event = true;
1563 	}
1564 
1565 	return 0;
1566 }
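/*
 * Two users of the side-band evlist meet here: --switch-output-event=<event>,
 * which signals the main thread via SIGUSR2 when the event fires, and, with
 * libbpf support, the PERF_RECORD_BPF_EVENT tracking that lets later
 * annotation resolve BPF programs loaded while recording.
 */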
1567 
1568 static int record__init_clock(struct record *rec)
1569 {
1570 	struct perf_session *session = rec->session;
1571 	struct timespec ref_clockid;
1572 	struct timeval ref_tod;
1573 	u64 ref;
1574 
1575 	if (!rec->opts.use_clockid)
1576 		return 0;
1577 
1578 	if (rec->opts.use_clockid && rec->opts.clockid_res_ns)
1579 		session->header.env.clock.clockid_res_ns = rec->opts.clockid_res_ns;
1580 
1581 	session->header.env.clock.clockid = rec->opts.clockid;
1582 
1583 	if (gettimeofday(&ref_tod, NULL) != 0) {
1584 		pr_err("gettimeofday failed, cannot set reference time.\n");
1585 		return -1;
1586 	}
1587 
1588 	if (clock_gettime(rec->opts.clockid, &ref_clockid)) {
1589 		pr_err("clock_gettime failed, cannot set reference time.\n");
1590 		return -1;
1591 	}
1592 
1593 	ref = (u64) ref_tod.tv_sec * NSEC_PER_SEC +
1594 	      (u64) ref_tod.tv_usec * NSEC_PER_USEC;
1595 
1596 	session->header.env.clock.tod_ns = ref;
1597 
1598 	ref = (u64) ref_clockid.tv_sec * NSEC_PER_SEC +
1599 	      (u64) ref_clockid.tv_nsec;
1600 
1601 	session->header.env.clock.clockid_ns = ref;
1602 	return 0;
1603 }
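/*
 * Example: "perf record -k CLOCK_MONOTONIC ..." stores both a gettimeofday()
 * reference and a clock_gettime(CLOCK_MONOTONIC) reference taken back to
 * back, so that report-side tools can map sample timestamps to wall-clock
 * time (the -k/--clockid option name is per the perf-record documentation).
 */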
1604 
1605 static void hit_auxtrace_snapshot_trigger(struct record *rec)
1606 {
1607 	if (trigger_is_ready(&auxtrace_snapshot_trigger)) {
1608 		trigger_hit(&auxtrace_snapshot_trigger);
1609 		auxtrace_record__snapshot_started = 1;
1610 		if (auxtrace_record__snapshot_start(rec->itr))
1611 			trigger_error(&auxtrace_snapshot_trigger);
1612 	}
1613 }
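/*
 * This is the path taken in AUX area snapshot mode, e.g.
 * "perf record -e intel_pt// -S ...", where sending SIGUSR2 (or the
 * "snapshot" control command) captures the current contents of the AUX
 * buffer instead of tracing continuously (event and option are examples).
 */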
1614 
1615 static void record__uniquify_name(struct record *rec)
1616 {
1617 	struct evsel *pos;
1618 	struct evlist *evlist = rec->evlist;
1619 	char *new_name;
1620 	int ret;
1621 
1622 	if (!perf_pmu__has_hybrid())
1623 		return;
1624 
1625 	evlist__for_each_entry(evlist, pos) {
1626 		if (!evsel__is_hybrid(pos))
1627 			continue;
1628 
1629 		if (strchr(pos->name, '/'))
1630 			continue;
1631 
1632 		ret = asprintf(&new_name, "%s/%s/",
1633 			       pos->pmu_name, pos->name);
1634 		if (ret) {
1635 			free(pos->name);
1636 			pos->name = new_name;
1637 		}
1638 	}
1639 }
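/*
 * On a hybrid system this turns an ambiguous name such as "cycles" into a
 * PMU-qualified one, e.g. "cpu_core/cycles/" or "cpu_atom/cycles/", so the
 * two hardware event instances stay distinguishable in the output
 * (the PMU names shown are the ones used on Intel hybrid CPUs and are
 * illustrative).
 */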
1640 
1641 static int __cmd_record(struct record *rec, int argc, const char **argv)
1642 {
1643 	int err;
1644 	int status = 0;
1645 	unsigned long waking = 0;
1646 	const bool forks = argc > 0;
1647 	struct perf_tool *tool = &rec->tool;
1648 	struct record_opts *opts = &rec->opts;
1649 	struct perf_data *data = &rec->data;
1650 	struct perf_session *session;
1651 	bool disabled = false, draining = false;
1652 	int fd;
1653 	float ratio = 0;
1654 	enum evlist_ctl_cmd cmd = EVLIST_CTL_CMD_UNSUPPORTED;
1655 
1656 	atexit(record__sig_exit);
1657 	signal(SIGCHLD, sig_handler);
1658 	signal(SIGINT, sig_handler);
1659 	signal(SIGTERM, sig_handler);
1660 	signal(SIGSEGV, sigsegv_handler);
1661 
1662 	if (rec->opts.record_namespaces)
1663 		tool->namespace_events = true;
1664 
1665 	if (rec->opts.record_cgroup) {
1666 #ifdef HAVE_FILE_HANDLE
1667 		tool->cgroup_events = true;
1668 #else
1669 		pr_err("cgroup tracking is not supported\n");
1670 		return -1;
1671 #endif
1672 	}
1673 
1674 	if (rec->opts.auxtrace_snapshot_mode || rec->switch_output.enabled) {
1675 		signal(SIGUSR2, snapshot_sig_handler);
1676 		if (rec->opts.auxtrace_snapshot_mode)
1677 			trigger_on(&auxtrace_snapshot_trigger);
1678 		if (rec->switch_output.enabled)
1679 			trigger_on(&switch_output_trigger);
1680 	} else {
1681 		signal(SIGUSR2, SIG_IGN);
1682 	}
1683 
1684 	session = perf_session__new(data, false, tool);
1685 	if (IS_ERR(session)) {
1686 		pr_err("Perf session creation failed.\n");
1687 		return PTR_ERR(session);
1688 	}
1689 
1690 	fd = perf_data__fd(data);
1691 	rec->session = session;
1692 
1693 	if (zstd_init(&session->zstd_data, rec->opts.comp_level) < 0) {
1694 		pr_err("Compression initialization failed.\n");
1695 		return -1;
1696 	}
1697 #ifdef HAVE_EVENTFD_SUPPORT
1698 	done_fd = eventfd(0, EFD_NONBLOCK);
1699 	if (done_fd < 0) {
1700 		pr_err("Failed to create wakeup eventfd, error: %m\n");
1701 		status = -1;
1702 		goto out_delete_session;
1703 	}
1704 	err = evlist__add_wakeup_eventfd(rec->evlist, done_fd);
1705 	if (err < 0) {
1706 		pr_err("Failed to add wakeup eventfd to poll list\n");
1707 		status = err;
1708 		goto out_delete_session;
1709 	}
1710 #endif // HAVE_EVENTFD_SUPPORT
1711 
1712 	session->header.env.comp_type  = PERF_COMP_ZSTD;
1713 	session->header.env.comp_level = rec->opts.comp_level;
1714 
1715 	if (rec->opts.kcore &&
1716 	    !record__kcore_readable(&session->machines.host)) {
1717 		pr_err("ERROR: kcore is not readable.\n");
1718 		return -1;
1719 	}
1720 
1721 	if (record__init_clock(rec))
1722 		return -1;
1723 
1724 	record__init_features(rec);
1725 
1726 	if (forks) {
1727 		err = evlist__prepare_workload(rec->evlist, &opts->target, argv, data->is_pipe,
1728 					       workload_exec_failed_signal);
1729 		if (err < 0) {
1730 			pr_err("Couldn't run the workload!\n");
1731 			status = err;
1732 			goto out_delete_session;
1733 		}
1734 	}
1735 
1736 	/*
1737 	 * If we have just a single event and are sending data
1738 	 * through a pipe, we need to force id allocation,
1739 	 * because we synthesize the event name through the pipe
1740 	 * and need the id for that.
1741 	 */
1742 	if (data->is_pipe && rec->evlist->core.nr_entries == 1)
1743 		rec->opts.sample_id = true;
1744 
1745 	record__uniquify_name(rec);
1746 
1747 	if (record__open(rec) != 0) {
1748 		err = -1;
1749 		goto out_child;
1750 	}
1751 	session->header.env.comp_mmap_len = session->evlist->core.mmap_len;
1752 
1753 	if (rec->opts.kcore) {
1754 		err = record__kcore_copy(&session->machines.host, data);
1755 		if (err) {
1756 			pr_err("ERROR: Failed to copy kcore\n");
1757 			goto out_child;
1758 		}
1759 	}
1760 
1761 	err = bpf__apply_obj_config();
1762 	if (err) {
1763 		char errbuf[BUFSIZ];
1764 
1765 		bpf__strerror_apply_obj_config(err, errbuf, sizeof(errbuf));
1766 		pr_err("ERROR: Apply config to BPF failed: %s\n",
1767 			 errbuf);
1768 		goto out_child;
1769 	}
1770 
1771 	/*
1772 	 * Normally perf_session__new would do this, but it doesn't have the
1773 	 * evlist.
1774 	 */
1775 	if (rec->tool.ordered_events && !evlist__sample_id_all(rec->evlist)) {
1776 		pr_warning("WARNING: No sample_id_all support, falling back to unordered processing\n");
1777 		rec->tool.ordered_events = false;
1778 	}
1779 
1780 	if (!rec->evlist->core.nr_groups)
1781 		perf_header__clear_feat(&session->header, HEADER_GROUP_DESC);
1782 
1783 	if (data->is_pipe) {
1784 		err = perf_header__write_pipe(fd);
1785 		if (err < 0)
1786 			goto out_child;
1787 	} else {
1788 		err = perf_session__write_header(session, rec->evlist, fd, false);
1789 		if (err < 0)
1790 			goto out_child;
1791 	}
1792 
1793 	err = -1;
1794 	if (!rec->no_buildid
1795 	    && !perf_header__has_feat(&session->header, HEADER_BUILD_ID)) {
1796 		pr_err("Couldn't generate buildids. "
1797 		       "Use --no-buildid to profile anyway.\n");
1798 		goto out_child;
1799 	}
1800 
1801 	err = record__setup_sb_evlist(rec);
1802 	if (err)
1803 		goto out_child;
1804 
1805 	err = record__synthesize(rec, false);
1806 	if (err < 0)
1807 		goto out_child;
1808 
1809 	if (rec->realtime_prio) {
1810 		struct sched_param param;
1811 
1812 		param.sched_priority = rec->realtime_prio;
1813 		if (sched_setscheduler(0, SCHED_FIFO, &param)) {
1814 			pr_err("Could not set realtime priority.\n");
1815 			err = -1;
1816 			goto out_child;
1817 		}
1818 	}
1819 
1820 	/*
1821 	 * When perf is starting the traced process, all the events
1822 	 * (apart from group members) have enable_on_exec=1 set,
1823 	 * so don't spoil it by prematurely enabling them.
1824 	 */
1825 	if (!target__none(&opts->target) && !opts->initial_delay)
1826 		evlist__enable(rec->evlist);
1827 
1828 	/*
1829 	 * Let the child rip
1830 	 */
1831 	if (forks) {
1832 		struct machine *machine = &session->machines.host;
1833 		union perf_event *event;
1834 		pid_t tgid;
1835 
1836 		event = malloc(sizeof(event->comm) + machine->id_hdr_size);
1837 		if (event == NULL) {
1838 			err = -ENOMEM;
1839 			goto out_child;
1840 		}
1841 
1842 		/*
1843 		 * Some H/W events are generated before the COMM event,
1844 		 * which is emitted during exec(), so perf script
1845 		 * cannot see the correct process name for those events.
1846 		 * Synthesize a COMM event to prevent that.
1847 		 */
1848 		tgid = perf_event__synthesize_comm(tool, event,
1849 						   rec->evlist->workload.pid,
1850 						   process_synthesized_event,
1851 						   machine);
1852 		free(event);
1853 
1854 		if (tgid == -1)
1855 			goto out_child;
1856 
1857 		event = malloc(sizeof(event->namespaces) +
1858 			       (NR_NAMESPACES * sizeof(struct perf_ns_link_info)) +
1859 			       machine->id_hdr_size);
1860 		if (event == NULL) {
1861 			err = -ENOMEM;
1862 			goto out_child;
1863 		}
1864 
1865 		/*
1866 		 * Synthesize NAMESPACES event for the command specified.
1867 		 */
1868 		perf_event__synthesize_namespaces(tool, event,
1869 						  rec->evlist->workload.pid,
1870 						  tgid, process_synthesized_event,
1871 						  machine);
1872 		free(event);
1873 
1874 		evlist__start_workload(rec->evlist);
1875 	}
1876 
1877 	if (evlist__initialize_ctlfd(rec->evlist, opts->ctl_fd, opts->ctl_fd_ack))
1878 		goto out_child;
1879 
1880 	if (opts->initial_delay) {
1881 		pr_info(EVLIST_DISABLED_MSG);
1882 		if (opts->initial_delay > 0) {
1883 			usleep(opts->initial_delay * USEC_PER_MSEC);
1884 			evlist__enable(rec->evlist);
1885 			pr_info(EVLIST_ENABLED_MSG);
1886 		}
1887 	}
1888 
1889 	trigger_ready(&auxtrace_snapshot_trigger);
1890 	trigger_ready(&switch_output_trigger);
1891 	perf_hooks__invoke_record_start();
1892 	for (;;) {
1893 		unsigned long long hits = rec->samples;
1894 
1895 		/*
1896 		 * rec->evlist->bkw_mmap_state can be
1897 		 * BKW_MMAP_EMPTY here: when done == true and
1898 		 * hits != rec->samples in the previous round.
1899 		 *
1900 		 * evlist__toggle_bkw_mmap() ensures we never
1901 		 * convert BKW_MMAP_EMPTY to BKW_MMAP_DATA_PENDING.
1902 		 */
1903 		if (trigger_is_hit(&switch_output_trigger) || done || draining)
1904 			evlist__toggle_bkw_mmap(rec->evlist, BKW_MMAP_DATA_PENDING);
1905 
1906 		if (record__mmap_read_all(rec, false) < 0) {
1907 			trigger_error(&auxtrace_snapshot_trigger);
1908 			trigger_error(&switch_output_trigger);
1909 			err = -1;
1910 			goto out_child;
1911 		}
1912 
1913 		if (auxtrace_record__snapshot_started) {
1914 			auxtrace_record__snapshot_started = 0;
1915 			if (!trigger_is_error(&auxtrace_snapshot_trigger))
1916 				record__read_auxtrace_snapshot(rec, false);
1917 			if (trigger_is_error(&auxtrace_snapshot_trigger)) {
1918 				pr_err("AUX area tracing snapshot failed\n");
1919 				err = -1;
1920 				goto out_child;
1921 			}
1922 		}
1923 
1924 		if (trigger_is_hit(&switch_output_trigger)) {
1925 			/*
1926 			 * If switch_output_trigger is hit, the data in the
1927 			 * overwritable ring buffer should have been collected,
1928 			 * so bkw_mmap_state should be set to BKW_MMAP_EMPTY.
1929 			 *
1930 			 * If SIGUSR2 is raised after or during record__mmap_read_all(),
1931 			 * record__mmap_read_all() didn't collect data from the
1932 			 * overwritable ring buffer. Read again.
1933 			 */
1934 			if (rec->evlist->bkw_mmap_state == BKW_MMAP_RUNNING)
1935 				continue;
1936 			trigger_ready(&switch_output_trigger);
1937 
1938 			/*
1939 			 * Re-enable events in the overwrite ring buffer after
1940 			 * record__mmap_read_all(): we should have collected
1941 			 * data from it.
1942 			 */
1943 			evlist__toggle_bkw_mmap(rec->evlist, BKW_MMAP_RUNNING);
1944 
1945 			if (!quiet)
1946 				fprintf(stderr, "[ perf record: dump data: Woken up %ld times ]\n",
1947 					waking);
1948 			waking = 0;
1949 			fd = record__switch_output(rec, false);
1950 			if (fd < 0) {
1951 				pr_err("Failed to switch to new file\n");
1952 				trigger_error(&switch_output_trigger);
1953 				err = fd;
1954 				goto out_child;
1955 			}
1956 
1957 			/* re-arm the alarm */
1958 			if (rec->switch_output.time)
1959 				alarm(rec->switch_output.time);
1960 		}
1961 
1962 		if (hits == rec->samples) {
1963 			if (done || draining)
1964 				break;
1965 			err = evlist__poll(rec->evlist, -1);
1966 			/*
1967 			 * Propagate error, only if there's any. Ignore positive
1968 			 * number of returned events and interrupt error.
1969 			 */
1970 			if (err > 0 || (err < 0 && errno == EINTR))
1971 				err = 0;
1972 			waking++;
1973 
1974 			if (evlist__filter_pollfd(rec->evlist, POLLERR | POLLHUP) == 0)
1975 				draining = true;
1976 		}
1977 
1978 		if (evlist__ctlfd_process(rec->evlist, &cmd) > 0) {
1979 			switch (cmd) {
1980 			case EVLIST_CTL_CMD_SNAPSHOT:
1981 				hit_auxtrace_snapshot_trigger(rec);
1982 				evlist__ctlfd_ack(rec->evlist);
1983 				break;
1984 			case EVLIST_CTL_CMD_STOP:
1985 				done = 1;
1986 				break;
1987 			case EVLIST_CTL_CMD_ACK:
1988 			case EVLIST_CTL_CMD_UNSUPPORTED:
1989 			case EVLIST_CTL_CMD_ENABLE:
1990 			case EVLIST_CTL_CMD_DISABLE:
1991 			case EVLIST_CTL_CMD_EVLIST:
1992 			case EVLIST_CTL_CMD_PING:
1993 			default:
1994 				break;
1995 			}
1996 		}
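
		/*
		 * Note: these commands arrive over the descriptor or fifo set
		 * up with --control (see parse_control_option() and its help
		 * text): e.g. a 'snapshot' command maps to
		 * EVLIST_CTL_CMD_SNAPSHOT and takes an AUX area tracing
		 * snapshot, while EVLIST_CTL_CMD_STOP ends the session by
		 * setting done.
		 */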
1997 
1998 		/*
1999 		 * When perf is starting the traced process, the events die
2000 		 * with the process at the end and we wait for that. Thus there
2001 		 * is no need to disable the events in this case.
2002 		 */
2003 		if (done && !disabled && !target__none(&opts->target)) {
2004 			trigger_off(&auxtrace_snapshot_trigger);
2005 			evlist__disable(rec->evlist);
2006 			disabled = true;
2007 		}
2008 	}
2009 
2010 	trigger_off(&auxtrace_snapshot_trigger);
2011 	trigger_off(&switch_output_trigger);
2012 
2013 	if (opts->auxtrace_snapshot_on_exit)
2014 		record__auxtrace_snapshot_exit(rec);
2015 
2016 	if (forks && workload_exec_errno) {
2017 		char msg[STRERR_BUFSIZE], strevsels[2048];
2018 		const char *emsg = str_error_r(workload_exec_errno, msg, sizeof(msg));
2019 
2020 		evlist__scnprintf_evsels(rec->evlist, sizeof(strevsels), strevsels);
2021 
2022 		pr_err("Failed to collect '%s' for the '%s' workload: %s\n",
2023 			strevsels, argv[0], emsg);
2024 		err = -1;
2025 		goto out_child;
2026 	}
2027 
2028 	if (!quiet)
2029 		fprintf(stderr, "[ perf record: Woken up %ld times to write data ]\n", waking);
2030 
2031 	if (target__none(&rec->opts.target))
2032 		record__synthesize_workload(rec, true);
2033 
2034 out_child:
2035 	evlist__finalize_ctlfd(rec->evlist);
2036 	record__mmap_read_all(rec, true);
2037 	record__aio_mmap_read_sync(rec);
2038 
2039 	if (rec->session->bytes_transferred && rec->session->bytes_compressed) {
2040 		ratio = (float)rec->session->bytes_transferred/(float)rec->session->bytes_compressed;
2041 		session->header.env.comp_ratio = ratio + 0.5;
2042 	}
2043 
2044 	if (forks) {
2045 		int exit_status;
2046 
2047 		if (!child_finished)
2048 			kill(rec->evlist->workload.pid, SIGTERM);
2049 
2050 		wait(&exit_status);
2051 
2052 		if (err < 0)
2053 			status = err;
2054 		else if (WIFEXITED(exit_status))
2055 			status = WEXITSTATUS(exit_status);
2056 		else if (WIFSIGNALED(exit_status))
2057 			signr = WTERMSIG(exit_status);
2058 	} else
2059 		status = err;
2060 
2061 	record__synthesize(rec, true);
2062 	/* this will be recalculated during process_buildids() */
2063 	rec->samples = 0;
2064 
2065 	if (!err) {
2066 		if (!rec->timestamp_filename) {
2067 			record__finish_output(rec);
2068 		} else {
2069 			fd = record__switch_output(rec, true);
2070 			if (fd < 0) {
2071 				status = fd;
2072 				goto out_delete_session;
2073 			}
2074 		}
2075 	}
2076 
2077 	perf_hooks__invoke_record_end();
2078 
2079 	if (!err && !quiet) {
2080 		char samples[128];
2081 		const char *postfix = rec->timestamp_filename ?
2082 					".<timestamp>" : "";
2083 
2084 		if (rec->samples && !rec->opts.full_auxtrace)
2085 			scnprintf(samples, sizeof(samples),
2086 				  " (%" PRIu64 " samples)", rec->samples);
2087 		else
2088 			samples[0] = '\0';
2089 
2090 		fprintf(stderr,	"[ perf record: Captured and wrote %.3f MB %s%s%s",
2091 			perf_data__size(data) / 1024.0 / 1024.0,
2092 			data->path, postfix, samples);
2093 		if (ratio) {
2094 			fprintf(stderr,	", compressed (original %.3f MB, ratio is %.3f)",
2095 					rec->session->bytes_transferred / 1024.0 / 1024.0,
2096 					ratio);
2097 		}
2098 		fprintf(stderr, " ]\n");
2099 	}
2100 
2101 out_delete_session:
2102 #ifdef HAVE_EVENTFD_SUPPORT
2103 	if (done_fd >= 0)
2104 		close(done_fd);
2105 #endif
2106 	zstd_fini(&session->zstd_data);
2107 	perf_session__delete(session);
2108 
2109 	if (!opts->no_bpf_event)
2110 		evlist__stop_sb_thread(rec->sb_evlist);
2111 	return status;
2112 }
2113 
2114 static void callchain_debug(struct callchain_param *callchain)
2115 {
2116 	static const char *str[CALLCHAIN_MAX] = { "NONE", "FP", "DWARF", "LBR" };
2117 
2118 	pr_debug("callchain: type %s\n", str[callchain->record_mode]);
2119 
2120 	if (callchain->record_mode == CALLCHAIN_DWARF)
2121 		pr_debug("callchain: stack dump size %d\n",
2122 			 callchain->dump_size);
2123 }
2124 
2125 int record_opts__parse_callchain(struct record_opts *record,
2126 				 struct callchain_param *callchain,
2127 				 const char *arg, bool unset)
2128 {
2129 	int ret;
2130 	callchain->enabled = !unset;
2131 
2132 	/* --no-call-graph */
2133 	if (unset) {
2134 		callchain->record_mode = CALLCHAIN_NONE;
2135 		pr_debug("callchain: disabled\n");
2136 		return 0;
2137 	}
2138 
2139 	ret = parse_callchain_record_opt(arg, callchain);
2140 	if (!ret) {
2141 		/* Enable data address sampling for DWARF unwind. */
2142 		if (callchain->record_mode == CALLCHAIN_DWARF)
2143 			record->sample_address = true;
2144 		callchain_debug(callchain);
2145 	}
2146 
2147 	return ret;
2148 }
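
/*
 * Illustrative usage (a sketch, not a complete reference) of the --call-graph
 * argument parsed above, i.e. "record_mode[,record_size]":
 *
 *   perf record --call-graph fp         -- ./workload
 *   perf record --call-graph dwarf,8192 -- ./workload
 *
 * With the "dwarf" record mode, sample_address is also enabled, as done in
 * record_opts__parse_callchain() above.
 */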
2149 
2150 int record_parse_callchain_opt(const struct option *opt,
2151 			       const char *arg,
2152 			       int unset)
2153 {
2154 	return record_opts__parse_callchain(opt->value, &callchain_param, arg, unset);
2155 }
2156 
2157 int record_callchain_opt(const struct option *opt,
2158 			 const char *arg __maybe_unused,
2159 			 int unset __maybe_unused)
2160 {
2161 	struct callchain_param *callchain = opt->value;
2162 
2163 	callchain->enabled = true;
2164 
2165 	if (callchain->record_mode == CALLCHAIN_NONE)
2166 		callchain->record_mode = CALLCHAIN_FP;
2167 
2168 	callchain_debug(callchain);
2169 	return 0;
2170 }
2171 
2172 static int perf_record_config(const char *var, const char *value, void *cb)
2173 {
2174 	struct record *rec = cb;
2175 
2176 	if (!strcmp(var, "record.build-id")) {
2177 		if (!strcmp(value, "cache"))
2178 			rec->no_buildid_cache = false;
2179 		else if (!strcmp(value, "no-cache"))
2180 			rec->no_buildid_cache = true;
2181 		else if (!strcmp(value, "skip"))
2182 			rec->no_buildid = true;
2183 		else if (!strcmp(value, "mmap"))
2184 			rec->buildid_mmap = true;
2185 		else
2186 			return -1;
2187 		return 0;
2188 	}
2189 	if (!strcmp(var, "record.call-graph")) {
2190 		var = "call-graph.record-mode";
2191 		return perf_default_config(var, value, cb);
2192 	}
2193 #ifdef HAVE_AIO_SUPPORT
2194 	if (!strcmp(var, "record.aio")) {
2195 		rec->opts.nr_cblocks = strtol(value, NULL, 0);
2196 		if (!rec->opts.nr_cblocks)
2197 			rec->opts.nr_cblocks = nr_cblocks_default;
2198 	}
2199 #endif
2200 
2201 	return 0;
2202 }
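
/*
 * Illustrative ~/.perfconfig snippet (a sketch) exercising the keys handled in
 * perf_record_config() above:
 *
 *   [record]
 *       build-id = cache    # or: no-cache, skip, mmap
 *       call-graph = dwarf  # forwarded as "call-graph.record-mode"
 *       aio = 4             # only honoured with HAVE_AIO_SUPPORT
 */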
2203 
2204 
2205 static int record__parse_affinity(const struct option *opt, const char *str, int unset)
2206 {
2207 	struct record_opts *opts = (struct record_opts *)opt->value;
2208 
2209 	if (unset || !str)
2210 		return 0;
2211 
2212 	if (!strcasecmp(str, "node"))
2213 		opts->affinity = PERF_AFFINITY_NODE;
2214 	else if (!strcasecmp(str, "cpu"))
2215 		opts->affinity = PERF_AFFINITY_CPU;
2216 
2217 	return 0;
2218 }
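
/*
 * Illustrative usage: --affinity accepts "node" or "cpu" (case-insensitive);
 * anything else keeps the PERF_AFFINITY_SYS default set in cmd_record(), e.g.
 *
 *   perf record --affinity=node -a -- sleep 1
 */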
2219 
2220 static int parse_output_max_size(const struct option *opt,
2221 				 const char *str, int unset)
2222 {
2223 	unsigned long *s = (unsigned long *)opt->value;
2224 	static struct parse_tag tags_size[] = {
2225 		{ .tag  = 'B', .mult = 1       },
2226 		{ .tag  = 'K', .mult = 1 << 10 },
2227 		{ .tag  = 'M', .mult = 1 << 20 },
2228 		{ .tag  = 'G', .mult = 1 << 30 },
2229 		{ .tag  = 0 },
2230 	};
2231 	unsigned long val;
2232 
2233 	if (unset) {
2234 		*s = 0;
2235 		return 0;
2236 	}
2237 
2238 	val = parse_tag_value(str, tags_size);
2239 	if (val != (unsigned long) -1) {
2240 		*s = val;
2241 		return 0;
2242 	}
2243 
2244 	return -1;
2245 }
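
/*
 * Illustrative usage: --max-size takes a size with an optional B/K/M/G suffix,
 * as listed in tags_size[] above, e.g.
 *
 *   perf record --max-size=200M -a -- sleep 10
 */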
2246 
2247 static int record__parse_mmap_pages(const struct option *opt,
2248 				    const char *str,
2249 				    int unset __maybe_unused)
2250 {
2251 	struct record_opts *opts = opt->value;
2252 	char *s, *p;
2253 	unsigned int mmap_pages;
2254 	int ret;
2255 
2256 	if (!str)
2257 		return -EINVAL;
2258 
2259 	s = strdup(str);
2260 	if (!s)
2261 		return -ENOMEM;
2262 
2263 	p = strchr(s, ',');
2264 	if (p)
2265 		*p = '\0';
2266 
2267 	if (*s) {
2268 		ret = __evlist__parse_mmap_pages(&mmap_pages, s);
2269 		if (ret)
2270 			goto out_free;
2271 		opts->mmap_pages = mmap_pages;
2272 	}
2273 
2274 	if (!p) {
2275 		ret = 0;
2276 		goto out_free;
2277 	}
2278 
2279 	ret = __evlist__parse_mmap_pages(&mmap_pages, p + 1);
2280 	if (ret)
2281 		goto out_free;
2282 
2283 	opts->auxtrace_mmap_pages = mmap_pages;
2284 
2285 out_free:
2286 	free(s);
2287 	return ret;
2288 }
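
/*
 * Illustrative usage: -m/--mmap-pages takes "pages[,pages]", where the second
 * value applies to the AUX area tracing mmap, e.g.
 *
 *   perf record -m 512 -- ./workload
 *   perf record -m 512,128 -e <aux-area-event> -- ./workload
 *
 * (<aux-area-event> is a placeholder for any event producing AUX area data.)
 */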
2289 
2290 static int parse_control_option(const struct option *opt,
2291 				const char *str,
2292 				int unset __maybe_unused)
2293 {
2294 	struct record_opts *opts = opt->value;
2295 
2296 	return evlist__parse_control(str, &opts->ctl_fd, &opts->ctl_fd_ack, &opts->ctl_fd_close);
2297 }
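
/*
 * Illustrative usage (a sketch), matching the --control help text below:
 *
 *   mkfifo ctl.fifo ack.fifo
 *   perf record --control=fifo:ctl.fifo,ack.fifo -D -1 -a &
 *   echo enable  > ctl.fifo   # start counting
 *   echo disable > ctl.fifo   # stop counting
 */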
2298 
2299 static void switch_output_size_warn(struct record *rec)
2300 {
2301 	u64 wakeup_size = evlist__mmap_size(rec->opts.mmap_pages);
2302 	struct switch_output *s = &rec->switch_output;
2303 
2304 	wakeup_size /= 2;
2305 
2306 	if (s->size < wakeup_size) {
2307 		char buf[100];
2308 
2309 		unit_number__scnprintf(buf, sizeof(buf), wakeup_size);
2310 		pr_warning("WARNING: switch-output data size lower than "
2311 			   "wakeup kernel buffer size (%s) "
2312 			   "expect bigger perf.data sizes\n", buf);
2313 	}
2314 }
2315 
2316 static int switch_output_setup(struct record *rec)
2317 {
2318 	struct switch_output *s = &rec->switch_output;
2319 	static struct parse_tag tags_size[] = {
2320 		{ .tag  = 'B', .mult = 1       },
2321 		{ .tag  = 'K', .mult = 1 << 10 },
2322 		{ .tag  = 'M', .mult = 1 << 20 },
2323 		{ .tag  = 'G', .mult = 1 << 30 },
2324 		{ .tag  = 0 },
2325 	};
2326 	static struct parse_tag tags_time[] = {
2327 		{ .tag  = 's', .mult = 1        },
2328 		{ .tag  = 'm', .mult = 60       },
2329 		{ .tag  = 'h', .mult = 60*60    },
2330 		{ .tag  = 'd', .mult = 60*60*24 },
2331 		{ .tag  = 0 },
2332 	};
2333 	unsigned long val;
2334 
2335 	/*
2336 	 * If we're using --switch-output-events, then we imply
2337 	 * --switch-output=signal, as we'll send a SIGUSR2 from the sideband
2338 	 * thread to its parent.
2339 	 */
2340 	if (rec->switch_output_event_set)
2341 		goto do_signal;
2342 
2343 	if (!s->set)
2344 		return 0;
2345 
2346 	if (!strcmp(s->str, "signal")) {
2347 do_signal:
2348 		s->signal = true;
2349 		pr_debug("switch-output with SIGUSR2 signal\n");
2350 		goto enabled;
2351 	}
2352 
2353 	val = parse_tag_value(s->str, tags_size);
2354 	if (val != (unsigned long) -1) {
2355 		s->size = val;
2356 		pr_debug("switch-output with %s size threshold\n", s->str);
2357 		goto enabled;
2358 	}
2359 
2360 	val = parse_tag_value(s->str, tags_time);
2361 	if (val != (unsigned long) -1) {
2362 		s->time = val;
2363 		pr_debug("switch-output with %s time threshold (%lu seconds)\n",
2364 			 s->str, s->time);
2365 		goto enabled;
2366 	}
2367 
2368 	return -1;
2369 
2370 enabled:
2371 	rec->timestamp_filename = true;
2372 	s->enabled              = true;
2373 
2374 	if (s->size && !rec->opts.no_buffering)
2375 		switch_output_size_warn(rec);
2376 
2377 	return 0;
2378 }
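
/*
 * Illustrative usage, matching tags_size[]/tags_time[] above and the
 * --switch-output help text below:
 *
 *   perf record --switch-output=signal -a    # rotate output on SIGUSR2
 *   perf record --switch-output=100M   -a    # rotate after ~100MB of data
 *   perf record --switch-output=30s    -a    # rotate every 30 seconds
 *
 * Every rotation yields a timestamped output file, since switch_output_setup()
 * forces rec->timestamp_filename.
 */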
2379 
2380 static const char * const __record_usage[] = {
2381 	"perf record [<options>] [<command>]",
2382 	"perf record [<options>] -- <command> [<options>]",
2383 	NULL
2384 };
2385 const char * const *record_usage = __record_usage;
2386 
2387 static int build_id__process_mmap(struct perf_tool *tool, union perf_event *event,
2388 				  struct perf_sample *sample, struct machine *machine)
2389 {
2390 	/*
2391 	 * We already have the kernel maps, put in place via perf_session__create_kernel_maps(),
2392 	 * so there is no need to add them twice.
2393 	 */
2394 	if (!(event->header.misc & PERF_RECORD_MISC_USER))
2395 		return 0;
2396 	return perf_event__process_mmap(tool, event, sample, machine);
2397 }
2398 
2399 static int build_id__process_mmap2(struct perf_tool *tool, union perf_event *event,
2400 				   struct perf_sample *sample, struct machine *machine)
2401 {
2402 	/*
2403 	 * We already have the kernel maps, put in place via perf_session__create_kernel_maps(),
2404 	 * so there is no need to add them twice.
2405 	 */
2406 	if (!(event->header.misc & PERF_RECORD_MISC_USER))
2407 		return 0;
2408 
2409 	return perf_event__process_mmap2(tool, event, sample, machine);
2410 }
2411 
2412 static int process_timestamp_boundary(struct perf_tool *tool,
2413 				      union perf_event *event __maybe_unused,
2414 				      struct perf_sample *sample,
2415 				      struct machine *machine __maybe_unused)
2416 {
2417 	struct record *rec = container_of(tool, struct record, tool);
2418 
2419 	set_timestamp_boundary(rec, sample->time);
2420 	return 0;
2421 }
2422 
2423 /*
2424  * XXX Ideally this would be local to cmd_record() and passed to a record__new,
2425  * because we need to have access to it in record__exit, which is called
2426  * after cmd_record() exits, but since record_options needs to be accessible to
2427  * builtin-script, leave it here.
2428  *
2429  * At least we don't touch it directly in all the other functions here.
2430  *
2431  * Just say no to tons of global variables, sigh.
2432  */
2433 static struct record record = {
2434 	.opts = {
2435 		.sample_time	     = true,
2436 		.mmap_pages	     = UINT_MAX,
2437 		.user_freq	     = UINT_MAX,
2438 		.user_interval	     = ULLONG_MAX,
2439 		.freq		     = 4000,
2440 		.target		     = {
2441 			.uses_mmap   = true,
2442 			.default_per_cpu = true,
2443 		},
2444 		.mmap_flush          = MMAP_FLUSH_DEFAULT,
2445 		.nr_threads_synthesize = 1,
2446 		.ctl_fd              = -1,
2447 		.ctl_fd_ack          = -1,
2448 	},
2449 	.tool = {
2450 		.sample		= process_sample_event,
2451 		.fork		= perf_event__process_fork,
2452 		.exit		= perf_event__process_exit,
2453 		.comm		= perf_event__process_comm,
2454 		.namespaces	= perf_event__process_namespaces,
2455 		.mmap		= build_id__process_mmap,
2456 		.mmap2		= build_id__process_mmap2,
2457 		.itrace_start	= process_timestamp_boundary,
2458 		.aux		= process_timestamp_boundary,
2459 		.ordered_events	= true,
2460 	},
2461 };
2462 
2463 const char record_callchain_help[] = CALLCHAIN_RECORD_HELP
2464 	"\n\t\t\t\tDefault: fp";
2465 
2466 static bool dry_run;
2467 
2468 /*
2469  * XXX Will stay a global variable till we fix builtin-script.c to stop messing
2470  * with it and switch to using the library functions in perf_evlist that came
2471  * from builtin-record.c, i.e. use record_opts,
2472  * evlist__prepare_workload, etc. instead of fork+exec'ing 'perf record',
2473  * using pipes, etc.
2474  */
2475 static struct option __record_options[] = {
2476 	OPT_CALLBACK('e', "event", &record.evlist, "event",
2477 		     "event selector. use 'perf list' to list available events",
2478 		     parse_events_option),
2479 	OPT_CALLBACK(0, "filter", &record.evlist, "filter",
2480 		     "event filter", parse_filter),
2481 	OPT_CALLBACK_NOOPT(0, "exclude-perf", &record.evlist,
2482 			   NULL, "don't record events from perf itself",
2483 			   exclude_perf),
2484 	OPT_STRING('p', "pid", &record.opts.target.pid, "pid",
2485 		    "record events on existing process id"),
2486 	OPT_STRING('t', "tid", &record.opts.target.tid, "tid",
2487 		    "record events on existing thread id"),
2488 	OPT_INTEGER('r', "realtime", &record.realtime_prio,
2489 		    "collect data with this RT SCHED_FIFO priority"),
2490 	OPT_BOOLEAN(0, "no-buffering", &record.opts.no_buffering,
2491 		    "collect data without buffering"),
2492 	OPT_BOOLEAN('R', "raw-samples", &record.opts.raw_samples,
2493 		    "collect raw sample records from all opened counters"),
2494 	OPT_BOOLEAN('a', "all-cpus", &record.opts.target.system_wide,
2495 			    "system-wide collection from all CPUs"),
2496 	OPT_STRING('C', "cpu", &record.opts.target.cpu_list, "cpu",
2497 		    "list of cpus to monitor"),
2498 	OPT_U64('c', "count", &record.opts.user_interval, "event period to sample"),
2499 	OPT_STRING('o', "output", &record.data.path, "file",
2500 		    "output file name"),
2501 	OPT_BOOLEAN_SET('i', "no-inherit", &record.opts.no_inherit,
2502 			&record.opts.no_inherit_set,
2503 			"child tasks do not inherit counters"),
2504 	OPT_BOOLEAN(0, "tail-synthesize", &record.opts.tail_synthesize,
2505 		    "synthesize non-sample events at the end of output"),
2506 	OPT_BOOLEAN(0, "overwrite", &record.opts.overwrite, "use overwrite mode"),
2507 	OPT_BOOLEAN(0, "no-bpf-event", &record.opts.no_bpf_event, "do not record bpf events"),
2508 	OPT_BOOLEAN(0, "strict-freq", &record.opts.strict_freq,
2509 		    "Fail if the specified frequency can't be used"),
2510 	OPT_CALLBACK('F', "freq", &record.opts, "freq or 'max'",
2511 		     "profile at this frequency",
2512 		      record__parse_freq),
2513 	OPT_CALLBACK('m', "mmap-pages", &record.opts, "pages[,pages]",
2514 		     "number of mmap data pages and AUX area tracing mmap pages",
2515 		     record__parse_mmap_pages),
2516 	OPT_CALLBACK(0, "mmap-flush", &record.opts, "number",
2517 		     "Minimal number of bytes that is extracted from mmap data pages (default: 1)",
2518 		     record__mmap_flush_parse),
2519 	OPT_BOOLEAN(0, "group", &record.opts.group,
2520 		    "put the counters into a counter group"),
2521 	OPT_CALLBACK_NOOPT('g', NULL, &callchain_param,
2522 			   NULL, "enables call-graph recording" ,
2523 			   &record_callchain_opt),
2524 	OPT_CALLBACK(0, "call-graph", &record.opts,
2525 		     "record_mode[,record_size]", record_callchain_help,
2526 		     &record_parse_callchain_opt),
2527 	OPT_INCR('v', "verbose", &verbose,
2528 		    "be more verbose (show counter open errors, etc)"),
2529 	OPT_BOOLEAN('q', "quiet", &quiet, "don't print any message"),
2530 	OPT_BOOLEAN('s', "stat", &record.opts.inherit_stat,
2531 		    "per thread counts"),
2532 	OPT_BOOLEAN('d', "data", &record.opts.sample_address, "Record the sample addresses"),
2533 	OPT_BOOLEAN(0, "phys-data", &record.opts.sample_phys_addr,
2534 		    "Record the sample physical addresses"),
2535 	OPT_BOOLEAN(0, "data-page-size", &record.opts.sample_data_page_size,
2536 		    "Record the sampled data address data page size"),
2537 	OPT_BOOLEAN(0, "code-page-size", &record.opts.sample_code_page_size,
2538 		    "Record the sampled code address (ip) page size"),
2539 	OPT_BOOLEAN(0, "sample-cpu", &record.opts.sample_cpu, "Record the sample cpu"),
2540 	OPT_BOOLEAN_SET('T', "timestamp", &record.opts.sample_time,
2541 			&record.opts.sample_time_set,
2542 			"Record the sample timestamps"),
2543 	OPT_BOOLEAN_SET('P', "period", &record.opts.period, &record.opts.period_set,
2544 			"Record the sample period"),
2545 	OPT_BOOLEAN('n', "no-samples", &record.opts.no_samples,
2546 		    "don't sample"),
2547 	OPT_BOOLEAN_SET('N', "no-buildid-cache", &record.no_buildid_cache,
2548 			&record.no_buildid_cache_set,
2549 			"do not update the buildid cache"),
2550 	OPT_BOOLEAN_SET('B', "no-buildid", &record.no_buildid,
2551 			&record.no_buildid_set,
2552 			"do not collect buildids in perf.data"),
2553 	OPT_CALLBACK('G', "cgroup", &record.evlist, "name",
2554 		     "monitor event in cgroup name only",
2555 		     parse_cgroups),
2556 	OPT_INTEGER('D', "delay", &record.opts.initial_delay,
2557 		  "ms to wait before starting measurement after program start (-1: start with events disabled)"),
2558 	OPT_BOOLEAN(0, "kcore", &record.opts.kcore, "copy /proc/kcore"),
2559 	OPT_STRING('u', "uid", &record.opts.target.uid_str, "user",
2560 		   "user to profile"),
2561 
2562 	OPT_CALLBACK_NOOPT('b', "branch-any", &record.opts.branch_stack,
2563 		     "branch any", "sample any taken branches",
2564 		     parse_branch_stack),
2565 
2566 	OPT_CALLBACK('j', "branch-filter", &record.opts.branch_stack,
2567 		     "branch filter mask", "branch stack filter modes",
2568 		     parse_branch_stack),
2569 	OPT_BOOLEAN('W', "weight", &record.opts.sample_weight,
2570 		    "sample by weight (on special events only)"),
2571 	OPT_BOOLEAN(0, "transaction", &record.opts.sample_transaction,
2572 		    "sample transaction flags (special events only)"),
2573 	OPT_BOOLEAN(0, "per-thread", &record.opts.target.per_thread,
2574 		    "use per-thread mmaps"),
2575 	OPT_CALLBACK_OPTARG('I', "intr-regs", &record.opts.sample_intr_regs, NULL, "any register",
2576 		    "sample selected machine registers on interrupt,"
2577 		    " use '-I?' to list register names", parse_intr_regs),
2578 	OPT_CALLBACK_OPTARG(0, "user-regs", &record.opts.sample_user_regs, NULL, "any register",
2579 		    "sample selected machine registers on interrupt,"
2580 		    " use '--user-regs=?' to list register names", parse_user_regs),
2581 	OPT_BOOLEAN(0, "running-time", &record.opts.running_time,
2582 		    "Record running/enabled time of read (:S) events"),
2583 	OPT_CALLBACK('k', "clockid", &record.opts,
2584 	"clockid", "clockid to use for events, see clock_gettime()",
2585 	parse_clockid),
2586 	OPT_STRING_OPTARG('S', "snapshot", &record.opts.auxtrace_snapshot_opts,
2587 			  "opts", "AUX area tracing Snapshot Mode", ""),
2588 	OPT_STRING_OPTARG(0, "aux-sample", &record.opts.auxtrace_sample_opts,
2589 			  "opts", "sample AUX area", ""),
2590 	OPT_UINTEGER(0, "proc-map-timeout", &proc_map_timeout,
2591 			"per thread proc mmap processing timeout in ms"),
2592 	OPT_BOOLEAN(0, "namespaces", &record.opts.record_namespaces,
2593 		    "Record namespaces events"),
2594 	OPT_BOOLEAN(0, "all-cgroups", &record.opts.record_cgroup,
2595 		    "Record cgroup events"),
2596 	OPT_BOOLEAN_SET(0, "switch-events", &record.opts.record_switch_events,
2597 			&record.opts.record_switch_events_set,
2598 			"Record context switch events"),
2599 	OPT_BOOLEAN_FLAG(0, "all-kernel", &record.opts.all_kernel,
2600 			 "Configure all used events to run in kernel space.",
2601 			 PARSE_OPT_EXCLUSIVE),
2602 	OPT_BOOLEAN_FLAG(0, "all-user", &record.opts.all_user,
2603 			 "Configure all used events to run in user space.",
2604 			 PARSE_OPT_EXCLUSIVE),
2605 	OPT_BOOLEAN(0, "kernel-callchains", &record.opts.kernel_callchains,
2606 		    "collect kernel callchains"),
2607 	OPT_BOOLEAN(0, "user-callchains", &record.opts.user_callchains,
2608 		    "collect user callchains"),
2609 	OPT_STRING(0, "clang-path", &llvm_param.clang_path, "clang path",
2610 		   "clang binary to use for compiling BPF scriptlets"),
2611 	OPT_STRING(0, "clang-opt", &llvm_param.clang_opt, "clang options",
2612 		   "options passed to clang when compiling BPF scriptlets"),
2613 	OPT_STRING(0, "vmlinux", &symbol_conf.vmlinux_name,
2614 		   "file", "vmlinux pathname"),
2615 	OPT_BOOLEAN(0, "buildid-all", &record.buildid_all,
2616 		    "Record build-id of all DSOs regardless of hits"),
2617 	OPT_BOOLEAN(0, "buildid-mmap", &record.buildid_mmap,
2618 		    "Record build-id in map events"),
2619 	OPT_BOOLEAN(0, "timestamp-filename", &record.timestamp_filename,
2620 		    "append timestamp to output filename"),
2621 	OPT_BOOLEAN(0, "timestamp-boundary", &record.timestamp_boundary,
2622 		    "Record timestamp boundary (time of first/last samples)"),
2623 	OPT_STRING_OPTARG_SET(0, "switch-output", &record.switch_output.str,
2624 			  &record.switch_output.set, "signal or size[BKMG] or time[smhd]",
2625 			  "Switch output when receiving SIGUSR2 (signal) or cross a size or time threshold",
2626 			  "signal"),
2627 	OPT_CALLBACK_SET(0, "switch-output-event", &record.sb_evlist, &record.switch_output_event_set, "switch output event",
2628 			 "switch output event selector. use 'perf list' to list available events",
2629 			 parse_events_option_new_evlist),
2630 	OPT_INTEGER(0, "switch-max-files", &record.switch_output.num_files,
2631 		   "Limit number of switch output generated files"),
2632 	OPT_BOOLEAN(0, "dry-run", &dry_run,
2633 		    "Parse options then exit"),
2634 #ifdef HAVE_AIO_SUPPORT
2635 	OPT_CALLBACK_OPTARG(0, "aio", &record.opts,
2636 		     &nr_cblocks_default, "n", "Use <n> control blocks in asynchronous trace writing mode (default: 1, max: 4)",
2637 		     record__aio_parse),
2638 #endif
2639 	OPT_CALLBACK(0, "affinity", &record.opts, "node|cpu",
2640 		     "Set affinity mask of trace reading thread to NUMA node cpu mask or cpu of processed mmap buffer",
2641 		     record__parse_affinity),
2642 #ifdef HAVE_ZSTD_SUPPORT
2643 	OPT_CALLBACK_OPTARG('z', "compression-level", &record.opts, &comp_level_default,
2644 			    "n", "Compressed records using specified level (default: 1 - fastest compression, 22 - greatest compression)",
2645 			    record__parse_comp_level),
2646 #endif
2647 	OPT_CALLBACK(0, "max-size", &record.output_max_size,
2648 		     "size", "Limit the maximum size of the output file", parse_output_max_size),
2649 	OPT_UINTEGER(0, "num-thread-synthesize",
2650 		     &record.opts.nr_threads_synthesize,
2651 		     "number of threads to run for event synthesis"),
2652 #ifdef HAVE_LIBPFM
2653 	OPT_CALLBACK(0, "pfm-events", &record.evlist, "event",
2654 		"libpfm4 event selector. use 'perf list' to list available events",
2655 		parse_libpfm_events_option),
2656 #endif
2657 	OPT_CALLBACK(0, "control", &record.opts, "fd:ctl-fd[,ack-fd] or fifo:ctl-fifo[,ack-fifo]",
2658 		     "Listen on ctl-fd descriptor for command to control measurement ('enable': enable events, 'disable': disable events,\n"
2659 		     "\t\t\t  'snapshot': AUX area tracing snapshot).\n"
2660 		     "\t\t\t  Optionally send control command completion ('ack\\n') to ack-fd descriptor.\n"
2661 		     "\t\t\t  Alternatively, ctl-fifo / ack-fifo will be opened and used as ctl-fd / ack-fd.",
2662 		      parse_control_option),
2663 	OPT_END()
2664 };
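
/*
 * A few illustrative invocations combining the options above (a sketch, not a
 * complete reference; <pid> is a placeholder):
 *
 *   perf record -F 999 -a -g -- sleep 5
 *   perf record -e cycles -p <pid> --switch-output=1G
 *   perf record --delay=-1 --control=fifo:ctl.fifo,ack.fifo -a
 */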
2665 
2666 struct option *record_options = __record_options;
2667 
2668 int cmd_record(int argc, const char **argv)
2669 {
2670 	int err;
2671 	struct record *rec = &record;
2672 	char errbuf[BUFSIZ];
2673 
2674 	setlocale(LC_ALL, "");
2675 
2676 #ifndef HAVE_LIBBPF_SUPPORT
2677 # define set_nobuild(s, l, c) set_option_nobuild(record_options, s, l, "NO_LIBBPF=1", c)
2678 	set_nobuild('\0', "clang-path", true);
2679 	set_nobuild('\0', "clang-opt", true);
2680 # undef set_nobuild
2681 #endif
2682 
2683 #ifndef HAVE_BPF_PROLOGUE
2684 # if !defined (HAVE_DWARF_SUPPORT)
2685 #  define REASON  "NO_DWARF=1"
2686 # elif !defined (HAVE_LIBBPF_SUPPORT)
2687 #  define REASON  "NO_LIBBPF=1"
2688 # else
2689 #  define REASON  "this architecture doesn't support BPF prologue"
2690 # endif
2691 # define set_nobuild(s, l, c) set_option_nobuild(record_options, s, l, REASON, c)
2692 	set_nobuild('\0', "vmlinux", true);
2693 # undef set_nobuild
2694 # undef REASON
2695 #endif
2696 
2697 	rec->opts.affinity = PERF_AFFINITY_SYS;
2698 
2699 	rec->evlist = evlist__new();
2700 	if (rec->evlist == NULL)
2701 		return -ENOMEM;
2702 
2703 	err = perf_config(perf_record_config, rec);
2704 	if (err)
2705 		return err;
2706 
2707 	argc = parse_options(argc, argv, record_options, record_usage,
2708 			    PARSE_OPT_STOP_AT_NON_OPTION);
2709 	if (quiet)
2710 		perf_quiet_option();
2711 
2712 	/* Make system wide (-a) the default target. */
2713 	if (!argc && target__none(&rec->opts.target))
2714 		rec->opts.target.system_wide = true;
2715 
2716 	if (nr_cgroups && !rec->opts.target.system_wide) {
2717 		usage_with_options_msg(record_usage, record_options,
2718 			"cgroup monitoring only available in system-wide mode");
2719 
2720 	}
2721 
2722 	if (rec->buildid_mmap) {
2723 		if (!perf_can_record_build_id()) {
2724 			pr_err("Failed: no support to record build id in mmap events, update your kernel.\n");
2725 			err = -EINVAL;
2726 			goto out_opts;
2727 		}
2728 		pr_debug("Enabling build id in mmap2 events.\n");
2729 		/* Enable mmap build id synthesizing. */
2730 		symbol_conf.buildid_mmap2 = true;
2731 		/* Enable perf_event_attr::build_id bit. */
2732 		rec->opts.build_id = true;
2733 		/* Disable build id cache. */
2734 		rec->no_buildid = true;
2735 	}
2736 
2737 	if (rec->opts.record_cgroup && !perf_can_record_cgroup()) {
2738 		pr_err("Kernel has no cgroup sampling support.\n");
2739 		err = -EINVAL;
2740 		goto out_opts;
2741 	}
2742 
2743 	if (rec->opts.kcore)
2744 		rec->data.is_dir = true;
2745 
2746 	if (rec->opts.comp_level != 0) {
2747 		pr_debug("Compression enabled, disabling build id collection at the end of the session.\n");
2748 		rec->no_buildid = true;
2749 	}
2750 
2751 	if (rec->opts.record_switch_events &&
2752 	    !perf_can_record_switch_events()) {
2753 		ui__error("kernel does not support recording context switch events\n");
2754 		parse_options_usage(record_usage, record_options, "switch-events", 0);
2755 		err = -EINVAL;
2756 		goto out_opts;
2757 	}
2758 
2759 	if (switch_output_setup(rec)) {
2760 		parse_options_usage(record_usage, record_options, "switch-output", 0);
2761 		err = -EINVAL;
2762 		goto out_opts;
2763 	}
2764 
2765 	if (rec->switch_output.time) {
2766 		signal(SIGALRM, alarm_sig_handler);
2767 		alarm(rec->switch_output.time);
2768 	}
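
	/*
	 * Note: SIGALRM is re-armed after every output switch in
	 * __cmd_record() ("re-arm the alarm" above), so a time-based
	 * --switch-output keeps rotating for the whole session.
	 */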
2769 
2770 	if (rec->switch_output.num_files) {
2771 		rec->switch_output.filenames = calloc(rec->switch_output.num_files,
2772 						      sizeof(char *));
2773 		if (!rec->switch_output.filenames) {
2774 			err = -EINVAL;
2775 			goto out_opts;
2776 		}
2777 	}
2778 
2779 	/*
2780 	 * Allow aliases to facilitate the lookup of symbols for address
2781 	 * filters. Refer to auxtrace_parse_filters().
2782 	 */
2783 	symbol_conf.allow_aliases = true;
2784 
2785 	symbol__init(NULL);
2786 
2787 	if (rec->opts.affinity != PERF_AFFINITY_SYS) {
2788 		rec->affinity_mask.nbits = cpu__max_cpu();
2789 		rec->affinity_mask.bits = bitmap_alloc(rec->affinity_mask.nbits);
2790 		if (!rec->affinity_mask.bits) {
2791 			pr_err("Failed to allocate thread mask for %zd cpus\n", rec->affinity_mask.nbits);
2792 			err = -ENOMEM;
2793 			goto out_opts;
2794 		}
2795 		pr_debug2("thread mask[%zd]: empty\n", rec->affinity_mask.nbits);
2796 	}
2797 
2798 	err = record__auxtrace_init(rec);
2799 	if (err)
2800 		goto out;
2801 
2802 	if (dry_run)
2803 		goto out;
2804 
2805 	err = bpf__setup_stdout(rec->evlist);
2806 	if (err) {
2807 		bpf__strerror_setup_stdout(rec->evlist, err, errbuf, sizeof(errbuf));
2808 		pr_err("ERROR: Setup BPF stdout failed: %s\n",
2809 			 errbuf);
2810 		goto out;
2811 	}
2812 
2813 	err = -ENOMEM;
2814 
2815 	if (rec->no_buildid_cache || rec->no_buildid) {
2816 		disable_buildid_cache();
2817 	} else if (rec->switch_output.enabled) {
2818 		/*
2819 		 * In 'perf record --switch-output', disable buildid
2820 		 * generation by default to reduce data file switching
2821 		 * overhead. Still generate buildid if they are required
2822 		 * explicitly using
2823 		 *
2824 		 *  perf record --switch-output --no-no-buildid \
2825 		 *              --no-no-buildid-cache
2826 		 *
2827 		 * Following code equals to:
2828 		 *
2829 		 * if ((rec->no_buildid || !rec->no_buildid_set) &&
2830 		 *     (rec->no_buildid_cache || !rec->no_buildid_cache_set))
2831 		 *         disable_buildid_cache();
2832 		 */
2833 		bool disable = true;
2834 
2835 		if (rec->no_buildid_set && !rec->no_buildid)
2836 			disable = false;
2837 		if (rec->no_buildid_cache_set && !rec->no_buildid_cache)
2838 			disable = false;
2839 		if (disable) {
2840 			rec->no_buildid = true;
2841 			rec->no_buildid_cache = true;
2842 			disable_buildid_cache();
2843 		}
2844 	}
2845 
2846 	if (record.opts.overwrite)
2847 		record.opts.tail_synthesize = true;
2848 
2849 	if (rec->evlist->core.nr_entries == 0) {
2850 		if (perf_pmu__has_hybrid()) {
2851 			err = evlist__add_default_hybrid(rec->evlist,
2852 							 !record.opts.no_samples);
2853 		} else {
2854 			err = __evlist__add_default(rec->evlist,
2855 						    !record.opts.no_samples);
2856 		}
2857 
2858 		if (err < 0) {
2859 			pr_err("Not enough memory for event selector list\n");
2860 			goto out;
2861 		}
2862 	}
2863 
2864 	if (rec->opts.target.tid && !rec->opts.no_inherit_set)
2865 		rec->opts.no_inherit = true;
2866 
2867 	err = target__validate(&rec->opts.target);
2868 	if (err) {
2869 		target__strerror(&rec->opts.target, err, errbuf, BUFSIZ);
2870 		ui__warning("%s\n", errbuf);
2871 	}
2872 
2873 	err = target__parse_uid(&rec->opts.target);
2874 	if (err) {
2875 		int saved_errno = errno;
2876 
2877 		target__strerror(&rec->opts.target, err, errbuf, BUFSIZ);
2878 		ui__error("%s", errbuf);
2879 
2880 		err = -saved_errno;
2881 		goto out;
2882 	}
2883 
2884 	/* Enable ignoring missing threads when -u/-p option is defined. */
2885 	rec->opts.ignore_missing_thread = rec->opts.target.uid != UINT_MAX || rec->opts.target.pid;
2886 
2887 	err = -ENOMEM;
2888 	if (evlist__create_maps(rec->evlist, &rec->opts.target) < 0)
2889 		usage_with_options(record_usage, record_options);
2890 
2891 	err = auxtrace_record__options(rec->itr, rec->evlist, &rec->opts);
2892 	if (err)
2893 		goto out;
2894 
2895 	/*
2896 	 * We take all buildids when the file contains
2897 	 * AUX area tracing data because we do not decode the
2898 	 * trace, as doing so would take too long.
2899 	 */
2900 	if (rec->opts.full_auxtrace)
2901 		rec->buildid_all = true;
2902 
2903 	if (rec->opts.text_poke) {
2904 		err = record__config_text_poke(rec->evlist);
2905 		if (err) {
2906 			pr_err("record__config_text_poke failed, error %d\n", err);
2907 			goto out;
2908 		}
2909 	}
2910 
2911 	if (record_opts__config(&rec->opts)) {
2912 		err = -EINVAL;
2913 		goto out;
2914 	}
2915 
2916 	if (rec->opts.nr_cblocks > nr_cblocks_max)
2917 		rec->opts.nr_cblocks = nr_cblocks_max;
2918 	pr_debug("nr_cblocks: %d\n", rec->opts.nr_cblocks);
2919 
2920 	pr_debug("affinity: %s\n", affinity_tags[rec->opts.affinity]);
2921 	pr_debug("mmap flush: %d\n", rec->opts.mmap_flush);
2922 
2923 	if (rec->opts.comp_level > comp_level_max)
2924 		rec->opts.comp_level = comp_level_max;
2925 	pr_debug("comp level: %d\n", rec->opts.comp_level);
2926 
2927 	err = __cmd_record(&record, argc, argv);
2928 out:
2929 	bitmap_free(rec->affinity_mask.bits);
2930 	evlist__delete(rec->evlist);
2931 	symbol__exit();
2932 	auxtrace_record__free(rec->itr);
2933 out_opts:
2934 	evlist__close_control(rec->opts.ctl_fd, rec->opts.ctl_fd_ack, &rec->opts.ctl_fd_close);
2935 	return err;
2936 }
2937 
2938 static void snapshot_sig_handler(int sig __maybe_unused)
2939 {
2940 	struct record *rec = &record;
2941 
2942 	hit_auxtrace_snapshot_trigger(rec);
2943 
2944 	if (switch_output_signal(rec))
2945 		trigger_hit(&switch_output_trigger);
2946 }
2947 
2948 static void alarm_sig_handler(int sig __maybe_unused)
2949 {
2950 	struct record *rec = &record;
2951 
2952 	if (switch_output_time(rec))
2953 		trigger_hit(&switch_output_trigger);
2954 }
2955