xref: /openbmc/linux/tools/perf/builtin-record.c (revision 9257bd80)
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * builtin-record.c
4  *
5  * Builtin record command: Record the profile of a workload
6  * (or a CPU, or a PID) into the perf.data output file - for
7  * later analysis via perf report.
8  */
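/*
 * Illustrative usage (not an exhaustive list of options):
 *
 *   perf record -F 99 -g -- ./my_workload   # sample call graphs at 99 Hz
 *   perf record -e cycles -p 1234           # profile an existing PID
 *   perf report                             # browse the resulting perf.data
 */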
9 #include "builtin.h"
10 
11 #include "util/build-id.h"
12 #include <subcmd/parse-options.h>
13 #include "util/parse-events.h"
14 #include "util/config.h"
15 
16 #include "util/callchain.h"
17 #include "util/cgroup.h"
18 #include "util/header.h"
19 #include "util/event.h"
20 #include "util/evlist.h"
21 #include "util/evsel.h"
22 #include "util/debug.h"
23 #include "util/mmap.h"
24 #include "util/target.h"
25 #include "util/session.h"
26 #include "util/tool.h"
27 #include "util/symbol.h"
28 #include "util/record.h"
29 #include "util/cpumap.h"
30 #include "util/thread_map.h"
31 #include "util/data.h"
32 #include "util/perf_regs.h"
33 #include "util/auxtrace.h"
34 #include "util/tsc.h"
35 #include "util/parse-branch-options.h"
36 #include "util/parse-regs-options.h"
37 #include "util/perf_api_probe.h"
38 #include "util/llvm-utils.h"
39 #include "util/bpf-loader.h"
40 #include "util/trigger.h"
41 #include "util/perf-hooks.h"
42 #include "util/cpu-set-sched.h"
43 #include "util/synthetic-events.h"
44 #include "util/time-utils.h"
45 #include "util/units.h"
46 #include "util/bpf-event.h"
47 #include "util/util.h"
48 #include "util/pfm.h"
49 #include "util/clockid.h"
50 #include "util/pmu-hybrid.h"
51 #include "util/evlist-hybrid.h"
52 #include "asm/bug.h"
53 #include "perf.h"
54 
55 #include <errno.h>
56 #include <inttypes.h>
57 #include <locale.h>
58 #include <poll.h>
59 #include <pthread.h>
60 #include <unistd.h>
61 #include <sched.h>
62 #include <signal.h>
63 #ifdef HAVE_EVENTFD_SUPPORT
64 #include <sys/eventfd.h>
65 #endif
66 #include <sys/mman.h>
67 #include <sys/wait.h>
68 #include <sys/types.h>
69 #include <sys/stat.h>
70 #include <fcntl.h>
71 #include <linux/err.h>
72 #include <linux/string.h>
73 #include <linux/time64.h>
74 #include <linux/zalloc.h>
75 #include <linux/bitmap.h>
76 #include <sys/time.h>
77 
78 struct switch_output {
79 	bool		 enabled;
80 	bool		 signal;
81 	unsigned long	 size;
82 	unsigned long	 time;
83 	const char	*str;
84 	bool		 set;
85 	char		 **filenames;
86 	int		 num_files;
87 	int		 cur_file;
88 };
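/*
 * An illustrative mapping of these fields to command-line options: with
 * "--switch-output=signal" rotation is driven by SIGUSR2 (.signal), with
 * "--switch-output=1G" by output size (.size), and with "--switch-output=30s"
 * by an alarm (.time); "--switch-max-files=N" bounds how many rotated files
 * are kept via .num_files/.cur_file/.filenames.
 */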
89 
90 struct record {
91 	struct perf_tool	tool;
92 	struct record_opts	opts;
93 	u64			bytes_written;
94 	struct perf_data	data;
95 	struct auxtrace_record	*itr;
96 	struct evlist	*evlist;
97 	struct perf_session	*session;
98 	struct evlist		*sb_evlist;
99 	pthread_t		thread_id;
100 	int			realtime_prio;
101 	bool			switch_output_event_set;
102 	bool			no_buildid;
103 	bool			no_buildid_set;
104 	bool			no_buildid_cache;
105 	bool			no_buildid_cache_set;
106 	bool			buildid_all;
107 	bool			buildid_mmap;
108 	bool			timestamp_filename;
109 	bool			timestamp_boundary;
110 	struct switch_output	switch_output;
111 	unsigned long long	samples;
112 	struct mmap_cpu_mask	affinity_mask;
113 	unsigned long		output_max_size;	/* = 0: unlimited */
114 };
115 
116 static volatile int done;
117 
118 static volatile int auxtrace_record__snapshot_started;
119 static DEFINE_TRIGGER(auxtrace_snapshot_trigger);
120 static DEFINE_TRIGGER(switch_output_trigger);
121 
122 static const char *affinity_tags[PERF_AFFINITY_MAX] = {
123 	"SYS", "NODE", "CPU"
124 };
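/*
 * These tags label the --affinity modes: "node" and "cpu" migrate the trace
 * reading thread close to the ring buffer currently being drained (see
 * record__adjust_affinity() below), while the default is the system-wide
 * affinity mask.
 */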
125 
126 static bool switch_output_signal(struct record *rec)
127 {
128 	return rec->switch_output.signal &&
129 	       trigger_is_ready(&switch_output_trigger);
130 }
131 
132 static bool switch_output_size(struct record *rec)
133 {
134 	return rec->switch_output.size &&
135 	       trigger_is_ready(&switch_output_trigger) &&
136 	       (rec->bytes_written >= rec->switch_output.size);
137 }
138 
139 static bool switch_output_time(struct record *rec)
140 {
141 	return rec->switch_output.time &&
142 	       trigger_is_ready(&switch_output_trigger);
143 }
144 
145 static bool record__output_max_size_exceeded(struct record *rec)
146 {
147 	return rec->output_max_size &&
148 	       (rec->bytes_written >= rec->output_max_size);
149 }
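/*
 * For example (illustrative): "perf record --max-size=100M ..." sets
 * output_max_size so this check fires once roughly 100MB have been written,
 * and record__write() then stops the session.
 */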
150 
151 static int record__write(struct record *rec, struct mmap *map __maybe_unused,
152 			 void *bf, size_t size)
153 {
154 	struct perf_data_file *file = &rec->session->data->file;
155 
156 	if (perf_data_file__write(file, bf, size) < 0) {
157 		pr_err("failed to write perf data, error: %m\n");
158 		return -1;
159 	}
160 
161 	rec->bytes_written += size;
162 
163 	if (record__output_max_size_exceeded(rec) && !done) {
164 		fprintf(stderr, "[ perf record: perf size limit reached (%" PRIu64 " KB),"
165 				" stopping session ]\n",
166 				rec->bytes_written >> 10);
167 		done = 1;
168 	}
169 
170 	if (switch_output_size(rec))
171 		trigger_hit(&switch_output_trigger);
172 
173 	return 0;
174 }
175 
176 static int record__aio_enabled(struct record *rec);
177 static int record__comp_enabled(struct record *rec);
178 static size_t zstd_compress(struct perf_session *session, void *dst, size_t dst_size,
179 			    void *src, size_t src_size);
180 
181 #ifdef HAVE_AIO_SUPPORT
182 static int record__aio_write(struct aiocb *cblock, int trace_fd,
183 		void *buf, size_t size, off_t off)
184 {
185 	int rc;
186 
187 	cblock->aio_fildes = trace_fd;
188 	cblock->aio_buf    = buf;
189 	cblock->aio_nbytes = size;
190 	cblock->aio_offset = off;
191 	cblock->aio_sigevent.sigev_notify = SIGEV_NONE;
192 
193 	do {
194 		rc = aio_write(cblock);
195 		if (rc == 0) {
196 			break;
197 		} else if (errno != EAGAIN) {
198 			cblock->aio_fildes = -1;
199 			pr_err("failed to queue perf data, error: %m\n");
200 			break;
201 		}
202 	} while (1);
203 
204 	return rc;
205 }
206 
207 static int record__aio_complete(struct mmap *md, struct aiocb *cblock)
208 {
209 	void *rem_buf;
210 	off_t rem_off;
211 	size_t rem_size;
212 	int rc, aio_errno;
213 	ssize_t aio_ret, written;
214 
215 	aio_errno = aio_error(cblock);
216 	if (aio_errno == EINPROGRESS)
217 		return 0;
218 
219 	written = aio_ret = aio_return(cblock);
220 	if (aio_ret < 0) {
221 		if (aio_errno != EINTR)
222 			pr_err("failed to write perf data, error: %m\n");
223 		written = 0;
224 	}
225 
226 	rem_size = cblock->aio_nbytes - written;
227 
228 	if (rem_size == 0) {
229 		cblock->aio_fildes = -1;
230 		/*
231 		 * md->refcount is incremented in record__aio_pushfn() for
232 		 * every aio write request started in record__aio_push() so
233 		 * decrement it because the request is now complete.
234 		 */
235 		perf_mmap__put(&md->core);
236 		rc = 1;
237 	} else {
238 		/*
239 		 * The aio write request may need to be restarted with the
240 		 * remainder if the kernel didn't write the whole
241 		 * chunk at once.
242 		 */
243 		rem_off = cblock->aio_offset + written;
244 		rem_buf = (void *)(cblock->aio_buf + written);
245 		record__aio_write(cblock, cblock->aio_fildes,
246 				rem_buf, rem_size, rem_off);
247 		rc = 0;
248 	}
249 
250 	return rc;
251 }
252 
253 static int record__aio_sync(struct mmap *md, bool sync_all)
254 {
255 	struct aiocb **aiocb = md->aio.aiocb;
256 	struct aiocb *cblocks = md->aio.cblocks;
257 	struct timespec timeout = { 0, 1000 * 1000  * 1 }; /* 1ms */
258 	int i, do_suspend;
259 
260 	do {
261 		do_suspend = 0;
262 		for (i = 0; i < md->aio.nr_cblocks; ++i) {
263 			if (cblocks[i].aio_fildes == -1 || record__aio_complete(md, &cblocks[i])) {
264 				if (sync_all)
265 					aiocb[i] = NULL;
266 				else
267 					return i;
268 			} else {
269 				/*
270 				 * The started aio write is not complete yet,
271 				 * so it has to be waited on before the
272 				 * next allocation.
273 				 */
274 				aiocb[i] = &cblocks[i];
275 				do_suspend = 1;
276 			}
277 		}
278 		if (!do_suspend)
279 			return -1;
280 
281 		while (aio_suspend((const struct aiocb **)aiocb, md->aio.nr_cblocks, &timeout)) {
282 			if (!(errno == EAGAIN || errno == EINTR))
283 				pr_err("failed to sync perf data, error: %m\n");
284 		}
285 	} while (1);
286 }
287 
288 struct record_aio {
289 	struct record	*rec;
290 	void		*data;
291 	size_t		size;
292 };
293 
294 static int record__aio_pushfn(struct mmap *map, void *to, void *buf, size_t size)
295 {
296 	struct record_aio *aio = to;
297 
298 	/*
299 	 * The map->core.base data pointed to by buf is copied into a free map->aio.data[]
300 	 * buffer to release space in the kernel buffer as fast as possible, by calling
301 	 * perf_mmap__consume() from the perf_mmap__push() function.
302 	 *
303 	 * That lets the kernel proceed with storing more profiling data into
304 	 * the kernel buffer earlier than the other per-cpu kernel buffers are handled.
305 	 *
306 	 * Copying can be done in two steps in case the chunk of profiling data
307 	 * crosses the upper bound of the kernel buffer. In this case we first move
308 	 * the part of the data from map->start up to the upper bound, and then the
309 	 * remainder from the beginning of the kernel buffer to the end of the data chunk.
310 	 */
311 
312 	if (record__comp_enabled(aio->rec)) {
313 		size = zstd_compress(aio->rec->session, aio->data + aio->size,
314 				     mmap__mmap_len(map) - aio->size,
315 				     buf, size);
316 	} else {
317 		memcpy(aio->data + aio->size, buf, size);
318 	}
319 
320 	if (!aio->size) {
321 		/*
322 		 * Increment map->refcount to guard the map->aio.data[] buffer
323 		 * from premature deallocation, because the map object can be
324 		 * released before the aio write request started on the
325 		 * map->aio.data[] buffer completes.
326 		 *
327 		 * perf_mmap__put() is done in record__aio_complete()
328 		 * after the started aio request completes, or in record__aio_push()
329 		 * if the request failed to start.
330 		 */
331 		perf_mmap__get(&map->core);
332 	}
333 
334 	aio->size += size;
335 
336 	return size;
337 }
338 
339 static int record__aio_push(struct record *rec, struct mmap *map, off_t *off)
340 {
341 	int ret, idx;
342 	int trace_fd = rec->session->data->file.fd;
343 	struct record_aio aio = { .rec = rec, .size = 0 };
344 
345 	/*
346 	 * Call record__aio_sync() to wait until a map->aio.data[] buffer
347 	 * becomes available after the previous aio write operation.
348 	 */
349 
350 	idx = record__aio_sync(map, false);
351 	aio.data = map->aio.data[idx];
352 	ret = perf_mmap__push(map, &aio, record__aio_pushfn);
353 	if (ret != 0) /* ret > 0 - no data, ret < 0 - error */
354 		return ret;
355 
356 	rec->samples++;
357 	ret = record__aio_write(&(map->aio.cblocks[idx]), trace_fd, aio.data, aio.size, *off);
358 	if (!ret) {
359 		*off += aio.size;
360 		rec->bytes_written += aio.size;
361 		if (switch_output_size(rec))
362 			trigger_hit(&switch_output_trigger);
363 	} else {
364 		/*
365 		 * Decrement the map->refcount that was incremented in record__aio_pushfn()
366 		 * if the record__aio_write() operation failed to start; otherwise
367 		 * map->refcount is decremented in record__aio_complete() after the
368 		 * aio write operation finishes successfully.
369 		 */
370 		perf_mmap__put(&map->core);
371 	}
372 
373 	return ret;
374 }
375 
376 static off_t record__aio_get_pos(int trace_fd)
377 {
378 	return lseek(trace_fd, 0, SEEK_CUR);
379 }
380 
381 static void record__aio_set_pos(int trace_fd, off_t pos)
382 {
383 	lseek(trace_fd, pos, SEEK_SET);
384 }
385 
386 static void record__aio_mmap_read_sync(struct record *rec)
387 {
388 	int i;
389 	struct evlist *evlist = rec->evlist;
390 	struct mmap *maps = evlist->mmap;
391 
392 	if (!record__aio_enabled(rec))
393 		return;
394 
395 	for (i = 0; i < evlist->core.nr_mmaps; i++) {
396 		struct mmap *map = &maps[i];
397 
398 		if (map->core.base)
399 			record__aio_sync(map, true);
400 	}
401 }
402 
403 static int nr_cblocks_default = 1;
404 static int nr_cblocks_max = 4;
405 
406 static int record__aio_parse(const struct option *opt,
407 			     const char *str,
408 			     int unset)
409 {
410 	struct record_opts *opts = (struct record_opts *)opt->value;
411 
412 	if (unset) {
413 		opts->nr_cblocks = 0;
414 	} else {
415 		if (str)
416 			opts->nr_cblocks = strtol(str, NULL, 0);
417 		if (!opts->nr_cblocks)
418 			opts->nr_cblocks = nr_cblocks_default;
419 	}
420 
421 	return 0;
422 }
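/*
 * For example (illustrative): "perf record --aio ..." enables asynchronous
 * (POSIX AIO) trace writing with nr_cblocks_default control blocks per mmap,
 * and "--aio=4" requests the nr_cblocks_max in-flight writes per mmap.
 */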
423 #else /* HAVE_AIO_SUPPORT */
424 static int nr_cblocks_max = 0;
425 
426 static int record__aio_push(struct record *rec __maybe_unused, struct mmap *map __maybe_unused,
427 			    off_t *off __maybe_unused)
428 {
429 	return -1;
430 }
431 
432 static off_t record__aio_get_pos(int trace_fd __maybe_unused)
433 {
434 	return -1;
435 }
436 
437 static void record__aio_set_pos(int trace_fd __maybe_unused, off_t pos __maybe_unused)
438 {
439 }
440 
441 static void record__aio_mmap_read_sync(struct record *rec __maybe_unused)
442 {
443 }
444 #endif
445 
446 static int record__aio_enabled(struct record *rec)
447 {
448 	return rec->opts.nr_cblocks > 0;
449 }
450 
451 #define MMAP_FLUSH_DEFAULT 1
452 static int record__mmap_flush_parse(const struct option *opt,
453 				    const char *str,
454 				    int unset)
455 {
456 	int flush_max;
457 	struct record_opts *opts = (struct record_opts *)opt->value;
458 	static struct parse_tag tags[] = {
459 			{ .tag  = 'B', .mult = 1       },
460 			{ .tag  = 'K', .mult = 1 << 10 },
461 			{ .tag  = 'M', .mult = 1 << 20 },
462 			{ .tag  = 'G', .mult = 1 << 30 },
463 			{ .tag  = 0 },
464 	};
465 
466 	if (unset)
467 		return 0;
468 
469 	if (str) {
470 		opts->mmap_flush = parse_tag_value(str, tags);
471 		if (opts->mmap_flush == (int)-1)
472 			opts->mmap_flush = strtol(str, NULL, 0);
473 	}
474 
475 	if (!opts->mmap_flush)
476 		opts->mmap_flush = MMAP_FLUSH_DEFAULT;
477 
478 	flush_max = evlist__mmap_size(opts->mmap_pages);
479 	flush_max /= 4;
480 	if (opts->mmap_flush > flush_max)
481 		opts->mmap_flush = flush_max;
482 
483 	return 0;
484 }
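/*
 * For example (illustrative): "perf record --mmap-flush=48K ..." makes the
 * reading thread drain a ring buffer only once at least 48KB are pending;
 * the value is capped above at a quarter of the mmap buffer size.
 */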
485 
486 #ifdef HAVE_ZSTD_SUPPORT
487 static unsigned int comp_level_default = 1;
488 
489 static int record__parse_comp_level(const struct option *opt, const char *str, int unset)
490 {
491 	struct record_opts *opts = opt->value;
492 
493 	if (unset) {
494 		opts->comp_level = 0;
495 	} else {
496 		if (str)
497 			opts->comp_level = strtol(str, NULL, 0);
498 		if (!opts->comp_level)
499 			opts->comp_level = comp_level_default;
500 	}
501 
502 	return 0;
503 }
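/*
 * For example (illustrative): "perf record -z ..." compresses the trace with
 * the default Zstd level (comp_level_default), while
 * "--compression-level=10" trades more CPU time for a smaller perf.data.
 */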
504 #endif
505 static unsigned int comp_level_max = 22;
506 
507 static int record__comp_enabled(struct record *rec)
508 {
509 	return rec->opts.comp_level > 0;
510 }
511 
512 static int process_synthesized_event(struct perf_tool *tool,
513 				     union perf_event *event,
514 				     struct perf_sample *sample __maybe_unused,
515 				     struct machine *machine __maybe_unused)
516 {
517 	struct record *rec = container_of(tool, struct record, tool);
518 	return record__write(rec, NULL, event, event->header.size);
519 }
520 
521 static int process_locked_synthesized_event(struct perf_tool *tool,
522 				     union perf_event *event,
523 				     struct perf_sample *sample __maybe_unused,
524 				     struct machine *machine __maybe_unused)
525 {
526 	static pthread_mutex_t synth_lock = PTHREAD_MUTEX_INITIALIZER;
527 	int ret;
528 
529 	pthread_mutex_lock(&synth_lock);
530 	ret = process_synthesized_event(tool, event, sample, machine);
531 	pthread_mutex_unlock(&synth_lock);
532 	return ret;
533 }
534 
535 static int record__pushfn(struct mmap *map, void *to, void *bf, size_t size)
536 {
537 	struct record *rec = to;
538 
539 	if (record__comp_enabled(rec)) {
540 		size = zstd_compress(rec->session, map->data, mmap__mmap_len(map), bf, size);
541 		bf   = map->data;
542 	}
543 
544 	rec->samples++;
545 	return record__write(rec, map, bf, size);
546 }
547 
548 static volatile int signr = -1;
549 static volatile int child_finished;
550 #ifdef HAVE_EVENTFD_SUPPORT
551 static int done_fd = -1;
552 #endif
553 
554 static void sig_handler(int sig)
555 {
556 	if (sig == SIGCHLD)
557 		child_finished = 1;
558 	else
559 		signr = sig;
560 
561 	done = 1;
562 #ifdef HAVE_EVENTFD_SUPPORT
563 {
564 	u64 tmp = 1;
565 	/*
566 	 * It is possible for this signal handler to run after done is checked
567 	 * in the main loop, but before the perf counter fds are polled. If this
568 	 * happens, the poll() will continue to wait even though done is set,
569 	 * and will only break out if either another signal is received, or the
570 	 * counters are ready for read. To ensure the poll() doesn't sleep when
571 	 * done is set, use an eventfd (done_fd) to wake up the poll().
572 	 */
573 	if (write(done_fd, &tmp, sizeof(tmp)) < 0)
574 		pr_err("failed to signal wakeup fd, error: %m\n");
575 }
576 #endif // HAVE_EVENTFD_SUPPORT
577 }
578 
579 static void sigsegv_handler(int sig)
580 {
581 	perf_hooks__recover();
582 	sighandler_dump_stack(sig);
583 }
584 
585 static void record__sig_exit(void)
586 {
587 	if (signr == -1)
588 		return;
589 
590 	signal(signr, SIG_DFL);
591 	raise(signr);
592 }
593 
594 #ifdef HAVE_AUXTRACE_SUPPORT
595 
596 static int record__process_auxtrace(struct perf_tool *tool,
597 				    struct mmap *map,
598 				    union perf_event *event, void *data1,
599 				    size_t len1, void *data2, size_t len2)
600 {
601 	struct record *rec = container_of(tool, struct record, tool);
602 	struct perf_data *data = &rec->data;
603 	size_t padding;
604 	u8 pad[8] = {0};
605 
606 	if (!perf_data__is_pipe(data) && perf_data__is_single_file(data)) {
607 		off_t file_offset;
608 		int fd = perf_data__fd(data);
609 		int err;
610 
611 		file_offset = lseek(fd, 0, SEEK_CUR);
612 		if (file_offset == -1)
613 			return -1;
614 		err = auxtrace_index__auxtrace_event(&rec->session->auxtrace_index,
615 						     event, file_offset);
616 		if (err)
617 			return err;
618 	}
619 
620 	/* event.auxtrace.size includes padding, see __auxtrace_mmap__read() */
621 	padding = (len1 + len2) & 7;
622 	if (padding)
623 		padding = 8 - padding;
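	/*
	 * e.g. (illustrative) len1 + len2 == 13 gives padding == 3, so three
	 * zero bytes from pad[] are appended below to keep the record 8-byte
	 * aligned.
	 */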
624 
625 	record__write(rec, map, event, event->header.size);
626 	record__write(rec, map, data1, len1);
627 	if (len2)
628 		record__write(rec, map, data2, len2);
629 	record__write(rec, map, &pad, padding);
630 
631 	return 0;
632 }
633 
634 static int record__auxtrace_mmap_read(struct record *rec,
635 				      struct mmap *map)
636 {
637 	int ret;
638 
639 	ret = auxtrace_mmap__read(map, rec->itr, &rec->tool,
640 				  record__process_auxtrace);
641 	if (ret < 0)
642 		return ret;
643 
644 	if (ret)
645 		rec->samples++;
646 
647 	return 0;
648 }
649 
650 static int record__auxtrace_mmap_read_snapshot(struct record *rec,
651 					       struct mmap *map)
652 {
653 	int ret;
654 
655 	ret = auxtrace_mmap__read_snapshot(map, rec->itr, &rec->tool,
656 					   record__process_auxtrace,
657 					   rec->opts.auxtrace_snapshot_size);
658 	if (ret < 0)
659 		return ret;
660 
661 	if (ret)
662 		rec->samples++;
663 
664 	return 0;
665 }
666 
667 static int record__auxtrace_read_snapshot_all(struct record *rec)
668 {
669 	int i;
670 	int rc = 0;
671 
672 	for (i = 0; i < rec->evlist->core.nr_mmaps; i++) {
673 		struct mmap *map = &rec->evlist->mmap[i];
674 
675 		if (!map->auxtrace_mmap.base)
676 			continue;
677 
678 		if (record__auxtrace_mmap_read_snapshot(rec, map) != 0) {
679 			rc = -1;
680 			goto out;
681 		}
682 	}
683 out:
684 	return rc;
685 }
686 
687 static void record__read_auxtrace_snapshot(struct record *rec, bool on_exit)
688 {
689 	pr_debug("Recording AUX area tracing snapshot\n");
690 	if (record__auxtrace_read_snapshot_all(rec) < 0) {
691 		trigger_error(&auxtrace_snapshot_trigger);
692 	} else {
693 		if (auxtrace_record__snapshot_finish(rec->itr, on_exit))
694 			trigger_error(&auxtrace_snapshot_trigger);
695 		else
696 			trigger_ready(&auxtrace_snapshot_trigger);
697 	}
698 }
699 
700 static int record__auxtrace_snapshot_exit(struct record *rec)
701 {
702 	if (trigger_is_error(&auxtrace_snapshot_trigger))
703 		return 0;
704 
705 	if (!auxtrace_record__snapshot_started &&
706 	    auxtrace_record__snapshot_start(rec->itr))
707 		return -1;
708 
709 	record__read_auxtrace_snapshot(rec, true);
710 	if (trigger_is_error(&auxtrace_snapshot_trigger))
711 		return -1;
712 
713 	return 0;
714 }
715 
716 static int record__auxtrace_init(struct record *rec)
717 {
718 	int err;
719 
720 	if (!rec->itr) {
721 		rec->itr = auxtrace_record__init(rec->evlist, &err);
722 		if (err)
723 			return err;
724 	}
725 
726 	err = auxtrace_parse_snapshot_options(rec->itr, &rec->opts,
727 					      rec->opts.auxtrace_snapshot_opts);
728 	if (err)
729 		return err;
730 
731 	err = auxtrace_parse_sample_options(rec->itr, rec->evlist, &rec->opts,
732 					    rec->opts.auxtrace_sample_opts);
733 	if (err)
734 		return err;
735 
736 	auxtrace_regroup_aux_output(rec->evlist);
737 
738 	return auxtrace_parse_filters(rec->evlist);
739 }
740 
741 #else
742 
743 static inline
744 int record__auxtrace_mmap_read(struct record *rec __maybe_unused,
745 			       struct mmap *map __maybe_unused)
746 {
747 	return 0;
748 }
749 
750 static inline
751 void record__read_auxtrace_snapshot(struct record *rec __maybe_unused,
752 				    bool on_exit __maybe_unused)
753 {
754 }
755 
756 static inline
757 int auxtrace_record__snapshot_start(struct auxtrace_record *itr __maybe_unused)
758 {
759 	return 0;
760 }
761 
762 static inline
763 int record__auxtrace_snapshot_exit(struct record *rec __maybe_unused)
764 {
765 	return 0;
766 }
767 
768 static int record__auxtrace_init(struct record *rec __maybe_unused)
769 {
770 	return 0;
771 }
772 
773 #endif
774 
775 static int record__config_text_poke(struct evlist *evlist)
776 {
777 	struct evsel *evsel;
778 	int err;
779 
780 	/* Nothing to do if text poke is already configured */
781 	evlist__for_each_entry(evlist, evsel) {
782 		if (evsel->core.attr.text_poke)
783 			return 0;
784 	}
785 
786 	err = parse_events(evlist, "dummy:u", NULL);
787 	if (err)
788 		return err;
789 
790 	evsel = evlist__last(evlist);
791 
792 	evsel->core.attr.freq = 0;
793 	evsel->core.attr.sample_period = 1;
794 	evsel->core.attr.text_poke = 1;
795 	evsel->core.attr.ksymbol = 1;
796 
797 	evsel->core.system_wide = true;
798 	evsel->no_aux_samples = true;
799 	evsel->immediate = true;
800 
801 	/* Text poke must be collected on all CPUs */
802 	perf_cpu_map__put(evsel->core.own_cpus);
803 	evsel->core.own_cpus = perf_cpu_map__new(NULL);
804 	perf_cpu_map__put(evsel->core.cpus);
805 	evsel->core.cpus = perf_cpu_map__get(evsel->core.own_cpus);
806 
807 	evsel__set_sample_bit(evsel, TIME);
808 
809 	return 0;
810 }
811 
812 static bool record__kcore_readable(struct machine *machine)
813 {
814 	char kcore[PATH_MAX];
815 	int fd;
816 
817 	scnprintf(kcore, sizeof(kcore), "%s/proc/kcore", machine->root_dir);
818 
819 	fd = open(kcore, O_RDONLY);
820 	if (fd < 0)
821 		return false;
822 
823 	close(fd);
824 
825 	return true;
826 }
827 
828 static int record__kcore_copy(struct machine *machine, struct perf_data *data)
829 {
830 	char from_dir[PATH_MAX];
831 	char kcore_dir[PATH_MAX];
832 	int ret;
833 
834 	snprintf(from_dir, sizeof(from_dir), "%s/proc", machine->root_dir);
835 
836 	ret = perf_data__make_kcore_dir(data, kcore_dir, sizeof(kcore_dir));
837 	if (ret)
838 		return ret;
839 
840 	return kcore_copy(from_dir, kcore_dir);
841 }
842 
843 static int record__mmap_evlist(struct record *rec,
844 			       struct evlist *evlist)
845 {
846 	struct record_opts *opts = &rec->opts;
847 	bool auxtrace_overwrite = opts->auxtrace_snapshot_mode ||
848 				  opts->auxtrace_sample_mode;
849 	char msg[512];
850 
851 	if (opts->affinity != PERF_AFFINITY_SYS)
852 		cpu__setup_cpunode_map();
853 
854 	if (evlist__mmap_ex(evlist, opts->mmap_pages,
855 				 opts->auxtrace_mmap_pages,
856 				 auxtrace_overwrite,
857 				 opts->nr_cblocks, opts->affinity,
858 				 opts->mmap_flush, opts->comp_level) < 0) {
859 		if (errno == EPERM) {
860 			pr_err("Permission error mapping pages.\n"
861 			       "Consider increasing "
862 			       "/proc/sys/kernel/perf_event_mlock_kb,\n"
863 			       "or try again with a smaller value of -m/--mmap_pages.\n"
864 			       "(current value: %u,%u)\n",
865 			       opts->mmap_pages, opts->auxtrace_mmap_pages);
866 			return -errno;
867 		} else {
868 			pr_err("failed to mmap with %d (%s)\n", errno,
869 				str_error_r(errno, msg, sizeof(msg)));
870 			if (errno)
871 				return -errno;
872 			else
873 				return -EINVAL;
874 		}
875 	}
876 	return 0;
877 }
878 
879 static int record__mmap(struct record *rec)
880 {
881 	return record__mmap_evlist(rec, rec->evlist);
882 }
883 
884 static int record__open(struct record *rec)
885 {
886 	char msg[BUFSIZ];
887 	struct evsel *pos;
888 	struct evlist *evlist = rec->evlist;
889 	struct perf_session *session = rec->session;
890 	struct record_opts *opts = &rec->opts;
891 	int rc = 0;
892 
893 	/*
894 	 * For initial_delay or system wide, we need to add a dummy event so
895 	 * that we can track PERF_RECORD_MMAP while we wait for the initial
896 	 * delay to pass or while event synthesis takes place.
897 	 */
898 	if (opts->initial_delay || target__has_cpu(&opts->target)) {
899 		pos = evlist__get_tracking_event(evlist);
900 		if (!evsel__is_dummy_event(pos)) {
901 			/* Set up dummy event. */
902 			if (evlist__add_dummy(evlist))
903 				return -ENOMEM;
904 			pos = evlist__last(evlist);
905 			evlist__set_tracking_event(evlist, pos);
906 		}
907 
908 		/*
909 		 * Enable the dummy event when the process is forked for
910 		 * initial_delay, immediately for system wide.
911 		 */
912 		if (opts->initial_delay && !pos->immediate)
913 			pos->core.attr.enable_on_exec = 1;
914 		else
915 			pos->immediate = 1;
916 	}
917 
918 	evlist__config(evlist, opts, &callchain_param);
919 
920 	evlist__for_each_entry(evlist, pos) {
921 try_again:
922 		if (evsel__open(pos, pos->core.cpus, pos->core.threads) < 0) {
923 			if (evsel__fallback(pos, errno, msg, sizeof(msg))) {
924 				if (verbose > 0)
925 					ui__warning("%s\n", msg);
926 				goto try_again;
927 			}
928 			if ((errno == EINVAL || errno == EBADF) &&
929 			    pos->leader != pos &&
930 			    pos->weak_group) {
931 				pos = evlist__reset_weak_group(evlist, pos, true);
932 				goto try_again;
933 			}
934 			rc = -errno;
935 			evsel__open_strerror(pos, &opts->target, errno, msg, sizeof(msg));
936 			ui__error("%s\n", msg);
937 			goto out;
938 		}
939 
940 		pos->supported = true;
941 	}
942 
943 	if (symbol_conf.kptr_restrict && !evlist__exclude_kernel(evlist)) {
944 		pr_warning(
945 "WARNING: Kernel address maps (/proc/{kallsyms,modules}) are restricted,\n"
946 "check /proc/sys/kernel/kptr_restrict and /proc/sys/kernel/perf_event_paranoid.\n\n"
947 "Samples in kernel functions may not be resolved if a suitable vmlinux\n"
948 "file is not found in the buildid cache or in the vmlinux path.\n\n"
949 "Samples in kernel modules won't be resolved at all.\n\n"
950 "If some relocation was applied (e.g. kexec) symbols may be misresolved\n"
951 "even with a suitable vmlinux or kallsyms file.\n\n");
952 	}
953 
954 	if (evlist__apply_filters(evlist, &pos)) {
955 		pr_err("failed to set filter \"%s\" on event %s with %d (%s)\n",
956 			pos->filter, evsel__name(pos), errno,
957 			str_error_r(errno, msg, sizeof(msg)));
958 		rc = -1;
959 		goto out;
960 	}
961 
962 	rc = record__mmap(rec);
963 	if (rc)
964 		goto out;
965 
966 	session->evlist = evlist;
967 	perf_session__set_id_hdr_size(session);
968 out:
969 	return rc;
970 }
971 
972 static int process_sample_event(struct perf_tool *tool,
973 				union perf_event *event,
974 				struct perf_sample *sample,
975 				struct evsel *evsel,
976 				struct machine *machine)
977 {
978 	struct record *rec = container_of(tool, struct record, tool);
979 
980 	if (rec->evlist->first_sample_time == 0)
981 		rec->evlist->first_sample_time = sample->time;
982 
983 	rec->evlist->last_sample_time = sample->time;
984 
985 	if (rec->buildid_all)
986 		return 0;
987 
988 	rec->samples++;
989 	return build_id__mark_dso_hit(tool, event, sample, evsel, machine);
990 }
991 
992 static int process_buildids(struct record *rec)
993 {
994 	struct perf_session *session = rec->session;
995 
996 	if (perf_data__size(&rec->data) == 0)
997 		return 0;
998 
999 	/*
1000 	 * During this process, it'll load the kernel map and replace the
1001 	 * dso->long_name with a real pathname it found.  In this case
1002 	 * we prefer a vmlinux path like
1003 	 *   /lib/modules/3.16.4/build/vmlinux
1004 	 *
1005 	 * rather than the build-id path (in the debug directory), e.g.
1006 	 *   $HOME/.debug/.build-id/f0/6e17aa50adf4d00b88925e03775de107611551
1007 	 */
1008 	symbol_conf.ignore_vmlinux_buildid = true;
1009 
1010 	/*
1011 	 * If --buildid-all is given, it marks all DSOs regardless of hits,
1012 	 * so there is no need to process samples. But if timestamp_boundary is
1013 	 * enabled, it still needs to walk all samples to get the timestamps of
1014 	 * the first/last samples.
1015 	 */
1016 	if (rec->buildid_all && !rec->timestamp_boundary)
1017 		rec->tool.sample = NULL;
1018 
1019 	return perf_session__process_events(session);
1020 }
1021 
1022 static void perf_event__synthesize_guest_os(struct machine *machine, void *data)
1023 {
1024 	int err;
1025 	struct perf_tool *tool = data;
1026 	/*
1027 	 * As for the guest kernel, when processing the record & report subcommands
1028 	 * we arrange the module mmaps prior to the guest kernel mmap and trigger
1029 	 * a dso preload, because by default guest module symbols are loaded
1030 	 * from guest kallsyms instead of /lib/modules/XXX/XXX. This
1031 	 * method avoids missing symbols when the first address is
1032 	 * in a module instead of in the guest kernel.
1033 	 */
1034 	err = perf_event__synthesize_modules(tool, process_synthesized_event,
1035 					     machine);
1036 	if (err < 0)
1037 		pr_err("Couldn't record guest kernel [%d]'s reference"
1038 		       " relocation symbol.\n", machine->pid);
1039 
1040 	/*
1041 	 * We use _stext for the guest kernel because the guest kernel's /proc/kallsyms
1042 	 * sometimes has no _text.
1043 	 */
1044 	err = perf_event__synthesize_kernel_mmap(tool, process_synthesized_event,
1045 						 machine);
1046 	if (err < 0)
1047 		pr_err("Couldn't record guest kernel [%d]'s reference"
1048 		       " relocation symbol.\n", machine->pid);
1049 }
1050 
1051 static struct perf_event_header finished_round_event = {
1052 	.size = sizeof(struct perf_event_header),
1053 	.type = PERF_RECORD_FINISHED_ROUND,
1054 };
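/*
 * PERF_RECORD_FINISHED_ROUND acts as an ordering barrier for the report side:
 * everything written before it can be sorted and flushed by the
 * ordered-events machinery. record__mmap_read_evlist() emits it after each
 * full pass over the ring buffers that wrote at least one event.
 */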
1055 
1056 static void record__adjust_affinity(struct record *rec, struct mmap *map)
1057 {
1058 	if (rec->opts.affinity != PERF_AFFINITY_SYS &&
1059 	    !bitmap_equal(rec->affinity_mask.bits, map->affinity_mask.bits,
1060 			  rec->affinity_mask.nbits)) {
1061 		bitmap_zero(rec->affinity_mask.bits, rec->affinity_mask.nbits);
1062 		bitmap_or(rec->affinity_mask.bits, rec->affinity_mask.bits,
1063 			  map->affinity_mask.bits, rec->affinity_mask.nbits);
1064 		sched_setaffinity(0, MMAP_CPU_MASK_BYTES(&rec->affinity_mask),
1065 				  (cpu_set_t *)rec->affinity_mask.bits);
1066 		if (verbose == 2)
1067 			mmap_cpu_mask__scnprintf(&rec->affinity_mask, "thread");
1068 	}
1069 }
1070 
1071 static size_t process_comp_header(void *record, size_t increment)
1072 {
1073 	struct perf_record_compressed *event = record;
1074 	size_t size = sizeof(*event);
1075 
1076 	if (increment) {
1077 		event->header.size += increment;
1078 		return increment;
1079 	}
1080 
1081 	event->header.type = PERF_RECORD_COMPRESSED;
1082 	event->header.size = size;
1083 
1084 	return size;
1085 }
1086 
1087 static size_t zstd_compress(struct perf_session *session, void *dst, size_t dst_size,
1088 			    void *src, size_t src_size)
1089 {
1090 	size_t compressed;
1091 	size_t max_record_size = PERF_SAMPLE_MAX_SIZE - sizeof(struct perf_record_compressed) - 1;
1092 
1093 	compressed = zstd_compress_stream_to_records(&session->zstd_data, dst, dst_size, src, src_size,
1094 						     max_record_size, process_comp_header);
1095 
1096 	session->bytes_transferred += src_size;
1097 	session->bytes_compressed  += compressed;
1098 
1099 	return compressed;
1100 }
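/*
 * A rough sketch of the framing produced here: zstd_compress_stream_to_records()
 * splits the compressed payload into chunks no larger than max_record_size and
 * prefixes each with a struct perf_record_compressed header via
 * process_comp_header(), so the report side can decompress every
 * PERF_RECORD_COMPRESSED event back into the original event stream.
 */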
1101 
1102 static int record__mmap_read_evlist(struct record *rec, struct evlist *evlist,
1103 				    bool overwrite, bool synch)
1104 {
1105 	u64 bytes_written = rec->bytes_written;
1106 	int i;
1107 	int rc = 0;
1108 	struct mmap *maps;
1109 	int trace_fd = rec->data.file.fd;
1110 	off_t off = 0;
1111 
1112 	if (!evlist)
1113 		return 0;
1114 
1115 	maps = overwrite ? evlist->overwrite_mmap : evlist->mmap;
1116 	if (!maps)
1117 		return 0;
1118 
1119 	if (overwrite && evlist->bkw_mmap_state != BKW_MMAP_DATA_PENDING)
1120 		return 0;
1121 
1122 	if (record__aio_enabled(rec))
1123 		off = record__aio_get_pos(trace_fd);
1124 
1125 	for (i = 0; i < evlist->core.nr_mmaps; i++) {
1126 		u64 flush = 0;
1127 		struct mmap *map = &maps[i];
1128 
1129 		if (map->core.base) {
1130 			record__adjust_affinity(rec, map);
1131 			if (synch) {
1132 				flush = map->core.flush;
1133 				map->core.flush = 1;
1134 			}
1135 			if (!record__aio_enabled(rec)) {
1136 				if (perf_mmap__push(map, rec, record__pushfn) < 0) {
1137 					if (synch)
1138 						map->core.flush = flush;
1139 					rc = -1;
1140 					goto out;
1141 				}
1142 			} else {
1143 				if (record__aio_push(rec, map, &off) < 0) {
1144 					record__aio_set_pos(trace_fd, off);
1145 					if (synch)
1146 						map->core.flush = flush;
1147 					rc = -1;
1148 					goto out;
1149 				}
1150 			}
1151 			if (synch)
1152 				map->core.flush = flush;
1153 		}
1154 
1155 		if (map->auxtrace_mmap.base && !rec->opts.auxtrace_snapshot_mode &&
1156 		    !rec->opts.auxtrace_sample_mode &&
1157 		    record__auxtrace_mmap_read(rec, map) != 0) {
1158 			rc = -1;
1159 			goto out;
1160 		}
1161 	}
1162 
1163 	if (record__aio_enabled(rec))
1164 		record__aio_set_pos(trace_fd, off);
1165 
1166 	/*
1167 	 * Mark the round finished in case we wrote
1168 	 * at least one event.
1169 	 */
1170 	if (bytes_written != rec->bytes_written)
1171 		rc = record__write(rec, NULL, &finished_round_event, sizeof(finished_round_event));
1172 
1173 	if (overwrite)
1174 		evlist__toggle_bkw_mmap(evlist, BKW_MMAP_EMPTY);
1175 out:
1176 	return rc;
1177 }
1178 
1179 static int record__mmap_read_all(struct record *rec, bool synch)
1180 {
1181 	int err;
1182 
1183 	err = record__mmap_read_evlist(rec, rec->evlist, false, synch);
1184 	if (err)
1185 		return err;
1186 
1187 	return record__mmap_read_evlist(rec, rec->evlist, true, synch);
1188 }
1189 
1190 static void record__init_features(struct record *rec)
1191 {
1192 	struct perf_session *session = rec->session;
1193 	int feat;
1194 
1195 	for (feat = HEADER_FIRST_FEATURE; feat < HEADER_LAST_FEATURE; feat++)
1196 		perf_header__set_feat(&session->header, feat);
1197 
1198 	if (rec->no_buildid)
1199 		perf_header__clear_feat(&session->header, HEADER_BUILD_ID);
1200 
1201 	if (!have_tracepoints(&rec->evlist->core.entries))
1202 		perf_header__clear_feat(&session->header, HEADER_TRACING_DATA);
1203 
1204 	if (!rec->opts.branch_stack)
1205 		perf_header__clear_feat(&session->header, HEADER_BRANCH_STACK);
1206 
1207 	if (!rec->opts.full_auxtrace)
1208 		perf_header__clear_feat(&session->header, HEADER_AUXTRACE);
1209 
1210 	if (!(rec->opts.use_clockid && rec->opts.clockid_res_ns))
1211 		perf_header__clear_feat(&session->header, HEADER_CLOCKID);
1212 
1213 	if (!rec->opts.use_clockid)
1214 		perf_header__clear_feat(&session->header, HEADER_CLOCK_DATA);
1215 
1216 	perf_header__clear_feat(&session->header, HEADER_DIR_FORMAT);
1217 	if (!record__comp_enabled(rec))
1218 		perf_header__clear_feat(&session->header, HEADER_COMPRESSED);
1219 
1220 	perf_header__clear_feat(&session->header, HEADER_STAT);
1221 }
1222 
1223 static void
1224 record__finish_output(struct record *rec)
1225 {
1226 	struct perf_data *data = &rec->data;
1227 	int fd = perf_data__fd(data);
1228 
1229 	if (data->is_pipe)
1230 		return;
1231 
1232 	rec->session->header.data_size += rec->bytes_written;
1233 	data->file.size = lseek(perf_data__fd(data), 0, SEEK_CUR);
1234 
1235 	if (!rec->no_buildid) {
1236 		process_buildids(rec);
1237 
1238 		if (rec->buildid_all)
1239 			dsos__hit_all(rec->session);
1240 	}
1241 	perf_session__write_header(rec->session, rec->evlist, fd, true);
1242 
1243 	return;
1244 }
1245 
1246 static int record__synthesize_workload(struct record *rec, bool tail)
1247 {
1248 	int err;
1249 	struct perf_thread_map *thread_map;
1250 
1251 	if (rec->opts.tail_synthesize != tail)
1252 		return 0;
1253 
1254 	thread_map = thread_map__new_by_tid(rec->evlist->workload.pid);
1255 	if (thread_map == NULL)
1256 		return -1;
1257 
1258 	err = perf_event__synthesize_thread_map(&rec->tool, thread_map,
1259 						 process_synthesized_event,
1260 						 &rec->session->machines.host,
1261 						 rec->opts.sample_address);
1262 	perf_thread_map__put(thread_map);
1263 	return err;
1264 }
1265 
1266 static int record__synthesize(struct record *rec, bool tail);
1267 
1268 static int
1269 record__switch_output(struct record *rec, bool at_exit)
1270 {
1271 	struct perf_data *data = &rec->data;
1272 	int fd, err;
1273 	char *new_filename;
1274 
1275 	/* Same size as a real timestamp, e.g. "2015122520103046" */
1276 	char timestamp[] = "InvalidTimestamp";
1277 
1278 	record__aio_mmap_read_sync(rec);
1279 
1280 	record__synthesize(rec, true);
1281 	if (target__none(&rec->opts.target))
1282 		record__synthesize_workload(rec, true);
1283 
1284 	rec->samples = 0;
1285 	record__finish_output(rec);
1286 	err = fetch_current_timestamp(timestamp, sizeof(timestamp));
1287 	if (err) {
1288 		pr_err("Failed to get current timestamp\n");
1289 		return -EINVAL;
1290 	}
1291 
1292 	fd = perf_data__switch(data, timestamp,
1293 				    rec->session->header.data_offset,
1294 				    at_exit, &new_filename);
1295 	if (fd >= 0 && !at_exit) {
1296 		rec->bytes_written = 0;
1297 		rec->session->header.data_size = 0;
1298 	}
1299 
1300 	if (!quiet)
1301 		fprintf(stderr, "[ perf record: Dump %s.%s ]\n",
1302 			data->path, timestamp);
1303 
1304 	if (rec->switch_output.num_files) {
1305 		int n = rec->switch_output.cur_file + 1;
1306 
1307 		if (n >= rec->switch_output.num_files)
1308 			n = 0;
1309 		rec->switch_output.cur_file = n;
1310 		if (rec->switch_output.filenames[n]) {
1311 			remove(rec->switch_output.filenames[n]);
1312 			zfree(&rec->switch_output.filenames[n]);
1313 		}
1314 		rec->switch_output.filenames[n] = new_filename;
1315 	} else {
1316 		free(new_filename);
1317 	}
1318 
1319 	/* Output tracking events */
1320 	if (!at_exit) {
1321 		record__synthesize(rec, false);
1322 
1323 		/*
1324 		 * In 'perf record --switch-output' without -a,
1325 		 * record__synthesize() in record__switch_output() won't
1326 		 * generate tracking events because there's no thread_map
1327 		 * in the evlist, which means the newly created perf.data won't
1328 		 * contain map and comm information.
1329 		 * Create a fake thread_map and directly call
1330 		 * perf_event__synthesize_thread_map() for those events.
1331 		 */
1332 		if (target__none(&rec->opts.target))
1333 			record__synthesize_workload(rec, false);
1334 	}
1335 	return fd;
1336 }
1337 
1338 static volatile int workload_exec_errno;
1339 
1340 /*
1341  * evlist__prepare_workload will send a SIGUSR1
1342  * if the fork fails, since we asked for it by setting its
1343  * want_signal to true.
1344  */
1345 static void workload_exec_failed_signal(int signo __maybe_unused,
1346 					siginfo_t *info,
1347 					void *ucontext __maybe_unused)
1348 {
1349 	workload_exec_errno = info->si_value.sival_int;
1350 	done = 1;
1351 	child_finished = 1;
1352 }
1353 
1354 static void snapshot_sig_handler(int sig);
1355 static void alarm_sig_handler(int sig);
1356 
1357 static const struct perf_event_mmap_page *evlist__pick_pc(struct evlist *evlist)
1358 {
1359 	if (evlist) {
1360 		if (evlist->mmap && evlist->mmap[0].core.base)
1361 			return evlist->mmap[0].core.base;
1362 		if (evlist->overwrite_mmap && evlist->overwrite_mmap[0].core.base)
1363 			return evlist->overwrite_mmap[0].core.base;
1364 	}
1365 	return NULL;
1366 }
1367 
1368 static const struct perf_event_mmap_page *record__pick_pc(struct record *rec)
1369 {
1370 	const struct perf_event_mmap_page *pc = evlist__pick_pc(rec->evlist);
1371 	if (pc)
1372 		return pc;
1373 	return NULL;
1374 }
1375 
1376 static int record__synthesize(struct record *rec, bool tail)
1377 {
1378 	struct perf_session *session = rec->session;
1379 	struct machine *machine = &session->machines.host;
1380 	struct perf_data *data = &rec->data;
1381 	struct record_opts *opts = &rec->opts;
1382 	struct perf_tool *tool = &rec->tool;
1383 	int fd = perf_data__fd(data);
1384 	int err = 0;
1385 	event_op f = process_synthesized_event;
1386 
1387 	if (rec->opts.tail_synthesize != tail)
1388 		return 0;
1389 
1390 	if (data->is_pipe) {
1391 		/*
1392 		 * We need to synthesize events first, because some
1393 		 * features work on top of them (on the report side).
1394 		 */
1395 		err = perf_event__synthesize_attrs(tool, rec->evlist,
1396 						   process_synthesized_event);
1397 		if (err < 0) {
1398 			pr_err("Couldn't synthesize attrs.\n");
1399 			goto out;
1400 		}
1401 
1402 		err = perf_event__synthesize_features(tool, session, rec->evlist,
1403 						      process_synthesized_event);
1404 		if (err < 0) {
1405 			pr_err("Couldn't synthesize features.\n");
1406 			return err;
1407 		}
1408 
1409 		if (have_tracepoints(&rec->evlist->core.entries)) {
1410 			/*
1411 			 * FIXME err <= 0 here actually means that
1412 			 * there were no tracepoints, so it's not really
1413 			 * an error, just that we don't need to
1414 			 * synthesize anything.  We really have to
1415 			 * return this more properly and also
1416 			 * propagate the errors that currently end up calling die()
1417 			 */
1418 			err = perf_event__synthesize_tracing_data(tool,	fd, rec->evlist,
1419 								  process_synthesized_event);
1420 			if (err <= 0) {
1421 				pr_err("Couldn't record tracing data.\n");
1422 				goto out;
1423 			}
1424 			rec->bytes_written += err;
1425 		}
1426 	}
1427 
1428 	err = perf_event__synth_time_conv(record__pick_pc(rec), tool,
1429 					  process_synthesized_event, machine);
1430 	if (err)
1431 		goto out;
1432 
1433 	/* Synthesize id_index before auxtrace_info */
1434 	if (rec->opts.auxtrace_sample_mode) {
1435 		err = perf_event__synthesize_id_index(tool,
1436 						      process_synthesized_event,
1437 						      session->evlist, machine);
1438 		if (err)
1439 			goto out;
1440 	}
1441 
1442 	if (rec->opts.full_auxtrace) {
1443 		err = perf_event__synthesize_auxtrace_info(rec->itr, tool,
1444 					session, process_synthesized_event);
1445 		if (err)
1446 			goto out;
1447 	}
1448 
1449 	if (!evlist__exclude_kernel(rec->evlist)) {
1450 		err = perf_event__synthesize_kernel_mmap(tool, process_synthesized_event,
1451 							 machine);
1452 		WARN_ONCE(err < 0, "Couldn't record kernel reference relocation symbol\n"
1453 				   "Symbol resolution may be skewed if relocation was used (e.g. kexec).\n"
1454 				   "Check /proc/kallsyms permission or run as root.\n");
1455 
1456 		err = perf_event__synthesize_modules(tool, process_synthesized_event,
1457 						     machine);
1458 		WARN_ONCE(err < 0, "Couldn't record kernel module information.\n"
1459 				   "Symbol resolution may be skewed if relocation was used (e.g. kexec).\n"
1460 				   "Check /proc/modules permission or run as root.\n");
1461 	}
1462 
1463 	if (perf_guest) {
1464 		machines__process_guests(&session->machines,
1465 					 perf_event__synthesize_guest_os, tool);
1466 	}
1467 
1468 	err = perf_event__synthesize_extra_attr(&rec->tool,
1469 						rec->evlist,
1470 						process_synthesized_event,
1471 						data->is_pipe);
1472 	if (err)
1473 		goto out;
1474 
1475 	err = perf_event__synthesize_thread_map2(&rec->tool, rec->evlist->core.threads,
1476 						 process_synthesized_event,
1477 						NULL);
1478 	if (err < 0) {
1479 		pr_err("Couldn't synthesize thread map.\n");
1480 		return err;
1481 	}
1482 
1483 	err = perf_event__synthesize_cpu_map(&rec->tool, rec->evlist->core.cpus,
1484 					     process_synthesized_event, NULL);
1485 	if (err < 0) {
1486 		pr_err("Couldn't synthesize cpu map.\n");
1487 		return err;
1488 	}
1489 
1490 	err = perf_event__synthesize_bpf_events(session, process_synthesized_event,
1491 						machine, opts);
1492 	if (err < 0)
1493 		pr_warning("Couldn't synthesize bpf events.\n");
1494 
1495 	err = perf_event__synthesize_cgroups(tool, process_synthesized_event,
1496 					     machine);
1497 	if (err < 0)
1498 		pr_warning("Couldn't synthesize cgroup events.\n");
1499 
1500 	if (rec->opts.nr_threads_synthesize > 1) {
1501 		perf_set_multithreaded();
1502 		f = process_locked_synthesized_event;
1503 	}
1504 
1505 	err = __machine__synthesize_threads(machine, tool, &opts->target, rec->evlist->core.threads,
1506 					    f, opts->sample_address,
1507 					    rec->opts.nr_threads_synthesize);
1508 
1509 	if (rec->opts.nr_threads_synthesize > 1)
1510 		perf_set_singlethreaded();
1511 
1512 out:
1513 	return err;
1514 }
1515 
1516 static int record__process_signal_event(union perf_event *event __maybe_unused, void *data)
1517 {
1518 	struct record *rec = data;
1519 	pthread_kill(rec->thread_id, SIGUSR2);
1520 	return 0;
1521 }
1522 
1523 static int record__setup_sb_evlist(struct record *rec)
1524 {
1525 	struct record_opts *opts = &rec->opts;
1526 
1527 	if (rec->sb_evlist != NULL) {
1528 		/*
1529 		 * We get here if --switch-output-event populated the
1530 		 * sb_evlist, so associate a callback that will send a SIGUSR2
1531 		 * to the main thread.
1532 		 */
1533 		evlist__set_cb(rec->sb_evlist, record__process_signal_event, rec);
1534 		rec->thread_id = pthread_self();
1535 	}
1536 #ifdef HAVE_LIBBPF_SUPPORT
1537 	if (!opts->no_bpf_event) {
1538 		if (rec->sb_evlist == NULL) {
1539 			rec->sb_evlist = evlist__new();
1540 
1541 			if (rec->sb_evlist == NULL) {
1542 				pr_err("Couldn't create side band evlist.\n");
1543 				return -1;
1544 			}
1545 		}
1546 
1547 		if (evlist__add_bpf_sb_event(rec->sb_evlist, &rec->session->header.env)) {
1548 			pr_err("Couldn't ask for PERF_RECORD_BPF_EVENT side band events.\n");
1549 			return -1;
1550 		}
1551 	}
1552 #endif
1553 	if (evlist__start_sb_thread(rec->sb_evlist, &rec->opts.target)) {
1554 		pr_debug("Couldn't start the BPF side band thread:\nBPF programs starting from now on won't be annotatable\n");
1555 		opts->no_bpf_event = true;
1556 	}
1557 
1558 	return 0;
1559 }
1560 
1561 static int record__init_clock(struct record *rec)
1562 {
1563 	struct perf_session *session = rec->session;
1564 	struct timespec ref_clockid;
1565 	struct timeval ref_tod;
1566 	u64 ref;
1567 
1568 	if (!rec->opts.use_clockid)
1569 		return 0;
1570 
1571 	if (rec->opts.use_clockid && rec->opts.clockid_res_ns)
1572 		session->header.env.clock.clockid_res_ns = rec->opts.clockid_res_ns;
1573 
1574 	session->header.env.clock.clockid = rec->opts.clockid;
1575 
1576 	if (gettimeofday(&ref_tod, NULL) != 0) {
1577 		pr_err("gettimeofday failed, cannot set reference time.\n");
1578 		return -1;
1579 	}
1580 
1581 	if (clock_gettime(rec->opts.clockid, &ref_clockid)) {
1582 		pr_err("clock_gettime failed, cannot set reference time.\n");
1583 		return -1;
1584 	}
1585 
1586 	ref = (u64) ref_tod.tv_sec * NSEC_PER_SEC +
1587 	      (u64) ref_tod.tv_usec * NSEC_PER_USEC;
1588 
1589 	session->header.env.clock.tod_ns = ref;
1590 
1591 	ref = (u64) ref_clockid.tv_sec * NSEC_PER_SEC +
1592 	      (u64) ref_clockid.tv_nsec;
1593 
1594 	session->header.env.clock.clockid_ns = ref;
1595 	return 0;
1596 }
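/*
 * A sketch of how these references can be used later (assuming the usual
 * linear conversion): a sample timestamp T taken with rec->opts.clockid maps
 * to wall-clock time approximately as tod_ns + (T - clockid_ns), which lets
 * the report side correlate samples with time of day.
 */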
1597 
1598 static void hit_auxtrace_snapshot_trigger(struct record *rec)
1599 {
1600 	if (trigger_is_ready(&auxtrace_snapshot_trigger)) {
1601 		trigger_hit(&auxtrace_snapshot_trigger);
1602 		auxtrace_record__snapshot_started = 1;
1603 		if (auxtrace_record__snapshot_start(rec->itr))
1604 			trigger_error(&auxtrace_snapshot_trigger);
1605 	}
1606 }
1607 
1608 static void record__uniquify_name(struct record *rec)
1609 {
1610 	struct evsel *pos;
1611 	struct evlist *evlist = rec->evlist;
1612 	char *new_name;
1613 	int ret;
1614 
1615 	if (!perf_pmu__has_hybrid())
1616 		return;
1617 
1618 	evlist__for_each_entry(evlist, pos) {
1619 		if (!evsel__is_hybrid(pos))
1620 			continue;
1621 
1622 		if (strchr(pos->name, '/'))
1623 			continue;
1624 
1625 		ret = asprintf(&new_name, "%s/%s/",
1626 			       pos->pmu_name, pos->name);
1627 		if (ret > 0) {
1628 			free(pos->name);
1629 			pos->name = new_name;
1630 		}
1631 	}
1632 }
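/*
 * For example (illustrative): on a hybrid system an event specified as
 * "cycles" is opened once per hybrid PMU, and each instance is renamed here
 * to something like "cpu_core/cycles/" or "cpu_atom/cycles/" so the two stay
 * distinguishable in the output.
 */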
1633 
1634 static int __cmd_record(struct record *rec, int argc, const char **argv)
1635 {
1636 	int err;
1637 	int status = 0;
1638 	unsigned long waking = 0;
1639 	const bool forks = argc > 0;
1640 	struct perf_tool *tool = &rec->tool;
1641 	struct record_opts *opts = &rec->opts;
1642 	struct perf_data *data = &rec->data;
1643 	struct perf_session *session;
1644 	bool disabled = false, draining = false;
1645 	int fd;
1646 	float ratio = 0;
1647 	enum evlist_ctl_cmd cmd = EVLIST_CTL_CMD_UNSUPPORTED;
1648 
1649 	atexit(record__sig_exit);
1650 	signal(SIGCHLD, sig_handler);
1651 	signal(SIGINT, sig_handler);
1652 	signal(SIGTERM, sig_handler);
1653 	signal(SIGSEGV, sigsegv_handler);
1654 
1655 	if (rec->opts.record_namespaces)
1656 		tool->namespace_events = true;
1657 
1658 	if (rec->opts.record_cgroup) {
1659 #ifdef HAVE_FILE_HANDLE
1660 		tool->cgroup_events = true;
1661 #else
1662 		pr_err("cgroup tracking is not supported\n");
1663 		return -1;
1664 #endif
1665 	}
1666 
1667 	if (rec->opts.auxtrace_snapshot_mode || rec->switch_output.enabled) {
1668 		signal(SIGUSR2, snapshot_sig_handler);
1669 		if (rec->opts.auxtrace_snapshot_mode)
1670 			trigger_on(&auxtrace_snapshot_trigger);
1671 		if (rec->switch_output.enabled)
1672 			trigger_on(&switch_output_trigger);
1673 	} else {
1674 		signal(SIGUSR2, SIG_IGN);
1675 	}
1676 
1677 	session = perf_session__new(data, false, tool);
1678 	if (IS_ERR(session)) {
1679 		pr_err("Perf session creation failed.\n");
1680 		return PTR_ERR(session);
1681 	}
1682 
1683 	fd = perf_data__fd(data);
1684 	rec->session = session;
1685 
1686 	if (zstd_init(&session->zstd_data, rec->opts.comp_level) < 0) {
1687 		pr_err("Compression initialization failed.\n");
1688 		return -1;
1689 	}
1690 #ifdef HAVE_EVENTFD_SUPPORT
1691 	done_fd = eventfd(0, EFD_NONBLOCK);
1692 	if (done_fd < 0) {
1693 		pr_err("Failed to create wakeup eventfd, error: %m\n");
1694 		status = -1;
1695 		goto out_delete_session;
1696 	}
1697 	err = evlist__add_wakeup_eventfd(rec->evlist, done_fd);
1698 	if (err < 0) {
1699 		pr_err("Failed to add wakeup eventfd to poll list\n");
1700 		status = err;
1701 		goto out_delete_session;
1702 	}
1703 #endif // HAVE_EVENTFD_SUPPORT
1704 
1705 	session->header.env.comp_type  = PERF_COMP_ZSTD;
1706 	session->header.env.comp_level = rec->opts.comp_level;
1707 
1708 	if (rec->opts.kcore &&
1709 	    !record__kcore_readable(&session->machines.host)) {
1710 		pr_err("ERROR: kcore is not readable.\n");
1711 		return -1;
1712 	}
1713 
1714 	if (record__init_clock(rec))
1715 		return -1;
1716 
1717 	record__init_features(rec);
1718 
1719 	if (forks) {
1720 		err = evlist__prepare_workload(rec->evlist, &opts->target, argv, data->is_pipe,
1721 					       workload_exec_failed_signal);
1722 		if (err < 0) {
1723 			pr_err("Couldn't run the workload!\n");
1724 			status = err;
1725 			goto out_delete_session;
1726 		}
1727 	}
1728 
1729 	/*
1730 	 * If we have just a single event and are sending data
1731 	 * through a pipe, we need to force the id allocation,
1732 	 * because we synthesize the event name through the pipe
1733 	 * and need the id for that.
1734 	 */
1735 	if (data->is_pipe && rec->evlist->core.nr_entries == 1)
1736 		rec->opts.sample_id = true;
1737 
1738 	record__uniquify_name(rec);
1739 
1740 	if (record__open(rec) != 0) {
1741 		err = -1;
1742 		goto out_child;
1743 	}
1744 	session->header.env.comp_mmap_len = session->evlist->core.mmap_len;
1745 
1746 	if (rec->opts.kcore) {
1747 		err = record__kcore_copy(&session->machines.host, data);
1748 		if (err) {
1749 			pr_err("ERROR: Failed to copy kcore\n");
1750 			goto out_child;
1751 		}
1752 	}
1753 
1754 	err = bpf__apply_obj_config();
1755 	if (err) {
1756 		char errbuf[BUFSIZ];
1757 
1758 		bpf__strerror_apply_obj_config(err, errbuf, sizeof(errbuf));
1759 		pr_err("ERROR: Apply config to BPF failed: %s\n",
1760 			 errbuf);
1761 		goto out_child;
1762 	}
1763 
1764 	/*
1765 	 * Normally perf_session__new would do this, but it doesn't have the
1766 	 * evlist.
1767 	 */
1768 	if (rec->tool.ordered_events && !evlist__sample_id_all(rec->evlist)) {
1769 		pr_warning("WARNING: No sample_id_all support, falling back to unordered processing\n");
1770 		rec->tool.ordered_events = false;
1771 	}
1772 
1773 	if (!rec->evlist->nr_groups)
1774 		perf_header__clear_feat(&session->header, HEADER_GROUP_DESC);
1775 
1776 	if (data->is_pipe) {
1777 		err = perf_header__write_pipe(fd);
1778 		if (err < 0)
1779 			goto out_child;
1780 	} else {
1781 		err = perf_session__write_header(session, rec->evlist, fd, false);
1782 		if (err < 0)
1783 			goto out_child;
1784 	}
1785 
1786 	err = -1;
1787 	if (!rec->no_buildid
1788 	    && !perf_header__has_feat(&session->header, HEADER_BUILD_ID)) {
1789 		pr_err("Couldn't generate buildids. "
1790 		       "Use --no-buildid to profile anyway.\n");
1791 		goto out_child;
1792 	}
1793 
1794 	err = record__setup_sb_evlist(rec);
1795 	if (err)
1796 		goto out_child;
1797 
1798 	err = record__synthesize(rec, false);
1799 	if (err < 0)
1800 		goto out_child;
1801 
1802 	if (rec->realtime_prio) {
1803 		struct sched_param param;
1804 
1805 		param.sched_priority = rec->realtime_prio;
1806 		if (sched_setscheduler(0, SCHED_FIFO, &param)) {
1807 			pr_err("Could not set realtime priority.\n");
1808 			err = -1;
1809 			goto out_child;
1810 		}
1811 	}
1812 
1813 	/*
1814 	 * When perf is starting the traced process, all the events
1815 	 * (apart from group members) have enable_on_exec=1 set,
1816 	 * so don't spoil it by prematurely enabling them.
1817 	 */
1818 	if (!target__none(&opts->target) && !opts->initial_delay)
1819 		evlist__enable(rec->evlist);
1820 
1821 	/*
1822 	 * Let the child rip
1823 	 */
1824 	if (forks) {
1825 		struct machine *machine = &session->machines.host;
1826 		union perf_event *event;
1827 		pid_t tgid;
1828 
1829 		event = malloc(sizeof(event->comm) + machine->id_hdr_size);
1830 		if (event == NULL) {
1831 			err = -ENOMEM;
1832 			goto out_child;
1833 		}
1834 
1835 		/*
1836 		 * Some H/W events are generated before the COMM event,
1837 		 * which is emitted during exec(), so perf script
1838 		 * cannot see a correct process name for those events.
1839 		 * Synthesize a COMM event to prevent it.
1840 		 */
1841 		tgid = perf_event__synthesize_comm(tool, event,
1842 						   rec->evlist->workload.pid,
1843 						   process_synthesized_event,
1844 						   machine);
1845 		free(event);
1846 
1847 		if (tgid == -1)
1848 			goto out_child;
1849 
1850 		event = malloc(sizeof(event->namespaces) +
1851 			       (NR_NAMESPACES * sizeof(struct perf_ns_link_info)) +
1852 			       machine->id_hdr_size);
1853 		if (event == NULL) {
1854 			err = -ENOMEM;
1855 			goto out_child;
1856 		}
1857 
1858 		/*
1859 		 * Synthesize NAMESPACES event for the command specified.
1860 		 */
1861 		perf_event__synthesize_namespaces(tool, event,
1862 						  rec->evlist->workload.pid,
1863 						  tgid, process_synthesized_event,
1864 						  machine);
1865 		free(event);
1866 
1867 		evlist__start_workload(rec->evlist);
1868 	}
1869 
1870 	if (evlist__initialize_ctlfd(rec->evlist, opts->ctl_fd, opts->ctl_fd_ack))
1871 		goto out_child;
1872 
1873 	if (opts->initial_delay) {
1874 		pr_info(EVLIST_DISABLED_MSG);
1875 		if (opts->initial_delay > 0) {
1876 			usleep(opts->initial_delay * USEC_PER_MSEC);
1877 			evlist__enable(rec->evlist);
1878 			pr_info(EVLIST_ENABLED_MSG);
1879 		}
1880 	}
1881 
1882 	trigger_ready(&auxtrace_snapshot_trigger);
1883 	trigger_ready(&switch_output_trigger);
1884 	perf_hooks__invoke_record_start();
1885 	for (;;) {
1886 		unsigned long long hits = rec->samples;
1887 
1888 		/*
1889 		 * rec->evlist->bkw_mmap_state may already be
1890 		 * BKW_MMAP_EMPTY here: when done == true and
1891 		 * hits != rec->samples in the previous round.
1892 		 *
1893 		 * evlist__toggle_bkw_mmap ensures we never
1894 		 * convert BKW_MMAP_EMPTY to BKW_MMAP_DATA_PENDING.
1895 		 */
1896 		if (trigger_is_hit(&switch_output_trigger) || done || draining)
1897 			evlist__toggle_bkw_mmap(rec->evlist, BKW_MMAP_DATA_PENDING);
1898 
1899 		if (record__mmap_read_all(rec, false) < 0) {
1900 			trigger_error(&auxtrace_snapshot_trigger);
1901 			trigger_error(&switch_output_trigger);
1902 			err = -1;
1903 			goto out_child;
1904 		}
1905 
1906 		if (auxtrace_record__snapshot_started) {
1907 			auxtrace_record__snapshot_started = 0;
1908 			if (!trigger_is_error(&auxtrace_snapshot_trigger))
1909 				record__read_auxtrace_snapshot(rec, false);
1910 			if (trigger_is_error(&auxtrace_snapshot_trigger)) {
1911 				pr_err("AUX area tracing snapshot failed\n");
1912 				err = -1;
1913 				goto out_child;
1914 			}
1915 		}
1916 
1917 		if (trigger_is_hit(&switch_output_trigger)) {
1918 			/*
1919 			 * If switch_output_trigger is hit, the data in the
1920 			 * overwritable ring buffer should have been collected,
1921 			 * so bkw_mmap_state should be set to BKW_MMAP_EMPTY.
1922 			 *
1923 			 * If SIGUSR2 was raised after or during record__mmap_read_all(),
1924 			 * record__mmap_read_all() didn't collect data from the
1925 			 * overwritable ring buffer. Read again.
1926 			 */
1927 			if (rec->evlist->bkw_mmap_state == BKW_MMAP_RUNNING)
1928 				continue;
1929 			trigger_ready(&switch_output_trigger);
1930 
1931 			/*
1932 			 * Re-enable events in the overwrite ring buffer after
1933 			 * record__mmap_read_all(): we should have collected
1934 			 * data from it.
1935 			 */
1936 			evlist__toggle_bkw_mmap(rec->evlist, BKW_MMAP_RUNNING);
1937 
1938 			if (!quiet)
1939 				fprintf(stderr, "[ perf record: dump data: Woken up %ld times ]\n",
1940 					waking);
1941 			waking = 0;
1942 			fd = record__switch_output(rec, false);
1943 			if (fd < 0) {
1944 				pr_err("Failed to switch to new file\n");
1945 				trigger_error(&switch_output_trigger);
1946 				err = fd;
1947 				goto out_child;
1948 			}
1949 
1950 			/* re-arm the alarm */
1951 			if (rec->switch_output.time)
1952 				alarm(rec->switch_output.time);
1953 		}
1954 
1955 		if (hits == rec->samples) {
1956 			if (done || draining)
1957 				break;
1958 			err = evlist__poll(rec->evlist, -1);
1959 			/*
1960 			 * Propagate error, only if there's any. Ignore positive
1961 			 * number of returned events and interrupt error.
1962 			 */
1963 			if (err > 0 || (err < 0 && errno == EINTR))
1964 				err = 0;
1965 			waking++;
1966 
1967 			if (evlist__filter_pollfd(rec->evlist, POLLERR | POLLHUP) == 0)
1968 				draining = true;
1969 		}
1970 
1971 		if (evlist__ctlfd_process(rec->evlist, &cmd) > 0) {
1972 			switch (cmd) {
1973 			case EVLIST_CTL_CMD_SNAPSHOT:
1974 				hit_auxtrace_snapshot_trigger(rec);
1975 				evlist__ctlfd_ack(rec->evlist);
1976 				break;
1977 			case EVLIST_CTL_CMD_STOP:
1978 				done = 1;
1979 				break;
1980 			case EVLIST_CTL_CMD_ACK:
1981 			case EVLIST_CTL_CMD_UNSUPPORTED:
1982 			case EVLIST_CTL_CMD_ENABLE:
1983 			case EVLIST_CTL_CMD_DISABLE:
1984 			case EVLIST_CTL_CMD_EVLIST:
1985 			case EVLIST_CTL_CMD_PING:
1986 			default:
1987 				break;
1988 			}
1989 		}
1990 
1991 		/*
1992 		 * When perf is starting the traced process, the events die
1993 		 * with the process at the end and we wait for that. Thus there
1994 		 * is no need to disable the events in this case.
1995 		 */
1996 		if (done && !disabled && !target__none(&opts->target)) {
1997 			trigger_off(&auxtrace_snapshot_trigger);
1998 			evlist__disable(rec->evlist);
1999 			disabled = true;
2000 		}
2001 	}
2002 
2003 	trigger_off(&auxtrace_snapshot_trigger);
2004 	trigger_off(&switch_output_trigger);
2005 
2006 	if (opts->auxtrace_snapshot_on_exit)
2007 		record__auxtrace_snapshot_exit(rec);
2008 
2009 	if (forks && workload_exec_errno) {
2010 		char msg[STRERR_BUFSIZE], strevsels[2048];
2011 		const char *emsg = str_error_r(workload_exec_errno, msg, sizeof(msg));
2012 
2013 		evlist__scnprintf_evsels(rec->evlist, sizeof(strevsels), strevsels);
2014 
2015 		pr_err("Failed to collect '%s' for the '%s' workload: %s\n",
2016 			strevsels, argv[0], emsg);
2017 		err = -1;
2018 		goto out_child;
2019 	}
2020 
2021 	if (!quiet)
2022 		fprintf(stderr, "[ perf record: Woken up %ld times to write data ]\n", waking);
2023 
2024 	if (target__none(&rec->opts.target))
2025 		record__synthesize_workload(rec, true);
2026 
2027 out_child:
2028 	evlist__finalize_ctlfd(rec->evlist);
2029 	record__mmap_read_all(rec, true);
2030 	record__aio_mmap_read_sync(rec);
2031 
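	/*
	 * If compression was used, store the achieved ratio
	 * (bytes_transferred / bytes_compressed), rounded to the nearest
	 * integer, in the session header.
	 */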
2032 	if (rec->session->bytes_transferred && rec->session->bytes_compressed) {
2033 		ratio = (float)rec->session->bytes_transferred/(float)rec->session->bytes_compressed;
2034 		session->header.env.comp_ratio = ratio + 0.5;
2035 	}
2036 
2037 	if (forks) {
2038 		int exit_status;
2039 
2040 		if (!child_finished)
2041 			kill(rec->evlist->workload.pid, SIGTERM);
2042 
2043 		wait(&exit_status);
2044 
2045 		if (err < 0)
2046 			status = err;
2047 		else if (WIFEXITED(exit_status))
2048 			status = WEXITSTATUS(exit_status);
2049 		else if (WIFSIGNALED(exit_status))
2050 			signr = WTERMSIG(exit_status);
2051 	} else
2052 		status = err;
2053 
2054 	record__synthesize(rec, true);
2055 	/* this will be recalculated during process_buildids() */
2056 	rec->samples = 0;
2057 
2058 	if (!err) {
2059 		if (!rec->timestamp_filename) {
2060 			record__finish_output(rec);
2061 		} else {
2062 			fd = record__switch_output(rec, true);
2063 			if (fd < 0) {
2064 				status = fd;
2065 				goto out_delete_session;
2066 			}
2067 		}
2068 	}
2069 
2070 	perf_hooks__invoke_record_end();
2071 
2072 	if (!err && !quiet) {
2073 		char samples[128];
2074 		const char *postfix = rec->timestamp_filename ?
2075 					".<timestamp>" : "";
2076 
2077 		if (rec->samples && !rec->opts.full_auxtrace)
2078 			scnprintf(samples, sizeof(samples),
2079 				  " (%" PRIu64 " samples)", rec->samples);
2080 		else
2081 			samples[0] = '\0';
2082 
2083 		fprintf(stderr,	"[ perf record: Captured and wrote %.3f MB %s%s%s",
2084 			perf_data__size(data) / 1024.0 / 1024.0,
2085 			data->path, postfix, samples);
2086 		if (ratio) {
2087 			fprintf(stderr,	", compressed (original %.3f MB, ratio is %.3f)",
2088 					rec->session->bytes_transferred / 1024.0 / 1024.0,
2089 					ratio);
2090 		}
2091 		fprintf(stderr, " ]\n");
2092 	}
2093 
2094 out_delete_session:
2095 #ifdef HAVE_EVENTFD_SUPPORT
2096 	if (done_fd >= 0)
2097 		close(done_fd);
2098 #endif
2099 	zstd_fini(&session->zstd_data);
2100 	perf_session__delete(session);
2101 
2102 	if (!opts->no_bpf_event)
2103 		evlist__stop_sb_thread(rec->sb_evlist);
2104 	return status;
2105 }
2106 
2107 static void callchain_debug(struct callchain_param *callchain)
2108 {
2109 	static const char *str[CALLCHAIN_MAX] = { "NONE", "FP", "DWARF", "LBR" };
2110 
2111 	pr_debug("callchain: type %s\n", str[callchain->record_mode]);
2112 
2113 	if (callchain->record_mode == CALLCHAIN_DWARF)
2114 		pr_debug("callchain: stack dump size %d\n",
2115 			 callchain->dump_size);
2116 }
2117 
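/*
 * Parse the --call-graph argument, in the "record_mode[,record_size]" form
 * used by the option below, e.g. (the DWARF stack-dump size is illustrative):
 *
 *   perf record --call-graph fp ...
 *   perf record --call-graph dwarf,8192 ...
 *   perf record --call-graph lbr ...
 *
 * DWARF unwinding additionally turns on data address sampling.
 */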
2118 int record_opts__parse_callchain(struct record_opts *record,
2119 				 struct callchain_param *callchain,
2120 				 const char *arg, bool unset)
2121 {
2122 	int ret;
2123 	callchain->enabled = !unset;
2124 
2125 	/* --no-call-graph */
2126 	if (unset) {
2127 		callchain->record_mode = CALLCHAIN_NONE;
2128 		pr_debug("callchain: disabled\n");
2129 		return 0;
2130 	}
2131 
2132 	ret = parse_callchain_record_opt(arg, callchain);
2133 	if (!ret) {
2134 		/* Enable data address sampling for DWARF unwind. */
2135 		if (callchain->record_mode == CALLCHAIN_DWARF)
2136 			record->sample_address = true;
2137 		callchain_debug(callchain);
2138 	}
2139 
2140 	return ret;
2141 }
2142 
2143 int record_parse_callchain_opt(const struct option *opt,
2144 			       const char *arg,
2145 			       int unset)
2146 {
2147 	return record_opts__parse_callchain(opt->value, &callchain_param, arg, unset);
2148 }
2149 
2150 int record_callchain_opt(const struct option *opt,
2151 			 const char *arg __maybe_unused,
2152 			 int unset __maybe_unused)
2153 {
2154 	struct callchain_param *callchain = opt->value;
2155 
2156 	callchain->enabled = true;
2157 
2158 	if (callchain->record_mode == CALLCHAIN_NONE)
2159 		callchain->record_mode = CALLCHAIN_FP;
2160 
2161 	callchain_debug(callchain);
2162 	return 0;
2163 }
2164 
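/*
 * Handle 'perf record' specific keys from perfconfig, for example:
 *
 *   [record]
 *       build-id = cache | no-cache | skip | mmap
 *       call-graph = <record-mode>   # forwarded as call-graph.record-mode
 *       aio = <n>                    # only with HAVE_AIO_SUPPORT
 */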
2165 static int perf_record_config(const char *var, const char *value, void *cb)
2166 {
2167 	struct record *rec = cb;
2168 
2169 	if (!strcmp(var, "record.build-id")) {
2170 		if (!strcmp(value, "cache"))
2171 			rec->no_buildid_cache = false;
2172 		else if (!strcmp(value, "no-cache"))
2173 			rec->no_buildid_cache = true;
2174 		else if (!strcmp(value, "skip"))
2175 			rec->no_buildid = true;
2176 		else if (!strcmp(value, "mmap"))
2177 			rec->buildid_mmap = true;
2178 		else
2179 			return -1;
2180 		return 0;
2181 	}
2182 	if (!strcmp(var, "record.call-graph")) {
2183 		var = "call-graph.record-mode";
2184 		return perf_default_config(var, value, cb);
2185 	}
2186 #ifdef HAVE_AIO_SUPPORT
2187 	if (!strcmp(var, "record.aio")) {
2188 		rec->opts.nr_cblocks = strtol(value, NULL, 0);
2189 		if (!rec->opts.nr_cblocks)
2190 			rec->opts.nr_cblocks = nr_cblocks_default;
2191 	}
2192 #endif
2193 
2194 	return 0;
2195 }
2196 
2197 
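/*
 * --affinity=node|cpu: bind the trace reading thread to the NUMA node or CPU
 * of the mmap buffer being processed; anything else keeps the default
 * PERF_AFFINITY_SYS.
 */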
2198 static int record__parse_affinity(const struct option *opt, const char *str, int unset)
2199 {
2200 	struct record_opts *opts = (struct record_opts *)opt->value;
2201 
2202 	if (unset || !str)
2203 		return 0;
2204 
2205 	if (!strcasecmp(str, "node"))
2206 		opts->affinity = PERF_AFFINITY_NODE;
2207 	else if (!strcasecmp(str, "cpu"))
2208 		opts->affinity = PERF_AFFINITY_CPU;
2209 
2210 	return 0;
2211 }
2212 
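/*
 * Parse --max-size, which limits the size of the output file, e.g.
 * (illustrative value):
 *
 *   perf record --max-size=500M ...
 *
 * Accepted suffixes are B, K, M and G; when the option is unset the limit is
 * cleared (0).
 */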
2213 static int parse_output_max_size(const struct option *opt,
2214 				 const char *str, int unset)
2215 {
2216 	unsigned long *s = (unsigned long *)opt->value;
2217 	static struct parse_tag tags_size[] = {
2218 		{ .tag  = 'B', .mult = 1       },
2219 		{ .tag  = 'K', .mult = 1 << 10 },
2220 		{ .tag  = 'M', .mult = 1 << 20 },
2221 		{ .tag  = 'G', .mult = 1 << 30 },
2222 		{ .tag  = 0 },
2223 	};
2224 	unsigned long val;
2225 
2226 	if (unset) {
2227 		*s = 0;
2228 		return 0;
2229 	}
2230 
2231 	val = parse_tag_value(str, tags_size);
2232 	if (val != (unsigned long) -1) {
2233 		*s = val;
2234 		return 0;
2235 	}
2236 
2237 	return -1;
2238 }
2239 
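/*
 * Parse -m/--mmap-pages as "pages[,pages]": the first value sizes the data
 * mmaps, the optional second value sizes the AUX area tracing mmaps, e.g.
 * (illustrative values):
 *
 *   perf record -m 512,128 ...
 */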
2240 static int record__parse_mmap_pages(const struct option *opt,
2241 				    const char *str,
2242 				    int unset __maybe_unused)
2243 {
2244 	struct record_opts *opts = opt->value;
2245 	char *s, *p;
2246 	unsigned int mmap_pages;
2247 	int ret;
2248 
2249 	if (!str)
2250 		return -EINVAL;
2251 
2252 	s = strdup(str);
2253 	if (!s)
2254 		return -ENOMEM;
2255 
2256 	p = strchr(s, ',');
2257 	if (p)
2258 		*p = '\0';
2259 
2260 	if (*s) {
2261 		ret = __evlist__parse_mmap_pages(&mmap_pages, s);
2262 		if (ret)
2263 			goto out_free;
2264 		opts->mmap_pages = mmap_pages;
2265 	}
2266 
2267 	if (!p) {
2268 		ret = 0;
2269 		goto out_free;
2270 	}
2271 
2272 	ret = __evlist__parse_mmap_pages(&mmap_pages, p + 1);
2273 	if (ret)
2274 		goto out_free;
2275 
2276 	opts->auxtrace_mmap_pages = mmap_pages;
2277 
2278 out_free:
2279 	free(s);
2280 	return ret;
2281 }
2282 
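/*
 * Parse --control, e.g.:
 *
 *   perf record --control fd:ctl-fd[,ack-fd] ...
 *   perf record --control fifo:ctl-fifo[,ack-fifo] ...
 *
 * as described in the option help below; the heavy lifting is done by
 * evlist__parse_control().
 */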
2283 static int parse_control_option(const struct option *opt,
2284 				const char *str,
2285 				int unset __maybe_unused)
2286 {
2287 	struct record_opts *opts = opt->value;
2288 
2289 	return evlist__parse_control(str, &opts->ctl_fd, &opts->ctl_fd_ack, &opts->ctl_fd_close);
2290 }
2291 
2292 static void switch_output_size_warn(struct record *rec)
2293 {
2294 	u64 wakeup_size = evlist__mmap_size(rec->opts.mmap_pages);
2295 	struct switch_output *s = &rec->switch_output;
2296 
2297 	wakeup_size /= 2;
2298 
2299 	if (s->size < wakeup_size) {
2300 		char buf[100];
2301 
2302 		unit_number__scnprintf(buf, sizeof(buf), wakeup_size);
2303 		pr_warning("WARNING: switch-output data size is lower than the "
2304 			   "wakeup kernel buffer size (%s), "
2305 			   "expect bigger perf.data sizes\n", buf);
2306 	}
2307 }
2308 
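/*
 * Set up --switch-output from its "signal or size[BKMG] or time[smhd]"
 * argument, e.g. (illustrative values):
 *
 *   perf record --switch-output=signal ...
 *   perf record --switch-output=2G ...
 *   perf record --switch-output=30s ...
 */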
2309 static int switch_output_setup(struct record *rec)
2310 {
2311 	struct switch_output *s = &rec->switch_output;
2312 	static struct parse_tag tags_size[] = {
2313 		{ .tag  = 'B', .mult = 1       },
2314 		{ .tag  = 'K', .mult = 1 << 10 },
2315 		{ .tag  = 'M', .mult = 1 << 20 },
2316 		{ .tag  = 'G', .mult = 1 << 30 },
2317 		{ .tag  = 0 },
2318 	};
2319 	static struct parse_tag tags_time[] = {
2320 		{ .tag  = 's', .mult = 1        },
2321 		{ .tag  = 'm', .mult = 60       },
2322 		{ .tag  = 'h', .mult = 60*60    },
2323 		{ .tag  = 'd', .mult = 60*60*24 },
2324 		{ .tag  = 0 },
2325 	};
2326 	unsigned long val;
2327 
2328 	/*
2329 	 * If we're using --switch-output-events, then we imply
2330 	 * --switch-output=signal, as we'll send a SIGUSR2 from the side band
2331 	 * thread to its parent.
2332 	 */
2333 	if (rec->switch_output_event_set)
2334 		goto do_signal;
2335 
2336 	if (!s->set)
2337 		return 0;
2338 
2339 	if (!strcmp(s->str, "signal")) {
2340 do_signal:
2341 		s->signal = true;
2342 		pr_debug("switch-output with SIGUSR2 signal\n");
2343 		goto enabled;
2344 	}
2345 
2346 	val = parse_tag_value(s->str, tags_size);
2347 	if (val != (unsigned long) -1) {
2348 		s->size = val;
2349 		pr_debug("switch-output with %s size threshold\n", s->str);
2350 		goto enabled;
2351 	}
2352 
2353 	val = parse_tag_value(s->str, tags_time);
2354 	if (val != (unsigned long) -1) {
2355 		s->time = val;
2356 		pr_debug("switch-output with %s time threshold (%lu seconds)\n",
2357 			 s->str, s->time);
2358 		goto enabled;
2359 	}
2360 
2361 	return -1;
2362 
2363 enabled:
2364 	rec->timestamp_filename = true;
2365 	s->enabled              = true;
2366 
2367 	if (s->size && !rec->opts.no_buffering)
2368 		switch_output_size_warn(rec);
2369 
2370 	return 0;
2371 }
2372 
2373 static const char * const __record_usage[] = {
2374 	"perf record [<options>] [<command>]",
2375 	"perf record [<options>] -- <command> [<options>]",
2376 	NULL
2377 };
2378 const char * const *record_usage = __record_usage;
2379 
2380 static int build_id__process_mmap(struct perf_tool *tool, union perf_event *event,
2381 				  struct perf_sample *sample, struct machine *machine)
2382 {
2383 	/*
2384 	 * We already have the kernel maps, put in place via perf_session__create_kernel_maps(),
2385 	 * so there is no need to add them twice.
2386 	 */
2387 	if (!(event->header.misc & PERF_RECORD_MISC_USER))
2388 		return 0;
2389 	return perf_event__process_mmap(tool, event, sample, machine);
2390 }
2391 
2392 static int build_id__process_mmap2(struct perf_tool *tool, union perf_event *event,
2393 				   struct perf_sample *sample, struct machine *machine)
2394 {
2395 	/*
2396 	 * We already have the kernel maps, put in place via perf_session__create_kernel_maps(),
2397 	 * so there is no need to add them twice.
2398 	 */
2399 	if (!(event->header.misc & PERF_RECORD_MISC_USER))
2400 		return 0;
2401 
2402 	return perf_event__process_mmap2(tool, event, sample, machine);
2403 }
2404 
2405 /*
2406  * XXX Ideally this would be local to cmd_record() and passed to a record__new(),
2407  * because we need access to it in record__exit(), which is called after
2408  * cmd_record() exits, but since record_options needs to be accessible to
2409  * builtin-script, leave it here.
2410  *
2411  * At least we don't touch it in all the other functions here directly.
2412  *
2413  * Just say no to tons of global variables, sigh.
2414  */
2415 static struct record record = {
2416 	.opts = {
2417 		.sample_time	     = true,
2418 		.mmap_pages	     = UINT_MAX,
2419 		.user_freq	     = UINT_MAX,
2420 		.user_interval	     = ULLONG_MAX,
2421 		.freq		     = 4000,
2422 		.target		     = {
2423 			.uses_mmap   = true,
2424 			.default_per_cpu = true,
2425 		},
2426 		.mmap_flush          = MMAP_FLUSH_DEFAULT,
2427 		.nr_threads_synthesize = 1,
2428 		.ctl_fd              = -1,
2429 		.ctl_fd_ack          = -1,
2430 	},
2431 	.tool = {
2432 		.sample		= process_sample_event,
2433 		.fork		= perf_event__process_fork,
2434 		.exit		= perf_event__process_exit,
2435 		.comm		= perf_event__process_comm,
2436 		.namespaces	= perf_event__process_namespaces,
2437 		.mmap		= build_id__process_mmap,
2438 		.mmap2		= build_id__process_mmap2,
2439 		.ordered_events	= true,
2440 	},
2441 };
2442 
2443 const char record_callchain_help[] = CALLCHAIN_RECORD_HELP
2444 	"\n\t\t\t\tDefault: fp";
2445 
2446 static bool dry_run;
2447 
2448 /*
2449  * XXX Will stay a global variable until we fix builtin-script.c to stop messing
2450  * with it and switch to using the library functions in perf_evlist that came
2451  * from builtin-record.c, i.e. use record_opts,
2452  * evlist__prepare_workload, etc. instead of fork+exec'ing 'perf record'
2453  * using pipes, etc.
2454  */
2455 static struct option __record_options[] = {
2456 	OPT_CALLBACK('e', "event", &record.evlist, "event",
2457 		     "event selector. use 'perf list' to list available events",
2458 		     parse_events_option),
2459 	OPT_CALLBACK(0, "filter", &record.evlist, "filter",
2460 		     "event filter", parse_filter),
2461 	OPT_CALLBACK_NOOPT(0, "exclude-perf", &record.evlist,
2462 			   NULL, "don't record events from perf itself",
2463 			   exclude_perf),
2464 	OPT_STRING('p', "pid", &record.opts.target.pid, "pid",
2465 		    "record events on existing process id"),
2466 	OPT_STRING('t', "tid", &record.opts.target.tid, "tid",
2467 		    "record events on existing thread id"),
2468 	OPT_INTEGER('r', "realtime", &record.realtime_prio,
2469 		    "collect data with this RT SCHED_FIFO priority"),
2470 	OPT_BOOLEAN(0, "no-buffering", &record.opts.no_buffering,
2471 		    "collect data without buffering"),
2472 	OPT_BOOLEAN('R', "raw-samples", &record.opts.raw_samples,
2473 		    "collect raw sample records from all opened counters"),
2474 	OPT_BOOLEAN('a', "all-cpus", &record.opts.target.system_wide,
2475 			    "system-wide collection from all CPUs"),
2476 	OPT_STRING('C', "cpu", &record.opts.target.cpu_list, "cpu",
2477 		    "list of cpus to monitor"),
2478 	OPT_U64('c', "count", &record.opts.user_interval, "event period to sample"),
2479 	OPT_STRING('o', "output", &record.data.path, "file",
2480 		    "output file name"),
2481 	OPT_BOOLEAN_SET('i', "no-inherit", &record.opts.no_inherit,
2482 			&record.opts.no_inherit_set,
2483 			"child tasks do not inherit counters"),
2484 	OPT_BOOLEAN(0, "tail-synthesize", &record.opts.tail_synthesize,
2485 		    "synthesize non-sample events at the end of output"),
2486 	OPT_BOOLEAN(0, "overwrite", &record.opts.overwrite, "use overwrite mode"),
2487 	OPT_BOOLEAN(0, "no-bpf-event", &record.opts.no_bpf_event, "do not record bpf events"),
2488 	OPT_BOOLEAN(0, "strict-freq", &record.opts.strict_freq,
2489 		    "Fail if the specified frequency can't be used"),
2490 	OPT_CALLBACK('F', "freq", &record.opts, "freq or 'max'",
2491 		     "profile at this frequency",
2492 		      record__parse_freq),
2493 	OPT_CALLBACK('m', "mmap-pages", &record.opts, "pages[,pages]",
2494 		     "number of mmap data pages and AUX area tracing mmap pages",
2495 		     record__parse_mmap_pages),
2496 	OPT_CALLBACK(0, "mmap-flush", &record.opts, "number",
2497 		     "Minimum number of bytes extracted from mmap data pages (default: 1)",
2498 		     record__mmap_flush_parse),
2499 	OPT_BOOLEAN(0, "group", &record.opts.group,
2500 		    "put the counters into a counter group"),
2501 	OPT_CALLBACK_NOOPT('g', NULL, &callchain_param,
2502 			   NULL, "enables call-graph recording" ,
2503 			   &record_callchain_opt),
2504 	OPT_CALLBACK(0, "call-graph", &record.opts,
2505 		     "record_mode[,record_size]", record_callchain_help,
2506 		     &record_parse_callchain_opt),
2507 	OPT_INCR('v', "verbose", &verbose,
2508 		    "be more verbose (show counter open errors, etc)"),
2509 	OPT_BOOLEAN('q', "quiet", &quiet, "don't print any message"),
2510 	OPT_BOOLEAN('s', "stat", &record.opts.inherit_stat,
2511 		    "per thread counts"),
2512 	OPT_BOOLEAN('d', "data", &record.opts.sample_address, "Record the sample addresses"),
2513 	OPT_BOOLEAN(0, "phys-data", &record.opts.sample_phys_addr,
2514 		    "Record the sample physical addresses"),
2515 	OPT_BOOLEAN(0, "data-page-size", &record.opts.sample_data_page_size,
2516 		    "Record the sampled data address data page size"),
2517 	OPT_BOOLEAN(0, "code-page-size", &record.opts.sample_code_page_size,
2518 		    "Record the sampled code address (ip) page size"),
2519 	OPT_BOOLEAN(0, "sample-cpu", &record.opts.sample_cpu, "Record the sample cpu"),
2520 	OPT_BOOLEAN_SET('T', "timestamp", &record.opts.sample_time,
2521 			&record.opts.sample_time_set,
2522 			"Record the sample timestamps"),
2523 	OPT_BOOLEAN_SET('P', "period", &record.opts.period, &record.opts.period_set,
2524 			"Record the sample period"),
2525 	OPT_BOOLEAN('n', "no-samples", &record.opts.no_samples,
2526 		    "don't sample"),
2527 	OPT_BOOLEAN_SET('N', "no-buildid-cache", &record.no_buildid_cache,
2528 			&record.no_buildid_cache_set,
2529 			"do not update the buildid cache"),
2530 	OPT_BOOLEAN_SET('B', "no-buildid", &record.no_buildid,
2531 			&record.no_buildid_set,
2532 			"do not collect buildids in perf.data"),
2533 	OPT_CALLBACK('G', "cgroup", &record.evlist, "name",
2534 		     "monitor event in cgroup name only",
2535 		     parse_cgroups),
2536 	OPT_INTEGER('D', "delay", &record.opts.initial_delay,
2537 		  "ms to wait before starting measurement after program start (-1: start with events disabled)"),
2538 	OPT_BOOLEAN(0, "kcore", &record.opts.kcore, "copy /proc/kcore"),
2539 	OPT_STRING('u', "uid", &record.opts.target.uid_str, "user",
2540 		   "user to profile"),
2541 
2542 	OPT_CALLBACK_NOOPT('b', "branch-any", &record.opts.branch_stack,
2543 		     "branch any", "sample any taken branches",
2544 		     parse_branch_stack),
2545 
2546 	OPT_CALLBACK('j', "branch-filter", &record.opts.branch_stack,
2547 		     "branch filter mask", "branch stack filter modes",
2548 		     parse_branch_stack),
2549 	OPT_BOOLEAN('W', "weight", &record.opts.sample_weight,
2550 		    "sample by weight (on special events only)"),
2551 	OPT_BOOLEAN(0, "transaction", &record.opts.sample_transaction,
2552 		    "sample transaction flags (special events only)"),
2553 	OPT_BOOLEAN(0, "per-thread", &record.opts.target.per_thread,
2554 		    "use per-thread mmaps"),
2555 	OPT_CALLBACK_OPTARG('I', "intr-regs", &record.opts.sample_intr_regs, NULL, "any register",
2556 		    "sample selected machine registers on interrupt,"
2557 		    " use '-I?' to list register names", parse_intr_regs),
2558 	OPT_CALLBACK_OPTARG(0, "user-regs", &record.opts.sample_user_regs, NULL, "any register",
2559 		    "sample selected machine registers in user space,"
2560 		    " use '--user-regs=?' to list register names", parse_user_regs),
2561 	OPT_BOOLEAN(0, "running-time", &record.opts.running_time,
2562 		    "Record running/enabled time of read (:S) events"),
2563 	OPT_CALLBACK('k', "clockid", &record.opts,
2564 	"clockid", "clockid to use for events, see clock_gettime()",
2565 	parse_clockid),
2566 	OPT_STRING_OPTARG('S', "snapshot", &record.opts.auxtrace_snapshot_opts,
2567 			  "opts", "AUX area tracing Snapshot Mode", ""),
2568 	OPT_STRING_OPTARG(0, "aux-sample", &record.opts.auxtrace_sample_opts,
2569 			  "opts", "sample AUX area", ""),
2570 	OPT_UINTEGER(0, "proc-map-timeout", &proc_map_timeout,
2571 			"per thread proc mmap processing timeout in ms"),
2572 	OPT_BOOLEAN(0, "namespaces", &record.opts.record_namespaces,
2573 		    "Record namespaces events"),
2574 	OPT_BOOLEAN(0, "all-cgroups", &record.opts.record_cgroup,
2575 		    "Record cgroup events"),
2576 	OPT_BOOLEAN_SET(0, "switch-events", &record.opts.record_switch_events,
2577 			&record.opts.record_switch_events_set,
2578 			"Record context switch events"),
2579 	OPT_BOOLEAN_FLAG(0, "all-kernel", &record.opts.all_kernel,
2580 			 "Configure all used events to run in kernel space.",
2581 			 PARSE_OPT_EXCLUSIVE),
2582 	OPT_BOOLEAN_FLAG(0, "all-user", &record.opts.all_user,
2583 			 "Configure all used events to run in user space.",
2584 			 PARSE_OPT_EXCLUSIVE),
2585 	OPT_BOOLEAN(0, "kernel-callchains", &record.opts.kernel_callchains,
2586 		    "collect kernel callchains"),
2587 	OPT_BOOLEAN(0, "user-callchains", &record.opts.user_callchains,
2588 		    "collect user callchains"),
2589 	OPT_STRING(0, "clang-path", &llvm_param.clang_path, "clang path",
2590 		   "clang binary to use for compiling BPF scriptlets"),
2591 	OPT_STRING(0, "clang-opt", &llvm_param.clang_opt, "clang options",
2592 		   "options passed to clang when compiling BPF scriptlets"),
2593 	OPT_STRING(0, "vmlinux", &symbol_conf.vmlinux_name,
2594 		   "file", "vmlinux pathname"),
2595 	OPT_BOOLEAN(0, "buildid-all", &record.buildid_all,
2596 		    "Record build-id of all DSOs regardless of hits"),
2597 	OPT_BOOLEAN(0, "buildid-mmap", &record.buildid_mmap,
2598 		    "Record build-id in map events"),
2599 	OPT_BOOLEAN(0, "timestamp-filename", &record.timestamp_filename,
2600 		    "append timestamp to output filename"),
2601 	OPT_BOOLEAN(0, "timestamp-boundary", &record.timestamp_boundary,
2602 		    "Record timestamp boundary (time of first/last samples)"),
2603 	OPT_STRING_OPTARG_SET(0, "switch-output", &record.switch_output.str,
2604 			  &record.switch_output.set, "signal or size[BKMG] or time[smhd]",
2605 			  "Switch output when receiving SIGUSR2 (signal) or cross a size or time threshold",
2606 			  "signal"),
2607 	OPT_CALLBACK_SET(0, "switch-output-event", &record.sb_evlist, &record.switch_output_event_set, "switch output event",
2608 			 "switch output event selector. use 'perf list' to list available events",
2609 			 parse_events_option_new_evlist),
2610 	OPT_INTEGER(0, "switch-max-files", &record.switch_output.num_files,
2611 		   "Limit number of switch output generated files"),
2612 	OPT_BOOLEAN(0, "dry-run", &dry_run,
2613 		    "Parse options then exit"),
2614 #ifdef HAVE_AIO_SUPPORT
2615 	OPT_CALLBACK_OPTARG(0, "aio", &record.opts,
2616 		     &nr_cblocks_default, "n", "Use <n> control blocks in asynchronous trace writing mode (default: 1, max: 4)",
2617 		     record__aio_parse),
2618 #endif
2619 	OPT_CALLBACK(0, "affinity", &record.opts, "node|cpu",
2620 		     "Set affinity mask of trace reading thread to NUMA node cpu mask or cpu of processed mmap buffer",
2621 		     record__parse_affinity),
2622 #ifdef HAVE_ZSTD_SUPPORT
2623 	OPT_CALLBACK_OPTARG('z', "compression-level", &record.opts, &comp_level_default,
2624 			    "n", "Compress records using the specified level (default: 1 - fastest compression, 22 - greatest compression)",
2625 			    record__parse_comp_level),
2626 #endif
2627 	OPT_CALLBACK(0, "max-size", &record.output_max_size,
2628 		     "size", "Limit the maximum size of the output file", parse_output_max_size),
2629 	OPT_UINTEGER(0, "num-thread-synthesize",
2630 		     &record.opts.nr_threads_synthesize,
2631 		     "number of threads to run for event synthesis"),
2632 #ifdef HAVE_LIBPFM
2633 	OPT_CALLBACK(0, "pfm-events", &record.evlist, "event",
2634 		"libpfm4 event selector. use 'perf list' to list available events",
2635 		parse_libpfm_events_option),
2636 #endif
2637 	OPT_CALLBACK(0, "control", &record.opts, "fd:ctl-fd[,ack-fd] or fifo:ctl-fifo[,ack-fifo]",
2638 		     "Listen on ctl-fd descriptor for command to control measurement ('enable': enable events, 'disable': disable events,\n"
2639 		     "\t\t\t  'snapshot': AUX area tracing snapshot).\n"
2640 		     "\t\t\t  Optionally send control command completion ('ack\\n') to ack-fd descriptor.\n"
2641 		     "\t\t\t  Alternatively, ctl-fifo / ack-fifo will be opened and used as ctl-fd / ack-fd.",
2642 		      parse_control_option),
2643 	OPT_END()
2644 };
2645 
2646 struct option *record_options = __record_options;
2647 
2648 int cmd_record(int argc, const char **argv)
2649 {
2650 	int err;
2651 	struct record *rec = &record;
2652 	char errbuf[BUFSIZ];
2653 
2654 	setlocale(LC_ALL, "");
2655 
2656 #ifndef HAVE_LIBBPF_SUPPORT
2657 # define set_nobuild(s, l, c) set_option_nobuild(record_options, s, l, "NO_LIBBPF=1", c)
2658 	set_nobuild('\0', "clang-path", true);
2659 	set_nobuild('\0', "clang-opt", true);
2660 # undef set_nobuild
2661 #endif
2662 
2663 #ifndef HAVE_BPF_PROLOGUE
2664 # if !defined (HAVE_DWARF_SUPPORT)
2665 #  define REASON  "NO_DWARF=1"
2666 # elif !defined (HAVE_LIBBPF_SUPPORT)
2667 #  define REASON  "NO_LIBBPF=1"
2668 # else
2669 #  define REASON  "this architecture doesn't support BPF prologue"
2670 # endif
2671 # define set_nobuild(s, l, c) set_option_nobuild(record_options, s, l, REASON, c)
2672 	set_nobuild('\0', "vmlinux", true);
2673 # undef set_nobuild
2674 # undef REASON
2675 #endif
2676 
2677 	rec->opts.affinity = PERF_AFFINITY_SYS;
2678 
2679 	rec->evlist = evlist__new();
2680 	if (rec->evlist == NULL)
2681 		return -ENOMEM;
2682 
2683 	err = perf_config(perf_record_config, rec);
2684 	if (err)
2685 		return err;
2686 
2687 	argc = parse_options(argc, argv, record_options, record_usage,
2688 			    PARSE_OPT_STOP_AT_NON_OPTION);
2689 	if (quiet)
2690 		perf_quiet_option();
2691 
2692 	/* Make system wide (-a) the default target. */
2693 	if (!argc && target__none(&rec->opts.target))
2694 		rec->opts.target.system_wide = true;
2695 
2696 	if (nr_cgroups && !rec->opts.target.system_wide) {
2697 		usage_with_options_msg(record_usage, record_options,
2698 			"cgroup monitoring only available in system-wide mode");
2699 
2700 	}
2701 
2702 	if (rec->buildid_mmap) {
2703 		if (!perf_can_record_build_id()) {
2704 			pr_err("Failed: no support to record build id in mmap events, update your kernel.\n");
2705 			err = -EINVAL;
2706 			goto out_opts;
2707 		}
2708 		pr_debug("Enabling build id in mmap2 events.\n");
2709 		/* Enable mmap build id synthesizing. */
2710 		symbol_conf.buildid_mmap2 = true;
2711 		/* Enable perf_event_attr::build_id bit. */
2712 		rec->opts.build_id = true;
2713 		/* Disable build id cache. */
2714 		rec->no_buildid = true;
2715 	}
2716 
2717 	if (rec->opts.kcore)
2718 		rec->data.is_dir = true;
2719 
2720 	if (rec->opts.comp_level != 0) {
2721 		pr_debug("Compression enabled, disabling build id collection at the end of the session.\n");
2722 		rec->no_buildid = true;
2723 	}
2724 
2725 	if (rec->opts.record_switch_events &&
2726 	    !perf_can_record_switch_events()) {
2727 		ui__error("kernel does not support recording context switch events\n");
2728 		parse_options_usage(record_usage, record_options, "switch-events", 0);
2729 		err = -EINVAL;
2730 		goto out_opts;
2731 	}
2732 
2733 	if (switch_output_setup(rec)) {
2734 		parse_options_usage(record_usage, record_options, "switch-output", 0);
2735 		err = -EINVAL;
2736 		goto out_opts;
2737 	}
2738 
2739 	if (rec->switch_output.time) {
2740 		signal(SIGALRM, alarm_sig_handler);
2741 		alarm(rec->switch_output.time);
2742 	}
2743 
2744 	if (rec->switch_output.num_files) {
2745 		rec->switch_output.filenames = calloc(rec->switch_output.num_files,
2746 						      sizeof(char *));
2747 		if (!rec->switch_output.filenames) {
2748 			err = -EINVAL;
2749 			goto out_opts;
2750 		}
2751 	}
2752 
2753 	/*
2754 	 * Allow aliases to facilitate the lookup of symbols for address
2755 	 * filters. Refer to auxtrace_parse_filters().
2756 	 */
2757 	symbol_conf.allow_aliases = true;
2758 
2759 	symbol__init(NULL);
2760 
2761 	if (rec->opts.affinity != PERF_AFFINITY_SYS) {
2762 		rec->affinity_mask.nbits = cpu__max_cpu();
2763 		rec->affinity_mask.bits = bitmap_alloc(rec->affinity_mask.nbits);
2764 		if (!rec->affinity_mask.bits) {
2765 			pr_err("Failed to allocate thread mask for %zd cpus\n", rec->affinity_mask.nbits);
2766 			err = -ENOMEM;
2767 			goto out_opts;
2768 		}
2769 		pr_debug2("thread mask[%zd]: empty\n", rec->affinity_mask.nbits);
2770 	}
2771 
2772 	err = record__auxtrace_init(rec);
2773 	if (err)
2774 		goto out;
2775 
2776 	if (dry_run)
2777 		goto out;
2778 
2779 	err = bpf__setup_stdout(rec->evlist);
2780 	if (err) {
2781 		bpf__strerror_setup_stdout(rec->evlist, err, errbuf, sizeof(errbuf));
2782 		pr_err("ERROR: Setup BPF stdout failed: %s\n",
2783 			 errbuf);
2784 		goto out;
2785 	}
2786 
2787 	err = -ENOMEM;
2788 
2789 	if (rec->no_buildid_cache || rec->no_buildid) {
2790 		disable_buildid_cache();
2791 	} else if (rec->switch_output.enabled) {
2792 		/*
2793 		 * In 'perf record --switch-output', disable buildid
2794 		 * generation by default to reduce data file switching
2795 		 * overhead. Still generate buildids if they are explicitly
2796 		 * required using
2797 		 *
2798 		 *  perf record --switch-output --no-no-buildid \
2799 		 *              --no-no-buildid-cache
2800 		 *
2801 		 * The following code is equivalent to:
2802 		 *
2803 		 * if ((rec->no_buildid || !rec->no_buildid_set) &&
2804 		 *     (rec->no_buildid_cache || !rec->no_buildid_cache_set))
2805 		 *         disable_buildid_cache();
2806 		 */
2807 		bool disable = true;
2808 
2809 		if (rec->no_buildid_set && !rec->no_buildid)
2810 			disable = false;
2811 		if (rec->no_buildid_cache_set && !rec->no_buildid_cache)
2812 			disable = false;
2813 		if (disable) {
2814 			rec->no_buildid = true;
2815 			rec->no_buildid_cache = true;
2816 			disable_buildid_cache();
2817 		}
2818 	}
2819 
2820 	if (record.opts.overwrite)
2821 		record.opts.tail_synthesize = true;
2822 
2823 	if (rec->evlist->core.nr_entries == 0) {
2824 		if (perf_pmu__has_hybrid()) {
2825 			err = evlist__add_default_hybrid(rec->evlist,
2826 							 !record.opts.no_samples);
2827 		} else {
2828 			err = __evlist__add_default(rec->evlist,
2829 						    !record.opts.no_samples);
2830 		}
2831 
2832 		if (err < 0) {
2833 			pr_err("Not enough memory for event selector list\n");
2834 			goto out;
2835 		}
2836 	}
2837 
2838 	if (rec->opts.target.tid && !rec->opts.no_inherit_set)
2839 		rec->opts.no_inherit = true;
2840 
2841 	err = target__validate(&rec->opts.target);
2842 	if (err) {
2843 		target__strerror(&rec->opts.target, err, errbuf, BUFSIZ);
2844 		ui__warning("%s\n", errbuf);
2845 	}
2846 
2847 	err = target__parse_uid(&rec->opts.target);
2848 	if (err) {
2849 		int saved_errno = errno;
2850 
2851 		target__strerror(&rec->opts.target, err, errbuf, BUFSIZ);
2852 		ui__error("%s", errbuf);
2853 
2854 		err = -saved_errno;
2855 		goto out;
2856 	}
2857 
2858 	/* Enable ignoring missing threads when -u/-p option is defined. */
2859 	rec->opts.ignore_missing_thread = rec->opts.target.uid != UINT_MAX || rec->opts.target.pid;
2860 
2861 	err = -ENOMEM;
2862 	if (evlist__create_maps(rec->evlist, &rec->opts.target) < 0)
2863 		usage_with_options(record_usage, record_options);
2864 
2865 	err = auxtrace_record__options(rec->itr, rec->evlist, &rec->opts);
2866 	if (err)
2867 		goto out;
2868 
2869 	/*
2870 	 * We take all buildids when the file contains
2871 	 * AUX area tracing data because we do not decode the
2872 	 * trace, as that would take too long.
2873 	 */
2874 	if (rec->opts.full_auxtrace)
2875 		rec->buildid_all = true;
2876 
2877 	if (rec->opts.text_poke) {
2878 		err = record__config_text_poke(rec->evlist);
2879 		if (err) {
2880 			pr_err("record__config_text_poke failed, error %d\n", err);
2881 			goto out;
2882 		}
2883 	}
2884 
2885 	if (record_opts__config(&rec->opts)) {
2886 		err = -EINVAL;
2887 		goto out;
2888 	}
2889 
2890 	if (rec->opts.nr_cblocks > nr_cblocks_max)
2891 		rec->opts.nr_cblocks = nr_cblocks_max;
2892 	pr_debug("nr_cblocks: %d\n", rec->opts.nr_cblocks);
2893 
2894 	pr_debug("affinity: %s\n", affinity_tags[rec->opts.affinity]);
2895 	pr_debug("mmap flush: %d\n", rec->opts.mmap_flush);
2896 
2897 	if (rec->opts.comp_level > comp_level_max)
2898 		rec->opts.comp_level = comp_level_max;
2899 	pr_debug("comp level: %d\n", rec->opts.comp_level);
2900 
2901 	err = __cmd_record(&record, argc, argv);
2902 out:
2903 	bitmap_free(rec->affinity_mask.bits);
2904 	evlist__delete(rec->evlist);
2905 	symbol__exit();
2906 	auxtrace_record__free(rec->itr);
2907 out_opts:
2908 	evlist__close_control(rec->opts.ctl_fd, rec->opts.ctl_fd_ack, &rec->opts.ctl_fd_close);
2909 	return err;
2910 }
2911 
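/*
 * SIGUSR2 handler: request an AUX area snapshot and, when
 * --switch-output=signal is in effect, a switch to a new output file.
 */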
2912 static void snapshot_sig_handler(int sig __maybe_unused)
2913 {
2914 	struct record *rec = &record;
2915 
2916 	hit_auxtrace_snapshot_trigger(rec);
2917 
2918 	if (switch_output_signal(rec))
2919 		trigger_hit(&switch_output_trigger);
2920 }
2921 
2922 static void alarm_sig_handler(int sig __maybe_unused)
2923 {
2924 	struct record *rec = &record;
2925 
2926 	if (switch_output_time(rec))
2927 		trigger_hit(&switch_output_trigger);
2928 }
2929