xref: /openbmc/linux/tools/perf/builtin-record.c (revision d87c25e8)
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * builtin-record.c
4  *
5  * Builtin record command: Record the profile of a workload
6  * (or a CPU, or a PID) into the perf.data output file - for
7  * later analysis via perf report.
8  */
9 #include "builtin.h"
10 
11 #include "util/build-id.h"
12 #include <subcmd/parse-options.h>
13 #include "util/parse-events.h"
14 #include "util/config.h"
15 
16 #include "util/callchain.h"
17 #include "util/cgroup.h"
18 #include "util/header.h"
19 #include "util/event.h"
20 #include "util/evlist.h"
21 #include "util/evsel.h"
22 #include "util/debug.h"
23 #include "util/mmap.h"
24 #include "util/target.h"
25 #include "util/session.h"
26 #include "util/tool.h"
27 #include "util/symbol.h"
28 #include "util/record.h"
29 #include "util/cpumap.h"
30 #include "util/thread_map.h"
31 #include "util/data.h"
32 #include "util/perf_regs.h"
33 #include "util/auxtrace.h"
34 #include "util/tsc.h"
35 #include "util/parse-branch-options.h"
36 #include "util/parse-regs-options.h"
37 #include "util/perf_api_probe.h"
38 #include "util/llvm-utils.h"
39 #include "util/bpf-loader.h"
40 #include "util/trigger.h"
41 #include "util/perf-hooks.h"
42 #include "util/cpu-set-sched.h"
43 #include "util/synthetic-events.h"
44 #include "util/time-utils.h"
45 #include "util/units.h"
46 #include "util/bpf-event.h"
47 #include "util/util.h"
48 #include "util/pfm.h"
49 #include "util/clockid.h"
50 #include "util/pmu-hybrid.h"
51 #include "util/evlist-hybrid.h"
52 #include "asm/bug.h"
53 #include "perf.h"
54 
55 #include <errno.h>
56 #include <inttypes.h>
57 #include <locale.h>
58 #include <poll.h>
59 #include <pthread.h>
60 #include <unistd.h>
61 #include <sched.h>
62 #include <signal.h>
63 #ifdef HAVE_EVENTFD_SUPPORT
64 #include <sys/eventfd.h>
65 #endif
66 #include <sys/mman.h>
67 #include <sys/wait.h>
68 #include <sys/types.h>
69 #include <sys/stat.h>
70 #include <fcntl.h>
71 #include <linux/err.h>
72 #include <linux/string.h>
73 #include <linux/time64.h>
74 #include <linux/zalloc.h>
75 #include <linux/bitmap.h>
76 #include <sys/time.h>
77 
78 struct switch_output {
79 	bool		 enabled;
80 	bool		 signal;
81 	unsigned long	 size;
82 	unsigned long	 time;
83 	const char	*str;
84 	bool		 set;
85 	char		 **filenames;
86 	int		 num_files;
87 	int		 cur_file;
88 };
89 
90 struct thread_mask {
91 	struct mmap_cpu_mask	maps;
92 	struct mmap_cpu_mask	affinity;
93 };
94 
95 struct record {
96 	struct perf_tool	tool;
97 	struct record_opts	opts;
98 	u64			bytes_written;
99 	struct perf_data	data;
100 	struct auxtrace_record	*itr;
101 	struct evlist	*evlist;
102 	struct perf_session	*session;
103 	struct evlist		*sb_evlist;
104 	pthread_t		thread_id;
105 	int			realtime_prio;
106 	bool			switch_output_event_set;
107 	bool			no_buildid;
108 	bool			no_buildid_set;
109 	bool			no_buildid_cache;
110 	bool			no_buildid_cache_set;
111 	bool			buildid_all;
112 	bool			buildid_mmap;
113 	bool			timestamp_filename;
114 	bool			timestamp_boundary;
115 	struct switch_output	switch_output;
116 	unsigned long long	samples;
117 	struct mmap_cpu_mask	affinity_mask;
118 	unsigned long		output_max_size;	/* = 0: unlimited */
119 	struct perf_debuginfod	debuginfod;
120 	int			nr_threads;
121 	struct thread_mask	*thread_masks;
122 };
123 
124 static volatile int done;
125 
126 static volatile int auxtrace_record__snapshot_started;
127 static DEFINE_TRIGGER(auxtrace_snapshot_trigger);
128 static DEFINE_TRIGGER(switch_output_trigger);
129 
130 static const char *affinity_tags[PERF_AFFINITY_MAX] = {
131 	"SYS", "NODE", "CPU"
132 };
133 
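/*
 * Helpers deciding when the output file should be rotated: on an explicit
 * signal, when the amount of data written reaches the configured size
 * threshold, or on the periodic timer, respectively.  All of them require
 * the switch_output_trigger to be in the READY state.
 */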
134 static bool switch_output_signal(struct record *rec)
135 {
136 	return rec->switch_output.signal &&
137 	       trigger_is_ready(&switch_output_trigger);
138 }
139 
140 static bool switch_output_size(struct record *rec)
141 {
142 	return rec->switch_output.size &&
143 	       trigger_is_ready(&switch_output_trigger) &&
144 	       (rec->bytes_written >= rec->switch_output.size);
145 }
146 
147 static bool switch_output_time(struct record *rec)
148 {
149 	return rec->switch_output.time &&
150 	       trigger_is_ready(&switch_output_trigger);
151 }
152 
153 static bool record__output_max_size_exceeded(struct record *rec)
154 {
155 	return rec->output_max_size &&
156 	       (rec->bytes_written >= rec->output_max_size);
157 }
158 
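/*
 * Write a block of bytes to the perf.data file and account for the amount
 * written, so that record__output_max_size_exceeded() and
 * switch_output_size() can fire.  Reaching the output size limit marks the
 * session as done.
 */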
159 static int record__write(struct record *rec, struct mmap *map __maybe_unused,
160 			 void *bf, size_t size)
161 {
162 	struct perf_data_file *file = &rec->session->data->file;
163 
164 	if (perf_data_file__write(file, bf, size) < 0) {
165 		pr_err("failed to write perf data, error: %m\n");
166 		return -1;
167 	}
168 
169 	rec->bytes_written += size;
170 
171 	if (record__output_max_size_exceeded(rec) && !done) {
172 		fprintf(stderr, "[ perf record: perf size limit reached (%" PRIu64 " KB),"
173 				" stopping session ]\n",
174 				rec->bytes_written >> 10);
175 		done = 1;
176 	}
177 
178 	if (switch_output_size(rec))
179 		trigger_hit(&switch_output_trigger);
180 
181 	return 0;
182 }
183 
184 static int record__aio_enabled(struct record *rec);
185 static int record__comp_enabled(struct record *rec);
186 static size_t zstd_compress(struct perf_session *session, void *dst, size_t dst_size,
187 			    void *src, size_t src_size);
188 
189 #ifdef HAVE_AIO_SUPPORT
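/*
 * Queue an asynchronous write of 'size' bytes at offset 'off' of the trace
 * file.  Retries while aio_write() fails with EAGAIN; any other error
 * invalidates the control block by resetting its file descriptor.
 */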
190 static int record__aio_write(struct aiocb *cblock, int trace_fd,
191 		void *buf, size_t size, off_t off)
192 {
193 	int rc;
194 
195 	cblock->aio_fildes = trace_fd;
196 	cblock->aio_buf    = buf;
197 	cblock->aio_nbytes = size;
198 	cblock->aio_offset = off;
199 	cblock->aio_sigevent.sigev_notify = SIGEV_NONE;
200 
201 	do {
202 		rc = aio_write(cblock);
203 		if (rc == 0) {
204 			break;
205 		} else if (errno != EAGAIN) {
206 			cblock->aio_fildes = -1;
207 			pr_err("failed to queue perf data, error: %m\n");
208 			break;
209 		}
210 	} while (1);
211 
212 	return rc;
213 }
214 
215 static int record__aio_complete(struct mmap *md, struct aiocb *cblock)
216 {
217 	void *rem_buf;
218 	off_t rem_off;
219 	size_t rem_size;
220 	int rc, aio_errno;
221 	ssize_t aio_ret, written;
222 
223 	aio_errno = aio_error(cblock);
224 	if (aio_errno == EINPROGRESS)
225 		return 0;
226 
227 	written = aio_ret = aio_return(cblock);
228 	if (aio_ret < 0) {
229 		if (aio_errno != EINTR)
230 			pr_err("failed to write perf data, error: %m\n");
231 		written = 0;
232 	}
233 
234 	rem_size = cblock->aio_nbytes - written;
235 
236 	if (rem_size == 0) {
237 		cblock->aio_fildes = -1;
238 		/*
239 		 * md->refcount is incremented in record__aio_pushfn() for
240 		 * every aio write request started in record__aio_push() so
241 		 * decrement it because the request is now complete.
242 		 */
243 		perf_mmap__put(&md->core);
244 		rc = 1;
245 	} else {
246 		/*
247 		 * The aio write request may require a restart with the
248 		 * remainder if the kernel didn't write the whole
249 		 * chunk at once.
250 		 */
251 		rem_off = cblock->aio_offset + written;
252 		rem_buf = (void *)(cblock->aio_buf + written);
253 		record__aio_write(cblock, cblock->aio_fildes,
254 				rem_buf, rem_size, rem_off);
255 		rc = 0;
256 	}
257 
258 	return rc;
259 }
260 
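/*
 * Reap completed aio write requests.  If sync_all is false, return the
 * index of the first free control block (waiting for one to complete if
 * necessary); if sync_all is true, wait until all outstanding requests
 * have completed and return -1.
 */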
261 static int record__aio_sync(struct mmap *md, bool sync_all)
262 {
263 	struct aiocb **aiocb = md->aio.aiocb;
264 	struct aiocb *cblocks = md->aio.cblocks;
265 	struct timespec timeout = { 0, 1000 * 1000  * 1 }; /* 1ms */
266 	int i, do_suspend;
267 
268 	do {
269 		do_suspend = 0;
270 		for (i = 0; i < md->aio.nr_cblocks; ++i) {
271 			if (cblocks[i].aio_fildes == -1 || record__aio_complete(md, &cblocks[i])) {
272 				if (sync_all)
273 					aiocb[i] = NULL;
274 				else
275 					return i;
276 			} else {
277 				/*
278 				 * The started aio write is not complete yet,
279 				 * so it has to be waited for before the
280 				 * next allocation.
281 				 */
282 				aiocb[i] = &cblocks[i];
283 				do_suspend = 1;
284 			}
285 		}
286 		if (!do_suspend)
287 			return -1;
288 
289 		while (aio_suspend((const struct aiocb **)aiocb, md->aio.nr_cblocks, &timeout)) {
290 			if (!(errno == EAGAIN || errno == EINTR))
291 				pr_err("failed to sync perf data, error: %m\n");
292 		}
293 	} while (1);
294 }
295 
296 struct record_aio {
297 	struct record	*rec;
298 	void		*data;
299 	size_t		size;
300 };
301 
302 static int record__aio_pushfn(struct mmap *map, void *to, void *buf, size_t size)
303 {
304 	struct record_aio *aio = to;
305 
306 	/*
307 	 * map->core.base data pointed to by buf is copied into a free map->aio.data[] buffer
308 	 * to release space in the kernel buffer as fast as possible, by calling
309 	 * perf_mmap__consume() from the perf_mmap__push() function.
310 	 *
311 	 * That lets the kernel proceed with storing more profiling data into
312 	 * the kernel buffer earlier than the other per-cpu kernel buffers are handled.
313 	 *
314 	 * Copying can be done in two steps in case the chunk of profiling data
315 	 * crosses the upper bound of the kernel buffer. In this case we first move
316 	 * part of the data from map->start up to the upper bound and then the remainder
317 	 * from the beginning of the kernel buffer up to the end of the data chunk.
318 	 */
319 
320 	if (record__comp_enabled(aio->rec)) {
321 		size = zstd_compress(aio->rec->session, aio->data + aio->size,
322 				     mmap__mmap_len(map) - aio->size,
323 				     buf, size);
324 	} else {
325 		memcpy(aio->data + aio->size, buf, size);
326 	}
327 
328 	if (!aio->size) {
329 		 * Increment map->refcount to guard the map->aio.data[] buffer
330 		 * from premature deallocation, because the map object can be
331 		 * released earlier than the aio write request started on the
332 		 * map->aio.data[] buffer completes.
333 		 *
334 		 * perf_mmap__put() is done at record__aio_complete() after
335 		 * the started aio request completes, or at record__aio_push()
336 		 * if the request failed to start.
337 		 * if the request failed to start.
338 		 */
339 		perf_mmap__get(&map->core);
340 	}
341 
342 	aio->size += size;
343 
344 	return size;
345 }
346 
347 static int record__aio_push(struct record *rec, struct mmap *map, off_t *off)
348 {
349 	int ret, idx;
350 	int trace_fd = rec->session->data->file.fd;
351 	struct record_aio aio = { .rec = rec, .size = 0 };
352 
353 	/*
354 	 * Call record__aio_sync() to wait until a map->aio.data[] buffer
355 	 * becomes available after the previous aio write operation.
356 	 */
357 
358 	idx = record__aio_sync(map, false);
359 	aio.data = map->aio.data[idx];
360 	ret = perf_mmap__push(map, &aio, record__aio_pushfn);
361 	if (ret != 0) /* ret > 0 - no data, ret < 0 - error */
362 		return ret;
363 
364 	rec->samples++;
365 	ret = record__aio_write(&(map->aio.cblocks[idx]), trace_fd, aio.data, aio.size, *off);
366 	if (!ret) {
367 		*off += aio.size;
368 		rec->bytes_written += aio.size;
369 		if (switch_output_size(rec))
370 			trigger_hit(&switch_output_trigger);
371 	} else {
372 		/*
373 		 * Decrement map->refcount, incremented in record__aio_pushfn(),
374 		 * if the record__aio_write() operation failed to start; otherwise
375 		 * map->refcount is decremented in record__aio_complete() after
376 		 * the aio write operation finishes successfully.
377 		 */
378 		perf_mmap__put(&map->core);
379 	}
380 
381 	return ret;
382 }
383 
384 static off_t record__aio_get_pos(int trace_fd)
385 {
386 	return lseek(trace_fd, 0, SEEK_CUR);
387 }
388 
389 static void record__aio_set_pos(int trace_fd, off_t pos)
390 {
391 	lseek(trace_fd, pos, SEEK_SET);
392 }
393 
394 static void record__aio_mmap_read_sync(struct record *rec)
395 {
396 	int i;
397 	struct evlist *evlist = rec->evlist;
398 	struct mmap *maps = evlist->mmap;
399 
400 	if (!record__aio_enabled(rec))
401 		return;
402 
403 	for (i = 0; i < evlist->core.nr_mmaps; i++) {
404 		struct mmap *map = &maps[i];
405 
406 		if (map->core.base)
407 			record__aio_sync(map, true);
408 	}
409 }
410 
411 static int nr_cblocks_default = 1;
412 static int nr_cblocks_max = 4;
413 
414 static int record__aio_parse(const struct option *opt,
415 			     const char *str,
416 			     int unset)
417 {
418 	struct record_opts *opts = (struct record_opts *)opt->value;
419 
420 	if (unset) {
421 		opts->nr_cblocks = 0;
422 	} else {
423 		if (str)
424 			opts->nr_cblocks = strtol(str, NULL, 0);
425 		if (!opts->nr_cblocks)
426 			opts->nr_cblocks = nr_cblocks_default;
427 	}
428 
429 	return 0;
430 }
431 #else /* HAVE_AIO_SUPPORT */
432 static int nr_cblocks_max = 0;
433 
434 static int record__aio_push(struct record *rec __maybe_unused, struct mmap *map __maybe_unused,
435 			    off_t *off __maybe_unused)
436 {
437 	return -1;
438 }
439 
440 static off_t record__aio_get_pos(int trace_fd __maybe_unused)
441 {
442 	return -1;
443 }
444 
445 static void record__aio_set_pos(int trace_fd __maybe_unused, off_t pos __maybe_unused)
446 {
447 }
448 
449 static void record__aio_mmap_read_sync(struct record *rec __maybe_unused)
450 {
451 }
452 #endif
453 
454 static int record__aio_enabled(struct record *rec)
455 {
456 	return rec->opts.nr_cblocks > 0;
457 }
458 
459 #define MMAP_FLUSH_DEFAULT 1
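/*
 * Parse the minimal number of bytes that is extracted from an mmap'ed
 * kernel buffer per flush.  Accepts a plain number or a value with a
 * B/K/M/G suffix, and clamps the result to a quarter of the mmap buffer
 * size.
 */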
460 static int record__mmap_flush_parse(const struct option *opt,
461 				    const char *str,
462 				    int unset)
463 {
464 	int flush_max;
465 	struct record_opts *opts = (struct record_opts *)opt->value;
466 	static struct parse_tag tags[] = {
467 			{ .tag  = 'B', .mult = 1       },
468 			{ .tag  = 'K', .mult = 1 << 10 },
469 			{ .tag  = 'M', .mult = 1 << 20 },
470 			{ .tag  = 'G', .mult = 1 << 30 },
471 			{ .tag  = 0 },
472 	};
473 
474 	if (unset)
475 		return 0;
476 
477 	if (str) {
478 		opts->mmap_flush = parse_tag_value(str, tags);
479 		if (opts->mmap_flush == (int)-1)
480 			opts->mmap_flush = strtol(str, NULL, 0);
481 	}
482 
483 	if (!opts->mmap_flush)
484 		opts->mmap_flush = MMAP_FLUSH_DEFAULT;
485 
486 	flush_max = evlist__mmap_size(opts->mmap_pages);
487 	flush_max /= 4;
488 	if (opts->mmap_flush > flush_max)
489 		opts->mmap_flush = flush_max;
490 
491 	return 0;
492 }
493 
494 #ifdef HAVE_ZSTD_SUPPORT
495 static unsigned int comp_level_default = 1;
496 
497 static int record__parse_comp_level(const struct option *opt, const char *str, int unset)
498 {
499 	struct record_opts *opts = opt->value;
500 
501 	if (unset) {
502 		opts->comp_level = 0;
503 	} else {
504 		if (str)
505 			opts->comp_level = strtol(str, NULL, 0);
506 		if (!opts->comp_level)
507 			opts->comp_level = comp_level_default;
508 	}
509 
510 	return 0;
511 }
512 #endif
513 static unsigned int comp_level_max = 22;
514 
515 static int record__comp_enabled(struct record *rec)
516 {
517 	return rec->opts.comp_level > 0;
518 }
519 
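/*
 * Callbacks used to write synthesized events into the output file.  The
 * locked variant serializes writers when synthesis runs multithreaded
 * (see the nr_threads_synthesize handling in record__synthesize()).
 */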
520 static int process_synthesized_event(struct perf_tool *tool,
521 				     union perf_event *event,
522 				     struct perf_sample *sample __maybe_unused,
523 				     struct machine *machine __maybe_unused)
524 {
525 	struct record *rec = container_of(tool, struct record, tool);
526 	return record__write(rec, NULL, event, event->header.size);
527 }
528 
529 static int process_locked_synthesized_event(struct perf_tool *tool,
530 				     union perf_event *event,
531 				     struct perf_sample *sample __maybe_unused,
532 				     struct machine *machine __maybe_unused)
533 {
534 	static pthread_mutex_t synth_lock = PTHREAD_MUTEX_INITIALIZER;
535 	int ret;
536 
537 	pthread_mutex_lock(&synth_lock);
538 	ret = process_synthesized_event(tool, event, sample, machine);
539 	pthread_mutex_unlock(&synth_lock);
540 	return ret;
541 }
542 
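/*
 * perf_mmap__push() callback for the non-aio path: optionally compress the
 * chunk read from the kernel buffer and append it to the output file.
 */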
543 static int record__pushfn(struct mmap *map, void *to, void *bf, size_t size)
544 {
545 	struct record *rec = to;
546 
547 	if (record__comp_enabled(rec)) {
548 		size = zstd_compress(rec->session, map->data, mmap__mmap_len(map), bf, size);
549 		bf   = map->data;
550 	}
551 
552 	rec->samples++;
553 	return record__write(rec, map, bf, size);
554 }
555 
556 static volatile int signr = -1;
557 static volatile int child_finished;
558 #ifdef HAVE_EVENTFD_SUPPORT
559 static int done_fd = -1;
560 #endif
561 
562 static void sig_handler(int sig)
563 {
564 	if (sig == SIGCHLD)
565 		child_finished = 1;
566 	else
567 		signr = sig;
568 
569 	done = 1;
570 #ifdef HAVE_EVENTFD_SUPPORT
571 {
572 	u64 tmp = 1;
573 	/*
574 	 * It is possible for this signal handler to run after done is checked
575 	 * in the main loop, but before the perf counter fds are polled. If this
576 	 * happens, the poll() will continue to wait even though done is set,
577 	 * and will only break out if either another signal is received, or the
578 	 * counters are ready for read. To ensure the poll() doesn't sleep when
579 	 * done is set, use an eventfd (done_fd) to wake up the poll().
580 	 */
581 	if (write(done_fd, &tmp, sizeof(tmp)) < 0)
582 		pr_err("failed to signal wakeup fd, error: %m\n");
583 }
584 #endif // HAVE_EVENTFD_SUPPORT
585 }
586 
587 static void sigsegv_handler(int sig)
588 {
589 	perf_hooks__recover();
590 	sighandler_dump_stack(sig);
591 }
592 
593 static void record__sig_exit(void)
594 {
595 	if (signr == -1)
596 		return;
597 
598 	signal(signr, SIG_DFL);
599 	raise(signr);
600 }
601 
602 #ifdef HAVE_AUXTRACE_SUPPORT
603 
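/*
 * Write an AUX area trace chunk: the auxtrace event header, followed by up
 * to two data fragments (the trace may wrap around the end of the aux
 * buffer) and zero padding up to an 8-byte boundary.  For single-file,
 * non-pipe output the event's file offset is also recorded in the
 * auxtrace index.
 */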
604 static int record__process_auxtrace(struct perf_tool *tool,
605 				    struct mmap *map,
606 				    union perf_event *event, void *data1,
607 				    size_t len1, void *data2, size_t len2)
608 {
609 	struct record *rec = container_of(tool, struct record, tool);
610 	struct perf_data *data = &rec->data;
611 	size_t padding;
612 	u8 pad[8] = {0};
613 
614 	if (!perf_data__is_pipe(data) && perf_data__is_single_file(data)) {
615 		off_t file_offset;
616 		int fd = perf_data__fd(data);
617 		int err;
618 
619 		file_offset = lseek(fd, 0, SEEK_CUR);
620 		if (file_offset == -1)
621 			return -1;
622 		err = auxtrace_index__auxtrace_event(&rec->session->auxtrace_index,
623 						     event, file_offset);
624 		if (err)
625 			return err;
626 	}
627 
628 	/* event.auxtrace.size includes padding, see __auxtrace_mmap__read() */
629 	padding = (len1 + len2) & 7;
630 	if (padding)
631 		padding = 8 - padding;
632 
633 	record__write(rec, map, event, event->header.size);
634 	record__write(rec, map, data1, len1);
635 	if (len2)
636 		record__write(rec, map, data2, len2);
637 	record__write(rec, map, &pad, padding);
638 
639 	return 0;
640 }
641 
642 static int record__auxtrace_mmap_read(struct record *rec,
643 				      struct mmap *map)
644 {
645 	int ret;
646 
647 	ret = auxtrace_mmap__read(map, rec->itr, &rec->tool,
648 				  record__process_auxtrace);
649 	if (ret < 0)
650 		return ret;
651 
652 	if (ret)
653 		rec->samples++;
654 
655 	return 0;
656 }
657 
658 static int record__auxtrace_mmap_read_snapshot(struct record *rec,
659 					       struct mmap *map)
660 {
661 	int ret;
662 
663 	ret = auxtrace_mmap__read_snapshot(map, rec->itr, &rec->tool,
664 					   record__process_auxtrace,
665 					   rec->opts.auxtrace_snapshot_size);
666 	if (ret < 0)
667 		return ret;
668 
669 	if (ret)
670 		rec->samples++;
671 
672 	return 0;
673 }
674 
675 static int record__auxtrace_read_snapshot_all(struct record *rec)
676 {
677 	int i;
678 	int rc = 0;
679 
680 	for (i = 0; i < rec->evlist->core.nr_mmaps; i++) {
681 		struct mmap *map = &rec->evlist->mmap[i];
682 
683 		if (!map->auxtrace_mmap.base)
684 			continue;
685 
686 		if (record__auxtrace_mmap_read_snapshot(rec, map) != 0) {
687 			rc = -1;
688 			goto out;
689 		}
690 	}
691 out:
692 	return rc;
693 }
694 
695 static void record__read_auxtrace_snapshot(struct record *rec, bool on_exit)
696 {
697 	pr_debug("Recording AUX area tracing snapshot\n");
698 	if (record__auxtrace_read_snapshot_all(rec) < 0) {
699 		trigger_error(&auxtrace_snapshot_trigger);
700 	} else {
701 		if (auxtrace_record__snapshot_finish(rec->itr, on_exit))
702 			trigger_error(&auxtrace_snapshot_trigger);
703 		else
704 			trigger_ready(&auxtrace_snapshot_trigger);
705 	}
706 }
707 
708 static int record__auxtrace_snapshot_exit(struct record *rec)
709 {
710 	if (trigger_is_error(&auxtrace_snapshot_trigger))
711 		return 0;
712 
713 	if (!auxtrace_record__snapshot_started &&
714 	    auxtrace_record__snapshot_start(rec->itr))
715 		return -1;
716 
717 	record__read_auxtrace_snapshot(rec, true);
718 	if (trigger_is_error(&auxtrace_snapshot_trigger))
719 		return -1;
720 
721 	return 0;
722 }
723 
724 static int record__auxtrace_init(struct record *rec)
725 {
726 	int err;
727 
728 	if (!rec->itr) {
729 		rec->itr = auxtrace_record__init(rec->evlist, &err);
730 		if (err)
731 			return err;
732 	}
733 
734 	err = auxtrace_parse_snapshot_options(rec->itr, &rec->opts,
735 					      rec->opts.auxtrace_snapshot_opts);
736 	if (err)
737 		return err;
738 
739 	err = auxtrace_parse_sample_options(rec->itr, rec->evlist, &rec->opts,
740 					    rec->opts.auxtrace_sample_opts);
741 	if (err)
742 		return err;
743 
744 	auxtrace_regroup_aux_output(rec->evlist);
745 
746 	return auxtrace_parse_filters(rec->evlist);
747 }
748 
749 #else
750 
751 static inline
752 int record__auxtrace_mmap_read(struct record *rec __maybe_unused,
753 			       struct mmap *map __maybe_unused)
754 {
755 	return 0;
756 }
757 
758 static inline
759 void record__read_auxtrace_snapshot(struct record *rec __maybe_unused,
760 				    bool on_exit __maybe_unused)
761 {
762 }
763 
764 static inline
765 int auxtrace_record__snapshot_start(struct auxtrace_record *itr __maybe_unused)
766 {
767 	return 0;
768 }
769 
770 static inline
771 int record__auxtrace_snapshot_exit(struct record *rec __maybe_unused)
772 {
773 	return 0;
774 }
775 
776 static int record__auxtrace_init(struct record *rec __maybe_unused)
777 {
778 	return 0;
779 }
780 
781 #endif
782 
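/*
 * Make sure the evlist can record text poke events: if no event has
 * attr.text_poke set yet, add a system-wide dummy:u event with text_poke
 * and ksymbol enabled, collected on all CPUs.
 */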
783 static int record__config_text_poke(struct evlist *evlist)
784 {
785 	struct evsel *evsel;
786 	int err;
787 
788 	/* Nothing to do if text poke is already configured */
789 	evlist__for_each_entry(evlist, evsel) {
790 		if (evsel->core.attr.text_poke)
791 			return 0;
792 	}
793 
794 	err = parse_events(evlist, "dummy:u", NULL);
795 	if (err)
796 		return err;
797 
798 	evsel = evlist__last(evlist);
799 
800 	evsel->core.attr.freq = 0;
801 	evsel->core.attr.sample_period = 1;
802 	evsel->core.attr.text_poke = 1;
803 	evsel->core.attr.ksymbol = 1;
804 
805 	evsel->core.system_wide = true;
806 	evsel->no_aux_samples = true;
807 	evsel->immediate = true;
808 
809 	/* Text poke must be collected on all CPUs */
810 	perf_cpu_map__put(evsel->core.own_cpus);
811 	evsel->core.own_cpus = perf_cpu_map__new(NULL);
812 	perf_cpu_map__put(evsel->core.cpus);
813 	evsel->core.cpus = perf_cpu_map__get(evsel->core.own_cpus);
814 
815 	evsel__set_sample_bit(evsel, TIME);
816 
817 	return 0;
818 }
819 
820 static bool record__kcore_readable(struct machine *machine)
821 {
822 	char kcore[PATH_MAX];
823 	int fd;
824 
825 	scnprintf(kcore, sizeof(kcore), "%s/proc/kcore", machine->root_dir);
826 
827 	fd = open(kcore, O_RDONLY);
828 	if (fd < 0)
829 		return false;
830 
831 	close(fd);
832 
833 	return true;
834 }
835 
836 static int record__kcore_copy(struct machine *machine, struct perf_data *data)
837 {
838 	char from_dir[PATH_MAX];
839 	char kcore_dir[PATH_MAX];
840 	int ret;
841 
842 	snprintf(from_dir, sizeof(from_dir), "%s/proc", machine->root_dir);
843 
844 	ret = perf_data__make_kcore_dir(data, kcore_dir, sizeof(kcore_dir));
845 	if (ret)
846 		return ret;
847 
848 	return kcore_copy(from_dir, kcore_dir);
849 }
850 
851 static int record__mmap_evlist(struct record *rec,
852 			       struct evlist *evlist)
853 {
854 	struct record_opts *opts = &rec->opts;
855 	bool auxtrace_overwrite = opts->auxtrace_snapshot_mode ||
856 				  opts->auxtrace_sample_mode;
857 	char msg[512];
858 
859 	if (opts->affinity != PERF_AFFINITY_SYS)
860 		cpu__setup_cpunode_map();
861 
862 	if (evlist__mmap_ex(evlist, opts->mmap_pages,
863 				 opts->auxtrace_mmap_pages,
864 				 auxtrace_overwrite,
865 				 opts->nr_cblocks, opts->affinity,
866 				 opts->mmap_flush, opts->comp_level) < 0) {
867 		if (errno == EPERM) {
868 			pr_err("Permission error mapping pages.\n"
869 			       "Consider increasing "
870 			       "/proc/sys/kernel/perf_event_mlock_kb,\n"
871 			       "or try again with a smaller value of -m/--mmap_pages.\n"
872 			       "(current value: %u,%u)\n",
873 			       opts->mmap_pages, opts->auxtrace_mmap_pages);
874 			return -errno;
875 		} else {
876 			pr_err("failed to mmap with %d (%s)\n", errno,
877 				str_error_r(errno, msg, sizeof(msg)));
878 			if (errno)
879 				return -errno;
880 			else
881 				return -EINVAL;
882 		}
883 	}
884 	return 0;
885 }
886 
887 static int record__mmap(struct record *rec)
888 {
889 	return record__mmap_evlist(rec, rec->evlist);
890 }
891 
892 static int record__open(struct record *rec)
893 {
894 	char msg[BUFSIZ];
895 	struct evsel *pos;
896 	struct evlist *evlist = rec->evlist;
897 	struct perf_session *session = rec->session;
898 	struct record_opts *opts = &rec->opts;
899 	int rc = 0;
900 
901 	/*
902 	 * For initial_delay, system wide or a hybrid system, we need to add a
903 	 * dummy event so that we can track PERF_RECORD_MMAP while we wait out
904 	 * the delay or perform event synthesis.
905 	 */
906 	if (opts->initial_delay || target__has_cpu(&opts->target) ||
907 	    perf_pmu__has_hybrid()) {
908 		pos = evlist__get_tracking_event(evlist);
909 		if (!evsel__is_dummy_event(pos)) {
910 			/* Set up dummy event. */
911 			if (evlist__add_dummy(evlist))
912 				return -ENOMEM;
913 			pos = evlist__last(evlist);
914 			evlist__set_tracking_event(evlist, pos);
915 		}
916 
917 		/*
918 		 * Enable the dummy event when the process is forked for
919 		 * initial_delay, or immediately for system wide.
920 		 */
921 		if (opts->initial_delay && !pos->immediate &&
922 		    !target__has_cpu(&opts->target))
923 			pos->core.attr.enable_on_exec = 1;
924 		else
925 			pos->immediate = 1;
926 	}
927 
928 	evlist__config(evlist, opts, &callchain_param);
929 
930 	evlist__for_each_entry(evlist, pos) {
931 try_again:
932 		if (evsel__open(pos, pos->core.cpus, pos->core.threads) < 0) {
933 			if (evsel__fallback(pos, errno, msg, sizeof(msg))) {
934 				if (verbose > 0)
935 					ui__warning("%s\n", msg);
936 				goto try_again;
937 			}
938 			if ((errno == EINVAL || errno == EBADF) &&
939 			    pos->core.leader != &pos->core &&
940 			    pos->weak_group) {
941 				pos = evlist__reset_weak_group(evlist, pos, true);
942 				goto try_again;
943 			}
944 			rc = -errno;
945 			evsel__open_strerror(pos, &opts->target, errno, msg, sizeof(msg));
946 			ui__error("%s\n", msg);
947 			goto out;
948 		}
949 
950 		pos->supported = true;
951 	}
952 
953 	if (symbol_conf.kptr_restrict && !evlist__exclude_kernel(evlist)) {
954 		pr_warning(
955 "WARNING: Kernel address maps (/proc/{kallsyms,modules}) are restricted,\n"
956 "check /proc/sys/kernel/kptr_restrict and /proc/sys/kernel/perf_event_paranoid.\n\n"
957 "Samples in kernel functions may not be resolved if a suitable vmlinux\n"
958 "file is not found in the buildid cache or in the vmlinux path.\n\n"
959 "Samples in kernel modules won't be resolved at all.\n\n"
960 "If some relocation was applied (e.g. kexec) symbols may be misresolved\n"
961 "even with a suitable vmlinux or kallsyms file.\n\n");
962 	}
963 
964 	if (evlist__apply_filters(evlist, &pos)) {
965 		pr_err("failed to set filter \"%s\" on event %s with %d (%s)\n",
966 			pos->filter, evsel__name(pos), errno,
967 			str_error_r(errno, msg, sizeof(msg)));
968 		rc = -1;
969 		goto out;
970 	}
971 
972 	rc = record__mmap(rec);
973 	if (rc)
974 		goto out;
975 
976 	session->evlist = evlist;
977 	perf_session__set_id_hdr_size(session);
978 out:
979 	return rc;
980 }
981 
982 static void set_timestamp_boundary(struct record *rec, u64 sample_time)
983 {
984 	if (rec->evlist->first_sample_time == 0)
985 		rec->evlist->first_sample_time = sample_time;
986 
987 	if (sample_time)
988 		rec->evlist->last_sample_time = sample_time;
989 }
990 
991 static int process_sample_event(struct perf_tool *tool,
992 				union perf_event *event,
993 				struct perf_sample *sample,
994 				struct evsel *evsel,
995 				struct machine *machine)
996 {
997 	struct record *rec = container_of(tool, struct record, tool);
998 
999 	set_timestamp_boundary(rec, sample->time);
1000 
1001 	if (rec->buildid_all)
1002 		return 0;
1003 
1004 	rec->samples++;
1005 	return build_id__mark_dso_hit(tool, event, sample, evsel, machine);
1006 }
1007 
1008 static int process_buildids(struct record *rec)
1009 {
1010 	struct perf_session *session = rec->session;
1011 
1012 	if (perf_data__size(&rec->data) == 0)
1013 		return 0;
1014 
1015 	/*
1016 	 * During this process, it'll load the kernel map and replace
1017 	 * dso->long_name with the real pathname it found.  In this case
1018 	 * we prefer the vmlinux path like
1019 	 *   /lib/modules/3.16.4/build/vmlinux
1020 	 *
1021 	 * rather than the build-id path (in the debug directory), e.g.
1022 	 *   $HOME/.debug/.build-id/f0/6e17aa50adf4d00b88925e03775de107611551
1023 	 */
1024 	symbol_conf.ignore_vmlinux_buildid = true;
1025 
1026 	/*
1027 	 * If --buildid-all is given, it marks all DSOs regardless of hits,
1028 	 * so there is no need to process samples. But if timestamp_boundary
1029 	 * is enabled, it still needs to walk all samples to get the
1030 	 * timestamps of the first/last samples.
1031 	 */
1032 	if (rec->buildid_all && !rec->timestamp_boundary)
1033 		rec->tool.sample = NULL;
1034 
1035 	return perf_session__process_events(session);
1036 }
1037 
1038 static void perf_event__synthesize_guest_os(struct machine *machine, void *data)
1039 {
1040 	int err;
1041 	struct perf_tool *tool = data;
1042 	/*
1043 	 * For the guest kernel, when processing the record & report subcommands,
1044 	 * we arrange the module mmap prior to the guest kernel mmap and trigger
1045 	 * a dso preload, because by default guest module symbols are loaded
1046 	 * from guest kallsyms instead of /lib/modules/XXX/XXX. This method
1047 	 * avoids missing symbols when the first address is in a module
1048 	 * instead of in the guest kernel.
1049 	 */
1050 	err = perf_event__synthesize_modules(tool, process_synthesized_event,
1051 					     machine);
1052 	if (err < 0)
1053 		pr_err("Couldn't record guest kernel [%d]'s reference"
1054 		       " relocation symbol.\n", machine->pid);
1055 
1056 	/*
1057 	 * We use _stext for the guest kernel because the guest kernel's
1058 	 * /proc/kallsyms sometimes has no _text.
1059 	 */
1060 	err = perf_event__synthesize_kernel_mmap(tool, process_synthesized_event,
1061 						 machine);
1062 	if (err < 0)
1063 		pr_err("Couldn't record guest kernel [%d]'s reference"
1064 		       " relocation symbol.\n", machine->pid);
1065 }
1066 
1067 static struct perf_event_header finished_round_event = {
1068 	.size = sizeof(struct perf_event_header),
1069 	.type = PERF_RECORD_FINISHED_ROUND,
1070 };
1071 
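/*
 * When an affinity mode other than SYS is selected, migrate the tool
 * thread onto the CPUs recorded in the mmap's affinity mask before
 * reading it, to keep the reader close to the kernel buffer's NUMA node.
 */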
1072 static void record__adjust_affinity(struct record *rec, struct mmap *map)
1073 {
1074 	if (rec->opts.affinity != PERF_AFFINITY_SYS &&
1075 	    !bitmap_equal(rec->affinity_mask.bits, map->affinity_mask.bits,
1076 			  rec->affinity_mask.nbits)) {
1077 		bitmap_zero(rec->affinity_mask.bits, rec->affinity_mask.nbits);
1078 		bitmap_or(rec->affinity_mask.bits, rec->affinity_mask.bits,
1079 			  map->affinity_mask.bits, rec->affinity_mask.nbits);
1080 		sched_setaffinity(0, MMAP_CPU_MASK_BYTES(&rec->affinity_mask),
1081 				  (cpu_set_t *)rec->affinity_mask.bits);
1082 		if (verbose == 2)
1083 			mmap_cpu_mask__scnprintf(&rec->affinity_mask, "thread");
1084 	}
1085 }
1086 
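/*
 * Compression support: zstd_compress() squeezes a chunk of trace data into
 * one or more PERF_RECORD_COMPRESSED records, with process_comp_header()
 * filling in and growing each record's header as the compressed stream is
 * produced.  Transferred vs. compressed byte counts are accumulated in the
 * session for the final statistics.
 */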
1087 static size_t process_comp_header(void *record, size_t increment)
1088 {
1089 	struct perf_record_compressed *event = record;
1090 	size_t size = sizeof(*event);
1091 
1092 	if (increment) {
1093 		event->header.size += increment;
1094 		return increment;
1095 	}
1096 
1097 	event->header.type = PERF_RECORD_COMPRESSED;
1098 	event->header.size = size;
1099 
1100 	return size;
1101 }
1102 
1103 static size_t zstd_compress(struct perf_session *session, void *dst, size_t dst_size,
1104 			    void *src, size_t src_size)
1105 {
1106 	size_t compressed;
1107 	size_t max_record_size = PERF_SAMPLE_MAX_SIZE - sizeof(struct perf_record_compressed) - 1;
1108 
1109 	compressed = zstd_compress_stream_to_records(&session->zstd_data, dst, dst_size, src, src_size,
1110 						     max_record_size, process_comp_header);
1111 
1112 	session->bytes_transferred += src_size;
1113 	session->bytes_compressed  += compressed;
1114 
1115 	return compressed;
1116 }
1117 
1118 static int record__mmap_read_evlist(struct record *rec, struct evlist *evlist,
1119 				    bool overwrite, bool synch)
1120 {
1121 	u64 bytes_written = rec->bytes_written;
1122 	int i;
1123 	int rc = 0;
1124 	struct mmap *maps;
1125 	int trace_fd = rec->data.file.fd;
1126 	off_t off = 0;
1127 
1128 	if (!evlist)
1129 		return 0;
1130 
1131 	maps = overwrite ? evlist->overwrite_mmap : evlist->mmap;
1132 	if (!maps)
1133 		return 0;
1134 
1135 	if (overwrite && evlist->bkw_mmap_state != BKW_MMAP_DATA_PENDING)
1136 		return 0;
1137 
1138 	if (record__aio_enabled(rec))
1139 		off = record__aio_get_pos(trace_fd);
1140 
1141 	for (i = 0; i < evlist->core.nr_mmaps; i++) {
1142 		u64 flush = 0;
1143 		struct mmap *map = &maps[i];
1144 
1145 		if (map->core.base) {
1146 			record__adjust_affinity(rec, map);
1147 			if (synch) {
1148 				flush = map->core.flush;
1149 				map->core.flush = 1;
1150 			}
1151 			if (!record__aio_enabled(rec)) {
1152 				if (perf_mmap__push(map, rec, record__pushfn) < 0) {
1153 					if (synch)
1154 						map->core.flush = flush;
1155 					rc = -1;
1156 					goto out;
1157 				}
1158 			} else {
1159 				if (record__aio_push(rec, map, &off) < 0) {
1160 					record__aio_set_pos(trace_fd, off);
1161 					if (synch)
1162 						map->core.flush = flush;
1163 					rc = -1;
1164 					goto out;
1165 				}
1166 			}
1167 			if (synch)
1168 				map->core.flush = flush;
1169 		}
1170 
1171 		if (map->auxtrace_mmap.base && !rec->opts.auxtrace_snapshot_mode &&
1172 		    !rec->opts.auxtrace_sample_mode &&
1173 		    record__auxtrace_mmap_read(rec, map) != 0) {
1174 			rc = -1;
1175 			goto out;
1176 		}
1177 	}
1178 
1179 	if (record__aio_enabled(rec))
1180 		record__aio_set_pos(trace_fd, off);
1181 
1182 	/*
1183 	 * Mark the round finished if we wrote
1184 	 * at least one event.
1185 	 */
1186 	if (bytes_written != rec->bytes_written)
1187 		rc = record__write(rec, NULL, &finished_round_event, sizeof(finished_round_event));
1188 
1189 	if (overwrite)
1190 		evlist__toggle_bkw_mmap(evlist, BKW_MMAP_EMPTY);
1191 out:
1192 	return rc;
1193 }
1194 
1195 static int record__mmap_read_all(struct record *rec, bool synch)
1196 {
1197 	int err;
1198 
1199 	err = record__mmap_read_evlist(rec, rec->evlist, false, synch);
1200 	if (err)
1201 		return err;
1202 
1203 	return record__mmap_read_evlist(rec, rec->evlist, true, synch);
1204 }
1205 
1206 static void record__init_features(struct record *rec)
1207 {
1208 	struct perf_session *session = rec->session;
1209 	int feat;
1210 
1211 	for (feat = HEADER_FIRST_FEATURE; feat < HEADER_LAST_FEATURE; feat++)
1212 		perf_header__set_feat(&session->header, feat);
1213 
1214 	if (rec->no_buildid)
1215 		perf_header__clear_feat(&session->header, HEADER_BUILD_ID);
1216 
1217 	if (!have_tracepoints(&rec->evlist->core.entries))
1218 		perf_header__clear_feat(&session->header, HEADER_TRACING_DATA);
1219 
1220 	if (!rec->opts.branch_stack)
1221 		perf_header__clear_feat(&session->header, HEADER_BRANCH_STACK);
1222 
1223 	if (!rec->opts.full_auxtrace)
1224 		perf_header__clear_feat(&session->header, HEADER_AUXTRACE);
1225 
1226 	if (!(rec->opts.use_clockid && rec->opts.clockid_res_ns))
1227 		perf_header__clear_feat(&session->header, HEADER_CLOCKID);
1228 
1229 	if (!rec->opts.use_clockid)
1230 		perf_header__clear_feat(&session->header, HEADER_CLOCK_DATA);
1231 
1232 	perf_header__clear_feat(&session->header, HEADER_DIR_FORMAT);
1233 	if (!record__comp_enabled(rec))
1234 		perf_header__clear_feat(&session->header, HEADER_COMPRESSED);
1235 
1236 	perf_header__clear_feat(&session->header, HEADER_STAT);
1237 }
1238 
1239 static void
1240 record__finish_output(struct record *rec)
1241 {
1242 	struct perf_data *data = &rec->data;
1243 	int fd = perf_data__fd(data);
1244 
1245 	if (data->is_pipe)
1246 		return;
1247 
1248 	rec->session->header.data_size += rec->bytes_written;
1249 	data->file.size = lseek(perf_data__fd(data), 0, SEEK_CUR);
1250 
1251 	if (!rec->no_buildid) {
1252 		process_buildids(rec);
1253 
1254 		if (rec->buildid_all)
1255 			dsos__hit_all(rec->session);
1256 	}
1257 	perf_session__write_header(rec->session, rec->evlist, fd, true);
1258 
1259 	return;
1260 }
1261 
1262 static int record__synthesize_workload(struct record *rec, bool tail)
1263 {
1264 	int err;
1265 	struct perf_thread_map *thread_map;
1266 	bool needs_mmap = rec->opts.synth & PERF_SYNTH_MMAP;
1267 
1268 	if (rec->opts.tail_synthesize != tail)
1269 		return 0;
1270 
1271 	thread_map = thread_map__new_by_tid(rec->evlist->workload.pid);
1272 	if (thread_map == NULL)
1273 		return -1;
1274 
1275 	err = perf_event__synthesize_thread_map(&rec->tool, thread_map,
1276 						 process_synthesized_event,
1277 						 &rec->session->machines.host,
1278 						 needs_mmap,
1279 						 rec->opts.sample_address);
1280 	perf_thread_map__put(thread_map);
1281 	return err;
1282 }
1283 
1284 static int record__synthesize(struct record *rec, bool tail);
1285 
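/*
 * Rotate the output: finish the current perf.data (synthesizing tail
 * events and build-ids as needed), then switch to a new file named with
 * the current timestamp.  When a limited number of output files is
 * configured, the oldest one is removed.
 */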
1286 static int
1287 record__switch_output(struct record *rec, bool at_exit)
1288 {
1289 	struct perf_data *data = &rec->data;
1290 	int fd, err;
1291 	char *new_filename;
1292 
1293 	/* Same size:      "2015122520103046" */
1294 	char timestamp[] = "InvalidTimestamp";
1295 
1296 	record__aio_mmap_read_sync(rec);
1297 
1298 	record__synthesize(rec, true);
1299 	if (target__none(&rec->opts.target))
1300 		record__synthesize_workload(rec, true);
1301 
1302 	rec->samples = 0;
1303 	record__finish_output(rec);
1304 	err = fetch_current_timestamp(timestamp, sizeof(timestamp));
1305 	if (err) {
1306 		pr_err("Failed to get current timestamp\n");
1307 		return -EINVAL;
1308 	}
1309 
1310 	fd = perf_data__switch(data, timestamp,
1311 				    rec->session->header.data_offset,
1312 				    at_exit, &new_filename);
1313 	if (fd >= 0 && !at_exit) {
1314 		rec->bytes_written = 0;
1315 		rec->session->header.data_size = 0;
1316 	}
1317 
1318 	if (!quiet)
1319 		fprintf(stderr, "[ perf record: Dump %s.%s ]\n",
1320 			data->path, timestamp);
1321 
1322 	if (rec->switch_output.num_files) {
1323 		int n = rec->switch_output.cur_file + 1;
1324 
1325 		if (n >= rec->switch_output.num_files)
1326 			n = 0;
1327 		rec->switch_output.cur_file = n;
1328 		if (rec->switch_output.filenames[n]) {
1329 			remove(rec->switch_output.filenames[n]);
1330 			zfree(&rec->switch_output.filenames[n]);
1331 		}
1332 		rec->switch_output.filenames[n] = new_filename;
1333 	} else {
1334 		free(new_filename);
1335 	}
1336 
1337 	/* Output tracking events */
1338 	if (!at_exit) {
1339 		record__synthesize(rec, false);
1340 
1341 		/*
1342 		 * In 'perf record --switch-output' without -a,
1343 		 * record__synthesize() in record__switch_output() won't
1344 		 * generate tracking events because there's no thread_map
1345 		 * in the evlist, which causes the newly created perf.data
1346 		 * to contain no map and comm information.
1347 		 * Create a fake thread_map and directly call
1348 		 * perf_event__synthesize_thread_map() for those events.
1349 		 */
1350 		if (target__none(&rec->opts.target))
1351 			record__synthesize_workload(rec, false);
1352 	}
1353 	return fd;
1354 }
1355 
1356 static volatile int workload_exec_errno;
1357 
1358 /*
1359  * evlist__prepare_workload will send a SIGUSR1
1360  * if the fork fails, since we asked for it by setting its
1361  * want_signal to true.
1362  */
1363 static void workload_exec_failed_signal(int signo __maybe_unused,
1364 					siginfo_t *info,
1365 					void *ucontext __maybe_unused)
1366 {
1367 	workload_exec_errno = info->si_value.sival_int;
1368 	done = 1;
1369 	child_finished = 1;
1370 }
1371 
1372 static void snapshot_sig_handler(int sig);
1373 static void alarm_sig_handler(int sig);
1374 
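/*
 * Pick any mapped perf_event_mmap_page from the evlist; it is used as the
 * source of the hardware clock parameters when synthesizing the time
 * conversion event.
 */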
1375 static const struct perf_event_mmap_page *evlist__pick_pc(struct evlist *evlist)
1376 {
1377 	if (evlist) {
1378 		if (evlist->mmap && evlist->mmap[0].core.base)
1379 			return evlist->mmap[0].core.base;
1380 		if (evlist->overwrite_mmap && evlist->overwrite_mmap[0].core.base)
1381 			return evlist->overwrite_mmap[0].core.base;
1382 	}
1383 	return NULL;
1384 }
1385 
1386 static const struct perf_event_mmap_page *record__pick_pc(struct record *rec)
1387 {
1388 	const struct perf_event_mmap_page *pc = evlist__pick_pc(rec->evlist);
1389 	if (pc)
1390 		return pc;
1391 	return NULL;
1392 }
1393 
1394 static int record__synthesize(struct record *rec, bool tail)
1395 {
1396 	struct perf_session *session = rec->session;
1397 	struct machine *machine = &session->machines.host;
1398 	struct perf_data *data = &rec->data;
1399 	struct record_opts *opts = &rec->opts;
1400 	struct perf_tool *tool = &rec->tool;
1401 	int err = 0;
1402 	event_op f = process_synthesized_event;
1403 
1404 	if (rec->opts.tail_synthesize != tail)
1405 		return 0;
1406 
1407 	if (data->is_pipe) {
1408 		err = perf_event__synthesize_for_pipe(tool, session, data,
1409 						      process_synthesized_event);
1410 		if (err < 0)
1411 			goto out;
1412 
1413 		rec->bytes_written += err;
1414 	}
1415 
1416 	err = perf_event__synth_time_conv(record__pick_pc(rec), tool,
1417 					  process_synthesized_event, machine);
1418 	if (err)
1419 		goto out;
1420 
1421 	/* Synthesize id_index before auxtrace_info */
1422 	if (rec->opts.auxtrace_sample_mode || rec->opts.full_auxtrace) {
1423 		err = perf_event__synthesize_id_index(tool,
1424 						      process_synthesized_event,
1425 						      session->evlist, machine);
1426 		if (err)
1427 			goto out;
1428 	}
1429 
1430 	if (rec->opts.full_auxtrace) {
1431 		err = perf_event__synthesize_auxtrace_info(rec->itr, tool,
1432 					session, process_synthesized_event);
1433 		if (err)
1434 			goto out;
1435 	}
1436 
1437 	if (!evlist__exclude_kernel(rec->evlist)) {
1438 		err = perf_event__synthesize_kernel_mmap(tool, process_synthesized_event,
1439 							 machine);
1440 		WARN_ONCE(err < 0, "Couldn't record kernel reference relocation symbol\n"
1441 				   "Symbol resolution may be skewed if relocation was used (e.g. kexec).\n"
1442 				   "Check /proc/kallsyms permission or run as root.\n");
1443 
1444 		err = perf_event__synthesize_modules(tool, process_synthesized_event,
1445 						     machine);
1446 		WARN_ONCE(err < 0, "Couldn't record kernel module information.\n"
1447 				   "Symbol resolution may be skewed if relocation was used (e.g. kexec).\n"
1448 				   "Check /proc/modules permission or run as root.\n");
1449 	}
1450 
1451 	if (perf_guest) {
1452 		machines__process_guests(&session->machines,
1453 					 perf_event__synthesize_guest_os, tool);
1454 	}
1455 
1456 	err = perf_event__synthesize_extra_attr(&rec->tool,
1457 						rec->evlist,
1458 						process_synthesized_event,
1459 						data->is_pipe);
1460 	if (err)
1461 		goto out;
1462 
1463 	err = perf_event__synthesize_thread_map2(&rec->tool, rec->evlist->core.threads,
1464 						 process_synthesized_event,
1465 						NULL);
1466 	if (err < 0) {
1467 		pr_err("Couldn't synthesize thread map.\n");
1468 		return err;
1469 	}
1470 
1471 	err = perf_event__synthesize_cpu_map(&rec->tool, rec->evlist->core.cpus,
1472 					     process_synthesized_event, NULL);
1473 	if (err < 0) {
1474 		pr_err("Couldn't synthesize cpu map.\n");
1475 		return err;
1476 	}
1477 
1478 	err = perf_event__synthesize_bpf_events(session, process_synthesized_event,
1479 						machine, opts);
1480 	if (err < 0)
1481 		pr_warning("Couldn't synthesize bpf events.\n");
1482 
1483 	if (rec->opts.synth & PERF_SYNTH_CGROUP) {
1484 		err = perf_event__synthesize_cgroups(tool, process_synthesized_event,
1485 						     machine);
1486 		if (err < 0)
1487 			pr_warning("Couldn't synthesize cgroup events.\n");
1488 	}
1489 
1490 	if (rec->opts.nr_threads_synthesize > 1) {
1491 		perf_set_multithreaded();
1492 		f = process_locked_synthesized_event;
1493 	}
1494 
1495 	if (rec->opts.synth & PERF_SYNTH_TASK) {
1496 		bool needs_mmap = rec->opts.synth & PERF_SYNTH_MMAP;
1497 
1498 		err = __machine__synthesize_threads(machine, tool, &opts->target,
1499 						    rec->evlist->core.threads,
1500 						    f, needs_mmap, opts->sample_address,
1501 						    rec->opts.nr_threads_synthesize);
1502 	}
1503 
1504 	if (rec->opts.nr_threads_synthesize > 1)
1505 		perf_set_singlethreaded();
1506 
1507 out:
1508 	return err;
1509 }
1510 
1511 static int record__process_signal_event(union perf_event *event __maybe_unused, void *data)
1512 {
1513 	struct record *rec = data;
1514 	pthread_kill(rec->thread_id, SIGUSR2);
1515 	return 0;
1516 }
1517 
1518 static int record__setup_sb_evlist(struct record *rec)
1519 {
1520 	struct record_opts *opts = &rec->opts;
1521 
1522 	if (rec->sb_evlist != NULL) {
1523 		/*
1524 		 * We get here if --switch-output-event populated the
1525 		 * sb_evlist, so associate a callback that will send a SIGUSR2
1526 		 * to the main thread.
1527 		 */
1528 		evlist__set_cb(rec->sb_evlist, record__process_signal_event, rec);
1529 		rec->thread_id = pthread_self();
1530 	}
1531 #ifdef HAVE_LIBBPF_SUPPORT
1532 	if (!opts->no_bpf_event) {
1533 		if (rec->sb_evlist == NULL) {
1534 			rec->sb_evlist = evlist__new();
1535 
1536 			if (rec->sb_evlist == NULL) {
1537 				pr_err("Couldn't create side band evlist.\n");
1538 				return -1;
1539 			}
1540 		}
1541 
1542 		if (evlist__add_bpf_sb_event(rec->sb_evlist, &rec->session->header.env)) {
1543 			pr_err("Couldn't ask for PERF_RECORD_BPF_EVENT side band events.\n");
1544 			return -1;
1545 		}
1546 	}
1547 #endif
1548 	if (evlist__start_sb_thread(rec->sb_evlist, &rec->opts.target)) {
1549 		pr_debug("Couldn't start the BPF side band thread:\nBPF programs starting from now on won't be annotatable\n");
1550 		opts->no_bpf_event = true;
1551 	}
1552 
1553 	return 0;
1554 }
1555 
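/*
 * When a user-selected clockid is in use, store the clockid and its
 * resolution in the header environment together with a pair of reference
 * timestamps (time of day and the selected clock) taken back to back, so
 * that tools can later convert perf timestamps to wall-clock time.
 */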
1556 static int record__init_clock(struct record *rec)
1557 {
1558 	struct perf_session *session = rec->session;
1559 	struct timespec ref_clockid;
1560 	struct timeval ref_tod;
1561 	u64 ref;
1562 
1563 	if (!rec->opts.use_clockid)
1564 		return 0;
1565 
1566 	if (rec->opts.use_clockid && rec->opts.clockid_res_ns)
1567 		session->header.env.clock.clockid_res_ns = rec->opts.clockid_res_ns;
1568 
1569 	session->header.env.clock.clockid = rec->opts.clockid;
1570 
1571 	if (gettimeofday(&ref_tod, NULL) != 0) {
1572 		pr_err("gettimeofday failed, cannot set reference time.\n");
1573 		return -1;
1574 	}
1575 
1576 	if (clock_gettime(rec->opts.clockid, &ref_clockid)) {
1577 		pr_err("clock_gettime failed, cannot set reference time.\n");
1578 		return -1;
1579 	}
1580 
1581 	ref = (u64) ref_tod.tv_sec * NSEC_PER_SEC +
1582 	      (u64) ref_tod.tv_usec * NSEC_PER_USEC;
1583 
1584 	session->header.env.clock.tod_ns = ref;
1585 
1586 	ref = (u64) ref_clockid.tv_sec * NSEC_PER_SEC +
1587 	      (u64) ref_clockid.tv_nsec;
1588 
1589 	session->header.env.clock.clockid_ns = ref;
1590 	return 0;
1591 }
1592 
1593 static void hit_auxtrace_snapshot_trigger(struct record *rec)
1594 {
1595 	if (trigger_is_ready(&auxtrace_snapshot_trigger)) {
1596 		trigger_hit(&auxtrace_snapshot_trigger);
1597 		auxtrace_record__snapshot_started = 1;
1598 		if (auxtrace_record__snapshot_start(rec->itr))
1599 			trigger_error(&auxtrace_snapshot_trigger);
1600 	}
1601 }
1602 
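/*
 * On hybrid systems, rename hybrid events that were given without an
 * explicit PMU to "pmu_name/event/" so that samples from the different
 * core types can be told apart.
 */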
1603 static void record__uniquify_name(struct record *rec)
1604 {
1605 	struct evsel *pos;
1606 	struct evlist *evlist = rec->evlist;
1607 	char *new_name;
1608 	int ret;
1609 
1610 	if (!perf_pmu__has_hybrid())
1611 		return;
1612 
1613 	evlist__for_each_entry(evlist, pos) {
1614 		if (!evsel__is_hybrid(pos))
1615 			continue;
1616 
1617 		if (strchr(pos->name, '/'))
1618 			continue;
1619 
1620 		ret = asprintf(&new_name, "%s/%s/",
1621 			       pos->pmu_name, pos->name);
1622 		if (ret > 0) {
1623 			free(pos->name);
1624 			pos->name = new_name;
1625 		}
1626 	}
1627 }
1628 
1629 static int __cmd_record(struct record *rec, int argc, const char **argv)
1630 {
1631 	int err;
1632 	int status = 0;
1633 	unsigned long waking = 0;
1634 	const bool forks = argc > 0;
1635 	struct perf_tool *tool = &rec->tool;
1636 	struct record_opts *opts = &rec->opts;
1637 	struct perf_data *data = &rec->data;
1638 	struct perf_session *session;
1639 	bool disabled = false, draining = false;
1640 	int fd;
1641 	float ratio = 0;
1642 	enum evlist_ctl_cmd cmd = EVLIST_CTL_CMD_UNSUPPORTED;
1643 
1644 	atexit(record__sig_exit);
1645 	signal(SIGCHLD, sig_handler);
1646 	signal(SIGINT, sig_handler);
1647 	signal(SIGTERM, sig_handler);
1648 	signal(SIGSEGV, sigsegv_handler);
1649 
1650 	if (rec->opts.record_namespaces)
1651 		tool->namespace_events = true;
1652 
1653 	if (rec->opts.record_cgroup) {
1654 #ifdef HAVE_FILE_HANDLE
1655 		tool->cgroup_events = true;
1656 #else
1657 		pr_err("cgroup tracking is not supported\n");
1658 		return -1;
1659 #endif
1660 	}
1661 
1662 	if (rec->opts.auxtrace_snapshot_mode || rec->switch_output.enabled) {
1663 		signal(SIGUSR2, snapshot_sig_handler);
1664 		if (rec->opts.auxtrace_snapshot_mode)
1665 			trigger_on(&auxtrace_snapshot_trigger);
1666 		if (rec->switch_output.enabled)
1667 			trigger_on(&switch_output_trigger);
1668 	} else {
1669 		signal(SIGUSR2, SIG_IGN);
1670 	}
1671 
1672 	session = perf_session__new(data, tool);
1673 	if (IS_ERR(session)) {
1674 		pr_err("Perf session creation failed.\n");
1675 		return PTR_ERR(session);
1676 	}
1677 
1678 	fd = perf_data__fd(data);
1679 	rec->session = session;
1680 
1681 	if (zstd_init(&session->zstd_data, rec->opts.comp_level) < 0) {
1682 		pr_err("Compression initialization failed.\n");
1683 		return -1;
1684 	}
1685 #ifdef HAVE_EVENTFD_SUPPORT
1686 	done_fd = eventfd(0, EFD_NONBLOCK);
1687 	if (done_fd < 0) {
1688 		pr_err("Failed to create wakeup eventfd, error: %m\n");
1689 		status = -1;
1690 		goto out_delete_session;
1691 	}
1692 	err = evlist__add_wakeup_eventfd(rec->evlist, done_fd);
1693 	if (err < 0) {
1694 		pr_err("Failed to add wakeup eventfd to poll list\n");
1695 		status = err;
1696 		goto out_delete_session;
1697 	}
1698 #endif // HAVE_EVENTFD_SUPPORT
1699 
1700 	session->header.env.comp_type  = PERF_COMP_ZSTD;
1701 	session->header.env.comp_level = rec->opts.comp_level;
1702 
1703 	if (rec->opts.kcore &&
1704 	    !record__kcore_readable(&session->machines.host)) {
1705 		pr_err("ERROR: kcore is not readable.\n");
1706 		return -1;
1707 	}
1708 
1709 	if (record__init_clock(rec))
1710 		return -1;
1711 
1712 	record__init_features(rec);
1713 
1714 	if (forks) {
1715 		err = evlist__prepare_workload(rec->evlist, &opts->target, argv, data->is_pipe,
1716 					       workload_exec_failed_signal);
1717 		if (err < 0) {
1718 			pr_err("Couldn't run the workload!\n");
1719 			status = err;
1720 			goto out_delete_session;
1721 		}
1722 	}
1723 
1724 	/*
1725 	 * If we have just a single event and are sending data
1726 	 * through a pipe, we need to force id allocation,
1727 	 * because we synthesize the event name through the pipe
1728 	 * and need the id for that.
1729 	 */
1730 	if (data->is_pipe && rec->evlist->core.nr_entries == 1)
1731 		rec->opts.sample_id = true;
1732 
1733 	record__uniquify_name(rec);
1734 
1735 	if (record__open(rec) != 0) {
1736 		err = -1;
1737 		goto out_child;
1738 	}
1739 	session->header.env.comp_mmap_len = session->evlist->core.mmap_len;
1740 
1741 	if (rec->opts.kcore) {
1742 		err = record__kcore_copy(&session->machines.host, data);
1743 		if (err) {
1744 			pr_err("ERROR: Failed to copy kcore\n");
1745 			goto out_child;
1746 		}
1747 	}
1748 
1749 	err = bpf__apply_obj_config();
1750 	if (err) {
1751 		char errbuf[BUFSIZ];
1752 
1753 		bpf__strerror_apply_obj_config(err, errbuf, sizeof(errbuf));
1754 		pr_err("ERROR: Apply config to BPF failed: %s\n",
1755 			 errbuf);
1756 		goto out_child;
1757 	}
1758 
1759 	/*
1760 	 * Normally perf_session__new would do this, but it doesn't have the
1761 	 * evlist.
1762 	 */
1763 	if (rec->tool.ordered_events && !evlist__sample_id_all(rec->evlist)) {
1764 		pr_warning("WARNING: No sample_id_all support, falling back to unordered processing\n");
1765 		rec->tool.ordered_events = false;
1766 	}
1767 
1768 	if (!rec->evlist->core.nr_groups)
1769 		perf_header__clear_feat(&session->header, HEADER_GROUP_DESC);
1770 
1771 	if (data->is_pipe) {
1772 		err = perf_header__write_pipe(fd);
1773 		if (err < 0)
1774 			goto out_child;
1775 	} else {
1776 		err = perf_session__write_header(session, rec->evlist, fd, false);
1777 		if (err < 0)
1778 			goto out_child;
1779 	}
1780 
1781 	err = -1;
1782 	if (!rec->no_buildid
1783 	    && !perf_header__has_feat(&session->header, HEADER_BUILD_ID)) {
1784 		pr_err("Couldn't generate buildids. "
1785 		       "Use --no-buildid to profile anyway.\n");
1786 		goto out_child;
1787 	}
1788 
1789 	err = record__setup_sb_evlist(rec);
1790 	if (err)
1791 		goto out_child;
1792 
1793 	err = record__synthesize(rec, false);
1794 	if (err < 0)
1795 		goto out_child;
1796 
1797 	if (rec->realtime_prio) {
1798 		struct sched_param param;
1799 
1800 		param.sched_priority = rec->realtime_prio;
1801 		if (sched_setscheduler(0, SCHED_FIFO, &param)) {
1802 			pr_err("Could not set realtime priority.\n");
1803 			err = -1;
1804 			goto out_child;
1805 		}
1806 	}
1807 
1808 	/*
1809 	 * When perf is starting the traced process, all the events
1810 	 * (apart from group members) have enable_on_exec=1 set,
1811 	 * so don't spoil it by prematurely enabling them.
1812 	 */
1813 	if (!target__none(&opts->target) && !opts->initial_delay)
1814 		evlist__enable(rec->evlist);
1815 
1816 	/*
1817 	 * Let the child rip
1818 	 */
1819 	if (forks) {
1820 		struct machine *machine = &session->machines.host;
1821 		union perf_event *event;
1822 		pid_t tgid;
1823 
1824 		event = malloc(sizeof(event->comm) + machine->id_hdr_size);
1825 		if (event == NULL) {
1826 			err = -ENOMEM;
1827 			goto out_child;
1828 		}
1829 
1830 		/*
1831 		 * Some H/W events are generated before the COMM event,
1832 		 * which is emitted during exec(), so perf script
1833 		 * cannot see the correct process name for those events.
1834 		 * Synthesize a COMM event to prevent it.
1835 		 */
1836 		tgid = perf_event__synthesize_comm(tool, event,
1837 						   rec->evlist->workload.pid,
1838 						   process_synthesized_event,
1839 						   machine);
1840 		free(event);
1841 
1842 		if (tgid == -1)
1843 			goto out_child;
1844 
1845 		event = malloc(sizeof(event->namespaces) +
1846 			       (NR_NAMESPACES * sizeof(struct perf_ns_link_info)) +
1847 			       machine->id_hdr_size);
1848 		if (event == NULL) {
1849 			err = -ENOMEM;
1850 			goto out_child;
1851 		}
1852 
1853 		/*
1854 		 * Synthesize NAMESPACES event for the command specified.
1855 		 */
1856 		perf_event__synthesize_namespaces(tool, event,
1857 						  rec->evlist->workload.pid,
1858 						  tgid, process_synthesized_event,
1859 						  machine);
1860 		free(event);
1861 
1862 		evlist__start_workload(rec->evlist);
1863 	}
1864 
1865 	if (evlist__initialize_ctlfd(rec->evlist, opts->ctl_fd, opts->ctl_fd_ack))
1866 		goto out_child;
1867 
1868 	if (opts->initial_delay) {
1869 		pr_info(EVLIST_DISABLED_MSG);
1870 		if (opts->initial_delay > 0) {
1871 			usleep(opts->initial_delay * USEC_PER_MSEC);
1872 			evlist__enable(rec->evlist);
1873 			pr_info(EVLIST_ENABLED_MSG);
1874 		}
1875 	}
1876 
1877 	trigger_ready(&auxtrace_snapshot_trigger);
1878 	trigger_ready(&switch_output_trigger);
1879 	perf_hooks__invoke_record_start();
1880 	for (;;) {
1881 		unsigned long long hits = rec->samples;
1882 
1883 		/*
1884 		 * rec->evlist->bkw_mmap_state may be
1885 		 * BKW_MMAP_EMPTY here: when done == true and
1886 		 * hits != rec->samples in the previous round.
1887 		 *
1888 		 * evlist__toggle_bkw_mmap() ensures we never
1889 		 * convert BKW_MMAP_EMPTY to BKW_MMAP_DATA_PENDING.
1890 		 */
1891 		if (trigger_is_hit(&switch_output_trigger) || done || draining)
1892 			evlist__toggle_bkw_mmap(rec->evlist, BKW_MMAP_DATA_PENDING);
1893 
1894 		if (record__mmap_read_all(rec, false) < 0) {
1895 			trigger_error(&auxtrace_snapshot_trigger);
1896 			trigger_error(&switch_output_trigger);
1897 			err = -1;
1898 			goto out_child;
1899 		}
1900 
1901 		if (auxtrace_record__snapshot_started) {
1902 			auxtrace_record__snapshot_started = 0;
1903 			if (!trigger_is_error(&auxtrace_snapshot_trigger))
1904 				record__read_auxtrace_snapshot(rec, false);
1905 			if (trigger_is_error(&auxtrace_snapshot_trigger)) {
1906 				pr_err("AUX area tracing snapshot failed\n");
1907 				err = -1;
1908 				goto out_child;
1909 			}
1910 		}
1911 
1912 		if (trigger_is_hit(&switch_output_trigger)) {
1913 			/*
1914 			 * If switch_output_trigger is hit, the data in the
1915 			 * overwritable ring buffer should have been collected,
1916 			 * so bkw_mmap_state should be set to BKW_MMAP_EMPTY.
1917 			 *
1918 			 * If SIGUSR2 is raised after or during record__mmap_read_all(),
1919 			 * record__mmap_read_all() didn't collect data from the
1920 			 * overwritable ring buffer. Read again.
1921 			 */
1922 			if (rec->evlist->bkw_mmap_state == BKW_MMAP_RUNNING)
1923 				continue;
1924 			trigger_ready(&switch_output_trigger);
1925 
1926 			/*
1927 			 * Re-enable events in the overwrite ring buffer after
1928 			 * record__mmap_read_all(): we should have collected
1929 			 * the data from it by now.
1930 			 */
1931 			evlist__toggle_bkw_mmap(rec->evlist, BKW_MMAP_RUNNING);
1932 
1933 			if (!quiet)
1934 				fprintf(stderr, "[ perf record: dump data: Woken up %ld times ]\n",
1935 					waking);
1936 			waking = 0;
1937 			fd = record__switch_output(rec, false);
1938 			if (fd < 0) {
1939 				pr_err("Failed to switch to new file\n");
1940 				trigger_error(&switch_output_trigger);
1941 				err = fd;
1942 				goto out_child;
1943 			}
1944 
1945 			/* re-arm the alarm */
1946 			if (rec->switch_output.time)
1947 				alarm(rec->switch_output.time);
1948 		}
1949 
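		/*
		 * No new samples were seen in this pass: stop if we are done
		 * or draining, otherwise block in poll() until the kernel
		 * wakes us up again.
		 */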
1950 		if (hits == rec->samples) {
1951 			if (done || draining)
1952 				break;
1953 			err = evlist__poll(rec->evlist, -1);
1954 			/*
1955 			 * Propagate the error only if there is one: ignore a positive
1956 			 * number of returned events and interrupted poll (EINTR).
1957 			 */
1958 			if (err > 0 || (err < 0 && errno == EINTR))
1959 				err = 0;
1960 			waking++;
1961 
1962 			if (evlist__filter_pollfd(rec->evlist, POLLERR | POLLHUP) == 0)
1963 				draining = true;
1964 		}
1965 
1966 		if (evlist__ctlfd_process(rec->evlist, &cmd) > 0) {
1967 			switch (cmd) {
1968 			case EVLIST_CTL_CMD_SNAPSHOT:
1969 				hit_auxtrace_snapshot_trigger(rec);
1970 				evlist__ctlfd_ack(rec->evlist);
1971 				break;
1972 			case EVLIST_CTL_CMD_STOP:
1973 				done = 1;
1974 				break;
1975 			case EVLIST_CTL_CMD_ACK:
1976 			case EVLIST_CTL_CMD_UNSUPPORTED:
1977 			case EVLIST_CTL_CMD_ENABLE:
1978 			case EVLIST_CTL_CMD_DISABLE:
1979 			case EVLIST_CTL_CMD_EVLIST:
1980 			case EVLIST_CTL_CMD_PING:
1981 			default:
1982 				break;
1983 			}
1984 		}
1985 
1986 		/*
1987 		 * When perf started the traced process itself, the events die
1988 		 * with the process at the end and we wait for that, so there is
1989 		 * no need to disable the events in that case.
1990 		 */
1991 		if (done && !disabled && !target__none(&opts->target)) {
1992 			trigger_off(&auxtrace_snapshot_trigger);
1993 			evlist__disable(rec->evlist);
1994 			disabled = true;
1995 		}
1996 	}
1997 
1998 	trigger_off(&auxtrace_snapshot_trigger);
1999 	trigger_off(&switch_output_trigger);
2000 
2001 	if (opts->auxtrace_snapshot_on_exit)
2002 		record__auxtrace_snapshot_exit(rec);
2003 
2004 	if (forks && workload_exec_errno) {
2005 		char msg[STRERR_BUFSIZE], strevsels[2048];
2006 		const char *emsg = str_error_r(workload_exec_errno, msg, sizeof(msg));
2007 
2008 		evlist__scnprintf_evsels(rec->evlist, sizeof(strevsels), strevsels);
2009 
2010 		pr_err("Failed to collect '%s' for the '%s' workload: %s\n",
2011 			strevsels, argv[0], emsg);
2012 		err = -1;
2013 		goto out_child;
2014 	}
2015 
2016 	if (!quiet)
2017 		fprintf(stderr, "[ perf record: Woken up %ld times to write data ]\n", waking);
2018 
2019 	if (target__none(&rec->opts.target))
2020 		record__synthesize_workload(rec, true);
2021 
2022 out_child:
2023 	evlist__finalize_ctlfd(rec->evlist);
2024 	record__mmap_read_all(rec, true);
2025 	record__aio_mmap_read_sync(rec);
2026 
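	/*
	 * Record the overall compression ratio (bytes collected vs. bytes
	 * written after compression), rounded to the nearest integer, in the
	 * perf.data header.
	 */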
2027 	if (rec->session->bytes_transferred && rec->session->bytes_compressed) {
2028 		ratio = (float)rec->session->bytes_transferred/(float)rec->session->bytes_compressed;
2029 		session->header.env.comp_ratio = ratio + 0.5;
2030 	}
2031 
2032 	if (forks) {
2033 		int exit_status;
2034 
2035 		if (!child_finished)
2036 			kill(rec->evlist->workload.pid, SIGTERM);
2037 
2038 		wait(&exit_status);
2039 
2040 		if (err < 0)
2041 			status = err;
2042 		else if (WIFEXITED(exit_status))
2043 			status = WEXITSTATUS(exit_status);
2044 		else if (WIFSIGNALED(exit_status))
2045 			signr = WTERMSIG(exit_status);
2046 	} else
2047 		status = err;
2048 
2049 	record__synthesize(rec, true);
2050 	/* this will be recalculated during process_buildids() */
2051 	rec->samples = 0;
2052 
2053 	if (!err) {
2054 		if (!rec->timestamp_filename) {
2055 			record__finish_output(rec);
2056 		} else {
2057 			fd = record__switch_output(rec, true);
2058 			if (fd < 0) {
2059 				status = fd;
2060 				goto out_delete_session;
2061 			}
2062 		}
2063 	}
2064 
2065 	perf_hooks__invoke_record_end();
2066 
2067 	if (!err && !quiet) {
2068 		char samples[128];
2069 		const char *postfix = rec->timestamp_filename ?
2070 					".<timestamp>" : "";
2071 
2072 		if (rec->samples && !rec->opts.full_auxtrace)
2073 			scnprintf(samples, sizeof(samples),
2074 				  " (%" PRIu64 " samples)", rec->samples);
2075 		else
2076 			samples[0] = '\0';
2077 
2078 		fprintf(stderr,	"[ perf record: Captured and wrote %.3f MB %s%s%s",
2079 			perf_data__size(data) / 1024.0 / 1024.0,
2080 			data->path, postfix, samples);
2081 		if (ratio) {
2082 			fprintf(stderr,	", compressed (original %.3f MB, ratio is %.3f)",
2083 					rec->session->bytes_transferred / 1024.0 / 1024.0,
2084 					ratio);
2085 		}
2086 		fprintf(stderr, " ]\n");
2087 	}
2088 
2089 out_delete_session:
2090 #ifdef HAVE_EVENTFD_SUPPORT
2091 	if (done_fd >= 0)
2092 		close(done_fd);
2093 #endif
2094 	zstd_fini(&session->zstd_data);
2095 	perf_session__delete(session);
2096 
2097 	if (!opts->no_bpf_event)
2098 		evlist__stop_sb_thread(rec->sb_evlist);
2099 	return status;
2100 }
2101 
2102 static void callchain_debug(struct callchain_param *callchain)
2103 {
2104 	static const char *str[CALLCHAIN_MAX] = { "NONE", "FP", "DWARF", "LBR" };
2105 
2106 	pr_debug("callchain: type %s\n", str[callchain->record_mode]);
2107 
2108 	if (callchain->record_mode == CALLCHAIN_DWARF)
2109 		pr_debug("callchain: stack dump size %d\n",
2110 			 callchain->dump_size);
2111 }
2112 
2113 int record_opts__parse_callchain(struct record_opts *record,
2114 				 struct callchain_param *callchain,
2115 				 const char *arg, bool unset)
2116 {
2117 	int ret;
2118 	callchain->enabled = !unset;
2119 
2120 	/* --no-call-graph */
2121 	if (unset) {
2122 		callchain->record_mode = CALLCHAIN_NONE;
2123 		pr_debug("callchain: disabled\n");
2124 		return 0;
2125 	}
2126 
2127 	ret = parse_callchain_record_opt(arg, callchain);
2128 	if (!ret) {
2129 		/* Enable data address sampling for DWARF unwind. */
2130 		if (callchain->record_mode == CALLCHAIN_DWARF)
2131 			record->sample_address = true;
2132 		callchain_debug(callchain);
2133 	}
2134 
2135 	return ret;
2136 }
2137 
2138 int record_parse_callchain_opt(const struct option *opt,
2139 			       const char *arg,
2140 			       int unset)
2141 {
2142 	return record_opts__parse_callchain(opt->value, &callchain_param, arg, unset);
2143 }
2144 
2145 int record_callchain_opt(const struct option *opt,
2146 			 const char *arg __maybe_unused,
2147 			 int unset __maybe_unused)
2148 {
2149 	struct callchain_param *callchain = opt->value;
2150 
2151 	callchain->enabled = true;
2152 
2153 	if (callchain->record_mode == CALLCHAIN_NONE)
2154 		callchain->record_mode = CALLCHAIN_FP;
2155 
2156 	callchain_debug(callchain);
2157 	return 0;
2158 }
2159 
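/*
 * Handle the 'perf record' specific perfconfig variables, e.g.
 * (illustrative values):
 *
 *   [record]
 *       build-id = cache | no-cache | skip | mmap
 *       call-graph = fp
 *       debuginfod = https://debuginfod.example.org
 */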
2160 static int perf_record_config(const char *var, const char *value, void *cb)
2161 {
2162 	struct record *rec = cb;
2163 
2164 	if (!strcmp(var, "record.build-id")) {
2165 		if (!strcmp(value, "cache"))
2166 			rec->no_buildid_cache = false;
2167 		else if (!strcmp(value, "no-cache"))
2168 			rec->no_buildid_cache = true;
2169 		else if (!strcmp(value, "skip"))
2170 			rec->no_buildid = true;
2171 		else if (!strcmp(value, "mmap"))
2172 			rec->buildid_mmap = true;
2173 		else
2174 			return -1;
2175 		return 0;
2176 	}
2177 	if (!strcmp(var, "record.call-graph")) {
2178 		var = "call-graph.record-mode";
2179 		return perf_default_config(var, value, cb);
2180 	}
2181 #ifdef HAVE_AIO_SUPPORT
2182 	if (!strcmp(var, "record.aio")) {
2183 		rec->opts.nr_cblocks = strtol(value, NULL, 0);
2184 		if (!rec->opts.nr_cblocks)
2185 			rec->opts.nr_cblocks = nr_cblocks_default;
2186 	}
2187 #endif
2188 	if (!strcmp(var, "record.debuginfod")) {
2189 		rec->debuginfod.urls = strdup(value);
2190 		if (!rec->debuginfod.urls)
2191 			return -ENOMEM;
2192 		rec->debuginfod.set = true;
2193 	}
2194 
2195 	return 0;
2196 }
2197 
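/*
 * Parse --affinity=node|cpu.  An unset or unrecognized value keeps the
 * default, PERF_AFFINITY_SYS.
 */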
2199 static int record__parse_affinity(const struct option *opt, const char *str, int unset)
2200 {
2201 	struct record_opts *opts = (struct record_opts *)opt->value;
2202 
2203 	if (unset || !str)
2204 		return 0;
2205 
2206 	if (!strcasecmp(str, "node"))
2207 		opts->affinity = PERF_AFFINITY_NODE;
2208 	else if (!strcasecmp(str, "cpu"))
2209 		opts->affinity = PERF_AFFINITY_CPU;
2210 
2211 	return 0;
2212 }
2213 
2214 static int record__mmap_cpu_mask_alloc(struct mmap_cpu_mask *mask, int nr_bits)
2215 {
2216 	mask->nbits = nr_bits;
2217 	mask->bits = bitmap_zalloc(mask->nbits);
2218 	if (!mask->bits)
2219 		return -ENOMEM;
2220 
2221 	return 0;
2222 }
2223 
2224 static void record__mmap_cpu_mask_free(struct mmap_cpu_mask *mask)
2225 {
2226 	bitmap_free(mask->bits);
2227 	mask->nbits = 0;
2228 }
2229 
2230 static int record__thread_mask_alloc(struct thread_mask *mask, int nr_bits)
2231 {
2232 	int ret;
2233 
2234 	ret = record__mmap_cpu_mask_alloc(&mask->maps, nr_bits);
2235 	if (ret) {
2236 		mask->affinity.bits = NULL;
2237 		return ret;
2238 	}
2239 
2240 	ret = record__mmap_cpu_mask_alloc(&mask->affinity, nr_bits);
2241 	if (ret) {
2242 		record__mmap_cpu_mask_free(&mask->maps);
2243 		mask->maps.bits = NULL;
2244 	}
2245 
2246 	return ret;
2247 }
2248 
2249 static void record__thread_mask_free(struct thread_mask *mask)
2250 {
2251 	record__mmap_cpu_mask_free(&mask->maps);
2252 	record__mmap_cpu_mask_free(&mask->affinity);
2253 }
2254 
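/*
 * Parse --max-size=<size>[BKMG]; e.g. (illustrative) '--max-size=2G' stops
 * the session once roughly 2GB of data has been written.
 */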
2255 static int parse_output_max_size(const struct option *opt,
2256 				 const char *str, int unset)
2257 {
2258 	unsigned long *s = (unsigned long *)opt->value;
2259 	static struct parse_tag tags_size[] = {
2260 		{ .tag  = 'B', .mult = 1       },
2261 		{ .tag  = 'K', .mult = 1 << 10 },
2262 		{ .tag  = 'M', .mult = 1 << 20 },
2263 		{ .tag  = 'G', .mult = 1 << 30 },
2264 		{ .tag  = 0 },
2265 	};
2266 	unsigned long val;
2267 
2268 	if (unset) {
2269 		*s = 0;
2270 		return 0;
2271 	}
2272 
2273 	val = parse_tag_value(str, tags_size);
2274 	if (val != (unsigned long) -1) {
2275 		*s = val;
2276 		return 0;
2277 	}
2278 
2279 	return -1;
2280 }
2281 
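/*
 * Parse -m/--mmap-pages=pages[,pages]: the first value sizes the data mmaps,
 * the optional second value sizes the AUX area tracing mmaps.  For example
 * (illustrative), '-m 512,128' asks for 512 data pages and 128 AUX pages.
 */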
2282 static int record__parse_mmap_pages(const struct option *opt,
2283 				    const char *str,
2284 				    int unset __maybe_unused)
2285 {
2286 	struct record_opts *opts = opt->value;
2287 	char *s, *p;
2288 	unsigned int mmap_pages;
2289 	int ret;
2290 
2291 	if (!str)
2292 		return -EINVAL;
2293 
2294 	s = strdup(str);
2295 	if (!s)
2296 		return -ENOMEM;
2297 
2298 	p = strchr(s, ',');
2299 	if (p)
2300 		*p = '\0';
2301 
2302 	if (*s) {
2303 		ret = __evlist__parse_mmap_pages(&mmap_pages, s);
2304 		if (ret)
2305 			goto out_free;
2306 		opts->mmap_pages = mmap_pages;
2307 	}
2308 
2309 	if (!p) {
2310 		ret = 0;
2311 		goto out_free;
2312 	}
2313 
2314 	ret = __evlist__parse_mmap_pages(&mmap_pages, p + 1);
2315 	if (ret)
2316 		goto out_free;
2317 
2318 	opts->auxtrace_mmap_pages = mmap_pages;
2319 
2320 out_free:
2321 	free(s);
2322 	return ret;
2323 }
2324 
2325 void __weak arch__add_leaf_frame_record_opts(struct record_opts *opts __maybe_unused)
2326 {
2327 }
2328 
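/*
 * Parse --control=fd:ctl-fd[,ack-fd] or --control=fifo:ctl-fifo[,ack-fifo].
 * Illustrative example, with fifos created beforehand:
 *
 *   mkfifo ctl.fifo ack.fifo
 *   perf record --control=fifo:ctl.fifo,ack.fifo -- sleep 600
 *
 * so that another process can write 'enable'/'disable'/'snapshot' commands
 * into ctl.fifo while recording.
 */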
2329 static int parse_control_option(const struct option *opt,
2330 				const char *str,
2331 				int unset __maybe_unused)
2332 {
2333 	struct record_opts *opts = opt->value;
2334 
2335 	return evlist__parse_control(str, &opts->ctl_fd, &opts->ctl_fd_ack, &opts->ctl_fd_close);
2336 }
2337 
2338 static void switch_output_size_warn(struct record *rec)
2339 {
2340 	u64 wakeup_size = evlist__mmap_size(rec->opts.mmap_pages);
2341 	struct switch_output *s = &rec->switch_output;
2342 
2343 	wakeup_size /= 2;
2344 
2345 	if (s->size < wakeup_size) {
2346 		char buf[100];
2347 
2348 		unit_number__scnprintf(buf, sizeof(buf), wakeup_size);
2349 		pr_warning("WARNING: switch-output data size lower than "
2350 			   "wakeup kernel buffer size (%s) "
2351 			   "expect bigger perf.data sizes\n", buf);
2352 	}
2353 }
2354 
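/*
 * Parse the --switch-output argument.  Illustrative examples of the three
 * accepted forms:
 *
 *   --switch-output=signal   rotate the output file on SIGUSR2
 *   --switch-output=100M     rotate after ~100MB of data has been written
 *   --switch-output=30s      rotate every 30 seconds
 */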
2355 static int switch_output_setup(struct record *rec)
2356 {
2357 	struct switch_output *s = &rec->switch_output;
2358 	static struct parse_tag tags_size[] = {
2359 		{ .tag  = 'B', .mult = 1       },
2360 		{ .tag  = 'K', .mult = 1 << 10 },
2361 		{ .tag  = 'M', .mult = 1 << 20 },
2362 		{ .tag  = 'G', .mult = 1 << 30 },
2363 		{ .tag  = 0 },
2364 	};
2365 	static struct parse_tag tags_time[] = {
2366 		{ .tag  = 's', .mult = 1        },
2367 		{ .tag  = 'm', .mult = 60       },
2368 		{ .tag  = 'h', .mult = 60*60    },
2369 		{ .tag  = 'd', .mult = 60*60*24 },
2370 		{ .tag  = 0 },
2371 	};
2372 	unsigned long val;
2373 
2374 	/*
2375 	 * If we're using --switch-output-events, then we imply
2376 	 * --switch-output=signal, as we'll send a SIGUSR2 from the side band
2377 	 * thread to its parent.
2378 	 */
2379 	if (rec->switch_output_event_set)
2380 		goto do_signal;
2381 
2382 	if (!s->set)
2383 		return 0;
2384 
2385 	if (!strcmp(s->str, "signal")) {
2386 do_signal:
2387 		s->signal = true;
2388 		pr_debug("switch-output with SIGUSR2 signal\n");
2389 		goto enabled;
2390 	}
2391 
2392 	val = parse_tag_value(s->str, tags_size);
2393 	if (val != (unsigned long) -1) {
2394 		s->size = val;
2395 		pr_debug("switch-output with %s size threshold\n", s->str);
2396 		goto enabled;
2397 	}
2398 
2399 	val = parse_tag_value(s->str, tags_time);
2400 	if (val != (unsigned long) -1) {
2401 		s->time = val;
2402 		pr_debug("switch-output with %s time threshold (%lu seconds)\n",
2403 			 s->str, s->time);
2404 		goto enabled;
2405 	}
2406 
2407 	return -1;
2408 
2409 enabled:
2410 	rec->timestamp_filename = true;
2411 	s->enabled              = true;
2412 
2413 	if (s->size && !rec->opts.no_buffering)
2414 		switch_output_size_warn(rec);
2415 
2416 	return 0;
2417 }
2418 
2419 static const char * const __record_usage[] = {
2420 	"perf record [<options>] [<command>]",
2421 	"perf record [<options>] -- <command> [<options>]",
2422 	NULL
2423 };
2424 const char * const *record_usage = __record_usage;
2425 
2426 static int build_id__process_mmap(struct perf_tool *tool, union perf_event *event,
2427 				  struct perf_sample *sample, struct machine *machine)
2428 {
2429 	/*
2430 	 * We already have the kernel maps, put in place via perf_session__create_kernel_maps(),
2431 	 * so there is no need to add them twice.
2432 	 */
2433 	if (!(event->header.misc & PERF_RECORD_MISC_USER))
2434 		return 0;
2435 	return perf_event__process_mmap(tool, event, sample, machine);
2436 }
2437 
2438 static int build_id__process_mmap2(struct perf_tool *tool, union perf_event *event,
2439 				   struct perf_sample *sample, struct machine *machine)
2440 {
2441 	/*
2442 	 * We already have the kernel maps, put in place via perf_session__create_kernel_maps(),
2443 	 * so there is no need to add them twice.
2444 	 */
2445 	if (!(event->header.misc & PERF_RECORD_MISC_USER))
2446 		return 0;
2447 
2448 	return perf_event__process_mmap2(tool, event, sample, machine);
2449 }
2450 
2451 static int process_timestamp_boundary(struct perf_tool *tool,
2452 				      union perf_event *event __maybe_unused,
2453 				      struct perf_sample *sample,
2454 				      struct machine *machine __maybe_unused)
2455 {
2456 	struct record *rec = container_of(tool, struct record, tool);
2457 
2458 	set_timestamp_boundary(rec, sample->time);
2459 	return 0;
2460 }
2461 
2462 static int parse_record_synth_option(const struct option *opt,
2463 				     const char *str,
2464 				     int unset __maybe_unused)
2465 {
2466 	struct record_opts *opts = opt->value;
2467 	char *p = strdup(str);
2468 
2469 	if (p == NULL)
2470 		return -1;
2471 
2472 	opts->synth = parse_synth_opt(p);
2473 	free(p);
2474 
2475 	if (opts->synth < 0) {
2476 		pr_err("Invalid synth option: %s\n", str);
2477 		return -1;
2478 	}
2479 	return 0;
2480 }
2481 
2482 /*
2483  * XXX Ideally this would be local to cmd_record() and passed to a record__new(),
2484  * because we need access to it in record__exit(), which is called after
2485  * cmd_record() exits, but since record_options needs to be accessible to
2486  * builtin-script, leave it here.
2487  *
2488  * At least we don't touch it in all the other functions here directly.
2489  *
2490  * Just say no to tons of global variables, sigh.
2491  */
2492 static struct record record = {
2493 	.opts = {
2494 		.sample_time	     = true,
2495 		.mmap_pages	     = UINT_MAX,
2496 		.user_freq	     = UINT_MAX,
2497 		.user_interval	     = ULLONG_MAX,
2498 		.freq		     = 4000,
2499 		.target		     = {
2500 			.uses_mmap   = true,
2501 			.default_per_cpu = true,
2502 		},
2503 		.mmap_flush          = MMAP_FLUSH_DEFAULT,
2504 		.nr_threads_synthesize = 1,
2505 		.ctl_fd              = -1,
2506 		.ctl_fd_ack          = -1,
2507 		.synth               = PERF_SYNTH_ALL,
2508 	},
2509 	.tool = {
2510 		.sample		= process_sample_event,
2511 		.fork		= perf_event__process_fork,
2512 		.exit		= perf_event__process_exit,
2513 		.comm		= perf_event__process_comm,
2514 		.namespaces	= perf_event__process_namespaces,
2515 		.mmap		= build_id__process_mmap,
2516 		.mmap2		= build_id__process_mmap2,
2517 		.itrace_start	= process_timestamp_boundary,
2518 		.aux		= process_timestamp_boundary,
2519 		.ordered_events	= true,
2520 	},
2521 };
2522 
2523 const char record_callchain_help[] = CALLCHAIN_RECORD_HELP
2524 	"\n\t\t\t\tDefault: fp";
2525 
2526 static bool dry_run;
2527 
2528 /*
2529  * XXX This will stay a global variable until we fix builtin-script.c to stop
2530  * messing with it and switch to using the library functions in perf_evlist
2531  * that came from builtin-record.c, i.e. use record_opts,
2532  * evlist__prepare_workload, etc. instead of fork+exec'ing 'perf record'
2533  * using pipes, etc.
2534  */
2535 static struct option __record_options[] = {
2536 	OPT_CALLBACK('e', "event", &record.evlist, "event",
2537 		     "event selector. use 'perf list' to list available events",
2538 		     parse_events_option),
2539 	OPT_CALLBACK(0, "filter", &record.evlist, "filter",
2540 		     "event filter", parse_filter),
2541 	OPT_CALLBACK_NOOPT(0, "exclude-perf", &record.evlist,
2542 			   NULL, "don't record events from perf itself",
2543 			   exclude_perf),
2544 	OPT_STRING('p', "pid", &record.opts.target.pid, "pid",
2545 		    "record events on existing process id"),
2546 	OPT_STRING('t', "tid", &record.opts.target.tid, "tid",
2547 		    "record events on existing thread id"),
2548 	OPT_INTEGER('r', "realtime", &record.realtime_prio,
2549 		    "collect data with this RT SCHED_FIFO priority"),
2550 	OPT_BOOLEAN(0, "no-buffering", &record.opts.no_buffering,
2551 		    "collect data without buffering"),
2552 	OPT_BOOLEAN('R', "raw-samples", &record.opts.raw_samples,
2553 		    "collect raw sample records from all opened counters"),
2554 	OPT_BOOLEAN('a', "all-cpus", &record.opts.target.system_wide,
2555 			    "system-wide collection from all CPUs"),
2556 	OPT_STRING('C', "cpu", &record.opts.target.cpu_list, "cpu",
2557 		    "list of cpus to monitor"),
2558 	OPT_U64('c', "count", &record.opts.user_interval, "event period to sample"),
2559 	OPT_STRING('o', "output", &record.data.path, "file",
2560 		    "output file name"),
2561 	OPT_BOOLEAN_SET('i', "no-inherit", &record.opts.no_inherit,
2562 			&record.opts.no_inherit_set,
2563 			"child tasks do not inherit counters"),
2564 	OPT_BOOLEAN(0, "tail-synthesize", &record.opts.tail_synthesize,
2565 		    "synthesize non-sample events at the end of output"),
2566 	OPT_BOOLEAN(0, "overwrite", &record.opts.overwrite, "use overwrite mode"),
2567 	OPT_BOOLEAN(0, "no-bpf-event", &record.opts.no_bpf_event, "do not record bpf events"),
2568 	OPT_BOOLEAN(0, "strict-freq", &record.opts.strict_freq,
2569 		    "Fail if the specified frequency can't be used"),
2570 	OPT_CALLBACK('F', "freq", &record.opts, "freq or 'max'",
2571 		     "profile at this frequency",
2572 		      record__parse_freq),
2573 	OPT_CALLBACK('m', "mmap-pages", &record.opts, "pages[,pages]",
2574 		     "number of mmap data pages and AUX area tracing mmap pages",
2575 		     record__parse_mmap_pages),
2576 	OPT_CALLBACK(0, "mmap-flush", &record.opts, "number",
2577 		     "Minimal number of bytes that is extracted from mmap data pages (default: 1)",
2578 		     record__mmap_flush_parse),
2579 	OPT_BOOLEAN(0, "group", &record.opts.group,
2580 		    "put the counters into a counter group"),
2581 	OPT_CALLBACK_NOOPT('g', NULL, &callchain_param,
2582 			   NULL, "enables call-graph recording" ,
2583 			   &record_callchain_opt),
2584 	OPT_CALLBACK(0, "call-graph", &record.opts,
2585 		     "record_mode[,record_size]", record_callchain_help,
2586 		     &record_parse_callchain_opt),
2587 	OPT_INCR('v', "verbose", &verbose,
2588 		    "be more verbose (show counter open errors, etc)"),
2589 	OPT_BOOLEAN('q', "quiet", &quiet, "don't print any message"),
2590 	OPT_BOOLEAN('s', "stat", &record.opts.inherit_stat,
2591 		    "per thread counts"),
2592 	OPT_BOOLEAN('d', "data", &record.opts.sample_address, "Record the sample addresses"),
2593 	OPT_BOOLEAN(0, "phys-data", &record.opts.sample_phys_addr,
2594 		    "Record the sample physical addresses"),
2595 	OPT_BOOLEAN(0, "data-page-size", &record.opts.sample_data_page_size,
2596 		    "Record the sampled data address data page size"),
2597 	OPT_BOOLEAN(0, "code-page-size", &record.opts.sample_code_page_size,
2598 		    "Record the sampled code address (ip) page size"),
2599 	OPT_BOOLEAN(0, "sample-cpu", &record.opts.sample_cpu, "Record the sample cpu"),
2600 	OPT_BOOLEAN_SET('T', "timestamp", &record.opts.sample_time,
2601 			&record.opts.sample_time_set,
2602 			"Record the sample timestamps"),
2603 	OPT_BOOLEAN_SET('P', "period", &record.opts.period, &record.opts.period_set,
2604 			"Record the sample period"),
2605 	OPT_BOOLEAN('n', "no-samples", &record.opts.no_samples,
2606 		    "don't sample"),
2607 	OPT_BOOLEAN_SET('N', "no-buildid-cache", &record.no_buildid_cache,
2608 			&record.no_buildid_cache_set,
2609 			"do not update the buildid cache"),
2610 	OPT_BOOLEAN_SET('B', "no-buildid", &record.no_buildid,
2611 			&record.no_buildid_set,
2612 			"do not collect buildids in perf.data"),
2613 	OPT_CALLBACK('G', "cgroup", &record.evlist, "name",
2614 		     "monitor event in cgroup name only",
2615 		     parse_cgroups),
2616 	OPT_INTEGER('D', "delay", &record.opts.initial_delay,
2617 		  "ms to wait before starting measurement after program start (-1: start with events disabled)"),
2618 	OPT_BOOLEAN(0, "kcore", &record.opts.kcore, "copy /proc/kcore"),
2619 	OPT_STRING('u', "uid", &record.opts.target.uid_str, "user",
2620 		   "user to profile"),
2621 
2622 	OPT_CALLBACK_NOOPT('b', "branch-any", &record.opts.branch_stack,
2623 		     "branch any", "sample any taken branches",
2624 		     parse_branch_stack),
2625 
2626 	OPT_CALLBACK('j', "branch-filter", &record.opts.branch_stack,
2627 		     "branch filter mask", "branch stack filter modes",
2628 		     parse_branch_stack),
2629 	OPT_BOOLEAN('W', "weight", &record.opts.sample_weight,
2630 		    "sample by weight (on special events only)"),
2631 	OPT_BOOLEAN(0, "transaction", &record.opts.sample_transaction,
2632 		    "sample transaction flags (special events only)"),
2633 	OPT_BOOLEAN(0, "per-thread", &record.opts.target.per_thread,
2634 		    "use per-thread mmaps"),
2635 	OPT_CALLBACK_OPTARG('I', "intr-regs", &record.opts.sample_intr_regs, NULL, "any register",
2636 		    "sample selected machine registers on interrupt,"
2637 		    " use '-I?' to list register names", parse_intr_regs),
2638 	OPT_CALLBACK_OPTARG(0, "user-regs", &record.opts.sample_user_regs, NULL, "any register",
2639 		    "sample selected machine registers on interrupt,"
2640 		    " use '--user-regs=?' to list register names", parse_user_regs),
2641 	OPT_BOOLEAN(0, "running-time", &record.opts.running_time,
2642 		    "Record running/enabled time of read (:S) events"),
2643 	OPT_CALLBACK('k', "clockid", &record.opts,
2644 	"clockid", "clockid to use for events, see clock_gettime()",
2645 	parse_clockid),
2646 	OPT_STRING_OPTARG('S', "snapshot", &record.opts.auxtrace_snapshot_opts,
2647 			  "opts", "AUX area tracing Snapshot Mode", ""),
2648 	OPT_STRING_OPTARG(0, "aux-sample", &record.opts.auxtrace_sample_opts,
2649 			  "opts", "sample AUX area", ""),
2650 	OPT_UINTEGER(0, "proc-map-timeout", &proc_map_timeout,
2651 			"per thread proc mmap processing timeout in ms"),
2652 	OPT_BOOLEAN(0, "namespaces", &record.opts.record_namespaces,
2653 		    "Record namespaces events"),
2654 	OPT_BOOLEAN(0, "all-cgroups", &record.opts.record_cgroup,
2655 		    "Record cgroup events"),
2656 	OPT_BOOLEAN_SET(0, "switch-events", &record.opts.record_switch_events,
2657 			&record.opts.record_switch_events_set,
2658 			"Record context switch events"),
2659 	OPT_BOOLEAN_FLAG(0, "all-kernel", &record.opts.all_kernel,
2660 			 "Configure all used events to run in kernel space.",
2661 			 PARSE_OPT_EXCLUSIVE),
2662 	OPT_BOOLEAN_FLAG(0, "all-user", &record.opts.all_user,
2663 			 "Configure all used events to run in user space.",
2664 			 PARSE_OPT_EXCLUSIVE),
2665 	OPT_BOOLEAN(0, "kernel-callchains", &record.opts.kernel_callchains,
2666 		    "collect kernel callchains"),
2667 	OPT_BOOLEAN(0, "user-callchains", &record.opts.user_callchains,
2668 		    "collect user callchains"),
2669 	OPT_STRING(0, "clang-path", &llvm_param.clang_path, "clang path",
2670 		   "clang binary to use for compiling BPF scriptlets"),
2671 	OPT_STRING(0, "clang-opt", &llvm_param.clang_opt, "clang options",
2672 		   "options passed to clang when compiling BPF scriptlets"),
2673 	OPT_STRING(0, "vmlinux", &symbol_conf.vmlinux_name,
2674 		   "file", "vmlinux pathname"),
2675 	OPT_BOOLEAN(0, "buildid-all", &record.buildid_all,
2676 		    "Record build-id of all DSOs regardless of hits"),
2677 	OPT_BOOLEAN(0, "buildid-mmap", &record.buildid_mmap,
2678 		    "Record build-id in map events"),
2679 	OPT_BOOLEAN(0, "timestamp-filename", &record.timestamp_filename,
2680 		    "append timestamp to output filename"),
2681 	OPT_BOOLEAN(0, "timestamp-boundary", &record.timestamp_boundary,
2682 		    "Record timestamp boundary (time of first/last samples)"),
2683 	OPT_STRING_OPTARG_SET(0, "switch-output", &record.switch_output.str,
2684 			  &record.switch_output.set, "signal or size[BKMG] or time[smhd]",
2685 			  "Switch output when receiving SIGUSR2 (signal) or cross a size or time threshold",
2686 			  "signal"),
2687 	OPT_CALLBACK_SET(0, "switch-output-event", &record.sb_evlist, &record.switch_output_event_set, "switch output event",
2688 			 "switch output event selector. use 'perf list' to list available events",
2689 			 parse_events_option_new_evlist),
2690 	OPT_INTEGER(0, "switch-max-files", &record.switch_output.num_files,
2691 		   "Limit number of switch output generated files"),
2692 	OPT_BOOLEAN(0, "dry-run", &dry_run,
2693 		    "Parse options then exit"),
2694 #ifdef HAVE_AIO_SUPPORT
2695 	OPT_CALLBACK_OPTARG(0, "aio", &record.opts,
2696 		     &nr_cblocks_default, "n", "Use <n> control blocks in asynchronous trace writing mode (default: 1, max: 4)",
2697 		     record__aio_parse),
2698 #endif
2699 	OPT_CALLBACK(0, "affinity", &record.opts, "node|cpu",
2700 		     "Set affinity mask of trace reading thread to NUMA node cpu mask or cpu of processed mmap buffer",
2701 		     record__parse_affinity),
2702 #ifdef HAVE_ZSTD_SUPPORT
2703 	OPT_CALLBACK_OPTARG('z', "compression-level", &record.opts, &comp_level_default,
2704 			    "n", "Compressed records using specified level (default: 1 - fastest compression, 22 - greatest compression)",
2705 			    record__parse_comp_level),
2706 #endif
2707 	OPT_CALLBACK(0, "max-size", &record.output_max_size,
2708 		     "size", "Limit the maximum size of the output file", parse_output_max_size),
2709 	OPT_UINTEGER(0, "num-thread-synthesize",
2710 		     &record.opts.nr_threads_synthesize,
2711 		     "number of threads to run for event synthesis"),
2712 #ifdef HAVE_LIBPFM
2713 	OPT_CALLBACK(0, "pfm-events", &record.evlist, "event",
2714 		"libpfm4 event selector. use 'perf list' to list available events",
2715 		parse_libpfm_events_option),
2716 #endif
2717 	OPT_CALLBACK(0, "control", &record.opts, "fd:ctl-fd[,ack-fd] or fifo:ctl-fifo[,ack-fifo]",
2718 		     "Listen on ctl-fd descriptor for command to control measurement ('enable': enable events, 'disable': disable events,\n"
2719 		     "\t\t\t  'snapshot': AUX area tracing snapshot).\n"
2720 		     "\t\t\t  Optionally send control command completion ('ack\\n') to ack-fd descriptor.\n"
2721 		     "\t\t\t  Alternatively, ctl-fifo / ack-fifo will be opened and used as ctl-fd / ack-fd.",
2722 		      parse_control_option),
2723 	OPT_CALLBACK(0, "synth", &record.opts, "no|all|task|mmap|cgroup",
2724 		     "Fine-tune event synthesis: default=all", parse_record_synth_option),
2725 	OPT_STRING_OPTARG_SET(0, "debuginfod", &record.debuginfod.urls,
2726 			  &record.debuginfod.set, "debuginfod urls",
2727 			  "Enable debuginfod data retrieval from DEBUGINFOD_URLS or specified urls",
2728 			  "system"),
2729 	OPT_END()
2730 };
2731 
2732 struct option *record_options = __record_options;
2733 
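/* Set a bit in the mask for every CPU present in the cpu map. */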
2734 static void record__mmap_cpu_mask_init(struct mmap_cpu_mask *mask, struct perf_cpu_map *cpus)
2735 {
2736 	int c;
2737 
2738 	for (c = 0; c < cpus->nr; c++)
2739 		set_bit(cpus->map[c].cpu, mask->bits);
2740 }
2741 
2742 static void record__free_thread_masks(struct record *rec, int nr_threads)
2743 {
2744 	int t;
2745 
2746 	if (rec->thread_masks)
2747 		for (t = 0; t < nr_threads; t++)
2748 			record__thread_mask_free(&rec->thread_masks[t]);
2749 
2750 	zfree(&rec->thread_masks);
2751 }
2752 
2753 static int record__alloc_thread_masks(struct record *rec, int nr_threads, int nr_bits)
2754 {
2755 	int t, ret;
2756 
2757 	rec->thread_masks = zalloc(nr_threads * sizeof(*(rec->thread_masks)));
2758 	if (!rec->thread_masks) {
2759 		pr_err("Failed to allocate thread masks\n");
2760 		return -ENOMEM;
2761 	}
2762 
2763 	for (t = 0; t < nr_threads; t++) {
2764 		ret = record__thread_mask_alloc(&rec->thread_masks[t], nr_bits);
2765 		if (ret) {
2766 			pr_err("Failed to allocate thread masks[%d]\n", t);
2767 			goto out_free;
2768 		}
2769 	}
2770 
2771 	return 0;
2772 
2773 out_free:
2774 	record__free_thread_masks(rec, nr_threads);
2775 
2776 	return ret;
2777 }
2778 
2779 static int record__init_thread_default_masks(struct record *rec, struct perf_cpu_map *cpus)
2780 {
2781 	int ret;
2782 
2783 	ret = record__alloc_thread_masks(rec, 1, cpu__max_cpu().cpu);
2784 	if (ret)
2785 		return ret;
2786 
2787 	record__mmap_cpu_mask_init(&rec->thread_masks->maps, cpus);
2788 
2789 	rec->nr_threads = 1;
2790 
2791 	return 0;
2792 }
2793 
2794 static int record__init_thread_masks(struct record *rec)
2795 {
2796 	struct perf_cpu_map *cpus = rec->evlist->core.cpus;
2797 
2798 	return record__init_thread_default_masks(rec, cpus);
2799 }
2800 
2801 int cmd_record(int argc, const char **argv)
2802 {
2803 	int err;
2804 	struct record *rec = &record;
2805 	char errbuf[BUFSIZ];
2806 
2807 	setlocale(LC_ALL, "");
2808 
2809 #ifndef HAVE_LIBBPF_SUPPORT
2810 # define set_nobuild(s, l, c) set_option_nobuild(record_options, s, l, "NO_LIBBPF=1", c)
2811 	set_nobuild('\0', "clang-path", true);
2812 	set_nobuild('\0', "clang-opt", true);
2813 # undef set_nobuild
2814 #endif
2815 
2816 #ifndef HAVE_BPF_PROLOGUE
2817 # if !defined (HAVE_DWARF_SUPPORT)
2818 #  define REASON  "NO_DWARF=1"
2819 # elif !defined (HAVE_LIBBPF_SUPPORT)
2820 #  define REASON  "NO_LIBBPF=1"
2821 # else
2822 #  define REASON  "this architecture doesn't support BPF prologue"
2823 # endif
2824 # define set_nobuild(s, l, c) set_option_nobuild(record_options, s, l, REASON, c)
2825 	set_nobuild('\0', "vmlinux", true);
2826 # undef set_nobuild
2827 # undef REASON
2828 #endif
2829 
2830 	rec->opts.affinity = PERF_AFFINITY_SYS;
2831 
2832 	rec->evlist = evlist__new();
2833 	if (rec->evlist == NULL)
2834 		return -ENOMEM;
2835 
2836 	err = perf_config(perf_record_config, rec);
2837 	if (err)
2838 		return err;
2839 
2840 	argc = parse_options(argc, argv, record_options, record_usage,
2841 			    PARSE_OPT_STOP_AT_NON_OPTION);
2842 	if (quiet)
2843 		perf_quiet_option();
2844 
2845 	err = symbol__validate_sym_arguments();
2846 	if (err)
2847 		return err;
2848 
2849 	perf_debuginfod_setup(&record.debuginfod);
2850 
2851 	/* Make system wide (-a) the default target. */
2852 	if (!argc && target__none(&rec->opts.target))
2853 		rec->opts.target.system_wide = true;
2854 
2855 	if (nr_cgroups && !rec->opts.target.system_wide) {
2856 		usage_with_options_msg(record_usage, record_options,
2857 			"cgroup monitoring only available in system-wide mode");
2859 	}
2860 
2861 	if (rec->buildid_mmap) {
2862 		if (!perf_can_record_build_id()) {
2863 			pr_err("Failed: no support to record build id in mmap events, update your kernel.\n");
2864 			err = -EINVAL;
2865 			goto out_opts;
2866 		}
2867 		pr_debug("Enabling build id in mmap2 events.\n");
2868 		/* Enable mmap build id synthesizing. */
2869 		symbol_conf.buildid_mmap2 = true;
2870 		/* Enable perf_event_attr::build_id bit. */
2871 		rec->opts.build_id = true;
2872 		/* Disable build id cache. */
2873 		rec->no_buildid = true;
2874 	}
2875 
2876 	if (rec->opts.record_cgroup && !perf_can_record_cgroup()) {
2877 		pr_err("Kernel has no cgroup sampling support.\n");
2878 		err = -EINVAL;
2879 		goto out_opts;
2880 	}
2881 
2882 	if (rec->opts.kcore)
2883 		rec->data.is_dir = true;
2884 
2885 	if (rec->opts.comp_level != 0) {
2886 		pr_debug("Compression enabled, disabling build id collection at the end of the session.\n");
2887 		rec->no_buildid = true;
2888 	}
2889 
2890 	if (rec->opts.record_switch_events &&
2891 	    !perf_can_record_switch_events()) {
2892 		ui__error("kernel does not support recording context switch events\n");
2893 		parse_options_usage(record_usage, record_options, "switch-events", 0);
2894 		err = -EINVAL;
2895 		goto out_opts;
2896 	}
2897 
2898 	if (switch_output_setup(rec)) {
2899 		parse_options_usage(record_usage, record_options, "switch-output", 0);
2900 		err = -EINVAL;
2901 		goto out_opts;
2902 	}
2903 
2904 	if (rec->switch_output.time) {
2905 		signal(SIGALRM, alarm_sig_handler);
2906 		alarm(rec->switch_output.time);
2907 	}
2908 
2909 	if (rec->switch_output.num_files) {
2910 		rec->switch_output.filenames = calloc(rec->switch_output.num_files,
2911 						      sizeof(char *));
2912 		if (!rec->switch_output.filenames) {
2913 			err = -ENOMEM;
2914 			goto out_opts;
2915 		}
2916 	}
2917 
2918 	/*
2919 	 * Allow aliases to facilitate the lookup of symbols for address
2920 	 * filters. Refer to auxtrace_parse_filters().
2921 	 */
2922 	symbol_conf.allow_aliases = true;
2923 
2924 	symbol__init(NULL);
2925 
2926 	if (rec->opts.affinity != PERF_AFFINITY_SYS) {
2927 		rec->affinity_mask.nbits = cpu__max_cpu().cpu;
2928 		rec->affinity_mask.bits = bitmap_zalloc(rec->affinity_mask.nbits);
2929 		if (!rec->affinity_mask.bits) {
2930 			pr_err("Failed to allocate thread mask for %zd cpus\n", rec->affinity_mask.nbits);
2931 			err = -ENOMEM;
2932 			goto out_opts;
2933 		}
2934 		pr_debug2("thread mask[%zd]: empty\n", rec->affinity_mask.nbits);
2935 	}
2936 
2937 	err = record__auxtrace_init(rec);
2938 	if (err)
2939 		goto out;
2940 
2941 	if (dry_run)
2942 		goto out;
2943 
2944 	err = bpf__setup_stdout(rec->evlist);
2945 	if (err) {
2946 		bpf__strerror_setup_stdout(rec->evlist, err, errbuf, sizeof(errbuf));
2947 		pr_err("ERROR: Setup BPF stdout failed: %s\n",
2948 			 errbuf);
2949 		goto out;
2950 	}
2951 
2952 	err = -ENOMEM;
2953 
2954 	if (rec->no_buildid_cache || rec->no_buildid) {
2955 		disable_buildid_cache();
2956 	} else if (rec->switch_output.enabled) {
2957 		/*
2958 		 * In 'perf record --switch-output', disable buildid
2959 		 * generation by default to reduce data file switching
2960 		 * overhead. Still generate buildids if they are required
2961 		 * explicitly using
2962 		 *
2963 		 *  perf record --switch-output --no-no-buildid \
2964 		 *              --no-no-buildid-cache
2965 		 *
2966 		 * Following code equals to:
2967 		 *
2968 		 * if ((rec->no_buildid || !rec->no_buildid_set) &&
2969 		 *     (rec->no_buildid_cache || !rec->no_buildid_cache_set))
2970 		 *         disable_buildid_cache();
2971 		 */
2972 		bool disable = true;
2973 
2974 		if (rec->no_buildid_set && !rec->no_buildid)
2975 			disable = false;
2976 		if (rec->no_buildid_cache_set && !rec->no_buildid_cache)
2977 			disable = false;
2978 		if (disable) {
2979 			rec->no_buildid = true;
2980 			rec->no_buildid_cache = true;
2981 			disable_buildid_cache();
2982 		}
2983 	}
2984 
2985 	if (record.opts.overwrite)
2986 		record.opts.tail_synthesize = true;
2987 
2988 	if (rec->evlist->core.nr_entries == 0) {
2989 		if (perf_pmu__has_hybrid()) {
2990 			err = evlist__add_default_hybrid(rec->evlist,
2991 							 !record.opts.no_samples);
2992 		} else {
2993 			err = __evlist__add_default(rec->evlist,
2994 						    !record.opts.no_samples);
2995 		}
2996 
2997 		if (err < 0) {
2998 			pr_err("Not enough memory for event selector list\n");
2999 			goto out;
3000 		}
3001 	}
3002 
3003 	if (rec->opts.target.tid && !rec->opts.no_inherit_set)
3004 		rec->opts.no_inherit = true;
3005 
3006 	err = target__validate(&rec->opts.target);
3007 	if (err) {
3008 		target__strerror(&rec->opts.target, err, errbuf, BUFSIZ);
3009 		ui__warning("%s\n", errbuf);
3010 	}
3011 
3012 	err = target__parse_uid(&rec->opts.target);
3013 	if (err) {
3014 		int saved_errno = errno;
3015 
3016 		target__strerror(&rec->opts.target, err, errbuf, BUFSIZ);
3017 		ui__error("%s", errbuf);
3018 
3019 		err = -saved_errno;
3020 		goto out;
3021 	}
3022 
3023 	/* Enable ignoring missing threads when -u/-p option is defined. */
3024 	rec->opts.ignore_missing_thread = rec->opts.target.uid != UINT_MAX || rec->opts.target.pid;
3025 
3026 	if (evlist__fix_hybrid_cpus(rec->evlist, rec->opts.target.cpu_list)) {
3027 		pr_err("failed to use cpu list %s\n",
3028 		       rec->opts.target.cpu_list);
3029 		goto out;
3030 	}
3031 
3032 	rec->opts.target.hybrid = perf_pmu__has_hybrid();
3033 
3034 	if (callchain_param.enabled && callchain_param.record_mode == CALLCHAIN_FP)
3035 		arch__add_leaf_frame_record_opts(&rec->opts);
3036 
3037 	err = -ENOMEM;
3038 	if (evlist__create_maps(rec->evlist, &rec->opts.target) < 0)
3039 		usage_with_options(record_usage, record_options);
3040 
3041 	err = auxtrace_record__options(rec->itr, rec->evlist, &rec->opts);
3042 	if (err)
3043 		goto out;
3044 
3045 	/*
3046 	 * We take all buildids when the file contains AUX area
3047 	 * tracing data because we do not decode the trace, as
3048 	 * decoding would take too long.
3049 	 */
3050 	if (rec->opts.full_auxtrace)
3051 		rec->buildid_all = true;
3052 
3053 	if (rec->opts.text_poke) {
3054 		err = record__config_text_poke(rec->evlist);
3055 		if (err) {
3056 			pr_err("record__config_text_poke failed, error %d\n", err);
3057 			goto out;
3058 		}
3059 	}
3060 
3061 	if (record_opts__config(&rec->opts)) {
3062 		err = -EINVAL;
3063 		goto out;
3064 	}
3065 
3066 	err = record__init_thread_masks(rec);
3067 	if (err) {
3068 		pr_err("Failed to initialize parallel data streaming masks\n");
3069 		goto out;
3070 	}
3071 
3072 	if (rec->opts.nr_cblocks > nr_cblocks_max)
3073 		rec->opts.nr_cblocks = nr_cblocks_max;
3074 	pr_debug("nr_cblocks: %d\n", rec->opts.nr_cblocks);
3075 
3076 	pr_debug("affinity: %s\n", affinity_tags[rec->opts.affinity]);
3077 	pr_debug("mmap flush: %d\n", rec->opts.mmap_flush);
3078 
3079 	if (rec->opts.comp_level > comp_level_max)
3080 		rec->opts.comp_level = comp_level_max;
3081 	pr_debug("comp level: %d\n", rec->opts.comp_level);
3082 
3083 	err = __cmd_record(&record, argc, argv);
3084 out:
3085 	bitmap_free(rec->affinity_mask.bits);
3086 	evlist__delete(rec->evlist);
3087 	symbol__exit();
3088 	auxtrace_record__free(rec->itr);
3089 out_opts:
3090 	record__free_thread_masks(rec, rec->nr_threads);
3091 	rec->nr_threads = 0;
3092 	evlist__close_control(rec->opts.ctl_fd, rec->opts.ctl_fd_ack, &rec->opts.ctl_fd_close);
3093 	return err;
3094 }
3095 
3096 static void snapshot_sig_handler(int sig __maybe_unused)
3097 {
3098 	struct record *rec = &record;
3099 
3100 	hit_auxtrace_snapshot_trigger(rec);
3101 
3102 	if (switch_output_signal(rec))
3103 		trigger_hit(&switch_output_trigger);
3104 }
3105 
3106 static void alarm_sig_handler(int sig __maybe_unused)
3107 {
3108 	struct record *rec = &record;
3109 
3110 	if (switch_output_time(rec))
3111 		trigger_hit(&switch_output_trigger);
3112 }
3113