xref: /openbmc/linux/tools/perf/builtin-record.c (revision 6562c9ac)
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * builtin-record.c
4  *
5  * Builtin record command: Record the profile of a workload
6  * (or a CPU, or a PID) into the perf.data output file - for
7  * later analysis via perf report.
8  */
9 #include "builtin.h"
10 
11 #include "util/build-id.h"
12 #include <subcmd/parse-options.h>
13 #include "util/parse-events.h"
14 #include "util/config.h"
15 
16 #include "util/callchain.h"
17 #include "util/cgroup.h"
18 #include "util/header.h"
19 #include "util/event.h"
20 #include "util/evlist.h"
21 #include "util/evsel.h"
22 #include "util/debug.h"
23 #include "util/mmap.h"
24 #include "util/target.h"
25 #include "util/session.h"
26 #include "util/tool.h"
27 #include "util/symbol.h"
28 #include "util/record.h"
29 #include "util/cpumap.h"
30 #include "util/thread_map.h"
31 #include "util/data.h"
32 #include "util/perf_regs.h"
33 #include "util/auxtrace.h"
34 #include "util/tsc.h"
35 #include "util/parse-branch-options.h"
36 #include "util/parse-regs-options.h"
37 #include "util/perf_api_probe.h"
38 #include "util/llvm-utils.h"
39 #include "util/bpf-loader.h"
40 #include "util/trigger.h"
41 #include "util/perf-hooks.h"
42 #include "util/cpu-set-sched.h"
43 #include "util/synthetic-events.h"
44 #include "util/time-utils.h"
45 #include "util/units.h"
46 #include "util/bpf-event.h"
47 #include "util/util.h"
48 #include "util/pfm.h"
49 #include "util/clockid.h"
50 #include "util/pmu-hybrid.h"
51 #include "util/evlist-hybrid.h"
52 #include "util/off_cpu.h"
53 #include "asm/bug.h"
54 #include "perf.h"
55 #include "cputopo.h"
56 
57 #include <errno.h>
58 #include <inttypes.h>
59 #include <locale.h>
60 #include <poll.h>
61 #include <pthread.h>
62 #include <unistd.h>
63 #ifndef HAVE_GETTID
64 #include <syscall.h>
65 #endif
66 #include <sched.h>
67 #include <signal.h>
68 #ifdef HAVE_EVENTFD_SUPPORT
69 #include <sys/eventfd.h>
70 #endif
71 #include <sys/mman.h>
72 #include <sys/wait.h>
73 #include <sys/types.h>
74 #include <sys/stat.h>
75 #include <fcntl.h>
76 #include <linux/err.h>
77 #include <linux/string.h>
78 #include <linux/time64.h>
79 #include <linux/zalloc.h>
80 #include <linux/bitmap.h>
81 #include <sys/time.h>
82 
83 struct switch_output {
84 	bool		 enabled;
85 	bool		 signal;
86 	unsigned long	 size;
87 	unsigned long	 time;
88 	const char	*str;
89 	bool		 set;
90 	char		 **filenames;
91 	int		 num_files;
92 	int		 cur_file;
93 };
94 
95 struct thread_mask {
96 	struct mmap_cpu_mask	maps;
97 	struct mmap_cpu_mask	affinity;
98 };
99 
100 struct record_thread {
101 	pid_t			tid;
102 	struct thread_mask	*mask;
103 	struct {
104 		int		msg[2];
105 		int		ack[2];
106 	} pipes;
107 	struct fdarray		pollfd;
108 	int			ctlfd_pos;
109 	int			nr_mmaps;
110 	struct mmap		**maps;
111 	struct mmap		**overwrite_maps;
112 	struct record		*rec;
113 	unsigned long long	samples;
114 	unsigned long		waking;
115 	u64			bytes_written;
116 	u64			bytes_transferred;
117 	u64			bytes_compressed;
118 };
119 
120 static __thread struct record_thread *thread;
121 
122 enum thread_msg {
123 	THREAD_MSG__UNDEFINED = 0,
124 	THREAD_MSG__READY,
125 	THREAD_MSG__MAX,
126 };
127 
128 static const char *thread_msg_tags[THREAD_MSG__MAX] = {
129 	"UNDEFINED", "READY"
130 };
131 
132 enum thread_spec {
133 	THREAD_SPEC__UNDEFINED = 0,
134 	THREAD_SPEC__CPU,
135 	THREAD_SPEC__CORE,
136 	THREAD_SPEC__PACKAGE,
137 	THREAD_SPEC__NUMA,
138 	THREAD_SPEC__USER,
139 	THREAD_SPEC__MAX,
140 };
141 
142 static const char *thread_spec_tags[THREAD_SPEC__MAX] = {
143 	"undefined", "cpu", "core", "package", "numa", "user"
144 };
145 
146 struct pollfd_index_map {
147 	int evlist_pollfd_index;
148 	int thread_pollfd_index;
149 };
150 
151 struct record {
152 	struct perf_tool	tool;
153 	struct record_opts	opts;
154 	u64			bytes_written;
155 	struct perf_data	data;
156 	struct auxtrace_record	*itr;
157 	struct evlist	*evlist;
158 	struct perf_session	*session;
159 	struct evlist		*sb_evlist;
160 	pthread_t		thread_id;
161 	int			realtime_prio;
162 	bool			switch_output_event_set;
163 	bool			no_buildid;
164 	bool			no_buildid_set;
165 	bool			no_buildid_cache;
166 	bool			no_buildid_cache_set;
167 	bool			buildid_all;
168 	bool			buildid_mmap;
169 	bool			timestamp_filename;
170 	bool			timestamp_boundary;
171 	bool			off_cpu;
172 	struct switch_output	switch_output;
173 	unsigned long long	samples;
174 	unsigned long		output_max_size;	/* = 0: unlimited */
175 	struct perf_debuginfod	debuginfod;
176 	int			nr_threads;
177 	struct thread_mask	*thread_masks;
178 	struct record_thread	*thread_data;
179 	struct pollfd_index_map	*index_map;
180 	size_t			index_map_sz;
181 	size_t			index_map_cnt;
182 };
183 
184 static volatile int done;
185 
186 static volatile int auxtrace_record__snapshot_started;
187 static DEFINE_TRIGGER(auxtrace_snapshot_trigger);
188 static DEFINE_TRIGGER(switch_output_trigger);
189 
190 static const char *affinity_tags[PERF_AFFINITY_MAX] = {
191 	"SYS", "NODE", "CPU"
192 };
193 
194 #ifndef HAVE_GETTID
195 static inline pid_t gettid(void)
196 {
197 	return (pid_t)syscall(__NR_gettid);
198 }
199 #endif
200 
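/*
 * Parallel data streaming ("--threads") is enabled when opts.threads_spec is
 * non-zero, i.e. anything other than THREAD_SPEC__UNDEFINED.
 */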
201 static int record__threads_enabled(struct record *rec)
202 {
203 	return rec->opts.threads_spec;
204 }
205 
206 static bool switch_output_signal(struct record *rec)
207 {
208 	return rec->switch_output.signal &&
209 	       trigger_is_ready(&switch_output_trigger);
210 }
211 
212 static bool switch_output_size(struct record *rec)
213 {
214 	return rec->switch_output.size &&
215 	       trigger_is_ready(&switch_output_trigger) &&
216 	       (rec->bytes_written >= rec->switch_output.size);
217 }
218 
219 static bool switch_output_time(struct record *rec)
220 {
221 	return rec->switch_output.time &&
222 	       trigger_is_ready(&switch_output_trigger);
223 }
224 
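/*
 * Total bytes written so far: the main thread's counter plus the counter of
 * every parallel streaming thread. Used by the output size limit check in
 * record__output_max_size_exceeded().
 */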
225 static u64 record__bytes_written(struct record *rec)
226 {
227 	int t;
228 	u64 bytes_written = rec->bytes_written;
229 	struct record_thread *thread_data = rec->thread_data;
230 
231 	for (t = 0; t < rec->nr_threads; t++)
232 		bytes_written += thread_data[t].bytes_written;
233 
234 	return bytes_written;
235 }
236 
237 static bool record__output_max_size_exceeded(struct record *rec)
238 {
239 	return rec->output_max_size &&
240 	       (record__bytes_written(rec) >= rec->output_max_size);
241 }
242 
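/*
 * Write a block of data either to the per-CPU file backing the mmap
 * (threaded/directory mode, map->file set) or to the main perf.data file,
 * update the corresponding byte counter, and then check the output size
 * limit and the switch-output size trigger.
 */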
243 static int record__write(struct record *rec, struct mmap *map __maybe_unused,
244 			 void *bf, size_t size)
245 {
246 	struct perf_data_file *file = &rec->session->data->file;
247 
248 	if (map && map->file)
249 		file = map->file;
250 
251 	if (perf_data_file__write(file, bf, size) < 0) {
252 		pr_err("failed to write perf data, error: %m\n");
253 		return -1;
254 	}
255 
256 	if (map && map->file)
257 		thread->bytes_written += size;
258 	else
259 		rec->bytes_written += size;
260 
261 	if (record__output_max_size_exceeded(rec) && !done) {
262 		fprintf(stderr, "[ perf record: perf size limit reached (%" PRIu64 " KB),"
263 				" stopping session ]\n",
264 				record__bytes_written(rec) >> 10);
265 		done = 1;
266 	}
267 
268 	if (switch_output_size(rec))
269 		trigger_hit(&switch_output_trigger);
270 
271 	return 0;
272 }
273 
274 static int record__aio_enabled(struct record *rec);
275 static int record__comp_enabled(struct record *rec);
276 static size_t zstd_compress(struct perf_session *session, struct mmap *map,
277 			    void *dst, size_t dst_size, void *src, size_t src_size);
278 
279 #ifdef HAVE_AIO_SUPPORT
280 static int record__aio_write(struct aiocb *cblock, int trace_fd,
281 		void *buf, size_t size, off_t off)
282 {
283 	int rc;
284 
285 	cblock->aio_fildes = trace_fd;
286 	cblock->aio_buf    = buf;
287 	cblock->aio_nbytes = size;
288 	cblock->aio_offset = off;
289 	cblock->aio_sigevent.sigev_notify = SIGEV_NONE;
290 
291 	do {
292 		rc = aio_write(cblock);
293 		if (rc == 0) {
294 			break;
295 		} else if (errno != EAGAIN) {
296 			cblock->aio_fildes = -1;
297 			pr_err("failed to queue perf data, error: %m\n");
298 			break;
299 		}
300 	} while (1);
301 
302 	return rc;
303 }
304 
305 static int record__aio_complete(struct mmap *md, struct aiocb *cblock)
306 {
307 	void *rem_buf;
308 	off_t rem_off;
309 	size_t rem_size;
310 	int rc, aio_errno;
311 	ssize_t aio_ret, written;
312 
313 	aio_errno = aio_error(cblock);
314 	if (aio_errno == EINPROGRESS)
315 		return 0;
316 
317 	written = aio_ret = aio_return(cblock);
318 	if (aio_ret < 0) {
319 		if (aio_errno != EINTR)
320 			pr_err("failed to write perf data, error: %m\n");
321 		written = 0;
322 	}
323 
324 	rem_size = cblock->aio_nbytes - written;
325 
326 	if (rem_size == 0) {
327 		cblock->aio_fildes = -1;
328 		/*
329 		 * md->refcount is incremented in record__aio_pushfn() for
330 		 * every aio write request started in record__aio_push() so
331 		 * decrement it because the request is now complete.
332 		 */
333 		perf_mmap__put(&md->core);
334 		rc = 1;
335 	} else {
336 		/*
337 		 * The aio write request may require a restart with the
338 		 * remainder if the kernel didn't write the whole
339 		 * chunk at once.
340 		 */
341 		rem_off = cblock->aio_offset + written;
342 		rem_buf = (void *)(cblock->aio_buf + written);
343 		record__aio_write(cblock, cblock->aio_fildes,
344 				rem_buf, rem_size, rem_off);
345 		rc = 0;
346 	}
347 
348 	return rc;
349 }
350 
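/*
 * Reap completed aio writes on this mmap. With sync_all == false, return the
 * index of the first free control block (waiting in aio_suspend() if
 * necessary); with sync_all == true, wait until every outstanding write has
 * completed and return -1.
 */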
351 static int record__aio_sync(struct mmap *md, bool sync_all)
352 {
353 	struct aiocb **aiocb = md->aio.aiocb;
354 	struct aiocb *cblocks = md->aio.cblocks;
355 	struct timespec timeout = { 0, 1000 * 1000  * 1 }; /* 1ms */
356 	int i, do_suspend;
357 
358 	do {
359 		do_suspend = 0;
360 		for (i = 0; i < md->aio.nr_cblocks; ++i) {
361 			if (cblocks[i].aio_fildes == -1 || record__aio_complete(md, &cblocks[i])) {
362 				if (sync_all)
363 					aiocb[i] = NULL;
364 				else
365 					return i;
366 			} else {
367 				/*
368 				 * A started aio write that is not yet complete
369 				 * has to be waited on before the next
370 				 * allocation.
371 				 */
372 				aiocb[i] = &cblocks[i];
373 				do_suspend = 1;
374 			}
375 		}
376 		if (!do_suspend)
377 			return -1;
378 
379 		while (aio_suspend((const struct aiocb **)aiocb, md->aio.nr_cblocks, &timeout)) {
380 			if (!(errno == EAGAIN || errno == EINTR))
381 				pr_err("failed to sync perf data, error: %m\n");
382 		}
383 	} while (1);
384 }
385 
386 struct record_aio {
387 	struct record	*rec;
388 	void		*data;
389 	size_t		size;
390 };
391 
392 static int record__aio_pushfn(struct mmap *map, void *to, void *buf, size_t size)
393 {
394 	struct record_aio *aio = to;
395 
396 	/*
397 	 * The map->core.base data pointed to by buf is copied into a free map->aio.data[]
398 	 * buffer to release space in the kernel buffer as fast as possible, by calling
399 	 * perf_mmap__consume() from the perf_mmap__push() function.
400 	 *
401 	 * That lets the kernel proceed with storing more profiling data into
402 	 * the kernel buffer earlier than other per-cpu kernel buffers are handled.
403 	 *
404 	 * Copying may be done in two steps in case the chunk of profiling data
405 	 * crosses the upper bound of the kernel buffer. In this case we first move
406 	 * the part of the data from map->start till the upper bound and then the remainder
407 	 * from the beginning of the kernel buffer till the end of the data chunk.
408 	 */
409 
410 	if (record__comp_enabled(aio->rec)) {
411 		size = zstd_compress(aio->rec->session, NULL, aio->data + aio->size,
412 				     mmap__mmap_len(map) - aio->size,
413 				     buf, size);
414 	} else {
415 		memcpy(aio->data + aio->size, buf, size);
416 	}
417 
418 	if (!aio->size) {
419 		/*
420 		 * Increment map->refcount to guard the map->aio.data[] buffer
421 		 * from premature deallocation, because the map object can be
422 		 * released before the aio write request started on the
423 		 * map->aio.data[] buffer is complete.
424 		 *
425 		 * perf_mmap__put() is done at record__aio_complete()
426 		 * after the started aio request completes, or at record__aio_push()
427 		 * if the request failed to start.
428 		 */
429 		perf_mmap__get(&map->core);
430 	}
431 
432 	aio->size += size;
433 
434 	return size;
435 }
436 
437 static int record__aio_push(struct record *rec, struct mmap *map, off_t *off)
438 {
439 	int ret, idx;
440 	int trace_fd = rec->session->data->file.fd;
441 	struct record_aio aio = { .rec = rec, .size = 0 };
442 
443 	 * Call record__aio_sync() to wait until a map->aio.data[] buffer
444 	 * becomes available after the previous aio write operation.
445 	 * becomes available after previous aio write operation.
446 	 */
447 
448 	idx = record__aio_sync(map, false);
449 	aio.data = map->aio.data[idx];
450 	ret = perf_mmap__push(map, &aio, record__aio_pushfn);
451 	if (ret != 0) /* ret > 0 - no data, ret < 0 - error */
452 		return ret;
453 
454 	rec->samples++;
455 	ret = record__aio_write(&(map->aio.cblocks[idx]), trace_fd, aio.data, aio.size, *off);
456 	if (!ret) {
457 		*off += aio.size;
458 		rec->bytes_written += aio.size;
459 		if (switch_output_size(rec))
460 			trigger_hit(&switch_output_trigger);
461 	} else {
462 		/*
463 		 * Decrement the map->refcount incremented in record__aio_pushfn()
464 		 * if the record__aio_write() operation failed to start; otherwise
465 		 * map->refcount is decremented in record__aio_complete() after the
466 		 * aio write operation finishes successfully.
467 		 */
468 		perf_mmap__put(&map->core);
469 	}
470 
471 	return ret;
472 }
473 
474 static off_t record__aio_get_pos(int trace_fd)
475 {
476 	return lseek(trace_fd, 0, SEEK_CUR);
477 }
478 
479 static void record__aio_set_pos(int trace_fd, off_t pos)
480 {
481 	lseek(trace_fd, pos, SEEK_SET);
482 }
483 
484 static void record__aio_mmap_read_sync(struct record *rec)
485 {
486 	int i;
487 	struct evlist *evlist = rec->evlist;
488 	struct mmap *maps = evlist->mmap;
489 
490 	if (!record__aio_enabled(rec))
491 		return;
492 
493 	for (i = 0; i < evlist->core.nr_mmaps; i++) {
494 		struct mmap *map = &maps[i];
495 
496 		if (map->core.base)
497 			record__aio_sync(map, true);
498 	}
499 }
500 
501 static int nr_cblocks_default = 1;
502 static int nr_cblocks_max = 4;
503 
504 static int record__aio_parse(const struct option *opt,
505 			     const char *str,
506 			     int unset)
507 {
508 	struct record_opts *opts = (struct record_opts *)opt->value;
509 
510 	if (unset) {
511 		opts->nr_cblocks = 0;
512 	} else {
513 		if (str)
514 			opts->nr_cblocks = strtol(str, NULL, 0);
515 		if (!opts->nr_cblocks)
516 			opts->nr_cblocks = nr_cblocks_default;
517 	}
518 
519 	return 0;
520 }
521 #else /* HAVE_AIO_SUPPORT */
522 static int nr_cblocks_max = 0;
523 
524 static int record__aio_push(struct record *rec __maybe_unused, struct mmap *map __maybe_unused,
525 			    off_t *off __maybe_unused)
526 {
527 	return -1;
528 }
529 
530 static off_t record__aio_get_pos(int trace_fd __maybe_unused)
531 {
532 	return -1;
533 }
534 
535 static void record__aio_set_pos(int trace_fd __maybe_unused, off_t pos __maybe_unused)
536 {
537 }
538 
539 static void record__aio_mmap_read_sync(struct record *rec __maybe_unused)
540 {
541 }
542 #endif
543 
544 static int record__aio_enabled(struct record *rec)
545 {
546 	return rec->opts.nr_cblocks > 0;
547 }
548 
549 #define MMAP_FLUSH_DEFAULT 1
550 static int record__mmap_flush_parse(const struct option *opt,
551 				    const char *str,
552 				    int unset)
553 {
554 	int flush_max;
555 	struct record_opts *opts = (struct record_opts *)opt->value;
556 	static struct parse_tag tags[] = {
557 			{ .tag  = 'B', .mult = 1       },
558 			{ .tag  = 'K', .mult = 1 << 10 },
559 			{ .tag  = 'M', .mult = 1 << 20 },
560 			{ .tag  = 'G', .mult = 1 << 30 },
561 			{ .tag  = 0 },
562 	};
563 
564 	if (unset)
565 		return 0;
566 
567 	if (str) {
568 		opts->mmap_flush = parse_tag_value(str, tags);
569 		if (opts->mmap_flush == (int)-1)
570 			opts->mmap_flush = strtol(str, NULL, 0);
571 	}
572 
573 	if (!opts->mmap_flush)
574 		opts->mmap_flush = MMAP_FLUSH_DEFAULT;
575 
576 	flush_max = evlist__mmap_size(opts->mmap_pages);
577 	flush_max /= 4;
578 	if (opts->mmap_flush > flush_max)
579 		opts->mmap_flush = flush_max;
580 
581 	return 0;
582 }
583 
584 #ifdef HAVE_ZSTD_SUPPORT
585 static unsigned int comp_level_default = 1;
586 
587 static int record__parse_comp_level(const struct option *opt, const char *str, int unset)
588 {
589 	struct record_opts *opts = opt->value;
590 
591 	if (unset) {
592 		opts->comp_level = 0;
593 	} else {
594 		if (str)
595 			opts->comp_level = strtol(str, NULL, 0);
596 		if (!opts->comp_level)
597 			opts->comp_level = comp_level_default;
598 	}
599 
600 	return 0;
601 }
602 #endif
603 static unsigned int comp_level_max = 22;
604 
605 static int record__comp_enabled(struct record *rec)
606 {
607 	return rec->opts.comp_level > 0;
608 }
609 
610 static int process_synthesized_event(struct perf_tool *tool,
611 				     union perf_event *event,
612 				     struct perf_sample *sample __maybe_unused,
613 				     struct machine *machine __maybe_unused)
614 {
615 	struct record *rec = container_of(tool, struct record, tool);
616 	return record__write(rec, NULL, event, event->header.size);
617 }
618 
619 static int process_locked_synthesized_event(struct perf_tool *tool,
620 				     union perf_event *event,
621 				     struct perf_sample *sample __maybe_unused,
622 				     struct machine *machine __maybe_unused)
623 {
624 	static pthread_mutex_t synth_lock = PTHREAD_MUTEX_INITIALIZER;
625 	int ret;
626 
627 	pthread_mutex_lock(&synth_lock);
628 	ret = process_synthesized_event(tool, event, sample, machine);
629 	pthread_mutex_unlock(&synth_lock);
630 	return ret;
631 }
632 
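/*
 * perf_mmap__push() callback for the regular (non-AIO) path: optionally
 * compress the chunk into map->data and then write it out via record__write().
 */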
633 static int record__pushfn(struct mmap *map, void *to, void *bf, size_t size)
634 {
635 	struct record *rec = to;
636 
637 	if (record__comp_enabled(rec)) {
638 		size = zstd_compress(rec->session, map, map->data, mmap__mmap_len(map), bf, size);
639 		bf   = map->data;
640 	}
641 
642 	thread->samples++;
643 	return record__write(rec, map, bf, size);
644 }
645 
646 static volatile int signr = -1;
647 static volatile int child_finished;
648 #ifdef HAVE_EVENTFD_SUPPORT
649 static int done_fd = -1;
650 #endif
651 
652 static void sig_handler(int sig)
653 {
654 	if (sig == SIGCHLD)
655 		child_finished = 1;
656 	else
657 		signr = sig;
658 
659 	done = 1;
660 #ifdef HAVE_EVENTFD_SUPPORT
661 {
662 	u64 tmp = 1;
663 	/*
664 	 * It is possible for this signal handler to run after done is checked
665 	 * in the main loop, but before the perf counter fds are polled. If this
666 	 * happens, the poll() will continue to wait even though done is set,
667 	 * and will only break out if either another signal is received, or the
668 	 * counters are ready for read. To ensure the poll() doesn't sleep when
669 	 * done is set, use an eventfd (done_fd) to wake up the poll().
670 	 */
671 	if (write(done_fd, &tmp, sizeof(tmp)) < 0)
672 		pr_err("failed to signal wakeup fd, error: %m\n");
673 }
674 #endif // HAVE_EVENTFD_SUPPORT
675 }
676 
677 static void sigsegv_handler(int sig)
678 {
679 	perf_hooks__recover();
680 	sighandler_dump_stack(sig);
681 }
682 
683 static void record__sig_exit(void)
684 {
685 	if (signr == -1)
686 		return;
687 
688 	signal(signr, SIG_DFL);
689 	raise(signr);
690 }
691 
692 #ifdef HAVE_AUXTRACE_SUPPORT
693 
694 static int record__process_auxtrace(struct perf_tool *tool,
695 				    struct mmap *map,
696 				    union perf_event *event, void *data1,
697 				    size_t len1, void *data2, size_t len2)
698 {
699 	struct record *rec = container_of(tool, struct record, tool);
700 	struct perf_data *data = &rec->data;
701 	size_t padding;
702 	u8 pad[8] = {0};
703 
704 	if (!perf_data__is_pipe(data) && perf_data__is_single_file(data)) {
705 		off_t file_offset;
706 		int fd = perf_data__fd(data);
707 		int err;
708 
709 		file_offset = lseek(fd, 0, SEEK_CUR);
710 		if (file_offset == -1)
711 			return -1;
712 		err = auxtrace_index__auxtrace_event(&rec->session->auxtrace_index,
713 						     event, file_offset);
714 		if (err)
715 			return err;
716 	}
717 
718 	/* event.auxtrace.size includes padding, see __auxtrace_mmap__read() */
719 	padding = (len1 + len2) & 7;
720 	if (padding)
721 		padding = 8 - padding;
722 
723 	record__write(rec, map, event, event->header.size);
724 	record__write(rec, map, data1, len1);
725 	if (len2)
726 		record__write(rec, map, data2, len2);
727 	record__write(rec, map, &pad, padding);
728 
729 	return 0;
730 }
731 
732 static int record__auxtrace_mmap_read(struct record *rec,
733 				      struct mmap *map)
734 {
735 	int ret;
736 
737 	ret = auxtrace_mmap__read(map, rec->itr, &rec->tool,
738 				  record__process_auxtrace);
739 	if (ret < 0)
740 		return ret;
741 
742 	if (ret)
743 		rec->samples++;
744 
745 	return 0;
746 }
747 
748 static int record__auxtrace_mmap_read_snapshot(struct record *rec,
749 					       struct mmap *map)
750 {
751 	int ret;
752 
753 	ret = auxtrace_mmap__read_snapshot(map, rec->itr, &rec->tool,
754 					   record__process_auxtrace,
755 					   rec->opts.auxtrace_snapshot_size);
756 	if (ret < 0)
757 		return ret;
758 
759 	if (ret)
760 		rec->samples++;
761 
762 	return 0;
763 }
764 
765 static int record__auxtrace_read_snapshot_all(struct record *rec)
766 {
767 	int i;
768 	int rc = 0;
769 
770 	for (i = 0; i < rec->evlist->core.nr_mmaps; i++) {
771 		struct mmap *map = &rec->evlist->mmap[i];
772 
773 		if (!map->auxtrace_mmap.base)
774 			continue;
775 
776 		if (record__auxtrace_mmap_read_snapshot(rec, map) != 0) {
777 			rc = -1;
778 			goto out;
779 		}
780 	}
781 out:
782 	return rc;
783 }
784 
785 static void record__read_auxtrace_snapshot(struct record *rec, bool on_exit)
786 {
787 	pr_debug("Recording AUX area tracing snapshot\n");
788 	if (record__auxtrace_read_snapshot_all(rec) < 0) {
789 		trigger_error(&auxtrace_snapshot_trigger);
790 	} else {
791 		if (auxtrace_record__snapshot_finish(rec->itr, on_exit))
792 			trigger_error(&auxtrace_snapshot_trigger);
793 		else
794 			trigger_ready(&auxtrace_snapshot_trigger);
795 	}
796 }
797 
798 static int record__auxtrace_snapshot_exit(struct record *rec)
799 {
800 	if (trigger_is_error(&auxtrace_snapshot_trigger))
801 		return 0;
802 
803 	if (!auxtrace_record__snapshot_started &&
804 	    auxtrace_record__snapshot_start(rec->itr))
805 		return -1;
806 
807 	record__read_auxtrace_snapshot(rec, true);
808 	if (trigger_is_error(&auxtrace_snapshot_trigger))
809 		return -1;
810 
811 	return 0;
812 }
813 
814 static int record__auxtrace_init(struct record *rec)
815 {
816 	int err;
817 
818 	if ((rec->opts.auxtrace_snapshot_opts || rec->opts.auxtrace_sample_opts)
819 	    && record__threads_enabled(rec)) {
820 		pr_err("AUX area tracing options are not available in parallel streaming mode.\n");
821 		return -EINVAL;
822 	}
823 
824 	if (!rec->itr) {
825 		rec->itr = auxtrace_record__init(rec->evlist, &err);
826 		if (err)
827 			return err;
828 	}
829 
830 	err = auxtrace_parse_snapshot_options(rec->itr, &rec->opts,
831 					      rec->opts.auxtrace_snapshot_opts);
832 	if (err)
833 		return err;
834 
835 	err = auxtrace_parse_sample_options(rec->itr, rec->evlist, &rec->opts,
836 					    rec->opts.auxtrace_sample_opts);
837 	if (err)
838 		return err;
839 
840 	auxtrace_regroup_aux_output(rec->evlist);
841 
842 	return auxtrace_parse_filters(rec->evlist);
843 }
844 
845 #else
846 
847 static inline
848 int record__auxtrace_mmap_read(struct record *rec __maybe_unused,
849 			       struct mmap *map __maybe_unused)
850 {
851 	return 0;
852 }
853 
854 static inline
855 void record__read_auxtrace_snapshot(struct record *rec __maybe_unused,
856 				    bool on_exit __maybe_unused)
857 {
858 }
859 
860 static inline
861 int auxtrace_record__snapshot_start(struct auxtrace_record *itr __maybe_unused)
862 {
863 	return 0;
864 }
865 
866 static inline
867 int record__auxtrace_snapshot_exit(struct record *rec __maybe_unused)
868 {
869 	return 0;
870 }
871 
872 static int record__auxtrace_init(struct record *rec __maybe_unused)
873 {
874 	return 0;
875 }
876 
877 #endif
878 
879 static int record__config_text_poke(struct evlist *evlist)
880 {
881 	struct evsel *evsel;
882 
883 	/* Nothing to do if text poke is already configured */
884 	evlist__for_each_entry(evlist, evsel) {
885 		if (evsel->core.attr.text_poke)
886 			return 0;
887 	}
888 
889 	evsel = evlist__add_dummy_on_all_cpus(evlist);
890 	if (!evsel)
891 		return -ENOMEM;
892 
893 	evsel->core.attr.text_poke = 1;
894 	evsel->core.attr.ksymbol = 1;
895 	evsel->immediate = true;
896 	evsel__set_sample_bit(evsel, TIME);
897 
898 	return 0;
899 }
900 
901 static int record__config_off_cpu(struct record *rec)
902 {
903 	return off_cpu_prepare(rec->evlist, &rec->opts.target, &rec->opts);
904 }
905 
906 static bool record__kcore_readable(struct machine *machine)
907 {
908 	char kcore[PATH_MAX];
909 	int fd;
910 
911 	scnprintf(kcore, sizeof(kcore), "%s/proc/kcore", machine->root_dir);
912 
913 	fd = open(kcore, O_RDONLY);
914 	if (fd < 0)
915 		return false;
916 
917 	close(fd);
918 
919 	return true;
920 }
921 
922 static int record__kcore_copy(struct machine *machine, struct perf_data *data)
923 {
924 	char from_dir[PATH_MAX];
925 	char kcore_dir[PATH_MAX];
926 	int ret;
927 
928 	snprintf(from_dir, sizeof(from_dir), "%s/proc", machine->root_dir);
929 
930 	ret = perf_data__make_kcore_dir(data, kcore_dir, sizeof(kcore_dir));
931 	if (ret)
932 		return ret;
933 
934 	return kcore_copy(from_dir, kcore_dir);
935 }
936 
937 static void record__thread_data_init_pipes(struct record_thread *thread_data)
938 {
939 	thread_data->pipes.msg[0] = -1;
940 	thread_data->pipes.msg[1] = -1;
941 	thread_data->pipes.ack[0] = -1;
942 	thread_data->pipes.ack[1] = -1;
943 }
944 
945 static int record__thread_data_open_pipes(struct record_thread *thread_data)
946 {
947 	if (pipe(thread_data->pipes.msg))
948 		return -EINVAL;
949 
950 	if (pipe(thread_data->pipes.ack)) {
951 		close(thread_data->pipes.msg[0]);
952 		thread_data->pipes.msg[0] = -1;
953 		close(thread_data->pipes.msg[1]);
954 		thread_data->pipes.msg[1] = -1;
955 		return -EINVAL;
956 	}
957 
958 	pr_debug2("thread_data[%p]: msg=[%d,%d], ack=[%d,%d]\n", thread_data,
959 		 thread_data->pipes.msg[0], thread_data->pipes.msg[1],
960 		 thread_data->pipes.ack[0], thread_data->pipes.ack[1]);
961 
962 	return 0;
963 }
964 
965 static void record__thread_data_close_pipes(struct record_thread *thread_data)
966 {
967 	if (thread_data->pipes.msg[0] != -1) {
968 		close(thread_data->pipes.msg[0]);
969 		thread_data->pipes.msg[0] = -1;
970 	}
971 	if (thread_data->pipes.msg[1] != -1) {
972 		close(thread_data->pipes.msg[1]);
973 		thread_data->pipes.msg[1] = -1;
974 	}
975 	if (thread_data->pipes.ack[0] != -1) {
976 		close(thread_data->pipes.ack[0]);
977 		thread_data->pipes.ack[0] = -1;
978 	}
979 	if (thread_data->pipes.ack[1] != -1) {
980 		close(thread_data->pipes.ack[1]);
981 		thread_data->pipes.ack[1] = -1;
982 	}
983 }
984 
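/*
 * True when recording in per-thread mode, i.e. no CPUs were requested and the
 * evlist carries the dummy CPU map.
 */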
985 static bool evlist__per_thread(struct evlist *evlist)
986 {
987 	return cpu_map__is_dummy(evlist->core.user_requested_cpus);
988 }
989 
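/*
 * Assign to this recording thread the subset of the evlist's mmaps whose CPUs
 * fall into the thread's maps mask, or all mmaps in per-thread mode.
 */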
990 static int record__thread_data_init_maps(struct record_thread *thread_data, struct evlist *evlist)
991 {
992 	int m, tm, nr_mmaps = evlist->core.nr_mmaps;
993 	struct mmap *mmap = evlist->mmap;
994 	struct mmap *overwrite_mmap = evlist->overwrite_mmap;
995 	struct perf_cpu_map *cpus = evlist->core.all_cpus;
996 	bool per_thread = evlist__per_thread(evlist);
997 
998 	if (per_thread)
999 		thread_data->nr_mmaps = nr_mmaps;
1000 	else
1001 		thread_data->nr_mmaps = bitmap_weight(thread_data->mask->maps.bits,
1002 						      thread_data->mask->maps.nbits);
1003 	if (mmap) {
1004 		thread_data->maps = zalloc(thread_data->nr_mmaps * sizeof(struct mmap *));
1005 		if (!thread_data->maps)
1006 			return -ENOMEM;
1007 	}
1008 	if (overwrite_mmap) {
1009 		thread_data->overwrite_maps = zalloc(thread_data->nr_mmaps * sizeof(struct mmap *));
1010 		if (!thread_data->overwrite_maps) {
1011 			zfree(&thread_data->maps);
1012 			return -ENOMEM;
1013 		}
1014 	}
1015 	pr_debug2("thread_data[%p]: nr_mmaps=%d, maps=%p, ow_maps=%p\n", thread_data,
1016 		 thread_data->nr_mmaps, thread_data->maps, thread_data->overwrite_maps);
1017 
1018 	for (m = 0, tm = 0; m < nr_mmaps && tm < thread_data->nr_mmaps; m++) {
1019 		if (per_thread ||
1020 		    test_bit(perf_cpu_map__cpu(cpus, m).cpu, thread_data->mask->maps.bits)) {
1021 			if (thread_data->maps) {
1022 				thread_data->maps[tm] = &mmap[m];
1023 				pr_debug2("thread_data[%p]: cpu%d: maps[%d] -> mmap[%d]\n",
1024 					  thread_data, perf_cpu_map__cpu(cpus, m).cpu, tm, m);
1025 			}
1026 			if (thread_data->overwrite_maps) {
1027 				thread_data->overwrite_maps[tm] = &overwrite_mmap[m];
1028 				pr_debug2("thread_data[%p]: cpu%d: ow_maps[%d] -> ow_mmap[%d]\n",
1029 					  thread_data, perf_cpu_map__cpu(cpus, m).cpu, tm, m);
1030 			}
1031 			tm++;
1032 		}
1033 	}
1034 
1035 	return 0;
1036 }
1037 
1038 static int record__thread_data_init_pollfd(struct record_thread *thread_data, struct evlist *evlist)
1039 {
1040 	int f, tm, pos;
1041 	struct mmap *map, *overwrite_map;
1042 
1043 	fdarray__init(&thread_data->pollfd, 64);
1044 
1045 	for (tm = 0; tm < thread_data->nr_mmaps; tm++) {
1046 		map = thread_data->maps ? thread_data->maps[tm] : NULL;
1047 		overwrite_map = thread_data->overwrite_maps ?
1048 				thread_data->overwrite_maps[tm] : NULL;
1049 
1050 		for (f = 0; f < evlist->core.pollfd.nr; f++) {
1051 			void *ptr = evlist->core.pollfd.priv[f].ptr;
1052 
1053 			if ((map && ptr == map) || (overwrite_map && ptr == overwrite_map)) {
1054 				pos = fdarray__dup_entry_from(&thread_data->pollfd, f,
1055 							      &evlist->core.pollfd);
1056 				if (pos < 0)
1057 					return pos;
1058 				pr_debug2("thread_data[%p]: pollfd[%d] <- event_fd=%d\n",
1059 					 thread_data, pos, evlist->core.pollfd.entries[f].fd);
1060 			}
1061 		}
1062 	}
1063 
1064 	return 0;
1065 }
1066 
1067 static void record__free_thread_data(struct record *rec)
1068 {
1069 	int t;
1070 	struct record_thread *thread_data = rec->thread_data;
1071 
1072 	if (thread_data == NULL)
1073 		return;
1074 
1075 	for (t = 0; t < rec->nr_threads; t++) {
1076 		record__thread_data_close_pipes(&thread_data[t]);
1077 		zfree(&thread_data[t].maps);
1078 		zfree(&thread_data[t].overwrite_maps);
1079 		fdarray__exit(&thread_data[t].pollfd);
1080 	}
1081 
1082 	zfree(&rec->thread_data);
1083 }
1084 
1085 static int record__map_thread_evlist_pollfd_indexes(struct record *rec,
1086 						    int evlist_pollfd_index,
1087 						    int thread_pollfd_index)
1088 {
1089 	size_t x = rec->index_map_cnt;
1090 
1091 	if (realloc_array_as_needed(rec->index_map, rec->index_map_sz, x, NULL))
1092 		return -ENOMEM;
1093 	rec->index_map[x].evlist_pollfd_index = evlist_pollfd_index;
1094 	rec->index_map[x].thread_pollfd_index = thread_pollfd_index;
1095 	rec->index_map_cnt += 1;
1096 	return 0;
1097 }
1098 
1099 static int record__update_evlist_pollfd_from_thread(struct record *rec,
1100 						    struct evlist *evlist,
1101 						    struct record_thread *thread_data)
1102 {
1103 	struct pollfd *e_entries = evlist->core.pollfd.entries;
1104 	struct pollfd *t_entries = thread_data->pollfd.entries;
1105 	int err = 0;
1106 	size_t i;
1107 
1108 	for (i = 0; i < rec->index_map_cnt; i++) {
1109 		int e_pos = rec->index_map[i].evlist_pollfd_index;
1110 		int t_pos = rec->index_map[i].thread_pollfd_index;
1111 
1112 		if (e_entries[e_pos].fd != t_entries[t_pos].fd ||
1113 		    e_entries[e_pos].events != t_entries[t_pos].events) {
1114 			pr_err("Thread and evlist pollfd index mismatch\n");
1115 			err = -EINVAL;
1116 			continue;
1117 		}
1118 		e_entries[e_pos].revents = t_entries[t_pos].revents;
1119 	}
1120 	return err;
1121 }
1122 
1123 static int record__dup_non_perf_events(struct record *rec,
1124 				       struct evlist *evlist,
1125 				       struct record_thread *thread_data)
1126 {
1127 	struct fdarray *fda = &evlist->core.pollfd;
1128 	int i, ret;
1129 
1130 	for (i = 0; i < fda->nr; i++) {
1131 		if (!(fda->priv[i].flags & fdarray_flag__non_perf_event))
1132 			continue;
1133 		ret = fdarray__dup_entry_from(&thread_data->pollfd, i, fda);
1134 		if (ret < 0) {
1135 			pr_err("Failed to duplicate descriptor in main thread pollfd\n");
1136 			return ret;
1137 		}
1138 		pr_debug2("thread_data[%p]: pollfd[%d] <- non_perf_event fd=%d\n",
1139 			  thread_data, ret, fda->entries[i].fd);
1140 		ret = record__map_thread_evlist_pollfd_indexes(rec, i, ret);
1141 		if (ret < 0) {
1142 			pr_err("Failed to map thread and evlist pollfd indexes\n");
1143 			return ret;
1144 		}
1145 	}
1146 	return 0;
1147 }
1148 
1149 static int record__alloc_thread_data(struct record *rec, struct evlist *evlist)
1150 {
1151 	int t, ret;
1152 	struct record_thread *thread_data;
1153 
1154 	rec->thread_data = zalloc(rec->nr_threads * sizeof(*(rec->thread_data)));
1155 	if (!rec->thread_data) {
1156 		pr_err("Failed to allocate thread data\n");
1157 		return -ENOMEM;
1158 	}
1159 	thread_data = rec->thread_data;
1160 
1161 	for (t = 0; t < rec->nr_threads; t++)
1162 		record__thread_data_init_pipes(&thread_data[t]);
1163 
1164 	for (t = 0; t < rec->nr_threads; t++) {
1165 		thread_data[t].rec = rec;
1166 		thread_data[t].mask = &rec->thread_masks[t];
1167 		ret = record__thread_data_init_maps(&thread_data[t], evlist);
1168 		if (ret) {
1169 			pr_err("Failed to initialize thread[%d] maps\n", t);
1170 			goto out_free;
1171 		}
1172 		ret = record__thread_data_init_pollfd(&thread_data[t], evlist);
1173 		if (ret) {
1174 			pr_err("Failed to initialize thread[%d] pollfd\n", t);
1175 			goto out_free;
1176 		}
1177 		if (t) {
1178 			thread_data[t].tid = -1;
1179 			ret = record__thread_data_open_pipes(&thread_data[t]);
1180 			if (ret) {
1181 				pr_err("Failed to open thread[%d] communication pipes\n", t);
1182 				goto out_free;
1183 			}
1184 			ret = fdarray__add(&thread_data[t].pollfd, thread_data[t].pipes.msg[0],
1185 					   POLLIN | POLLERR | POLLHUP, fdarray_flag__nonfilterable);
1186 			if (ret < 0) {
1187 				pr_err("Failed to add descriptor to thread[%d] pollfd\n", t);
1188 				goto out_free;
1189 			}
1190 			thread_data[t].ctlfd_pos = ret;
1191 			pr_debug2("thread_data[%p]: pollfd[%d] <- ctl_fd=%d\n",
1192 				 thread_data, thread_data[t].ctlfd_pos,
1193 				 thread_data[t].pipes.msg[0]);
1194 		} else {
1195 			thread_data[t].tid = gettid();
1196 
1197 			ret = record__dup_non_perf_events(rec, evlist, &thread_data[t]);
1198 			if (ret < 0)
1199 				goto out_free;
1200 
1201 			if (evlist->ctl_fd.pos == -1)
1202 				continue;
1203 			ret = fdarray__dup_entry_from(&thread_data[t].pollfd, evlist->ctl_fd.pos,
1204 						      &evlist->core.pollfd);
1205 			if (ret < 0) {
1206 				pr_err("Failed to duplicate descriptor in main thread pollfd\n");
1207 				goto out_free;
1208 			}
1209 			thread_data[t].ctlfd_pos = ret;
1210 			pr_debug2("thread_data[%p]: pollfd[%d] <- ctl_fd=%d\n",
1211 				 thread_data, thread_data[t].ctlfd_pos,
1212 				 evlist->core.pollfd.entries[evlist->ctl_fd.pos].fd);
1213 		}
1214 	}
1215 
1216 	return 0;
1217 
1218 out_free:
1219 	record__free_thread_data(rec);
1220 
1221 	return ret;
1222 }
1223 
1224 static int record__mmap_evlist(struct record *rec,
1225 			       struct evlist *evlist)
1226 {
1227 	int i, ret;
1228 	struct record_opts *opts = &rec->opts;
1229 	bool auxtrace_overwrite = opts->auxtrace_snapshot_mode ||
1230 				  opts->auxtrace_sample_mode;
1231 	char msg[512];
1232 
1233 	if (opts->affinity != PERF_AFFINITY_SYS)
1234 		cpu__setup_cpunode_map();
1235 
1236 	if (evlist__mmap_ex(evlist, opts->mmap_pages,
1237 				 opts->auxtrace_mmap_pages,
1238 				 auxtrace_overwrite,
1239 				 opts->nr_cblocks, opts->affinity,
1240 				 opts->mmap_flush, opts->comp_level) < 0) {
1241 		if (errno == EPERM) {
1242 			pr_err("Permission error mapping pages.\n"
1243 			       "Consider increasing "
1244 			       "/proc/sys/kernel/perf_event_mlock_kb,\n"
1245 			       "or try again with a smaller value of -m/--mmap_pages.\n"
1246 			       "(current value: %u,%u)\n",
1247 			       opts->mmap_pages, opts->auxtrace_mmap_pages);
1248 			return -errno;
1249 		} else {
1250 			pr_err("failed to mmap with %d (%s)\n", errno,
1251 				str_error_r(errno, msg, sizeof(msg)));
1252 			if (errno)
1253 				return -errno;
1254 			else
1255 				return -EINVAL;
1256 		}
1257 	}
1258 
1259 	if (evlist__initialize_ctlfd(evlist, opts->ctl_fd, opts->ctl_fd_ack))
1260 		return -1;
1261 
1262 	ret = record__alloc_thread_data(rec, evlist);
1263 	if (ret)
1264 		return ret;
1265 
1266 	if (record__threads_enabled(rec)) {
1267 		ret = perf_data__create_dir(&rec->data, evlist->core.nr_mmaps);
1268 		if (ret) {
1269 			pr_err("Failed to create data directory: %s\n", strerror(-ret));
1270 			return ret;
1271 		}
1272 		for (i = 0; i < evlist->core.nr_mmaps; i++) {
1273 			if (evlist->mmap)
1274 				evlist->mmap[i].file = &rec->data.dir.files[i];
1275 			if (evlist->overwrite_mmap)
1276 				evlist->overwrite_mmap[i].file = &rec->data.dir.files[i];
1277 		}
1278 	}
1279 
1280 	return 0;
1281 }
1282 
1283 static int record__mmap(struct record *rec)
1284 {
1285 	return record__mmap_evlist(rec, rec->evlist);
1286 }
1287 
1288 static int record__open(struct record *rec)
1289 {
1290 	char msg[BUFSIZ];
1291 	struct evsel *pos;
1292 	struct evlist *evlist = rec->evlist;
1293 	struct perf_session *session = rec->session;
1294 	struct record_opts *opts = &rec->opts;
1295 	int rc = 0;
1296 
1297 	/*
1298 	 * For initial_delay, a system-wide target or a hybrid system, we need to
1299 	 * add a dummy event so that we can track PERF_RECORD_MMAP during the
1300 	 * initial delay or while events are being synthesized.
1301 	 */
1302 	if (opts->initial_delay || target__has_cpu(&opts->target) ||
1303 	    perf_pmu__has_hybrid()) {
1304 		pos = evlist__get_tracking_event(evlist);
1305 		if (!evsel__is_dummy_event(pos)) {
1306 			/* Set up dummy event. */
1307 			if (evlist__add_dummy(evlist))
1308 				return -ENOMEM;
1309 			pos = evlist__last(evlist);
1310 			evlist__set_tracking_event(evlist, pos);
1311 		}
1312 
1313 		/*
1314 		 * Enable the dummy event when the process is forked for
1315 		 * initial_delay, or immediately for a system-wide target.
1316 		 */
1317 		if (opts->initial_delay && !pos->immediate &&
1318 		    !target__has_cpu(&opts->target))
1319 			pos->core.attr.enable_on_exec = 1;
1320 		else
1321 			pos->immediate = 1;
1322 	}
1323 
1324 	evlist__config(evlist, opts, &callchain_param);
1325 
1326 	evlist__for_each_entry(evlist, pos) {
1327 try_again:
1328 		if (evsel__open(pos, pos->core.cpus, pos->core.threads) < 0) {
1329 			if (evsel__fallback(pos, errno, msg, sizeof(msg))) {
1330 				if (verbose > 0)
1331 					ui__warning("%s\n", msg);
1332 				goto try_again;
1333 			}
1334 			if ((errno == EINVAL || errno == EBADF) &&
1335 			    pos->core.leader != &pos->core &&
1336 			    pos->weak_group) {
1337 				pos = evlist__reset_weak_group(evlist, pos, true);
1338 				goto try_again;
1339 			}
1340 			rc = -errno;
1341 			evsel__open_strerror(pos, &opts->target, errno, msg, sizeof(msg));
1342 			ui__error("%s\n", msg);
1343 			goto out;
1344 		}
1345 
1346 		pos->supported = true;
1347 	}
1348 
1349 	if (symbol_conf.kptr_restrict && !evlist__exclude_kernel(evlist)) {
1350 		pr_warning(
1351 "WARNING: Kernel address maps (/proc/{kallsyms,modules}) are restricted,\n"
1352 "check /proc/sys/kernel/kptr_restrict and /proc/sys/kernel/perf_event_paranoid.\n\n"
1353 "Samples in kernel functions may not be resolved if a suitable vmlinux\n"
1354 "file is not found in the buildid cache or in the vmlinux path.\n\n"
1355 "Samples in kernel modules won't be resolved at all.\n\n"
1356 "If some relocation was applied (e.g. kexec) symbols may be misresolved\n"
1357 "even with a suitable vmlinux or kallsyms file.\n\n");
1358 	}
1359 
1360 	if (evlist__apply_filters(evlist, &pos)) {
1361 		pr_err("failed to set filter \"%s\" on event %s with %d (%s)\n",
1362 			pos->filter, evsel__name(pos), errno,
1363 			str_error_r(errno, msg, sizeof(msg)));
1364 		rc = -1;
1365 		goto out;
1366 	}
1367 
1368 	rc = record__mmap(rec);
1369 	if (rc)
1370 		goto out;
1371 
1372 	session->evlist = evlist;
1373 	perf_session__set_id_hdr_size(session);
1374 out:
1375 	return rc;
1376 }
1377 
1378 static void set_timestamp_boundary(struct record *rec, u64 sample_time)
1379 {
1380 	if (rec->evlist->first_sample_time == 0)
1381 		rec->evlist->first_sample_time = sample_time;
1382 
1383 	if (sample_time)
1384 		rec->evlist->last_sample_time = sample_time;
1385 }
1386 
1387 static int process_sample_event(struct perf_tool *tool,
1388 				union perf_event *event,
1389 				struct perf_sample *sample,
1390 				struct evsel *evsel,
1391 				struct machine *machine)
1392 {
1393 	struct record *rec = container_of(tool, struct record, tool);
1394 
1395 	set_timestamp_boundary(rec, sample->time);
1396 
1397 	if (rec->buildid_all)
1398 		return 0;
1399 
1400 	rec->samples++;
1401 	return build_id__mark_dso_hit(tool, event, sample, evsel, machine);
1402 }
1403 
1404 static int process_buildids(struct record *rec)
1405 {
1406 	struct perf_session *session = rec->session;
1407 
1408 	if (perf_data__size(&rec->data) == 0)
1409 		return 0;
1410 
1411 	/*
1412 	 * During this process, it'll load the kernel map and replace
1413 	 * dso->long_name with the real pathname it found.  In this case
1414 	 * we prefer the vmlinux path like
1415 	 *   /lib/modules/3.16.4/build/vmlinux
1416 	 *
1417 	 * rather than the build-id path (in the debug directory).
1418 	 *   $HOME/.debug/.build-id/f0/6e17aa50adf4d00b88925e03775de107611551
1419 	 */
1420 	symbol_conf.ignore_vmlinux_buildid = true;
1421 
1422 	/*
1423 	 * If --buildid-all is given, it marks all DSOs regardless of hits,
1424 	 * so there is no need to process samples. But if timestamp_boundary is
1425 	 * enabled, it still needs to walk all samples to get the timestamps of
1426 	 * the first/last samples.
1427 	 */
1428 	if (rec->buildid_all && !rec->timestamp_boundary)
1429 		rec->tool.sample = NULL;
1430 
1431 	return perf_session__process_events(session);
1432 }
1433 
1434 static void perf_event__synthesize_guest_os(struct machine *machine, void *data)
1435 {
1436 	int err;
1437 	struct perf_tool *tool = data;
1438 	/*
1439 	 * For a guest kernel, when processing the record & report subcommands,
1440 	 * we arrange the module mmaps prior to the guest kernel mmap and trigger
1441 	 * a DSO preload, because by default guest module symbols are loaded
1442 	 * from guest kallsyms instead of /lib/modules/XXX/XXX. This
1443 	 * method is used to avoid missing symbols when the first address is
1444 	 * in a module instead of in the guest kernel.
1445 	 */
1446 	err = perf_event__synthesize_modules(tool, process_synthesized_event,
1447 					     machine);
1448 	if (err < 0)
1449 		pr_err("Couldn't record guest kernel [%d]'s reference"
1450 		       " relocation symbol.\n", machine->pid);
1451 
1452 	/*
1453 	 * We use _stext for the guest kernel because the guest kernel's
1454 	 * /proc/kallsyms sometimes has no _text.
1455 	 */
1456 	err = perf_event__synthesize_kernel_mmap(tool, process_synthesized_event,
1457 						 machine);
1458 	if (err < 0)
1459 		pr_err("Couldn't record guest kernel [%d]'s reference"
1460 		       " relocation symbol.\n", machine->pid);
1461 }
1462 
1463 static struct perf_event_header finished_round_event = {
1464 	.size = sizeof(struct perf_event_header),
1465 	.type = PERF_RECORD_FINISHED_ROUND,
1466 };
1467 
1468 static struct perf_event_header finished_init_event = {
1469 	.size = sizeof(struct perf_event_header),
1470 	.type = PERF_RECORD_FINISHED_INIT,
1471 };
1472 
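/*
 * When --affinity is not 'sys', switch the reading thread's CPU affinity to
 * the affinity mask of the mmap being drained, so buffer accesses stay on the
 * CPUs that back it.
 */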
1473 static void record__adjust_affinity(struct record *rec, struct mmap *map)
1474 {
1475 	if (rec->opts.affinity != PERF_AFFINITY_SYS &&
1476 	    !bitmap_equal(thread->mask->affinity.bits, map->affinity_mask.bits,
1477 			  thread->mask->affinity.nbits)) {
1478 		bitmap_zero(thread->mask->affinity.bits, thread->mask->affinity.nbits);
1479 		bitmap_or(thread->mask->affinity.bits, thread->mask->affinity.bits,
1480 			  map->affinity_mask.bits, thread->mask->affinity.nbits);
1481 		sched_setaffinity(0, MMAP_CPU_MASK_BYTES(&thread->mask->affinity),
1482 					(cpu_set_t *)thread->mask->affinity.bits);
1483 		if (verbose == 2) {
1484 			pr_debug("threads[%d]: running on cpu%d: ", thread->tid, sched_getcpu());
1485 			mmap_cpu_mask__scnprintf(&thread->mask->affinity, "affinity");
1486 		}
1487 	}
1488 }
1489 
1490 static size_t process_comp_header(void *record, size_t increment)
1491 {
1492 	struct perf_record_compressed *event = record;
1493 	size_t size = sizeof(*event);
1494 
1495 	if (increment) {
1496 		event->header.size += increment;
1497 		return increment;
1498 	}
1499 
1500 	event->header.type = PERF_RECORD_COMPRESSED;
1501 	event->header.size = size;
1502 
1503 	return size;
1504 }
1505 
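/*
 * Compress src into dst as one or more PERF_RECORD_COMPRESSED records. In
 * threaded (directory) mode the per-mmap zstd state and per-thread byte
 * counters are used, otherwise the session-wide ones.
 */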
1506 static size_t zstd_compress(struct perf_session *session, struct mmap *map,
1507 			    void *dst, size_t dst_size, void *src, size_t src_size)
1508 {
1509 	size_t compressed;
1510 	size_t max_record_size = PERF_SAMPLE_MAX_SIZE - sizeof(struct perf_record_compressed) - 1;
1511 	struct zstd_data *zstd_data = &session->zstd_data;
1512 
1513 	if (map && map->file)
1514 		zstd_data = &map->zstd_data;
1515 
1516 	compressed = zstd_compress_stream_to_records(zstd_data, dst, dst_size, src, src_size,
1517 						     max_record_size, process_comp_header);
1518 
1519 	if (map && map->file) {
1520 		thread->bytes_transferred += src_size;
1521 		thread->bytes_compressed  += compressed;
1522 	} else {
1523 		session->bytes_transferred += src_size;
1524 		session->bytes_compressed  += compressed;
1525 	}
1526 
1527 	return compressed;
1528 }
1529 
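/*
 * Drain the mmaps owned by the current thread. 'overwrite' selects the
 * overwritable (backward) maps, and 'synch' temporarily forces the flush
 * threshold to 1 so that everything buffered is pushed out.
 */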
1530 static int record__mmap_read_evlist(struct record *rec, struct evlist *evlist,
1531 				    bool overwrite, bool synch)
1532 {
1533 	u64 bytes_written = rec->bytes_written;
1534 	int i;
1535 	int rc = 0;
1536 	int nr_mmaps;
1537 	struct mmap **maps;
1538 	int trace_fd = rec->data.file.fd;
1539 	off_t off = 0;
1540 
1541 	if (!evlist)
1542 		return 0;
1543 
1544 	nr_mmaps = thread->nr_mmaps;
1545 	maps = overwrite ? thread->overwrite_maps : thread->maps;
1546 
1547 	if (!maps)
1548 		return 0;
1549 
1550 	if (overwrite && evlist->bkw_mmap_state != BKW_MMAP_DATA_PENDING)
1551 		return 0;
1552 
1553 	if (record__aio_enabled(rec))
1554 		off = record__aio_get_pos(trace_fd);
1555 
1556 	for (i = 0; i < nr_mmaps; i++) {
1557 		u64 flush = 0;
1558 		struct mmap *map = maps[i];
1559 
1560 		if (map->core.base) {
1561 			record__adjust_affinity(rec, map);
1562 			if (synch) {
1563 				flush = map->core.flush;
1564 				map->core.flush = 1;
1565 			}
1566 			if (!record__aio_enabled(rec)) {
1567 				if (perf_mmap__push(map, rec, record__pushfn) < 0) {
1568 					if (synch)
1569 						map->core.flush = flush;
1570 					rc = -1;
1571 					goto out;
1572 				}
1573 			} else {
1574 				if (record__aio_push(rec, map, &off) < 0) {
1575 					record__aio_set_pos(trace_fd, off);
1576 					if (synch)
1577 						map->core.flush = flush;
1578 					rc = -1;
1579 					goto out;
1580 				}
1581 			}
1582 			if (synch)
1583 				map->core.flush = flush;
1584 		}
1585 
1586 		if (map->auxtrace_mmap.base && !rec->opts.auxtrace_snapshot_mode &&
1587 		    !rec->opts.auxtrace_sample_mode &&
1588 		    record__auxtrace_mmap_read(rec, map) != 0) {
1589 			rc = -1;
1590 			goto out;
1591 		}
1592 	}
1593 
1594 	if (record__aio_enabled(rec))
1595 		record__aio_set_pos(trace_fd, off);
1596 
1597 	/*
1598 	 * Mark the round finished in case we wrote
1599 	 * at least one event.
1600 	 *
1601 	 * No need for round events in directory mode,
1602 	 * because the data in per-cpu maps and files is
1603 	 * already sorted by the kernel.
1604 	 */
1605 	if (!record__threads_enabled(rec) && bytes_written != rec->bytes_written)
1606 		rc = record__write(rec, NULL, &finished_round_event, sizeof(finished_round_event));
1607 
1608 	if (overwrite)
1609 		evlist__toggle_bkw_mmap(evlist, BKW_MMAP_EMPTY);
1610 out:
1611 	return rc;
1612 }
1613 
1614 static int record__mmap_read_all(struct record *rec, bool synch)
1615 {
1616 	int err;
1617 
1618 	err = record__mmap_read_evlist(rec, rec->evlist, false, synch);
1619 	if (err)
1620 		return err;
1621 
1622 	return record__mmap_read_evlist(rec, rec->evlist, true, synch);
1623 }
1624 
1625 static void record__thread_munmap_filtered(struct fdarray *fda, int fd,
1626 					   void *arg __maybe_unused)
1627 {
1628 	struct perf_mmap *map = fda->priv[fd].ptr;
1629 
1630 	if (map)
1631 		perf_mmap__put(map);
1632 }
1633 
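/*
 * Body of a parallel streaming thread: acknowledge the start over the ack
 * pipe, then poll its mmaps and control pipe, draining data until the msg
 * pipe is closed (POLLHUP), and acknowledge again before exiting.
 */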
1634 static void *record__thread(void *arg)
1635 {
1636 	enum thread_msg msg = THREAD_MSG__READY;
1637 	bool terminate = false;
1638 	struct fdarray *pollfd;
1639 	int err, ctlfd_pos;
1640 
1641 	thread = arg;
1642 	thread->tid = gettid();
1643 
1644 	err = write(thread->pipes.ack[1], &msg, sizeof(msg));
1645 	if (err == -1)
1646 		pr_warning("threads[%d]: failed to notify on start: %s\n",
1647 			   thread->tid, strerror(errno));
1648 
1649 	pr_debug("threads[%d]: started on cpu%d\n", thread->tid, sched_getcpu());
1650 
1651 	pollfd = &thread->pollfd;
1652 	ctlfd_pos = thread->ctlfd_pos;
1653 
1654 	for (;;) {
1655 		unsigned long long hits = thread->samples;
1656 
1657 		if (record__mmap_read_all(thread->rec, false) < 0 || terminate)
1658 			break;
1659 
1660 		if (hits == thread->samples) {
1661 
1662 			err = fdarray__poll(pollfd, -1);
1663 			/*
1664 			 * Propagate an error only if there is one. Ignore a positive
1665 			 * number of returned events and interrupt errors.
1666 			 */
1667 			if (err > 0 || (err < 0 && errno == EINTR))
1668 				err = 0;
1669 			thread->waking++;
1670 
1671 			if (fdarray__filter(pollfd, POLLERR | POLLHUP,
1672 					    record__thread_munmap_filtered, NULL) == 0)
1673 				break;
1674 		}
1675 
1676 		if (pollfd->entries[ctlfd_pos].revents & POLLHUP) {
1677 			terminate = true;
1678 			close(thread->pipes.msg[0]);
1679 			thread->pipes.msg[0] = -1;
1680 			pollfd->entries[ctlfd_pos].fd = -1;
1681 			pollfd->entries[ctlfd_pos].events = 0;
1682 		}
1683 
1684 		pollfd->entries[ctlfd_pos].revents = 0;
1685 	}
1686 	record__mmap_read_all(thread->rec, true);
1687 
1688 	err = write(thread->pipes.ack[1], &msg, sizeof(msg));
1689 	if (err == -1)
1690 		pr_warning("threads[%d]: failed to notify on termination: %s\n",
1691 			   thread->tid, strerror(errno));
1692 
1693 	return NULL;
1694 }
1695 
1696 static void record__init_features(struct record *rec)
1697 {
1698 	struct perf_session *session = rec->session;
1699 	int feat;
1700 
1701 	for (feat = HEADER_FIRST_FEATURE; feat < HEADER_LAST_FEATURE; feat++)
1702 		perf_header__set_feat(&session->header, feat);
1703 
1704 	if (rec->no_buildid)
1705 		perf_header__clear_feat(&session->header, HEADER_BUILD_ID);
1706 
1707 	if (!have_tracepoints(&rec->evlist->core.entries))
1708 		perf_header__clear_feat(&session->header, HEADER_TRACING_DATA);
1709 
1710 	if (!rec->opts.branch_stack)
1711 		perf_header__clear_feat(&session->header, HEADER_BRANCH_STACK);
1712 
1713 	if (!rec->opts.full_auxtrace)
1714 		perf_header__clear_feat(&session->header, HEADER_AUXTRACE);
1715 
1716 	if (!(rec->opts.use_clockid && rec->opts.clockid_res_ns))
1717 		perf_header__clear_feat(&session->header, HEADER_CLOCKID);
1718 
1719 	if (!rec->opts.use_clockid)
1720 		perf_header__clear_feat(&session->header, HEADER_CLOCK_DATA);
1721 
1722 	if (!record__threads_enabled(rec))
1723 		perf_header__clear_feat(&session->header, HEADER_DIR_FORMAT);
1724 
1725 	if (!record__comp_enabled(rec))
1726 		perf_header__clear_feat(&session->header, HEADER_COMPRESSED);
1727 
1728 	perf_header__clear_feat(&session->header, HEADER_STAT);
1729 }
1730 
1731 static void
1732 record__finish_output(struct record *rec)
1733 {
1734 	int i;
1735 	struct perf_data *data = &rec->data;
1736 	int fd = perf_data__fd(data);
1737 
1738 	if (data->is_pipe)
1739 		return;
1740 
1741 	rec->session->header.data_size += rec->bytes_written;
1742 	data->file.size = lseek(perf_data__fd(data), 0, SEEK_CUR);
1743 	if (record__threads_enabled(rec)) {
1744 		for (i = 0; i < data->dir.nr; i++)
1745 			data->dir.files[i].size = lseek(data->dir.files[i].fd, 0, SEEK_CUR);
1746 	}
1747 
1748 	if (!rec->no_buildid) {
1749 		process_buildids(rec);
1750 
1751 		if (rec->buildid_all)
1752 			dsos__hit_all(rec->session);
1753 	}
1754 	perf_session__write_header(rec->session, rec->evlist, fd, true);
1755 
1756 	return;
1757 }
1758 
1759 static int record__synthesize_workload(struct record *rec, bool tail)
1760 {
1761 	int err;
1762 	struct perf_thread_map *thread_map;
1763 	bool needs_mmap = rec->opts.synth & PERF_SYNTH_MMAP;
1764 
1765 	if (rec->opts.tail_synthesize != tail)
1766 		return 0;
1767 
1768 	thread_map = thread_map__new_by_tid(rec->evlist->workload.pid);
1769 	if (thread_map == NULL)
1770 		return -1;
1771 
1772 	err = perf_event__synthesize_thread_map(&rec->tool, thread_map,
1773 						 process_synthesized_event,
1774 						 &rec->session->machines.host,
1775 						 needs_mmap,
1776 						 rec->opts.sample_address);
1777 	perf_thread_map__put(thread_map);
1778 	return err;
1779 }
1780 
1781 static int write_finished_init(struct record *rec, bool tail)
1782 {
1783 	if (rec->opts.tail_synthesize != tail)
1784 		return 0;
1785 
1786 	return record__write(rec, NULL, &finished_init_event, sizeof(finished_init_event));
1787 }
1788 
1789 static int record__synthesize(struct record *rec, bool tail);
1790 
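/*
 * Finish the current output file and switch to a new timestamped one. For
 * non-exit switches, tracking events are re-synthesized into the new file,
 * and old files are rotated when switch_output.num_files is set.
 */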
1791 static int
1792 record__switch_output(struct record *rec, bool at_exit)
1793 {
1794 	struct perf_data *data = &rec->data;
1795 	int fd, err;
1796 	char *new_filename;
1797 
1798 	/* Same Size:      "2015122520103046" */
1799 	char timestamp[] = "InvalidTimestamp";
1800 
1801 	record__aio_mmap_read_sync(rec);
1802 
1803 	write_finished_init(rec, true);
1804 
1805 	record__synthesize(rec, true);
1806 	if (target__none(&rec->opts.target))
1807 		record__synthesize_workload(rec, true);
1808 
1809 	rec->samples = 0;
1810 	record__finish_output(rec);
1811 	err = fetch_current_timestamp(timestamp, sizeof(timestamp));
1812 	if (err) {
1813 		pr_err("Failed to get current timestamp\n");
1814 		return -EINVAL;
1815 	}
1816 
1817 	fd = perf_data__switch(data, timestamp,
1818 				    rec->session->header.data_offset,
1819 				    at_exit, &new_filename);
1820 	if (fd >= 0 && !at_exit) {
1821 		rec->bytes_written = 0;
1822 		rec->session->header.data_size = 0;
1823 	}
1824 
1825 	if (!quiet)
1826 		fprintf(stderr, "[ perf record: Dump %s.%s ]\n",
1827 			data->path, timestamp);
1828 
1829 	if (rec->switch_output.num_files) {
1830 		int n = rec->switch_output.cur_file + 1;
1831 
1832 		if (n >= rec->switch_output.num_files)
1833 			n = 0;
1834 		rec->switch_output.cur_file = n;
1835 		if (rec->switch_output.filenames[n]) {
1836 			remove(rec->switch_output.filenames[n]);
1837 			zfree(&rec->switch_output.filenames[n]);
1838 		}
1839 		rec->switch_output.filenames[n] = new_filename;
1840 	} else {
1841 		free(new_filename);
1842 	}
1843 
1844 	/* Output tracking events */
1845 	if (!at_exit) {
1846 		record__synthesize(rec, false);
1847 
1848 		/*
1849 		 * In 'perf record --switch-output' without -a,
1850 		 * record__synthesize() in record__switch_output() won't
1851 		 * generate tracking events because there's no thread_map
1852 		 * in evlist, which causes the newly created perf.data to
1853 		 * contain no map and comm information.
1854 		 * Create a fake thread_map and directly call
1855 		 * perf_event__synthesize_thread_map() for those events.
1856 		 */
1857 		if (target__none(&rec->opts.target))
1858 			record__synthesize_workload(rec, false);
1859 		write_finished_init(rec, false);
1860 	}
1861 	return fd;
1862 }
1863 
1864 static volatile int workload_exec_errno;
1865 
1866 /*
1867  * evlist__prepare_workload will send a SIGUSR1
1868  * if the fork fails, since we asked for it by setting its
1869  * want_signal to true.
1870  */
1871 static void workload_exec_failed_signal(int signo __maybe_unused,
1872 					siginfo_t *info,
1873 					void *ucontext __maybe_unused)
1874 {
1875 	workload_exec_errno = info->si_value.sival_int;
1876 	done = 1;
1877 	child_finished = 1;
1878 }
1879 
1880 static void snapshot_sig_handler(int sig);
1881 static void alarm_sig_handler(int sig);
1882 
1883 static const struct perf_event_mmap_page *evlist__pick_pc(struct evlist *evlist)
1884 {
1885 	if (evlist) {
1886 		if (evlist->mmap && evlist->mmap[0].core.base)
1887 			return evlist->mmap[0].core.base;
1888 		if (evlist->overwrite_mmap && evlist->overwrite_mmap[0].core.base)
1889 			return evlist->overwrite_mmap[0].core.base;
1890 	}
1891 	return NULL;
1892 }
1893 
1894 static const struct perf_event_mmap_page *record__pick_pc(struct record *rec)
1895 {
1896 	const struct perf_event_mmap_page *pc = evlist__pick_pc(rec->evlist);
1897 	if (pc)
1898 		return pc;
1899 	return NULL;
1900 }
1901 
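/*
 * Emit the synthetic events that describe already-existing state (time
 * conversion, id index, auxtrace info, kernel and module mmaps, thread and
 * CPU maps, BPF and cgroup events) into the output.
 */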
1902 static int record__synthesize(struct record *rec, bool tail)
1903 {
1904 	struct perf_session *session = rec->session;
1905 	struct machine *machine = &session->machines.host;
1906 	struct perf_data *data = &rec->data;
1907 	struct record_opts *opts = &rec->opts;
1908 	struct perf_tool *tool = &rec->tool;
1909 	int err = 0;
1910 	event_op f = process_synthesized_event;
1911 
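	/*
	 * With --tail-synthesize the non-sample events are emitted at the
	 * end of the session instead of its beginning, so bail out if the
	 * requested phase (head vs. tail) doesn't match this call.
	 */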
1912 	if (rec->opts.tail_synthesize != tail)
1913 		return 0;
1914 
1915 	if (data->is_pipe) {
1916 		err = perf_event__synthesize_for_pipe(tool, session, data,
1917 						      process_synthesized_event);
1918 		if (err < 0)
1919 			goto out;
1920 
1921 		rec->bytes_written += err;
1922 	}
1923 
1924 	err = perf_event__synth_time_conv(record__pick_pc(rec), tool,
1925 					  process_synthesized_event, machine);
1926 	if (err)
1927 		goto out;
1928 
1929 	/* Synthesize id_index before auxtrace_info */
1930 	err = perf_event__synthesize_id_index(tool,
1931 					      process_synthesized_event,
1932 					      session->evlist, machine);
1933 	if (err)
1934 		goto out;
1935 
1936 	if (rec->opts.full_auxtrace) {
1937 		err = perf_event__synthesize_auxtrace_info(rec->itr, tool,
1938 					session, process_synthesized_event);
1939 		if (err)
1940 			goto out;
1941 	}
1942 
1943 	if (!evlist__exclude_kernel(rec->evlist)) {
1944 		err = perf_event__synthesize_kernel_mmap(tool, process_synthesized_event,
1945 							 machine);
1946 		WARN_ONCE(err < 0, "Couldn't record kernel reference relocation symbol\n"
1947 				   "Symbol resolution may be skewed if relocation was used (e.g. kexec).\n"
1948 				   "Check /proc/kallsyms permission or run as root.\n");
1949 
1950 		err = perf_event__synthesize_modules(tool, process_synthesized_event,
1951 						     machine);
1952 		WARN_ONCE(err < 0, "Couldn't record kernel module information.\n"
1953 				   "Symbol resolution may be skewed if relocation was used (e.g. kexec).\n"
1954 				   "Check /proc/modules permission or run as root.\n");
1955 	}
1956 
1957 	if (perf_guest) {
1958 		machines__process_guests(&session->machines,
1959 					 perf_event__synthesize_guest_os, tool);
1960 	}
1961 
1962 	err = perf_event__synthesize_extra_attr(&rec->tool,
1963 						rec->evlist,
1964 						process_synthesized_event,
1965 						data->is_pipe);
1966 	if (err)
1967 		goto out;
1968 
1969 	err = perf_event__synthesize_thread_map2(&rec->tool, rec->evlist->core.threads,
1970 						 process_synthesized_event,
1971 						NULL);
1972 	if (err < 0) {
1973 		pr_err("Couldn't synthesize thread map.\n");
1974 		return err;
1975 	}
1976 
1977 	err = perf_event__synthesize_cpu_map(&rec->tool, rec->evlist->core.all_cpus,
1978 					     process_synthesized_event, NULL);
1979 	if (err < 0) {
1980 		pr_err("Couldn't synthesize cpu map.\n");
1981 		return err;
1982 	}
1983 
1984 	err = perf_event__synthesize_bpf_events(session, process_synthesized_event,
1985 						machine, opts);
1986 	if (err < 0) {
1987 		pr_warning("Couldn't synthesize bpf events.\n");
1988 		err = 0;
1989 	}
1990 
1991 	if (rec->opts.synth & PERF_SYNTH_CGROUP) {
1992 		err = perf_event__synthesize_cgroups(tool, process_synthesized_event,
1993 						     machine);
1994 		if (err < 0) {
1995 			pr_warning("Couldn't synthesize cgroup events.\n");
1996 			err = 0;
1997 		}
1998 	}
1999 
2000 	if (rec->opts.nr_threads_synthesize > 1) {
2001 		perf_set_multithreaded();
2002 		f = process_locked_synthesized_event;
2003 	}
2004 
2005 	if (rec->opts.synth & PERF_SYNTH_TASK) {
2006 		bool needs_mmap = rec->opts.synth & PERF_SYNTH_MMAP;
2007 
2008 		err = __machine__synthesize_threads(machine, tool, &opts->target,
2009 						    rec->evlist->core.threads,
2010 						    f, needs_mmap, opts->sample_address,
2011 						    rec->opts.nr_threads_synthesize);
2012 	}
2013 
2014 	if (rec->opts.nr_threads_synthesize > 1)
2015 		perf_set_singlethreaded();
2016 
2017 out:
2018 	return err;
2019 }
2020 
2021 static int record__process_signal_event(union perf_event *event __maybe_unused, void *data)
2022 {
2023 	struct record *rec = data;
2024 	pthread_kill(rec->thread_id, SIGUSR2);
2025 	return 0;
2026 }
2027 
2028 static int record__setup_sb_evlist(struct record *rec)
2029 {
2030 	struct record_opts *opts = &rec->opts;
2031 
2032 	if (rec->sb_evlist != NULL) {
2033 		/*
2034 		 * We get here if --switch-output-event populated the
2035 		 * sb_evlist, so associate a callback that will send a SIGUSR2
2036 		 * to the main thread.
2037 		 */
2038 		evlist__set_cb(rec->sb_evlist, record__process_signal_event, rec);
2039 		rec->thread_id = pthread_self();
2040 	}
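	/*
	 * Unless --no-bpf-event was given, also ask for PERF_RECORD_BPF_EVENT
	 * side band events so that BPF programs loaded while recording can
	 * still be annotated afterwards.
	 */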
2041 #ifdef HAVE_LIBBPF_SUPPORT
2042 	if (!opts->no_bpf_event) {
2043 		if (rec->sb_evlist == NULL) {
2044 			rec->sb_evlist = evlist__new();
2045 
2046 			if (rec->sb_evlist == NULL) {
2047 				pr_err("Couldn't create side band evlist.\n");
2048 				return -1;
2049 			}
2050 		}
2051 
2052 		if (evlist__add_bpf_sb_event(rec->sb_evlist, &rec->session->header.env)) {
2053 			pr_err("Couldn't ask for PERF_RECORD_BPF_EVENT side band events.\n");
2054 			return -1;
2055 		}
2056 	}
2057 #endif
2058 	if (evlist__start_sb_thread(rec->sb_evlist, &rec->opts.target)) {
2059 		pr_debug("Couldn't start the BPF side band thread:\nBPF programs starting from now on won't be annotatable\n");
2060 		opts->no_bpf_event = true;
2061 	}
2062 
2063 	return 0;
2064 }
2065 
2066 static int record__init_clock(struct record *rec)
2067 {
2068 	struct perf_session *session = rec->session;
2069 	struct timespec ref_clockid;
2070 	struct timeval ref_tod;
2071 	u64 ref;
2072 
2073 	if (!rec->opts.use_clockid)
2074 		return 0;
2075 
2076 	if (rec->opts.clockid_res_ns)
2077 		session->header.env.clock.clockid_res_ns = rec->opts.clockid_res_ns;
2078 
2079 	session->header.env.clock.clockid = rec->opts.clockid;
2080 
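	/*
	 * Take a wall clock (TOD) and a clockid reference right after one
	 * another so the two clocks can be correlated when the recorded
	 * data is processed later.
	 */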
2081 	if (gettimeofday(&ref_tod, NULL) != 0) {
2082 		pr_err("gettimeofday failed, cannot set reference time.\n");
2083 		return -1;
2084 	}
2085 
2086 	if (clock_gettime(rec->opts.clockid, &ref_clockid)) {
2087 		pr_err("clock_gettime failed, cannot set reference time.\n");
2088 		return -1;
2089 	}
2090 
2091 	ref = (u64) ref_tod.tv_sec * NSEC_PER_SEC +
2092 	      (u64) ref_tod.tv_usec * NSEC_PER_USEC;
2093 
2094 	session->header.env.clock.tod_ns = ref;
2095 
2096 	ref = (u64) ref_clockid.tv_sec * NSEC_PER_SEC +
2097 	      (u64) ref_clockid.tv_nsec;
2098 
2099 	session->header.env.clock.clockid_ns = ref;
2100 	return 0;
2101 }
2102 
2103 static void hit_auxtrace_snapshot_trigger(struct record *rec)
2104 {
2105 	if (trigger_is_ready(&auxtrace_snapshot_trigger)) {
2106 		trigger_hit(&auxtrace_snapshot_trigger);
2107 		auxtrace_record__snapshot_started = 1;
2108 		if (auxtrace_record__snapshot_start(rec->itr))
2109 			trigger_error(&auxtrace_snapshot_trigger);
2110 	}
2111 }
2112 
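/*
 * On hybrid systems the same event name may exist on several PMUs, so
 * rewrite plain event names as "<pmu>/<event>/" to keep them unique in
 * the recorded data.
 */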
2113 static void record__uniquify_name(struct record *rec)
2114 {
2115 	struct evsel *pos;
2116 	struct evlist *evlist = rec->evlist;
2117 	char *new_name;
2118 	int ret;
2119 
2120 	if (!perf_pmu__has_hybrid())
2121 		return;
2122 
2123 	evlist__for_each_entry(evlist, pos) {
2124 		if (!evsel__is_hybrid(pos))
2125 			continue;
2126 
2127 		if (strchr(pos->name, '/'))
2128 			continue;
2129 
2130 		ret = asprintf(&new_name, "%s/%s/",
2131 			       pos->pmu_name, pos->name);
2132 		if (ret > 0) {
2133 			free(pos->name);
2134 			pos->name = new_name;
2135 		}
2136 	}
2137 }
2138 
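/*
 * Close the write end of the worker's message pipe and then wait for the
 * worker's acknowledgement on the ack pipe before returning.
 */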
2139 static int record__terminate_thread(struct record_thread *thread_data)
2140 {
2141 	int err;
2142 	enum thread_msg ack = THREAD_MSG__UNDEFINED;
2143 	pid_t tid = thread_data->tid;
2144 
2145 	close(thread_data->pipes.msg[1]);
2146 	thread_data->pipes.msg[1] = -1;
2147 	err = read(thread_data->pipes.ack[0], &ack, sizeof(ack));
2148 	if (err > 0)
2149 		pr_debug2("threads[%d]: sent %s\n", tid, thread_msg_tags[ack]);
2150 	else
2151 		pr_warning("threads[%d]: failed to receive termination notification from %d\n",
2152 			   thread->tid, tid);
2153 
2154 	return 0;
2155 }
2156 
2157 static int record__start_threads(struct record *rec)
2158 {
2159 	int t, tt, err, ret = 0, nr_threads = rec->nr_threads;
2160 	struct record_thread *thread_data = rec->thread_data;
2161 	sigset_t full, mask;
2162 	pthread_t handle;
2163 	pthread_attr_t attrs;
2164 
2165 	thread = &thread_data[0];
2166 
2167 	if (!record__threads_enabled(rec))
2168 		return 0;
2169 
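	/*
	 * Block all signals while the worker threads are created so they
	 * inherit a fully blocked mask; signal handling stays with the main
	 * thread once its original mask is restored below.
	 */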
2170 	sigfillset(&full);
2171 	if (sigprocmask(SIG_SETMASK, &full, &mask)) {
2172 		pr_err("Failed to block signals on threads start: %s\n", strerror(errno));
2173 		return -1;
2174 	}
2175 
2176 	pthread_attr_init(&attrs);
2177 	pthread_attr_setdetachstate(&attrs, PTHREAD_CREATE_DETACHED);
2178 
2179 	for (t = 1; t < nr_threads; t++) {
2180 		enum thread_msg msg = THREAD_MSG__UNDEFINED;
2181 
2182 #ifdef HAVE_PTHREAD_ATTR_SETAFFINITY_NP
2183 		pthread_attr_setaffinity_np(&attrs,
2184 					    MMAP_CPU_MASK_BYTES(&(thread_data[t].mask->affinity)),
2185 					    (cpu_set_t *)(thread_data[t].mask->affinity.bits));
2186 #endif
2187 		if (pthread_create(&handle, &attrs, record__thread, &thread_data[t])) {
2188 			for (tt = 1; tt < t; tt++)
2189 				record__terminate_thread(&thread_data[tt]);
2190 			pr_err("Failed to start threads: %s\n", strerror(errno));
2191 			ret = -1;
2192 			goto out_err;
2193 		}
2194 
2195 		err = read(thread_data[t].pipes.ack[0], &msg, sizeof(msg));
2196 		if (err > 0)
2197 			pr_debug2("threads[%d]: sent %s\n", rec->thread_data[t].tid,
2198 				  thread_msg_tags[msg]);
2199 		else
2200 			pr_warning("threads[%d]: failed to receive start notification from %d\n",
2201 				   thread->tid, rec->thread_data[t].tid);
2202 	}
2203 
2204 	sched_setaffinity(0, MMAP_CPU_MASK_BYTES(&thread->mask->affinity),
2205 			(cpu_set_t *)thread->mask->affinity.bits);
2206 
2207 	pr_debug("threads[%d]: started on cpu%d\n", thread->tid, sched_getcpu());
2208 
2209 out_err:
2210 	pthread_attr_destroy(&attrs);
2211 
2212 	if (sigprocmask(SIG_SETMASK, &mask, NULL)) {
2213 		pr_err("Failed to unblock signals on threads start: %s\n", strerror(errno));
2214 		ret = -1;
2215 	}
2216 
2217 	return ret;
2218 }
2219 
2220 static int record__stop_threads(struct record *rec)
2221 {
2222 	int t;
2223 	struct record_thread *thread_data = rec->thread_data;
2224 
2225 	for (t = 1; t < rec->nr_threads; t++)
2226 		record__terminate_thread(&thread_data[t]);
2227 
2228 	for (t = 0; t < rec->nr_threads; t++) {
2229 		rec->samples += thread_data[t].samples;
2230 		if (!record__threads_enabled(rec))
2231 			continue;
2232 		rec->session->bytes_transferred += thread_data[t].bytes_transferred;
2233 		rec->session->bytes_compressed += thread_data[t].bytes_compressed;
2234 		pr_debug("threads[%d]: samples=%llu, wakes=%lu, ", thread_data[t].tid,
2235 			 thread_data[t].samples, thread_data[t].waking);
2236 		if (thread_data[t].bytes_transferred && thread_data[t].bytes_compressed)
2237 			pr_debug("transferred=%" PRIu64 ", compressed=%" PRIu64 "\n",
2238 				 thread_data[t].bytes_transferred, thread_data[t].bytes_compressed);
2239 		else
2240 			pr_debug("written=%" PRIu64 "\n", thread_data[t].bytes_written);
2241 	}
2242 
2243 	return 0;
2244 }
2245 
2246 static unsigned long record__waking(struct record *rec)
2247 {
2248 	int t;
2249 	unsigned long waking = 0;
2250 	struct record_thread *thread_data = rec->thread_data;
2251 
2252 	for (t = 0; t < rec->nr_threads; t++)
2253 		waking += thread_data[t].waking;
2254 
2255 	return waking;
2256 }
2257 
2258 static int __cmd_record(struct record *rec, int argc, const char **argv)
2259 {
2260 	int err;
2261 	int status = 0;
2262 	const bool forks = argc > 0;
2263 	struct perf_tool *tool = &rec->tool;
2264 	struct record_opts *opts = &rec->opts;
2265 	struct perf_data *data = &rec->data;
2266 	struct perf_session *session;
2267 	bool disabled = false, draining = false;
2268 	int fd;
2269 	float ratio = 0;
2270 	enum evlist_ctl_cmd cmd = EVLIST_CTL_CMD_UNSUPPORTED;
2271 
2272 	atexit(record__sig_exit);
2273 	signal(SIGCHLD, sig_handler);
2274 	signal(SIGINT, sig_handler);
2275 	signal(SIGTERM, sig_handler);
2276 	signal(SIGSEGV, sigsegv_handler);
2277 
2278 	if (rec->opts.record_namespaces)
2279 		tool->namespace_events = true;
2280 
2281 	if (rec->opts.record_cgroup) {
2282 #ifdef HAVE_FILE_HANDLE
2283 		tool->cgroup_events = true;
2284 #else
2285 		pr_err("cgroup tracking is not supported\n");
2286 		return -1;
2287 #endif
2288 	}
2289 
2290 	if (rec->opts.auxtrace_snapshot_mode || rec->switch_output.enabled) {
2291 		signal(SIGUSR2, snapshot_sig_handler);
2292 		if (rec->opts.auxtrace_snapshot_mode)
2293 			trigger_on(&auxtrace_snapshot_trigger);
2294 		if (rec->switch_output.enabled)
2295 			trigger_on(&switch_output_trigger);
2296 	} else {
2297 		signal(SIGUSR2, SIG_IGN);
2298 	}
2299 
2300 	session = perf_session__new(data, tool);
2301 	if (IS_ERR(session)) {
2302 		pr_err("Perf session creation failed.\n");
2303 		return PTR_ERR(session);
2304 	}
2305 
2306 	if (record__threads_enabled(rec)) {
2307 		if (perf_data__is_pipe(&rec->data)) {
2308 			pr_err("Parallel trace streaming is not available in pipe mode.\n");
2309 			return -1;
2310 		}
2311 		if (rec->opts.full_auxtrace) {
2312 			pr_err("Parallel trace streaming is not available in AUX area tracing mode.\n");
2313 			return -1;
2314 		}
2315 	}
2316 
2317 	fd = perf_data__fd(data);
2318 	rec->session = session;
2319 
2320 	if (zstd_init(&session->zstd_data, rec->opts.comp_level) < 0) {
2321 		pr_err("Compression initialization failed.\n");
2322 		return -1;
2323 	}
2324 #ifdef HAVE_EVENTFD_SUPPORT
2325 	done_fd = eventfd(0, EFD_NONBLOCK);
2326 	if (done_fd < 0) {
2327 		pr_err("Failed to create wakeup eventfd, error: %m\n");
2328 		status = -1;
2329 		goto out_delete_session;
2330 	}
2331 	err = evlist__add_wakeup_eventfd(rec->evlist, done_fd);
2332 	if (err < 0) {
2333 		pr_err("Failed to add wakeup eventfd to poll list\n");
2334 		status = err;
2335 		goto out_delete_session;
2336 	}
2337 #endif // HAVE_EVENTFD_SUPPORT
2338 
2339 	session->header.env.comp_type  = PERF_COMP_ZSTD;
2340 	session->header.env.comp_level = rec->opts.comp_level;
2341 
2342 	if (rec->opts.kcore &&
2343 	    !record__kcore_readable(&session->machines.host)) {
2344 		pr_err("ERROR: kcore is not readable.\n");
2345 		return -1;
2346 	}
2347 
2348 	if (record__init_clock(rec))
2349 		return -1;
2350 
2351 	record__init_features(rec);
2352 
2353 	if (forks) {
2354 		err = evlist__prepare_workload(rec->evlist, &opts->target, argv, data->is_pipe,
2355 					       workload_exec_failed_signal);
2356 		if (err < 0) {
2357 			pr_err("Couldn't run the workload!\n");
2358 			status = err;
2359 			goto out_delete_session;
2360 		}
2361 	}
2362 
2363 	/*
2364 	 * If we have just single event and are sending data
2365 	 * through pipe, we need to force the ids allocation,
2366 	 * because we synthesize event name through the pipe
2367 	 * and need the id for that.
2368 	 */
2369 	if (data->is_pipe && rec->evlist->core.nr_entries == 1)
2370 		rec->opts.sample_id = true;
2371 
2372 	record__uniquify_name(rec);
2373 
2374 	if (record__open(rec) != 0) {
2375 		err = -1;
2376 		goto out_free_threads;
2377 	}
2378 	session->header.env.comp_mmap_len = session->evlist->core.mmap_len;
2379 
2380 	if (rec->opts.kcore) {
2381 		err = record__kcore_copy(&session->machines.host, data);
2382 		if (err) {
2383 			pr_err("ERROR: Failed to copy kcore\n");
2384 			goto out_free_threads;
2385 		}
2386 	}
2387 
2388 	err = bpf__apply_obj_config();
2389 	if (err) {
2390 		char errbuf[BUFSIZ];
2391 
2392 		bpf__strerror_apply_obj_config(err, errbuf, sizeof(errbuf));
2393 		pr_err("ERROR: Apply config to BPF failed: %s\n",
2394 			 errbuf);
2395 		goto out_free_threads;
2396 	}
2397 
2398 	/*
2399 	 * Normally perf_session__new would do this, but it doesn't have the
2400 	 * evlist.
2401 	 */
2402 	if (rec->tool.ordered_events && !evlist__sample_id_all(rec->evlist)) {
2403 		pr_warning("WARNING: No sample_id_all support, falling back to unordered processing\n");
2404 		rec->tool.ordered_events = false;
2405 	}
2406 
2407 	if (!rec->evlist->core.nr_groups)
2408 		perf_header__clear_feat(&session->header, HEADER_GROUP_DESC);
2409 
2410 	if (data->is_pipe) {
2411 		err = perf_header__write_pipe(fd);
2412 		if (err < 0)
2413 			goto out_free_threads;
2414 	} else {
2415 		err = perf_session__write_header(session, rec->evlist, fd, false);
2416 		if (err < 0)
2417 			goto out_free_threads;
2418 	}
2419 
2420 	err = -1;
2421 	if (!rec->no_buildid
2422 	    && !perf_header__has_feat(&session->header, HEADER_BUILD_ID)) {
2423 		pr_err("Couldn't generate buildids. "
2424 		       "Use --no-buildid to profile anyway.\n");
2425 		goto out_free_threads;
2426 	}
2427 
2428 	err = record__setup_sb_evlist(rec);
2429 	if (err)
2430 		goto out_free_threads;
2431 
2432 	err = record__synthesize(rec, false);
2433 	if (err < 0)
2434 		goto out_free_threads;
2435 
2436 	if (rec->realtime_prio) {
2437 		struct sched_param param;
2438 
2439 		param.sched_priority = rec->realtime_prio;
2440 		if (sched_setscheduler(0, SCHED_FIFO, &param)) {
2441 			pr_err("Could not set realtime priority.\n");
2442 			err = -1;
2443 			goto out_free_threads;
2444 		}
2445 	}
2446 
2447 	if (record__start_threads(rec))
2448 		goto out_free_threads;
2449 
2450 	/*
2451 	 * When perf is starting the traced process, all the events
2452 	 * (apart from group members) have enable_on_exec=1 set,
2453 	 * so don't spoil it by prematurely enabling them.
2454 	 */
2455 	if (!target__none(&opts->target) && !opts->initial_delay)
2456 		evlist__enable(rec->evlist);
2457 
2458 	/*
2459 	 * Let the child rip
2460 	 */
2461 	if (forks) {
2462 		struct machine *machine = &session->machines.host;
2463 		union perf_event *event;
2464 		pid_t tgid;
2465 
2466 		event = malloc(sizeof(event->comm) + machine->id_hdr_size);
2467 		if (event == NULL) {
2468 			err = -ENOMEM;
2469 			goto out_child;
2470 		}
2471 
2472 		/*
2473 		 * Some H/W events are generated before COMM event
2474 		 * which is emitted during exec(), so perf script
2475 		 * cannot see a correct process name for those events.
2476 		 * Synthesize COMM event to prevent it.
2477 		 */
2478 		tgid = perf_event__synthesize_comm(tool, event,
2479 						   rec->evlist->workload.pid,
2480 						   process_synthesized_event,
2481 						   machine);
2482 		free(event);
2483 
2484 		if (tgid == -1)
2485 			goto out_child;
2486 
2487 		event = malloc(sizeof(event->namespaces) +
2488 			       (NR_NAMESPACES * sizeof(struct perf_ns_link_info)) +
2489 			       machine->id_hdr_size);
2490 		if (event == NULL) {
2491 			err = -ENOMEM;
2492 			goto out_child;
2493 		}
2494 
2495 		/*
2496 		 * Synthesize NAMESPACES event for the command specified.
2497 		 */
2498 		perf_event__synthesize_namespaces(tool, event,
2499 						  rec->evlist->workload.pid,
2500 						  tgid, process_synthesized_event,
2501 						  machine);
2502 		free(event);
2503 
2504 		evlist__start_workload(rec->evlist);
2505 	}
2506 
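	/*
	 * -D/--delay: a positive value enables the events after the given
	 * number of milliseconds, a negative one leaves them disabled until
	 * they are explicitly enabled (e.g. via the --control fd).
	 */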
2507 	if (opts->initial_delay) {
2508 		pr_info(EVLIST_DISABLED_MSG);
2509 		if (opts->initial_delay > 0) {
2510 			usleep(opts->initial_delay * USEC_PER_MSEC);
2511 			evlist__enable(rec->evlist);
2512 			pr_info(EVLIST_ENABLED_MSG);
2513 		}
2514 	}
2515 
2516 	trigger_ready(&auxtrace_snapshot_trigger);
2517 	trigger_ready(&switch_output_trigger);
2518 	perf_hooks__invoke_record_start();
2519 
2520 	/*
2521 	 * Must write FINISHED_INIT so it will be seen after all other
2522 	 * synthesized user events, but before any regular events.
2523 	 */
2524 	err = write_finished_init(rec, false);
2525 	if (err < 0)
2526 		goto out_child;
2527 
2528 	for (;;) {
2529 		unsigned long long hits = thread->samples;
2530 
2531 		/*
2532 		 * rec->evlist->bkw_mmap_state can be BKW_MMAP_EMPTY here:
2533 		 * that happens when done == true and hits != rec->samples
2534 		 * in the previous round.
2535 		 *
2536 		 * evlist__toggle_bkw_mmap ensures we never convert
2537 		 * BKW_MMAP_EMPTY to BKW_MMAP_DATA_PENDING.
2538 		 */
2539 		if (trigger_is_hit(&switch_output_trigger) || done || draining)
2540 			evlist__toggle_bkw_mmap(rec->evlist, BKW_MMAP_DATA_PENDING);
2541 
2542 		if (record__mmap_read_all(rec, false) < 0) {
2543 			trigger_error(&auxtrace_snapshot_trigger);
2544 			trigger_error(&switch_output_trigger);
2545 			err = -1;
2546 			goto out_child;
2547 		}
2548 
2549 		if (auxtrace_record__snapshot_started) {
2550 			auxtrace_record__snapshot_started = 0;
2551 			if (!trigger_is_error(&auxtrace_snapshot_trigger))
2552 				record__read_auxtrace_snapshot(rec, false);
2553 			if (trigger_is_error(&auxtrace_snapshot_trigger)) {
2554 				pr_err("AUX area tracing snapshot failed\n");
2555 				err = -1;
2556 				goto out_child;
2557 			}
2558 		}
2559 
2560 		if (trigger_is_hit(&switch_output_trigger)) {
2561 			/*
2562 			 * If switch_output_trigger is hit, the data in the
2563 			 * overwritable ring buffer should have been collected,
2564 			 * so bkw_mmap_state should be set to BKW_MMAP_EMPTY.
2565 			 *
2566 			 * If SIGUSR2 was raised after or during record__mmap_read_all(),
2567 			 * record__mmap_read_all() didn't collect data from the
2568 			 * overwritable ring buffer. Read again.
2569 			 */
2570 			if (rec->evlist->bkw_mmap_state == BKW_MMAP_RUNNING)
2571 				continue;
2572 			trigger_ready(&switch_output_trigger);
2573 
2574 			/*
2575 			 * Re-enable events in the overwrite ring buffer after
2576 			 * record__mmap_read_all(): we should have collected
2577 			 * data from it.
2578 			 */
2579 			evlist__toggle_bkw_mmap(rec->evlist, BKW_MMAP_RUNNING);
2580 
2581 			if (!quiet)
2582 				fprintf(stderr, "[ perf record: dump data: Woken up %ld times ]\n",
2583 					record__waking(rec));
2584 			thread->waking = 0;
2585 			fd = record__switch_output(rec, false);
2586 			if (fd < 0) {
2587 				pr_err("Failed to switch to new file\n");
2588 				trigger_error(&switch_output_trigger);
2589 				err = fd;
2590 				goto out_child;
2591 			}
2592 
2593 			/* re-arm the alarm */
2594 			if (rec->switch_output.time)
2595 				alarm(rec->switch_output.time);
2596 		}
2597 
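		/*
		 * No new samples were read in this iteration: either we are
		 * done/draining and can leave the loop, or we block in poll()
		 * until the kernel wakes us up again.
		 */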
2598 		if (hits == thread->samples) {
2599 			if (done || draining)
2600 				break;
2601 			err = fdarray__poll(&thread->pollfd, -1);
2602 			/*
2603 			 * Propagate error, only if there's any. Ignore positive
2604 			 * number of returned events and interrupt error.
2605 			 */
2606 			if (err > 0 || (err < 0 && errno == EINTR))
2607 				err = 0;
2608 			thread->waking++;
2609 
2610 			if (fdarray__filter(&thread->pollfd, POLLERR | POLLHUP,
2611 					    record__thread_munmap_filtered, NULL) == 0)
2612 				draining = true;
2613 
2614 			err = record__update_evlist_pollfd_from_thread(rec, rec->evlist, thread);
2615 			if (err)
2616 				goto out_child;
2617 			evlist__ctlfd_update(rec->evlist,
2618 				&thread->pollfd.entries[thread->ctlfd_pos]);
2619 		}
2620 
2621 		if (evlist__ctlfd_process(rec->evlist, &cmd) > 0) {
2622 			switch (cmd) {
2623 			case EVLIST_CTL_CMD_SNAPSHOT:
2624 				hit_auxtrace_snapshot_trigger(rec);
2625 				evlist__ctlfd_ack(rec->evlist);
2626 				break;
2627 			case EVLIST_CTL_CMD_STOP:
2628 				done = 1;
2629 				break;
2630 			case EVLIST_CTL_CMD_ACK:
2631 			case EVLIST_CTL_CMD_UNSUPPORTED:
2632 			case EVLIST_CTL_CMD_ENABLE:
2633 			case EVLIST_CTL_CMD_DISABLE:
2634 			case EVLIST_CTL_CMD_EVLIST:
2635 			case EVLIST_CTL_CMD_PING:
2636 			default:
2637 				break;
2638 			}
2639 		}
2640 
2641 		/*
2642 		 * When perf is starting the traced process, the events die
2643 		 * with it at the end and we wait for that, so there is no
2644 		 * need to disable them in this case.
2645 		 */
2646 		if (done && !disabled && !target__none(&opts->target)) {
2647 			trigger_off(&auxtrace_snapshot_trigger);
2648 			evlist__disable(rec->evlist);
2649 			disabled = true;
2650 		}
2651 	}
2652 
2653 	trigger_off(&auxtrace_snapshot_trigger);
2654 	trigger_off(&switch_output_trigger);
2655 
2656 	if (opts->auxtrace_snapshot_on_exit)
2657 		record__auxtrace_snapshot_exit(rec);
2658 
2659 	if (forks && workload_exec_errno) {
2660 		char msg[STRERR_BUFSIZE], strevsels[2048];
2661 		const char *emsg = str_error_r(workload_exec_errno, msg, sizeof(msg));
2662 
2663 		evlist__scnprintf_evsels(rec->evlist, sizeof(strevsels), strevsels);
2664 
2665 		pr_err("Failed to collect '%s' for the '%s' workload: %s\n",
2666 			strevsels, argv[0], emsg);
2667 		err = -1;
2668 		goto out_child;
2669 	}
2670 
2671 	if (!quiet)
2672 		fprintf(stderr, "[ perf record: Woken up %ld times to write data ]\n",
2673 			record__waking(rec));
2674 
2675 	write_finished_init(rec, true);
2676 
2677 	if (target__none(&rec->opts.target))
2678 		record__synthesize_workload(rec, true);
2679 
2680 out_child:
2681 	record__stop_threads(rec);
2682 	record__mmap_read_all(rec, true);
2683 out_free_threads:
2684 	record__free_thread_data(rec);
2685 	evlist__finalize_ctlfd(rec->evlist);
2686 	record__aio_mmap_read_sync(rec);
2687 
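	/* comp_ratio is stored in the header as an integer, hence the + 0.5 rounding below */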
2688 	if (rec->session->bytes_transferred && rec->session->bytes_compressed) {
2689 		ratio = (float)rec->session->bytes_transferred/(float)rec->session->bytes_compressed;
2690 		session->header.env.comp_ratio = ratio + 0.5;
2691 	}
2692 
2693 	if (forks) {
2694 		int exit_status;
2695 
2696 		if (!child_finished)
2697 			kill(rec->evlist->workload.pid, SIGTERM);
2698 
2699 		wait(&exit_status);
2700 
2701 		if (err < 0)
2702 			status = err;
2703 		else if (WIFEXITED(exit_status))
2704 			status = WEXITSTATUS(exit_status);
2705 		else if (WIFSIGNALED(exit_status))
2706 			signr = WTERMSIG(exit_status);
2707 	} else
2708 		status = err;
2709 
2710 	if (rec->off_cpu)
2711 		rec->bytes_written += off_cpu_write(rec->session);
2712 
2713 	record__synthesize(rec, true);
2714 	/* this will be recalculated during process_buildids() */
2715 	rec->samples = 0;
2716 
2717 	if (!err) {
2718 		if (!rec->timestamp_filename) {
2719 			record__finish_output(rec);
2720 		} else {
2721 			fd = record__switch_output(rec, true);
2722 			if (fd < 0) {
2723 				status = fd;
2724 				goto out_delete_session;
2725 			}
2726 		}
2727 	}
2728 
2729 	perf_hooks__invoke_record_end();
2730 
2731 	if (!err && !quiet) {
2732 		char samples[128];
2733 		const char *postfix = rec->timestamp_filename ?
2734 					".<timestamp>" : "";
2735 
2736 		if (rec->samples && !rec->opts.full_auxtrace)
2737 			scnprintf(samples, sizeof(samples),
2738 				  " (%" PRIu64 " samples)", rec->samples);
2739 		else
2740 			samples[0] = '\0';
2741 
2742 		fprintf(stderr,	"[ perf record: Captured and wrote %.3f MB %s%s%s",
2743 			perf_data__size(data) / 1024.0 / 1024.0,
2744 			data->path, postfix, samples);
2745 		if (ratio) {
2746 			fprintf(stderr,	", compressed (original %.3f MB, ratio is %.3f)",
2747 					rec->session->bytes_transferred / 1024.0 / 1024.0,
2748 					ratio);
2749 		}
2750 		fprintf(stderr, " ]\n");
2751 	}
2752 
2753 out_delete_session:
2754 #ifdef HAVE_EVENTFD_SUPPORT
2755 	if (done_fd >= 0)
2756 		close(done_fd);
2757 #endif
2758 	zstd_fini(&session->zstd_data);
2759 	perf_session__delete(session);
2760 
2761 	if (!opts->no_bpf_event)
2762 		evlist__stop_sb_thread(rec->sb_evlist);
2763 	return status;
2764 }
2765 
2766 static void callchain_debug(struct callchain_param *callchain)
2767 {
2768 	static const char *str[CALLCHAIN_MAX] = { "NONE", "FP", "DWARF", "LBR" };
2769 
2770 	pr_debug("callchain: type %s\n", str[callchain->record_mode]);
2771 
2772 	if (callchain->record_mode == CALLCHAIN_DWARF)
2773 		pr_debug("callchain: stack dump size %d\n",
2774 			 callchain->dump_size);
2775 }
2776 
2777 int record_opts__parse_callchain(struct record_opts *record,
2778 				 struct callchain_param *callchain,
2779 				 const char *arg, bool unset)
2780 {
2781 	int ret;
2782 	callchain->enabled = !unset;
2783 
2784 	/* --no-call-graph */
2785 	if (unset) {
2786 		callchain->record_mode = CALLCHAIN_NONE;
2787 		pr_debug("callchain: disabled\n");
2788 		return 0;
2789 	}
2790 
2791 	ret = parse_callchain_record_opt(arg, callchain);
2792 	if (!ret) {
2793 		/* Enable data address sampling for DWARF unwind. */
2794 		if (callchain->record_mode == CALLCHAIN_DWARF)
2795 			record->sample_address = true;
2796 		callchain_debug(callchain);
2797 	}
2798 
2799 	return ret;
2800 }
2801 
2802 int record_parse_callchain_opt(const struct option *opt,
2803 			       const char *arg,
2804 			       int unset)
2805 {
2806 	return record_opts__parse_callchain(opt->value, &callchain_param, arg, unset);
2807 }
2808 
2809 int record_callchain_opt(const struct option *opt,
2810 			 const char *arg __maybe_unused,
2811 			 int unset __maybe_unused)
2812 {
2813 	struct callchain_param *callchain = opt->value;
2814 
2815 	callchain->enabled = true;
2816 
2817 	if (callchain->record_mode == CALLCHAIN_NONE)
2818 		callchain->record_mode = CALLCHAIN_FP;
2819 
2820 	callchain_debug(callchain);
2821 	return 0;
2822 }
2823 
2824 static int perf_record_config(const char *var, const char *value, void *cb)
2825 {
2826 	struct record *rec = cb;
2827 
2828 	if (!strcmp(var, "record.build-id")) {
2829 		if (!strcmp(value, "cache"))
2830 			rec->no_buildid_cache = false;
2831 		else if (!strcmp(value, "no-cache"))
2832 			rec->no_buildid_cache = true;
2833 		else if (!strcmp(value, "skip"))
2834 			rec->no_buildid = true;
2835 		else if (!strcmp(value, "mmap"))
2836 			rec->buildid_mmap = true;
2837 		else
2838 			return -1;
2839 		return 0;
2840 	}
2841 	if (!strcmp(var, "record.call-graph")) {
2842 		var = "call-graph.record-mode";
2843 		return perf_default_config(var, value, cb);
2844 	}
2845 #ifdef HAVE_AIO_SUPPORT
2846 	if (!strcmp(var, "record.aio")) {
2847 		rec->opts.nr_cblocks = strtol(value, NULL, 0);
2848 		if (!rec->opts.nr_cblocks)
2849 			rec->opts.nr_cblocks = nr_cblocks_default;
2850 	}
2851 #endif
2852 	if (!strcmp(var, "record.debuginfod")) {
2853 		rec->debuginfod.urls = strdup(value);
2854 		if (!rec->debuginfod.urls)
2855 			return -ENOMEM;
2856 		rec->debuginfod.set = true;
2857 	}
2858 
2859 	return 0;
2860 }
2861 
2862 
2863 static int record__parse_affinity(const struct option *opt, const char *str, int unset)
2864 {
2865 	struct record_opts *opts = (struct record_opts *)opt->value;
2866 
2867 	if (unset || !str)
2868 		return 0;
2869 
2870 	if (!strcasecmp(str, "node"))
2871 		opts->affinity = PERF_AFFINITY_NODE;
2872 	else if (!strcasecmp(str, "cpu"))
2873 		opts->affinity = PERF_AFFINITY_CPU;
2874 
2875 	return 0;
2876 }
2877 
2878 static int record__mmap_cpu_mask_alloc(struct mmap_cpu_mask *mask, int nr_bits)
2879 {
2880 	mask->nbits = nr_bits;
2881 	mask->bits = bitmap_zalloc(mask->nbits);
2882 	if (!mask->bits)
2883 		return -ENOMEM;
2884 
2885 	return 0;
2886 }
2887 
2888 static void record__mmap_cpu_mask_free(struct mmap_cpu_mask *mask)
2889 {
2890 	bitmap_free(mask->bits);
2891 	mask->nbits = 0;
2892 }
2893 
2894 static int record__thread_mask_alloc(struct thread_mask *mask, int nr_bits)
2895 {
2896 	int ret;
2897 
2898 	ret = record__mmap_cpu_mask_alloc(&mask->maps, nr_bits);
2899 	if (ret) {
2900 		mask->affinity.bits = NULL;
2901 		return ret;
2902 	}
2903 
2904 	ret = record__mmap_cpu_mask_alloc(&mask->affinity, nr_bits);
2905 	if (ret) {
2906 		record__mmap_cpu_mask_free(&mask->maps);
2907 		mask->maps.bits = NULL;
2908 	}
2909 
2910 	return ret;
2911 }
2912 
2913 static void record__thread_mask_free(struct thread_mask *mask)
2914 {
2915 	record__mmap_cpu_mask_free(&mask->maps);
2916 	record__mmap_cpu_mask_free(&mask->affinity);
2917 }
2918 
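/*
 * Parse the --threads spec: with no argument default to one thread per
 * monitored CPU, otherwise match the string against the known topology
 * tags, falling back to treating it as a user provided mask spec.
 */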
2919 static int record__parse_threads(const struct option *opt, const char *str, int unset)
2920 {
2921 	int s;
2922 	struct record_opts *opts = opt->value;
2923 
2924 	if (unset || !str || !strlen(str)) {
2925 		opts->threads_spec = THREAD_SPEC__CPU;
2926 	} else {
2927 		for (s = 1; s < THREAD_SPEC__MAX; s++) {
2928 			if (s == THREAD_SPEC__USER) {
2929 				opts->threads_user_spec = strdup(str);
2930 				if (!opts->threads_user_spec)
2931 					return -ENOMEM;
2932 				opts->threads_spec = THREAD_SPEC__USER;
2933 				break;
2934 			}
2935 			if (!strncasecmp(str, thread_spec_tags[s], strlen(thread_spec_tags[s]))) {
2936 				opts->threads_spec = s;
2937 				break;
2938 			}
2939 		}
2940 	}
2941 
2942 	if (opts->threads_spec == THREAD_SPEC__USER)
2943 		pr_debug("threads_spec: %s\n", opts->threads_user_spec);
2944 	else
2945 		pr_debug("threads_spec: %s\n", thread_spec_tags[opts->threads_spec]);
2946 
2947 	return 0;
2948 }
2949 
2950 static int parse_output_max_size(const struct option *opt,
2951 				 const char *str, int unset)
2952 {
2953 	unsigned long *s = (unsigned long *)opt->value;
2954 	static struct parse_tag tags_size[] = {
2955 		{ .tag  = 'B', .mult = 1       },
2956 		{ .tag  = 'K', .mult = 1 << 10 },
2957 		{ .tag  = 'M', .mult = 1 << 20 },
2958 		{ .tag  = 'G', .mult = 1 << 30 },
2959 		{ .tag  = 0 },
2960 	};
2961 	unsigned long val;
2962 
2963 	if (unset) {
2964 		*s = 0;
2965 		return 0;
2966 	}
2967 
2968 	val = parse_tag_value(str, tags_size);
2969 	if (val != (unsigned long) -1) {
2970 		*s = val;
2971 		return 0;
2972 	}
2973 
2974 	return -1;
2975 }
2976 
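/*
 * Parse -m/--mmap-pages: an optional second, comma separated value sets
 * the AUX area tracing mmap size ("pages[,pages]").
 */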
2977 static int record__parse_mmap_pages(const struct option *opt,
2978 				    const char *str,
2979 				    int unset __maybe_unused)
2980 {
2981 	struct record_opts *opts = opt->value;
2982 	char *s, *p;
2983 	unsigned int mmap_pages;
2984 	int ret;
2985 
2986 	if (!str)
2987 		return -EINVAL;
2988 
2989 	s = strdup(str);
2990 	if (!s)
2991 		return -ENOMEM;
2992 
2993 	p = strchr(s, ',');
2994 	if (p)
2995 		*p = '\0';
2996 
2997 	if (*s) {
2998 		ret = __evlist__parse_mmap_pages(&mmap_pages, s);
2999 		if (ret)
3000 			goto out_free;
3001 		opts->mmap_pages = mmap_pages;
3002 	}
3003 
3004 	if (!p) {
3005 		ret = 0;
3006 		goto out_free;
3007 	}
3008 
3009 	ret = __evlist__parse_mmap_pages(&mmap_pages, p + 1);
3010 	if (ret)
3011 		goto out_free;
3012 
3013 	opts->auxtrace_mmap_pages = mmap_pages;
3014 
3015 out_free:
3016 	free(s);
3017 	return ret;
3018 }
3019 
3020 void __weak arch__add_leaf_frame_record_opts(struct record_opts *opts __maybe_unused)
3021 {
3022 }
3023 
3024 static int parse_control_option(const struct option *opt,
3025 				const char *str,
3026 				int unset __maybe_unused)
3027 {
3028 	struct record_opts *opts = opt->value;
3029 
3030 	return evlist__parse_control(str, &opts->ctl_fd, &opts->ctl_fd_ack, &opts->ctl_fd_close);
3031 }
3032 
3033 static void switch_output_size_warn(struct record *rec)
3034 {
3035 	u64 wakeup_size = evlist__mmap_size(rec->opts.mmap_pages);
3036 	struct switch_output *s = &rec->switch_output;
3037 
3038 	wakeup_size /= 2;
3039 
3040 	if (s->size < wakeup_size) {
3041 		char buf[100];
3042 
3043 		unit_number__scnprintf(buf, sizeof(buf), wakeup_size);
3044 		pr_warning("WARNING: switch-output data size lower than "
3045 			   "wakeup kernel buffer size (%s) "
3046 			   "expect bigger perf.data sizes\n", buf);
3047 	}
3048 }
3049 
3050 static int switch_output_setup(struct record *rec)
3051 {
3052 	struct switch_output *s = &rec->switch_output;
3053 	static struct parse_tag tags_size[] = {
3054 		{ .tag  = 'B', .mult = 1       },
3055 		{ .tag  = 'K', .mult = 1 << 10 },
3056 		{ .tag  = 'M', .mult = 1 << 20 },
3057 		{ .tag  = 'G', .mult = 1 << 30 },
3058 		{ .tag  = 0 },
3059 	};
3060 	static struct parse_tag tags_time[] = {
3061 		{ .tag  = 's', .mult = 1        },
3062 		{ .tag  = 'm', .mult = 60       },
3063 		{ .tag  = 'h', .mult = 60*60    },
3064 		{ .tag  = 'd', .mult = 60*60*24 },
3065 		{ .tag  = 0 },
3066 	};
3067 	unsigned long val;
3068 
3069 	/*
3070 	 * If we're using --switch-output-event, then we imply
3071 	 * --switch-output=signal, as we'll send a SIGUSR2 from the side band
3072 	 * thread to its parent.
3073 	 */
3074 	if (rec->switch_output_event_set) {
3075 		if (record__threads_enabled(rec)) {
3076 			pr_warning("WARNING: --switch-output-event option is not available in parallel streaming mode.\n");
3077 			return 0;
3078 		}
3079 		goto do_signal;
3080 	}
3081 
3082 	if (!s->set)
3083 		return 0;
3084 
3085 	if (record__threads_enabled(rec)) {
3086 		pr_warning("WARNING: --switch-output option is not available in parallel streaming mode.\n");
3087 		return 0;
3088 	}
3089 
3090 	if (!strcmp(s->str, "signal")) {
3091 do_signal:
3092 		s->signal = true;
3093 		pr_debug("switch-output with SIGUSR2 signal\n");
3094 		goto enabled;
3095 	}
3096 
3097 	val = parse_tag_value(s->str, tags_size);
3098 	if (val != (unsigned long) -1) {
3099 		s->size = val;
3100 		pr_debug("switch-output with %s size threshold\n", s->str);
3101 		goto enabled;
3102 	}
3103 
3104 	val = parse_tag_value(s->str, tags_time);
3105 	if (val != (unsigned long) -1) {
3106 		s->time = val;
3107 		pr_debug("switch-output with %s time threshold (%lu seconds)\n",
3108 			 s->str, s->time);
3109 		goto enabled;
3110 	}
3111 
3112 	return -1;
3113 
3114 enabled:
3115 	rec->timestamp_filename = true;
3116 	s->enabled              = true;
3117 
3118 	if (s->size && !rec->opts.no_buffering)
3119 		switch_output_size_warn(rec);
3120 
3121 	return 0;
3122 }
3123 
3124 static const char * const __record_usage[] = {
3125 	"perf record [<options>] [<command>]",
3126 	"perf record [<options>] -- <command> [<options>]",
3127 	NULL
3128 };
3129 const char * const *record_usage = __record_usage;
3130 
3131 static int build_id__process_mmap(struct perf_tool *tool, union perf_event *event,
3132 				  struct perf_sample *sample, struct machine *machine)
3133 {
3134 	/*
3135 	 * We already have the kernel maps, put in place via perf_session__create_kernel_maps()
3136 	 * no need to add them twice.
3137 	 */
3138 	if (!(event->header.misc & PERF_RECORD_MISC_USER))
3139 		return 0;
3140 	return perf_event__process_mmap(tool, event, sample, machine);
3141 }
3142 
3143 static int build_id__process_mmap2(struct perf_tool *tool, union perf_event *event,
3144 				   struct perf_sample *sample, struct machine *machine)
3145 {
3146 	/*
3147 	 * We already have the kernel maps, put in place via perf_session__create_kernel_maps()
3148 	 * no need to add them twice.
3149 	 */
3150 	if (!(event->header.misc & PERF_RECORD_MISC_USER))
3151 		return 0;
3152 
3153 	return perf_event__process_mmap2(tool, event, sample, machine);
3154 }
3155 
3156 static int process_timestamp_boundary(struct perf_tool *tool,
3157 				      union perf_event *event __maybe_unused,
3158 				      struct perf_sample *sample,
3159 				      struct machine *machine __maybe_unused)
3160 {
3161 	struct record *rec = container_of(tool, struct record, tool);
3162 
3163 	set_timestamp_boundary(rec, sample->time);
3164 	return 0;
3165 }
3166 
3167 static int parse_record_synth_option(const struct option *opt,
3168 				     const char *str,
3169 				     int unset __maybe_unused)
3170 {
3171 	struct record_opts *opts = opt->value;
3172 	char *p = strdup(str);
3173 
3174 	if (p == NULL)
3175 		return -1;
3176 
3177 	opts->synth = parse_synth_opt(p);
3178 	free(p);
3179 
3180 	if (opts->synth < 0) {
3181 		pr_err("Invalid synth option: %s\n", str);
3182 		return -1;
3183 	}
3184 	return 0;
3185 }
3186 
3187 /*
3188  * XXX Ideally would be local to cmd_record() and passed to a record__new
3189  * because we need to have access to it in record__exit, that is called
3190  * after cmd_record() exits, but since record_options need to be accessible to
3191  * builtin-script, leave it here.
3192  *
3193  * At least we don't touch it in all the other functions here directly.
3194  *
3195  * Just say no to tons of global variables, sigh.
3196  */
3197 static struct record record = {
3198 	.opts = {
3199 		.sample_time	     = true,
3200 		.mmap_pages	     = UINT_MAX,
3201 		.user_freq	     = UINT_MAX,
3202 		.user_interval	     = ULLONG_MAX,
3203 		.freq		     = 4000,
3204 		.target		     = {
3205 			.uses_mmap   = true,
3206 			.default_per_cpu = true,
3207 		},
3208 		.mmap_flush          = MMAP_FLUSH_DEFAULT,
3209 		.nr_threads_synthesize = 1,
3210 		.ctl_fd              = -1,
3211 		.ctl_fd_ack          = -1,
3212 		.synth               = PERF_SYNTH_ALL,
3213 	},
3214 	.tool = {
3215 		.sample		= process_sample_event,
3216 		.fork		= perf_event__process_fork,
3217 		.exit		= perf_event__process_exit,
3218 		.comm		= perf_event__process_comm,
3219 		.namespaces	= perf_event__process_namespaces,
3220 		.mmap		= build_id__process_mmap,
3221 		.mmap2		= build_id__process_mmap2,
3222 		.itrace_start	= process_timestamp_boundary,
3223 		.aux		= process_timestamp_boundary,
3224 		.ordered_events	= true,
3225 	},
3226 };
3227 
3228 const char record_callchain_help[] = CALLCHAIN_RECORD_HELP
3229 	"\n\t\t\t\tDefault: fp";
3230 
3231 static bool dry_run;
3232 
3233 /*
3234  * XXX Will stay a global variable till we fix builtin-script.c to stop messing
3235  * with it and switch to use the library functions in perf_evlist that came
3236  * from builtin-record.c, i.e. use record_opts,
3237  * evlist__prepare_workload, etc instead of fork+exec'in 'perf record',
3238  * using pipes, etc.
3239  */
3240 static struct option __record_options[] = {
3241 	OPT_CALLBACK('e', "event", &record.evlist, "event",
3242 		     "event selector. use 'perf list' to list available events",
3243 		     parse_events_option),
3244 	OPT_CALLBACK(0, "filter", &record.evlist, "filter",
3245 		     "event filter", parse_filter),
3246 	OPT_CALLBACK_NOOPT(0, "exclude-perf", &record.evlist,
3247 			   NULL, "don't record events from perf itself",
3248 			   exclude_perf),
3249 	OPT_STRING('p', "pid", &record.opts.target.pid, "pid",
3250 		    "record events on existing process id"),
3251 	OPT_STRING('t', "tid", &record.opts.target.tid, "tid",
3252 		    "record events on existing thread id"),
3253 	OPT_INTEGER('r', "realtime", &record.realtime_prio,
3254 		    "collect data with this RT SCHED_FIFO priority"),
3255 	OPT_BOOLEAN(0, "no-buffering", &record.opts.no_buffering,
3256 		    "collect data without buffering"),
3257 	OPT_BOOLEAN('R', "raw-samples", &record.opts.raw_samples,
3258 		    "collect raw sample records from all opened counters"),
3259 	OPT_BOOLEAN('a', "all-cpus", &record.opts.target.system_wide,
3260 			    "system-wide collection from all CPUs"),
3261 	OPT_STRING('C', "cpu", &record.opts.target.cpu_list, "cpu",
3262 		    "list of cpus to monitor"),
3263 	OPT_U64('c', "count", &record.opts.user_interval, "event period to sample"),
3264 	OPT_STRING('o', "output", &record.data.path, "file",
3265 		    "output file name"),
3266 	OPT_BOOLEAN_SET('i', "no-inherit", &record.opts.no_inherit,
3267 			&record.opts.no_inherit_set,
3268 			"child tasks do not inherit counters"),
3269 	OPT_BOOLEAN(0, "tail-synthesize", &record.opts.tail_synthesize,
3270 		    "synthesize non-sample events at the end of output"),
3271 	OPT_BOOLEAN(0, "overwrite", &record.opts.overwrite, "use overwrite mode"),
3272 	OPT_BOOLEAN(0, "no-bpf-event", &record.opts.no_bpf_event, "do not record bpf events"),
3273 	OPT_BOOLEAN(0, "strict-freq", &record.opts.strict_freq,
3274 		    "Fail if the specified frequency can't be used"),
3275 	OPT_CALLBACK('F', "freq", &record.opts, "freq or 'max'",
3276 		     "profile at this frequency",
3277 		      record__parse_freq),
3278 	OPT_CALLBACK('m', "mmap-pages", &record.opts, "pages[,pages]",
3279 		     "number of mmap data pages and AUX area tracing mmap pages",
3280 		     record__parse_mmap_pages),
3281 	OPT_CALLBACK(0, "mmap-flush", &record.opts, "number",
3282 		     "Minimal number of bytes that is extracted from mmap data pages (default: 1)",
3283 		     record__mmap_flush_parse),
3284 	OPT_BOOLEAN(0, "group", &record.opts.group,
3285 		    "put the counters into a counter group"),
3286 	OPT_CALLBACK_NOOPT('g', NULL, &callchain_param,
3287 			   NULL, "enables call-graph recording" ,
3288 			   &record_callchain_opt),
3289 	OPT_CALLBACK(0, "call-graph", &record.opts,
3290 		     "record_mode[,record_size]", record_callchain_help,
3291 		     &record_parse_callchain_opt),
3292 	OPT_INCR('v', "verbose", &verbose,
3293 		    "be more verbose (show counter open errors, etc)"),
3294 	OPT_BOOLEAN('q', "quiet", &quiet, "don't print any message"),
3295 	OPT_BOOLEAN('s', "stat", &record.opts.inherit_stat,
3296 		    "per thread counts"),
3297 	OPT_BOOLEAN('d', "data", &record.opts.sample_address, "Record the sample addresses"),
3298 	OPT_BOOLEAN(0, "phys-data", &record.opts.sample_phys_addr,
3299 		    "Record the sample physical addresses"),
3300 	OPT_BOOLEAN(0, "data-page-size", &record.opts.sample_data_page_size,
3301 		    "Record the sampled data address data page size"),
3302 	OPT_BOOLEAN(0, "code-page-size", &record.opts.sample_code_page_size,
3303 		    "Record the sampled code address (ip) page size"),
3304 	OPT_BOOLEAN(0, "sample-cpu", &record.opts.sample_cpu, "Record the sample cpu"),
3305 	OPT_BOOLEAN(0, "sample-identifier", &record.opts.sample_identifier,
3306 		    "Record the sample identifier"),
3307 	OPT_BOOLEAN_SET('T', "timestamp", &record.opts.sample_time,
3308 			&record.opts.sample_time_set,
3309 			"Record the sample timestamps"),
3310 	OPT_BOOLEAN_SET('P', "period", &record.opts.period, &record.opts.period_set,
3311 			"Record the sample period"),
3312 	OPT_BOOLEAN('n', "no-samples", &record.opts.no_samples,
3313 		    "don't sample"),
3314 	OPT_BOOLEAN_SET('N', "no-buildid-cache", &record.no_buildid_cache,
3315 			&record.no_buildid_cache_set,
3316 			"do not update the buildid cache"),
3317 	OPT_BOOLEAN_SET('B', "no-buildid", &record.no_buildid,
3318 			&record.no_buildid_set,
3319 			"do not collect buildids in perf.data"),
3320 	OPT_CALLBACK('G', "cgroup", &record.evlist, "name",
3321 		     "monitor event in cgroup name only",
3322 		     parse_cgroups),
3323 	OPT_INTEGER('D', "delay", &record.opts.initial_delay,
3324 		  "ms to wait before starting measurement after program start (-1: start with events disabled)"),
3325 	OPT_BOOLEAN(0, "kcore", &record.opts.kcore, "copy /proc/kcore"),
3326 	OPT_STRING('u', "uid", &record.opts.target.uid_str, "user",
3327 		   "user to profile"),
3328 
3329 	OPT_CALLBACK_NOOPT('b', "branch-any", &record.opts.branch_stack,
3330 		     "branch any", "sample any taken branches",
3331 		     parse_branch_stack),
3332 
3333 	OPT_CALLBACK('j', "branch-filter", &record.opts.branch_stack,
3334 		     "branch filter mask", "branch stack filter modes",
3335 		     parse_branch_stack),
3336 	OPT_BOOLEAN('W', "weight", &record.opts.sample_weight,
3337 		    "sample by weight (on special events only)"),
3338 	OPT_BOOLEAN(0, "transaction", &record.opts.sample_transaction,
3339 		    "sample transaction flags (special events only)"),
3340 	OPT_BOOLEAN(0, "per-thread", &record.opts.target.per_thread,
3341 		    "use per-thread mmaps"),
3342 	OPT_CALLBACK_OPTARG('I', "intr-regs", &record.opts.sample_intr_regs, NULL, "any register",
3343 		    "sample selected machine registers on interrupt,"
3344 		    " use '-I?' to list register names", parse_intr_regs),
3345 	OPT_CALLBACK_OPTARG(0, "user-regs", &record.opts.sample_user_regs, NULL, "any register",
3346 		    "sample selected machine registers on interrupt,"
3347 		    " use '--user-regs=?' to list register names", parse_user_regs),
3348 	OPT_BOOLEAN(0, "running-time", &record.opts.running_time,
3349 		    "Record running/enabled time of read (:S) events"),
3350 	OPT_CALLBACK('k', "clockid", &record.opts,
3351 	"clockid", "clockid to use for events, see clock_gettime()",
3352 	parse_clockid),
3353 	OPT_STRING_OPTARG('S', "snapshot", &record.opts.auxtrace_snapshot_opts,
3354 			  "opts", "AUX area tracing Snapshot Mode", ""),
3355 	OPT_STRING_OPTARG(0, "aux-sample", &record.opts.auxtrace_sample_opts,
3356 			  "opts", "sample AUX area", ""),
3357 	OPT_UINTEGER(0, "proc-map-timeout", &proc_map_timeout,
3358 			"per thread proc mmap processing timeout in ms"),
3359 	OPT_BOOLEAN(0, "namespaces", &record.opts.record_namespaces,
3360 		    "Record namespaces events"),
3361 	OPT_BOOLEAN(0, "all-cgroups", &record.opts.record_cgroup,
3362 		    "Record cgroup events"),
3363 	OPT_BOOLEAN_SET(0, "switch-events", &record.opts.record_switch_events,
3364 			&record.opts.record_switch_events_set,
3365 			"Record context switch events"),
3366 	OPT_BOOLEAN_FLAG(0, "all-kernel", &record.opts.all_kernel,
3367 			 "Configure all used events to run in kernel space.",
3368 			 PARSE_OPT_EXCLUSIVE),
3369 	OPT_BOOLEAN_FLAG(0, "all-user", &record.opts.all_user,
3370 			 "Configure all used events to run in user space.",
3371 			 PARSE_OPT_EXCLUSIVE),
3372 	OPT_BOOLEAN(0, "kernel-callchains", &record.opts.kernel_callchains,
3373 		    "collect kernel callchains"),
3374 	OPT_BOOLEAN(0, "user-callchains", &record.opts.user_callchains,
3375 		    "collect user callchains"),
3376 	OPT_STRING(0, "clang-path", &llvm_param.clang_path, "clang path",
3377 		   "clang binary to use for compiling BPF scriptlets"),
3378 	OPT_STRING(0, "clang-opt", &llvm_param.clang_opt, "clang options",
3379 		   "options passed to clang when compiling BPF scriptlets"),
3380 	OPT_STRING(0, "vmlinux", &symbol_conf.vmlinux_name,
3381 		   "file", "vmlinux pathname"),
3382 	OPT_BOOLEAN(0, "buildid-all", &record.buildid_all,
3383 		    "Record build-id of all DSOs regardless of hits"),
3384 	OPT_BOOLEAN(0, "buildid-mmap", &record.buildid_mmap,
3385 		    "Record build-id in map events"),
3386 	OPT_BOOLEAN(0, "timestamp-filename", &record.timestamp_filename,
3387 		    "append timestamp to output filename"),
3388 	OPT_BOOLEAN(0, "timestamp-boundary", &record.timestamp_boundary,
3389 		    "Record timestamp boundary (time of first/last samples)"),
3390 	OPT_STRING_OPTARG_SET(0, "switch-output", &record.switch_output.str,
3391 			  &record.switch_output.set, "signal or size[BKMG] or time[smhd]",
3392 			  "Switch output when receiving SIGUSR2 (signal) or cross a size or time threshold",
3393 			  "signal"),
3394 	OPT_CALLBACK_SET(0, "switch-output-event", &record.sb_evlist, &record.switch_output_event_set, "switch output event",
3395 			 "switch output event selector. use 'perf list' to list available events",
3396 			 parse_events_option_new_evlist),
3397 	OPT_INTEGER(0, "switch-max-files", &record.switch_output.num_files,
3398 		   "Limit number of switch output generated files"),
3399 	OPT_BOOLEAN(0, "dry-run", &dry_run,
3400 		    "Parse options then exit"),
3401 #ifdef HAVE_AIO_SUPPORT
3402 	OPT_CALLBACK_OPTARG(0, "aio", &record.opts,
3403 		     &nr_cblocks_default, "n", "Use <n> control blocks in asynchronous trace writing mode (default: 1, max: 4)",
3404 		     record__aio_parse),
3405 #endif
3406 	OPT_CALLBACK(0, "affinity", &record.opts, "node|cpu",
3407 		     "Set affinity mask of trace reading thread to NUMA node cpu mask or cpu of processed mmap buffer",
3408 		     record__parse_affinity),
3409 #ifdef HAVE_ZSTD_SUPPORT
3410 	OPT_CALLBACK_OPTARG('z', "compression-level", &record.opts, &comp_level_default, "n",
3411 			    "Compress records using specified level (default: 1 - fastest compression, 22 - greatest compression)",
3412 			    record__parse_comp_level),
3413 #endif
3414 	OPT_CALLBACK(0, "max-size", &record.output_max_size,
3415 		     "size", "Limit the maximum size of the output file", parse_output_max_size),
3416 	OPT_UINTEGER(0, "num-thread-synthesize",
3417 		     &record.opts.nr_threads_synthesize,
3418 		     "number of threads to run for event synthesis"),
3419 #ifdef HAVE_LIBPFM
3420 	OPT_CALLBACK(0, "pfm-events", &record.evlist, "event",
3421 		"libpfm4 event selector. use 'perf list' to list available events",
3422 		parse_libpfm_events_option),
3423 #endif
3424 	OPT_CALLBACK(0, "control", &record.opts, "fd:ctl-fd[,ack-fd] or fifo:ctl-fifo[,ack-fifo]",
3425 		     "Listen on ctl-fd descriptor for command to control measurement ('enable': enable events, 'disable': disable events,\n"
3426 		     "\t\t\t  'snapshot': AUX area tracing snapshot).\n"
3427 		     "\t\t\t  Optionally send control command completion ('ack\\n') to ack-fd descriptor.\n"
3428 		     "\t\t\t  Alternatively, ctl-fifo / ack-fifo will be opened and used as ctl-fd / ack-fd.",
3429 		      parse_control_option),
3430 	OPT_CALLBACK(0, "synth", &record.opts, "no|all|task|mmap|cgroup",
3431 		     "Fine-tune event synthesis: default=all", parse_record_synth_option),
3432 	OPT_STRING_OPTARG_SET(0, "debuginfod", &record.debuginfod.urls,
3433 			  &record.debuginfod.set, "debuginfod urls",
3434 			  "Enable debuginfod data retrieval from DEBUGINFOD_URLS or specified urls",
3435 			  "system"),
3436 	OPT_CALLBACK_OPTARG(0, "threads", &record.opts, NULL, "spec",
3437 			    "write collected trace data into several data files using parallel threads",
3438 			    record__parse_threads),
3439 	OPT_BOOLEAN(0, "off-cpu", &record.off_cpu, "Enable off-cpu analysis"),
3440 	OPT_END()
3441 };
3442 
3443 struct option *record_options = __record_options;
3444 
3445 static int record__mmap_cpu_mask_init(struct mmap_cpu_mask *mask, struct perf_cpu_map *cpus)
3446 {
3447 	struct perf_cpu cpu;
3448 	int idx;
3449 
3450 	if (cpu_map__is_dummy(cpus))
3451 		return 0;
3452 
3453 	perf_cpu_map__for_each_cpu(cpu, idx, cpus) {
3454 		if (cpu.cpu == -1)
3455 			continue;
3456 		/* Return ENODEV if input cpu is greater than max cpu */
3457 		if ((unsigned long)cpu.cpu > mask->nbits)
3458 			return -ENODEV;
3459 		set_bit(cpu.cpu, mask->bits);
3460 	}
3461 
3462 	return 0;
3463 }
3464 
3465 static int record__mmap_cpu_mask_init_spec(struct mmap_cpu_mask *mask, const char *mask_spec)
3466 {
3467 	struct perf_cpu_map *cpus;
3468 
3469 	cpus = perf_cpu_map__new(mask_spec);
3470 	if (!cpus)
3471 		return -ENOMEM;
3472 
3473 	bitmap_zero(mask->bits, mask->nbits);
3474 	if (record__mmap_cpu_mask_init(mask, cpus)) {
3475 		perf_cpu_map__put(cpus);
3476 		return -ENODEV;
3477 	}
3478 	perf_cpu_map__put(cpus);
3479 	return 0;
3480 }
3481 
3482 static void record__free_thread_masks(struct record *rec, int nr_threads)
3483 {
3484 	int t;
3485 
3486 	if (rec->thread_masks)
3487 		for (t = 0; t < nr_threads; t++)
3488 			record__thread_mask_free(&rec->thread_masks[t]);
3489 
3490 	zfree(&rec->thread_masks);
3491 }
3492 
3493 static int record__alloc_thread_masks(struct record *rec, int nr_threads, int nr_bits)
3494 {
3495 	int t, ret;
3496 
3497 	rec->thread_masks = zalloc(nr_threads * sizeof(*(rec->thread_masks)));
3498 	if (!rec->thread_masks) {
3499 		pr_err("Failed to allocate thread masks\n");
3500 		return -ENOMEM;
3501 	}
3502 
3503 	for (t = 0; t < nr_threads; t++) {
3504 		ret = record__thread_mask_alloc(&rec->thread_masks[t], nr_bits);
3505 		if (ret) {
3506 			pr_err("Failed to allocate thread masks[%d]\n", t);
3507 			goto out_free;
3508 		}
3509 	}
3510 
3511 	return 0;
3512 
3513 out_free:
3514 	record__free_thread_masks(rec, nr_threads);
3515 
3516 	return ret;
3517 }
3518 
3519 static int record__init_thread_cpu_masks(struct record *rec, struct perf_cpu_map *cpus)
3520 {
3521 	int t, ret, nr_cpus = perf_cpu_map__nr(cpus);
3522 
3523 	ret = record__alloc_thread_masks(rec, nr_cpus, cpu__max_cpu().cpu);
3524 	if (ret)
3525 		return ret;
3526 
3527 	rec->nr_threads = nr_cpus;
3528 	pr_debug("nr_threads: %d\n", rec->nr_threads);
3529 
3530 	for (t = 0; t < rec->nr_threads; t++) {
3531 		set_bit(perf_cpu_map__cpu(cpus, t).cpu, rec->thread_masks[t].maps.bits);
3532 		set_bit(perf_cpu_map__cpu(cpus, t).cpu, rec->thread_masks[t].affinity.bits);
3533 		if (verbose) {
3534 			pr_debug("thread_masks[%d]: ", t);
3535 			mmap_cpu_mask__scnprintf(&rec->thread_masks[t].maps, "maps");
3536 			pr_debug("thread_masks[%d]: ", t);
3537 			mmap_cpu_mask__scnprintf(&rec->thread_masks[t].affinity, "affinity");
3538 		}
3539 	}
3540 
3541 	return 0;
3542 }
3543 
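/*
 * Build one thread_mask per user supplied spec: each maps/affinity CPU list
 * is intersected with the set of monitored CPUs, and empty or mutually
 * intersecting masks are rejected.
 */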
3544 static int record__init_thread_masks_spec(struct record *rec, struct perf_cpu_map *cpus,
3545 					  const char **maps_spec, const char **affinity_spec,
3546 					  u32 nr_spec)
3547 {
3548 	u32 s;
3549 	int ret = 0, t = 0;
3550 	struct mmap_cpu_mask cpus_mask;
3551 	struct thread_mask thread_mask, full_mask, *thread_masks;
3552 
3553 	ret = record__mmap_cpu_mask_alloc(&cpus_mask, cpu__max_cpu().cpu);
3554 	if (ret) {
3555 		pr_err("Failed to allocate CPUs mask\n");
3556 		return ret;
3557 	}
3558 
3559 	ret = record__mmap_cpu_mask_init(&cpus_mask, cpus);
3560 	if (ret) {
3561 		pr_err("Failed to init cpu mask\n");
3562 		goto out_free_cpu_mask;
3563 	}
3564 
3565 	ret = record__thread_mask_alloc(&full_mask, cpu__max_cpu().cpu);
3566 	if (ret) {
3567 		pr_err("Failed to allocate full mask\n");
3568 		goto out_free_cpu_mask;
3569 	}
3570 
3571 	ret = record__thread_mask_alloc(&thread_mask, cpu__max_cpu().cpu);
3572 	if (ret) {
3573 		pr_err("Failed to allocate thread mask\n");
3574 		goto out_free_full_and_cpu_masks;
3575 	}
3576 
3577 	for (s = 0; s < nr_spec; s++) {
3578 		ret = record__mmap_cpu_mask_init_spec(&thread_mask.maps, maps_spec[s]);
3579 		if (ret) {
3580 			pr_err("Failed to initialize maps thread mask\n");
3581 			goto out_free;
3582 		}
3583 		ret = record__mmap_cpu_mask_init_spec(&thread_mask.affinity, affinity_spec[s]);
3584 		if (ret) {
3585 			pr_err("Failed to initialize affinity thread mask\n");
3586 			goto out_free;
3587 		}
3588 
3589 		/* ignore invalid CPUs but do not allow empty masks */
3590 		if (!bitmap_and(thread_mask.maps.bits, thread_mask.maps.bits,
3591 				cpus_mask.bits, thread_mask.maps.nbits)) {
3592 			pr_err("Empty maps mask: %s\n", maps_spec[s]);
3593 			ret = -EINVAL;
3594 			goto out_free;
3595 		}
3596 		if (!bitmap_and(thread_mask.affinity.bits, thread_mask.affinity.bits,
3597 				cpus_mask.bits, thread_mask.affinity.nbits)) {
3598 			pr_err("Empty affinity mask: %s\n", affinity_spec[s]);
3599 			ret = -EINVAL;
3600 			goto out_free;
3601 		}
3602 
3603 		/* do not allow intersection with other masks (full_mask) */
3604 		if (bitmap_intersects(thread_mask.maps.bits, full_mask.maps.bits,
3605 				      thread_mask.maps.nbits)) {
3606 			pr_err("Intersecting maps mask: %s\n", maps_spec[s]);
3607 			ret = -EINVAL;
3608 			goto out_free;
3609 		}
3610 		if (bitmap_intersects(thread_mask.affinity.bits, full_mask.affinity.bits,
3611 				      thread_mask.affinity.nbits)) {
3612 			pr_err("Intersecting affinity mask: %s\n", affinity_spec[s]);
3613 			ret = -EINVAL;
3614 			goto out_free;
3615 		}
3616 
3617 		bitmap_or(full_mask.maps.bits, full_mask.maps.bits,
3618 			  thread_mask.maps.bits, full_mask.maps.nbits);
3619 		bitmap_or(full_mask.affinity.bits, full_mask.affinity.bits,
3620 			  thread_mask.affinity.bits, full_mask.affinity.nbits);
3621 
3622 		thread_masks = realloc(rec->thread_masks, (t + 1) * sizeof(struct thread_mask));
3623 		if (!thread_masks) {
3624 			pr_err("Failed to reallocate thread masks\n");
3625 			ret = -ENOMEM;
3626 			goto out_free;
3627 		}
3628 		rec->thread_masks = thread_masks;
3629 		rec->thread_masks[t] = thread_mask;
3630 		if (verbose) {
3631 			pr_debug("thread_masks[%d]: ", t);
3632 			mmap_cpu_mask__scnprintf(&rec->thread_masks[t].maps, "maps");
3633 			pr_debug("thread_masks[%d]: ", t);
3634 			mmap_cpu_mask__scnprintf(&rec->thread_masks[t].affinity, "affinity");
3635 		}
3636 		t++;
3637 		ret = record__thread_mask_alloc(&thread_mask, cpu__max_cpu().cpu);
3638 		if (ret) {
3639 			pr_err("Failed to allocate thread mask\n");
3640 			goto out_free_full_and_cpu_masks;
3641 		}
3642 	}
3643 	rec->nr_threads = t;
3644 	pr_debug("nr_threads: %d\n", rec->nr_threads);
3645 	if (!rec->nr_threads)
3646 		ret = -EINVAL;
3647 
3648 out_free:
3649 	record__thread_mask_free(&thread_mask);
3650 out_free_full_and_cpu_masks:
3651 	record__thread_mask_free(&full_mask);
3652 out_free_cpu_mask:
3653 	record__mmap_cpu_mask_free(&cpus_mask);
3654 
3655 	return ret;
3656 }
3657 
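/*
 * Thread spec 'core': one streaming thread per core, using the core CPU
 * lists from the CPU topology as both maps and affinity specs.
 */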
3658 static int record__init_thread_core_masks(struct record *rec, struct perf_cpu_map *cpus)
3659 {
3660 	int ret;
3661 	struct cpu_topology *topo;
3662 
3663 	topo = cpu_topology__new();
3664 	if (!topo) {
3665 		pr_err("Failed to allocate CPU topology\n");
3666 		return -ENOMEM;
3667 	}
3668 
3669 	ret = record__init_thread_masks_spec(rec, cpus, topo->core_cpus_list,
3670 					     topo->core_cpus_list, topo->core_cpus_lists);
3671 	cpu_topology__delete(topo);
3672 
3673 	return ret;
3674 }
3675 
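/*
 * Thread spec 'package': one streaming thread per processor package
 * (socket), using the package CPU lists from the CPU topology.
 */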
3676 static int record__init_thread_package_masks(struct record *rec, struct perf_cpu_map *cpus)
3677 {
3678 	int ret;
3679 	struct cpu_topology *topo;
3680 
3681 	topo = cpu_topology__new();
3682 	if (!topo) {
3683 		pr_err("Failed to allocate CPU topology\n");
3684 		return -ENOMEM;
3685 	}
3686 
3687 	ret = record__init_thread_masks_spec(rec, cpus, topo->package_cpus_list,
3688 					     topo->package_cpus_list, topo->package_cpus_lists);
3689 	cpu_topology__delete(topo);
3690 
3691 	return ret;
3692 }
3693 
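/*
 * Thread spec 'numa': one streaming thread per NUMA node, using the node
 * CPU lists from the NUMA topology as both maps and affinity specs.
 */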
3694 static int record__init_thread_numa_masks(struct record *rec, struct perf_cpu_map *cpus)
3695 {
3696 	u32 s;
3697 	int ret;
3698 	const char **spec;
3699 	struct numa_topology *topo;
3700 
3701 	topo = numa_topology__new();
3702 	if (!topo) {
3703 		pr_err("Failed to allocate NUMA topology\n");
3704 		return -ENOMEM;
3705 	}
3706 
3707 	spec = zalloc(topo->nr * sizeof(char *));
3708 	if (!spec) {
3709 		pr_err("Failed to allocate NUMA spec\n");
3710 		ret = -ENOMEM;
3711 		goto out_delete_topo;
3712 	}
3713 	for (s = 0; s < topo->nr; s++)
3714 		spec[s] = topo->nodes[s].cpus;
3715 
3716 	ret = record__init_thread_masks_spec(rec, cpus, spec, spec, topo->nr);
3717 
3718 	zfree(&spec);
3719 
3720 out_delete_topo:
3721 	numa_topology__delete(topo);
3722 
3723 	return ret;
3724 }
3725 
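/*
 * User defined thread spec: parse a string of the form
 * <maps cpus>/<affinity cpus>[:<maps cpus>/<affinity cpus>...], for
 * example (illustrative) "0-3/0:4-7/4", into per-thread maps and affinity
 * CPU list strings and hand them to record__init_thread_masks_spec().
 */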
3726 static int record__init_thread_user_masks(struct record *rec, struct perf_cpu_map *cpus)
3727 {
3728 	int t, ret;
3729 	u32 s, nr_spec = 0;
3730 	char **maps_spec = NULL, **affinity_spec = NULL, **tmp_spec;
3731 	char *user_spec, *spec, *spec_ptr, *mask, *mask_ptr, *dup_mask = NULL;
3732 
3733 	for (t = 0, user_spec = (char *)rec->opts.threads_user_spec; ; t++, user_spec = NULL) {
3734 		spec = strtok_r(user_spec, ":", &spec_ptr);
3735 		if (spec == NULL)
3736 			break;
3737 		pr_debug2("threads_spec[%d]: %s\n", t, spec);
3738 		mask = strtok_r(spec, "/", &mask_ptr);
3739 		if (mask == NULL)
3740 			break;
3741 		pr_debug2("  maps mask: %s\n", mask);
3742 		tmp_spec = realloc(maps_spec, (nr_spec + 1) * sizeof(char *));
3743 		if (!tmp_spec) {
3744 			pr_err("Failed to reallocate maps spec\n");
3745 			ret = -ENOMEM;
3746 			goto out_free;
3747 		}
3748 		maps_spec = tmp_spec;
3749 		maps_spec[nr_spec] = dup_mask = strdup(mask);
3750 		if (!maps_spec[nr_spec]) {
3751 			pr_err("Failed to allocate maps spec[%d]\n", nr_spec);
3752 			ret = -ENOMEM;
3753 			goto out_free;
3754 		}
3755 		mask = strtok_r(NULL, "/", &mask_ptr);
3756 		if (mask == NULL) {
3757 			pr_err("Invalid thread maps or affinity specs\n");
3758 			ret = -EINVAL;
3759 			goto out_free;
3760 		}
3761 		pr_debug2("  affinity mask: %s\n", mask);
3762 		tmp_spec = realloc(affinity_spec, (nr_spec + 1) * sizeof(char *));
3763 		if (!tmp_spec) {
3764 			pr_err("Failed to reallocate affinity spec\n");
3765 			ret = -ENOMEM;
3766 			goto out_free;
3767 		}
3768 		affinity_spec = tmp_spec;
3769 		affinity_spec[nr_spec] = strdup(mask);
3770 		if (!affinity_spec[nr_spec]) {
3771 			pr_err("Failed to allocate affinity spec[%d]\n", nr_spec);
3772 			ret = -ENOMEM;
3773 			goto out_free;
3774 		}
3775 		dup_mask = NULL;
3776 		nr_spec++;
3777 	}
3778 
3779 	ret = record__init_thread_masks_spec(rec, cpus, (const char **)maps_spec,
3780 					     (const char **)affinity_spec, nr_spec);
3781 
3782 out_free:
3783 	free(dup_mask);
3784 	for (s = 0; s < nr_spec; s++) {
3785 		if (maps_spec)
3786 			free(maps_spec[s]);
3787 		if (affinity_spec)
3788 			free(affinity_spec[s]);
3789 	}
3790 	free(affinity_spec);
3791 	free(maps_spec);
3792 
3793 	return ret;
3794 }
3795 
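/*
 * Default when parallel streaming is not enabled: a single thread whose
 * maps mask covers all evlist CPUs, with the affinity mask left all
 * zeroes.
 */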
3796 static int record__init_thread_default_masks(struct record *rec, struct perf_cpu_map *cpus)
3797 {
3798 	int ret;
3799 
3800 	ret = record__alloc_thread_masks(rec, 1, cpu__max_cpu().cpu);
3801 	if (ret)
3802 		return ret;
3803 
3804 	if (record__mmap_cpu_mask_init(&rec->thread_masks->maps, cpus))
3805 		return -ENODEV;
3806 
3807 	rec->nr_threads = 1;
3808 
3809 	return 0;
3810 }
3811 
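/*
 * Set up rec->thread_masks according to the selected thread spec.  With
 * parallel streaming disabled a single default mask is used; parallel
 * streaming is not compatible with --per-thread event mmaps.
 */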
3812 static int record__init_thread_masks(struct record *rec)
3813 {
3814 	int ret = 0;
3815 	struct perf_cpu_map *cpus = rec->evlist->core.all_cpus;
3816 
3817 	if (!record__threads_enabled(rec))
3818 		return record__init_thread_default_masks(rec, cpus);
3819 
3820 	if (evlist__per_thread(rec->evlist)) {
3821 		pr_err("--per-thread option is mutually exclusive with parallel streaming mode.\n");
3822 		return -EINVAL;
3823 	}
3824 
3825 	switch (rec->opts.threads_spec) {
3826 	case THREAD_SPEC__CPU:
3827 		ret = record__init_thread_cpu_masks(rec, cpus);
3828 		break;
3829 	case THREAD_SPEC__CORE:
3830 		ret = record__init_thread_core_masks(rec, cpus);
3831 		break;
3832 	case THREAD_SPEC__PACKAGE:
3833 		ret = record__init_thread_package_masks(rec, cpus);
3834 		break;
3835 	case THREAD_SPEC__NUMA:
3836 		ret = record__init_thread_numa_masks(rec, cpus);
3837 		break;
3838 	case THREAD_SPEC__USER:
3839 		ret = record__init_thread_user_masks(rec, cpus);
3840 		break;
3841 	default:
3842 		break;
3843 	}
3844 
3845 	return ret;
3846 }
3847 
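/*
 * Entry point of 'perf record': parse the options, validate the target
 * and event list, configure optional features (build ids, auxtrace,
 * off-cpu, parallel streaming masks, ...) and then run __cmd_record().
 */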
3848 int cmd_record(int argc, const char **argv)
3849 {
3850 	int err;
3851 	struct record *rec = &record;
3852 	char errbuf[BUFSIZ];
3853 
3854 	setlocale(LC_ALL, "");
3855 
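	/*
	 * Options whose build-time dependencies are missing from this binary
	 * are marked as "not built" below, together with the build flag that
	 * controls them.
	 */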
3856 #ifndef HAVE_LIBBPF_SUPPORT
3857 # define set_nobuild(s, l, c) set_option_nobuild(record_options, s, l, "NO_LIBBPF=1", c)
3858 	set_nobuild('\0', "clang-path", true);
3859 	set_nobuild('\0', "clang-opt", true);
3860 # undef set_nobuild
3861 #endif
3862 
3863 #ifndef HAVE_BPF_PROLOGUE
3864 # if !defined (HAVE_DWARF_SUPPORT)
3865 #  define REASON  "NO_DWARF=1"
3866 # elif !defined (HAVE_LIBBPF_SUPPORT)
3867 #  define REASON  "NO_LIBBPF=1"
3868 # else
3869 #  define REASON  "this architecture doesn't support BPF prologue"
3870 # endif
3871 # define set_nobuild(s, l, c) set_option_nobuild(record_options, s, l, REASON, c)
3872 	set_nobuild('\0', "vmlinux", true);
3873 # undef set_nobuild
3874 # undef REASON
3875 #endif
3876 
3877 #ifndef HAVE_BPF_SKEL
3878 # define set_nobuild(s, l, m, c) set_option_nobuild(record_options, s, l, m, c)
3879 	set_nobuild('\0', "off-cpu", "no BUILD_BPF_SKEL=1", true);
3880 # undef set_nobuild
3881 #endif
3882 
3883 	rec->opts.affinity = PERF_AFFINITY_SYS;
3884 
3885 	rec->evlist = evlist__new();
3886 	if (rec->evlist == NULL)
3887 		return -ENOMEM;
3888 
3889 	err = perf_config(perf_record_config, rec);
3890 	if (err)
3891 		return err;
3892 
3893 	argc = parse_options(argc, argv, record_options, record_usage,
3894 			    PARSE_OPT_STOP_AT_NON_OPTION);
3895 	if (quiet)
3896 		perf_quiet_option();
3897 
3898 	err = symbol__validate_sym_arguments();
3899 	if (err)
3900 		return err;
3901 
3902 	perf_debuginfod_setup(&record.debuginfod);
3903 
3904 	/* Make system wide (-a) the default target. */
3905 	if (!argc && target__none(&rec->opts.target))
3906 		rec->opts.target.system_wide = true;
3907 
3908 	if (nr_cgroups && !rec->opts.target.system_wide) {
3909 		usage_with_options_msg(record_usage, record_options,
3910 			"cgroup monitoring only available in system-wide mode");
3911 
3912 	}
3913 
3914 	if (rec->buildid_mmap) {
3915 		if (!perf_can_record_build_id()) {
3916 			pr_err("Failed: no support to record build id in mmap events, update your kernel.\n");
3917 			err = -EINVAL;
3918 			goto out_opts;
3919 		}
3920 		pr_debug("Enabling build id in mmap2 events.\n");
3921 		/* Enable mmap build id synthesizing. */
3922 		symbol_conf.buildid_mmap2 = true;
3923 		/* Enable perf_event_attr::build_id bit. */
3924 		rec->opts.build_id = true;
3925 		/* Disable build id cache. */
3926 		rec->no_buildid = true;
3927 	}
3928 
3929 	if (rec->opts.record_cgroup && !perf_can_record_cgroup()) {
3930 		pr_err("Kernel has no cgroup sampling support.\n");
3931 		err = -EINVAL;
3932 		goto out_opts;
3933 	}
3934 
3935 	if (rec->opts.kcore)
3936 		rec->opts.text_poke = true;
3937 
3938 	if (rec->opts.kcore || record__threads_enabled(rec))
3939 		rec->data.is_dir = true;
3940 
3941 	if (record__threads_enabled(rec)) {
3942 		if (rec->opts.affinity != PERF_AFFINITY_SYS) {
3943 			pr_err("--affinity option is mutually exclusive with parallel streaming mode.\n");
			err = -EINVAL;
3944 			goto out_opts;
3945 		}
3946 		if (record__aio_enabled(rec)) {
3947 			pr_err("Asynchronous streaming mode (--aio) is mutually exclusive with parallel streaming mode.\n");
			err = -EINVAL;
3948 			goto out_opts;
3949 		}
3950 	}
3951 
3952 	if (rec->opts.comp_level != 0) {
3953 		pr_debug("Compression enabled, disabling build id collection at the end of the session.\n");
3954 		rec->no_buildid = true;
3955 	}
3956 
3957 	if (rec->opts.record_switch_events &&
3958 	    !perf_can_record_switch_events()) {
3959 		ui__error("kernel does not support recording context switch events\n");
3960 		parse_options_usage(record_usage, record_options, "switch-events", 0);
3961 		err = -EINVAL;
3962 		goto out_opts;
3963 	}
3964 
3965 	if (switch_output_setup(rec)) {
3966 		parse_options_usage(record_usage, record_options, "switch-output", 0);
3967 		err = -EINVAL;
3968 		goto out_opts;
3969 	}
3970 
3971 	if (rec->switch_output.time) {
3972 		signal(SIGALRM, alarm_sig_handler);
3973 		alarm(rec->switch_output.time);
3974 	}
3975 
3976 	if (rec->switch_output.num_files) {
3977 		rec->switch_output.filenames = calloc(rec->switch_output.num_files,
3978 						      sizeof(char *));
3979 		if (!rec->switch_output.filenames) {
3980 			err = -EINVAL;
3981 			goto out_opts;
3982 		}
3983 	}
3984 
3985 	if (rec->timestamp_filename && record__threads_enabled(rec)) {
3986 		rec->timestamp_filename = false;
3987 		pr_warning("WARNING: --timestamp-filename option is not available in parallel streaming mode.\n");
3988 	}
3989 
3990 	/*
3991 	 * Allow aliases to facilitate the lookup of symbols for address
3992 	 * filters. Refer to auxtrace_parse_filters().
3993 	 */
3994 	symbol_conf.allow_aliases = true;
3995 
3996 	symbol__init(NULL);
3997 
3998 	err = record__auxtrace_init(rec);
3999 	if (err)
4000 		goto out;
4001 
4002 	if (dry_run)
4003 		goto out;
4004 
4005 	err = bpf__setup_stdout(rec->evlist);
4006 	if (err) {
4007 		bpf__strerror_setup_stdout(rec->evlist, err, errbuf, sizeof(errbuf));
4008 		pr_err("ERROR: Setup BPF stdout failed: %s\n",
4009 		       errbuf);
4010 		goto out;
4011 	}
4012 
4013 	err = -ENOMEM;
4014 
4015 	if (rec->no_buildid_cache || rec->no_buildid) {
4016 		disable_buildid_cache();
4017 	} else if (rec->switch_output.enabled) {
4018 		/*
4019 		 * In 'perf record --switch-output', disable buildid
4020 		 * generation by default to reduce data file switching
4021 		 * overhead. Build ids are still generated when explicitly
4022 		 * requested with:
4023 		 *
4024 		 *  perf record --switch-output --no-no-buildid \
4025 		 *              --no-no-buildid-cache
4026 		 *
4027 		 * The code below is equivalent to:
4028 		 *
4029 		 * if ((rec->no_buildid || !rec->no_buildid_set) &&
4030 		 *     (rec->no_buildid_cache || !rec->no_buildid_cache_set))
4031 		 *         disable_buildid_cache();
4032 		 */
4033 		bool disable = true;
4034 
4035 		if (rec->no_buildid_set && !rec->no_buildid)
4036 			disable = false;
4037 		if (rec->no_buildid_cache_set && !rec->no_buildid_cache)
4038 			disable = false;
4039 		if (disable) {
4040 			rec->no_buildid = true;
4041 			rec->no_buildid_cache = true;
4042 			disable_buildid_cache();
4043 		}
4044 	}
4045 
4046 	if (record.opts.overwrite)
4047 		record.opts.tail_synthesize = true;
4048 
4049 	if (rec->evlist->core.nr_entries == 0) {
4050 		if (perf_pmu__has_hybrid()) {
4051 			err = evlist__add_default_hybrid(rec->evlist,
4052 							 !record.opts.no_samples);
4053 		} else {
4054 			err = __evlist__add_default(rec->evlist,
4055 						    !record.opts.no_samples);
4056 		}
4057 
4058 		if (err < 0) {
4059 			pr_err("Not enough memory for event selector list\n");
4060 			goto out;
4061 		}
4062 	}
4063 
4064 	if (rec->opts.target.tid && !rec->opts.no_inherit_set)
4065 		rec->opts.no_inherit = true;
4066 
4067 	err = target__validate(&rec->opts.target);
4068 	if (err) {
4069 		target__strerror(&rec->opts.target, err, errbuf, BUFSIZ);
4070 		ui__warning("%s\n", errbuf);
4071 	}
4072 
4073 	err = target__parse_uid(&rec->opts.target);
4074 	if (err) {
4075 		int saved_errno = errno;
4076 
4077 		target__strerror(&rec->opts.target, err, errbuf, BUFSIZ);
4078 		ui__error("%s", errbuf);
4079 
4080 		err = -saved_errno;
4081 		goto out;
4082 	}
4083 
4084 	/* Enable ignoring missing threads when -u/-p option is defined. */
4085 	rec->opts.ignore_missing_thread = rec->opts.target.uid != UINT_MAX || rec->opts.target.pid;
4086 
4087 	if (evlist__fix_hybrid_cpus(rec->evlist, rec->opts.target.cpu_list)) {
4088 		pr_err("failed to use cpu list %s\n",
4089 		       rec->opts.target.cpu_list);
		err = -EINVAL;
4090 		goto out;
4091 	}
4092 
4093 	rec->opts.target.hybrid = perf_pmu__has_hybrid();
4094 
4095 	if (callchain_param.enabled && callchain_param.record_mode == CALLCHAIN_FP)
4096 		arch__add_leaf_frame_record_opts(&rec->opts);
4097 
4098 	err = -ENOMEM;
4099 	if (evlist__create_maps(rec->evlist, &rec->opts.target) < 0) {
4100 		if (rec->opts.target.pid != NULL) {
4101 			pr_err("Couldn't create thread/CPU maps: %s\n",
4102 				errno == ENOENT ? "No such process" : str_error_r(errno, errbuf, sizeof(errbuf)));
4103 			goto out;
4104 		} else {
4105 			usage_with_options(record_usage, record_options);
4106 		}
4107 	}
4108 
4109 	err = auxtrace_record__options(rec->itr, rec->evlist, &rec->opts);
4110 	if (err)
4111 		goto out;
4112 
4113 	/*
4114 	 * Take build ids of all DSOs when the file contains AUX area
4115 	 * tracing data, because the trace is not decoded to find the DSOs
4116 	 * that were hit (decoding would take too long).
4117 	 */
4118 	if (rec->opts.full_auxtrace)
4119 		rec->buildid_all = true;
4120 
4121 	if (rec->opts.text_poke) {
4122 		err = record__config_text_poke(rec->evlist);
4123 		if (err) {
4124 			pr_err("record__config_text_poke failed, error %d\n", err);
4125 			goto out;
4126 		}
4127 	}
4128 
4129 	if (rec->off_cpu) {
4130 		err = record__config_off_cpu(rec);
4131 		if (err) {
4132 			pr_err("record__config_off_cpu failed, error %d\n", err);
4133 			goto out;
4134 		}
4135 	}
4136 
4137 	if (record_opts__config(&rec->opts)) {
4138 		err = -EINVAL;
4139 		goto out;
4140 	}
4141 
4142 	err = record__init_thread_masks(rec);
4143 	if (err) {
4144 		pr_err("Failed to initialize parallel data streaming masks\n");
4145 		goto out;
4146 	}
4147 
4148 	if (rec->opts.nr_cblocks > nr_cblocks_max)
4149 		rec->opts.nr_cblocks = nr_cblocks_max;
4150 	pr_debug("nr_cblocks: %d\n", rec->opts.nr_cblocks);
4151 
4152 	pr_debug("affinity: %s\n", affinity_tags[rec->opts.affinity]);
4153 	pr_debug("mmap flush: %d\n", rec->opts.mmap_flush);
4154 
4155 	if (rec->opts.comp_level > comp_level_max)
4156 		rec->opts.comp_level = comp_level_max;
4157 	pr_debug("comp level: %d\n", rec->opts.comp_level);
4158 
4159 	err = __cmd_record(&record, argc, argv);
4160 out:
4161 	evlist__delete(rec->evlist);
4162 	symbol__exit();
4163 	auxtrace_record__free(rec->itr);
4164 out_opts:
4165 	record__free_thread_masks(rec, rec->nr_threads);
4166 	rec->nr_threads = 0;
4167 	evlist__close_control(rec->opts.ctl_fd, rec->opts.ctl_fd_ack, &rec->opts.ctl_fd_close);
4168 	return err;
4169 }
4170 
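/*
 * Signal handler used to take an AUX area tracing snapshot and, when
 * --switch-output is in signal mode, to request an output file switch.
 */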
4171 static void snapshot_sig_handler(int sig __maybe_unused)
4172 {
4173 	struct record *rec = &record;
4174 
4175 	hit_auxtrace_snapshot_trigger(rec);
4176 
4177 	if (switch_output_signal(rec))
4178 		trigger_hit(&switch_output_trigger);
4179 }
4180 
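/*
 * SIGALRM handler armed by a time based --switch-output (see the alarm()
 * call in cmd_record()): requests an output file switch when the timer
 * fires.
 */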
4181 static void alarm_sig_handler(int sig __maybe_unused)
4182 {
4183 	struct record *rec = &record;
4184 
4185 	if (switch_output_time(rec))
4186 		trigger_hit(&switch_output_trigger);
4187 }
4188