xref: /openbmc/linux/tools/perf/builtin-record.c (revision 68f436a80fc89faa474134edfe442d95528be17a)
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * builtin-record.c
4  *
5  * Builtin record command: Record the profile of a workload
6  * (or a CPU, or a PID) into the perf.data output file - for
7  * later analysis via perf report.
8  */
9 #include "builtin.h"
10 
11 #include "util/build-id.h"
12 #include <subcmd/parse-options.h>
13 #include <internal/xyarray.h>
14 #include "util/parse-events.h"
15 #include "util/config.h"
16 
17 #include "util/callchain.h"
18 #include "util/cgroup.h"
19 #include "util/header.h"
20 #include "util/event.h"
21 #include "util/evlist.h"
22 #include "util/evsel.h"
23 #include "util/debug.h"
24 #include "util/mmap.h"
25 #include "util/mutex.h"
26 #include "util/target.h"
27 #include "util/session.h"
28 #include "util/tool.h"
29 #include "util/symbol.h"
30 #include "util/record.h"
31 #include "util/cpumap.h"
32 #include "util/thread_map.h"
33 #include "util/data.h"
34 #include "util/perf_regs.h"
35 #include "util/auxtrace.h"
36 #include "util/tsc.h"
37 #include "util/parse-branch-options.h"
38 #include "util/parse-regs-options.h"
39 #include "util/perf_api_probe.h"
40 #include "util/llvm-utils.h"
41 #include "util/bpf-loader.h"
42 #include "util/trigger.h"
43 #include "util/perf-hooks.h"
44 #include "util/cpu-set-sched.h"
45 #include "util/synthetic-events.h"
46 #include "util/time-utils.h"
47 #include "util/units.h"
48 #include "util/bpf-event.h"
49 #include "util/util.h"
50 #include "util/pfm.h"
51 #include "util/pmu.h"
52 #include "util/pmus.h"
53 #include "util/clockid.h"
54 #include "util/off_cpu.h"
55 #include "util/bpf-filter.h"
56 #include "asm/bug.h"
57 #include "perf.h"
58 #include "cputopo.h"
59 
60 #include <errno.h>
61 #include <inttypes.h>
62 #include <locale.h>
63 #include <poll.h>
64 #include <pthread.h>
65 #include <unistd.h>
66 #ifndef HAVE_GETTID
67 #include <syscall.h>
68 #endif
69 #include <sched.h>
70 #include <signal.h>
71 #ifdef HAVE_EVENTFD_SUPPORT
72 #include <sys/eventfd.h>
73 #endif
74 #include <sys/mman.h>
75 #include <sys/wait.h>
76 #include <sys/types.h>
77 #include <sys/stat.h>
78 #include <fcntl.h>
79 #include <linux/err.h>
80 #include <linux/string.h>
81 #include <linux/time64.h>
82 #include <linux/zalloc.h>
83 #include <linux/bitmap.h>
84 #include <sys/time.h>
85 
/*
 * Switch-output state kept inside struct record.
 *
 * Only the users of @signal, @size and @time are visible nearby
 * (switch_output_signal(), switch_output_size(), switch_output_time());
 * the remaining fields are consumed elsewhere in the file.
 */
struct switch_output {
	bool enabled;
	bool signal;		/* gates switch_output_signal() */
	unsigned long size;	/* byte threshold vs. rec->bytes_written; 0 = no size check */
	unsigned long time;	/* non-zero gates switch_output_time() */
	const char *str;
	bool set;
	char **filenames;
	int num_files;
	int cur_file;
};
97 
98 struct thread_mask {
99 	struct mmap_cpu_mask	maps;
100 	struct mmap_cpu_mask	affinity;
101 };
102 
103 struct record_thread {
104 	pid_t			tid;
105 	struct thread_mask	*mask;
106 	struct {
107 		int		msg[2];
108 		int		ack[2];
109 	} pipes;
110 	struct fdarray		pollfd;
111 	int			ctlfd_pos;
112 	int			nr_mmaps;
113 	struct mmap		**maps;
114 	struct mmap		**overwrite_maps;
115 	struct record		*rec;
116 	unsigned long long	samples;
117 	unsigned long		waking;
118 	u64			bytes_written;
119 	u64			bytes_transferred;
120 	u64			bytes_compressed;
121 };
122 
/* Thread-local pointer to the record_thread context of the running thread. */
static __thread struct record_thread *thread;

/* Message identifiers; THREAD_MSG__MAX is the number of entries. */
enum thread_msg {
	THREAD_MSG__UNDEFINED = 0,
	THREAD_MSG__READY,
	THREAD_MSG__MAX,
};

/* Printable names, indexed by enum thread_msg. */
static const char *thread_msg_tags[THREAD_MSG__MAX] = {
	[THREAD_MSG__UNDEFINED]	= "UNDEFINED",
	[THREAD_MSG__READY]	= "READY",
};
134 
/* Thread specification kinds; THREAD_SPEC__MAX is the number of entries. */
enum thread_spec {
	THREAD_SPEC__UNDEFINED = 0,
	THREAD_SPEC__CPU,
	THREAD_SPEC__CORE,
	THREAD_SPEC__PACKAGE,
	THREAD_SPEC__NUMA,
	THREAD_SPEC__USER,
	THREAD_SPEC__MAX,
};

/* Printable names, indexed by enum thread_spec. */
static const char *thread_spec_tags[THREAD_SPEC__MAX] = {
	[THREAD_SPEC__UNDEFINED]	= "undefined",
	[THREAD_SPEC__CPU]		= "cpu",
	[THREAD_SPEC__CORE]		= "core",
	[THREAD_SPEC__PACKAGE]		= "package",
	[THREAD_SPEC__NUMA]		= "numa",
	[THREAD_SPEC__USER]		= "user",
};
148 
/*
 * Associates one slot of the evlist pollfd array with the slot holding its
 * duplicate in a thread's pollfd array (see
 * record__map_thread_evlist_pollfd_indexes()).
 */
struct pollfd_index_map {
	int evlist_pollfd_index;	/* index into evlist->core.pollfd */
	int thread_pollfd_index;	/* index into thread_data->pollfd */
};
153 
154 struct record {
155 	struct perf_tool	tool;
156 	struct record_opts	opts;
157 	u64			bytes_written;
158 	u64			thread_bytes_written;
159 	struct perf_data	data;
160 	struct auxtrace_record	*itr;
161 	struct evlist	*evlist;
162 	struct perf_session	*session;
163 	struct evlist		*sb_evlist;
164 	pthread_t		thread_id;
165 	int			realtime_prio;
166 	bool			switch_output_event_set;
167 	bool			no_buildid;
168 	bool			no_buildid_set;
169 	bool			no_buildid_cache;
170 	bool			no_buildid_cache_set;
171 	bool			buildid_all;
172 	bool			buildid_mmap;
173 	bool			timestamp_filename;
174 	bool			timestamp_boundary;
175 	bool			off_cpu;
176 	struct switch_output	switch_output;
177 	unsigned long long	samples;
178 	unsigned long		output_max_size;	/* = 0: unlimited */
179 	struct perf_debuginfod	debuginfod;
180 	int			nr_threads;
181 	struct thread_mask	*thread_masks;
182 	struct record_thread	*thread_data;
183 	struct pollfd_index_map	*index_map;
184 	size_t			index_map_sz;
185 	size_t			index_map_cnt;
186 };
187 
188 static volatile int done;
189 
190 static volatile int auxtrace_record__snapshot_started;
191 static DEFINE_TRIGGER(auxtrace_snapshot_trigger);
192 static DEFINE_TRIGGER(switch_output_trigger);
193 
194 static const char *affinity_tags[PERF_AFFINITY_MAX] = {
195 	"SYS", "NODE", "CPU"
196 };
197 
#ifndef HAVE_GETTID
/*
 * Fallback used when HAVE_GETTID is not defined: obtain the caller's
 * thread id through the raw gettid system call.
 */
static inline pid_t gettid(void)
{
	long tid = syscall(__NR_gettid);

	return (pid_t)tid;
}
#endif
204 
205 static int record__threads_enabled(struct record *rec)
206 {
207 	return rec->opts.threads_spec;
208 }
209 
210 static bool switch_output_signal(struct record *rec)
211 {
212 	return rec->switch_output.signal &&
213 	       trigger_is_ready(&switch_output_trigger);
214 }
215 
216 static bool switch_output_size(struct record *rec)
217 {
218 	return rec->switch_output.size &&
219 	       trigger_is_ready(&switch_output_trigger) &&
220 	       (rec->bytes_written >= rec->switch_output.size);
221 }
222 
223 static bool switch_output_time(struct record *rec)
224 {
225 	return rec->switch_output.time &&
226 	       trigger_is_ready(&switch_output_trigger);
227 }
228 
229 static u64 record__bytes_written(struct record *rec)
230 {
231 	return rec->bytes_written + rec->thread_bytes_written;
232 }
233 
234 static bool record__output_max_size_exceeded(struct record *rec)
235 {
236 	return rec->output_max_size &&
237 	       (record__bytes_written(rec) >= rec->output_max_size);
238 }
239 
240 static int record__write(struct record *rec, struct mmap *map __maybe_unused,
241 			 void *bf, size_t size)
242 {
243 	struct perf_data_file *file = &rec->session->data->file;
244 
245 	if (map && map->file)
246 		file = map->file;
247 
248 	if (perf_data_file__write(file, bf, size) < 0) {
249 		pr_err("failed to write perf data, error: %m\n");
250 		return -1;
251 	}
252 
253 	if (map && map->file) {
254 		thread->bytes_written += size;
255 		rec->thread_bytes_written += size;
256 	} else {
257 		rec->bytes_written += size;
258 	}
259 
260 	if (record__output_max_size_exceeded(rec) && !done) {
261 		fprintf(stderr, "[ perf record: perf size limit reached (%" PRIu64 " KB),"
262 				" stopping session ]\n",
263 				record__bytes_written(rec) >> 10);
264 		done = 1;
265 	}
266 
267 	if (switch_output_size(rec))
268 		trigger_hit(&switch_output_trigger);
269 
270 	return 0;
271 }
272 
/* Forward declarations for helpers defined later in this file. */
static int record__aio_enabled(struct record *rec);
static int record__comp_enabled(struct record *rec);
static size_t zstd_compress(struct perf_session *session,
			    struct mmap *map,
			    void *dst, size_t dst_size,
			    void *src, size_t src_size);
277 
278 #ifdef HAVE_AIO_SUPPORT
/*
 * Fill in @cblock and queue an asynchronous write of @size bytes from @buf
 * to @trace_fd at offset @off.
 *
 * Queuing is retried for as long as aio_write() fails with EAGAIN.  On any
 * other failure the control block is marked unused (aio_fildes = -1) and
 * the error is logged.
 *
 * Returns the last aio_write() result: 0 when queued, non-zero on failure.
 */
static int record__aio_write(struct aiocb *cblock, int trace_fd,
		void *buf, size_t size, off_t off)
{
	int rc;

	cblock->aio_fildes = trace_fd;
	cblock->aio_buf    = buf;
	cblock->aio_nbytes = size;
	cblock->aio_offset = off;
	cblock->aio_sigevent.sigev_notify = SIGEV_NONE;

	for (;;) {
		rc = aio_write(cblock);
		if (rc == 0)
			break;

		if (errno == EAGAIN)
			continue;

		cblock->aio_fildes = -1;
		pr_err("failed to queue perf data, error: %m\n");
		break;
	}

	return rc;
}
303 
304 static int record__aio_complete(struct mmap *md, struct aiocb *cblock)
305 {
306 	void *rem_buf;
307 	off_t rem_off;
308 	size_t rem_size;
309 	int rc, aio_errno;
310 	ssize_t aio_ret, written;
311 
312 	aio_errno = aio_error(cblock);
313 	if (aio_errno == EINPROGRESS)
314 		return 0;
315 
316 	written = aio_ret = aio_return(cblock);
317 	if (aio_ret < 0) {
318 		if (aio_errno != EINTR)
319 			pr_err("failed to write perf data, error: %m\n");
320 		written = 0;
321 	}
322 
323 	rem_size = cblock->aio_nbytes - written;
324 
325 	if (rem_size == 0) {
326 		cblock->aio_fildes = -1;
327 		/*
328 		 * md->refcount is incremented in record__aio_pushfn() for
329 		 * every aio write request started in record__aio_push() so
330 		 * decrement it because the request is now complete.
331 		 */
332 		perf_mmap__put(&md->core);
333 		rc = 1;
334 	} else {
335 		/*
336 		 * aio write request may require restart with the
337 		 * reminder if the kernel didn't write whole
338 		 * chunk at once.
339 		 */
340 		rem_off = cblock->aio_offset + written;
341 		rem_buf = (void *)(cblock->aio_buf + written);
342 		record__aio_write(cblock, cblock->aio_fildes,
343 				rem_buf, rem_size, rem_off);
344 		rc = 0;
345 	}
346 
347 	return rc;
348 }
349 
350 static int record__aio_sync(struct mmap *md, bool sync_all)
351 {
352 	struct aiocb **aiocb = md->aio.aiocb;
353 	struct aiocb *cblocks = md->aio.cblocks;
354 	struct timespec timeout = { 0, 1000 * 1000  * 1 }; /* 1ms */
355 	int i, do_suspend;
356 
357 	do {
358 		do_suspend = 0;
359 		for (i = 0; i < md->aio.nr_cblocks; ++i) {
360 			if (cblocks[i].aio_fildes == -1 || record__aio_complete(md, &cblocks[i])) {
361 				if (sync_all)
362 					aiocb[i] = NULL;
363 				else
364 					return i;
365 			} else {
366 				/*
367 				 * Started aio write is not complete yet
368 				 * so it has to be waited before the
369 				 * next allocation.
370 				 */
371 				aiocb[i] = &cblocks[i];
372 				do_suspend = 1;
373 			}
374 		}
375 		if (!do_suspend)
376 			return -1;
377 
378 		while (aio_suspend((const struct aiocb **)aiocb, md->aio.nr_cblocks, &timeout)) {
379 			if (!(errno == EAGAIN || errno == EINTR))
380 				pr_err("failed to sync perf data, error: %m\n");
381 		}
382 	} while (1);
383 }
384 
/* Cursor handed to record__aio_pushfn() through perf_mmap__push(). */
struct record_aio {
	struct record *rec;	/* owning record session */
	void *data;		/* destination buffer (one of map->aio.data[]) */
	size_t size;		/* bytes already stored in @data */
};
390 
391 static int record__aio_pushfn(struct mmap *map, void *to, void *buf, size_t size)
392 {
393 	struct record_aio *aio = to;
394 
395 	/*
396 	 * map->core.base data pointed by buf is copied into free map->aio.data[] buffer
397 	 * to release space in the kernel buffer as fast as possible, calling
398 	 * perf_mmap__consume() from perf_mmap__push() function.
399 	 *
400 	 * That lets the kernel to proceed with storing more profiling data into
401 	 * the kernel buffer earlier than other per-cpu kernel buffers are handled.
402 	 *
403 	 * Coping can be done in two steps in case the chunk of profiling data
404 	 * crosses the upper bound of the kernel buffer. In this case we first move
405 	 * part of data from map->start till the upper bound and then the reminder
406 	 * from the beginning of the kernel buffer till the end of the data chunk.
407 	 */
408 
409 	if (record__comp_enabled(aio->rec)) {
410 		size = zstd_compress(aio->rec->session, NULL, aio->data + aio->size,
411 				     mmap__mmap_len(map) - aio->size,
412 				     buf, size);
413 	} else {
414 		memcpy(aio->data + aio->size, buf, size);
415 	}
416 
417 	if (!aio->size) {
418 		/*
419 		 * Increment map->refcount to guard map->aio.data[] buffer
420 		 * from premature deallocation because map object can be
421 		 * released earlier than aio write request started on
422 		 * map->aio.data[] buffer is complete.
423 		 *
424 		 * perf_mmap__put() is done at record__aio_complete()
425 		 * after started aio request completion or at record__aio_push()
426 		 * if the request failed to start.
427 		 */
428 		perf_mmap__get(&map->core);
429 	}
430 
431 	aio->size += size;
432 
433 	return size;
434 }
435 
436 static int record__aio_push(struct record *rec, struct mmap *map, off_t *off)
437 {
438 	int ret, idx;
439 	int trace_fd = rec->session->data->file.fd;
440 	struct record_aio aio = { .rec = rec, .size = 0 };
441 
442 	/*
443 	 * Call record__aio_sync() to wait till map->aio.data[] buffer
444 	 * becomes available after previous aio write operation.
445 	 */
446 
447 	idx = record__aio_sync(map, false);
448 	aio.data = map->aio.data[idx];
449 	ret = perf_mmap__push(map, &aio, record__aio_pushfn);
450 	if (ret != 0) /* ret > 0 - no data, ret < 0 - error */
451 		return ret;
452 
453 	rec->samples++;
454 	ret = record__aio_write(&(map->aio.cblocks[idx]), trace_fd, aio.data, aio.size, *off);
455 	if (!ret) {
456 		*off += aio.size;
457 		rec->bytes_written += aio.size;
458 		if (switch_output_size(rec))
459 			trigger_hit(&switch_output_trigger);
460 	} else {
461 		/*
462 		 * Decrement map->refcount incremented in record__aio_pushfn()
463 		 * back if record__aio_write() operation failed to start, otherwise
464 		 * map->refcount is decremented in record__aio_complete() after
465 		 * aio write operation finishes successfully.
466 		 */
467 		perf_mmap__put(&map->core);
468 	}
469 
470 	return ret;
471 }
472 
/* Return the current file offset of @trace_fd as reported by lseek(). */
static off_t record__aio_get_pos(int trace_fd)
{
	off_t pos = lseek(trace_fd, 0, SEEK_CUR);

	return pos;
}
477 
/* Move the file offset of @trace_fd to @pos; the lseek() result is not checked. */
static void record__aio_set_pos(int trace_fd, off_t pos)
{
	lseek(trace_fd, pos, SEEK_SET);
}
482 
483 static void record__aio_mmap_read_sync(struct record *rec)
484 {
485 	int i;
486 	struct evlist *evlist = rec->evlist;
487 	struct mmap *maps = evlist->mmap;
488 
489 	if (!record__aio_enabled(rec))
490 		return;
491 
492 	for (i = 0; i < evlist->core.nr_mmaps; i++) {
493 		struct mmap *map = &maps[i];
494 
495 		if (map->core.base)
496 			record__aio_sync(map, true);
497 	}
498 }
499 
/* Control block count used by record__aio_parse() when none (or 0) is given. */
static int nr_cblocks_default = 1;
/* Maximum control block count; not referenced in the code visible here. */
static int nr_cblocks_max = 4;
502 
503 static int record__aio_parse(const struct option *opt,
504 			     const char *str,
505 			     int unset)
506 {
507 	struct record_opts *opts = (struct record_opts *)opt->value;
508 
509 	if (unset) {
510 		opts->nr_cblocks = 0;
511 	} else {
512 		if (str)
513 			opts->nr_cblocks = strtol(str, NULL, 0);
514 		if (!opts->nr_cblocks)
515 			opts->nr_cblocks = nr_cblocks_default;
516 	}
517 
518 	return 0;
519 }
520 #else /* HAVE_AIO_SUPPORT */
/* Built without HAVE_AIO_SUPPORT: no aio control blocks are available. */
static int nr_cblocks_max;
522 
523 static int record__aio_push(struct record *rec __maybe_unused, struct mmap *map __maybe_unused,
524 			    off_t *off __maybe_unused)
525 {
526 	return -1;
527 }
528 
529 static off_t record__aio_get_pos(int trace_fd __maybe_unused)
530 {
531 	return -1;
532 }
533 
534 static void record__aio_set_pos(int trace_fd __maybe_unused, off_t pos __maybe_unused)
535 {
536 }
537 
538 static void record__aio_mmap_read_sync(struct record *rec __maybe_unused)
539 {
540 }
541 #endif
542 
543 static int record__aio_enabled(struct record *rec)
544 {
545 	return rec->opts.nr_cblocks > 0;
546 }
547 
548 #define MMAP_FLUSH_DEFAULT 1
549 static int record__mmap_flush_parse(const struct option *opt,
550 				    const char *str,
551 				    int unset)
552 {
553 	int flush_max;
554 	struct record_opts *opts = (struct record_opts *)opt->value;
555 	static struct parse_tag tags[] = {
556 			{ .tag  = 'B', .mult = 1       },
557 			{ .tag  = 'K', .mult = 1 << 10 },
558 			{ .tag  = 'M', .mult = 1 << 20 },
559 			{ .tag  = 'G', .mult = 1 << 30 },
560 			{ .tag  = 0 },
561 	};
562 
563 	if (unset)
564 		return 0;
565 
566 	if (str) {
567 		opts->mmap_flush = parse_tag_value(str, tags);
568 		if (opts->mmap_flush == (int)-1)
569 			opts->mmap_flush = strtol(str, NULL, 0);
570 	}
571 
572 	if (!opts->mmap_flush)
573 		opts->mmap_flush = MMAP_FLUSH_DEFAULT;
574 
575 	flush_max = evlist__mmap_size(opts->mmap_pages);
576 	flush_max /= 4;
577 	if (opts->mmap_flush > flush_max)
578 		opts->mmap_flush = flush_max;
579 
580 	return 0;
581 }
582 
583 #ifdef HAVE_ZSTD_SUPPORT
584 static unsigned int comp_level_default = 1;
585 
586 static int record__parse_comp_level(const struct option *opt, const char *str, int unset)
587 {
588 	struct record_opts *opts = opt->value;
589 
590 	if (unset) {
591 		opts->comp_level = 0;
592 	} else {
593 		if (str)
594 			opts->comp_level = strtol(str, NULL, 0);
595 		if (!opts->comp_level)
596 			opts->comp_level = comp_level_default;
597 	}
598 
599 	return 0;
600 }
601 #endif
/* Upper bound for the compression level; not referenced in the code visible here. */
static unsigned int comp_level_max = 22;
603 
604 static int record__comp_enabled(struct record *rec)
605 {
606 	return rec->opts.comp_level > 0;
607 }
608 
609 static int process_synthesized_event(struct perf_tool *tool,
610 				     union perf_event *event,
611 				     struct perf_sample *sample __maybe_unused,
612 				     struct machine *machine __maybe_unused)
613 {
614 	struct record *rec = container_of(tool, struct record, tool);
615 	return record__write(rec, NULL, event, event->header.size);
616 }
617 
618 static struct mutex synth_lock;
619 
620 static int process_locked_synthesized_event(struct perf_tool *tool,
621 				     union perf_event *event,
622 				     struct perf_sample *sample __maybe_unused,
623 				     struct machine *machine __maybe_unused)
624 {
625 	int ret;
626 
627 	mutex_lock(&synth_lock);
628 	ret = process_synthesized_event(tool, event, sample, machine);
629 	mutex_unlock(&synth_lock);
630 	return ret;
631 }
632 
633 static int record__pushfn(struct mmap *map, void *to, void *bf, size_t size)
634 {
635 	struct record *rec = to;
636 
637 	if (record__comp_enabled(rec)) {
638 		size = zstd_compress(rec->session, map, map->data, mmap__mmap_len(map), bf, size);
639 		bf   = map->data;
640 	}
641 
642 	thread->samples++;
643 	return record__write(rec, map, bf, size);
644 }
645 
/* Last signal other than SIGCHLD seen by sig_handler(); -1 while none was seen. */
static volatile sig_atomic_t signr = -1;
/* Set to 1 by sig_handler() on SIGCHLD. */
static volatile sig_atomic_t child_finished;
#ifdef HAVE_EVENTFD_SUPPORT
/* Written by sig_handler() when >= 0, to wake up poll(). */
static volatile sig_atomic_t done_fd = -1;
#endif
651 
652 static void sig_handler(int sig)
653 {
654 	if (sig == SIGCHLD)
655 		child_finished = 1;
656 	else
657 		signr = sig;
658 
659 	done = 1;
660 #ifdef HAVE_EVENTFD_SUPPORT
661 	if (done_fd >= 0) {
662 		u64 tmp = 1;
663 		int orig_errno = errno;
664 
665 		/*
666 		 * It is possible for this signal handler to run after done is
667 		 * checked in the main loop, but before the perf counter fds are
668 		 * polled. If this happens, the poll() will continue to wait
669 		 * even though done is set, and will only break out if either
670 		 * another signal is received, or the counters are ready for
671 		 * read. To ensure the poll() doesn't sleep when done is set,
672 		 * use an eventfd (done_fd) to wake up the poll().
673 		 */
674 		if (write(done_fd, &tmp, sizeof(tmp)) < 0)
675 			pr_err("failed to signal wakeup fd, error: %m\n");
676 
677 		errno = orig_errno;
678 	}
679 #endif // HAVE_EVENTFD_SUPPORT
680 }
681 
/* SIGSEGV handler: run the perf-hooks recovery, then dump the stack. */
static void sigsegv_handler(int sig)
{
	perf_hooks__recover();

	sighandler_dump_stack(sig);
}
687 
688 static void record__sig_exit(void)
689 {
690 	if (signr == -1)
691 		return;
692 
693 	signal(signr, SIG_DFL);
694 	raise(signr);
695 }
696 
697 #ifdef HAVE_AUXTRACE_SUPPORT
698 
699 static int record__process_auxtrace(struct perf_tool *tool,
700 				    struct mmap *map,
701 				    union perf_event *event, void *data1,
702 				    size_t len1, void *data2, size_t len2)
703 {
704 	struct record *rec = container_of(tool, struct record, tool);
705 	struct perf_data *data = &rec->data;
706 	size_t padding;
707 	u8 pad[8] = {0};
708 
709 	if (!perf_data__is_pipe(data) && perf_data__is_single_file(data)) {
710 		off_t file_offset;
711 		int fd = perf_data__fd(data);
712 		int err;
713 
714 		file_offset = lseek(fd, 0, SEEK_CUR);
715 		if (file_offset == -1)
716 			return -1;
717 		err = auxtrace_index__auxtrace_event(&rec->session->auxtrace_index,
718 						     event, file_offset);
719 		if (err)
720 			return err;
721 	}
722 
723 	/* event.auxtrace.size includes padding, see __auxtrace_mmap__read() */
724 	padding = (len1 + len2) & 7;
725 	if (padding)
726 		padding = 8 - padding;
727 
728 	record__write(rec, map, event, event->header.size);
729 	record__write(rec, map, data1, len1);
730 	if (len2)
731 		record__write(rec, map, data2, len2);
732 	record__write(rec, map, &pad, padding);
733 
734 	return 0;
735 }
736 
737 static int record__auxtrace_mmap_read(struct record *rec,
738 				      struct mmap *map)
739 {
740 	int ret;
741 
742 	ret = auxtrace_mmap__read(map, rec->itr, &rec->tool,
743 				  record__process_auxtrace);
744 	if (ret < 0)
745 		return ret;
746 
747 	if (ret)
748 		rec->samples++;
749 
750 	return 0;
751 }
752 
753 static int record__auxtrace_mmap_read_snapshot(struct record *rec,
754 					       struct mmap *map)
755 {
756 	int ret;
757 
758 	ret = auxtrace_mmap__read_snapshot(map, rec->itr, &rec->tool,
759 					   record__process_auxtrace,
760 					   rec->opts.auxtrace_snapshot_size);
761 	if (ret < 0)
762 		return ret;
763 
764 	if (ret)
765 		rec->samples++;
766 
767 	return 0;
768 }
769 
770 static int record__auxtrace_read_snapshot_all(struct record *rec)
771 {
772 	int i;
773 	int rc = 0;
774 
775 	for (i = 0; i < rec->evlist->core.nr_mmaps; i++) {
776 		struct mmap *map = &rec->evlist->mmap[i];
777 
778 		if (!map->auxtrace_mmap.base)
779 			continue;
780 
781 		if (record__auxtrace_mmap_read_snapshot(rec, map) != 0) {
782 			rc = -1;
783 			goto out;
784 		}
785 	}
786 out:
787 	return rc;
788 }
789 
790 static void record__read_auxtrace_snapshot(struct record *rec, bool on_exit)
791 {
792 	pr_debug("Recording AUX area tracing snapshot\n");
793 	if (record__auxtrace_read_snapshot_all(rec) < 0) {
794 		trigger_error(&auxtrace_snapshot_trigger);
795 	} else {
796 		if (auxtrace_record__snapshot_finish(rec->itr, on_exit))
797 			trigger_error(&auxtrace_snapshot_trigger);
798 		else
799 			trigger_ready(&auxtrace_snapshot_trigger);
800 	}
801 }
802 
803 static int record__auxtrace_snapshot_exit(struct record *rec)
804 {
805 	if (trigger_is_error(&auxtrace_snapshot_trigger))
806 		return 0;
807 
808 	if (!auxtrace_record__snapshot_started &&
809 	    auxtrace_record__snapshot_start(rec->itr))
810 		return -1;
811 
812 	record__read_auxtrace_snapshot(rec, true);
813 	if (trigger_is_error(&auxtrace_snapshot_trigger))
814 		return -1;
815 
816 	return 0;
817 }
818 
819 static int record__auxtrace_init(struct record *rec)
820 {
821 	int err;
822 
823 	if ((rec->opts.auxtrace_snapshot_opts || rec->opts.auxtrace_sample_opts)
824 	    && record__threads_enabled(rec)) {
825 		pr_err("AUX area tracing options are not available in parallel streaming mode.\n");
826 		return -EINVAL;
827 	}
828 
829 	if (!rec->itr) {
830 		rec->itr = auxtrace_record__init(rec->evlist, &err);
831 		if (err)
832 			return err;
833 	}
834 
835 	err = auxtrace_parse_snapshot_options(rec->itr, &rec->opts,
836 					      rec->opts.auxtrace_snapshot_opts);
837 	if (err)
838 		return err;
839 
840 	err = auxtrace_parse_sample_options(rec->itr, rec->evlist, &rec->opts,
841 					    rec->opts.auxtrace_sample_opts);
842 	if (err)
843 		return err;
844 
845 	auxtrace_regroup_aux_output(rec->evlist);
846 
847 	return auxtrace_parse_filters(rec->evlist);
848 }
849 
850 #else
851 
852 static inline
853 int record__auxtrace_mmap_read(struct record *rec __maybe_unused,
854 			       struct mmap *map __maybe_unused)
855 {
856 	return 0;
857 }
858 
859 static inline
860 void record__read_auxtrace_snapshot(struct record *rec __maybe_unused,
861 				    bool on_exit __maybe_unused)
862 {
863 }
864 
865 static inline
866 int auxtrace_record__snapshot_start(struct auxtrace_record *itr __maybe_unused)
867 {
868 	return 0;
869 }
870 
871 static inline
872 int record__auxtrace_snapshot_exit(struct record *rec __maybe_unused)
873 {
874 	return 0;
875 }
876 
877 static int record__auxtrace_init(struct record *rec __maybe_unused)
878 {
879 	return 0;
880 }
881 
882 #endif
883 
884 static int record__config_text_poke(struct evlist *evlist)
885 {
886 	struct evsel *evsel;
887 
888 	/* Nothing to do if text poke is already configured */
889 	evlist__for_each_entry(evlist, evsel) {
890 		if (evsel->core.attr.text_poke)
891 			return 0;
892 	}
893 
894 	evsel = evlist__add_dummy_on_all_cpus(evlist);
895 	if (!evsel)
896 		return -ENOMEM;
897 
898 	evsel->core.attr.text_poke = 1;
899 	evsel->core.attr.ksymbol = 1;
900 	evsel->immediate = true;
901 	evsel__set_sample_bit(evsel, TIME);
902 
903 	return 0;
904 }
905 
906 static int record__config_off_cpu(struct record *rec)
907 {
908 	return off_cpu_prepare(rec->evlist, &rec->opts.target, &rec->opts);
909 }
910 
911 static bool record__kcore_readable(struct machine *machine)
912 {
913 	char kcore[PATH_MAX];
914 	int fd;
915 
916 	scnprintf(kcore, sizeof(kcore), "%s/proc/kcore", machine->root_dir);
917 
918 	fd = open(kcore, O_RDONLY);
919 	if (fd < 0)
920 		return false;
921 
922 	close(fd);
923 
924 	return true;
925 }
926 
927 static int record__kcore_copy(struct machine *machine, struct perf_data *data)
928 {
929 	char from_dir[PATH_MAX];
930 	char kcore_dir[PATH_MAX];
931 	int ret;
932 
933 	snprintf(from_dir, sizeof(from_dir), "%s/proc", machine->root_dir);
934 
935 	ret = perf_data__make_kcore_dir(data, kcore_dir, sizeof(kcore_dir));
936 	if (ret)
937 		return ret;
938 
939 	return kcore_copy(from_dir, kcore_dir);
940 }
941 
942 static void record__thread_data_init_pipes(struct record_thread *thread_data)
943 {
944 	thread_data->pipes.msg[0] = -1;
945 	thread_data->pipes.msg[1] = -1;
946 	thread_data->pipes.ack[0] = -1;
947 	thread_data->pipes.ack[1] = -1;
948 }
949 
950 static int record__thread_data_open_pipes(struct record_thread *thread_data)
951 {
952 	if (pipe(thread_data->pipes.msg))
953 		return -EINVAL;
954 
955 	if (pipe(thread_data->pipes.ack)) {
956 		close(thread_data->pipes.msg[0]);
957 		thread_data->pipes.msg[0] = -1;
958 		close(thread_data->pipes.msg[1]);
959 		thread_data->pipes.msg[1] = -1;
960 		return -EINVAL;
961 	}
962 
963 	pr_debug2("thread_data[%p]: msg=[%d,%d], ack=[%d,%d]\n", thread_data,
964 		 thread_data->pipes.msg[0], thread_data->pipes.msg[1],
965 		 thread_data->pipes.ack[0], thread_data->pipes.ack[1]);
966 
967 	return 0;
968 }
969 
970 static void record__thread_data_close_pipes(struct record_thread *thread_data)
971 {
972 	if (thread_data->pipes.msg[0] != -1) {
973 		close(thread_data->pipes.msg[0]);
974 		thread_data->pipes.msg[0] = -1;
975 	}
976 	if (thread_data->pipes.msg[1] != -1) {
977 		close(thread_data->pipes.msg[1]);
978 		thread_data->pipes.msg[1] = -1;
979 	}
980 	if (thread_data->pipes.ack[0] != -1) {
981 		close(thread_data->pipes.ack[0]);
982 		thread_data->pipes.ack[0] = -1;
983 	}
984 	if (thread_data->pipes.ack[1] != -1) {
985 		close(thread_data->pipes.ack[1]);
986 		thread_data->pipes.ack[1] = -1;
987 	}
988 }
989 
990 static bool evlist__per_thread(struct evlist *evlist)
991 {
992 	return cpu_map__is_dummy(evlist->core.user_requested_cpus);
993 }
994 
995 static int record__thread_data_init_maps(struct record_thread *thread_data, struct evlist *evlist)
996 {
997 	int m, tm, nr_mmaps = evlist->core.nr_mmaps;
998 	struct mmap *mmap = evlist->mmap;
999 	struct mmap *overwrite_mmap = evlist->overwrite_mmap;
1000 	struct perf_cpu_map *cpus = evlist->core.all_cpus;
1001 	bool per_thread = evlist__per_thread(evlist);
1002 
1003 	if (per_thread)
1004 		thread_data->nr_mmaps = nr_mmaps;
1005 	else
1006 		thread_data->nr_mmaps = bitmap_weight(thread_data->mask->maps.bits,
1007 						      thread_data->mask->maps.nbits);
1008 	if (mmap) {
1009 		thread_data->maps = zalloc(thread_data->nr_mmaps * sizeof(struct mmap *));
1010 		if (!thread_data->maps)
1011 			return -ENOMEM;
1012 	}
1013 	if (overwrite_mmap) {
1014 		thread_data->overwrite_maps = zalloc(thread_data->nr_mmaps * sizeof(struct mmap *));
1015 		if (!thread_data->overwrite_maps) {
1016 			zfree(&thread_data->maps);
1017 			return -ENOMEM;
1018 		}
1019 	}
1020 	pr_debug2("thread_data[%p]: nr_mmaps=%d, maps=%p, ow_maps=%p\n", thread_data,
1021 		 thread_data->nr_mmaps, thread_data->maps, thread_data->overwrite_maps);
1022 
1023 	for (m = 0, tm = 0; m < nr_mmaps && tm < thread_data->nr_mmaps; m++) {
1024 		if (per_thread ||
1025 		    test_bit(perf_cpu_map__cpu(cpus, m).cpu, thread_data->mask->maps.bits)) {
1026 			if (thread_data->maps) {
1027 				thread_data->maps[tm] = &mmap[m];
1028 				pr_debug2("thread_data[%p]: cpu%d: maps[%d] -> mmap[%d]\n",
1029 					  thread_data, perf_cpu_map__cpu(cpus, m).cpu, tm, m);
1030 			}
1031 			if (thread_data->overwrite_maps) {
1032 				thread_data->overwrite_maps[tm] = &overwrite_mmap[m];
1033 				pr_debug2("thread_data[%p]: cpu%d: ow_maps[%d] -> ow_mmap[%d]\n",
1034 					  thread_data, perf_cpu_map__cpu(cpus, m).cpu, tm, m);
1035 			}
1036 			tm++;
1037 		}
1038 	}
1039 
1040 	return 0;
1041 }
1042 
1043 static int record__thread_data_init_pollfd(struct record_thread *thread_data, struct evlist *evlist)
1044 {
1045 	int f, tm, pos;
1046 	struct mmap *map, *overwrite_map;
1047 
1048 	fdarray__init(&thread_data->pollfd, 64);
1049 
1050 	for (tm = 0; tm < thread_data->nr_mmaps; tm++) {
1051 		map = thread_data->maps ? thread_data->maps[tm] : NULL;
1052 		overwrite_map = thread_data->overwrite_maps ?
1053 				thread_data->overwrite_maps[tm] : NULL;
1054 
1055 		for (f = 0; f < evlist->core.pollfd.nr; f++) {
1056 			void *ptr = evlist->core.pollfd.priv[f].ptr;
1057 
1058 			if ((map && ptr == map) || (overwrite_map && ptr == overwrite_map)) {
1059 				pos = fdarray__dup_entry_from(&thread_data->pollfd, f,
1060 							      &evlist->core.pollfd);
1061 				if (pos < 0)
1062 					return pos;
1063 				pr_debug2("thread_data[%p]: pollfd[%d] <- event_fd=%d\n",
1064 					 thread_data, pos, evlist->core.pollfd.entries[f].fd);
1065 			}
1066 		}
1067 	}
1068 
1069 	return 0;
1070 }
1071 
1072 static void record__free_thread_data(struct record *rec)
1073 {
1074 	int t;
1075 	struct record_thread *thread_data = rec->thread_data;
1076 
1077 	if (thread_data == NULL)
1078 		return;
1079 
1080 	for (t = 0; t < rec->nr_threads; t++) {
1081 		record__thread_data_close_pipes(&thread_data[t]);
1082 		zfree(&thread_data[t].maps);
1083 		zfree(&thread_data[t].overwrite_maps);
1084 		fdarray__exit(&thread_data[t].pollfd);
1085 	}
1086 
1087 	zfree(&rec->thread_data);
1088 }
1089 
1090 static int record__map_thread_evlist_pollfd_indexes(struct record *rec,
1091 						    int evlist_pollfd_index,
1092 						    int thread_pollfd_index)
1093 {
1094 	size_t x = rec->index_map_cnt;
1095 
1096 	if (realloc_array_as_needed(rec->index_map, rec->index_map_sz, x, NULL))
1097 		return -ENOMEM;
1098 	rec->index_map[x].evlist_pollfd_index = evlist_pollfd_index;
1099 	rec->index_map[x].thread_pollfd_index = thread_pollfd_index;
1100 	rec->index_map_cnt += 1;
1101 	return 0;
1102 }
1103 
1104 static int record__update_evlist_pollfd_from_thread(struct record *rec,
1105 						    struct evlist *evlist,
1106 						    struct record_thread *thread_data)
1107 {
1108 	struct pollfd *e_entries = evlist->core.pollfd.entries;
1109 	struct pollfd *t_entries = thread_data->pollfd.entries;
1110 	int err = 0;
1111 	size_t i;
1112 
1113 	for (i = 0; i < rec->index_map_cnt; i++) {
1114 		int e_pos = rec->index_map[i].evlist_pollfd_index;
1115 		int t_pos = rec->index_map[i].thread_pollfd_index;
1116 
1117 		if (e_entries[e_pos].fd != t_entries[t_pos].fd ||
1118 		    e_entries[e_pos].events != t_entries[t_pos].events) {
1119 			pr_err("Thread and evlist pollfd index mismatch\n");
1120 			err = -EINVAL;
1121 			continue;
1122 		}
1123 		e_entries[e_pos].revents = t_entries[t_pos].revents;
1124 	}
1125 	return err;
1126 }
1127 
1128 static int record__dup_non_perf_events(struct record *rec,
1129 				       struct evlist *evlist,
1130 				       struct record_thread *thread_data)
1131 {
1132 	struct fdarray *fda = &evlist->core.pollfd;
1133 	int i, ret;
1134 
1135 	for (i = 0; i < fda->nr; i++) {
1136 		if (!(fda->priv[i].flags & fdarray_flag__non_perf_event))
1137 			continue;
1138 		ret = fdarray__dup_entry_from(&thread_data->pollfd, i, fda);
1139 		if (ret < 0) {
1140 			pr_err("Failed to duplicate descriptor in main thread pollfd\n");
1141 			return ret;
1142 		}
1143 		pr_debug2("thread_data[%p]: pollfd[%d] <- non_perf_event fd=%d\n",
1144 			  thread_data, ret, fda->entries[i].fd);
1145 		ret = record__map_thread_evlist_pollfd_indexes(rec, i, ret);
1146 		if (ret < 0) {
1147 			pr_err("Failed to map thread and evlist pollfd indexes\n");
1148 			return ret;
1149 		}
1150 	}
1151 	return 0;
1152 }
1153 
1154 static int record__alloc_thread_data(struct record *rec, struct evlist *evlist)
1155 {
1156 	int t, ret;
1157 	struct record_thread *thread_data;
1158 
1159 	rec->thread_data = zalloc(rec->nr_threads * sizeof(*(rec->thread_data)));
1160 	if (!rec->thread_data) {
1161 		pr_err("Failed to allocate thread data\n");
1162 		return -ENOMEM;
1163 	}
1164 	thread_data = rec->thread_data;
1165 
1166 	for (t = 0; t < rec->nr_threads; t++)
1167 		record__thread_data_init_pipes(&thread_data[t]);
1168 
1169 	for (t = 0; t < rec->nr_threads; t++) {
1170 		thread_data[t].rec = rec;
1171 		thread_data[t].mask = &rec->thread_masks[t];
1172 		ret = record__thread_data_init_maps(&thread_data[t], evlist);
1173 		if (ret) {
1174 			pr_err("Failed to initialize thread[%d] maps\n", t);
1175 			goto out_free;
1176 		}
1177 		ret = record__thread_data_init_pollfd(&thread_data[t], evlist);
1178 		if (ret) {
1179 			pr_err("Failed to initialize thread[%d] pollfd\n", t);
1180 			goto out_free;
1181 		}
1182 		if (t) {
1183 			thread_data[t].tid = -1;
1184 			ret = record__thread_data_open_pipes(&thread_data[t]);
1185 			if (ret) {
1186 				pr_err("Failed to open thread[%d] communication pipes\n", t);
1187 				goto out_free;
1188 			}
1189 			ret = fdarray__add(&thread_data[t].pollfd, thread_data[t].pipes.msg[0],
1190 					   POLLIN | POLLERR | POLLHUP, fdarray_flag__nonfilterable);
1191 			if (ret < 0) {
1192 				pr_err("Failed to add descriptor to thread[%d] pollfd\n", t);
1193 				goto out_free;
1194 			}
1195 			thread_data[t].ctlfd_pos = ret;
1196 			pr_debug2("thread_data[%p]: pollfd[%d] <- ctl_fd=%d\n",
1197 				 thread_data, thread_data[t].ctlfd_pos,
1198 				 thread_data[t].pipes.msg[0]);
1199 		} else {
1200 			thread_data[t].tid = gettid();
1201 
1202 			ret = record__dup_non_perf_events(rec, evlist, &thread_data[t]);
1203 			if (ret < 0)
1204 				goto out_free;
1205 
1206 			thread_data[t].ctlfd_pos = -1; /* Not used */
1207 		}
1208 	}
1209 
1210 	return 0;
1211 
1212 out_free:
1213 	record__free_thread_data(rec);
1214 
1215 	return ret;
1216 }
1217 
1218 static int record__mmap_evlist(struct record *rec,
1219 			       struct evlist *evlist)
1220 {
1221 	int i, ret;
1222 	struct record_opts *opts = &rec->opts;
1223 	bool auxtrace_overwrite = opts->auxtrace_snapshot_mode ||
1224 				  opts->auxtrace_sample_mode;
1225 	char msg[512];
1226 
1227 	if (opts->affinity != PERF_AFFINITY_SYS)
1228 		cpu__setup_cpunode_map();
1229 
1230 	if (evlist__mmap_ex(evlist, opts->mmap_pages,
1231 				 opts->auxtrace_mmap_pages,
1232 				 auxtrace_overwrite,
1233 				 opts->nr_cblocks, opts->affinity,
1234 				 opts->mmap_flush, opts->comp_level) < 0) {
1235 		if (errno == EPERM) {
1236 			pr_err("Permission error mapping pages.\n"
1237 			       "Consider increasing "
1238 			       "/proc/sys/kernel/perf_event_mlock_kb,\n"
1239 			       "or try again with a smaller value of -m/--mmap_pages.\n"
1240 			       "(current value: %u,%u)\n",
1241 			       opts->mmap_pages, opts->auxtrace_mmap_pages);
1242 			return -errno;
1243 		} else {
1244 			pr_err("failed to mmap with %d (%s)\n", errno,
1245 				str_error_r(errno, msg, sizeof(msg)));
1246 			if (errno)
1247 				return -errno;
1248 			else
1249 				return -EINVAL;
1250 		}
1251 	}
1252 
1253 	if (evlist__initialize_ctlfd(evlist, opts->ctl_fd, opts->ctl_fd_ack))
1254 		return -1;
1255 
1256 	ret = record__alloc_thread_data(rec, evlist);
1257 	if (ret)
1258 		return ret;
1259 
1260 	if (record__threads_enabled(rec)) {
1261 		ret = perf_data__create_dir(&rec->data, evlist->core.nr_mmaps);
1262 		if (ret) {
1263 			pr_err("Failed to create data directory: %s\n", strerror(-ret));
1264 			return ret;
1265 		}
1266 		for (i = 0; i < evlist->core.nr_mmaps; i++) {
1267 			if (evlist->mmap)
1268 				evlist->mmap[i].file = &rec->data.dir.files[i];
1269 			if (evlist->overwrite_mmap)
1270 				evlist->overwrite_mmap[i].file = &rec->data.dir.files[i];
1271 		}
1272 	}
1273 
1274 	return 0;
1275 }
1276 
1277 static int record__mmap(struct record *rec)
1278 {
1279 	return record__mmap_evlist(rec, rec->evlist);
1280 }
1281 
/*
 * Configure and open every event of rec->evlist, apply the event filters and
 * mmap the buffers through record__mmap().
 *
 * Returns 0 on success.  On failure returns -ENOMEM if the dummy tracking
 * event could not be added, -errno of the evsel__open() that failed, -1 if
 * applying a filter failed, or the non-zero result of record__mmap().
 * session->evlist is only assigned on the success path.
 */
static int record__open(struct record *rec)
{
	char msg[BUFSIZ];
	struct evsel *pos;
	struct evlist *evlist = rec->evlist;
	struct perf_session *session = rec->session;
	struct record_opts *opts = &rec->opts;
	int rc = 0;

	/*
	 * For initial_delay, system wide or a hybrid system, we need to add a
	 * dummy event so that we can track PERF_RECORD_MMAP to cover the delay
	 * of waiting or event synthesis.
	 */
	if (opts->target.initial_delay || target__has_cpu(&opts->target) ||
	    perf_pmus__num_core_pmus() > 1) {
		pos = evlist__get_tracking_event(evlist);
		if (!evsel__is_dummy_event(pos)) {
			/* Set up dummy event. */
			if (evlist__add_dummy(evlist))
				return -ENOMEM;
			pos = evlist__last(evlist);
			evlist__set_tracking_event(evlist, pos);
		}

		/*
		 * Enable the dummy event when the process is forked for
		 * initial_delay, immediately for system wide.
		 */
		if (opts->target.initial_delay && !pos->immediate &&
		    !target__has_cpu(&opts->target))
			pos->core.attr.enable_on_exec = 1;
		else
			pos->immediate = 1;
	}

	evlist__config(evlist, opts, &callchain_param);

	evlist__for_each_entry(evlist, pos) {
try_again:
		if (evsel__open(pos, pos->core.cpus, pos->core.threads) < 0) {
			/*
			 * A non-zero evsel__fallback() result means the open
			 * is retried; msg is only shown when verbose.
			 */
			if (evsel__fallback(pos, errno, msg, sizeof(msg))) {
				if (verbose > 0)
					ui__warning("%s\n", msg);
				goto try_again;
			}
			/*
			 * Non-leader member of a weak group: reset the group
			 * and retry from the evsel that
			 * evlist__reset_weak_group() returns.
			 */
			if ((errno == EINVAL || errno == EBADF) &&
			    pos->core.leader != &pos->core &&
			    pos->weak_group) {
			        pos = evlist__reset_weak_group(evlist, pos, true);
				goto try_again;
			}
			rc = -errno;
			evsel__open_strerror(pos, &opts->target, errno, msg, sizeof(msg));
			ui__error("%s\n", msg);
			goto out;
		}

		pos->supported = true;
	}

	if (symbol_conf.kptr_restrict && !evlist__exclude_kernel(evlist)) {
		pr_warning(
"WARNING: Kernel address maps (/proc/{kallsyms,modules}) are restricted,\n"
"check /proc/sys/kernel/kptr_restrict and /proc/sys/kernel/perf_event_paranoid.\n\n"
"Samples in kernel functions may not be resolved if a suitable vmlinux\n"
"file is not found in the buildid cache or in the vmlinux path.\n\n"
"Samples in kernel modules won't be resolved at all.\n\n"
"If some relocation was applied (e.g. kexec) symbols may be misresolved\n"
"even with a suitable vmlinux or kallsyms file.\n\n");
	}

	/* On failure pos is used below as the evsel whose filter failed. */
	if (evlist__apply_filters(evlist, &pos)) {
		pr_err("failed to set filter \"%s\" on event %s with %d (%s)\n",
			pos->filter ?: "BPF", evsel__name(pos), errno,
			str_error_r(errno, msg, sizeof(msg)));
		rc = -1;
		goto out;
	}

	rc = record__mmap(rec);
	if (rc)
		goto out;

	session->evlist = evlist;
	perf_session__set_id_hdr_size(session);
out:
	return rc;
}
1371 
1372 static void set_timestamp_boundary(struct record *rec, u64 sample_time)
1373 {
1374 	if (rec->evlist->first_sample_time == 0)
1375 		rec->evlist->first_sample_time = sample_time;
1376 
1377 	if (sample_time)
1378 		rec->evlist->last_sample_time = sample_time;
1379 }
1380 
1381 static int process_sample_event(struct perf_tool *tool,
1382 				union perf_event *event,
1383 				struct perf_sample *sample,
1384 				struct evsel *evsel,
1385 				struct machine *machine)
1386 {
1387 	struct record *rec = container_of(tool, struct record, tool);
1388 
1389 	set_timestamp_boundary(rec, sample->time);
1390 
1391 	if (rec->buildid_all)
1392 		return 0;
1393 
1394 	rec->samples++;
1395 	return build_id__mark_dso_hit(tool, event, sample, evsel, machine);
1396 }
1397 
1398 static int process_buildids(struct record *rec)
1399 {
1400 	struct perf_session *session = rec->session;
1401 
1402 	if (perf_data__size(&rec->data) == 0)
1403 		return 0;
1404 
1405 	/*
1406 	 * During this process, it'll load kernel map and replace the
1407 	 * dso->long_name to a real pathname it found.  In this case
1408 	 * we prefer the vmlinux path like
1409 	 *   /lib/modules/3.16.4/build/vmlinux
1410 	 *
1411 	 * rather than build-id path (in debug directory).
1412 	 *   $HOME/.debug/.build-id/f0/6e17aa50adf4d00b88925e03775de107611551
1413 	 */
1414 	symbol_conf.ignore_vmlinux_buildid = true;
1415 
1416 	/*
1417 	 * If --buildid-all is given, it marks all DSO regardless of hits,
1418 	 * so no need to process samples. But if timestamp_boundary is enabled,
1419 	 * it still needs to walk on all samples to get the timestamps of
1420 	 * first/last samples.
1421 	 */
1422 	if (rec->buildid_all && !rec->timestamp_boundary)
1423 		rec->tool.sample = NULL;
1424 
1425 	return perf_session__process_events(session);
1426 }
1427 
1428 static void perf_event__synthesize_guest_os(struct machine *machine, void *data)
1429 {
1430 	int err;
1431 	struct perf_tool *tool = data;
1432 	/*
1433 	 *As for guest kernel when processing subcommand record&report,
1434 	 *we arrange module mmap prior to guest kernel mmap and trigger
1435 	 *a preload dso because default guest module symbols are loaded
1436 	 *from guest kallsyms instead of /lib/modules/XXX/XXX. This
1437 	 *method is used to avoid symbol missing when the first addr is
1438 	 *in module instead of in guest kernel.
1439 	 */
1440 	err = perf_event__synthesize_modules(tool, process_synthesized_event,
1441 					     machine);
1442 	if (err < 0)
1443 		pr_err("Couldn't record guest kernel [%d]'s reference"
1444 		       " relocation symbol.\n", machine->pid);
1445 
1446 	/*
1447 	 * We use _stext for guest kernel because guest kernel's /proc/kallsyms
1448 	 * have no _text sometimes.
1449 	 */
1450 	err = perf_event__synthesize_kernel_mmap(tool, process_synthesized_event,
1451 						 machine);
1452 	if (err < 0)
1453 		pr_err("Couldn't record guest kernel [%d]'s reference"
1454 		       " relocation symbol.\n", machine->pid);
1455 }
1456 
1457 static struct perf_event_header finished_round_event = {
1458 	.size = sizeof(struct perf_event_header),
1459 	.type = PERF_RECORD_FINISHED_ROUND,
1460 };
1461 
1462 static struct perf_event_header finished_init_event = {
1463 	.size = sizeof(struct perf_event_header),
1464 	.type = PERF_RECORD_FINISHED_INIT,
1465 };
1466 
1467 static void record__adjust_affinity(struct record *rec, struct mmap *map)
1468 {
1469 	if (rec->opts.affinity != PERF_AFFINITY_SYS &&
1470 	    !bitmap_equal(thread->mask->affinity.bits, map->affinity_mask.bits,
1471 			  thread->mask->affinity.nbits)) {
1472 		bitmap_zero(thread->mask->affinity.bits, thread->mask->affinity.nbits);
1473 		bitmap_or(thread->mask->affinity.bits, thread->mask->affinity.bits,
1474 			  map->affinity_mask.bits, thread->mask->affinity.nbits);
1475 		sched_setaffinity(0, MMAP_CPU_MASK_BYTES(&thread->mask->affinity),
1476 					(cpu_set_t *)thread->mask->affinity.bits);
1477 		if (verbose == 2) {
1478 			pr_debug("threads[%d]: running on cpu%d: ", thread->tid, sched_getcpu());
1479 			mmap_cpu_mask__scnprintf(&thread->mask->affinity, "affinity");
1480 		}
1481 	}
1482 }
1483 
1484 static size_t process_comp_header(void *record, size_t increment)
1485 {
1486 	struct perf_record_compressed *event = record;
1487 	size_t size = sizeof(*event);
1488 
1489 	if (increment) {
1490 		event->header.size += increment;
1491 		return increment;
1492 	}
1493 
1494 	event->header.type = PERF_RECORD_COMPRESSED;
1495 	event->header.size = size;
1496 
1497 	return size;
1498 }
1499 
1500 static size_t zstd_compress(struct perf_session *session, struct mmap *map,
1501 			    void *dst, size_t dst_size, void *src, size_t src_size)
1502 {
1503 	size_t compressed;
1504 	size_t max_record_size = PERF_SAMPLE_MAX_SIZE - sizeof(struct perf_record_compressed) - 1;
1505 	struct zstd_data *zstd_data = &session->zstd_data;
1506 
1507 	if (map && map->file)
1508 		zstd_data = &map->zstd_data;
1509 
1510 	compressed = zstd_compress_stream_to_records(zstd_data, dst, dst_size, src, src_size,
1511 						     max_record_size, process_comp_header);
1512 
1513 	if (map && map->file) {
1514 		thread->bytes_transferred += src_size;
1515 		thread->bytes_compressed  += compressed;
1516 	} else {
1517 		session->bytes_transferred += src_size;
1518 		session->bytes_compressed  += compressed;
1519 	}
1520 
1521 	return compressed;
1522 }
1523 
/*
 * Push the data of the current thread's mmaps to the output.
 *
 * @overwrite selects thread->overwrite_maps instead of thread->maps; in
 * that case nothing is done unless evlist->bkw_mmap_state is
 * BKW_MMAP_DATA_PENDING, and the state is toggled to BKW_MMAP_EMPTY after a
 * pass that did not bail out early.
 * @synch temporarily forces map->core.flush to 1 around each push and
 * restores the previous value afterwards, also on the error paths.
 *
 * Returns 0 when there is nothing to do, -1 if a push or an auxtrace read
 * fails, otherwise 0 or the result of writing the finished-round event.
 */
static int record__mmap_read_evlist(struct record *rec, struct evlist *evlist,
				    bool overwrite, bool synch)
{
	/* Snapshot, to detect below whether this pass wrote anything. */
	u64 bytes_written = rec->bytes_written;
	int i;
	int rc = 0;
	int nr_mmaps;
	struct mmap **maps;
	int trace_fd = rec->data.file.fd;
	off_t off = 0;

	if (!evlist)
		return 0;

	nr_mmaps = thread->nr_mmaps;
	maps = overwrite ? thread->overwrite_maps : thread->maps;

	if (!maps)
		return 0;

	if (overwrite && evlist->bkw_mmap_state != BKW_MMAP_DATA_PENDING)
		return 0;

	if (record__aio_enabled(rec))
		off = record__aio_get_pos(trace_fd);

	for (i = 0; i < nr_mmaps; i++) {
		u64 flush = 0;
		struct mmap *map = maps[i];

		if (map->core.base) {
			record__adjust_affinity(rec, map);
			if (synch) {
				flush = map->core.flush;
				map->core.flush = 1;
			}
			if (!record__aio_enabled(rec)) {
				if (perf_mmap__push(map, rec, record__pushfn) < 0) {
					if (synch)
						map->core.flush = flush;
					rc = -1;
					goto out;
				}
			} else {
				if (record__aio_push(rec, map, &off) < 0) {
					/* Store the position reached so far before bailing out. */
					record__aio_set_pos(trace_fd, off);
					if (synch)
						map->core.flush = flush;
					rc = -1;
					goto out;
				}
			}
			if (synch)
				map->core.flush = flush;
		}

		/* AUX area data is not read here in snapshot or sample mode. */
		if (map->auxtrace_mmap.base && !rec->opts.auxtrace_snapshot_mode &&
		    !rec->opts.auxtrace_sample_mode &&
		    record__auxtrace_mmap_read(rec, map) != 0) {
			rc = -1;
			goto out;
		}
	}

	if (record__aio_enabled(rec))
		record__aio_set_pos(trace_fd, off);

	/*
	 * Mark the round finished in case we wrote
	 * at least one event.
	 *
	 * No need for round events in directory mode,
	 * because per-cpu maps and files have data
	 * sorted by kernel.
	 */
	if (!record__threads_enabled(rec) && bytes_written != rec->bytes_written)
		rc = record__write(rec, NULL, &finished_round_event, sizeof(finished_round_event));

	if (overwrite)
		evlist__toggle_bkw_mmap(evlist, BKW_MMAP_EMPTY);
out:
	return rc;
}
1607 
1608 static int record__mmap_read_all(struct record *rec, bool synch)
1609 {
1610 	int err;
1611 
1612 	err = record__mmap_read_evlist(rec, rec->evlist, false, synch);
1613 	if (err)
1614 		return err;
1615 
1616 	return record__mmap_read_evlist(rec, rec->evlist, true, synch);
1617 }
1618 
1619 static void record__thread_munmap_filtered(struct fdarray *fda, int fd,
1620 					   void *arg __maybe_unused)
1621 {
1622 	struct perf_mmap *map = fda->priv[fd].ptr;
1623 
1624 	if (map)
1625 		perf_mmap__put(map);
1626 }
1627 
/*
 * Body of a recording thread.  @arg is this thread's struct record_thread;
 * it is stored in the file-level 'thread' pointer used by the mmap reading
 * helpers.
 *
 * The thread writes THREAD_MSG__READY to its ack pipe, then loops reading
 * its mmaps and polling its descriptors until a read fails, termination was
 * requested through POLLHUP on the control descriptor, or fdarray__filter()
 * returns 0.  It then does a final synchronous read and writes to the ack
 * pipe once more.  Always returns NULL.
 */
static void *record__thread(void *arg)
{
	enum thread_msg msg = THREAD_MSG__READY;
	bool terminate = false;
	struct fdarray *pollfd;
	int err, ctlfd_pos;

	thread = arg;
	thread->tid = gettid();

	err = write(thread->pipes.ack[1], &msg, sizeof(msg));
	if (err == -1)
		pr_warning("threads[%d]: failed to notify on start: %s\n",
			   thread->tid, strerror(errno));

	pr_debug("threads[%d]: started on cpu%d\n", thread->tid, sched_getcpu());

	pollfd = &thread->pollfd;
	ctlfd_pos = thread->ctlfd_pos;

	for (;;) {
		unsigned long long hits = thread->samples;

		/*
		 * Once terminate is set, one more read is done before
		 * leaving the loop.
		 */
		if (record__mmap_read_all(thread->rec, false) < 0 || terminate)
			break;

		/* Only block in poll when the read above found no new samples. */
		if (hits == thread->samples) {

			err = fdarray__poll(pollfd, -1);
			/*
			 * Propagate error, only if there's any. Ignore positive
			 * number of returned events and interrupt error.
			 */
			/*
			 * NOTE(review): err is normalized here but is not
			 * examined again before being overwritten below.
			 */
			if (err > 0 || (err < 0 && errno == EINTR))
				err = 0;
			thread->waking++;

			if (fdarray__filter(pollfd, POLLERR | POLLHUP,
					    record__thread_munmap_filtered, NULL) == 0)
				break;
		}

		/* POLLHUP on the control descriptor requests termination. */
		if (pollfd->entries[ctlfd_pos].revents & POLLHUP) {
			terminate = true;
			close(thread->pipes.msg[0]);
			thread->pipes.msg[0] = -1;
			pollfd->entries[ctlfd_pos].fd = -1;
			pollfd->entries[ctlfd_pos].events = 0;
		}

		pollfd->entries[ctlfd_pos].revents = 0;
	}
	/* Final read with synch set. */
	record__mmap_read_all(thread->rec, true);

	/* msg still holds THREAD_MSG__READY here. */
	err = write(thread->pipes.ack[1], &msg, sizeof(msg));
	if (err == -1)
		pr_warning("threads[%d]: failed to notify on termination: %s\n",
			   thread->tid, strerror(errno));

	return NULL;
}
1689 
1690 static void record__init_features(struct record *rec)
1691 {
1692 	struct perf_session *session = rec->session;
1693 	int feat;
1694 
1695 	for (feat = HEADER_FIRST_FEATURE; feat < HEADER_LAST_FEATURE; feat++)
1696 		perf_header__set_feat(&session->header, feat);
1697 
1698 	if (rec->no_buildid)
1699 		perf_header__clear_feat(&session->header, HEADER_BUILD_ID);
1700 
1701 #ifdef HAVE_LIBTRACEEVENT
1702 	if (!have_tracepoints(&rec->evlist->core.entries))
1703 		perf_header__clear_feat(&session->header, HEADER_TRACING_DATA);
1704 #endif
1705 
1706 	if (!rec->opts.branch_stack)
1707 		perf_header__clear_feat(&session->header, HEADER_BRANCH_STACK);
1708 
1709 	if (!rec->opts.full_auxtrace)
1710 		perf_header__clear_feat(&session->header, HEADER_AUXTRACE);
1711 
1712 	if (!(rec->opts.use_clockid && rec->opts.clockid_res_ns))
1713 		perf_header__clear_feat(&session->header, HEADER_CLOCKID);
1714 
1715 	if (!rec->opts.use_clockid)
1716 		perf_header__clear_feat(&session->header, HEADER_CLOCK_DATA);
1717 
1718 	if (!record__threads_enabled(rec))
1719 		perf_header__clear_feat(&session->header, HEADER_DIR_FORMAT);
1720 
1721 	if (!record__comp_enabled(rec))
1722 		perf_header__clear_feat(&session->header, HEADER_COMPRESSED);
1723 
1724 	perf_header__clear_feat(&session->header, HEADER_STAT);
1725 }
1726 
1727 static void
1728 record__finish_output(struct record *rec)
1729 {
1730 	int i;
1731 	struct perf_data *data = &rec->data;
1732 	int fd = perf_data__fd(data);
1733 
1734 	if (data->is_pipe)
1735 		return;
1736 
1737 	rec->session->header.data_size += rec->bytes_written;
1738 	data->file.size = lseek(perf_data__fd(data), 0, SEEK_CUR);
1739 	if (record__threads_enabled(rec)) {
1740 		for (i = 0; i < data->dir.nr; i++)
1741 			data->dir.files[i].size = lseek(data->dir.files[i].fd, 0, SEEK_CUR);
1742 	}
1743 
1744 	if (!rec->no_buildid) {
1745 		process_buildids(rec);
1746 
1747 		if (rec->buildid_all)
1748 			dsos__hit_all(rec->session);
1749 	}
1750 	perf_session__write_header(rec->session, rec->evlist, fd, true);
1751 
1752 	return;
1753 }
1754 
1755 static int record__synthesize_workload(struct record *rec, bool tail)
1756 {
1757 	int err;
1758 	struct perf_thread_map *thread_map;
1759 	bool needs_mmap = rec->opts.synth & PERF_SYNTH_MMAP;
1760 
1761 	if (rec->opts.tail_synthesize != tail)
1762 		return 0;
1763 
1764 	thread_map = thread_map__new_by_tid(rec->evlist->workload.pid);
1765 	if (thread_map == NULL)
1766 		return -1;
1767 
1768 	err = perf_event__synthesize_thread_map(&rec->tool, thread_map,
1769 						 process_synthesized_event,
1770 						 &rec->session->machines.host,
1771 						 needs_mmap,
1772 						 rec->opts.sample_address);
1773 	perf_thread_map__put(thread_map);
1774 	return err;
1775 }
1776 
1777 static int write_finished_init(struct record *rec, bool tail)
1778 {
1779 	if (rec->opts.tail_synthesize != tail)
1780 		return 0;
1781 
1782 	return record__write(rec, NULL, &finished_init_event, sizeof(finished_init_event));
1783 }
1784 
1785 static int record__synthesize(struct record *rec, bool tail);
1786 
1787 static int
1788 record__switch_output(struct record *rec, bool at_exit)
1789 {
1790 	struct perf_data *data = &rec->data;
1791 	int fd, err;
1792 	char *new_filename;
1793 
1794 	/* Same Size:      "2015122520103046"*/
1795 	char timestamp[] = "InvalidTimestamp";
1796 
1797 	record__aio_mmap_read_sync(rec);
1798 
1799 	write_finished_init(rec, true);
1800 
1801 	record__synthesize(rec, true);
1802 	if (target__none(&rec->opts.target))
1803 		record__synthesize_workload(rec, true);
1804 
1805 	rec->samples = 0;
1806 	record__finish_output(rec);
1807 	err = fetch_current_timestamp(timestamp, sizeof(timestamp));
1808 	if (err) {
1809 		pr_err("Failed to get current timestamp\n");
1810 		return -EINVAL;
1811 	}
1812 
1813 	fd = perf_data__switch(data, timestamp,
1814 				    rec->session->header.data_offset,
1815 				    at_exit, &new_filename);
1816 	if (fd >= 0 && !at_exit) {
1817 		rec->bytes_written = 0;
1818 		rec->session->header.data_size = 0;
1819 	}
1820 
1821 	if (!quiet)
1822 		fprintf(stderr, "[ perf record: Dump %s.%s ]\n",
1823 			data->path, timestamp);
1824 
1825 	if (rec->switch_output.num_files) {
1826 		int n = rec->switch_output.cur_file + 1;
1827 
1828 		if (n >= rec->switch_output.num_files)
1829 			n = 0;
1830 		rec->switch_output.cur_file = n;
1831 		if (rec->switch_output.filenames[n]) {
1832 			remove(rec->switch_output.filenames[n]);
1833 			zfree(&rec->switch_output.filenames[n]);
1834 		}
1835 		rec->switch_output.filenames[n] = new_filename;
1836 	} else {
1837 		free(new_filename);
1838 	}
1839 
1840 	/* Output tracking events */
1841 	if (!at_exit) {
1842 		record__synthesize(rec, false);
1843 
1844 		/*
1845 		 * In 'perf record --switch-output' without -a,
1846 		 * record__synthesize() in record__switch_output() won't
1847 		 * generate tracking events because there's no thread_map
1848 		 * in evlist. Which causes newly created perf.data doesn't
1849 		 * contain map and comm information.
1850 		 * Create a fake thread_map and directly call
1851 		 * perf_event__synthesize_thread_map() for those events.
1852 		 */
1853 		if (target__none(&rec->opts.target))
1854 			record__synthesize_workload(rec, false);
1855 		write_finished_init(rec, false);
1856 	}
1857 	return fd;
1858 }
1859 
1860 static void __record__save_lost_samples(struct record *rec, struct evsel *evsel,
1861 					struct perf_record_lost_samples *lost,
1862 					int cpu_idx, int thread_idx, u64 lost_count,
1863 					u16 misc_flag)
1864 {
1865 	struct perf_sample_id *sid;
1866 	struct perf_sample sample = {};
1867 	int id_hdr_size;
1868 
1869 	lost->lost = lost_count;
1870 	if (evsel->core.ids) {
1871 		sid = xyarray__entry(evsel->core.sample_id, cpu_idx, thread_idx);
1872 		sample.id = sid->id;
1873 	}
1874 
1875 	id_hdr_size = perf_event__synthesize_id_sample((void *)(lost + 1),
1876 						       evsel->core.attr.sample_type, &sample);
1877 	lost->header.size = sizeof(*lost) + id_hdr_size;
1878 	lost->header.misc = misc_flag;
1879 	record__write(rec, NULL, lost, lost->header.size);
1880 }
1881 
1882 static void record__read_lost_samples(struct record *rec)
1883 {
1884 	struct perf_session *session = rec->session;
1885 	struct perf_record_lost_samples *lost;
1886 	struct evsel *evsel;
1887 
1888 	/* there was an error during record__open */
1889 	if (session->evlist == NULL)
1890 		return;
1891 
1892 	lost = zalloc(PERF_SAMPLE_MAX_SIZE);
1893 	if (lost == NULL) {
1894 		pr_debug("Memory allocation failed\n");
1895 		return;
1896 	}
1897 
1898 	lost->header.type = PERF_RECORD_LOST_SAMPLES;
1899 
1900 	evlist__for_each_entry(session->evlist, evsel) {
1901 		struct xyarray *xy = evsel->core.sample_id;
1902 		u64 lost_count;
1903 
1904 		if (xy == NULL || evsel->core.fd == NULL)
1905 			continue;
1906 		if (xyarray__max_x(evsel->core.fd) != xyarray__max_x(xy) ||
1907 		    xyarray__max_y(evsel->core.fd) != xyarray__max_y(xy)) {
1908 			pr_debug("Unmatched FD vs. sample ID: skip reading LOST count\n");
1909 			continue;
1910 		}
1911 
1912 		for (int x = 0; x < xyarray__max_x(xy); x++) {
1913 			for (int y = 0; y < xyarray__max_y(xy); y++) {
1914 				struct perf_counts_values count;
1915 
1916 				if (perf_evsel__read(&evsel->core, x, y, &count) < 0) {
1917 					pr_debug("read LOST count failed\n");
1918 					goto out;
1919 				}
1920 
1921 				if (count.lost) {
1922 					__record__save_lost_samples(rec, evsel, lost,
1923 								    x, y, count.lost, 0);
1924 				}
1925 			}
1926 		}
1927 
1928 		lost_count = perf_bpf_filter__lost_count(evsel);
1929 		if (lost_count)
1930 			__record__save_lost_samples(rec, evsel, lost, 0, 0, lost_count,
1931 						    PERF_RECORD_MISC_LOST_SAMPLES_BPF);
1932 	}
1933 out:
1934 	free(lost);
1935 }
1936 
/*
 * Set by workload_exec_failed_signal() from the integer value carried in
 * the signal's si_value.
 */
static volatile sig_atomic_t workload_exec_errno;
1938 
1939 /*
1940  * evlist__prepare_workload will send a SIGUSR1
1941  * if the fork fails, since we asked by setting its
1942  * want_signal to true.
1943  */
1944 static void workload_exec_failed_signal(int signo __maybe_unused,
1945 					siginfo_t *info,
1946 					void *ucontext __maybe_unused)
1947 {
1948 	workload_exec_errno = info->si_value.sival_int;
1949 	done = 1;
1950 	child_finished = 1;
1951 }
1952 
1953 static void snapshot_sig_handler(int sig);
1954 static void alarm_sig_handler(int sig);
1955 
1956 static const struct perf_event_mmap_page *evlist__pick_pc(struct evlist *evlist)
1957 {
1958 	if (evlist) {
1959 		if (evlist->mmap && evlist->mmap[0].core.base)
1960 			return evlist->mmap[0].core.base;
1961 		if (evlist->overwrite_mmap && evlist->overwrite_mmap[0].core.base)
1962 			return evlist->overwrite_mmap[0].core.base;
1963 	}
1964 	return NULL;
1965 }
1966 
1967 static const struct perf_event_mmap_page *record__pick_pc(struct record *rec)
1968 {
1969 	const struct perf_event_mmap_page *pc = evlist__pick_pc(rec->evlist);
1970 	if (pc)
1971 		return pc;
1972 	return NULL;
1973 }
1974 
1975 static int record__synthesize(struct record *rec, bool tail)
1976 {
1977 	struct perf_session *session = rec->session;
1978 	struct machine *machine = &session->machines.host;
1979 	struct perf_data *data = &rec->data;
1980 	struct record_opts *opts = &rec->opts;
1981 	struct perf_tool *tool = &rec->tool;
1982 	int err = 0;
1983 	event_op f = process_synthesized_event;
1984 
1985 	if (rec->opts.tail_synthesize != tail)
1986 		return 0;
1987 
1988 	if (data->is_pipe) {
1989 		err = perf_event__synthesize_for_pipe(tool, session, data,
1990 						      process_synthesized_event);
1991 		if (err < 0)
1992 			goto out;
1993 
1994 		rec->bytes_written += err;
1995 	}
1996 
1997 	err = perf_event__synth_time_conv(record__pick_pc(rec), tool,
1998 					  process_synthesized_event, machine);
1999 	if (err)
2000 		goto out;
2001 
2002 	/* Synthesize id_index before auxtrace_info */
2003 	err = perf_event__synthesize_id_index(tool,
2004 					      process_synthesized_event,
2005 					      session->evlist, machine);
2006 	if (err)
2007 		goto out;
2008 
2009 	if (rec->opts.full_auxtrace) {
2010 		err = perf_event__synthesize_auxtrace_info(rec->itr, tool,
2011 					session, process_synthesized_event);
2012 		if (err)
2013 			goto out;
2014 	}
2015 
2016 	if (!evlist__exclude_kernel(rec->evlist)) {
2017 		err = perf_event__synthesize_kernel_mmap(tool, process_synthesized_event,
2018 							 machine);
2019 		WARN_ONCE(err < 0, "Couldn't record kernel reference relocation symbol\n"
2020 				   "Symbol resolution may be skewed if relocation was used (e.g. kexec).\n"
2021 				   "Check /proc/kallsyms permission or run as root.\n");
2022 
2023 		err = perf_event__synthesize_modules(tool, process_synthesized_event,
2024 						     machine);
2025 		WARN_ONCE(err < 0, "Couldn't record kernel module information.\n"
2026 				   "Symbol resolution may be skewed if relocation was used (e.g. kexec).\n"
2027 				   "Check /proc/modules permission or run as root.\n");
2028 	}
2029 
2030 	if (perf_guest) {
2031 		machines__process_guests(&session->machines,
2032 					 perf_event__synthesize_guest_os, tool);
2033 	}
2034 
2035 	err = perf_event__synthesize_extra_attr(&rec->tool,
2036 						rec->evlist,
2037 						process_synthesized_event,
2038 						data->is_pipe);
2039 	if (err)
2040 		goto out;
2041 
2042 	err = perf_event__synthesize_thread_map2(&rec->tool, rec->evlist->core.threads,
2043 						 process_synthesized_event,
2044 						NULL);
2045 	if (err < 0) {
2046 		pr_err("Couldn't synthesize thread map.\n");
2047 		return err;
2048 	}
2049 
2050 	err = perf_event__synthesize_cpu_map(&rec->tool, rec->evlist->core.all_cpus,
2051 					     process_synthesized_event, NULL);
2052 	if (err < 0) {
2053 		pr_err("Couldn't synthesize cpu map.\n");
2054 		return err;
2055 	}
2056 
2057 	err = perf_event__synthesize_bpf_events(session, process_synthesized_event,
2058 						machine, opts);
2059 	if (err < 0) {
2060 		pr_warning("Couldn't synthesize bpf events.\n");
2061 		err = 0;
2062 	}
2063 
2064 	if (rec->opts.synth & PERF_SYNTH_CGROUP) {
2065 		err = perf_event__synthesize_cgroups(tool, process_synthesized_event,
2066 						     machine);
2067 		if (err < 0) {
2068 			pr_warning("Couldn't synthesize cgroup events.\n");
2069 			err = 0;
2070 		}
2071 	}
2072 
2073 	if (rec->opts.nr_threads_synthesize > 1) {
2074 		mutex_init(&synth_lock);
2075 		perf_set_multithreaded();
2076 		f = process_locked_synthesized_event;
2077 	}
2078 
2079 	if (rec->opts.synth & PERF_SYNTH_TASK) {
2080 		bool needs_mmap = rec->opts.synth & PERF_SYNTH_MMAP;
2081 
2082 		err = __machine__synthesize_threads(machine, tool, &opts->target,
2083 						    rec->evlist->core.threads,
2084 						    f, needs_mmap, opts->sample_address,
2085 						    rec->opts.nr_threads_synthesize);
2086 	}
2087 
2088 	if (rec->opts.nr_threads_synthesize > 1) {
2089 		perf_set_singlethreaded();
2090 		mutex_destroy(&synth_lock);
2091 	}
2092 
2093 out:
2094 	return err;
2095 }
2096 
2097 static int record__process_signal_event(union perf_event *event __maybe_unused, void *data)
2098 {
2099 	struct record *rec = data;
2100 	pthread_kill(rec->thread_id, SIGUSR2);
2101 	return 0;
2102 }
2103 
2104 static int record__setup_sb_evlist(struct record *rec)
2105 {
2106 	struct record_opts *opts = &rec->opts;
2107 
2108 	if (rec->sb_evlist != NULL) {
2109 		/*
2110 		 * We get here if --switch-output-event populated the
2111 		 * sb_evlist, so associate a callback that will send a SIGUSR2
2112 		 * to the main thread.
2113 		 */
2114 		evlist__set_cb(rec->sb_evlist, record__process_signal_event, rec);
2115 		rec->thread_id = pthread_self();
2116 	}
2117 #ifdef HAVE_LIBBPF_SUPPORT
2118 	if (!opts->no_bpf_event) {
2119 		if (rec->sb_evlist == NULL) {
2120 			rec->sb_evlist = evlist__new();
2121 
2122 			if (rec->sb_evlist == NULL) {
2123 				pr_err("Couldn't create side band evlist.\n.");
2124 				return -1;
2125 			}
2126 		}
2127 
2128 		if (evlist__add_bpf_sb_event(rec->sb_evlist, &rec->session->header.env)) {
2129 			pr_err("Couldn't ask for PERF_RECORD_BPF_EVENT side band events.\n.");
2130 			return -1;
2131 		}
2132 	}
2133 #endif
2134 	if (evlist__start_sb_thread(rec->sb_evlist, &rec->opts.target)) {
2135 		pr_debug("Couldn't start the BPF side band thread:\nBPF programs starting from now on won't be annotatable\n");
2136 		opts->no_bpf_event = true;
2137 	}
2138 
2139 	return 0;
2140 }
2141 
2142 static int record__init_clock(struct record *rec)
2143 {
2144 	struct perf_session *session = rec->session;
2145 	struct timespec ref_clockid;
2146 	struct timeval ref_tod;
2147 	u64 ref;
2148 
2149 	if (!rec->opts.use_clockid)
2150 		return 0;
2151 
2152 	if (rec->opts.use_clockid && rec->opts.clockid_res_ns)
2153 		session->header.env.clock.clockid_res_ns = rec->opts.clockid_res_ns;
2154 
2155 	session->header.env.clock.clockid = rec->opts.clockid;
2156 
2157 	if (gettimeofday(&ref_tod, NULL) != 0) {
2158 		pr_err("gettimeofday failed, cannot set reference time.\n");
2159 		return -1;
2160 	}
2161 
2162 	if (clock_gettime(rec->opts.clockid, &ref_clockid)) {
2163 		pr_err("clock_gettime failed, cannot set reference time.\n");
2164 		return -1;
2165 	}
2166 
2167 	ref = (u64) ref_tod.tv_sec * NSEC_PER_SEC +
2168 	      (u64) ref_tod.tv_usec * NSEC_PER_USEC;
2169 
2170 	session->header.env.clock.tod_ns = ref;
2171 
2172 	ref = (u64) ref_clockid.tv_sec * NSEC_PER_SEC +
2173 	      (u64) ref_clockid.tv_nsec;
2174 
2175 	session->header.env.clock.clockid_ns = ref;
2176 	return 0;
2177 }
2178 
2179 static void hit_auxtrace_snapshot_trigger(struct record *rec)
2180 {
2181 	if (trigger_is_ready(&auxtrace_snapshot_trigger)) {
2182 		trigger_hit(&auxtrace_snapshot_trigger);
2183 		auxtrace_record__snapshot_started = 1;
2184 		if (auxtrace_record__snapshot_start(rec->itr))
2185 			trigger_error(&auxtrace_snapshot_trigger);
2186 	}
2187 }
2188 
2189 static void record__uniquify_name(struct record *rec)
2190 {
2191 	struct evsel *pos;
2192 	struct evlist *evlist = rec->evlist;
2193 	char *new_name;
2194 	int ret;
2195 
2196 	if (perf_pmus__num_core_pmus() == 1)
2197 		return;
2198 
2199 	evlist__for_each_entry(evlist, pos) {
2200 		if (!evsel__is_hybrid(pos))
2201 			continue;
2202 
2203 		if (strchr(pos->name, '/'))
2204 			continue;
2205 
2206 		ret = asprintf(&new_name, "%s/%s/",
2207 			       pos->pmu_name, pos->name);
2208 		if (ret) {
2209 			free(pos->name);
2210 			pos->name = new_name;
2211 		}
2212 	}
2213 }
2214 
2215 static int record__terminate_thread(struct record_thread *thread_data)
2216 {
2217 	int err;
2218 	enum thread_msg ack = THREAD_MSG__UNDEFINED;
2219 	pid_t tid = thread_data->tid;
2220 
2221 	close(thread_data->pipes.msg[1]);
2222 	thread_data->pipes.msg[1] = -1;
2223 	err = read(thread_data->pipes.ack[0], &ack, sizeof(ack));
2224 	if (err > 0)
2225 		pr_debug2("threads[%d]: sent %s\n", tid, thread_msg_tags[ack]);
2226 	else
2227 		pr_warning("threads[%d]: failed to receive termination notification from %d\n",
2228 			   thread->tid, tid);
2229 
2230 	return 0;
2231 }
2232 
2233 static int record__start_threads(struct record *rec)
2234 {
2235 	int t, tt, err, ret = 0, nr_threads = rec->nr_threads;
2236 	struct record_thread *thread_data = rec->thread_data;
2237 	sigset_t full, mask;
2238 	pthread_t handle;
2239 	pthread_attr_t attrs;
2240 
2241 	thread = &thread_data[0];
2242 
2243 	if (!record__threads_enabled(rec))
2244 		return 0;
2245 
2246 	sigfillset(&full);
2247 	if (sigprocmask(SIG_SETMASK, &full, &mask)) {
2248 		pr_err("Failed to block signals on threads start: %s\n", strerror(errno));
2249 		return -1;
2250 	}
2251 
2252 	pthread_attr_init(&attrs);
2253 	pthread_attr_setdetachstate(&attrs, PTHREAD_CREATE_DETACHED);
2254 
2255 	for (t = 1; t < nr_threads; t++) {
2256 		enum thread_msg msg = THREAD_MSG__UNDEFINED;
2257 
2258 #ifdef HAVE_PTHREAD_ATTR_SETAFFINITY_NP
2259 		pthread_attr_setaffinity_np(&attrs,
2260 					    MMAP_CPU_MASK_BYTES(&(thread_data[t].mask->affinity)),
2261 					    (cpu_set_t *)(thread_data[t].mask->affinity.bits));
2262 #endif
2263 		if (pthread_create(&handle, &attrs, record__thread, &thread_data[t])) {
2264 			for (tt = 1; tt < t; tt++)
2265 				record__terminate_thread(&thread_data[t]);
2266 			pr_err("Failed to start threads: %s\n", strerror(errno));
2267 			ret = -1;
2268 			goto out_err;
2269 		}
2270 
2271 		err = read(thread_data[t].pipes.ack[0], &msg, sizeof(msg));
2272 		if (err > 0)
2273 			pr_debug2("threads[%d]: sent %s\n", rec->thread_data[t].tid,
2274 				  thread_msg_tags[msg]);
2275 		else
2276 			pr_warning("threads[%d]: failed to receive start notification from %d\n",
2277 				   thread->tid, rec->thread_data[t].tid);
2278 	}
2279 
2280 	sched_setaffinity(0, MMAP_CPU_MASK_BYTES(&thread->mask->affinity),
2281 			(cpu_set_t *)thread->mask->affinity.bits);
2282 
2283 	pr_debug("threads[%d]: started on cpu%d\n", thread->tid, sched_getcpu());
2284 
2285 out_err:
2286 	pthread_attr_destroy(&attrs);
2287 
2288 	if (sigprocmask(SIG_SETMASK, &mask, NULL)) {
2289 		pr_err("Failed to unblock signals on threads start: %s\n", strerror(errno));
2290 		ret = -1;
2291 	}
2292 
2293 	return ret;
2294 }
2295 
2296 static int record__stop_threads(struct record *rec)
2297 {
2298 	int t;
2299 	struct record_thread *thread_data = rec->thread_data;
2300 
2301 	for (t = 1; t < rec->nr_threads; t++)
2302 		record__terminate_thread(&thread_data[t]);
2303 
2304 	for (t = 0; t < rec->nr_threads; t++) {
2305 		rec->samples += thread_data[t].samples;
2306 		if (!record__threads_enabled(rec))
2307 			continue;
2308 		rec->session->bytes_transferred += thread_data[t].bytes_transferred;
2309 		rec->session->bytes_compressed += thread_data[t].bytes_compressed;
2310 		pr_debug("threads[%d]: samples=%lld, wakes=%ld, ", thread_data[t].tid,
2311 			 thread_data[t].samples, thread_data[t].waking);
2312 		if (thread_data[t].bytes_transferred && thread_data[t].bytes_compressed)
2313 			pr_debug("transferred=%" PRIu64 ", compressed=%" PRIu64 "\n",
2314 				 thread_data[t].bytes_transferred, thread_data[t].bytes_compressed);
2315 		else
2316 			pr_debug("written=%" PRIu64 "\n", thread_data[t].bytes_written);
2317 	}
2318 
2319 	return 0;
2320 }
2321 
2322 static unsigned long record__waking(struct record *rec)
2323 {
2324 	int t;
2325 	unsigned long waking = 0;
2326 	struct record_thread *thread_data = rec->thread_data;
2327 
2328 	for (t = 0; t < rec->nr_threads; t++)
2329 		waking += thread_data[t].waking;
2330 
2331 	return waking;
2332 }
2333 
/*
 * Run one recording session from start to finish.
 *
 * Installs the signal handlers, creates the perf session, opens and mmaps
 * the events, writes the file (or pipe) header, synthesizes the initial
 * events, optionally prepares and starts the workload given in argv, then
 * loops reading the ring buffers until recording is done or draining, and
 * finally finishes the output and tears the session down.
 *
 * @rec:  record state (options, evlist, output data, tool callbacks).
 * @argc: number of workload arguments; a workload is forked when argc > 0.
 * @argv: workload command line.
 *
 * Returns 'status': a negative value on error; for a forked workload that
 * exited normally and without a recording error, the workload's exit status;
 * otherwise the last 'err' value.  A few early failures return directly
 * (-1 or PTR_ERR(session)) instead.
 */
static int __cmd_record(struct record *rec, int argc, const char **argv)
{
	int err;
	int status = 0;
	const bool forks = argc > 0;
	struct perf_tool *tool = &rec->tool;
	struct record_opts *opts = &rec->opts;
	struct perf_data *data = &rec->data;
	struct perf_session *session;
	bool disabled = false, draining = false;
	int fd;
	float ratio = 0;
	enum evlist_ctl_cmd cmd = EVLIST_CTL_CMD_UNSUPPORTED;

	atexit(record__sig_exit);
	signal(SIGCHLD, sig_handler);
	signal(SIGINT, sig_handler);
	signal(SIGTERM, sig_handler);
	signal(SIGSEGV, sigsegv_handler);

	if (rec->opts.record_namespaces)
		tool->namespace_events = true;

	if (rec->opts.record_cgroup) {
#ifdef HAVE_FILE_HANDLE
		tool->cgroup_events = true;
#else
		pr_err("cgroup tracking is not supported\n");
		return -1;
#endif
	}

	/*
	 * SIGUSR2 drives AUX area snapshots and output switching; it is
	 * ignored when neither feature is in use.
	 */
	if (rec->opts.auxtrace_snapshot_mode || rec->switch_output.enabled) {
		signal(SIGUSR2, snapshot_sig_handler);
		if (rec->opts.auxtrace_snapshot_mode)
			trigger_on(&auxtrace_snapshot_trigger);
		if (rec->switch_output.enabled)
			trigger_on(&switch_output_trigger);
	} else {
		signal(SIGUSR2, SIG_IGN);
	}

	session = perf_session__new(data, tool);
	if (IS_ERR(session)) {
		pr_err("Perf session creation failed.\n");
		return PTR_ERR(session);
	}

	/*
	 * NOTE(review): the 'return -1' statements from here up to
	 * record__init_features() leave without passing through
	 * out_delete_session, so the session created above is not deleted on
	 * those paths - confirm whether that is intended.
	 */
	if (record__threads_enabled(rec)) {
		if (perf_data__is_pipe(&rec->data)) {
			pr_err("Parallel trace streaming is not available in pipe mode.\n");
			return -1;
		}
		if (rec->opts.full_auxtrace) {
			pr_err("Parallel trace streaming is not available in AUX area tracing mode.\n");
			return -1;
		}
	}

	fd = perf_data__fd(data);
	rec->session = session;

	if (zstd_init(&session->zstd_data, rec->opts.comp_level) < 0) {
		pr_err("Compression initialization failed.\n");
		return -1;
	}
#ifdef HAVE_EVENTFD_SUPPORT
	/* done_fd is added to the evlist poll list as a wakeup eventfd. */
	done_fd = eventfd(0, EFD_NONBLOCK);
	if (done_fd < 0) {
		pr_err("Failed to create wakeup eventfd, error: %m\n");
		status = -1;
		goto out_delete_session;
	}
	err = evlist__add_wakeup_eventfd(rec->evlist, done_fd);
	if (err < 0) {
		pr_err("Failed to add wakeup eventfd to poll list\n");
		status = err;
		goto out_delete_session;
	}
#endif // HAVE_EVENTFD_SUPPORT

	session->header.env.comp_type  = PERF_COMP_ZSTD;
	session->header.env.comp_level = rec->opts.comp_level;

	if (rec->opts.kcore &&
	    !record__kcore_readable(&session->machines.host)) {
		pr_err("ERROR: kcore is not readable.\n");
		return -1;
	}

	if (record__init_clock(rec))
		return -1;

	record__init_features(rec);

	if (forks) {
		err = evlist__prepare_workload(rec->evlist, &opts->target, argv, data->is_pipe,
					       workload_exec_failed_signal);
		if (err < 0) {
			pr_err("Couldn't run the workload!\n");
			status = err;
			goto out_delete_session;
		}
	}

	/*
	 * If we have just single event and are sending data
	 * through pipe, we need to force the ids allocation,
	 * because we synthesize event name through the pipe
	 * and need the id for that.
	 */
	if (data->is_pipe && rec->evlist->core.nr_entries == 1)
		rec->opts.sample_id = true;

	record__uniquify_name(rec);

	/* Debug message used by test scripts */
	pr_debug3("perf record opening and mmapping events\n");
	if (record__open(rec) != 0) {
		err = -1;
		goto out_free_threads;
	}
	/* Debug message used by test scripts */
	pr_debug3("perf record done opening and mmapping events\n");
	session->header.env.comp_mmap_len = session->evlist->core.mmap_len;

	if (rec->opts.kcore) {
		err = record__kcore_copy(&session->machines.host, data);
		if (err) {
			pr_err("ERROR: Failed to copy kcore\n");
			goto out_free_threads;
		}
	}

	err = bpf__apply_obj_config();
	if (err) {
		char errbuf[BUFSIZ];

		bpf__strerror_apply_obj_config(err, errbuf, sizeof(errbuf));
		pr_err("ERROR: Apply config to BPF failed: %s\n",
			 errbuf);
		goto out_free_threads;
	}

	/*
	 * Normally perf_session__new would do this, but it doesn't have the
	 * evlist.
	 */
	if (rec->tool.ordered_events && !evlist__sample_id_all(rec->evlist)) {
		pr_warning("WARNING: No sample_id_all support, falling back to unordered processing\n");
		rec->tool.ordered_events = false;
	}

	if (evlist__nr_groups(rec->evlist) == 0)
		perf_header__clear_feat(&session->header, HEADER_GROUP_DESC);

	/* Pipe output gets a pipe header; file output gets the full session header. */
	if (data->is_pipe) {
		err = perf_header__write_pipe(fd);
		if (err < 0)
			goto out_free_threads;
	} else {
		err = perf_session__write_header(session, rec->evlist, fd, false);
		if (err < 0)
			goto out_free_threads;
	}

	/* Preset the error code for the build-id check below. */
	err = -1;
	if (!rec->no_buildid
	    && !perf_header__has_feat(&session->header, HEADER_BUILD_ID)) {
		pr_err("Couldn't generate buildids. "
		       "Use --no-buildid to profile anyway.\n");
		goto out_free_threads;
	}

	err = record__setup_sb_evlist(rec);
	if (err)
		goto out_free_threads;

	err = record__synthesize(rec, false);
	if (err < 0)
		goto out_free_threads;

	if (rec->realtime_prio) {
		struct sched_param param;

		param.sched_priority = rec->realtime_prio;
		if (sched_setscheduler(0, SCHED_FIFO, &param)) {
			pr_err("Could not set realtime priority.\n");
			err = -1;
			goto out_free_threads;
		}
	}

	/*
	 * NOTE(review): err is not assigned on this failure path, so it still
	 * holds the non-negative record__synthesize() result when control
	 * reaches out_free_threads - confirm that this is intended.
	 */
	if (record__start_threads(rec))
		goto out_free_threads;

	/*
	 * When perf is starting the traced process, all the events
	 * (apart from group members) have enable_on_exec=1 set,
	 * so don't spoil it by prematurely enabling them.
	 */
	if (!target__none(&opts->target) && !opts->target.initial_delay)
		evlist__enable(rec->evlist);

	/*
	 * Let the child rip
	 */
	if (forks) {
		struct machine *machine = &session->machines.host;
		union perf_event *event;
		pid_t tgid;

		event = malloc(sizeof(event->comm) + machine->id_hdr_size);
		if (event == NULL) {
			err = -ENOMEM;
			goto out_child;
		}

		/*
		 * Some H/W events are generated before COMM event
		 * which is emitted during exec(), so perf script
		 * cannot see a correct process name for those events.
		 * Synthesize COMM event to prevent it.
		 */
		tgid = perf_event__synthesize_comm(tool, event,
						   rec->evlist->workload.pid,
						   process_synthesized_event,
						   machine);
		free(event);

		if (tgid == -1)
			goto out_child;

		event = malloc(sizeof(event->namespaces) +
			       (NR_NAMESPACES * sizeof(struct perf_ns_link_info)) +
			       machine->id_hdr_size);
		if (event == NULL) {
			err = -ENOMEM;
			goto out_child;
		}

		/*
		 * Synthesize NAMESPACES event for the command specified.
		 */
		perf_event__synthesize_namespaces(tool, event,
						  rec->evlist->workload.pid,
						  tgid, process_synthesized_event,
						  machine);
		free(event);

		evlist__start_workload(rec->evlist);
	}

	/*
	 * With an initial delay the events start out disabled; a positive
	 * delay (in milliseconds) is slept here before enabling them.
	 */
	if (opts->target.initial_delay) {
		pr_info(EVLIST_DISABLED_MSG);
		if (opts->target.initial_delay > 0) {
			usleep(opts->target.initial_delay * USEC_PER_MSEC);
			evlist__enable(rec->evlist);
			pr_info(EVLIST_ENABLED_MSG);
		}
	}

	err = event_enable_timer__start(rec->evlist->eet);
	if (err)
		goto out_child;

	/* Debug message used by test scripts */
	pr_debug3("perf record has started\n");
	fflush(stderr);

	trigger_ready(&auxtrace_snapshot_trigger);
	trigger_ready(&switch_output_trigger);
	perf_hooks__invoke_record_start();

	/*
	 * Must write FINISHED_INIT so it will be seen after all other
	 * synthesized user events, but before any regular events.
	 */
	err = write_finished_init(rec, false);
	if (err < 0)
		goto out_child;

	/*
	 * Main loop: read the ring buffers, handle snapshot and
	 * switch-output triggers, poll when no new samples arrived, and
	 * process control-fd commands and the event enable timer.
	 */
	for (;;) {
		/* Sample count before this round; compared below to detect an idle round. */
		unsigned long long hits = thread->samples;

		/*
		 * rec->evlist->bkw_mmap_state is possible to be
		 * BKW_MMAP_EMPTY here: when done == true and
		 * hits != rec->samples in previous round.
		 *
		 * evlist__toggle_bkw_mmap ensure we never
		 * convert BKW_MMAP_EMPTY to BKW_MMAP_DATA_PENDING.
		 */
		if (trigger_is_hit(&switch_output_trigger) || done || draining)
			evlist__toggle_bkw_mmap(rec->evlist, BKW_MMAP_DATA_PENDING);

		if (record__mmap_read_all(rec, false) < 0) {
			trigger_error(&auxtrace_snapshot_trigger);
			trigger_error(&switch_output_trigger);
			err = -1;
			goto out_child;
		}

		if (auxtrace_record__snapshot_started) {
			auxtrace_record__snapshot_started = 0;
			if (!trigger_is_error(&auxtrace_snapshot_trigger))
				record__read_auxtrace_snapshot(rec, false);
			if (trigger_is_error(&auxtrace_snapshot_trigger)) {
				pr_err("AUX area tracing snapshot failed\n");
				err = -1;
				goto out_child;
			}
		}

		if (trigger_is_hit(&switch_output_trigger)) {
			/*
			 * If switch_output_trigger is hit, the data in
			 * overwritable ring buffer should have been collected,
			 * so bkw_mmap_state should be set to BKW_MMAP_EMPTY.
			 *
			 * If SIGUSR2 raise after or during record__mmap_read_all(),
			 * record__mmap_read_all() didn't collect data from
			 * overwritable ring buffer. Read again.
			 */
			if (rec->evlist->bkw_mmap_state == BKW_MMAP_RUNNING)
				continue;
			trigger_ready(&switch_output_trigger);

			/*
			 * Reenable events in overwrite ring buffer after
			 * record__mmap_read_all(): we should have collected
			 * data from it.
			 */
			evlist__toggle_bkw_mmap(rec->evlist, BKW_MMAP_RUNNING);

			if (!quiet)
				fprintf(stderr, "[ perf record: dump data: Woken up %ld times ]\n",
					record__waking(rec));
			thread->waking = 0;
			fd = record__switch_output(rec, false);
			if (fd < 0) {
				pr_err("Failed to switch to new file\n");
				trigger_error(&switch_output_trigger);
				err = fd;
				goto out_child;
			}

			/* re-arm the alarm */
			if (rec->switch_output.time)
				alarm(rec->switch_output.time);
		}

		/* No new samples this round: either finish or block in poll. */
		if (hits == thread->samples) {
			if (done || draining)
				break;
			err = fdarray__poll(&thread->pollfd, -1);
			/*
			 * Propagate error, only if there's any. Ignore positive
			 * number of returned events and interrupt error.
			 */
			if (err > 0 || (err < 0 && errno == EINTR))
				err = 0;
			thread->waking++;

			/* Once the filter reports no descriptors left, start draining. */
			if (fdarray__filter(&thread->pollfd, POLLERR | POLLHUP,
					    record__thread_munmap_filtered, NULL) == 0)
				draining = true;

			err = record__update_evlist_pollfd_from_thread(rec, rec->evlist, thread);
			if (err)
				goto out_child;
		}

		/* Only SNAPSHOT and STOP are acted upon here; other commands fall through. */
		if (evlist__ctlfd_process(rec->evlist, &cmd) > 0) {
			switch (cmd) {
			case EVLIST_CTL_CMD_SNAPSHOT:
				hit_auxtrace_snapshot_trigger(rec);
				evlist__ctlfd_ack(rec->evlist);
				break;
			case EVLIST_CTL_CMD_STOP:
				done = 1;
				break;
			case EVLIST_CTL_CMD_ACK:
			case EVLIST_CTL_CMD_UNSUPPORTED:
			case EVLIST_CTL_CMD_ENABLE:
			case EVLIST_CTL_CMD_DISABLE:
			case EVLIST_CTL_CMD_EVLIST:
			case EVLIST_CTL_CMD_PING:
			default:
				break;
			}
		}

		/* A positive return from the enable timer ends the recording. */
		err = event_enable_timer__process(rec->evlist->eet);
		if (err < 0)
			goto out_child;
		if (err) {
			err = 0;
			done = 1;
		}

		/*
		 * When perf is starting the traced process, at the end events
		 * die with the process and we wait for that. Thus no need to
		 * disable events in this case.
		 */
		if (done && !disabled && !target__none(&opts->target)) {
			trigger_off(&auxtrace_snapshot_trigger);
			evlist__disable(rec->evlist);
			disabled = true;
		}
	}

	trigger_off(&auxtrace_snapshot_trigger);
	trigger_off(&switch_output_trigger);

	if (opts->auxtrace_snapshot_on_exit)
		record__auxtrace_snapshot_exit(rec);

	if (forks && workload_exec_errno) {
		char msg[STRERR_BUFSIZE], strevsels[2048];
		const char *emsg = str_error_r(workload_exec_errno, msg, sizeof(msg));

		evlist__scnprintf_evsels(rec->evlist, sizeof(strevsels), strevsels);

		pr_err("Failed to collect '%s' for the '%s' workload: %s\n",
			strevsels, argv[0], emsg);
		err = -1;
		goto out_child;
	}

	if (!quiet)
		fprintf(stderr, "[ perf record: Woken up %ld times to write data ]\n",
			record__waking(rec));

	write_finished_init(rec, true);

	if (target__none(&rec->opts.target))
		record__synthesize_workload(rec, true);

	/*
	 * Cleanup labels, from the latest stage to the earliest: each one
	 * falls through into the next.
	 */
out_child:
	record__stop_threads(rec);
	record__mmap_read_all(rec, true);
out_free_threads:
	record__free_thread_data(rec);
	evlist__finalize_ctlfd(rec->evlist);
	record__aio_mmap_read_sync(rec);

	/* Compression ratio = bytes transferred / bytes compressed, when both are non-zero. */
	if (rec->session->bytes_transferred && rec->session->bytes_compressed) {
		ratio = (float)rec->session->bytes_transferred/(float)rec->session->bytes_compressed;
		session->header.env.comp_ratio = ratio + 0.5;
	}

	if (forks) {
		int exit_status;

		if (!child_finished)
			kill(rec->evlist->workload.pid, SIGTERM);

		wait(&exit_status);

		/* A recording error takes precedence over the workload's own exit status. */
		if (err < 0)
			status = err;
		else if (WIFEXITED(exit_status))
			status = WEXITSTATUS(exit_status);
		else if (WIFSIGNALED(exit_status))
			signr = WTERMSIG(exit_status);
	} else
		status = err;

	if (rec->off_cpu)
		rec->bytes_written += off_cpu_write(rec->session);

	record__read_lost_samples(rec);
	record__synthesize(rec, true);
	/* this will be recalculated during process_buildids() */
	rec->samples = 0;

	if (!err) {
		if (!rec->timestamp_filename) {
			record__finish_output(rec);
		} else {
			fd = record__switch_output(rec, true);
			if (fd < 0) {
				status = fd;
				goto out_delete_session;
			}
		}
	}

	perf_hooks__invoke_record_end();

	/* Final one-line summary on stderr, skipped on error or in quiet mode. */
	if (!err && !quiet) {
		char samples[128];
		const char *postfix = rec->timestamp_filename ?
					".<timestamp>" : "";

		if (rec->samples && !rec->opts.full_auxtrace)
			scnprintf(samples, sizeof(samples),
				  " (%" PRIu64 " samples)", rec->samples);
		else
			samples[0] = '\0';

		fprintf(stderr,	"[ perf record: Captured and wrote %.3f MB %s%s%s",
			perf_data__size(data) / 1024.0 / 1024.0,
			data->path, postfix, samples);
		if (ratio) {
			fprintf(stderr,	", compressed (original %.3f MB, ratio is %.3f)",
					rec->session->bytes_transferred / 1024.0 / 1024.0,
					ratio);
		}
		fprintf(stderr, " ]\n");
	}

out_delete_session:
#ifdef HAVE_EVENTFD_SUPPORT
	/* done_fd is reset to -1 before the descriptor itself is closed. */
	if (done_fd >= 0) {
		fd = done_fd;
		done_fd = -1;

		close(fd);
	}
#endif
	zstd_fini(&session->zstd_data);
	perf_session__delete(session);

	if (!opts->no_bpf_event)
		evlist__stop_sb_thread(rec->sb_evlist);
	return status;
}
2864 
2865 static void callchain_debug(struct callchain_param *callchain)
2866 {
2867 	static const char *str[CALLCHAIN_MAX] = { "NONE", "FP", "DWARF", "LBR" };
2868 
2869 	pr_debug("callchain: type %s\n", str[callchain->record_mode]);
2870 
2871 	if (callchain->record_mode == CALLCHAIN_DWARF)
2872 		pr_debug("callchain: stack dump size %d\n",
2873 			 callchain->dump_size);
2874 }
2875 
2876 int record_opts__parse_callchain(struct record_opts *record,
2877 				 struct callchain_param *callchain,
2878 				 const char *arg, bool unset)
2879 {
2880 	int ret;
2881 	callchain->enabled = !unset;
2882 
2883 	/* --no-call-graph */
2884 	if (unset) {
2885 		callchain->record_mode = CALLCHAIN_NONE;
2886 		pr_debug("callchain: disabled\n");
2887 		return 0;
2888 	}
2889 
2890 	ret = parse_callchain_record_opt(arg, callchain);
2891 	if (!ret) {
2892 		/* Enable data address sampling for DWARF unwind. */
2893 		if (callchain->record_mode == CALLCHAIN_DWARF)
2894 			record->sample_address = true;
2895 		callchain_debug(callchain);
2896 	}
2897 
2898 	return ret;
2899 }
2900 
2901 int record_parse_callchain_opt(const struct option *opt,
2902 			       const char *arg,
2903 			       int unset)
2904 {
2905 	return record_opts__parse_callchain(opt->value, &callchain_param, arg, unset);
2906 }
2907 
2908 int record_callchain_opt(const struct option *opt,
2909 			 const char *arg __maybe_unused,
2910 			 int unset __maybe_unused)
2911 {
2912 	struct callchain_param *callchain = opt->value;
2913 
2914 	callchain->enabled = true;
2915 
2916 	if (callchain->record_mode == CALLCHAIN_NONE)
2917 		callchain->record_mode = CALLCHAIN_FP;
2918 
2919 	callchain_debug(callchain);
2920 	return 0;
2921 }
2922 
2923 static int perf_record_config(const char *var, const char *value, void *cb)
2924 {
2925 	struct record *rec = cb;
2926 
2927 	if (!strcmp(var, "record.build-id")) {
2928 		if (!strcmp(value, "cache"))
2929 			rec->no_buildid_cache = false;
2930 		else if (!strcmp(value, "no-cache"))
2931 			rec->no_buildid_cache = true;
2932 		else if (!strcmp(value, "skip"))
2933 			rec->no_buildid = true;
2934 		else if (!strcmp(value, "mmap"))
2935 			rec->buildid_mmap = true;
2936 		else
2937 			return -1;
2938 		return 0;
2939 	}
2940 	if (!strcmp(var, "record.call-graph")) {
2941 		var = "call-graph.record-mode";
2942 		return perf_default_config(var, value, cb);
2943 	}
2944 #ifdef HAVE_AIO_SUPPORT
2945 	if (!strcmp(var, "record.aio")) {
2946 		rec->opts.nr_cblocks = strtol(value, NULL, 0);
2947 		if (!rec->opts.nr_cblocks)
2948 			rec->opts.nr_cblocks = nr_cblocks_default;
2949 	}
2950 #endif
2951 	if (!strcmp(var, "record.debuginfod")) {
2952 		rec->debuginfod.urls = strdup(value);
2953 		if (!rec->debuginfod.urls)
2954 			return -ENOMEM;
2955 		rec->debuginfod.set = true;
2956 	}
2957 
2958 	return 0;
2959 }
2960 
2961 static int record__parse_event_enable_time(const struct option *opt, const char *str, int unset)
2962 {
2963 	struct record *rec = (struct record *)opt->value;
2964 
2965 	return evlist__parse_event_enable_time(rec->evlist, &rec->opts, str, unset);
2966 }
2967 
2968 static int record__parse_affinity(const struct option *opt, const char *str, int unset)
2969 {
2970 	struct record_opts *opts = (struct record_opts *)opt->value;
2971 
2972 	if (unset || !str)
2973 		return 0;
2974 
2975 	if (!strcasecmp(str, "node"))
2976 		opts->affinity = PERF_AFFINITY_NODE;
2977 	else if (!strcasecmp(str, "cpu"))
2978 		opts->affinity = PERF_AFFINITY_CPU;
2979 
2980 	return 0;
2981 }
2982 
2983 static int record__mmap_cpu_mask_alloc(struct mmap_cpu_mask *mask, int nr_bits)
2984 {
2985 	mask->nbits = nr_bits;
2986 	mask->bits = bitmap_zalloc(mask->nbits);
2987 	if (!mask->bits)
2988 		return -ENOMEM;
2989 
2990 	return 0;
2991 }
2992 
2993 static void record__mmap_cpu_mask_free(struct mmap_cpu_mask *mask)
2994 {
2995 	bitmap_free(mask->bits);
2996 	mask->nbits = 0;
2997 }
2998 
2999 static int record__thread_mask_alloc(struct thread_mask *mask, int nr_bits)
3000 {
3001 	int ret;
3002 
3003 	ret = record__mmap_cpu_mask_alloc(&mask->maps, nr_bits);
3004 	if (ret) {
3005 		mask->affinity.bits = NULL;
3006 		return ret;
3007 	}
3008 
3009 	ret = record__mmap_cpu_mask_alloc(&mask->affinity, nr_bits);
3010 	if (ret) {
3011 		record__mmap_cpu_mask_free(&mask->maps);
3012 		mask->maps.bits = NULL;
3013 	}
3014 
3015 	return ret;
3016 }
3017 
3018 static void record__thread_mask_free(struct thread_mask *mask)
3019 {
3020 	record__mmap_cpu_mask_free(&mask->maps);
3021 	record__mmap_cpu_mask_free(&mask->affinity);
3022 }
3023 
3024 static int record__parse_threads(const struct option *opt, const char *str, int unset)
3025 {
3026 	int s;
3027 	struct record_opts *opts = opt->value;
3028 
3029 	if (unset || !str || !strlen(str)) {
3030 		opts->threads_spec = THREAD_SPEC__CPU;
3031 	} else {
3032 		for (s = 1; s < THREAD_SPEC__MAX; s++) {
3033 			if (s == THREAD_SPEC__USER) {
3034 				opts->threads_user_spec = strdup(str);
3035 				if (!opts->threads_user_spec)
3036 					return -ENOMEM;
3037 				opts->threads_spec = THREAD_SPEC__USER;
3038 				break;
3039 			}
3040 			if (!strncasecmp(str, thread_spec_tags[s], strlen(thread_spec_tags[s]))) {
3041 				opts->threads_spec = s;
3042 				break;
3043 			}
3044 		}
3045 	}
3046 
3047 	if (opts->threads_spec == THREAD_SPEC__USER)
3048 		pr_debug("threads_spec: %s\n", opts->threads_user_spec);
3049 	else
3050 		pr_debug("threads_spec: %s\n", thread_spec_tags[opts->threads_spec]);
3051 
3052 	return 0;
3053 }
3054 
3055 static int parse_output_max_size(const struct option *opt,
3056 				 const char *str, int unset)
3057 {
3058 	unsigned long *s = (unsigned long *)opt->value;
3059 	static struct parse_tag tags_size[] = {
3060 		{ .tag  = 'B', .mult = 1       },
3061 		{ .tag  = 'K', .mult = 1 << 10 },
3062 		{ .tag  = 'M', .mult = 1 << 20 },
3063 		{ .tag  = 'G', .mult = 1 << 30 },
3064 		{ .tag  = 0 },
3065 	};
3066 	unsigned long val;
3067 
3068 	if (unset) {
3069 		*s = 0;
3070 		return 0;
3071 	}
3072 
3073 	val = parse_tag_value(str, tags_size);
3074 	if (val != (unsigned long) -1) {
3075 		*s = val;
3076 		return 0;
3077 	}
3078 
3079 	return -1;
3080 }
3081 
3082 static int record__parse_mmap_pages(const struct option *opt,
3083 				    const char *str,
3084 				    int unset __maybe_unused)
3085 {
3086 	struct record_opts *opts = opt->value;
3087 	char *s, *p;
3088 	unsigned int mmap_pages;
3089 	int ret;
3090 
3091 	if (!str)
3092 		return -EINVAL;
3093 
3094 	s = strdup(str);
3095 	if (!s)
3096 		return -ENOMEM;
3097 
3098 	p = strchr(s, ',');
3099 	if (p)
3100 		*p = '\0';
3101 
3102 	if (*s) {
3103 		ret = __evlist__parse_mmap_pages(&mmap_pages, s);
3104 		if (ret)
3105 			goto out_free;
3106 		opts->mmap_pages = mmap_pages;
3107 	}
3108 
3109 	if (!p) {
3110 		ret = 0;
3111 		goto out_free;
3112 	}
3113 
3114 	ret = __evlist__parse_mmap_pages(&mmap_pages, p + 1);
3115 	if (ret)
3116 		goto out_free;
3117 
3118 	opts->auxtrace_mmap_pages = mmap_pages;
3119 
3120 out_free:
3121 	free(s);
3122 	return ret;
3123 }
3124 
3125 void __weak arch__add_leaf_frame_record_opts(struct record_opts *opts __maybe_unused)
3126 {
3127 }
3128 
3129 static int parse_control_option(const struct option *opt,
3130 				const char *str,
3131 				int unset __maybe_unused)
3132 {
3133 	struct record_opts *opts = opt->value;
3134 
3135 	return evlist__parse_control(str, &opts->ctl_fd, &opts->ctl_fd_ack, &opts->ctl_fd_close);
3136 }
3137 
3138 static void switch_output_size_warn(struct record *rec)
3139 {
3140 	u64 wakeup_size = evlist__mmap_size(rec->opts.mmap_pages);
3141 	struct switch_output *s = &rec->switch_output;
3142 
3143 	wakeup_size /= 2;
3144 
3145 	if (s->size < wakeup_size) {
3146 		char buf[100];
3147 
3148 		unit_number__scnprintf(buf, sizeof(buf), wakeup_size);
3149 		pr_warning("WARNING: switch-output data size lower than "
3150 			   "wakeup kernel buffer size (%s) "
3151 			   "expect bigger perf.data sizes\n", buf);
3152 	}
3153 }
3154 
3155 static int switch_output_setup(struct record *rec)
3156 {
3157 	struct switch_output *s = &rec->switch_output;
3158 	static struct parse_tag tags_size[] = {
3159 		{ .tag  = 'B', .mult = 1       },
3160 		{ .tag  = 'K', .mult = 1 << 10 },
3161 		{ .tag  = 'M', .mult = 1 << 20 },
3162 		{ .tag  = 'G', .mult = 1 << 30 },
3163 		{ .tag  = 0 },
3164 	};
3165 	static struct parse_tag tags_time[] = {
3166 		{ .tag  = 's', .mult = 1        },
3167 		{ .tag  = 'm', .mult = 60       },
3168 		{ .tag  = 'h', .mult = 60*60    },
3169 		{ .tag  = 'd', .mult = 60*60*24 },
3170 		{ .tag  = 0 },
3171 	};
3172 	unsigned long val;
3173 
3174 	/*
3175 	 * If we're using --switch-output-events, then we imply its
3176 	 * --switch-output=signal, as we'll send a SIGUSR2 from the side band
3177 	 *  thread to its parent.
3178 	 */
3179 	if (rec->switch_output_event_set) {
3180 		if (record__threads_enabled(rec)) {
3181 			pr_warning("WARNING: --switch-output-event option is not available in parallel streaming mode.\n");
3182 			return 0;
3183 		}
3184 		goto do_signal;
3185 	}
3186 
3187 	if (!s->set)
3188 		return 0;
3189 
3190 	if (record__threads_enabled(rec)) {
3191 		pr_warning("WARNING: --switch-output option is not available in parallel streaming mode.\n");
3192 		return 0;
3193 	}
3194 
3195 	if (!strcmp(s->str, "signal")) {
3196 do_signal:
3197 		s->signal = true;
3198 		pr_debug("switch-output with SIGUSR2 signal\n");
3199 		goto enabled;
3200 	}
3201 
3202 	val = parse_tag_value(s->str, tags_size);
3203 	if (val != (unsigned long) -1) {
3204 		s->size = val;
3205 		pr_debug("switch-output with %s size threshold\n", s->str);
3206 		goto enabled;
3207 	}
3208 
3209 	val = parse_tag_value(s->str, tags_time);
3210 	if (val != (unsigned long) -1) {
3211 		s->time = val;
3212 		pr_debug("switch-output with %s time threshold (%lu seconds)\n",
3213 			 s->str, s->time);
3214 		goto enabled;
3215 	}
3216 
3217 	return -1;
3218 
3219 enabled:
3220 	rec->timestamp_filename = true;
3221 	s->enabled              = true;
3222 
3223 	if (s->size && !rec->opts.no_buffering)
3224 		switch_output_size_warn(rec);
3225 
3226 	return 0;
3227 }
3228 
/*
 * Usage synopsis lines for 'perf record'.  The array is NULL-terminated and
 * is handed to parse_options() and the usage helpers through record_usage.
 */
static const char * const __record_usage[] = {
	"perf record [<options>] [<command>]",
	"perf record [<options>] -- <command> [<options>]",
	NULL
};
/* Non-static alias of the usage table above. */
const char * const *record_usage = __record_usage;
3235 
3236 static int build_id__process_mmap(struct perf_tool *tool, union perf_event *event,
3237 				  struct perf_sample *sample, struct machine *machine)
3238 {
3239 	/*
3240 	 * We already have the kernel maps, put in place via perf_session__create_kernel_maps()
3241 	 * no need to add them twice.
3242 	 */
3243 	if (!(event->header.misc & PERF_RECORD_MISC_USER))
3244 		return 0;
3245 	return perf_event__process_mmap(tool, event, sample, machine);
3246 }
3247 
3248 static int build_id__process_mmap2(struct perf_tool *tool, union perf_event *event,
3249 				   struct perf_sample *sample, struct machine *machine)
3250 {
3251 	/*
3252 	 * We already have the kernel maps, put in place via perf_session__create_kernel_maps()
3253 	 * no need to add them twice.
3254 	 */
3255 	if (!(event->header.misc & PERF_RECORD_MISC_USER))
3256 		return 0;
3257 
3258 	return perf_event__process_mmap2(tool, event, sample, machine);
3259 }
3260 
3261 static int process_timestamp_boundary(struct perf_tool *tool,
3262 				      union perf_event *event __maybe_unused,
3263 				      struct perf_sample *sample,
3264 				      struct machine *machine __maybe_unused)
3265 {
3266 	struct record *rec = container_of(tool, struct record, tool);
3267 
3268 	set_timestamp_boundary(rec, sample->time);
3269 	return 0;
3270 }
3271 
3272 static int parse_record_synth_option(const struct option *opt,
3273 				     const char *str,
3274 				     int unset __maybe_unused)
3275 {
3276 	struct record_opts *opts = opt->value;
3277 	char *p = strdup(str);
3278 
3279 	if (p == NULL)
3280 		return -1;
3281 
3282 	opts->synth = parse_synth_opt(p);
3283 	free(p);
3284 
3285 	if (opts->synth < 0) {
3286 		pr_err("Invalid synth option: %s\n", str);
3287 		return -1;
3288 	}
3289 	return 0;
3290 }
3291 
3292 /*
3293  * XXX Ideally would be local to cmd_record() and passed to a record__new
3294  * because we need to have access to it in record__exit, that is called
3295  * after cmd_record() exits, but since record_options need to be accessible to
3296  * builtin-script, leave it here.
3297  *
 * At least we don't touch it in all the other functions here directly.
3299  *
3300  * Just say no to tons of global variables, sigh.
3301  */
static struct record record = {
	/*
	 * Built-in defaults.  cmd_record() applies perf_config() and then
	 * parse_options() on top of these, so most fields can be overridden
	 * from the config file or the command line.
	 */
	.opts = {
		.sample_time	     = true,
		/*
		 * NOTE(review): UINT_MAX / ULLONG_MAX presumably act as
		 * "not set by the user" sentinels - confirm against the
		 * code consuming these fields.
		 */
		.mmap_pages	     = UINT_MAX,
		.user_freq	     = UINT_MAX,
		.user_interval	     = ULLONG_MAX,
		.freq		     = 4000,
		.target		     = {
			.uses_mmap   = true,
			.default_per_cpu = true,
		},
		.mmap_flush          = MMAP_FLUSH_DEFAULT,
		.nr_threads_synthesize = 1,
		.ctl_fd              = -1,
		.ctl_fd_ack          = -1,
		.synth               = PERF_SYNTH_ALL,
	},
	/*
	 * Event callbacks of the embedded perf_tool.  mmap/mmap2 go through
	 * the build_id__ wrappers, which drop events not flagged as user;
	 * itrace_start and aux share the timestamp boundary handler.
	 */
	.tool = {
		.sample		= process_sample_event,
		.fork		= perf_event__process_fork,
		.exit		= perf_event__process_exit,
		.comm		= perf_event__process_comm,
		.namespaces	= perf_event__process_namespaces,
		.mmap		= build_id__process_mmap,
		.mmap2		= build_id__process_mmap2,
		.itrace_start	= process_timestamp_boundary,
		.aux		= process_timestamp_boundary,
		.ordered_events	= true,
	},
};
3332 
/* Help text of the --call-graph option: the common text plus the default. */
const char record_callchain_help[] = CALLCHAIN_RECORD_HELP
	"\n\t\t\t\tDefault: fp";

/* Set by the --dry-run option. */
static bool dry_run;

/* Argument of -e/--event: parsed events are added to record.evlist. */
static struct parse_events_option_args parse_events_option_args = {
	.evlistp = &record.evlist,
};

/* Argument of --switch-output-event: parsed events go to record.sb_evlist. */
static struct parse_events_option_args switch_output_parse_events_option_args = {
	.evlistp = &record.sb_evlist,
};
3345 
3346 /*
3347  * XXX Will stay a global variable till we fix builtin-script.c to stop messing
3348  * with it and switch to use the library functions in perf_evlist that came
3349  * from builtin-record.c, i.e. use record_opts,
3350  * evlist__prepare_workload, etc instead of fork+exec'in 'perf record',
3351  * using pipes, etc.
3352  */
3353 static struct option __record_options[] = {
3354 	OPT_CALLBACK('e', "event", &parse_events_option_args, "event",
3355 		     "event selector. use 'perf list' to list available events",
3356 		     parse_events_option),
3357 	OPT_CALLBACK(0, "filter", &record.evlist, "filter",
3358 		     "event filter", parse_filter),
3359 	OPT_CALLBACK_NOOPT(0, "exclude-perf", &record.evlist,
3360 			   NULL, "don't record events from perf itself",
3361 			   exclude_perf),
3362 	OPT_STRING('p', "pid", &record.opts.target.pid, "pid",
3363 		    "record events on existing process id"),
3364 	OPT_STRING('t', "tid", &record.opts.target.tid, "tid",
3365 		    "record events on existing thread id"),
3366 	OPT_INTEGER('r', "realtime", &record.realtime_prio,
3367 		    "collect data with this RT SCHED_FIFO priority"),
3368 	OPT_BOOLEAN(0, "no-buffering", &record.opts.no_buffering,
3369 		    "collect data without buffering"),
3370 	OPT_BOOLEAN('R', "raw-samples", &record.opts.raw_samples,
3371 		    "collect raw sample records from all opened counters"),
3372 	OPT_BOOLEAN('a', "all-cpus", &record.opts.target.system_wide,
3373 			    "system-wide collection from all CPUs"),
3374 	OPT_STRING('C', "cpu", &record.opts.target.cpu_list, "cpu",
3375 		    "list of cpus to monitor"),
3376 	OPT_U64('c', "count", &record.opts.user_interval, "event period to sample"),
3377 	OPT_STRING('o', "output", &record.data.path, "file",
3378 		    "output file name"),
3379 	OPT_BOOLEAN_SET('i', "no-inherit", &record.opts.no_inherit,
3380 			&record.opts.no_inherit_set,
3381 			"child tasks do not inherit counters"),
3382 	OPT_BOOLEAN(0, "tail-synthesize", &record.opts.tail_synthesize,
3383 		    "synthesize non-sample events at the end of output"),
3384 	OPT_BOOLEAN(0, "overwrite", &record.opts.overwrite, "use overwrite mode"),
3385 	OPT_BOOLEAN(0, "no-bpf-event", &record.opts.no_bpf_event, "do not record bpf events"),
3386 	OPT_BOOLEAN(0, "strict-freq", &record.opts.strict_freq,
3387 		    "Fail if the specified frequency can't be used"),
3388 	OPT_CALLBACK('F', "freq", &record.opts, "freq or 'max'",
3389 		     "profile at this frequency",
3390 		      record__parse_freq),
3391 	OPT_CALLBACK('m', "mmap-pages", &record.opts, "pages[,pages]",
3392 		     "number of mmap data pages and AUX area tracing mmap pages",
3393 		     record__parse_mmap_pages),
3394 	OPT_CALLBACK(0, "mmap-flush", &record.opts, "number",
3395 		     "Minimal number of bytes that is extracted from mmap data pages (default: 1)",
3396 		     record__mmap_flush_parse),
3397 	OPT_CALLBACK_NOOPT('g', NULL, &callchain_param,
3398 			   NULL, "enables call-graph recording" ,
3399 			   &record_callchain_opt),
3400 	OPT_CALLBACK(0, "call-graph", &record.opts,
3401 		     "record_mode[,record_size]", record_callchain_help,
3402 		     &record_parse_callchain_opt),
3403 	OPT_INCR('v', "verbose", &verbose,
3404 		    "be more verbose (show counter open errors, etc)"),
3405 	OPT_BOOLEAN('q', "quiet", &quiet, "don't print any warnings or messages"),
3406 	OPT_BOOLEAN('s', "stat", &record.opts.inherit_stat,
3407 		    "per thread counts"),
3408 	OPT_BOOLEAN('d', "data", &record.opts.sample_address, "Record the sample addresses"),
3409 	OPT_BOOLEAN(0, "phys-data", &record.opts.sample_phys_addr,
3410 		    "Record the sample physical addresses"),
3411 	OPT_BOOLEAN(0, "data-page-size", &record.opts.sample_data_page_size,
3412 		    "Record the sampled data address data page size"),
3413 	OPT_BOOLEAN(0, "code-page-size", &record.opts.sample_code_page_size,
3414 		    "Record the sampled code address (ip) page size"),
3415 	OPT_BOOLEAN(0, "sample-cpu", &record.opts.sample_cpu, "Record the sample cpu"),
3416 	OPT_BOOLEAN(0, "sample-identifier", &record.opts.sample_identifier,
3417 		    "Record the sample identifier"),
3418 	OPT_BOOLEAN_SET('T', "timestamp", &record.opts.sample_time,
3419 			&record.opts.sample_time_set,
3420 			"Record the sample timestamps"),
3421 	OPT_BOOLEAN_SET('P', "period", &record.opts.period, &record.opts.period_set,
3422 			"Record the sample period"),
3423 	OPT_BOOLEAN('n', "no-samples", &record.opts.no_samples,
3424 		    "don't sample"),
3425 	OPT_BOOLEAN_SET('N', "no-buildid-cache", &record.no_buildid_cache,
3426 			&record.no_buildid_cache_set,
3427 			"do not update the buildid cache"),
3428 	OPT_BOOLEAN_SET('B', "no-buildid", &record.no_buildid,
3429 			&record.no_buildid_set,
3430 			"do not collect buildids in perf.data"),
3431 	OPT_CALLBACK('G', "cgroup", &record.evlist, "name",
3432 		     "monitor event in cgroup name only",
3433 		     parse_cgroups),
3434 	OPT_CALLBACK('D', "delay", &record, "ms",
3435 		     "ms to wait before starting measurement after program start (-1: start with events disabled), "
3436 		     "or ranges of time to enable events e.g. '-D 10-20,30-40'",
3437 		     record__parse_event_enable_time),
3438 	OPT_BOOLEAN(0, "kcore", &record.opts.kcore, "copy /proc/kcore"),
3439 	OPT_STRING('u', "uid", &record.opts.target.uid_str, "user",
3440 		   "user to profile"),
3441 
3442 	OPT_CALLBACK_NOOPT('b', "branch-any", &record.opts.branch_stack,
3443 		     "branch any", "sample any taken branches",
3444 		     parse_branch_stack),
3445 
3446 	OPT_CALLBACK('j', "branch-filter", &record.opts.branch_stack,
3447 		     "branch filter mask", "branch stack filter modes",
3448 		     parse_branch_stack),
3449 	OPT_BOOLEAN('W', "weight", &record.opts.sample_weight,
3450 		    "sample by weight (on special events only)"),
3451 	OPT_BOOLEAN(0, "transaction", &record.opts.sample_transaction,
3452 		    "sample transaction flags (special events only)"),
3453 	OPT_BOOLEAN(0, "per-thread", &record.opts.target.per_thread,
3454 		    "use per-thread mmaps"),
3455 	OPT_CALLBACK_OPTARG('I', "intr-regs", &record.opts.sample_intr_regs, NULL, "any register",
3456 		    "sample selected machine registers on interrupt,"
3457 		    " use '-I?' to list register names", parse_intr_regs),
3458 	OPT_CALLBACK_OPTARG(0, "user-regs", &record.opts.sample_user_regs, NULL, "any register",
3459 		    "sample selected machine registers on interrupt,"
3460 		    " use '--user-regs=?' to list register names", parse_user_regs),
3461 	OPT_BOOLEAN(0, "running-time", &record.opts.running_time,
3462 		    "Record running/enabled time of read (:S) events"),
3463 	OPT_CALLBACK('k', "clockid", &record.opts,
3464 	"clockid", "clockid to use for events, see clock_gettime()",
3465 	parse_clockid),
3466 	OPT_STRING_OPTARG('S', "snapshot", &record.opts.auxtrace_snapshot_opts,
3467 			  "opts", "AUX area tracing Snapshot Mode", ""),
3468 	OPT_STRING_OPTARG(0, "aux-sample", &record.opts.auxtrace_sample_opts,
3469 			  "opts", "sample AUX area", ""),
3470 	OPT_UINTEGER(0, "proc-map-timeout", &proc_map_timeout,
3471 			"per thread proc mmap processing timeout in ms"),
3472 	OPT_BOOLEAN(0, "namespaces", &record.opts.record_namespaces,
3473 		    "Record namespaces events"),
3474 	OPT_BOOLEAN(0, "all-cgroups", &record.opts.record_cgroup,
3475 		    "Record cgroup events"),
3476 	OPT_BOOLEAN_SET(0, "switch-events", &record.opts.record_switch_events,
3477 			&record.opts.record_switch_events_set,
3478 			"Record context switch events"),
3479 	OPT_BOOLEAN_FLAG(0, "all-kernel", &record.opts.all_kernel,
3480 			 "Configure all used events to run in kernel space.",
3481 			 PARSE_OPT_EXCLUSIVE),
3482 	OPT_BOOLEAN_FLAG(0, "all-user", &record.opts.all_user,
3483 			 "Configure all used events to run in user space.",
3484 			 PARSE_OPT_EXCLUSIVE),
3485 	OPT_BOOLEAN(0, "kernel-callchains", &record.opts.kernel_callchains,
3486 		    "collect kernel callchains"),
3487 	OPT_BOOLEAN(0, "user-callchains", &record.opts.user_callchains,
3488 		    "collect user callchains"),
3489 	OPT_STRING(0, "clang-path", &llvm_param.clang_path, "clang path",
3490 		   "clang binary to use for compiling BPF scriptlets"),
3491 	OPT_STRING(0, "clang-opt", &llvm_param.clang_opt, "clang options",
3492 		   "options passed to clang when compiling BPF scriptlets"),
3493 	OPT_STRING(0, "vmlinux", &symbol_conf.vmlinux_name,
3494 		   "file", "vmlinux pathname"),
3495 	OPT_BOOLEAN(0, "buildid-all", &record.buildid_all,
3496 		    "Record build-id of all DSOs regardless of hits"),
3497 	OPT_BOOLEAN(0, "buildid-mmap", &record.buildid_mmap,
3498 		    "Record build-id in map events"),
3499 	OPT_BOOLEAN(0, "timestamp-filename", &record.timestamp_filename,
3500 		    "append timestamp to output filename"),
3501 	OPT_BOOLEAN(0, "timestamp-boundary", &record.timestamp_boundary,
3502 		    "Record timestamp boundary (time of first/last samples)"),
3503 	OPT_STRING_OPTARG_SET(0, "switch-output", &record.switch_output.str,
3504 			  &record.switch_output.set, "signal or size[BKMG] or time[smhd]",
3505 			  "Switch output when receiving SIGUSR2 (signal) or cross a size or time threshold",
3506 			  "signal"),
3507 	OPT_CALLBACK_SET(0, "switch-output-event", &switch_output_parse_events_option_args,
3508 			 &record.switch_output_event_set, "switch output event",
3509 			 "switch output event selector. use 'perf list' to list available events",
3510 			 parse_events_option_new_evlist),
3511 	OPT_INTEGER(0, "switch-max-files", &record.switch_output.num_files,
3512 		   "Limit number of switch output generated files"),
3513 	OPT_BOOLEAN(0, "dry-run", &dry_run,
3514 		    "Parse options then exit"),
3515 #ifdef HAVE_AIO_SUPPORT
3516 	OPT_CALLBACK_OPTARG(0, "aio", &record.opts,
3517 		     &nr_cblocks_default, "n", "Use <n> control blocks in asynchronous trace writing mode (default: 1, max: 4)",
3518 		     record__aio_parse),
3519 #endif
3520 	OPT_CALLBACK(0, "affinity", &record.opts, "node|cpu",
3521 		     "Set affinity mask of trace reading thread to NUMA node cpu mask or cpu of processed mmap buffer",
3522 		     record__parse_affinity),
3523 #ifdef HAVE_ZSTD_SUPPORT
3524 	OPT_CALLBACK_OPTARG('z', "compression-level", &record.opts, &comp_level_default, "n",
3525 			    "Compress records using specified level (default: 1 - fastest compression, 22 - greatest compression)",
3526 			    record__parse_comp_level),
3527 #endif
3528 	OPT_CALLBACK(0, "max-size", &record.output_max_size,
3529 		     "size", "Limit the maximum size of the output file", parse_output_max_size),
3530 	OPT_UINTEGER(0, "num-thread-synthesize",
3531 		     &record.opts.nr_threads_synthesize,
3532 		     "number of threads to run for event synthesis"),
3533 #ifdef HAVE_LIBPFM
3534 	OPT_CALLBACK(0, "pfm-events", &record.evlist, "event",
3535 		"libpfm4 event selector. use 'perf list' to list available events",
3536 		parse_libpfm_events_option),
3537 #endif
3538 	OPT_CALLBACK(0, "control", &record.opts, "fd:ctl-fd[,ack-fd] or fifo:ctl-fifo[,ack-fifo]",
3539 		     "Listen on ctl-fd descriptor for command to control measurement ('enable': enable events, 'disable': disable events,\n"
3540 		     "\t\t\t  'snapshot': AUX area tracing snapshot).\n"
3541 		     "\t\t\t  Optionally send control command completion ('ack\\n') to ack-fd descriptor.\n"
3542 		     "\t\t\t  Alternatively, ctl-fifo / ack-fifo will be opened and used as ctl-fd / ack-fd.",
3543 		      parse_control_option),
3544 	OPT_CALLBACK(0, "synth", &record.opts, "no|all|task|mmap|cgroup",
3545 		     "Fine-tune event synthesis: default=all", parse_record_synth_option),
3546 	OPT_STRING_OPTARG_SET(0, "debuginfod", &record.debuginfod.urls,
3547 			  &record.debuginfod.set, "debuginfod urls",
3548 			  "Enable debuginfod data retrieval from DEBUGINFOD_URLS or specified urls",
3549 			  "system"),
3550 	OPT_CALLBACK_OPTARG(0, "threads", &record.opts, NULL, "spec",
3551 			    "write collected trace data into several data files using parallel threads",
3552 			    record__parse_threads),
3553 	OPT_BOOLEAN(0, "off-cpu", &record.off_cpu, "Enable off-cpu analysis"),
3554 	OPT_END()
3555 };
3556 
3557 struct option *record_options = __record_options;
3558 
3559 static int record__mmap_cpu_mask_init(struct mmap_cpu_mask *mask, struct perf_cpu_map *cpus)
3560 {
3561 	struct perf_cpu cpu;
3562 	int idx;
3563 
3564 	if (cpu_map__is_dummy(cpus))
3565 		return 0;
3566 
3567 	perf_cpu_map__for_each_cpu(cpu, idx, cpus) {
3568 		if (cpu.cpu == -1)
3569 			continue;
3570 		/* Return ENODEV is input cpu is greater than max cpu */
3571 		if ((unsigned long)cpu.cpu > mask->nbits)
3572 			return -ENODEV;
3573 		__set_bit(cpu.cpu, mask->bits);
3574 	}
3575 
3576 	return 0;
3577 }
3578 
3579 static int record__mmap_cpu_mask_init_spec(struct mmap_cpu_mask *mask, const char *mask_spec)
3580 {
3581 	struct perf_cpu_map *cpus;
3582 
3583 	cpus = perf_cpu_map__new(mask_spec);
3584 	if (!cpus)
3585 		return -ENOMEM;
3586 
3587 	bitmap_zero(mask->bits, mask->nbits);
3588 	if (record__mmap_cpu_mask_init(mask, cpus))
3589 		return -ENODEV;
3590 
3591 	perf_cpu_map__put(cpus);
3592 
3593 	return 0;
3594 }
3595 
3596 static void record__free_thread_masks(struct record *rec, int nr_threads)
3597 {
3598 	int t;
3599 
3600 	if (rec->thread_masks)
3601 		for (t = 0; t < nr_threads; t++)
3602 			record__thread_mask_free(&rec->thread_masks[t]);
3603 
3604 	zfree(&rec->thread_masks);
3605 }
3606 
3607 static int record__alloc_thread_masks(struct record *rec, int nr_threads, int nr_bits)
3608 {
3609 	int t, ret;
3610 
3611 	rec->thread_masks = zalloc(nr_threads * sizeof(*(rec->thread_masks)));
3612 	if (!rec->thread_masks) {
3613 		pr_err("Failed to allocate thread masks\n");
3614 		return -ENOMEM;
3615 	}
3616 
3617 	for (t = 0; t < nr_threads; t++) {
3618 		ret = record__thread_mask_alloc(&rec->thread_masks[t], nr_bits);
3619 		if (ret) {
3620 			pr_err("Failed to allocate thread masks[%d]\n", t);
3621 			goto out_free;
3622 		}
3623 	}
3624 
3625 	return 0;
3626 
3627 out_free:
3628 	record__free_thread_masks(rec, nr_threads);
3629 
3630 	return ret;
3631 }
3632 
3633 static int record__init_thread_cpu_masks(struct record *rec, struct perf_cpu_map *cpus)
3634 {
3635 	int t, ret, nr_cpus = perf_cpu_map__nr(cpus);
3636 
3637 	ret = record__alloc_thread_masks(rec, nr_cpus, cpu__max_cpu().cpu);
3638 	if (ret)
3639 		return ret;
3640 
3641 	rec->nr_threads = nr_cpus;
3642 	pr_debug("nr_threads: %d\n", rec->nr_threads);
3643 
3644 	for (t = 0; t < rec->nr_threads; t++) {
3645 		__set_bit(perf_cpu_map__cpu(cpus, t).cpu, rec->thread_masks[t].maps.bits);
3646 		__set_bit(perf_cpu_map__cpu(cpus, t).cpu, rec->thread_masks[t].affinity.bits);
3647 		if (verbose > 0) {
3648 			pr_debug("thread_masks[%d]: ", t);
3649 			mmap_cpu_mask__scnprintf(&rec->thread_masks[t].maps, "maps");
3650 			pr_debug("thread_masks[%d]: ", t);
3651 			mmap_cpu_mask__scnprintf(&rec->thread_masks[t].affinity, "affinity");
3652 		}
3653 	}
3654 
3655 	return 0;
3656 }
3657 
3658 static int record__init_thread_masks_spec(struct record *rec, struct perf_cpu_map *cpus,
3659 					  const char **maps_spec, const char **affinity_spec,
3660 					  u32 nr_spec)
3661 {
3662 	u32 s;
3663 	int ret = 0, t = 0;
3664 	struct mmap_cpu_mask cpus_mask;
3665 	struct thread_mask thread_mask, full_mask, *thread_masks;
3666 
3667 	ret = record__mmap_cpu_mask_alloc(&cpus_mask, cpu__max_cpu().cpu);
3668 	if (ret) {
3669 		pr_err("Failed to allocate CPUs mask\n");
3670 		return ret;
3671 	}
3672 
3673 	ret = record__mmap_cpu_mask_init(&cpus_mask, cpus);
3674 	if (ret) {
3675 		pr_err("Failed to init cpu mask\n");
3676 		goto out_free_cpu_mask;
3677 	}
3678 
3679 	ret = record__thread_mask_alloc(&full_mask, cpu__max_cpu().cpu);
3680 	if (ret) {
3681 		pr_err("Failed to allocate full mask\n");
3682 		goto out_free_cpu_mask;
3683 	}
3684 
3685 	ret = record__thread_mask_alloc(&thread_mask, cpu__max_cpu().cpu);
3686 	if (ret) {
3687 		pr_err("Failed to allocate thread mask\n");
3688 		goto out_free_full_and_cpu_masks;
3689 	}
3690 
3691 	for (s = 0; s < nr_spec; s++) {
3692 		ret = record__mmap_cpu_mask_init_spec(&thread_mask.maps, maps_spec[s]);
3693 		if (ret) {
3694 			pr_err("Failed to initialize maps thread mask\n");
3695 			goto out_free;
3696 		}
3697 		ret = record__mmap_cpu_mask_init_spec(&thread_mask.affinity, affinity_spec[s]);
3698 		if (ret) {
3699 			pr_err("Failed to initialize affinity thread mask\n");
3700 			goto out_free;
3701 		}
3702 
3703 		/* ignore invalid CPUs but do not allow empty masks */
3704 		if (!bitmap_and(thread_mask.maps.bits, thread_mask.maps.bits,
3705 				cpus_mask.bits, thread_mask.maps.nbits)) {
3706 			pr_err("Empty maps mask: %s\n", maps_spec[s]);
3707 			ret = -EINVAL;
3708 			goto out_free;
3709 		}
3710 		if (!bitmap_and(thread_mask.affinity.bits, thread_mask.affinity.bits,
3711 				cpus_mask.bits, thread_mask.affinity.nbits)) {
3712 			pr_err("Empty affinity mask: %s\n", affinity_spec[s]);
3713 			ret = -EINVAL;
3714 			goto out_free;
3715 		}
3716 
3717 		/* do not allow intersection with other masks (full_mask) */
3718 		if (bitmap_intersects(thread_mask.maps.bits, full_mask.maps.bits,
3719 				      thread_mask.maps.nbits)) {
3720 			pr_err("Intersecting maps mask: %s\n", maps_spec[s]);
3721 			ret = -EINVAL;
3722 			goto out_free;
3723 		}
3724 		if (bitmap_intersects(thread_mask.affinity.bits, full_mask.affinity.bits,
3725 				      thread_mask.affinity.nbits)) {
3726 			pr_err("Intersecting affinity mask: %s\n", affinity_spec[s]);
3727 			ret = -EINVAL;
3728 			goto out_free;
3729 		}
3730 
3731 		bitmap_or(full_mask.maps.bits, full_mask.maps.bits,
3732 			  thread_mask.maps.bits, full_mask.maps.nbits);
3733 		bitmap_or(full_mask.affinity.bits, full_mask.affinity.bits,
3734 			  thread_mask.affinity.bits, full_mask.maps.nbits);
3735 
3736 		thread_masks = realloc(rec->thread_masks, (t + 1) * sizeof(struct thread_mask));
3737 		if (!thread_masks) {
3738 			pr_err("Failed to reallocate thread masks\n");
3739 			ret = -ENOMEM;
3740 			goto out_free;
3741 		}
3742 		rec->thread_masks = thread_masks;
3743 		rec->thread_masks[t] = thread_mask;
3744 		if (verbose > 0) {
3745 			pr_debug("thread_masks[%d]: ", t);
3746 			mmap_cpu_mask__scnprintf(&rec->thread_masks[t].maps, "maps");
3747 			pr_debug("thread_masks[%d]: ", t);
3748 			mmap_cpu_mask__scnprintf(&rec->thread_masks[t].affinity, "affinity");
3749 		}
3750 		t++;
3751 		ret = record__thread_mask_alloc(&thread_mask, cpu__max_cpu().cpu);
3752 		if (ret) {
3753 			pr_err("Failed to allocate thread mask\n");
3754 			goto out_free_full_and_cpu_masks;
3755 		}
3756 	}
3757 	rec->nr_threads = t;
3758 	pr_debug("nr_threads: %d\n", rec->nr_threads);
3759 	if (!rec->nr_threads)
3760 		ret = -EINVAL;
3761 
3762 out_free:
3763 	record__thread_mask_free(&thread_mask);
3764 out_free_full_and_cpu_masks:
3765 	record__thread_mask_free(&full_mask);
3766 out_free_cpu_mask:
3767 	record__mmap_cpu_mask_free(&cpus_mask);
3768 
3769 	return ret;
3770 }
3771 
3772 static int record__init_thread_core_masks(struct record *rec, struct perf_cpu_map *cpus)
3773 {
3774 	int ret;
3775 	struct cpu_topology *topo;
3776 
3777 	topo = cpu_topology__new();
3778 	if (!topo) {
3779 		pr_err("Failed to allocate CPU topology\n");
3780 		return -ENOMEM;
3781 	}
3782 
3783 	ret = record__init_thread_masks_spec(rec, cpus, topo->core_cpus_list,
3784 					     topo->core_cpus_list, topo->core_cpus_lists);
3785 	cpu_topology__delete(topo);
3786 
3787 	return ret;
3788 }
3789 
3790 static int record__init_thread_package_masks(struct record *rec, struct perf_cpu_map *cpus)
3791 {
3792 	int ret;
3793 	struct cpu_topology *topo;
3794 
3795 	topo = cpu_topology__new();
3796 	if (!topo) {
3797 		pr_err("Failed to allocate CPU topology\n");
3798 		return -ENOMEM;
3799 	}
3800 
3801 	ret = record__init_thread_masks_spec(rec, cpus, topo->package_cpus_list,
3802 					     topo->package_cpus_list, topo->package_cpus_lists);
3803 	cpu_topology__delete(topo);
3804 
3805 	return ret;
3806 }
3807 
3808 static int record__init_thread_numa_masks(struct record *rec, struct perf_cpu_map *cpus)
3809 {
3810 	u32 s;
3811 	int ret;
3812 	const char **spec;
3813 	struct numa_topology *topo;
3814 
3815 	topo = numa_topology__new();
3816 	if (!topo) {
3817 		pr_err("Failed to allocate NUMA topology\n");
3818 		return -ENOMEM;
3819 	}
3820 
3821 	spec = zalloc(topo->nr * sizeof(char *));
3822 	if (!spec) {
3823 		pr_err("Failed to allocate NUMA spec\n");
3824 		ret = -ENOMEM;
3825 		goto out_delete_topo;
3826 	}
3827 	for (s = 0; s < topo->nr; s++)
3828 		spec[s] = topo->nodes[s].cpus;
3829 
3830 	ret = record__init_thread_masks_spec(rec, cpus, spec, spec, topo->nr);
3831 
3832 	zfree(&spec);
3833 
3834 out_delete_topo:
3835 	numa_topology__delete(topo);
3836 
3837 	return ret;
3838 }
3839 
3840 static int record__init_thread_user_masks(struct record *rec, struct perf_cpu_map *cpus)
3841 {
3842 	int t, ret;
3843 	u32 s, nr_spec = 0;
3844 	char **maps_spec = NULL, **affinity_spec = NULL, **tmp_spec;
3845 	char *user_spec, *spec, *spec_ptr, *mask, *mask_ptr, *dup_mask = NULL;
3846 
3847 	for (t = 0, user_spec = (char *)rec->opts.threads_user_spec; ; t++, user_spec = NULL) {
3848 		spec = strtok_r(user_spec, ":", &spec_ptr);
3849 		if (spec == NULL)
3850 			break;
3851 		pr_debug2("threads_spec[%d]: %s\n", t, spec);
3852 		mask = strtok_r(spec, "/", &mask_ptr);
3853 		if (mask == NULL)
3854 			break;
3855 		pr_debug2("  maps mask: %s\n", mask);
3856 		tmp_spec = realloc(maps_spec, (nr_spec + 1) * sizeof(char *));
3857 		if (!tmp_spec) {
3858 			pr_err("Failed to reallocate maps spec\n");
3859 			ret = -ENOMEM;
3860 			goto out_free;
3861 		}
3862 		maps_spec = tmp_spec;
3863 		maps_spec[nr_spec] = dup_mask = strdup(mask);
3864 		if (!maps_spec[nr_spec]) {
3865 			pr_err("Failed to allocate maps spec[%d]\n", nr_spec);
3866 			ret = -ENOMEM;
3867 			goto out_free;
3868 		}
3869 		mask = strtok_r(NULL, "/", &mask_ptr);
3870 		if (mask == NULL) {
3871 			pr_err("Invalid thread maps or affinity specs\n");
3872 			ret = -EINVAL;
3873 			goto out_free;
3874 		}
3875 		pr_debug2("  affinity mask: %s\n", mask);
3876 		tmp_spec = realloc(affinity_spec, (nr_spec + 1) * sizeof(char *));
3877 		if (!tmp_spec) {
3878 			pr_err("Failed to reallocate affinity spec\n");
3879 			ret = -ENOMEM;
3880 			goto out_free;
3881 		}
3882 		affinity_spec = tmp_spec;
3883 		affinity_spec[nr_spec] = strdup(mask);
3884 		if (!affinity_spec[nr_spec]) {
3885 			pr_err("Failed to allocate affinity spec[%d]\n", nr_spec);
3886 			ret = -ENOMEM;
3887 			goto out_free;
3888 		}
3889 		dup_mask = NULL;
3890 		nr_spec++;
3891 	}
3892 
3893 	ret = record__init_thread_masks_spec(rec, cpus, (const char **)maps_spec,
3894 					     (const char **)affinity_spec, nr_spec);
3895 
3896 out_free:
3897 	free(dup_mask);
3898 	for (s = 0; s < nr_spec; s++) {
3899 		if (maps_spec)
3900 			free(maps_spec[s]);
3901 		if (affinity_spec)
3902 			free(affinity_spec[s]);
3903 	}
3904 	free(affinity_spec);
3905 	free(maps_spec);
3906 
3907 	return ret;
3908 }
3909 
3910 static int record__init_thread_default_masks(struct record *rec, struct perf_cpu_map *cpus)
3911 {
3912 	int ret;
3913 
3914 	ret = record__alloc_thread_masks(rec, 1, cpu__max_cpu().cpu);
3915 	if (ret)
3916 		return ret;
3917 
3918 	if (record__mmap_cpu_mask_init(&rec->thread_masks->maps, cpus))
3919 		return -ENODEV;
3920 
3921 	rec->nr_threads = 1;
3922 
3923 	return 0;
3924 }
3925 
3926 static int record__init_thread_masks(struct record *rec)
3927 {
3928 	int ret = 0;
3929 	struct perf_cpu_map *cpus = rec->evlist->core.all_cpus;
3930 
3931 	if (!record__threads_enabled(rec))
3932 		return record__init_thread_default_masks(rec, cpus);
3933 
3934 	if (evlist__per_thread(rec->evlist)) {
3935 		pr_err("--per-thread option is mutually exclusive to parallel streaming mode.\n");
3936 		return -EINVAL;
3937 	}
3938 
3939 	switch (rec->opts.threads_spec) {
3940 	case THREAD_SPEC__CPU:
3941 		ret = record__init_thread_cpu_masks(rec, cpus);
3942 		break;
3943 	case THREAD_SPEC__CORE:
3944 		ret = record__init_thread_core_masks(rec, cpus);
3945 		break;
3946 	case THREAD_SPEC__PACKAGE:
3947 		ret = record__init_thread_package_masks(rec, cpus);
3948 		break;
3949 	case THREAD_SPEC__NUMA:
3950 		ret = record__init_thread_numa_masks(rec, cpus);
3951 		break;
3952 	case THREAD_SPEC__USER:
3953 		ret = record__init_thread_user_masks(rec, cpus);
3954 		break;
3955 	default:
3956 		break;
3957 	}
3958 
3959 	return ret;
3960 }
3961 
3962 int cmd_record(int argc, const char **argv)
3963 {
3964 	int err;
3965 	struct record *rec = &record;
3966 	char errbuf[BUFSIZ];
3967 
3968 	setlocale(LC_ALL, "");
3969 
3970 #ifndef HAVE_LIBBPF_SUPPORT
3971 # define set_nobuild(s, l, c) set_option_nobuild(record_options, s, l, "NO_LIBBPF=1", c)
3972 	set_nobuild('\0', "clang-path", true);
3973 	set_nobuild('\0', "clang-opt", true);
3974 # undef set_nobuild
3975 #endif
3976 
3977 #ifndef HAVE_BPF_PROLOGUE
3978 # if !defined (HAVE_DWARF_SUPPORT)
3979 #  define REASON  "NO_DWARF=1"
3980 # elif !defined (HAVE_LIBBPF_SUPPORT)
3981 #  define REASON  "NO_LIBBPF=1"
3982 # else
3983 #  define REASON  "this architecture doesn't support BPF prologue"
3984 # endif
3985 # define set_nobuild(s, l, c) set_option_nobuild(record_options, s, l, REASON, c)
3986 	set_nobuild('\0', "vmlinux", true);
3987 # undef set_nobuild
3988 # undef REASON
3989 #endif
3990 
3991 #ifndef HAVE_BPF_SKEL
3992 # define set_nobuild(s, l, m, c) set_option_nobuild(record_options, s, l, m, c)
3993 	set_nobuild('\0', "off-cpu", "no BUILD_BPF_SKEL=1", true);
3994 # undef set_nobuild
3995 #endif
3996 
3997 	rec->opts.affinity = PERF_AFFINITY_SYS;
3998 
3999 	rec->evlist = evlist__new();
4000 	if (rec->evlist == NULL)
4001 		return -ENOMEM;
4002 
4003 	err = perf_config(perf_record_config, rec);
4004 	if (err)
4005 		return err;
4006 
4007 	argc = parse_options(argc, argv, record_options, record_usage,
4008 			    PARSE_OPT_STOP_AT_NON_OPTION);
4009 	if (quiet)
4010 		perf_quiet_option();
4011 
4012 	err = symbol__validate_sym_arguments();
4013 	if (err)
4014 		return err;
4015 
4016 	perf_debuginfod_setup(&record.debuginfod);
4017 
4018 	/* Make system wide (-a) the default target. */
4019 	if (!argc && target__none(&rec->opts.target))
4020 		rec->opts.target.system_wide = true;
4021 
4022 	if (nr_cgroups && !rec->opts.target.system_wide) {
4023 		usage_with_options_msg(record_usage, record_options,
4024 			"cgroup monitoring only available in system-wide mode");
4025 
4026 	}
4027 
4028 	if (rec->buildid_mmap) {
4029 		if (!perf_can_record_build_id()) {
4030 			pr_err("Failed: no support to record build id in mmap events, update your kernel.\n");
4031 			err = -EINVAL;
4032 			goto out_opts;
4033 		}
4034 		pr_debug("Enabling build id in mmap2 events.\n");
4035 		/* Enable mmap build id synthesizing. */
4036 		symbol_conf.buildid_mmap2 = true;
4037 		/* Enable perf_event_attr::build_id bit. */
4038 		rec->opts.build_id = true;
4039 		/* Disable build id cache. */
4040 		rec->no_buildid = true;
4041 	}
4042 
4043 	if (rec->opts.record_cgroup && !perf_can_record_cgroup()) {
4044 		pr_err("Kernel has no cgroup sampling support.\n");
4045 		err = -EINVAL;
4046 		goto out_opts;
4047 	}
4048 
4049 	if (rec->opts.kcore)
4050 		rec->opts.text_poke = true;
4051 
4052 	if (rec->opts.kcore || record__threads_enabled(rec))
4053 		rec->data.is_dir = true;
4054 
4055 	if (record__threads_enabled(rec)) {
4056 		if (rec->opts.affinity != PERF_AFFINITY_SYS) {
4057 			pr_err("--affinity option is mutually exclusive to parallel streaming mode.\n");
4058 			goto out_opts;
4059 		}
4060 		if (record__aio_enabled(rec)) {
4061 			pr_err("Asynchronous streaming mode (--aio) is mutually exclusive to parallel streaming mode.\n");
4062 			goto out_opts;
4063 		}
4064 	}
4065 
4066 	if (rec->opts.comp_level != 0) {
4067 		pr_debug("Compression enabled, disabling build id collection at the end of the session.\n");
4068 		rec->no_buildid = true;
4069 	}
4070 
4071 	if (rec->opts.record_switch_events &&
4072 	    !perf_can_record_switch_events()) {
4073 		ui__error("kernel does not support recording context switch events\n");
4074 		parse_options_usage(record_usage, record_options, "switch-events", 0);
4075 		err = -EINVAL;
4076 		goto out_opts;
4077 	}
4078 
4079 	if (switch_output_setup(rec)) {
4080 		parse_options_usage(record_usage, record_options, "switch-output", 0);
4081 		err = -EINVAL;
4082 		goto out_opts;
4083 	}
4084 
4085 	if (rec->switch_output.time) {
4086 		signal(SIGALRM, alarm_sig_handler);
4087 		alarm(rec->switch_output.time);
4088 	}
4089 
4090 	if (rec->switch_output.num_files) {
4091 		rec->switch_output.filenames = calloc(sizeof(char *),
4092 						      rec->switch_output.num_files);
4093 		if (!rec->switch_output.filenames) {
4094 			err = -EINVAL;
4095 			goto out_opts;
4096 		}
4097 	}
4098 
4099 	if (rec->timestamp_filename && record__threads_enabled(rec)) {
4100 		rec->timestamp_filename = false;
4101 		pr_warning("WARNING: --timestamp-filename option is not available in parallel streaming mode.\n");
4102 	}
4103 
4104 	/*
4105 	 * Allow aliases to facilitate the lookup of symbols for address
4106 	 * filters. Refer to auxtrace_parse_filters().
4107 	 */
4108 	symbol_conf.allow_aliases = true;
4109 
4110 	symbol__init(NULL);
4111 
4112 	err = record__auxtrace_init(rec);
4113 	if (err)
4114 		goto out;
4115 
4116 	if (dry_run)
4117 		goto out;
4118 
4119 	err = bpf__setup_stdout(rec->evlist);
4120 	if (err) {
4121 		bpf__strerror_setup_stdout(rec->evlist, err, errbuf, sizeof(errbuf));
4122 		pr_err("ERROR: Setup BPF stdout failed: %s\n",
4123 			 errbuf);
4124 		goto out;
4125 	}
4126 
4127 	err = -ENOMEM;
4128 
4129 	if (rec->no_buildid_cache || rec->no_buildid) {
4130 		disable_buildid_cache();
4131 	} else if (rec->switch_output.enabled) {
4132 		/*
4133 		 * In 'perf record --switch-output', disable buildid
4134 		 * generation by default to reduce data file switching
4135 		 * overhead. Still generate buildid if they are required
4136 		 * explicitly using
4137 		 *
4138 		 *  perf record --switch-output --no-no-buildid \
4139 		 *              --no-no-buildid-cache
4140 		 *
4141 		 * Following code equals to:
4142 		 *
4143 		 * if ((rec->no_buildid || !rec->no_buildid_set) &&
4144 		 *     (rec->no_buildid_cache || !rec->no_buildid_cache_set))
4145 		 *         disable_buildid_cache();
4146 		 */
4147 		bool disable = true;
4148 
4149 		if (rec->no_buildid_set && !rec->no_buildid)
4150 			disable = false;
4151 		if (rec->no_buildid_cache_set && !rec->no_buildid_cache)
4152 			disable = false;
4153 		if (disable) {
4154 			rec->no_buildid = true;
4155 			rec->no_buildid_cache = true;
4156 			disable_buildid_cache();
4157 		}
4158 	}
4159 
4160 	if (record.opts.overwrite)
4161 		record.opts.tail_synthesize = true;
4162 
4163 	if (rec->evlist->core.nr_entries == 0) {
4164 		bool can_profile_kernel = perf_event_paranoid_check(1);
4165 
4166 		err = parse_event(rec->evlist, can_profile_kernel ? "cycles:P" : "cycles:Pu");
4167 		if (err)
4168 			goto out;
4169 	}
4170 
4171 	if (rec->opts.target.tid && !rec->opts.no_inherit_set)
4172 		rec->opts.no_inherit = true;
4173 
4174 	err = target__validate(&rec->opts.target);
4175 	if (err) {
4176 		target__strerror(&rec->opts.target, err, errbuf, BUFSIZ);
4177 		ui__warning("%s\n", errbuf);
4178 	}
4179 
4180 	err = target__parse_uid(&rec->opts.target);
4181 	if (err) {
4182 		int saved_errno = errno;
4183 
4184 		target__strerror(&rec->opts.target, err, errbuf, BUFSIZ);
4185 		ui__error("%s", errbuf);
4186 
4187 		err = -saved_errno;
4188 		goto out;
4189 	}
4190 
4191 	/* Enable ignoring missing threads when -u/-p option is defined. */
4192 	rec->opts.ignore_missing_thread = rec->opts.target.uid != UINT_MAX || rec->opts.target.pid;
4193 
4194 	evlist__warn_user_requested_cpus(rec->evlist, rec->opts.target.cpu_list);
4195 
4196 	if (callchain_param.enabled && callchain_param.record_mode == CALLCHAIN_FP)
4197 		arch__add_leaf_frame_record_opts(&rec->opts);
4198 
4199 	err = -ENOMEM;
4200 	if (evlist__create_maps(rec->evlist, &rec->opts.target) < 0) {
4201 		if (rec->opts.target.pid != NULL) {
4202 			pr_err("Couldn't create thread/CPU maps: %s\n",
4203 				errno == ENOENT ? "No such process" : str_error_r(errno, errbuf, sizeof(errbuf)));
4204 			goto out;
4205 		}
4206 		else
4207 			usage_with_options(record_usage, record_options);
4208 	}
4209 
4210 	err = auxtrace_record__options(rec->itr, rec->evlist, &rec->opts);
4211 	if (err)
4212 		goto out;
4213 
4214 	/*
4215 	 * We take all buildids when the file contains
4216 	 * AUX area tracing data because we do not decode the
4217 	 * trace because it would take too long.
4218 	 */
4219 	if (rec->opts.full_auxtrace)
4220 		rec->buildid_all = true;
4221 
4222 	if (rec->opts.text_poke) {
4223 		err = record__config_text_poke(rec->evlist);
4224 		if (err) {
4225 			pr_err("record__config_text_poke failed, error %d\n", err);
4226 			goto out;
4227 		}
4228 	}
4229 
4230 	if (rec->off_cpu) {
4231 		err = record__config_off_cpu(rec);
4232 		if (err) {
4233 			pr_err("record__config_off_cpu failed, error %d\n", err);
4234 			goto out;
4235 		}
4236 	}
4237 
4238 	if (record_opts__config(&rec->opts)) {
4239 		err = -EINVAL;
4240 		goto out;
4241 	}
4242 
4243 	err = record__init_thread_masks(rec);
4244 	if (err) {
4245 		pr_err("Failed to initialize parallel data streaming masks\n");
4246 		goto out;
4247 	}
4248 
4249 	if (rec->opts.nr_cblocks > nr_cblocks_max)
4250 		rec->opts.nr_cblocks = nr_cblocks_max;
4251 	pr_debug("nr_cblocks: %d\n", rec->opts.nr_cblocks);
4252 
4253 	pr_debug("affinity: %s\n", affinity_tags[rec->opts.affinity]);
4254 	pr_debug("mmap flush: %d\n", rec->opts.mmap_flush);
4255 
4256 	if (rec->opts.comp_level > comp_level_max)
4257 		rec->opts.comp_level = comp_level_max;
4258 	pr_debug("comp level: %d\n", rec->opts.comp_level);
4259 
4260 	err = __cmd_record(&record, argc, argv);
4261 out:
4262 	evlist__delete(rec->evlist);
4263 	symbol__exit();
4264 	auxtrace_record__free(rec->itr);
4265 out_opts:
4266 	record__free_thread_masks(rec, rec->nr_threads);
4267 	rec->nr_threads = 0;
4268 	evlist__close_control(rec->opts.ctl_fd, rec->opts.ctl_fd_ack, &rec->opts.ctl_fd_close);
4269 	return err;
4270 }
4271 
4272 static void snapshot_sig_handler(int sig __maybe_unused)
4273 {
4274 	struct record *rec = &record;
4275 
4276 	hit_auxtrace_snapshot_trigger(rec);
4277 
4278 	if (switch_output_signal(rec))
4279 		trigger_hit(&switch_output_trigger);
4280 }
4281 
4282 static void alarm_sig_handler(int sig __maybe_unused)
4283 {
4284 	struct record *rec = &record;
4285 
4286 	if (switch_output_time(rec))
4287 		trigger_hit(&switch_output_trigger);
4288 }
4289