xref: /openbmc/linux/tools/perf/builtin-record.c (revision 2a24da4cf6753ee4c1f5b9e16d526a4a115e8562)
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * builtin-record.c
4  *
5  * Builtin record command: Record the profile of a workload
6  * (or a CPU, or a PID) into the perf.data output file - for
7  * later analysis via perf report.
8  */
9 #include "builtin.h"
10 
11 #include "util/build-id.h"
12 #include <subcmd/parse-options.h>
13 #include <internal/xyarray.h>
14 #include "util/parse-events.h"
15 #include "util/config.h"
16 
17 #include "util/callchain.h"
18 #include "util/cgroup.h"
19 #include "util/header.h"
20 #include "util/event.h"
21 #include "util/evlist.h"
22 #include "util/evsel.h"
23 #include "util/debug.h"
24 #include "util/mmap.h"
25 #include "util/mutex.h"
26 #include "util/target.h"
27 #include "util/session.h"
28 #include "util/tool.h"
29 #include "util/symbol.h"
30 #include "util/record.h"
31 #include "util/cpumap.h"
32 #include "util/thread_map.h"
33 #include "util/data.h"
34 #include "util/perf_regs.h"
35 #include "util/auxtrace.h"
36 #include "util/tsc.h"
37 #include "util/parse-branch-options.h"
38 #include "util/parse-regs-options.h"
39 #include "util/perf_api_probe.h"
40 #include "util/trigger.h"
41 #include "util/perf-hooks.h"
42 #include "util/cpu-set-sched.h"
43 #include "util/synthetic-events.h"
44 #include "util/time-utils.h"
45 #include "util/units.h"
46 #include "util/bpf-event.h"
47 #include "util/util.h"
48 #include "util/pfm.h"
49 #include "util/pmu.h"
50 #include "util/pmus.h"
51 #include "util/clockid.h"
52 #include "util/off_cpu.h"
53 #include "util/bpf-filter.h"
54 #include "asm/bug.h"
55 #include "perf.h"
56 #include "cputopo.h"
57 
58 #include <errno.h>
59 #include <inttypes.h>
60 #include <locale.h>
61 #include <poll.h>
62 #include <pthread.h>
63 #include <unistd.h>
64 #ifndef HAVE_GETTID
65 #include <syscall.h>
66 #endif
67 #include <sched.h>
68 #include <signal.h>
69 #ifdef HAVE_EVENTFD_SUPPORT
70 #include <sys/eventfd.h>
71 #endif
72 #include <sys/mman.h>
73 #include <sys/wait.h>
74 #include <sys/types.h>
75 #include <sys/stat.h>
76 #include <fcntl.h>
77 #include <linux/err.h>
78 #include <linux/string.h>
79 #include <linux/time64.h>
80 #include <linux/zalloc.h>
81 #include <linux/bitmap.h>
82 #include <sys/time.h>
83 
/* Settings and bookkeeping for switching the output file while recording. */
struct switch_output {
	bool		 enabled;
	/* Checked by switch_output_signal() together with the trigger state. */
	bool		 signal;
	/* Byte threshold compared with rec->bytes_written; 0 disables the check. */
	unsigned long	 size;
	/* Non-zero enables the check done by switch_output_time(). */
	unsigned long	 time;
	const char	*str;
	bool		 set;
	char		 **filenames;
	int		 num_files;
	int		 cur_file;
};
95 
96 struct thread_mask {
97 	struct mmap_cpu_mask	maps;
98 	struct mmap_cpu_mask	affinity;
99 };
100 
101 struct record_thread {
102 	pid_t			tid;
103 	struct thread_mask	*mask;
104 	struct {
105 		int		msg[2];
106 		int		ack[2];
107 	} pipes;
108 	struct fdarray		pollfd;
109 	int			ctlfd_pos;
110 	int			nr_mmaps;
111 	struct mmap		**maps;
112 	struct mmap		**overwrite_maps;
113 	struct record		*rec;
114 	unsigned long long	samples;
115 	unsigned long		waking;
116 	u64			bytes_written;
117 	u64			bytes_transferred;
118 	u64			bytes_compressed;
119 };
120 
121 static __thread struct record_thread *thread;
122 
/* Message identifiers; THREAD_MSG__MAX is the number of valid values. */
enum thread_msg {
	THREAD_MSG__UNDEFINED = 0,
	THREAD_MSG__READY,
	THREAD_MSG__MAX,
};

/* Printable name for each enum thread_msg value, indexed by the value. */
static const char *thread_msg_tags[THREAD_MSG__MAX] = {
	[THREAD_MSG__UNDEFINED]	= "UNDEFINED",
	[THREAD_MSG__READY]	= "READY",
};
132 
/* Thread specification kinds; THREAD_SPEC__MAX is the number of valid values. */
enum thread_spec {
	THREAD_SPEC__UNDEFINED = 0,
	THREAD_SPEC__CPU,
	THREAD_SPEC__CORE,
	THREAD_SPEC__PACKAGE,
	THREAD_SPEC__NUMA,
	THREAD_SPEC__USER,
	THREAD_SPEC__MAX,
};

/* Name for each enum thread_spec value, indexed by the value. */
static const char *thread_spec_tags[THREAD_SPEC__MAX] = {
	[THREAD_SPEC__UNDEFINED]	= "undefined",
	[THREAD_SPEC__CPU]		= "cpu",
	[THREAD_SPEC__CORE]		= "core",
	[THREAD_SPEC__PACKAGE]		= "package",
	[THREAD_SPEC__NUMA]		= "numa",
	[THREAD_SPEC__USER]		= "user",
};
146 
/*
 * One pairing between a slot of the evlist pollfd array and the slot of the
 * thread pollfd array it was duplicated into.
 */
struct pollfd_index_map {
	int evlist_pollfd_index;	/* index into evlist->core.pollfd */
	int thread_pollfd_index;	/* index into thread_data->pollfd */
};
151 
152 struct record {
153 	struct perf_tool	tool;
154 	struct record_opts	opts;
155 	u64			bytes_written;
156 	u64			thread_bytes_written;
157 	struct perf_data	data;
158 	struct auxtrace_record	*itr;
159 	struct evlist	*evlist;
160 	struct perf_session	*session;
161 	struct evlist		*sb_evlist;
162 	pthread_t		thread_id;
163 	int			realtime_prio;
164 	bool			switch_output_event_set;
165 	bool			no_buildid;
166 	bool			no_buildid_set;
167 	bool			no_buildid_cache;
168 	bool			no_buildid_cache_set;
169 	bool			buildid_all;
170 	bool			buildid_mmap;
171 	bool			timestamp_filename;
172 	bool			timestamp_boundary;
173 	bool			off_cpu;
174 	struct switch_output	switch_output;
175 	unsigned long long	samples;
176 	unsigned long		output_max_size;	/* = 0: unlimited */
177 	struct perf_debuginfod	debuginfod;
178 	int			nr_threads;
179 	struct thread_mask	*thread_masks;
180 	struct record_thread	*thread_data;
181 	struct pollfd_index_map	*index_map;
182 	size_t			index_map_sz;
183 	size_t			index_map_cnt;
184 };
185 
186 static volatile int done;
187 
188 static volatile int auxtrace_record__snapshot_started;
189 static DEFINE_TRIGGER(auxtrace_snapshot_trigger);
190 static DEFINE_TRIGGER(switch_output_trigger);
191 
192 static const char *affinity_tags[PERF_AFFINITY_MAX] = {
193 	"SYS", "NODE", "CPU"
194 };
195 
#ifndef HAVE_GETTID
/* Fallback used when HAVE_GETTID is not defined: issue the raw syscall. */
static inline pid_t gettid(void)
{
	long tid = syscall(__NR_gettid);

	return (pid_t)tid;
}
#endif
202 
203 static int record__threads_enabled(struct record *rec)
204 {
205 	return rec->opts.threads_spec;
206 }
207 
208 static bool switch_output_signal(struct record *rec)
209 {
210 	return rec->switch_output.signal &&
211 	       trigger_is_ready(&switch_output_trigger);
212 }
213 
214 static bool switch_output_size(struct record *rec)
215 {
216 	return rec->switch_output.size &&
217 	       trigger_is_ready(&switch_output_trigger) &&
218 	       (rec->bytes_written >= rec->switch_output.size);
219 }
220 
221 static bool switch_output_time(struct record *rec)
222 {
223 	return rec->switch_output.time &&
224 	       trigger_is_ready(&switch_output_trigger);
225 }
226 
227 static u64 record__bytes_written(struct record *rec)
228 {
229 	return rec->bytes_written + rec->thread_bytes_written;
230 }
231 
232 static bool record__output_max_size_exceeded(struct record *rec)
233 {
234 	return rec->output_max_size &&
235 	       (record__bytes_written(rec) >= rec->output_max_size);
236 }
237 
238 static int record__write(struct record *rec, struct mmap *map __maybe_unused,
239 			 void *bf, size_t size)
240 {
241 	struct perf_data_file *file = &rec->session->data->file;
242 
243 	if (map && map->file)
244 		file = map->file;
245 
246 	if (perf_data_file__write(file, bf, size) < 0) {
247 		pr_err("failed to write perf data, error: %m\n");
248 		return -1;
249 	}
250 
251 	if (map && map->file) {
252 		thread->bytes_written += size;
253 		rec->thread_bytes_written += size;
254 	} else {
255 		rec->bytes_written += size;
256 	}
257 
258 	if (record__output_max_size_exceeded(rec) && !done) {
259 		fprintf(stderr, "[ perf record: perf size limit reached (%" PRIu64 " KB),"
260 				" stopping session ]\n",
261 				record__bytes_written(rec) >> 10);
262 		done = 1;
263 	}
264 
265 	if (switch_output_size(rec))
266 		trigger_hit(&switch_output_trigger);
267 
268 	return 0;
269 }
270 
/* Forward declarations: used by the AIO code below, defined later in the file. */
static int record__aio_enabled(struct record *rec);
static int record__comp_enabled(struct record *rec);
static size_t zstd_compress(struct perf_session *session,
			    struct mmap *map,
			    void *dst, size_t dst_size,
			    void *src, size_t src_size);
275 
276 #ifdef HAVE_AIO_SUPPORT
/*
 * Fill in @cblock and queue an asynchronous write of @size bytes from @buf
 * to @trace_fd at offset @off. The request is re-submitted for as long as
 * aio_write() fails with EAGAIN.
 *
 * Returns the last aio_write() result: 0 when queued, non-zero on any other
 * failure, in which case cblock->aio_fildes is reset to -1.
 */
static int record__aio_write(struct aiocb *cblock, int trace_fd,
		void *buf, size_t size, off_t off)
{
	int rc;

	cblock->aio_fildes = trace_fd;
	cblock->aio_buf    = buf;
	cblock->aio_nbytes = size;
	cblock->aio_offset = off;
	cblock->aio_sigevent.sigev_notify = SIGEV_NONE;

	for (;;) {
		rc = aio_write(cblock);
		if (rc == 0)
			break;

		if (errno == EAGAIN)
			continue;

		cblock->aio_fildes = -1;
		pr_err("failed to queue perf data, error: %m\n");
		break;
	}

	return rc;
}
301 
302 static int record__aio_complete(struct mmap *md, struct aiocb *cblock)
303 {
304 	void *rem_buf;
305 	off_t rem_off;
306 	size_t rem_size;
307 	int rc, aio_errno;
308 	ssize_t aio_ret, written;
309 
310 	aio_errno = aio_error(cblock);
311 	if (aio_errno == EINPROGRESS)
312 		return 0;
313 
314 	written = aio_ret = aio_return(cblock);
315 	if (aio_ret < 0) {
316 		if (aio_errno != EINTR)
317 			pr_err("failed to write perf data, error: %m\n");
318 		written = 0;
319 	}
320 
321 	rem_size = cblock->aio_nbytes - written;
322 
323 	if (rem_size == 0) {
324 		cblock->aio_fildes = -1;
325 		/*
326 		 * md->refcount is incremented in record__aio_pushfn() for
327 		 * every aio write request started in record__aio_push() so
328 		 * decrement it because the request is now complete.
329 		 */
330 		perf_mmap__put(&md->core);
331 		rc = 1;
332 	} else {
333 		/*
334 		 * aio write request may require restart with the
335 		 * reminder if the kernel didn't write whole
336 		 * chunk at once.
337 		 */
338 		rem_off = cblock->aio_offset + written;
339 		rem_buf = (void *)(cblock->aio_buf + written);
340 		record__aio_write(cblock, cblock->aio_fildes,
341 				rem_buf, rem_size, rem_off);
342 		rc = 0;
343 	}
344 
345 	return rc;
346 }
347 
348 static int record__aio_sync(struct mmap *md, bool sync_all)
349 {
350 	struct aiocb **aiocb = md->aio.aiocb;
351 	struct aiocb *cblocks = md->aio.cblocks;
352 	struct timespec timeout = { 0, 1000 * 1000  * 1 }; /* 1ms */
353 	int i, do_suspend;
354 
355 	do {
356 		do_suspend = 0;
357 		for (i = 0; i < md->aio.nr_cblocks; ++i) {
358 			if (cblocks[i].aio_fildes == -1 || record__aio_complete(md, &cblocks[i])) {
359 				if (sync_all)
360 					aiocb[i] = NULL;
361 				else
362 					return i;
363 			} else {
364 				/*
365 				 * Started aio write is not complete yet
366 				 * so it has to be waited before the
367 				 * next allocation.
368 				 */
369 				aiocb[i] = &cblocks[i];
370 				do_suspend = 1;
371 			}
372 		}
373 		if (!do_suspend)
374 			return -1;
375 
376 		while (aio_suspend((const struct aiocb **)aiocb, md->aio.nr_cblocks, &timeout)) {
377 			if (!(errno == EAGAIN || errno == EINTR))
378 				pr_err("failed to sync perf data, error: %m\n");
379 		}
380 	} while (1);
381 }
382 
/* Context handed to record__aio_pushfn() through perf_mmap__push(). */
struct record_aio {
	struct record	*rec;
	void		*data;	/* destination buffer, one of map->aio.data[] */
	size_t		size;	/* bytes stored in 'data' so far */
};
388 
389 static int record__aio_pushfn(struct mmap *map, void *to, void *buf, size_t size)
390 {
391 	struct record_aio *aio = to;
392 
393 	/*
394 	 * map->core.base data pointed by buf is copied into free map->aio.data[] buffer
395 	 * to release space in the kernel buffer as fast as possible, calling
396 	 * perf_mmap__consume() from perf_mmap__push() function.
397 	 *
398 	 * That lets the kernel to proceed with storing more profiling data into
399 	 * the kernel buffer earlier than other per-cpu kernel buffers are handled.
400 	 *
401 	 * Coping can be done in two steps in case the chunk of profiling data
402 	 * crosses the upper bound of the kernel buffer. In this case we first move
403 	 * part of data from map->start till the upper bound and then the reminder
404 	 * from the beginning of the kernel buffer till the end of the data chunk.
405 	 */
406 
407 	if (record__comp_enabled(aio->rec)) {
408 		size = zstd_compress(aio->rec->session, NULL, aio->data + aio->size,
409 				     mmap__mmap_len(map) - aio->size,
410 				     buf, size);
411 	} else {
412 		memcpy(aio->data + aio->size, buf, size);
413 	}
414 
415 	if (!aio->size) {
416 		/*
417 		 * Increment map->refcount to guard map->aio.data[] buffer
418 		 * from premature deallocation because map object can be
419 		 * released earlier than aio write request started on
420 		 * map->aio.data[] buffer is complete.
421 		 *
422 		 * perf_mmap__put() is done at record__aio_complete()
423 		 * after started aio request completion or at record__aio_push()
424 		 * if the request failed to start.
425 		 */
426 		perf_mmap__get(&map->core);
427 	}
428 
429 	aio->size += size;
430 
431 	return size;
432 }
433 
434 static int record__aio_push(struct record *rec, struct mmap *map, off_t *off)
435 {
436 	int ret, idx;
437 	int trace_fd = rec->session->data->file.fd;
438 	struct record_aio aio = { .rec = rec, .size = 0 };
439 
440 	/*
441 	 * Call record__aio_sync() to wait till map->aio.data[] buffer
442 	 * becomes available after previous aio write operation.
443 	 */
444 
445 	idx = record__aio_sync(map, false);
446 	aio.data = map->aio.data[idx];
447 	ret = perf_mmap__push(map, &aio, record__aio_pushfn);
448 	if (ret != 0) /* ret > 0 - no data, ret < 0 - error */
449 		return ret;
450 
451 	rec->samples++;
452 	ret = record__aio_write(&(map->aio.cblocks[idx]), trace_fd, aio.data, aio.size, *off);
453 	if (!ret) {
454 		*off += aio.size;
455 		rec->bytes_written += aio.size;
456 		if (switch_output_size(rec))
457 			trigger_hit(&switch_output_trigger);
458 	} else {
459 		/*
460 		 * Decrement map->refcount incremented in record__aio_pushfn()
461 		 * back if record__aio_write() operation failed to start, otherwise
462 		 * map->refcount is decremented in record__aio_complete() after
463 		 * aio write operation finishes successfully.
464 		 */
465 		perf_mmap__put(&map->core);
466 	}
467 
468 	return ret;
469 }
470 
/* Current file offset of @trace_fd, or (off_t)-1 if lseek() fails. */
static off_t record__aio_get_pos(int trace_fd)
{
	off_t pos = lseek(trace_fd, 0, SEEK_CUR);

	return pos;
}
475 
/* Move the file offset of @trace_fd to @pos; an lseek() failure is not reported. */
static void record__aio_set_pos(int trace_fd, off_t pos)
{
	(void)lseek(trace_fd, pos, SEEK_SET);
}
480 
481 static void record__aio_mmap_read_sync(struct record *rec)
482 {
483 	int i;
484 	struct evlist *evlist = rec->evlist;
485 	struct mmap *maps = evlist->mmap;
486 
487 	if (!record__aio_enabled(rec))
488 		return;
489 
490 	for (i = 0; i < evlist->core.nr_mmaps; i++) {
491 		struct mmap *map = &maps[i];
492 
493 		if (map->core.base)
494 			record__aio_sync(map, true);
495 	}
496 }
497 
498 static int nr_cblocks_default = 1;
499 static int nr_cblocks_max = 4;
500 
501 static int record__aio_parse(const struct option *opt,
502 			     const char *str,
503 			     int unset)
504 {
505 	struct record_opts *opts = (struct record_opts *)opt->value;
506 
507 	if (unset) {
508 		opts->nr_cblocks = 0;
509 	} else {
510 		if (str)
511 			opts->nr_cblocks = strtol(str, NULL, 0);
512 		if (!opts->nr_cblocks)
513 			opts->nr_cblocks = nr_cblocks_default;
514 	}
515 
516 	return 0;
517 }
518 #else /* HAVE_AIO_SUPPORT */
519 static int nr_cblocks_max = 0;
520 
521 static int record__aio_push(struct record *rec __maybe_unused, struct mmap *map __maybe_unused,
522 			    off_t *off __maybe_unused)
523 {
524 	return -1;
525 }
526 
527 static off_t record__aio_get_pos(int trace_fd __maybe_unused)
528 {
529 	return -1;
530 }
531 
532 static void record__aio_set_pos(int trace_fd __maybe_unused, off_t pos __maybe_unused)
533 {
534 }
535 
536 static void record__aio_mmap_read_sync(struct record *rec __maybe_unused)
537 {
538 }
539 #endif
540 
541 static int record__aio_enabled(struct record *rec)
542 {
543 	return rec->opts.nr_cblocks > 0;
544 }
545 
546 #define MMAP_FLUSH_DEFAULT 1
547 static int record__mmap_flush_parse(const struct option *opt,
548 				    const char *str,
549 				    int unset)
550 {
551 	int flush_max;
552 	struct record_opts *opts = (struct record_opts *)opt->value;
553 	static struct parse_tag tags[] = {
554 			{ .tag  = 'B', .mult = 1       },
555 			{ .tag  = 'K', .mult = 1 << 10 },
556 			{ .tag  = 'M', .mult = 1 << 20 },
557 			{ .tag  = 'G', .mult = 1 << 30 },
558 			{ .tag  = 0 },
559 	};
560 
561 	if (unset)
562 		return 0;
563 
564 	if (str) {
565 		opts->mmap_flush = parse_tag_value(str, tags);
566 		if (opts->mmap_flush == (int)-1)
567 			opts->mmap_flush = strtol(str, NULL, 0);
568 	}
569 
570 	if (!opts->mmap_flush)
571 		opts->mmap_flush = MMAP_FLUSH_DEFAULT;
572 
573 	flush_max = evlist__mmap_size(opts->mmap_pages);
574 	flush_max /= 4;
575 	if (opts->mmap_flush > flush_max)
576 		opts->mmap_flush = flush_max;
577 
578 	return 0;
579 }
580 
581 #ifdef HAVE_ZSTD_SUPPORT
582 static unsigned int comp_level_default = 1;
583 
584 static int record__parse_comp_level(const struct option *opt, const char *str, int unset)
585 {
586 	struct record_opts *opts = opt->value;
587 
588 	if (unset) {
589 		opts->comp_level = 0;
590 	} else {
591 		if (str)
592 			opts->comp_level = strtol(str, NULL, 0);
593 		if (!opts->comp_level)
594 			opts->comp_level = comp_level_default;
595 	}
596 
597 	return 0;
598 }
599 #endif
600 static unsigned int comp_level_max = 22;
601 
602 static int record__comp_enabled(struct record *rec)
603 {
604 	return rec->opts.comp_level > 0;
605 }
606 
607 static int process_synthesized_event(struct perf_tool *tool,
608 				     union perf_event *event,
609 				     struct perf_sample *sample __maybe_unused,
610 				     struct machine *machine __maybe_unused)
611 {
612 	struct record *rec = container_of(tool, struct record, tool);
613 	return record__write(rec, NULL, event, event->header.size);
614 }
615 
616 static struct mutex synth_lock;
617 
618 static int process_locked_synthesized_event(struct perf_tool *tool,
619 				     union perf_event *event,
620 				     struct perf_sample *sample __maybe_unused,
621 				     struct machine *machine __maybe_unused)
622 {
623 	int ret;
624 
625 	mutex_lock(&synth_lock);
626 	ret = process_synthesized_event(tool, event, sample, machine);
627 	mutex_unlock(&synth_lock);
628 	return ret;
629 }
630 
631 static int record__pushfn(struct mmap *map, void *to, void *bf, size_t size)
632 {
633 	struct record *rec = to;
634 
635 	if (record__comp_enabled(rec)) {
636 		size = zstd_compress(rec->session, map, map->data, mmap__mmap_len(map), bf, size);
637 		bf   = map->data;
638 	}
639 
640 	thread->samples++;
641 	return record__write(rec, map, bf, size);
642 }
643 
644 static volatile sig_atomic_t signr = -1;
645 static volatile sig_atomic_t child_finished;
646 #ifdef HAVE_EVENTFD_SUPPORT
647 static volatile sig_atomic_t done_fd = -1;
648 #endif
649 
650 static void sig_handler(int sig)
651 {
652 	if (sig == SIGCHLD)
653 		child_finished = 1;
654 	else
655 		signr = sig;
656 
657 	done = 1;
658 #ifdef HAVE_EVENTFD_SUPPORT
659 	if (done_fd >= 0) {
660 		u64 tmp = 1;
661 		int orig_errno = errno;
662 
663 		/*
664 		 * It is possible for this signal handler to run after done is
665 		 * checked in the main loop, but before the perf counter fds are
666 		 * polled. If this happens, the poll() will continue to wait
667 		 * even though done is set, and will only break out if either
668 		 * another signal is received, or the counters are ready for
669 		 * read. To ensure the poll() doesn't sleep when done is set,
670 		 * use an eventfd (done_fd) to wake up the poll().
671 		 */
672 		if (write(done_fd, &tmp, sizeof(tmp)) < 0)
673 			pr_err("failed to signal wakeup fd, error: %m\n");
674 
675 		errno = orig_errno;
676 	}
677 #endif // HAVE_EVENTFD_SUPPORT
678 }
679 
/* Fatal-signal handler: run the perf hooks recovery, then dump the stack. */
static void sigsegv_handler(int sig)
{
	perf_hooks__recover();

	sighandler_dump_stack(sig);
}
685 
686 static void record__sig_exit(void)
687 {
688 	if (signr == -1)
689 		return;
690 
691 	signal(signr, SIG_DFL);
692 	raise(signr);
693 }
694 
695 #ifdef HAVE_AUXTRACE_SUPPORT
696 
697 static int record__process_auxtrace(struct perf_tool *tool,
698 				    struct mmap *map,
699 				    union perf_event *event, void *data1,
700 				    size_t len1, void *data2, size_t len2)
701 {
702 	struct record *rec = container_of(tool, struct record, tool);
703 	struct perf_data *data = &rec->data;
704 	size_t padding;
705 	u8 pad[8] = {0};
706 
707 	if (!perf_data__is_pipe(data) && perf_data__is_single_file(data)) {
708 		off_t file_offset;
709 		int fd = perf_data__fd(data);
710 		int err;
711 
712 		file_offset = lseek(fd, 0, SEEK_CUR);
713 		if (file_offset == -1)
714 			return -1;
715 		err = auxtrace_index__auxtrace_event(&rec->session->auxtrace_index,
716 						     event, file_offset);
717 		if (err)
718 			return err;
719 	}
720 
721 	/* event.auxtrace.size includes padding, see __auxtrace_mmap__read() */
722 	padding = (len1 + len2) & 7;
723 	if (padding)
724 		padding = 8 - padding;
725 
726 	record__write(rec, map, event, event->header.size);
727 	record__write(rec, map, data1, len1);
728 	if (len2)
729 		record__write(rec, map, data2, len2);
730 	record__write(rec, map, &pad, padding);
731 
732 	return 0;
733 }
734 
735 static int record__auxtrace_mmap_read(struct record *rec,
736 				      struct mmap *map)
737 {
738 	int ret;
739 
740 	ret = auxtrace_mmap__read(map, rec->itr, &rec->tool,
741 				  record__process_auxtrace);
742 	if (ret < 0)
743 		return ret;
744 
745 	if (ret)
746 		rec->samples++;
747 
748 	return 0;
749 }
750 
751 static int record__auxtrace_mmap_read_snapshot(struct record *rec,
752 					       struct mmap *map)
753 {
754 	int ret;
755 
756 	ret = auxtrace_mmap__read_snapshot(map, rec->itr, &rec->tool,
757 					   record__process_auxtrace,
758 					   rec->opts.auxtrace_snapshot_size);
759 	if (ret < 0)
760 		return ret;
761 
762 	if (ret)
763 		rec->samples++;
764 
765 	return 0;
766 }
767 
768 static int record__auxtrace_read_snapshot_all(struct record *rec)
769 {
770 	int i;
771 	int rc = 0;
772 
773 	for (i = 0; i < rec->evlist->core.nr_mmaps; i++) {
774 		struct mmap *map = &rec->evlist->mmap[i];
775 
776 		if (!map->auxtrace_mmap.base)
777 			continue;
778 
779 		if (record__auxtrace_mmap_read_snapshot(rec, map) != 0) {
780 			rc = -1;
781 			goto out;
782 		}
783 	}
784 out:
785 	return rc;
786 }
787 
788 static void record__read_auxtrace_snapshot(struct record *rec, bool on_exit)
789 {
790 	pr_debug("Recording AUX area tracing snapshot\n");
791 	if (record__auxtrace_read_snapshot_all(rec) < 0) {
792 		trigger_error(&auxtrace_snapshot_trigger);
793 	} else {
794 		if (auxtrace_record__snapshot_finish(rec->itr, on_exit))
795 			trigger_error(&auxtrace_snapshot_trigger);
796 		else
797 			trigger_ready(&auxtrace_snapshot_trigger);
798 	}
799 }
800 
801 static int record__auxtrace_snapshot_exit(struct record *rec)
802 {
803 	if (trigger_is_error(&auxtrace_snapshot_trigger))
804 		return 0;
805 
806 	if (!auxtrace_record__snapshot_started &&
807 	    auxtrace_record__snapshot_start(rec->itr))
808 		return -1;
809 
810 	record__read_auxtrace_snapshot(rec, true);
811 	if (trigger_is_error(&auxtrace_snapshot_trigger))
812 		return -1;
813 
814 	return 0;
815 }
816 
817 static int record__auxtrace_init(struct record *rec)
818 {
819 	int err;
820 
821 	if ((rec->opts.auxtrace_snapshot_opts || rec->opts.auxtrace_sample_opts)
822 	    && record__threads_enabled(rec)) {
823 		pr_err("AUX area tracing options are not available in parallel streaming mode.\n");
824 		return -EINVAL;
825 	}
826 
827 	if (!rec->itr) {
828 		rec->itr = auxtrace_record__init(rec->evlist, &err);
829 		if (err)
830 			return err;
831 	}
832 
833 	err = auxtrace_parse_snapshot_options(rec->itr, &rec->opts,
834 					      rec->opts.auxtrace_snapshot_opts);
835 	if (err)
836 		return err;
837 
838 	err = auxtrace_parse_sample_options(rec->itr, rec->evlist, &rec->opts,
839 					    rec->opts.auxtrace_sample_opts);
840 	if (err)
841 		return err;
842 
843 	auxtrace_regroup_aux_output(rec->evlist);
844 
845 	return auxtrace_parse_filters(rec->evlist);
846 }
847 
848 #else
849 
850 static inline
851 int record__auxtrace_mmap_read(struct record *rec __maybe_unused,
852 			       struct mmap *map __maybe_unused)
853 {
854 	return 0;
855 }
856 
857 static inline
858 void record__read_auxtrace_snapshot(struct record *rec __maybe_unused,
859 				    bool on_exit __maybe_unused)
860 {
861 }
862 
863 static inline
864 int auxtrace_record__snapshot_start(struct auxtrace_record *itr __maybe_unused)
865 {
866 	return 0;
867 }
868 
869 static inline
870 int record__auxtrace_snapshot_exit(struct record *rec __maybe_unused)
871 {
872 	return 0;
873 }
874 
875 static int record__auxtrace_init(struct record *rec __maybe_unused)
876 {
877 	return 0;
878 }
879 
880 #endif
881 
882 static int record__config_text_poke(struct evlist *evlist)
883 {
884 	struct evsel *evsel;
885 
886 	/* Nothing to do if text poke is already configured */
887 	evlist__for_each_entry(evlist, evsel) {
888 		if (evsel->core.attr.text_poke)
889 			return 0;
890 	}
891 
892 	evsel = evlist__add_dummy_on_all_cpus(evlist);
893 	if (!evsel)
894 		return -ENOMEM;
895 
896 	evsel->core.attr.text_poke = 1;
897 	evsel->core.attr.ksymbol = 1;
898 	evsel->immediate = true;
899 	evsel__set_sample_bit(evsel, TIME);
900 
901 	return 0;
902 }
903 
904 static int record__config_off_cpu(struct record *rec)
905 {
906 	return off_cpu_prepare(rec->evlist, &rec->opts.target, &rec->opts);
907 }
908 
909 static int record__config_tracking_events(struct record *rec)
910 {
911 	struct record_opts *opts = &rec->opts;
912 	struct evlist *evlist = rec->evlist;
913 	struct evsel *evsel;
914 
915 	/*
916 	 * For initial_delay, system wide or a hybrid system, we need to add
917 	 * tracking event so that we can track PERF_RECORD_MMAP to cover the
918 	 * delay of waiting or event synthesis.
919 	 */
920 	if (opts->target.initial_delay || target__has_cpu(&opts->target) ||
921 	    perf_pmus__num_core_pmus() > 1) {
922 		evsel = evlist__findnew_tracking_event(evlist, false);
923 		if (!evsel)
924 			return -ENOMEM;
925 
926 		/*
927 		 * Enable the tracking event when the process is forked for
928 		 * initial_delay, immediately for system wide.
929 		 */
930 		if (opts->target.initial_delay && !evsel->immediate &&
931 		    !target__has_cpu(&opts->target))
932 			evsel->core.attr.enable_on_exec = 1;
933 		else
934 			evsel->immediate = 1;
935 	}
936 
937 	return 0;
938 }
939 
940 static bool record__kcore_readable(struct machine *machine)
941 {
942 	char kcore[PATH_MAX];
943 	int fd;
944 
945 	scnprintf(kcore, sizeof(kcore), "%s/proc/kcore", machine->root_dir);
946 
947 	fd = open(kcore, O_RDONLY);
948 	if (fd < 0)
949 		return false;
950 
951 	close(fd);
952 
953 	return true;
954 }
955 
956 static int record__kcore_copy(struct machine *machine, struct perf_data *data)
957 {
958 	char from_dir[PATH_MAX];
959 	char kcore_dir[PATH_MAX];
960 	int ret;
961 
962 	snprintf(from_dir, sizeof(from_dir), "%s/proc", machine->root_dir);
963 
964 	ret = perf_data__make_kcore_dir(data, kcore_dir, sizeof(kcore_dir));
965 	if (ret)
966 		return ret;
967 
968 	return kcore_copy(from_dir, kcore_dir);
969 }
970 
971 static void record__thread_data_init_pipes(struct record_thread *thread_data)
972 {
973 	thread_data->pipes.msg[0] = -1;
974 	thread_data->pipes.msg[1] = -1;
975 	thread_data->pipes.ack[0] = -1;
976 	thread_data->pipes.ack[1] = -1;
977 }
978 
979 static int record__thread_data_open_pipes(struct record_thread *thread_data)
980 {
981 	if (pipe(thread_data->pipes.msg))
982 		return -EINVAL;
983 
984 	if (pipe(thread_data->pipes.ack)) {
985 		close(thread_data->pipes.msg[0]);
986 		thread_data->pipes.msg[0] = -1;
987 		close(thread_data->pipes.msg[1]);
988 		thread_data->pipes.msg[1] = -1;
989 		return -EINVAL;
990 	}
991 
992 	pr_debug2("thread_data[%p]: msg=[%d,%d], ack=[%d,%d]\n", thread_data,
993 		 thread_data->pipes.msg[0], thread_data->pipes.msg[1],
994 		 thread_data->pipes.ack[0], thread_data->pipes.ack[1]);
995 
996 	return 0;
997 }
998 
999 static void record__thread_data_close_pipes(struct record_thread *thread_data)
1000 {
1001 	if (thread_data->pipes.msg[0] != -1) {
1002 		close(thread_data->pipes.msg[0]);
1003 		thread_data->pipes.msg[0] = -1;
1004 	}
1005 	if (thread_data->pipes.msg[1] != -1) {
1006 		close(thread_data->pipes.msg[1]);
1007 		thread_data->pipes.msg[1] = -1;
1008 	}
1009 	if (thread_data->pipes.ack[0] != -1) {
1010 		close(thread_data->pipes.ack[0]);
1011 		thread_data->pipes.ack[0] = -1;
1012 	}
1013 	if (thread_data->pipes.ack[1] != -1) {
1014 		close(thread_data->pipes.ack[1]);
1015 		thread_data->pipes.ack[1] = -1;
1016 	}
1017 }
1018 
1019 static bool evlist__per_thread(struct evlist *evlist)
1020 {
1021 	return cpu_map__is_dummy(evlist->core.user_requested_cpus);
1022 }
1023 
1024 static int record__thread_data_init_maps(struct record_thread *thread_data, struct evlist *evlist)
1025 {
1026 	int m, tm, nr_mmaps = evlist->core.nr_mmaps;
1027 	struct mmap *mmap = evlist->mmap;
1028 	struct mmap *overwrite_mmap = evlist->overwrite_mmap;
1029 	struct perf_cpu_map *cpus = evlist->core.all_cpus;
1030 	bool per_thread = evlist__per_thread(evlist);
1031 
1032 	if (per_thread)
1033 		thread_data->nr_mmaps = nr_mmaps;
1034 	else
1035 		thread_data->nr_mmaps = bitmap_weight(thread_data->mask->maps.bits,
1036 						      thread_data->mask->maps.nbits);
1037 	if (mmap) {
1038 		thread_data->maps = zalloc(thread_data->nr_mmaps * sizeof(struct mmap *));
1039 		if (!thread_data->maps)
1040 			return -ENOMEM;
1041 	}
1042 	if (overwrite_mmap) {
1043 		thread_data->overwrite_maps = zalloc(thread_data->nr_mmaps * sizeof(struct mmap *));
1044 		if (!thread_data->overwrite_maps) {
1045 			zfree(&thread_data->maps);
1046 			return -ENOMEM;
1047 		}
1048 	}
1049 	pr_debug2("thread_data[%p]: nr_mmaps=%d, maps=%p, ow_maps=%p\n", thread_data,
1050 		 thread_data->nr_mmaps, thread_data->maps, thread_data->overwrite_maps);
1051 
1052 	for (m = 0, tm = 0; m < nr_mmaps && tm < thread_data->nr_mmaps; m++) {
1053 		if (per_thread ||
1054 		    test_bit(perf_cpu_map__cpu(cpus, m).cpu, thread_data->mask->maps.bits)) {
1055 			if (thread_data->maps) {
1056 				thread_data->maps[tm] = &mmap[m];
1057 				pr_debug2("thread_data[%p]: cpu%d: maps[%d] -> mmap[%d]\n",
1058 					  thread_data, perf_cpu_map__cpu(cpus, m).cpu, tm, m);
1059 			}
1060 			if (thread_data->overwrite_maps) {
1061 				thread_data->overwrite_maps[tm] = &overwrite_mmap[m];
1062 				pr_debug2("thread_data[%p]: cpu%d: ow_maps[%d] -> ow_mmap[%d]\n",
1063 					  thread_data, perf_cpu_map__cpu(cpus, m).cpu, tm, m);
1064 			}
1065 			tm++;
1066 		}
1067 	}
1068 
1069 	return 0;
1070 }
1071 
1072 static int record__thread_data_init_pollfd(struct record_thread *thread_data, struct evlist *evlist)
1073 {
1074 	int f, tm, pos;
1075 	struct mmap *map, *overwrite_map;
1076 
1077 	fdarray__init(&thread_data->pollfd, 64);
1078 
1079 	for (tm = 0; tm < thread_data->nr_mmaps; tm++) {
1080 		map = thread_data->maps ? thread_data->maps[tm] : NULL;
1081 		overwrite_map = thread_data->overwrite_maps ?
1082 				thread_data->overwrite_maps[tm] : NULL;
1083 
1084 		for (f = 0; f < evlist->core.pollfd.nr; f++) {
1085 			void *ptr = evlist->core.pollfd.priv[f].ptr;
1086 
1087 			if ((map && ptr == map) || (overwrite_map && ptr == overwrite_map)) {
1088 				pos = fdarray__dup_entry_from(&thread_data->pollfd, f,
1089 							      &evlist->core.pollfd);
1090 				if (pos < 0)
1091 					return pos;
1092 				pr_debug2("thread_data[%p]: pollfd[%d] <- event_fd=%d\n",
1093 					 thread_data, pos, evlist->core.pollfd.entries[f].fd);
1094 			}
1095 		}
1096 	}
1097 
1098 	return 0;
1099 }
1100 
1101 static void record__free_thread_data(struct record *rec)
1102 {
1103 	int t;
1104 	struct record_thread *thread_data = rec->thread_data;
1105 
1106 	if (thread_data == NULL)
1107 		return;
1108 
1109 	for (t = 0; t < rec->nr_threads; t++) {
1110 		record__thread_data_close_pipes(&thread_data[t]);
1111 		zfree(&thread_data[t].maps);
1112 		zfree(&thread_data[t].overwrite_maps);
1113 		fdarray__exit(&thread_data[t].pollfd);
1114 	}
1115 
1116 	zfree(&rec->thread_data);
1117 }
1118 
1119 static int record__map_thread_evlist_pollfd_indexes(struct record *rec,
1120 						    int evlist_pollfd_index,
1121 						    int thread_pollfd_index)
1122 {
1123 	size_t x = rec->index_map_cnt;
1124 
1125 	if (realloc_array_as_needed(rec->index_map, rec->index_map_sz, x, NULL))
1126 		return -ENOMEM;
1127 	rec->index_map[x].evlist_pollfd_index = evlist_pollfd_index;
1128 	rec->index_map[x].thread_pollfd_index = thread_pollfd_index;
1129 	rec->index_map_cnt += 1;
1130 	return 0;
1131 }
1132 
1133 static int record__update_evlist_pollfd_from_thread(struct record *rec,
1134 						    struct evlist *evlist,
1135 						    struct record_thread *thread_data)
1136 {
1137 	struct pollfd *e_entries = evlist->core.pollfd.entries;
1138 	struct pollfd *t_entries = thread_data->pollfd.entries;
1139 	int err = 0;
1140 	size_t i;
1141 
1142 	for (i = 0; i < rec->index_map_cnt; i++) {
1143 		int e_pos = rec->index_map[i].evlist_pollfd_index;
1144 		int t_pos = rec->index_map[i].thread_pollfd_index;
1145 
1146 		if (e_entries[e_pos].fd != t_entries[t_pos].fd ||
1147 		    e_entries[e_pos].events != t_entries[t_pos].events) {
1148 			pr_err("Thread and evlist pollfd index mismatch\n");
1149 			err = -EINVAL;
1150 			continue;
1151 		}
1152 		e_entries[e_pos].revents = t_entries[t_pos].revents;
1153 	}
1154 	return err;
1155 }
1156 
1157 static int record__dup_non_perf_events(struct record *rec,
1158 				       struct evlist *evlist,
1159 				       struct record_thread *thread_data)
1160 {
1161 	struct fdarray *fda = &evlist->core.pollfd;
1162 	int i, ret;
1163 
1164 	for (i = 0; i < fda->nr; i++) {
1165 		if (!(fda->priv[i].flags & fdarray_flag__non_perf_event))
1166 			continue;
1167 		ret = fdarray__dup_entry_from(&thread_data->pollfd, i, fda);
1168 		if (ret < 0) {
1169 			pr_err("Failed to duplicate descriptor in main thread pollfd\n");
1170 			return ret;
1171 		}
1172 		pr_debug2("thread_data[%p]: pollfd[%d] <- non_perf_event fd=%d\n",
1173 			  thread_data, ret, fda->entries[i].fd);
1174 		ret = record__map_thread_evlist_pollfd_indexes(rec, i, ret);
1175 		if (ret < 0) {
1176 			pr_err("Failed to map thread and evlist pollfd indexes\n");
1177 			return ret;
1178 		}
1179 	}
1180 	return 0;
1181 }
1182 
1183 static int record__alloc_thread_data(struct record *rec, struct evlist *evlist)
1184 {
1185 	int t, ret;
1186 	struct record_thread *thread_data;
1187 
1188 	rec->thread_data = zalloc(rec->nr_threads * sizeof(*(rec->thread_data)));
1189 	if (!rec->thread_data) {
1190 		pr_err("Failed to allocate thread data\n");
1191 		return -ENOMEM;
1192 	}
1193 	thread_data = rec->thread_data;
1194 
1195 	for (t = 0; t < rec->nr_threads; t++)
1196 		record__thread_data_init_pipes(&thread_data[t]);
1197 
1198 	for (t = 0; t < rec->nr_threads; t++) {
1199 		thread_data[t].rec = rec;
1200 		thread_data[t].mask = &rec->thread_masks[t];
1201 		ret = record__thread_data_init_maps(&thread_data[t], evlist);
1202 		if (ret) {
1203 			pr_err("Failed to initialize thread[%d] maps\n", t);
1204 			goto out_free;
1205 		}
1206 		ret = record__thread_data_init_pollfd(&thread_data[t], evlist);
1207 		if (ret) {
1208 			pr_err("Failed to initialize thread[%d] pollfd\n", t);
1209 			goto out_free;
1210 		}
1211 		if (t) {
1212 			thread_data[t].tid = -1;
1213 			ret = record__thread_data_open_pipes(&thread_data[t]);
1214 			if (ret) {
1215 				pr_err("Failed to open thread[%d] communication pipes\n", t);
1216 				goto out_free;
1217 			}
1218 			ret = fdarray__add(&thread_data[t].pollfd, thread_data[t].pipes.msg[0],
1219 					   POLLIN | POLLERR | POLLHUP, fdarray_flag__nonfilterable);
1220 			if (ret < 0) {
1221 				pr_err("Failed to add descriptor to thread[%d] pollfd\n", t);
1222 				goto out_free;
1223 			}
1224 			thread_data[t].ctlfd_pos = ret;
1225 			pr_debug2("thread_data[%p]: pollfd[%d] <- ctl_fd=%d\n",
1226 				 thread_data, thread_data[t].ctlfd_pos,
1227 				 thread_data[t].pipes.msg[0]);
1228 		} else {
1229 			thread_data[t].tid = gettid();
1230 
1231 			ret = record__dup_non_perf_events(rec, evlist, &thread_data[t]);
1232 			if (ret < 0)
1233 				goto out_free;
1234 
1235 			thread_data[t].ctlfd_pos = -1; /* Not used */
1236 		}
1237 	}
1238 
1239 	return 0;
1240 
1241 out_free:
1242 	record__free_thread_data(rec);
1243 
1244 	return ret;
1245 }
1246 
1247 static int record__mmap_evlist(struct record *rec,
1248 			       struct evlist *evlist)
1249 {
1250 	int i, ret;
1251 	struct record_opts *opts = &rec->opts;
1252 	bool auxtrace_overwrite = opts->auxtrace_snapshot_mode ||
1253 				  opts->auxtrace_sample_mode;
1254 	char msg[512];
1255 
1256 	if (opts->affinity != PERF_AFFINITY_SYS)
1257 		cpu__setup_cpunode_map();
1258 
1259 	if (evlist__mmap_ex(evlist, opts->mmap_pages,
1260 				 opts->auxtrace_mmap_pages,
1261 				 auxtrace_overwrite,
1262 				 opts->nr_cblocks, opts->affinity,
1263 				 opts->mmap_flush, opts->comp_level) < 0) {
1264 		if (errno == EPERM) {
1265 			pr_err("Permission error mapping pages.\n"
1266 			       "Consider increasing "
1267 			       "/proc/sys/kernel/perf_event_mlock_kb,\n"
1268 			       "or try again with a smaller value of -m/--mmap_pages.\n"
1269 			       "(current value: %u,%u)\n",
1270 			       opts->mmap_pages, opts->auxtrace_mmap_pages);
1271 			return -errno;
1272 		} else {
1273 			pr_err("failed to mmap with %d (%s)\n", errno,
1274 				str_error_r(errno, msg, sizeof(msg)));
1275 			if (errno)
1276 				return -errno;
1277 			else
1278 				return -EINVAL;
1279 		}
1280 	}
1281 
1282 	if (evlist__initialize_ctlfd(evlist, opts->ctl_fd, opts->ctl_fd_ack))
1283 		return -1;
1284 
1285 	ret = record__alloc_thread_data(rec, evlist);
1286 	if (ret)
1287 		return ret;
1288 
1289 	if (record__threads_enabled(rec)) {
1290 		ret = perf_data__create_dir(&rec->data, evlist->core.nr_mmaps);
1291 		if (ret) {
1292 			pr_err("Failed to create data directory: %s\n", strerror(-ret));
1293 			return ret;
1294 		}
1295 		for (i = 0; i < evlist->core.nr_mmaps; i++) {
1296 			if (evlist->mmap)
1297 				evlist->mmap[i].file = &rec->data.dir.files[i];
1298 			if (evlist->overwrite_mmap)
1299 				evlist->overwrite_mmap[i].file = &rec->data.dir.files[i];
1300 		}
1301 	}
1302 
1303 	return 0;
1304 }
1305 
1306 static int record__mmap(struct record *rec)
1307 {
1308 	return record__mmap_evlist(rec, rec->evlist);
1309 }
1310 
1311 static int record__open(struct record *rec)
1312 {
1313 	char msg[BUFSIZ];
1314 	struct evsel *pos;
1315 	struct evlist *evlist = rec->evlist;
1316 	struct perf_session *session = rec->session;
1317 	struct record_opts *opts = &rec->opts;
1318 	int rc = 0;
1319 
1320 	evlist__for_each_entry(evlist, pos) {
1321 try_again:
1322 		if (evsel__open(pos, pos->core.cpus, pos->core.threads) < 0) {
1323 			if (evsel__fallback(pos, errno, msg, sizeof(msg))) {
1324 				if (verbose > 0)
1325 					ui__warning("%s\n", msg);
1326 				goto try_again;
1327 			}
1328 			if ((errno == EINVAL || errno == EBADF) &&
1329 			    pos->core.leader != &pos->core &&
1330 			    pos->weak_group) {
1331 			        pos = evlist__reset_weak_group(evlist, pos, true);
1332 				goto try_again;
1333 			}
1334 			rc = -errno;
1335 			evsel__open_strerror(pos, &opts->target, errno, msg, sizeof(msg));
1336 			ui__error("%s\n", msg);
1337 			goto out;
1338 		}
1339 
1340 		pos->supported = true;
1341 	}
1342 
1343 	if (symbol_conf.kptr_restrict && !evlist__exclude_kernel(evlist)) {
1344 		pr_warning(
1345 "WARNING: Kernel address maps (/proc/{kallsyms,modules}) are restricted,\n"
1346 "check /proc/sys/kernel/kptr_restrict and /proc/sys/kernel/perf_event_paranoid.\n\n"
1347 "Samples in kernel functions may not be resolved if a suitable vmlinux\n"
1348 "file is not found in the buildid cache or in the vmlinux path.\n\n"
1349 "Samples in kernel modules won't be resolved at all.\n\n"
1350 "If some relocation was applied (e.g. kexec) symbols may be misresolved\n"
1351 "even with a suitable vmlinux or kallsyms file.\n\n");
1352 	}
1353 
1354 	if (evlist__apply_filters(evlist, &pos)) {
1355 		pr_err("failed to set filter \"%s\" on event %s with %d (%s)\n",
1356 			pos->filter ?: "BPF", evsel__name(pos), errno,
1357 			str_error_r(errno, msg, sizeof(msg)));
1358 		rc = -1;
1359 		goto out;
1360 	}
1361 
1362 	rc = record__mmap(rec);
1363 	if (rc)
1364 		goto out;
1365 
1366 	session->evlist = evlist;
1367 	perf_session__set_id_hdr_size(session);
1368 out:
1369 	return rc;
1370 }
1371 
1372 static void set_timestamp_boundary(struct record *rec, u64 sample_time)
1373 {
1374 	if (rec->evlist->first_sample_time == 0)
1375 		rec->evlist->first_sample_time = sample_time;
1376 
1377 	if (sample_time)
1378 		rec->evlist->last_sample_time = sample_time;
1379 }
1380 
1381 static int process_sample_event(struct perf_tool *tool,
1382 				union perf_event *event,
1383 				struct perf_sample *sample,
1384 				struct evsel *evsel,
1385 				struct machine *machine)
1386 {
1387 	struct record *rec = container_of(tool, struct record, tool);
1388 
1389 	set_timestamp_boundary(rec, sample->time);
1390 
1391 	if (rec->buildid_all)
1392 		return 0;
1393 
1394 	rec->samples++;
1395 	return build_id__mark_dso_hit(tool, event, sample, evsel, machine);
1396 }
1397 
1398 static int process_buildids(struct record *rec)
1399 {
1400 	struct perf_session *session = rec->session;
1401 
1402 	if (perf_data__size(&rec->data) == 0)
1403 		return 0;
1404 
1405 	/*
1406 	 * During this process, it'll load kernel map and replace the
1407 	 * dso->long_name to a real pathname it found.  In this case
1408 	 * we prefer the vmlinux path like
1409 	 *   /lib/modules/3.16.4/build/vmlinux
1410 	 *
1411 	 * rather than build-id path (in debug directory).
1412 	 *   $HOME/.debug/.build-id/f0/6e17aa50adf4d00b88925e03775de107611551
1413 	 */
1414 	symbol_conf.ignore_vmlinux_buildid = true;
1415 
1416 	/*
1417 	 * If --buildid-all is given, it marks all DSO regardless of hits,
1418 	 * so no need to process samples. But if timestamp_boundary is enabled,
1419 	 * it still needs to walk on all samples to get the timestamps of
1420 	 * first/last samples.
1421 	 */
1422 	if (rec->buildid_all && !rec->timestamp_boundary)
1423 		rec->tool.sample = NULL;
1424 
1425 	return perf_session__process_events(session);
1426 }
1427 
1428 static void perf_event__synthesize_guest_os(struct machine *machine, void *data)
1429 {
1430 	int err;
1431 	struct perf_tool *tool = data;
1432 	/*
1433 	 *As for guest kernel when processing subcommand record&report,
1434 	 *we arrange module mmap prior to guest kernel mmap and trigger
1435 	 *a preload dso because default guest module symbols are loaded
1436 	 *from guest kallsyms instead of /lib/modules/XXX/XXX. This
1437 	 *method is used to avoid symbol missing when the first addr is
1438 	 *in module instead of in guest kernel.
1439 	 */
1440 	err = perf_event__synthesize_modules(tool, process_synthesized_event,
1441 					     machine);
1442 	if (err < 0)
1443 		pr_err("Couldn't record guest kernel [%d]'s reference"
1444 		       " relocation symbol.\n", machine->pid);
1445 
1446 	/*
1447 	 * We use _stext for guest kernel because guest kernel's /proc/kallsyms
1448 	 * have no _text sometimes.
1449 	 */
1450 	err = perf_event__synthesize_kernel_mmap(tool, process_synthesized_event,
1451 						 machine);
1452 	if (err < 0)
1453 		pr_err("Couldn't record guest kernel [%d]'s reference"
1454 		       " relocation symbol.\n", machine->pid);
1455 }
1456 
1457 static struct perf_event_header finished_round_event = {
1458 	.size = sizeof(struct perf_event_header),
1459 	.type = PERF_RECORD_FINISHED_ROUND,
1460 };
1461 
1462 static struct perf_event_header finished_init_event = {
1463 	.size = sizeof(struct perf_event_header),
1464 	.type = PERF_RECORD_FINISHED_INIT,
1465 };
1466 
1467 static void record__adjust_affinity(struct record *rec, struct mmap *map)
1468 {
1469 	if (rec->opts.affinity != PERF_AFFINITY_SYS &&
1470 	    !bitmap_equal(thread->mask->affinity.bits, map->affinity_mask.bits,
1471 			  thread->mask->affinity.nbits)) {
1472 		bitmap_zero(thread->mask->affinity.bits, thread->mask->affinity.nbits);
1473 		bitmap_or(thread->mask->affinity.bits, thread->mask->affinity.bits,
1474 			  map->affinity_mask.bits, thread->mask->affinity.nbits);
1475 		sched_setaffinity(0, MMAP_CPU_MASK_BYTES(&thread->mask->affinity),
1476 					(cpu_set_t *)thread->mask->affinity.bits);
1477 		if (verbose == 2) {
1478 			pr_debug("threads[%d]: running on cpu%d: ", thread->tid, sched_getcpu());
1479 			mmap_cpu_mask__scnprintf(&thread->mask->affinity, "affinity");
1480 		}
1481 	}
1482 }
1483 
1484 static size_t process_comp_header(void *record, size_t increment)
1485 {
1486 	struct perf_record_compressed *event = record;
1487 	size_t size = sizeof(*event);
1488 
1489 	if (increment) {
1490 		event->header.size += increment;
1491 		return increment;
1492 	}
1493 
1494 	event->header.type = PERF_RECORD_COMPRESSED;
1495 	event->header.size = size;
1496 
1497 	return size;
1498 }
1499 
1500 static size_t zstd_compress(struct perf_session *session, struct mmap *map,
1501 			    void *dst, size_t dst_size, void *src, size_t src_size)
1502 {
1503 	size_t compressed;
1504 	size_t max_record_size = PERF_SAMPLE_MAX_SIZE - sizeof(struct perf_record_compressed) - 1;
1505 	struct zstd_data *zstd_data = &session->zstd_data;
1506 
1507 	if (map && map->file)
1508 		zstd_data = &map->zstd_data;
1509 
1510 	compressed = zstd_compress_stream_to_records(zstd_data, dst, dst_size, src, src_size,
1511 						     max_record_size, process_comp_header);
1512 
1513 	if (map && map->file) {
1514 		thread->bytes_transferred += src_size;
1515 		thread->bytes_compressed  += compressed;
1516 	} else {
1517 		session->bytes_transferred += src_size;
1518 		session->bytes_compressed  += compressed;
1519 	}
1520 
1521 	return compressed;
1522 }
1523 
1524 static int record__mmap_read_evlist(struct record *rec, struct evlist *evlist,
1525 				    bool overwrite, bool synch)
1526 {
1527 	u64 bytes_written = rec->bytes_written;
1528 	int i;
1529 	int rc = 0;
1530 	int nr_mmaps;
1531 	struct mmap **maps;
1532 	int trace_fd = rec->data.file.fd;
1533 	off_t off = 0;
1534 
1535 	if (!evlist)
1536 		return 0;
1537 
1538 	nr_mmaps = thread->nr_mmaps;
1539 	maps = overwrite ? thread->overwrite_maps : thread->maps;
1540 
1541 	if (!maps)
1542 		return 0;
1543 
1544 	if (overwrite && evlist->bkw_mmap_state != BKW_MMAP_DATA_PENDING)
1545 		return 0;
1546 
1547 	if (record__aio_enabled(rec))
1548 		off = record__aio_get_pos(trace_fd);
1549 
1550 	for (i = 0; i < nr_mmaps; i++) {
1551 		u64 flush = 0;
1552 		struct mmap *map = maps[i];
1553 
1554 		if (map->core.base) {
1555 			record__adjust_affinity(rec, map);
1556 			if (synch) {
1557 				flush = map->core.flush;
1558 				map->core.flush = 1;
1559 			}
1560 			if (!record__aio_enabled(rec)) {
1561 				if (perf_mmap__push(map, rec, record__pushfn) < 0) {
1562 					if (synch)
1563 						map->core.flush = flush;
1564 					rc = -1;
1565 					goto out;
1566 				}
1567 			} else {
1568 				if (record__aio_push(rec, map, &off) < 0) {
1569 					record__aio_set_pos(trace_fd, off);
1570 					if (synch)
1571 						map->core.flush = flush;
1572 					rc = -1;
1573 					goto out;
1574 				}
1575 			}
1576 			if (synch)
1577 				map->core.flush = flush;
1578 		}
1579 
1580 		if (map->auxtrace_mmap.base && !rec->opts.auxtrace_snapshot_mode &&
1581 		    !rec->opts.auxtrace_sample_mode &&
1582 		    record__auxtrace_mmap_read(rec, map) != 0) {
1583 			rc = -1;
1584 			goto out;
1585 		}
1586 	}
1587 
1588 	if (record__aio_enabled(rec))
1589 		record__aio_set_pos(trace_fd, off);
1590 
1591 	/*
1592 	 * Mark the round finished in case we wrote
1593 	 * at least one event.
1594 	 *
1595 	 * No need for round events in directory mode,
1596 	 * because per-cpu maps and files have data
1597 	 * sorted by kernel.
1598 	 */
1599 	if (!record__threads_enabled(rec) && bytes_written != rec->bytes_written)
1600 		rc = record__write(rec, NULL, &finished_round_event, sizeof(finished_round_event));
1601 
1602 	if (overwrite)
1603 		evlist__toggle_bkw_mmap(evlist, BKW_MMAP_EMPTY);
1604 out:
1605 	return rc;
1606 }
1607 
1608 static int record__mmap_read_all(struct record *rec, bool synch)
1609 {
1610 	int err;
1611 
1612 	err = record__mmap_read_evlist(rec, rec->evlist, false, synch);
1613 	if (err)
1614 		return err;
1615 
1616 	return record__mmap_read_evlist(rec, rec->evlist, true, synch);
1617 }
1618 
1619 static void record__thread_munmap_filtered(struct fdarray *fda, int fd,
1620 					   void *arg __maybe_unused)
1621 {
1622 	struct perf_mmap *map = fda->priv[fd].ptr;
1623 
1624 	if (map)
1625 		perf_mmap__put(map);
1626 }
1627 
1628 static void *record__thread(void *arg)
1629 {
1630 	enum thread_msg msg = THREAD_MSG__READY;
1631 	bool terminate = false;
1632 	struct fdarray *pollfd;
1633 	int err, ctlfd_pos;
1634 
1635 	thread = arg;
1636 	thread->tid = gettid();
1637 
1638 	err = write(thread->pipes.ack[1], &msg, sizeof(msg));
1639 	if (err == -1)
1640 		pr_warning("threads[%d]: failed to notify on start: %s\n",
1641 			   thread->tid, strerror(errno));
1642 
1643 	pr_debug("threads[%d]: started on cpu%d\n", thread->tid, sched_getcpu());
1644 
1645 	pollfd = &thread->pollfd;
1646 	ctlfd_pos = thread->ctlfd_pos;
1647 
1648 	for (;;) {
1649 		unsigned long long hits = thread->samples;
1650 
1651 		if (record__mmap_read_all(thread->rec, false) < 0 || terminate)
1652 			break;
1653 
1654 		if (hits == thread->samples) {
1655 
1656 			err = fdarray__poll(pollfd, -1);
1657 			/*
1658 			 * Propagate error, only if there's any. Ignore positive
1659 			 * number of returned events and interrupt error.
1660 			 */
1661 			if (err > 0 || (err < 0 && errno == EINTR))
1662 				err = 0;
1663 			thread->waking++;
1664 
1665 			if (fdarray__filter(pollfd, POLLERR | POLLHUP,
1666 					    record__thread_munmap_filtered, NULL) == 0)
1667 				break;
1668 		}
1669 
1670 		if (pollfd->entries[ctlfd_pos].revents & POLLHUP) {
1671 			terminate = true;
1672 			close(thread->pipes.msg[0]);
1673 			thread->pipes.msg[0] = -1;
1674 			pollfd->entries[ctlfd_pos].fd = -1;
1675 			pollfd->entries[ctlfd_pos].events = 0;
1676 		}
1677 
1678 		pollfd->entries[ctlfd_pos].revents = 0;
1679 	}
1680 	record__mmap_read_all(thread->rec, true);
1681 
1682 	err = write(thread->pipes.ack[1], &msg, sizeof(msg));
1683 	if (err == -1)
1684 		pr_warning("threads[%d]: failed to notify on termination: %s\n",
1685 			   thread->tid, strerror(errno));
1686 
1687 	return NULL;
1688 }
1689 
1690 static void record__init_features(struct record *rec)
1691 {
1692 	struct perf_session *session = rec->session;
1693 	int feat;
1694 
1695 	for (feat = HEADER_FIRST_FEATURE; feat < HEADER_LAST_FEATURE; feat++)
1696 		perf_header__set_feat(&session->header, feat);
1697 
1698 	if (rec->no_buildid)
1699 		perf_header__clear_feat(&session->header, HEADER_BUILD_ID);
1700 
1701 #ifdef HAVE_LIBTRACEEVENT
1702 	if (!have_tracepoints(&rec->evlist->core.entries))
1703 		perf_header__clear_feat(&session->header, HEADER_TRACING_DATA);
1704 #endif
1705 
1706 	if (!rec->opts.branch_stack)
1707 		perf_header__clear_feat(&session->header, HEADER_BRANCH_STACK);
1708 
1709 	if (!rec->opts.full_auxtrace)
1710 		perf_header__clear_feat(&session->header, HEADER_AUXTRACE);
1711 
1712 	if (!(rec->opts.use_clockid && rec->opts.clockid_res_ns))
1713 		perf_header__clear_feat(&session->header, HEADER_CLOCKID);
1714 
1715 	if (!rec->opts.use_clockid)
1716 		perf_header__clear_feat(&session->header, HEADER_CLOCK_DATA);
1717 
1718 	if (!record__threads_enabled(rec))
1719 		perf_header__clear_feat(&session->header, HEADER_DIR_FORMAT);
1720 
1721 	if (!record__comp_enabled(rec))
1722 		perf_header__clear_feat(&session->header, HEADER_COMPRESSED);
1723 
1724 	perf_header__clear_feat(&session->header, HEADER_STAT);
1725 }
1726 
1727 static void
1728 record__finish_output(struct record *rec)
1729 {
1730 	int i;
1731 	struct perf_data *data = &rec->data;
1732 	int fd = perf_data__fd(data);
1733 
1734 	if (data->is_pipe)
1735 		return;
1736 
1737 	rec->session->header.data_size += rec->bytes_written;
1738 	data->file.size = lseek(perf_data__fd(data), 0, SEEK_CUR);
1739 	if (record__threads_enabled(rec)) {
1740 		for (i = 0; i < data->dir.nr; i++)
1741 			data->dir.files[i].size = lseek(data->dir.files[i].fd, 0, SEEK_CUR);
1742 	}
1743 
1744 	if (!rec->no_buildid) {
1745 		process_buildids(rec);
1746 
1747 		if (rec->buildid_all)
1748 			dsos__hit_all(rec->session);
1749 	}
1750 	perf_session__write_header(rec->session, rec->evlist, fd, true);
1751 
1752 	return;
1753 }
1754 
1755 static int record__synthesize_workload(struct record *rec, bool tail)
1756 {
1757 	int err;
1758 	struct perf_thread_map *thread_map;
1759 	bool needs_mmap = rec->opts.synth & PERF_SYNTH_MMAP;
1760 
1761 	if (rec->opts.tail_synthesize != tail)
1762 		return 0;
1763 
1764 	thread_map = thread_map__new_by_tid(rec->evlist->workload.pid);
1765 	if (thread_map == NULL)
1766 		return -1;
1767 
1768 	err = perf_event__synthesize_thread_map(&rec->tool, thread_map,
1769 						 process_synthesized_event,
1770 						 &rec->session->machines.host,
1771 						 needs_mmap,
1772 						 rec->opts.sample_address);
1773 	perf_thread_map__put(thread_map);
1774 	return err;
1775 }
1776 
1777 static int write_finished_init(struct record *rec, bool tail)
1778 {
1779 	if (rec->opts.tail_synthesize != tail)
1780 		return 0;
1781 
1782 	return record__write(rec, NULL, &finished_init_event, sizeof(finished_init_event));
1783 }
1784 
1785 static int record__synthesize(struct record *rec, bool tail);
1786 
1787 static int
1788 record__switch_output(struct record *rec, bool at_exit)
1789 {
1790 	struct perf_data *data = &rec->data;
1791 	char *new_filename = NULL;
1792 	int fd, err;
1793 
1794 	/* Same Size:      "2015122520103046"*/
1795 	char timestamp[] = "InvalidTimestamp";
1796 
1797 	record__aio_mmap_read_sync(rec);
1798 
1799 	write_finished_init(rec, true);
1800 
1801 	record__synthesize(rec, true);
1802 	if (target__none(&rec->opts.target))
1803 		record__synthesize_workload(rec, true);
1804 
1805 	rec->samples = 0;
1806 	record__finish_output(rec);
1807 	err = fetch_current_timestamp(timestamp, sizeof(timestamp));
1808 	if (err) {
1809 		pr_err("Failed to get current timestamp\n");
1810 		return -EINVAL;
1811 	}
1812 
1813 	fd = perf_data__switch(data, timestamp,
1814 				    rec->session->header.data_offset,
1815 				    at_exit, &new_filename);
1816 	if (fd >= 0 && !at_exit) {
1817 		rec->bytes_written = 0;
1818 		rec->session->header.data_size = 0;
1819 	}
1820 
1821 	if (!quiet)
1822 		fprintf(stderr, "[ perf record: Dump %s.%s ]\n",
1823 			data->path, timestamp);
1824 
1825 	if (rec->switch_output.num_files) {
1826 		int n = rec->switch_output.cur_file + 1;
1827 
1828 		if (n >= rec->switch_output.num_files)
1829 			n = 0;
1830 		rec->switch_output.cur_file = n;
1831 		if (rec->switch_output.filenames[n]) {
1832 			remove(rec->switch_output.filenames[n]);
1833 			zfree(&rec->switch_output.filenames[n]);
1834 		}
1835 		rec->switch_output.filenames[n] = new_filename;
1836 	} else {
1837 		free(new_filename);
1838 	}
1839 
1840 	/* Output tracking events */
1841 	if (!at_exit) {
1842 		record__synthesize(rec, false);
1843 
1844 		/*
1845 		 * In 'perf record --switch-output' without -a,
1846 		 * record__synthesize() in record__switch_output() won't
1847 		 * generate tracking events because there's no thread_map
1848 		 * in evlist. Which causes newly created perf.data doesn't
1849 		 * contain map and comm information.
1850 		 * Create a fake thread_map and directly call
1851 		 * perf_event__synthesize_thread_map() for those events.
1852 		 */
1853 		if (target__none(&rec->opts.target))
1854 			record__synthesize_workload(rec, false);
1855 		write_finished_init(rec, false);
1856 	}
1857 	return fd;
1858 }
1859 
1860 static void __record__save_lost_samples(struct record *rec, struct evsel *evsel,
1861 					struct perf_record_lost_samples *lost,
1862 					int cpu_idx, int thread_idx, u64 lost_count,
1863 					u16 misc_flag)
1864 {
1865 	struct perf_sample_id *sid;
1866 	struct perf_sample sample = {};
1867 	int id_hdr_size;
1868 
1869 	lost->lost = lost_count;
1870 	if (evsel->core.ids) {
1871 		sid = xyarray__entry(evsel->core.sample_id, cpu_idx, thread_idx);
1872 		sample.id = sid->id;
1873 	}
1874 
1875 	id_hdr_size = perf_event__synthesize_id_sample((void *)(lost + 1),
1876 						       evsel->core.attr.sample_type, &sample);
1877 	lost->header.size = sizeof(*lost) + id_hdr_size;
1878 	lost->header.misc = misc_flag;
1879 	record__write(rec, NULL, lost, lost->header.size);
1880 }
1881 
1882 static void record__read_lost_samples(struct record *rec)
1883 {
1884 	struct perf_session *session = rec->session;
1885 	struct perf_record_lost_samples *lost;
1886 	struct evsel *evsel;
1887 
1888 	/* there was an error during record__open */
1889 	if (session->evlist == NULL)
1890 		return;
1891 
1892 	lost = zalloc(PERF_SAMPLE_MAX_SIZE);
1893 	if (lost == NULL) {
1894 		pr_debug("Memory allocation failed\n");
1895 		return;
1896 	}
1897 
1898 	lost->header.type = PERF_RECORD_LOST_SAMPLES;
1899 
1900 	evlist__for_each_entry(session->evlist, evsel) {
1901 		struct xyarray *xy = evsel->core.sample_id;
1902 		u64 lost_count;
1903 
1904 		if (xy == NULL || evsel->core.fd == NULL)
1905 			continue;
1906 		if (xyarray__max_x(evsel->core.fd) != xyarray__max_x(xy) ||
1907 		    xyarray__max_y(evsel->core.fd) != xyarray__max_y(xy)) {
1908 			pr_debug("Unmatched FD vs. sample ID: skip reading LOST count\n");
1909 			continue;
1910 		}
1911 
1912 		for (int x = 0; x < xyarray__max_x(xy); x++) {
1913 			for (int y = 0; y < xyarray__max_y(xy); y++) {
1914 				struct perf_counts_values count;
1915 
1916 				if (perf_evsel__read(&evsel->core, x, y, &count) < 0) {
1917 					pr_debug("read LOST count failed\n");
1918 					goto out;
1919 				}
1920 
1921 				if (count.lost) {
1922 					__record__save_lost_samples(rec, evsel, lost,
1923 								    x, y, count.lost, 0);
1924 				}
1925 			}
1926 		}
1927 
1928 		lost_count = perf_bpf_filter__lost_count(evsel);
1929 		if (lost_count)
1930 			__record__save_lost_samples(rec, evsel, lost, 0, 0, lost_count,
1931 						    PERF_RECORD_MISC_LOST_SAMPLES_BPF);
1932 	}
1933 out:
1934 	free(lost);
1935 }
1936 
1937 static volatile sig_atomic_t workload_exec_errno;
1938 
1939 /*
1940  * evlist__prepare_workload will send a SIGUSR1
1941  * if the fork fails, since we asked by setting its
1942  * want_signal to true.
1943  */
1944 static void workload_exec_failed_signal(int signo __maybe_unused,
1945 					siginfo_t *info,
1946 					void *ucontext __maybe_unused)
1947 {
1948 	workload_exec_errno = info->si_value.sival_int;
1949 	done = 1;
1950 	child_finished = 1;
1951 }
1952 
1953 static void snapshot_sig_handler(int sig);
1954 static void alarm_sig_handler(int sig);
1955 
1956 static const struct perf_event_mmap_page *evlist__pick_pc(struct evlist *evlist)
1957 {
1958 	if (evlist) {
1959 		if (evlist->mmap && evlist->mmap[0].core.base)
1960 			return evlist->mmap[0].core.base;
1961 		if (evlist->overwrite_mmap && evlist->overwrite_mmap[0].core.base)
1962 			return evlist->overwrite_mmap[0].core.base;
1963 	}
1964 	return NULL;
1965 }
1966 
1967 static const struct perf_event_mmap_page *record__pick_pc(struct record *rec)
1968 {
1969 	const struct perf_event_mmap_page *pc = evlist__pick_pc(rec->evlist);
1970 	if (pc)
1971 		return pc;
1972 	return NULL;
1973 }
1974 
1975 static int record__synthesize(struct record *rec, bool tail)
1976 {
1977 	struct perf_session *session = rec->session;
1978 	struct machine *machine = &session->machines.host;
1979 	struct perf_data *data = &rec->data;
1980 	struct record_opts *opts = &rec->opts;
1981 	struct perf_tool *tool = &rec->tool;
1982 	int err = 0;
1983 	event_op f = process_synthesized_event;
1984 
1985 	if (rec->opts.tail_synthesize != tail)
1986 		return 0;
1987 
1988 	if (data->is_pipe) {
1989 		err = perf_event__synthesize_for_pipe(tool, session, data,
1990 						      process_synthesized_event);
1991 		if (err < 0)
1992 			goto out;
1993 
1994 		rec->bytes_written += err;
1995 	}
1996 
1997 	err = perf_event__synth_time_conv(record__pick_pc(rec), tool,
1998 					  process_synthesized_event, machine);
1999 	if (err)
2000 		goto out;
2001 
2002 	/* Synthesize id_index before auxtrace_info */
2003 	err = perf_event__synthesize_id_index(tool,
2004 					      process_synthesized_event,
2005 					      session->evlist, machine);
2006 	if (err)
2007 		goto out;
2008 
2009 	if (rec->opts.full_auxtrace) {
2010 		err = perf_event__synthesize_auxtrace_info(rec->itr, tool,
2011 					session, process_synthesized_event);
2012 		if (err)
2013 			goto out;
2014 	}
2015 
2016 	if (!evlist__exclude_kernel(rec->evlist)) {
2017 		err = perf_event__synthesize_kernel_mmap(tool, process_synthesized_event,
2018 							 machine);
2019 		WARN_ONCE(err < 0, "Couldn't record kernel reference relocation symbol\n"
2020 				   "Symbol resolution may be skewed if relocation was used (e.g. kexec).\n"
2021 				   "Check /proc/kallsyms permission or run as root.\n");
2022 
2023 		err = perf_event__synthesize_modules(tool, process_synthesized_event,
2024 						     machine);
2025 		WARN_ONCE(err < 0, "Couldn't record kernel module information.\n"
2026 				   "Symbol resolution may be skewed if relocation was used (e.g. kexec).\n"
2027 				   "Check /proc/modules permission or run as root.\n");
2028 	}
2029 
2030 	if (perf_guest) {
2031 		machines__process_guests(&session->machines,
2032 					 perf_event__synthesize_guest_os, tool);
2033 	}
2034 
2035 	err = perf_event__synthesize_extra_attr(&rec->tool,
2036 						rec->evlist,
2037 						process_synthesized_event,
2038 						data->is_pipe);
2039 	if (err)
2040 		goto out;
2041 
2042 	err = perf_event__synthesize_thread_map2(&rec->tool, rec->evlist->core.threads,
2043 						 process_synthesized_event,
2044 						NULL);
2045 	if (err < 0) {
2046 		pr_err("Couldn't synthesize thread map.\n");
2047 		return err;
2048 	}
2049 
2050 	err = perf_event__synthesize_cpu_map(&rec->tool, rec->evlist->core.all_cpus,
2051 					     process_synthesized_event, NULL);
2052 	if (err < 0) {
2053 		pr_err("Couldn't synthesize cpu map.\n");
2054 		return err;
2055 	}
2056 
2057 	err = perf_event__synthesize_bpf_events(session, process_synthesized_event,
2058 						machine, opts);
2059 	if (err < 0) {
2060 		pr_warning("Couldn't synthesize bpf events.\n");
2061 		err = 0;
2062 	}
2063 
2064 	if (rec->opts.synth & PERF_SYNTH_CGROUP) {
2065 		err = perf_event__synthesize_cgroups(tool, process_synthesized_event,
2066 						     machine);
2067 		if (err < 0) {
2068 			pr_warning("Couldn't synthesize cgroup events.\n");
2069 			err = 0;
2070 		}
2071 	}
2072 
2073 	if (rec->opts.nr_threads_synthesize > 1) {
2074 		mutex_init(&synth_lock);
2075 		perf_set_multithreaded();
2076 		f = process_locked_synthesized_event;
2077 	}
2078 
2079 	if (rec->opts.synth & PERF_SYNTH_TASK) {
2080 		bool needs_mmap = rec->opts.synth & PERF_SYNTH_MMAP;
2081 
2082 		err = __machine__synthesize_threads(machine, tool, &opts->target,
2083 						    rec->evlist->core.threads,
2084 						    f, needs_mmap, opts->sample_address,
2085 						    rec->opts.nr_threads_synthesize);
2086 	}
2087 
2088 	if (rec->opts.nr_threads_synthesize > 1) {
2089 		perf_set_singlethreaded();
2090 		mutex_destroy(&synth_lock);
2091 	}
2092 
2093 out:
2094 	return err;
2095 }
2096 
2097 static int record__process_signal_event(union perf_event *event __maybe_unused, void *data)
2098 {
2099 	struct record *rec = data;
2100 	pthread_kill(rec->thread_id, SIGUSR2);
2101 	return 0;
2102 }
2103 
2104 static int record__setup_sb_evlist(struct record *rec)
2105 {
2106 	struct record_opts *opts = &rec->opts;
2107 
2108 	if (rec->sb_evlist != NULL) {
2109 		/*
2110 		 * We get here if --switch-output-event populated the
2111 		 * sb_evlist, so associate a callback that will send a SIGUSR2
2112 		 * to the main thread.
2113 		 */
2114 		evlist__set_cb(rec->sb_evlist, record__process_signal_event, rec);
2115 		rec->thread_id = pthread_self();
2116 	}
2117 #ifdef HAVE_LIBBPF_SUPPORT
2118 	if (!opts->no_bpf_event) {
2119 		if (rec->sb_evlist == NULL) {
2120 			rec->sb_evlist = evlist__new();
2121 
2122 			if (rec->sb_evlist == NULL) {
2123 				pr_err("Couldn't create side band evlist.\n.");
2124 				return -1;
2125 			}
2126 		}
2127 
2128 		if (evlist__add_bpf_sb_event(rec->sb_evlist, &rec->session->header.env)) {
2129 			pr_err("Couldn't ask for PERF_RECORD_BPF_EVENT side band events.\n.");
2130 			return -1;
2131 		}
2132 	}
2133 #endif
2134 	if (evlist__start_sb_thread(rec->sb_evlist, &rec->opts.target)) {
2135 		pr_debug("Couldn't start the BPF side band thread:\nBPF programs starting from now on won't be annotatable\n");
2136 		opts->no_bpf_event = true;
2137 	}
2138 
2139 	return 0;
2140 }
2141 
2142 static int record__init_clock(struct record *rec)
2143 {
2144 	struct perf_session *session = rec->session;
2145 	struct timespec ref_clockid;
2146 	struct timeval ref_tod;
2147 	u64 ref;
2148 
2149 	if (!rec->opts.use_clockid)
2150 		return 0;
2151 
2152 	if (rec->opts.use_clockid && rec->opts.clockid_res_ns)
2153 		session->header.env.clock.clockid_res_ns = rec->opts.clockid_res_ns;
2154 
2155 	session->header.env.clock.clockid = rec->opts.clockid;
2156 
2157 	if (gettimeofday(&ref_tod, NULL) != 0) {
2158 		pr_err("gettimeofday failed, cannot set reference time.\n");
2159 		return -1;
2160 	}
2161 
2162 	if (clock_gettime(rec->opts.clockid, &ref_clockid)) {
2163 		pr_err("clock_gettime failed, cannot set reference time.\n");
2164 		return -1;
2165 	}
2166 
2167 	ref = (u64) ref_tod.tv_sec * NSEC_PER_SEC +
2168 	      (u64) ref_tod.tv_usec * NSEC_PER_USEC;
2169 
2170 	session->header.env.clock.tod_ns = ref;
2171 
2172 	ref = (u64) ref_clockid.tv_sec * NSEC_PER_SEC +
2173 	      (u64) ref_clockid.tv_nsec;
2174 
2175 	session->header.env.clock.clockid_ns = ref;
2176 	return 0;
2177 }
2178 
2179 static void hit_auxtrace_snapshot_trigger(struct record *rec)
2180 {
2181 	if (trigger_is_ready(&auxtrace_snapshot_trigger)) {
2182 		trigger_hit(&auxtrace_snapshot_trigger);
2183 		auxtrace_record__snapshot_started = 1;
2184 		if (auxtrace_record__snapshot_start(rec->itr))
2185 			trigger_error(&auxtrace_snapshot_trigger);
2186 	}
2187 }
2188 
2189 static int record__terminate_thread(struct record_thread *thread_data)
2190 {
2191 	int err;
2192 	enum thread_msg ack = THREAD_MSG__UNDEFINED;
2193 	pid_t tid = thread_data->tid;
2194 
2195 	close(thread_data->pipes.msg[1]);
2196 	thread_data->pipes.msg[1] = -1;
2197 	err = read(thread_data->pipes.ack[0], &ack, sizeof(ack));
2198 	if (err > 0)
2199 		pr_debug2("threads[%d]: sent %s\n", tid, thread_msg_tags[ack]);
2200 	else
2201 		pr_warning("threads[%d]: failed to receive termination notification from %d\n",
2202 			   thread->tid, tid);
2203 
2204 	return 0;
2205 }
2206 
2207 static int record__start_threads(struct record *rec)
2208 {
2209 	int t, tt, err, ret = 0, nr_threads = rec->nr_threads;
2210 	struct record_thread *thread_data = rec->thread_data;
2211 	sigset_t full, mask;
2212 	pthread_t handle;
2213 	pthread_attr_t attrs;
2214 
2215 	thread = &thread_data[0];
2216 
2217 	if (!record__threads_enabled(rec))
2218 		return 0;
2219 
2220 	sigfillset(&full);
2221 	if (sigprocmask(SIG_SETMASK, &full, &mask)) {
2222 		pr_err("Failed to block signals on threads start: %s\n", strerror(errno));
2223 		return -1;
2224 	}
2225 
2226 	pthread_attr_init(&attrs);
2227 	pthread_attr_setdetachstate(&attrs, PTHREAD_CREATE_DETACHED);
2228 
2229 	for (t = 1; t < nr_threads; t++) {
2230 		enum thread_msg msg = THREAD_MSG__UNDEFINED;
2231 
2232 #ifdef HAVE_PTHREAD_ATTR_SETAFFINITY_NP
2233 		pthread_attr_setaffinity_np(&attrs,
2234 					    MMAP_CPU_MASK_BYTES(&(thread_data[t].mask->affinity)),
2235 					    (cpu_set_t *)(thread_data[t].mask->affinity.bits));
2236 #endif
2237 		if (pthread_create(&handle, &attrs, record__thread, &thread_data[t])) {
2238 			for (tt = 1; tt < t; tt++)
2239 				record__terminate_thread(&thread_data[t]);
2240 			pr_err("Failed to start threads: %s\n", strerror(errno));
2241 			ret = -1;
2242 			goto out_err;
2243 		}
2244 
2245 		err = read(thread_data[t].pipes.ack[0], &msg, sizeof(msg));
2246 		if (err > 0)
2247 			pr_debug2("threads[%d]: sent %s\n", rec->thread_data[t].tid,
2248 				  thread_msg_tags[msg]);
2249 		else
2250 			pr_warning("threads[%d]: failed to receive start notification from %d\n",
2251 				   thread->tid, rec->thread_data[t].tid);
2252 	}
2253 
2254 	sched_setaffinity(0, MMAP_CPU_MASK_BYTES(&thread->mask->affinity),
2255 			(cpu_set_t *)thread->mask->affinity.bits);
2256 
2257 	pr_debug("threads[%d]: started on cpu%d\n", thread->tid, sched_getcpu());
2258 
2259 out_err:
2260 	pthread_attr_destroy(&attrs);
2261 
2262 	if (sigprocmask(SIG_SETMASK, &mask, NULL)) {
2263 		pr_err("Failed to unblock signals on threads start: %s\n", strerror(errno));
2264 		ret = -1;
2265 	}
2266 
2267 	return ret;
2268 }
2269 
2270 static int record__stop_threads(struct record *rec)
2271 {
2272 	int t;
2273 	struct record_thread *thread_data = rec->thread_data;
2274 
2275 	for (t = 1; t < rec->nr_threads; t++)
2276 		record__terminate_thread(&thread_data[t]);
2277 
2278 	for (t = 0; t < rec->nr_threads; t++) {
2279 		rec->samples += thread_data[t].samples;
2280 		if (!record__threads_enabled(rec))
2281 			continue;
2282 		rec->session->bytes_transferred += thread_data[t].bytes_transferred;
2283 		rec->session->bytes_compressed += thread_data[t].bytes_compressed;
2284 		pr_debug("threads[%d]: samples=%lld, wakes=%ld, ", thread_data[t].tid,
2285 			 thread_data[t].samples, thread_data[t].waking);
2286 		if (thread_data[t].bytes_transferred && thread_data[t].bytes_compressed)
2287 			pr_debug("transferred=%" PRIu64 ", compressed=%" PRIu64 "\n",
2288 				 thread_data[t].bytes_transferred, thread_data[t].bytes_compressed);
2289 		else
2290 			pr_debug("written=%" PRIu64 "\n", thread_data[t].bytes_written);
2291 	}
2292 
2293 	return 0;
2294 }
2295 
2296 static unsigned long record__waking(struct record *rec)
2297 {
2298 	int t;
2299 	unsigned long waking = 0;
2300 	struct record_thread *thread_data = rec->thread_data;
2301 
2302 	for (t = 0; t < rec->nr_threads; t++)
2303 		waking += thread_data[t].waking;
2304 
2305 	return waking;
2306 }
2307 
2308 static int __cmd_record(struct record *rec, int argc, const char **argv)
2309 {
2310 	int err;
2311 	int status = 0;
2312 	const bool forks = argc > 0;
2313 	struct perf_tool *tool = &rec->tool;
2314 	struct record_opts *opts = &rec->opts;
2315 	struct perf_data *data = &rec->data;
2316 	struct perf_session *session;
2317 	bool disabled = false, draining = false;
2318 	int fd;
2319 	float ratio = 0;
2320 	enum evlist_ctl_cmd cmd = EVLIST_CTL_CMD_UNSUPPORTED;
2321 
2322 	atexit(record__sig_exit);
2323 	signal(SIGCHLD, sig_handler);
2324 	signal(SIGINT, sig_handler);
2325 	signal(SIGTERM, sig_handler);
2326 	signal(SIGSEGV, sigsegv_handler);
2327 
2328 	if (rec->opts.record_namespaces)
2329 		tool->namespace_events = true;
2330 
2331 	if (rec->opts.record_cgroup) {
2332 #ifdef HAVE_FILE_HANDLE
2333 		tool->cgroup_events = true;
2334 #else
2335 		pr_err("cgroup tracking is not supported\n");
2336 		return -1;
2337 #endif
2338 	}
2339 
2340 	if (rec->opts.auxtrace_snapshot_mode || rec->switch_output.enabled) {
2341 		signal(SIGUSR2, snapshot_sig_handler);
2342 		if (rec->opts.auxtrace_snapshot_mode)
2343 			trigger_on(&auxtrace_snapshot_trigger);
2344 		if (rec->switch_output.enabled)
2345 			trigger_on(&switch_output_trigger);
2346 	} else {
2347 		signal(SIGUSR2, SIG_IGN);
2348 	}
2349 
2350 	session = perf_session__new(data, tool);
2351 	if (IS_ERR(session)) {
2352 		pr_err("Perf session creation failed.\n");
2353 		return PTR_ERR(session);
2354 	}
2355 
2356 	if (record__threads_enabled(rec)) {
2357 		if (perf_data__is_pipe(&rec->data)) {
2358 			pr_err("Parallel trace streaming is not available in pipe mode.\n");
2359 			return -1;
2360 		}
2361 		if (rec->opts.full_auxtrace) {
2362 			pr_err("Parallel trace streaming is not available in AUX area tracing mode.\n");
2363 			return -1;
2364 		}
2365 	}
2366 
2367 	fd = perf_data__fd(data);
2368 	rec->session = session;
2369 
2370 	if (zstd_init(&session->zstd_data, rec->opts.comp_level) < 0) {
2371 		pr_err("Compression initialization failed.\n");
2372 		return -1;
2373 	}
2374 #ifdef HAVE_EVENTFD_SUPPORT
2375 	done_fd = eventfd(0, EFD_NONBLOCK);
2376 	if (done_fd < 0) {
2377 		pr_err("Failed to create wakeup eventfd, error: %m\n");
2378 		status = -1;
2379 		goto out_delete_session;
2380 	}
2381 	err = evlist__add_wakeup_eventfd(rec->evlist, done_fd);
2382 	if (err < 0) {
2383 		pr_err("Failed to add wakeup eventfd to poll list\n");
2384 		status = err;
2385 		goto out_delete_session;
2386 	}
2387 #endif // HAVE_EVENTFD_SUPPORT
2388 
2389 	session->header.env.comp_type  = PERF_COMP_ZSTD;
2390 	session->header.env.comp_level = rec->opts.comp_level;
2391 
2392 	if (rec->opts.kcore &&
2393 	    !record__kcore_readable(&session->machines.host)) {
2394 		pr_err("ERROR: kcore is not readable.\n");
2395 		return -1;
2396 	}
2397 
2398 	if (record__init_clock(rec))
2399 		return -1;
2400 
2401 	record__init_features(rec);
2402 
2403 	if (forks) {
2404 		err = evlist__prepare_workload(rec->evlist, &opts->target, argv, data->is_pipe,
2405 					       workload_exec_failed_signal);
2406 		if (err < 0) {
2407 			pr_err("Couldn't run the workload!\n");
2408 			status = err;
2409 			goto out_delete_session;
2410 		}
2411 	}
2412 
2413 	/*
2414 	 * If we have just single event and are sending data
2415 	 * through pipe, we need to force the ids allocation,
2416 	 * because we synthesize event name through the pipe
2417 	 * and need the id for that.
2418 	 */
2419 	if (data->is_pipe && rec->evlist->core.nr_entries == 1)
2420 		rec->opts.sample_id = true;
2421 
2422 	if (rec->timestamp_filename && perf_data__is_pipe(data)) {
2423 		rec->timestamp_filename = false;
2424 		pr_warning("WARNING: --timestamp-filename option is not available in pipe mode.\n");
2425 	}
2426 
2427 	evlist__uniquify_name(rec->evlist);
2428 
2429 	evlist__config(rec->evlist, opts, &callchain_param);
2430 
2431 	/* Debug message used by test scripts */
2432 	pr_debug3("perf record opening and mmapping events\n");
2433 	if (record__open(rec) != 0) {
2434 		err = -1;
2435 		goto out_free_threads;
2436 	}
2437 	/* Debug message used by test scripts */
2438 	pr_debug3("perf record done opening and mmapping events\n");
2439 	session->header.env.comp_mmap_len = session->evlist->core.mmap_len;
2440 
2441 	if (rec->opts.kcore) {
2442 		err = record__kcore_copy(&session->machines.host, data);
2443 		if (err) {
2444 			pr_err("ERROR: Failed to copy kcore\n");
2445 			goto out_free_threads;
2446 		}
2447 	}
2448 
2449 	/*
2450 	 * Normally perf_session__new would do this, but it doesn't have the
2451 	 * evlist.
2452 	 */
2453 	if (rec->tool.ordered_events && !evlist__sample_id_all(rec->evlist)) {
2454 		pr_warning("WARNING: No sample_id_all support, falling back to unordered processing\n");
2455 		rec->tool.ordered_events = false;
2456 	}
2457 
2458 	if (evlist__nr_groups(rec->evlist) == 0)
2459 		perf_header__clear_feat(&session->header, HEADER_GROUP_DESC);
2460 
2461 	if (data->is_pipe) {
2462 		err = perf_header__write_pipe(fd);
2463 		if (err < 0)
2464 			goto out_free_threads;
2465 	} else {
2466 		err = perf_session__write_header(session, rec->evlist, fd, false);
2467 		if (err < 0)
2468 			goto out_free_threads;
2469 	}
2470 
2471 	err = -1;
2472 	if (!rec->no_buildid
2473 	    && !perf_header__has_feat(&session->header, HEADER_BUILD_ID)) {
2474 		pr_err("Couldn't generate buildids. "
2475 		       "Use --no-buildid to profile anyway.\n");
2476 		goto out_free_threads;
2477 	}
2478 
2479 	err = record__setup_sb_evlist(rec);
2480 	if (err)
2481 		goto out_free_threads;
2482 
2483 	err = record__synthesize(rec, false);
2484 	if (err < 0)
2485 		goto out_free_threads;
2486 
2487 	if (rec->realtime_prio) {
2488 		struct sched_param param;
2489 
2490 		param.sched_priority = rec->realtime_prio;
2491 		if (sched_setscheduler(0, SCHED_FIFO, &param)) {
2492 			pr_err("Could not set realtime priority.\n");
2493 			err = -1;
2494 			goto out_free_threads;
2495 		}
2496 	}
2497 
2498 	if (record__start_threads(rec))
2499 		goto out_free_threads;
2500 
2501 	/*
2502 	 * When perf is starting the traced process, all the events
2503 	 * (apart from group members) have enable_on_exec=1 set,
2504 	 * so don't spoil it by prematurely enabling them.
2505 	 */
2506 	if (!target__none(&opts->target) && !opts->target.initial_delay)
2507 		evlist__enable(rec->evlist);
2508 
2509 	/*
2510 	 * Let the child rip
2511 	 */
2512 	if (forks) {
2513 		struct machine *machine = &session->machines.host;
2514 		union perf_event *event;
2515 		pid_t tgid;
2516 
2517 		event = malloc(sizeof(event->comm) + machine->id_hdr_size);
2518 		if (event == NULL) {
2519 			err = -ENOMEM;
2520 			goto out_child;
2521 		}
2522 
2523 		/*
2524 		 * Some H/W events are generated before COMM event
2525 		 * which is emitted during exec(), so perf script
2526 		 * cannot see a correct process name for those events.
2527 		 * Synthesize COMM event to prevent it.
2528 		 */
2529 		tgid = perf_event__synthesize_comm(tool, event,
2530 						   rec->evlist->workload.pid,
2531 						   process_synthesized_event,
2532 						   machine);
2533 		free(event);
2534 
2535 		if (tgid == -1)
2536 			goto out_child;
2537 
2538 		event = malloc(sizeof(event->namespaces) +
2539 			       (NR_NAMESPACES * sizeof(struct perf_ns_link_info)) +
2540 			       machine->id_hdr_size);
2541 		if (event == NULL) {
2542 			err = -ENOMEM;
2543 			goto out_child;
2544 		}
2545 
2546 		/*
2547 		 * Synthesize NAMESPACES event for the command specified.
2548 		 */
2549 		perf_event__synthesize_namespaces(tool, event,
2550 						  rec->evlist->workload.pid,
2551 						  tgid, process_synthesized_event,
2552 						  machine);
2553 		free(event);
2554 
2555 		evlist__start_workload(rec->evlist);
2556 	}
2557 
2558 	if (opts->target.initial_delay) {
2559 		pr_info(EVLIST_DISABLED_MSG);
2560 		if (opts->target.initial_delay > 0) {
2561 			usleep(opts->target.initial_delay * USEC_PER_MSEC);
2562 			evlist__enable(rec->evlist);
2563 			pr_info(EVLIST_ENABLED_MSG);
2564 		}
2565 	}
2566 
2567 	err = event_enable_timer__start(rec->evlist->eet);
2568 	if (err)
2569 		goto out_child;
2570 
2571 	/* Debug message used by test scripts */
2572 	pr_debug3("perf record has started\n");
2573 	fflush(stderr);
2574 
2575 	trigger_ready(&auxtrace_snapshot_trigger);
2576 	trigger_ready(&switch_output_trigger);
2577 	perf_hooks__invoke_record_start();
2578 
2579 	/*
2580 	 * Must write FINISHED_INIT so it will be seen after all other
2581 	 * synthesized user events, but before any regular events.
2582 	 */
2583 	err = write_finished_init(rec, false);
2584 	if (err < 0)
2585 		goto out_child;
2586 
2587 	for (;;) {
2588 		unsigned long long hits = thread->samples;
2589 
2590 		/*
2591 		 * rec->evlist->bkw_mmap_state is possible to be
2592 		 * BKW_MMAP_EMPTY here: when done == true and
2593 		 * hits != rec->samples in previous round.
2594 		 *
2595 		 * evlist__toggle_bkw_mmap ensure we never
2596 		 * convert BKW_MMAP_EMPTY to BKW_MMAP_DATA_PENDING.
2597 		 */
2598 		if (trigger_is_hit(&switch_output_trigger) || done || draining)
2599 			evlist__toggle_bkw_mmap(rec->evlist, BKW_MMAP_DATA_PENDING);
2600 
2601 		if (record__mmap_read_all(rec, false) < 0) {
2602 			trigger_error(&auxtrace_snapshot_trigger);
2603 			trigger_error(&switch_output_trigger);
2604 			err = -1;
2605 			goto out_child;
2606 		}
2607 
2608 		if (auxtrace_record__snapshot_started) {
2609 			auxtrace_record__snapshot_started = 0;
2610 			if (!trigger_is_error(&auxtrace_snapshot_trigger))
2611 				record__read_auxtrace_snapshot(rec, false);
2612 			if (trigger_is_error(&auxtrace_snapshot_trigger)) {
2613 				pr_err("AUX area tracing snapshot failed\n");
2614 				err = -1;
2615 				goto out_child;
2616 			}
2617 		}
2618 
2619 		if (trigger_is_hit(&switch_output_trigger)) {
2620 			/*
2621 			 * If switch_output_trigger is hit, the data in
2622 			 * overwritable ring buffer should have been collected,
2623 			 * so bkw_mmap_state should be set to BKW_MMAP_EMPTY.
2624 			 *
2625 			 * If SIGUSR2 raise after or during record__mmap_read_all(),
2626 			 * record__mmap_read_all() didn't collect data from
2627 			 * overwritable ring buffer. Read again.
2628 			 */
2629 			if (rec->evlist->bkw_mmap_state == BKW_MMAP_RUNNING)
2630 				continue;
2631 			trigger_ready(&switch_output_trigger);
2632 
2633 			/*
2634 			 * Reenable events in overwrite ring buffer after
2635 			 * record__mmap_read_all(): we should have collected
2636 			 * data from it.
2637 			 */
2638 			evlist__toggle_bkw_mmap(rec->evlist, BKW_MMAP_RUNNING);
2639 
2640 			if (!quiet)
2641 				fprintf(stderr, "[ perf record: dump data: Woken up %ld times ]\n",
2642 					record__waking(rec));
2643 			thread->waking = 0;
2644 			fd = record__switch_output(rec, false);
2645 			if (fd < 0) {
2646 				pr_err("Failed to switch to new file\n");
2647 				trigger_error(&switch_output_trigger);
2648 				err = fd;
2649 				goto out_child;
2650 			}
2651 
2652 			/* re-arm the alarm */
2653 			if (rec->switch_output.time)
2654 				alarm(rec->switch_output.time);
2655 		}
2656 
2657 		if (hits == thread->samples) {
2658 			if (done || draining)
2659 				break;
2660 			err = fdarray__poll(&thread->pollfd, -1);
2661 			/*
2662 			 * Propagate error, only if there's any. Ignore positive
2663 			 * number of returned events and interrupt error.
2664 			 */
2665 			if (err > 0 || (err < 0 && errno == EINTR))
2666 				err = 0;
2667 			thread->waking++;
2668 
2669 			if (fdarray__filter(&thread->pollfd, POLLERR | POLLHUP,
2670 					    record__thread_munmap_filtered, NULL) == 0)
2671 				draining = true;
2672 
2673 			err = record__update_evlist_pollfd_from_thread(rec, rec->evlist, thread);
2674 			if (err)
2675 				goto out_child;
2676 		}
2677 
2678 		if (evlist__ctlfd_process(rec->evlist, &cmd) > 0) {
2679 			switch (cmd) {
2680 			case EVLIST_CTL_CMD_SNAPSHOT:
2681 				hit_auxtrace_snapshot_trigger(rec);
2682 				evlist__ctlfd_ack(rec->evlist);
2683 				break;
2684 			case EVLIST_CTL_CMD_STOP:
2685 				done = 1;
2686 				break;
2687 			case EVLIST_CTL_CMD_ACK:
2688 			case EVLIST_CTL_CMD_UNSUPPORTED:
2689 			case EVLIST_CTL_CMD_ENABLE:
2690 			case EVLIST_CTL_CMD_DISABLE:
2691 			case EVLIST_CTL_CMD_EVLIST:
2692 			case EVLIST_CTL_CMD_PING:
2693 			default:
2694 				break;
2695 			}
2696 		}
2697 
2698 		err = event_enable_timer__process(rec->evlist->eet);
2699 		if (err < 0)
2700 			goto out_child;
2701 		if (err) {
2702 			err = 0;
2703 			done = 1;
2704 		}
2705 
2706 		/*
2707 		 * When perf is starting the traced process, at the end events
2708 		 * die with the process and we wait for that. Thus no need to
2709 		 * disable events in this case.
2710 		 */
2711 		if (done && !disabled && !target__none(&opts->target)) {
2712 			trigger_off(&auxtrace_snapshot_trigger);
2713 			evlist__disable(rec->evlist);
2714 			disabled = true;
2715 		}
2716 	}
2717 
2718 	trigger_off(&auxtrace_snapshot_trigger);
2719 	trigger_off(&switch_output_trigger);
2720 
2721 	if (opts->auxtrace_snapshot_on_exit)
2722 		record__auxtrace_snapshot_exit(rec);
2723 
2724 	if (forks && workload_exec_errno) {
2725 		char msg[STRERR_BUFSIZE], strevsels[2048];
2726 		const char *emsg = str_error_r(workload_exec_errno, msg, sizeof(msg));
2727 
2728 		evlist__scnprintf_evsels(rec->evlist, sizeof(strevsels), strevsels);
2729 
2730 		pr_err("Failed to collect '%s' for the '%s' workload: %s\n",
2731 			strevsels, argv[0], emsg);
2732 		err = -1;
2733 		goto out_child;
2734 	}
2735 
2736 	if (!quiet)
2737 		fprintf(stderr, "[ perf record: Woken up %ld times to write data ]\n",
2738 			record__waking(rec));
2739 
2740 	write_finished_init(rec, true);
2741 
2742 	if (target__none(&rec->opts.target))
2743 		record__synthesize_workload(rec, true);
2744 
2745 out_child:
2746 	record__stop_threads(rec);
2747 	record__mmap_read_all(rec, true);
2748 out_free_threads:
2749 	record__free_thread_data(rec);
2750 	evlist__finalize_ctlfd(rec->evlist);
2751 	record__aio_mmap_read_sync(rec);
2752 
2753 	if (rec->session->bytes_transferred && rec->session->bytes_compressed) {
2754 		ratio = (float)rec->session->bytes_transferred/(float)rec->session->bytes_compressed;
2755 		session->header.env.comp_ratio = ratio + 0.5;
2756 	}
2757 
2758 	if (forks) {
2759 		int exit_status;
2760 
2761 		if (!child_finished)
2762 			kill(rec->evlist->workload.pid, SIGTERM);
2763 
2764 		wait(&exit_status);
2765 
2766 		if (err < 0)
2767 			status = err;
2768 		else if (WIFEXITED(exit_status))
2769 			status = WEXITSTATUS(exit_status);
2770 		else if (WIFSIGNALED(exit_status))
2771 			signr = WTERMSIG(exit_status);
2772 	} else
2773 		status = err;
2774 
2775 	if (rec->off_cpu)
2776 		rec->bytes_written += off_cpu_write(rec->session);
2777 
2778 	record__read_lost_samples(rec);
2779 	record__synthesize(rec, true);
2780 	/* this will be recalculated during process_buildids() */
2781 	rec->samples = 0;
2782 
2783 	if (!err) {
2784 		if (!rec->timestamp_filename) {
2785 			record__finish_output(rec);
2786 		} else {
2787 			fd = record__switch_output(rec, true);
2788 			if (fd < 0) {
2789 				status = fd;
2790 				goto out_delete_session;
2791 			}
2792 		}
2793 	}
2794 
2795 	perf_hooks__invoke_record_end();
2796 
2797 	if (!err && !quiet) {
2798 		char samples[128];
2799 		const char *postfix = rec->timestamp_filename ?
2800 					".<timestamp>" : "";
2801 
2802 		if (rec->samples && !rec->opts.full_auxtrace)
2803 			scnprintf(samples, sizeof(samples),
2804 				  " (%" PRIu64 " samples)", rec->samples);
2805 		else
2806 			samples[0] = '\0';
2807 
2808 		fprintf(stderr,	"[ perf record: Captured and wrote %.3f MB %s%s%s",
2809 			perf_data__size(data) / 1024.0 / 1024.0,
2810 			data->path, postfix, samples);
2811 		if (ratio) {
2812 			fprintf(stderr,	", compressed (original %.3f MB, ratio is %.3f)",
2813 					rec->session->bytes_transferred / 1024.0 / 1024.0,
2814 					ratio);
2815 		}
2816 		fprintf(stderr, " ]\n");
2817 	}
2818 
2819 out_delete_session:
2820 #ifdef HAVE_EVENTFD_SUPPORT
2821 	if (done_fd >= 0) {
2822 		fd = done_fd;
2823 		done_fd = -1;
2824 
2825 		close(fd);
2826 	}
2827 #endif
2828 	zstd_fini(&session->zstd_data);
2829 	if (!opts->no_bpf_event)
2830 		evlist__stop_sb_thread(rec->sb_evlist);
2831 
2832 	perf_session__delete(session);
2833 	return status;
2834 }
2835 
2836 static void callchain_debug(struct callchain_param *callchain)
2837 {
2838 	static const char *str[CALLCHAIN_MAX] = { "NONE", "FP", "DWARF", "LBR" };
2839 
2840 	pr_debug("callchain: type %s\n", str[callchain->record_mode]);
2841 
2842 	if (callchain->record_mode == CALLCHAIN_DWARF)
2843 		pr_debug("callchain: stack dump size %d\n",
2844 			 callchain->dump_size);
2845 }
2846 
2847 int record_opts__parse_callchain(struct record_opts *record,
2848 				 struct callchain_param *callchain,
2849 				 const char *arg, bool unset)
2850 {
2851 	int ret;
2852 	callchain->enabled = !unset;
2853 
2854 	/* --no-call-graph */
2855 	if (unset) {
2856 		callchain->record_mode = CALLCHAIN_NONE;
2857 		pr_debug("callchain: disabled\n");
2858 		return 0;
2859 	}
2860 
2861 	ret = parse_callchain_record_opt(arg, callchain);
2862 	if (!ret) {
2863 		/* Enable data address sampling for DWARF unwind. */
2864 		if (callchain->record_mode == CALLCHAIN_DWARF)
2865 			record->sample_address = true;
2866 		callchain_debug(callchain);
2867 	}
2868 
2869 	return ret;
2870 }
2871 
2872 int record_parse_callchain_opt(const struct option *opt,
2873 			       const char *arg,
2874 			       int unset)
2875 {
2876 	return record_opts__parse_callchain(opt->value, &callchain_param, arg, unset);
2877 }
2878 
2879 int record_callchain_opt(const struct option *opt,
2880 			 const char *arg __maybe_unused,
2881 			 int unset __maybe_unused)
2882 {
2883 	struct callchain_param *callchain = opt->value;
2884 
2885 	callchain->enabled = true;
2886 
2887 	if (callchain->record_mode == CALLCHAIN_NONE)
2888 		callchain->record_mode = CALLCHAIN_FP;
2889 
2890 	callchain_debug(callchain);
2891 	return 0;
2892 }
2893 
2894 static int perf_record_config(const char *var, const char *value, void *cb)
2895 {
2896 	struct record *rec = cb;
2897 
2898 	if (!strcmp(var, "record.build-id")) {
2899 		if (!strcmp(value, "cache"))
2900 			rec->no_buildid_cache = false;
2901 		else if (!strcmp(value, "no-cache"))
2902 			rec->no_buildid_cache = true;
2903 		else if (!strcmp(value, "skip"))
2904 			rec->no_buildid = true;
2905 		else if (!strcmp(value, "mmap"))
2906 			rec->buildid_mmap = true;
2907 		else
2908 			return -1;
2909 		return 0;
2910 	}
2911 	if (!strcmp(var, "record.call-graph")) {
2912 		var = "call-graph.record-mode";
2913 		return perf_default_config(var, value, cb);
2914 	}
2915 #ifdef HAVE_AIO_SUPPORT
2916 	if (!strcmp(var, "record.aio")) {
2917 		rec->opts.nr_cblocks = strtol(value, NULL, 0);
2918 		if (!rec->opts.nr_cblocks)
2919 			rec->opts.nr_cblocks = nr_cblocks_default;
2920 	}
2921 #endif
2922 	if (!strcmp(var, "record.debuginfod")) {
2923 		rec->debuginfod.urls = strdup(value);
2924 		if (!rec->debuginfod.urls)
2925 			return -ENOMEM;
2926 		rec->debuginfod.set = true;
2927 	}
2928 
2929 	return 0;
2930 }
2931 
2932 static int record__parse_event_enable_time(const struct option *opt, const char *str, int unset)
2933 {
2934 	struct record *rec = (struct record *)opt->value;
2935 
2936 	return evlist__parse_event_enable_time(rec->evlist, &rec->opts, str, unset);
2937 }
2938 
2939 static int record__parse_affinity(const struct option *opt, const char *str, int unset)
2940 {
2941 	struct record_opts *opts = (struct record_opts *)opt->value;
2942 
2943 	if (unset || !str)
2944 		return 0;
2945 
2946 	if (!strcasecmp(str, "node"))
2947 		opts->affinity = PERF_AFFINITY_NODE;
2948 	else if (!strcasecmp(str, "cpu"))
2949 		opts->affinity = PERF_AFFINITY_CPU;
2950 
2951 	return 0;
2952 }
2953 
2954 static int record__mmap_cpu_mask_alloc(struct mmap_cpu_mask *mask, int nr_bits)
2955 {
2956 	mask->nbits = nr_bits;
2957 	mask->bits = bitmap_zalloc(mask->nbits);
2958 	if (!mask->bits)
2959 		return -ENOMEM;
2960 
2961 	return 0;
2962 }
2963 
/*
 * Release the bitmap owned by @mask and reset its bit count.
 * Note that mask->bits itself is not cleared here.
 */
static void record__mmap_cpu_mask_free(struct mmap_cpu_mask *mask)
{
	bitmap_free(mask->bits);
	mask->nbits = 0;
}
2969 
2970 static int record__thread_mask_alloc(struct thread_mask *mask, int nr_bits)
2971 {
2972 	int ret;
2973 
2974 	ret = record__mmap_cpu_mask_alloc(&mask->maps, nr_bits);
2975 	if (ret) {
2976 		mask->affinity.bits = NULL;
2977 		return ret;
2978 	}
2979 
2980 	ret = record__mmap_cpu_mask_alloc(&mask->affinity, nr_bits);
2981 	if (ret) {
2982 		record__mmap_cpu_mask_free(&mask->maps);
2983 		mask->maps.bits = NULL;
2984 	}
2985 
2986 	return ret;
2987 }
2988 
/* Free both bitmaps (maps and affinity) of a thread mask. */
static void record__thread_mask_free(struct thread_mask *mask)
{
	record__mmap_cpu_mask_free(&mask->maps);
	record__mmap_cpu_mask_free(&mask->affinity);
}
2994 
2995 static int record__parse_threads(const struct option *opt, const char *str, int unset)
2996 {
2997 	int s;
2998 	struct record_opts *opts = opt->value;
2999 
3000 	if (unset || !str || !strlen(str)) {
3001 		opts->threads_spec = THREAD_SPEC__CPU;
3002 	} else {
3003 		for (s = 1; s < THREAD_SPEC__MAX; s++) {
3004 			if (s == THREAD_SPEC__USER) {
3005 				opts->threads_user_spec = strdup(str);
3006 				if (!opts->threads_user_spec)
3007 					return -ENOMEM;
3008 				opts->threads_spec = THREAD_SPEC__USER;
3009 				break;
3010 			}
3011 			if (!strncasecmp(str, thread_spec_tags[s], strlen(thread_spec_tags[s]))) {
3012 				opts->threads_spec = s;
3013 				break;
3014 			}
3015 		}
3016 	}
3017 
3018 	if (opts->threads_spec == THREAD_SPEC__USER)
3019 		pr_debug("threads_spec: %s\n", opts->threads_user_spec);
3020 	else
3021 		pr_debug("threads_spec: %s\n", thread_spec_tags[opts->threads_spec]);
3022 
3023 	return 0;
3024 }
3025 
3026 static int parse_output_max_size(const struct option *opt,
3027 				 const char *str, int unset)
3028 {
3029 	unsigned long *s = (unsigned long *)opt->value;
3030 	static struct parse_tag tags_size[] = {
3031 		{ .tag  = 'B', .mult = 1       },
3032 		{ .tag  = 'K', .mult = 1 << 10 },
3033 		{ .tag  = 'M', .mult = 1 << 20 },
3034 		{ .tag  = 'G', .mult = 1 << 30 },
3035 		{ .tag  = 0 },
3036 	};
3037 	unsigned long val;
3038 
3039 	if (unset) {
3040 		*s = 0;
3041 		return 0;
3042 	}
3043 
3044 	val = parse_tag_value(str, tags_size);
3045 	if (val != (unsigned long) -1) {
3046 		*s = val;
3047 		return 0;
3048 	}
3049 
3050 	return -1;
3051 }
3052 
3053 static int record__parse_mmap_pages(const struct option *opt,
3054 				    const char *str,
3055 				    int unset __maybe_unused)
3056 {
3057 	struct record_opts *opts = opt->value;
3058 	char *s, *p;
3059 	unsigned int mmap_pages;
3060 	int ret;
3061 
3062 	if (!str)
3063 		return -EINVAL;
3064 
3065 	s = strdup(str);
3066 	if (!s)
3067 		return -ENOMEM;
3068 
3069 	p = strchr(s, ',');
3070 	if (p)
3071 		*p = '\0';
3072 
3073 	if (*s) {
3074 		ret = __evlist__parse_mmap_pages(&mmap_pages, s);
3075 		if (ret)
3076 			goto out_free;
3077 		opts->mmap_pages = mmap_pages;
3078 	}
3079 
3080 	if (!p) {
3081 		ret = 0;
3082 		goto out_free;
3083 	}
3084 
3085 	ret = __evlist__parse_mmap_pages(&mmap_pages, p + 1);
3086 	if (ret)
3087 		goto out_free;
3088 
3089 	opts->auxtrace_mmap_pages = mmap_pages;
3090 
3091 out_free:
3092 	free(s);
3093 	return ret;
3094 }
3095 
3096 void __weak arch__add_leaf_frame_record_opts(struct record_opts *opts __maybe_unused)
3097 {
3098 }
3099 
3100 static int parse_control_option(const struct option *opt,
3101 				const char *str,
3102 				int unset __maybe_unused)
3103 {
3104 	struct record_opts *opts = opt->value;
3105 
3106 	return evlist__parse_control(str, &opts->ctl_fd, &opts->ctl_fd_ack, &opts->ctl_fd_close);
3107 }
3108 
3109 static void switch_output_size_warn(struct record *rec)
3110 {
3111 	u64 wakeup_size = evlist__mmap_size(rec->opts.mmap_pages);
3112 	struct switch_output *s = &rec->switch_output;
3113 
3114 	wakeup_size /= 2;
3115 
3116 	if (s->size < wakeup_size) {
3117 		char buf[100];
3118 
3119 		unit_number__scnprintf(buf, sizeof(buf), wakeup_size);
3120 		pr_warning("WARNING: switch-output data size lower than "
3121 			   "wakeup kernel buffer size (%s) "
3122 			   "expect bigger perf.data sizes\n", buf);
3123 	}
3124 }
3125 
3126 static int switch_output_setup(struct record *rec)
3127 {
3128 	struct switch_output *s = &rec->switch_output;
3129 	static struct parse_tag tags_size[] = {
3130 		{ .tag  = 'B', .mult = 1       },
3131 		{ .tag  = 'K', .mult = 1 << 10 },
3132 		{ .tag  = 'M', .mult = 1 << 20 },
3133 		{ .tag  = 'G', .mult = 1 << 30 },
3134 		{ .tag  = 0 },
3135 	};
3136 	static struct parse_tag tags_time[] = {
3137 		{ .tag  = 's', .mult = 1        },
3138 		{ .tag  = 'm', .mult = 60       },
3139 		{ .tag  = 'h', .mult = 60*60    },
3140 		{ .tag  = 'd', .mult = 60*60*24 },
3141 		{ .tag  = 0 },
3142 	};
3143 	unsigned long val;
3144 
3145 	/*
3146 	 * If we're using --switch-output-events, then we imply its
3147 	 * --switch-output=signal, as we'll send a SIGUSR2 from the side band
3148 	 *  thread to its parent.
3149 	 */
3150 	if (rec->switch_output_event_set) {
3151 		if (record__threads_enabled(rec)) {
3152 			pr_warning("WARNING: --switch-output-event option is not available in parallel streaming mode.\n");
3153 			return 0;
3154 		}
3155 		goto do_signal;
3156 	}
3157 
3158 	if (!s->set)
3159 		return 0;
3160 
3161 	if (record__threads_enabled(rec)) {
3162 		pr_warning("WARNING: --switch-output option is not available in parallel streaming mode.\n");
3163 		return 0;
3164 	}
3165 
3166 	if (!strcmp(s->str, "signal")) {
3167 do_signal:
3168 		s->signal = true;
3169 		pr_debug("switch-output with SIGUSR2 signal\n");
3170 		goto enabled;
3171 	}
3172 
3173 	val = parse_tag_value(s->str, tags_size);
3174 	if (val != (unsigned long) -1) {
3175 		s->size = val;
3176 		pr_debug("switch-output with %s size threshold\n", s->str);
3177 		goto enabled;
3178 	}
3179 
3180 	val = parse_tag_value(s->str, tags_time);
3181 	if (val != (unsigned long) -1) {
3182 		s->time = val;
3183 		pr_debug("switch-output with %s time threshold (%lu seconds)\n",
3184 			 s->str, s->time);
3185 		goto enabled;
3186 	}
3187 
3188 	return -1;
3189 
3190 enabled:
3191 	rec->timestamp_filename = true;
3192 	s->enabled              = true;
3193 
3194 	if (s->size && !rec->opts.no_buffering)
3195 		switch_output_size_warn(rec);
3196 
3197 	return 0;
3198 }
3199 
/*
 * Usage synopsis strings for 'perf record', NULL terminated. They are handed
 * to the option parser and to the usage helpers through record_usage.
 */
static const char * const __record_usage[] = {
	"perf record [<options>] [<command>]",
	"perf record [<options>] -- <command> [<options>]",
	NULL,
};

/* Non-static handle on the usage table above. */
const char * const *record_usage = __record_usage;
3206 
3207 static int build_id__process_mmap(struct perf_tool *tool, union perf_event *event,
3208 				  struct perf_sample *sample, struct machine *machine)
3209 {
3210 	/*
3211 	 * We already have the kernel maps, put in place via perf_session__create_kernel_maps()
3212 	 * no need to add them twice.
3213 	 */
3214 	if (!(event->header.misc & PERF_RECORD_MISC_USER))
3215 		return 0;
3216 	return perf_event__process_mmap(tool, event, sample, machine);
3217 }
3218 
3219 static int build_id__process_mmap2(struct perf_tool *tool, union perf_event *event,
3220 				   struct perf_sample *sample, struct machine *machine)
3221 {
3222 	/*
3223 	 * We already have the kernel maps, put in place via perf_session__create_kernel_maps()
3224 	 * no need to add them twice.
3225 	 */
3226 	if (!(event->header.misc & PERF_RECORD_MISC_USER))
3227 		return 0;
3228 
3229 	return perf_event__process_mmap2(tool, event, sample, machine);
3230 }
3231 
3232 static int process_timestamp_boundary(struct perf_tool *tool,
3233 				      union perf_event *event __maybe_unused,
3234 				      struct perf_sample *sample,
3235 				      struct machine *machine __maybe_unused)
3236 {
3237 	struct record *rec = container_of(tool, struct record, tool);
3238 
3239 	set_timestamp_boundary(rec, sample->time);
3240 	return 0;
3241 }
3242 
3243 static int parse_record_synth_option(const struct option *opt,
3244 				     const char *str,
3245 				     int unset __maybe_unused)
3246 {
3247 	struct record_opts *opts = opt->value;
3248 	char *p = strdup(str);
3249 
3250 	if (p == NULL)
3251 		return -1;
3252 
3253 	opts->synth = parse_synth_opt(p);
3254 	free(p);
3255 
3256 	if (opts->synth < 0) {
3257 		pr_err("Invalid synth option: %s\n", str);
3258 		return -1;
3259 	}
3260 	return 0;
3261 }
3262 
3263 /*
3264  * XXX Ideally would be local to cmd_record() and passed to a record__new
3265  * because we need to have access to it in record__exit, that is called
3266  * after cmd_record() exits, but since record_options need to be accessible to
3267  * builtin-script, leave it here.
3268  *
3269  * At least we don't ouch it in all the other functions here directly.
3270  *
3271  * Just say no to tons of global variables, sigh.
3272  */
3273 static struct record record = {
3274 	.opts = {
3275 		.sample_time	     = true,
3276 		.mmap_pages	     = UINT_MAX,
3277 		.user_freq	     = UINT_MAX,
3278 		.user_interval	     = ULLONG_MAX,
3279 		.freq		     = 4000,
3280 		.target		     = {
3281 			.uses_mmap   = true,
3282 			.default_per_cpu = true,
3283 		},
3284 		.mmap_flush          = MMAP_FLUSH_DEFAULT,
3285 		.nr_threads_synthesize = 1,
3286 		.ctl_fd              = -1,
3287 		.ctl_fd_ack          = -1,
3288 		.synth               = PERF_SYNTH_ALL,
3289 	},
3290 	.tool = {
3291 		.sample		= process_sample_event,
3292 		.fork		= perf_event__process_fork,
3293 		.exit		= perf_event__process_exit,
3294 		.comm		= perf_event__process_comm,
3295 		.namespaces	= perf_event__process_namespaces,
3296 		.mmap		= build_id__process_mmap,
3297 		.mmap2		= build_id__process_mmap2,
3298 		.itrace_start	= process_timestamp_boundary,
3299 		.aux		= process_timestamp_boundary,
3300 		.ordered_events	= true,
3301 	},
3302 };
3303 
3304 const char record_callchain_help[] = CALLCHAIN_RECORD_HELP
3305 	"\n\t\t\t\tDefault: fp";
3306 
3307 static bool dry_run;
3308 
3309 static struct parse_events_option_args parse_events_option_args = {
3310 	.evlistp = &record.evlist,
3311 };
3312 
3313 static struct parse_events_option_args switch_output_parse_events_option_args = {
3314 	.evlistp = &record.sb_evlist,
3315 };
3316 
3317 /*
3318  * XXX Will stay a global variable till we fix builtin-script.c to stop messing
3319  * with it and switch to use the library functions in perf_evlist that came
3320  * from builtin-record.c, i.e. use record_opts,
3321  * evlist__prepare_workload, etc instead of fork+exec'in 'perf record',
3322  * using pipes, etc.
3323  */
3324 static struct option __record_options[] = {
3325 	OPT_CALLBACK('e', "event", &parse_events_option_args, "event",
3326 		     "event selector. use 'perf list' to list available events",
3327 		     parse_events_option),
3328 	OPT_CALLBACK(0, "filter", &record.evlist, "filter",
3329 		     "event filter", parse_filter),
3330 	OPT_CALLBACK_NOOPT(0, "exclude-perf", &record.evlist,
3331 			   NULL, "don't record events from perf itself",
3332 			   exclude_perf),
3333 	OPT_STRING('p', "pid", &record.opts.target.pid, "pid",
3334 		    "record events on existing process id"),
3335 	OPT_STRING('t', "tid", &record.opts.target.tid, "tid",
3336 		    "record events on existing thread id"),
3337 	OPT_INTEGER('r', "realtime", &record.realtime_prio,
3338 		    "collect data with this RT SCHED_FIFO priority"),
3339 	OPT_BOOLEAN(0, "no-buffering", &record.opts.no_buffering,
3340 		    "collect data without buffering"),
3341 	OPT_BOOLEAN('R', "raw-samples", &record.opts.raw_samples,
3342 		    "collect raw sample records from all opened counters"),
3343 	OPT_BOOLEAN('a', "all-cpus", &record.opts.target.system_wide,
3344 			    "system-wide collection from all CPUs"),
3345 	OPT_STRING('C', "cpu", &record.opts.target.cpu_list, "cpu",
3346 		    "list of cpus to monitor"),
3347 	OPT_U64('c', "count", &record.opts.user_interval, "event period to sample"),
3348 	OPT_STRING('o', "output", &record.data.path, "file",
3349 		    "output file name"),
3350 	OPT_BOOLEAN_SET('i', "no-inherit", &record.opts.no_inherit,
3351 			&record.opts.no_inherit_set,
3352 			"child tasks do not inherit counters"),
3353 	OPT_BOOLEAN(0, "tail-synthesize", &record.opts.tail_synthesize,
3354 		    "synthesize non-sample events at the end of output"),
3355 	OPT_BOOLEAN(0, "overwrite", &record.opts.overwrite, "use overwrite mode"),
3356 	OPT_BOOLEAN(0, "no-bpf-event", &record.opts.no_bpf_event, "do not record bpf events"),
3357 	OPT_BOOLEAN(0, "strict-freq", &record.opts.strict_freq,
3358 		    "Fail if the specified frequency can't be used"),
3359 	OPT_CALLBACK('F', "freq", &record.opts, "freq or 'max'",
3360 		     "profile at this frequency",
3361 		      record__parse_freq),
3362 	OPT_CALLBACK('m', "mmap-pages", &record.opts, "pages[,pages]",
3363 		     "number of mmap data pages and AUX area tracing mmap pages",
3364 		     record__parse_mmap_pages),
3365 	OPT_CALLBACK(0, "mmap-flush", &record.opts, "number",
3366 		     "Minimal number of bytes that is extracted from mmap data pages (default: 1)",
3367 		     record__mmap_flush_parse),
3368 	OPT_CALLBACK_NOOPT('g', NULL, &callchain_param,
3369 			   NULL, "enables call-graph recording" ,
3370 			   &record_callchain_opt),
3371 	OPT_CALLBACK(0, "call-graph", &record.opts,
3372 		     "record_mode[,record_size]", record_callchain_help,
3373 		     &record_parse_callchain_opt),
3374 	OPT_INCR('v', "verbose", &verbose,
3375 		    "be more verbose (show counter open errors, etc)"),
3376 	OPT_BOOLEAN('q', "quiet", &quiet, "don't print any warnings or messages"),
3377 	OPT_BOOLEAN('s', "stat", &record.opts.inherit_stat,
3378 		    "per thread counts"),
3379 	OPT_BOOLEAN('d', "data", &record.opts.sample_address, "Record the sample addresses"),
3380 	OPT_BOOLEAN(0, "phys-data", &record.opts.sample_phys_addr,
3381 		    "Record the sample physical addresses"),
3382 	OPT_BOOLEAN(0, "data-page-size", &record.opts.sample_data_page_size,
3383 		    "Record the sampled data address data page size"),
3384 	OPT_BOOLEAN(0, "code-page-size", &record.opts.sample_code_page_size,
3385 		    "Record the sampled code address (ip) page size"),
3386 	OPT_BOOLEAN(0, "sample-cpu", &record.opts.sample_cpu, "Record the sample cpu"),
3387 	OPT_BOOLEAN(0, "sample-identifier", &record.opts.sample_identifier,
3388 		    "Record the sample identifier"),
3389 	OPT_BOOLEAN_SET('T', "timestamp", &record.opts.sample_time,
3390 			&record.opts.sample_time_set,
3391 			"Record the sample timestamps"),
3392 	OPT_BOOLEAN_SET('P', "period", &record.opts.period, &record.opts.period_set,
3393 			"Record the sample period"),
3394 	OPT_BOOLEAN('n', "no-samples", &record.opts.no_samples,
3395 		    "don't sample"),
3396 	OPT_BOOLEAN_SET('N', "no-buildid-cache", &record.no_buildid_cache,
3397 			&record.no_buildid_cache_set,
3398 			"do not update the buildid cache"),
3399 	OPT_BOOLEAN_SET('B', "no-buildid", &record.no_buildid,
3400 			&record.no_buildid_set,
3401 			"do not collect buildids in perf.data"),
3402 	OPT_CALLBACK('G', "cgroup", &record.evlist, "name",
3403 		     "monitor event in cgroup name only",
3404 		     parse_cgroups),
3405 	OPT_CALLBACK('D', "delay", &record, "ms",
3406 		     "ms to wait before starting measurement after program start (-1: start with events disabled), "
3407 		     "or ranges of time to enable events e.g. '-D 10-20,30-40'",
3408 		     record__parse_event_enable_time),
3409 	OPT_BOOLEAN(0, "kcore", &record.opts.kcore, "copy /proc/kcore"),
3410 	OPT_STRING('u', "uid", &record.opts.target.uid_str, "user",
3411 		   "user to profile"),
3412 
3413 	OPT_CALLBACK_NOOPT('b', "branch-any", &record.opts.branch_stack,
3414 		     "branch any", "sample any taken branches",
3415 		     parse_branch_stack),
3416 
3417 	OPT_CALLBACK('j', "branch-filter", &record.opts.branch_stack,
3418 		     "branch filter mask", "branch stack filter modes",
3419 		     parse_branch_stack),
3420 	OPT_BOOLEAN('W', "weight", &record.opts.sample_weight,
3421 		    "sample by weight (on special events only)"),
3422 	OPT_BOOLEAN(0, "transaction", &record.opts.sample_transaction,
3423 		    "sample transaction flags (special events only)"),
3424 	OPT_BOOLEAN(0, "per-thread", &record.opts.target.per_thread,
3425 		    "use per-thread mmaps"),
3426 	OPT_CALLBACK_OPTARG('I', "intr-regs", &record.opts.sample_intr_regs, NULL, "any register",
3427 		    "sample selected machine registers on interrupt,"
3428 		    " use '-I?' to list register names", parse_intr_regs),
3429 	OPT_CALLBACK_OPTARG(0, "user-regs", &record.opts.sample_user_regs, NULL, "any register",
3430 		    "sample selected machine registers on interrupt,"
3431 		    " use '--user-regs=?' to list register names", parse_user_regs),
3432 	OPT_BOOLEAN(0, "running-time", &record.opts.running_time,
3433 		    "Record running/enabled time of read (:S) events"),
3434 	OPT_CALLBACK('k', "clockid", &record.opts,
3435 	"clockid", "clockid to use for events, see clock_gettime()",
3436 	parse_clockid),
3437 	OPT_STRING_OPTARG('S', "snapshot", &record.opts.auxtrace_snapshot_opts,
3438 			  "opts", "AUX area tracing Snapshot Mode", ""),
3439 	OPT_STRING_OPTARG(0, "aux-sample", &record.opts.auxtrace_sample_opts,
3440 			  "opts", "sample AUX area", ""),
3441 	OPT_UINTEGER(0, "proc-map-timeout", &proc_map_timeout,
3442 			"per thread proc mmap processing timeout in ms"),
3443 	OPT_BOOLEAN(0, "namespaces", &record.opts.record_namespaces,
3444 		    "Record namespaces events"),
3445 	OPT_BOOLEAN(0, "all-cgroups", &record.opts.record_cgroup,
3446 		    "Record cgroup events"),
3447 	OPT_BOOLEAN_SET(0, "switch-events", &record.opts.record_switch_events,
3448 			&record.opts.record_switch_events_set,
3449 			"Record context switch events"),
3450 	OPT_BOOLEAN_FLAG(0, "all-kernel", &record.opts.all_kernel,
3451 			 "Configure all used events to run in kernel space.",
3452 			 PARSE_OPT_EXCLUSIVE),
3453 	OPT_BOOLEAN_FLAG(0, "all-user", &record.opts.all_user,
3454 			 "Configure all used events to run in user space.",
3455 			 PARSE_OPT_EXCLUSIVE),
3456 	OPT_BOOLEAN(0, "kernel-callchains", &record.opts.kernel_callchains,
3457 		    "collect kernel callchains"),
3458 	OPT_BOOLEAN(0, "user-callchains", &record.opts.user_callchains,
3459 		    "collect user callchains"),
3460 	OPT_STRING(0, "vmlinux", &symbol_conf.vmlinux_name,
3461 		   "file", "vmlinux pathname"),
3462 	OPT_BOOLEAN(0, "buildid-all", &record.buildid_all,
3463 		    "Record build-id of all DSOs regardless of hits"),
3464 	OPT_BOOLEAN(0, "buildid-mmap", &record.buildid_mmap,
3465 		    "Record build-id in map events"),
3466 	OPT_BOOLEAN(0, "timestamp-filename", &record.timestamp_filename,
3467 		    "append timestamp to output filename"),
3468 	OPT_BOOLEAN(0, "timestamp-boundary", &record.timestamp_boundary,
3469 		    "Record timestamp boundary (time of first/last samples)"),
3470 	OPT_STRING_OPTARG_SET(0, "switch-output", &record.switch_output.str,
3471 			  &record.switch_output.set, "signal or size[BKMG] or time[smhd]",
3472 			  "Switch output when receiving SIGUSR2 (signal) or cross a size or time threshold",
3473 			  "signal"),
3474 	OPT_CALLBACK_SET(0, "switch-output-event", &switch_output_parse_events_option_args,
3475 			 &record.switch_output_event_set, "switch output event",
3476 			 "switch output event selector. use 'perf list' to list available events",
3477 			 parse_events_option_new_evlist),
3478 	OPT_INTEGER(0, "switch-max-files", &record.switch_output.num_files,
3479 		   "Limit number of switch output generated files"),
3480 	OPT_BOOLEAN(0, "dry-run", &dry_run,
3481 		    "Parse options then exit"),
3482 #ifdef HAVE_AIO_SUPPORT
3483 	OPT_CALLBACK_OPTARG(0, "aio", &record.opts,
3484 		     &nr_cblocks_default, "n", "Use <n> control blocks in asynchronous trace writing mode (default: 1, max: 4)",
3485 		     record__aio_parse),
3486 #endif
3487 	OPT_CALLBACK(0, "affinity", &record.opts, "node|cpu",
3488 		     "Set affinity mask of trace reading thread to NUMA node cpu mask or cpu of processed mmap buffer",
3489 		     record__parse_affinity),
3490 #ifdef HAVE_ZSTD_SUPPORT
3491 	OPT_CALLBACK_OPTARG('z', "compression-level", &record.opts, &comp_level_default, "n",
3492 			    "Compress records using specified level (default: 1 - fastest compression, 22 - greatest compression)",
3493 			    record__parse_comp_level),
3494 #endif
3495 	OPT_CALLBACK(0, "max-size", &record.output_max_size,
3496 		     "size", "Limit the maximum size of the output file", parse_output_max_size),
3497 	OPT_UINTEGER(0, "num-thread-synthesize",
3498 		     &record.opts.nr_threads_synthesize,
3499 		     "number of threads to run for event synthesis"),
3500 #ifdef HAVE_LIBPFM
3501 	OPT_CALLBACK(0, "pfm-events", &record.evlist, "event",
3502 		"libpfm4 event selector. use 'perf list' to list available events",
3503 		parse_libpfm_events_option),
3504 #endif
3505 	OPT_CALLBACK(0, "control", &record.opts, "fd:ctl-fd[,ack-fd] or fifo:ctl-fifo[,ack-fifo]",
3506 		     "Listen on ctl-fd descriptor for command to control measurement ('enable': enable events, 'disable': disable events,\n"
3507 		     "\t\t\t  'snapshot': AUX area tracing snapshot).\n"
3508 		     "\t\t\t  Optionally send control command completion ('ack\\n') to ack-fd descriptor.\n"
3509 		     "\t\t\t  Alternatively, ctl-fifo / ack-fifo will be opened and used as ctl-fd / ack-fd.",
3510 		      parse_control_option),
3511 	OPT_CALLBACK(0, "synth", &record.opts, "no|all|task|mmap|cgroup",
3512 		     "Fine-tune event synthesis: default=all", parse_record_synth_option),
3513 	OPT_STRING_OPTARG_SET(0, "debuginfod", &record.debuginfod.urls,
3514 			  &record.debuginfod.set, "debuginfod urls",
3515 			  "Enable debuginfod data retrieval from DEBUGINFOD_URLS or specified urls",
3516 			  "system"),
3517 	OPT_CALLBACK_OPTARG(0, "threads", &record.opts, NULL, "spec",
3518 			    "write collected trace data into several data files using parallel threads",
3519 			    record__parse_threads),
3520 	OPT_BOOLEAN(0, "off-cpu", &record.off_cpu, "Enable off-cpu analysis"),
3521 	OPT_END()
3522 };
3523 
3524 struct option *record_options = __record_options;
3525 
3526 static int record__mmap_cpu_mask_init(struct mmap_cpu_mask *mask, struct perf_cpu_map *cpus)
3527 {
3528 	struct perf_cpu cpu;
3529 	int idx;
3530 
3531 	if (cpu_map__is_dummy(cpus))
3532 		return 0;
3533 
3534 	perf_cpu_map__for_each_cpu(cpu, idx, cpus) {
3535 		if (cpu.cpu == -1)
3536 			continue;
3537 		/* Return ENODEV is input cpu is greater than max cpu */
3538 		if ((unsigned long)cpu.cpu > mask->nbits)
3539 			return -ENODEV;
3540 		__set_bit(cpu.cpu, mask->bits);
3541 	}
3542 
3543 	return 0;
3544 }
3545 
3546 static int record__mmap_cpu_mask_init_spec(struct mmap_cpu_mask *mask, const char *mask_spec)
3547 {
3548 	struct perf_cpu_map *cpus;
3549 
3550 	cpus = perf_cpu_map__new(mask_spec);
3551 	if (!cpus)
3552 		return -ENOMEM;
3553 
3554 	bitmap_zero(mask->bits, mask->nbits);
3555 	if (record__mmap_cpu_mask_init(mask, cpus))
3556 		return -ENODEV;
3557 
3558 	perf_cpu_map__put(cpus);
3559 
3560 	return 0;
3561 }
3562 
3563 static void record__free_thread_masks(struct record *rec, int nr_threads)
3564 {
3565 	int t;
3566 
3567 	if (rec->thread_masks)
3568 		for (t = 0; t < nr_threads; t++)
3569 			record__thread_mask_free(&rec->thread_masks[t]);
3570 
3571 	zfree(&rec->thread_masks);
3572 }
3573 
3574 static int record__alloc_thread_masks(struct record *rec, int nr_threads, int nr_bits)
3575 {
3576 	int t, ret;
3577 
3578 	rec->thread_masks = zalloc(nr_threads * sizeof(*(rec->thread_masks)));
3579 	if (!rec->thread_masks) {
3580 		pr_err("Failed to allocate thread masks\n");
3581 		return -ENOMEM;
3582 	}
3583 
3584 	for (t = 0; t < nr_threads; t++) {
3585 		ret = record__thread_mask_alloc(&rec->thread_masks[t], nr_bits);
3586 		if (ret) {
3587 			pr_err("Failed to allocate thread masks[%d]\n", t);
3588 			goto out_free;
3589 		}
3590 	}
3591 
3592 	return 0;
3593 
3594 out_free:
3595 	record__free_thread_masks(rec, nr_threads);
3596 
3597 	return ret;
3598 }
3599 
3600 static int record__init_thread_cpu_masks(struct record *rec, struct perf_cpu_map *cpus)
3601 {
3602 	int t, ret, nr_cpus = perf_cpu_map__nr(cpus);
3603 
3604 	ret = record__alloc_thread_masks(rec, nr_cpus, cpu__max_cpu().cpu);
3605 	if (ret)
3606 		return ret;
3607 
3608 	rec->nr_threads = nr_cpus;
3609 	pr_debug("nr_threads: %d\n", rec->nr_threads);
3610 
3611 	for (t = 0; t < rec->nr_threads; t++) {
3612 		__set_bit(perf_cpu_map__cpu(cpus, t).cpu, rec->thread_masks[t].maps.bits);
3613 		__set_bit(perf_cpu_map__cpu(cpus, t).cpu, rec->thread_masks[t].affinity.bits);
3614 		if (verbose > 0) {
3615 			pr_debug("thread_masks[%d]: ", t);
3616 			mmap_cpu_mask__scnprintf(&rec->thread_masks[t].maps, "maps");
3617 			pr_debug("thread_masks[%d]: ", t);
3618 			mmap_cpu_mask__scnprintf(&rec->thread_masks[t].affinity, "affinity");
3619 		}
3620 	}
3621 
3622 	return 0;
3623 }
3624 
3625 static int record__init_thread_masks_spec(struct record *rec, struct perf_cpu_map *cpus,
3626 					  const char **maps_spec, const char **affinity_spec,
3627 					  u32 nr_spec)
3628 {
3629 	u32 s;
3630 	int ret = 0, t = 0;
3631 	struct mmap_cpu_mask cpus_mask;
3632 	struct thread_mask thread_mask, full_mask, *thread_masks;
3633 
3634 	ret = record__mmap_cpu_mask_alloc(&cpus_mask, cpu__max_cpu().cpu);
3635 	if (ret) {
3636 		pr_err("Failed to allocate CPUs mask\n");
3637 		return ret;
3638 	}
3639 
3640 	ret = record__mmap_cpu_mask_init(&cpus_mask, cpus);
3641 	if (ret) {
3642 		pr_err("Failed to init cpu mask\n");
3643 		goto out_free_cpu_mask;
3644 	}
3645 
3646 	ret = record__thread_mask_alloc(&full_mask, cpu__max_cpu().cpu);
3647 	if (ret) {
3648 		pr_err("Failed to allocate full mask\n");
3649 		goto out_free_cpu_mask;
3650 	}
3651 
3652 	ret = record__thread_mask_alloc(&thread_mask, cpu__max_cpu().cpu);
3653 	if (ret) {
3654 		pr_err("Failed to allocate thread mask\n");
3655 		goto out_free_full_and_cpu_masks;
3656 	}
3657 
3658 	for (s = 0; s < nr_spec; s++) {
3659 		ret = record__mmap_cpu_mask_init_spec(&thread_mask.maps, maps_spec[s]);
3660 		if (ret) {
3661 			pr_err("Failed to initialize maps thread mask\n");
3662 			goto out_free;
3663 		}
3664 		ret = record__mmap_cpu_mask_init_spec(&thread_mask.affinity, affinity_spec[s]);
3665 		if (ret) {
3666 			pr_err("Failed to initialize affinity thread mask\n");
3667 			goto out_free;
3668 		}
3669 
3670 		/* ignore invalid CPUs but do not allow empty masks */
3671 		if (!bitmap_and(thread_mask.maps.bits, thread_mask.maps.bits,
3672 				cpus_mask.bits, thread_mask.maps.nbits)) {
3673 			pr_err("Empty maps mask: %s\n", maps_spec[s]);
3674 			ret = -EINVAL;
3675 			goto out_free;
3676 		}
3677 		if (!bitmap_and(thread_mask.affinity.bits, thread_mask.affinity.bits,
3678 				cpus_mask.bits, thread_mask.affinity.nbits)) {
3679 			pr_err("Empty affinity mask: %s\n", affinity_spec[s]);
3680 			ret = -EINVAL;
3681 			goto out_free;
3682 		}
3683 
3684 		/* do not allow intersection with other masks (full_mask) */
3685 		if (bitmap_intersects(thread_mask.maps.bits, full_mask.maps.bits,
3686 				      thread_mask.maps.nbits)) {
3687 			pr_err("Intersecting maps mask: %s\n", maps_spec[s]);
3688 			ret = -EINVAL;
3689 			goto out_free;
3690 		}
3691 		if (bitmap_intersects(thread_mask.affinity.bits, full_mask.affinity.bits,
3692 				      thread_mask.affinity.nbits)) {
3693 			pr_err("Intersecting affinity mask: %s\n", affinity_spec[s]);
3694 			ret = -EINVAL;
3695 			goto out_free;
3696 		}
3697 
3698 		bitmap_or(full_mask.maps.bits, full_mask.maps.bits,
3699 			  thread_mask.maps.bits, full_mask.maps.nbits);
3700 		bitmap_or(full_mask.affinity.bits, full_mask.affinity.bits,
3701 			  thread_mask.affinity.bits, full_mask.maps.nbits);
3702 
3703 		thread_masks = realloc(rec->thread_masks, (t + 1) * sizeof(struct thread_mask));
3704 		if (!thread_masks) {
3705 			pr_err("Failed to reallocate thread masks\n");
3706 			ret = -ENOMEM;
3707 			goto out_free;
3708 		}
3709 		rec->thread_masks = thread_masks;
3710 		rec->thread_masks[t] = thread_mask;
3711 		if (verbose > 0) {
3712 			pr_debug("thread_masks[%d]: ", t);
3713 			mmap_cpu_mask__scnprintf(&rec->thread_masks[t].maps, "maps");
3714 			pr_debug("thread_masks[%d]: ", t);
3715 			mmap_cpu_mask__scnprintf(&rec->thread_masks[t].affinity, "affinity");
3716 		}
3717 		t++;
3718 		ret = record__thread_mask_alloc(&thread_mask, cpu__max_cpu().cpu);
3719 		if (ret) {
3720 			pr_err("Failed to allocate thread mask\n");
3721 			goto out_free_full_and_cpu_masks;
3722 		}
3723 	}
3724 	rec->nr_threads = t;
3725 	pr_debug("nr_threads: %d\n", rec->nr_threads);
3726 	if (!rec->nr_threads)
3727 		ret = -EINVAL;
3728 
3729 out_free:
3730 	record__thread_mask_free(&thread_mask);
3731 out_free_full_and_cpu_masks:
3732 	record__thread_mask_free(&full_mask);
3733 out_free_cpu_mask:
3734 	record__mmap_cpu_mask_free(&cpus_mask);
3735 
3736 	return ret;
3737 }
3738 
3739 static int record__init_thread_core_masks(struct record *rec, struct perf_cpu_map *cpus)
3740 {
3741 	int ret;
3742 	struct cpu_topology *topo;
3743 
3744 	topo = cpu_topology__new();
3745 	if (!topo) {
3746 		pr_err("Failed to allocate CPU topology\n");
3747 		return -ENOMEM;
3748 	}
3749 
3750 	ret = record__init_thread_masks_spec(rec, cpus, topo->core_cpus_list,
3751 					     topo->core_cpus_list, topo->core_cpus_lists);
3752 	cpu_topology__delete(topo);
3753 
3754 	return ret;
3755 }
3756 
3757 static int record__init_thread_package_masks(struct record *rec, struct perf_cpu_map *cpus)
3758 {
3759 	int ret;
3760 	struct cpu_topology *topo;
3761 
3762 	topo = cpu_topology__new();
3763 	if (!topo) {
3764 		pr_err("Failed to allocate CPU topology\n");
3765 		return -ENOMEM;
3766 	}
3767 
3768 	ret = record__init_thread_masks_spec(rec, cpus, topo->package_cpus_list,
3769 					     topo->package_cpus_list, topo->package_cpus_lists);
3770 	cpu_topology__delete(topo);
3771 
3772 	return ret;
3773 }
3774 
3775 static int record__init_thread_numa_masks(struct record *rec, struct perf_cpu_map *cpus)
3776 {
3777 	u32 s;
3778 	int ret;
3779 	const char **spec;
3780 	struct numa_topology *topo;
3781 
3782 	topo = numa_topology__new();
3783 	if (!topo) {
3784 		pr_err("Failed to allocate NUMA topology\n");
3785 		return -ENOMEM;
3786 	}
3787 
3788 	spec = zalloc(topo->nr * sizeof(char *));
3789 	if (!spec) {
3790 		pr_err("Failed to allocate NUMA spec\n");
3791 		ret = -ENOMEM;
3792 		goto out_delete_topo;
3793 	}
3794 	for (s = 0; s < topo->nr; s++)
3795 		spec[s] = topo->nodes[s].cpus;
3796 
3797 	ret = record__init_thread_masks_spec(rec, cpus, spec, spec, topo->nr);
3798 
3799 	zfree(&spec);
3800 
3801 out_delete_topo:
3802 	numa_topology__delete(topo);
3803 
3804 	return ret;
3805 }
3806 
3807 static int record__init_thread_user_masks(struct record *rec, struct perf_cpu_map *cpus)
3808 {
3809 	int t, ret;
3810 	u32 s, nr_spec = 0;
3811 	char **maps_spec = NULL, **affinity_spec = NULL, **tmp_spec;
3812 	char *user_spec, *spec, *spec_ptr, *mask, *mask_ptr, *dup_mask = NULL;
3813 
3814 	for (t = 0, user_spec = (char *)rec->opts.threads_user_spec; ; t++, user_spec = NULL) {
3815 		spec = strtok_r(user_spec, ":", &spec_ptr);
3816 		if (spec == NULL)
3817 			break;
3818 		pr_debug2("threads_spec[%d]: %s\n", t, spec);
3819 		mask = strtok_r(spec, "/", &mask_ptr);
3820 		if (mask == NULL)
3821 			break;
3822 		pr_debug2("  maps mask: %s\n", mask);
3823 		tmp_spec = realloc(maps_spec, (nr_spec + 1) * sizeof(char *));
3824 		if (!tmp_spec) {
3825 			pr_err("Failed to reallocate maps spec\n");
3826 			ret = -ENOMEM;
3827 			goto out_free;
3828 		}
3829 		maps_spec = tmp_spec;
3830 		maps_spec[nr_spec] = dup_mask = strdup(mask);
3831 		if (!maps_spec[nr_spec]) {
3832 			pr_err("Failed to allocate maps spec[%d]\n", nr_spec);
3833 			ret = -ENOMEM;
3834 			goto out_free;
3835 		}
3836 		mask = strtok_r(NULL, "/", &mask_ptr);
3837 		if (mask == NULL) {
3838 			pr_err("Invalid thread maps or affinity specs\n");
3839 			ret = -EINVAL;
3840 			goto out_free;
3841 		}
3842 		pr_debug2("  affinity mask: %s\n", mask);
3843 		tmp_spec = realloc(affinity_spec, (nr_spec + 1) * sizeof(char *));
3844 		if (!tmp_spec) {
3845 			pr_err("Failed to reallocate affinity spec\n");
3846 			ret = -ENOMEM;
3847 			goto out_free;
3848 		}
3849 		affinity_spec = tmp_spec;
3850 		affinity_spec[nr_spec] = strdup(mask);
3851 		if (!affinity_spec[nr_spec]) {
3852 			pr_err("Failed to allocate affinity spec[%d]\n", nr_spec);
3853 			ret = -ENOMEM;
3854 			goto out_free;
3855 		}
3856 		dup_mask = NULL;
3857 		nr_spec++;
3858 	}
3859 
3860 	ret = record__init_thread_masks_spec(rec, cpus, (const char **)maps_spec,
3861 					     (const char **)affinity_spec, nr_spec);
3862 
3863 out_free:
3864 	free(dup_mask);
3865 	for (s = 0; s < nr_spec; s++) {
3866 		if (maps_spec)
3867 			free(maps_spec[s]);
3868 		if (affinity_spec)
3869 			free(affinity_spec[s]);
3870 	}
3871 	free(affinity_spec);
3872 	free(maps_spec);
3873 
3874 	return ret;
3875 }
3876 
3877 static int record__init_thread_default_masks(struct record *rec, struct perf_cpu_map *cpus)
3878 {
3879 	int ret;
3880 
3881 	ret = record__alloc_thread_masks(rec, 1, cpu__max_cpu().cpu);
3882 	if (ret)
3883 		return ret;
3884 
3885 	if (record__mmap_cpu_mask_init(&rec->thread_masks->maps, cpus))
3886 		return -ENODEV;
3887 
3888 	rec->nr_threads = 1;
3889 
3890 	return 0;
3891 }
3892 
3893 static int record__init_thread_masks(struct record *rec)
3894 {
3895 	int ret = 0;
3896 	struct perf_cpu_map *cpus = rec->evlist->core.all_cpus;
3897 
3898 	if (!record__threads_enabled(rec))
3899 		return record__init_thread_default_masks(rec, cpus);
3900 
3901 	if (evlist__per_thread(rec->evlist)) {
3902 		pr_err("--per-thread option is mutually exclusive to parallel streaming mode.\n");
3903 		return -EINVAL;
3904 	}
3905 
3906 	switch (rec->opts.threads_spec) {
3907 	case THREAD_SPEC__CPU:
3908 		ret = record__init_thread_cpu_masks(rec, cpus);
3909 		break;
3910 	case THREAD_SPEC__CORE:
3911 		ret = record__init_thread_core_masks(rec, cpus);
3912 		break;
3913 	case THREAD_SPEC__PACKAGE:
3914 		ret = record__init_thread_package_masks(rec, cpus);
3915 		break;
3916 	case THREAD_SPEC__NUMA:
3917 		ret = record__init_thread_numa_masks(rec, cpus);
3918 		break;
3919 	case THREAD_SPEC__USER:
3920 		ret = record__init_thread_user_masks(rec, cpus);
3921 		break;
3922 	default:
3923 		break;
3924 	}
3925 
3926 	return ret;
3927 }
3928 
3929 int cmd_record(int argc, const char **argv)
3930 {
3931 	int err;
3932 	struct record *rec = &record;
3933 	char errbuf[BUFSIZ];
3934 
3935 	setlocale(LC_ALL, "");
3936 
3937 #ifndef HAVE_BPF_SKEL
3938 # define set_nobuild(s, l, m, c) set_option_nobuild(record_options, s, l, m, c)
3939 	set_nobuild('\0', "off-cpu", "no BUILD_BPF_SKEL=1", true);
3940 # undef set_nobuild
3941 #endif
3942 
3943 	/* Disable eager loading of kernel symbols that adds overhead to perf record. */
3944 	symbol_conf.lazy_load_kernel_maps = true;
3945 	rec->opts.affinity = PERF_AFFINITY_SYS;
3946 
3947 	rec->evlist = evlist__new();
3948 	if (rec->evlist == NULL)
3949 		return -ENOMEM;
3950 
3951 	err = perf_config(perf_record_config, rec);
3952 	if (err)
3953 		return err;
3954 
3955 	argc = parse_options(argc, argv, record_options, record_usage,
3956 			    PARSE_OPT_STOP_AT_NON_OPTION);
3957 	if (quiet)
3958 		perf_quiet_option();
3959 
3960 	err = symbol__validate_sym_arguments();
3961 	if (err)
3962 		return err;
3963 
3964 	perf_debuginfod_setup(&record.debuginfod);
3965 
3966 	/* Make system wide (-a) the default target. */
3967 	if (!argc && target__none(&rec->opts.target))
3968 		rec->opts.target.system_wide = true;
3969 
3970 	if (nr_cgroups && !rec->opts.target.system_wide) {
3971 		usage_with_options_msg(record_usage, record_options,
3972 			"cgroup monitoring only available in system-wide mode");
3973 
3974 	}
3975 
3976 	if (rec->buildid_mmap) {
3977 		if (!perf_can_record_build_id()) {
3978 			pr_err("Failed: no support to record build id in mmap events, update your kernel.\n");
3979 			err = -EINVAL;
3980 			goto out_opts;
3981 		}
3982 		pr_debug("Enabling build id in mmap2 events.\n");
3983 		/* Enable mmap build id synthesizing. */
3984 		symbol_conf.buildid_mmap2 = true;
3985 		/* Enable perf_event_attr::build_id bit. */
3986 		rec->opts.build_id = true;
3987 		/* Disable build id cache. */
3988 		rec->no_buildid = true;
3989 	}
3990 
3991 	if (rec->opts.record_cgroup && !perf_can_record_cgroup()) {
3992 		pr_err("Kernel has no cgroup sampling support.\n");
3993 		err = -EINVAL;
3994 		goto out_opts;
3995 	}
3996 
3997 	if (rec->opts.kcore)
3998 		rec->opts.text_poke = true;
3999 
4000 	if (rec->opts.kcore || record__threads_enabled(rec))
4001 		rec->data.is_dir = true;
4002 
4003 	if (record__threads_enabled(rec)) {
4004 		if (rec->opts.affinity != PERF_AFFINITY_SYS) {
4005 			pr_err("--affinity option is mutually exclusive to parallel streaming mode.\n");
4006 			goto out_opts;
4007 		}
4008 		if (record__aio_enabled(rec)) {
4009 			pr_err("Asynchronous streaming mode (--aio) is mutually exclusive to parallel streaming mode.\n");
4010 			goto out_opts;
4011 		}
4012 	}
4013 
4014 	if (rec->opts.comp_level != 0) {
4015 		pr_debug("Compression enabled, disabling build id collection at the end of the session.\n");
4016 		rec->no_buildid = true;
4017 	}
4018 
4019 	if (rec->opts.record_switch_events &&
4020 	    !perf_can_record_switch_events()) {
4021 		ui__error("kernel does not support recording context switch events\n");
4022 		parse_options_usage(record_usage, record_options, "switch-events", 0);
4023 		err = -EINVAL;
4024 		goto out_opts;
4025 	}
4026 
4027 	if (switch_output_setup(rec)) {
4028 		parse_options_usage(record_usage, record_options, "switch-output", 0);
4029 		err = -EINVAL;
4030 		goto out_opts;
4031 	}
4032 
4033 	if (rec->switch_output.time) {
4034 		signal(SIGALRM, alarm_sig_handler);
4035 		alarm(rec->switch_output.time);
4036 	}
4037 
4038 	if (rec->switch_output.num_files) {
4039 		rec->switch_output.filenames = calloc(sizeof(char *),
4040 						      rec->switch_output.num_files);
4041 		if (!rec->switch_output.filenames) {
4042 			err = -EINVAL;
4043 			goto out_opts;
4044 		}
4045 	}
4046 
4047 	if (rec->timestamp_filename && record__threads_enabled(rec)) {
4048 		rec->timestamp_filename = false;
4049 		pr_warning("WARNING: --timestamp-filename option is not available in parallel streaming mode.\n");
4050 	}
4051 
4052 	/*
4053 	 * Allow aliases to facilitate the lookup of symbols for address
4054 	 * filters. Refer to auxtrace_parse_filters().
4055 	 */
4056 	symbol_conf.allow_aliases = true;
4057 
4058 	symbol__init(NULL);
4059 
4060 	err = record__auxtrace_init(rec);
4061 	if (err)
4062 		goto out;
4063 
4064 	if (dry_run)
4065 		goto out;
4066 
4067 	err = -ENOMEM;
4068 
4069 	if (rec->no_buildid_cache || rec->no_buildid) {
4070 		disable_buildid_cache();
4071 	} else if (rec->switch_output.enabled) {
4072 		/*
4073 		 * In 'perf record --switch-output', disable buildid
4074 		 * generation by default to reduce data file switching
4075 		 * overhead. Still generate buildid if they are required
4076 		 * explicitly using
4077 		 *
4078 		 *  perf record --switch-output --no-no-buildid \
4079 		 *              --no-no-buildid-cache
4080 		 *
4081 		 * Following code equals to:
4082 		 *
4083 		 * if ((rec->no_buildid || !rec->no_buildid_set) &&
4084 		 *     (rec->no_buildid_cache || !rec->no_buildid_cache_set))
4085 		 *         disable_buildid_cache();
4086 		 */
4087 		bool disable = true;
4088 
4089 		if (rec->no_buildid_set && !rec->no_buildid)
4090 			disable = false;
4091 		if (rec->no_buildid_cache_set && !rec->no_buildid_cache)
4092 			disable = false;
4093 		if (disable) {
4094 			rec->no_buildid = true;
4095 			rec->no_buildid_cache = true;
4096 			disable_buildid_cache();
4097 		}
4098 	}
4099 
4100 	if (record.opts.overwrite)
4101 		record.opts.tail_synthesize = true;
4102 
4103 	if (rec->evlist->core.nr_entries == 0) {
4104 		bool can_profile_kernel = perf_event_paranoid_check(1);
4105 
4106 		err = parse_event(rec->evlist, can_profile_kernel ? "cycles:P" : "cycles:Pu");
4107 		if (err)
4108 			goto out;
4109 	}
4110 
4111 	if (rec->opts.target.tid && !rec->opts.no_inherit_set)
4112 		rec->opts.no_inherit = true;
4113 
4114 	err = target__validate(&rec->opts.target);
4115 	if (err) {
4116 		target__strerror(&rec->opts.target, err, errbuf, BUFSIZ);
4117 		ui__warning("%s\n", errbuf);
4118 	}
4119 
4120 	err = target__parse_uid(&rec->opts.target);
4121 	if (err) {
4122 		int saved_errno = errno;
4123 
4124 		target__strerror(&rec->opts.target, err, errbuf, BUFSIZ);
4125 		ui__error("%s", errbuf);
4126 
4127 		err = -saved_errno;
4128 		goto out;
4129 	}
4130 
4131 	/* Enable ignoring missing threads when -u/-p option is defined. */
4132 	rec->opts.ignore_missing_thread = rec->opts.target.uid != UINT_MAX || rec->opts.target.pid;
4133 
4134 	evlist__warn_user_requested_cpus(rec->evlist, rec->opts.target.cpu_list);
4135 
4136 	if (callchain_param.enabled && callchain_param.record_mode == CALLCHAIN_FP)
4137 		arch__add_leaf_frame_record_opts(&rec->opts);
4138 
4139 	err = -ENOMEM;
4140 	if (evlist__create_maps(rec->evlist, &rec->opts.target) < 0) {
4141 		if (rec->opts.target.pid != NULL) {
4142 			pr_err("Couldn't create thread/CPU maps: %s\n",
4143 				errno == ENOENT ? "No such process" : str_error_r(errno, errbuf, sizeof(errbuf)));
4144 			goto out;
4145 		}
4146 		else
4147 			usage_with_options(record_usage, record_options);
4148 	}
4149 
4150 	err = auxtrace_record__options(rec->itr, rec->evlist, &rec->opts);
4151 	if (err)
4152 		goto out;
4153 
4154 	/*
4155 	 * We take all buildids when the file contains
4156 	 * AUX area tracing data because we do not decode the
4157 	 * trace because it would take too long.
4158 	 */
4159 	if (rec->opts.full_auxtrace)
4160 		rec->buildid_all = true;
4161 
4162 	if (rec->opts.text_poke) {
4163 		err = record__config_text_poke(rec->evlist);
4164 		if (err) {
4165 			pr_err("record__config_text_poke failed, error %d\n", err);
4166 			goto out;
4167 		}
4168 	}
4169 
4170 	if (rec->off_cpu) {
4171 		err = record__config_off_cpu(rec);
4172 		if (err) {
4173 			pr_err("record__config_off_cpu failed, error %d\n", err);
4174 			goto out;
4175 		}
4176 	}
4177 
4178 	if (record_opts__config(&rec->opts)) {
4179 		err = -EINVAL;
4180 		goto out;
4181 	}
4182 
4183 	err = record__config_tracking_events(rec);
4184 	if (err) {
4185 		pr_err("record__config_tracking_events failed, error %d\n", err);
4186 		goto out;
4187 	}
4188 
4189 	err = record__init_thread_masks(rec);
4190 	if (err) {
4191 		pr_err("Failed to initialize parallel data streaming masks\n");
4192 		goto out;
4193 	}
4194 
4195 	if (rec->opts.nr_cblocks > nr_cblocks_max)
4196 		rec->opts.nr_cblocks = nr_cblocks_max;
4197 	pr_debug("nr_cblocks: %d\n", rec->opts.nr_cblocks);
4198 
4199 	pr_debug("affinity: %s\n", affinity_tags[rec->opts.affinity]);
4200 	pr_debug("mmap flush: %d\n", rec->opts.mmap_flush);
4201 
4202 	if (rec->opts.comp_level > comp_level_max)
4203 		rec->opts.comp_level = comp_level_max;
4204 	pr_debug("comp level: %d\n", rec->opts.comp_level);
4205 
4206 	err = __cmd_record(&record, argc, argv);
4207 out:
4208 	evlist__delete(rec->evlist);
4209 	symbol__exit();
4210 	auxtrace_record__free(rec->itr);
4211 out_opts:
4212 	record__free_thread_masks(rec, rec->nr_threads);
4213 	rec->nr_threads = 0;
4214 	evlist__close_control(rec->opts.ctl_fd, rec->opts.ctl_fd_ack, &rec->opts.ctl_fd_close);
4215 	return err;
4216 }
4217 
4218 static void snapshot_sig_handler(int sig __maybe_unused)
4219 {
4220 	struct record *rec = &record;
4221 
4222 	hit_auxtrace_snapshot_trigger(rec);
4223 
4224 	if (switch_output_signal(rec))
4225 		trigger_hit(&switch_output_trigger);
4226 }
4227 
4228 static void alarm_sig_handler(int sig __maybe_unused)
4229 {
4230 	struct record *rec = &record;
4231 
4232 	if (switch_output_time(rec))
4233 		trigger_hit(&switch_output_trigger);
4234 }
4235