xref: /openbmc/linux/tools/perf/builtin-record.c (revision 6887314f5356389fc219b8152e951ac084a10ef7)
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * builtin-record.c
4  *
5  * Builtin record command: Record the profile of a workload
6  * (or a CPU, or a PID) into the perf.data output file - for
7  * later analysis via perf report.
8  */
9 #include "builtin.h"
10 
11 #include "util/build-id.h"
12 #include <subcmd/parse-options.h>
13 #include <internal/xyarray.h>
14 #include "util/parse-events.h"
15 #include "util/config.h"
16 
17 #include "util/callchain.h"
18 #include "util/cgroup.h"
19 #include "util/header.h"
20 #include "util/event.h"
21 #include "util/evlist.h"
22 #include "util/evsel.h"
23 #include "util/debug.h"
24 #include "util/mmap.h"
25 #include "util/mutex.h"
26 #include "util/target.h"
27 #include "util/session.h"
28 #include "util/tool.h"
29 #include "util/symbol.h"
30 #include "util/record.h"
31 #include "util/cpumap.h"
32 #include "util/thread_map.h"
33 #include "util/data.h"
34 #include "util/perf_regs.h"
35 #include "util/auxtrace.h"
36 #include "util/tsc.h"
37 #include "util/parse-branch-options.h"
38 #include "util/parse-regs-options.h"
39 #include "util/perf_api_probe.h"
40 #include "util/trigger.h"
41 #include "util/perf-hooks.h"
42 #include "util/cpu-set-sched.h"
43 #include "util/synthetic-events.h"
44 #include "util/time-utils.h"
45 #include "util/units.h"
46 #include "util/bpf-event.h"
47 #include "util/util.h"
48 #include "util/pfm.h"
49 #include "util/pmu.h"
50 #include "util/pmus.h"
51 #include "util/clockid.h"
52 #include "util/off_cpu.h"
53 #include "util/bpf-filter.h"
54 #include "asm/bug.h"
55 #include "perf.h"
56 #include "cputopo.h"
57 
58 #include <errno.h>
59 #include <inttypes.h>
60 #include <locale.h>
61 #include <poll.h>
62 #include <pthread.h>
63 #include <unistd.h>
64 #ifndef HAVE_GETTID
65 #include <syscall.h>
66 #endif
67 #include <sched.h>
68 #include <signal.h>
69 #ifdef HAVE_EVENTFD_SUPPORT
70 #include <sys/eventfd.h>
71 #endif
72 #include <sys/mman.h>
73 #include <sys/wait.h>
74 #include <sys/types.h>
75 #include <sys/stat.h>
76 #include <fcntl.h>
77 #include <linux/err.h>
78 #include <linux/string.h>
79 #include <linux/time64.h>
80 #include <linux/zalloc.h>
81 #include <linux/bitmap.h>
82 #include <sys/time.h>
83 
/*
 * Settings and state for switching the output file during a session.
 * The signal, size and time members are tested by the
 * switch_output_signal/size/time() helpers together with
 * switch_output_trigger.
 */
struct switch_output {
	bool		 enabled;
	bool		 signal;	/* checked by switch_output_signal() */
	unsigned long	 size;		/* compared against rec->bytes_written; 0 turns the check off */
	unsigned long	 time;		/* non-zero enables switch_output_time(); units not visible here */
	const char	*str;
	bool		 set;
	char		 **filenames;	/* NOTE(review): presumably num_files entries with cur_file as cursor - confirm */
	int		 num_files;
	int		 cur_file;
};
95 
/*
 * Per-thread CPU masks. The maps mask is tested per CPU in
 * record__thread_data_init_maps() to pick the mmaps a thread handles.
 */
struct thread_mask {
	struct mmap_cpu_mask	maps;
	struct mmap_cpu_mask	affinity;	/* NOTE(review): presumably the thread's CPU affinity - not used in this part of the file */
};
100 
/* Per-thread recording state; one entry per thread in rec->thread_data[]. */
struct record_thread {
	pid_t			tid;
	struct thread_mask	*mask;		/* points at this thread's entry in rec->thread_masks[] */
	struct {
		int		msg[2];		/* pipe() pair, -1 when closed */
		int		ack[2];		/* pipe() pair, -1 when closed */
	} pipes;
	struct fdarray		pollfd;		/* descriptors duplicated from the evlist pollfd array */
	int			ctlfd_pos;
	int			nr_mmaps;	/* number of entries in maps[] / overwrite_maps[] */
	struct mmap		**maps;		/* pointers into evlist->mmap[], NULL if the evlist has none */
	struct mmap		**overwrite_maps; /* pointers into evlist->overwrite_mmap[], NULL if none */
	struct record		*rec;
	unsigned long long	samples;	/* bumped once per record__pushfn() call */
	unsigned long		waking;
	u64			bytes_written;	/* bytes written by record__write() to per-map files */
	u64			bytes_transferred;
	u64			bytes_compressed;
};
120 
/*
 * Thread-local pointer to the current thread's record_thread; dereferenced
 * by record__write() and record__pushfn(). Where it is assigned is not
 * visible in this part of the file.
 */
static __thread struct record_thread *thread;
122 
/* Message identifiers; THREAD_MSG__MAX sizes thread_msg_tags[]. */
enum thread_msg {
	THREAD_MSG__UNDEFINED = 0,
	THREAD_MSG__READY,
	THREAD_MSG__MAX,
};

/* Printable names, listed in the same order as enum thread_msg. */
static const char *thread_msg_tags[THREAD_MSG__MAX] = {
	"UNDEFINED", "READY"
};
132 
/* Thread specification kinds; THREAD_SPEC__MAX sizes thread_spec_tags[]. */
enum thread_spec {
	THREAD_SPEC__UNDEFINED = 0,
	THREAD_SPEC__CPU,
	THREAD_SPEC__CORE,
	THREAD_SPEC__PACKAGE,
	THREAD_SPEC__NUMA,
	THREAD_SPEC__USER,
	THREAD_SPEC__MAX,
};

/* Names listed in the same order as enum thread_spec. */
static const char *thread_spec_tags[THREAD_SPEC__MAX] = {
	"undefined", "cpu", "core", "package", "numa", "user"
};
146 
/*
 * Pairs the position of a descriptor in the evlist pollfd array with the
 * position of its duplicate in a thread's pollfd array.
 */
struct pollfd_index_map {
	int evlist_pollfd_index;
	int thread_pollfd_index;
};
151 
/*
 * Top-level state of one 'perf record' session. The embedded tool member
 * lets callbacks recover the record with container_of().
 */
struct record {
	struct perf_tool	tool;
	struct record_opts	opts;
	u64			bytes_written;		/* bytes written to the session data file */
	u64			thread_bytes_written;	/* bytes written to per-map files, all threads */
	struct perf_data	data;
	struct auxtrace_record	*itr;
	struct evlist	*evlist;
	struct perf_session	*session;
	struct evlist		*sb_evlist;
	pthread_t		thread_id;
	int			realtime_prio;
	bool			switch_output_event_set;
	bool			no_buildid;
	bool			no_buildid_set;
	bool			no_buildid_cache;
	bool			no_buildid_cache_set;
	bool			buildid_all;
	bool			buildid_mmap;
	bool			timestamp_filename;
	bool			timestamp_boundary;
	bool			off_cpu;
	struct switch_output	switch_output;
	unsigned long long	samples;
	unsigned long		output_max_size;	/* = 0: unlimited */
	struct perf_debuginfod	debuginfod;
	int			nr_threads;		/* number of entries in thread_data[] */
	struct thread_mask	*thread_masks;		/* indexed by thread number */
	struct record_thread	*thread_data;
	struct pollfd_index_map	*index_map;		/* grown by record__map_thread_evlist_pollfd_indexes() */
	size_t			index_map_sz;		/* NOTE(review): presumably the allocated capacity - confirm */
	size_t			index_map_cnt;		/* number of valid index_map[] entries */
};
185 
/* Set to 1 by sig_handler() and by record__write() once the size limit is hit. */
static volatile int done;

/* Read by record__auxtrace_snapshot_exit() to decide whether a snapshot must still be started. */
static volatile int auxtrace_record__snapshot_started;
static DEFINE_TRIGGER(auxtrace_snapshot_trigger);
static DEFINE_TRIGGER(switch_output_trigger);

/* Names for the affinity modes; PERF_AFFINITY_MAX sizes the table. */
static const char *affinity_tags[PERF_AFFINITY_MAX] = {
	"SYS", "NODE", "CPU"
};
195 
#ifndef HAVE_GETTID
/* Fallback gettid() that issues the raw system call. */
static inline pid_t gettid(void)
{
	pid_t tid = (pid_t)syscall(__NR_gettid);

	return tid;
}
#endif
202 
203 static int record__threads_enabled(struct record *rec)
204 {
205 	return rec->opts.threads_spec;
206 }
207 
208 static bool switch_output_signal(struct record *rec)
209 {
210 	return rec->switch_output.signal &&
211 	       trigger_is_ready(&switch_output_trigger);
212 }
213 
214 static bool switch_output_size(struct record *rec)
215 {
216 	return rec->switch_output.size &&
217 	       trigger_is_ready(&switch_output_trigger) &&
218 	       (rec->bytes_written >= rec->switch_output.size);
219 }
220 
221 static bool switch_output_time(struct record *rec)
222 {
223 	return rec->switch_output.time &&
224 	       trigger_is_ready(&switch_output_trigger);
225 }
226 
227 static u64 record__bytes_written(struct record *rec)
228 {
229 	return rec->bytes_written + rec->thread_bytes_written;
230 }
231 
232 static bool record__output_max_size_exceeded(struct record *rec)
233 {
234 	return rec->output_max_size &&
235 	       (record__bytes_written(rec) >= rec->output_max_size);
236 }
237 
238 static int record__write(struct record *rec, struct mmap *map __maybe_unused,
239 			 void *bf, size_t size)
240 {
241 	struct perf_data_file *file = &rec->session->data->file;
242 
243 	if (map && map->file)
244 		file = map->file;
245 
246 	if (perf_data_file__write(file, bf, size) < 0) {
247 		pr_err("failed to write perf data, error: %m\n");
248 		return -1;
249 	}
250 
251 	if (map && map->file) {
252 		thread->bytes_written += size;
253 		rec->thread_bytes_written += size;
254 	} else {
255 		rec->bytes_written += size;
256 	}
257 
258 	if (record__output_max_size_exceeded(rec) && !done) {
259 		fprintf(stderr, "[ perf record: perf size limit reached (%" PRIu64 " KB),"
260 				" stopping session ]\n",
261 				record__bytes_written(rec) >> 10);
262 		done = 1;
263 	}
264 
265 	if (switch_output_size(rec))
266 		trigger_hit(&switch_output_trigger);
267 
268 	return 0;
269 }
270 
/* Forward declarations for helpers used before their definitions. */
static int record__aio_enabled(struct record *rec);
static int record__comp_enabled(struct record *rec);
static size_t zstd_compress(struct perf_session *session, struct mmap *map,
			    void *dst, size_t dst_size, void *src, size_t src_size);
275 
276 #ifdef HAVE_AIO_SUPPORT
/*
 * Fill in @cblock and queue an asynchronous write of @size bytes from @buf
 * to @trace_fd at offset @off. Queuing is retried for as long as aio_write()
 * fails with EAGAIN. On any other failure the control block is marked free
 * (aio_fildes = -1) and the aio_write() result is returned.
 *
 * Returns 0 when the request was queued.
 */
static int record__aio_write(struct aiocb *cblock, int trace_fd,
		void *buf, size_t size, off_t off)
{
	int rc;

	cblock->aio_sigevent.sigev_notify = SIGEV_NONE;
	cblock->aio_fildes = trace_fd;
	cblock->aio_offset = off;
	cblock->aio_buf    = buf;
	cblock->aio_nbytes = size;

	for (;;) {
		rc = aio_write(cblock);
		if (rc == 0)
			break;

		if (errno == EAGAIN)
			continue;

		cblock->aio_fildes = -1;
		pr_err("failed to queue perf data, error: %m\n");
		break;
	}

	return rc;
}
301 
302 static int record__aio_complete(struct mmap *md, struct aiocb *cblock)
303 {
304 	void *rem_buf;
305 	off_t rem_off;
306 	size_t rem_size;
307 	int rc, aio_errno;
308 	ssize_t aio_ret, written;
309 
310 	aio_errno = aio_error(cblock);
311 	if (aio_errno == EINPROGRESS)
312 		return 0;
313 
314 	written = aio_ret = aio_return(cblock);
315 	if (aio_ret < 0) {
316 		if (aio_errno != EINTR)
317 			pr_err("failed to write perf data, error: %m\n");
318 		written = 0;
319 	}
320 
321 	rem_size = cblock->aio_nbytes - written;
322 
323 	if (rem_size == 0) {
324 		cblock->aio_fildes = -1;
325 		/*
326 		 * md->refcount is incremented in record__aio_pushfn() for
327 		 * every aio write request started in record__aio_push() so
328 		 * decrement it because the request is now complete.
329 		 */
330 		perf_mmap__put(&md->core);
331 		rc = 1;
332 	} else {
333 		/*
334 		 * aio write request may require restart with the
335 		 * reminder if the kernel didn't write whole
336 		 * chunk at once.
337 		 */
338 		rem_off = cblock->aio_offset + written;
339 		rem_buf = (void *)(cblock->aio_buf + written);
340 		record__aio_write(cblock, cblock->aio_fildes,
341 				rem_buf, rem_size, rem_off);
342 		rc = 0;
343 	}
344 
345 	return rc;
346 }
347 
348 static int record__aio_sync(struct mmap *md, bool sync_all)
349 {
350 	struct aiocb **aiocb = md->aio.aiocb;
351 	struct aiocb *cblocks = md->aio.cblocks;
352 	struct timespec timeout = { 0, 1000 * 1000  * 1 }; /* 1ms */
353 	int i, do_suspend;
354 
355 	do {
356 		do_suspend = 0;
357 		for (i = 0; i < md->aio.nr_cblocks; ++i) {
358 			if (cblocks[i].aio_fildes == -1 || record__aio_complete(md, &cblocks[i])) {
359 				if (sync_all)
360 					aiocb[i] = NULL;
361 				else
362 					return i;
363 			} else {
364 				/*
365 				 * Started aio write is not complete yet
366 				 * so it has to be waited before the
367 				 * next allocation.
368 				 */
369 				aiocb[i] = &cblocks[i];
370 				do_suspend = 1;
371 			}
372 		}
373 		if (!do_suspend)
374 			return -1;
375 
376 		while (aio_suspend((const struct aiocb **)aiocb, md->aio.nr_cblocks, &timeout)) {
377 			if (!(errno == EAGAIN || errno == EINTR))
378 				pr_err("failed to sync perf data, error: %m\n");
379 		}
380 	} while (1);
381 }
382 
/* Cookie handed to record__aio_pushfn() through perf_mmap__push(). */
struct record_aio {
	struct record	*rec;
	void		*data;	/* destination buffer, one of map->aio.data[] */
	size_t		size;	/* bytes stored in data so far */
};
388 
389 static int record__aio_pushfn(struct mmap *map, void *to, void *buf, size_t size)
390 {
391 	struct record_aio *aio = to;
392 
393 	/*
394 	 * map->core.base data pointed by buf is copied into free map->aio.data[] buffer
395 	 * to release space in the kernel buffer as fast as possible, calling
396 	 * perf_mmap__consume() from perf_mmap__push() function.
397 	 *
398 	 * That lets the kernel to proceed with storing more profiling data into
399 	 * the kernel buffer earlier than other per-cpu kernel buffers are handled.
400 	 *
401 	 * Coping can be done in two steps in case the chunk of profiling data
402 	 * crosses the upper bound of the kernel buffer. In this case we first move
403 	 * part of data from map->start till the upper bound and then the reminder
404 	 * from the beginning of the kernel buffer till the end of the data chunk.
405 	 */
406 
407 	if (record__comp_enabled(aio->rec)) {
408 		size = zstd_compress(aio->rec->session, NULL, aio->data + aio->size,
409 				     mmap__mmap_len(map) - aio->size,
410 				     buf, size);
411 	} else {
412 		memcpy(aio->data + aio->size, buf, size);
413 	}
414 
415 	if (!aio->size) {
416 		/*
417 		 * Increment map->refcount to guard map->aio.data[] buffer
418 		 * from premature deallocation because map object can be
419 		 * released earlier than aio write request started on
420 		 * map->aio.data[] buffer is complete.
421 		 *
422 		 * perf_mmap__put() is done at record__aio_complete()
423 		 * after started aio request completion or at record__aio_push()
424 		 * if the request failed to start.
425 		 */
426 		perf_mmap__get(&map->core);
427 	}
428 
429 	aio->size += size;
430 
431 	return size;
432 }
433 
434 static int record__aio_push(struct record *rec, struct mmap *map, off_t *off)
435 {
436 	int ret, idx;
437 	int trace_fd = rec->session->data->file.fd;
438 	struct record_aio aio = { .rec = rec, .size = 0 };
439 
440 	/*
441 	 * Call record__aio_sync() to wait till map->aio.data[] buffer
442 	 * becomes available after previous aio write operation.
443 	 */
444 
445 	idx = record__aio_sync(map, false);
446 	aio.data = map->aio.data[idx];
447 	ret = perf_mmap__push(map, &aio, record__aio_pushfn);
448 	if (ret != 0) /* ret > 0 - no data, ret < 0 - error */
449 		return ret;
450 
451 	rec->samples++;
452 	ret = record__aio_write(&(map->aio.cblocks[idx]), trace_fd, aio.data, aio.size, *off);
453 	if (!ret) {
454 		*off += aio.size;
455 		rec->bytes_written += aio.size;
456 		if (switch_output_size(rec))
457 			trigger_hit(&switch_output_trigger);
458 	} else {
459 		/*
460 		 * Decrement map->refcount incremented in record__aio_pushfn()
461 		 * back if record__aio_write() operation failed to start, otherwise
462 		 * map->refcount is decremented in record__aio_complete() after
463 		 * aio write operation finishes successfully.
464 		 */
465 		perf_mmap__put(&map->core);
466 	}
467 
468 	return ret;
469 }
470 
/* Returns the current file offset of @trace_fd as reported by lseek(). */
static off_t record__aio_get_pos(int trace_fd)
{
	off_t pos = lseek(trace_fd, 0, SEEK_CUR);

	return pos;
}
475 
/* Moves the file offset of @trace_fd to @pos; the lseek() result is not checked. */
static void record__aio_set_pos(int trace_fd, off_t pos)
{
	lseek(trace_fd, pos, SEEK_SET);
}
480 
481 static void record__aio_mmap_read_sync(struct record *rec)
482 {
483 	int i;
484 	struct evlist *evlist = rec->evlist;
485 	struct mmap *maps = evlist->mmap;
486 
487 	if (!record__aio_enabled(rec))
488 		return;
489 
490 	for (i = 0; i < evlist->core.nr_mmaps; i++) {
491 		struct mmap *map = &maps[i];
492 
493 		if (map->core.base)
494 			record__aio_sync(map, true);
495 	}
496 }
497 
/* Number of aio control blocks used when the aio option gives no (or a zero) value. */
static int nr_cblocks_default = 1;
/* NOTE(review): presumably the upper bound for the aio option - its use is not visible here. */
static int nr_cblocks_max = 4;
500 
501 static int record__aio_parse(const struct option *opt,
502 			     const char *str,
503 			     int unset)
504 {
505 	struct record_opts *opts = (struct record_opts *)opt->value;
506 
507 	if (unset) {
508 		opts->nr_cblocks = 0;
509 	} else {
510 		if (str)
511 			opts->nr_cblocks = strtol(str, NULL, 0);
512 		if (!opts->nr_cblocks)
513 			opts->nr_cblocks = nr_cblocks_default;
514 	}
515 
516 	return 0;
517 }
518 #else /* HAVE_AIO_SUPPORT */
/* Variants used when the build has no aio support (HAVE_AIO_SUPPORT undefined). */
static int nr_cblocks_max = 0;

/* Stub: always returns -1. */
static int record__aio_push(struct record *rec __maybe_unused, struct mmap *map __maybe_unused,
			    off_t *off __maybe_unused)
{
	return -1;
}

/* Stub: always returns -1. */
static off_t record__aio_get_pos(int trace_fd __maybe_unused)
{
	return -1;
}

/* Stub: does nothing. */
static void record__aio_set_pos(int trace_fd __maybe_unused, off_t pos __maybe_unused)
{
}

/* Stub: does nothing. */
static void record__aio_mmap_read_sync(struct record *rec __maybe_unused)
{
}
539 #endif
540 
541 static int record__aio_enabled(struct record *rec)
542 {
543 	return rec->opts.nr_cblocks > 0;
544 }
545 
546 #define MMAP_FLUSH_DEFAULT 1
547 static int record__mmap_flush_parse(const struct option *opt,
548 				    const char *str,
549 				    int unset)
550 {
551 	int flush_max;
552 	struct record_opts *opts = (struct record_opts *)opt->value;
553 	static struct parse_tag tags[] = {
554 			{ .tag  = 'B', .mult = 1       },
555 			{ .tag  = 'K', .mult = 1 << 10 },
556 			{ .tag  = 'M', .mult = 1 << 20 },
557 			{ .tag  = 'G', .mult = 1 << 30 },
558 			{ .tag  = 0 },
559 	};
560 
561 	if (unset)
562 		return 0;
563 
564 	if (str) {
565 		opts->mmap_flush = parse_tag_value(str, tags);
566 		if (opts->mmap_flush == (int)-1)
567 			opts->mmap_flush = strtol(str, NULL, 0);
568 	}
569 
570 	if (!opts->mmap_flush)
571 		opts->mmap_flush = MMAP_FLUSH_DEFAULT;
572 
573 	flush_max = evlist__mmap_size(opts->mmap_pages);
574 	flush_max /= 4;
575 	if (opts->mmap_flush > flush_max)
576 		opts->mmap_flush = flush_max;
577 
578 	return 0;
579 }
580 
581 #ifdef HAVE_ZSTD_SUPPORT
582 static unsigned int comp_level_default = 1;
583 
584 static int record__parse_comp_level(const struct option *opt, const char *str, int unset)
585 {
586 	struct record_opts *opts = opt->value;
587 
588 	if (unset) {
589 		opts->comp_level = 0;
590 	} else {
591 		if (str)
592 			opts->comp_level = strtol(str, NULL, 0);
593 		if (!opts->comp_level)
594 			opts->comp_level = comp_level_default;
595 	}
596 
597 	return 0;
598 }
599 #endif
/* NOTE(review): presumably the highest accepted compression level - its use is not visible here. */
static unsigned int comp_level_max = 22;
601 
602 static int record__comp_enabled(struct record *rec)
603 {
604 	return rec->opts.comp_level > 0;
605 }
606 
607 static int process_synthesized_event(struct perf_tool *tool,
608 				     union perf_event *event,
609 				     struct perf_sample *sample __maybe_unused,
610 				     struct machine *machine __maybe_unused)
611 {
612 	struct record *rec = container_of(tool, struct record, tool);
613 	return record__write(rec, NULL, event, event->header.size);
614 }
615 
616 static struct mutex synth_lock;
617 
618 static int process_locked_synthesized_event(struct perf_tool *tool,
619 				     union perf_event *event,
620 				     struct perf_sample *sample __maybe_unused,
621 				     struct machine *machine __maybe_unused)
622 {
623 	int ret;
624 
625 	mutex_lock(&synth_lock);
626 	ret = process_synthesized_event(tool, event, sample, machine);
627 	mutex_unlock(&synth_lock);
628 	return ret;
629 }
630 
631 static int record__pushfn(struct mmap *map, void *to, void *bf, size_t size)
632 {
633 	struct record *rec = to;
634 
635 	if (record__comp_enabled(rec)) {
636 		size = zstd_compress(rec->session, map, map->data, mmap__mmap_len(map), bf, size);
637 		bf   = map->data;
638 	}
639 
640 	thread->samples++;
641 	return record__write(rec, map, bf, size);
642 }
643 
/* Last signal other than SIGCHLD seen by sig_handler(); -1 when there was none. */
static volatile sig_atomic_t signr = -1;
/* Set to 1 by sig_handler() on SIGCHLD. */
static volatile sig_atomic_t child_finished;
#ifdef HAVE_EVENTFD_SUPPORT
/* Descriptor written by sig_handler() to wake up a poll(); unused while negative. */
static volatile sig_atomic_t done_fd = -1;
#endif
649 
650 static void sig_handler(int sig)
651 {
652 	if (sig == SIGCHLD)
653 		child_finished = 1;
654 	else
655 		signr = sig;
656 
657 	done = 1;
658 #ifdef HAVE_EVENTFD_SUPPORT
659 	if (done_fd >= 0) {
660 		u64 tmp = 1;
661 		int orig_errno = errno;
662 
663 		/*
664 		 * It is possible for this signal handler to run after done is
665 		 * checked in the main loop, but before the perf counter fds are
666 		 * polled. If this happens, the poll() will continue to wait
667 		 * even though done is set, and will only break out if either
668 		 * another signal is received, or the counters are ready for
669 		 * read. To ensure the poll() doesn't sleep when done is set,
670 		 * use an eventfd (done_fd) to wake up the poll().
671 		 */
672 		if (write(done_fd, &tmp, sizeof(tmp)) < 0)
673 			pr_err("failed to signal wakeup fd, error: %m\n");
674 
675 		errno = orig_errno;
676 	}
677 #endif // HAVE_EVENTFD_SUPPORT
678 }
679 
/* SIGSEGV handler: calls perf_hooks__recover() and then dumps the stack for @sig. */
static void sigsegv_handler(int sig)
{
	perf_hooks__recover();
	sighandler_dump_stack(sig);
}
685 
686 static void record__sig_exit(void)
687 {
688 	if (signr == -1)
689 		return;
690 
691 	signal(signr, SIG_DFL);
692 	raise(signr);
693 }
694 
695 #ifdef HAVE_AUXTRACE_SUPPORT
696 
697 static int record__process_auxtrace(struct perf_tool *tool,
698 				    struct mmap *map,
699 				    union perf_event *event, void *data1,
700 				    size_t len1, void *data2, size_t len2)
701 {
702 	struct record *rec = container_of(tool, struct record, tool);
703 	struct perf_data *data = &rec->data;
704 	size_t padding;
705 	u8 pad[8] = {0};
706 
707 	if (!perf_data__is_pipe(data) && perf_data__is_single_file(data)) {
708 		off_t file_offset;
709 		int fd = perf_data__fd(data);
710 		int err;
711 
712 		file_offset = lseek(fd, 0, SEEK_CUR);
713 		if (file_offset == -1)
714 			return -1;
715 		err = auxtrace_index__auxtrace_event(&rec->session->auxtrace_index,
716 						     event, file_offset);
717 		if (err)
718 			return err;
719 	}
720 
721 	/* event.auxtrace.size includes padding, see __auxtrace_mmap__read() */
722 	padding = (len1 + len2) & 7;
723 	if (padding)
724 		padding = 8 - padding;
725 
726 	record__write(rec, map, event, event->header.size);
727 	record__write(rec, map, data1, len1);
728 	if (len2)
729 		record__write(rec, map, data2, len2);
730 	record__write(rec, map, &pad, padding);
731 
732 	return 0;
733 }
734 
735 static int record__auxtrace_mmap_read(struct record *rec,
736 				      struct mmap *map)
737 {
738 	int ret;
739 
740 	ret = auxtrace_mmap__read(map, rec->itr, &rec->tool,
741 				  record__process_auxtrace);
742 	if (ret < 0)
743 		return ret;
744 
745 	if (ret)
746 		rec->samples++;
747 
748 	return 0;
749 }
750 
751 static int record__auxtrace_mmap_read_snapshot(struct record *rec,
752 					       struct mmap *map)
753 {
754 	int ret;
755 
756 	ret = auxtrace_mmap__read_snapshot(map, rec->itr, &rec->tool,
757 					   record__process_auxtrace,
758 					   rec->opts.auxtrace_snapshot_size);
759 	if (ret < 0)
760 		return ret;
761 
762 	if (ret)
763 		rec->samples++;
764 
765 	return 0;
766 }
767 
768 static int record__auxtrace_read_snapshot_all(struct record *rec)
769 {
770 	int i;
771 	int rc = 0;
772 
773 	for (i = 0; i < rec->evlist->core.nr_mmaps; i++) {
774 		struct mmap *map = &rec->evlist->mmap[i];
775 
776 		if (!map->auxtrace_mmap.base)
777 			continue;
778 
779 		if (record__auxtrace_mmap_read_snapshot(rec, map) != 0) {
780 			rc = -1;
781 			goto out;
782 		}
783 	}
784 out:
785 	return rc;
786 }
787 
788 static void record__read_auxtrace_snapshot(struct record *rec, bool on_exit)
789 {
790 	pr_debug("Recording AUX area tracing snapshot\n");
791 	if (record__auxtrace_read_snapshot_all(rec) < 0) {
792 		trigger_error(&auxtrace_snapshot_trigger);
793 	} else {
794 		if (auxtrace_record__snapshot_finish(rec->itr, on_exit))
795 			trigger_error(&auxtrace_snapshot_trigger);
796 		else
797 			trigger_ready(&auxtrace_snapshot_trigger);
798 	}
799 }
800 
801 static int record__auxtrace_snapshot_exit(struct record *rec)
802 {
803 	if (trigger_is_error(&auxtrace_snapshot_trigger))
804 		return 0;
805 
806 	if (!auxtrace_record__snapshot_started &&
807 	    auxtrace_record__snapshot_start(rec->itr))
808 		return -1;
809 
810 	record__read_auxtrace_snapshot(rec, true);
811 	if (trigger_is_error(&auxtrace_snapshot_trigger))
812 		return -1;
813 
814 	return 0;
815 }
816 
817 static int record__auxtrace_init(struct record *rec)
818 {
819 	int err;
820 
821 	if ((rec->opts.auxtrace_snapshot_opts || rec->opts.auxtrace_sample_opts)
822 	    && record__threads_enabled(rec)) {
823 		pr_err("AUX area tracing options are not available in parallel streaming mode.\n");
824 		return -EINVAL;
825 	}
826 
827 	if (!rec->itr) {
828 		rec->itr = auxtrace_record__init(rec->evlist, &err);
829 		if (err)
830 			return err;
831 	}
832 
833 	err = auxtrace_parse_snapshot_options(rec->itr, &rec->opts,
834 					      rec->opts.auxtrace_snapshot_opts);
835 	if (err)
836 		return err;
837 
838 	err = auxtrace_parse_sample_options(rec->itr, rec->evlist, &rec->opts,
839 					    rec->opts.auxtrace_sample_opts);
840 	if (err)
841 		return err;
842 
843 	auxtrace_regroup_aux_output(rec->evlist);
844 
845 	return auxtrace_parse_filters(rec->evlist);
846 }
847 
848 #else
849 
/* Variants used when the build has no auxtrace support (HAVE_AUXTRACE_SUPPORT undefined). */

/* Stub: always returns 0. */
static inline
int record__auxtrace_mmap_read(struct record *rec __maybe_unused,
			       struct mmap *map __maybe_unused)
{
	return 0;
}

/* Stub: does nothing. */
static inline
void record__read_auxtrace_snapshot(struct record *rec __maybe_unused,
				    bool on_exit __maybe_unused)
{
}

/* Stub: always returns 0. */
static inline
int auxtrace_record__snapshot_start(struct auxtrace_record *itr __maybe_unused)
{
	return 0;
}

/* Stub: always returns 0. */
static inline
int record__auxtrace_snapshot_exit(struct record *rec __maybe_unused)
{
	return 0;
}

/* Stub: always returns 0. */
static int record__auxtrace_init(struct record *rec __maybe_unused)
{
	return 0;
}
879 
880 #endif
881 
882 static int record__config_text_poke(struct evlist *evlist)
883 {
884 	struct evsel *evsel;
885 
886 	/* Nothing to do if text poke is already configured */
887 	evlist__for_each_entry(evlist, evsel) {
888 		if (evsel->core.attr.text_poke)
889 			return 0;
890 	}
891 
892 	evsel = evlist__add_dummy_on_all_cpus(evlist);
893 	if (!evsel)
894 		return -ENOMEM;
895 
896 	evsel->core.attr.text_poke = 1;
897 	evsel->core.attr.ksymbol = 1;
898 	evsel->immediate = true;
899 	evsel__set_sample_bit(evsel, TIME);
900 
901 	return 0;
902 }
903 
904 static int record__config_off_cpu(struct record *rec)
905 {
906 	return off_cpu_prepare(rec->evlist, &rec->opts.target, &rec->opts);
907 }
908 
909 static bool record__kcore_readable(struct machine *machine)
910 {
911 	char kcore[PATH_MAX];
912 	int fd;
913 
914 	scnprintf(kcore, sizeof(kcore), "%s/proc/kcore", machine->root_dir);
915 
916 	fd = open(kcore, O_RDONLY);
917 	if (fd < 0)
918 		return false;
919 
920 	close(fd);
921 
922 	return true;
923 }
924 
925 static int record__kcore_copy(struct machine *machine, struct perf_data *data)
926 {
927 	char from_dir[PATH_MAX];
928 	char kcore_dir[PATH_MAX];
929 	int ret;
930 
931 	snprintf(from_dir, sizeof(from_dir), "%s/proc", machine->root_dir);
932 
933 	ret = perf_data__make_kcore_dir(data, kcore_dir, sizeof(kcore_dir));
934 	if (ret)
935 		return ret;
936 
937 	return kcore_copy(from_dir, kcore_dir);
938 }
939 
940 static void record__thread_data_init_pipes(struct record_thread *thread_data)
941 {
942 	thread_data->pipes.msg[0] = -1;
943 	thread_data->pipes.msg[1] = -1;
944 	thread_data->pipes.ack[0] = -1;
945 	thread_data->pipes.ack[1] = -1;
946 }
947 
948 static int record__thread_data_open_pipes(struct record_thread *thread_data)
949 {
950 	if (pipe(thread_data->pipes.msg))
951 		return -EINVAL;
952 
953 	if (pipe(thread_data->pipes.ack)) {
954 		close(thread_data->pipes.msg[0]);
955 		thread_data->pipes.msg[0] = -1;
956 		close(thread_data->pipes.msg[1]);
957 		thread_data->pipes.msg[1] = -1;
958 		return -EINVAL;
959 	}
960 
961 	pr_debug2("thread_data[%p]: msg=[%d,%d], ack=[%d,%d]\n", thread_data,
962 		 thread_data->pipes.msg[0], thread_data->pipes.msg[1],
963 		 thread_data->pipes.ack[0], thread_data->pipes.ack[1]);
964 
965 	return 0;
966 }
967 
968 static void record__thread_data_close_pipes(struct record_thread *thread_data)
969 {
970 	if (thread_data->pipes.msg[0] != -1) {
971 		close(thread_data->pipes.msg[0]);
972 		thread_data->pipes.msg[0] = -1;
973 	}
974 	if (thread_data->pipes.msg[1] != -1) {
975 		close(thread_data->pipes.msg[1]);
976 		thread_data->pipes.msg[1] = -1;
977 	}
978 	if (thread_data->pipes.ack[0] != -1) {
979 		close(thread_data->pipes.ack[0]);
980 		thread_data->pipes.ack[0] = -1;
981 	}
982 	if (thread_data->pipes.ack[1] != -1) {
983 		close(thread_data->pipes.ack[1]);
984 		thread_data->pipes.ack[1] = -1;
985 	}
986 }
987 
988 static bool evlist__per_thread(struct evlist *evlist)
989 {
990 	return cpu_map__is_dummy(evlist->core.user_requested_cpus);
991 }
992 
993 static int record__thread_data_init_maps(struct record_thread *thread_data, struct evlist *evlist)
994 {
995 	int m, tm, nr_mmaps = evlist->core.nr_mmaps;
996 	struct mmap *mmap = evlist->mmap;
997 	struct mmap *overwrite_mmap = evlist->overwrite_mmap;
998 	struct perf_cpu_map *cpus = evlist->core.all_cpus;
999 	bool per_thread = evlist__per_thread(evlist);
1000 
1001 	if (per_thread)
1002 		thread_data->nr_mmaps = nr_mmaps;
1003 	else
1004 		thread_data->nr_mmaps = bitmap_weight(thread_data->mask->maps.bits,
1005 						      thread_data->mask->maps.nbits);
1006 	if (mmap) {
1007 		thread_data->maps = zalloc(thread_data->nr_mmaps * sizeof(struct mmap *));
1008 		if (!thread_data->maps)
1009 			return -ENOMEM;
1010 	}
1011 	if (overwrite_mmap) {
1012 		thread_data->overwrite_maps = zalloc(thread_data->nr_mmaps * sizeof(struct mmap *));
1013 		if (!thread_data->overwrite_maps) {
1014 			zfree(&thread_data->maps);
1015 			return -ENOMEM;
1016 		}
1017 	}
1018 	pr_debug2("thread_data[%p]: nr_mmaps=%d, maps=%p, ow_maps=%p\n", thread_data,
1019 		 thread_data->nr_mmaps, thread_data->maps, thread_data->overwrite_maps);
1020 
1021 	for (m = 0, tm = 0; m < nr_mmaps && tm < thread_data->nr_mmaps; m++) {
1022 		if (per_thread ||
1023 		    test_bit(perf_cpu_map__cpu(cpus, m).cpu, thread_data->mask->maps.bits)) {
1024 			if (thread_data->maps) {
1025 				thread_data->maps[tm] = &mmap[m];
1026 				pr_debug2("thread_data[%p]: cpu%d: maps[%d] -> mmap[%d]\n",
1027 					  thread_data, perf_cpu_map__cpu(cpus, m).cpu, tm, m);
1028 			}
1029 			if (thread_data->overwrite_maps) {
1030 				thread_data->overwrite_maps[tm] = &overwrite_mmap[m];
1031 				pr_debug2("thread_data[%p]: cpu%d: ow_maps[%d] -> ow_mmap[%d]\n",
1032 					  thread_data, perf_cpu_map__cpu(cpus, m).cpu, tm, m);
1033 			}
1034 			tm++;
1035 		}
1036 	}
1037 
1038 	return 0;
1039 }
1040 
1041 static int record__thread_data_init_pollfd(struct record_thread *thread_data, struct evlist *evlist)
1042 {
1043 	int f, tm, pos;
1044 	struct mmap *map, *overwrite_map;
1045 
1046 	fdarray__init(&thread_data->pollfd, 64);
1047 
1048 	for (tm = 0; tm < thread_data->nr_mmaps; tm++) {
1049 		map = thread_data->maps ? thread_data->maps[tm] : NULL;
1050 		overwrite_map = thread_data->overwrite_maps ?
1051 				thread_data->overwrite_maps[tm] : NULL;
1052 
1053 		for (f = 0; f < evlist->core.pollfd.nr; f++) {
1054 			void *ptr = evlist->core.pollfd.priv[f].ptr;
1055 
1056 			if ((map && ptr == map) || (overwrite_map && ptr == overwrite_map)) {
1057 				pos = fdarray__dup_entry_from(&thread_data->pollfd, f,
1058 							      &evlist->core.pollfd);
1059 				if (pos < 0)
1060 					return pos;
1061 				pr_debug2("thread_data[%p]: pollfd[%d] <- event_fd=%d\n",
1062 					 thread_data, pos, evlist->core.pollfd.entries[f].fd);
1063 			}
1064 		}
1065 	}
1066 
1067 	return 0;
1068 }
1069 
1070 static void record__free_thread_data(struct record *rec)
1071 {
1072 	int t;
1073 	struct record_thread *thread_data = rec->thread_data;
1074 
1075 	if (thread_data == NULL)
1076 		return;
1077 
1078 	for (t = 0; t < rec->nr_threads; t++) {
1079 		record__thread_data_close_pipes(&thread_data[t]);
1080 		zfree(&thread_data[t].maps);
1081 		zfree(&thread_data[t].overwrite_maps);
1082 		fdarray__exit(&thread_data[t].pollfd);
1083 	}
1084 
1085 	zfree(&rec->thread_data);
1086 }
1087 
1088 static int record__map_thread_evlist_pollfd_indexes(struct record *rec,
1089 						    int evlist_pollfd_index,
1090 						    int thread_pollfd_index)
1091 {
1092 	size_t x = rec->index_map_cnt;
1093 
1094 	if (realloc_array_as_needed(rec->index_map, rec->index_map_sz, x, NULL))
1095 		return -ENOMEM;
1096 	rec->index_map[x].evlist_pollfd_index = evlist_pollfd_index;
1097 	rec->index_map[x].thread_pollfd_index = thread_pollfd_index;
1098 	rec->index_map_cnt += 1;
1099 	return 0;
1100 }
1101 
1102 static int record__update_evlist_pollfd_from_thread(struct record *rec,
1103 						    struct evlist *evlist,
1104 						    struct record_thread *thread_data)
1105 {
1106 	struct pollfd *e_entries = evlist->core.pollfd.entries;
1107 	struct pollfd *t_entries = thread_data->pollfd.entries;
1108 	int err = 0;
1109 	size_t i;
1110 
1111 	for (i = 0; i < rec->index_map_cnt; i++) {
1112 		int e_pos = rec->index_map[i].evlist_pollfd_index;
1113 		int t_pos = rec->index_map[i].thread_pollfd_index;
1114 
1115 		if (e_entries[e_pos].fd != t_entries[t_pos].fd ||
1116 		    e_entries[e_pos].events != t_entries[t_pos].events) {
1117 			pr_err("Thread and evlist pollfd index mismatch\n");
1118 			err = -EINVAL;
1119 			continue;
1120 		}
1121 		e_entries[e_pos].revents = t_entries[t_pos].revents;
1122 	}
1123 	return err;
1124 }
1125 
1126 static int record__dup_non_perf_events(struct record *rec,
1127 				       struct evlist *evlist,
1128 				       struct record_thread *thread_data)
1129 {
1130 	struct fdarray *fda = &evlist->core.pollfd;
1131 	int i, ret;
1132 
1133 	for (i = 0; i < fda->nr; i++) {
1134 		if (!(fda->priv[i].flags & fdarray_flag__non_perf_event))
1135 			continue;
1136 		ret = fdarray__dup_entry_from(&thread_data->pollfd, i, fda);
1137 		if (ret < 0) {
1138 			pr_err("Failed to duplicate descriptor in main thread pollfd\n");
1139 			return ret;
1140 		}
1141 		pr_debug2("thread_data[%p]: pollfd[%d] <- non_perf_event fd=%d\n",
1142 			  thread_data, ret, fda->entries[i].fd);
1143 		ret = record__map_thread_evlist_pollfd_indexes(rec, i, ret);
1144 		if (ret < 0) {
1145 			pr_err("Failed to map thread and evlist pollfd indexes\n");
1146 			return ret;
1147 		}
1148 	}
1149 	return 0;
1150 }
1151 
1152 static int record__alloc_thread_data(struct record *rec, struct evlist *evlist)
1153 {
1154 	int t, ret;
1155 	struct record_thread *thread_data;
1156 
1157 	rec->thread_data = zalloc(rec->nr_threads * sizeof(*(rec->thread_data)));
1158 	if (!rec->thread_data) {
1159 		pr_err("Failed to allocate thread data\n");
1160 		return -ENOMEM;
1161 	}
1162 	thread_data = rec->thread_data;
1163 
1164 	for (t = 0; t < rec->nr_threads; t++)
1165 		record__thread_data_init_pipes(&thread_data[t]);
1166 
1167 	for (t = 0; t < rec->nr_threads; t++) {
1168 		thread_data[t].rec = rec;
1169 		thread_data[t].mask = &rec->thread_masks[t];
1170 		ret = record__thread_data_init_maps(&thread_data[t], evlist);
1171 		if (ret) {
1172 			pr_err("Failed to initialize thread[%d] maps\n", t);
1173 			goto out_free;
1174 		}
1175 		ret = record__thread_data_init_pollfd(&thread_data[t], evlist);
1176 		if (ret) {
1177 			pr_err("Failed to initialize thread[%d] pollfd\n", t);
1178 			goto out_free;
1179 		}
1180 		if (t) {
1181 			thread_data[t].tid = -1;
1182 			ret = record__thread_data_open_pipes(&thread_data[t]);
1183 			if (ret) {
1184 				pr_err("Failed to open thread[%d] communication pipes\n", t);
1185 				goto out_free;
1186 			}
1187 			ret = fdarray__add(&thread_data[t].pollfd, thread_data[t].pipes.msg[0],
1188 					   POLLIN | POLLERR | POLLHUP, fdarray_flag__nonfilterable);
1189 			if (ret < 0) {
1190 				pr_err("Failed to add descriptor to thread[%d] pollfd\n", t);
1191 				goto out_free;
1192 			}
1193 			thread_data[t].ctlfd_pos = ret;
1194 			pr_debug2("thread_data[%p]: pollfd[%d] <- ctl_fd=%d\n",
1195 				 thread_data, thread_data[t].ctlfd_pos,
1196 				 thread_data[t].pipes.msg[0]);
1197 		} else {
1198 			thread_data[t].tid = gettid();
1199 
1200 			ret = record__dup_non_perf_events(rec, evlist, &thread_data[t]);
1201 			if (ret < 0)
1202 				goto out_free;
1203 
1204 			thread_data[t].ctlfd_pos = -1; /* Not used */
1205 		}
1206 	}
1207 
1208 	return 0;
1209 
1210 out_free:
1211 	record__free_thread_data(rec);
1212 
1213 	return ret;
1214 }
1215 
1216 static int record__mmap_evlist(struct record *rec,
1217 			       struct evlist *evlist)
1218 {
1219 	int i, ret;
1220 	struct record_opts *opts = &rec->opts;
1221 	bool auxtrace_overwrite = opts->auxtrace_snapshot_mode ||
1222 				  opts->auxtrace_sample_mode;
1223 	char msg[512];
1224 
1225 	if (opts->affinity != PERF_AFFINITY_SYS)
1226 		cpu__setup_cpunode_map();
1227 
1228 	if (evlist__mmap_ex(evlist, opts->mmap_pages,
1229 				 opts->auxtrace_mmap_pages,
1230 				 auxtrace_overwrite,
1231 				 opts->nr_cblocks, opts->affinity,
1232 				 opts->mmap_flush, opts->comp_level) < 0) {
1233 		if (errno == EPERM) {
1234 			pr_err("Permission error mapping pages.\n"
1235 			       "Consider increasing "
1236 			       "/proc/sys/kernel/perf_event_mlock_kb,\n"
1237 			       "or try again with a smaller value of -m/--mmap_pages.\n"
1238 			       "(current value: %u,%u)\n",
1239 			       opts->mmap_pages, opts->auxtrace_mmap_pages);
1240 			return -errno;
1241 		} else {
1242 			pr_err("failed to mmap with %d (%s)\n", errno,
1243 				str_error_r(errno, msg, sizeof(msg)));
1244 			if (errno)
1245 				return -errno;
1246 			else
1247 				return -EINVAL;
1248 		}
1249 	}
1250 
1251 	if (evlist__initialize_ctlfd(evlist, opts->ctl_fd, opts->ctl_fd_ack))
1252 		return -1;
1253 
1254 	ret = record__alloc_thread_data(rec, evlist);
1255 	if (ret)
1256 		return ret;
1257 
1258 	if (record__threads_enabled(rec)) {
1259 		ret = perf_data__create_dir(&rec->data, evlist->core.nr_mmaps);
1260 		if (ret) {
1261 			pr_err("Failed to create data directory: %s\n", strerror(-ret));
1262 			return ret;
1263 		}
1264 		for (i = 0; i < evlist->core.nr_mmaps; i++) {
1265 			if (evlist->mmap)
1266 				evlist->mmap[i].file = &rec->data.dir.files[i];
1267 			if (evlist->overwrite_mmap)
1268 				evlist->overwrite_mmap[i].file = &rec->data.dir.files[i];
1269 		}
1270 	}
1271 
1272 	return 0;
1273 }
1274 
1275 static int record__mmap(struct record *rec)
1276 {
1277 	return record__mmap_evlist(rec, rec->evlist);
1278 }
1279 
/*
 * Configure and open every event of rec->evlist, apply the event filters,
 * mmap the buffers and attach the evlist to the session.
 *
 * Returns 0 on success; -ENOMEM if the dummy tracking event cannot be
 * added, -errno if an event cannot be opened, -1 if a filter cannot be
 * applied, or the error returned by record__mmap().
 */
static int record__open(struct record *rec)
{
	char msg[BUFSIZ];
	struct evsel *pos;
	struct evlist *evlist = rec->evlist;
	struct perf_session *session = rec->session;
	struct record_opts *opts = &rec->opts;
	int rc = 0;

	/*
	 * For initial_delay, system wide or a hybrid system, we need to add a
	 * dummy event so that we can track PERF_RECORD_MMAP to cover the delay
	 * of waiting or event synthesis.
	 */
	if (opts->target.initial_delay || target__has_cpu(&opts->target) ||
	    perf_pmus__num_core_pmus() > 1) {
		pos = evlist__get_tracking_event(evlist);
		if (!evsel__is_dummy_event(pos)) {
			/* Set up dummy event. */
			if (evlist__add_dummy(evlist))
				return -ENOMEM;
			pos = evlist__last(evlist);
			evlist__set_tracking_event(evlist, pos);
		}

		/*
		 * Enable the dummy event when the process is forked for
		 * initial_delay, immediately for system wide.
		 */
		if (opts->target.initial_delay && !pos->immediate &&
		    !target__has_cpu(&opts->target))
			pos->core.attr.enable_on_exec = 1;
		else
			pos->immediate = 1;
	}

	evlist__config(evlist, opts, &callchain_param);

	evlist__for_each_entry(evlist, pos) {
try_again:
		if (evsel__open(pos, pos->core.cpus, pos->core.threads) < 0) {
			/* Retry the same event if a fallback could be applied to it. */
			if (evsel__fallback(pos, errno, msg, sizeof(msg))) {
				if (verbose > 0)
					ui__warning("%s\n", msg);
				goto try_again;
			}
			/*
			 * A non-leader member of a weak group failed with
			 * EINVAL/EBADF: reset the group and retry from the
			 * event evlist__reset_weak_group() hands back.
			 */
			if ((errno == EINVAL || errno == EBADF) &&
			    pos->core.leader != &pos->core &&
			    pos->weak_group) {
			        pos = evlist__reset_weak_group(evlist, pos, true);
				goto try_again;
			}
			/* Capture errno in rc before the reporting helpers run. */
			rc = -errno;
			evsel__open_strerror(pos, &opts->target, errno, msg, sizeof(msg));
			ui__error("%s\n", msg);
			goto out;
		}

		pos->supported = true;
	}

	if (symbol_conf.kptr_restrict && !evlist__exclude_kernel(evlist)) {
		pr_warning(
"WARNING: Kernel address maps (/proc/{kallsyms,modules}) are restricted,\n"
"check /proc/sys/kernel/kptr_restrict and /proc/sys/kernel/perf_event_paranoid.\n\n"
"Samples in kernel functions may not be resolved if a suitable vmlinux\n"
"file is not found in the buildid cache or in the vmlinux path.\n\n"
"Samples in kernel modules won't be resolved at all.\n\n"
"If some relocation was applied (e.g. kexec) symbols may be misresolved\n"
"even with a suitable vmlinux or kallsyms file.\n\n");
	}

	/* pos is passed by address and then used to name the failing event. */
	if (evlist__apply_filters(evlist, &pos)) {
		pr_err("failed to set filter \"%s\" on event %s with %d (%s)\n",
			pos->filter ?: "BPF", evsel__name(pos), errno,
			str_error_r(errno, msg, sizeof(msg)));
		rc = -1;
		goto out;
	}

	rc = record__mmap(rec);
	if (rc)
		goto out;

	session->evlist = evlist;
	perf_session__set_id_hdr_size(session);
out:
	return rc;
}
1369 
1370 static void set_timestamp_boundary(struct record *rec, u64 sample_time)
1371 {
1372 	if (rec->evlist->first_sample_time == 0)
1373 		rec->evlist->first_sample_time = sample_time;
1374 
1375 	if (sample_time)
1376 		rec->evlist->last_sample_time = sample_time;
1377 }
1378 
1379 static int process_sample_event(struct perf_tool *tool,
1380 				union perf_event *event,
1381 				struct perf_sample *sample,
1382 				struct evsel *evsel,
1383 				struct machine *machine)
1384 {
1385 	struct record *rec = container_of(tool, struct record, tool);
1386 
1387 	set_timestamp_boundary(rec, sample->time);
1388 
1389 	if (rec->buildid_all)
1390 		return 0;
1391 
1392 	rec->samples++;
1393 	return build_id__mark_dso_hit(tool, event, sample, evsel, machine);
1394 }
1395 
1396 static int process_buildids(struct record *rec)
1397 {
1398 	struct perf_session *session = rec->session;
1399 
1400 	if (perf_data__size(&rec->data) == 0)
1401 		return 0;
1402 
1403 	/*
1404 	 * During this process, it'll load kernel map and replace the
1405 	 * dso->long_name to a real pathname it found.  In this case
1406 	 * we prefer the vmlinux path like
1407 	 *   /lib/modules/3.16.4/build/vmlinux
1408 	 *
1409 	 * rather than build-id path (in debug directory).
1410 	 *   $HOME/.debug/.build-id/f0/6e17aa50adf4d00b88925e03775de107611551
1411 	 */
1412 	symbol_conf.ignore_vmlinux_buildid = true;
1413 
1414 	/*
1415 	 * If --buildid-all is given, it marks all DSO regardless of hits,
1416 	 * so no need to process samples. But if timestamp_boundary is enabled,
1417 	 * it still needs to walk on all samples to get the timestamps of
1418 	 * first/last samples.
1419 	 */
1420 	if (rec->buildid_all && !rec->timestamp_boundary)
1421 		rec->tool.sample = NULL;
1422 
1423 	return perf_session__process_events(session);
1424 }
1425 
1426 static void perf_event__synthesize_guest_os(struct machine *machine, void *data)
1427 {
1428 	int err;
1429 	struct perf_tool *tool = data;
1430 	/*
1431 	 *As for guest kernel when processing subcommand record&report,
1432 	 *we arrange module mmap prior to guest kernel mmap and trigger
1433 	 *a preload dso because default guest module symbols are loaded
1434 	 *from guest kallsyms instead of /lib/modules/XXX/XXX. This
1435 	 *method is used to avoid symbol missing when the first addr is
1436 	 *in module instead of in guest kernel.
1437 	 */
1438 	err = perf_event__synthesize_modules(tool, process_synthesized_event,
1439 					     machine);
1440 	if (err < 0)
1441 		pr_err("Couldn't record guest kernel [%d]'s reference"
1442 		       " relocation symbol.\n", machine->pid);
1443 
1444 	/*
1445 	 * We use _stext for guest kernel because guest kernel's /proc/kallsyms
1446 	 * have no _text sometimes.
1447 	 */
1448 	err = perf_event__synthesize_kernel_mmap(tool, process_synthesized_event,
1449 						 machine);
1450 	if (err < 0)
1451 		pr_err("Couldn't record guest kernel [%d]'s reference"
1452 		       " relocation symbol.\n", machine->pid);
1453 }
1454 
1455 static struct perf_event_header finished_round_event = {
1456 	.size = sizeof(struct perf_event_header),
1457 	.type = PERF_RECORD_FINISHED_ROUND,
1458 };
1459 
1460 static struct perf_event_header finished_init_event = {
1461 	.size = sizeof(struct perf_event_header),
1462 	.type = PERF_RECORD_FINISHED_INIT,
1463 };
1464 
1465 static void record__adjust_affinity(struct record *rec, struct mmap *map)
1466 {
1467 	if (rec->opts.affinity != PERF_AFFINITY_SYS &&
1468 	    !bitmap_equal(thread->mask->affinity.bits, map->affinity_mask.bits,
1469 			  thread->mask->affinity.nbits)) {
1470 		bitmap_zero(thread->mask->affinity.bits, thread->mask->affinity.nbits);
1471 		bitmap_or(thread->mask->affinity.bits, thread->mask->affinity.bits,
1472 			  map->affinity_mask.bits, thread->mask->affinity.nbits);
1473 		sched_setaffinity(0, MMAP_CPU_MASK_BYTES(&thread->mask->affinity),
1474 					(cpu_set_t *)thread->mask->affinity.bits);
1475 		if (verbose == 2) {
1476 			pr_debug("threads[%d]: running on cpu%d: ", thread->tid, sched_getcpu());
1477 			mmap_cpu_mask__scnprintf(&thread->mask->affinity, "affinity");
1478 		}
1479 	}
1480 }
1481 
1482 static size_t process_comp_header(void *record, size_t increment)
1483 {
1484 	struct perf_record_compressed *event = record;
1485 	size_t size = sizeof(*event);
1486 
1487 	if (increment) {
1488 		event->header.size += increment;
1489 		return increment;
1490 	}
1491 
1492 	event->header.type = PERF_RECORD_COMPRESSED;
1493 	event->header.size = size;
1494 
1495 	return size;
1496 }
1497 
1498 static size_t zstd_compress(struct perf_session *session, struct mmap *map,
1499 			    void *dst, size_t dst_size, void *src, size_t src_size)
1500 {
1501 	size_t compressed;
1502 	size_t max_record_size = PERF_SAMPLE_MAX_SIZE - sizeof(struct perf_record_compressed) - 1;
1503 	struct zstd_data *zstd_data = &session->zstd_data;
1504 
1505 	if (map && map->file)
1506 		zstd_data = &map->zstd_data;
1507 
1508 	compressed = zstd_compress_stream_to_records(zstd_data, dst, dst_size, src, src_size,
1509 						     max_record_size, process_comp_header);
1510 
1511 	if (map && map->file) {
1512 		thread->bytes_transferred += src_size;
1513 		thread->bytes_compressed  += compressed;
1514 	} else {
1515 		session->bytes_transferred += src_size;
1516 		session->bytes_compressed  += compressed;
1517 	}
1518 
1519 	return compressed;
1520 }
1521 
1522 static int record__mmap_read_evlist(struct record *rec, struct evlist *evlist,
1523 				    bool overwrite, bool synch)
1524 {
1525 	u64 bytes_written = rec->bytes_written;
1526 	int i;
1527 	int rc = 0;
1528 	int nr_mmaps;
1529 	struct mmap **maps;
1530 	int trace_fd = rec->data.file.fd;
1531 	off_t off = 0;
1532 
1533 	if (!evlist)
1534 		return 0;
1535 
1536 	nr_mmaps = thread->nr_mmaps;
1537 	maps = overwrite ? thread->overwrite_maps : thread->maps;
1538 
1539 	if (!maps)
1540 		return 0;
1541 
1542 	if (overwrite && evlist->bkw_mmap_state != BKW_MMAP_DATA_PENDING)
1543 		return 0;
1544 
1545 	if (record__aio_enabled(rec))
1546 		off = record__aio_get_pos(trace_fd);
1547 
1548 	for (i = 0; i < nr_mmaps; i++) {
1549 		u64 flush = 0;
1550 		struct mmap *map = maps[i];
1551 
1552 		if (map->core.base) {
1553 			record__adjust_affinity(rec, map);
1554 			if (synch) {
1555 				flush = map->core.flush;
1556 				map->core.flush = 1;
1557 			}
1558 			if (!record__aio_enabled(rec)) {
1559 				if (perf_mmap__push(map, rec, record__pushfn) < 0) {
1560 					if (synch)
1561 						map->core.flush = flush;
1562 					rc = -1;
1563 					goto out;
1564 				}
1565 			} else {
1566 				if (record__aio_push(rec, map, &off) < 0) {
1567 					record__aio_set_pos(trace_fd, off);
1568 					if (synch)
1569 						map->core.flush = flush;
1570 					rc = -1;
1571 					goto out;
1572 				}
1573 			}
1574 			if (synch)
1575 				map->core.flush = flush;
1576 		}
1577 
1578 		if (map->auxtrace_mmap.base && !rec->opts.auxtrace_snapshot_mode &&
1579 		    !rec->opts.auxtrace_sample_mode &&
1580 		    record__auxtrace_mmap_read(rec, map) != 0) {
1581 			rc = -1;
1582 			goto out;
1583 		}
1584 	}
1585 
1586 	if (record__aio_enabled(rec))
1587 		record__aio_set_pos(trace_fd, off);
1588 
1589 	/*
1590 	 * Mark the round finished in case we wrote
1591 	 * at least one event.
1592 	 *
1593 	 * No need for round events in directory mode,
1594 	 * because per-cpu maps and files have data
1595 	 * sorted by kernel.
1596 	 */
1597 	if (!record__threads_enabled(rec) && bytes_written != rec->bytes_written)
1598 		rc = record__write(rec, NULL, &finished_round_event, sizeof(finished_round_event));
1599 
1600 	if (overwrite)
1601 		evlist__toggle_bkw_mmap(evlist, BKW_MMAP_EMPTY);
1602 out:
1603 	return rc;
1604 }
1605 
1606 static int record__mmap_read_all(struct record *rec, bool synch)
1607 {
1608 	int err;
1609 
1610 	err = record__mmap_read_evlist(rec, rec->evlist, false, synch);
1611 	if (err)
1612 		return err;
1613 
1614 	return record__mmap_read_evlist(rec, rec->evlist, true, synch);
1615 }
1616 
1617 static void record__thread_munmap_filtered(struct fdarray *fda, int fd,
1618 					   void *arg __maybe_unused)
1619 {
1620 	struct perf_mmap *map = fda->priv[fd].ptr;
1621 
1622 	if (map)
1623 		perf_mmap__put(map);
1624 }
1625 
1626 static void *record__thread(void *arg)
1627 {
1628 	enum thread_msg msg = THREAD_MSG__READY;
1629 	bool terminate = false;
1630 	struct fdarray *pollfd;
1631 	int err, ctlfd_pos;
1632 
1633 	thread = arg;
1634 	thread->tid = gettid();
1635 
1636 	err = write(thread->pipes.ack[1], &msg, sizeof(msg));
1637 	if (err == -1)
1638 		pr_warning("threads[%d]: failed to notify on start: %s\n",
1639 			   thread->tid, strerror(errno));
1640 
1641 	pr_debug("threads[%d]: started on cpu%d\n", thread->tid, sched_getcpu());
1642 
1643 	pollfd = &thread->pollfd;
1644 	ctlfd_pos = thread->ctlfd_pos;
1645 
1646 	for (;;) {
1647 		unsigned long long hits = thread->samples;
1648 
1649 		if (record__mmap_read_all(thread->rec, false) < 0 || terminate)
1650 			break;
1651 
1652 		if (hits == thread->samples) {
1653 
1654 			err = fdarray__poll(pollfd, -1);
1655 			/*
1656 			 * Propagate error, only if there's any. Ignore positive
1657 			 * number of returned events and interrupt error.
1658 			 */
1659 			if (err > 0 || (err < 0 && errno == EINTR))
1660 				err = 0;
1661 			thread->waking++;
1662 
1663 			if (fdarray__filter(pollfd, POLLERR | POLLHUP,
1664 					    record__thread_munmap_filtered, NULL) == 0)
1665 				break;
1666 		}
1667 
1668 		if (pollfd->entries[ctlfd_pos].revents & POLLHUP) {
1669 			terminate = true;
1670 			close(thread->pipes.msg[0]);
1671 			thread->pipes.msg[0] = -1;
1672 			pollfd->entries[ctlfd_pos].fd = -1;
1673 			pollfd->entries[ctlfd_pos].events = 0;
1674 		}
1675 
1676 		pollfd->entries[ctlfd_pos].revents = 0;
1677 	}
1678 	record__mmap_read_all(thread->rec, true);
1679 
1680 	err = write(thread->pipes.ack[1], &msg, sizeof(msg));
1681 	if (err == -1)
1682 		pr_warning("threads[%d]: failed to notify on termination: %s\n",
1683 			   thread->tid, strerror(errno));
1684 
1685 	return NULL;
1686 }
1687 
1688 static void record__init_features(struct record *rec)
1689 {
1690 	struct perf_session *session = rec->session;
1691 	int feat;
1692 
1693 	for (feat = HEADER_FIRST_FEATURE; feat < HEADER_LAST_FEATURE; feat++)
1694 		perf_header__set_feat(&session->header, feat);
1695 
1696 	if (rec->no_buildid)
1697 		perf_header__clear_feat(&session->header, HEADER_BUILD_ID);
1698 
1699 #ifdef HAVE_LIBTRACEEVENT
1700 	if (!have_tracepoints(&rec->evlist->core.entries))
1701 		perf_header__clear_feat(&session->header, HEADER_TRACING_DATA);
1702 #endif
1703 
1704 	if (!rec->opts.branch_stack)
1705 		perf_header__clear_feat(&session->header, HEADER_BRANCH_STACK);
1706 
1707 	if (!rec->opts.full_auxtrace)
1708 		perf_header__clear_feat(&session->header, HEADER_AUXTRACE);
1709 
1710 	if (!(rec->opts.use_clockid && rec->opts.clockid_res_ns))
1711 		perf_header__clear_feat(&session->header, HEADER_CLOCKID);
1712 
1713 	if (!rec->opts.use_clockid)
1714 		perf_header__clear_feat(&session->header, HEADER_CLOCK_DATA);
1715 
1716 	if (!record__threads_enabled(rec))
1717 		perf_header__clear_feat(&session->header, HEADER_DIR_FORMAT);
1718 
1719 	if (!record__comp_enabled(rec))
1720 		perf_header__clear_feat(&session->header, HEADER_COMPRESSED);
1721 
1722 	perf_header__clear_feat(&session->header, HEADER_STAT);
1723 }
1724 
1725 static void
1726 record__finish_output(struct record *rec)
1727 {
1728 	int i;
1729 	struct perf_data *data = &rec->data;
1730 	int fd = perf_data__fd(data);
1731 
1732 	if (data->is_pipe)
1733 		return;
1734 
1735 	rec->session->header.data_size += rec->bytes_written;
1736 	data->file.size = lseek(perf_data__fd(data), 0, SEEK_CUR);
1737 	if (record__threads_enabled(rec)) {
1738 		for (i = 0; i < data->dir.nr; i++)
1739 			data->dir.files[i].size = lseek(data->dir.files[i].fd, 0, SEEK_CUR);
1740 	}
1741 
1742 	if (!rec->no_buildid) {
1743 		process_buildids(rec);
1744 
1745 		if (rec->buildid_all)
1746 			dsos__hit_all(rec->session);
1747 	}
1748 	perf_session__write_header(rec->session, rec->evlist, fd, true);
1749 
1750 	return;
1751 }
1752 
1753 static int record__synthesize_workload(struct record *rec, bool tail)
1754 {
1755 	int err;
1756 	struct perf_thread_map *thread_map;
1757 	bool needs_mmap = rec->opts.synth & PERF_SYNTH_MMAP;
1758 
1759 	if (rec->opts.tail_synthesize != tail)
1760 		return 0;
1761 
1762 	thread_map = thread_map__new_by_tid(rec->evlist->workload.pid);
1763 	if (thread_map == NULL)
1764 		return -1;
1765 
1766 	err = perf_event__synthesize_thread_map(&rec->tool, thread_map,
1767 						 process_synthesized_event,
1768 						 &rec->session->machines.host,
1769 						 needs_mmap,
1770 						 rec->opts.sample_address);
1771 	perf_thread_map__put(thread_map);
1772 	return err;
1773 }
1774 
1775 static int write_finished_init(struct record *rec, bool tail)
1776 {
1777 	if (rec->opts.tail_synthesize != tail)
1778 		return 0;
1779 
1780 	return record__write(rec, NULL, &finished_init_event, sizeof(finished_init_event));
1781 }
1782 
1783 static int record__synthesize(struct record *rec, bool tail);
1784 
/*
 * Finish the current output and switch to a new timestamped output file.
 *
 * @at_exit: passed through to perf_data__switch(); when false, the byte
 *           counters are reset and tracking events are synthesized again
 *           for the new output.
 *
 * Returns the value of perf_data__switch(), or -EINVAL when the current
 * timestamp cannot be obtained.
 */
static int
record__switch_output(struct record *rec, bool at_exit)
{
	struct perf_data *data = &rec->data;
	char *new_filename = NULL;
	int fd, err;

	/* Same Size:      "2015122520103046"*/
	char timestamp[] = "InvalidTimestamp";

	record__aio_mmap_read_sync(rec);

	/* Tail variants: each is a no-op unless opts.tail_synthesize is set */
	write_finished_init(rec, true);

	record__synthesize(rec, true);
	if (target__none(&rec->opts.target))
		record__synthesize_workload(rec, true);

	rec->samples = 0;
	record__finish_output(rec);
	err = fetch_current_timestamp(timestamp, sizeof(timestamp));
	if (err) {
		pr_err("Failed to get current timestamp\n");
		return -EINVAL;
	}

	fd = perf_data__switch(data, timestamp,
				    rec->session->header.data_offset,
				    at_exit, &new_filename);
	/* Start counting from zero again only when recording continues */
	if (fd >= 0 && !at_exit) {
		rec->bytes_written = 0;
		rec->session->header.data_size = 0;
	}

	if (!quiet)
		fprintf(stderr, "[ perf record: Dump %s.%s ]\n",
			data->path, timestamp);

	if (rec->switch_output.num_files) {
		/*
		 * Rotate through a fixed number of file name slots: advance
		 * to the next slot (wrapping), remove the file recorded
		 * there, and store the new file name in its place.
		 */
		int n = rec->switch_output.cur_file + 1;

		if (n >= rec->switch_output.num_files)
			n = 0;
		rec->switch_output.cur_file = n;
		if (rec->switch_output.filenames[n]) {
			remove(rec->switch_output.filenames[n]);
			zfree(&rec->switch_output.filenames[n]);
		}
		rec->switch_output.filenames[n] = new_filename;
	} else {
		free(new_filename);
	}

	/* Output tracking events */
	if (!at_exit) {
		record__synthesize(rec, false);

		/*
		 * In 'perf record --switch-output' without -a,
		 * record__synthesize() in record__switch_output() won't
		 * generate tracking events because there's no thread_map
		 * in evlist. Which causes newly created perf.data doesn't
		 * contain map and comm information.
		 * Create a fake thread_map and directly call
		 * perf_event__synthesize_thread_map() for those events.
		 */
		if (target__none(&rec->opts.target))
			record__synthesize_workload(rec, false);
		write_finished_init(rec, false);
	}
	return fd;
}
1857 
1858 static void __record__save_lost_samples(struct record *rec, struct evsel *evsel,
1859 					struct perf_record_lost_samples *lost,
1860 					int cpu_idx, int thread_idx, u64 lost_count,
1861 					u16 misc_flag)
1862 {
1863 	struct perf_sample_id *sid;
1864 	struct perf_sample sample = {};
1865 	int id_hdr_size;
1866 
1867 	lost->lost = lost_count;
1868 	if (evsel->core.ids) {
1869 		sid = xyarray__entry(evsel->core.sample_id, cpu_idx, thread_idx);
1870 		sample.id = sid->id;
1871 	}
1872 
1873 	id_hdr_size = perf_event__synthesize_id_sample((void *)(lost + 1),
1874 						       evsel->core.attr.sample_type, &sample);
1875 	lost->header.size = sizeof(*lost) + id_hdr_size;
1876 	lost->header.misc = misc_flag;
1877 	record__write(rec, NULL, lost, lost->header.size);
1878 }
1879 
1880 static void record__read_lost_samples(struct record *rec)
1881 {
1882 	struct perf_session *session = rec->session;
1883 	struct perf_record_lost_samples *lost;
1884 	struct evsel *evsel;
1885 
1886 	/* there was an error during record__open */
1887 	if (session->evlist == NULL)
1888 		return;
1889 
1890 	lost = zalloc(PERF_SAMPLE_MAX_SIZE);
1891 	if (lost == NULL) {
1892 		pr_debug("Memory allocation failed\n");
1893 		return;
1894 	}
1895 
1896 	lost->header.type = PERF_RECORD_LOST_SAMPLES;
1897 
1898 	evlist__for_each_entry(session->evlist, evsel) {
1899 		struct xyarray *xy = evsel->core.sample_id;
1900 		u64 lost_count;
1901 
1902 		if (xy == NULL || evsel->core.fd == NULL)
1903 			continue;
1904 		if (xyarray__max_x(evsel->core.fd) != xyarray__max_x(xy) ||
1905 		    xyarray__max_y(evsel->core.fd) != xyarray__max_y(xy)) {
1906 			pr_debug("Unmatched FD vs. sample ID: skip reading LOST count\n");
1907 			continue;
1908 		}
1909 
1910 		for (int x = 0; x < xyarray__max_x(xy); x++) {
1911 			for (int y = 0; y < xyarray__max_y(xy); y++) {
1912 				struct perf_counts_values count;
1913 
1914 				if (perf_evsel__read(&evsel->core, x, y, &count) < 0) {
1915 					pr_debug("read LOST count failed\n");
1916 					goto out;
1917 				}
1918 
1919 				if (count.lost) {
1920 					__record__save_lost_samples(rec, evsel, lost,
1921 								    x, y, count.lost, 0);
1922 				}
1923 			}
1924 		}
1925 
1926 		lost_count = perf_bpf_filter__lost_count(evsel);
1927 		if (lost_count)
1928 			__record__save_lost_samples(rec, evsel, lost, 0, 0, lost_count,
1929 						    PERF_RECORD_MISC_LOST_SAMPLES_BPF);
1930 	}
1931 out:
1932 	free(lost);
1933 }
1934 
1935 static volatile sig_atomic_t workload_exec_errno;
1936 
1937 /*
1938  * evlist__prepare_workload will send a SIGUSR1
1939  * if the fork fails, since we asked by setting its
1940  * want_signal to true.
1941  */
1942 static void workload_exec_failed_signal(int signo __maybe_unused,
1943 					siginfo_t *info,
1944 					void *ucontext __maybe_unused)
1945 {
1946 	workload_exec_errno = info->si_value.sival_int;
1947 	done = 1;
1948 	child_finished = 1;
1949 }
1950 
1951 static void snapshot_sig_handler(int sig);
1952 static void alarm_sig_handler(int sig);
1953 
1954 static const struct perf_event_mmap_page *evlist__pick_pc(struct evlist *evlist)
1955 {
1956 	if (evlist) {
1957 		if (evlist->mmap && evlist->mmap[0].core.base)
1958 			return evlist->mmap[0].core.base;
1959 		if (evlist->overwrite_mmap && evlist->overwrite_mmap[0].core.base)
1960 			return evlist->overwrite_mmap[0].core.base;
1961 	}
1962 	return NULL;
1963 }
1964 
1965 static const struct perf_event_mmap_page *record__pick_pc(struct record *rec)
1966 {
1967 	const struct perf_event_mmap_page *pc = evlist__pick_pc(rec->evlist);
1968 	if (pc)
1969 		return pc;
1970 	return NULL;
1971 }
1972 
/*
 * Emit the synthesized (non-sample) events that describe the recording
 * environment: pipe header data, time conversion, id index, auxtrace info,
 * kernel and module maps, guest machines, extra attributes, thread and cpu
 * maps, bpf and cgroup events and finally the existing threads.
 *
 * Only runs when @tail matches opts.tail_synthesize; otherwise returns 0.
 * Returns 0 or a positive value on success paths and a negative error when
 * a mandatory step fails.  Kernel/module map, bpf and cgroup failures are
 * only warned about.
 */
static int record__synthesize(struct record *rec, bool tail)
{
	struct perf_session *session = rec->session;
	struct machine *machine = &session->machines.host;
	struct perf_data *data = &rec->data;
	struct record_opts *opts = &rec->opts;
	struct perf_tool *tool = &rec->tool;
	int err = 0;
	event_op f = process_synthesized_event;

	if (rec->opts.tail_synthesize != tail)
		return 0;

	if (data->is_pipe) {
		err = perf_event__synthesize_for_pipe(tool, session, data,
						      process_synthesized_event);
		if (err < 0)
			goto out;

		/* A non-negative result is added to the written byte count */
		rec->bytes_written += err;
	}

	err = perf_event__synth_time_conv(record__pick_pc(rec), tool,
					  process_synthesized_event, machine);
	if (err)
		goto out;

	/* Synthesize id_index before auxtrace_info */
	err = perf_event__synthesize_id_index(tool,
					      process_synthesized_event,
					      session->evlist, machine);
	if (err)
		goto out;

	if (rec->opts.full_auxtrace) {
		err = perf_event__synthesize_auxtrace_info(rec->itr, tool,
					session, process_synthesized_event);
		if (err)
			goto out;
	}

	/* Kernel and module maps are best effort: failures only warn once */
	if (!evlist__exclude_kernel(rec->evlist)) {
		err = perf_event__synthesize_kernel_mmap(tool, process_synthesized_event,
							 machine);
		WARN_ONCE(err < 0, "Couldn't record kernel reference relocation symbol\n"
				   "Symbol resolution may be skewed if relocation was used (e.g. kexec).\n"
				   "Check /proc/kallsyms permission or run as root.\n");

		err = perf_event__synthesize_modules(tool, process_synthesized_event,
						     machine);
		WARN_ONCE(err < 0, "Couldn't record kernel module information.\n"
				   "Symbol resolution may be skewed if relocation was used (e.g. kexec).\n"
				   "Check /proc/modules permission or run as root.\n");
	}

	if (perf_guest) {
		machines__process_guests(&session->machines,
					 perf_event__synthesize_guest_os, tool);
	}

	/* err is overwritten here, so the best-effort results above are dropped */
	err = perf_event__synthesize_extra_attr(&rec->tool,
						rec->evlist,
						process_synthesized_event,
						data->is_pipe);
	if (err)
		goto out;

	err = perf_event__synthesize_thread_map2(&rec->tool, rec->evlist->core.threads,
						 process_synthesized_event,
						NULL);
	if (err < 0) {
		pr_err("Couldn't synthesize thread map.\n");
		return err;
	}

	err = perf_event__synthesize_cpu_map(&rec->tool, rec->evlist->core.all_cpus,
					     process_synthesized_event, NULL);
	if (err < 0) {
		pr_err("Couldn't synthesize cpu map.\n");
		return err;
	}

	err = perf_event__synthesize_bpf_events(session, process_synthesized_event,
						machine, opts);
	if (err < 0) {
		pr_warning("Couldn't synthesize bpf events.\n");
		err = 0;
	}

	if (rec->opts.synth & PERF_SYNTH_CGROUP) {
		err = perf_event__synthesize_cgroups(tool, process_synthesized_event,
						     machine);
		if (err < 0) {
			pr_warning("Couldn't synthesize cgroup events.\n");
			err = 0;
		}
	}

	/*
	 * With more than one synthesis thread, switch to the locked
	 * callback, guarded by synth_lock, for the thread synthesis below.
	 */
	if (rec->opts.nr_threads_synthesize > 1) {
		mutex_init(&synth_lock);
		perf_set_multithreaded();
		f = process_locked_synthesized_event;
	}

	if (rec->opts.synth & PERF_SYNTH_TASK) {
		bool needs_mmap = rec->opts.synth & PERF_SYNTH_MMAP;

		err = __machine__synthesize_threads(machine, tool, &opts->target,
						    rec->evlist->core.threads,
						    f, needs_mmap, opts->sample_address,
						    rec->opts.nr_threads_synthesize);
	}

	if (rec->opts.nr_threads_synthesize > 1) {
		perf_set_singlethreaded();
		mutex_destroy(&synth_lock);
	}

out:
	return err;
}
2094 
2095 static int record__process_signal_event(union perf_event *event __maybe_unused, void *data)
2096 {
2097 	struct record *rec = data;
2098 	pthread_kill(rec->thread_id, SIGUSR2);
2099 	return 0;
2100 }
2101 
2102 static int record__setup_sb_evlist(struct record *rec)
2103 {
2104 	struct record_opts *opts = &rec->opts;
2105 
2106 	if (rec->sb_evlist != NULL) {
2107 		/*
2108 		 * We get here if --switch-output-event populated the
2109 		 * sb_evlist, so associate a callback that will send a SIGUSR2
2110 		 * to the main thread.
2111 		 */
2112 		evlist__set_cb(rec->sb_evlist, record__process_signal_event, rec);
2113 		rec->thread_id = pthread_self();
2114 	}
2115 #ifdef HAVE_LIBBPF_SUPPORT
2116 	if (!opts->no_bpf_event) {
2117 		if (rec->sb_evlist == NULL) {
2118 			rec->sb_evlist = evlist__new();
2119 
2120 			if (rec->sb_evlist == NULL) {
2121 				pr_err("Couldn't create side band evlist.\n.");
2122 				return -1;
2123 			}
2124 		}
2125 
2126 		if (evlist__add_bpf_sb_event(rec->sb_evlist, &rec->session->header.env)) {
2127 			pr_err("Couldn't ask for PERF_RECORD_BPF_EVENT side band events.\n.");
2128 			return -1;
2129 		}
2130 	}
2131 #endif
2132 	if (evlist__start_sb_thread(rec->sb_evlist, &rec->opts.target)) {
2133 		pr_debug("Couldn't start the BPF side band thread:\nBPF programs starting from now on won't be annotatable\n");
2134 		opts->no_bpf_event = true;
2135 	}
2136 
2137 	return 0;
2138 }
2139 
2140 static int record__init_clock(struct record *rec)
2141 {
2142 	struct perf_session *session = rec->session;
2143 	struct timespec ref_clockid;
2144 	struct timeval ref_tod;
2145 	u64 ref;
2146 
2147 	if (!rec->opts.use_clockid)
2148 		return 0;
2149 
2150 	if (rec->opts.use_clockid && rec->opts.clockid_res_ns)
2151 		session->header.env.clock.clockid_res_ns = rec->opts.clockid_res_ns;
2152 
2153 	session->header.env.clock.clockid = rec->opts.clockid;
2154 
2155 	if (gettimeofday(&ref_tod, NULL) != 0) {
2156 		pr_err("gettimeofday failed, cannot set reference time.\n");
2157 		return -1;
2158 	}
2159 
2160 	if (clock_gettime(rec->opts.clockid, &ref_clockid)) {
2161 		pr_err("clock_gettime failed, cannot set reference time.\n");
2162 		return -1;
2163 	}
2164 
2165 	ref = (u64) ref_tod.tv_sec * NSEC_PER_SEC +
2166 	      (u64) ref_tod.tv_usec * NSEC_PER_USEC;
2167 
2168 	session->header.env.clock.tod_ns = ref;
2169 
2170 	ref = (u64) ref_clockid.tv_sec * NSEC_PER_SEC +
2171 	      (u64) ref_clockid.tv_nsec;
2172 
2173 	session->header.env.clock.clockid_ns = ref;
2174 	return 0;
2175 }
2176 
2177 static void hit_auxtrace_snapshot_trigger(struct record *rec)
2178 {
2179 	if (trigger_is_ready(&auxtrace_snapshot_trigger)) {
2180 		trigger_hit(&auxtrace_snapshot_trigger);
2181 		auxtrace_record__snapshot_started = 1;
2182 		if (auxtrace_record__snapshot_start(rec->itr))
2183 			trigger_error(&auxtrace_snapshot_trigger);
2184 	}
2185 }
2186 
2187 static int record__terminate_thread(struct record_thread *thread_data)
2188 {
2189 	int err;
2190 	enum thread_msg ack = THREAD_MSG__UNDEFINED;
2191 	pid_t tid = thread_data->tid;
2192 
2193 	close(thread_data->pipes.msg[1]);
2194 	thread_data->pipes.msg[1] = -1;
2195 	err = read(thread_data->pipes.ack[0], &ack, sizeof(ack));
2196 	if (err > 0)
2197 		pr_debug2("threads[%d]: sent %s\n", tid, thread_msg_tags[ack]);
2198 	else
2199 		pr_warning("threads[%d]: failed to receive termination notification from %d\n",
2200 			   thread->tid, tid);
2201 
2202 	return 0;
2203 }
2204 
2205 static int record__start_threads(struct record *rec)
2206 {
2207 	int t, tt, err, ret = 0, nr_threads = rec->nr_threads;
2208 	struct record_thread *thread_data = rec->thread_data;
2209 	sigset_t full, mask;
2210 	pthread_t handle;
2211 	pthread_attr_t attrs;
2212 
2213 	thread = &thread_data[0];
2214 
2215 	if (!record__threads_enabled(rec))
2216 		return 0;
2217 
2218 	sigfillset(&full);
2219 	if (sigprocmask(SIG_SETMASK, &full, &mask)) {
2220 		pr_err("Failed to block signals on threads start: %s\n", strerror(errno));
2221 		return -1;
2222 	}
2223 
2224 	pthread_attr_init(&attrs);
2225 	pthread_attr_setdetachstate(&attrs, PTHREAD_CREATE_DETACHED);
2226 
2227 	for (t = 1; t < nr_threads; t++) {
2228 		enum thread_msg msg = THREAD_MSG__UNDEFINED;
2229 
2230 #ifdef HAVE_PTHREAD_ATTR_SETAFFINITY_NP
2231 		pthread_attr_setaffinity_np(&attrs,
2232 					    MMAP_CPU_MASK_BYTES(&(thread_data[t].mask->affinity)),
2233 					    (cpu_set_t *)(thread_data[t].mask->affinity.bits));
2234 #endif
2235 		if (pthread_create(&handle, &attrs, record__thread, &thread_data[t])) {
2236 			for (tt = 1; tt < t; tt++)
2237 				record__terminate_thread(&thread_data[t]);
2238 			pr_err("Failed to start threads: %s\n", strerror(errno));
2239 			ret = -1;
2240 			goto out_err;
2241 		}
2242 
2243 		err = read(thread_data[t].pipes.ack[0], &msg, sizeof(msg));
2244 		if (err > 0)
2245 			pr_debug2("threads[%d]: sent %s\n", rec->thread_data[t].tid,
2246 				  thread_msg_tags[msg]);
2247 		else
2248 			pr_warning("threads[%d]: failed to receive start notification from %d\n",
2249 				   thread->tid, rec->thread_data[t].tid);
2250 	}
2251 
2252 	sched_setaffinity(0, MMAP_CPU_MASK_BYTES(&thread->mask->affinity),
2253 			(cpu_set_t *)thread->mask->affinity.bits);
2254 
2255 	pr_debug("threads[%d]: started on cpu%d\n", thread->tid, sched_getcpu());
2256 
2257 out_err:
2258 	pthread_attr_destroy(&attrs);
2259 
2260 	if (sigprocmask(SIG_SETMASK, &mask, NULL)) {
2261 		pr_err("Failed to unblock signals on threads start: %s\n", strerror(errno));
2262 		ret = -1;
2263 	}
2264 
2265 	return ret;
2266 }
2267 
2268 static int record__stop_threads(struct record *rec)
2269 {
2270 	int t;
2271 	struct record_thread *thread_data = rec->thread_data;
2272 
2273 	for (t = 1; t < rec->nr_threads; t++)
2274 		record__terminate_thread(&thread_data[t]);
2275 
2276 	for (t = 0; t < rec->nr_threads; t++) {
2277 		rec->samples += thread_data[t].samples;
2278 		if (!record__threads_enabled(rec))
2279 			continue;
2280 		rec->session->bytes_transferred += thread_data[t].bytes_transferred;
2281 		rec->session->bytes_compressed += thread_data[t].bytes_compressed;
2282 		pr_debug("threads[%d]: samples=%lld, wakes=%ld, ", thread_data[t].tid,
2283 			 thread_data[t].samples, thread_data[t].waking);
2284 		if (thread_data[t].bytes_transferred && thread_data[t].bytes_compressed)
2285 			pr_debug("transferred=%" PRIu64 ", compressed=%" PRIu64 "\n",
2286 				 thread_data[t].bytes_transferred, thread_data[t].bytes_compressed);
2287 		else
2288 			pr_debug("written=%" PRIu64 "\n", thread_data[t].bytes_written);
2289 	}
2290 
2291 	return 0;
2292 }
2293 
2294 static unsigned long record__waking(struct record *rec)
2295 {
2296 	int t;
2297 	unsigned long waking = 0;
2298 	struct record_thread *thread_data = rec->thread_data;
2299 
2300 	for (t = 0; t < rec->nr_threads; t++)
2301 		waking += thread_data[t].waking;
2302 
2303 	return waking;
2304 }
2305 
2306 static int __cmd_record(struct record *rec, int argc, const char **argv)
2307 {
2308 	int err;
2309 	int status = 0;
2310 	const bool forks = argc > 0;
2311 	struct perf_tool *tool = &rec->tool;
2312 	struct record_opts *opts = &rec->opts;
2313 	struct perf_data *data = &rec->data;
2314 	struct perf_session *session;
2315 	bool disabled = false, draining = false;
2316 	int fd;
2317 	float ratio = 0;
2318 	enum evlist_ctl_cmd cmd = EVLIST_CTL_CMD_UNSUPPORTED;
2319 
2320 	atexit(record__sig_exit);
2321 	signal(SIGCHLD, sig_handler);
2322 	signal(SIGINT, sig_handler);
2323 	signal(SIGTERM, sig_handler);
2324 	signal(SIGSEGV, sigsegv_handler);
2325 
2326 	if (rec->opts.record_namespaces)
2327 		tool->namespace_events = true;
2328 
2329 	if (rec->opts.record_cgroup) {
2330 #ifdef HAVE_FILE_HANDLE
2331 		tool->cgroup_events = true;
2332 #else
2333 		pr_err("cgroup tracking is not supported\n");
2334 		return -1;
2335 #endif
2336 	}
2337 
2338 	if (rec->opts.auxtrace_snapshot_mode || rec->switch_output.enabled) {
2339 		signal(SIGUSR2, snapshot_sig_handler);
2340 		if (rec->opts.auxtrace_snapshot_mode)
2341 			trigger_on(&auxtrace_snapshot_trigger);
2342 		if (rec->switch_output.enabled)
2343 			trigger_on(&switch_output_trigger);
2344 	} else {
2345 		signal(SIGUSR2, SIG_IGN);
2346 	}
2347 
2348 	session = perf_session__new(data, tool);
2349 	if (IS_ERR(session)) {
2350 		pr_err("Perf session creation failed.\n");
2351 		return PTR_ERR(session);
2352 	}
2353 
2354 	if (record__threads_enabled(rec)) {
2355 		if (perf_data__is_pipe(&rec->data)) {
2356 			pr_err("Parallel trace streaming is not available in pipe mode.\n");
2357 			return -1;
2358 		}
2359 		if (rec->opts.full_auxtrace) {
2360 			pr_err("Parallel trace streaming is not available in AUX area tracing mode.\n");
2361 			return -1;
2362 		}
2363 	}
2364 
2365 	fd = perf_data__fd(data);
2366 	rec->session = session;
2367 
2368 	if (zstd_init(&session->zstd_data, rec->opts.comp_level) < 0) {
2369 		pr_err("Compression initialization failed.\n");
2370 		return -1;
2371 	}
2372 #ifdef HAVE_EVENTFD_SUPPORT
2373 	done_fd = eventfd(0, EFD_NONBLOCK);
2374 	if (done_fd < 0) {
2375 		pr_err("Failed to create wakeup eventfd, error: %m\n");
2376 		status = -1;
2377 		goto out_delete_session;
2378 	}
2379 	err = evlist__add_wakeup_eventfd(rec->evlist, done_fd);
2380 	if (err < 0) {
2381 		pr_err("Failed to add wakeup eventfd to poll list\n");
2382 		status = err;
2383 		goto out_delete_session;
2384 	}
2385 #endif // HAVE_EVENTFD_SUPPORT
2386 
2387 	session->header.env.comp_type  = PERF_COMP_ZSTD;
2388 	session->header.env.comp_level = rec->opts.comp_level;
2389 
2390 	if (rec->opts.kcore &&
2391 	    !record__kcore_readable(&session->machines.host)) {
2392 		pr_err("ERROR: kcore is not readable.\n");
2393 		return -1;
2394 	}
2395 
2396 	if (record__init_clock(rec))
2397 		return -1;
2398 
2399 	record__init_features(rec);
2400 
2401 	if (forks) {
2402 		err = evlist__prepare_workload(rec->evlist, &opts->target, argv, data->is_pipe,
2403 					       workload_exec_failed_signal);
2404 		if (err < 0) {
2405 			pr_err("Couldn't run the workload!\n");
2406 			status = err;
2407 			goto out_delete_session;
2408 		}
2409 	}
2410 
2411 	/*
2412 	 * If we have just single event and are sending data
2413 	 * through pipe, we need to force the ids allocation,
2414 	 * because we synthesize event name through the pipe
2415 	 * and need the id for that.
2416 	 */
2417 	if (data->is_pipe && rec->evlist->core.nr_entries == 1)
2418 		rec->opts.sample_id = true;
2419 
2420 	if (rec->timestamp_filename && perf_data__is_pipe(data)) {
2421 		rec->timestamp_filename = false;
2422 		pr_warning("WARNING: --timestamp-filename option is not available in pipe mode.\n");
2423 	}
2424 
2425 	evlist__uniquify_name(rec->evlist);
2426 
2427 	/* Debug message used by test scripts */
2428 	pr_debug3("perf record opening and mmapping events\n");
2429 	if (record__open(rec) != 0) {
2430 		err = -1;
2431 		goto out_free_threads;
2432 	}
2433 	/* Debug message used by test scripts */
2434 	pr_debug3("perf record done opening and mmapping events\n");
2435 	session->header.env.comp_mmap_len = session->evlist->core.mmap_len;
2436 
2437 	if (rec->opts.kcore) {
2438 		err = record__kcore_copy(&session->machines.host, data);
2439 		if (err) {
2440 			pr_err("ERROR: Failed to copy kcore\n");
2441 			goto out_free_threads;
2442 		}
2443 	}
2444 
2445 	/*
2446 	 * Normally perf_session__new would do this, but it doesn't have the
2447 	 * evlist.
2448 	 */
2449 	if (rec->tool.ordered_events && !evlist__sample_id_all(rec->evlist)) {
2450 		pr_warning("WARNING: No sample_id_all support, falling back to unordered processing\n");
2451 		rec->tool.ordered_events = false;
2452 	}
2453 
2454 	if (evlist__nr_groups(rec->evlist) == 0)
2455 		perf_header__clear_feat(&session->header, HEADER_GROUP_DESC);
2456 
2457 	if (data->is_pipe) {
2458 		err = perf_header__write_pipe(fd);
2459 		if (err < 0)
2460 			goto out_free_threads;
2461 	} else {
2462 		err = perf_session__write_header(session, rec->evlist, fd, false);
2463 		if (err < 0)
2464 			goto out_free_threads;
2465 	}
2466 
2467 	err = -1;
2468 	if (!rec->no_buildid
2469 	    && !perf_header__has_feat(&session->header, HEADER_BUILD_ID)) {
2470 		pr_err("Couldn't generate buildids. "
2471 		       "Use --no-buildid to profile anyway.\n");
2472 		goto out_free_threads;
2473 	}
2474 
2475 	err = record__setup_sb_evlist(rec);
2476 	if (err)
2477 		goto out_free_threads;
2478 
2479 	err = record__synthesize(rec, false);
2480 	if (err < 0)
2481 		goto out_free_threads;
2482 
2483 	if (rec->realtime_prio) {
2484 		struct sched_param param;
2485 
2486 		param.sched_priority = rec->realtime_prio;
2487 		if (sched_setscheduler(0, SCHED_FIFO, &param)) {
2488 			pr_err("Could not set realtime priority.\n");
2489 			err = -1;
2490 			goto out_free_threads;
2491 		}
2492 	}
2493 
2494 	if (record__start_threads(rec))
2495 		goto out_free_threads;
2496 
2497 	/*
2498 	 * When perf is starting the traced process, all the events
2499 	 * (apart from group members) have enable_on_exec=1 set,
2500 	 * so don't spoil it by prematurely enabling them.
2501 	 */
2502 	if (!target__none(&opts->target) && !opts->target.initial_delay)
2503 		evlist__enable(rec->evlist);
2504 
2505 	/*
2506 	 * Let the child rip
2507 	 */
2508 	if (forks) {
2509 		struct machine *machine = &session->machines.host;
2510 		union perf_event *event;
2511 		pid_t tgid;
2512 
2513 		event = malloc(sizeof(event->comm) + machine->id_hdr_size);
2514 		if (event == NULL) {
2515 			err = -ENOMEM;
2516 			goto out_child;
2517 		}
2518 
2519 		/*
2520 		 * Some H/W events are generated before COMM event
2521 		 * which is emitted during exec(), so perf script
2522 		 * cannot see a correct process name for those events.
2523 		 * Synthesize COMM event to prevent it.
2524 		 */
2525 		tgid = perf_event__synthesize_comm(tool, event,
2526 						   rec->evlist->workload.pid,
2527 						   process_synthesized_event,
2528 						   machine);
2529 		free(event);
2530 
2531 		if (tgid == -1)
2532 			goto out_child;
2533 
2534 		event = malloc(sizeof(event->namespaces) +
2535 			       (NR_NAMESPACES * sizeof(struct perf_ns_link_info)) +
2536 			       machine->id_hdr_size);
2537 		if (event == NULL) {
2538 			err = -ENOMEM;
2539 			goto out_child;
2540 		}
2541 
2542 		/*
2543 		 * Synthesize NAMESPACES event for the command specified.
2544 		 */
2545 		perf_event__synthesize_namespaces(tool, event,
2546 						  rec->evlist->workload.pid,
2547 						  tgid, process_synthesized_event,
2548 						  machine);
2549 		free(event);
2550 
2551 		evlist__start_workload(rec->evlist);
2552 	}
2553 
2554 	if (opts->target.initial_delay) {
2555 		pr_info(EVLIST_DISABLED_MSG);
2556 		if (opts->target.initial_delay > 0) {
2557 			usleep(opts->target.initial_delay * USEC_PER_MSEC);
2558 			evlist__enable(rec->evlist);
2559 			pr_info(EVLIST_ENABLED_MSG);
2560 		}
2561 	}
2562 
2563 	err = event_enable_timer__start(rec->evlist->eet);
2564 	if (err)
2565 		goto out_child;
2566 
2567 	/* Debug message used by test scripts */
2568 	pr_debug3("perf record has started\n");
2569 	fflush(stderr);
2570 
2571 	trigger_ready(&auxtrace_snapshot_trigger);
2572 	trigger_ready(&switch_output_trigger);
2573 	perf_hooks__invoke_record_start();
2574 
2575 	/*
2576 	 * Must write FINISHED_INIT so it will be seen after all other
2577 	 * synthesized user events, but before any regular events.
2578 	 */
2579 	err = write_finished_init(rec, false);
2580 	if (err < 0)
2581 		goto out_child;
2582 
2583 	for (;;) {
2584 		unsigned long long hits = thread->samples;
2585 
2586 		/*
2587 		 * rec->evlist->bkw_mmap_state is possible to be
2588 		 * BKW_MMAP_EMPTY here: when done == true and
2589 		 * hits != rec->samples in previous round.
2590 		 *
2591 		 * evlist__toggle_bkw_mmap ensure we never
2592 		 * convert BKW_MMAP_EMPTY to BKW_MMAP_DATA_PENDING.
2593 		 */
2594 		if (trigger_is_hit(&switch_output_trigger) || done || draining)
2595 			evlist__toggle_bkw_mmap(rec->evlist, BKW_MMAP_DATA_PENDING);
2596 
2597 		if (record__mmap_read_all(rec, false) < 0) {
2598 			trigger_error(&auxtrace_snapshot_trigger);
2599 			trigger_error(&switch_output_trigger);
2600 			err = -1;
2601 			goto out_child;
2602 		}
2603 
2604 		if (auxtrace_record__snapshot_started) {
2605 			auxtrace_record__snapshot_started = 0;
2606 			if (!trigger_is_error(&auxtrace_snapshot_trigger))
2607 				record__read_auxtrace_snapshot(rec, false);
2608 			if (trigger_is_error(&auxtrace_snapshot_trigger)) {
2609 				pr_err("AUX area tracing snapshot failed\n");
2610 				err = -1;
2611 				goto out_child;
2612 			}
2613 		}
2614 
2615 		if (trigger_is_hit(&switch_output_trigger)) {
2616 			/*
2617 			 * If switch_output_trigger is hit, the data in
2618 			 * overwritable ring buffer should have been collected,
2619 			 * so bkw_mmap_state should be set to BKW_MMAP_EMPTY.
2620 			 *
2621 			 * If SIGUSR2 raise after or during record__mmap_read_all(),
2622 			 * record__mmap_read_all() didn't collect data from
2623 			 * overwritable ring buffer. Read again.
2624 			 */
2625 			if (rec->evlist->bkw_mmap_state == BKW_MMAP_RUNNING)
2626 				continue;
2627 			trigger_ready(&switch_output_trigger);
2628 
2629 			/*
2630 			 * Reenable events in overwrite ring buffer after
2631 			 * record__mmap_read_all(): we should have collected
2632 			 * data from it.
2633 			 */
2634 			evlist__toggle_bkw_mmap(rec->evlist, BKW_MMAP_RUNNING);
2635 
2636 			if (!quiet)
2637 				fprintf(stderr, "[ perf record: dump data: Woken up %ld times ]\n",
2638 					record__waking(rec));
2639 			thread->waking = 0;
2640 			fd = record__switch_output(rec, false);
2641 			if (fd < 0) {
2642 				pr_err("Failed to switch to new file\n");
2643 				trigger_error(&switch_output_trigger);
2644 				err = fd;
2645 				goto out_child;
2646 			}
2647 
2648 			/* re-arm the alarm */
2649 			if (rec->switch_output.time)
2650 				alarm(rec->switch_output.time);
2651 		}
2652 
2653 		if (hits == thread->samples) {
2654 			if (done || draining)
2655 				break;
2656 			err = fdarray__poll(&thread->pollfd, -1);
2657 			/*
2658 			 * Propagate error, only if there's any. Ignore positive
2659 			 * number of returned events and interrupt error.
2660 			 */
2661 			if (err > 0 || (err < 0 && errno == EINTR))
2662 				err = 0;
2663 			thread->waking++;
2664 
2665 			if (fdarray__filter(&thread->pollfd, POLLERR | POLLHUP,
2666 					    record__thread_munmap_filtered, NULL) == 0)
2667 				draining = true;
2668 
2669 			err = record__update_evlist_pollfd_from_thread(rec, rec->evlist, thread);
2670 			if (err)
2671 				goto out_child;
2672 		}
2673 
2674 		if (evlist__ctlfd_process(rec->evlist, &cmd) > 0) {
2675 			switch (cmd) {
2676 			case EVLIST_CTL_CMD_SNAPSHOT:
2677 				hit_auxtrace_snapshot_trigger(rec);
2678 				evlist__ctlfd_ack(rec->evlist);
2679 				break;
2680 			case EVLIST_CTL_CMD_STOP:
2681 				done = 1;
2682 				break;
2683 			case EVLIST_CTL_CMD_ACK:
2684 			case EVLIST_CTL_CMD_UNSUPPORTED:
2685 			case EVLIST_CTL_CMD_ENABLE:
2686 			case EVLIST_CTL_CMD_DISABLE:
2687 			case EVLIST_CTL_CMD_EVLIST:
2688 			case EVLIST_CTL_CMD_PING:
2689 			default:
2690 				break;
2691 			}
2692 		}
2693 
2694 		err = event_enable_timer__process(rec->evlist->eet);
2695 		if (err < 0)
2696 			goto out_child;
2697 		if (err) {
2698 			err = 0;
2699 			done = 1;
2700 		}
2701 
2702 		/*
2703 		 * When perf is starting the traced process, at the end events
2704 		 * die with the process and we wait for that. Thus no need to
2705 		 * disable events in this case.
2706 		 */
2707 		if (done && !disabled && !target__none(&opts->target)) {
2708 			trigger_off(&auxtrace_snapshot_trigger);
2709 			evlist__disable(rec->evlist);
2710 			disabled = true;
2711 		}
2712 	}
2713 
2714 	trigger_off(&auxtrace_snapshot_trigger);
2715 	trigger_off(&switch_output_trigger);
2716 
2717 	if (opts->auxtrace_snapshot_on_exit)
2718 		record__auxtrace_snapshot_exit(rec);
2719 
2720 	if (forks && workload_exec_errno) {
2721 		char msg[STRERR_BUFSIZE], strevsels[2048];
2722 		const char *emsg = str_error_r(workload_exec_errno, msg, sizeof(msg));
2723 
2724 		evlist__scnprintf_evsels(rec->evlist, sizeof(strevsels), strevsels);
2725 
2726 		pr_err("Failed to collect '%s' for the '%s' workload: %s\n",
2727 			strevsels, argv[0], emsg);
2728 		err = -1;
2729 		goto out_child;
2730 	}
2731 
2732 	if (!quiet)
2733 		fprintf(stderr, "[ perf record: Woken up %ld times to write data ]\n",
2734 			record__waking(rec));
2735 
2736 	write_finished_init(rec, true);
2737 
2738 	if (target__none(&rec->opts.target))
2739 		record__synthesize_workload(rec, true);
2740 
2741 out_child:
2742 	record__stop_threads(rec);
2743 	record__mmap_read_all(rec, true);
2744 out_free_threads:
2745 	record__free_thread_data(rec);
2746 	evlist__finalize_ctlfd(rec->evlist);
2747 	record__aio_mmap_read_sync(rec);
2748 
2749 	if (rec->session->bytes_transferred && rec->session->bytes_compressed) {
2750 		ratio = (float)rec->session->bytes_transferred/(float)rec->session->bytes_compressed;
2751 		session->header.env.comp_ratio = ratio + 0.5;
2752 	}
2753 
2754 	if (forks) {
2755 		int exit_status;
2756 
2757 		if (!child_finished)
2758 			kill(rec->evlist->workload.pid, SIGTERM);
2759 
2760 		wait(&exit_status);
2761 
2762 		if (err < 0)
2763 			status = err;
2764 		else if (WIFEXITED(exit_status))
2765 			status = WEXITSTATUS(exit_status);
2766 		else if (WIFSIGNALED(exit_status))
2767 			signr = WTERMSIG(exit_status);
2768 	} else
2769 		status = err;
2770 
2771 	if (rec->off_cpu)
2772 		rec->bytes_written += off_cpu_write(rec->session);
2773 
2774 	record__read_lost_samples(rec);
2775 	record__synthesize(rec, true);
2776 	/* this will be recalculated during process_buildids() */
2777 	rec->samples = 0;
2778 
2779 	if (!err) {
2780 		if (!rec->timestamp_filename) {
2781 			record__finish_output(rec);
2782 		} else {
2783 			fd = record__switch_output(rec, true);
2784 			if (fd < 0) {
2785 				status = fd;
2786 				goto out_delete_session;
2787 			}
2788 		}
2789 	}
2790 
2791 	perf_hooks__invoke_record_end();
2792 
2793 	if (!err && !quiet) {
2794 		char samples[128];
2795 		const char *postfix = rec->timestamp_filename ?
2796 					".<timestamp>" : "";
2797 
2798 		if (rec->samples && !rec->opts.full_auxtrace)
2799 			scnprintf(samples, sizeof(samples),
2800 				  " (%" PRIu64 " samples)", rec->samples);
2801 		else
2802 			samples[0] = '\0';
2803 
2804 		fprintf(stderr,	"[ perf record: Captured and wrote %.3f MB %s%s%s",
2805 			perf_data__size(data) / 1024.0 / 1024.0,
2806 			data->path, postfix, samples);
2807 		if (ratio) {
2808 			fprintf(stderr,	", compressed (original %.3f MB, ratio is %.3f)",
2809 					rec->session->bytes_transferred / 1024.0 / 1024.0,
2810 					ratio);
2811 		}
2812 		fprintf(stderr, " ]\n");
2813 	}
2814 
2815 out_delete_session:
2816 #ifdef HAVE_EVENTFD_SUPPORT
2817 	if (done_fd >= 0) {
2818 		fd = done_fd;
2819 		done_fd = -1;
2820 
2821 		close(fd);
2822 	}
2823 #endif
2824 	zstd_fini(&session->zstd_data);
2825 	perf_session__delete(session);
2826 
2827 	if (!opts->no_bpf_event)
2828 		evlist__stop_sb_thread(rec->sb_evlist);
2829 	return status;
2830 }
2831 
2832 static void callchain_debug(struct callchain_param *callchain)
2833 {
2834 	static const char *str[CALLCHAIN_MAX] = { "NONE", "FP", "DWARF", "LBR" };
2835 
2836 	pr_debug("callchain: type %s\n", str[callchain->record_mode]);
2837 
2838 	if (callchain->record_mode == CALLCHAIN_DWARF)
2839 		pr_debug("callchain: stack dump size %d\n",
2840 			 callchain->dump_size);
2841 }
2842 
2843 int record_opts__parse_callchain(struct record_opts *record,
2844 				 struct callchain_param *callchain,
2845 				 const char *arg, bool unset)
2846 {
2847 	int ret;
2848 	callchain->enabled = !unset;
2849 
2850 	/* --no-call-graph */
2851 	if (unset) {
2852 		callchain->record_mode = CALLCHAIN_NONE;
2853 		pr_debug("callchain: disabled\n");
2854 		return 0;
2855 	}
2856 
2857 	ret = parse_callchain_record_opt(arg, callchain);
2858 	if (!ret) {
2859 		/* Enable data address sampling for DWARF unwind. */
2860 		if (callchain->record_mode == CALLCHAIN_DWARF)
2861 			record->sample_address = true;
2862 		callchain_debug(callchain);
2863 	}
2864 
2865 	return ret;
2866 }
2867 
2868 int record_parse_callchain_opt(const struct option *opt,
2869 			       const char *arg,
2870 			       int unset)
2871 {
2872 	return record_opts__parse_callchain(opt->value, &callchain_param, arg, unset);
2873 }
2874 
2875 int record_callchain_opt(const struct option *opt,
2876 			 const char *arg __maybe_unused,
2877 			 int unset __maybe_unused)
2878 {
2879 	struct callchain_param *callchain = opt->value;
2880 
2881 	callchain->enabled = true;
2882 
2883 	if (callchain->record_mode == CALLCHAIN_NONE)
2884 		callchain->record_mode = CALLCHAIN_FP;
2885 
2886 	callchain_debug(callchain);
2887 	return 0;
2888 }
2889 
2890 static int perf_record_config(const char *var, const char *value, void *cb)
2891 {
2892 	struct record *rec = cb;
2893 
2894 	if (!strcmp(var, "record.build-id")) {
2895 		if (!strcmp(value, "cache"))
2896 			rec->no_buildid_cache = false;
2897 		else if (!strcmp(value, "no-cache"))
2898 			rec->no_buildid_cache = true;
2899 		else if (!strcmp(value, "skip"))
2900 			rec->no_buildid = true;
2901 		else if (!strcmp(value, "mmap"))
2902 			rec->buildid_mmap = true;
2903 		else
2904 			return -1;
2905 		return 0;
2906 	}
2907 	if (!strcmp(var, "record.call-graph")) {
2908 		var = "call-graph.record-mode";
2909 		return perf_default_config(var, value, cb);
2910 	}
2911 #ifdef HAVE_AIO_SUPPORT
2912 	if (!strcmp(var, "record.aio")) {
2913 		rec->opts.nr_cblocks = strtol(value, NULL, 0);
2914 		if (!rec->opts.nr_cblocks)
2915 			rec->opts.nr_cblocks = nr_cblocks_default;
2916 	}
2917 #endif
2918 	if (!strcmp(var, "record.debuginfod")) {
2919 		rec->debuginfod.urls = strdup(value);
2920 		if (!rec->debuginfod.urls)
2921 			return -ENOMEM;
2922 		rec->debuginfod.set = true;
2923 	}
2924 
2925 	return 0;
2926 }
2927 
2928 static int record__parse_event_enable_time(const struct option *opt, const char *str, int unset)
2929 {
2930 	struct record *rec = (struct record *)opt->value;
2931 
2932 	return evlist__parse_event_enable_time(rec->evlist, &rec->opts, str, unset);
2933 }
2934 
2935 static int record__parse_affinity(const struct option *opt, const char *str, int unset)
2936 {
2937 	struct record_opts *opts = (struct record_opts *)opt->value;
2938 
2939 	if (unset || !str)
2940 		return 0;
2941 
2942 	if (!strcasecmp(str, "node"))
2943 		opts->affinity = PERF_AFFINITY_NODE;
2944 	else if (!strcasecmp(str, "cpu"))
2945 		opts->affinity = PERF_AFFINITY_CPU;
2946 
2947 	return 0;
2948 }
2949 
2950 static int record__mmap_cpu_mask_alloc(struct mmap_cpu_mask *mask, int nr_bits)
2951 {
2952 	mask->nbits = nr_bits;
2953 	mask->bits = bitmap_zalloc(mask->nbits);
2954 	if (!mask->bits)
2955 		return -ENOMEM;
2956 
2957 	return 0;
2958 }
2959 
2960 static void record__mmap_cpu_mask_free(struct mmap_cpu_mask *mask)
2961 {
2962 	bitmap_free(mask->bits);
2963 	mask->nbits = 0;
2964 }
2965 
2966 static int record__thread_mask_alloc(struct thread_mask *mask, int nr_bits)
2967 {
2968 	int ret;
2969 
2970 	ret = record__mmap_cpu_mask_alloc(&mask->maps, nr_bits);
2971 	if (ret) {
2972 		mask->affinity.bits = NULL;
2973 		return ret;
2974 	}
2975 
2976 	ret = record__mmap_cpu_mask_alloc(&mask->affinity, nr_bits);
2977 	if (ret) {
2978 		record__mmap_cpu_mask_free(&mask->maps);
2979 		mask->maps.bits = NULL;
2980 	}
2981 
2982 	return ret;
2983 }
2984 
2985 static void record__thread_mask_free(struct thread_mask *mask)
2986 {
2987 	record__mmap_cpu_mask_free(&mask->maps);
2988 	record__mmap_cpu_mask_free(&mask->affinity);
2989 }
2990 
2991 static int record__parse_threads(const struct option *opt, const char *str, int unset)
2992 {
2993 	int s;
2994 	struct record_opts *opts = opt->value;
2995 
2996 	if (unset || !str || !strlen(str)) {
2997 		opts->threads_spec = THREAD_SPEC__CPU;
2998 	} else {
2999 		for (s = 1; s < THREAD_SPEC__MAX; s++) {
3000 			if (s == THREAD_SPEC__USER) {
3001 				opts->threads_user_spec = strdup(str);
3002 				if (!opts->threads_user_spec)
3003 					return -ENOMEM;
3004 				opts->threads_spec = THREAD_SPEC__USER;
3005 				break;
3006 			}
3007 			if (!strncasecmp(str, thread_spec_tags[s], strlen(thread_spec_tags[s]))) {
3008 				opts->threads_spec = s;
3009 				break;
3010 			}
3011 		}
3012 	}
3013 
3014 	if (opts->threads_spec == THREAD_SPEC__USER)
3015 		pr_debug("threads_spec: %s\n", opts->threads_user_spec);
3016 	else
3017 		pr_debug("threads_spec: %s\n", thread_spec_tags[opts->threads_spec]);
3018 
3019 	return 0;
3020 }
3021 
3022 static int parse_output_max_size(const struct option *opt,
3023 				 const char *str, int unset)
3024 {
3025 	unsigned long *s = (unsigned long *)opt->value;
3026 	static struct parse_tag tags_size[] = {
3027 		{ .tag  = 'B', .mult = 1       },
3028 		{ .tag  = 'K', .mult = 1 << 10 },
3029 		{ .tag  = 'M', .mult = 1 << 20 },
3030 		{ .tag  = 'G', .mult = 1 << 30 },
3031 		{ .tag  = 0 },
3032 	};
3033 	unsigned long val;
3034 
3035 	if (unset) {
3036 		*s = 0;
3037 		return 0;
3038 	}
3039 
3040 	val = parse_tag_value(str, tags_size);
3041 	if (val != (unsigned long) -1) {
3042 		*s = val;
3043 		return 0;
3044 	}
3045 
3046 	return -1;
3047 }
3048 
3049 static int record__parse_mmap_pages(const struct option *opt,
3050 				    const char *str,
3051 				    int unset __maybe_unused)
3052 {
3053 	struct record_opts *opts = opt->value;
3054 	char *s, *p;
3055 	unsigned int mmap_pages;
3056 	int ret;
3057 
3058 	if (!str)
3059 		return -EINVAL;
3060 
3061 	s = strdup(str);
3062 	if (!s)
3063 		return -ENOMEM;
3064 
3065 	p = strchr(s, ',');
3066 	if (p)
3067 		*p = '\0';
3068 
3069 	if (*s) {
3070 		ret = __evlist__parse_mmap_pages(&mmap_pages, s);
3071 		if (ret)
3072 			goto out_free;
3073 		opts->mmap_pages = mmap_pages;
3074 	}
3075 
3076 	if (!p) {
3077 		ret = 0;
3078 		goto out_free;
3079 	}
3080 
3081 	ret = __evlist__parse_mmap_pages(&mmap_pages, p + 1);
3082 	if (ret)
3083 		goto out_free;
3084 
3085 	opts->auxtrace_mmap_pages = mmap_pages;
3086 
3087 out_free:
3088 	free(s);
3089 	return ret;
3090 }
3091 
3092 void __weak arch__add_leaf_frame_record_opts(struct record_opts *opts __maybe_unused)
3093 {
3094 }
3095 
3096 static int parse_control_option(const struct option *opt,
3097 				const char *str,
3098 				int unset __maybe_unused)
3099 {
3100 	struct record_opts *opts = opt->value;
3101 
3102 	return evlist__parse_control(str, &opts->ctl_fd, &opts->ctl_fd_ack, &opts->ctl_fd_close);
3103 }
3104 
3105 static void switch_output_size_warn(struct record *rec)
3106 {
3107 	u64 wakeup_size = evlist__mmap_size(rec->opts.mmap_pages);
3108 	struct switch_output *s = &rec->switch_output;
3109 
3110 	wakeup_size /= 2;
3111 
3112 	if (s->size < wakeup_size) {
3113 		char buf[100];
3114 
3115 		unit_number__scnprintf(buf, sizeof(buf), wakeup_size);
3116 		pr_warning("WARNING: switch-output data size lower than "
3117 			   "wakeup kernel buffer size (%s) "
3118 			   "expect bigger perf.data sizes\n", buf);
3119 	}
3120 }
3121 
3122 static int switch_output_setup(struct record *rec)
3123 {
3124 	struct switch_output *s = &rec->switch_output;
3125 	static struct parse_tag tags_size[] = {
3126 		{ .tag  = 'B', .mult = 1       },
3127 		{ .tag  = 'K', .mult = 1 << 10 },
3128 		{ .tag  = 'M', .mult = 1 << 20 },
3129 		{ .tag  = 'G', .mult = 1 << 30 },
3130 		{ .tag  = 0 },
3131 	};
3132 	static struct parse_tag tags_time[] = {
3133 		{ .tag  = 's', .mult = 1        },
3134 		{ .tag  = 'm', .mult = 60       },
3135 		{ .tag  = 'h', .mult = 60*60    },
3136 		{ .tag  = 'd', .mult = 60*60*24 },
3137 		{ .tag  = 0 },
3138 	};
3139 	unsigned long val;
3140 
3141 	/*
3142 	 * If we're using --switch-output-events, then we imply its
3143 	 * --switch-output=signal, as we'll send a SIGUSR2 from the side band
3144 	 *  thread to its parent.
3145 	 */
3146 	if (rec->switch_output_event_set) {
3147 		if (record__threads_enabled(rec)) {
3148 			pr_warning("WARNING: --switch-output-event option is not available in parallel streaming mode.\n");
3149 			return 0;
3150 		}
3151 		goto do_signal;
3152 	}
3153 
3154 	if (!s->set)
3155 		return 0;
3156 
3157 	if (record__threads_enabled(rec)) {
3158 		pr_warning("WARNING: --switch-output option is not available in parallel streaming mode.\n");
3159 		return 0;
3160 	}
3161 
3162 	if (!strcmp(s->str, "signal")) {
3163 do_signal:
3164 		s->signal = true;
3165 		pr_debug("switch-output with SIGUSR2 signal\n");
3166 		goto enabled;
3167 	}
3168 
3169 	val = parse_tag_value(s->str, tags_size);
3170 	if (val != (unsigned long) -1) {
3171 		s->size = val;
3172 		pr_debug("switch-output with %s size threshold\n", s->str);
3173 		goto enabled;
3174 	}
3175 
3176 	val = parse_tag_value(s->str, tags_time);
3177 	if (val != (unsigned long) -1) {
3178 		s->time = val;
3179 		pr_debug("switch-output with %s time threshold (%lu seconds)\n",
3180 			 s->str, s->time);
3181 		goto enabled;
3182 	}
3183 
3184 	return -1;
3185 
3186 enabled:
3187 	rec->timestamp_filename = true;
3188 	s->enabled              = true;
3189 
3190 	if (s->size && !rec->opts.no_buffering)
3191 		switch_output_size_warn(rec);
3192 
3193 	return 0;
3194 }
3195 
/* Usage synopsis lines for 'perf record'; the array is NULL terminated. */
static const char * const __record_usage[] = {
	"perf record [<options>] [<command>]",
	"perf record [<options>] -- <command> [<options>]",
	NULL
};
/* Non-static alias of the usage table above. */
const char * const *record_usage = __record_usage;
3202 
3203 static int build_id__process_mmap(struct perf_tool *tool, union perf_event *event,
3204 				  struct perf_sample *sample, struct machine *machine)
3205 {
3206 	/*
3207 	 * We already have the kernel maps, put in place via perf_session__create_kernel_maps()
3208 	 * no need to add them twice.
3209 	 */
3210 	if (!(event->header.misc & PERF_RECORD_MISC_USER))
3211 		return 0;
3212 	return perf_event__process_mmap(tool, event, sample, machine);
3213 }
3214 
3215 static int build_id__process_mmap2(struct perf_tool *tool, union perf_event *event,
3216 				   struct perf_sample *sample, struct machine *machine)
3217 {
3218 	/*
3219 	 * We already have the kernel maps, put in place via perf_session__create_kernel_maps()
3220 	 * no need to add them twice.
3221 	 */
3222 	if (!(event->header.misc & PERF_RECORD_MISC_USER))
3223 		return 0;
3224 
3225 	return perf_event__process_mmap2(tool, event, sample, machine);
3226 }
3227 
3228 static int process_timestamp_boundary(struct perf_tool *tool,
3229 				      union perf_event *event __maybe_unused,
3230 				      struct perf_sample *sample,
3231 				      struct machine *machine __maybe_unused)
3232 {
3233 	struct record *rec = container_of(tool, struct record, tool);
3234 
3235 	set_timestamp_boundary(rec, sample->time);
3236 	return 0;
3237 }
3238 
3239 static int parse_record_synth_option(const struct option *opt,
3240 				     const char *str,
3241 				     int unset __maybe_unused)
3242 {
3243 	struct record_opts *opts = opt->value;
3244 	char *p = strdup(str);
3245 
3246 	if (p == NULL)
3247 		return -1;
3248 
3249 	opts->synth = parse_synth_opt(p);
3250 	free(p);
3251 
3252 	if (opts->synth < 0) {
3253 		pr_err("Invalid synth option: %s\n", str);
3254 		return -1;
3255 	}
3256 	return 0;
3257 }
3258 
/*
 * XXX Ideally would be local to cmd_record() and passed to a record__new
 * because we need to have access to it in record__exit, that is called
 * after cmd_record() exits, but since record_options need to be accessible to
 * builtin-script, leave it here.
 *
 * At least we don't touch it in all the other functions here directly.
 *
 * Just say no to tons of global variables, sigh.
 *
 * The initializer below holds the values in effect before cmd_record()
 * processes the perf config and the command line.
 */
static struct record record = {
	/* Recording option defaults. */
	.opts = {
		.sample_time	     = true,
		.mmap_pages	     = UINT_MAX,
		.user_freq	     = UINT_MAX,
		.user_interval	     = ULLONG_MAX,
		.freq		     = 4000,
		.target		     = {
			.uses_mmap   = true,
			.default_per_cpu = true,
		},
		.mmap_flush          = MMAP_FLUSH_DEFAULT,
		.nr_threads_synthesize = 1,
		/* No control / ack file descriptors until --control is parsed. */
		.ctl_fd              = -1,
		.ctl_fd_ack          = -1,
		.synth               = PERF_SYNTH_ALL,
	},
	/* perf_tool event callbacks. */
	.tool = {
		.sample		= process_sample_event,
		.fork		= perf_event__process_fork,
		.exit		= perf_event__process_exit,
		.comm		= perf_event__process_comm,
		.namespaces	= perf_event__process_namespaces,
		/* The mmap wrappers ignore events without PERF_RECORD_MISC_USER. */
		.mmap		= build_id__process_mmap,
		.mmap2		= build_id__process_mmap2,
		/* These two only update the timestamp boundary. */
		.itrace_start	= process_timestamp_boundary,
		.aux		= process_timestamp_boundary,
		.ordered_events	= true,
	},
};
3299 
/* Help text of --call-graph: the common callchain help plus the default mode. */
const char record_callchain_help[] = CALLCHAIN_RECORD_HELP
	"\n\t\t\t\tDefault: fp";

/* Set by --dry-run; cmd_record() checks it once option handling is done. */
static bool dry_run;

/* Callback argument of -e/--event: points the event parser at record.evlist. */
static struct parse_events_option_args parse_events_option_args = {
	.evlistp = &record.evlist,
};

/* Callback argument of --switch-output-event: points at record.sb_evlist. */
static struct parse_events_option_args switch_output_parse_events_option_args = {
	.evlistp = &record.sb_evlist,
};
3312 
3313 /*
3314  * XXX Will stay a global variable till we fix builtin-script.c to stop messing
3315  * with it and switch to use the library functions in perf_evlist that came
3316  * from builtin-record.c, i.e. use record_opts,
3317  * evlist__prepare_workload, etc instead of fork+exec'in 'perf record',
3318  * using pipes, etc.
3319  */
3320 static struct option __record_options[] = {
3321 	OPT_CALLBACK('e', "event", &parse_events_option_args, "event",
3322 		     "event selector. use 'perf list' to list available events",
3323 		     parse_events_option),
3324 	OPT_CALLBACK(0, "filter", &record.evlist, "filter",
3325 		     "event filter", parse_filter),
3326 	OPT_CALLBACK_NOOPT(0, "exclude-perf", &record.evlist,
3327 			   NULL, "don't record events from perf itself",
3328 			   exclude_perf),
3329 	OPT_STRING('p', "pid", &record.opts.target.pid, "pid",
3330 		    "record events on existing process id"),
3331 	OPT_STRING('t', "tid", &record.opts.target.tid, "tid",
3332 		    "record events on existing thread id"),
3333 	OPT_INTEGER('r', "realtime", &record.realtime_prio,
3334 		    "collect data with this RT SCHED_FIFO priority"),
3335 	OPT_BOOLEAN(0, "no-buffering", &record.opts.no_buffering,
3336 		    "collect data without buffering"),
3337 	OPT_BOOLEAN('R', "raw-samples", &record.opts.raw_samples,
3338 		    "collect raw sample records from all opened counters"),
3339 	OPT_BOOLEAN('a', "all-cpus", &record.opts.target.system_wide,
3340 			    "system-wide collection from all CPUs"),
3341 	OPT_STRING('C', "cpu", &record.opts.target.cpu_list, "cpu",
3342 		    "list of cpus to monitor"),
3343 	OPT_U64('c', "count", &record.opts.user_interval, "event period to sample"),
3344 	OPT_STRING('o', "output", &record.data.path, "file",
3345 		    "output file name"),
3346 	OPT_BOOLEAN_SET('i', "no-inherit", &record.opts.no_inherit,
3347 			&record.opts.no_inherit_set,
3348 			"child tasks do not inherit counters"),
3349 	OPT_BOOLEAN(0, "tail-synthesize", &record.opts.tail_synthesize,
3350 		    "synthesize non-sample events at the end of output"),
3351 	OPT_BOOLEAN(0, "overwrite", &record.opts.overwrite, "use overwrite mode"),
3352 	OPT_BOOLEAN(0, "no-bpf-event", &record.opts.no_bpf_event, "do not record bpf events"),
3353 	OPT_BOOLEAN(0, "strict-freq", &record.opts.strict_freq,
3354 		    "Fail if the specified frequency can't be used"),
3355 	OPT_CALLBACK('F', "freq", &record.opts, "freq or 'max'",
3356 		     "profile at this frequency",
3357 		      record__parse_freq),
3358 	OPT_CALLBACK('m', "mmap-pages", &record.opts, "pages[,pages]",
3359 		     "number of mmap data pages and AUX area tracing mmap pages",
3360 		     record__parse_mmap_pages),
3361 	OPT_CALLBACK(0, "mmap-flush", &record.opts, "number",
3362 		     "Minimal number of bytes that is extracted from mmap data pages (default: 1)",
3363 		     record__mmap_flush_parse),
3364 	OPT_CALLBACK_NOOPT('g', NULL, &callchain_param,
3365 			   NULL, "enables call-graph recording" ,
3366 			   &record_callchain_opt),
3367 	OPT_CALLBACK(0, "call-graph", &record.opts,
3368 		     "record_mode[,record_size]", record_callchain_help,
3369 		     &record_parse_callchain_opt),
3370 	OPT_INCR('v', "verbose", &verbose,
3371 		    "be more verbose (show counter open errors, etc)"),
3372 	OPT_BOOLEAN('q', "quiet", &quiet, "don't print any warnings or messages"),
3373 	OPT_BOOLEAN('s', "stat", &record.opts.inherit_stat,
3374 		    "per thread counts"),
3375 	OPT_BOOLEAN('d', "data", &record.opts.sample_address, "Record the sample addresses"),
3376 	OPT_BOOLEAN(0, "phys-data", &record.opts.sample_phys_addr,
3377 		    "Record the sample physical addresses"),
3378 	OPT_BOOLEAN(0, "data-page-size", &record.opts.sample_data_page_size,
3379 		    "Record the sampled data address data page size"),
3380 	OPT_BOOLEAN(0, "code-page-size", &record.opts.sample_code_page_size,
3381 		    "Record the sampled code address (ip) page size"),
3382 	OPT_BOOLEAN(0, "sample-cpu", &record.opts.sample_cpu, "Record the sample cpu"),
3383 	OPT_BOOLEAN(0, "sample-identifier", &record.opts.sample_identifier,
3384 		    "Record the sample identifier"),
3385 	OPT_BOOLEAN_SET('T', "timestamp", &record.opts.sample_time,
3386 			&record.opts.sample_time_set,
3387 			"Record the sample timestamps"),
3388 	OPT_BOOLEAN_SET('P', "period", &record.opts.period, &record.opts.period_set,
3389 			"Record the sample period"),
3390 	OPT_BOOLEAN('n', "no-samples", &record.opts.no_samples,
3391 		    "don't sample"),
3392 	OPT_BOOLEAN_SET('N', "no-buildid-cache", &record.no_buildid_cache,
3393 			&record.no_buildid_cache_set,
3394 			"do not update the buildid cache"),
3395 	OPT_BOOLEAN_SET('B', "no-buildid", &record.no_buildid,
3396 			&record.no_buildid_set,
3397 			"do not collect buildids in perf.data"),
3398 	OPT_CALLBACK('G', "cgroup", &record.evlist, "name",
3399 		     "monitor event in cgroup name only",
3400 		     parse_cgroups),
3401 	OPT_CALLBACK('D', "delay", &record, "ms",
3402 		     "ms to wait before starting measurement after program start (-1: start with events disabled), "
3403 		     "or ranges of time to enable events e.g. '-D 10-20,30-40'",
3404 		     record__parse_event_enable_time),
3405 	OPT_BOOLEAN(0, "kcore", &record.opts.kcore, "copy /proc/kcore"),
3406 	OPT_STRING('u', "uid", &record.opts.target.uid_str, "user",
3407 		   "user to profile"),
3408 
3409 	OPT_CALLBACK_NOOPT('b', "branch-any", &record.opts.branch_stack,
3410 		     "branch any", "sample any taken branches",
3411 		     parse_branch_stack),
3412 
3413 	OPT_CALLBACK('j', "branch-filter", &record.opts.branch_stack,
3414 		     "branch filter mask", "branch stack filter modes",
3415 		     parse_branch_stack),
3416 	OPT_BOOLEAN('W', "weight", &record.opts.sample_weight,
3417 		    "sample by weight (on special events only)"),
3418 	OPT_BOOLEAN(0, "transaction", &record.opts.sample_transaction,
3419 		    "sample transaction flags (special events only)"),
3420 	OPT_BOOLEAN(0, "per-thread", &record.opts.target.per_thread,
3421 		    "use per-thread mmaps"),
3422 	OPT_CALLBACK_OPTARG('I', "intr-regs", &record.opts.sample_intr_regs, NULL, "any register",
3423 		    "sample selected machine registers on interrupt,"
3424 		    " use '-I?' to list register names", parse_intr_regs),
3425 	OPT_CALLBACK_OPTARG(0, "user-regs", &record.opts.sample_user_regs, NULL, "any register",
3426 		    "sample selected machine registers on interrupt,"
3427 		    " use '--user-regs=?' to list register names", parse_user_regs),
3428 	OPT_BOOLEAN(0, "running-time", &record.opts.running_time,
3429 		    "Record running/enabled time of read (:S) events"),
3430 	OPT_CALLBACK('k', "clockid", &record.opts,
3431 	"clockid", "clockid to use for events, see clock_gettime()",
3432 	parse_clockid),
3433 	OPT_STRING_OPTARG('S', "snapshot", &record.opts.auxtrace_snapshot_opts,
3434 			  "opts", "AUX area tracing Snapshot Mode", ""),
3435 	OPT_STRING_OPTARG(0, "aux-sample", &record.opts.auxtrace_sample_opts,
3436 			  "opts", "sample AUX area", ""),
3437 	OPT_UINTEGER(0, "proc-map-timeout", &proc_map_timeout,
3438 			"per thread proc mmap processing timeout in ms"),
3439 	OPT_BOOLEAN(0, "namespaces", &record.opts.record_namespaces,
3440 		    "Record namespaces events"),
3441 	OPT_BOOLEAN(0, "all-cgroups", &record.opts.record_cgroup,
3442 		    "Record cgroup events"),
3443 	OPT_BOOLEAN_SET(0, "switch-events", &record.opts.record_switch_events,
3444 			&record.opts.record_switch_events_set,
3445 			"Record context switch events"),
3446 	OPT_BOOLEAN_FLAG(0, "all-kernel", &record.opts.all_kernel,
3447 			 "Configure all used events to run in kernel space.",
3448 			 PARSE_OPT_EXCLUSIVE),
3449 	OPT_BOOLEAN_FLAG(0, "all-user", &record.opts.all_user,
3450 			 "Configure all used events to run in user space.",
3451 			 PARSE_OPT_EXCLUSIVE),
3452 	OPT_BOOLEAN(0, "kernel-callchains", &record.opts.kernel_callchains,
3453 		    "collect kernel callchains"),
3454 	OPT_BOOLEAN(0, "user-callchains", &record.opts.user_callchains,
3455 		    "collect user callchains"),
3456 	OPT_STRING(0, "vmlinux", &symbol_conf.vmlinux_name,
3457 		   "file", "vmlinux pathname"),
3458 	OPT_BOOLEAN(0, "buildid-all", &record.buildid_all,
3459 		    "Record build-id of all DSOs regardless of hits"),
3460 	OPT_BOOLEAN(0, "buildid-mmap", &record.buildid_mmap,
3461 		    "Record build-id in map events"),
3462 	OPT_BOOLEAN(0, "timestamp-filename", &record.timestamp_filename,
3463 		    "append timestamp to output filename"),
3464 	OPT_BOOLEAN(0, "timestamp-boundary", &record.timestamp_boundary,
3465 		    "Record timestamp boundary (time of first/last samples)"),
3466 	OPT_STRING_OPTARG_SET(0, "switch-output", &record.switch_output.str,
3467 			  &record.switch_output.set, "signal or size[BKMG] or time[smhd]",
3468 			  "Switch output when receiving SIGUSR2 (signal) or cross a size or time threshold",
3469 			  "signal"),
3470 	OPT_CALLBACK_SET(0, "switch-output-event", &switch_output_parse_events_option_args,
3471 			 &record.switch_output_event_set, "switch output event",
3472 			 "switch output event selector. use 'perf list' to list available events",
3473 			 parse_events_option_new_evlist),
3474 	OPT_INTEGER(0, "switch-max-files", &record.switch_output.num_files,
3475 		   "Limit number of switch output generated files"),
3476 	OPT_BOOLEAN(0, "dry-run", &dry_run,
3477 		    "Parse options then exit"),
3478 #ifdef HAVE_AIO_SUPPORT
3479 	OPT_CALLBACK_OPTARG(0, "aio", &record.opts,
3480 		     &nr_cblocks_default, "n", "Use <n> control blocks in asynchronous trace writing mode (default: 1, max: 4)",
3481 		     record__aio_parse),
3482 #endif
3483 	OPT_CALLBACK(0, "affinity", &record.opts, "node|cpu",
3484 		     "Set affinity mask of trace reading thread to NUMA node cpu mask or cpu of processed mmap buffer",
3485 		     record__parse_affinity),
3486 #ifdef HAVE_ZSTD_SUPPORT
3487 	OPT_CALLBACK_OPTARG('z', "compression-level", &record.opts, &comp_level_default, "n",
3488 			    "Compress records using specified level (default: 1 - fastest compression, 22 - greatest compression)",
3489 			    record__parse_comp_level),
3490 #endif
3491 	OPT_CALLBACK(0, "max-size", &record.output_max_size,
3492 		     "size", "Limit the maximum size of the output file", parse_output_max_size),
3493 	OPT_UINTEGER(0, "num-thread-synthesize",
3494 		     &record.opts.nr_threads_synthesize,
3495 		     "number of threads to run for event synthesis"),
3496 #ifdef HAVE_LIBPFM
3497 	OPT_CALLBACK(0, "pfm-events", &record.evlist, "event",
3498 		"libpfm4 event selector. use 'perf list' to list available events",
3499 		parse_libpfm_events_option),
3500 #endif
3501 	OPT_CALLBACK(0, "control", &record.opts, "fd:ctl-fd[,ack-fd] or fifo:ctl-fifo[,ack-fifo]",
3502 		     "Listen on ctl-fd descriptor for command to control measurement ('enable': enable events, 'disable': disable events,\n"
3503 		     "\t\t\t  'snapshot': AUX area tracing snapshot).\n"
3504 		     "\t\t\t  Optionally send control command completion ('ack\\n') to ack-fd descriptor.\n"
3505 		     "\t\t\t  Alternatively, ctl-fifo / ack-fifo will be opened and used as ctl-fd / ack-fd.",
3506 		      parse_control_option),
3507 	OPT_CALLBACK(0, "synth", &record.opts, "no|all|task|mmap|cgroup",
3508 		     "Fine-tune event synthesis: default=all", parse_record_synth_option),
3509 	OPT_STRING_OPTARG_SET(0, "debuginfod", &record.debuginfod.urls,
3510 			  &record.debuginfod.set, "debuginfod urls",
3511 			  "Enable debuginfod data retrieval from DEBUGINFOD_URLS or specified urls",
3512 			  "system"),
3513 	OPT_CALLBACK_OPTARG(0, "threads", &record.opts, NULL, "spec",
3514 			    "write collected trace data into several data files using parallel threads",
3515 			    record__parse_threads),
3516 	OPT_BOOLEAN(0, "off-cpu", &record.off_cpu, "Enable off-cpu analysis"),
3517 	OPT_END()
3518 };
3519 
3520 struct option *record_options = __record_options;
3521 
3522 static int record__mmap_cpu_mask_init(struct mmap_cpu_mask *mask, struct perf_cpu_map *cpus)
3523 {
3524 	struct perf_cpu cpu;
3525 	int idx;
3526 
3527 	if (cpu_map__is_dummy(cpus))
3528 		return 0;
3529 
3530 	perf_cpu_map__for_each_cpu(cpu, idx, cpus) {
3531 		if (cpu.cpu == -1)
3532 			continue;
3533 		/* Return ENODEV is input cpu is greater than max cpu */
3534 		if ((unsigned long)cpu.cpu > mask->nbits)
3535 			return -ENODEV;
3536 		__set_bit(cpu.cpu, mask->bits);
3537 	}
3538 
3539 	return 0;
3540 }
3541 
3542 static int record__mmap_cpu_mask_init_spec(struct mmap_cpu_mask *mask, const char *mask_spec)
3543 {
3544 	struct perf_cpu_map *cpus;
3545 
3546 	cpus = perf_cpu_map__new(mask_spec);
3547 	if (!cpus)
3548 		return -ENOMEM;
3549 
3550 	bitmap_zero(mask->bits, mask->nbits);
3551 	if (record__mmap_cpu_mask_init(mask, cpus))
3552 		return -ENODEV;
3553 
3554 	perf_cpu_map__put(cpus);
3555 
3556 	return 0;
3557 }
3558 
3559 static void record__free_thread_masks(struct record *rec, int nr_threads)
3560 {
3561 	int t;
3562 
3563 	if (rec->thread_masks)
3564 		for (t = 0; t < nr_threads; t++)
3565 			record__thread_mask_free(&rec->thread_masks[t]);
3566 
3567 	zfree(&rec->thread_masks);
3568 }
3569 
3570 static int record__alloc_thread_masks(struct record *rec, int nr_threads, int nr_bits)
3571 {
3572 	int t, ret;
3573 
3574 	rec->thread_masks = zalloc(nr_threads * sizeof(*(rec->thread_masks)));
3575 	if (!rec->thread_masks) {
3576 		pr_err("Failed to allocate thread masks\n");
3577 		return -ENOMEM;
3578 	}
3579 
3580 	for (t = 0; t < nr_threads; t++) {
3581 		ret = record__thread_mask_alloc(&rec->thread_masks[t], nr_bits);
3582 		if (ret) {
3583 			pr_err("Failed to allocate thread masks[%d]\n", t);
3584 			goto out_free;
3585 		}
3586 	}
3587 
3588 	return 0;
3589 
3590 out_free:
3591 	record__free_thread_masks(rec, nr_threads);
3592 
3593 	return ret;
3594 }
3595 
3596 static int record__init_thread_cpu_masks(struct record *rec, struct perf_cpu_map *cpus)
3597 {
3598 	int t, ret, nr_cpus = perf_cpu_map__nr(cpus);
3599 
3600 	ret = record__alloc_thread_masks(rec, nr_cpus, cpu__max_cpu().cpu);
3601 	if (ret)
3602 		return ret;
3603 
3604 	rec->nr_threads = nr_cpus;
3605 	pr_debug("nr_threads: %d\n", rec->nr_threads);
3606 
3607 	for (t = 0; t < rec->nr_threads; t++) {
3608 		__set_bit(perf_cpu_map__cpu(cpus, t).cpu, rec->thread_masks[t].maps.bits);
3609 		__set_bit(perf_cpu_map__cpu(cpus, t).cpu, rec->thread_masks[t].affinity.bits);
3610 		if (verbose > 0) {
3611 			pr_debug("thread_masks[%d]: ", t);
3612 			mmap_cpu_mask__scnprintf(&rec->thread_masks[t].maps, "maps");
3613 			pr_debug("thread_masks[%d]: ", t);
3614 			mmap_cpu_mask__scnprintf(&rec->thread_masks[t].affinity, "affinity");
3615 		}
3616 	}
3617 
3618 	return 0;
3619 }
3620 
3621 static int record__init_thread_masks_spec(struct record *rec, struct perf_cpu_map *cpus,
3622 					  const char **maps_spec, const char **affinity_spec,
3623 					  u32 nr_spec)
3624 {
3625 	u32 s;
3626 	int ret = 0, t = 0;
3627 	struct mmap_cpu_mask cpus_mask;
3628 	struct thread_mask thread_mask, full_mask, *thread_masks;
3629 
3630 	ret = record__mmap_cpu_mask_alloc(&cpus_mask, cpu__max_cpu().cpu);
3631 	if (ret) {
3632 		pr_err("Failed to allocate CPUs mask\n");
3633 		return ret;
3634 	}
3635 
3636 	ret = record__mmap_cpu_mask_init(&cpus_mask, cpus);
3637 	if (ret) {
3638 		pr_err("Failed to init cpu mask\n");
3639 		goto out_free_cpu_mask;
3640 	}
3641 
3642 	ret = record__thread_mask_alloc(&full_mask, cpu__max_cpu().cpu);
3643 	if (ret) {
3644 		pr_err("Failed to allocate full mask\n");
3645 		goto out_free_cpu_mask;
3646 	}
3647 
3648 	ret = record__thread_mask_alloc(&thread_mask, cpu__max_cpu().cpu);
3649 	if (ret) {
3650 		pr_err("Failed to allocate thread mask\n");
3651 		goto out_free_full_and_cpu_masks;
3652 	}
3653 
3654 	for (s = 0; s < nr_spec; s++) {
3655 		ret = record__mmap_cpu_mask_init_spec(&thread_mask.maps, maps_spec[s]);
3656 		if (ret) {
3657 			pr_err("Failed to initialize maps thread mask\n");
3658 			goto out_free;
3659 		}
3660 		ret = record__mmap_cpu_mask_init_spec(&thread_mask.affinity, affinity_spec[s]);
3661 		if (ret) {
3662 			pr_err("Failed to initialize affinity thread mask\n");
3663 			goto out_free;
3664 		}
3665 
3666 		/* ignore invalid CPUs but do not allow empty masks */
3667 		if (!bitmap_and(thread_mask.maps.bits, thread_mask.maps.bits,
3668 				cpus_mask.bits, thread_mask.maps.nbits)) {
3669 			pr_err("Empty maps mask: %s\n", maps_spec[s]);
3670 			ret = -EINVAL;
3671 			goto out_free;
3672 		}
3673 		if (!bitmap_and(thread_mask.affinity.bits, thread_mask.affinity.bits,
3674 				cpus_mask.bits, thread_mask.affinity.nbits)) {
3675 			pr_err("Empty affinity mask: %s\n", affinity_spec[s]);
3676 			ret = -EINVAL;
3677 			goto out_free;
3678 		}
3679 
3680 		/* do not allow intersection with other masks (full_mask) */
3681 		if (bitmap_intersects(thread_mask.maps.bits, full_mask.maps.bits,
3682 				      thread_mask.maps.nbits)) {
3683 			pr_err("Intersecting maps mask: %s\n", maps_spec[s]);
3684 			ret = -EINVAL;
3685 			goto out_free;
3686 		}
3687 		if (bitmap_intersects(thread_mask.affinity.bits, full_mask.affinity.bits,
3688 				      thread_mask.affinity.nbits)) {
3689 			pr_err("Intersecting affinity mask: %s\n", affinity_spec[s]);
3690 			ret = -EINVAL;
3691 			goto out_free;
3692 		}
3693 
3694 		bitmap_or(full_mask.maps.bits, full_mask.maps.bits,
3695 			  thread_mask.maps.bits, full_mask.maps.nbits);
3696 		bitmap_or(full_mask.affinity.bits, full_mask.affinity.bits,
3697 			  thread_mask.affinity.bits, full_mask.maps.nbits);
3698 
3699 		thread_masks = realloc(rec->thread_masks, (t + 1) * sizeof(struct thread_mask));
3700 		if (!thread_masks) {
3701 			pr_err("Failed to reallocate thread masks\n");
3702 			ret = -ENOMEM;
3703 			goto out_free;
3704 		}
3705 		rec->thread_masks = thread_masks;
3706 		rec->thread_masks[t] = thread_mask;
3707 		if (verbose > 0) {
3708 			pr_debug("thread_masks[%d]: ", t);
3709 			mmap_cpu_mask__scnprintf(&rec->thread_masks[t].maps, "maps");
3710 			pr_debug("thread_masks[%d]: ", t);
3711 			mmap_cpu_mask__scnprintf(&rec->thread_masks[t].affinity, "affinity");
3712 		}
3713 		t++;
3714 		ret = record__thread_mask_alloc(&thread_mask, cpu__max_cpu().cpu);
3715 		if (ret) {
3716 			pr_err("Failed to allocate thread mask\n");
3717 			goto out_free_full_and_cpu_masks;
3718 		}
3719 	}
3720 	rec->nr_threads = t;
3721 	pr_debug("nr_threads: %d\n", rec->nr_threads);
3722 	if (!rec->nr_threads)
3723 		ret = -EINVAL;
3724 
3725 out_free:
3726 	record__thread_mask_free(&thread_mask);
3727 out_free_full_and_cpu_masks:
3728 	record__thread_mask_free(&full_mask);
3729 out_free_cpu_mask:
3730 	record__mmap_cpu_mask_free(&cpus_mask);
3731 
3732 	return ret;
3733 }
3734 
3735 static int record__init_thread_core_masks(struct record *rec, struct perf_cpu_map *cpus)
3736 {
3737 	int ret;
3738 	struct cpu_topology *topo;
3739 
3740 	topo = cpu_topology__new();
3741 	if (!topo) {
3742 		pr_err("Failed to allocate CPU topology\n");
3743 		return -ENOMEM;
3744 	}
3745 
3746 	ret = record__init_thread_masks_spec(rec, cpus, topo->core_cpus_list,
3747 					     topo->core_cpus_list, topo->core_cpus_lists);
3748 	cpu_topology__delete(topo);
3749 
3750 	return ret;
3751 }
3752 
3753 static int record__init_thread_package_masks(struct record *rec, struct perf_cpu_map *cpus)
3754 {
3755 	int ret;
3756 	struct cpu_topology *topo;
3757 
3758 	topo = cpu_topology__new();
3759 	if (!topo) {
3760 		pr_err("Failed to allocate CPU topology\n");
3761 		return -ENOMEM;
3762 	}
3763 
3764 	ret = record__init_thread_masks_spec(rec, cpus, topo->package_cpus_list,
3765 					     topo->package_cpus_list, topo->package_cpus_lists);
3766 	cpu_topology__delete(topo);
3767 
3768 	return ret;
3769 }
3770 
3771 static int record__init_thread_numa_masks(struct record *rec, struct perf_cpu_map *cpus)
3772 {
3773 	u32 s;
3774 	int ret;
3775 	const char **spec;
3776 	struct numa_topology *topo;
3777 
3778 	topo = numa_topology__new();
3779 	if (!topo) {
3780 		pr_err("Failed to allocate NUMA topology\n");
3781 		return -ENOMEM;
3782 	}
3783 
3784 	spec = zalloc(topo->nr * sizeof(char *));
3785 	if (!spec) {
3786 		pr_err("Failed to allocate NUMA spec\n");
3787 		ret = -ENOMEM;
3788 		goto out_delete_topo;
3789 	}
3790 	for (s = 0; s < topo->nr; s++)
3791 		spec[s] = topo->nodes[s].cpus;
3792 
3793 	ret = record__init_thread_masks_spec(rec, cpus, spec, spec, topo->nr);
3794 
3795 	zfree(&spec);
3796 
3797 out_delete_topo:
3798 	numa_topology__delete(topo);
3799 
3800 	return ret;
3801 }
3802 
3803 static int record__init_thread_user_masks(struct record *rec, struct perf_cpu_map *cpus)
3804 {
3805 	int t, ret;
3806 	u32 s, nr_spec = 0;
3807 	char **maps_spec = NULL, **affinity_spec = NULL, **tmp_spec;
3808 	char *user_spec, *spec, *spec_ptr, *mask, *mask_ptr, *dup_mask = NULL;
3809 
3810 	for (t = 0, user_spec = (char *)rec->opts.threads_user_spec; ; t++, user_spec = NULL) {
3811 		spec = strtok_r(user_spec, ":", &spec_ptr);
3812 		if (spec == NULL)
3813 			break;
3814 		pr_debug2("threads_spec[%d]: %s\n", t, spec);
3815 		mask = strtok_r(spec, "/", &mask_ptr);
3816 		if (mask == NULL)
3817 			break;
3818 		pr_debug2("  maps mask: %s\n", mask);
3819 		tmp_spec = realloc(maps_spec, (nr_spec + 1) * sizeof(char *));
3820 		if (!tmp_spec) {
3821 			pr_err("Failed to reallocate maps spec\n");
3822 			ret = -ENOMEM;
3823 			goto out_free;
3824 		}
3825 		maps_spec = tmp_spec;
3826 		maps_spec[nr_spec] = dup_mask = strdup(mask);
3827 		if (!maps_spec[nr_spec]) {
3828 			pr_err("Failed to allocate maps spec[%d]\n", nr_spec);
3829 			ret = -ENOMEM;
3830 			goto out_free;
3831 		}
3832 		mask = strtok_r(NULL, "/", &mask_ptr);
3833 		if (mask == NULL) {
3834 			pr_err("Invalid thread maps or affinity specs\n");
3835 			ret = -EINVAL;
3836 			goto out_free;
3837 		}
3838 		pr_debug2("  affinity mask: %s\n", mask);
3839 		tmp_spec = realloc(affinity_spec, (nr_spec + 1) * sizeof(char *));
3840 		if (!tmp_spec) {
3841 			pr_err("Failed to reallocate affinity spec\n");
3842 			ret = -ENOMEM;
3843 			goto out_free;
3844 		}
3845 		affinity_spec = tmp_spec;
3846 		affinity_spec[nr_spec] = strdup(mask);
3847 		if (!affinity_spec[nr_spec]) {
3848 			pr_err("Failed to allocate affinity spec[%d]\n", nr_spec);
3849 			ret = -ENOMEM;
3850 			goto out_free;
3851 		}
3852 		dup_mask = NULL;
3853 		nr_spec++;
3854 	}
3855 
3856 	ret = record__init_thread_masks_spec(rec, cpus, (const char **)maps_spec,
3857 					     (const char **)affinity_spec, nr_spec);
3858 
3859 out_free:
3860 	free(dup_mask);
3861 	for (s = 0; s < nr_spec; s++) {
3862 		if (maps_spec)
3863 			free(maps_spec[s]);
3864 		if (affinity_spec)
3865 			free(affinity_spec[s]);
3866 	}
3867 	free(affinity_spec);
3868 	free(maps_spec);
3869 
3870 	return ret;
3871 }
3872 
3873 static int record__init_thread_default_masks(struct record *rec, struct perf_cpu_map *cpus)
3874 {
3875 	int ret;
3876 
3877 	ret = record__alloc_thread_masks(rec, 1, cpu__max_cpu().cpu);
3878 	if (ret)
3879 		return ret;
3880 
3881 	if (record__mmap_cpu_mask_init(&rec->thread_masks->maps, cpus))
3882 		return -ENODEV;
3883 
3884 	rec->nr_threads = 1;
3885 
3886 	return 0;
3887 }
3888 
3889 static int record__init_thread_masks(struct record *rec)
3890 {
3891 	int ret = 0;
3892 	struct perf_cpu_map *cpus = rec->evlist->core.all_cpus;
3893 
3894 	if (!record__threads_enabled(rec))
3895 		return record__init_thread_default_masks(rec, cpus);
3896 
3897 	if (evlist__per_thread(rec->evlist)) {
3898 		pr_err("--per-thread option is mutually exclusive to parallel streaming mode.\n");
3899 		return -EINVAL;
3900 	}
3901 
3902 	switch (rec->opts.threads_spec) {
3903 	case THREAD_SPEC__CPU:
3904 		ret = record__init_thread_cpu_masks(rec, cpus);
3905 		break;
3906 	case THREAD_SPEC__CORE:
3907 		ret = record__init_thread_core_masks(rec, cpus);
3908 		break;
3909 	case THREAD_SPEC__PACKAGE:
3910 		ret = record__init_thread_package_masks(rec, cpus);
3911 		break;
3912 	case THREAD_SPEC__NUMA:
3913 		ret = record__init_thread_numa_masks(rec, cpus);
3914 		break;
3915 	case THREAD_SPEC__USER:
3916 		ret = record__init_thread_user_masks(rec, cpus);
3917 		break;
3918 	default:
3919 		break;
3920 	}
3921 
3922 	return ret;
3923 }
3924 
3925 int cmd_record(int argc, const char **argv)
3926 {
3927 	int err;
3928 	struct record *rec = &record;
3929 	char errbuf[BUFSIZ];
3930 
3931 	setlocale(LC_ALL, "");
3932 
3933 #ifndef HAVE_BPF_SKEL
3934 # define set_nobuild(s, l, m, c) set_option_nobuild(record_options, s, l, m, c)
3935 	set_nobuild('\0', "off-cpu", "no BUILD_BPF_SKEL=1", true);
3936 # undef set_nobuild
3937 #endif
3938 
3939 	rec->opts.affinity = PERF_AFFINITY_SYS;
3940 
3941 	rec->evlist = evlist__new();
3942 	if (rec->evlist == NULL)
3943 		return -ENOMEM;
3944 
3945 	err = perf_config(perf_record_config, rec);
3946 	if (err)
3947 		return err;
3948 
3949 	argc = parse_options(argc, argv, record_options, record_usage,
3950 			    PARSE_OPT_STOP_AT_NON_OPTION);
3951 	if (quiet)
3952 		perf_quiet_option();
3953 
3954 	err = symbol__validate_sym_arguments();
3955 	if (err)
3956 		return err;
3957 
3958 	perf_debuginfod_setup(&record.debuginfod);
3959 
3960 	/* Make system wide (-a) the default target. */
3961 	if (!argc && target__none(&rec->opts.target))
3962 		rec->opts.target.system_wide = true;
3963 
3964 	if (nr_cgroups && !rec->opts.target.system_wide) {
3965 		usage_with_options_msg(record_usage, record_options,
3966 			"cgroup monitoring only available in system-wide mode");
3967 
3968 	}
3969 
3970 	if (rec->buildid_mmap) {
3971 		if (!perf_can_record_build_id()) {
3972 			pr_err("Failed: no support to record build id in mmap events, update your kernel.\n");
3973 			err = -EINVAL;
3974 			goto out_opts;
3975 		}
3976 		pr_debug("Enabling build id in mmap2 events.\n");
3977 		/* Enable mmap build id synthesizing. */
3978 		symbol_conf.buildid_mmap2 = true;
3979 		/* Enable perf_event_attr::build_id bit. */
3980 		rec->opts.build_id = true;
3981 		/* Disable build id cache. */
3982 		rec->no_buildid = true;
3983 	}
3984 
3985 	if (rec->opts.record_cgroup && !perf_can_record_cgroup()) {
3986 		pr_err("Kernel has no cgroup sampling support.\n");
3987 		err = -EINVAL;
3988 		goto out_opts;
3989 	}
3990 
3991 	if (rec->opts.kcore)
3992 		rec->opts.text_poke = true;
3993 
3994 	if (rec->opts.kcore || record__threads_enabled(rec))
3995 		rec->data.is_dir = true;
3996 
3997 	if (record__threads_enabled(rec)) {
3998 		if (rec->opts.affinity != PERF_AFFINITY_SYS) {
3999 			pr_err("--affinity option is mutually exclusive to parallel streaming mode.\n");
4000 			goto out_opts;
4001 		}
4002 		if (record__aio_enabled(rec)) {
4003 			pr_err("Asynchronous streaming mode (--aio) is mutually exclusive to parallel streaming mode.\n");
4004 			goto out_opts;
4005 		}
4006 	}
4007 
4008 	if (rec->opts.comp_level != 0) {
4009 		pr_debug("Compression enabled, disabling build id collection at the end of the session.\n");
4010 		rec->no_buildid = true;
4011 	}
4012 
4013 	if (rec->opts.record_switch_events &&
4014 	    !perf_can_record_switch_events()) {
4015 		ui__error("kernel does not support recording context switch events\n");
4016 		parse_options_usage(record_usage, record_options, "switch-events", 0);
4017 		err = -EINVAL;
4018 		goto out_opts;
4019 	}
4020 
4021 	if (switch_output_setup(rec)) {
4022 		parse_options_usage(record_usage, record_options, "switch-output", 0);
4023 		err = -EINVAL;
4024 		goto out_opts;
4025 	}
4026 
4027 	if (rec->switch_output.time) {
4028 		signal(SIGALRM, alarm_sig_handler);
4029 		alarm(rec->switch_output.time);
4030 	}
4031 
4032 	if (rec->switch_output.num_files) {
4033 		rec->switch_output.filenames = calloc(sizeof(char *),
4034 						      rec->switch_output.num_files);
4035 		if (!rec->switch_output.filenames) {
4036 			err = -EINVAL;
4037 			goto out_opts;
4038 		}
4039 	}
4040 
4041 	if (rec->timestamp_filename && record__threads_enabled(rec)) {
4042 		rec->timestamp_filename = false;
4043 		pr_warning("WARNING: --timestamp-filename option is not available in parallel streaming mode.\n");
4044 	}
4045 
4046 	/*
4047 	 * Allow aliases to facilitate the lookup of symbols for address
4048 	 * filters. Refer to auxtrace_parse_filters().
4049 	 */
4050 	symbol_conf.allow_aliases = true;
4051 
4052 	symbol__init(NULL);
4053 
4054 	err = record__auxtrace_init(rec);
4055 	if (err)
4056 		goto out;
4057 
4058 	if (dry_run)
4059 		goto out;
4060 
4061 	err = -ENOMEM;
4062 
4063 	if (rec->no_buildid_cache || rec->no_buildid) {
4064 		disable_buildid_cache();
4065 	} else if (rec->switch_output.enabled) {
4066 		/*
4067 		 * In 'perf record --switch-output', disable buildid
4068 		 * generation by default to reduce data file switching
4069 		 * overhead. Still generate buildid if they are required
4070 		 * explicitly using
4071 		 *
4072 		 *  perf record --switch-output --no-no-buildid \
4073 		 *              --no-no-buildid-cache
4074 		 *
4075 		 * Following code equals to:
4076 		 *
4077 		 * if ((rec->no_buildid || !rec->no_buildid_set) &&
4078 		 *     (rec->no_buildid_cache || !rec->no_buildid_cache_set))
4079 		 *         disable_buildid_cache();
4080 		 */
4081 		bool disable = true;
4082 
4083 		if (rec->no_buildid_set && !rec->no_buildid)
4084 			disable = false;
4085 		if (rec->no_buildid_cache_set && !rec->no_buildid_cache)
4086 			disable = false;
4087 		if (disable) {
4088 			rec->no_buildid = true;
4089 			rec->no_buildid_cache = true;
4090 			disable_buildid_cache();
4091 		}
4092 	}
4093 
4094 	if (record.opts.overwrite)
4095 		record.opts.tail_synthesize = true;
4096 
4097 	if (rec->evlist->core.nr_entries == 0) {
4098 		bool can_profile_kernel = perf_event_paranoid_check(1);
4099 
4100 		err = parse_event(rec->evlist, can_profile_kernel ? "cycles:P" : "cycles:Pu");
4101 		if (err)
4102 			goto out;
4103 	}
4104 
4105 	if (rec->opts.target.tid && !rec->opts.no_inherit_set)
4106 		rec->opts.no_inherit = true;
4107 
4108 	err = target__validate(&rec->opts.target);
4109 	if (err) {
4110 		target__strerror(&rec->opts.target, err, errbuf, BUFSIZ);
4111 		ui__warning("%s\n", errbuf);
4112 	}
4113 
4114 	err = target__parse_uid(&rec->opts.target);
4115 	if (err) {
4116 		int saved_errno = errno;
4117 
4118 		target__strerror(&rec->opts.target, err, errbuf, BUFSIZ);
4119 		ui__error("%s", errbuf);
4120 
4121 		err = -saved_errno;
4122 		goto out;
4123 	}
4124 
4125 	/* Enable ignoring missing threads when -u/-p option is defined. */
4126 	rec->opts.ignore_missing_thread = rec->opts.target.uid != UINT_MAX || rec->opts.target.pid;
4127 
4128 	evlist__warn_user_requested_cpus(rec->evlist, rec->opts.target.cpu_list);
4129 
4130 	if (callchain_param.enabled && callchain_param.record_mode == CALLCHAIN_FP)
4131 		arch__add_leaf_frame_record_opts(&rec->opts);
4132 
4133 	err = -ENOMEM;
4134 	if (evlist__create_maps(rec->evlist, &rec->opts.target) < 0) {
4135 		if (rec->opts.target.pid != NULL) {
4136 			pr_err("Couldn't create thread/CPU maps: %s\n",
4137 				errno == ENOENT ? "No such process" : str_error_r(errno, errbuf, sizeof(errbuf)));
4138 			goto out;
4139 		}
4140 		else
4141 			usage_with_options(record_usage, record_options);
4142 	}
4143 
4144 	err = auxtrace_record__options(rec->itr, rec->evlist, &rec->opts);
4145 	if (err)
4146 		goto out;
4147 
4148 	/*
4149 	 * We take all buildids when the file contains
4150 	 * AUX area tracing data because we do not decode the
4151 	 * trace because it would take too long.
4152 	 */
4153 	if (rec->opts.full_auxtrace)
4154 		rec->buildid_all = true;
4155 
4156 	if (rec->opts.text_poke) {
4157 		err = record__config_text_poke(rec->evlist);
4158 		if (err) {
4159 			pr_err("record__config_text_poke failed, error %d\n", err);
4160 			goto out;
4161 		}
4162 	}
4163 
4164 	if (rec->off_cpu) {
4165 		err = record__config_off_cpu(rec);
4166 		if (err) {
4167 			pr_err("record__config_off_cpu failed, error %d\n", err);
4168 			goto out;
4169 		}
4170 	}
4171 
4172 	if (record_opts__config(&rec->opts)) {
4173 		err = -EINVAL;
4174 		goto out;
4175 	}
4176 
4177 	err = record__init_thread_masks(rec);
4178 	if (err) {
4179 		pr_err("Failed to initialize parallel data streaming masks\n");
4180 		goto out;
4181 	}
4182 
4183 	if (rec->opts.nr_cblocks > nr_cblocks_max)
4184 		rec->opts.nr_cblocks = nr_cblocks_max;
4185 	pr_debug("nr_cblocks: %d\n", rec->opts.nr_cblocks);
4186 
4187 	pr_debug("affinity: %s\n", affinity_tags[rec->opts.affinity]);
4188 	pr_debug("mmap flush: %d\n", rec->opts.mmap_flush);
4189 
4190 	if (rec->opts.comp_level > comp_level_max)
4191 		rec->opts.comp_level = comp_level_max;
4192 	pr_debug("comp level: %d\n", rec->opts.comp_level);
4193 
4194 	err = __cmd_record(&record, argc, argv);
4195 out:
4196 	evlist__delete(rec->evlist);
4197 	symbol__exit();
4198 	auxtrace_record__free(rec->itr);
4199 out_opts:
4200 	record__free_thread_masks(rec, rec->nr_threads);
4201 	rec->nr_threads = 0;
4202 	evlist__close_control(rec->opts.ctl_fd, rec->opts.ctl_fd_ack, &rec->opts.ctl_fd_close);
4203 	return err;
4204 }
4205 
4206 static void snapshot_sig_handler(int sig __maybe_unused)
4207 {
4208 	struct record *rec = &record;
4209 
4210 	hit_auxtrace_snapshot_trigger(rec);
4211 
4212 	if (switch_output_signal(rec))
4213 		trigger_hit(&switch_output_trigger);
4214 }
4215 
4216 static void alarm_sig_handler(int sig __maybe_unused)
4217 {
4218 	struct record *rec = &record;
4219 
4220 	if (switch_output_time(rec))
4221 		trigger_hit(&switch_output_trigger);
4222 }
4223