xref: /openbmc/linux/tools/perf/builtin-record.c (revision 5e0266f0)
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * builtin-record.c
4  *
5  * Builtin record command: Record the profile of a workload
6  * (or a CPU, or a PID) into the perf.data output file - for
7  * later analysis via perf report.
8  */
9 #include "builtin.h"
10 
11 #include "util/build-id.h"
12 #include <subcmd/parse-options.h>
13 #include <internal/xyarray.h>
14 #include "util/parse-events.h"
15 #include "util/config.h"
16 
17 #include "util/callchain.h"
18 #include "util/cgroup.h"
19 #include "util/header.h"
20 #include "util/event.h"
21 #include "util/evlist.h"
22 #include "util/evsel.h"
23 #include "util/debug.h"
24 #include "util/mmap.h"
25 #include "util/mutex.h"
26 #include "util/target.h"
27 #include "util/session.h"
28 #include "util/tool.h"
29 #include "util/symbol.h"
30 #include "util/record.h"
31 #include "util/cpumap.h"
32 #include "util/thread_map.h"
33 #include "util/data.h"
34 #include "util/perf_regs.h"
35 #include "util/auxtrace.h"
36 #include "util/tsc.h"
37 #include "util/parse-branch-options.h"
38 #include "util/parse-regs-options.h"
39 #include "util/perf_api_probe.h"
40 #include "util/llvm-utils.h"
41 #include "util/bpf-loader.h"
42 #include "util/trigger.h"
43 #include "util/perf-hooks.h"
44 #include "util/cpu-set-sched.h"
45 #include "util/synthetic-events.h"
46 #include "util/time-utils.h"
47 #include "util/units.h"
48 #include "util/bpf-event.h"
49 #include "util/util.h"
50 #include "util/pfm.h"
51 #include "util/clockid.h"
52 #include "util/pmu-hybrid.h"
53 #include "util/evlist-hybrid.h"
54 #include "util/off_cpu.h"
55 #include "asm/bug.h"
56 #include "perf.h"
57 #include "cputopo.h"
58 
59 #include <errno.h>
60 #include <inttypes.h>
61 #include <locale.h>
62 #include <poll.h>
63 #include <pthread.h>
64 #include <unistd.h>
65 #ifndef HAVE_GETTID
66 #include <syscall.h>
67 #endif
68 #include <sched.h>
69 #include <signal.h>
70 #ifdef HAVE_EVENTFD_SUPPORT
71 #include <sys/eventfd.h>
72 #endif
73 #include <sys/mman.h>
74 #include <sys/wait.h>
75 #include <sys/types.h>
76 #include <sys/stat.h>
77 #include <fcntl.h>
78 #include <linux/err.h>
79 #include <linux/string.h>
80 #include <linux/time64.h>
81 #include <linux/zalloc.h>
82 #include <linux/bitmap.h>
83 #include <sys/time.h>
84 
85 struct switch_output {
86 	bool		 enabled;
87 	bool		 signal;
88 	unsigned long	 size;
89 	unsigned long	 time;
90 	const char	*str;
91 	bool		 set;
92 	char		 **filenames;
93 	int		 num_files;
94 	int		 cur_file;
95 };
96 
97 struct thread_mask {
98 	struct mmap_cpu_mask	maps;
99 	struct mmap_cpu_mask	affinity;
100 };
101 
102 struct record_thread {
103 	pid_t			tid;
104 	struct thread_mask	*mask;
105 	struct {
106 		int		msg[2];
107 		int		ack[2];
108 	} pipes;
109 	struct fdarray		pollfd;
110 	int			ctlfd_pos;
111 	int			nr_mmaps;
112 	struct mmap		**maps;
113 	struct mmap		**overwrite_maps;
114 	struct record		*rec;
115 	unsigned long long	samples;
116 	unsigned long		waking;
117 	u64			bytes_written;
118 	u64			bytes_transferred;
119 	u64			bytes_compressed;
120 };
121 
122 static __thread struct record_thread *thread;
123 
124 enum thread_msg {
125 	THREAD_MSG__UNDEFINED = 0,
126 	THREAD_MSG__READY,
127 	THREAD_MSG__MAX,
128 };
129 
130 static const char *thread_msg_tags[THREAD_MSG__MAX] = {
131 	"UNDEFINED", "READY"
132 };
133 
134 enum thread_spec {
135 	THREAD_SPEC__UNDEFINED = 0,
136 	THREAD_SPEC__CPU,
137 	THREAD_SPEC__CORE,
138 	THREAD_SPEC__PACKAGE,
139 	THREAD_SPEC__NUMA,
140 	THREAD_SPEC__USER,
141 	THREAD_SPEC__MAX,
142 };
143 
144 static const char *thread_spec_tags[THREAD_SPEC__MAX] = {
145 	"undefined", "cpu", "core", "package", "numa", "user"
146 };
147 
148 struct pollfd_index_map {
149 	int evlist_pollfd_index;
150 	int thread_pollfd_index;
151 };
152 
153 struct record {
154 	struct perf_tool	tool;
155 	struct record_opts	opts;
156 	u64			bytes_written;
157 	u64			thread_bytes_written;
158 	struct perf_data	data;
159 	struct auxtrace_record	*itr;
160 	struct evlist	*evlist;
161 	struct perf_session	*session;
162 	struct evlist		*sb_evlist;
163 	pthread_t		thread_id;
164 	int			realtime_prio;
165 	bool			switch_output_event_set;
166 	bool			no_buildid;
167 	bool			no_buildid_set;
168 	bool			no_buildid_cache;
169 	bool			no_buildid_cache_set;
170 	bool			buildid_all;
171 	bool			buildid_mmap;
172 	bool			timestamp_filename;
173 	bool			timestamp_boundary;
174 	bool			off_cpu;
175 	struct switch_output	switch_output;
176 	unsigned long long	samples;
177 	unsigned long		output_max_size;	/* = 0: unlimited */
178 	struct perf_debuginfod	debuginfod;
179 	int			nr_threads;
180 	struct thread_mask	*thread_masks;
181 	struct record_thread	*thread_data;
182 	struct pollfd_index_map	*index_map;
183 	size_t			index_map_sz;
184 	size_t			index_map_cnt;
185 };
186 
187 static volatile int done;
188 
189 static volatile int auxtrace_record__snapshot_started;
190 static DEFINE_TRIGGER(auxtrace_snapshot_trigger);
191 static DEFINE_TRIGGER(switch_output_trigger);
192 
193 static const char *affinity_tags[PERF_AFFINITY_MAX] = {
194 	"SYS", "NODE", "CPU"
195 };
196 
197 #ifndef HAVE_GETTID
198 static inline pid_t gettid(void)
199 {
200 	return (pid_t)syscall(__NR_gettid);
201 }
202 #endif
203 
204 static int record__threads_enabled(struct record *rec)
205 {
206 	return rec->opts.threads_spec;
207 }
208 
209 static bool switch_output_signal(struct record *rec)
210 {
211 	return rec->switch_output.signal &&
212 	       trigger_is_ready(&switch_output_trigger);
213 }
214 
215 static bool switch_output_size(struct record *rec)
216 {
217 	return rec->switch_output.size &&
218 	       trigger_is_ready(&switch_output_trigger) &&
219 	       (rec->bytes_written >= rec->switch_output.size);
220 }
221 
222 static bool switch_output_time(struct record *rec)
223 {
224 	return rec->switch_output.time &&
225 	       trigger_is_ready(&switch_output_trigger);
226 }
227 
228 static u64 record__bytes_written(struct record *rec)
229 {
230 	return rec->bytes_written + rec->thread_bytes_written;
231 }
232 
233 static bool record__output_max_size_exceeded(struct record *rec)
234 {
235 	return rec->output_max_size &&
236 	       (record__bytes_written(rec) >= rec->output_max_size);
237 }
238 
239 static int record__write(struct record *rec, struct mmap *map __maybe_unused,
240 			 void *bf, size_t size)
241 {
242 	struct perf_data_file *file = &rec->session->data->file;
243 
244 	if (map && map->file)
245 		file = map->file;
246 
247 	if (perf_data_file__write(file, bf, size) < 0) {
248 		pr_err("failed to write perf data, error: %m\n");
249 		return -1;
250 	}
251 
252 	if (map && map->file) {
253 		thread->bytes_written += size;
254 		rec->thread_bytes_written += size;
255 	} else {
256 		rec->bytes_written += size;
257 	}
258 
259 	if (record__output_max_size_exceeded(rec) && !done) {
260 		fprintf(stderr, "[ perf record: perf size limit reached (%" PRIu64 " KB),"
261 				" stopping session ]\n",
262 				record__bytes_written(rec) >> 10);
263 		done = 1;
264 	}
265 
266 	if (switch_output_size(rec))
267 		trigger_hit(&switch_output_trigger);
268 
269 	return 0;
270 }
271 
272 static int record__aio_enabled(struct record *rec);
273 static int record__comp_enabled(struct record *rec);
274 static size_t zstd_compress(struct perf_session *session, struct mmap *map,
275 			    void *dst, size_t dst_size, void *src, size_t src_size);
276 
277 #ifdef HAVE_AIO_SUPPORT
278 static int record__aio_write(struct aiocb *cblock, int trace_fd,
279 		void *buf, size_t size, off_t off)
280 {
281 	int rc;
282 
283 	cblock->aio_fildes = trace_fd;
284 	cblock->aio_buf    = buf;
285 	cblock->aio_nbytes = size;
286 	cblock->aio_offset = off;
287 	cblock->aio_sigevent.sigev_notify = SIGEV_NONE;
288 
289 	do {
290 		rc = aio_write(cblock);
291 		if (rc == 0) {
292 			break;
293 		} else if (errno != EAGAIN) {
294 			cblock->aio_fildes = -1;
295 			pr_err("failed to queue perf data, error: %m\n");
296 			break;
297 		}
298 	} while (1);
299 
300 	return rc;
301 }
302 
303 static int record__aio_complete(struct mmap *md, struct aiocb *cblock)
304 {
305 	void *rem_buf;
306 	off_t rem_off;
307 	size_t rem_size;
308 	int rc, aio_errno;
309 	ssize_t aio_ret, written;
310 
311 	aio_errno = aio_error(cblock);
312 	if (aio_errno == EINPROGRESS)
313 		return 0;
314 
315 	written = aio_ret = aio_return(cblock);
316 	if (aio_ret < 0) {
317 		if (aio_errno != EINTR)
318 			pr_err("failed to write perf data, error: %m\n");
319 		written = 0;
320 	}
321 
322 	rem_size = cblock->aio_nbytes - written;
323 
324 	if (rem_size == 0) {
325 		cblock->aio_fildes = -1;
326 		/*
327 		 * md->refcount is incremented in record__aio_pushfn() for
328 		 * every aio write request started in record__aio_push() so
329 		 * decrement it because the request is now complete.
330 		 */
331 		perf_mmap__put(&md->core);
332 		rc = 1;
333 	} else {
334 		/*
335 		 * aio write request may require restart with the
336 		 * reminder if the kernel didn't write whole
337 		 * chunk at once.
338 		 */
339 		rem_off = cblock->aio_offset + written;
340 		rem_buf = (void *)(cblock->aio_buf + written);
341 		record__aio_write(cblock, cblock->aio_fildes,
342 				rem_buf, rem_size, rem_off);
343 		rc = 0;
344 	}
345 
346 	return rc;
347 }
348 
349 static int record__aio_sync(struct mmap *md, bool sync_all)
350 {
351 	struct aiocb **aiocb = md->aio.aiocb;
352 	struct aiocb *cblocks = md->aio.cblocks;
353 	struct timespec timeout = { 0, 1000 * 1000  * 1 }; /* 1ms */
354 	int i, do_suspend;
355 
356 	do {
357 		do_suspend = 0;
358 		for (i = 0; i < md->aio.nr_cblocks; ++i) {
359 			if (cblocks[i].aio_fildes == -1 || record__aio_complete(md, &cblocks[i])) {
360 				if (sync_all)
361 					aiocb[i] = NULL;
362 				else
363 					return i;
364 			} else {
365 				/*
366 				 * Started aio write is not complete yet
367 				 * so it has to be waited before the
368 				 * next allocation.
369 				 */
370 				aiocb[i] = &cblocks[i];
371 				do_suspend = 1;
372 			}
373 		}
374 		if (!do_suspend)
375 			return -1;
376 
377 		while (aio_suspend((const struct aiocb **)aiocb, md->aio.nr_cblocks, &timeout)) {
378 			if (!(errno == EAGAIN || errno == EINTR))
379 				pr_err("failed to sync perf data, error: %m\n");
380 		}
381 	} while (1);
382 }
383 
384 struct record_aio {
385 	struct record	*rec;
386 	void		*data;
387 	size_t		size;
388 };
389 
390 static int record__aio_pushfn(struct mmap *map, void *to, void *buf, size_t size)
391 {
392 	struct record_aio *aio = to;
393 
394 	/*
395 	 * map->core.base data pointed by buf is copied into free map->aio.data[] buffer
396 	 * to release space in the kernel buffer as fast as possible, calling
397 	 * perf_mmap__consume() from perf_mmap__push() function.
398 	 *
399 	 * That lets the kernel to proceed with storing more profiling data into
400 	 * the kernel buffer earlier than other per-cpu kernel buffers are handled.
401 	 *
402 	 * Coping can be done in two steps in case the chunk of profiling data
403 	 * crosses the upper bound of the kernel buffer. In this case we first move
404 	 * part of data from map->start till the upper bound and then the reminder
405 	 * from the beginning of the kernel buffer till the end of the data chunk.
406 	 */
407 
408 	if (record__comp_enabled(aio->rec)) {
409 		size = zstd_compress(aio->rec->session, NULL, aio->data + aio->size,
410 				     mmap__mmap_len(map) - aio->size,
411 				     buf, size);
412 	} else {
413 		memcpy(aio->data + aio->size, buf, size);
414 	}
415 
416 	if (!aio->size) {
417 		/*
418 		 * Increment map->refcount to guard map->aio.data[] buffer
419 		 * from premature deallocation because map object can be
420 		 * released earlier than aio write request started on
421 		 * map->aio.data[] buffer is complete.
422 		 *
423 		 * perf_mmap__put() is done at record__aio_complete()
424 		 * after started aio request completion or at record__aio_push()
425 		 * if the request failed to start.
426 		 */
427 		perf_mmap__get(&map->core);
428 	}
429 
430 	aio->size += size;
431 
432 	return size;
433 }
434 
435 static int record__aio_push(struct record *rec, struct mmap *map, off_t *off)
436 {
437 	int ret, idx;
438 	int trace_fd = rec->session->data->file.fd;
439 	struct record_aio aio = { .rec = rec, .size = 0 };
440 
441 	/*
442 	 * Call record__aio_sync() to wait till map->aio.data[] buffer
443 	 * becomes available after previous aio write operation.
444 	 */
445 
446 	idx = record__aio_sync(map, false);
447 	aio.data = map->aio.data[idx];
448 	ret = perf_mmap__push(map, &aio, record__aio_pushfn);
449 	if (ret != 0) /* ret > 0 - no data, ret < 0 - error */
450 		return ret;
451 
452 	rec->samples++;
453 	ret = record__aio_write(&(map->aio.cblocks[idx]), trace_fd, aio.data, aio.size, *off);
454 	if (!ret) {
455 		*off += aio.size;
456 		rec->bytes_written += aio.size;
457 		if (switch_output_size(rec))
458 			trigger_hit(&switch_output_trigger);
459 	} else {
460 		/*
461 		 * Decrement map->refcount incremented in record__aio_pushfn()
462 		 * back if record__aio_write() operation failed to start, otherwise
463 		 * map->refcount is decremented in record__aio_complete() after
464 		 * aio write operation finishes successfully.
465 		 */
466 		perf_mmap__put(&map->core);
467 	}
468 
469 	return ret;
470 }
471 
472 static off_t record__aio_get_pos(int trace_fd)
473 {
474 	return lseek(trace_fd, 0, SEEK_CUR);
475 }
476 
477 static void record__aio_set_pos(int trace_fd, off_t pos)
478 {
479 	lseek(trace_fd, pos, SEEK_SET);
480 }
481 
482 static void record__aio_mmap_read_sync(struct record *rec)
483 {
484 	int i;
485 	struct evlist *evlist = rec->evlist;
486 	struct mmap *maps = evlist->mmap;
487 
488 	if (!record__aio_enabled(rec))
489 		return;
490 
491 	for (i = 0; i < evlist->core.nr_mmaps; i++) {
492 		struct mmap *map = &maps[i];
493 
494 		if (map->core.base)
495 			record__aio_sync(map, true);
496 	}
497 }
498 
499 static int nr_cblocks_default = 1;
500 static int nr_cblocks_max = 4;
501 
502 static int record__aio_parse(const struct option *opt,
503 			     const char *str,
504 			     int unset)
505 {
506 	struct record_opts *opts = (struct record_opts *)opt->value;
507 
508 	if (unset) {
509 		opts->nr_cblocks = 0;
510 	} else {
511 		if (str)
512 			opts->nr_cblocks = strtol(str, NULL, 0);
513 		if (!opts->nr_cblocks)
514 			opts->nr_cblocks = nr_cblocks_default;
515 	}
516 
517 	return 0;
518 }
519 #else /* HAVE_AIO_SUPPORT */
520 static int nr_cblocks_max = 0;
521 
522 static int record__aio_push(struct record *rec __maybe_unused, struct mmap *map __maybe_unused,
523 			    off_t *off __maybe_unused)
524 {
525 	return -1;
526 }
527 
528 static off_t record__aio_get_pos(int trace_fd __maybe_unused)
529 {
530 	return -1;
531 }
532 
533 static void record__aio_set_pos(int trace_fd __maybe_unused, off_t pos __maybe_unused)
534 {
535 }
536 
537 static void record__aio_mmap_read_sync(struct record *rec __maybe_unused)
538 {
539 }
540 #endif
541 
542 static int record__aio_enabled(struct record *rec)
543 {
544 	return rec->opts.nr_cblocks > 0;
545 }
546 
547 #define MMAP_FLUSH_DEFAULT 1
548 static int record__mmap_flush_parse(const struct option *opt,
549 				    const char *str,
550 				    int unset)
551 {
552 	int flush_max;
553 	struct record_opts *opts = (struct record_opts *)opt->value;
554 	static struct parse_tag tags[] = {
555 			{ .tag  = 'B', .mult = 1       },
556 			{ .tag  = 'K', .mult = 1 << 10 },
557 			{ .tag  = 'M', .mult = 1 << 20 },
558 			{ .tag  = 'G', .mult = 1 << 30 },
559 			{ .tag  = 0 },
560 	};
561 
562 	if (unset)
563 		return 0;
564 
565 	if (str) {
566 		opts->mmap_flush = parse_tag_value(str, tags);
567 		if (opts->mmap_flush == (int)-1)
568 			opts->mmap_flush = strtol(str, NULL, 0);
569 	}
570 
571 	if (!opts->mmap_flush)
572 		opts->mmap_flush = MMAP_FLUSH_DEFAULT;
573 
574 	flush_max = evlist__mmap_size(opts->mmap_pages);
575 	flush_max /= 4;
576 	if (opts->mmap_flush > flush_max)
577 		opts->mmap_flush = flush_max;
578 
579 	return 0;
580 }
581 
582 #ifdef HAVE_ZSTD_SUPPORT
583 static unsigned int comp_level_default = 1;
584 
585 static int record__parse_comp_level(const struct option *opt, const char *str, int unset)
586 {
587 	struct record_opts *opts = opt->value;
588 
589 	if (unset) {
590 		opts->comp_level = 0;
591 	} else {
592 		if (str)
593 			opts->comp_level = strtol(str, NULL, 0);
594 		if (!opts->comp_level)
595 			opts->comp_level = comp_level_default;
596 	}
597 
598 	return 0;
599 }
600 #endif
601 static unsigned int comp_level_max = 22;
602 
603 static int record__comp_enabled(struct record *rec)
604 {
605 	return rec->opts.comp_level > 0;
606 }
607 
608 static int process_synthesized_event(struct perf_tool *tool,
609 				     union perf_event *event,
610 				     struct perf_sample *sample __maybe_unused,
611 				     struct machine *machine __maybe_unused)
612 {
613 	struct record *rec = container_of(tool, struct record, tool);
614 	return record__write(rec, NULL, event, event->header.size);
615 }
616 
617 static struct mutex synth_lock;
618 
619 static int process_locked_synthesized_event(struct perf_tool *tool,
620 				     union perf_event *event,
621 				     struct perf_sample *sample __maybe_unused,
622 				     struct machine *machine __maybe_unused)
623 {
624 	int ret;
625 
626 	mutex_lock(&synth_lock);
627 	ret = process_synthesized_event(tool, event, sample, machine);
628 	mutex_unlock(&synth_lock);
629 	return ret;
630 }
631 
632 static int record__pushfn(struct mmap *map, void *to, void *bf, size_t size)
633 {
634 	struct record *rec = to;
635 
636 	if (record__comp_enabled(rec)) {
637 		size = zstd_compress(rec->session, map, map->data, mmap__mmap_len(map), bf, size);
638 		bf   = map->data;
639 	}
640 
641 	thread->samples++;
642 	return record__write(rec, map, bf, size);
643 }
644 
645 static volatile sig_atomic_t signr = -1;
646 static volatile sig_atomic_t child_finished;
647 #ifdef HAVE_EVENTFD_SUPPORT
648 static volatile sig_atomic_t done_fd = -1;
649 #endif
650 
651 static void sig_handler(int sig)
652 {
653 	if (sig == SIGCHLD)
654 		child_finished = 1;
655 	else
656 		signr = sig;
657 
658 	done = 1;
659 #ifdef HAVE_EVENTFD_SUPPORT
660 	if (done_fd >= 0) {
661 		u64 tmp = 1;
662 		int orig_errno = errno;
663 
664 		/*
665 		 * It is possible for this signal handler to run after done is
666 		 * checked in the main loop, but before the perf counter fds are
667 		 * polled. If this happens, the poll() will continue to wait
668 		 * even though done is set, and will only break out if either
669 		 * another signal is received, or the counters are ready for
670 		 * read. To ensure the poll() doesn't sleep when done is set,
671 		 * use an eventfd (done_fd) to wake up the poll().
672 		 */
673 		if (write(done_fd, &tmp, sizeof(tmp)) < 0)
674 			pr_err("failed to signal wakeup fd, error: %m\n");
675 
676 		errno = orig_errno;
677 	}
678 #endif // HAVE_EVENTFD_SUPPORT
679 }
680 
681 static void sigsegv_handler(int sig)
682 {
683 	perf_hooks__recover();
684 	sighandler_dump_stack(sig);
685 }
686 
687 static void record__sig_exit(void)
688 {
689 	if (signr == -1)
690 		return;
691 
692 	signal(signr, SIG_DFL);
693 	raise(signr);
694 }
695 
696 #ifdef HAVE_AUXTRACE_SUPPORT
697 
698 static int record__process_auxtrace(struct perf_tool *tool,
699 				    struct mmap *map,
700 				    union perf_event *event, void *data1,
701 				    size_t len1, void *data2, size_t len2)
702 {
703 	struct record *rec = container_of(tool, struct record, tool);
704 	struct perf_data *data = &rec->data;
705 	size_t padding;
706 	u8 pad[8] = {0};
707 
708 	if (!perf_data__is_pipe(data) && perf_data__is_single_file(data)) {
709 		off_t file_offset;
710 		int fd = perf_data__fd(data);
711 		int err;
712 
713 		file_offset = lseek(fd, 0, SEEK_CUR);
714 		if (file_offset == -1)
715 			return -1;
716 		err = auxtrace_index__auxtrace_event(&rec->session->auxtrace_index,
717 						     event, file_offset);
718 		if (err)
719 			return err;
720 	}
721 
722 	/* event.auxtrace.size includes padding, see __auxtrace_mmap__read() */
723 	padding = (len1 + len2) & 7;
724 	if (padding)
725 		padding = 8 - padding;
726 
727 	record__write(rec, map, event, event->header.size);
728 	record__write(rec, map, data1, len1);
729 	if (len2)
730 		record__write(rec, map, data2, len2);
731 	record__write(rec, map, &pad, padding);
732 
733 	return 0;
734 }
735 
736 static int record__auxtrace_mmap_read(struct record *rec,
737 				      struct mmap *map)
738 {
739 	int ret;
740 
741 	ret = auxtrace_mmap__read(map, rec->itr, &rec->tool,
742 				  record__process_auxtrace);
743 	if (ret < 0)
744 		return ret;
745 
746 	if (ret)
747 		rec->samples++;
748 
749 	return 0;
750 }
751 
752 static int record__auxtrace_mmap_read_snapshot(struct record *rec,
753 					       struct mmap *map)
754 {
755 	int ret;
756 
757 	ret = auxtrace_mmap__read_snapshot(map, rec->itr, &rec->tool,
758 					   record__process_auxtrace,
759 					   rec->opts.auxtrace_snapshot_size);
760 	if (ret < 0)
761 		return ret;
762 
763 	if (ret)
764 		rec->samples++;
765 
766 	return 0;
767 }
768 
769 static int record__auxtrace_read_snapshot_all(struct record *rec)
770 {
771 	int i;
772 	int rc = 0;
773 
774 	for (i = 0; i < rec->evlist->core.nr_mmaps; i++) {
775 		struct mmap *map = &rec->evlist->mmap[i];
776 
777 		if (!map->auxtrace_mmap.base)
778 			continue;
779 
780 		if (record__auxtrace_mmap_read_snapshot(rec, map) != 0) {
781 			rc = -1;
782 			goto out;
783 		}
784 	}
785 out:
786 	return rc;
787 }
788 
789 static void record__read_auxtrace_snapshot(struct record *rec, bool on_exit)
790 {
791 	pr_debug("Recording AUX area tracing snapshot\n");
792 	if (record__auxtrace_read_snapshot_all(rec) < 0) {
793 		trigger_error(&auxtrace_snapshot_trigger);
794 	} else {
795 		if (auxtrace_record__snapshot_finish(rec->itr, on_exit))
796 			trigger_error(&auxtrace_snapshot_trigger);
797 		else
798 			trigger_ready(&auxtrace_snapshot_trigger);
799 	}
800 }
801 
802 static int record__auxtrace_snapshot_exit(struct record *rec)
803 {
804 	if (trigger_is_error(&auxtrace_snapshot_trigger))
805 		return 0;
806 
807 	if (!auxtrace_record__snapshot_started &&
808 	    auxtrace_record__snapshot_start(rec->itr))
809 		return -1;
810 
811 	record__read_auxtrace_snapshot(rec, true);
812 	if (trigger_is_error(&auxtrace_snapshot_trigger))
813 		return -1;
814 
815 	return 0;
816 }
817 
818 static int record__auxtrace_init(struct record *rec)
819 {
820 	int err;
821 
822 	if ((rec->opts.auxtrace_snapshot_opts || rec->opts.auxtrace_sample_opts)
823 	    && record__threads_enabled(rec)) {
824 		pr_err("AUX area tracing options are not available in parallel streaming mode.\n");
825 		return -EINVAL;
826 	}
827 
828 	if (!rec->itr) {
829 		rec->itr = auxtrace_record__init(rec->evlist, &err);
830 		if (err)
831 			return err;
832 	}
833 
834 	err = auxtrace_parse_snapshot_options(rec->itr, &rec->opts,
835 					      rec->opts.auxtrace_snapshot_opts);
836 	if (err)
837 		return err;
838 
839 	err = auxtrace_parse_sample_options(rec->itr, rec->evlist, &rec->opts,
840 					    rec->opts.auxtrace_sample_opts);
841 	if (err)
842 		return err;
843 
844 	auxtrace_regroup_aux_output(rec->evlist);
845 
846 	return auxtrace_parse_filters(rec->evlist);
847 }
848 
849 #else
850 
851 static inline
852 int record__auxtrace_mmap_read(struct record *rec __maybe_unused,
853 			       struct mmap *map __maybe_unused)
854 {
855 	return 0;
856 }
857 
858 static inline
859 void record__read_auxtrace_snapshot(struct record *rec __maybe_unused,
860 				    bool on_exit __maybe_unused)
861 {
862 }
863 
864 static inline
865 int auxtrace_record__snapshot_start(struct auxtrace_record *itr __maybe_unused)
866 {
867 	return 0;
868 }
869 
870 static inline
871 int record__auxtrace_snapshot_exit(struct record *rec __maybe_unused)
872 {
873 	return 0;
874 }
875 
876 static int record__auxtrace_init(struct record *rec __maybe_unused)
877 {
878 	return 0;
879 }
880 
881 #endif
882 
883 static int record__config_text_poke(struct evlist *evlist)
884 {
885 	struct evsel *evsel;
886 
887 	/* Nothing to do if text poke is already configured */
888 	evlist__for_each_entry(evlist, evsel) {
889 		if (evsel->core.attr.text_poke)
890 			return 0;
891 	}
892 
893 	evsel = evlist__add_dummy_on_all_cpus(evlist);
894 	if (!evsel)
895 		return -ENOMEM;
896 
897 	evsel->core.attr.text_poke = 1;
898 	evsel->core.attr.ksymbol = 1;
899 	evsel->immediate = true;
900 	evsel__set_sample_bit(evsel, TIME);
901 
902 	return 0;
903 }
904 
905 static int record__config_off_cpu(struct record *rec)
906 {
907 	return off_cpu_prepare(rec->evlist, &rec->opts.target, &rec->opts);
908 }
909 
910 static bool record__kcore_readable(struct machine *machine)
911 {
912 	char kcore[PATH_MAX];
913 	int fd;
914 
915 	scnprintf(kcore, sizeof(kcore), "%s/proc/kcore", machine->root_dir);
916 
917 	fd = open(kcore, O_RDONLY);
918 	if (fd < 0)
919 		return false;
920 
921 	close(fd);
922 
923 	return true;
924 }
925 
926 static int record__kcore_copy(struct machine *machine, struct perf_data *data)
927 {
928 	char from_dir[PATH_MAX];
929 	char kcore_dir[PATH_MAX];
930 	int ret;
931 
932 	snprintf(from_dir, sizeof(from_dir), "%s/proc", machine->root_dir);
933 
934 	ret = perf_data__make_kcore_dir(data, kcore_dir, sizeof(kcore_dir));
935 	if (ret)
936 		return ret;
937 
938 	return kcore_copy(from_dir, kcore_dir);
939 }
940 
941 static void record__thread_data_init_pipes(struct record_thread *thread_data)
942 {
943 	thread_data->pipes.msg[0] = -1;
944 	thread_data->pipes.msg[1] = -1;
945 	thread_data->pipes.ack[0] = -1;
946 	thread_data->pipes.ack[1] = -1;
947 }
948 
949 static int record__thread_data_open_pipes(struct record_thread *thread_data)
950 {
951 	if (pipe(thread_data->pipes.msg))
952 		return -EINVAL;
953 
954 	if (pipe(thread_data->pipes.ack)) {
955 		close(thread_data->pipes.msg[0]);
956 		thread_data->pipes.msg[0] = -1;
957 		close(thread_data->pipes.msg[1]);
958 		thread_data->pipes.msg[1] = -1;
959 		return -EINVAL;
960 	}
961 
962 	pr_debug2("thread_data[%p]: msg=[%d,%d], ack=[%d,%d]\n", thread_data,
963 		 thread_data->pipes.msg[0], thread_data->pipes.msg[1],
964 		 thread_data->pipes.ack[0], thread_data->pipes.ack[1]);
965 
966 	return 0;
967 }
968 
969 static void record__thread_data_close_pipes(struct record_thread *thread_data)
970 {
971 	if (thread_data->pipes.msg[0] != -1) {
972 		close(thread_data->pipes.msg[0]);
973 		thread_data->pipes.msg[0] = -1;
974 	}
975 	if (thread_data->pipes.msg[1] != -1) {
976 		close(thread_data->pipes.msg[1]);
977 		thread_data->pipes.msg[1] = -1;
978 	}
979 	if (thread_data->pipes.ack[0] != -1) {
980 		close(thread_data->pipes.ack[0]);
981 		thread_data->pipes.ack[0] = -1;
982 	}
983 	if (thread_data->pipes.ack[1] != -1) {
984 		close(thread_data->pipes.ack[1]);
985 		thread_data->pipes.ack[1] = -1;
986 	}
987 }
988 
989 static bool evlist__per_thread(struct evlist *evlist)
990 {
991 	return cpu_map__is_dummy(evlist->core.user_requested_cpus);
992 }
993 
994 static int record__thread_data_init_maps(struct record_thread *thread_data, struct evlist *evlist)
995 {
996 	int m, tm, nr_mmaps = evlist->core.nr_mmaps;
997 	struct mmap *mmap = evlist->mmap;
998 	struct mmap *overwrite_mmap = evlist->overwrite_mmap;
999 	struct perf_cpu_map *cpus = evlist->core.all_cpus;
1000 	bool per_thread = evlist__per_thread(evlist);
1001 
1002 	if (per_thread)
1003 		thread_data->nr_mmaps = nr_mmaps;
1004 	else
1005 		thread_data->nr_mmaps = bitmap_weight(thread_data->mask->maps.bits,
1006 						      thread_data->mask->maps.nbits);
1007 	if (mmap) {
1008 		thread_data->maps = zalloc(thread_data->nr_mmaps * sizeof(struct mmap *));
1009 		if (!thread_data->maps)
1010 			return -ENOMEM;
1011 	}
1012 	if (overwrite_mmap) {
1013 		thread_data->overwrite_maps = zalloc(thread_data->nr_mmaps * sizeof(struct mmap *));
1014 		if (!thread_data->overwrite_maps) {
1015 			zfree(&thread_data->maps);
1016 			return -ENOMEM;
1017 		}
1018 	}
1019 	pr_debug2("thread_data[%p]: nr_mmaps=%d, maps=%p, ow_maps=%p\n", thread_data,
1020 		 thread_data->nr_mmaps, thread_data->maps, thread_data->overwrite_maps);
1021 
1022 	for (m = 0, tm = 0; m < nr_mmaps && tm < thread_data->nr_mmaps; m++) {
1023 		if (per_thread ||
1024 		    test_bit(perf_cpu_map__cpu(cpus, m).cpu, thread_data->mask->maps.bits)) {
1025 			if (thread_data->maps) {
1026 				thread_data->maps[tm] = &mmap[m];
1027 				pr_debug2("thread_data[%p]: cpu%d: maps[%d] -> mmap[%d]\n",
1028 					  thread_data, perf_cpu_map__cpu(cpus, m).cpu, tm, m);
1029 			}
1030 			if (thread_data->overwrite_maps) {
1031 				thread_data->overwrite_maps[tm] = &overwrite_mmap[m];
1032 				pr_debug2("thread_data[%p]: cpu%d: ow_maps[%d] -> ow_mmap[%d]\n",
1033 					  thread_data, perf_cpu_map__cpu(cpus, m).cpu, tm, m);
1034 			}
1035 			tm++;
1036 		}
1037 	}
1038 
1039 	return 0;
1040 }
1041 
1042 static int record__thread_data_init_pollfd(struct record_thread *thread_data, struct evlist *evlist)
1043 {
1044 	int f, tm, pos;
1045 	struct mmap *map, *overwrite_map;
1046 
1047 	fdarray__init(&thread_data->pollfd, 64);
1048 
1049 	for (tm = 0; tm < thread_data->nr_mmaps; tm++) {
1050 		map = thread_data->maps ? thread_data->maps[tm] : NULL;
1051 		overwrite_map = thread_data->overwrite_maps ?
1052 				thread_data->overwrite_maps[tm] : NULL;
1053 
1054 		for (f = 0; f < evlist->core.pollfd.nr; f++) {
1055 			void *ptr = evlist->core.pollfd.priv[f].ptr;
1056 
1057 			if ((map && ptr == map) || (overwrite_map && ptr == overwrite_map)) {
1058 				pos = fdarray__dup_entry_from(&thread_data->pollfd, f,
1059 							      &evlist->core.pollfd);
1060 				if (pos < 0)
1061 					return pos;
1062 				pr_debug2("thread_data[%p]: pollfd[%d] <- event_fd=%d\n",
1063 					 thread_data, pos, evlist->core.pollfd.entries[f].fd);
1064 			}
1065 		}
1066 	}
1067 
1068 	return 0;
1069 }
1070 
1071 static void record__free_thread_data(struct record *rec)
1072 {
1073 	int t;
1074 	struct record_thread *thread_data = rec->thread_data;
1075 
1076 	if (thread_data == NULL)
1077 		return;
1078 
1079 	for (t = 0; t < rec->nr_threads; t++) {
1080 		record__thread_data_close_pipes(&thread_data[t]);
1081 		zfree(&thread_data[t].maps);
1082 		zfree(&thread_data[t].overwrite_maps);
1083 		fdarray__exit(&thread_data[t].pollfd);
1084 	}
1085 
1086 	zfree(&rec->thread_data);
1087 }
1088 
1089 static int record__map_thread_evlist_pollfd_indexes(struct record *rec,
1090 						    int evlist_pollfd_index,
1091 						    int thread_pollfd_index)
1092 {
1093 	size_t x = rec->index_map_cnt;
1094 
1095 	if (realloc_array_as_needed(rec->index_map, rec->index_map_sz, x, NULL))
1096 		return -ENOMEM;
1097 	rec->index_map[x].evlist_pollfd_index = evlist_pollfd_index;
1098 	rec->index_map[x].thread_pollfd_index = thread_pollfd_index;
1099 	rec->index_map_cnt += 1;
1100 	return 0;
1101 }
1102 
1103 static int record__update_evlist_pollfd_from_thread(struct record *rec,
1104 						    struct evlist *evlist,
1105 						    struct record_thread *thread_data)
1106 {
1107 	struct pollfd *e_entries = evlist->core.pollfd.entries;
1108 	struct pollfd *t_entries = thread_data->pollfd.entries;
1109 	int err = 0;
1110 	size_t i;
1111 
1112 	for (i = 0; i < rec->index_map_cnt; i++) {
1113 		int e_pos = rec->index_map[i].evlist_pollfd_index;
1114 		int t_pos = rec->index_map[i].thread_pollfd_index;
1115 
1116 		if (e_entries[e_pos].fd != t_entries[t_pos].fd ||
1117 		    e_entries[e_pos].events != t_entries[t_pos].events) {
1118 			pr_err("Thread and evlist pollfd index mismatch\n");
1119 			err = -EINVAL;
1120 			continue;
1121 		}
1122 		e_entries[e_pos].revents = t_entries[t_pos].revents;
1123 	}
1124 	return err;
1125 }
1126 
1127 static int record__dup_non_perf_events(struct record *rec,
1128 				       struct evlist *evlist,
1129 				       struct record_thread *thread_data)
1130 {
1131 	struct fdarray *fda = &evlist->core.pollfd;
1132 	int i, ret;
1133 
1134 	for (i = 0; i < fda->nr; i++) {
1135 		if (!(fda->priv[i].flags & fdarray_flag__non_perf_event))
1136 			continue;
1137 		ret = fdarray__dup_entry_from(&thread_data->pollfd, i, fda);
1138 		if (ret < 0) {
1139 			pr_err("Failed to duplicate descriptor in main thread pollfd\n");
1140 			return ret;
1141 		}
1142 		pr_debug2("thread_data[%p]: pollfd[%d] <- non_perf_event fd=%d\n",
1143 			  thread_data, ret, fda->entries[i].fd);
1144 		ret = record__map_thread_evlist_pollfd_indexes(rec, i, ret);
1145 		if (ret < 0) {
1146 			pr_err("Failed to map thread and evlist pollfd indexes\n");
1147 			return ret;
1148 		}
1149 	}
1150 	return 0;
1151 }
1152 
1153 static int record__alloc_thread_data(struct record *rec, struct evlist *evlist)
1154 {
1155 	int t, ret;
1156 	struct record_thread *thread_data;
1157 
1158 	rec->thread_data = zalloc(rec->nr_threads * sizeof(*(rec->thread_data)));
1159 	if (!rec->thread_data) {
1160 		pr_err("Failed to allocate thread data\n");
1161 		return -ENOMEM;
1162 	}
1163 	thread_data = rec->thread_data;
1164 
1165 	for (t = 0; t < rec->nr_threads; t++)
1166 		record__thread_data_init_pipes(&thread_data[t]);
1167 
1168 	for (t = 0; t < rec->nr_threads; t++) {
1169 		thread_data[t].rec = rec;
1170 		thread_data[t].mask = &rec->thread_masks[t];
1171 		ret = record__thread_data_init_maps(&thread_data[t], evlist);
1172 		if (ret) {
1173 			pr_err("Failed to initialize thread[%d] maps\n", t);
1174 			goto out_free;
1175 		}
1176 		ret = record__thread_data_init_pollfd(&thread_data[t], evlist);
1177 		if (ret) {
1178 			pr_err("Failed to initialize thread[%d] pollfd\n", t);
1179 			goto out_free;
1180 		}
1181 		if (t) {
1182 			thread_data[t].tid = -1;
1183 			ret = record__thread_data_open_pipes(&thread_data[t]);
1184 			if (ret) {
1185 				pr_err("Failed to open thread[%d] communication pipes\n", t);
1186 				goto out_free;
1187 			}
1188 			ret = fdarray__add(&thread_data[t].pollfd, thread_data[t].pipes.msg[0],
1189 					   POLLIN | POLLERR | POLLHUP, fdarray_flag__nonfilterable);
1190 			if (ret < 0) {
1191 				pr_err("Failed to add descriptor to thread[%d] pollfd\n", t);
1192 				goto out_free;
1193 			}
1194 			thread_data[t].ctlfd_pos = ret;
1195 			pr_debug2("thread_data[%p]: pollfd[%d] <- ctl_fd=%d\n",
1196 				 thread_data, thread_data[t].ctlfd_pos,
1197 				 thread_data[t].pipes.msg[0]);
1198 		} else {
1199 			thread_data[t].tid = gettid();
1200 
1201 			ret = record__dup_non_perf_events(rec, evlist, &thread_data[t]);
1202 			if (ret < 0)
1203 				goto out_free;
1204 
1205 			thread_data[t].ctlfd_pos = -1; /* Not used */
1206 		}
1207 	}
1208 
1209 	return 0;
1210 
1211 out_free:
1212 	record__free_thread_data(rec);
1213 
1214 	return ret;
1215 }
1216 
1217 static int record__mmap_evlist(struct record *rec,
1218 			       struct evlist *evlist)
1219 {
1220 	int i, ret;
1221 	struct record_opts *opts = &rec->opts;
1222 	bool auxtrace_overwrite = opts->auxtrace_snapshot_mode ||
1223 				  opts->auxtrace_sample_mode;
1224 	char msg[512];
1225 
1226 	if (opts->affinity != PERF_AFFINITY_SYS)
1227 		cpu__setup_cpunode_map();
1228 
1229 	if (evlist__mmap_ex(evlist, opts->mmap_pages,
1230 				 opts->auxtrace_mmap_pages,
1231 				 auxtrace_overwrite,
1232 				 opts->nr_cblocks, opts->affinity,
1233 				 opts->mmap_flush, opts->comp_level) < 0) {
1234 		if (errno == EPERM) {
1235 			pr_err("Permission error mapping pages.\n"
1236 			       "Consider increasing "
1237 			       "/proc/sys/kernel/perf_event_mlock_kb,\n"
1238 			       "or try again with a smaller value of -m/--mmap_pages.\n"
1239 			       "(current value: %u,%u)\n",
1240 			       opts->mmap_pages, opts->auxtrace_mmap_pages);
1241 			return -errno;
1242 		} else {
1243 			pr_err("failed to mmap with %d (%s)\n", errno,
1244 				str_error_r(errno, msg, sizeof(msg)));
1245 			if (errno)
1246 				return -errno;
1247 			else
1248 				return -EINVAL;
1249 		}
1250 	}
1251 
1252 	if (evlist__initialize_ctlfd(evlist, opts->ctl_fd, opts->ctl_fd_ack))
1253 		return -1;
1254 
1255 	ret = record__alloc_thread_data(rec, evlist);
1256 	if (ret)
1257 		return ret;
1258 
1259 	if (record__threads_enabled(rec)) {
1260 		ret = perf_data__create_dir(&rec->data, evlist->core.nr_mmaps);
1261 		if (ret) {
1262 			pr_err("Failed to create data directory: %s\n", strerror(-ret));
1263 			return ret;
1264 		}
1265 		for (i = 0; i < evlist->core.nr_mmaps; i++) {
1266 			if (evlist->mmap)
1267 				evlist->mmap[i].file = &rec->data.dir.files[i];
1268 			if (evlist->overwrite_mmap)
1269 				evlist->overwrite_mmap[i].file = &rec->data.dir.files[i];
1270 		}
1271 	}
1272 
1273 	return 0;
1274 }
1275 
1276 static int record__mmap(struct record *rec)
1277 {
1278 	return record__mmap_evlist(rec, rec->evlist);
1279 }
1280 
1281 static int record__open(struct record *rec)
1282 {
1283 	char msg[BUFSIZ];
1284 	struct evsel *pos;
1285 	struct evlist *evlist = rec->evlist;
1286 	struct perf_session *session = rec->session;
1287 	struct record_opts *opts = &rec->opts;
1288 	int rc = 0;
1289 
1290 	/*
1291 	 * For initial_delay, system wide or a hybrid system, we need to add a
1292 	 * dummy event so that we can track PERF_RECORD_MMAP to cover the delay
1293 	 * of waiting or event synthesis.
1294 	 */
1295 	if (opts->initial_delay || target__has_cpu(&opts->target) ||
1296 	    perf_pmu__has_hybrid()) {
1297 		pos = evlist__get_tracking_event(evlist);
1298 		if (!evsel__is_dummy_event(pos)) {
1299 			/* Set up dummy event. */
1300 			if (evlist__add_dummy(evlist))
1301 				return -ENOMEM;
1302 			pos = evlist__last(evlist);
1303 			evlist__set_tracking_event(evlist, pos);
1304 		}
1305 
1306 		/*
1307 		 * Enable the dummy event when the process is forked for
1308 		 * initial_delay, immediately for system wide.
1309 		 */
1310 		if (opts->initial_delay && !pos->immediate &&
1311 		    !target__has_cpu(&opts->target))
1312 			pos->core.attr.enable_on_exec = 1;
1313 		else
1314 			pos->immediate = 1;
1315 	}
1316 
1317 	evlist__config(evlist, opts, &callchain_param);
1318 
1319 	evlist__for_each_entry(evlist, pos) {
1320 try_again:
1321 		if (evsel__open(pos, pos->core.cpus, pos->core.threads) < 0) {
1322 			if (evsel__fallback(pos, errno, msg, sizeof(msg))) {
1323 				if (verbose > 0)
1324 					ui__warning("%s\n", msg);
1325 				goto try_again;
1326 			}
1327 			if ((errno == EINVAL || errno == EBADF) &&
1328 			    pos->core.leader != &pos->core &&
1329 			    pos->weak_group) {
1330 			        pos = evlist__reset_weak_group(evlist, pos, true);
1331 				goto try_again;
1332 			}
1333 			rc = -errno;
1334 			evsel__open_strerror(pos, &opts->target, errno, msg, sizeof(msg));
1335 			ui__error("%s\n", msg);
1336 			goto out;
1337 		}
1338 
1339 		pos->supported = true;
1340 	}
1341 
1342 	if (symbol_conf.kptr_restrict && !evlist__exclude_kernel(evlist)) {
1343 		pr_warning(
1344 "WARNING: Kernel address maps (/proc/{kallsyms,modules}) are restricted,\n"
1345 "check /proc/sys/kernel/kptr_restrict and /proc/sys/kernel/perf_event_paranoid.\n\n"
1346 "Samples in kernel functions may not be resolved if a suitable vmlinux\n"
1347 "file is not found in the buildid cache or in the vmlinux path.\n\n"
1348 "Samples in kernel modules won't be resolved at all.\n\n"
1349 "If some relocation was applied (e.g. kexec) symbols may be misresolved\n"
1350 "even with a suitable vmlinux or kallsyms file.\n\n");
1351 	}
1352 
1353 	if (evlist__apply_filters(evlist, &pos)) {
1354 		pr_err("failed to set filter \"%s\" on event %s with %d (%s)\n",
1355 			pos->filter, evsel__name(pos), errno,
1356 			str_error_r(errno, msg, sizeof(msg)));
1357 		rc = -1;
1358 		goto out;
1359 	}
1360 
1361 	rc = record__mmap(rec);
1362 	if (rc)
1363 		goto out;
1364 
1365 	session->evlist = evlist;
1366 	perf_session__set_id_hdr_size(session);
1367 out:
1368 	return rc;
1369 }
1370 
1371 static void set_timestamp_boundary(struct record *rec, u64 sample_time)
1372 {
1373 	if (rec->evlist->first_sample_time == 0)
1374 		rec->evlist->first_sample_time = sample_time;
1375 
1376 	if (sample_time)
1377 		rec->evlist->last_sample_time = sample_time;
1378 }
1379 
1380 static int process_sample_event(struct perf_tool *tool,
1381 				union perf_event *event,
1382 				struct perf_sample *sample,
1383 				struct evsel *evsel,
1384 				struct machine *machine)
1385 {
1386 	struct record *rec = container_of(tool, struct record, tool);
1387 
1388 	set_timestamp_boundary(rec, sample->time);
1389 
1390 	if (rec->buildid_all)
1391 		return 0;
1392 
1393 	rec->samples++;
1394 	return build_id__mark_dso_hit(tool, event, sample, evsel, machine);
1395 }
1396 
1397 static int process_buildids(struct record *rec)
1398 {
1399 	struct perf_session *session = rec->session;
1400 
1401 	if (perf_data__size(&rec->data) == 0)
1402 		return 0;
1403 
1404 	/*
1405 	 * During this process, it'll load kernel map and replace the
1406 	 * dso->long_name to a real pathname it found.  In this case
1407 	 * we prefer the vmlinux path like
1408 	 *   /lib/modules/3.16.4/build/vmlinux
1409 	 *
1410 	 * rather than build-id path (in debug directory).
1411 	 *   $HOME/.debug/.build-id/f0/6e17aa50adf4d00b88925e03775de107611551
1412 	 */
1413 	symbol_conf.ignore_vmlinux_buildid = true;
1414 
1415 	/*
1416 	 * If --buildid-all is given, it marks all DSO regardless of hits,
1417 	 * so no need to process samples. But if timestamp_boundary is enabled,
1418 	 * it still needs to walk on all samples to get the timestamps of
1419 	 * first/last samples.
1420 	 */
1421 	if (rec->buildid_all && !rec->timestamp_boundary)
1422 		rec->tool.sample = NULL;
1423 
1424 	return perf_session__process_events(session);
1425 }
1426 
1427 static void perf_event__synthesize_guest_os(struct machine *machine, void *data)
1428 {
1429 	int err;
1430 	struct perf_tool *tool = data;
1431 	/*
1432 	 *As for guest kernel when processing subcommand record&report,
1433 	 *we arrange module mmap prior to guest kernel mmap and trigger
1434 	 *a preload dso because default guest module symbols are loaded
1435 	 *from guest kallsyms instead of /lib/modules/XXX/XXX. This
1436 	 *method is used to avoid symbol missing when the first addr is
1437 	 *in module instead of in guest kernel.
1438 	 */
1439 	err = perf_event__synthesize_modules(tool, process_synthesized_event,
1440 					     machine);
1441 	if (err < 0)
1442 		pr_err("Couldn't record guest kernel [%d]'s reference"
1443 		       " relocation symbol.\n", machine->pid);
1444 
1445 	/*
1446 	 * We use _stext for guest kernel because guest kernel's /proc/kallsyms
1447 	 * have no _text sometimes.
1448 	 */
1449 	err = perf_event__synthesize_kernel_mmap(tool, process_synthesized_event,
1450 						 machine);
1451 	if (err < 0)
1452 		pr_err("Couldn't record guest kernel [%d]'s reference"
1453 		       " relocation symbol.\n", machine->pid);
1454 }
1455 
1456 static struct perf_event_header finished_round_event = {
1457 	.size = sizeof(struct perf_event_header),
1458 	.type = PERF_RECORD_FINISHED_ROUND,
1459 };
1460 
1461 static struct perf_event_header finished_init_event = {
1462 	.size = sizeof(struct perf_event_header),
1463 	.type = PERF_RECORD_FINISHED_INIT,
1464 };
1465 
1466 static void record__adjust_affinity(struct record *rec, struct mmap *map)
1467 {
1468 	if (rec->opts.affinity != PERF_AFFINITY_SYS &&
1469 	    !bitmap_equal(thread->mask->affinity.bits, map->affinity_mask.bits,
1470 			  thread->mask->affinity.nbits)) {
1471 		bitmap_zero(thread->mask->affinity.bits, thread->mask->affinity.nbits);
1472 		bitmap_or(thread->mask->affinity.bits, thread->mask->affinity.bits,
1473 			  map->affinity_mask.bits, thread->mask->affinity.nbits);
1474 		sched_setaffinity(0, MMAP_CPU_MASK_BYTES(&thread->mask->affinity),
1475 					(cpu_set_t *)thread->mask->affinity.bits);
1476 		if (verbose == 2) {
1477 			pr_debug("threads[%d]: running on cpu%d: ", thread->tid, sched_getcpu());
1478 			mmap_cpu_mask__scnprintf(&thread->mask->affinity, "affinity");
1479 		}
1480 	}
1481 }
1482 
1483 static size_t process_comp_header(void *record, size_t increment)
1484 {
1485 	struct perf_record_compressed *event = record;
1486 	size_t size = sizeof(*event);
1487 
1488 	if (increment) {
1489 		event->header.size += increment;
1490 		return increment;
1491 	}
1492 
1493 	event->header.type = PERF_RECORD_COMPRESSED;
1494 	event->header.size = size;
1495 
1496 	return size;
1497 }
1498 
1499 static size_t zstd_compress(struct perf_session *session, struct mmap *map,
1500 			    void *dst, size_t dst_size, void *src, size_t src_size)
1501 {
1502 	size_t compressed;
1503 	size_t max_record_size = PERF_SAMPLE_MAX_SIZE - sizeof(struct perf_record_compressed) - 1;
1504 	struct zstd_data *zstd_data = &session->zstd_data;
1505 
1506 	if (map && map->file)
1507 		zstd_data = &map->zstd_data;
1508 
1509 	compressed = zstd_compress_stream_to_records(zstd_data, dst, dst_size, src, src_size,
1510 						     max_record_size, process_comp_header);
1511 
1512 	if (map && map->file) {
1513 		thread->bytes_transferred += src_size;
1514 		thread->bytes_compressed  += compressed;
1515 	} else {
1516 		session->bytes_transferred += src_size;
1517 		session->bytes_compressed  += compressed;
1518 	}
1519 
1520 	return compressed;
1521 }
1522 
1523 static int record__mmap_read_evlist(struct record *rec, struct evlist *evlist,
1524 				    bool overwrite, bool synch)
1525 {
1526 	u64 bytes_written = rec->bytes_written;
1527 	int i;
1528 	int rc = 0;
1529 	int nr_mmaps;
1530 	struct mmap **maps;
1531 	int trace_fd = rec->data.file.fd;
1532 	off_t off = 0;
1533 
1534 	if (!evlist)
1535 		return 0;
1536 
1537 	nr_mmaps = thread->nr_mmaps;
1538 	maps = overwrite ? thread->overwrite_maps : thread->maps;
1539 
1540 	if (!maps)
1541 		return 0;
1542 
1543 	if (overwrite && evlist->bkw_mmap_state != BKW_MMAP_DATA_PENDING)
1544 		return 0;
1545 
1546 	if (record__aio_enabled(rec))
1547 		off = record__aio_get_pos(trace_fd);
1548 
1549 	for (i = 0; i < nr_mmaps; i++) {
1550 		u64 flush = 0;
1551 		struct mmap *map = maps[i];
1552 
1553 		if (map->core.base) {
1554 			record__adjust_affinity(rec, map);
1555 			if (synch) {
1556 				flush = map->core.flush;
1557 				map->core.flush = 1;
1558 			}
1559 			if (!record__aio_enabled(rec)) {
1560 				if (perf_mmap__push(map, rec, record__pushfn) < 0) {
1561 					if (synch)
1562 						map->core.flush = flush;
1563 					rc = -1;
1564 					goto out;
1565 				}
1566 			} else {
1567 				if (record__aio_push(rec, map, &off) < 0) {
1568 					record__aio_set_pos(trace_fd, off);
1569 					if (synch)
1570 						map->core.flush = flush;
1571 					rc = -1;
1572 					goto out;
1573 				}
1574 			}
1575 			if (synch)
1576 				map->core.flush = flush;
1577 		}
1578 
1579 		if (map->auxtrace_mmap.base && !rec->opts.auxtrace_snapshot_mode &&
1580 		    !rec->opts.auxtrace_sample_mode &&
1581 		    record__auxtrace_mmap_read(rec, map) != 0) {
1582 			rc = -1;
1583 			goto out;
1584 		}
1585 	}
1586 
1587 	if (record__aio_enabled(rec))
1588 		record__aio_set_pos(trace_fd, off);
1589 
1590 	/*
1591 	 * Mark the round finished in case we wrote
1592 	 * at least one event.
1593 	 *
1594 	 * No need for round events in directory mode,
1595 	 * because per-cpu maps and files have data
1596 	 * sorted by kernel.
1597 	 */
1598 	if (!record__threads_enabled(rec) && bytes_written != rec->bytes_written)
1599 		rc = record__write(rec, NULL, &finished_round_event, sizeof(finished_round_event));
1600 
1601 	if (overwrite)
1602 		evlist__toggle_bkw_mmap(evlist, BKW_MMAP_EMPTY);
1603 out:
1604 	return rc;
1605 }
1606 
1607 static int record__mmap_read_all(struct record *rec, bool synch)
1608 {
1609 	int err;
1610 
1611 	err = record__mmap_read_evlist(rec, rec->evlist, false, synch);
1612 	if (err)
1613 		return err;
1614 
1615 	return record__mmap_read_evlist(rec, rec->evlist, true, synch);
1616 }
1617 
1618 static void record__thread_munmap_filtered(struct fdarray *fda, int fd,
1619 					   void *arg __maybe_unused)
1620 {
1621 	struct perf_mmap *map = fda->priv[fd].ptr;
1622 
1623 	if (map)
1624 		perf_mmap__put(map);
1625 }
1626 
1627 static void *record__thread(void *arg)
1628 {
1629 	enum thread_msg msg = THREAD_MSG__READY;
1630 	bool terminate = false;
1631 	struct fdarray *pollfd;
1632 	int err, ctlfd_pos;
1633 
1634 	thread = arg;
1635 	thread->tid = gettid();
1636 
1637 	err = write(thread->pipes.ack[1], &msg, sizeof(msg));
1638 	if (err == -1)
1639 		pr_warning("threads[%d]: failed to notify on start: %s\n",
1640 			   thread->tid, strerror(errno));
1641 
1642 	pr_debug("threads[%d]: started on cpu%d\n", thread->tid, sched_getcpu());
1643 
1644 	pollfd = &thread->pollfd;
1645 	ctlfd_pos = thread->ctlfd_pos;
1646 
1647 	for (;;) {
1648 		unsigned long long hits = thread->samples;
1649 
1650 		if (record__mmap_read_all(thread->rec, false) < 0 || terminate)
1651 			break;
1652 
1653 		if (hits == thread->samples) {
1654 
1655 			err = fdarray__poll(pollfd, -1);
1656 			/*
1657 			 * Propagate error, only if there's any. Ignore positive
1658 			 * number of returned events and interrupt error.
1659 			 */
1660 			if (err > 0 || (err < 0 && errno == EINTR))
1661 				err = 0;
1662 			thread->waking++;
1663 
1664 			if (fdarray__filter(pollfd, POLLERR | POLLHUP,
1665 					    record__thread_munmap_filtered, NULL) == 0)
1666 				break;
1667 		}
1668 
1669 		if (pollfd->entries[ctlfd_pos].revents & POLLHUP) {
1670 			terminate = true;
1671 			close(thread->pipes.msg[0]);
1672 			thread->pipes.msg[0] = -1;
1673 			pollfd->entries[ctlfd_pos].fd = -1;
1674 			pollfd->entries[ctlfd_pos].events = 0;
1675 		}
1676 
1677 		pollfd->entries[ctlfd_pos].revents = 0;
1678 	}
1679 	record__mmap_read_all(thread->rec, true);
1680 
1681 	err = write(thread->pipes.ack[1], &msg, sizeof(msg));
1682 	if (err == -1)
1683 		pr_warning("threads[%d]: failed to notify on termination: %s\n",
1684 			   thread->tid, strerror(errno));
1685 
1686 	return NULL;
1687 }
1688 
1689 static void record__init_features(struct record *rec)
1690 {
1691 	struct perf_session *session = rec->session;
1692 	int feat;
1693 
1694 	for (feat = HEADER_FIRST_FEATURE; feat < HEADER_LAST_FEATURE; feat++)
1695 		perf_header__set_feat(&session->header, feat);
1696 
1697 	if (rec->no_buildid)
1698 		perf_header__clear_feat(&session->header, HEADER_BUILD_ID);
1699 
1700 #ifdef HAVE_LIBTRACEEVENT
1701 	if (!have_tracepoints(&rec->evlist->core.entries))
1702 		perf_header__clear_feat(&session->header, HEADER_TRACING_DATA);
1703 #endif
1704 
1705 	if (!rec->opts.branch_stack)
1706 		perf_header__clear_feat(&session->header, HEADER_BRANCH_STACK);
1707 
1708 	if (!rec->opts.full_auxtrace)
1709 		perf_header__clear_feat(&session->header, HEADER_AUXTRACE);
1710 
1711 	if (!(rec->opts.use_clockid && rec->opts.clockid_res_ns))
1712 		perf_header__clear_feat(&session->header, HEADER_CLOCKID);
1713 
1714 	if (!rec->opts.use_clockid)
1715 		perf_header__clear_feat(&session->header, HEADER_CLOCK_DATA);
1716 
1717 	if (!record__threads_enabled(rec))
1718 		perf_header__clear_feat(&session->header, HEADER_DIR_FORMAT);
1719 
1720 	if (!record__comp_enabled(rec))
1721 		perf_header__clear_feat(&session->header, HEADER_COMPRESSED);
1722 
1723 	perf_header__clear_feat(&session->header, HEADER_STAT);
1724 }
1725 
1726 static void
1727 record__finish_output(struct record *rec)
1728 {
1729 	int i;
1730 	struct perf_data *data = &rec->data;
1731 	int fd = perf_data__fd(data);
1732 
1733 	if (data->is_pipe)
1734 		return;
1735 
1736 	rec->session->header.data_size += rec->bytes_written;
1737 	data->file.size = lseek(perf_data__fd(data), 0, SEEK_CUR);
1738 	if (record__threads_enabled(rec)) {
1739 		for (i = 0; i < data->dir.nr; i++)
1740 			data->dir.files[i].size = lseek(data->dir.files[i].fd, 0, SEEK_CUR);
1741 	}
1742 
1743 	if (!rec->no_buildid) {
1744 		process_buildids(rec);
1745 
1746 		if (rec->buildid_all)
1747 			dsos__hit_all(rec->session);
1748 	}
1749 	perf_session__write_header(rec->session, rec->evlist, fd, true);
1750 
1751 	return;
1752 }
1753 
1754 static int record__synthesize_workload(struct record *rec, bool tail)
1755 {
1756 	int err;
1757 	struct perf_thread_map *thread_map;
1758 	bool needs_mmap = rec->opts.synth & PERF_SYNTH_MMAP;
1759 
1760 	if (rec->opts.tail_synthesize != tail)
1761 		return 0;
1762 
1763 	thread_map = thread_map__new_by_tid(rec->evlist->workload.pid);
1764 	if (thread_map == NULL)
1765 		return -1;
1766 
1767 	err = perf_event__synthesize_thread_map(&rec->tool, thread_map,
1768 						 process_synthesized_event,
1769 						 &rec->session->machines.host,
1770 						 needs_mmap,
1771 						 rec->opts.sample_address);
1772 	perf_thread_map__put(thread_map);
1773 	return err;
1774 }
1775 
1776 static int write_finished_init(struct record *rec, bool tail)
1777 {
1778 	if (rec->opts.tail_synthesize != tail)
1779 		return 0;
1780 
1781 	return record__write(rec, NULL, &finished_init_event, sizeof(finished_init_event));
1782 }
1783 
1784 static int record__synthesize(struct record *rec, bool tail);
1785 
1786 static int
1787 record__switch_output(struct record *rec, bool at_exit)
1788 {
1789 	struct perf_data *data = &rec->data;
1790 	int fd, err;
1791 	char *new_filename;
1792 
1793 	/* Same Size:      "2015122520103046"*/
1794 	char timestamp[] = "InvalidTimestamp";
1795 
1796 	record__aio_mmap_read_sync(rec);
1797 
1798 	write_finished_init(rec, true);
1799 
1800 	record__synthesize(rec, true);
1801 	if (target__none(&rec->opts.target))
1802 		record__synthesize_workload(rec, true);
1803 
1804 	rec->samples = 0;
1805 	record__finish_output(rec);
1806 	err = fetch_current_timestamp(timestamp, sizeof(timestamp));
1807 	if (err) {
1808 		pr_err("Failed to get current timestamp\n");
1809 		return -EINVAL;
1810 	}
1811 
1812 	fd = perf_data__switch(data, timestamp,
1813 				    rec->session->header.data_offset,
1814 				    at_exit, &new_filename);
1815 	if (fd >= 0 && !at_exit) {
1816 		rec->bytes_written = 0;
1817 		rec->session->header.data_size = 0;
1818 	}
1819 
1820 	if (!quiet)
1821 		fprintf(stderr, "[ perf record: Dump %s.%s ]\n",
1822 			data->path, timestamp);
1823 
1824 	if (rec->switch_output.num_files) {
1825 		int n = rec->switch_output.cur_file + 1;
1826 
1827 		if (n >= rec->switch_output.num_files)
1828 			n = 0;
1829 		rec->switch_output.cur_file = n;
1830 		if (rec->switch_output.filenames[n]) {
1831 			remove(rec->switch_output.filenames[n]);
1832 			zfree(&rec->switch_output.filenames[n]);
1833 		}
1834 		rec->switch_output.filenames[n] = new_filename;
1835 	} else {
1836 		free(new_filename);
1837 	}
1838 
1839 	/* Output tracking events */
1840 	if (!at_exit) {
1841 		record__synthesize(rec, false);
1842 
1843 		/*
1844 		 * In 'perf record --switch-output' without -a,
1845 		 * record__synthesize() in record__switch_output() won't
1846 		 * generate tracking events because there's no thread_map
1847 		 * in evlist. Which causes newly created perf.data doesn't
1848 		 * contain map and comm information.
1849 		 * Create a fake thread_map and directly call
1850 		 * perf_event__synthesize_thread_map() for those events.
1851 		 */
1852 		if (target__none(&rec->opts.target))
1853 			record__synthesize_workload(rec, false);
1854 		write_finished_init(rec, false);
1855 	}
1856 	return fd;
1857 }
1858 
1859 static void __record__read_lost_samples(struct record *rec, struct evsel *evsel,
1860 					struct perf_record_lost_samples *lost,
1861 					int cpu_idx, int thread_idx)
1862 {
1863 	struct perf_counts_values count;
1864 	struct perf_sample_id *sid;
1865 	struct perf_sample sample = {};
1866 	int id_hdr_size;
1867 
1868 	if (perf_evsel__read(&evsel->core, cpu_idx, thread_idx, &count) < 0) {
1869 		pr_err("read LOST count failed\n");
1870 		return;
1871 	}
1872 
1873 	if (count.lost == 0)
1874 		return;
1875 
1876 	lost->lost = count.lost;
1877 	if (evsel->core.ids) {
1878 		sid = xyarray__entry(evsel->core.sample_id, cpu_idx, thread_idx);
1879 		sample.id = sid->id;
1880 	}
1881 
1882 	id_hdr_size = perf_event__synthesize_id_sample((void *)(lost + 1),
1883 						       evsel->core.attr.sample_type, &sample);
1884 	lost->header.size = sizeof(*lost) + id_hdr_size;
1885 	record__write(rec, NULL, lost, lost->header.size);
1886 }
1887 
1888 static void record__read_lost_samples(struct record *rec)
1889 {
1890 	struct perf_session *session = rec->session;
1891 	struct perf_record_lost_samples *lost;
1892 	struct evsel *evsel;
1893 
1894 	/* there was an error during record__open */
1895 	if (session->evlist == NULL)
1896 		return;
1897 
1898 	lost = zalloc(PERF_SAMPLE_MAX_SIZE);
1899 	if (lost == NULL) {
1900 		pr_debug("Memory allocation failed\n");
1901 		return;
1902 	}
1903 
1904 	lost->header.type = PERF_RECORD_LOST_SAMPLES;
1905 
1906 	evlist__for_each_entry(session->evlist, evsel) {
1907 		struct xyarray *xy = evsel->core.sample_id;
1908 
1909 		if (xy == NULL || evsel->core.fd == NULL)
1910 			continue;
1911 		if (xyarray__max_x(evsel->core.fd) != xyarray__max_x(xy) ||
1912 		    xyarray__max_y(evsel->core.fd) != xyarray__max_y(xy)) {
1913 			pr_debug("Unmatched FD vs. sample ID: skip reading LOST count\n");
1914 			continue;
1915 		}
1916 
1917 		for (int x = 0; x < xyarray__max_x(xy); x++) {
1918 			for (int y = 0; y < xyarray__max_y(xy); y++) {
1919 				__record__read_lost_samples(rec, evsel, lost, x, y);
1920 			}
1921 		}
1922 	}
1923 	free(lost);
1924 
1925 }
1926 
1927 static volatile sig_atomic_t workload_exec_errno;
1928 
1929 /*
1930  * evlist__prepare_workload will send a SIGUSR1
1931  * if the fork fails, since we asked by setting its
1932  * want_signal to true.
1933  */
1934 static void workload_exec_failed_signal(int signo __maybe_unused,
1935 					siginfo_t *info,
1936 					void *ucontext __maybe_unused)
1937 {
1938 	workload_exec_errno = info->si_value.sival_int;
1939 	done = 1;
1940 	child_finished = 1;
1941 }
1942 
1943 static void snapshot_sig_handler(int sig);
1944 static void alarm_sig_handler(int sig);
1945 
1946 static const struct perf_event_mmap_page *evlist__pick_pc(struct evlist *evlist)
1947 {
1948 	if (evlist) {
1949 		if (evlist->mmap && evlist->mmap[0].core.base)
1950 			return evlist->mmap[0].core.base;
1951 		if (evlist->overwrite_mmap && evlist->overwrite_mmap[0].core.base)
1952 			return evlist->overwrite_mmap[0].core.base;
1953 	}
1954 	return NULL;
1955 }
1956 
1957 static const struct perf_event_mmap_page *record__pick_pc(struct record *rec)
1958 {
1959 	const struct perf_event_mmap_page *pc = evlist__pick_pc(rec->evlist);
1960 	if (pc)
1961 		return pc;
1962 	return NULL;
1963 }
1964 
1965 static int record__synthesize(struct record *rec, bool tail)
1966 {
1967 	struct perf_session *session = rec->session;
1968 	struct machine *machine = &session->machines.host;
1969 	struct perf_data *data = &rec->data;
1970 	struct record_opts *opts = &rec->opts;
1971 	struct perf_tool *tool = &rec->tool;
1972 	int err = 0;
1973 	event_op f = process_synthesized_event;
1974 
1975 	if (rec->opts.tail_synthesize != tail)
1976 		return 0;
1977 
1978 	if (data->is_pipe) {
1979 		err = perf_event__synthesize_for_pipe(tool, session, data,
1980 						      process_synthesized_event);
1981 		if (err < 0)
1982 			goto out;
1983 
1984 		rec->bytes_written += err;
1985 	}
1986 
1987 	err = perf_event__synth_time_conv(record__pick_pc(rec), tool,
1988 					  process_synthesized_event, machine);
1989 	if (err)
1990 		goto out;
1991 
1992 	/* Synthesize id_index before auxtrace_info */
1993 	err = perf_event__synthesize_id_index(tool,
1994 					      process_synthesized_event,
1995 					      session->evlist, machine);
1996 	if (err)
1997 		goto out;
1998 
1999 	if (rec->opts.full_auxtrace) {
2000 		err = perf_event__synthesize_auxtrace_info(rec->itr, tool,
2001 					session, process_synthesized_event);
2002 		if (err)
2003 			goto out;
2004 	}
2005 
2006 	if (!evlist__exclude_kernel(rec->evlist)) {
2007 		err = perf_event__synthesize_kernel_mmap(tool, process_synthesized_event,
2008 							 machine);
2009 		WARN_ONCE(err < 0, "Couldn't record kernel reference relocation symbol\n"
2010 				   "Symbol resolution may be skewed if relocation was used (e.g. kexec).\n"
2011 				   "Check /proc/kallsyms permission or run as root.\n");
2012 
2013 		err = perf_event__synthesize_modules(tool, process_synthesized_event,
2014 						     machine);
2015 		WARN_ONCE(err < 0, "Couldn't record kernel module information.\n"
2016 				   "Symbol resolution may be skewed if relocation was used (e.g. kexec).\n"
2017 				   "Check /proc/modules permission or run as root.\n");
2018 	}
2019 
2020 	if (perf_guest) {
2021 		machines__process_guests(&session->machines,
2022 					 perf_event__synthesize_guest_os, tool);
2023 	}
2024 
2025 	err = perf_event__synthesize_extra_attr(&rec->tool,
2026 						rec->evlist,
2027 						process_synthesized_event,
2028 						data->is_pipe);
2029 	if (err)
2030 		goto out;
2031 
2032 	err = perf_event__synthesize_thread_map2(&rec->tool, rec->evlist->core.threads,
2033 						 process_synthesized_event,
2034 						NULL);
2035 	if (err < 0) {
2036 		pr_err("Couldn't synthesize thread map.\n");
2037 		return err;
2038 	}
2039 
2040 	err = perf_event__synthesize_cpu_map(&rec->tool, rec->evlist->core.all_cpus,
2041 					     process_synthesized_event, NULL);
2042 	if (err < 0) {
2043 		pr_err("Couldn't synthesize cpu map.\n");
2044 		return err;
2045 	}
2046 
2047 	err = perf_event__synthesize_bpf_events(session, process_synthesized_event,
2048 						machine, opts);
2049 	if (err < 0) {
2050 		pr_warning("Couldn't synthesize bpf events.\n");
2051 		err = 0;
2052 	}
2053 
2054 	if (rec->opts.synth & PERF_SYNTH_CGROUP) {
2055 		err = perf_event__synthesize_cgroups(tool, process_synthesized_event,
2056 						     machine);
2057 		if (err < 0) {
2058 			pr_warning("Couldn't synthesize cgroup events.\n");
2059 			err = 0;
2060 		}
2061 	}
2062 
2063 	if (rec->opts.nr_threads_synthesize > 1) {
2064 		mutex_init(&synth_lock);
2065 		perf_set_multithreaded();
2066 		f = process_locked_synthesized_event;
2067 	}
2068 
2069 	if (rec->opts.synth & PERF_SYNTH_TASK) {
2070 		bool needs_mmap = rec->opts.synth & PERF_SYNTH_MMAP;
2071 
2072 		err = __machine__synthesize_threads(machine, tool, &opts->target,
2073 						    rec->evlist->core.threads,
2074 						    f, needs_mmap, opts->sample_address,
2075 						    rec->opts.nr_threads_synthesize);
2076 	}
2077 
2078 	if (rec->opts.nr_threads_synthesize > 1) {
2079 		perf_set_singlethreaded();
2080 		mutex_destroy(&synth_lock);
2081 	}
2082 
2083 out:
2084 	return err;
2085 }
2086 
2087 static int record__process_signal_event(union perf_event *event __maybe_unused, void *data)
2088 {
2089 	struct record *rec = data;
2090 	pthread_kill(rec->thread_id, SIGUSR2);
2091 	return 0;
2092 }
2093 
2094 static int record__setup_sb_evlist(struct record *rec)
2095 {
2096 	struct record_opts *opts = &rec->opts;
2097 
2098 	if (rec->sb_evlist != NULL) {
2099 		/*
2100 		 * We get here if --switch-output-event populated the
2101 		 * sb_evlist, so associate a callback that will send a SIGUSR2
2102 		 * to the main thread.
2103 		 */
2104 		evlist__set_cb(rec->sb_evlist, record__process_signal_event, rec);
2105 		rec->thread_id = pthread_self();
2106 	}
2107 #ifdef HAVE_LIBBPF_SUPPORT
2108 	if (!opts->no_bpf_event) {
2109 		if (rec->sb_evlist == NULL) {
2110 			rec->sb_evlist = evlist__new();
2111 
2112 			if (rec->sb_evlist == NULL) {
2113 				pr_err("Couldn't create side band evlist.\n.");
2114 				return -1;
2115 			}
2116 		}
2117 
2118 		if (evlist__add_bpf_sb_event(rec->sb_evlist, &rec->session->header.env)) {
2119 			pr_err("Couldn't ask for PERF_RECORD_BPF_EVENT side band events.\n.");
2120 			return -1;
2121 		}
2122 	}
2123 #endif
2124 	if (evlist__start_sb_thread(rec->sb_evlist, &rec->opts.target)) {
2125 		pr_debug("Couldn't start the BPF side band thread:\nBPF programs starting from now on won't be annotatable\n");
2126 		opts->no_bpf_event = true;
2127 	}
2128 
2129 	return 0;
2130 }
2131 
2132 static int record__init_clock(struct record *rec)
2133 {
2134 	struct perf_session *session = rec->session;
2135 	struct timespec ref_clockid;
2136 	struct timeval ref_tod;
2137 	u64 ref;
2138 
2139 	if (!rec->opts.use_clockid)
2140 		return 0;
2141 
2142 	if (rec->opts.use_clockid && rec->opts.clockid_res_ns)
2143 		session->header.env.clock.clockid_res_ns = rec->opts.clockid_res_ns;
2144 
2145 	session->header.env.clock.clockid = rec->opts.clockid;
2146 
2147 	if (gettimeofday(&ref_tod, NULL) != 0) {
2148 		pr_err("gettimeofday failed, cannot set reference time.\n");
2149 		return -1;
2150 	}
2151 
2152 	if (clock_gettime(rec->opts.clockid, &ref_clockid)) {
2153 		pr_err("clock_gettime failed, cannot set reference time.\n");
2154 		return -1;
2155 	}
2156 
2157 	ref = (u64) ref_tod.tv_sec * NSEC_PER_SEC +
2158 	      (u64) ref_tod.tv_usec * NSEC_PER_USEC;
2159 
2160 	session->header.env.clock.tod_ns = ref;
2161 
2162 	ref = (u64) ref_clockid.tv_sec * NSEC_PER_SEC +
2163 	      (u64) ref_clockid.tv_nsec;
2164 
2165 	session->header.env.clock.clockid_ns = ref;
2166 	return 0;
2167 }
2168 
2169 static void hit_auxtrace_snapshot_trigger(struct record *rec)
2170 {
2171 	if (trigger_is_ready(&auxtrace_snapshot_trigger)) {
2172 		trigger_hit(&auxtrace_snapshot_trigger);
2173 		auxtrace_record__snapshot_started = 1;
2174 		if (auxtrace_record__snapshot_start(rec->itr))
2175 			trigger_error(&auxtrace_snapshot_trigger);
2176 	}
2177 }
2178 
2179 static void record__uniquify_name(struct record *rec)
2180 {
2181 	struct evsel *pos;
2182 	struct evlist *evlist = rec->evlist;
2183 	char *new_name;
2184 	int ret;
2185 
2186 	if (!perf_pmu__has_hybrid())
2187 		return;
2188 
2189 	evlist__for_each_entry(evlist, pos) {
2190 		if (!evsel__is_hybrid(pos))
2191 			continue;
2192 
2193 		if (strchr(pos->name, '/'))
2194 			continue;
2195 
2196 		ret = asprintf(&new_name, "%s/%s/",
2197 			       pos->pmu_name, pos->name);
2198 		if (ret) {
2199 			free(pos->name);
2200 			pos->name = new_name;
2201 		}
2202 	}
2203 }
2204 
2205 static int record__terminate_thread(struct record_thread *thread_data)
2206 {
2207 	int err;
2208 	enum thread_msg ack = THREAD_MSG__UNDEFINED;
2209 	pid_t tid = thread_data->tid;
2210 
2211 	close(thread_data->pipes.msg[1]);
2212 	thread_data->pipes.msg[1] = -1;
2213 	err = read(thread_data->pipes.ack[0], &ack, sizeof(ack));
2214 	if (err > 0)
2215 		pr_debug2("threads[%d]: sent %s\n", tid, thread_msg_tags[ack]);
2216 	else
2217 		pr_warning("threads[%d]: failed to receive termination notification from %d\n",
2218 			   thread->tid, tid);
2219 
2220 	return 0;
2221 }
2222 
2223 static int record__start_threads(struct record *rec)
2224 {
2225 	int t, tt, err, ret = 0, nr_threads = rec->nr_threads;
2226 	struct record_thread *thread_data = rec->thread_data;
2227 	sigset_t full, mask;
2228 	pthread_t handle;
2229 	pthread_attr_t attrs;
2230 
2231 	thread = &thread_data[0];
2232 
2233 	if (!record__threads_enabled(rec))
2234 		return 0;
2235 
2236 	sigfillset(&full);
2237 	if (sigprocmask(SIG_SETMASK, &full, &mask)) {
2238 		pr_err("Failed to block signals on threads start: %s\n", strerror(errno));
2239 		return -1;
2240 	}
2241 
2242 	pthread_attr_init(&attrs);
2243 	pthread_attr_setdetachstate(&attrs, PTHREAD_CREATE_DETACHED);
2244 
2245 	for (t = 1; t < nr_threads; t++) {
2246 		enum thread_msg msg = THREAD_MSG__UNDEFINED;
2247 
2248 #ifdef HAVE_PTHREAD_ATTR_SETAFFINITY_NP
2249 		pthread_attr_setaffinity_np(&attrs,
2250 					    MMAP_CPU_MASK_BYTES(&(thread_data[t].mask->affinity)),
2251 					    (cpu_set_t *)(thread_data[t].mask->affinity.bits));
2252 #endif
2253 		if (pthread_create(&handle, &attrs, record__thread, &thread_data[t])) {
2254 			for (tt = 1; tt < t; tt++)
2255 				record__terminate_thread(&thread_data[t]);
2256 			pr_err("Failed to start threads: %s\n", strerror(errno));
2257 			ret = -1;
2258 			goto out_err;
2259 		}
2260 
2261 		err = read(thread_data[t].pipes.ack[0], &msg, sizeof(msg));
2262 		if (err > 0)
2263 			pr_debug2("threads[%d]: sent %s\n", rec->thread_data[t].tid,
2264 				  thread_msg_tags[msg]);
2265 		else
2266 			pr_warning("threads[%d]: failed to receive start notification from %d\n",
2267 				   thread->tid, rec->thread_data[t].tid);
2268 	}
2269 
2270 	sched_setaffinity(0, MMAP_CPU_MASK_BYTES(&thread->mask->affinity),
2271 			(cpu_set_t *)thread->mask->affinity.bits);
2272 
2273 	pr_debug("threads[%d]: started on cpu%d\n", thread->tid, sched_getcpu());
2274 
2275 out_err:
2276 	pthread_attr_destroy(&attrs);
2277 
2278 	if (sigprocmask(SIG_SETMASK, &mask, NULL)) {
2279 		pr_err("Failed to unblock signals on threads start: %s\n", strerror(errno));
2280 		ret = -1;
2281 	}
2282 
2283 	return ret;
2284 }
2285 
2286 static int record__stop_threads(struct record *rec)
2287 {
2288 	int t;
2289 	struct record_thread *thread_data = rec->thread_data;
2290 
2291 	for (t = 1; t < rec->nr_threads; t++)
2292 		record__terminate_thread(&thread_data[t]);
2293 
2294 	for (t = 0; t < rec->nr_threads; t++) {
2295 		rec->samples += thread_data[t].samples;
2296 		if (!record__threads_enabled(rec))
2297 			continue;
2298 		rec->session->bytes_transferred += thread_data[t].bytes_transferred;
2299 		rec->session->bytes_compressed += thread_data[t].bytes_compressed;
2300 		pr_debug("threads[%d]: samples=%lld, wakes=%ld, ", thread_data[t].tid,
2301 			 thread_data[t].samples, thread_data[t].waking);
2302 		if (thread_data[t].bytes_transferred && thread_data[t].bytes_compressed)
2303 			pr_debug("transferred=%" PRIu64 ", compressed=%" PRIu64 "\n",
2304 				 thread_data[t].bytes_transferred, thread_data[t].bytes_compressed);
2305 		else
2306 			pr_debug("written=%" PRIu64 "\n", thread_data[t].bytes_written);
2307 	}
2308 
2309 	return 0;
2310 }
2311 
2312 static unsigned long record__waking(struct record *rec)
2313 {
2314 	int t;
2315 	unsigned long waking = 0;
2316 	struct record_thread *thread_data = rec->thread_data;
2317 
2318 	for (t = 0; t < rec->nr_threads; t++)
2319 		waking += thread_data[t].waking;
2320 
2321 	return waking;
2322 }
2323 
2324 static int __cmd_record(struct record *rec, int argc, const char **argv)
2325 {
2326 	int err;
2327 	int status = 0;
2328 	const bool forks = argc > 0;
2329 	struct perf_tool *tool = &rec->tool;
2330 	struct record_opts *opts = &rec->opts;
2331 	struct perf_data *data = &rec->data;
2332 	struct perf_session *session;
2333 	bool disabled = false, draining = false;
2334 	int fd;
2335 	float ratio = 0;
2336 	enum evlist_ctl_cmd cmd = EVLIST_CTL_CMD_UNSUPPORTED;
2337 
2338 	atexit(record__sig_exit);
2339 	signal(SIGCHLD, sig_handler);
2340 	signal(SIGINT, sig_handler);
2341 	signal(SIGTERM, sig_handler);
2342 	signal(SIGSEGV, sigsegv_handler);
2343 
2344 	if (rec->opts.record_namespaces)
2345 		tool->namespace_events = true;
2346 
2347 	if (rec->opts.record_cgroup) {
2348 #ifdef HAVE_FILE_HANDLE
2349 		tool->cgroup_events = true;
2350 #else
2351 		pr_err("cgroup tracking is not supported\n");
2352 		return -1;
2353 #endif
2354 	}
2355 
2356 	if (rec->opts.auxtrace_snapshot_mode || rec->switch_output.enabled) {
2357 		signal(SIGUSR2, snapshot_sig_handler);
2358 		if (rec->opts.auxtrace_snapshot_mode)
2359 			trigger_on(&auxtrace_snapshot_trigger);
2360 		if (rec->switch_output.enabled)
2361 			trigger_on(&switch_output_trigger);
2362 	} else {
2363 		signal(SIGUSR2, SIG_IGN);
2364 	}
2365 
2366 	session = perf_session__new(data, tool);
2367 	if (IS_ERR(session)) {
2368 		pr_err("Perf session creation failed.\n");
2369 		return PTR_ERR(session);
2370 	}
2371 
2372 	if (record__threads_enabled(rec)) {
2373 		if (perf_data__is_pipe(&rec->data)) {
2374 			pr_err("Parallel trace streaming is not available in pipe mode.\n");
2375 			return -1;
2376 		}
2377 		if (rec->opts.full_auxtrace) {
2378 			pr_err("Parallel trace streaming is not available in AUX area tracing mode.\n");
2379 			return -1;
2380 		}
2381 	}
2382 
2383 	fd = perf_data__fd(data);
2384 	rec->session = session;
2385 
2386 	if (zstd_init(&session->zstd_data, rec->opts.comp_level) < 0) {
2387 		pr_err("Compression initialization failed.\n");
2388 		return -1;
2389 	}
2390 #ifdef HAVE_EVENTFD_SUPPORT
2391 	done_fd = eventfd(0, EFD_NONBLOCK);
2392 	if (done_fd < 0) {
2393 		pr_err("Failed to create wakeup eventfd, error: %m\n");
2394 		status = -1;
2395 		goto out_delete_session;
2396 	}
2397 	err = evlist__add_wakeup_eventfd(rec->evlist, done_fd);
2398 	if (err < 0) {
2399 		pr_err("Failed to add wakeup eventfd to poll list\n");
2400 		status = err;
2401 		goto out_delete_session;
2402 	}
2403 #endif // HAVE_EVENTFD_SUPPORT
2404 
2405 	session->header.env.comp_type  = PERF_COMP_ZSTD;
2406 	session->header.env.comp_level = rec->opts.comp_level;
2407 
2408 	if (rec->opts.kcore &&
2409 	    !record__kcore_readable(&session->machines.host)) {
2410 		pr_err("ERROR: kcore is not readable.\n");
2411 		return -1;
2412 	}
2413 
2414 	if (record__init_clock(rec))
2415 		return -1;
2416 
2417 	record__init_features(rec);
2418 
2419 	if (forks) {
2420 		err = evlist__prepare_workload(rec->evlist, &opts->target, argv, data->is_pipe,
2421 					       workload_exec_failed_signal);
2422 		if (err < 0) {
2423 			pr_err("Couldn't run the workload!\n");
2424 			status = err;
2425 			goto out_delete_session;
2426 		}
2427 	}
2428 
2429 	/*
2430 	 * If we have just single event and are sending data
2431 	 * through pipe, we need to force the ids allocation,
2432 	 * because we synthesize event name through the pipe
2433 	 * and need the id for that.
2434 	 */
2435 	if (data->is_pipe && rec->evlist->core.nr_entries == 1)
2436 		rec->opts.sample_id = true;
2437 
2438 	record__uniquify_name(rec);
2439 
2440 	/* Debug message used by test scripts */
2441 	pr_debug3("perf record opening and mmapping events\n");
2442 	if (record__open(rec) != 0) {
2443 		err = -1;
2444 		goto out_free_threads;
2445 	}
2446 	/* Debug message used by test scripts */
2447 	pr_debug3("perf record done opening and mmapping events\n");
2448 	session->header.env.comp_mmap_len = session->evlist->core.mmap_len;
2449 
2450 	if (rec->opts.kcore) {
2451 		err = record__kcore_copy(&session->machines.host, data);
2452 		if (err) {
2453 			pr_err("ERROR: Failed to copy kcore\n");
2454 			goto out_free_threads;
2455 		}
2456 	}
2457 
2458 	err = bpf__apply_obj_config();
2459 	if (err) {
2460 		char errbuf[BUFSIZ];
2461 
2462 		bpf__strerror_apply_obj_config(err, errbuf, sizeof(errbuf));
2463 		pr_err("ERROR: Apply config to BPF failed: %s\n",
2464 			 errbuf);
2465 		goto out_free_threads;
2466 	}
2467 
2468 	/*
2469 	 * Normally perf_session__new would do this, but it doesn't have the
2470 	 * evlist.
2471 	 */
2472 	if (rec->tool.ordered_events && !evlist__sample_id_all(rec->evlist)) {
2473 		pr_warning("WARNING: No sample_id_all support, falling back to unordered processing\n");
2474 		rec->tool.ordered_events = false;
2475 	}
2476 
2477 	if (!rec->evlist->core.nr_groups)
2478 		perf_header__clear_feat(&session->header, HEADER_GROUP_DESC);
2479 
2480 	if (data->is_pipe) {
2481 		err = perf_header__write_pipe(fd);
2482 		if (err < 0)
2483 			goto out_free_threads;
2484 	} else {
2485 		err = perf_session__write_header(session, rec->evlist, fd, false);
2486 		if (err < 0)
2487 			goto out_free_threads;
2488 	}
2489 
2490 	err = -1;
2491 	if (!rec->no_buildid
2492 	    && !perf_header__has_feat(&session->header, HEADER_BUILD_ID)) {
2493 		pr_err("Couldn't generate buildids. "
2494 		       "Use --no-buildid to profile anyway.\n");
2495 		goto out_free_threads;
2496 	}
2497 
2498 	err = record__setup_sb_evlist(rec);
2499 	if (err)
2500 		goto out_free_threads;
2501 
2502 	err = record__synthesize(rec, false);
2503 	if (err < 0)
2504 		goto out_free_threads;
2505 
2506 	if (rec->realtime_prio) {
2507 		struct sched_param param;
2508 
2509 		param.sched_priority = rec->realtime_prio;
2510 		if (sched_setscheduler(0, SCHED_FIFO, &param)) {
2511 			pr_err("Could not set realtime priority.\n");
2512 			err = -1;
2513 			goto out_free_threads;
2514 		}
2515 	}
2516 
2517 	if (record__start_threads(rec))
2518 		goto out_free_threads;
2519 
2520 	/*
2521 	 * When perf is starting the traced process, all the events
2522 	 * (apart from group members) have enable_on_exec=1 set,
2523 	 * so don't spoil it by prematurely enabling them.
2524 	 */
2525 	if (!target__none(&opts->target) && !opts->initial_delay)
2526 		evlist__enable(rec->evlist);
2527 
2528 	/*
2529 	 * Let the child rip
2530 	 */
2531 	if (forks) {
2532 		struct machine *machine = &session->machines.host;
2533 		union perf_event *event;
2534 		pid_t tgid;
2535 
2536 		event = malloc(sizeof(event->comm) + machine->id_hdr_size);
2537 		if (event == NULL) {
2538 			err = -ENOMEM;
2539 			goto out_child;
2540 		}
2541 
2542 		/*
2543 		 * Some H/W events are generated before COMM event
2544 		 * which is emitted during exec(), so perf script
2545 		 * cannot see a correct process name for those events.
2546 		 * Synthesize COMM event to prevent it.
2547 		 */
2548 		tgid = perf_event__synthesize_comm(tool, event,
2549 						   rec->evlist->workload.pid,
2550 						   process_synthesized_event,
2551 						   machine);
2552 		free(event);
2553 
2554 		if (tgid == -1)
2555 			goto out_child;
2556 
2557 		event = malloc(sizeof(event->namespaces) +
2558 			       (NR_NAMESPACES * sizeof(struct perf_ns_link_info)) +
2559 			       machine->id_hdr_size);
2560 		if (event == NULL) {
2561 			err = -ENOMEM;
2562 			goto out_child;
2563 		}
2564 
2565 		/*
2566 		 * Synthesize NAMESPACES event for the command specified.
2567 		 */
2568 		perf_event__synthesize_namespaces(tool, event,
2569 						  rec->evlist->workload.pid,
2570 						  tgid, process_synthesized_event,
2571 						  machine);
2572 		free(event);
2573 
2574 		evlist__start_workload(rec->evlist);
2575 	}
2576 
2577 	if (opts->initial_delay) {
2578 		pr_info(EVLIST_DISABLED_MSG);
2579 		if (opts->initial_delay > 0) {
2580 			usleep(opts->initial_delay * USEC_PER_MSEC);
2581 			evlist__enable(rec->evlist);
2582 			pr_info(EVLIST_ENABLED_MSG);
2583 		}
2584 	}
2585 
2586 	err = event_enable_timer__start(rec->evlist->eet);
2587 	if (err)
2588 		goto out_child;
2589 
2590 	/* Debug message used by test scripts */
2591 	pr_debug3("perf record has started\n");
2592 	fflush(stderr);
2593 
2594 	trigger_ready(&auxtrace_snapshot_trigger);
2595 	trigger_ready(&switch_output_trigger);
2596 	perf_hooks__invoke_record_start();
2597 
2598 	/*
2599 	 * Must write FINISHED_INIT so it will be seen after all other
2600 	 * synthesized user events, but before any regular events.
2601 	 */
2602 	err = write_finished_init(rec, false);
2603 	if (err < 0)
2604 		goto out_child;
2605 
2606 	for (;;) {
2607 		unsigned long long hits = thread->samples;
2608 
2609 		/*
2610 		 * rec->evlist->bkw_mmap_state is possible to be
2611 		 * BKW_MMAP_EMPTY here: when done == true and
2612 		 * hits != rec->samples in previous round.
2613 		 *
2614 		 * evlist__toggle_bkw_mmap ensure we never
2615 		 * convert BKW_MMAP_EMPTY to BKW_MMAP_DATA_PENDING.
2616 		 */
2617 		if (trigger_is_hit(&switch_output_trigger) || done || draining)
2618 			evlist__toggle_bkw_mmap(rec->evlist, BKW_MMAP_DATA_PENDING);
2619 
2620 		if (record__mmap_read_all(rec, false) < 0) {
2621 			trigger_error(&auxtrace_snapshot_trigger);
2622 			trigger_error(&switch_output_trigger);
2623 			err = -1;
2624 			goto out_child;
2625 		}
2626 
2627 		if (auxtrace_record__snapshot_started) {
2628 			auxtrace_record__snapshot_started = 0;
2629 			if (!trigger_is_error(&auxtrace_snapshot_trigger))
2630 				record__read_auxtrace_snapshot(rec, false);
2631 			if (trigger_is_error(&auxtrace_snapshot_trigger)) {
2632 				pr_err("AUX area tracing snapshot failed\n");
2633 				err = -1;
2634 				goto out_child;
2635 			}
2636 		}
2637 
2638 		if (trigger_is_hit(&switch_output_trigger)) {
2639 			/*
2640 			 * If switch_output_trigger is hit, the data in
2641 			 * overwritable ring buffer should have been collected,
2642 			 * so bkw_mmap_state should be set to BKW_MMAP_EMPTY.
2643 			 *
2644 			 * If SIGUSR2 raise after or during record__mmap_read_all(),
2645 			 * record__mmap_read_all() didn't collect data from
2646 			 * overwritable ring buffer. Read again.
2647 			 */
2648 			if (rec->evlist->bkw_mmap_state == BKW_MMAP_RUNNING)
2649 				continue;
2650 			trigger_ready(&switch_output_trigger);
2651 
2652 			/*
2653 			 * Reenable events in overwrite ring buffer after
2654 			 * record__mmap_read_all(): we should have collected
2655 			 * data from it.
2656 			 */
2657 			evlist__toggle_bkw_mmap(rec->evlist, BKW_MMAP_RUNNING);
2658 
2659 			if (!quiet)
2660 				fprintf(stderr, "[ perf record: dump data: Woken up %ld times ]\n",
2661 					record__waking(rec));
2662 			thread->waking = 0;
2663 			fd = record__switch_output(rec, false);
2664 			if (fd < 0) {
2665 				pr_err("Failed to switch to new file\n");
2666 				trigger_error(&switch_output_trigger);
2667 				err = fd;
2668 				goto out_child;
2669 			}
2670 
2671 			/* re-arm the alarm */
2672 			if (rec->switch_output.time)
2673 				alarm(rec->switch_output.time);
2674 		}
2675 
2676 		if (hits == thread->samples) {
2677 			if (done || draining)
2678 				break;
2679 			err = fdarray__poll(&thread->pollfd, -1);
2680 			/*
2681 			 * Propagate error, only if there's any. Ignore positive
2682 			 * number of returned events and interrupt error.
2683 			 */
2684 			if (err > 0 || (err < 0 && errno == EINTR))
2685 				err = 0;
2686 			thread->waking++;
2687 
2688 			if (fdarray__filter(&thread->pollfd, POLLERR | POLLHUP,
2689 					    record__thread_munmap_filtered, NULL) == 0)
2690 				draining = true;
2691 
2692 			err = record__update_evlist_pollfd_from_thread(rec, rec->evlist, thread);
2693 			if (err)
2694 				goto out_child;
2695 		}
2696 
2697 		if (evlist__ctlfd_process(rec->evlist, &cmd) > 0) {
2698 			switch (cmd) {
2699 			case EVLIST_CTL_CMD_SNAPSHOT:
2700 				hit_auxtrace_snapshot_trigger(rec);
2701 				evlist__ctlfd_ack(rec->evlist);
2702 				break;
2703 			case EVLIST_CTL_CMD_STOP:
2704 				done = 1;
2705 				break;
2706 			case EVLIST_CTL_CMD_ACK:
2707 			case EVLIST_CTL_CMD_UNSUPPORTED:
2708 			case EVLIST_CTL_CMD_ENABLE:
2709 			case EVLIST_CTL_CMD_DISABLE:
2710 			case EVLIST_CTL_CMD_EVLIST:
2711 			case EVLIST_CTL_CMD_PING:
2712 			default:
2713 				break;
2714 			}
2715 		}
2716 
2717 		err = event_enable_timer__process(rec->evlist->eet);
2718 		if (err < 0)
2719 			goto out_child;
2720 		if (err) {
2721 			err = 0;
2722 			done = 1;
2723 		}
2724 
2725 		/*
2726 		 * When perf is starting the traced process, at the end events
2727 		 * die with the process and we wait for that. Thus no need to
2728 		 * disable events in this case.
2729 		 */
2730 		if (done && !disabled && !target__none(&opts->target)) {
2731 			trigger_off(&auxtrace_snapshot_trigger);
2732 			evlist__disable(rec->evlist);
2733 			disabled = true;
2734 		}
2735 	}
2736 
2737 	trigger_off(&auxtrace_snapshot_trigger);
2738 	trigger_off(&switch_output_trigger);
2739 
2740 	if (opts->auxtrace_snapshot_on_exit)
2741 		record__auxtrace_snapshot_exit(rec);
2742 
2743 	if (forks && workload_exec_errno) {
2744 		char msg[STRERR_BUFSIZE], strevsels[2048];
2745 		const char *emsg = str_error_r(workload_exec_errno, msg, sizeof(msg));
2746 
2747 		evlist__scnprintf_evsels(rec->evlist, sizeof(strevsels), strevsels);
2748 
2749 		pr_err("Failed to collect '%s' for the '%s' workload: %s\n",
2750 			strevsels, argv[0], emsg);
2751 		err = -1;
2752 		goto out_child;
2753 	}
2754 
2755 	if (!quiet)
2756 		fprintf(stderr, "[ perf record: Woken up %ld times to write data ]\n",
2757 			record__waking(rec));
2758 
2759 	write_finished_init(rec, true);
2760 
2761 	if (target__none(&rec->opts.target))
2762 		record__synthesize_workload(rec, true);
2763 
2764 out_child:
2765 	record__stop_threads(rec);
2766 	record__mmap_read_all(rec, true);
2767 out_free_threads:
2768 	record__free_thread_data(rec);
2769 	evlist__finalize_ctlfd(rec->evlist);
2770 	record__aio_mmap_read_sync(rec);
2771 
2772 	if (rec->session->bytes_transferred && rec->session->bytes_compressed) {
2773 		ratio = (float)rec->session->bytes_transferred/(float)rec->session->bytes_compressed;
2774 		session->header.env.comp_ratio = ratio + 0.5;
2775 	}
2776 
2777 	if (forks) {
2778 		int exit_status;
2779 
2780 		if (!child_finished)
2781 			kill(rec->evlist->workload.pid, SIGTERM);
2782 
2783 		wait(&exit_status);
2784 
2785 		if (err < 0)
2786 			status = err;
2787 		else if (WIFEXITED(exit_status))
2788 			status = WEXITSTATUS(exit_status);
2789 		else if (WIFSIGNALED(exit_status))
2790 			signr = WTERMSIG(exit_status);
2791 	} else
2792 		status = err;
2793 
2794 	if (rec->off_cpu)
2795 		rec->bytes_written += off_cpu_write(rec->session);
2796 
2797 	record__read_lost_samples(rec);
2798 	record__synthesize(rec, true);
2799 	/* this will be recalculated during process_buildids() */
2800 	rec->samples = 0;
2801 
2802 	if (!err) {
2803 		if (!rec->timestamp_filename) {
2804 			record__finish_output(rec);
2805 		} else {
2806 			fd = record__switch_output(rec, true);
2807 			if (fd < 0) {
2808 				status = fd;
2809 				goto out_delete_session;
2810 			}
2811 		}
2812 	}
2813 
2814 	perf_hooks__invoke_record_end();
2815 
2816 	if (!err && !quiet) {
2817 		char samples[128];
2818 		const char *postfix = rec->timestamp_filename ?
2819 					".<timestamp>" : "";
2820 
2821 		if (rec->samples && !rec->opts.full_auxtrace)
2822 			scnprintf(samples, sizeof(samples),
2823 				  " (%" PRIu64 " samples)", rec->samples);
2824 		else
2825 			samples[0] = '\0';
2826 
2827 		fprintf(stderr,	"[ perf record: Captured and wrote %.3f MB %s%s%s",
2828 			perf_data__size(data) / 1024.0 / 1024.0,
2829 			data->path, postfix, samples);
2830 		if (ratio) {
2831 			fprintf(stderr,	", compressed (original %.3f MB, ratio is %.3f)",
2832 					rec->session->bytes_transferred / 1024.0 / 1024.0,
2833 					ratio);
2834 		}
2835 		fprintf(stderr, " ]\n");
2836 	}
2837 
2838 out_delete_session:
2839 #ifdef HAVE_EVENTFD_SUPPORT
2840 	if (done_fd >= 0) {
2841 		fd = done_fd;
2842 		done_fd = -1;
2843 
2844 		close(fd);
2845 	}
2846 #endif
2847 	zstd_fini(&session->zstd_data);
2848 	perf_session__delete(session);
2849 
2850 	if (!opts->no_bpf_event)
2851 		evlist__stop_sb_thread(rec->sb_evlist);
2852 	return status;
2853 }
2854 
2855 static void callchain_debug(struct callchain_param *callchain)
2856 {
2857 	static const char *str[CALLCHAIN_MAX] = { "NONE", "FP", "DWARF", "LBR" };
2858 
2859 	pr_debug("callchain: type %s\n", str[callchain->record_mode]);
2860 
2861 	if (callchain->record_mode == CALLCHAIN_DWARF)
2862 		pr_debug("callchain: stack dump size %d\n",
2863 			 callchain->dump_size);
2864 }
2865 
2866 int record_opts__parse_callchain(struct record_opts *record,
2867 				 struct callchain_param *callchain,
2868 				 const char *arg, bool unset)
2869 {
2870 	int ret;
2871 	callchain->enabled = !unset;
2872 
2873 	/* --no-call-graph */
2874 	if (unset) {
2875 		callchain->record_mode = CALLCHAIN_NONE;
2876 		pr_debug("callchain: disabled\n");
2877 		return 0;
2878 	}
2879 
2880 	ret = parse_callchain_record_opt(arg, callchain);
2881 	if (!ret) {
2882 		/* Enable data address sampling for DWARF unwind. */
2883 		if (callchain->record_mode == CALLCHAIN_DWARF)
2884 			record->sample_address = true;
2885 		callchain_debug(callchain);
2886 	}
2887 
2888 	return ret;
2889 }
2890 
2891 int record_parse_callchain_opt(const struct option *opt,
2892 			       const char *arg,
2893 			       int unset)
2894 {
2895 	return record_opts__parse_callchain(opt->value, &callchain_param, arg, unset);
2896 }
2897 
2898 int record_callchain_opt(const struct option *opt,
2899 			 const char *arg __maybe_unused,
2900 			 int unset __maybe_unused)
2901 {
2902 	struct callchain_param *callchain = opt->value;
2903 
2904 	callchain->enabled = true;
2905 
2906 	if (callchain->record_mode == CALLCHAIN_NONE)
2907 		callchain->record_mode = CALLCHAIN_FP;
2908 
2909 	callchain_debug(callchain);
2910 	return 0;
2911 }
2912 
2913 static int perf_record_config(const char *var, const char *value, void *cb)
2914 {
2915 	struct record *rec = cb;
2916 
2917 	if (!strcmp(var, "record.build-id")) {
2918 		if (!strcmp(value, "cache"))
2919 			rec->no_buildid_cache = false;
2920 		else if (!strcmp(value, "no-cache"))
2921 			rec->no_buildid_cache = true;
2922 		else if (!strcmp(value, "skip"))
2923 			rec->no_buildid = true;
2924 		else if (!strcmp(value, "mmap"))
2925 			rec->buildid_mmap = true;
2926 		else
2927 			return -1;
2928 		return 0;
2929 	}
2930 	if (!strcmp(var, "record.call-graph")) {
2931 		var = "call-graph.record-mode";
2932 		return perf_default_config(var, value, cb);
2933 	}
2934 #ifdef HAVE_AIO_SUPPORT
2935 	if (!strcmp(var, "record.aio")) {
2936 		rec->opts.nr_cblocks = strtol(value, NULL, 0);
2937 		if (!rec->opts.nr_cblocks)
2938 			rec->opts.nr_cblocks = nr_cblocks_default;
2939 	}
2940 #endif
2941 	if (!strcmp(var, "record.debuginfod")) {
2942 		rec->debuginfod.urls = strdup(value);
2943 		if (!rec->debuginfod.urls)
2944 			return -ENOMEM;
2945 		rec->debuginfod.set = true;
2946 	}
2947 
2948 	return 0;
2949 }
2950 
2951 static int record__parse_event_enable_time(const struct option *opt, const char *str, int unset)
2952 {
2953 	struct record *rec = (struct record *)opt->value;
2954 
2955 	return evlist__parse_event_enable_time(rec->evlist, &rec->opts, str, unset);
2956 }
2957 
2958 static int record__parse_affinity(const struct option *opt, const char *str, int unset)
2959 {
2960 	struct record_opts *opts = (struct record_opts *)opt->value;
2961 
2962 	if (unset || !str)
2963 		return 0;
2964 
2965 	if (!strcasecmp(str, "node"))
2966 		opts->affinity = PERF_AFFINITY_NODE;
2967 	else if (!strcasecmp(str, "cpu"))
2968 		opts->affinity = PERF_AFFINITY_CPU;
2969 
2970 	return 0;
2971 }
2972 
2973 static int record__mmap_cpu_mask_alloc(struct mmap_cpu_mask *mask, int nr_bits)
2974 {
2975 	mask->nbits = nr_bits;
2976 	mask->bits = bitmap_zalloc(mask->nbits);
2977 	if (!mask->bits)
2978 		return -ENOMEM;
2979 
2980 	return 0;
2981 }
2982 
2983 static void record__mmap_cpu_mask_free(struct mmap_cpu_mask *mask)
2984 {
2985 	bitmap_free(mask->bits);
2986 	mask->nbits = 0;
2987 }
2988 
2989 static int record__thread_mask_alloc(struct thread_mask *mask, int nr_bits)
2990 {
2991 	int ret;
2992 
2993 	ret = record__mmap_cpu_mask_alloc(&mask->maps, nr_bits);
2994 	if (ret) {
2995 		mask->affinity.bits = NULL;
2996 		return ret;
2997 	}
2998 
2999 	ret = record__mmap_cpu_mask_alloc(&mask->affinity, nr_bits);
3000 	if (ret) {
3001 		record__mmap_cpu_mask_free(&mask->maps);
3002 		mask->maps.bits = NULL;
3003 	}
3004 
3005 	return ret;
3006 }
3007 
3008 static void record__thread_mask_free(struct thread_mask *mask)
3009 {
3010 	record__mmap_cpu_mask_free(&mask->maps);
3011 	record__mmap_cpu_mask_free(&mask->affinity);
3012 }
3013 
3014 static int record__parse_threads(const struct option *opt, const char *str, int unset)
3015 {
3016 	int s;
3017 	struct record_opts *opts = opt->value;
3018 
3019 	if (unset || !str || !strlen(str)) {
3020 		opts->threads_spec = THREAD_SPEC__CPU;
3021 	} else {
3022 		for (s = 1; s < THREAD_SPEC__MAX; s++) {
3023 			if (s == THREAD_SPEC__USER) {
3024 				opts->threads_user_spec = strdup(str);
3025 				if (!opts->threads_user_spec)
3026 					return -ENOMEM;
3027 				opts->threads_spec = THREAD_SPEC__USER;
3028 				break;
3029 			}
3030 			if (!strncasecmp(str, thread_spec_tags[s], strlen(thread_spec_tags[s]))) {
3031 				opts->threads_spec = s;
3032 				break;
3033 			}
3034 		}
3035 	}
3036 
3037 	if (opts->threads_spec == THREAD_SPEC__USER)
3038 		pr_debug("threads_spec: %s\n", opts->threads_user_spec);
3039 	else
3040 		pr_debug("threads_spec: %s\n", thread_spec_tags[opts->threads_spec]);
3041 
3042 	return 0;
3043 }
3044 
3045 static int parse_output_max_size(const struct option *opt,
3046 				 const char *str, int unset)
3047 {
3048 	unsigned long *s = (unsigned long *)opt->value;
3049 	static struct parse_tag tags_size[] = {
3050 		{ .tag  = 'B', .mult = 1       },
3051 		{ .tag  = 'K', .mult = 1 << 10 },
3052 		{ .tag  = 'M', .mult = 1 << 20 },
3053 		{ .tag  = 'G', .mult = 1 << 30 },
3054 		{ .tag  = 0 },
3055 	};
3056 	unsigned long val;
3057 
3058 	if (unset) {
3059 		*s = 0;
3060 		return 0;
3061 	}
3062 
3063 	val = parse_tag_value(str, tags_size);
3064 	if (val != (unsigned long) -1) {
3065 		*s = val;
3066 		return 0;
3067 	}
3068 
3069 	return -1;
3070 }
3071 
3072 static int record__parse_mmap_pages(const struct option *opt,
3073 				    const char *str,
3074 				    int unset __maybe_unused)
3075 {
3076 	struct record_opts *opts = opt->value;
3077 	char *s, *p;
3078 	unsigned int mmap_pages;
3079 	int ret;
3080 
3081 	if (!str)
3082 		return -EINVAL;
3083 
3084 	s = strdup(str);
3085 	if (!s)
3086 		return -ENOMEM;
3087 
3088 	p = strchr(s, ',');
3089 	if (p)
3090 		*p = '\0';
3091 
3092 	if (*s) {
3093 		ret = __evlist__parse_mmap_pages(&mmap_pages, s);
3094 		if (ret)
3095 			goto out_free;
3096 		opts->mmap_pages = mmap_pages;
3097 	}
3098 
3099 	if (!p) {
3100 		ret = 0;
3101 		goto out_free;
3102 	}
3103 
3104 	ret = __evlist__parse_mmap_pages(&mmap_pages, p + 1);
3105 	if (ret)
3106 		goto out_free;
3107 
3108 	opts->auxtrace_mmap_pages = mmap_pages;
3109 
3110 out_free:
3111 	free(s);
3112 	return ret;
3113 }
3114 
3115 void __weak arch__add_leaf_frame_record_opts(struct record_opts *opts __maybe_unused)
3116 {
3117 }
3118 
3119 static int parse_control_option(const struct option *opt,
3120 				const char *str,
3121 				int unset __maybe_unused)
3122 {
3123 	struct record_opts *opts = opt->value;
3124 
3125 	return evlist__parse_control(str, &opts->ctl_fd, &opts->ctl_fd_ack, &opts->ctl_fd_close);
3126 }
3127 
3128 static void switch_output_size_warn(struct record *rec)
3129 {
3130 	u64 wakeup_size = evlist__mmap_size(rec->opts.mmap_pages);
3131 	struct switch_output *s = &rec->switch_output;
3132 
3133 	wakeup_size /= 2;
3134 
3135 	if (s->size < wakeup_size) {
3136 		char buf[100];
3137 
3138 		unit_number__scnprintf(buf, sizeof(buf), wakeup_size);
3139 		pr_warning("WARNING: switch-output data size lower than "
3140 			   "wakeup kernel buffer size (%s) "
3141 			   "expect bigger perf.data sizes\n", buf);
3142 	}
3143 }
3144 
3145 static int switch_output_setup(struct record *rec)
3146 {
3147 	struct switch_output *s = &rec->switch_output;
3148 	static struct parse_tag tags_size[] = {
3149 		{ .tag  = 'B', .mult = 1       },
3150 		{ .tag  = 'K', .mult = 1 << 10 },
3151 		{ .tag  = 'M', .mult = 1 << 20 },
3152 		{ .tag  = 'G', .mult = 1 << 30 },
3153 		{ .tag  = 0 },
3154 	};
3155 	static struct parse_tag tags_time[] = {
3156 		{ .tag  = 's', .mult = 1        },
3157 		{ .tag  = 'm', .mult = 60       },
3158 		{ .tag  = 'h', .mult = 60*60    },
3159 		{ .tag  = 'd', .mult = 60*60*24 },
3160 		{ .tag  = 0 },
3161 	};
3162 	unsigned long val;
3163 
3164 	/*
3165 	 * If we're using --switch-output-events, then we imply its
3166 	 * --switch-output=signal, as we'll send a SIGUSR2 from the side band
3167 	 *  thread to its parent.
3168 	 */
3169 	if (rec->switch_output_event_set) {
3170 		if (record__threads_enabled(rec)) {
3171 			pr_warning("WARNING: --switch-output-event option is not available in parallel streaming mode.\n");
3172 			return 0;
3173 		}
3174 		goto do_signal;
3175 	}
3176 
3177 	if (!s->set)
3178 		return 0;
3179 
3180 	if (record__threads_enabled(rec)) {
3181 		pr_warning("WARNING: --switch-output option is not available in parallel streaming mode.\n");
3182 		return 0;
3183 	}
3184 
3185 	if (!strcmp(s->str, "signal")) {
3186 do_signal:
3187 		s->signal = true;
3188 		pr_debug("switch-output with SIGUSR2 signal\n");
3189 		goto enabled;
3190 	}
3191 
3192 	val = parse_tag_value(s->str, tags_size);
3193 	if (val != (unsigned long) -1) {
3194 		s->size = val;
3195 		pr_debug("switch-output with %s size threshold\n", s->str);
3196 		goto enabled;
3197 	}
3198 
3199 	val = parse_tag_value(s->str, tags_time);
3200 	if (val != (unsigned long) -1) {
3201 		s->time = val;
3202 		pr_debug("switch-output with %s time threshold (%lu seconds)\n",
3203 			 s->str, s->time);
3204 		goto enabled;
3205 	}
3206 
3207 	return -1;
3208 
3209 enabled:
3210 	rec->timestamp_filename = true;
3211 	s->enabled              = true;
3212 
3213 	if (s->size && !rec->opts.no_buffering)
3214 		switch_output_size_warn(rec);
3215 
3216 	return 0;
3217 }
3218 
3219 static const char * const __record_usage[] = {
3220 	"perf record [<options>] [<command>]",
3221 	"perf record [<options>] -- <command> [<options>]",
3222 	NULL
3223 };
3224 const char * const *record_usage = __record_usage;
3225 
3226 static int build_id__process_mmap(struct perf_tool *tool, union perf_event *event,
3227 				  struct perf_sample *sample, struct machine *machine)
3228 {
3229 	/*
3230 	 * We already have the kernel maps, put in place via perf_session__create_kernel_maps()
3231 	 * no need to add them twice.
3232 	 */
3233 	if (!(event->header.misc & PERF_RECORD_MISC_USER))
3234 		return 0;
3235 	return perf_event__process_mmap(tool, event, sample, machine);
3236 }
3237 
3238 static int build_id__process_mmap2(struct perf_tool *tool, union perf_event *event,
3239 				   struct perf_sample *sample, struct machine *machine)
3240 {
3241 	/*
3242 	 * We already have the kernel maps, put in place via perf_session__create_kernel_maps()
3243 	 * no need to add them twice.
3244 	 */
3245 	if (!(event->header.misc & PERF_RECORD_MISC_USER))
3246 		return 0;
3247 
3248 	return perf_event__process_mmap2(tool, event, sample, machine);
3249 }
3250 
3251 static int process_timestamp_boundary(struct perf_tool *tool,
3252 				      union perf_event *event __maybe_unused,
3253 				      struct perf_sample *sample,
3254 				      struct machine *machine __maybe_unused)
3255 {
3256 	struct record *rec = container_of(tool, struct record, tool);
3257 
3258 	set_timestamp_boundary(rec, sample->time);
3259 	return 0;
3260 }
3261 
3262 static int parse_record_synth_option(const struct option *opt,
3263 				     const char *str,
3264 				     int unset __maybe_unused)
3265 {
3266 	struct record_opts *opts = opt->value;
3267 	char *p = strdup(str);
3268 
3269 	if (p == NULL)
3270 		return -1;
3271 
3272 	opts->synth = parse_synth_opt(p);
3273 	free(p);
3274 
3275 	if (opts->synth < 0) {
3276 		pr_err("Invalid synth option: %s\n", str);
3277 		return -1;
3278 	}
3279 	return 0;
3280 }
3281 
3282 /*
3283  * XXX Ideally would be local to cmd_record() and passed to a record__new
3284  * because we need to have access to it in record__exit, that is called
3285  * after cmd_record() exits, but since record_options need to be accessible to
3286  * builtin-script, leave it here.
3287  *
3288  * At least we don't ouch it in all the other functions here directly.
3289  *
3290  * Just say no to tons of global variables, sigh.
3291  */
3292 static struct record record = {
3293 	.opts = {
3294 		.sample_time	     = true,
3295 		.mmap_pages	     = UINT_MAX,
3296 		.user_freq	     = UINT_MAX,
3297 		.user_interval	     = ULLONG_MAX,
3298 		.freq		     = 4000,
3299 		.target		     = {
3300 			.uses_mmap   = true,
3301 			.default_per_cpu = true,
3302 		},
3303 		.mmap_flush          = MMAP_FLUSH_DEFAULT,
3304 		.nr_threads_synthesize = 1,
3305 		.ctl_fd              = -1,
3306 		.ctl_fd_ack          = -1,
3307 		.synth               = PERF_SYNTH_ALL,
3308 	},
3309 	.tool = {
3310 		.sample		= process_sample_event,
3311 		.fork		= perf_event__process_fork,
3312 		.exit		= perf_event__process_exit,
3313 		.comm		= perf_event__process_comm,
3314 		.namespaces	= perf_event__process_namespaces,
3315 		.mmap		= build_id__process_mmap,
3316 		.mmap2		= build_id__process_mmap2,
3317 		.itrace_start	= process_timestamp_boundary,
3318 		.aux		= process_timestamp_boundary,
3319 		.ordered_events	= true,
3320 	},
3321 };
3322 
3323 const char record_callchain_help[] = CALLCHAIN_RECORD_HELP
3324 	"\n\t\t\t\tDefault: fp";
3325 
3326 static bool dry_run;
3327 
3328 /*
3329  * XXX Will stay a global variable till we fix builtin-script.c to stop messing
3330  * with it and switch to use the library functions in perf_evlist that came
3331  * from builtin-record.c, i.e. use record_opts,
3332  * evlist__prepare_workload, etc instead of fork+exec'in 'perf record',
3333  * using pipes, etc.
3334  */
3335 static struct option __record_options[] = {
3336 	OPT_CALLBACK('e', "event", &record.evlist, "event",
3337 		     "event selector. use 'perf list' to list available events",
3338 		     parse_events_option),
3339 	OPT_CALLBACK(0, "filter", &record.evlist, "filter",
3340 		     "event filter", parse_filter),
3341 	OPT_CALLBACK_NOOPT(0, "exclude-perf", &record.evlist,
3342 			   NULL, "don't record events from perf itself",
3343 			   exclude_perf),
3344 	OPT_STRING('p', "pid", &record.opts.target.pid, "pid",
3345 		    "record events on existing process id"),
3346 	OPT_STRING('t', "tid", &record.opts.target.tid, "tid",
3347 		    "record events on existing thread id"),
3348 	OPT_INTEGER('r', "realtime", &record.realtime_prio,
3349 		    "collect data with this RT SCHED_FIFO priority"),
3350 	OPT_BOOLEAN(0, "no-buffering", &record.opts.no_buffering,
3351 		    "collect data without buffering"),
3352 	OPT_BOOLEAN('R', "raw-samples", &record.opts.raw_samples,
3353 		    "collect raw sample records from all opened counters"),
3354 	OPT_BOOLEAN('a', "all-cpus", &record.opts.target.system_wide,
3355 			    "system-wide collection from all CPUs"),
3356 	OPT_STRING('C', "cpu", &record.opts.target.cpu_list, "cpu",
3357 		    "list of cpus to monitor"),
3358 	OPT_U64('c', "count", &record.opts.user_interval, "event period to sample"),
3359 	OPT_STRING('o', "output", &record.data.path, "file",
3360 		    "output file name"),
3361 	OPT_BOOLEAN_SET('i', "no-inherit", &record.opts.no_inherit,
3362 			&record.opts.no_inherit_set,
3363 			"child tasks do not inherit counters"),
3364 	OPT_BOOLEAN(0, "tail-synthesize", &record.opts.tail_synthesize,
3365 		    "synthesize non-sample events at the end of output"),
3366 	OPT_BOOLEAN(0, "overwrite", &record.opts.overwrite, "use overwrite mode"),
3367 	OPT_BOOLEAN(0, "no-bpf-event", &record.opts.no_bpf_event, "do not record bpf events"),
3368 	OPT_BOOLEAN(0, "strict-freq", &record.opts.strict_freq,
3369 		    "Fail if the specified frequency can't be used"),
3370 	OPT_CALLBACK('F', "freq", &record.opts, "freq or 'max'",
3371 		     "profile at this frequency",
3372 		      record__parse_freq),
3373 	OPT_CALLBACK('m', "mmap-pages", &record.opts, "pages[,pages]",
3374 		     "number of mmap data pages and AUX area tracing mmap pages",
3375 		     record__parse_mmap_pages),
3376 	OPT_CALLBACK(0, "mmap-flush", &record.opts, "number",
3377 		     "Minimal number of bytes that is extracted from mmap data pages (default: 1)",
3378 		     record__mmap_flush_parse),
3379 	OPT_CALLBACK_NOOPT('g', NULL, &callchain_param,
3380 			   NULL, "enables call-graph recording" ,
3381 			   &record_callchain_opt),
3382 	OPT_CALLBACK(0, "call-graph", &record.opts,
3383 		     "record_mode[,record_size]", record_callchain_help,
3384 		     &record_parse_callchain_opt),
3385 	OPT_INCR('v', "verbose", &verbose,
3386 		    "be more verbose (show counter open errors, etc)"),
3387 	OPT_BOOLEAN('q', "quiet", &quiet, "don't print any warnings or messages"),
3388 	OPT_BOOLEAN('s', "stat", &record.opts.inherit_stat,
3389 		    "per thread counts"),
3390 	OPT_BOOLEAN('d', "data", &record.opts.sample_address, "Record the sample addresses"),
3391 	OPT_BOOLEAN(0, "phys-data", &record.opts.sample_phys_addr,
3392 		    "Record the sample physical addresses"),
3393 	OPT_BOOLEAN(0, "data-page-size", &record.opts.sample_data_page_size,
3394 		    "Record the sampled data address data page size"),
3395 	OPT_BOOLEAN(0, "code-page-size", &record.opts.sample_code_page_size,
3396 		    "Record the sampled code address (ip) page size"),
3397 	OPT_BOOLEAN(0, "sample-cpu", &record.opts.sample_cpu, "Record the sample cpu"),
3398 	OPT_BOOLEAN(0, "sample-identifier", &record.opts.sample_identifier,
3399 		    "Record the sample identifier"),
3400 	OPT_BOOLEAN_SET('T', "timestamp", &record.opts.sample_time,
3401 			&record.opts.sample_time_set,
3402 			"Record the sample timestamps"),
3403 	OPT_BOOLEAN_SET('P', "period", &record.opts.period, &record.opts.period_set,
3404 			"Record the sample period"),
3405 	OPT_BOOLEAN('n', "no-samples", &record.opts.no_samples,
3406 		    "don't sample"),
3407 	OPT_BOOLEAN_SET('N', "no-buildid-cache", &record.no_buildid_cache,
3408 			&record.no_buildid_cache_set,
3409 			"do not update the buildid cache"),
3410 	OPT_BOOLEAN_SET('B', "no-buildid", &record.no_buildid,
3411 			&record.no_buildid_set,
3412 			"do not collect buildids in perf.data"),
3413 	OPT_CALLBACK('G', "cgroup", &record.evlist, "name",
3414 		     "monitor event in cgroup name only",
3415 		     parse_cgroups),
3416 	OPT_CALLBACK('D', "delay", &record, "ms",
3417 		     "ms to wait before starting measurement after program start (-1: start with events disabled), "
3418 		     "or ranges of time to enable events e.g. '-D 10-20,30-40'",
3419 		     record__parse_event_enable_time),
3420 	OPT_BOOLEAN(0, "kcore", &record.opts.kcore, "copy /proc/kcore"),
3421 	OPT_STRING('u', "uid", &record.opts.target.uid_str, "user",
3422 		   "user to profile"),
3423 
3424 	OPT_CALLBACK_NOOPT('b', "branch-any", &record.opts.branch_stack,
3425 		     "branch any", "sample any taken branches",
3426 		     parse_branch_stack),
3427 
3428 	OPT_CALLBACK('j', "branch-filter", &record.opts.branch_stack,
3429 		     "branch filter mask", "branch stack filter modes",
3430 		     parse_branch_stack),
3431 	OPT_BOOLEAN('W', "weight", &record.opts.sample_weight,
3432 		    "sample by weight (on special events only)"),
3433 	OPT_BOOLEAN(0, "transaction", &record.opts.sample_transaction,
3434 		    "sample transaction flags (special events only)"),
3435 	OPT_BOOLEAN(0, "per-thread", &record.opts.target.per_thread,
3436 		    "use per-thread mmaps"),
3437 	OPT_CALLBACK_OPTARG('I', "intr-regs", &record.opts.sample_intr_regs, NULL, "any register",
3438 		    "sample selected machine registers on interrupt,"
3439 		    " use '-I?' to list register names", parse_intr_regs),
3440 	OPT_CALLBACK_OPTARG(0, "user-regs", &record.opts.sample_user_regs, NULL, "any register",
3441 		    "sample selected machine registers on interrupt,"
3442 		    " use '--user-regs=?' to list register names", parse_user_regs),
3443 	OPT_BOOLEAN(0, "running-time", &record.opts.running_time,
3444 		    "Record running/enabled time of read (:S) events"),
3445 	OPT_CALLBACK('k', "clockid", &record.opts,
3446 	"clockid", "clockid to use for events, see clock_gettime()",
3447 	parse_clockid),
3448 	OPT_STRING_OPTARG('S', "snapshot", &record.opts.auxtrace_snapshot_opts,
3449 			  "opts", "AUX area tracing Snapshot Mode", ""),
3450 	OPT_STRING_OPTARG(0, "aux-sample", &record.opts.auxtrace_sample_opts,
3451 			  "opts", "sample AUX area", ""),
3452 	OPT_UINTEGER(0, "proc-map-timeout", &proc_map_timeout,
3453 			"per thread proc mmap processing timeout in ms"),
3454 	OPT_BOOLEAN(0, "namespaces", &record.opts.record_namespaces,
3455 		    "Record namespaces events"),
3456 	OPT_BOOLEAN(0, "all-cgroups", &record.opts.record_cgroup,
3457 		    "Record cgroup events"),
3458 	OPT_BOOLEAN_SET(0, "switch-events", &record.opts.record_switch_events,
3459 			&record.opts.record_switch_events_set,
3460 			"Record context switch events"),
3461 	OPT_BOOLEAN_FLAG(0, "all-kernel", &record.opts.all_kernel,
3462 			 "Configure all used events to run in kernel space.",
3463 			 PARSE_OPT_EXCLUSIVE),
3464 	OPT_BOOLEAN_FLAG(0, "all-user", &record.opts.all_user,
3465 			 "Configure all used events to run in user space.",
3466 			 PARSE_OPT_EXCLUSIVE),
3467 	OPT_BOOLEAN(0, "kernel-callchains", &record.opts.kernel_callchains,
3468 		    "collect kernel callchains"),
3469 	OPT_BOOLEAN(0, "user-callchains", &record.opts.user_callchains,
3470 		    "collect user callchains"),
3471 	OPT_STRING(0, "clang-path", &llvm_param.clang_path, "clang path",
3472 		   "clang binary to use for compiling BPF scriptlets"),
3473 	OPT_STRING(0, "clang-opt", &llvm_param.clang_opt, "clang options",
3474 		   "options passed to clang when compiling BPF scriptlets"),
3475 	OPT_STRING(0, "vmlinux", &symbol_conf.vmlinux_name,
3476 		   "file", "vmlinux pathname"),
3477 	OPT_BOOLEAN(0, "buildid-all", &record.buildid_all,
3478 		    "Record build-id of all DSOs regardless of hits"),
3479 	OPT_BOOLEAN(0, "buildid-mmap", &record.buildid_mmap,
3480 		    "Record build-id in map events"),
3481 	OPT_BOOLEAN(0, "timestamp-filename", &record.timestamp_filename,
3482 		    "append timestamp to output filename"),
3483 	OPT_BOOLEAN(0, "timestamp-boundary", &record.timestamp_boundary,
3484 		    "Record timestamp boundary (time of first/last samples)"),
3485 	OPT_STRING_OPTARG_SET(0, "switch-output", &record.switch_output.str,
3486 			  &record.switch_output.set, "signal or size[BKMG] or time[smhd]",
3487 			  "Switch output when receiving SIGUSR2 (signal) or cross a size or time threshold",
3488 			  "signal"),
3489 	OPT_CALLBACK_SET(0, "switch-output-event", &record.sb_evlist, &record.switch_output_event_set, "switch output event",
3490 			 "switch output event selector. use 'perf list' to list available events",
3491 			 parse_events_option_new_evlist),
3492 	OPT_INTEGER(0, "switch-max-files", &record.switch_output.num_files,
3493 		   "Limit number of switch output generated files"),
3494 	OPT_BOOLEAN(0, "dry-run", &dry_run,
3495 		    "Parse options then exit"),
3496 #ifdef HAVE_AIO_SUPPORT
3497 	OPT_CALLBACK_OPTARG(0, "aio", &record.opts,
3498 		     &nr_cblocks_default, "n", "Use <n> control blocks in asynchronous trace writing mode (default: 1, max: 4)",
3499 		     record__aio_parse),
3500 #endif
3501 	OPT_CALLBACK(0, "affinity", &record.opts, "node|cpu",
3502 		     "Set affinity mask of trace reading thread to NUMA node cpu mask or cpu of processed mmap buffer",
3503 		     record__parse_affinity),
3504 #ifdef HAVE_ZSTD_SUPPORT
3505 	OPT_CALLBACK_OPTARG('z', "compression-level", &record.opts, &comp_level_default, "n",
3506 			    "Compress records using specified level (default: 1 - fastest compression, 22 - greatest compression)",
3507 			    record__parse_comp_level),
3508 #endif
3509 	OPT_CALLBACK(0, "max-size", &record.output_max_size,
3510 		     "size", "Limit the maximum size of the output file", parse_output_max_size),
3511 	OPT_UINTEGER(0, "num-thread-synthesize",
3512 		     &record.opts.nr_threads_synthesize,
3513 		     "number of threads to run for event synthesis"),
3514 #ifdef HAVE_LIBPFM
3515 	OPT_CALLBACK(0, "pfm-events", &record.evlist, "event",
3516 		"libpfm4 event selector. use 'perf list' to list available events",
3517 		parse_libpfm_events_option),
3518 #endif
3519 	OPT_CALLBACK(0, "control", &record.opts, "fd:ctl-fd[,ack-fd] or fifo:ctl-fifo[,ack-fifo]",
3520 		     "Listen on ctl-fd descriptor for command to control measurement ('enable': enable events, 'disable': disable events,\n"
3521 		     "\t\t\t  'snapshot': AUX area tracing snapshot).\n"
3522 		     "\t\t\t  Optionally send control command completion ('ack\\n') to ack-fd descriptor.\n"
3523 		     "\t\t\t  Alternatively, ctl-fifo / ack-fifo will be opened and used as ctl-fd / ack-fd.",
3524 		      parse_control_option),
3525 	OPT_CALLBACK(0, "synth", &record.opts, "no|all|task|mmap|cgroup",
3526 		     "Fine-tune event synthesis: default=all", parse_record_synth_option),
3527 	OPT_STRING_OPTARG_SET(0, "debuginfod", &record.debuginfod.urls,
3528 			  &record.debuginfod.set, "debuginfod urls",
3529 			  "Enable debuginfod data retrieval from DEBUGINFOD_URLS or specified urls",
3530 			  "system"),
3531 	OPT_CALLBACK_OPTARG(0, "threads", &record.opts, NULL, "spec",
3532 			    "write collected trace data into several data files using parallel threads",
3533 			    record__parse_threads),
3534 	OPT_BOOLEAN(0, "off-cpu", &record.off_cpu, "Enable off-cpu analysis"),
3535 	OPT_END()
3536 };
3537 
3538 struct option *record_options = __record_options;
3539 
3540 static int record__mmap_cpu_mask_init(struct mmap_cpu_mask *mask, struct perf_cpu_map *cpus)
3541 {
3542 	struct perf_cpu cpu;
3543 	int idx;
3544 
3545 	if (cpu_map__is_dummy(cpus))
3546 		return 0;
3547 
3548 	perf_cpu_map__for_each_cpu(cpu, idx, cpus) {
3549 		if (cpu.cpu == -1)
3550 			continue;
3551 		/* Return ENODEV is input cpu is greater than max cpu */
3552 		if ((unsigned long)cpu.cpu > mask->nbits)
3553 			return -ENODEV;
3554 		__set_bit(cpu.cpu, mask->bits);
3555 	}
3556 
3557 	return 0;
3558 }
3559 
3560 static int record__mmap_cpu_mask_init_spec(struct mmap_cpu_mask *mask, const char *mask_spec)
3561 {
3562 	struct perf_cpu_map *cpus;
3563 
3564 	cpus = perf_cpu_map__new(mask_spec);
3565 	if (!cpus)
3566 		return -ENOMEM;
3567 
3568 	bitmap_zero(mask->bits, mask->nbits);
3569 	if (record__mmap_cpu_mask_init(mask, cpus))
3570 		return -ENODEV;
3571 
3572 	perf_cpu_map__put(cpus);
3573 
3574 	return 0;
3575 }
3576 
3577 static void record__free_thread_masks(struct record *rec, int nr_threads)
3578 {
3579 	int t;
3580 
3581 	if (rec->thread_masks)
3582 		for (t = 0; t < nr_threads; t++)
3583 			record__thread_mask_free(&rec->thread_masks[t]);
3584 
3585 	zfree(&rec->thread_masks);
3586 }
3587 
3588 static int record__alloc_thread_masks(struct record *rec, int nr_threads, int nr_bits)
3589 {
3590 	int t, ret;
3591 
3592 	rec->thread_masks = zalloc(nr_threads * sizeof(*(rec->thread_masks)));
3593 	if (!rec->thread_masks) {
3594 		pr_err("Failed to allocate thread masks\n");
3595 		return -ENOMEM;
3596 	}
3597 
3598 	for (t = 0; t < nr_threads; t++) {
3599 		ret = record__thread_mask_alloc(&rec->thread_masks[t], nr_bits);
3600 		if (ret) {
3601 			pr_err("Failed to allocate thread masks[%d]\n", t);
3602 			goto out_free;
3603 		}
3604 	}
3605 
3606 	return 0;
3607 
3608 out_free:
3609 	record__free_thread_masks(rec, nr_threads);
3610 
3611 	return ret;
3612 }
3613 
3614 static int record__init_thread_cpu_masks(struct record *rec, struct perf_cpu_map *cpus)
3615 {
3616 	int t, ret, nr_cpus = perf_cpu_map__nr(cpus);
3617 
3618 	ret = record__alloc_thread_masks(rec, nr_cpus, cpu__max_cpu().cpu);
3619 	if (ret)
3620 		return ret;
3621 
3622 	rec->nr_threads = nr_cpus;
3623 	pr_debug("nr_threads: %d\n", rec->nr_threads);
3624 
3625 	for (t = 0; t < rec->nr_threads; t++) {
3626 		__set_bit(perf_cpu_map__cpu(cpus, t).cpu, rec->thread_masks[t].maps.bits);
3627 		__set_bit(perf_cpu_map__cpu(cpus, t).cpu, rec->thread_masks[t].affinity.bits);
3628 		if (verbose > 0) {
3629 			pr_debug("thread_masks[%d]: ", t);
3630 			mmap_cpu_mask__scnprintf(&rec->thread_masks[t].maps, "maps");
3631 			pr_debug("thread_masks[%d]: ", t);
3632 			mmap_cpu_mask__scnprintf(&rec->thread_masks[t].affinity, "affinity");
3633 		}
3634 	}
3635 
3636 	return 0;
3637 }
3638 
3639 static int record__init_thread_masks_spec(struct record *rec, struct perf_cpu_map *cpus,
3640 					  const char **maps_spec, const char **affinity_spec,
3641 					  u32 nr_spec)
3642 {
3643 	u32 s;
3644 	int ret = 0, t = 0;
3645 	struct mmap_cpu_mask cpus_mask;
3646 	struct thread_mask thread_mask, full_mask, *thread_masks;
3647 
3648 	ret = record__mmap_cpu_mask_alloc(&cpus_mask, cpu__max_cpu().cpu);
3649 	if (ret) {
3650 		pr_err("Failed to allocate CPUs mask\n");
3651 		return ret;
3652 	}
3653 
3654 	ret = record__mmap_cpu_mask_init(&cpus_mask, cpus);
3655 	if (ret) {
3656 		pr_err("Failed to init cpu mask\n");
3657 		goto out_free_cpu_mask;
3658 	}
3659 
3660 	ret = record__thread_mask_alloc(&full_mask, cpu__max_cpu().cpu);
3661 	if (ret) {
3662 		pr_err("Failed to allocate full mask\n");
3663 		goto out_free_cpu_mask;
3664 	}
3665 
3666 	ret = record__thread_mask_alloc(&thread_mask, cpu__max_cpu().cpu);
3667 	if (ret) {
3668 		pr_err("Failed to allocate thread mask\n");
3669 		goto out_free_full_and_cpu_masks;
3670 	}
3671 
3672 	for (s = 0; s < nr_spec; s++) {
3673 		ret = record__mmap_cpu_mask_init_spec(&thread_mask.maps, maps_spec[s]);
3674 		if (ret) {
3675 			pr_err("Failed to initialize maps thread mask\n");
3676 			goto out_free;
3677 		}
3678 		ret = record__mmap_cpu_mask_init_spec(&thread_mask.affinity, affinity_spec[s]);
3679 		if (ret) {
3680 			pr_err("Failed to initialize affinity thread mask\n");
3681 			goto out_free;
3682 		}
3683 
3684 		/* ignore invalid CPUs but do not allow empty masks */
3685 		if (!bitmap_and(thread_mask.maps.bits, thread_mask.maps.bits,
3686 				cpus_mask.bits, thread_mask.maps.nbits)) {
3687 			pr_err("Empty maps mask: %s\n", maps_spec[s]);
3688 			ret = -EINVAL;
3689 			goto out_free;
3690 		}
3691 		if (!bitmap_and(thread_mask.affinity.bits, thread_mask.affinity.bits,
3692 				cpus_mask.bits, thread_mask.affinity.nbits)) {
3693 			pr_err("Empty affinity mask: %s\n", affinity_spec[s]);
3694 			ret = -EINVAL;
3695 			goto out_free;
3696 		}
3697 
3698 		/* do not allow intersection with other masks (full_mask) */
3699 		if (bitmap_intersects(thread_mask.maps.bits, full_mask.maps.bits,
3700 				      thread_mask.maps.nbits)) {
3701 			pr_err("Intersecting maps mask: %s\n", maps_spec[s]);
3702 			ret = -EINVAL;
3703 			goto out_free;
3704 		}
3705 		if (bitmap_intersects(thread_mask.affinity.bits, full_mask.affinity.bits,
3706 				      thread_mask.affinity.nbits)) {
3707 			pr_err("Intersecting affinity mask: %s\n", affinity_spec[s]);
3708 			ret = -EINVAL;
3709 			goto out_free;
3710 		}
3711 
3712 		bitmap_or(full_mask.maps.bits, full_mask.maps.bits,
3713 			  thread_mask.maps.bits, full_mask.maps.nbits);
3714 		bitmap_or(full_mask.affinity.bits, full_mask.affinity.bits,
3715 			  thread_mask.affinity.bits, full_mask.maps.nbits);
3716 
3717 		thread_masks = realloc(rec->thread_masks, (t + 1) * sizeof(struct thread_mask));
3718 		if (!thread_masks) {
3719 			pr_err("Failed to reallocate thread masks\n");
3720 			ret = -ENOMEM;
3721 			goto out_free;
3722 		}
3723 		rec->thread_masks = thread_masks;
3724 		rec->thread_masks[t] = thread_mask;
3725 		if (verbose > 0) {
3726 			pr_debug("thread_masks[%d]: ", t);
3727 			mmap_cpu_mask__scnprintf(&rec->thread_masks[t].maps, "maps");
3728 			pr_debug("thread_masks[%d]: ", t);
3729 			mmap_cpu_mask__scnprintf(&rec->thread_masks[t].affinity, "affinity");
3730 		}
3731 		t++;
3732 		ret = record__thread_mask_alloc(&thread_mask, cpu__max_cpu().cpu);
3733 		if (ret) {
3734 			pr_err("Failed to allocate thread mask\n");
3735 			goto out_free_full_and_cpu_masks;
3736 		}
3737 	}
3738 	rec->nr_threads = t;
3739 	pr_debug("nr_threads: %d\n", rec->nr_threads);
3740 	if (!rec->nr_threads)
3741 		ret = -EINVAL;
3742 
3743 out_free:
3744 	record__thread_mask_free(&thread_mask);
3745 out_free_full_and_cpu_masks:
3746 	record__thread_mask_free(&full_mask);
3747 out_free_cpu_mask:
3748 	record__mmap_cpu_mask_free(&cpus_mask);
3749 
3750 	return ret;
3751 }
3752 
3753 static int record__init_thread_core_masks(struct record *rec, struct perf_cpu_map *cpus)
3754 {
3755 	int ret;
3756 	struct cpu_topology *topo;
3757 
3758 	topo = cpu_topology__new();
3759 	if (!topo) {
3760 		pr_err("Failed to allocate CPU topology\n");
3761 		return -ENOMEM;
3762 	}
3763 
3764 	ret = record__init_thread_masks_spec(rec, cpus, topo->core_cpus_list,
3765 					     topo->core_cpus_list, topo->core_cpus_lists);
3766 	cpu_topology__delete(topo);
3767 
3768 	return ret;
3769 }
3770 
3771 static int record__init_thread_package_masks(struct record *rec, struct perf_cpu_map *cpus)
3772 {
3773 	int ret;
3774 	struct cpu_topology *topo;
3775 
3776 	topo = cpu_topology__new();
3777 	if (!topo) {
3778 		pr_err("Failed to allocate CPU topology\n");
3779 		return -ENOMEM;
3780 	}
3781 
3782 	ret = record__init_thread_masks_spec(rec, cpus, topo->package_cpus_list,
3783 					     topo->package_cpus_list, topo->package_cpus_lists);
3784 	cpu_topology__delete(topo);
3785 
3786 	return ret;
3787 }
3788 
3789 static int record__init_thread_numa_masks(struct record *rec, struct perf_cpu_map *cpus)
3790 {
3791 	u32 s;
3792 	int ret;
3793 	const char **spec;
3794 	struct numa_topology *topo;
3795 
3796 	topo = numa_topology__new();
3797 	if (!topo) {
3798 		pr_err("Failed to allocate NUMA topology\n");
3799 		return -ENOMEM;
3800 	}
3801 
3802 	spec = zalloc(topo->nr * sizeof(char *));
3803 	if (!spec) {
3804 		pr_err("Failed to allocate NUMA spec\n");
3805 		ret = -ENOMEM;
3806 		goto out_delete_topo;
3807 	}
3808 	for (s = 0; s < topo->nr; s++)
3809 		spec[s] = topo->nodes[s].cpus;
3810 
3811 	ret = record__init_thread_masks_spec(rec, cpus, spec, spec, topo->nr);
3812 
3813 	zfree(&spec);
3814 
3815 out_delete_topo:
3816 	numa_topology__delete(topo);
3817 
3818 	return ret;
3819 }
3820 
3821 static int record__init_thread_user_masks(struct record *rec, struct perf_cpu_map *cpus)
3822 {
3823 	int t, ret;
3824 	u32 s, nr_spec = 0;
3825 	char **maps_spec = NULL, **affinity_spec = NULL, **tmp_spec;
3826 	char *user_spec, *spec, *spec_ptr, *mask, *mask_ptr, *dup_mask = NULL;
3827 
3828 	for (t = 0, user_spec = (char *)rec->opts.threads_user_spec; ; t++, user_spec = NULL) {
3829 		spec = strtok_r(user_spec, ":", &spec_ptr);
3830 		if (spec == NULL)
3831 			break;
3832 		pr_debug2("threads_spec[%d]: %s\n", t, spec);
3833 		mask = strtok_r(spec, "/", &mask_ptr);
3834 		if (mask == NULL)
3835 			break;
3836 		pr_debug2("  maps mask: %s\n", mask);
3837 		tmp_spec = realloc(maps_spec, (nr_spec + 1) * sizeof(char *));
3838 		if (!tmp_spec) {
3839 			pr_err("Failed to reallocate maps spec\n");
3840 			ret = -ENOMEM;
3841 			goto out_free;
3842 		}
3843 		maps_spec = tmp_spec;
3844 		maps_spec[nr_spec] = dup_mask = strdup(mask);
3845 		if (!maps_spec[nr_spec]) {
3846 			pr_err("Failed to allocate maps spec[%d]\n", nr_spec);
3847 			ret = -ENOMEM;
3848 			goto out_free;
3849 		}
3850 		mask = strtok_r(NULL, "/", &mask_ptr);
3851 		if (mask == NULL) {
3852 			pr_err("Invalid thread maps or affinity specs\n");
3853 			ret = -EINVAL;
3854 			goto out_free;
3855 		}
3856 		pr_debug2("  affinity mask: %s\n", mask);
3857 		tmp_spec = realloc(affinity_spec, (nr_spec + 1) * sizeof(char *));
3858 		if (!tmp_spec) {
3859 			pr_err("Failed to reallocate affinity spec\n");
3860 			ret = -ENOMEM;
3861 			goto out_free;
3862 		}
3863 		affinity_spec = tmp_spec;
3864 		affinity_spec[nr_spec] = strdup(mask);
3865 		if (!affinity_spec[nr_spec]) {
3866 			pr_err("Failed to allocate affinity spec[%d]\n", nr_spec);
3867 			ret = -ENOMEM;
3868 			goto out_free;
3869 		}
3870 		dup_mask = NULL;
3871 		nr_spec++;
3872 	}
3873 
3874 	ret = record__init_thread_masks_spec(rec, cpus, (const char **)maps_spec,
3875 					     (const char **)affinity_spec, nr_spec);
3876 
3877 out_free:
3878 	free(dup_mask);
3879 	for (s = 0; s < nr_spec; s++) {
3880 		if (maps_spec)
3881 			free(maps_spec[s]);
3882 		if (affinity_spec)
3883 			free(affinity_spec[s]);
3884 	}
3885 	free(affinity_spec);
3886 	free(maps_spec);
3887 
3888 	return ret;
3889 }
3890 
3891 static int record__init_thread_default_masks(struct record *rec, struct perf_cpu_map *cpus)
3892 {
3893 	int ret;
3894 
3895 	ret = record__alloc_thread_masks(rec, 1, cpu__max_cpu().cpu);
3896 	if (ret)
3897 		return ret;
3898 
3899 	if (record__mmap_cpu_mask_init(&rec->thread_masks->maps, cpus))
3900 		return -ENODEV;
3901 
3902 	rec->nr_threads = 1;
3903 
3904 	return 0;
3905 }
3906 
3907 static int record__init_thread_masks(struct record *rec)
3908 {
3909 	int ret = 0;
3910 	struct perf_cpu_map *cpus = rec->evlist->core.all_cpus;
3911 
3912 	if (!record__threads_enabled(rec))
3913 		return record__init_thread_default_masks(rec, cpus);
3914 
3915 	if (evlist__per_thread(rec->evlist)) {
3916 		pr_err("--per-thread option is mutually exclusive to parallel streaming mode.\n");
3917 		return -EINVAL;
3918 	}
3919 
3920 	switch (rec->opts.threads_spec) {
3921 	case THREAD_SPEC__CPU:
3922 		ret = record__init_thread_cpu_masks(rec, cpus);
3923 		break;
3924 	case THREAD_SPEC__CORE:
3925 		ret = record__init_thread_core_masks(rec, cpus);
3926 		break;
3927 	case THREAD_SPEC__PACKAGE:
3928 		ret = record__init_thread_package_masks(rec, cpus);
3929 		break;
3930 	case THREAD_SPEC__NUMA:
3931 		ret = record__init_thread_numa_masks(rec, cpus);
3932 		break;
3933 	case THREAD_SPEC__USER:
3934 		ret = record__init_thread_user_masks(rec, cpus);
3935 		break;
3936 	default:
3937 		break;
3938 	}
3939 
3940 	return ret;
3941 }
3942 
3943 int cmd_record(int argc, const char **argv)
3944 {
3945 	int err;
3946 	struct record *rec = &record;
3947 	char errbuf[BUFSIZ];
3948 
3949 	setlocale(LC_ALL, "");
3950 
3951 #ifndef HAVE_LIBBPF_SUPPORT
3952 # define set_nobuild(s, l, c) set_option_nobuild(record_options, s, l, "NO_LIBBPF=1", c)
3953 	set_nobuild('\0', "clang-path", true);
3954 	set_nobuild('\0', "clang-opt", true);
3955 # undef set_nobuild
3956 #endif
3957 
3958 #ifndef HAVE_BPF_PROLOGUE
3959 # if !defined (HAVE_DWARF_SUPPORT)
3960 #  define REASON  "NO_DWARF=1"
3961 # elif !defined (HAVE_LIBBPF_SUPPORT)
3962 #  define REASON  "NO_LIBBPF=1"
3963 # else
3964 #  define REASON  "this architecture doesn't support BPF prologue"
3965 # endif
3966 # define set_nobuild(s, l, c) set_option_nobuild(record_options, s, l, REASON, c)
3967 	set_nobuild('\0', "vmlinux", true);
3968 # undef set_nobuild
3969 # undef REASON
3970 #endif
3971 
3972 #ifndef HAVE_BPF_SKEL
3973 # define set_nobuild(s, l, m, c) set_option_nobuild(record_options, s, l, m, c)
3974 	set_nobuild('\0', "off-cpu", "no BUILD_BPF_SKEL=1", true);
3975 # undef set_nobuild
3976 #endif
3977 
3978 	rec->opts.affinity = PERF_AFFINITY_SYS;
3979 
3980 	rec->evlist = evlist__new();
3981 	if (rec->evlist == NULL)
3982 		return -ENOMEM;
3983 
3984 	err = perf_config(perf_record_config, rec);
3985 	if (err)
3986 		return err;
3987 
3988 	argc = parse_options(argc, argv, record_options, record_usage,
3989 			    PARSE_OPT_STOP_AT_NON_OPTION);
3990 	if (quiet)
3991 		perf_quiet_option();
3992 
3993 	err = symbol__validate_sym_arguments();
3994 	if (err)
3995 		return err;
3996 
3997 	perf_debuginfod_setup(&record.debuginfod);
3998 
3999 	/* Make system wide (-a) the default target. */
4000 	if (!argc && target__none(&rec->opts.target))
4001 		rec->opts.target.system_wide = true;
4002 
4003 	if (nr_cgroups && !rec->opts.target.system_wide) {
4004 		usage_with_options_msg(record_usage, record_options,
4005 			"cgroup monitoring only available in system-wide mode");
4006 
4007 	}
4008 
4009 	if (rec->buildid_mmap) {
4010 		if (!perf_can_record_build_id()) {
4011 			pr_err("Failed: no support to record build id in mmap events, update your kernel.\n");
4012 			err = -EINVAL;
4013 			goto out_opts;
4014 		}
4015 		pr_debug("Enabling build id in mmap2 events.\n");
4016 		/* Enable mmap build id synthesizing. */
4017 		symbol_conf.buildid_mmap2 = true;
4018 		/* Enable perf_event_attr::build_id bit. */
4019 		rec->opts.build_id = true;
4020 		/* Disable build id cache. */
4021 		rec->no_buildid = true;
4022 	}
4023 
4024 	if (rec->opts.record_cgroup && !perf_can_record_cgroup()) {
4025 		pr_err("Kernel has no cgroup sampling support.\n");
4026 		err = -EINVAL;
4027 		goto out_opts;
4028 	}
4029 
4030 	if (rec->opts.kcore)
4031 		rec->opts.text_poke = true;
4032 
4033 	if (rec->opts.kcore || record__threads_enabled(rec))
4034 		rec->data.is_dir = true;
4035 
4036 	if (record__threads_enabled(rec)) {
4037 		if (rec->opts.affinity != PERF_AFFINITY_SYS) {
4038 			pr_err("--affinity option is mutually exclusive to parallel streaming mode.\n");
4039 			goto out_opts;
4040 		}
4041 		if (record__aio_enabled(rec)) {
4042 			pr_err("Asynchronous streaming mode (--aio) is mutually exclusive to parallel streaming mode.\n");
4043 			goto out_opts;
4044 		}
4045 	}
4046 
4047 	if (rec->opts.comp_level != 0) {
4048 		pr_debug("Compression enabled, disabling build id collection at the end of the session.\n");
4049 		rec->no_buildid = true;
4050 	}
4051 
4052 	if (rec->opts.record_switch_events &&
4053 	    !perf_can_record_switch_events()) {
4054 		ui__error("kernel does not support recording context switch events\n");
4055 		parse_options_usage(record_usage, record_options, "switch-events", 0);
4056 		err = -EINVAL;
4057 		goto out_opts;
4058 	}
4059 
4060 	if (switch_output_setup(rec)) {
4061 		parse_options_usage(record_usage, record_options, "switch-output", 0);
4062 		err = -EINVAL;
4063 		goto out_opts;
4064 	}
4065 
4066 	if (rec->switch_output.time) {
4067 		signal(SIGALRM, alarm_sig_handler);
4068 		alarm(rec->switch_output.time);
4069 	}
4070 
4071 	if (rec->switch_output.num_files) {
4072 		rec->switch_output.filenames = calloc(sizeof(char *),
4073 						      rec->switch_output.num_files);
4074 		if (!rec->switch_output.filenames) {
4075 			err = -EINVAL;
4076 			goto out_opts;
4077 		}
4078 	}
4079 
4080 	if (rec->timestamp_filename && record__threads_enabled(rec)) {
4081 		rec->timestamp_filename = false;
4082 		pr_warning("WARNING: --timestamp-filename option is not available in parallel streaming mode.\n");
4083 	}
4084 
4085 	/*
4086 	 * Allow aliases to facilitate the lookup of symbols for address
4087 	 * filters. Refer to auxtrace_parse_filters().
4088 	 */
4089 	symbol_conf.allow_aliases = true;
4090 
4091 	symbol__init(NULL);
4092 
4093 	err = record__auxtrace_init(rec);
4094 	if (err)
4095 		goto out;
4096 
4097 	if (dry_run)
4098 		goto out;
4099 
4100 	err = bpf__setup_stdout(rec->evlist);
4101 	if (err) {
4102 		bpf__strerror_setup_stdout(rec->evlist, err, errbuf, sizeof(errbuf));
4103 		pr_err("ERROR: Setup BPF stdout failed: %s\n",
4104 			 errbuf);
4105 		goto out;
4106 	}
4107 
4108 	err = -ENOMEM;
4109 
4110 	if (rec->no_buildid_cache || rec->no_buildid) {
4111 		disable_buildid_cache();
4112 	} else if (rec->switch_output.enabled) {
4113 		/*
4114 		 * In 'perf record --switch-output', disable buildid
4115 		 * generation by default to reduce data file switching
4116 		 * overhead. Still generate buildid if they are required
4117 		 * explicitly using
4118 		 *
4119 		 *  perf record --switch-output --no-no-buildid \
4120 		 *              --no-no-buildid-cache
4121 		 *
4122 		 * Following code equals to:
4123 		 *
4124 		 * if ((rec->no_buildid || !rec->no_buildid_set) &&
4125 		 *     (rec->no_buildid_cache || !rec->no_buildid_cache_set))
4126 		 *         disable_buildid_cache();
4127 		 */
4128 		bool disable = true;
4129 
4130 		if (rec->no_buildid_set && !rec->no_buildid)
4131 			disable = false;
4132 		if (rec->no_buildid_cache_set && !rec->no_buildid_cache)
4133 			disable = false;
4134 		if (disable) {
4135 			rec->no_buildid = true;
4136 			rec->no_buildid_cache = true;
4137 			disable_buildid_cache();
4138 		}
4139 	}
4140 
4141 	if (record.opts.overwrite)
4142 		record.opts.tail_synthesize = true;
4143 
4144 	if (rec->evlist->core.nr_entries == 0) {
4145 		if (perf_pmu__has_hybrid()) {
4146 			err = evlist__add_default_hybrid(rec->evlist,
4147 							 !record.opts.no_samples);
4148 		} else {
4149 			err = __evlist__add_default(rec->evlist,
4150 						    !record.opts.no_samples);
4151 		}
4152 
4153 		if (err < 0) {
4154 			pr_err("Not enough memory for event selector list\n");
4155 			goto out;
4156 		}
4157 	}
4158 
4159 	if (rec->opts.target.tid && !rec->opts.no_inherit_set)
4160 		rec->opts.no_inherit = true;
4161 
4162 	err = target__validate(&rec->opts.target);
4163 	if (err) {
4164 		target__strerror(&rec->opts.target, err, errbuf, BUFSIZ);
4165 		ui__warning("%s\n", errbuf);
4166 	}
4167 
4168 	err = target__parse_uid(&rec->opts.target);
4169 	if (err) {
4170 		int saved_errno = errno;
4171 
4172 		target__strerror(&rec->opts.target, err, errbuf, BUFSIZ);
4173 		ui__error("%s", errbuf);
4174 
4175 		err = -saved_errno;
4176 		goto out;
4177 	}
4178 
4179 	/* Enable ignoring missing threads when -u/-p option is defined. */
4180 	rec->opts.ignore_missing_thread = rec->opts.target.uid != UINT_MAX || rec->opts.target.pid;
4181 
4182 	if (evlist__fix_hybrid_cpus(rec->evlist, rec->opts.target.cpu_list)) {
4183 		pr_err("failed to use cpu list %s\n",
4184 		       rec->opts.target.cpu_list);
4185 		goto out;
4186 	}
4187 
4188 	rec->opts.target.hybrid = perf_pmu__has_hybrid();
4189 
4190 	if (callchain_param.enabled && callchain_param.record_mode == CALLCHAIN_FP)
4191 		arch__add_leaf_frame_record_opts(&rec->opts);
4192 
4193 	err = -ENOMEM;
4194 	if (evlist__create_maps(rec->evlist, &rec->opts.target) < 0) {
4195 		if (rec->opts.target.pid != NULL) {
4196 			pr_err("Couldn't create thread/CPU maps: %s\n",
4197 				errno == ENOENT ? "No such process" : str_error_r(errno, errbuf, sizeof(errbuf)));
4198 			goto out;
4199 		}
4200 		else
4201 			usage_with_options(record_usage, record_options);
4202 	}
4203 
4204 	err = auxtrace_record__options(rec->itr, rec->evlist, &rec->opts);
4205 	if (err)
4206 		goto out;
4207 
4208 	/*
4209 	 * We take all buildids when the file contains
4210 	 * AUX area tracing data because we do not decode the
4211 	 * trace because it would take too long.
4212 	 */
4213 	if (rec->opts.full_auxtrace)
4214 		rec->buildid_all = true;
4215 
4216 	if (rec->opts.text_poke) {
4217 		err = record__config_text_poke(rec->evlist);
4218 		if (err) {
4219 			pr_err("record__config_text_poke failed, error %d\n", err);
4220 			goto out;
4221 		}
4222 	}
4223 
4224 	if (rec->off_cpu) {
4225 		err = record__config_off_cpu(rec);
4226 		if (err) {
4227 			pr_err("record__config_off_cpu failed, error %d\n", err);
4228 			goto out;
4229 		}
4230 	}
4231 
4232 	if (record_opts__config(&rec->opts)) {
4233 		err = -EINVAL;
4234 		goto out;
4235 	}
4236 
4237 	err = record__init_thread_masks(rec);
4238 	if (err) {
4239 		pr_err("Failed to initialize parallel data streaming masks\n");
4240 		goto out;
4241 	}
4242 
4243 	if (rec->opts.nr_cblocks > nr_cblocks_max)
4244 		rec->opts.nr_cblocks = nr_cblocks_max;
4245 	pr_debug("nr_cblocks: %d\n", rec->opts.nr_cblocks);
4246 
4247 	pr_debug("affinity: %s\n", affinity_tags[rec->opts.affinity]);
4248 	pr_debug("mmap flush: %d\n", rec->opts.mmap_flush);
4249 
4250 	if (rec->opts.comp_level > comp_level_max)
4251 		rec->opts.comp_level = comp_level_max;
4252 	pr_debug("comp level: %d\n", rec->opts.comp_level);
4253 
4254 	err = __cmd_record(&record, argc, argv);
4255 out:
4256 	evlist__delete(rec->evlist);
4257 	symbol__exit();
4258 	auxtrace_record__free(rec->itr);
4259 out_opts:
4260 	record__free_thread_masks(rec, rec->nr_threads);
4261 	rec->nr_threads = 0;
4262 	evlist__close_control(rec->opts.ctl_fd, rec->opts.ctl_fd_ack, &rec->opts.ctl_fd_close);
4263 	return err;
4264 }
4265 
4266 static void snapshot_sig_handler(int sig __maybe_unused)
4267 {
4268 	struct record *rec = &record;
4269 
4270 	hit_auxtrace_snapshot_trigger(rec);
4271 
4272 	if (switch_output_signal(rec))
4273 		trigger_hit(&switch_output_trigger);
4274 }
4275 
4276 static void alarm_sig_handler(int sig __maybe_unused)
4277 {
4278 	struct record *rec = &record;
4279 
4280 	if (switch_output_time(rec))
4281 		trigger_hit(&switch_output_trigger);
4282 }
4283