xref: /openbmc/linux/tools/perf/builtin-record.c (revision 1c137323)
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * builtin-record.c
4  *
5  * Builtin record command: Record the profile of a workload
6  * (or a CPU, or a PID) into the perf.data output file - for
7  * later analysis via perf report.
8  */
9 #include "builtin.h"
10 
11 #include "util/build-id.h"
12 #include <subcmd/parse-options.h>
13 #include <internal/xyarray.h>
14 #include "util/parse-events.h"
15 #include "util/config.h"
16 
17 #include "util/callchain.h"
18 #include "util/cgroup.h"
19 #include "util/header.h"
20 #include "util/event.h"
21 #include "util/evlist.h"
22 #include "util/evsel.h"
23 #include "util/debug.h"
24 #include "util/mmap.h"
25 #include "util/mutex.h"
26 #include "util/target.h"
27 #include "util/session.h"
28 #include "util/tool.h"
29 #include "util/symbol.h"
30 #include "util/record.h"
31 #include "util/cpumap.h"
32 #include "util/thread_map.h"
33 #include "util/data.h"
34 #include "util/perf_regs.h"
35 #include "util/auxtrace.h"
36 #include "util/tsc.h"
37 #include "util/parse-branch-options.h"
38 #include "util/parse-regs-options.h"
39 #include "util/perf_api_probe.h"
40 #include "util/llvm-utils.h"
41 #include "util/bpf-loader.h"
42 #include "util/trigger.h"
43 #include "util/perf-hooks.h"
44 #include "util/cpu-set-sched.h"
45 #include "util/synthetic-events.h"
46 #include "util/time-utils.h"
47 #include "util/units.h"
48 #include "util/bpf-event.h"
49 #include "util/util.h"
50 #include "util/pfm.h"
51 #include "util/clockid.h"
52 #include "util/pmu-hybrid.h"
53 #include "util/evlist-hybrid.h"
54 #include "util/off_cpu.h"
55 #include "asm/bug.h"
56 #include "perf.h"
57 #include "cputopo.h"
58 
59 #include <errno.h>
60 #include <inttypes.h>
61 #include <locale.h>
62 #include <poll.h>
63 #include <pthread.h>
64 #include <unistd.h>
65 #ifndef HAVE_GETTID
66 #include <syscall.h>
67 #endif
68 #include <sched.h>
69 #include <signal.h>
70 #ifdef HAVE_EVENTFD_SUPPORT
71 #include <sys/eventfd.h>
72 #endif
73 #include <sys/mman.h>
74 #include <sys/wait.h>
75 #include <sys/types.h>
76 #include <sys/stat.h>
77 #include <fcntl.h>
78 #include <linux/err.h>
79 #include <linux/string.h>
80 #include <linux/time64.h>
81 #include <linux/zalloc.h>
82 #include <linux/bitmap.h>
83 #include <sys/time.h>
84 
85 struct switch_output {
86 	bool		 enabled;
87 	bool		 signal;
88 	unsigned long	 size;
89 	unsigned long	 time;
90 	const char	*str;
91 	bool		 set;
92 	char		 **filenames;
93 	int		 num_files;
94 	int		 cur_file;
95 };
96 
97 struct thread_mask {
98 	struct mmap_cpu_mask	maps;
99 	struct mmap_cpu_mask	affinity;
100 };
101 
102 struct record_thread {
103 	pid_t			tid;
104 	struct thread_mask	*mask;
105 	struct {
106 		int		msg[2];
107 		int		ack[2];
108 	} pipes;
109 	struct fdarray		pollfd;
110 	int			ctlfd_pos;
111 	int			nr_mmaps;
112 	struct mmap		**maps;
113 	struct mmap		**overwrite_maps;
114 	struct record		*rec;
115 	unsigned long long	samples;
116 	unsigned long		waking;
117 	u64			bytes_written;
118 	u64			bytes_transferred;
119 	u64			bytes_compressed;
120 };
121 
122 static __thread struct record_thread *thread;
123 
124 enum thread_msg {
125 	THREAD_MSG__UNDEFINED = 0,
126 	THREAD_MSG__READY,
127 	THREAD_MSG__MAX,
128 };
129 
130 static const char *thread_msg_tags[THREAD_MSG__MAX] = {
131 	"UNDEFINED", "READY"
132 };
133 
134 enum thread_spec {
135 	THREAD_SPEC__UNDEFINED = 0,
136 	THREAD_SPEC__CPU,
137 	THREAD_SPEC__CORE,
138 	THREAD_SPEC__PACKAGE,
139 	THREAD_SPEC__NUMA,
140 	THREAD_SPEC__USER,
141 	THREAD_SPEC__MAX,
142 };
143 
144 static const char *thread_spec_tags[THREAD_SPEC__MAX] = {
145 	"undefined", "cpu", "core", "package", "numa", "user"
146 };
147 
148 struct pollfd_index_map {
149 	int evlist_pollfd_index;
150 	int thread_pollfd_index;
151 };
152 
153 struct record {
154 	struct perf_tool	tool;
155 	struct record_opts	opts;
156 	u64			bytes_written;
157 	struct perf_data	data;
158 	struct auxtrace_record	*itr;
159 	struct evlist	*evlist;
160 	struct perf_session	*session;
161 	struct evlist		*sb_evlist;
162 	pthread_t		thread_id;
163 	int			realtime_prio;
164 	bool			switch_output_event_set;
165 	bool			no_buildid;
166 	bool			no_buildid_set;
167 	bool			no_buildid_cache;
168 	bool			no_buildid_cache_set;
169 	bool			buildid_all;
170 	bool			buildid_mmap;
171 	bool			timestamp_filename;
172 	bool			timestamp_boundary;
173 	bool			off_cpu;
174 	struct switch_output	switch_output;
175 	unsigned long long	samples;
176 	unsigned long		output_max_size;	/* = 0: unlimited */
177 	struct perf_debuginfod	debuginfod;
178 	int			nr_threads;
179 	struct thread_mask	*thread_masks;
180 	struct record_thread	*thread_data;
181 	struct pollfd_index_map	*index_map;
182 	size_t			index_map_sz;
183 	size_t			index_map_cnt;
184 };
185 
186 static volatile int done;
187 
188 static volatile int auxtrace_record__snapshot_started;
189 static DEFINE_TRIGGER(auxtrace_snapshot_trigger);
190 static DEFINE_TRIGGER(switch_output_trigger);
191 
192 static const char *affinity_tags[PERF_AFFINITY_MAX] = {
193 	"SYS", "NODE", "CPU"
194 };
195 
196 #ifndef HAVE_GETTID
197 static inline pid_t gettid(void)
198 {
199 	return (pid_t)syscall(__NR_gettid);
200 }
201 #endif
202 
203 static int record__threads_enabled(struct record *rec)
204 {
205 	return rec->opts.threads_spec;
206 }
207 
208 static bool switch_output_signal(struct record *rec)
209 {
210 	return rec->switch_output.signal &&
211 	       trigger_is_ready(&switch_output_trigger);
212 }
213 
214 static bool switch_output_size(struct record *rec)
215 {
216 	return rec->switch_output.size &&
217 	       trigger_is_ready(&switch_output_trigger) &&
218 	       (rec->bytes_written >= rec->switch_output.size);
219 }
220 
221 static bool switch_output_time(struct record *rec)
222 {
223 	return rec->switch_output.time &&
224 	       trigger_is_ready(&switch_output_trigger);
225 }
226 
227 static u64 record__bytes_written(struct record *rec)
228 {
229 	int t;
230 	u64 bytes_written = rec->bytes_written;
231 	struct record_thread *thread_data = rec->thread_data;
232 
233 	for (t = 0; t < rec->nr_threads; t++)
234 		bytes_written += thread_data[t].bytes_written;
235 
236 	return bytes_written;
237 }
238 
239 static bool record__output_max_size_exceeded(struct record *rec)
240 {
241 	return rec->output_max_size &&
242 	       (record__bytes_written(rec) >= rec->output_max_size);
243 }
244 
245 static int record__write(struct record *rec, struct mmap *map __maybe_unused,
246 			 void *bf, size_t size)
247 {
248 	struct perf_data_file *file = &rec->session->data->file;
249 
250 	if (map && map->file)
251 		file = map->file;
252 
253 	if (perf_data_file__write(file, bf, size) < 0) {
254 		pr_err("failed to write perf data, error: %m\n");
255 		return -1;
256 	}
257 
258 	if (map && map->file)
259 		thread->bytes_written += size;
260 	else
261 		rec->bytes_written += size;
262 
263 	if (record__output_max_size_exceeded(rec) && !done) {
264 		fprintf(stderr, "[ perf record: perf size limit reached (%" PRIu64 " KB),"
265 				" stopping session ]\n",
266 				record__bytes_written(rec) >> 10);
267 		done = 1;
268 	}
269 
270 	if (switch_output_size(rec))
271 		trigger_hit(&switch_output_trigger);
272 
273 	return 0;
274 }
275 
276 static int record__aio_enabled(struct record *rec);
277 static int record__comp_enabled(struct record *rec);
278 static size_t zstd_compress(struct perf_session *session, struct mmap *map,
279 			    void *dst, size_t dst_size, void *src, size_t src_size);
280 
281 #ifdef HAVE_AIO_SUPPORT
282 static int record__aio_write(struct aiocb *cblock, int trace_fd,
283 		void *buf, size_t size, off_t off)
284 {
285 	int rc;
286 
287 	cblock->aio_fildes = trace_fd;
288 	cblock->aio_buf    = buf;
289 	cblock->aio_nbytes = size;
290 	cblock->aio_offset = off;
291 	cblock->aio_sigevent.sigev_notify = SIGEV_NONE;
292 
293 	do {
294 		rc = aio_write(cblock);
295 		if (rc == 0) {
296 			break;
297 		} else if (errno != EAGAIN) {
298 			cblock->aio_fildes = -1;
299 			pr_err("failed to queue perf data, error: %m\n");
300 			break;
301 		}
302 	} while (1);
303 
304 	return rc;
305 }
306 
307 static int record__aio_complete(struct mmap *md, struct aiocb *cblock)
308 {
309 	void *rem_buf;
310 	off_t rem_off;
311 	size_t rem_size;
312 	int rc, aio_errno;
313 	ssize_t aio_ret, written;
314 
315 	aio_errno = aio_error(cblock);
316 	if (aio_errno == EINPROGRESS)
317 		return 0;
318 
319 	written = aio_ret = aio_return(cblock);
320 	if (aio_ret < 0) {
321 		if (aio_errno != EINTR)
322 			pr_err("failed to write perf data, error: %m\n");
323 		written = 0;
324 	}
325 
326 	rem_size = cblock->aio_nbytes - written;
327 
328 	if (rem_size == 0) {
329 		cblock->aio_fildes = -1;
330 		/*
331 		 * md->refcount is incremented in record__aio_pushfn() for
332 		 * every aio write request started in record__aio_push() so
333 		 * decrement it because the request is now complete.
334 		 */
335 		perf_mmap__put(&md->core);
336 		rc = 1;
337 	} else {
338 		/*
339 		 * aio write request may require restart with the
340 		 * reminder if the kernel didn't write whole
341 		 * chunk at once.
342 		 */
343 		rem_off = cblock->aio_offset + written;
344 		rem_buf = (void *)(cblock->aio_buf + written);
345 		record__aio_write(cblock, cblock->aio_fildes,
346 				rem_buf, rem_size, rem_off);
347 		rc = 0;
348 	}
349 
350 	return rc;
351 }
352 
353 static int record__aio_sync(struct mmap *md, bool sync_all)
354 {
355 	struct aiocb **aiocb = md->aio.aiocb;
356 	struct aiocb *cblocks = md->aio.cblocks;
357 	struct timespec timeout = { 0, 1000 * 1000  * 1 }; /* 1ms */
358 	int i, do_suspend;
359 
360 	do {
361 		do_suspend = 0;
362 		for (i = 0; i < md->aio.nr_cblocks; ++i) {
363 			if (cblocks[i].aio_fildes == -1 || record__aio_complete(md, &cblocks[i])) {
364 				if (sync_all)
365 					aiocb[i] = NULL;
366 				else
367 					return i;
368 			} else {
369 				/*
370 				 * Started aio write is not complete yet
371 				 * so it has to be waited before the
372 				 * next allocation.
373 				 */
374 				aiocb[i] = &cblocks[i];
375 				do_suspend = 1;
376 			}
377 		}
378 		if (!do_suspend)
379 			return -1;
380 
381 		while (aio_suspend((const struct aiocb **)aiocb, md->aio.nr_cblocks, &timeout)) {
382 			if (!(errno == EAGAIN || errno == EINTR))
383 				pr_err("failed to sync perf data, error: %m\n");
384 		}
385 	} while (1);
386 }
387 
388 struct record_aio {
389 	struct record	*rec;
390 	void		*data;
391 	size_t		size;
392 };
393 
394 static int record__aio_pushfn(struct mmap *map, void *to, void *buf, size_t size)
395 {
396 	struct record_aio *aio = to;
397 
398 	/*
399 	 * map->core.base data pointed by buf is copied into free map->aio.data[] buffer
400 	 * to release space in the kernel buffer as fast as possible, calling
401 	 * perf_mmap__consume() from perf_mmap__push() function.
402 	 *
403 	 * That lets the kernel to proceed with storing more profiling data into
404 	 * the kernel buffer earlier than other per-cpu kernel buffers are handled.
405 	 *
406 	 * Coping can be done in two steps in case the chunk of profiling data
407 	 * crosses the upper bound of the kernel buffer. In this case we first move
408 	 * part of data from map->start till the upper bound and then the reminder
409 	 * from the beginning of the kernel buffer till the end of the data chunk.
410 	 */
411 
412 	if (record__comp_enabled(aio->rec)) {
413 		size = zstd_compress(aio->rec->session, NULL, aio->data + aio->size,
414 				     mmap__mmap_len(map) - aio->size,
415 				     buf, size);
416 	} else {
417 		memcpy(aio->data + aio->size, buf, size);
418 	}
419 
420 	if (!aio->size) {
421 		/*
422 		 * Increment map->refcount to guard map->aio.data[] buffer
423 		 * from premature deallocation because map object can be
424 		 * released earlier than aio write request started on
425 		 * map->aio.data[] buffer is complete.
426 		 *
427 		 * perf_mmap__put() is done at record__aio_complete()
428 		 * after started aio request completion or at record__aio_push()
429 		 * if the request failed to start.
430 		 */
431 		perf_mmap__get(&map->core);
432 	}
433 
434 	aio->size += size;
435 
436 	return size;
437 }
438 
439 static int record__aio_push(struct record *rec, struct mmap *map, off_t *off)
440 {
441 	int ret, idx;
442 	int trace_fd = rec->session->data->file.fd;
443 	struct record_aio aio = { .rec = rec, .size = 0 };
444 
445 	/*
446 	 * Call record__aio_sync() to wait till map->aio.data[] buffer
447 	 * becomes available after previous aio write operation.
448 	 */
449 
450 	idx = record__aio_sync(map, false);
451 	aio.data = map->aio.data[idx];
452 	ret = perf_mmap__push(map, &aio, record__aio_pushfn);
453 	if (ret != 0) /* ret > 0 - no data, ret < 0 - error */
454 		return ret;
455 
456 	rec->samples++;
457 	ret = record__aio_write(&(map->aio.cblocks[idx]), trace_fd, aio.data, aio.size, *off);
458 	if (!ret) {
459 		*off += aio.size;
460 		rec->bytes_written += aio.size;
461 		if (switch_output_size(rec))
462 			trigger_hit(&switch_output_trigger);
463 	} else {
464 		/*
465 		 * Decrement map->refcount incremented in record__aio_pushfn()
466 		 * back if record__aio_write() operation failed to start, otherwise
467 		 * map->refcount is decremented in record__aio_complete() after
468 		 * aio write operation finishes successfully.
469 		 */
470 		perf_mmap__put(&map->core);
471 	}
472 
473 	return ret;
474 }
475 
476 static off_t record__aio_get_pos(int trace_fd)
477 {
478 	return lseek(trace_fd, 0, SEEK_CUR);
479 }
480 
481 static void record__aio_set_pos(int trace_fd, off_t pos)
482 {
483 	lseek(trace_fd, pos, SEEK_SET);
484 }
485 
486 static void record__aio_mmap_read_sync(struct record *rec)
487 {
488 	int i;
489 	struct evlist *evlist = rec->evlist;
490 	struct mmap *maps = evlist->mmap;
491 
492 	if (!record__aio_enabled(rec))
493 		return;
494 
495 	for (i = 0; i < evlist->core.nr_mmaps; i++) {
496 		struct mmap *map = &maps[i];
497 
498 		if (map->core.base)
499 			record__aio_sync(map, true);
500 	}
501 }
502 
503 static int nr_cblocks_default = 1;
504 static int nr_cblocks_max = 4;
505 
506 static int record__aio_parse(const struct option *opt,
507 			     const char *str,
508 			     int unset)
509 {
510 	struct record_opts *opts = (struct record_opts *)opt->value;
511 
512 	if (unset) {
513 		opts->nr_cblocks = 0;
514 	} else {
515 		if (str)
516 			opts->nr_cblocks = strtol(str, NULL, 0);
517 		if (!opts->nr_cblocks)
518 			opts->nr_cblocks = nr_cblocks_default;
519 	}
520 
521 	return 0;
522 }
523 #else /* HAVE_AIO_SUPPORT */
524 static int nr_cblocks_max = 0;
525 
526 static int record__aio_push(struct record *rec __maybe_unused, struct mmap *map __maybe_unused,
527 			    off_t *off __maybe_unused)
528 {
529 	return -1;
530 }
531 
532 static off_t record__aio_get_pos(int trace_fd __maybe_unused)
533 {
534 	return -1;
535 }
536 
537 static void record__aio_set_pos(int trace_fd __maybe_unused, off_t pos __maybe_unused)
538 {
539 }
540 
541 static void record__aio_mmap_read_sync(struct record *rec __maybe_unused)
542 {
543 }
544 #endif
545 
546 static int record__aio_enabled(struct record *rec)
547 {
548 	return rec->opts.nr_cblocks > 0;
549 }
550 
551 #define MMAP_FLUSH_DEFAULT 1
552 static int record__mmap_flush_parse(const struct option *opt,
553 				    const char *str,
554 				    int unset)
555 {
556 	int flush_max;
557 	struct record_opts *opts = (struct record_opts *)opt->value;
558 	static struct parse_tag tags[] = {
559 			{ .tag  = 'B', .mult = 1       },
560 			{ .tag  = 'K', .mult = 1 << 10 },
561 			{ .tag  = 'M', .mult = 1 << 20 },
562 			{ .tag  = 'G', .mult = 1 << 30 },
563 			{ .tag  = 0 },
564 	};
565 
566 	if (unset)
567 		return 0;
568 
569 	if (str) {
570 		opts->mmap_flush = parse_tag_value(str, tags);
571 		if (opts->mmap_flush == (int)-1)
572 			opts->mmap_flush = strtol(str, NULL, 0);
573 	}
574 
575 	if (!opts->mmap_flush)
576 		opts->mmap_flush = MMAP_FLUSH_DEFAULT;
577 
578 	flush_max = evlist__mmap_size(opts->mmap_pages);
579 	flush_max /= 4;
580 	if (opts->mmap_flush > flush_max)
581 		opts->mmap_flush = flush_max;
582 
583 	return 0;
584 }
585 
586 #ifdef HAVE_ZSTD_SUPPORT
587 static unsigned int comp_level_default = 1;
588 
589 static int record__parse_comp_level(const struct option *opt, const char *str, int unset)
590 {
591 	struct record_opts *opts = opt->value;
592 
593 	if (unset) {
594 		opts->comp_level = 0;
595 	} else {
596 		if (str)
597 			opts->comp_level = strtol(str, NULL, 0);
598 		if (!opts->comp_level)
599 			opts->comp_level = comp_level_default;
600 	}
601 
602 	return 0;
603 }
604 #endif
605 static unsigned int comp_level_max = 22;
606 
607 static int record__comp_enabled(struct record *rec)
608 {
609 	return rec->opts.comp_level > 0;
610 }
611 
612 static int process_synthesized_event(struct perf_tool *tool,
613 				     union perf_event *event,
614 				     struct perf_sample *sample __maybe_unused,
615 				     struct machine *machine __maybe_unused)
616 {
617 	struct record *rec = container_of(tool, struct record, tool);
618 	return record__write(rec, NULL, event, event->header.size);
619 }
620 
621 static struct mutex synth_lock;
622 
623 static int process_locked_synthesized_event(struct perf_tool *tool,
624 				     union perf_event *event,
625 				     struct perf_sample *sample __maybe_unused,
626 				     struct machine *machine __maybe_unused)
627 {
628 	int ret;
629 
630 	mutex_lock(&synth_lock);
631 	ret = process_synthesized_event(tool, event, sample, machine);
632 	mutex_unlock(&synth_lock);
633 	return ret;
634 }
635 
636 static int record__pushfn(struct mmap *map, void *to, void *bf, size_t size)
637 {
638 	struct record *rec = to;
639 
640 	if (record__comp_enabled(rec)) {
641 		size = zstd_compress(rec->session, map, map->data, mmap__mmap_len(map), bf, size);
642 		bf   = map->data;
643 	}
644 
645 	thread->samples++;
646 	return record__write(rec, map, bf, size);
647 }
648 
649 static volatile int signr = -1;
650 static volatile int child_finished;
651 #ifdef HAVE_EVENTFD_SUPPORT
652 static int done_fd = -1;
653 #endif
654 
655 static void sig_handler(int sig)
656 {
657 	if (sig == SIGCHLD)
658 		child_finished = 1;
659 	else
660 		signr = sig;
661 
662 	done = 1;
663 #ifdef HAVE_EVENTFD_SUPPORT
664 {
665 	u64 tmp = 1;
666 	/*
667 	 * It is possible for this signal handler to run after done is checked
668 	 * in the main loop, but before the perf counter fds are polled. If this
669 	 * happens, the poll() will continue to wait even though done is set,
670 	 * and will only break out if either another signal is received, or the
671 	 * counters are ready for read. To ensure the poll() doesn't sleep when
672 	 * done is set, use an eventfd (done_fd) to wake up the poll().
673 	 */
674 	if (write(done_fd, &tmp, sizeof(tmp)) < 0)
675 		pr_err("failed to signal wakeup fd, error: %m\n");
676 }
677 #endif // HAVE_EVENTFD_SUPPORT
678 }
679 
680 static void sigsegv_handler(int sig)
681 {
682 	perf_hooks__recover();
683 	sighandler_dump_stack(sig);
684 }
685 
686 static void record__sig_exit(void)
687 {
688 	if (signr == -1)
689 		return;
690 
691 	signal(signr, SIG_DFL);
692 	raise(signr);
693 }
694 
695 #ifdef HAVE_AUXTRACE_SUPPORT
696 
697 static int record__process_auxtrace(struct perf_tool *tool,
698 				    struct mmap *map,
699 				    union perf_event *event, void *data1,
700 				    size_t len1, void *data2, size_t len2)
701 {
702 	struct record *rec = container_of(tool, struct record, tool);
703 	struct perf_data *data = &rec->data;
704 	size_t padding;
705 	u8 pad[8] = {0};
706 
707 	if (!perf_data__is_pipe(data) && perf_data__is_single_file(data)) {
708 		off_t file_offset;
709 		int fd = perf_data__fd(data);
710 		int err;
711 
712 		file_offset = lseek(fd, 0, SEEK_CUR);
713 		if (file_offset == -1)
714 			return -1;
715 		err = auxtrace_index__auxtrace_event(&rec->session->auxtrace_index,
716 						     event, file_offset);
717 		if (err)
718 			return err;
719 	}
720 
721 	/* event.auxtrace.size includes padding, see __auxtrace_mmap__read() */
722 	padding = (len1 + len2) & 7;
723 	if (padding)
724 		padding = 8 - padding;
725 
726 	record__write(rec, map, event, event->header.size);
727 	record__write(rec, map, data1, len1);
728 	if (len2)
729 		record__write(rec, map, data2, len2);
730 	record__write(rec, map, &pad, padding);
731 
732 	return 0;
733 }
734 
735 static int record__auxtrace_mmap_read(struct record *rec,
736 				      struct mmap *map)
737 {
738 	int ret;
739 
740 	ret = auxtrace_mmap__read(map, rec->itr, &rec->tool,
741 				  record__process_auxtrace);
742 	if (ret < 0)
743 		return ret;
744 
745 	if (ret)
746 		rec->samples++;
747 
748 	return 0;
749 }
750 
751 static int record__auxtrace_mmap_read_snapshot(struct record *rec,
752 					       struct mmap *map)
753 {
754 	int ret;
755 
756 	ret = auxtrace_mmap__read_snapshot(map, rec->itr, &rec->tool,
757 					   record__process_auxtrace,
758 					   rec->opts.auxtrace_snapshot_size);
759 	if (ret < 0)
760 		return ret;
761 
762 	if (ret)
763 		rec->samples++;
764 
765 	return 0;
766 }
767 
768 static int record__auxtrace_read_snapshot_all(struct record *rec)
769 {
770 	int i;
771 	int rc = 0;
772 
773 	for (i = 0; i < rec->evlist->core.nr_mmaps; i++) {
774 		struct mmap *map = &rec->evlist->mmap[i];
775 
776 		if (!map->auxtrace_mmap.base)
777 			continue;
778 
779 		if (record__auxtrace_mmap_read_snapshot(rec, map) != 0) {
780 			rc = -1;
781 			goto out;
782 		}
783 	}
784 out:
785 	return rc;
786 }
787 
788 static void record__read_auxtrace_snapshot(struct record *rec, bool on_exit)
789 {
790 	pr_debug("Recording AUX area tracing snapshot\n");
791 	if (record__auxtrace_read_snapshot_all(rec) < 0) {
792 		trigger_error(&auxtrace_snapshot_trigger);
793 	} else {
794 		if (auxtrace_record__snapshot_finish(rec->itr, on_exit))
795 			trigger_error(&auxtrace_snapshot_trigger);
796 		else
797 			trigger_ready(&auxtrace_snapshot_trigger);
798 	}
799 }
800 
801 static int record__auxtrace_snapshot_exit(struct record *rec)
802 {
803 	if (trigger_is_error(&auxtrace_snapshot_trigger))
804 		return 0;
805 
806 	if (!auxtrace_record__snapshot_started &&
807 	    auxtrace_record__snapshot_start(rec->itr))
808 		return -1;
809 
810 	record__read_auxtrace_snapshot(rec, true);
811 	if (trigger_is_error(&auxtrace_snapshot_trigger))
812 		return -1;
813 
814 	return 0;
815 }
816 
817 static int record__auxtrace_init(struct record *rec)
818 {
819 	int err;
820 
821 	if ((rec->opts.auxtrace_snapshot_opts || rec->opts.auxtrace_sample_opts)
822 	    && record__threads_enabled(rec)) {
823 		pr_err("AUX area tracing options are not available in parallel streaming mode.\n");
824 		return -EINVAL;
825 	}
826 
827 	if (!rec->itr) {
828 		rec->itr = auxtrace_record__init(rec->evlist, &err);
829 		if (err)
830 			return err;
831 	}
832 
833 	err = auxtrace_parse_snapshot_options(rec->itr, &rec->opts,
834 					      rec->opts.auxtrace_snapshot_opts);
835 	if (err)
836 		return err;
837 
838 	err = auxtrace_parse_sample_options(rec->itr, rec->evlist, &rec->opts,
839 					    rec->opts.auxtrace_sample_opts);
840 	if (err)
841 		return err;
842 
843 	auxtrace_regroup_aux_output(rec->evlist);
844 
845 	return auxtrace_parse_filters(rec->evlist);
846 }
847 
848 #else
849 
850 static inline
851 int record__auxtrace_mmap_read(struct record *rec __maybe_unused,
852 			       struct mmap *map __maybe_unused)
853 {
854 	return 0;
855 }
856 
857 static inline
858 void record__read_auxtrace_snapshot(struct record *rec __maybe_unused,
859 				    bool on_exit __maybe_unused)
860 {
861 }
862 
863 static inline
864 int auxtrace_record__snapshot_start(struct auxtrace_record *itr __maybe_unused)
865 {
866 	return 0;
867 }
868 
869 static inline
870 int record__auxtrace_snapshot_exit(struct record *rec __maybe_unused)
871 {
872 	return 0;
873 }
874 
875 static int record__auxtrace_init(struct record *rec __maybe_unused)
876 {
877 	return 0;
878 }
879 
880 #endif
881 
882 static int record__config_text_poke(struct evlist *evlist)
883 {
884 	struct evsel *evsel;
885 
886 	/* Nothing to do if text poke is already configured */
887 	evlist__for_each_entry(evlist, evsel) {
888 		if (evsel->core.attr.text_poke)
889 			return 0;
890 	}
891 
892 	evsel = evlist__add_dummy_on_all_cpus(evlist);
893 	if (!evsel)
894 		return -ENOMEM;
895 
896 	evsel->core.attr.text_poke = 1;
897 	evsel->core.attr.ksymbol = 1;
898 	evsel->immediate = true;
899 	evsel__set_sample_bit(evsel, TIME);
900 
901 	return 0;
902 }
903 
904 static int record__config_off_cpu(struct record *rec)
905 {
906 	return off_cpu_prepare(rec->evlist, &rec->opts.target, &rec->opts);
907 }
908 
909 static bool record__kcore_readable(struct machine *machine)
910 {
911 	char kcore[PATH_MAX];
912 	int fd;
913 
914 	scnprintf(kcore, sizeof(kcore), "%s/proc/kcore", machine->root_dir);
915 
916 	fd = open(kcore, O_RDONLY);
917 	if (fd < 0)
918 		return false;
919 
920 	close(fd);
921 
922 	return true;
923 }
924 
925 static int record__kcore_copy(struct machine *machine, struct perf_data *data)
926 {
927 	char from_dir[PATH_MAX];
928 	char kcore_dir[PATH_MAX];
929 	int ret;
930 
931 	snprintf(from_dir, sizeof(from_dir), "%s/proc", machine->root_dir);
932 
933 	ret = perf_data__make_kcore_dir(data, kcore_dir, sizeof(kcore_dir));
934 	if (ret)
935 		return ret;
936 
937 	return kcore_copy(from_dir, kcore_dir);
938 }
939 
940 static void record__thread_data_init_pipes(struct record_thread *thread_data)
941 {
942 	thread_data->pipes.msg[0] = -1;
943 	thread_data->pipes.msg[1] = -1;
944 	thread_data->pipes.ack[0] = -1;
945 	thread_data->pipes.ack[1] = -1;
946 }
947 
948 static int record__thread_data_open_pipes(struct record_thread *thread_data)
949 {
950 	if (pipe(thread_data->pipes.msg))
951 		return -EINVAL;
952 
953 	if (pipe(thread_data->pipes.ack)) {
954 		close(thread_data->pipes.msg[0]);
955 		thread_data->pipes.msg[0] = -1;
956 		close(thread_data->pipes.msg[1]);
957 		thread_data->pipes.msg[1] = -1;
958 		return -EINVAL;
959 	}
960 
961 	pr_debug2("thread_data[%p]: msg=[%d,%d], ack=[%d,%d]\n", thread_data,
962 		 thread_data->pipes.msg[0], thread_data->pipes.msg[1],
963 		 thread_data->pipes.ack[0], thread_data->pipes.ack[1]);
964 
965 	return 0;
966 }
967 
968 static void record__thread_data_close_pipes(struct record_thread *thread_data)
969 {
970 	if (thread_data->pipes.msg[0] != -1) {
971 		close(thread_data->pipes.msg[0]);
972 		thread_data->pipes.msg[0] = -1;
973 	}
974 	if (thread_data->pipes.msg[1] != -1) {
975 		close(thread_data->pipes.msg[1]);
976 		thread_data->pipes.msg[1] = -1;
977 	}
978 	if (thread_data->pipes.ack[0] != -1) {
979 		close(thread_data->pipes.ack[0]);
980 		thread_data->pipes.ack[0] = -1;
981 	}
982 	if (thread_data->pipes.ack[1] != -1) {
983 		close(thread_data->pipes.ack[1]);
984 		thread_data->pipes.ack[1] = -1;
985 	}
986 }
987 
988 static bool evlist__per_thread(struct evlist *evlist)
989 {
990 	return cpu_map__is_dummy(evlist->core.user_requested_cpus);
991 }
992 
993 static int record__thread_data_init_maps(struct record_thread *thread_data, struct evlist *evlist)
994 {
995 	int m, tm, nr_mmaps = evlist->core.nr_mmaps;
996 	struct mmap *mmap = evlist->mmap;
997 	struct mmap *overwrite_mmap = evlist->overwrite_mmap;
998 	struct perf_cpu_map *cpus = evlist->core.all_cpus;
999 	bool per_thread = evlist__per_thread(evlist);
1000 
1001 	if (per_thread)
1002 		thread_data->nr_mmaps = nr_mmaps;
1003 	else
1004 		thread_data->nr_mmaps = bitmap_weight(thread_data->mask->maps.bits,
1005 						      thread_data->mask->maps.nbits);
1006 	if (mmap) {
1007 		thread_data->maps = zalloc(thread_data->nr_mmaps * sizeof(struct mmap *));
1008 		if (!thread_data->maps)
1009 			return -ENOMEM;
1010 	}
1011 	if (overwrite_mmap) {
1012 		thread_data->overwrite_maps = zalloc(thread_data->nr_mmaps * sizeof(struct mmap *));
1013 		if (!thread_data->overwrite_maps) {
1014 			zfree(&thread_data->maps);
1015 			return -ENOMEM;
1016 		}
1017 	}
1018 	pr_debug2("thread_data[%p]: nr_mmaps=%d, maps=%p, ow_maps=%p\n", thread_data,
1019 		 thread_data->nr_mmaps, thread_data->maps, thread_data->overwrite_maps);
1020 
1021 	for (m = 0, tm = 0; m < nr_mmaps && tm < thread_data->nr_mmaps; m++) {
1022 		if (per_thread ||
1023 		    test_bit(perf_cpu_map__cpu(cpus, m).cpu, thread_data->mask->maps.bits)) {
1024 			if (thread_data->maps) {
1025 				thread_data->maps[tm] = &mmap[m];
1026 				pr_debug2("thread_data[%p]: cpu%d: maps[%d] -> mmap[%d]\n",
1027 					  thread_data, perf_cpu_map__cpu(cpus, m).cpu, tm, m);
1028 			}
1029 			if (thread_data->overwrite_maps) {
1030 				thread_data->overwrite_maps[tm] = &overwrite_mmap[m];
1031 				pr_debug2("thread_data[%p]: cpu%d: ow_maps[%d] -> ow_mmap[%d]\n",
1032 					  thread_data, perf_cpu_map__cpu(cpus, m).cpu, tm, m);
1033 			}
1034 			tm++;
1035 		}
1036 	}
1037 
1038 	return 0;
1039 }
1040 
1041 static int record__thread_data_init_pollfd(struct record_thread *thread_data, struct evlist *evlist)
1042 {
1043 	int f, tm, pos;
1044 	struct mmap *map, *overwrite_map;
1045 
1046 	fdarray__init(&thread_data->pollfd, 64);
1047 
1048 	for (tm = 0; tm < thread_data->nr_mmaps; tm++) {
1049 		map = thread_data->maps ? thread_data->maps[tm] : NULL;
1050 		overwrite_map = thread_data->overwrite_maps ?
1051 				thread_data->overwrite_maps[tm] : NULL;
1052 
1053 		for (f = 0; f < evlist->core.pollfd.nr; f++) {
1054 			void *ptr = evlist->core.pollfd.priv[f].ptr;
1055 
1056 			if ((map && ptr == map) || (overwrite_map && ptr == overwrite_map)) {
1057 				pos = fdarray__dup_entry_from(&thread_data->pollfd, f,
1058 							      &evlist->core.pollfd);
1059 				if (pos < 0)
1060 					return pos;
1061 				pr_debug2("thread_data[%p]: pollfd[%d] <- event_fd=%d\n",
1062 					 thread_data, pos, evlist->core.pollfd.entries[f].fd);
1063 			}
1064 		}
1065 	}
1066 
1067 	return 0;
1068 }
1069 
1070 static void record__free_thread_data(struct record *rec)
1071 {
1072 	int t;
1073 	struct record_thread *thread_data = rec->thread_data;
1074 
1075 	if (thread_data == NULL)
1076 		return;
1077 
1078 	for (t = 0; t < rec->nr_threads; t++) {
1079 		record__thread_data_close_pipes(&thread_data[t]);
1080 		zfree(&thread_data[t].maps);
1081 		zfree(&thread_data[t].overwrite_maps);
1082 		fdarray__exit(&thread_data[t].pollfd);
1083 	}
1084 
1085 	zfree(&rec->thread_data);
1086 }
1087 
1088 static int record__map_thread_evlist_pollfd_indexes(struct record *rec,
1089 						    int evlist_pollfd_index,
1090 						    int thread_pollfd_index)
1091 {
1092 	size_t x = rec->index_map_cnt;
1093 
1094 	if (realloc_array_as_needed(rec->index_map, rec->index_map_sz, x, NULL))
1095 		return -ENOMEM;
1096 	rec->index_map[x].evlist_pollfd_index = evlist_pollfd_index;
1097 	rec->index_map[x].thread_pollfd_index = thread_pollfd_index;
1098 	rec->index_map_cnt += 1;
1099 	return 0;
1100 }
1101 
1102 static int record__update_evlist_pollfd_from_thread(struct record *rec,
1103 						    struct evlist *evlist,
1104 						    struct record_thread *thread_data)
1105 {
1106 	struct pollfd *e_entries = evlist->core.pollfd.entries;
1107 	struct pollfd *t_entries = thread_data->pollfd.entries;
1108 	int err = 0;
1109 	size_t i;
1110 
1111 	for (i = 0; i < rec->index_map_cnt; i++) {
1112 		int e_pos = rec->index_map[i].evlist_pollfd_index;
1113 		int t_pos = rec->index_map[i].thread_pollfd_index;
1114 
1115 		if (e_entries[e_pos].fd != t_entries[t_pos].fd ||
1116 		    e_entries[e_pos].events != t_entries[t_pos].events) {
1117 			pr_err("Thread and evlist pollfd index mismatch\n");
1118 			err = -EINVAL;
1119 			continue;
1120 		}
1121 		e_entries[e_pos].revents = t_entries[t_pos].revents;
1122 	}
1123 	return err;
1124 }
1125 
1126 static int record__dup_non_perf_events(struct record *rec,
1127 				       struct evlist *evlist,
1128 				       struct record_thread *thread_data)
1129 {
1130 	struct fdarray *fda = &evlist->core.pollfd;
1131 	int i, ret;
1132 
1133 	for (i = 0; i < fda->nr; i++) {
1134 		if (!(fda->priv[i].flags & fdarray_flag__non_perf_event))
1135 			continue;
1136 		ret = fdarray__dup_entry_from(&thread_data->pollfd, i, fda);
1137 		if (ret < 0) {
1138 			pr_err("Failed to duplicate descriptor in main thread pollfd\n");
1139 			return ret;
1140 		}
1141 		pr_debug2("thread_data[%p]: pollfd[%d] <- non_perf_event fd=%d\n",
1142 			  thread_data, ret, fda->entries[i].fd);
1143 		ret = record__map_thread_evlist_pollfd_indexes(rec, i, ret);
1144 		if (ret < 0) {
1145 			pr_err("Failed to map thread and evlist pollfd indexes\n");
1146 			return ret;
1147 		}
1148 	}
1149 	return 0;
1150 }
1151 
1152 static int record__alloc_thread_data(struct record *rec, struct evlist *evlist)
1153 {
1154 	int t, ret;
1155 	struct record_thread *thread_data;
1156 
1157 	rec->thread_data = zalloc(rec->nr_threads * sizeof(*(rec->thread_data)));
1158 	if (!rec->thread_data) {
1159 		pr_err("Failed to allocate thread data\n");
1160 		return -ENOMEM;
1161 	}
1162 	thread_data = rec->thread_data;
1163 
1164 	for (t = 0; t < rec->nr_threads; t++)
1165 		record__thread_data_init_pipes(&thread_data[t]);
1166 
1167 	for (t = 0; t < rec->nr_threads; t++) {
1168 		thread_data[t].rec = rec;
1169 		thread_data[t].mask = &rec->thread_masks[t];
1170 		ret = record__thread_data_init_maps(&thread_data[t], evlist);
1171 		if (ret) {
1172 			pr_err("Failed to initialize thread[%d] maps\n", t);
1173 			goto out_free;
1174 		}
1175 		ret = record__thread_data_init_pollfd(&thread_data[t], evlist);
1176 		if (ret) {
1177 			pr_err("Failed to initialize thread[%d] pollfd\n", t);
1178 			goto out_free;
1179 		}
1180 		if (t) {
1181 			thread_data[t].tid = -1;
1182 			ret = record__thread_data_open_pipes(&thread_data[t]);
1183 			if (ret) {
1184 				pr_err("Failed to open thread[%d] communication pipes\n", t);
1185 				goto out_free;
1186 			}
1187 			ret = fdarray__add(&thread_data[t].pollfd, thread_data[t].pipes.msg[0],
1188 					   POLLIN | POLLERR | POLLHUP, fdarray_flag__nonfilterable);
1189 			if (ret < 0) {
1190 				pr_err("Failed to add descriptor to thread[%d] pollfd\n", t);
1191 				goto out_free;
1192 			}
1193 			thread_data[t].ctlfd_pos = ret;
1194 			pr_debug2("thread_data[%p]: pollfd[%d] <- ctl_fd=%d\n",
1195 				 thread_data, thread_data[t].ctlfd_pos,
1196 				 thread_data[t].pipes.msg[0]);
1197 		} else {
1198 			thread_data[t].tid = gettid();
1199 
1200 			ret = record__dup_non_perf_events(rec, evlist, &thread_data[t]);
1201 			if (ret < 0)
1202 				goto out_free;
1203 
1204 			thread_data[t].ctlfd_pos = -1; /* Not used */
1205 		}
1206 	}
1207 
1208 	return 0;
1209 
1210 out_free:
1211 	record__free_thread_data(rec);
1212 
1213 	return ret;
1214 }
1215 
1216 static int record__mmap_evlist(struct record *rec,
1217 			       struct evlist *evlist)
1218 {
1219 	int i, ret;
1220 	struct record_opts *opts = &rec->opts;
1221 	bool auxtrace_overwrite = opts->auxtrace_snapshot_mode ||
1222 				  opts->auxtrace_sample_mode;
1223 	char msg[512];
1224 
1225 	if (opts->affinity != PERF_AFFINITY_SYS)
1226 		cpu__setup_cpunode_map();
1227 
1228 	if (evlist__mmap_ex(evlist, opts->mmap_pages,
1229 				 opts->auxtrace_mmap_pages,
1230 				 auxtrace_overwrite,
1231 				 opts->nr_cblocks, opts->affinity,
1232 				 opts->mmap_flush, opts->comp_level) < 0) {
1233 		if (errno == EPERM) {
1234 			pr_err("Permission error mapping pages.\n"
1235 			       "Consider increasing "
1236 			       "/proc/sys/kernel/perf_event_mlock_kb,\n"
1237 			       "or try again with a smaller value of -m/--mmap_pages.\n"
1238 			       "(current value: %u,%u)\n",
1239 			       opts->mmap_pages, opts->auxtrace_mmap_pages);
1240 			return -errno;
1241 		} else {
1242 			pr_err("failed to mmap with %d (%s)\n", errno,
1243 				str_error_r(errno, msg, sizeof(msg)));
1244 			if (errno)
1245 				return -errno;
1246 			else
1247 				return -EINVAL;
1248 		}
1249 	}
1250 
1251 	if (evlist__initialize_ctlfd(evlist, opts->ctl_fd, opts->ctl_fd_ack))
1252 		return -1;
1253 
1254 	ret = record__alloc_thread_data(rec, evlist);
1255 	if (ret)
1256 		return ret;
1257 
1258 	if (record__threads_enabled(rec)) {
1259 		ret = perf_data__create_dir(&rec->data, evlist->core.nr_mmaps);
1260 		if (ret) {
1261 			pr_err("Failed to create data directory: %s\n", strerror(-ret));
1262 			return ret;
1263 		}
1264 		for (i = 0; i < evlist->core.nr_mmaps; i++) {
1265 			if (evlist->mmap)
1266 				evlist->mmap[i].file = &rec->data.dir.files[i];
1267 			if (evlist->overwrite_mmap)
1268 				evlist->overwrite_mmap[i].file = &rec->data.dir.files[i];
1269 		}
1270 	}
1271 
1272 	return 0;
1273 }
1274 
1275 static int record__mmap(struct record *rec)
1276 {
1277 	return record__mmap_evlist(rec, rec->evlist);
1278 }
1279 
1280 static int record__open(struct record *rec)
1281 {
1282 	char msg[BUFSIZ];
1283 	struct evsel *pos;
1284 	struct evlist *evlist = rec->evlist;
1285 	struct perf_session *session = rec->session;
1286 	struct record_opts *opts = &rec->opts;
1287 	int rc = 0;
1288 
1289 	/*
1290 	 * For initial_delay, system wide or a hybrid system, we need to add a
1291 	 * dummy event so that we can track PERF_RECORD_MMAP to cover the delay
1292 	 * of waiting or event synthesis.
1293 	 */
1294 	if (opts->initial_delay || target__has_cpu(&opts->target) ||
1295 	    perf_pmu__has_hybrid()) {
1296 		pos = evlist__get_tracking_event(evlist);
1297 		if (!evsel__is_dummy_event(pos)) {
1298 			/* Set up dummy event. */
1299 			if (evlist__add_dummy(evlist))
1300 				return -ENOMEM;
1301 			pos = evlist__last(evlist);
1302 			evlist__set_tracking_event(evlist, pos);
1303 		}
1304 
1305 		/*
1306 		 * Enable the dummy event when the process is forked for
1307 		 * initial_delay, immediately for system wide.
1308 		 */
1309 		if (opts->initial_delay && !pos->immediate &&
1310 		    !target__has_cpu(&opts->target))
1311 			pos->core.attr.enable_on_exec = 1;
1312 		else
1313 			pos->immediate = 1;
1314 	}
1315 
1316 	evlist__config(evlist, opts, &callchain_param);
1317 
1318 	evlist__for_each_entry(evlist, pos) {
1319 try_again:
1320 		if (evsel__open(pos, pos->core.cpus, pos->core.threads) < 0) {
1321 			if (evsel__fallback(pos, errno, msg, sizeof(msg))) {
1322 				if (verbose > 0)
1323 					ui__warning("%s\n", msg);
1324 				goto try_again;
1325 			}
1326 			if ((errno == EINVAL || errno == EBADF) &&
1327 			    pos->core.leader != &pos->core &&
1328 			    pos->weak_group) {
1329 			        pos = evlist__reset_weak_group(evlist, pos, true);
1330 				goto try_again;
1331 			}
1332 			rc = -errno;
1333 			evsel__open_strerror(pos, &opts->target, errno, msg, sizeof(msg));
1334 			ui__error("%s\n", msg);
1335 			goto out;
1336 		}
1337 
1338 		pos->supported = true;
1339 	}
1340 
1341 	if (symbol_conf.kptr_restrict && !evlist__exclude_kernel(evlist)) {
1342 		pr_warning(
1343 "WARNING: Kernel address maps (/proc/{kallsyms,modules}) are restricted,\n"
1344 "check /proc/sys/kernel/kptr_restrict and /proc/sys/kernel/perf_event_paranoid.\n\n"
1345 "Samples in kernel functions may not be resolved if a suitable vmlinux\n"
1346 "file is not found in the buildid cache or in the vmlinux path.\n\n"
1347 "Samples in kernel modules won't be resolved at all.\n\n"
1348 "If some relocation was applied (e.g. kexec) symbols may be misresolved\n"
1349 "even with a suitable vmlinux or kallsyms file.\n\n");
1350 	}
1351 
1352 	if (evlist__apply_filters(evlist, &pos)) {
1353 		pr_err("failed to set filter \"%s\" on event %s with %d (%s)\n",
1354 			pos->filter, evsel__name(pos), errno,
1355 			str_error_r(errno, msg, sizeof(msg)));
1356 		rc = -1;
1357 		goto out;
1358 	}
1359 
1360 	rc = record__mmap(rec);
1361 	if (rc)
1362 		goto out;
1363 
1364 	session->evlist = evlist;
1365 	perf_session__set_id_hdr_size(session);
1366 out:
1367 	return rc;
1368 }
1369 
1370 static void set_timestamp_boundary(struct record *rec, u64 sample_time)
1371 {
1372 	if (rec->evlist->first_sample_time == 0)
1373 		rec->evlist->first_sample_time = sample_time;
1374 
1375 	if (sample_time)
1376 		rec->evlist->last_sample_time = sample_time;
1377 }
1378 
1379 static int process_sample_event(struct perf_tool *tool,
1380 				union perf_event *event,
1381 				struct perf_sample *sample,
1382 				struct evsel *evsel,
1383 				struct machine *machine)
1384 {
1385 	struct record *rec = container_of(tool, struct record, tool);
1386 
1387 	set_timestamp_boundary(rec, sample->time);
1388 
1389 	if (rec->buildid_all)
1390 		return 0;
1391 
1392 	rec->samples++;
1393 	return build_id__mark_dso_hit(tool, event, sample, evsel, machine);
1394 }
1395 
1396 static int process_buildids(struct record *rec)
1397 {
1398 	struct perf_session *session = rec->session;
1399 
1400 	if (perf_data__size(&rec->data) == 0)
1401 		return 0;
1402 
1403 	/*
1404 	 * During this process, it'll load kernel map and replace the
1405 	 * dso->long_name to a real pathname it found.  In this case
1406 	 * we prefer the vmlinux path like
1407 	 *   /lib/modules/3.16.4/build/vmlinux
1408 	 *
1409 	 * rather than build-id path (in debug directory).
1410 	 *   $HOME/.debug/.build-id/f0/6e17aa50adf4d00b88925e03775de107611551
1411 	 */
1412 	symbol_conf.ignore_vmlinux_buildid = true;
1413 
1414 	/*
1415 	 * If --buildid-all is given, it marks all DSO regardless of hits,
1416 	 * so no need to process samples. But if timestamp_boundary is enabled,
1417 	 * it still needs to walk on all samples to get the timestamps of
1418 	 * first/last samples.
1419 	 */
1420 	if (rec->buildid_all && !rec->timestamp_boundary)
1421 		rec->tool.sample = NULL;
1422 
1423 	return perf_session__process_events(session);
1424 }
1425 
1426 static void perf_event__synthesize_guest_os(struct machine *machine, void *data)
1427 {
1428 	int err;
1429 	struct perf_tool *tool = data;
1430 	/*
1431 	 *As for guest kernel when processing subcommand record&report,
1432 	 *we arrange module mmap prior to guest kernel mmap and trigger
1433 	 *a preload dso because default guest module symbols are loaded
1434 	 *from guest kallsyms instead of /lib/modules/XXX/XXX. This
1435 	 *method is used to avoid symbol missing when the first addr is
1436 	 *in module instead of in guest kernel.
1437 	 */
1438 	err = perf_event__synthesize_modules(tool, process_synthesized_event,
1439 					     machine);
1440 	if (err < 0)
1441 		pr_err("Couldn't record guest kernel [%d]'s reference"
1442 		       " relocation symbol.\n", machine->pid);
1443 
1444 	/*
1445 	 * We use _stext for guest kernel because guest kernel's /proc/kallsyms
1446 	 * have no _text sometimes.
1447 	 */
1448 	err = perf_event__synthesize_kernel_mmap(tool, process_synthesized_event,
1449 						 machine);
1450 	if (err < 0)
1451 		pr_err("Couldn't record guest kernel [%d]'s reference"
1452 		       " relocation symbol.\n", machine->pid);
1453 }
1454 
1455 static struct perf_event_header finished_round_event = {
1456 	.size = sizeof(struct perf_event_header),
1457 	.type = PERF_RECORD_FINISHED_ROUND,
1458 };
1459 
1460 static struct perf_event_header finished_init_event = {
1461 	.size = sizeof(struct perf_event_header),
1462 	.type = PERF_RECORD_FINISHED_INIT,
1463 };
1464 
1465 static void record__adjust_affinity(struct record *rec, struct mmap *map)
1466 {
1467 	if (rec->opts.affinity != PERF_AFFINITY_SYS &&
1468 	    !bitmap_equal(thread->mask->affinity.bits, map->affinity_mask.bits,
1469 			  thread->mask->affinity.nbits)) {
1470 		bitmap_zero(thread->mask->affinity.bits, thread->mask->affinity.nbits);
1471 		bitmap_or(thread->mask->affinity.bits, thread->mask->affinity.bits,
1472 			  map->affinity_mask.bits, thread->mask->affinity.nbits);
1473 		sched_setaffinity(0, MMAP_CPU_MASK_BYTES(&thread->mask->affinity),
1474 					(cpu_set_t *)thread->mask->affinity.bits);
1475 		if (verbose == 2) {
1476 			pr_debug("threads[%d]: running on cpu%d: ", thread->tid, sched_getcpu());
1477 			mmap_cpu_mask__scnprintf(&thread->mask->affinity, "affinity");
1478 		}
1479 	}
1480 }
1481 
1482 static size_t process_comp_header(void *record, size_t increment)
1483 {
1484 	struct perf_record_compressed *event = record;
1485 	size_t size = sizeof(*event);
1486 
1487 	if (increment) {
1488 		event->header.size += increment;
1489 		return increment;
1490 	}
1491 
1492 	event->header.type = PERF_RECORD_COMPRESSED;
1493 	event->header.size = size;
1494 
1495 	return size;
1496 }
1497 
1498 static size_t zstd_compress(struct perf_session *session, struct mmap *map,
1499 			    void *dst, size_t dst_size, void *src, size_t src_size)
1500 {
1501 	size_t compressed;
1502 	size_t max_record_size = PERF_SAMPLE_MAX_SIZE - sizeof(struct perf_record_compressed) - 1;
1503 	struct zstd_data *zstd_data = &session->zstd_data;
1504 
1505 	if (map && map->file)
1506 		zstd_data = &map->zstd_data;
1507 
1508 	compressed = zstd_compress_stream_to_records(zstd_data, dst, dst_size, src, src_size,
1509 						     max_record_size, process_comp_header);
1510 
1511 	if (map && map->file) {
1512 		thread->bytes_transferred += src_size;
1513 		thread->bytes_compressed  += compressed;
1514 	} else {
1515 		session->bytes_transferred += src_size;
1516 		session->bytes_compressed  += compressed;
1517 	}
1518 
1519 	return compressed;
1520 }
1521 
1522 static int record__mmap_read_evlist(struct record *rec, struct evlist *evlist,
1523 				    bool overwrite, bool synch)
1524 {
1525 	u64 bytes_written = rec->bytes_written;
1526 	int i;
1527 	int rc = 0;
1528 	int nr_mmaps;
1529 	struct mmap **maps;
1530 	int trace_fd = rec->data.file.fd;
1531 	off_t off = 0;
1532 
1533 	if (!evlist)
1534 		return 0;
1535 
1536 	nr_mmaps = thread->nr_mmaps;
1537 	maps = overwrite ? thread->overwrite_maps : thread->maps;
1538 
1539 	if (!maps)
1540 		return 0;
1541 
1542 	if (overwrite && evlist->bkw_mmap_state != BKW_MMAP_DATA_PENDING)
1543 		return 0;
1544 
1545 	if (record__aio_enabled(rec))
1546 		off = record__aio_get_pos(trace_fd);
1547 
1548 	for (i = 0; i < nr_mmaps; i++) {
1549 		u64 flush = 0;
1550 		struct mmap *map = maps[i];
1551 
1552 		if (map->core.base) {
1553 			record__adjust_affinity(rec, map);
1554 			if (synch) {
1555 				flush = map->core.flush;
1556 				map->core.flush = 1;
1557 			}
1558 			if (!record__aio_enabled(rec)) {
1559 				if (perf_mmap__push(map, rec, record__pushfn) < 0) {
1560 					if (synch)
1561 						map->core.flush = flush;
1562 					rc = -1;
1563 					goto out;
1564 				}
1565 			} else {
1566 				if (record__aio_push(rec, map, &off) < 0) {
1567 					record__aio_set_pos(trace_fd, off);
1568 					if (synch)
1569 						map->core.flush = flush;
1570 					rc = -1;
1571 					goto out;
1572 				}
1573 			}
1574 			if (synch)
1575 				map->core.flush = flush;
1576 		}
1577 
1578 		if (map->auxtrace_mmap.base && !rec->opts.auxtrace_snapshot_mode &&
1579 		    !rec->opts.auxtrace_sample_mode &&
1580 		    record__auxtrace_mmap_read(rec, map) != 0) {
1581 			rc = -1;
1582 			goto out;
1583 		}
1584 	}
1585 
1586 	if (record__aio_enabled(rec))
1587 		record__aio_set_pos(trace_fd, off);
1588 
1589 	/*
1590 	 * Mark the round finished in case we wrote
1591 	 * at least one event.
1592 	 *
1593 	 * No need for round events in directory mode,
1594 	 * because per-cpu maps and files have data
1595 	 * sorted by kernel.
1596 	 */
1597 	if (!record__threads_enabled(rec) && bytes_written != rec->bytes_written)
1598 		rc = record__write(rec, NULL, &finished_round_event, sizeof(finished_round_event));
1599 
1600 	if (overwrite)
1601 		evlist__toggle_bkw_mmap(evlist, BKW_MMAP_EMPTY);
1602 out:
1603 	return rc;
1604 }
1605 
1606 static int record__mmap_read_all(struct record *rec, bool synch)
1607 {
1608 	int err;
1609 
1610 	err = record__mmap_read_evlist(rec, rec->evlist, false, synch);
1611 	if (err)
1612 		return err;
1613 
1614 	return record__mmap_read_evlist(rec, rec->evlist, true, synch);
1615 }
1616 
1617 static void record__thread_munmap_filtered(struct fdarray *fda, int fd,
1618 					   void *arg __maybe_unused)
1619 {
1620 	struct perf_mmap *map = fda->priv[fd].ptr;
1621 
1622 	if (map)
1623 		perf_mmap__put(map);
1624 }
1625 
1626 static void *record__thread(void *arg)
1627 {
1628 	enum thread_msg msg = THREAD_MSG__READY;
1629 	bool terminate = false;
1630 	struct fdarray *pollfd;
1631 	int err, ctlfd_pos;
1632 
1633 	thread = arg;
1634 	thread->tid = gettid();
1635 
1636 	err = write(thread->pipes.ack[1], &msg, sizeof(msg));
1637 	if (err == -1)
1638 		pr_warning("threads[%d]: failed to notify on start: %s\n",
1639 			   thread->tid, strerror(errno));
1640 
1641 	pr_debug("threads[%d]: started on cpu%d\n", thread->tid, sched_getcpu());
1642 
1643 	pollfd = &thread->pollfd;
1644 	ctlfd_pos = thread->ctlfd_pos;
1645 
1646 	for (;;) {
1647 		unsigned long long hits = thread->samples;
1648 
1649 		if (record__mmap_read_all(thread->rec, false) < 0 || terminate)
1650 			break;
1651 
1652 		if (hits == thread->samples) {
1653 
1654 			err = fdarray__poll(pollfd, -1);
1655 			/*
1656 			 * Propagate error, only if there's any. Ignore positive
1657 			 * number of returned events and interrupt error.
1658 			 */
1659 			if (err > 0 || (err < 0 && errno == EINTR))
1660 				err = 0;
1661 			thread->waking++;
1662 
1663 			if (fdarray__filter(pollfd, POLLERR | POLLHUP,
1664 					    record__thread_munmap_filtered, NULL) == 0)
1665 				break;
1666 		}
1667 
1668 		if (pollfd->entries[ctlfd_pos].revents & POLLHUP) {
1669 			terminate = true;
1670 			close(thread->pipes.msg[0]);
1671 			thread->pipes.msg[0] = -1;
1672 			pollfd->entries[ctlfd_pos].fd = -1;
1673 			pollfd->entries[ctlfd_pos].events = 0;
1674 		}
1675 
1676 		pollfd->entries[ctlfd_pos].revents = 0;
1677 	}
1678 	record__mmap_read_all(thread->rec, true);
1679 
1680 	err = write(thread->pipes.ack[1], &msg, sizeof(msg));
1681 	if (err == -1)
1682 		pr_warning("threads[%d]: failed to notify on termination: %s\n",
1683 			   thread->tid, strerror(errno));
1684 
1685 	return NULL;
1686 }
1687 
1688 static void record__init_features(struct record *rec)
1689 {
1690 	struct perf_session *session = rec->session;
1691 	int feat;
1692 
1693 	for (feat = HEADER_FIRST_FEATURE; feat < HEADER_LAST_FEATURE; feat++)
1694 		perf_header__set_feat(&session->header, feat);
1695 
1696 	if (rec->no_buildid)
1697 		perf_header__clear_feat(&session->header, HEADER_BUILD_ID);
1698 
1699 	if (!have_tracepoints(&rec->evlist->core.entries))
1700 		perf_header__clear_feat(&session->header, HEADER_TRACING_DATA);
1701 
1702 	if (!rec->opts.branch_stack)
1703 		perf_header__clear_feat(&session->header, HEADER_BRANCH_STACK);
1704 
1705 	if (!rec->opts.full_auxtrace)
1706 		perf_header__clear_feat(&session->header, HEADER_AUXTRACE);
1707 
1708 	if (!(rec->opts.use_clockid && rec->opts.clockid_res_ns))
1709 		perf_header__clear_feat(&session->header, HEADER_CLOCKID);
1710 
1711 	if (!rec->opts.use_clockid)
1712 		perf_header__clear_feat(&session->header, HEADER_CLOCK_DATA);
1713 
1714 	if (!record__threads_enabled(rec))
1715 		perf_header__clear_feat(&session->header, HEADER_DIR_FORMAT);
1716 
1717 	if (!record__comp_enabled(rec))
1718 		perf_header__clear_feat(&session->header, HEADER_COMPRESSED);
1719 
1720 	perf_header__clear_feat(&session->header, HEADER_STAT);
1721 }
1722 
1723 static void
1724 record__finish_output(struct record *rec)
1725 {
1726 	int i;
1727 	struct perf_data *data = &rec->data;
1728 	int fd = perf_data__fd(data);
1729 
1730 	if (data->is_pipe)
1731 		return;
1732 
1733 	rec->session->header.data_size += rec->bytes_written;
1734 	data->file.size = lseek(perf_data__fd(data), 0, SEEK_CUR);
1735 	if (record__threads_enabled(rec)) {
1736 		for (i = 0; i < data->dir.nr; i++)
1737 			data->dir.files[i].size = lseek(data->dir.files[i].fd, 0, SEEK_CUR);
1738 	}
1739 
1740 	if (!rec->no_buildid) {
1741 		process_buildids(rec);
1742 
1743 		if (rec->buildid_all)
1744 			dsos__hit_all(rec->session);
1745 	}
1746 	perf_session__write_header(rec->session, rec->evlist, fd, true);
1747 
1748 	return;
1749 }
1750 
1751 static int record__synthesize_workload(struct record *rec, bool tail)
1752 {
1753 	int err;
1754 	struct perf_thread_map *thread_map;
1755 	bool needs_mmap = rec->opts.synth & PERF_SYNTH_MMAP;
1756 
1757 	if (rec->opts.tail_synthesize != tail)
1758 		return 0;
1759 
1760 	thread_map = thread_map__new_by_tid(rec->evlist->workload.pid);
1761 	if (thread_map == NULL)
1762 		return -1;
1763 
1764 	err = perf_event__synthesize_thread_map(&rec->tool, thread_map,
1765 						 process_synthesized_event,
1766 						 &rec->session->machines.host,
1767 						 needs_mmap,
1768 						 rec->opts.sample_address);
1769 	perf_thread_map__put(thread_map);
1770 	return err;
1771 }
1772 
1773 static int write_finished_init(struct record *rec, bool tail)
1774 {
1775 	if (rec->opts.tail_synthesize != tail)
1776 		return 0;
1777 
1778 	return record__write(rec, NULL, &finished_init_event, sizeof(finished_init_event));
1779 }
1780 
1781 static int record__synthesize(struct record *rec, bool tail);
1782 
1783 static int
1784 record__switch_output(struct record *rec, bool at_exit)
1785 {
1786 	struct perf_data *data = &rec->data;
1787 	int fd, err;
1788 	char *new_filename;
1789 
1790 	/* Same Size:      "2015122520103046"*/
1791 	char timestamp[] = "InvalidTimestamp";
1792 
1793 	record__aio_mmap_read_sync(rec);
1794 
1795 	write_finished_init(rec, true);
1796 
1797 	record__synthesize(rec, true);
1798 	if (target__none(&rec->opts.target))
1799 		record__synthesize_workload(rec, true);
1800 
1801 	rec->samples = 0;
1802 	record__finish_output(rec);
1803 	err = fetch_current_timestamp(timestamp, sizeof(timestamp));
1804 	if (err) {
1805 		pr_err("Failed to get current timestamp\n");
1806 		return -EINVAL;
1807 	}
1808 
1809 	fd = perf_data__switch(data, timestamp,
1810 				    rec->session->header.data_offset,
1811 				    at_exit, &new_filename);
1812 	if (fd >= 0 && !at_exit) {
1813 		rec->bytes_written = 0;
1814 		rec->session->header.data_size = 0;
1815 	}
1816 
1817 	if (!quiet)
1818 		fprintf(stderr, "[ perf record: Dump %s.%s ]\n",
1819 			data->path, timestamp);
1820 
1821 	if (rec->switch_output.num_files) {
1822 		int n = rec->switch_output.cur_file + 1;
1823 
1824 		if (n >= rec->switch_output.num_files)
1825 			n = 0;
1826 		rec->switch_output.cur_file = n;
1827 		if (rec->switch_output.filenames[n]) {
1828 			remove(rec->switch_output.filenames[n]);
1829 			zfree(&rec->switch_output.filenames[n]);
1830 		}
1831 		rec->switch_output.filenames[n] = new_filename;
1832 	} else {
1833 		free(new_filename);
1834 	}
1835 
1836 	/* Output tracking events */
1837 	if (!at_exit) {
1838 		record__synthesize(rec, false);
1839 
1840 		/*
1841 		 * In 'perf record --switch-output' without -a,
1842 		 * record__synthesize() in record__switch_output() won't
1843 		 * generate tracking events because there's no thread_map
1844 		 * in evlist. Which causes newly created perf.data doesn't
1845 		 * contain map and comm information.
1846 		 * Create a fake thread_map and directly call
1847 		 * perf_event__synthesize_thread_map() for those events.
1848 		 */
1849 		if (target__none(&rec->opts.target))
1850 			record__synthesize_workload(rec, false);
1851 		write_finished_init(rec, false);
1852 	}
1853 	return fd;
1854 }
1855 
1856 static void __record__read_lost_samples(struct record *rec, struct evsel *evsel,
1857 					struct perf_record_lost_samples *lost,
1858 					int cpu_idx, int thread_idx)
1859 {
1860 	struct perf_counts_values count;
1861 	struct perf_sample_id *sid;
1862 	struct perf_sample sample = {};
1863 	int id_hdr_size;
1864 
1865 	if (perf_evsel__read(&evsel->core, cpu_idx, thread_idx, &count) < 0) {
1866 		pr_err("read LOST count failed\n");
1867 		return;
1868 	}
1869 
1870 	if (count.lost == 0)
1871 		return;
1872 
1873 	lost->lost = count.lost;
1874 	if (evsel->core.ids) {
1875 		sid = xyarray__entry(evsel->core.sample_id, cpu_idx, thread_idx);
1876 		sample.id = sid->id;
1877 	}
1878 
1879 	id_hdr_size = perf_event__synthesize_id_sample((void *)(lost + 1),
1880 						       evsel->core.attr.sample_type, &sample);
1881 	lost->header.size = sizeof(*lost) + id_hdr_size;
1882 	record__write(rec, NULL, lost, lost->header.size);
1883 }
1884 
1885 static void record__read_lost_samples(struct record *rec)
1886 {
1887 	struct perf_session *session = rec->session;
1888 	struct perf_record_lost_samples *lost;
1889 	struct evsel *evsel;
1890 
1891 	/* there was an error during record__open */
1892 	if (session->evlist == NULL)
1893 		return;
1894 
1895 	lost = zalloc(PERF_SAMPLE_MAX_SIZE);
1896 	if (lost == NULL) {
1897 		pr_debug("Memory allocation failed\n");
1898 		return;
1899 	}
1900 
1901 	lost->header.type = PERF_RECORD_LOST_SAMPLES;
1902 
1903 	evlist__for_each_entry(session->evlist, evsel) {
1904 		struct xyarray *xy = evsel->core.sample_id;
1905 
1906 		if (xy == NULL || evsel->core.fd == NULL)
1907 			continue;
1908 		if (xyarray__max_x(evsel->core.fd) != xyarray__max_x(xy) ||
1909 		    xyarray__max_y(evsel->core.fd) != xyarray__max_y(xy)) {
1910 			pr_debug("Unmatched FD vs. sample ID: skip reading LOST count\n");
1911 			continue;
1912 		}
1913 
1914 		for (int x = 0; x < xyarray__max_x(xy); x++) {
1915 			for (int y = 0; y < xyarray__max_y(xy); y++) {
1916 				__record__read_lost_samples(rec, evsel, lost, x, y);
1917 			}
1918 		}
1919 	}
1920 	free(lost);
1921 
1922 }
1923 
1924 static volatile int workload_exec_errno;
1925 
1926 /*
1927  * evlist__prepare_workload will send a SIGUSR1
1928  * if the fork fails, since we asked by setting its
1929  * want_signal to true.
1930  */
1931 static void workload_exec_failed_signal(int signo __maybe_unused,
1932 					siginfo_t *info,
1933 					void *ucontext __maybe_unused)
1934 {
1935 	workload_exec_errno = info->si_value.sival_int;
1936 	done = 1;
1937 	child_finished = 1;
1938 }
1939 
1940 static void snapshot_sig_handler(int sig);
1941 static void alarm_sig_handler(int sig);
1942 
1943 static const struct perf_event_mmap_page *evlist__pick_pc(struct evlist *evlist)
1944 {
1945 	if (evlist) {
1946 		if (evlist->mmap && evlist->mmap[0].core.base)
1947 			return evlist->mmap[0].core.base;
1948 		if (evlist->overwrite_mmap && evlist->overwrite_mmap[0].core.base)
1949 			return evlist->overwrite_mmap[0].core.base;
1950 	}
1951 	return NULL;
1952 }
1953 
1954 static const struct perf_event_mmap_page *record__pick_pc(struct record *rec)
1955 {
1956 	const struct perf_event_mmap_page *pc = evlist__pick_pc(rec->evlist);
1957 	if (pc)
1958 		return pc;
1959 	return NULL;
1960 }
1961 
1962 static int record__synthesize(struct record *rec, bool tail)
1963 {
1964 	struct perf_session *session = rec->session;
1965 	struct machine *machine = &session->machines.host;
1966 	struct perf_data *data = &rec->data;
1967 	struct record_opts *opts = &rec->opts;
1968 	struct perf_tool *tool = &rec->tool;
1969 	int err = 0;
1970 	event_op f = process_synthesized_event;
1971 
1972 	if (rec->opts.tail_synthesize != tail)
1973 		return 0;
1974 
1975 	if (data->is_pipe) {
1976 		err = perf_event__synthesize_for_pipe(tool, session, data,
1977 						      process_synthesized_event);
1978 		if (err < 0)
1979 			goto out;
1980 
1981 		rec->bytes_written += err;
1982 	}
1983 
1984 	err = perf_event__synth_time_conv(record__pick_pc(rec), tool,
1985 					  process_synthesized_event, machine);
1986 	if (err)
1987 		goto out;
1988 
1989 	/* Synthesize id_index before auxtrace_info */
1990 	err = perf_event__synthesize_id_index(tool,
1991 					      process_synthesized_event,
1992 					      session->evlist, machine);
1993 	if (err)
1994 		goto out;
1995 
1996 	if (rec->opts.full_auxtrace) {
1997 		err = perf_event__synthesize_auxtrace_info(rec->itr, tool,
1998 					session, process_synthesized_event);
1999 		if (err)
2000 			goto out;
2001 	}
2002 
2003 	if (!evlist__exclude_kernel(rec->evlist)) {
2004 		err = perf_event__synthesize_kernel_mmap(tool, process_synthesized_event,
2005 							 machine);
2006 		WARN_ONCE(err < 0, "Couldn't record kernel reference relocation symbol\n"
2007 				   "Symbol resolution may be skewed if relocation was used (e.g. kexec).\n"
2008 				   "Check /proc/kallsyms permission or run as root.\n");
2009 
2010 		err = perf_event__synthesize_modules(tool, process_synthesized_event,
2011 						     machine);
2012 		WARN_ONCE(err < 0, "Couldn't record kernel module information.\n"
2013 				   "Symbol resolution may be skewed if relocation was used (e.g. kexec).\n"
2014 				   "Check /proc/modules permission or run as root.\n");
2015 	}
2016 
2017 	if (perf_guest) {
2018 		machines__process_guests(&session->machines,
2019 					 perf_event__synthesize_guest_os, tool);
2020 	}
2021 
2022 	err = perf_event__synthesize_extra_attr(&rec->tool,
2023 						rec->evlist,
2024 						process_synthesized_event,
2025 						data->is_pipe);
2026 	if (err)
2027 		goto out;
2028 
2029 	err = perf_event__synthesize_thread_map2(&rec->tool, rec->evlist->core.threads,
2030 						 process_synthesized_event,
2031 						NULL);
2032 	if (err < 0) {
2033 		pr_err("Couldn't synthesize thread map.\n");
2034 		return err;
2035 	}
2036 
2037 	err = perf_event__synthesize_cpu_map(&rec->tool, rec->evlist->core.all_cpus,
2038 					     process_synthesized_event, NULL);
2039 	if (err < 0) {
2040 		pr_err("Couldn't synthesize cpu map.\n");
2041 		return err;
2042 	}
2043 
2044 	err = perf_event__synthesize_bpf_events(session, process_synthesized_event,
2045 						machine, opts);
2046 	if (err < 0) {
2047 		pr_warning("Couldn't synthesize bpf events.\n");
2048 		err = 0;
2049 	}
2050 
2051 	if (rec->opts.synth & PERF_SYNTH_CGROUP) {
2052 		err = perf_event__synthesize_cgroups(tool, process_synthesized_event,
2053 						     machine);
2054 		if (err < 0) {
2055 			pr_warning("Couldn't synthesize cgroup events.\n");
2056 			err = 0;
2057 		}
2058 	}
2059 
2060 	if (rec->opts.nr_threads_synthesize > 1) {
2061 		mutex_init(&synth_lock);
2062 		perf_set_multithreaded();
2063 		f = process_locked_synthesized_event;
2064 	}
2065 
2066 	if (rec->opts.synth & PERF_SYNTH_TASK) {
2067 		bool needs_mmap = rec->opts.synth & PERF_SYNTH_MMAP;
2068 
2069 		err = __machine__synthesize_threads(machine, tool, &opts->target,
2070 						    rec->evlist->core.threads,
2071 						    f, needs_mmap, opts->sample_address,
2072 						    rec->opts.nr_threads_synthesize);
2073 	}
2074 
2075 	if (rec->opts.nr_threads_synthesize > 1) {
2076 		perf_set_singlethreaded();
2077 		mutex_destroy(&synth_lock);
2078 	}
2079 
2080 out:
2081 	return err;
2082 }
2083 
2084 static int record__process_signal_event(union perf_event *event __maybe_unused, void *data)
2085 {
2086 	struct record *rec = data;
2087 	pthread_kill(rec->thread_id, SIGUSR2);
2088 	return 0;
2089 }
2090 
2091 static int record__setup_sb_evlist(struct record *rec)
2092 {
2093 	struct record_opts *opts = &rec->opts;
2094 
2095 	if (rec->sb_evlist != NULL) {
2096 		/*
2097 		 * We get here if --switch-output-event populated the
2098 		 * sb_evlist, so associate a callback that will send a SIGUSR2
2099 		 * to the main thread.
2100 		 */
2101 		evlist__set_cb(rec->sb_evlist, record__process_signal_event, rec);
2102 		rec->thread_id = pthread_self();
2103 	}
2104 #ifdef HAVE_LIBBPF_SUPPORT
2105 	if (!opts->no_bpf_event) {
2106 		if (rec->sb_evlist == NULL) {
2107 			rec->sb_evlist = evlist__new();
2108 
2109 			if (rec->sb_evlist == NULL) {
2110 				pr_err("Couldn't create side band evlist.\n.");
2111 				return -1;
2112 			}
2113 		}
2114 
2115 		if (evlist__add_bpf_sb_event(rec->sb_evlist, &rec->session->header.env)) {
2116 			pr_err("Couldn't ask for PERF_RECORD_BPF_EVENT side band events.\n.");
2117 			return -1;
2118 		}
2119 	}
2120 #endif
2121 	if (evlist__start_sb_thread(rec->sb_evlist, &rec->opts.target)) {
2122 		pr_debug("Couldn't start the BPF side band thread:\nBPF programs starting from now on won't be annotatable\n");
2123 		opts->no_bpf_event = true;
2124 	}
2125 
2126 	return 0;
2127 }
2128 
2129 static int record__init_clock(struct record *rec)
2130 {
2131 	struct perf_session *session = rec->session;
2132 	struct timespec ref_clockid;
2133 	struct timeval ref_tod;
2134 	u64 ref;
2135 
2136 	if (!rec->opts.use_clockid)
2137 		return 0;
2138 
2139 	if (rec->opts.use_clockid && rec->opts.clockid_res_ns)
2140 		session->header.env.clock.clockid_res_ns = rec->opts.clockid_res_ns;
2141 
2142 	session->header.env.clock.clockid = rec->opts.clockid;
2143 
2144 	if (gettimeofday(&ref_tod, NULL) != 0) {
2145 		pr_err("gettimeofday failed, cannot set reference time.\n");
2146 		return -1;
2147 	}
2148 
2149 	if (clock_gettime(rec->opts.clockid, &ref_clockid)) {
2150 		pr_err("clock_gettime failed, cannot set reference time.\n");
2151 		return -1;
2152 	}
2153 
2154 	ref = (u64) ref_tod.tv_sec * NSEC_PER_SEC +
2155 	      (u64) ref_tod.tv_usec * NSEC_PER_USEC;
2156 
2157 	session->header.env.clock.tod_ns = ref;
2158 
2159 	ref = (u64) ref_clockid.tv_sec * NSEC_PER_SEC +
2160 	      (u64) ref_clockid.tv_nsec;
2161 
2162 	session->header.env.clock.clockid_ns = ref;
2163 	return 0;
2164 }
2165 
2166 static void hit_auxtrace_snapshot_trigger(struct record *rec)
2167 {
2168 	if (trigger_is_ready(&auxtrace_snapshot_trigger)) {
2169 		trigger_hit(&auxtrace_snapshot_trigger);
2170 		auxtrace_record__snapshot_started = 1;
2171 		if (auxtrace_record__snapshot_start(rec->itr))
2172 			trigger_error(&auxtrace_snapshot_trigger);
2173 	}
2174 }
2175 
2176 static void record__uniquify_name(struct record *rec)
2177 {
2178 	struct evsel *pos;
2179 	struct evlist *evlist = rec->evlist;
2180 	char *new_name;
2181 	int ret;
2182 
2183 	if (!perf_pmu__has_hybrid())
2184 		return;
2185 
2186 	evlist__for_each_entry(evlist, pos) {
2187 		if (!evsel__is_hybrid(pos))
2188 			continue;
2189 
2190 		if (strchr(pos->name, '/'))
2191 			continue;
2192 
2193 		ret = asprintf(&new_name, "%s/%s/",
2194 			       pos->pmu_name, pos->name);
2195 		if (ret) {
2196 			free(pos->name);
2197 			pos->name = new_name;
2198 		}
2199 	}
2200 }
2201 
2202 static int record__terminate_thread(struct record_thread *thread_data)
2203 {
2204 	int err;
2205 	enum thread_msg ack = THREAD_MSG__UNDEFINED;
2206 	pid_t tid = thread_data->tid;
2207 
2208 	close(thread_data->pipes.msg[1]);
2209 	thread_data->pipes.msg[1] = -1;
2210 	err = read(thread_data->pipes.ack[0], &ack, sizeof(ack));
2211 	if (err > 0)
2212 		pr_debug2("threads[%d]: sent %s\n", tid, thread_msg_tags[ack]);
2213 	else
2214 		pr_warning("threads[%d]: failed to receive termination notification from %d\n",
2215 			   thread->tid, tid);
2216 
2217 	return 0;
2218 }
2219 
2220 static int record__start_threads(struct record *rec)
2221 {
2222 	int t, tt, err, ret = 0, nr_threads = rec->nr_threads;
2223 	struct record_thread *thread_data = rec->thread_data;
2224 	sigset_t full, mask;
2225 	pthread_t handle;
2226 	pthread_attr_t attrs;
2227 
2228 	thread = &thread_data[0];
2229 
2230 	if (!record__threads_enabled(rec))
2231 		return 0;
2232 
2233 	sigfillset(&full);
2234 	if (sigprocmask(SIG_SETMASK, &full, &mask)) {
2235 		pr_err("Failed to block signals on threads start: %s\n", strerror(errno));
2236 		return -1;
2237 	}
2238 
2239 	pthread_attr_init(&attrs);
2240 	pthread_attr_setdetachstate(&attrs, PTHREAD_CREATE_DETACHED);
2241 
2242 	for (t = 1; t < nr_threads; t++) {
2243 		enum thread_msg msg = THREAD_MSG__UNDEFINED;
2244 
2245 #ifdef HAVE_PTHREAD_ATTR_SETAFFINITY_NP
2246 		pthread_attr_setaffinity_np(&attrs,
2247 					    MMAP_CPU_MASK_BYTES(&(thread_data[t].mask->affinity)),
2248 					    (cpu_set_t *)(thread_data[t].mask->affinity.bits));
2249 #endif
2250 		if (pthread_create(&handle, &attrs, record__thread, &thread_data[t])) {
2251 			for (tt = 1; tt < t; tt++)
2252 				record__terminate_thread(&thread_data[t]);
2253 			pr_err("Failed to start threads: %s\n", strerror(errno));
2254 			ret = -1;
2255 			goto out_err;
2256 		}
2257 
2258 		err = read(thread_data[t].pipes.ack[0], &msg, sizeof(msg));
2259 		if (err > 0)
2260 			pr_debug2("threads[%d]: sent %s\n", rec->thread_data[t].tid,
2261 				  thread_msg_tags[msg]);
2262 		else
2263 			pr_warning("threads[%d]: failed to receive start notification from %d\n",
2264 				   thread->tid, rec->thread_data[t].tid);
2265 	}
2266 
2267 	sched_setaffinity(0, MMAP_CPU_MASK_BYTES(&thread->mask->affinity),
2268 			(cpu_set_t *)thread->mask->affinity.bits);
2269 
2270 	pr_debug("threads[%d]: started on cpu%d\n", thread->tid, sched_getcpu());
2271 
2272 out_err:
2273 	pthread_attr_destroy(&attrs);
2274 
2275 	if (sigprocmask(SIG_SETMASK, &mask, NULL)) {
2276 		pr_err("Failed to unblock signals on threads start: %s\n", strerror(errno));
2277 		ret = -1;
2278 	}
2279 
2280 	return ret;
2281 }
2282 
2283 static int record__stop_threads(struct record *rec)
2284 {
2285 	int t;
2286 	struct record_thread *thread_data = rec->thread_data;
2287 
2288 	for (t = 1; t < rec->nr_threads; t++)
2289 		record__terminate_thread(&thread_data[t]);
2290 
2291 	for (t = 0; t < rec->nr_threads; t++) {
2292 		rec->samples += thread_data[t].samples;
2293 		if (!record__threads_enabled(rec))
2294 			continue;
2295 		rec->session->bytes_transferred += thread_data[t].bytes_transferred;
2296 		rec->session->bytes_compressed += thread_data[t].bytes_compressed;
2297 		pr_debug("threads[%d]: samples=%lld, wakes=%ld, ", thread_data[t].tid,
2298 			 thread_data[t].samples, thread_data[t].waking);
2299 		if (thread_data[t].bytes_transferred && thread_data[t].bytes_compressed)
2300 			pr_debug("transferred=%" PRIu64 ", compressed=%" PRIu64 "\n",
2301 				 thread_data[t].bytes_transferred, thread_data[t].bytes_compressed);
2302 		else
2303 			pr_debug("written=%" PRIu64 "\n", thread_data[t].bytes_written);
2304 	}
2305 
2306 	return 0;
2307 }
2308 
2309 static unsigned long record__waking(struct record *rec)
2310 {
2311 	int t;
2312 	unsigned long waking = 0;
2313 	struct record_thread *thread_data = rec->thread_data;
2314 
2315 	for (t = 0; t < rec->nr_threads; t++)
2316 		waking += thread_data[t].waking;
2317 
2318 	return waking;
2319 }
2320 
2321 static int __cmd_record(struct record *rec, int argc, const char **argv)
2322 {
2323 	int err;
2324 	int status = 0;
2325 	const bool forks = argc > 0;
2326 	struct perf_tool *tool = &rec->tool;
2327 	struct record_opts *opts = &rec->opts;
2328 	struct perf_data *data = &rec->data;
2329 	struct perf_session *session;
2330 	bool disabled = false, draining = false;
2331 	int fd;
2332 	float ratio = 0;
2333 	enum evlist_ctl_cmd cmd = EVLIST_CTL_CMD_UNSUPPORTED;
2334 
2335 	atexit(record__sig_exit);
2336 	signal(SIGCHLD, sig_handler);
2337 	signal(SIGINT, sig_handler);
2338 	signal(SIGTERM, sig_handler);
2339 	signal(SIGSEGV, sigsegv_handler);
2340 
2341 	if (rec->opts.record_namespaces)
2342 		tool->namespace_events = true;
2343 
2344 	if (rec->opts.record_cgroup) {
2345 #ifdef HAVE_FILE_HANDLE
2346 		tool->cgroup_events = true;
2347 #else
2348 		pr_err("cgroup tracking is not supported\n");
2349 		return -1;
2350 #endif
2351 	}
2352 
2353 	if (rec->opts.auxtrace_snapshot_mode || rec->switch_output.enabled) {
2354 		signal(SIGUSR2, snapshot_sig_handler);
2355 		if (rec->opts.auxtrace_snapshot_mode)
2356 			trigger_on(&auxtrace_snapshot_trigger);
2357 		if (rec->switch_output.enabled)
2358 			trigger_on(&switch_output_trigger);
2359 	} else {
2360 		signal(SIGUSR2, SIG_IGN);
2361 	}
2362 
2363 	session = perf_session__new(data, tool);
2364 	if (IS_ERR(session)) {
2365 		pr_err("Perf session creation failed.\n");
2366 		return PTR_ERR(session);
2367 	}
2368 
2369 	if (record__threads_enabled(rec)) {
2370 		if (perf_data__is_pipe(&rec->data)) {
2371 			pr_err("Parallel trace streaming is not available in pipe mode.\n");
2372 			return -1;
2373 		}
2374 		if (rec->opts.full_auxtrace) {
2375 			pr_err("Parallel trace streaming is not available in AUX area tracing mode.\n");
2376 			return -1;
2377 		}
2378 	}
2379 
2380 	fd = perf_data__fd(data);
2381 	rec->session = session;
2382 
2383 	if (zstd_init(&session->zstd_data, rec->opts.comp_level) < 0) {
2384 		pr_err("Compression initialization failed.\n");
2385 		return -1;
2386 	}
2387 #ifdef HAVE_EVENTFD_SUPPORT
2388 	done_fd = eventfd(0, EFD_NONBLOCK);
2389 	if (done_fd < 0) {
2390 		pr_err("Failed to create wakeup eventfd, error: %m\n");
2391 		status = -1;
2392 		goto out_delete_session;
2393 	}
2394 	err = evlist__add_wakeup_eventfd(rec->evlist, done_fd);
2395 	if (err < 0) {
2396 		pr_err("Failed to add wakeup eventfd to poll list\n");
2397 		status = err;
2398 		goto out_delete_session;
2399 	}
2400 #endif // HAVE_EVENTFD_SUPPORT
2401 
2402 	session->header.env.comp_type  = PERF_COMP_ZSTD;
2403 	session->header.env.comp_level = rec->opts.comp_level;
2404 
2405 	if (rec->opts.kcore &&
2406 	    !record__kcore_readable(&session->machines.host)) {
2407 		pr_err("ERROR: kcore is not readable.\n");
2408 		return -1;
2409 	}
2410 
2411 	if (record__init_clock(rec))
2412 		return -1;
2413 
2414 	record__init_features(rec);
2415 
2416 	if (forks) {
2417 		err = evlist__prepare_workload(rec->evlist, &opts->target, argv, data->is_pipe,
2418 					       workload_exec_failed_signal);
2419 		if (err < 0) {
2420 			pr_err("Couldn't run the workload!\n");
2421 			status = err;
2422 			goto out_delete_session;
2423 		}
2424 	}
2425 
2426 	/*
2427 	 * If we have just single event and are sending data
2428 	 * through pipe, we need to force the ids allocation,
2429 	 * because we synthesize event name through the pipe
2430 	 * and need the id for that.
2431 	 */
2432 	if (data->is_pipe && rec->evlist->core.nr_entries == 1)
2433 		rec->opts.sample_id = true;
2434 
2435 	record__uniquify_name(rec);
2436 
2437 	/* Debug message used by test scripts */
2438 	pr_debug3("perf record opening and mmapping events\n");
2439 	if (record__open(rec) != 0) {
2440 		err = -1;
2441 		goto out_free_threads;
2442 	}
2443 	/* Debug message used by test scripts */
2444 	pr_debug3("perf record done opening and mmapping events\n");
2445 	session->header.env.comp_mmap_len = session->evlist->core.mmap_len;
2446 
2447 	if (rec->opts.kcore) {
2448 		err = record__kcore_copy(&session->machines.host, data);
2449 		if (err) {
2450 			pr_err("ERROR: Failed to copy kcore\n");
2451 			goto out_free_threads;
2452 		}
2453 	}
2454 
2455 	err = bpf__apply_obj_config();
2456 	if (err) {
2457 		char errbuf[BUFSIZ];
2458 
2459 		bpf__strerror_apply_obj_config(err, errbuf, sizeof(errbuf));
2460 		pr_err("ERROR: Apply config to BPF failed: %s\n",
2461 			 errbuf);
2462 		goto out_free_threads;
2463 	}
2464 
2465 	/*
2466 	 * Normally perf_session__new would do this, but it doesn't have the
2467 	 * evlist.
2468 	 */
2469 	if (rec->tool.ordered_events && !evlist__sample_id_all(rec->evlist)) {
2470 		pr_warning("WARNING: No sample_id_all support, falling back to unordered processing\n");
2471 		rec->tool.ordered_events = false;
2472 	}
2473 
2474 	if (!rec->evlist->core.nr_groups)
2475 		perf_header__clear_feat(&session->header, HEADER_GROUP_DESC);
2476 
2477 	if (data->is_pipe) {
2478 		err = perf_header__write_pipe(fd);
2479 		if (err < 0)
2480 			goto out_free_threads;
2481 	} else {
2482 		err = perf_session__write_header(session, rec->evlist, fd, false);
2483 		if (err < 0)
2484 			goto out_free_threads;
2485 	}
2486 
2487 	err = -1;
2488 	if (!rec->no_buildid
2489 	    && !perf_header__has_feat(&session->header, HEADER_BUILD_ID)) {
2490 		pr_err("Couldn't generate buildids. "
2491 		       "Use --no-buildid to profile anyway.\n");
2492 		goto out_free_threads;
2493 	}
2494 
2495 	err = record__setup_sb_evlist(rec);
2496 	if (err)
2497 		goto out_free_threads;
2498 
2499 	err = record__synthesize(rec, false);
2500 	if (err < 0)
2501 		goto out_free_threads;
2502 
2503 	if (rec->realtime_prio) {
2504 		struct sched_param param;
2505 
2506 		param.sched_priority = rec->realtime_prio;
2507 		if (sched_setscheduler(0, SCHED_FIFO, &param)) {
2508 			pr_err("Could not set realtime priority.\n");
2509 			err = -1;
2510 			goto out_free_threads;
2511 		}
2512 	}
2513 
2514 	if (record__start_threads(rec))
2515 		goto out_free_threads;
2516 
2517 	/*
2518 	 * When perf is starting the traced process, all the events
2519 	 * (apart from group members) have enable_on_exec=1 set,
2520 	 * so don't spoil it by prematurely enabling them.
2521 	 */
2522 	if (!target__none(&opts->target) && !opts->initial_delay)
2523 		evlist__enable(rec->evlist);
2524 
2525 	/*
2526 	 * Let the child rip
2527 	 */
2528 	if (forks) {
2529 		struct machine *machine = &session->machines.host;
2530 		union perf_event *event;
2531 		pid_t tgid;
2532 
2533 		event = malloc(sizeof(event->comm) + machine->id_hdr_size);
2534 		if (event == NULL) {
2535 			err = -ENOMEM;
2536 			goto out_child;
2537 		}
2538 
2539 		/*
2540 		 * Some H/W events are generated before COMM event
2541 		 * which is emitted during exec(), so perf script
2542 		 * cannot see a correct process name for those events.
2543 		 * Synthesize COMM event to prevent it.
2544 		 */
2545 		tgid = perf_event__synthesize_comm(tool, event,
2546 						   rec->evlist->workload.pid,
2547 						   process_synthesized_event,
2548 						   machine);
2549 		free(event);
2550 
2551 		if (tgid == -1)
2552 			goto out_child;
2553 
2554 		event = malloc(sizeof(event->namespaces) +
2555 			       (NR_NAMESPACES * sizeof(struct perf_ns_link_info)) +
2556 			       machine->id_hdr_size);
2557 		if (event == NULL) {
2558 			err = -ENOMEM;
2559 			goto out_child;
2560 		}
2561 
2562 		/*
2563 		 * Synthesize NAMESPACES event for the command specified.
2564 		 */
2565 		perf_event__synthesize_namespaces(tool, event,
2566 						  rec->evlist->workload.pid,
2567 						  tgid, process_synthesized_event,
2568 						  machine);
2569 		free(event);
2570 
2571 		evlist__start_workload(rec->evlist);
2572 	}
2573 
2574 	if (opts->initial_delay) {
2575 		pr_info(EVLIST_DISABLED_MSG);
2576 		if (opts->initial_delay > 0) {
2577 			usleep(opts->initial_delay * USEC_PER_MSEC);
2578 			evlist__enable(rec->evlist);
2579 			pr_info(EVLIST_ENABLED_MSG);
2580 		}
2581 	}
2582 
2583 	err = event_enable_timer__start(rec->evlist->eet);
2584 	if (err)
2585 		goto out_child;
2586 
2587 	/* Debug message used by test scripts */
2588 	pr_debug3("perf record has started\n");
2589 	fflush(stderr);
2590 
2591 	trigger_ready(&auxtrace_snapshot_trigger);
2592 	trigger_ready(&switch_output_trigger);
2593 	perf_hooks__invoke_record_start();
2594 
2595 	/*
2596 	 * Must write FINISHED_INIT so it will be seen after all other
2597 	 * synthesized user events, but before any regular events.
2598 	 */
2599 	err = write_finished_init(rec, false);
2600 	if (err < 0)
2601 		goto out_child;
2602 
2603 	for (;;) {
2604 		unsigned long long hits = thread->samples;
2605 
2606 		/*
2607 		 * rec->evlist->bkw_mmap_state is possible to be
2608 		 * BKW_MMAP_EMPTY here: when done == true and
2609 		 * hits != rec->samples in previous round.
2610 		 *
2611 		 * evlist__toggle_bkw_mmap ensure we never
2612 		 * convert BKW_MMAP_EMPTY to BKW_MMAP_DATA_PENDING.
2613 		 */
2614 		if (trigger_is_hit(&switch_output_trigger) || done || draining)
2615 			evlist__toggle_bkw_mmap(rec->evlist, BKW_MMAP_DATA_PENDING);
2616 
2617 		if (record__mmap_read_all(rec, false) < 0) {
2618 			trigger_error(&auxtrace_snapshot_trigger);
2619 			trigger_error(&switch_output_trigger);
2620 			err = -1;
2621 			goto out_child;
2622 		}
2623 
2624 		if (auxtrace_record__snapshot_started) {
2625 			auxtrace_record__snapshot_started = 0;
2626 			if (!trigger_is_error(&auxtrace_snapshot_trigger))
2627 				record__read_auxtrace_snapshot(rec, false);
2628 			if (trigger_is_error(&auxtrace_snapshot_trigger)) {
2629 				pr_err("AUX area tracing snapshot failed\n");
2630 				err = -1;
2631 				goto out_child;
2632 			}
2633 		}
2634 
2635 		if (trigger_is_hit(&switch_output_trigger)) {
2636 			/*
2637 			 * If switch_output_trigger is hit, the data in
2638 			 * overwritable ring buffer should have been collected,
2639 			 * so bkw_mmap_state should be set to BKW_MMAP_EMPTY.
2640 			 *
2641 			 * If SIGUSR2 raise after or during record__mmap_read_all(),
2642 			 * record__mmap_read_all() didn't collect data from
2643 			 * overwritable ring buffer. Read again.
2644 			 */
2645 			if (rec->evlist->bkw_mmap_state == BKW_MMAP_RUNNING)
2646 				continue;
2647 			trigger_ready(&switch_output_trigger);
2648 
2649 			/*
2650 			 * Reenable events in overwrite ring buffer after
2651 			 * record__mmap_read_all(): we should have collected
2652 			 * data from it.
2653 			 */
2654 			evlist__toggle_bkw_mmap(rec->evlist, BKW_MMAP_RUNNING);
2655 
2656 			if (!quiet)
2657 				fprintf(stderr, "[ perf record: dump data: Woken up %ld times ]\n",
2658 					record__waking(rec));
2659 			thread->waking = 0;
2660 			fd = record__switch_output(rec, false);
2661 			if (fd < 0) {
2662 				pr_err("Failed to switch to new file\n");
2663 				trigger_error(&switch_output_trigger);
2664 				err = fd;
2665 				goto out_child;
2666 			}
2667 
2668 			/* re-arm the alarm */
2669 			if (rec->switch_output.time)
2670 				alarm(rec->switch_output.time);
2671 		}
2672 
2673 		if (hits == thread->samples) {
2674 			if (done || draining)
2675 				break;
2676 			err = fdarray__poll(&thread->pollfd, -1);
2677 			/*
2678 			 * Propagate error, only if there's any. Ignore positive
2679 			 * number of returned events and interrupt error.
2680 			 */
2681 			if (err > 0 || (err < 0 && errno == EINTR))
2682 				err = 0;
2683 			thread->waking++;
2684 
2685 			if (fdarray__filter(&thread->pollfd, POLLERR | POLLHUP,
2686 					    record__thread_munmap_filtered, NULL) == 0)
2687 				draining = true;
2688 
2689 			err = record__update_evlist_pollfd_from_thread(rec, rec->evlist, thread);
2690 			if (err)
2691 				goto out_child;
2692 		}
2693 
2694 		if (evlist__ctlfd_process(rec->evlist, &cmd) > 0) {
2695 			switch (cmd) {
2696 			case EVLIST_CTL_CMD_SNAPSHOT:
2697 				hit_auxtrace_snapshot_trigger(rec);
2698 				evlist__ctlfd_ack(rec->evlist);
2699 				break;
2700 			case EVLIST_CTL_CMD_STOP:
2701 				done = 1;
2702 				break;
2703 			case EVLIST_CTL_CMD_ACK:
2704 			case EVLIST_CTL_CMD_UNSUPPORTED:
2705 			case EVLIST_CTL_CMD_ENABLE:
2706 			case EVLIST_CTL_CMD_DISABLE:
2707 			case EVLIST_CTL_CMD_EVLIST:
2708 			case EVLIST_CTL_CMD_PING:
2709 			default:
2710 				break;
2711 			}
2712 		}
2713 
2714 		err = event_enable_timer__process(rec->evlist->eet);
2715 		if (err < 0)
2716 			goto out_child;
2717 		if (err) {
2718 			err = 0;
2719 			done = 1;
2720 		}
2721 
2722 		/*
2723 		 * When perf is starting the traced process, at the end events
2724 		 * die with the process and we wait for that. Thus no need to
2725 		 * disable events in this case.
2726 		 */
2727 		if (done && !disabled && !target__none(&opts->target)) {
2728 			trigger_off(&auxtrace_snapshot_trigger);
2729 			evlist__disable(rec->evlist);
2730 			disabled = true;
2731 		}
2732 	}
2733 
2734 	trigger_off(&auxtrace_snapshot_trigger);
2735 	trigger_off(&switch_output_trigger);
2736 
2737 	if (opts->auxtrace_snapshot_on_exit)
2738 		record__auxtrace_snapshot_exit(rec);
2739 
2740 	if (forks && workload_exec_errno) {
2741 		char msg[STRERR_BUFSIZE], strevsels[2048];
2742 		const char *emsg = str_error_r(workload_exec_errno, msg, sizeof(msg));
2743 
2744 		evlist__scnprintf_evsels(rec->evlist, sizeof(strevsels), strevsels);
2745 
2746 		pr_err("Failed to collect '%s' for the '%s' workload: %s\n",
2747 			strevsels, argv[0], emsg);
2748 		err = -1;
2749 		goto out_child;
2750 	}
2751 
2752 	if (!quiet)
2753 		fprintf(stderr, "[ perf record: Woken up %ld times to write data ]\n",
2754 			record__waking(rec));
2755 
2756 	write_finished_init(rec, true);
2757 
2758 	if (target__none(&rec->opts.target))
2759 		record__synthesize_workload(rec, true);
2760 
2761 out_child:
2762 	record__stop_threads(rec);
2763 	record__mmap_read_all(rec, true);
2764 out_free_threads:
2765 	record__free_thread_data(rec);
2766 	evlist__finalize_ctlfd(rec->evlist);
2767 	record__aio_mmap_read_sync(rec);
2768 
2769 	if (rec->session->bytes_transferred && rec->session->bytes_compressed) {
2770 		ratio = (float)rec->session->bytes_transferred/(float)rec->session->bytes_compressed;
2771 		session->header.env.comp_ratio = ratio + 0.5;
2772 	}
2773 
2774 	if (forks) {
2775 		int exit_status;
2776 
2777 		if (!child_finished)
2778 			kill(rec->evlist->workload.pid, SIGTERM);
2779 
2780 		wait(&exit_status);
2781 
2782 		if (err < 0)
2783 			status = err;
2784 		else if (WIFEXITED(exit_status))
2785 			status = WEXITSTATUS(exit_status);
2786 		else if (WIFSIGNALED(exit_status))
2787 			signr = WTERMSIG(exit_status);
2788 	} else
2789 		status = err;
2790 
2791 	if (rec->off_cpu)
2792 		rec->bytes_written += off_cpu_write(rec->session);
2793 
2794 	record__read_lost_samples(rec);
2795 	record__synthesize(rec, true);
2796 	/* this will be recalculated during process_buildids() */
2797 	rec->samples = 0;
2798 
2799 	if (!err) {
2800 		if (!rec->timestamp_filename) {
2801 			record__finish_output(rec);
2802 		} else {
2803 			fd = record__switch_output(rec, true);
2804 			if (fd < 0) {
2805 				status = fd;
2806 				goto out_delete_session;
2807 			}
2808 		}
2809 	}
2810 
2811 	perf_hooks__invoke_record_end();
2812 
2813 	if (!err && !quiet) {
2814 		char samples[128];
2815 		const char *postfix = rec->timestamp_filename ?
2816 					".<timestamp>" : "";
2817 
2818 		if (rec->samples && !rec->opts.full_auxtrace)
2819 			scnprintf(samples, sizeof(samples),
2820 				  " (%" PRIu64 " samples)", rec->samples);
2821 		else
2822 			samples[0] = '\0';
2823 
2824 		fprintf(stderr,	"[ perf record: Captured and wrote %.3f MB %s%s%s",
2825 			perf_data__size(data) / 1024.0 / 1024.0,
2826 			data->path, postfix, samples);
2827 		if (ratio) {
2828 			fprintf(stderr,	", compressed (original %.3f MB, ratio is %.3f)",
2829 					rec->session->bytes_transferred / 1024.0 / 1024.0,
2830 					ratio);
2831 		}
2832 		fprintf(stderr, " ]\n");
2833 	}
2834 
2835 out_delete_session:
2836 #ifdef HAVE_EVENTFD_SUPPORT
2837 	if (done_fd >= 0)
2838 		close(done_fd);
2839 #endif
2840 	zstd_fini(&session->zstd_data);
2841 	perf_session__delete(session);
2842 
2843 	if (!opts->no_bpf_event)
2844 		evlist__stop_sb_thread(rec->sb_evlist);
2845 	return status;
2846 }
2847 
2848 static void callchain_debug(struct callchain_param *callchain)
2849 {
2850 	static const char *str[CALLCHAIN_MAX] = { "NONE", "FP", "DWARF", "LBR" };
2851 
2852 	pr_debug("callchain: type %s\n", str[callchain->record_mode]);
2853 
2854 	if (callchain->record_mode == CALLCHAIN_DWARF)
2855 		pr_debug("callchain: stack dump size %d\n",
2856 			 callchain->dump_size);
2857 }
2858 
2859 int record_opts__parse_callchain(struct record_opts *record,
2860 				 struct callchain_param *callchain,
2861 				 const char *arg, bool unset)
2862 {
2863 	int ret;
2864 	callchain->enabled = !unset;
2865 
2866 	/* --no-call-graph */
2867 	if (unset) {
2868 		callchain->record_mode = CALLCHAIN_NONE;
2869 		pr_debug("callchain: disabled\n");
2870 		return 0;
2871 	}
2872 
2873 	ret = parse_callchain_record_opt(arg, callchain);
2874 	if (!ret) {
2875 		/* Enable data address sampling for DWARF unwind. */
2876 		if (callchain->record_mode == CALLCHAIN_DWARF)
2877 			record->sample_address = true;
2878 		callchain_debug(callchain);
2879 	}
2880 
2881 	return ret;
2882 }
2883 
2884 int record_parse_callchain_opt(const struct option *opt,
2885 			       const char *arg,
2886 			       int unset)
2887 {
2888 	return record_opts__parse_callchain(opt->value, &callchain_param, arg, unset);
2889 }
2890 
2891 int record_callchain_opt(const struct option *opt,
2892 			 const char *arg __maybe_unused,
2893 			 int unset __maybe_unused)
2894 {
2895 	struct callchain_param *callchain = opt->value;
2896 
2897 	callchain->enabled = true;
2898 
2899 	if (callchain->record_mode == CALLCHAIN_NONE)
2900 		callchain->record_mode = CALLCHAIN_FP;
2901 
2902 	callchain_debug(callchain);
2903 	return 0;
2904 }
2905 
2906 static int perf_record_config(const char *var, const char *value, void *cb)
2907 {
2908 	struct record *rec = cb;
2909 
2910 	if (!strcmp(var, "record.build-id")) {
2911 		if (!strcmp(value, "cache"))
2912 			rec->no_buildid_cache = false;
2913 		else if (!strcmp(value, "no-cache"))
2914 			rec->no_buildid_cache = true;
2915 		else if (!strcmp(value, "skip"))
2916 			rec->no_buildid = true;
2917 		else if (!strcmp(value, "mmap"))
2918 			rec->buildid_mmap = true;
2919 		else
2920 			return -1;
2921 		return 0;
2922 	}
2923 	if (!strcmp(var, "record.call-graph")) {
2924 		var = "call-graph.record-mode";
2925 		return perf_default_config(var, value, cb);
2926 	}
2927 #ifdef HAVE_AIO_SUPPORT
2928 	if (!strcmp(var, "record.aio")) {
2929 		rec->opts.nr_cblocks = strtol(value, NULL, 0);
2930 		if (!rec->opts.nr_cblocks)
2931 			rec->opts.nr_cblocks = nr_cblocks_default;
2932 	}
2933 #endif
2934 	if (!strcmp(var, "record.debuginfod")) {
2935 		rec->debuginfod.urls = strdup(value);
2936 		if (!rec->debuginfod.urls)
2937 			return -ENOMEM;
2938 		rec->debuginfod.set = true;
2939 	}
2940 
2941 	return 0;
2942 }
2943 
2944 static int record__parse_event_enable_time(const struct option *opt, const char *str, int unset)
2945 {
2946 	struct record *rec = (struct record *)opt->value;
2947 
2948 	return evlist__parse_event_enable_time(rec->evlist, &rec->opts, str, unset);
2949 }
2950 
2951 static int record__parse_affinity(const struct option *opt, const char *str, int unset)
2952 {
2953 	struct record_opts *opts = (struct record_opts *)opt->value;
2954 
2955 	if (unset || !str)
2956 		return 0;
2957 
2958 	if (!strcasecmp(str, "node"))
2959 		opts->affinity = PERF_AFFINITY_NODE;
2960 	else if (!strcasecmp(str, "cpu"))
2961 		opts->affinity = PERF_AFFINITY_CPU;
2962 
2963 	return 0;
2964 }
2965 
2966 static int record__mmap_cpu_mask_alloc(struct mmap_cpu_mask *mask, int nr_bits)
2967 {
2968 	mask->nbits = nr_bits;
2969 	mask->bits = bitmap_zalloc(mask->nbits);
2970 	if (!mask->bits)
2971 		return -ENOMEM;
2972 
2973 	return 0;
2974 }
2975 
2976 static void record__mmap_cpu_mask_free(struct mmap_cpu_mask *mask)
2977 {
2978 	bitmap_free(mask->bits);
2979 	mask->nbits = 0;
2980 }
2981 
2982 static int record__thread_mask_alloc(struct thread_mask *mask, int nr_bits)
2983 {
2984 	int ret;
2985 
2986 	ret = record__mmap_cpu_mask_alloc(&mask->maps, nr_bits);
2987 	if (ret) {
2988 		mask->affinity.bits = NULL;
2989 		return ret;
2990 	}
2991 
2992 	ret = record__mmap_cpu_mask_alloc(&mask->affinity, nr_bits);
2993 	if (ret) {
2994 		record__mmap_cpu_mask_free(&mask->maps);
2995 		mask->maps.bits = NULL;
2996 	}
2997 
2998 	return ret;
2999 }
3000 
3001 static void record__thread_mask_free(struct thread_mask *mask)
3002 {
3003 	record__mmap_cpu_mask_free(&mask->maps);
3004 	record__mmap_cpu_mask_free(&mask->affinity);
3005 }
3006 
3007 static int record__parse_threads(const struct option *opt, const char *str, int unset)
3008 {
3009 	int s;
3010 	struct record_opts *opts = opt->value;
3011 
3012 	if (unset || !str || !strlen(str)) {
3013 		opts->threads_spec = THREAD_SPEC__CPU;
3014 	} else {
3015 		for (s = 1; s < THREAD_SPEC__MAX; s++) {
3016 			if (s == THREAD_SPEC__USER) {
3017 				opts->threads_user_spec = strdup(str);
3018 				if (!opts->threads_user_spec)
3019 					return -ENOMEM;
3020 				opts->threads_spec = THREAD_SPEC__USER;
3021 				break;
3022 			}
3023 			if (!strncasecmp(str, thread_spec_tags[s], strlen(thread_spec_tags[s]))) {
3024 				opts->threads_spec = s;
3025 				break;
3026 			}
3027 		}
3028 	}
3029 
3030 	if (opts->threads_spec == THREAD_SPEC__USER)
3031 		pr_debug("threads_spec: %s\n", opts->threads_user_spec);
3032 	else
3033 		pr_debug("threads_spec: %s\n", thread_spec_tags[opts->threads_spec]);
3034 
3035 	return 0;
3036 }
3037 
3038 static int parse_output_max_size(const struct option *opt,
3039 				 const char *str, int unset)
3040 {
3041 	unsigned long *s = (unsigned long *)opt->value;
3042 	static struct parse_tag tags_size[] = {
3043 		{ .tag  = 'B', .mult = 1       },
3044 		{ .tag  = 'K', .mult = 1 << 10 },
3045 		{ .tag  = 'M', .mult = 1 << 20 },
3046 		{ .tag  = 'G', .mult = 1 << 30 },
3047 		{ .tag  = 0 },
3048 	};
3049 	unsigned long val;
3050 
3051 	if (unset) {
3052 		*s = 0;
3053 		return 0;
3054 	}
3055 
3056 	val = parse_tag_value(str, tags_size);
3057 	if (val != (unsigned long) -1) {
3058 		*s = val;
3059 		return 0;
3060 	}
3061 
3062 	return -1;
3063 }
3064 
3065 static int record__parse_mmap_pages(const struct option *opt,
3066 				    const char *str,
3067 				    int unset __maybe_unused)
3068 {
3069 	struct record_opts *opts = opt->value;
3070 	char *s, *p;
3071 	unsigned int mmap_pages;
3072 	int ret;
3073 
3074 	if (!str)
3075 		return -EINVAL;
3076 
3077 	s = strdup(str);
3078 	if (!s)
3079 		return -ENOMEM;
3080 
3081 	p = strchr(s, ',');
3082 	if (p)
3083 		*p = '\0';
3084 
3085 	if (*s) {
3086 		ret = __evlist__parse_mmap_pages(&mmap_pages, s);
3087 		if (ret)
3088 			goto out_free;
3089 		opts->mmap_pages = mmap_pages;
3090 	}
3091 
3092 	if (!p) {
3093 		ret = 0;
3094 		goto out_free;
3095 	}
3096 
3097 	ret = __evlist__parse_mmap_pages(&mmap_pages, p + 1);
3098 	if (ret)
3099 		goto out_free;
3100 
3101 	opts->auxtrace_mmap_pages = mmap_pages;
3102 
3103 out_free:
3104 	free(s);
3105 	return ret;
3106 }
3107 
3108 void __weak arch__add_leaf_frame_record_opts(struct record_opts *opts __maybe_unused)
3109 {
3110 }
3111 
3112 static int parse_control_option(const struct option *opt,
3113 				const char *str,
3114 				int unset __maybe_unused)
3115 {
3116 	struct record_opts *opts = opt->value;
3117 
3118 	return evlist__parse_control(str, &opts->ctl_fd, &opts->ctl_fd_ack, &opts->ctl_fd_close);
3119 }
3120 
3121 static void switch_output_size_warn(struct record *rec)
3122 {
3123 	u64 wakeup_size = evlist__mmap_size(rec->opts.mmap_pages);
3124 	struct switch_output *s = &rec->switch_output;
3125 
3126 	wakeup_size /= 2;
3127 
3128 	if (s->size < wakeup_size) {
3129 		char buf[100];
3130 
3131 		unit_number__scnprintf(buf, sizeof(buf), wakeup_size);
3132 		pr_warning("WARNING: switch-output data size lower than "
3133 			   "wakeup kernel buffer size (%s) "
3134 			   "expect bigger perf.data sizes\n", buf);
3135 	}
3136 }
3137 
3138 static int switch_output_setup(struct record *rec)
3139 {
3140 	struct switch_output *s = &rec->switch_output;
3141 	static struct parse_tag tags_size[] = {
3142 		{ .tag  = 'B', .mult = 1       },
3143 		{ .tag  = 'K', .mult = 1 << 10 },
3144 		{ .tag  = 'M', .mult = 1 << 20 },
3145 		{ .tag  = 'G', .mult = 1 << 30 },
3146 		{ .tag  = 0 },
3147 	};
3148 	static struct parse_tag tags_time[] = {
3149 		{ .tag  = 's', .mult = 1        },
3150 		{ .tag  = 'm', .mult = 60       },
3151 		{ .tag  = 'h', .mult = 60*60    },
3152 		{ .tag  = 'd', .mult = 60*60*24 },
3153 		{ .tag  = 0 },
3154 	};
3155 	unsigned long val;
3156 
3157 	/*
3158 	 * If we're using --switch-output-events, then we imply its
3159 	 * --switch-output=signal, as we'll send a SIGUSR2 from the side band
3160 	 *  thread to its parent.
3161 	 */
3162 	if (rec->switch_output_event_set) {
3163 		if (record__threads_enabled(rec)) {
3164 			pr_warning("WARNING: --switch-output-event option is not available in parallel streaming mode.\n");
3165 			return 0;
3166 		}
3167 		goto do_signal;
3168 	}
3169 
3170 	if (!s->set)
3171 		return 0;
3172 
3173 	if (record__threads_enabled(rec)) {
3174 		pr_warning("WARNING: --switch-output option is not available in parallel streaming mode.\n");
3175 		return 0;
3176 	}
3177 
3178 	if (!strcmp(s->str, "signal")) {
3179 do_signal:
3180 		s->signal = true;
3181 		pr_debug("switch-output with SIGUSR2 signal\n");
3182 		goto enabled;
3183 	}
3184 
3185 	val = parse_tag_value(s->str, tags_size);
3186 	if (val != (unsigned long) -1) {
3187 		s->size = val;
3188 		pr_debug("switch-output with %s size threshold\n", s->str);
3189 		goto enabled;
3190 	}
3191 
3192 	val = parse_tag_value(s->str, tags_time);
3193 	if (val != (unsigned long) -1) {
3194 		s->time = val;
3195 		pr_debug("switch-output with %s time threshold (%lu seconds)\n",
3196 			 s->str, s->time);
3197 		goto enabled;
3198 	}
3199 
3200 	return -1;
3201 
3202 enabled:
3203 	rec->timestamp_filename = true;
3204 	s->enabled              = true;
3205 
3206 	if (s->size && !rec->opts.no_buffering)
3207 		switch_output_size_warn(rec);
3208 
3209 	return 0;
3210 }
3211 
3212 static const char * const __record_usage[] = {
3213 	"perf record [<options>] [<command>]",
3214 	"perf record [<options>] -- <command> [<options>]",
3215 	NULL
3216 };
3217 const char * const *record_usage = __record_usage;
3218 
3219 static int build_id__process_mmap(struct perf_tool *tool, union perf_event *event,
3220 				  struct perf_sample *sample, struct machine *machine)
3221 {
3222 	/*
3223 	 * We already have the kernel maps, put in place via perf_session__create_kernel_maps()
3224 	 * no need to add them twice.
3225 	 */
3226 	if (!(event->header.misc & PERF_RECORD_MISC_USER))
3227 		return 0;
3228 	return perf_event__process_mmap(tool, event, sample, machine);
3229 }
3230 
3231 static int build_id__process_mmap2(struct perf_tool *tool, union perf_event *event,
3232 				   struct perf_sample *sample, struct machine *machine)
3233 {
3234 	/*
3235 	 * We already have the kernel maps, put in place via perf_session__create_kernel_maps()
3236 	 * no need to add them twice.
3237 	 */
3238 	if (!(event->header.misc & PERF_RECORD_MISC_USER))
3239 		return 0;
3240 
3241 	return perf_event__process_mmap2(tool, event, sample, machine);
3242 }
3243 
3244 static int process_timestamp_boundary(struct perf_tool *tool,
3245 				      union perf_event *event __maybe_unused,
3246 				      struct perf_sample *sample,
3247 				      struct machine *machine __maybe_unused)
3248 {
3249 	struct record *rec = container_of(tool, struct record, tool);
3250 
3251 	set_timestamp_boundary(rec, sample->time);
3252 	return 0;
3253 }
3254 
3255 static int parse_record_synth_option(const struct option *opt,
3256 				     const char *str,
3257 				     int unset __maybe_unused)
3258 {
3259 	struct record_opts *opts = opt->value;
3260 	char *p = strdup(str);
3261 
3262 	if (p == NULL)
3263 		return -1;
3264 
3265 	opts->synth = parse_synth_opt(p);
3266 	free(p);
3267 
3268 	if (opts->synth < 0) {
3269 		pr_err("Invalid synth option: %s\n", str);
3270 		return -1;
3271 	}
3272 	return 0;
3273 }
3274 
3275 /*
3276  * XXX Ideally would be local to cmd_record() and passed to a record__new
3277  * because we need to have access to it in record__exit, that is called
3278  * after cmd_record() exits, but since record_options need to be accessible to
3279  * builtin-script, leave it here.
3280  *
3281  * At least we don't ouch it in all the other functions here directly.
3282  *
3283  * Just say no to tons of global variables, sigh.
3284  */
3285 static struct record record = {
3286 	.opts = {
3287 		.sample_time	     = true,
3288 		.mmap_pages	     = UINT_MAX,
3289 		.user_freq	     = UINT_MAX,
3290 		.user_interval	     = ULLONG_MAX,
3291 		.freq		     = 4000,
3292 		.target		     = {
3293 			.uses_mmap   = true,
3294 			.default_per_cpu = true,
3295 		},
3296 		.mmap_flush          = MMAP_FLUSH_DEFAULT,
3297 		.nr_threads_synthesize = 1,
3298 		.ctl_fd              = -1,
3299 		.ctl_fd_ack          = -1,
3300 		.synth               = PERF_SYNTH_ALL,
3301 	},
3302 	.tool = {
3303 		.sample		= process_sample_event,
3304 		.fork		= perf_event__process_fork,
3305 		.exit		= perf_event__process_exit,
3306 		.comm		= perf_event__process_comm,
3307 		.namespaces	= perf_event__process_namespaces,
3308 		.mmap		= build_id__process_mmap,
3309 		.mmap2		= build_id__process_mmap2,
3310 		.itrace_start	= process_timestamp_boundary,
3311 		.aux		= process_timestamp_boundary,
3312 		.ordered_events	= true,
3313 	},
3314 };
3315 
3316 const char record_callchain_help[] = CALLCHAIN_RECORD_HELP
3317 	"\n\t\t\t\tDefault: fp";
3318 
3319 static bool dry_run;
3320 
3321 /*
3322  * XXX Will stay a global variable till we fix builtin-script.c to stop messing
3323  * with it and switch to use the library functions in perf_evlist that came
3324  * from builtin-record.c, i.e. use record_opts,
3325  * evlist__prepare_workload, etc instead of fork+exec'in 'perf record',
3326  * using pipes, etc.
3327  */
3328 static struct option __record_options[] = {
3329 	OPT_CALLBACK('e', "event", &record.evlist, "event",
3330 		     "event selector. use 'perf list' to list available events",
3331 		     parse_events_option),
3332 	OPT_CALLBACK(0, "filter", &record.evlist, "filter",
3333 		     "event filter", parse_filter),
3334 	OPT_CALLBACK_NOOPT(0, "exclude-perf", &record.evlist,
3335 			   NULL, "don't record events from perf itself",
3336 			   exclude_perf),
3337 	OPT_STRING('p', "pid", &record.opts.target.pid, "pid",
3338 		    "record events on existing process id"),
3339 	OPT_STRING('t', "tid", &record.opts.target.tid, "tid",
3340 		    "record events on existing thread id"),
3341 	OPT_INTEGER('r', "realtime", &record.realtime_prio,
3342 		    "collect data with this RT SCHED_FIFO priority"),
3343 	OPT_BOOLEAN(0, "no-buffering", &record.opts.no_buffering,
3344 		    "collect data without buffering"),
3345 	OPT_BOOLEAN('R', "raw-samples", &record.opts.raw_samples,
3346 		    "collect raw sample records from all opened counters"),
3347 	OPT_BOOLEAN('a', "all-cpus", &record.opts.target.system_wide,
3348 			    "system-wide collection from all CPUs"),
3349 	OPT_STRING('C', "cpu", &record.opts.target.cpu_list, "cpu",
3350 		    "list of cpus to monitor"),
3351 	OPT_U64('c', "count", &record.opts.user_interval, "event period to sample"),
3352 	OPT_STRING('o', "output", &record.data.path, "file",
3353 		    "output file name"),
3354 	OPT_BOOLEAN_SET('i', "no-inherit", &record.opts.no_inherit,
3355 			&record.opts.no_inherit_set,
3356 			"child tasks do not inherit counters"),
3357 	OPT_BOOLEAN(0, "tail-synthesize", &record.opts.tail_synthesize,
3358 		    "synthesize non-sample events at the end of output"),
3359 	OPT_BOOLEAN(0, "overwrite", &record.opts.overwrite, "use overwrite mode"),
3360 	OPT_BOOLEAN(0, "no-bpf-event", &record.opts.no_bpf_event, "do not record bpf events"),
3361 	OPT_BOOLEAN(0, "strict-freq", &record.opts.strict_freq,
3362 		    "Fail if the specified frequency can't be used"),
3363 	OPT_CALLBACK('F', "freq", &record.opts, "freq or 'max'",
3364 		     "profile at this frequency",
3365 		      record__parse_freq),
3366 	OPT_CALLBACK('m', "mmap-pages", &record.opts, "pages[,pages]",
3367 		     "number of mmap data pages and AUX area tracing mmap pages",
3368 		     record__parse_mmap_pages),
3369 	OPT_CALLBACK(0, "mmap-flush", &record.opts, "number",
3370 		     "Minimal number of bytes that is extracted from mmap data pages (default: 1)",
3371 		     record__mmap_flush_parse),
3372 	OPT_BOOLEAN(0, "group", &record.opts.group,
3373 		    "put the counters into a counter group"),
3374 	OPT_CALLBACK_NOOPT('g', NULL, &callchain_param,
3375 			   NULL, "enables call-graph recording" ,
3376 			   &record_callchain_opt),
3377 	OPT_CALLBACK(0, "call-graph", &record.opts,
3378 		     "record_mode[,record_size]", record_callchain_help,
3379 		     &record_parse_callchain_opt),
3380 	OPT_INCR('v', "verbose", &verbose,
3381 		    "be more verbose (show counter open errors, etc)"),
3382 	OPT_BOOLEAN('q', "quiet", &quiet, "don't print any message"),
3383 	OPT_BOOLEAN('s', "stat", &record.opts.inherit_stat,
3384 		    "per thread counts"),
3385 	OPT_BOOLEAN('d', "data", &record.opts.sample_address, "Record the sample addresses"),
3386 	OPT_BOOLEAN(0, "phys-data", &record.opts.sample_phys_addr,
3387 		    "Record the sample physical addresses"),
3388 	OPT_BOOLEAN(0, "data-page-size", &record.opts.sample_data_page_size,
3389 		    "Record the sampled data address data page size"),
3390 	OPT_BOOLEAN(0, "code-page-size", &record.opts.sample_code_page_size,
3391 		    "Record the sampled code address (ip) page size"),
3392 	OPT_BOOLEAN(0, "sample-cpu", &record.opts.sample_cpu, "Record the sample cpu"),
3393 	OPT_BOOLEAN(0, "sample-identifier", &record.opts.sample_identifier,
3394 		    "Record the sample identifier"),
3395 	OPT_BOOLEAN_SET('T', "timestamp", &record.opts.sample_time,
3396 			&record.opts.sample_time_set,
3397 			"Record the sample timestamps"),
3398 	OPT_BOOLEAN_SET('P', "period", &record.opts.period, &record.opts.period_set,
3399 			"Record the sample period"),
3400 	OPT_BOOLEAN('n', "no-samples", &record.opts.no_samples,
3401 		    "don't sample"),
3402 	OPT_BOOLEAN_SET('N', "no-buildid-cache", &record.no_buildid_cache,
3403 			&record.no_buildid_cache_set,
3404 			"do not update the buildid cache"),
3405 	OPT_BOOLEAN_SET('B', "no-buildid", &record.no_buildid,
3406 			&record.no_buildid_set,
3407 			"do not collect buildids in perf.data"),
3408 	OPT_CALLBACK('G', "cgroup", &record.evlist, "name",
3409 		     "monitor event in cgroup name only",
3410 		     parse_cgroups),
3411 	OPT_CALLBACK('D', "delay", &record, "ms",
3412 		     "ms to wait before starting measurement after program start (-1: start with events disabled), "
3413 		     "or ranges of time to enable events e.g. '-D 10-20,30-40'",
3414 		     record__parse_event_enable_time),
3415 	OPT_BOOLEAN(0, "kcore", &record.opts.kcore, "copy /proc/kcore"),
3416 	OPT_STRING('u', "uid", &record.opts.target.uid_str, "user",
3417 		   "user to profile"),
3418 
3419 	OPT_CALLBACK_NOOPT('b', "branch-any", &record.opts.branch_stack,
3420 		     "branch any", "sample any taken branches",
3421 		     parse_branch_stack),
3422 
3423 	OPT_CALLBACK('j', "branch-filter", &record.opts.branch_stack,
3424 		     "branch filter mask", "branch stack filter modes",
3425 		     parse_branch_stack),
3426 	OPT_BOOLEAN('W', "weight", &record.opts.sample_weight,
3427 		    "sample by weight (on special events only)"),
3428 	OPT_BOOLEAN(0, "transaction", &record.opts.sample_transaction,
3429 		    "sample transaction flags (special events only)"),
3430 	OPT_BOOLEAN(0, "per-thread", &record.opts.target.per_thread,
3431 		    "use per-thread mmaps"),
3432 	OPT_CALLBACK_OPTARG('I', "intr-regs", &record.opts.sample_intr_regs, NULL, "any register",
3433 		    "sample selected machine registers on interrupt,"
3434 		    " use '-I?' to list register names", parse_intr_regs),
3435 	OPT_CALLBACK_OPTARG(0, "user-regs", &record.opts.sample_user_regs, NULL, "any register",
3436 		    "sample selected machine registers on interrupt,"
3437 		    " use '--user-regs=?' to list register names", parse_user_regs),
3438 	OPT_BOOLEAN(0, "running-time", &record.opts.running_time,
3439 		    "Record running/enabled time of read (:S) events"),
3440 	OPT_CALLBACK('k', "clockid", &record.opts,
3441 	"clockid", "clockid to use for events, see clock_gettime()",
3442 	parse_clockid),
3443 	OPT_STRING_OPTARG('S', "snapshot", &record.opts.auxtrace_snapshot_opts,
3444 			  "opts", "AUX area tracing Snapshot Mode", ""),
3445 	OPT_STRING_OPTARG(0, "aux-sample", &record.opts.auxtrace_sample_opts,
3446 			  "opts", "sample AUX area", ""),
3447 	OPT_UINTEGER(0, "proc-map-timeout", &proc_map_timeout,
3448 			"per thread proc mmap processing timeout in ms"),
3449 	OPT_BOOLEAN(0, "namespaces", &record.opts.record_namespaces,
3450 		    "Record namespaces events"),
3451 	OPT_BOOLEAN(0, "all-cgroups", &record.opts.record_cgroup,
3452 		    "Record cgroup events"),
3453 	OPT_BOOLEAN_SET(0, "switch-events", &record.opts.record_switch_events,
3454 			&record.opts.record_switch_events_set,
3455 			"Record context switch events"),
3456 	OPT_BOOLEAN_FLAG(0, "all-kernel", &record.opts.all_kernel,
3457 			 "Configure all used events to run in kernel space.",
3458 			 PARSE_OPT_EXCLUSIVE),
3459 	OPT_BOOLEAN_FLAG(0, "all-user", &record.opts.all_user,
3460 			 "Configure all used events to run in user space.",
3461 			 PARSE_OPT_EXCLUSIVE),
3462 	OPT_BOOLEAN(0, "kernel-callchains", &record.opts.kernel_callchains,
3463 		    "collect kernel callchains"),
3464 	OPT_BOOLEAN(0, "user-callchains", &record.opts.user_callchains,
3465 		    "collect user callchains"),
3466 	OPT_STRING(0, "clang-path", &llvm_param.clang_path, "clang path",
3467 		   "clang binary to use for compiling BPF scriptlets"),
3468 	OPT_STRING(0, "clang-opt", &llvm_param.clang_opt, "clang options",
3469 		   "options passed to clang when compiling BPF scriptlets"),
3470 	OPT_STRING(0, "vmlinux", &symbol_conf.vmlinux_name,
3471 		   "file", "vmlinux pathname"),
3472 	OPT_BOOLEAN(0, "buildid-all", &record.buildid_all,
3473 		    "Record build-id of all DSOs regardless of hits"),
3474 	OPT_BOOLEAN(0, "buildid-mmap", &record.buildid_mmap,
3475 		    "Record build-id in map events"),
3476 	OPT_BOOLEAN(0, "timestamp-filename", &record.timestamp_filename,
3477 		    "append timestamp to output filename"),
3478 	OPT_BOOLEAN(0, "timestamp-boundary", &record.timestamp_boundary,
3479 		    "Record timestamp boundary (time of first/last samples)"),
3480 	OPT_STRING_OPTARG_SET(0, "switch-output", &record.switch_output.str,
3481 			  &record.switch_output.set, "signal or size[BKMG] or time[smhd]",
3482 			  "Switch output when receiving SIGUSR2 (signal) or cross a size or time threshold",
3483 			  "signal"),
3484 	OPT_CALLBACK_SET(0, "switch-output-event", &record.sb_evlist, &record.switch_output_event_set, "switch output event",
3485 			 "switch output event selector. use 'perf list' to list available events",
3486 			 parse_events_option_new_evlist),
3487 	OPT_INTEGER(0, "switch-max-files", &record.switch_output.num_files,
3488 		   "Limit number of switch output generated files"),
3489 	OPT_BOOLEAN(0, "dry-run", &dry_run,
3490 		    "Parse options then exit"),
3491 #ifdef HAVE_AIO_SUPPORT
3492 	OPT_CALLBACK_OPTARG(0, "aio", &record.opts,
3493 		     &nr_cblocks_default, "n", "Use <n> control blocks in asynchronous trace writing mode (default: 1, max: 4)",
3494 		     record__aio_parse),
3495 #endif
3496 	OPT_CALLBACK(0, "affinity", &record.opts, "node|cpu",
3497 		     "Set affinity mask of trace reading thread to NUMA node cpu mask or cpu of processed mmap buffer",
3498 		     record__parse_affinity),
3499 #ifdef HAVE_ZSTD_SUPPORT
3500 	OPT_CALLBACK_OPTARG('z', "compression-level", &record.opts, &comp_level_default, "n",
3501 			    "Compress records using specified level (default: 1 - fastest compression, 22 - greatest compression)",
3502 			    record__parse_comp_level),
3503 #endif
3504 	OPT_CALLBACK(0, "max-size", &record.output_max_size,
3505 		     "size", "Limit the maximum size of the output file", parse_output_max_size),
3506 	OPT_UINTEGER(0, "num-thread-synthesize",
3507 		     &record.opts.nr_threads_synthesize,
3508 		     "number of threads to run for event synthesis"),
3509 #ifdef HAVE_LIBPFM
3510 	OPT_CALLBACK(0, "pfm-events", &record.evlist, "event",
3511 		"libpfm4 event selector. use 'perf list' to list available events",
3512 		parse_libpfm_events_option),
3513 #endif
3514 	OPT_CALLBACK(0, "control", &record.opts, "fd:ctl-fd[,ack-fd] or fifo:ctl-fifo[,ack-fifo]",
3515 		     "Listen on ctl-fd descriptor for command to control measurement ('enable': enable events, 'disable': disable events,\n"
3516 		     "\t\t\t  'snapshot': AUX area tracing snapshot).\n"
3517 		     "\t\t\t  Optionally send control command completion ('ack\\n') to ack-fd descriptor.\n"
3518 		     "\t\t\t  Alternatively, ctl-fifo / ack-fifo will be opened and used as ctl-fd / ack-fd.",
3519 		      parse_control_option),
3520 	OPT_CALLBACK(0, "synth", &record.opts, "no|all|task|mmap|cgroup",
3521 		     "Fine-tune event synthesis: default=all", parse_record_synth_option),
3522 	OPT_STRING_OPTARG_SET(0, "debuginfod", &record.debuginfod.urls,
3523 			  &record.debuginfod.set, "debuginfod urls",
3524 			  "Enable debuginfod data retrieval from DEBUGINFOD_URLS or specified urls",
3525 			  "system"),
3526 	OPT_CALLBACK_OPTARG(0, "threads", &record.opts, NULL, "spec",
3527 			    "write collected trace data into several data files using parallel threads",
3528 			    record__parse_threads),
3529 	OPT_BOOLEAN(0, "off-cpu", &record.off_cpu, "Enable off-cpu analysis"),
3530 	OPT_END()
3531 };
3532 
3533 struct option *record_options = __record_options;
3534 
3535 static int record__mmap_cpu_mask_init(struct mmap_cpu_mask *mask, struct perf_cpu_map *cpus)
3536 {
3537 	struct perf_cpu cpu;
3538 	int idx;
3539 
3540 	if (cpu_map__is_dummy(cpus))
3541 		return 0;
3542 
3543 	perf_cpu_map__for_each_cpu(cpu, idx, cpus) {
3544 		if (cpu.cpu == -1)
3545 			continue;
3546 		/* Return ENODEV is input cpu is greater than max cpu */
3547 		if ((unsigned long)cpu.cpu > mask->nbits)
3548 			return -ENODEV;
3549 		set_bit(cpu.cpu, mask->bits);
3550 	}
3551 
3552 	return 0;
3553 }
3554 
3555 static int record__mmap_cpu_mask_init_spec(struct mmap_cpu_mask *mask, const char *mask_spec)
3556 {
3557 	struct perf_cpu_map *cpus;
3558 
3559 	cpus = perf_cpu_map__new(mask_spec);
3560 	if (!cpus)
3561 		return -ENOMEM;
3562 
3563 	bitmap_zero(mask->bits, mask->nbits);
3564 	if (record__mmap_cpu_mask_init(mask, cpus))
3565 		return -ENODEV;
3566 
3567 	perf_cpu_map__put(cpus);
3568 
3569 	return 0;
3570 }
3571 
3572 static void record__free_thread_masks(struct record *rec, int nr_threads)
3573 {
3574 	int t;
3575 
3576 	if (rec->thread_masks)
3577 		for (t = 0; t < nr_threads; t++)
3578 			record__thread_mask_free(&rec->thread_masks[t]);
3579 
3580 	zfree(&rec->thread_masks);
3581 }
3582 
3583 static int record__alloc_thread_masks(struct record *rec, int nr_threads, int nr_bits)
3584 {
3585 	int t, ret;
3586 
3587 	rec->thread_masks = zalloc(nr_threads * sizeof(*(rec->thread_masks)));
3588 	if (!rec->thread_masks) {
3589 		pr_err("Failed to allocate thread masks\n");
3590 		return -ENOMEM;
3591 	}
3592 
3593 	for (t = 0; t < nr_threads; t++) {
3594 		ret = record__thread_mask_alloc(&rec->thread_masks[t], nr_bits);
3595 		if (ret) {
3596 			pr_err("Failed to allocate thread masks[%d]\n", t);
3597 			goto out_free;
3598 		}
3599 	}
3600 
3601 	return 0;
3602 
3603 out_free:
3604 	record__free_thread_masks(rec, nr_threads);
3605 
3606 	return ret;
3607 }
3608 
3609 static int record__init_thread_cpu_masks(struct record *rec, struct perf_cpu_map *cpus)
3610 {
3611 	int t, ret, nr_cpus = perf_cpu_map__nr(cpus);
3612 
3613 	ret = record__alloc_thread_masks(rec, nr_cpus, cpu__max_cpu().cpu);
3614 	if (ret)
3615 		return ret;
3616 
3617 	rec->nr_threads = nr_cpus;
3618 	pr_debug("nr_threads: %d\n", rec->nr_threads);
3619 
3620 	for (t = 0; t < rec->nr_threads; t++) {
3621 		set_bit(perf_cpu_map__cpu(cpus, t).cpu, rec->thread_masks[t].maps.bits);
3622 		set_bit(perf_cpu_map__cpu(cpus, t).cpu, rec->thread_masks[t].affinity.bits);
3623 		if (verbose) {
3624 			pr_debug("thread_masks[%d]: ", t);
3625 			mmap_cpu_mask__scnprintf(&rec->thread_masks[t].maps, "maps");
3626 			pr_debug("thread_masks[%d]: ", t);
3627 			mmap_cpu_mask__scnprintf(&rec->thread_masks[t].affinity, "affinity");
3628 		}
3629 	}
3630 
3631 	return 0;
3632 }
3633 
3634 static int record__init_thread_masks_spec(struct record *rec, struct perf_cpu_map *cpus,
3635 					  const char **maps_spec, const char **affinity_spec,
3636 					  u32 nr_spec)
3637 {
3638 	u32 s;
3639 	int ret = 0, t = 0;
3640 	struct mmap_cpu_mask cpus_mask;
3641 	struct thread_mask thread_mask, full_mask, *thread_masks;
3642 
3643 	ret = record__mmap_cpu_mask_alloc(&cpus_mask, cpu__max_cpu().cpu);
3644 	if (ret) {
3645 		pr_err("Failed to allocate CPUs mask\n");
3646 		return ret;
3647 	}
3648 
3649 	ret = record__mmap_cpu_mask_init(&cpus_mask, cpus);
3650 	if (ret) {
3651 		pr_err("Failed to init cpu mask\n");
3652 		goto out_free_cpu_mask;
3653 	}
3654 
3655 	ret = record__thread_mask_alloc(&full_mask, cpu__max_cpu().cpu);
3656 	if (ret) {
3657 		pr_err("Failed to allocate full mask\n");
3658 		goto out_free_cpu_mask;
3659 	}
3660 
3661 	ret = record__thread_mask_alloc(&thread_mask, cpu__max_cpu().cpu);
3662 	if (ret) {
3663 		pr_err("Failed to allocate thread mask\n");
3664 		goto out_free_full_and_cpu_masks;
3665 	}
3666 
3667 	for (s = 0; s < nr_spec; s++) {
3668 		ret = record__mmap_cpu_mask_init_spec(&thread_mask.maps, maps_spec[s]);
3669 		if (ret) {
3670 			pr_err("Failed to initialize maps thread mask\n");
3671 			goto out_free;
3672 		}
3673 		ret = record__mmap_cpu_mask_init_spec(&thread_mask.affinity, affinity_spec[s]);
3674 		if (ret) {
3675 			pr_err("Failed to initialize affinity thread mask\n");
3676 			goto out_free;
3677 		}
3678 
3679 		/* ignore invalid CPUs but do not allow empty masks */
3680 		if (!bitmap_and(thread_mask.maps.bits, thread_mask.maps.bits,
3681 				cpus_mask.bits, thread_mask.maps.nbits)) {
3682 			pr_err("Empty maps mask: %s\n", maps_spec[s]);
3683 			ret = -EINVAL;
3684 			goto out_free;
3685 		}
3686 		if (!bitmap_and(thread_mask.affinity.bits, thread_mask.affinity.bits,
3687 				cpus_mask.bits, thread_mask.affinity.nbits)) {
3688 			pr_err("Empty affinity mask: %s\n", affinity_spec[s]);
3689 			ret = -EINVAL;
3690 			goto out_free;
3691 		}
3692 
3693 		/* do not allow intersection with other masks (full_mask) */
3694 		if (bitmap_intersects(thread_mask.maps.bits, full_mask.maps.bits,
3695 				      thread_mask.maps.nbits)) {
3696 			pr_err("Intersecting maps mask: %s\n", maps_spec[s]);
3697 			ret = -EINVAL;
3698 			goto out_free;
3699 		}
3700 		if (bitmap_intersects(thread_mask.affinity.bits, full_mask.affinity.bits,
3701 				      thread_mask.affinity.nbits)) {
3702 			pr_err("Intersecting affinity mask: %s\n", affinity_spec[s]);
3703 			ret = -EINVAL;
3704 			goto out_free;
3705 		}
3706 
3707 		bitmap_or(full_mask.maps.bits, full_mask.maps.bits,
3708 			  thread_mask.maps.bits, full_mask.maps.nbits);
3709 		bitmap_or(full_mask.affinity.bits, full_mask.affinity.bits,
3710 			  thread_mask.affinity.bits, full_mask.maps.nbits);
3711 
3712 		thread_masks = realloc(rec->thread_masks, (t + 1) * sizeof(struct thread_mask));
3713 		if (!thread_masks) {
3714 			pr_err("Failed to reallocate thread masks\n");
3715 			ret = -ENOMEM;
3716 			goto out_free;
3717 		}
3718 		rec->thread_masks = thread_masks;
3719 		rec->thread_masks[t] = thread_mask;
3720 		if (verbose) {
3721 			pr_debug("thread_masks[%d]: ", t);
3722 			mmap_cpu_mask__scnprintf(&rec->thread_masks[t].maps, "maps");
3723 			pr_debug("thread_masks[%d]: ", t);
3724 			mmap_cpu_mask__scnprintf(&rec->thread_masks[t].affinity, "affinity");
3725 		}
3726 		t++;
3727 		ret = record__thread_mask_alloc(&thread_mask, cpu__max_cpu().cpu);
3728 		if (ret) {
3729 			pr_err("Failed to allocate thread mask\n");
3730 			goto out_free_full_and_cpu_masks;
3731 		}
3732 	}
3733 	rec->nr_threads = t;
3734 	pr_debug("nr_threads: %d\n", rec->nr_threads);
3735 	if (!rec->nr_threads)
3736 		ret = -EINVAL;
3737 
3738 out_free:
3739 	record__thread_mask_free(&thread_mask);
3740 out_free_full_and_cpu_masks:
3741 	record__thread_mask_free(&full_mask);
3742 out_free_cpu_mask:
3743 	record__mmap_cpu_mask_free(&cpus_mask);
3744 
3745 	return ret;
3746 }
3747 
3748 static int record__init_thread_core_masks(struct record *rec, struct perf_cpu_map *cpus)
3749 {
3750 	int ret;
3751 	struct cpu_topology *topo;
3752 
3753 	topo = cpu_topology__new();
3754 	if (!topo) {
3755 		pr_err("Failed to allocate CPU topology\n");
3756 		return -ENOMEM;
3757 	}
3758 
3759 	ret = record__init_thread_masks_spec(rec, cpus, topo->core_cpus_list,
3760 					     topo->core_cpus_list, topo->core_cpus_lists);
3761 	cpu_topology__delete(topo);
3762 
3763 	return ret;
3764 }
3765 
3766 static int record__init_thread_package_masks(struct record *rec, struct perf_cpu_map *cpus)
3767 {
3768 	int ret;
3769 	struct cpu_topology *topo;
3770 
3771 	topo = cpu_topology__new();
3772 	if (!topo) {
3773 		pr_err("Failed to allocate CPU topology\n");
3774 		return -ENOMEM;
3775 	}
3776 
3777 	ret = record__init_thread_masks_spec(rec, cpus, topo->package_cpus_list,
3778 					     topo->package_cpus_list, topo->package_cpus_lists);
3779 	cpu_topology__delete(topo);
3780 
3781 	return ret;
3782 }
3783 
3784 static int record__init_thread_numa_masks(struct record *rec, struct perf_cpu_map *cpus)
3785 {
3786 	u32 s;
3787 	int ret;
3788 	const char **spec;
3789 	struct numa_topology *topo;
3790 
3791 	topo = numa_topology__new();
3792 	if (!topo) {
3793 		pr_err("Failed to allocate NUMA topology\n");
3794 		return -ENOMEM;
3795 	}
3796 
3797 	spec = zalloc(topo->nr * sizeof(char *));
3798 	if (!spec) {
3799 		pr_err("Failed to allocate NUMA spec\n");
3800 		ret = -ENOMEM;
3801 		goto out_delete_topo;
3802 	}
3803 	for (s = 0; s < topo->nr; s++)
3804 		spec[s] = topo->nodes[s].cpus;
3805 
3806 	ret = record__init_thread_masks_spec(rec, cpus, spec, spec, topo->nr);
3807 
3808 	zfree(&spec);
3809 
3810 out_delete_topo:
3811 	numa_topology__delete(topo);
3812 
3813 	return ret;
3814 }
3815 
3816 static int record__init_thread_user_masks(struct record *rec, struct perf_cpu_map *cpus)
3817 {
3818 	int t, ret;
3819 	u32 s, nr_spec = 0;
3820 	char **maps_spec = NULL, **affinity_spec = NULL, **tmp_spec;
3821 	char *user_spec, *spec, *spec_ptr, *mask, *mask_ptr, *dup_mask = NULL;
3822 
3823 	for (t = 0, user_spec = (char *)rec->opts.threads_user_spec; ; t++, user_spec = NULL) {
3824 		spec = strtok_r(user_spec, ":", &spec_ptr);
3825 		if (spec == NULL)
3826 			break;
3827 		pr_debug2("threads_spec[%d]: %s\n", t, spec);
3828 		mask = strtok_r(spec, "/", &mask_ptr);
3829 		if (mask == NULL)
3830 			break;
3831 		pr_debug2("  maps mask: %s\n", mask);
3832 		tmp_spec = realloc(maps_spec, (nr_spec + 1) * sizeof(char *));
3833 		if (!tmp_spec) {
3834 			pr_err("Failed to reallocate maps spec\n");
3835 			ret = -ENOMEM;
3836 			goto out_free;
3837 		}
3838 		maps_spec = tmp_spec;
3839 		maps_spec[nr_spec] = dup_mask = strdup(mask);
3840 		if (!maps_spec[nr_spec]) {
3841 			pr_err("Failed to allocate maps spec[%d]\n", nr_spec);
3842 			ret = -ENOMEM;
3843 			goto out_free;
3844 		}
3845 		mask = strtok_r(NULL, "/", &mask_ptr);
3846 		if (mask == NULL) {
3847 			pr_err("Invalid thread maps or affinity specs\n");
3848 			ret = -EINVAL;
3849 			goto out_free;
3850 		}
3851 		pr_debug2("  affinity mask: %s\n", mask);
3852 		tmp_spec = realloc(affinity_spec, (nr_spec + 1) * sizeof(char *));
3853 		if (!tmp_spec) {
3854 			pr_err("Failed to reallocate affinity spec\n");
3855 			ret = -ENOMEM;
3856 			goto out_free;
3857 		}
3858 		affinity_spec = tmp_spec;
3859 		affinity_spec[nr_spec] = strdup(mask);
3860 		if (!affinity_spec[nr_spec]) {
3861 			pr_err("Failed to allocate affinity spec[%d]\n", nr_spec);
3862 			ret = -ENOMEM;
3863 			goto out_free;
3864 		}
3865 		dup_mask = NULL;
3866 		nr_spec++;
3867 	}
3868 
3869 	ret = record__init_thread_masks_spec(rec, cpus, (const char **)maps_spec,
3870 					     (const char **)affinity_spec, nr_spec);
3871 
3872 out_free:
3873 	free(dup_mask);
3874 	for (s = 0; s < nr_spec; s++) {
3875 		if (maps_spec)
3876 			free(maps_spec[s]);
3877 		if (affinity_spec)
3878 			free(affinity_spec[s]);
3879 	}
3880 	free(affinity_spec);
3881 	free(maps_spec);
3882 
3883 	return ret;
3884 }
3885 
3886 static int record__init_thread_default_masks(struct record *rec, struct perf_cpu_map *cpus)
3887 {
3888 	int ret;
3889 
3890 	ret = record__alloc_thread_masks(rec, 1, cpu__max_cpu().cpu);
3891 	if (ret)
3892 		return ret;
3893 
3894 	if (record__mmap_cpu_mask_init(&rec->thread_masks->maps, cpus))
3895 		return -ENODEV;
3896 
3897 	rec->nr_threads = 1;
3898 
3899 	return 0;
3900 }
3901 
3902 static int record__init_thread_masks(struct record *rec)
3903 {
3904 	int ret = 0;
3905 	struct perf_cpu_map *cpus = rec->evlist->core.all_cpus;
3906 
3907 	if (!record__threads_enabled(rec))
3908 		return record__init_thread_default_masks(rec, cpus);
3909 
3910 	if (evlist__per_thread(rec->evlist)) {
3911 		pr_err("--per-thread option is mutually exclusive to parallel streaming mode.\n");
3912 		return -EINVAL;
3913 	}
3914 
3915 	switch (rec->opts.threads_spec) {
3916 	case THREAD_SPEC__CPU:
3917 		ret = record__init_thread_cpu_masks(rec, cpus);
3918 		break;
3919 	case THREAD_SPEC__CORE:
3920 		ret = record__init_thread_core_masks(rec, cpus);
3921 		break;
3922 	case THREAD_SPEC__PACKAGE:
3923 		ret = record__init_thread_package_masks(rec, cpus);
3924 		break;
3925 	case THREAD_SPEC__NUMA:
3926 		ret = record__init_thread_numa_masks(rec, cpus);
3927 		break;
3928 	case THREAD_SPEC__USER:
3929 		ret = record__init_thread_user_masks(rec, cpus);
3930 		break;
3931 	default:
3932 		break;
3933 	}
3934 
3935 	return ret;
3936 }
3937 
3938 int cmd_record(int argc, const char **argv)
3939 {
3940 	int err;
3941 	struct record *rec = &record;
3942 	char errbuf[BUFSIZ];
3943 
3944 	setlocale(LC_ALL, "");
3945 
3946 #ifndef HAVE_LIBBPF_SUPPORT
3947 # define set_nobuild(s, l, c) set_option_nobuild(record_options, s, l, "NO_LIBBPF=1", c)
3948 	set_nobuild('\0', "clang-path", true);
3949 	set_nobuild('\0', "clang-opt", true);
3950 # undef set_nobuild
3951 #endif
3952 
3953 #ifndef HAVE_BPF_PROLOGUE
3954 # if !defined (HAVE_DWARF_SUPPORT)
3955 #  define REASON  "NO_DWARF=1"
3956 # elif !defined (HAVE_LIBBPF_SUPPORT)
3957 #  define REASON  "NO_LIBBPF=1"
3958 # else
3959 #  define REASON  "this architecture doesn't support BPF prologue"
3960 # endif
3961 # define set_nobuild(s, l, c) set_option_nobuild(record_options, s, l, REASON, c)
3962 	set_nobuild('\0', "vmlinux", true);
3963 # undef set_nobuild
3964 # undef REASON
3965 #endif
3966 
3967 #ifndef HAVE_BPF_SKEL
3968 # define set_nobuild(s, l, m, c) set_option_nobuild(record_options, s, l, m, c)
3969 	set_nobuild('\0', "off-cpu", "no BUILD_BPF_SKEL=1", true);
3970 # undef set_nobuild
3971 #endif
3972 
3973 	rec->opts.affinity = PERF_AFFINITY_SYS;
3974 
3975 	rec->evlist = evlist__new();
3976 	if (rec->evlist == NULL)
3977 		return -ENOMEM;
3978 
3979 	err = perf_config(perf_record_config, rec);
3980 	if (err)
3981 		return err;
3982 
3983 	argc = parse_options(argc, argv, record_options, record_usage,
3984 			    PARSE_OPT_STOP_AT_NON_OPTION);
3985 	if (quiet)
3986 		perf_quiet_option();
3987 
3988 	err = symbol__validate_sym_arguments();
3989 	if (err)
3990 		return err;
3991 
3992 	perf_debuginfod_setup(&record.debuginfod);
3993 
3994 	/* Make system wide (-a) the default target. */
3995 	if (!argc && target__none(&rec->opts.target))
3996 		rec->opts.target.system_wide = true;
3997 
3998 	if (nr_cgroups && !rec->opts.target.system_wide) {
3999 		usage_with_options_msg(record_usage, record_options,
4000 			"cgroup monitoring only available in system-wide mode");
4001 
4002 	}
4003 
4004 	if (rec->buildid_mmap) {
4005 		if (!perf_can_record_build_id()) {
4006 			pr_err("Failed: no support to record build id in mmap events, update your kernel.\n");
4007 			err = -EINVAL;
4008 			goto out_opts;
4009 		}
4010 		pr_debug("Enabling build id in mmap2 events.\n");
4011 		/* Enable mmap build id synthesizing. */
4012 		symbol_conf.buildid_mmap2 = true;
4013 		/* Enable perf_event_attr::build_id bit. */
4014 		rec->opts.build_id = true;
4015 		/* Disable build id cache. */
4016 		rec->no_buildid = true;
4017 	}
4018 
4019 	if (rec->opts.record_cgroup && !perf_can_record_cgroup()) {
4020 		pr_err("Kernel has no cgroup sampling support.\n");
4021 		err = -EINVAL;
4022 		goto out_opts;
4023 	}
4024 
4025 	if (rec->opts.kcore)
4026 		rec->opts.text_poke = true;
4027 
4028 	if (rec->opts.kcore || record__threads_enabled(rec))
4029 		rec->data.is_dir = true;
4030 
4031 	if (record__threads_enabled(rec)) {
4032 		if (rec->opts.affinity != PERF_AFFINITY_SYS) {
4033 			pr_err("--affinity option is mutually exclusive to parallel streaming mode.\n");
4034 			goto out_opts;
4035 		}
4036 		if (record__aio_enabled(rec)) {
4037 			pr_err("Asynchronous streaming mode (--aio) is mutually exclusive to parallel streaming mode.\n");
4038 			goto out_opts;
4039 		}
4040 	}
4041 
4042 	if (rec->opts.comp_level != 0) {
4043 		pr_debug("Compression enabled, disabling build id collection at the end of the session.\n");
4044 		rec->no_buildid = true;
4045 	}
4046 
4047 	if (rec->opts.record_switch_events &&
4048 	    !perf_can_record_switch_events()) {
4049 		ui__error("kernel does not support recording context switch events\n");
4050 		parse_options_usage(record_usage, record_options, "switch-events", 0);
4051 		err = -EINVAL;
4052 		goto out_opts;
4053 	}
4054 
4055 	if (switch_output_setup(rec)) {
4056 		parse_options_usage(record_usage, record_options, "switch-output", 0);
4057 		err = -EINVAL;
4058 		goto out_opts;
4059 	}
4060 
4061 	if (rec->switch_output.time) {
4062 		signal(SIGALRM, alarm_sig_handler);
4063 		alarm(rec->switch_output.time);
4064 	}
4065 
4066 	if (rec->switch_output.num_files) {
4067 		rec->switch_output.filenames = calloc(sizeof(char *),
4068 						      rec->switch_output.num_files);
4069 		if (!rec->switch_output.filenames) {
4070 			err = -EINVAL;
4071 			goto out_opts;
4072 		}
4073 	}
4074 
4075 	if (rec->timestamp_filename && record__threads_enabled(rec)) {
4076 		rec->timestamp_filename = false;
4077 		pr_warning("WARNING: --timestamp-filename option is not available in parallel streaming mode.\n");
4078 	}
4079 
4080 	/*
4081 	 * Allow aliases to facilitate the lookup of symbols for address
4082 	 * filters. Refer to auxtrace_parse_filters().
4083 	 */
4084 	symbol_conf.allow_aliases = true;
4085 
4086 	symbol__init(NULL);
4087 
4088 	err = record__auxtrace_init(rec);
4089 	if (err)
4090 		goto out;
4091 
4092 	if (dry_run)
4093 		goto out;
4094 
4095 	err = bpf__setup_stdout(rec->evlist);
4096 	if (err) {
4097 		bpf__strerror_setup_stdout(rec->evlist, err, errbuf, sizeof(errbuf));
4098 		pr_err("ERROR: Setup BPF stdout failed: %s\n",
4099 			 errbuf);
4100 		goto out;
4101 	}
4102 
4103 	err = -ENOMEM;
4104 
4105 	if (rec->no_buildid_cache || rec->no_buildid) {
4106 		disable_buildid_cache();
4107 	} else if (rec->switch_output.enabled) {
4108 		/*
4109 		 * In 'perf record --switch-output', disable buildid
4110 		 * generation by default to reduce data file switching
4111 		 * overhead. Still generate buildid if they are required
4112 		 * explicitly using
4113 		 *
4114 		 *  perf record --switch-output --no-no-buildid \
4115 		 *              --no-no-buildid-cache
4116 		 *
4117 		 * Following code equals to:
4118 		 *
4119 		 * if ((rec->no_buildid || !rec->no_buildid_set) &&
4120 		 *     (rec->no_buildid_cache || !rec->no_buildid_cache_set))
4121 		 *         disable_buildid_cache();
4122 		 */
4123 		bool disable = true;
4124 
4125 		if (rec->no_buildid_set && !rec->no_buildid)
4126 			disable = false;
4127 		if (rec->no_buildid_cache_set && !rec->no_buildid_cache)
4128 			disable = false;
4129 		if (disable) {
4130 			rec->no_buildid = true;
4131 			rec->no_buildid_cache = true;
4132 			disable_buildid_cache();
4133 		}
4134 	}
4135 
4136 	if (record.opts.overwrite)
4137 		record.opts.tail_synthesize = true;
4138 
4139 	if (rec->evlist->core.nr_entries == 0) {
4140 		if (perf_pmu__has_hybrid()) {
4141 			err = evlist__add_default_hybrid(rec->evlist,
4142 							 !record.opts.no_samples);
4143 		} else {
4144 			err = __evlist__add_default(rec->evlist,
4145 						    !record.opts.no_samples);
4146 		}
4147 
4148 		if (err < 0) {
4149 			pr_err("Not enough memory for event selector list\n");
4150 			goto out;
4151 		}
4152 	}
4153 
4154 	if (rec->opts.target.tid && !rec->opts.no_inherit_set)
4155 		rec->opts.no_inherit = true;
4156 
4157 	err = target__validate(&rec->opts.target);
4158 	if (err) {
4159 		target__strerror(&rec->opts.target, err, errbuf, BUFSIZ);
4160 		ui__warning("%s\n", errbuf);
4161 	}
4162 
4163 	err = target__parse_uid(&rec->opts.target);
4164 	if (err) {
4165 		int saved_errno = errno;
4166 
4167 		target__strerror(&rec->opts.target, err, errbuf, BUFSIZ);
4168 		ui__error("%s", errbuf);
4169 
4170 		err = -saved_errno;
4171 		goto out;
4172 	}
4173 
4174 	/* Enable ignoring missing threads when -u/-p option is defined. */
4175 	rec->opts.ignore_missing_thread = rec->opts.target.uid != UINT_MAX || rec->opts.target.pid;
4176 
4177 	if (evlist__fix_hybrid_cpus(rec->evlist, rec->opts.target.cpu_list)) {
4178 		pr_err("failed to use cpu list %s\n",
4179 		       rec->opts.target.cpu_list);
4180 		goto out;
4181 	}
4182 
4183 	rec->opts.target.hybrid = perf_pmu__has_hybrid();
4184 
4185 	if (callchain_param.enabled && callchain_param.record_mode == CALLCHAIN_FP)
4186 		arch__add_leaf_frame_record_opts(&rec->opts);
4187 
4188 	err = -ENOMEM;
4189 	if (evlist__create_maps(rec->evlist, &rec->opts.target) < 0) {
4190 		if (rec->opts.target.pid != NULL) {
4191 			pr_err("Couldn't create thread/CPU maps: %s\n",
4192 				errno == ENOENT ? "No such process" : str_error_r(errno, errbuf, sizeof(errbuf)));
4193 			goto out;
4194 		}
4195 		else
4196 			usage_with_options(record_usage, record_options);
4197 	}
4198 
4199 	err = auxtrace_record__options(rec->itr, rec->evlist, &rec->opts);
4200 	if (err)
4201 		goto out;
4202 
4203 	/*
4204 	 * We take all buildids when the file contains
4205 	 * AUX area tracing data because we do not decode the
4206 	 * trace because it would take too long.
4207 	 */
4208 	if (rec->opts.full_auxtrace)
4209 		rec->buildid_all = true;
4210 
4211 	if (rec->opts.text_poke) {
4212 		err = record__config_text_poke(rec->evlist);
4213 		if (err) {
4214 			pr_err("record__config_text_poke failed, error %d\n", err);
4215 			goto out;
4216 		}
4217 	}
4218 
4219 	if (rec->off_cpu) {
4220 		err = record__config_off_cpu(rec);
4221 		if (err) {
4222 			pr_err("record__config_off_cpu failed, error %d\n", err);
4223 			goto out;
4224 		}
4225 	}
4226 
4227 	if (record_opts__config(&rec->opts)) {
4228 		err = -EINVAL;
4229 		goto out;
4230 	}
4231 
4232 	err = record__init_thread_masks(rec);
4233 	if (err) {
4234 		pr_err("Failed to initialize parallel data streaming masks\n");
4235 		goto out;
4236 	}
4237 
4238 	if (rec->opts.nr_cblocks > nr_cblocks_max)
4239 		rec->opts.nr_cblocks = nr_cblocks_max;
4240 	pr_debug("nr_cblocks: %d\n", rec->opts.nr_cblocks);
4241 
4242 	pr_debug("affinity: %s\n", affinity_tags[rec->opts.affinity]);
4243 	pr_debug("mmap flush: %d\n", rec->opts.mmap_flush);
4244 
4245 	if (rec->opts.comp_level > comp_level_max)
4246 		rec->opts.comp_level = comp_level_max;
4247 	pr_debug("comp level: %d\n", rec->opts.comp_level);
4248 
4249 	err = __cmd_record(&record, argc, argv);
4250 out:
4251 	evlist__delete(rec->evlist);
4252 	symbol__exit();
4253 	auxtrace_record__free(rec->itr);
4254 out_opts:
4255 	record__free_thread_masks(rec, rec->nr_threads);
4256 	rec->nr_threads = 0;
4257 	evlist__close_control(rec->opts.ctl_fd, rec->opts.ctl_fd_ack, &rec->opts.ctl_fd_close);
4258 	return err;
4259 }
4260 
4261 static void snapshot_sig_handler(int sig __maybe_unused)
4262 {
4263 	struct record *rec = &record;
4264 
4265 	hit_auxtrace_snapshot_trigger(rec);
4266 
4267 	if (switch_output_signal(rec))
4268 		trigger_hit(&switch_output_trigger);
4269 }
4270 
4271 static void alarm_sig_handler(int sig __maybe_unused)
4272 {
4273 	struct record *rec = &record;
4274 
4275 	if (switch_output_time(rec))
4276 		trigger_hit(&switch_output_trigger);
4277 }
4278