xref: /openbmc/linux/tools/perf/builtin-record.c (revision 8ac3b5cd)
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * builtin-record.c
4  *
5  * Builtin record command: Record the profile of a workload
6  * (or a CPU, or a PID) into the perf.data output file - for
7  * later analysis via perf report.
8  */
9 #include "builtin.h"
10 
11 #include "util/build-id.h"
12 #include <subcmd/parse-options.h>
13 #include <internal/xyarray.h>
14 #include "util/parse-events.h"
15 #include "util/config.h"
16 
17 #include "util/callchain.h"
18 #include "util/cgroup.h"
19 #include "util/header.h"
20 #include "util/event.h"
21 #include "util/evlist.h"
22 #include "util/evsel.h"
23 #include "util/debug.h"
24 #include "util/mmap.h"
25 #include "util/mutex.h"
26 #include "util/target.h"
27 #include "util/session.h"
28 #include "util/tool.h"
29 #include "util/symbol.h"
30 #include "util/record.h"
31 #include "util/cpumap.h"
32 #include "util/thread_map.h"
33 #include "util/data.h"
34 #include "util/perf_regs.h"
35 #include "util/auxtrace.h"
36 #include "util/tsc.h"
37 #include "util/parse-branch-options.h"
38 #include "util/parse-regs-options.h"
39 #include "util/perf_api_probe.h"
40 #include "util/llvm-utils.h"
41 #include "util/bpf-loader.h"
42 #include "util/trigger.h"
43 #include "util/perf-hooks.h"
44 #include "util/cpu-set-sched.h"
45 #include "util/synthetic-events.h"
46 #include "util/time-utils.h"
47 #include "util/units.h"
48 #include "util/bpf-event.h"
49 #include "util/util.h"
50 #include "util/pfm.h"
51 #include "util/clockid.h"
52 #include "util/pmu-hybrid.h"
53 #include "util/evlist-hybrid.h"
54 #include "util/off_cpu.h"
55 #include "asm/bug.h"
56 #include "perf.h"
57 #include "cputopo.h"
58 
59 #include <errno.h>
60 #include <inttypes.h>
61 #include <locale.h>
62 #include <poll.h>
63 #include <pthread.h>
64 #include <unistd.h>
65 #ifndef HAVE_GETTID
66 #include <syscall.h>
67 #endif
68 #include <sched.h>
69 #include <signal.h>
70 #ifdef HAVE_EVENTFD_SUPPORT
71 #include <sys/eventfd.h>
72 #endif
73 #include <sys/mman.h>
74 #include <sys/wait.h>
75 #include <sys/types.h>
76 #include <sys/stat.h>
77 #include <fcntl.h>
78 #include <linux/err.h>
79 #include <linux/string.h>
80 #include <linux/time64.h>
81 #include <linux/zalloc.h>
82 #include <linux/bitmap.h>
83 #include <sys/time.h>
84 
/*
 * Output switching configuration and state.
 *
 * @signal, @size and @time are the three conditions tested by
 * switch_output_signal(), switch_output_size() and switch_output_time().
 */
struct switch_output {
	bool enabled;
	bool signal;		/* switch when the trigger is ready (signal driven) */
	unsigned long size;	/* byte threshold compared with rec->bytes_written */
	unsigned long time;	/* non-zero enables time based switching */
	const char *str;
	bool set;
	char **filenames;
	int num_files;
	int cur_file;
};
96 
97 struct thread_mask {
98 	struct mmap_cpu_mask	maps;
99 	struct mmap_cpu_mask	affinity;
100 };
101 
102 struct record_thread {
103 	pid_t			tid;
104 	struct thread_mask	*mask;
105 	struct {
106 		int		msg[2];
107 		int		ack[2];
108 	} pipes;
109 	struct fdarray		pollfd;
110 	int			ctlfd_pos;
111 	int			nr_mmaps;
112 	struct mmap		**maps;
113 	struct mmap		**overwrite_maps;
114 	struct record		*rec;
115 	unsigned long long	samples;
116 	unsigned long		waking;
117 	u64			bytes_written;
118 	u64			bytes_transferred;
119 	u64			bytes_compressed;
120 };
121 
122 static __thread struct record_thread *thread;
123 
/* Message identifiers; THREAD_MSG__MAX is the number of identifiers. */
enum thread_msg {
	THREAD_MSG__UNDEFINED = 0,
	THREAD_MSG__READY,
	THREAD_MSG__MAX,
};

/* Printable name of each thread_msg value, indexed by the enum. */
static const char *thread_msg_tags[THREAD_MSG__MAX] = {
	[THREAD_MSG__UNDEFINED]	= "UNDEFINED",
	[THREAD_MSG__READY]	= "READY",
};
133 
/* Thread specification kinds; THREAD_SPEC__MAX is the number of kinds. */
enum thread_spec {
	THREAD_SPEC__UNDEFINED = 0,
	THREAD_SPEC__CPU,
	THREAD_SPEC__CORE,
	THREAD_SPEC__PACKAGE,
	THREAD_SPEC__NUMA,
	THREAD_SPEC__USER,
	THREAD_SPEC__MAX,
};

/* Textual tag of each thread_spec value, indexed by the enum. */
static const char *thread_spec_tags[THREAD_SPEC__MAX] = {
	[THREAD_SPEC__UNDEFINED]	= "undefined",
	[THREAD_SPEC__CPU]		= "cpu",
	[THREAD_SPEC__CORE]		= "core",
	[THREAD_SPEC__PACKAGE]		= "package",
	[THREAD_SPEC__NUMA]		= "numa",
	[THREAD_SPEC__USER]		= "user",
};
147 
/* Pairs an evlist pollfd slot with its duplicate in a thread's pollfd array. */
struct pollfd_index_map {
	int evlist_pollfd_index;	/* index into evlist->core.pollfd */
	int thread_pollfd_index;	/* index into record_thread::pollfd */
};
152 
153 struct record {
154 	struct perf_tool	tool;
155 	struct record_opts	opts;
156 	u64			bytes_written;
157 	struct perf_data	data;
158 	struct auxtrace_record	*itr;
159 	struct evlist	*evlist;
160 	struct perf_session	*session;
161 	struct evlist		*sb_evlist;
162 	pthread_t		thread_id;
163 	int			realtime_prio;
164 	bool			switch_output_event_set;
165 	bool			no_buildid;
166 	bool			no_buildid_set;
167 	bool			no_buildid_cache;
168 	bool			no_buildid_cache_set;
169 	bool			buildid_all;
170 	bool			buildid_mmap;
171 	bool			timestamp_filename;
172 	bool			timestamp_boundary;
173 	bool			off_cpu;
174 	struct switch_output	switch_output;
175 	unsigned long long	samples;
176 	unsigned long		output_max_size;	/* = 0: unlimited */
177 	struct perf_debuginfod	debuginfod;
178 	int			nr_threads;
179 	struct thread_mask	*thread_masks;
180 	struct record_thread	*thread_data;
181 	struct pollfd_index_map	*index_map;
182 	size_t			index_map_sz;
183 	size_t			index_map_cnt;
184 };
185 
186 static volatile int done;
187 
188 static volatile int auxtrace_record__snapshot_started;
189 static DEFINE_TRIGGER(auxtrace_snapshot_trigger);
190 static DEFINE_TRIGGER(switch_output_trigger);
191 
192 static const char *affinity_tags[PERF_AFFINITY_MAX] = {
193 	"SYS", "NODE", "CPU"
194 };
195 
#ifndef HAVE_GETTID
/* Fallback used when HAVE_GETTID is not defined: issue the raw system call. */
static inline pid_t gettid(void)
{
	long tid = syscall(__NR_gettid);

	return (pid_t)tid;
}
#endif
202 
203 static int record__threads_enabled(struct record *rec)
204 {
205 	return rec->opts.threads_spec;
206 }
207 
208 static bool switch_output_signal(struct record *rec)
209 {
210 	return rec->switch_output.signal &&
211 	       trigger_is_ready(&switch_output_trigger);
212 }
213 
214 static bool switch_output_size(struct record *rec)
215 {
216 	return rec->switch_output.size &&
217 	       trigger_is_ready(&switch_output_trigger) &&
218 	       (rec->bytes_written >= rec->switch_output.size);
219 }
220 
221 static bool switch_output_time(struct record *rec)
222 {
223 	return rec->switch_output.time &&
224 	       trigger_is_ready(&switch_output_trigger);
225 }
226 
227 static u64 record__bytes_written(struct record *rec)
228 {
229 	int t;
230 	u64 bytes_written = rec->bytes_written;
231 	struct record_thread *thread_data = rec->thread_data;
232 
233 	for (t = 0; t < rec->nr_threads; t++)
234 		bytes_written += thread_data[t].bytes_written;
235 
236 	return bytes_written;
237 }
238 
239 static bool record__output_max_size_exceeded(struct record *rec)
240 {
241 	return rec->output_max_size &&
242 	       (record__bytes_written(rec) >= rec->output_max_size);
243 }
244 
245 static int record__write(struct record *rec, struct mmap *map __maybe_unused,
246 			 void *bf, size_t size)
247 {
248 	struct perf_data_file *file = &rec->session->data->file;
249 
250 	if (map && map->file)
251 		file = map->file;
252 
253 	if (perf_data_file__write(file, bf, size) < 0) {
254 		pr_err("failed to write perf data, error: %m\n");
255 		return -1;
256 	}
257 
258 	if (map && map->file)
259 		thread->bytes_written += size;
260 	else
261 		rec->bytes_written += size;
262 
263 	if (record__output_max_size_exceeded(rec) && !done) {
264 		fprintf(stderr, "[ perf record: perf size limit reached (%" PRIu64 " KB),"
265 				" stopping session ]\n",
266 				record__bytes_written(rec) >> 10);
267 		done = 1;
268 	}
269 
270 	if (switch_output_size(rec))
271 		trigger_hit(&switch_output_trigger);
272 
273 	return 0;
274 }
275 
/* Forward declarations for helpers used before they are defined. */
static int record__aio_enabled(struct record *rec);
static int record__comp_enabled(struct record *rec);
static size_t zstd_compress(struct perf_session *session,
			    struct mmap *map,
			    void *dst, size_t dst_size,
			    void *src, size_t src_size);
280 
281 #ifdef HAVE_AIO_SUPPORT
/*
 * Fill in @cblock and queue it as an asynchronous write of @size bytes from
 * @buf to @trace_fd at offset @off.  Queueing is retried for as long as
 * aio_write() fails with EAGAIN.
 *
 * Returns the result of the last aio_write() call; on a failure other than
 * EAGAIN the control block is marked free (aio_fildes = -1).
 */
static int record__aio_write(struct aiocb *cblock, int trace_fd,
		void *buf, size_t size, off_t off)
{
	int rc;

	cblock->aio_sigevent.sigev_notify = SIGEV_NONE;
	cblock->aio_offset = off;
	cblock->aio_nbytes = size;
	cblock->aio_buf    = buf;
	cblock->aio_fildes = trace_fd;

	while ((rc = aio_write(cblock)) != 0) {
		if (errno == EAGAIN)
			continue;

		cblock->aio_fildes = -1;
		pr_err("failed to queue perf data, error: %m\n");
		break;
	}

	return rc;
}
306 
307 static int record__aio_complete(struct mmap *md, struct aiocb *cblock)
308 {
309 	void *rem_buf;
310 	off_t rem_off;
311 	size_t rem_size;
312 	int rc, aio_errno;
313 	ssize_t aio_ret, written;
314 
315 	aio_errno = aio_error(cblock);
316 	if (aio_errno == EINPROGRESS)
317 		return 0;
318 
319 	written = aio_ret = aio_return(cblock);
320 	if (aio_ret < 0) {
321 		if (aio_errno != EINTR)
322 			pr_err("failed to write perf data, error: %m\n");
323 		written = 0;
324 	}
325 
326 	rem_size = cblock->aio_nbytes - written;
327 
328 	if (rem_size == 0) {
329 		cblock->aio_fildes = -1;
330 		/*
331 		 * md->refcount is incremented in record__aio_pushfn() for
332 		 * every aio write request started in record__aio_push() so
333 		 * decrement it because the request is now complete.
334 		 */
335 		perf_mmap__put(&md->core);
336 		rc = 1;
337 	} else {
338 		/*
339 		 * aio write request may require restart with the
340 		 * reminder if the kernel didn't write whole
341 		 * chunk at once.
342 		 */
343 		rem_off = cblock->aio_offset + written;
344 		rem_buf = (void *)(cblock->aio_buf + written);
345 		record__aio_write(cblock, cblock->aio_fildes,
346 				rem_buf, rem_size, rem_off);
347 		rc = 0;
348 	}
349 
350 	return rc;
351 }
352 
353 static int record__aio_sync(struct mmap *md, bool sync_all)
354 {
355 	struct aiocb **aiocb = md->aio.aiocb;
356 	struct aiocb *cblocks = md->aio.cblocks;
357 	struct timespec timeout = { 0, 1000 * 1000  * 1 }; /* 1ms */
358 	int i, do_suspend;
359 
360 	do {
361 		do_suspend = 0;
362 		for (i = 0; i < md->aio.nr_cblocks; ++i) {
363 			if (cblocks[i].aio_fildes == -1 || record__aio_complete(md, &cblocks[i])) {
364 				if (sync_all)
365 					aiocb[i] = NULL;
366 				else
367 					return i;
368 			} else {
369 				/*
370 				 * Started aio write is not complete yet
371 				 * so it has to be waited before the
372 				 * next allocation.
373 				 */
374 				aiocb[i] = &cblocks[i];
375 				do_suspend = 1;
376 			}
377 		}
378 		if (!do_suspend)
379 			return -1;
380 
381 		while (aio_suspend((const struct aiocb **)aiocb, md->aio.nr_cblocks, &timeout)) {
382 			if (!(errno == EAGAIN || errno == EINTR))
383 				pr_err("failed to sync perf data, error: %m\n");
384 		}
385 	} while (1);
386 }
387 
/* Context handed to record__aio_pushfn() through perf_mmap__push(). */
struct record_aio {
	struct record *rec;
	void *data;	/* destination buffer, one of map->aio.data[] */
	size_t size;	/* bytes already stored in @data */
};
393 
394 static int record__aio_pushfn(struct mmap *map, void *to, void *buf, size_t size)
395 {
396 	struct record_aio *aio = to;
397 
398 	/*
399 	 * map->core.base data pointed by buf is copied into free map->aio.data[] buffer
400 	 * to release space in the kernel buffer as fast as possible, calling
401 	 * perf_mmap__consume() from perf_mmap__push() function.
402 	 *
403 	 * That lets the kernel to proceed with storing more profiling data into
404 	 * the kernel buffer earlier than other per-cpu kernel buffers are handled.
405 	 *
406 	 * Coping can be done in two steps in case the chunk of profiling data
407 	 * crosses the upper bound of the kernel buffer. In this case we first move
408 	 * part of data from map->start till the upper bound and then the reminder
409 	 * from the beginning of the kernel buffer till the end of the data chunk.
410 	 */
411 
412 	if (record__comp_enabled(aio->rec)) {
413 		size = zstd_compress(aio->rec->session, NULL, aio->data + aio->size,
414 				     mmap__mmap_len(map) - aio->size,
415 				     buf, size);
416 	} else {
417 		memcpy(aio->data + aio->size, buf, size);
418 	}
419 
420 	if (!aio->size) {
421 		/*
422 		 * Increment map->refcount to guard map->aio.data[] buffer
423 		 * from premature deallocation because map object can be
424 		 * released earlier than aio write request started on
425 		 * map->aio.data[] buffer is complete.
426 		 *
427 		 * perf_mmap__put() is done at record__aio_complete()
428 		 * after started aio request completion or at record__aio_push()
429 		 * if the request failed to start.
430 		 */
431 		perf_mmap__get(&map->core);
432 	}
433 
434 	aio->size += size;
435 
436 	return size;
437 }
438 
439 static int record__aio_push(struct record *rec, struct mmap *map, off_t *off)
440 {
441 	int ret, idx;
442 	int trace_fd = rec->session->data->file.fd;
443 	struct record_aio aio = { .rec = rec, .size = 0 };
444 
445 	/*
446 	 * Call record__aio_sync() to wait till map->aio.data[] buffer
447 	 * becomes available after previous aio write operation.
448 	 */
449 
450 	idx = record__aio_sync(map, false);
451 	aio.data = map->aio.data[idx];
452 	ret = perf_mmap__push(map, &aio, record__aio_pushfn);
453 	if (ret != 0) /* ret > 0 - no data, ret < 0 - error */
454 		return ret;
455 
456 	rec->samples++;
457 	ret = record__aio_write(&(map->aio.cblocks[idx]), trace_fd, aio.data, aio.size, *off);
458 	if (!ret) {
459 		*off += aio.size;
460 		rec->bytes_written += aio.size;
461 		if (switch_output_size(rec))
462 			trigger_hit(&switch_output_trigger);
463 	} else {
464 		/*
465 		 * Decrement map->refcount incremented in record__aio_pushfn()
466 		 * back if record__aio_write() operation failed to start, otherwise
467 		 * map->refcount is decremented in record__aio_complete() after
468 		 * aio write operation finishes successfully.
469 		 */
470 		perf_mmap__put(&map->core);
471 	}
472 
473 	return ret;
474 }
475 
/* Current file offset of @trace_fd, or the lseek() failure value. */
static off_t record__aio_get_pos(int trace_fd)
{
	off_t pos = lseek(trace_fd, 0, SEEK_CUR);

	return pos;
}
480 
/* Move the file offset of @trace_fd to @pos; the lseek() result is ignored. */
static void record__aio_set_pos(int trace_fd, off_t pos)
{
	(void)lseek(trace_fd, pos, SEEK_SET);
}
485 
486 static void record__aio_mmap_read_sync(struct record *rec)
487 {
488 	int i;
489 	struct evlist *evlist = rec->evlist;
490 	struct mmap *maps = evlist->mmap;
491 
492 	if (!record__aio_enabled(rec))
493 		return;
494 
495 	for (i = 0; i < evlist->core.nr_mmaps; i++) {
496 		struct mmap *map = &maps[i];
497 
498 		if (map->core.base)
499 			record__aio_sync(map, true);
500 	}
501 }
502 
/* Number of aio control blocks used when the option gives no (or a zero) value. */
static int nr_cblocks_default = 1;
/* NOTE(review): presumably the upper bound enforced by the caller - confirm */
static int nr_cblocks_max = 4;
505 
506 static int record__aio_parse(const struct option *opt,
507 			     const char *str,
508 			     int unset)
509 {
510 	struct record_opts *opts = (struct record_opts *)opt->value;
511 
512 	if (unset) {
513 		opts->nr_cblocks = 0;
514 	} else {
515 		if (str)
516 			opts->nr_cblocks = strtol(str, NULL, 0);
517 		if (!opts->nr_cblocks)
518 			opts->nr_cblocks = nr_cblocks_default;
519 	}
520 
521 	return 0;
522 }
523 #else /* HAVE_AIO_SUPPORT */
524 static int nr_cblocks_max = 0;
525 
526 static int record__aio_push(struct record *rec __maybe_unused, struct mmap *map __maybe_unused,
527 			    off_t *off __maybe_unused)
528 {
529 	return -1;
530 }
531 
532 static off_t record__aio_get_pos(int trace_fd __maybe_unused)
533 {
534 	return -1;
535 }
536 
537 static void record__aio_set_pos(int trace_fd __maybe_unused, off_t pos __maybe_unused)
538 {
539 }
540 
541 static void record__aio_mmap_read_sync(struct record *rec __maybe_unused)
542 {
543 }
544 #endif
545 
546 static int record__aio_enabled(struct record *rec)
547 {
548 	return rec->opts.nr_cblocks > 0;
549 }
550 
551 #define MMAP_FLUSH_DEFAULT 1
552 static int record__mmap_flush_parse(const struct option *opt,
553 				    const char *str,
554 				    int unset)
555 {
556 	int flush_max;
557 	struct record_opts *opts = (struct record_opts *)opt->value;
558 	static struct parse_tag tags[] = {
559 			{ .tag  = 'B', .mult = 1       },
560 			{ .tag  = 'K', .mult = 1 << 10 },
561 			{ .tag  = 'M', .mult = 1 << 20 },
562 			{ .tag  = 'G', .mult = 1 << 30 },
563 			{ .tag  = 0 },
564 	};
565 
566 	if (unset)
567 		return 0;
568 
569 	if (str) {
570 		opts->mmap_flush = parse_tag_value(str, tags);
571 		if (opts->mmap_flush == (int)-1)
572 			opts->mmap_flush = strtol(str, NULL, 0);
573 	}
574 
575 	if (!opts->mmap_flush)
576 		opts->mmap_flush = MMAP_FLUSH_DEFAULT;
577 
578 	flush_max = evlist__mmap_size(opts->mmap_pages);
579 	flush_max /= 4;
580 	if (opts->mmap_flush > flush_max)
581 		opts->mmap_flush = flush_max;
582 
583 	return 0;
584 }
585 
#ifdef HAVE_ZSTD_SUPPORT
/* Compression level used when the option gives no (or a zero) value. */
static unsigned int comp_level_default = 1;

/*
 * Option callback storing the compression level in record_opts::comp_level.
 *
 * A negated option stores 0.  Otherwise @str, when given, is converted with
 * strtol(); a resulting level of zero is replaced by comp_level_default.
 * Always returns 0.
 */
static int record__parse_comp_level(const struct option *opt, const char *str, int unset)
{
	struct record_opts *opts = opt->value;

	if (unset) {
		opts->comp_level = 0;
		return 0;
	}

	if (str)
		opts->comp_level = strtol(str, NULL, 0);

	if (opts->comp_level == 0)
		opts->comp_level = comp_level_default;

	return 0;
}
#endif
605 static unsigned int comp_level_max = 22;
606 
607 static int record__comp_enabled(struct record *rec)
608 {
609 	return rec->opts.comp_level > 0;
610 }
611 
612 static int process_synthesized_event(struct perf_tool *tool,
613 				     union perf_event *event,
614 				     struct perf_sample *sample __maybe_unused,
615 				     struct machine *machine __maybe_unused)
616 {
617 	struct record *rec = container_of(tool, struct record, tool);
618 	return record__write(rec, NULL, event, event->header.size);
619 }
620 
621 static struct mutex synth_lock;
622 
623 static int process_locked_synthesized_event(struct perf_tool *tool,
624 				     union perf_event *event,
625 				     struct perf_sample *sample __maybe_unused,
626 				     struct machine *machine __maybe_unused)
627 {
628 	int ret;
629 
630 	mutex_lock(&synth_lock);
631 	ret = process_synthesized_event(tool, event, sample, machine);
632 	mutex_unlock(&synth_lock);
633 	return ret;
634 }
635 
636 static int record__pushfn(struct mmap *map, void *to, void *bf, size_t size)
637 {
638 	struct record *rec = to;
639 
640 	if (record__comp_enabled(rec)) {
641 		size = zstd_compress(rec->session, map, map->data, mmap__mmap_len(map), bf, size);
642 		bf   = map->data;
643 	}
644 
645 	thread->samples++;
646 	return record__write(rec, map, bf, size);
647 }
648 
/* Signal recorded by sig_handler(), -1 while none was received. */
static volatile int signr = -1;
/* Set by sig_handler() on SIGCHLD. */
static volatile int child_finished;
#ifdef HAVE_EVENTFD_SUPPORT
/* Descriptor sig_handler() writes to in order to wake up poll(); -1 when unset. */
static volatile int done_fd = -1;
#endif
654 
655 static void sig_handler(int sig)
656 {
657 	if (sig == SIGCHLD)
658 		child_finished = 1;
659 	else
660 		signr = sig;
661 
662 	done = 1;
663 #ifdef HAVE_EVENTFD_SUPPORT
664 	if (done_fd >= 0) {
665 		u64 tmp = 1;
666 		int orig_errno = errno;
667 
668 		/*
669 		 * It is possible for this signal handler to run after done is
670 		 * checked in the main loop, but before the perf counter fds are
671 		 * polled. If this happens, the poll() will continue to wait
672 		 * even though done is set, and will only break out if either
673 		 * another signal is received, or the counters are ready for
674 		 * read. To ensure the poll() doesn't sleep when done is set,
675 		 * use an eventfd (done_fd) to wake up the poll().
676 		 */
677 		if (write(done_fd, &tmp, sizeof(tmp)) < 0)
678 			pr_err("failed to signal wakeup fd, error: %m\n");
679 
680 		errno = orig_errno;
681 	}
682 #endif // HAVE_EVENTFD_SUPPORT
683 }
684 
/* SIGSEGV handler: run the perf hooks recovery, then dump the stack. */
static void sigsegv_handler(int sig)
{
	perf_hooks__recover();

	sighandler_dump_stack(sig);
}
690 
691 static void record__sig_exit(void)
692 {
693 	if (signr == -1)
694 		return;
695 
696 	signal(signr, SIG_DFL);
697 	raise(signr);
698 }
699 
700 #ifdef HAVE_AUXTRACE_SUPPORT
701 
702 static int record__process_auxtrace(struct perf_tool *tool,
703 				    struct mmap *map,
704 				    union perf_event *event, void *data1,
705 				    size_t len1, void *data2, size_t len2)
706 {
707 	struct record *rec = container_of(tool, struct record, tool);
708 	struct perf_data *data = &rec->data;
709 	size_t padding;
710 	u8 pad[8] = {0};
711 
712 	if (!perf_data__is_pipe(data) && perf_data__is_single_file(data)) {
713 		off_t file_offset;
714 		int fd = perf_data__fd(data);
715 		int err;
716 
717 		file_offset = lseek(fd, 0, SEEK_CUR);
718 		if (file_offset == -1)
719 			return -1;
720 		err = auxtrace_index__auxtrace_event(&rec->session->auxtrace_index,
721 						     event, file_offset);
722 		if (err)
723 			return err;
724 	}
725 
726 	/* event.auxtrace.size includes padding, see __auxtrace_mmap__read() */
727 	padding = (len1 + len2) & 7;
728 	if (padding)
729 		padding = 8 - padding;
730 
731 	record__write(rec, map, event, event->header.size);
732 	record__write(rec, map, data1, len1);
733 	if (len2)
734 		record__write(rec, map, data2, len2);
735 	record__write(rec, map, &pad, padding);
736 
737 	return 0;
738 }
739 
740 static int record__auxtrace_mmap_read(struct record *rec,
741 				      struct mmap *map)
742 {
743 	int ret;
744 
745 	ret = auxtrace_mmap__read(map, rec->itr, &rec->tool,
746 				  record__process_auxtrace);
747 	if (ret < 0)
748 		return ret;
749 
750 	if (ret)
751 		rec->samples++;
752 
753 	return 0;
754 }
755 
756 static int record__auxtrace_mmap_read_snapshot(struct record *rec,
757 					       struct mmap *map)
758 {
759 	int ret;
760 
761 	ret = auxtrace_mmap__read_snapshot(map, rec->itr, &rec->tool,
762 					   record__process_auxtrace,
763 					   rec->opts.auxtrace_snapshot_size);
764 	if (ret < 0)
765 		return ret;
766 
767 	if (ret)
768 		rec->samples++;
769 
770 	return 0;
771 }
772 
773 static int record__auxtrace_read_snapshot_all(struct record *rec)
774 {
775 	int i;
776 	int rc = 0;
777 
778 	for (i = 0; i < rec->evlist->core.nr_mmaps; i++) {
779 		struct mmap *map = &rec->evlist->mmap[i];
780 
781 		if (!map->auxtrace_mmap.base)
782 			continue;
783 
784 		if (record__auxtrace_mmap_read_snapshot(rec, map) != 0) {
785 			rc = -1;
786 			goto out;
787 		}
788 	}
789 out:
790 	return rc;
791 }
792 
793 static void record__read_auxtrace_snapshot(struct record *rec, bool on_exit)
794 {
795 	pr_debug("Recording AUX area tracing snapshot\n");
796 	if (record__auxtrace_read_snapshot_all(rec) < 0) {
797 		trigger_error(&auxtrace_snapshot_trigger);
798 	} else {
799 		if (auxtrace_record__snapshot_finish(rec->itr, on_exit))
800 			trigger_error(&auxtrace_snapshot_trigger);
801 		else
802 			trigger_ready(&auxtrace_snapshot_trigger);
803 	}
804 }
805 
806 static int record__auxtrace_snapshot_exit(struct record *rec)
807 {
808 	if (trigger_is_error(&auxtrace_snapshot_trigger))
809 		return 0;
810 
811 	if (!auxtrace_record__snapshot_started &&
812 	    auxtrace_record__snapshot_start(rec->itr))
813 		return -1;
814 
815 	record__read_auxtrace_snapshot(rec, true);
816 	if (trigger_is_error(&auxtrace_snapshot_trigger))
817 		return -1;
818 
819 	return 0;
820 }
821 
822 static int record__auxtrace_init(struct record *rec)
823 {
824 	int err;
825 
826 	if ((rec->opts.auxtrace_snapshot_opts || rec->opts.auxtrace_sample_opts)
827 	    && record__threads_enabled(rec)) {
828 		pr_err("AUX area tracing options are not available in parallel streaming mode.\n");
829 		return -EINVAL;
830 	}
831 
832 	if (!rec->itr) {
833 		rec->itr = auxtrace_record__init(rec->evlist, &err);
834 		if (err)
835 			return err;
836 	}
837 
838 	err = auxtrace_parse_snapshot_options(rec->itr, &rec->opts,
839 					      rec->opts.auxtrace_snapshot_opts);
840 	if (err)
841 		return err;
842 
843 	err = auxtrace_parse_sample_options(rec->itr, rec->evlist, &rec->opts,
844 					    rec->opts.auxtrace_sample_opts);
845 	if (err)
846 		return err;
847 
848 	auxtrace_regroup_aux_output(rec->evlist);
849 
850 	return auxtrace_parse_filters(rec->evlist);
851 }
852 
853 #else
854 
855 static inline
856 int record__auxtrace_mmap_read(struct record *rec __maybe_unused,
857 			       struct mmap *map __maybe_unused)
858 {
859 	return 0;
860 }
861 
862 static inline
863 void record__read_auxtrace_snapshot(struct record *rec __maybe_unused,
864 				    bool on_exit __maybe_unused)
865 {
866 }
867 
868 static inline
869 int auxtrace_record__snapshot_start(struct auxtrace_record *itr __maybe_unused)
870 {
871 	return 0;
872 }
873 
874 static inline
875 int record__auxtrace_snapshot_exit(struct record *rec __maybe_unused)
876 {
877 	return 0;
878 }
879 
880 static int record__auxtrace_init(struct record *rec __maybe_unused)
881 {
882 	return 0;
883 }
884 
885 #endif
886 
887 static int record__config_text_poke(struct evlist *evlist)
888 {
889 	struct evsel *evsel;
890 
891 	/* Nothing to do if text poke is already configured */
892 	evlist__for_each_entry(evlist, evsel) {
893 		if (evsel->core.attr.text_poke)
894 			return 0;
895 	}
896 
897 	evsel = evlist__add_dummy_on_all_cpus(evlist);
898 	if (!evsel)
899 		return -ENOMEM;
900 
901 	evsel->core.attr.text_poke = 1;
902 	evsel->core.attr.ksymbol = 1;
903 	evsel->immediate = true;
904 	evsel__set_sample_bit(evsel, TIME);
905 
906 	return 0;
907 }
908 
909 static int record__config_off_cpu(struct record *rec)
910 {
911 	return off_cpu_prepare(rec->evlist, &rec->opts.target, &rec->opts);
912 }
913 
914 static bool record__kcore_readable(struct machine *machine)
915 {
916 	char kcore[PATH_MAX];
917 	int fd;
918 
919 	scnprintf(kcore, sizeof(kcore), "%s/proc/kcore", machine->root_dir);
920 
921 	fd = open(kcore, O_RDONLY);
922 	if (fd < 0)
923 		return false;
924 
925 	close(fd);
926 
927 	return true;
928 }
929 
930 static int record__kcore_copy(struct machine *machine, struct perf_data *data)
931 {
932 	char from_dir[PATH_MAX];
933 	char kcore_dir[PATH_MAX];
934 	int ret;
935 
936 	snprintf(from_dir, sizeof(from_dir), "%s/proc", machine->root_dir);
937 
938 	ret = perf_data__make_kcore_dir(data, kcore_dir, sizeof(kcore_dir));
939 	if (ret)
940 		return ret;
941 
942 	return kcore_copy(from_dir, kcore_dir);
943 }
944 
945 static void record__thread_data_init_pipes(struct record_thread *thread_data)
946 {
947 	thread_data->pipes.msg[0] = -1;
948 	thread_data->pipes.msg[1] = -1;
949 	thread_data->pipes.ack[0] = -1;
950 	thread_data->pipes.ack[1] = -1;
951 }
952 
953 static int record__thread_data_open_pipes(struct record_thread *thread_data)
954 {
955 	if (pipe(thread_data->pipes.msg))
956 		return -EINVAL;
957 
958 	if (pipe(thread_data->pipes.ack)) {
959 		close(thread_data->pipes.msg[0]);
960 		thread_data->pipes.msg[0] = -1;
961 		close(thread_data->pipes.msg[1]);
962 		thread_data->pipes.msg[1] = -1;
963 		return -EINVAL;
964 	}
965 
966 	pr_debug2("thread_data[%p]: msg=[%d,%d], ack=[%d,%d]\n", thread_data,
967 		 thread_data->pipes.msg[0], thread_data->pipes.msg[1],
968 		 thread_data->pipes.ack[0], thread_data->pipes.ack[1]);
969 
970 	return 0;
971 }
972 
973 static void record__thread_data_close_pipes(struct record_thread *thread_data)
974 {
975 	if (thread_data->pipes.msg[0] != -1) {
976 		close(thread_data->pipes.msg[0]);
977 		thread_data->pipes.msg[0] = -1;
978 	}
979 	if (thread_data->pipes.msg[1] != -1) {
980 		close(thread_data->pipes.msg[1]);
981 		thread_data->pipes.msg[1] = -1;
982 	}
983 	if (thread_data->pipes.ack[0] != -1) {
984 		close(thread_data->pipes.ack[0]);
985 		thread_data->pipes.ack[0] = -1;
986 	}
987 	if (thread_data->pipes.ack[1] != -1) {
988 		close(thread_data->pipes.ack[1]);
989 		thread_data->pipes.ack[1] = -1;
990 	}
991 }
992 
993 static bool evlist__per_thread(struct evlist *evlist)
994 {
995 	return cpu_map__is_dummy(evlist->core.user_requested_cpus);
996 }
997 
998 static int record__thread_data_init_maps(struct record_thread *thread_data, struct evlist *evlist)
999 {
1000 	int m, tm, nr_mmaps = evlist->core.nr_mmaps;
1001 	struct mmap *mmap = evlist->mmap;
1002 	struct mmap *overwrite_mmap = evlist->overwrite_mmap;
1003 	struct perf_cpu_map *cpus = evlist->core.all_cpus;
1004 	bool per_thread = evlist__per_thread(evlist);
1005 
1006 	if (per_thread)
1007 		thread_data->nr_mmaps = nr_mmaps;
1008 	else
1009 		thread_data->nr_mmaps = bitmap_weight(thread_data->mask->maps.bits,
1010 						      thread_data->mask->maps.nbits);
1011 	if (mmap) {
1012 		thread_data->maps = zalloc(thread_data->nr_mmaps * sizeof(struct mmap *));
1013 		if (!thread_data->maps)
1014 			return -ENOMEM;
1015 	}
1016 	if (overwrite_mmap) {
1017 		thread_data->overwrite_maps = zalloc(thread_data->nr_mmaps * sizeof(struct mmap *));
1018 		if (!thread_data->overwrite_maps) {
1019 			zfree(&thread_data->maps);
1020 			return -ENOMEM;
1021 		}
1022 	}
1023 	pr_debug2("thread_data[%p]: nr_mmaps=%d, maps=%p, ow_maps=%p\n", thread_data,
1024 		 thread_data->nr_mmaps, thread_data->maps, thread_data->overwrite_maps);
1025 
1026 	for (m = 0, tm = 0; m < nr_mmaps && tm < thread_data->nr_mmaps; m++) {
1027 		if (per_thread ||
1028 		    test_bit(perf_cpu_map__cpu(cpus, m).cpu, thread_data->mask->maps.bits)) {
1029 			if (thread_data->maps) {
1030 				thread_data->maps[tm] = &mmap[m];
1031 				pr_debug2("thread_data[%p]: cpu%d: maps[%d] -> mmap[%d]\n",
1032 					  thread_data, perf_cpu_map__cpu(cpus, m).cpu, tm, m);
1033 			}
1034 			if (thread_data->overwrite_maps) {
1035 				thread_data->overwrite_maps[tm] = &overwrite_mmap[m];
1036 				pr_debug2("thread_data[%p]: cpu%d: ow_maps[%d] -> ow_mmap[%d]\n",
1037 					  thread_data, perf_cpu_map__cpu(cpus, m).cpu, tm, m);
1038 			}
1039 			tm++;
1040 		}
1041 	}
1042 
1043 	return 0;
1044 }
1045 
1046 static int record__thread_data_init_pollfd(struct record_thread *thread_data, struct evlist *evlist)
1047 {
1048 	int f, tm, pos;
1049 	struct mmap *map, *overwrite_map;
1050 
1051 	fdarray__init(&thread_data->pollfd, 64);
1052 
1053 	for (tm = 0; tm < thread_data->nr_mmaps; tm++) {
1054 		map = thread_data->maps ? thread_data->maps[tm] : NULL;
1055 		overwrite_map = thread_data->overwrite_maps ?
1056 				thread_data->overwrite_maps[tm] : NULL;
1057 
1058 		for (f = 0; f < evlist->core.pollfd.nr; f++) {
1059 			void *ptr = evlist->core.pollfd.priv[f].ptr;
1060 
1061 			if ((map && ptr == map) || (overwrite_map && ptr == overwrite_map)) {
1062 				pos = fdarray__dup_entry_from(&thread_data->pollfd, f,
1063 							      &evlist->core.pollfd);
1064 				if (pos < 0)
1065 					return pos;
1066 				pr_debug2("thread_data[%p]: pollfd[%d] <- event_fd=%d\n",
1067 					 thread_data, pos, evlist->core.pollfd.entries[f].fd);
1068 			}
1069 		}
1070 	}
1071 
1072 	return 0;
1073 }
1074 
1075 static void record__free_thread_data(struct record *rec)
1076 {
1077 	int t;
1078 	struct record_thread *thread_data = rec->thread_data;
1079 
1080 	if (thread_data == NULL)
1081 		return;
1082 
1083 	for (t = 0; t < rec->nr_threads; t++) {
1084 		record__thread_data_close_pipes(&thread_data[t]);
1085 		zfree(&thread_data[t].maps);
1086 		zfree(&thread_data[t].overwrite_maps);
1087 		fdarray__exit(&thread_data[t].pollfd);
1088 	}
1089 
1090 	zfree(&rec->thread_data);
1091 }
1092 
1093 static int record__map_thread_evlist_pollfd_indexes(struct record *rec,
1094 						    int evlist_pollfd_index,
1095 						    int thread_pollfd_index)
1096 {
1097 	size_t x = rec->index_map_cnt;
1098 
1099 	if (realloc_array_as_needed(rec->index_map, rec->index_map_sz, x, NULL))
1100 		return -ENOMEM;
1101 	rec->index_map[x].evlist_pollfd_index = evlist_pollfd_index;
1102 	rec->index_map[x].thread_pollfd_index = thread_pollfd_index;
1103 	rec->index_map_cnt += 1;
1104 	return 0;
1105 }
1106 
1107 static int record__update_evlist_pollfd_from_thread(struct record *rec,
1108 						    struct evlist *evlist,
1109 						    struct record_thread *thread_data)
1110 {
1111 	struct pollfd *e_entries = evlist->core.pollfd.entries;
1112 	struct pollfd *t_entries = thread_data->pollfd.entries;
1113 	int err = 0;
1114 	size_t i;
1115 
1116 	for (i = 0; i < rec->index_map_cnt; i++) {
1117 		int e_pos = rec->index_map[i].evlist_pollfd_index;
1118 		int t_pos = rec->index_map[i].thread_pollfd_index;
1119 
1120 		if (e_entries[e_pos].fd != t_entries[t_pos].fd ||
1121 		    e_entries[e_pos].events != t_entries[t_pos].events) {
1122 			pr_err("Thread and evlist pollfd index mismatch\n");
1123 			err = -EINVAL;
1124 			continue;
1125 		}
1126 		e_entries[e_pos].revents = t_entries[t_pos].revents;
1127 	}
1128 	return err;
1129 }
1130 
1131 static int record__dup_non_perf_events(struct record *rec,
1132 				       struct evlist *evlist,
1133 				       struct record_thread *thread_data)
1134 {
1135 	struct fdarray *fda = &evlist->core.pollfd;
1136 	int i, ret;
1137 
1138 	for (i = 0; i < fda->nr; i++) {
1139 		if (!(fda->priv[i].flags & fdarray_flag__non_perf_event))
1140 			continue;
1141 		ret = fdarray__dup_entry_from(&thread_data->pollfd, i, fda);
1142 		if (ret < 0) {
1143 			pr_err("Failed to duplicate descriptor in main thread pollfd\n");
1144 			return ret;
1145 		}
1146 		pr_debug2("thread_data[%p]: pollfd[%d] <- non_perf_event fd=%d\n",
1147 			  thread_data, ret, fda->entries[i].fd);
1148 		ret = record__map_thread_evlist_pollfd_indexes(rec, i, ret);
1149 		if (ret < 0) {
1150 			pr_err("Failed to map thread and evlist pollfd indexes\n");
1151 			return ret;
1152 		}
1153 	}
1154 	return 0;
1155 }
1156 
1157 static int record__alloc_thread_data(struct record *rec, struct evlist *evlist)
1158 {
1159 	int t, ret;
1160 	struct record_thread *thread_data;
1161 
1162 	rec->thread_data = zalloc(rec->nr_threads * sizeof(*(rec->thread_data)));
1163 	if (!rec->thread_data) {
1164 		pr_err("Failed to allocate thread data\n");
1165 		return -ENOMEM;
1166 	}
1167 	thread_data = rec->thread_data;
1168 
1169 	for (t = 0; t < rec->nr_threads; t++)
1170 		record__thread_data_init_pipes(&thread_data[t]);
1171 
1172 	for (t = 0; t < rec->nr_threads; t++) {
1173 		thread_data[t].rec = rec;
1174 		thread_data[t].mask = &rec->thread_masks[t];
1175 		ret = record__thread_data_init_maps(&thread_data[t], evlist);
1176 		if (ret) {
1177 			pr_err("Failed to initialize thread[%d] maps\n", t);
1178 			goto out_free;
1179 		}
1180 		ret = record__thread_data_init_pollfd(&thread_data[t], evlist);
1181 		if (ret) {
1182 			pr_err("Failed to initialize thread[%d] pollfd\n", t);
1183 			goto out_free;
1184 		}
1185 		if (t) {
1186 			thread_data[t].tid = -1;
1187 			ret = record__thread_data_open_pipes(&thread_data[t]);
1188 			if (ret) {
1189 				pr_err("Failed to open thread[%d] communication pipes\n", t);
1190 				goto out_free;
1191 			}
1192 			ret = fdarray__add(&thread_data[t].pollfd, thread_data[t].pipes.msg[0],
1193 					   POLLIN | POLLERR | POLLHUP, fdarray_flag__nonfilterable);
1194 			if (ret < 0) {
1195 				pr_err("Failed to add descriptor to thread[%d] pollfd\n", t);
1196 				goto out_free;
1197 			}
1198 			thread_data[t].ctlfd_pos = ret;
1199 			pr_debug2("thread_data[%p]: pollfd[%d] <- ctl_fd=%d\n",
1200 				 thread_data, thread_data[t].ctlfd_pos,
1201 				 thread_data[t].pipes.msg[0]);
1202 		} else {
1203 			thread_data[t].tid = gettid();
1204 
1205 			ret = record__dup_non_perf_events(rec, evlist, &thread_data[t]);
1206 			if (ret < 0)
1207 				goto out_free;
1208 
1209 			thread_data[t].ctlfd_pos = -1; /* Not used */
1210 		}
1211 	}
1212 
1213 	return 0;
1214 
1215 out_free:
1216 	record__free_thread_data(rec);
1217 
1218 	return ret;
1219 }
1220 
1221 static int record__mmap_evlist(struct record *rec,
1222 			       struct evlist *evlist)
1223 {
1224 	int i, ret;
1225 	struct record_opts *opts = &rec->opts;
1226 	bool auxtrace_overwrite = opts->auxtrace_snapshot_mode ||
1227 				  opts->auxtrace_sample_mode;
1228 	char msg[512];
1229 
1230 	if (opts->affinity != PERF_AFFINITY_SYS)
1231 		cpu__setup_cpunode_map();
1232 
1233 	if (evlist__mmap_ex(evlist, opts->mmap_pages,
1234 				 opts->auxtrace_mmap_pages,
1235 				 auxtrace_overwrite,
1236 				 opts->nr_cblocks, opts->affinity,
1237 				 opts->mmap_flush, opts->comp_level) < 0) {
1238 		if (errno == EPERM) {
1239 			pr_err("Permission error mapping pages.\n"
1240 			       "Consider increasing "
1241 			       "/proc/sys/kernel/perf_event_mlock_kb,\n"
1242 			       "or try again with a smaller value of -m/--mmap_pages.\n"
1243 			       "(current value: %u,%u)\n",
1244 			       opts->mmap_pages, opts->auxtrace_mmap_pages);
1245 			return -errno;
1246 		} else {
1247 			pr_err("failed to mmap with %d (%s)\n", errno,
1248 				str_error_r(errno, msg, sizeof(msg)));
1249 			if (errno)
1250 				return -errno;
1251 			else
1252 				return -EINVAL;
1253 		}
1254 	}
1255 
1256 	if (evlist__initialize_ctlfd(evlist, opts->ctl_fd, opts->ctl_fd_ack))
1257 		return -1;
1258 
1259 	ret = record__alloc_thread_data(rec, evlist);
1260 	if (ret)
1261 		return ret;
1262 
1263 	if (record__threads_enabled(rec)) {
1264 		ret = perf_data__create_dir(&rec->data, evlist->core.nr_mmaps);
1265 		if (ret) {
1266 			pr_err("Failed to create data directory: %s\n", strerror(-ret));
1267 			return ret;
1268 		}
1269 		for (i = 0; i < evlist->core.nr_mmaps; i++) {
1270 			if (evlist->mmap)
1271 				evlist->mmap[i].file = &rec->data.dir.files[i];
1272 			if (evlist->overwrite_mmap)
1273 				evlist->overwrite_mmap[i].file = &rec->data.dir.files[i];
1274 		}
1275 	}
1276 
1277 	return 0;
1278 }
1279 
1280 static int record__mmap(struct record *rec)
1281 {
1282 	return record__mmap_evlist(rec, rec->evlist);
1283 }
1284 
1285 static int record__open(struct record *rec)
1286 {
1287 	char msg[BUFSIZ];
1288 	struct evsel *pos;
1289 	struct evlist *evlist = rec->evlist;
1290 	struct perf_session *session = rec->session;
1291 	struct record_opts *opts = &rec->opts;
1292 	int rc = 0;
1293 
1294 	/*
1295 	 * For initial_delay, system wide or a hybrid system, we need to add a
1296 	 * dummy event so that we can track PERF_RECORD_MMAP to cover the delay
1297 	 * of waiting or event synthesis.
1298 	 */
1299 	if (opts->initial_delay || target__has_cpu(&opts->target) ||
1300 	    perf_pmu__has_hybrid()) {
1301 		pos = evlist__get_tracking_event(evlist);
1302 		if (!evsel__is_dummy_event(pos)) {
1303 			/* Set up dummy event. */
1304 			if (evlist__add_dummy(evlist))
1305 				return -ENOMEM;
1306 			pos = evlist__last(evlist);
1307 			evlist__set_tracking_event(evlist, pos);
1308 		}
1309 
1310 		/*
1311 		 * Enable the dummy event when the process is forked for
1312 		 * initial_delay, immediately for system wide.
1313 		 */
1314 		if (opts->initial_delay && !pos->immediate &&
1315 		    !target__has_cpu(&opts->target))
1316 			pos->core.attr.enable_on_exec = 1;
1317 		else
1318 			pos->immediate = 1;
1319 	}
1320 
1321 	evlist__config(evlist, opts, &callchain_param);
1322 
1323 	evlist__for_each_entry(evlist, pos) {
1324 try_again:
1325 		if (evsel__open(pos, pos->core.cpus, pos->core.threads) < 0) {
1326 			if (evsel__fallback(pos, errno, msg, sizeof(msg))) {
1327 				if (verbose > 0)
1328 					ui__warning("%s\n", msg);
1329 				goto try_again;
1330 			}
1331 			if ((errno == EINVAL || errno == EBADF) &&
1332 			    pos->core.leader != &pos->core &&
1333 			    pos->weak_group) {
1334 			        pos = evlist__reset_weak_group(evlist, pos, true);
1335 				goto try_again;
1336 			}
1337 			rc = -errno;
1338 			evsel__open_strerror(pos, &opts->target, errno, msg, sizeof(msg));
1339 			ui__error("%s\n", msg);
1340 			goto out;
1341 		}
1342 
1343 		pos->supported = true;
1344 	}
1345 
1346 	if (symbol_conf.kptr_restrict && !evlist__exclude_kernel(evlist)) {
1347 		pr_warning(
1348 "WARNING: Kernel address maps (/proc/{kallsyms,modules}) are restricted,\n"
1349 "check /proc/sys/kernel/kptr_restrict and /proc/sys/kernel/perf_event_paranoid.\n\n"
1350 "Samples in kernel functions may not be resolved if a suitable vmlinux\n"
1351 "file is not found in the buildid cache or in the vmlinux path.\n\n"
1352 "Samples in kernel modules won't be resolved at all.\n\n"
1353 "If some relocation was applied (e.g. kexec) symbols may be misresolved\n"
1354 "even with a suitable vmlinux or kallsyms file.\n\n");
1355 	}
1356 
1357 	if (evlist__apply_filters(evlist, &pos)) {
1358 		pr_err("failed to set filter \"%s\" on event %s with %d (%s)\n",
1359 			pos->filter, evsel__name(pos), errno,
1360 			str_error_r(errno, msg, sizeof(msg)));
1361 		rc = -1;
1362 		goto out;
1363 	}
1364 
1365 	rc = record__mmap(rec);
1366 	if (rc)
1367 		goto out;
1368 
1369 	session->evlist = evlist;
1370 	perf_session__set_id_hdr_size(session);
1371 out:
1372 	return rc;
1373 }
1374 
1375 static void set_timestamp_boundary(struct record *rec, u64 sample_time)
1376 {
1377 	if (rec->evlist->first_sample_time == 0)
1378 		rec->evlist->first_sample_time = sample_time;
1379 
1380 	if (sample_time)
1381 		rec->evlist->last_sample_time = sample_time;
1382 }
1383 
1384 static int process_sample_event(struct perf_tool *tool,
1385 				union perf_event *event,
1386 				struct perf_sample *sample,
1387 				struct evsel *evsel,
1388 				struct machine *machine)
1389 {
1390 	struct record *rec = container_of(tool, struct record, tool);
1391 
1392 	set_timestamp_boundary(rec, sample->time);
1393 
1394 	if (rec->buildid_all)
1395 		return 0;
1396 
1397 	rec->samples++;
1398 	return build_id__mark_dso_hit(tool, event, sample, evsel, machine);
1399 }
1400 
1401 static int process_buildids(struct record *rec)
1402 {
1403 	struct perf_session *session = rec->session;
1404 
1405 	if (perf_data__size(&rec->data) == 0)
1406 		return 0;
1407 
1408 	/*
1409 	 * During this process, it'll load kernel map and replace the
1410 	 * dso->long_name to a real pathname it found.  In this case
1411 	 * we prefer the vmlinux path like
1412 	 *   /lib/modules/3.16.4/build/vmlinux
1413 	 *
1414 	 * rather than build-id path (in debug directory).
1415 	 *   $HOME/.debug/.build-id/f0/6e17aa50adf4d00b88925e03775de107611551
1416 	 */
1417 	symbol_conf.ignore_vmlinux_buildid = true;
1418 
1419 	/*
1420 	 * If --buildid-all is given, it marks all DSO regardless of hits,
1421 	 * so no need to process samples. But if timestamp_boundary is enabled,
1422 	 * it still needs to walk on all samples to get the timestamps of
1423 	 * first/last samples.
1424 	 */
1425 	if (rec->buildid_all && !rec->timestamp_boundary)
1426 		rec->tool.sample = NULL;
1427 
1428 	return perf_session__process_events(session);
1429 }
1430 
1431 static void perf_event__synthesize_guest_os(struct machine *machine, void *data)
1432 {
1433 	int err;
1434 	struct perf_tool *tool = data;
1435 	/*
1436 	 *As for guest kernel when processing subcommand record&report,
1437 	 *we arrange module mmap prior to guest kernel mmap and trigger
1438 	 *a preload dso because default guest module symbols are loaded
1439 	 *from guest kallsyms instead of /lib/modules/XXX/XXX. This
1440 	 *method is used to avoid symbol missing when the first addr is
1441 	 *in module instead of in guest kernel.
1442 	 */
1443 	err = perf_event__synthesize_modules(tool, process_synthesized_event,
1444 					     machine);
1445 	if (err < 0)
1446 		pr_err("Couldn't record guest kernel [%d]'s reference"
1447 		       " relocation symbol.\n", machine->pid);
1448 
1449 	/*
1450 	 * We use _stext for guest kernel because guest kernel's /proc/kallsyms
1451 	 * have no _text sometimes.
1452 	 */
1453 	err = perf_event__synthesize_kernel_mmap(tool, process_synthesized_event,
1454 						 machine);
1455 	if (err < 0)
1456 		pr_err("Couldn't record guest kernel [%d]'s reference"
1457 		       " relocation symbol.\n", machine->pid);
1458 }
1459 
1460 static struct perf_event_header finished_round_event = {
1461 	.size = sizeof(struct perf_event_header),
1462 	.type = PERF_RECORD_FINISHED_ROUND,
1463 };
1464 
1465 static struct perf_event_header finished_init_event = {
1466 	.size = sizeof(struct perf_event_header),
1467 	.type = PERF_RECORD_FINISHED_INIT,
1468 };
1469 
1470 static void record__adjust_affinity(struct record *rec, struct mmap *map)
1471 {
1472 	if (rec->opts.affinity != PERF_AFFINITY_SYS &&
1473 	    !bitmap_equal(thread->mask->affinity.bits, map->affinity_mask.bits,
1474 			  thread->mask->affinity.nbits)) {
1475 		bitmap_zero(thread->mask->affinity.bits, thread->mask->affinity.nbits);
1476 		bitmap_or(thread->mask->affinity.bits, thread->mask->affinity.bits,
1477 			  map->affinity_mask.bits, thread->mask->affinity.nbits);
1478 		sched_setaffinity(0, MMAP_CPU_MASK_BYTES(&thread->mask->affinity),
1479 					(cpu_set_t *)thread->mask->affinity.bits);
1480 		if (verbose == 2) {
1481 			pr_debug("threads[%d]: running on cpu%d: ", thread->tid, sched_getcpu());
1482 			mmap_cpu_mask__scnprintf(&thread->mask->affinity, "affinity");
1483 		}
1484 	}
1485 }
1486 
1487 static size_t process_comp_header(void *record, size_t increment)
1488 {
1489 	struct perf_record_compressed *event = record;
1490 	size_t size = sizeof(*event);
1491 
1492 	if (increment) {
1493 		event->header.size += increment;
1494 		return increment;
1495 	}
1496 
1497 	event->header.type = PERF_RECORD_COMPRESSED;
1498 	event->header.size = size;
1499 
1500 	return size;
1501 }
1502 
1503 static size_t zstd_compress(struct perf_session *session, struct mmap *map,
1504 			    void *dst, size_t dst_size, void *src, size_t src_size)
1505 {
1506 	size_t compressed;
1507 	size_t max_record_size = PERF_SAMPLE_MAX_SIZE - sizeof(struct perf_record_compressed) - 1;
1508 	struct zstd_data *zstd_data = &session->zstd_data;
1509 
1510 	if (map && map->file)
1511 		zstd_data = &map->zstd_data;
1512 
1513 	compressed = zstd_compress_stream_to_records(zstd_data, dst, dst_size, src, src_size,
1514 						     max_record_size, process_comp_header);
1515 
1516 	if (map && map->file) {
1517 		thread->bytes_transferred += src_size;
1518 		thread->bytes_compressed  += compressed;
1519 	} else {
1520 		session->bytes_transferred += src_size;
1521 		session->bytes_compressed  += compressed;
1522 	}
1523 
1524 	return compressed;
1525 }
1526 
1527 static int record__mmap_read_evlist(struct record *rec, struct evlist *evlist,
1528 				    bool overwrite, bool synch)
1529 {
1530 	u64 bytes_written = rec->bytes_written;
1531 	int i;
1532 	int rc = 0;
1533 	int nr_mmaps;
1534 	struct mmap **maps;
1535 	int trace_fd = rec->data.file.fd;
1536 	off_t off = 0;
1537 
1538 	if (!evlist)
1539 		return 0;
1540 
1541 	nr_mmaps = thread->nr_mmaps;
1542 	maps = overwrite ? thread->overwrite_maps : thread->maps;
1543 
1544 	if (!maps)
1545 		return 0;
1546 
1547 	if (overwrite && evlist->bkw_mmap_state != BKW_MMAP_DATA_PENDING)
1548 		return 0;
1549 
1550 	if (record__aio_enabled(rec))
1551 		off = record__aio_get_pos(trace_fd);
1552 
1553 	for (i = 0; i < nr_mmaps; i++) {
1554 		u64 flush = 0;
1555 		struct mmap *map = maps[i];
1556 
1557 		if (map->core.base) {
1558 			record__adjust_affinity(rec, map);
1559 			if (synch) {
1560 				flush = map->core.flush;
1561 				map->core.flush = 1;
1562 			}
1563 			if (!record__aio_enabled(rec)) {
1564 				if (perf_mmap__push(map, rec, record__pushfn) < 0) {
1565 					if (synch)
1566 						map->core.flush = flush;
1567 					rc = -1;
1568 					goto out;
1569 				}
1570 			} else {
1571 				if (record__aio_push(rec, map, &off) < 0) {
1572 					record__aio_set_pos(trace_fd, off);
1573 					if (synch)
1574 						map->core.flush = flush;
1575 					rc = -1;
1576 					goto out;
1577 				}
1578 			}
1579 			if (synch)
1580 				map->core.flush = flush;
1581 		}
1582 
1583 		if (map->auxtrace_mmap.base && !rec->opts.auxtrace_snapshot_mode &&
1584 		    !rec->opts.auxtrace_sample_mode &&
1585 		    record__auxtrace_mmap_read(rec, map) != 0) {
1586 			rc = -1;
1587 			goto out;
1588 		}
1589 	}
1590 
1591 	if (record__aio_enabled(rec))
1592 		record__aio_set_pos(trace_fd, off);
1593 
1594 	/*
1595 	 * Mark the round finished in case we wrote
1596 	 * at least one event.
1597 	 *
1598 	 * No need for round events in directory mode,
1599 	 * because per-cpu maps and files have data
1600 	 * sorted by kernel.
1601 	 */
1602 	if (!record__threads_enabled(rec) && bytes_written != rec->bytes_written)
1603 		rc = record__write(rec, NULL, &finished_round_event, sizeof(finished_round_event));
1604 
1605 	if (overwrite)
1606 		evlist__toggle_bkw_mmap(evlist, BKW_MMAP_EMPTY);
1607 out:
1608 	return rc;
1609 }
1610 
1611 static int record__mmap_read_all(struct record *rec, bool synch)
1612 {
1613 	int err;
1614 
1615 	err = record__mmap_read_evlist(rec, rec->evlist, false, synch);
1616 	if (err)
1617 		return err;
1618 
1619 	return record__mmap_read_evlist(rec, rec->evlist, true, synch);
1620 }
1621 
1622 static void record__thread_munmap_filtered(struct fdarray *fda, int fd,
1623 					   void *arg __maybe_unused)
1624 {
1625 	struct perf_mmap *map = fda->priv[fd].ptr;
1626 
1627 	if (map)
1628 		perf_mmap__put(map);
1629 }
1630 
1631 static void *record__thread(void *arg)
1632 {
1633 	enum thread_msg msg = THREAD_MSG__READY;
1634 	bool terminate = false;
1635 	struct fdarray *pollfd;
1636 	int err, ctlfd_pos;
1637 
1638 	thread = arg;
1639 	thread->tid = gettid();
1640 
1641 	err = write(thread->pipes.ack[1], &msg, sizeof(msg));
1642 	if (err == -1)
1643 		pr_warning("threads[%d]: failed to notify on start: %s\n",
1644 			   thread->tid, strerror(errno));
1645 
1646 	pr_debug("threads[%d]: started on cpu%d\n", thread->tid, sched_getcpu());
1647 
1648 	pollfd = &thread->pollfd;
1649 	ctlfd_pos = thread->ctlfd_pos;
1650 
1651 	for (;;) {
1652 		unsigned long long hits = thread->samples;
1653 
1654 		if (record__mmap_read_all(thread->rec, false) < 0 || terminate)
1655 			break;
1656 
1657 		if (hits == thread->samples) {
1658 
1659 			err = fdarray__poll(pollfd, -1);
1660 			/*
1661 			 * Propagate error, only if there's any. Ignore positive
1662 			 * number of returned events and interrupt error.
1663 			 */
1664 			if (err > 0 || (err < 0 && errno == EINTR))
1665 				err = 0;
1666 			thread->waking++;
1667 
1668 			if (fdarray__filter(pollfd, POLLERR | POLLHUP,
1669 					    record__thread_munmap_filtered, NULL) == 0)
1670 				break;
1671 		}
1672 
1673 		if (pollfd->entries[ctlfd_pos].revents & POLLHUP) {
1674 			terminate = true;
1675 			close(thread->pipes.msg[0]);
1676 			thread->pipes.msg[0] = -1;
1677 			pollfd->entries[ctlfd_pos].fd = -1;
1678 			pollfd->entries[ctlfd_pos].events = 0;
1679 		}
1680 
1681 		pollfd->entries[ctlfd_pos].revents = 0;
1682 	}
1683 	record__mmap_read_all(thread->rec, true);
1684 
1685 	err = write(thread->pipes.ack[1], &msg, sizeof(msg));
1686 	if (err == -1)
1687 		pr_warning("threads[%d]: failed to notify on termination: %s\n",
1688 			   thread->tid, strerror(errno));
1689 
1690 	return NULL;
1691 }
1692 
1693 static void record__init_features(struct record *rec)
1694 {
1695 	struct perf_session *session = rec->session;
1696 	int feat;
1697 
1698 	for (feat = HEADER_FIRST_FEATURE; feat < HEADER_LAST_FEATURE; feat++)
1699 		perf_header__set_feat(&session->header, feat);
1700 
1701 	if (rec->no_buildid)
1702 		perf_header__clear_feat(&session->header, HEADER_BUILD_ID);
1703 
1704 	if (!have_tracepoints(&rec->evlist->core.entries))
1705 		perf_header__clear_feat(&session->header, HEADER_TRACING_DATA);
1706 
1707 	if (!rec->opts.branch_stack)
1708 		perf_header__clear_feat(&session->header, HEADER_BRANCH_STACK);
1709 
1710 	if (!rec->opts.full_auxtrace)
1711 		perf_header__clear_feat(&session->header, HEADER_AUXTRACE);
1712 
1713 	if (!(rec->opts.use_clockid && rec->opts.clockid_res_ns))
1714 		perf_header__clear_feat(&session->header, HEADER_CLOCKID);
1715 
1716 	if (!rec->opts.use_clockid)
1717 		perf_header__clear_feat(&session->header, HEADER_CLOCK_DATA);
1718 
1719 	if (!record__threads_enabled(rec))
1720 		perf_header__clear_feat(&session->header, HEADER_DIR_FORMAT);
1721 
1722 	if (!record__comp_enabled(rec))
1723 		perf_header__clear_feat(&session->header, HEADER_COMPRESSED);
1724 
1725 	perf_header__clear_feat(&session->header, HEADER_STAT);
1726 }
1727 
1728 static void
1729 record__finish_output(struct record *rec)
1730 {
1731 	int i;
1732 	struct perf_data *data = &rec->data;
1733 	int fd = perf_data__fd(data);
1734 
1735 	if (data->is_pipe)
1736 		return;
1737 
1738 	rec->session->header.data_size += rec->bytes_written;
1739 	data->file.size = lseek(perf_data__fd(data), 0, SEEK_CUR);
1740 	if (record__threads_enabled(rec)) {
1741 		for (i = 0; i < data->dir.nr; i++)
1742 			data->dir.files[i].size = lseek(data->dir.files[i].fd, 0, SEEK_CUR);
1743 	}
1744 
1745 	if (!rec->no_buildid) {
1746 		process_buildids(rec);
1747 
1748 		if (rec->buildid_all)
1749 			dsos__hit_all(rec->session);
1750 	}
1751 	perf_session__write_header(rec->session, rec->evlist, fd, true);
1752 
1753 	return;
1754 }
1755 
1756 static int record__synthesize_workload(struct record *rec, bool tail)
1757 {
1758 	int err;
1759 	struct perf_thread_map *thread_map;
1760 	bool needs_mmap = rec->opts.synth & PERF_SYNTH_MMAP;
1761 
1762 	if (rec->opts.tail_synthesize != tail)
1763 		return 0;
1764 
1765 	thread_map = thread_map__new_by_tid(rec->evlist->workload.pid);
1766 	if (thread_map == NULL)
1767 		return -1;
1768 
1769 	err = perf_event__synthesize_thread_map(&rec->tool, thread_map,
1770 						 process_synthesized_event,
1771 						 &rec->session->machines.host,
1772 						 needs_mmap,
1773 						 rec->opts.sample_address);
1774 	perf_thread_map__put(thread_map);
1775 	return err;
1776 }
1777 
1778 static int write_finished_init(struct record *rec, bool tail)
1779 {
1780 	if (rec->opts.tail_synthesize != tail)
1781 		return 0;
1782 
1783 	return record__write(rec, NULL, &finished_init_event, sizeof(finished_init_event));
1784 }
1785 
1786 static int record__synthesize(struct record *rec, bool tail);
1787 
1788 static int
1789 record__switch_output(struct record *rec, bool at_exit)
1790 {
1791 	struct perf_data *data = &rec->data;
1792 	int fd, err;
1793 	char *new_filename;
1794 
1795 	/* Same Size:      "2015122520103046"*/
1796 	char timestamp[] = "InvalidTimestamp";
1797 
1798 	record__aio_mmap_read_sync(rec);
1799 
1800 	write_finished_init(rec, true);
1801 
1802 	record__synthesize(rec, true);
1803 	if (target__none(&rec->opts.target))
1804 		record__synthesize_workload(rec, true);
1805 
1806 	rec->samples = 0;
1807 	record__finish_output(rec);
1808 	err = fetch_current_timestamp(timestamp, sizeof(timestamp));
1809 	if (err) {
1810 		pr_err("Failed to get current timestamp\n");
1811 		return -EINVAL;
1812 	}
1813 
1814 	fd = perf_data__switch(data, timestamp,
1815 				    rec->session->header.data_offset,
1816 				    at_exit, &new_filename);
1817 	if (fd >= 0 && !at_exit) {
1818 		rec->bytes_written = 0;
1819 		rec->session->header.data_size = 0;
1820 	}
1821 
1822 	if (!quiet)
1823 		fprintf(stderr, "[ perf record: Dump %s.%s ]\n",
1824 			data->path, timestamp);
1825 
1826 	if (rec->switch_output.num_files) {
1827 		int n = rec->switch_output.cur_file + 1;
1828 
1829 		if (n >= rec->switch_output.num_files)
1830 			n = 0;
1831 		rec->switch_output.cur_file = n;
1832 		if (rec->switch_output.filenames[n]) {
1833 			remove(rec->switch_output.filenames[n]);
1834 			zfree(&rec->switch_output.filenames[n]);
1835 		}
1836 		rec->switch_output.filenames[n] = new_filename;
1837 	} else {
1838 		free(new_filename);
1839 	}
1840 
1841 	/* Output tracking events */
1842 	if (!at_exit) {
1843 		record__synthesize(rec, false);
1844 
1845 		/*
1846 		 * In 'perf record --switch-output' without -a,
1847 		 * record__synthesize() in record__switch_output() won't
1848 		 * generate tracking events because there's no thread_map
1849 		 * in evlist. Which causes newly created perf.data doesn't
1850 		 * contain map and comm information.
1851 		 * Create a fake thread_map and directly call
1852 		 * perf_event__synthesize_thread_map() for those events.
1853 		 */
1854 		if (target__none(&rec->opts.target))
1855 			record__synthesize_workload(rec, false);
1856 		write_finished_init(rec, false);
1857 	}
1858 	return fd;
1859 }
1860 
1861 static void __record__read_lost_samples(struct record *rec, struct evsel *evsel,
1862 					struct perf_record_lost_samples *lost,
1863 					int cpu_idx, int thread_idx)
1864 {
1865 	struct perf_counts_values count;
1866 	struct perf_sample_id *sid;
1867 	struct perf_sample sample = {};
1868 	int id_hdr_size;
1869 
1870 	if (perf_evsel__read(&evsel->core, cpu_idx, thread_idx, &count) < 0) {
1871 		pr_err("read LOST count failed\n");
1872 		return;
1873 	}
1874 
1875 	if (count.lost == 0)
1876 		return;
1877 
1878 	lost->lost = count.lost;
1879 	if (evsel->core.ids) {
1880 		sid = xyarray__entry(evsel->core.sample_id, cpu_idx, thread_idx);
1881 		sample.id = sid->id;
1882 	}
1883 
1884 	id_hdr_size = perf_event__synthesize_id_sample((void *)(lost + 1),
1885 						       evsel->core.attr.sample_type, &sample);
1886 	lost->header.size = sizeof(*lost) + id_hdr_size;
1887 	record__write(rec, NULL, lost, lost->header.size);
1888 }
1889 
1890 static void record__read_lost_samples(struct record *rec)
1891 {
1892 	struct perf_session *session = rec->session;
1893 	struct perf_record_lost_samples *lost;
1894 	struct evsel *evsel;
1895 
1896 	/* there was an error during record__open */
1897 	if (session->evlist == NULL)
1898 		return;
1899 
1900 	lost = zalloc(PERF_SAMPLE_MAX_SIZE);
1901 	if (lost == NULL) {
1902 		pr_debug("Memory allocation failed\n");
1903 		return;
1904 	}
1905 
1906 	lost->header.type = PERF_RECORD_LOST_SAMPLES;
1907 
1908 	evlist__for_each_entry(session->evlist, evsel) {
1909 		struct xyarray *xy = evsel->core.sample_id;
1910 
1911 		if (xy == NULL || evsel->core.fd == NULL)
1912 			continue;
1913 		if (xyarray__max_x(evsel->core.fd) != xyarray__max_x(xy) ||
1914 		    xyarray__max_y(evsel->core.fd) != xyarray__max_y(xy)) {
1915 			pr_debug("Unmatched FD vs. sample ID: skip reading LOST count\n");
1916 			continue;
1917 		}
1918 
1919 		for (int x = 0; x < xyarray__max_x(xy); x++) {
1920 			for (int y = 0; y < xyarray__max_y(xy); y++) {
1921 				__record__read_lost_samples(rec, evsel, lost, x, y);
1922 			}
1923 		}
1924 	}
1925 	free(lost);
1926 
1927 }
1928 
1929 static volatile int workload_exec_errno;
1930 
1931 /*
1932  * evlist__prepare_workload will send a SIGUSR1
1933  * if the fork fails, since we asked by setting its
1934  * want_signal to true.
1935  */
1936 static void workload_exec_failed_signal(int signo __maybe_unused,
1937 					siginfo_t *info,
1938 					void *ucontext __maybe_unused)
1939 {
1940 	workload_exec_errno = info->si_value.sival_int;
1941 	done = 1;
1942 	child_finished = 1;
1943 }
1944 
1945 static void snapshot_sig_handler(int sig);
1946 static void alarm_sig_handler(int sig);
1947 
1948 static const struct perf_event_mmap_page *evlist__pick_pc(struct evlist *evlist)
1949 {
1950 	if (evlist) {
1951 		if (evlist->mmap && evlist->mmap[0].core.base)
1952 			return evlist->mmap[0].core.base;
1953 		if (evlist->overwrite_mmap && evlist->overwrite_mmap[0].core.base)
1954 			return evlist->overwrite_mmap[0].core.base;
1955 	}
1956 	return NULL;
1957 }
1958 
1959 static const struct perf_event_mmap_page *record__pick_pc(struct record *rec)
1960 {
1961 	const struct perf_event_mmap_page *pc = evlist__pick_pc(rec->evlist);
1962 	if (pc)
1963 		return pc;
1964 	return NULL;
1965 }
1966 
1967 static int record__synthesize(struct record *rec, bool tail)
1968 {
1969 	struct perf_session *session = rec->session;
1970 	struct machine *machine = &session->machines.host;
1971 	struct perf_data *data = &rec->data;
1972 	struct record_opts *opts = &rec->opts;
1973 	struct perf_tool *tool = &rec->tool;
1974 	int err = 0;
1975 	event_op f = process_synthesized_event;
1976 
1977 	if (rec->opts.tail_synthesize != tail)
1978 		return 0;
1979 
1980 	if (data->is_pipe) {
1981 		err = perf_event__synthesize_for_pipe(tool, session, data,
1982 						      process_synthesized_event);
1983 		if (err < 0)
1984 			goto out;
1985 
1986 		rec->bytes_written += err;
1987 	}
1988 
1989 	err = perf_event__synth_time_conv(record__pick_pc(rec), tool,
1990 					  process_synthesized_event, machine);
1991 	if (err)
1992 		goto out;
1993 
1994 	/* Synthesize id_index before auxtrace_info */
1995 	err = perf_event__synthesize_id_index(tool,
1996 					      process_synthesized_event,
1997 					      session->evlist, machine);
1998 	if (err)
1999 		goto out;
2000 
2001 	if (rec->opts.full_auxtrace) {
2002 		err = perf_event__synthesize_auxtrace_info(rec->itr, tool,
2003 					session, process_synthesized_event);
2004 		if (err)
2005 			goto out;
2006 	}
2007 
2008 	if (!evlist__exclude_kernel(rec->evlist)) {
2009 		err = perf_event__synthesize_kernel_mmap(tool, process_synthesized_event,
2010 							 machine);
2011 		WARN_ONCE(err < 0, "Couldn't record kernel reference relocation symbol\n"
2012 				   "Symbol resolution may be skewed if relocation was used (e.g. kexec).\n"
2013 				   "Check /proc/kallsyms permission or run as root.\n");
2014 
2015 		err = perf_event__synthesize_modules(tool, process_synthesized_event,
2016 						     machine);
2017 		WARN_ONCE(err < 0, "Couldn't record kernel module information.\n"
2018 				   "Symbol resolution may be skewed if relocation was used (e.g. kexec).\n"
2019 				   "Check /proc/modules permission or run as root.\n");
2020 	}
2021 
2022 	if (perf_guest) {
2023 		machines__process_guests(&session->machines,
2024 					 perf_event__synthesize_guest_os, tool);
2025 	}
2026 
2027 	err = perf_event__synthesize_extra_attr(&rec->tool,
2028 						rec->evlist,
2029 						process_synthesized_event,
2030 						data->is_pipe);
2031 	if (err)
2032 		goto out;
2033 
2034 	err = perf_event__synthesize_thread_map2(&rec->tool, rec->evlist->core.threads,
2035 						 process_synthesized_event,
2036 						NULL);
2037 	if (err < 0) {
2038 		pr_err("Couldn't synthesize thread map.\n");
2039 		return err;
2040 	}
2041 
2042 	err = perf_event__synthesize_cpu_map(&rec->tool, rec->evlist->core.all_cpus,
2043 					     process_synthesized_event, NULL);
2044 	if (err < 0) {
2045 		pr_err("Couldn't synthesize cpu map.\n");
2046 		return err;
2047 	}
2048 
2049 	err = perf_event__synthesize_bpf_events(session, process_synthesized_event,
2050 						machine, opts);
2051 	if (err < 0) {
2052 		pr_warning("Couldn't synthesize bpf events.\n");
2053 		err = 0;
2054 	}
2055 
2056 	if (rec->opts.synth & PERF_SYNTH_CGROUP) {
2057 		err = perf_event__synthesize_cgroups(tool, process_synthesized_event,
2058 						     machine);
2059 		if (err < 0) {
2060 			pr_warning("Couldn't synthesize cgroup events.\n");
2061 			err = 0;
2062 		}
2063 	}
2064 
2065 	if (rec->opts.nr_threads_synthesize > 1) {
2066 		mutex_init(&synth_lock);
2067 		perf_set_multithreaded();
2068 		f = process_locked_synthesized_event;
2069 	}
2070 
2071 	if (rec->opts.synth & PERF_SYNTH_TASK) {
2072 		bool needs_mmap = rec->opts.synth & PERF_SYNTH_MMAP;
2073 
2074 		err = __machine__synthesize_threads(machine, tool, &opts->target,
2075 						    rec->evlist->core.threads,
2076 						    f, needs_mmap, opts->sample_address,
2077 						    rec->opts.nr_threads_synthesize);
2078 	}
2079 
2080 	if (rec->opts.nr_threads_synthesize > 1) {
2081 		perf_set_singlethreaded();
2082 		mutex_destroy(&synth_lock);
2083 	}
2084 
2085 out:
2086 	return err;
2087 }
2088 
2089 static int record__process_signal_event(union perf_event *event __maybe_unused, void *data)
2090 {
2091 	struct record *rec = data;
2092 	pthread_kill(rec->thread_id, SIGUSR2);
2093 	return 0;
2094 }
2095 
2096 static int record__setup_sb_evlist(struct record *rec)
2097 {
2098 	struct record_opts *opts = &rec->opts;
2099 
2100 	if (rec->sb_evlist != NULL) {
2101 		/*
2102 		 * We get here if --switch-output-event populated the
2103 		 * sb_evlist, so associate a callback that will send a SIGUSR2
2104 		 * to the main thread.
2105 		 */
2106 		evlist__set_cb(rec->sb_evlist, record__process_signal_event, rec);
2107 		rec->thread_id = pthread_self();
2108 	}
2109 #ifdef HAVE_LIBBPF_SUPPORT
2110 	if (!opts->no_bpf_event) {
2111 		if (rec->sb_evlist == NULL) {
2112 			rec->sb_evlist = evlist__new();
2113 
2114 			if (rec->sb_evlist == NULL) {
2115 				pr_err("Couldn't create side band evlist.\n.");
2116 				return -1;
2117 			}
2118 		}
2119 
2120 		if (evlist__add_bpf_sb_event(rec->sb_evlist, &rec->session->header.env)) {
2121 			pr_err("Couldn't ask for PERF_RECORD_BPF_EVENT side band events.\n.");
2122 			return -1;
2123 		}
2124 	}
2125 #endif
2126 	if (evlist__start_sb_thread(rec->sb_evlist, &rec->opts.target)) {
2127 		pr_debug("Couldn't start the BPF side band thread:\nBPF programs starting from now on won't be annotatable\n");
2128 		opts->no_bpf_event = true;
2129 	}
2130 
2131 	return 0;
2132 }
2133 
2134 static int record__init_clock(struct record *rec)
2135 {
2136 	struct perf_session *session = rec->session;
2137 	struct timespec ref_clockid;
2138 	struct timeval ref_tod;
2139 	u64 ref;
2140 
2141 	if (!rec->opts.use_clockid)
2142 		return 0;
2143 
2144 	if (rec->opts.use_clockid && rec->opts.clockid_res_ns)
2145 		session->header.env.clock.clockid_res_ns = rec->opts.clockid_res_ns;
2146 
2147 	session->header.env.clock.clockid = rec->opts.clockid;
2148 
2149 	if (gettimeofday(&ref_tod, NULL) != 0) {
2150 		pr_err("gettimeofday failed, cannot set reference time.\n");
2151 		return -1;
2152 	}
2153 
2154 	if (clock_gettime(rec->opts.clockid, &ref_clockid)) {
2155 		pr_err("clock_gettime failed, cannot set reference time.\n");
2156 		return -1;
2157 	}
2158 
2159 	ref = (u64) ref_tod.tv_sec * NSEC_PER_SEC +
2160 	      (u64) ref_tod.tv_usec * NSEC_PER_USEC;
2161 
2162 	session->header.env.clock.tod_ns = ref;
2163 
2164 	ref = (u64) ref_clockid.tv_sec * NSEC_PER_SEC +
2165 	      (u64) ref_clockid.tv_nsec;
2166 
2167 	session->header.env.clock.clockid_ns = ref;
2168 	return 0;
2169 }
2170 
2171 static void hit_auxtrace_snapshot_trigger(struct record *rec)
2172 {
2173 	if (trigger_is_ready(&auxtrace_snapshot_trigger)) {
2174 		trigger_hit(&auxtrace_snapshot_trigger);
2175 		auxtrace_record__snapshot_started = 1;
2176 		if (auxtrace_record__snapshot_start(rec->itr))
2177 			trigger_error(&auxtrace_snapshot_trigger);
2178 	}
2179 }
2180 
2181 static void record__uniquify_name(struct record *rec)
2182 {
2183 	struct evsel *pos;
2184 	struct evlist *evlist = rec->evlist;
2185 	char *new_name;
2186 	int ret;
2187 
2188 	if (!perf_pmu__has_hybrid())
2189 		return;
2190 
2191 	evlist__for_each_entry(evlist, pos) {
2192 		if (!evsel__is_hybrid(pos))
2193 			continue;
2194 
2195 		if (strchr(pos->name, '/'))
2196 			continue;
2197 
2198 		ret = asprintf(&new_name, "%s/%s/",
2199 			       pos->pmu_name, pos->name);
2200 		if (ret) {
2201 			free(pos->name);
2202 			pos->name = new_name;
2203 		}
2204 	}
2205 }
2206 
2207 static int record__terminate_thread(struct record_thread *thread_data)
2208 {
2209 	int err;
2210 	enum thread_msg ack = THREAD_MSG__UNDEFINED;
2211 	pid_t tid = thread_data->tid;
2212 
2213 	close(thread_data->pipes.msg[1]);
2214 	thread_data->pipes.msg[1] = -1;
2215 	err = read(thread_data->pipes.ack[0], &ack, sizeof(ack));
2216 	if (err > 0)
2217 		pr_debug2("threads[%d]: sent %s\n", tid, thread_msg_tags[ack]);
2218 	else
2219 		pr_warning("threads[%d]: failed to receive termination notification from %d\n",
2220 			   thread->tid, tid);
2221 
2222 	return 0;
2223 }
2224 
2225 static int record__start_threads(struct record *rec)
2226 {
2227 	int t, tt, err, ret = 0, nr_threads = rec->nr_threads;
2228 	struct record_thread *thread_data = rec->thread_data;
2229 	sigset_t full, mask;
2230 	pthread_t handle;
2231 	pthread_attr_t attrs;
2232 
2233 	thread = &thread_data[0];
2234 
2235 	if (!record__threads_enabled(rec))
2236 		return 0;
2237 
2238 	sigfillset(&full);
2239 	if (sigprocmask(SIG_SETMASK, &full, &mask)) {
2240 		pr_err("Failed to block signals on threads start: %s\n", strerror(errno));
2241 		return -1;
2242 	}
2243 
2244 	pthread_attr_init(&attrs);
2245 	pthread_attr_setdetachstate(&attrs, PTHREAD_CREATE_DETACHED);
2246 
2247 	for (t = 1; t < nr_threads; t++) {
2248 		enum thread_msg msg = THREAD_MSG__UNDEFINED;
2249 
2250 #ifdef HAVE_PTHREAD_ATTR_SETAFFINITY_NP
2251 		pthread_attr_setaffinity_np(&attrs,
2252 					    MMAP_CPU_MASK_BYTES(&(thread_data[t].mask->affinity)),
2253 					    (cpu_set_t *)(thread_data[t].mask->affinity.bits));
2254 #endif
2255 		if (pthread_create(&handle, &attrs, record__thread, &thread_data[t])) {
2256 			for (tt = 1; tt < t; tt++)
2257 				record__terminate_thread(&thread_data[t]);
2258 			pr_err("Failed to start threads: %s\n", strerror(errno));
2259 			ret = -1;
2260 			goto out_err;
2261 		}
2262 
2263 		err = read(thread_data[t].pipes.ack[0], &msg, sizeof(msg));
2264 		if (err > 0)
2265 			pr_debug2("threads[%d]: sent %s\n", rec->thread_data[t].tid,
2266 				  thread_msg_tags[msg]);
2267 		else
2268 			pr_warning("threads[%d]: failed to receive start notification from %d\n",
2269 				   thread->tid, rec->thread_data[t].tid);
2270 	}
2271 
2272 	sched_setaffinity(0, MMAP_CPU_MASK_BYTES(&thread->mask->affinity),
2273 			(cpu_set_t *)thread->mask->affinity.bits);
2274 
2275 	pr_debug("threads[%d]: started on cpu%d\n", thread->tid, sched_getcpu());
2276 
2277 out_err:
2278 	pthread_attr_destroy(&attrs);
2279 
2280 	if (sigprocmask(SIG_SETMASK, &mask, NULL)) {
2281 		pr_err("Failed to unblock signals on threads start: %s\n", strerror(errno));
2282 		ret = -1;
2283 	}
2284 
2285 	return ret;
2286 }
2287 
2288 static int record__stop_threads(struct record *rec)
2289 {
2290 	int t;
2291 	struct record_thread *thread_data = rec->thread_data;
2292 
2293 	for (t = 1; t < rec->nr_threads; t++)
2294 		record__terminate_thread(&thread_data[t]);
2295 
2296 	for (t = 0; t < rec->nr_threads; t++) {
2297 		rec->samples += thread_data[t].samples;
2298 		if (!record__threads_enabled(rec))
2299 			continue;
2300 		rec->session->bytes_transferred += thread_data[t].bytes_transferred;
2301 		rec->session->bytes_compressed += thread_data[t].bytes_compressed;
2302 		pr_debug("threads[%d]: samples=%lld, wakes=%ld, ", thread_data[t].tid,
2303 			 thread_data[t].samples, thread_data[t].waking);
2304 		if (thread_data[t].bytes_transferred && thread_data[t].bytes_compressed)
2305 			pr_debug("transferred=%" PRIu64 ", compressed=%" PRIu64 "\n",
2306 				 thread_data[t].bytes_transferred, thread_data[t].bytes_compressed);
2307 		else
2308 			pr_debug("written=%" PRIu64 "\n", thread_data[t].bytes_written);
2309 	}
2310 
2311 	return 0;
2312 }
2313 
2314 static unsigned long record__waking(struct record *rec)
2315 {
2316 	int t;
2317 	unsigned long waking = 0;
2318 	struct record_thread *thread_data = rec->thread_data;
2319 
2320 	for (t = 0; t < rec->nr_threads; t++)
2321 		waking += thread_data[t].waking;
2322 
2323 	return waking;
2324 }
2325 
2326 static int __cmd_record(struct record *rec, int argc, const char **argv)
2327 {
2328 	int err;
2329 	int status = 0;
2330 	const bool forks = argc > 0;
2331 	struct perf_tool *tool = &rec->tool;
2332 	struct record_opts *opts = &rec->opts;
2333 	struct perf_data *data = &rec->data;
2334 	struct perf_session *session;
2335 	bool disabled = false, draining = false;
2336 	int fd;
2337 	float ratio = 0;
2338 	enum evlist_ctl_cmd cmd = EVLIST_CTL_CMD_UNSUPPORTED;
2339 
2340 	atexit(record__sig_exit);
2341 	signal(SIGCHLD, sig_handler);
2342 	signal(SIGINT, sig_handler);
2343 	signal(SIGTERM, sig_handler);
2344 	signal(SIGSEGV, sigsegv_handler);
2345 
2346 	if (rec->opts.record_namespaces)
2347 		tool->namespace_events = true;
2348 
2349 	if (rec->opts.record_cgroup) {
2350 #ifdef HAVE_FILE_HANDLE
2351 		tool->cgroup_events = true;
2352 #else
2353 		pr_err("cgroup tracking is not supported\n");
2354 		return -1;
2355 #endif
2356 	}
2357 
2358 	if (rec->opts.auxtrace_snapshot_mode || rec->switch_output.enabled) {
2359 		signal(SIGUSR2, snapshot_sig_handler);
2360 		if (rec->opts.auxtrace_snapshot_mode)
2361 			trigger_on(&auxtrace_snapshot_trigger);
2362 		if (rec->switch_output.enabled)
2363 			trigger_on(&switch_output_trigger);
2364 	} else {
2365 		signal(SIGUSR2, SIG_IGN);
2366 	}
2367 
2368 	session = perf_session__new(data, tool);
2369 	if (IS_ERR(session)) {
2370 		pr_err("Perf session creation failed.\n");
2371 		return PTR_ERR(session);
2372 	}
2373 
2374 	if (record__threads_enabled(rec)) {
2375 		if (perf_data__is_pipe(&rec->data)) {
2376 			pr_err("Parallel trace streaming is not available in pipe mode.\n");
2377 			return -1;
2378 		}
2379 		if (rec->opts.full_auxtrace) {
2380 			pr_err("Parallel trace streaming is not available in AUX area tracing mode.\n");
2381 			return -1;
2382 		}
2383 	}
2384 
2385 	fd = perf_data__fd(data);
2386 	rec->session = session;
2387 
2388 	if (zstd_init(&session->zstd_data, rec->opts.comp_level) < 0) {
2389 		pr_err("Compression initialization failed.\n");
2390 		return -1;
2391 	}
2392 #ifdef HAVE_EVENTFD_SUPPORT
2393 	done_fd = eventfd(0, EFD_NONBLOCK);
2394 	if (done_fd < 0) {
2395 		pr_err("Failed to create wakeup eventfd, error: %m\n");
2396 		status = -1;
2397 		goto out_delete_session;
2398 	}
2399 	err = evlist__add_wakeup_eventfd(rec->evlist, done_fd);
2400 	if (err < 0) {
2401 		pr_err("Failed to add wakeup eventfd to poll list\n");
2402 		status = err;
2403 		goto out_delete_session;
2404 	}
2405 #endif // HAVE_EVENTFD_SUPPORT
2406 
2407 	session->header.env.comp_type  = PERF_COMP_ZSTD;
2408 	session->header.env.comp_level = rec->opts.comp_level;
2409 
2410 	if (rec->opts.kcore &&
2411 	    !record__kcore_readable(&session->machines.host)) {
2412 		pr_err("ERROR: kcore is not readable.\n");
2413 		return -1;
2414 	}
2415 
2416 	if (record__init_clock(rec))
2417 		return -1;
2418 
2419 	record__init_features(rec);
2420 
2421 	if (forks) {
2422 		err = evlist__prepare_workload(rec->evlist, &opts->target, argv, data->is_pipe,
2423 					       workload_exec_failed_signal);
2424 		if (err < 0) {
2425 			pr_err("Couldn't run the workload!\n");
2426 			status = err;
2427 			goto out_delete_session;
2428 		}
2429 	}
2430 
2431 	/*
2432 	 * If we have just single event and are sending data
2433 	 * through pipe, we need to force the ids allocation,
2434 	 * because we synthesize event name through the pipe
2435 	 * and need the id for that.
2436 	 */
2437 	if (data->is_pipe && rec->evlist->core.nr_entries == 1)
2438 		rec->opts.sample_id = true;
2439 
2440 	record__uniquify_name(rec);
2441 
2442 	/* Debug message used by test scripts */
2443 	pr_debug3("perf record opening and mmapping events\n");
2444 	if (record__open(rec) != 0) {
2445 		err = -1;
2446 		goto out_free_threads;
2447 	}
2448 	/* Debug message used by test scripts */
2449 	pr_debug3("perf record done opening and mmapping events\n");
2450 	session->header.env.comp_mmap_len = session->evlist->core.mmap_len;
2451 
2452 	if (rec->opts.kcore) {
2453 		err = record__kcore_copy(&session->machines.host, data);
2454 		if (err) {
2455 			pr_err("ERROR: Failed to copy kcore\n");
2456 			goto out_free_threads;
2457 		}
2458 	}
2459 
2460 	err = bpf__apply_obj_config();
2461 	if (err) {
2462 		char errbuf[BUFSIZ];
2463 
2464 		bpf__strerror_apply_obj_config(err, errbuf, sizeof(errbuf));
2465 		pr_err("ERROR: Apply config to BPF failed: %s\n",
2466 			 errbuf);
2467 		goto out_free_threads;
2468 	}
2469 
2470 	/*
2471 	 * Normally perf_session__new would do this, but it doesn't have the
2472 	 * evlist.
2473 	 */
2474 	if (rec->tool.ordered_events && !evlist__sample_id_all(rec->evlist)) {
2475 		pr_warning("WARNING: No sample_id_all support, falling back to unordered processing\n");
2476 		rec->tool.ordered_events = false;
2477 	}
2478 
2479 	if (!rec->evlist->core.nr_groups)
2480 		perf_header__clear_feat(&session->header, HEADER_GROUP_DESC);
2481 
2482 	if (data->is_pipe) {
2483 		err = perf_header__write_pipe(fd);
2484 		if (err < 0)
2485 			goto out_free_threads;
2486 	} else {
2487 		err = perf_session__write_header(session, rec->evlist, fd, false);
2488 		if (err < 0)
2489 			goto out_free_threads;
2490 	}
2491 
2492 	err = -1;
2493 	if (!rec->no_buildid
2494 	    && !perf_header__has_feat(&session->header, HEADER_BUILD_ID)) {
2495 		pr_err("Couldn't generate buildids. "
2496 		       "Use --no-buildid to profile anyway.\n");
2497 		goto out_free_threads;
2498 	}
2499 
2500 	err = record__setup_sb_evlist(rec);
2501 	if (err)
2502 		goto out_free_threads;
2503 
2504 	err = record__synthesize(rec, false);
2505 	if (err < 0)
2506 		goto out_free_threads;
2507 
2508 	if (rec->realtime_prio) {
2509 		struct sched_param param;
2510 
2511 		param.sched_priority = rec->realtime_prio;
2512 		if (sched_setscheduler(0, SCHED_FIFO, &param)) {
2513 			pr_err("Could not set realtime priority.\n");
2514 			err = -1;
2515 			goto out_free_threads;
2516 		}
2517 	}
2518 
2519 	if (record__start_threads(rec))
2520 		goto out_free_threads;
2521 
2522 	/*
2523 	 * When perf is starting the traced process, all the events
2524 	 * (apart from group members) have enable_on_exec=1 set,
2525 	 * so don't spoil it by prematurely enabling them.
2526 	 */
2527 	if (!target__none(&opts->target) && !opts->initial_delay)
2528 		evlist__enable(rec->evlist);
2529 
2530 	/*
2531 	 * Let the child rip
2532 	 */
2533 	if (forks) {
2534 		struct machine *machine = &session->machines.host;
2535 		union perf_event *event;
2536 		pid_t tgid;
2537 
2538 		event = malloc(sizeof(event->comm) + machine->id_hdr_size);
2539 		if (event == NULL) {
2540 			err = -ENOMEM;
2541 			goto out_child;
2542 		}
2543 
2544 		/*
2545 		 * Some H/W events are generated before COMM event
2546 		 * which is emitted during exec(), so perf script
2547 		 * cannot see a correct process name for those events.
2548 		 * Synthesize COMM event to prevent it.
2549 		 */
2550 		tgid = perf_event__synthesize_comm(tool, event,
2551 						   rec->evlist->workload.pid,
2552 						   process_synthesized_event,
2553 						   machine);
2554 		free(event);
2555 
2556 		if (tgid == -1)
2557 			goto out_child;
2558 
2559 		event = malloc(sizeof(event->namespaces) +
2560 			       (NR_NAMESPACES * sizeof(struct perf_ns_link_info)) +
2561 			       machine->id_hdr_size);
2562 		if (event == NULL) {
2563 			err = -ENOMEM;
2564 			goto out_child;
2565 		}
2566 
2567 		/*
2568 		 * Synthesize NAMESPACES event for the command specified.
2569 		 */
2570 		perf_event__synthesize_namespaces(tool, event,
2571 						  rec->evlist->workload.pid,
2572 						  tgid, process_synthesized_event,
2573 						  machine);
2574 		free(event);
2575 
2576 		evlist__start_workload(rec->evlist);
2577 	}
2578 
2579 	if (opts->initial_delay) {
2580 		pr_info(EVLIST_DISABLED_MSG);
2581 		if (opts->initial_delay > 0) {
2582 			usleep(opts->initial_delay * USEC_PER_MSEC);
2583 			evlist__enable(rec->evlist);
2584 			pr_info(EVLIST_ENABLED_MSG);
2585 		}
2586 	}
2587 
2588 	err = event_enable_timer__start(rec->evlist->eet);
2589 	if (err)
2590 		goto out_child;
2591 
2592 	/* Debug message used by test scripts */
2593 	pr_debug3("perf record has started\n");
2594 	fflush(stderr);
2595 
2596 	trigger_ready(&auxtrace_snapshot_trigger);
2597 	trigger_ready(&switch_output_trigger);
2598 	perf_hooks__invoke_record_start();
2599 
2600 	/*
2601 	 * Must write FINISHED_INIT so it will be seen after all other
2602 	 * synthesized user events, but before any regular events.
2603 	 */
2604 	err = write_finished_init(rec, false);
2605 	if (err < 0)
2606 		goto out_child;
2607 
2608 	for (;;) {
2609 		unsigned long long hits = thread->samples;
2610 
2611 		/*
2612 		 * rec->evlist->bkw_mmap_state is possible to be
2613 		 * BKW_MMAP_EMPTY here: when done == true and
2614 		 * hits != rec->samples in previous round.
2615 		 *
2616 		 * evlist__toggle_bkw_mmap ensure we never
2617 		 * convert BKW_MMAP_EMPTY to BKW_MMAP_DATA_PENDING.
2618 		 */
2619 		if (trigger_is_hit(&switch_output_trigger) || done || draining)
2620 			evlist__toggle_bkw_mmap(rec->evlist, BKW_MMAP_DATA_PENDING);
2621 
2622 		if (record__mmap_read_all(rec, false) < 0) {
2623 			trigger_error(&auxtrace_snapshot_trigger);
2624 			trigger_error(&switch_output_trigger);
2625 			err = -1;
2626 			goto out_child;
2627 		}
2628 
2629 		if (auxtrace_record__snapshot_started) {
2630 			auxtrace_record__snapshot_started = 0;
2631 			if (!trigger_is_error(&auxtrace_snapshot_trigger))
2632 				record__read_auxtrace_snapshot(rec, false);
2633 			if (trigger_is_error(&auxtrace_snapshot_trigger)) {
2634 				pr_err("AUX area tracing snapshot failed\n");
2635 				err = -1;
2636 				goto out_child;
2637 			}
2638 		}
2639 
2640 		if (trigger_is_hit(&switch_output_trigger)) {
2641 			/*
2642 			 * If switch_output_trigger is hit, the data in
2643 			 * overwritable ring buffer should have been collected,
2644 			 * so bkw_mmap_state should be set to BKW_MMAP_EMPTY.
2645 			 *
2646 			 * If SIGUSR2 raise after or during record__mmap_read_all(),
2647 			 * record__mmap_read_all() didn't collect data from
2648 			 * overwritable ring buffer. Read again.
2649 			 */
2650 			if (rec->evlist->bkw_mmap_state == BKW_MMAP_RUNNING)
2651 				continue;
2652 			trigger_ready(&switch_output_trigger);
2653 
2654 			/*
2655 			 * Reenable events in overwrite ring buffer after
2656 			 * record__mmap_read_all(): we should have collected
2657 			 * data from it.
2658 			 */
2659 			evlist__toggle_bkw_mmap(rec->evlist, BKW_MMAP_RUNNING);
2660 
2661 			if (!quiet)
2662 				fprintf(stderr, "[ perf record: dump data: Woken up %ld times ]\n",
2663 					record__waking(rec));
2664 			thread->waking = 0;
2665 			fd = record__switch_output(rec, false);
2666 			if (fd < 0) {
2667 				pr_err("Failed to switch to new file\n");
2668 				trigger_error(&switch_output_trigger);
2669 				err = fd;
2670 				goto out_child;
2671 			}
2672 
2673 			/* re-arm the alarm */
2674 			if (rec->switch_output.time)
2675 				alarm(rec->switch_output.time);
2676 		}
2677 
2678 		if (hits == thread->samples) {
2679 			if (done || draining)
2680 				break;
2681 			err = fdarray__poll(&thread->pollfd, -1);
2682 			/*
2683 			 * Propagate error, only if there's any. Ignore positive
2684 			 * number of returned events and interrupt error.
2685 			 */
2686 			if (err > 0 || (err < 0 && errno == EINTR))
2687 				err = 0;
2688 			thread->waking++;
2689 
2690 			if (fdarray__filter(&thread->pollfd, POLLERR | POLLHUP,
2691 					    record__thread_munmap_filtered, NULL) == 0)
2692 				draining = true;
2693 
2694 			err = record__update_evlist_pollfd_from_thread(rec, rec->evlist, thread);
2695 			if (err)
2696 				goto out_child;
2697 		}
2698 
2699 		if (evlist__ctlfd_process(rec->evlist, &cmd) > 0) {
2700 			switch (cmd) {
2701 			case EVLIST_CTL_CMD_SNAPSHOT:
2702 				hit_auxtrace_snapshot_trigger(rec);
2703 				evlist__ctlfd_ack(rec->evlist);
2704 				break;
2705 			case EVLIST_CTL_CMD_STOP:
2706 				done = 1;
2707 				break;
2708 			case EVLIST_CTL_CMD_ACK:
2709 			case EVLIST_CTL_CMD_UNSUPPORTED:
2710 			case EVLIST_CTL_CMD_ENABLE:
2711 			case EVLIST_CTL_CMD_DISABLE:
2712 			case EVLIST_CTL_CMD_EVLIST:
2713 			case EVLIST_CTL_CMD_PING:
2714 			default:
2715 				break;
2716 			}
2717 		}
2718 
2719 		err = event_enable_timer__process(rec->evlist->eet);
2720 		if (err < 0)
2721 			goto out_child;
2722 		if (err) {
2723 			err = 0;
2724 			done = 1;
2725 		}
2726 
2727 		/*
2728 		 * When perf is starting the traced process, at the end events
2729 		 * die with the process and we wait for that. Thus no need to
2730 		 * disable events in this case.
2731 		 */
2732 		if (done && !disabled && !target__none(&opts->target)) {
2733 			trigger_off(&auxtrace_snapshot_trigger);
2734 			evlist__disable(rec->evlist);
2735 			disabled = true;
2736 		}
2737 	}
2738 
2739 	trigger_off(&auxtrace_snapshot_trigger);
2740 	trigger_off(&switch_output_trigger);
2741 
2742 	if (opts->auxtrace_snapshot_on_exit)
2743 		record__auxtrace_snapshot_exit(rec);
2744 
2745 	if (forks && workload_exec_errno) {
2746 		char msg[STRERR_BUFSIZE], strevsels[2048];
2747 		const char *emsg = str_error_r(workload_exec_errno, msg, sizeof(msg));
2748 
2749 		evlist__scnprintf_evsels(rec->evlist, sizeof(strevsels), strevsels);
2750 
2751 		pr_err("Failed to collect '%s' for the '%s' workload: %s\n",
2752 			strevsels, argv[0], emsg);
2753 		err = -1;
2754 		goto out_child;
2755 	}
2756 
2757 	if (!quiet)
2758 		fprintf(stderr, "[ perf record: Woken up %ld times to write data ]\n",
2759 			record__waking(rec));
2760 
2761 	write_finished_init(rec, true);
2762 
2763 	if (target__none(&rec->opts.target))
2764 		record__synthesize_workload(rec, true);
2765 
2766 out_child:
2767 	record__stop_threads(rec);
2768 	record__mmap_read_all(rec, true);
2769 out_free_threads:
2770 	record__free_thread_data(rec);
2771 	evlist__finalize_ctlfd(rec->evlist);
2772 	record__aio_mmap_read_sync(rec);
2773 
2774 	if (rec->session->bytes_transferred && rec->session->bytes_compressed) {
2775 		ratio = (float)rec->session->bytes_transferred/(float)rec->session->bytes_compressed;
2776 		session->header.env.comp_ratio = ratio + 0.5;
2777 	}
2778 
2779 	if (forks) {
2780 		int exit_status;
2781 
2782 		if (!child_finished)
2783 			kill(rec->evlist->workload.pid, SIGTERM);
2784 
2785 		wait(&exit_status);
2786 
2787 		if (err < 0)
2788 			status = err;
2789 		else if (WIFEXITED(exit_status))
2790 			status = WEXITSTATUS(exit_status);
2791 		else if (WIFSIGNALED(exit_status))
2792 			signr = WTERMSIG(exit_status);
2793 	} else
2794 		status = err;
2795 
2796 	if (rec->off_cpu)
2797 		rec->bytes_written += off_cpu_write(rec->session);
2798 
2799 	record__read_lost_samples(rec);
2800 	record__synthesize(rec, true);
2801 	/* this will be recalculated during process_buildids() */
2802 	rec->samples = 0;
2803 
2804 	if (!err) {
2805 		if (!rec->timestamp_filename) {
2806 			record__finish_output(rec);
2807 		} else {
2808 			fd = record__switch_output(rec, true);
2809 			if (fd < 0) {
2810 				status = fd;
2811 				goto out_delete_session;
2812 			}
2813 		}
2814 	}
2815 
2816 	perf_hooks__invoke_record_end();
2817 
2818 	if (!err && !quiet) {
2819 		char samples[128];
2820 		const char *postfix = rec->timestamp_filename ?
2821 					".<timestamp>" : "";
2822 
2823 		if (rec->samples && !rec->opts.full_auxtrace)
2824 			scnprintf(samples, sizeof(samples),
2825 				  " (%" PRIu64 " samples)", rec->samples);
2826 		else
2827 			samples[0] = '\0';
2828 
2829 		fprintf(stderr,	"[ perf record: Captured and wrote %.3f MB %s%s%s",
2830 			perf_data__size(data) / 1024.0 / 1024.0,
2831 			data->path, postfix, samples);
2832 		if (ratio) {
2833 			fprintf(stderr,	", compressed (original %.3f MB, ratio is %.3f)",
2834 					rec->session->bytes_transferred / 1024.0 / 1024.0,
2835 					ratio);
2836 		}
2837 		fprintf(stderr, " ]\n");
2838 	}
2839 
2840 out_delete_session:
2841 #ifdef HAVE_EVENTFD_SUPPORT
2842 	if (done_fd >= 0) {
2843 		fd = done_fd;
2844 		done_fd = -1;
2845 
2846 		close(fd);
2847 	}
2848 #endif
2849 	zstd_fini(&session->zstd_data);
2850 	perf_session__delete(session);
2851 
2852 	if (!opts->no_bpf_event)
2853 		evlist__stop_sb_thread(rec->sb_evlist);
2854 	return status;
2855 }
2856 
2857 static void callchain_debug(struct callchain_param *callchain)
2858 {
2859 	static const char *str[CALLCHAIN_MAX] = { "NONE", "FP", "DWARF", "LBR" };
2860 
2861 	pr_debug("callchain: type %s\n", str[callchain->record_mode]);
2862 
2863 	if (callchain->record_mode == CALLCHAIN_DWARF)
2864 		pr_debug("callchain: stack dump size %d\n",
2865 			 callchain->dump_size);
2866 }
2867 
2868 int record_opts__parse_callchain(struct record_opts *record,
2869 				 struct callchain_param *callchain,
2870 				 const char *arg, bool unset)
2871 {
2872 	int ret;
2873 	callchain->enabled = !unset;
2874 
2875 	/* --no-call-graph */
2876 	if (unset) {
2877 		callchain->record_mode = CALLCHAIN_NONE;
2878 		pr_debug("callchain: disabled\n");
2879 		return 0;
2880 	}
2881 
2882 	ret = parse_callchain_record_opt(arg, callchain);
2883 	if (!ret) {
2884 		/* Enable data address sampling for DWARF unwind. */
2885 		if (callchain->record_mode == CALLCHAIN_DWARF)
2886 			record->sample_address = true;
2887 		callchain_debug(callchain);
2888 	}
2889 
2890 	return ret;
2891 }
2892 
2893 int record_parse_callchain_opt(const struct option *opt,
2894 			       const char *arg,
2895 			       int unset)
2896 {
2897 	return record_opts__parse_callchain(opt->value, &callchain_param, arg, unset);
2898 }
2899 
2900 int record_callchain_opt(const struct option *opt,
2901 			 const char *arg __maybe_unused,
2902 			 int unset __maybe_unused)
2903 {
2904 	struct callchain_param *callchain = opt->value;
2905 
2906 	callchain->enabled = true;
2907 
2908 	if (callchain->record_mode == CALLCHAIN_NONE)
2909 		callchain->record_mode = CALLCHAIN_FP;
2910 
2911 	callchain_debug(callchain);
2912 	return 0;
2913 }
2914 
2915 static int perf_record_config(const char *var, const char *value, void *cb)
2916 {
2917 	struct record *rec = cb;
2918 
2919 	if (!strcmp(var, "record.build-id")) {
2920 		if (!strcmp(value, "cache"))
2921 			rec->no_buildid_cache = false;
2922 		else if (!strcmp(value, "no-cache"))
2923 			rec->no_buildid_cache = true;
2924 		else if (!strcmp(value, "skip"))
2925 			rec->no_buildid = true;
2926 		else if (!strcmp(value, "mmap"))
2927 			rec->buildid_mmap = true;
2928 		else
2929 			return -1;
2930 		return 0;
2931 	}
2932 	if (!strcmp(var, "record.call-graph")) {
2933 		var = "call-graph.record-mode";
2934 		return perf_default_config(var, value, cb);
2935 	}
2936 #ifdef HAVE_AIO_SUPPORT
2937 	if (!strcmp(var, "record.aio")) {
2938 		rec->opts.nr_cblocks = strtol(value, NULL, 0);
2939 		if (!rec->opts.nr_cblocks)
2940 			rec->opts.nr_cblocks = nr_cblocks_default;
2941 	}
2942 #endif
2943 	if (!strcmp(var, "record.debuginfod")) {
2944 		rec->debuginfod.urls = strdup(value);
2945 		if (!rec->debuginfod.urls)
2946 			return -ENOMEM;
2947 		rec->debuginfod.set = true;
2948 	}
2949 
2950 	return 0;
2951 }
2952 
2953 static int record__parse_event_enable_time(const struct option *opt, const char *str, int unset)
2954 {
2955 	struct record *rec = (struct record *)opt->value;
2956 
2957 	return evlist__parse_event_enable_time(rec->evlist, &rec->opts, str, unset);
2958 }
2959 
2960 static int record__parse_affinity(const struct option *opt, const char *str, int unset)
2961 {
2962 	struct record_opts *opts = (struct record_opts *)opt->value;
2963 
2964 	if (unset || !str)
2965 		return 0;
2966 
2967 	if (!strcasecmp(str, "node"))
2968 		opts->affinity = PERF_AFFINITY_NODE;
2969 	else if (!strcasecmp(str, "cpu"))
2970 		opts->affinity = PERF_AFFINITY_CPU;
2971 
2972 	return 0;
2973 }
2974 
2975 static int record__mmap_cpu_mask_alloc(struct mmap_cpu_mask *mask, int nr_bits)
2976 {
2977 	mask->nbits = nr_bits;
2978 	mask->bits = bitmap_zalloc(mask->nbits);
2979 	if (!mask->bits)
2980 		return -ENOMEM;
2981 
2982 	return 0;
2983 }
2984 
2985 static void record__mmap_cpu_mask_free(struct mmap_cpu_mask *mask)
2986 {
2987 	bitmap_free(mask->bits);
2988 	mask->nbits = 0;
2989 }
2990 
2991 static int record__thread_mask_alloc(struct thread_mask *mask, int nr_bits)
2992 {
2993 	int ret;
2994 
2995 	ret = record__mmap_cpu_mask_alloc(&mask->maps, nr_bits);
2996 	if (ret) {
2997 		mask->affinity.bits = NULL;
2998 		return ret;
2999 	}
3000 
3001 	ret = record__mmap_cpu_mask_alloc(&mask->affinity, nr_bits);
3002 	if (ret) {
3003 		record__mmap_cpu_mask_free(&mask->maps);
3004 		mask->maps.bits = NULL;
3005 	}
3006 
3007 	return ret;
3008 }
3009 
3010 static void record__thread_mask_free(struct thread_mask *mask)
3011 {
3012 	record__mmap_cpu_mask_free(&mask->maps);
3013 	record__mmap_cpu_mask_free(&mask->affinity);
3014 }
3015 
3016 static int record__parse_threads(const struct option *opt, const char *str, int unset)
3017 {
3018 	int s;
3019 	struct record_opts *opts = opt->value;
3020 
3021 	if (unset || !str || !strlen(str)) {
3022 		opts->threads_spec = THREAD_SPEC__CPU;
3023 	} else {
3024 		for (s = 1; s < THREAD_SPEC__MAX; s++) {
3025 			if (s == THREAD_SPEC__USER) {
3026 				opts->threads_user_spec = strdup(str);
3027 				if (!opts->threads_user_spec)
3028 					return -ENOMEM;
3029 				opts->threads_spec = THREAD_SPEC__USER;
3030 				break;
3031 			}
3032 			if (!strncasecmp(str, thread_spec_tags[s], strlen(thread_spec_tags[s]))) {
3033 				opts->threads_spec = s;
3034 				break;
3035 			}
3036 		}
3037 	}
3038 
3039 	if (opts->threads_spec == THREAD_SPEC__USER)
3040 		pr_debug("threads_spec: %s\n", opts->threads_user_spec);
3041 	else
3042 		pr_debug("threads_spec: %s\n", thread_spec_tags[opts->threads_spec]);
3043 
3044 	return 0;
3045 }
3046 
3047 static int parse_output_max_size(const struct option *opt,
3048 				 const char *str, int unset)
3049 {
3050 	unsigned long *s = (unsigned long *)opt->value;
3051 	static struct parse_tag tags_size[] = {
3052 		{ .tag  = 'B', .mult = 1       },
3053 		{ .tag  = 'K', .mult = 1 << 10 },
3054 		{ .tag  = 'M', .mult = 1 << 20 },
3055 		{ .tag  = 'G', .mult = 1 << 30 },
3056 		{ .tag  = 0 },
3057 	};
3058 	unsigned long val;
3059 
3060 	if (unset) {
3061 		*s = 0;
3062 		return 0;
3063 	}
3064 
3065 	val = parse_tag_value(str, tags_size);
3066 	if (val != (unsigned long) -1) {
3067 		*s = val;
3068 		return 0;
3069 	}
3070 
3071 	return -1;
3072 }
3073 
3074 static int record__parse_mmap_pages(const struct option *opt,
3075 				    const char *str,
3076 				    int unset __maybe_unused)
3077 {
3078 	struct record_opts *opts = opt->value;
3079 	char *s, *p;
3080 	unsigned int mmap_pages;
3081 	int ret;
3082 
3083 	if (!str)
3084 		return -EINVAL;
3085 
3086 	s = strdup(str);
3087 	if (!s)
3088 		return -ENOMEM;
3089 
3090 	p = strchr(s, ',');
3091 	if (p)
3092 		*p = '\0';
3093 
3094 	if (*s) {
3095 		ret = __evlist__parse_mmap_pages(&mmap_pages, s);
3096 		if (ret)
3097 			goto out_free;
3098 		opts->mmap_pages = mmap_pages;
3099 	}
3100 
3101 	if (!p) {
3102 		ret = 0;
3103 		goto out_free;
3104 	}
3105 
3106 	ret = __evlist__parse_mmap_pages(&mmap_pages, p + 1);
3107 	if (ret)
3108 		goto out_free;
3109 
3110 	opts->auxtrace_mmap_pages = mmap_pages;
3111 
3112 out_free:
3113 	free(s);
3114 	return ret;
3115 }
3116 
3117 void __weak arch__add_leaf_frame_record_opts(struct record_opts *opts __maybe_unused)
3118 {
3119 }
3120 
3121 static int parse_control_option(const struct option *opt,
3122 				const char *str,
3123 				int unset __maybe_unused)
3124 {
3125 	struct record_opts *opts = opt->value;
3126 
3127 	return evlist__parse_control(str, &opts->ctl_fd, &opts->ctl_fd_ack, &opts->ctl_fd_close);
3128 }
3129 
3130 static void switch_output_size_warn(struct record *rec)
3131 {
3132 	u64 wakeup_size = evlist__mmap_size(rec->opts.mmap_pages);
3133 	struct switch_output *s = &rec->switch_output;
3134 
3135 	wakeup_size /= 2;
3136 
3137 	if (s->size < wakeup_size) {
3138 		char buf[100];
3139 
3140 		unit_number__scnprintf(buf, sizeof(buf), wakeup_size);
3141 		pr_warning("WARNING: switch-output data size lower than "
3142 			   "wakeup kernel buffer size (%s) "
3143 			   "expect bigger perf.data sizes\n", buf);
3144 	}
3145 }
3146 
3147 static int switch_output_setup(struct record *rec)
3148 {
3149 	struct switch_output *s = &rec->switch_output;
3150 	static struct parse_tag tags_size[] = {
3151 		{ .tag  = 'B', .mult = 1       },
3152 		{ .tag  = 'K', .mult = 1 << 10 },
3153 		{ .tag  = 'M', .mult = 1 << 20 },
3154 		{ .tag  = 'G', .mult = 1 << 30 },
3155 		{ .tag  = 0 },
3156 	};
3157 	static struct parse_tag tags_time[] = {
3158 		{ .tag  = 's', .mult = 1        },
3159 		{ .tag  = 'm', .mult = 60       },
3160 		{ .tag  = 'h', .mult = 60*60    },
3161 		{ .tag  = 'd', .mult = 60*60*24 },
3162 		{ .tag  = 0 },
3163 	};
3164 	unsigned long val;
3165 
3166 	/*
3167 	 * If we're using --switch-output-events, then we imply its
3168 	 * --switch-output=signal, as we'll send a SIGUSR2 from the side band
3169 	 *  thread to its parent.
3170 	 */
3171 	if (rec->switch_output_event_set) {
3172 		if (record__threads_enabled(rec)) {
3173 			pr_warning("WARNING: --switch-output-event option is not available in parallel streaming mode.\n");
3174 			return 0;
3175 		}
3176 		goto do_signal;
3177 	}
3178 
3179 	if (!s->set)
3180 		return 0;
3181 
3182 	if (record__threads_enabled(rec)) {
3183 		pr_warning("WARNING: --switch-output option is not available in parallel streaming mode.\n");
3184 		return 0;
3185 	}
3186 
3187 	if (!strcmp(s->str, "signal")) {
3188 do_signal:
3189 		s->signal = true;
3190 		pr_debug("switch-output with SIGUSR2 signal\n");
3191 		goto enabled;
3192 	}
3193 
3194 	val = parse_tag_value(s->str, tags_size);
3195 	if (val != (unsigned long) -1) {
3196 		s->size = val;
3197 		pr_debug("switch-output with %s size threshold\n", s->str);
3198 		goto enabled;
3199 	}
3200 
3201 	val = parse_tag_value(s->str, tags_time);
3202 	if (val != (unsigned long) -1) {
3203 		s->time = val;
3204 		pr_debug("switch-output with %s time threshold (%lu seconds)\n",
3205 			 s->str, s->time);
3206 		goto enabled;
3207 	}
3208 
3209 	return -1;
3210 
3211 enabled:
3212 	rec->timestamp_filename = true;
3213 	s->enabled              = true;
3214 
3215 	if (s->size && !rec->opts.no_buffering)
3216 		switch_output_size_warn(rec);
3217 
3218 	return 0;
3219 }
3220 
/*
 * Usage synopsis lines for 'perf record'. The array is NULL-terminated and
 * is handed to parse_options() and the usage helpers in cmd_record().
 */
static const char * const __record_usage[] = {
	"perf record [<options>] [<command>]",
	"perf record [<options>] -- <command> [<options>]",
	NULL
};
/* Non-static handle to the usage array above. */
const char * const *record_usage = __record_usage;
3227 
3228 static int build_id__process_mmap(struct perf_tool *tool, union perf_event *event,
3229 				  struct perf_sample *sample, struct machine *machine)
3230 {
3231 	/*
3232 	 * We already have the kernel maps, put in place via perf_session__create_kernel_maps()
3233 	 * no need to add them twice.
3234 	 */
3235 	if (!(event->header.misc & PERF_RECORD_MISC_USER))
3236 		return 0;
3237 	return perf_event__process_mmap(tool, event, sample, machine);
3238 }
3239 
3240 static int build_id__process_mmap2(struct perf_tool *tool, union perf_event *event,
3241 				   struct perf_sample *sample, struct machine *machine)
3242 {
3243 	/*
3244 	 * We already have the kernel maps, put in place via perf_session__create_kernel_maps()
3245 	 * no need to add them twice.
3246 	 */
3247 	if (!(event->header.misc & PERF_RECORD_MISC_USER))
3248 		return 0;
3249 
3250 	return perf_event__process_mmap2(tool, event, sample, machine);
3251 }
3252 
3253 static int process_timestamp_boundary(struct perf_tool *tool,
3254 				      union perf_event *event __maybe_unused,
3255 				      struct perf_sample *sample,
3256 				      struct machine *machine __maybe_unused)
3257 {
3258 	struct record *rec = container_of(tool, struct record, tool);
3259 
3260 	set_timestamp_boundary(rec, sample->time);
3261 	return 0;
3262 }
3263 
3264 static int parse_record_synth_option(const struct option *opt,
3265 				     const char *str,
3266 				     int unset __maybe_unused)
3267 {
3268 	struct record_opts *opts = opt->value;
3269 	char *p = strdup(str);
3270 
3271 	if (p == NULL)
3272 		return -1;
3273 
3274 	opts->synth = parse_synth_opt(p);
3275 	free(p);
3276 
3277 	if (opts->synth < 0) {
3278 		pr_err("Invalid synth option: %s\n", str);
3279 		return -1;
3280 	}
3281 	return 0;
3282 }
3283 
3284 /*
3285  * XXX Ideally would be local to cmd_record() and passed to a record__new
3286  * because we need to have access to it in record__exit, that is called
3287  * after cmd_record() exits, but since record_options need to be accessible to
3288  * builtin-script, leave it here.
3289  *
 * At least we don't touch it in all the other functions here directly.
3291  *
3292  * Just say no to tons of global variables, sigh.
3293  */
/* The single 'perf record' session object; cmd_record() works on &record. */
static struct record record = {
	/* Defaults for the recording options, in effect before option parsing. */
	.opts = {
		.sample_time	     = true,
		/* NOTE(review): UINT_MAX/ULLONG_MAX look like "not set by the user" markers - confirm. */
		.mmap_pages	     = UINT_MAX,
		.user_freq	     = UINT_MAX,
		.user_interval	     = ULLONG_MAX,
		.freq		     = 4000,
		.target		     = {
			.uses_mmap   = true,
			.default_per_cpu = true,
		},
		.mmap_flush          = MMAP_FLUSH_DEFAULT,
		.nr_threads_synthesize = 1,
		.ctl_fd              = -1,
		.ctl_fd_ack          = -1,
		.synth               = PERF_SYNTH_ALL,
	},
	/* Event handlers of the embedded perf_tool. */
	.tool = {
		.sample		= process_sample_event,
		.fork		= perf_event__process_fork,
		.exit		= perf_event__process_exit,
		.comm		= perf_event__process_comm,
		.namespaces	= perf_event__process_namespaces,
		/* The two mmap handlers only forward user space mappings. */
		.mmap		= build_id__process_mmap,
		.mmap2		= build_id__process_mmap2,
		/* Both of these feed the sample time to set_timestamp_boundary(). */
		.itrace_start	= process_timestamp_boundary,
		.aux		= process_timestamp_boundary,
		.ordered_events	= true,
	},
};
3324 
/*
 * Help text of the --call-graph option: the generic callchain record help
 * followed by a line stating the default mode.
 */
const char record_callchain_help[] = CALLCHAIN_RECORD_HELP
	"\n\t\t\t\tDefault: fp";
3327 
/* Set by --dry-run ("Parse options then exit"); checked in cmd_record(). */
static bool dry_run;
3329 
3330 /*
3331  * XXX Will stay a global variable till we fix builtin-script.c to stop messing
3332  * with it and switch to use the library functions in perf_evlist that came
3333  * from builtin-record.c, i.e. use record_opts,
3334  * evlist__prepare_workload, etc instead of fork+exec'in 'perf record',
3335  * using pipes, etc.
3336  */
3337 static struct option __record_options[] = {
3338 	OPT_CALLBACK('e', "event", &record.evlist, "event",
3339 		     "event selector. use 'perf list' to list available events",
3340 		     parse_events_option),
3341 	OPT_CALLBACK(0, "filter", &record.evlist, "filter",
3342 		     "event filter", parse_filter),
3343 	OPT_CALLBACK_NOOPT(0, "exclude-perf", &record.evlist,
3344 			   NULL, "don't record events from perf itself",
3345 			   exclude_perf),
3346 	OPT_STRING('p', "pid", &record.opts.target.pid, "pid",
3347 		    "record events on existing process id"),
3348 	OPT_STRING('t', "tid", &record.opts.target.tid, "tid",
3349 		    "record events on existing thread id"),
3350 	OPT_INTEGER('r', "realtime", &record.realtime_prio,
3351 		    "collect data with this RT SCHED_FIFO priority"),
3352 	OPT_BOOLEAN(0, "no-buffering", &record.opts.no_buffering,
3353 		    "collect data without buffering"),
3354 	OPT_BOOLEAN('R', "raw-samples", &record.opts.raw_samples,
3355 		    "collect raw sample records from all opened counters"),
3356 	OPT_BOOLEAN('a', "all-cpus", &record.opts.target.system_wide,
3357 			    "system-wide collection from all CPUs"),
3358 	OPT_STRING('C', "cpu", &record.opts.target.cpu_list, "cpu",
3359 		    "list of cpus to monitor"),
3360 	OPT_U64('c', "count", &record.opts.user_interval, "event period to sample"),
3361 	OPT_STRING('o', "output", &record.data.path, "file",
3362 		    "output file name"),
3363 	OPT_BOOLEAN_SET('i', "no-inherit", &record.opts.no_inherit,
3364 			&record.opts.no_inherit_set,
3365 			"child tasks do not inherit counters"),
3366 	OPT_BOOLEAN(0, "tail-synthesize", &record.opts.tail_synthesize,
3367 		    "synthesize non-sample events at the end of output"),
3368 	OPT_BOOLEAN(0, "overwrite", &record.opts.overwrite, "use overwrite mode"),
3369 	OPT_BOOLEAN(0, "no-bpf-event", &record.opts.no_bpf_event, "do not record bpf events"),
3370 	OPT_BOOLEAN(0, "strict-freq", &record.opts.strict_freq,
3371 		    "Fail if the specified frequency can't be used"),
3372 	OPT_CALLBACK('F', "freq", &record.opts, "freq or 'max'",
3373 		     "profile at this frequency",
3374 		      record__parse_freq),
3375 	OPT_CALLBACK('m', "mmap-pages", &record.opts, "pages[,pages]",
3376 		     "number of mmap data pages and AUX area tracing mmap pages",
3377 		     record__parse_mmap_pages),
3378 	OPT_CALLBACK(0, "mmap-flush", &record.opts, "number",
3379 		     "Minimal number of bytes that is extracted from mmap data pages (default: 1)",
3380 		     record__mmap_flush_parse),
3381 	OPT_BOOLEAN(0, "group", &record.opts.group,
3382 		    "put the counters into a counter group"),
3383 	OPT_CALLBACK_NOOPT('g', NULL, &callchain_param,
3384 			   NULL, "enables call-graph recording" ,
3385 			   &record_callchain_opt),
3386 	OPT_CALLBACK(0, "call-graph", &record.opts,
3387 		     "record_mode[,record_size]", record_callchain_help,
3388 		     &record_parse_callchain_opt),
3389 	OPT_INCR('v', "verbose", &verbose,
3390 		    "be more verbose (show counter open errors, etc)"),
3391 	OPT_BOOLEAN('q', "quiet", &quiet, "don't print any message"),
3392 	OPT_BOOLEAN('s', "stat", &record.opts.inherit_stat,
3393 		    "per thread counts"),
3394 	OPT_BOOLEAN('d', "data", &record.opts.sample_address, "Record the sample addresses"),
3395 	OPT_BOOLEAN(0, "phys-data", &record.opts.sample_phys_addr,
3396 		    "Record the sample physical addresses"),
3397 	OPT_BOOLEAN(0, "data-page-size", &record.opts.sample_data_page_size,
3398 		    "Record the sampled data address data page size"),
3399 	OPT_BOOLEAN(0, "code-page-size", &record.opts.sample_code_page_size,
3400 		    "Record the sampled code address (ip) page size"),
3401 	OPT_BOOLEAN(0, "sample-cpu", &record.opts.sample_cpu, "Record the sample cpu"),
3402 	OPT_BOOLEAN(0, "sample-identifier", &record.opts.sample_identifier,
3403 		    "Record the sample identifier"),
3404 	OPT_BOOLEAN_SET('T', "timestamp", &record.opts.sample_time,
3405 			&record.opts.sample_time_set,
3406 			"Record the sample timestamps"),
3407 	OPT_BOOLEAN_SET('P', "period", &record.opts.period, &record.opts.period_set,
3408 			"Record the sample period"),
3409 	OPT_BOOLEAN('n', "no-samples", &record.opts.no_samples,
3410 		    "don't sample"),
3411 	OPT_BOOLEAN_SET('N', "no-buildid-cache", &record.no_buildid_cache,
3412 			&record.no_buildid_cache_set,
3413 			"do not update the buildid cache"),
3414 	OPT_BOOLEAN_SET('B', "no-buildid", &record.no_buildid,
3415 			&record.no_buildid_set,
3416 			"do not collect buildids in perf.data"),
3417 	OPT_CALLBACK('G', "cgroup", &record.evlist, "name",
3418 		     "monitor event in cgroup name only",
3419 		     parse_cgroups),
3420 	OPT_CALLBACK('D', "delay", &record, "ms",
3421 		     "ms to wait before starting measurement after program start (-1: start with events disabled), "
3422 		     "or ranges of time to enable events e.g. '-D 10-20,30-40'",
3423 		     record__parse_event_enable_time),
3424 	OPT_BOOLEAN(0, "kcore", &record.opts.kcore, "copy /proc/kcore"),
3425 	OPT_STRING('u', "uid", &record.opts.target.uid_str, "user",
3426 		   "user to profile"),
3427 
3428 	OPT_CALLBACK_NOOPT('b', "branch-any", &record.opts.branch_stack,
3429 		     "branch any", "sample any taken branches",
3430 		     parse_branch_stack),
3431 
3432 	OPT_CALLBACK('j', "branch-filter", &record.opts.branch_stack,
3433 		     "branch filter mask", "branch stack filter modes",
3434 		     parse_branch_stack),
3435 	OPT_BOOLEAN('W', "weight", &record.opts.sample_weight,
3436 		    "sample by weight (on special events only)"),
3437 	OPT_BOOLEAN(0, "transaction", &record.opts.sample_transaction,
3438 		    "sample transaction flags (special events only)"),
3439 	OPT_BOOLEAN(0, "per-thread", &record.opts.target.per_thread,
3440 		    "use per-thread mmaps"),
3441 	OPT_CALLBACK_OPTARG('I', "intr-regs", &record.opts.sample_intr_regs, NULL, "any register",
3442 		    "sample selected machine registers on interrupt,"
3443 		    " use '-I?' to list register names", parse_intr_regs),
3444 	OPT_CALLBACK_OPTARG(0, "user-regs", &record.opts.sample_user_regs, NULL, "any register",
3445 		    "sample selected machine registers on interrupt,"
3446 		    " use '--user-regs=?' to list register names", parse_user_regs),
3447 	OPT_BOOLEAN(0, "running-time", &record.opts.running_time,
3448 		    "Record running/enabled time of read (:S) events"),
3449 	OPT_CALLBACK('k', "clockid", &record.opts,
3450 	"clockid", "clockid to use for events, see clock_gettime()",
3451 	parse_clockid),
3452 	OPT_STRING_OPTARG('S', "snapshot", &record.opts.auxtrace_snapshot_opts,
3453 			  "opts", "AUX area tracing Snapshot Mode", ""),
3454 	OPT_STRING_OPTARG(0, "aux-sample", &record.opts.auxtrace_sample_opts,
3455 			  "opts", "sample AUX area", ""),
3456 	OPT_UINTEGER(0, "proc-map-timeout", &proc_map_timeout,
3457 			"per thread proc mmap processing timeout in ms"),
3458 	OPT_BOOLEAN(0, "namespaces", &record.opts.record_namespaces,
3459 		    "Record namespaces events"),
3460 	OPT_BOOLEAN(0, "all-cgroups", &record.opts.record_cgroup,
3461 		    "Record cgroup events"),
3462 	OPT_BOOLEAN_SET(0, "switch-events", &record.opts.record_switch_events,
3463 			&record.opts.record_switch_events_set,
3464 			"Record context switch events"),
3465 	OPT_BOOLEAN_FLAG(0, "all-kernel", &record.opts.all_kernel,
3466 			 "Configure all used events to run in kernel space.",
3467 			 PARSE_OPT_EXCLUSIVE),
3468 	OPT_BOOLEAN_FLAG(0, "all-user", &record.opts.all_user,
3469 			 "Configure all used events to run in user space.",
3470 			 PARSE_OPT_EXCLUSIVE),
3471 	OPT_BOOLEAN(0, "kernel-callchains", &record.opts.kernel_callchains,
3472 		    "collect kernel callchains"),
3473 	OPT_BOOLEAN(0, "user-callchains", &record.opts.user_callchains,
3474 		    "collect user callchains"),
3475 	OPT_STRING(0, "clang-path", &llvm_param.clang_path, "clang path",
3476 		   "clang binary to use for compiling BPF scriptlets"),
3477 	OPT_STRING(0, "clang-opt", &llvm_param.clang_opt, "clang options",
3478 		   "options passed to clang when compiling BPF scriptlets"),
3479 	OPT_STRING(0, "vmlinux", &symbol_conf.vmlinux_name,
3480 		   "file", "vmlinux pathname"),
3481 	OPT_BOOLEAN(0, "buildid-all", &record.buildid_all,
3482 		    "Record build-id of all DSOs regardless of hits"),
3483 	OPT_BOOLEAN(0, "buildid-mmap", &record.buildid_mmap,
3484 		    "Record build-id in map events"),
3485 	OPT_BOOLEAN(0, "timestamp-filename", &record.timestamp_filename,
3486 		    "append timestamp to output filename"),
3487 	OPT_BOOLEAN(0, "timestamp-boundary", &record.timestamp_boundary,
3488 		    "Record timestamp boundary (time of first/last samples)"),
3489 	OPT_STRING_OPTARG_SET(0, "switch-output", &record.switch_output.str,
3490 			  &record.switch_output.set, "signal or size[BKMG] or time[smhd]",
3491 			  "Switch output when receiving SIGUSR2 (signal) or cross a size or time threshold",
3492 			  "signal"),
3493 	OPT_CALLBACK_SET(0, "switch-output-event", &record.sb_evlist, &record.switch_output_event_set, "switch output event",
3494 			 "switch output event selector. use 'perf list' to list available events",
3495 			 parse_events_option_new_evlist),
3496 	OPT_INTEGER(0, "switch-max-files", &record.switch_output.num_files,
3497 		   "Limit number of switch output generated files"),
3498 	OPT_BOOLEAN(0, "dry-run", &dry_run,
3499 		    "Parse options then exit"),
3500 #ifdef HAVE_AIO_SUPPORT
3501 	OPT_CALLBACK_OPTARG(0, "aio", &record.opts,
3502 		     &nr_cblocks_default, "n", "Use <n> control blocks in asynchronous trace writing mode (default: 1, max: 4)",
3503 		     record__aio_parse),
3504 #endif
3505 	OPT_CALLBACK(0, "affinity", &record.opts, "node|cpu",
3506 		     "Set affinity mask of trace reading thread to NUMA node cpu mask or cpu of processed mmap buffer",
3507 		     record__parse_affinity),
3508 #ifdef HAVE_ZSTD_SUPPORT
3509 	OPT_CALLBACK_OPTARG('z', "compression-level", &record.opts, &comp_level_default, "n",
3510 			    "Compress records using specified level (default: 1 - fastest compression, 22 - greatest compression)",
3511 			    record__parse_comp_level),
3512 #endif
3513 	OPT_CALLBACK(0, "max-size", &record.output_max_size,
3514 		     "size", "Limit the maximum size of the output file", parse_output_max_size),
3515 	OPT_UINTEGER(0, "num-thread-synthesize",
3516 		     &record.opts.nr_threads_synthesize,
3517 		     "number of threads to run for event synthesis"),
3518 #ifdef HAVE_LIBPFM
3519 	OPT_CALLBACK(0, "pfm-events", &record.evlist, "event",
3520 		"libpfm4 event selector. use 'perf list' to list available events",
3521 		parse_libpfm_events_option),
3522 #endif
3523 	OPT_CALLBACK(0, "control", &record.opts, "fd:ctl-fd[,ack-fd] or fifo:ctl-fifo[,ack-fifo]",
3524 		     "Listen on ctl-fd descriptor for command to control measurement ('enable': enable events, 'disable': disable events,\n"
3525 		     "\t\t\t  'snapshot': AUX area tracing snapshot).\n"
3526 		     "\t\t\t  Optionally send control command completion ('ack\\n') to ack-fd descriptor.\n"
3527 		     "\t\t\t  Alternatively, ctl-fifo / ack-fifo will be opened and used as ctl-fd / ack-fd.",
3528 		      parse_control_option),
3529 	OPT_CALLBACK(0, "synth", &record.opts, "no|all|task|mmap|cgroup",
3530 		     "Fine-tune event synthesis: default=all", parse_record_synth_option),
3531 	OPT_STRING_OPTARG_SET(0, "debuginfod", &record.debuginfod.urls,
3532 			  &record.debuginfod.set, "debuginfod urls",
3533 			  "Enable debuginfod data retrieval from DEBUGINFOD_URLS or specified urls",
3534 			  "system"),
3535 	OPT_CALLBACK_OPTARG(0, "threads", &record.opts, NULL, "spec",
3536 			    "write collected trace data into several data files using parallel threads",
3537 			    record__parse_threads),
3538 	OPT_BOOLEAN(0, "off-cpu", &record.off_cpu, "Enable off-cpu analysis"),
3539 	OPT_END()
3540 };
3541 
/* Non-static handle to the option table; cmd_record() passes it to parse_options(). */
struct option *record_options = __record_options;
3543 
3544 static int record__mmap_cpu_mask_init(struct mmap_cpu_mask *mask, struct perf_cpu_map *cpus)
3545 {
3546 	struct perf_cpu cpu;
3547 	int idx;
3548 
3549 	if (cpu_map__is_dummy(cpus))
3550 		return 0;
3551 
3552 	perf_cpu_map__for_each_cpu(cpu, idx, cpus) {
3553 		if (cpu.cpu == -1)
3554 			continue;
3555 		/* Return ENODEV is input cpu is greater than max cpu */
3556 		if ((unsigned long)cpu.cpu > mask->nbits)
3557 			return -ENODEV;
3558 		set_bit(cpu.cpu, mask->bits);
3559 	}
3560 
3561 	return 0;
3562 }
3563 
3564 static int record__mmap_cpu_mask_init_spec(struct mmap_cpu_mask *mask, const char *mask_spec)
3565 {
3566 	struct perf_cpu_map *cpus;
3567 
3568 	cpus = perf_cpu_map__new(mask_spec);
3569 	if (!cpus)
3570 		return -ENOMEM;
3571 
3572 	bitmap_zero(mask->bits, mask->nbits);
3573 	if (record__mmap_cpu_mask_init(mask, cpus))
3574 		return -ENODEV;
3575 
3576 	perf_cpu_map__put(cpus);
3577 
3578 	return 0;
3579 }
3580 
3581 static void record__free_thread_masks(struct record *rec, int nr_threads)
3582 {
3583 	int t;
3584 
3585 	if (rec->thread_masks)
3586 		for (t = 0; t < nr_threads; t++)
3587 			record__thread_mask_free(&rec->thread_masks[t]);
3588 
3589 	zfree(&rec->thread_masks);
3590 }
3591 
3592 static int record__alloc_thread_masks(struct record *rec, int nr_threads, int nr_bits)
3593 {
3594 	int t, ret;
3595 
3596 	rec->thread_masks = zalloc(nr_threads * sizeof(*(rec->thread_masks)));
3597 	if (!rec->thread_masks) {
3598 		pr_err("Failed to allocate thread masks\n");
3599 		return -ENOMEM;
3600 	}
3601 
3602 	for (t = 0; t < nr_threads; t++) {
3603 		ret = record__thread_mask_alloc(&rec->thread_masks[t], nr_bits);
3604 		if (ret) {
3605 			pr_err("Failed to allocate thread masks[%d]\n", t);
3606 			goto out_free;
3607 		}
3608 	}
3609 
3610 	return 0;
3611 
3612 out_free:
3613 	record__free_thread_masks(rec, nr_threads);
3614 
3615 	return ret;
3616 }
3617 
3618 static int record__init_thread_cpu_masks(struct record *rec, struct perf_cpu_map *cpus)
3619 {
3620 	int t, ret, nr_cpus = perf_cpu_map__nr(cpus);
3621 
3622 	ret = record__alloc_thread_masks(rec, nr_cpus, cpu__max_cpu().cpu);
3623 	if (ret)
3624 		return ret;
3625 
3626 	rec->nr_threads = nr_cpus;
3627 	pr_debug("nr_threads: %d\n", rec->nr_threads);
3628 
3629 	for (t = 0; t < rec->nr_threads; t++) {
3630 		set_bit(perf_cpu_map__cpu(cpus, t).cpu, rec->thread_masks[t].maps.bits);
3631 		set_bit(perf_cpu_map__cpu(cpus, t).cpu, rec->thread_masks[t].affinity.bits);
3632 		if (verbose) {
3633 			pr_debug("thread_masks[%d]: ", t);
3634 			mmap_cpu_mask__scnprintf(&rec->thread_masks[t].maps, "maps");
3635 			pr_debug("thread_masks[%d]: ", t);
3636 			mmap_cpu_mask__scnprintf(&rec->thread_masks[t].affinity, "affinity");
3637 		}
3638 	}
3639 
3640 	return 0;
3641 }
3642 
3643 static int record__init_thread_masks_spec(struct record *rec, struct perf_cpu_map *cpus,
3644 					  const char **maps_spec, const char **affinity_spec,
3645 					  u32 nr_spec)
3646 {
3647 	u32 s;
3648 	int ret = 0, t = 0;
3649 	struct mmap_cpu_mask cpus_mask;
3650 	struct thread_mask thread_mask, full_mask, *thread_masks;
3651 
3652 	ret = record__mmap_cpu_mask_alloc(&cpus_mask, cpu__max_cpu().cpu);
3653 	if (ret) {
3654 		pr_err("Failed to allocate CPUs mask\n");
3655 		return ret;
3656 	}
3657 
3658 	ret = record__mmap_cpu_mask_init(&cpus_mask, cpus);
3659 	if (ret) {
3660 		pr_err("Failed to init cpu mask\n");
3661 		goto out_free_cpu_mask;
3662 	}
3663 
3664 	ret = record__thread_mask_alloc(&full_mask, cpu__max_cpu().cpu);
3665 	if (ret) {
3666 		pr_err("Failed to allocate full mask\n");
3667 		goto out_free_cpu_mask;
3668 	}
3669 
3670 	ret = record__thread_mask_alloc(&thread_mask, cpu__max_cpu().cpu);
3671 	if (ret) {
3672 		pr_err("Failed to allocate thread mask\n");
3673 		goto out_free_full_and_cpu_masks;
3674 	}
3675 
3676 	for (s = 0; s < nr_spec; s++) {
3677 		ret = record__mmap_cpu_mask_init_spec(&thread_mask.maps, maps_spec[s]);
3678 		if (ret) {
3679 			pr_err("Failed to initialize maps thread mask\n");
3680 			goto out_free;
3681 		}
3682 		ret = record__mmap_cpu_mask_init_spec(&thread_mask.affinity, affinity_spec[s]);
3683 		if (ret) {
3684 			pr_err("Failed to initialize affinity thread mask\n");
3685 			goto out_free;
3686 		}
3687 
3688 		/* ignore invalid CPUs but do not allow empty masks */
3689 		if (!bitmap_and(thread_mask.maps.bits, thread_mask.maps.bits,
3690 				cpus_mask.bits, thread_mask.maps.nbits)) {
3691 			pr_err("Empty maps mask: %s\n", maps_spec[s]);
3692 			ret = -EINVAL;
3693 			goto out_free;
3694 		}
3695 		if (!bitmap_and(thread_mask.affinity.bits, thread_mask.affinity.bits,
3696 				cpus_mask.bits, thread_mask.affinity.nbits)) {
3697 			pr_err("Empty affinity mask: %s\n", affinity_spec[s]);
3698 			ret = -EINVAL;
3699 			goto out_free;
3700 		}
3701 
3702 		/* do not allow intersection with other masks (full_mask) */
3703 		if (bitmap_intersects(thread_mask.maps.bits, full_mask.maps.bits,
3704 				      thread_mask.maps.nbits)) {
3705 			pr_err("Intersecting maps mask: %s\n", maps_spec[s]);
3706 			ret = -EINVAL;
3707 			goto out_free;
3708 		}
3709 		if (bitmap_intersects(thread_mask.affinity.bits, full_mask.affinity.bits,
3710 				      thread_mask.affinity.nbits)) {
3711 			pr_err("Intersecting affinity mask: %s\n", affinity_spec[s]);
3712 			ret = -EINVAL;
3713 			goto out_free;
3714 		}
3715 
3716 		bitmap_or(full_mask.maps.bits, full_mask.maps.bits,
3717 			  thread_mask.maps.bits, full_mask.maps.nbits);
3718 		bitmap_or(full_mask.affinity.bits, full_mask.affinity.bits,
3719 			  thread_mask.affinity.bits, full_mask.maps.nbits);
3720 
3721 		thread_masks = realloc(rec->thread_masks, (t + 1) * sizeof(struct thread_mask));
3722 		if (!thread_masks) {
3723 			pr_err("Failed to reallocate thread masks\n");
3724 			ret = -ENOMEM;
3725 			goto out_free;
3726 		}
3727 		rec->thread_masks = thread_masks;
3728 		rec->thread_masks[t] = thread_mask;
3729 		if (verbose) {
3730 			pr_debug("thread_masks[%d]: ", t);
3731 			mmap_cpu_mask__scnprintf(&rec->thread_masks[t].maps, "maps");
3732 			pr_debug("thread_masks[%d]: ", t);
3733 			mmap_cpu_mask__scnprintf(&rec->thread_masks[t].affinity, "affinity");
3734 		}
3735 		t++;
3736 		ret = record__thread_mask_alloc(&thread_mask, cpu__max_cpu().cpu);
3737 		if (ret) {
3738 			pr_err("Failed to allocate thread mask\n");
3739 			goto out_free_full_and_cpu_masks;
3740 		}
3741 	}
3742 	rec->nr_threads = t;
3743 	pr_debug("nr_threads: %d\n", rec->nr_threads);
3744 	if (!rec->nr_threads)
3745 		ret = -EINVAL;
3746 
3747 out_free:
3748 	record__thread_mask_free(&thread_mask);
3749 out_free_full_and_cpu_masks:
3750 	record__thread_mask_free(&full_mask);
3751 out_free_cpu_mask:
3752 	record__mmap_cpu_mask_free(&cpus_mask);
3753 
3754 	return ret;
3755 }
3756 
3757 static int record__init_thread_core_masks(struct record *rec, struct perf_cpu_map *cpus)
3758 {
3759 	int ret;
3760 	struct cpu_topology *topo;
3761 
3762 	topo = cpu_topology__new();
3763 	if (!topo) {
3764 		pr_err("Failed to allocate CPU topology\n");
3765 		return -ENOMEM;
3766 	}
3767 
3768 	ret = record__init_thread_masks_spec(rec, cpus, topo->core_cpus_list,
3769 					     topo->core_cpus_list, topo->core_cpus_lists);
3770 	cpu_topology__delete(topo);
3771 
3772 	return ret;
3773 }
3774 
3775 static int record__init_thread_package_masks(struct record *rec, struct perf_cpu_map *cpus)
3776 {
3777 	int ret;
3778 	struct cpu_topology *topo;
3779 
3780 	topo = cpu_topology__new();
3781 	if (!topo) {
3782 		pr_err("Failed to allocate CPU topology\n");
3783 		return -ENOMEM;
3784 	}
3785 
3786 	ret = record__init_thread_masks_spec(rec, cpus, topo->package_cpus_list,
3787 					     topo->package_cpus_list, topo->package_cpus_lists);
3788 	cpu_topology__delete(topo);
3789 
3790 	return ret;
3791 }
3792 
3793 static int record__init_thread_numa_masks(struct record *rec, struct perf_cpu_map *cpus)
3794 {
3795 	u32 s;
3796 	int ret;
3797 	const char **spec;
3798 	struct numa_topology *topo;
3799 
3800 	topo = numa_topology__new();
3801 	if (!topo) {
3802 		pr_err("Failed to allocate NUMA topology\n");
3803 		return -ENOMEM;
3804 	}
3805 
3806 	spec = zalloc(topo->nr * sizeof(char *));
3807 	if (!spec) {
3808 		pr_err("Failed to allocate NUMA spec\n");
3809 		ret = -ENOMEM;
3810 		goto out_delete_topo;
3811 	}
3812 	for (s = 0; s < topo->nr; s++)
3813 		spec[s] = topo->nodes[s].cpus;
3814 
3815 	ret = record__init_thread_masks_spec(rec, cpus, spec, spec, topo->nr);
3816 
3817 	zfree(&spec);
3818 
3819 out_delete_topo:
3820 	numa_topology__delete(topo);
3821 
3822 	return ret;
3823 }
3824 
3825 static int record__init_thread_user_masks(struct record *rec, struct perf_cpu_map *cpus)
3826 {
3827 	int t, ret;
3828 	u32 s, nr_spec = 0;
3829 	char **maps_spec = NULL, **affinity_spec = NULL, **tmp_spec;
3830 	char *user_spec, *spec, *spec_ptr, *mask, *mask_ptr, *dup_mask = NULL;
3831 
3832 	for (t = 0, user_spec = (char *)rec->opts.threads_user_spec; ; t++, user_spec = NULL) {
3833 		spec = strtok_r(user_spec, ":", &spec_ptr);
3834 		if (spec == NULL)
3835 			break;
3836 		pr_debug2("threads_spec[%d]: %s\n", t, spec);
3837 		mask = strtok_r(spec, "/", &mask_ptr);
3838 		if (mask == NULL)
3839 			break;
3840 		pr_debug2("  maps mask: %s\n", mask);
3841 		tmp_spec = realloc(maps_spec, (nr_spec + 1) * sizeof(char *));
3842 		if (!tmp_spec) {
3843 			pr_err("Failed to reallocate maps spec\n");
3844 			ret = -ENOMEM;
3845 			goto out_free;
3846 		}
3847 		maps_spec = tmp_spec;
3848 		maps_spec[nr_spec] = dup_mask = strdup(mask);
3849 		if (!maps_spec[nr_spec]) {
3850 			pr_err("Failed to allocate maps spec[%d]\n", nr_spec);
3851 			ret = -ENOMEM;
3852 			goto out_free;
3853 		}
3854 		mask = strtok_r(NULL, "/", &mask_ptr);
3855 		if (mask == NULL) {
3856 			pr_err("Invalid thread maps or affinity specs\n");
3857 			ret = -EINVAL;
3858 			goto out_free;
3859 		}
3860 		pr_debug2("  affinity mask: %s\n", mask);
3861 		tmp_spec = realloc(affinity_spec, (nr_spec + 1) * sizeof(char *));
3862 		if (!tmp_spec) {
3863 			pr_err("Failed to reallocate affinity spec\n");
3864 			ret = -ENOMEM;
3865 			goto out_free;
3866 		}
3867 		affinity_spec = tmp_spec;
3868 		affinity_spec[nr_spec] = strdup(mask);
3869 		if (!affinity_spec[nr_spec]) {
3870 			pr_err("Failed to allocate affinity spec[%d]\n", nr_spec);
3871 			ret = -ENOMEM;
3872 			goto out_free;
3873 		}
3874 		dup_mask = NULL;
3875 		nr_spec++;
3876 	}
3877 
3878 	ret = record__init_thread_masks_spec(rec, cpus, (const char **)maps_spec,
3879 					     (const char **)affinity_spec, nr_spec);
3880 
3881 out_free:
3882 	free(dup_mask);
3883 	for (s = 0; s < nr_spec; s++) {
3884 		if (maps_spec)
3885 			free(maps_spec[s]);
3886 		if (affinity_spec)
3887 			free(affinity_spec[s]);
3888 	}
3889 	free(affinity_spec);
3890 	free(maps_spec);
3891 
3892 	return ret;
3893 }
3894 
3895 static int record__init_thread_default_masks(struct record *rec, struct perf_cpu_map *cpus)
3896 {
3897 	int ret;
3898 
3899 	ret = record__alloc_thread_masks(rec, 1, cpu__max_cpu().cpu);
3900 	if (ret)
3901 		return ret;
3902 
3903 	if (record__mmap_cpu_mask_init(&rec->thread_masks->maps, cpus))
3904 		return -ENODEV;
3905 
3906 	rec->nr_threads = 1;
3907 
3908 	return 0;
3909 }
3910 
3911 static int record__init_thread_masks(struct record *rec)
3912 {
3913 	int ret = 0;
3914 	struct perf_cpu_map *cpus = rec->evlist->core.all_cpus;
3915 
3916 	if (!record__threads_enabled(rec))
3917 		return record__init_thread_default_masks(rec, cpus);
3918 
3919 	if (evlist__per_thread(rec->evlist)) {
3920 		pr_err("--per-thread option is mutually exclusive to parallel streaming mode.\n");
3921 		return -EINVAL;
3922 	}
3923 
3924 	switch (rec->opts.threads_spec) {
3925 	case THREAD_SPEC__CPU:
3926 		ret = record__init_thread_cpu_masks(rec, cpus);
3927 		break;
3928 	case THREAD_SPEC__CORE:
3929 		ret = record__init_thread_core_masks(rec, cpus);
3930 		break;
3931 	case THREAD_SPEC__PACKAGE:
3932 		ret = record__init_thread_package_masks(rec, cpus);
3933 		break;
3934 	case THREAD_SPEC__NUMA:
3935 		ret = record__init_thread_numa_masks(rec, cpus);
3936 		break;
3937 	case THREAD_SPEC__USER:
3938 		ret = record__init_thread_user_masks(rec, cpus);
3939 		break;
3940 	default:
3941 		break;
3942 	}
3943 
3944 	return ret;
3945 }
3946 
3947 int cmd_record(int argc, const char **argv)
3948 {
3949 	int err;
3950 	struct record *rec = &record;
3951 	char errbuf[BUFSIZ];
3952 
3953 	setlocale(LC_ALL, "");
3954 
3955 #ifndef HAVE_LIBBPF_SUPPORT
3956 # define set_nobuild(s, l, c) set_option_nobuild(record_options, s, l, "NO_LIBBPF=1", c)
3957 	set_nobuild('\0', "clang-path", true);
3958 	set_nobuild('\0', "clang-opt", true);
3959 # undef set_nobuild
3960 #endif
3961 
3962 #ifndef HAVE_BPF_PROLOGUE
3963 # if !defined (HAVE_DWARF_SUPPORT)
3964 #  define REASON  "NO_DWARF=1"
3965 # elif !defined (HAVE_LIBBPF_SUPPORT)
3966 #  define REASON  "NO_LIBBPF=1"
3967 # else
3968 #  define REASON  "this architecture doesn't support BPF prologue"
3969 # endif
3970 # define set_nobuild(s, l, c) set_option_nobuild(record_options, s, l, REASON, c)
3971 	set_nobuild('\0', "vmlinux", true);
3972 # undef set_nobuild
3973 # undef REASON
3974 #endif
3975 
3976 #ifndef HAVE_BPF_SKEL
3977 # define set_nobuild(s, l, m, c) set_option_nobuild(record_options, s, l, m, c)
3978 	set_nobuild('\0', "off-cpu", "no BUILD_BPF_SKEL=1", true);
3979 # undef set_nobuild
3980 #endif
3981 
3982 	rec->opts.affinity = PERF_AFFINITY_SYS;
3983 
3984 	rec->evlist = evlist__new();
3985 	if (rec->evlist == NULL)
3986 		return -ENOMEM;
3987 
3988 	err = perf_config(perf_record_config, rec);
3989 	if (err)
3990 		return err;
3991 
3992 	argc = parse_options(argc, argv, record_options, record_usage,
3993 			    PARSE_OPT_STOP_AT_NON_OPTION);
3994 	if (quiet)
3995 		perf_quiet_option();
3996 
3997 	err = symbol__validate_sym_arguments();
3998 	if (err)
3999 		return err;
4000 
4001 	perf_debuginfod_setup(&record.debuginfod);
4002 
4003 	/* Make system wide (-a) the default target. */
4004 	if (!argc && target__none(&rec->opts.target))
4005 		rec->opts.target.system_wide = true;
4006 
4007 	if (nr_cgroups && !rec->opts.target.system_wide) {
4008 		usage_with_options_msg(record_usage, record_options,
4009 			"cgroup monitoring only available in system-wide mode");
4010 
4011 	}
4012 
4013 	if (rec->buildid_mmap) {
4014 		if (!perf_can_record_build_id()) {
4015 			pr_err("Failed: no support to record build id in mmap events, update your kernel.\n");
4016 			err = -EINVAL;
4017 			goto out_opts;
4018 		}
4019 		pr_debug("Enabling build id in mmap2 events.\n");
4020 		/* Enable mmap build id synthesizing. */
4021 		symbol_conf.buildid_mmap2 = true;
4022 		/* Enable perf_event_attr::build_id bit. */
4023 		rec->opts.build_id = true;
4024 		/* Disable build id cache. */
4025 		rec->no_buildid = true;
4026 	}
4027 
4028 	if (rec->opts.record_cgroup && !perf_can_record_cgroup()) {
4029 		pr_err("Kernel has no cgroup sampling support.\n");
4030 		err = -EINVAL;
4031 		goto out_opts;
4032 	}
4033 
4034 	if (rec->opts.kcore)
4035 		rec->opts.text_poke = true;
4036 
4037 	if (rec->opts.kcore || record__threads_enabled(rec))
4038 		rec->data.is_dir = true;
4039 
4040 	if (record__threads_enabled(rec)) {
4041 		if (rec->opts.affinity != PERF_AFFINITY_SYS) {
4042 			pr_err("--affinity option is mutually exclusive to parallel streaming mode.\n");
4043 			goto out_opts;
4044 		}
4045 		if (record__aio_enabled(rec)) {
4046 			pr_err("Asynchronous streaming mode (--aio) is mutually exclusive to parallel streaming mode.\n");
4047 			goto out_opts;
4048 		}
4049 	}
4050 
4051 	if (rec->opts.comp_level != 0) {
4052 		pr_debug("Compression enabled, disabling build id collection at the end of the session.\n");
4053 		rec->no_buildid = true;
4054 	}
4055 
4056 	if (rec->opts.record_switch_events &&
4057 	    !perf_can_record_switch_events()) {
4058 		ui__error("kernel does not support recording context switch events\n");
4059 		parse_options_usage(record_usage, record_options, "switch-events", 0);
4060 		err = -EINVAL;
4061 		goto out_opts;
4062 	}
4063 
4064 	if (switch_output_setup(rec)) {
4065 		parse_options_usage(record_usage, record_options, "switch-output", 0);
4066 		err = -EINVAL;
4067 		goto out_opts;
4068 	}
4069 
4070 	if (rec->switch_output.time) {
4071 		signal(SIGALRM, alarm_sig_handler);
4072 		alarm(rec->switch_output.time);
4073 	}
4074 
4075 	if (rec->switch_output.num_files) {
4076 		rec->switch_output.filenames = calloc(sizeof(char *),
4077 						      rec->switch_output.num_files);
4078 		if (!rec->switch_output.filenames) {
4079 			err = -EINVAL;
4080 			goto out_opts;
4081 		}
4082 	}
4083 
4084 	if (rec->timestamp_filename && record__threads_enabled(rec)) {
4085 		rec->timestamp_filename = false;
4086 		pr_warning("WARNING: --timestamp-filename option is not available in parallel streaming mode.\n");
4087 	}
4088 
4089 	/*
4090 	 * Allow aliases to facilitate the lookup of symbols for address
4091 	 * filters. Refer to auxtrace_parse_filters().
4092 	 */
4093 	symbol_conf.allow_aliases = true;
4094 
4095 	symbol__init(NULL);
4096 
4097 	err = record__auxtrace_init(rec);
4098 	if (err)
4099 		goto out;
4100 
4101 	if (dry_run)
4102 		goto out;
4103 
4104 	err = bpf__setup_stdout(rec->evlist);
4105 	if (err) {
4106 		bpf__strerror_setup_stdout(rec->evlist, err, errbuf, sizeof(errbuf));
4107 		pr_err("ERROR: Setup BPF stdout failed: %s\n",
4108 			 errbuf);
4109 		goto out;
4110 	}
4111 
4112 	err = -ENOMEM;
4113 
4114 	if (rec->no_buildid_cache || rec->no_buildid) {
4115 		disable_buildid_cache();
4116 	} else if (rec->switch_output.enabled) {
4117 		/*
4118 		 * In 'perf record --switch-output', disable buildid
4119 		 * generation by default to reduce data file switching
4120 		 * overhead. Still generate buildid if they are required
4121 		 * explicitly using
4122 		 *
4123 		 *  perf record --switch-output --no-no-buildid \
4124 		 *              --no-no-buildid-cache
4125 		 *
4126 		 * Following code equals to:
4127 		 *
4128 		 * if ((rec->no_buildid || !rec->no_buildid_set) &&
4129 		 *     (rec->no_buildid_cache || !rec->no_buildid_cache_set))
4130 		 *         disable_buildid_cache();
4131 		 */
4132 		bool disable = true;
4133 
4134 		if (rec->no_buildid_set && !rec->no_buildid)
4135 			disable = false;
4136 		if (rec->no_buildid_cache_set && !rec->no_buildid_cache)
4137 			disable = false;
4138 		if (disable) {
4139 			rec->no_buildid = true;
4140 			rec->no_buildid_cache = true;
4141 			disable_buildid_cache();
4142 		}
4143 	}
4144 
4145 	if (record.opts.overwrite)
4146 		record.opts.tail_synthesize = true;
4147 
4148 	if (rec->evlist->core.nr_entries == 0) {
4149 		if (perf_pmu__has_hybrid()) {
4150 			err = evlist__add_default_hybrid(rec->evlist,
4151 							 !record.opts.no_samples);
4152 		} else {
4153 			err = __evlist__add_default(rec->evlist,
4154 						    !record.opts.no_samples);
4155 		}
4156 
4157 		if (err < 0) {
4158 			pr_err("Not enough memory for event selector list\n");
4159 			goto out;
4160 		}
4161 	}
4162 
4163 	if (rec->opts.target.tid && !rec->opts.no_inherit_set)
4164 		rec->opts.no_inherit = true;
4165 
4166 	err = target__validate(&rec->opts.target);
4167 	if (err) {
4168 		target__strerror(&rec->opts.target, err, errbuf, BUFSIZ);
4169 		ui__warning("%s\n", errbuf);
4170 	}
4171 
4172 	err = target__parse_uid(&rec->opts.target);
4173 	if (err) {
4174 		int saved_errno = errno;
4175 
4176 		target__strerror(&rec->opts.target, err, errbuf, BUFSIZ);
4177 		ui__error("%s", errbuf);
4178 
4179 		err = -saved_errno;
4180 		goto out;
4181 	}
4182 
4183 	/* Enable ignoring missing threads when -u/-p option is defined. */
4184 	rec->opts.ignore_missing_thread = rec->opts.target.uid != UINT_MAX || rec->opts.target.pid;
4185 
4186 	if (evlist__fix_hybrid_cpus(rec->evlist, rec->opts.target.cpu_list)) {
4187 		pr_err("failed to use cpu list %s\n",
4188 		       rec->opts.target.cpu_list);
4189 		goto out;
4190 	}
4191 
4192 	rec->opts.target.hybrid = perf_pmu__has_hybrid();
4193 
4194 	if (callchain_param.enabled && callchain_param.record_mode == CALLCHAIN_FP)
4195 		arch__add_leaf_frame_record_opts(&rec->opts);
4196 
4197 	err = -ENOMEM;
4198 	if (evlist__create_maps(rec->evlist, &rec->opts.target) < 0) {
4199 		if (rec->opts.target.pid != NULL) {
4200 			pr_err("Couldn't create thread/CPU maps: %s\n",
4201 				errno == ENOENT ? "No such process" : str_error_r(errno, errbuf, sizeof(errbuf)));
4202 			goto out;
4203 		}
4204 		else
4205 			usage_with_options(record_usage, record_options);
4206 	}
4207 
4208 	err = auxtrace_record__options(rec->itr, rec->evlist, &rec->opts);
4209 	if (err)
4210 		goto out;
4211 
4212 	/*
4213 	 * We take all buildids when the file contains
4214 	 * AUX area tracing data because we do not decode the
4215 	 * trace because it would take too long.
4216 	 */
4217 	if (rec->opts.full_auxtrace)
4218 		rec->buildid_all = true;
4219 
4220 	if (rec->opts.text_poke) {
4221 		err = record__config_text_poke(rec->evlist);
4222 		if (err) {
4223 			pr_err("record__config_text_poke failed, error %d\n", err);
4224 			goto out;
4225 		}
4226 	}
4227 
4228 	if (rec->off_cpu) {
4229 		err = record__config_off_cpu(rec);
4230 		if (err) {
4231 			pr_err("record__config_off_cpu failed, error %d\n", err);
4232 			goto out;
4233 		}
4234 	}
4235 
4236 	if (record_opts__config(&rec->opts)) {
4237 		err = -EINVAL;
4238 		goto out;
4239 	}
4240 
4241 	err = record__init_thread_masks(rec);
4242 	if (err) {
4243 		pr_err("Failed to initialize parallel data streaming masks\n");
4244 		goto out;
4245 	}
4246 
4247 	if (rec->opts.nr_cblocks > nr_cblocks_max)
4248 		rec->opts.nr_cblocks = nr_cblocks_max;
4249 	pr_debug("nr_cblocks: %d\n", rec->opts.nr_cblocks);
4250 
4251 	pr_debug("affinity: %s\n", affinity_tags[rec->opts.affinity]);
4252 	pr_debug("mmap flush: %d\n", rec->opts.mmap_flush);
4253 
4254 	if (rec->opts.comp_level > comp_level_max)
4255 		rec->opts.comp_level = comp_level_max;
4256 	pr_debug("comp level: %d\n", rec->opts.comp_level);
4257 
4258 	err = __cmd_record(&record, argc, argv);
4259 out:
4260 	evlist__delete(rec->evlist);
4261 	symbol__exit();
4262 	auxtrace_record__free(rec->itr);
4263 out_opts:
4264 	record__free_thread_masks(rec, rec->nr_threads);
4265 	rec->nr_threads = 0;
4266 	evlist__close_control(rec->opts.ctl_fd, rec->opts.ctl_fd_ack, &rec->opts.ctl_fd_close);
4267 	return err;
4268 }
4269 
4270 static void snapshot_sig_handler(int sig __maybe_unused)
4271 {
4272 	struct record *rec = &record;
4273 
4274 	hit_auxtrace_snapshot_trigger(rec);
4275 
4276 	if (switch_output_signal(rec))
4277 		trigger_hit(&switch_output_trigger);
4278 }
4279 
4280 static void alarm_sig_handler(int sig __maybe_unused)
4281 {
4282 	struct record *rec = &record;
4283 
4284 	if (switch_output_time(rec))
4285 		trigger_hit(&switch_output_trigger);
4286 }
4287