xref: /openbmc/linux/tools/perf/builtin-record.c (revision 5fa2481c)
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * builtin-record.c
4  *
5  * Builtin record command: Record the profile of a workload
6  * (or a CPU, or a PID) into the perf.data output file - for
7  * later analysis via perf report.
8  */
9 #include "builtin.h"
10 
11 #include "util/build-id.h"
12 #include <subcmd/parse-options.h>
13 #include "util/parse-events.h"
14 #include "util/config.h"
15 
16 #include "util/callchain.h"
17 #include "util/cgroup.h"
18 #include "util/header.h"
19 #include "util/event.h"
20 #include "util/evlist.h"
21 #include "util/evsel.h"
22 #include "util/debug.h"
23 #include "util/mmap.h"
24 #include "util/target.h"
25 #include "util/session.h"
26 #include "util/tool.h"
27 #include "util/symbol.h"
28 #include "util/record.h"
29 #include "util/cpumap.h"
30 #include "util/thread_map.h"
31 #include "util/data.h"
32 #include "util/perf_regs.h"
33 #include "util/auxtrace.h"
34 #include "util/tsc.h"
35 #include "util/parse-branch-options.h"
36 #include "util/parse-regs-options.h"
37 #include "util/perf_api_probe.h"
38 #include "util/llvm-utils.h"
39 #include "util/bpf-loader.h"
40 #include "util/trigger.h"
41 #include "util/perf-hooks.h"
42 #include "util/cpu-set-sched.h"
43 #include "util/synthetic-events.h"
44 #include "util/time-utils.h"
45 #include "util/units.h"
46 #include "util/bpf-event.h"
47 #include "util/util.h"
48 #include "util/pfm.h"
49 #include "util/clockid.h"
50 #include "util/pmu-hybrid.h"
51 #include "util/evlist-hybrid.h"
52 #include "util/off_cpu.h"
53 #include "asm/bug.h"
54 #include "perf.h"
55 #include "cputopo.h"
56 
57 #include <errno.h>
58 #include <inttypes.h>
59 #include <locale.h>
60 #include <poll.h>
61 #include <pthread.h>
62 #include <unistd.h>
63 #ifndef HAVE_GETTID
64 #include <syscall.h>
65 #endif
66 #include <sched.h>
67 #include <signal.h>
68 #ifdef HAVE_EVENTFD_SUPPORT
69 #include <sys/eventfd.h>
70 #endif
71 #include <sys/mman.h>
72 #include <sys/wait.h>
73 #include <sys/types.h>
74 #include <sys/stat.h>
75 #include <fcntl.h>
76 #include <linux/err.h>
77 #include <linux/string.h>
78 #include <linux/time64.h>
79 #include <linux/zalloc.h>
80 #include <linux/bitmap.h>
81 #include <sys/time.h>
82 
/*
 * State of the switch-output feature. The signal, size and time members
 * are what switch_output_signal(), switch_output_size() and
 * switch_output_time() test, respectively.
 */
struct switch_output {
	bool		 enabled;
	bool		 signal;	/* true: switch_output_signal() may report a switch */
	unsigned long	 size;		/* byte threshold compared with rec->bytes_written; 0 disables */
	unsigned long	 time;		/* non-zero: switch_output_time() may report a switch */
	const char	*str;
	bool		 set;
	char		 **filenames;
	int		 num_files;
	int		 cur_file;
};
94 
/* Pair of CPU masks attached to one recording thread. */
struct thread_mask {
	struct mmap_cpu_mask	maps;		/* tested per CPU in record__thread_data_init_maps() to pick the thread's mmaps */
	struct mmap_cpu_mask	affinity;	/* presumably the CPUs the thread may run on -- not used in this part of the file */
};
99 
/*
 * Per-thread recording state. record__alloc_thread_data() allocates
 * rec->nr_threads of these as rec->thread_data[].
 */
struct record_thread {
	pid_t			tid;		/* gettid() for entry 0, -1 for the others at allocation time */
	struct thread_mask	*mask;		/* points at rec->thread_masks[t] */
	struct {
		int		msg[2];		/* pipe; msg[0] is added to this thread's pollfd */
		int		ack[2];		/* second pipe opened together with msg */
	} pipes;
	struct fdarray		pollfd;
	int			ctlfd_pos;	/* position of the control descriptor inside pollfd */
	int			nr_mmaps;	/* number of slots in maps[] and overwrite_maps[] */
	struct mmap		**maps;
	struct mmap		**overwrite_maps;
	struct record		*rec;		/* back pointer to the owning record */
	unsigned long long	samples;	/* incremented by record__pushfn() */
	unsigned long		waking;
	u64			bytes_written;	/* bytes record__write() sent to a per-map file */
	u64			bytes_transferred;
	u64			bytes_compressed;
};
119 
/*
 * Thread-local pointer to the record_thread of the running thread; its
 * counters are updated by record__write() and record__pushfn().
 */
static __thread struct record_thread *thread;
121 
/* Message identifiers; THREAD_MSG__MAX is the number of valid values. */
enum thread_msg {
	THREAD_MSG__UNDEFINED = 0,
	THREAD_MSG__READY,
	THREAD_MSG__MAX,
};

/* Printable name of each enum thread_msg value, indexed by the value. */
static const char *thread_msg_tags[THREAD_MSG__MAX] = {
	[THREAD_MSG__UNDEFINED]	= "UNDEFINED",
	[THREAD_MSG__READY]	= "READY",
};
131 
/* Thread specification kinds; THREAD_SPEC__MAX is the number of valid values. */
enum thread_spec {
	THREAD_SPEC__UNDEFINED = 0,
	THREAD_SPEC__CPU,
	THREAD_SPEC__CORE,
	THREAD_SPEC__PACKAGE,
	THREAD_SPEC__NUMA,
	THREAD_SPEC__USER,
	THREAD_SPEC__MAX,
};

/* Textual tag of each enum thread_spec value, indexed by the value. */
static const char *thread_spec_tags[THREAD_SPEC__MAX] = {
	[THREAD_SPEC__UNDEFINED]	= "undefined",
	[THREAD_SPEC__CPU]		= "cpu",
	[THREAD_SPEC__CORE]		= "core",
	[THREAD_SPEC__PACKAGE]		= "package",
	[THREAD_SPEC__NUMA]		= "numa",
	[THREAD_SPEC__USER]		= "user",
};
145 
/*
 * Top-level state of one 'perf record' invocation. The embedded tool is
 * used with container_of() to get back to this structure from callbacks.
 */
struct record {
	struct perf_tool	tool;
	struct record_opts	opts;
	u64			bytes_written;	/* bytes written to the session file (per-map files are counted per thread) */
	struct perf_data	data;
	struct auxtrace_record	*itr;
	struct evlist	*evlist;
	struct perf_session	*session;
	struct evlist		*sb_evlist;
	pthread_t		thread_id;
	int			realtime_prio;
	bool			switch_output_event_set;
	bool			no_buildid;
	bool			no_buildid_set;
	bool			no_buildid_cache;
	bool			no_buildid_cache_set;
	bool			buildid_all;
	bool			buildid_mmap;
	bool			timestamp_filename;
	bool			timestamp_boundary;
	bool			off_cpu;
	struct switch_output	switch_output;
	unsigned long long	samples;
	unsigned long		output_max_size;	/* = 0: unlimited */
	struct perf_debuginfod	debuginfod;
	int			nr_threads;	/* number of entries in thread_masks[] and thread_data[] */
	struct thread_mask	*thread_masks;
	struct record_thread	*thread_data;	/* allocated by record__alloc_thread_data() */
};
175 
/* Set to 1 by sig_handler() and by record__write() when the size limit is hit. */
static volatile int done;

/* Checked by record__auxtrace_snapshot_exit() before it starts a snapshot itself. */
static volatile int auxtrace_record__snapshot_started;
/* Triggers used for AUX area snapshots and for output file switching. */
static DEFINE_TRIGGER(auxtrace_snapshot_trigger);
static DEFINE_TRIGGER(switch_output_trigger);
181 
/* Printable tags for the affinity modes; sized by PERF_AFFINITY_MAX. */
static const char *affinity_tags[PERF_AFFINITY_MAX] = {
	"SYS", "NODE", "CPU"
};
185 
186 #ifndef HAVE_GETTID
/* Fallback for C libraries without gettid(): issue the raw system call. */
static inline pid_t gettid(void)
{
	long tid = syscall(__NR_gettid);

	return (pid_t)tid;
}
191 #endif
192 
193 static int record__threads_enabled(struct record *rec)
194 {
195 	return rec->opts.threads_spec;
196 }
197 
198 static bool switch_output_signal(struct record *rec)
199 {
200 	return rec->switch_output.signal &&
201 	       trigger_is_ready(&switch_output_trigger);
202 }
203 
204 static bool switch_output_size(struct record *rec)
205 {
206 	return rec->switch_output.size &&
207 	       trigger_is_ready(&switch_output_trigger) &&
208 	       (rec->bytes_written >= rec->switch_output.size);
209 }
210 
211 static bool switch_output_time(struct record *rec)
212 {
213 	return rec->switch_output.time &&
214 	       trigger_is_ready(&switch_output_trigger);
215 }
216 
217 static u64 record__bytes_written(struct record *rec)
218 {
219 	int t;
220 	u64 bytes_written = rec->bytes_written;
221 	struct record_thread *thread_data = rec->thread_data;
222 
223 	for (t = 0; t < rec->nr_threads; t++)
224 		bytes_written += thread_data[t].bytes_written;
225 
226 	return bytes_written;
227 }
228 
229 static bool record__output_max_size_exceeded(struct record *rec)
230 {
231 	return rec->output_max_size &&
232 	       (record__bytes_written(rec) >= rec->output_max_size);
233 }
234 
235 static int record__write(struct record *rec, struct mmap *map __maybe_unused,
236 			 void *bf, size_t size)
237 {
238 	struct perf_data_file *file = &rec->session->data->file;
239 
240 	if (map && map->file)
241 		file = map->file;
242 
243 	if (perf_data_file__write(file, bf, size) < 0) {
244 		pr_err("failed to write perf data, error: %m\n");
245 		return -1;
246 	}
247 
248 	if (map && map->file)
249 		thread->bytes_written += size;
250 	else
251 		rec->bytes_written += size;
252 
253 	if (record__output_max_size_exceeded(rec) && !done) {
254 		fprintf(stderr, "[ perf record: perf size limit reached (%" PRIu64 " KB),"
255 				" stopping session ]\n",
256 				record__bytes_written(rec) >> 10);
257 		done = 1;
258 	}
259 
260 	if (switch_output_size(rec))
261 		trigger_hit(&switch_output_trigger);
262 
263 	return 0;
264 }
265 
/* Forward declarations: these are used by the AIO code before they are defined. */
static int record__aio_enabled(struct record *rec);
static int record__comp_enabled(struct record *rec);
static size_t zstd_compress(struct perf_session *session, struct mmap *map,
			    void *dst, size_t dst_size, void *src, size_t src_size);
270 
271 #ifdef HAVE_AIO_SUPPORT
/*
 * Fill in @cblock and queue an asynchronous write of @size bytes from @buf
 * to @trace_fd at offset @off. Queueing is retried for as long as
 * aio_write() fails with EAGAIN.
 *
 * Returns 0 once the request is queued. On any other failure the control
 * block is marked unused (aio_fildes = -1), the error is logged and the
 * aio_write() result is returned.
 */
static int record__aio_write(struct aiocb *cblock, int trace_fd,
		void *buf, size_t size, off_t off)
{
	int rc;

	cblock->aio_fildes = trace_fd;
	cblock->aio_buf    = buf;
	cblock->aio_nbytes = size;
	cblock->aio_offset = off;
	cblock->aio_sigevent.sigev_notify = SIGEV_NONE;

	for (;;) {
		rc = aio_write(cblock);
		if (rc == 0)
			return rc;

		if (errno == EAGAIN)
			continue;

		cblock->aio_fildes = -1;
		pr_err("failed to queue perf data, error: %m\n");
		return rc;
	}
}
296 
297 static int record__aio_complete(struct mmap *md, struct aiocb *cblock)
298 {
299 	void *rem_buf;
300 	off_t rem_off;
301 	size_t rem_size;
302 	int rc, aio_errno;
303 	ssize_t aio_ret, written;
304 
305 	aio_errno = aio_error(cblock);
306 	if (aio_errno == EINPROGRESS)
307 		return 0;
308 
309 	written = aio_ret = aio_return(cblock);
310 	if (aio_ret < 0) {
311 		if (aio_errno != EINTR)
312 			pr_err("failed to write perf data, error: %m\n");
313 		written = 0;
314 	}
315 
316 	rem_size = cblock->aio_nbytes - written;
317 
318 	if (rem_size == 0) {
319 		cblock->aio_fildes = -1;
320 		/*
321 		 * md->refcount is incremented in record__aio_pushfn() for
322 		 * every aio write request started in record__aio_push() so
323 		 * decrement it because the request is now complete.
324 		 */
325 		perf_mmap__put(&md->core);
326 		rc = 1;
327 	} else {
328 		/*
329 		 * aio write request may require restart with the
330 		 * reminder if the kernel didn't write whole
331 		 * chunk at once.
332 		 */
333 		rem_off = cblock->aio_offset + written;
334 		rem_buf = (void *)(cblock->aio_buf + written);
335 		record__aio_write(cblock, cblock->aio_fildes,
336 				rem_buf, rem_size, rem_off);
337 		rc = 0;
338 	}
339 
340 	return rc;
341 }
342 
343 static int record__aio_sync(struct mmap *md, bool sync_all)
344 {
345 	struct aiocb **aiocb = md->aio.aiocb;
346 	struct aiocb *cblocks = md->aio.cblocks;
347 	struct timespec timeout = { 0, 1000 * 1000  * 1 }; /* 1ms */
348 	int i, do_suspend;
349 
350 	do {
351 		do_suspend = 0;
352 		for (i = 0; i < md->aio.nr_cblocks; ++i) {
353 			if (cblocks[i].aio_fildes == -1 || record__aio_complete(md, &cblocks[i])) {
354 				if (sync_all)
355 					aiocb[i] = NULL;
356 				else
357 					return i;
358 			} else {
359 				/*
360 				 * Started aio write is not complete yet
361 				 * so it has to be waited before the
362 				 * next allocation.
363 				 */
364 				aiocb[i] = &cblocks[i];
365 				do_suspend = 1;
366 			}
367 		}
368 		if (!do_suspend)
369 			return -1;
370 
371 		while (aio_suspend((const struct aiocb **)aiocb, md->aio.nr_cblocks, &timeout)) {
372 			if (!(errno == EAGAIN || errno == EINTR))
373 				pr_err("failed to sync perf data, error: %m\n");
374 		}
375 	} while (1);
376 }
377 
/* Context handed to record__aio_pushfn() through perf_mmap__push(). */
struct record_aio {
	struct record	*rec;
	void		*data;	/* destination buffer, one of map->aio.data[] */
	size_t		size;	/* bytes stored in data so far */
};
383 
384 static int record__aio_pushfn(struct mmap *map, void *to, void *buf, size_t size)
385 {
386 	struct record_aio *aio = to;
387 
388 	/*
389 	 * map->core.base data pointed by buf is copied into free map->aio.data[] buffer
390 	 * to release space in the kernel buffer as fast as possible, calling
391 	 * perf_mmap__consume() from perf_mmap__push() function.
392 	 *
393 	 * That lets the kernel to proceed with storing more profiling data into
394 	 * the kernel buffer earlier than other per-cpu kernel buffers are handled.
395 	 *
396 	 * Coping can be done in two steps in case the chunk of profiling data
397 	 * crosses the upper bound of the kernel buffer. In this case we first move
398 	 * part of data from map->start till the upper bound and then the reminder
399 	 * from the beginning of the kernel buffer till the end of the data chunk.
400 	 */
401 
402 	if (record__comp_enabled(aio->rec)) {
403 		size = zstd_compress(aio->rec->session, NULL, aio->data + aio->size,
404 				     mmap__mmap_len(map) - aio->size,
405 				     buf, size);
406 	} else {
407 		memcpy(aio->data + aio->size, buf, size);
408 	}
409 
410 	if (!aio->size) {
411 		/*
412 		 * Increment map->refcount to guard map->aio.data[] buffer
413 		 * from premature deallocation because map object can be
414 		 * released earlier than aio write request started on
415 		 * map->aio.data[] buffer is complete.
416 		 *
417 		 * perf_mmap__put() is done at record__aio_complete()
418 		 * after started aio request completion or at record__aio_push()
419 		 * if the request failed to start.
420 		 */
421 		perf_mmap__get(&map->core);
422 	}
423 
424 	aio->size += size;
425 
426 	return size;
427 }
428 
429 static int record__aio_push(struct record *rec, struct mmap *map, off_t *off)
430 {
431 	int ret, idx;
432 	int trace_fd = rec->session->data->file.fd;
433 	struct record_aio aio = { .rec = rec, .size = 0 };
434 
435 	/*
436 	 * Call record__aio_sync() to wait till map->aio.data[] buffer
437 	 * becomes available after previous aio write operation.
438 	 */
439 
440 	idx = record__aio_sync(map, false);
441 	aio.data = map->aio.data[idx];
442 	ret = perf_mmap__push(map, &aio, record__aio_pushfn);
443 	if (ret != 0) /* ret > 0 - no data, ret < 0 - error */
444 		return ret;
445 
446 	rec->samples++;
447 	ret = record__aio_write(&(map->aio.cblocks[idx]), trace_fd, aio.data, aio.size, *off);
448 	if (!ret) {
449 		*off += aio.size;
450 		rec->bytes_written += aio.size;
451 		if (switch_output_size(rec))
452 			trigger_hit(&switch_output_trigger);
453 	} else {
454 		/*
455 		 * Decrement map->refcount incremented in record__aio_pushfn()
456 		 * back if record__aio_write() operation failed to start, otherwise
457 		 * map->refcount is decremented in record__aio_complete() after
458 		 * aio write operation finishes successfully.
459 		 */
460 		perf_mmap__put(&map->core);
461 	}
462 
463 	return ret;
464 }
465 
/* Returns the current file offset of @trace_fd, or (off_t)-1 on error. */
static off_t record__aio_get_pos(int trace_fd)
{
	off_t cur = lseek(trace_fd, 0, SEEK_CUR);

	return cur;
}
470 
/* Moves the file offset of @trace_fd to @pos; the lseek() result is not checked. */
static void record__aio_set_pos(int trace_fd, off_t pos)
{
	(void)lseek(trace_fd, pos, SEEK_SET);
}
475 
476 static void record__aio_mmap_read_sync(struct record *rec)
477 {
478 	int i;
479 	struct evlist *evlist = rec->evlist;
480 	struct mmap *maps = evlist->mmap;
481 
482 	if (!record__aio_enabled(rec))
483 		return;
484 
485 	for (i = 0; i < evlist->core.nr_mmaps; i++) {
486 		struct mmap *map = &maps[i];
487 
488 		if (map->core.base)
489 			record__aio_sync(map, true);
490 	}
491 }
492 
493 static int nr_cblocks_default = 1;
494 static int nr_cblocks_max = 4;
495 
496 static int record__aio_parse(const struct option *opt,
497 			     const char *str,
498 			     int unset)
499 {
500 	struct record_opts *opts = (struct record_opts *)opt->value;
501 
502 	if (unset) {
503 		opts->nr_cblocks = 0;
504 	} else {
505 		if (str)
506 			opts->nr_cblocks = strtol(str, NULL, 0);
507 		if (!opts->nr_cblocks)
508 			opts->nr_cblocks = nr_cblocks_default;
509 	}
510 
511 	return 0;
512 }
513 #else /* HAVE_AIO_SUPPORT */
/* Built without AIO support: no control blocks are available. */
static int nr_cblocks_max = 0;

/* Stub: always fails with -1 because AIO support is compiled out. */
static int record__aio_push(struct record *rec __maybe_unused, struct mmap *map __maybe_unused,
			    off_t *off __maybe_unused)
{
	return -1;
}

/* Stub: there is no position to report, returns -1. */
static off_t record__aio_get_pos(int trace_fd __maybe_unused)
{
	return -1;
}

/* Stub: does nothing. */
static void record__aio_set_pos(int trace_fd __maybe_unused, off_t pos __maybe_unused)
{
}

/* Stub: there are no asynchronous writes to wait for. */
static void record__aio_mmap_read_sync(struct record *rec __maybe_unused)
{
}
534 #endif
535 
536 static int record__aio_enabled(struct record *rec)
537 {
538 	return rec->opts.nr_cblocks > 0;
539 }
540 
541 #define MMAP_FLUSH_DEFAULT 1
542 static int record__mmap_flush_parse(const struct option *opt,
543 				    const char *str,
544 				    int unset)
545 {
546 	int flush_max;
547 	struct record_opts *opts = (struct record_opts *)opt->value;
548 	static struct parse_tag tags[] = {
549 			{ .tag  = 'B', .mult = 1       },
550 			{ .tag  = 'K', .mult = 1 << 10 },
551 			{ .tag  = 'M', .mult = 1 << 20 },
552 			{ .tag  = 'G', .mult = 1 << 30 },
553 			{ .tag  = 0 },
554 	};
555 
556 	if (unset)
557 		return 0;
558 
559 	if (str) {
560 		opts->mmap_flush = parse_tag_value(str, tags);
561 		if (opts->mmap_flush == (int)-1)
562 			opts->mmap_flush = strtol(str, NULL, 0);
563 	}
564 
565 	if (!opts->mmap_flush)
566 		opts->mmap_flush = MMAP_FLUSH_DEFAULT;
567 
568 	flush_max = evlist__mmap_size(opts->mmap_pages);
569 	flush_max /= 4;
570 	if (opts->mmap_flush > flush_max)
571 		opts->mmap_flush = flush_max;
572 
573 	return 0;
574 }
575 
576 #ifdef HAVE_ZSTD_SUPPORT
577 static unsigned int comp_level_default = 1;
578 
579 static int record__parse_comp_level(const struct option *opt, const char *str, int unset)
580 {
581 	struct record_opts *opts = opt->value;
582 
583 	if (unset) {
584 		opts->comp_level = 0;
585 	} else {
586 		if (str)
587 			opts->comp_level = strtol(str, NULL, 0);
588 		if (!opts->comp_level)
589 			opts->comp_level = comp_level_default;
590 	}
591 
592 	return 0;
593 }
594 #endif
/* Upper bound for the compression level; defined regardless of HAVE_ZSTD_SUPPORT. */
static unsigned int comp_level_max = 22;
596 
597 static int record__comp_enabled(struct record *rec)
598 {
599 	return rec->opts.comp_level > 0;
600 }
601 
602 static int process_synthesized_event(struct perf_tool *tool,
603 				     union perf_event *event,
604 				     struct perf_sample *sample __maybe_unused,
605 				     struct machine *machine __maybe_unused)
606 {
607 	struct record *rec = container_of(tool, struct record, tool);
608 	return record__write(rec, NULL, event, event->header.size);
609 }
610 
611 static int process_locked_synthesized_event(struct perf_tool *tool,
612 				     union perf_event *event,
613 				     struct perf_sample *sample __maybe_unused,
614 				     struct machine *machine __maybe_unused)
615 {
616 	static pthread_mutex_t synth_lock = PTHREAD_MUTEX_INITIALIZER;
617 	int ret;
618 
619 	pthread_mutex_lock(&synth_lock);
620 	ret = process_synthesized_event(tool, event, sample, machine);
621 	pthread_mutex_unlock(&synth_lock);
622 	return ret;
623 }
624 
625 static int record__pushfn(struct mmap *map, void *to, void *bf, size_t size)
626 {
627 	struct record *rec = to;
628 
629 	if (record__comp_enabled(rec)) {
630 		size = zstd_compress(rec->session, map, map->data, mmap__mmap_len(map), bf, size);
631 		bf   = map->data;
632 	}
633 
634 	thread->samples++;
635 	return record__write(rec, map, bf, size);
636 }
637 
/* Signal recorded by sig_handler() for anything but SIGCHLD; -1 means none. Re-raised by record__sig_exit(). */
static volatile int signr = -1;
/* Set to 1 by sig_handler() on SIGCHLD. */
static volatile int child_finished;
#ifdef HAVE_EVENTFD_SUPPORT
/* Descriptor sig_handler() writes to in order to wake up poll(); starts out as -1. */
static int done_fd = -1;
#endif
643 
644 static void sig_handler(int sig)
645 {
646 	if (sig == SIGCHLD)
647 		child_finished = 1;
648 	else
649 		signr = sig;
650 
651 	done = 1;
652 #ifdef HAVE_EVENTFD_SUPPORT
653 {
654 	u64 tmp = 1;
655 	/*
656 	 * It is possible for this signal handler to run after done is checked
657 	 * in the main loop, but before the perf counter fds are polled. If this
658 	 * happens, the poll() will continue to wait even though done is set,
659 	 * and will only break out if either another signal is received, or the
660 	 * counters are ready for read. To ensure the poll() doesn't sleep when
661 	 * done is set, use an eventfd (done_fd) to wake up the poll().
662 	 */
663 	if (write(done_fd, &tmp, sizeof(tmp)) < 0)
664 		pr_err("failed to signal wakeup fd, error: %m\n");
665 }
666 #endif // HAVE_EVENTFD_SUPPORT
667 }
668 
/* SIGSEGV handler: let perf hooks recover first, then dump the stack for @sig. */
static void sigsegv_handler(int sig)
{
	perf_hooks__recover();
	sighandler_dump_stack(sig);
}
674 
675 static void record__sig_exit(void)
676 {
677 	if (signr == -1)
678 		return;
679 
680 	signal(signr, SIG_DFL);
681 	raise(signr);
682 }
683 
684 #ifdef HAVE_AUXTRACE_SUPPORT
685 
686 static int record__process_auxtrace(struct perf_tool *tool,
687 				    struct mmap *map,
688 				    union perf_event *event, void *data1,
689 				    size_t len1, void *data2, size_t len2)
690 {
691 	struct record *rec = container_of(tool, struct record, tool);
692 	struct perf_data *data = &rec->data;
693 	size_t padding;
694 	u8 pad[8] = {0};
695 
696 	if (!perf_data__is_pipe(data) && perf_data__is_single_file(data)) {
697 		off_t file_offset;
698 		int fd = perf_data__fd(data);
699 		int err;
700 
701 		file_offset = lseek(fd, 0, SEEK_CUR);
702 		if (file_offset == -1)
703 			return -1;
704 		err = auxtrace_index__auxtrace_event(&rec->session->auxtrace_index,
705 						     event, file_offset);
706 		if (err)
707 			return err;
708 	}
709 
710 	/* event.auxtrace.size includes padding, see __auxtrace_mmap__read() */
711 	padding = (len1 + len2) & 7;
712 	if (padding)
713 		padding = 8 - padding;
714 
715 	record__write(rec, map, event, event->header.size);
716 	record__write(rec, map, data1, len1);
717 	if (len2)
718 		record__write(rec, map, data2, len2);
719 	record__write(rec, map, &pad, padding);
720 
721 	return 0;
722 }
723 
724 static int record__auxtrace_mmap_read(struct record *rec,
725 				      struct mmap *map)
726 {
727 	int ret;
728 
729 	ret = auxtrace_mmap__read(map, rec->itr, &rec->tool,
730 				  record__process_auxtrace);
731 	if (ret < 0)
732 		return ret;
733 
734 	if (ret)
735 		rec->samples++;
736 
737 	return 0;
738 }
739 
740 static int record__auxtrace_mmap_read_snapshot(struct record *rec,
741 					       struct mmap *map)
742 {
743 	int ret;
744 
745 	ret = auxtrace_mmap__read_snapshot(map, rec->itr, &rec->tool,
746 					   record__process_auxtrace,
747 					   rec->opts.auxtrace_snapshot_size);
748 	if (ret < 0)
749 		return ret;
750 
751 	if (ret)
752 		rec->samples++;
753 
754 	return 0;
755 }
756 
757 static int record__auxtrace_read_snapshot_all(struct record *rec)
758 {
759 	int i;
760 	int rc = 0;
761 
762 	for (i = 0; i < rec->evlist->core.nr_mmaps; i++) {
763 		struct mmap *map = &rec->evlist->mmap[i];
764 
765 		if (!map->auxtrace_mmap.base)
766 			continue;
767 
768 		if (record__auxtrace_mmap_read_snapshot(rec, map) != 0) {
769 			rc = -1;
770 			goto out;
771 		}
772 	}
773 out:
774 	return rc;
775 }
776 
777 static void record__read_auxtrace_snapshot(struct record *rec, bool on_exit)
778 {
779 	pr_debug("Recording AUX area tracing snapshot\n");
780 	if (record__auxtrace_read_snapshot_all(rec) < 0) {
781 		trigger_error(&auxtrace_snapshot_trigger);
782 	} else {
783 		if (auxtrace_record__snapshot_finish(rec->itr, on_exit))
784 			trigger_error(&auxtrace_snapshot_trigger);
785 		else
786 			trigger_ready(&auxtrace_snapshot_trigger);
787 	}
788 }
789 
790 static int record__auxtrace_snapshot_exit(struct record *rec)
791 {
792 	if (trigger_is_error(&auxtrace_snapshot_trigger))
793 		return 0;
794 
795 	if (!auxtrace_record__snapshot_started &&
796 	    auxtrace_record__snapshot_start(rec->itr))
797 		return -1;
798 
799 	record__read_auxtrace_snapshot(rec, true);
800 	if (trigger_is_error(&auxtrace_snapshot_trigger))
801 		return -1;
802 
803 	return 0;
804 }
805 
806 static int record__auxtrace_init(struct record *rec)
807 {
808 	int err;
809 
810 	if ((rec->opts.auxtrace_snapshot_opts || rec->opts.auxtrace_sample_opts)
811 	    && record__threads_enabled(rec)) {
812 		pr_err("AUX area tracing options are not available in parallel streaming mode.\n");
813 		return -EINVAL;
814 	}
815 
816 	if (!rec->itr) {
817 		rec->itr = auxtrace_record__init(rec->evlist, &err);
818 		if (err)
819 			return err;
820 	}
821 
822 	err = auxtrace_parse_snapshot_options(rec->itr, &rec->opts,
823 					      rec->opts.auxtrace_snapshot_opts);
824 	if (err)
825 		return err;
826 
827 	err = auxtrace_parse_sample_options(rec->itr, rec->evlist, &rec->opts,
828 					    rec->opts.auxtrace_sample_opts);
829 	if (err)
830 		return err;
831 
832 	auxtrace_regroup_aux_output(rec->evlist);
833 
834 	return auxtrace_parse_filters(rec->evlist);
835 }
836 
837 #else
838 
/* Stub used without HAVE_AUXTRACE_SUPPORT: nothing to read, reports success. */
static inline
int record__auxtrace_mmap_read(struct record *rec __maybe_unused,
			       struct mmap *map __maybe_unused)
{
	return 0;
}

/* Stub used without HAVE_AUXTRACE_SUPPORT: does nothing. */
static inline
void record__read_auxtrace_snapshot(struct record *rec __maybe_unused,
				    bool on_exit __maybe_unused)
{
}

/* Stub used without HAVE_AUXTRACE_SUPPORT: reports success. */
static inline
int auxtrace_record__snapshot_start(struct auxtrace_record *itr __maybe_unused)
{
	return 0;
}

/* Stub used without HAVE_AUXTRACE_SUPPORT: reports success. */
static inline
int record__auxtrace_snapshot_exit(struct record *rec __maybe_unused)
{
	return 0;
}

/* Stub used without HAVE_AUXTRACE_SUPPORT: reports success. */
static int record__auxtrace_init(struct record *rec __maybe_unused)
{
	return 0;
}
868 
869 #endif
870 
871 static int record__config_text_poke(struct evlist *evlist)
872 {
873 	struct evsel *evsel;
874 
875 	/* Nothing to do if text poke is already configured */
876 	evlist__for_each_entry(evlist, evsel) {
877 		if (evsel->core.attr.text_poke)
878 			return 0;
879 	}
880 
881 	evsel = evlist__add_dummy_on_all_cpus(evlist);
882 	if (!evsel)
883 		return -ENOMEM;
884 
885 	evsel->core.attr.text_poke = 1;
886 	evsel->core.attr.ksymbol = 1;
887 	evsel->immediate = true;
888 	evsel__set_sample_bit(evsel, TIME);
889 
890 	return 0;
891 }
892 
893 static int record__config_off_cpu(struct record *rec)
894 {
895 	return off_cpu_prepare(rec->evlist, &rec->opts.target, &rec->opts);
896 }
897 
898 static bool record__kcore_readable(struct machine *machine)
899 {
900 	char kcore[PATH_MAX];
901 	int fd;
902 
903 	scnprintf(kcore, sizeof(kcore), "%s/proc/kcore", machine->root_dir);
904 
905 	fd = open(kcore, O_RDONLY);
906 	if (fd < 0)
907 		return false;
908 
909 	close(fd);
910 
911 	return true;
912 }
913 
914 static int record__kcore_copy(struct machine *machine, struct perf_data *data)
915 {
916 	char from_dir[PATH_MAX];
917 	char kcore_dir[PATH_MAX];
918 	int ret;
919 
920 	snprintf(from_dir, sizeof(from_dir), "%s/proc", machine->root_dir);
921 
922 	ret = perf_data__make_kcore_dir(data, kcore_dir, sizeof(kcore_dir));
923 	if (ret)
924 		return ret;
925 
926 	return kcore_copy(from_dir, kcore_dir);
927 }
928 
929 static void record__thread_data_init_pipes(struct record_thread *thread_data)
930 {
931 	thread_data->pipes.msg[0] = -1;
932 	thread_data->pipes.msg[1] = -1;
933 	thread_data->pipes.ack[0] = -1;
934 	thread_data->pipes.ack[1] = -1;
935 }
936 
937 static int record__thread_data_open_pipes(struct record_thread *thread_data)
938 {
939 	if (pipe(thread_data->pipes.msg))
940 		return -EINVAL;
941 
942 	if (pipe(thread_data->pipes.ack)) {
943 		close(thread_data->pipes.msg[0]);
944 		thread_data->pipes.msg[0] = -1;
945 		close(thread_data->pipes.msg[1]);
946 		thread_data->pipes.msg[1] = -1;
947 		return -EINVAL;
948 	}
949 
950 	pr_debug2("thread_data[%p]: msg=[%d,%d], ack=[%d,%d]\n", thread_data,
951 		 thread_data->pipes.msg[0], thread_data->pipes.msg[1],
952 		 thread_data->pipes.ack[0], thread_data->pipes.ack[1]);
953 
954 	return 0;
955 }
956 
957 static void record__thread_data_close_pipes(struct record_thread *thread_data)
958 {
959 	if (thread_data->pipes.msg[0] != -1) {
960 		close(thread_data->pipes.msg[0]);
961 		thread_data->pipes.msg[0] = -1;
962 	}
963 	if (thread_data->pipes.msg[1] != -1) {
964 		close(thread_data->pipes.msg[1]);
965 		thread_data->pipes.msg[1] = -1;
966 	}
967 	if (thread_data->pipes.ack[0] != -1) {
968 		close(thread_data->pipes.ack[0]);
969 		thread_data->pipes.ack[0] = -1;
970 	}
971 	if (thread_data->pipes.ack[1] != -1) {
972 		close(thread_data->pipes.ack[1]);
973 		thread_data->pipes.ack[1] = -1;
974 	}
975 }
976 
977 static bool evlist__per_thread(struct evlist *evlist)
978 {
979 	return cpu_map__is_dummy(evlist->core.user_requested_cpus);
980 }
981 
982 static int record__thread_data_init_maps(struct record_thread *thread_data, struct evlist *evlist)
983 {
984 	int m, tm, nr_mmaps = evlist->core.nr_mmaps;
985 	struct mmap *mmap = evlist->mmap;
986 	struct mmap *overwrite_mmap = evlist->overwrite_mmap;
987 	struct perf_cpu_map *cpus = evlist->core.all_cpus;
988 	bool per_thread = evlist__per_thread(evlist);
989 
990 	if (per_thread)
991 		thread_data->nr_mmaps = nr_mmaps;
992 	else
993 		thread_data->nr_mmaps = bitmap_weight(thread_data->mask->maps.bits,
994 						      thread_data->mask->maps.nbits);
995 	if (mmap) {
996 		thread_data->maps = zalloc(thread_data->nr_mmaps * sizeof(struct mmap *));
997 		if (!thread_data->maps)
998 			return -ENOMEM;
999 	}
1000 	if (overwrite_mmap) {
1001 		thread_data->overwrite_maps = zalloc(thread_data->nr_mmaps * sizeof(struct mmap *));
1002 		if (!thread_data->overwrite_maps) {
1003 			zfree(&thread_data->maps);
1004 			return -ENOMEM;
1005 		}
1006 	}
1007 	pr_debug2("thread_data[%p]: nr_mmaps=%d, maps=%p, ow_maps=%p\n", thread_data,
1008 		 thread_data->nr_mmaps, thread_data->maps, thread_data->overwrite_maps);
1009 
1010 	for (m = 0, tm = 0; m < nr_mmaps && tm < thread_data->nr_mmaps; m++) {
1011 		if (per_thread ||
1012 		    test_bit(perf_cpu_map__cpu(cpus, m).cpu, thread_data->mask->maps.bits)) {
1013 			if (thread_data->maps) {
1014 				thread_data->maps[tm] = &mmap[m];
1015 				pr_debug2("thread_data[%p]: cpu%d: maps[%d] -> mmap[%d]\n",
1016 					  thread_data, perf_cpu_map__cpu(cpus, m).cpu, tm, m);
1017 			}
1018 			if (thread_data->overwrite_maps) {
1019 				thread_data->overwrite_maps[tm] = &overwrite_mmap[m];
1020 				pr_debug2("thread_data[%p]: cpu%d: ow_maps[%d] -> ow_mmap[%d]\n",
1021 					  thread_data, perf_cpu_map__cpu(cpus, m).cpu, tm, m);
1022 			}
1023 			tm++;
1024 		}
1025 	}
1026 
1027 	return 0;
1028 }
1029 
1030 static int record__thread_data_init_pollfd(struct record_thread *thread_data, struct evlist *evlist)
1031 {
1032 	int f, tm, pos;
1033 	struct mmap *map, *overwrite_map;
1034 
1035 	fdarray__init(&thread_data->pollfd, 64);
1036 
1037 	for (tm = 0; tm < thread_data->nr_mmaps; tm++) {
1038 		map = thread_data->maps ? thread_data->maps[tm] : NULL;
1039 		overwrite_map = thread_data->overwrite_maps ?
1040 				thread_data->overwrite_maps[tm] : NULL;
1041 
1042 		for (f = 0; f < evlist->core.pollfd.nr; f++) {
1043 			void *ptr = evlist->core.pollfd.priv[f].ptr;
1044 
1045 			if ((map && ptr == map) || (overwrite_map && ptr == overwrite_map)) {
1046 				pos = fdarray__dup_entry_from(&thread_data->pollfd, f,
1047 							      &evlist->core.pollfd);
1048 				if (pos < 0)
1049 					return pos;
1050 				pr_debug2("thread_data[%p]: pollfd[%d] <- event_fd=%d\n",
1051 					 thread_data, pos, evlist->core.pollfd.entries[f].fd);
1052 			}
1053 		}
1054 	}
1055 
1056 	return 0;
1057 }
1058 
1059 static void record__free_thread_data(struct record *rec)
1060 {
1061 	int t;
1062 	struct record_thread *thread_data = rec->thread_data;
1063 
1064 	if (thread_data == NULL)
1065 		return;
1066 
1067 	for (t = 0; t < rec->nr_threads; t++) {
1068 		record__thread_data_close_pipes(&thread_data[t]);
1069 		zfree(&thread_data[t].maps);
1070 		zfree(&thread_data[t].overwrite_maps);
1071 		fdarray__exit(&thread_data[t].pollfd);
1072 	}
1073 
1074 	zfree(&rec->thread_data);
1075 }
1076 
1077 static int record__alloc_thread_data(struct record *rec, struct evlist *evlist)
1078 {
1079 	int t, ret;
1080 	struct record_thread *thread_data;
1081 
1082 	rec->thread_data = zalloc(rec->nr_threads * sizeof(*(rec->thread_data)));
1083 	if (!rec->thread_data) {
1084 		pr_err("Failed to allocate thread data\n");
1085 		return -ENOMEM;
1086 	}
1087 	thread_data = rec->thread_data;
1088 
1089 	for (t = 0; t < rec->nr_threads; t++)
1090 		record__thread_data_init_pipes(&thread_data[t]);
1091 
1092 	for (t = 0; t < rec->nr_threads; t++) {
1093 		thread_data[t].rec = rec;
1094 		thread_data[t].mask = &rec->thread_masks[t];
1095 		ret = record__thread_data_init_maps(&thread_data[t], evlist);
1096 		if (ret) {
1097 			pr_err("Failed to initialize thread[%d] maps\n", t);
1098 			goto out_free;
1099 		}
1100 		ret = record__thread_data_init_pollfd(&thread_data[t], evlist);
1101 		if (ret) {
1102 			pr_err("Failed to initialize thread[%d] pollfd\n", t);
1103 			goto out_free;
1104 		}
1105 		if (t) {
1106 			thread_data[t].tid = -1;
1107 			ret = record__thread_data_open_pipes(&thread_data[t]);
1108 			if (ret) {
1109 				pr_err("Failed to open thread[%d] communication pipes\n", t);
1110 				goto out_free;
1111 			}
1112 			ret = fdarray__add(&thread_data[t].pollfd, thread_data[t].pipes.msg[0],
1113 					   POLLIN | POLLERR | POLLHUP, fdarray_flag__nonfilterable);
1114 			if (ret < 0) {
1115 				pr_err("Failed to add descriptor to thread[%d] pollfd\n", t);
1116 				goto out_free;
1117 			}
1118 			thread_data[t].ctlfd_pos = ret;
1119 			pr_debug2("thread_data[%p]: pollfd[%d] <- ctl_fd=%d\n",
1120 				 thread_data, thread_data[t].ctlfd_pos,
1121 				 thread_data[t].pipes.msg[0]);
1122 		} else {
1123 			thread_data[t].tid = gettid();
1124 			if (evlist->ctl_fd.pos == -1)
1125 				continue;
1126 			ret = fdarray__dup_entry_from(&thread_data[t].pollfd, evlist->ctl_fd.pos,
1127 						      &evlist->core.pollfd);
1128 			if (ret < 0) {
1129 				pr_err("Failed to duplicate descriptor in main thread pollfd\n");
1130 				goto out_free;
1131 			}
1132 			thread_data[t].ctlfd_pos = ret;
1133 			pr_debug2("thread_data[%p]: pollfd[%d] <- ctl_fd=%d\n",
1134 				 thread_data, thread_data[t].ctlfd_pos,
1135 				 evlist->core.pollfd.entries[evlist->ctl_fd.pos].fd);
1136 		}
1137 	}
1138 
1139 	return 0;
1140 
1141 out_free:
1142 	record__free_thread_data(rec);
1143 
1144 	return ret;
1145 }
1146 
1147 static int record__mmap_evlist(struct record *rec,
1148 			       struct evlist *evlist)
1149 {
1150 	int i, ret;
1151 	struct record_opts *opts = &rec->opts;
1152 	bool auxtrace_overwrite = opts->auxtrace_snapshot_mode ||
1153 				  opts->auxtrace_sample_mode;
1154 	char msg[512];
1155 
1156 	if (opts->affinity != PERF_AFFINITY_SYS)
1157 		cpu__setup_cpunode_map();
1158 
1159 	if (evlist__mmap_ex(evlist, opts->mmap_pages,
1160 				 opts->auxtrace_mmap_pages,
1161 				 auxtrace_overwrite,
1162 				 opts->nr_cblocks, opts->affinity,
1163 				 opts->mmap_flush, opts->comp_level) < 0) {
1164 		if (errno == EPERM) {
1165 			pr_err("Permission error mapping pages.\n"
1166 			       "Consider increasing "
1167 			       "/proc/sys/kernel/perf_event_mlock_kb,\n"
1168 			       "or try again with a smaller value of -m/--mmap_pages.\n"
1169 			       "(current value: %u,%u)\n",
1170 			       opts->mmap_pages, opts->auxtrace_mmap_pages);
1171 			return -errno;
1172 		} else {
1173 			pr_err("failed to mmap with %d (%s)\n", errno,
1174 				str_error_r(errno, msg, sizeof(msg)));
1175 			if (errno)
1176 				return -errno;
1177 			else
1178 				return -EINVAL;
1179 		}
1180 	}
1181 
1182 	if (evlist__initialize_ctlfd(evlist, opts->ctl_fd, opts->ctl_fd_ack))
1183 		return -1;
1184 
1185 	ret = record__alloc_thread_data(rec, evlist);
1186 	if (ret)
1187 		return ret;
1188 
1189 	if (record__threads_enabled(rec)) {
1190 		ret = perf_data__create_dir(&rec->data, evlist->core.nr_mmaps);
1191 		if (ret) {
1192 			pr_err("Failed to create data directory: %s\n", strerror(-ret));
1193 			return ret;
1194 		}
1195 		for (i = 0; i < evlist->core.nr_mmaps; i++) {
1196 			if (evlist->mmap)
1197 				evlist->mmap[i].file = &rec->data.dir.files[i];
1198 			if (evlist->overwrite_mmap)
1199 				evlist->overwrite_mmap[i].file = &rec->data.dir.files[i];
1200 		}
1201 	}
1202 
1203 	return 0;
1204 }
1205 
1206 static int record__mmap(struct record *rec)
1207 {
1208 	return record__mmap_evlist(rec, rec->evlist);
1209 }
1210 
1211 static int record__open(struct record *rec)
1212 {
1213 	char msg[BUFSIZ];
1214 	struct evsel *pos;
1215 	struct evlist *evlist = rec->evlist;
1216 	struct perf_session *session = rec->session;
1217 	struct record_opts *opts = &rec->opts;
1218 	int rc = 0;
1219 
1220 	/*
1221 	 * For initial_delay, system wide or a hybrid system, we need to add a
1222 	 * dummy event so that we can track PERF_RECORD_MMAP to cover the delay
1223 	 * of waiting or event synthesis.
1224 	 */
1225 	if (opts->initial_delay || target__has_cpu(&opts->target) ||
1226 	    perf_pmu__has_hybrid()) {
1227 		pos = evlist__get_tracking_event(evlist);
1228 		if (!evsel__is_dummy_event(pos)) {
1229 			/* Set up dummy event. */
1230 			if (evlist__add_dummy(evlist))
1231 				return -ENOMEM;
1232 			pos = evlist__last(evlist);
1233 			evlist__set_tracking_event(evlist, pos);
1234 		}
1235 
1236 		/*
1237 		 * Enable the dummy event when the process is forked for
1238 		 * initial_delay, immediately for system wide.
1239 		 */
1240 		if (opts->initial_delay && !pos->immediate &&
1241 		    !target__has_cpu(&opts->target))
1242 			pos->core.attr.enable_on_exec = 1;
1243 		else
1244 			pos->immediate = 1;
1245 	}
1246 
1247 	evlist__config(evlist, opts, &callchain_param);
1248 
1249 	evlist__for_each_entry(evlist, pos) {
1250 try_again:
1251 		if (evsel__open(pos, pos->core.cpus, pos->core.threads) < 0) {
1252 			if (evsel__fallback(pos, errno, msg, sizeof(msg))) {
1253 				if (verbose > 0)
1254 					ui__warning("%s\n", msg);
1255 				goto try_again;
1256 			}
1257 			if ((errno == EINVAL || errno == EBADF) &&
1258 			    pos->core.leader != &pos->core &&
1259 			    pos->weak_group) {
1260 			        pos = evlist__reset_weak_group(evlist, pos, true);
1261 				goto try_again;
1262 			}
1263 			rc = -errno;
1264 			evsel__open_strerror(pos, &opts->target, errno, msg, sizeof(msg));
1265 			ui__error("%s\n", msg);
1266 			goto out;
1267 		}
1268 
1269 		pos->supported = true;
1270 	}
1271 
1272 	if (symbol_conf.kptr_restrict && !evlist__exclude_kernel(evlist)) {
1273 		pr_warning(
1274 "WARNING: Kernel address maps (/proc/{kallsyms,modules}) are restricted,\n"
1275 "check /proc/sys/kernel/kptr_restrict and /proc/sys/kernel/perf_event_paranoid.\n\n"
1276 "Samples in kernel functions may not be resolved if a suitable vmlinux\n"
1277 "file is not found in the buildid cache or in the vmlinux path.\n\n"
1278 "Samples in kernel modules won't be resolved at all.\n\n"
1279 "If some relocation was applied (e.g. kexec) symbols may be misresolved\n"
1280 "even with a suitable vmlinux or kallsyms file.\n\n");
1281 	}
1282 
1283 	if (evlist__apply_filters(evlist, &pos)) {
1284 		pr_err("failed to set filter \"%s\" on event %s with %d (%s)\n",
1285 			pos->filter, evsel__name(pos), errno,
1286 			str_error_r(errno, msg, sizeof(msg)));
1287 		rc = -1;
1288 		goto out;
1289 	}
1290 
1291 	rc = record__mmap(rec);
1292 	if (rc)
1293 		goto out;
1294 
1295 	session->evlist = evlist;
1296 	perf_session__set_id_hdr_size(session);
1297 out:
1298 	return rc;
1299 }
1300 
1301 static void set_timestamp_boundary(struct record *rec, u64 sample_time)
1302 {
1303 	if (rec->evlist->first_sample_time == 0)
1304 		rec->evlist->first_sample_time = sample_time;
1305 
1306 	if (sample_time)
1307 		rec->evlist->last_sample_time = sample_time;
1308 }
1309 
1310 static int process_sample_event(struct perf_tool *tool,
1311 				union perf_event *event,
1312 				struct perf_sample *sample,
1313 				struct evsel *evsel,
1314 				struct machine *machine)
1315 {
1316 	struct record *rec = container_of(tool, struct record, tool);
1317 
1318 	set_timestamp_boundary(rec, sample->time);
1319 
1320 	if (rec->buildid_all)
1321 		return 0;
1322 
1323 	rec->samples++;
1324 	return build_id__mark_dso_hit(tool, event, sample, evsel, machine);
1325 }
1326 
1327 static int process_buildids(struct record *rec)
1328 {
1329 	struct perf_session *session = rec->session;
1330 
1331 	if (perf_data__size(&rec->data) == 0)
1332 		return 0;
1333 
1334 	/*
1335 	 * During this process, it'll load kernel map and replace the
1336 	 * dso->long_name to a real pathname it found.  In this case
1337 	 * we prefer the vmlinux path like
1338 	 *   /lib/modules/3.16.4/build/vmlinux
1339 	 *
1340 	 * rather than build-id path (in debug directory).
1341 	 *   $HOME/.debug/.build-id/f0/6e17aa50adf4d00b88925e03775de107611551
1342 	 */
1343 	symbol_conf.ignore_vmlinux_buildid = true;
1344 
1345 	/*
1346 	 * If --buildid-all is given, it marks all DSO regardless of hits,
1347 	 * so no need to process samples. But if timestamp_boundary is enabled,
1348 	 * it still needs to walk on all samples to get the timestamps of
1349 	 * first/last samples.
1350 	 */
1351 	if (rec->buildid_all && !rec->timestamp_boundary)
1352 		rec->tool.sample = NULL;
1353 
1354 	return perf_session__process_events(session);
1355 }
1356 
1357 static void perf_event__synthesize_guest_os(struct machine *machine, void *data)
1358 {
1359 	int err;
1360 	struct perf_tool *tool = data;
1361 	/*
1362 	 *As for guest kernel when processing subcommand record&report,
1363 	 *we arrange module mmap prior to guest kernel mmap and trigger
1364 	 *a preload dso because default guest module symbols are loaded
1365 	 *from guest kallsyms instead of /lib/modules/XXX/XXX. This
1366 	 *method is used to avoid symbol missing when the first addr is
1367 	 *in module instead of in guest kernel.
1368 	 */
1369 	err = perf_event__synthesize_modules(tool, process_synthesized_event,
1370 					     machine);
1371 	if (err < 0)
1372 		pr_err("Couldn't record guest kernel [%d]'s reference"
1373 		       " relocation symbol.\n", machine->pid);
1374 
1375 	/*
1376 	 * We use _stext for guest kernel because guest kernel's /proc/kallsyms
1377 	 * have no _text sometimes.
1378 	 */
1379 	err = perf_event__synthesize_kernel_mmap(tool, process_synthesized_event,
1380 						 machine);
1381 	if (err < 0)
1382 		pr_err("Couldn't record guest kernel [%d]'s reference"
1383 		       " relocation symbol.\n", machine->pid);
1384 }
1385 
1386 static struct perf_event_header finished_round_event = {
1387 	.size = sizeof(struct perf_event_header),
1388 	.type = PERF_RECORD_FINISHED_ROUND,
1389 };
1390 
1391 static struct perf_event_header finished_init_event = {
1392 	.size = sizeof(struct perf_event_header),
1393 	.type = PERF_RECORD_FINISHED_INIT,
1394 };
1395 
1396 static void record__adjust_affinity(struct record *rec, struct mmap *map)
1397 {
1398 	if (rec->opts.affinity != PERF_AFFINITY_SYS &&
1399 	    !bitmap_equal(thread->mask->affinity.bits, map->affinity_mask.bits,
1400 			  thread->mask->affinity.nbits)) {
1401 		bitmap_zero(thread->mask->affinity.bits, thread->mask->affinity.nbits);
1402 		bitmap_or(thread->mask->affinity.bits, thread->mask->affinity.bits,
1403 			  map->affinity_mask.bits, thread->mask->affinity.nbits);
1404 		sched_setaffinity(0, MMAP_CPU_MASK_BYTES(&thread->mask->affinity),
1405 					(cpu_set_t *)thread->mask->affinity.bits);
1406 		if (verbose == 2) {
1407 			pr_debug("threads[%d]: running on cpu%d: ", thread->tid, sched_getcpu());
1408 			mmap_cpu_mask__scnprintf(&thread->mask->affinity, "affinity");
1409 		}
1410 	}
1411 }
1412 
1413 static size_t process_comp_header(void *record, size_t increment)
1414 {
1415 	struct perf_record_compressed *event = record;
1416 	size_t size = sizeof(*event);
1417 
1418 	if (increment) {
1419 		event->header.size += increment;
1420 		return increment;
1421 	}
1422 
1423 	event->header.type = PERF_RECORD_COMPRESSED;
1424 	event->header.size = size;
1425 
1426 	return size;
1427 }
1428 
1429 static size_t zstd_compress(struct perf_session *session, struct mmap *map,
1430 			    void *dst, size_t dst_size, void *src, size_t src_size)
1431 {
1432 	size_t compressed;
1433 	size_t max_record_size = PERF_SAMPLE_MAX_SIZE - sizeof(struct perf_record_compressed) - 1;
1434 	struct zstd_data *zstd_data = &session->zstd_data;
1435 
1436 	if (map && map->file)
1437 		zstd_data = &map->zstd_data;
1438 
1439 	compressed = zstd_compress_stream_to_records(zstd_data, dst, dst_size, src, src_size,
1440 						     max_record_size, process_comp_header);
1441 
1442 	if (map && map->file) {
1443 		thread->bytes_transferred += src_size;
1444 		thread->bytes_compressed  += compressed;
1445 	} else {
1446 		session->bytes_transferred += src_size;
1447 		session->bytes_compressed  += compressed;
1448 	}
1449 
1450 	return compressed;
1451 }
1452 
1453 static int record__mmap_read_evlist(struct record *rec, struct evlist *evlist,
1454 				    bool overwrite, bool synch)
1455 {
1456 	u64 bytes_written = rec->bytes_written;
1457 	int i;
1458 	int rc = 0;
1459 	int nr_mmaps;
1460 	struct mmap **maps;
1461 	int trace_fd = rec->data.file.fd;
1462 	off_t off = 0;
1463 
1464 	if (!evlist)
1465 		return 0;
1466 
1467 	nr_mmaps = thread->nr_mmaps;
1468 	maps = overwrite ? thread->overwrite_maps : thread->maps;
1469 
1470 	if (!maps)
1471 		return 0;
1472 
1473 	if (overwrite && evlist->bkw_mmap_state != BKW_MMAP_DATA_PENDING)
1474 		return 0;
1475 
1476 	if (record__aio_enabled(rec))
1477 		off = record__aio_get_pos(trace_fd);
1478 
1479 	for (i = 0; i < nr_mmaps; i++) {
1480 		u64 flush = 0;
1481 		struct mmap *map = maps[i];
1482 
1483 		if (map->core.base) {
1484 			record__adjust_affinity(rec, map);
1485 			if (synch) {
1486 				flush = map->core.flush;
1487 				map->core.flush = 1;
1488 			}
1489 			if (!record__aio_enabled(rec)) {
1490 				if (perf_mmap__push(map, rec, record__pushfn) < 0) {
1491 					if (synch)
1492 						map->core.flush = flush;
1493 					rc = -1;
1494 					goto out;
1495 				}
1496 			} else {
1497 				if (record__aio_push(rec, map, &off) < 0) {
1498 					record__aio_set_pos(trace_fd, off);
1499 					if (synch)
1500 						map->core.flush = flush;
1501 					rc = -1;
1502 					goto out;
1503 				}
1504 			}
1505 			if (synch)
1506 				map->core.flush = flush;
1507 		}
1508 
1509 		if (map->auxtrace_mmap.base && !rec->opts.auxtrace_snapshot_mode &&
1510 		    !rec->opts.auxtrace_sample_mode &&
1511 		    record__auxtrace_mmap_read(rec, map) != 0) {
1512 			rc = -1;
1513 			goto out;
1514 		}
1515 	}
1516 
1517 	if (record__aio_enabled(rec))
1518 		record__aio_set_pos(trace_fd, off);
1519 
1520 	/*
1521 	 * Mark the round finished in case we wrote
1522 	 * at least one event.
1523 	 *
1524 	 * No need for round events in directory mode,
1525 	 * because per-cpu maps and files have data
1526 	 * sorted by kernel.
1527 	 */
1528 	if (!record__threads_enabled(rec) && bytes_written != rec->bytes_written)
1529 		rc = record__write(rec, NULL, &finished_round_event, sizeof(finished_round_event));
1530 
1531 	if (overwrite)
1532 		evlist__toggle_bkw_mmap(evlist, BKW_MMAP_EMPTY);
1533 out:
1534 	return rc;
1535 }
1536 
1537 static int record__mmap_read_all(struct record *rec, bool synch)
1538 {
1539 	int err;
1540 
1541 	err = record__mmap_read_evlist(rec, rec->evlist, false, synch);
1542 	if (err)
1543 		return err;
1544 
1545 	return record__mmap_read_evlist(rec, rec->evlist, true, synch);
1546 }
1547 
1548 static void record__thread_munmap_filtered(struct fdarray *fda, int fd,
1549 					   void *arg __maybe_unused)
1550 {
1551 	struct perf_mmap *map = fda->priv[fd].ptr;
1552 
1553 	if (map)
1554 		perf_mmap__put(map);
1555 }
1556 
1557 static void *record__thread(void *arg)
1558 {
1559 	enum thread_msg msg = THREAD_MSG__READY;
1560 	bool terminate = false;
1561 	struct fdarray *pollfd;
1562 	int err, ctlfd_pos;
1563 
1564 	thread = arg;
1565 	thread->tid = gettid();
1566 
1567 	err = write(thread->pipes.ack[1], &msg, sizeof(msg));
1568 	if (err == -1)
1569 		pr_warning("threads[%d]: failed to notify on start: %s\n",
1570 			   thread->tid, strerror(errno));
1571 
1572 	pr_debug("threads[%d]: started on cpu%d\n", thread->tid, sched_getcpu());
1573 
1574 	pollfd = &thread->pollfd;
1575 	ctlfd_pos = thread->ctlfd_pos;
1576 
1577 	for (;;) {
1578 		unsigned long long hits = thread->samples;
1579 
1580 		if (record__mmap_read_all(thread->rec, false) < 0 || terminate)
1581 			break;
1582 
1583 		if (hits == thread->samples) {
1584 
1585 			err = fdarray__poll(pollfd, -1);
1586 			/*
1587 			 * Propagate error, only if there's any. Ignore positive
1588 			 * number of returned events and interrupt error.
1589 			 */
1590 			if (err > 0 || (err < 0 && errno == EINTR))
1591 				err = 0;
1592 			thread->waking++;
1593 
1594 			if (fdarray__filter(pollfd, POLLERR | POLLHUP,
1595 					    record__thread_munmap_filtered, NULL) == 0)
1596 				break;
1597 		}
1598 
1599 		if (pollfd->entries[ctlfd_pos].revents & POLLHUP) {
1600 			terminate = true;
1601 			close(thread->pipes.msg[0]);
1602 			thread->pipes.msg[0] = -1;
1603 			pollfd->entries[ctlfd_pos].fd = -1;
1604 			pollfd->entries[ctlfd_pos].events = 0;
1605 		}
1606 
1607 		pollfd->entries[ctlfd_pos].revents = 0;
1608 	}
1609 	record__mmap_read_all(thread->rec, true);
1610 
1611 	err = write(thread->pipes.ack[1], &msg, sizeof(msg));
1612 	if (err == -1)
1613 		pr_warning("threads[%d]: failed to notify on termination: %s\n",
1614 			   thread->tid, strerror(errno));
1615 
1616 	return NULL;
1617 }
1618 
1619 static void record__init_features(struct record *rec)
1620 {
1621 	struct perf_session *session = rec->session;
1622 	int feat;
1623 
1624 	for (feat = HEADER_FIRST_FEATURE; feat < HEADER_LAST_FEATURE; feat++)
1625 		perf_header__set_feat(&session->header, feat);
1626 
1627 	if (rec->no_buildid)
1628 		perf_header__clear_feat(&session->header, HEADER_BUILD_ID);
1629 
1630 	if (!have_tracepoints(&rec->evlist->core.entries))
1631 		perf_header__clear_feat(&session->header, HEADER_TRACING_DATA);
1632 
1633 	if (!rec->opts.branch_stack)
1634 		perf_header__clear_feat(&session->header, HEADER_BRANCH_STACK);
1635 
1636 	if (!rec->opts.full_auxtrace)
1637 		perf_header__clear_feat(&session->header, HEADER_AUXTRACE);
1638 
1639 	if (!(rec->opts.use_clockid && rec->opts.clockid_res_ns))
1640 		perf_header__clear_feat(&session->header, HEADER_CLOCKID);
1641 
1642 	if (!rec->opts.use_clockid)
1643 		perf_header__clear_feat(&session->header, HEADER_CLOCK_DATA);
1644 
1645 	if (!record__threads_enabled(rec))
1646 		perf_header__clear_feat(&session->header, HEADER_DIR_FORMAT);
1647 
1648 	if (!record__comp_enabled(rec))
1649 		perf_header__clear_feat(&session->header, HEADER_COMPRESSED);
1650 
1651 	perf_header__clear_feat(&session->header, HEADER_STAT);
1652 }
1653 
1654 static void
1655 record__finish_output(struct record *rec)
1656 {
1657 	int i;
1658 	struct perf_data *data = &rec->data;
1659 	int fd = perf_data__fd(data);
1660 
1661 	if (data->is_pipe)
1662 		return;
1663 
1664 	rec->session->header.data_size += rec->bytes_written;
1665 	data->file.size = lseek(perf_data__fd(data), 0, SEEK_CUR);
1666 	if (record__threads_enabled(rec)) {
1667 		for (i = 0; i < data->dir.nr; i++)
1668 			data->dir.files[i].size = lseek(data->dir.files[i].fd, 0, SEEK_CUR);
1669 	}
1670 
1671 	if (!rec->no_buildid) {
1672 		process_buildids(rec);
1673 
1674 		if (rec->buildid_all)
1675 			dsos__hit_all(rec->session);
1676 	}
1677 	perf_session__write_header(rec->session, rec->evlist, fd, true);
1678 
1679 	return;
1680 }
1681 
1682 static int record__synthesize_workload(struct record *rec, bool tail)
1683 {
1684 	int err;
1685 	struct perf_thread_map *thread_map;
1686 	bool needs_mmap = rec->opts.synth & PERF_SYNTH_MMAP;
1687 
1688 	if (rec->opts.tail_synthesize != tail)
1689 		return 0;
1690 
1691 	thread_map = thread_map__new_by_tid(rec->evlist->workload.pid);
1692 	if (thread_map == NULL)
1693 		return -1;
1694 
1695 	err = perf_event__synthesize_thread_map(&rec->tool, thread_map,
1696 						 process_synthesized_event,
1697 						 &rec->session->machines.host,
1698 						 needs_mmap,
1699 						 rec->opts.sample_address);
1700 	perf_thread_map__put(thread_map);
1701 	return err;
1702 }
1703 
1704 static int write_finished_init(struct record *rec, bool tail)
1705 {
1706 	if (rec->opts.tail_synthesize != tail)
1707 		return 0;
1708 
1709 	return record__write(rec, NULL, &finished_init_event, sizeof(finished_init_event));
1710 }
1711 
1712 static int record__synthesize(struct record *rec, bool tail);
1713 
1714 static int
1715 record__switch_output(struct record *rec, bool at_exit)
1716 {
1717 	struct perf_data *data = &rec->data;
1718 	int fd, err;
1719 	char *new_filename;
1720 
1721 	/* Same Size:      "2015122520103046"*/
1722 	char timestamp[] = "InvalidTimestamp";
1723 
1724 	record__aio_mmap_read_sync(rec);
1725 
1726 	write_finished_init(rec, true);
1727 
1728 	record__synthesize(rec, true);
1729 	if (target__none(&rec->opts.target))
1730 		record__synthesize_workload(rec, true);
1731 
1732 	rec->samples = 0;
1733 	record__finish_output(rec);
1734 	err = fetch_current_timestamp(timestamp, sizeof(timestamp));
1735 	if (err) {
1736 		pr_err("Failed to get current timestamp\n");
1737 		return -EINVAL;
1738 	}
1739 
1740 	fd = perf_data__switch(data, timestamp,
1741 				    rec->session->header.data_offset,
1742 				    at_exit, &new_filename);
1743 	if (fd >= 0 && !at_exit) {
1744 		rec->bytes_written = 0;
1745 		rec->session->header.data_size = 0;
1746 	}
1747 
1748 	if (!quiet)
1749 		fprintf(stderr, "[ perf record: Dump %s.%s ]\n",
1750 			data->path, timestamp);
1751 
1752 	if (rec->switch_output.num_files) {
1753 		int n = rec->switch_output.cur_file + 1;
1754 
1755 		if (n >= rec->switch_output.num_files)
1756 			n = 0;
1757 		rec->switch_output.cur_file = n;
1758 		if (rec->switch_output.filenames[n]) {
1759 			remove(rec->switch_output.filenames[n]);
1760 			zfree(&rec->switch_output.filenames[n]);
1761 		}
1762 		rec->switch_output.filenames[n] = new_filename;
1763 	} else {
1764 		free(new_filename);
1765 	}
1766 
1767 	/* Output tracking events */
1768 	if (!at_exit) {
1769 		record__synthesize(rec, false);
1770 
1771 		/*
1772 		 * In 'perf record --switch-output' without -a,
1773 		 * record__synthesize() in record__switch_output() won't
1774 		 * generate tracking events because there's no thread_map
1775 		 * in evlist. Which causes newly created perf.data doesn't
1776 		 * contain map and comm information.
1777 		 * Create a fake thread_map and directly call
1778 		 * perf_event__synthesize_thread_map() for those events.
1779 		 */
1780 		if (target__none(&rec->opts.target))
1781 			record__synthesize_workload(rec, false);
1782 		write_finished_init(rec, false);
1783 	}
1784 	return fd;
1785 }
1786 
1787 static volatile int workload_exec_errno;
1788 
1789 /*
1790  * evlist__prepare_workload will send a SIGUSR1
1791  * if the fork fails, since we asked by setting its
1792  * want_signal to true.
1793  */
1794 static void workload_exec_failed_signal(int signo __maybe_unused,
1795 					siginfo_t *info,
1796 					void *ucontext __maybe_unused)
1797 {
1798 	workload_exec_errno = info->si_value.sival_int;
1799 	done = 1;
1800 	child_finished = 1;
1801 }
1802 
1803 static void snapshot_sig_handler(int sig);
1804 static void alarm_sig_handler(int sig);
1805 
1806 static const struct perf_event_mmap_page *evlist__pick_pc(struct evlist *evlist)
1807 {
1808 	if (evlist) {
1809 		if (evlist->mmap && evlist->mmap[0].core.base)
1810 			return evlist->mmap[0].core.base;
1811 		if (evlist->overwrite_mmap && evlist->overwrite_mmap[0].core.base)
1812 			return evlist->overwrite_mmap[0].core.base;
1813 	}
1814 	return NULL;
1815 }
1816 
1817 static const struct perf_event_mmap_page *record__pick_pc(struct record *rec)
1818 {
1819 	const struct perf_event_mmap_page *pc = evlist__pick_pc(rec->evlist);
1820 	if (pc)
1821 		return pc;
1822 	return NULL;
1823 }
1824 
1825 static int record__synthesize(struct record *rec, bool tail)
1826 {
1827 	struct perf_session *session = rec->session;
1828 	struct machine *machine = &session->machines.host;
1829 	struct perf_data *data = &rec->data;
1830 	struct record_opts *opts = &rec->opts;
1831 	struct perf_tool *tool = &rec->tool;
1832 	int err = 0;
1833 	event_op f = process_synthesized_event;
1834 
1835 	if (rec->opts.tail_synthesize != tail)
1836 		return 0;
1837 
1838 	if (data->is_pipe) {
1839 		err = perf_event__synthesize_for_pipe(tool, session, data,
1840 						      process_synthesized_event);
1841 		if (err < 0)
1842 			goto out;
1843 
1844 		rec->bytes_written += err;
1845 	}
1846 
1847 	err = perf_event__synth_time_conv(record__pick_pc(rec), tool,
1848 					  process_synthesized_event, machine);
1849 	if (err)
1850 		goto out;
1851 
1852 	/* Synthesize id_index before auxtrace_info */
1853 	err = perf_event__synthesize_id_index(tool,
1854 					      process_synthesized_event,
1855 					      session->evlist, machine);
1856 	if (err)
1857 		goto out;
1858 
1859 	if (rec->opts.full_auxtrace) {
1860 		err = perf_event__synthesize_auxtrace_info(rec->itr, tool,
1861 					session, process_synthesized_event);
1862 		if (err)
1863 			goto out;
1864 	}
1865 
1866 	if (!evlist__exclude_kernel(rec->evlist)) {
1867 		err = perf_event__synthesize_kernel_mmap(tool, process_synthesized_event,
1868 							 machine);
1869 		WARN_ONCE(err < 0, "Couldn't record kernel reference relocation symbol\n"
1870 				   "Symbol resolution may be skewed if relocation was used (e.g. kexec).\n"
1871 				   "Check /proc/kallsyms permission or run as root.\n");
1872 
1873 		err = perf_event__synthesize_modules(tool, process_synthesized_event,
1874 						     machine);
1875 		WARN_ONCE(err < 0, "Couldn't record kernel module information.\n"
1876 				   "Symbol resolution may be skewed if relocation was used (e.g. kexec).\n"
1877 				   "Check /proc/modules permission or run as root.\n");
1878 	}
1879 
1880 	if (perf_guest) {
1881 		machines__process_guests(&session->machines,
1882 					 perf_event__synthesize_guest_os, tool);
1883 	}
1884 
1885 	err = perf_event__synthesize_extra_attr(&rec->tool,
1886 						rec->evlist,
1887 						process_synthesized_event,
1888 						data->is_pipe);
1889 	if (err)
1890 		goto out;
1891 
1892 	err = perf_event__synthesize_thread_map2(&rec->tool, rec->evlist->core.threads,
1893 						 process_synthesized_event,
1894 						NULL);
1895 	if (err < 0) {
1896 		pr_err("Couldn't synthesize thread map.\n");
1897 		return err;
1898 	}
1899 
1900 	err = perf_event__synthesize_cpu_map(&rec->tool, rec->evlist->core.all_cpus,
1901 					     process_synthesized_event, NULL);
1902 	if (err < 0) {
1903 		pr_err("Couldn't synthesize cpu map.\n");
1904 		return err;
1905 	}
1906 
1907 	err = perf_event__synthesize_bpf_events(session, process_synthesized_event,
1908 						machine, opts);
1909 	if (err < 0)
1910 		pr_warning("Couldn't synthesize bpf events.\n");
1911 
1912 	if (rec->opts.synth & PERF_SYNTH_CGROUP) {
1913 		err = perf_event__synthesize_cgroups(tool, process_synthesized_event,
1914 						     machine);
1915 		if (err < 0)
1916 			pr_warning("Couldn't synthesize cgroup events.\n");
1917 	}
1918 
1919 	if (rec->opts.nr_threads_synthesize > 1) {
1920 		perf_set_multithreaded();
1921 		f = process_locked_synthesized_event;
1922 	}
1923 
1924 	if (rec->opts.synth & PERF_SYNTH_TASK) {
1925 		bool needs_mmap = rec->opts.synth & PERF_SYNTH_MMAP;
1926 
1927 		err = __machine__synthesize_threads(machine, tool, &opts->target,
1928 						    rec->evlist->core.threads,
1929 						    f, needs_mmap, opts->sample_address,
1930 						    rec->opts.nr_threads_synthesize);
1931 	}
1932 
1933 	if (rec->opts.nr_threads_synthesize > 1)
1934 		perf_set_singlethreaded();
1935 
1936 out:
1937 	return err;
1938 }
1939 
1940 static int record__process_signal_event(union perf_event *event __maybe_unused, void *data)
1941 {
1942 	struct record *rec = data;
1943 	pthread_kill(rec->thread_id, SIGUSR2);
1944 	return 0;
1945 }
1946 
1947 static int record__setup_sb_evlist(struct record *rec)
1948 {
1949 	struct record_opts *opts = &rec->opts;
1950 
1951 	if (rec->sb_evlist != NULL) {
1952 		/*
1953 		 * We get here if --switch-output-event populated the
1954 		 * sb_evlist, so associate a callback that will send a SIGUSR2
1955 		 * to the main thread.
1956 		 */
1957 		evlist__set_cb(rec->sb_evlist, record__process_signal_event, rec);
1958 		rec->thread_id = pthread_self();
1959 	}
1960 #ifdef HAVE_LIBBPF_SUPPORT
1961 	if (!opts->no_bpf_event) {
1962 		if (rec->sb_evlist == NULL) {
1963 			rec->sb_evlist = evlist__new();
1964 
1965 			if (rec->sb_evlist == NULL) {
1966 				pr_err("Couldn't create side band evlist.\n.");
1967 				return -1;
1968 			}
1969 		}
1970 
1971 		if (evlist__add_bpf_sb_event(rec->sb_evlist, &rec->session->header.env)) {
1972 			pr_err("Couldn't ask for PERF_RECORD_BPF_EVENT side band events.\n.");
1973 			return -1;
1974 		}
1975 	}
1976 #endif
1977 	if (evlist__start_sb_thread(rec->sb_evlist, &rec->opts.target)) {
1978 		pr_debug("Couldn't start the BPF side band thread:\nBPF programs starting from now on won't be annotatable\n");
1979 		opts->no_bpf_event = true;
1980 	}
1981 
1982 	return 0;
1983 }
1984 
1985 static int record__init_clock(struct record *rec)
1986 {
1987 	struct perf_session *session = rec->session;
1988 	struct timespec ref_clockid;
1989 	struct timeval ref_tod;
1990 	u64 ref;
1991 
1992 	if (!rec->opts.use_clockid)
1993 		return 0;
1994 
1995 	if (rec->opts.use_clockid && rec->opts.clockid_res_ns)
1996 		session->header.env.clock.clockid_res_ns = rec->opts.clockid_res_ns;
1997 
1998 	session->header.env.clock.clockid = rec->opts.clockid;
1999 
2000 	if (gettimeofday(&ref_tod, NULL) != 0) {
2001 		pr_err("gettimeofday failed, cannot set reference time.\n");
2002 		return -1;
2003 	}
2004 
2005 	if (clock_gettime(rec->opts.clockid, &ref_clockid)) {
2006 		pr_err("clock_gettime failed, cannot set reference time.\n");
2007 		return -1;
2008 	}
2009 
2010 	ref = (u64) ref_tod.tv_sec * NSEC_PER_SEC +
2011 	      (u64) ref_tod.tv_usec * NSEC_PER_USEC;
2012 
2013 	session->header.env.clock.tod_ns = ref;
2014 
2015 	ref = (u64) ref_clockid.tv_sec * NSEC_PER_SEC +
2016 	      (u64) ref_clockid.tv_nsec;
2017 
2018 	session->header.env.clock.clockid_ns = ref;
2019 	return 0;
2020 }
2021 
2022 static void hit_auxtrace_snapshot_trigger(struct record *rec)
2023 {
2024 	if (trigger_is_ready(&auxtrace_snapshot_trigger)) {
2025 		trigger_hit(&auxtrace_snapshot_trigger);
2026 		auxtrace_record__snapshot_started = 1;
2027 		if (auxtrace_record__snapshot_start(rec->itr))
2028 			trigger_error(&auxtrace_snapshot_trigger);
2029 	}
2030 }
2031 
2032 static void record__uniquify_name(struct record *rec)
2033 {
2034 	struct evsel *pos;
2035 	struct evlist *evlist = rec->evlist;
2036 	char *new_name;
2037 	int ret;
2038 
2039 	if (!perf_pmu__has_hybrid())
2040 		return;
2041 
2042 	evlist__for_each_entry(evlist, pos) {
2043 		if (!evsel__is_hybrid(pos))
2044 			continue;
2045 
2046 		if (strchr(pos->name, '/'))
2047 			continue;
2048 
2049 		ret = asprintf(&new_name, "%s/%s/",
2050 			       pos->pmu_name, pos->name);
2051 		if (ret) {
2052 			free(pos->name);
2053 			pos->name = new_name;
2054 		}
2055 	}
2056 }
2057 
2058 static int record__terminate_thread(struct record_thread *thread_data)
2059 {
2060 	int err;
2061 	enum thread_msg ack = THREAD_MSG__UNDEFINED;
2062 	pid_t tid = thread_data->tid;
2063 
2064 	close(thread_data->pipes.msg[1]);
2065 	thread_data->pipes.msg[1] = -1;
2066 	err = read(thread_data->pipes.ack[0], &ack, sizeof(ack));
2067 	if (err > 0)
2068 		pr_debug2("threads[%d]: sent %s\n", tid, thread_msg_tags[ack]);
2069 	else
2070 		pr_warning("threads[%d]: failed to receive termination notification from %d\n",
2071 			   thread->tid, tid);
2072 
2073 	return 0;
2074 }
2075 
2076 static int record__start_threads(struct record *rec)
2077 {
2078 	int t, tt, err, ret = 0, nr_threads = rec->nr_threads;
2079 	struct record_thread *thread_data = rec->thread_data;
2080 	sigset_t full, mask;
2081 	pthread_t handle;
2082 	pthread_attr_t attrs;
2083 
2084 	thread = &thread_data[0];
2085 
2086 	if (!record__threads_enabled(rec))
2087 		return 0;
2088 
2089 	sigfillset(&full);
2090 	if (sigprocmask(SIG_SETMASK, &full, &mask)) {
2091 		pr_err("Failed to block signals on threads start: %s\n", strerror(errno));
2092 		return -1;
2093 	}
2094 
2095 	pthread_attr_init(&attrs);
2096 	pthread_attr_setdetachstate(&attrs, PTHREAD_CREATE_DETACHED);
2097 
2098 	for (t = 1; t < nr_threads; t++) {
2099 		enum thread_msg msg = THREAD_MSG__UNDEFINED;
2100 
2101 #ifdef HAVE_PTHREAD_ATTR_SETAFFINITY_NP
2102 		pthread_attr_setaffinity_np(&attrs,
2103 					    MMAP_CPU_MASK_BYTES(&(thread_data[t].mask->affinity)),
2104 					    (cpu_set_t *)(thread_data[t].mask->affinity.bits));
2105 #endif
2106 		if (pthread_create(&handle, &attrs, record__thread, &thread_data[t])) {
2107 			for (tt = 1; tt < t; tt++)
2108 				record__terminate_thread(&thread_data[t]);
2109 			pr_err("Failed to start threads: %s\n", strerror(errno));
2110 			ret = -1;
2111 			goto out_err;
2112 		}
2113 
2114 		err = read(thread_data[t].pipes.ack[0], &msg, sizeof(msg));
2115 		if (err > 0)
2116 			pr_debug2("threads[%d]: sent %s\n", rec->thread_data[t].tid,
2117 				  thread_msg_tags[msg]);
2118 		else
2119 			pr_warning("threads[%d]: failed to receive start notification from %d\n",
2120 				   thread->tid, rec->thread_data[t].tid);
2121 	}
2122 
2123 	sched_setaffinity(0, MMAP_CPU_MASK_BYTES(&thread->mask->affinity),
2124 			(cpu_set_t *)thread->mask->affinity.bits);
2125 
2126 	pr_debug("threads[%d]: started on cpu%d\n", thread->tid, sched_getcpu());
2127 
2128 out_err:
2129 	pthread_attr_destroy(&attrs);
2130 
2131 	if (sigprocmask(SIG_SETMASK, &mask, NULL)) {
2132 		pr_err("Failed to unblock signals on threads start: %s\n", strerror(errno));
2133 		ret = -1;
2134 	}
2135 
2136 	return ret;
2137 }
2138 
2139 static int record__stop_threads(struct record *rec)
2140 {
2141 	int t;
2142 	struct record_thread *thread_data = rec->thread_data;
2143 
2144 	for (t = 1; t < rec->nr_threads; t++)
2145 		record__terminate_thread(&thread_data[t]);
2146 
2147 	for (t = 0; t < rec->nr_threads; t++) {
2148 		rec->samples += thread_data[t].samples;
2149 		if (!record__threads_enabled(rec))
2150 			continue;
2151 		rec->session->bytes_transferred += thread_data[t].bytes_transferred;
2152 		rec->session->bytes_compressed += thread_data[t].bytes_compressed;
2153 		pr_debug("threads[%d]: samples=%lld, wakes=%ld, ", thread_data[t].tid,
2154 			 thread_data[t].samples, thread_data[t].waking);
2155 		if (thread_data[t].bytes_transferred && thread_data[t].bytes_compressed)
2156 			pr_debug("transferred=%" PRIu64 ", compressed=%" PRIu64 "\n",
2157 				 thread_data[t].bytes_transferred, thread_data[t].bytes_compressed);
2158 		else
2159 			pr_debug("written=%" PRIu64 "\n", thread_data[t].bytes_written);
2160 	}
2161 
2162 	return 0;
2163 }
2164 
2165 static unsigned long record__waking(struct record *rec)
2166 {
2167 	int t;
2168 	unsigned long waking = 0;
2169 	struct record_thread *thread_data = rec->thread_data;
2170 
2171 	for (t = 0; t < rec->nr_threads; t++)
2172 		waking += thread_data[t].waking;
2173 
2174 	return waking;
2175 }
2176 
2177 static int __cmd_record(struct record *rec, int argc, const char **argv)
2178 {
2179 	int err;
2180 	int status = 0;
2181 	const bool forks = argc > 0;
2182 	struct perf_tool *tool = &rec->tool;
2183 	struct record_opts *opts = &rec->opts;
2184 	struct perf_data *data = &rec->data;
2185 	struct perf_session *session;
2186 	bool disabled = false, draining = false;
2187 	int fd;
2188 	float ratio = 0;
2189 	enum evlist_ctl_cmd cmd = EVLIST_CTL_CMD_UNSUPPORTED;
2190 
2191 	atexit(record__sig_exit);
2192 	signal(SIGCHLD, sig_handler);
2193 	signal(SIGINT, sig_handler);
2194 	signal(SIGTERM, sig_handler);
2195 	signal(SIGSEGV, sigsegv_handler);
2196 
2197 	if (rec->opts.record_namespaces)
2198 		tool->namespace_events = true;
2199 
2200 	if (rec->opts.record_cgroup) {
2201 #ifdef HAVE_FILE_HANDLE
2202 		tool->cgroup_events = true;
2203 #else
2204 		pr_err("cgroup tracking is not supported\n");
2205 		return -1;
2206 #endif
2207 	}
2208 
2209 	if (rec->opts.auxtrace_snapshot_mode || rec->switch_output.enabled) {
2210 		signal(SIGUSR2, snapshot_sig_handler);
2211 		if (rec->opts.auxtrace_snapshot_mode)
2212 			trigger_on(&auxtrace_snapshot_trigger);
2213 		if (rec->switch_output.enabled)
2214 			trigger_on(&switch_output_trigger);
2215 	} else {
2216 		signal(SIGUSR2, SIG_IGN);
2217 	}
2218 
2219 	session = perf_session__new(data, tool);
2220 	if (IS_ERR(session)) {
2221 		pr_err("Perf session creation failed.\n");
2222 		return PTR_ERR(session);
2223 	}
2224 
2225 	if (record__threads_enabled(rec)) {
2226 		if (perf_data__is_pipe(&rec->data)) {
2227 			pr_err("Parallel trace streaming is not available in pipe mode.\n");
2228 			return -1;
2229 		}
2230 		if (rec->opts.full_auxtrace) {
2231 			pr_err("Parallel trace streaming is not available in AUX area tracing mode.\n");
2232 			return -1;
2233 		}
2234 	}
2235 
2236 	fd = perf_data__fd(data);
2237 	rec->session = session;
2238 
2239 	if (zstd_init(&session->zstd_data, rec->opts.comp_level) < 0) {
2240 		pr_err("Compression initialization failed.\n");
2241 		return -1;
2242 	}
2243 #ifdef HAVE_EVENTFD_SUPPORT
2244 	done_fd = eventfd(0, EFD_NONBLOCK);
2245 	if (done_fd < 0) {
2246 		pr_err("Failed to create wakeup eventfd, error: %m\n");
2247 		status = -1;
2248 		goto out_delete_session;
2249 	}
2250 	err = evlist__add_wakeup_eventfd(rec->evlist, done_fd);
2251 	if (err < 0) {
2252 		pr_err("Failed to add wakeup eventfd to poll list\n");
2253 		status = err;
2254 		goto out_delete_session;
2255 	}
2256 #endif // HAVE_EVENTFD_SUPPORT
2257 
2258 	session->header.env.comp_type  = PERF_COMP_ZSTD;
2259 	session->header.env.comp_level = rec->opts.comp_level;
2260 
2261 	if (rec->opts.kcore &&
2262 	    !record__kcore_readable(&session->machines.host)) {
2263 		pr_err("ERROR: kcore is not readable.\n");
2264 		return -1;
2265 	}
2266 
2267 	if (record__init_clock(rec))
2268 		return -1;
2269 
2270 	record__init_features(rec);
2271 
2272 	if (forks) {
2273 		err = evlist__prepare_workload(rec->evlist, &opts->target, argv, data->is_pipe,
2274 					       workload_exec_failed_signal);
2275 		if (err < 0) {
2276 			pr_err("Couldn't run the workload!\n");
2277 			status = err;
2278 			goto out_delete_session;
2279 		}
2280 	}
2281 
2282 	/*
2283 	 * If we have just single event and are sending data
2284 	 * through pipe, we need to force the ids allocation,
2285 	 * because we synthesize event name through the pipe
2286 	 * and need the id for that.
2287 	 */
2288 	if (data->is_pipe && rec->evlist->core.nr_entries == 1)
2289 		rec->opts.sample_id = true;
2290 
2291 	record__uniquify_name(rec);
2292 
2293 	if (record__open(rec) != 0) {
2294 		err = -1;
2295 		goto out_free_threads;
2296 	}
2297 	session->header.env.comp_mmap_len = session->evlist->core.mmap_len;
2298 
2299 	if (rec->opts.kcore) {
2300 		err = record__kcore_copy(&session->machines.host, data);
2301 		if (err) {
2302 			pr_err("ERROR: Failed to copy kcore\n");
2303 			goto out_free_threads;
2304 		}
2305 	}
2306 
2307 	err = bpf__apply_obj_config();
2308 	if (err) {
2309 		char errbuf[BUFSIZ];
2310 
2311 		bpf__strerror_apply_obj_config(err, errbuf, sizeof(errbuf));
2312 		pr_err("ERROR: Apply config to BPF failed: %s\n",
2313 			 errbuf);
2314 		goto out_free_threads;
2315 	}
2316 
2317 	/*
2318 	 * Normally perf_session__new would do this, but it doesn't have the
2319 	 * evlist.
2320 	 */
2321 	if (rec->tool.ordered_events && !evlist__sample_id_all(rec->evlist)) {
2322 		pr_warning("WARNING: No sample_id_all support, falling back to unordered processing\n");
2323 		rec->tool.ordered_events = false;
2324 	}
2325 
2326 	if (!rec->evlist->core.nr_groups)
2327 		perf_header__clear_feat(&session->header, HEADER_GROUP_DESC);
2328 
2329 	if (data->is_pipe) {
2330 		err = perf_header__write_pipe(fd);
2331 		if (err < 0)
2332 			goto out_free_threads;
2333 	} else {
2334 		err = perf_session__write_header(session, rec->evlist, fd, false);
2335 		if (err < 0)
2336 			goto out_free_threads;
2337 	}
2338 
2339 	err = -1;
2340 	if (!rec->no_buildid
2341 	    && !perf_header__has_feat(&session->header, HEADER_BUILD_ID)) {
2342 		pr_err("Couldn't generate buildids. "
2343 		       "Use --no-buildid to profile anyway.\n");
2344 		goto out_free_threads;
2345 	}
2346 
2347 	err = record__setup_sb_evlist(rec);
2348 	if (err)
2349 		goto out_free_threads;
2350 
2351 	err = record__synthesize(rec, false);
2352 	if (err < 0)
2353 		goto out_free_threads;
2354 
2355 	if (rec->realtime_prio) {
2356 		struct sched_param param;
2357 
2358 		param.sched_priority = rec->realtime_prio;
2359 		if (sched_setscheduler(0, SCHED_FIFO, &param)) {
2360 			pr_err("Could not set realtime priority.\n");
2361 			err = -1;
2362 			goto out_free_threads;
2363 		}
2364 	}
2365 
2366 	if (record__start_threads(rec))
2367 		goto out_free_threads;
2368 
2369 	/*
2370 	 * When perf is starting the traced process, all the events
2371 	 * (apart from group members) have enable_on_exec=1 set,
2372 	 * so don't spoil it by prematurely enabling them.
2373 	 */
2374 	if (!target__none(&opts->target) && !opts->initial_delay)
2375 		evlist__enable(rec->evlist);
2376 
2377 	/*
2378 	 * Let the child rip
2379 	 */
2380 	if (forks) {
2381 		struct machine *machine = &session->machines.host;
2382 		union perf_event *event;
2383 		pid_t tgid;
2384 
2385 		event = malloc(sizeof(event->comm) + machine->id_hdr_size);
2386 		if (event == NULL) {
2387 			err = -ENOMEM;
2388 			goto out_child;
2389 		}
2390 
2391 		/*
2392 		 * Some H/W events are generated before COMM event
2393 		 * which is emitted during exec(), so perf script
2394 		 * cannot see a correct process name for those events.
2395 		 * Synthesize COMM event to prevent it.
2396 		 */
2397 		tgid = perf_event__synthesize_comm(tool, event,
2398 						   rec->evlist->workload.pid,
2399 						   process_synthesized_event,
2400 						   machine);
2401 		free(event);
2402 
2403 		if (tgid == -1)
2404 			goto out_child;
2405 
2406 		event = malloc(sizeof(event->namespaces) +
2407 			       (NR_NAMESPACES * sizeof(struct perf_ns_link_info)) +
2408 			       machine->id_hdr_size);
2409 		if (event == NULL) {
2410 			err = -ENOMEM;
2411 			goto out_child;
2412 		}
2413 
2414 		/*
2415 		 * Synthesize NAMESPACES event for the command specified.
2416 		 */
2417 		perf_event__synthesize_namespaces(tool, event,
2418 						  rec->evlist->workload.pid,
2419 						  tgid, process_synthesized_event,
2420 						  machine);
2421 		free(event);
2422 
2423 		evlist__start_workload(rec->evlist);
2424 	}
2425 
2426 	if (opts->initial_delay) {
2427 		pr_info(EVLIST_DISABLED_MSG);
2428 		if (opts->initial_delay > 0) {
2429 			usleep(opts->initial_delay * USEC_PER_MSEC);
2430 			evlist__enable(rec->evlist);
2431 			pr_info(EVLIST_ENABLED_MSG);
2432 		}
2433 	}
2434 
2435 	trigger_ready(&auxtrace_snapshot_trigger);
2436 	trigger_ready(&switch_output_trigger);
2437 	perf_hooks__invoke_record_start();
2438 
2439 	/*
2440 	 * Must write FINISHED_INIT so it will be seen after all other
2441 	 * synthesized user events, but before any regular events.
2442 	 */
2443 	err = write_finished_init(rec, false);
2444 	if (err < 0)
2445 		goto out_child;
2446 
2447 	for (;;) {
2448 		unsigned long long hits = thread->samples;
2449 
2450 		/*
2451 		 * rec->evlist->bkw_mmap_state is possible to be
2452 		 * BKW_MMAP_EMPTY here: when done == true and
2453 		 * hits != rec->samples in previous round.
2454 		 *
2455 		 * evlist__toggle_bkw_mmap ensure we never
2456 		 * convert BKW_MMAP_EMPTY to BKW_MMAP_DATA_PENDING.
2457 		 */
2458 		if (trigger_is_hit(&switch_output_trigger) || done || draining)
2459 			evlist__toggle_bkw_mmap(rec->evlist, BKW_MMAP_DATA_PENDING);
2460 
2461 		if (record__mmap_read_all(rec, false) < 0) {
2462 			trigger_error(&auxtrace_snapshot_trigger);
2463 			trigger_error(&switch_output_trigger);
2464 			err = -1;
2465 			goto out_child;
2466 		}
2467 
2468 		if (auxtrace_record__snapshot_started) {
2469 			auxtrace_record__snapshot_started = 0;
2470 			if (!trigger_is_error(&auxtrace_snapshot_trigger))
2471 				record__read_auxtrace_snapshot(rec, false);
2472 			if (trigger_is_error(&auxtrace_snapshot_trigger)) {
2473 				pr_err("AUX area tracing snapshot failed\n");
2474 				err = -1;
2475 				goto out_child;
2476 			}
2477 		}
2478 
2479 		if (trigger_is_hit(&switch_output_trigger)) {
2480 			/*
2481 			 * If switch_output_trigger is hit, the data in
2482 			 * overwritable ring buffer should have been collected,
2483 			 * so bkw_mmap_state should be set to BKW_MMAP_EMPTY.
2484 			 *
2485 			 * If SIGUSR2 raise after or during record__mmap_read_all(),
2486 			 * record__mmap_read_all() didn't collect data from
2487 			 * overwritable ring buffer. Read again.
2488 			 */
2489 			if (rec->evlist->bkw_mmap_state == BKW_MMAP_RUNNING)
2490 				continue;
2491 			trigger_ready(&switch_output_trigger);
2492 
2493 			/*
2494 			 * Reenable events in overwrite ring buffer after
2495 			 * record__mmap_read_all(): we should have collected
2496 			 * data from it.
2497 			 */
2498 			evlist__toggle_bkw_mmap(rec->evlist, BKW_MMAP_RUNNING);
2499 
2500 			if (!quiet)
2501 				fprintf(stderr, "[ perf record: dump data: Woken up %ld times ]\n",
2502 					record__waking(rec));
2503 			thread->waking = 0;
2504 			fd = record__switch_output(rec, false);
2505 			if (fd < 0) {
2506 				pr_err("Failed to switch to new file\n");
2507 				trigger_error(&switch_output_trigger);
2508 				err = fd;
2509 				goto out_child;
2510 			}
2511 
2512 			/* re-arm the alarm */
2513 			if (rec->switch_output.time)
2514 				alarm(rec->switch_output.time);
2515 		}
2516 
2517 		if (hits == thread->samples) {
2518 			if (done || draining)
2519 				break;
2520 			err = fdarray__poll(&thread->pollfd, -1);
2521 			/*
2522 			 * Propagate error, only if there's any. Ignore positive
2523 			 * number of returned events and interrupt error.
2524 			 */
2525 			if (err > 0 || (err < 0 && errno == EINTR))
2526 				err = 0;
2527 			thread->waking++;
2528 
2529 			if (fdarray__filter(&thread->pollfd, POLLERR | POLLHUP,
2530 					    record__thread_munmap_filtered, NULL) == 0)
2531 				draining = true;
2532 
2533 			evlist__ctlfd_update(rec->evlist,
2534 				&thread->pollfd.entries[thread->ctlfd_pos]);
2535 		}
2536 
2537 		if (evlist__ctlfd_process(rec->evlist, &cmd) > 0) {
2538 			switch (cmd) {
2539 			case EVLIST_CTL_CMD_SNAPSHOT:
2540 				hit_auxtrace_snapshot_trigger(rec);
2541 				evlist__ctlfd_ack(rec->evlist);
2542 				break;
2543 			case EVLIST_CTL_CMD_STOP:
2544 				done = 1;
2545 				break;
2546 			case EVLIST_CTL_CMD_ACK:
2547 			case EVLIST_CTL_CMD_UNSUPPORTED:
2548 			case EVLIST_CTL_CMD_ENABLE:
2549 			case EVLIST_CTL_CMD_DISABLE:
2550 			case EVLIST_CTL_CMD_EVLIST:
2551 			case EVLIST_CTL_CMD_PING:
2552 			default:
2553 				break;
2554 			}
2555 		}
2556 
2557 		/*
2558 		 * When perf is starting the traced process, at the end events
2559 		 * die with the process and we wait for that. Thus no need to
2560 		 * disable events in this case.
2561 		 */
2562 		if (done && !disabled && !target__none(&opts->target)) {
2563 			trigger_off(&auxtrace_snapshot_trigger);
2564 			evlist__disable(rec->evlist);
2565 			disabled = true;
2566 		}
2567 	}
2568 
2569 	trigger_off(&auxtrace_snapshot_trigger);
2570 	trigger_off(&switch_output_trigger);
2571 
2572 	if (opts->auxtrace_snapshot_on_exit)
2573 		record__auxtrace_snapshot_exit(rec);
2574 
2575 	if (forks && workload_exec_errno) {
2576 		char msg[STRERR_BUFSIZE], strevsels[2048];
2577 		const char *emsg = str_error_r(workload_exec_errno, msg, sizeof(msg));
2578 
2579 		evlist__scnprintf_evsels(rec->evlist, sizeof(strevsels), strevsels);
2580 
2581 		pr_err("Failed to collect '%s' for the '%s' workload: %s\n",
2582 			strevsels, argv[0], emsg);
2583 		err = -1;
2584 		goto out_child;
2585 	}
2586 
2587 	if (!quiet)
2588 		fprintf(stderr, "[ perf record: Woken up %ld times to write data ]\n",
2589 			record__waking(rec));
2590 
2591 	write_finished_init(rec, true);
2592 
2593 	if (target__none(&rec->opts.target))
2594 		record__synthesize_workload(rec, true);
2595 
2596 out_child:
2597 	record__stop_threads(rec);
2598 	record__mmap_read_all(rec, true);
2599 out_free_threads:
2600 	record__free_thread_data(rec);
2601 	evlist__finalize_ctlfd(rec->evlist);
2602 	record__aio_mmap_read_sync(rec);
2603 
2604 	if (rec->session->bytes_transferred && rec->session->bytes_compressed) {
2605 		ratio = (float)rec->session->bytes_transferred/(float)rec->session->bytes_compressed;
2606 		session->header.env.comp_ratio = ratio + 0.5;
2607 	}
2608 
2609 	if (forks) {
2610 		int exit_status;
2611 
2612 		if (!child_finished)
2613 			kill(rec->evlist->workload.pid, SIGTERM);
2614 
2615 		wait(&exit_status);
2616 
2617 		if (err < 0)
2618 			status = err;
2619 		else if (WIFEXITED(exit_status))
2620 			status = WEXITSTATUS(exit_status);
2621 		else if (WIFSIGNALED(exit_status))
2622 			signr = WTERMSIG(exit_status);
2623 	} else
2624 		status = err;
2625 
2626 	if (rec->off_cpu)
2627 		rec->bytes_written += off_cpu_write(rec->session);
2628 
2629 	record__synthesize(rec, true);
2630 	/* this will be recalculated during process_buildids() */
2631 	rec->samples = 0;
2632 
2633 	if (!err) {
2634 		if (!rec->timestamp_filename) {
2635 			record__finish_output(rec);
2636 		} else {
2637 			fd = record__switch_output(rec, true);
2638 			if (fd < 0) {
2639 				status = fd;
2640 				goto out_delete_session;
2641 			}
2642 		}
2643 	}
2644 
2645 	perf_hooks__invoke_record_end();
2646 
2647 	if (!err && !quiet) {
2648 		char samples[128];
2649 		const char *postfix = rec->timestamp_filename ?
2650 					".<timestamp>" : "";
2651 
2652 		if (rec->samples && !rec->opts.full_auxtrace)
2653 			scnprintf(samples, sizeof(samples),
2654 				  " (%" PRIu64 " samples)", rec->samples);
2655 		else
2656 			samples[0] = '\0';
2657 
2658 		fprintf(stderr,	"[ perf record: Captured and wrote %.3f MB %s%s%s",
2659 			perf_data__size(data) / 1024.0 / 1024.0,
2660 			data->path, postfix, samples);
2661 		if (ratio) {
2662 			fprintf(stderr,	", compressed (original %.3f MB, ratio is %.3f)",
2663 					rec->session->bytes_transferred / 1024.0 / 1024.0,
2664 					ratio);
2665 		}
2666 		fprintf(stderr, " ]\n");
2667 	}
2668 
2669 out_delete_session:
2670 #ifdef HAVE_EVENTFD_SUPPORT
2671 	if (done_fd >= 0)
2672 		close(done_fd);
2673 #endif
2674 	zstd_fini(&session->zstd_data);
2675 	perf_session__delete(session);
2676 
2677 	if (!opts->no_bpf_event)
2678 		evlist__stop_sb_thread(rec->sb_evlist);
2679 	return status;
2680 }
2681 
2682 static void callchain_debug(struct callchain_param *callchain)
2683 {
2684 	static const char *str[CALLCHAIN_MAX] = { "NONE", "FP", "DWARF", "LBR" };
2685 
2686 	pr_debug("callchain: type %s\n", str[callchain->record_mode]);
2687 
2688 	if (callchain->record_mode == CALLCHAIN_DWARF)
2689 		pr_debug("callchain: stack dump size %d\n",
2690 			 callchain->dump_size);
2691 }
2692 
2693 int record_opts__parse_callchain(struct record_opts *record,
2694 				 struct callchain_param *callchain,
2695 				 const char *arg, bool unset)
2696 {
2697 	int ret;
2698 	callchain->enabled = !unset;
2699 
2700 	/* --no-call-graph */
2701 	if (unset) {
2702 		callchain->record_mode = CALLCHAIN_NONE;
2703 		pr_debug("callchain: disabled\n");
2704 		return 0;
2705 	}
2706 
2707 	ret = parse_callchain_record_opt(arg, callchain);
2708 	if (!ret) {
2709 		/* Enable data address sampling for DWARF unwind. */
2710 		if (callchain->record_mode == CALLCHAIN_DWARF)
2711 			record->sample_address = true;
2712 		callchain_debug(callchain);
2713 	}
2714 
2715 	return ret;
2716 }
2717 
2718 int record_parse_callchain_opt(const struct option *opt,
2719 			       const char *arg,
2720 			       int unset)
2721 {
2722 	return record_opts__parse_callchain(opt->value, &callchain_param, arg, unset);
2723 }
2724 
2725 int record_callchain_opt(const struct option *opt,
2726 			 const char *arg __maybe_unused,
2727 			 int unset __maybe_unused)
2728 {
2729 	struct callchain_param *callchain = opt->value;
2730 
2731 	callchain->enabled = true;
2732 
2733 	if (callchain->record_mode == CALLCHAIN_NONE)
2734 		callchain->record_mode = CALLCHAIN_FP;
2735 
2736 	callchain_debug(callchain);
2737 	return 0;
2738 }
2739 
2740 static int perf_record_config(const char *var, const char *value, void *cb)
2741 {
2742 	struct record *rec = cb;
2743 
2744 	if (!strcmp(var, "record.build-id")) {
2745 		if (!strcmp(value, "cache"))
2746 			rec->no_buildid_cache = false;
2747 		else if (!strcmp(value, "no-cache"))
2748 			rec->no_buildid_cache = true;
2749 		else if (!strcmp(value, "skip"))
2750 			rec->no_buildid = true;
2751 		else if (!strcmp(value, "mmap"))
2752 			rec->buildid_mmap = true;
2753 		else
2754 			return -1;
2755 		return 0;
2756 	}
2757 	if (!strcmp(var, "record.call-graph")) {
2758 		var = "call-graph.record-mode";
2759 		return perf_default_config(var, value, cb);
2760 	}
2761 #ifdef HAVE_AIO_SUPPORT
2762 	if (!strcmp(var, "record.aio")) {
2763 		rec->opts.nr_cblocks = strtol(value, NULL, 0);
2764 		if (!rec->opts.nr_cblocks)
2765 			rec->opts.nr_cblocks = nr_cblocks_default;
2766 	}
2767 #endif
2768 	if (!strcmp(var, "record.debuginfod")) {
2769 		rec->debuginfod.urls = strdup(value);
2770 		if (!rec->debuginfod.urls)
2771 			return -ENOMEM;
2772 		rec->debuginfod.set = true;
2773 	}
2774 
2775 	return 0;
2776 }
2777 
2778 
2779 static int record__parse_affinity(const struct option *opt, const char *str, int unset)
2780 {
2781 	struct record_opts *opts = (struct record_opts *)opt->value;
2782 
2783 	if (unset || !str)
2784 		return 0;
2785 
2786 	if (!strcasecmp(str, "node"))
2787 		opts->affinity = PERF_AFFINITY_NODE;
2788 	else if (!strcasecmp(str, "cpu"))
2789 		opts->affinity = PERF_AFFINITY_CPU;
2790 
2791 	return 0;
2792 }
2793 
2794 static int record__mmap_cpu_mask_alloc(struct mmap_cpu_mask *mask, int nr_bits)
2795 {
2796 	mask->nbits = nr_bits;
2797 	mask->bits = bitmap_zalloc(mask->nbits);
2798 	if (!mask->bits)
2799 		return -ENOMEM;
2800 
2801 	return 0;
2802 }
2803 
2804 static void record__mmap_cpu_mask_free(struct mmap_cpu_mask *mask)
2805 {
2806 	bitmap_free(mask->bits);
2807 	mask->nbits = 0;
2808 }
2809 
2810 static int record__thread_mask_alloc(struct thread_mask *mask, int nr_bits)
2811 {
2812 	int ret;
2813 
2814 	ret = record__mmap_cpu_mask_alloc(&mask->maps, nr_bits);
2815 	if (ret) {
2816 		mask->affinity.bits = NULL;
2817 		return ret;
2818 	}
2819 
2820 	ret = record__mmap_cpu_mask_alloc(&mask->affinity, nr_bits);
2821 	if (ret) {
2822 		record__mmap_cpu_mask_free(&mask->maps);
2823 		mask->maps.bits = NULL;
2824 	}
2825 
2826 	return ret;
2827 }
2828 
2829 static void record__thread_mask_free(struct thread_mask *mask)
2830 {
2831 	record__mmap_cpu_mask_free(&mask->maps);
2832 	record__mmap_cpu_mask_free(&mask->affinity);
2833 }
2834 
2835 static int record__parse_threads(const struct option *opt, const char *str, int unset)
2836 {
2837 	int s;
2838 	struct record_opts *opts = opt->value;
2839 
2840 	if (unset || !str || !strlen(str)) {
2841 		opts->threads_spec = THREAD_SPEC__CPU;
2842 	} else {
2843 		for (s = 1; s < THREAD_SPEC__MAX; s++) {
2844 			if (s == THREAD_SPEC__USER) {
2845 				opts->threads_user_spec = strdup(str);
2846 				if (!opts->threads_user_spec)
2847 					return -ENOMEM;
2848 				opts->threads_spec = THREAD_SPEC__USER;
2849 				break;
2850 			}
2851 			if (!strncasecmp(str, thread_spec_tags[s], strlen(thread_spec_tags[s]))) {
2852 				opts->threads_spec = s;
2853 				break;
2854 			}
2855 		}
2856 	}
2857 
2858 	if (opts->threads_spec == THREAD_SPEC__USER)
2859 		pr_debug("threads_spec: %s\n", opts->threads_user_spec);
2860 	else
2861 		pr_debug("threads_spec: %s\n", thread_spec_tags[opts->threads_spec]);
2862 
2863 	return 0;
2864 }
2865 
2866 static int parse_output_max_size(const struct option *opt,
2867 				 const char *str, int unset)
2868 {
2869 	unsigned long *s = (unsigned long *)opt->value;
2870 	static struct parse_tag tags_size[] = {
2871 		{ .tag  = 'B', .mult = 1       },
2872 		{ .tag  = 'K', .mult = 1 << 10 },
2873 		{ .tag  = 'M', .mult = 1 << 20 },
2874 		{ .tag  = 'G', .mult = 1 << 30 },
2875 		{ .tag  = 0 },
2876 	};
2877 	unsigned long val;
2878 
2879 	if (unset) {
2880 		*s = 0;
2881 		return 0;
2882 	}
2883 
2884 	val = parse_tag_value(str, tags_size);
2885 	if (val != (unsigned long) -1) {
2886 		*s = val;
2887 		return 0;
2888 	}
2889 
2890 	return -1;
2891 }
2892 
2893 static int record__parse_mmap_pages(const struct option *opt,
2894 				    const char *str,
2895 				    int unset __maybe_unused)
2896 {
2897 	struct record_opts *opts = opt->value;
2898 	char *s, *p;
2899 	unsigned int mmap_pages;
2900 	int ret;
2901 
2902 	if (!str)
2903 		return -EINVAL;
2904 
2905 	s = strdup(str);
2906 	if (!s)
2907 		return -ENOMEM;
2908 
2909 	p = strchr(s, ',');
2910 	if (p)
2911 		*p = '\0';
2912 
2913 	if (*s) {
2914 		ret = __evlist__parse_mmap_pages(&mmap_pages, s);
2915 		if (ret)
2916 			goto out_free;
2917 		opts->mmap_pages = mmap_pages;
2918 	}
2919 
2920 	if (!p) {
2921 		ret = 0;
2922 		goto out_free;
2923 	}
2924 
2925 	ret = __evlist__parse_mmap_pages(&mmap_pages, p + 1);
2926 	if (ret)
2927 		goto out_free;
2928 
2929 	opts->auxtrace_mmap_pages = mmap_pages;
2930 
2931 out_free:
2932 	free(s);
2933 	return ret;
2934 }
2935 
2936 void __weak arch__add_leaf_frame_record_opts(struct record_opts *opts __maybe_unused)
2937 {
2938 }
2939 
2940 static int parse_control_option(const struct option *opt,
2941 				const char *str,
2942 				int unset __maybe_unused)
2943 {
2944 	struct record_opts *opts = opt->value;
2945 
2946 	return evlist__parse_control(str, &opts->ctl_fd, &opts->ctl_fd_ack, &opts->ctl_fd_close);
2947 }
2948 
2949 static void switch_output_size_warn(struct record *rec)
2950 {
2951 	u64 wakeup_size = evlist__mmap_size(rec->opts.mmap_pages);
2952 	struct switch_output *s = &rec->switch_output;
2953 
2954 	wakeup_size /= 2;
2955 
2956 	if (s->size < wakeup_size) {
2957 		char buf[100];
2958 
2959 		unit_number__scnprintf(buf, sizeof(buf), wakeup_size);
2960 		pr_warning("WARNING: switch-output data size lower than "
2961 			   "wakeup kernel buffer size (%s) "
2962 			   "expect bigger perf.data sizes\n", buf);
2963 	}
2964 }
2965 
2966 static int switch_output_setup(struct record *rec)
2967 {
2968 	struct switch_output *s = &rec->switch_output;
2969 	static struct parse_tag tags_size[] = {
2970 		{ .tag  = 'B', .mult = 1       },
2971 		{ .tag  = 'K', .mult = 1 << 10 },
2972 		{ .tag  = 'M', .mult = 1 << 20 },
2973 		{ .tag  = 'G', .mult = 1 << 30 },
2974 		{ .tag  = 0 },
2975 	};
2976 	static struct parse_tag tags_time[] = {
2977 		{ .tag  = 's', .mult = 1        },
2978 		{ .tag  = 'm', .mult = 60       },
2979 		{ .tag  = 'h', .mult = 60*60    },
2980 		{ .tag  = 'd', .mult = 60*60*24 },
2981 		{ .tag  = 0 },
2982 	};
2983 	unsigned long val;
2984 
2985 	/*
2986 	 * If we're using --switch-output-events, then we imply its
2987 	 * --switch-output=signal, as we'll send a SIGUSR2 from the side band
2988 	 *  thread to its parent.
2989 	 */
2990 	if (rec->switch_output_event_set) {
2991 		if (record__threads_enabled(rec)) {
2992 			pr_warning("WARNING: --switch-output-event option is not available in parallel streaming mode.\n");
2993 			return 0;
2994 		}
2995 		goto do_signal;
2996 	}
2997 
2998 	if (!s->set)
2999 		return 0;
3000 
3001 	if (record__threads_enabled(rec)) {
3002 		pr_warning("WARNING: --switch-output option is not available in parallel streaming mode.\n");
3003 		return 0;
3004 	}
3005 
3006 	if (!strcmp(s->str, "signal")) {
3007 do_signal:
3008 		s->signal = true;
3009 		pr_debug("switch-output with SIGUSR2 signal\n");
3010 		goto enabled;
3011 	}
3012 
3013 	val = parse_tag_value(s->str, tags_size);
3014 	if (val != (unsigned long) -1) {
3015 		s->size = val;
3016 		pr_debug("switch-output with %s size threshold\n", s->str);
3017 		goto enabled;
3018 	}
3019 
3020 	val = parse_tag_value(s->str, tags_time);
3021 	if (val != (unsigned long) -1) {
3022 		s->time = val;
3023 		pr_debug("switch-output with %s time threshold (%lu seconds)\n",
3024 			 s->str, s->time);
3025 		goto enabled;
3026 	}
3027 
3028 	return -1;
3029 
3030 enabled:
3031 	rec->timestamp_filename = true;
3032 	s->enabled              = true;
3033 
3034 	if (s->size && !rec->opts.no_buffering)
3035 		switch_output_size_warn(rec);
3036 
3037 	return 0;
3038 }
3039 
/* NULL-terminated usage strings for 'perf record'. */
static const char * const __record_usage[] = {
	[0] = "perf record [<options>] [<command>]",
	[1] = "perf record [<options>] -- <command> [<options>]",
	[2] = NULL,
};

/* Exported view of the usage strings above. */
const char * const *record_usage = __record_usage;
3046 
3047 static int build_id__process_mmap(struct perf_tool *tool, union perf_event *event,
3048 				  struct perf_sample *sample, struct machine *machine)
3049 {
3050 	/*
3051 	 * We already have the kernel maps, put in place via perf_session__create_kernel_maps()
3052 	 * no need to add them twice.
3053 	 */
3054 	if (!(event->header.misc & PERF_RECORD_MISC_USER))
3055 		return 0;
3056 	return perf_event__process_mmap(tool, event, sample, machine);
3057 }
3058 
3059 static int build_id__process_mmap2(struct perf_tool *tool, union perf_event *event,
3060 				   struct perf_sample *sample, struct machine *machine)
3061 {
3062 	/*
3063 	 * We already have the kernel maps, put in place via perf_session__create_kernel_maps()
3064 	 * no need to add them twice.
3065 	 */
3066 	if (!(event->header.misc & PERF_RECORD_MISC_USER))
3067 		return 0;
3068 
3069 	return perf_event__process_mmap2(tool, event, sample, machine);
3070 }
3071 
3072 static int process_timestamp_boundary(struct perf_tool *tool,
3073 				      union perf_event *event __maybe_unused,
3074 				      struct perf_sample *sample,
3075 				      struct machine *machine __maybe_unused)
3076 {
3077 	struct record *rec = container_of(tool, struct record, tool);
3078 
3079 	set_timestamp_boundary(rec, sample->time);
3080 	return 0;
3081 }
3082 
3083 static int parse_record_synth_option(const struct option *opt,
3084 				     const char *str,
3085 				     int unset __maybe_unused)
3086 {
3087 	struct record_opts *opts = opt->value;
3088 	char *p = strdup(str);
3089 
3090 	if (p == NULL)
3091 		return -1;
3092 
3093 	opts->synth = parse_synth_opt(p);
3094 	free(p);
3095 
3096 	if (opts->synth < 0) {
3097 		pr_err("Invalid synth option: %s\n", str);
3098 		return -1;
3099 	}
3100 	return 0;
3101 }
3102 
3103 /*
3104  * XXX Ideally would be local to cmd_record() and passed to a record__new
3105  * because we need to have access to it in record__exit, that is called
3106  * after cmd_record() exits, but since record_options need to be accessible to
3107  * builtin-script, leave it here.
3108  *
3109  * At least we don't touch it in all the other functions here directly.
3110  *
3111  * Just say no to tons of global variables, sigh.
3112  */
3113 static struct record record = {
3114 	.opts = {
3115 		.sample_time	     = true,
3116 		.mmap_pages	     = UINT_MAX,
3117 		.user_freq	     = UINT_MAX,
3118 		.user_interval	     = ULLONG_MAX,
3119 		.freq		     = 4000,
3120 		.target		     = {
3121 			.uses_mmap   = true,
3122 			.default_per_cpu = true,
3123 		},
3124 		.mmap_flush          = MMAP_FLUSH_DEFAULT,
3125 		.nr_threads_synthesize = 1,
3126 		.ctl_fd              = -1,
3127 		.ctl_fd_ack          = -1,
3128 		.synth               = PERF_SYNTH_ALL,
3129 	},
3130 	.tool = {
3131 		.sample		= process_sample_event,
3132 		.fork		= perf_event__process_fork,
3133 		.exit		= perf_event__process_exit,
3134 		.comm		= perf_event__process_comm,
3135 		.namespaces	= perf_event__process_namespaces,
3136 		.mmap		= build_id__process_mmap,
3137 		.mmap2		= build_id__process_mmap2,
3138 		.itrace_start	= process_timestamp_boundary,
3139 		.aux		= process_timestamp_boundary,
3140 		.ordered_events	= true,
3141 	},
3142 };
3143 
3144 const char record_callchain_help[] = CALLCHAIN_RECORD_HELP
3145 	"\n\t\t\t\tDefault: fp";
3146 
3147 static bool dry_run;
3148 
3149 /*
3150  * XXX Will stay a global variable till we fix builtin-script.c to stop messing
3151  * with it and switch to use the library functions in perf_evlist that came
3152  * from builtin-record.c, i.e. use record_opts,
3153  * evlist__prepare_workload, etc instead of fork+exec'in 'perf record',
3154  * using pipes, etc.
3155  */
3156 static struct option __record_options[] = {
3157 	OPT_CALLBACK('e', "event", &record.evlist, "event",
3158 		     "event selector. use 'perf list' to list available events",
3159 		     parse_events_option),
3160 	OPT_CALLBACK(0, "filter", &record.evlist, "filter",
3161 		     "event filter", parse_filter),
3162 	OPT_CALLBACK_NOOPT(0, "exclude-perf", &record.evlist,
3163 			   NULL, "don't record events from perf itself",
3164 			   exclude_perf),
3165 	OPT_STRING('p', "pid", &record.opts.target.pid, "pid",
3166 		    "record events on existing process id"),
3167 	OPT_STRING('t', "tid", &record.opts.target.tid, "tid",
3168 		    "record events on existing thread id"),
3169 	OPT_INTEGER('r', "realtime", &record.realtime_prio,
3170 		    "collect data with this RT SCHED_FIFO priority"),
3171 	OPT_BOOLEAN(0, "no-buffering", &record.opts.no_buffering,
3172 		    "collect data without buffering"),
3173 	OPT_BOOLEAN('R', "raw-samples", &record.opts.raw_samples,
3174 		    "collect raw sample records from all opened counters"),
3175 	OPT_BOOLEAN('a', "all-cpus", &record.opts.target.system_wide,
3176 			    "system-wide collection from all CPUs"),
3177 	OPT_STRING('C', "cpu", &record.opts.target.cpu_list, "cpu",
3178 		    "list of cpus to monitor"),
3179 	OPT_U64('c', "count", &record.opts.user_interval, "event period to sample"),
3180 	OPT_STRING('o', "output", &record.data.path, "file",
3181 		    "output file name"),
3182 	OPT_BOOLEAN_SET('i', "no-inherit", &record.opts.no_inherit,
3183 			&record.opts.no_inherit_set,
3184 			"child tasks do not inherit counters"),
3185 	OPT_BOOLEAN(0, "tail-synthesize", &record.opts.tail_synthesize,
3186 		    "synthesize non-sample events at the end of output"),
3187 	OPT_BOOLEAN(0, "overwrite", &record.opts.overwrite, "use overwrite mode"),
3188 	OPT_BOOLEAN(0, "no-bpf-event", &record.opts.no_bpf_event, "do not record bpf events"),
3189 	OPT_BOOLEAN(0, "strict-freq", &record.opts.strict_freq,
3190 		    "Fail if the specified frequency can't be used"),
3191 	OPT_CALLBACK('F', "freq", &record.opts, "freq or 'max'",
3192 		     "profile at this frequency",
3193 		      record__parse_freq),
3194 	OPT_CALLBACK('m', "mmap-pages", &record.opts, "pages[,pages]",
3195 		     "number of mmap data pages and AUX area tracing mmap pages",
3196 		     record__parse_mmap_pages),
3197 	OPT_CALLBACK(0, "mmap-flush", &record.opts, "number",
3198 		     "Minimal number of bytes that is extracted from mmap data pages (default: 1)",
3199 		     record__mmap_flush_parse),
3200 	OPT_BOOLEAN(0, "group", &record.opts.group,
3201 		    "put the counters into a counter group"),
3202 	OPT_CALLBACK_NOOPT('g', NULL, &callchain_param,
3203 			   NULL, "enables call-graph recording" ,
3204 			   &record_callchain_opt),
3205 	OPT_CALLBACK(0, "call-graph", &record.opts,
3206 		     "record_mode[,record_size]", record_callchain_help,
3207 		     &record_parse_callchain_opt),
3208 	OPT_INCR('v', "verbose", &verbose,
3209 		    "be more verbose (show counter open errors, etc)"),
3210 	OPT_BOOLEAN('q', "quiet", &quiet, "don't print any message"),
3211 	OPT_BOOLEAN('s', "stat", &record.opts.inherit_stat,
3212 		    "per thread counts"),
3213 	OPT_BOOLEAN('d', "data", &record.opts.sample_address, "Record the sample addresses"),
3214 	OPT_BOOLEAN(0, "phys-data", &record.opts.sample_phys_addr,
3215 		    "Record the sample physical addresses"),
3216 	OPT_BOOLEAN(0, "data-page-size", &record.opts.sample_data_page_size,
3217 		    "Record the sampled data address data page size"),
3218 	OPT_BOOLEAN(0, "code-page-size", &record.opts.sample_code_page_size,
3219 		    "Record the sampled code address (ip) page size"),
3220 	OPT_BOOLEAN(0, "sample-cpu", &record.opts.sample_cpu, "Record the sample cpu"),
3221 	OPT_BOOLEAN(0, "sample-identifier", &record.opts.sample_identifier,
3222 		    "Record the sample identifier"),
3223 	OPT_BOOLEAN_SET('T', "timestamp", &record.opts.sample_time,
3224 			&record.opts.sample_time_set,
3225 			"Record the sample timestamps"),
3226 	OPT_BOOLEAN_SET('P', "period", &record.opts.period, &record.opts.period_set,
3227 			"Record the sample period"),
3228 	OPT_BOOLEAN('n', "no-samples", &record.opts.no_samples,
3229 		    "don't sample"),
3230 	OPT_BOOLEAN_SET('N', "no-buildid-cache", &record.no_buildid_cache,
3231 			&record.no_buildid_cache_set,
3232 			"do not update the buildid cache"),
3233 	OPT_BOOLEAN_SET('B', "no-buildid", &record.no_buildid,
3234 			&record.no_buildid_set,
3235 			"do not collect buildids in perf.data"),
3236 	OPT_CALLBACK('G', "cgroup", &record.evlist, "name",
3237 		     "monitor event in cgroup name only",
3238 		     parse_cgroups),
3239 	OPT_INTEGER('D', "delay", &record.opts.initial_delay,
3240 		  "ms to wait before starting measurement after program start (-1: start with events disabled)"),
3241 	OPT_BOOLEAN(0, "kcore", &record.opts.kcore, "copy /proc/kcore"),
3242 	OPT_STRING('u', "uid", &record.opts.target.uid_str, "user",
3243 		   "user to profile"),
3244 
3245 	OPT_CALLBACK_NOOPT('b', "branch-any", &record.opts.branch_stack,
3246 		     "branch any", "sample any taken branches",
3247 		     parse_branch_stack),
3248 
3249 	OPT_CALLBACK('j', "branch-filter", &record.opts.branch_stack,
3250 		     "branch filter mask", "branch stack filter modes",
3251 		     parse_branch_stack),
3252 	OPT_BOOLEAN('W', "weight", &record.opts.sample_weight,
3253 		    "sample by weight (on special events only)"),
3254 	OPT_BOOLEAN(0, "transaction", &record.opts.sample_transaction,
3255 		    "sample transaction flags (special events only)"),
3256 	OPT_BOOLEAN(0, "per-thread", &record.opts.target.per_thread,
3257 		    "use per-thread mmaps"),
3258 	OPT_CALLBACK_OPTARG('I', "intr-regs", &record.opts.sample_intr_regs, NULL, "any register",
3259 		    "sample selected machine registers on interrupt,"
3260 		    " use '-I?' to list register names", parse_intr_regs),
3261 	OPT_CALLBACK_OPTARG(0, "user-regs", &record.opts.sample_user_regs, NULL, "any register",
3262 		    "sample selected machine registers on interrupt,"
3263 		    " use '--user-regs=?' to list register names", parse_user_regs),
3264 	OPT_BOOLEAN(0, "running-time", &record.opts.running_time,
3265 		    "Record running/enabled time of read (:S) events"),
3266 	OPT_CALLBACK('k', "clockid", &record.opts,
3267 	"clockid", "clockid to use for events, see clock_gettime()",
3268 	parse_clockid),
3269 	OPT_STRING_OPTARG('S', "snapshot", &record.opts.auxtrace_snapshot_opts,
3270 			  "opts", "AUX area tracing Snapshot Mode", ""),
3271 	OPT_STRING_OPTARG(0, "aux-sample", &record.opts.auxtrace_sample_opts,
3272 			  "opts", "sample AUX area", ""),
3273 	OPT_UINTEGER(0, "proc-map-timeout", &proc_map_timeout,
3274 			"per thread proc mmap processing timeout in ms"),
3275 	OPT_BOOLEAN(0, "namespaces", &record.opts.record_namespaces,
3276 		    "Record namespaces events"),
3277 	OPT_BOOLEAN(0, "all-cgroups", &record.opts.record_cgroup,
3278 		    "Record cgroup events"),
3279 	OPT_BOOLEAN_SET(0, "switch-events", &record.opts.record_switch_events,
3280 			&record.opts.record_switch_events_set,
3281 			"Record context switch events"),
3282 	OPT_BOOLEAN_FLAG(0, "all-kernel", &record.opts.all_kernel,
3283 			 "Configure all used events to run in kernel space.",
3284 			 PARSE_OPT_EXCLUSIVE),
3285 	OPT_BOOLEAN_FLAG(0, "all-user", &record.opts.all_user,
3286 			 "Configure all used events to run in user space.",
3287 			 PARSE_OPT_EXCLUSIVE),
3288 	OPT_BOOLEAN(0, "kernel-callchains", &record.opts.kernel_callchains,
3289 		    "collect kernel callchains"),
3290 	OPT_BOOLEAN(0, "user-callchains", &record.opts.user_callchains,
3291 		    "collect user callchains"),
3292 	OPT_STRING(0, "clang-path", &llvm_param.clang_path, "clang path",
3293 		   "clang binary to use for compiling BPF scriptlets"),
3294 	OPT_STRING(0, "clang-opt", &llvm_param.clang_opt, "clang options",
3295 		   "options passed to clang when compiling BPF scriptlets"),
3296 	OPT_STRING(0, "vmlinux", &symbol_conf.vmlinux_name,
3297 		   "file", "vmlinux pathname"),
3298 	OPT_BOOLEAN(0, "buildid-all", &record.buildid_all,
3299 		    "Record build-id of all DSOs regardless of hits"),
3300 	OPT_BOOLEAN(0, "buildid-mmap", &record.buildid_mmap,
3301 		    "Record build-id in map events"),
3302 	OPT_BOOLEAN(0, "timestamp-filename", &record.timestamp_filename,
3303 		    "append timestamp to output filename"),
3304 	OPT_BOOLEAN(0, "timestamp-boundary", &record.timestamp_boundary,
3305 		    "Record timestamp boundary (time of first/last samples)"),
3306 	OPT_STRING_OPTARG_SET(0, "switch-output", &record.switch_output.str,
3307 			  &record.switch_output.set, "signal or size[BKMG] or time[smhd]",
3308 			  "Switch output when receiving SIGUSR2 (signal) or cross a size or time threshold",
3309 			  "signal"),
3310 	OPT_CALLBACK_SET(0, "switch-output-event", &record.sb_evlist, &record.switch_output_event_set, "switch output event",
3311 			 "switch output event selector. use 'perf list' to list available events",
3312 			 parse_events_option_new_evlist),
3313 	OPT_INTEGER(0, "switch-max-files", &record.switch_output.num_files,
3314 		   "Limit number of switch output generated files"),
3315 	OPT_BOOLEAN(0, "dry-run", &dry_run,
3316 		    "Parse options then exit"),
3317 #ifdef HAVE_AIO_SUPPORT
3318 	OPT_CALLBACK_OPTARG(0, "aio", &record.opts,
3319 		     &nr_cblocks_default, "n", "Use <n> control blocks in asynchronous trace writing mode (default: 1, max: 4)",
3320 		     record__aio_parse),
3321 #endif
3322 	OPT_CALLBACK(0, "affinity", &record.opts, "node|cpu",
3323 		     "Set affinity mask of trace reading thread to NUMA node cpu mask or cpu of processed mmap buffer",
3324 		     record__parse_affinity),
3325 #ifdef HAVE_ZSTD_SUPPORT
3326 	OPT_CALLBACK_OPTARG('z', "compression-level", &record.opts, &comp_level_default, "n",
3327 			    "Compress records using specified level (default: 1 - fastest compression, 22 - greatest compression)",
3328 			    record__parse_comp_level),
3329 #endif
3330 	OPT_CALLBACK(0, "max-size", &record.output_max_size,
3331 		     "size", "Limit the maximum size of the output file", parse_output_max_size),
3332 	OPT_UINTEGER(0, "num-thread-synthesize",
3333 		     &record.opts.nr_threads_synthesize,
3334 		     "number of threads to run for event synthesis"),
3335 #ifdef HAVE_LIBPFM
3336 	OPT_CALLBACK(0, "pfm-events", &record.evlist, "event",
3337 		"libpfm4 event selector. use 'perf list' to list available events",
3338 		parse_libpfm_events_option),
3339 #endif
3340 	OPT_CALLBACK(0, "control", &record.opts, "fd:ctl-fd[,ack-fd] or fifo:ctl-fifo[,ack-fifo]",
3341 		     "Listen on ctl-fd descriptor for command to control measurement ('enable': enable events, 'disable': disable events,\n"
3342 		     "\t\t\t  'snapshot': AUX area tracing snapshot).\n"
3343 		     "\t\t\t  Optionally send control command completion ('ack\\n') to ack-fd descriptor.\n"
3344 		     "\t\t\t  Alternatively, ctl-fifo / ack-fifo will be opened and used as ctl-fd / ack-fd.",
3345 		      parse_control_option),
3346 	OPT_CALLBACK(0, "synth", &record.opts, "no|all|task|mmap|cgroup",
3347 		     "Fine-tune event synthesis: default=all", parse_record_synth_option),
3348 	OPT_STRING_OPTARG_SET(0, "debuginfod", &record.debuginfod.urls,
3349 			  &record.debuginfod.set, "debuginfod urls",
3350 			  "Enable debuginfod data retrieval from DEBUGINFOD_URLS or specified urls",
3351 			  "system"),
3352 	OPT_CALLBACK_OPTARG(0, "threads", &record.opts, NULL, "spec",
3353 			    "write collected trace data into several data files using parallel threads",
3354 			    record__parse_threads),
3355 	OPT_BOOLEAN(0, "off-cpu", &record.off_cpu, "Enable off-cpu analysis"),
3356 	OPT_END()
3357 };
3358 
3359 struct option *record_options = __record_options;
3360 
3361 static void record__mmap_cpu_mask_init(struct mmap_cpu_mask *mask, struct perf_cpu_map *cpus)
3362 {
3363 	struct perf_cpu cpu;
3364 	int idx;
3365 
3366 	if (cpu_map__is_dummy(cpus))
3367 		return;
3368 
3369 	perf_cpu_map__for_each_cpu(cpu, idx, cpus)
3370 		set_bit(cpu.cpu, mask->bits);
3371 }
3372 
3373 static int record__mmap_cpu_mask_init_spec(struct mmap_cpu_mask *mask, const char *mask_spec)
3374 {
3375 	struct perf_cpu_map *cpus;
3376 
3377 	cpus = perf_cpu_map__new(mask_spec);
3378 	if (!cpus)
3379 		return -ENOMEM;
3380 
3381 	bitmap_zero(mask->bits, mask->nbits);
3382 	record__mmap_cpu_mask_init(mask, cpus);
3383 	perf_cpu_map__put(cpus);
3384 
3385 	return 0;
3386 }
3387 
3388 static void record__free_thread_masks(struct record *rec, int nr_threads)
3389 {
3390 	int t;
3391 
3392 	if (rec->thread_masks)
3393 		for (t = 0; t < nr_threads; t++)
3394 			record__thread_mask_free(&rec->thread_masks[t]);
3395 
3396 	zfree(&rec->thread_masks);
3397 }
3398 
3399 static int record__alloc_thread_masks(struct record *rec, int nr_threads, int nr_bits)
3400 {
3401 	int t, ret;
3402 
3403 	rec->thread_masks = zalloc(nr_threads * sizeof(*(rec->thread_masks)));
3404 	if (!rec->thread_masks) {
3405 		pr_err("Failed to allocate thread masks\n");
3406 		return -ENOMEM;
3407 	}
3408 
3409 	for (t = 0; t < nr_threads; t++) {
3410 		ret = record__thread_mask_alloc(&rec->thread_masks[t], nr_bits);
3411 		if (ret) {
3412 			pr_err("Failed to allocate thread masks[%d]\n", t);
3413 			goto out_free;
3414 		}
3415 	}
3416 
3417 	return 0;
3418 
3419 out_free:
3420 	record__free_thread_masks(rec, nr_threads);
3421 
3422 	return ret;
3423 }
3424 
3425 static int record__init_thread_cpu_masks(struct record *rec, struct perf_cpu_map *cpus)
3426 {
3427 	int t, ret, nr_cpus = perf_cpu_map__nr(cpus);
3428 
3429 	ret = record__alloc_thread_masks(rec, nr_cpus, cpu__max_cpu().cpu);
3430 	if (ret)
3431 		return ret;
3432 
3433 	rec->nr_threads = nr_cpus;
3434 	pr_debug("nr_threads: %d\n", rec->nr_threads);
3435 
3436 	for (t = 0; t < rec->nr_threads; t++) {
3437 		set_bit(perf_cpu_map__cpu(cpus, t).cpu, rec->thread_masks[t].maps.bits);
3438 		set_bit(perf_cpu_map__cpu(cpus, t).cpu, rec->thread_masks[t].affinity.bits);
3439 		if (verbose) {
3440 			pr_debug("thread_masks[%d]: ", t);
3441 			mmap_cpu_mask__scnprintf(&rec->thread_masks[t].maps, "maps");
3442 			pr_debug("thread_masks[%d]: ", t);
3443 			mmap_cpu_mask__scnprintf(&rec->thread_masks[t].affinity, "affinity");
3444 		}
3445 	}
3446 
3447 	return 0;
3448 }
3449 
3450 static int record__init_thread_masks_spec(struct record *rec, struct perf_cpu_map *cpus,
3451 					  const char **maps_spec, const char **affinity_spec,
3452 					  u32 nr_spec)
3453 {
3454 	u32 s;
3455 	int ret = 0, t = 0;
3456 	struct mmap_cpu_mask cpus_mask;
3457 	struct thread_mask thread_mask, full_mask, *thread_masks;
3458 
3459 	ret = record__mmap_cpu_mask_alloc(&cpus_mask, cpu__max_cpu().cpu);
3460 	if (ret) {
3461 		pr_err("Failed to allocate CPUs mask\n");
3462 		return ret;
3463 	}
3464 	record__mmap_cpu_mask_init(&cpus_mask, cpus);
3465 
3466 	ret = record__thread_mask_alloc(&full_mask, cpu__max_cpu().cpu);
3467 	if (ret) {
3468 		pr_err("Failed to allocate full mask\n");
3469 		goto out_free_cpu_mask;
3470 	}
3471 
3472 	ret = record__thread_mask_alloc(&thread_mask, cpu__max_cpu().cpu);
3473 	if (ret) {
3474 		pr_err("Failed to allocate thread mask\n");
3475 		goto out_free_full_and_cpu_masks;
3476 	}
3477 
3478 	for (s = 0; s < nr_spec; s++) {
3479 		ret = record__mmap_cpu_mask_init_spec(&thread_mask.maps, maps_spec[s]);
3480 		if (ret) {
3481 			pr_err("Failed to initialize maps thread mask\n");
3482 			goto out_free;
3483 		}
3484 		ret = record__mmap_cpu_mask_init_spec(&thread_mask.affinity, affinity_spec[s]);
3485 		if (ret) {
3486 			pr_err("Failed to initialize affinity thread mask\n");
3487 			goto out_free;
3488 		}
3489 
3490 		/* ignore invalid CPUs but do not allow empty masks */
3491 		if (!bitmap_and(thread_mask.maps.bits, thread_mask.maps.bits,
3492 				cpus_mask.bits, thread_mask.maps.nbits)) {
3493 			pr_err("Empty maps mask: %s\n", maps_spec[s]);
3494 			ret = -EINVAL;
3495 			goto out_free;
3496 		}
3497 		if (!bitmap_and(thread_mask.affinity.bits, thread_mask.affinity.bits,
3498 				cpus_mask.bits, thread_mask.affinity.nbits)) {
3499 			pr_err("Empty affinity mask: %s\n", affinity_spec[s]);
3500 			ret = -EINVAL;
3501 			goto out_free;
3502 		}
3503 
3504 		/* do not allow intersection with other masks (full_mask) */
3505 		if (bitmap_intersects(thread_mask.maps.bits, full_mask.maps.bits,
3506 				      thread_mask.maps.nbits)) {
3507 			pr_err("Intersecting maps mask: %s\n", maps_spec[s]);
3508 			ret = -EINVAL;
3509 			goto out_free;
3510 		}
3511 		if (bitmap_intersects(thread_mask.affinity.bits, full_mask.affinity.bits,
3512 				      thread_mask.affinity.nbits)) {
3513 			pr_err("Intersecting affinity mask: %s\n", affinity_spec[s]);
3514 			ret = -EINVAL;
3515 			goto out_free;
3516 		}
3517 
3518 		bitmap_or(full_mask.maps.bits, full_mask.maps.bits,
3519 			  thread_mask.maps.bits, full_mask.maps.nbits);
3520 		bitmap_or(full_mask.affinity.bits, full_mask.affinity.bits,
3521 			  thread_mask.affinity.bits, full_mask.maps.nbits);
3522 
3523 		thread_masks = realloc(rec->thread_masks, (t + 1) * sizeof(struct thread_mask));
3524 		if (!thread_masks) {
3525 			pr_err("Failed to reallocate thread masks\n");
3526 			ret = -ENOMEM;
3527 			goto out_free;
3528 		}
3529 		rec->thread_masks = thread_masks;
3530 		rec->thread_masks[t] = thread_mask;
3531 		if (verbose) {
3532 			pr_debug("thread_masks[%d]: ", t);
3533 			mmap_cpu_mask__scnprintf(&rec->thread_masks[t].maps, "maps");
3534 			pr_debug("thread_masks[%d]: ", t);
3535 			mmap_cpu_mask__scnprintf(&rec->thread_masks[t].affinity, "affinity");
3536 		}
3537 		t++;
3538 		ret = record__thread_mask_alloc(&thread_mask, cpu__max_cpu().cpu);
3539 		if (ret) {
3540 			pr_err("Failed to allocate thread mask\n");
3541 			goto out_free_full_and_cpu_masks;
3542 		}
3543 	}
3544 	rec->nr_threads = t;
3545 	pr_debug("nr_threads: %d\n", rec->nr_threads);
3546 	if (!rec->nr_threads)
3547 		ret = -EINVAL;
3548 
3549 out_free:
3550 	record__thread_mask_free(&thread_mask);
3551 out_free_full_and_cpu_masks:
3552 	record__thread_mask_free(&full_mask);
3553 out_free_cpu_mask:
3554 	record__mmap_cpu_mask_free(&cpus_mask);
3555 
3556 	return ret;
3557 }
3558 
3559 static int record__init_thread_core_masks(struct record *rec, struct perf_cpu_map *cpus)
3560 {
3561 	int ret;
3562 	struct cpu_topology *topo;
3563 
3564 	topo = cpu_topology__new();
3565 	if (!topo) {
3566 		pr_err("Failed to allocate CPU topology\n");
3567 		return -ENOMEM;
3568 	}
3569 
3570 	ret = record__init_thread_masks_spec(rec, cpus, topo->core_cpus_list,
3571 					     topo->core_cpus_list, topo->core_cpus_lists);
3572 	cpu_topology__delete(topo);
3573 
3574 	return ret;
3575 }
3576 
3577 static int record__init_thread_package_masks(struct record *rec, struct perf_cpu_map *cpus)
3578 {
3579 	int ret;
3580 	struct cpu_topology *topo;
3581 
3582 	topo = cpu_topology__new();
3583 	if (!topo) {
3584 		pr_err("Failed to allocate CPU topology\n");
3585 		return -ENOMEM;
3586 	}
3587 
3588 	ret = record__init_thread_masks_spec(rec, cpus, topo->package_cpus_list,
3589 					     topo->package_cpus_list, topo->package_cpus_lists);
3590 	cpu_topology__delete(topo);
3591 
3592 	return ret;
3593 }
3594 
3595 static int record__init_thread_numa_masks(struct record *rec, struct perf_cpu_map *cpus)
3596 {
3597 	u32 s;
3598 	int ret;
3599 	const char **spec;
3600 	struct numa_topology *topo;
3601 
3602 	topo = numa_topology__new();
3603 	if (!topo) {
3604 		pr_err("Failed to allocate NUMA topology\n");
3605 		return -ENOMEM;
3606 	}
3607 
3608 	spec = zalloc(topo->nr * sizeof(char *));
3609 	if (!spec) {
3610 		pr_err("Failed to allocate NUMA spec\n");
3611 		ret = -ENOMEM;
3612 		goto out_delete_topo;
3613 	}
3614 	for (s = 0; s < topo->nr; s++)
3615 		spec[s] = topo->nodes[s].cpus;
3616 
3617 	ret = record__init_thread_masks_spec(rec, cpus, spec, spec, topo->nr);
3618 
3619 	zfree(&spec);
3620 
3621 out_delete_topo:
3622 	numa_topology__delete(topo);
3623 
3624 	return ret;
3625 }
3626 
3627 static int record__init_thread_user_masks(struct record *rec, struct perf_cpu_map *cpus)
3628 {
3629 	int t, ret;
3630 	u32 s, nr_spec = 0;
3631 	char **maps_spec = NULL, **affinity_spec = NULL, **tmp_spec;
3632 	char *user_spec, *spec, *spec_ptr, *mask, *mask_ptr, *dup_mask = NULL;
3633 
3634 	for (t = 0, user_spec = (char *)rec->opts.threads_user_spec; ; t++, user_spec = NULL) {
3635 		spec = strtok_r(user_spec, ":", &spec_ptr);
3636 		if (spec == NULL)
3637 			break;
3638 		pr_debug2("threads_spec[%d]: %s\n", t, spec);
3639 		mask = strtok_r(spec, "/", &mask_ptr);
3640 		if (mask == NULL)
3641 			break;
3642 		pr_debug2("  maps mask: %s\n", mask);
3643 		tmp_spec = realloc(maps_spec, (nr_spec + 1) * sizeof(char *));
3644 		if (!tmp_spec) {
3645 			pr_err("Failed to reallocate maps spec\n");
3646 			ret = -ENOMEM;
3647 			goto out_free;
3648 		}
3649 		maps_spec = tmp_spec;
3650 		maps_spec[nr_spec] = dup_mask = strdup(mask);
3651 		if (!maps_spec[nr_spec]) {
3652 			pr_err("Failed to allocate maps spec[%d]\n", nr_spec);
3653 			ret = -ENOMEM;
3654 			goto out_free;
3655 		}
3656 		mask = strtok_r(NULL, "/", &mask_ptr);
3657 		if (mask == NULL) {
3658 			pr_err("Invalid thread maps or affinity specs\n");
3659 			ret = -EINVAL;
3660 			goto out_free;
3661 		}
3662 		pr_debug2("  affinity mask: %s\n", mask);
3663 		tmp_spec = realloc(affinity_spec, (nr_spec + 1) * sizeof(char *));
3664 		if (!tmp_spec) {
3665 			pr_err("Failed to reallocate affinity spec\n");
3666 			ret = -ENOMEM;
3667 			goto out_free;
3668 		}
3669 		affinity_spec = tmp_spec;
3670 		affinity_spec[nr_spec] = strdup(mask);
3671 		if (!affinity_spec[nr_spec]) {
3672 			pr_err("Failed to allocate affinity spec[%d]\n", nr_spec);
3673 			ret = -ENOMEM;
3674 			goto out_free;
3675 		}
3676 		dup_mask = NULL;
3677 		nr_spec++;
3678 	}
3679 
3680 	ret = record__init_thread_masks_spec(rec, cpus, (const char **)maps_spec,
3681 					     (const char **)affinity_spec, nr_spec);
3682 
3683 out_free:
3684 	free(dup_mask);
3685 	for (s = 0; s < nr_spec; s++) {
3686 		if (maps_spec)
3687 			free(maps_spec[s]);
3688 		if (affinity_spec)
3689 			free(affinity_spec[s]);
3690 	}
3691 	free(affinity_spec);
3692 	free(maps_spec);
3693 
3694 	return ret;
3695 }
3696 
3697 static int record__init_thread_default_masks(struct record *rec, struct perf_cpu_map *cpus)
3698 {
3699 	int ret;
3700 
3701 	ret = record__alloc_thread_masks(rec, 1, cpu__max_cpu().cpu);
3702 	if (ret)
3703 		return ret;
3704 
3705 	record__mmap_cpu_mask_init(&rec->thread_masks->maps, cpus);
3706 
3707 	rec->nr_threads = 1;
3708 
3709 	return 0;
3710 }
3711 
3712 static int record__init_thread_masks(struct record *rec)
3713 {
3714 	int ret = 0;
3715 	struct perf_cpu_map *cpus = rec->evlist->core.all_cpus;
3716 
3717 	if (!record__threads_enabled(rec))
3718 		return record__init_thread_default_masks(rec, cpus);
3719 
3720 	if (evlist__per_thread(rec->evlist)) {
3721 		pr_err("--per-thread option is mutually exclusive to parallel streaming mode.\n");
3722 		return -EINVAL;
3723 	}
3724 
3725 	switch (rec->opts.threads_spec) {
3726 	case THREAD_SPEC__CPU:
3727 		ret = record__init_thread_cpu_masks(rec, cpus);
3728 		break;
3729 	case THREAD_SPEC__CORE:
3730 		ret = record__init_thread_core_masks(rec, cpus);
3731 		break;
3732 	case THREAD_SPEC__PACKAGE:
3733 		ret = record__init_thread_package_masks(rec, cpus);
3734 		break;
3735 	case THREAD_SPEC__NUMA:
3736 		ret = record__init_thread_numa_masks(rec, cpus);
3737 		break;
3738 	case THREAD_SPEC__USER:
3739 		ret = record__init_thread_user_masks(rec, cpus);
3740 		break;
3741 	default:
3742 		break;
3743 	}
3744 
3745 	return ret;
3746 }
3747 
3748 int cmd_record(int argc, const char **argv)
3749 {
3750 	int err;
3751 	struct record *rec = &record;
3752 	char errbuf[BUFSIZ];
3753 
3754 	setlocale(LC_ALL, "");
3755 
3756 #ifndef HAVE_LIBBPF_SUPPORT
3757 # define set_nobuild(s, l, c) set_option_nobuild(record_options, s, l, "NO_LIBBPF=1", c)
3758 	set_nobuild('\0', "clang-path", true);
3759 	set_nobuild('\0', "clang-opt", true);
3760 # undef set_nobuild
3761 #endif
3762 
3763 #ifndef HAVE_BPF_PROLOGUE
3764 # if !defined (HAVE_DWARF_SUPPORT)
3765 #  define REASON  "NO_DWARF=1"
3766 # elif !defined (HAVE_LIBBPF_SUPPORT)
3767 #  define REASON  "NO_LIBBPF=1"
3768 # else
3769 #  define REASON  "this architecture doesn't support BPF prologue"
3770 # endif
3771 # define set_nobuild(s, l, c) set_option_nobuild(record_options, s, l, REASON, c)
3772 	set_nobuild('\0', "vmlinux", true);
3773 # undef set_nobuild
3774 # undef REASON
3775 #endif
3776 
3777 #ifndef HAVE_BPF_SKEL
3778 # define set_nobuild(s, l, m, c) set_option_nobuild(record_options, s, l, m, c)
3779 	set_nobuild('\0', "off-cpu", "no BUILD_BPF_SKEL=1", true);
3780 # undef set_nobuild
3781 #endif
3782 
3783 	rec->opts.affinity = PERF_AFFINITY_SYS;
3784 
3785 	rec->evlist = evlist__new();
3786 	if (rec->evlist == NULL)
3787 		return -ENOMEM;
3788 
3789 	err = perf_config(perf_record_config, rec);
3790 	if (err)
3791 		return err;
3792 
3793 	argc = parse_options(argc, argv, record_options, record_usage,
3794 			    PARSE_OPT_STOP_AT_NON_OPTION);
3795 	if (quiet)
3796 		perf_quiet_option();
3797 
3798 	err = symbol__validate_sym_arguments();
3799 	if (err)
3800 		return err;
3801 
3802 	perf_debuginfod_setup(&record.debuginfod);
3803 
3804 	/* Make system wide (-a) the default target. */
3805 	if (!argc && target__none(&rec->opts.target))
3806 		rec->opts.target.system_wide = true;
3807 
3808 	if (nr_cgroups && !rec->opts.target.system_wide) {
3809 		usage_with_options_msg(record_usage, record_options,
3810 			"cgroup monitoring only available in system-wide mode");
3811 
3812 	}
3813 
3814 	if (rec->buildid_mmap) {
3815 		if (!perf_can_record_build_id()) {
3816 			pr_err("Failed: no support to record build id in mmap events, update your kernel.\n");
3817 			err = -EINVAL;
3818 			goto out_opts;
3819 		}
3820 		pr_debug("Enabling build id in mmap2 events.\n");
3821 		/* Enable mmap build id synthesizing. */
3822 		symbol_conf.buildid_mmap2 = true;
3823 		/* Enable perf_event_attr::build_id bit. */
3824 		rec->opts.build_id = true;
3825 		/* Disable build id cache. */
3826 		rec->no_buildid = true;
3827 	}
3828 
3829 	if (rec->opts.record_cgroup && !perf_can_record_cgroup()) {
3830 		pr_err("Kernel has no cgroup sampling support.\n");
3831 		err = -EINVAL;
3832 		goto out_opts;
3833 	}
3834 
3835 	if (rec->opts.kcore)
3836 		rec->opts.text_poke = true;
3837 
3838 	if (rec->opts.kcore || record__threads_enabled(rec))
3839 		rec->data.is_dir = true;
3840 
3841 	if (record__threads_enabled(rec)) {
3842 		if (rec->opts.affinity != PERF_AFFINITY_SYS) {
3843 			pr_err("--affinity option is mutually exclusive to parallel streaming mode.\n");
3844 			goto out_opts;
3845 		}
3846 		if (record__aio_enabled(rec)) {
3847 			pr_err("Asynchronous streaming mode (--aio) is mutually exclusive to parallel streaming mode.\n");
3848 			goto out_opts;
3849 		}
3850 	}
3851 
3852 	if (rec->opts.comp_level != 0) {
3853 		pr_debug("Compression enabled, disabling build id collection at the end of the session.\n");
3854 		rec->no_buildid = true;
3855 	}
3856 
3857 	if (rec->opts.record_switch_events &&
3858 	    !perf_can_record_switch_events()) {
3859 		ui__error("kernel does not support recording context switch events\n");
3860 		parse_options_usage(record_usage, record_options, "switch-events", 0);
3861 		err = -EINVAL;
3862 		goto out_opts;
3863 	}
3864 
3865 	if (switch_output_setup(rec)) {
3866 		parse_options_usage(record_usage, record_options, "switch-output", 0);
3867 		err = -EINVAL;
3868 		goto out_opts;
3869 	}
3870 
3871 	if (rec->switch_output.time) {
3872 		signal(SIGALRM, alarm_sig_handler);
3873 		alarm(rec->switch_output.time);
3874 	}
3875 
3876 	if (rec->switch_output.num_files) {
3877 		rec->switch_output.filenames = calloc(sizeof(char *),
3878 						      rec->switch_output.num_files);
3879 		if (!rec->switch_output.filenames) {
3880 			err = -EINVAL;
3881 			goto out_opts;
3882 		}
3883 	}
3884 
3885 	if (rec->timestamp_filename && record__threads_enabled(rec)) {
3886 		rec->timestamp_filename = false;
3887 		pr_warning("WARNING: --timestamp-filename option is not available in parallel streaming mode.\n");
3888 	}
3889 
3890 	/*
3891 	 * Allow aliases to facilitate the lookup of symbols for address
3892 	 * filters. Refer to auxtrace_parse_filters().
3893 	 */
3894 	symbol_conf.allow_aliases = true;
3895 
3896 	symbol__init(NULL);
3897 
3898 	err = record__auxtrace_init(rec);
3899 	if (err)
3900 		goto out;
3901 
3902 	if (dry_run)
3903 		goto out;
3904 
3905 	err = bpf__setup_stdout(rec->evlist);
3906 	if (err) {
3907 		bpf__strerror_setup_stdout(rec->evlist, err, errbuf, sizeof(errbuf));
3908 		pr_err("ERROR: Setup BPF stdout failed: %s\n",
3909 			 errbuf);
3910 		goto out;
3911 	}
3912 
3913 	err = -ENOMEM;
3914 
3915 	if (rec->no_buildid_cache || rec->no_buildid) {
3916 		disable_buildid_cache();
3917 	} else if (rec->switch_output.enabled) {
3918 		/*
3919 		 * In 'perf record --switch-output', disable buildid
3920 		 * generation by default to reduce data file switching
3921 		 * overhead. Still generate buildid if they are required
3922 		 * explicitly using
3923 		 *
3924 		 *  perf record --switch-output --no-no-buildid \
3925 		 *              --no-no-buildid-cache
3926 		 *
3927 		 * Following code equals to:
3928 		 *
3929 		 * if ((rec->no_buildid || !rec->no_buildid_set) &&
3930 		 *     (rec->no_buildid_cache || !rec->no_buildid_cache_set))
3931 		 *         disable_buildid_cache();
3932 		 */
3933 		bool disable = true;
3934 
3935 		if (rec->no_buildid_set && !rec->no_buildid)
3936 			disable = false;
3937 		if (rec->no_buildid_cache_set && !rec->no_buildid_cache)
3938 			disable = false;
3939 		if (disable) {
3940 			rec->no_buildid = true;
3941 			rec->no_buildid_cache = true;
3942 			disable_buildid_cache();
3943 		}
3944 	}
3945 
3946 	if (record.opts.overwrite)
3947 		record.opts.tail_synthesize = true;
3948 
3949 	if (rec->evlist->core.nr_entries == 0) {
3950 		if (perf_pmu__has_hybrid()) {
3951 			err = evlist__add_default_hybrid(rec->evlist,
3952 							 !record.opts.no_samples);
3953 		} else {
3954 			err = __evlist__add_default(rec->evlist,
3955 						    !record.opts.no_samples);
3956 		}
3957 
3958 		if (err < 0) {
3959 			pr_err("Not enough memory for event selector list\n");
3960 			goto out;
3961 		}
3962 	}
3963 
3964 	if (rec->opts.target.tid && !rec->opts.no_inherit_set)
3965 		rec->opts.no_inherit = true;
3966 
3967 	err = target__validate(&rec->opts.target);
3968 	if (err) {
3969 		target__strerror(&rec->opts.target, err, errbuf, BUFSIZ);
3970 		ui__warning("%s\n", errbuf);
3971 	}
3972 
3973 	err = target__parse_uid(&rec->opts.target);
3974 	if (err) {
3975 		int saved_errno = errno;
3976 
3977 		target__strerror(&rec->opts.target, err, errbuf, BUFSIZ);
3978 		ui__error("%s", errbuf);
3979 
3980 		err = -saved_errno;
3981 		goto out;
3982 	}
3983 
3984 	/* Enable ignoring missing threads when -u/-p option is defined. */
3985 	rec->opts.ignore_missing_thread = rec->opts.target.uid != UINT_MAX || rec->opts.target.pid;
3986 
3987 	if (evlist__fix_hybrid_cpus(rec->evlist, rec->opts.target.cpu_list)) {
3988 		pr_err("failed to use cpu list %s\n",
3989 		       rec->opts.target.cpu_list);
3990 		goto out;
3991 	}
3992 
3993 	rec->opts.target.hybrid = perf_pmu__has_hybrid();
3994 
3995 	if (callchain_param.enabled && callchain_param.record_mode == CALLCHAIN_FP)
3996 		arch__add_leaf_frame_record_opts(&rec->opts);
3997 
3998 	err = -ENOMEM;
3999 	if (evlist__create_maps(rec->evlist, &rec->opts.target) < 0)
4000 		usage_with_options(record_usage, record_options);
4001 
4002 	err = auxtrace_record__options(rec->itr, rec->evlist, &rec->opts);
4003 	if (err)
4004 		goto out;
4005 
4006 	/*
4007 	 * We take all buildids when the file contains
4008 	 * AUX area tracing data because we do not decode the
4009 	 * trace because it would take too long.
4010 	 */
4011 	if (rec->opts.full_auxtrace)
4012 		rec->buildid_all = true;
4013 
4014 	if (rec->opts.text_poke) {
4015 		err = record__config_text_poke(rec->evlist);
4016 		if (err) {
4017 			pr_err("record__config_text_poke failed, error %d\n", err);
4018 			goto out;
4019 		}
4020 	}
4021 
4022 	if (rec->off_cpu) {
4023 		err = record__config_off_cpu(rec);
4024 		if (err) {
4025 			pr_err("record__config_off_cpu failed, error %d\n", err);
4026 			goto out;
4027 		}
4028 	}
4029 
4030 	if (record_opts__config(&rec->opts)) {
4031 		err = -EINVAL;
4032 		goto out;
4033 	}
4034 
4035 	err = record__init_thread_masks(rec);
4036 	if (err) {
4037 		pr_err("Failed to initialize parallel data streaming masks\n");
4038 		goto out;
4039 	}
4040 
4041 	if (rec->opts.nr_cblocks > nr_cblocks_max)
4042 		rec->opts.nr_cblocks = nr_cblocks_max;
4043 	pr_debug("nr_cblocks: %d\n", rec->opts.nr_cblocks);
4044 
4045 	pr_debug("affinity: %s\n", affinity_tags[rec->opts.affinity]);
4046 	pr_debug("mmap flush: %d\n", rec->opts.mmap_flush);
4047 
4048 	if (rec->opts.comp_level > comp_level_max)
4049 		rec->opts.comp_level = comp_level_max;
4050 	pr_debug("comp level: %d\n", rec->opts.comp_level);
4051 
4052 	err = __cmd_record(&record, argc, argv);
4053 out:
4054 	evlist__delete(rec->evlist);
4055 	symbol__exit();
4056 	auxtrace_record__free(rec->itr);
4057 out_opts:
4058 	record__free_thread_masks(rec, rec->nr_threads);
4059 	rec->nr_threads = 0;
4060 	evlist__close_control(rec->opts.ctl_fd, rec->opts.ctl_fd_ack, &rec->opts.ctl_fd_close);
4061 	return err;
4062 }
4063 
4064 static void snapshot_sig_handler(int sig __maybe_unused)
4065 {
4066 	struct record *rec = &record;
4067 
4068 	hit_auxtrace_snapshot_trigger(rec);
4069 
4070 	if (switch_output_signal(rec))
4071 		trigger_hit(&switch_output_trigger);
4072 }
4073 
4074 static void alarm_sig_handler(int sig __maybe_unused)
4075 {
4076 	struct record *rec = &record;
4077 
4078 	if (switch_output_time(rec))
4079 		trigger_hit(&switch_output_trigger);
4080 }
4081