xref: /openbmc/linux/tools/perf/builtin-record.c (revision b8265621)
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * builtin-record.c
4  *
5  * Builtin record command: Record the profile of a workload
6  * (or a CPU, or a PID) into the perf.data output file - for
7  * later analysis via perf report.
8  */
9 #include "builtin.h"
10 
11 #include "util/build-id.h"
12 #include <subcmd/parse-options.h>
13 #include "util/parse-events.h"
14 #include "util/config.h"
15 
16 #include "util/callchain.h"
17 #include "util/cgroup.h"
18 #include "util/header.h"
19 #include "util/event.h"
20 #include "util/evlist.h"
21 #include "util/evsel.h"
22 #include "util/debug.h"
23 #include "util/mmap.h"
24 #include "util/target.h"
25 #include "util/session.h"
26 #include "util/tool.h"
27 #include "util/symbol.h"
28 #include "util/record.h"
29 #include "util/cpumap.h"
30 #include "util/thread_map.h"
31 #include "util/data.h"
32 #include "util/perf_regs.h"
33 #include "util/auxtrace.h"
34 #include "util/tsc.h"
35 #include "util/parse-branch-options.h"
36 #include "util/parse-regs-options.h"
37 #include "util/perf_api_probe.h"
38 #include "util/llvm-utils.h"
39 #include "util/bpf-loader.h"
40 #include "util/trigger.h"
41 #include "util/perf-hooks.h"
42 #include "util/cpu-set-sched.h"
43 #include "util/synthetic-events.h"
44 #include "util/time-utils.h"
45 #include "util/units.h"
46 #include "util/bpf-event.h"
47 #include "util/util.h"
48 #include "util/pfm.h"
49 #include "asm/bug.h"
50 #include "perf.h"
51 
52 #include <errno.h>
53 #include <inttypes.h>
54 #include <locale.h>
55 #include <poll.h>
56 #include <pthread.h>
57 #include <unistd.h>
58 #include <sched.h>
59 #include <signal.h>
60 #ifdef HAVE_EVENTFD_SUPPORT
61 #include <sys/eventfd.h>
62 #endif
63 #include <sys/mman.h>
64 #include <sys/wait.h>
65 #include <sys/types.h>
66 #include <sys/stat.h>
67 #include <fcntl.h>
68 #include <linux/err.h>
69 #include <linux/string.h>
70 #include <linux/time64.h>
71 #include <linux/zalloc.h>
72 #include <linux/bitmap.h>
73 
74 struct switch_output {
75 	bool		 enabled;
76 	bool		 signal;
77 	unsigned long	 size;
78 	unsigned long	 time;
79 	const char	*str;
80 	bool		 set;
81 	char		 **filenames;
82 	int		 num_files;
83 	int		 cur_file;
84 };
85 
86 struct record {
87 	struct perf_tool	tool;
88 	struct record_opts	opts;
89 	u64			bytes_written;
90 	struct perf_data	data;
91 	struct auxtrace_record	*itr;
92 	struct evlist	*evlist;
93 	struct perf_session	*session;
94 	struct evlist		*sb_evlist;
95 	pthread_t		thread_id;
96 	int			realtime_prio;
97 	bool			switch_output_event_set;
98 	bool			no_buildid;
99 	bool			no_buildid_set;
100 	bool			no_buildid_cache;
101 	bool			no_buildid_cache_set;
102 	bool			buildid_all;
103 	bool			timestamp_filename;
104 	bool			timestamp_boundary;
105 	struct switch_output	switch_output;
106 	unsigned long long	samples;
107 	struct mmap_cpu_mask	affinity_mask;
108 	unsigned long		output_max_size;	/* = 0: unlimited */
109 };
110 
111 static volatile int done;
112 
113 static volatile int auxtrace_record__snapshot_started;
114 static DEFINE_TRIGGER(auxtrace_snapshot_trigger);
115 static DEFINE_TRIGGER(switch_output_trigger);
116 
117 static const char *affinity_tags[PERF_AFFINITY_MAX] = {
118 	"SYS", "NODE", "CPU"
119 };
120 
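/*
 * Helpers deciding when 'perf record --switch-output' should rotate the
 * perf.data file: on a SIGUSR2 signal, once the written size crosses the
 * configured threshold, or on a timer.  record__output_max_size_exceeded()
 * similarly bounds the total output size (0 means unlimited).
 */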
121 static bool switch_output_signal(struct record *rec)
122 {
123 	return rec->switch_output.signal &&
124 	       trigger_is_ready(&switch_output_trigger);
125 }
126 
127 static bool switch_output_size(struct record *rec)
128 {
129 	return rec->switch_output.size &&
130 	       trigger_is_ready(&switch_output_trigger) &&
131 	       (rec->bytes_written >= rec->switch_output.size);
132 }
133 
134 static bool switch_output_time(struct record *rec)
135 {
136 	return rec->switch_output.time &&
137 	       trigger_is_ready(&switch_output_trigger);
138 }
139 
140 static bool record__output_max_size_exceeded(struct record *rec)
141 {
142 	return rec->output_max_size &&
143 	       (rec->bytes_written >= rec->output_max_size);
144 }
145 
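/*
 * Append a block of data to the perf.data file, account the bytes written,
 * stop the session once the output size limit is exceeded and arm the
 * size-based output switch when its threshold is reached.
 */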
146 static int record__write(struct record *rec, struct mmap *map __maybe_unused,
147 			 void *bf, size_t size)
148 {
149 	struct perf_data_file *file = &rec->session->data->file;
150 
151 	if (perf_data_file__write(file, bf, size) < 0) {
152 		pr_err("failed to write perf data, error: %m\n");
153 		return -1;
154 	}
155 
156 	rec->bytes_written += size;
157 
158 	if (record__output_max_size_exceeded(rec) && !done) {
159 		fprintf(stderr, "[ perf record: perf size limit reached (%" PRIu64 " KB),"
160 				" stopping session ]\n",
161 				rec->bytes_written >> 10);
162 		done = 1;
163 	}
164 
165 	if (switch_output_size(rec))
166 		trigger_hit(&switch_output_trigger);
167 
168 	return 0;
169 }
170 
171 static int record__aio_enabled(struct record *rec);
172 static int record__comp_enabled(struct record *rec);
173 static size_t zstd_compress(struct perf_session *session, void *dst, size_t dst_size,
174 			    void *src, size_t src_size);
175 
176 #ifdef HAVE_AIO_SUPPORT
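/*
 * Queue one asynchronous write of @size bytes at offset @off of the trace
 * file, retrying for as long as aio_write() fails with EAGAIN.
 */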
177 static int record__aio_write(struct aiocb *cblock, int trace_fd,
178 		void *buf, size_t size, off_t off)
179 {
180 	int rc;
181 
182 	cblock->aio_fildes = trace_fd;
183 	cblock->aio_buf    = buf;
184 	cblock->aio_nbytes = size;
185 	cblock->aio_offset = off;
186 	cblock->aio_sigevent.sigev_notify = SIGEV_NONE;
187 
188 	do {
189 		rc = aio_write(cblock);
190 		if (rc == 0) {
191 			break;
192 		} else if (errno != EAGAIN) {
193 			cblock->aio_fildes = -1;
194 			pr_err("failed to queue perf data, error: %m\n");
195 			break;
196 		}
197 	} while (1);
198 
199 	return rc;
200 }
201 
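/*
 * Check whether the aio write tracked by @cblock has finished.  Returns 0 if
 * it is still in flight (or was restarted for the unwritten remainder) and 1
 * when the request is fully complete and the control block can be reused.
 */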
202 static int record__aio_complete(struct mmap *md, struct aiocb *cblock)
203 {
204 	void *rem_buf;
205 	off_t rem_off;
206 	size_t rem_size;
207 	int rc, aio_errno;
208 	ssize_t aio_ret, written;
209 
210 	aio_errno = aio_error(cblock);
211 	if (aio_errno == EINPROGRESS)
212 		return 0;
213 
214 	written = aio_ret = aio_return(cblock);
215 	if (aio_ret < 0) {
216 		if (aio_errno != EINTR)
217 			pr_err("failed to write perf data, error: %m\n");
218 		written = 0;
219 	}
220 
221 	rem_size = cblock->aio_nbytes - written;
222 
223 	if (rem_size == 0) {
224 		cblock->aio_fildes = -1;
225 		/*
226 		 * md->refcount is incremented in record__aio_pushfn() for
227 		 * every aio write request started in record__aio_push() so
228 		 * decrement it because the request is now complete.
229 		 */
230 		perf_mmap__put(&md->core);
231 		rc = 1;
232 	} else {
233 		/*
234 		 * The aio write request may require a restart with the
235 		 * remainder if the kernel didn't write the whole
236 		 * chunk at once.
237 		 */
238 		rem_off = cblock->aio_offset + written;
239 		rem_buf = (void *)(cblock->aio_buf + written);
240 		record__aio_write(cblock, cblock->aio_fildes,
241 				rem_buf, rem_size, rem_off);
242 		rc = 0;
243 	}
244 
245 	return rc;
246 }
247 
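/*
 * Wait for in-flight aio writes on @md.  With sync_all == false return the
 * index of the first free control block; otherwise keep suspending until
 * every outstanding request has completed.
 */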
248 static int record__aio_sync(struct mmap *md, bool sync_all)
249 {
250 	struct aiocb **aiocb = md->aio.aiocb;
251 	struct aiocb *cblocks = md->aio.cblocks;
252 	struct timespec timeout = { 0, 1000 * 1000  * 1 }; /* 1ms */
253 	int i, do_suspend;
254 
255 	do {
256 		do_suspend = 0;
257 		for (i = 0; i < md->aio.nr_cblocks; ++i) {
258 			if (cblocks[i].aio_fildes == -1 || record__aio_complete(md, &cblocks[i])) {
259 				if (sync_all)
260 					aiocb[i] = NULL;
261 				else
262 					return i;
263 			} else {
264 				/*
265 				 * The started aio write is not complete yet,
266 				 * so it has to be waited for before the
267 				 * next allocation.
268 				 */
269 				aiocb[i] = &cblocks[i];
270 				do_suspend = 1;
271 			}
272 		}
273 		if (!do_suspend)
274 			return -1;
275 
276 		while (aio_suspend((const struct aiocb **)aiocb, md->aio.nr_cblocks, &timeout)) {
277 			if (!(errno == EAGAIN || errno == EINTR))
278 				pr_err("failed to sync perf data, error: %m\n");
279 		}
280 	} while (1);
281 }
282 
283 struct record_aio {
284 	struct record	*rec;
285 	void		*data;
286 	size_t		size;
287 };
288 
289 static int record__aio_pushfn(struct mmap *map, void *to, void *buf, size_t size)
290 {
291 	struct record_aio *aio = to;
292 
293 	/*
294 	 * The map->core.base data pointed to by buf is copied into a free
295 	 * map->aio.data[] buffer to release space in the kernel buffer as fast
296 	 * as possible, by calling perf_mmap__consume() from perf_mmap__push().
297 	 *
298 	 * That lets the kernel proceed with storing more profiling data into
299 	 * the kernel buffer earlier than other per-cpu kernel buffers are handled.
300 	 *
301 	 * Copying can be done in two steps in case the chunk of profiling data
302 	 * crosses the upper bound of the kernel buffer. In this case we first move
303 	 * part of the data from map->start till the upper bound and then the
304 	 * remainder from the beginning of the kernel buffer till the end of the data chunk.
305 	 */
306 
307 	if (record__comp_enabled(aio->rec)) {
308 		size = zstd_compress(aio->rec->session, aio->data + aio->size,
309 				     mmap__mmap_len(map) - aio->size,
310 				     buf, size);
311 	} else {
312 		memcpy(aio->data + aio->size, buf, size);
313 	}
314 
315 	if (!aio->size) {
316 		/*
317 		 * Increment map->refcount to guard the map->aio.data[] buffer
318 		 * from premature deallocation, because the map object can be
319 		 * released before the aio write request started on the
320 		 * map->aio.data[] buffer is complete.
321 		 *
322 		 * perf_mmap__put() is done in record__aio_complete()
323 		 * after the started aio request completes, or in record__aio_push()
324 		 * if the request failed to start.
325 		 */
326 		perf_mmap__get(&map->core);
327 	}
328 
329 	aio->size += size;
330 
331 	return size;
332 }
333 
334 static int record__aio_push(struct record *rec, struct mmap *map, off_t *off)
335 {
336 	int ret, idx;
337 	int trace_fd = rec->session->data->file.fd;
338 	struct record_aio aio = { .rec = rec, .size = 0 };
339 
340 	/*
341 	 * Call record__aio_sync() to wait till a map->aio.data[] buffer
342 	 * becomes available after the previous aio write operation.
343 	 */
344 
345 	idx = record__aio_sync(map, false);
346 	aio.data = map->aio.data[idx];
347 	ret = perf_mmap__push(map, &aio, record__aio_pushfn);
348 	if (ret != 0) /* ret > 0 - no data, ret < 0 - error */
349 		return ret;
350 
351 	rec->samples++;
352 	ret = record__aio_write(&(map->aio.cblocks[idx]), trace_fd, aio.data, aio.size, *off);
353 	if (!ret) {
354 		*off += aio.size;
355 		rec->bytes_written += aio.size;
356 		if (switch_output_size(rec))
357 			trigger_hit(&switch_output_trigger);
358 	} else {
359 		/*
360 		 * Decrement map->refcount, incremented in record__aio_pushfn(),
361 		 * if the record__aio_write() operation failed to start; otherwise
362 		 * map->refcount is decremented in record__aio_complete() after
363 		 * the aio write operation finishes successfully.
364 		 */
365 		perf_mmap__put(&map->core);
366 	}
367 
368 	return ret;
369 }
370 
371 static off_t record__aio_get_pos(int trace_fd)
372 {
373 	return lseek(trace_fd, 0, SEEK_CUR);
374 }
375 
376 static void record__aio_set_pos(int trace_fd, off_t pos)
377 {
378 	lseek(trace_fd, pos, SEEK_SET);
379 }
380 
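/*
 * Drain all outstanding aio writes, used before switching output files and
 * at the end of the session.
 */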
381 static void record__aio_mmap_read_sync(struct record *rec)
382 {
383 	int i;
384 	struct evlist *evlist = rec->evlist;
385 	struct mmap *maps = evlist->mmap;
386 
387 	if (!record__aio_enabled(rec))
388 		return;
389 
390 	for (i = 0; i < evlist->core.nr_mmaps; i++) {
391 		struct mmap *map = &maps[i];
392 
393 		if (map->core.base)
394 			record__aio_sync(map, true);
395 	}
396 }
397 
398 static int nr_cblocks_default = 1;
399 static int nr_cblocks_max = 4;
400 
401 static int record__aio_parse(const struct option *opt,
402 			     const char *str,
403 			     int unset)
404 {
405 	struct record_opts *opts = (struct record_opts *)opt->value;
406 
407 	if (unset) {
408 		opts->nr_cblocks = 0;
409 	} else {
410 		if (str)
411 			opts->nr_cblocks = strtol(str, NULL, 0);
412 		if (!opts->nr_cblocks)
413 			opts->nr_cblocks = nr_cblocks_default;
414 	}
415 
416 	return 0;
417 }
418 #else /* HAVE_AIO_SUPPORT */
419 static int nr_cblocks_max = 0;
420 
421 static int record__aio_push(struct record *rec __maybe_unused, struct mmap *map __maybe_unused,
422 			    off_t *off __maybe_unused)
423 {
424 	return -1;
425 }
426 
427 static off_t record__aio_get_pos(int trace_fd __maybe_unused)
428 {
429 	return -1;
430 }
431 
432 static void record__aio_set_pos(int trace_fd __maybe_unused, off_t pos __maybe_unused)
433 {
434 }
435 
436 static void record__aio_mmap_read_sync(struct record *rec __maybe_unused)
437 {
438 }
439 #endif
440 
441 static int record__aio_enabled(struct record *rec)
442 {
443 	return rec->opts.nr_cblocks > 0;
444 }
445 
446 #define MMAP_FLUSH_DEFAULT 1
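/*
 * Parse the minimal number of bytes that is extracted from the mmap ring
 * buffer at once (accepting B/K/M/G suffixes) and clamp it to a quarter of
 * the mmap buffer size.
 */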
447 static int record__mmap_flush_parse(const struct option *opt,
448 				    const char *str,
449 				    int unset)
450 {
451 	int flush_max;
452 	struct record_opts *opts = (struct record_opts *)opt->value;
453 	static struct parse_tag tags[] = {
454 			{ .tag  = 'B', .mult = 1       },
455 			{ .tag  = 'K', .mult = 1 << 10 },
456 			{ .tag  = 'M', .mult = 1 << 20 },
457 			{ .tag  = 'G', .mult = 1 << 30 },
458 			{ .tag  = 0 },
459 	};
460 
461 	if (unset)
462 		return 0;
463 
464 	if (str) {
465 		opts->mmap_flush = parse_tag_value(str, tags);
466 		if (opts->mmap_flush == (int)-1)
467 			opts->mmap_flush = strtol(str, NULL, 0);
468 	}
469 
470 	if (!opts->mmap_flush)
471 		opts->mmap_flush = MMAP_FLUSH_DEFAULT;
472 
473 	flush_max = evlist__mmap_size(opts->mmap_pages);
474 	flush_max /= 4;
475 	if (opts->mmap_flush > flush_max)
476 		opts->mmap_flush = flush_max;
477 
478 	return 0;
479 }
480 
481 #ifdef HAVE_ZSTD_SUPPORT
482 static unsigned int comp_level_default = 1;
483 
484 static int record__parse_comp_level(const struct option *opt, const char *str, int unset)
485 {
486 	struct record_opts *opts = opt->value;
487 
488 	if (unset) {
489 		opts->comp_level = 0;
490 	} else {
491 		if (str)
492 			opts->comp_level = strtol(str, NULL, 0);
493 		if (!opts->comp_level)
494 			opts->comp_level = comp_level_default;
495 	}
496 
497 	return 0;
498 }
499 #endif
500 static unsigned int comp_level_max = 22;
501 
502 static int record__comp_enabled(struct record *rec)
503 {
504 	return rec->opts.comp_level > 0;
505 }
506 
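/* Route synthesized events straight into the perf.data output. */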
507 static int process_synthesized_event(struct perf_tool *tool,
508 				     union perf_event *event,
509 				     struct perf_sample *sample __maybe_unused,
510 				     struct machine *machine __maybe_unused)
511 {
512 	struct record *rec = container_of(tool, struct record, tool);
513 	return record__write(rec, NULL, event, event->header.size);
514 }
515 
516 static int process_locked_synthesized_event(struct perf_tool *tool,
517 				     union perf_event *event,
518 				     struct perf_sample *sample __maybe_unused,
519 				     struct machine *machine __maybe_unused)
520 {
521 	static pthread_mutex_t synth_lock = PTHREAD_MUTEX_INITIALIZER;
522 	int ret;
523 
524 	pthread_mutex_lock(&synth_lock);
525 	ret = process_synthesized_event(tool, event, sample, machine);
526 	pthread_mutex_unlock(&synth_lock);
527 	return ret;
528 }
529 
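/*
 * perf_mmap__push() callback for the synchronous (non-aio) path: optionally
 * compress the chunk into the map's scratch buffer, then write it out.
 */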
530 static int record__pushfn(struct mmap *map, void *to, void *bf, size_t size)
531 {
532 	struct record *rec = to;
533 
534 	if (record__comp_enabled(rec)) {
535 		size = zstd_compress(rec->session, map->data, mmap__mmap_len(map), bf, size);
536 		bf   = map->data;
537 	}
538 
539 	rec->samples++;
540 	return record__write(rec, map, bf, size);
541 }
542 
543 static volatile int signr = -1;
544 static volatile int child_finished;
545 #ifdef HAVE_EVENTFD_SUPPORT
546 static int done_fd = -1;
547 #endif
548 
549 static void sig_handler(int sig)
550 {
551 	if (sig == SIGCHLD)
552 		child_finished = 1;
553 	else
554 		signr = sig;
555 
556 	done = 1;
557 #ifdef HAVE_EVENTFD_SUPPORT
558 {
559 	u64 tmp = 1;
560 	/*
561 	 * It is possible for this signal handler to run after done is checked
562 	 * in the main loop, but before the perf counter fds are polled. If this
563 	 * happens, the poll() will continue to wait even though done is set,
564 	 * and will only break out if either another signal is received, or the
565 	 * counters are ready for read. To ensure the poll() doesn't sleep when
566 	 * done is set, use an eventfd (done_fd) to wake up the poll().
567 	 */
568 	if (write(done_fd, &tmp, sizeof(tmp)) < 0)
569 		pr_err("failed to signal wakeup fd, error: %m\n");
570 }
571 #endif // HAVE_EVENTFD_SUPPORT
572 }
573 
574 static void sigsegv_handler(int sig)
575 {
576 	perf_hooks__recover();
577 	sighandler_dump_stack(sig);
578 }
579 
580 static void record__sig_exit(void)
581 {
582 	if (signr == -1)
583 		return;
584 
585 	signal(signr, SIG_DFL);
586 	raise(signr);
587 }
588 
589 #ifdef HAVE_AUXTRACE_SUPPORT
590 
591 static int record__process_auxtrace(struct perf_tool *tool,
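/*
 * Write one AUX area tracing event plus its data (possibly split in two
 * chunks by the ring buffer wrap-around) to the output, padding to an
 * 8-byte boundary.  For single-file output the file offset is also added
 * to the auxtrace index.
 */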
592 				    struct mmap *map,
593 				    union perf_event *event, void *data1,
594 				    size_t len1, void *data2, size_t len2)
595 {
596 	struct record *rec = container_of(tool, struct record, tool);
597 	struct perf_data *data = &rec->data;
598 	size_t padding;
599 	u8 pad[8] = {0};
600 
601 	if (!perf_data__is_pipe(data) && perf_data__is_single_file(data)) {
602 		off_t file_offset;
603 		int fd = perf_data__fd(data);
604 		int err;
605 
606 		file_offset = lseek(fd, 0, SEEK_CUR);
607 		if (file_offset == -1)
608 			return -1;
609 		err = auxtrace_index__auxtrace_event(&rec->session->auxtrace_index,
610 						     event, file_offset);
611 		if (err)
612 			return err;
613 	}
614 
615 	/* event.auxtrace.size includes padding, see __auxtrace_mmap__read() */
616 	padding = (len1 + len2) & 7;
617 	if (padding)
618 		padding = 8 - padding;
619 
620 	record__write(rec, map, event, event->header.size);
621 	record__write(rec, map, data1, len1);
622 	if (len2)
623 		record__write(rec, map, data2, len2);
624 	record__write(rec, map, &pad, padding);
625 
626 	return 0;
627 }
628 
629 static int record__auxtrace_mmap_read(struct record *rec,
630 				      struct mmap *map)
631 {
632 	int ret;
633 
634 	ret = auxtrace_mmap__read(map, rec->itr, &rec->tool,
635 				  record__process_auxtrace);
636 	if (ret < 0)
637 		return ret;
638 
639 	if (ret)
640 		rec->samples++;
641 
642 	return 0;
643 }
644 
645 static int record__auxtrace_mmap_read_snapshot(struct record *rec,
646 					       struct mmap *map)
647 {
648 	int ret;
649 
650 	ret = auxtrace_mmap__read_snapshot(map, rec->itr, &rec->tool,
651 					   record__process_auxtrace,
652 					   rec->opts.auxtrace_snapshot_size);
653 	if (ret < 0)
654 		return ret;
655 
656 	if (ret)
657 		rec->samples++;
658 
659 	return 0;
660 }
661 
662 static int record__auxtrace_read_snapshot_all(struct record *rec)
663 {
664 	int i;
665 	int rc = 0;
666 
667 	for (i = 0; i < rec->evlist->core.nr_mmaps; i++) {
668 		struct mmap *map = &rec->evlist->mmap[i];
669 
670 		if (!map->auxtrace_mmap.base)
671 			continue;
672 
673 		if (record__auxtrace_mmap_read_snapshot(rec, map) != 0) {
674 			rc = -1;
675 			goto out;
676 		}
677 	}
678 out:
679 	return rc;
680 }
681 
682 static void record__read_auxtrace_snapshot(struct record *rec, bool on_exit)
683 {
684 	pr_debug("Recording AUX area tracing snapshot\n");
685 	if (record__auxtrace_read_snapshot_all(rec) < 0) {
686 		trigger_error(&auxtrace_snapshot_trigger);
687 	} else {
688 		if (auxtrace_record__snapshot_finish(rec->itr, on_exit))
689 			trigger_error(&auxtrace_snapshot_trigger);
690 		else
691 			trigger_ready(&auxtrace_snapshot_trigger);
692 	}
693 }
694 
695 static int record__auxtrace_snapshot_exit(struct record *rec)
696 {
697 	if (trigger_is_error(&auxtrace_snapshot_trigger))
698 		return 0;
699 
700 	if (!auxtrace_record__snapshot_started &&
701 	    auxtrace_record__snapshot_start(rec->itr))
702 		return -1;
703 
704 	record__read_auxtrace_snapshot(rec, true);
705 	if (trigger_is_error(&auxtrace_snapshot_trigger))
706 		return -1;
707 
708 	return 0;
709 }
710 
711 static int record__auxtrace_init(struct record *rec)
712 {
713 	int err;
714 
715 	if (!rec->itr) {
716 		rec->itr = auxtrace_record__init(rec->evlist, &err);
717 		if (err)
718 			return err;
719 	}
720 
721 	err = auxtrace_parse_snapshot_options(rec->itr, &rec->opts,
722 					      rec->opts.auxtrace_snapshot_opts);
723 	if (err)
724 		return err;
725 
726 	err = auxtrace_parse_sample_options(rec->itr, rec->evlist, &rec->opts,
727 					    rec->opts.auxtrace_sample_opts);
728 	if (err)
729 		return err;
730 
731 	return auxtrace_parse_filters(rec->evlist);
732 }
733 
734 #else
735 
736 static inline
737 int record__auxtrace_mmap_read(struct record *rec __maybe_unused,
738 			       struct mmap *map __maybe_unused)
739 {
740 	return 0;
741 }
742 
743 static inline
744 void record__read_auxtrace_snapshot(struct record *rec __maybe_unused,
745 				    bool on_exit __maybe_unused)
746 {
747 }
748 
749 static inline
750 int auxtrace_record__snapshot_start(struct auxtrace_record *itr __maybe_unused)
751 {
752 	return 0;
753 }
754 
755 static inline
756 int record__auxtrace_snapshot_exit(struct record *rec __maybe_unused)
757 {
758 	return 0;
759 }
760 
761 static int record__auxtrace_init(struct record *rec __maybe_unused)
762 {
763 	return 0;
764 }
765 
766 #endif
767 
768 static bool record__kcore_readable(struct machine *machine)
769 {
770 	char kcore[PATH_MAX];
771 	int fd;
772 
773 	scnprintf(kcore, sizeof(kcore), "%s/proc/kcore", machine->root_dir);
774 
775 	fd = open(kcore, O_RDONLY);
776 	if (fd < 0)
777 		return false;
778 
779 	close(fd);
780 
781 	return true;
782 }
783 
784 static int record__kcore_copy(struct machine *machine, struct perf_data *data)
785 {
786 	char from_dir[PATH_MAX];
787 	char kcore_dir[PATH_MAX];
788 	int ret;
789 
790 	snprintf(from_dir, sizeof(from_dir), "%s/proc", machine->root_dir);
791 
792 	ret = perf_data__make_kcore_dir(data, kcore_dir, sizeof(kcore_dir));
793 	if (ret)
794 		return ret;
795 
796 	return kcore_copy(from_dir, kcore_dir);
797 }
798 
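/*
 * mmap the per-CPU ring buffers (and AUX area buffers) according to the
 * record options; AUX buffers are mapped for overwrite in snapshot or
 * sample mode.
 */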
799 static int record__mmap_evlist(struct record *rec,
800 			       struct evlist *evlist)
801 {
802 	struct record_opts *opts = &rec->opts;
803 	bool auxtrace_overwrite = opts->auxtrace_snapshot_mode ||
804 				  opts->auxtrace_sample_mode;
805 	char msg[512];
806 
807 	if (opts->affinity != PERF_AFFINITY_SYS)
808 		cpu__setup_cpunode_map();
809 
810 	if (evlist__mmap_ex(evlist, opts->mmap_pages,
811 				 opts->auxtrace_mmap_pages,
812 				 auxtrace_overwrite,
813 				 opts->nr_cblocks, opts->affinity,
814 				 opts->mmap_flush, opts->comp_level) < 0) {
815 		if (errno == EPERM) {
816 			pr_err("Permission error mapping pages.\n"
817 			       "Consider increasing "
818 			       "/proc/sys/kernel/perf_event_mlock_kb,\n"
819 			       "or try again with a smaller value of -m/--mmap_pages.\n"
820 			       "(current value: %u,%u)\n",
821 			       opts->mmap_pages, opts->auxtrace_mmap_pages);
822 			return -errno;
823 		} else {
824 			pr_err("failed to mmap with %d (%s)\n", errno,
825 				str_error_r(errno, msg, sizeof(msg)));
826 			if (errno)
827 				return -errno;
828 			else
829 				return -EINVAL;
830 		}
831 	}
832 	return 0;
833 }
834 
835 static int record__mmap(struct record *rec)
836 {
837 	return record__mmap_evlist(rec, rec->evlist);
838 }
839 
840 static int record__open(struct record *rec)
841 {
842 	char msg[BUFSIZ];
843 	struct evsel *pos;
844 	struct evlist *evlist = rec->evlist;
845 	struct perf_session *session = rec->session;
846 	struct record_opts *opts = &rec->opts;
847 	int rc = 0;
848 
849 	/*
850 	 * For initial_delay or system-wide recording, we need to add a dummy
851 	 * event so that we can track PERF_RECORD_MMAP to cover the delay of
852 	 * waiting or of event synthesis.
853 	 */
854 	if (opts->initial_delay || target__has_cpu(&opts->target)) {
855 		pos = perf_evlist__get_tracking_event(evlist);
856 		if (!evsel__is_dummy_event(pos)) {
857 			/* Set up dummy event. */
858 			if (perf_evlist__add_dummy(evlist))
859 				return -ENOMEM;
860 			pos = evlist__last(evlist);
861 			perf_evlist__set_tracking_event(evlist, pos);
862 		}
863 
864 		/*
865 		 * Enable the dummy event when the process is forked for
866 		 * initial_delay, immediately for system wide.
867 		 */
868 		if (opts->initial_delay && !pos->immediate)
869 			pos->core.attr.enable_on_exec = 1;
870 		else
871 			pos->immediate = 1;
872 	}
873 
874 	perf_evlist__config(evlist, opts, &callchain_param);
875 
876 	evlist__for_each_entry(evlist, pos) {
877 try_again:
878 		if (evsel__open(pos, pos->core.cpus, pos->core.threads) < 0) {
879 			if (evsel__fallback(pos, errno, msg, sizeof(msg))) {
880 				if (verbose > 0)
881 					ui__warning("%s\n", msg);
882 				goto try_again;
883 			}
884 			if ((errno == EINVAL || errno == EBADF) &&
885 			    pos->leader != pos &&
886 			    pos->weak_group) {
887 				pos = perf_evlist__reset_weak_group(evlist, pos, true);
888 				goto try_again;
889 			}
890 			rc = -errno;
891 			evsel__open_strerror(pos, &opts->target, errno, msg, sizeof(msg));
892 			ui__error("%s\n", msg);
893 			goto out;
894 		}
895 
896 		pos->supported = true;
897 	}
898 
899 	if (symbol_conf.kptr_restrict && !perf_evlist__exclude_kernel(evlist)) {
900 		pr_warning(
901 "WARNING: Kernel address maps (/proc/{kallsyms,modules}) are restricted,\n"
902 "check /proc/sys/kernel/kptr_restrict and /proc/sys/kernel/perf_event_paranoid.\n\n"
903 "Samples in kernel functions may not be resolved if a suitable vmlinux\n"
904 "file is not found in the buildid cache or in the vmlinux path.\n\n"
905 "Samples in kernel modules won't be resolved at all.\n\n"
906 "If some relocation was applied (e.g. kexec) symbols may be misresolved\n"
907 "even with a suitable vmlinux or kallsyms file.\n\n");
908 	}
909 
910 	if (perf_evlist__apply_filters(evlist, &pos)) {
911 		pr_err("failed to set filter \"%s\" on event %s with %d (%s)\n",
912 			pos->filter, evsel__name(pos), errno,
913 			str_error_r(errno, msg, sizeof(msg)));
914 		rc = -1;
915 		goto out;
916 	}
917 
918 	rc = record__mmap(rec);
919 	if (rc)
920 		goto out;
921 
922 	session->evlist = evlist;
923 	perf_session__set_id_hdr_size(session);
924 out:
925 	return rc;
926 }
927 
928 static int process_sample_event(struct perf_tool *tool,
929 				union perf_event *event,
930 				struct perf_sample *sample,
931 				struct evsel *evsel,
932 				struct machine *machine)
933 {
934 	struct record *rec = container_of(tool, struct record, tool);
935 
936 	if (rec->evlist->first_sample_time == 0)
937 		rec->evlist->first_sample_time = sample->time;
938 
939 	rec->evlist->last_sample_time = sample->time;
940 
941 	if (rec->buildid_all)
942 		return 0;
943 
944 	rec->samples++;
945 	return build_id__mark_dso_hit(tool, event, sample, evsel, machine);
946 }
947 
948 static int process_buildids(struct record *rec)
949 {
950 	struct perf_session *session = rec->session;
951 
952 	if (perf_data__size(&rec->data) == 0)
953 		return 0;
954 
955 	/*
956 	 * During this process, it'll load the kernel map and replace
957 	 * dso->long_name with the real pathname it found.  In this case
958 	 * we prefer a vmlinux path like
959 	 *   /lib/modules/3.16.4/build/vmlinux
960 	 *
961 	 * rather than the build-id path (in the debug directory).
962 	 *   $HOME/.debug/.build-id/f0/6e17aa50adf4d00b88925e03775de107611551
963 	 */
964 	symbol_conf.ignore_vmlinux_buildid = true;
965 
966 	/*
967 	 * If --buildid-all is given, it marks all DSOs regardless of hits,
968 	 * so there is no need to process samples. But if timestamp_boundary is
969 	 * enabled, it still needs to walk all samples to get the timestamps of
970 	 * the first/last samples.
971 	 */
972 	if (rec->buildid_all && !rec->timestamp_boundary)
973 		rec->tool.sample = NULL;
974 
975 	return perf_session__process_events(session);
976 }
977 
978 static void perf_event__synthesize_guest_os(struct machine *machine, void *data)
979 {
980 	int err;
981 	struct perf_tool *tool = data;
982 	/*
983 	 * As for the guest kernel, when processing the record & report
984 	 * subcommands, we arrange the module mmaps prior to the guest kernel
985 	 * mmap and trigger a dso preload, because default guest module symbols
986 	 * are loaded from guest kallsyms instead of /lib/modules/XXX/XXX. This
987 	 * method is used to avoid missing symbols when the first address is
988 	 * in a module instead of in the guest kernel.
989 	 */
990 	err = perf_event__synthesize_modules(tool, process_synthesized_event,
991 					     machine);
992 	if (err < 0)
993 		pr_err("Couldn't record guest kernel [%d]'s reference"
994 		       " relocation symbol.\n", machine->pid);
995 
996 	/*
997 	 * We use _stext for the guest kernel because the guest kernel's
998 	 * /proc/kallsyms sometimes has no _text.
999 	 */
1000 	err = perf_event__synthesize_kernel_mmap(tool, process_synthesized_event,
1001 						 machine);
1002 	if (err < 0)
1003 		pr_err("Couldn't record guest kernel [%d]'s reference"
1004 		       " relocation symbol.\n", machine->pid);
1005 }
1006 
1007 static struct perf_event_header finished_round_event = {
1008 	.size = sizeof(struct perf_event_header),
1009 	.type = PERF_RECORD_FINISHED_ROUND,
1010 };
1011 
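/*
 * With affinity modes other than the default, migrate the recording thread
 * onto the CPUs backing the mmap buffer that is about to be flushed.
 */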
1012 static void record__adjust_affinity(struct record *rec, struct mmap *map)
1013 {
1014 	if (rec->opts.affinity != PERF_AFFINITY_SYS &&
1015 	    !bitmap_equal(rec->affinity_mask.bits, map->affinity_mask.bits,
1016 			  rec->affinity_mask.nbits)) {
1017 		bitmap_zero(rec->affinity_mask.bits, rec->affinity_mask.nbits);
1018 		bitmap_or(rec->affinity_mask.bits, rec->affinity_mask.bits,
1019 			  map->affinity_mask.bits, rec->affinity_mask.nbits);
1020 		sched_setaffinity(0, MMAP_CPU_MASK_BYTES(&rec->affinity_mask),
1021 				  (cpu_set_t *)rec->affinity_mask.bits);
1022 		if (verbose == 2)
1023 			mmap_cpu_mask__scnprintf(&rec->affinity_mask, "thread");
1024 	}
1025 }
1026 
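/*
 * Callback used by the zstd stream compressor to lay out
 * PERF_RECORD_COMPRESSED headers: initialize a new record header when
 * called with increment == 0, grow it as compressed payload is appended.
 */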
1027 static size_t process_comp_header(void *record, size_t increment)
1028 {
1029 	struct perf_record_compressed *event = record;
1030 	size_t size = sizeof(*event);
1031 
1032 	if (increment) {
1033 		event->header.size += increment;
1034 		return increment;
1035 	}
1036 
1037 	event->header.type = PERF_RECORD_COMPRESSED;
1038 	event->header.size = size;
1039 
1040 	return size;
1041 }
1042 
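/*
 * Compress @src into @dst as one or more PERF_RECORD_COMPRESSED records and
 * update the session's transferred/compressed byte counters.
 */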
1043 static size_t zstd_compress(struct perf_session *session, void *dst, size_t dst_size,
1044 			    void *src, size_t src_size)
1045 {
1046 	size_t compressed;
1047 	size_t max_record_size = PERF_SAMPLE_MAX_SIZE - sizeof(struct perf_record_compressed) - 1;
1048 
1049 	compressed = zstd_compress_stream_to_records(&session->zstd_data, dst, dst_size, src, src_size,
1050 						     max_record_size, process_comp_header);
1051 
1052 	session->bytes_transferred += src_size;
1053 	session->bytes_compressed  += compressed;
1054 
1055 	return compressed;
1056 }
1057 
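/*
 * Flush every mmap'ed ring buffer of @evlist (regular or overwritable) to
 * the output, through aio when enabled, and emit a PERF_RECORD_FINISHED_ROUND
 * marker if anything was written.
 */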
1058 static int record__mmap_read_evlist(struct record *rec, struct evlist *evlist,
1059 				    bool overwrite, bool synch)
1060 {
1061 	u64 bytes_written = rec->bytes_written;
1062 	int i;
1063 	int rc = 0;
1064 	struct mmap *maps;
1065 	int trace_fd = rec->data.file.fd;
1066 	off_t off = 0;
1067 
1068 	if (!evlist)
1069 		return 0;
1070 
1071 	maps = overwrite ? evlist->overwrite_mmap : evlist->mmap;
1072 	if (!maps)
1073 		return 0;
1074 
1075 	if (overwrite && evlist->bkw_mmap_state != BKW_MMAP_DATA_PENDING)
1076 		return 0;
1077 
1078 	if (record__aio_enabled(rec))
1079 		off = record__aio_get_pos(trace_fd);
1080 
1081 	for (i = 0; i < evlist->core.nr_mmaps; i++) {
1082 		u64 flush = 0;
1083 		struct mmap *map = &maps[i];
1084 
1085 		if (map->core.base) {
1086 			record__adjust_affinity(rec, map);
1087 			if (synch) {
1088 				flush = map->core.flush;
1089 				map->core.flush = 1;
1090 			}
1091 			if (!record__aio_enabled(rec)) {
1092 				if (perf_mmap__push(map, rec, record__pushfn) < 0) {
1093 					if (synch)
1094 						map->core.flush = flush;
1095 					rc = -1;
1096 					goto out;
1097 				}
1098 			} else {
1099 				if (record__aio_push(rec, map, &off) < 0) {
1100 					record__aio_set_pos(trace_fd, off);
1101 					if (synch)
1102 						map->core.flush = flush;
1103 					rc = -1;
1104 					goto out;
1105 				}
1106 			}
1107 			if (synch)
1108 				map->core.flush = flush;
1109 		}
1110 
1111 		if (map->auxtrace_mmap.base && !rec->opts.auxtrace_snapshot_mode &&
1112 		    !rec->opts.auxtrace_sample_mode &&
1113 		    record__auxtrace_mmap_read(rec, map) != 0) {
1114 			rc = -1;
1115 			goto out;
1116 		}
1117 	}
1118 
1119 	if (record__aio_enabled(rec))
1120 		record__aio_set_pos(trace_fd, off);
1121 
1122 	/*
1123 	 * Mark the round finished in case we wrote
1124 	 * at least one event.
1125 	 */
1126 	if (bytes_written != rec->bytes_written)
1127 		rc = record__write(rec, NULL, &finished_round_event, sizeof(finished_round_event));
1128 
1129 	if (overwrite)
1130 		perf_evlist__toggle_bkw_mmap(evlist, BKW_MMAP_EMPTY);
1131 out:
1132 	return rc;
1133 }
1134 
1135 static int record__mmap_read_all(struct record *rec, bool synch)
1136 {
1137 	int err;
1138 
1139 	err = record__mmap_read_evlist(rec, rec->evlist, false, synch);
1140 	if (err)
1141 		return err;
1142 
1143 	return record__mmap_read_evlist(rec, rec->evlist, true, synch);
1144 }
1145 
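/*
 * Start with all header features enabled, then clear the ones that do not
 * apply to this session (build ids, tracepoints, branch stacks, ...).
 */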
1146 static void record__init_features(struct record *rec)
1147 {
1148 	struct perf_session *session = rec->session;
1149 	int feat;
1150 
1151 	for (feat = HEADER_FIRST_FEATURE; feat < HEADER_LAST_FEATURE; feat++)
1152 		perf_header__set_feat(&session->header, feat);
1153 
1154 	if (rec->no_buildid)
1155 		perf_header__clear_feat(&session->header, HEADER_BUILD_ID);
1156 
1157 	if (!have_tracepoints(&rec->evlist->core.entries))
1158 		perf_header__clear_feat(&session->header, HEADER_TRACING_DATA);
1159 
1160 	if (!rec->opts.branch_stack)
1161 		perf_header__clear_feat(&session->header, HEADER_BRANCH_STACK);
1162 
1163 	if (!rec->opts.full_auxtrace)
1164 		perf_header__clear_feat(&session->header, HEADER_AUXTRACE);
1165 
1166 	if (!(rec->opts.use_clockid && rec->opts.clockid_res_ns))
1167 		perf_header__clear_feat(&session->header, HEADER_CLOCKID);
1168 
1169 	perf_header__clear_feat(&session->header, HEADER_DIR_FORMAT);
1170 	if (!record__comp_enabled(rec))
1171 		perf_header__clear_feat(&session->header, HEADER_COMPRESSED);
1172 
1173 	perf_header__clear_feat(&session->header, HEADER_STAT);
1174 }
1175 
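/*
 * Finalize a non-pipe output file: update the recorded data size, process
 * build-ids unless that was disabled, and rewrite the file header.
 */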
1176 static void
1177 record__finish_output(struct record *rec)
1178 {
1179 	struct perf_data *data = &rec->data;
1180 	int fd = perf_data__fd(data);
1181 
1182 	if (data->is_pipe)
1183 		return;
1184 
1185 	rec->session->header.data_size += rec->bytes_written;
1186 	data->file.size = lseek(perf_data__fd(data), 0, SEEK_CUR);
1187 
1188 	if (!rec->no_buildid) {
1189 		process_buildids(rec);
1190 
1191 		if (rec->buildid_all)
1192 			dsos__hit_all(rec->session);
1193 	}
1194 	perf_session__write_header(rec->session, rec->evlist, fd, true);
1195 
1196 	return;
1197 }
1198 
1199 static int record__synthesize_workload(struct record *rec, bool tail)
1200 {
1201 	int err;
1202 	struct perf_thread_map *thread_map;
1203 
1204 	if (rec->opts.tail_synthesize != tail)
1205 		return 0;
1206 
1207 	thread_map = thread_map__new_by_tid(rec->evlist->workload.pid);
1208 	if (thread_map == NULL)
1209 		return -1;
1210 
1211 	err = perf_event__synthesize_thread_map(&rec->tool, thread_map,
1212 						 process_synthesized_event,
1213 						 &rec->session->machines.host,
1214 						 rec->opts.sample_address);
1215 	perf_thread_map__put(thread_map);
1216 	return err;
1217 }
1218 
1219 static int record__synthesize(struct record *rec, bool tail);
1220 
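/*
 * Rotate the output: finish the current perf.data, open a new timestamped
 * file and re-synthesize the tracking events needed to make the new file
 * self-contained.  When a limit on the number of kept files is configured,
 * the oldest file is removed.
 */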
1221 static int
1222 record__switch_output(struct record *rec, bool at_exit)
1223 {
1224 	struct perf_data *data = &rec->data;
1225 	int fd, err;
1226 	char *new_filename;
1227 
1228 	/* Same Size:      "2015122520103046"*/
1229 	char timestamp[] = "InvalidTimestamp";
1230 
1231 	record__aio_mmap_read_sync(rec);
1232 
1233 	record__synthesize(rec, true);
1234 	if (target__none(&rec->opts.target))
1235 		record__synthesize_workload(rec, true);
1236 
1237 	rec->samples = 0;
1238 	record__finish_output(rec);
1239 	err = fetch_current_timestamp(timestamp, sizeof(timestamp));
1240 	if (err) {
1241 		pr_err("Failed to get current timestamp\n");
1242 		return -EINVAL;
1243 	}
1244 
1245 	fd = perf_data__switch(data, timestamp,
1246 				    rec->session->header.data_offset,
1247 				    at_exit, &new_filename);
1248 	if (fd >= 0 && !at_exit) {
1249 		rec->bytes_written = 0;
1250 		rec->session->header.data_size = 0;
1251 	}
1252 
1253 	if (!quiet)
1254 		fprintf(stderr, "[ perf record: Dump %s.%s ]\n",
1255 			data->path, timestamp);
1256 
1257 	if (rec->switch_output.num_files) {
1258 		int n = rec->switch_output.cur_file + 1;
1259 
1260 		if (n >= rec->switch_output.num_files)
1261 			n = 0;
1262 		rec->switch_output.cur_file = n;
1263 		if (rec->switch_output.filenames[n]) {
1264 			remove(rec->switch_output.filenames[n]);
1265 			zfree(&rec->switch_output.filenames[n]);
1266 		}
1267 		rec->switch_output.filenames[n] = new_filename;
1268 	} else {
1269 		free(new_filename);
1270 	}
1271 
1272 	/* Output tracking events */
1273 	if (!at_exit) {
1274 		record__synthesize(rec, false);
1275 
1276 		/*
1277 		 * In 'perf record --switch-output' without -a,
1278 		 * record__synthesize() in record__switch_output() won't
1279 		 * generate tracking events because there's no thread_map
1280 		 * in the evlist, which causes the newly created perf.data
1281 		 * to lack map and comm information.
1282 		 * Create a fake thread_map and directly call
1283 		 * perf_event__synthesize_thread_map() for those events.
1284 		 */
1285 		if (target__none(&rec->opts.target))
1286 			record__synthesize_workload(rec, false);
1287 	}
1288 	return fd;
1289 }
1290 
1291 static volatile int workload_exec_errno;
1292 
1293 /*
1294  * perf_evlist__prepare_workload will send a SIGUSR1
1295  * if the fork fails, since we asked for it by setting its
1296  * want_signal to true.
1297  */
1298 static void workload_exec_failed_signal(int signo __maybe_unused,
1299 					siginfo_t *info,
1300 					void *ucontext __maybe_unused)
1301 {
1302 	workload_exec_errno = info->si_value.sival_int;
1303 	done = 1;
1304 	child_finished = 1;
1305 }
1306 
1307 static void snapshot_sig_handler(int sig);
1308 static void alarm_sig_handler(int sig);
1309 
1310 static const struct perf_event_mmap_page *
1311 perf_evlist__pick_pc(struct evlist *evlist)
1312 {
1313 	if (evlist) {
1314 		if (evlist->mmap && evlist->mmap[0].core.base)
1315 			return evlist->mmap[0].core.base;
1316 		if (evlist->overwrite_mmap && evlist->overwrite_mmap[0].core.base)
1317 			return evlist->overwrite_mmap[0].core.base;
1318 	}
1319 	return NULL;
1320 }
1321 
1322 static const struct perf_event_mmap_page *record__pick_pc(struct record *rec)
1323 {
1324 	const struct perf_event_mmap_page *pc;
1325 
1326 	pc = perf_evlist__pick_pc(rec->evlist);
1327 	if (pc)
1328 		return pc;
1329 	return NULL;
1330 }
1331 
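/*
 * Synthesize the side-band events (attrs, features, kernel and module maps,
 * time conversion, threads, bpf, cgroups, ...) that 'perf report' needs to
 * make sense of the samples, either when recording starts or, for tail
 * synthesis, when it ends.
 */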
1332 static int record__synthesize(struct record *rec, bool tail)
1333 {
1334 	struct perf_session *session = rec->session;
1335 	struct machine *machine = &session->machines.host;
1336 	struct perf_data *data = &rec->data;
1337 	struct record_opts *opts = &rec->opts;
1338 	struct perf_tool *tool = &rec->tool;
1339 	int fd = perf_data__fd(data);
1340 	int err = 0;
1341 	event_op f = process_synthesized_event;
1342 
1343 	if (rec->opts.tail_synthesize != tail)
1344 		return 0;
1345 
1346 	if (data->is_pipe) {
1347 		/*
1348 		 * We need to synthesize events first, because some
1349 		 * features work on top of them (on the report side).
1350 		 */
1351 		err = perf_event__synthesize_attrs(tool, rec->evlist,
1352 						   process_synthesized_event);
1353 		if (err < 0) {
1354 			pr_err("Couldn't synthesize attrs.\n");
1355 			goto out;
1356 		}
1357 
1358 		err = perf_event__synthesize_features(tool, session, rec->evlist,
1359 						      process_synthesized_event);
1360 		if (err < 0) {
1361 			pr_err("Couldn't synthesize features.\n");
1362 			return err;
1363 		}
1364 
1365 		if (have_tracepoints(&rec->evlist->core.entries)) {
1366 			/*
1367 			 * FIXME err <= 0 here actually means that
1368 			 * there were no tracepoints, so it's not really
1369 			 * an error, just that we don't need to
1370 			 * synthesize anything.  We really should
1371 			 * report this more properly and also
1372 			 * propagate the errors that currently call die()
1373 			 */
1374 			err = perf_event__synthesize_tracing_data(tool, fd, rec->evlist,
1375 								  process_synthesized_event);
1376 			if (err <= 0) {
1377 				pr_err("Couldn't record tracing data.\n");
1378 				goto out;
1379 			}
1380 			rec->bytes_written += err;
1381 		}
1382 	}
1383 
1384 	err = perf_event__synth_time_conv(record__pick_pc(rec), tool,
1385 					  process_synthesized_event, machine);
1386 	if (err)
1387 		goto out;
1388 
1389 	/* Synthesize id_index before auxtrace_info */
1390 	if (rec->opts.auxtrace_sample_mode) {
1391 		err = perf_event__synthesize_id_index(tool,
1392 						      process_synthesized_event,
1393 						      session->evlist, machine);
1394 		if (err)
1395 			goto out;
1396 	}
1397 
1398 	if (rec->opts.full_auxtrace) {
1399 		err = perf_event__synthesize_auxtrace_info(rec->itr, tool,
1400 					session, process_synthesized_event);
1401 		if (err)
1402 			goto out;
1403 	}
1404 
1405 	if (!perf_evlist__exclude_kernel(rec->evlist)) {
1406 		err = perf_event__synthesize_kernel_mmap(tool, process_synthesized_event,
1407 							 machine);
1408 		WARN_ONCE(err < 0, "Couldn't record kernel reference relocation symbol\n"
1409 				   "Symbol resolution may be skewed if relocation was used (e.g. kexec).\n"
1410 				   "Check /proc/kallsyms permission or run as root.\n");
1411 
1412 		err = perf_event__synthesize_modules(tool, process_synthesized_event,
1413 						     machine);
1414 		WARN_ONCE(err < 0, "Couldn't record kernel module information.\n"
1415 				   "Symbol resolution may be skewed if relocation was used (e.g. kexec).\n"
1416 				   "Check /proc/modules permission or run as root.\n");
1417 	}
1418 
1419 	if (perf_guest) {
1420 		machines__process_guests(&session->machines,
1421 					 perf_event__synthesize_guest_os, tool);
1422 	}
1423 
1424 	err = perf_event__synthesize_extra_attr(&rec->tool,
1425 						rec->evlist,
1426 						process_synthesized_event,
1427 						data->is_pipe);
1428 	if (err)
1429 		goto out;
1430 
1431 	err = perf_event__synthesize_thread_map2(&rec->tool, rec->evlist->core.threads,
1432 						 process_synthesized_event,
1433 						NULL);
1434 	if (err < 0) {
1435 		pr_err("Couldn't synthesize thread map.\n");
1436 		return err;
1437 	}
1438 
1439 	err = perf_event__synthesize_cpu_map(&rec->tool, rec->evlist->core.cpus,
1440 					     process_synthesized_event, NULL);
1441 	if (err < 0) {
1442 		pr_err("Couldn't synthesize cpu map.\n");
1443 		return err;
1444 	}
1445 
1446 	err = perf_event__synthesize_bpf_events(session, process_synthesized_event,
1447 						machine, opts);
1448 	if (err < 0)
1449 		pr_warning("Couldn't synthesize bpf events.\n");
1450 
1451 	err = perf_event__synthesize_cgroups(tool, process_synthesized_event,
1452 					     machine);
1453 	if (err < 0)
1454 		pr_warning("Couldn't synthesize cgroup events.\n");
1455 
1456 	if (rec->opts.nr_threads_synthesize > 1) {
1457 		perf_set_multithreaded();
1458 		f = process_locked_synthesized_event;
1459 	}
1460 
1461 	err = __machine__synthesize_threads(machine, tool, &opts->target, rec->evlist->core.threads,
1462 					    f, opts->sample_address,
1463 					    rec->opts.nr_threads_synthesize);
1464 
1465 	if (rec->opts.nr_threads_synthesize > 1)
1466 		perf_set_singlethreaded();
1467 
1468 out:
1469 	return err;
1470 }
1471 
1472 static int record__process_signal_event(union perf_event *event __maybe_unused, void *data)
1473 {
1474 	struct record *rec = data;
1475 	pthread_kill(rec->thread_id, SIGUSR2);
1476 	return 0;
1477 }
1478 
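/*
 * Set up the side-band evlist: hook --switch-output-event events up to a
 * SIGUSR2 callback and, unless disabled, add the BPF side-band event so
 * that BPF programs loaded during the session remain annotatable.
 */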
1479 static int record__setup_sb_evlist(struct record *rec)
1480 {
1481 	struct record_opts *opts = &rec->opts;
1482 
1483 	if (rec->sb_evlist != NULL) {
1484 		/*
1485 		 * We get here if --switch-output-event populated the
1486 		 * sb_evlist, so associate a callback that will send a SIGUSR2
1487 		 * to the main thread.
1488 		 */
1489 		evlist__set_cb(rec->sb_evlist, record__process_signal_event, rec);
1490 		rec->thread_id = pthread_self();
1491 	}
1492 
1493 	if (!opts->no_bpf_event) {
1494 		if (rec->sb_evlist == NULL) {
1495 			rec->sb_evlist = evlist__new();
1496 
1497 			if (rec->sb_evlist == NULL) {
1498 				pr_err("Couldn't create side band evlist.\n");
1499 				return -1;
1500 			}
1501 		}
1502 
1503 		if (evlist__add_bpf_sb_event(rec->sb_evlist, &rec->session->header.env)) {
1504 			pr_err("Couldn't ask for PERF_RECORD_BPF_EVENT side band events.\n");
1505 			return -1;
1506 		}
1507 	}
1508 
1509 	if (perf_evlist__start_sb_thread(rec->sb_evlist, &rec->opts.target)) {
1510 		pr_debug("Couldn't start the BPF side band thread:\nBPF programs starting from now on won't be annotatable\n");
1511 		opts->no_bpf_event = true;
1512 	}
1513 
1514 	return 0;
1515 }
1516 
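/*
 * The main record loop: open the events, write the header, synthesize the
 * initial side-band data, then keep flushing the ring buffers until the
 * workload exits or recording is interrupted, finalizing the output at the
 * end.
 */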
1517 static int __cmd_record(struct record *rec, int argc, const char **argv)
1518 {
1519 	int err;
1520 	int status = 0;
1521 	unsigned long waking = 0;
1522 	const bool forks = argc > 0;
1523 	struct perf_tool *tool = &rec->tool;
1524 	struct record_opts *opts = &rec->opts;
1525 	struct perf_data *data = &rec->data;
1526 	struct perf_session *session;
1527 	bool disabled = false, draining = false;
1528 	int fd;
1529 	float ratio = 0;
1530 
1531 	atexit(record__sig_exit);
1532 	signal(SIGCHLD, sig_handler);
1533 	signal(SIGINT, sig_handler);
1534 	signal(SIGTERM, sig_handler);
1535 	signal(SIGSEGV, sigsegv_handler);
1536 
1537 	if (rec->opts.record_namespaces)
1538 		tool->namespace_events = true;
1539 
1540 	if (rec->opts.record_cgroup) {
1541 #ifdef HAVE_FILE_HANDLE
1542 		tool->cgroup_events = true;
1543 #else
1544 		pr_err("cgroup tracking is not supported\n");
1545 		return -1;
1546 #endif
1547 	}
1548 
1549 	if (rec->opts.auxtrace_snapshot_mode || rec->switch_output.enabled) {
1550 		signal(SIGUSR2, snapshot_sig_handler);
1551 		if (rec->opts.auxtrace_snapshot_mode)
1552 			trigger_on(&auxtrace_snapshot_trigger);
1553 		if (rec->switch_output.enabled)
1554 			trigger_on(&switch_output_trigger);
1555 	} else {
1556 		signal(SIGUSR2, SIG_IGN);
1557 	}
1558 
1559 	session = perf_session__new(data, false, tool);
1560 	if (IS_ERR(session)) {
1561 		pr_err("Perf session creation failed.\n");
1562 		return PTR_ERR(session);
1563 	}
1564 
1565 	fd = perf_data__fd(data);
1566 	rec->session = session;
1567 
1568 	if (zstd_init(&session->zstd_data, rec->opts.comp_level) < 0) {
1569 		pr_err("Compression initialization failed.\n");
1570 		return -1;
1571 	}
1572 #ifdef HAVE_EVENTFD_SUPPORT
1573 	done_fd = eventfd(0, EFD_NONBLOCK);
1574 	if (done_fd < 0) {
1575 		pr_err("Failed to create wakeup eventfd, error: %m\n");
1576 		status = -1;
1577 		goto out_delete_session;
1578 	}
1579 	err = evlist__add_pollfd(rec->evlist, done_fd);
1580 	if (err < 0) {
1581 		pr_err("Failed to add wakeup eventfd to poll list\n");
1582 		status = err;
1583 		goto out_delete_session;
1584 	}
1585 #endif // HAVE_EVENTFD_SUPPORT
1586 
1587 	session->header.env.comp_type  = PERF_COMP_ZSTD;
1588 	session->header.env.comp_level = rec->opts.comp_level;
1589 
1590 	if (rec->opts.kcore &&
1591 	    !record__kcore_readable(&session->machines.host)) {
1592 		pr_err("ERROR: kcore is not readable.\n");
1593 		return -1;
1594 	}
1595 
1596 	record__init_features(rec);
1597 
1598 	if (rec->opts.use_clockid && rec->opts.clockid_res_ns)
1599 		session->header.env.clockid_res_ns = rec->opts.clockid_res_ns;
1600 
1601 	if (forks) {
1602 		err = perf_evlist__prepare_workload(rec->evlist, &opts->target,
1603 						    argv, data->is_pipe,
1604 						    workload_exec_failed_signal);
1605 		if (err < 0) {
1606 			pr_err("Couldn't run the workload!\n");
1607 			status = err;
1608 			goto out_delete_session;
1609 		}
1610 	}
1611 
1612 	/*
1613 	 * If we have just a single event and are sending data
1614 	 * through a pipe, we need to force the id allocation,
1615 	 * because we synthesize the event name through the pipe
1616 	 * and need the id for that.
1617 	 */
1618 	if (data->is_pipe && rec->evlist->core.nr_entries == 1)
1619 		rec->opts.sample_id = true;
1620 
1621 	if (record__open(rec) != 0) {
1622 		err = -1;
1623 		goto out_child;
1624 	}
1625 	session->header.env.comp_mmap_len = session->evlist->core.mmap_len;
1626 
1627 	if (rec->opts.kcore) {
1628 		err = record__kcore_copy(&session->machines.host, data);
1629 		if (err) {
1630 			pr_err("ERROR: Failed to copy kcore\n");
1631 			goto out_child;
1632 		}
1633 	}
1634 
1635 	err = bpf__apply_obj_config();
1636 	if (err) {
1637 		char errbuf[BUFSIZ];
1638 
1639 		bpf__strerror_apply_obj_config(err, errbuf, sizeof(errbuf));
1640 		pr_err("ERROR: Apply config to BPF failed: %s\n",
1641 			 errbuf);
1642 		goto out_child;
1643 	}
1644 
1645 	/*
1646 	 * Normally perf_session__new would do this, but it doesn't have the
1647 	 * evlist.
1648 	 */
1649 	if (rec->tool.ordered_events && !perf_evlist__sample_id_all(rec->evlist)) {
1650 		pr_warning("WARNING: No sample_id_all support, falling back to unordered processing\n");
1651 		rec->tool.ordered_events = false;
1652 	}
1653 
1654 	if (!rec->evlist->nr_groups)
1655 		perf_header__clear_feat(&session->header, HEADER_GROUP_DESC);
1656 
1657 	if (data->is_pipe) {
1658 		err = perf_header__write_pipe(fd);
1659 		if (err < 0)
1660 			goto out_child;
1661 	} else {
1662 		err = perf_session__write_header(session, rec->evlist, fd, false);
1663 		if (err < 0)
1664 			goto out_child;
1665 	}
1666 
1667 	err = -1;
1668 	if (!rec->no_buildid
1669 	    && !perf_header__has_feat(&session->header, HEADER_BUILD_ID)) {
1670 		pr_err("Couldn't generate buildids. "
1671 		       "Use --no-buildid to profile anyway.\n");
1672 		goto out_child;
1673 	}
1674 
1675 	err = record__setup_sb_evlist(rec);
1676 	if (err)
1677 		goto out_child;
1678 
1679 	err = record__synthesize(rec, false);
1680 	if (err < 0)
1681 		goto out_child;
1682 
1683 	if (rec->realtime_prio) {
1684 		struct sched_param param;
1685 
1686 		param.sched_priority = rec->realtime_prio;
1687 		if (sched_setscheduler(0, SCHED_FIFO, &param)) {
1688 			pr_err("Could not set realtime priority.\n");
1689 			err = -1;
1690 			goto out_child;
1691 		}
1692 	}
1693 
1694 	/*
1695 	 * When perf is starting the traced process, all the events
1696 	 * (apart from group members) have enable_on_exec=1 set,
1697 	 * so don't spoil it by prematurely enabling them.
1698 	 */
1699 	if (!target__none(&opts->target) && !opts->initial_delay)
1700 		evlist__enable(rec->evlist);
1701 
1702 	/*
1703 	 * Let the child rip
1704 	 */
1705 	if (forks) {
1706 		struct machine *machine = &session->machines.host;
1707 		union perf_event *event;
1708 		pid_t tgid;
1709 
1710 		event = malloc(sizeof(event->comm) + machine->id_hdr_size);
1711 		if (event == NULL) {
1712 			err = -ENOMEM;
1713 			goto out_child;
1714 		}
1715 
1716 		/*
1717 		 * Some H/W events are generated before the COMM event,
1718 		 * which is emitted during exec(), so perf script
1719 		 * cannot see the correct process name for those events.
1720 		 * Synthesize a COMM event to prevent that.
1721 		 */
1722 		tgid = perf_event__synthesize_comm(tool, event,
1723 						   rec->evlist->workload.pid,
1724 						   process_synthesized_event,
1725 						   machine);
1726 		free(event);
1727 
1728 		if (tgid == -1)
1729 			goto out_child;
1730 
1731 		event = malloc(sizeof(event->namespaces) +
1732 			       (NR_NAMESPACES * sizeof(struct perf_ns_link_info)) +
1733 			       machine->id_hdr_size);
1734 		if (event == NULL) {
1735 			err = -ENOMEM;
1736 			goto out_child;
1737 		}
1738 
1739 		/*
1740 		 * Synthesize NAMESPACES event for the command specified.
1741 		 */
1742 		perf_event__synthesize_namespaces(tool, event,
1743 						  rec->evlist->workload.pid,
1744 						  tgid, process_synthesized_event,
1745 						  machine);
1746 		free(event);
1747 
1748 		perf_evlist__start_workload(rec->evlist);
1749 	}
1750 
1751 	if (opts->initial_delay) {
1752 		usleep(opts->initial_delay * USEC_PER_MSEC);
1753 		evlist__enable(rec->evlist);
1754 	}
1755 
1756 	trigger_ready(&auxtrace_snapshot_trigger);
1757 	trigger_ready(&switch_output_trigger);
1758 	perf_hooks__invoke_record_start();
1759 	for (;;) {
1760 		unsigned long long hits = rec->samples;
1761 
1762 		/*
1763 		 * It is possible for rec->evlist->bkw_mmap_state to be
1764 		 * BKW_MMAP_EMPTY here: when done == true and
1765 		 * hits != rec->samples in the previous round.
1766 		 *
1767 		 * perf_evlist__toggle_bkw_mmap() ensures we never
1768 		 * convert BKW_MMAP_EMPTY to BKW_MMAP_DATA_PENDING.
1769 		 */
1770 		if (trigger_is_hit(&switch_output_trigger) || done || draining)
1771 			perf_evlist__toggle_bkw_mmap(rec->evlist, BKW_MMAP_DATA_PENDING);
1772 
1773 		if (record__mmap_read_all(rec, false) < 0) {
1774 			trigger_error(&auxtrace_snapshot_trigger);
1775 			trigger_error(&switch_output_trigger);
1776 			err = -1;
1777 			goto out_child;
1778 		}
1779 
1780 		if (auxtrace_record__snapshot_started) {
1781 			auxtrace_record__snapshot_started = 0;
1782 			if (!trigger_is_error(&auxtrace_snapshot_trigger))
1783 				record__read_auxtrace_snapshot(rec, false);
1784 			if (trigger_is_error(&auxtrace_snapshot_trigger)) {
1785 				pr_err("AUX area tracing snapshot failed\n");
1786 				err = -1;
1787 				goto out_child;
1788 			}
1789 		}
1790 
1791 		if (trigger_is_hit(&switch_output_trigger)) {
1792 			/*
1793 			 * If switch_output_trigger is hit, the data in the
1794 			 * overwritable ring buffer should have been collected,
1795 			 * so bkw_mmap_state should be set to BKW_MMAP_EMPTY.
1796 			 *
1797 			 * If SIGUSR2 was raised after or during record__mmap_read_all(),
1798 			 * it didn't collect data from the
1799 			 * overwritable ring buffer. Read again.
1800 			 */
1801 			if (rec->evlist->bkw_mmap_state == BKW_MMAP_RUNNING)
1802 				continue;
1803 			trigger_ready(&switch_output_trigger);
1804 
1805 			/*
1806 			 * Reenable events in the overwrite ring buffer after
1807 			 * record__mmap_read_all(): we should have collected
1808 			 * data from it.
1809 			 */
1810 			perf_evlist__toggle_bkw_mmap(rec->evlist, BKW_MMAP_RUNNING);
1811 
1812 			if (!quiet)
1813 				fprintf(stderr, "[ perf record: dump data: Woken up %ld times ]\n",
1814 					waking);
1815 			waking = 0;
1816 			fd = record__switch_output(rec, false);
1817 			if (fd < 0) {
1818 				pr_err("Failed to switch to new file\n");
1819 				trigger_error(&switch_output_trigger);
1820 				err = fd;
1821 				goto out_child;
1822 			}
1823 
1824 			/* re-arm the alarm */
1825 			if (rec->switch_output.time)
1826 				alarm(rec->switch_output.time);
1827 		}
1828 
1829 		if (hits == rec->samples) {
1830 			if (done || draining)
1831 				break;
1832 			err = evlist__poll(rec->evlist, -1);
1833 			/*
1834 			 * Propagate error, only if there's any. Ignore positive
1835 			 * number of returned events and interrupt error.
1836 			 */
1837 			if (err > 0 || (err < 0 && errno == EINTR))
1838 				err = 0;
1839 			waking++;
1840 
1841 			if (evlist__filter_pollfd(rec->evlist, POLLERR | POLLHUP) == 0)
1842 				draining = true;
1843 		}
1844 
1845 		/*
1846 		 * When perf is starting the traced process, the events die with
1847 		 * the process at the end and we wait for that. Thus there is no
1848 		 * need to disable events in this case.
1849 		 */
1850 		if (done && !disabled && !target__none(&opts->target)) {
1851 			trigger_off(&auxtrace_snapshot_trigger);
1852 			evlist__disable(rec->evlist);
1853 			disabled = true;
1854 		}
1855 	}
1856 
1857 	trigger_off(&auxtrace_snapshot_trigger);
1858 	trigger_off(&switch_output_trigger);
1859 
1860 	if (opts->auxtrace_snapshot_on_exit)
1861 		record__auxtrace_snapshot_exit(rec);
1862 
1863 	if (forks && workload_exec_errno) {
1864 		char msg[STRERR_BUFSIZE];
1865 		const char *emsg = str_error_r(workload_exec_errno, msg, sizeof(msg));
1866 		pr_err("Workload failed: %s\n", emsg);
1867 		err = -1;
1868 		goto out_child;
1869 	}
1870 
1871 	if (!quiet)
1872 		fprintf(stderr, "[ perf record: Woken up %ld times to write data ]\n", waking);
1873 
1874 	if (target__none(&rec->opts.target))
1875 		record__synthesize_workload(rec, true);
1876 
1877 out_child:
1878 	record__mmap_read_all(rec, true);
1879 	record__aio_mmap_read_sync(rec);
1880 
1881 	if (rec->session->bytes_transferred && rec->session->bytes_compressed) {
1882 		ratio = (float)rec->session->bytes_transferred/(float)rec->session->bytes_compressed;
1883 		session->header.env.comp_ratio = ratio + 0.5;
1884 	}
1885 
1886 	if (forks) {
1887 		int exit_status;
1888 
1889 		if (!child_finished)
1890 			kill(rec->evlist->workload.pid, SIGTERM);
1891 
1892 		wait(&exit_status);
1893 
1894 		if (err < 0)
1895 			status = err;
1896 		else if (WIFEXITED(exit_status))
1897 			status = WEXITSTATUS(exit_status);
1898 		else if (WIFSIGNALED(exit_status))
1899 			signr = WTERMSIG(exit_status);
1900 	} else
1901 		status = err;
1902 
1903 	record__synthesize(rec, true);
1904 	/* this will be recalculated during process_buildids() */
1905 	rec->samples = 0;
1906 
1907 	if (!err) {
1908 		if (!rec->timestamp_filename) {
1909 			record__finish_output(rec);
1910 		} else {
1911 			fd = record__switch_output(rec, true);
1912 			if (fd < 0) {
1913 				status = fd;
1914 				goto out_delete_session;
1915 			}
1916 		}
1917 	}
1918 
1919 	perf_hooks__invoke_record_end();
1920 
1921 	if (!err && !quiet) {
1922 		char samples[128];
1923 		const char *postfix = rec->timestamp_filename ?
1924 					".<timestamp>" : "";
1925 
1926 		if (rec->samples && !rec->opts.full_auxtrace)
1927 			scnprintf(samples, sizeof(samples),
1928 				  " (%" PRIu64 " samples)", rec->samples);
1929 		else
1930 			samples[0] = '\0';
1931 
1932 		fprintf(stderr,	"[ perf record: Captured and wrote %.3f MB %s%s%s",
1933 			perf_data__size(data) / 1024.0 / 1024.0,
1934 			data->path, postfix, samples);
1935 		if (ratio) {
1936 			fprintf(stderr,	", compressed (original %.3f MB, ratio is %.3f)",
1937 					rec->session->bytes_transferred / 1024.0 / 1024.0,
1938 					ratio);
1939 		}
1940 		fprintf(stderr, " ]\n");
1941 	}
1942 
1943 out_delete_session:
1944 #ifdef HAVE_EVENTFD_SUPPORT
1945 	if (done_fd >= 0)
1946 		close(done_fd);
1947 #endif
1948 	zstd_fini(&session->zstd_data);
1949 	perf_session__delete(session);
1950 
1951 	if (!opts->no_bpf_event)
1952 		perf_evlist__stop_sb_thread(rec->sb_evlist);
1953 	return status;
1954 }
1955 
1956 static void callchain_debug(struct callchain_param *callchain)
1957 {
1958 	static const char *str[CALLCHAIN_MAX] = { "NONE", "FP", "DWARF", "LBR" };
1959 
1960 	pr_debug("callchain: type %s\n", str[callchain->record_mode]);
1961 
1962 	if (callchain->record_mode == CALLCHAIN_DWARF)
1963 		pr_debug("callchain: stack dump size %d\n",
1964 			 callchain->dump_size);
1965 }
1966 
1967 int record_opts__parse_callchain(struct record_opts *record,
1968 				 struct callchain_param *callchain,
1969 				 const char *arg, bool unset)
1970 {
1971 	int ret;
1972 	callchain->enabled = !unset;
1973 
1974 	/* --no-call-graph */
1975 	if (unset) {
1976 		callchain->record_mode = CALLCHAIN_NONE;
1977 		pr_debug("callchain: disabled\n");
1978 		return 0;
1979 	}
1980 
1981 	ret = parse_callchain_record_opt(arg, callchain);
1982 	if (!ret) {
1983 		/* Enable data address sampling for DWARF unwind. */
1984 		if (callchain->record_mode == CALLCHAIN_DWARF)
1985 			record->sample_address = true;
1986 		callchain_debug(callchain);
1987 	}
1988 
1989 	return ret;
1990 }
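
/*
 * Illustrative --call-graph arguments accepted by the parser above
 * (a sketch of common forms, not an exhaustive list):
 *
 *	--call-graph fp			frame pointer based unwinding
 *	--call-graph dwarf,8192		DWARF unwinding with an 8kB stack dump
 *	--call-graph lbr		last branch record assisted call chains
 */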
1991 
1992 int record_parse_callchain_opt(const struct option *opt,
1993 			       const char *arg,
1994 			       int unset)
1995 {
1996 	return record_opts__parse_callchain(opt->value, &callchain_param, arg, unset);
1997 }
1998 
1999 int record_callchain_opt(const struct option *opt,
2000 			 const char *arg __maybe_unused,
2001 			 int unset __maybe_unused)
2002 {
2003 	struct callchain_param *callchain = opt->value;
2004 
2005 	callchain->enabled = true;
2006 
2007 	if (callchain->record_mode == CALLCHAIN_NONE)
2008 		callchain->record_mode = CALLCHAIN_FP;
2009 
2010 	callchain_debug(callchain);
2011 	return 0;
2012 }
2013 
2014 static int perf_record_config(const char *var, const char *value, void *cb)
2015 {
2016 	struct record *rec = cb;
2017 
2018 	if (!strcmp(var, "record.build-id")) {
2019 		if (!strcmp(value, "cache"))
2020 			rec->no_buildid_cache = false;
2021 		else if (!strcmp(value, "no-cache"))
2022 			rec->no_buildid_cache = true;
2023 		else if (!strcmp(value, "skip"))
2024 			rec->no_buildid = true;
2025 		else
2026 			return -1;
2027 		return 0;
2028 	}
2029 	if (!strcmp(var, "record.call-graph")) {
2030 		var = "call-graph.record-mode";
2031 		return perf_default_config(var, value, cb);
2032 	}
2033 #ifdef HAVE_AIO_SUPPORT
2034 	if (!strcmp(var, "record.aio")) {
2035 		rec->opts.nr_cblocks = strtol(value, NULL, 0);
2036 		if (!rec->opts.nr_cblocks)
2037 			rec->opts.nr_cblocks = nr_cblocks_default;
2038 	}
2039 #endif
2040 
2041 	return 0;
2042 }
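
/*
 * Illustrative ~/.perfconfig snippet handled by perf_record_config() above
 * (the "record.aio" key is only honoured when built with HAVE_AIO_SUPPORT):
 *
 *	[record]
 *		build-id = no-cache
 *		call-graph = dwarf
 *		aio = 2
 */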
2043 
2044 struct clockid_map {
2045 	const char *name;
2046 	int clockid;
2047 };
2048 
2049 #define CLOCKID_MAP(n, c)	\
2050 	{ .name = n, .clockid = (c), }
2051 
2052 #define CLOCKID_END	{ .name = NULL, }
2053 
2054 
2055 /*
2056  * Add the missing ones; we need to build on many distros...
2057  */
2058 #ifndef CLOCK_MONOTONIC_RAW
2059 #define CLOCK_MONOTONIC_RAW 4
2060 #endif
2061 #ifndef CLOCK_BOOTTIME
2062 #define CLOCK_BOOTTIME 7
2063 #endif
2064 #ifndef CLOCK_TAI
2065 #define CLOCK_TAI 11
2066 #endif
2067 
2068 static const struct clockid_map clockids[] = {
2069 	/* available for all events, NMI safe */
2070 	CLOCKID_MAP("monotonic", CLOCK_MONOTONIC),
2071 	CLOCKID_MAP("monotonic_raw", CLOCK_MONOTONIC_RAW),
2072 
2073 	/* available for some events */
2074 	CLOCKID_MAP("realtime", CLOCK_REALTIME),
2075 	CLOCKID_MAP("boottime", CLOCK_BOOTTIME),
2076 	CLOCKID_MAP("tai", CLOCK_TAI),
2077 
2078 	/* available for the lazy */
2079 	CLOCKID_MAP("mono", CLOCK_MONOTONIC),
2080 	CLOCKID_MAP("raw", CLOCK_MONOTONIC_RAW),
2081 	CLOCKID_MAP("real", CLOCK_REALTIME),
2082 	CLOCKID_MAP("boot", CLOCK_BOOTTIME),
2083 
2084 	CLOCKID_END,
2085 };
2086 
2087 static int get_clockid_res(clockid_t clk_id, u64 *res_ns)
2088 {
2089 	struct timespec res;
2090 
2091 	*res_ns = 0;
2092 	if (!clock_getres(clk_id, &res))
2093 		*res_ns = res.tv_nsec + res.tv_sec * NSEC_PER_SEC;
2094 	else
2095 		pr_warning("WARNING: Failed to determine specified clock resolution.\n");
2096 
2097 	return 0;
2098 }
2099 
2100 static int parse_clockid(const struct option *opt, const char *str, int unset)
2101 {
2102 	struct record_opts *opts = (struct record_opts *)opt->value;
2103 	const struct clockid_map *cm;
2104 	const char *ostr = str;
2105 
2106 	if (unset) {
2107 		opts->use_clockid = 0;
2108 		return 0;
2109 	}
2110 
2111 	/* no arg passed */
2112 	if (!str)
2113 		return 0;
2114 
2115 	/* no setting it twice */
2116 	if (opts->use_clockid)
2117 		return -1;
2118 
2119 	opts->use_clockid = true;
2120 
2121 	/* if it's a number, we're done */
2122 	if (sscanf(str, "%d", &opts->clockid) == 1)
2123 		return get_clockid_res(opts->clockid, &opts->clockid_res_ns);
2124 
2125 	/* allow a "CLOCK_" prefix to the name */
2126 	if (!strncasecmp(str, "CLOCK_", 6))
2127 		str += 6;
2128 
2129 	for (cm = clockids; cm->name; cm++) {
2130 		if (!strcasecmp(str, cm->name)) {
2131 			opts->clockid = cm->clockid;
2132 			return get_clockid_res(opts->clockid,
2133 					       &opts->clockid_res_ns);
2134 		}
2135 	}
2136 
2137 	opts->use_clockid = false;
2138 	ui__warning("unknown clockid %s, check man page\n", ostr);
2139 	return -1;
2140 }
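
/*
 * Illustrative -k/--clockid values accepted by parse_clockid() above:
 *
 *	perf record -k mono ...			alias from the clockids[] table
 *	perf record -k CLOCK_MONOTONIC_RAW ...	the "CLOCK_" prefix is stripped
 *	perf record -k 4 ...			raw clockid number
 */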
2141 
2142 static int record__parse_affinity(const struct option *opt, const char *str, int unset)
2143 {
2144 	struct record_opts *opts = (struct record_opts *)opt->value;
2145 
2146 	if (unset || !str)
2147 		return 0;
2148 
2149 	if (!strcasecmp(str, "node"))
2150 		opts->affinity = PERF_AFFINITY_NODE;
2151 	else if (!strcasecmp(str, "cpu"))
2152 		opts->affinity = PERF_AFFINITY_CPU;
2153 
2154 	return 0;
2155 }
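
/*
 * Illustrative --affinity values handled above; anything else (or no
 * argument) leaves the default PERF_AFFINITY_SYS in place:
 *
 *	--affinity=node		follow the NUMA node of the processed mmap buffer
 *	--affinity=cpu		follow the CPU of the processed mmap buffer
 */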
2156 
2157 static int parse_output_max_size(const struct option *opt,
2158 				 const char *str, int unset)
2159 {
2160 	unsigned long *s = (unsigned long *)opt->value;
2161 	static struct parse_tag tags_size[] = {
2162 		{ .tag  = 'B', .mult = 1       },
2163 		{ .tag  = 'K', .mult = 1 << 10 },
2164 		{ .tag  = 'M', .mult = 1 << 20 },
2165 		{ .tag  = 'G', .mult = 1 << 30 },
2166 		{ .tag  = 0 },
2167 	};
2168 	unsigned long val;
2169 
2170 	if (unset) {
2171 		*s = 0;
2172 		return 0;
2173 	}
2174 
2175 	val = parse_tag_value(str, tags_size);
2176 	if (val != (unsigned long) -1) {
2177 		*s = val;
2178 		return 0;
2179 	}
2180 
2181 	return -1;
2182 }
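
/*
 * Illustrative --max-size values accepted by the parser above
 * (a B/K/M/G suffix selects the unit):
 *
 *	--max-size=200M
 *	--max-size=2G
 */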
2183 
2184 static int record__parse_mmap_pages(const struct option *opt,
2185 				    const char *str,
2186 				    int unset __maybe_unused)
2187 {
2188 	struct record_opts *opts = opt->value;
2189 	char *s, *p;
2190 	unsigned int mmap_pages;
2191 	int ret;
2192 
2193 	if (!str)
2194 		return -EINVAL;
2195 
2196 	s = strdup(str);
2197 	if (!s)
2198 		return -ENOMEM;
2199 
2200 	p = strchr(s, ',');
2201 	if (p)
2202 		*p = '\0';
2203 
2204 	if (*s) {
2205 		ret = __perf_evlist__parse_mmap_pages(&mmap_pages, s);
2206 		if (ret)
2207 			goto out_free;
2208 		opts->mmap_pages = mmap_pages;
2209 	}
2210 
2211 	if (!p) {
2212 		ret = 0;
2213 		goto out_free;
2214 	}
2215 
2216 	ret = __perf_evlist__parse_mmap_pages(&mmap_pages, p + 1);
2217 	if (ret)
2218 		goto out_free;
2219 
2220 	opts->auxtrace_mmap_pages = mmap_pages;
2221 
2222 out_free:
2223 	free(s);
2224 	return ret;
2225 }
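
/*
 * Illustrative -m/--mmap-pages values handled above; the optional part
 * after the comma sizes the AUX area tracing mmap:
 *
 *	-m 512		512 data mmap pages
 *	-m 512,64	512 data pages plus 64 AUX area tracing pages
 *	-m ,128		default data size, 128 AUX area tracing pages
 */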
2226 
2227 static void switch_output_size_warn(struct record *rec)
2228 {
2229 	u64 wakeup_size = evlist__mmap_size(rec->opts.mmap_pages);
2230 	struct switch_output *s = &rec->switch_output;
2231 
2232 	wakeup_size /= 2;
2233 
2234 	if (s->size < wakeup_size) {
2235 		char buf[100];
2236 
2237 		unit_number__scnprintf(buf, sizeof(buf), wakeup_size);
2238 		pr_warning("WARNING: switch-output data size is lower than "
2239 			   "the wakeup kernel buffer size (%s); "
2240 			   "expect bigger perf.data sizes\n", buf);
2241 	}
2242 }
2243 
2244 static int switch_output_setup(struct record *rec)
2245 {
2246 	struct switch_output *s = &rec->switch_output;
2247 	static struct parse_tag tags_size[] = {
2248 		{ .tag  = 'B', .mult = 1       },
2249 		{ .tag  = 'K', .mult = 1 << 10 },
2250 		{ .tag  = 'M', .mult = 1 << 20 },
2251 		{ .tag  = 'G', .mult = 1 << 30 },
2252 		{ .tag  = 0 },
2253 	};
2254 	static struct parse_tag tags_time[] = {
2255 		{ .tag  = 's', .mult = 1        },
2256 		{ .tag  = 'm', .mult = 60       },
2257 		{ .tag  = 'h', .mult = 60*60    },
2258 		{ .tag  = 'd', .mult = 60*60*24 },
2259 		{ .tag  = 0 },
2260 	};
2261 	unsigned long val;
2262 
2263 	/*
2264 	 * If we're using --switch-output-events, then we imply
2265 	 * --switch-output=signal, as we'll send a SIGUSR2 from the side band
2266 	 * thread to its parent.
2267 	 */
2268 	if (rec->switch_output_event_set)
2269 		goto do_signal;
2270 
2271 	if (!s->set)
2272 		return 0;
2273 
2274 	if (!strcmp(s->str, "signal")) {
2275 do_signal:
2276 		s->signal = true;
2277 		pr_debug("switch-output with SIGUSR2 signal\n");
2278 		goto enabled;
2279 	}
2280 
2281 	val = parse_tag_value(s->str, tags_size);
2282 	if (val != (unsigned long) -1) {
2283 		s->size = val;
2284 		pr_debug("switch-output with %s size threshold\n", s->str);
2285 		goto enabled;
2286 	}
2287 
2288 	val = parse_tag_value(s->str, tags_time);
2289 	if (val != (unsigned long) -1) {
2290 		s->time = val;
2291 		pr_debug("switch-output with %s time threshold (%lu seconds)\n",
2292 			 s->str, s->time);
2293 		goto enabled;
2294 	}
2295 
2296 	return -1;
2297 
2298 enabled:
2299 	rec->timestamp_filename = true;
2300 	s->enabled              = true;
2301 
2302 	if (s->size && !rec->opts.no_buffering)
2303 		switch_output_size_warn(rec);
2304 
2305 	return 0;
2306 }
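
/*
 * Illustrative --switch-output values handled above:
 *
 *	--switch-output			rotate on SIGUSR2 (the "signal" default)
 *	--switch-output=100M		rotate once ~100MB of data were written
 *	--switch-output=30s		rotate every 30 seconds
 */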
2307 
2308 static const char * const __record_usage[] = {
2309 	"perf record [<options>] [<command>]",
2310 	"perf record [<options>] -- <command> [<options>]",
2311 	NULL
2312 };
2313 const char * const *record_usage = __record_usage;
2314 
2315 static int build_id__process_mmap(struct perf_tool *tool, union perf_event *event,
2316 				  struct perf_sample *sample, struct machine *machine)
2317 {
2318 	/*
2319 	 * We already have the kernel maps, put in place via perf_session__create_kernel_maps(),
2320 	 * so there is no need to add them twice.
2321 	 */
2322 	if (!(event->header.misc & PERF_RECORD_MISC_USER))
2323 		return 0;
2324 	return perf_event__process_mmap(tool, event, sample, machine);
2325 }
2326 
2327 static int build_id__process_mmap2(struct perf_tool *tool, union perf_event *event,
2328 				   struct perf_sample *sample, struct machine *machine)
2329 {
2330 	/*
2331 	 * We already have the kernel maps, put in place via perf_session__create_kernel_maps(),
2332 	 * so there is no need to add them twice.
2333 	 */
2334 	if (!(event->header.misc & PERF_RECORD_MISC_USER))
2335 		return 0;
2336 
2337 	return perf_event__process_mmap2(tool, event, sample, machine);
2338 }
2339 
2340 /*
2341  * XXX Ideally this would be local to cmd_record() and passed to a record__new,
2342  * because we need to have access to it in record__exit, which is called
2343  * after cmd_record() exits, but since record_options needs to be accessible to
2344  * builtin-script, leave it here.
2345  *
2346  * At least we don't touch it directly in all the other functions here.
2347  *
2348  * Just say no to tons of global variables, sigh.
2349  */
2350 static struct record record = {
2351 	.opts = {
2352 		.sample_time	     = true,
2353 		.mmap_pages	     = UINT_MAX,
2354 		.user_freq	     = UINT_MAX,
2355 		.user_interval	     = ULLONG_MAX,
2356 		.freq		     = 4000,
2357 		.target		     = {
2358 			.uses_mmap   = true,
2359 			.default_per_cpu = true,
2360 		},
2361 		.mmap_flush          = MMAP_FLUSH_DEFAULT,
2362 		.nr_threads_synthesize = 1,
2363 	},
2364 	.tool = {
2365 		.sample		= process_sample_event,
2366 		.fork		= perf_event__process_fork,
2367 		.exit		= perf_event__process_exit,
2368 		.comm		= perf_event__process_comm,
2369 		.namespaces	= perf_event__process_namespaces,
2370 		.mmap		= build_id__process_mmap,
2371 		.mmap2		= build_id__process_mmap2,
2372 		.ordered_events	= true,
2373 	},
2374 };
2375 
2376 const char record_callchain_help[] = CALLCHAIN_RECORD_HELP
2377 	"\n\t\t\t\tDefault: fp";
2378 
2379 static bool dry_run;
2380 
2381 /*
2382  * XXX Will stay a global variable till we fix builtin-script.c to stop messing
2383  * with it and switch to using the library functions in perf_evlist that came
2384  * from builtin-record.c, i.e. use record_opts,
2385  * perf_evlist__prepare_workload, etc. instead of fork+exec'ing 'perf record',
2386  * using pipes, etc.
2387  */
2388 static struct option __record_options[] = {
2389 	OPT_CALLBACK('e', "event", &record.evlist, "event",
2390 		     "event selector. use 'perf list' to list available events",
2391 		     parse_events_option),
2392 	OPT_CALLBACK(0, "filter", &record.evlist, "filter",
2393 		     "event filter", parse_filter),
2394 	OPT_CALLBACK_NOOPT(0, "exclude-perf", &record.evlist,
2395 			   NULL, "don't record events from perf itself",
2396 			   exclude_perf),
2397 	OPT_STRING('p', "pid", &record.opts.target.pid, "pid",
2398 		    "record events on existing process id"),
2399 	OPT_STRING('t', "tid", &record.opts.target.tid, "tid",
2400 		    "record events on existing thread id"),
2401 	OPT_INTEGER('r', "realtime", &record.realtime_prio,
2402 		    "collect data with this RT SCHED_FIFO priority"),
2403 	OPT_BOOLEAN(0, "no-buffering", &record.opts.no_buffering,
2404 		    "collect data without buffering"),
2405 	OPT_BOOLEAN('R', "raw-samples", &record.opts.raw_samples,
2406 		    "collect raw sample records from all opened counters"),
2407 	OPT_BOOLEAN('a', "all-cpus", &record.opts.target.system_wide,
2408 			    "system-wide collection from all CPUs"),
2409 	OPT_STRING('C', "cpu", &record.opts.target.cpu_list, "cpu",
2410 		    "list of cpus to monitor"),
2411 	OPT_U64('c', "count", &record.opts.user_interval, "event period to sample"),
2412 	OPT_STRING('o', "output", &record.data.path, "file",
2413 		    "output file name"),
2414 	OPT_BOOLEAN_SET('i', "no-inherit", &record.opts.no_inherit,
2415 			&record.opts.no_inherit_set,
2416 			"child tasks do not inherit counters"),
2417 	OPT_BOOLEAN(0, "tail-synthesize", &record.opts.tail_synthesize,
2418 		    "synthesize non-sample events at the end of output"),
2419 	OPT_BOOLEAN(0, "overwrite", &record.opts.overwrite, "use overwrite mode"),
2420 	OPT_BOOLEAN(0, "no-bpf-event", &record.opts.no_bpf_event, "do not record bpf events"),
2421 	OPT_BOOLEAN(0, "strict-freq", &record.opts.strict_freq,
2422 		    "Fail if the specified frequency can't be used"),
2423 	OPT_CALLBACK('F', "freq", &record.opts, "freq or 'max'",
2424 		     "profile at this frequency",
2425 		      record__parse_freq),
2426 	OPT_CALLBACK('m', "mmap-pages", &record.opts, "pages[,pages]",
2427 		     "number of mmap data pages and AUX area tracing mmap pages",
2428 		     record__parse_mmap_pages),
2429 	OPT_CALLBACK(0, "mmap-flush", &record.opts, "number",
2430 		     "Minimum number of bytes that is extracted from mmap data pages (default: 1)",
2431 		     record__mmap_flush_parse),
2432 	OPT_BOOLEAN(0, "group", &record.opts.group,
2433 		    "put the counters into a counter group"),
2434 	OPT_CALLBACK_NOOPT('g', NULL, &callchain_param,
2435 			   NULL, "enables call-graph recording" ,
2436 			   &record_callchain_opt),
2437 	OPT_CALLBACK(0, "call-graph", &record.opts,
2438 		     "record_mode[,record_size]", record_callchain_help,
2439 		     &record_parse_callchain_opt),
2440 	OPT_INCR('v', "verbose", &verbose,
2441 		    "be more verbose (show counter open errors, etc)"),
2442 	OPT_BOOLEAN('q', "quiet", &quiet, "don't print any message"),
2443 	OPT_BOOLEAN('s', "stat", &record.opts.inherit_stat,
2444 		    "per thread counts"),
2445 	OPT_BOOLEAN('d', "data", &record.opts.sample_address, "Record the sample addresses"),
2446 	OPT_BOOLEAN(0, "phys-data", &record.opts.sample_phys_addr,
2447 		    "Record the sample physical addresses"),
2448 	OPT_BOOLEAN(0, "sample-cpu", &record.opts.sample_cpu, "Record the sample cpu"),
2449 	OPT_BOOLEAN_SET('T', "timestamp", &record.opts.sample_time,
2450 			&record.opts.sample_time_set,
2451 			"Record the sample timestamps"),
2452 	OPT_BOOLEAN_SET('P', "period", &record.opts.period, &record.opts.period_set,
2453 			"Record the sample period"),
2454 	OPT_BOOLEAN('n', "no-samples", &record.opts.no_samples,
2455 		    "don't sample"),
2456 	OPT_BOOLEAN_SET('N', "no-buildid-cache", &record.no_buildid_cache,
2457 			&record.no_buildid_cache_set,
2458 			"do not update the buildid cache"),
2459 	OPT_BOOLEAN_SET('B', "no-buildid", &record.no_buildid,
2460 			&record.no_buildid_set,
2461 			"do not collect buildids in perf.data"),
2462 	OPT_CALLBACK('G', "cgroup", &record.evlist, "name",
2463 		     "monitor event in cgroup name only",
2464 		     parse_cgroups),
2465 	OPT_UINTEGER('D', "delay", &record.opts.initial_delay,
2466 		  "ms to wait before starting measurement after program start"),
2467 	OPT_BOOLEAN(0, "kcore", &record.opts.kcore, "copy /proc/kcore"),
2468 	OPT_STRING('u', "uid", &record.opts.target.uid_str, "user",
2469 		   "user to profile"),
2470 
2471 	OPT_CALLBACK_NOOPT('b', "branch-any", &record.opts.branch_stack,
2472 		     "branch any", "sample any taken branches",
2473 		     parse_branch_stack),
2474 
2475 	OPT_CALLBACK('j', "branch-filter", &record.opts.branch_stack,
2476 		     "branch filter mask", "branch stack filter modes",
2477 		     parse_branch_stack),
2478 	OPT_BOOLEAN('W', "weight", &record.opts.sample_weight,
2479 		    "sample by weight (on special events only)"),
2480 	OPT_BOOLEAN(0, "transaction", &record.opts.sample_transaction,
2481 		    "sample transaction flags (special events only)"),
2482 	OPT_BOOLEAN(0, "per-thread", &record.opts.target.per_thread,
2483 		    "use per-thread mmaps"),
2484 	OPT_CALLBACK_OPTARG('I', "intr-regs", &record.opts.sample_intr_regs, NULL, "any register",
2485 		    "sample selected machine registers on interrupt,"
2486 		    " use '-I?' to list register names", parse_intr_regs),
2487 	OPT_CALLBACK_OPTARG(0, "user-regs", &record.opts.sample_user_regs, NULL, "any register",
2488 		    "sample selected machine registers on interrupt,"
2489 		    " use '--user-regs=?' to list register names", parse_user_regs),
2490 	OPT_BOOLEAN(0, "running-time", &record.opts.running_time,
2491 		    "Record running/enabled time of read (:S) events"),
2492 	OPT_CALLBACK('k', "clockid", &record.opts,
2493 	"clockid", "clockid to use for events, see clock_gettime()",
2494 	parse_clockid),
2495 	OPT_STRING_OPTARG('S', "snapshot", &record.opts.auxtrace_snapshot_opts,
2496 			  "opts", "AUX area tracing Snapshot Mode", ""),
2497 	OPT_STRING_OPTARG(0, "aux-sample", &record.opts.auxtrace_sample_opts,
2498 			  "opts", "sample AUX area", ""),
2499 	OPT_UINTEGER(0, "proc-map-timeout", &proc_map_timeout,
2500 			"per thread proc mmap processing timeout in ms"),
2501 	OPT_BOOLEAN(0, "namespaces", &record.opts.record_namespaces,
2502 		    "Record namespaces events"),
2503 	OPT_BOOLEAN(0, "all-cgroups", &record.opts.record_cgroup,
2504 		    "Record cgroup events"),
2505 	OPT_BOOLEAN_SET(0, "switch-events", &record.opts.record_switch_events,
2506 			&record.opts.record_switch_events_set,
2507 			"Record context switch events"),
2508 	OPT_BOOLEAN_FLAG(0, "all-kernel", &record.opts.all_kernel,
2509 			 "Configure all used events to run in kernel space.",
2510 			 PARSE_OPT_EXCLUSIVE),
2511 	OPT_BOOLEAN_FLAG(0, "all-user", &record.opts.all_user,
2512 			 "Configure all used events to run in user space.",
2513 			 PARSE_OPT_EXCLUSIVE),
2514 	OPT_BOOLEAN(0, "kernel-callchains", &record.opts.kernel_callchains,
2515 		    "collect kernel callchains"),
2516 	OPT_BOOLEAN(0, "user-callchains", &record.opts.user_callchains,
2517 		    "collect user callchains"),
2518 	OPT_STRING(0, "clang-path", &llvm_param.clang_path, "clang path",
2519 		   "clang binary to use for compiling BPF scriptlets"),
2520 	OPT_STRING(0, "clang-opt", &llvm_param.clang_opt, "clang options",
2521 		   "options passed to clang when compiling BPF scriptlets"),
2522 	OPT_STRING(0, "vmlinux", &symbol_conf.vmlinux_name,
2523 		   "file", "vmlinux pathname"),
2524 	OPT_BOOLEAN(0, "buildid-all", &record.buildid_all,
2525 		    "Record build-id of all DSOs regardless of hits"),
2526 	OPT_BOOLEAN(0, "timestamp-filename", &record.timestamp_filename,
2527 		    "append timestamp to output filename"),
2528 	OPT_BOOLEAN(0, "timestamp-boundary", &record.timestamp_boundary,
2529 		    "Record timestamp boundary (time of first/last samples)"),
2530 	OPT_STRING_OPTARG_SET(0, "switch-output", &record.switch_output.str,
2531 			  &record.switch_output.set, "signal or size[BKMG] or time[smhd]",
2532 			  "Switch output when receiving SIGUSR2 (signal) or when crossing a size or time threshold",
2533 			  "signal"),
2534 	OPT_CALLBACK_SET(0, "switch-output-event", &record.sb_evlist, &record.switch_output_event_set, "switch output event",
2535 			 "switch output event selector. use 'perf list' to list available events",
2536 			 parse_events_option_new_evlist),
2537 	OPT_INTEGER(0, "switch-max-files", &record.switch_output.num_files,
2538 		   "Limit number of switch output generated files"),
2539 	OPT_BOOLEAN(0, "dry-run", &dry_run,
2540 		    "Parse options then exit"),
2541 #ifdef HAVE_AIO_SUPPORT
2542 	OPT_CALLBACK_OPTARG(0, "aio", &record.opts,
2543 		     &nr_cblocks_default, "n", "Use <n> control blocks in asynchronous trace writing mode (default: 1, max: 4)",
2544 		     record__aio_parse),
2545 #endif
2546 	OPT_CALLBACK(0, "affinity", &record.opts, "node|cpu",
2547 		     "Set affinity mask of trace reading thread to NUMA node cpu mask or cpu of processed mmap buffer",
2548 		     record__parse_affinity),
2549 #ifdef HAVE_ZSTD_SUPPORT
2550 	OPT_CALLBACK_OPTARG('z', "compression-level", &record.opts, &comp_level_default,
2551 			    "n", "Compress records using the specified level (default: 1 - fastest compression, 22 - greatest compression)",
2552 			    record__parse_comp_level),
2553 #endif
2554 	OPT_CALLBACK(0, "max-size", &record.output_max_size,
2555 		     "size", "Limit the maximum size of the output file", parse_output_max_size),
2556 	OPT_UINTEGER(0, "num-thread-synthesize",
2557 		     &record.opts.nr_threads_synthesize,
2558 		     "number of threads to run for event synthesis"),
2559 #ifdef HAVE_LIBPFM
2560 	OPT_CALLBACK(0, "pfm-events", &record.evlist, "event",
2561 		"libpfm4 event selector. use 'perf list' to list available events",
2562 		parse_libpfm_events_option),
2563 #endif
2564 	OPT_END()
2565 };
2566 
2567 struct option *record_options = __record_options;
2568 
2569 int cmd_record(int argc, const char **argv)
2570 {
2571 	int err;
2572 	struct record *rec = &record;
2573 	char errbuf[BUFSIZ];
2574 
2575 	setlocale(LC_ALL, "");
2576 
2577 #ifndef HAVE_LIBBPF_SUPPORT
2578 # define set_nobuild(s, l, c) set_option_nobuild(record_options, s, l, "NO_LIBBPF=1", c)
2579 	set_nobuild('\0', "clang-path", true);
2580 	set_nobuild('\0', "clang-opt", true);
2581 # undef set_nobuild
2582 #endif
2583 
2584 #ifndef HAVE_BPF_PROLOGUE
2585 # if !defined (HAVE_DWARF_SUPPORT)
2586 #  define REASON  "NO_DWARF=1"
2587 # elif !defined (HAVE_LIBBPF_SUPPORT)
2588 #  define REASON  "NO_LIBBPF=1"
2589 # else
2590 #  define REASON  "this architecture doesn't support BPF prologue"
2591 # endif
2592 # define set_nobuild(s, l, c) set_option_nobuild(record_options, s, l, REASON, c)
2593 	set_nobuild('\0', "vmlinux", true);
2594 # undef set_nobuild
2595 # undef REASON
2596 #endif
2597 
2598 	rec->opts.affinity = PERF_AFFINITY_SYS;
2599 
2600 	rec->evlist = evlist__new();
2601 	if (rec->evlist == NULL)
2602 		return -ENOMEM;
2603 
2604 	err = perf_config(perf_record_config, rec);
2605 	if (err)
2606 		return err;
2607 
2608 	argc = parse_options(argc, argv, record_options, record_usage,
2609 			    PARSE_OPT_STOP_AT_NON_OPTION);
2610 	if (quiet)
2611 		perf_quiet_option();
2612 
2613 	/* Make system wide (-a) the default target. */
2614 	if (!argc && target__none(&rec->opts.target))
2615 		rec->opts.target.system_wide = true;
2616 
2617 	if (nr_cgroups && !rec->opts.target.system_wide) {
2618 		usage_with_options_msg(record_usage, record_options,
2619 			"cgroup monitoring only available in system-wide mode");
2620 
2621 	}
2622 
2623 	if (rec->opts.kcore)
2624 		rec->data.is_dir = true;
2625 
2626 	if (rec->opts.comp_level != 0) {
2627 		pr_debug("Compression enabled, disabling build id collection at the end of the session.\n");
2628 		rec->no_buildid = true;
2629 	}
2630 
2631 	if (rec->opts.record_switch_events &&
2632 	    !perf_can_record_switch_events()) {
2633 		ui__error("kernel does not support recording context switch events\n");
2634 		parse_options_usage(record_usage, record_options, "switch-events", 0);
2635 		return -EINVAL;
2636 	}
2637 
2638 	if (switch_output_setup(rec)) {
2639 		parse_options_usage(record_usage, record_options, "switch-output", 0);
2640 		return -EINVAL;
2641 	}
2642 
2643 	if (rec->switch_output.time) {
2644 		signal(SIGALRM, alarm_sig_handler);
2645 		alarm(rec->switch_output.time);
2646 	}
2647 
2648 	if (rec->switch_output.num_files) {
2649 		rec->switch_output.filenames = calloc(rec->switch_output.num_files,
2650 						      sizeof(char *));
2651 		if (!rec->switch_output.filenames)
2652 			return -EINVAL;
2653 	}
2654 
2655 	/*
2656 	 * Allow aliases to facilitate the lookup of symbols for address
2657 	 * filters. Refer to auxtrace_parse_filters().
2658 	 */
2659 	symbol_conf.allow_aliases = true;
2660 
2661 	symbol__init(NULL);
2662 
2663 	if (rec->opts.affinity != PERF_AFFINITY_SYS) {
2664 		rec->affinity_mask.nbits = cpu__max_cpu();
2665 		rec->affinity_mask.bits = bitmap_alloc(rec->affinity_mask.nbits);
2666 		if (!rec->affinity_mask.bits) {
2667 			pr_err("Failed to allocate thread mask for %zd cpus\n", rec->affinity_mask.nbits);
2668 			return -ENOMEM;
2669 		}
2670 		pr_debug2("thread mask[%zd]: empty\n", rec->affinity_mask.nbits);
2671 	}
2672 
2673 	err = record__auxtrace_init(rec);
2674 	if (err)
2675 		goto out;
2676 
2677 	if (dry_run)
2678 		goto out;
2679 
2680 	err = bpf__setup_stdout(rec->evlist);
2681 	if (err) {
2682 		bpf__strerror_setup_stdout(rec->evlist, err, errbuf, sizeof(errbuf));
2683 		pr_err("ERROR: Setup BPF stdout failed: %s\n",
2684 			 errbuf);
2685 		goto out;
2686 	}
2687 
2688 	err = -ENOMEM;
2689 
2690 	if (rec->no_buildid_cache || rec->no_buildid) {
2691 		disable_buildid_cache();
2692 	} else if (rec->switch_output.enabled) {
2693 		/*
2694 		 * In 'perf record --switch-output', disable buildid
2695 		 * generation by default to reduce data file switching
2696 		 * overhead. Still generate buildids if they are explicitly
2697 		 * required, using
2698 		 *
2699 		 *  perf record --switch-output --no-no-buildid \
2700 		 *              --no-no-buildid-cache
2701 		 *
2702 		 * The following code is equivalent to:
2703 		 *
2704 		 * if ((rec->no_buildid || !rec->no_buildid_set) &&
2705 		 *     (rec->no_buildid_cache || !rec->no_buildid_cache_set))
2706 		 *         disable_buildid_cache();
2707 		 */
2708 		bool disable = true;
2709 
2710 		if (rec->no_buildid_set && !rec->no_buildid)
2711 			disable = false;
2712 		if (rec->no_buildid_cache_set && !rec->no_buildid_cache)
2713 			disable = false;
2714 		if (disable) {
2715 			rec->no_buildid = true;
2716 			rec->no_buildid_cache = true;
2717 			disable_buildid_cache();
2718 		}
2719 	}
2720 
2721 	if (record.opts.overwrite)
2722 		record.opts.tail_synthesize = true;
2723 
2724 	if (rec->evlist->core.nr_entries == 0 &&
2725 	    __perf_evlist__add_default(rec->evlist, !record.opts.no_samples) < 0) {
2726 		pr_err("Not enough memory for event selector list\n");
2727 		goto out;
2728 	}
2729 
2730 	if (rec->opts.target.tid && !rec->opts.no_inherit_set)
2731 		rec->opts.no_inherit = true;
2732 
2733 	err = target__validate(&rec->opts.target);
2734 	if (err) {
2735 		target__strerror(&rec->opts.target, err, errbuf, BUFSIZ);
2736 		ui__warning("%s\n", errbuf);
2737 	}
2738 
2739 	err = target__parse_uid(&rec->opts.target);
2740 	if (err) {
2741 		int saved_errno = errno;
2742 
2743 		target__strerror(&rec->opts.target, err, errbuf, BUFSIZ);
2744 		ui__error("%s", errbuf);
2745 
2746 		err = -saved_errno;
2747 		goto out;
2748 	}
2749 
2750 	/* Enable ignoring missing threads when -u/-p option is defined. */
2751 	rec->opts.ignore_missing_thread = rec->opts.target.uid != UINT_MAX || rec->opts.target.pid;
2752 
2753 	err = -ENOMEM;
2754 	if (perf_evlist__create_maps(rec->evlist, &rec->opts.target) < 0)
2755 		usage_with_options(record_usage, record_options);
2756 
2757 	err = auxtrace_record__options(rec->itr, rec->evlist, &rec->opts);
2758 	if (err)
2759 		goto out;
2760 
2761 	/*
2762 	 * We take all buildids when the file contains
2763 	 * AUX area tracing data, because we do not decode the
2764 	 * trace, as that would take too long.
2765 	 */
2766 	if (rec->opts.full_auxtrace)
2767 		rec->buildid_all = true;
2768 
2769 	if (record_opts__config(&rec->opts)) {
2770 		err = -EINVAL;
2771 		goto out;
2772 	}
2773 
2774 	if (rec->opts.nr_cblocks > nr_cblocks_max)
2775 		rec->opts.nr_cblocks = nr_cblocks_max;
2776 	pr_debug("nr_cblocks: %d\n", rec->opts.nr_cblocks);
2777 
2778 	pr_debug("affinity: %s\n", affinity_tags[rec->opts.affinity]);
2779 	pr_debug("mmap flush: %d\n", rec->opts.mmap_flush);
2780 
2781 	if (rec->opts.comp_level > comp_level_max)
2782 		rec->opts.comp_level = comp_level_max;
2783 	pr_debug("comp level: %d\n", rec->opts.comp_level);
2784 
2785 	err = __cmd_record(&record, argc, argv);
2786 out:
2787 	bitmap_free(rec->affinity_mask.bits);
2788 	evlist__delete(rec->evlist);
2789 	symbol__exit();
2790 	auxtrace_record__free(rec->itr);
2791 	return err;
2792 }
2793 
2794 static void snapshot_sig_handler(int sig __maybe_unused)
2795 {
2796 	struct record *rec = &record;
2797 
2798 	if (trigger_is_ready(&auxtrace_snapshot_trigger)) {
2799 		trigger_hit(&auxtrace_snapshot_trigger);
2800 		auxtrace_record__snapshot_started = 1;
2801 		if (auxtrace_record__snapshot_start(record.itr))
2802 			trigger_error(&auxtrace_snapshot_trigger);
2803 	}
2804 
2805 	if (switch_output_signal(rec))
2806 		trigger_hit(&switch_output_trigger);
2807 }
2808 
2809 static void alarm_sig_handler(int sig __maybe_unused)
2810 {
2811 	struct record *rec = &record;
2812 
2813 	if (switch_output_time(rec))
2814 		trigger_hit(&switch_output_trigger);
2815 }
2816