xref: /openbmc/linux/tools/perf/builtin-record.c (revision 35267cea)
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * builtin-record.c
4  *
5  * Builtin record command: Record the profile of a workload
6  * (or a CPU, or a PID) into the perf.data output file - for
7  * later analysis via perf report.
8  */
9 #include "builtin.h"
10 
11 #include "util/build-id.h"
12 #include <subcmd/parse-options.h>
13 #include "util/parse-events.h"
14 #include "util/config.h"
15 
16 #include "util/callchain.h"
17 #include "util/cgroup.h"
18 #include "util/header.h"
19 #include "util/event.h"
20 #include "util/evlist.h"
21 #include "util/evsel.h"
22 #include "util/debug.h"
23 #include "util/mmap.h"
24 #include "util/target.h"
25 #include "util/session.h"
26 #include "util/tool.h"
27 #include "util/symbol.h"
28 #include "util/record.h"
29 #include "util/cpumap.h"
30 #include "util/thread_map.h"
31 #include "util/data.h"
32 #include "util/perf_regs.h"
33 #include "util/auxtrace.h"
34 #include "util/tsc.h"
35 #include "util/parse-branch-options.h"
36 #include "util/parse-regs-options.h"
37 #include "util/perf_api_probe.h"
38 #include "util/llvm-utils.h"
39 #include "util/bpf-loader.h"
40 #include "util/trigger.h"
41 #include "util/perf-hooks.h"
42 #include "util/cpu-set-sched.h"
43 #include "util/synthetic-events.h"
44 #include "util/time-utils.h"
45 #include "util/units.h"
46 #include "util/bpf-event.h"
47 #include "util/util.h"
48 #include "util/pfm.h"
49 #include "util/clockid.h"
50 #include "util/pmu-hybrid.h"
51 #include "util/evlist-hybrid.h"
52 #include "asm/bug.h"
53 #include "perf.h"
54 
55 #include <errno.h>
56 #include <inttypes.h>
57 #include <locale.h>
58 #include <poll.h>
59 #include <pthread.h>
60 #include <unistd.h>
61 #include <sched.h>
62 #include <signal.h>
63 #ifdef HAVE_EVENTFD_SUPPORT
64 #include <sys/eventfd.h>
65 #endif
66 #include <sys/mman.h>
67 #include <sys/wait.h>
68 #include <sys/types.h>
69 #include <sys/stat.h>
70 #include <fcntl.h>
71 #include <linux/err.h>
72 #include <linux/string.h>
73 #include <linux/time64.h>
74 #include <linux/zalloc.h>
75 #include <linux/bitmap.h>
76 #include <sys/time.h>
77 
78 struct switch_output {
79 	bool		 enabled;
80 	bool		 signal;
81 	unsigned long	 size;
82 	unsigned long	 time;
83 	const char	*str;
84 	bool		 set;
85 	char		 **filenames;
86 	int		 num_files;
87 	int		 cur_file;
88 };
89 
90 struct record {
91 	struct perf_tool	tool;
92 	struct record_opts	opts;
93 	u64			bytes_written;
94 	struct perf_data	data;
95 	struct auxtrace_record	*itr;
96 	struct evlist	*evlist;
97 	struct perf_session	*session;
98 	struct evlist		*sb_evlist;
99 	pthread_t		thread_id;
100 	int			realtime_prio;
101 	bool			switch_output_event_set;
102 	bool			no_buildid;
103 	bool			no_buildid_set;
104 	bool			no_buildid_cache;
105 	bool			no_buildid_cache_set;
106 	bool			buildid_all;
107 	bool			buildid_mmap;
108 	bool			timestamp_filename;
109 	bool			timestamp_boundary;
110 	struct switch_output	switch_output;
111 	unsigned long long	samples;
112 	struct mmap_cpu_mask	affinity_mask;
113 	unsigned long		output_max_size;	/* = 0: unlimited */
114 };
115 
116 static volatile int done;
117 
118 static volatile int auxtrace_record__snapshot_started;
119 static DEFINE_TRIGGER(auxtrace_snapshot_trigger);
120 static DEFINE_TRIGGER(switch_output_trigger);
121 
122 static const char *affinity_tags[PERF_AFFINITY_MAX] = {
123 	"SYS", "NODE", "CPU"
124 };
125 
126 static bool switch_output_signal(struct record *rec)
127 {
128 	return rec->switch_output.signal &&
129 	       trigger_is_ready(&switch_output_trigger);
130 }
131 
132 static bool switch_output_size(struct record *rec)
133 {
134 	return rec->switch_output.size &&
135 	       trigger_is_ready(&switch_output_trigger) &&
136 	       (rec->bytes_written >= rec->switch_output.size);
137 }
138 
139 static bool switch_output_time(struct record *rec)
140 {
141 	return rec->switch_output.time &&
142 	       trigger_is_ready(&switch_output_trigger);
143 }
144 
145 static bool record__output_max_size_exceeded(struct record *rec)
146 {
147 	return rec->output_max_size &&
148 	       (rec->bytes_written >= rec->output_max_size);
149 }
150 
151 static int record__write(struct record *rec, struct mmap *map __maybe_unused,
152 			 void *bf, size_t size)
153 {
154 	struct perf_data_file *file = &rec->session->data->file;
155 
156 	if (perf_data_file__write(file, bf, size) < 0) {
157 		pr_err("failed to write perf data, error: %m\n");
158 		return -1;
159 	}
160 
161 	rec->bytes_written += size;
162 
163 	if (record__output_max_size_exceeded(rec) && !done) {
164 		fprintf(stderr, "[ perf record: perf size limit reached (%" PRIu64 " KB),"
165 				" stopping session ]\n",
166 				rec->bytes_written >> 10);
167 		done = 1;
168 	}
169 
170 	if (switch_output_size(rec))
171 		trigger_hit(&switch_output_trigger);
172 
173 	return 0;
174 }
175 
176 static int record__aio_enabled(struct record *rec);
177 static int record__comp_enabled(struct record *rec);
178 static size_t zstd_compress(struct perf_session *session, void *dst, size_t dst_size,
179 			    void *src, size_t src_size);
180 
181 #ifdef HAVE_AIO_SUPPORT
182 static int record__aio_write(struct aiocb *cblock, int trace_fd,
183 		void *buf, size_t size, off_t off)
184 {
185 	int rc;
186 
187 	cblock->aio_fildes = trace_fd;
188 	cblock->aio_buf    = buf;
189 	cblock->aio_nbytes = size;
190 	cblock->aio_offset = off;
191 	cblock->aio_sigevent.sigev_notify = SIGEV_NONE;
192 
193 	do {
194 		rc = aio_write(cblock);
195 		if (rc == 0) {
196 			break;
197 		} else if (errno != EAGAIN) {
198 			cblock->aio_fildes = -1;
199 			pr_err("failed to queue perf data, error: %m\n");
200 			break;
201 		}
202 	} while (1);
203 
204 	return rc;
205 }
206 
207 static int record__aio_complete(struct mmap *md, struct aiocb *cblock)
208 {
209 	void *rem_buf;
210 	off_t rem_off;
211 	size_t rem_size;
212 	int rc, aio_errno;
213 	ssize_t aio_ret, written;
214 
215 	aio_errno = aio_error(cblock);
216 	if (aio_errno == EINPROGRESS)
217 		return 0;
218 
219 	written = aio_ret = aio_return(cblock);
220 	if (aio_ret < 0) {
221 		if (aio_errno != EINTR)
222 			pr_err("failed to write perf data, error: %m\n");
223 		written = 0;
224 	}
225 
226 	rem_size = cblock->aio_nbytes - written;
227 
228 	if (rem_size == 0) {
229 		cblock->aio_fildes = -1;
230 		/*
231 		 * md->refcount is incremented in record__aio_pushfn() for
232 		 * every aio write request started in record__aio_push() so
233 		 * decrement it because the request is now complete.
234 		 */
235 		perf_mmap__put(&md->core);
236 		rc = 1;
237 	} else {
238 		/*
239 		 * aio write request may require restart with the
240 		 * reminder if the kernel didn't write whole
241 		 * chunk at once.
242 		 */
243 		rem_off = cblock->aio_offset + written;
244 		rem_buf = (void *)(cblock->aio_buf + written);
245 		record__aio_write(cblock, cblock->aio_fildes,
246 				rem_buf, rem_size, rem_off);
247 		rc = 0;
248 	}
249 
250 	return rc;
251 }
252 
253 static int record__aio_sync(struct mmap *md, bool sync_all)
254 {
255 	struct aiocb **aiocb = md->aio.aiocb;
256 	struct aiocb *cblocks = md->aio.cblocks;
257 	struct timespec timeout = { 0, 1000 * 1000  * 1 }; /* 1ms */
258 	int i, do_suspend;
259 
260 	do {
261 		do_suspend = 0;
262 		for (i = 0; i < md->aio.nr_cblocks; ++i) {
263 			if (cblocks[i].aio_fildes == -1 || record__aio_complete(md, &cblocks[i])) {
264 				if (sync_all)
265 					aiocb[i] = NULL;
266 				else
267 					return i;
268 			} else {
269 				/*
270 				 * Started aio write is not complete yet
271 				 * so it has to be waited before the
272 				 * next allocation.
273 				 */
274 				aiocb[i] = &cblocks[i];
275 				do_suspend = 1;
276 			}
277 		}
278 		if (!do_suspend)
279 			return -1;
280 
281 		while (aio_suspend((const struct aiocb **)aiocb, md->aio.nr_cblocks, &timeout)) {
282 			if (!(errno == EAGAIN || errno == EINTR))
283 				pr_err("failed to sync perf data, error: %m\n");
284 		}
285 	} while (1);
286 }
287 
288 struct record_aio {
289 	struct record	*rec;
290 	void		*data;
291 	size_t		size;
292 };
293 
294 static int record__aio_pushfn(struct mmap *map, void *to, void *buf, size_t size)
295 {
296 	struct record_aio *aio = to;
297 
298 	/*
299 	 * map->core.base data pointed by buf is copied into free map->aio.data[] buffer
300 	 * to release space in the kernel buffer as fast as possible, calling
301 	 * perf_mmap__consume() from perf_mmap__push() function.
302 	 *
303 	 * That lets the kernel to proceed with storing more profiling data into
304 	 * the kernel buffer earlier than other per-cpu kernel buffers are handled.
305 	 *
306 	 * Coping can be done in two steps in case the chunk of profiling data
307 	 * crosses the upper bound of the kernel buffer. In this case we first move
308 	 * part of data from map->start till the upper bound and then the reminder
309 	 * from the beginning of the kernel buffer till the end of the data chunk.
310 	 */
311 
312 	if (record__comp_enabled(aio->rec)) {
313 		size = zstd_compress(aio->rec->session, aio->data + aio->size,
314 				     mmap__mmap_len(map) - aio->size,
315 				     buf, size);
316 	} else {
317 		memcpy(aio->data + aio->size, buf, size);
318 	}
319 
320 	if (!aio->size) {
321 		/*
322 		 * Increment map->refcount to guard map->aio.data[] buffer
323 		 * from premature deallocation because map object can be
324 		 * released earlier than aio write request started on
325 		 * map->aio.data[] buffer is complete.
326 		 *
327 		 * perf_mmap__put() is done at record__aio_complete()
328 		 * after started aio request completion or at record__aio_push()
329 		 * if the request failed to start.
330 		 */
331 		perf_mmap__get(&map->core);
332 	}
333 
334 	aio->size += size;
335 
336 	return size;
337 }
338 
339 static int record__aio_push(struct record *rec, struct mmap *map, off_t *off)
340 {
341 	int ret, idx;
342 	int trace_fd = rec->session->data->file.fd;
343 	struct record_aio aio = { .rec = rec, .size = 0 };
344 
345 	/*
346 	 * Call record__aio_sync() to wait till map->aio.data[] buffer
347 	 * becomes available after previous aio write operation.
348 	 */
349 
350 	idx = record__aio_sync(map, false);
351 	aio.data = map->aio.data[idx];
352 	ret = perf_mmap__push(map, &aio, record__aio_pushfn);
353 	if (ret != 0) /* ret > 0 - no data, ret < 0 - error */
354 		return ret;
355 
356 	rec->samples++;
357 	ret = record__aio_write(&(map->aio.cblocks[idx]), trace_fd, aio.data, aio.size, *off);
358 	if (!ret) {
359 		*off += aio.size;
360 		rec->bytes_written += aio.size;
361 		if (switch_output_size(rec))
362 			trigger_hit(&switch_output_trigger);
363 	} else {
364 		/*
365 		 * Decrement map->refcount incremented in record__aio_pushfn()
366 		 * back if record__aio_write() operation failed to start, otherwise
367 		 * map->refcount is decremented in record__aio_complete() after
368 		 * aio write operation finishes successfully.
369 		 */
370 		perf_mmap__put(&map->core);
371 	}
372 
373 	return ret;
374 }
375 
376 static off_t record__aio_get_pos(int trace_fd)
377 {
378 	return lseek(trace_fd, 0, SEEK_CUR);
379 }
380 
381 static void record__aio_set_pos(int trace_fd, off_t pos)
382 {
383 	lseek(trace_fd, pos, SEEK_SET);
384 }
385 
386 static void record__aio_mmap_read_sync(struct record *rec)
387 {
388 	int i;
389 	struct evlist *evlist = rec->evlist;
390 	struct mmap *maps = evlist->mmap;
391 
392 	if (!record__aio_enabled(rec))
393 		return;
394 
395 	for (i = 0; i < evlist->core.nr_mmaps; i++) {
396 		struct mmap *map = &maps[i];
397 
398 		if (map->core.base)
399 			record__aio_sync(map, true);
400 	}
401 }
402 
403 static int nr_cblocks_default = 1;
404 static int nr_cblocks_max = 4;
405 
406 static int record__aio_parse(const struct option *opt,
407 			     const char *str,
408 			     int unset)
409 {
410 	struct record_opts *opts = (struct record_opts *)opt->value;
411 
412 	if (unset) {
413 		opts->nr_cblocks = 0;
414 	} else {
415 		if (str)
416 			opts->nr_cblocks = strtol(str, NULL, 0);
417 		if (!opts->nr_cblocks)
418 			opts->nr_cblocks = nr_cblocks_default;
419 	}
420 
421 	return 0;
422 }
423 #else /* HAVE_AIO_SUPPORT */
424 static int nr_cblocks_max = 0;
425 
426 static int record__aio_push(struct record *rec __maybe_unused, struct mmap *map __maybe_unused,
427 			    off_t *off __maybe_unused)
428 {
429 	return -1;
430 }
431 
432 static off_t record__aio_get_pos(int trace_fd __maybe_unused)
433 {
434 	return -1;
435 }
436 
437 static void record__aio_set_pos(int trace_fd __maybe_unused, off_t pos __maybe_unused)
438 {
439 }
440 
441 static void record__aio_mmap_read_sync(struct record *rec __maybe_unused)
442 {
443 }
444 #endif
445 
446 static int record__aio_enabled(struct record *rec)
447 {
448 	return rec->opts.nr_cblocks > 0;
449 }
450 
451 #define MMAP_FLUSH_DEFAULT 1
452 static int record__mmap_flush_parse(const struct option *opt,
453 				    const char *str,
454 				    int unset)
455 {
456 	int flush_max;
457 	struct record_opts *opts = (struct record_opts *)opt->value;
458 	static struct parse_tag tags[] = {
459 			{ .tag  = 'B', .mult = 1       },
460 			{ .tag  = 'K', .mult = 1 << 10 },
461 			{ .tag  = 'M', .mult = 1 << 20 },
462 			{ .tag  = 'G', .mult = 1 << 30 },
463 			{ .tag  = 0 },
464 	};
465 
466 	if (unset)
467 		return 0;
468 
469 	if (str) {
470 		opts->mmap_flush = parse_tag_value(str, tags);
471 		if (opts->mmap_flush == (int)-1)
472 			opts->mmap_flush = strtol(str, NULL, 0);
473 	}
474 
475 	if (!opts->mmap_flush)
476 		opts->mmap_flush = MMAP_FLUSH_DEFAULT;
477 
478 	flush_max = evlist__mmap_size(opts->mmap_pages);
479 	flush_max /= 4;
480 	if (opts->mmap_flush > flush_max)
481 		opts->mmap_flush = flush_max;
482 
483 	return 0;
484 }
485 
486 #ifdef HAVE_ZSTD_SUPPORT
487 static unsigned int comp_level_default = 1;
488 
489 static int record__parse_comp_level(const struct option *opt, const char *str, int unset)
490 {
491 	struct record_opts *opts = opt->value;
492 
493 	if (unset) {
494 		opts->comp_level = 0;
495 	} else {
496 		if (str)
497 			opts->comp_level = strtol(str, NULL, 0);
498 		if (!opts->comp_level)
499 			opts->comp_level = comp_level_default;
500 	}
501 
502 	return 0;
503 }
504 #endif
505 static unsigned int comp_level_max = 22;
506 
507 static int record__comp_enabled(struct record *rec)
508 {
509 	return rec->opts.comp_level > 0;
510 }
511 
512 static int process_synthesized_event(struct perf_tool *tool,
513 				     union perf_event *event,
514 				     struct perf_sample *sample __maybe_unused,
515 				     struct machine *machine __maybe_unused)
516 {
517 	struct record *rec = container_of(tool, struct record, tool);
518 	return record__write(rec, NULL, event, event->header.size);
519 }
520 
521 static int process_locked_synthesized_event(struct perf_tool *tool,
522 				     union perf_event *event,
523 				     struct perf_sample *sample __maybe_unused,
524 				     struct machine *machine __maybe_unused)
525 {
526 	static pthread_mutex_t synth_lock = PTHREAD_MUTEX_INITIALIZER;
527 	int ret;
528 
529 	pthread_mutex_lock(&synth_lock);
530 	ret = process_synthesized_event(tool, event, sample, machine);
531 	pthread_mutex_unlock(&synth_lock);
532 	return ret;
533 }
534 
535 static int record__pushfn(struct mmap *map, void *to, void *bf, size_t size)
536 {
537 	struct record *rec = to;
538 
539 	if (record__comp_enabled(rec)) {
540 		size = zstd_compress(rec->session, map->data, mmap__mmap_len(map), bf, size);
541 		bf   = map->data;
542 	}
543 
544 	rec->samples++;
545 	return record__write(rec, map, bf, size);
546 }
547 
548 static volatile int signr = -1;
549 static volatile int child_finished;
550 #ifdef HAVE_EVENTFD_SUPPORT
551 static int done_fd = -1;
552 #endif
553 
554 static void sig_handler(int sig)
555 {
556 	if (sig == SIGCHLD)
557 		child_finished = 1;
558 	else
559 		signr = sig;
560 
561 	done = 1;
562 #ifdef HAVE_EVENTFD_SUPPORT
563 {
564 	u64 tmp = 1;
565 	/*
566 	 * It is possible for this signal handler to run after done is checked
567 	 * in the main loop, but before the perf counter fds are polled. If this
568 	 * happens, the poll() will continue to wait even though done is set,
569 	 * and will only break out if either another signal is received, or the
570 	 * counters are ready for read. To ensure the poll() doesn't sleep when
571 	 * done is set, use an eventfd (done_fd) to wake up the poll().
572 	 */
573 	if (write(done_fd, &tmp, sizeof(tmp)) < 0)
574 		pr_err("failed to signal wakeup fd, error: %m\n");
575 }
576 #endif // HAVE_EVENTFD_SUPPORT
577 }
578 
579 static void sigsegv_handler(int sig)
580 {
581 	perf_hooks__recover();
582 	sighandler_dump_stack(sig);
583 }
584 
585 static void record__sig_exit(void)
586 {
587 	if (signr == -1)
588 		return;
589 
590 	signal(signr, SIG_DFL);
591 	raise(signr);
592 }
593 
594 #ifdef HAVE_AUXTRACE_SUPPORT
595 
596 static int record__process_auxtrace(struct perf_tool *tool,
597 				    struct mmap *map,
598 				    union perf_event *event, void *data1,
599 				    size_t len1, void *data2, size_t len2)
600 {
601 	struct record *rec = container_of(tool, struct record, tool);
602 	struct perf_data *data = &rec->data;
603 	size_t padding;
604 	u8 pad[8] = {0};
605 
606 	if (!perf_data__is_pipe(data) && perf_data__is_single_file(data)) {
607 		off_t file_offset;
608 		int fd = perf_data__fd(data);
609 		int err;
610 
611 		file_offset = lseek(fd, 0, SEEK_CUR);
612 		if (file_offset == -1)
613 			return -1;
614 		err = auxtrace_index__auxtrace_event(&rec->session->auxtrace_index,
615 						     event, file_offset);
616 		if (err)
617 			return err;
618 	}
619 
620 	/* event.auxtrace.size includes padding, see __auxtrace_mmap__read() */
621 	padding = (len1 + len2) & 7;
622 	if (padding)
623 		padding = 8 - padding;
624 
625 	record__write(rec, map, event, event->header.size);
626 	record__write(rec, map, data1, len1);
627 	if (len2)
628 		record__write(rec, map, data2, len2);
629 	record__write(rec, map, &pad, padding);
630 
631 	return 0;
632 }
633 
634 static int record__auxtrace_mmap_read(struct record *rec,
635 				      struct mmap *map)
636 {
637 	int ret;
638 
639 	ret = auxtrace_mmap__read(map, rec->itr, &rec->tool,
640 				  record__process_auxtrace);
641 	if (ret < 0)
642 		return ret;
643 
644 	if (ret)
645 		rec->samples++;
646 
647 	return 0;
648 }
649 
650 static int record__auxtrace_mmap_read_snapshot(struct record *rec,
651 					       struct mmap *map)
652 {
653 	int ret;
654 
655 	ret = auxtrace_mmap__read_snapshot(map, rec->itr, &rec->tool,
656 					   record__process_auxtrace,
657 					   rec->opts.auxtrace_snapshot_size);
658 	if (ret < 0)
659 		return ret;
660 
661 	if (ret)
662 		rec->samples++;
663 
664 	return 0;
665 }
666 
667 static int record__auxtrace_read_snapshot_all(struct record *rec)
668 {
669 	int i;
670 	int rc = 0;
671 
672 	for (i = 0; i < rec->evlist->core.nr_mmaps; i++) {
673 		struct mmap *map = &rec->evlist->mmap[i];
674 
675 		if (!map->auxtrace_mmap.base)
676 			continue;
677 
678 		if (record__auxtrace_mmap_read_snapshot(rec, map) != 0) {
679 			rc = -1;
680 			goto out;
681 		}
682 	}
683 out:
684 	return rc;
685 }
686 
687 static void record__read_auxtrace_snapshot(struct record *rec, bool on_exit)
688 {
689 	pr_debug("Recording AUX area tracing snapshot\n");
690 	if (record__auxtrace_read_snapshot_all(rec) < 0) {
691 		trigger_error(&auxtrace_snapshot_trigger);
692 	} else {
693 		if (auxtrace_record__snapshot_finish(rec->itr, on_exit))
694 			trigger_error(&auxtrace_snapshot_trigger);
695 		else
696 			trigger_ready(&auxtrace_snapshot_trigger);
697 	}
698 }
699 
700 static int record__auxtrace_snapshot_exit(struct record *rec)
701 {
702 	if (trigger_is_error(&auxtrace_snapshot_trigger))
703 		return 0;
704 
705 	if (!auxtrace_record__snapshot_started &&
706 	    auxtrace_record__snapshot_start(rec->itr))
707 		return -1;
708 
709 	record__read_auxtrace_snapshot(rec, true);
710 	if (trigger_is_error(&auxtrace_snapshot_trigger))
711 		return -1;
712 
713 	return 0;
714 }
715 
716 static int record__auxtrace_init(struct record *rec)
717 {
718 	int err;
719 
720 	if (!rec->itr) {
721 		rec->itr = auxtrace_record__init(rec->evlist, &err);
722 		if (err)
723 			return err;
724 	}
725 
726 	err = auxtrace_parse_snapshot_options(rec->itr, &rec->opts,
727 					      rec->opts.auxtrace_snapshot_opts);
728 	if (err)
729 		return err;
730 
731 	err = auxtrace_parse_sample_options(rec->itr, rec->evlist, &rec->opts,
732 					    rec->opts.auxtrace_sample_opts);
733 	if (err)
734 		return err;
735 
736 	auxtrace_regroup_aux_output(rec->evlist);
737 
738 	return auxtrace_parse_filters(rec->evlist);
739 }
740 
741 #else
742 
743 static inline
744 int record__auxtrace_mmap_read(struct record *rec __maybe_unused,
745 			       struct mmap *map __maybe_unused)
746 {
747 	return 0;
748 }
749 
750 static inline
751 void record__read_auxtrace_snapshot(struct record *rec __maybe_unused,
752 				    bool on_exit __maybe_unused)
753 {
754 }
755 
756 static inline
757 int auxtrace_record__snapshot_start(struct auxtrace_record *itr __maybe_unused)
758 {
759 	return 0;
760 }
761 
762 static inline
763 int record__auxtrace_snapshot_exit(struct record *rec __maybe_unused)
764 {
765 	return 0;
766 }
767 
768 static int record__auxtrace_init(struct record *rec __maybe_unused)
769 {
770 	return 0;
771 }
772 
773 #endif
774 
775 static int record__config_text_poke(struct evlist *evlist)
776 {
777 	struct evsel *evsel;
778 	int err;
779 
780 	/* Nothing to do if text poke is already configured */
781 	evlist__for_each_entry(evlist, evsel) {
782 		if (evsel->core.attr.text_poke)
783 			return 0;
784 	}
785 
786 	err = parse_events(evlist, "dummy:u", NULL);
787 	if (err)
788 		return err;
789 
790 	evsel = evlist__last(evlist);
791 
792 	evsel->core.attr.freq = 0;
793 	evsel->core.attr.sample_period = 1;
794 	evsel->core.attr.text_poke = 1;
795 	evsel->core.attr.ksymbol = 1;
796 
797 	evsel->core.system_wide = true;
798 	evsel->no_aux_samples = true;
799 	evsel->immediate = true;
800 
801 	/* Text poke must be collected on all CPUs */
802 	perf_cpu_map__put(evsel->core.own_cpus);
803 	evsel->core.own_cpus = perf_cpu_map__new(NULL);
804 	perf_cpu_map__put(evsel->core.cpus);
805 	evsel->core.cpus = perf_cpu_map__get(evsel->core.own_cpus);
806 
807 	evsel__set_sample_bit(evsel, TIME);
808 
809 	return 0;
810 }
811 
812 static bool record__kcore_readable(struct machine *machine)
813 {
814 	char kcore[PATH_MAX];
815 	int fd;
816 
817 	scnprintf(kcore, sizeof(kcore), "%s/proc/kcore", machine->root_dir);
818 
819 	fd = open(kcore, O_RDONLY);
820 	if (fd < 0)
821 		return false;
822 
823 	close(fd);
824 
825 	return true;
826 }
827 
828 static int record__kcore_copy(struct machine *machine, struct perf_data *data)
829 {
830 	char from_dir[PATH_MAX];
831 	char kcore_dir[PATH_MAX];
832 	int ret;
833 
834 	snprintf(from_dir, sizeof(from_dir), "%s/proc", machine->root_dir);
835 
836 	ret = perf_data__make_kcore_dir(data, kcore_dir, sizeof(kcore_dir));
837 	if (ret)
838 		return ret;
839 
840 	return kcore_copy(from_dir, kcore_dir);
841 }
842 
843 static int record__mmap_evlist(struct record *rec,
844 			       struct evlist *evlist)
845 {
846 	struct record_opts *opts = &rec->opts;
847 	bool auxtrace_overwrite = opts->auxtrace_snapshot_mode ||
848 				  opts->auxtrace_sample_mode;
849 	char msg[512];
850 
851 	if (opts->affinity != PERF_AFFINITY_SYS)
852 		cpu__setup_cpunode_map();
853 
854 	if (evlist__mmap_ex(evlist, opts->mmap_pages,
855 				 opts->auxtrace_mmap_pages,
856 				 auxtrace_overwrite,
857 				 opts->nr_cblocks, opts->affinity,
858 				 opts->mmap_flush, opts->comp_level) < 0) {
859 		if (errno == EPERM) {
860 			pr_err("Permission error mapping pages.\n"
861 			       "Consider increasing "
862 			       "/proc/sys/kernel/perf_event_mlock_kb,\n"
863 			       "or try again with a smaller value of -m/--mmap_pages.\n"
864 			       "(current value: %u,%u)\n",
865 			       opts->mmap_pages, opts->auxtrace_mmap_pages);
866 			return -errno;
867 		} else {
868 			pr_err("failed to mmap with %d (%s)\n", errno,
869 				str_error_r(errno, msg, sizeof(msg)));
870 			if (errno)
871 				return -errno;
872 			else
873 				return -EINVAL;
874 		}
875 	}
876 	return 0;
877 }
878 
879 static int record__mmap(struct record *rec)
880 {
881 	return record__mmap_evlist(rec, rec->evlist);
882 }
883 
884 static int record__open(struct record *rec)
885 {
886 	char msg[BUFSIZ];
887 	struct evsel *pos;
888 	struct evlist *evlist = rec->evlist;
889 	struct perf_session *session = rec->session;
890 	struct record_opts *opts = &rec->opts;
891 	int rc = 0;
892 
893 	/*
894 	 * For initial_delay, system wide or a hybrid system, we need to add a
895 	 * dummy event so that we can track PERF_RECORD_MMAP to cover the delay
896 	 * of waiting or event synthesis.
897 	 */
898 	if (opts->initial_delay || target__has_cpu(&opts->target) ||
899 	    perf_pmu__has_hybrid()) {
900 		pos = evlist__get_tracking_event(evlist);
901 		if (!evsel__is_dummy_event(pos)) {
902 			/* Set up dummy event. */
903 			if (evlist__add_dummy(evlist))
904 				return -ENOMEM;
905 			pos = evlist__last(evlist);
906 			evlist__set_tracking_event(evlist, pos);
907 		}
908 
909 		/*
910 		 * Enable the dummy event when the process is forked for
911 		 * initial_delay, immediately for system wide.
912 		 */
913 		if (opts->initial_delay && !pos->immediate)
914 			pos->core.attr.enable_on_exec = 1;
915 		else
916 			pos->immediate = 1;
917 	}
918 
919 	evlist__config(evlist, opts, &callchain_param);
920 
921 	evlist__for_each_entry(evlist, pos) {
922 try_again:
923 		if (evsel__open(pos, pos->core.cpus, pos->core.threads) < 0) {
924 			if (evsel__fallback(pos, errno, msg, sizeof(msg))) {
925 				if (verbose > 0)
926 					ui__warning("%s\n", msg);
927 				goto try_again;
928 			}
929 			if ((errno == EINVAL || errno == EBADF) &&
930 			    pos->core.leader != &pos->core &&
931 			    pos->weak_group) {
932 			        pos = evlist__reset_weak_group(evlist, pos, true);
933 				goto try_again;
934 			}
935 			rc = -errno;
936 			evsel__open_strerror(pos, &opts->target, errno, msg, sizeof(msg));
937 			ui__error("%s\n", msg);
938 			goto out;
939 		}
940 
941 		pos->supported = true;
942 	}
943 
944 	if (symbol_conf.kptr_restrict && !evlist__exclude_kernel(evlist)) {
945 		pr_warning(
946 "WARNING: Kernel address maps (/proc/{kallsyms,modules}) are restricted,\n"
947 "check /proc/sys/kernel/kptr_restrict and /proc/sys/kernel/perf_event_paranoid.\n\n"
948 "Samples in kernel functions may not be resolved if a suitable vmlinux\n"
949 "file is not found in the buildid cache or in the vmlinux path.\n\n"
950 "Samples in kernel modules won't be resolved at all.\n\n"
951 "If some relocation was applied (e.g. kexec) symbols may be misresolved\n"
952 "even with a suitable vmlinux or kallsyms file.\n\n");
953 	}
954 
955 	if (evlist__apply_filters(evlist, &pos)) {
956 		pr_err("failed to set filter \"%s\" on event %s with %d (%s)\n",
957 			pos->filter, evsel__name(pos), errno,
958 			str_error_r(errno, msg, sizeof(msg)));
959 		rc = -1;
960 		goto out;
961 	}
962 
963 	rc = record__mmap(rec);
964 	if (rc)
965 		goto out;
966 
967 	session->evlist = evlist;
968 	perf_session__set_id_hdr_size(session);
969 out:
970 	return rc;
971 }
972 
973 static void set_timestamp_boundary(struct record *rec, u64 sample_time)
974 {
975 	if (rec->evlist->first_sample_time == 0)
976 		rec->evlist->first_sample_time = sample_time;
977 
978 	if (sample_time)
979 		rec->evlist->last_sample_time = sample_time;
980 }
981 
982 static int process_sample_event(struct perf_tool *tool,
983 				union perf_event *event,
984 				struct perf_sample *sample,
985 				struct evsel *evsel,
986 				struct machine *machine)
987 {
988 	struct record *rec = container_of(tool, struct record, tool);
989 
990 	set_timestamp_boundary(rec, sample->time);
991 
992 	if (rec->buildid_all)
993 		return 0;
994 
995 	rec->samples++;
996 	return build_id__mark_dso_hit(tool, event, sample, evsel, machine);
997 }
998 
999 static int process_buildids(struct record *rec)
1000 {
1001 	struct perf_session *session = rec->session;
1002 
1003 	if (perf_data__size(&rec->data) == 0)
1004 		return 0;
1005 
1006 	/*
1007 	 * During this process, it'll load kernel map and replace the
1008 	 * dso->long_name to a real pathname it found.  In this case
1009 	 * we prefer the vmlinux path like
1010 	 *   /lib/modules/3.16.4/build/vmlinux
1011 	 *
1012 	 * rather than build-id path (in debug directory).
1013 	 *   $HOME/.debug/.build-id/f0/6e17aa50adf4d00b88925e03775de107611551
1014 	 */
1015 	symbol_conf.ignore_vmlinux_buildid = true;
1016 
1017 	/*
1018 	 * If --buildid-all is given, it marks all DSO regardless of hits,
1019 	 * so no need to process samples. But if timestamp_boundary is enabled,
1020 	 * it still needs to walk on all samples to get the timestamps of
1021 	 * first/last samples.
1022 	 */
1023 	if (rec->buildid_all && !rec->timestamp_boundary)
1024 		rec->tool.sample = NULL;
1025 
1026 	return perf_session__process_events(session);
1027 }
1028 
1029 static void perf_event__synthesize_guest_os(struct machine *machine, void *data)
1030 {
1031 	int err;
1032 	struct perf_tool *tool = data;
1033 	/*
1034 	 *As for guest kernel when processing subcommand record&report,
1035 	 *we arrange module mmap prior to guest kernel mmap and trigger
1036 	 *a preload dso because default guest module symbols are loaded
1037 	 *from guest kallsyms instead of /lib/modules/XXX/XXX. This
1038 	 *method is used to avoid symbol missing when the first addr is
1039 	 *in module instead of in guest kernel.
1040 	 */
1041 	err = perf_event__synthesize_modules(tool, process_synthesized_event,
1042 					     machine);
1043 	if (err < 0)
1044 		pr_err("Couldn't record guest kernel [%d]'s reference"
1045 		       " relocation symbol.\n", machine->pid);
1046 
1047 	/*
1048 	 * We use _stext for guest kernel because guest kernel's /proc/kallsyms
1049 	 * have no _text sometimes.
1050 	 */
1051 	err = perf_event__synthesize_kernel_mmap(tool, process_synthesized_event,
1052 						 machine);
1053 	if (err < 0)
1054 		pr_err("Couldn't record guest kernel [%d]'s reference"
1055 		       " relocation symbol.\n", machine->pid);
1056 }
1057 
1058 static struct perf_event_header finished_round_event = {
1059 	.size = sizeof(struct perf_event_header),
1060 	.type = PERF_RECORD_FINISHED_ROUND,
1061 };
1062 
1063 static void record__adjust_affinity(struct record *rec, struct mmap *map)
1064 {
1065 	if (rec->opts.affinity != PERF_AFFINITY_SYS &&
1066 	    !bitmap_equal(rec->affinity_mask.bits, map->affinity_mask.bits,
1067 			  rec->affinity_mask.nbits)) {
1068 		bitmap_zero(rec->affinity_mask.bits, rec->affinity_mask.nbits);
1069 		bitmap_or(rec->affinity_mask.bits, rec->affinity_mask.bits,
1070 			  map->affinity_mask.bits, rec->affinity_mask.nbits);
1071 		sched_setaffinity(0, MMAP_CPU_MASK_BYTES(&rec->affinity_mask),
1072 				  (cpu_set_t *)rec->affinity_mask.bits);
1073 		if (verbose == 2)
1074 			mmap_cpu_mask__scnprintf(&rec->affinity_mask, "thread");
1075 	}
1076 }
1077 
1078 static size_t process_comp_header(void *record, size_t increment)
1079 {
1080 	struct perf_record_compressed *event = record;
1081 	size_t size = sizeof(*event);
1082 
1083 	if (increment) {
1084 		event->header.size += increment;
1085 		return increment;
1086 	}
1087 
1088 	event->header.type = PERF_RECORD_COMPRESSED;
1089 	event->header.size = size;
1090 
1091 	return size;
1092 }
1093 
1094 static size_t zstd_compress(struct perf_session *session, void *dst, size_t dst_size,
1095 			    void *src, size_t src_size)
1096 {
1097 	size_t compressed;
1098 	size_t max_record_size = PERF_SAMPLE_MAX_SIZE - sizeof(struct perf_record_compressed) - 1;
1099 
1100 	compressed = zstd_compress_stream_to_records(&session->zstd_data, dst, dst_size, src, src_size,
1101 						     max_record_size, process_comp_header);
1102 
1103 	session->bytes_transferred += src_size;
1104 	session->bytes_compressed  += compressed;
1105 
1106 	return compressed;
1107 }
1108 
1109 static int record__mmap_read_evlist(struct record *rec, struct evlist *evlist,
1110 				    bool overwrite, bool synch)
1111 {
1112 	u64 bytes_written = rec->bytes_written;
1113 	int i;
1114 	int rc = 0;
1115 	struct mmap *maps;
1116 	int trace_fd = rec->data.file.fd;
1117 	off_t off = 0;
1118 
1119 	if (!evlist)
1120 		return 0;
1121 
1122 	maps = overwrite ? evlist->overwrite_mmap : evlist->mmap;
1123 	if (!maps)
1124 		return 0;
1125 
1126 	if (overwrite && evlist->bkw_mmap_state != BKW_MMAP_DATA_PENDING)
1127 		return 0;
1128 
1129 	if (record__aio_enabled(rec))
1130 		off = record__aio_get_pos(trace_fd);
1131 
1132 	for (i = 0; i < evlist->core.nr_mmaps; i++) {
1133 		u64 flush = 0;
1134 		struct mmap *map = &maps[i];
1135 
1136 		if (map->core.base) {
1137 			record__adjust_affinity(rec, map);
1138 			if (synch) {
1139 				flush = map->core.flush;
1140 				map->core.flush = 1;
1141 			}
1142 			if (!record__aio_enabled(rec)) {
1143 				if (perf_mmap__push(map, rec, record__pushfn) < 0) {
1144 					if (synch)
1145 						map->core.flush = flush;
1146 					rc = -1;
1147 					goto out;
1148 				}
1149 			} else {
1150 				if (record__aio_push(rec, map, &off) < 0) {
1151 					record__aio_set_pos(trace_fd, off);
1152 					if (synch)
1153 						map->core.flush = flush;
1154 					rc = -1;
1155 					goto out;
1156 				}
1157 			}
1158 			if (synch)
1159 				map->core.flush = flush;
1160 		}
1161 
1162 		if (map->auxtrace_mmap.base && !rec->opts.auxtrace_snapshot_mode &&
1163 		    !rec->opts.auxtrace_sample_mode &&
1164 		    record__auxtrace_mmap_read(rec, map) != 0) {
1165 			rc = -1;
1166 			goto out;
1167 		}
1168 	}
1169 
1170 	if (record__aio_enabled(rec))
1171 		record__aio_set_pos(trace_fd, off);
1172 
1173 	/*
1174 	 * Mark the round finished in case we wrote
1175 	 * at least one event.
1176 	 */
1177 	if (bytes_written != rec->bytes_written)
1178 		rc = record__write(rec, NULL, &finished_round_event, sizeof(finished_round_event));
1179 
1180 	if (overwrite)
1181 		evlist__toggle_bkw_mmap(evlist, BKW_MMAP_EMPTY);
1182 out:
1183 	return rc;
1184 }
1185 
1186 static int record__mmap_read_all(struct record *rec, bool synch)
1187 {
1188 	int err;
1189 
1190 	err = record__mmap_read_evlist(rec, rec->evlist, false, synch);
1191 	if (err)
1192 		return err;
1193 
1194 	return record__mmap_read_evlist(rec, rec->evlist, true, synch);
1195 }
1196 
1197 static void record__init_features(struct record *rec)
1198 {
1199 	struct perf_session *session = rec->session;
1200 	int feat;
1201 
1202 	for (feat = HEADER_FIRST_FEATURE; feat < HEADER_LAST_FEATURE; feat++)
1203 		perf_header__set_feat(&session->header, feat);
1204 
1205 	if (rec->no_buildid)
1206 		perf_header__clear_feat(&session->header, HEADER_BUILD_ID);
1207 
1208 	if (!have_tracepoints(&rec->evlist->core.entries))
1209 		perf_header__clear_feat(&session->header, HEADER_TRACING_DATA);
1210 
1211 	if (!rec->opts.branch_stack)
1212 		perf_header__clear_feat(&session->header, HEADER_BRANCH_STACK);
1213 
1214 	if (!rec->opts.full_auxtrace)
1215 		perf_header__clear_feat(&session->header, HEADER_AUXTRACE);
1216 
1217 	if (!(rec->opts.use_clockid && rec->opts.clockid_res_ns))
1218 		perf_header__clear_feat(&session->header, HEADER_CLOCKID);
1219 
1220 	if (!rec->opts.use_clockid)
1221 		perf_header__clear_feat(&session->header, HEADER_CLOCK_DATA);
1222 
1223 	perf_header__clear_feat(&session->header, HEADER_DIR_FORMAT);
1224 	if (!record__comp_enabled(rec))
1225 		perf_header__clear_feat(&session->header, HEADER_COMPRESSED);
1226 
1227 	perf_header__clear_feat(&session->header, HEADER_STAT);
1228 }
1229 
1230 static void
1231 record__finish_output(struct record *rec)
1232 {
1233 	struct perf_data *data = &rec->data;
1234 	int fd = perf_data__fd(data);
1235 
1236 	if (data->is_pipe)
1237 		return;
1238 
1239 	rec->session->header.data_size += rec->bytes_written;
1240 	data->file.size = lseek(perf_data__fd(data), 0, SEEK_CUR);
1241 
1242 	if (!rec->no_buildid) {
1243 		process_buildids(rec);
1244 
1245 		if (rec->buildid_all)
1246 			dsos__hit_all(rec->session);
1247 	}
1248 	perf_session__write_header(rec->session, rec->evlist, fd, true);
1249 
1250 	return;
1251 }
1252 
1253 static int record__synthesize_workload(struct record *rec, bool tail)
1254 {
1255 	int err;
1256 	struct perf_thread_map *thread_map;
1257 
1258 	if (rec->opts.tail_synthesize != tail)
1259 		return 0;
1260 
1261 	thread_map = thread_map__new_by_tid(rec->evlist->workload.pid);
1262 	if (thread_map == NULL)
1263 		return -1;
1264 
1265 	err = perf_event__synthesize_thread_map(&rec->tool, thread_map,
1266 						 process_synthesized_event,
1267 						 &rec->session->machines.host,
1268 						 rec->opts.sample_address);
1269 	perf_thread_map__put(thread_map);
1270 	return err;
1271 }
1272 
1273 static int record__synthesize(struct record *rec, bool tail);
1274 
1275 static int
1276 record__switch_output(struct record *rec, bool at_exit)
1277 {
1278 	struct perf_data *data = &rec->data;
1279 	int fd, err;
1280 	char *new_filename;
1281 
1282 	/* Same Size:      "2015122520103046"*/
1283 	char timestamp[] = "InvalidTimestamp";
1284 
1285 	record__aio_mmap_read_sync(rec);
1286 
1287 	record__synthesize(rec, true);
1288 	if (target__none(&rec->opts.target))
1289 		record__synthesize_workload(rec, true);
1290 
1291 	rec->samples = 0;
1292 	record__finish_output(rec);
1293 	err = fetch_current_timestamp(timestamp, sizeof(timestamp));
1294 	if (err) {
1295 		pr_err("Failed to get current timestamp\n");
1296 		return -EINVAL;
1297 	}
1298 
1299 	fd = perf_data__switch(data, timestamp,
1300 				    rec->session->header.data_offset,
1301 				    at_exit, &new_filename);
1302 	if (fd >= 0 && !at_exit) {
1303 		rec->bytes_written = 0;
1304 		rec->session->header.data_size = 0;
1305 	}
1306 
1307 	if (!quiet)
1308 		fprintf(stderr, "[ perf record: Dump %s.%s ]\n",
1309 			data->path, timestamp);
1310 
1311 	if (rec->switch_output.num_files) {
1312 		int n = rec->switch_output.cur_file + 1;
1313 
1314 		if (n >= rec->switch_output.num_files)
1315 			n = 0;
1316 		rec->switch_output.cur_file = n;
1317 		if (rec->switch_output.filenames[n]) {
1318 			remove(rec->switch_output.filenames[n]);
1319 			zfree(&rec->switch_output.filenames[n]);
1320 		}
1321 		rec->switch_output.filenames[n] = new_filename;
1322 	} else {
1323 		free(new_filename);
1324 	}
1325 
1326 	/* Output tracking events */
1327 	if (!at_exit) {
1328 		record__synthesize(rec, false);
1329 
1330 		/*
1331 		 * In 'perf record --switch-output' without -a,
1332 		 * record__synthesize() in record__switch_output() won't
1333 		 * generate tracking events because there's no thread_map
1334 		 * in evlist. Which causes newly created perf.data doesn't
1335 		 * contain map and comm information.
1336 		 * Create a fake thread_map and directly call
1337 		 * perf_event__synthesize_thread_map() for those events.
1338 		 */
1339 		if (target__none(&rec->opts.target))
1340 			record__synthesize_workload(rec, false);
1341 	}
1342 	return fd;
1343 }
1344 
1345 static volatile int workload_exec_errno;
1346 
1347 /*
1348  * evlist__prepare_workload will send a SIGUSR1
1349  * if the fork fails, since we asked by setting its
1350  * want_signal to true.
1351  */
1352 static void workload_exec_failed_signal(int signo __maybe_unused,
1353 					siginfo_t *info,
1354 					void *ucontext __maybe_unused)
1355 {
1356 	workload_exec_errno = info->si_value.sival_int;
1357 	done = 1;
1358 	child_finished = 1;
1359 }
1360 
1361 static void snapshot_sig_handler(int sig);
1362 static void alarm_sig_handler(int sig);
1363 
1364 static const struct perf_event_mmap_page *evlist__pick_pc(struct evlist *evlist)
1365 {
1366 	if (evlist) {
1367 		if (evlist->mmap && evlist->mmap[0].core.base)
1368 			return evlist->mmap[0].core.base;
1369 		if (evlist->overwrite_mmap && evlist->overwrite_mmap[0].core.base)
1370 			return evlist->overwrite_mmap[0].core.base;
1371 	}
1372 	return NULL;
1373 }
1374 
1375 static const struct perf_event_mmap_page *record__pick_pc(struct record *rec)
1376 {
1377 	const struct perf_event_mmap_page *pc = evlist__pick_pc(rec->evlist);
1378 	if (pc)
1379 		return pc;
1380 	return NULL;
1381 }
1382 
1383 static int record__synthesize(struct record *rec, bool tail)
1384 {
1385 	struct perf_session *session = rec->session;
1386 	struct machine *machine = &session->machines.host;
1387 	struct perf_data *data = &rec->data;
1388 	struct record_opts *opts = &rec->opts;
1389 	struct perf_tool *tool = &rec->tool;
1390 	int err = 0;
1391 	event_op f = process_synthesized_event;
1392 
1393 	if (rec->opts.tail_synthesize != tail)
1394 		return 0;
1395 
1396 	if (data->is_pipe) {
1397 		err = perf_event__synthesize_for_pipe(tool, session, data,
1398 						      process_synthesized_event);
1399 		if (err < 0)
1400 			goto out;
1401 
1402 		rec->bytes_written += err;
1403 	}
1404 
1405 	err = perf_event__synth_time_conv(record__pick_pc(rec), tool,
1406 					  process_synthesized_event, machine);
1407 	if (err)
1408 		goto out;
1409 
1410 	/* Synthesize id_index before auxtrace_info */
1411 	if (rec->opts.auxtrace_sample_mode) {
1412 		err = perf_event__synthesize_id_index(tool,
1413 						      process_synthesized_event,
1414 						      session->evlist, machine);
1415 		if (err)
1416 			goto out;
1417 	}
1418 
1419 	if (rec->opts.full_auxtrace) {
1420 		err = perf_event__synthesize_auxtrace_info(rec->itr, tool,
1421 					session, process_synthesized_event);
1422 		if (err)
1423 			goto out;
1424 	}
1425 
1426 	if (!evlist__exclude_kernel(rec->evlist)) {
1427 		err = perf_event__synthesize_kernel_mmap(tool, process_synthesized_event,
1428 							 machine);
1429 		WARN_ONCE(err < 0, "Couldn't record kernel reference relocation symbol\n"
1430 				   "Symbol resolution may be skewed if relocation was used (e.g. kexec).\n"
1431 				   "Check /proc/kallsyms permission or run as root.\n");
1432 
1433 		err = perf_event__synthesize_modules(tool, process_synthesized_event,
1434 						     machine);
1435 		WARN_ONCE(err < 0, "Couldn't record kernel module information.\n"
1436 				   "Symbol resolution may be skewed if relocation was used (e.g. kexec).\n"
1437 				   "Check /proc/modules permission or run as root.\n");
1438 	}
1439 
1440 	if (perf_guest) {
1441 		machines__process_guests(&session->machines,
1442 					 perf_event__synthesize_guest_os, tool);
1443 	}
1444 
1445 	err = perf_event__synthesize_extra_attr(&rec->tool,
1446 						rec->evlist,
1447 						process_synthesized_event,
1448 						data->is_pipe);
1449 	if (err)
1450 		goto out;
1451 
1452 	err = perf_event__synthesize_thread_map2(&rec->tool, rec->evlist->core.threads,
1453 						 process_synthesized_event,
1454 						NULL);
1455 	if (err < 0) {
1456 		pr_err("Couldn't synthesize thread map.\n");
1457 		return err;
1458 	}
1459 
1460 	err = perf_event__synthesize_cpu_map(&rec->tool, rec->evlist->core.cpus,
1461 					     process_synthesized_event, NULL);
1462 	if (err < 0) {
1463 		pr_err("Couldn't synthesize cpu map.\n");
1464 		return err;
1465 	}
1466 
1467 	err = perf_event__synthesize_bpf_events(session, process_synthesized_event,
1468 						machine, opts);
1469 	if (err < 0)
1470 		pr_warning("Couldn't synthesize bpf events.\n");
1471 
1472 	err = perf_event__synthesize_cgroups(tool, process_synthesized_event,
1473 					     machine);
1474 	if (err < 0)
1475 		pr_warning("Couldn't synthesize cgroup events.\n");
1476 
1477 	if (rec->opts.nr_threads_synthesize > 1) {
1478 		perf_set_multithreaded();
1479 		f = process_locked_synthesized_event;
1480 	}
1481 
1482 	err = __machine__synthesize_threads(machine, tool, &opts->target, rec->evlist->core.threads,
1483 					    f, opts->sample_address,
1484 					    rec->opts.nr_threads_synthesize);
1485 
1486 	if (rec->opts.nr_threads_synthesize > 1)
1487 		perf_set_singlethreaded();
1488 
1489 out:
1490 	return err;
1491 }
1492 
1493 static int record__process_signal_event(union perf_event *event __maybe_unused, void *data)
1494 {
1495 	struct record *rec = data;
1496 	pthread_kill(rec->thread_id, SIGUSR2);
1497 	return 0;
1498 }
1499 
1500 static int record__setup_sb_evlist(struct record *rec)
1501 {
1502 	struct record_opts *opts = &rec->opts;
1503 
1504 	if (rec->sb_evlist != NULL) {
1505 		/*
1506 		 * We get here if --switch-output-event populated the
1507 		 * sb_evlist, so associate a callback that will send a SIGUSR2
1508 		 * to the main thread.
1509 		 */
1510 		evlist__set_cb(rec->sb_evlist, record__process_signal_event, rec);
1511 		rec->thread_id = pthread_self();
1512 	}
1513 #ifdef HAVE_LIBBPF_SUPPORT
1514 	if (!opts->no_bpf_event) {
1515 		if (rec->sb_evlist == NULL) {
1516 			rec->sb_evlist = evlist__new();
1517 
1518 			if (rec->sb_evlist == NULL) {
1519 				pr_err("Couldn't create side band evlist.\n.");
1520 				return -1;
1521 			}
1522 		}
1523 
1524 		if (evlist__add_bpf_sb_event(rec->sb_evlist, &rec->session->header.env)) {
1525 			pr_err("Couldn't ask for PERF_RECORD_BPF_EVENT side band events.\n.");
1526 			return -1;
1527 		}
1528 	}
1529 #endif
1530 	if (evlist__start_sb_thread(rec->sb_evlist, &rec->opts.target)) {
1531 		pr_debug("Couldn't start the BPF side band thread:\nBPF programs starting from now on won't be annotatable\n");
1532 		opts->no_bpf_event = true;
1533 	}
1534 
1535 	return 0;
1536 }
1537 
1538 static int record__init_clock(struct record *rec)
1539 {
1540 	struct perf_session *session = rec->session;
1541 	struct timespec ref_clockid;
1542 	struct timeval ref_tod;
1543 	u64 ref;
1544 
1545 	if (!rec->opts.use_clockid)
1546 		return 0;
1547 
1548 	if (rec->opts.use_clockid && rec->opts.clockid_res_ns)
1549 		session->header.env.clock.clockid_res_ns = rec->opts.clockid_res_ns;
1550 
1551 	session->header.env.clock.clockid = rec->opts.clockid;
1552 
1553 	if (gettimeofday(&ref_tod, NULL) != 0) {
1554 		pr_err("gettimeofday failed, cannot set reference time.\n");
1555 		return -1;
1556 	}
1557 
1558 	if (clock_gettime(rec->opts.clockid, &ref_clockid)) {
1559 		pr_err("clock_gettime failed, cannot set reference time.\n");
1560 		return -1;
1561 	}
1562 
1563 	ref = (u64) ref_tod.tv_sec * NSEC_PER_SEC +
1564 	      (u64) ref_tod.tv_usec * NSEC_PER_USEC;
1565 
1566 	session->header.env.clock.tod_ns = ref;
1567 
1568 	ref = (u64) ref_clockid.tv_sec * NSEC_PER_SEC +
1569 	      (u64) ref_clockid.tv_nsec;
1570 
1571 	session->header.env.clock.clockid_ns = ref;
1572 	return 0;
1573 }
1574 
1575 static void hit_auxtrace_snapshot_trigger(struct record *rec)
1576 {
1577 	if (trigger_is_ready(&auxtrace_snapshot_trigger)) {
1578 		trigger_hit(&auxtrace_snapshot_trigger);
1579 		auxtrace_record__snapshot_started = 1;
1580 		if (auxtrace_record__snapshot_start(rec->itr))
1581 			trigger_error(&auxtrace_snapshot_trigger);
1582 	}
1583 }
1584 
1585 static void record__uniquify_name(struct record *rec)
1586 {
1587 	struct evsel *pos;
1588 	struct evlist *evlist = rec->evlist;
1589 	char *new_name;
1590 	int ret;
1591 
1592 	if (!perf_pmu__has_hybrid())
1593 		return;
1594 
1595 	evlist__for_each_entry(evlist, pos) {
1596 		if (!evsel__is_hybrid(pos))
1597 			continue;
1598 
1599 		if (strchr(pos->name, '/'))
1600 			continue;
1601 
1602 		ret = asprintf(&new_name, "%s/%s/",
1603 			       pos->pmu_name, pos->name);
1604 		if (ret) {
1605 			free(pos->name);
1606 			pos->name = new_name;
1607 		}
1608 	}
1609 }
1610 
1611 static int __cmd_record(struct record *rec, int argc, const char **argv)
1612 {
1613 	int err;
1614 	int status = 0;
1615 	unsigned long waking = 0;
1616 	const bool forks = argc > 0;
1617 	struct perf_tool *tool = &rec->tool;
1618 	struct record_opts *opts = &rec->opts;
1619 	struct perf_data *data = &rec->data;
1620 	struct perf_session *session;
1621 	bool disabled = false, draining = false;
1622 	int fd;
1623 	float ratio = 0;
1624 	enum evlist_ctl_cmd cmd = EVLIST_CTL_CMD_UNSUPPORTED;
1625 
1626 	atexit(record__sig_exit);
1627 	signal(SIGCHLD, sig_handler);
1628 	signal(SIGINT, sig_handler);
1629 	signal(SIGTERM, sig_handler);
1630 	signal(SIGSEGV, sigsegv_handler);
1631 
1632 	if (rec->opts.record_namespaces)
1633 		tool->namespace_events = true;
1634 
1635 	if (rec->opts.record_cgroup) {
1636 #ifdef HAVE_FILE_HANDLE
1637 		tool->cgroup_events = true;
1638 #else
1639 		pr_err("cgroup tracking is not supported\n");
1640 		return -1;
1641 #endif
1642 	}
1643 
1644 	if (rec->opts.auxtrace_snapshot_mode || rec->switch_output.enabled) {
1645 		signal(SIGUSR2, snapshot_sig_handler);
1646 		if (rec->opts.auxtrace_snapshot_mode)
1647 			trigger_on(&auxtrace_snapshot_trigger);
1648 		if (rec->switch_output.enabled)
1649 			trigger_on(&switch_output_trigger);
1650 	} else {
1651 		signal(SIGUSR2, SIG_IGN);
1652 	}
1653 
1654 	session = perf_session__new(data, tool);
1655 	if (IS_ERR(session)) {
1656 		pr_err("Perf session creation failed.\n");
1657 		return PTR_ERR(session);
1658 	}
1659 
1660 	fd = perf_data__fd(data);
1661 	rec->session = session;
1662 
1663 	if (zstd_init(&session->zstd_data, rec->opts.comp_level) < 0) {
1664 		pr_err("Compression initialization failed.\n");
1665 		return -1;
1666 	}
1667 #ifdef HAVE_EVENTFD_SUPPORT
1668 	done_fd = eventfd(0, EFD_NONBLOCK);
1669 	if (done_fd < 0) {
1670 		pr_err("Failed to create wakeup eventfd, error: %m\n");
1671 		status = -1;
1672 		goto out_delete_session;
1673 	}
1674 	err = evlist__add_wakeup_eventfd(rec->evlist, done_fd);
1675 	if (err < 0) {
1676 		pr_err("Failed to add wakeup eventfd to poll list\n");
1677 		status = err;
1678 		goto out_delete_session;
1679 	}
1680 #endif // HAVE_EVENTFD_SUPPORT
1681 
1682 	session->header.env.comp_type  = PERF_COMP_ZSTD;
1683 	session->header.env.comp_level = rec->opts.comp_level;
1684 
1685 	if (rec->opts.kcore &&
1686 	    !record__kcore_readable(&session->machines.host)) {
1687 		pr_err("ERROR: kcore is not readable.\n");
1688 		return -1;
1689 	}
1690 
1691 	if (record__init_clock(rec))
1692 		return -1;
1693 
1694 	record__init_features(rec);
1695 
1696 	if (forks) {
1697 		err = evlist__prepare_workload(rec->evlist, &opts->target, argv, data->is_pipe,
1698 					       workload_exec_failed_signal);
1699 		if (err < 0) {
1700 			pr_err("Couldn't run the workload!\n");
1701 			status = err;
1702 			goto out_delete_session;
1703 		}
1704 	}
1705 
1706 	/*
1707 	 * If we have just single event and are sending data
1708 	 * through pipe, we need to force the ids allocation,
1709 	 * because we synthesize event name through the pipe
1710 	 * and need the id for that.
1711 	 */
1712 	if (data->is_pipe && rec->evlist->core.nr_entries == 1)
1713 		rec->opts.sample_id = true;
1714 
1715 	record__uniquify_name(rec);
1716 
1717 	if (record__open(rec) != 0) {
1718 		err = -1;
1719 		goto out_child;
1720 	}
1721 	session->header.env.comp_mmap_len = session->evlist->core.mmap_len;
1722 
1723 	if (rec->opts.kcore) {
1724 		err = record__kcore_copy(&session->machines.host, data);
1725 		if (err) {
1726 			pr_err("ERROR: Failed to copy kcore\n");
1727 			goto out_child;
1728 		}
1729 	}
1730 
1731 	err = bpf__apply_obj_config();
1732 	if (err) {
1733 		char errbuf[BUFSIZ];
1734 
1735 		bpf__strerror_apply_obj_config(err, errbuf, sizeof(errbuf));
1736 		pr_err("ERROR: Apply config to BPF failed: %s\n",
1737 			 errbuf);
1738 		goto out_child;
1739 	}
1740 
1741 	/*
1742 	 * Normally perf_session__new would do this, but it doesn't have the
1743 	 * evlist.
1744 	 */
1745 	if (rec->tool.ordered_events && !evlist__sample_id_all(rec->evlist)) {
1746 		pr_warning("WARNING: No sample_id_all support, falling back to unordered processing\n");
1747 		rec->tool.ordered_events = false;
1748 	}
1749 
1750 	if (!rec->evlist->core.nr_groups)
1751 		perf_header__clear_feat(&session->header, HEADER_GROUP_DESC);
1752 
1753 	if (data->is_pipe) {
1754 		err = perf_header__write_pipe(fd);
1755 		if (err < 0)
1756 			goto out_child;
1757 	} else {
1758 		err = perf_session__write_header(session, rec->evlist, fd, false);
1759 		if (err < 0)
1760 			goto out_child;
1761 	}
1762 
1763 	err = -1;
1764 	if (!rec->no_buildid
1765 	    && !perf_header__has_feat(&session->header, HEADER_BUILD_ID)) {
1766 		pr_err("Couldn't generate buildids. "
1767 		       "Use --no-buildid to profile anyway.\n");
1768 		goto out_child;
1769 	}
1770 
1771 	err = record__setup_sb_evlist(rec);
1772 	if (err)
1773 		goto out_child;
1774 
1775 	err = record__synthesize(rec, false);
1776 	if (err < 0)
1777 		goto out_child;
1778 
1779 	if (rec->realtime_prio) {
1780 		struct sched_param param;
1781 
1782 		param.sched_priority = rec->realtime_prio;
1783 		if (sched_setscheduler(0, SCHED_FIFO, &param)) {
1784 			pr_err("Could not set realtime priority.\n");
1785 			err = -1;
1786 			goto out_child;
1787 		}
1788 	}
1789 
1790 	/*
1791 	 * When perf is starting the traced process, all the events
1792 	 * (apart from group members) have enable_on_exec=1 set,
1793 	 * so don't spoil it by prematurely enabling them.
1794 	 */
1795 	if (!target__none(&opts->target) && !opts->initial_delay)
1796 		evlist__enable(rec->evlist);
1797 
1798 	/*
1799 	 * Let the child rip
1800 	 */
1801 	if (forks) {
1802 		struct machine *machine = &session->machines.host;
1803 		union perf_event *event;
1804 		pid_t tgid;
1805 
1806 		event = malloc(sizeof(event->comm) + machine->id_hdr_size);
1807 		if (event == NULL) {
1808 			err = -ENOMEM;
1809 			goto out_child;
1810 		}
1811 
1812 		/*
1813 		 * Some H/W events are generated before COMM event
1814 		 * which is emitted during exec(), so perf script
1815 		 * cannot see a correct process name for those events.
1816 		 * Synthesize COMM event to prevent it.
1817 		 */
1818 		tgid = perf_event__synthesize_comm(tool, event,
1819 						   rec->evlist->workload.pid,
1820 						   process_synthesized_event,
1821 						   machine);
1822 		free(event);
1823 
1824 		if (tgid == -1)
1825 			goto out_child;
1826 
1827 		event = malloc(sizeof(event->namespaces) +
1828 			       (NR_NAMESPACES * sizeof(struct perf_ns_link_info)) +
1829 			       machine->id_hdr_size);
1830 		if (event == NULL) {
1831 			err = -ENOMEM;
1832 			goto out_child;
1833 		}
1834 
1835 		/*
1836 		 * Synthesize NAMESPACES event for the command specified.
1837 		 */
1838 		perf_event__synthesize_namespaces(tool, event,
1839 						  rec->evlist->workload.pid,
1840 						  tgid, process_synthesized_event,
1841 						  machine);
1842 		free(event);
1843 
1844 		evlist__start_workload(rec->evlist);
1845 	}
1846 
1847 	if (evlist__initialize_ctlfd(rec->evlist, opts->ctl_fd, opts->ctl_fd_ack))
1848 		goto out_child;
1849 
1850 	if (opts->initial_delay) {
1851 		pr_info(EVLIST_DISABLED_MSG);
1852 		if (opts->initial_delay > 0) {
1853 			usleep(opts->initial_delay * USEC_PER_MSEC);
1854 			evlist__enable(rec->evlist);
1855 			pr_info(EVLIST_ENABLED_MSG);
1856 		}
1857 	}
1858 
1859 	trigger_ready(&auxtrace_snapshot_trigger);
1860 	trigger_ready(&switch_output_trigger);
1861 	perf_hooks__invoke_record_start();
1862 	for (;;) {
1863 		unsigned long long hits = rec->samples;
1864 
1865 		/*
1866 		 * rec->evlist->bkw_mmap_state is possible to be
1867 		 * BKW_MMAP_EMPTY here: when done == true and
1868 		 * hits != rec->samples in previous round.
1869 		 *
1870 		 * evlist__toggle_bkw_mmap ensure we never
1871 		 * convert BKW_MMAP_EMPTY to BKW_MMAP_DATA_PENDING.
1872 		 */
1873 		if (trigger_is_hit(&switch_output_trigger) || done || draining)
1874 			evlist__toggle_bkw_mmap(rec->evlist, BKW_MMAP_DATA_PENDING);
1875 
1876 		if (record__mmap_read_all(rec, false) < 0) {
1877 			trigger_error(&auxtrace_snapshot_trigger);
1878 			trigger_error(&switch_output_trigger);
1879 			err = -1;
1880 			goto out_child;
1881 		}
1882 
1883 		if (auxtrace_record__snapshot_started) {
1884 			auxtrace_record__snapshot_started = 0;
1885 			if (!trigger_is_error(&auxtrace_snapshot_trigger))
1886 				record__read_auxtrace_snapshot(rec, false);
1887 			if (trigger_is_error(&auxtrace_snapshot_trigger)) {
1888 				pr_err("AUX area tracing snapshot failed\n");
1889 				err = -1;
1890 				goto out_child;
1891 			}
1892 		}
1893 
1894 		if (trigger_is_hit(&switch_output_trigger)) {
1895 			/*
1896 			 * If switch_output_trigger is hit, the data in
1897 			 * overwritable ring buffer should have been collected,
1898 			 * so bkw_mmap_state should be set to BKW_MMAP_EMPTY.
1899 			 *
1900 			 * If SIGUSR2 raise after or during record__mmap_read_all(),
1901 			 * record__mmap_read_all() didn't collect data from
1902 			 * overwritable ring buffer. Read again.
1903 			 */
1904 			if (rec->evlist->bkw_mmap_state == BKW_MMAP_RUNNING)
1905 				continue;
1906 			trigger_ready(&switch_output_trigger);
1907 
1908 			/*
1909 			 * Reenable events in overwrite ring buffer after
1910 			 * record__mmap_read_all(): we should have collected
1911 			 * data from it.
1912 			 */
1913 			evlist__toggle_bkw_mmap(rec->evlist, BKW_MMAP_RUNNING);
1914 
1915 			if (!quiet)
1916 				fprintf(stderr, "[ perf record: dump data: Woken up %ld times ]\n",
1917 					waking);
1918 			waking = 0;
1919 			fd = record__switch_output(rec, false);
1920 			if (fd < 0) {
1921 				pr_err("Failed to switch to new file\n");
1922 				trigger_error(&switch_output_trigger);
1923 				err = fd;
1924 				goto out_child;
1925 			}
1926 
1927 			/* re-arm the alarm */
1928 			if (rec->switch_output.time)
1929 				alarm(rec->switch_output.time);
1930 		}
1931 
1932 		if (hits == rec->samples) {
1933 			if (done || draining)
1934 				break;
1935 			err = evlist__poll(rec->evlist, -1);
1936 			/*
1937 			 * Propagate error, only if there's any. Ignore positive
1938 			 * number of returned events and interrupt error.
1939 			 */
1940 			if (err > 0 || (err < 0 && errno == EINTR))
1941 				err = 0;
1942 			waking++;
1943 
1944 			if (evlist__filter_pollfd(rec->evlist, POLLERR | POLLHUP) == 0)
1945 				draining = true;
1946 		}
1947 
1948 		if (evlist__ctlfd_process(rec->evlist, &cmd) > 0) {
1949 			switch (cmd) {
1950 			case EVLIST_CTL_CMD_SNAPSHOT:
1951 				hit_auxtrace_snapshot_trigger(rec);
1952 				evlist__ctlfd_ack(rec->evlist);
1953 				break;
1954 			case EVLIST_CTL_CMD_STOP:
1955 				done = 1;
1956 				break;
1957 			case EVLIST_CTL_CMD_ACK:
1958 			case EVLIST_CTL_CMD_UNSUPPORTED:
1959 			case EVLIST_CTL_CMD_ENABLE:
1960 			case EVLIST_CTL_CMD_DISABLE:
1961 			case EVLIST_CTL_CMD_EVLIST:
1962 			case EVLIST_CTL_CMD_PING:
1963 			default:
1964 				break;
1965 			}
1966 		}
1967 
1968 		/*
1969 		 * When perf is starting the traced process, at the end events
1970 		 * die with the process and we wait for that. Thus no need to
1971 		 * disable events in this case.
1972 		 */
1973 		if (done && !disabled && !target__none(&opts->target)) {
1974 			trigger_off(&auxtrace_snapshot_trigger);
1975 			evlist__disable(rec->evlist);
1976 			disabled = true;
1977 		}
1978 	}
1979 
1980 	trigger_off(&auxtrace_snapshot_trigger);
1981 	trigger_off(&switch_output_trigger);
1982 
1983 	if (opts->auxtrace_snapshot_on_exit)
1984 		record__auxtrace_snapshot_exit(rec);
1985 
1986 	if (forks && workload_exec_errno) {
1987 		char msg[STRERR_BUFSIZE], strevsels[2048];
1988 		const char *emsg = str_error_r(workload_exec_errno, msg, sizeof(msg));
1989 
1990 		evlist__scnprintf_evsels(rec->evlist, sizeof(strevsels), strevsels);
1991 
1992 		pr_err("Failed to collect '%s' for the '%s' workload: %s\n",
1993 			strevsels, argv[0], emsg);
1994 		err = -1;
1995 		goto out_child;
1996 	}
1997 
1998 	if (!quiet)
1999 		fprintf(stderr, "[ perf record: Woken up %ld times to write data ]\n", waking);
2000 
2001 	if (target__none(&rec->opts.target))
2002 		record__synthesize_workload(rec, true);
2003 
2004 out_child:
2005 	evlist__finalize_ctlfd(rec->evlist);
2006 	record__mmap_read_all(rec, true);
2007 	record__aio_mmap_read_sync(rec);
2008 
2009 	if (rec->session->bytes_transferred && rec->session->bytes_compressed) {
2010 		ratio = (float)rec->session->bytes_transferred/(float)rec->session->bytes_compressed;
2011 		session->header.env.comp_ratio = ratio + 0.5;
2012 	}
2013 
2014 	if (forks) {
2015 		int exit_status;
2016 
2017 		if (!child_finished)
2018 			kill(rec->evlist->workload.pid, SIGTERM);
2019 
2020 		wait(&exit_status);
2021 
2022 		if (err < 0)
2023 			status = err;
2024 		else if (WIFEXITED(exit_status))
2025 			status = WEXITSTATUS(exit_status);
2026 		else if (WIFSIGNALED(exit_status))
2027 			signr = WTERMSIG(exit_status);
2028 	} else
2029 		status = err;
2030 
2031 	record__synthesize(rec, true);
2032 	/* this will be recalculated during process_buildids() */
2033 	rec->samples = 0;
2034 
2035 	if (!err) {
2036 		if (!rec->timestamp_filename) {
2037 			record__finish_output(rec);
2038 		} else {
2039 			fd = record__switch_output(rec, true);
2040 			if (fd < 0) {
2041 				status = fd;
2042 				goto out_delete_session;
2043 			}
2044 		}
2045 	}
2046 
2047 	perf_hooks__invoke_record_end();
2048 
2049 	if (!err && !quiet) {
2050 		char samples[128];
2051 		const char *postfix = rec->timestamp_filename ?
2052 					".<timestamp>" : "";
2053 
2054 		if (rec->samples && !rec->opts.full_auxtrace)
2055 			scnprintf(samples, sizeof(samples),
2056 				  " (%" PRIu64 " samples)", rec->samples);
2057 		else
2058 			samples[0] = '\0';
2059 
2060 		fprintf(stderr,	"[ perf record: Captured and wrote %.3f MB %s%s%s",
2061 			perf_data__size(data) / 1024.0 / 1024.0,
2062 			data->path, postfix, samples);
2063 		if (ratio) {
2064 			fprintf(stderr,	", compressed (original %.3f MB, ratio is %.3f)",
2065 					rec->session->bytes_transferred / 1024.0 / 1024.0,
2066 					ratio);
2067 		}
2068 		fprintf(stderr, " ]\n");
2069 	}
2070 
2071 out_delete_session:
2072 #ifdef HAVE_EVENTFD_SUPPORT
2073 	if (done_fd >= 0)
2074 		close(done_fd);
2075 #endif
2076 	zstd_fini(&session->zstd_data);
2077 	perf_session__delete(session);
2078 
2079 	if (!opts->no_bpf_event)
2080 		evlist__stop_sb_thread(rec->sb_evlist);
2081 	return status;
2082 }
2083 
2084 static void callchain_debug(struct callchain_param *callchain)
2085 {
2086 	static const char *str[CALLCHAIN_MAX] = { "NONE", "FP", "DWARF", "LBR" };
2087 
2088 	pr_debug("callchain: type %s\n", str[callchain->record_mode]);
2089 
2090 	if (callchain->record_mode == CALLCHAIN_DWARF)
2091 		pr_debug("callchain: stack dump size %d\n",
2092 			 callchain->dump_size);
2093 }
2094 
2095 int record_opts__parse_callchain(struct record_opts *record,
2096 				 struct callchain_param *callchain,
2097 				 const char *arg, bool unset)
2098 {
2099 	int ret;
2100 	callchain->enabled = !unset;
2101 
2102 	/* --no-call-graph */
2103 	if (unset) {
2104 		callchain->record_mode = CALLCHAIN_NONE;
2105 		pr_debug("callchain: disabled\n");
2106 		return 0;
2107 	}
2108 
2109 	ret = parse_callchain_record_opt(arg, callchain);
2110 	if (!ret) {
2111 		/* Enable data address sampling for DWARF unwind. */
2112 		if (callchain->record_mode == CALLCHAIN_DWARF)
2113 			record->sample_address = true;
2114 		callchain_debug(callchain);
2115 	}
2116 
2117 	return ret;
2118 }
2119 
2120 int record_parse_callchain_opt(const struct option *opt,
2121 			       const char *arg,
2122 			       int unset)
2123 {
2124 	return record_opts__parse_callchain(opt->value, &callchain_param, arg, unset);
2125 }
2126 
2127 int record_callchain_opt(const struct option *opt,
2128 			 const char *arg __maybe_unused,
2129 			 int unset __maybe_unused)
2130 {
2131 	struct callchain_param *callchain = opt->value;
2132 
2133 	callchain->enabled = true;
2134 
2135 	if (callchain->record_mode == CALLCHAIN_NONE)
2136 		callchain->record_mode = CALLCHAIN_FP;
2137 
2138 	callchain_debug(callchain);
2139 	return 0;
2140 }
2141 
2142 static int perf_record_config(const char *var, const char *value, void *cb)
2143 {
2144 	struct record *rec = cb;
2145 
2146 	if (!strcmp(var, "record.build-id")) {
2147 		if (!strcmp(value, "cache"))
2148 			rec->no_buildid_cache = false;
2149 		else if (!strcmp(value, "no-cache"))
2150 			rec->no_buildid_cache = true;
2151 		else if (!strcmp(value, "skip"))
2152 			rec->no_buildid = true;
2153 		else if (!strcmp(value, "mmap"))
2154 			rec->buildid_mmap = true;
2155 		else
2156 			return -1;
2157 		return 0;
2158 	}
2159 	if (!strcmp(var, "record.call-graph")) {
2160 		var = "call-graph.record-mode";
2161 		return perf_default_config(var, value, cb);
2162 	}
2163 #ifdef HAVE_AIO_SUPPORT
2164 	if (!strcmp(var, "record.aio")) {
2165 		rec->opts.nr_cblocks = strtol(value, NULL, 0);
2166 		if (!rec->opts.nr_cblocks)
2167 			rec->opts.nr_cblocks = nr_cblocks_default;
2168 	}
2169 #endif
2170 
2171 	return 0;
2172 }
2173 
2174 
2175 static int record__parse_affinity(const struct option *opt, const char *str, int unset)
2176 {
2177 	struct record_opts *opts = (struct record_opts *)opt->value;
2178 
2179 	if (unset || !str)
2180 		return 0;
2181 
2182 	if (!strcasecmp(str, "node"))
2183 		opts->affinity = PERF_AFFINITY_NODE;
2184 	else if (!strcasecmp(str, "cpu"))
2185 		opts->affinity = PERF_AFFINITY_CPU;
2186 
2187 	return 0;
2188 }
2189 
2190 static int parse_output_max_size(const struct option *opt,
2191 				 const char *str, int unset)
2192 {
2193 	unsigned long *s = (unsigned long *)opt->value;
2194 	static struct parse_tag tags_size[] = {
2195 		{ .tag  = 'B', .mult = 1       },
2196 		{ .tag  = 'K', .mult = 1 << 10 },
2197 		{ .tag  = 'M', .mult = 1 << 20 },
2198 		{ .tag  = 'G', .mult = 1 << 30 },
2199 		{ .tag  = 0 },
2200 	};
2201 	unsigned long val;
2202 
2203 	if (unset) {
2204 		*s = 0;
2205 		return 0;
2206 	}
2207 
2208 	val = parse_tag_value(str, tags_size);
2209 	if (val != (unsigned long) -1) {
2210 		*s = val;
2211 		return 0;
2212 	}
2213 
2214 	return -1;
2215 }
2216 
2217 static int record__parse_mmap_pages(const struct option *opt,
2218 				    const char *str,
2219 				    int unset __maybe_unused)
2220 {
2221 	struct record_opts *opts = opt->value;
2222 	char *s, *p;
2223 	unsigned int mmap_pages;
2224 	int ret;
2225 
2226 	if (!str)
2227 		return -EINVAL;
2228 
2229 	s = strdup(str);
2230 	if (!s)
2231 		return -ENOMEM;
2232 
2233 	p = strchr(s, ',');
2234 	if (p)
2235 		*p = '\0';
2236 
2237 	if (*s) {
2238 		ret = __evlist__parse_mmap_pages(&mmap_pages, s);
2239 		if (ret)
2240 			goto out_free;
2241 		opts->mmap_pages = mmap_pages;
2242 	}
2243 
2244 	if (!p) {
2245 		ret = 0;
2246 		goto out_free;
2247 	}
2248 
2249 	ret = __evlist__parse_mmap_pages(&mmap_pages, p + 1);
2250 	if (ret)
2251 		goto out_free;
2252 
2253 	opts->auxtrace_mmap_pages = mmap_pages;
2254 
2255 out_free:
2256 	free(s);
2257 	return ret;
2258 }
2259 
2260 static int parse_control_option(const struct option *opt,
2261 				const char *str,
2262 				int unset __maybe_unused)
2263 {
2264 	struct record_opts *opts = opt->value;
2265 
2266 	return evlist__parse_control(str, &opts->ctl_fd, &opts->ctl_fd_ack, &opts->ctl_fd_close);
2267 }
2268 
2269 static void switch_output_size_warn(struct record *rec)
2270 {
2271 	u64 wakeup_size = evlist__mmap_size(rec->opts.mmap_pages);
2272 	struct switch_output *s = &rec->switch_output;
2273 
2274 	wakeup_size /= 2;
2275 
2276 	if (s->size < wakeup_size) {
2277 		char buf[100];
2278 
2279 		unit_number__scnprintf(buf, sizeof(buf), wakeup_size);
2280 		pr_warning("WARNING: switch-output data size lower than "
2281 			   "wakeup kernel buffer size (%s) "
2282 			   "expect bigger perf.data sizes\n", buf);
2283 	}
2284 }
2285 
2286 static int switch_output_setup(struct record *rec)
2287 {
2288 	struct switch_output *s = &rec->switch_output;
2289 	static struct parse_tag tags_size[] = {
2290 		{ .tag  = 'B', .mult = 1       },
2291 		{ .tag  = 'K', .mult = 1 << 10 },
2292 		{ .tag  = 'M', .mult = 1 << 20 },
2293 		{ .tag  = 'G', .mult = 1 << 30 },
2294 		{ .tag  = 0 },
2295 	};
2296 	static struct parse_tag tags_time[] = {
2297 		{ .tag  = 's', .mult = 1        },
2298 		{ .tag  = 'm', .mult = 60       },
2299 		{ .tag  = 'h', .mult = 60*60    },
2300 		{ .tag  = 'd', .mult = 60*60*24 },
2301 		{ .tag  = 0 },
2302 	};
2303 	unsigned long val;
2304 
2305 	/*
2306 	 * If we're using --switch-output-events, then we imply its
2307 	 * --switch-output=signal, as we'll send a SIGUSR2 from the side band
2308 	 *  thread to its parent.
2309 	 */
2310 	if (rec->switch_output_event_set)
2311 		goto do_signal;
2312 
2313 	if (!s->set)
2314 		return 0;
2315 
2316 	if (!strcmp(s->str, "signal")) {
2317 do_signal:
2318 		s->signal = true;
2319 		pr_debug("switch-output with SIGUSR2 signal\n");
2320 		goto enabled;
2321 	}
2322 
2323 	val = parse_tag_value(s->str, tags_size);
2324 	if (val != (unsigned long) -1) {
2325 		s->size = val;
2326 		pr_debug("switch-output with %s size threshold\n", s->str);
2327 		goto enabled;
2328 	}
2329 
2330 	val = parse_tag_value(s->str, tags_time);
2331 	if (val != (unsigned long) -1) {
2332 		s->time = val;
2333 		pr_debug("switch-output with %s time threshold (%lu seconds)\n",
2334 			 s->str, s->time);
2335 		goto enabled;
2336 	}
2337 
2338 	return -1;
2339 
2340 enabled:
2341 	rec->timestamp_filename = true;
2342 	s->enabled              = true;
2343 
2344 	if (s->size && !rec->opts.no_buffering)
2345 		switch_output_size_warn(rec);
2346 
2347 	return 0;
2348 }
2349 
2350 static const char * const __record_usage[] = {
2351 	"perf record [<options>] [<command>]",
2352 	"perf record [<options>] -- <command> [<options>]",
2353 	NULL
2354 };
2355 const char * const *record_usage = __record_usage;
2356 
2357 static int build_id__process_mmap(struct perf_tool *tool, union perf_event *event,
2358 				  struct perf_sample *sample, struct machine *machine)
2359 {
2360 	/*
2361 	 * We already have the kernel maps, put in place via perf_session__create_kernel_maps()
2362 	 * no need to add them twice.
2363 	 */
2364 	if (!(event->header.misc & PERF_RECORD_MISC_USER))
2365 		return 0;
2366 	return perf_event__process_mmap(tool, event, sample, machine);
2367 }
2368 
2369 static int build_id__process_mmap2(struct perf_tool *tool, union perf_event *event,
2370 				   struct perf_sample *sample, struct machine *machine)
2371 {
2372 	/*
2373 	 * We already have the kernel maps, put in place via perf_session__create_kernel_maps()
2374 	 * no need to add them twice.
2375 	 */
2376 	if (!(event->header.misc & PERF_RECORD_MISC_USER))
2377 		return 0;
2378 
2379 	return perf_event__process_mmap2(tool, event, sample, machine);
2380 }
2381 
2382 static int process_timestamp_boundary(struct perf_tool *tool,
2383 				      union perf_event *event __maybe_unused,
2384 				      struct perf_sample *sample,
2385 				      struct machine *machine __maybe_unused)
2386 {
2387 	struct record *rec = container_of(tool, struct record, tool);
2388 
2389 	set_timestamp_boundary(rec, sample->time);
2390 	return 0;
2391 }
2392 
2393 /*
2394  * XXX Ideally would be local to cmd_record() and passed to a record__new
2395  * because we need to have access to it in record__exit, that is called
2396  * after cmd_record() exits, but since record_options need to be accessible to
2397  * builtin-script, leave it here.
2398  *
2399  * At least we don't ouch it in all the other functions here directly.
2400  *
2401  * Just say no to tons of global variables, sigh.
2402  */
2403 static struct record record = {
2404 	.opts = {
2405 		.sample_time	     = true,
2406 		.mmap_pages	     = UINT_MAX,
2407 		.user_freq	     = UINT_MAX,
2408 		.user_interval	     = ULLONG_MAX,
2409 		.freq		     = 4000,
2410 		.target		     = {
2411 			.uses_mmap   = true,
2412 			.default_per_cpu = true,
2413 		},
2414 		.mmap_flush          = MMAP_FLUSH_DEFAULT,
2415 		.nr_threads_synthesize = 1,
2416 		.ctl_fd              = -1,
2417 		.ctl_fd_ack          = -1,
2418 	},
2419 	.tool = {
2420 		.sample		= process_sample_event,
2421 		.fork		= perf_event__process_fork,
2422 		.exit		= perf_event__process_exit,
2423 		.comm		= perf_event__process_comm,
2424 		.namespaces	= perf_event__process_namespaces,
2425 		.mmap		= build_id__process_mmap,
2426 		.mmap2		= build_id__process_mmap2,
2427 		.itrace_start	= process_timestamp_boundary,
2428 		.aux		= process_timestamp_boundary,
2429 		.ordered_events	= true,
2430 	},
2431 };
2432 
2433 const char record_callchain_help[] = CALLCHAIN_RECORD_HELP
2434 	"\n\t\t\t\tDefault: fp";
2435 
2436 static bool dry_run;
2437 
2438 /*
2439  * XXX Will stay a global variable till we fix builtin-script.c to stop messing
2440  * with it and switch to use the library functions in perf_evlist that came
2441  * from builtin-record.c, i.e. use record_opts,
2442  * evlist__prepare_workload, etc instead of fork+exec'in 'perf record',
2443  * using pipes, etc.
2444  */
2445 static struct option __record_options[] = {
2446 	OPT_CALLBACK('e', "event", &record.evlist, "event",
2447 		     "event selector. use 'perf list' to list available events",
2448 		     parse_events_option),
2449 	OPT_CALLBACK(0, "filter", &record.evlist, "filter",
2450 		     "event filter", parse_filter),
2451 	OPT_CALLBACK_NOOPT(0, "exclude-perf", &record.evlist,
2452 			   NULL, "don't record events from perf itself",
2453 			   exclude_perf),
2454 	OPT_STRING('p', "pid", &record.opts.target.pid, "pid",
2455 		    "record events on existing process id"),
2456 	OPT_STRING('t', "tid", &record.opts.target.tid, "tid",
2457 		    "record events on existing thread id"),
2458 	OPT_INTEGER('r', "realtime", &record.realtime_prio,
2459 		    "collect data with this RT SCHED_FIFO priority"),
2460 	OPT_BOOLEAN(0, "no-buffering", &record.opts.no_buffering,
2461 		    "collect data without buffering"),
2462 	OPT_BOOLEAN('R', "raw-samples", &record.opts.raw_samples,
2463 		    "collect raw sample records from all opened counters"),
2464 	OPT_BOOLEAN('a', "all-cpus", &record.opts.target.system_wide,
2465 			    "system-wide collection from all CPUs"),
2466 	OPT_STRING('C', "cpu", &record.opts.target.cpu_list, "cpu",
2467 		    "list of cpus to monitor"),
2468 	OPT_U64('c', "count", &record.opts.user_interval, "event period to sample"),
2469 	OPT_STRING('o', "output", &record.data.path, "file",
2470 		    "output file name"),
2471 	OPT_BOOLEAN_SET('i', "no-inherit", &record.opts.no_inherit,
2472 			&record.opts.no_inherit_set,
2473 			"child tasks do not inherit counters"),
2474 	OPT_BOOLEAN(0, "tail-synthesize", &record.opts.tail_synthesize,
2475 		    "synthesize non-sample events at the end of output"),
2476 	OPT_BOOLEAN(0, "overwrite", &record.opts.overwrite, "use overwrite mode"),
2477 	OPT_BOOLEAN(0, "no-bpf-event", &record.opts.no_bpf_event, "do not record bpf events"),
2478 	OPT_BOOLEAN(0, "strict-freq", &record.opts.strict_freq,
2479 		    "Fail if the specified frequency can't be used"),
2480 	OPT_CALLBACK('F', "freq", &record.opts, "freq or 'max'",
2481 		     "profile at this frequency",
2482 		      record__parse_freq),
2483 	OPT_CALLBACK('m', "mmap-pages", &record.opts, "pages[,pages]",
2484 		     "number of mmap data pages and AUX area tracing mmap pages",
2485 		     record__parse_mmap_pages),
2486 	OPT_CALLBACK(0, "mmap-flush", &record.opts, "number",
2487 		     "Minimal number of bytes that is extracted from mmap data pages (default: 1)",
2488 		     record__mmap_flush_parse),
2489 	OPT_BOOLEAN(0, "group", &record.opts.group,
2490 		    "put the counters into a counter group"),
2491 	OPT_CALLBACK_NOOPT('g', NULL, &callchain_param,
2492 			   NULL, "enables call-graph recording" ,
2493 			   &record_callchain_opt),
2494 	OPT_CALLBACK(0, "call-graph", &record.opts,
2495 		     "record_mode[,record_size]", record_callchain_help,
2496 		     &record_parse_callchain_opt),
2497 	OPT_INCR('v', "verbose", &verbose,
2498 		    "be more verbose (show counter open errors, etc)"),
2499 	OPT_BOOLEAN('q', "quiet", &quiet, "don't print any message"),
2500 	OPT_BOOLEAN('s', "stat", &record.opts.inherit_stat,
2501 		    "per thread counts"),
2502 	OPT_BOOLEAN('d', "data", &record.opts.sample_address, "Record the sample addresses"),
2503 	OPT_BOOLEAN(0, "phys-data", &record.opts.sample_phys_addr,
2504 		    "Record the sample physical addresses"),
2505 	OPT_BOOLEAN(0, "data-page-size", &record.opts.sample_data_page_size,
2506 		    "Record the sampled data address data page size"),
2507 	OPT_BOOLEAN(0, "code-page-size", &record.opts.sample_code_page_size,
2508 		    "Record the sampled code address (ip) page size"),
2509 	OPT_BOOLEAN(0, "sample-cpu", &record.opts.sample_cpu, "Record the sample cpu"),
2510 	OPT_BOOLEAN_SET('T', "timestamp", &record.opts.sample_time,
2511 			&record.opts.sample_time_set,
2512 			"Record the sample timestamps"),
2513 	OPT_BOOLEAN_SET('P', "period", &record.opts.period, &record.opts.period_set,
2514 			"Record the sample period"),
2515 	OPT_BOOLEAN('n', "no-samples", &record.opts.no_samples,
2516 		    "don't sample"),
2517 	OPT_BOOLEAN_SET('N', "no-buildid-cache", &record.no_buildid_cache,
2518 			&record.no_buildid_cache_set,
2519 			"do not update the buildid cache"),
2520 	OPT_BOOLEAN_SET('B', "no-buildid", &record.no_buildid,
2521 			&record.no_buildid_set,
2522 			"do not collect buildids in perf.data"),
2523 	OPT_CALLBACK('G', "cgroup", &record.evlist, "name",
2524 		     "monitor event in cgroup name only",
2525 		     parse_cgroups),
2526 	OPT_INTEGER('D', "delay", &record.opts.initial_delay,
2527 		  "ms to wait before starting measurement after program start (-1: start with events disabled)"),
2528 	OPT_BOOLEAN(0, "kcore", &record.opts.kcore, "copy /proc/kcore"),
2529 	OPT_STRING('u', "uid", &record.opts.target.uid_str, "user",
2530 		   "user to profile"),
2531 
2532 	OPT_CALLBACK_NOOPT('b', "branch-any", &record.opts.branch_stack,
2533 		     "branch any", "sample any taken branches",
2534 		     parse_branch_stack),
2535 
2536 	OPT_CALLBACK('j', "branch-filter", &record.opts.branch_stack,
2537 		     "branch filter mask", "branch stack filter modes",
2538 		     parse_branch_stack),
2539 	OPT_BOOLEAN('W', "weight", &record.opts.sample_weight,
2540 		    "sample by weight (on special events only)"),
2541 	OPT_BOOLEAN(0, "transaction", &record.opts.sample_transaction,
2542 		    "sample transaction flags (special events only)"),
2543 	OPT_BOOLEAN(0, "per-thread", &record.opts.target.per_thread,
2544 		    "use per-thread mmaps"),
2545 	OPT_CALLBACK_OPTARG('I', "intr-regs", &record.opts.sample_intr_regs, NULL, "any register",
2546 		    "sample selected machine registers on interrupt,"
2547 		    " use '-I?' to list register names", parse_intr_regs),
2548 	OPT_CALLBACK_OPTARG(0, "user-regs", &record.opts.sample_user_regs, NULL, "any register",
2549 		    "sample selected machine registers on interrupt,"
2550 		    " use '--user-regs=?' to list register names", parse_user_regs),
2551 	OPT_BOOLEAN(0, "running-time", &record.opts.running_time,
2552 		    "Record running/enabled time of read (:S) events"),
2553 	OPT_CALLBACK('k', "clockid", &record.opts,
2554 	"clockid", "clockid to use for events, see clock_gettime()",
2555 	parse_clockid),
2556 	OPT_STRING_OPTARG('S', "snapshot", &record.opts.auxtrace_snapshot_opts,
2557 			  "opts", "AUX area tracing Snapshot Mode", ""),
2558 	OPT_STRING_OPTARG(0, "aux-sample", &record.opts.auxtrace_sample_opts,
2559 			  "opts", "sample AUX area", ""),
2560 	OPT_UINTEGER(0, "proc-map-timeout", &proc_map_timeout,
2561 			"per thread proc mmap processing timeout in ms"),
2562 	OPT_BOOLEAN(0, "namespaces", &record.opts.record_namespaces,
2563 		    "Record namespaces events"),
2564 	OPT_BOOLEAN(0, "all-cgroups", &record.opts.record_cgroup,
2565 		    "Record cgroup events"),
2566 	OPT_BOOLEAN_SET(0, "switch-events", &record.opts.record_switch_events,
2567 			&record.opts.record_switch_events_set,
2568 			"Record context switch events"),
2569 	OPT_BOOLEAN_FLAG(0, "all-kernel", &record.opts.all_kernel,
2570 			 "Configure all used events to run in kernel space.",
2571 			 PARSE_OPT_EXCLUSIVE),
2572 	OPT_BOOLEAN_FLAG(0, "all-user", &record.opts.all_user,
2573 			 "Configure all used events to run in user space.",
2574 			 PARSE_OPT_EXCLUSIVE),
2575 	OPT_BOOLEAN(0, "kernel-callchains", &record.opts.kernel_callchains,
2576 		    "collect kernel callchains"),
2577 	OPT_BOOLEAN(0, "user-callchains", &record.opts.user_callchains,
2578 		    "collect user callchains"),
2579 	OPT_STRING(0, "clang-path", &llvm_param.clang_path, "clang path",
2580 		   "clang binary to use for compiling BPF scriptlets"),
2581 	OPT_STRING(0, "clang-opt", &llvm_param.clang_opt, "clang options",
2582 		   "options passed to clang when compiling BPF scriptlets"),
2583 	OPT_STRING(0, "vmlinux", &symbol_conf.vmlinux_name,
2584 		   "file", "vmlinux pathname"),
2585 	OPT_BOOLEAN(0, "buildid-all", &record.buildid_all,
2586 		    "Record build-id of all DSOs regardless of hits"),
2587 	OPT_BOOLEAN(0, "buildid-mmap", &record.buildid_mmap,
2588 		    "Record build-id in map events"),
2589 	OPT_BOOLEAN(0, "timestamp-filename", &record.timestamp_filename,
2590 		    "append timestamp to output filename"),
2591 	OPT_BOOLEAN(0, "timestamp-boundary", &record.timestamp_boundary,
2592 		    "Record timestamp boundary (time of first/last samples)"),
2593 	OPT_STRING_OPTARG_SET(0, "switch-output", &record.switch_output.str,
2594 			  &record.switch_output.set, "signal or size[BKMG] or time[smhd]",
2595 			  "Switch output when receiving SIGUSR2 (signal) or cross a size or time threshold",
2596 			  "signal"),
2597 	OPT_CALLBACK_SET(0, "switch-output-event", &record.sb_evlist, &record.switch_output_event_set, "switch output event",
2598 			 "switch output event selector. use 'perf list' to list available events",
2599 			 parse_events_option_new_evlist),
2600 	OPT_INTEGER(0, "switch-max-files", &record.switch_output.num_files,
2601 		   "Limit number of switch output generated files"),
2602 	OPT_BOOLEAN(0, "dry-run", &dry_run,
2603 		    "Parse options then exit"),
2604 #ifdef HAVE_AIO_SUPPORT
2605 	OPT_CALLBACK_OPTARG(0, "aio", &record.opts,
2606 		     &nr_cblocks_default, "n", "Use <n> control blocks in asynchronous trace writing mode (default: 1, max: 4)",
2607 		     record__aio_parse),
2608 #endif
2609 	OPT_CALLBACK(0, "affinity", &record.opts, "node|cpu",
2610 		     "Set affinity mask of trace reading thread to NUMA node cpu mask or cpu of processed mmap buffer",
2611 		     record__parse_affinity),
2612 #ifdef HAVE_ZSTD_SUPPORT
2613 	OPT_CALLBACK_OPTARG('z', "compression-level", &record.opts, &comp_level_default,
2614 			    "n", "Compressed records using specified level (default: 1 - fastest compression, 22 - greatest compression)",
2615 			    record__parse_comp_level),
2616 #endif
2617 	OPT_CALLBACK(0, "max-size", &record.output_max_size,
2618 		     "size", "Limit the maximum size of the output file", parse_output_max_size),
2619 	OPT_UINTEGER(0, "num-thread-synthesize",
2620 		     &record.opts.nr_threads_synthesize,
2621 		     "number of threads to run for event synthesis"),
2622 #ifdef HAVE_LIBPFM
2623 	OPT_CALLBACK(0, "pfm-events", &record.evlist, "event",
2624 		"libpfm4 event selector. use 'perf list' to list available events",
2625 		parse_libpfm_events_option),
2626 #endif
2627 	OPT_CALLBACK(0, "control", &record.opts, "fd:ctl-fd[,ack-fd] or fifo:ctl-fifo[,ack-fifo]",
2628 		     "Listen on ctl-fd descriptor for command to control measurement ('enable': enable events, 'disable': disable events,\n"
2629 		     "\t\t\t  'snapshot': AUX area tracing snapshot).\n"
2630 		     "\t\t\t  Optionally send control command completion ('ack\\n') to ack-fd descriptor.\n"
2631 		     "\t\t\t  Alternatively, ctl-fifo / ack-fifo will be opened and used as ctl-fd / ack-fd.",
2632 		      parse_control_option),
2633 	OPT_END()
2634 };
2635 
2636 struct option *record_options = __record_options;
2637 
2638 int cmd_record(int argc, const char **argv)
2639 {
2640 	int err;
2641 	struct record *rec = &record;
2642 	char errbuf[BUFSIZ];
2643 
2644 	setlocale(LC_ALL, "");
2645 
2646 #ifndef HAVE_LIBBPF_SUPPORT
2647 # define set_nobuild(s, l, c) set_option_nobuild(record_options, s, l, "NO_LIBBPF=1", c)
2648 	set_nobuild('\0', "clang-path", true);
2649 	set_nobuild('\0', "clang-opt", true);
2650 # undef set_nobuild
2651 #endif
2652 
2653 #ifndef HAVE_BPF_PROLOGUE
2654 # if !defined (HAVE_DWARF_SUPPORT)
2655 #  define REASON  "NO_DWARF=1"
2656 # elif !defined (HAVE_LIBBPF_SUPPORT)
2657 #  define REASON  "NO_LIBBPF=1"
2658 # else
2659 #  define REASON  "this architecture doesn't support BPF prologue"
2660 # endif
2661 # define set_nobuild(s, l, c) set_option_nobuild(record_options, s, l, REASON, c)
2662 	set_nobuild('\0', "vmlinux", true);
2663 # undef set_nobuild
2664 # undef REASON
2665 #endif
2666 
2667 	rec->opts.affinity = PERF_AFFINITY_SYS;
2668 
2669 	rec->evlist = evlist__new();
2670 	if (rec->evlist == NULL)
2671 		return -ENOMEM;
2672 
2673 	err = perf_config(perf_record_config, rec);
2674 	if (err)
2675 		return err;
2676 
2677 	argc = parse_options(argc, argv, record_options, record_usage,
2678 			    PARSE_OPT_STOP_AT_NON_OPTION);
2679 	if (quiet)
2680 		perf_quiet_option();
2681 
2682 	/* Make system wide (-a) the default target. */
2683 	if (!argc && target__none(&rec->opts.target))
2684 		rec->opts.target.system_wide = true;
2685 
2686 	if (nr_cgroups && !rec->opts.target.system_wide) {
2687 		usage_with_options_msg(record_usage, record_options,
2688 			"cgroup monitoring only available in system-wide mode");
2689 
2690 	}
2691 
2692 	if (rec->buildid_mmap) {
2693 		if (!perf_can_record_build_id()) {
2694 			pr_err("Failed: no support to record build id in mmap events, update your kernel.\n");
2695 			err = -EINVAL;
2696 			goto out_opts;
2697 		}
2698 		pr_debug("Enabling build id in mmap2 events.\n");
2699 		/* Enable mmap build id synthesizing. */
2700 		symbol_conf.buildid_mmap2 = true;
2701 		/* Enable perf_event_attr::build_id bit. */
2702 		rec->opts.build_id = true;
2703 		/* Disable build id cache. */
2704 		rec->no_buildid = true;
2705 	}
2706 
2707 	if (rec->opts.record_cgroup && !perf_can_record_cgroup()) {
2708 		pr_err("Kernel has no cgroup sampling support.\n");
2709 		err = -EINVAL;
2710 		goto out_opts;
2711 	}
2712 
2713 	if (rec->opts.kcore)
2714 		rec->data.is_dir = true;
2715 
2716 	if (rec->opts.comp_level != 0) {
2717 		pr_debug("Compression enabled, disabling build id collection at the end of the session.\n");
2718 		rec->no_buildid = true;
2719 	}
2720 
2721 	if (rec->opts.record_switch_events &&
2722 	    !perf_can_record_switch_events()) {
2723 		ui__error("kernel does not support recording context switch events\n");
2724 		parse_options_usage(record_usage, record_options, "switch-events", 0);
2725 		err = -EINVAL;
2726 		goto out_opts;
2727 	}
2728 
2729 	if (switch_output_setup(rec)) {
2730 		parse_options_usage(record_usage, record_options, "switch-output", 0);
2731 		err = -EINVAL;
2732 		goto out_opts;
2733 	}
2734 
2735 	if (rec->switch_output.time) {
2736 		signal(SIGALRM, alarm_sig_handler);
2737 		alarm(rec->switch_output.time);
2738 	}
2739 
2740 	if (rec->switch_output.num_files) {
2741 		rec->switch_output.filenames = calloc(sizeof(char *),
2742 						      rec->switch_output.num_files);
2743 		if (!rec->switch_output.filenames) {
2744 			err = -EINVAL;
2745 			goto out_opts;
2746 		}
2747 	}
2748 
2749 	/*
2750 	 * Allow aliases to facilitate the lookup of symbols for address
2751 	 * filters. Refer to auxtrace_parse_filters().
2752 	 */
2753 	symbol_conf.allow_aliases = true;
2754 
2755 	symbol__init(NULL);
2756 
2757 	if (rec->opts.affinity != PERF_AFFINITY_SYS) {
2758 		rec->affinity_mask.nbits = cpu__max_cpu();
2759 		rec->affinity_mask.bits = bitmap_alloc(rec->affinity_mask.nbits);
2760 		if (!rec->affinity_mask.bits) {
2761 			pr_err("Failed to allocate thread mask for %zd cpus\n", rec->affinity_mask.nbits);
2762 			err = -ENOMEM;
2763 			goto out_opts;
2764 		}
2765 		pr_debug2("thread mask[%zd]: empty\n", rec->affinity_mask.nbits);
2766 	}
2767 
2768 	err = record__auxtrace_init(rec);
2769 	if (err)
2770 		goto out;
2771 
2772 	if (dry_run)
2773 		goto out;
2774 
2775 	err = bpf__setup_stdout(rec->evlist);
2776 	if (err) {
2777 		bpf__strerror_setup_stdout(rec->evlist, err, errbuf, sizeof(errbuf));
2778 		pr_err("ERROR: Setup BPF stdout failed: %s\n",
2779 			 errbuf);
2780 		goto out;
2781 	}
2782 
2783 	err = -ENOMEM;
2784 
2785 	if (rec->no_buildid_cache || rec->no_buildid) {
2786 		disable_buildid_cache();
2787 	} else if (rec->switch_output.enabled) {
2788 		/*
2789 		 * In 'perf record --switch-output', disable buildid
2790 		 * generation by default to reduce data file switching
2791 		 * overhead. Still generate buildid if they are required
2792 		 * explicitly using
2793 		 *
2794 		 *  perf record --switch-output --no-no-buildid \
2795 		 *              --no-no-buildid-cache
2796 		 *
2797 		 * Following code equals to:
2798 		 *
2799 		 * if ((rec->no_buildid || !rec->no_buildid_set) &&
2800 		 *     (rec->no_buildid_cache || !rec->no_buildid_cache_set))
2801 		 *         disable_buildid_cache();
2802 		 */
2803 		bool disable = true;
2804 
2805 		if (rec->no_buildid_set && !rec->no_buildid)
2806 			disable = false;
2807 		if (rec->no_buildid_cache_set && !rec->no_buildid_cache)
2808 			disable = false;
2809 		if (disable) {
2810 			rec->no_buildid = true;
2811 			rec->no_buildid_cache = true;
2812 			disable_buildid_cache();
2813 		}
2814 	}
2815 
2816 	if (record.opts.overwrite)
2817 		record.opts.tail_synthesize = true;
2818 
2819 	if (rec->evlist->core.nr_entries == 0) {
2820 		if (perf_pmu__has_hybrid()) {
2821 			err = evlist__add_default_hybrid(rec->evlist,
2822 							 !record.opts.no_samples);
2823 		} else {
2824 			err = __evlist__add_default(rec->evlist,
2825 						    !record.opts.no_samples);
2826 		}
2827 
2828 		if (err < 0) {
2829 			pr_err("Not enough memory for event selector list\n");
2830 			goto out;
2831 		}
2832 	}
2833 
2834 	if (rec->opts.target.tid && !rec->opts.no_inherit_set)
2835 		rec->opts.no_inherit = true;
2836 
2837 	err = target__validate(&rec->opts.target);
2838 	if (err) {
2839 		target__strerror(&rec->opts.target, err, errbuf, BUFSIZ);
2840 		ui__warning("%s\n", errbuf);
2841 	}
2842 
2843 	err = target__parse_uid(&rec->opts.target);
2844 	if (err) {
2845 		int saved_errno = errno;
2846 
2847 		target__strerror(&rec->opts.target, err, errbuf, BUFSIZ);
2848 		ui__error("%s", errbuf);
2849 
2850 		err = -saved_errno;
2851 		goto out;
2852 	}
2853 
2854 	/* Enable ignoring missing threads when -u/-p option is defined. */
2855 	rec->opts.ignore_missing_thread = rec->opts.target.uid != UINT_MAX || rec->opts.target.pid;
2856 
2857 	err = -ENOMEM;
2858 	if (evlist__create_maps(rec->evlist, &rec->opts.target) < 0)
2859 		usage_with_options(record_usage, record_options);
2860 
2861 	err = auxtrace_record__options(rec->itr, rec->evlist, &rec->opts);
2862 	if (err)
2863 		goto out;
2864 
2865 	/*
2866 	 * We take all buildids when the file contains
2867 	 * AUX area tracing data because we do not decode the
2868 	 * trace because it would take too long.
2869 	 */
2870 	if (rec->opts.full_auxtrace)
2871 		rec->buildid_all = true;
2872 
2873 	if (rec->opts.text_poke) {
2874 		err = record__config_text_poke(rec->evlist);
2875 		if (err) {
2876 			pr_err("record__config_text_poke failed, error %d\n", err);
2877 			goto out;
2878 		}
2879 	}
2880 
2881 	if (record_opts__config(&rec->opts)) {
2882 		err = -EINVAL;
2883 		goto out;
2884 	}
2885 
2886 	if (rec->opts.nr_cblocks > nr_cblocks_max)
2887 		rec->opts.nr_cblocks = nr_cblocks_max;
2888 	pr_debug("nr_cblocks: %d\n", rec->opts.nr_cblocks);
2889 
2890 	pr_debug("affinity: %s\n", affinity_tags[rec->opts.affinity]);
2891 	pr_debug("mmap flush: %d\n", rec->opts.mmap_flush);
2892 
2893 	if (rec->opts.comp_level > comp_level_max)
2894 		rec->opts.comp_level = comp_level_max;
2895 	pr_debug("comp level: %d\n", rec->opts.comp_level);
2896 
2897 	err = __cmd_record(&record, argc, argv);
2898 out:
2899 	bitmap_free(rec->affinity_mask.bits);
2900 	evlist__delete(rec->evlist);
2901 	symbol__exit();
2902 	auxtrace_record__free(rec->itr);
2903 out_opts:
2904 	evlist__close_control(rec->opts.ctl_fd, rec->opts.ctl_fd_ack, &rec->opts.ctl_fd_close);
2905 	return err;
2906 }
2907 
2908 static void snapshot_sig_handler(int sig __maybe_unused)
2909 {
2910 	struct record *rec = &record;
2911 
2912 	hit_auxtrace_snapshot_trigger(rec);
2913 
2914 	if (switch_output_signal(rec))
2915 		trigger_hit(&switch_output_trigger);
2916 }
2917 
2918 static void alarm_sig_handler(int sig __maybe_unused)
2919 {
2920 	struct record *rec = &record;
2921 
2922 	if (switch_output_time(rec))
2923 		trigger_hit(&switch_output_trigger);
2924 }
2925