xref: /openbmc/linux/tools/perf/builtin-record.c (revision b1a792601f264df7172a728f1a83a05b6b399dfb)
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * builtin-record.c
4  *
5  * Builtin record command: Record the profile of a workload
6  * (or a CPU, or a PID) into the perf.data output file - for
7  * later analysis via perf report.
8  */
9 #include "builtin.h"
10 
11 #include "util/build-id.h"
12 #include <subcmd/parse-options.h>
13 #include "util/parse-events.h"
14 #include "util/config.h"
15 
16 #include "util/callchain.h"
17 #include "util/cgroup.h"
18 #include "util/header.h"
19 #include "util/event.h"
20 #include "util/evlist.h"
21 #include "util/evsel.h"
22 #include "util/debug.h"
23 #include "util/mmap.h"
24 #include "util/target.h"
25 #include "util/session.h"
26 #include "util/tool.h"
27 #include "util/symbol.h"
28 #include "util/record.h"
29 #include "util/cpumap.h"
30 #include "util/thread_map.h"
31 #include "util/data.h"
32 #include "util/perf_regs.h"
33 #include "util/auxtrace.h"
34 #include "util/tsc.h"
35 #include "util/parse-branch-options.h"
36 #include "util/parse-regs-options.h"
37 #include "util/perf_api_probe.h"
38 #include "util/llvm-utils.h"
39 #include "util/bpf-loader.h"
40 #include "util/trigger.h"
41 #include "util/perf-hooks.h"
42 #include "util/cpu-set-sched.h"
43 #include "util/synthetic-events.h"
44 #include "util/time-utils.h"
45 #include "util/units.h"
46 #include "util/bpf-event.h"
47 #include "util/util.h"
48 #include "util/pfm.h"
49 #include "util/clockid.h"
50 #include "asm/bug.h"
51 #include "perf.h"
52 
53 #include <errno.h>
54 #include <inttypes.h>
55 #include <locale.h>
56 #include <poll.h>
57 #include <pthread.h>
58 #include <unistd.h>
59 #include <sched.h>
60 #include <signal.h>
61 #ifdef HAVE_EVENTFD_SUPPORT
62 #include <sys/eventfd.h>
63 #endif
64 #include <sys/mman.h>
65 #include <sys/wait.h>
66 #include <sys/types.h>
67 #include <sys/stat.h>
68 #include <fcntl.h>
69 #include <linux/err.h>
70 #include <linux/string.h>
71 #include <linux/time64.h>
72 #include <linux/zalloc.h>
73 #include <linux/bitmap.h>
74 #include <sys/time.h>
75 
/*
 * Settings and state for switching the output file while recording.
 * The switch_output_signal(), switch_output_size() and switch_output_time()
 * helpers in this file test the corresponding fields.
 */
struct switch_output {
	bool		 enabled;
	bool		 signal;	/* tested by switch_output_signal() */
	unsigned long	 size;		/* threshold compared with bytes_written; 0 = no size check */
	unsigned long	 time;		/* tested by switch_output_time(); 0 = no time check */
	const char	*str;
	bool		 set;
	char		 **filenames;
	int		 num_files;
	int		 cur_file;
};
87 
/* State shared by the record command helpers in this file. */
struct record {
	struct perf_tool	tool;	/* embedded: callbacks recover the record via container_of() */
	struct record_opts	opts;
	u64			bytes_written;	/* running total, bumped by record__write() and record__aio_push() */
	struct perf_data	data;
	struct auxtrace_record	*itr;
	struct evlist	*evlist;
	struct perf_session	*session;
	struct evlist		*sb_evlist;
	pthread_t		thread_id;
	int			realtime_prio;
	bool			switch_output_event_set;
	bool			no_buildid;
	bool			no_buildid_set;
	bool			no_buildid_cache;
	bool			no_buildid_cache_set;
	bool			buildid_all;
	bool			buildid_mmap;
	bool			timestamp_filename;
	bool			timestamp_boundary;
	struct switch_output	switch_output;
	unsigned long long	samples;	/* incremented by the push/read helpers below */
	struct mmap_cpu_mask	affinity_mask;	/* updated by record__adjust_affinity() */
	unsigned long		output_max_size;	/* = 0: unlimited */
};
113 
/*
 * Stop request flag: set to 1 by sig_handler() and by record__write() once
 * the output size limit has been reached.
 */
static volatile int done;

static volatile int auxtrace_record__snapshot_started;
static DEFINE_TRIGGER(auxtrace_snapshot_trigger);
static DEFINE_TRIGGER(switch_output_trigger);

/* One label per affinity mode; presumably indexed by PERF_AFFINITY_* - confirm. */
static const char *affinity_tags[PERF_AFFINITY_MAX] = {
	"SYS", "NODE", "CPU"
};
123 
124 static bool switch_output_signal(struct record *rec)
125 {
126 	return rec->switch_output.signal &&
127 	       trigger_is_ready(&switch_output_trigger);
128 }
129 
130 static bool switch_output_size(struct record *rec)
131 {
132 	return rec->switch_output.size &&
133 	       trigger_is_ready(&switch_output_trigger) &&
134 	       (rec->bytes_written >= rec->switch_output.size);
135 }
136 
137 static bool switch_output_time(struct record *rec)
138 {
139 	return rec->switch_output.time &&
140 	       trigger_is_ready(&switch_output_trigger);
141 }
142 
143 static bool record__output_max_size_exceeded(struct record *rec)
144 {
145 	return rec->output_max_size &&
146 	       (rec->bytes_written >= rec->output_max_size);
147 }
148 
149 static int record__write(struct record *rec, struct mmap *map __maybe_unused,
150 			 void *bf, size_t size)
151 {
152 	struct perf_data_file *file = &rec->session->data->file;
153 
154 	if (perf_data_file__write(file, bf, size) < 0) {
155 		pr_err("failed to write perf data, error: %m\n");
156 		return -1;
157 	}
158 
159 	rec->bytes_written += size;
160 
161 	if (record__output_max_size_exceeded(rec) && !done) {
162 		fprintf(stderr, "[ perf record: perf size limit reached (%" PRIu64 " KB),"
163 				" stopping session ]\n",
164 				rec->bytes_written >> 10);
165 		done = 1;
166 	}
167 
168 	if (switch_output_size(rec))
169 		trigger_hit(&switch_output_trigger);
170 
171 	return 0;
172 }
173 
/* Forward declarations: these are used by the AIO code before being defined. */
static int record__aio_enabled(struct record *rec);
static int record__comp_enabled(struct record *rec);
static size_t zstd_compress(struct perf_session *session, void *dst, size_t dst_size,
			    void *src, size_t src_size);
178 
179 #ifdef HAVE_AIO_SUPPORT
/*
 * Fill in @cblock and queue an asynchronous write of @size bytes from @buf
 * to @trace_fd at offset @off, retrying for as long as aio_write() fails
 * with EAGAIN.
 *
 * Returns the last aio_write() result: 0 when queued; on any other failure
 * the control block is marked unused (aio_fildes = -1).
 */
static int record__aio_write(struct aiocb *cblock, int trace_fd,
		void *buf, size_t size, off_t off)
{
	int rc;

	cblock->aio_fildes = trace_fd;
	cblock->aio_buf    = buf;
	cblock->aio_nbytes = size;
	cblock->aio_offset = off;
	cblock->aio_sigevent.sigev_notify = SIGEV_NONE;

	for (;;) {
		rc = aio_write(cblock);
		if (rc == 0)
			break;

		if (errno == EAGAIN)
			continue;

		cblock->aio_fildes = -1;
		pr_err("failed to queue perf data, error: %m\n");
		break;
	}

	return rc;
}
204 
205 static int record__aio_complete(struct mmap *md, struct aiocb *cblock)
206 {
207 	void *rem_buf;
208 	off_t rem_off;
209 	size_t rem_size;
210 	int rc, aio_errno;
211 	ssize_t aio_ret, written;
212 
213 	aio_errno = aio_error(cblock);
214 	if (aio_errno == EINPROGRESS)
215 		return 0;
216 
217 	written = aio_ret = aio_return(cblock);
218 	if (aio_ret < 0) {
219 		if (aio_errno != EINTR)
220 			pr_err("failed to write perf data, error: %m\n");
221 		written = 0;
222 	}
223 
224 	rem_size = cblock->aio_nbytes - written;
225 
226 	if (rem_size == 0) {
227 		cblock->aio_fildes = -1;
228 		/*
229 		 * md->refcount is incremented in record__aio_pushfn() for
230 		 * every aio write request started in record__aio_push() so
231 		 * decrement it because the request is now complete.
232 		 */
233 		perf_mmap__put(&md->core);
234 		rc = 1;
235 	} else {
236 		/*
237 		 * aio write request may require restart with the
238 		 * reminder if the kernel didn't write whole
239 		 * chunk at once.
240 		 */
241 		rem_off = cblock->aio_offset + written;
242 		rem_buf = (void *)(cblock->aio_buf + written);
243 		record__aio_write(cblock, cblock->aio_fildes,
244 				rem_buf, rem_size, rem_off);
245 		rc = 0;
246 	}
247 
248 	return rc;
249 }
250 
251 static int record__aio_sync(struct mmap *md, bool sync_all)
252 {
253 	struct aiocb **aiocb = md->aio.aiocb;
254 	struct aiocb *cblocks = md->aio.cblocks;
255 	struct timespec timeout = { 0, 1000 * 1000  * 1 }; /* 1ms */
256 	int i, do_suspend;
257 
258 	do {
259 		do_suspend = 0;
260 		for (i = 0; i < md->aio.nr_cblocks; ++i) {
261 			if (cblocks[i].aio_fildes == -1 || record__aio_complete(md, &cblocks[i])) {
262 				if (sync_all)
263 					aiocb[i] = NULL;
264 				else
265 					return i;
266 			} else {
267 				/*
268 				 * Started aio write is not complete yet
269 				 * so it has to be waited before the
270 				 * next allocation.
271 				 */
272 				aiocb[i] = &cblocks[i];
273 				do_suspend = 1;
274 			}
275 		}
276 		if (!do_suspend)
277 			return -1;
278 
279 		while (aio_suspend((const struct aiocb **)aiocb, md->aio.nr_cblocks, &timeout)) {
280 			if (!(errno == EAGAIN || errno == EINTR))
281 				pr_err("failed to sync perf data, error: %m\n");
282 		}
283 	} while (1);
284 }
285 
/* Context handed to record__aio_pushfn() through perf_mmap__push(). */
struct record_aio {
	struct record	*rec;
	void		*data;	/* destination buffer for the copied/compressed data */
	size_t		size;	/* number of bytes placed in data so far */
};
291 
/*
 * perf_mmap__push() callback for the aio path: append @size bytes from @buf
 * (compressed when compression is enabled) to the buffer of the
 * struct record_aio passed as @to. Returns the number of bytes appended.
 */
static int record__aio_pushfn(struct mmap *map, void *to, void *buf, size_t size)
{
	struct record_aio *aio = to;

	/*
	 * map->core.base data pointed by buf is copied into free map->aio.data[] buffer
	 * to release space in the kernel buffer as fast as possible, calling
	 * perf_mmap__consume() from perf_mmap__push() function.
	 *
	 * That lets the kernel proceed with storing more profiling data into
	 * the kernel buffer earlier than other per-cpu kernel buffers are handled.
	 *
	 * Copying can be done in two steps in case the chunk of profiling data
	 * crosses the upper bound of the kernel buffer. In this case we first move
	 * part of data from map->start till the upper bound and then the remainder
	 * from the beginning of the kernel buffer till the end of the data chunk.
	 */

	if (record__comp_enabled(aio->rec)) {
		size = zstd_compress(aio->rec->session, aio->data + aio->size,
				     mmap__mmap_len(map) - aio->size,
				     buf, size);
	} else {
		memcpy(aio->data + aio->size, buf, size);
	}

	if (!aio->size) {
		/*
		 * Increment map->refcount to guard map->aio.data[] buffer
		 * from premature deallocation because map object can be
		 * released earlier than aio write request started on
		 * map->aio.data[] buffer is complete.
		 *
		 * perf_mmap__put() is done at record__aio_complete()
		 * after started aio request completion or at record__aio_push()
		 * if the request failed to start.
		 */
		perf_mmap__get(&map->core);
	}

	aio->size += size;

	return size;
}
336 
337 static int record__aio_push(struct record *rec, struct mmap *map, off_t *off)
338 {
339 	int ret, idx;
340 	int trace_fd = rec->session->data->file.fd;
341 	struct record_aio aio = { .rec = rec, .size = 0 };
342 
343 	/*
344 	 * Call record__aio_sync() to wait till map->aio.data[] buffer
345 	 * becomes available after previous aio write operation.
346 	 */
347 
348 	idx = record__aio_sync(map, false);
349 	aio.data = map->aio.data[idx];
350 	ret = perf_mmap__push(map, &aio, record__aio_pushfn);
351 	if (ret != 0) /* ret > 0 - no data, ret < 0 - error */
352 		return ret;
353 
354 	rec->samples++;
355 	ret = record__aio_write(&(map->aio.cblocks[idx]), trace_fd, aio.data, aio.size, *off);
356 	if (!ret) {
357 		*off += aio.size;
358 		rec->bytes_written += aio.size;
359 		if (switch_output_size(rec))
360 			trigger_hit(&switch_output_trigger);
361 	} else {
362 		/*
363 		 * Decrement map->refcount incremented in record__aio_pushfn()
364 		 * back if record__aio_write() operation failed to start, otherwise
365 		 * map->refcount is decremented in record__aio_complete() after
366 		 * aio write operation finishes successfully.
367 		 */
368 		perf_mmap__put(&map->core);
369 	}
370 
371 	return ret;
372 }
373 
/* Return the current file offset of @trace_fd, i.e. the lseek() result. */
static off_t record__aio_get_pos(int trace_fd)
{
	off_t cur = lseek(trace_fd, 0, SEEK_CUR);

	return cur;
}
378 
/* Move the file offset of @trace_fd to @pos; the lseek() result is not checked. */
static void record__aio_set_pos(int trace_fd, off_t pos)
{
	(void)lseek(trace_fd, pos, SEEK_SET);
}
383 
384 static void record__aio_mmap_read_sync(struct record *rec)
385 {
386 	int i;
387 	struct evlist *evlist = rec->evlist;
388 	struct mmap *maps = evlist->mmap;
389 
390 	if (!record__aio_enabled(rec))
391 		return;
392 
393 	for (i = 0; i < evlist->core.nr_mmaps; i++) {
394 		struct mmap *map = &maps[i];
395 
396 		if (map->core.base)
397 			record__aio_sync(map, true);
398 	}
399 }
400 
/* Control block count used by record__aio_parse() when none (or 0) is given. */
static int nr_cblocks_default = 1;
/* NOTE(review): presumably the upper bound for the control block count - the check is outside this view. */
static int nr_cblocks_max = 4;
403 
404 static int record__aio_parse(const struct option *opt,
405 			     const char *str,
406 			     int unset)
407 {
408 	struct record_opts *opts = (struct record_opts *)opt->value;
409 
410 	if (unset) {
411 		opts->nr_cblocks = 0;
412 	} else {
413 		if (str)
414 			opts->nr_cblocks = strtol(str, NULL, 0);
415 		if (!opts->nr_cblocks)
416 			opts->nr_cblocks = nr_cblocks_default;
417 	}
418 
419 	return 0;
420 }
421 #else /* HAVE_AIO_SUPPORT */
/* Built without HAVE_AIO_SUPPORT: the maximum is 0. */
static int nr_cblocks_max = 0;
423 
/* Stub for builds without HAVE_AIO_SUPPORT: always returns -1. */
static int record__aio_push(struct record *rec __maybe_unused, struct mmap *map __maybe_unused,
			    off_t *off __maybe_unused)
{
	return -1;
}
429 
/* Stub for builds without HAVE_AIO_SUPPORT: always returns -1. */
static off_t record__aio_get_pos(int trace_fd __maybe_unused)
{
	return -1;
}
434 
/* Stub for builds without HAVE_AIO_SUPPORT: does nothing. */
static void record__aio_set_pos(int trace_fd __maybe_unused, off_t pos __maybe_unused)
{
}
438 
/* Stub for builds without HAVE_AIO_SUPPORT: does nothing. */
static void record__aio_mmap_read_sync(struct record *rec __maybe_unused)
{
}
442 #endif
443 
444 static int record__aio_enabled(struct record *rec)
445 {
446 	return rec->opts.nr_cblocks > 0;
447 }
448 
449 #define MMAP_FLUSH_DEFAULT 1
450 static int record__mmap_flush_parse(const struct option *opt,
451 				    const char *str,
452 				    int unset)
453 {
454 	int flush_max;
455 	struct record_opts *opts = (struct record_opts *)opt->value;
456 	static struct parse_tag tags[] = {
457 			{ .tag  = 'B', .mult = 1       },
458 			{ .tag  = 'K', .mult = 1 << 10 },
459 			{ .tag  = 'M', .mult = 1 << 20 },
460 			{ .tag  = 'G', .mult = 1 << 30 },
461 			{ .tag  = 0 },
462 	};
463 
464 	if (unset)
465 		return 0;
466 
467 	if (str) {
468 		opts->mmap_flush = parse_tag_value(str, tags);
469 		if (opts->mmap_flush == (int)-1)
470 			opts->mmap_flush = strtol(str, NULL, 0);
471 	}
472 
473 	if (!opts->mmap_flush)
474 		opts->mmap_flush = MMAP_FLUSH_DEFAULT;
475 
476 	flush_max = evlist__mmap_size(opts->mmap_pages);
477 	flush_max /= 4;
478 	if (opts->mmap_flush > flush_max)
479 		opts->mmap_flush = flush_max;
480 
481 	return 0;
482 }
483 
484 #ifdef HAVE_ZSTD_SUPPORT
485 static unsigned int comp_level_default = 1;
486 
487 static int record__parse_comp_level(const struct option *opt, const char *str, int unset)
488 {
489 	struct record_opts *opts = opt->value;
490 
491 	if (unset) {
492 		opts->comp_level = 0;
493 	} else {
494 		if (str)
495 			opts->comp_level = strtol(str, NULL, 0);
496 		if (!opts->comp_level)
497 			opts->comp_level = comp_level_default;
498 	}
499 
500 	return 0;
501 }
502 #endif
/* NOTE(review): presumably the highest accepted compression level - the check is outside this view. */
static unsigned int comp_level_max = 22;
504 
505 static int record__comp_enabled(struct record *rec)
506 {
507 	return rec->opts.comp_level > 0;
508 }
509 
510 static int process_synthesized_event(struct perf_tool *tool,
511 				     union perf_event *event,
512 				     struct perf_sample *sample __maybe_unused,
513 				     struct machine *machine __maybe_unused)
514 {
515 	struct record *rec = container_of(tool, struct record, tool);
516 	return record__write(rec, NULL, event, event->header.size);
517 }
518 
519 static int process_locked_synthesized_event(struct perf_tool *tool,
520 				     union perf_event *event,
521 				     struct perf_sample *sample __maybe_unused,
522 				     struct machine *machine __maybe_unused)
523 {
524 	static pthread_mutex_t synth_lock = PTHREAD_MUTEX_INITIALIZER;
525 	int ret;
526 
527 	pthread_mutex_lock(&synth_lock);
528 	ret = process_synthesized_event(tool, event, sample, machine);
529 	pthread_mutex_unlock(&synth_lock);
530 	return ret;
531 }
532 
533 static int record__pushfn(struct mmap *map, void *to, void *bf, size_t size)
534 {
535 	struct record *rec = to;
536 
537 	if (record__comp_enabled(rec)) {
538 		size = zstd_compress(rec->session, map->data, mmap__mmap_len(map), bf, size);
539 		bf   = map->data;
540 	}
541 
542 	rec->samples++;
543 	return record__write(rec, map, bf, size);
544 }
545 
/* Last signal other than SIGCHLD seen by sig_handler(); -1 while none was seen. */
static volatile int signr = -1;
/* Set to 1 by sig_handler() on SIGCHLD. */
static volatile int child_finished;
#ifdef HAVE_EVENTFD_SUPPORT
/* eventfd written by sig_handler() to wake up poll(); starts out as -1. */
static int done_fd = -1;
#endif
551 
552 static void sig_handler(int sig)
553 {
554 	if (sig == SIGCHLD)
555 		child_finished = 1;
556 	else
557 		signr = sig;
558 
559 	done = 1;
560 #ifdef HAVE_EVENTFD_SUPPORT
561 {
562 	u64 tmp = 1;
563 	/*
564 	 * It is possible for this signal handler to run after done is checked
565 	 * in the main loop, but before the perf counter fds are polled. If this
566 	 * happens, the poll() will continue to wait even though done is set,
567 	 * and will only break out if either another signal is received, or the
568 	 * counters are ready for read. To ensure the poll() doesn't sleep when
569 	 * done is set, use an eventfd (done_fd) to wake up the poll().
570 	 */
571 	if (write(done_fd, &tmp, sizeof(tmp)) < 0)
572 		pr_err("failed to signal wakeup fd, error: %m\n");
573 }
574 #endif // HAVE_EVENTFD_SUPPORT
575 }
576 
/* SIGSEGV handler: run the perf hooks recovery, then dump the stack for @sig. */
static void sigsegv_handler(int sig)
{
	perf_hooks__recover();
	sighandler_dump_stack(sig);
}
582 
583 static void record__sig_exit(void)
584 {
585 	if (signr == -1)
586 		return;
587 
588 	signal(signr, SIG_DFL);
589 	raise(signr);
590 }
591 
592 #ifdef HAVE_AUXTRACE_SUPPORT
593 
594 static int record__process_auxtrace(struct perf_tool *tool,
595 				    struct mmap *map,
596 				    union perf_event *event, void *data1,
597 				    size_t len1, void *data2, size_t len2)
598 {
599 	struct record *rec = container_of(tool, struct record, tool);
600 	struct perf_data *data = &rec->data;
601 	size_t padding;
602 	u8 pad[8] = {0};
603 
604 	if (!perf_data__is_pipe(data) && perf_data__is_single_file(data)) {
605 		off_t file_offset;
606 		int fd = perf_data__fd(data);
607 		int err;
608 
609 		file_offset = lseek(fd, 0, SEEK_CUR);
610 		if (file_offset == -1)
611 			return -1;
612 		err = auxtrace_index__auxtrace_event(&rec->session->auxtrace_index,
613 						     event, file_offset);
614 		if (err)
615 			return err;
616 	}
617 
618 	/* event.auxtrace.size includes padding, see __auxtrace_mmap__read() */
619 	padding = (len1 + len2) & 7;
620 	if (padding)
621 		padding = 8 - padding;
622 
623 	record__write(rec, map, event, event->header.size);
624 	record__write(rec, map, data1, len1);
625 	if (len2)
626 		record__write(rec, map, data2, len2);
627 	record__write(rec, map, &pad, padding);
628 
629 	return 0;
630 }
631 
632 static int record__auxtrace_mmap_read(struct record *rec,
633 				      struct mmap *map)
634 {
635 	int ret;
636 
637 	ret = auxtrace_mmap__read(map, rec->itr, &rec->tool,
638 				  record__process_auxtrace);
639 	if (ret < 0)
640 		return ret;
641 
642 	if (ret)
643 		rec->samples++;
644 
645 	return 0;
646 }
647 
648 static int record__auxtrace_mmap_read_snapshot(struct record *rec,
649 					       struct mmap *map)
650 {
651 	int ret;
652 
653 	ret = auxtrace_mmap__read_snapshot(map, rec->itr, &rec->tool,
654 					   record__process_auxtrace,
655 					   rec->opts.auxtrace_snapshot_size);
656 	if (ret < 0)
657 		return ret;
658 
659 	if (ret)
660 		rec->samples++;
661 
662 	return 0;
663 }
664 
665 static int record__auxtrace_read_snapshot_all(struct record *rec)
666 {
667 	int i;
668 	int rc = 0;
669 
670 	for (i = 0; i < rec->evlist->core.nr_mmaps; i++) {
671 		struct mmap *map = &rec->evlist->mmap[i];
672 
673 		if (!map->auxtrace_mmap.base)
674 			continue;
675 
676 		if (record__auxtrace_mmap_read_snapshot(rec, map) != 0) {
677 			rc = -1;
678 			goto out;
679 		}
680 	}
681 out:
682 	return rc;
683 }
684 
685 static void record__read_auxtrace_snapshot(struct record *rec, bool on_exit)
686 {
687 	pr_debug("Recording AUX area tracing snapshot\n");
688 	if (record__auxtrace_read_snapshot_all(rec) < 0) {
689 		trigger_error(&auxtrace_snapshot_trigger);
690 	} else {
691 		if (auxtrace_record__snapshot_finish(rec->itr, on_exit))
692 			trigger_error(&auxtrace_snapshot_trigger);
693 		else
694 			trigger_ready(&auxtrace_snapshot_trigger);
695 	}
696 }
697 
698 static int record__auxtrace_snapshot_exit(struct record *rec)
699 {
700 	if (trigger_is_error(&auxtrace_snapshot_trigger))
701 		return 0;
702 
703 	if (!auxtrace_record__snapshot_started &&
704 	    auxtrace_record__snapshot_start(rec->itr))
705 		return -1;
706 
707 	record__read_auxtrace_snapshot(rec, true);
708 	if (trigger_is_error(&auxtrace_snapshot_trigger))
709 		return -1;
710 
711 	return 0;
712 }
713 
714 static int record__auxtrace_init(struct record *rec)
715 {
716 	int err;
717 
718 	if (!rec->itr) {
719 		rec->itr = auxtrace_record__init(rec->evlist, &err);
720 		if (err)
721 			return err;
722 	}
723 
724 	err = auxtrace_parse_snapshot_options(rec->itr, &rec->opts,
725 					      rec->opts.auxtrace_snapshot_opts);
726 	if (err)
727 		return err;
728 
729 	err = auxtrace_parse_sample_options(rec->itr, rec->evlist, &rec->opts,
730 					    rec->opts.auxtrace_sample_opts);
731 	if (err)
732 		return err;
733 
734 	auxtrace_regroup_aux_output(rec->evlist);
735 
736 	return auxtrace_parse_filters(rec->evlist);
737 }
738 
739 #else
740 
/* Stub for builds without HAVE_AUXTRACE_SUPPORT: nothing to read, returns 0. */
static inline
int record__auxtrace_mmap_read(struct record *rec __maybe_unused,
			       struct mmap *map __maybe_unused)
{
	return 0;
}
747 
/* Stub for builds without HAVE_AUXTRACE_SUPPORT: does nothing. */
static inline
void record__read_auxtrace_snapshot(struct record *rec __maybe_unused,
				    bool on_exit __maybe_unused)
{
}
753 
/* Stub for builds without HAVE_AUXTRACE_SUPPORT: returns 0. */
static inline
int auxtrace_record__snapshot_start(struct auxtrace_record *itr __maybe_unused)
{
	return 0;
}
759 
/* Stub for builds without HAVE_AUXTRACE_SUPPORT: returns 0. */
static inline
int record__auxtrace_snapshot_exit(struct record *rec __maybe_unused)
{
	return 0;
}
765 
/* Stub for builds without HAVE_AUXTRACE_SUPPORT: nothing to set up, returns 0. */
static int record__auxtrace_init(struct record *rec __maybe_unused)
{
	return 0;
}
770 
771 #endif
772 
/*
 * Make sure @evlist contains an event with attr.text_poke set: if none has
 * it yet, add a "dummy:u" event and configure it for that purpose.
 *
 * Returns 0 on success (including when nothing had to be done) or the
 * parse_events() error.
 */
static int record__config_text_poke(struct evlist *evlist)
{
	struct evsel *evsel;
	int err;

	/* Nothing to do if text poke is already configured */
	evlist__for_each_entry(evlist, evsel) {
		if (evsel->core.attr.text_poke)
			return 0;
	}

	err = parse_events(evlist, "dummy:u", NULL);
	if (err)
		return err;

	/* The event just parsed is expected to be the last one in the list. */
	evsel = evlist__last(evlist);

	evsel->core.attr.freq = 0;
	evsel->core.attr.sample_period = 1;
	evsel->core.attr.text_poke = 1;
	evsel->core.attr.ksymbol = 1;

	evsel->core.system_wide = true;
	evsel->no_aux_samples = true;
	evsel->immediate = true;

	/* Text poke must be collected on all CPUs */
	perf_cpu_map__put(evsel->core.own_cpus);
	evsel->core.own_cpus = perf_cpu_map__new(NULL);
	perf_cpu_map__put(evsel->core.cpus);
	evsel->core.cpus = perf_cpu_map__get(evsel->core.own_cpus);

	evsel__set_sample_bit(evsel, TIME);

	return 0;
}
809 
810 static bool record__kcore_readable(struct machine *machine)
811 {
812 	char kcore[PATH_MAX];
813 	int fd;
814 
815 	scnprintf(kcore, sizeof(kcore), "%s/proc/kcore", machine->root_dir);
816 
817 	fd = open(kcore, O_RDONLY);
818 	if (fd < 0)
819 		return false;
820 
821 	close(fd);
822 
823 	return true;
824 }
825 
826 static int record__kcore_copy(struct machine *machine, struct perf_data *data)
827 {
828 	char from_dir[PATH_MAX];
829 	char kcore_dir[PATH_MAX];
830 	int ret;
831 
832 	snprintf(from_dir, sizeof(from_dir), "%s/proc", machine->root_dir);
833 
834 	ret = perf_data__make_kcore_dir(data, kcore_dir, sizeof(kcore_dir));
835 	if (ret)
836 		return ret;
837 
838 	return kcore_copy(from_dir, kcore_dir);
839 }
840 
841 static int record__mmap_evlist(struct record *rec,
842 			       struct evlist *evlist)
843 {
844 	struct record_opts *opts = &rec->opts;
845 	bool auxtrace_overwrite = opts->auxtrace_snapshot_mode ||
846 				  opts->auxtrace_sample_mode;
847 	char msg[512];
848 
849 	if (opts->affinity != PERF_AFFINITY_SYS)
850 		cpu__setup_cpunode_map();
851 
852 	if (evlist__mmap_ex(evlist, opts->mmap_pages,
853 				 opts->auxtrace_mmap_pages,
854 				 auxtrace_overwrite,
855 				 opts->nr_cblocks, opts->affinity,
856 				 opts->mmap_flush, opts->comp_level) < 0) {
857 		if (errno == EPERM) {
858 			pr_err("Permission error mapping pages.\n"
859 			       "Consider increasing "
860 			       "/proc/sys/kernel/perf_event_mlock_kb,\n"
861 			       "or try again with a smaller value of -m/--mmap_pages.\n"
862 			       "(current value: %u,%u)\n",
863 			       opts->mmap_pages, opts->auxtrace_mmap_pages);
864 			return -errno;
865 		} else {
866 			pr_err("failed to mmap with %d (%s)\n", errno,
867 				str_error_r(errno, msg, sizeof(msg)));
868 			if (errno)
869 				return -errno;
870 			else
871 				return -EINVAL;
872 		}
873 	}
874 	return 0;
875 }
876 
/* mmap the buffers of rec->evlist; see record__mmap_evlist() for the result. */
static int record__mmap(struct record *rec)
{
	return record__mmap_evlist(rec, rec->evlist);
}
881 
/*
 * Configure and open every event of rec->evlist, apply the event filters,
 * mmap the buffers and attach the evlist to the session.
 *
 * Returns 0 on success, a negative value on failure (-ENOMEM if the dummy
 * event cannot be added, -errno if an event cannot be opened, -1 if a
 * filter cannot be applied, or the record__mmap() error).
 */
static int record__open(struct record *rec)
{
	char msg[BUFSIZ];
	struct evsel *pos;
	struct evlist *evlist = rec->evlist;
	struct perf_session *session = rec->session;
	struct record_opts *opts = &rec->opts;
	int rc = 0;

	/*
	 * For initial_delay or system wide, we need to add a dummy event so
	 * that we can track PERF_RECORD_MMAP to cover the delay of waiting or
	 * event synthesis.
	 */
	if (opts->initial_delay || target__has_cpu(&opts->target)) {
		pos = evlist__get_tracking_event(evlist);
		if (!evsel__is_dummy_event(pos)) {
			/* Set up dummy event. */
			if (evlist__add_dummy(evlist))
				return -ENOMEM;
			pos = evlist__last(evlist);
			evlist__set_tracking_event(evlist, pos);
		}

		/*
		 * Enable the dummy event when the process is forked for
		 * initial_delay, immediately for system wide.
		 */
		if (opts->initial_delay && !pos->immediate)
			pos->core.attr.enable_on_exec = 1;
		else
			pos->immediate = 1;
	}

	evlist__config(evlist, opts, &callchain_param);

	evlist__for_each_entry(evlist, pos) {
try_again:
		if (evsel__open(pos, pos->core.cpus, pos->core.threads) < 0) {
			/* Retry when a fallback configuration could be set up for this event. */
			if (evsel__fallback(pos, errno, msg, sizeof(msg))) {
				if (verbose > 0)
					ui__warning("%s\n", msg);
				goto try_again;
			}
			/* A non-leader member of a weak group failed: reset the group and retry. */
			if ((errno == EINVAL || errno == EBADF) &&
			    pos->leader != pos &&
			    pos->weak_group) {
			        pos = evlist__reset_weak_group(evlist, pos, true);
				goto try_again;
			}
			rc = -errno;
			evsel__open_strerror(pos, &opts->target, errno, msg, sizeof(msg));
			ui__error("%s\n", msg);
			goto out;
		}

		pos->supported = true;
	}

	if (symbol_conf.kptr_restrict && !evlist__exclude_kernel(evlist)) {
		pr_warning(
"WARNING: Kernel address maps (/proc/{kallsyms,modules}) are restricted,\n"
"check /proc/sys/kernel/kptr_restrict and /proc/sys/kernel/perf_event_paranoid.\n\n"
"Samples in kernel functions may not be resolved if a suitable vmlinux\n"
"file is not found in the buildid cache or in the vmlinux path.\n\n"
"Samples in kernel modules won't be resolved at all.\n\n"
"If some relocation was applied (e.g. kexec) symbols may be misresolved\n"
"even with a suitable vmlinux or kallsyms file.\n\n");
	}

	/* NOTE(review): on failure pos presumably points at the offending event - confirm. */
	if (evlist__apply_filters(evlist, &pos)) {
		pr_err("failed to set filter \"%s\" on event %s with %d (%s)\n",
			pos->filter, evsel__name(pos), errno,
			str_error_r(errno, msg, sizeof(msg)));
		rc = -1;
		goto out;
	}

	rc = record__mmap(rec);
	if (rc)
		goto out;

	session->evlist = evlist;
	perf_session__set_id_hdr_size(session);
out:
	return rc;
}
969 
970 static int process_sample_event(struct perf_tool *tool,
971 				union perf_event *event,
972 				struct perf_sample *sample,
973 				struct evsel *evsel,
974 				struct machine *machine)
975 {
976 	struct record *rec = container_of(tool, struct record, tool);
977 
978 	if (rec->evlist->first_sample_time == 0)
979 		rec->evlist->first_sample_time = sample->time;
980 
981 	rec->evlist->last_sample_time = sample->time;
982 
983 	if (rec->buildid_all)
984 		return 0;
985 
986 	rec->samples++;
987 	return build_id__mark_dso_hit(tool, event, sample, evsel, machine);
988 }
989 
990 static int process_buildids(struct record *rec)
991 {
992 	struct perf_session *session = rec->session;
993 
994 	if (perf_data__size(&rec->data) == 0)
995 		return 0;
996 
997 	/*
998 	 * During this process, it'll load kernel map and replace the
999 	 * dso->long_name to a real pathname it found.  In this case
1000 	 * we prefer the vmlinux path like
1001 	 *   /lib/modules/3.16.4/build/vmlinux
1002 	 *
1003 	 * rather than build-id path (in debug directory).
1004 	 *   $HOME/.debug/.build-id/f0/6e17aa50adf4d00b88925e03775de107611551
1005 	 */
1006 	symbol_conf.ignore_vmlinux_buildid = true;
1007 
1008 	/*
1009 	 * If --buildid-all is given, it marks all DSO regardless of hits,
1010 	 * so no need to process samples. But if timestamp_boundary is enabled,
1011 	 * it still needs to walk on all samples to get the timestamps of
1012 	 * first/last samples.
1013 	 */
1014 	if (rec->buildid_all && !rec->timestamp_boundary)
1015 		rec->tool.sample = NULL;
1016 
1017 	return perf_session__process_events(session);
1018 }
1019 
/*
 * Synthesize the module and kernel mmap events of a guest @machine;
 * @data is the struct perf_tool used to emit them. Failures are only
 * reported with pr_err().
 */
static void perf_event__synthesize_guest_os(struct machine *machine, void *data)
{
	int err;
	struct perf_tool *tool = data;
	/*
	 * As for guest kernel when processing subcommand record&report,
	 * we arrange module mmap prior to guest kernel mmap and trigger
	 * a preload dso because default guest module symbols are loaded
	 * from guest kallsyms instead of /lib/modules/XXX/XXX. This
	 * method is used to avoid symbol missing when the first addr is
	 * in module instead of in guest kernel.
	 */
	err = perf_event__synthesize_modules(tool, process_synthesized_event,
					     machine);
	if (err < 0)
		pr_err("Couldn't record guest kernel [%d]'s reference"
		       " relocation symbol.\n", machine->pid);

	/*
	 * We use _stext for guest kernel because guest kernel's /proc/kallsyms
	 * have no _text sometimes.
	 */
	err = perf_event__synthesize_kernel_mmap(tool, process_synthesized_event,
						 machine);
	if (err < 0)
		pr_err("Couldn't record guest kernel [%d]'s reference"
		       " relocation symbol.\n", machine->pid);
}
1048 
/*
 * Header-only PERF_RECORD_FINISHED_ROUND event, written by
 * record__mmap_read_evlist() after a pass that wrote data.
 */
static struct perf_event_header finished_round_event = {
	.size = sizeof(struct perf_event_header),
	.type = PERF_RECORD_FINISHED_ROUND,
};
1053 
1054 static void record__adjust_affinity(struct record *rec, struct mmap *map)
1055 {
1056 	if (rec->opts.affinity != PERF_AFFINITY_SYS &&
1057 	    !bitmap_equal(rec->affinity_mask.bits, map->affinity_mask.bits,
1058 			  rec->affinity_mask.nbits)) {
1059 		bitmap_zero(rec->affinity_mask.bits, rec->affinity_mask.nbits);
1060 		bitmap_or(rec->affinity_mask.bits, rec->affinity_mask.bits,
1061 			  map->affinity_mask.bits, rec->affinity_mask.nbits);
1062 		sched_setaffinity(0, MMAP_CPU_MASK_BYTES(&rec->affinity_mask),
1063 				  (cpu_set_t *)rec->affinity_mask.bits);
1064 		if (verbose == 2)
1065 			mmap_cpu_mask__scnprintf(&rec->affinity_mask, "thread");
1066 	}
1067 }
1068 
1069 static size_t process_comp_header(void *record, size_t increment)
1070 {
1071 	struct perf_record_compressed *event = record;
1072 	size_t size = sizeof(*event);
1073 
1074 	if (increment) {
1075 		event->header.size += increment;
1076 		return increment;
1077 	}
1078 
1079 	event->header.type = PERF_RECORD_COMPRESSED;
1080 	event->header.size = size;
1081 
1082 	return size;
1083 }
1084 
1085 static size_t zstd_compress(struct perf_session *session, void *dst, size_t dst_size,
1086 			    void *src, size_t src_size)
1087 {
1088 	size_t compressed;
1089 	size_t max_record_size = PERF_SAMPLE_MAX_SIZE - sizeof(struct perf_record_compressed) - 1;
1090 
1091 	compressed = zstd_compress_stream_to_records(&session->zstd_data, dst, dst_size, src, src_size,
1092 						     max_record_size, process_comp_header);
1093 
1094 	session->bytes_transferred += src_size;
1095 	session->bytes_compressed  += compressed;
1096 
1097 	return compressed;
1098 }
1099 
/*
 * Drain the ring buffers of @evlist into the output.
 *
 * @overwrite selects the overwrite mmaps instead of the regular ones; they
 * are only read while the evlist is in the BKW_MMAP_DATA_PENDING state.
 * @synch temporarily sets the flush threshold of each map to 1 for the
 * duration of its push.
 *
 * Returns 0 on success (also when there is nothing to read), -1 on a
 * push/read failure, or the record__write() result for the finished-round
 * event.
 */
static int record__mmap_read_evlist(struct record *rec, struct evlist *evlist,
				    bool overwrite, bool synch)
{
	/* Snapshot used at the end to detect whether this pass wrote anything. */
	u64 bytes_written = rec->bytes_written;
	int i;
	int rc = 0;
	struct mmap *maps;
	int trace_fd = rec->data.file.fd;
	off_t off = 0;

	if (!evlist)
		return 0;

	maps = overwrite ? evlist->overwrite_mmap : evlist->mmap;
	if (!maps)
		return 0;

	if (overwrite && evlist->bkw_mmap_state != BKW_MMAP_DATA_PENDING)
		return 0;

	/* The aio path tracks the file offset itself, starting from the current position. */
	if (record__aio_enabled(rec))
		off = record__aio_get_pos(trace_fd);

	for (i = 0; i < evlist->core.nr_mmaps; i++) {
		u64 flush = 0;
		struct mmap *map = &maps[i];

		if (map->core.base) {
			record__adjust_affinity(rec, map);
			if (synch) {
				/* Save the threshold; it is restored on every exit path below. */
				flush = map->core.flush;
				map->core.flush = 1;
			}
			if (!record__aio_enabled(rec)) {
				if (perf_mmap__push(map, rec, record__pushfn) < 0) {
					if (synch)
						map->core.flush = flush;
					rc = -1;
					goto out;
				}
			} else {
				if (record__aio_push(rec, map, &off) < 0) {
					/* Make the file position reflect what was queued so far. */
					record__aio_set_pos(trace_fd, off);
					if (synch)
						map->core.flush = flush;
					rc = -1;
					goto out;
				}
			}
			if (synch)
				map->core.flush = flush;
		}

		/* In snapshot and sample modes the AUX data is not read here. */
		if (map->auxtrace_mmap.base && !rec->opts.auxtrace_snapshot_mode &&
		    !rec->opts.auxtrace_sample_mode &&
		    record__auxtrace_mmap_read(rec, map) != 0) {
			rc = -1;
			goto out;
		}
	}

	if (record__aio_enabled(rec))
		record__aio_set_pos(trace_fd, off);

	/*
	 * Mark the round finished in case we wrote
	 * at least one event.
	 */
	if (bytes_written != rec->bytes_written)
		rc = record__write(rec, NULL, &finished_round_event, sizeof(finished_round_event));

	if (overwrite)
		evlist__toggle_bkw_mmap(evlist, BKW_MMAP_EMPTY);
out:
	return rc;
}
1176 
1177 static int record__mmap_read_all(struct record *rec, bool synch)
1178 {
1179 	int err;
1180 
1181 	err = record__mmap_read_evlist(rec, rec->evlist, false, synch);
1182 	if (err)
1183 		return err;
1184 
1185 	return record__mmap_read_evlist(rec, rec->evlist, true, synch);
1186 }
1187 
1188 static void record__init_features(struct record *rec)
1189 {
1190 	struct perf_session *session = rec->session;
1191 	int feat;
1192 
1193 	for (feat = HEADER_FIRST_FEATURE; feat < HEADER_LAST_FEATURE; feat++)
1194 		perf_header__set_feat(&session->header, feat);
1195 
1196 	if (rec->no_buildid)
1197 		perf_header__clear_feat(&session->header, HEADER_BUILD_ID);
1198 
1199 	if (!have_tracepoints(&rec->evlist->core.entries))
1200 		perf_header__clear_feat(&session->header, HEADER_TRACING_DATA);
1201 
1202 	if (!rec->opts.branch_stack)
1203 		perf_header__clear_feat(&session->header, HEADER_BRANCH_STACK);
1204 
1205 	if (!rec->opts.full_auxtrace)
1206 		perf_header__clear_feat(&session->header, HEADER_AUXTRACE);
1207 
1208 	if (!(rec->opts.use_clockid && rec->opts.clockid_res_ns))
1209 		perf_header__clear_feat(&session->header, HEADER_CLOCKID);
1210 
1211 	if (!rec->opts.use_clockid)
1212 		perf_header__clear_feat(&session->header, HEADER_CLOCK_DATA);
1213 
1214 	perf_header__clear_feat(&session->header, HEADER_DIR_FORMAT);
1215 	if (!record__comp_enabled(rec))
1216 		perf_header__clear_feat(&session->header, HEADER_COMPRESSED);
1217 
1218 	perf_header__clear_feat(&session->header, HEADER_STAT);
1219 }
1220 
1221 static void
1222 record__finish_output(struct record *rec)
1223 {
1224 	struct perf_data *data = &rec->data;
1225 	int fd = perf_data__fd(data);
1226 
1227 	if (data->is_pipe)
1228 		return;
1229 
1230 	rec->session->header.data_size += rec->bytes_written;
1231 	data->file.size = lseek(perf_data__fd(data), 0, SEEK_CUR);
1232 
1233 	if (!rec->no_buildid) {
1234 		process_buildids(rec);
1235 
1236 		if (rec->buildid_all)
1237 			dsos__hit_all(rec->session);
1238 	}
1239 	perf_session__write_header(rec->session, rec->evlist, fd, true);
1240 
1241 	return;
1242 }
1243 
1244 static int record__synthesize_workload(struct record *rec, bool tail)
1245 {
1246 	int err;
1247 	struct perf_thread_map *thread_map;
1248 
1249 	if (rec->opts.tail_synthesize != tail)
1250 		return 0;
1251 
1252 	thread_map = thread_map__new_by_tid(rec->evlist->workload.pid);
1253 	if (thread_map == NULL)
1254 		return -1;
1255 
1256 	err = perf_event__synthesize_thread_map(&rec->tool, thread_map,
1257 						 process_synthesized_event,
1258 						 &rec->session->machines.host,
1259 						 rec->opts.sample_address);
1260 	perf_thread_map__put(thread_map);
1261 	return err;
1262 }
1263 
1264 static int record__synthesize(struct record *rec, bool tail);
1265 
1266 static int
1267 record__switch_output(struct record *rec, bool at_exit)
1268 {
1269 	struct perf_data *data = &rec->data;
1270 	int fd, err;
1271 	char *new_filename;
1272 
1273 	/* Same Size:      "2015122520103046"*/
1274 	char timestamp[] = "InvalidTimestamp";
1275 
1276 	record__aio_mmap_read_sync(rec);
1277 
1278 	record__synthesize(rec, true);
1279 	if (target__none(&rec->opts.target))
1280 		record__synthesize_workload(rec, true);
1281 
1282 	rec->samples = 0;
1283 	record__finish_output(rec);
1284 	err = fetch_current_timestamp(timestamp, sizeof(timestamp));
1285 	if (err) {
1286 		pr_err("Failed to get current timestamp\n");
1287 		return -EINVAL;
1288 	}
1289 
1290 	fd = perf_data__switch(data, timestamp,
1291 				    rec->session->header.data_offset,
1292 				    at_exit, &new_filename);
1293 	if (fd >= 0 && !at_exit) {
1294 		rec->bytes_written = 0;
1295 		rec->session->header.data_size = 0;
1296 	}
1297 
1298 	if (!quiet)
1299 		fprintf(stderr, "[ perf record: Dump %s.%s ]\n",
1300 			data->path, timestamp);
1301 
1302 	if (rec->switch_output.num_files) {
1303 		int n = rec->switch_output.cur_file + 1;
1304 
1305 		if (n >= rec->switch_output.num_files)
1306 			n = 0;
1307 		rec->switch_output.cur_file = n;
1308 		if (rec->switch_output.filenames[n]) {
1309 			remove(rec->switch_output.filenames[n]);
1310 			zfree(&rec->switch_output.filenames[n]);
1311 		}
1312 		rec->switch_output.filenames[n] = new_filename;
1313 	} else {
1314 		free(new_filename);
1315 	}
1316 
1317 	/* Output tracking events */
1318 	if (!at_exit) {
1319 		record__synthesize(rec, false);
1320 
1321 		/*
1322 		 * In 'perf record --switch-output' without -a,
1323 		 * record__synthesize() in record__switch_output() won't
1324 		 * generate tracking events because there's no thread_map
1325 		 * in evlist. Which causes newly created perf.data doesn't
1326 		 * contain map and comm information.
1327 		 * Create a fake thread_map and directly call
1328 		 * perf_event__synthesize_thread_map() for those events.
1329 		 */
1330 		if (target__none(&rec->opts.target))
1331 			record__synthesize_workload(rec, false);
1332 	}
1333 	return fd;
1334 }
1335 
1336 static volatile int workload_exec_errno;
1337 
1338 /*
1339  * evlist__prepare_workload will send a SIGUSR1
1340  * if the fork fails, since we asked by setting its
1341  * want_signal to true.
1342  */
1343 static void workload_exec_failed_signal(int signo __maybe_unused,
1344 					siginfo_t *info,
1345 					void *ucontext __maybe_unused)
1346 {
1347 	workload_exec_errno = info->si_value.sival_int;
1348 	done = 1;
1349 	child_finished = 1;
1350 }
1351 
1352 static void snapshot_sig_handler(int sig);
1353 static void alarm_sig_handler(int sig);
1354 
1355 static const struct perf_event_mmap_page *evlist__pick_pc(struct evlist *evlist)
1356 {
1357 	if (evlist) {
1358 		if (evlist->mmap && evlist->mmap[0].core.base)
1359 			return evlist->mmap[0].core.base;
1360 		if (evlist->overwrite_mmap && evlist->overwrite_mmap[0].core.base)
1361 			return evlist->overwrite_mmap[0].core.base;
1362 	}
1363 	return NULL;
1364 }
1365 
1366 static const struct perf_event_mmap_page *record__pick_pc(struct record *rec)
1367 {
1368 	const struct perf_event_mmap_page *pc = evlist__pick_pc(rec->evlist);
1369 	if (pc)
1370 		return pc;
1371 	return NULL;
1372 }
1373 
1374 static int record__synthesize(struct record *rec, bool tail)
1375 {
1376 	struct perf_session *session = rec->session;
1377 	struct machine *machine = &session->machines.host;
1378 	struct perf_data *data = &rec->data;
1379 	struct record_opts *opts = &rec->opts;
1380 	struct perf_tool *tool = &rec->tool;
1381 	int fd = perf_data__fd(data);
1382 	int err = 0;
1383 	event_op f = process_synthesized_event;
1384 
1385 	if (rec->opts.tail_synthesize != tail)
1386 		return 0;
1387 
1388 	if (data->is_pipe) {
1389 		/*
1390 		 * We need to synthesize events first, because some
1391 		 * features works on top of them (on report side).
1392 		 */
1393 		err = perf_event__synthesize_attrs(tool, rec->evlist,
1394 						   process_synthesized_event);
1395 		if (err < 0) {
1396 			pr_err("Couldn't synthesize attrs.\n");
1397 			goto out;
1398 		}
1399 
1400 		err = perf_event__synthesize_features(tool, session, rec->evlist,
1401 						      process_synthesized_event);
1402 		if (err < 0) {
1403 			pr_err("Couldn't synthesize features.\n");
1404 			return err;
1405 		}
1406 
1407 		if (have_tracepoints(&rec->evlist->core.entries)) {
1408 			/*
1409 			 * FIXME err <= 0 here actually means that
1410 			 * there were no tracepoints so its not really
1411 			 * an error, just that we don't need to
1412 			 * synthesize anything.  We really have to
1413 			 * return this more properly and also
1414 			 * propagate errors that now are calling die()
1415 			 */
1416 			err = perf_event__synthesize_tracing_data(tool,	fd, rec->evlist,
1417 								  process_synthesized_event);
1418 			if (err <= 0) {
1419 				pr_err("Couldn't record tracing data.\n");
1420 				goto out;
1421 			}
1422 			rec->bytes_written += err;
1423 		}
1424 	}
1425 
1426 	err = perf_event__synth_time_conv(record__pick_pc(rec), tool,
1427 					  process_synthesized_event, machine);
1428 	if (err)
1429 		goto out;
1430 
1431 	/* Synthesize id_index before auxtrace_info */
1432 	if (rec->opts.auxtrace_sample_mode) {
1433 		err = perf_event__synthesize_id_index(tool,
1434 						      process_synthesized_event,
1435 						      session->evlist, machine);
1436 		if (err)
1437 			goto out;
1438 	}
1439 
1440 	if (rec->opts.full_auxtrace) {
1441 		err = perf_event__synthesize_auxtrace_info(rec->itr, tool,
1442 					session, process_synthesized_event);
1443 		if (err)
1444 			goto out;
1445 	}
1446 
1447 	if (!evlist__exclude_kernel(rec->evlist)) {
1448 		err = perf_event__synthesize_kernel_mmap(tool, process_synthesized_event,
1449 							 machine);
1450 		WARN_ONCE(err < 0, "Couldn't record kernel reference relocation symbol\n"
1451 				   "Symbol resolution may be skewed if relocation was used (e.g. kexec).\n"
1452 				   "Check /proc/kallsyms permission or run as root.\n");
1453 
1454 		err = perf_event__synthesize_modules(tool, process_synthesized_event,
1455 						     machine);
1456 		WARN_ONCE(err < 0, "Couldn't record kernel module information.\n"
1457 				   "Symbol resolution may be skewed if relocation was used (e.g. kexec).\n"
1458 				   "Check /proc/modules permission or run as root.\n");
1459 	}
1460 
1461 	if (perf_guest) {
1462 		machines__process_guests(&session->machines,
1463 					 perf_event__synthesize_guest_os, tool);
1464 	}
1465 
1466 	err = perf_event__synthesize_extra_attr(&rec->tool,
1467 						rec->evlist,
1468 						process_synthesized_event,
1469 						data->is_pipe);
1470 	if (err)
1471 		goto out;
1472 
1473 	err = perf_event__synthesize_thread_map2(&rec->tool, rec->evlist->core.threads,
1474 						 process_synthesized_event,
1475 						NULL);
1476 	if (err < 0) {
1477 		pr_err("Couldn't synthesize thread map.\n");
1478 		return err;
1479 	}
1480 
1481 	err = perf_event__synthesize_cpu_map(&rec->tool, rec->evlist->core.cpus,
1482 					     process_synthesized_event, NULL);
1483 	if (err < 0) {
1484 		pr_err("Couldn't synthesize cpu map.\n");
1485 		return err;
1486 	}
1487 
1488 	err = perf_event__synthesize_bpf_events(session, process_synthesized_event,
1489 						machine, opts);
1490 	if (err < 0)
1491 		pr_warning("Couldn't synthesize bpf events.\n");
1492 
1493 	err = perf_event__synthesize_cgroups(tool, process_synthesized_event,
1494 					     machine);
1495 	if (err < 0)
1496 		pr_warning("Couldn't synthesize cgroup events.\n");
1497 
1498 	if (rec->opts.nr_threads_synthesize > 1) {
1499 		perf_set_multithreaded();
1500 		f = process_locked_synthesized_event;
1501 	}
1502 
1503 	err = __machine__synthesize_threads(machine, tool, &opts->target, rec->evlist->core.threads,
1504 					    f, opts->sample_address,
1505 					    rec->opts.nr_threads_synthesize);
1506 
1507 	if (rec->opts.nr_threads_synthesize > 1)
1508 		perf_set_singlethreaded();
1509 
1510 out:
1511 	return err;
1512 }
1513 
1514 static int record__process_signal_event(union perf_event *event __maybe_unused, void *data)
1515 {
1516 	struct record *rec = data;
1517 	pthread_kill(rec->thread_id, SIGUSR2);
1518 	return 0;
1519 }
1520 
1521 static int record__setup_sb_evlist(struct record *rec)
1522 {
1523 	struct record_opts *opts = &rec->opts;
1524 
1525 	if (rec->sb_evlist != NULL) {
1526 		/*
1527 		 * We get here if --switch-output-event populated the
1528 		 * sb_evlist, so associate a callback that will send a SIGUSR2
1529 		 * to the main thread.
1530 		 */
1531 		evlist__set_cb(rec->sb_evlist, record__process_signal_event, rec);
1532 		rec->thread_id = pthread_self();
1533 	}
1534 #ifdef HAVE_LIBBPF_SUPPORT
1535 	if (!opts->no_bpf_event) {
1536 		if (rec->sb_evlist == NULL) {
1537 			rec->sb_evlist = evlist__new();
1538 
1539 			if (rec->sb_evlist == NULL) {
1540 				pr_err("Couldn't create side band evlist.\n.");
1541 				return -1;
1542 			}
1543 		}
1544 
1545 		if (evlist__add_bpf_sb_event(rec->sb_evlist, &rec->session->header.env)) {
1546 			pr_err("Couldn't ask for PERF_RECORD_BPF_EVENT side band events.\n.");
1547 			return -1;
1548 		}
1549 	}
1550 #endif
1551 	if (evlist__start_sb_thread(rec->sb_evlist, &rec->opts.target)) {
1552 		pr_debug("Couldn't start the BPF side band thread:\nBPF programs starting from now on won't be annotatable\n");
1553 		opts->no_bpf_event = true;
1554 	}
1555 
1556 	return 0;
1557 }
1558 
1559 static int record__init_clock(struct record *rec)
1560 {
1561 	struct perf_session *session = rec->session;
1562 	struct timespec ref_clockid;
1563 	struct timeval ref_tod;
1564 	u64 ref;
1565 
1566 	if (!rec->opts.use_clockid)
1567 		return 0;
1568 
1569 	if (rec->opts.use_clockid && rec->opts.clockid_res_ns)
1570 		session->header.env.clock.clockid_res_ns = rec->opts.clockid_res_ns;
1571 
1572 	session->header.env.clock.clockid = rec->opts.clockid;
1573 
1574 	if (gettimeofday(&ref_tod, NULL) != 0) {
1575 		pr_err("gettimeofday failed, cannot set reference time.\n");
1576 		return -1;
1577 	}
1578 
1579 	if (clock_gettime(rec->opts.clockid, &ref_clockid)) {
1580 		pr_err("clock_gettime failed, cannot set reference time.\n");
1581 		return -1;
1582 	}
1583 
1584 	ref = (u64) ref_tod.tv_sec * NSEC_PER_SEC +
1585 	      (u64) ref_tod.tv_usec * NSEC_PER_USEC;
1586 
1587 	session->header.env.clock.tod_ns = ref;
1588 
1589 	ref = (u64) ref_clockid.tv_sec * NSEC_PER_SEC +
1590 	      (u64) ref_clockid.tv_nsec;
1591 
1592 	session->header.env.clock.clockid_ns = ref;
1593 	return 0;
1594 }
1595 
1596 static void hit_auxtrace_snapshot_trigger(struct record *rec)
1597 {
1598 	if (trigger_is_ready(&auxtrace_snapshot_trigger)) {
1599 		trigger_hit(&auxtrace_snapshot_trigger);
1600 		auxtrace_record__snapshot_started = 1;
1601 		if (auxtrace_record__snapshot_start(rec->itr))
1602 			trigger_error(&auxtrace_snapshot_trigger);
1603 	}
1604 }
1605 
1606 static int __cmd_record(struct record *rec, int argc, const char **argv)
1607 {
1608 	int err;
1609 	int status = 0;
1610 	unsigned long waking = 0;
1611 	const bool forks = argc > 0;
1612 	struct perf_tool *tool = &rec->tool;
1613 	struct record_opts *opts = &rec->opts;
1614 	struct perf_data *data = &rec->data;
1615 	struct perf_session *session;
1616 	bool disabled = false, draining = false;
1617 	int fd;
1618 	float ratio = 0;
1619 	enum evlist_ctl_cmd cmd = EVLIST_CTL_CMD_UNSUPPORTED;
1620 
1621 	atexit(record__sig_exit);
1622 	signal(SIGCHLD, sig_handler);
1623 	signal(SIGINT, sig_handler);
1624 	signal(SIGTERM, sig_handler);
1625 	signal(SIGSEGV, sigsegv_handler);
1626 
1627 	if (rec->opts.record_namespaces)
1628 		tool->namespace_events = true;
1629 
1630 	if (rec->opts.record_cgroup) {
1631 #ifdef HAVE_FILE_HANDLE
1632 		tool->cgroup_events = true;
1633 #else
1634 		pr_err("cgroup tracking is not supported\n");
1635 		return -1;
1636 #endif
1637 	}
1638 
1639 	if (rec->opts.auxtrace_snapshot_mode || rec->switch_output.enabled) {
1640 		signal(SIGUSR2, snapshot_sig_handler);
1641 		if (rec->opts.auxtrace_snapshot_mode)
1642 			trigger_on(&auxtrace_snapshot_trigger);
1643 		if (rec->switch_output.enabled)
1644 			trigger_on(&switch_output_trigger);
1645 	} else {
1646 		signal(SIGUSR2, SIG_IGN);
1647 	}
1648 
1649 	session = perf_session__new(data, false, tool);
1650 	if (IS_ERR(session)) {
1651 		pr_err("Perf session creation failed.\n");
1652 		return PTR_ERR(session);
1653 	}
1654 
1655 	fd = perf_data__fd(data);
1656 	rec->session = session;
1657 
1658 	if (zstd_init(&session->zstd_data, rec->opts.comp_level) < 0) {
1659 		pr_err("Compression initialization failed.\n");
1660 		return -1;
1661 	}
1662 #ifdef HAVE_EVENTFD_SUPPORT
1663 	done_fd = eventfd(0, EFD_NONBLOCK);
1664 	if (done_fd < 0) {
1665 		pr_err("Failed to create wakeup eventfd, error: %m\n");
1666 		status = -1;
1667 		goto out_delete_session;
1668 	}
1669 	err = evlist__add_wakeup_eventfd(rec->evlist, done_fd);
1670 	if (err < 0) {
1671 		pr_err("Failed to add wakeup eventfd to poll list\n");
1672 		status = err;
1673 		goto out_delete_session;
1674 	}
1675 #endif // HAVE_EVENTFD_SUPPORT
1676 
1677 	session->header.env.comp_type  = PERF_COMP_ZSTD;
1678 	session->header.env.comp_level = rec->opts.comp_level;
1679 
1680 	if (rec->opts.kcore &&
1681 	    !record__kcore_readable(&session->machines.host)) {
1682 		pr_err("ERROR: kcore is not readable.\n");
1683 		return -1;
1684 	}
1685 
1686 	if (record__init_clock(rec))
1687 		return -1;
1688 
1689 	record__init_features(rec);
1690 
1691 	if (forks) {
1692 		err = evlist__prepare_workload(rec->evlist, &opts->target, argv, data->is_pipe,
1693 					       workload_exec_failed_signal);
1694 		if (err < 0) {
1695 			pr_err("Couldn't run the workload!\n");
1696 			status = err;
1697 			goto out_delete_session;
1698 		}
1699 	}
1700 
1701 	/*
1702 	 * If we have just single event and are sending data
1703 	 * through pipe, we need to force the ids allocation,
1704 	 * because we synthesize event name through the pipe
1705 	 * and need the id for that.
1706 	 */
1707 	if (data->is_pipe && rec->evlist->core.nr_entries == 1)
1708 		rec->opts.sample_id = true;
1709 
1710 	if (record__open(rec) != 0) {
1711 		err = -1;
1712 		goto out_child;
1713 	}
1714 	session->header.env.comp_mmap_len = session->evlist->core.mmap_len;
1715 
1716 	if (rec->opts.kcore) {
1717 		err = record__kcore_copy(&session->machines.host, data);
1718 		if (err) {
1719 			pr_err("ERROR: Failed to copy kcore\n");
1720 			goto out_child;
1721 		}
1722 	}
1723 
1724 	err = bpf__apply_obj_config();
1725 	if (err) {
1726 		char errbuf[BUFSIZ];
1727 
1728 		bpf__strerror_apply_obj_config(err, errbuf, sizeof(errbuf));
1729 		pr_err("ERROR: Apply config to BPF failed: %s\n",
1730 			 errbuf);
1731 		goto out_child;
1732 	}
1733 
1734 	/*
1735 	 * Normally perf_session__new would do this, but it doesn't have the
1736 	 * evlist.
1737 	 */
1738 	if (rec->tool.ordered_events && !evlist__sample_id_all(rec->evlist)) {
1739 		pr_warning("WARNING: No sample_id_all support, falling back to unordered processing\n");
1740 		rec->tool.ordered_events = false;
1741 	}
1742 
1743 	if (!rec->evlist->nr_groups)
1744 		perf_header__clear_feat(&session->header, HEADER_GROUP_DESC);
1745 
1746 	if (data->is_pipe) {
1747 		err = perf_header__write_pipe(fd);
1748 		if (err < 0)
1749 			goto out_child;
1750 	} else {
1751 		err = perf_session__write_header(session, rec->evlist, fd, false);
1752 		if (err < 0)
1753 			goto out_child;
1754 	}
1755 
1756 	err = -1;
1757 	if (!rec->no_buildid
1758 	    && !perf_header__has_feat(&session->header, HEADER_BUILD_ID)) {
1759 		pr_err("Couldn't generate buildids. "
1760 		       "Use --no-buildid to profile anyway.\n");
1761 		goto out_child;
1762 	}
1763 
1764 	err = record__setup_sb_evlist(rec);
1765 	if (err)
1766 		goto out_child;
1767 
1768 	err = record__synthesize(rec, false);
1769 	if (err < 0)
1770 		goto out_child;
1771 
1772 	if (rec->realtime_prio) {
1773 		struct sched_param param;
1774 
1775 		param.sched_priority = rec->realtime_prio;
1776 		if (sched_setscheduler(0, SCHED_FIFO, &param)) {
1777 			pr_err("Could not set realtime priority.\n");
1778 			err = -1;
1779 			goto out_child;
1780 		}
1781 	}
1782 
1783 	/*
1784 	 * When perf is starting the traced process, all the events
1785 	 * (apart from group members) have enable_on_exec=1 set,
1786 	 * so don't spoil it by prematurely enabling them.
1787 	 */
1788 	if (!target__none(&opts->target) && !opts->initial_delay)
1789 		evlist__enable(rec->evlist);
1790 
1791 	/*
1792 	 * Let the child rip
1793 	 */
1794 	if (forks) {
1795 		struct machine *machine = &session->machines.host;
1796 		union perf_event *event;
1797 		pid_t tgid;
1798 
1799 		event = malloc(sizeof(event->comm) + machine->id_hdr_size);
1800 		if (event == NULL) {
1801 			err = -ENOMEM;
1802 			goto out_child;
1803 		}
1804 
1805 		/*
1806 		 * Some H/W events are generated before COMM event
1807 		 * which is emitted during exec(), so perf script
1808 		 * cannot see a correct process name for those events.
1809 		 * Synthesize COMM event to prevent it.
1810 		 */
1811 		tgid = perf_event__synthesize_comm(tool, event,
1812 						   rec->evlist->workload.pid,
1813 						   process_synthesized_event,
1814 						   machine);
1815 		free(event);
1816 
1817 		if (tgid == -1)
1818 			goto out_child;
1819 
1820 		event = malloc(sizeof(event->namespaces) +
1821 			       (NR_NAMESPACES * sizeof(struct perf_ns_link_info)) +
1822 			       machine->id_hdr_size);
1823 		if (event == NULL) {
1824 			err = -ENOMEM;
1825 			goto out_child;
1826 		}
1827 
1828 		/*
1829 		 * Synthesize NAMESPACES event for the command specified.
1830 		 */
1831 		perf_event__synthesize_namespaces(tool, event,
1832 						  rec->evlist->workload.pid,
1833 						  tgid, process_synthesized_event,
1834 						  machine);
1835 		free(event);
1836 
1837 		evlist__start_workload(rec->evlist);
1838 	}
1839 
1840 	if (evlist__initialize_ctlfd(rec->evlist, opts->ctl_fd, opts->ctl_fd_ack))
1841 		goto out_child;
1842 
1843 	if (opts->initial_delay) {
1844 		pr_info(EVLIST_DISABLED_MSG);
1845 		if (opts->initial_delay > 0) {
1846 			usleep(opts->initial_delay * USEC_PER_MSEC);
1847 			evlist__enable(rec->evlist);
1848 			pr_info(EVLIST_ENABLED_MSG);
1849 		}
1850 	}
1851 
1852 	trigger_ready(&auxtrace_snapshot_trigger);
1853 	trigger_ready(&switch_output_trigger);
1854 	perf_hooks__invoke_record_start();
1855 	for (;;) {
1856 		unsigned long long hits = rec->samples;
1857 
1858 		/*
1859 		 * rec->evlist->bkw_mmap_state is possible to be
1860 		 * BKW_MMAP_EMPTY here: when done == true and
1861 		 * hits != rec->samples in previous round.
1862 		 *
1863 		 * evlist__toggle_bkw_mmap ensure we never
1864 		 * convert BKW_MMAP_EMPTY to BKW_MMAP_DATA_PENDING.
1865 		 */
1866 		if (trigger_is_hit(&switch_output_trigger) || done || draining)
1867 			evlist__toggle_bkw_mmap(rec->evlist, BKW_MMAP_DATA_PENDING);
1868 
1869 		if (record__mmap_read_all(rec, false) < 0) {
1870 			trigger_error(&auxtrace_snapshot_trigger);
1871 			trigger_error(&switch_output_trigger);
1872 			err = -1;
1873 			goto out_child;
1874 		}
1875 
1876 		if (auxtrace_record__snapshot_started) {
1877 			auxtrace_record__snapshot_started = 0;
1878 			if (!trigger_is_error(&auxtrace_snapshot_trigger))
1879 				record__read_auxtrace_snapshot(rec, false);
1880 			if (trigger_is_error(&auxtrace_snapshot_trigger)) {
1881 				pr_err("AUX area tracing snapshot failed\n");
1882 				err = -1;
1883 				goto out_child;
1884 			}
1885 		}
1886 
1887 		if (trigger_is_hit(&switch_output_trigger)) {
1888 			/*
1889 			 * If switch_output_trigger is hit, the data in
1890 			 * overwritable ring buffer should have been collected,
1891 			 * so bkw_mmap_state should be set to BKW_MMAP_EMPTY.
1892 			 *
1893 			 * If SIGUSR2 raise after or during record__mmap_read_all(),
1894 			 * record__mmap_read_all() didn't collect data from
1895 			 * overwritable ring buffer. Read again.
1896 			 */
1897 			if (rec->evlist->bkw_mmap_state == BKW_MMAP_RUNNING)
1898 				continue;
1899 			trigger_ready(&switch_output_trigger);
1900 
1901 			/*
1902 			 * Reenable events in overwrite ring buffer after
1903 			 * record__mmap_read_all(): we should have collected
1904 			 * data from it.
1905 			 */
1906 			evlist__toggle_bkw_mmap(rec->evlist, BKW_MMAP_RUNNING);
1907 
1908 			if (!quiet)
1909 				fprintf(stderr, "[ perf record: dump data: Woken up %ld times ]\n",
1910 					waking);
1911 			waking = 0;
1912 			fd = record__switch_output(rec, false);
1913 			if (fd < 0) {
1914 				pr_err("Failed to switch to new file\n");
1915 				trigger_error(&switch_output_trigger);
1916 				err = fd;
1917 				goto out_child;
1918 			}
1919 
1920 			/* re-arm the alarm */
1921 			if (rec->switch_output.time)
1922 				alarm(rec->switch_output.time);
1923 		}
1924 
1925 		if (hits == rec->samples) {
1926 			if (done || draining)
1927 				break;
1928 			err = evlist__poll(rec->evlist, -1);
1929 			/*
1930 			 * Propagate error, only if there's any. Ignore positive
1931 			 * number of returned events and interrupt error.
1932 			 */
1933 			if (err > 0 || (err < 0 && errno == EINTR))
1934 				err = 0;
1935 			waking++;
1936 
1937 			if (evlist__filter_pollfd(rec->evlist, POLLERR | POLLHUP) == 0)
1938 				draining = true;
1939 		}
1940 
1941 		if (evlist__ctlfd_process(rec->evlist, &cmd) > 0) {
1942 			switch (cmd) {
1943 			case EVLIST_CTL_CMD_SNAPSHOT:
1944 				hit_auxtrace_snapshot_trigger(rec);
1945 				evlist__ctlfd_ack(rec->evlist);
1946 				break;
1947 			case EVLIST_CTL_CMD_STOP:
1948 				done = 1;
1949 				break;
1950 			case EVLIST_CTL_CMD_ACK:
1951 			case EVLIST_CTL_CMD_UNSUPPORTED:
1952 			case EVLIST_CTL_CMD_ENABLE:
1953 			case EVLIST_CTL_CMD_DISABLE:
1954 			case EVLIST_CTL_CMD_EVLIST:
1955 			case EVLIST_CTL_CMD_PING:
1956 			default:
1957 				break;
1958 			}
1959 		}
1960 
1961 		/*
1962 		 * When perf is starting the traced process, at the end events
1963 		 * die with the process and we wait for that. Thus no need to
1964 		 * disable events in this case.
1965 		 */
1966 		if (done && !disabled && !target__none(&opts->target)) {
1967 			trigger_off(&auxtrace_snapshot_trigger);
1968 			evlist__disable(rec->evlist);
1969 			disabled = true;
1970 		}
1971 	}
1972 
1973 	trigger_off(&auxtrace_snapshot_trigger);
1974 	trigger_off(&switch_output_trigger);
1975 
1976 	if (opts->auxtrace_snapshot_on_exit)
1977 		record__auxtrace_snapshot_exit(rec);
1978 
1979 	if (forks && workload_exec_errno) {
1980 		char msg[STRERR_BUFSIZE];
1981 		const char *emsg = str_error_r(workload_exec_errno, msg, sizeof(msg));
1982 		pr_err("Workload failed: %s\n", emsg);
1983 		err = -1;
1984 		goto out_child;
1985 	}
1986 
1987 	if (!quiet)
1988 		fprintf(stderr, "[ perf record: Woken up %ld times to write data ]\n", waking);
1989 
1990 	if (target__none(&rec->opts.target))
1991 		record__synthesize_workload(rec, true);
1992 
1993 out_child:
1994 	evlist__finalize_ctlfd(rec->evlist);
1995 	record__mmap_read_all(rec, true);
1996 	record__aio_mmap_read_sync(rec);
1997 
1998 	if (rec->session->bytes_transferred && rec->session->bytes_compressed) {
1999 		ratio = (float)rec->session->bytes_transferred/(float)rec->session->bytes_compressed;
2000 		session->header.env.comp_ratio = ratio + 0.5;
2001 	}
2002 
2003 	if (forks) {
2004 		int exit_status;
2005 
2006 		if (!child_finished)
2007 			kill(rec->evlist->workload.pid, SIGTERM);
2008 
2009 		wait(&exit_status);
2010 
2011 		if (err < 0)
2012 			status = err;
2013 		else if (WIFEXITED(exit_status))
2014 			status = WEXITSTATUS(exit_status);
2015 		else if (WIFSIGNALED(exit_status))
2016 			signr = WTERMSIG(exit_status);
2017 	} else
2018 		status = err;
2019 
2020 	record__synthesize(rec, true);
2021 	/* this will be recalculated during process_buildids() */
2022 	rec->samples = 0;
2023 
2024 	if (!err) {
2025 		if (!rec->timestamp_filename) {
2026 			record__finish_output(rec);
2027 		} else {
2028 			fd = record__switch_output(rec, true);
2029 			if (fd < 0) {
2030 				status = fd;
2031 				goto out_delete_session;
2032 			}
2033 		}
2034 	}
2035 
2036 	perf_hooks__invoke_record_end();
2037 
2038 	if (!err && !quiet) {
2039 		char samples[128];
2040 		const char *postfix = rec->timestamp_filename ?
2041 					".<timestamp>" : "";
2042 
2043 		if (rec->samples && !rec->opts.full_auxtrace)
2044 			scnprintf(samples, sizeof(samples),
2045 				  " (%" PRIu64 " samples)", rec->samples);
2046 		else
2047 			samples[0] = '\0';
2048 
2049 		fprintf(stderr,	"[ perf record: Captured and wrote %.3f MB %s%s%s",
2050 			perf_data__size(data) / 1024.0 / 1024.0,
2051 			data->path, postfix, samples);
2052 		if (ratio) {
2053 			fprintf(stderr,	", compressed (original %.3f MB, ratio is %.3f)",
2054 					rec->session->bytes_transferred / 1024.0 / 1024.0,
2055 					ratio);
2056 		}
2057 		fprintf(stderr, " ]\n");
2058 	}
2059 
2060 out_delete_session:
2061 #ifdef HAVE_EVENTFD_SUPPORT
2062 	if (done_fd >= 0)
2063 		close(done_fd);
2064 #endif
2065 	zstd_fini(&session->zstd_data);
2066 	perf_session__delete(session);
2067 
2068 	if (!opts->no_bpf_event)
2069 		evlist__stop_sb_thread(rec->sb_evlist);
2070 	return status;
2071 }
2072 
2073 static void callchain_debug(struct callchain_param *callchain)
2074 {
2075 	static const char *str[CALLCHAIN_MAX] = { "NONE", "FP", "DWARF", "LBR" };
2076 
2077 	pr_debug("callchain: type %s\n", str[callchain->record_mode]);
2078 
2079 	if (callchain->record_mode == CALLCHAIN_DWARF)
2080 		pr_debug("callchain: stack dump size %d\n",
2081 			 callchain->dump_size);
2082 }
2083 
2084 int record_opts__parse_callchain(struct record_opts *record,
2085 				 struct callchain_param *callchain,
2086 				 const char *arg, bool unset)
2087 {
2088 	int ret;
2089 	callchain->enabled = !unset;
2090 
2091 	/* --no-call-graph */
2092 	if (unset) {
2093 		callchain->record_mode = CALLCHAIN_NONE;
2094 		pr_debug("callchain: disabled\n");
2095 		return 0;
2096 	}
2097 
2098 	ret = parse_callchain_record_opt(arg, callchain);
2099 	if (!ret) {
2100 		/* Enable data address sampling for DWARF unwind. */
2101 		if (callchain->record_mode == CALLCHAIN_DWARF)
2102 			record->sample_address = true;
2103 		callchain_debug(callchain);
2104 	}
2105 
2106 	return ret;
2107 }
2108 
2109 int record_parse_callchain_opt(const struct option *opt,
2110 			       const char *arg,
2111 			       int unset)
2112 {
2113 	return record_opts__parse_callchain(opt->value, &callchain_param, arg, unset);
2114 }
2115 
2116 int record_callchain_opt(const struct option *opt,
2117 			 const char *arg __maybe_unused,
2118 			 int unset __maybe_unused)
2119 {
2120 	struct callchain_param *callchain = opt->value;
2121 
2122 	callchain->enabled = true;
2123 
2124 	if (callchain->record_mode == CALLCHAIN_NONE)
2125 		callchain->record_mode = CALLCHAIN_FP;
2126 
2127 	callchain_debug(callchain);
2128 	return 0;
2129 }
2130 
2131 static int perf_record_config(const char *var, const char *value, void *cb)
2132 {
2133 	struct record *rec = cb;
2134 
2135 	if (!strcmp(var, "record.build-id")) {
2136 		if (!strcmp(value, "cache"))
2137 			rec->no_buildid_cache = false;
2138 		else if (!strcmp(value, "no-cache"))
2139 			rec->no_buildid_cache = true;
2140 		else if (!strcmp(value, "skip"))
2141 			rec->no_buildid = true;
2142 		else if (!strcmp(value, "mmap"))
2143 			rec->buildid_mmap = true;
2144 		else
2145 			return -1;
2146 		return 0;
2147 	}
2148 	if (!strcmp(var, "record.call-graph")) {
2149 		var = "call-graph.record-mode";
2150 		return perf_default_config(var, value, cb);
2151 	}
2152 #ifdef HAVE_AIO_SUPPORT
2153 	if (!strcmp(var, "record.aio")) {
2154 		rec->opts.nr_cblocks = strtol(value, NULL, 0);
2155 		if (!rec->opts.nr_cblocks)
2156 			rec->opts.nr_cblocks = nr_cblocks_default;
2157 	}
2158 #endif
2159 
2160 	return 0;
2161 }
2162 
2163 
2164 static int record__parse_affinity(const struct option *opt, const char *str, int unset)
2165 {
2166 	struct record_opts *opts = (struct record_opts *)opt->value;
2167 
2168 	if (unset || !str)
2169 		return 0;
2170 
2171 	if (!strcasecmp(str, "node"))
2172 		opts->affinity = PERF_AFFINITY_NODE;
2173 	else if (!strcasecmp(str, "cpu"))
2174 		opts->affinity = PERF_AFFINITY_CPU;
2175 
2176 	return 0;
2177 }
2178 
2179 static int parse_output_max_size(const struct option *opt,
2180 				 const char *str, int unset)
2181 {
2182 	unsigned long *s = (unsigned long *)opt->value;
2183 	static struct parse_tag tags_size[] = {
2184 		{ .tag  = 'B', .mult = 1       },
2185 		{ .tag  = 'K', .mult = 1 << 10 },
2186 		{ .tag  = 'M', .mult = 1 << 20 },
2187 		{ .tag  = 'G', .mult = 1 << 30 },
2188 		{ .tag  = 0 },
2189 	};
2190 	unsigned long val;
2191 
2192 	if (unset) {
2193 		*s = 0;
2194 		return 0;
2195 	}
2196 
2197 	val = parse_tag_value(str, tags_size);
2198 	if (val != (unsigned long) -1) {
2199 		*s = val;
2200 		return 0;
2201 	}
2202 
2203 	return -1;
2204 }
2205 
2206 static int record__parse_mmap_pages(const struct option *opt,
2207 				    const char *str,
2208 				    int unset __maybe_unused)
2209 {
2210 	struct record_opts *opts = opt->value;
2211 	char *s, *p;
2212 	unsigned int mmap_pages;
2213 	int ret;
2214 
2215 	if (!str)
2216 		return -EINVAL;
2217 
2218 	s = strdup(str);
2219 	if (!s)
2220 		return -ENOMEM;
2221 
2222 	p = strchr(s, ',');
2223 	if (p)
2224 		*p = '\0';
2225 
2226 	if (*s) {
2227 		ret = __evlist__parse_mmap_pages(&mmap_pages, s);
2228 		if (ret)
2229 			goto out_free;
2230 		opts->mmap_pages = mmap_pages;
2231 	}
2232 
2233 	if (!p) {
2234 		ret = 0;
2235 		goto out_free;
2236 	}
2237 
2238 	ret = __evlist__parse_mmap_pages(&mmap_pages, p + 1);
2239 	if (ret)
2240 		goto out_free;
2241 
2242 	opts->auxtrace_mmap_pages = mmap_pages;
2243 
2244 out_free:
2245 	free(s);
2246 	return ret;
2247 }
2248 
2249 static int parse_control_option(const struct option *opt,
2250 				const char *str,
2251 				int unset __maybe_unused)
2252 {
2253 	struct record_opts *opts = opt->value;
2254 
2255 	return evlist__parse_control(str, &opts->ctl_fd, &opts->ctl_fd_ack, &opts->ctl_fd_close);
2256 }
2257 
2258 static void switch_output_size_warn(struct record *rec)
2259 {
2260 	u64 wakeup_size = evlist__mmap_size(rec->opts.mmap_pages);
2261 	struct switch_output *s = &rec->switch_output;
2262 
2263 	wakeup_size /= 2;
2264 
2265 	if (s->size < wakeup_size) {
2266 		char buf[100];
2267 
2268 		unit_number__scnprintf(buf, sizeof(buf), wakeup_size);
2269 		pr_warning("WARNING: switch-output data size lower than "
2270 			   "wakeup kernel buffer size (%s) "
2271 			   "expect bigger perf.data sizes\n", buf);
2272 	}
2273 }
2274 
2275 static int switch_output_setup(struct record *rec)
2276 {
2277 	struct switch_output *s = &rec->switch_output;
2278 	static struct parse_tag tags_size[] = {
2279 		{ .tag  = 'B', .mult = 1       },
2280 		{ .tag  = 'K', .mult = 1 << 10 },
2281 		{ .tag  = 'M', .mult = 1 << 20 },
2282 		{ .tag  = 'G', .mult = 1 << 30 },
2283 		{ .tag  = 0 },
2284 	};
2285 	static struct parse_tag tags_time[] = {
2286 		{ .tag  = 's', .mult = 1        },
2287 		{ .tag  = 'm', .mult = 60       },
2288 		{ .tag  = 'h', .mult = 60*60    },
2289 		{ .tag  = 'd', .mult = 60*60*24 },
2290 		{ .tag  = 0 },
2291 	};
2292 	unsigned long val;
2293 
2294 	/*
2295 	 * If we're using --switch-output-events, then we imply its
2296 	 * --switch-output=signal, as we'll send a SIGUSR2 from the side band
2297 	 *  thread to its parent.
2298 	 */
2299 	if (rec->switch_output_event_set)
2300 		goto do_signal;
2301 
2302 	if (!s->set)
2303 		return 0;
2304 
2305 	if (!strcmp(s->str, "signal")) {
2306 do_signal:
2307 		s->signal = true;
2308 		pr_debug("switch-output with SIGUSR2 signal\n");
2309 		goto enabled;
2310 	}
2311 
2312 	val = parse_tag_value(s->str, tags_size);
2313 	if (val != (unsigned long) -1) {
2314 		s->size = val;
2315 		pr_debug("switch-output with %s size threshold\n", s->str);
2316 		goto enabled;
2317 	}
2318 
2319 	val = parse_tag_value(s->str, tags_time);
2320 	if (val != (unsigned long) -1) {
2321 		s->time = val;
2322 		pr_debug("switch-output with %s time threshold (%lu seconds)\n",
2323 			 s->str, s->time);
2324 		goto enabled;
2325 	}
2326 
2327 	return -1;
2328 
2329 enabled:
2330 	rec->timestamp_filename = true;
2331 	s->enabled              = true;
2332 
2333 	if (s->size && !rec->opts.no_buffering)
2334 		switch_output_size_warn(rec);
2335 
2336 	return 0;
2337 }
2338 
/* NULL-terminated usage synopsis of 'perf record'. */
static const char * const __record_usage[] = {
	"perf record [<options>] [<command>]",
	"perf record [<options>] -- <command> [<options>]",
	NULL,
};

/* Exported handle on the usage lines above. */
const char * const *record_usage = __record_usage;
2345 
2346 static int build_id__process_mmap(struct perf_tool *tool, union perf_event *event,
2347 				  struct perf_sample *sample, struct machine *machine)
2348 {
2349 	/*
2350 	 * We already have the kernel maps, put in place via perf_session__create_kernel_maps()
2351 	 * no need to add them twice.
2352 	 */
2353 	if (!(event->header.misc & PERF_RECORD_MISC_USER))
2354 		return 0;
2355 	return perf_event__process_mmap(tool, event, sample, machine);
2356 }
2357 
2358 static int build_id__process_mmap2(struct perf_tool *tool, union perf_event *event,
2359 				   struct perf_sample *sample, struct machine *machine)
2360 {
2361 	/*
2362 	 * We already have the kernel maps, put in place via perf_session__create_kernel_maps()
2363 	 * no need to add them twice.
2364 	 */
2365 	if (!(event->header.misc & PERF_RECORD_MISC_USER))
2366 		return 0;
2367 
2368 	return perf_event__process_mmap2(tool, event, sample, machine);
2369 }
2370 
2371 /*
2372  * XXX Ideally would be local to cmd_record() and passed to a record__new
2373  * because we need to have access to it in record__exit, that is called
2374  * after cmd_record() exits, but since record_options need to be accessible to
2375  * builtin-script, leave it here.
2376  *
2377  * At least we don't ouch it in all the other functions here directly.
2378  *
2379  * Just say no to tons of global variables, sigh.
2380  */
2381 static struct record record = {
2382 	.opts = {
2383 		.sample_time	     = true,
2384 		.mmap_pages	     = UINT_MAX,
2385 		.user_freq	     = UINT_MAX,
2386 		.user_interval	     = ULLONG_MAX,
2387 		.freq		     = 4000,
2388 		.target		     = {
2389 			.uses_mmap   = true,
2390 			.default_per_cpu = true,
2391 		},
2392 		.mmap_flush          = MMAP_FLUSH_DEFAULT,
2393 		.nr_threads_synthesize = 1,
2394 		.ctl_fd              = -1,
2395 		.ctl_fd_ack          = -1,
2396 	},
2397 	.tool = {
2398 		.sample		= process_sample_event,
2399 		.fork		= perf_event__process_fork,
2400 		.exit		= perf_event__process_exit,
2401 		.comm		= perf_event__process_comm,
2402 		.namespaces	= perf_event__process_namespaces,
2403 		.mmap		= build_id__process_mmap,
2404 		.mmap2		= build_id__process_mmap2,
2405 		.ordered_events	= true,
2406 	},
2407 };
2408 
2409 const char record_callchain_help[] = CALLCHAIN_RECORD_HELP
2410 	"\n\t\t\t\tDefault: fp";
2411 
2412 static bool dry_run;
2413 
2414 /*
2415  * XXX Will stay a global variable till we fix builtin-script.c to stop messing
2416  * with it and switch to use the library functions in perf_evlist that came
2417  * from builtin-record.c, i.e. use record_opts,
2418  * evlist__prepare_workload, etc instead of fork+exec'in 'perf record',
2419  * using pipes, etc.
2420  */
2421 static struct option __record_options[] = {
2422 	OPT_CALLBACK('e', "event", &record.evlist, "event",
2423 		     "event selector. use 'perf list' to list available events",
2424 		     parse_events_option),
2425 	OPT_CALLBACK(0, "filter", &record.evlist, "filter",
2426 		     "event filter", parse_filter),
2427 	OPT_CALLBACK_NOOPT(0, "exclude-perf", &record.evlist,
2428 			   NULL, "don't record events from perf itself",
2429 			   exclude_perf),
2430 	OPT_STRING('p', "pid", &record.opts.target.pid, "pid",
2431 		    "record events on existing process id"),
2432 	OPT_STRING('t', "tid", &record.opts.target.tid, "tid",
2433 		    "record events on existing thread id"),
2434 	OPT_INTEGER('r', "realtime", &record.realtime_prio,
2435 		    "collect data with this RT SCHED_FIFO priority"),
2436 	OPT_BOOLEAN(0, "no-buffering", &record.opts.no_buffering,
2437 		    "collect data without buffering"),
2438 	OPT_BOOLEAN('R', "raw-samples", &record.opts.raw_samples,
2439 		    "collect raw sample records from all opened counters"),
2440 	OPT_BOOLEAN('a', "all-cpus", &record.opts.target.system_wide,
2441 			    "system-wide collection from all CPUs"),
2442 	OPT_STRING('C', "cpu", &record.opts.target.cpu_list, "cpu",
2443 		    "list of cpus to monitor"),
2444 	OPT_U64('c', "count", &record.opts.user_interval, "event period to sample"),
2445 	OPT_STRING('o', "output", &record.data.path, "file",
2446 		    "output file name"),
2447 	OPT_BOOLEAN_SET('i', "no-inherit", &record.opts.no_inherit,
2448 			&record.opts.no_inherit_set,
2449 			"child tasks do not inherit counters"),
2450 	OPT_BOOLEAN(0, "tail-synthesize", &record.opts.tail_synthesize,
2451 		    "synthesize non-sample events at the end of output"),
2452 	OPT_BOOLEAN(0, "overwrite", &record.opts.overwrite, "use overwrite mode"),
2453 	OPT_BOOLEAN(0, "no-bpf-event", &record.opts.no_bpf_event, "do not record bpf events"),
2454 	OPT_BOOLEAN(0, "strict-freq", &record.opts.strict_freq,
2455 		    "Fail if the specified frequency can't be used"),
2456 	OPT_CALLBACK('F', "freq", &record.opts, "freq or 'max'",
2457 		     "profile at this frequency",
2458 		      record__parse_freq),
2459 	OPT_CALLBACK('m', "mmap-pages", &record.opts, "pages[,pages]",
2460 		     "number of mmap data pages and AUX area tracing mmap pages",
2461 		     record__parse_mmap_pages),
2462 	OPT_CALLBACK(0, "mmap-flush", &record.opts, "number",
2463 		     "Minimal number of bytes that is extracted from mmap data pages (default: 1)",
2464 		     record__mmap_flush_parse),
2465 	OPT_BOOLEAN(0, "group", &record.opts.group,
2466 		    "put the counters into a counter group"),
2467 	OPT_CALLBACK_NOOPT('g', NULL, &callchain_param,
2468 			   NULL, "enables call-graph recording" ,
2469 			   &record_callchain_opt),
2470 	OPT_CALLBACK(0, "call-graph", &record.opts,
2471 		     "record_mode[,record_size]", record_callchain_help,
2472 		     &record_parse_callchain_opt),
2473 	OPT_INCR('v', "verbose", &verbose,
2474 		    "be more verbose (show counter open errors, etc)"),
2475 	OPT_BOOLEAN('q', "quiet", &quiet, "don't print any message"),
2476 	OPT_BOOLEAN('s', "stat", &record.opts.inherit_stat,
2477 		    "per thread counts"),
2478 	OPT_BOOLEAN('d', "data", &record.opts.sample_address, "Record the sample addresses"),
2479 	OPT_BOOLEAN(0, "phys-data", &record.opts.sample_phys_addr,
2480 		    "Record the sample physical addresses"),
2481 	OPT_BOOLEAN(0, "data-page-size", &record.opts.sample_data_page_size,
2482 		    "Record the sampled data address data page size"),
2483 	OPT_BOOLEAN(0, "code-page-size", &record.opts.sample_code_page_size,
2484 		    "Record the sampled code address (ip) page size"),
2485 	OPT_BOOLEAN(0, "sample-cpu", &record.opts.sample_cpu, "Record the sample cpu"),
2486 	OPT_BOOLEAN_SET('T', "timestamp", &record.opts.sample_time,
2487 			&record.opts.sample_time_set,
2488 			"Record the sample timestamps"),
2489 	OPT_BOOLEAN_SET('P', "period", &record.opts.period, &record.opts.period_set,
2490 			"Record the sample period"),
2491 	OPT_BOOLEAN('n', "no-samples", &record.opts.no_samples,
2492 		    "don't sample"),
2493 	OPT_BOOLEAN_SET('N', "no-buildid-cache", &record.no_buildid_cache,
2494 			&record.no_buildid_cache_set,
2495 			"do not update the buildid cache"),
2496 	OPT_BOOLEAN_SET('B', "no-buildid", &record.no_buildid,
2497 			&record.no_buildid_set,
2498 			"do not collect buildids in perf.data"),
2499 	OPT_CALLBACK('G', "cgroup", &record.evlist, "name",
2500 		     "monitor event in cgroup name only",
2501 		     parse_cgroups),
2502 	OPT_INTEGER('D', "delay", &record.opts.initial_delay,
2503 		  "ms to wait before starting measurement after program start (-1: start with events disabled)"),
2504 	OPT_BOOLEAN(0, "kcore", &record.opts.kcore, "copy /proc/kcore"),
2505 	OPT_STRING('u', "uid", &record.opts.target.uid_str, "user",
2506 		   "user to profile"),
2507 
2508 	OPT_CALLBACK_NOOPT('b', "branch-any", &record.opts.branch_stack,
2509 		     "branch any", "sample any taken branches",
2510 		     parse_branch_stack),
2511 
2512 	OPT_CALLBACK('j', "branch-filter", &record.opts.branch_stack,
2513 		     "branch filter mask", "branch stack filter modes",
2514 		     parse_branch_stack),
2515 	OPT_BOOLEAN('W', "weight", &record.opts.sample_weight,
2516 		    "sample by weight (on special events only)"),
2517 	OPT_BOOLEAN(0, "transaction", &record.opts.sample_transaction,
2518 		    "sample transaction flags (special events only)"),
2519 	OPT_BOOLEAN(0, "per-thread", &record.opts.target.per_thread,
2520 		    "use per-thread mmaps"),
2521 	OPT_CALLBACK_OPTARG('I', "intr-regs", &record.opts.sample_intr_regs, NULL, "any register",
2522 		    "sample selected machine registers on interrupt,"
2523 		    " use '-I?' to list register names", parse_intr_regs),
2524 	OPT_CALLBACK_OPTARG(0, "user-regs", &record.opts.sample_user_regs, NULL, "any register",
2525 		    "sample selected machine registers on interrupt,"
2526 		    " use '--user-regs=?' to list register names", parse_user_regs),
2527 	OPT_BOOLEAN(0, "running-time", &record.opts.running_time,
2528 		    "Record running/enabled time of read (:S) events"),
2529 	OPT_CALLBACK('k', "clockid", &record.opts,
2530 	"clockid", "clockid to use for events, see clock_gettime()",
2531 	parse_clockid),
2532 	OPT_STRING_OPTARG('S', "snapshot", &record.opts.auxtrace_snapshot_opts,
2533 			  "opts", "AUX area tracing Snapshot Mode", ""),
2534 	OPT_STRING_OPTARG(0, "aux-sample", &record.opts.auxtrace_sample_opts,
2535 			  "opts", "sample AUX area", ""),
2536 	OPT_UINTEGER(0, "proc-map-timeout", &proc_map_timeout,
2537 			"per thread proc mmap processing timeout in ms"),
2538 	OPT_BOOLEAN(0, "namespaces", &record.opts.record_namespaces,
2539 		    "Record namespaces events"),
2540 	OPT_BOOLEAN(0, "all-cgroups", &record.opts.record_cgroup,
2541 		    "Record cgroup events"),
2542 	OPT_BOOLEAN_SET(0, "switch-events", &record.opts.record_switch_events,
2543 			&record.opts.record_switch_events_set,
2544 			"Record context switch events"),
2545 	OPT_BOOLEAN_FLAG(0, "all-kernel", &record.opts.all_kernel,
2546 			 "Configure all used events to run in kernel space.",
2547 			 PARSE_OPT_EXCLUSIVE),
2548 	OPT_BOOLEAN_FLAG(0, "all-user", &record.opts.all_user,
2549 			 "Configure all used events to run in user space.",
2550 			 PARSE_OPT_EXCLUSIVE),
2551 	OPT_BOOLEAN(0, "kernel-callchains", &record.opts.kernel_callchains,
2552 		    "collect kernel callchains"),
2553 	OPT_BOOLEAN(0, "user-callchains", &record.opts.user_callchains,
2554 		    "collect user callchains"),
2555 	OPT_STRING(0, "clang-path", &llvm_param.clang_path, "clang path",
2556 		   "clang binary to use for compiling BPF scriptlets"),
2557 	OPT_STRING(0, "clang-opt", &llvm_param.clang_opt, "clang options",
2558 		   "options passed to clang when compiling BPF scriptlets"),
2559 	OPT_STRING(0, "vmlinux", &symbol_conf.vmlinux_name,
2560 		   "file", "vmlinux pathname"),
2561 	OPT_BOOLEAN(0, "buildid-all", &record.buildid_all,
2562 		    "Record build-id of all DSOs regardless of hits"),
2563 	OPT_BOOLEAN(0, "buildid-mmap", &record.buildid_mmap,
2564 		    "Record build-id in map events"),
2565 	OPT_BOOLEAN(0, "timestamp-filename", &record.timestamp_filename,
2566 		    "append timestamp to output filename"),
2567 	OPT_BOOLEAN(0, "timestamp-boundary", &record.timestamp_boundary,
2568 		    "Record timestamp boundary (time of first/last samples)"),
2569 	OPT_STRING_OPTARG_SET(0, "switch-output", &record.switch_output.str,
2570 			  &record.switch_output.set, "signal or size[BKMG] or time[smhd]",
2571 			  "Switch output when receiving SIGUSR2 (signal) or cross a size or time threshold",
2572 			  "signal"),
2573 	OPT_CALLBACK_SET(0, "switch-output-event", &record.sb_evlist, &record.switch_output_event_set, "switch output event",
2574 			 "switch output event selector. use 'perf list' to list available events",
2575 			 parse_events_option_new_evlist),
2576 	OPT_INTEGER(0, "switch-max-files", &record.switch_output.num_files,
2577 		   "Limit number of switch output generated files"),
2578 	OPT_BOOLEAN(0, "dry-run", &dry_run,
2579 		    "Parse options then exit"),
2580 #ifdef HAVE_AIO_SUPPORT
2581 	OPT_CALLBACK_OPTARG(0, "aio", &record.opts,
2582 		     &nr_cblocks_default, "n", "Use <n> control blocks in asynchronous trace writing mode (default: 1, max: 4)",
2583 		     record__aio_parse),
2584 #endif
2585 	OPT_CALLBACK(0, "affinity", &record.opts, "node|cpu",
2586 		     "Set affinity mask of trace reading thread to NUMA node cpu mask or cpu of processed mmap buffer",
2587 		     record__parse_affinity),
2588 #ifdef HAVE_ZSTD_SUPPORT
2589 	OPT_CALLBACK_OPTARG('z', "compression-level", &record.opts, &comp_level_default,
2590 			    "n", "Compressed records using specified level (default: 1 - fastest compression, 22 - greatest compression)",
2591 			    record__parse_comp_level),
2592 #endif
2593 	OPT_CALLBACK(0, "max-size", &record.output_max_size,
2594 		     "size", "Limit the maximum size of the output file", parse_output_max_size),
2595 	OPT_UINTEGER(0, "num-thread-synthesize",
2596 		     &record.opts.nr_threads_synthesize,
2597 		     "number of threads to run for event synthesis"),
2598 #ifdef HAVE_LIBPFM
2599 	OPT_CALLBACK(0, "pfm-events", &record.evlist, "event",
2600 		"libpfm4 event selector. use 'perf list' to list available events",
2601 		parse_libpfm_events_option),
2602 #endif
2603 	OPT_CALLBACK(0, "control", &record.opts, "fd:ctl-fd[,ack-fd] or fifo:ctl-fifo[,ack-fifo]",
2604 		     "Listen on ctl-fd descriptor for command to control measurement ('enable': enable events, 'disable': disable events,\n"
2605 		     "\t\t\t  'snapshot': AUX area tracing snapshot).\n"
2606 		     "\t\t\t  Optionally send control command completion ('ack\\n') to ack-fd descriptor.\n"
2607 		     "\t\t\t  Alternatively, ctl-fifo / ack-fifo will be opened and used as ctl-fd / ack-fd.",
2608 		      parse_control_option),
2609 	OPT_END()
2610 };
2611 
2612 struct option *record_options = __record_options;
2613 
2614 int cmd_record(int argc, const char **argv)
2615 {
2616 	int err;
2617 	struct record *rec = &record;
2618 	char errbuf[BUFSIZ];
2619 
2620 	setlocale(LC_ALL, "");
2621 
2622 #ifndef HAVE_LIBBPF_SUPPORT
2623 # define set_nobuild(s, l, c) set_option_nobuild(record_options, s, l, "NO_LIBBPF=1", c)
2624 	set_nobuild('\0', "clang-path", true);
2625 	set_nobuild('\0', "clang-opt", true);
2626 # undef set_nobuild
2627 #endif
2628 
2629 #ifndef HAVE_BPF_PROLOGUE
2630 # if !defined (HAVE_DWARF_SUPPORT)
2631 #  define REASON  "NO_DWARF=1"
2632 # elif !defined (HAVE_LIBBPF_SUPPORT)
2633 #  define REASON  "NO_LIBBPF=1"
2634 # else
2635 #  define REASON  "this architecture doesn't support BPF prologue"
2636 # endif
2637 # define set_nobuild(s, l, c) set_option_nobuild(record_options, s, l, REASON, c)
2638 	set_nobuild('\0', "vmlinux", true);
2639 # undef set_nobuild
2640 # undef REASON
2641 #endif
2642 
2643 	rec->opts.affinity = PERF_AFFINITY_SYS;
2644 
2645 	rec->evlist = evlist__new();
2646 	if (rec->evlist == NULL)
2647 		return -ENOMEM;
2648 
2649 	err = perf_config(perf_record_config, rec);
2650 	if (err)
2651 		return err;
2652 
2653 	argc = parse_options(argc, argv, record_options, record_usage,
2654 			    PARSE_OPT_STOP_AT_NON_OPTION);
2655 	if (quiet)
2656 		perf_quiet_option();
2657 
2658 	/* Make system wide (-a) the default target. */
2659 	if (!argc && target__none(&rec->opts.target))
2660 		rec->opts.target.system_wide = true;
2661 
2662 	if (nr_cgroups && !rec->opts.target.system_wide) {
2663 		usage_with_options_msg(record_usage, record_options,
2664 			"cgroup monitoring only available in system-wide mode");
2665 
2666 	}
2667 
2668 	if (rec->buildid_mmap) {
2669 		if (!perf_can_record_build_id()) {
2670 			pr_err("Failed: no support to record build id in mmap events, update your kernel.\n");
2671 			err = -EINVAL;
2672 			goto out_opts;
2673 		}
2674 		pr_debug("Enabling build id in mmap2 events.\n");
2675 		/* Enable mmap build id synthesizing. */
2676 		symbol_conf.buildid_mmap2 = true;
2677 		/* Enable perf_event_attr::build_id bit. */
2678 		rec->opts.build_id = true;
2679 		/* Disable build id cache. */
2680 		rec->no_buildid = true;
2681 	}
2682 
2683 	if (rec->opts.kcore)
2684 		rec->data.is_dir = true;
2685 
2686 	if (rec->opts.comp_level != 0) {
2687 		pr_debug("Compression enabled, disabling build id collection at the end of the session.\n");
2688 		rec->no_buildid = true;
2689 	}
2690 
2691 	if (rec->opts.record_switch_events &&
2692 	    !perf_can_record_switch_events()) {
2693 		ui__error("kernel does not support recording context switch events\n");
2694 		parse_options_usage(record_usage, record_options, "switch-events", 0);
2695 		err = -EINVAL;
2696 		goto out_opts;
2697 	}
2698 
2699 	if (switch_output_setup(rec)) {
2700 		parse_options_usage(record_usage, record_options, "switch-output", 0);
2701 		err = -EINVAL;
2702 		goto out_opts;
2703 	}
2704 
2705 	if (rec->switch_output.time) {
2706 		signal(SIGALRM, alarm_sig_handler);
2707 		alarm(rec->switch_output.time);
2708 	}
2709 
2710 	if (rec->switch_output.num_files) {
2711 		rec->switch_output.filenames = calloc(sizeof(char *),
2712 						      rec->switch_output.num_files);
2713 		if (!rec->switch_output.filenames) {
2714 			err = -EINVAL;
2715 			goto out_opts;
2716 		}
2717 	}
2718 
2719 	/*
2720 	 * Allow aliases to facilitate the lookup of symbols for address
2721 	 * filters. Refer to auxtrace_parse_filters().
2722 	 */
2723 	symbol_conf.allow_aliases = true;
2724 
2725 	symbol__init(NULL);
2726 
2727 	if (rec->opts.affinity != PERF_AFFINITY_SYS) {
2728 		rec->affinity_mask.nbits = cpu__max_cpu();
2729 		rec->affinity_mask.bits = bitmap_alloc(rec->affinity_mask.nbits);
2730 		if (!rec->affinity_mask.bits) {
2731 			pr_err("Failed to allocate thread mask for %zd cpus\n", rec->affinity_mask.nbits);
2732 			err = -ENOMEM;
2733 			goto out_opts;
2734 		}
2735 		pr_debug2("thread mask[%zd]: empty\n", rec->affinity_mask.nbits);
2736 	}
2737 
2738 	err = record__auxtrace_init(rec);
2739 	if (err)
2740 		goto out;
2741 
2742 	if (dry_run)
2743 		goto out;
2744 
2745 	err = bpf__setup_stdout(rec->evlist);
2746 	if (err) {
2747 		bpf__strerror_setup_stdout(rec->evlist, err, errbuf, sizeof(errbuf));
2748 		pr_err("ERROR: Setup BPF stdout failed: %s\n",
2749 			 errbuf);
2750 		goto out;
2751 	}
2752 
2753 	err = -ENOMEM;
2754 
2755 	if (rec->no_buildid_cache || rec->no_buildid) {
2756 		disable_buildid_cache();
2757 	} else if (rec->switch_output.enabled) {
2758 		/*
2759 		 * In 'perf record --switch-output', disable buildid
2760 		 * generation by default to reduce data file switching
2761 		 * overhead. Still generate buildid if they are required
2762 		 * explicitly using
2763 		 *
2764 		 *  perf record --switch-output --no-no-buildid \
2765 		 *              --no-no-buildid-cache
2766 		 *
2767 		 * Following code equals to:
2768 		 *
2769 		 * if ((rec->no_buildid || !rec->no_buildid_set) &&
2770 		 *     (rec->no_buildid_cache || !rec->no_buildid_cache_set))
2771 		 *         disable_buildid_cache();
2772 		 */
2773 		bool disable = true;
2774 
2775 		if (rec->no_buildid_set && !rec->no_buildid)
2776 			disable = false;
2777 		if (rec->no_buildid_cache_set && !rec->no_buildid_cache)
2778 			disable = false;
2779 		if (disable) {
2780 			rec->no_buildid = true;
2781 			rec->no_buildid_cache = true;
2782 			disable_buildid_cache();
2783 		}
2784 	}
2785 
2786 	if (record.opts.overwrite)
2787 		record.opts.tail_synthesize = true;
2788 
2789 	if (rec->evlist->core.nr_entries == 0 &&
2790 	    __evlist__add_default(rec->evlist, !record.opts.no_samples) < 0) {
2791 		pr_err("Not enough memory for event selector list\n");
2792 		goto out;
2793 	}
2794 
2795 	if (rec->opts.target.tid && !rec->opts.no_inherit_set)
2796 		rec->opts.no_inherit = true;
2797 
2798 	err = target__validate(&rec->opts.target);
2799 	if (err) {
2800 		target__strerror(&rec->opts.target, err, errbuf, BUFSIZ);
2801 		ui__warning("%s\n", errbuf);
2802 	}
2803 
2804 	err = target__parse_uid(&rec->opts.target);
2805 	if (err) {
2806 		int saved_errno = errno;
2807 
2808 		target__strerror(&rec->opts.target, err, errbuf, BUFSIZ);
2809 		ui__error("%s", errbuf);
2810 
2811 		err = -saved_errno;
2812 		goto out;
2813 	}
2814 
2815 	/* Enable ignoring missing threads when -u/-p option is defined. */
2816 	rec->opts.ignore_missing_thread = rec->opts.target.uid != UINT_MAX || rec->opts.target.pid;
2817 
2818 	err = -ENOMEM;
2819 	if (evlist__create_maps(rec->evlist, &rec->opts.target) < 0)
2820 		usage_with_options(record_usage, record_options);
2821 
2822 	err = auxtrace_record__options(rec->itr, rec->evlist, &rec->opts);
2823 	if (err)
2824 		goto out;
2825 
2826 	/*
2827 	 * We take all buildids when the file contains
2828 	 * AUX area tracing data because we do not decode the
2829 	 * trace because it would take too long.
2830 	 */
2831 	if (rec->opts.full_auxtrace)
2832 		rec->buildid_all = true;
2833 
2834 	if (rec->opts.text_poke) {
2835 		err = record__config_text_poke(rec->evlist);
2836 		if (err) {
2837 			pr_err("record__config_text_poke failed, error %d\n", err);
2838 			goto out;
2839 		}
2840 	}
2841 
2842 	if (record_opts__config(&rec->opts)) {
2843 		err = -EINVAL;
2844 		goto out;
2845 	}
2846 
2847 	if (rec->opts.nr_cblocks > nr_cblocks_max)
2848 		rec->opts.nr_cblocks = nr_cblocks_max;
2849 	pr_debug("nr_cblocks: %d\n", rec->opts.nr_cblocks);
2850 
2851 	pr_debug("affinity: %s\n", affinity_tags[rec->opts.affinity]);
2852 	pr_debug("mmap flush: %d\n", rec->opts.mmap_flush);
2853 
2854 	if (rec->opts.comp_level > comp_level_max)
2855 		rec->opts.comp_level = comp_level_max;
2856 	pr_debug("comp level: %d\n", rec->opts.comp_level);
2857 
2858 	err = __cmd_record(&record, argc, argv);
2859 out:
2860 	bitmap_free(rec->affinity_mask.bits);
2861 	evlist__delete(rec->evlist);
2862 	symbol__exit();
2863 	auxtrace_record__free(rec->itr);
2864 out_opts:
2865 	evlist__close_control(rec->opts.ctl_fd, rec->opts.ctl_fd_ack, &rec->opts.ctl_fd_close);
2866 	return err;
2867 }
2868 
2869 static void snapshot_sig_handler(int sig __maybe_unused)
2870 {
2871 	struct record *rec = &record;
2872 
2873 	hit_auxtrace_snapshot_trigger(rec);
2874 
2875 	if (switch_output_signal(rec))
2876 		trigger_hit(&switch_output_trigger);
2877 }
2878 
2879 static void alarm_sig_handler(int sig __maybe_unused)
2880 {
2881 	struct record *rec = &record;
2882 
2883 	if (switch_output_time(rec))
2884 		trigger_hit(&switch_output_trigger);
2885 }
2886