xref: /openbmc/linux/tools/perf/builtin-record.c (revision bef7a78d)
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * builtin-record.c
4  *
5  * Builtin record command: Record the profile of a workload
6  * (or a CPU, or a PID) into the perf.data output file - for
7  * later analysis via perf report.
8  */
9 #include "builtin.h"
10 
11 #include "util/build-id.h"
12 #include <subcmd/parse-options.h>
13 #include "util/parse-events.h"
14 #include "util/config.h"
15 
16 #include "util/callchain.h"
17 #include "util/cgroup.h"
18 #include "util/header.h"
19 #include "util/event.h"
20 #include "util/evlist.h"
21 #include "util/evsel.h"
22 #include "util/debug.h"
23 #include "util/mmap.h"
24 #include "util/target.h"
25 #include "util/session.h"
26 #include "util/tool.h"
27 #include "util/symbol.h"
28 #include "util/record.h"
29 #include "util/cpumap.h"
30 #include "util/thread_map.h"
31 #include "util/data.h"
32 #include "util/perf_regs.h"
33 #include "util/auxtrace.h"
34 #include "util/tsc.h"
35 #include "util/parse-branch-options.h"
36 #include "util/parse-regs-options.h"
37 #include "util/perf_api_probe.h"
38 #include "util/llvm-utils.h"
39 #include "util/bpf-loader.h"
40 #include "util/trigger.h"
41 #include "util/perf-hooks.h"
42 #include "util/cpu-set-sched.h"
43 #include "util/synthetic-events.h"
44 #include "util/time-utils.h"
45 #include "util/units.h"
46 #include "util/bpf-event.h"
47 #include "util/util.h"
48 #include "util/pfm.h"
49 #include "util/clockid.h"
50 #include "asm/bug.h"
51 #include "perf.h"
52 
53 #include <errno.h>
54 #include <inttypes.h>
55 #include <locale.h>
56 #include <poll.h>
57 #include <pthread.h>
58 #include <unistd.h>
59 #include <sched.h>
60 #include <signal.h>
61 #ifdef HAVE_EVENTFD_SUPPORT
62 #include <sys/eventfd.h>
63 #endif
64 #include <sys/mman.h>
65 #include <sys/wait.h>
66 #include <sys/types.h>
67 #include <sys/stat.h>
68 #include <fcntl.h>
69 #include <linux/err.h>
70 #include <linux/string.h>
71 #include <linux/time64.h>
72 #include <linux/zalloc.h>
73 #include <linux/bitmap.h>
74 #include <sys/time.h>
75 
/*
 * Settings consulted by the switch_output_*() predicates below, plus
 * bookkeeping fields whose users are outside this part of the file.
 */
struct switch_output {
	bool		 enabled;
	bool		 signal;	/* tested by switch_output_signal() */
	unsigned long	 size;		/* byte threshold for switch_output_size(); 0 = off */
	unsigned long	 time;		/* non-zero enables switch_output_time() */
	const char	*str;
	bool		 set;
	char		 **filenames;	/* presumably names of generated files - confirm at use site */
	int		 num_files;
	int		 cur_file;
};
87 
88 struct record {
89 	struct perf_tool	tool;
90 	struct record_opts	opts;
91 	u64			bytes_written;
92 	struct perf_data	data;
93 	struct auxtrace_record	*itr;
94 	struct evlist	*evlist;
95 	struct perf_session	*session;
96 	struct evlist		*sb_evlist;
97 	pthread_t		thread_id;
98 	int			realtime_prio;
99 	bool			switch_output_event_set;
100 	bool			no_buildid;
101 	bool			no_buildid_set;
102 	bool			no_buildid_cache;
103 	bool			no_buildid_cache_set;
104 	bool			buildid_all;
105 	bool			timestamp_filename;
106 	bool			timestamp_boundary;
107 	struct switch_output	switch_output;
108 	unsigned long long	samples;
109 	struct mmap_cpu_mask	affinity_mask;
110 	unsigned long		output_max_size;	/* = 0: unlimited */
111 };
112 
113 static volatile int done;
114 
115 static volatile int auxtrace_record__snapshot_started;
116 static DEFINE_TRIGGER(auxtrace_snapshot_trigger);
117 static DEFINE_TRIGGER(switch_output_trigger);
118 
119 static const char *affinity_tags[PERF_AFFINITY_MAX] = {
120 	"SYS", "NODE", "CPU"
121 };
122 
123 static bool switch_output_signal(struct record *rec)
124 {
125 	return rec->switch_output.signal &&
126 	       trigger_is_ready(&switch_output_trigger);
127 }
128 
129 static bool switch_output_size(struct record *rec)
130 {
131 	return rec->switch_output.size &&
132 	       trigger_is_ready(&switch_output_trigger) &&
133 	       (rec->bytes_written >= rec->switch_output.size);
134 }
135 
136 static bool switch_output_time(struct record *rec)
137 {
138 	return rec->switch_output.time &&
139 	       trigger_is_ready(&switch_output_trigger);
140 }
141 
142 static bool record__output_max_size_exceeded(struct record *rec)
143 {
144 	return rec->output_max_size &&
145 	       (rec->bytes_written >= rec->output_max_size);
146 }
147 
148 static int record__write(struct record *rec, struct mmap *map __maybe_unused,
149 			 void *bf, size_t size)
150 {
151 	struct perf_data_file *file = &rec->session->data->file;
152 
153 	if (perf_data_file__write(file, bf, size) < 0) {
154 		pr_err("failed to write perf data, error: %m\n");
155 		return -1;
156 	}
157 
158 	rec->bytes_written += size;
159 
160 	if (record__output_max_size_exceeded(rec) && !done) {
161 		fprintf(stderr, "[ perf record: perf size limit reached (%" PRIu64 " KB),"
162 				" stopping session ]\n",
163 				rec->bytes_written >> 10);
164 		done = 1;
165 	}
166 
167 	if (switch_output_size(rec))
168 		trigger_hit(&switch_output_trigger);
169 
170 	return 0;
171 }
172 
/* Forward declarations: defined further down, used by the AIO code below. */
static int record__aio_enabled(struct record *rec);
static int record__comp_enabled(struct record *rec);
static size_t zstd_compress(struct perf_session *session,
			    void *dst, size_t dst_size,
			    void *src, size_t src_size);
177 
178 #ifdef HAVE_AIO_SUPPORT
/*
 * Fill in @cblock and queue an asynchronous write of @size bytes from @buf
 * to @trace_fd at offset @off, retrying for as long as aio_write() fails
 * with EAGAIN.
 *
 * Returns 0 once the request is queued.  On any other failure the control
 * block is marked unused (aio_fildes = -1), an error is logged and the
 * aio_write() result is returned.
 */
static int record__aio_write(struct aiocb *cblock, int trace_fd,
		void *buf, size_t size, off_t off)
{
	int rc;

	cblock->aio_fildes = trace_fd;
	cblock->aio_buf    = buf;
	cblock->aio_nbytes = size;
	cblock->aio_offset = off;
	cblock->aio_sigevent.sigev_notify = SIGEV_NONE;

	for (;;) {
		rc = aio_write(cblock);
		if (rc == 0)
			return rc;

		if (errno == EAGAIN)
			continue;

		cblock->aio_fildes = -1;
		pr_err("failed to queue perf data, error: %m\n");
		return rc;
	}
}
203 
204 static int record__aio_complete(struct mmap *md, struct aiocb *cblock)
205 {
206 	void *rem_buf;
207 	off_t rem_off;
208 	size_t rem_size;
209 	int rc, aio_errno;
210 	ssize_t aio_ret, written;
211 
212 	aio_errno = aio_error(cblock);
213 	if (aio_errno == EINPROGRESS)
214 		return 0;
215 
216 	written = aio_ret = aio_return(cblock);
217 	if (aio_ret < 0) {
218 		if (aio_errno != EINTR)
219 			pr_err("failed to write perf data, error: %m\n");
220 		written = 0;
221 	}
222 
223 	rem_size = cblock->aio_nbytes - written;
224 
225 	if (rem_size == 0) {
226 		cblock->aio_fildes = -1;
227 		/*
228 		 * md->refcount is incremented in record__aio_pushfn() for
229 		 * every aio write request started in record__aio_push() so
230 		 * decrement it because the request is now complete.
231 		 */
232 		perf_mmap__put(&md->core);
233 		rc = 1;
234 	} else {
235 		/*
236 		 * aio write request may require restart with the
237 		 * reminder if the kernel didn't write whole
238 		 * chunk at once.
239 		 */
240 		rem_off = cblock->aio_offset + written;
241 		rem_buf = (void *)(cblock->aio_buf + written);
242 		record__aio_write(cblock, cblock->aio_fildes,
243 				rem_buf, rem_size, rem_off);
244 		rc = 0;
245 	}
246 
247 	return rc;
248 }
249 
250 static int record__aio_sync(struct mmap *md, bool sync_all)
251 {
252 	struct aiocb **aiocb = md->aio.aiocb;
253 	struct aiocb *cblocks = md->aio.cblocks;
254 	struct timespec timeout = { 0, 1000 * 1000  * 1 }; /* 1ms */
255 	int i, do_suspend;
256 
257 	do {
258 		do_suspend = 0;
259 		for (i = 0; i < md->aio.nr_cblocks; ++i) {
260 			if (cblocks[i].aio_fildes == -1 || record__aio_complete(md, &cblocks[i])) {
261 				if (sync_all)
262 					aiocb[i] = NULL;
263 				else
264 					return i;
265 			} else {
266 				/*
267 				 * Started aio write is not complete yet
268 				 * so it has to be waited before the
269 				 * next allocation.
270 				 */
271 				aiocb[i] = &cblocks[i];
272 				do_suspend = 1;
273 			}
274 		}
275 		if (!do_suspend)
276 			return -1;
277 
278 		while (aio_suspend((const struct aiocb **)aiocb, md->aio.nr_cblocks, &timeout)) {
279 			if (!(errno == EAGAIN || errno == EINTR))
280 				pr_err("failed to sync perf data, error: %m\n");
281 		}
282 	} while (1);
283 }
284 
/* Cookie passed through perf_mmap__push() to record__aio_pushfn(). */
struct record_aio {
	struct record	*rec;
	void		*data;	/* destination buffer (one of map->aio.data[]) */
	size_t		size;	/* bytes stored in @data so far */
};
290 
291 static int record__aio_pushfn(struct mmap *map, void *to, void *buf, size_t size)
292 {
293 	struct record_aio *aio = to;
294 
295 	/*
296 	 * map->core.base data pointed by buf is copied into free map->aio.data[] buffer
297 	 * to release space in the kernel buffer as fast as possible, calling
298 	 * perf_mmap__consume() from perf_mmap__push() function.
299 	 *
300 	 * That lets the kernel to proceed with storing more profiling data into
301 	 * the kernel buffer earlier than other per-cpu kernel buffers are handled.
302 	 *
303 	 * Coping can be done in two steps in case the chunk of profiling data
304 	 * crosses the upper bound of the kernel buffer. In this case we first move
305 	 * part of data from map->start till the upper bound and then the reminder
306 	 * from the beginning of the kernel buffer till the end of the data chunk.
307 	 */
308 
309 	if (record__comp_enabled(aio->rec)) {
310 		size = zstd_compress(aio->rec->session, aio->data + aio->size,
311 				     mmap__mmap_len(map) - aio->size,
312 				     buf, size);
313 	} else {
314 		memcpy(aio->data + aio->size, buf, size);
315 	}
316 
317 	if (!aio->size) {
318 		/*
319 		 * Increment map->refcount to guard map->aio.data[] buffer
320 		 * from premature deallocation because map object can be
321 		 * released earlier than aio write request started on
322 		 * map->aio.data[] buffer is complete.
323 		 *
324 		 * perf_mmap__put() is done at record__aio_complete()
325 		 * after started aio request completion or at record__aio_push()
326 		 * if the request failed to start.
327 		 */
328 		perf_mmap__get(&map->core);
329 	}
330 
331 	aio->size += size;
332 
333 	return size;
334 }
335 
336 static int record__aio_push(struct record *rec, struct mmap *map, off_t *off)
337 {
338 	int ret, idx;
339 	int trace_fd = rec->session->data->file.fd;
340 	struct record_aio aio = { .rec = rec, .size = 0 };
341 
342 	/*
343 	 * Call record__aio_sync() to wait till map->aio.data[] buffer
344 	 * becomes available after previous aio write operation.
345 	 */
346 
347 	idx = record__aio_sync(map, false);
348 	aio.data = map->aio.data[idx];
349 	ret = perf_mmap__push(map, &aio, record__aio_pushfn);
350 	if (ret != 0) /* ret > 0 - no data, ret < 0 - error */
351 		return ret;
352 
353 	rec->samples++;
354 	ret = record__aio_write(&(map->aio.cblocks[idx]), trace_fd, aio.data, aio.size, *off);
355 	if (!ret) {
356 		*off += aio.size;
357 		rec->bytes_written += aio.size;
358 		if (switch_output_size(rec))
359 			trigger_hit(&switch_output_trigger);
360 	} else {
361 		/*
362 		 * Decrement map->refcount incremented in record__aio_pushfn()
363 		 * back if record__aio_write() operation failed to start, otherwise
364 		 * map->refcount is decremented in record__aio_complete() after
365 		 * aio write operation finishes successfully.
366 		 */
367 		perf_mmap__put(&map->core);
368 	}
369 
370 	return ret;
371 }
372 
/* Return the current file offset of @trace_fd as reported by lseek(). */
static off_t record__aio_get_pos(int trace_fd)
{
	off_t pos = lseek(trace_fd, 0, SEEK_CUR);

	return pos;
}
377 
/* Move the file offset of @trace_fd to @pos; the lseek() result is not checked. */
static void record__aio_set_pos(int trace_fd, off_t pos)
{
	(void)lseek(trace_fd, pos, SEEK_SET);
}
382 
383 static void record__aio_mmap_read_sync(struct record *rec)
384 {
385 	int i;
386 	struct evlist *evlist = rec->evlist;
387 	struct mmap *maps = evlist->mmap;
388 
389 	if (!record__aio_enabled(rec))
390 		return;
391 
392 	for (i = 0; i < evlist->core.nr_mmaps; i++) {
393 		struct mmap *map = &maps[i];
394 
395 		if (map->core.base)
396 			record__aio_sync(map, true);
397 	}
398 }
399 
400 static int nr_cblocks_default = 1;
401 static int nr_cblocks_max = 4;
402 
403 static int record__aio_parse(const struct option *opt,
404 			     const char *str,
405 			     int unset)
406 {
407 	struct record_opts *opts = (struct record_opts *)opt->value;
408 
409 	if (unset) {
410 		opts->nr_cblocks = 0;
411 	} else {
412 		if (str)
413 			opts->nr_cblocks = strtol(str, NULL, 0);
414 		if (!opts->nr_cblocks)
415 			opts->nr_cblocks = nr_cblocks_default;
416 	}
417 
418 	return 0;
419 }
420 #else /* HAVE_AIO_SUPPORT */
421 static int nr_cblocks_max = 0;
422 
423 static int record__aio_push(struct record *rec __maybe_unused, struct mmap *map __maybe_unused,
424 			    off_t *off __maybe_unused)
425 {
426 	return -1;
427 }
428 
429 static off_t record__aio_get_pos(int trace_fd __maybe_unused)
430 {
431 	return -1;
432 }
433 
434 static void record__aio_set_pos(int trace_fd __maybe_unused, off_t pos __maybe_unused)
435 {
436 }
437 
438 static void record__aio_mmap_read_sync(struct record *rec __maybe_unused)
439 {
440 }
441 #endif
442 
443 static int record__aio_enabled(struct record *rec)
444 {
445 	return rec->opts.nr_cblocks > 0;
446 }
447 
448 #define MMAP_FLUSH_DEFAULT 1
449 static int record__mmap_flush_parse(const struct option *opt,
450 				    const char *str,
451 				    int unset)
452 {
453 	int flush_max;
454 	struct record_opts *opts = (struct record_opts *)opt->value;
455 	static struct parse_tag tags[] = {
456 			{ .tag  = 'B', .mult = 1       },
457 			{ .tag  = 'K', .mult = 1 << 10 },
458 			{ .tag  = 'M', .mult = 1 << 20 },
459 			{ .tag  = 'G', .mult = 1 << 30 },
460 			{ .tag  = 0 },
461 	};
462 
463 	if (unset)
464 		return 0;
465 
466 	if (str) {
467 		opts->mmap_flush = parse_tag_value(str, tags);
468 		if (opts->mmap_flush == (int)-1)
469 			opts->mmap_flush = strtol(str, NULL, 0);
470 	}
471 
472 	if (!opts->mmap_flush)
473 		opts->mmap_flush = MMAP_FLUSH_DEFAULT;
474 
475 	flush_max = evlist__mmap_size(opts->mmap_pages);
476 	flush_max /= 4;
477 	if (opts->mmap_flush > flush_max)
478 		opts->mmap_flush = flush_max;
479 
480 	return 0;
481 }
482 
483 #ifdef HAVE_ZSTD_SUPPORT
484 static unsigned int comp_level_default = 1;
485 
486 static int record__parse_comp_level(const struct option *opt, const char *str, int unset)
487 {
488 	struct record_opts *opts = opt->value;
489 
490 	if (unset) {
491 		opts->comp_level = 0;
492 	} else {
493 		if (str)
494 			opts->comp_level = strtol(str, NULL, 0);
495 		if (!opts->comp_level)
496 			opts->comp_level = comp_level_default;
497 	}
498 
499 	return 0;
500 }
501 #endif
502 static unsigned int comp_level_max = 22;
503 
504 static int record__comp_enabled(struct record *rec)
505 {
506 	return rec->opts.comp_level > 0;
507 }
508 
509 static int process_synthesized_event(struct perf_tool *tool,
510 				     union perf_event *event,
511 				     struct perf_sample *sample __maybe_unused,
512 				     struct machine *machine __maybe_unused)
513 {
514 	struct record *rec = container_of(tool, struct record, tool);
515 	return record__write(rec, NULL, event, event->header.size);
516 }
517 
518 static int process_locked_synthesized_event(struct perf_tool *tool,
519 				     union perf_event *event,
520 				     struct perf_sample *sample __maybe_unused,
521 				     struct machine *machine __maybe_unused)
522 {
523 	static pthread_mutex_t synth_lock = PTHREAD_MUTEX_INITIALIZER;
524 	int ret;
525 
526 	pthread_mutex_lock(&synth_lock);
527 	ret = process_synthesized_event(tool, event, sample, machine);
528 	pthread_mutex_unlock(&synth_lock);
529 	return ret;
530 }
531 
532 static int record__pushfn(struct mmap *map, void *to, void *bf, size_t size)
533 {
534 	struct record *rec = to;
535 
536 	if (record__comp_enabled(rec)) {
537 		size = zstd_compress(rec->session, map->data, mmap__mmap_len(map), bf, size);
538 		bf   = map->data;
539 	}
540 
541 	rec->samples++;
542 	return record__write(rec, map, bf, size);
543 }
544 
/* Signal recorded by sig_handler() for anything but SIGCHLD; -1 until then. */
static volatile int signr = -1;
/* Set by sig_handler() when SIGCHLD arrives. */
static volatile int child_finished;
#ifdef HAVE_EVENTFD_SUPPORT
/* eventfd that sig_handler() writes to in order to wake up the poll(). */
static int done_fd = -1;
#endif
550 
551 static void sig_handler(int sig)
552 {
553 	if (sig == SIGCHLD)
554 		child_finished = 1;
555 	else
556 		signr = sig;
557 
558 	done = 1;
559 #ifdef HAVE_EVENTFD_SUPPORT
560 {
561 	u64 tmp = 1;
562 	/*
563 	 * It is possible for this signal handler to run after done is checked
564 	 * in the main loop, but before the perf counter fds are polled. If this
565 	 * happens, the poll() will continue to wait even though done is set,
566 	 * and will only break out if either another signal is received, or the
567 	 * counters are ready for read. To ensure the poll() doesn't sleep when
568 	 * done is set, use an eventfd (done_fd) to wake up the poll().
569 	 */
570 	if (write(done_fd, &tmp, sizeof(tmp)) < 0)
571 		pr_err("failed to signal wakeup fd, error: %m\n");
572 }
573 #endif // HAVE_EVENTFD_SUPPORT
574 }
575 
/* SIGSEGV handler: run the perf-hooks recovery, then dump the stack. */
static void sigsegv_handler(int sig)
{
	perf_hooks__recover();

	sighandler_dump_stack(sig);
}
581 
582 static void record__sig_exit(void)
583 {
584 	if (signr == -1)
585 		return;
586 
587 	signal(signr, SIG_DFL);
588 	raise(signr);
589 }
590 
591 #ifdef HAVE_AUXTRACE_SUPPORT
592 
593 static int record__process_auxtrace(struct perf_tool *tool,
594 				    struct mmap *map,
595 				    union perf_event *event, void *data1,
596 				    size_t len1, void *data2, size_t len2)
597 {
598 	struct record *rec = container_of(tool, struct record, tool);
599 	struct perf_data *data = &rec->data;
600 	size_t padding;
601 	u8 pad[8] = {0};
602 
603 	if (!perf_data__is_pipe(data) && perf_data__is_single_file(data)) {
604 		off_t file_offset;
605 		int fd = perf_data__fd(data);
606 		int err;
607 
608 		file_offset = lseek(fd, 0, SEEK_CUR);
609 		if (file_offset == -1)
610 			return -1;
611 		err = auxtrace_index__auxtrace_event(&rec->session->auxtrace_index,
612 						     event, file_offset);
613 		if (err)
614 			return err;
615 	}
616 
617 	/* event.auxtrace.size includes padding, see __auxtrace_mmap__read() */
618 	padding = (len1 + len2) & 7;
619 	if (padding)
620 		padding = 8 - padding;
621 
622 	record__write(rec, map, event, event->header.size);
623 	record__write(rec, map, data1, len1);
624 	if (len2)
625 		record__write(rec, map, data2, len2);
626 	record__write(rec, map, &pad, padding);
627 
628 	return 0;
629 }
630 
631 static int record__auxtrace_mmap_read(struct record *rec,
632 				      struct mmap *map)
633 {
634 	int ret;
635 
636 	ret = auxtrace_mmap__read(map, rec->itr, &rec->tool,
637 				  record__process_auxtrace);
638 	if (ret < 0)
639 		return ret;
640 
641 	if (ret)
642 		rec->samples++;
643 
644 	return 0;
645 }
646 
647 static int record__auxtrace_mmap_read_snapshot(struct record *rec,
648 					       struct mmap *map)
649 {
650 	int ret;
651 
652 	ret = auxtrace_mmap__read_snapshot(map, rec->itr, &rec->tool,
653 					   record__process_auxtrace,
654 					   rec->opts.auxtrace_snapshot_size);
655 	if (ret < 0)
656 		return ret;
657 
658 	if (ret)
659 		rec->samples++;
660 
661 	return 0;
662 }
663 
664 static int record__auxtrace_read_snapshot_all(struct record *rec)
665 {
666 	int i;
667 	int rc = 0;
668 
669 	for (i = 0; i < rec->evlist->core.nr_mmaps; i++) {
670 		struct mmap *map = &rec->evlist->mmap[i];
671 
672 		if (!map->auxtrace_mmap.base)
673 			continue;
674 
675 		if (record__auxtrace_mmap_read_snapshot(rec, map) != 0) {
676 			rc = -1;
677 			goto out;
678 		}
679 	}
680 out:
681 	return rc;
682 }
683 
684 static void record__read_auxtrace_snapshot(struct record *rec, bool on_exit)
685 {
686 	pr_debug("Recording AUX area tracing snapshot\n");
687 	if (record__auxtrace_read_snapshot_all(rec) < 0) {
688 		trigger_error(&auxtrace_snapshot_trigger);
689 	} else {
690 		if (auxtrace_record__snapshot_finish(rec->itr, on_exit))
691 			trigger_error(&auxtrace_snapshot_trigger);
692 		else
693 			trigger_ready(&auxtrace_snapshot_trigger);
694 	}
695 }
696 
697 static int record__auxtrace_snapshot_exit(struct record *rec)
698 {
699 	if (trigger_is_error(&auxtrace_snapshot_trigger))
700 		return 0;
701 
702 	if (!auxtrace_record__snapshot_started &&
703 	    auxtrace_record__snapshot_start(rec->itr))
704 		return -1;
705 
706 	record__read_auxtrace_snapshot(rec, true);
707 	if (trigger_is_error(&auxtrace_snapshot_trigger))
708 		return -1;
709 
710 	return 0;
711 }
712 
713 static int record__auxtrace_init(struct record *rec)
714 {
715 	int err;
716 
717 	if (!rec->itr) {
718 		rec->itr = auxtrace_record__init(rec->evlist, &err);
719 		if (err)
720 			return err;
721 	}
722 
723 	err = auxtrace_parse_snapshot_options(rec->itr, &rec->opts,
724 					      rec->opts.auxtrace_snapshot_opts);
725 	if (err)
726 		return err;
727 
728 	err = auxtrace_parse_sample_options(rec->itr, rec->evlist, &rec->opts,
729 					    rec->opts.auxtrace_sample_opts);
730 	if (err)
731 		return err;
732 
733 	return auxtrace_parse_filters(rec->evlist);
734 }
735 
736 #else
737 
738 static inline
739 int record__auxtrace_mmap_read(struct record *rec __maybe_unused,
740 			       struct mmap *map __maybe_unused)
741 {
742 	return 0;
743 }
744 
745 static inline
746 void record__read_auxtrace_snapshot(struct record *rec __maybe_unused,
747 				    bool on_exit __maybe_unused)
748 {
749 }
750 
751 static inline
752 int auxtrace_record__snapshot_start(struct auxtrace_record *itr __maybe_unused)
753 {
754 	return 0;
755 }
756 
757 static inline
758 int record__auxtrace_snapshot_exit(struct record *rec __maybe_unused)
759 {
760 	return 0;
761 }
762 
763 static int record__auxtrace_init(struct record *rec __maybe_unused)
764 {
765 	return 0;
766 }
767 
768 #endif
769 
770 static int record__config_text_poke(struct evlist *evlist)
771 {
772 	struct evsel *evsel;
773 	int err;
774 
775 	/* Nothing to do if text poke is already configured */
776 	evlist__for_each_entry(evlist, evsel) {
777 		if (evsel->core.attr.text_poke)
778 			return 0;
779 	}
780 
781 	err = parse_events(evlist, "dummy:u", NULL);
782 	if (err)
783 		return err;
784 
785 	evsel = evlist__last(evlist);
786 
787 	evsel->core.attr.freq = 0;
788 	evsel->core.attr.sample_period = 1;
789 	evsel->core.attr.text_poke = 1;
790 	evsel->core.attr.ksymbol = 1;
791 
792 	evsel->core.system_wide = true;
793 	evsel->no_aux_samples = true;
794 	evsel->immediate = true;
795 
796 	/* Text poke must be collected on all CPUs */
797 	perf_cpu_map__put(evsel->core.own_cpus);
798 	evsel->core.own_cpus = perf_cpu_map__new(NULL);
799 	perf_cpu_map__put(evsel->core.cpus);
800 	evsel->core.cpus = perf_cpu_map__get(evsel->core.own_cpus);
801 
802 	evsel__set_sample_bit(evsel, TIME);
803 
804 	return 0;
805 }
806 
807 static bool record__kcore_readable(struct machine *machine)
808 {
809 	char kcore[PATH_MAX];
810 	int fd;
811 
812 	scnprintf(kcore, sizeof(kcore), "%s/proc/kcore", machine->root_dir);
813 
814 	fd = open(kcore, O_RDONLY);
815 	if (fd < 0)
816 		return false;
817 
818 	close(fd);
819 
820 	return true;
821 }
822 
823 static int record__kcore_copy(struct machine *machine, struct perf_data *data)
824 {
825 	char from_dir[PATH_MAX];
826 	char kcore_dir[PATH_MAX];
827 	int ret;
828 
829 	snprintf(from_dir, sizeof(from_dir), "%s/proc", machine->root_dir);
830 
831 	ret = perf_data__make_kcore_dir(data, kcore_dir, sizeof(kcore_dir));
832 	if (ret)
833 		return ret;
834 
835 	return kcore_copy(from_dir, kcore_dir);
836 }
837 
838 static int record__mmap_evlist(struct record *rec,
839 			       struct evlist *evlist)
840 {
841 	struct record_opts *opts = &rec->opts;
842 	bool auxtrace_overwrite = opts->auxtrace_snapshot_mode ||
843 				  opts->auxtrace_sample_mode;
844 	char msg[512];
845 
846 	if (opts->affinity != PERF_AFFINITY_SYS)
847 		cpu__setup_cpunode_map();
848 
849 	if (evlist__mmap_ex(evlist, opts->mmap_pages,
850 				 opts->auxtrace_mmap_pages,
851 				 auxtrace_overwrite,
852 				 opts->nr_cblocks, opts->affinity,
853 				 opts->mmap_flush, opts->comp_level) < 0) {
854 		if (errno == EPERM) {
855 			pr_err("Permission error mapping pages.\n"
856 			       "Consider increasing "
857 			       "/proc/sys/kernel/perf_event_mlock_kb,\n"
858 			       "or try again with a smaller value of -m/--mmap_pages.\n"
859 			       "(current value: %u,%u)\n",
860 			       opts->mmap_pages, opts->auxtrace_mmap_pages);
861 			return -errno;
862 		} else {
863 			pr_err("failed to mmap with %d (%s)\n", errno,
864 				str_error_r(errno, msg, sizeof(msg)));
865 			if (errno)
866 				return -errno;
867 			else
868 				return -EINVAL;
869 		}
870 	}
871 	return 0;
872 }
873 
874 static int record__mmap(struct record *rec)
875 {
876 	return record__mmap_evlist(rec, rec->evlist);
877 }
878 
879 static int record__open(struct record *rec)
880 {
881 	char msg[BUFSIZ];
882 	struct evsel *pos;
883 	struct evlist *evlist = rec->evlist;
884 	struct perf_session *session = rec->session;
885 	struct record_opts *opts = &rec->opts;
886 	int rc = 0;
887 
888 	/*
889 	 * For initial_delay or system wide, we need to add a dummy event so
890 	 * that we can track PERF_RECORD_MMAP to cover the delay of waiting or
891 	 * event synthesis.
892 	 */
893 	if (opts->initial_delay || target__has_cpu(&opts->target)) {
894 		pos = evlist__get_tracking_event(evlist);
895 		if (!evsel__is_dummy_event(pos)) {
896 			/* Set up dummy event. */
897 			if (evlist__add_dummy(evlist))
898 				return -ENOMEM;
899 			pos = evlist__last(evlist);
900 			evlist__set_tracking_event(evlist, pos);
901 		}
902 
903 		/*
904 		 * Enable the dummy event when the process is forked for
905 		 * initial_delay, immediately for system wide.
906 		 */
907 		if (opts->initial_delay && !pos->immediate)
908 			pos->core.attr.enable_on_exec = 1;
909 		else
910 			pos->immediate = 1;
911 	}
912 
913 	evlist__config(evlist, opts, &callchain_param);
914 
915 	evlist__for_each_entry(evlist, pos) {
916 try_again:
917 		if (evsel__open(pos, pos->core.cpus, pos->core.threads) < 0) {
918 			if (evsel__fallback(pos, errno, msg, sizeof(msg))) {
919 				if (verbose > 0)
920 					ui__warning("%s\n", msg);
921 				goto try_again;
922 			}
923 			if ((errno == EINVAL || errno == EBADF) &&
924 			    pos->leader != pos &&
925 			    pos->weak_group) {
926 			        pos = evlist__reset_weak_group(evlist, pos, true);
927 				goto try_again;
928 			}
929 			rc = -errno;
930 			evsel__open_strerror(pos, &opts->target, errno, msg, sizeof(msg));
931 			ui__error("%s\n", msg);
932 			goto out;
933 		}
934 
935 		pos->supported = true;
936 	}
937 
938 	if (symbol_conf.kptr_restrict && !evlist__exclude_kernel(evlist)) {
939 		pr_warning(
940 "WARNING: Kernel address maps (/proc/{kallsyms,modules}) are restricted,\n"
941 "check /proc/sys/kernel/kptr_restrict and /proc/sys/kernel/perf_event_paranoid.\n\n"
942 "Samples in kernel functions may not be resolved if a suitable vmlinux\n"
943 "file is not found in the buildid cache or in the vmlinux path.\n\n"
944 "Samples in kernel modules won't be resolved at all.\n\n"
945 "If some relocation was applied (e.g. kexec) symbols may be misresolved\n"
946 "even with a suitable vmlinux or kallsyms file.\n\n");
947 	}
948 
949 	if (evlist__apply_filters(evlist, &pos)) {
950 		pr_err("failed to set filter \"%s\" on event %s with %d (%s)\n",
951 			pos->filter, evsel__name(pos), errno,
952 			str_error_r(errno, msg, sizeof(msg)));
953 		rc = -1;
954 		goto out;
955 	}
956 
957 	rc = record__mmap(rec);
958 	if (rc)
959 		goto out;
960 
961 	session->evlist = evlist;
962 	perf_session__set_id_hdr_size(session);
963 out:
964 	return rc;
965 }
966 
967 static int process_sample_event(struct perf_tool *tool,
968 				union perf_event *event,
969 				struct perf_sample *sample,
970 				struct evsel *evsel,
971 				struct machine *machine)
972 {
973 	struct record *rec = container_of(tool, struct record, tool);
974 
975 	if (rec->evlist->first_sample_time == 0)
976 		rec->evlist->first_sample_time = sample->time;
977 
978 	rec->evlist->last_sample_time = sample->time;
979 
980 	if (rec->buildid_all)
981 		return 0;
982 
983 	rec->samples++;
984 	return build_id__mark_dso_hit(tool, event, sample, evsel, machine);
985 }
986 
987 static int process_buildids(struct record *rec)
988 {
989 	struct perf_session *session = rec->session;
990 
991 	if (perf_data__size(&rec->data) == 0)
992 		return 0;
993 
994 	/*
995 	 * During this process, it'll load kernel map and replace the
996 	 * dso->long_name to a real pathname it found.  In this case
997 	 * we prefer the vmlinux path like
998 	 *   /lib/modules/3.16.4/build/vmlinux
999 	 *
1000 	 * rather than build-id path (in debug directory).
1001 	 *   $HOME/.debug/.build-id/f0/6e17aa50adf4d00b88925e03775de107611551
1002 	 */
1003 	symbol_conf.ignore_vmlinux_buildid = true;
1004 
1005 	/*
1006 	 * If --buildid-all is given, it marks all DSO regardless of hits,
1007 	 * so no need to process samples. But if timestamp_boundary is enabled,
1008 	 * it still needs to walk on all samples to get the timestamps of
1009 	 * first/last samples.
1010 	 */
1011 	if (rec->buildid_all && !rec->timestamp_boundary)
1012 		rec->tool.sample = NULL;
1013 
1014 	return perf_session__process_events(session);
1015 }
1016 
1017 static void perf_event__synthesize_guest_os(struct machine *machine, void *data)
1018 {
1019 	int err;
1020 	struct perf_tool *tool = data;
1021 	/*
1022 	 *As for guest kernel when processing subcommand record&report,
1023 	 *we arrange module mmap prior to guest kernel mmap and trigger
1024 	 *a preload dso because default guest module symbols are loaded
1025 	 *from guest kallsyms instead of /lib/modules/XXX/XXX. This
1026 	 *method is used to avoid symbol missing when the first addr is
1027 	 *in module instead of in guest kernel.
1028 	 */
1029 	err = perf_event__synthesize_modules(tool, process_synthesized_event,
1030 					     machine);
1031 	if (err < 0)
1032 		pr_err("Couldn't record guest kernel [%d]'s reference"
1033 		       " relocation symbol.\n", machine->pid);
1034 
1035 	/*
1036 	 * We use _stext for guest kernel because guest kernel's /proc/kallsyms
1037 	 * have no _text sometimes.
1038 	 */
1039 	err = perf_event__synthesize_kernel_mmap(tool, process_synthesized_event,
1040 						 machine);
1041 	if (err < 0)
1042 		pr_err("Couldn't record guest kernel [%d]'s reference"
1043 		       " relocation symbol.\n", machine->pid);
1044 }
1045 
1046 static struct perf_event_header finished_round_event = {
1047 	.size = sizeof(struct perf_event_header),
1048 	.type = PERF_RECORD_FINISHED_ROUND,
1049 };
1050 
1051 static void record__adjust_affinity(struct record *rec, struct mmap *map)
1052 {
1053 	if (rec->opts.affinity != PERF_AFFINITY_SYS &&
1054 	    !bitmap_equal(rec->affinity_mask.bits, map->affinity_mask.bits,
1055 			  rec->affinity_mask.nbits)) {
1056 		bitmap_zero(rec->affinity_mask.bits, rec->affinity_mask.nbits);
1057 		bitmap_or(rec->affinity_mask.bits, rec->affinity_mask.bits,
1058 			  map->affinity_mask.bits, rec->affinity_mask.nbits);
1059 		sched_setaffinity(0, MMAP_CPU_MASK_BYTES(&rec->affinity_mask),
1060 				  (cpu_set_t *)rec->affinity_mask.bits);
1061 		if (verbose == 2)
1062 			mmap_cpu_mask__scnprintf(&rec->affinity_mask, "thread");
1063 	}
1064 }
1065 
1066 static size_t process_comp_header(void *record, size_t increment)
1067 {
1068 	struct perf_record_compressed *event = record;
1069 	size_t size = sizeof(*event);
1070 
1071 	if (increment) {
1072 		event->header.size += increment;
1073 		return increment;
1074 	}
1075 
1076 	event->header.type = PERF_RECORD_COMPRESSED;
1077 	event->header.size = size;
1078 
1079 	return size;
1080 }
1081 
1082 static size_t zstd_compress(struct perf_session *session, void *dst, size_t dst_size,
1083 			    void *src, size_t src_size)
1084 {
1085 	size_t compressed;
1086 	size_t max_record_size = PERF_SAMPLE_MAX_SIZE - sizeof(struct perf_record_compressed) - 1;
1087 
1088 	compressed = zstd_compress_stream_to_records(&session->zstd_data, dst, dst_size, src, src_size,
1089 						     max_record_size, process_comp_header);
1090 
1091 	session->bytes_transferred += src_size;
1092 	session->bytes_compressed  += compressed;
1093 
1094 	return compressed;
1095 }
1096 
1097 static int record__mmap_read_evlist(struct record *rec, struct evlist *evlist,
1098 				    bool overwrite, bool synch)
1099 {
1100 	u64 bytes_written = rec->bytes_written;
1101 	int i;
1102 	int rc = 0;
1103 	struct mmap *maps;
1104 	int trace_fd = rec->data.file.fd;
1105 	off_t off = 0;
1106 
1107 	if (!evlist)
1108 		return 0;
1109 
1110 	maps = overwrite ? evlist->overwrite_mmap : evlist->mmap;
1111 	if (!maps)
1112 		return 0;
1113 
1114 	if (overwrite && evlist->bkw_mmap_state != BKW_MMAP_DATA_PENDING)
1115 		return 0;
1116 
1117 	if (record__aio_enabled(rec))
1118 		off = record__aio_get_pos(trace_fd);
1119 
1120 	for (i = 0; i < evlist->core.nr_mmaps; i++) {
1121 		u64 flush = 0;
1122 		struct mmap *map = &maps[i];
1123 
1124 		if (map->core.base) {
1125 			record__adjust_affinity(rec, map);
1126 			if (synch) {
1127 				flush = map->core.flush;
1128 				map->core.flush = 1;
1129 			}
1130 			if (!record__aio_enabled(rec)) {
1131 				if (perf_mmap__push(map, rec, record__pushfn) < 0) {
1132 					if (synch)
1133 						map->core.flush = flush;
1134 					rc = -1;
1135 					goto out;
1136 				}
1137 			} else {
1138 				if (record__aio_push(rec, map, &off) < 0) {
1139 					record__aio_set_pos(trace_fd, off);
1140 					if (synch)
1141 						map->core.flush = flush;
1142 					rc = -1;
1143 					goto out;
1144 				}
1145 			}
1146 			if (synch)
1147 				map->core.flush = flush;
1148 		}
1149 
1150 		if (map->auxtrace_mmap.base && !rec->opts.auxtrace_snapshot_mode &&
1151 		    !rec->opts.auxtrace_sample_mode &&
1152 		    record__auxtrace_mmap_read(rec, map) != 0) {
1153 			rc = -1;
1154 			goto out;
1155 		}
1156 	}
1157 
1158 	if (record__aio_enabled(rec))
1159 		record__aio_set_pos(trace_fd, off);
1160 
1161 	/*
1162 	 * Mark the round finished in case we wrote
1163 	 * at least one event.
1164 	 */
1165 	if (bytes_written != rec->bytes_written)
1166 		rc = record__write(rec, NULL, &finished_round_event, sizeof(finished_round_event));
1167 
1168 	if (overwrite)
1169 		evlist__toggle_bkw_mmap(evlist, BKW_MMAP_EMPTY);
1170 out:
1171 	return rc;
1172 }
1173 
1174 static int record__mmap_read_all(struct record *rec, bool synch)
1175 {
1176 	int err;
1177 
1178 	err = record__mmap_read_evlist(rec, rec->evlist, false, synch);
1179 	if (err)
1180 		return err;
1181 
1182 	return record__mmap_read_evlist(rec, rec->evlist, true, synch);
1183 }
1184 
1185 static void record__init_features(struct record *rec)
1186 {
1187 	struct perf_session *session = rec->session;
1188 	int feat;
1189 
1190 	for (feat = HEADER_FIRST_FEATURE; feat < HEADER_LAST_FEATURE; feat++)
1191 		perf_header__set_feat(&session->header, feat);
1192 
1193 	if (rec->no_buildid)
1194 		perf_header__clear_feat(&session->header, HEADER_BUILD_ID);
1195 
1196 	if (!have_tracepoints(&rec->evlist->core.entries))
1197 		perf_header__clear_feat(&session->header, HEADER_TRACING_DATA);
1198 
1199 	if (!rec->opts.branch_stack)
1200 		perf_header__clear_feat(&session->header, HEADER_BRANCH_STACK);
1201 
1202 	if (!rec->opts.full_auxtrace)
1203 		perf_header__clear_feat(&session->header, HEADER_AUXTRACE);
1204 
1205 	if (!(rec->opts.use_clockid && rec->opts.clockid_res_ns))
1206 		perf_header__clear_feat(&session->header, HEADER_CLOCKID);
1207 
1208 	if (!rec->opts.use_clockid)
1209 		perf_header__clear_feat(&session->header, HEADER_CLOCK_DATA);
1210 
1211 	perf_header__clear_feat(&session->header, HEADER_DIR_FORMAT);
1212 	if (!record__comp_enabled(rec))
1213 		perf_header__clear_feat(&session->header, HEADER_COMPRESSED);
1214 
1215 	perf_header__clear_feat(&session->header, HEADER_STAT);
1216 }
1217 
1218 static void
1219 record__finish_output(struct record *rec)
1220 {
1221 	struct perf_data *data = &rec->data;
1222 	int fd = perf_data__fd(data);
1223 
1224 	if (data->is_pipe)
1225 		return;
1226 
1227 	rec->session->header.data_size += rec->bytes_written;
1228 	data->file.size = lseek(perf_data__fd(data), 0, SEEK_CUR);
1229 
1230 	if (!rec->no_buildid) {
1231 		process_buildids(rec);
1232 
1233 		if (rec->buildid_all)
1234 			dsos__hit_all(rec->session);
1235 	}
1236 	perf_session__write_header(rec->session, rec->evlist, fd, true);
1237 
1238 	return;
1239 }
1240 
1241 static int record__synthesize_workload(struct record *rec, bool tail)
1242 {
1243 	int err;
1244 	struct perf_thread_map *thread_map;
1245 
1246 	if (rec->opts.tail_synthesize != tail)
1247 		return 0;
1248 
1249 	thread_map = thread_map__new_by_tid(rec->evlist->workload.pid);
1250 	if (thread_map == NULL)
1251 		return -1;
1252 
1253 	err = perf_event__synthesize_thread_map(&rec->tool, thread_map,
1254 						 process_synthesized_event,
1255 						 &rec->session->machines.host,
1256 						 rec->opts.sample_address);
1257 	perf_thread_map__put(thread_map);
1258 	return err;
1259 }
1260 
1261 static int record__synthesize(struct record *rec, bool tail);
1262 
1263 static int
1264 record__switch_output(struct record *rec, bool at_exit)
1265 {
1266 	struct perf_data *data = &rec->data;
1267 	int fd, err;
1268 	char *new_filename;
1269 
1270 	/* Same Size:      "2015122520103046"*/
1271 	char timestamp[] = "InvalidTimestamp";
1272 
1273 	record__aio_mmap_read_sync(rec);
1274 
1275 	record__synthesize(rec, true);
1276 	if (target__none(&rec->opts.target))
1277 		record__synthesize_workload(rec, true);
1278 
1279 	rec->samples = 0;
1280 	record__finish_output(rec);
1281 	err = fetch_current_timestamp(timestamp, sizeof(timestamp));
1282 	if (err) {
1283 		pr_err("Failed to get current timestamp\n");
1284 		return -EINVAL;
1285 	}
1286 
1287 	fd = perf_data__switch(data, timestamp,
1288 				    rec->session->header.data_offset,
1289 				    at_exit, &new_filename);
1290 	if (fd >= 0 && !at_exit) {
1291 		rec->bytes_written = 0;
1292 		rec->session->header.data_size = 0;
1293 	}
1294 
1295 	if (!quiet)
1296 		fprintf(stderr, "[ perf record: Dump %s.%s ]\n",
1297 			data->path, timestamp);
1298 
1299 	if (rec->switch_output.num_files) {
1300 		int n = rec->switch_output.cur_file + 1;
1301 
1302 		if (n >= rec->switch_output.num_files)
1303 			n = 0;
1304 		rec->switch_output.cur_file = n;
1305 		if (rec->switch_output.filenames[n]) {
1306 			remove(rec->switch_output.filenames[n]);
1307 			zfree(&rec->switch_output.filenames[n]);
1308 		}
1309 		rec->switch_output.filenames[n] = new_filename;
1310 	} else {
1311 		free(new_filename);
1312 	}
1313 
1314 	/* Output tracking events */
1315 	if (!at_exit) {
1316 		record__synthesize(rec, false);
1317 
1318 		/*
1319 		 * In 'perf record --switch-output' without -a,
1320 		 * record__synthesize() in record__switch_output() won't
1321 		 * generate tracking events because there's no thread_map
1322 		 * in evlist. Which causes newly created perf.data doesn't
1323 		 * contain map and comm information.
1324 		 * Create a fake thread_map and directly call
1325 		 * perf_event__synthesize_thread_map() for those events.
1326 		 */
1327 		if (target__none(&rec->opts.target))
1328 			record__synthesize_workload(rec, false);
1329 	}
1330 	return fd;
1331 }
1332 
1333 static volatile int workload_exec_errno;
1334 
/*
 * evlist__prepare_workload will send a SIGUSR1
 * if the fork fails, since we asked by setting its
 * want_signal to true.
 *
 * This handler stores the integer carried in the signal's si_value in
 * workload_exec_errno (reported later as the reason the workload failed)
 * and raises the 'done' and 'child_finished' flags.
 */
static void workload_exec_failed_signal(int signo __maybe_unused,
					siginfo_t *info,
					void *ucontext __maybe_unused)
{
	/* Error code passed along with the signal by the sender. */
	workload_exec_errno = info->si_value.sival_int;
	done = 1;
	child_finished = 1;
}
1348 
1349 static void snapshot_sig_handler(int sig);
1350 static void alarm_sig_handler(int sig);
1351 
1352 static const struct perf_event_mmap_page *evlist__pick_pc(struct evlist *evlist)
1353 {
1354 	if (evlist) {
1355 		if (evlist->mmap && evlist->mmap[0].core.base)
1356 			return evlist->mmap[0].core.base;
1357 		if (evlist->overwrite_mmap && evlist->overwrite_mmap[0].core.base)
1358 			return evlist->overwrite_mmap[0].core.base;
1359 	}
1360 	return NULL;
1361 }
1362 
1363 static const struct perf_event_mmap_page *record__pick_pc(struct record *rec)
1364 {
1365 	const struct perf_event_mmap_page *pc = evlist__pick_pc(rec->evlist);
1366 	if (pc)
1367 		return pc;
1368 	return NULL;
1369 }
1370 
1371 static int record__synthesize(struct record *rec, bool tail)
1372 {
1373 	struct perf_session *session = rec->session;
1374 	struct machine *machine = &session->machines.host;
1375 	struct perf_data *data = &rec->data;
1376 	struct record_opts *opts = &rec->opts;
1377 	struct perf_tool *tool = &rec->tool;
1378 	int fd = perf_data__fd(data);
1379 	int err = 0;
1380 	event_op f = process_synthesized_event;
1381 
1382 	if (rec->opts.tail_synthesize != tail)
1383 		return 0;
1384 
1385 	if (data->is_pipe) {
1386 		/*
1387 		 * We need to synthesize events first, because some
1388 		 * features works on top of them (on report side).
1389 		 */
1390 		err = perf_event__synthesize_attrs(tool, rec->evlist,
1391 						   process_synthesized_event);
1392 		if (err < 0) {
1393 			pr_err("Couldn't synthesize attrs.\n");
1394 			goto out;
1395 		}
1396 
1397 		err = perf_event__synthesize_features(tool, session, rec->evlist,
1398 						      process_synthesized_event);
1399 		if (err < 0) {
1400 			pr_err("Couldn't synthesize features.\n");
1401 			return err;
1402 		}
1403 
1404 		if (have_tracepoints(&rec->evlist->core.entries)) {
1405 			/*
1406 			 * FIXME err <= 0 here actually means that
1407 			 * there were no tracepoints so its not really
1408 			 * an error, just that we don't need to
1409 			 * synthesize anything.  We really have to
1410 			 * return this more properly and also
1411 			 * propagate errors that now are calling die()
1412 			 */
1413 			err = perf_event__synthesize_tracing_data(tool,	fd, rec->evlist,
1414 								  process_synthesized_event);
1415 			if (err <= 0) {
1416 				pr_err("Couldn't record tracing data.\n");
1417 				goto out;
1418 			}
1419 			rec->bytes_written += err;
1420 		}
1421 	}
1422 
1423 	err = perf_event__synth_time_conv(record__pick_pc(rec), tool,
1424 					  process_synthesized_event, machine);
1425 	if (err)
1426 		goto out;
1427 
1428 	/* Synthesize id_index before auxtrace_info */
1429 	if (rec->opts.auxtrace_sample_mode) {
1430 		err = perf_event__synthesize_id_index(tool,
1431 						      process_synthesized_event,
1432 						      session->evlist, machine);
1433 		if (err)
1434 			goto out;
1435 	}
1436 
1437 	if (rec->opts.full_auxtrace) {
1438 		err = perf_event__synthesize_auxtrace_info(rec->itr, tool,
1439 					session, process_synthesized_event);
1440 		if (err)
1441 			goto out;
1442 	}
1443 
1444 	if (!evlist__exclude_kernel(rec->evlist)) {
1445 		err = perf_event__synthesize_kernel_mmap(tool, process_synthesized_event,
1446 							 machine);
1447 		WARN_ONCE(err < 0, "Couldn't record kernel reference relocation symbol\n"
1448 				   "Symbol resolution may be skewed if relocation was used (e.g. kexec).\n"
1449 				   "Check /proc/kallsyms permission or run as root.\n");
1450 
1451 		err = perf_event__synthesize_modules(tool, process_synthesized_event,
1452 						     machine);
1453 		WARN_ONCE(err < 0, "Couldn't record kernel module information.\n"
1454 				   "Symbol resolution may be skewed if relocation was used (e.g. kexec).\n"
1455 				   "Check /proc/modules permission or run as root.\n");
1456 	}
1457 
1458 	if (perf_guest) {
1459 		machines__process_guests(&session->machines,
1460 					 perf_event__synthesize_guest_os, tool);
1461 	}
1462 
1463 	err = perf_event__synthesize_extra_attr(&rec->tool,
1464 						rec->evlist,
1465 						process_synthesized_event,
1466 						data->is_pipe);
1467 	if (err)
1468 		goto out;
1469 
1470 	err = perf_event__synthesize_thread_map2(&rec->tool, rec->evlist->core.threads,
1471 						 process_synthesized_event,
1472 						NULL);
1473 	if (err < 0) {
1474 		pr_err("Couldn't synthesize thread map.\n");
1475 		return err;
1476 	}
1477 
1478 	err = perf_event__synthesize_cpu_map(&rec->tool, rec->evlist->core.cpus,
1479 					     process_synthesized_event, NULL);
1480 	if (err < 0) {
1481 		pr_err("Couldn't synthesize cpu map.\n");
1482 		return err;
1483 	}
1484 
1485 	err = perf_event__synthesize_bpf_events(session, process_synthesized_event,
1486 						machine, opts);
1487 	if (err < 0)
1488 		pr_warning("Couldn't synthesize bpf events.\n");
1489 
1490 	err = perf_event__synthesize_cgroups(tool, process_synthesized_event,
1491 					     machine);
1492 	if (err < 0)
1493 		pr_warning("Couldn't synthesize cgroup events.\n");
1494 
1495 	if (rec->opts.nr_threads_synthesize > 1) {
1496 		perf_set_multithreaded();
1497 		f = process_locked_synthesized_event;
1498 	}
1499 
1500 	err = __machine__synthesize_threads(machine, tool, &opts->target, rec->evlist->core.threads,
1501 					    f, opts->sample_address,
1502 					    rec->opts.nr_threads_synthesize);
1503 
1504 	if (rec->opts.nr_threads_synthesize > 1)
1505 		perf_set_singlethreaded();
1506 
1507 out:
1508 	return err;
1509 }
1510 
1511 static int record__process_signal_event(union perf_event *event __maybe_unused, void *data)
1512 {
1513 	struct record *rec = data;
1514 	pthread_kill(rec->thread_id, SIGUSR2);
1515 	return 0;
1516 }
1517 
1518 static int record__setup_sb_evlist(struct record *rec)
1519 {
1520 	struct record_opts *opts = &rec->opts;
1521 
1522 	if (rec->sb_evlist != NULL) {
1523 		/*
1524 		 * We get here if --switch-output-event populated the
1525 		 * sb_evlist, so associate a callback that will send a SIGUSR2
1526 		 * to the main thread.
1527 		 */
1528 		evlist__set_cb(rec->sb_evlist, record__process_signal_event, rec);
1529 		rec->thread_id = pthread_self();
1530 	}
1531 #ifdef HAVE_LIBBPF_SUPPORT
1532 	if (!opts->no_bpf_event) {
1533 		if (rec->sb_evlist == NULL) {
1534 			rec->sb_evlist = evlist__new();
1535 
1536 			if (rec->sb_evlist == NULL) {
1537 				pr_err("Couldn't create side band evlist.\n.");
1538 				return -1;
1539 			}
1540 		}
1541 
1542 		if (evlist__add_bpf_sb_event(rec->sb_evlist, &rec->session->header.env)) {
1543 			pr_err("Couldn't ask for PERF_RECORD_BPF_EVENT side band events.\n.");
1544 			return -1;
1545 		}
1546 	}
1547 #endif
1548 	if (evlist__start_sb_thread(rec->sb_evlist, &rec->opts.target)) {
1549 		pr_debug("Couldn't start the BPF side band thread:\nBPF programs starting from now on won't be annotatable\n");
1550 		opts->no_bpf_event = true;
1551 	}
1552 
1553 	return 0;
1554 }
1555 
1556 static int record__init_clock(struct record *rec)
1557 {
1558 	struct perf_session *session = rec->session;
1559 	struct timespec ref_clockid;
1560 	struct timeval ref_tod;
1561 	u64 ref;
1562 
1563 	if (!rec->opts.use_clockid)
1564 		return 0;
1565 
1566 	if (rec->opts.use_clockid && rec->opts.clockid_res_ns)
1567 		session->header.env.clock.clockid_res_ns = rec->opts.clockid_res_ns;
1568 
1569 	session->header.env.clock.clockid = rec->opts.clockid;
1570 
1571 	if (gettimeofday(&ref_tod, NULL) != 0) {
1572 		pr_err("gettimeofday failed, cannot set reference time.\n");
1573 		return -1;
1574 	}
1575 
1576 	if (clock_gettime(rec->opts.clockid, &ref_clockid)) {
1577 		pr_err("clock_gettime failed, cannot set reference time.\n");
1578 		return -1;
1579 	}
1580 
1581 	ref = (u64) ref_tod.tv_sec * NSEC_PER_SEC +
1582 	      (u64) ref_tod.tv_usec * NSEC_PER_USEC;
1583 
1584 	session->header.env.clock.tod_ns = ref;
1585 
1586 	ref = (u64) ref_clockid.tv_sec * NSEC_PER_SEC +
1587 	      (u64) ref_clockid.tv_nsec;
1588 
1589 	session->header.env.clock.clockid_ns = ref;
1590 	return 0;
1591 }
1592 
1593 static void hit_auxtrace_snapshot_trigger(struct record *rec)
1594 {
1595 	if (trigger_is_ready(&auxtrace_snapshot_trigger)) {
1596 		trigger_hit(&auxtrace_snapshot_trigger);
1597 		auxtrace_record__snapshot_started = 1;
1598 		if (auxtrace_record__snapshot_start(rec->itr))
1599 			trigger_error(&auxtrace_snapshot_trigger);
1600 	}
1601 }
1602 
1603 static int __cmd_record(struct record *rec, int argc, const char **argv)
1604 {
1605 	int err;
1606 	int status = 0;
1607 	unsigned long waking = 0;
1608 	const bool forks = argc > 0;
1609 	struct perf_tool *tool = &rec->tool;
1610 	struct record_opts *opts = &rec->opts;
1611 	struct perf_data *data = &rec->data;
1612 	struct perf_session *session;
1613 	bool disabled = false, draining = false;
1614 	int fd;
1615 	float ratio = 0;
1616 	enum evlist_ctl_cmd cmd = EVLIST_CTL_CMD_UNSUPPORTED;
1617 
1618 	atexit(record__sig_exit);
1619 	signal(SIGCHLD, sig_handler);
1620 	signal(SIGINT, sig_handler);
1621 	signal(SIGTERM, sig_handler);
1622 	signal(SIGSEGV, sigsegv_handler);
1623 
1624 	if (rec->opts.record_namespaces)
1625 		tool->namespace_events = true;
1626 
1627 	if (rec->opts.record_cgroup) {
1628 #ifdef HAVE_FILE_HANDLE
1629 		tool->cgroup_events = true;
1630 #else
1631 		pr_err("cgroup tracking is not supported\n");
1632 		return -1;
1633 #endif
1634 	}
1635 
1636 	if (rec->opts.auxtrace_snapshot_mode || rec->switch_output.enabled) {
1637 		signal(SIGUSR2, snapshot_sig_handler);
1638 		if (rec->opts.auxtrace_snapshot_mode)
1639 			trigger_on(&auxtrace_snapshot_trigger);
1640 		if (rec->switch_output.enabled)
1641 			trigger_on(&switch_output_trigger);
1642 	} else {
1643 		signal(SIGUSR2, SIG_IGN);
1644 	}
1645 
1646 	session = perf_session__new(data, false, tool);
1647 	if (IS_ERR(session)) {
1648 		pr_err("Perf session creation failed.\n");
1649 		return PTR_ERR(session);
1650 	}
1651 
1652 	fd = perf_data__fd(data);
1653 	rec->session = session;
1654 
1655 	if (zstd_init(&session->zstd_data, rec->opts.comp_level) < 0) {
1656 		pr_err("Compression initialization failed.\n");
1657 		return -1;
1658 	}
1659 #ifdef HAVE_EVENTFD_SUPPORT
1660 	done_fd = eventfd(0, EFD_NONBLOCK);
1661 	if (done_fd < 0) {
1662 		pr_err("Failed to create wakeup eventfd, error: %m\n");
1663 		status = -1;
1664 		goto out_delete_session;
1665 	}
1666 	err = evlist__add_pollfd(rec->evlist, done_fd);
1667 	if (err < 0) {
1668 		pr_err("Failed to add wakeup eventfd to poll list\n");
1669 		status = err;
1670 		goto out_delete_session;
1671 	}
1672 #endif // HAVE_EVENTFD_SUPPORT
1673 
1674 	session->header.env.comp_type  = PERF_COMP_ZSTD;
1675 	session->header.env.comp_level = rec->opts.comp_level;
1676 
1677 	if (rec->opts.kcore &&
1678 	    !record__kcore_readable(&session->machines.host)) {
1679 		pr_err("ERROR: kcore is not readable.\n");
1680 		return -1;
1681 	}
1682 
1683 	if (record__init_clock(rec))
1684 		return -1;
1685 
1686 	record__init_features(rec);
1687 
1688 	if (forks) {
1689 		err = evlist__prepare_workload(rec->evlist, &opts->target, argv, data->is_pipe,
1690 					       workload_exec_failed_signal);
1691 		if (err < 0) {
1692 			pr_err("Couldn't run the workload!\n");
1693 			status = err;
1694 			goto out_delete_session;
1695 		}
1696 	}
1697 
1698 	/*
1699 	 * If we have just single event and are sending data
1700 	 * through pipe, we need to force the ids allocation,
1701 	 * because we synthesize event name through the pipe
1702 	 * and need the id for that.
1703 	 */
1704 	if (data->is_pipe && rec->evlist->core.nr_entries == 1)
1705 		rec->opts.sample_id = true;
1706 
1707 	if (record__open(rec) != 0) {
1708 		err = -1;
1709 		goto out_child;
1710 	}
1711 	session->header.env.comp_mmap_len = session->evlist->core.mmap_len;
1712 
1713 	if (rec->opts.kcore) {
1714 		err = record__kcore_copy(&session->machines.host, data);
1715 		if (err) {
1716 			pr_err("ERROR: Failed to copy kcore\n");
1717 			goto out_child;
1718 		}
1719 	}
1720 
1721 	err = bpf__apply_obj_config();
1722 	if (err) {
1723 		char errbuf[BUFSIZ];
1724 
1725 		bpf__strerror_apply_obj_config(err, errbuf, sizeof(errbuf));
1726 		pr_err("ERROR: Apply config to BPF failed: %s\n",
1727 			 errbuf);
1728 		goto out_child;
1729 	}
1730 
1731 	/*
1732 	 * Normally perf_session__new would do this, but it doesn't have the
1733 	 * evlist.
1734 	 */
1735 	if (rec->tool.ordered_events && !evlist__sample_id_all(rec->evlist)) {
1736 		pr_warning("WARNING: No sample_id_all support, falling back to unordered processing\n");
1737 		rec->tool.ordered_events = false;
1738 	}
1739 
1740 	if (!rec->evlist->nr_groups)
1741 		perf_header__clear_feat(&session->header, HEADER_GROUP_DESC);
1742 
1743 	if (data->is_pipe) {
1744 		err = perf_header__write_pipe(fd);
1745 		if (err < 0)
1746 			goto out_child;
1747 	} else {
1748 		err = perf_session__write_header(session, rec->evlist, fd, false);
1749 		if (err < 0)
1750 			goto out_child;
1751 	}
1752 
1753 	err = -1;
1754 	if (!rec->no_buildid
1755 	    && !perf_header__has_feat(&session->header, HEADER_BUILD_ID)) {
1756 		pr_err("Couldn't generate buildids. "
1757 		       "Use --no-buildid to profile anyway.\n");
1758 		goto out_child;
1759 	}
1760 
1761 	err = record__setup_sb_evlist(rec);
1762 	if (err)
1763 		goto out_child;
1764 
1765 	err = record__synthesize(rec, false);
1766 	if (err < 0)
1767 		goto out_child;
1768 
1769 	if (rec->realtime_prio) {
1770 		struct sched_param param;
1771 
1772 		param.sched_priority = rec->realtime_prio;
1773 		if (sched_setscheduler(0, SCHED_FIFO, &param)) {
1774 			pr_err("Could not set realtime priority.\n");
1775 			err = -1;
1776 			goto out_child;
1777 		}
1778 	}
1779 
1780 	/*
1781 	 * When perf is starting the traced process, all the events
1782 	 * (apart from group members) have enable_on_exec=1 set,
1783 	 * so don't spoil it by prematurely enabling them.
1784 	 */
1785 	if (!target__none(&opts->target) && !opts->initial_delay)
1786 		evlist__enable(rec->evlist);
1787 
1788 	/*
1789 	 * Let the child rip
1790 	 */
1791 	if (forks) {
1792 		struct machine *machine = &session->machines.host;
1793 		union perf_event *event;
1794 		pid_t tgid;
1795 
1796 		event = malloc(sizeof(event->comm) + machine->id_hdr_size);
1797 		if (event == NULL) {
1798 			err = -ENOMEM;
1799 			goto out_child;
1800 		}
1801 
1802 		/*
1803 		 * Some H/W events are generated before COMM event
1804 		 * which is emitted during exec(), so perf script
1805 		 * cannot see a correct process name for those events.
1806 		 * Synthesize COMM event to prevent it.
1807 		 */
1808 		tgid = perf_event__synthesize_comm(tool, event,
1809 						   rec->evlist->workload.pid,
1810 						   process_synthesized_event,
1811 						   machine);
1812 		free(event);
1813 
1814 		if (tgid == -1)
1815 			goto out_child;
1816 
1817 		event = malloc(sizeof(event->namespaces) +
1818 			       (NR_NAMESPACES * sizeof(struct perf_ns_link_info)) +
1819 			       machine->id_hdr_size);
1820 		if (event == NULL) {
1821 			err = -ENOMEM;
1822 			goto out_child;
1823 		}
1824 
1825 		/*
1826 		 * Synthesize NAMESPACES event for the command specified.
1827 		 */
1828 		perf_event__synthesize_namespaces(tool, event,
1829 						  rec->evlist->workload.pid,
1830 						  tgid, process_synthesized_event,
1831 						  machine);
1832 		free(event);
1833 
1834 		evlist__start_workload(rec->evlist);
1835 	}
1836 
1837 	if (evlist__initialize_ctlfd(rec->evlist, opts->ctl_fd, opts->ctl_fd_ack))
1838 		goto out_child;
1839 
1840 	if (opts->initial_delay) {
1841 		pr_info(EVLIST_DISABLED_MSG);
1842 		if (opts->initial_delay > 0) {
1843 			usleep(opts->initial_delay * USEC_PER_MSEC);
1844 			evlist__enable(rec->evlist);
1845 			pr_info(EVLIST_ENABLED_MSG);
1846 		}
1847 	}
1848 
1849 	trigger_ready(&auxtrace_snapshot_trigger);
1850 	trigger_ready(&switch_output_trigger);
1851 	perf_hooks__invoke_record_start();
1852 	for (;;) {
1853 		unsigned long long hits = rec->samples;
1854 
1855 		/*
1856 		 * rec->evlist->bkw_mmap_state is possible to be
1857 		 * BKW_MMAP_EMPTY here: when done == true and
1858 		 * hits != rec->samples in previous round.
1859 		 *
1860 		 * evlist__toggle_bkw_mmap ensure we never
1861 		 * convert BKW_MMAP_EMPTY to BKW_MMAP_DATA_PENDING.
1862 		 */
1863 		if (trigger_is_hit(&switch_output_trigger) || done || draining)
1864 			evlist__toggle_bkw_mmap(rec->evlist, BKW_MMAP_DATA_PENDING);
1865 
1866 		if (record__mmap_read_all(rec, false) < 0) {
1867 			trigger_error(&auxtrace_snapshot_trigger);
1868 			trigger_error(&switch_output_trigger);
1869 			err = -1;
1870 			goto out_child;
1871 		}
1872 
1873 		if (auxtrace_record__snapshot_started) {
1874 			auxtrace_record__snapshot_started = 0;
1875 			if (!trigger_is_error(&auxtrace_snapshot_trigger))
1876 				record__read_auxtrace_snapshot(rec, false);
1877 			if (trigger_is_error(&auxtrace_snapshot_trigger)) {
1878 				pr_err("AUX area tracing snapshot failed\n");
1879 				err = -1;
1880 				goto out_child;
1881 			}
1882 		}
1883 
1884 		if (trigger_is_hit(&switch_output_trigger)) {
1885 			/*
1886 			 * If switch_output_trigger is hit, the data in
1887 			 * overwritable ring buffer should have been collected,
1888 			 * so bkw_mmap_state should be set to BKW_MMAP_EMPTY.
1889 			 *
1890 			 * If SIGUSR2 raise after or during record__mmap_read_all(),
1891 			 * record__mmap_read_all() didn't collect data from
1892 			 * overwritable ring buffer. Read again.
1893 			 */
1894 			if (rec->evlist->bkw_mmap_state == BKW_MMAP_RUNNING)
1895 				continue;
1896 			trigger_ready(&switch_output_trigger);
1897 
1898 			/*
1899 			 * Reenable events in overwrite ring buffer after
1900 			 * record__mmap_read_all(): we should have collected
1901 			 * data from it.
1902 			 */
1903 			evlist__toggle_bkw_mmap(rec->evlist, BKW_MMAP_RUNNING);
1904 
1905 			if (!quiet)
1906 				fprintf(stderr, "[ perf record: dump data: Woken up %ld times ]\n",
1907 					waking);
1908 			waking = 0;
1909 			fd = record__switch_output(rec, false);
1910 			if (fd < 0) {
1911 				pr_err("Failed to switch to new file\n");
1912 				trigger_error(&switch_output_trigger);
1913 				err = fd;
1914 				goto out_child;
1915 			}
1916 
1917 			/* re-arm the alarm */
1918 			if (rec->switch_output.time)
1919 				alarm(rec->switch_output.time);
1920 		}
1921 
1922 		if (hits == rec->samples) {
1923 			if (done || draining)
1924 				break;
1925 			err = evlist__poll(rec->evlist, -1);
1926 			/*
1927 			 * Propagate error, only if there's any. Ignore positive
1928 			 * number of returned events and interrupt error.
1929 			 */
1930 			if (err > 0 || (err < 0 && errno == EINTR))
1931 				err = 0;
1932 			waking++;
1933 
1934 			if (evlist__filter_pollfd(rec->evlist, POLLERR | POLLHUP) == 0)
1935 				draining = true;
1936 		}
1937 
1938 		if (evlist__ctlfd_process(rec->evlist, &cmd) > 0) {
1939 			switch (cmd) {
1940 			case EVLIST_CTL_CMD_ENABLE:
1941 				pr_info(EVLIST_ENABLED_MSG);
1942 				break;
1943 			case EVLIST_CTL_CMD_DISABLE:
1944 				pr_info(EVLIST_DISABLED_MSG);
1945 				break;
1946 			case EVLIST_CTL_CMD_SNAPSHOT:
1947 				hit_auxtrace_snapshot_trigger(rec);
1948 				evlist__ctlfd_ack(rec->evlist);
1949 				break;
1950 			case EVLIST_CTL_CMD_ACK:
1951 			case EVLIST_CTL_CMD_UNSUPPORTED:
1952 			default:
1953 				break;
1954 			}
1955 		}
1956 
1957 		/*
1958 		 * When perf is starting the traced process, at the end events
1959 		 * die with the process and we wait for that. Thus no need to
1960 		 * disable events in this case.
1961 		 */
1962 		if (done && !disabled && !target__none(&opts->target)) {
1963 			trigger_off(&auxtrace_snapshot_trigger);
1964 			evlist__disable(rec->evlist);
1965 			disabled = true;
1966 		}
1967 	}
1968 
1969 	trigger_off(&auxtrace_snapshot_trigger);
1970 	trigger_off(&switch_output_trigger);
1971 
1972 	if (opts->auxtrace_snapshot_on_exit)
1973 		record__auxtrace_snapshot_exit(rec);
1974 
1975 	if (forks && workload_exec_errno) {
1976 		char msg[STRERR_BUFSIZE];
1977 		const char *emsg = str_error_r(workload_exec_errno, msg, sizeof(msg));
1978 		pr_err("Workload failed: %s\n", emsg);
1979 		err = -1;
1980 		goto out_child;
1981 	}
1982 
1983 	if (!quiet)
1984 		fprintf(stderr, "[ perf record: Woken up %ld times to write data ]\n", waking);
1985 
1986 	if (target__none(&rec->opts.target))
1987 		record__synthesize_workload(rec, true);
1988 
1989 out_child:
1990 	evlist__finalize_ctlfd(rec->evlist);
1991 	record__mmap_read_all(rec, true);
1992 	record__aio_mmap_read_sync(rec);
1993 
1994 	if (rec->session->bytes_transferred && rec->session->bytes_compressed) {
1995 		ratio = (float)rec->session->bytes_transferred/(float)rec->session->bytes_compressed;
1996 		session->header.env.comp_ratio = ratio + 0.5;
1997 	}
1998 
1999 	if (forks) {
2000 		int exit_status;
2001 
2002 		if (!child_finished)
2003 			kill(rec->evlist->workload.pid, SIGTERM);
2004 
2005 		wait(&exit_status);
2006 
2007 		if (err < 0)
2008 			status = err;
2009 		else if (WIFEXITED(exit_status))
2010 			status = WEXITSTATUS(exit_status);
2011 		else if (WIFSIGNALED(exit_status))
2012 			signr = WTERMSIG(exit_status);
2013 	} else
2014 		status = err;
2015 
2016 	record__synthesize(rec, true);
2017 	/* this will be recalculated during process_buildids() */
2018 	rec->samples = 0;
2019 
2020 	if (!err) {
2021 		if (!rec->timestamp_filename) {
2022 			record__finish_output(rec);
2023 		} else {
2024 			fd = record__switch_output(rec, true);
2025 			if (fd < 0) {
2026 				status = fd;
2027 				goto out_delete_session;
2028 			}
2029 		}
2030 	}
2031 
2032 	perf_hooks__invoke_record_end();
2033 
2034 	if (!err && !quiet) {
2035 		char samples[128];
2036 		const char *postfix = rec->timestamp_filename ?
2037 					".<timestamp>" : "";
2038 
2039 		if (rec->samples && !rec->opts.full_auxtrace)
2040 			scnprintf(samples, sizeof(samples),
2041 				  " (%" PRIu64 " samples)", rec->samples);
2042 		else
2043 			samples[0] = '\0';
2044 
2045 		fprintf(stderr,	"[ perf record: Captured and wrote %.3f MB %s%s%s",
2046 			perf_data__size(data) / 1024.0 / 1024.0,
2047 			data->path, postfix, samples);
2048 		if (ratio) {
2049 			fprintf(stderr,	", compressed (original %.3f MB, ratio is %.3f)",
2050 					rec->session->bytes_transferred / 1024.0 / 1024.0,
2051 					ratio);
2052 		}
2053 		fprintf(stderr, " ]\n");
2054 	}
2055 
2056 out_delete_session:
2057 #ifdef HAVE_EVENTFD_SUPPORT
2058 	if (done_fd >= 0)
2059 		close(done_fd);
2060 #endif
2061 	zstd_fini(&session->zstd_data);
2062 	perf_session__delete(session);
2063 
2064 	if (!opts->no_bpf_event)
2065 		evlist__stop_sb_thread(rec->sb_evlist);
2066 	return status;
2067 }
2068 
2069 static void callchain_debug(struct callchain_param *callchain)
2070 {
2071 	static const char *str[CALLCHAIN_MAX] = { "NONE", "FP", "DWARF", "LBR" };
2072 
2073 	pr_debug("callchain: type %s\n", str[callchain->record_mode]);
2074 
2075 	if (callchain->record_mode == CALLCHAIN_DWARF)
2076 		pr_debug("callchain: stack dump size %d\n",
2077 			 callchain->dump_size);
2078 }
2079 
2080 int record_opts__parse_callchain(struct record_opts *record,
2081 				 struct callchain_param *callchain,
2082 				 const char *arg, bool unset)
2083 {
2084 	int ret;
2085 	callchain->enabled = !unset;
2086 
2087 	/* --no-call-graph */
2088 	if (unset) {
2089 		callchain->record_mode = CALLCHAIN_NONE;
2090 		pr_debug("callchain: disabled\n");
2091 		return 0;
2092 	}
2093 
2094 	ret = parse_callchain_record_opt(arg, callchain);
2095 	if (!ret) {
2096 		/* Enable data address sampling for DWARF unwind. */
2097 		if (callchain->record_mode == CALLCHAIN_DWARF)
2098 			record->sample_address = true;
2099 		callchain_debug(callchain);
2100 	}
2101 
2102 	return ret;
2103 }
2104 
2105 int record_parse_callchain_opt(const struct option *opt,
2106 			       const char *arg,
2107 			       int unset)
2108 {
2109 	return record_opts__parse_callchain(opt->value, &callchain_param, arg, unset);
2110 }
2111 
2112 int record_callchain_opt(const struct option *opt,
2113 			 const char *arg __maybe_unused,
2114 			 int unset __maybe_unused)
2115 {
2116 	struct callchain_param *callchain = opt->value;
2117 
2118 	callchain->enabled = true;
2119 
2120 	if (callchain->record_mode == CALLCHAIN_NONE)
2121 		callchain->record_mode = CALLCHAIN_FP;
2122 
2123 	callchain_debug(callchain);
2124 	return 0;
2125 }
2126 
2127 static int perf_record_config(const char *var, const char *value, void *cb)
2128 {
2129 	struct record *rec = cb;
2130 
2131 	if (!strcmp(var, "record.build-id")) {
2132 		if (!strcmp(value, "cache"))
2133 			rec->no_buildid_cache = false;
2134 		else if (!strcmp(value, "no-cache"))
2135 			rec->no_buildid_cache = true;
2136 		else if (!strcmp(value, "skip"))
2137 			rec->no_buildid = true;
2138 		else
2139 			return -1;
2140 		return 0;
2141 	}
2142 	if (!strcmp(var, "record.call-graph")) {
2143 		var = "call-graph.record-mode";
2144 		return perf_default_config(var, value, cb);
2145 	}
2146 #ifdef HAVE_AIO_SUPPORT
2147 	if (!strcmp(var, "record.aio")) {
2148 		rec->opts.nr_cblocks = strtol(value, NULL, 0);
2149 		if (!rec->opts.nr_cblocks)
2150 			rec->opts.nr_cblocks = nr_cblocks_default;
2151 	}
2152 #endif
2153 
2154 	return 0;
2155 }
2156 
2157 
2158 static int record__parse_affinity(const struct option *opt, const char *str, int unset)
2159 {
2160 	struct record_opts *opts = (struct record_opts *)opt->value;
2161 
2162 	if (unset || !str)
2163 		return 0;
2164 
2165 	if (!strcasecmp(str, "node"))
2166 		opts->affinity = PERF_AFFINITY_NODE;
2167 	else if (!strcasecmp(str, "cpu"))
2168 		opts->affinity = PERF_AFFINITY_CPU;
2169 
2170 	return 0;
2171 }
2172 
2173 static int parse_output_max_size(const struct option *opt,
2174 				 const char *str, int unset)
2175 {
2176 	unsigned long *s = (unsigned long *)opt->value;
2177 	static struct parse_tag tags_size[] = {
2178 		{ .tag  = 'B', .mult = 1       },
2179 		{ .tag  = 'K', .mult = 1 << 10 },
2180 		{ .tag  = 'M', .mult = 1 << 20 },
2181 		{ .tag  = 'G', .mult = 1 << 30 },
2182 		{ .tag  = 0 },
2183 	};
2184 	unsigned long val;
2185 
2186 	if (unset) {
2187 		*s = 0;
2188 		return 0;
2189 	}
2190 
2191 	val = parse_tag_value(str, tags_size);
2192 	if (val != (unsigned long) -1) {
2193 		*s = val;
2194 		return 0;
2195 	}
2196 
2197 	return -1;
2198 }
2199 
2200 static int record__parse_mmap_pages(const struct option *opt,
2201 				    const char *str,
2202 				    int unset __maybe_unused)
2203 {
2204 	struct record_opts *opts = opt->value;
2205 	char *s, *p;
2206 	unsigned int mmap_pages;
2207 	int ret;
2208 
2209 	if (!str)
2210 		return -EINVAL;
2211 
2212 	s = strdup(str);
2213 	if (!s)
2214 		return -ENOMEM;
2215 
2216 	p = strchr(s, ',');
2217 	if (p)
2218 		*p = '\0';
2219 
2220 	if (*s) {
2221 		ret = __evlist__parse_mmap_pages(&mmap_pages, s);
2222 		if (ret)
2223 			goto out_free;
2224 		opts->mmap_pages = mmap_pages;
2225 	}
2226 
2227 	if (!p) {
2228 		ret = 0;
2229 		goto out_free;
2230 	}
2231 
2232 	ret = __evlist__parse_mmap_pages(&mmap_pages, p + 1);
2233 	if (ret)
2234 		goto out_free;
2235 
2236 	opts->auxtrace_mmap_pages = mmap_pages;
2237 
2238 out_free:
2239 	free(s);
2240 	return ret;
2241 }
2242 
2243 static int parse_control_option(const struct option *opt,
2244 				const char *str,
2245 				int unset __maybe_unused)
2246 {
2247 	struct record_opts *opts = opt->value;
2248 
2249 	return evlist__parse_control(str, &opts->ctl_fd, &opts->ctl_fd_ack, &opts->ctl_fd_close);
2250 }
2251 
2252 static void switch_output_size_warn(struct record *rec)
2253 {
2254 	u64 wakeup_size = evlist__mmap_size(rec->opts.mmap_pages);
2255 	struct switch_output *s = &rec->switch_output;
2256 
2257 	wakeup_size /= 2;
2258 
2259 	if (s->size < wakeup_size) {
2260 		char buf[100];
2261 
2262 		unit_number__scnprintf(buf, sizeof(buf), wakeup_size);
2263 		pr_warning("WARNING: switch-output data size lower than "
2264 			   "wakeup kernel buffer size (%s) "
2265 			   "expect bigger perf.data sizes\n", buf);
2266 	}
2267 }
2268 
2269 static int switch_output_setup(struct record *rec)
2270 {
2271 	struct switch_output *s = &rec->switch_output;
2272 	static struct parse_tag tags_size[] = {
2273 		{ .tag  = 'B', .mult = 1       },
2274 		{ .tag  = 'K', .mult = 1 << 10 },
2275 		{ .tag  = 'M', .mult = 1 << 20 },
2276 		{ .tag  = 'G', .mult = 1 << 30 },
2277 		{ .tag  = 0 },
2278 	};
2279 	static struct parse_tag tags_time[] = {
2280 		{ .tag  = 's', .mult = 1        },
2281 		{ .tag  = 'm', .mult = 60       },
2282 		{ .tag  = 'h', .mult = 60*60    },
2283 		{ .tag  = 'd', .mult = 60*60*24 },
2284 		{ .tag  = 0 },
2285 	};
2286 	unsigned long val;
2287 
2288 	/*
2289 	 * If we're using --switch-output-events, then we imply its
2290 	 * --switch-output=signal, as we'll send a SIGUSR2 from the side band
2291 	 *  thread to its parent.
2292 	 */
2293 	if (rec->switch_output_event_set)
2294 		goto do_signal;
2295 
2296 	if (!s->set)
2297 		return 0;
2298 
2299 	if (!strcmp(s->str, "signal")) {
2300 do_signal:
2301 		s->signal = true;
2302 		pr_debug("switch-output with SIGUSR2 signal\n");
2303 		goto enabled;
2304 	}
2305 
2306 	val = parse_tag_value(s->str, tags_size);
2307 	if (val != (unsigned long) -1) {
2308 		s->size = val;
2309 		pr_debug("switch-output with %s size threshold\n", s->str);
2310 		goto enabled;
2311 	}
2312 
2313 	val = parse_tag_value(s->str, tags_time);
2314 	if (val != (unsigned long) -1) {
2315 		s->time = val;
2316 		pr_debug("switch-output with %s time threshold (%lu seconds)\n",
2317 			 s->str, s->time);
2318 		goto enabled;
2319 	}
2320 
2321 	return -1;
2322 
2323 enabled:
2324 	rec->timestamp_filename = true;
2325 	s->enabled              = true;
2326 
2327 	if (s->size && !rec->opts.no_buffering)
2328 		switch_output_size_warn(rec);
2329 
2330 	return 0;
2331 }
2332 
/* NULL-terminated synopsis lines for 'perf record'. */
static const char * const __record_usage[] = {
	"perf record [<options>] [<command>]",
	"perf record [<options>] -- <command> [<options>]",
	NULL
};
/* Non-static pointer to the table above; cmd_record() hands it to the option parser. */
const char * const *record_usage = __record_usage;
2339 
2340 static int build_id__process_mmap(struct perf_tool *tool, union perf_event *event,
2341 				  struct perf_sample *sample, struct machine *machine)
2342 {
2343 	/*
2344 	 * We already have the kernel maps, put in place via perf_session__create_kernel_maps()
2345 	 * no need to add them twice.
2346 	 */
2347 	if (!(event->header.misc & PERF_RECORD_MISC_USER))
2348 		return 0;
2349 	return perf_event__process_mmap(tool, event, sample, machine);
2350 }
2351 
2352 static int build_id__process_mmap2(struct perf_tool *tool, union perf_event *event,
2353 				   struct perf_sample *sample, struct machine *machine)
2354 {
2355 	/*
2356 	 * We already have the kernel maps, put in place via perf_session__create_kernel_maps()
2357 	 * no need to add them twice.
2358 	 */
2359 	if (!(event->header.misc & PERF_RECORD_MISC_USER))
2360 		return 0;
2361 
2362 	return perf_event__process_mmap2(tool, event, sample, machine);
2363 }
2364 
2365 /*
2366  * XXX Ideally would be local to cmd_record() and passed to a record__new
2367  * because we need to have access to it in record__exit, that is called
2368  * after cmd_record() exits, but since record_options need to be accessible to
2369  * builtin-script, leave it here.
2370  *
 * At least we don't touch it in all the other functions here directly.
2372  *
2373  * Just say no to tons of global variables, sigh.
2374  */
/* The single 'perf record' state object; cmd_record() works on it via a pointer. */
static struct record record = {
	/* Defaults in effect before the config file and command line are applied. */
	.opts = {
		.sample_time	     = true,
		/* NOTE(review): the *_MAX values look like "not set by the user" markers - confirm */
		.mmap_pages	     = UINT_MAX,
		.user_freq	     = UINT_MAX,
		.user_interval	     = ULLONG_MAX,
		.freq		     = 4000,
		.target		     = {
			.uses_mmap   = true,
			.default_per_cpu = true,
		},
		.mmap_flush          = MMAP_FLUSH_DEFAULT,
		.nr_threads_synthesize = 1,
		/* NOTE(review): -1 presumably means "no control descriptor given" - confirm */
		.ctl_fd              = -1,
		.ctl_fd_ack          = -1,
	},
	/* Event handlers of the perf_tool embedded in the record state. */
	.tool = {
		.sample		= process_sample_event,
		.fork		= perf_event__process_fork,
		.exit		= perf_event__process_exit,
		.comm		= perf_event__process_comm,
		.namespaces	= perf_event__process_namespaces,
		/* The build_id__ wrappers forward only PERF_RECORD_MISC_USER events. */
		.mmap		= build_id__process_mmap,
		.mmap2		= build_id__process_mmap2,
		.ordered_events	= true,
	},
};
2402 
/* Help text of --call-graph: CALLCHAIN_RECORD_HELP with the record default appended. */
const char record_callchain_help[] = CALLCHAIN_RECORD_HELP
	"\n\t\t\t\tDefault: fp";
2405 
/* Set by --dry-run: cmd_record() bails out right after record__auxtrace_init(). */
static bool dry_run;
2407 
2408 /*
2409  * XXX Will stay a global variable till we fix builtin-script.c to stop messing
2410  * with it and switch to use the library functions in perf_evlist that came
2411  * from builtin-record.c, i.e. use record_opts,
2412  * evlist__prepare_workload, etc instead of fork+exec'in 'perf record',
2413  * using pipes, etc.
2414  */
2415 static struct option __record_options[] = {
2416 	OPT_CALLBACK('e', "event", &record.evlist, "event",
2417 		     "event selector. use 'perf list' to list available events",
2418 		     parse_events_option),
2419 	OPT_CALLBACK(0, "filter", &record.evlist, "filter",
2420 		     "event filter", parse_filter),
2421 	OPT_CALLBACK_NOOPT(0, "exclude-perf", &record.evlist,
2422 			   NULL, "don't record events from perf itself",
2423 			   exclude_perf),
2424 	OPT_STRING('p', "pid", &record.opts.target.pid, "pid",
2425 		    "record events on existing process id"),
2426 	OPT_STRING('t', "tid", &record.opts.target.tid, "tid",
2427 		    "record events on existing thread id"),
2428 	OPT_INTEGER('r', "realtime", &record.realtime_prio,
2429 		    "collect data with this RT SCHED_FIFO priority"),
2430 	OPT_BOOLEAN(0, "no-buffering", &record.opts.no_buffering,
2431 		    "collect data without buffering"),
2432 	OPT_BOOLEAN('R', "raw-samples", &record.opts.raw_samples,
2433 		    "collect raw sample records from all opened counters"),
2434 	OPT_BOOLEAN('a', "all-cpus", &record.opts.target.system_wide,
2435 			    "system-wide collection from all CPUs"),
2436 	OPT_STRING('C', "cpu", &record.opts.target.cpu_list, "cpu",
2437 		    "list of cpus to monitor"),
2438 	OPT_U64('c', "count", &record.opts.user_interval, "event period to sample"),
2439 	OPT_STRING('o', "output", &record.data.path, "file",
2440 		    "output file name"),
2441 	OPT_BOOLEAN_SET('i', "no-inherit", &record.opts.no_inherit,
2442 			&record.opts.no_inherit_set,
2443 			"child tasks do not inherit counters"),
2444 	OPT_BOOLEAN(0, "tail-synthesize", &record.opts.tail_synthesize,
2445 		    "synthesize non-sample events at the end of output"),
2446 	OPT_BOOLEAN(0, "overwrite", &record.opts.overwrite, "use overwrite mode"),
2447 	OPT_BOOLEAN(0, "no-bpf-event", &record.opts.no_bpf_event, "do not record bpf events"),
2448 	OPT_BOOLEAN(0, "strict-freq", &record.opts.strict_freq,
2449 		    "Fail if the specified frequency can't be used"),
2450 	OPT_CALLBACK('F', "freq", &record.opts, "freq or 'max'",
2451 		     "profile at this frequency",
2452 		      record__parse_freq),
2453 	OPT_CALLBACK('m', "mmap-pages", &record.opts, "pages[,pages]",
2454 		     "number of mmap data pages and AUX area tracing mmap pages",
2455 		     record__parse_mmap_pages),
2456 	OPT_CALLBACK(0, "mmap-flush", &record.opts, "number",
2457 		     "Minimal number of bytes that is extracted from mmap data pages (default: 1)",
2458 		     record__mmap_flush_parse),
2459 	OPT_BOOLEAN(0, "group", &record.opts.group,
2460 		    "put the counters into a counter group"),
2461 	OPT_CALLBACK_NOOPT('g', NULL, &callchain_param,
2462 			   NULL, "enables call-graph recording" ,
2463 			   &record_callchain_opt),
2464 	OPT_CALLBACK(0, "call-graph", &record.opts,
2465 		     "record_mode[,record_size]", record_callchain_help,
2466 		     &record_parse_callchain_opt),
2467 	OPT_INCR('v', "verbose", &verbose,
2468 		    "be more verbose (show counter open errors, etc)"),
2469 	OPT_BOOLEAN('q', "quiet", &quiet, "don't print any message"),
2470 	OPT_BOOLEAN('s', "stat", &record.opts.inherit_stat,
2471 		    "per thread counts"),
2472 	OPT_BOOLEAN('d', "data", &record.opts.sample_address, "Record the sample addresses"),
2473 	OPT_BOOLEAN(0, "phys-data", &record.opts.sample_phys_addr,
2474 		    "Record the sample physical addresses"),
2475 	OPT_BOOLEAN(0, "data-page-size", &record.opts.sample_data_page_size,
2476 		    "Record the sampled data address data page size"),
2477 	OPT_BOOLEAN(0, "sample-cpu", &record.opts.sample_cpu, "Record the sample cpu"),
2478 	OPT_BOOLEAN_SET('T', "timestamp", &record.opts.sample_time,
2479 			&record.opts.sample_time_set,
2480 			"Record the sample timestamps"),
2481 	OPT_BOOLEAN_SET('P', "period", &record.opts.period, &record.opts.period_set,
2482 			"Record the sample period"),
2483 	OPT_BOOLEAN('n', "no-samples", &record.opts.no_samples,
2484 		    "don't sample"),
2485 	OPT_BOOLEAN_SET('N', "no-buildid-cache", &record.no_buildid_cache,
2486 			&record.no_buildid_cache_set,
2487 			"do not update the buildid cache"),
2488 	OPT_BOOLEAN_SET('B', "no-buildid", &record.no_buildid,
2489 			&record.no_buildid_set,
2490 			"do not collect buildids in perf.data"),
2491 	OPT_CALLBACK('G', "cgroup", &record.evlist, "name",
2492 		     "monitor event in cgroup name only",
2493 		     parse_cgroups),
2494 	OPT_INTEGER('D', "delay", &record.opts.initial_delay,
2495 		  "ms to wait before starting measurement after program start (-1: start with events disabled)"),
2496 	OPT_BOOLEAN(0, "kcore", &record.opts.kcore, "copy /proc/kcore"),
2497 	OPT_STRING('u', "uid", &record.opts.target.uid_str, "user",
2498 		   "user to profile"),
2499 
2500 	OPT_CALLBACK_NOOPT('b', "branch-any", &record.opts.branch_stack,
2501 		     "branch any", "sample any taken branches",
2502 		     parse_branch_stack),
2503 
2504 	OPT_CALLBACK('j', "branch-filter", &record.opts.branch_stack,
2505 		     "branch filter mask", "branch stack filter modes",
2506 		     parse_branch_stack),
2507 	OPT_BOOLEAN('W', "weight", &record.opts.sample_weight,
2508 		    "sample by weight (on special events only)"),
2509 	OPT_BOOLEAN(0, "transaction", &record.opts.sample_transaction,
2510 		    "sample transaction flags (special events only)"),
2511 	OPT_BOOLEAN(0, "per-thread", &record.opts.target.per_thread,
2512 		    "use per-thread mmaps"),
2513 	OPT_CALLBACK_OPTARG('I', "intr-regs", &record.opts.sample_intr_regs, NULL, "any register",
2514 		    "sample selected machine registers on interrupt,"
2515 		    " use '-I?' to list register names", parse_intr_regs),
2516 	OPT_CALLBACK_OPTARG(0, "user-regs", &record.opts.sample_user_regs, NULL, "any register",
2517 		    "sample selected machine registers on interrupt,"
2518 		    " use '--user-regs=?' to list register names", parse_user_regs),
2519 	OPT_BOOLEAN(0, "running-time", &record.opts.running_time,
2520 		    "Record running/enabled time of read (:S) events"),
2521 	OPT_CALLBACK('k', "clockid", &record.opts,
2522 	"clockid", "clockid to use for events, see clock_gettime()",
2523 	parse_clockid),
2524 	OPT_STRING_OPTARG('S', "snapshot", &record.opts.auxtrace_snapshot_opts,
2525 			  "opts", "AUX area tracing Snapshot Mode", ""),
2526 	OPT_STRING_OPTARG(0, "aux-sample", &record.opts.auxtrace_sample_opts,
2527 			  "opts", "sample AUX area", ""),
2528 	OPT_UINTEGER(0, "proc-map-timeout", &proc_map_timeout,
2529 			"per thread proc mmap processing timeout in ms"),
2530 	OPT_BOOLEAN(0, "namespaces", &record.opts.record_namespaces,
2531 		    "Record namespaces events"),
2532 	OPT_BOOLEAN(0, "all-cgroups", &record.opts.record_cgroup,
2533 		    "Record cgroup events"),
2534 	OPT_BOOLEAN_SET(0, "switch-events", &record.opts.record_switch_events,
2535 			&record.opts.record_switch_events_set,
2536 			"Record context switch events"),
2537 	OPT_BOOLEAN_FLAG(0, "all-kernel", &record.opts.all_kernel,
2538 			 "Configure all used events to run in kernel space.",
2539 			 PARSE_OPT_EXCLUSIVE),
2540 	OPT_BOOLEAN_FLAG(0, "all-user", &record.opts.all_user,
2541 			 "Configure all used events to run in user space.",
2542 			 PARSE_OPT_EXCLUSIVE),
2543 	OPT_BOOLEAN(0, "kernel-callchains", &record.opts.kernel_callchains,
2544 		    "collect kernel callchains"),
2545 	OPT_BOOLEAN(0, "user-callchains", &record.opts.user_callchains,
2546 		    "collect user callchains"),
2547 	OPT_STRING(0, "clang-path", &llvm_param.clang_path, "clang path",
2548 		   "clang binary to use for compiling BPF scriptlets"),
2549 	OPT_STRING(0, "clang-opt", &llvm_param.clang_opt, "clang options",
2550 		   "options passed to clang when compiling BPF scriptlets"),
2551 	OPT_STRING(0, "vmlinux", &symbol_conf.vmlinux_name,
2552 		   "file", "vmlinux pathname"),
2553 	OPT_BOOLEAN(0, "buildid-all", &record.buildid_all,
2554 		    "Record build-id of all DSOs regardless of hits"),
2555 	OPT_BOOLEAN(0, "timestamp-filename", &record.timestamp_filename,
2556 		    "append timestamp to output filename"),
2557 	OPT_BOOLEAN(0, "timestamp-boundary", &record.timestamp_boundary,
2558 		    "Record timestamp boundary (time of first/last samples)"),
2559 	OPT_STRING_OPTARG_SET(0, "switch-output", &record.switch_output.str,
2560 			  &record.switch_output.set, "signal or size[BKMG] or time[smhd]",
2561 			  "Switch output when receiving SIGUSR2 (signal) or cross a size or time threshold",
2562 			  "signal"),
2563 	OPT_CALLBACK_SET(0, "switch-output-event", &record.sb_evlist, &record.switch_output_event_set, "switch output event",
2564 			 "switch output event selector. use 'perf list' to list available events",
2565 			 parse_events_option_new_evlist),
2566 	OPT_INTEGER(0, "switch-max-files", &record.switch_output.num_files,
2567 		   "Limit number of switch output generated files"),
2568 	OPT_BOOLEAN(0, "dry-run", &dry_run,
2569 		    "Parse options then exit"),
2570 #ifdef HAVE_AIO_SUPPORT
2571 	OPT_CALLBACK_OPTARG(0, "aio", &record.opts,
2572 		     &nr_cblocks_default, "n", "Use <n> control blocks in asynchronous trace writing mode (default: 1, max: 4)",
2573 		     record__aio_parse),
2574 #endif
2575 	OPT_CALLBACK(0, "affinity", &record.opts, "node|cpu",
2576 		     "Set affinity mask of trace reading thread to NUMA node cpu mask or cpu of processed mmap buffer",
2577 		     record__parse_affinity),
2578 #ifdef HAVE_ZSTD_SUPPORT
2579 	OPT_CALLBACK_OPTARG('z', "compression-level", &record.opts, &comp_level_default,
2580 			    "n", "Compressed records using specified level (default: 1 - fastest compression, 22 - greatest compression)",
2581 			    record__parse_comp_level),
2582 #endif
2583 	OPT_CALLBACK(0, "max-size", &record.output_max_size,
2584 		     "size", "Limit the maximum size of the output file", parse_output_max_size),
2585 	OPT_UINTEGER(0, "num-thread-synthesize",
2586 		     &record.opts.nr_threads_synthesize,
2587 		     "number of threads to run for event synthesis"),
2588 #ifdef HAVE_LIBPFM
2589 	OPT_CALLBACK(0, "pfm-events", &record.evlist, "event",
2590 		"libpfm4 event selector. use 'perf list' to list available events",
2591 		parse_libpfm_events_option),
2592 #endif
2593 	OPT_CALLBACK(0, "control", &record.opts, "fd:ctl-fd[,ack-fd] or fifo:ctl-fifo[,ack-fifo]",
2594 		     "Listen on ctl-fd descriptor for command to control measurement ('enable': enable events, 'disable': disable events,\n"
2595 		     "\t\t\t  'snapshot': AUX area tracing snapshot).\n"
2596 		     "\t\t\t  Optionally send control command completion ('ack\\n') to ack-fd descriptor.\n"
2597 		     "\t\t\t  Alternatively, ctl-fifo / ack-fifo will be opened and used as ctl-fd / ack-fd.",
2598 		      parse_control_option),
2599 	OPT_END()
2600 };
2601 
/* Non-static pointer to the option table; cmd_record() passes it to parse_options(). */
struct option *record_options = __record_options;
2603 
2604 int cmd_record(int argc, const char **argv)
2605 {
2606 	int err;
2607 	struct record *rec = &record;
2608 	char errbuf[BUFSIZ];
2609 
2610 	setlocale(LC_ALL, "");
2611 
2612 #ifndef HAVE_LIBBPF_SUPPORT
2613 # define set_nobuild(s, l, c) set_option_nobuild(record_options, s, l, "NO_LIBBPF=1", c)
2614 	set_nobuild('\0', "clang-path", true);
2615 	set_nobuild('\0', "clang-opt", true);
2616 # undef set_nobuild
2617 #endif
2618 
2619 #ifndef HAVE_BPF_PROLOGUE
2620 # if !defined (HAVE_DWARF_SUPPORT)
2621 #  define REASON  "NO_DWARF=1"
2622 # elif !defined (HAVE_LIBBPF_SUPPORT)
2623 #  define REASON  "NO_LIBBPF=1"
2624 # else
2625 #  define REASON  "this architecture doesn't support BPF prologue"
2626 # endif
2627 # define set_nobuild(s, l, c) set_option_nobuild(record_options, s, l, REASON, c)
2628 	set_nobuild('\0', "vmlinux", true);
2629 # undef set_nobuild
2630 # undef REASON
2631 #endif
2632 
2633 	rec->opts.affinity = PERF_AFFINITY_SYS;
2634 
2635 	rec->evlist = evlist__new();
2636 	if (rec->evlist == NULL)
2637 		return -ENOMEM;
2638 
2639 	err = perf_config(perf_record_config, rec);
2640 	if (err)
2641 		return err;
2642 
2643 	argc = parse_options(argc, argv, record_options, record_usage,
2644 			    PARSE_OPT_STOP_AT_NON_OPTION);
2645 	if (quiet)
2646 		perf_quiet_option();
2647 
2648 	/* Make system wide (-a) the default target. */
2649 	if (!argc && target__none(&rec->opts.target))
2650 		rec->opts.target.system_wide = true;
2651 
2652 	if (nr_cgroups && !rec->opts.target.system_wide) {
2653 		usage_with_options_msg(record_usage, record_options,
2654 			"cgroup monitoring only available in system-wide mode");
2655 
2656 	}
2657 
2658 	if (rec->opts.kcore)
2659 		rec->data.is_dir = true;
2660 
2661 	if (rec->opts.comp_level != 0) {
2662 		pr_debug("Compression enabled, disabling build id collection at the end of the session.\n");
2663 		rec->no_buildid = true;
2664 	}
2665 
2666 	if (rec->opts.record_switch_events &&
2667 	    !perf_can_record_switch_events()) {
2668 		ui__error("kernel does not support recording context switch events\n");
2669 		parse_options_usage(record_usage, record_options, "switch-events", 0);
2670 		err = -EINVAL;
2671 		goto out_opts;
2672 	}
2673 
2674 	if (switch_output_setup(rec)) {
2675 		parse_options_usage(record_usage, record_options, "switch-output", 0);
2676 		err = -EINVAL;
2677 		goto out_opts;
2678 	}
2679 
2680 	if (rec->switch_output.time) {
2681 		signal(SIGALRM, alarm_sig_handler);
2682 		alarm(rec->switch_output.time);
2683 	}
2684 
2685 	if (rec->switch_output.num_files) {
2686 		rec->switch_output.filenames = calloc(sizeof(char *),
2687 						      rec->switch_output.num_files);
2688 		if (!rec->switch_output.filenames) {
2689 			err = -EINVAL;
2690 			goto out_opts;
2691 		}
2692 	}
2693 
2694 	/*
2695 	 * Allow aliases to facilitate the lookup of symbols for address
2696 	 * filters. Refer to auxtrace_parse_filters().
2697 	 */
2698 	symbol_conf.allow_aliases = true;
2699 
2700 	symbol__init(NULL);
2701 
2702 	if (rec->opts.affinity != PERF_AFFINITY_SYS) {
2703 		rec->affinity_mask.nbits = cpu__max_cpu();
2704 		rec->affinity_mask.bits = bitmap_alloc(rec->affinity_mask.nbits);
2705 		if (!rec->affinity_mask.bits) {
2706 			pr_err("Failed to allocate thread mask for %zd cpus\n", rec->affinity_mask.nbits);
2707 			err = -ENOMEM;
2708 			goto out_opts;
2709 		}
2710 		pr_debug2("thread mask[%zd]: empty\n", rec->affinity_mask.nbits);
2711 	}
2712 
2713 	err = record__auxtrace_init(rec);
2714 	if (err)
2715 		goto out;
2716 
2717 	if (dry_run)
2718 		goto out;
2719 
2720 	err = bpf__setup_stdout(rec->evlist);
2721 	if (err) {
2722 		bpf__strerror_setup_stdout(rec->evlist, err, errbuf, sizeof(errbuf));
2723 		pr_err("ERROR: Setup BPF stdout failed: %s\n",
2724 			 errbuf);
2725 		goto out;
2726 	}
2727 
2728 	err = -ENOMEM;
2729 
2730 	if (rec->no_buildid_cache || rec->no_buildid) {
2731 		disable_buildid_cache();
2732 	} else if (rec->switch_output.enabled) {
2733 		/*
2734 		 * In 'perf record --switch-output', disable buildid
2735 		 * generation by default to reduce data file switching
2736 		 * overhead. Still generate buildid if they are required
2737 		 * explicitly using
2738 		 *
2739 		 *  perf record --switch-output --no-no-buildid \
2740 		 *              --no-no-buildid-cache
2741 		 *
2742 		 * Following code equals to:
2743 		 *
2744 		 * if ((rec->no_buildid || !rec->no_buildid_set) &&
2745 		 *     (rec->no_buildid_cache || !rec->no_buildid_cache_set))
2746 		 *         disable_buildid_cache();
2747 		 */
2748 		bool disable = true;
2749 
2750 		if (rec->no_buildid_set && !rec->no_buildid)
2751 			disable = false;
2752 		if (rec->no_buildid_cache_set && !rec->no_buildid_cache)
2753 			disable = false;
2754 		if (disable) {
2755 			rec->no_buildid = true;
2756 			rec->no_buildid_cache = true;
2757 			disable_buildid_cache();
2758 		}
2759 	}
2760 
2761 	if (record.opts.overwrite)
2762 		record.opts.tail_synthesize = true;
2763 
2764 	if (rec->evlist->core.nr_entries == 0 &&
2765 	    __evlist__add_default(rec->evlist, !record.opts.no_samples) < 0) {
2766 		pr_err("Not enough memory for event selector list\n");
2767 		goto out;
2768 	}
2769 
2770 	if (rec->opts.target.tid && !rec->opts.no_inherit_set)
2771 		rec->opts.no_inherit = true;
2772 
2773 	err = target__validate(&rec->opts.target);
2774 	if (err) {
2775 		target__strerror(&rec->opts.target, err, errbuf, BUFSIZ);
2776 		ui__warning("%s\n", errbuf);
2777 	}
2778 
2779 	err = target__parse_uid(&rec->opts.target);
2780 	if (err) {
2781 		int saved_errno = errno;
2782 
2783 		target__strerror(&rec->opts.target, err, errbuf, BUFSIZ);
2784 		ui__error("%s", errbuf);
2785 
2786 		err = -saved_errno;
2787 		goto out;
2788 	}
2789 
2790 	/* Enable ignoring missing threads when -u/-p option is defined. */
2791 	rec->opts.ignore_missing_thread = rec->opts.target.uid != UINT_MAX || rec->opts.target.pid;
2792 
2793 	err = -ENOMEM;
2794 	if (evlist__create_maps(rec->evlist, &rec->opts.target) < 0)
2795 		usage_with_options(record_usage, record_options);
2796 
2797 	err = auxtrace_record__options(rec->itr, rec->evlist, &rec->opts);
2798 	if (err)
2799 		goto out;
2800 
2801 	/*
2802 	 * We take all buildids when the file contains
2803 	 * AUX area tracing data because we do not decode the
2804 	 * trace because it would take too long.
2805 	 */
2806 	if (rec->opts.full_auxtrace)
2807 		rec->buildid_all = true;
2808 
2809 	if (rec->opts.text_poke) {
2810 		err = record__config_text_poke(rec->evlist);
2811 		if (err) {
2812 			pr_err("record__config_text_poke failed, error %d\n", err);
2813 			goto out;
2814 		}
2815 	}
2816 
2817 	if (record_opts__config(&rec->opts)) {
2818 		err = -EINVAL;
2819 		goto out;
2820 	}
2821 
2822 	if (rec->opts.nr_cblocks > nr_cblocks_max)
2823 		rec->opts.nr_cblocks = nr_cblocks_max;
2824 	pr_debug("nr_cblocks: %d\n", rec->opts.nr_cblocks);
2825 
2826 	pr_debug("affinity: %s\n", affinity_tags[rec->opts.affinity]);
2827 	pr_debug("mmap flush: %d\n", rec->opts.mmap_flush);
2828 
2829 	if (rec->opts.comp_level > comp_level_max)
2830 		rec->opts.comp_level = comp_level_max;
2831 	pr_debug("comp level: %d\n", rec->opts.comp_level);
2832 
2833 	err = __cmd_record(&record, argc, argv);
2834 out:
2835 	bitmap_free(rec->affinity_mask.bits);
2836 	evlist__delete(rec->evlist);
2837 	symbol__exit();
2838 	auxtrace_record__free(rec->itr);
2839 out_opts:
2840 	evlist__close_control(rec->opts.ctl_fd, rec->opts.ctl_fd_ack, &rec->opts.ctl_fd_close);
2841 	return err;
2842 }
2843 
2844 static void snapshot_sig_handler(int sig __maybe_unused)
2845 {
2846 	struct record *rec = &record;
2847 
2848 	hit_auxtrace_snapshot_trigger(rec);
2849 
2850 	if (switch_output_signal(rec))
2851 		trigger_hit(&switch_output_trigger);
2852 }
2853 
2854 static void alarm_sig_handler(int sig __maybe_unused)
2855 {
2856 	struct record *rec = &record;
2857 
2858 	if (switch_output_time(rec))
2859 		trigger_hit(&switch_output_trigger);
2860 }
2861