xref: /openbmc/linux/tools/perf/builtin-record.c (revision 59b4412f)
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * builtin-record.c
4  *
5  * Builtin record command: Record the profile of a workload
6  * (or a CPU, or a PID) into the perf.data output file - for
7  * later analysis via perf report.
8  */
9 #include "builtin.h"
10 
11 #include "util/build-id.h"
12 #include <subcmd/parse-options.h>
13 #include "util/parse-events.h"
14 #include "util/config.h"
15 
16 #include "util/callchain.h"
17 #include "util/cgroup.h"
18 #include "util/header.h"
19 #include "util/event.h"
20 #include "util/evlist.h"
21 #include "util/evsel.h"
22 #include "util/debug.h"
23 #include "util/mmap.h"
24 #include "util/target.h"
25 #include "util/session.h"
26 #include "util/tool.h"
27 #include "util/symbol.h"
28 #include "util/record.h"
29 #include "util/cpumap.h"
30 #include "util/thread_map.h"
31 #include "util/data.h"
32 #include "util/perf_regs.h"
33 #include "util/auxtrace.h"
34 #include "util/tsc.h"
35 #include "util/parse-branch-options.h"
36 #include "util/parse-regs-options.h"
37 #include "util/perf_api_probe.h"
38 #include "util/llvm-utils.h"
39 #include "util/bpf-loader.h"
40 #include "util/trigger.h"
41 #include "util/perf-hooks.h"
42 #include "util/cpu-set-sched.h"
43 #include "util/synthetic-events.h"
44 #include "util/time-utils.h"
45 #include "util/units.h"
46 #include "util/bpf-event.h"
47 #include "util/util.h"
48 #include "util/pfm.h"
49 #include "asm/bug.h"
50 #include "perf.h"
51 
52 #include <errno.h>
53 #include <inttypes.h>
54 #include <locale.h>
55 #include <poll.h>
56 #include <pthread.h>
57 #include <unistd.h>
58 #include <sched.h>
59 #include <signal.h>
60 #ifdef HAVE_EVENTFD_SUPPORT
61 #include <sys/eventfd.h>
62 #endif
63 #include <sys/mman.h>
64 #include <sys/wait.h>
65 #include <sys/types.h>
66 #include <sys/stat.h>
67 #include <fcntl.h>
68 #include <linux/err.h>
69 #include <linux/string.h>
70 #include <linux/time64.h>
71 #include <linux/zalloc.h>
72 #include <linux/bitmap.h>
73 
74 struct switch_output {
75 	bool		 enabled;
76 	bool		 signal;
77 	unsigned long	 size;
78 	unsigned long	 time;
79 	const char	*str;
80 	bool		 set;
81 	char		 **filenames;
82 	int		 num_files;
83 	int		 cur_file;
84 };
85 
86 struct record {
87 	struct perf_tool	tool;
88 	struct record_opts	opts;
89 	u64			bytes_written;
90 	struct perf_data	data;
91 	struct auxtrace_record	*itr;
92 	struct evlist	*evlist;
93 	struct perf_session	*session;
94 	struct evlist		*sb_evlist;
95 	pthread_t		thread_id;
96 	int			realtime_prio;
97 	bool			switch_output_event_set;
98 	bool			no_buildid;
99 	bool			no_buildid_set;
100 	bool			no_buildid_cache;
101 	bool			no_buildid_cache_set;
102 	bool			buildid_all;
103 	bool			timestamp_filename;
104 	bool			timestamp_boundary;
105 	struct switch_output	switch_output;
106 	unsigned long long	samples;
107 	struct mmap_cpu_mask	affinity_mask;
108 	unsigned long		output_max_size;	/* = 0: unlimited */
109 };
110 
111 static volatile int done;
112 
113 static volatile int auxtrace_record__snapshot_started;
114 static DEFINE_TRIGGER(auxtrace_snapshot_trigger);
115 static DEFINE_TRIGGER(switch_output_trigger);
116 
117 static const char *affinity_tags[PERF_AFFINITY_MAX] = {
118 	"SYS", "NODE", "CPU"
119 };
120 
121 static bool switch_output_signal(struct record *rec)
122 {
123 	return rec->switch_output.signal &&
124 	       trigger_is_ready(&switch_output_trigger);
125 }
126 
127 static bool switch_output_size(struct record *rec)
128 {
129 	return rec->switch_output.size &&
130 	       trigger_is_ready(&switch_output_trigger) &&
131 	       (rec->bytes_written >= rec->switch_output.size);
132 }
133 
134 static bool switch_output_time(struct record *rec)
135 {
136 	return rec->switch_output.time &&
137 	       trigger_is_ready(&switch_output_trigger);
138 }
139 
140 static bool record__output_max_size_exceeded(struct record *rec)
141 {
142 	return rec->output_max_size &&
143 	       (rec->bytes_written >= rec->output_max_size);
144 }
145 
146 static int record__write(struct record *rec, struct mmap *map __maybe_unused,
147 			 void *bf, size_t size)
148 {
149 	struct perf_data_file *file = &rec->session->data->file;
150 
151 	if (perf_data_file__write(file, bf, size) < 0) {
152 		pr_err("failed to write perf data, error: %m\n");
153 		return -1;
154 	}
155 
156 	rec->bytes_written += size;
157 
158 	if (record__output_max_size_exceeded(rec) && !done) {
159 		fprintf(stderr, "[ perf record: perf size limit reached (%" PRIu64 " KB),"
160 				" stopping session ]\n",
161 				rec->bytes_written >> 10);
162 		done = 1;
163 	}
164 
165 	if (switch_output_size(rec))
166 		trigger_hit(&switch_output_trigger);
167 
168 	return 0;
169 }
170 
171 static int record__aio_enabled(struct record *rec);
172 static int record__comp_enabled(struct record *rec);
173 static size_t zstd_compress(struct perf_session *session, void *dst, size_t dst_size,
174 			    void *src, size_t src_size);
175 
176 #ifdef HAVE_AIO_SUPPORT
177 static int record__aio_write(struct aiocb *cblock, int trace_fd,
178 		void *buf, size_t size, off_t off)
179 {
180 	int rc;
181 
182 	cblock->aio_fildes = trace_fd;
183 	cblock->aio_buf    = buf;
184 	cblock->aio_nbytes = size;
185 	cblock->aio_offset = off;
186 	cblock->aio_sigevent.sigev_notify = SIGEV_NONE;
187 
188 	do {
189 		rc = aio_write(cblock);
190 		if (rc == 0) {
191 			break;
192 		} else if (errno != EAGAIN) {
193 			cblock->aio_fildes = -1;
194 			pr_err("failed to queue perf data, error: %m\n");
195 			break;
196 		}
197 	} while (1);
198 
199 	return rc;
200 }
201 
202 static int record__aio_complete(struct mmap *md, struct aiocb *cblock)
203 {
204 	void *rem_buf;
205 	off_t rem_off;
206 	size_t rem_size;
207 	int rc, aio_errno;
208 	ssize_t aio_ret, written;
209 
210 	aio_errno = aio_error(cblock);
211 	if (aio_errno == EINPROGRESS)
212 		return 0;
213 
214 	written = aio_ret = aio_return(cblock);
215 	if (aio_ret < 0) {
216 		if (aio_errno != EINTR)
217 			pr_err("failed to write perf data, error: %m\n");
218 		written = 0;
219 	}
220 
221 	rem_size = cblock->aio_nbytes - written;
222 
223 	if (rem_size == 0) {
224 		cblock->aio_fildes = -1;
225 		/*
226 		 * md->refcount is incremented in record__aio_pushfn() for
227 		 * every aio write request started in record__aio_push() so
228 		 * decrement it because the request is now complete.
229 		 */
230 		perf_mmap__put(&md->core);
231 		rc = 1;
232 	} else {
233 		/*
234 		 * aio write request may require restart with the
235 		 * reminder if the kernel didn't write whole
236 		 * chunk at once.
237 		 */
238 		rem_off = cblock->aio_offset + written;
239 		rem_buf = (void *)(cblock->aio_buf + written);
240 		record__aio_write(cblock, cblock->aio_fildes,
241 				rem_buf, rem_size, rem_off);
242 		rc = 0;
243 	}
244 
245 	return rc;
246 }
247 
248 static int record__aio_sync(struct mmap *md, bool sync_all)
249 {
250 	struct aiocb **aiocb = md->aio.aiocb;
251 	struct aiocb *cblocks = md->aio.cblocks;
252 	struct timespec timeout = { 0, 1000 * 1000  * 1 }; /* 1ms */
253 	int i, do_suspend;
254 
255 	do {
256 		do_suspend = 0;
257 		for (i = 0; i < md->aio.nr_cblocks; ++i) {
258 			if (cblocks[i].aio_fildes == -1 || record__aio_complete(md, &cblocks[i])) {
259 				if (sync_all)
260 					aiocb[i] = NULL;
261 				else
262 					return i;
263 			} else {
264 				/*
265 				 * Started aio write is not complete yet
266 				 * so it has to be waited before the
267 				 * next allocation.
268 				 */
269 				aiocb[i] = &cblocks[i];
270 				do_suspend = 1;
271 			}
272 		}
273 		if (!do_suspend)
274 			return -1;
275 
276 		while (aio_suspend((const struct aiocb **)aiocb, md->aio.nr_cblocks, &timeout)) {
277 			if (!(errno == EAGAIN || errno == EINTR))
278 				pr_err("failed to sync perf data, error: %m\n");
279 		}
280 	} while (1);
281 }
282 
283 struct record_aio {
284 	struct record	*rec;
285 	void		*data;
286 	size_t		size;
287 };
288 
289 static int record__aio_pushfn(struct mmap *map, void *to, void *buf, size_t size)
290 {
291 	struct record_aio *aio = to;
292 
293 	/*
294 	 * map->core.base data pointed by buf is copied into free map->aio.data[] buffer
295 	 * to release space in the kernel buffer as fast as possible, calling
296 	 * perf_mmap__consume() from perf_mmap__push() function.
297 	 *
298 	 * That lets the kernel to proceed with storing more profiling data into
299 	 * the kernel buffer earlier than other per-cpu kernel buffers are handled.
300 	 *
301 	 * Coping can be done in two steps in case the chunk of profiling data
302 	 * crosses the upper bound of the kernel buffer. In this case we first move
303 	 * part of data from map->start till the upper bound and then the reminder
304 	 * from the beginning of the kernel buffer till the end of the data chunk.
305 	 */
306 
307 	if (record__comp_enabled(aio->rec)) {
308 		size = zstd_compress(aio->rec->session, aio->data + aio->size,
309 				     mmap__mmap_len(map) - aio->size,
310 				     buf, size);
311 	} else {
312 		memcpy(aio->data + aio->size, buf, size);
313 	}
314 
315 	if (!aio->size) {
316 		/*
317 		 * Increment map->refcount to guard map->aio.data[] buffer
318 		 * from premature deallocation because map object can be
319 		 * released earlier than aio write request started on
320 		 * map->aio.data[] buffer is complete.
321 		 *
322 		 * perf_mmap__put() is done at record__aio_complete()
323 		 * after started aio request completion or at record__aio_push()
324 		 * if the request failed to start.
325 		 */
326 		perf_mmap__get(&map->core);
327 	}
328 
329 	aio->size += size;
330 
331 	return size;
332 }
333 
334 static int record__aio_push(struct record *rec, struct mmap *map, off_t *off)
335 {
336 	int ret, idx;
337 	int trace_fd = rec->session->data->file.fd;
338 	struct record_aio aio = { .rec = rec, .size = 0 };
339 
340 	/*
341 	 * Call record__aio_sync() to wait till map->aio.data[] buffer
342 	 * becomes available after previous aio write operation.
343 	 */
344 
345 	idx = record__aio_sync(map, false);
346 	aio.data = map->aio.data[idx];
347 	ret = perf_mmap__push(map, &aio, record__aio_pushfn);
348 	if (ret != 0) /* ret > 0 - no data, ret < 0 - error */
349 		return ret;
350 
351 	rec->samples++;
352 	ret = record__aio_write(&(map->aio.cblocks[idx]), trace_fd, aio.data, aio.size, *off);
353 	if (!ret) {
354 		*off += aio.size;
355 		rec->bytes_written += aio.size;
356 		if (switch_output_size(rec))
357 			trigger_hit(&switch_output_trigger);
358 	} else {
359 		/*
360 		 * Decrement map->refcount incremented in record__aio_pushfn()
361 		 * back if record__aio_write() operation failed to start, otherwise
362 		 * map->refcount is decremented in record__aio_complete() after
363 		 * aio write operation finishes successfully.
364 		 */
365 		perf_mmap__put(&map->core);
366 	}
367 
368 	return ret;
369 }
370 
371 static off_t record__aio_get_pos(int trace_fd)
372 {
373 	return lseek(trace_fd, 0, SEEK_CUR);
374 }
375 
376 static void record__aio_set_pos(int trace_fd, off_t pos)
377 {
378 	lseek(trace_fd, pos, SEEK_SET);
379 }
380 
381 static void record__aio_mmap_read_sync(struct record *rec)
382 {
383 	int i;
384 	struct evlist *evlist = rec->evlist;
385 	struct mmap *maps = evlist->mmap;
386 
387 	if (!record__aio_enabled(rec))
388 		return;
389 
390 	for (i = 0; i < evlist->core.nr_mmaps; i++) {
391 		struct mmap *map = &maps[i];
392 
393 		if (map->core.base)
394 			record__aio_sync(map, true);
395 	}
396 }
397 
398 static int nr_cblocks_default = 1;
399 static int nr_cblocks_max = 4;
400 
401 static int record__aio_parse(const struct option *opt,
402 			     const char *str,
403 			     int unset)
404 {
405 	struct record_opts *opts = (struct record_opts *)opt->value;
406 
407 	if (unset) {
408 		opts->nr_cblocks = 0;
409 	} else {
410 		if (str)
411 			opts->nr_cblocks = strtol(str, NULL, 0);
412 		if (!opts->nr_cblocks)
413 			opts->nr_cblocks = nr_cblocks_default;
414 	}
415 
416 	return 0;
417 }
418 #else /* HAVE_AIO_SUPPORT */
419 static int nr_cblocks_max = 0;
420 
421 static int record__aio_push(struct record *rec __maybe_unused, struct mmap *map __maybe_unused,
422 			    off_t *off __maybe_unused)
423 {
424 	return -1;
425 }
426 
427 static off_t record__aio_get_pos(int trace_fd __maybe_unused)
428 {
429 	return -1;
430 }
431 
432 static void record__aio_set_pos(int trace_fd __maybe_unused, off_t pos __maybe_unused)
433 {
434 }
435 
436 static void record__aio_mmap_read_sync(struct record *rec __maybe_unused)
437 {
438 }
439 #endif
440 
441 static int record__aio_enabled(struct record *rec)
442 {
443 	return rec->opts.nr_cblocks > 0;
444 }
445 
446 #define MMAP_FLUSH_DEFAULT 1
447 static int record__mmap_flush_parse(const struct option *opt,
448 				    const char *str,
449 				    int unset)
450 {
451 	int flush_max;
452 	struct record_opts *opts = (struct record_opts *)opt->value;
453 	static struct parse_tag tags[] = {
454 			{ .tag  = 'B', .mult = 1       },
455 			{ .tag  = 'K', .mult = 1 << 10 },
456 			{ .tag  = 'M', .mult = 1 << 20 },
457 			{ .tag  = 'G', .mult = 1 << 30 },
458 			{ .tag  = 0 },
459 	};
460 
461 	if (unset)
462 		return 0;
463 
464 	if (str) {
465 		opts->mmap_flush = parse_tag_value(str, tags);
466 		if (opts->mmap_flush == (int)-1)
467 			opts->mmap_flush = strtol(str, NULL, 0);
468 	}
469 
470 	if (!opts->mmap_flush)
471 		opts->mmap_flush = MMAP_FLUSH_DEFAULT;
472 
473 	flush_max = evlist__mmap_size(opts->mmap_pages);
474 	flush_max /= 4;
475 	if (opts->mmap_flush > flush_max)
476 		opts->mmap_flush = flush_max;
477 
478 	return 0;
479 }
480 
481 #ifdef HAVE_ZSTD_SUPPORT
482 static unsigned int comp_level_default = 1;
483 
484 static int record__parse_comp_level(const struct option *opt, const char *str, int unset)
485 {
486 	struct record_opts *opts = opt->value;
487 
488 	if (unset) {
489 		opts->comp_level = 0;
490 	} else {
491 		if (str)
492 			opts->comp_level = strtol(str, NULL, 0);
493 		if (!opts->comp_level)
494 			opts->comp_level = comp_level_default;
495 	}
496 
497 	return 0;
498 }
499 #endif
500 static unsigned int comp_level_max = 22;
501 
502 static int record__comp_enabled(struct record *rec)
503 {
504 	return rec->opts.comp_level > 0;
505 }
506 
507 static int process_synthesized_event(struct perf_tool *tool,
508 				     union perf_event *event,
509 				     struct perf_sample *sample __maybe_unused,
510 				     struct machine *machine __maybe_unused)
511 {
512 	struct record *rec = container_of(tool, struct record, tool);
513 	return record__write(rec, NULL, event, event->header.size);
514 }
515 
516 static int process_locked_synthesized_event(struct perf_tool *tool,
517 				     union perf_event *event,
518 				     struct perf_sample *sample __maybe_unused,
519 				     struct machine *machine __maybe_unused)
520 {
521 	static pthread_mutex_t synth_lock = PTHREAD_MUTEX_INITIALIZER;
522 	int ret;
523 
524 	pthread_mutex_lock(&synth_lock);
525 	ret = process_synthesized_event(tool, event, sample, machine);
526 	pthread_mutex_unlock(&synth_lock);
527 	return ret;
528 }
529 
530 static int record__pushfn(struct mmap *map, void *to, void *bf, size_t size)
531 {
532 	struct record *rec = to;
533 
534 	if (record__comp_enabled(rec)) {
535 		size = zstd_compress(rec->session, map->data, mmap__mmap_len(map), bf, size);
536 		bf   = map->data;
537 	}
538 
539 	rec->samples++;
540 	return record__write(rec, map, bf, size);
541 }
542 
543 static volatile int signr = -1;
544 static volatile int child_finished;
545 #ifdef HAVE_EVENTFD_SUPPORT
546 static int done_fd = -1;
547 #endif
548 
549 static void sig_handler(int sig)
550 {
551 	if (sig == SIGCHLD)
552 		child_finished = 1;
553 	else
554 		signr = sig;
555 
556 	done = 1;
557 #ifdef HAVE_EVENTFD_SUPPORT
558 {
559 	u64 tmp = 1;
560 	/*
561 	 * It is possible for this signal handler to run after done is checked
562 	 * in the main loop, but before the perf counter fds are polled. If this
563 	 * happens, the poll() will continue to wait even though done is set,
564 	 * and will only break out if either another signal is received, or the
565 	 * counters are ready for read. To ensure the poll() doesn't sleep when
566 	 * done is set, use an eventfd (done_fd) to wake up the poll().
567 	 */
568 	if (write(done_fd, &tmp, sizeof(tmp)) < 0)
569 		pr_err("failed to signal wakeup fd, error: %m\n");
570 }
571 #endif // HAVE_EVENTFD_SUPPORT
572 }
573 
574 static void sigsegv_handler(int sig)
575 {
576 	perf_hooks__recover();
577 	sighandler_dump_stack(sig);
578 }
579 
580 static void record__sig_exit(void)
581 {
582 	if (signr == -1)
583 		return;
584 
585 	signal(signr, SIG_DFL);
586 	raise(signr);
587 }
588 
589 #ifdef HAVE_AUXTRACE_SUPPORT
590 
591 static int record__process_auxtrace(struct perf_tool *tool,
592 				    struct mmap *map,
593 				    union perf_event *event, void *data1,
594 				    size_t len1, void *data2, size_t len2)
595 {
596 	struct record *rec = container_of(tool, struct record, tool);
597 	struct perf_data *data = &rec->data;
598 	size_t padding;
599 	u8 pad[8] = {0};
600 
601 	if (!perf_data__is_pipe(data) && perf_data__is_single_file(data)) {
602 		off_t file_offset;
603 		int fd = perf_data__fd(data);
604 		int err;
605 
606 		file_offset = lseek(fd, 0, SEEK_CUR);
607 		if (file_offset == -1)
608 			return -1;
609 		err = auxtrace_index__auxtrace_event(&rec->session->auxtrace_index,
610 						     event, file_offset);
611 		if (err)
612 			return err;
613 	}
614 
615 	/* event.auxtrace.size includes padding, see __auxtrace_mmap__read() */
616 	padding = (len1 + len2) & 7;
617 	if (padding)
618 		padding = 8 - padding;
619 
620 	record__write(rec, map, event, event->header.size);
621 	record__write(rec, map, data1, len1);
622 	if (len2)
623 		record__write(rec, map, data2, len2);
624 	record__write(rec, map, &pad, padding);
625 
626 	return 0;
627 }
628 
629 static int record__auxtrace_mmap_read(struct record *rec,
630 				      struct mmap *map)
631 {
632 	int ret;
633 
634 	ret = auxtrace_mmap__read(map, rec->itr, &rec->tool,
635 				  record__process_auxtrace);
636 	if (ret < 0)
637 		return ret;
638 
639 	if (ret)
640 		rec->samples++;
641 
642 	return 0;
643 }
644 
645 static int record__auxtrace_mmap_read_snapshot(struct record *rec,
646 					       struct mmap *map)
647 {
648 	int ret;
649 
650 	ret = auxtrace_mmap__read_snapshot(map, rec->itr, &rec->tool,
651 					   record__process_auxtrace,
652 					   rec->opts.auxtrace_snapshot_size);
653 	if (ret < 0)
654 		return ret;
655 
656 	if (ret)
657 		rec->samples++;
658 
659 	return 0;
660 }
661 
662 static int record__auxtrace_read_snapshot_all(struct record *rec)
663 {
664 	int i;
665 	int rc = 0;
666 
667 	for (i = 0; i < rec->evlist->core.nr_mmaps; i++) {
668 		struct mmap *map = &rec->evlist->mmap[i];
669 
670 		if (!map->auxtrace_mmap.base)
671 			continue;
672 
673 		if (record__auxtrace_mmap_read_snapshot(rec, map) != 0) {
674 			rc = -1;
675 			goto out;
676 		}
677 	}
678 out:
679 	return rc;
680 }
681 
682 static void record__read_auxtrace_snapshot(struct record *rec, bool on_exit)
683 {
684 	pr_debug("Recording AUX area tracing snapshot\n");
685 	if (record__auxtrace_read_snapshot_all(rec) < 0) {
686 		trigger_error(&auxtrace_snapshot_trigger);
687 	} else {
688 		if (auxtrace_record__snapshot_finish(rec->itr, on_exit))
689 			trigger_error(&auxtrace_snapshot_trigger);
690 		else
691 			trigger_ready(&auxtrace_snapshot_trigger);
692 	}
693 }
694 
695 static int record__auxtrace_snapshot_exit(struct record *rec)
696 {
697 	if (trigger_is_error(&auxtrace_snapshot_trigger))
698 		return 0;
699 
700 	if (!auxtrace_record__snapshot_started &&
701 	    auxtrace_record__snapshot_start(rec->itr))
702 		return -1;
703 
704 	record__read_auxtrace_snapshot(rec, true);
705 	if (trigger_is_error(&auxtrace_snapshot_trigger))
706 		return -1;
707 
708 	return 0;
709 }
710 
711 static int record__auxtrace_init(struct record *rec)
712 {
713 	int err;
714 
715 	if (!rec->itr) {
716 		rec->itr = auxtrace_record__init(rec->evlist, &err);
717 		if (err)
718 			return err;
719 	}
720 
721 	err = auxtrace_parse_snapshot_options(rec->itr, &rec->opts,
722 					      rec->opts.auxtrace_snapshot_opts);
723 	if (err)
724 		return err;
725 
726 	err = auxtrace_parse_sample_options(rec->itr, rec->evlist, &rec->opts,
727 					    rec->opts.auxtrace_sample_opts);
728 	if (err)
729 		return err;
730 
731 	return auxtrace_parse_filters(rec->evlist);
732 }
733 
734 #else
735 
736 static inline
737 int record__auxtrace_mmap_read(struct record *rec __maybe_unused,
738 			       struct mmap *map __maybe_unused)
739 {
740 	return 0;
741 }
742 
743 static inline
744 void record__read_auxtrace_snapshot(struct record *rec __maybe_unused,
745 				    bool on_exit __maybe_unused)
746 {
747 }
748 
749 static inline
750 int auxtrace_record__snapshot_start(struct auxtrace_record *itr __maybe_unused)
751 {
752 	return 0;
753 }
754 
755 static inline
756 int record__auxtrace_snapshot_exit(struct record *rec __maybe_unused)
757 {
758 	return 0;
759 }
760 
761 static int record__auxtrace_init(struct record *rec __maybe_unused)
762 {
763 	return 0;
764 }
765 
766 #endif
767 
768 static int record__config_text_poke(struct evlist *evlist)
769 {
770 	struct evsel *evsel;
771 	int err;
772 
773 	/* Nothing to do if text poke is already configured */
774 	evlist__for_each_entry(evlist, evsel) {
775 		if (evsel->core.attr.text_poke)
776 			return 0;
777 	}
778 
779 	err = parse_events(evlist, "dummy:u", NULL);
780 	if (err)
781 		return err;
782 
783 	evsel = evlist__last(evlist);
784 
785 	evsel->core.attr.freq = 0;
786 	evsel->core.attr.sample_period = 1;
787 	evsel->core.attr.text_poke = 1;
788 	evsel->core.attr.ksymbol = 1;
789 
790 	evsel->core.system_wide = true;
791 	evsel->no_aux_samples = true;
792 	evsel->immediate = true;
793 
794 	/* Text poke must be collected on all CPUs */
795 	perf_cpu_map__put(evsel->core.own_cpus);
796 	evsel->core.own_cpus = perf_cpu_map__new(NULL);
797 	perf_cpu_map__put(evsel->core.cpus);
798 	evsel->core.cpus = perf_cpu_map__get(evsel->core.own_cpus);
799 
800 	evsel__set_sample_bit(evsel, TIME);
801 
802 	return 0;
803 }
804 
805 static bool record__kcore_readable(struct machine *machine)
806 {
807 	char kcore[PATH_MAX];
808 	int fd;
809 
810 	scnprintf(kcore, sizeof(kcore), "%s/proc/kcore", machine->root_dir);
811 
812 	fd = open(kcore, O_RDONLY);
813 	if (fd < 0)
814 		return false;
815 
816 	close(fd);
817 
818 	return true;
819 }
820 
821 static int record__kcore_copy(struct machine *machine, struct perf_data *data)
822 {
823 	char from_dir[PATH_MAX];
824 	char kcore_dir[PATH_MAX];
825 	int ret;
826 
827 	snprintf(from_dir, sizeof(from_dir), "%s/proc", machine->root_dir);
828 
829 	ret = perf_data__make_kcore_dir(data, kcore_dir, sizeof(kcore_dir));
830 	if (ret)
831 		return ret;
832 
833 	return kcore_copy(from_dir, kcore_dir);
834 }
835 
836 static int record__mmap_evlist(struct record *rec,
837 			       struct evlist *evlist)
838 {
839 	struct record_opts *opts = &rec->opts;
840 	bool auxtrace_overwrite = opts->auxtrace_snapshot_mode ||
841 				  opts->auxtrace_sample_mode;
842 	char msg[512];
843 
844 	if (opts->affinity != PERF_AFFINITY_SYS)
845 		cpu__setup_cpunode_map();
846 
847 	if (evlist__mmap_ex(evlist, opts->mmap_pages,
848 				 opts->auxtrace_mmap_pages,
849 				 auxtrace_overwrite,
850 				 opts->nr_cblocks, opts->affinity,
851 				 opts->mmap_flush, opts->comp_level) < 0) {
852 		if (errno == EPERM) {
853 			pr_err("Permission error mapping pages.\n"
854 			       "Consider increasing "
855 			       "/proc/sys/kernel/perf_event_mlock_kb,\n"
856 			       "or try again with a smaller value of -m/--mmap_pages.\n"
857 			       "(current value: %u,%u)\n",
858 			       opts->mmap_pages, opts->auxtrace_mmap_pages);
859 			return -errno;
860 		} else {
861 			pr_err("failed to mmap with %d (%s)\n", errno,
862 				str_error_r(errno, msg, sizeof(msg)));
863 			if (errno)
864 				return -errno;
865 			else
866 				return -EINVAL;
867 		}
868 	}
869 	return 0;
870 }
871 
872 static int record__mmap(struct record *rec)
873 {
874 	return record__mmap_evlist(rec, rec->evlist);
875 }
876 
877 static int record__open(struct record *rec)
878 {
879 	char msg[BUFSIZ];
880 	struct evsel *pos;
881 	struct evlist *evlist = rec->evlist;
882 	struct perf_session *session = rec->session;
883 	struct record_opts *opts = &rec->opts;
884 	int rc = 0;
885 
886 	/*
887 	 * For initial_delay or system wide, we need to add a dummy event so
888 	 * that we can track PERF_RECORD_MMAP to cover the delay of waiting or
889 	 * event synthesis.
890 	 */
891 	if (opts->initial_delay || target__has_cpu(&opts->target)) {
892 		pos = perf_evlist__get_tracking_event(evlist);
893 		if (!evsel__is_dummy_event(pos)) {
894 			/* Set up dummy event. */
895 			if (evlist__add_dummy(evlist))
896 				return -ENOMEM;
897 			pos = evlist__last(evlist);
898 			perf_evlist__set_tracking_event(evlist, pos);
899 		}
900 
901 		/*
902 		 * Enable the dummy event when the process is forked for
903 		 * initial_delay, immediately for system wide.
904 		 */
905 		if (opts->initial_delay && !pos->immediate)
906 			pos->core.attr.enable_on_exec = 1;
907 		else
908 			pos->immediate = 1;
909 	}
910 
911 	perf_evlist__config(evlist, opts, &callchain_param);
912 
913 	evlist__for_each_entry(evlist, pos) {
914 try_again:
915 		if (evsel__open(pos, pos->core.cpus, pos->core.threads) < 0) {
916 			if (evsel__fallback(pos, errno, msg, sizeof(msg))) {
917 				if (verbose > 0)
918 					ui__warning("%s\n", msg);
919 				goto try_again;
920 			}
921 			if ((errno == EINVAL || errno == EBADF) &&
922 			    pos->leader != pos &&
923 			    pos->weak_group) {
924 			        pos = perf_evlist__reset_weak_group(evlist, pos, true);
925 				goto try_again;
926 			}
927 			rc = -errno;
928 			evsel__open_strerror(pos, &opts->target, errno, msg, sizeof(msg));
929 			ui__error("%s\n", msg);
930 			goto out;
931 		}
932 
933 		pos->supported = true;
934 	}
935 
936 	if (symbol_conf.kptr_restrict && !perf_evlist__exclude_kernel(evlist)) {
937 		pr_warning(
938 "WARNING: Kernel address maps (/proc/{kallsyms,modules}) are restricted,\n"
939 "check /proc/sys/kernel/kptr_restrict and /proc/sys/kernel/perf_event_paranoid.\n\n"
940 "Samples in kernel functions may not be resolved if a suitable vmlinux\n"
941 "file is not found in the buildid cache or in the vmlinux path.\n\n"
942 "Samples in kernel modules won't be resolved at all.\n\n"
943 "If some relocation was applied (e.g. kexec) symbols may be misresolved\n"
944 "even with a suitable vmlinux or kallsyms file.\n\n");
945 	}
946 
947 	if (perf_evlist__apply_filters(evlist, &pos)) {
948 		pr_err("failed to set filter \"%s\" on event %s with %d (%s)\n",
949 			pos->filter, evsel__name(pos), errno,
950 			str_error_r(errno, msg, sizeof(msg)));
951 		rc = -1;
952 		goto out;
953 	}
954 
955 	rc = record__mmap(rec);
956 	if (rc)
957 		goto out;
958 
959 	session->evlist = evlist;
960 	perf_session__set_id_hdr_size(session);
961 out:
962 	return rc;
963 }
964 
965 static int process_sample_event(struct perf_tool *tool,
966 				union perf_event *event,
967 				struct perf_sample *sample,
968 				struct evsel *evsel,
969 				struct machine *machine)
970 {
971 	struct record *rec = container_of(tool, struct record, tool);
972 
973 	if (rec->evlist->first_sample_time == 0)
974 		rec->evlist->first_sample_time = sample->time;
975 
976 	rec->evlist->last_sample_time = sample->time;
977 
978 	if (rec->buildid_all)
979 		return 0;
980 
981 	rec->samples++;
982 	return build_id__mark_dso_hit(tool, event, sample, evsel, machine);
983 }
984 
985 static int process_buildids(struct record *rec)
986 {
987 	struct perf_session *session = rec->session;
988 
989 	if (perf_data__size(&rec->data) == 0)
990 		return 0;
991 
992 	/*
993 	 * During this process, it'll load kernel map and replace the
994 	 * dso->long_name to a real pathname it found.  In this case
995 	 * we prefer the vmlinux path like
996 	 *   /lib/modules/3.16.4/build/vmlinux
997 	 *
998 	 * rather than build-id path (in debug directory).
999 	 *   $HOME/.debug/.build-id/f0/6e17aa50adf4d00b88925e03775de107611551
1000 	 */
1001 	symbol_conf.ignore_vmlinux_buildid = true;
1002 
1003 	/*
1004 	 * If --buildid-all is given, it marks all DSO regardless of hits,
1005 	 * so no need to process samples. But if timestamp_boundary is enabled,
1006 	 * it still needs to walk on all samples to get the timestamps of
1007 	 * first/last samples.
1008 	 */
1009 	if (rec->buildid_all && !rec->timestamp_boundary)
1010 		rec->tool.sample = NULL;
1011 
1012 	return perf_session__process_events(session);
1013 }
1014 
1015 static void perf_event__synthesize_guest_os(struct machine *machine, void *data)
1016 {
1017 	int err;
1018 	struct perf_tool *tool = data;
1019 	/*
1020 	 *As for guest kernel when processing subcommand record&report,
1021 	 *we arrange module mmap prior to guest kernel mmap and trigger
1022 	 *a preload dso because default guest module symbols are loaded
1023 	 *from guest kallsyms instead of /lib/modules/XXX/XXX. This
1024 	 *method is used to avoid symbol missing when the first addr is
1025 	 *in module instead of in guest kernel.
1026 	 */
1027 	err = perf_event__synthesize_modules(tool, process_synthesized_event,
1028 					     machine);
1029 	if (err < 0)
1030 		pr_err("Couldn't record guest kernel [%d]'s reference"
1031 		       " relocation symbol.\n", machine->pid);
1032 
1033 	/*
1034 	 * We use _stext for guest kernel because guest kernel's /proc/kallsyms
1035 	 * have no _text sometimes.
1036 	 */
1037 	err = perf_event__synthesize_kernel_mmap(tool, process_synthesized_event,
1038 						 machine);
1039 	if (err < 0)
1040 		pr_err("Couldn't record guest kernel [%d]'s reference"
1041 		       " relocation symbol.\n", machine->pid);
1042 }
1043 
1044 static struct perf_event_header finished_round_event = {
1045 	.size = sizeof(struct perf_event_header),
1046 	.type = PERF_RECORD_FINISHED_ROUND,
1047 };
1048 
1049 static void record__adjust_affinity(struct record *rec, struct mmap *map)
1050 {
1051 	if (rec->opts.affinity != PERF_AFFINITY_SYS &&
1052 	    !bitmap_equal(rec->affinity_mask.bits, map->affinity_mask.bits,
1053 			  rec->affinity_mask.nbits)) {
1054 		bitmap_zero(rec->affinity_mask.bits, rec->affinity_mask.nbits);
1055 		bitmap_or(rec->affinity_mask.bits, rec->affinity_mask.bits,
1056 			  map->affinity_mask.bits, rec->affinity_mask.nbits);
1057 		sched_setaffinity(0, MMAP_CPU_MASK_BYTES(&rec->affinity_mask),
1058 				  (cpu_set_t *)rec->affinity_mask.bits);
1059 		if (verbose == 2)
1060 			mmap_cpu_mask__scnprintf(&rec->affinity_mask, "thread");
1061 	}
1062 }
1063 
1064 static size_t process_comp_header(void *record, size_t increment)
1065 {
1066 	struct perf_record_compressed *event = record;
1067 	size_t size = sizeof(*event);
1068 
1069 	if (increment) {
1070 		event->header.size += increment;
1071 		return increment;
1072 	}
1073 
1074 	event->header.type = PERF_RECORD_COMPRESSED;
1075 	event->header.size = size;
1076 
1077 	return size;
1078 }
1079 
1080 static size_t zstd_compress(struct perf_session *session, void *dst, size_t dst_size,
1081 			    void *src, size_t src_size)
1082 {
1083 	size_t compressed;
1084 	size_t max_record_size = PERF_SAMPLE_MAX_SIZE - sizeof(struct perf_record_compressed) - 1;
1085 
1086 	compressed = zstd_compress_stream_to_records(&session->zstd_data, dst, dst_size, src, src_size,
1087 						     max_record_size, process_comp_header);
1088 
1089 	session->bytes_transferred += src_size;
1090 	session->bytes_compressed  += compressed;
1091 
1092 	return compressed;
1093 }
1094 
1095 static int record__mmap_read_evlist(struct record *rec, struct evlist *evlist,
1096 				    bool overwrite, bool synch)
1097 {
1098 	u64 bytes_written = rec->bytes_written;
1099 	int i;
1100 	int rc = 0;
1101 	struct mmap *maps;
1102 	int trace_fd = rec->data.file.fd;
1103 	off_t off = 0;
1104 
1105 	if (!evlist)
1106 		return 0;
1107 
1108 	maps = overwrite ? evlist->overwrite_mmap : evlist->mmap;
1109 	if (!maps)
1110 		return 0;
1111 
1112 	if (overwrite && evlist->bkw_mmap_state != BKW_MMAP_DATA_PENDING)
1113 		return 0;
1114 
1115 	if (record__aio_enabled(rec))
1116 		off = record__aio_get_pos(trace_fd);
1117 
1118 	for (i = 0; i < evlist->core.nr_mmaps; i++) {
1119 		u64 flush = 0;
1120 		struct mmap *map = &maps[i];
1121 
1122 		if (map->core.base) {
1123 			record__adjust_affinity(rec, map);
1124 			if (synch) {
1125 				flush = map->core.flush;
1126 				map->core.flush = 1;
1127 			}
1128 			if (!record__aio_enabled(rec)) {
1129 				if (perf_mmap__push(map, rec, record__pushfn) < 0) {
1130 					if (synch)
1131 						map->core.flush = flush;
1132 					rc = -1;
1133 					goto out;
1134 				}
1135 			} else {
1136 				if (record__aio_push(rec, map, &off) < 0) {
1137 					record__aio_set_pos(trace_fd, off);
1138 					if (synch)
1139 						map->core.flush = flush;
1140 					rc = -1;
1141 					goto out;
1142 				}
1143 			}
1144 			if (synch)
1145 				map->core.flush = flush;
1146 		}
1147 
1148 		if (map->auxtrace_mmap.base && !rec->opts.auxtrace_snapshot_mode &&
1149 		    !rec->opts.auxtrace_sample_mode &&
1150 		    record__auxtrace_mmap_read(rec, map) != 0) {
1151 			rc = -1;
1152 			goto out;
1153 		}
1154 	}
1155 
1156 	if (record__aio_enabled(rec))
1157 		record__aio_set_pos(trace_fd, off);
1158 
1159 	/*
1160 	 * Mark the round finished in case we wrote
1161 	 * at least one event.
1162 	 */
1163 	if (bytes_written != rec->bytes_written)
1164 		rc = record__write(rec, NULL, &finished_round_event, sizeof(finished_round_event));
1165 
1166 	if (overwrite)
1167 		perf_evlist__toggle_bkw_mmap(evlist, BKW_MMAP_EMPTY);
1168 out:
1169 	return rc;
1170 }
1171 
1172 static int record__mmap_read_all(struct record *rec, bool synch)
1173 {
1174 	int err;
1175 
1176 	err = record__mmap_read_evlist(rec, rec->evlist, false, synch);
1177 	if (err)
1178 		return err;
1179 
1180 	return record__mmap_read_evlist(rec, rec->evlist, true, synch);
1181 }
1182 
1183 static void record__init_features(struct record *rec)
1184 {
1185 	struct perf_session *session = rec->session;
1186 	int feat;
1187 
1188 	for (feat = HEADER_FIRST_FEATURE; feat < HEADER_LAST_FEATURE; feat++)
1189 		perf_header__set_feat(&session->header, feat);
1190 
1191 	if (rec->no_buildid)
1192 		perf_header__clear_feat(&session->header, HEADER_BUILD_ID);
1193 
1194 	if (!have_tracepoints(&rec->evlist->core.entries))
1195 		perf_header__clear_feat(&session->header, HEADER_TRACING_DATA);
1196 
1197 	if (!rec->opts.branch_stack)
1198 		perf_header__clear_feat(&session->header, HEADER_BRANCH_STACK);
1199 
1200 	if (!rec->opts.full_auxtrace)
1201 		perf_header__clear_feat(&session->header, HEADER_AUXTRACE);
1202 
1203 	if (!(rec->opts.use_clockid && rec->opts.clockid_res_ns))
1204 		perf_header__clear_feat(&session->header, HEADER_CLOCKID);
1205 
1206 	perf_header__clear_feat(&session->header, HEADER_DIR_FORMAT);
1207 	if (!record__comp_enabled(rec))
1208 		perf_header__clear_feat(&session->header, HEADER_COMPRESSED);
1209 
1210 	perf_header__clear_feat(&session->header, HEADER_STAT);
1211 }
1212 
1213 static void
1214 record__finish_output(struct record *rec)
1215 {
1216 	struct perf_data *data = &rec->data;
1217 	int fd = perf_data__fd(data);
1218 
1219 	if (data->is_pipe)
1220 		return;
1221 
1222 	rec->session->header.data_size += rec->bytes_written;
1223 	data->file.size = lseek(perf_data__fd(data), 0, SEEK_CUR);
1224 
1225 	if (!rec->no_buildid) {
1226 		process_buildids(rec);
1227 
1228 		if (rec->buildid_all)
1229 			dsos__hit_all(rec->session);
1230 	}
1231 	perf_session__write_header(rec->session, rec->evlist, fd, true);
1232 
1233 	return;
1234 }
1235 
1236 static int record__synthesize_workload(struct record *rec, bool tail)
1237 {
1238 	int err;
1239 	struct perf_thread_map *thread_map;
1240 
1241 	if (rec->opts.tail_synthesize != tail)
1242 		return 0;
1243 
1244 	thread_map = thread_map__new_by_tid(rec->evlist->workload.pid);
1245 	if (thread_map == NULL)
1246 		return -1;
1247 
1248 	err = perf_event__synthesize_thread_map(&rec->tool, thread_map,
1249 						 process_synthesized_event,
1250 						 &rec->session->machines.host,
1251 						 rec->opts.sample_address);
1252 	perf_thread_map__put(thread_map);
1253 	return err;
1254 }
1255 
1256 static int record__synthesize(struct record *rec, bool tail);
1257 
1258 static int
1259 record__switch_output(struct record *rec, bool at_exit)
1260 {
1261 	struct perf_data *data = &rec->data;
1262 	int fd, err;
1263 	char *new_filename;
1264 
1265 	/* Same Size:      "2015122520103046"*/
1266 	char timestamp[] = "InvalidTimestamp";
1267 
1268 	record__aio_mmap_read_sync(rec);
1269 
1270 	record__synthesize(rec, true);
1271 	if (target__none(&rec->opts.target))
1272 		record__synthesize_workload(rec, true);
1273 
1274 	rec->samples = 0;
1275 	record__finish_output(rec);
1276 	err = fetch_current_timestamp(timestamp, sizeof(timestamp));
1277 	if (err) {
1278 		pr_err("Failed to get current timestamp\n");
1279 		return -EINVAL;
1280 	}
1281 
1282 	fd = perf_data__switch(data, timestamp,
1283 				    rec->session->header.data_offset,
1284 				    at_exit, &new_filename);
1285 	if (fd >= 0 && !at_exit) {
1286 		rec->bytes_written = 0;
1287 		rec->session->header.data_size = 0;
1288 	}
1289 
1290 	if (!quiet)
1291 		fprintf(stderr, "[ perf record: Dump %s.%s ]\n",
1292 			data->path, timestamp);
1293 
1294 	if (rec->switch_output.num_files) {
1295 		int n = rec->switch_output.cur_file + 1;
1296 
1297 		if (n >= rec->switch_output.num_files)
1298 			n = 0;
1299 		rec->switch_output.cur_file = n;
1300 		if (rec->switch_output.filenames[n]) {
1301 			remove(rec->switch_output.filenames[n]);
1302 			zfree(&rec->switch_output.filenames[n]);
1303 		}
1304 		rec->switch_output.filenames[n] = new_filename;
1305 	} else {
1306 		free(new_filename);
1307 	}
1308 
1309 	/* Output tracking events */
1310 	if (!at_exit) {
1311 		record__synthesize(rec, false);
1312 
1313 		/*
1314 		 * In 'perf record --switch-output' without -a,
1315 		 * record__synthesize() in record__switch_output() won't
1316 		 * generate tracking events because there's no thread_map
1317 		 * in evlist. Which causes newly created perf.data doesn't
1318 		 * contain map and comm information.
1319 		 * Create a fake thread_map and directly call
1320 		 * perf_event__synthesize_thread_map() for those events.
1321 		 */
1322 		if (target__none(&rec->opts.target))
1323 			record__synthesize_workload(rec, false);
1324 	}
1325 	return fd;
1326 }
1327 
1328 static volatile int workload_exec_errno;
1329 
1330 /*
1331  * perf_evlist__prepare_workload will send a SIGUSR1
1332  * if the fork fails, since we asked by setting its
1333  * want_signal to true.
1334  */
1335 static void workload_exec_failed_signal(int signo __maybe_unused,
1336 					siginfo_t *info,
1337 					void *ucontext __maybe_unused)
1338 {
1339 	workload_exec_errno = info->si_value.sival_int;
1340 	done = 1;
1341 	child_finished = 1;
1342 }
1343 
1344 static void snapshot_sig_handler(int sig);
1345 static void alarm_sig_handler(int sig);
1346 
1347 static const struct perf_event_mmap_page *
1348 perf_evlist__pick_pc(struct evlist *evlist)
1349 {
1350 	if (evlist) {
1351 		if (evlist->mmap && evlist->mmap[0].core.base)
1352 			return evlist->mmap[0].core.base;
1353 		if (evlist->overwrite_mmap && evlist->overwrite_mmap[0].core.base)
1354 			return evlist->overwrite_mmap[0].core.base;
1355 	}
1356 	return NULL;
1357 }
1358 
1359 static const struct perf_event_mmap_page *record__pick_pc(struct record *rec)
1360 {
1361 	const struct perf_event_mmap_page *pc;
1362 
1363 	pc = perf_evlist__pick_pc(rec->evlist);
1364 	if (pc)
1365 		return pc;
1366 	return NULL;
1367 }
1368 
1369 static int record__synthesize(struct record *rec, bool tail)
1370 {
1371 	struct perf_session *session = rec->session;
1372 	struct machine *machine = &session->machines.host;
1373 	struct perf_data *data = &rec->data;
1374 	struct record_opts *opts = &rec->opts;
1375 	struct perf_tool *tool = &rec->tool;
1376 	int fd = perf_data__fd(data);
1377 	int err = 0;
1378 	event_op f = process_synthesized_event;
1379 
1380 	if (rec->opts.tail_synthesize != tail)
1381 		return 0;
1382 
1383 	if (data->is_pipe) {
1384 		/*
1385 		 * We need to synthesize events first, because some
1386 		 * features works on top of them (on report side).
1387 		 */
1388 		err = perf_event__synthesize_attrs(tool, rec->evlist,
1389 						   process_synthesized_event);
1390 		if (err < 0) {
1391 			pr_err("Couldn't synthesize attrs.\n");
1392 			goto out;
1393 		}
1394 
1395 		err = perf_event__synthesize_features(tool, session, rec->evlist,
1396 						      process_synthesized_event);
1397 		if (err < 0) {
1398 			pr_err("Couldn't synthesize features.\n");
1399 			return err;
1400 		}
1401 
1402 		if (have_tracepoints(&rec->evlist->core.entries)) {
1403 			/*
1404 			 * FIXME err <= 0 here actually means that
1405 			 * there were no tracepoints so its not really
1406 			 * an error, just that we don't need to
1407 			 * synthesize anything.  We really have to
1408 			 * return this more properly and also
1409 			 * propagate errors that now are calling die()
1410 			 */
1411 			err = perf_event__synthesize_tracing_data(tool,	fd, rec->evlist,
1412 								  process_synthesized_event);
1413 			if (err <= 0) {
1414 				pr_err("Couldn't record tracing data.\n");
1415 				goto out;
1416 			}
1417 			rec->bytes_written += err;
1418 		}
1419 	}
1420 
1421 	err = perf_event__synth_time_conv(record__pick_pc(rec), tool,
1422 					  process_synthesized_event, machine);
1423 	if (err)
1424 		goto out;
1425 
1426 	/* Synthesize id_index before auxtrace_info */
1427 	if (rec->opts.auxtrace_sample_mode) {
1428 		err = perf_event__synthesize_id_index(tool,
1429 						      process_synthesized_event,
1430 						      session->evlist, machine);
1431 		if (err)
1432 			goto out;
1433 	}
1434 
1435 	if (rec->opts.full_auxtrace) {
1436 		err = perf_event__synthesize_auxtrace_info(rec->itr, tool,
1437 					session, process_synthesized_event);
1438 		if (err)
1439 			goto out;
1440 	}
1441 
1442 	if (!perf_evlist__exclude_kernel(rec->evlist)) {
1443 		err = perf_event__synthesize_kernel_mmap(tool, process_synthesized_event,
1444 							 machine);
1445 		WARN_ONCE(err < 0, "Couldn't record kernel reference relocation symbol\n"
1446 				   "Symbol resolution may be skewed if relocation was used (e.g. kexec).\n"
1447 				   "Check /proc/kallsyms permission or run as root.\n");
1448 
1449 		err = perf_event__synthesize_modules(tool, process_synthesized_event,
1450 						     machine);
1451 		WARN_ONCE(err < 0, "Couldn't record kernel module information.\n"
1452 				   "Symbol resolution may be skewed if relocation was used (e.g. kexec).\n"
1453 				   "Check /proc/modules permission or run as root.\n");
1454 	}
1455 
1456 	if (perf_guest) {
1457 		machines__process_guests(&session->machines,
1458 					 perf_event__synthesize_guest_os, tool);
1459 	}
1460 
1461 	err = perf_event__synthesize_extra_attr(&rec->tool,
1462 						rec->evlist,
1463 						process_synthesized_event,
1464 						data->is_pipe);
1465 	if (err)
1466 		goto out;
1467 
1468 	err = perf_event__synthesize_thread_map2(&rec->tool, rec->evlist->core.threads,
1469 						 process_synthesized_event,
1470 						NULL);
1471 	if (err < 0) {
1472 		pr_err("Couldn't synthesize thread map.\n");
1473 		return err;
1474 	}
1475 
1476 	err = perf_event__synthesize_cpu_map(&rec->tool, rec->evlist->core.cpus,
1477 					     process_synthesized_event, NULL);
1478 	if (err < 0) {
1479 		pr_err("Couldn't synthesize cpu map.\n");
1480 		return err;
1481 	}
1482 
1483 	err = perf_event__synthesize_bpf_events(session, process_synthesized_event,
1484 						machine, opts);
1485 	if (err < 0)
1486 		pr_warning("Couldn't synthesize bpf events.\n");
1487 
1488 	err = perf_event__synthesize_cgroups(tool, process_synthesized_event,
1489 					     machine);
1490 	if (err < 0)
1491 		pr_warning("Couldn't synthesize cgroup events.\n");
1492 
1493 	if (rec->opts.nr_threads_synthesize > 1) {
1494 		perf_set_multithreaded();
1495 		f = process_locked_synthesized_event;
1496 	}
1497 
1498 	err = __machine__synthesize_threads(machine, tool, &opts->target, rec->evlist->core.threads,
1499 					    f, opts->sample_address,
1500 					    rec->opts.nr_threads_synthesize);
1501 
1502 	if (rec->opts.nr_threads_synthesize > 1)
1503 		perf_set_singlethreaded();
1504 
1505 out:
1506 	return err;
1507 }
1508 
1509 static int record__process_signal_event(union perf_event *event __maybe_unused, void *data)
1510 {
1511 	struct record *rec = data;
1512 	pthread_kill(rec->thread_id, SIGUSR2);
1513 	return 0;
1514 }
1515 
1516 static int record__setup_sb_evlist(struct record *rec)
1517 {
1518 	struct record_opts *opts = &rec->opts;
1519 
1520 	if (rec->sb_evlist != NULL) {
1521 		/*
1522 		 * We get here if --switch-output-event populated the
1523 		 * sb_evlist, so associate a callback that will send a SIGUSR2
1524 		 * to the main thread.
1525 		 */
1526 		evlist__set_cb(rec->sb_evlist, record__process_signal_event, rec);
1527 		rec->thread_id = pthread_self();
1528 	}
1529 
1530 	if (!opts->no_bpf_event) {
1531 		if (rec->sb_evlist == NULL) {
1532 			rec->sb_evlist = evlist__new();
1533 
1534 			if (rec->sb_evlist == NULL) {
1535 				pr_err("Couldn't create side band evlist.\n.");
1536 				return -1;
1537 			}
1538 		}
1539 
1540 		if (evlist__add_bpf_sb_event(rec->sb_evlist, &rec->session->header.env)) {
1541 			pr_err("Couldn't ask for PERF_RECORD_BPF_EVENT side band events.\n.");
1542 			return -1;
1543 		}
1544 	}
1545 
1546 	if (perf_evlist__start_sb_thread(rec->sb_evlist, &rec->opts.target)) {
1547 		pr_debug("Couldn't start the BPF side band thread:\nBPF programs starting from now on won't be annotatable\n");
1548 		opts->no_bpf_event = true;
1549 	}
1550 
1551 	return 0;
1552 }
1553 
1554 static int __cmd_record(struct record *rec, int argc, const char **argv)
1555 {
1556 	int err;
1557 	int status = 0;
1558 	unsigned long waking = 0;
1559 	const bool forks = argc > 0;
1560 	struct perf_tool *tool = &rec->tool;
1561 	struct record_opts *opts = &rec->opts;
1562 	struct perf_data *data = &rec->data;
1563 	struct perf_session *session;
1564 	bool disabled = false, draining = false;
1565 	int fd;
1566 	float ratio = 0;
1567 
1568 	atexit(record__sig_exit);
1569 	signal(SIGCHLD, sig_handler);
1570 	signal(SIGINT, sig_handler);
1571 	signal(SIGTERM, sig_handler);
1572 	signal(SIGSEGV, sigsegv_handler);
1573 
1574 	if (rec->opts.record_namespaces)
1575 		tool->namespace_events = true;
1576 
1577 	if (rec->opts.record_cgroup) {
1578 #ifdef HAVE_FILE_HANDLE
1579 		tool->cgroup_events = true;
1580 #else
1581 		pr_err("cgroup tracking is not supported\n");
1582 		return -1;
1583 #endif
1584 	}
1585 
1586 	if (rec->opts.auxtrace_snapshot_mode || rec->switch_output.enabled) {
1587 		signal(SIGUSR2, snapshot_sig_handler);
1588 		if (rec->opts.auxtrace_snapshot_mode)
1589 			trigger_on(&auxtrace_snapshot_trigger);
1590 		if (rec->switch_output.enabled)
1591 			trigger_on(&switch_output_trigger);
1592 	} else {
1593 		signal(SIGUSR2, SIG_IGN);
1594 	}
1595 
1596 	session = perf_session__new(data, false, tool);
1597 	if (IS_ERR(session)) {
1598 		pr_err("Perf session creation failed.\n");
1599 		return PTR_ERR(session);
1600 	}
1601 
1602 	fd = perf_data__fd(data);
1603 	rec->session = session;
1604 
1605 	if (zstd_init(&session->zstd_data, rec->opts.comp_level) < 0) {
1606 		pr_err("Compression initialization failed.\n");
1607 		return -1;
1608 	}
1609 #ifdef HAVE_EVENTFD_SUPPORT
1610 	done_fd = eventfd(0, EFD_NONBLOCK);
1611 	if (done_fd < 0) {
1612 		pr_err("Failed to create wakeup eventfd, error: %m\n");
1613 		status = -1;
1614 		goto out_delete_session;
1615 	}
1616 	err = evlist__add_pollfd(rec->evlist, done_fd);
1617 	if (err < 0) {
1618 		pr_err("Failed to add wakeup eventfd to poll list\n");
1619 		status = err;
1620 		goto out_delete_session;
1621 	}
1622 #endif // HAVE_EVENTFD_SUPPORT
1623 
1624 	session->header.env.comp_type  = PERF_COMP_ZSTD;
1625 	session->header.env.comp_level = rec->opts.comp_level;
1626 
1627 	if (rec->opts.kcore &&
1628 	    !record__kcore_readable(&session->machines.host)) {
1629 		pr_err("ERROR: kcore is not readable.\n");
1630 		return -1;
1631 	}
1632 
1633 	record__init_features(rec);
1634 
1635 	if (rec->opts.use_clockid && rec->opts.clockid_res_ns)
1636 		session->header.env.clockid_res_ns = rec->opts.clockid_res_ns;
1637 
1638 	if (forks) {
1639 		err = perf_evlist__prepare_workload(rec->evlist, &opts->target,
1640 						    argv, data->is_pipe,
1641 						    workload_exec_failed_signal);
1642 		if (err < 0) {
1643 			pr_err("Couldn't run the workload!\n");
1644 			status = err;
1645 			goto out_delete_session;
1646 		}
1647 	}
1648 
1649 	/*
1650 	 * If we have just single event and are sending data
1651 	 * through pipe, we need to force the ids allocation,
1652 	 * because we synthesize event name through the pipe
1653 	 * and need the id for that.
1654 	 */
1655 	if (data->is_pipe && rec->evlist->core.nr_entries == 1)
1656 		rec->opts.sample_id = true;
1657 
1658 	if (record__open(rec) != 0) {
1659 		err = -1;
1660 		goto out_child;
1661 	}
1662 	session->header.env.comp_mmap_len = session->evlist->core.mmap_len;
1663 
1664 	if (rec->opts.kcore) {
1665 		err = record__kcore_copy(&session->machines.host, data);
1666 		if (err) {
1667 			pr_err("ERROR: Failed to copy kcore\n");
1668 			goto out_child;
1669 		}
1670 	}
1671 
1672 	err = bpf__apply_obj_config();
1673 	if (err) {
1674 		char errbuf[BUFSIZ];
1675 
1676 		bpf__strerror_apply_obj_config(err, errbuf, sizeof(errbuf));
1677 		pr_err("ERROR: Apply config to BPF failed: %s\n",
1678 			 errbuf);
1679 		goto out_child;
1680 	}
1681 
1682 	/*
1683 	 * Normally perf_session__new would do this, but it doesn't have the
1684 	 * evlist.
1685 	 */
1686 	if (rec->tool.ordered_events && !evlist__sample_id_all(rec->evlist)) {
1687 		pr_warning("WARNING: No sample_id_all support, falling back to unordered processing\n");
1688 		rec->tool.ordered_events = false;
1689 	}
1690 
1691 	if (!rec->evlist->nr_groups)
1692 		perf_header__clear_feat(&session->header, HEADER_GROUP_DESC);
1693 
1694 	if (data->is_pipe) {
1695 		err = perf_header__write_pipe(fd);
1696 		if (err < 0)
1697 			goto out_child;
1698 	} else {
1699 		err = perf_session__write_header(session, rec->evlist, fd, false);
1700 		if (err < 0)
1701 			goto out_child;
1702 	}
1703 
1704 	err = -1;
1705 	if (!rec->no_buildid
1706 	    && !perf_header__has_feat(&session->header, HEADER_BUILD_ID)) {
1707 		pr_err("Couldn't generate buildids. "
1708 		       "Use --no-buildid to profile anyway.\n");
1709 		goto out_child;
1710 	}
1711 
1712 	err = record__setup_sb_evlist(rec);
1713 	if (err)
1714 		goto out_child;
1715 
1716 	err = record__synthesize(rec, false);
1717 	if (err < 0)
1718 		goto out_child;
1719 
1720 	if (rec->realtime_prio) {
1721 		struct sched_param param;
1722 
1723 		param.sched_priority = rec->realtime_prio;
1724 		if (sched_setscheduler(0, SCHED_FIFO, &param)) {
1725 			pr_err("Could not set realtime priority.\n");
1726 			err = -1;
1727 			goto out_child;
1728 		}
1729 	}
1730 
1731 	/*
1732 	 * When perf is starting the traced process, all the events
1733 	 * (apart from group members) have enable_on_exec=1 set,
1734 	 * so don't spoil it by prematurely enabling them.
1735 	 */
1736 	if (!target__none(&opts->target) && !opts->initial_delay)
1737 		evlist__enable(rec->evlist);
1738 
1739 	/*
1740 	 * Let the child rip
1741 	 */
1742 	if (forks) {
1743 		struct machine *machine = &session->machines.host;
1744 		union perf_event *event;
1745 		pid_t tgid;
1746 
1747 		event = malloc(sizeof(event->comm) + machine->id_hdr_size);
1748 		if (event == NULL) {
1749 			err = -ENOMEM;
1750 			goto out_child;
1751 		}
1752 
1753 		/*
1754 		 * Some H/W events are generated before COMM event
1755 		 * which is emitted during exec(), so perf script
1756 		 * cannot see a correct process name for those events.
1757 		 * Synthesize COMM event to prevent it.
1758 		 */
1759 		tgid = perf_event__synthesize_comm(tool, event,
1760 						   rec->evlist->workload.pid,
1761 						   process_synthesized_event,
1762 						   machine);
1763 		free(event);
1764 
1765 		if (tgid == -1)
1766 			goto out_child;
1767 
1768 		event = malloc(sizeof(event->namespaces) +
1769 			       (NR_NAMESPACES * sizeof(struct perf_ns_link_info)) +
1770 			       machine->id_hdr_size);
1771 		if (event == NULL) {
1772 			err = -ENOMEM;
1773 			goto out_child;
1774 		}
1775 
1776 		/*
1777 		 * Synthesize NAMESPACES event for the command specified.
1778 		 */
1779 		perf_event__synthesize_namespaces(tool, event,
1780 						  rec->evlist->workload.pid,
1781 						  tgid, process_synthesized_event,
1782 						  machine);
1783 		free(event);
1784 
1785 		perf_evlist__start_workload(rec->evlist);
1786 	}
1787 
1788 	if (opts->initial_delay) {
1789 		usleep(opts->initial_delay * USEC_PER_MSEC);
1790 		evlist__enable(rec->evlist);
1791 	}
1792 
1793 	trigger_ready(&auxtrace_snapshot_trigger);
1794 	trigger_ready(&switch_output_trigger);
1795 	perf_hooks__invoke_record_start();
1796 	for (;;) {
1797 		unsigned long long hits = rec->samples;
1798 
1799 		/*
1800 		 * rec->evlist->bkw_mmap_state is possible to be
1801 		 * BKW_MMAP_EMPTY here: when done == true and
1802 		 * hits != rec->samples in previous round.
1803 		 *
1804 		 * perf_evlist__toggle_bkw_mmap ensure we never
1805 		 * convert BKW_MMAP_EMPTY to BKW_MMAP_DATA_PENDING.
1806 		 */
1807 		if (trigger_is_hit(&switch_output_trigger) || done || draining)
1808 			perf_evlist__toggle_bkw_mmap(rec->evlist, BKW_MMAP_DATA_PENDING);
1809 
1810 		if (record__mmap_read_all(rec, false) < 0) {
1811 			trigger_error(&auxtrace_snapshot_trigger);
1812 			trigger_error(&switch_output_trigger);
1813 			err = -1;
1814 			goto out_child;
1815 		}
1816 
1817 		if (auxtrace_record__snapshot_started) {
1818 			auxtrace_record__snapshot_started = 0;
1819 			if (!trigger_is_error(&auxtrace_snapshot_trigger))
1820 				record__read_auxtrace_snapshot(rec, false);
1821 			if (trigger_is_error(&auxtrace_snapshot_trigger)) {
1822 				pr_err("AUX area tracing snapshot failed\n");
1823 				err = -1;
1824 				goto out_child;
1825 			}
1826 		}
1827 
1828 		if (trigger_is_hit(&switch_output_trigger)) {
1829 			/*
1830 			 * If switch_output_trigger is hit, the data in
1831 			 * overwritable ring buffer should have been collected,
1832 			 * so bkw_mmap_state should be set to BKW_MMAP_EMPTY.
1833 			 *
1834 			 * If SIGUSR2 raise after or during record__mmap_read_all(),
1835 			 * record__mmap_read_all() didn't collect data from
1836 			 * overwritable ring buffer. Read again.
1837 			 */
1838 			if (rec->evlist->bkw_mmap_state == BKW_MMAP_RUNNING)
1839 				continue;
1840 			trigger_ready(&switch_output_trigger);
1841 
1842 			/*
1843 			 * Reenable events in overwrite ring buffer after
1844 			 * record__mmap_read_all(): we should have collected
1845 			 * data from it.
1846 			 */
1847 			perf_evlist__toggle_bkw_mmap(rec->evlist, BKW_MMAP_RUNNING);
1848 
1849 			if (!quiet)
1850 				fprintf(stderr, "[ perf record: dump data: Woken up %ld times ]\n",
1851 					waking);
1852 			waking = 0;
1853 			fd = record__switch_output(rec, false);
1854 			if (fd < 0) {
1855 				pr_err("Failed to switch to new file\n");
1856 				trigger_error(&switch_output_trigger);
1857 				err = fd;
1858 				goto out_child;
1859 			}
1860 
1861 			/* re-arm the alarm */
1862 			if (rec->switch_output.time)
1863 				alarm(rec->switch_output.time);
1864 		}
1865 
1866 		if (hits == rec->samples) {
1867 			if (done || draining)
1868 				break;
1869 			err = evlist__poll(rec->evlist, -1);
1870 			/*
1871 			 * Propagate error, only if there's any. Ignore positive
1872 			 * number of returned events and interrupt error.
1873 			 */
1874 			if (err > 0 || (err < 0 && errno == EINTR))
1875 				err = 0;
1876 			waking++;
1877 
1878 			if (evlist__filter_pollfd(rec->evlist, POLLERR | POLLHUP) == 0)
1879 				draining = true;
1880 		}
1881 
1882 		/*
1883 		 * When perf is starting the traced process, at the end events
1884 		 * die with the process and we wait for that. Thus no need to
1885 		 * disable events in this case.
1886 		 */
1887 		if (done && !disabled && !target__none(&opts->target)) {
1888 			trigger_off(&auxtrace_snapshot_trigger);
1889 			evlist__disable(rec->evlist);
1890 			disabled = true;
1891 		}
1892 	}
1893 
1894 	trigger_off(&auxtrace_snapshot_trigger);
1895 	trigger_off(&switch_output_trigger);
1896 
1897 	if (opts->auxtrace_snapshot_on_exit)
1898 		record__auxtrace_snapshot_exit(rec);
1899 
1900 	if (forks && workload_exec_errno) {
1901 		char msg[STRERR_BUFSIZE];
1902 		const char *emsg = str_error_r(workload_exec_errno, msg, sizeof(msg));
1903 		pr_err("Workload failed: %s\n", emsg);
1904 		err = -1;
1905 		goto out_child;
1906 	}
1907 
1908 	if (!quiet)
1909 		fprintf(stderr, "[ perf record: Woken up %ld times to write data ]\n", waking);
1910 
1911 	if (target__none(&rec->opts.target))
1912 		record__synthesize_workload(rec, true);
1913 
1914 out_child:
1915 	record__mmap_read_all(rec, true);
1916 	record__aio_mmap_read_sync(rec);
1917 
1918 	if (rec->session->bytes_transferred && rec->session->bytes_compressed) {
1919 		ratio = (float)rec->session->bytes_transferred/(float)rec->session->bytes_compressed;
1920 		session->header.env.comp_ratio = ratio + 0.5;
1921 	}
1922 
1923 	if (forks) {
1924 		int exit_status;
1925 
1926 		if (!child_finished)
1927 			kill(rec->evlist->workload.pid, SIGTERM);
1928 
1929 		wait(&exit_status);
1930 
1931 		if (err < 0)
1932 			status = err;
1933 		else if (WIFEXITED(exit_status))
1934 			status = WEXITSTATUS(exit_status);
1935 		else if (WIFSIGNALED(exit_status))
1936 			signr = WTERMSIG(exit_status);
1937 	} else
1938 		status = err;
1939 
1940 	record__synthesize(rec, true);
1941 	/* this will be recalculated during process_buildids() */
1942 	rec->samples = 0;
1943 
1944 	if (!err) {
1945 		if (!rec->timestamp_filename) {
1946 			record__finish_output(rec);
1947 		} else {
1948 			fd = record__switch_output(rec, true);
1949 			if (fd < 0) {
1950 				status = fd;
1951 				goto out_delete_session;
1952 			}
1953 		}
1954 	}
1955 
1956 	perf_hooks__invoke_record_end();
1957 
1958 	if (!err && !quiet) {
1959 		char samples[128];
1960 		const char *postfix = rec->timestamp_filename ?
1961 					".<timestamp>" : "";
1962 
1963 		if (rec->samples && !rec->opts.full_auxtrace)
1964 			scnprintf(samples, sizeof(samples),
1965 				  " (%" PRIu64 " samples)", rec->samples);
1966 		else
1967 			samples[0] = '\0';
1968 
1969 		fprintf(stderr,	"[ perf record: Captured and wrote %.3f MB %s%s%s",
1970 			perf_data__size(data) / 1024.0 / 1024.0,
1971 			data->path, postfix, samples);
1972 		if (ratio) {
1973 			fprintf(stderr,	", compressed (original %.3f MB, ratio is %.3f)",
1974 					rec->session->bytes_transferred / 1024.0 / 1024.0,
1975 					ratio);
1976 		}
1977 		fprintf(stderr, " ]\n");
1978 	}
1979 
1980 out_delete_session:
1981 #ifdef HAVE_EVENTFD_SUPPORT
1982 	if (done_fd >= 0)
1983 		close(done_fd);
1984 #endif
1985 	zstd_fini(&session->zstd_data);
1986 	perf_session__delete(session);
1987 
1988 	if (!opts->no_bpf_event)
1989 		perf_evlist__stop_sb_thread(rec->sb_evlist);
1990 	return status;
1991 }
1992 
1993 static void callchain_debug(struct callchain_param *callchain)
1994 {
1995 	static const char *str[CALLCHAIN_MAX] = { "NONE", "FP", "DWARF", "LBR" };
1996 
1997 	pr_debug("callchain: type %s\n", str[callchain->record_mode]);
1998 
1999 	if (callchain->record_mode == CALLCHAIN_DWARF)
2000 		pr_debug("callchain: stack dump size %d\n",
2001 			 callchain->dump_size);
2002 }
2003 
2004 int record_opts__parse_callchain(struct record_opts *record,
2005 				 struct callchain_param *callchain,
2006 				 const char *arg, bool unset)
2007 {
2008 	int ret;
2009 	callchain->enabled = !unset;
2010 
2011 	/* --no-call-graph */
2012 	if (unset) {
2013 		callchain->record_mode = CALLCHAIN_NONE;
2014 		pr_debug("callchain: disabled\n");
2015 		return 0;
2016 	}
2017 
2018 	ret = parse_callchain_record_opt(arg, callchain);
2019 	if (!ret) {
2020 		/* Enable data address sampling for DWARF unwind. */
2021 		if (callchain->record_mode == CALLCHAIN_DWARF)
2022 			record->sample_address = true;
2023 		callchain_debug(callchain);
2024 	}
2025 
2026 	return ret;
2027 }
2028 
2029 int record_parse_callchain_opt(const struct option *opt,
2030 			       const char *arg,
2031 			       int unset)
2032 {
2033 	return record_opts__parse_callchain(opt->value, &callchain_param, arg, unset);
2034 }
2035 
2036 int record_callchain_opt(const struct option *opt,
2037 			 const char *arg __maybe_unused,
2038 			 int unset __maybe_unused)
2039 {
2040 	struct callchain_param *callchain = opt->value;
2041 
2042 	callchain->enabled = true;
2043 
2044 	if (callchain->record_mode == CALLCHAIN_NONE)
2045 		callchain->record_mode = CALLCHAIN_FP;
2046 
2047 	callchain_debug(callchain);
2048 	return 0;
2049 }
2050 
2051 static int perf_record_config(const char *var, const char *value, void *cb)
2052 {
2053 	struct record *rec = cb;
2054 
2055 	if (!strcmp(var, "record.build-id")) {
2056 		if (!strcmp(value, "cache"))
2057 			rec->no_buildid_cache = false;
2058 		else if (!strcmp(value, "no-cache"))
2059 			rec->no_buildid_cache = true;
2060 		else if (!strcmp(value, "skip"))
2061 			rec->no_buildid = true;
2062 		else
2063 			return -1;
2064 		return 0;
2065 	}
2066 	if (!strcmp(var, "record.call-graph")) {
2067 		var = "call-graph.record-mode";
2068 		return perf_default_config(var, value, cb);
2069 	}
2070 #ifdef HAVE_AIO_SUPPORT
2071 	if (!strcmp(var, "record.aio")) {
2072 		rec->opts.nr_cblocks = strtol(value, NULL, 0);
2073 		if (!rec->opts.nr_cblocks)
2074 			rec->opts.nr_cblocks = nr_cblocks_default;
2075 	}
2076 #endif
2077 
2078 	return 0;
2079 }
2080 
2081 struct clockid_map {
2082 	const char *name;
2083 	int clockid;
2084 };
2085 
2086 #define CLOCKID_MAP(n, c)	\
2087 	{ .name = n, .clockid = (c), }
2088 
2089 #define CLOCKID_END	{ .name = NULL, }
2090 
2091 
2092 /*
2093  * Add the missing ones, we need to build on many distros...
2094  */
2095 #ifndef CLOCK_MONOTONIC_RAW
2096 #define CLOCK_MONOTONIC_RAW 4
2097 #endif
2098 #ifndef CLOCK_BOOTTIME
2099 #define CLOCK_BOOTTIME 7
2100 #endif
2101 #ifndef CLOCK_TAI
2102 #define CLOCK_TAI 11
2103 #endif
2104 
2105 static const struct clockid_map clockids[] = {
2106 	/* available for all events, NMI safe */
2107 	CLOCKID_MAP("monotonic", CLOCK_MONOTONIC),
2108 	CLOCKID_MAP("monotonic_raw", CLOCK_MONOTONIC_RAW),
2109 
2110 	/* available for some events */
2111 	CLOCKID_MAP("realtime", CLOCK_REALTIME),
2112 	CLOCKID_MAP("boottime", CLOCK_BOOTTIME),
2113 	CLOCKID_MAP("tai", CLOCK_TAI),
2114 
2115 	/* available for the lazy */
2116 	CLOCKID_MAP("mono", CLOCK_MONOTONIC),
2117 	CLOCKID_MAP("raw", CLOCK_MONOTONIC_RAW),
2118 	CLOCKID_MAP("real", CLOCK_REALTIME),
2119 	CLOCKID_MAP("boot", CLOCK_BOOTTIME),
2120 
2121 	CLOCKID_END,
2122 };
2123 
2124 static int get_clockid_res(clockid_t clk_id, u64 *res_ns)
2125 {
2126 	struct timespec res;
2127 
2128 	*res_ns = 0;
2129 	if (!clock_getres(clk_id, &res))
2130 		*res_ns = res.tv_nsec + res.tv_sec * NSEC_PER_SEC;
2131 	else
2132 		pr_warning("WARNING: Failed to determine specified clock resolution.\n");
2133 
2134 	return 0;
2135 }
2136 
2137 static int parse_clockid(const struct option *opt, const char *str, int unset)
2138 {
2139 	struct record_opts *opts = (struct record_opts *)opt->value;
2140 	const struct clockid_map *cm;
2141 	const char *ostr = str;
2142 
2143 	if (unset) {
2144 		opts->use_clockid = 0;
2145 		return 0;
2146 	}
2147 
2148 	/* no arg passed */
2149 	if (!str)
2150 		return 0;
2151 
2152 	/* no setting it twice */
2153 	if (opts->use_clockid)
2154 		return -1;
2155 
2156 	opts->use_clockid = true;
2157 
2158 	/* if its a number, we're done */
2159 	if (sscanf(str, "%d", &opts->clockid) == 1)
2160 		return get_clockid_res(opts->clockid, &opts->clockid_res_ns);
2161 
2162 	/* allow a "CLOCK_" prefix to the name */
2163 	if (!strncasecmp(str, "CLOCK_", 6))
2164 		str += 6;
2165 
2166 	for (cm = clockids; cm->name; cm++) {
2167 		if (!strcasecmp(str, cm->name)) {
2168 			opts->clockid = cm->clockid;
2169 			return get_clockid_res(opts->clockid,
2170 					       &opts->clockid_res_ns);
2171 		}
2172 	}
2173 
2174 	opts->use_clockid = false;
2175 	ui__warning("unknown clockid %s, check man page\n", ostr);
2176 	return -1;
2177 }
2178 
2179 static int record__parse_affinity(const struct option *opt, const char *str, int unset)
2180 {
2181 	struct record_opts *opts = (struct record_opts *)opt->value;
2182 
2183 	if (unset || !str)
2184 		return 0;
2185 
2186 	if (!strcasecmp(str, "node"))
2187 		opts->affinity = PERF_AFFINITY_NODE;
2188 	else if (!strcasecmp(str, "cpu"))
2189 		opts->affinity = PERF_AFFINITY_CPU;
2190 
2191 	return 0;
2192 }
2193 
2194 static int parse_output_max_size(const struct option *opt,
2195 				 const char *str, int unset)
2196 {
2197 	unsigned long *s = (unsigned long *)opt->value;
2198 	static struct parse_tag tags_size[] = {
2199 		{ .tag  = 'B', .mult = 1       },
2200 		{ .tag  = 'K', .mult = 1 << 10 },
2201 		{ .tag  = 'M', .mult = 1 << 20 },
2202 		{ .tag  = 'G', .mult = 1 << 30 },
2203 		{ .tag  = 0 },
2204 	};
2205 	unsigned long val;
2206 
2207 	if (unset) {
2208 		*s = 0;
2209 		return 0;
2210 	}
2211 
2212 	val = parse_tag_value(str, tags_size);
2213 	if (val != (unsigned long) -1) {
2214 		*s = val;
2215 		return 0;
2216 	}
2217 
2218 	return -1;
2219 }
2220 
2221 static int record__parse_mmap_pages(const struct option *opt,
2222 				    const char *str,
2223 				    int unset __maybe_unused)
2224 {
2225 	struct record_opts *opts = opt->value;
2226 	char *s, *p;
2227 	unsigned int mmap_pages;
2228 	int ret;
2229 
2230 	if (!str)
2231 		return -EINVAL;
2232 
2233 	s = strdup(str);
2234 	if (!s)
2235 		return -ENOMEM;
2236 
2237 	p = strchr(s, ',');
2238 	if (p)
2239 		*p = '\0';
2240 
2241 	if (*s) {
2242 		ret = __perf_evlist__parse_mmap_pages(&mmap_pages, s);
2243 		if (ret)
2244 			goto out_free;
2245 		opts->mmap_pages = mmap_pages;
2246 	}
2247 
2248 	if (!p) {
2249 		ret = 0;
2250 		goto out_free;
2251 	}
2252 
2253 	ret = __perf_evlist__parse_mmap_pages(&mmap_pages, p + 1);
2254 	if (ret)
2255 		goto out_free;
2256 
2257 	opts->auxtrace_mmap_pages = mmap_pages;
2258 
2259 out_free:
2260 	free(s);
2261 	return ret;
2262 }
2263 
2264 static void switch_output_size_warn(struct record *rec)
2265 {
2266 	u64 wakeup_size = evlist__mmap_size(rec->opts.mmap_pages);
2267 	struct switch_output *s = &rec->switch_output;
2268 
2269 	wakeup_size /= 2;
2270 
2271 	if (s->size < wakeup_size) {
2272 		char buf[100];
2273 
2274 		unit_number__scnprintf(buf, sizeof(buf), wakeup_size);
2275 		pr_warning("WARNING: switch-output data size lower than "
2276 			   "wakeup kernel buffer size (%s) "
2277 			   "expect bigger perf.data sizes\n", buf);
2278 	}
2279 }
2280 
2281 static int switch_output_setup(struct record *rec)
2282 {
2283 	struct switch_output *s = &rec->switch_output;
2284 	static struct parse_tag tags_size[] = {
2285 		{ .tag  = 'B', .mult = 1       },
2286 		{ .tag  = 'K', .mult = 1 << 10 },
2287 		{ .tag  = 'M', .mult = 1 << 20 },
2288 		{ .tag  = 'G', .mult = 1 << 30 },
2289 		{ .tag  = 0 },
2290 	};
2291 	static struct parse_tag tags_time[] = {
2292 		{ .tag  = 's', .mult = 1        },
2293 		{ .tag  = 'm', .mult = 60       },
2294 		{ .tag  = 'h', .mult = 60*60    },
2295 		{ .tag  = 'd', .mult = 60*60*24 },
2296 		{ .tag  = 0 },
2297 	};
2298 	unsigned long val;
2299 
2300 	/*
2301 	 * If we're using --switch-output-events, then we imply its
2302 	 * --switch-output=signal, as we'll send a SIGUSR2 from the side band
2303 	 *  thread to its parent.
2304 	 */
2305 	if (rec->switch_output_event_set)
2306 		goto do_signal;
2307 
2308 	if (!s->set)
2309 		return 0;
2310 
2311 	if (!strcmp(s->str, "signal")) {
2312 do_signal:
2313 		s->signal = true;
2314 		pr_debug("switch-output with SIGUSR2 signal\n");
2315 		goto enabled;
2316 	}
2317 
2318 	val = parse_tag_value(s->str, tags_size);
2319 	if (val != (unsigned long) -1) {
2320 		s->size = val;
2321 		pr_debug("switch-output with %s size threshold\n", s->str);
2322 		goto enabled;
2323 	}
2324 
2325 	val = parse_tag_value(s->str, tags_time);
2326 	if (val != (unsigned long) -1) {
2327 		s->time = val;
2328 		pr_debug("switch-output with %s time threshold (%lu seconds)\n",
2329 			 s->str, s->time);
2330 		goto enabled;
2331 	}
2332 
2333 	return -1;
2334 
2335 enabled:
2336 	rec->timestamp_filename = true;
2337 	s->enabled              = true;
2338 
2339 	if (s->size && !rec->opts.no_buffering)
2340 		switch_output_size_warn(rec);
2341 
2342 	return 0;
2343 }
2344 
2345 static const char * const __record_usage[] = {
2346 	"perf record [<options>] [<command>]",
2347 	"perf record [<options>] -- <command> [<options>]",
2348 	NULL
2349 };
2350 const char * const *record_usage = __record_usage;
2351 
2352 static int build_id__process_mmap(struct perf_tool *tool, union perf_event *event,
2353 				  struct perf_sample *sample, struct machine *machine)
2354 {
2355 	/*
2356 	 * We already have the kernel maps, put in place via perf_session__create_kernel_maps()
2357 	 * no need to add them twice.
2358 	 */
2359 	if (!(event->header.misc & PERF_RECORD_MISC_USER))
2360 		return 0;
2361 	return perf_event__process_mmap(tool, event, sample, machine);
2362 }
2363 
2364 static int build_id__process_mmap2(struct perf_tool *tool, union perf_event *event,
2365 				   struct perf_sample *sample, struct machine *machine)
2366 {
2367 	/*
2368 	 * We already have the kernel maps, put in place via perf_session__create_kernel_maps()
2369 	 * no need to add them twice.
2370 	 */
2371 	if (!(event->header.misc & PERF_RECORD_MISC_USER))
2372 		return 0;
2373 
2374 	return perf_event__process_mmap2(tool, event, sample, machine);
2375 }
2376 
2377 /*
2378  * XXX Ideally would be local to cmd_record() and passed to a record__new
2379  * because we need to have access to it in record__exit, that is called
2380  * after cmd_record() exits, but since record_options need to be accessible to
2381  * builtin-script, leave it here.
2382  *
2383  * At least we don't ouch it in all the other functions here directly.
2384  *
2385  * Just say no to tons of global variables, sigh.
2386  */
2387 static struct record record = {
2388 	.opts = {
2389 		.sample_time	     = true,
2390 		.mmap_pages	     = UINT_MAX,
2391 		.user_freq	     = UINT_MAX,
2392 		.user_interval	     = ULLONG_MAX,
2393 		.freq		     = 4000,
2394 		.target		     = {
2395 			.uses_mmap   = true,
2396 			.default_per_cpu = true,
2397 		},
2398 		.mmap_flush          = MMAP_FLUSH_DEFAULT,
2399 		.nr_threads_synthesize = 1,
2400 	},
2401 	.tool = {
2402 		.sample		= process_sample_event,
2403 		.fork		= perf_event__process_fork,
2404 		.exit		= perf_event__process_exit,
2405 		.comm		= perf_event__process_comm,
2406 		.namespaces	= perf_event__process_namespaces,
2407 		.mmap		= build_id__process_mmap,
2408 		.mmap2		= build_id__process_mmap2,
2409 		.ordered_events	= true,
2410 	},
2411 };
2412 
2413 const char record_callchain_help[] = CALLCHAIN_RECORD_HELP
2414 	"\n\t\t\t\tDefault: fp";
2415 
2416 static bool dry_run;
2417 
2418 /*
2419  * XXX Will stay a global variable till we fix builtin-script.c to stop messing
2420  * with it and switch to use the library functions in perf_evlist that came
2421  * from builtin-record.c, i.e. use record_opts,
2422  * perf_evlist__prepare_workload, etc instead of fork+exec'in 'perf record',
2423  * using pipes, etc.
2424  */
2425 static struct option __record_options[] = {
2426 	OPT_CALLBACK('e', "event", &record.evlist, "event",
2427 		     "event selector. use 'perf list' to list available events",
2428 		     parse_events_option),
2429 	OPT_CALLBACK(0, "filter", &record.evlist, "filter",
2430 		     "event filter", parse_filter),
2431 	OPT_CALLBACK_NOOPT(0, "exclude-perf", &record.evlist,
2432 			   NULL, "don't record events from perf itself",
2433 			   exclude_perf),
2434 	OPT_STRING('p', "pid", &record.opts.target.pid, "pid",
2435 		    "record events on existing process id"),
2436 	OPT_STRING('t', "tid", &record.opts.target.tid, "tid",
2437 		    "record events on existing thread id"),
2438 	OPT_INTEGER('r', "realtime", &record.realtime_prio,
2439 		    "collect data with this RT SCHED_FIFO priority"),
2440 	OPT_BOOLEAN(0, "no-buffering", &record.opts.no_buffering,
2441 		    "collect data without buffering"),
2442 	OPT_BOOLEAN('R', "raw-samples", &record.opts.raw_samples,
2443 		    "collect raw sample records from all opened counters"),
2444 	OPT_BOOLEAN('a', "all-cpus", &record.opts.target.system_wide,
2445 			    "system-wide collection from all CPUs"),
2446 	OPT_STRING('C', "cpu", &record.opts.target.cpu_list, "cpu",
2447 		    "list of cpus to monitor"),
2448 	OPT_U64('c', "count", &record.opts.user_interval, "event period to sample"),
2449 	OPT_STRING('o', "output", &record.data.path, "file",
2450 		    "output file name"),
2451 	OPT_BOOLEAN_SET('i', "no-inherit", &record.opts.no_inherit,
2452 			&record.opts.no_inherit_set,
2453 			"child tasks do not inherit counters"),
2454 	OPT_BOOLEAN(0, "tail-synthesize", &record.opts.tail_synthesize,
2455 		    "synthesize non-sample events at the end of output"),
2456 	OPT_BOOLEAN(0, "overwrite", &record.opts.overwrite, "use overwrite mode"),
2457 	OPT_BOOLEAN(0, "no-bpf-event", &record.opts.no_bpf_event, "record bpf events"),
2458 	OPT_BOOLEAN(0, "strict-freq", &record.opts.strict_freq,
2459 		    "Fail if the specified frequency can't be used"),
2460 	OPT_CALLBACK('F', "freq", &record.opts, "freq or 'max'",
2461 		     "profile at this frequency",
2462 		      record__parse_freq),
2463 	OPT_CALLBACK('m', "mmap-pages", &record.opts, "pages[,pages]",
2464 		     "number of mmap data pages and AUX area tracing mmap pages",
2465 		     record__parse_mmap_pages),
2466 	OPT_CALLBACK(0, "mmap-flush", &record.opts, "number",
2467 		     "Minimal number of bytes that is extracted from mmap data pages (default: 1)",
2468 		     record__mmap_flush_parse),
2469 	OPT_BOOLEAN(0, "group", &record.opts.group,
2470 		    "put the counters into a counter group"),
2471 	OPT_CALLBACK_NOOPT('g', NULL, &callchain_param,
2472 			   NULL, "enables call-graph recording" ,
2473 			   &record_callchain_opt),
2474 	OPT_CALLBACK(0, "call-graph", &record.opts,
2475 		     "record_mode[,record_size]", record_callchain_help,
2476 		     &record_parse_callchain_opt),
2477 	OPT_INCR('v', "verbose", &verbose,
2478 		    "be more verbose (show counter open errors, etc)"),
2479 	OPT_BOOLEAN('q', "quiet", &quiet, "don't print any message"),
2480 	OPT_BOOLEAN('s', "stat", &record.opts.inherit_stat,
2481 		    "per thread counts"),
2482 	OPT_BOOLEAN('d', "data", &record.opts.sample_address, "Record the sample addresses"),
2483 	OPT_BOOLEAN(0, "phys-data", &record.opts.sample_phys_addr,
2484 		    "Record the sample physical addresses"),
2485 	OPT_BOOLEAN(0, "sample-cpu", &record.opts.sample_cpu, "Record the sample cpu"),
2486 	OPT_BOOLEAN_SET('T', "timestamp", &record.opts.sample_time,
2487 			&record.opts.sample_time_set,
2488 			"Record the sample timestamps"),
2489 	OPT_BOOLEAN_SET('P', "period", &record.opts.period, &record.opts.period_set,
2490 			"Record the sample period"),
2491 	OPT_BOOLEAN('n', "no-samples", &record.opts.no_samples,
2492 		    "don't sample"),
2493 	OPT_BOOLEAN_SET('N', "no-buildid-cache", &record.no_buildid_cache,
2494 			&record.no_buildid_cache_set,
2495 			"do not update the buildid cache"),
2496 	OPT_BOOLEAN_SET('B', "no-buildid", &record.no_buildid,
2497 			&record.no_buildid_set,
2498 			"do not collect buildids in perf.data"),
2499 	OPT_CALLBACK('G', "cgroup", &record.evlist, "name",
2500 		     "monitor event in cgroup name only",
2501 		     parse_cgroups),
2502 	OPT_UINTEGER('D', "delay", &record.opts.initial_delay,
2503 		  "ms to wait before starting measurement after program start"),
2504 	OPT_BOOLEAN(0, "kcore", &record.opts.kcore, "copy /proc/kcore"),
2505 	OPT_STRING('u', "uid", &record.opts.target.uid_str, "user",
2506 		   "user to profile"),
2507 
2508 	OPT_CALLBACK_NOOPT('b', "branch-any", &record.opts.branch_stack,
2509 		     "branch any", "sample any taken branches",
2510 		     parse_branch_stack),
2511 
2512 	OPT_CALLBACK('j', "branch-filter", &record.opts.branch_stack,
2513 		     "branch filter mask", "branch stack filter modes",
2514 		     parse_branch_stack),
2515 	OPT_BOOLEAN('W', "weight", &record.opts.sample_weight,
2516 		    "sample by weight (on special events only)"),
2517 	OPT_BOOLEAN(0, "transaction", &record.opts.sample_transaction,
2518 		    "sample transaction flags (special events only)"),
2519 	OPT_BOOLEAN(0, "per-thread", &record.opts.target.per_thread,
2520 		    "use per-thread mmaps"),
2521 	OPT_CALLBACK_OPTARG('I', "intr-regs", &record.opts.sample_intr_regs, NULL, "any register",
2522 		    "sample selected machine registers on interrupt,"
2523 		    " use '-I?' to list register names", parse_intr_regs),
2524 	OPT_CALLBACK_OPTARG(0, "user-regs", &record.opts.sample_user_regs, NULL, "any register",
2525 		    "sample selected machine registers on interrupt,"
2526 		    " use '--user-regs=?' to list register names", parse_user_regs),
2527 	OPT_BOOLEAN(0, "running-time", &record.opts.running_time,
2528 		    "Record running/enabled time of read (:S) events"),
2529 	OPT_CALLBACK('k', "clockid", &record.opts,
2530 	"clockid", "clockid to use for events, see clock_gettime()",
2531 	parse_clockid),
2532 	OPT_STRING_OPTARG('S', "snapshot", &record.opts.auxtrace_snapshot_opts,
2533 			  "opts", "AUX area tracing Snapshot Mode", ""),
2534 	OPT_STRING_OPTARG(0, "aux-sample", &record.opts.auxtrace_sample_opts,
2535 			  "opts", "sample AUX area", ""),
2536 	OPT_UINTEGER(0, "proc-map-timeout", &proc_map_timeout,
2537 			"per thread proc mmap processing timeout in ms"),
2538 	OPT_BOOLEAN(0, "namespaces", &record.opts.record_namespaces,
2539 		    "Record namespaces events"),
2540 	OPT_BOOLEAN(0, "all-cgroups", &record.opts.record_cgroup,
2541 		    "Record cgroup events"),
2542 	OPT_BOOLEAN_SET(0, "switch-events", &record.opts.record_switch_events,
2543 			&record.opts.record_switch_events_set,
2544 			"Record context switch events"),
2545 	OPT_BOOLEAN_FLAG(0, "all-kernel", &record.opts.all_kernel,
2546 			 "Configure all used events to run in kernel space.",
2547 			 PARSE_OPT_EXCLUSIVE),
2548 	OPT_BOOLEAN_FLAG(0, "all-user", &record.opts.all_user,
2549 			 "Configure all used events to run in user space.",
2550 			 PARSE_OPT_EXCLUSIVE),
2551 	OPT_BOOLEAN(0, "kernel-callchains", &record.opts.kernel_callchains,
2552 		    "collect kernel callchains"),
2553 	OPT_BOOLEAN(0, "user-callchains", &record.opts.user_callchains,
2554 		    "collect user callchains"),
2555 	OPT_STRING(0, "clang-path", &llvm_param.clang_path, "clang path",
2556 		   "clang binary to use for compiling BPF scriptlets"),
2557 	OPT_STRING(0, "clang-opt", &llvm_param.clang_opt, "clang options",
2558 		   "options passed to clang when compiling BPF scriptlets"),
2559 	OPT_STRING(0, "vmlinux", &symbol_conf.vmlinux_name,
2560 		   "file", "vmlinux pathname"),
2561 	OPT_BOOLEAN(0, "buildid-all", &record.buildid_all,
2562 		    "Record build-id of all DSOs regardless of hits"),
2563 	OPT_BOOLEAN(0, "timestamp-filename", &record.timestamp_filename,
2564 		    "append timestamp to output filename"),
2565 	OPT_BOOLEAN(0, "timestamp-boundary", &record.timestamp_boundary,
2566 		    "Record timestamp boundary (time of first/last samples)"),
2567 	OPT_STRING_OPTARG_SET(0, "switch-output", &record.switch_output.str,
2568 			  &record.switch_output.set, "signal or size[BKMG] or time[smhd]",
2569 			  "Switch output when receiving SIGUSR2 (signal) or cross a size or time threshold",
2570 			  "signal"),
2571 	OPT_CALLBACK_SET(0, "switch-output-event", &record.sb_evlist, &record.switch_output_event_set, "switch output event",
2572 			 "switch output event selector. use 'perf list' to list available events",
2573 			 parse_events_option_new_evlist),
2574 	OPT_INTEGER(0, "switch-max-files", &record.switch_output.num_files,
2575 		   "Limit number of switch output generated files"),
2576 	OPT_BOOLEAN(0, "dry-run", &dry_run,
2577 		    "Parse options then exit"),
2578 #ifdef HAVE_AIO_SUPPORT
2579 	OPT_CALLBACK_OPTARG(0, "aio", &record.opts,
2580 		     &nr_cblocks_default, "n", "Use <n> control blocks in asynchronous trace writing mode (default: 1, max: 4)",
2581 		     record__aio_parse),
2582 #endif
2583 	OPT_CALLBACK(0, "affinity", &record.opts, "node|cpu",
2584 		     "Set affinity mask of trace reading thread to NUMA node cpu mask or cpu of processed mmap buffer",
2585 		     record__parse_affinity),
2586 #ifdef HAVE_ZSTD_SUPPORT
2587 	OPT_CALLBACK_OPTARG('z', "compression-level", &record.opts, &comp_level_default,
2588 			    "n", "Compressed records using specified level (default: 1 - fastest compression, 22 - greatest compression)",
2589 			    record__parse_comp_level),
2590 #endif
2591 	OPT_CALLBACK(0, "max-size", &record.output_max_size,
2592 		     "size", "Limit the maximum size of the output file", parse_output_max_size),
2593 	OPT_UINTEGER(0, "num-thread-synthesize",
2594 		     &record.opts.nr_threads_synthesize,
2595 		     "number of threads to run for event synthesis"),
2596 #ifdef HAVE_LIBPFM
2597 	OPT_CALLBACK(0, "pfm-events", &record.evlist, "event",
2598 		"libpfm4 event selector. use 'perf list' to list available events",
2599 		parse_libpfm_events_option),
2600 #endif
2601 	OPT_END()
2602 };
2603 
2604 struct option *record_options = __record_options;
2605 
2606 int cmd_record(int argc, const char **argv)
2607 {
2608 	int err;
2609 	struct record *rec = &record;
2610 	char errbuf[BUFSIZ];
2611 
2612 	setlocale(LC_ALL, "");
2613 
2614 #ifndef HAVE_LIBBPF_SUPPORT
2615 # define set_nobuild(s, l, c) set_option_nobuild(record_options, s, l, "NO_LIBBPF=1", c)
2616 	set_nobuild('\0', "clang-path", true);
2617 	set_nobuild('\0', "clang-opt", true);
2618 # undef set_nobuild
2619 #endif
2620 
2621 #ifndef HAVE_BPF_PROLOGUE
2622 # if !defined (HAVE_DWARF_SUPPORT)
2623 #  define REASON  "NO_DWARF=1"
2624 # elif !defined (HAVE_LIBBPF_SUPPORT)
2625 #  define REASON  "NO_LIBBPF=1"
2626 # else
2627 #  define REASON  "this architecture doesn't support BPF prologue"
2628 # endif
2629 # define set_nobuild(s, l, c) set_option_nobuild(record_options, s, l, REASON, c)
2630 	set_nobuild('\0', "vmlinux", true);
2631 # undef set_nobuild
2632 # undef REASON
2633 #endif
2634 
2635 	rec->opts.affinity = PERF_AFFINITY_SYS;
2636 
2637 	rec->evlist = evlist__new();
2638 	if (rec->evlist == NULL)
2639 		return -ENOMEM;
2640 
2641 	err = perf_config(perf_record_config, rec);
2642 	if (err)
2643 		return err;
2644 
2645 	argc = parse_options(argc, argv, record_options, record_usage,
2646 			    PARSE_OPT_STOP_AT_NON_OPTION);
2647 	if (quiet)
2648 		perf_quiet_option();
2649 
2650 	/* Make system wide (-a) the default target. */
2651 	if (!argc && target__none(&rec->opts.target))
2652 		rec->opts.target.system_wide = true;
2653 
2654 	if (nr_cgroups && !rec->opts.target.system_wide) {
2655 		usage_with_options_msg(record_usage, record_options,
2656 			"cgroup monitoring only available in system-wide mode");
2657 
2658 	}
2659 
2660 	if (rec->opts.kcore)
2661 		rec->data.is_dir = true;
2662 
2663 	if (rec->opts.comp_level != 0) {
2664 		pr_debug("Compression enabled, disabling build id collection at the end of the session.\n");
2665 		rec->no_buildid = true;
2666 	}
2667 
2668 	if (rec->opts.record_switch_events &&
2669 	    !perf_can_record_switch_events()) {
2670 		ui__error("kernel does not support recording context switch events\n");
2671 		parse_options_usage(record_usage, record_options, "switch-events", 0);
2672 		return -EINVAL;
2673 	}
2674 
2675 	if (switch_output_setup(rec)) {
2676 		parse_options_usage(record_usage, record_options, "switch-output", 0);
2677 		return -EINVAL;
2678 	}
2679 
2680 	if (rec->switch_output.time) {
2681 		signal(SIGALRM, alarm_sig_handler);
2682 		alarm(rec->switch_output.time);
2683 	}
2684 
2685 	if (rec->switch_output.num_files) {
2686 		rec->switch_output.filenames = calloc(sizeof(char *),
2687 						      rec->switch_output.num_files);
2688 		if (!rec->switch_output.filenames)
2689 			return -EINVAL;
2690 	}
2691 
2692 	/*
2693 	 * Allow aliases to facilitate the lookup of symbols for address
2694 	 * filters. Refer to auxtrace_parse_filters().
2695 	 */
2696 	symbol_conf.allow_aliases = true;
2697 
2698 	symbol__init(NULL);
2699 
2700 	if (rec->opts.affinity != PERF_AFFINITY_SYS) {
2701 		rec->affinity_mask.nbits = cpu__max_cpu();
2702 		rec->affinity_mask.bits = bitmap_alloc(rec->affinity_mask.nbits);
2703 		if (!rec->affinity_mask.bits) {
2704 			pr_err("Failed to allocate thread mask for %zd cpus\n", rec->affinity_mask.nbits);
2705 			return -ENOMEM;
2706 		}
2707 		pr_debug2("thread mask[%zd]: empty\n", rec->affinity_mask.nbits);
2708 	}
2709 
2710 	err = record__auxtrace_init(rec);
2711 	if (err)
2712 		goto out;
2713 
2714 	if (dry_run)
2715 		goto out;
2716 
2717 	err = bpf__setup_stdout(rec->evlist);
2718 	if (err) {
2719 		bpf__strerror_setup_stdout(rec->evlist, err, errbuf, sizeof(errbuf));
2720 		pr_err("ERROR: Setup BPF stdout failed: %s\n",
2721 			 errbuf);
2722 		goto out;
2723 	}
2724 
2725 	err = -ENOMEM;
2726 
2727 	if (rec->no_buildid_cache || rec->no_buildid) {
2728 		disable_buildid_cache();
2729 	} else if (rec->switch_output.enabled) {
2730 		/*
2731 		 * In 'perf record --switch-output', disable buildid
2732 		 * generation by default to reduce data file switching
2733 		 * overhead. Still generate buildid if they are required
2734 		 * explicitly using
2735 		 *
2736 		 *  perf record --switch-output --no-no-buildid \
2737 		 *              --no-no-buildid-cache
2738 		 *
2739 		 * Following code equals to:
2740 		 *
2741 		 * if ((rec->no_buildid || !rec->no_buildid_set) &&
2742 		 *     (rec->no_buildid_cache || !rec->no_buildid_cache_set))
2743 		 *         disable_buildid_cache();
2744 		 */
2745 		bool disable = true;
2746 
2747 		if (rec->no_buildid_set && !rec->no_buildid)
2748 			disable = false;
2749 		if (rec->no_buildid_cache_set && !rec->no_buildid_cache)
2750 			disable = false;
2751 		if (disable) {
2752 			rec->no_buildid = true;
2753 			rec->no_buildid_cache = true;
2754 			disable_buildid_cache();
2755 		}
2756 	}
2757 
2758 	if (record.opts.overwrite)
2759 		record.opts.tail_synthesize = true;
2760 
2761 	if (rec->evlist->core.nr_entries == 0 &&
2762 	    __evlist__add_default(rec->evlist, !record.opts.no_samples) < 0) {
2763 		pr_err("Not enough memory for event selector list\n");
2764 		goto out;
2765 	}
2766 
2767 	if (rec->opts.target.tid && !rec->opts.no_inherit_set)
2768 		rec->opts.no_inherit = true;
2769 
2770 	err = target__validate(&rec->opts.target);
2771 	if (err) {
2772 		target__strerror(&rec->opts.target, err, errbuf, BUFSIZ);
2773 		ui__warning("%s\n", errbuf);
2774 	}
2775 
2776 	err = target__parse_uid(&rec->opts.target);
2777 	if (err) {
2778 		int saved_errno = errno;
2779 
2780 		target__strerror(&rec->opts.target, err, errbuf, BUFSIZ);
2781 		ui__error("%s", errbuf);
2782 
2783 		err = -saved_errno;
2784 		goto out;
2785 	}
2786 
2787 	/* Enable ignoring missing threads when -u/-p option is defined. */
2788 	rec->opts.ignore_missing_thread = rec->opts.target.uid != UINT_MAX || rec->opts.target.pid;
2789 
2790 	err = -ENOMEM;
2791 	if (perf_evlist__create_maps(rec->evlist, &rec->opts.target) < 0)
2792 		usage_with_options(record_usage, record_options);
2793 
2794 	err = auxtrace_record__options(rec->itr, rec->evlist, &rec->opts);
2795 	if (err)
2796 		goto out;
2797 
2798 	/*
2799 	 * We take all buildids when the file contains
2800 	 * AUX area tracing data because we do not decode the
2801 	 * trace because it would take too long.
2802 	 */
2803 	if (rec->opts.full_auxtrace)
2804 		rec->buildid_all = true;
2805 
2806 	if (rec->opts.text_poke) {
2807 		err = record__config_text_poke(rec->evlist);
2808 		if (err) {
2809 			pr_err("record__config_text_poke failed, error %d\n", err);
2810 			goto out;
2811 		}
2812 	}
2813 
2814 	if (record_opts__config(&rec->opts)) {
2815 		err = -EINVAL;
2816 		goto out;
2817 	}
2818 
2819 	if (rec->opts.nr_cblocks > nr_cblocks_max)
2820 		rec->opts.nr_cblocks = nr_cblocks_max;
2821 	pr_debug("nr_cblocks: %d\n", rec->opts.nr_cblocks);
2822 
2823 	pr_debug("affinity: %s\n", affinity_tags[rec->opts.affinity]);
2824 	pr_debug("mmap flush: %d\n", rec->opts.mmap_flush);
2825 
2826 	if (rec->opts.comp_level > comp_level_max)
2827 		rec->opts.comp_level = comp_level_max;
2828 	pr_debug("comp level: %d\n", rec->opts.comp_level);
2829 
2830 	err = __cmd_record(&record, argc, argv);
2831 out:
2832 	bitmap_free(rec->affinity_mask.bits);
2833 	evlist__delete(rec->evlist);
2834 	symbol__exit();
2835 	auxtrace_record__free(rec->itr);
2836 	return err;
2837 }
2838 
2839 static void snapshot_sig_handler(int sig __maybe_unused)
2840 {
2841 	struct record *rec = &record;
2842 
2843 	if (trigger_is_ready(&auxtrace_snapshot_trigger)) {
2844 		trigger_hit(&auxtrace_snapshot_trigger);
2845 		auxtrace_record__snapshot_started = 1;
2846 		if (auxtrace_record__snapshot_start(record.itr))
2847 			trigger_error(&auxtrace_snapshot_trigger);
2848 	}
2849 
2850 	if (switch_output_signal(rec))
2851 		trigger_hit(&switch_output_trigger);
2852 }
2853 
2854 static void alarm_sig_handler(int sig __maybe_unused)
2855 {
2856 	struct record *rec = &record;
2857 
2858 	if (switch_output_time(rec))
2859 		trigger_hit(&switch_output_trigger);
2860 }
2861