xref: /openbmc/linux/tools/perf/builtin-record.c (revision 6cd70754)
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * builtin-record.c
4  *
5  * Builtin record command: Record the profile of a workload
6  * (or a CPU, or a PID) into the perf.data output file - for
7  * later analysis via perf report.
8  */
9 #include "builtin.h"
10 
11 #include "util/build-id.h"
12 #include <subcmd/parse-options.h>
13 #include "util/parse-events.h"
14 #include "util/config.h"
15 
16 #include "util/callchain.h"
17 #include "util/cgroup.h"
18 #include "util/header.h"
19 #include "util/event.h"
20 #include "util/evlist.h"
21 #include "util/evsel.h"
22 #include "util/debug.h"
23 #include "util/mmap.h"
24 #include "util/target.h"
25 #include "util/session.h"
26 #include "util/tool.h"
27 #include "util/symbol.h"
28 #include "util/record.h"
29 #include "util/cpumap.h"
30 #include "util/thread_map.h"
31 #include "util/data.h"
32 #include "util/perf_regs.h"
33 #include "util/auxtrace.h"
34 #include "util/tsc.h"
35 #include "util/parse-branch-options.h"
36 #include "util/parse-regs-options.h"
37 #include "util/perf_api_probe.h"
38 #include "util/llvm-utils.h"
39 #include "util/bpf-loader.h"
40 #include "util/trigger.h"
41 #include "util/perf-hooks.h"
42 #include "util/cpu-set-sched.h"
43 #include "util/synthetic-events.h"
44 #include "util/time-utils.h"
45 #include "util/units.h"
46 #include "util/bpf-event.h"
47 #include "util/util.h"
48 #include "util/pfm.h"
49 #include "util/clockid.h"
50 #include "asm/bug.h"
51 #include "perf.h"
52 
53 #include <errno.h>
54 #include <inttypes.h>
55 #include <locale.h>
56 #include <poll.h>
57 #include <pthread.h>
58 #include <unistd.h>
59 #include <sched.h>
60 #include <signal.h>
61 #ifdef HAVE_EVENTFD_SUPPORT
62 #include <sys/eventfd.h>
63 #endif
64 #include <sys/mman.h>
65 #include <sys/wait.h>
66 #include <sys/types.h>
67 #include <sys/stat.h>
68 #include <fcntl.h>
69 #include <linux/err.h>
70 #include <linux/string.h>
71 #include <linux/time64.h>
72 #include <linux/zalloc.h>
73 #include <linux/bitmap.h>
74 #include <sys/time.h>
75 
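/*
 * State for --switch-output: rotate the perf.data output when SIGUSR2 is
 * received or when a size/time threshold is reached.
 */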
76 struct switch_output {
77 	bool		 enabled;
78 	bool		 signal;
79 	unsigned long	 size;
80 	unsigned long	 time;
81 	const char	*str;
82 	bool		 set;
83 	char		 **filenames;
84 	int		 num_files;
85 	int		 cur_file;
86 };
87 
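/* Aggregate state for a single 'perf record' session. */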
88 struct record {
89 	struct perf_tool	tool;
90 	struct record_opts	opts;
91 	u64			bytes_written;
92 	struct perf_data	data;
93 	struct auxtrace_record	*itr;
94 	struct evlist	*evlist;
95 	struct perf_session	*session;
96 	struct evlist		*sb_evlist;
97 	pthread_t		thread_id;
98 	int			realtime_prio;
99 	bool			switch_output_event_set;
100 	bool			no_buildid;
101 	bool			no_buildid_set;
102 	bool			no_buildid_cache;
103 	bool			no_buildid_cache_set;
104 	bool			buildid_all;
105 	bool			buildid_mmap;
106 	bool			timestamp_filename;
107 	bool			timestamp_boundary;
108 	struct switch_output	switch_output;
109 	unsigned long long	samples;
110 	struct mmap_cpu_mask	affinity_mask;
111 	unsigned long		output_max_size;	/* = 0: unlimited */
112 };
113 
114 static volatile int done;
115 
116 static volatile int auxtrace_record__snapshot_started;
117 static DEFINE_TRIGGER(auxtrace_snapshot_trigger);
118 static DEFINE_TRIGGER(switch_output_trigger);
119 
120 static const char *affinity_tags[PERF_AFFINITY_MAX] = {
121 	"SYS", "NODE", "CPU"
122 };
123 
124 static bool switch_output_signal(struct record *rec)
125 {
126 	return rec->switch_output.signal &&
127 	       trigger_is_ready(&switch_output_trigger);
128 }
129 
130 static bool switch_output_size(struct record *rec)
131 {
132 	return rec->switch_output.size &&
133 	       trigger_is_ready(&switch_output_trigger) &&
134 	       (rec->bytes_written >= rec->switch_output.size);
135 }
136 
137 static bool switch_output_time(struct record *rec)
138 {
139 	return rec->switch_output.time &&
140 	       trigger_is_ready(&switch_output_trigger);
141 }
142 
143 static bool record__output_max_size_exceeded(struct record *rec)
144 {
145 	return rec->output_max_size &&
146 	       (rec->bytes_written >= rec->output_max_size);
147 }
148 
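/*
 * Write a block of bytes to the output file, accounting the size against
 * the output size limit and the --switch-output size threshold.
 */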
149 static int record__write(struct record *rec, struct mmap *map __maybe_unused,
150 			 void *bf, size_t size)
151 {
152 	struct perf_data_file *file = &rec->session->data->file;
153 
154 	if (perf_data_file__write(file, bf, size) < 0) {
155 		pr_err("failed to write perf data, error: %m\n");
156 		return -1;
157 	}
158 
159 	rec->bytes_written += size;
160 
161 	if (record__output_max_size_exceeded(rec) && !done) {
162 		fprintf(stderr, "[ perf record: perf size limit reached (%" PRIu64 " KB),"
163 				" stopping session ]\n",
164 				rec->bytes_written >> 10);
165 		done = 1;
166 	}
167 
168 	if (switch_output_size(rec))
169 		trigger_hit(&switch_output_trigger);
170 
171 	return 0;
172 }
173 
174 static int record__aio_enabled(struct record *rec);
175 static int record__comp_enabled(struct record *rec);
176 static size_t zstd_compress(struct perf_session *session, void *dst, size_t dst_size,
177 			    void *src, size_t src_size);
178 
179 #ifdef HAVE_AIO_SUPPORT
180 static int record__aio_write(struct aiocb *cblock, int trace_fd,
181 		void *buf, size_t size, off_t off)
182 {
183 	int rc;
184 
185 	cblock->aio_fildes = trace_fd;
186 	cblock->aio_buf    = buf;
187 	cblock->aio_nbytes = size;
188 	cblock->aio_offset = off;
189 	cblock->aio_sigevent.sigev_notify = SIGEV_NONE;
190 
191 	do {
192 		rc = aio_write(cblock);
193 		if (rc == 0) {
194 			break;
195 		} else if (errno != EAGAIN) {
196 			cblock->aio_fildes = -1;
197 			pr_err("failed to queue perf data, error: %m\n");
198 			break;
199 		}
200 	} while (1);
201 
202 	return rc;
203 }
204 
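/*
 * Check an in-flight aio write: if it has fully completed, drop the mmap
 * reference taken in record__aio_pushfn(); otherwise restart the write for
 * the remaining bytes.
 */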
205 static int record__aio_complete(struct mmap *md, struct aiocb *cblock)
206 {
207 	void *rem_buf;
208 	off_t rem_off;
209 	size_t rem_size;
210 	int rc, aio_errno;
211 	ssize_t aio_ret, written;
212 
213 	aio_errno = aio_error(cblock);
214 	if (aio_errno == EINPROGRESS)
215 		return 0;
216 
217 	written = aio_ret = aio_return(cblock);
218 	if (aio_ret < 0) {
219 		if (aio_errno != EINTR)
220 			pr_err("failed to write perf data, error: %m\n");
221 		written = 0;
222 	}
223 
224 	rem_size = cblock->aio_nbytes - written;
225 
226 	if (rem_size == 0) {
227 		cblock->aio_fildes = -1;
228 		/*
229 		 * md->refcount is incremented in record__aio_pushfn() for
230 		 * every aio write request started in record__aio_push(), so
231 		 * decrement it now that the request is complete.
232 		 */
233 		perf_mmap__put(&md->core);
234 		rc = 1;
235 	} else {
236 		/*
237 		 * The aio write request may need to be restarted with the
238 		 * remainder if the kernel didn't write the whole
239 		 * chunk at once.
240 		 */
241 		rem_off = cblock->aio_offset + written;
242 		rem_buf = (void *)(cblock->aio_buf + written);
243 		record__aio_write(cblock, cblock->aio_fildes,
244 				rem_buf, rem_size, rem_off);
245 		rc = 0;
246 	}
247 
248 	return rc;
249 }
250 
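/*
 * Wait for in-flight aio writes on this mmap: return the index of the first
 * free control block, or, when sync_all is set, wait until all outstanding
 * writes have completed.
 */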
251 static int record__aio_sync(struct mmap *md, bool sync_all)
252 {
253 	struct aiocb **aiocb = md->aio.aiocb;
254 	struct aiocb *cblocks = md->aio.cblocks;
255 	struct timespec timeout = { 0, 1000 * 1000  * 1 }; /* 1ms */
256 	int i, do_suspend;
257 
258 	do {
259 		do_suspend = 0;
260 		for (i = 0; i < md->aio.nr_cblocks; ++i) {
261 			if (cblocks[i].aio_fildes == -1 || record__aio_complete(md, &cblocks[i])) {
262 				if (sync_all)
263 					aiocb[i] = NULL;
264 				else
265 					return i;
266 			} else {
267 				/*
268 				 * The started aio write is not complete yet,
269 				 * so it has to be waited for before the
270 				 * next allocation.
271 				 */
272 				aiocb[i] = &cblocks[i];
273 				do_suspend = 1;
274 			}
275 		}
276 		if (!do_suspend)
277 			return -1;
278 
279 		while (aio_suspend((const struct aiocb **)aiocb, md->aio.nr_cblocks, &timeout)) {
280 			if (!(errno == EAGAIN || errno == EINTR))
281 				pr_err("failed to sync perf data, error: %m\n");
282 		}
283 	} while (1);
284 }
285 
286 struct record_aio {
287 	struct record	*rec;
288 	void		*data;
289 	size_t		size;
290 };
291 
292 static int record__aio_pushfn(struct mmap *map, void *to, void *buf, size_t size)
293 {
294 	struct record_aio *aio = to;
295 
296 	/*
297 	 * The map->core.base data pointed to by buf is copied into a free
298 	 * map->aio.data[] buffer to release space in the kernel buffer as
299 	 * fast as possible, via perf_mmap__consume() called from perf_mmap__push().
300 	 *
301 	 * That lets the kernel proceed with storing more profiling data into
302 	 * the kernel buffer earlier than other per-cpu kernel buffers are handled.
303 	 *
304 	 * Copying can be done in two steps in case the chunk of profiling data
305 	 * crosses the upper bound of the kernel buffer. In this case we first move
306 	 * the part of the data from map->start till the upper bound and then the
307 	 * remainder from the beginning of the kernel buffer till the end of the data chunk.
308 	 */
309 
310 	if (record__comp_enabled(aio->rec)) {
311 		size = zstd_compress(aio->rec->session, aio->data + aio->size,
312 				     mmap__mmap_len(map) - aio->size,
313 				     buf, size);
314 	} else {
315 		memcpy(aio->data + aio->size, buf, size);
316 	}
317 
318 	if (!aio->size) {
319 		/*
320 		 * Increment map->refcount to guard the map->aio.data[] buffer
321 		 * from premature deallocation, because the map object can be
322 		 * released earlier than the aio write request started on the
323 		 * map->aio.data[] buffer completes.
324 		 *
325 		 * perf_mmap__put() is done at record__aio_complete() after the
326 		 * started aio request completes, or at record__aio_push()
327 		 * if the request failed to start.
328 		 */
329 		perf_mmap__get(&map->core);
330 	}
331 
332 	aio->size += size;
333 
334 	return size;
335 }
336 
337 static int record__aio_push(struct record *rec, struct mmap *map, off_t *off)
338 {
339 	int ret, idx;
340 	int trace_fd = rec->session->data->file.fd;
341 	struct record_aio aio = { .rec = rec, .size = 0 };
342 
343 	/*
344 	 * Call record__aio_sync() to wait until a map->aio.data[] buffer
345 	 * becomes available after the previous aio write operation completes.
346 	 */
347 
348 	idx = record__aio_sync(map, false);
349 	aio.data = map->aio.data[idx];
350 	ret = perf_mmap__push(map, &aio, record__aio_pushfn);
351 	if (ret != 0) /* ret > 0 - no data, ret < 0 - error */
352 		return ret;
353 
354 	rec->samples++;
355 	ret = record__aio_write(&(map->aio.cblocks[idx]), trace_fd, aio.data, aio.size, *off);
356 	if (!ret) {
357 		*off += aio.size;
358 		rec->bytes_written += aio.size;
359 		if (switch_output_size(rec))
360 			trigger_hit(&switch_output_trigger);
361 	} else {
362 		/*
363 		 * Decrement the map->refcount that was incremented in
364 		 * record__aio_pushfn() if record__aio_write() failed to start;
365 		 * otherwise map->refcount is decremented in record__aio_complete()
366 		 * after the aio write operation finishes successfully.
367 		 */
368 		perf_mmap__put(&map->core);
369 	}
370 
371 	return ret;
372 }
373 
374 static off_t record__aio_get_pos(int trace_fd)
375 {
376 	return lseek(trace_fd, 0, SEEK_CUR);
377 }
378 
379 static void record__aio_set_pos(int trace_fd, off_t pos)
380 {
381 	lseek(trace_fd, pos, SEEK_SET);
382 }
383 
384 static void record__aio_mmap_read_sync(struct record *rec)
385 {
386 	int i;
387 	struct evlist *evlist = rec->evlist;
388 	struct mmap *maps = evlist->mmap;
389 
390 	if (!record__aio_enabled(rec))
391 		return;
392 
393 	for (i = 0; i < evlist->core.nr_mmaps; i++) {
394 		struct mmap *map = &maps[i];
395 
396 		if (map->core.base)
397 			record__aio_sync(map, true);
398 	}
399 }
400 
401 static int nr_cblocks_default = 1;
402 static int nr_cblocks_max = 4;
403 
404 static int record__aio_parse(const struct option *opt,
405 			     const char *str,
406 			     int unset)
407 {
408 	struct record_opts *opts = (struct record_opts *)opt->value;
409 
410 	if (unset) {
411 		opts->nr_cblocks = 0;
412 	} else {
413 		if (str)
414 			opts->nr_cblocks = strtol(str, NULL, 0);
415 		if (!opts->nr_cblocks)
416 			opts->nr_cblocks = nr_cblocks_default;
417 	}
418 
419 	return 0;
420 }
421 #else /* HAVE_AIO_SUPPORT */
422 static int nr_cblocks_max = 0;
423 
424 static int record__aio_push(struct record *rec __maybe_unused, struct mmap *map __maybe_unused,
425 			    off_t *off __maybe_unused)
426 {
427 	return -1;
428 }
429 
430 static off_t record__aio_get_pos(int trace_fd __maybe_unused)
431 {
432 	return -1;
433 }
434 
435 static void record__aio_set_pos(int trace_fd __maybe_unused, off_t pos __maybe_unused)
436 {
437 }
438 
439 static void record__aio_mmap_read_sync(struct record *rec __maybe_unused)
440 {
441 }
442 #endif
443 
444 static int record__aio_enabled(struct record *rec)
445 {
446 	return rec->opts.nr_cblocks > 0;
447 }
448 
449 #define MMAP_FLUSH_DEFAULT 1
450 static int record__mmap_flush_parse(const struct option *opt,
451 				    const char *str,
452 				    int unset)
453 {
454 	int flush_max;
455 	struct record_opts *opts = (struct record_opts *)opt->value;
456 	static struct parse_tag tags[] = {
457 			{ .tag  = 'B', .mult = 1       },
458 			{ .tag  = 'K', .mult = 1 << 10 },
459 			{ .tag  = 'M', .mult = 1 << 20 },
460 			{ .tag  = 'G', .mult = 1 << 30 },
461 			{ .tag  = 0 },
462 	};
463 
464 	if (unset)
465 		return 0;
466 
467 	if (str) {
468 		opts->mmap_flush = parse_tag_value(str, tags);
469 		if (opts->mmap_flush == (int)-1)
470 			opts->mmap_flush = strtol(str, NULL, 0);
471 	}
472 
473 	if (!opts->mmap_flush)
474 		opts->mmap_flush = MMAP_FLUSH_DEFAULT;
475 
476 	flush_max = evlist__mmap_size(opts->mmap_pages);
477 	flush_max /= 4;
478 	if (opts->mmap_flush > flush_max)
479 		opts->mmap_flush = flush_max;
480 
481 	return 0;
482 }
483 
484 #ifdef HAVE_ZSTD_SUPPORT
485 static unsigned int comp_level_default = 1;
486 
487 static int record__parse_comp_level(const struct option *opt, const char *str, int unset)
488 {
489 	struct record_opts *opts = opt->value;
490 
491 	if (unset) {
492 		opts->comp_level = 0;
493 	} else {
494 		if (str)
495 			opts->comp_level = strtol(str, NULL, 0);
496 		if (!opts->comp_level)
497 			opts->comp_level = comp_level_default;
498 	}
499 
500 	return 0;
501 }
502 #endif
503 static unsigned int comp_level_max = 22;
504 
505 static int record__comp_enabled(struct record *rec)
506 {
507 	return rec->opts.comp_level > 0;
508 }
509 
510 static int process_synthesized_event(struct perf_tool *tool,
511 				     union perf_event *event,
512 				     struct perf_sample *sample __maybe_unused,
513 				     struct machine *machine __maybe_unused)
514 {
515 	struct record *rec = container_of(tool, struct record, tool);
516 	return record__write(rec, NULL, event, event->header.size);
517 }
518 
519 static int process_locked_synthesized_event(struct perf_tool *tool,
520 				     union perf_event *event,
521 				     struct perf_sample *sample __maybe_unused,
522 				     struct machine *machine __maybe_unused)
523 {
524 	static pthread_mutex_t synth_lock = PTHREAD_MUTEX_INITIALIZER;
525 	int ret;
526 
527 	pthread_mutex_lock(&synth_lock);
528 	ret = process_synthesized_event(tool, event, sample, machine);
529 	pthread_mutex_unlock(&synth_lock);
530 	return ret;
531 }
532 
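/* perf_mmap__push() callback: optionally compress the chunk, then write it out. */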
533 static int record__pushfn(struct mmap *map, void *to, void *bf, size_t size)
534 {
535 	struct record *rec = to;
536 
537 	if (record__comp_enabled(rec)) {
538 		size = zstd_compress(rec->session, map->data, mmap__mmap_len(map), bf, size);
539 		bf   = map->data;
540 	}
541 
542 	rec->samples++;
543 	return record__write(rec, map, bf, size);
544 }
545 
546 static volatile int signr = -1;
547 static volatile int child_finished;
548 #ifdef HAVE_EVENTFD_SUPPORT
549 static int done_fd = -1;
550 #endif
551 
552 static void sig_handler(int sig)
553 {
554 	if (sig == SIGCHLD)
555 		child_finished = 1;
556 	else
557 		signr = sig;
558 
559 	done = 1;
560 #ifdef HAVE_EVENTFD_SUPPORT
561 {
562 	u64 tmp = 1;
563 	/*
564 	 * It is possible for this signal handler to run after done is checked
565 	 * in the main loop, but before the perf counter fds are polled. If this
566 	 * happens, the poll() will continue to wait even though done is set,
567 	 * and will only break out if either another signal is received, or the
568 	 * counters are ready for read. To ensure the poll() doesn't sleep when
569 	 * done is set, use an eventfd (done_fd) to wake up the poll().
570 	 */
571 	if (write(done_fd, &tmp, sizeof(tmp)) < 0)
572 		pr_err("failed to signal wakeup fd, error: %m\n");
573 }
574 #endif // HAVE_EVENTFD_SUPPORT
575 }
576 
577 static void sigsegv_handler(int sig)
578 {
579 	perf_hooks__recover();
580 	sighandler_dump_stack(sig);
581 }
582 
583 static void record__sig_exit(void)
584 {
585 	if (signr == -1)
586 		return;
587 
588 	signal(signr, SIG_DFL);
589 	raise(signr);
590 }
591 
592 #ifdef HAVE_AUXTRACE_SUPPORT
593 
594 static int record__process_auxtrace(struct perf_tool *tool,
595 				    struct mmap *map,
596 				    union perf_event *event, void *data1,
597 				    size_t len1, void *data2, size_t len2)
598 {
599 	struct record *rec = container_of(tool, struct record, tool);
600 	struct perf_data *data = &rec->data;
601 	size_t padding;
602 	u8 pad[8] = {0};
603 
604 	if (!perf_data__is_pipe(data) && perf_data__is_single_file(data)) {
605 		off_t file_offset;
606 		int fd = perf_data__fd(data);
607 		int err;
608 
609 		file_offset = lseek(fd, 0, SEEK_CUR);
610 		if (file_offset == -1)
611 			return -1;
612 		err = auxtrace_index__auxtrace_event(&rec->session->auxtrace_index,
613 						     event, file_offset);
614 		if (err)
615 			return err;
616 	}
617 
618 	/* event.auxtrace.size includes padding, see __auxtrace_mmap__read() */
619 	padding = (len1 + len2) & 7;
620 	if (padding)
621 		padding = 8 - padding;
622 
623 	record__write(rec, map, event, event->header.size);
624 	record__write(rec, map, data1, len1);
625 	if (len2)
626 		record__write(rec, map, data2, len2);
627 	record__write(rec, map, &pad, padding);
628 
629 	return 0;
630 }
631 
632 static int record__auxtrace_mmap_read(struct record *rec,
633 				      struct mmap *map)
634 {
635 	int ret;
636 
637 	ret = auxtrace_mmap__read(map, rec->itr, &rec->tool,
638 				  record__process_auxtrace);
639 	if (ret < 0)
640 		return ret;
641 
642 	if (ret)
643 		rec->samples++;
644 
645 	return 0;
646 }
647 
648 static int record__auxtrace_mmap_read_snapshot(struct record *rec,
649 					       struct mmap *map)
650 {
651 	int ret;
652 
653 	ret = auxtrace_mmap__read_snapshot(map, rec->itr, &rec->tool,
654 					   record__process_auxtrace,
655 					   rec->opts.auxtrace_snapshot_size);
656 	if (ret < 0)
657 		return ret;
658 
659 	if (ret)
660 		rec->samples++;
661 
662 	return 0;
663 }
664 
665 static int record__auxtrace_read_snapshot_all(struct record *rec)
666 {
667 	int i;
668 	int rc = 0;
669 
670 	for (i = 0; i < rec->evlist->core.nr_mmaps; i++) {
671 		struct mmap *map = &rec->evlist->mmap[i];
672 
673 		if (!map->auxtrace_mmap.base)
674 			continue;
675 
676 		if (record__auxtrace_mmap_read_snapshot(rec, map) != 0) {
677 			rc = -1;
678 			goto out;
679 		}
680 	}
681 out:
682 	return rc;
683 }
684 
685 static void record__read_auxtrace_snapshot(struct record *rec, bool on_exit)
686 {
687 	pr_debug("Recording AUX area tracing snapshot\n");
688 	if (record__auxtrace_read_snapshot_all(rec) < 0) {
689 		trigger_error(&auxtrace_snapshot_trigger);
690 	} else {
691 		if (auxtrace_record__snapshot_finish(rec->itr, on_exit))
692 			trigger_error(&auxtrace_snapshot_trigger);
693 		else
694 			trigger_ready(&auxtrace_snapshot_trigger);
695 	}
696 }
697 
698 static int record__auxtrace_snapshot_exit(struct record *rec)
699 {
700 	if (trigger_is_error(&auxtrace_snapshot_trigger))
701 		return 0;
702 
703 	if (!auxtrace_record__snapshot_started &&
704 	    auxtrace_record__snapshot_start(rec->itr))
705 		return -1;
706 
707 	record__read_auxtrace_snapshot(rec, true);
708 	if (trigger_is_error(&auxtrace_snapshot_trigger))
709 		return -1;
710 
711 	return 0;
712 }
713 
714 static int record__auxtrace_init(struct record *rec)
715 {
716 	int err;
717 
718 	if (!rec->itr) {
719 		rec->itr = auxtrace_record__init(rec->evlist, &err);
720 		if (err)
721 			return err;
722 	}
723 
724 	err = auxtrace_parse_snapshot_options(rec->itr, &rec->opts,
725 					      rec->opts.auxtrace_snapshot_opts);
726 	if (err)
727 		return err;
728 
729 	err = auxtrace_parse_sample_options(rec->itr, rec->evlist, &rec->opts,
730 					    rec->opts.auxtrace_sample_opts);
731 	if (err)
732 		return err;
733 
734 	auxtrace_regroup_aux_output(rec->evlist);
735 
736 	return auxtrace_parse_filters(rec->evlist);
737 }
738 
739 #else
740 
741 static inline
742 int record__auxtrace_mmap_read(struct record *rec __maybe_unused,
743 			       struct mmap *map __maybe_unused)
744 {
745 	return 0;
746 }
747 
748 static inline
749 void record__read_auxtrace_snapshot(struct record *rec __maybe_unused,
750 				    bool on_exit __maybe_unused)
751 {
752 }
753 
754 static inline
755 int auxtrace_record__snapshot_start(struct auxtrace_record *itr __maybe_unused)
756 {
757 	return 0;
758 }
759 
760 static inline
761 int record__auxtrace_snapshot_exit(struct record *rec __maybe_unused)
762 {
763 	return 0;
764 }
765 
766 static int record__auxtrace_init(struct record *rec __maybe_unused)
767 {
768 	return 0;
769 }
770 
771 #endif
772 
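/* Ensure a system-wide dummy event is set up to collect PERF_RECORD_TEXT_POKE and ksymbol events. */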
773 static int record__config_text_poke(struct evlist *evlist)
774 {
775 	struct evsel *evsel;
776 	int err;
777 
778 	/* Nothing to do if text poke is already configured */
779 	evlist__for_each_entry(evlist, evsel) {
780 		if (evsel->core.attr.text_poke)
781 			return 0;
782 	}
783 
784 	err = parse_events(evlist, "dummy:u", NULL);
785 	if (err)
786 		return err;
787 
788 	evsel = evlist__last(evlist);
789 
790 	evsel->core.attr.freq = 0;
791 	evsel->core.attr.sample_period = 1;
792 	evsel->core.attr.text_poke = 1;
793 	evsel->core.attr.ksymbol = 1;
794 
795 	evsel->core.system_wide = true;
796 	evsel->no_aux_samples = true;
797 	evsel->immediate = true;
798 
799 	/* Text poke must be collected on all CPUs */
800 	perf_cpu_map__put(evsel->core.own_cpus);
801 	evsel->core.own_cpus = perf_cpu_map__new(NULL);
802 	perf_cpu_map__put(evsel->core.cpus);
803 	evsel->core.cpus = perf_cpu_map__get(evsel->core.own_cpus);
804 
805 	evsel__set_sample_bit(evsel, TIME);
806 
807 	return 0;
808 }
809 
810 static bool record__kcore_readable(struct machine *machine)
811 {
812 	char kcore[PATH_MAX];
813 	int fd;
814 
815 	scnprintf(kcore, sizeof(kcore), "%s/proc/kcore", machine->root_dir);
816 
817 	fd = open(kcore, O_RDONLY);
818 	if (fd < 0)
819 		return false;
820 
821 	close(fd);
822 
823 	return true;
824 }
825 
826 static int record__kcore_copy(struct machine *machine, struct perf_data *data)
827 {
828 	char from_dir[PATH_MAX];
829 	char kcore_dir[PATH_MAX];
830 	int ret;
831 
832 	snprintf(from_dir, sizeof(from_dir), "%s/proc", machine->root_dir);
833 
834 	ret = perf_data__make_kcore_dir(data, kcore_dir, sizeof(kcore_dir));
835 	if (ret)
836 		return ret;
837 
838 	return kcore_copy(from_dir, kcore_dir);
839 }
840 
841 static int record__mmap_evlist(struct record *rec,
842 			       struct evlist *evlist)
843 {
844 	struct record_opts *opts = &rec->opts;
845 	bool auxtrace_overwrite = opts->auxtrace_snapshot_mode ||
846 				  opts->auxtrace_sample_mode;
847 	char msg[512];
848 
849 	if (opts->affinity != PERF_AFFINITY_SYS)
850 		cpu__setup_cpunode_map();
851 
852 	if (evlist__mmap_ex(evlist, opts->mmap_pages,
853 				 opts->auxtrace_mmap_pages,
854 				 auxtrace_overwrite,
855 				 opts->nr_cblocks, opts->affinity,
856 				 opts->mmap_flush, opts->comp_level) < 0) {
857 		if (errno == EPERM) {
858 			pr_err("Permission error mapping pages.\n"
859 			       "Consider increasing "
860 			       "/proc/sys/kernel/perf_event_mlock_kb,\n"
861 			       "or try again with a smaller value of -m/--mmap_pages.\n"
862 			       "(current value: %u,%u)\n",
863 			       opts->mmap_pages, opts->auxtrace_mmap_pages);
864 			return -errno;
865 		} else {
866 			pr_err("failed to mmap with %d (%s)\n", errno,
867 				str_error_r(errno, msg, sizeof(msg)));
868 			if (errno)
869 				return -errno;
870 			else
871 				return -EINVAL;
872 		}
873 	}
874 	return 0;
875 }
876 
877 static int record__mmap(struct record *rec)
878 {
879 	return record__mmap_evlist(rec, rec->evlist);
880 }
881 
882 static int record__open(struct record *rec)
883 {
884 	char msg[BUFSIZ];
885 	struct evsel *pos;
886 	struct evlist *evlist = rec->evlist;
887 	struct perf_session *session = rec->session;
888 	struct record_opts *opts = &rec->opts;
889 	int rc = 0;
890 
891 	/*
892 	 * For initial_delay or system wide, we need to add a dummy event so
893 	 * that we can track PERF_RECORD_MMAP events emitted while we wait for
894 	 * the initial delay or while event synthesis runs.
895 	 */
896 	if (opts->initial_delay || target__has_cpu(&opts->target)) {
897 		pos = evlist__get_tracking_event(evlist);
898 		if (!evsel__is_dummy_event(pos)) {
899 			/* Set up dummy event. */
900 			if (evlist__add_dummy(evlist))
901 				return -ENOMEM;
902 			pos = evlist__last(evlist);
903 			evlist__set_tracking_event(evlist, pos);
904 		}
905 
906 		/*
907 		 * Enable the dummy event when the process is forked for
908 		 * initial_delay, immediately for system wide.
909 		 */
910 		if (opts->initial_delay && !pos->immediate)
911 			pos->core.attr.enable_on_exec = 1;
912 		else
913 			pos->immediate = 1;
914 	}
915 
916 	evlist__config(evlist, opts, &callchain_param);
917 
918 	evlist__for_each_entry(evlist, pos) {
919 try_again:
920 		if (evsel__open(pos, pos->core.cpus, pos->core.threads) < 0) {
921 			if (evsel__fallback(pos, errno, msg, sizeof(msg))) {
922 				if (verbose > 0)
923 					ui__warning("%s\n", msg);
924 				goto try_again;
925 			}
926 			if ((errno == EINVAL || errno == EBADF) &&
927 			    pos->leader != pos &&
928 			    pos->weak_group) {
929 				pos = evlist__reset_weak_group(evlist, pos, true);
930 				goto try_again;
931 			}
932 			rc = -errno;
933 			evsel__open_strerror(pos, &opts->target, errno, msg, sizeof(msg));
934 			ui__error("%s\n", msg);
935 			goto out;
936 		}
937 
938 		pos->supported = true;
939 	}
940 
941 	if (symbol_conf.kptr_restrict && !evlist__exclude_kernel(evlist)) {
942 		pr_warning(
943 "WARNING: Kernel address maps (/proc/{kallsyms,modules}) are restricted,\n"
944 "check /proc/sys/kernel/kptr_restrict and /proc/sys/kernel/perf_event_paranoid.\n\n"
945 "Samples in kernel functions may not be resolved if a suitable vmlinux\n"
946 "file is not found in the buildid cache or in the vmlinux path.\n\n"
947 "Samples in kernel modules won't be resolved at all.\n\n"
948 "If some relocation was applied (e.g. kexec) symbols may be misresolved\n"
949 "even with a suitable vmlinux or kallsyms file.\n\n");
950 	}
951 
952 	if (evlist__apply_filters(evlist, &pos)) {
953 		pr_err("failed to set filter \"%s\" on event %s with %d (%s)\n",
954 			pos->filter, evsel__name(pos), errno,
955 			str_error_r(errno, msg, sizeof(msg)));
956 		rc = -1;
957 		goto out;
958 	}
959 
960 	rc = record__mmap(rec);
961 	if (rc)
962 		goto out;
963 
964 	session->evlist = evlist;
965 	perf_session__set_id_hdr_size(session);
966 out:
967 	return rc;
968 }
969 
970 static int process_sample_event(struct perf_tool *tool,
971 				union perf_event *event,
972 				struct perf_sample *sample,
973 				struct evsel *evsel,
974 				struct machine *machine)
975 {
976 	struct record *rec = container_of(tool, struct record, tool);
977 
978 	if (rec->evlist->first_sample_time == 0)
979 		rec->evlist->first_sample_time = sample->time;
980 
981 	rec->evlist->last_sample_time = sample->time;
982 
983 	if (rec->buildid_all)
984 		return 0;
985 
986 	rec->samples++;
987 	return build_id__mark_dso_hit(tool, event, sample, evsel, machine);
988 }
989 
990 static int process_buildids(struct record *rec)
991 {
992 	struct perf_session *session = rec->session;
993 
994 	if (perf_data__size(&rec->data) == 0)
995 		return 0;
996 
997 	/*
998 	 * During this process, it'll load the kernel map and replace the
999 	 * dso->long_name with the real pathname it found.  In this case
1000 	 * we prefer the vmlinux path like
1001 	 *   /lib/modules/3.16.4/build/vmlinux
1002 	 *
1003 	 * rather than the build-id path (in the debug directory):
1004 	 *   $HOME/.debug/.build-id/f0/6e17aa50adf4d00b88925e03775de107611551
1005 	 */
1006 	symbol_conf.ignore_vmlinux_buildid = true;
1007 
1008 	/*
1009 	 * If --buildid-all is given, it marks all DSOs regardless of hits,
1010 	 * so there is no need to process samples. But if timestamp_boundary is
1011 	 * enabled, it still needs to walk all samples to get the timestamps of
1012 	 * the first/last samples.
1013 	 */
1014 	if (rec->buildid_all && !rec->timestamp_boundary)
1015 		rec->tool.sample = NULL;
1016 
1017 	return perf_session__process_events(session);
1018 }
1019 
1020 static void perf_event__synthesize_guest_os(struct machine *machine, void *data)
1021 {
1022 	int err;
1023 	struct perf_tool *tool = data;
1024 	/*
1025 	 * As for the guest kernel when processing the record & report
1026 	 * subcommands, we arrange module mmaps prior to the guest kernel mmap
1027 	 * and trigger a preload of the dso, because default guest module
1028 	 * symbols are loaded from guest kallsyms instead of
1029 	 * /lib/modules/XXX/XXX. This method avoids missing symbols when the
1030 	 * first address is in a module instead of in the guest kernel.
1031 	 */
1032 	err = perf_event__synthesize_modules(tool, process_synthesized_event,
1033 					     machine);
1034 	if (err < 0)
1035 		pr_err("Couldn't record guest kernel [%d]'s reference"
1036 		       " relocation symbol.\n", machine->pid);
1037 
1038 	/*
1039 	 * We use _stext for the guest kernel because the guest kernel's
1040 	 * /proc/kallsyms sometimes has no _text.
1041 	 */
1042 	err = perf_event__synthesize_kernel_mmap(tool, process_synthesized_event,
1043 						 machine);
1044 	if (err < 0)
1045 		pr_err("Couldn't record guest kernel [%d]'s reference"
1046 		       " relocation symbol.\n", machine->pid);
1047 }
1048 
1049 static struct perf_event_header finished_round_event = {
1050 	.size = sizeof(struct perf_event_header),
1051 	.type = PERF_RECORD_FINISHED_ROUND,
1052 };
1053 
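/*
 * Bind the recording thread to the CPU mask of the mmap being read when a
 * non-default --affinity mode is used.
 */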
1054 static void record__adjust_affinity(struct record *rec, struct mmap *map)
1055 {
1056 	if (rec->opts.affinity != PERF_AFFINITY_SYS &&
1057 	    !bitmap_equal(rec->affinity_mask.bits, map->affinity_mask.bits,
1058 			  rec->affinity_mask.nbits)) {
1059 		bitmap_zero(rec->affinity_mask.bits, rec->affinity_mask.nbits);
1060 		bitmap_or(rec->affinity_mask.bits, rec->affinity_mask.bits,
1061 			  map->affinity_mask.bits, rec->affinity_mask.nbits);
1062 		sched_setaffinity(0, MMAP_CPU_MASK_BYTES(&rec->affinity_mask),
1063 				  (cpu_set_t *)rec->affinity_mask.bits);
1064 		if (verbose == 2)
1065 			mmap_cpu_mask__scnprintf(&rec->affinity_mask, "thread");
1066 	}
1067 }
1068 
1069 static size_t process_comp_header(void *record, size_t increment)
1070 {
1071 	struct perf_record_compressed *event = record;
1072 	size_t size = sizeof(*event);
1073 
1074 	if (increment) {
1075 		event->header.size += increment;
1076 		return increment;
1077 	}
1078 
1079 	event->header.type = PERF_RECORD_COMPRESSED;
1080 	event->header.size = size;
1081 
1082 	return size;
1083 }
1084 
1085 static size_t zstd_compress(struct perf_session *session, void *dst, size_t dst_size,
1086 			    void *src, size_t src_size)
1087 {
1088 	size_t compressed;
1089 	size_t max_record_size = PERF_SAMPLE_MAX_SIZE - sizeof(struct perf_record_compressed) - 1;
1090 
1091 	compressed = zstd_compress_stream_to_records(&session->zstd_data, dst, dst_size, src, src_size,
1092 						     max_record_size, process_comp_header);
1093 
1094 	session->bytes_transferred += src_size;
1095 	session->bytes_compressed  += compressed;
1096 
1097 	return compressed;
1098 }
1099 
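/*
 * Drain the evlist's per-CPU ring buffers (and AUX area buffers) into the
 * output, then emit PERF_RECORD_FINISHED_ROUND if anything was written.
 */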
1100 static int record__mmap_read_evlist(struct record *rec, struct evlist *evlist,
1101 				    bool overwrite, bool synch)
1102 {
1103 	u64 bytes_written = rec->bytes_written;
1104 	int i;
1105 	int rc = 0;
1106 	struct mmap *maps;
1107 	int trace_fd = rec->data.file.fd;
1108 	off_t off = 0;
1109 
1110 	if (!evlist)
1111 		return 0;
1112 
1113 	maps = overwrite ? evlist->overwrite_mmap : evlist->mmap;
1114 	if (!maps)
1115 		return 0;
1116 
1117 	if (overwrite && evlist->bkw_mmap_state != BKW_MMAP_DATA_PENDING)
1118 		return 0;
1119 
1120 	if (record__aio_enabled(rec))
1121 		off = record__aio_get_pos(trace_fd);
1122 
1123 	for (i = 0; i < evlist->core.nr_mmaps; i++) {
1124 		u64 flush = 0;
1125 		struct mmap *map = &maps[i];
1126 
1127 		if (map->core.base) {
1128 			record__adjust_affinity(rec, map);
1129 			if (synch) {
1130 				flush = map->core.flush;
1131 				map->core.flush = 1;
1132 			}
1133 			if (!record__aio_enabled(rec)) {
1134 				if (perf_mmap__push(map, rec, record__pushfn) < 0) {
1135 					if (synch)
1136 						map->core.flush = flush;
1137 					rc = -1;
1138 					goto out;
1139 				}
1140 			} else {
1141 				if (record__aio_push(rec, map, &off) < 0) {
1142 					record__aio_set_pos(trace_fd, off);
1143 					if (synch)
1144 						map->core.flush = flush;
1145 					rc = -1;
1146 					goto out;
1147 				}
1148 			}
1149 			if (synch)
1150 				map->core.flush = flush;
1151 		}
1152 
1153 		if (map->auxtrace_mmap.base && !rec->opts.auxtrace_snapshot_mode &&
1154 		    !rec->opts.auxtrace_sample_mode &&
1155 		    record__auxtrace_mmap_read(rec, map) != 0) {
1156 			rc = -1;
1157 			goto out;
1158 		}
1159 	}
1160 
1161 	if (record__aio_enabled(rec))
1162 		record__aio_set_pos(trace_fd, off);
1163 
1164 	/*
1165 	 * Mark the round finished in case we wrote
1166 	 * at least one event.
1167 	 */
1168 	if (bytes_written != rec->bytes_written)
1169 		rc = record__write(rec, NULL, &finished_round_event, sizeof(finished_round_event));
1170 
1171 	if (overwrite)
1172 		evlist__toggle_bkw_mmap(evlist, BKW_MMAP_EMPTY);
1173 out:
1174 	return rc;
1175 }
1176 
1177 static int record__mmap_read_all(struct record *rec, bool synch)
1178 {
1179 	int err;
1180 
1181 	err = record__mmap_read_evlist(rec, rec->evlist, false, synch);
1182 	if (err)
1183 		return err;
1184 
1185 	return record__mmap_read_evlist(rec, rec->evlist, true, synch);
1186 }
1187 
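/* Start with all header features set, then clear the ones this session does not use. */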
1188 static void record__init_features(struct record *rec)
1189 {
1190 	struct perf_session *session = rec->session;
1191 	int feat;
1192 
1193 	for (feat = HEADER_FIRST_FEATURE; feat < HEADER_LAST_FEATURE; feat++)
1194 		perf_header__set_feat(&session->header, feat);
1195 
1196 	if (rec->no_buildid)
1197 		perf_header__clear_feat(&session->header, HEADER_BUILD_ID);
1198 
1199 	if (!have_tracepoints(&rec->evlist->core.entries))
1200 		perf_header__clear_feat(&session->header, HEADER_TRACING_DATA);
1201 
1202 	if (!rec->opts.branch_stack)
1203 		perf_header__clear_feat(&session->header, HEADER_BRANCH_STACK);
1204 
1205 	if (!rec->opts.full_auxtrace)
1206 		perf_header__clear_feat(&session->header, HEADER_AUXTRACE);
1207 
1208 	if (!(rec->opts.use_clockid && rec->opts.clockid_res_ns))
1209 		perf_header__clear_feat(&session->header, HEADER_CLOCKID);
1210 
1211 	if (!rec->opts.use_clockid)
1212 		perf_header__clear_feat(&session->header, HEADER_CLOCK_DATA);
1213 
1214 	perf_header__clear_feat(&session->header, HEADER_DIR_FORMAT);
1215 	if (!record__comp_enabled(rec))
1216 		perf_header__clear_feat(&session->header, HEADER_COMPRESSED);
1217 
1218 	perf_header__clear_feat(&session->header, HEADER_STAT);
1219 }
1220 
1221 static void
1222 record__finish_output(struct record *rec)
1223 {
1224 	struct perf_data *data = &rec->data;
1225 	int fd = perf_data__fd(data);
1226 
1227 	if (data->is_pipe)
1228 		return;
1229 
1230 	rec->session->header.data_size += rec->bytes_written;
1231 	data->file.size = lseek(perf_data__fd(data), 0, SEEK_CUR);
1232 
1233 	if (!rec->no_buildid) {
1234 		process_buildids(rec);
1235 
1236 		if (rec->buildid_all)
1237 			dsos__hit_all(rec->session);
1238 	}
1239 	perf_session__write_header(rec->session, rec->evlist, fd, true);
1240 
1241 	return;
1242 }
1243 
1244 static int record__synthesize_workload(struct record *rec, bool tail)
1245 {
1246 	int err;
1247 	struct perf_thread_map *thread_map;
1248 
1249 	if (rec->opts.tail_synthesize != tail)
1250 		return 0;
1251 
1252 	thread_map = thread_map__new_by_tid(rec->evlist->workload.pid);
1253 	if (thread_map == NULL)
1254 		return -1;
1255 
1256 	err = perf_event__synthesize_thread_map(&rec->tool, thread_map,
1257 						 process_synthesized_event,
1258 						 &rec->session->machines.host,
1259 						 rec->opts.sample_address);
1260 	perf_thread_map__put(thread_map);
1261 	return err;
1262 }
1263 
1264 static int record__synthesize(struct record *rec, bool tail);
1265 
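/*
 * Finish the current output file and switch to a new timestamped one
 * (used by --switch-output); returns the new file descriptor.
 */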
1266 static int
1267 record__switch_output(struct record *rec, bool at_exit)
1268 {
1269 	struct perf_data *data = &rec->data;
1270 	int fd, err;
1271 	char *new_filename;
1272 
1273 	/* Same size as a timestamp string, e.g. "2015122520103046" */
1274 	char timestamp[] = "InvalidTimestamp";
1275 
1276 	record__aio_mmap_read_sync(rec);
1277 
1278 	record__synthesize(rec, true);
1279 	if (target__none(&rec->opts.target))
1280 		record__synthesize_workload(rec, true);
1281 
1282 	rec->samples = 0;
1283 	record__finish_output(rec);
1284 	err = fetch_current_timestamp(timestamp, sizeof(timestamp));
1285 	if (err) {
1286 		pr_err("Failed to get current timestamp\n");
1287 		return -EINVAL;
1288 	}
1289 
1290 	fd = perf_data__switch(data, timestamp,
1291 				    rec->session->header.data_offset,
1292 				    at_exit, &new_filename);
1293 	if (fd >= 0 && !at_exit) {
1294 		rec->bytes_written = 0;
1295 		rec->session->header.data_size = 0;
1296 	}
1297 
1298 	if (!quiet)
1299 		fprintf(stderr, "[ perf record: Dump %s.%s ]\n",
1300 			data->path, timestamp);
1301 
1302 	if (rec->switch_output.num_files) {
1303 		int n = rec->switch_output.cur_file + 1;
1304 
1305 		if (n >= rec->switch_output.num_files)
1306 			n = 0;
1307 		rec->switch_output.cur_file = n;
1308 		if (rec->switch_output.filenames[n]) {
1309 			remove(rec->switch_output.filenames[n]);
1310 			zfree(&rec->switch_output.filenames[n]);
1311 		}
1312 		rec->switch_output.filenames[n] = new_filename;
1313 	} else {
1314 		free(new_filename);
1315 	}
1316 
1317 	/* Output tracking events */
1318 	if (!at_exit) {
1319 		record__synthesize(rec, false);
1320 
1321 		/*
1322 		 * In 'perf record --switch-output' without -a,
1323 		 * record__synthesize() in record__switch_output() won't
1324 		 * generate tracking events because there's no thread_map
1325 		 * in the evlist, so the newly created perf.data wouldn't
1326 		 * contain map and comm information.
1327 		 * Create a fake thread_map and directly call
1328 		 * perf_event__synthesize_thread_map() for those events.
1329 		 */
1330 		if (target__none(&rec->opts.target))
1331 			record__synthesize_workload(rec, false);
1332 	}
1333 	return fd;
1334 }
1335 
1336 static volatile int workload_exec_errno;
1337 
1338 /*
1339  * evlist__prepare_workload will send a SIGUSR1
1340  * if the fork fails, since we asked for it by setting its
1341  * want_signal to true.
1342  */
1343 static void workload_exec_failed_signal(int signo __maybe_unused,
1344 					siginfo_t *info,
1345 					void *ucontext __maybe_unused)
1346 {
1347 	workload_exec_errno = info->si_value.sival_int;
1348 	done = 1;
1349 	child_finished = 1;
1350 }
1351 
1352 static void snapshot_sig_handler(int sig);
1353 static void alarm_sig_handler(int sig);
1354 
1355 static const struct perf_event_mmap_page *evlist__pick_pc(struct evlist *evlist)
1356 {
1357 	if (evlist) {
1358 		if (evlist->mmap && evlist->mmap[0].core.base)
1359 			return evlist->mmap[0].core.base;
1360 		if (evlist->overwrite_mmap && evlist->overwrite_mmap[0].core.base)
1361 			return evlist->overwrite_mmap[0].core.base;
1362 	}
1363 	return NULL;
1364 }
1365 
1366 static const struct perf_event_mmap_page *record__pick_pc(struct record *rec)
1367 {
1368 	const struct perf_event_mmap_page *pc = evlist__pick_pc(rec->evlist);
1369 	if (pc)
1370 		return pc;
1371 	return NULL;
1372 }
1373 
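/*
 * Synthesize the side-band events (attrs, kernel/module mmaps, threads,
 * CPU maps, ...) that 'perf report' needs to interpret the samples.
 */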
1374 static int record__synthesize(struct record *rec, bool tail)
1375 {
1376 	struct perf_session *session = rec->session;
1377 	struct machine *machine = &session->machines.host;
1378 	struct perf_data *data = &rec->data;
1379 	struct record_opts *opts = &rec->opts;
1380 	struct perf_tool *tool = &rec->tool;
1381 	int fd = perf_data__fd(data);
1382 	int err = 0;
1383 	event_op f = process_synthesized_event;
1384 
1385 	if (rec->opts.tail_synthesize != tail)
1386 		return 0;
1387 
1388 	if (data->is_pipe) {
1389 		/*
1390 		 * We need to synthesize events first, because some
1391 		 * features work on top of them (on the report side).
1392 		 */
1393 		err = perf_event__synthesize_attrs(tool, rec->evlist,
1394 						   process_synthesized_event);
1395 		if (err < 0) {
1396 			pr_err("Couldn't synthesize attrs.\n");
1397 			goto out;
1398 		}
1399 
1400 		err = perf_event__synthesize_features(tool, session, rec->evlist,
1401 						      process_synthesized_event);
1402 		if (err < 0) {
1403 			pr_err("Couldn't synthesize features.\n");
1404 			return err;
1405 		}
1406 
1407 		if (have_tracepoints(&rec->evlist->core.entries)) {
1408 			/*
1409 			 * FIXME err <= 0 here actually means that
1410 			 * there were no tracepoints, so it's not really
1411 			 * an error, just that we don't need to
1412 			 * synthesize anything.  We really have to
1413 			 * return this more properly and also
1414 			 * propagate errors that now are calling die()
1415 			 */
1416 			err = perf_event__synthesize_tracing_data(tool,	fd, rec->evlist,
1417 								  process_synthesized_event);
1418 			if (err <= 0) {
1419 				pr_err("Couldn't record tracing data.\n");
1420 				goto out;
1421 			}
1422 			rec->bytes_written += err;
1423 		}
1424 	}
1425 
1426 	err = perf_event__synth_time_conv(record__pick_pc(rec), tool,
1427 					  process_synthesized_event, machine);
1428 	if (err)
1429 		goto out;
1430 
1431 	/* Synthesize id_index before auxtrace_info */
1432 	if (rec->opts.auxtrace_sample_mode) {
1433 		err = perf_event__synthesize_id_index(tool,
1434 						      process_synthesized_event,
1435 						      session->evlist, machine);
1436 		if (err)
1437 			goto out;
1438 	}
1439 
1440 	if (rec->opts.full_auxtrace) {
1441 		err = perf_event__synthesize_auxtrace_info(rec->itr, tool,
1442 					session, process_synthesized_event);
1443 		if (err)
1444 			goto out;
1445 	}
1446 
1447 	if (!evlist__exclude_kernel(rec->evlist)) {
1448 		err = perf_event__synthesize_kernel_mmap(tool, process_synthesized_event,
1449 							 machine);
1450 		WARN_ONCE(err < 0, "Couldn't record kernel reference relocation symbol\n"
1451 				   "Symbol resolution may be skewed if relocation was used (e.g. kexec).\n"
1452 				   "Check /proc/kallsyms permission or run as root.\n");
1453 
1454 		err = perf_event__synthesize_modules(tool, process_synthesized_event,
1455 						     machine);
1456 		WARN_ONCE(err < 0, "Couldn't record kernel module information.\n"
1457 				   "Symbol resolution may be skewed if relocation was used (e.g. kexec).\n"
1458 				   "Check /proc/modules permission or run as root.\n");
1459 	}
1460 
1461 	if (perf_guest) {
1462 		machines__process_guests(&session->machines,
1463 					 perf_event__synthesize_guest_os, tool);
1464 	}
1465 
1466 	err = perf_event__synthesize_extra_attr(&rec->tool,
1467 						rec->evlist,
1468 						process_synthesized_event,
1469 						data->is_pipe);
1470 	if (err)
1471 		goto out;
1472 
1473 	err = perf_event__synthesize_thread_map2(&rec->tool, rec->evlist->core.threads,
1474 						 process_synthesized_event,
1475 						NULL);
1476 	if (err < 0) {
1477 		pr_err("Couldn't synthesize thread map.\n");
1478 		return err;
1479 	}
1480 
1481 	err = perf_event__synthesize_cpu_map(&rec->tool, rec->evlist->core.cpus,
1482 					     process_synthesized_event, NULL);
1483 	if (err < 0) {
1484 		pr_err("Couldn't synthesize cpu map.\n");
1485 		return err;
1486 	}
1487 
1488 	err = perf_event__synthesize_bpf_events(session, process_synthesized_event,
1489 						machine, opts);
1490 	if (err < 0)
1491 		pr_warning("Couldn't synthesize bpf events.\n");
1492 
1493 	err = perf_event__synthesize_cgroups(tool, process_synthesized_event,
1494 					     machine);
1495 	if (err < 0)
1496 		pr_warning("Couldn't synthesize cgroup events.\n");
1497 
1498 	if (rec->opts.nr_threads_synthesize > 1) {
1499 		perf_set_multithreaded();
1500 		f = process_locked_synthesized_event;
1501 	}
1502 
1503 	err = __machine__synthesize_threads(machine, tool, &opts->target, rec->evlist->core.threads,
1504 					    f, opts->sample_address,
1505 					    rec->opts.nr_threads_synthesize);
1506 
1507 	if (rec->opts.nr_threads_synthesize > 1)
1508 		perf_set_singlethreaded();
1509 
1510 out:
1511 	return err;
1512 }
1513 
1514 static int record__process_signal_event(union perf_event *event __maybe_unused, void *data)
1515 {
1516 	struct record *rec = data;
1517 	pthread_kill(rec->thread_id, SIGUSR2);
1518 	return 0;
1519 }
1520 
1521 static int record__setup_sb_evlist(struct record *rec)
1522 {
1523 	struct record_opts *opts = &rec->opts;
1524 
1525 	if (rec->sb_evlist != NULL) {
1526 		/*
1527 		 * We get here if --switch-output-event populated the
1528 		 * sb_evlist, so associate a callback that will send a SIGUSR2
1529 		 * to the main thread.
1530 		 */
1531 		evlist__set_cb(rec->sb_evlist, record__process_signal_event, rec);
1532 		rec->thread_id = pthread_self();
1533 	}
1534 #ifdef HAVE_LIBBPF_SUPPORT
1535 	if (!opts->no_bpf_event) {
1536 		if (rec->sb_evlist == NULL) {
1537 			rec->sb_evlist = evlist__new();
1538 
1539 			if (rec->sb_evlist == NULL) {
1540 				pr_err("Couldn't create side band evlist.\n");
1541 				return -1;
1542 			}
1543 		}
1544 
1545 		if (evlist__add_bpf_sb_event(rec->sb_evlist, &rec->session->header.env)) {
1546 			pr_err("Couldn't ask for PERF_RECORD_BPF_EVENT side band events.\n");
1547 			return -1;
1548 		}
1549 	}
1550 #endif
1551 	if (evlist__start_sb_thread(rec->sb_evlist, &rec->opts.target)) {
1552 		pr_debug("Couldn't start the BPF side band thread:\nBPF programs starting from now on won't be annotatable\n");
1553 		opts->no_bpf_event = true;
1554 	}
1555 
1556 	return 0;
1557 }
1558 
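/*
 * Store the session clockid plus reference gettimeofday()/clock_gettime()
 * timestamps in the header, so sample times can later be related to
 * wall-clock time.
 */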
1559 static int record__init_clock(struct record *rec)
1560 {
1561 	struct perf_session *session = rec->session;
1562 	struct timespec ref_clockid;
1563 	struct timeval ref_tod;
1564 	u64 ref;
1565 
1566 	if (!rec->opts.use_clockid)
1567 		return 0;
1568 
1569 	if (rec->opts.use_clockid && rec->opts.clockid_res_ns)
1570 		session->header.env.clock.clockid_res_ns = rec->opts.clockid_res_ns;
1571 
1572 	session->header.env.clock.clockid = rec->opts.clockid;
1573 
1574 	if (gettimeofday(&ref_tod, NULL) != 0) {
1575 		pr_err("gettimeofday failed, cannot set reference time.\n");
1576 		return -1;
1577 	}
1578 
1579 	if (clock_gettime(rec->opts.clockid, &ref_clockid)) {
1580 		pr_err("clock_gettime failed, cannot set reference time.\n");
1581 		return -1;
1582 	}
1583 
1584 	ref = (u64) ref_tod.tv_sec * NSEC_PER_SEC +
1585 	      (u64) ref_tod.tv_usec * NSEC_PER_USEC;
1586 
1587 	session->header.env.clock.tod_ns = ref;
1588 
1589 	ref = (u64) ref_clockid.tv_sec * NSEC_PER_SEC +
1590 	      (u64) ref_clockid.tv_nsec;
1591 
1592 	session->header.env.clock.clockid_ns = ref;
1593 	return 0;
1594 }
1595 
1596 static void hit_auxtrace_snapshot_trigger(struct record *rec)
1597 {
1598 	if (trigger_is_ready(&auxtrace_snapshot_trigger)) {
1599 		trigger_hit(&auxtrace_snapshot_trigger);
1600 		auxtrace_record__snapshot_started = 1;
1601 		if (auxtrace_record__snapshot_start(rec->itr))
1602 			trigger_error(&auxtrace_snapshot_trigger);
1603 	}
1604 }
1605 
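/*
 * Main body of 'perf record': set up the session and events, start the
 * workload if any, then loop reading the ring buffers until done.
 */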
1606 static int __cmd_record(struct record *rec, int argc, const char **argv)
1607 {
1608 	int err;
1609 	int status = 0;
1610 	unsigned long waking = 0;
1611 	const bool forks = argc > 0;
1612 	struct perf_tool *tool = &rec->tool;
1613 	struct record_opts *opts = &rec->opts;
1614 	struct perf_data *data = &rec->data;
1615 	struct perf_session *session;
1616 	bool disabled = false, draining = false;
1617 	int fd;
1618 	float ratio = 0;
1619 	enum evlist_ctl_cmd cmd = EVLIST_CTL_CMD_UNSUPPORTED;
1620 
1621 	atexit(record__sig_exit);
1622 	signal(SIGCHLD, sig_handler);
1623 	signal(SIGINT, sig_handler);
1624 	signal(SIGTERM, sig_handler);
1625 	signal(SIGSEGV, sigsegv_handler);
1626 
1627 	if (rec->opts.record_namespaces)
1628 		tool->namespace_events = true;
1629 
1630 	if (rec->opts.record_cgroup) {
1631 #ifdef HAVE_FILE_HANDLE
1632 		tool->cgroup_events = true;
1633 #else
1634 		pr_err("cgroup tracking is not supported\n");
1635 		return -1;
1636 #endif
1637 	}
1638 
1639 	if (rec->opts.auxtrace_snapshot_mode || rec->switch_output.enabled) {
1640 		signal(SIGUSR2, snapshot_sig_handler);
1641 		if (rec->opts.auxtrace_snapshot_mode)
1642 			trigger_on(&auxtrace_snapshot_trigger);
1643 		if (rec->switch_output.enabled)
1644 			trigger_on(&switch_output_trigger);
1645 	} else {
1646 		signal(SIGUSR2, SIG_IGN);
1647 	}
1648 
1649 	session = perf_session__new(data, false, tool);
1650 	if (IS_ERR(session)) {
1651 		pr_err("Perf session creation failed.\n");
1652 		return PTR_ERR(session);
1653 	}
1654 
1655 	fd = perf_data__fd(data);
1656 	rec->session = session;
1657 
1658 	if (zstd_init(&session->zstd_data, rec->opts.comp_level) < 0) {
1659 		pr_err("Compression initialization failed.\n");
1660 		return -1;
1661 	}
1662 #ifdef HAVE_EVENTFD_SUPPORT
1663 	done_fd = eventfd(0, EFD_NONBLOCK);
1664 	if (done_fd < 0) {
1665 		pr_err("Failed to create wakeup eventfd, error: %m\n");
1666 		status = -1;
1667 		goto out_delete_session;
1668 	}
1669 	err = evlist__add_wakeup_eventfd(rec->evlist, done_fd);
1670 	if (err < 0) {
1671 		pr_err("Failed to add wakeup eventfd to poll list\n");
1672 		status = err;
1673 		goto out_delete_session;
1674 	}
1675 #endif // HAVE_EVENTFD_SUPPORT
1676 
1677 	session->header.env.comp_type  = PERF_COMP_ZSTD;
1678 	session->header.env.comp_level = rec->opts.comp_level;
1679 
1680 	if (rec->opts.kcore &&
1681 	    !record__kcore_readable(&session->machines.host)) {
1682 		pr_err("ERROR: kcore is not readable.\n");
1683 		return -1;
1684 	}
1685 
1686 	if (record__init_clock(rec))
1687 		return -1;
1688 
1689 	record__init_features(rec);
1690 
1691 	if (forks) {
1692 		err = evlist__prepare_workload(rec->evlist, &opts->target, argv, data->is_pipe,
1693 					       workload_exec_failed_signal);
1694 		if (err < 0) {
1695 			pr_err("Couldn't run the workload!\n");
1696 			status = err;
1697 			goto out_delete_session;
1698 		}
1699 	}
1700 
1701 	/*
1702 	 * If we have just a single event and are sending data
1703 	 * through a pipe, we need to force the id allocation,
1704 	 * because we synthesize the event name through the pipe
1705 	 * and need the id for that.
1706 	 */
1707 	if (data->is_pipe && rec->evlist->core.nr_entries == 1)
1708 		rec->opts.sample_id = true;
1709 
1710 	if (record__open(rec) != 0) {
1711 		err = -1;
1712 		goto out_child;
1713 	}
1714 	session->header.env.comp_mmap_len = session->evlist->core.mmap_len;
1715 
1716 	if (rec->opts.kcore) {
1717 		err = record__kcore_copy(&session->machines.host, data);
1718 		if (err) {
1719 			pr_err("ERROR: Failed to copy kcore\n");
1720 			goto out_child;
1721 		}
1722 	}
1723 
1724 	err = bpf__apply_obj_config();
1725 	if (err) {
1726 		char errbuf[BUFSIZ];
1727 
1728 		bpf__strerror_apply_obj_config(err, errbuf, sizeof(errbuf));
1729 		pr_err("ERROR: Apply config to BPF failed: %s\n",
1730 			 errbuf);
1731 		goto out_child;
1732 	}
1733 
1734 	/*
1735 	 * Normally perf_session__new would do this, but it doesn't have the
1736 	 * evlist.
1737 	 */
1738 	if (rec->tool.ordered_events && !evlist__sample_id_all(rec->evlist)) {
1739 		pr_warning("WARNING: No sample_id_all support, falling back to unordered processing\n");
1740 		rec->tool.ordered_events = false;
1741 	}
1742 
1743 	if (!rec->evlist->nr_groups)
1744 		perf_header__clear_feat(&session->header, HEADER_GROUP_DESC);
1745 
1746 	if (data->is_pipe) {
1747 		err = perf_header__write_pipe(fd);
1748 		if (err < 0)
1749 			goto out_child;
1750 	} else {
1751 		err = perf_session__write_header(session, rec->evlist, fd, false);
1752 		if (err < 0)
1753 			goto out_child;
1754 	}
1755 
1756 	err = -1;
1757 	if (!rec->no_buildid
1758 	    && !perf_header__has_feat(&session->header, HEADER_BUILD_ID)) {
1759 		pr_err("Couldn't generate buildids. "
1760 		       "Use --no-buildid to profile anyway.\n");
1761 		goto out_child;
1762 	}
1763 
1764 	err = record__setup_sb_evlist(rec);
1765 	if (err)
1766 		goto out_child;
1767 
1768 	err = record__synthesize(rec, false);
1769 	if (err < 0)
1770 		goto out_child;
1771 
1772 	if (rec->realtime_prio) {
1773 		struct sched_param param;
1774 
1775 		param.sched_priority = rec->realtime_prio;
1776 		if (sched_setscheduler(0, SCHED_FIFO, &param)) {
1777 			pr_err("Could not set realtime priority.\n");
1778 			err = -1;
1779 			goto out_child;
1780 		}
1781 	}
1782 
1783 	/*
1784 	 * When perf is starting the traced process, all the events
1785 	 * (apart from group members) have enable_on_exec=1 set,
1786 	 * so don't spoil it by prematurely enabling them.
1787 	 */
1788 	if (!target__none(&opts->target) && !opts->initial_delay)
1789 		evlist__enable(rec->evlist);
1790 
1791 	/*
1792 	 * Let the child rip
1793 	 */
1794 	if (forks) {
1795 		struct machine *machine = &session->machines.host;
1796 		union perf_event *event;
1797 		pid_t tgid;
1798 
1799 		event = malloc(sizeof(event->comm) + machine->id_hdr_size);
1800 		if (event == NULL) {
1801 			err = -ENOMEM;
1802 			goto out_child;
1803 		}
1804 
1805 		/*
1806 		 * Some H/W events are generated before the COMM event,
1807 		 * which is emitted during exec(), so perf script
1808 		 * cannot see a correct process name for those events.
1809 		 * Synthesize a COMM event to prevent that.
1810 		 */
1811 		tgid = perf_event__synthesize_comm(tool, event,
1812 						   rec->evlist->workload.pid,
1813 						   process_synthesized_event,
1814 						   machine);
1815 		free(event);
1816 
1817 		if (tgid == -1)
1818 			goto out_child;
1819 
1820 		event = malloc(sizeof(event->namespaces) +
1821 			       (NR_NAMESPACES * sizeof(struct perf_ns_link_info)) +
1822 			       machine->id_hdr_size);
1823 		if (event == NULL) {
1824 			err = -ENOMEM;
1825 			goto out_child;
1826 		}
1827 
1828 		/*
1829 		 * Synthesize NAMESPACES event for the command specified.
1830 		 */
1831 		perf_event__synthesize_namespaces(tool, event,
1832 						  rec->evlist->workload.pid,
1833 						  tgid, process_synthesized_event,
1834 						  machine);
1835 		free(event);
1836 
1837 		evlist__start_workload(rec->evlist);
1838 	}
1839 
1840 	if (evlist__initialize_ctlfd(rec->evlist, opts->ctl_fd, opts->ctl_fd_ack))
1841 		goto out_child;
1842 
1843 	if (opts->initial_delay) {
1844 		pr_info(EVLIST_DISABLED_MSG);
1845 		if (opts->initial_delay > 0) {
1846 			usleep(opts->initial_delay * USEC_PER_MSEC);
1847 			evlist__enable(rec->evlist);
1848 			pr_info(EVLIST_ENABLED_MSG);
1849 		}
1850 	}
1851 
1852 	trigger_ready(&auxtrace_snapshot_trigger);
1853 	trigger_ready(&switch_output_trigger);
1854 	perf_hooks__invoke_record_start();
1855 	for (;;) {
1856 		unsigned long long hits = rec->samples;
1857 
1858 		/*
1859 		 * rec->evlist->bkw_mmap_state may be BKW_MMAP_EMPTY here:
1860 		 * when done == true and hits != rec->samples in the
1861 		 * previous round.
1862 		 *
1863 		 * evlist__toggle_bkw_mmap ensures we never
1864 		 * convert BKW_MMAP_EMPTY to BKW_MMAP_DATA_PENDING.
1865 		 */
1866 		if (trigger_is_hit(&switch_output_trigger) || done || draining)
1867 			evlist__toggle_bkw_mmap(rec->evlist, BKW_MMAP_DATA_PENDING);
1868 
1869 		if (record__mmap_read_all(rec, false) < 0) {
1870 			trigger_error(&auxtrace_snapshot_trigger);
1871 			trigger_error(&switch_output_trigger);
1872 			err = -1;
1873 			goto out_child;
1874 		}
1875 
1876 		if (auxtrace_record__snapshot_started) {
1877 			auxtrace_record__snapshot_started = 0;
1878 			if (!trigger_is_error(&auxtrace_snapshot_trigger))
1879 				record__read_auxtrace_snapshot(rec, false);
1880 			if (trigger_is_error(&auxtrace_snapshot_trigger)) {
1881 				pr_err("AUX area tracing snapshot failed\n");
1882 				err = -1;
1883 				goto out_child;
1884 			}
1885 		}
1886 
1887 		if (trigger_is_hit(&switch_output_trigger)) {
1888 			/*
1889 			 * If switch_output_trigger is hit, the data in the
1890 			 * overwritable ring buffer should have been collected,
1891 			 * so bkw_mmap_state should be set to BKW_MMAP_EMPTY.
1892 			 *
1893 			 * If SIGUSR2 was raised after or during record__mmap_read_all(),
1894 			 * record__mmap_read_all() didn't collect data from the
1895 			 * overwritable ring buffer. Read again.
1896 			 */
1897 			if (rec->evlist->bkw_mmap_state == BKW_MMAP_RUNNING)
1898 				continue;
1899 			trigger_ready(&switch_output_trigger);
1900 
1901 			/*
1902 			 * Reenable events in overwrite ring buffer after
1903 			 * record__mmap_read_all(): we should have collected
1904 			 * data from it.
1905 			 */
1906 			evlist__toggle_bkw_mmap(rec->evlist, BKW_MMAP_RUNNING);
1907 
1908 			if (!quiet)
1909 				fprintf(stderr, "[ perf record: dump data: Woken up %ld times ]\n",
1910 					waking);
1911 			waking = 0;
1912 			fd = record__switch_output(rec, false);
1913 			if (fd < 0) {
1914 				pr_err("Failed to switch to new file\n");
1915 				trigger_error(&switch_output_trigger);
1916 				err = fd;
1917 				goto out_child;
1918 			}
1919 
1920 			/* re-arm the alarm */
1921 			if (rec->switch_output.time)
1922 				alarm(rec->switch_output.time);
1923 		}
1924 
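		/*
		 * Descriptive note: if no new samples arrived in this pass and
		 * we are done (or draining), leave the loop; otherwise block in
		 * evlist__poll() until the kernel wakes us up, counting wakeups
		 * in 'waking' for the summary printed later.  Once every pollable
		 * fd has been dropped for POLLERR/POLLHUP nothing can wake us
		 * again, so switch to draining and flush on the next iteration.
		 */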
1925 		if (hits == rec->samples) {
1926 			if (done || draining)
1927 				break;
1928 			err = evlist__poll(rec->evlist, -1);
1929 			/*
1930 			 * Propagate the error only if there is one. Ignore a
1931 			 * positive number of returned events and interrupt errors.
1932 			 */
1933 			if (err > 0 || (err < 0 && errno == EINTR))
1934 				err = 0;
1935 			waking++;
1936 
1937 			if (evlist__filter_pollfd(rec->evlist, POLLERR | POLLHUP) == 0)
1938 				draining = true;
1939 		}
1940 
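		/*
		 * Hedged note: commands read from the --control descriptor.
		 * 'snapshot' arms an AUX area snapshot and is acked here;
		 * 'stop' ends the session.  The remaining commands (enable,
		 * disable, ping, ...) are presumably acted upon inside
		 * evlist__ctlfd_process() itself, hence the empty cases.
		 */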
1941 		if (evlist__ctlfd_process(rec->evlist, &cmd) > 0) {
1942 			switch (cmd) {
1943 			case EVLIST_CTL_CMD_SNAPSHOT:
1944 				hit_auxtrace_snapshot_trigger(rec);
1945 				evlist__ctlfd_ack(rec->evlist);
1946 				break;
1947 			case EVLIST_CTL_CMD_STOP:
1948 				done = 1;
1949 				break;
1950 			case EVLIST_CTL_CMD_ACK:
1951 			case EVLIST_CTL_CMD_UNSUPPORTED:
1952 			case EVLIST_CTL_CMD_ENABLE:
1953 			case EVLIST_CTL_CMD_DISABLE:
1954 			case EVLIST_CTL_CMD_EVLIST:
1955 			case EVLIST_CTL_CMD_PING:
1956 			default:
1957 				break;
1958 			}
1959 		}
1960 
1961 		/*
1962 		 * When perf is starting the traced process, the events die
1963 		 * with the process at the end and we wait for that. Thus there
1964 		 * is no need to disable the events in this case.
1965 		 */
1966 		if (done && !disabled && !target__none(&opts->target)) {
1967 			trigger_off(&auxtrace_snapshot_trigger);
1968 			evlist__disable(rec->evlist);
1969 			disabled = true;
1970 		}
1971 	}
1972 
1973 	trigger_off(&auxtrace_snapshot_trigger);
1974 	trigger_off(&switch_output_trigger);
1975 
1976 	if (opts->auxtrace_snapshot_on_exit)
1977 		record__auxtrace_snapshot_exit(rec);
1978 
1979 	if (forks && workload_exec_errno) {
1980 		char msg[STRERR_BUFSIZE], strevsels[2048];
1981 		const char *emsg = str_error_r(workload_exec_errno, msg, sizeof(msg));
1982 
1983 		evlist__scnprintf_evsels(rec->evlist, sizeof(strevsels), strevsels);
1984 
1985 		pr_err("Failed to collect '%s' for the '%s' workload: %s\n",
1986 			strevsels, argv[0], emsg);
1987 		err = -1;
1988 		goto out_child;
1989 	}
1990 
1991 	if (!quiet)
1992 		fprintf(stderr, "[ perf record: Woken up %ld times to write data ]\n", waking);
1993 
1994 	if (target__none(&rec->opts.target))
1995 		record__synthesize_workload(rec, true);
1996 
1997 out_child:
1998 	evlist__finalize_ctlfd(rec->evlist);
1999 	record__mmap_read_all(rec, true);
2000 	record__aio_mmap_read_sync(rec);
2001 
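	/*
	 * Descriptive note: both byte counters being non-zero means the data
	 * was compressed; store transferred/compressed, rounded to the nearest
	 * integer via the +0.5, as the header's compression ratio.
	 */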
2002 	if (rec->session->bytes_transferred && rec->session->bytes_compressed) {
2003 		ratio = (float)rec->session->bytes_transferred/(float)rec->session->bytes_compressed;
2004 		session->header.env.comp_ratio = ratio + 0.5;
2005 	}
2006 
2007 	if (forks) {
2008 		int exit_status;
2009 
2010 		if (!child_finished)
2011 			kill(rec->evlist->workload.pid, SIGTERM);
2012 
2013 		wait(&exit_status);
2014 
2015 		if (err < 0)
2016 			status = err;
2017 		else if (WIFEXITED(exit_status))
2018 			status = WEXITSTATUS(exit_status);
2019 		else if (WIFSIGNALED(exit_status))
2020 			signr = WTERMSIG(exit_status);
2021 	} else
2022 		status = err;
2023 
2024 	record__synthesize(rec, true);
2025 	/* this will be recalculated during process_buildids() */
2026 	rec->samples = 0;
2027 
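	/*
	 * Hedged note: with --timestamp-filename (implied by --switch-output)
	 * the final output gets a '.<timestamp>' suffix via this closing
	 * record__switch_output() call; otherwise the output file is
	 * finalized in place by record__finish_output().
	 */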
2028 	if (!err) {
2029 		if (!rec->timestamp_filename) {
2030 			record__finish_output(rec);
2031 		} else {
2032 			fd = record__switch_output(rec, true);
2033 			if (fd < 0) {
2034 				status = fd;
2035 				goto out_delete_session;
2036 			}
2037 		}
2038 	}
2039 
2040 	perf_hooks__invoke_record_end();
2041 
2042 	if (!err && !quiet) {
2043 		char samples[128];
2044 		const char *postfix = rec->timestamp_filename ?
2045 					".<timestamp>" : "";
2046 
2047 		if (rec->samples && !rec->opts.full_auxtrace)
2048 			scnprintf(samples, sizeof(samples),
2049 				  " (%" PRIu64 " samples)", rec->samples);
2050 		else
2051 			samples[0] = '\0';
2052 
2053 		fprintf(stderr,	"[ perf record: Captured and wrote %.3f MB %s%s%s",
2054 			perf_data__size(data) / 1024.0 / 1024.0,
2055 			data->path, postfix, samples);
2056 		if (ratio) {
2057 			fprintf(stderr,	", compressed (original %.3f MB, ratio is %.3f)",
2058 					rec->session->bytes_transferred / 1024.0 / 1024.0,
2059 					ratio);
2060 		}
2061 		fprintf(stderr, " ]\n");
2062 	}
2063 
2064 out_delete_session:
2065 #ifdef HAVE_EVENTFD_SUPPORT
2066 	if (done_fd >= 0)
2067 		close(done_fd);
2068 #endif
2069 	zstd_fini(&session->zstd_data);
2070 	perf_session__delete(session);
2071 
2072 	if (!opts->no_bpf_event)
2073 		evlist__stop_sb_thread(rec->sb_evlist);
2074 	return status;
2075 }
2076 
2077 static void callchain_debug(struct callchain_param *callchain)
2078 {
2079 	static const char *str[CALLCHAIN_MAX] = { "NONE", "FP", "DWARF", "LBR" };
2080 
2081 	pr_debug("callchain: type %s\n", str[callchain->record_mode]);
2082 
2083 	if (callchain->record_mode == CALLCHAIN_DWARF)
2084 		pr_debug("callchain: stack dump size %d\n",
2085 			 callchain->dump_size);
2086 }
2087 
2088 int record_opts__parse_callchain(struct record_opts *record,
2089 				 struct callchain_param *callchain,
2090 				 const char *arg, bool unset)
2091 {
2092 	int ret;
2093 	callchain->enabled = !unset;
2094 
2095 	/* --no-call-graph */
2096 	if (unset) {
2097 		callchain->record_mode = CALLCHAIN_NONE;
2098 		pr_debug("callchain: disabled\n");
2099 		return 0;
2100 	}
2101 
2102 	ret = parse_callchain_record_opt(arg, callchain);
2103 	if (!ret) {
2104 		/* Enable data address sampling for DWARF unwind. */
2105 		if (callchain->record_mode == CALLCHAIN_DWARF)
2106 			record->sample_address = true;
2107 		callchain_debug(callchain);
2108 	}
2109 
2110 	return ret;
2111 }
2112 
2113 int record_parse_callchain_opt(const struct option *opt,
2114 			       const char *arg,
2115 			       int unset)
2116 {
2117 	return record_opts__parse_callchain(opt->value, &callchain_param, arg, unset);
2118 }
2119 
2120 int record_callchain_opt(const struct option *opt,
2121 			 const char *arg __maybe_unused,
2122 			 int unset __maybe_unused)
2123 {
2124 	struct callchain_param *callchain = opt->value;
2125 
2126 	callchain->enabled = true;
2127 
2128 	if (callchain->record_mode == CALLCHAIN_NONE)
2129 		callchain->record_mode = CALLCHAIN_FP;
2130 
2131 	callchain_debug(callchain);
2132 	return 0;
2133 }
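
/*
 * Illustrative usage (hedged): a bare '-g' takes the path above and falls
 * back to frame pointer call graphs when no mode was set, while e.g.
 * '--call-graph dwarf,8192' goes through record_opts__parse_callchain(),
 * which also enables data address sampling for the DWARF unwinder.
 */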
2134 
2135 static int perf_record_config(const char *var, const char *value, void *cb)
2136 {
2137 	struct record *rec = cb;
2138 
2139 	if (!strcmp(var, "record.build-id")) {
2140 		if (!strcmp(value, "cache"))
2141 			rec->no_buildid_cache = false;
2142 		else if (!strcmp(value, "no-cache"))
2143 			rec->no_buildid_cache = true;
2144 		else if (!strcmp(value, "skip"))
2145 			rec->no_buildid = true;
2146 		else if (!strcmp(value, "mmap"))
2147 			rec->buildid_mmap = true;
2148 		else
2149 			return -1;
2150 		return 0;
2151 	}
2152 	if (!strcmp(var, "record.call-graph")) {
2153 		var = "call-graph.record-mode";
2154 		return perf_default_config(var, value, cb);
2155 	}
2156 #ifdef HAVE_AIO_SUPPORT
2157 	if (!strcmp(var, "record.aio")) {
2158 		rec->opts.nr_cblocks = strtol(value, NULL, 0);
2159 		if (!rec->opts.nr_cblocks)
2160 			rec->opts.nr_cblocks = nr_cblocks_default;
2161 	}
2162 #endif
2163 
2164 	return 0;
2165 }
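
/*
 * Illustrative ~/.perfconfig snippet handled above (values are examples):
 *
 *	[record]
 *		build-id = cache	# or: no-cache, skip, mmap
 *		call-graph = dwarf	# forwarded as 'call-graph.record-mode'
 *		aio = 2			# only honoured with HAVE_AIO_SUPPORT
 */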
2166 
2167 
2168 static int record__parse_affinity(const struct option *opt, const char *str, int unset)
2169 {
2170 	struct record_opts *opts = (struct record_opts *)opt->value;
2171 
2172 	if (unset || !str)
2173 		return 0;
2174 
2175 	if (!strcasecmp(str, "node"))
2176 		opts->affinity = PERF_AFFINITY_NODE;
2177 	else if (!strcasecmp(str, "cpu"))
2178 		opts->affinity = PERF_AFFINITY_CPU;
2179 
2180 	return 0;
2181 }
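
/*
 * Hedged note: '--affinity node' binds the trace reading thread to the NUMA
 * node of the mmap buffer being flushed, '--affinity cpu' to that buffer's
 * CPU; anything else keeps the default PERF_AFFINITY_SYS mask.
 */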
2182 
2183 static int parse_output_max_size(const struct option *opt,
2184 				 const char *str, int unset)
2185 {
2186 	unsigned long *s = (unsigned long *)opt->value;
2187 	static struct parse_tag tags_size[] = {
2188 		{ .tag  = 'B', .mult = 1       },
2189 		{ .tag  = 'K', .mult = 1 << 10 },
2190 		{ .tag  = 'M', .mult = 1 << 20 },
2191 		{ .tag  = 'G', .mult = 1 << 30 },
2192 		{ .tag  = 0 },
2193 	};
2194 	unsigned long val;
2195 
2196 	if (unset) {
2197 		*s = 0;
2198 		return 0;
2199 	}
2200 
2201 	val = parse_tag_value(str, tags_size);
2202 	if (val != (unsigned long) -1) {
2203 		*s = val;
2204 		return 0;
2205 	}
2206 
2207 	return -1;
2208 }
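
/*
 * Example (hedged): '--max-size=2G' stores 2 * 2^30 in record.output_max_size;
 * an unrecognized suffix makes parse_tag_value() return -1 and fails option
 * parsing, while the unset path resets the limit to 0 (unlimited).
 */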
2209 
2210 static int record__parse_mmap_pages(const struct option *opt,
2211 				    const char *str,
2212 				    int unset __maybe_unused)
2213 {
2214 	struct record_opts *opts = opt->value;
2215 	char *s, *p;
2216 	unsigned int mmap_pages;
2217 	int ret;
2218 
2219 	if (!str)
2220 		return -EINVAL;
2221 
2222 	s = strdup(str);
2223 	if (!s)
2224 		return -ENOMEM;
2225 
2226 	p = strchr(s, ',');
2227 	if (p)
2228 		*p = '\0';
2229 
2230 	if (*s) {
2231 		ret = __evlist__parse_mmap_pages(&mmap_pages, s);
2232 		if (ret)
2233 			goto out_free;
2234 		opts->mmap_pages = mmap_pages;
2235 	}
2236 
2237 	if (!p) {
2238 		ret = 0;
2239 		goto out_free;
2240 	}
2241 
2242 	ret = __evlist__parse_mmap_pages(&mmap_pages, p + 1);
2243 	if (ret)
2244 		goto out_free;
2245 
2246 	opts->auxtrace_mmap_pages = mmap_pages;
2247 
2248 out_free:
2249 	free(s);
2250 	return ret;
2251 }
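
/*
 * Example (hedged): '-m 512,64' requests 512 data pages plus 64 AUX area
 * tracing pages; a single value such as '-m 512' only sets the data pages
 * and leaves auxtrace_mmap_pages untouched.
 */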
2252 
2253 static int parse_control_option(const struct option *opt,
2254 				const char *str,
2255 				int unset __maybe_unused)
2256 {
2257 	struct record_opts *opts = opt->value;
2258 
2259 	return evlist__parse_control(str, &opts->ctl_fd, &opts->ctl_fd_ack, &opts->ctl_fd_close);
2260 }
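
/*
 * Examples (hedged, fds/paths illustrative): '--control fd:11,12' reuses
 * already open descriptors, while '--control fifo:/tmp/ctl,/tmp/ack' lets
 * evlist__parse_control() open the named FIFOs as ctl-fd/ack-fd.
 */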
2261 
2262 static void switch_output_size_warn(struct record *rec)
2263 {
2264 	u64 wakeup_size = evlist__mmap_size(rec->opts.mmap_pages);
2265 	struct switch_output *s = &rec->switch_output;
2266 
2267 	wakeup_size /= 2;
2268 
2269 	if (s->size < wakeup_size) {
2270 		char buf[100];
2271 
2272 		unit_number__scnprintf(buf, sizeof(buf), wakeup_size);
2273 		pr_warning("WARNING: switch-output data size lower than "
2274 			   "wakeup kernel buffer size (%s), "
2275 			   "expect bigger perf.data sizes\n", buf);
2276 	}
2277 }
2278 
2279 static int switch_output_setup(struct record *rec)
2280 {
2281 	struct switch_output *s = &rec->switch_output;
2282 	static struct parse_tag tags_size[] = {
2283 		{ .tag  = 'B', .mult = 1       },
2284 		{ .tag  = 'K', .mult = 1 << 10 },
2285 		{ .tag  = 'M', .mult = 1 << 20 },
2286 		{ .tag  = 'G', .mult = 1 << 30 },
2287 		{ .tag  = 0 },
2288 	};
2289 	static struct parse_tag tags_time[] = {
2290 		{ .tag  = 's', .mult = 1        },
2291 		{ .tag  = 'm', .mult = 60       },
2292 		{ .tag  = 'h', .mult = 60*60    },
2293 		{ .tag  = 'd', .mult = 60*60*24 },
2294 		{ .tag  = 0 },
2295 	};
2296 	unsigned long val;
2297 
2298 	/*
2299 	 * If we're using --switch-output-events, then we imply
2300 	 * --switch-output=signal, as we'll send a SIGUSR2 from the side band
2301 	 * thread to its parent.
2302 	 */
2303 	if (rec->switch_output_event_set)
2304 		goto do_signal;
2305 
2306 	if (!s->set)
2307 		return 0;
2308 
2309 	if (!strcmp(s->str, "signal")) {
2310 do_signal:
2311 		s->signal = true;
2312 		pr_debug("switch-output with SIGUSR2 signal\n");
2313 		goto enabled;
2314 	}
2315 
2316 	val = parse_tag_value(s->str, tags_size);
2317 	if (val != (unsigned long) -1) {
2318 		s->size = val;
2319 		pr_debug("switch-output with %s size threshold\n", s->str);
2320 		goto enabled;
2321 	}
2322 
2323 	val = parse_tag_value(s->str, tags_time);
2324 	if (val != (unsigned long) -1) {
2325 		s->time = val;
2326 		pr_debug("switch-output with %s time threshold (%lu seconds)\n",
2327 			 s->str, s->time);
2328 		goto enabled;
2329 	}
2330 
2331 	return -1;
2332 
2333 enabled:
2334 	rec->timestamp_filename = true;
2335 	s->enabled              = true;
2336 
2337 	if (s->size && !rec->opts.no_buffering)
2338 		switch_output_size_warn(rec);
2339 
2340 	return 0;
2341 }
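
/*
 * Examples (hedged): '--switch-output' defaults to 'signal' (SIGUSR2),
 * '--switch-output=100M' sets a size threshold and '--switch-output=30m'
 * a time threshold of 1800 seconds; every variant also turns on
 * timestamp-suffixed output file names via the 'enabled' label above.
 */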
2342 
2343 static const char * const __record_usage[] = {
2344 	"perf record [<options>] [<command>]",
2345 	"perf record [<options>] -- <command> [<options>]",
2346 	NULL
2347 };
2348 const char * const *record_usage = __record_usage;
2349 
2350 static int build_id__process_mmap(struct perf_tool *tool, union perf_event *event,
2351 				  struct perf_sample *sample, struct machine *machine)
2352 {
2353 	/*
2354 	 * We already have the kernel maps, put in place via
2355 	 * perf_session__create_kernel_maps(), so there is no need to add them twice.
2356 	 */
2357 	if (!(event->header.misc & PERF_RECORD_MISC_USER))
2358 		return 0;
2359 	return perf_event__process_mmap(tool, event, sample, machine);
2360 }
2361 
2362 static int build_id__process_mmap2(struct perf_tool *tool, union perf_event *event,
2363 				   struct perf_sample *sample, struct machine *machine)
2364 {
2365 	/*
2366 	 * We already have the kernel maps, put in place via
2367 	 * perf_session__create_kernel_maps(), so there is no need to add them twice.
2368 	 */
2369 	if (!(event->header.misc & PERF_RECORD_MISC_USER))
2370 		return 0;
2371 
2372 	return perf_event__process_mmap2(tool, event, sample, machine);
2373 }
2374 
2375 /*
2376  * XXX Ideally this would be local to cmd_record() and passed to a record__new(),
2377  * because we need to have access to it in record__exit(), which is called
2378  * after cmd_record() exits, but since record_options needs to be accessible to
2379  * builtin-script, leave it here.
2380  *
2381  * At least we don't touch it directly in all the other functions here.
2382  *
2383  * Just say no to tons of global variables, sigh.
2384  */
2385 static struct record record = {
2386 	.opts = {
2387 		.sample_time	     = true,
2388 		.mmap_pages	     = UINT_MAX,
2389 		.user_freq	     = UINT_MAX,
2390 		.user_interval	     = ULLONG_MAX,
2391 		.freq		     = 4000,
2392 		.target		     = {
2393 			.uses_mmap   = true,
2394 			.default_per_cpu = true,
2395 		},
2396 		.mmap_flush          = MMAP_FLUSH_DEFAULT,
2397 		.nr_threads_synthesize = 1,
2398 		.ctl_fd              = -1,
2399 		.ctl_fd_ack          = -1,
2400 	},
2401 	.tool = {
2402 		.sample		= process_sample_event,
2403 		.fork		= perf_event__process_fork,
2404 		.exit		= perf_event__process_exit,
2405 		.comm		= perf_event__process_comm,
2406 		.namespaces	= perf_event__process_namespaces,
2407 		.mmap		= build_id__process_mmap,
2408 		.mmap2		= build_id__process_mmap2,
2409 		.ordered_events	= true,
2410 	},
2411 };
2412 
2413 const char record_callchain_help[] = CALLCHAIN_RECORD_HELP
2414 	"\n\t\t\t\tDefault: fp";
2415 
2416 static bool dry_run;
2417 
2418 /*
2419  * XXX Will stay a global variable till we fix builtin-script.c to stop messing
2420  * with it and switch to using the library functions in perf_evlist that came
2421  * from builtin-record.c, i.e. use record_opts,
2422  * evlist__prepare_workload, etc. instead of fork+exec'ing 'perf record'
2423  * using pipes, etc.
2424  */
2425 static struct option __record_options[] = {
2426 	OPT_CALLBACK('e', "event", &record.evlist, "event",
2427 		     "event selector. use 'perf list' to list available events",
2428 		     parse_events_option),
2429 	OPT_CALLBACK(0, "filter", &record.evlist, "filter",
2430 		     "event filter", parse_filter),
2431 	OPT_CALLBACK_NOOPT(0, "exclude-perf", &record.evlist,
2432 			   NULL, "don't record events from perf itself",
2433 			   exclude_perf),
2434 	OPT_STRING('p', "pid", &record.opts.target.pid, "pid",
2435 		    "record events on existing process id"),
2436 	OPT_STRING('t', "tid", &record.opts.target.tid, "tid",
2437 		    "record events on existing thread id"),
2438 	OPT_INTEGER('r', "realtime", &record.realtime_prio,
2439 		    "collect data with this RT SCHED_FIFO priority"),
2440 	OPT_BOOLEAN(0, "no-buffering", &record.opts.no_buffering,
2441 		    "collect data without buffering"),
2442 	OPT_BOOLEAN('R', "raw-samples", &record.opts.raw_samples,
2443 		    "collect raw sample records from all opened counters"),
2444 	OPT_BOOLEAN('a', "all-cpus", &record.opts.target.system_wide,
2445 			    "system-wide collection from all CPUs"),
2446 	OPT_STRING('C', "cpu", &record.opts.target.cpu_list, "cpu",
2447 		    "list of cpus to monitor"),
2448 	OPT_U64('c', "count", &record.opts.user_interval, "event period to sample"),
2449 	OPT_STRING('o', "output", &record.data.path, "file",
2450 		    "output file name"),
2451 	OPT_BOOLEAN_SET('i', "no-inherit", &record.opts.no_inherit,
2452 			&record.opts.no_inherit_set,
2453 			"child tasks do not inherit counters"),
2454 	OPT_BOOLEAN(0, "tail-synthesize", &record.opts.tail_synthesize,
2455 		    "synthesize non-sample events at the end of output"),
2456 	OPT_BOOLEAN(0, "overwrite", &record.opts.overwrite, "use overwrite mode"),
2457 	OPT_BOOLEAN(0, "no-bpf-event", &record.opts.no_bpf_event, "do not record bpf events"),
2458 	OPT_BOOLEAN(0, "strict-freq", &record.opts.strict_freq,
2459 		    "Fail if the specified frequency can't be used"),
2460 	OPT_CALLBACK('F', "freq", &record.opts, "freq or 'max'",
2461 		     "profile at this frequency",
2462 		      record__parse_freq),
2463 	OPT_CALLBACK('m', "mmap-pages", &record.opts, "pages[,pages]",
2464 		     "number of mmap data pages and AUX area tracing mmap pages",
2465 		     record__parse_mmap_pages),
2466 	OPT_CALLBACK(0, "mmap-flush", &record.opts, "number",
2467 		     "Minimal number of bytes that are extracted from mmap data pages (default: 1)",
2468 		     record__mmap_flush_parse),
2469 	OPT_BOOLEAN(0, "group", &record.opts.group,
2470 		    "put the counters into a counter group"),
2471 	OPT_CALLBACK_NOOPT('g', NULL, &callchain_param,
2472 			   NULL, "enables call-graph recording" ,
2473 			   &record_callchain_opt),
2474 	OPT_CALLBACK(0, "call-graph", &record.opts,
2475 		     "record_mode[,record_size]", record_callchain_help,
2476 		     &record_parse_callchain_opt),
2477 	OPT_INCR('v', "verbose", &verbose,
2478 		    "be more verbose (show counter open errors, etc)"),
2479 	OPT_BOOLEAN('q', "quiet", &quiet, "don't print any message"),
2480 	OPT_BOOLEAN('s', "stat", &record.opts.inherit_stat,
2481 		    "per thread counts"),
2482 	OPT_BOOLEAN('d', "data", &record.opts.sample_address, "Record the sample addresses"),
2483 	OPT_BOOLEAN(0, "phys-data", &record.opts.sample_phys_addr,
2484 		    "Record the sample physical addresses"),
2485 	OPT_BOOLEAN(0, "data-page-size", &record.opts.sample_data_page_size,
2486 		    "Record the sampled data address data page size"),
2487 	OPT_BOOLEAN(0, "code-page-size", &record.opts.sample_code_page_size,
2488 		    "Record the sampled code address (ip) page size"),
2489 	OPT_BOOLEAN(0, "sample-cpu", &record.opts.sample_cpu, "Record the sample cpu"),
2490 	OPT_BOOLEAN_SET('T', "timestamp", &record.opts.sample_time,
2491 			&record.opts.sample_time_set,
2492 			"Record the sample timestamps"),
2493 	OPT_BOOLEAN_SET('P', "period", &record.opts.period, &record.opts.period_set,
2494 			"Record the sample period"),
2495 	OPT_BOOLEAN('n', "no-samples", &record.opts.no_samples,
2496 		    "don't sample"),
2497 	OPT_BOOLEAN_SET('N', "no-buildid-cache", &record.no_buildid_cache,
2498 			&record.no_buildid_cache_set,
2499 			"do not update the buildid cache"),
2500 	OPT_BOOLEAN_SET('B', "no-buildid", &record.no_buildid,
2501 			&record.no_buildid_set,
2502 			"do not collect buildids in perf.data"),
2503 	OPT_CALLBACK('G', "cgroup", &record.evlist, "name",
2504 		     "monitor event in cgroup name only",
2505 		     parse_cgroups),
2506 	OPT_INTEGER('D', "delay", &record.opts.initial_delay,
2507 		  "ms to wait before starting measurement after program start (-1: start with events disabled)"),
2508 	OPT_BOOLEAN(0, "kcore", &record.opts.kcore, "copy /proc/kcore"),
2509 	OPT_STRING('u', "uid", &record.opts.target.uid_str, "user",
2510 		   "user to profile"),
2511 
2512 	OPT_CALLBACK_NOOPT('b', "branch-any", &record.opts.branch_stack,
2513 		     "branch any", "sample any taken branches",
2514 		     parse_branch_stack),
2515 
2516 	OPT_CALLBACK('j', "branch-filter", &record.opts.branch_stack,
2517 		     "branch filter mask", "branch stack filter modes",
2518 		     parse_branch_stack),
2519 	OPT_BOOLEAN('W', "weight", &record.opts.sample_weight,
2520 		    "sample by weight (on special events only)"),
2521 	OPT_BOOLEAN(0, "transaction", &record.opts.sample_transaction,
2522 		    "sample transaction flags (special events only)"),
2523 	OPT_BOOLEAN(0, "per-thread", &record.opts.target.per_thread,
2524 		    "use per-thread mmaps"),
2525 	OPT_CALLBACK_OPTARG('I', "intr-regs", &record.opts.sample_intr_regs, NULL, "any register",
2526 		    "sample selected machine registers on interrupt,"
2527 		    " use '-I?' to list register names", parse_intr_regs),
2528 	OPT_CALLBACK_OPTARG(0, "user-regs", &record.opts.sample_user_regs, NULL, "any register",
2529 		    "sample selected machine registers on interrupt,"
2530 		    " use '--user-regs=?' to list register names", parse_user_regs),
2531 	OPT_BOOLEAN(0, "running-time", &record.opts.running_time,
2532 		    "Record running/enabled time of read (:S) events"),
2533 	OPT_CALLBACK('k', "clockid", &record.opts,
2534 	"clockid", "clockid to use for events, see clock_gettime()",
2535 	parse_clockid),
2536 	OPT_STRING_OPTARG('S', "snapshot", &record.opts.auxtrace_snapshot_opts,
2537 			  "opts", "AUX area tracing Snapshot Mode", ""),
2538 	OPT_STRING_OPTARG(0, "aux-sample", &record.opts.auxtrace_sample_opts,
2539 			  "opts", "sample AUX area", ""),
2540 	OPT_UINTEGER(0, "proc-map-timeout", &proc_map_timeout,
2541 			"per thread proc mmap processing timeout in ms"),
2542 	OPT_BOOLEAN(0, "namespaces", &record.opts.record_namespaces,
2543 		    "Record namespaces events"),
2544 	OPT_BOOLEAN(0, "all-cgroups", &record.opts.record_cgroup,
2545 		    "Record cgroup events"),
2546 	OPT_BOOLEAN_SET(0, "switch-events", &record.opts.record_switch_events,
2547 			&record.opts.record_switch_events_set,
2548 			"Record context switch events"),
2549 	OPT_BOOLEAN_FLAG(0, "all-kernel", &record.opts.all_kernel,
2550 			 "Configure all used events to run in kernel space.",
2551 			 PARSE_OPT_EXCLUSIVE),
2552 	OPT_BOOLEAN_FLAG(0, "all-user", &record.opts.all_user,
2553 			 "Configure all used events to run in user space.",
2554 			 PARSE_OPT_EXCLUSIVE),
2555 	OPT_BOOLEAN(0, "kernel-callchains", &record.opts.kernel_callchains,
2556 		    "collect kernel callchains"),
2557 	OPT_BOOLEAN(0, "user-callchains", &record.opts.user_callchains,
2558 		    "collect user callchains"),
2559 	OPT_STRING(0, "clang-path", &llvm_param.clang_path, "clang path",
2560 		   "clang binary to use for compiling BPF scriptlets"),
2561 	OPT_STRING(0, "clang-opt", &llvm_param.clang_opt, "clang options",
2562 		   "options passed to clang when compiling BPF scriptlets"),
2563 	OPT_STRING(0, "vmlinux", &symbol_conf.vmlinux_name,
2564 		   "file", "vmlinux pathname"),
2565 	OPT_BOOLEAN(0, "buildid-all", &record.buildid_all,
2566 		    "Record build-id of all DSOs regardless of hits"),
2567 	OPT_BOOLEAN(0, "buildid-mmap", &record.buildid_mmap,
2568 		    "Record build-id in map events"),
2569 	OPT_BOOLEAN(0, "timestamp-filename", &record.timestamp_filename,
2570 		    "append timestamp to output filename"),
2571 	OPT_BOOLEAN(0, "timestamp-boundary", &record.timestamp_boundary,
2572 		    "Record timestamp boundary (time of first/last samples)"),
2573 	OPT_STRING_OPTARG_SET(0, "switch-output", &record.switch_output.str,
2574 			  &record.switch_output.set, "signal or size[BKMG] or time[smhd]",
2575 			  "Switch output when receiving SIGUSR2 (signal) or cross a size or time threshold",
2576 			  "signal"),
2577 	OPT_CALLBACK_SET(0, "switch-output-event", &record.sb_evlist, &record.switch_output_event_set, "switch output event",
2578 			 "switch output event selector. use 'perf list' to list available events",
2579 			 parse_events_option_new_evlist),
2580 	OPT_INTEGER(0, "switch-max-files", &record.switch_output.num_files,
2581 		   "Limit number of switch output generated files"),
2582 	OPT_BOOLEAN(0, "dry-run", &dry_run,
2583 		    "Parse options then exit"),
2584 #ifdef HAVE_AIO_SUPPORT
2585 	OPT_CALLBACK_OPTARG(0, "aio", &record.opts,
2586 		     &nr_cblocks_default, "n", "Use <n> control blocks in asynchronous trace writing mode (default: 1, max: 4)",
2587 		     record__aio_parse),
2588 #endif
2589 	OPT_CALLBACK(0, "affinity", &record.opts, "node|cpu",
2590 		     "Set affinity mask of trace reading thread to NUMA node cpu mask or cpu of processed mmap buffer",
2591 		     record__parse_affinity),
2592 #ifdef HAVE_ZSTD_SUPPORT
2593 	OPT_CALLBACK_OPTARG('z', "compression-level", &record.opts, &comp_level_default,
2594 			    "n", "Compress records using specified level (default: 1 - fastest compression, 22 - greatest compression)",
2595 			    record__parse_comp_level),
2596 #endif
2597 	OPT_CALLBACK(0, "max-size", &record.output_max_size,
2598 		     "size", "Limit the maximum size of the output file", parse_output_max_size),
2599 	OPT_UINTEGER(0, "num-thread-synthesize",
2600 		     &record.opts.nr_threads_synthesize,
2601 		     "number of threads to run for event synthesis"),
2602 #ifdef HAVE_LIBPFM
2603 	OPT_CALLBACK(0, "pfm-events", &record.evlist, "event",
2604 		"libpfm4 event selector. use 'perf list' to list available events",
2605 		parse_libpfm_events_option),
2606 #endif
2607 	OPT_CALLBACK(0, "control", &record.opts, "fd:ctl-fd[,ack-fd] or fifo:ctl-fifo[,ack-fifo]",
2608 		     "Listen on ctl-fd descriptor for command to control measurement ('enable': enable events, 'disable': disable events,\n"
2609 		     "\t\t\t  'snapshot': AUX area tracing snapshot).\n"
2610 		     "\t\t\t  Optionally send control command completion ('ack\\n') to ack-fd descriptor.\n"
2611 		     "\t\t\t  Alternatively, ctl-fifo / ack-fifo will be opened and used as ctl-fd / ack-fd.",
2612 		      parse_control_option),
2613 	OPT_END()
2614 };
2615 
2616 struct option *record_options = __record_options;
2617 
2618 int cmd_record(int argc, const char **argv)
2619 {
2620 	int err;
2621 	struct record *rec = &record;
2622 	char errbuf[BUFSIZ];
2623 
2624 	setlocale(LC_ALL, "");
2625 
2626 #ifndef HAVE_LIBBPF_SUPPORT
2627 # define set_nobuild(s, l, c) set_option_nobuild(record_options, s, l, "NO_LIBBPF=1", c)
2628 	set_nobuild('\0', "clang-path", true);
2629 	set_nobuild('\0', "clang-opt", true);
2630 # undef set_nobuild
2631 #endif
2632 
2633 #ifndef HAVE_BPF_PROLOGUE
2634 # if !defined (HAVE_DWARF_SUPPORT)
2635 #  define REASON  "NO_DWARF=1"
2636 # elif !defined (HAVE_LIBBPF_SUPPORT)
2637 #  define REASON  "NO_LIBBPF=1"
2638 # else
2639 #  define REASON  "this architecture doesn't support BPF prologue"
2640 # endif
2641 # define set_nobuild(s, l, c) set_option_nobuild(record_options, s, l, REASON, c)
2642 	set_nobuild('\0', "vmlinux", true);
2643 # undef set_nobuild
2644 # undef REASON
2645 #endif
2646 
2647 	rec->opts.affinity = PERF_AFFINITY_SYS;
2648 
2649 	rec->evlist = evlist__new();
2650 	if (rec->evlist == NULL)
2651 		return -ENOMEM;
2652 
2653 	err = perf_config(perf_record_config, rec);
2654 	if (err)
2655 		return err;
2656 
2657 	argc = parse_options(argc, argv, record_options, record_usage,
2658 			    PARSE_OPT_STOP_AT_NON_OPTION);
2659 	if (quiet)
2660 		perf_quiet_option();
2661 
2662 	/* Make system wide (-a) the default target. */
2663 	if (!argc && target__none(&rec->opts.target))
2664 		rec->opts.target.system_wide = true;
2665 
2666 	if (nr_cgroups && !rec->opts.target.system_wide) {
2667 		usage_with_options_msg(record_usage, record_options,
2668 			"cgroup monitoring only available in system-wide mode");
2669 
2670 	}
2671 
2672 	if (rec->buildid_mmap) {
2673 		if (!perf_can_record_build_id()) {
2674 			pr_err("Failed: no support for recording build id in mmap events, update your kernel.\n");
2675 			err = -EINVAL;
2676 			goto out_opts;
2677 		}
2678 		pr_debug("Enabling build id in mmap2 events.\n");
2679 		/* Enable mmap build id synthesizing. */
2680 		symbol_conf.buildid_mmap2 = true;
2681 		/* Enable perf_event_attr::build_id bit. */
2682 		rec->opts.build_id = true;
2683 		/* Disable build id cache. */
2684 		rec->no_buildid = true;
2685 	}
2686 
2687 	if (rec->opts.kcore)
2688 		rec->data.is_dir = true;
2689 
2690 	if (rec->opts.comp_level != 0) {
2691 		pr_debug("Compression enabled, disabling build id collection at the end of the session.\n");
2692 		rec->no_buildid = true;
2693 	}
2694 
2695 	if (rec->opts.record_switch_events &&
2696 	    !perf_can_record_switch_events()) {
2697 		ui__error("kernel does not support recording context switch events\n");
2698 		parse_options_usage(record_usage, record_options, "switch-events", 0);
2699 		err = -EINVAL;
2700 		goto out_opts;
2701 	}
2702 
2703 	if (switch_output_setup(rec)) {
2704 		parse_options_usage(record_usage, record_options, "switch-output", 0);
2705 		err = -EINVAL;
2706 		goto out_opts;
2707 	}
2708 
2709 	if (rec->switch_output.time) {
2710 		signal(SIGALRM, alarm_sig_handler);
2711 		alarm(rec->switch_output.time);
2712 	}
2713 
2714 	if (rec->switch_output.num_files) {
2715 		rec->switch_output.filenames = calloc(rec->switch_output.num_files,
2716 						      sizeof(char *));
2717 		if (!rec->switch_output.filenames) {
2718 			err = -EINVAL;
2719 			goto out_opts;
2720 		}
2721 	}
2722 
2723 	/*
2724 	 * Allow aliases to facilitate the lookup of symbols for address
2725 	 * filters. Refer to auxtrace_parse_filters().
2726 	 */
2727 	symbol_conf.allow_aliases = true;
2728 
2729 	symbol__init(NULL);
2730 
2731 	if (rec->opts.affinity != PERF_AFFINITY_SYS) {
2732 		rec->affinity_mask.nbits = cpu__max_cpu();
2733 		rec->affinity_mask.bits = bitmap_alloc(rec->affinity_mask.nbits);
2734 		if (!rec->affinity_mask.bits) {
2735 			pr_err("Failed to allocate thread mask for %zd cpus\n", rec->affinity_mask.nbits);
2736 			err = -ENOMEM;
2737 			goto out_opts;
2738 		}
2739 		pr_debug2("thread mask[%zd]: empty\n", rec->affinity_mask.nbits);
2740 	}
2741 
2742 	err = record__auxtrace_init(rec);
2743 	if (err)
2744 		goto out;
2745 
2746 	if (dry_run)
2747 		goto out;
2748 
2749 	err = bpf__setup_stdout(rec->evlist);
2750 	if (err) {
2751 		bpf__strerror_setup_stdout(rec->evlist, err, errbuf, sizeof(errbuf));
2752 		pr_err("ERROR: Setup BPF stdout failed: %s\n",
2753 			 errbuf);
2754 		goto out;
2755 	}
2756 
2757 	err = -ENOMEM;
2758 
2759 	if (rec->no_buildid_cache || rec->no_buildid) {
2760 		disable_buildid_cache();
2761 	} else if (rec->switch_output.enabled) {
2762 		/*
2763 		 * In 'perf record --switch-output', disable buildid
2764 		 * generation by default to reduce data file switching
2765 		 * overhead. Still generate buildids if they are explicitly
2766 		 * required using
2767 		 *
2768 		 *  perf record --switch-output --no-no-buildid \
2769 		 *              --no-no-buildid-cache
2770 		 *
2771 		 * The following code is equivalent to:
2772 		 *
2773 		 * if ((rec->no_buildid || !rec->no_buildid_set) &&
2774 		 *     (rec->no_buildid_cache || !rec->no_buildid_cache_set))
2775 		 *         disable_buildid_cache();
2776 		 */
2777 		bool disable = true;
2778 
2779 		if (rec->no_buildid_set && !rec->no_buildid)
2780 			disable = false;
2781 		if (rec->no_buildid_cache_set && !rec->no_buildid_cache)
2782 			disable = false;
2783 		if (disable) {
2784 			rec->no_buildid = true;
2785 			rec->no_buildid_cache = true;
2786 			disable_buildid_cache();
2787 		}
2788 	}
2789 
2790 	if (record.opts.overwrite)
2791 		record.opts.tail_synthesize = true;
2792 
2793 	if (rec->evlist->core.nr_entries == 0 &&
2794 	    __evlist__add_default(rec->evlist, !record.opts.no_samples) < 0) {
2795 		pr_err("Not enough memory for event selector list\n");
2796 		goto out;
2797 	}
2798 
2799 	if (rec->opts.target.tid && !rec->opts.no_inherit_set)
2800 		rec->opts.no_inherit = true;
2801 
2802 	err = target__validate(&rec->opts.target);
2803 	if (err) {
2804 		target__strerror(&rec->opts.target, err, errbuf, BUFSIZ);
2805 		ui__warning("%s\n", errbuf);
2806 	}
2807 
2808 	err = target__parse_uid(&rec->opts.target);
2809 	if (err) {
2810 		int saved_errno = errno;
2811 
2812 		target__strerror(&rec->opts.target, err, errbuf, BUFSIZ);
2813 		ui__error("%s", errbuf);
2814 
2815 		err = -saved_errno;
2816 		goto out;
2817 	}
2818 
2819 	/* Enable ignoring missing threads when -u/-p option is defined. */
2820 	rec->opts.ignore_missing_thread = rec->opts.target.uid != UINT_MAX || rec->opts.target.pid;
2821 
2822 	err = -ENOMEM;
2823 	if (evlist__create_maps(rec->evlist, &rec->opts.target) < 0)
2824 		usage_with_options(record_usage, record_options);
2825 
2826 	err = auxtrace_record__options(rec->itr, rec->evlist, &rec->opts);
2827 	if (err)
2828 		goto out;
2829 
2830 	/*
2831 	 * We take all buildids when the file contains AUX area
2832 	 * tracing data because we do not decode the trace, as that
2833 	 * would take too long.
2834 	 */
2835 	if (rec->opts.full_auxtrace)
2836 		rec->buildid_all = true;
2837 
2838 	if (rec->opts.text_poke) {
2839 		err = record__config_text_poke(rec->evlist);
2840 		if (err) {
2841 			pr_err("record__config_text_poke failed, error %d\n", err);
2842 			goto out;
2843 		}
2844 	}
2845 
2846 	if (record_opts__config(&rec->opts)) {
2847 		err = -EINVAL;
2848 		goto out;
2849 	}
2850 
2851 	if (rec->opts.nr_cblocks > nr_cblocks_max)
2852 		rec->opts.nr_cblocks = nr_cblocks_max;
2853 	pr_debug("nr_cblocks: %d\n", rec->opts.nr_cblocks);
2854 
2855 	pr_debug("affinity: %s\n", affinity_tags[rec->opts.affinity]);
2856 	pr_debug("mmap flush: %d\n", rec->opts.mmap_flush);
2857 
2858 	if (rec->opts.comp_level > comp_level_max)
2859 		rec->opts.comp_level = comp_level_max;
2860 	pr_debug("comp level: %d\n", rec->opts.comp_level);
2861 
2862 	err = __cmd_record(&record, argc, argv);
2863 out:
2864 	bitmap_free(rec->affinity_mask.bits);
2865 	evlist__delete(rec->evlist);
2866 	symbol__exit();
2867 	auxtrace_record__free(rec->itr);
2868 out_opts:
2869 	evlist__close_control(rec->opts.ctl_fd, rec->opts.ctl_fd_ack, &rec->opts.ctl_fd_close);
2870 	return err;
2871 }
2872 
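/*
 * Hedged note: installed elsewhere in this file, most likely for SIGUSR2.
 * It arms the AUX area snapshot machinery and, when --switch-output=signal
 * is in effect, the output switch trigger polled by the record loop.
 */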
2873 static void snapshot_sig_handler(int sig __maybe_unused)
2874 {
2875 	struct record *rec = &record;
2876 
2877 	hit_auxtrace_snapshot_trigger(rec);
2878 
2879 	if (switch_output_signal(rec))
2880 		trigger_hit(&switch_output_trigger);
2881 }
2882 
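/*
 * Hedged note: runs when the alarm armed in cmd_record() (and re-armed
 * after each switch in the record loop) expires; with a time threshold
 * configured it hits the switch-output trigger so a new file is started.
 */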
2883 static void alarm_sig_handler(int sig __maybe_unused)
2884 {
2885 	struct record *rec = &record;
2886 
2887 	if (switch_output_time(rec))
2888 		trigger_hit(&switch_output_trigger);
2889 }
2890