xref: /openbmc/linux/tools/perf/builtin-record.c (revision 0da85d1e)
1 /*
2  * builtin-record.c
3  *
4  * Builtin record command: Record the profile of a workload
5  * (or a CPU, or a PID) into the perf.data output file - for
6  * later analysis via perf report.
7  */
8 #include "builtin.h"
9 
10 #include "perf.h"
11 
12 #include "util/build-id.h"
13 #include "util/util.h"
14 #include "util/parse-options.h"
15 #include "util/parse-events.h"
16 
17 #include "util/callchain.h"
18 #include "util/cgroup.h"
19 #include "util/header.h"
20 #include "util/event.h"
21 #include "util/evlist.h"
22 #include "util/evsel.h"
23 #include "util/debug.h"
24 #include "util/session.h"
25 #include "util/tool.h"
26 #include "util/symbol.h"
27 #include "util/cpumap.h"
28 #include "util/thread_map.h"
29 #include "util/data.h"
30 #include "util/auxtrace.h"
31 
32 #include <unistd.h>
33 #include <sched.h>
34 #include <sys/mman.h>
35 
36 
/* Per-invocation state of 'perf record'. */
struct record {
	struct perf_tool	tool;		/* event-processing callbacks (see static init below) */
	struct record_opts	opts;		/* options parsed from the command line */
	u64			bytes_written;	/* payload bytes written to the output so far */
	struct perf_data_file	file;		/* perf.data output file */
	struct auxtrace_record	*itr;		/* AUX area tracing state, when in use */
	struct perf_evlist	*evlist;	/* events being recorded */
	struct perf_session	*session;	/* session wrapping the output file */
	const char		*progname;	/* argv[0], kept for diagnostics */
	int			realtime_prio;	/* non-zero: run with this SCHED_FIFO priority */
	bool			no_buildid;	/* -B: do not collect build-ids */
	bool			no_buildid_cache; /* -N: do not update the build-id cache */
	long			samples;	/* non-empty mmap reads; drives the poll decision */
};
51 
52 static int record__write(struct record *rec, void *bf, size_t size)
53 {
54 	if (perf_data_file__write(rec->session->file, bf, size) < 0) {
55 		pr_err("failed to write perf data, error: %m\n");
56 		return -1;
57 	}
58 
59 	rec->bytes_written += size;
60 	return 0;
61 }
62 
63 static int process_synthesized_event(struct perf_tool *tool,
64 				     union perf_event *event,
65 				     struct perf_sample *sample __maybe_unused,
66 				     struct machine *machine __maybe_unused)
67 {
68 	struct record *rec = container_of(tool, struct record, tool);
69 	return record__write(rec, event, event->header.size);
70 }
71 
/*
 * Drain new data from the idx'th event ring buffer into the output file.
 * Returns 0 on success (including "nothing to do"), -1 on write failure.
 */
static int record__mmap_read(struct record *rec, int idx)
{
	struct perf_mmap *md = &rec->evlist->mmap[idx];
	u64 head = perf_mmap__read_head(md);
	u64 old = md->prev;
	/* sample data starts one page past the control page */
	unsigned char *data = md->base + page_size;
	unsigned long size;
	void *buf;
	int rc = 0;

	/* ring buffer empty since the last read */
	if (old == head)
		return 0;

	rec->samples++;

	size = head - old;

	/*
	 * The new data wraps past the end of the ring buffer: write the
	 * tail chunk (old..end of buffer) first, then fall through to
	 * write the remainder from the start of the buffer.
	 */
	if ((old & md->mask) + size != (head & md->mask)) {
		buf = &data[old & md->mask];
		size = md->mask + 1 - (old & md->mask);
		old += size;

		if (record__write(rec, buf, size) < 0) {
			rc = -1;
			goto out;
		}
	}

	/* contiguous (or remaining) chunk */
	buf = &data[old & md->mask];
	size = head - old;
	old += size;

	if (record__write(rec, buf, size) < 0) {
		rc = -1;
		goto out;
	}

	/* only mark the data consumed once it is safely written out */
	md->prev = old;
	perf_evlist__mmap_consume(rec->evlist, idx);
out:
	return rc;
}
114 
/* Flags shared between the record loop and the signal handlers below. */
static volatile int done;			/* stop the main record loop */
static volatile int signr = -1;			/* fatal signal, re-raised at exit */
static volatile int child_finished;		/* SIGCHLD seen for the workload */
static volatile int auxtrace_snapshot_enabled;	/* SIGUSR2 may trigger a snapshot */
static volatile int auxtrace_snapshot_err;	/* result of the last snapshot read */
static volatile int auxtrace_record__snapshot_started; /* snapshot requested, pending read */
121 
122 static void sig_handler(int sig)
123 {
124 	if (sig == SIGCHLD)
125 		child_finished = 1;
126 	else
127 		signr = sig;
128 
129 	done = 1;
130 }
131 
132 static void record__sig_exit(void)
133 {
134 	if (signr == -1)
135 		return;
136 
137 	signal(signr, SIG_DFL);
138 	raise(signr);
139 }
140 
141 #ifdef HAVE_AUXTRACE_SUPPORT
142 
143 static int record__process_auxtrace(struct perf_tool *tool,
144 				    union perf_event *event, void *data1,
145 				    size_t len1, void *data2, size_t len2)
146 {
147 	struct record *rec = container_of(tool, struct record, tool);
148 	struct perf_data_file *file = &rec->file;
149 	size_t padding;
150 	u8 pad[8] = {0};
151 
152 	if (!perf_data_file__is_pipe(file)) {
153 		off_t file_offset;
154 		int fd = perf_data_file__fd(file);
155 		int err;
156 
157 		file_offset = lseek(fd, 0, SEEK_CUR);
158 		if (file_offset == -1)
159 			return -1;
160 		err = auxtrace_index__auxtrace_event(&rec->session->auxtrace_index,
161 						     event, file_offset);
162 		if (err)
163 			return err;
164 	}
165 
166 	/* event.auxtrace.size includes padding, see __auxtrace_mmap__read() */
167 	padding = (len1 + len2) & 7;
168 	if (padding)
169 		padding = 8 - padding;
170 
171 	record__write(rec, event, event->header.size);
172 	record__write(rec, data1, len1);
173 	if (len2)
174 		record__write(rec, data2, len2);
175 	record__write(rec, &pad, padding);
176 
177 	return 0;
178 }
179 
180 static int record__auxtrace_mmap_read(struct record *rec,
181 				      struct auxtrace_mmap *mm)
182 {
183 	int ret;
184 
185 	ret = auxtrace_mmap__read(mm, rec->itr, &rec->tool,
186 				  record__process_auxtrace);
187 	if (ret < 0)
188 		return ret;
189 
190 	if (ret)
191 		rec->samples++;
192 
193 	return 0;
194 }
195 
196 static int record__auxtrace_mmap_read_snapshot(struct record *rec,
197 					       struct auxtrace_mmap *mm)
198 {
199 	int ret;
200 
201 	ret = auxtrace_mmap__read_snapshot(mm, rec->itr, &rec->tool,
202 					   record__process_auxtrace,
203 					   rec->opts.auxtrace_snapshot_size);
204 	if (ret < 0)
205 		return ret;
206 
207 	if (ret)
208 		rec->samples++;
209 
210 	return 0;
211 }
212 
213 static int record__auxtrace_read_snapshot_all(struct record *rec)
214 {
215 	int i;
216 	int rc = 0;
217 
218 	for (i = 0; i < rec->evlist->nr_mmaps; i++) {
219 		struct auxtrace_mmap *mm =
220 				&rec->evlist->mmap[i].auxtrace_mmap;
221 
222 		if (!mm->base)
223 			continue;
224 
225 		if (record__auxtrace_mmap_read_snapshot(rec, mm) != 0) {
226 			rc = -1;
227 			goto out;
228 		}
229 	}
230 out:
231 	return rc;
232 }
233 
234 static void record__read_auxtrace_snapshot(struct record *rec)
235 {
236 	pr_debug("Recording AUX area tracing snapshot\n");
237 	if (record__auxtrace_read_snapshot_all(rec) < 0) {
238 		auxtrace_snapshot_err = -1;
239 	} else {
240 		auxtrace_snapshot_err = auxtrace_record__snapshot_finish(rec->itr);
241 		if (!auxtrace_snapshot_err)
242 			auxtrace_snapshot_enabled = 1;
243 	}
244 }
245 
246 #else
247 
/* No AUX area tracing support built in: reading is a successful no-op. */
static inline
int record__auxtrace_mmap_read(struct record *rec __maybe_unused,
			       struct auxtrace_mmap *mm __maybe_unused)
{
	return 0;
}
254 
/* No AUX area tracing support built in: snapshots do nothing. */
static inline
void record__read_auxtrace_snapshot(struct record *rec __maybe_unused)
{
}
259 
/* No AUX area tracing support built in: starting a snapshot "succeeds". */
static inline
int auxtrace_record__snapshot_start(struct auxtrace_record *itr __maybe_unused)
{
	return 0;
}
265 
266 #endif
267 
/*
 * Configure, open and mmap all events in the evlist, applying filters.
 * On success the session takes the evlist and its id header size is set.
 * Returns 0 on success, negative on error.
 */
static int record__open(struct record *rec)
{
	char msg[512];
	struct perf_evsel *pos;
	struct perf_evlist *evlist = rec->evlist;
	struct perf_session *session = rec->session;
	struct record_opts *opts = &rec->opts;
	int rc = 0;

	perf_evlist__config(evlist, opts);

	evlist__for_each(evlist, pos) {
try_again:
		if (perf_evsel__open(pos, evlist->cpus, evlist->threads) < 0) {
			/*
			 * Some open failures can be worked around by
			 * adjusting the evsel (e.g. precision); retry if
			 * the fallback changed anything.
			 */
			if (perf_evsel__fallback(pos, errno, msg, sizeof(msg))) {
				if (verbose)
					ui__warning("%s\n", msg);
				goto try_again;
			}

			/* capture errno before the strerror call can clobber it */
			rc = -errno;
			perf_evsel__open_strerror(pos, &opts->target,
						  errno, msg, sizeof(msg));
			ui__error("%s\n", msg);
			goto out;
		}
	}

	if (perf_evlist__apply_filters(evlist, &pos)) {
		/* pos points at the evsel whose filter failed to apply */
		error("failed to set filter \"%s\" on event %s with %d (%s)\n",
			pos->filter, perf_evsel__name(pos), errno,
			strerror_r(errno, msg, sizeof(msg)));
		rc = -1;
		goto out;
	}

	if (perf_evlist__mmap_ex(evlist, opts->mmap_pages, false,
				 opts->auxtrace_mmap_pages,
				 opts->auxtrace_snapshot_mode) < 0) {
		if (errno == EPERM) {
			pr_err("Permission error mapping pages.\n"
			       "Consider increasing "
			       "/proc/sys/kernel/perf_event_mlock_kb,\n"
			       "or try again with a smaller value of -m/--mmap_pages.\n"
			       "(current value: %u,%u)\n",
			       opts->mmap_pages, opts->auxtrace_mmap_pages);
			rc = -errno;
		} else {
			pr_err("failed to mmap with %d (%s)\n", errno,
				strerror_r(errno, msg, sizeof(msg)));
			rc = -errno;
		}
		goto out;
	}

	session->evlist = evlist;
	perf_session__set_id_hdr_size(session);
out:
	return rc;
}
328 
329 static int process_sample_event(struct perf_tool *tool,
330 				union perf_event *event,
331 				struct perf_sample *sample,
332 				struct perf_evsel *evsel,
333 				struct machine *machine)
334 {
335 	struct record *rec = container_of(tool, struct record, tool);
336 
337 	rec->samples++;
338 
339 	return build_id__mark_dso_hit(tool, event, sample, evsel, machine);
340 }
341 
/*
 * Re-process the recorded events (with tool.sample = process_sample_event,
 * see the static init of 'record' below) so that hit DSOs are marked for
 * build-id emission.
 */
static int process_buildids(struct record *rec)
{
	struct perf_data_file *file  = &rec->file;
	struct perf_session *session = rec->session;

	/* current write offset == amount of data recorded so far */
	/* NOTE(review): an lseek() failure (-1) is not distinguished here
	 * and would end up stored in file->size — confirm acceptable. */
	u64 size = lseek(perf_data_file__fd(file), 0, SEEK_CUR);
	if (size == 0)
		return 0;

	file->size = size;

	/*
	 * During this process, it'll load kernel map and replace the
	 * dso->long_name to a real pathname it found.  In this case
	 * we prefer the vmlinux path like
	 *   /lib/modules/3.16.4/build/vmlinux
	 *
	 * rather than build-id path (in debug directory).
	 *   $HOME/.debug/.build-id/f0/6e17aa50adf4d00b88925e03775de107611551
	 */
	symbol_conf.ignore_vmlinux_buildid = true;

	return perf_session__process_events(session);
}
366 
/* Synthesize module and kernel mmap events for one guest machine. */
static void perf_event__synthesize_guest_os(struct machine *machine, void *data)
{
	int err;
	struct perf_tool *tool = data;
	/*
	 * As for guest kernel when processing subcommand record&report,
	 * we arrange module mmap prior to guest kernel mmap and trigger
	 * a preload dso because default guest module symbols are loaded
	 * from guest kallsyms instead of /lib/modules/XXX/XXX. This
	 * method is used to avoid symbol missing when the first addr is
	 * in module instead of in guest kernel.
	 */
	err = perf_event__synthesize_modules(tool, process_synthesized_event,
					     machine);
	if (err < 0)
		pr_err("Couldn't record guest kernel [%d]'s reference"
		       " relocation symbol.\n", machine->pid);

	/*
	 * We use _stext for guest kernel because guest kernel's /proc/kallsyms
	 * have no _text sometimes.
	 */
	err = perf_event__synthesize_kernel_mmap(tool, process_synthesized_event,
						 machine);
	if (err < 0)
		pr_err("Couldn't record guest kernel [%d]'s reference"
		       " relocation symbol.\n", machine->pid);
}
395 
/* Header-only marker event written after each round of mmap draining. */
static struct perf_event_header finished_round_event = {
	.size = sizeof(struct perf_event_header),
	.type = PERF_RECORD_FINISHED_ROUND,
};
400 
401 static int record__mmap_read_all(struct record *rec)
402 {
403 	u64 bytes_written = rec->bytes_written;
404 	int i;
405 	int rc = 0;
406 
407 	for (i = 0; i < rec->evlist->nr_mmaps; i++) {
408 		struct auxtrace_mmap *mm = &rec->evlist->mmap[i].auxtrace_mmap;
409 
410 		if (rec->evlist->mmap[i].base) {
411 			if (record__mmap_read(rec, i) != 0) {
412 				rc = -1;
413 				goto out;
414 			}
415 		}
416 
417 		if (mm->base && !rec->opts.auxtrace_snapshot_mode &&
418 		    record__auxtrace_mmap_read(rec, mm) != 0) {
419 			rc = -1;
420 			goto out;
421 		}
422 	}
423 
424 	/*
425 	 * Mark the round finished in case we wrote
426 	 * at least one event.
427 	 */
428 	if (bytes_written != rec->bytes_written)
429 		rc = record__write(rec, &finished_round_event, sizeof(finished_round_event));
430 
431 out:
432 	return rc;
433 }
434 
435 static void record__init_features(struct record *rec)
436 {
437 	struct perf_session *session = rec->session;
438 	int feat;
439 
440 	for (feat = HEADER_FIRST_FEATURE; feat < HEADER_LAST_FEATURE; feat++)
441 		perf_header__set_feat(&session->header, feat);
442 
443 	if (rec->no_buildid)
444 		perf_header__clear_feat(&session->header, HEADER_BUILD_ID);
445 
446 	if (!have_tracepoints(&rec->evlist->entries))
447 		perf_header__clear_feat(&session->header, HEADER_TRACING_DATA);
448 
449 	if (!rec->opts.branch_stack)
450 		perf_header__clear_feat(&session->header, HEADER_BRANCH_STACK);
451 
452 	if (!rec->opts.full_auxtrace)
453 		perf_header__clear_feat(&session->header, HEADER_AUXTRACE);
454 }
455 
/* errno from a failed workload exec, delivered via SIGUSR1 (see below). */
static volatile int workload_exec_errno;

/*
 * perf_evlist__prepare_workload will send a SIGUSR1
 * if the fork fails, since we asked by setting its
 * want_signal to true.
 */
static void workload_exec_failed_signal(int signo __maybe_unused,
					siginfo_t *info,
					void *ucontext __maybe_unused)
{
	/* the child passes its errno in the signal payload */
	workload_exec_errno = info->si_value.sival_int;
	done = 1;
	child_finished = 1;
}
471 
472 static void snapshot_sig_handler(int sig);
473 
/*
 * The main body of 'perf record': set up signals and the session, optionally
 * fork the workload, open/mmap the events, synthesize the initial metadata
 * events, then loop draining the mmaps until done, and finally wait for the
 * child and finish the output file (build-ids, header).
 * Returns the workload's exit status, or negative on internal error.
 */
static int __cmd_record(struct record *rec, int argc, const char **argv)
{
	int err;
	int status = 0;
	unsigned long waking = 0;
	const bool forks = argc > 0;
	struct machine *machine;
	struct perf_tool *tool = &rec->tool;
	struct record_opts *opts = &rec->opts;
	struct perf_data_file *file = &rec->file;
	struct perf_session *session;
	bool disabled = false, draining = false;
	int fd;

	rec->progname = argv[0];

	atexit(record__sig_exit);
	signal(SIGCHLD, sig_handler);
	signal(SIGINT, sig_handler);
	signal(SIGTERM, sig_handler);
	/* SIGUSR2 triggers AUX snapshots only in snapshot mode */
	if (rec->opts.auxtrace_snapshot_mode)
		signal(SIGUSR2, snapshot_sig_handler);
	else
		signal(SIGUSR2, SIG_IGN);

	session = perf_session__new(file, false, tool);
	if (session == NULL) {
		pr_err("Perf session creation failed.\n");
		return -1;
	}

	fd = perf_data_file__fd(file);
	rec->session = session;

	record__init_features(rec);

	/* Fork (but do not yet exec) the workload before opening counters */
	if (forks) {
		err = perf_evlist__prepare_workload(rec->evlist, &opts->target,
						    argv, file->is_pipe,
						    workload_exec_failed_signal);
		if (err < 0) {
			pr_err("Couldn't run the workload!\n");
			status = err;
			goto out_delete_session;
		}
	}

	if (record__open(rec) != 0) {
		err = -1;
		goto out_child;
	}

	if (!rec->evlist->nr_groups)
		perf_header__clear_feat(&session->header, HEADER_GROUP_DESC);

	if (file->is_pipe) {
		err = perf_header__write_pipe(fd);
		if (err < 0)
			goto out_child;
	} else {
		/* placeholder header; rewritten with final sizes at the end */
		err = perf_session__write_header(session, rec->evlist, fd, false);
		if (err < 0)
			goto out_child;
	}

	if (!rec->no_buildid
	    && !perf_header__has_feat(&session->header, HEADER_BUILD_ID)) {
		pr_err("Couldn't generate buildids. "
		       "Use --no-buildid to profile anyway.\n");
		err = -1;
		goto out_child;
	}

	machine = &session->machines.host;

	/* Pipe output carries no header, so synthesize the metadata inline */
	if (file->is_pipe) {
		err = perf_event__synthesize_attrs(tool, session,
						   process_synthesized_event);
		if (err < 0) {
			pr_err("Couldn't synthesize attrs.\n");
			goto out_child;
		}

		if (have_tracepoints(&rec->evlist->entries)) {
			/*
			 * FIXME err <= 0 here actually means that
			 * there were no tracepoints so its not really
			 * an error, just that we don't need to
			 * synthesize anything.  We really have to
			 * return this more properly and also
			 * propagate errors that now are calling die()
			 */
			err = perf_event__synthesize_tracing_data(tool,	fd, rec->evlist,
								  process_synthesized_event);
			if (err <= 0) {
				pr_err("Couldn't record tracing data.\n");
				goto out_child;
			}
			rec->bytes_written += err;
		}
	}

	if (rec->opts.full_auxtrace) {
		err = perf_event__synthesize_auxtrace_info(rec->itr, tool,
					session, process_synthesized_event);
		if (err)
			goto out_delete_session;
	}

	err = perf_event__synthesize_kernel_mmap(tool, process_synthesized_event,
						 machine);
	if (err < 0)
		pr_err("Couldn't record kernel reference relocation symbol\n"
		       "Symbol resolution may be skewed if relocation was used (e.g. kexec).\n"
		       "Check /proc/kallsyms permission or run as root.\n");

	err = perf_event__synthesize_modules(tool, process_synthesized_event,
					     machine);
	if (err < 0)
		pr_err("Couldn't record kernel module information.\n"
		       "Symbol resolution may be skewed if relocation was used (e.g. kexec).\n"
		       "Check /proc/modules permission or run as root.\n");

	if (perf_guest) {
		machines__process_guests(&session->machines,
					 perf_event__synthesize_guest_os, tool);
	}

	err = __machine__synthesize_threads(machine, tool, &opts->target, rec->evlist->threads,
					    process_synthesized_event, opts->sample_address);
	if (err != 0)
		goto out_child;

	if (rec->realtime_prio) {
		struct sched_param param;

		param.sched_priority = rec->realtime_prio;
		if (sched_setscheduler(0, SCHED_FIFO, &param)) {
			pr_err("Could not set realtime priority.\n");
			err = -1;
			goto out_child;
		}
	}

	/*
	 * When perf is starting the traced process, all the events
	 * (apart from group members) have enable_on_exec=1 set,
	 * so don't spoil it by prematurely enabling them.
	 */
	if (!target__none(&opts->target) && !opts->initial_delay)
		perf_evlist__enable(rec->evlist);

	/*
	 * Let the child rip
	 */
	if (forks)
		perf_evlist__start_workload(rec->evlist);

	if (opts->initial_delay) {
		usleep(opts->initial_delay * 1000);
		perf_evlist__enable(rec->evlist);
	}

	auxtrace_snapshot_enabled = 1;
	/* main record loop: drain mmaps, service snapshots, poll when idle */
	for (;;) {
		int hits = rec->samples;

		if (record__mmap_read_all(rec) < 0) {
			auxtrace_snapshot_enabled = 0;
			err = -1;
			goto out_child;
		}

		if (auxtrace_record__snapshot_started) {
			auxtrace_record__snapshot_started = 0;
			if (!auxtrace_snapshot_err)
				record__read_auxtrace_snapshot(rec);
			if (auxtrace_snapshot_err) {
				pr_err("AUX area tracing snapshot failed\n");
				err = -1;
				goto out_child;
			}
		}

		/* nothing new arrived this round: block until events or exit */
		if (hits == rec->samples) {
			if (done || draining)
				break;
			err = perf_evlist__poll(rec->evlist, -1);
			/*
			 * Propagate error, only if there's any. Ignore positive
			 * number of returned events and interrupt error.
			 */
			if (err > 0 || (err < 0 && errno == EINTR))
				err = 0;
			waking++;

			if (perf_evlist__filter_pollfd(rec->evlist, POLLERR | POLLHUP) == 0)
				draining = true;
		}

		/*
		 * When perf is starting the traced process, at the end events
		 * die with the process and we wait for that. Thus no need to
		 * disable events in this case.
		 */
		if (done && !disabled && !target__none(&opts->target)) {
			auxtrace_snapshot_enabled = 0;
			perf_evlist__disable(rec->evlist);
			disabled = true;
		}
	}
	auxtrace_snapshot_enabled = 0;

	if (forks && workload_exec_errno) {
		char msg[STRERR_BUFSIZE];
		const char *emsg = strerror_r(workload_exec_errno, msg, sizeof(msg));
		pr_err("Workload failed: %s\n", emsg);
		err = -1;
		goto out_child;
	}

	if (!quiet)
		fprintf(stderr, "[ perf record: Woken up %ld times to write data ]\n", waking);

out_child:
	if (forks) {
		int exit_status;

		if (!child_finished)
			kill(rec->evlist->workload.pid, SIGTERM);

		wait(&exit_status);

		/* internal errors trump the child's exit status */
		if (err < 0)
			status = err;
		else if (WIFEXITED(exit_status))
			status = WEXITSTATUS(exit_status);
		else if (WIFSIGNALED(exit_status))
			signr = WTERMSIG(exit_status);
	} else
		status = err;

	/* this will be recalculated during process_buildids() */
	rec->samples = 0;

	if (!err && !file->is_pipe) {
		rec->session->header.data_size += rec->bytes_written;

		if (!rec->no_buildid) {
			process_buildids(rec);
			/*
			 * We take all buildids when the file contains
			 * AUX area tracing data because we do not decode the
			 * trace because it would take too long.
			 */
			if (rec->opts.full_auxtrace)
				dsos__hit_all(rec->session);
		}
		/* rewrite the header, this time with the final sizes */
		perf_session__write_header(rec->session, rec->evlist, fd, true);
	}

	if (!err && !quiet) {
		char samples[128];

		if (rec->samples && !rec->opts.full_auxtrace)
			scnprintf(samples, sizeof(samples),
				  " (%" PRIu64 " samples)", rec->samples);
		else
			samples[0] = '\0';

		fprintf(stderr,	"[ perf record: Captured and wrote %.3f MB %s%s ]\n",
			perf_data_file__size(file) / 1024.0 / 1024.0,
			file->path, samples);
	}

out_delete_session:
	perf_session__delete(session);
	return status;
}
753 
/* Helpers for building the NULL-terminated branch filter table below. */
#define BRANCH_OPT(n, m) \
	{ .name = n, .mode = (m) }

#define BRANCH_END { .name = NULL }

/* Maps a -j/--branch-filter token to its PERF_SAMPLE_BRANCH_* bit. */
struct branch_mode {
	const char *name;
	int mode;
};

static const struct branch_mode branch_modes[] = {
	BRANCH_OPT("u", PERF_SAMPLE_BRANCH_USER),
	BRANCH_OPT("k", PERF_SAMPLE_BRANCH_KERNEL),
	BRANCH_OPT("hv", PERF_SAMPLE_BRANCH_HV),
	BRANCH_OPT("any", PERF_SAMPLE_BRANCH_ANY),
	BRANCH_OPT("any_call", PERF_SAMPLE_BRANCH_ANY_CALL),
	BRANCH_OPT("any_ret", PERF_SAMPLE_BRANCH_ANY_RETURN),
	BRANCH_OPT("ind_call", PERF_SAMPLE_BRANCH_IND_CALL),
	BRANCH_OPT("abort_tx", PERF_SAMPLE_BRANCH_ABORT_TX),
	BRANCH_OPT("in_tx", PERF_SAMPLE_BRANCH_IN_TX),
	BRANCH_OPT("no_tx", PERF_SAMPLE_BRANCH_NO_TX),
	BRANCH_OPT("cond", PERF_SAMPLE_BRANCH_COND),
	BRANCH_END
};
778 
/*
 * Option callback for -b/-j: parse a comma-separated list of branch filter
 * names into a PERF_SAMPLE_BRANCH_* bitmask stored at opt->value.
 * Returns 0 on success, -1 on unknown token or if already set.
 */
static int
parse_branch_stack(const struct option *opt, const char *str, int unset)
{
/* privilege-level-only bits; alone they imply "any branch type" */
#define ONLY_PLM \
	(PERF_SAMPLE_BRANCH_USER	|\
	 PERF_SAMPLE_BRANCH_KERNEL	|\
	 PERF_SAMPLE_BRANCH_HV)

	uint64_t *mode = (uint64_t *)opt->value;
	const struct branch_mode *br;
	char *s, *os = NULL, *p;
	int ret = -1;

	if (unset)
		return 0;

	/*
	 * cannot set it twice, -b + --branch-filter for instance
	 */
	if (*mode)
		return -1;

	/* str may be NULL in case no arg is passed to -b */
	if (str) {
		/* because str is read-only */
		s = os = strdup(str);
		if (!s)
			return -1;

		for (;;) {
			/* split on commas in place */
			p = strchr(s, ',');
			if (p)
				*p = '\0';

			for (br = branch_modes; br->name; br++) {
				if (!strcasecmp(s, br->name))
					break;
			}
			if (!br->name) {
				ui__warning("unknown branch filter %s,"
					    " check man page\n", s);
				goto error;
			}

			*mode |= br->mode;

			if (!p)
				break;

			s = p + 1;
		}
	}
	ret = 0;

	/* default to any branch */
	if ((*mode & ~ONLY_PLM) == 0) {
		*mode = PERF_SAMPLE_BRANCH_ANY;
	}
error:
	free(os);
	return ret;
}
841 
842 static void callchain_debug(void)
843 {
844 	static const char *str[CALLCHAIN_MAX] = { "NONE", "FP", "DWARF", "LBR" };
845 
846 	pr_debug("callchain: type %s\n", str[callchain_param.record_mode]);
847 
848 	if (callchain_param.record_mode == CALLCHAIN_DWARF)
849 		pr_debug("callchain: stack dump size %d\n",
850 			 callchain_param.dump_size);
851 }
852 
853 int record_parse_callchain_opt(const struct option *opt __maybe_unused,
854 			       const char *arg,
855 			       int unset)
856 {
857 	int ret;
858 
859 	callchain_param.enabled = !unset;
860 
861 	/* --no-call-graph */
862 	if (unset) {
863 		callchain_param.record_mode = CALLCHAIN_NONE;
864 		pr_debug("callchain: disabled\n");
865 		return 0;
866 	}
867 
868 	ret = parse_callchain_record_opt(arg);
869 	if (!ret)
870 		callchain_debug();
871 
872 	return ret;
873 }
874 
875 int record_callchain_opt(const struct option *opt __maybe_unused,
876 			 const char *arg __maybe_unused,
877 			 int unset __maybe_unused)
878 {
879 	callchain_param.enabled = true;
880 
881 	if (callchain_param.record_mode == CALLCHAIN_NONE)
882 		callchain_param.record_mode = CALLCHAIN_FP;
883 
884 	callchain_debug();
885 	return 0;
886 }
887 
/* perfconfig callback: "record.call-graph" aliases "call-graph.record-mode". */
static int perf_record_config(const char *var, const char *value, void *cb)
{
	if (strcmp(var, "record.call-graph") == 0)
		var = "call-graph.record-mode";

	return perf_default_config(var, value, cb);
}
895 
/* Maps a -k/--clockid name to its clockid_t value. */
struct clockid_map {
	const char *name;
	int clockid;
};

#define CLOCKID_MAP(n, c)	\
	{ .name = n, .clockid = (c), }

#define CLOCKID_END	{ .name = NULL, }


/*
 * Add the missing ones, we need to build on many distros...
 */
#ifndef CLOCK_MONOTONIC_RAW
#define CLOCK_MONOTONIC_RAW 4
#endif
#ifndef CLOCK_BOOTTIME
#define CLOCK_BOOTTIME 7
#endif
#ifndef CLOCK_TAI
#define CLOCK_TAI 11
#endif
919 
/* NULL-terminated table of clock names accepted by -k/--clockid. */
static const struct clockid_map clockids[] = {
	/* available for all events, NMI safe */
	CLOCKID_MAP("monotonic", CLOCK_MONOTONIC),
	CLOCKID_MAP("monotonic_raw", CLOCK_MONOTONIC_RAW),

	/* available for some events */
	CLOCKID_MAP("realtime", CLOCK_REALTIME),
	CLOCKID_MAP("boottime", CLOCK_BOOTTIME),
	CLOCKID_MAP("tai", CLOCK_TAI),

	/* available for the lazy */
	CLOCKID_MAP("mono", CLOCK_MONOTONIC),
	CLOCKID_MAP("raw", CLOCK_MONOTONIC_RAW),
	CLOCKID_MAP("real", CLOCK_REALTIME),
	CLOCKID_MAP("boot", CLOCK_BOOTTIME),

	CLOCKID_END,
};
938 
939 static int parse_clockid(const struct option *opt, const char *str, int unset)
940 {
941 	struct record_opts *opts = (struct record_opts *)opt->value;
942 	const struct clockid_map *cm;
943 	const char *ostr = str;
944 
945 	if (unset) {
946 		opts->use_clockid = 0;
947 		return 0;
948 	}
949 
950 	/* no arg passed */
951 	if (!str)
952 		return 0;
953 
954 	/* no setting it twice */
955 	if (opts->use_clockid)
956 		return -1;
957 
958 	opts->use_clockid = true;
959 
960 	/* if its a number, we're done */
961 	if (sscanf(str, "%d", &opts->clockid) == 1)
962 		return 0;
963 
964 	/* allow a "CLOCK_" prefix to the name */
965 	if (!strncasecmp(str, "CLOCK_", 6))
966 		str += 6;
967 
968 	for (cm = clockids; cm->name; cm++) {
969 		if (!strcasecmp(str, cm->name)) {
970 			opts->clockid = cm->clockid;
971 			return 0;
972 		}
973 	}
974 
975 	opts->use_clockid = false;
976 	ui__warning("unknown clockid %s, check man page\n", ostr);
977 	return -1;
978 }
979 
980 static int record__parse_mmap_pages(const struct option *opt,
981 				    const char *str,
982 				    int unset __maybe_unused)
983 {
984 	struct record_opts *opts = opt->value;
985 	char *s, *p;
986 	unsigned int mmap_pages;
987 	int ret;
988 
989 	if (!str)
990 		return -EINVAL;
991 
992 	s = strdup(str);
993 	if (!s)
994 		return -ENOMEM;
995 
996 	p = strchr(s, ',');
997 	if (p)
998 		*p = '\0';
999 
1000 	if (*s) {
1001 		ret = __perf_evlist__parse_mmap_pages(&mmap_pages, s);
1002 		if (ret)
1003 			goto out_free;
1004 		opts->mmap_pages = mmap_pages;
1005 	}
1006 
1007 	if (!p) {
1008 		ret = 0;
1009 		goto out_free;
1010 	}
1011 
1012 	ret = __perf_evlist__parse_mmap_pages(&mmap_pages, p + 1);
1013 	if (ret)
1014 		goto out_free;
1015 
1016 	opts->auxtrace_mmap_pages = mmap_pages;
1017 
1018 out_free:
1019 	free(s);
1020 	return ret;
1021 }
1022 
/* Usage lines shown by -h/--help and on option errors. */
static const char * const __record_usage[] = {
	"perf record [<options>] [<command>]",
	"perf record [<options>] -- <command> [<options>]",
	NULL
};
const char * const *record_usage = __record_usage;
1029 
1030 /*
1031  * XXX Ideally would be local to cmd_record() and passed to a record__new
1032  * because we need to have access to it in record__exit, that is called
1033  * after cmd_record() exits, but since record_options need to be accessible to
1034  * builtin-script, leave it here.
1035  *
1036  * At least we don't ouch it in all the other functions here directly.
1037  *
1038  * Just say no to tons of global variables, sigh.
1039  */
/* Default recording state; options below are parsed into record.opts. */
static struct record record = {
	.opts = {
		.sample_time	     = true,
		/* NOTE(review): UINT_MAX/ULLONG_MAX appear to mean "unset,
		 * resolve later" — confirm against record_opts handling. */
		.mmap_pages	     = UINT_MAX,
		.user_freq	     = UINT_MAX,
		.user_interval	     = ULLONG_MAX,
		.freq		     = 4000,
		.target		     = {
			.uses_mmap   = true,
			.default_per_cpu = true,
		},
	},
	.tool = {
		.sample		= process_sample_event,
		.fork		= perf_event__process_fork,
		.comm		= perf_event__process_comm,
		.mmap		= perf_event__process_mmap,
		.mmap2		= perf_event__process_mmap2,
	},
};
1060 
#define CALLCHAIN_HELP "setup and enables call-graph (stack chain/backtrace) recording: "

/* DWARF unwinding is a build-time feature; only advertise it when built in. */
#ifdef HAVE_DWARF_UNWIND_SUPPORT
const char record_callchain_help[] = CALLCHAIN_HELP "fp dwarf lbr";
#else
const char record_callchain_help[] = CALLCHAIN_HELP "fp lbr";
#endif
1068 
1069 /*
1070  * XXX Will stay a global variable till we fix builtin-script.c to stop messing
1071  * with it and switch to use the library functions in perf_evlist that came
1072  * from builtin-record.c, i.e. use record_opts,
1073  * perf_evlist__prepare_workload, etc instead of fork+exec'in 'perf record',
1074  * using pipes, etc.
1075  */
1076 struct option __record_options[] = {
1077 	OPT_CALLBACK('e', "event", &record.evlist, "event",
1078 		     "event selector. use 'perf list' to list available events",
1079 		     parse_events_option),
1080 	OPT_CALLBACK(0, "filter", &record.evlist, "filter",
1081 		     "event filter", parse_filter),
1082 	OPT_STRING('p', "pid", &record.opts.target.pid, "pid",
1083 		    "record events on existing process id"),
1084 	OPT_STRING('t', "tid", &record.opts.target.tid, "tid",
1085 		    "record events on existing thread id"),
1086 	OPT_INTEGER('r', "realtime", &record.realtime_prio,
1087 		    "collect data with this RT SCHED_FIFO priority"),
1088 	OPT_BOOLEAN(0, "no-buffering", &record.opts.no_buffering,
1089 		    "collect data without buffering"),
1090 	OPT_BOOLEAN('R', "raw-samples", &record.opts.raw_samples,
1091 		    "collect raw sample records from all opened counters"),
1092 	OPT_BOOLEAN('a', "all-cpus", &record.opts.target.system_wide,
1093 			    "system-wide collection from all CPUs"),
1094 	OPT_STRING('C', "cpu", &record.opts.target.cpu_list, "cpu",
1095 		    "list of cpus to monitor"),
1096 	OPT_U64('c', "count", &record.opts.user_interval, "event period to sample"),
1097 	OPT_STRING('o', "output", &record.file.path, "file",
1098 		    "output file name"),
1099 	OPT_BOOLEAN_SET('i', "no-inherit", &record.opts.no_inherit,
1100 			&record.opts.no_inherit_set,
1101 			"child tasks do not inherit counters"),
1102 	OPT_UINTEGER('F', "freq", &record.opts.user_freq, "profile at this frequency"),
1103 	OPT_CALLBACK('m', "mmap-pages", &record.opts, "pages[,pages]",
1104 		     "number of mmap data pages and AUX area tracing mmap pages",
1105 		     record__parse_mmap_pages),
1106 	OPT_BOOLEAN(0, "group", &record.opts.group,
1107 		    "put the counters into a counter group"),
1108 	OPT_CALLBACK_NOOPT('g', NULL, &record.opts,
1109 			   NULL, "enables call-graph recording" ,
1110 			   &record_callchain_opt),
1111 	OPT_CALLBACK(0, "call-graph", &record.opts,
1112 		     "mode[,dump_size]", record_callchain_help,
1113 		     &record_parse_callchain_opt),
1114 	OPT_INCR('v', "verbose", &verbose,
1115 		    "be more verbose (show counter open errors, etc)"),
1116 	OPT_BOOLEAN('q', "quiet", &quiet, "don't print any message"),
1117 	OPT_BOOLEAN('s', "stat", &record.opts.inherit_stat,
1118 		    "per thread counts"),
1119 	OPT_BOOLEAN('d', "data", &record.opts.sample_address,
1120 		    "Sample addresses"),
1121 	OPT_BOOLEAN('T', "timestamp", &record.opts.sample_time, "Sample timestamps"),
1122 	OPT_BOOLEAN('P', "period", &record.opts.period, "Sample period"),
1123 	OPT_BOOLEAN('n', "no-samples", &record.opts.no_samples,
1124 		    "don't sample"),
1125 	OPT_BOOLEAN('N', "no-buildid-cache", &record.no_buildid_cache,
1126 		    "do not update the buildid cache"),
1127 	OPT_BOOLEAN('B', "no-buildid", &record.no_buildid,
1128 		    "do not collect buildids in perf.data"),
1129 	OPT_CALLBACK('G', "cgroup", &record.evlist, "name",
1130 		     "monitor event in cgroup name only",
1131 		     parse_cgroups),
1132 	OPT_UINTEGER('D', "delay", &record.opts.initial_delay,
1133 		  "ms to wait before starting measurement after program start"),
1134 	OPT_STRING('u', "uid", &record.opts.target.uid_str, "user",
1135 		   "user to profile"),
1136 
1137 	OPT_CALLBACK_NOOPT('b', "branch-any", &record.opts.branch_stack,
1138 		     "branch any", "sample any taken branches",
1139 		     parse_branch_stack),
1140 
1141 	OPT_CALLBACK('j', "branch-filter", &record.opts.branch_stack,
1142 		     "branch filter mask", "branch stack filter modes",
1143 		     parse_branch_stack),
1144 	OPT_BOOLEAN('W', "weight", &record.opts.sample_weight,
1145 		    "sample by weight (on special events only)"),
1146 	OPT_BOOLEAN(0, "transaction", &record.opts.sample_transaction,
1147 		    "sample transaction flags (special events only)"),
1148 	OPT_BOOLEAN(0, "per-thread", &record.opts.target.per_thread,
1149 		    "use per-thread mmaps"),
1150 	OPT_BOOLEAN('I', "intr-regs", &record.opts.sample_intr_regs,
1151 		    "Sample machine registers on interrupt"),
1152 	OPT_BOOLEAN(0, "running-time", &record.opts.running_time,
1153 		    "Record running/enabled time of read (:S) events"),
1154 	OPT_CALLBACK('k', "clockid", &record.opts,
1155 	"clockid", "clockid to use for events, see clock_gettime()",
1156 	parse_clockid),
1157 	OPT_STRING_OPTARG('S', "snapshot", &record.opts.auxtrace_snapshot_opts,
1158 			  "opts", "AUX area tracing Snapshot Mode", ""),
1159 	OPT_END()
1160 };
1161 
/* Public handle to the option table; __record_options is the file-local array. */
struct option *record_options = __record_options;
1163 
1164 int cmd_record(int argc, const char **argv, const char *prefix __maybe_unused)
1165 {
1166 	int err;
1167 	struct record *rec = &record;
1168 	char errbuf[BUFSIZ];
1169 
1170 	rec->evlist = perf_evlist__new();
1171 	if (rec->evlist == NULL)
1172 		return -ENOMEM;
1173 
1174 	perf_config(perf_record_config, rec);
1175 
1176 	argc = parse_options(argc, argv, record_options, record_usage,
1177 			    PARSE_OPT_STOP_AT_NON_OPTION);
1178 	if (!argc && target__none(&rec->opts.target))
1179 		usage_with_options(record_usage, record_options);
1180 
1181 	if (nr_cgroups && !rec->opts.target.system_wide) {
1182 		ui__error("cgroup monitoring only available in"
1183 			  " system-wide mode\n");
1184 		usage_with_options(record_usage, record_options);
1185 	}
1186 
1187 	if (!rec->itr) {
1188 		rec->itr = auxtrace_record__init(rec->evlist, &err);
1189 		if (err)
1190 			return err;
1191 	}
1192 
1193 	err = auxtrace_parse_snapshot_options(rec->itr, &rec->opts,
1194 					      rec->opts.auxtrace_snapshot_opts);
1195 	if (err)
1196 		return err;
1197 
1198 	err = -ENOMEM;
1199 
1200 	symbol__init(NULL);
1201 
1202 	if (symbol_conf.kptr_restrict)
1203 		pr_warning(
1204 "WARNING: Kernel address maps (/proc/{kallsyms,modules}) are restricted,\n"
1205 "check /proc/sys/kernel/kptr_restrict.\n\n"
1206 "Samples in kernel functions may not be resolved if a suitable vmlinux\n"
1207 "file is not found in the buildid cache or in the vmlinux path.\n\n"
1208 "Samples in kernel modules won't be resolved at all.\n\n"
1209 "If some relocation was applied (e.g. kexec) symbols may be misresolved\n"
1210 "even with a suitable vmlinux or kallsyms file.\n\n");
1211 
1212 	if (rec->no_buildid_cache || rec->no_buildid)
1213 		disable_buildid_cache();
1214 
1215 	if (rec->evlist->nr_entries == 0 &&
1216 	    perf_evlist__add_default(rec->evlist) < 0) {
1217 		pr_err("Not enough memory for event selector list\n");
1218 		goto out_symbol_exit;
1219 	}
1220 
1221 	if (rec->opts.target.tid && !rec->opts.no_inherit_set)
1222 		rec->opts.no_inherit = true;
1223 
1224 	err = target__validate(&rec->opts.target);
1225 	if (err) {
1226 		target__strerror(&rec->opts.target, err, errbuf, BUFSIZ);
1227 		ui__warning("%s", errbuf);
1228 	}
1229 
1230 	err = target__parse_uid(&rec->opts.target);
1231 	if (err) {
1232 		int saved_errno = errno;
1233 
1234 		target__strerror(&rec->opts.target, err, errbuf, BUFSIZ);
1235 		ui__error("%s", errbuf);
1236 
1237 		err = -saved_errno;
1238 		goto out_symbol_exit;
1239 	}
1240 
1241 	err = -ENOMEM;
1242 	if (perf_evlist__create_maps(rec->evlist, &rec->opts.target) < 0)
1243 		usage_with_options(record_usage, record_options);
1244 
1245 	err = auxtrace_record__options(rec->itr, rec->evlist, &rec->opts);
1246 	if (err)
1247 		goto out_symbol_exit;
1248 
1249 	if (record_opts__config(&rec->opts)) {
1250 		err = -EINVAL;
1251 		goto out_symbol_exit;
1252 	}
1253 
1254 	err = __cmd_record(&record, argc, argv);
1255 out_symbol_exit:
1256 	perf_evlist__delete(rec->evlist);
1257 	symbol__exit();
1258 	auxtrace_record__free(rec->itr);
1259 	return err;
1260 }
1261 
1262 static void snapshot_sig_handler(int sig __maybe_unused)
1263 {
1264 	if (!auxtrace_snapshot_enabled)
1265 		return;
1266 	auxtrace_snapshot_enabled = 0;
1267 	auxtrace_snapshot_err = auxtrace_record__snapshot_start(record.itr);
1268 	auxtrace_record__snapshot_started = 1;
1269 }
1270