xref: /openbmc/linux/tools/perf/builtin-record.c (revision 206e8c00752fbe9cc463184236ac64b2a532cda5)
1 /*
2  * builtin-record.c
3  *
4  * Builtin record command: Record the profile of a workload
5  * (or a CPU, or a PID) into the perf.data output file - for
6  * later analysis via perf report.
7  */
8 #include "builtin.h"
9 
10 #include "perf.h"
11 
12 #include "util/build-id.h"
13 #include "util/util.h"
14 #include "util/parse-options.h"
15 #include "util/parse-events.h"
16 
17 #include "util/callchain.h"
18 #include "util/cgroup.h"
19 #include "util/header.h"
20 #include "util/event.h"
21 #include "util/evlist.h"
22 #include "util/evsel.h"
23 #include "util/debug.h"
24 #include "util/session.h"
25 #include "util/tool.h"
26 #include "util/symbol.h"
27 #include "util/cpumap.h"
28 #include "util/thread_map.h"
29 #include "util/data.h"
30 #include "util/perf_regs.h"
31 #include "util/auxtrace.h"
32 #include "util/parse-branch-options.h"
33 #include "util/parse-regs-options.h"
34 
35 #include <unistd.h>
36 #include <sched.h>
37 #include <sys/mman.h>
38 
39 
40 struct record {
41 	struct perf_tool	tool;
42 	struct record_opts	opts;
43 	u64			bytes_written;
44 	struct perf_data_file	file;
45 	struct auxtrace_record	*itr;
46 	struct perf_evlist	*evlist;
47 	struct perf_session	*session;
48 	const char		*progname;
49 	int			realtime_prio;
50 	bool			no_buildid;
51 	bool			no_buildid_cache;
52 	long			samples;
53 };
54 
55 static int record__write(struct record *rec, void *bf, size_t size)
56 {
57 	if (perf_data_file__write(rec->session->file, bf, size) < 0) {
58 		pr_err("failed to write perf data, error: %m\n");
59 		return -1;
60 	}
61 
62 	rec->bytes_written += size;
63 	return 0;
64 }
65 
66 static int process_synthesized_event(struct perf_tool *tool,
67 				     union perf_event *event,
68 				     struct perf_sample *sample __maybe_unused,
69 				     struct machine *machine __maybe_unused)
70 {
71 	struct record *rec = container_of(tool, struct record, tool);
72 	return record__write(rec, event, event->header.size);
73 }
74 
75 static int record__mmap_read(struct record *rec, int idx)
76 {
77 	struct perf_mmap *md = &rec->evlist->mmap[idx];
78 	u64 head = perf_mmap__read_head(md);
79 	u64 old = md->prev;
80 	unsigned char *data = md->base + page_size;
81 	unsigned long size;
82 	void *buf;
83 	int rc = 0;
84 
85 	if (old == head)
86 		return 0;
87 
88 	rec->samples++;
89 
90 	size = head - old;
91 
92 	if ((old & md->mask) + size != (head & md->mask)) {
93 		buf = &data[old & md->mask];
94 		size = md->mask + 1 - (old & md->mask);
95 		old += size;
96 
97 		if (record__write(rec, buf, size) < 0) {
98 			rc = -1;
99 			goto out;
100 		}
101 	}
102 
103 	buf = &data[old & md->mask];
104 	size = head - old;
105 	old += size;
106 
107 	if (record__write(rec, buf, size) < 0) {
108 		rc = -1;
109 		goto out;
110 	}
111 
112 	md->prev = old;
113 	perf_evlist__mmap_consume(rec->evlist, idx);
114 out:
115 	return rc;
116 }
117 
/*
 * Flags shared between the signal handlers and the main recording loop.
 */
/* Set by the signal handlers in this file; ends the loop in __cmd_record(). */
static volatile int done;
/*
 * Signal to re-raise from record__sig_exit(); -1 means "none".  Written by
 * sig_handler() for anything but SIGCHLD, and by __cmd_record() when the
 * workload was killed by a signal.
 */
static volatile int signr = -1;
/* Set once the workload is known to be gone, so __cmd_record() won't kill() it. */
static volatile int child_finished;
/*
 * AUX area snapshot state.  These are written by __cmd_record() and
 * record__read_auxtrace_snapshot(); presumably snapshot_sig_handler() (not
 * visible here) is the other party - confirm against its definition.
 */
static volatile int auxtrace_snapshot_enabled;
static volatile int auxtrace_snapshot_err;
static volatile int auxtrace_record__snapshot_started;
124 
125 static void sig_handler(int sig)
126 {
127 	if (sig == SIGCHLD)
128 		child_finished = 1;
129 	else
130 		signr = sig;
131 
132 	done = 1;
133 }
134 
135 static void record__sig_exit(void)
136 {
137 	if (signr == -1)
138 		return;
139 
140 	signal(signr, SIG_DFL);
141 	raise(signr);
142 }
143 
144 #ifdef HAVE_AUXTRACE_SUPPORT
145 
146 static int record__process_auxtrace(struct perf_tool *tool,
147 				    union perf_event *event, void *data1,
148 				    size_t len1, void *data2, size_t len2)
149 {
150 	struct record *rec = container_of(tool, struct record, tool);
151 	struct perf_data_file *file = &rec->file;
152 	size_t padding;
153 	u8 pad[8] = {0};
154 
155 	if (!perf_data_file__is_pipe(file)) {
156 		off_t file_offset;
157 		int fd = perf_data_file__fd(file);
158 		int err;
159 
160 		file_offset = lseek(fd, 0, SEEK_CUR);
161 		if (file_offset == -1)
162 			return -1;
163 		err = auxtrace_index__auxtrace_event(&rec->session->auxtrace_index,
164 						     event, file_offset);
165 		if (err)
166 			return err;
167 	}
168 
169 	/* event.auxtrace.size includes padding, see __auxtrace_mmap__read() */
170 	padding = (len1 + len2) & 7;
171 	if (padding)
172 		padding = 8 - padding;
173 
174 	record__write(rec, event, event->header.size);
175 	record__write(rec, data1, len1);
176 	if (len2)
177 		record__write(rec, data2, len2);
178 	record__write(rec, &pad, padding);
179 
180 	return 0;
181 }
182 
183 static int record__auxtrace_mmap_read(struct record *rec,
184 				      struct auxtrace_mmap *mm)
185 {
186 	int ret;
187 
188 	ret = auxtrace_mmap__read(mm, rec->itr, &rec->tool,
189 				  record__process_auxtrace);
190 	if (ret < 0)
191 		return ret;
192 
193 	if (ret)
194 		rec->samples++;
195 
196 	return 0;
197 }
198 
199 static int record__auxtrace_mmap_read_snapshot(struct record *rec,
200 					       struct auxtrace_mmap *mm)
201 {
202 	int ret;
203 
204 	ret = auxtrace_mmap__read_snapshot(mm, rec->itr, &rec->tool,
205 					   record__process_auxtrace,
206 					   rec->opts.auxtrace_snapshot_size);
207 	if (ret < 0)
208 		return ret;
209 
210 	if (ret)
211 		rec->samples++;
212 
213 	return 0;
214 }
215 
216 static int record__auxtrace_read_snapshot_all(struct record *rec)
217 {
218 	int i;
219 	int rc = 0;
220 
221 	for (i = 0; i < rec->evlist->nr_mmaps; i++) {
222 		struct auxtrace_mmap *mm =
223 				&rec->evlist->mmap[i].auxtrace_mmap;
224 
225 		if (!mm->base)
226 			continue;
227 
228 		if (record__auxtrace_mmap_read_snapshot(rec, mm) != 0) {
229 			rc = -1;
230 			goto out;
231 		}
232 	}
233 out:
234 	return rc;
235 }
236 
237 static void record__read_auxtrace_snapshot(struct record *rec)
238 {
239 	pr_debug("Recording AUX area tracing snapshot\n");
240 	if (record__auxtrace_read_snapshot_all(rec) < 0) {
241 		auxtrace_snapshot_err = -1;
242 	} else {
243 		auxtrace_snapshot_err = auxtrace_record__snapshot_finish(rec->itr);
244 		if (!auxtrace_snapshot_err)
245 			auxtrace_snapshot_enabled = 1;
246 	}
247 }
248 
249 #else
250 
/*
 * Stub used when HAVE_AUXTRACE_SUPPORT is not defined: there is no AUX area
 * data to read, so this always reports success.
 */
static inline
int record__auxtrace_mmap_read(struct record *rec __maybe_unused,
			       struct auxtrace_mmap *mm __maybe_unused)
{
	return 0;
}
257 
/* Stub used when HAVE_AUXTRACE_SUPPORT is not defined: does nothing. */
static inline
void record__read_auxtrace_snapshot(struct record *rec __maybe_unused)
{
}
262 
/*
 * Stub used when HAVE_AUXTRACE_SUPPORT is not defined: always returns 0.
 */
static inline
int auxtrace_record__snapshot_start(struct auxtrace_record *itr __maybe_unused)
{
	return 0;
}
268 
269 #endif
270 
271 static int record__open(struct record *rec)
272 {
273 	char msg[512];
274 	struct perf_evsel *pos;
275 	struct perf_evlist *evlist = rec->evlist;
276 	struct perf_session *session = rec->session;
277 	struct record_opts *opts = &rec->opts;
278 	int rc = 0;
279 
280 	perf_evlist__config(evlist, opts);
281 
282 	evlist__for_each(evlist, pos) {
283 try_again:
284 		if (perf_evsel__open(pos, pos->cpus, pos->threads) < 0) {
285 			if (perf_evsel__fallback(pos, errno, msg, sizeof(msg))) {
286 				if (verbose)
287 					ui__warning("%s\n", msg);
288 				goto try_again;
289 			}
290 
291 			rc = -errno;
292 			perf_evsel__open_strerror(pos, &opts->target,
293 						  errno, msg, sizeof(msg));
294 			ui__error("%s\n", msg);
295 			goto out;
296 		}
297 	}
298 
299 	if (perf_evlist__apply_filters(evlist, &pos)) {
300 		error("failed to set filter \"%s\" on event %s with %d (%s)\n",
301 			pos->filter, perf_evsel__name(pos), errno,
302 			strerror_r(errno, msg, sizeof(msg)));
303 		rc = -1;
304 		goto out;
305 	}
306 
307 	if (perf_evlist__mmap_ex(evlist, opts->mmap_pages, false,
308 				 opts->auxtrace_mmap_pages,
309 				 opts->auxtrace_snapshot_mode) < 0) {
310 		if (errno == EPERM) {
311 			pr_err("Permission error mapping pages.\n"
312 			       "Consider increasing "
313 			       "/proc/sys/kernel/perf_event_mlock_kb,\n"
314 			       "or try again with a smaller value of -m/--mmap_pages.\n"
315 			       "(current value: %u,%u)\n",
316 			       opts->mmap_pages, opts->auxtrace_mmap_pages);
317 			rc = -errno;
318 		} else {
319 			pr_err("failed to mmap with %d (%s)\n", errno,
320 				strerror_r(errno, msg, sizeof(msg)));
321 			rc = -errno;
322 		}
323 		goto out;
324 	}
325 
326 	session->evlist = evlist;
327 	perf_session__set_id_hdr_size(session);
328 out:
329 	return rc;
330 }
331 
332 static int process_sample_event(struct perf_tool *tool,
333 				union perf_event *event,
334 				struct perf_sample *sample,
335 				struct perf_evsel *evsel,
336 				struct machine *machine)
337 {
338 	struct record *rec = container_of(tool, struct record, tool);
339 
340 	rec->samples++;
341 
342 	return build_id__mark_dso_hit(tool, event, sample, evsel, machine);
343 }
344 
345 static int process_buildids(struct record *rec)
346 {
347 	struct perf_data_file *file  = &rec->file;
348 	struct perf_session *session = rec->session;
349 
350 	if (file->size == 0)
351 		return 0;
352 
353 	/*
354 	 * During this process, it'll load kernel map and replace the
355 	 * dso->long_name to a real pathname it found.  In this case
356 	 * we prefer the vmlinux path like
357 	 *   /lib/modules/3.16.4/build/vmlinux
358 	 *
359 	 * rather than build-id path (in debug directory).
360 	 *   $HOME/.debug/.build-id/f0/6e17aa50adf4d00b88925e03775de107611551
361 	 */
362 	symbol_conf.ignore_vmlinux_buildid = true;
363 
364 	return perf_session__process_events(session);
365 }
366 
367 static void perf_event__synthesize_guest_os(struct machine *machine, void *data)
368 {
369 	int err;
370 	struct perf_tool *tool = data;
371 	/*
372 	 *As for guest kernel when processing subcommand record&report,
373 	 *we arrange module mmap prior to guest kernel mmap and trigger
374 	 *a preload dso because default guest module symbols are loaded
375 	 *from guest kallsyms instead of /lib/modules/XXX/XXX. This
376 	 *method is used to avoid symbol missing when the first addr is
377 	 *in module instead of in guest kernel.
378 	 */
379 	err = perf_event__synthesize_modules(tool, process_synthesized_event,
380 					     machine);
381 	if (err < 0)
382 		pr_err("Couldn't record guest kernel [%d]'s reference"
383 		       " relocation symbol.\n", machine->pid);
384 
385 	/*
386 	 * We use _stext for guest kernel because guest kernel's /proc/kallsyms
387 	 * have no _text sometimes.
388 	 */
389 	err = perf_event__synthesize_kernel_mmap(tool, process_synthesized_event,
390 						 machine);
391 	if (err < 0)
392 		pr_err("Couldn't record guest kernel [%d]'s reference"
393 		       " relocation symbol.\n", machine->pid);
394 }
395 
/*
 * Header-only PERF_RECORD_FINISHED_ROUND event.  record__mmap_read_all()
 * writes it after any pass over the ring buffers that produced output.
 */
static struct perf_event_header finished_round_event = {
	.size = sizeof(struct perf_event_header),
	.type = PERF_RECORD_FINISHED_ROUND,
};
400 
401 static int record__mmap_read_all(struct record *rec)
402 {
403 	u64 bytes_written = rec->bytes_written;
404 	int i;
405 	int rc = 0;
406 
407 	for (i = 0; i < rec->evlist->nr_mmaps; i++) {
408 		struct auxtrace_mmap *mm = &rec->evlist->mmap[i].auxtrace_mmap;
409 
410 		if (rec->evlist->mmap[i].base) {
411 			if (record__mmap_read(rec, i) != 0) {
412 				rc = -1;
413 				goto out;
414 			}
415 		}
416 
417 		if (mm->base && !rec->opts.auxtrace_snapshot_mode &&
418 		    record__auxtrace_mmap_read(rec, mm) != 0) {
419 			rc = -1;
420 			goto out;
421 		}
422 	}
423 
424 	/*
425 	 * Mark the round finished in case we wrote
426 	 * at least one event.
427 	 */
428 	if (bytes_written != rec->bytes_written)
429 		rc = record__write(rec, &finished_round_event, sizeof(finished_round_event));
430 
431 out:
432 	return rc;
433 }
434 
435 static void record__init_features(struct record *rec)
436 {
437 	struct perf_session *session = rec->session;
438 	int feat;
439 
440 	for (feat = HEADER_FIRST_FEATURE; feat < HEADER_LAST_FEATURE; feat++)
441 		perf_header__set_feat(&session->header, feat);
442 
443 	if (rec->no_buildid)
444 		perf_header__clear_feat(&session->header, HEADER_BUILD_ID);
445 
446 	if (!have_tracepoints(&rec->evlist->entries))
447 		perf_header__clear_feat(&session->header, HEADER_TRACING_DATA);
448 
449 	if (!rec->opts.branch_stack)
450 		perf_header__clear_feat(&session->header, HEADER_BRANCH_STACK);
451 
452 	if (!rec->opts.full_auxtrace)
453 		perf_header__clear_feat(&session->header, HEADER_AUXTRACE);
454 }
455 
/* Error code delivered with the workload-failure signal; 0 if none arrived. */
static volatile int workload_exec_errno;

/*
 * perf_evlist__prepare_workload will send a SIGUSR1
 * if the fork fails, since we asked by setting its
 * want_signal to true.
 *
 * NOTE(review): the handler and variable names suggest this reports a failed
 * exec of the workload rather than a failed fork - confirm against
 * perf_evlist__prepare_workload().
 *
 * The error code arrives in the signal's si_value.  Besides storing it, the
 * handler sets 'done' and 'child_finished', which makes __cmd_record() leave
 * its loop and skip the kill() of the workload.
 */
static void workload_exec_failed_signal(int signo __maybe_unused,
					siginfo_t *info,
					void *ucontext __maybe_unused)
{
	workload_exec_errno = info->si_value.sival_int;
	done = 1;
	child_finished = 1;
}
471 
472 static void snapshot_sig_handler(int sig);
473 
474 static int __cmd_record(struct record *rec, int argc, const char **argv)
475 {
476 	int err;
477 	int status = 0;
478 	unsigned long waking = 0;
479 	const bool forks = argc > 0;
480 	struct machine *machine;
481 	struct perf_tool *tool = &rec->tool;
482 	struct record_opts *opts = &rec->opts;
483 	struct perf_data_file *file = &rec->file;
484 	struct perf_session *session;
485 	bool disabled = false, draining = false;
486 	int fd;
487 
488 	rec->progname = argv[0];
489 
490 	atexit(record__sig_exit);
491 	signal(SIGCHLD, sig_handler);
492 	signal(SIGINT, sig_handler);
493 	signal(SIGTERM, sig_handler);
494 	if (rec->opts.auxtrace_snapshot_mode)
495 		signal(SIGUSR2, snapshot_sig_handler);
496 	else
497 		signal(SIGUSR2, SIG_IGN);
498 
499 	session = perf_session__new(file, false, tool);
500 	if (session == NULL) {
501 		pr_err("Perf session creation failed.\n");
502 		return -1;
503 	}
504 
505 	fd = perf_data_file__fd(file);
506 	rec->session = session;
507 
508 	record__init_features(rec);
509 
510 	if (forks) {
511 		err = perf_evlist__prepare_workload(rec->evlist, &opts->target,
512 						    argv, file->is_pipe,
513 						    workload_exec_failed_signal);
514 		if (err < 0) {
515 			pr_err("Couldn't run the workload!\n");
516 			status = err;
517 			goto out_delete_session;
518 		}
519 	}
520 
521 	if (record__open(rec) != 0) {
522 		err = -1;
523 		goto out_child;
524 	}
525 
526 	/*
527 	 * Normally perf_session__new would do this, but it doesn't have the
528 	 * evlist.
529 	 */
530 	if (rec->tool.ordered_events && !perf_evlist__sample_id_all(rec->evlist)) {
531 		pr_warning("WARNING: No sample_id_all support, falling back to unordered processing\n");
532 		rec->tool.ordered_events = false;
533 	}
534 
535 	if (!rec->evlist->nr_groups)
536 		perf_header__clear_feat(&session->header, HEADER_GROUP_DESC);
537 
538 	if (file->is_pipe) {
539 		err = perf_header__write_pipe(fd);
540 		if (err < 0)
541 			goto out_child;
542 	} else {
543 		err = perf_session__write_header(session, rec->evlist, fd, false);
544 		if (err < 0)
545 			goto out_child;
546 	}
547 
548 	if (!rec->no_buildid
549 	    && !perf_header__has_feat(&session->header, HEADER_BUILD_ID)) {
550 		pr_err("Couldn't generate buildids. "
551 		       "Use --no-buildid to profile anyway.\n");
552 		err = -1;
553 		goto out_child;
554 	}
555 
556 	machine = &session->machines.host;
557 
558 	if (file->is_pipe) {
559 		err = perf_event__synthesize_attrs(tool, session,
560 						   process_synthesized_event);
561 		if (err < 0) {
562 			pr_err("Couldn't synthesize attrs.\n");
563 			goto out_child;
564 		}
565 
566 		if (have_tracepoints(&rec->evlist->entries)) {
567 			/*
568 			 * FIXME err <= 0 here actually means that
569 			 * there were no tracepoints so its not really
570 			 * an error, just that we don't need to
571 			 * synthesize anything.  We really have to
572 			 * return this more properly and also
573 			 * propagate errors that now are calling die()
574 			 */
575 			err = perf_event__synthesize_tracing_data(tool,	fd, rec->evlist,
576 								  process_synthesized_event);
577 			if (err <= 0) {
578 				pr_err("Couldn't record tracing data.\n");
579 				goto out_child;
580 			}
581 			rec->bytes_written += err;
582 		}
583 	}
584 
585 	if (rec->opts.full_auxtrace) {
586 		err = perf_event__synthesize_auxtrace_info(rec->itr, tool,
587 					session, process_synthesized_event);
588 		if (err)
589 			goto out_delete_session;
590 	}
591 
592 	err = perf_event__synthesize_kernel_mmap(tool, process_synthesized_event,
593 						 machine);
594 	if (err < 0)
595 		pr_err("Couldn't record kernel reference relocation symbol\n"
596 		       "Symbol resolution may be skewed if relocation was used (e.g. kexec).\n"
597 		       "Check /proc/kallsyms permission or run as root.\n");
598 
599 	err = perf_event__synthesize_modules(tool, process_synthesized_event,
600 					     machine);
601 	if (err < 0)
602 		pr_err("Couldn't record kernel module information.\n"
603 		       "Symbol resolution may be skewed if relocation was used (e.g. kexec).\n"
604 		       "Check /proc/modules permission or run as root.\n");
605 
606 	if (perf_guest) {
607 		machines__process_guests(&session->machines,
608 					 perf_event__synthesize_guest_os, tool);
609 	}
610 
611 	err = __machine__synthesize_threads(machine, tool, &opts->target, rec->evlist->threads,
612 					    process_synthesized_event, opts->sample_address,
613 					    opts->proc_map_timeout);
614 	if (err != 0)
615 		goto out_child;
616 
617 	if (rec->realtime_prio) {
618 		struct sched_param param;
619 
620 		param.sched_priority = rec->realtime_prio;
621 		if (sched_setscheduler(0, SCHED_FIFO, &param)) {
622 			pr_err("Could not set realtime priority.\n");
623 			err = -1;
624 			goto out_child;
625 		}
626 	}
627 
628 	/*
629 	 * When perf is starting the traced process, all the events
630 	 * (apart from group members) have enable_on_exec=1 set,
631 	 * so don't spoil it by prematurely enabling them.
632 	 */
633 	if (!target__none(&opts->target) && !opts->initial_delay)
634 		perf_evlist__enable(rec->evlist);
635 
636 	/*
637 	 * Let the child rip
638 	 */
639 	if (forks)
640 		perf_evlist__start_workload(rec->evlist);
641 
642 	if (opts->initial_delay) {
643 		usleep(opts->initial_delay * 1000);
644 		perf_evlist__enable(rec->evlist);
645 	}
646 
647 	auxtrace_snapshot_enabled = 1;
648 	for (;;) {
649 		int hits = rec->samples;
650 
651 		if (record__mmap_read_all(rec) < 0) {
652 			auxtrace_snapshot_enabled = 0;
653 			err = -1;
654 			goto out_child;
655 		}
656 
657 		if (auxtrace_record__snapshot_started) {
658 			auxtrace_record__snapshot_started = 0;
659 			if (!auxtrace_snapshot_err)
660 				record__read_auxtrace_snapshot(rec);
661 			if (auxtrace_snapshot_err) {
662 				pr_err("AUX area tracing snapshot failed\n");
663 				err = -1;
664 				goto out_child;
665 			}
666 		}
667 
668 		if (hits == rec->samples) {
669 			if (done || draining)
670 				break;
671 			err = perf_evlist__poll(rec->evlist, -1);
672 			/*
673 			 * Propagate error, only if there's any. Ignore positive
674 			 * number of returned events and interrupt error.
675 			 */
676 			if (err > 0 || (err < 0 && errno == EINTR))
677 				err = 0;
678 			waking++;
679 
680 			if (perf_evlist__filter_pollfd(rec->evlist, POLLERR | POLLHUP) == 0)
681 				draining = true;
682 		}
683 
684 		/*
685 		 * When perf is starting the traced process, at the end events
686 		 * die with the process and we wait for that. Thus no need to
687 		 * disable events in this case.
688 		 */
689 		if (done && !disabled && !target__none(&opts->target)) {
690 			auxtrace_snapshot_enabled = 0;
691 			perf_evlist__disable(rec->evlist);
692 			disabled = true;
693 		}
694 	}
695 	auxtrace_snapshot_enabled = 0;
696 
697 	if (forks && workload_exec_errno) {
698 		char msg[STRERR_BUFSIZE];
699 		const char *emsg = strerror_r(workload_exec_errno, msg, sizeof(msg));
700 		pr_err("Workload failed: %s\n", emsg);
701 		err = -1;
702 		goto out_child;
703 	}
704 
705 	if (!quiet)
706 		fprintf(stderr, "[ perf record: Woken up %ld times to write data ]\n", waking);
707 
708 out_child:
709 	if (forks) {
710 		int exit_status;
711 
712 		if (!child_finished)
713 			kill(rec->evlist->workload.pid, SIGTERM);
714 
715 		wait(&exit_status);
716 
717 		if (err < 0)
718 			status = err;
719 		else if (WIFEXITED(exit_status))
720 			status = WEXITSTATUS(exit_status);
721 		else if (WIFSIGNALED(exit_status))
722 			signr = WTERMSIG(exit_status);
723 	} else
724 		status = err;
725 
726 	/* this will be recalculated during process_buildids() */
727 	rec->samples = 0;
728 
729 	if (!err && !file->is_pipe) {
730 		rec->session->header.data_size += rec->bytes_written;
731 		file->size = lseek(perf_data_file__fd(file), 0, SEEK_CUR);
732 
733 		if (!rec->no_buildid) {
734 			process_buildids(rec);
735 			/*
736 			 * We take all buildids when the file contains
737 			 * AUX area tracing data because we do not decode the
738 			 * trace because it would take too long.
739 			 */
740 			if (rec->opts.full_auxtrace)
741 				dsos__hit_all(rec->session);
742 		}
743 		perf_session__write_header(rec->session, rec->evlist, fd, true);
744 	}
745 
746 	if (!err && !quiet) {
747 		char samples[128];
748 
749 		if (rec->samples && !rec->opts.full_auxtrace)
750 			scnprintf(samples, sizeof(samples),
751 				  " (%" PRIu64 " samples)", rec->samples);
752 		else
753 			samples[0] = '\0';
754 
755 		fprintf(stderr,	"[ perf record: Captured and wrote %.3f MB %s%s ]\n",
756 			perf_data_file__size(file) / 1024.0 / 1024.0,
757 			file->path, samples);
758 	}
759 
760 out_delete_session:
761 	perf_session__delete(session);
762 	return status;
763 }
764 
765 static void callchain_debug(void)
766 {
767 	static const char *str[CALLCHAIN_MAX] = { "NONE", "FP", "DWARF", "LBR" };
768 
769 	pr_debug("callchain: type %s\n", str[callchain_param.record_mode]);
770 
771 	if (callchain_param.record_mode == CALLCHAIN_DWARF)
772 		pr_debug("callchain: stack dump size %d\n",
773 			 callchain_param.dump_size);
774 }
775 
776 int record_parse_callchain_opt(const struct option *opt,
777 			       const char *arg,
778 			       int unset)
779 {
780 	int ret;
781 	struct record_opts *record = (struct record_opts *)opt->value;
782 
783 	record->callgraph_set = true;
784 	callchain_param.enabled = !unset;
785 
786 	/* --no-call-graph */
787 	if (unset) {
788 		callchain_param.record_mode = CALLCHAIN_NONE;
789 		pr_debug("callchain: disabled\n");
790 		return 0;
791 	}
792 
793 	ret = parse_callchain_record_opt(arg, &callchain_param);
794 	if (!ret)
795 		callchain_debug();
796 
797 	return ret;
798 }
799 
800 int record_callchain_opt(const struct option *opt,
801 			 const char *arg __maybe_unused,
802 			 int unset __maybe_unused)
803 {
804 	struct record_opts *record = (struct record_opts *)opt->value;
805 
806 	record->callgraph_set = true;
807 	callchain_param.enabled = true;
808 
809 	if (callchain_param.record_mode == CALLCHAIN_NONE)
810 		callchain_param.record_mode = CALLCHAIN_FP;
811 
812 	callchain_debug();
813 	return 0;
814 }
815 
/*
 * Config callback for 'perf record': the key "record.call-graph" is handled
 * as "call-graph.record-mode"; everything is then passed on to
 * perf_default_config().
 */
static int perf_record_config(const char *var, const char *value, void *cb)
{
	const char *key = var;

	if (strcmp(var, "record.call-graph") == 0)
		key = "call-graph.record-mode";

	return perf_default_config(key, value, cb);
}
823 
/* One accepted spelling of a clock name and the clockid it stands for. */
struct clockid_map {
	const char *name;
	int clockid;
};

/* Build a table entry mapping name @n to clockid @c. */
#define CLOCKID_MAP(n, c)	\
	{ .name = n, .clockid = (c), }

/* Table terminator: the lookup loop in parse_clockid() stops at a NULL name. */
#define CLOCKID_END	{ .name = NULL, }
833 
834 
835 /*
836  * Add the missing ones, we need to build on many distros...
837  */
838 #ifndef CLOCK_MONOTONIC_RAW
839 #define CLOCK_MONOTONIC_RAW 4
840 #endif
841 #ifndef CLOCK_BOOTTIME
842 #define CLOCK_BOOTTIME 7
843 #endif
844 #ifndef CLOCK_TAI
845 #define CLOCK_TAI 11
846 #endif
847 
/*
 * Clock names understood by parse_clockid().  Names are matched case
 * insensitively, with an optional "CLOCK_" prefix stripped by the parser.
 */
static const struct clockid_map clockids[] = {
	/* available for all events, NMI safe */
	CLOCKID_MAP("monotonic", CLOCK_MONOTONIC),
	CLOCKID_MAP("monotonic_raw", CLOCK_MONOTONIC_RAW),

	/* available for some events */
	CLOCKID_MAP("realtime", CLOCK_REALTIME),
	CLOCKID_MAP("boottime", CLOCK_BOOTTIME),
	CLOCKID_MAP("tai", CLOCK_TAI),

	/* available for the lazy */
	CLOCKID_MAP("mono", CLOCK_MONOTONIC),
	CLOCKID_MAP("raw", CLOCK_MONOTONIC_RAW),
	CLOCKID_MAP("real", CLOCK_REALTIME),
	CLOCKID_MAP("boot", CLOCK_BOOTTIME),

	CLOCKID_END,
};
866 
867 static int parse_clockid(const struct option *opt, const char *str, int unset)
868 {
869 	struct record_opts *opts = (struct record_opts *)opt->value;
870 	const struct clockid_map *cm;
871 	const char *ostr = str;
872 
873 	if (unset) {
874 		opts->use_clockid = 0;
875 		return 0;
876 	}
877 
878 	/* no arg passed */
879 	if (!str)
880 		return 0;
881 
882 	/* no setting it twice */
883 	if (opts->use_clockid)
884 		return -1;
885 
886 	opts->use_clockid = true;
887 
888 	/* if its a number, we're done */
889 	if (sscanf(str, "%d", &opts->clockid) == 1)
890 		return 0;
891 
892 	/* allow a "CLOCK_" prefix to the name */
893 	if (!strncasecmp(str, "CLOCK_", 6))
894 		str += 6;
895 
896 	for (cm = clockids; cm->name; cm++) {
897 		if (!strcasecmp(str, cm->name)) {
898 			opts->clockid = cm->clockid;
899 			return 0;
900 		}
901 	}
902 
903 	opts->use_clockid = false;
904 	ui__warning("unknown clockid %s, check man page\n", ostr);
905 	return -1;
906 }
907 
908 static int record__parse_mmap_pages(const struct option *opt,
909 				    const char *str,
910 				    int unset __maybe_unused)
911 {
912 	struct record_opts *opts = opt->value;
913 	char *s, *p;
914 	unsigned int mmap_pages;
915 	int ret;
916 
917 	if (!str)
918 		return -EINVAL;
919 
920 	s = strdup(str);
921 	if (!s)
922 		return -ENOMEM;
923 
924 	p = strchr(s, ',');
925 	if (p)
926 		*p = '\0';
927 
928 	if (*s) {
929 		ret = __perf_evlist__parse_mmap_pages(&mmap_pages, s);
930 		if (ret)
931 			goto out_free;
932 		opts->mmap_pages = mmap_pages;
933 	}
934 
935 	if (!p) {
936 		ret = 0;
937 		goto out_free;
938 	}
939 
940 	ret = __perf_evlist__parse_mmap_pages(&mmap_pages, p + 1);
941 	if (ret)
942 		goto out_free;
943 
944 	opts->auxtrace_mmap_pages = mmap_pages;
945 
946 out_free:
947 	free(s);
948 	return ret;
949 }
950 
/* Usage lines of 'perf record'; the list is NULL terminated. */
static const char * const __record_usage[] = {
	"perf record [<options>] [<command>]",
	"perf record [<options>] -- <command> [<options>]",
	NULL
};
/* Exported (non-static) alias of the usage lines above. */
const char * const *record_usage = __record_usage;
957 
958 /*
959  * XXX Ideally would be local to cmd_record() and passed to a record__new
960  * because we need to have access to it in record__exit, that is called
961  * after cmd_record() exits, but since record_options need to be accessible to
962  * builtin-script, leave it here.
963  *
 * At least we don't touch it in all the other functions here directly.
965  *
966  * Just say no to tons of global variables, sigh.
967  */
/*
 * The single recorder instance with its default options and the tool
 * callbacks used when events are processed.
 */
static struct record record = {
	.opts = {
		.sample_time	     = true,
		/*
		 * NOTE(review): the *_MAX values below look like "not set by
		 * the user" markers - confirm where they are consumed.
		 */
		.mmap_pages	     = UINT_MAX,
		.user_freq	     = UINT_MAX,
		.user_interval	     = ULLONG_MAX,
		.freq		     = 4000,
		.target		     = {
			.uses_mmap   = true,
			.default_per_cpu = true,
		},
		.proc_map_timeout     = 500,
	},
	.tool = {
		/* process_sample_event() is defined in this file. */
		.sample		= process_sample_event,
		.fork		= perf_event__process_fork,
		.exit		= perf_event__process_exit,
		.comm		= perf_event__process_comm,
		.mmap		= perf_event__process_mmap,
		.mmap2		= perf_event__process_mmap2,
		/* __cmd_record() clears this without sample_id_all support. */
		.ordered_events	= true,
	},
};
991 
/* Common prefix of the --call-graph help text; the mode list is appended. */
#define CALLCHAIN_HELP "setup and enables call-graph (stack chain/backtrace) recording: "

/* "dwarf" is only listed when HAVE_DWARF_UNWIND_SUPPORT is defined. */
#ifdef HAVE_DWARF_UNWIND_SUPPORT
const char record_callchain_help[] = CALLCHAIN_HELP "fp dwarf lbr";
#else
const char record_callchain_help[] = CALLCHAIN_HELP "fp lbr";
#endif
999 
1000 /*
1001  * XXX Will stay a global variable till we fix builtin-script.c to stop messing
1002  * with it and switch to use the library functions in perf_evlist that came
1003  * from builtin-record.c, i.e. use record_opts,
1004  * perf_evlist__prepare_workload, etc instead of fork+exec'in 'perf record',
1005  * using pipes, etc.
1006  */
1007 struct option __record_options[] = {
1008 	OPT_CALLBACK('e', "event", &record.evlist, "event",
1009 		     "event selector. use 'perf list' to list available events",
1010 		     parse_events_option),
1011 	OPT_CALLBACK(0, "filter", &record.evlist, "filter",
1012 		     "event filter", parse_filter),
1013 	OPT_CALLBACK_NOOPT(0, "exclude-perf", &record.evlist,
1014 			   NULL, "don't record events from perf itself",
1015 			   exclude_perf),
1016 	OPT_STRING('p', "pid", &record.opts.target.pid, "pid",
1017 		    "record events on existing process id"),
1018 	OPT_STRING('t', "tid", &record.opts.target.tid, "tid",
1019 		    "record events on existing thread id"),
1020 	OPT_INTEGER('r', "realtime", &record.realtime_prio,
1021 		    "collect data with this RT SCHED_FIFO priority"),
1022 	OPT_BOOLEAN(0, "no-buffering", &record.opts.no_buffering,
1023 		    "collect data without buffering"),
1024 	OPT_BOOLEAN('R', "raw-samples", &record.opts.raw_samples,
1025 		    "collect raw sample records from all opened counters"),
1026 	OPT_BOOLEAN('a', "all-cpus", &record.opts.target.system_wide,
1027 			    "system-wide collection from all CPUs"),
1028 	OPT_STRING('C', "cpu", &record.opts.target.cpu_list, "cpu",
1029 		    "list of cpus to monitor"),
1030 	OPT_U64('c', "count", &record.opts.user_interval, "event period to sample"),
1031 	OPT_STRING('o', "output", &record.file.path, "file",
1032 		    "output file name"),
1033 	OPT_BOOLEAN_SET('i', "no-inherit", &record.opts.no_inherit,
1034 			&record.opts.no_inherit_set,
1035 			"child tasks do not inherit counters"),
1036 	OPT_UINTEGER('F', "freq", &record.opts.user_freq, "profile at this frequency"),
1037 	OPT_CALLBACK('m', "mmap-pages", &record.opts, "pages[,pages]",
1038 		     "number of mmap data pages and AUX area tracing mmap pages",
1039 		     record__parse_mmap_pages),
1040 	OPT_BOOLEAN(0, "group", &record.opts.group,
1041 		    "put the counters into a counter group"),
1042 	OPT_CALLBACK_NOOPT('g', NULL, &record.opts,
1043 			   NULL, "enables call-graph recording" ,
1044 			   &record_callchain_opt),
1045 	OPT_CALLBACK(0, "call-graph", &record.opts,
1046 		     "mode[,dump_size]", record_callchain_help,
1047 		     &record_parse_callchain_opt),
1048 	OPT_INCR('v', "verbose", &verbose,
1049 		    "be more verbose (show counter open errors, etc)"),
1050 	OPT_BOOLEAN('q', "quiet", &quiet, "don't print any message"),
1051 	OPT_BOOLEAN('s', "stat", &record.opts.inherit_stat,
1052 		    "per thread counts"),
1053 	OPT_BOOLEAN('d', "data", &record.opts.sample_address, "Record the sample addresses"),
1054 	OPT_BOOLEAN_SET('T', "timestamp", &record.opts.sample_time,
1055 			&record.opts.sample_time_set,
1056 			"Record the sample timestamps"),
1057 	OPT_BOOLEAN('P', "period", &record.opts.period, "Record the sample period"),
1058 	OPT_BOOLEAN('n', "no-samples", &record.opts.no_samples,
1059 		    "don't sample"),
1060 	OPT_BOOLEAN('N', "no-buildid-cache", &record.no_buildid_cache,
1061 		    "do not update the buildid cache"),
1062 	OPT_BOOLEAN('B', "no-buildid", &record.no_buildid,
1063 		    "do not collect buildids in perf.data"),
1064 	OPT_CALLBACK('G', "cgroup", &record.evlist, "name",
1065 		     "monitor event in cgroup name only",
1066 		     parse_cgroups),
1067 	OPT_UINTEGER('D', "delay", &record.opts.initial_delay,
1068 		  "ms to wait before starting measurement after program start"),
1069 	OPT_STRING('u', "uid", &record.opts.target.uid_str, "user",
1070 		   "user to profile"),
1071 
1072 	OPT_CALLBACK_NOOPT('b', "branch-any", &record.opts.branch_stack,
1073 		     "branch any", "sample any taken branches",
1074 		     parse_branch_stack),
1075 
1076 	OPT_CALLBACK('j', "branch-filter", &record.opts.branch_stack,
1077 		     "branch filter mask", "branch stack filter modes",
1078 		     parse_branch_stack),
1079 	OPT_BOOLEAN('W', "weight", &record.opts.sample_weight,
1080 		    "sample by weight (on special events only)"),
1081 	OPT_BOOLEAN(0, "transaction", &record.opts.sample_transaction,
1082 		    "sample transaction flags (special events only)"),
1083 	OPT_BOOLEAN(0, "per-thread", &record.opts.target.per_thread,
1084 		    "use per-thread mmaps"),
1085 	OPT_CALLBACK_OPTARG('I', "intr-regs", &record.opts.sample_intr_regs, NULL, "any register",
1086 		    "sample selected machine registers on interrupt,"
1087 		    " use -I ? to list register names", parse_regs),
1088 	OPT_BOOLEAN(0, "running-time", &record.opts.running_time,
1089 		    "Record running/enabled time of read (:S) events"),
1090 	OPT_CALLBACK('k', "clockid", &record.opts,
1091 	"clockid", "clockid to use for events, see clock_gettime()",
1092 	parse_clockid),
1093 	OPT_STRING_OPTARG('S', "snapshot", &record.opts.auxtrace_snapshot_opts,
1094 			  "opts", "AUX area tracing Snapshot Mode", ""),
1095 	OPT_UINTEGER(0, "proc-map-timeout", &record.opts.proc_map_timeout,
1096 			"per thread proc mmap processing timeout in ms"),
1097 	OPT_BOOLEAN(0, "switch-events", &record.opts.record_switch_events,
1098 		    "Record context switch events"),
1099 	OPT_END()
1100 };
1101 
/*
 * Non-static pointer to the __record_options table; cmd_record() hands it
 * to parse_options() and usage_with_options().
 */
struct option *record_options = __record_options;
1103 
1104 int cmd_record(int argc, const char **argv, const char *prefix __maybe_unused)
1105 {
1106 	int err;
1107 	struct record *rec = &record;
1108 	char errbuf[BUFSIZ];
1109 
1110 	rec->evlist = perf_evlist__new();
1111 	if (rec->evlist == NULL)
1112 		return -ENOMEM;
1113 
1114 	perf_config(perf_record_config, rec);
1115 
1116 	argc = parse_options(argc, argv, record_options, record_usage,
1117 			    PARSE_OPT_STOP_AT_NON_OPTION);
1118 	if (!argc && target__none(&rec->opts.target))
1119 		usage_with_options(record_usage, record_options);
1120 
1121 	if (nr_cgroups && !rec->opts.target.system_wide) {
1122 		ui__error("cgroup monitoring only available in"
1123 			  " system-wide mode\n");
1124 		usage_with_options(record_usage, record_options);
1125 	}
1126 	if (rec->opts.record_switch_events &&
1127 	    !perf_can_record_switch_events()) {
1128 		ui__error("kernel does not support recording context switch events (--switch-events option)\n");
1129 		usage_with_options(record_usage, record_options);
1130 	}
1131 
1132 	if (!rec->itr) {
1133 		rec->itr = auxtrace_record__init(rec->evlist, &err);
1134 		if (err)
1135 			return err;
1136 	}
1137 
1138 	err = auxtrace_parse_snapshot_options(rec->itr, &rec->opts,
1139 					      rec->opts.auxtrace_snapshot_opts);
1140 	if (err)
1141 		return err;
1142 
1143 	err = -ENOMEM;
1144 
1145 	symbol__init(NULL);
1146 
1147 	if (symbol_conf.kptr_restrict)
1148 		pr_warning(
1149 "WARNING: Kernel address maps (/proc/{kallsyms,modules}) are restricted,\n"
1150 "check /proc/sys/kernel/kptr_restrict.\n\n"
1151 "Samples in kernel functions may not be resolved if a suitable vmlinux\n"
1152 "file is not found in the buildid cache or in the vmlinux path.\n\n"
1153 "Samples in kernel modules won't be resolved at all.\n\n"
1154 "If some relocation was applied (e.g. kexec) symbols may be misresolved\n"
1155 "even with a suitable vmlinux or kallsyms file.\n\n");
1156 
1157 	if (rec->no_buildid_cache || rec->no_buildid)
1158 		disable_buildid_cache();
1159 
1160 	if (rec->evlist->nr_entries == 0 &&
1161 	    perf_evlist__add_default(rec->evlist) < 0) {
1162 		pr_err("Not enough memory for event selector list\n");
1163 		goto out_symbol_exit;
1164 	}
1165 
1166 	if (rec->opts.target.tid && !rec->opts.no_inherit_set)
1167 		rec->opts.no_inherit = true;
1168 
1169 	err = target__validate(&rec->opts.target);
1170 	if (err) {
1171 		target__strerror(&rec->opts.target, err, errbuf, BUFSIZ);
1172 		ui__warning("%s", errbuf);
1173 	}
1174 
1175 	err = target__parse_uid(&rec->opts.target);
1176 	if (err) {
1177 		int saved_errno = errno;
1178 
1179 		target__strerror(&rec->opts.target, err, errbuf, BUFSIZ);
1180 		ui__error("%s", errbuf);
1181 
1182 		err = -saved_errno;
1183 		goto out_symbol_exit;
1184 	}
1185 
1186 	err = -ENOMEM;
1187 	if (perf_evlist__create_maps(rec->evlist, &rec->opts.target) < 0)
1188 		usage_with_options(record_usage, record_options);
1189 
1190 	err = auxtrace_record__options(rec->itr, rec->evlist, &rec->opts);
1191 	if (err)
1192 		goto out_symbol_exit;
1193 
1194 	if (record_opts__config(&rec->opts)) {
1195 		err = -EINVAL;
1196 		goto out_symbol_exit;
1197 	}
1198 
1199 	err = __cmd_record(&record, argc, argv);
1200 out_symbol_exit:
1201 	perf_evlist__delete(rec->evlist);
1202 	symbol__exit();
1203 	auxtrace_record__free(rec->itr);
1204 	return err;
1205 }
1206 
1207 static void snapshot_sig_handler(int sig __maybe_unused)
1208 {
1209 	if (!auxtrace_snapshot_enabled)
1210 		return;
1211 	auxtrace_snapshot_enabled = 0;
1212 	auxtrace_snapshot_err = auxtrace_record__snapshot_start(record.itr);
1213 	auxtrace_record__snapshot_started = 1;
1214 }
1215