/*
 * builtin-record.c
 *
 * Builtin record command: Record the profile of a workload
 * (or a CPU, or a PID) into the perf.data output file - for
 * later analysis via perf report.
 */
#include "builtin.h"

#include "perf.h"

#include "util/build-id.h"
#include "util/util.h"
#include <subcmd/parse-options.h>
#include "util/parse-events.h"

#include "util/callchain.h"
#include "util/cgroup.h"
#include "util/header.h"
#include "util/event.h"
#include "util/evlist.h"
#include "util/evsel.h"
#include "util/debug.h"
#include "util/session.h"
#include "util/tool.h"
#include "util/symbol.h"
#include "util/cpumap.h"
#include "util/thread_map.h"
#include "util/data.h"
#include "util/perf_regs.h"
#include "util/auxtrace.h"
#include "util/parse-branch-options.h"
#include "util/parse-regs-options.h"
#include "util/llvm-utils.h"

#include <unistd.h>
#include <sched.h>
#include <sys/mman.h>


struct record {
	struct perf_tool	tool;
	struct record_opts	opts;
	u64			bytes_written;
	struct perf_data_file	file;
	struct auxtrace_record	*itr;
	struct perf_evlist	*evlist;
	struct perf_session	*session;
	const char		*progname;
	int			realtime_prio;
	bool			no_buildid;
	bool			no_buildid_cache;
	bool			buildid_all;
	unsigned long long	samples;
};

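/*
 * Append @size bytes at @bf to the output file and account for them in
 * rec->bytes_written, which __cmd_record() later folds into the
 * header's data size.
 */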
static int record__write(struct record *rec, void *bf, size_t size)
{
	if (perf_data_file__write(rec->session->file, bf, size) < 0) {
		pr_err("failed to write perf data, error: %m\n");
		return -1;
	}

	rec->bytes_written += size;
	return 0;
}

static int process_synthesized_event(struct perf_tool *tool,
				     union perf_event *event,
				     struct perf_sample *sample __maybe_unused,
				     struct machine *machine __maybe_unused)
{
	struct record *rec = container_of(tool, struct record, tool);
	return record__write(rec, event, event->header.size);
}

static int record__mmap_read(struct record *rec, int idx)
{
	struct perf_mmap *md = &rec->evlist->mmap[idx];
	u64 head = perf_mmap__read_head(md);
	u64 old = md->prev;
	unsigned char *data = md->base + page_size;
	unsigned long size;
	void *buf;
	int rc = 0;

	if (old == head)
		return 0;

	rec->samples++;

	size = head - old;

	if ((old & md->mask) + size != (head & md->mask)) {
		buf = &data[old & md->mask];
		size = md->mask + 1 - (old & md->mask);
		old += size;

		if (record__write(rec, buf, size) < 0) {
			rc = -1;
			goto out;
		}
	}

	buf = &data[old & md->mask];
	size = head - old;
	old += size;

	if (record__write(rec, buf, size) < 0) {
		rc = -1;
		goto out;
	}

	md->prev = old;
	perf_evlist__mmap_consume(rec->evlist, idx);
out:
	return rc;
}
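
/*
 * Worked example of the wrap-around handling in record__mmap_read()
 * above (illustrative numbers, assuming a 64KiB data area, i.e.
 * mask = 0xffff):
 *
 *	old = 0x1fff0, head = 0x20010, so size = 0x20
 *	(old & mask) + size = 0xfff0 + 0x20 = 0x10010 != (head & mask) = 0x10
 *
 * i.e. the new data wraps: first the 0x10 bytes from data[0xfff0] up to
 * the end of the buffer are written, then the remaining 0x10 bytes from
 * data[0].  head and old are free-running u64 positions; only their
 * masked values index into the mmap'ed area.
 */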

static volatile int done;
static volatile int signr = -1;
static volatile int child_finished;
static volatile int auxtrace_snapshot_enabled;
static volatile int auxtrace_snapshot_err;
static volatile int auxtrace_record__snapshot_started;

static void sig_handler(int sig)
{
	if (sig == SIGCHLD)
		child_finished = 1;
	else
		signr = sig;

	done = 1;
}

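/*
 * Restore the default disposition of a deferred fatal signal and
 * re-raise it, so the parent sees the real termination status.
 * Registered via atexit() in __cmd_record().
 */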
static void record__sig_exit(void)
{
	if (signr == -1)
		return;

	signal(signr, SIG_DFL);
	raise(signr);
}

#ifdef HAVE_AUXTRACE_SUPPORT

static int record__process_auxtrace(struct perf_tool *tool,
				    union perf_event *event, void *data1,
				    size_t len1, void *data2, size_t len2)
{
	struct record *rec = container_of(tool, struct record, tool);
	struct perf_data_file *file = &rec->file;
	size_t padding;
	u8 pad[8] = {0};

	if (!perf_data_file__is_pipe(file)) {
		off_t file_offset;
		int fd = perf_data_file__fd(file);
		int err;

		file_offset = lseek(fd, 0, SEEK_CUR);
		if (file_offset == -1)
			return -1;
		err = auxtrace_index__auxtrace_event(&rec->session->auxtrace_index,
						     event, file_offset);
		if (err)
			return err;
	}

	/* event.auxtrace.size includes padding, see __auxtrace_mmap__read() */
	padding = (len1 + len2) & 7;
	if (padding)
		padding = 8 - padding;

	record__write(rec, event, event->header.size);
	record__write(rec, data1, len1);
	if (len2)
		record__write(rec, data2, len2);
	record__write(rec, &pad, padding);

	return 0;
}
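
/*
 * Padding example for the above: the AUX payload is rounded up to an
 * 8-byte boundary (event->auxtrace.size already includes the padding,
 * see __auxtrace_mmap__read()).  E.g. with len1 = 13 and len2 = 6:
 *
 *	(13 + 6) & 7 = 3  ->  padding = 8 - 3 = 5
 *
 * so 13 + 6 + 5 = 24 payload bytes are written after the event header.
 */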

static int record__auxtrace_mmap_read(struct record *rec,
				      struct auxtrace_mmap *mm)
{
	int ret;

	ret = auxtrace_mmap__read(mm, rec->itr, &rec->tool,
				  record__process_auxtrace);
	if (ret < 0)
		return ret;

	if (ret)
		rec->samples++;

	return 0;
}

static int record__auxtrace_mmap_read_snapshot(struct record *rec,
					       struct auxtrace_mmap *mm)
{
	int ret;

	ret = auxtrace_mmap__read_snapshot(mm, rec->itr, &rec->tool,
					   record__process_auxtrace,
					   rec->opts.auxtrace_snapshot_size);
	if (ret < 0)
		return ret;

	if (ret)
		rec->samples++;

	return 0;
}

static int record__auxtrace_read_snapshot_all(struct record *rec)
{
	int i;
	int rc = 0;

	for (i = 0; i < rec->evlist->nr_mmaps; i++) {
		struct auxtrace_mmap *mm =
				&rec->evlist->mmap[i].auxtrace_mmap;

		if (!mm->base)
			continue;

		if (record__auxtrace_mmap_read_snapshot(rec, mm) != 0) {
			rc = -1;
			goto out;
		}
	}
out:
	return rc;
}

static void record__read_auxtrace_snapshot(struct record *rec)
{
	pr_debug("Recording AUX area tracing snapshot\n");
	if (record__auxtrace_read_snapshot_all(rec) < 0) {
		auxtrace_snapshot_err = -1;
	} else {
		auxtrace_snapshot_err = auxtrace_record__snapshot_finish(rec->itr);
		if (!auxtrace_snapshot_err)
			auxtrace_snapshot_enabled = 1;
	}
}
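
/*
 * Snapshot mode usage sketch (assuming an AUX area event such as
 * intel_pt is available on the system):
 *
 *	perf record -e intel_pt// -S -p $PID &
 *	kill -USR2 %1		# capture one AUX area snapshot
 *
 * SIGUSR2 is handled by snapshot_sig_handler() at the bottom of this
 * file, which calls auxtrace_record__snapshot_start(); the main loop in
 * __cmd_record() then notices and calls record__read_auxtrace_snapshot().
 */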

#else

static inline
int record__auxtrace_mmap_read(struct record *rec __maybe_unused,
			       struct auxtrace_mmap *mm __maybe_unused)
{
	return 0;
}

static inline
void record__read_auxtrace_snapshot(struct record *rec __maybe_unused)
{
}

static inline
int auxtrace_record__snapshot_start(struct auxtrace_record *itr __maybe_unused)
{
	return 0;
}

#endif

static int record__open(struct record *rec)
{
	char msg[512];
	struct perf_evsel *pos;
	struct perf_evlist *evlist = rec->evlist;
	struct perf_session *session = rec->session;
	struct record_opts *opts = &rec->opts;
	int rc = 0;

	perf_evlist__config(evlist, opts);

	evlist__for_each(evlist, pos) {
try_again:
		if (perf_evsel__open(pos, pos->cpus, pos->threads) < 0) {
			if (perf_evsel__fallback(pos, errno, msg, sizeof(msg))) {
				if (verbose)
					ui__warning("%s\n", msg);
				goto try_again;
			}

			rc = -errno;
			perf_evsel__open_strerror(pos, &opts->target,
						  errno, msg, sizeof(msg));
			ui__error("%s\n", msg);
			goto out;
		}
	}

	if (perf_evlist__apply_filters(evlist, &pos)) {
		error("failed to set filter \"%s\" on event %s with %d (%s)\n",
			pos->filter, perf_evsel__name(pos), errno,
			strerror_r(errno, msg, sizeof(msg)));
		rc = -1;
		goto out;
	}

	if (perf_evlist__mmap_ex(evlist, opts->mmap_pages, false,
				 opts->auxtrace_mmap_pages,
				 opts->auxtrace_snapshot_mode) < 0) {
		if (errno == EPERM) {
			pr_err("Permission error mapping pages.\n"
			       "Consider increasing "
			       "/proc/sys/kernel/perf_event_mlock_kb,\n"
			       "or try again with a smaller value of -m/--mmap_pages.\n"
			       "(current value: %u,%u)\n",
			       opts->mmap_pages, opts->auxtrace_mmap_pages);
			rc = -errno;
		} else {
			pr_err("failed to mmap with %d (%s)\n", errno,
				strerror_r(errno, msg, sizeof(msg)));
			rc = -errno;
		}
		goto out;
	}

	session->evlist = evlist;
	perf_session__set_id_hdr_size(session);
out:
	return rc;
}

static int process_sample_event(struct perf_tool *tool,
				union perf_event *event,
				struct perf_sample *sample,
				struct perf_evsel *evsel,
				struct machine *machine)
{
	struct record *rec = container_of(tool, struct record, tool);

	rec->samples++;

	return build_id__mark_dso_hit(tool, event, sample, evsel, machine);
}

static int process_buildids(struct record *rec)
{
	struct perf_data_file *file  = &rec->file;
	struct perf_session *session = rec->session;

	if (file->size == 0)
		return 0;

	/*
	 * During this pass the kernel map is loaded and dso->long_name is
	 * replaced with the real pathname that was found.  In that case
	 * we prefer the vmlinux path, e.g.
	 *   /lib/modules/3.16.4/build/vmlinux
	 *
	 * over the build-id path (in the debug directory), e.g.
	 *   $HOME/.debug/.build-id/f0/6e17aa50adf4d00b88925e03775de107611551
	 */
	symbol_conf.ignore_vmlinux_buildid = true;

	/*
	 * If --buildid-all is given, all DSOs are marked regardless of
	 * hits, so there is no need to process the samples.
	 */
	if (rec->buildid_all)
		rec->tool.sample = NULL;

	return perf_session__process_events(session);
}

static void perf_event__synthesize_guest_os(struct machine *machine, void *data)
{
	int err;
	struct perf_tool *tool = data;
	/*
	 * As for the guest kernel, when processing the record & report
	 * subcommands we arrange the module mmaps prior to the guest
	 * kernel mmap and trigger a DSO preload, because by default guest
	 * module symbols are loaded from the guest kallsyms instead of
	 * /lib/modules/XXX/XXX.  This avoids missing symbols when the
	 * first address falls in a module rather than in the guest
	 * kernel.
	 */
	err = perf_event__synthesize_modules(tool, process_synthesized_event,
					     machine);
	if (err < 0)
		pr_err("Couldn't record guest kernel [%d]'s reference"
		       " relocation symbol.\n", machine->pid);

	/*
	 * We use _stext for the guest kernel because the guest kernel's
	 * /proc/kallsyms sometimes has no _text.
	 */
	err = perf_event__synthesize_kernel_mmap(tool, process_synthesized_event,
						 machine);
	if (err < 0)
		pr_err("Couldn't record guest kernel [%d]'s reference"
		       " relocation symbol.\n", machine->pid);
}

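/*
 * PERF_RECORD_FINISHED_ROUND is a synthetic marker for the report side:
 * everything written before it may be sorted by timestamp and flushed,
 * which bounds the reordering window for events gathered from the
 * per-cpu mmap buffers.
 */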
static struct perf_event_header finished_round_event = {
	.size = sizeof(struct perf_event_header),
	.type = PERF_RECORD_FINISHED_ROUND,
};

static int record__mmap_read_all(struct record *rec)
{
	u64 bytes_written = rec->bytes_written;
	int i;
	int rc = 0;

	for (i = 0; i < rec->evlist->nr_mmaps; i++) {
		struct auxtrace_mmap *mm = &rec->evlist->mmap[i].auxtrace_mmap;

		if (rec->evlist->mmap[i].base) {
			if (record__mmap_read(rec, i) != 0) {
				rc = -1;
				goto out;
			}
		}

		if (mm->base && !rec->opts.auxtrace_snapshot_mode &&
		    record__auxtrace_mmap_read(rec, mm) != 0) {
			rc = -1;
			goto out;
		}
	}

	/*
	 * Mark the round finished in case we wrote
	 * at least one event.
	 */
	if (bytes_written != rec->bytes_written)
		rc = record__write(rec, &finished_round_event, sizeof(finished_round_event));

out:
	return rc;
}

static void record__init_features(struct record *rec)
{
	struct perf_session *session = rec->session;
	int feat;

	for (feat = HEADER_FIRST_FEATURE; feat < HEADER_LAST_FEATURE; feat++)
		perf_header__set_feat(&session->header, feat);

	if (rec->no_buildid)
		perf_header__clear_feat(&session->header, HEADER_BUILD_ID);

	if (!have_tracepoints(&rec->evlist->entries))
		perf_header__clear_feat(&session->header, HEADER_TRACING_DATA);

	if (!rec->opts.branch_stack)
		perf_header__clear_feat(&session->header, HEADER_BRANCH_STACK);

	if (!rec->opts.full_auxtrace)
		perf_header__clear_feat(&session->header, HEADER_AUXTRACE);

	perf_header__clear_feat(&session->header, HEADER_STAT);
}

static volatile int workload_exec_errno;

/*
 * perf_evlist__prepare_workload will send a SIGUSR1 if the fork fails,
 * since we asked for that by setting its want_signal to true.
 */
static void workload_exec_failed_signal(int signo __maybe_unused,
					siginfo_t *info,
					void *ucontext __maybe_unused)
{
	workload_exec_errno = info->si_value.sival_int;
	done = 1;
	child_finished = 1;
}

static void snapshot_sig_handler(int sig);

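/*
 * The main record loop: set up signal handling, create the session,
 * open and mmap the events, write the file header, synthesize the
 * pre-existing kernel/module/thread state, then alternate between
 * draining the mmap buffers and poll()ing until the workload exits or
 * the user interrupts, and finally fix up the header and build-ids.
 */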
static int __cmd_record(struct record *rec, int argc, const char **argv)
{
	int err;
	int status = 0;
	unsigned long waking = 0;
	const bool forks = argc > 0;
	struct machine *machine;
	struct perf_tool *tool = &rec->tool;
	struct record_opts *opts = &rec->opts;
	struct perf_data_file *file = &rec->file;
	struct perf_session *session;
	bool disabled = false, draining = false;
	int fd;

	rec->progname = argv[0];

	atexit(record__sig_exit);
	signal(SIGCHLD, sig_handler);
	signal(SIGINT, sig_handler);
	signal(SIGTERM, sig_handler);
	if (rec->opts.auxtrace_snapshot_mode)
		signal(SIGUSR2, snapshot_sig_handler);
	else
		signal(SIGUSR2, SIG_IGN);

	session = perf_session__new(file, false, tool);
	if (session == NULL) {
		pr_err("Perf session creation failed.\n");
		return -1;
	}

	fd = perf_data_file__fd(file);
	rec->session = session;

	record__init_features(rec);

	if (forks) {
		err = perf_evlist__prepare_workload(rec->evlist, &opts->target,
						    argv, file->is_pipe,
						    workload_exec_failed_signal);
		if (err < 0) {
			pr_err("Couldn't run the workload!\n");
			status = err;
			goto out_delete_session;
		}
	}

	if (record__open(rec) != 0) {
		err = -1;
		goto out_child;
	}

	/*
	 * Normally perf_session__new would do this, but it doesn't have the
	 * evlist.
	 */
	if (rec->tool.ordered_events && !perf_evlist__sample_id_all(rec->evlist)) {
		pr_warning("WARNING: No sample_id_all support, falling back to unordered processing\n");
		rec->tool.ordered_events = false;
	}

	if (!rec->evlist->nr_groups)
		perf_header__clear_feat(&session->header, HEADER_GROUP_DESC);

	if (file->is_pipe) {
		err = perf_header__write_pipe(fd);
		if (err < 0)
			goto out_child;
	} else {
		err = perf_session__write_header(session, rec->evlist, fd, false);
		if (err < 0)
			goto out_child;
	}

	if (!rec->no_buildid
	    && !perf_header__has_feat(&session->header, HEADER_BUILD_ID)) {
		pr_err("Couldn't generate buildids. "
		       "Use --no-buildid to profile anyway.\n");
		err = -1;
		goto out_child;
	}

	machine = &session->machines.host;

	if (file->is_pipe) {
		err = perf_event__synthesize_attrs(tool, session,
						   process_synthesized_event);
		if (err < 0) {
			pr_err("Couldn't synthesize attrs.\n");
			goto out_child;
		}

		if (have_tracepoints(&rec->evlist->entries)) {
			/*
			 * FIXME: err <= 0 here actually means that
			 * there were no tracepoints, so it's not really
			 * an error, just that we don't need to
			 * synthesize anything.  We really have to
			 * return this more properly and also
			 * propagate errors that currently call die().
			 */
			err = perf_event__synthesize_tracing_data(tool, fd, rec->evlist,
								  process_synthesized_event);
			if (err <= 0) {
				pr_err("Couldn't record tracing data.\n");
				goto out_child;
			}
			rec->bytes_written += err;
		}
	}

	if (rec->opts.full_auxtrace) {
		err = perf_event__synthesize_auxtrace_info(rec->itr, tool,
					session, process_synthesized_event);
		if (err)
			goto out_delete_session;
	}

	err = perf_event__synthesize_kernel_mmap(tool, process_synthesized_event,
						 machine);
	if (err < 0)
		pr_err("Couldn't record kernel reference relocation symbol\n"
		       "Symbol resolution may be skewed if relocation was used (e.g. kexec).\n"
		       "Check /proc/kallsyms permission or run as root.\n");

	err = perf_event__synthesize_modules(tool, process_synthesized_event,
					     machine);
	if (err < 0)
		pr_err("Couldn't record kernel module information.\n"
		       "Symbol resolution may be skewed if relocation was used (e.g. kexec).\n"
		       "Check /proc/modules permission or run as root.\n");

	if (perf_guest) {
		machines__process_guests(&session->machines,
					 perf_event__synthesize_guest_os, tool);
	}

	err = __machine__synthesize_threads(machine, tool, &opts->target, rec->evlist->threads,
					    process_synthesized_event, opts->sample_address,
					    opts->proc_map_timeout);
	if (err != 0)
		goto out_child;

	if (rec->realtime_prio) {
		struct sched_param param;

		param.sched_priority = rec->realtime_prio;
		if (sched_setscheduler(0, SCHED_FIFO, &param)) {
			pr_err("Could not set realtime priority.\n");
			err = -1;
			goto out_child;
		}
	}

	/*
	 * When perf is starting the traced process, all the events
	 * (apart from group members) have enable_on_exec=1 set,
	 * so don't spoil it by prematurely enabling them.
	 */
	if (!target__none(&opts->target) && !opts->initial_delay)
		perf_evlist__enable(rec->evlist);

	/*
	 * Let the child rip
	 */
	if (forks) {
		union perf_event *event;

		event = malloc(sizeof(event->comm) + machine->id_hdr_size);
		if (event == NULL) {
			err = -ENOMEM;
			goto out_child;
		}

		/*
		 * Some H/W events are generated before the COMM event,
		 * which is emitted during exec(), so perf script cannot
		 * see a correct process name for those events.
		 * Synthesize a COMM event to prevent that.
		 */
		perf_event__synthesize_comm(tool, event,
					    rec->evlist->workload.pid,
					    process_synthesized_event,
					    machine);
		free(event);

		perf_evlist__start_workload(rec->evlist);
	}

	if (opts->initial_delay) {
		usleep(opts->initial_delay * 1000);
		perf_evlist__enable(rec->evlist);
	}

	auxtrace_snapshot_enabled = 1;
	for (;;) {
		unsigned long long hits = rec->samples;

		if (record__mmap_read_all(rec) < 0) {
			auxtrace_snapshot_enabled = 0;
			err = -1;
			goto out_child;
		}

		if (auxtrace_record__snapshot_started) {
			auxtrace_record__snapshot_started = 0;
			if (!auxtrace_snapshot_err)
				record__read_auxtrace_snapshot(rec);
			if (auxtrace_snapshot_err) {
				pr_err("AUX area tracing snapshot failed\n");
				err = -1;
				goto out_child;
			}
		}

		if (hits == rec->samples) {
			if (done || draining)
				break;
			err = perf_evlist__poll(rec->evlist, -1);
			/*
			 * Propagate the error only if there is one: a
			 * positive number of returned events and an
			 * interrupted poll (EINTR) are not errors.
			 */
			if (err > 0 || (err < 0 && errno == EINTR))
				err = 0;
			waking++;

			if (perf_evlist__filter_pollfd(rec->evlist, POLLERR | POLLHUP) == 0)
				draining = true;
		}

		/*
		 * When perf is starting the traced process, the events die
		 * with the process at the end and we wait for that, so there
		 * is no need to disable the events in this case.
		 */
		if (done && !disabled && !target__none(&opts->target)) {
			auxtrace_snapshot_enabled = 0;
			perf_evlist__disable(rec->evlist);
			disabled = true;
		}
	}
	auxtrace_snapshot_enabled = 0;

	if (forks && workload_exec_errno) {
		char msg[STRERR_BUFSIZE];
		const char *emsg = strerror_r(workload_exec_errno, msg, sizeof(msg));
		pr_err("Workload failed: %s\n", emsg);
		err = -1;
		goto out_child;
	}

	if (!quiet)
		fprintf(stderr, "[ perf record: Woken up %ld times to write data ]\n", waking);

out_child:
	if (forks) {
		int exit_status;

		if (!child_finished)
			kill(rec->evlist->workload.pid, SIGTERM);

		wait(&exit_status);

		if (err < 0)
			status = err;
		else if (WIFEXITED(exit_status))
			status = WEXITSTATUS(exit_status);
		else if (WIFSIGNALED(exit_status))
			signr = WTERMSIG(exit_status);
	} else
		status = err;

	/* this will be recalculated during process_buildids() */
	rec->samples = 0;

	if (!err && !file->is_pipe) {
		rec->session->header.data_size += rec->bytes_written;
		file->size = lseek(perf_data_file__fd(file), 0, SEEK_CUR);

		if (!rec->no_buildid) {
			process_buildids(rec);

			if (rec->buildid_all)
				dsos__hit_all(rec->session);
		}
		perf_session__write_header(rec->session, rec->evlist, fd, true);
	}

	if (!err && !quiet) {
		char samples[128];

		if (rec->samples && !rec->opts.full_auxtrace)
			scnprintf(samples, sizeof(samples),
				  " (%" PRIu64 " samples)", rec->samples);
		else
			samples[0] = '\0';

		fprintf(stderr, "[ perf record: Captured and wrote %.3f MB %s%s ]\n",
			perf_data_file__size(file) / 1024.0 / 1024.0,
			file->path, samples);
	}

out_delete_session:
	perf_session__delete(session);
	return status;
}

static void callchain_debug(void)
{
	static const char *str[CALLCHAIN_MAX] = { "NONE", "FP", "DWARF", "LBR" };

	pr_debug("callchain: type %s\n", str[callchain_param.record_mode]);

	if (callchain_param.record_mode == CALLCHAIN_DWARF)
		pr_debug("callchain: stack dump size %d\n",
			 callchain_param.dump_size);
}

int record_parse_callchain_opt(const struct option *opt,
			       const char *arg,
			       int unset)
{
	int ret;
	struct record_opts *record = (struct record_opts *)opt->value;

	record->callgraph_set = true;
	callchain_param.enabled = !unset;

	/* --no-call-graph */
	if (unset) {
		callchain_param.record_mode = CALLCHAIN_NONE;
		pr_debug("callchain: disabled\n");
		return 0;
	}

	ret = parse_callchain_record_opt(arg, &callchain_param);
	if (!ret) {
		/* Enable data address sampling for DWARF unwind. */
		if (callchain_param.record_mode == CALLCHAIN_DWARF)
			record->sample_address = true;
		callchain_debug();
	}

	return ret;
}

int record_callchain_opt(const struct option *opt,
			 const char *arg __maybe_unused,
			 int unset __maybe_unused)
{
	struct record_opts *record = (struct record_opts *)opt->value;

	record->callgraph_set = true;
	callchain_param.enabled = true;

	if (callchain_param.record_mode == CALLCHAIN_NONE)
		callchain_param.record_mode = CALLCHAIN_FP;

	callchain_debug();
	return 0;
}

static int perf_record_config(const char *var, const char *value, void *cb)
{
	struct record *rec = cb;

	if (!strcmp(var, "record.build-id")) {
		if (!strcmp(value, "cache"))
			rec->no_buildid_cache = false;
		else if (!strcmp(value, "no-cache"))
			rec->no_buildid_cache = true;
		else if (!strcmp(value, "skip"))
			rec->no_buildid = true;
		else
			return -1;
		return 0;
	}
	if (!strcmp(var, "record.call-graph"))
		var = "call-graph.record-mode"; /* fall-through */

	return perf_default_config(var, value, cb);
}
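
/*
 * Example ~/.perfconfig snippet handled by perf_record_config():
 *
 *	[record]
 *		build-id = no-cache	# or: cache, skip
 *		call-graph = dwarf	# forwarded to call-graph.record-mode
 */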

struct clockid_map {
	const char *name;
	int clockid;
};

#define CLOCKID_MAP(n, c)	\
	{ .name = n, .clockid = (c), }

#define CLOCKID_END	{ .name = NULL, }


/*
 * Add the missing ones, we need to build on many distros...
 */
#ifndef CLOCK_MONOTONIC_RAW
#define CLOCK_MONOTONIC_RAW 4
#endif
#ifndef CLOCK_BOOTTIME
#define CLOCK_BOOTTIME 7
#endif
#ifndef CLOCK_TAI
#define CLOCK_TAI 11
#endif

static const struct clockid_map clockids[] = {
	/* available for all events, NMI safe */
	CLOCKID_MAP("monotonic", CLOCK_MONOTONIC),
	CLOCKID_MAP("monotonic_raw", CLOCK_MONOTONIC_RAW),

	/* available for some events */
	CLOCKID_MAP("realtime", CLOCK_REALTIME),
	CLOCKID_MAP("boottime", CLOCK_BOOTTIME),
	CLOCKID_MAP("tai", CLOCK_TAI),

	/* available for the lazy */
	CLOCKID_MAP("mono", CLOCK_MONOTONIC),
	CLOCKID_MAP("raw", CLOCK_MONOTONIC_RAW),
	CLOCKID_MAP("real", CLOCK_REALTIME),
	CLOCKID_MAP("boot", CLOCK_BOOTTIME),

	CLOCKID_END,
};

static int parse_clockid(const struct option *opt, const char *str, int unset)
{
	struct record_opts *opts = (struct record_opts *)opt->value;
	const struct clockid_map *cm;
	const char *ostr = str;

	if (unset) {
		opts->use_clockid = 0;
		return 0;
	}

	/* no arg passed */
	if (!str)
		return 0;

	/* no setting it twice */
	if (opts->use_clockid)
		return -1;

	opts->use_clockid = true;

	/* if it's a number, we're done */
	if (sscanf(str, "%d", &opts->clockid) == 1)
		return 0;

	/* allow a "CLOCK_" prefix to the name */
	if (!strncasecmp(str, "CLOCK_", 6))
		str += 6;

	for (cm = clockids; cm->name; cm++) {
		if (!strcasecmp(str, cm->name)) {
			opts->clockid = cm->clockid;
			return 0;
		}
	}

	opts->use_clockid = false;
	ui__warning("unknown clockid %s, check man page\n", ostr);
	return -1;
}
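
/*
 * All of these forms are accepted by parse_clockid() for -k/--clockid:
 *
 *	perf record -k monotonic_raw ...
 *	perf record -k CLOCK_MONOTONIC_RAW ...	("CLOCK_" prefix stripped)
 *	perf record -k 4 ...			(raw numeric clockid)
 *
 * Names are matched case-insensitively against the clockids[] table.
 */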

static int record__parse_mmap_pages(const struct option *opt,
				    const char *str,
				    int unset __maybe_unused)
{
	struct record_opts *opts = opt->value;
	char *s, *p;
	unsigned int mmap_pages;
	int ret;

	if (!str)
		return -EINVAL;

	s = strdup(str);
	if (!s)
		return -ENOMEM;

	p = strchr(s, ',');
	if (p)
		*p = '\0';

	if (*s) {
		ret = __perf_evlist__parse_mmap_pages(&mmap_pages, s);
		if (ret)
			goto out_free;
		opts->mmap_pages = mmap_pages;
	}

	if (!p) {
		ret = 0;
		goto out_free;
	}

	ret = __perf_evlist__parse_mmap_pages(&mmap_pages, p + 1);
	if (ret)
		goto out_free;

	opts->auxtrace_mmap_pages = mmap_pages;

out_free:
	free(s);
	return ret;
}
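
/*
 * -m/--mmap-pages takes "pages[,pages]": the part before the comma sets
 * the data mmap size, the part after it the AUX area mmap size.  E.g.
 * "-m 512,128" sets both, "-m 512" only the data size and "-m ,128"
 * only the AUX size.  Each part goes through
 * __perf_evlist__parse_mmap_pages(), which should also accept a size
 * with a unit suffix (e.g. 16M) instead of a raw page count.
 */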

static const char * const __record_usage[] = {
	"perf record [<options>] [<command>]",
	"perf record [<options>] -- <command> [<options>]",
	NULL
};
const char * const *record_usage = __record_usage;

/*
 * XXX Ideally this would be local to cmd_record() and passed to a
 * record__new, because we need to have access to it in record__exit,
 * which is called after cmd_record() exits; but since record_options
 * needs to be accessible to builtin-script, leave it here.
 *
 * At least we don't touch it in all the other functions here directly.
 *
 * Just say no to tons of global variables, sigh.
 */
static struct record record = {
	.opts = {
		.sample_time	     = true,
		.mmap_pages	     = UINT_MAX,
		.user_freq	     = UINT_MAX,
		.user_interval	     = ULLONG_MAX,
		.freq		     = 4000,
		.target		     = {
			.uses_mmap   = true,
			.default_per_cpu = true,
		},
		.proc_map_timeout     = 500,
	},
	.tool = {
		.sample		= process_sample_event,
		.fork		= perf_event__process_fork,
		.exit		= perf_event__process_exit,
		.comm		= perf_event__process_comm,
		.mmap		= perf_event__process_mmap,
		.mmap2		= perf_event__process_mmap2,
		.ordered_events	= true,
	},
};

const char record_callchain_help[] = CALLCHAIN_RECORD_HELP
	"\n\t\t\t\tDefault: fp";

/*
 * XXX Will stay a global variable till we fix builtin-script.c to stop
 * messing with it and switch to using the library functions in
 * perf_evlist that came from builtin-record.c, i.e. use record_opts,
 * perf_evlist__prepare_workload, etc. instead of fork+exec'ing
 * 'perf record', using pipes, etc.
 */
struct option __record_options[] = {
	OPT_CALLBACK('e', "event", &record.evlist, "event",
		     "event selector. use 'perf list' to list available events",
		     parse_events_option),
	OPT_CALLBACK(0, "filter", &record.evlist, "filter",
		     "event filter", parse_filter),
	OPT_CALLBACK_NOOPT(0, "exclude-perf", &record.evlist,
			   NULL, "don't record events from perf itself",
			   exclude_perf),
	OPT_STRING('p', "pid", &record.opts.target.pid, "pid",
		    "record events on existing process id"),
	OPT_STRING('t', "tid", &record.opts.target.tid, "tid",
		    "record events on existing thread id"),
	OPT_INTEGER('r', "realtime", &record.realtime_prio,
		    "collect data with this RT SCHED_FIFO priority"),
	OPT_BOOLEAN(0, "no-buffering", &record.opts.no_buffering,
		    "collect data without buffering"),
	OPT_BOOLEAN('R', "raw-samples", &record.opts.raw_samples,
		    "collect raw sample records from all opened counters"),
	OPT_BOOLEAN('a', "all-cpus", &record.opts.target.system_wide,
		    "system-wide collection from all CPUs"),
	OPT_STRING('C', "cpu", &record.opts.target.cpu_list, "cpu",
		    "list of cpus to monitor"),
	OPT_U64('c', "count", &record.opts.user_interval, "event period to sample"),
	OPT_STRING('o', "output", &record.file.path, "file",
		    "output file name"),
	OPT_BOOLEAN_SET('i', "no-inherit", &record.opts.no_inherit,
			&record.opts.no_inherit_set,
			"child tasks do not inherit counters"),
	OPT_UINTEGER('F', "freq", &record.opts.user_freq, "profile at this frequency"),
	OPT_CALLBACK('m', "mmap-pages", &record.opts, "pages[,pages]",
		     "number of mmap data pages and AUX area tracing mmap pages",
		     record__parse_mmap_pages),
	OPT_BOOLEAN(0, "group", &record.opts.group,
		    "put the counters into a counter group"),
	OPT_CALLBACK_NOOPT('g', NULL, &record.opts,
			   NULL, "enables call-graph recording",
			   &record_callchain_opt),
	OPT_CALLBACK(0, "call-graph", &record.opts,
		     "record_mode[,record_size]", record_callchain_help,
		     &record_parse_callchain_opt),
	OPT_INCR('v', "verbose", &verbose,
		    "be more verbose (show counter open errors, etc)"),
	OPT_BOOLEAN('q', "quiet", &quiet, "don't print any message"),
	OPT_BOOLEAN('s', "stat", &record.opts.inherit_stat,
		    "per thread counts"),
	OPT_BOOLEAN('d', "data", &record.opts.sample_address, "Record the sample addresses"),
	OPT_BOOLEAN_SET('T', "timestamp", &record.opts.sample_time,
			&record.opts.sample_time_set,
			"Record the sample timestamps"),
	OPT_BOOLEAN('P', "period", &record.opts.period, "Record the sample period"),
	OPT_BOOLEAN('n', "no-samples", &record.opts.no_samples,
		    "don't sample"),
	OPT_BOOLEAN('N', "no-buildid-cache", &record.no_buildid_cache,
		    "do not update the buildid cache"),
	OPT_BOOLEAN('B', "no-buildid", &record.no_buildid,
		    "do not collect buildids in perf.data"),
	OPT_CALLBACK('G', "cgroup", &record.evlist, "name",
		     "monitor event in cgroup name only",
		     parse_cgroups),
	OPT_UINTEGER('D', "delay", &record.opts.initial_delay,
		  "ms to wait before starting measurement after program start"),
	OPT_STRING('u', "uid", &record.opts.target.uid_str, "user",
		   "user to profile"),

	OPT_CALLBACK_NOOPT('b', "branch-any", &record.opts.branch_stack,
		     "branch any", "sample any taken branches",
		     parse_branch_stack),

	OPT_CALLBACK('j', "branch-filter", &record.opts.branch_stack,
		     "branch filter mask", "branch stack filter modes",
		     parse_branch_stack),
	OPT_BOOLEAN('W', "weight", &record.opts.sample_weight,
		    "sample by weight (on special events only)"),
	OPT_BOOLEAN(0, "transaction", &record.opts.sample_transaction,
		    "sample transaction flags (special events only)"),
	OPT_BOOLEAN(0, "per-thread", &record.opts.target.per_thread,
		    "use per-thread mmaps"),
	OPT_CALLBACK_OPTARG('I', "intr-regs", &record.opts.sample_intr_regs, NULL, "any register",
		    "sample selected machine registers on interrupt,"
		    " use -I ? to list register names", parse_regs),
	OPT_BOOLEAN(0, "running-time", &record.opts.running_time,
		    "Record running/enabled time of read (:S) events"),
	OPT_CALLBACK('k', "clockid", &record.opts,
		     "clockid", "clockid to use for events, see clock_gettime()",
		     parse_clockid),
	OPT_STRING_OPTARG('S', "snapshot", &record.opts.auxtrace_snapshot_opts,
			  "opts", "AUX area tracing Snapshot Mode", ""),
	OPT_UINTEGER(0, "proc-map-timeout", &record.opts.proc_map_timeout,
			"per thread proc mmap processing timeout in ms"),
	OPT_BOOLEAN(0, "switch-events", &record.opts.record_switch_events,
		    "Record context switch events"),
	OPT_STRING(0, "clang-path", &llvm_param.clang_path, "clang path",
		   "clang binary to use for compiling BPF scriptlets"),
	OPT_STRING(0, "clang-opt", &llvm_param.clang_opt, "clang options",
		   "options passed to clang when compiling BPF scriptlets"),
	OPT_STRING(0, "vmlinux", &symbol_conf.vmlinux_name,
		   "file", "vmlinux pathname"),
	OPT_BOOLEAN(0, "buildid-all", &record.buildid_all,
		    "Record build-id of all DSOs regardless of hits"),
	OPT_END()
};

struct option *record_options = __record_options;

int cmd_record(int argc, const char **argv, const char *prefix __maybe_unused)
{
	int err;
	struct record *rec = &record;
	char errbuf[BUFSIZ];

#ifndef HAVE_LIBBPF_SUPPORT
# define set_nobuild(s, l, c) set_option_nobuild(record_options, s, l, "NO_LIBBPF=1", c)
	set_nobuild('\0', "clang-path", true);
	set_nobuild('\0', "clang-opt", true);
# undef set_nobuild
#endif

#ifndef HAVE_BPF_PROLOGUE
# if !defined (HAVE_DWARF_SUPPORT)
#  define REASON  "NO_DWARF=1"
# elif !defined (HAVE_LIBBPF_SUPPORT)
#  define REASON  "NO_LIBBPF=1"
# else
#  define REASON  "this architecture doesn't support BPF prologue"
# endif
# define set_nobuild(s, l, c) set_option_nobuild(record_options, s, l, REASON, c)
	set_nobuild('\0', "vmlinux", true);
# undef set_nobuild
# undef REASON
#endif

	rec->evlist = perf_evlist__new();
	if (rec->evlist == NULL)
		return -ENOMEM;

	perf_config(perf_record_config, rec);

	argc = parse_options(argc, argv, record_options, record_usage,
			    PARSE_OPT_STOP_AT_NON_OPTION);
	if (!argc && target__none(&rec->opts.target))
		usage_with_options(record_usage, record_options);

	if (nr_cgroups && !rec->opts.target.system_wide) {
		usage_with_options_msg(record_usage, record_options,
			"cgroup monitoring only available in system-wide mode");

	}
	if (rec->opts.record_switch_events &&
	    !perf_can_record_switch_events()) {
		ui__error("kernel does not support recording context switch events\n");
		parse_options_usage(record_usage, record_options, "switch-events", 0);
		return -EINVAL;
	}

	if (!rec->itr) {
		rec->itr = auxtrace_record__init(rec->evlist, &err);
		if (err)
			return err;
	}

	err = auxtrace_parse_snapshot_options(rec->itr, &rec->opts,
					      rec->opts.auxtrace_snapshot_opts);
	if (err)
		return err;

	err = -ENOMEM;

	symbol__init(NULL);

	if (symbol_conf.kptr_restrict)
		pr_warning(
"WARNING: Kernel address maps (/proc/{kallsyms,modules}) are restricted,\n"
"check /proc/sys/kernel/kptr_restrict.\n\n"
"Samples in kernel functions may not be resolved if a suitable vmlinux\n"
"file is not found in the buildid cache or in the vmlinux path.\n\n"
"Samples in kernel modules won't be resolved at all.\n\n"
"If some relocation was applied (e.g. kexec) symbols may be misresolved\n"
"even with a suitable vmlinux or kallsyms file.\n\n");

	if (rec->no_buildid_cache || rec->no_buildid)
		disable_buildid_cache();

	if (rec->evlist->nr_entries == 0 &&
	    perf_evlist__add_default(rec->evlist) < 0) {
		pr_err("Not enough memory for event selector list\n");
		goto out_symbol_exit;
	}

	if (rec->opts.target.tid && !rec->opts.no_inherit_set)
		rec->opts.no_inherit = true;

	err = target__validate(&rec->opts.target);
	if (err) {
		target__strerror(&rec->opts.target, err, errbuf, BUFSIZ);
		ui__warning("%s", errbuf);
	}

	err = target__parse_uid(&rec->opts.target);
	if (err) {
		int saved_errno = errno;

		target__strerror(&rec->opts.target, err, errbuf, BUFSIZ);
		ui__error("%s", errbuf);

		err = -saved_errno;
		goto out_symbol_exit;
	}

	err = -ENOMEM;
	if (perf_evlist__create_maps(rec->evlist, &rec->opts.target) < 0)
		usage_with_options(record_usage, record_options);

	err = auxtrace_record__options(rec->itr, rec->evlist, &rec->opts);
	if (err)
		goto out_symbol_exit;

	/*
	 * We take all buildids when the file contains AUX area tracing
	 * data, because decoding the trace to find out which DSOs were
	 * actually hit would take too long.
	 */
	if (rec->opts.full_auxtrace)
		rec->buildid_all = true;

	if (record_opts__config(&rec->opts)) {
		err = -EINVAL;
		goto out_symbol_exit;
	}

	err = __cmd_record(&record, argc, argv);
out_symbol_exit:
	perf_evlist__delete(rec->evlist);
	symbol__exit();
	auxtrace_record__free(rec->itr);
	return err;
}

static void snapshot_sig_handler(int sig __maybe_unused)
{
	if (!auxtrace_snapshot_enabled)
		return;
	auxtrace_snapshot_enabled = 0;
	auxtrace_snapshot_err = auxtrace_record__snapshot_start(record.itr);
	auxtrace_record__snapshot_started = 1;
}
1293