xref: /openbmc/linux/tools/perf/builtin-record.c (revision ce932d0c5589e9766e089c22c66890dfc48fbd94)
1 /*
2  * builtin-record.c
3  *
4  * Builtin record command: Record the profile of a workload
5  * (or a CPU, or a PID) into the perf.data output file - for
6  * later analysis via perf report.
7  */
8 #define _FILE_OFFSET_BITS 64
9 
10 #include "builtin.h"
11 
12 #include "perf.h"
13 
14 #include "util/build-id.h"
15 #include "util/util.h"
16 #include "util/parse-options.h"
17 #include "util/parse-events.h"
18 
19 #include "util/header.h"
20 #include "util/event.h"
21 #include "util/evlist.h"
22 #include "util/evsel.h"
23 #include "util/debug.h"
24 #include "util/session.h"
25 #include "util/tool.h"
26 #include "util/symbol.h"
27 #include "util/cpumap.h"
28 #include "util/thread_map.h"
29 
30 #include <unistd.h>
31 #include <sched.h>
32 #include <sys/mman.h>
33 
/*
 * What to do with an existing perf.data output file:
 * overwrite it (-f) or append to it (-A).
 */
enum write_mode_t {
	WRITE_FORCE,
	WRITE_APPEND
};
38 
/*
 * State for one 'perf record' session: the perf_tool callbacks, the
 * parsed command line options and the bookkeeping needed while the
 * perf.data file is being written.
 */
struct perf_record {
	struct perf_tool	tool;		/* event-processing callbacks */
	struct perf_record_opts	opts;		/* parsed command line options */
	u64			bytes_written;	/* event payload written so far */
	const char		*output_name;	/* -o name, NULL until defaulted */
	struct perf_evlist	*evlist;	/* events being recorded */
	struct perf_session	*session;
	const char		*progname;	/* argv[0], used by psignal() */
	const char		*uid_str;	/* raw --uid argument, parsed later */
	int			output;		/* output file descriptor */
	unsigned int		page_size;	/* sysconf(_SC_PAGE_SIZE) */
	int			realtime_prio;	/* -r SCHED_FIFO priority, 0 = off */
	enum write_mode_t	write_mode;	/* overwrite vs append */
	bool			no_buildid;	/* -B: no buildids in perf.data */
	bool			no_buildid_cache; /* -N: don't update the cache */
	bool			force;		/* -f given (deprecated) */
	bool			file_new;	/* false when appending */
	bool			append_file;	/* -A given */
	long			samples;	/* mmap reads that found data */
	off_t			post_processing_offset;	/* start of event data */
};
60 
/*
 * Account for 'size' bytes written to the output fd by somebody else
 * (e.g. perf_event__synthesize_tracing_data() writing directly), so
 * the header's data_size bookkeeping stays correct.
 */
static void advance_output(struct perf_record *rec, size_t size)
{
	rec->bytes_written += size;
}
65 
66 static void write_output(struct perf_record *rec, void *buf, size_t size)
67 {
68 	while (size) {
69 		int ret = write(rec->output, buf, size);
70 
71 		if (ret < 0)
72 			die("failed to write");
73 
74 		size -= ret;
75 		buf += ret;
76 
77 		rec->bytes_written += ret;
78 	}
79 }
80 
/*
 * perf_tool callback: forward a synthesized event straight to the
 * output file.  Always returns 0 (write errors die() internally).
 */
static int process_synthesized_event(struct perf_tool *tool,
				     union perf_event *event,
				     struct perf_sample *sample __used,
				     struct machine *machine __used)
{
	struct perf_record *rec = container_of(tool, struct perf_record, tool);
	write_output(rec, event, event->header.size);
	return 0;
}
90 
/*
 * Drain all new data from one mmap'ed ring buffer into the output
 * file, then advance the tail pointer so the kernel can reuse the
 * space.
 */
static void perf_record__mmap_read(struct perf_record *rec,
				   struct perf_mmap *md)
{
	unsigned int head = perf_mmap__read_head(md);
	unsigned int old = md->prev;
	/* the data area starts one page after the control page */
	unsigned char *data = md->base + rec->page_size;
	unsigned long size;
	void *buf;

	/* nothing new since the last drain */
	if (old == head)
		return;

	rec->samples++;

	size = head - old;

	/*
	 * If the new data wraps around the end of the ring buffer,
	 * write the chunk up to the buffer end first ...
	 */
	if ((old & md->mask) + size != (head & md->mask)) {
		buf = &data[old & md->mask];
		size = md->mask + 1 - (old & md->mask);
		old += size;

		write_output(rec, buf, size);
	}

	/* ... then the (remaining) chunk from the buffer start. */
	buf = &data[old & md->mask];
	size = head - old;
	old += size;

	write_output(rec, buf, size);

	md->prev = old;
	perf_mmap__write_tail(md, old);
}
124 
/* Set asynchronously by sig_handler(), polled by the main record loop. */
static volatile int done = 0;
static volatile int signr = -1;		/* last signal received, -1 = none */
static volatile int child_finished = 0;	/* SIGCHLD seen for the workload */
128 
129 static void sig_handler(int sig)
130 {
131 	if (sig == SIGCHLD)
132 		child_finished = 1;
133 
134 	done = 1;
135 	signr = sig;
136 }
137 
/*
 * on_exit() handler: reap (and if needed terminate) the forked
 * workload, then re-raise the fatal signal with the default handler
 * so the shell sees the real termination status.
 */
static void perf_record__sig_exit(int exit_status __used, void *arg)
{
	struct perf_record *rec = arg;
	int status;

	if (rec->evlist->workload.pid > 0) {
		if (!child_finished)
			kill(rec->evlist->workload.pid, SIGTERM);

		wait(&status);
		if (WIFSIGNALED(status))
			psignal(WTERMSIG(status), rec->progname);
	}

	/* normal exit, or SIGUSR1 which means "stop but keep the data" */
	if (signr == -1 || signr == SIGUSR1)
		return;

	signal(signr, SIG_DFL);
	kill(getpid(), signr);
}
158 
159 static bool perf_evlist__equal(struct perf_evlist *evlist,
160 			       struct perf_evlist *other)
161 {
162 	struct perf_evsel *pos, *pair;
163 
164 	if (evlist->nr_entries != other->nr_entries)
165 		return false;
166 
167 	pair = list_entry(other->entries.next, struct perf_evsel, node);
168 
169 	list_for_each_entry(pos, &evlist->entries, node) {
170 		if (memcmp(&pos->attr, &pair->attr, sizeof(pos->attr) != 0))
171 			return false;
172 		pair = list_entry(pair->node.next, struct perf_evsel, node);
173 	}
174 
175 	return true;
176 }
177 
/*
 * Open a counter for every event in the evlist on the configured
 * cpus/threads, falling back for older kernels (no exclude_guest, no
 * sample_id_all, no hardware cycles), then mmap the ring buffers and
 * apply the event filters.  Any unrecoverable failure exits the
 * process; on return everything is ready for recording.
 */
static void perf_record__open(struct perf_record *rec)
{
	struct perf_evsel *pos, *first;
	struct perf_evlist *evlist = rec->evlist;
	struct perf_session *session = rec->session;
	struct perf_record_opts *opts = &rec->opts;

	first = list_entry(evlist->entries.next, struct perf_evsel, node);

	perf_evlist__config_attrs(evlist, opts);

	list_for_each_entry(pos, &evlist->entries, node) {
		struct perf_event_attr *attr = &pos->attr;
		struct xyarray *group_fd = NULL;
		/*
		 * Check if parse_single_tracepoint_event has already asked for
		 * PERF_SAMPLE_TIME.
		 *
		 * XXX this is kludgy but short term fix for problems introduced by
		 * eac23d1c that broke 'perf script' by having different sample_types
		 * when using multiple tracepoint events when we use a perf binary
		 * that tries to use sample_id_all on an older kernel.
		 *
		 * We need to move counter creation to perf_session, support
		 * different sample_types, etc.
		 */
		bool time_needed = attr->sample_type & PERF_SAMPLE_TIME;

		/* group all counters under the first event's fd */
		if (opts->group && pos != first)
			group_fd = first->fd;
fallback_missing_features:
		if (opts->exclude_guest_missing)
			attr->exclude_guest = attr->exclude_host = 0;
retry_sample_id:
		attr->sample_id_all = opts->sample_id_all_missing ? 0 : 1;
try_again:
		if (perf_evsel__open(pos, evlist->cpus, evlist->threads,
				     opts->group, group_fd) < 0) {
			int err = errno;

			if (err == EPERM || err == EACCES) {
				ui__error_paranoid();
				exit(EXIT_FAILURE);
			} else if (err ==  ENODEV && opts->cpu_list) {
				die("No such device - did you specify"
					" an out-of-range profile CPU?\n");
			} else if (err == EINVAL) {
				/* strip features one by one for old kernels */
				if (!opts->exclude_guest_missing &&
				    (attr->exclude_guest || attr->exclude_host)) {
					pr_debug("Old kernel, cannot exclude "
						 "guest or host samples.\n");
					opts->exclude_guest_missing = true;
					goto fallback_missing_features;
				} else if (!opts->sample_id_all_missing) {
					/*
					 * Old kernel, no attr->sample_id_type_all field
					 */
					opts->sample_id_all_missing = true;
					if (!opts->sample_time && !opts->raw_samples && !time_needed)
						attr->sample_type &= ~PERF_SAMPLE_TIME;

					goto retry_sample_id;
				}
			}

			/*
			 * If it's cycles then fall back to hrtimer
			 * based cpu-clock-tick sw counter, which
			 * is always available even if no PMU support:
			 */
			if (attr->type == PERF_TYPE_HARDWARE
					&& attr->config == PERF_COUNT_HW_CPU_CYCLES) {

				if (verbose)
					ui__warning("The cycles event is not supported, "
						    "trying to fall back to cpu-clock-ticks\n");
				attr->type = PERF_TYPE_SOFTWARE;
				attr->config = PERF_COUNT_SW_CPU_CLOCK;
				goto try_again;
			}

			if (err == ENOENT) {
				ui__warning("The %s event is not supported.\n",
					    event_name(pos));
				exit(EXIT_FAILURE);
			}

			printf("\n");
			error("sys_perf_event_open() syscall returned with %d (%s).  /bin/dmesg may provide additional information.\n",
			      err, strerror(err));

#if defined(__i386__) || defined(__x86_64__)
			if (attr->type == PERF_TYPE_HARDWARE && err == EOPNOTSUPP)
				die("No hardware sampling interrupt available."
				    " No APIC? If so then you can boot the kernel"
				    " with the \"lapic\" boot parameter to"
				    " force-enable it.\n");
#endif

			die("No CONFIG_PERF_EVENTS=y kernel support configured?\n");
		}
	}

	if (perf_evlist__set_filters(evlist)) {
		error("failed to set filter with %d (%s)\n", errno,
			strerror(errno));
		exit(-1);
	}

	if (perf_evlist__mmap(evlist, opts->mmap_pages, false) < 0) {
		if (errno == EPERM)
			die("Permission error mapping pages.\n"
			    "Consider increasing "
			    "/proc/sys/kernel/perf_event_mlock_kb,\n"
			    "or try again with a smaller value of -m/--mmap_pages.\n"
			    "(current value: %d)\n", opts->mmap_pages);
		else if (!is_power_of_2(opts->mmap_pages))
			die("--mmap_pages/-m value must be a power of two.");

		die("failed to mmap with %d (%s)\n", errno, strerror(errno));
	}

	/* when appending, the events must match the ones already in the file */
	if (rec->file_new)
		session->evlist = evlist;
	else {
		if (!perf_evlist__equal(session->evlist, evlist)) {
			fprintf(stderr, "incompatible append\n");
			exit(-1);
		}
	}

	perf_session__update_sample_type(session);
}
311 
/*
 * Re-read the event data already written to the output file and mark
 * the DSOs that got hits, so only their build-ids end up in the
 * perf.data header.  Returns 0 when the file is empty, otherwise the
 * result of processing the events.
 */
static int process_buildids(struct perf_record *rec)
{
	/* current file position == total size written so far */
	/* NOTE(review): lseek failure (-1) would wrap to a huge u64 here
	 * instead of being reported -- presumably cannot fail on a
	 * regular output fd; verify. */
	u64 size = lseek(rec->output, 0, SEEK_CUR);

	if (size == 0)
		return 0;

	rec->session->fd = rec->output;
	return __perf_session__process_events(rec->session, rec->post_processing_offset,
					      size - rec->post_processing_offset,
					      size, &build_id__mark_dso_hit_ops);
}
324 
/*
 * on_exit() handler: finalize the perf.data file (sizes, build-ids,
 * header) and release the session/evlist.  Nothing to do when the
 * output went to a pipe, since a pipe has no seekable header.
 */
static void perf_record__exit(int status __used, void *arg)
{
	struct perf_record *rec = arg;

	if (!rec->opts.pipe_output) {
		rec->session->header.data_size += rec->bytes_written;

		if (!rec->no_buildid)
			process_buildids(rec);
		perf_session__write_header(rec->session, rec->evlist,
					   rec->output, true);
		perf_session__delete(rec->session);
		perf_evlist__delete(rec->evlist);
		symbol__exit();
	}
}
341 
/*
 * Machine iterator callback: synthesize module and kernel mmap events
 * for one guest machine (the host is handled separately in
 * __cmd_record()).
 */
static void perf_event__synthesize_guest_os(struct machine *machine, void *data)
{
	int err;
	struct perf_tool *tool = data;

	if (machine__is_host(machine))
		return;

	/*
	 * As for guest kernel when processing subcommand record&report,
	 * we arrange module mmap prior to guest kernel mmap and trigger
	 * a preload dso because default guest module symbols are loaded
	 * from guest kallsyms instead of /lib/modules/XXX/XXX. This
	 * method is used to avoid symbol missing when the first addr is
	 * in module instead of in guest kernel.
	 */
	err = perf_event__synthesize_modules(tool, process_synthesized_event,
					     machine);
	if (err < 0)
		pr_err("Couldn't record guest kernel [%d]'s reference"
		       " relocation symbol.\n", machine->pid);

	/*
	 * We use _stext for guest kernel because guest kernel's /proc/kallsyms
	 * have no _text sometimes.
	 */
	err = perf_event__synthesize_kernel_mmap(tool, process_synthesized_event,
						 machine, "_text");
	if (err < 0)
		err = perf_event__synthesize_kernel_mmap(tool, process_synthesized_event,
							 machine, "_stext");
	if (err < 0)
		pr_err("Couldn't record guest kernel [%d]'s reference"
		       " relocation symbol.\n", machine->pid);
}
377 
/*
 * Marker written after each full pass over the mmap buffers so the
 * report side can flush its event-reordering queue at stable points.
 */
static struct perf_event_header finished_round_event = {
	.size = sizeof(struct perf_event_header),
	.type = PERF_RECORD_FINISHED_ROUND,
};
382 
383 static void perf_record__mmap_read_all(struct perf_record *rec)
384 {
385 	int i;
386 
387 	for (i = 0; i < rec->evlist->nr_mmaps; i++) {
388 		if (rec->evlist->mmap[i].base)
389 			perf_record__mmap_read(rec, &rec->evlist->mmap[i]);
390 	}
391 
392 	if (perf_header__has_feat(&rec->session->header, HEADER_TRACE_INFO))
393 		write_output(rec, &finished_round_event, sizeof(finished_round_event));
394 }
395 
/*
 * The actual record session: set up the output file and session,
 * open the counters, synthesize the initial metadata events (attrs,
 * kernel/module/thread maps), then loop draining the mmap buffers
 * until the workload exits or a signal asks us to stop.
 *
 * Returns 0 on success or a negative error.  Note that final file
 * fixup happens in the perf_record__exit()/perf_record__sig_exit()
 * on_exit() handlers, not here.
 */
static int __cmd_record(struct perf_record *rec, int argc, const char **argv)
{
	struct stat st;
	int flags;
	int err, output, feat;
	unsigned long waking = 0;
	const bool forks = argc > 0;	/* remaining argv is the workload */
	struct machine *machine;
	struct perf_tool *tool = &rec->tool;
	struct perf_record_opts *opts = &rec->opts;
	struct perf_evlist *evsel_list = rec->evlist;
	const char *output_name = rec->output_name;
	struct perf_session *session;

	rec->progname = argv[0];

	rec->page_size = sysconf(_SC_PAGE_SIZE);

	on_exit(perf_record__sig_exit, rec);
	signal(SIGCHLD, sig_handler);
	signal(SIGINT, sig_handler);
	signal(SIGUSR1, sig_handler);

	/* default output: a pipe if stdout is one, else perf.data */
	if (!output_name) {
		if (!fstat(STDOUT_FILENO, &st) && S_ISFIFO(st.st_mode))
			opts->pipe_output = true;
		else
			rec->output_name = output_name = "perf.data";
	}
	if (output_name) {
		if (!strcmp(output_name, "-"))
			opts->pipe_output = true;
		else if (!stat(output_name, &st) && st.st_size) {
			/* keep the previous data around as <name>.old */
			if (rec->write_mode == WRITE_FORCE) {
				char oldname[PATH_MAX];
				snprintf(oldname, sizeof(oldname), "%s.old",
					 output_name);
				unlink(oldname);
				rename(output_name, oldname);
			}
		} else if (rec->write_mode == WRITE_APPEND) {
			/* nothing to append to: fall back to a fresh file */
			rec->write_mode = WRITE_FORCE;
		}
	}

	flags = O_CREAT|O_RDWR;
	if (rec->write_mode == WRITE_APPEND)
		rec->file_new = 0;
	else
		flags |= O_TRUNC;

	if (opts->pipe_output)
		output = STDOUT_FILENO;
	else
		output = open(output_name, flags, S_IRUSR | S_IWUSR);
	if (output < 0) {
		perror("failed to create output file");
		exit(-1);
	}

	rec->output = output;

	session = perf_session__new(output_name, O_WRONLY,
				    rec->write_mode == WRITE_FORCE, false, NULL);
	if (session == NULL) {
		pr_err("Not enough memory for reading perf file header\n");
		return -1;
	}

	rec->session = session;

	/* start with every header feature on, then clear what doesn't apply */
	for (feat = HEADER_FIRST_FEATURE; feat < HEADER_LAST_FEATURE; feat++)
		perf_header__set_feat(&session->header, feat);

	if (rec->no_buildid)
		perf_header__clear_feat(&session->header, HEADER_BUILD_ID);

	if (!have_tracepoints(&evsel_list->entries))
		perf_header__clear_feat(&session->header, HEADER_TRACE_INFO);

	if (!rec->opts.branch_stack)
		perf_header__clear_feat(&session->header, HEADER_BRANCH_STACK);

	if (!rec->file_new) {
		err = perf_session__read_header(session, output);
		if (err < 0)
			goto out_delete_session;
	}

	/* fork the workload now, but it only execs at start_workload() below */
	if (forks) {
		err = perf_evlist__prepare_workload(evsel_list, opts, argv);
		if (err < 0) {
			pr_err("Couldn't run the workload!\n");
			goto out_delete_session;
		}
	}

	perf_record__open(rec);

	/*
	 * perf_session__delete(session) will be called at perf_record__exit()
	 */
	on_exit(perf_record__exit, rec);

	if (opts->pipe_output) {
		err = perf_header__write_pipe(output);
		if (err < 0)
			return err;
	} else if (rec->file_new) {
		err = perf_session__write_header(session, evsel_list,
						 output, false);
		if (err < 0)
			return err;
	}

	if (!rec->no_buildid
	    && !perf_header__has_feat(&session->header, HEADER_BUILD_ID)) {
		pr_err("Couldn't generate buildids. "
		       "Use --no-buildid to profile anyway.\n");
		return -1;
	}

	/* event data starts here; process_buildids() reads from this offset */
	rec->post_processing_offset = lseek(output, 0, SEEK_CUR);

	machine = perf_session__find_host_machine(session);
	if (!machine) {
		pr_err("Couldn't find native kernel information.\n");
		return -1;
	}

	/*
	 * With a pipe there is no header to read back later, so the
	 * metadata must be synthesized into the event stream itself.
	 */
	if (opts->pipe_output) {
		err = perf_event__synthesize_attrs(tool, session,
						   process_synthesized_event);
		if (err < 0) {
			pr_err("Couldn't synthesize attrs.\n");
			return err;
		}

		err = perf_event__synthesize_event_types(tool, process_synthesized_event,
							 machine);
		if (err < 0) {
			pr_err("Couldn't synthesize event_types.\n");
			return err;
		}

		if (have_tracepoints(&evsel_list->entries)) {
			/*
			 * FIXME err <= 0 here actually means that
			 * there were no tracepoints so its not really
			 * an error, just that we don't need to
			 * synthesize anything.  We really have to
			 * return this more properly and also
			 * propagate errors that now are calling die()
			 */
			err = perf_event__synthesize_tracing_data(tool, output, evsel_list,
								  process_synthesized_event);
			if (err <= 0) {
				pr_err("Couldn't record tracing data.\n");
				return err;
			}
			advance_output(rec, err);
		}
	}

	/* host kernel map: try _text, fall back to _stext like the guest path */
	err = perf_event__synthesize_kernel_mmap(tool, process_synthesized_event,
						 machine, "_text");
	if (err < 0)
		err = perf_event__synthesize_kernel_mmap(tool, process_synthesized_event,
							 machine, "_stext");
	if (err < 0)
		pr_err("Couldn't record kernel reference relocation symbol\n"
		       "Symbol resolution may be skewed if relocation was used (e.g. kexec).\n"
		       "Check /proc/kallsyms permission or run as root.\n");

	err = perf_event__synthesize_modules(tool, process_synthesized_event,
					     machine);
	if (err < 0)
		pr_err("Couldn't record kernel module information.\n"
		       "Symbol resolution may be skewed if relocation was used (e.g. kexec).\n"
		       "Check /proc/modules permission or run as root.\n");

	if (perf_guest)
		perf_session__process_machines(session, tool,
					       perf_event__synthesize_guest_os);

	if (!opts->system_wide)
		perf_event__synthesize_thread_map(tool, evsel_list->threads,
						  process_synthesized_event,
						  machine);
	else
		perf_event__synthesize_threads(tool, process_synthesized_event,
					       machine);

	if (rec->realtime_prio) {
		struct sched_param param;

		param.sched_priority = rec->realtime_prio;
		if (sched_setscheduler(0, SCHED_FIFO, &param)) {
			pr_err("Could not set realtime priority.\n");
			exit(-1);
		}
	}

	perf_evlist__enable(evsel_list);

	/*
	 * Let the child rip
	 */
	if (forks)
		perf_evlist__start_workload(evsel_list);

	/* main loop: drain buffers, sleep in poll() when nothing arrived */
	for (;;) {
		int hits = rec->samples;

		perf_record__mmap_read_all(rec);

		if (hits == rec->samples) {
			if (done)
				break;
			err = poll(evsel_list->pollfd, evsel_list->nr_fds, -1);
			waking++;
		}

		/* disable counters but loop once more to drain the last data */
		if (done)
			perf_evlist__disable(evsel_list);
	}

	if (quiet || signr == SIGUSR1)
		return 0;

	fprintf(stderr, "[ perf record: Woken up %ld times to write data ]\n", waking);

	/*
	 * Approximate RIP event size: 24 bytes.
	 */
	fprintf(stderr,
		"[ perf record: Captured and wrote %.3f MB %s (~%" PRIu64 " samples) ]\n",
		(double)rec->bytes_written / 1024.0 / 1024.0,
		output_name,
		rec->bytes_written / 24);

	return 0;

out_delete_session:
	perf_session__delete(session);
	return err;
}
643 
/* Build one entry of the branch_modes table below. */
#define BRANCH_OPT(n, m) \
	{ .name = n, .mode = (m) }

/* Table terminator, detected by parse_branch_stack() via !br->name. */
#define BRANCH_END { .name = NULL }

/* Maps a -b/--branch-filter token to its PERF_SAMPLE_BRANCH_* bit. */
struct branch_mode {
	const char *name;
	int mode;
};

static const struct branch_mode branch_modes[] = {
	BRANCH_OPT("u", PERF_SAMPLE_BRANCH_USER),
	BRANCH_OPT("k", PERF_SAMPLE_BRANCH_KERNEL),
	BRANCH_OPT("hv", PERF_SAMPLE_BRANCH_HV),
	BRANCH_OPT("any", PERF_SAMPLE_BRANCH_ANY),
	BRANCH_OPT("any_call", PERF_SAMPLE_BRANCH_ANY_CALL),
	BRANCH_OPT("any_ret", PERF_SAMPLE_BRANCH_ANY_RETURN),
	BRANCH_OPT("ind_call", PERF_SAMPLE_BRANCH_IND_CALL),
	BRANCH_END
};
664 
/*
 * Option callback for -b/--branch-filter: parse a comma separated
 * list of branch_modes names into the PERF_SAMPLE_BRANCH_* bit mask
 * at opt->value.  Returns 0 on success, -1 on a bad token or if the
 * mask was already set by an earlier option.
 */
static int
parse_branch_stack(const struct option *opt, const char *str, int unset)
{
/* privilege-level-only bits; alone they don't select any branch type */
#define ONLY_PLM \
	(PERF_SAMPLE_BRANCH_USER	|\
	 PERF_SAMPLE_BRANCH_KERNEL	|\
	 PERF_SAMPLE_BRANCH_HV)

	uint64_t *mode = (uint64_t *)opt->value;
	const struct branch_mode *br;
	char *s, *os = NULL, *p;
	int ret = -1;

	if (unset)
		return 0;

	/*
	 * cannot set it twice, -b + --branch-filter for instance
	 */
	if (*mode)
		return -1;

	/* str may be NULL in case no arg is passed to -b */
	if (str) {
		/* because str is read-only */
		s = os = strdup(str);
		if (!s)
			return -1;

		/* walk the comma separated tokens in place */
		for (;;) {
			p = strchr(s, ',');
			if (p)
				*p = '\0';

			for (br = branch_modes; br->name; br++) {
				if (!strcasecmp(s, br->name))
					break;
			}
			if (!br->name) {
				ui__warning("unknown branch filter %s,"
					    " check man page\n", s);
				goto error;
			}

			*mode |= br->mode;

			if (!p)
				break;

			s = p + 1;
		}
	}
	ret = 0;

	/* default to any branch */
	if ((*mode & ~ONLY_PLM) == 0) {
		*mode = PERF_SAMPLE_BRANCH_ANY;
	}
error:
	free(os);
	return ret;
}
727 
/* Usage lines printed by usage_with_options() on bad invocation. */
static const char * const record_usage[] = {
	"perf record [<options>] [<command>]",
	"perf record [<options>] -- <command> [<options>]",
	NULL
};
733 
734 /*
735  * XXX Ideally would be local to cmd_record() and passed to a perf_record__new
736  * because we need to have access to it in perf_record__exit, that is called
737  * after cmd_record() exits, but since record_options need to be accessible to
738  * builtin-script, leave it here.
739  *
740  * At least we don't ouch it in all the other functions here directly.
741  *
742  * Just say no to tons of global variables, sigh.
743  */
static struct perf_record record = {
	.opts = {
		/* UINT_MAX / ULLONG_MAX mean "not specified by the user" */
		.mmap_pages	     = UINT_MAX,
		.user_freq	     = UINT_MAX,
		.user_interval	     = ULLONG_MAX,
		.freq		     = 1000,	/* default sampling frequency */
	},
	.write_mode = WRITE_FORCE,
	.file_new   = true,
};
754 
755 /*
756  * XXX Will stay a global variable till we fix builtin-script.c to stop messing
757  * with it and switch to use the library functions in perf_evlist that came
758  * from builtin-record.c, i.e. use perf_record_opts,
759  * perf_evlist__prepare_workload, etc instead of fork+exec'in 'perf record',
760  * using pipes, etc.
761  */
/* Command line options; non-static because builtin-script.c uses it too. */
const struct option record_options[] = {
	OPT_CALLBACK('e', "event", &record.evlist, "event",
		     "event selector. use 'perf list' to list available events",
		     parse_events_option),
	OPT_CALLBACK(0, "filter", &record.evlist, "filter",
		     "event filter", parse_filter),
	OPT_STRING('p', "pid", &record.opts.target_pid, "pid",
		    "record events on existing process id"),
	OPT_STRING('t', "tid", &record.opts.target_tid, "tid",
		    "record events on existing thread id"),
	OPT_INTEGER('r', "realtime", &record.realtime_prio,
		    "collect data with this RT SCHED_FIFO priority"),
	OPT_BOOLEAN('D', "no-delay", &record.opts.no_delay,
		    "collect data without buffering"),
	OPT_BOOLEAN('R', "raw-samples", &record.opts.raw_samples,
		    "collect raw sample records from all opened counters"),
	OPT_BOOLEAN('a', "all-cpus", &record.opts.system_wide,
			    "system-wide collection from all CPUs"),
	OPT_BOOLEAN('A', "append", &record.append_file,
			    "append to the output file to do incremental profiling"),
	OPT_STRING('C', "cpu", &record.opts.cpu_list, "cpu",
		    "list of cpus to monitor"),
	OPT_BOOLEAN('f', "force", &record.force,
			"overwrite existing data file (deprecated)"),
	OPT_U64('c', "count", &record.opts.user_interval, "event period to sample"),
	OPT_STRING('o', "output", &record.output_name, "file",
		    "output file name"),
	OPT_BOOLEAN('i', "no-inherit", &record.opts.no_inherit,
		    "child tasks do not inherit counters"),
	OPT_UINTEGER('F', "freq", &record.opts.user_freq, "profile at this frequency"),
	OPT_UINTEGER('m', "mmap-pages", &record.opts.mmap_pages,
		     "number of mmap data pages"),
	OPT_BOOLEAN(0, "group", &record.opts.group,
		    "put the counters into a counter group"),
	OPT_BOOLEAN('g', "call-graph", &record.opts.call_graph,
		    "do call-graph (stack chain/backtrace) recording"),
	OPT_INCR('v', "verbose", &verbose,
		    "be more verbose (show counter open errors, etc)"),
	OPT_BOOLEAN('q', "quiet", &quiet, "don't print any message"),
	OPT_BOOLEAN('s', "stat", &record.opts.inherit_stat,
		    "per thread counts"),
	OPT_BOOLEAN('d', "data", &record.opts.sample_address,
		    "Sample addresses"),
	OPT_BOOLEAN('T', "timestamp", &record.opts.sample_time, "Sample timestamps"),
	OPT_BOOLEAN('P', "period", &record.opts.period, "Sample period"),
	OPT_BOOLEAN('n', "no-samples", &record.opts.no_samples,
		    "don't sample"),
	OPT_BOOLEAN('N', "no-buildid-cache", &record.no_buildid_cache,
		    "do not update the buildid cache"),
	OPT_BOOLEAN('B', "no-buildid", &record.no_buildid,
		    "do not collect buildids in perf.data"),
	OPT_CALLBACK('G', "cgroup", &record.evlist, "name",
		     "monitor event in cgroup name only",
		     parse_cgroups),
	OPT_STRING('u', "uid", &record.uid_str, "user", "user to profile"),

	OPT_CALLBACK_NOOPT('b', "branch-any", &record.opts.branch_stack,
		     "branch any", "sample any taken branches",
		     parse_branch_stack),

	OPT_CALLBACK('j', "branch-filter", &record.opts.branch_stack,
		     "branch filter mask", "branch stack filter modes",
		     parse_branch_stack),
	OPT_END()
};
827 
/*
 * Entry point for 'perf record': parse and validate the options,
 * resolve targets (pid/tid/uid/cpus), pick the sampling period or
 * frequency, then hand off to __cmd_record().
 */
int cmd_record(int argc, const char **argv, const char *prefix __used)
{
	int err = -ENOMEM;
	struct perf_evsel *pos;
	struct perf_evlist *evsel_list;
	struct perf_record *rec = &record;

	perf_header__set_cmdline(argc, argv);

	evsel_list = perf_evlist__new(NULL, NULL);
	if (evsel_list == NULL)
		return -ENOMEM;

	rec->evlist = evsel_list;

	argc = parse_options(argc, argv, record_options, record_usage,
			    PARSE_OPT_STOP_AT_NON_OPTION);
	/* need something to record: a workload or an existing target */
	if (!argc && !rec->opts.target_pid && !rec->opts.target_tid &&
		!rec->opts.system_wide && !rec->opts.cpu_list && !rec->uid_str)
		usage_with_options(record_usage, record_options);

	if (rec->force && rec->append_file) {
		fprintf(stderr, "Can't overwrite and append at the same time."
				" You need to choose between -f and -A");
		usage_with_options(record_usage, record_options);
	} else if (rec->append_file) {
		rec->write_mode = WRITE_APPEND;
	} else {
		rec->write_mode = WRITE_FORCE;
	}

	if (nr_cgroups && !rec->opts.system_wide) {
		fprintf(stderr, "cgroup monitoring only available in"
			" system-wide mode\n");
		usage_with_options(record_usage, record_options);
	}

	symbol__init();

	if (symbol_conf.kptr_restrict)
		pr_warning(
"WARNING: Kernel address maps (/proc/{kallsyms,modules}) are restricted,\n"
"check /proc/sys/kernel/kptr_restrict.\n\n"
"Samples in kernel functions may not be resolved if a suitable vmlinux\n"
"file is not found in the buildid cache or in the vmlinux path.\n\n"
"Samples in kernel modules won't be resolved at all.\n\n"
"If some relocation was applied (e.g. kexec) symbols may be misresolved\n"
"even with a suitable vmlinux or kallsyms file.\n\n");

	if (rec->no_buildid_cache || rec->no_buildid)
		disable_buildid_cache();

	/* no -e given: fall back to the default event (cycles) */
	if (evsel_list->nr_entries == 0 &&
	    perf_evlist__add_default(evsel_list) < 0) {
		pr_err("Not enough memory for event selector list\n");
		goto out_symbol_exit;
	}

	rec->opts.uid = parse_target_uid(rec->uid_str, rec->opts.target_tid,
					 rec->opts.target_pid);
	/* NOTE(review): UINT_MAX - 1 looks like parse_target_uid()'s
	 * error sentinel (UINT_MAX meaning "no uid") -- confirm against
	 * its definition. */
	if (rec->uid_str != NULL && rec->opts.uid == UINT_MAX - 1)
		goto out_free_fd;

	if (rec->opts.target_pid)
		rec->opts.target_tid = rec->opts.target_pid;

	if (perf_evlist__create_maps(evsel_list, rec->opts.target_pid,
				     rec->opts.target_tid, rec->opts.uid,
				     rec->opts.cpu_list) < 0)
		usage_with_options(record_usage, record_options);

	list_for_each_entry(pos, &evsel_list->entries, node) {
		if (perf_header__push_event(pos->attr.config, event_name(pos)))
			goto out_free_fd;
	}

	if (rec->opts.user_interval != ULLONG_MAX)
		rec->opts.default_interval = rec->opts.user_interval;
	if (rec->opts.user_freq != UINT_MAX)
		rec->opts.freq = rec->opts.user_freq;

	/*
	 * User specified count overrides default frequency.
	 */
	if (rec->opts.default_interval)
		rec->opts.freq = 0;
	else if (rec->opts.freq) {
		rec->opts.default_interval = rec->opts.freq;
	} else {
		fprintf(stderr, "frequency and count are zero, aborting\n");
		err = -EINVAL;
		goto out_free_fd;
	}

	err = __cmd_record(&record, argc, argv);
out_free_fd:
	perf_evlist__delete_maps(evsel_list);
out_symbol_exit:
	symbol__exit();
	return err;
}
929