xref: /openbmc/linux/tools/perf/builtin-record.c (revision f35e839a)
/*
 * builtin-record.c
 *
 * Builtin record command: Record the profile of a workload
 * (or a CPU, or a PID) into the perf.data output file - for
 * later analysis via perf report.
 */
#include "builtin.h"

#include "perf.h"

#include "util/build-id.h"
#include "util/util.h"
#include "util/parse-options.h"
#include "util/parse-events.h"

#include "util/header.h"
#include "util/event.h"
#include "util/evlist.h"
#include "util/evsel.h"
#include "util/debug.h"
#include "util/session.h"
#include "util/tool.h"
#include "util/symbol.h"
#include "util/cpumap.h"
#include "util/thread_map.h"

#include <unistd.h>
#include <sched.h>
#include <sys/mman.h>

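/*
 * Fallback for C libraries that lack the non-standard on_exit():
 * unlike atexit() handlers, on_exit() handlers receive the exit
 * status. The exit() wrapper below captures the status in __exitcode
 * so that a single atexit() trampoline can forward it to every
 * registered handler, in registration order.
 */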
#ifndef HAVE_ON_EXIT
#ifndef ATEXIT_MAX
#define ATEXIT_MAX 32
#endif
static int __on_exit_count = 0;
typedef void (*on_exit_func_t) (int, void *);
static on_exit_func_t __on_exit_funcs[ATEXIT_MAX];
static void *__on_exit_args[ATEXIT_MAX];
static int __exitcode = 0;
static void __handle_on_exit_funcs(void);
static int on_exit(on_exit_func_t function, void *arg);
#define exit(x) (exit)(__exitcode = (x))

static int on_exit(on_exit_func_t function, void *arg)
{
	if (__on_exit_count == ATEXIT_MAX)
		return -ENOMEM;
	else if (__on_exit_count == 0)
		atexit(__handle_on_exit_funcs);
	__on_exit_funcs[__on_exit_count] = function;
	__on_exit_args[__on_exit_count++] = arg;
	return 0;
}

static void __handle_on_exit_funcs(void)
{
	int i;
	for (i = 0; i < __on_exit_count; i++)
		__on_exit_funcs[i] (__exitcode, __on_exit_args[i]);
}
#endif

enum write_mode_t {
	WRITE_FORCE,
	WRITE_APPEND
};

struct perf_record {
	struct perf_tool	tool;
	struct perf_record_opts	opts;
	u64			bytes_written;
	const char		*output_name;
	struct perf_evlist	*evlist;
	struct perf_session	*session;
	const char		*progname;
	int			output;
	unsigned int		page_size;
	int			realtime_prio;
	enum write_mode_t	write_mode;
	bool			no_buildid;
	bool			no_buildid_cache;
	bool			force;
	bool			file_new;
	bool			append_file;
	long			samples;
	off_t			post_processing_offset;
};

static void advance_output(struct perf_record *rec, size_t size)
{
	rec->bytes_written += size;
}

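/*
 * write(2) may write fewer bytes than requested, so loop until the
 * whole buffer has been flushed to the output file, accounting each
 * chunk in bytes_written.
 */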
static int write_output(struct perf_record *rec, void *buf, size_t size)
{
	while (size) {
		int ret = write(rec->output, buf, size);

		if (ret < 0) {
			pr_err("failed to write\n");
			return -1;
		}

		size -= ret;
		buf += ret;

		rec->bytes_written += ret;
	}

	return 0;
}

static int process_synthesized_event(struct perf_tool *tool,
				     union perf_event *event,
				     struct perf_sample *sample __maybe_unused,
				     struct machine *machine __maybe_unused)
{
	struct perf_record *rec = container_of(tool, struct perf_record, tool);
	if (write_output(rec, event, event->header.size) < 0)
		return -1;

	return 0;
}

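/*
 * Drain one mmap'ed ring buffer into the output file. The kernel
 * moves the head as it produces events; md->prev is how far we got
 * last time. If the unread region wraps around the end of the buffer
 * it is written out in two pieces, and only then is the tail
 * published back so the kernel may overwrite the consumed space.
 */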
static int perf_record__mmap_read(struct perf_record *rec,
				   struct perf_mmap *md)
{
	unsigned int head = perf_mmap__read_head(md);
	unsigned int old = md->prev;
	unsigned char *data = md->base + rec->page_size;
	unsigned long size;
	void *buf;
	int rc = 0;

	if (old == head)
		return 0;

	rec->samples++;

	size = head - old;

	if ((old & md->mask) + size != (head & md->mask)) {
		buf = &data[old & md->mask];
		size = md->mask + 1 - (old & md->mask);
		old += size;

		if (write_output(rec, buf, size) < 0) {
			rc = -1;
			goto out;
		}
	}

	buf = &data[old & md->mask];
	size = head - old;
	old += size;

	if (write_output(rec, buf, size) < 0) {
		rc = -1;
		goto out;
	}

	md->prev = old;
	perf_mmap__write_tail(md, old);

out:
	return rc;
}

static volatile int done = 0;
static volatile int signr = -1;
static volatile int child_finished = 0;

static void sig_handler(int sig)
{
	if (sig == SIGCHLD)
		child_finished = 1;

	done = 1;
	signr = sig;
}

static void perf_record__sig_exit(int exit_status __maybe_unused, void *arg)
{
	struct perf_record *rec = arg;
	int status;

	if (rec->evlist->workload.pid > 0) {
		if (!child_finished)
			kill(rec->evlist->workload.pid, SIGTERM);

		wait(&status);
		if (WIFSIGNALED(status))
			psignal(WTERMSIG(status), rec->progname);
	}

	if (signr == -1 || signr == SIGUSR1)
		return;

	signal(signr, SIG_DFL);
	kill(getpid(), signr);
}

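/*
 * Append mode (-A) only works if the events being recorded have
 * attributes identical, entry by entry, to those already described
 * in the header of the existing perf.data file.
 */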
static bool perf_evlist__equal(struct perf_evlist *evlist,
			       struct perf_evlist *other)
{
	struct perf_evsel *pos, *pair;

	if (evlist->nr_entries != other->nr_entries)
		return false;

	pair = perf_evlist__first(other);

	list_for_each_entry(pos, &evlist->entries, node) {
		if (memcmp(&pos->attr, &pair->attr, sizeof(pos->attr)) != 0)
			return false;
		pair = perf_evsel__next(pair);
	}

	return true;
}

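/*
 * Configure and open all counters, falling back to weaker attribute
 * settings when the running kernel rejects the preferred ones, then
 * apply any event filters and mmap the per-cpu ring buffers.
 */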
static int perf_record__open(struct perf_record *rec)
{
	char msg[512];
	struct perf_evsel *pos;
	struct perf_evlist *evlist = rec->evlist;
	struct perf_session *session = rec->session;
	struct perf_record_opts *opts = &rec->opts;
	int rc = 0;

	perf_evlist__config(evlist, opts);

	list_for_each_entry(pos, &evlist->entries, node) {
try_again:
		if (perf_evsel__open(pos, evlist->cpus, evlist->threads) < 0) {
			if (perf_evsel__fallback(pos, errno, msg, sizeof(msg))) {
				if (verbose)
					ui__warning("%s\n", msg);
				goto try_again;
			}

			rc = -errno;
			perf_evsel__open_strerror(pos, &opts->target,
						  errno, msg, sizeof(msg));
			ui__error("%s\n", msg);
			goto out;
		}
	}

	if (perf_evlist__apply_filters(evlist)) {
		error("failed to set filter with %d (%s)\n", errno,
			strerror(errno));
		rc = -1;
		goto out;
	}

	if (perf_evlist__mmap(evlist, opts->mmap_pages, false) < 0) {
		if (errno == EPERM) {
			pr_err("Permission error mapping pages.\n"
			       "Consider increasing "
			       "/proc/sys/kernel/perf_event_mlock_kb,\n"
			       "or try again with a smaller value of -m/--mmap_pages.\n"
			       "(current value: %d)\n", opts->mmap_pages);
			rc = -errno;
		} else if (!is_power_of_2(opts->mmap_pages) &&
			   (opts->mmap_pages != UINT_MAX)) {
			pr_err("--mmap_pages/-m value must be a power of two.\n");
			rc = -EINVAL;
		} else {
			pr_err("failed to mmap with %d (%s)\n", errno, strerror(errno));
			rc = -errno;
		}
		goto out;
	}

	if (rec->file_new)
		session->evlist = evlist;
	else {
		if (!perf_evlist__equal(session->evlist, evlist)) {
			fprintf(stderr, "incompatible append\n");
			rc = -1;
			goto out;
		}
	}

	perf_session__set_id_hdr_size(session);
out:
	return rc;
}

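/*
 * Re-read the events just recorded to mark which DSOs got hits, so
 * that the header only carries build-ids for binaries that actually
 * show up in the profile.
 */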
static int process_buildids(struct perf_record *rec)
{
	u64 size = lseek(rec->output, 0, SEEK_CUR);

	if (size == 0)
		return 0;

	rec->session->fd = rec->output;
	return __perf_session__process_events(rec->session, rec->post_processing_offset,
					      size - rec->post_processing_offset,
					      size, &build_id__mark_dso_hit_ops);
}

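/*
 * on_exit() handler: on a successful run, fix up the data size in
 * the header, collect build-ids and rewrite the header with the
 * now-complete feature sections.
 */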
static void perf_record__exit(int status, void *arg)
{
	struct perf_record *rec = arg;

	if (status != 0)
		return;

	if (!rec->opts.pipe_output) {
		rec->session->header.data_size += rec->bytes_written;

		if (!rec->no_buildid)
			process_buildids(rec);
		perf_session__write_header(rec->session, rec->evlist,
					   rec->output, true);
		perf_session__delete(rec->session);
		perf_evlist__delete(rec->evlist);
		symbol__exit();
	}
}

static void perf_event__synthesize_guest_os(struct machine *machine, void *data)
{
	int err;
	struct perf_tool *tool = data;
	/*
	 * For a guest kernel, when processing the record and report
	 * subcommands we arrange the module mmaps prior to the guest
	 * kernel mmap and trigger a preload of the DSO, because by
	 * default guest module symbols are loaded from the guest
	 * kallsyms instead of /lib/modules/XXX/XXX. This avoids
	 * missing symbols when the first sampled address falls in a
	 * module instead of in the guest kernel proper.
	 */
	err = perf_event__synthesize_modules(tool, process_synthesized_event,
					     machine);
	if (err < 0)
		pr_err("Couldn't record guest kernel [%d]'s reference"
		       " relocation symbol.\n", machine->pid);

	/*
	 * We use _stext for the guest kernel because the guest kernel's
	 * /proc/kallsyms sometimes has no _text.
	 */
	err = perf_event__synthesize_kernel_mmap(tool, process_synthesized_event,
						 machine, "_text");
	if (err < 0)
		err = perf_event__synthesize_kernel_mmap(tool, process_synthesized_event,
							 machine, "_stext");
	if (err < 0)
		pr_err("Couldn't record guest kernel [%d]'s reference"
		       " relocation symbol.\n", machine->pid);
}

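/*
 * Synthetic marker written after each pass over all mmap buffers
 * (when tracing data is present). It gives the report side a point
 * at which buffered events may be sorted by time and flushed,
 * bounding the reordering window across per-cpu buffers.
 */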
static struct perf_event_header finished_round_event = {
	.size = sizeof(struct perf_event_header),
	.type = PERF_RECORD_FINISHED_ROUND,
};

static int perf_record__mmap_read_all(struct perf_record *rec)
{
	int i;
	int rc = 0;

	for (i = 0; i < rec->evlist->nr_mmaps; i++) {
		if (rec->evlist->mmap[i].base) {
			if (perf_record__mmap_read(rec, &rec->evlist->mmap[i]) != 0) {
				rc = -1;
				goto out;
			}
		}
	}

	if (perf_header__has_feat(&rec->session->header, HEADER_TRACING_DATA))
		rc = write_output(rec, &finished_round_event,
				  sizeof(finished_round_event));

out:
	return rc;
}

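/*
 * The core of 'perf record': set up the output file, session and
 * counters, synthesize events describing pre-existing state (kernel
 * and module mmaps, threads), then alternate between draining the
 * ring buffers and poll()ing until the workload exits or we are
 * interrupted.
 */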
static int __cmd_record(struct perf_record *rec, int argc, const char **argv)
{
	struct stat st;
	int flags;
	int err, output, feat;
	unsigned long waking = 0;
	const bool forks = argc > 0;
	struct machine *machine;
	struct perf_tool *tool = &rec->tool;
	struct perf_record_opts *opts = &rec->opts;
	struct perf_evlist *evsel_list = rec->evlist;
	const char *output_name = rec->output_name;
	struct perf_session *session;
	bool disabled = false;

	rec->progname = argv[0];

	rec->page_size = sysconf(_SC_PAGE_SIZE);

	on_exit(perf_record__sig_exit, rec);
	signal(SIGCHLD, sig_handler);
	signal(SIGINT, sig_handler);
	signal(SIGUSR1, sig_handler);

	if (!output_name) {
		if (!fstat(STDOUT_FILENO, &st) && S_ISFIFO(st.st_mode))
			opts->pipe_output = true;
		else
			rec->output_name = output_name = "perf.data";
	}
	if (output_name) {
		if (!strcmp(output_name, "-"))
			opts->pipe_output = true;
		else if (!stat(output_name, &st) && st.st_size) {
			if (rec->write_mode == WRITE_FORCE) {
				char oldname[PATH_MAX];
				snprintf(oldname, sizeof(oldname), "%s.old",
					 output_name);
				unlink(oldname);
				rename(output_name, oldname);
			}
		} else if (rec->write_mode == WRITE_APPEND) {
			rec->write_mode = WRITE_FORCE;
		}
	}

	flags = O_CREAT|O_RDWR;
	if (rec->write_mode == WRITE_APPEND)
		rec->file_new = 0;
	else
		flags |= O_TRUNC;

	if (opts->pipe_output)
		output = STDOUT_FILENO;
	else
		output = open(output_name, flags, S_IRUSR | S_IWUSR);
	if (output < 0) {
		perror("failed to create output file");
		return -1;
	}

	rec->output = output;

	session = perf_session__new(output_name, O_WRONLY,
				    rec->write_mode == WRITE_FORCE, false, NULL);
	if (session == NULL) {
		pr_err("Not enough memory for reading perf file header\n");
		return -1;
	}

	rec->session = session;

	for (feat = HEADER_FIRST_FEATURE; feat < HEADER_LAST_FEATURE; feat++)
		perf_header__set_feat(&session->header, feat);

	if (rec->no_buildid)
		perf_header__clear_feat(&session->header, HEADER_BUILD_ID);

	if (!have_tracepoints(&evsel_list->entries))
		perf_header__clear_feat(&session->header, HEADER_TRACING_DATA);

	if (!rec->opts.branch_stack)
		perf_header__clear_feat(&session->header, HEADER_BRANCH_STACK);

	if (!rec->file_new) {
		err = perf_session__read_header(session, output);
		if (err < 0)
			goto out_delete_session;
	}

	if (forks) {
		err = perf_evlist__prepare_workload(evsel_list, &opts->target,
						    argv, opts->pipe_output,
						    true);
		if (err < 0) {
			pr_err("Couldn't run the workload!\n");
			goto out_delete_session;
		}
	}

	if (perf_record__open(rec) != 0) {
		err = -1;
		goto out_delete_session;
	}

	if (!evsel_list->nr_groups)
		perf_header__clear_feat(&session->header, HEADER_GROUP_DESC);

	/*
	 * perf_session__delete(session) will be called at perf_record__exit().
	 */
	on_exit(perf_record__exit, rec);

	if (opts->pipe_output) {
		err = perf_header__write_pipe(output);
		if (err < 0)
			goto out_delete_session;
	} else if (rec->file_new) {
		err = perf_session__write_header(session, evsel_list,
						 output, false);
		if (err < 0)
			goto out_delete_session;
	}

	if (!rec->no_buildid
	    && !perf_header__has_feat(&session->header, HEADER_BUILD_ID)) {
		pr_err("Couldn't generate buildids. "
		       "Use --no-buildid to profile anyway.\n");
		err = -1;
		goto out_delete_session;
	}

	rec->post_processing_offset = lseek(output, 0, SEEK_CUR);

	machine = &session->machines.host;

	if (opts->pipe_output) {
		err = perf_event__synthesize_attrs(tool, session,
						   process_synthesized_event);
		if (err < 0) {
			pr_err("Couldn't synthesize attrs.\n");
			goto out_delete_session;
		}

		err = perf_event__synthesize_event_types(tool, process_synthesized_event,
							 machine);
		if (err < 0) {
			pr_err("Couldn't synthesize event_types.\n");
			goto out_delete_session;
		}

		if (have_tracepoints(&evsel_list->entries)) {
			/*
			 * FIXME err <= 0 here actually means that
			 * there were no tracepoints, so it's not really
			 * an error, just that we don't need to
			 * synthesize anything. We really have to
			 * return this more properly and also
			 * propagate errors that now are calling die().
			 */
			err = perf_event__synthesize_tracing_data(tool, output, evsel_list,
								  process_synthesized_event);
			if (err <= 0) {
				pr_err("Couldn't record tracing data.\n");
				goto out_delete_session;
			}
			advance_output(rec, err);
		}
	}

	err = perf_event__synthesize_kernel_mmap(tool, process_synthesized_event,
						 machine, "_text");
	if (err < 0)
		err = perf_event__synthesize_kernel_mmap(tool, process_synthesized_event,
							 machine, "_stext");
	if (err < 0)
		pr_err("Couldn't record kernel reference relocation symbol\n"
		       "Symbol resolution may be skewed if relocation was used (e.g. kexec).\n"
		       "Check /proc/kallsyms permission or run as root.\n");

	err = perf_event__synthesize_modules(tool, process_synthesized_event,
					     machine);
	if (err < 0)
		pr_err("Couldn't record kernel module information.\n"
		       "Symbol resolution may be skewed if relocation was used (e.g. kexec).\n"
		       "Check /proc/modules permission or run as root.\n");

	if (perf_guest) {
		machines__process_guests(&session->machines,
					 perf_event__synthesize_guest_os, tool);
	}

	if (perf_target__has_task(&opts->target))
		err = perf_event__synthesize_thread_map(tool, evsel_list->threads,
						  process_synthesized_event,
						  machine);
	else if (perf_target__has_cpu(&opts->target))
		err = perf_event__synthesize_threads(tool, process_synthesized_event,
					       machine);
	else /* command specified */
		err = 0;

	if (err != 0)
		goto out_delete_session;

	if (rec->realtime_prio) {
		struct sched_param param;

		param.sched_priority = rec->realtime_prio;
		if (sched_setscheduler(0, SCHED_FIFO, &param)) {
			pr_err("Could not set realtime priority.\n");
			err = -1;
			goto out_delete_session;
		}
	}

	/*
	 * When perf is starting the traced process, all the events
	 * (apart from group members) have enable_on_exec=1 set,
	 * so don't spoil it by prematurely enabling them.
	 */
	if (!perf_target__none(&opts->target))
		perf_evlist__enable(evsel_list);

	/*
	 * Let the child rip.
	 */
	if (forks)
		perf_evlist__start_workload(evsel_list);

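	/*
	 * 'hits' snapshots the sample count before draining: if no new
	 * samples arrived in this pass, block in poll() until the
	 * kernel signals fresh data or a signal handler sets 'done'.
	 */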
	for (;;) {
		int hits = rec->samples;

		if (perf_record__mmap_read_all(rec) < 0) {
			err = -1;
			goto out_delete_session;
		}

		if (hits == rec->samples) {
			if (done)
				break;
			err = poll(evsel_list->pollfd, evsel_list->nr_fds, -1);
			waking++;
		}

		/*
		 * When perf is starting the traced process, at the end events
		 * die with the process and we wait for that. Thus no need to
		 * disable events in this case.
		 */
		if (done && !disabled && !perf_target__none(&opts->target)) {
			perf_evlist__disable(evsel_list);
			disabled = true;
		}
	}

	if (quiet || signr == SIGUSR1)
		return 0;

	fprintf(stderr, "[ perf record: Woken up %ld times to write data ]\n", waking);

	/*
	 * Approximate RIP event size: 24 bytes.
	 */
	fprintf(stderr,
		"[ perf record: Captured and wrote %.3f MB %s (~%" PRIu64 " samples) ]\n",
		(double)rec->bytes_written / 1024.0 / 1024.0,
		output_name,
		rec->bytes_written / 24);

	return 0;

out_delete_session:
	perf_session__delete(session);
	return err;
}

#define BRANCH_OPT(n, m) \
	{ .name = n, .mode = (m) }

#define BRANCH_END { .name = NULL }

struct branch_mode {
	const char *name;
	int mode;
};

static const struct branch_mode branch_modes[] = {
	BRANCH_OPT("u", PERF_SAMPLE_BRANCH_USER),
	BRANCH_OPT("k", PERF_SAMPLE_BRANCH_KERNEL),
	BRANCH_OPT("hv", PERF_SAMPLE_BRANCH_HV),
	BRANCH_OPT("any", PERF_SAMPLE_BRANCH_ANY),
	BRANCH_OPT("any_call", PERF_SAMPLE_BRANCH_ANY_CALL),
	BRANCH_OPT("any_ret", PERF_SAMPLE_BRANCH_ANY_RETURN),
	BRANCH_OPT("ind_call", PERF_SAMPLE_BRANCH_IND_CALL),
	BRANCH_END
};

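/*
 * Parse the -b/-j branch stack filter: a comma-separated list of the
 * mode names above, OR'ed into a PERF_SAMPLE_BRANCH_* mask. If only
 * privilege-level bits (u/k/hv) are given, the branch type defaults
 * to "any".
 */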
static int
parse_branch_stack(const struct option *opt, const char *str, int unset)
{
#define ONLY_PLM \
	(PERF_SAMPLE_BRANCH_USER	|\
	 PERF_SAMPLE_BRANCH_KERNEL	|\
	 PERF_SAMPLE_BRANCH_HV)

	uint64_t *mode = (uint64_t *)opt->value;
	const struct branch_mode *br;
	char *s, *os = NULL, *p;
	int ret = -1;

	if (unset)
		return 0;

	/*
	 * cannot set it twice, -b + --branch-filter for instance
	 */
	if (*mode)
		return -1;

	/* str may be NULL in case no arg is passed to -b */
	if (str) {
		/* because str is read-only */
		s = os = strdup(str);
		if (!s)
			return -1;

		for (;;) {
			p = strchr(s, ',');
			if (p)
				*p = '\0';

			for (br = branch_modes; br->name; br++) {
				if (!strcasecmp(s, br->name))
					break;
			}
			if (!br->name) {
				ui__warning("unknown branch filter %s,"
					    " check man page\n", s);
				goto error;
			}

			*mode |= br->mode;

			if (!p)
				break;

			s = p + 1;
		}
	}
	ret = 0;

	/* default to any branch */
	if ((*mode & ~ONLY_PLM) == 0) {
		*mode = PERF_SAMPLE_BRANCH_ANY;
	}
error:
	free(os);
	return ret;
}

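/*
 * Sanity-check a user-supplied stack dump size: it must parse fully,
 * is rounded up to a multiple of u64, and is capped at USHRT_MAX
 * rounded down to a u64 multiple (the largest value the kernel is
 * expected to accept for a user stack dump).
 */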
#ifdef LIBUNWIND_SUPPORT
static int get_stack_size(char *str, unsigned long *_size)
{
	char *endptr;
	unsigned long size;
	unsigned long max_size = round_down(USHRT_MAX, sizeof(u64));

	size = strtoul(str, &endptr, 0);

	do {
		if (*endptr)
			break;

		size = round_up(size, sizeof(u64));
		if (!size || size > max_size)
			break;

		*_size = size;
		return 0;

	} while (0);

	pr_err("callchain: Incorrect stack dump size (max %ld): %s\n",
	       max_size, str);
	return -1;
}
#endif /* LIBUNWIND_SUPPORT */

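/*
 * Parse the -g/--call-graph argument: "fp" selects frame-pointer
 * based unwinding; with libunwind support built in, "dwarf[,size]"
 * additionally captures up to 'size' bytes of user stack per sample
 * so that callchains can be unwound after the fact.
 */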
int record_parse_callchain_opt(const struct option *opt,
			       const char *arg, int unset)
{
	struct perf_record_opts *opts = opt->value;
	char *tok, *name, *saveptr = NULL;
	char *buf;
	int ret = -1;

	/* --no-call-graph */
	if (unset)
		return 0;

	/* We specify a default option if none is provided. */
	BUG_ON(!arg);

	/* We need a buffer that we know we can write to. */
	buf = malloc(strlen(arg) + 1);
	if (!buf)
		return -ENOMEM;

	strcpy(buf, arg);

	tok = strtok_r((char *)buf, ",", &saveptr);
	name = tok ? : (char *)buf;

	do {
		/* Framepointer style */
		if (!strncmp(name, "fp", sizeof("fp"))) {
			if (!strtok_r(NULL, ",", &saveptr)) {
				opts->call_graph = CALLCHAIN_FP;
				ret = 0;
			} else
				pr_err("callchain: No more arguments "
				       "needed for -g fp\n");
			break;

#ifdef LIBUNWIND_SUPPORT
		/* Dwarf style */
		} else if (!strncmp(name, "dwarf", sizeof("dwarf"))) {
			const unsigned long default_stack_dump_size = 8192;

			ret = 0;
			opts->call_graph = CALLCHAIN_DWARF;
			opts->stack_dump_size = default_stack_dump_size;

			tok = strtok_r(NULL, ",", &saveptr);
			if (tok) {
				unsigned long size = 0;

				ret = get_stack_size(tok, &size);
				opts->stack_dump_size = size;
			}

			if (!ret)
				pr_debug("callchain: stack dump size %d\n",
					 opts->stack_dump_size);
#endif /* LIBUNWIND_SUPPORT */
		} else {
			pr_err("callchain: Unknown -g option "
			       "value: %s\n", arg);
			break;
		}

	} while (0);

	free(buf);

	if (!ret)
		pr_debug("callchain: type %d\n", opts->call_graph);

	return ret;
}

static const char * const record_usage[] = {
	"perf record [<options>] [<command>]",
	"perf record [<options>] -- <command> [<options>]",
	NULL
};

/*
 * XXX Ideally this would be local to cmd_record() and passed to a
 * perf_record__new(), because we need access to it in perf_record__exit(),
 * which is called after cmd_record() exits. But since record_options needs
 * to be accessible to builtin-script, leave it here.
 *
 * At least we don't touch it in all the other functions here directly.
 *
 * Just say no to tons of global variables, sigh.
 */
static struct perf_record record = {
	.opts = {
		.mmap_pages	     = UINT_MAX,
		.user_freq	     = UINT_MAX,
		.user_interval	     = ULLONG_MAX,
		.freq		     = 4000,
		.target		     = {
			.uses_mmap   = true,
		},
	},
	.write_mode = WRITE_FORCE,
	.file_new   = true,
};

#define CALLCHAIN_HELP "do call-graph (stack chain/backtrace) recording: "

#ifdef LIBUNWIND_SUPPORT
const char record_callchain_help[] = CALLCHAIN_HELP "[fp] dwarf";
#else
const char record_callchain_help[] = CALLCHAIN_HELP "[fp]";
#endif

/*
 * XXX Will stay a global variable till we fix builtin-script.c to stop messing
 * with it and switch to use the library functions in perf_evlist that came
 * from builtin-record.c, i.e. use perf_record_opts,
 * perf_evlist__prepare_workload, etc instead of fork+exec'ing 'perf record',
 * using pipes, etc.
 */
const struct option record_options[] = {
	OPT_CALLBACK('e', "event", &record.evlist, "event",
		     "event selector. use 'perf list' to list available events",
		     parse_events_option),
	OPT_CALLBACK(0, "filter", &record.evlist, "filter",
		     "event filter", parse_filter),
	OPT_STRING('p', "pid", &record.opts.target.pid, "pid",
		    "record events on existing process id"),
	OPT_STRING('t', "tid", &record.opts.target.tid, "tid",
		    "record events on existing thread id"),
	OPT_INTEGER('r', "realtime", &record.realtime_prio,
		    "collect data with this RT SCHED_FIFO priority"),
	OPT_BOOLEAN('D', "no-delay", &record.opts.no_delay,
		    "collect data without buffering"),
	OPT_BOOLEAN('R', "raw-samples", &record.opts.raw_samples,
		    "collect raw sample records from all opened counters"),
	OPT_BOOLEAN('a', "all-cpus", &record.opts.target.system_wide,
		    "system-wide collection from all CPUs"),
	OPT_BOOLEAN('A', "append", &record.append_file,
		    "append to the output file to do incremental profiling"),
	OPT_STRING('C', "cpu", &record.opts.target.cpu_list, "cpu",
		    "list of cpus to monitor"),
	OPT_BOOLEAN('f', "force", &record.force,
		    "overwrite existing data file (deprecated)"),
	OPT_U64('c', "count", &record.opts.user_interval, "event period to sample"),
	OPT_STRING('o', "output", &record.output_name, "file",
		    "output file name"),
	OPT_BOOLEAN('i', "no-inherit", &record.opts.no_inherit,
		    "child tasks do not inherit counters"),
	OPT_UINTEGER('F', "freq", &record.opts.user_freq, "profile at this frequency"),
	OPT_UINTEGER('m', "mmap-pages", &record.opts.mmap_pages,
		     "number of mmap data pages"),
	OPT_BOOLEAN(0, "group", &record.opts.group,
		    "put the counters into a counter group"),
	OPT_CALLBACK_DEFAULT('g', "call-graph", &record.opts,
			     "mode[,dump_size]", record_callchain_help,
			     &record_parse_callchain_opt, "fp"),
	OPT_INCR('v', "verbose", &verbose,
		    "be more verbose (show counter open errors, etc)"),
	OPT_BOOLEAN('q', "quiet", &quiet, "don't print any message"),
	OPT_BOOLEAN('s', "stat", &record.opts.inherit_stat,
		    "per thread counts"),
	OPT_BOOLEAN('d', "data", &record.opts.sample_address,
		    "Sample addresses"),
	OPT_BOOLEAN('T', "timestamp", &record.opts.sample_time, "Sample timestamps"),
	OPT_BOOLEAN('P', "period", &record.opts.period, "Sample period"),
	OPT_BOOLEAN('n', "no-samples", &record.opts.no_samples,
		    "don't sample"),
	OPT_BOOLEAN('N', "no-buildid-cache", &record.no_buildid_cache,
		    "do not update the buildid cache"),
	OPT_BOOLEAN('B', "no-buildid", &record.no_buildid,
		    "do not collect buildids in perf.data"),
	OPT_CALLBACK('G', "cgroup", &record.evlist, "name",
		     "monitor event in cgroup name only",
		     parse_cgroups),
	OPT_STRING('u', "uid", &record.opts.target.uid_str, "user",
		   "user to profile"),

	OPT_CALLBACK_NOOPT('b', "branch-any", &record.opts.branch_stack,
		     "branch any", "sample any taken branches",
		     parse_branch_stack),

	OPT_CALLBACK('j', "branch-filter", &record.opts.branch_stack,
		     "branch filter mask", "branch stack filter modes",
		     parse_branch_stack),
	OPT_BOOLEAN('W', "weight", &record.opts.sample_weight,
		    "sample by weight (on special events only)"),
	OPT_END()
};

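/*
 * Entry point for 'perf record': parses the options, validates the
 * target (pid/tid/cpu/uid), creates the cpu and thread maps, resolves
 * period vs. frequency settings and hands off to __cmd_record().
 */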
int cmd_record(int argc, const char **argv, const char *prefix __maybe_unused)
{
	int err = -ENOMEM;
	struct perf_evsel *pos;
	struct perf_evlist *evsel_list;
	struct perf_record *rec = &record;
	char errbuf[BUFSIZ];

	evsel_list = perf_evlist__new();
	if (evsel_list == NULL)
		return -ENOMEM;

	rec->evlist = evsel_list;

	argc = parse_options(argc, argv, record_options, record_usage,
			    PARSE_OPT_STOP_AT_NON_OPTION);
	if (!argc && perf_target__none(&rec->opts.target))
		usage_with_options(record_usage, record_options);

	if (rec->force && rec->append_file) {
		ui__error("Can't overwrite and append at the same time."
			  " You need to choose between -f and -A.\n");
		usage_with_options(record_usage, record_options);
	} else if (rec->append_file) {
		rec->write_mode = WRITE_APPEND;
	} else {
		rec->write_mode = WRITE_FORCE;
	}

	if (nr_cgroups && !rec->opts.target.system_wide) {
		ui__error("cgroup monitoring only available in"
			  " system-wide mode\n");
		usage_with_options(record_usage, record_options);
	}

	symbol__init();

	if (symbol_conf.kptr_restrict)
		pr_warning(
"WARNING: Kernel address maps (/proc/{kallsyms,modules}) are restricted,\n"
"check /proc/sys/kernel/kptr_restrict.\n\n"
"Samples in kernel functions may not be resolved if a suitable vmlinux\n"
"file is not found in the buildid cache or in the vmlinux path.\n\n"
"Samples in kernel modules won't be resolved at all.\n\n"
"If some relocation was applied (e.g. kexec) symbols may be misresolved\n"
"even with a suitable vmlinux or kallsyms file.\n\n");

	if (rec->no_buildid_cache || rec->no_buildid)
		disable_buildid_cache();

	if (evsel_list->nr_entries == 0 &&
	    perf_evlist__add_default(evsel_list) < 0) {
		pr_err("Not enough memory for event selector list\n");
		goto out_symbol_exit;
	}

	err = perf_target__validate(&rec->opts.target);
	if (err) {
		perf_target__strerror(&rec->opts.target, err, errbuf, BUFSIZ);
		ui__warning("%s", errbuf);
	}

	err = perf_target__parse_uid(&rec->opts.target);
	if (err) {
		int saved_errno = errno;

		perf_target__strerror(&rec->opts.target, err, errbuf, BUFSIZ);
		ui__error("%s", errbuf);

		err = -saved_errno;
		goto out_symbol_exit;
	}

	err = -ENOMEM;
	if (perf_evlist__create_maps(evsel_list, &rec->opts.target) < 0)
		usage_with_options(record_usage, record_options);

	list_for_each_entry(pos, &evsel_list->entries, node) {
		if (perf_header__push_event(pos->attr.config, perf_evsel__name(pos)))
			goto out_free_fd;
	}

	if (rec->opts.user_interval != ULLONG_MAX)
		rec->opts.default_interval = rec->opts.user_interval;
	if (rec->opts.user_freq != UINT_MAX)
		rec->opts.freq = rec->opts.user_freq;

	/*
	 * User specified count overrides default frequency.
	 */
	if (rec->opts.default_interval)
		rec->opts.freq = 0;
	else if (rec->opts.freq) {
		rec->opts.default_interval = rec->opts.freq;
	} else {
		ui__error("frequency and count are zero, aborting\n");
		err = -EINVAL;
		goto out_free_fd;
	}

	err = __cmd_record(&record, argc, argv);

	perf_evlist__munmap(evsel_list);
	perf_evlist__close(evsel_list);
out_free_fd:
	perf_evlist__delete_maps(evsel_list);
out_symbol_exit:
	symbol__exit();
	return err;
}