/*
 * builtin-record.c
 *
 * Builtin record command: Record the profile of a workload
 * (or a CPU, or a PID) into the perf.data output file - for
 * later analysis via perf report.
 */
#define _FILE_OFFSET_BITS 64

#include "builtin.h"

#include "perf.h"

#include "util/build-id.h"
#include "util/util.h"
#include "util/parse-options.h"
#include "util/parse-events.h"

#include "util/header.h"
#include "util/event.h"
#include "util/evlist.h"
#include "util/evsel.h"
#include "util/debug.h"
#include "util/session.h"
#include "util/tool.h"
#include "util/symbol.h"
#include "util/cpumap.h"
#include "util/thread_map.h"

#include <unistd.h>
#include <sched.h>
#include <sys/mman.h>

enum write_mode_t {
	WRITE_FORCE,
	WRITE_APPEND
};

struct perf_record {
	struct perf_tool	tool;
	struct perf_record_opts	opts;
	u64			bytes_written;
	const char		*output_name;
	struct perf_evlist	*evlist;
	struct perf_session	*session;
	const char		*progname;
	int			output;
	unsigned int		page_size;
	int			realtime_prio;
	enum write_mode_t	write_mode;
	bool			no_buildid;
	bool			no_buildid_cache;
	bool			force;
	bool			file_new;
	bool			append_file;
	long			samples;
	off_t			post_processing_offset;
};

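/*
 * Account for bytes that something else already wrote to the output fd
 * (e.g. perf_event__synthesize_tracing_data() writes straight to the
 * file), so that the data size recorded in the header stays accurate.
 */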
static void advance_output(struct perf_record *rec, size_t size)
{
	rec->bytes_written += size;
}

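/*
 * write() may return having written less than requested, so loop until
 * the whole buffer has hit the output fd, accounting every chunk in
 * bytes_written.
 */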
static int write_output(struct perf_record *rec, void *buf, size_t size)
{
	while (size) {
		int ret = write(rec->output, buf, size);

		if (ret < 0) {
			pr_err("failed to write\n");
			return -1;
		}

		size -= ret;
		buf += ret;

		rec->bytes_written += ret;
	}

	return 0;
}

static int process_synthesized_event(struct perf_tool *tool,
				     union perf_event *event,
				     struct perf_sample *sample __maybe_unused,
				     struct machine *machine __maybe_unused)
{
	struct perf_record *rec = container_of(tool, struct perf_record, tool);
	if (write_output(rec, event, event->header.size) < 0)
		return -1;

	return 0;
}

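/*
 * Copy the new data out of a kernel mmap ring buffer into the output
 * file. 'head' is where the kernel has written up to, md->prev is where
 * the last read left off. If the fresh data wraps past the end of the
 * ring, it is written out in two chunks: first the piece up to the end
 * of the buffer, then the piece that wrapped around to the start.
 * Finally the tail pointer is advanced so the kernel may reuse the
 * consumed space.
 */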
static int perf_record__mmap_read(struct perf_record *rec,
				   struct perf_mmap *md)
{
	unsigned int head = perf_mmap__read_head(md);
	unsigned int old = md->prev;
	unsigned char *data = md->base + rec->page_size;
	unsigned long size;
	void *buf;
	int rc = 0;

	if (old == head)
		return 0;

	rec->samples++;

	size = head - old;

	if ((old & md->mask) + size != (head & md->mask)) {
		buf = &data[old & md->mask];
		size = md->mask + 1 - (old & md->mask);
		old += size;

		if (write_output(rec, buf, size) < 0) {
			rc = -1;
			goto out;
		}
	}

	buf = &data[old & md->mask];
	size = head - old;
	old += size;

	if (write_output(rec, buf, size) < 0) {
		rc = -1;
		goto out;
	}

	md->prev = old;
	perf_mmap__write_tail(md, old);

out:
	return rc;
}

static volatile int done = 0;
static volatile int signr = -1;
static volatile int child_finished = 0;

static void sig_handler(int sig)
{
	if (sig == SIGCHLD)
		child_finished = 1;

	done = 1;
	signr = sig;
}

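/*
 * on_exit() handler: reap the forked workload if it has not already
 * finished (SIGTERMing it first if need be), then re-raise the fatal
 * signal with its default disposition so that perf's own exit status
 * reflects how it died. SIGUSR1 is what the forked child uses to
 * report an exec failure, so it is deliberately not re-raised.
 */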
static void perf_record__sig_exit(int exit_status __maybe_unused, void *arg)
{
	struct perf_record *rec = arg;
	int status;

	if (rec->evlist->workload.pid > 0) {
		if (!child_finished)
			kill(rec->evlist->workload.pid, SIGTERM);

		wait(&status);
		if (WIFSIGNALED(status))
			psignal(WTERMSIG(status), rec->progname);
	}

	if (signr == -1 || signr == SIGUSR1)
		return;

	signal(signr, SIG_DFL);
	kill(getpid(), signr);
}

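/*
 * Appending to an existing perf.data file is only valid when the event
 * list being recorded matches the one already described in the file's
 * header, so compare the two lists pairwise by perf_event_attr.
 */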
static bool perf_evlist__equal(struct perf_evlist *evlist,
			       struct perf_evlist *other)
{
	struct perf_evsel *pos, *pair;

	if (evlist->nr_entries != other->nr_entries)
		return false;

	pair = perf_evlist__first(other);

	list_for_each_entry(pos, &evlist->entries, node) {
		if (memcmp(&pos->attr, &pair->attr, sizeof(pos->attr)) != 0)
			return false;
		pair = perf_evsel__next(pair);
	}

	return true;
}

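/*
 * Open one counter per event for the configured cpus/threads, degrading
 * gracefully on older kernels (no exclude_guest/exclude_host, no
 * sample_id_all, the hardware cycles event falling back to the software
 * cpu-clock), then mmap the ring buffers.
 */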
static int perf_record__open(struct perf_record *rec)
{
	struct perf_evsel *pos;
	struct perf_evlist *evlist = rec->evlist;
	struct perf_session *session = rec->session;
	struct perf_record_opts *opts = &rec->opts;
	int rc = 0;

	perf_evlist__config_attrs(evlist, opts);

	if (opts->group)
		perf_evlist__set_leader(evlist);

	list_for_each_entry(pos, &evlist->entries, node) {
		struct perf_event_attr *attr = &pos->attr;
		/*
		 * Check if parse_single_tracepoint_event has already asked for
		 * PERF_SAMPLE_TIME.
		 *
		 * XXX this is kludgy but a short-term fix for problems introduced by
		 * eac23d1c that broke 'perf script' by having different sample_types
		 * when using multiple tracepoint events when we use a perf binary
		 * that tries to use sample_id_all on an older kernel.
		 *
		 * We need to move counter creation to perf_session, support
		 * different sample_types, etc.
		 */
		bool time_needed = attr->sample_type & PERF_SAMPLE_TIME;

fallback_missing_features:
		if (opts->exclude_guest_missing)
			attr->exclude_guest = attr->exclude_host = 0;
retry_sample_id:
		attr->sample_id_all = opts->sample_id_all_missing ? 0 : 1;
try_again:
		if (perf_evsel__open(pos, evlist->cpus, evlist->threads) < 0) {
			int err = errno;

			if (err == EPERM || err == EACCES) {
				ui__error_paranoid();
				rc = -err;
				goto out;
			} else if (err == ENODEV && opts->target.cpu_list) {
				pr_err("No such device - did you specify"
				       " an out-of-range profile CPU?\n");
				rc = -err;
				goto out;
			} else if (err == EINVAL) {
				if (!opts->exclude_guest_missing &&
				    (attr->exclude_guest || attr->exclude_host)) {
					pr_debug("Old kernel, cannot exclude "
						 "guest or host samples.\n");
					opts->exclude_guest_missing = true;
					goto fallback_missing_features;
				} else if (!opts->sample_id_all_missing) {
					/*
					 * Old kernel, no attr->sample_id_all field
					 */
					opts->sample_id_all_missing = true;
					if (!opts->sample_time && !opts->raw_samples && !time_needed)
						attr->sample_type &= ~PERF_SAMPLE_TIME;

					goto retry_sample_id;
				}
			}

			/*
			 * If it's cycles then fall back to hrtimer
			 * based cpu-clock-tick sw counter, which
			 * is always available even if no PMU support.
			 *
			 * PPC returns ENXIO until 2.6.37 (behavior changed
			 * with commit b0a873e).
			 */
			if ((err == ENOENT || err == ENXIO)
					&& attr->type == PERF_TYPE_HARDWARE
					&& attr->config == PERF_COUNT_HW_CPU_CYCLES) {

				if (verbose)
					ui__warning("The cycles event is not supported, "
						    "trying to fall back to cpu-clock-ticks\n");
				attr->type = PERF_TYPE_SOFTWARE;
				attr->config = PERF_COUNT_SW_CPU_CLOCK;
				if (pos->name) {
					free(pos->name);
					pos->name = NULL;
				}
				goto try_again;
			}

			if (err == ENOENT) {
				ui__error("The %s event is not supported.\n",
					  perf_evsel__name(pos));
				rc = -err;
				goto out;
			}

			printf("\n");
			error("sys_perf_event_open() syscall returned with %d "
			      "(%s) for event %s. /bin/dmesg may provide "
			      "additional information.\n",
			      err, strerror(err), perf_evsel__name(pos));

#if defined(__i386__) || defined(__x86_64__)
			if (attr->type == PERF_TYPE_HARDWARE &&
			    err == EOPNOTSUPP) {
				pr_err("No hardware sampling interrupt available."
				       " No APIC? If so then you can boot the kernel"
				       " with the \"lapic\" boot parameter to"
				       " force-enable it.\n");
				rc = -err;
				goto out;
			}
#endif

			pr_err("No CONFIG_PERF_EVENTS=y kernel support configured?\n");
			rc = -err;
			goto out;
		}
	}

	if (perf_evlist__apply_filters(evlist)) {
		error("failed to set filter with %d (%s)\n", errno,
			strerror(errno));
		rc = -1;
		goto out;
	}

	if (perf_evlist__mmap(evlist, opts->mmap_pages, false) < 0) {
		if (errno == EPERM) {
			pr_err("Permission error mapping pages.\n"
			       "Consider increasing "
			       "/proc/sys/kernel/perf_event_mlock_kb,\n"
			       "or try again with a smaller value of -m/--mmap_pages.\n"
			       "(current value: %d)\n", opts->mmap_pages);
			rc = -errno;
		} else if (!is_power_of_2(opts->mmap_pages)) {
			pr_err("--mmap_pages/-m value must be a power of two.\n");
			rc = -EINVAL;
		} else {
			pr_err("failed to mmap with %d (%s)\n", errno, strerror(errno));
			rc = -errno;
		}
		goto out;
	}

	if (rec->file_new)
		session->evlist = evlist;
	else {
		if (!perf_evlist__equal(session->evlist, evlist)) {
			fprintf(stderr, "incompatible append\n");
			rc = -1;
			goto out;
		}
	}

	perf_session__set_id_hdr_size(session);
out:
	return rc;
}

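/*
 * Re-read everything written between the end of the header
 * (post_processing_offset) and the current end of the file, marking the
 * DSOs that samples actually hit, so that only their build-ids get
 * written into the header features.
 */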
static int process_buildids(struct perf_record *rec)
{
	u64 size = lseek(rec->output, 0, SEEK_CUR);

	if (size == 0)
		return 0;

	rec->session->fd = rec->output;
	return __perf_session__process_events(rec->session, rec->post_processing_offset,
					      size - rec->post_processing_offset,
					      size, &build_id__mark_dso_hit_ops);
}

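/*
 * on_exit() handler for the successful-exit path: patch the final data
 * size into the header, collect build-ids and rewrite the header in
 * place, then tear down the session. None of this applies to pipe
 * output, which cannot be seeked.
 */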
static void perf_record__exit(int status, void *arg)
{
	struct perf_record *rec = arg;

	if (status != 0)
		return;

	if (!rec->opts.pipe_output) {
		rec->session->header.data_size += rec->bytes_written;

		if (!rec->no_buildid)
			process_buildids(rec);
		perf_session__write_header(rec->session, rec->evlist,
					   rec->output, true);
		perf_session__delete(rec->session);
		perf_evlist__delete(rec->evlist);
		symbol__exit();
	}
}

static void perf_event__synthesize_guest_os(struct machine *machine, void *data)
{
	int err;
	struct perf_tool *tool = data;

	if (machine__is_host(machine))
		return;

	/*
	 * For guest kernels, when processing the record & report
	 * subcommands we synthesize the module mmaps before the guest
	 * kernel mmap and trigger a dso preload, because by default
	 * guest module symbols are loaded from guest kallsyms instead
	 * of /lib/modules/XXX/XXX. This avoids missing symbols when the
	 * first sampled address falls in a module rather than in the
	 * guest kernel proper.
	 */
	err = perf_event__synthesize_modules(tool, process_synthesized_event,
					     machine);
	if (err < 0)
		pr_err("Couldn't record guest kernel [%d]'s reference"
		       " relocation symbol.\n", machine->pid);

	/*
	 * We use _stext for guest kernels because a guest kernel's
	 * /proc/kallsyms sometimes has no _text.
	 */
	err = perf_event__synthesize_kernel_mmap(tool, process_synthesized_event,
						 machine, "_text");
	if (err < 0)
		err = perf_event__synthesize_kernel_mmap(tool, process_synthesized_event,
							 machine, "_stext");
	if (err < 0)
		pr_err("Couldn't record guest kernel [%d]'s reference"
		       " relocation symbol.\n", machine->pid);
}

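/*
 * Emitted after every full pass over the mmap buffers (see
 * perf_record__mmap_read_all()). Report-side tools use these markers to
 * flush and re-sort their queued events by timestamp in bounded rounds
 * instead of having to buffer the whole file before sorting.
 */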
static struct perf_event_header finished_round_event = {
	.size = sizeof(struct perf_event_header),
	.type = PERF_RECORD_FINISHED_ROUND,
};

static int perf_record__mmap_read_all(struct perf_record *rec)
{
	int i;
	int rc = 0;

	for (i = 0; i < rec->evlist->nr_mmaps; i++) {
		if (rec->evlist->mmap[i].base) {
			if (perf_record__mmap_read(rec, &rec->evlist->mmap[i]) != 0) {
				rc = -1;
				goto out;
			}
		}
	}

	if (perf_header__has_feat(&rec->session->header, HEADER_TRACING_DATA))
		rc = write_output(rec, &finished_round_event,
				  sizeof(finished_round_event));

out:
	return rc;
}

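/*
 * The record session proper: install the signal and at-exit handlers,
 * create (or append to) the output file, write the file or pipe header,
 * synthesize the events that describe pre-existing state (kernel and
 * module mmaps, running threads, tracing data), then enable the
 * counters and enter the drain/poll loop.
 */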
static int __cmd_record(struct perf_record *rec, int argc, const char **argv)
{
	struct stat st;
	int flags;
	int err, output, feat;
	unsigned long waking = 0;
	const bool forks = argc > 0;
	struct machine *machine;
	struct perf_tool *tool = &rec->tool;
	struct perf_record_opts *opts = &rec->opts;
	struct perf_evlist *evsel_list = rec->evlist;
	const char *output_name = rec->output_name;
	struct perf_session *session;

	rec->progname = argv[0];

	rec->page_size = sysconf(_SC_PAGE_SIZE);

	on_exit(perf_record__sig_exit, rec);
	signal(SIGCHLD, sig_handler);
	signal(SIGINT, sig_handler);
	signal(SIGUSR1, sig_handler);

	if (!output_name) {
		if (!fstat(STDOUT_FILENO, &st) && S_ISFIFO(st.st_mode))
			opts->pipe_output = true;
		else
			rec->output_name = output_name = "perf.data";
	}
	if (output_name) {
		if (!strcmp(output_name, "-"))
			opts->pipe_output = true;
		else if (!stat(output_name, &st) && st.st_size) {
			if (rec->write_mode == WRITE_FORCE) {
				char oldname[PATH_MAX];
				snprintf(oldname, sizeof(oldname), "%s.old",
					 output_name);
				unlink(oldname);
				rename(output_name, oldname);
			}
		} else if (rec->write_mode == WRITE_APPEND) {
			rec->write_mode = WRITE_FORCE;
		}
	}

	flags = O_CREAT|O_RDWR;
	if (rec->write_mode == WRITE_APPEND)
		rec->file_new = 0;
	else
		flags |= O_TRUNC;

	if (opts->pipe_output)
		output = STDOUT_FILENO;
	else
		output = open(output_name, flags, S_IRUSR | S_IWUSR);
	if (output < 0) {
		perror("failed to create output file");
		return -1;
	}

	rec->output = output;

	session = perf_session__new(output_name, O_WRONLY,
				    rec->write_mode == WRITE_FORCE, false, NULL);
	if (session == NULL) {
		pr_err("Not enough memory for reading perf file header\n");
		return -1;
	}

	rec->session = session;

	for (feat = HEADER_FIRST_FEATURE; feat < HEADER_LAST_FEATURE; feat++)
		perf_header__set_feat(&session->header, feat);

	if (rec->no_buildid)
		perf_header__clear_feat(&session->header, HEADER_BUILD_ID);

	if (!have_tracepoints(&evsel_list->entries))
		perf_header__clear_feat(&session->header, HEADER_TRACING_DATA);

	if (!rec->opts.branch_stack)
		perf_header__clear_feat(&session->header, HEADER_BRANCH_STACK);

	if (!rec->file_new) {
		err = perf_session__read_header(session, output);
		if (err < 0)
			goto out_delete_session;
	}

	if (forks) {
		err = perf_evlist__prepare_workload(evsel_list, opts, argv);
		if (err < 0) {
			pr_err("Couldn't run the workload!\n");
			goto out_delete_session;
		}
	}

	if (perf_record__open(rec) != 0) {
		err = -1;
		goto out_delete_session;
	}

	/*
	 * perf_session__delete(session) will be called at perf_record__exit()
	 */
	on_exit(perf_record__exit, rec);

	if (opts->pipe_output) {
		err = perf_header__write_pipe(output);
		if (err < 0)
			goto out_delete_session;
	} else if (rec->file_new) {
		err = perf_session__write_header(session, evsel_list,
						 output, false);
		if (err < 0)
			goto out_delete_session;
	}

	if (!rec->no_buildid
	    && !perf_header__has_feat(&session->header, HEADER_BUILD_ID)) {
		pr_err("Couldn't generate buildids. "
		       "Use --no-buildid to profile anyway.\n");
		err = -1;
		goto out_delete_session;
	}

	rec->post_processing_offset = lseek(output, 0, SEEK_CUR);

	machine = perf_session__find_host_machine(session);
	if (!machine) {
		pr_err("Couldn't find native kernel information.\n");
		err = -1;
		goto out_delete_session;
	}

	if (opts->pipe_output) {
		err = perf_event__synthesize_attrs(tool, session,
						   process_synthesized_event);
		if (err < 0) {
			pr_err("Couldn't synthesize attrs.\n");
			goto out_delete_session;
		}

		err = perf_event__synthesize_event_types(tool, process_synthesized_event,
							 machine);
		if (err < 0) {
			pr_err("Couldn't synthesize event_types.\n");
			goto out_delete_session;
		}

		if (have_tracepoints(&evsel_list->entries)) {
			/*
			 * FIXME err <= 0 here actually means that
			 * there were no tracepoints so it's not really
			 * an error, just that we don't need to
			 * synthesize anything.  We really have to
			 * return this more properly and also
			 * propagate errors that now are calling die()
			 */
			err = perf_event__synthesize_tracing_data(tool, output, evsel_list,
								  process_synthesized_event);
			if (err <= 0) {
				pr_err("Couldn't record tracing data.\n");
				goto out_delete_session;
			}
			advance_output(rec, err);
		}
	}

	err = perf_event__synthesize_kernel_mmap(tool, process_synthesized_event,
						 machine, "_text");
	if (err < 0)
		err = perf_event__synthesize_kernel_mmap(tool, process_synthesized_event,
							 machine, "_stext");
	if (err < 0)
		pr_err("Couldn't record kernel reference relocation symbol\n"
		       "Symbol resolution may be skewed if relocation was used (e.g. kexec).\n"
		       "Check /proc/kallsyms permission or run as root.\n");

	err = perf_event__synthesize_modules(tool, process_synthesized_event,
					     machine);
	if (err < 0)
		pr_err("Couldn't record kernel module information.\n"
		       "Symbol resolution may be skewed if relocation was used (e.g. kexec).\n"
		       "Check /proc/modules permission or run as root.\n");

	if (perf_guest)
		perf_session__process_machines(session, tool,
					       perf_event__synthesize_guest_os);

	if (!opts->target.system_wide)
		err = perf_event__synthesize_thread_map(tool, evsel_list->threads,
						  process_synthesized_event,
						  machine);
	else
		err = perf_event__synthesize_threads(tool, process_synthesized_event,
					       machine);

	if (err != 0)
		goto out_delete_session;

	if (rec->realtime_prio) {
		struct sched_param param;

		param.sched_priority = rec->realtime_prio;
		if (sched_setscheduler(0, SCHED_FIFO, &param)) {
			pr_err("Could not set realtime priority.\n");
			err = -1;
			goto out_delete_session;
		}
	}

	perf_evlist__enable(evsel_list);

	/*
	 * Let the child rip
	 */
	if (forks)
		perf_evlist__start_workload(evsel_list);

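	/*
	 * Drain every mmap buffer; if that produced no new samples and
	 * no signal has asked us to stop, block in poll() until a
	 * counter fd becomes readable. Once 'done' is set by a signal
	 * the counters are disabled, and the loop exits only after a
	 * final pass finds the buffers empty.
	 */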
	for (;;) {
		int hits = rec->samples;

		if (perf_record__mmap_read_all(rec) < 0) {
			err = -1;
			goto out_delete_session;
		}

		if (hits == rec->samples) {
			if (done)
				break;
			err = poll(evsel_list->pollfd, evsel_list->nr_fds, -1);
			waking++;
		}

		if (done)
			perf_evlist__disable(evsel_list);
	}

	if (quiet || signr == SIGUSR1)
		return 0;

	fprintf(stderr, "[ perf record: Woken up %ld times to write data ]\n", waking);

	/*
	 * Approximate RIP event size: 24 bytes.
	 */
	fprintf(stderr,
		"[ perf record: Captured and wrote %.3f MB %s (~%" PRIu64 " samples) ]\n",
		(double)rec->bytes_written / 1024.0 / 1024.0,
		output_name,
		rec->bytes_written / 24);

	return 0;

out_delete_session:
	perf_session__delete(session);
	return err;
}

#define BRANCH_OPT(n, m) \
	{ .name = n, .mode = (m) }

#define BRANCH_END { .name = NULL }

struct branch_mode {
	const char *name;
	int mode;
};

static const struct branch_mode branch_modes[] = {
	BRANCH_OPT("u", PERF_SAMPLE_BRANCH_USER),
	BRANCH_OPT("k", PERF_SAMPLE_BRANCH_KERNEL),
	BRANCH_OPT("hv", PERF_SAMPLE_BRANCH_HV),
	BRANCH_OPT("any", PERF_SAMPLE_BRANCH_ANY),
	BRANCH_OPT("any_call", PERF_SAMPLE_BRANCH_ANY_CALL),
	BRANCH_OPT("any_ret", PERF_SAMPLE_BRANCH_ANY_RETURN),
	BRANCH_OPT("ind_call", PERF_SAMPLE_BRANCH_IND_CALL),
	BRANCH_END
};

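/*
 * Parse a comma separated branch filter list, e.g. '-j any_call,u'
 * selects user-space call branches. 'u', 'k' and 'hv' are privilege
 * levels; if nothing beyond privilege levels was selected (including
 * the bare '-b' case, where str is NULL), the branch type defaults to
 * PERF_SAMPLE_BRANCH_ANY.
 */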
static int
parse_branch_stack(const struct option *opt, const char *str, int unset)
{
#define ONLY_PLM \
	(PERF_SAMPLE_BRANCH_USER	|\
	 PERF_SAMPLE_BRANCH_KERNEL	|\
	 PERF_SAMPLE_BRANCH_HV)

	uint64_t *mode = (uint64_t *)opt->value;
	const struct branch_mode *br;
	char *s, *os = NULL, *p;
	int ret = -1;

	if (unset)
		return 0;

	/*
	 * cannot set it twice, -b + --branch-filter for instance
	 */
	if (*mode)
		return -1;

	/* str may be NULL in case no arg is passed to -b */
	if (str) {
		/* because str is read-only */
		s = os = strdup(str);
		if (!s)
			return -1;

		for (;;) {
			p = strchr(s, ',');
			if (p)
				*p = '\0';

			for (br = branch_modes; br->name; br++) {
				if (!strcasecmp(s, br->name))
					break;
			}
			if (!br->name) {
				ui__warning("unknown branch filter %s,"
					    " check man page\n", s);
				goto error;
			}

			*mode |= br->mode;

			if (!p)
				break;

			s = p + 1;
		}
	}
	ret = 0;

	/* default to any branch */
	if ((*mode & ~ONLY_PLM) == 0) {
		*mode = PERF_SAMPLE_BRANCH_ANY;
	}
error:
	free(os);
	return ret;
}

#ifdef LIBUNWIND_SUPPORT
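/*
 * Validate the user supplied stack dump size for dwarf unwinding: it
 * must be a non-zero multiple of sizeof(u64) (the size is rounded up)
 * and stay below USHRT_MAX, matching what the kernel accepts in
 * attr->sample_stack_user.
 */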
static int get_stack_size(char *str, unsigned long *_size)
{
	char *endptr;
	unsigned long size;
	unsigned long max_size = round_down(USHRT_MAX, sizeof(u64));

	size = strtoul(str, &endptr, 0);

	do {
		if (*endptr)
			break;

		size = round_up(size, sizeof(u64));
		if (!size || size > max_size)
			break;

		*_size = size;
		return 0;

	} while (0);

	pr_err("callchain: Incorrect stack dump size (max %ld): %s\n",
	       max_size, str);
	return -1;
}
#endif /* LIBUNWIND_SUPPORT */

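/*
 * Parse the -g/--call-graph argument: '-g fp' selects frame-pointer
 * based unwinding; with libunwind support, '-g dwarf[,size]' records a
 * user stack dump of 'size' bytes (8192 by default) with each sample,
 * so that callchains can be unwound post factum from the dwarf CFI.
 */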
static int
parse_callchain_opt(const struct option *opt __maybe_unused, const char *arg,
		    int unset)
{
	struct perf_record *rec = (struct perf_record *)opt->value;
	char *tok, *name, *saveptr = NULL;
	char *buf;
	int ret = -1;

	/* --no-call-graph */
	if (unset)
		return 0;

	/* We specified a default option if none was provided. */
	BUG_ON(!arg);

	/* We need a buffer that we know we can write to. */
	buf = malloc(strlen(arg) + 1);
	if (!buf)
		return -ENOMEM;

	strcpy(buf, arg);

	tok = strtok_r((char *)buf, ",", &saveptr);
	name = tok ? : (char *)buf;

	do {
		/* Framepointer style */
		if (!strncmp(name, "fp", sizeof("fp"))) {
			if (!strtok_r(NULL, ",", &saveptr)) {
				rec->opts.call_graph = CALLCHAIN_FP;
				ret = 0;
			} else
				pr_err("callchain: No more arguments "
				       "needed for -g fp\n");
			break;

#ifdef LIBUNWIND_SUPPORT
		/* Dwarf style */
		} else if (!strncmp(name, "dwarf", sizeof("dwarf"))) {
			const unsigned long default_stack_dump_size = 8192;

			ret = 0;
			rec->opts.call_graph = CALLCHAIN_DWARF;
			rec->opts.stack_dump_size = default_stack_dump_size;

			tok = strtok_r(NULL, ",", &saveptr);
			if (tok) {
				unsigned long size = 0;

				ret = get_stack_size(tok, &size);
				rec->opts.stack_dump_size = size;
			}

			if (!ret)
				pr_debug("callchain: stack dump size %d\n",
					 rec->opts.stack_dump_size);
#endif /* LIBUNWIND_SUPPORT */
		} else {
			pr_err("callchain: Unknown -g option "
			       "value: %s\n", arg);
			break;
		}

	} while (0);

	free(buf);

	if (!ret)
		pr_debug("callchain: type %d\n", rec->opts.call_graph);

	return ret;
}

static const char * const record_usage[] = {
	"perf record [<options>] [<command>]",
	"perf record [<options>] -- <command> [<options>]",
	NULL
};

/*
 * XXX Ideally this would be local to cmd_record() and passed to a
 * perf_record__new(), because we need access to it in perf_record__exit(),
 * which is called after cmd_record() exits; but since record_options needs
 * to be accessible to builtin-script, leave it here.
 *
 * At least we don't touch it in all the other functions here directly.
 *
 * Just say no to tons of global variables, sigh.
 */
static struct perf_record record = {
	.opts = {
		.mmap_pages	     = UINT_MAX,
		.user_freq	     = UINT_MAX,
		.user_interval	     = ULLONG_MAX,
		.freq		     = 4000,
		.target		     = {
			.uses_mmap   = true,
		},
	},
	.write_mode = WRITE_FORCE,
	.file_new   = true,
};

#define CALLCHAIN_HELP "do call-graph (stack chain/backtrace) recording: "

#ifdef LIBUNWIND_SUPPORT
static const char callchain_help[] = CALLCHAIN_HELP "[fp] dwarf";
#else
static const char callchain_help[] = CALLCHAIN_HELP "[fp]";
#endif

/*
 * XXX Will stay a global variable till we fix builtin-script.c to stop messing
 * with it and switch to use the library functions in perf_evlist that came
 * from builtin-record.c, i.e. use perf_record_opts,
 * perf_evlist__prepare_workload, etc instead of fork+exec'ing 'perf record',
 * using pipes, etc.
 */
const struct option record_options[] = {
	OPT_CALLBACK('e', "event", &record.evlist, "event",
		     "event selector. use 'perf list' to list available events",
		     parse_events_option),
	OPT_CALLBACK(0, "filter", &record.evlist, "filter",
		     "event filter", parse_filter),
	OPT_STRING('p', "pid", &record.opts.target.pid, "pid",
		    "record events on existing process id"),
	OPT_STRING('t', "tid", &record.opts.target.tid, "tid",
		    "record events on existing thread id"),
	OPT_INTEGER('r', "realtime", &record.realtime_prio,
		    "collect data with this RT SCHED_FIFO priority"),
	OPT_BOOLEAN('D', "no-delay", &record.opts.no_delay,
		    "collect data without buffering"),
	OPT_BOOLEAN('R', "raw-samples", &record.opts.raw_samples,
		    "collect raw sample records from all opened counters"),
	OPT_BOOLEAN('a', "all-cpus", &record.opts.target.system_wide,
			    "system-wide collection from all CPUs"),
	OPT_BOOLEAN('A', "append", &record.append_file,
			    "append to the output file to do incremental profiling"),
	OPT_STRING('C', "cpu", &record.opts.target.cpu_list, "cpu",
		    "list of cpus to monitor"),
	OPT_BOOLEAN('f', "force", &record.force,
			"overwrite existing data file (deprecated)"),
	OPT_U64('c', "count", &record.opts.user_interval, "event period to sample"),
	OPT_STRING('o', "output", &record.output_name, "file",
		    "output file name"),
	OPT_BOOLEAN('i', "no-inherit", &record.opts.no_inherit,
		    "child tasks do not inherit counters"),
	OPT_UINTEGER('F', "freq", &record.opts.user_freq, "profile at this frequency"),
	OPT_UINTEGER('m', "mmap-pages", &record.opts.mmap_pages,
		     "number of mmap data pages"),
	OPT_BOOLEAN(0, "group", &record.opts.group,
		    "put the counters into a counter group"),
	OPT_CALLBACK_DEFAULT('g', "call-graph", &record, "mode[,dump_size]",
			     callchain_help, &parse_callchain_opt,
			     "fp"),
	OPT_INCR('v', "verbose", &verbose,
		    "be more verbose (show counter open errors, etc)"),
	OPT_BOOLEAN('q', "quiet", &quiet, "don't print any message"),
	OPT_BOOLEAN('s', "stat", &record.opts.inherit_stat,
		    "per thread counts"),
	OPT_BOOLEAN('d', "data", &record.opts.sample_address,
		    "Sample addresses"),
	OPT_BOOLEAN('T', "timestamp", &record.opts.sample_time, "Sample timestamps"),
	OPT_BOOLEAN('P', "period", &record.opts.period, "Sample period"),
	OPT_BOOLEAN('n', "no-samples", &record.opts.no_samples,
		    "don't sample"),
	OPT_BOOLEAN('N', "no-buildid-cache", &record.no_buildid_cache,
		    "do not update the buildid cache"),
	OPT_BOOLEAN('B', "no-buildid", &record.no_buildid,
		    "do not collect buildids in perf.data"),
	OPT_CALLBACK('G', "cgroup", &record.evlist, "name",
		     "monitor event in cgroup name only",
		     parse_cgroups),
	OPT_STRING('u', "uid", &record.opts.target.uid_str, "user",
		   "user to profile"),

	OPT_CALLBACK_NOOPT('b', "branch-any", &record.opts.branch_stack,
		     "branch any", "sample any taken branches",
		     parse_branch_stack),

	OPT_CALLBACK('j', "branch-filter", &record.opts.branch_stack,
		     "branch filter mask", "branch stack filter modes",
		     parse_branch_stack),
	OPT_END()
};

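/*
 * Entry point: parse the command line, validate the target
 * (pid/tid/cpu/uid combinations), create the cpu and thread maps,
 * reconcile -c/--count and -F/--freq (a user supplied period switches
 * frequency mode off), and hand control to __cmd_record().
 */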
int cmd_record(int argc, const char **argv, const char *prefix __maybe_unused)
{
	int err = -ENOMEM;
	struct perf_evsel *pos;
	struct perf_evlist *evsel_list;
	struct perf_record *rec = &record;
	char errbuf[BUFSIZ];

	evsel_list = perf_evlist__new(NULL, NULL);
	if (evsel_list == NULL)
		return -ENOMEM;

	rec->evlist = evsel_list;

	argc = parse_options(argc, argv, record_options, record_usage,
			    PARSE_OPT_STOP_AT_NON_OPTION);
	if (!argc && perf_target__none(&rec->opts.target))
		usage_with_options(record_usage, record_options);

	if (rec->force && rec->append_file) {
		ui__error("Can't overwrite and append at the same time."
			  " You need to choose between -f and -A\n");
		usage_with_options(record_usage, record_options);
	} else if (rec->append_file) {
		rec->write_mode = WRITE_APPEND;
	} else {
		rec->write_mode = WRITE_FORCE;
	}

	if (nr_cgroups && !rec->opts.target.system_wide) {
		ui__error("cgroup monitoring only available in"
			  " system-wide mode\n");
		usage_with_options(record_usage, record_options);
	}

	symbol__init();

	if (symbol_conf.kptr_restrict)
		pr_warning(
"WARNING: Kernel address maps (/proc/{kallsyms,modules}) are restricted,\n"
"check /proc/sys/kernel/kptr_restrict.\n\n"
"Samples in kernel functions may not be resolved if a suitable vmlinux\n"
"file is not found in the buildid cache or in the vmlinux path.\n\n"
"Samples in kernel modules won't be resolved at all.\n\n"
"If some relocation was applied (e.g. kexec) symbols may be misresolved\n"
"even with a suitable vmlinux or kallsyms file.\n\n");

	if (rec->no_buildid_cache || rec->no_buildid)
		disable_buildid_cache();

	if (evsel_list->nr_entries == 0 &&
	    perf_evlist__add_default(evsel_list) < 0) {
		pr_err("Not enough memory for event selector list\n");
		goto out_symbol_exit;
	}

	err = perf_target__validate(&rec->opts.target);
	if (err) {
		perf_target__strerror(&rec->opts.target, err, errbuf, BUFSIZ);
		ui__warning("%s", errbuf);
	}

	err = perf_target__parse_uid(&rec->opts.target);
	if (err) {
		int saved_errno = errno;

		perf_target__strerror(&rec->opts.target, err, errbuf, BUFSIZ);
		ui__error("%s", errbuf);

		err = -saved_errno;
		goto out_free_fd;
	}

	err = -ENOMEM;
	if (perf_evlist__create_maps(evsel_list, &rec->opts.target) < 0)
		usage_with_options(record_usage, record_options);

	list_for_each_entry(pos, &evsel_list->entries, node) {
		if (perf_header__push_event(pos->attr.config, perf_evsel__name(pos)))
			goto out_free_fd;
	}

	if (rec->opts.user_interval != ULLONG_MAX)
		rec->opts.default_interval = rec->opts.user_interval;
	if (rec->opts.user_freq != UINT_MAX)
		rec->opts.freq = rec->opts.user_freq;

	/*
	 * A user specified count overrides the default frequency.
	 */
	if (rec->opts.default_interval)
		rec->opts.freq = 0;
	else if (rec->opts.freq) {
		rec->opts.default_interval = rec->opts.freq;
	} else {
		ui__error("frequency and count are zero, aborting\n");
		err = -EINVAL;
		goto out_free_fd;
	}

	err = __cmd_record(&record, argc, argv);
out_free_fd:
	perf_evlist__delete_maps(evsel_list);
out_symbol_exit:
	symbol__exit();
	return err;
}