/*
 * builtin-record.c
 *
 * Builtin record command: Record the profile of a workload
 * (or a CPU, or a PID) into the perf.data output file - for
 * later analysis via perf report.
 */
#define _FILE_OFFSET_BITS 64

#include "builtin.h"

#include "perf.h"

#include "util/build-id.h"
#include "util/util.h"
#include "util/parse-options.h"
#include "util/parse-events.h"

#include "util/header.h"
#include "util/event.h"
#include "util/evlist.h"
#include "util/evsel.h"
#include "util/debug.h"
#include "util/session.h"
#include "util/symbol.h"
#include "util/cpumap.h"
#include "util/thread_map.h"

#include <unistd.h>
#include <sched.h>
#include <sys/mman.h>

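/*
 * Look up the file descriptor that perf_evsel__open() stored for a given
 * (event, cpu, thread) triple; evsel->fd is an xyarray indexed by CPU and
 * thread.
 */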
#define FD(e, x, y) (*(int *)xyarray__entry(e->fd, x, y))

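/*
 * WRITE_FORCE rotates any existing output file to <name>.old and starts
 * fresh; WRITE_APPEND (-A) adds to an existing file, which requires that
 * its event list match the new one (see perf_evlist__equal() below).
 */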
enum write_mode_t {
	WRITE_FORCE,
	WRITE_APPEND
};

static u64			user_interval			= ULLONG_MAX;
static u64			default_interval		=      0;

static unsigned int		page_size;
static unsigned int		mmap_pages			= UINT_MAX;
static unsigned int		user_freq			= UINT_MAX;
static int			freq				=   1000;
static int			output;
static int			pipe_output			=      0;
static const char		*output_name			= NULL;
static int			group				=      0;
static int			realtime_prio			=      0;
static bool			nodelay				=  false;
static bool			raw_samples			=  false;
static bool			sample_id_all_avail		=   true;
static bool			system_wide			=  false;
static pid_t			target_pid			=     -1;
static pid_t			target_tid			=     -1;
static pid_t			child_pid			=     -1;
static bool			no_inherit			=  false;
static enum write_mode_t	write_mode			= WRITE_FORCE;
static bool			call_graph			=  false;
static bool			inherit_stat			=  false;
static bool			no_samples			=  false;
static bool			sample_address			=  false;
static bool			sample_time			=  false;
static bool			no_buildid			=  false;
static bool			no_buildid_cache		=  false;
static struct perf_evlist	*evsel_list;

static long			samples				=      0;
static u64			bytes_written			=      0;

static int			file_new			=      1;
static off_t			post_processing_offset;

static struct perf_session	*session;
static const char		*cpu_list;

static void advance_output(size_t size)
{
	bytes_written += size;
}

static void write_output(void *buf, size_t size)
{
	while (size) {
		int ret = write(output, buf, size);

		if (ret < 0)
			die("failed to write");

		size -= ret;
		buf += ret;

		bytes_written += ret;
	}
}

static int process_synthesized_event(union perf_event *event,
				     struct perf_sample *sample __used,
				     struct perf_session *self __used)
{
	write_output(event, event->header.size);
	return 0;
}

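/*
 * Drain the kernel ring buffer between the position we last consumed
 * (md->prev) and the current head. When the new data wraps past the end of
 * the buffer it is written out in two chunks. A worked example, assuming a
 * ring of mask + 1 == 8 bytes with old == 6 and head == 10: the first
 * write_output() emits the 2 bytes at offsets 6-7, the second emits the
 * 2 bytes at offsets 0-1.
 */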
static void mmap_read(struct perf_mmap *md)
{
	unsigned int head = perf_mmap__read_head(md);
	unsigned int old = md->prev;
	unsigned char *data = md->base + page_size;
	unsigned long size;
	void *buf;

	if (old == head)
		return;

	samples++;

	size = head - old;

	if ((old & md->mask) + size != (head & md->mask)) {
		buf = &data[old & md->mask];
		size = md->mask + 1 - (old & md->mask);
		old += size;

		write_output(buf, size);
	}

	buf = &data[old & md->mask];
	size = head - old;
	old += size;

	write_output(buf, size);

	md->prev = old;
	perf_mmap__write_tail(md, old);
}

static volatile int done = 0;
static volatile int signr = -1;

static void sig_handler(int sig)
{
	done = 1;
	signr = sig;
}

static void sig_atexit(void)
{
	if (child_pid > 0)
		kill(child_pid, SIGTERM);

	if (signr == -1 || signr == SIGUSR1)
		return;

	signal(signr, SIG_DFL);
	kill(getpid(), signr);
}

static void config_attr(struct perf_evsel *evsel, struct perf_evlist *evlist)
{
	struct perf_event_attr *attr = &evsel->attr;
	int track = !evsel->idx; /* only the first counter needs these */

	attr->inherit		= !no_inherit;
	attr->read_format	= PERF_FORMAT_TOTAL_TIME_ENABLED |
				  PERF_FORMAT_TOTAL_TIME_RUNNING |
				  PERF_FORMAT_ID;

	attr->sample_type	|= PERF_SAMPLE_IP | PERF_SAMPLE_TID;

	if (evlist->nr_entries > 1)
		attr->sample_type |= PERF_SAMPLE_ID;

	/*
	 * We default some events to a period of 1, but keep that a
	 * weak assumption, overridable by the user.
	 */
	if (!attr->sample_period || (user_freq != UINT_MAX &&
				     user_interval != ULLONG_MAX)) {
		if (freq) {
			attr->sample_type	|= PERF_SAMPLE_PERIOD;
			attr->freq		= 1;
			attr->sample_freq	= freq;
		} else {
			attr->sample_period = default_interval;
		}
	}

	if (no_samples)
		attr->sample_freq = 0;

	if (inherit_stat)
		attr->inherit_stat = 1;

	if (sample_address) {
		attr->sample_type	|= PERF_SAMPLE_ADDR;
		attr->mmap_data = track;
	}

	if (call_graph)
		attr->sample_type	|= PERF_SAMPLE_CALLCHAIN;

	if (system_wide)
		attr->sample_type	|= PERF_SAMPLE_CPU;

	if (sample_id_all_avail &&
	    (sample_time || system_wide || !no_inherit || cpu_list))
		attr->sample_type	|= PERF_SAMPLE_TIME;

	if (raw_samples) {
		attr->sample_type	|= PERF_SAMPLE_TIME;
		attr->sample_type	|= PERF_SAMPLE_RAW;
		attr->sample_type	|= PERF_SAMPLE_CPU;
	}

	if (nodelay) {
		attr->watermark = 0;
		attr->wakeup_events = 1;
	}

	attr->mmap		= track;
	attr->comm		= track;

	if (target_pid == -1 && target_tid == -1 && !system_wide) {
		attr->disabled = 1;
		attr->enable_on_exec = 1;
	}
}
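/*
 * For a plain "perf record <workload>" with a single event, the net effect
 * of config_attr() is roughly (a sketch, not an exhaustive list):
 *
 *	attr->inherit     = 1;
 *	attr->sample_type = PERF_SAMPLE_IP | PERF_SAMPLE_TID |
 *			    PERF_SAMPLE_PERIOD | PERF_SAMPLE_TIME;
 *	attr->freq        = 1;
 *	attr->sample_freq = 1000;
 *	attr->mmap = attr->comm = 1;		(first counter only)
 *	attr->disabled = attr->enable_on_exec = 1;
 */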
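/*
 * In append mode the header of the existing file already describes an event
 * list; refuse to append unless the new attrs match it exactly, otherwise
 * the merged file could not be parsed consistently.
 */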
static bool perf_evlist__equal(struct perf_evlist *evlist,
			       struct perf_evlist *other)
{
	struct perf_evsel *pos, *pair;

	if (evlist->nr_entries != other->nr_entries)
		return false;

	pair = list_entry(other->entries.next, struct perf_evsel, node);

	list_for_each_entry(pos, &evlist->entries, node) {
		if (memcmp(&pos->attr, &pair->attr, sizeof(pos->attr)) != 0)
			return false;
		pair = list_entry(pair->node.next, struct perf_evsel, node);
	}

	return true;
}

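/*
 * Open one fd per (event, cpu, thread) and mmap the ring buffers. Failures
 * are handled with a ladder of fallbacks: EPERM/EACCES means insufficient
 * privileges, ENODEV with -C means a bogus CPU, EINVAL retries without
 * sample_id_all for older kernels, and an unsupported hardware cycles event
 * is swapped for the software cpu-clock counter before giving up.
 */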
static void open_counters(struct perf_evlist *evlist)
{
	struct perf_evsel *pos;

	if (evlist->cpus->map[0] < 0)
		no_inherit = true;

	list_for_each_entry(pos, &evlist->entries, node) {
		struct perf_event_attr *attr = &pos->attr;
		/*
		 * Check if parse_single_tracepoint_event has already asked for
		 * PERF_SAMPLE_TIME.
		 *
		 * XXX this is kludgy but a short term fix for problems
		 * introduced by eac23d1c, which broke 'perf script' by
		 * producing different sample_types when using multiple
		 * tracepoint events with a perf binary that tries to use
		 * sample_id_all on an older kernel.
		 *
		 * We need to move counter creation to perf_session, support
		 * different sample_types, etc.
		 */
		bool time_needed = attr->sample_type & PERF_SAMPLE_TIME;

		config_attr(pos, evlist);
retry_sample_id:
		attr->sample_id_all = sample_id_all_avail ? 1 : 0;
try_again:
		if (perf_evsel__open(pos, evlist->cpus, evlist->threads, group) < 0) {
			int err = errno;

			if (err == EPERM || err == EACCES) {
				ui__warning_paranoid();
				exit(EXIT_FAILURE);
			} else if (err == ENODEV && cpu_list) {
				die("No such device - did you specify"
					" an out-of-range profile CPU?\n");
			} else if (err == EINVAL && sample_id_all_avail) {
				/*
				 * Old kernel, no attr->sample_id_all field
				 */
				sample_id_all_avail = false;
				if (!sample_time && !raw_samples && !time_needed)
					attr->sample_type &= ~PERF_SAMPLE_TIME;

				goto retry_sample_id;
			}

			/*
			 * If it's cycles then fall back to hrtimer
			 * based cpu-clock-tick sw counter, which
			 * is always available even if no PMU support:
			 */
			if (attr->type == PERF_TYPE_HARDWARE &&
					attr->config == PERF_COUNT_HW_CPU_CYCLES) {

				if (verbose)
					ui__warning("The cycles event is not supported, "
						    "trying to fall back to cpu-clock-ticks\n");
				attr->type = PERF_TYPE_SOFTWARE;
				attr->config = PERF_COUNT_SW_CPU_CLOCK;
				goto try_again;
			}

			if (err == ENOENT) {
				ui__warning("The %s event is not supported.\n",
					    event_name(pos));
				exit(EXIT_FAILURE);
			}

			printf("\n");
			error("sys_perf_event_open() syscall returned with %d (%s). /bin/dmesg may provide additional information.\n",
			      err, strerror(err));

#if defined(__i386__) || defined(__x86_64__)
			if (attr->type == PERF_TYPE_HARDWARE && err == EOPNOTSUPP)
				die("No hardware sampling interrupt available."
				    " No APIC? If so then you can boot the kernel"
				    " with the \"lapic\" boot parameter to"
				    " force-enable it.\n");
#endif

			die("No CONFIG_PERF_EVENTS=y kernel support configured?\n");
		}
	}

	if (perf_evlist__set_filters(evlist)) {
		error("failed to set filter with %d (%s)\n", errno,
			strerror(errno));
		exit(-1);
	}

	if (perf_evlist__mmap(evlist, mmap_pages, false) < 0)
		die("failed to mmap with %d (%s)\n", errno, strerror(errno));

	if (file_new)
		session->evlist = evlist;
	else {
		if (!perf_evlist__equal(session->evlist, evlist)) {
			fprintf(stderr, "incompatible append\n");
			exit(-1);
		}
	}

	perf_session__update_sample_type(session);
}

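/*
 * post_processing_offset marks where the sample data starts in the output
 * file. At exit, process_buildids() re-reads everything recorded after that
 * offset with build_id__mark_dso_hit_ops, so only DSOs actually hit by
 * samples get their build-ids stored in the header.
 */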
static int process_buildids(void)
{
	u64 size = lseek(output, 0, SEEK_CUR);

	if (size == 0)
		return 0;

	session->fd = output;
	return __perf_session__process_events(session, post_processing_offset,
					      size - post_processing_offset,
					      size, &build_id__mark_dso_hit_ops);
}

static void atexit_header(void)
{
	if (!pipe_output) {
		session->header.data_size += bytes_written;

		if (!no_buildid)
			process_buildids();
		perf_session__write_header(session, evsel_list, output, true);
		perf_session__delete(session);
		perf_evlist__delete(evsel_list);
		symbol__exit();
	}
}

static void perf_event__synthesize_guest_os(struct machine *machine, void *data)
{
	int err;
	struct perf_session *psession = data;

	if (machine__is_host(machine))
		return;

	/*
	 * As for the guest kernel: when processing the record & report
	 * subcommands we arrange the module mmaps prior to the guest kernel
	 * mmap and trigger a DSO preload, because by default guest module
	 * symbols are loaded from guest kallsyms instead of from
	 * /lib/modules/XXX/XXX. This avoids missing symbols when the first
	 * sampled address falls in a module instead of in the guest kernel.
	 */
	err = perf_event__synthesize_modules(process_synthesized_event,
					     psession, machine);
	if (err < 0)
		pr_err("Couldn't record guest kernel [%d]'s reference"
		       " relocation symbol.\n", machine->pid);

	/*
	 * We fall back to _stext for the guest kernel because its
	 * /proc/kallsyms sometimes has no _text symbol.
	 */
	err = perf_event__synthesize_kernel_mmap(process_synthesized_event,
						 psession, machine, "_text");
	if (err < 0)
		err = perf_event__synthesize_kernel_mmap(process_synthesized_event,
							 psession, machine,
							 "_stext");
	if (err < 0)
		pr_err("Couldn't record guest kernel [%d]'s reference"
		       " relocation symbol.\n", machine->pid);
}

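/*
 * Synthetic event appended after every full pass over the mmaps in
 * mmap_read_all(). The report side can use it as a flush point when
 * re-sorting events into time order: nothing older than the round boundary
 * can still arrive, so queued events up to it may be processed.
 */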
static struct perf_event_header finished_round_event = {
	.size = sizeof(struct perf_event_header),
	.type = PERF_RECORD_FINISHED_ROUND,
};

static void mmap_read_all(void)
{
	int i;

	for (i = 0; i < evsel_list->nr_mmaps; i++) {
		if (evsel_list->mmap[i].base)
			mmap_read(&evsel_list->mmap[i]);
	}

	if (perf_header__has_feat(&session->header, HEADER_TRACE_INFO))
		write_output(&finished_round_event, sizeof(finished_round_event));
}

static int __cmd_record(int argc, const char **argv)
{
	int i;
	struct stat st;
	int flags;
	int err;
	unsigned long waking = 0;
	int child_ready_pipe[2], go_pipe[2];
	const bool forks = argc > 0;
	char buf;
	struct machine *machine;

	page_size = sysconf(_SC_PAGE_SIZE);

	atexit(sig_atexit);
	signal(SIGCHLD, sig_handler);
	signal(SIGINT, sig_handler);
	signal(SIGUSR1, sig_handler);

	if (forks && (pipe(child_ready_pipe) < 0 || pipe(go_pipe) < 0)) {
		perror("failed to create pipes");
		exit(-1);
	}

	if (!output_name) {
		if (!fstat(STDOUT_FILENO, &st) && S_ISFIFO(st.st_mode))
			pipe_output = 1;
		else
			output_name = "perf.data";
	}
	if (output_name) {
		if (!strcmp(output_name, "-"))
			pipe_output = 1;
		else if (!stat(output_name, &st) && st.st_size) {
			if (write_mode == WRITE_FORCE) {
				char oldname[PATH_MAX];
				snprintf(oldname, sizeof(oldname), "%s.old",
					 output_name);
				unlink(oldname);
				rename(output_name, oldname);
			}
		} else if (write_mode == WRITE_APPEND) {
			write_mode = WRITE_FORCE;
		}
	}

	flags = O_CREAT|O_RDWR;
	if (write_mode == WRITE_APPEND)
		file_new = 0;
	else
		flags |= O_TRUNC;

	if (pipe_output)
		output = STDOUT_FILENO;
	else
		output = open(output_name, flags, S_IRUSR | S_IWUSR);
	if (output < 0) {
		perror("failed to create output file");
		exit(-1);
	}

	session = perf_session__new(output_name, O_WRONLY,
				    write_mode == WRITE_FORCE, false, NULL);
	if (session == NULL) {
		pr_err("Not enough memory for reading perf file header\n");
		return -1;
	}

	if (!no_buildid)
		perf_header__set_feat(&session->header, HEADER_BUILD_ID);

	if (!file_new) {
		err = perf_session__read_header(session, output);
		if (err < 0)
			goto out_delete_session;
	}

	if (have_tracepoints(&evsel_list->entries))
		perf_header__set_feat(&session->header, HEADER_TRACE_INFO);

	/* 512 kiB: default amount of unprivileged mlocked memory */
	if (mmap_pages == UINT_MAX)
		mmap_pages = (512 * 1024) / page_size;
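		/* e.g. 128 pages on a system with 4 KiB pages */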
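	/*
	 * Handshake with the forked child: after a dummy execvp warms the
	 * PLT, the child signals readiness by closing child_ready_pipe and
	 * then blocks reading go_pipe. Once the counters are open (with
	 * enable_on_exec set), the parent closes go_pipe's write end to let
	 * the real execvp proceed.
	 */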
	if (forks) {
		child_pid = fork();
		if (child_pid < 0) {
			perror("failed to fork");
			exit(-1);
		}

		if (!child_pid) {
			if (pipe_output)
				dup2(2, 1);
			close(child_ready_pipe[0]);
			close(go_pipe[1]);
			fcntl(go_pipe[0], F_SETFD, FD_CLOEXEC);

			/*
			 * Do a dummy execvp to get the PLT entry resolved,
			 * so we avoid the resolver overhead on the real
			 * execvp call.
			 */
			execvp("", (char **)argv);

			/*
			 * Tell the parent we're ready to go
			 */
			close(child_ready_pipe[1]);

			/*
			 * Wait until the parent tells us to go.
			 */
			if (read(go_pipe[0], &buf, 1) == -1)
				perror("unable to read pipe");

			execvp(argv[0], (char **)argv);

			perror(argv[0]);
			kill(getppid(), SIGUSR1);
			exit(-1);
		}

		if (!system_wide && target_tid == -1 && target_pid == -1)
			evsel_list->threads->map[0] = child_pid;

		close(child_ready_pipe[1]);
		close(go_pipe[0]);
		/*
		 * wait for child to settle
		 */
		if (read(child_ready_pipe[0], &buf, 1) == -1) {
			perror("unable to read pipe");
			exit(-1);
		}
		close(child_ready_pipe[0]);
	}

	open_counters(evsel_list);

	/*
	 * perf_session__delete(session) will be called at atexit_header()
	 */
	atexit(atexit_header);

	if (pipe_output) {
		err = perf_header__write_pipe(output);
		if (err < 0)
			return err;
	} else if (file_new) {
		err = perf_session__write_header(session, evsel_list,
						 output, false);
		if (err < 0)
			return err;
	}

	post_processing_offset = lseek(output, 0, SEEK_CUR);

	if (pipe_output) {
		err = perf_session__synthesize_attrs(session,
						     process_synthesized_event);
		if (err < 0) {
			pr_err("Couldn't synthesize attrs.\n");
			return err;
		}

		err = perf_event__synthesize_event_types(process_synthesized_event,
							 session);
		if (err < 0) {
			pr_err("Couldn't synthesize event_types.\n");
			return err;
		}

		if (have_tracepoints(&evsel_list->entries)) {
			/*
			 * FIXME err <= 0 here actually means that
			 * there were no tracepoints, so it's not really
			 * an error, just that we don't need to
			 * synthesize anything. We really have to
			 * return this more properly and also
			 * propagate the errors that now trigger die().
			 */
			err = perf_event__synthesize_tracing_data(output, evsel_list,
								  process_synthesized_event,
								  session);
			if (err <= 0) {
				pr_err("Couldn't record tracing data.\n");
				return err;
			}
			advance_output(err);
		}
	}

	machine = perf_session__find_host_machine(session);
	if (!machine) {
		pr_err("Couldn't find native kernel information.\n");
		return -1;
	}

	err = perf_event__synthesize_kernel_mmap(process_synthesized_event,
						 session, machine, "_text");
	if (err < 0)
		err = perf_event__synthesize_kernel_mmap(process_synthesized_event,
							 session, machine, "_stext");
	if (err < 0)
		pr_err("Couldn't record kernel reference relocation symbol\n"
		       "Symbol resolution may be skewed if relocation was used (e.g. kexec).\n"
		       "Check /proc/kallsyms permission or run as root.\n");

	err = perf_event__synthesize_modules(process_synthesized_event,
					     session, machine);
	if (err < 0)
		pr_err("Couldn't record kernel module information.\n"
		       "Symbol resolution may be skewed if relocation was used (e.g. kexec).\n"
		       "Check /proc/modules permission or run as root.\n");

	if (perf_guest)
		perf_session__process_machines(session,
					       perf_event__synthesize_guest_os);

	if (!system_wide)
		perf_event__synthesize_thread_map(evsel_list->threads,
						  process_synthesized_event,
						  session);
	else
		perf_event__synthesize_threads(process_synthesized_event,
					       session);

	if (realtime_prio) {
		struct sched_param param;

		param.sched_priority = realtime_prio;
		if (sched_setscheduler(0, SCHED_FIFO, &param)) {
			pr_err("Could not set realtime priority.\n");
			exit(-1);
		}
	}

	/*
	 * Let the child rip
	 */
	if (forks)
		close(go_pipe[1]);

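	/*
	 * Main capture loop: drain every mmap; if no new samples arrived,
	 * either stop (once 'done' is set by a signal) or sleep in poll()
	 * until a ring buffer needs attention. When stopping, disable all
	 * counters first so the final drain sees quiescent buffers.
	 */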
	for (;;) {
		int hits = samples;
		int thread;

		mmap_read_all();

		if (hits == samples) {
			if (done)
				break;
			err = poll(evsel_list->pollfd, evsel_list->nr_fds, -1);
			waking++;
		}

		if (done) {
			for (i = 0; i < evsel_list->cpus->nr; i++) {
				struct perf_evsel *pos;

				list_for_each_entry(pos, &evsel_list->entries, node) {
					for (thread = 0;
						thread < evsel_list->threads->nr;
						thread++)
						ioctl(FD(pos, i, thread),
							PERF_EVENT_IOC_DISABLE);
				}
			}
		}
	}

	if (quiet || signr == SIGUSR1)
		return 0;

	fprintf(stderr, "[ perf record: Woken up %ld times to write data ]\n", waking);

	/*
	 * Approximate RIP event size: 24 bytes.
	 */
	fprintf(stderr,
		"[ perf record: Captured and wrote %.3f MB %s (~%" PRIu64 " samples) ]\n",
		(double)bytes_written / 1024.0 / 1024.0,
		output_name,
		bytes_written / 24);

	return 0;

out_delete_session:
	perf_session__delete(session);
	return err;
}

static const char * const record_usage[] = {
	"perf record [<options>] [<command>]",
	"perf record [<options>] -- <command> [<options>]",
	NULL
};

static bool force, append_file;

const struct option record_options[] = {
	OPT_CALLBACK('e', "event", &evsel_list, "event",
		     "event selector. use 'perf list' to list available events",
		     parse_events_option),
	OPT_CALLBACK(0, "filter", &evsel_list, "filter",
		     "event filter", parse_filter),
	OPT_INTEGER('p', "pid", &target_pid,
		    "record events on existing process id"),
	OPT_INTEGER('t', "tid", &target_tid,
		    "record events on existing thread id"),
	OPT_INTEGER('r', "realtime", &realtime_prio,
		    "collect data with this RT SCHED_FIFO priority"),
	OPT_BOOLEAN('D', "no-delay", &nodelay,
		    "collect data without buffering"),
	OPT_BOOLEAN('R', "raw-samples", &raw_samples,
		    "collect raw sample records from all opened counters"),
	OPT_BOOLEAN('a', "all-cpus", &system_wide,
			    "system-wide collection from all CPUs"),
	OPT_BOOLEAN('A', "append", &append_file,
			    "append to the output file to do incremental profiling"),
	OPT_STRING('C', "cpu", &cpu_list, "cpu",
		    "list of cpus to monitor"),
	OPT_BOOLEAN('f', "force", &force,
			"overwrite existing data file (deprecated)"),
	OPT_U64('c', "count", &user_interval, "event period to sample"),
	OPT_STRING('o', "output", &output_name, "file",
		    "output file name"),
	OPT_BOOLEAN('i', "no-inherit", &no_inherit,
		    "child tasks do not inherit counters"),
	OPT_UINTEGER('F', "freq", &user_freq, "profile at this frequency"),
	OPT_UINTEGER('m', "mmap-pages", &mmap_pages, "number of mmap data pages"),
	OPT_BOOLEAN('g', "call-graph", &call_graph,
		    "do call-graph (stack chain/backtrace) recording"),
	OPT_INCR('v', "verbose", &verbose,
		    "be more verbose (show counter open errors, etc)"),
	OPT_BOOLEAN('q', "quiet", &quiet, "don't print any message"),
	OPT_BOOLEAN('s', "stat", &inherit_stat,
		    "per thread counts"),
	OPT_BOOLEAN('d', "data", &sample_address,
		    "Sample addresses"),
	OPT_BOOLEAN('T', "timestamp", &sample_time, "Sample timestamps"),
	OPT_BOOLEAN('n', "no-samples", &no_samples,
		    "don't sample"),
	OPT_BOOLEAN('N', "no-buildid-cache", &no_buildid_cache,
		    "do not update the buildid cache"),
	OPT_BOOLEAN('B', "no-buildid", &no_buildid,
		    "do not collect buildids in perf.data"),
	OPT_CALLBACK('G', "cgroup", &evsel_list, "name",
		     "monitor event in cgroup name only",
		     parse_cgroups),
	OPT_END()
};

int cmd_record(int argc, const char **argv, const char *prefix __used)
{
	int err = -ENOMEM;
	struct perf_evsel *pos;

	evsel_list = perf_evlist__new(NULL, NULL);
	if (evsel_list == NULL)
		return -ENOMEM;

	argc = parse_options(argc, argv, record_options, record_usage,
			    PARSE_OPT_STOP_AT_NON_OPTION);
	if (!argc && target_pid == -1 && target_tid == -1 &&
		!system_wide && !cpu_list)
		usage_with_options(record_usage, record_options);

	if (force && append_file) {
		fprintf(stderr, "Can't overwrite and append at the same time."
				" You need to choose between -f and -A.\n");
		usage_with_options(record_usage, record_options);
	} else if (append_file) {
		write_mode = WRITE_APPEND;
	} else {
		write_mode = WRITE_FORCE;
	}

	if (nr_cgroups && !system_wide) {
		fprintf(stderr, "cgroup monitoring only available in"
			" system-wide mode\n");
		usage_with_options(record_usage, record_options);
	}

	symbol__init();

	if (symbol_conf.kptr_restrict)
		pr_warning(
"WARNING: Kernel address maps (/proc/{kallsyms,modules}) are restricted,\n"
"check /proc/sys/kernel/kptr_restrict.\n\n"
"Samples in kernel functions may not be resolved if a suitable vmlinux\n"
"file is not found in the buildid cache or in the vmlinux path.\n\n"
"Samples in kernel modules won't be resolved at all.\n\n"
"If some relocation was applied (e.g. kexec) symbols may be misresolved\n"
"even with a suitable vmlinux or kallsyms file.\n\n");

	if (no_buildid_cache || no_buildid)
		disable_buildid_cache();

	if (evsel_list->nr_entries == 0 &&
	    perf_evlist__add_default(evsel_list) < 0) {
		pr_err("Not enough memory for event selector list\n");
		goto out_symbol_exit;
	}

	if (target_pid != -1)
		target_tid = target_pid;

	if (perf_evlist__create_maps(evsel_list, target_pid,
				     target_tid, cpu_list) < 0)
		usage_with_options(record_usage, record_options);

	list_for_each_entry(pos, &evsel_list->entries, node) {
		if (perf_evsel__alloc_fd(pos, evsel_list->cpus->nr,
					 evsel_list->threads->nr) < 0)
			goto out_free_fd;
		if (perf_header__push_event(pos->attr.config, event_name(pos)))
			goto out_free_fd;
	}

	if (perf_evlist__alloc_pollfd(evsel_list) < 0)
		goto out_free_fd;

	if (user_interval != ULLONG_MAX)
		default_interval = user_interval;
	if (user_freq != UINT_MAX)
		freq = user_freq;

	/*
	 * User specified count overrides default frequency.
	 */
	if (default_interval)
		freq = 0;
	else if (freq) {
		default_interval = freq;
	} else {
		fprintf(stderr, "frequency and count are zero, aborting\n");
		err = -EINVAL;
		goto out_free_fd;
	}
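	/*
	 * E.g. "perf record -c 100000" ends up with freq == 0 and
	 * default_interval == 100000 (period mode), while a plain
	 * "perf record" keeps freq == 1000, which config_attr() installs
	 * as attr->sample_freq.
	 */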

	err = __cmd_record(argc, argv);
out_free_fd:
	perf_evlist__delete_maps(evsel_list);
out_symbol_exit:
	symbol__exit();
	return err;
}