xref: /openbmc/linux/tools/perf/builtin-trace.c (revision e5f586c763a079349398e2b0c7c271386193ac34)
1 /*
2  * builtin-trace.c
3  *
4  * Builtin 'trace' command:
5  *
6  * Display a continuously updated trace of any workload, CPU, specific PID,
7  * system wide, etc.  Default format is loosely strace like, but any other
8  * event may be specified using --event.
9  *
10  * Copyright (C) 2012, 2013, 2014, 2015 Red Hat Inc, Arnaldo Carvalho de Melo <acme@redhat.com>
11  *
12  * Initially based on the 'trace' prototype by Thomas Gleixner:
13  *
14  * http://lwn.net/Articles/415728/ ("Announcing a new utility: 'trace'")
15  *
16  * Released under the GPL v2. (and only v2, not any later version)
17  */
18 
19 #include <traceevent/event-parse.h>
20 #include <api/fs/tracing_path.h>
21 #include "builtin.h"
22 #include "util/color.h"
23 #include "util/debug.h"
24 #include "util/evlist.h"
25 #include <subcmd/exec-cmd.h>
26 #include "util/machine.h"
27 #include "util/session.h"
28 #include "util/thread.h"
29 #include <subcmd/parse-options.h>
30 #include "util/strlist.h"
31 #include "util/intlist.h"
32 #include "util/thread_map.h"
33 #include "util/stat.h"
34 #include "trace-event.h"
35 #include "util/parse-events.h"
36 #include "util/bpf-loader.h"
37 #include "callchain.h"
38 #include "syscalltbl.h"
39 #include "rb_resort.h"
40 
41 #include <libaudit.h> /* FIXME: Still needed for audit_errno_to_name */
42 #include <stdlib.h>
43 #include <string.h>
44 #include <linux/err.h>
45 #include <linux/filter.h>
46 #include <linux/audit.h>
47 #include <linux/random.h>
48 #include <linux/stringify.h>
49 #include <linux/time64.h>
50 
/* Fallback definition, used only when the included headers do not define O_CLOEXEC. */
#ifndef O_CLOEXEC
# define O_CLOEXEC		02000000
#endif
54 
/*
 * Top-level state of the 'trace' builtin.
 *
 * NOTE(review): the code that reads and writes these fields lies outside
 * this excerpt, so the per-field notes below are inferred from names and
 * types only and should be confirmed against the users.
 */
struct trace {
	struct perf_tool	tool;
	struct syscalltbl	*sctbl;
	struct {
		int		max;
		struct syscall  *table;
		struct {
			struct perf_evsel *sys_enter,
					  *sys_exit;
		}		events;
	} syscalls;
	struct record_opts	opts;
	struct perf_evlist	*evlist;
	struct machine		*host;
	struct thread		*current;
	u64			base_time;
	FILE			*output;
	unsigned long		nr_events;
	struct strlist		*ev_qualifier;
	/* Counted array of ints; presumably syscall ids derived from ev_qualifier. */
	struct {
		size_t		nr;
		int		*entries;
	}			ev_qualifier_ids;
	/* Counted array of pids; presumably pids to be filtered out. */
	struct {
		size_t		nr;
		pid_t		*entries;
	}			filter_pids;
	double			duration_filter;
	double			runtime_ms;
	struct {
		u64		vfs_getname,
				proc_getname;
	} stats;
	unsigned int		max_stack;
	unsigned int		min_stack;
	bool			not_ev_qualifier;
	bool			live;
	bool			full_time;
	bool			sched;
	bool			multiple_threads;
	bool			summary;
	bool			summary_only;
	bool			show_comm;
	bool			show_tool_stats;
	bool			trace_syscalls;
	bool			kernel_syscallchains;
	bool			force;
	bool			vfs_getname;
	/* Presumably a mask of TRACE_PFMAJ / TRACE_PFMIN - TODO confirm. */
	int			trace_pgfaults;
	int			open_id;
};
106 
/*
 * Accessor for one field of a tracepoint sample payload.
 *
 * @offset is added to sample->raw_data to locate the field.  Exactly one of
 * the two callbacks in the union is installed: @integer by
 * tp_field__init_uint(), @pointer by tp_field__init_ptr().
 */
struct tp_field {
	int offset;
	union {
		/* Returns the field's value widened to u64. */
		u64 (*integer)(struct tp_field *field, struct perf_sample *sample);
		/* Returns the address of the field inside the sample payload. */
		void *(*pointer)(struct tp_field *field, struct perf_sample *sample);
	};
};
114 
115 #define TP_UINT_FIELD(bits) \
116 static u64 tp_field__u##bits(struct tp_field *field, struct perf_sample *sample) \
117 { \
118 	u##bits value; \
119 	memcpy(&value, sample->raw_data + field->offset, sizeof(value)); \
120 	return value;  \
121 }
122 
123 TP_UINT_FIELD(8);
124 TP_UINT_FIELD(16);
125 TP_UINT_FIELD(32);
126 TP_UINT_FIELD(64);
127 
128 #define TP_UINT_FIELD__SWAPPED(bits) \
129 static u64 tp_field__swapped_u##bits(struct tp_field *field, struct perf_sample *sample) \
130 { \
131 	u##bits value; \
132 	memcpy(&value, sample->raw_data + field->offset, sizeof(value)); \
133 	return bswap_##bits(value);\
134 }
135 
136 TP_UINT_FIELD__SWAPPED(16);
137 TP_UINT_FIELD__SWAPPED(32);
138 TP_UINT_FIELD__SWAPPED(64);
139 
140 static int tp_field__init_uint(struct tp_field *field,
141 			       struct format_field *format_field,
142 			       bool needs_swap)
143 {
144 	field->offset = format_field->offset;
145 
146 	switch (format_field->size) {
147 	case 1:
148 		field->integer = tp_field__u8;
149 		break;
150 	case 2:
151 		field->integer = needs_swap ? tp_field__swapped_u16 : tp_field__u16;
152 		break;
153 	case 4:
154 		field->integer = needs_swap ? tp_field__swapped_u32 : tp_field__u32;
155 		break;
156 	case 8:
157 		field->integer = needs_swap ? tp_field__swapped_u64 : tp_field__u64;
158 		break;
159 	default:
160 		return -1;
161 	}
162 
163 	return 0;
164 }
165 
166 static void *tp_field__ptr(struct tp_field *field, struct perf_sample *sample)
167 {
168 	return sample->raw_data + field->offset;
169 }
170 
171 static int tp_field__init_ptr(struct tp_field *field, struct format_field *format_field)
172 {
173 	field->offset = format_field->offset;
174 	field->pointer = tp_field__ptr;
175 	return 0;
176 }
177 
/*
 * Field accessors stored in evsel->priv for a syscall tracepoint.
 *
 * @id is initialised by perf_evsel__init_syscall_tp().  @args and @ret share
 * storage through the anonymous union, so only one of them is meaningful for
 * a given evsel (presumably args for the enter event and ret for the exit
 * event - TODO confirm against the callers).
 */
struct syscall_tp {
	struct tp_field id;
	union {
		struct tp_field args, ret;
	};
};
184 
185 static int perf_evsel__init_tp_uint_field(struct perf_evsel *evsel,
186 					  struct tp_field *field,
187 					  const char *name)
188 {
189 	struct format_field *format_field = perf_evsel__field(evsel, name);
190 
191 	if (format_field == NULL)
192 		return -1;
193 
194 	return tp_field__init_uint(field, format_field, evsel->needs_swap);
195 }
196 
/*
 * Initialise the unsigned integer accessor for member @name of the
 * struct syscall_tp kept in evsel->priv; the tracepoint field looked up has
 * the same name as the member.  Evaluates to the result of
 * perf_evsel__init_tp_uint_field().
 *
 * The evsel argument is parenthesized so that any pointer expression may be
 * passed; note that it is evaluated twice.
 */
#define perf_evsel__init_sc_tp_uint_field(evsel, name) \
	({ struct syscall_tp *sc = (evsel)->priv;\
	   perf_evsel__init_tp_uint_field((evsel), &sc->name, #name); })
200 
/*
 * Look up the tracepoint field called @name on @evsel and set @field up to
 * hand out a pointer into the sample payload.
 *
 * Returns -1 when the field does not exist, otherwise whatever
 * tp_field__init_ptr() returns.
 */
static int perf_evsel__init_tp_ptr_field(struct perf_evsel *evsel,
					 struct tp_field *field,
					 const char *name)
{
	struct format_field *ff = perf_evsel__field(evsel, name);

	return ff != NULL ? tp_field__init_ptr(field, ff) : -1;
}
212 
/*
 * Initialise the pointer accessor for member @name of the struct syscall_tp
 * kept in evsel->priv; the tracepoint field looked up has the same name as
 * the member.  Evaluates to the result of perf_evsel__init_tp_ptr_field().
 *
 * The evsel argument is parenthesized so that any pointer expression may be
 * passed; note that it is evaluated twice.
 */
#define perf_evsel__init_sc_tp_ptr_field(evsel, name) \
	({ struct syscall_tp *sc = (evsel)->priv;\
	   perf_evsel__init_tp_ptr_field((evsel), &sc->name, #name); })
216 
/*
 * Dispose of @evsel together with its private data: evsel->priv is released
 * through zfree() first, then the evsel itself is handed to
 * perf_evsel__delete().
 */
static void perf_evsel__delete_priv(struct perf_evsel *evsel)
{
	zfree(&evsel->priv);
	perf_evsel__delete(evsel);
}
222 
223 static int perf_evsel__init_syscall_tp(struct perf_evsel *evsel, void *handler)
224 {
225 	evsel->priv = malloc(sizeof(struct syscall_tp));
226 	if (evsel->priv != NULL) {
227 		if (perf_evsel__init_sc_tp_uint_field(evsel, id))
228 			goto out_delete;
229 
230 		evsel->handler = handler;
231 		return 0;
232 	}
233 
234 	return -ENOMEM;
235 
236 out_delete:
237 	zfree(&evsel->priv);
238 	return -ENOENT;
239 }
240 
/*
 * Create the evsel for the syscall tracepoint named by @direction and
 * prepare it with perf_evsel__init_syscall_tp().
 *
 * The "raw_syscalls" subsystem is tried first, then "syscalls", which is
 * what older kernels (e.g., RHEL6) use.
 *
 * Returns the new evsel, or NULL when neither tracepoint could be created or
 * the initialisation failed (the evsel is deleted in the latter case).
 */
static struct perf_evsel *perf_evsel__syscall_newtp(const char *direction, void *handler)
{
	struct perf_evsel *evsel = perf_evsel__newtp("raw_syscalls", direction);

	if (IS_ERR(evsel)) {
		/* older kernel (e.g., RHEL6) use syscalls:{enter,exit} */
		evsel = perf_evsel__newtp("syscalls", direction);
		if (IS_ERR(evsel))
			return NULL;
	}

	if (perf_evsel__init_syscall_tp(evsel, handler) != 0) {
		perf_evsel__delete_priv(evsel);
		return NULL;
	}

	return evsel;
}
261 
/*
 * Read member @name of the struct syscall_tp kept in evsel->priv from
 * @sample through its integer accessor; evaluates to the u64 it returns.
 * The evsel argument is parenthesized so any pointer expression may be passed.
 */
#define perf_evsel__sc_tp_uint(evsel, name, sample) \
	({ struct syscall_tp *fields = (evsel)->priv; \
	   fields->name.integer(&fields->name, sample); })
265 
/*
 * Locate member @name of the struct syscall_tp kept in evsel->priv inside
 * @sample through its pointer accessor; evaluates to the void * it returns.
 * The evsel argument is parenthesized so any pointer expression may be passed.
 */
#define perf_evsel__sc_tp_ptr(evsel, name, sample) \
	({ struct syscall_tp *fields = (evsel)->priv; \
	   fields->name.pointer(&fields->name, sample); })
269 
/*
 * What an argument formatter (the arg_scnprintf callbacks in
 * struct syscall_fmt) receives about the argument it has to print.
 */
struct syscall_arg {
	unsigned long val;	/* raw value of the argument */
	struct thread *thread;
	struct trace  *trace;
	void	      *parm;	/* formatter specific data, e.g. a struct strarray * */
	u8	      idx;	/* presumably the position of the argument - TODO confirm */
	u8	      mask;
};
278 
/*
 * Table translating an integer into a name: entries[val - offset] is the
 * string for value val, valid for indexes 0 .. nr_entries - 1.
 */
struct strarray {
	int	    offset;
	int	    nr_entries;
	const char **entries;
};
284 
/*
 * Define strarray__<array>, a struct strarray describing the string table
 * @array (which must be a real array, as ARRAY_SIZE() is applied to it).
 * The offset member is left zero-initialised.
 */
#define DEFINE_STRARRAY(array) struct strarray strarray__##array = { \
	.nr_entries = ARRAY_SIZE(array), \
	.entries = array, \
}
289 
/*
 * Same as DEFINE_STRARRAY(), for tables whose first entry names the value
 * @off rather than 0.
 */
#define DEFINE_STRARRAY_OFFSET(array, off) struct strarray strarray__##array = { \
	.offset	    = off, \
	.nr_entries = ARRAY_SIZE(array), \
	.entries = array, \
}
295 
296 static size_t __syscall_arg__scnprintf_strarray(char *bf, size_t size,
297 						const char *intfmt,
298 					        struct syscall_arg *arg)
299 {
300 	struct strarray *sa = arg->parm;
301 	int idx = arg->val - sa->offset;
302 
303 	if (idx < 0 || idx >= sa->nr_entries)
304 		return scnprintf(bf, size, intfmt, arg->val);
305 
306 	return scnprintf(bf, size, "%s", sa->entries[idx]);
307 }
308 
/*
 * strarray formatter printing values that have no name in the table as a
 * decimal number.
 */
static size_t syscall_arg__scnprintf_strarray(char *bf, size_t size,
					      struct syscall_arg *arg)
{
	const char *fallback_fmt = "%d";

	return __syscall_arg__scnprintf_strarray(bf, size, fallback_fmt, arg);
}

#define SCA_STRARRAY syscall_arg__scnprintf_strarray
316 
#if defined(__i386__) || defined(__x86_64__)
/*
 * strarray formatter printing values that have no name in the table as a
 * "0x" prefixed hexadecimal number.
 *
 * FIXME: Make this available to all arches as soon as the ioctl beautifier
 * 	  gets rewritten to support all arches.
 */
static size_t syscall_arg__scnprintf_strhexarray(char *bf, size_t size,
						 struct syscall_arg *arg)
{
	const char *fallback_fmt = "%#x";

	return __syscall_arg__scnprintf_strarray(bf, size, fallback_fmt, arg);
}

#define SCA_STRHEXARRAY syscall_arg__scnprintf_strhexarray
#endif /* defined(__i386__) || defined(__x86_64__) */
330 
/*
 * Formatter for file descriptor arguments.  Only declared here; it is used
 * by syscall_arg__scnprintf_fd_at() and by the syscall_fmts table before its
 * definition appears.
 */
static size_t syscall_arg__scnprintf_fd(char *bf, size_t size,
					struct syscall_arg *arg);

#define SCA_FD syscall_arg__scnprintf_fd
335 
/* Fallback definition, used only when the included headers do not define AT_FDCWD. */
#ifndef AT_FDCWD
#define AT_FDCWD	-100
#endif
339 
340 static size_t syscall_arg__scnprintf_fd_at(char *bf, size_t size,
341 					   struct syscall_arg *arg)
342 {
343 	int fd = arg->val;
344 
345 	if (fd == AT_FDCWD)
346 		return scnprintf(bf, size, "CWD");
347 
348 	return syscall_arg__scnprintf_fd(bf, size, arg);
349 }
350 
351 #define SCA_FDAT syscall_arg__scnprintf_fd_at
352 
/*
 * Formatter used for the fd argument of close() in syscall_fmts.  Only
 * declared here; the definition is outside this part of the file.
 */
static size_t syscall_arg__scnprintf_close_fd(char *bf, size_t size,
					      struct syscall_arg *arg);

#define SCA_CLOSE_FD syscall_arg__scnprintf_close_fd
357 
358 static size_t syscall_arg__scnprintf_hex(char *bf, size_t size,
359 					 struct syscall_arg *arg)
360 {
361 	return scnprintf(bf, size, "%#lx", arg->val);
362 }
363 
364 #define SCA_HEX syscall_arg__scnprintf_hex
365 
366 static size_t syscall_arg__scnprintf_int(char *bf, size_t size,
367 					 struct syscall_arg *arg)
368 {
369 	return scnprintf(bf, size, "%d", arg->val);
370 }
371 
372 #define SCA_INT syscall_arg__scnprintf_int
373 
/*
 * Name tables for the strarray formatter.  Position i of a table names the
 * argument value i + offset, where offset is 0 unless the table is wrapped
 * with DEFINE_STRARRAY_OFFSET(); the order of the entries is therefore
 * significant.  Each table is tied to a syscall argument in syscall_fmts.
 */

/* bpf: cmd (argument 0) */
static const char *bpf_cmd[] = {
	"MAP_CREATE", "MAP_LOOKUP_ELEM", "MAP_UPDATE_ELEM", "MAP_DELETE_ELEM",
	"MAP_GET_NEXT_KEY", "PROG_LOAD",
};
static DEFINE_STRARRAY(bpf_cmd);

/* epoll_ctl: op (argument 1); the first entry names the value 1 */
static const char *epoll_ctl_ops[] = { "ADD", "DEL", "MOD", };
static DEFINE_STRARRAY_OFFSET(epoll_ctl_ops, 1);

/* getitimer, setitimer: which (argument 0) */
static const char *itimers[] = { "REAL", "VIRTUAL", "PROF", };
static DEFINE_STRARRAY(itimers);

/* keyctl: option (argument 0) */
static const char *keyctl_options[] = {
	"GET_KEYRING_ID", "JOIN_SESSION_KEYRING", "UPDATE", "REVOKE", "CHOWN",
	"SETPERM", "DESCRIBE", "CLEAR", "LINK", "UNLINK", "SEARCH", "READ",
	"INSTANTIATE", "NEGATE", "SET_REQKEY_KEYRING", "SET_TIMEOUT",
	"ASSUME_AUTHORITY", "GET_SECURITY", "SESSION_TO_PARENT", "REJECT",
	"INSTANTIATE_IOV", "INVALIDATE", "GET_PERSISTENT",
};
static DEFINE_STRARRAY(keyctl_options);

/*
 * lseek: whence (argument 2).  "DATA" and "HOLE" are only present when the
 * headers define SEEK_DATA and SEEK_HOLE respectively.
 */
static const char *whences[] = { "SET", "CUR", "END",
#ifdef SEEK_DATA
"DATA",
#endif
#ifdef SEEK_HOLE
"HOLE",
#endif
};
static DEFINE_STRARRAY(whences);

/* fcntl: cmd (argument 1) */
static const char *fcntl_cmds[] = {
	"DUPFD", "GETFD", "SETFD", "GETFL", "SETFL", "GETLK", "SETLK",
	"SETLKW", "SETOWN", "GETOWN", "SETSIG", "GETSIG", "F_GETLK64",
	"F_SETLK64", "F_SETLKW64", "F_SETOWN_EX", "F_GETOWN_EX",
	"F_GETOWNER_UIDS",
};
static DEFINE_STRARRAY(fcntl_cmds);

/* getrlimit, setrlimit: resource (argument 0); prlimit64: resource (argument 1) */
static const char *rlimit_resources[] = {
	"CPU", "FSIZE", "DATA", "STACK", "CORE", "RSS", "NPROC", "NOFILE",
	"MEMLOCK", "AS", "LOCKS", "SIGPENDING", "MSGQUEUE", "NICE", "RTPRIO",
	"RTTIME",
};
static DEFINE_STRARRAY(rlimit_resources);

/* rt_sigprocmask: how (argument 0) */
static const char *sighow[] = { "BLOCK", "UNBLOCK", "SETMASK", };
static DEFINE_STRARRAY(sighow);

/* clock_gettime: clk_id (argument 0) */
static const char *clockid[] = {
	"REALTIME", "MONOTONIC", "PROCESS_CPUTIME_ID", "THREAD_CPUTIME_ID",
	"MONOTONIC_RAW", "REALTIME_COARSE", "MONOTONIC_COARSE", "BOOTTIME",
	"REALTIME_ALARM", "BOOTTIME_ALARM", "SGI_CYCLE", "TAI"
};
static DEFINE_STRARRAY(clockid);
429 
/*
 * Names of the socket address families, indexed by the AF_* value (used for
 * the family argument of socket and socketpair in syscall_fmts).
 *
 * The position of each entry is its AF_* number, so no value may be skipped:
 * "MPLS" (28) has to sit between "IB" (27) and "CAN" (29), otherwise every
 * family from CAN onwards is reported under its predecessor's name.
 * "KCM" (41), "QIPCRTR" (42) and "SMC" (43) extend the table at the end.
 */
static const char *socket_families[] = {
	"UNSPEC", "LOCAL", "INET", "AX25", "IPX", "APPLETALK", "NETROM",
	"BRIDGE", "ATMPVC", "X25", "INET6", "ROSE", "DECnet", "NETBEUI",
	"SECURITY", "KEY", "NETLINK", "PACKET", "ASH", "ECONET", "ATMSVC",
	"RDS", "SNA", "IRDA", "PPPOX", "WANPIPE", "LLC", "IB", "MPLS", "CAN",
	"TIPC", "BLUETOOTH", "IUCV", "RXRPC", "ISDN", "PHONET", "IEEE802154",
	"CAIF", "ALG", "NFC", "VSOCK", "KCM", "QIPCRTR", "SMC",
};
/* strarray__socket_families: referenced by the socket and socketpair entries of syscall_fmts. */
static DEFINE_STRARRAY(socket_families);
439 
440 static size_t syscall_arg__scnprintf_access_mode(char *bf, size_t size,
441 						 struct syscall_arg *arg)
442 {
443 	size_t printed = 0;
444 	int mode = arg->val;
445 
446 	if (mode == F_OK) /* 0 */
447 		return scnprintf(bf, size, "F");
448 #define	P_MODE(n) \
449 	if (mode & n##_OK) { \
450 		printed += scnprintf(bf + printed, size - printed, "%s", #n); \
451 		mode &= ~n##_OK; \
452 	}
453 
454 	P_MODE(R);
455 	P_MODE(W);
456 	P_MODE(X);
457 #undef P_MODE
458 
459 	if (mode)
460 		printed += scnprintf(bf + printed, size - printed, "|%#x", mode);
461 
462 	return printed;
463 }
464 
465 #define SCA_ACCMODE syscall_arg__scnprintf_access_mode
466 
/*
 * Formatter used for pathname arguments (mq_unlink, swapoff and swapon in
 * syscall_fmts).  Only declared here; the definition is outside this part of
 * the file.
 */
static size_t syscall_arg__scnprintf_filename(char *bf, size_t size,
					      struct syscall_arg *arg);

#define SCA_FILENAME syscall_arg__scnprintf_filename
471 
472 static size_t syscall_arg__scnprintf_pipe_flags(char *bf, size_t size,
473 						struct syscall_arg *arg)
474 {
475 	int printed = 0, flags = arg->val;
476 
477 #define	P_FLAG(n) \
478 	if (flags & O_##n) { \
479 		printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
480 		flags &= ~O_##n; \
481 	}
482 
483 	P_FLAG(CLOEXEC);
484 	P_FLAG(NONBLOCK);
485 #undef P_FLAG
486 
487 	if (flags)
488 		printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);
489 
490 	return printed;
491 }
492 
493 #define SCA_PIPE_FLAGS syscall_arg__scnprintf_pipe_flags
494 
495 #if defined(__i386__) || defined(__x86_64__)
496 /*
497  * FIXME: Make this available to all arches.
498  */
499 #define TCGETS		0x5401
500 
/*
 * Names of the terminal ioctl commands.  The table is looked up with an
 * offset of 0x5401 (TCGETS), i.e. position i names command 0x5401 + i, and
 * positions without an initializer stay NULL.
 *
 * The designators that skip unused command numbers are therefore written as
 * "command - 0x5401": TIOCSBRK is 0x5427, FIONCLEX is 0x5450 and FIOQSIZE is
 * 0x5460.  Using the bare low byte (0x27, 0x50, 0x60) as the index would put
 * every following name one command too high.
 */
static const char *tioctls[] = {
	"TCGETS", "TCSETS", "TCSETSW", "TCSETSF", "TCGETA", "TCSETA", "TCSETAW",
	"TCSETAF", "TCSBRK", "TCXONC", "TCFLSH", "TIOCEXCL", "TIOCNXCL",
	"TIOCSCTTY", "TIOCGPGRP", "TIOCSPGRP", "TIOCOUTQ", "TIOCSTI",
	"TIOCGWINSZ", "TIOCSWINSZ", "TIOCMGET", "TIOCMBIS", "TIOCMBIC",
	"TIOCMSET", "TIOCGSOFTCAR", "TIOCSSOFTCAR", "FIONREAD", "TIOCLINUX",
	"TIOCCONS", "TIOCGSERIAL", "TIOCSSERIAL", "TIOCPKT", "FIONBIO",
	"TIOCNOTTY", "TIOCSETD", "TIOCGETD", "TCSBRKP",
	[0x5427 - 0x5401] = "TIOCSBRK",
	"TIOCCBRK", "TIOCGSID", "TCGETS2", "TCSETS2", "TCSETSW2", "TCSETSF2",
	"TIOCGRS485", "TIOCSRS485", "TIOCGPTN", "TIOCSPTLCK",
	"TIOCGDEV||TCGETX", "TCSETX", "TCSETXF", "TCSETXW", "TIOCSIG",
	"TIOCVHANGUP", "TIOCGPKT", "TIOCGPTLCK", "TIOCGEXCL",
	[0x5450 - 0x5401] = "FIONCLEX", "FIOCLEX", "FIOASYNC", "TIOCSERCONFIG",
	"TIOCSERGWILD", "TIOCSERSWILD", "TIOCGLCKTRMIOS", "TIOCSLCKTRMIOS",
	"TIOCSERGSTRUCT", "TIOCSERGETLSR", "TIOCSERGETMULTI", "TIOCSERSETMULTI",
	"TIOCMIWAIT", "TIOCGICOUNT",
	[0x5460 - 0x5401] = "FIOQSIZE",
};
518 
/* The offset makes tioctls[0] the name of command 0x5401; used for the cmd argument of ioctl in syscall_fmts. */
static DEFINE_STRARRAY_OFFSET(tioctls, 0x5401);
520 #endif /* defined(__i386__) || defined(__x86_64__) */
521 
/* Fallback definitions, used only when the included headers do not define the getrandom() flags. */
#ifndef GRND_NONBLOCK
#define GRND_NONBLOCK	0x0001
#endif
#ifndef GRND_RANDOM
#define GRND_RANDOM	0x0002
#endif
528 
529 static size_t syscall_arg__scnprintf_getrandom_flags(char *bf, size_t size,
530 						   struct syscall_arg *arg)
531 {
532 	int printed = 0, flags = arg->val;
533 
534 #define	P_FLAG(n) \
535 	if (flags & GRND_##n) { \
536 		printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
537 		flags &= ~GRND_##n; \
538 	}
539 
540 	P_FLAG(RANDOM);
541 	P_FLAG(NONBLOCK);
542 #undef P_FLAG
543 
544 	if (flags)
545 		printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);
546 
547 	return printed;
548 }
549 
550 #define SCA_GETRANDOM_FLAGS syscall_arg__scnprintf_getrandom_flags
551 
/*
 * Shorthand for a syscall_fmts entry: format argument number @arg with
 * SCA_STRARRAY, using strarray__<array> as its table.  @name is not used by
 * the expansion; it only records the argument's name at the point of use.
 */
#define STRARRAY(arg, name, array) \
	  .arg_scnprintf = { [arg] = SCA_STRARRAY, }, \
	  .arg_parm	 = { [arg] = &strarray__##array, }
555 
556 #include "trace/beauty/eventfd.c"
557 #include "trace/beauty/flock.c"
558 #include "trace/beauty/futex_op.c"
559 #include "trace/beauty/mmap.c"
560 #include "trace/beauty/mode_t.c"
561 #include "trace/beauty/msg_flags.c"
562 #include "trace/beauty/open_flags.c"
563 #include "trace/beauty/perf_event_open.c"
564 #include "trace/beauty/pid.c"
565 #include "trace/beauty/sched_policy.c"
566 #include "trace/beauty/seccomp.c"
567 #include "trace/beauty/signum.c"
568 #include "trace/beauty/socket_type.c"
569 #include "trace/beauty/waitid_options.c"
570 
571 static struct syscall_fmt {
572 	const char *name;
573 	const char *alias;
574 	size_t	   (*arg_scnprintf[6])(char *bf, size_t size, struct syscall_arg *arg);
575 	void	   *arg_parm[6];
576 	bool	   errmsg;
577 	bool	   errpid;
578 	bool	   timeout;
579 	bool	   hexret;
580 } syscall_fmts[] = {
581 	{ .name	    = "access",	    .errmsg = true,
582 	  .arg_scnprintf = { [1] = SCA_ACCMODE,  /* mode */ }, },
583 	{ .name	    = "arch_prctl", .errmsg = true, .alias = "prctl", },
584 	{ .name	    = "bpf",	    .errmsg = true, STRARRAY(0, cmd, bpf_cmd), },
585 	{ .name	    = "brk",	    .hexret = true,
586 	  .arg_scnprintf = { [0] = SCA_HEX, /* brk */ }, },
587 	{ .name	    = "chdir",	    .errmsg = true, },
588 	{ .name	    = "chmod",	    .errmsg = true, },
589 	{ .name	    = "chroot",	    .errmsg = true, },
590 	{ .name     = "clock_gettime",  .errmsg = true, STRARRAY(0, clk_id, clockid), },
591 	{ .name	    = "clone",	    .errpid = true, },
592 	{ .name	    = "close",	    .errmsg = true,
593 	  .arg_scnprintf = { [0] = SCA_CLOSE_FD, /* fd */ }, },
594 	{ .name	    = "connect",    .errmsg = true, },
595 	{ .name	    = "creat",	    .errmsg = true, },
596 	{ .name	    = "dup",	    .errmsg = true, },
597 	{ .name	    = "dup2",	    .errmsg = true, },
598 	{ .name	    = "dup3",	    .errmsg = true, },
599 	{ .name	    = "epoll_ctl",  .errmsg = true, STRARRAY(1, op, epoll_ctl_ops), },
600 	{ .name	    = "eventfd2",   .errmsg = true,
601 	  .arg_scnprintf = { [1] = SCA_EFD_FLAGS, /* flags */ }, },
602 	{ .name	    = "faccessat",  .errmsg = true, },
603 	{ .name	    = "fadvise64",  .errmsg = true, },
604 	{ .name	    = "fallocate",  .errmsg = true, },
605 	{ .name	    = "fchdir",	    .errmsg = true, },
606 	{ .name	    = "fchmod",	    .errmsg = true, },
607 	{ .name	    = "fchmodat",   .errmsg = true,
608 	  .arg_scnprintf = { [0] = SCA_FDAT, /* fd */ }, },
609 	{ .name	    = "fchown",	    .errmsg = true, },
610 	{ .name	    = "fchownat",   .errmsg = true,
611 	  .arg_scnprintf = { [0] = SCA_FDAT, /* fd */ }, },
612 	{ .name	    = "fcntl",	    .errmsg = true,
613 	  .arg_scnprintf = { [1] = SCA_STRARRAY, /* cmd */ },
614 	  .arg_parm	 = { [1] = &strarray__fcntl_cmds, /* cmd */ }, },
615 	{ .name	    = "fdatasync",  .errmsg = true, },
616 	{ .name	    = "flock",	    .errmsg = true,
617 	  .arg_scnprintf = { [1] = SCA_FLOCK, /* cmd */ }, },
618 	{ .name	    = "fsetxattr",  .errmsg = true, },
619 	{ .name	    = "fstat",	    .errmsg = true, .alias = "newfstat", },
620 	{ .name	    = "fstatat",    .errmsg = true, .alias = "newfstatat", },
621 	{ .name	    = "fstatfs",    .errmsg = true, },
622 	{ .name	    = "fsync",    .errmsg = true, },
623 	{ .name	    = "ftruncate", .errmsg = true, },
624 	{ .name	    = "futex",	    .errmsg = true,
625 	  .arg_scnprintf = { [1] = SCA_FUTEX_OP, /* op */ }, },
626 	{ .name	    = "futimesat", .errmsg = true,
627 	  .arg_scnprintf = { [0] = SCA_FDAT, /* fd */ }, },
628 	{ .name	    = "getdents",   .errmsg = true, },
629 	{ .name	    = "getdents64", .errmsg = true, },
630 	{ .name	    = "getitimer",  .errmsg = true, STRARRAY(0, which, itimers), },
631 	{ .name	    = "getpid",	    .errpid = true, },
632 	{ .name	    = "getpgid",    .errpid = true, },
633 	{ .name	    = "getppid",    .errpid = true, },
634 	{ .name	    = "getrandom",  .errmsg = true,
635 	  .arg_scnprintf = { [2] = SCA_GETRANDOM_FLAGS, /* flags */ }, },
636 	{ .name	    = "getrlimit",  .errmsg = true, STRARRAY(0, resource, rlimit_resources), },
637 	{ .name	    = "getxattr",   .errmsg = true, },
638 	{ .name	    = "inotify_add_watch",	    .errmsg = true, },
639 	{ .name	    = "ioctl",	    .errmsg = true,
640 	  .arg_scnprintf = {
641 #if defined(__i386__) || defined(__x86_64__)
642 /*
643  * FIXME: Make this available to all arches.
644  */
645 			     [1] = SCA_STRHEXARRAY, /* cmd */
646 			     [2] = SCA_HEX, /* arg */ },
647 	  .arg_parm	 = { [1] = &strarray__tioctls, /* cmd */ }, },
648 #else
649 			     [2] = SCA_HEX, /* arg */ }, },
650 #endif
651 	{ .name	    = "keyctl",	    .errmsg = true, STRARRAY(0, option, keyctl_options), },
652 	{ .name	    = "kill",	    .errmsg = true,
653 	  .arg_scnprintf = { [1] = SCA_SIGNUM, /* sig */ }, },
654 	{ .name	    = "lchown",    .errmsg = true, },
655 	{ .name	    = "lgetxattr",  .errmsg = true, },
656 	{ .name	    = "linkat",	    .errmsg = true,
657 	  .arg_scnprintf = { [0] = SCA_FDAT, /* fd */ }, },
658 	{ .name	    = "listxattr",  .errmsg = true, },
659 	{ .name	    = "llistxattr", .errmsg = true, },
660 	{ .name	    = "lremovexattr",  .errmsg = true, },
661 	{ .name	    = "lseek",	    .errmsg = true,
662 	  .arg_scnprintf = { [2] = SCA_STRARRAY, /* whence */ },
663 	  .arg_parm	 = { [2] = &strarray__whences, /* whence */ }, },
664 	{ .name	    = "lsetxattr",  .errmsg = true, },
665 	{ .name	    = "lstat",	    .errmsg = true, .alias = "newlstat", },
666 	{ .name	    = "lsxattr",    .errmsg = true, },
667 	{ .name     = "madvise",    .errmsg = true,
668 	  .arg_scnprintf = { [0] = SCA_HEX,	 /* start */
669 			     [2] = SCA_MADV_BHV, /* behavior */ }, },
670 	{ .name	    = "mkdir",    .errmsg = true, },
671 	{ .name	    = "mkdirat",    .errmsg = true,
672 	  .arg_scnprintf = { [0] = SCA_FDAT, /* fd */ }, },
673 	{ .name	    = "mknod",      .errmsg = true, },
674 	{ .name	    = "mknodat",    .errmsg = true,
675 	  .arg_scnprintf = { [0] = SCA_FDAT, /* fd */ }, },
676 	{ .name	    = "mlock",	    .errmsg = true,
677 	  .arg_scnprintf = { [0] = SCA_HEX, /* addr */ }, },
678 	{ .name	    = "mlockall",   .errmsg = true,
679 	  .arg_scnprintf = { [0] = SCA_HEX, /* addr */ }, },
680 	{ .name	    = "mmap",	    .hexret = true,
681 	  .arg_scnprintf = { [0] = SCA_HEX,	  /* addr */
682 			     [2] = SCA_MMAP_PROT, /* prot */
683 			     [3] = SCA_MMAP_FLAGS, /* flags */ }, },
684 	{ .name	    = "mprotect",   .errmsg = true,
685 	  .arg_scnprintf = { [0] = SCA_HEX, /* start */
686 			     [2] = SCA_MMAP_PROT, /* prot */ }, },
687 	{ .name	    = "mq_unlink", .errmsg = true,
688 	  .arg_scnprintf = { [0] = SCA_FILENAME, /* u_name */ }, },
689 	{ .name	    = "mremap",	    .hexret = true,
690 	  .arg_scnprintf = { [0] = SCA_HEX, /* addr */
691 			     [3] = SCA_MREMAP_FLAGS, /* flags */
692 			     [4] = SCA_HEX, /* new_addr */ }, },
693 	{ .name	    = "munlock",    .errmsg = true,
694 	  .arg_scnprintf = { [0] = SCA_HEX, /* addr */ }, },
695 	{ .name	    = "munmap",	    .errmsg = true,
696 	  .arg_scnprintf = { [0] = SCA_HEX, /* addr */ }, },
697 	{ .name	    = "name_to_handle_at", .errmsg = true,
698 	  .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
699 	{ .name	    = "newfstatat", .errmsg = true,
700 	  .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
701 	{ .name	    = "open",	    .errmsg = true,
702 	  .arg_scnprintf = { [1] = SCA_OPEN_FLAGS, /* flags */ }, },
703 	{ .name	    = "open_by_handle_at", .errmsg = true,
704 	  .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */
705 			     [2] = SCA_OPEN_FLAGS, /* flags */ }, },
706 	{ .name	    = "openat",	    .errmsg = true,
707 	  .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */
708 			     [2] = SCA_OPEN_FLAGS, /* flags */ }, },
709 	{ .name	    = "perf_event_open", .errmsg = true,
710 	  .arg_scnprintf = { [2] = SCA_INT, /* cpu */
711 			     [3] = SCA_FD,  /* group_fd */
712 			     [4] = SCA_PERF_FLAGS,  /* flags */ }, },
713 	{ .name	    = "pipe2",	    .errmsg = true,
714 	  .arg_scnprintf = { [1] = SCA_PIPE_FLAGS, /* flags */ }, },
715 	{ .name	    = "poll",	    .errmsg = true, .timeout = true, },
716 	{ .name	    = "ppoll",	    .errmsg = true, .timeout = true, },
717 	{ .name	    = "pread",	    .errmsg = true, .alias = "pread64", },
718 	{ .name	    = "preadv",	    .errmsg = true, .alias = "pread", },
719 	{ .name	    = "prlimit64",  .errmsg = true, STRARRAY(1, resource, rlimit_resources), },
720 	{ .name	    = "pwrite",	    .errmsg = true, .alias = "pwrite64", },
721 	{ .name	    = "pwritev",    .errmsg = true, },
722 	{ .name	    = "read",	    .errmsg = true, },
723 	{ .name	    = "readlink",   .errmsg = true, },
724 	{ .name	    = "readlinkat", .errmsg = true,
725 	  .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
726 	{ .name	    = "readv",	    .errmsg = true, },
727 	{ .name	    = "recvfrom",   .errmsg = true,
728 	  .arg_scnprintf = { [3] = SCA_MSG_FLAGS, /* flags */ }, },
729 	{ .name	    = "recvmmsg",   .errmsg = true,
730 	  .arg_scnprintf = { [3] = SCA_MSG_FLAGS, /* flags */ }, },
731 	{ .name	    = "recvmsg",    .errmsg = true,
732 	  .arg_scnprintf = { [2] = SCA_MSG_FLAGS, /* flags */ }, },
733 	{ .name	    = "removexattr", .errmsg = true, },
734 	{ .name	    = "renameat",   .errmsg = true,
735 	  .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
736 	{ .name	    = "rmdir",    .errmsg = true, },
737 	{ .name	    = "rt_sigaction", .errmsg = true,
738 	  .arg_scnprintf = { [0] = SCA_SIGNUM, /* sig */ }, },
739 	{ .name	    = "rt_sigprocmask",  .errmsg = true, STRARRAY(0, how, sighow), },
740 	{ .name	    = "rt_sigqueueinfo", .errmsg = true,
741 	  .arg_scnprintf = { [1] = SCA_SIGNUM, /* sig */ }, },
742 	{ .name	    = "rt_tgsigqueueinfo", .errmsg = true,
743 	  .arg_scnprintf = { [2] = SCA_SIGNUM, /* sig */ }, },
744 	{ .name	    = "sched_getattr",	      .errmsg = true, },
745 	{ .name	    = "sched_setattr",	      .errmsg = true, },
746 	{ .name	    = "sched_setscheduler",   .errmsg = true,
747 	  .arg_scnprintf = { [1] = SCA_SCHED_POLICY, /* policy */ }, },
748 	{ .name	    = "seccomp", .errmsg = true,
749 	  .arg_scnprintf = { [0] = SCA_SECCOMP_OP, /* op */
750 			     [1] = SCA_SECCOMP_FLAGS, /* flags */ }, },
751 	{ .name	    = "select",	    .errmsg = true, .timeout = true, },
752 	{ .name	    = "sendmmsg",    .errmsg = true,
753 	  .arg_scnprintf = { [3] = SCA_MSG_FLAGS, /* flags */ }, },
754 	{ .name	    = "sendmsg",    .errmsg = true,
755 	  .arg_scnprintf = { [2] = SCA_MSG_FLAGS, /* flags */ }, },
756 	{ .name	    = "sendto",	    .errmsg = true,
757 	  .arg_scnprintf = { [3] = SCA_MSG_FLAGS, /* flags */ }, },
758 	{ .name	    = "set_tid_address", .errpid = true, },
759 	{ .name	    = "setitimer",  .errmsg = true, STRARRAY(0, which, itimers), },
760 	{ .name	    = "setpgid",    .errmsg = true, },
761 	{ .name	    = "setrlimit",  .errmsg = true, STRARRAY(0, resource, rlimit_resources), },
762 	{ .name	    = "setxattr",   .errmsg = true, },
763 	{ .name	    = "shutdown",   .errmsg = true, },
764 	{ .name	    = "socket",	    .errmsg = true,
765 	  .arg_scnprintf = { [0] = SCA_STRARRAY, /* family */
766 			     [1] = SCA_SK_TYPE, /* type */ },
767 	  .arg_parm	 = { [0] = &strarray__socket_families, /* family */ }, },
768 	{ .name	    = "socketpair", .errmsg = true,
769 	  .arg_scnprintf = { [0] = SCA_STRARRAY, /* family */
770 			     [1] = SCA_SK_TYPE, /* type */ },
771 	  .arg_parm	 = { [0] = &strarray__socket_families, /* family */ }, },
772 	{ .name	    = "stat",	    .errmsg = true, .alias = "newstat", },
773 	{ .name	    = "statfs",	    .errmsg = true, },
774 	{ .name	    = "swapoff",    .errmsg = true,
775 	  .arg_scnprintf = { [0] = SCA_FILENAME, /* specialfile */ }, },
776 	{ .name	    = "swapon",	    .errmsg = true,
777 	  .arg_scnprintf = { [0] = SCA_FILENAME, /* specialfile */ }, },
778 	{ .name	    = "symlinkat",  .errmsg = true,
779 	  .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
780 	{ .name	    = "tgkill",	    .errmsg = true,
781 	  .arg_scnprintf = { [2] = SCA_SIGNUM, /* sig */ }, },
782 	{ .name	    = "tkill",	    .errmsg = true,
783 	  .arg_scnprintf = { [1] = SCA_SIGNUM, /* sig */ }, },
784 	{ .name	    = "truncate",   .errmsg = true, },
785 	{ .name	    = "uname",	    .errmsg = true, .alias = "newuname", },
786 	{ .name	    = "unlinkat",   .errmsg = true,
787 	  .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
788 	{ .name	    = "utime",  .errmsg = true, },
789 	{ .name	    = "utimensat",  .errmsg = true,
790 	  .arg_scnprintf = { [0] = SCA_FDAT, /* dirfd */ }, },
791 	{ .name	    = "utimes",  .errmsg = true, },
792 	{ .name	    = "vmsplice",  .errmsg = true, },
793 	{ .name	    = "wait4",	    .errpid = true,
794 	  .arg_scnprintf = { [2] = SCA_WAITID_OPTIONS, /* options */ }, },
795 	{ .name	    = "waitid",	    .errpid = true,
796 	  .arg_scnprintf = { [3] = SCA_WAITID_OPTIONS, /* options */ }, },
797 	{ .name	    = "write",	    .errmsg = true, },
798 	{ .name	    = "writev",	    .errmsg = true, },
799 };
800 
801 static int syscall_fmt__cmp(const void *name, const void *fmtp)
802 {
803 	const struct syscall_fmt *fmt = fmtp;
804 	return strcmp(name, fmt->name);
805 }
806 
807 static struct syscall_fmt *syscall_fmt__find(const char *name)
808 {
809 	const int nmemb = ARRAY_SIZE(syscall_fmts);
810 	return bsearch(name, syscall_fmts, nmemb, sizeof(struct syscall_fmt), syscall_fmt__cmp);
811 }
812 
/*
 * Description of one syscall (element type of trace->syscalls.table).
 *
 * NOTE(review): the code filling this in is outside this part of the file;
 * the notes below rely on the member types only.
 */
struct syscall {
	struct event_format *tp_format;
	int		    nr_args;
	struct format_field *args;
	const char	    *name;
	bool		    is_exit;
	struct syscall_fmt  *fmt;	/* presumably the syscall_fmts entry, if any - TODO confirm */
	/* Arrays of argument formatters and of the data passed to them. */
	size_t		    (**arg_scnprintf)(char *bf, size_t size, struct syscall_arg *arg);
	void		    **arg_parm;
};
823 
824 static size_t fprintf_duration(unsigned long t, FILE *fp)
825 {
826 	double duration = (double)t / NSEC_PER_MSEC;
827 	size_t printed = fprintf(fp, "(");
828 
829 	if (duration >= 1.0)
830 		printed += color_fprintf(fp, PERF_COLOR_RED, "%6.3f ms", duration);
831 	else if (duration >= 0.01)
832 		printed += color_fprintf(fp, PERF_COLOR_YELLOW, "%6.3f ms", duration);
833 	else
834 		printed += color_fprintf(fp, PERF_COLOR_NORMAL, "%6.3f ms", duration);
835 	return printed + fprintf(fp, "): ");
836 }
837 
/**
 * struct thread_trace - per thread tracing state, kept in the thread's priv
 *			 pointer (see thread__trace()).
 *
 * nr_events: bumped each time thread__trace() is called for the thread
 * filename.ptr: The filename char pointer that will be vfs_getname'd
 * filename.entry_str_pos: Where to insert the string translated from
 *                         filename.ptr by the vfs_getname tracepoint/kprobe.
 * paths.table: pathnames indexed by file descriptor, valid for indexes
 *		0 .. paths.max; paths.max is -1 while the table is empty
 *		(see trace__set_fd_pathname()).
 * syscall_stats: intlist created in thread_trace__new()
 */
struct thread_trace {
	u64		  entry_time;
	bool		  entry_pending;
	unsigned long	  nr_events;
	unsigned long	  pfmaj, pfmin;
	char		  *entry_str;
	double		  runtime_ms;
        struct {
		unsigned long ptr;
		short int     entry_str_pos;
		bool	      pending_open;
		unsigned int  namelen;
		char	      *name;
	} filename;
	struct {
		int	  max;
		char	  **table;
	} paths;

	struct intlist *syscall_stats;
};
864 
865 static struct thread_trace *thread_trace__new(void)
866 {
867 	struct thread_trace *ttrace =  zalloc(sizeof(struct thread_trace));
868 
869 	if (ttrace)
870 		ttrace->paths.max = -1;
871 
872 	ttrace->syscall_stats = intlist__new(NULL);
873 
874 	return ttrace;
875 }
876 
877 static struct thread_trace *thread__trace(struct thread *thread, FILE *fp)
878 {
879 	struct thread_trace *ttrace;
880 
881 	if (thread == NULL)
882 		goto fail;
883 
884 	if (thread__priv(thread) == NULL)
885 		thread__set_priv(thread, thread_trace__new());
886 
887 	if (thread__priv(thread) == NULL)
888 		goto fail;
889 
890 	ttrace = thread__priv(thread);
891 	++ttrace->nr_events;
892 
893 	return ttrace;
894 fail:
895 	color_fprintf(fp, PERF_COLOR_RED,
896 		      "WARNING: not enough memory, dropping samples!\n");
897 	return NULL;
898 }
899 
/* Page fault kind bits: one for major faults, one for minor faults. */
#define TRACE_PFMAJ	(1 << 0)
#define TRACE_PFMIN	(1 << 1)

/* Size in bytes of the per thread buffer holding a formatted syscall entry. */
static const size_t trace__entry_str_size = 2048;
904 
905 static int trace__set_fd_pathname(struct thread *thread, int fd, const char *pathname)
906 {
907 	struct thread_trace *ttrace = thread__priv(thread);
908 
909 	if (fd > ttrace->paths.max) {
910 		char **npath = realloc(ttrace->paths.table, (fd + 1) * sizeof(char *));
911 
912 		if (npath == NULL)
913 			return -1;
914 
915 		if (ttrace->paths.max != -1) {
916 			memset(npath + ttrace->paths.max + 1, 0,
917 			       (fd - ttrace->paths.max) * sizeof(char *));
918 		} else {
919 			memset(npath, 0, (fd + 1) * sizeof(char *));
920 		}
921 
922 		ttrace->paths.table = npath;
923 		ttrace->paths.max   = fd;
924 	}
925 
926 	ttrace->paths.table[fd] = strdup(pathname);
927 
928 	return ttrace->paths.table[fd] != NULL ? 0 : -1;
929 }
930 
931 static int thread__read_fd_path(struct thread *thread, int fd)
932 {
933 	char linkname[PATH_MAX], pathname[PATH_MAX];
934 	struct stat st;
935 	int ret;
936 
937 	if (thread->pid_ == thread->tid) {
938 		scnprintf(linkname, sizeof(linkname),
939 			  "/proc/%d/fd/%d", thread->pid_, fd);
940 	} else {
941 		scnprintf(linkname, sizeof(linkname),
942 			  "/proc/%d/task/%d/fd/%d", thread->pid_, thread->tid, fd);
943 	}
944 
945 	if (lstat(linkname, &st) < 0 || st.st_size + 1 > (off_t)sizeof(pathname))
946 		return -1;
947 
948 	ret = readlink(linkname, pathname, sizeof(pathname));
949 
950 	if (ret < 0 || ret > st.st_size)
951 		return -1;
952 
953 	pathname[ret] = '\0';
954 	return trace__set_fd_pathname(thread, fd, pathname);
955 }
956 
957 static const char *thread__fd_path(struct thread *thread, int fd,
958 				   struct trace *trace)
959 {
960 	struct thread_trace *ttrace = thread__priv(thread);
961 
962 	if (ttrace == NULL)
963 		return NULL;
964 
965 	if (fd < 0)
966 		return NULL;
967 
968 	if ((fd > ttrace->paths.max || ttrace->paths.table[fd] == NULL)) {
969 		if (!trace->live)
970 			return NULL;
971 		++trace->stats.proc_getname;
972 		if (thread__read_fd_path(thread, fd))
973 			return NULL;
974 	}
975 
976 	return ttrace->paths.table[fd];
977 }
978 
979 static size_t syscall_arg__scnprintf_fd(char *bf, size_t size,
980 					struct syscall_arg *arg)
981 {
982 	int fd = arg->val;
983 	size_t printed = scnprintf(bf, size, "%d", fd);
984 	const char *path = thread__fd_path(arg->thread, fd, arg->trace);
985 
986 	if (path)
987 		printed += scnprintf(bf + printed, size - printed, "<%s>", path);
988 
989 	return printed;
990 }
991 
992 static size_t syscall_arg__scnprintf_close_fd(char *bf, size_t size,
993 					      struct syscall_arg *arg)
994 {
995 	int fd = arg->val;
996 	size_t printed = syscall_arg__scnprintf_fd(bf, size, arg);
997 	struct thread_trace *ttrace = thread__priv(arg->thread);
998 
999 	if (ttrace && fd >= 0 && fd <= ttrace->paths.max)
1000 		zfree(&ttrace->paths.table[fd]);
1001 
1002 	return printed;
1003 }
1004 
1005 static void thread__set_filename_pos(struct thread *thread, const char *bf,
1006 				     unsigned long ptr)
1007 {
1008 	struct thread_trace *ttrace = thread__priv(thread);
1009 
1010 	ttrace->filename.ptr = ptr;
1011 	ttrace->filename.entry_str_pos = bf - ttrace->entry_str;
1012 }
1013 
1014 static size_t syscall_arg__scnprintf_filename(char *bf, size_t size,
1015 					      struct syscall_arg *arg)
1016 {
1017 	unsigned long ptr = arg->val;
1018 
1019 	if (!arg->trace->vfs_getname)
1020 		return scnprintf(bf, size, "%#x", ptr);
1021 
1022 	thread__set_filename_pos(arg->thread, bf, ptr);
1023 	return 0;
1024 }
1025 
1026 static bool trace__filter_duration(struct trace *trace, double t)
1027 {
1028 	return t < (trace->duration_filter * NSEC_PER_MSEC);
1029 }
1030 
1031 static size_t trace__fprintf_tstamp(struct trace *trace, u64 tstamp, FILE *fp)
1032 {
1033 	double ts = (double)(tstamp - trace->base_time) / NSEC_PER_MSEC;
1034 
1035 	return fprintf(fp, "%10.3f ", ts);
1036 }
1037 
/*
 * Both flags are written by sig_handler(), i.e. from signal context, and
 * read by the rest of the program.  volatile sig_atomic_t is the type the
 * C standard provides for that; a plain bool carries no such guarantee.
 */
static volatile sig_atomic_t done = false;
static volatile sig_atomic_t interrupted = false;

/*
 * Signal handler: flag that tracing should stop, and record whether the
 * signal was SIGINT.
 */
static void sig_handler(int sig)
{
	done = true;
	interrupted = sig == SIGINT;
}
1046 
1047 static size_t trace__fprintf_entry_head(struct trace *trace, struct thread *thread,
1048 					u64 duration, u64 tstamp, FILE *fp)
1049 {
1050 	size_t printed = trace__fprintf_tstamp(trace, tstamp, fp);
1051 	printed += fprintf_duration(duration, fp);
1052 
1053 	if (trace->multiple_threads) {
1054 		if (trace->show_comm)
1055 			printed += fprintf(fp, "%.14s/", thread__comm_str(thread));
1056 		printed += fprintf(fp, "%d ", thread->tid);
1057 	}
1058 
1059 	return printed;
1060 }
1061 
1062 static int trace__process_event(struct trace *trace, struct machine *machine,
1063 				union perf_event *event, struct perf_sample *sample)
1064 {
1065 	int ret = 0;
1066 
1067 	switch (event->header.type) {
1068 	case PERF_RECORD_LOST:
1069 		color_fprintf(trace->output, PERF_COLOR_RED,
1070 			      "LOST %" PRIu64 " events!\n", event->lost.lost);
1071 		ret = machine__process_lost_event(machine, event, sample);
1072 		break;
1073 	default:
1074 		ret = machine__process_event(machine, event, sample);
1075 		break;
1076 	}
1077 
1078 	return ret;
1079 }
1080 
1081 static int trace__tool_process(struct perf_tool *tool,
1082 			       union perf_event *event,
1083 			       struct perf_sample *sample,
1084 			       struct machine *machine)
1085 {
1086 	struct trace *trace = container_of(tool, struct trace, tool);
1087 	return trace__process_event(trace, machine, event, sample);
1088 }
1089 
1090 static char *trace__machine__resolve_kernel_addr(void *vmachine, unsigned long long *addrp, char **modp)
1091 {
1092 	struct machine *machine = vmachine;
1093 
1094 	if (machine->kptr_restrict_warned)
1095 		return NULL;
1096 
1097 	if (symbol_conf.kptr_restrict) {
1098 		pr_warning("Kernel address maps (/proc/{kallsyms,modules}) are restricted.\n\n"
1099 			   "Check /proc/sys/kernel/kptr_restrict.\n\n"
1100 			   "Kernel samples will not be resolved.\n");
1101 		machine->kptr_restrict_warned = true;
1102 		return NULL;
1103 	}
1104 
1105 	return machine__resolve_kernel_addr(vmachine, addrp, modp);
1106 }
1107 
1108 static int trace__symbols_init(struct trace *trace, struct perf_evlist *evlist)
1109 {
1110 	int err = symbol__init(NULL);
1111 
1112 	if (err)
1113 		return err;
1114 
1115 	trace->host = machine__new_host();
1116 	if (trace->host == NULL)
1117 		return -ENOMEM;
1118 
1119 	if (trace_event__register_resolver(trace->host, trace__machine__resolve_kernel_addr) < 0)
1120 		return -errno;
1121 
1122 	err = __machine__synthesize_threads(trace->host, &trace->tool, &trace->opts.target,
1123 					    evlist->threads, trace__tool_process, false,
1124 					    trace->opts.proc_map_timeout);
1125 	if (err)
1126 		symbol__exit();
1127 
1128 	return err;
1129 }
1130 
1131 static int syscall__set_arg_fmts(struct syscall *sc)
1132 {
1133 	struct format_field *field;
1134 	int idx = 0, len;
1135 
1136 	sc->arg_scnprintf = calloc(sc->nr_args, sizeof(void *));
1137 	if (sc->arg_scnprintf == NULL)
1138 		return -1;
1139 
1140 	if (sc->fmt)
1141 		sc->arg_parm = sc->fmt->arg_parm;
1142 
1143 	for (field = sc->args; field; field = field->next) {
1144 		if (sc->fmt && sc->fmt->arg_scnprintf[idx])
1145 			sc->arg_scnprintf[idx] = sc->fmt->arg_scnprintf[idx];
1146 		else if (strcmp(field->type, "const char *") == 0 &&
1147 			 (strcmp(field->name, "filename") == 0 ||
1148 			  strcmp(field->name, "path") == 0 ||
1149 			  strcmp(field->name, "pathname") == 0))
1150 			sc->arg_scnprintf[idx] = SCA_FILENAME;
1151 		else if (field->flags & FIELD_IS_POINTER)
1152 			sc->arg_scnprintf[idx] = syscall_arg__scnprintf_hex;
1153 		else if (strcmp(field->type, "pid_t") == 0)
1154 			sc->arg_scnprintf[idx] = SCA_PID;
1155 		else if (strcmp(field->type, "umode_t") == 0)
1156 			sc->arg_scnprintf[idx] = SCA_MODE_T;
1157 		else if ((strcmp(field->type, "int") == 0 ||
1158 			  strcmp(field->type, "unsigned int") == 0 ||
1159 			  strcmp(field->type, "long") == 0) &&
1160 			 (len = strlen(field->name)) >= 2 &&
1161 			 strcmp(field->name + len - 2, "fd") == 0) {
1162 			/*
1163 			 * /sys/kernel/tracing/events/syscalls/sys_enter*
1164 			 * egrep 'field:.*fd;' .../format|sed -r 's/.*field:([a-z ]+) [a-z_]*fd.+/\1/g'|sort|uniq -c
1165 			 * 65 int
1166 			 * 23 unsigned int
1167 			 * 7 unsigned long
1168 			 */
1169 			sc->arg_scnprintf[idx] = SCA_FD;
1170 		}
1171 		++idx;
1172 	}
1173 
1174 	return 0;
1175 }
1176 
1177 static int trace__read_syscall_info(struct trace *trace, int id)
1178 {
1179 	char tp_name[128];
1180 	struct syscall *sc;
1181 	const char *name = syscalltbl__name(trace->sctbl, id);
1182 
1183 	if (name == NULL)
1184 		return -1;
1185 
1186 	if (id > trace->syscalls.max) {
1187 		struct syscall *nsyscalls = realloc(trace->syscalls.table, (id + 1) * sizeof(*sc));
1188 
1189 		if (nsyscalls == NULL)
1190 			return -1;
1191 
1192 		if (trace->syscalls.max != -1) {
1193 			memset(nsyscalls + trace->syscalls.max + 1, 0,
1194 			       (id - trace->syscalls.max) * sizeof(*sc));
1195 		} else {
1196 			memset(nsyscalls, 0, (id + 1) * sizeof(*sc));
1197 		}
1198 
1199 		trace->syscalls.table = nsyscalls;
1200 		trace->syscalls.max   = id;
1201 	}
1202 
1203 	sc = trace->syscalls.table + id;
1204 	sc->name = name;
1205 
1206 	sc->fmt  = syscall_fmt__find(sc->name);
1207 
1208 	snprintf(tp_name, sizeof(tp_name), "sys_enter_%s", sc->name);
1209 	sc->tp_format = trace_event__tp_format("syscalls", tp_name);
1210 
1211 	if (IS_ERR(sc->tp_format) && sc->fmt && sc->fmt->alias) {
1212 		snprintf(tp_name, sizeof(tp_name), "sys_enter_%s", sc->fmt->alias);
1213 		sc->tp_format = trace_event__tp_format("syscalls", tp_name);
1214 	}
1215 
1216 	if (IS_ERR(sc->tp_format))
1217 		return -1;
1218 
1219 	sc->args = sc->tp_format->format.fields;
1220 	sc->nr_args = sc->tp_format->format.nr_fields;
1221 	/*
1222 	 * We need to check and discard the first variable '__syscall_nr'
1223 	 * or 'nr' that mean the syscall number. It is needless here.
1224 	 * So drop '__syscall_nr' or 'nr' field but does not exist on older kernels.
1225 	 */
1226 	if (sc->args && (!strcmp(sc->args->name, "__syscall_nr") || !strcmp(sc->args->name, "nr"))) {
1227 		sc->args = sc->args->next;
1228 		--sc->nr_args;
1229 	}
1230 
1231 	sc->is_exit = !strcmp(name, "exit_group") || !strcmp(name, "exit");
1232 
1233 	return syscall__set_arg_fmts(sc);
1234 }
1235 
1236 static int trace__validate_ev_qualifier(struct trace *trace)
1237 {
1238 	int err = 0, i;
1239 	struct str_node *pos;
1240 
1241 	trace->ev_qualifier_ids.nr = strlist__nr_entries(trace->ev_qualifier);
1242 	trace->ev_qualifier_ids.entries = malloc(trace->ev_qualifier_ids.nr *
1243 						 sizeof(trace->ev_qualifier_ids.entries[0]));
1244 
1245 	if (trace->ev_qualifier_ids.entries == NULL) {
1246 		fputs("Error:\tNot enough memory for allocating events qualifier ids\n",
1247 		       trace->output);
1248 		err = -EINVAL;
1249 		goto out;
1250 	}
1251 
1252 	i = 0;
1253 
1254 	strlist__for_each_entry(pos, trace->ev_qualifier) {
1255 		const char *sc = pos->s;
1256 		int id = syscalltbl__id(trace->sctbl, sc);
1257 
1258 		if (id < 0) {
1259 			if (err == 0) {
1260 				fputs("Error:\tInvalid syscall ", trace->output);
1261 				err = -EINVAL;
1262 			} else {
1263 				fputs(", ", trace->output);
1264 			}
1265 
1266 			fputs(sc, trace->output);
1267 		}
1268 
1269 		trace->ev_qualifier_ids.entries[i++] = id;
1270 	}
1271 
1272 	if (err < 0) {
1273 		fputs("\nHint:\ttry 'perf list syscalls:sys_enter_*'"
1274 		      "\nHint:\tand: 'man syscalls'\n", trace->output);
1275 		zfree(&trace->ev_qualifier_ids.entries);
1276 		trace->ev_qualifier_ids.nr = 0;
1277 	}
1278 out:
1279 	return err;
1280 }
1281 
1282 /*
1283  * args is to be interpreted as a series of longs but we need to handle
1284  * 8-byte unaligned accesses. args points to raw_data within the event
1285  * and raw_data is guaranteed to be 8-byte unaligned because it is
1286  * preceded by raw_size which is a u32. So we need to copy args to a temp
1287  * variable to read it. Most notably this avoids extended load instructions
1288  * on unaligned addresses
1289  */
1290 
1291 static size_t syscall__scnprintf_args(struct syscall *sc, char *bf, size_t size,
1292 				      unsigned char *args, struct trace *trace,
1293 				      struct thread *thread)
1294 {
1295 	size_t printed = 0;
1296 	unsigned char *p;
1297 	unsigned long val;
1298 
1299 	if (sc->args != NULL) {
1300 		struct format_field *field;
1301 		u8 bit = 1;
1302 		struct syscall_arg arg = {
1303 			.idx	= 0,
1304 			.mask	= 0,
1305 			.trace  = trace,
1306 			.thread = thread,
1307 		};
1308 
1309 		for (field = sc->args; field;
1310 		     field = field->next, ++arg.idx, bit <<= 1) {
1311 			if (arg.mask & bit)
1312 				continue;
1313 
1314 			/* special care for unaligned accesses */
1315 			p = args + sizeof(unsigned long) * arg.idx;
1316 			memcpy(&val, p, sizeof(val));
1317 
1318 			/*
1319  			 * Suppress this argument if its value is zero and
1320  			 * and we don't have a string associated in an
1321  			 * strarray for it.
1322  			 */
1323 			if (val == 0 &&
1324 			    !(sc->arg_scnprintf &&
1325 			      sc->arg_scnprintf[arg.idx] == SCA_STRARRAY &&
1326 			      sc->arg_parm[arg.idx]))
1327 				continue;
1328 
1329 			printed += scnprintf(bf + printed, size - printed,
1330 					     "%s%s: ", printed ? ", " : "", field->name);
1331 			if (sc->arg_scnprintf && sc->arg_scnprintf[arg.idx]) {
1332 				arg.val = val;
1333 				if (sc->arg_parm)
1334 					arg.parm = sc->arg_parm[arg.idx];
1335 				printed += sc->arg_scnprintf[arg.idx](bf + printed,
1336 								      size - printed, &arg);
1337 			} else {
1338 				printed += scnprintf(bf + printed, size - printed,
1339 						     "%ld", val);
1340 			}
1341 		}
1342 	} else if (IS_ERR(sc->tp_format)) {
1343 		/*
1344 		 * If we managed to read the tracepoint /format file, then we
1345 		 * may end up not having any args, like with gettid(), so only
1346 		 * print the raw args when we didn't manage to read it.
1347 		 */
1348 		int i = 0;
1349 
1350 		while (i < 6) {
1351 			/* special care for unaligned accesses */
1352 			p = args + sizeof(unsigned long) * i;
1353 			memcpy(&val, p, sizeof(val));
1354 			printed += scnprintf(bf + printed, size - printed,
1355 					     "%sarg%d: %ld",
1356 					     printed ? ", " : "", i, val);
1357 			++i;
1358 		}
1359 	}
1360 
1361 	return printed;
1362 }
1363 
/*
 * Signature of the per event handlers in this file, e.g.
 * trace__sys_enter() and trace__sys_exit().  trace__process_sample()
 * calls the one stored in evsel->handler.
 */
typedef int (*tracepoint_handler)(struct trace *trace,
				  struct perf_evsel *evsel,
				  union perf_event *event,
				  struct perf_sample *sample);
1367 
1368 static struct syscall *trace__syscall_info(struct trace *trace,
1369 					   struct perf_evsel *evsel, int id)
1370 {
1371 
1372 	if (id < 0) {
1373 
1374 		/*
1375 		 * XXX: Noticed on x86_64, reproduced as far back as 3.0.36, haven't tried
1376 		 * before that, leaving at a higher verbosity level till that is
1377 		 * explained. Reproduced with plain ftrace with:
1378 		 *
1379 		 * echo 1 > /t/events/raw_syscalls/sys_exit/enable
1380 		 * grep "NR -1 " /t/trace_pipe
1381 		 *
1382 		 * After generating some load on the machine.
1383  		 */
1384 		if (verbose > 1) {
1385 			static u64 n;
1386 			fprintf(trace->output, "Invalid syscall %d id, skipping (%s, %" PRIu64 ") ...\n",
1387 				id, perf_evsel__name(evsel), ++n);
1388 		}
1389 		return NULL;
1390 	}
1391 
1392 	if ((id > trace->syscalls.max || trace->syscalls.table[id].name == NULL) &&
1393 	    trace__read_syscall_info(trace, id))
1394 		goto out_cant_read;
1395 
1396 	if ((id > trace->syscalls.max || trace->syscalls.table[id].name == NULL))
1397 		goto out_cant_read;
1398 
1399 	return &trace->syscalls.table[id];
1400 
1401 out_cant_read:
1402 	if (verbose > 0) {
1403 		fprintf(trace->output, "Problems reading syscall %d", id);
1404 		if (id <= trace->syscalls.max && trace->syscalls.table[id].name != NULL)
1405 			fprintf(trace->output, "(%s)", trace->syscalls.table[id].name);
1406 		fputs(" information\n", trace->output);
1407 	}
1408 	return NULL;
1409 }
1410 
1411 static void thread__update_stats(struct thread_trace *ttrace,
1412 				 int id, struct perf_sample *sample)
1413 {
1414 	struct int_node *inode;
1415 	struct stats *stats;
1416 	u64 duration = 0;
1417 
1418 	inode = intlist__findnew(ttrace->syscall_stats, id);
1419 	if (inode == NULL)
1420 		return;
1421 
1422 	stats = inode->priv;
1423 	if (stats == NULL) {
1424 		stats = malloc(sizeof(struct stats));
1425 		if (stats == NULL)
1426 			return;
1427 		init_stats(stats);
1428 		inode->priv = stats;
1429 	}
1430 
1431 	if (ttrace->entry_time && sample->time > ttrace->entry_time)
1432 		duration = sample->time - ttrace->entry_time;
1433 
1434 	update_stats(stats, duration);
1435 }
1436 
1437 static int trace__printf_interrupted_entry(struct trace *trace, struct perf_sample *sample)
1438 {
1439 	struct thread_trace *ttrace;
1440 	u64 duration;
1441 	size_t printed;
1442 
1443 	if (trace->current == NULL)
1444 		return 0;
1445 
1446 	ttrace = thread__priv(trace->current);
1447 
1448 	if (!ttrace->entry_pending)
1449 		return 0;
1450 
1451 	duration = sample->time - ttrace->entry_time;
1452 
1453 	printed  = trace__fprintf_entry_head(trace, trace->current, duration, ttrace->entry_time, trace->output);
1454 	printed += fprintf(trace->output, "%-70s) ...\n", ttrace->entry_str);
1455 	ttrace->entry_pending = false;
1456 
1457 	return printed;
1458 }
1459 
1460 static int trace__sys_enter(struct trace *trace, struct perf_evsel *evsel,
1461 			    union perf_event *event __maybe_unused,
1462 			    struct perf_sample *sample)
1463 {
1464 	char *msg;
1465 	void *args;
1466 	size_t printed = 0;
1467 	struct thread *thread;
1468 	int id = perf_evsel__sc_tp_uint(evsel, id, sample), err = -1;
1469 	struct syscall *sc = trace__syscall_info(trace, evsel, id);
1470 	struct thread_trace *ttrace;
1471 
1472 	if (sc == NULL)
1473 		return -1;
1474 
1475 	thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
1476 	ttrace = thread__trace(thread, trace->output);
1477 	if (ttrace == NULL)
1478 		goto out_put;
1479 
1480 	args = perf_evsel__sc_tp_ptr(evsel, args, sample);
1481 
1482 	if (ttrace->entry_str == NULL) {
1483 		ttrace->entry_str = malloc(trace__entry_str_size);
1484 		if (!ttrace->entry_str)
1485 			goto out_put;
1486 	}
1487 
1488 	if (!(trace->duration_filter || trace->summary_only || trace->min_stack))
1489 		trace__printf_interrupted_entry(trace, sample);
1490 
1491 	ttrace->entry_time = sample->time;
1492 	msg = ttrace->entry_str;
1493 	printed += scnprintf(msg + printed, trace__entry_str_size - printed, "%s(", sc->name);
1494 
1495 	printed += syscall__scnprintf_args(sc, msg + printed, trace__entry_str_size - printed,
1496 					   args, trace, thread);
1497 
1498 	if (sc->is_exit) {
1499 		if (!(trace->duration_filter || trace->summary_only || trace->min_stack)) {
1500 			trace__fprintf_entry_head(trace, thread, 1, ttrace->entry_time, trace->output);
1501 			fprintf(trace->output, "%-70s)\n", ttrace->entry_str);
1502 		}
1503 	} else {
1504 		ttrace->entry_pending = true;
1505 		/* See trace__vfs_getname & trace__sys_exit */
1506 		ttrace->filename.pending_open = false;
1507 	}
1508 
1509 	if (trace->current != thread) {
1510 		thread__put(trace->current);
1511 		trace->current = thread__get(thread);
1512 	}
1513 	err = 0;
1514 out_put:
1515 	thread__put(thread);
1516 	return err;
1517 }
1518 
1519 static int trace__resolve_callchain(struct trace *trace, struct perf_evsel *evsel,
1520 				    struct perf_sample *sample,
1521 				    struct callchain_cursor *cursor)
1522 {
1523 	struct addr_location al;
1524 
1525 	if (machine__resolve(trace->host, &al, sample) < 0 ||
1526 	    thread__resolve_callchain(al.thread, cursor, evsel, sample, NULL, NULL, trace->max_stack))
1527 		return -1;
1528 
1529 	return 0;
1530 }
1531 
1532 static int trace__fprintf_callchain(struct trace *trace, struct perf_sample *sample)
1533 {
1534 	/* TODO: user-configurable print_opts */
1535 	const unsigned int print_opts = EVSEL__PRINT_SYM |
1536 				        EVSEL__PRINT_DSO |
1537 				        EVSEL__PRINT_UNKNOWN_AS_ADDR;
1538 
1539 	return sample__fprintf_callchain(sample, 38, print_opts, &callchain_cursor, trace->output);
1540 }
1541 
1542 static int trace__sys_exit(struct trace *trace, struct perf_evsel *evsel,
1543 			   union perf_event *event __maybe_unused,
1544 			   struct perf_sample *sample)
1545 {
1546 	long ret;
1547 	u64 duration = 0;
1548 	struct thread *thread;
1549 	int id = perf_evsel__sc_tp_uint(evsel, id, sample), err = -1, callchain_ret = 0;
1550 	struct syscall *sc = trace__syscall_info(trace, evsel, id);
1551 	struct thread_trace *ttrace;
1552 
1553 	if (sc == NULL)
1554 		return -1;
1555 
1556 	thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
1557 	ttrace = thread__trace(thread, trace->output);
1558 	if (ttrace == NULL)
1559 		goto out_put;
1560 
1561 	if (trace->summary)
1562 		thread__update_stats(ttrace, id, sample);
1563 
1564 	ret = perf_evsel__sc_tp_uint(evsel, ret, sample);
1565 
1566 	if (id == trace->open_id && ret >= 0 && ttrace->filename.pending_open) {
1567 		trace__set_fd_pathname(thread, ret, ttrace->filename.name);
1568 		ttrace->filename.pending_open = false;
1569 		++trace->stats.vfs_getname;
1570 	}
1571 
1572 	if (ttrace->entry_time) {
1573 		duration = sample->time - ttrace->entry_time;
1574 		if (trace__filter_duration(trace, duration))
1575 			goto out;
1576 	} else if (trace->duration_filter)
1577 		goto out;
1578 
1579 	if (sample->callchain) {
1580 		callchain_ret = trace__resolve_callchain(trace, evsel, sample, &callchain_cursor);
1581 		if (callchain_ret == 0) {
1582 			if (callchain_cursor.nr < trace->min_stack)
1583 				goto out;
1584 			callchain_ret = 1;
1585 		}
1586 	}
1587 
1588 	if (trace->summary_only)
1589 		goto out;
1590 
1591 	trace__fprintf_entry_head(trace, thread, duration, ttrace->entry_time, trace->output);
1592 
1593 	if (ttrace->entry_pending) {
1594 		fprintf(trace->output, "%-70s", ttrace->entry_str);
1595 	} else {
1596 		fprintf(trace->output, " ... [");
1597 		color_fprintf(trace->output, PERF_COLOR_YELLOW, "continued");
1598 		fprintf(trace->output, "]: %s()", sc->name);
1599 	}
1600 
1601 	if (sc->fmt == NULL) {
1602 signed_print:
1603 		fprintf(trace->output, ") = %ld", ret);
1604 	} else if (ret < 0 && (sc->fmt->errmsg || sc->fmt->errpid)) {
1605 		char bf[STRERR_BUFSIZE];
1606 		const char *emsg = str_error_r(-ret, bf, sizeof(bf)),
1607 			   *e = audit_errno_to_name(-ret);
1608 
1609 		fprintf(trace->output, ") = -1 %s %s", e, emsg);
1610 	} else if (ret == 0 && sc->fmt->timeout)
1611 		fprintf(trace->output, ") = 0 Timeout");
1612 	else if (sc->fmt->hexret)
1613 		fprintf(trace->output, ") = %#lx", ret);
1614 	else if (sc->fmt->errpid) {
1615 		struct thread *child = machine__find_thread(trace->host, ret, ret);
1616 
1617 		if (child != NULL) {
1618 			fprintf(trace->output, ") = %ld", ret);
1619 			if (child->comm_set)
1620 				fprintf(trace->output, " (%s)", thread__comm_str(child));
1621 			thread__put(child);
1622 		}
1623 	} else
1624 		goto signed_print;
1625 
1626 	fputc('\n', trace->output);
1627 
1628 	if (callchain_ret > 0)
1629 		trace__fprintf_callchain(trace, sample);
1630 	else if (callchain_ret < 0)
1631 		pr_err("Problem processing %s callchain, skipping...\n", perf_evsel__name(evsel));
1632 out:
1633 	ttrace->entry_pending = false;
1634 	err = 0;
1635 out_put:
1636 	thread__put(thread);
1637 	return err;
1638 }
1639 
1640 static int trace__vfs_getname(struct trace *trace, struct perf_evsel *evsel,
1641 			      union perf_event *event __maybe_unused,
1642 			      struct perf_sample *sample)
1643 {
1644 	struct thread *thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
1645 	struct thread_trace *ttrace;
1646 	size_t filename_len, entry_str_len, to_move;
1647 	ssize_t remaining_space;
1648 	char *pos;
1649 	const char *filename = perf_evsel__rawptr(evsel, sample, "pathname");
1650 
1651 	if (!thread)
1652 		goto out;
1653 
1654 	ttrace = thread__priv(thread);
1655 	if (!ttrace)
1656 		goto out;
1657 
1658 	filename_len = strlen(filename);
1659 
1660 	if (ttrace->filename.namelen < filename_len) {
1661 		char *f = realloc(ttrace->filename.name, filename_len + 1);
1662 
1663 		if (f == NULL)
1664 				goto out;
1665 
1666 		ttrace->filename.namelen = filename_len;
1667 		ttrace->filename.name = f;
1668 	}
1669 
1670 	strcpy(ttrace->filename.name, filename);
1671 	ttrace->filename.pending_open = true;
1672 
1673 	if (!ttrace->filename.ptr)
1674 		goto out;
1675 
1676 	entry_str_len = strlen(ttrace->entry_str);
1677 	remaining_space = trace__entry_str_size - entry_str_len - 1; /* \0 */
1678 	if (remaining_space <= 0)
1679 		goto out;
1680 
1681 	if (filename_len > (size_t)remaining_space) {
1682 		filename += filename_len - remaining_space;
1683 		filename_len = remaining_space;
1684 	}
1685 
1686 	to_move = entry_str_len - ttrace->filename.entry_str_pos + 1; /* \0 */
1687 	pos = ttrace->entry_str + ttrace->filename.entry_str_pos;
1688 	memmove(pos + filename_len, pos, to_move);
1689 	memcpy(pos, filename, filename_len);
1690 
1691 	ttrace->filename.ptr = 0;
1692 	ttrace->filename.entry_str_pos = 0;
1693 out:
1694 	return 0;
1695 }
1696 
1697 static int trace__sched_stat_runtime(struct trace *trace, struct perf_evsel *evsel,
1698 				     union perf_event *event __maybe_unused,
1699 				     struct perf_sample *sample)
1700 {
1701         u64 runtime = perf_evsel__intval(evsel, sample, "runtime");
1702 	double runtime_ms = (double)runtime / NSEC_PER_MSEC;
1703 	struct thread *thread = machine__findnew_thread(trace->host,
1704 							sample->pid,
1705 							sample->tid);
1706 	struct thread_trace *ttrace = thread__trace(thread, trace->output);
1707 
1708 	if (ttrace == NULL)
1709 		goto out_dump;
1710 
1711 	ttrace->runtime_ms += runtime_ms;
1712 	trace->runtime_ms += runtime_ms;
1713 	thread__put(thread);
1714 	return 0;
1715 
1716 out_dump:
1717 	fprintf(trace->output, "%s: comm=%s,pid=%u,runtime=%" PRIu64 ",vruntime=%" PRIu64 ")\n",
1718 	       evsel->name,
1719 	       perf_evsel__strval(evsel, sample, "comm"),
1720 	       (pid_t)perf_evsel__intval(evsel, sample, "pid"),
1721 	       runtime,
1722 	       perf_evsel__intval(evsel, sample, "vruntime"));
1723 	thread__put(thread);
1724 	return 0;
1725 }
1726 
1727 static void bpf_output__printer(enum binary_printer_ops op,
1728 				unsigned int val, void *extra)
1729 {
1730 	FILE *output = extra;
1731 	unsigned char ch = (unsigned char)val;
1732 
1733 	switch (op) {
1734 	case BINARY_PRINT_CHAR_DATA:
1735 		fprintf(output, "%c", isprint(ch) ? ch : '.');
1736 		break;
1737 	case BINARY_PRINT_DATA_BEGIN:
1738 	case BINARY_PRINT_LINE_BEGIN:
1739 	case BINARY_PRINT_ADDR:
1740 	case BINARY_PRINT_NUM_DATA:
1741 	case BINARY_PRINT_NUM_PAD:
1742 	case BINARY_PRINT_SEP:
1743 	case BINARY_PRINT_CHAR_PAD:
1744 	case BINARY_PRINT_LINE_END:
1745 	case BINARY_PRINT_DATA_END:
1746 	default:
1747 		break;
1748 	}
1749 }
1750 
1751 static void bpf_output__fprintf(struct trace *trace,
1752 				struct perf_sample *sample)
1753 {
1754 	print_binary(sample->raw_data, sample->raw_size, 8,
1755 		     bpf_output__printer, trace->output);
1756 }
1757 
1758 static int trace__event_handler(struct trace *trace, struct perf_evsel *evsel,
1759 				union perf_event *event __maybe_unused,
1760 				struct perf_sample *sample)
1761 {
1762 	int callchain_ret = 0;
1763 
1764 	if (sample->callchain) {
1765 		callchain_ret = trace__resolve_callchain(trace, evsel, sample, &callchain_cursor);
1766 		if (callchain_ret == 0) {
1767 			if (callchain_cursor.nr < trace->min_stack)
1768 				goto out;
1769 			callchain_ret = 1;
1770 		}
1771 	}
1772 
1773 	trace__printf_interrupted_entry(trace, sample);
1774 	trace__fprintf_tstamp(trace, sample->time, trace->output);
1775 
1776 	if (trace->trace_syscalls)
1777 		fprintf(trace->output, "(         ): ");
1778 
1779 	fprintf(trace->output, "%s:", evsel->name);
1780 
1781 	if (perf_evsel__is_bpf_output(evsel)) {
1782 		bpf_output__fprintf(trace, sample);
1783 	} else if (evsel->tp_format) {
1784 		event_format__fprintf(evsel->tp_format, sample->cpu,
1785 				      sample->raw_data, sample->raw_size,
1786 				      trace->output);
1787 	}
1788 
1789 	fprintf(trace->output, ")\n");
1790 
1791 	if (callchain_ret > 0)
1792 		trace__fprintf_callchain(trace, sample);
1793 	else if (callchain_ret < 0)
1794 		pr_err("Problem processing %s callchain, skipping...\n", perf_evsel__name(evsel));
1795 out:
1796 	return 0;
1797 }
1798 
1799 static void print_location(FILE *f, struct perf_sample *sample,
1800 			   struct addr_location *al,
1801 			   bool print_dso, bool print_sym)
1802 {
1803 
1804 	if ((verbose > 0 || print_dso) && al->map)
1805 		fprintf(f, "%s@", al->map->dso->long_name);
1806 
1807 	if ((verbose > 0 || print_sym) && al->sym)
1808 		fprintf(f, "%s+0x%" PRIx64, al->sym->name,
1809 			al->addr - al->sym->start);
1810 	else if (al->map)
1811 		fprintf(f, "0x%" PRIx64, al->addr);
1812 	else
1813 		fprintf(f, "0x%" PRIx64, sample->addr);
1814 }
1815 
1816 static int trace__pgfault(struct trace *trace,
1817 			  struct perf_evsel *evsel,
1818 			  union perf_event *event __maybe_unused,
1819 			  struct perf_sample *sample)
1820 {
1821 	struct thread *thread;
1822 	struct addr_location al;
1823 	char map_type = 'd';
1824 	struct thread_trace *ttrace;
1825 	int err = -1;
1826 	int callchain_ret = 0;
1827 
1828 	thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
1829 
1830 	if (sample->callchain) {
1831 		callchain_ret = trace__resolve_callchain(trace, evsel, sample, &callchain_cursor);
1832 		if (callchain_ret == 0) {
1833 			if (callchain_cursor.nr < trace->min_stack)
1834 				goto out_put;
1835 			callchain_ret = 1;
1836 		}
1837 	}
1838 
1839 	ttrace = thread__trace(thread, trace->output);
1840 	if (ttrace == NULL)
1841 		goto out_put;
1842 
1843 	if (evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ)
1844 		ttrace->pfmaj++;
1845 	else
1846 		ttrace->pfmin++;
1847 
1848 	if (trace->summary_only)
1849 		goto out;
1850 
1851 	thread__find_addr_location(thread, sample->cpumode, MAP__FUNCTION,
1852 			      sample->ip, &al);
1853 
1854 	trace__fprintf_entry_head(trace, thread, 0, sample->time, trace->output);
1855 
1856 	fprintf(trace->output, "%sfault [",
1857 		evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ ?
1858 		"maj" : "min");
1859 
1860 	print_location(trace->output, sample, &al, false, true);
1861 
1862 	fprintf(trace->output, "] => ");
1863 
1864 	thread__find_addr_location(thread, sample->cpumode, MAP__VARIABLE,
1865 				   sample->addr, &al);
1866 
1867 	if (!al.map) {
1868 		thread__find_addr_location(thread, sample->cpumode,
1869 					   MAP__FUNCTION, sample->addr, &al);
1870 
1871 		if (al.map)
1872 			map_type = 'x';
1873 		else
1874 			map_type = '?';
1875 	}
1876 
1877 	print_location(trace->output, sample, &al, true, false);
1878 
1879 	fprintf(trace->output, " (%c%c)\n", map_type, al.level);
1880 
1881 	if (callchain_ret > 0)
1882 		trace__fprintf_callchain(trace, sample);
1883 	else if (callchain_ret < 0)
1884 		pr_err("Problem processing %s callchain, skipping...\n", perf_evsel__name(evsel));
1885 out:
1886 	err = 0;
1887 out_put:
1888 	thread__put(thread);
1889 	return err;
1890 }
1891 
1892 static void trace__set_base_time(struct trace *trace,
1893 				 struct perf_evsel *evsel,
1894 				 struct perf_sample *sample)
1895 {
1896 	/*
1897 	 * BPF events were not setting PERF_SAMPLE_TIME, so be more robust
1898 	 * and don't use sample->time unconditionally, we may end up having
1899 	 * some other event in the future without PERF_SAMPLE_TIME for good
1900 	 * reason, i.e. we may not be interested in its timestamps, just in
1901 	 * it taking place, picking some piece of information when it
1902 	 * appears in our event stream (vfs_getname comes to mind).
1903 	 */
1904 	if (trace->base_time == 0 && !trace->full_time &&
1905 	    (evsel->attr.sample_type & PERF_SAMPLE_TIME))
1906 		trace->base_time = sample->time;
1907 }
1908 
1909 static int trace__process_sample(struct perf_tool *tool,
1910 				 union perf_event *event,
1911 				 struct perf_sample *sample,
1912 				 struct perf_evsel *evsel,
1913 				 struct machine *machine __maybe_unused)
1914 {
1915 	struct trace *trace = container_of(tool, struct trace, tool);
1916 	struct thread *thread;
1917 	int err = 0;
1918 
1919 	tracepoint_handler handler = evsel->handler;
1920 
1921 	thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
1922 	if (thread && thread__is_filtered(thread))
1923 		return 0;
1924 
1925 	trace__set_base_time(trace, evsel, sample);
1926 
1927 	if (handler) {
1928 		++trace->nr_events;
1929 		handler(trace, evsel, event, sample);
1930 	}
1931 
1932 	return err;
1933 }
1934 
1935 static int trace__record(struct trace *trace, int argc, const char **argv)
1936 {
1937 	unsigned int rec_argc, i, j;
1938 	const char **rec_argv;
1939 	const char * const record_args[] = {
1940 		"record",
1941 		"-R",
1942 		"-m", "1024",
1943 		"-c", "1",
1944 	};
1945 
1946 	const char * const sc_args[] = { "-e", };
1947 	unsigned int sc_args_nr = ARRAY_SIZE(sc_args);
1948 	const char * const majpf_args[] = { "-e", "major-faults" };
1949 	unsigned int majpf_args_nr = ARRAY_SIZE(majpf_args);
1950 	const char * const minpf_args[] = { "-e", "minor-faults" };
1951 	unsigned int minpf_args_nr = ARRAY_SIZE(minpf_args);
1952 
1953 	/* +1 is for the event string below */
1954 	rec_argc = ARRAY_SIZE(record_args) + sc_args_nr + 1 +
1955 		majpf_args_nr + minpf_args_nr + argc;
1956 	rec_argv = calloc(rec_argc + 1, sizeof(char *));
1957 
1958 	if (rec_argv == NULL)
1959 		return -ENOMEM;
1960 
1961 	j = 0;
1962 	for (i = 0; i < ARRAY_SIZE(record_args); i++)
1963 		rec_argv[j++] = record_args[i];
1964 
1965 	if (trace->trace_syscalls) {
1966 		for (i = 0; i < sc_args_nr; i++)
1967 			rec_argv[j++] = sc_args[i];
1968 
1969 		/* event string may be different for older kernels - e.g., RHEL6 */
1970 		if (is_valid_tracepoint("raw_syscalls:sys_enter"))
1971 			rec_argv[j++] = "raw_syscalls:sys_enter,raw_syscalls:sys_exit";
1972 		else if (is_valid_tracepoint("syscalls:sys_enter"))
1973 			rec_argv[j++] = "syscalls:sys_enter,syscalls:sys_exit";
1974 		else {
1975 			pr_err("Neither raw_syscalls nor syscalls events exist.\n");
1976 			return -1;
1977 		}
1978 	}
1979 
1980 	if (trace->trace_pgfaults & TRACE_PFMAJ)
1981 		for (i = 0; i < majpf_args_nr; i++)
1982 			rec_argv[j++] = majpf_args[i];
1983 
1984 	if (trace->trace_pgfaults & TRACE_PFMIN)
1985 		for (i = 0; i < minpf_args_nr; i++)
1986 			rec_argv[j++] = minpf_args[i];
1987 
1988 	for (i = 0; i < (unsigned int)argc; i++)
1989 		rec_argv[j++] = argv[i];
1990 
1991 	return cmd_record(j, rec_argv, NULL);
1992 }
1993 
1994 static size_t trace__fprintf_thread_summary(struct trace *trace, FILE *fp);
1995 
1996 static bool perf_evlist__add_vfs_getname(struct perf_evlist *evlist)
1997 {
1998 	struct perf_evsel *evsel = perf_evsel__newtp("probe", "vfs_getname");
1999 
2000 	if (IS_ERR(evsel))
2001 		return false;
2002 
2003 	if (perf_evsel__field(evsel, "pathname") == NULL) {
2004 		perf_evsel__delete(evsel);
2005 		return false;
2006 	}
2007 
2008 	evsel->handler = trace__vfs_getname;
2009 	perf_evlist__add(evlist, evsel);
2010 	return true;
2011 }
2012 
2013 static struct perf_evsel *perf_evsel__new_pgfault(u64 config)
2014 {
2015 	struct perf_evsel *evsel;
2016 	struct perf_event_attr attr = {
2017 		.type = PERF_TYPE_SOFTWARE,
2018 		.mmap_data = 1,
2019 	};
2020 
2021 	attr.config = config;
2022 	attr.sample_period = 1;
2023 
2024 	event_attr_init(&attr);
2025 
2026 	evsel = perf_evsel__new(&attr);
2027 	if (evsel)
2028 		evsel->handler = trace__pgfault;
2029 
2030 	return evsel;
2031 }
2032 
2033 static void trace__handle_event(struct trace *trace, union perf_event *event, struct perf_sample *sample)
2034 {
2035 	const u32 type = event->header.type;
2036 	struct perf_evsel *evsel;
2037 
2038 	if (type != PERF_RECORD_SAMPLE) {
2039 		trace__process_event(trace, trace->host, event, sample);
2040 		return;
2041 	}
2042 
2043 	evsel = perf_evlist__id2evsel(trace->evlist, sample->id);
2044 	if (evsel == NULL) {
2045 		fprintf(trace->output, "Unknown tp ID %" PRIu64 ", skipping...\n", sample->id);
2046 		return;
2047 	}
2048 
2049 	trace__set_base_time(trace, evsel, sample);
2050 
2051 	if (evsel->attr.type == PERF_TYPE_TRACEPOINT &&
2052 	    sample->raw_data == NULL) {
2053 		fprintf(trace->output, "%s sample with no payload for tid: %d, cpu %d, raw_size=%d, skipping...\n",
2054 		       perf_evsel__name(evsel), sample->tid,
2055 		       sample->cpu, sample->raw_size);
2056 	} else {
2057 		tracepoint_handler handler = evsel->handler;
2058 		handler(trace, evsel, event, sample);
2059 	}
2060 }
2061 
2062 static int trace__add_syscall_newtp(struct trace *trace)
2063 {
2064 	int ret = -1;
2065 	struct perf_evlist *evlist = trace->evlist;
2066 	struct perf_evsel *sys_enter, *sys_exit;
2067 
2068 	sys_enter = perf_evsel__syscall_newtp("sys_enter", trace__sys_enter);
2069 	if (sys_enter == NULL)
2070 		goto out;
2071 
2072 	if (perf_evsel__init_sc_tp_ptr_field(sys_enter, args))
2073 		goto out_delete_sys_enter;
2074 
2075 	sys_exit = perf_evsel__syscall_newtp("sys_exit", trace__sys_exit);
2076 	if (sys_exit == NULL)
2077 		goto out_delete_sys_enter;
2078 
2079 	if (perf_evsel__init_sc_tp_uint_field(sys_exit, ret))
2080 		goto out_delete_sys_exit;
2081 
2082 	perf_evlist__add(evlist, sys_enter);
2083 	perf_evlist__add(evlist, sys_exit);
2084 
2085 	if (callchain_param.enabled && !trace->kernel_syscallchains) {
2086 		/*
2087 		 * We're interested only in the user space callchain
2088 		 * leading to the syscall, allow overriding that for
2089 		 * debugging reasons using --kernel_syscall_callchains
2090 		 */
2091 		sys_exit->attr.exclude_callchain_kernel = 1;
2092 	}
2093 
2094 	trace->syscalls.events.sys_enter = sys_enter;
2095 	trace->syscalls.events.sys_exit  = sys_exit;
2096 
2097 	ret = 0;
2098 out:
2099 	return ret;
2100 
2101 out_delete_sys_exit:
2102 	perf_evsel__delete_priv(sys_exit);
2103 out_delete_sys_enter:
2104 	perf_evsel__delete_priv(sys_enter);
2105 	goto out;
2106 }
2107 
2108 static int trace__set_ev_qualifier_filter(struct trace *trace)
2109 {
2110 	int err = -1;
2111 	struct perf_evsel *sys_exit;
2112 	char *filter = asprintf_expr_inout_ints("id", !trace->not_ev_qualifier,
2113 						trace->ev_qualifier_ids.nr,
2114 						trace->ev_qualifier_ids.entries);
2115 
2116 	if (filter == NULL)
2117 		goto out_enomem;
2118 
2119 	if (!perf_evsel__append_tp_filter(trace->syscalls.events.sys_enter,
2120 					  filter)) {
2121 		sys_exit = trace->syscalls.events.sys_exit;
2122 		err = perf_evsel__append_tp_filter(sys_exit, filter);
2123 	}
2124 
2125 	free(filter);
2126 out:
2127 	return err;
2128 out_enomem:
2129 	errno = ENOMEM;
2130 	goto out;
2131 }
2132 
2133 static int trace__run(struct trace *trace, int argc, const char **argv)
2134 {
2135 	struct perf_evlist *evlist = trace->evlist;
2136 	struct perf_evsel *evsel, *pgfault_maj = NULL, *pgfault_min = NULL;
2137 	int err = -1, i;
2138 	unsigned long before;
2139 	const bool forks = argc > 0;
2140 	bool draining = false;
2141 
2142 	trace->live = true;
2143 
2144 	if (trace->trace_syscalls && trace__add_syscall_newtp(trace))
2145 		goto out_error_raw_syscalls;
2146 
2147 	if (trace->trace_syscalls)
2148 		trace->vfs_getname = perf_evlist__add_vfs_getname(evlist);
2149 
2150 	if ((trace->trace_pgfaults & TRACE_PFMAJ)) {
2151 		pgfault_maj = perf_evsel__new_pgfault(PERF_COUNT_SW_PAGE_FAULTS_MAJ);
2152 		if (pgfault_maj == NULL)
2153 			goto out_error_mem;
2154 		perf_evlist__add(evlist, pgfault_maj);
2155 	}
2156 
2157 	if ((trace->trace_pgfaults & TRACE_PFMIN)) {
2158 		pgfault_min = perf_evsel__new_pgfault(PERF_COUNT_SW_PAGE_FAULTS_MIN);
2159 		if (pgfault_min == NULL)
2160 			goto out_error_mem;
2161 		perf_evlist__add(evlist, pgfault_min);
2162 	}
2163 
2164 	if (trace->sched &&
2165 	    perf_evlist__add_newtp(evlist, "sched", "sched_stat_runtime",
2166 				   trace__sched_stat_runtime))
2167 		goto out_error_sched_stat_runtime;
2168 
2169 	err = perf_evlist__create_maps(evlist, &trace->opts.target);
2170 	if (err < 0) {
2171 		fprintf(trace->output, "Problems parsing the target to trace, check your options!\n");
2172 		goto out_delete_evlist;
2173 	}
2174 
2175 	err = trace__symbols_init(trace, evlist);
2176 	if (err < 0) {
2177 		fprintf(trace->output, "Problems initializing symbol libraries!\n");
2178 		goto out_delete_evlist;
2179 	}
2180 
2181 	perf_evlist__config(evlist, &trace->opts, NULL);
2182 
2183 	if (callchain_param.enabled) {
2184 		bool use_identifier = false;
2185 
2186 		if (trace->syscalls.events.sys_exit) {
2187 			perf_evsel__config_callchain(trace->syscalls.events.sys_exit,
2188 						     &trace->opts, &callchain_param);
2189 			use_identifier = true;
2190 		}
2191 
2192 		if (pgfault_maj) {
2193 			perf_evsel__config_callchain(pgfault_maj, &trace->opts, &callchain_param);
2194 			use_identifier = true;
2195 		}
2196 
2197 		if (pgfault_min) {
2198 			perf_evsel__config_callchain(pgfault_min, &trace->opts, &callchain_param);
2199 			use_identifier = true;
2200 		}
2201 
2202 		if (use_identifier) {
2203 		       /*
2204 			* Now we have evsels with different sample_ids, use
2205 			* PERF_SAMPLE_IDENTIFIER to map from sample to evsel
2206 			* from a fixed position in each ring buffer record.
2207 			*
2208 			* As of this the changeset introducing this comment, this
2209 			* isn't strictly needed, as the fields that can come before
2210 			* PERF_SAMPLE_ID are all used, but we'll probably disable
2211 			* some of those for things like copying the payload of
2212 			* pointer syscall arguments, and for vfs_getname we don't
2213 			* need PERF_SAMPLE_ADDR and PERF_SAMPLE_IP, so do this
2214 			* here as a warning we need to use PERF_SAMPLE_IDENTIFIER.
2215 			*/
2216 			perf_evlist__set_sample_bit(evlist, IDENTIFIER);
2217 			perf_evlist__reset_sample_bit(evlist, ID);
2218 		}
2219 	}
2220 
2221 	signal(SIGCHLD, sig_handler);
2222 	signal(SIGINT, sig_handler);
2223 
2224 	if (forks) {
2225 		err = perf_evlist__prepare_workload(evlist, &trace->opts.target,
2226 						    argv, false, NULL);
2227 		if (err < 0) {
2228 			fprintf(trace->output, "Couldn't run the workload!\n");
2229 			goto out_delete_evlist;
2230 		}
2231 	}
2232 
2233 	err = perf_evlist__open(evlist);
2234 	if (err < 0)
2235 		goto out_error_open;
2236 
2237 	err = bpf__apply_obj_config();
2238 	if (err) {
2239 		char errbuf[BUFSIZ];
2240 
2241 		bpf__strerror_apply_obj_config(err, errbuf, sizeof(errbuf));
2242 		pr_err("ERROR: Apply config to BPF failed: %s\n",
2243 			 errbuf);
2244 		goto out_error_open;
2245 	}
2246 
2247 	/*
2248 	 * Better not use !target__has_task() here because we need to cover the
2249 	 * case where no threads were specified in the command line, but a
2250 	 * workload was, and in that case we will fill in the thread_map when
2251 	 * we fork the workload in perf_evlist__prepare_workload.
2252 	 */
2253 	if (trace->filter_pids.nr > 0)
2254 		err = perf_evlist__set_filter_pids(evlist, trace->filter_pids.nr, trace->filter_pids.entries);
2255 	else if (thread_map__pid(evlist->threads, 0) == -1)
2256 		err = perf_evlist__set_filter_pid(evlist, getpid());
2257 
2258 	if (err < 0)
2259 		goto out_error_mem;
2260 
2261 	if (trace->ev_qualifier_ids.nr > 0) {
2262 		err = trace__set_ev_qualifier_filter(trace);
2263 		if (err < 0)
2264 			goto out_errno;
2265 
2266 		pr_debug("event qualifier tracepoint filter: %s\n",
2267 			 trace->syscalls.events.sys_exit->filter);
2268 	}
2269 
2270 	err = perf_evlist__apply_filters(evlist, &evsel);
2271 	if (err < 0)
2272 		goto out_error_apply_filters;
2273 
2274 	err = perf_evlist__mmap(evlist, trace->opts.mmap_pages, false);
2275 	if (err < 0)
2276 		goto out_error_mmap;
2277 
2278 	if (!target__none(&trace->opts.target) && !trace->opts.initial_delay)
2279 		perf_evlist__enable(evlist);
2280 
2281 	if (forks)
2282 		perf_evlist__start_workload(evlist);
2283 
2284 	if (trace->opts.initial_delay) {
2285 		usleep(trace->opts.initial_delay * 1000);
2286 		perf_evlist__enable(evlist);
2287 	}
2288 
2289 	trace->multiple_threads = thread_map__pid(evlist->threads, 0) == -1 ||
2290 				  evlist->threads->nr > 1 ||
2291 				  perf_evlist__first(evlist)->attr.inherit;
2292 again:
2293 	before = trace->nr_events;
2294 
2295 	for (i = 0; i < evlist->nr_mmaps; i++) {
2296 		union perf_event *event;
2297 
2298 		while ((event = perf_evlist__mmap_read(evlist, i)) != NULL) {
2299 			struct perf_sample sample;
2300 
2301 			++trace->nr_events;
2302 
2303 			err = perf_evlist__parse_sample(evlist, event, &sample);
2304 			if (err) {
2305 				fprintf(trace->output, "Can't parse sample, err = %d, skipping...\n", err);
2306 				goto next_event;
2307 			}
2308 
2309 			trace__handle_event(trace, event, &sample);
2310 next_event:
2311 			perf_evlist__mmap_consume(evlist, i);
2312 
2313 			if (interrupted)
2314 				goto out_disable;
2315 
2316 			if (done && !draining) {
2317 				perf_evlist__disable(evlist);
2318 				draining = true;
2319 			}
2320 		}
2321 	}
2322 
2323 	if (trace->nr_events == before) {
2324 		int timeout = done ? 100 : -1;
2325 
2326 		if (!draining && perf_evlist__poll(evlist, timeout) > 0) {
2327 			if (perf_evlist__filter_pollfd(evlist, POLLERR | POLLHUP) == 0)
2328 				draining = true;
2329 
2330 			goto again;
2331 		}
2332 	} else {
2333 		goto again;
2334 	}
2335 
2336 out_disable:
2337 	thread__zput(trace->current);
2338 
2339 	perf_evlist__disable(evlist);
2340 
2341 	if (!err) {
2342 		if (trace->summary)
2343 			trace__fprintf_thread_summary(trace, trace->output);
2344 
2345 		if (trace->show_tool_stats) {
2346 			fprintf(trace->output, "Stats:\n "
2347 					       " vfs_getname : %" PRIu64 "\n"
2348 					       " proc_getname: %" PRIu64 "\n",
2349 				trace->stats.vfs_getname,
2350 				trace->stats.proc_getname);
2351 		}
2352 	}
2353 
2354 out_delete_evlist:
2355 	perf_evlist__delete(evlist);
2356 	trace->evlist = NULL;
2357 	trace->live = false;
2358 	return err;
2359 {
2360 	char errbuf[BUFSIZ];
2361 
2362 out_error_sched_stat_runtime:
2363 	tracing_path__strerror_open_tp(errno, errbuf, sizeof(errbuf), "sched", "sched_stat_runtime");
2364 	goto out_error;
2365 
2366 out_error_raw_syscalls:
2367 	tracing_path__strerror_open_tp(errno, errbuf, sizeof(errbuf), "raw_syscalls", "sys_(enter|exit)");
2368 	goto out_error;
2369 
2370 out_error_mmap:
2371 	perf_evlist__strerror_mmap(evlist, errno, errbuf, sizeof(errbuf));
2372 	goto out_error;
2373 
2374 out_error_open:
2375 	perf_evlist__strerror_open(evlist, errno, errbuf, sizeof(errbuf));
2376 
2377 out_error:
2378 	fprintf(trace->output, "%s\n", errbuf);
2379 	goto out_delete_evlist;
2380 
2381 out_error_apply_filters:
2382 	fprintf(trace->output,
2383 		"Failed to set filter \"%s\" on event %s with %d (%s)\n",
2384 		evsel->filter, perf_evsel__name(evsel), errno,
2385 		str_error_r(errno, errbuf, sizeof(errbuf)));
2386 	goto out_delete_evlist;
2387 }
2388 out_error_mem:
2389 	fprintf(trace->output, "Not enough memory to run!\n");
2390 	goto out_delete_evlist;
2391 
2392 out_errno:
2393 	fprintf(trace->output, "errno=%d,%s\n", errno, strerror(errno));
2394 	goto out_delete_evlist;
2395 }
2396 
2397 static int trace__replay(struct trace *trace)
2398 {
2399 	const struct perf_evsel_str_handler handlers[] = {
2400 		{ "probe:vfs_getname",	     trace__vfs_getname, },
2401 	};
2402 	struct perf_data_file file = {
2403 		.path  = input_name,
2404 		.mode  = PERF_DATA_MODE_READ,
2405 		.force = trace->force,
2406 	};
2407 	struct perf_session *session;
2408 	struct perf_evsel *evsel;
2409 	int err = -1;
2410 
2411 	trace->tool.sample	  = trace__process_sample;
2412 	trace->tool.mmap	  = perf_event__process_mmap;
2413 	trace->tool.mmap2	  = perf_event__process_mmap2;
2414 	trace->tool.comm	  = perf_event__process_comm;
2415 	trace->tool.exit	  = perf_event__process_exit;
2416 	trace->tool.fork	  = perf_event__process_fork;
2417 	trace->tool.attr	  = perf_event__process_attr;
2418 	trace->tool.tracing_data = perf_event__process_tracing_data;
2419 	trace->tool.build_id	  = perf_event__process_build_id;
2420 
2421 	trace->tool.ordered_events = true;
2422 	trace->tool.ordering_requires_timestamps = true;
2423 
2424 	/* add tid to output */
2425 	trace->multiple_threads = true;
2426 
2427 	session = perf_session__new(&file, false, &trace->tool);
2428 	if (session == NULL)
2429 		return -1;
2430 
2431 	if (trace->opts.target.pid)
2432 		symbol_conf.pid_list_str = strdup(trace->opts.target.pid);
2433 
2434 	if (trace->opts.target.tid)
2435 		symbol_conf.tid_list_str = strdup(trace->opts.target.tid);
2436 
2437 	if (symbol__init(&session->header.env) < 0)
2438 		goto out;
2439 
2440 	trace->host = &session->machines.host;
2441 
2442 	err = perf_session__set_tracepoints_handlers(session, handlers);
2443 	if (err)
2444 		goto out;
2445 
2446 	evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
2447 						     "raw_syscalls:sys_enter");
2448 	/* older kernels have syscalls tp versus raw_syscalls */
2449 	if (evsel == NULL)
2450 		evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
2451 							     "syscalls:sys_enter");
2452 
2453 	if (evsel &&
2454 	    (perf_evsel__init_syscall_tp(evsel, trace__sys_enter) < 0 ||
2455 	    perf_evsel__init_sc_tp_ptr_field(evsel, args))) {
2456 		pr_err("Error during initialize raw_syscalls:sys_enter event\n");
2457 		goto out;
2458 	}
2459 
2460 	evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
2461 						     "raw_syscalls:sys_exit");
2462 	if (evsel == NULL)
2463 		evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
2464 							     "syscalls:sys_exit");
2465 	if (evsel &&
2466 	    (perf_evsel__init_syscall_tp(evsel, trace__sys_exit) < 0 ||
2467 	    perf_evsel__init_sc_tp_uint_field(evsel, ret))) {
2468 		pr_err("Error during initialize raw_syscalls:sys_exit event\n");
2469 		goto out;
2470 	}
2471 
2472 	evlist__for_each_entry(session->evlist, evsel) {
2473 		if (evsel->attr.type == PERF_TYPE_SOFTWARE &&
2474 		    (evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ ||
2475 		     evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MIN ||
2476 		     evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS))
2477 			evsel->handler = trace__pgfault;
2478 	}
2479 
2480 	setup_pager();
2481 
2482 	err = perf_session__process_events(session);
2483 	if (err)
2484 		pr_err("Failed to process events, error %d", err);
2485 
2486 	else if (trace->summary)
2487 		trace__fprintf_thread_summary(trace, trace->output);
2488 
2489 out:
2490 	perf_session__delete(session);
2491 
2492 	return err;
2493 }
2494 
/*
 * Print the heading of the per-thread summary to @fp.
 *
 * Returns the value reported by fprintf(), converted to size_t.
 */
static size_t trace__fprintf_threads_header(FILE *fp)
{
	return fprintf(fp, "\n Summary of events:\n\n");
}
2503 
2504 DEFINE_RESORT_RB(syscall_stats, a->msecs > b->msecs,
2505 	struct stats 	*stats;
2506 	double		msecs;
2507 	int		syscall;
2508 )
2509 {
2510 	struct int_node *source = rb_entry(nd, struct int_node, rb_node);
2511 	struct stats *stats = source->priv;
2512 
2513 	entry->syscall = source->i;
2514 	entry->stats   = stats;
2515 	entry->msecs   = stats ? (u64)stats->n * (avg_stats(stats) / NSEC_PER_MSEC) : 0;
2516 }
2517 
2518 static size_t thread__dump_stats(struct thread_trace *ttrace,
2519 				 struct trace *trace, FILE *fp)
2520 {
2521 	size_t printed = 0;
2522 	struct syscall *sc;
2523 	struct rb_node *nd;
2524 	DECLARE_RESORT_RB_INTLIST(syscall_stats, ttrace->syscall_stats);
2525 
2526 	if (syscall_stats == NULL)
2527 		return 0;
2528 
2529 	printed += fprintf(fp, "\n");
2530 
2531 	printed += fprintf(fp, "   syscall            calls    total       min       avg       max      stddev\n");
2532 	printed += fprintf(fp, "                               (msec)    (msec)    (msec)    (msec)        (%%)\n");
2533 	printed += fprintf(fp, "   --------------- -------- --------- --------- --------- ---------     ------\n");
2534 
2535 	resort_rb__for_each_entry(nd, syscall_stats) {
2536 		struct stats *stats = syscall_stats_entry->stats;
2537 		if (stats) {
2538 			double min = (double)(stats->min) / NSEC_PER_MSEC;
2539 			double max = (double)(stats->max) / NSEC_PER_MSEC;
2540 			double avg = avg_stats(stats);
2541 			double pct;
2542 			u64 n = (u64) stats->n;
2543 
2544 			pct = avg ? 100.0 * stddev_stats(stats)/avg : 0.0;
2545 			avg /= NSEC_PER_MSEC;
2546 
2547 			sc = &trace->syscalls.table[syscall_stats_entry->syscall];
2548 			printed += fprintf(fp, "   %-15s", sc->name);
2549 			printed += fprintf(fp, " %8" PRIu64 " %9.3f %9.3f %9.3f",
2550 					   n, syscall_stats_entry->msecs, min, avg);
2551 			printed += fprintf(fp, " %9.3f %9.2f%%\n", max, pct);
2552 		}
2553 	}
2554 
2555 	resort_rb__delete(syscall_stats);
2556 	printed += fprintf(fp, "\n\n");
2557 
2558 	return printed;
2559 }
2560 
2561 static size_t trace__fprintf_thread(FILE *fp, struct thread *thread, struct trace *trace)
2562 {
2563 	size_t printed = 0;
2564 	struct thread_trace *ttrace = thread__priv(thread);
2565 	double ratio;
2566 
2567 	if (ttrace == NULL)
2568 		return 0;
2569 
2570 	ratio = (double)ttrace->nr_events / trace->nr_events * 100.0;
2571 
2572 	printed += fprintf(fp, " %s (%d), ", thread__comm_str(thread), thread->tid);
2573 	printed += fprintf(fp, "%lu events, ", ttrace->nr_events);
2574 	printed += fprintf(fp, "%.1f%%", ratio);
2575 	if (ttrace->pfmaj)
2576 		printed += fprintf(fp, ", %lu majfaults", ttrace->pfmaj);
2577 	if (ttrace->pfmin)
2578 		printed += fprintf(fp, ", %lu minfaults", ttrace->pfmin);
2579 	if (trace->sched)
2580 		printed += fprintf(fp, ", %.3f msec\n", ttrace->runtime_ms);
2581 	else if (fputc('\n', fp) != EOF)
2582 		++printed;
2583 
2584 	printed += thread__dump_stats(ttrace, trace, fp);
2585 
2586 	return printed;
2587 }
2588 
2589 static unsigned long thread__nr_events(struct thread_trace *ttrace)
2590 {
2591 	return ttrace ? ttrace->nr_events : 0;
2592 }
2593 
2594 DEFINE_RESORT_RB(threads, (thread__nr_events(a->thread->priv) < thread__nr_events(b->thread->priv)),
2595 	struct thread *thread;
2596 )
2597 {
2598 	entry->thread = rb_entry(nd, struct thread, rb_node);
2599 }
2600 
2601 static size_t trace__fprintf_thread_summary(struct trace *trace, FILE *fp)
2602 {
2603 	DECLARE_RESORT_RB_MACHINE_THREADS(threads, trace->host);
2604 	size_t printed = trace__fprintf_threads_header(fp);
2605 	struct rb_node *nd;
2606 
2607 	if (threads == NULL) {
2608 		fprintf(fp, "%s", "Error sorting output by nr_events!\n");
2609 		return 0;
2610 	}
2611 
2612 	resort_rb__for_each_entry(nd, threads)
2613 		printed += trace__fprintf_thread(fp, threads_entry->thread, trace);
2614 
2615 	resort_rb__delete(threads);
2616 
2617 	return printed;
2618 }
2619 
2620 static int trace__set_duration(const struct option *opt, const char *str,
2621 			       int unset __maybe_unused)
2622 {
2623 	struct trace *trace = opt->value;
2624 
2625 	trace->duration_filter = atof(str);
2626 	return 0;
2627 }
2628 
2629 static int trace__set_filter_pids(const struct option *opt, const char *str,
2630 				  int unset __maybe_unused)
2631 {
2632 	int ret = -1;
2633 	size_t i;
2634 	struct trace *trace = opt->value;
2635 	/*
2636 	 * FIXME: introduce a intarray class, plain parse csv and create a
2637 	 * { int nr, int entries[] } struct...
2638 	 */
2639 	struct intlist *list = intlist__new(str);
2640 
2641 	if (list == NULL)
2642 		return -1;
2643 
2644 	i = trace->filter_pids.nr = intlist__nr_entries(list) + 1;
2645 	trace->filter_pids.entries = calloc(i, sizeof(pid_t));
2646 
2647 	if (trace->filter_pids.entries == NULL)
2648 		goto out;
2649 
2650 	trace->filter_pids.entries[0] = getpid();
2651 
2652 	for (i = 1; i < trace->filter_pids.nr; ++i)
2653 		trace->filter_pids.entries[i] = intlist__entry(list, i - 1)->i;
2654 
2655 	intlist__delete(list);
2656 	ret = 0;
2657 out:
2658 	return ret;
2659 }
2660 
2661 static int trace__open_output(struct trace *trace, const char *filename)
2662 {
2663 	struct stat st;
2664 
2665 	if (!stat(filename, &st) && st.st_size) {
2666 		char oldname[PATH_MAX];
2667 
2668 		scnprintf(oldname, sizeof(oldname), "%s.old", filename);
2669 		unlink(oldname);
2670 		rename(filename, oldname);
2671 	}
2672 
2673 	trace->output = fopen(filename, "w");
2674 
2675 	return trace->output == NULL ? -errno : 0;
2676 }
2677 
2678 static int parse_pagefaults(const struct option *opt, const char *str,
2679 			    int unset __maybe_unused)
2680 {
2681 	int *trace_pgfaults = opt->value;
2682 
2683 	if (strcmp(str, "all") == 0)
2684 		*trace_pgfaults |= TRACE_PFMAJ | TRACE_PFMIN;
2685 	else if (strcmp(str, "maj") == 0)
2686 		*trace_pgfaults |= TRACE_PFMAJ;
2687 	else if (strcmp(str, "min") == 0)
2688 		*trace_pgfaults |= TRACE_PFMIN;
2689 	else
2690 		return -1;
2691 
2692 	return 0;
2693 }
2694 
2695 static void evlist__set_evsel_handler(struct perf_evlist *evlist, void *handler)
2696 {
2697 	struct perf_evsel *evsel;
2698 
2699 	evlist__for_each_entry(evlist, evsel)
2700 		evsel->handler = handler;
2701 }
2702 
2703 /*
2704  * XXX: Hackish, just splitting the combined -e+--event (syscalls
2705  * (raw_syscalls:{sys_{enter,exit}} + events (tracepoints, HW, SW, etc) to use
2706  * existing facilities unchanged (trace->ev_qualifier + parse_options()).
2707  *
2708  * It'd be better to introduce a parse_options() variant that would return a
2709  * list with the terms it didn't match to an event...
2710  */
2711 static int trace__parse_events_option(const struct option *opt, const char *str,
2712 				      int unset __maybe_unused)
2713 {
2714 	struct trace *trace = (struct trace *)opt->value;
2715 	const char *s = str;
2716 	char *sep = NULL, *lists[2] = { NULL, NULL, };
2717 	int len = strlen(str), err = -1, list;
2718 	char *strace_groups_dir = system_path(STRACE_GROUPS_DIR);
2719 	char group_name[PATH_MAX];
2720 
2721 	if (strace_groups_dir == NULL)
2722 		return -1;
2723 
2724 	if (*s == '!') {
2725 		++s;
2726 		trace->not_ev_qualifier = true;
2727 	}
2728 
2729 	while (1) {
2730 		if ((sep = strchr(s, ',')) != NULL)
2731 			*sep = '\0';
2732 
2733 		list = 0;
2734 		if (syscalltbl__id(trace->sctbl, s) >= 0) {
2735 			list = 1;
2736 		} else {
2737 			path__join(group_name, sizeof(group_name), strace_groups_dir, s);
2738 			if (access(group_name, R_OK) == 0)
2739 				list = 1;
2740 		}
2741 
2742 		if (lists[list]) {
2743 			sprintf(lists[list] + strlen(lists[list]), ",%s", s);
2744 		} else {
2745 			lists[list] = malloc(len);
2746 			if (lists[list] == NULL)
2747 				goto out;
2748 			strcpy(lists[list], s);
2749 		}
2750 
2751 		if (!sep)
2752 			break;
2753 
2754 		*sep = ',';
2755 		s = sep + 1;
2756 	}
2757 
2758 	if (lists[1] != NULL) {
2759 		struct strlist_config slist_config = {
2760 			.dirname = strace_groups_dir,
2761 		};
2762 
2763 		trace->ev_qualifier = strlist__new(lists[1], &slist_config);
2764 		if (trace->ev_qualifier == NULL) {
2765 			fputs("Not enough memory to parse event qualifier", trace->output);
2766 			goto out;
2767 		}
2768 
2769 		if (trace__validate_ev_qualifier(trace))
2770 			goto out;
2771 	}
2772 
2773 	err = 0;
2774 
2775 	if (lists[0]) {
2776 		struct option o = OPT_CALLBACK('e', "event", &trace->evlist, "event",
2777 					       "event selector. use 'perf list' to list available events",
2778 					       parse_events_option);
2779 		err = parse_events_option(&o, lists[0], 0);
2780 	}
2781 out:
2782 	if (sep)
2783 		*sep = ',';
2784 
2785 	return err;
2786 }
2787 
2788 int cmd_trace(int argc, const char **argv, const char *prefix __maybe_unused)
2789 {
2790 	const char *trace_usage[] = {
2791 		"perf trace [<options>] [<command>]",
2792 		"perf trace [<options>] -- <command> [<options>]",
2793 		"perf trace record [<options>] [<command>]",
2794 		"perf trace record [<options>] -- <command> [<options>]",
2795 		NULL
2796 	};
2797 	struct trace trace = {
2798 		.syscalls = {
2799 			. max = -1,
2800 		},
2801 		.opts = {
2802 			.target = {
2803 				.uid	   = UINT_MAX,
2804 				.uses_mmap = true,
2805 			},
2806 			.user_freq     = UINT_MAX,
2807 			.user_interval = ULLONG_MAX,
2808 			.no_buffering  = true,
2809 			.mmap_pages    = UINT_MAX,
2810 			.proc_map_timeout  = 500,
2811 		},
2812 		.output = stderr,
2813 		.show_comm = true,
2814 		.trace_syscalls = true,
2815 		.kernel_syscallchains = false,
2816 		.max_stack = UINT_MAX,
2817 	};
2818 	const char *output_name = NULL;
2819 	const struct option trace_options[] = {
2820 	OPT_CALLBACK('e', "event", &trace, "event",
2821 		     "event/syscall selector. use 'perf list' to list available events",
2822 		     trace__parse_events_option),
2823 	OPT_BOOLEAN(0, "comm", &trace.show_comm,
2824 		    "show the thread COMM next to its id"),
2825 	OPT_BOOLEAN(0, "tool_stats", &trace.show_tool_stats, "show tool stats"),
2826 	OPT_CALLBACK(0, "expr", &trace, "expr", "list of syscalls/events to trace",
2827 		     trace__parse_events_option),
2828 	OPT_STRING('o', "output", &output_name, "file", "output file name"),
2829 	OPT_STRING('i', "input", &input_name, "file", "Analyze events in file"),
2830 	OPT_STRING('p', "pid", &trace.opts.target.pid, "pid",
2831 		    "trace events on existing process id"),
2832 	OPT_STRING('t', "tid", &trace.opts.target.tid, "tid",
2833 		    "trace events on existing thread id"),
2834 	OPT_CALLBACK(0, "filter-pids", &trace, "CSV list of pids",
2835 		     "pids to filter (by the kernel)", trace__set_filter_pids),
2836 	OPT_BOOLEAN('a', "all-cpus", &trace.opts.target.system_wide,
2837 		    "system-wide collection from all CPUs"),
2838 	OPT_STRING('C', "cpu", &trace.opts.target.cpu_list, "cpu",
2839 		    "list of cpus to monitor"),
2840 	OPT_BOOLEAN(0, "no-inherit", &trace.opts.no_inherit,
2841 		    "child tasks do not inherit counters"),
2842 	OPT_CALLBACK('m', "mmap-pages", &trace.opts.mmap_pages, "pages",
2843 		     "number of mmap data pages",
2844 		     perf_evlist__parse_mmap_pages),
2845 	OPT_STRING('u', "uid", &trace.opts.target.uid_str, "user",
2846 		   "user to profile"),
2847 	OPT_CALLBACK(0, "duration", &trace, "float",
2848 		     "show only events with duration > N.M ms",
2849 		     trace__set_duration),
2850 	OPT_BOOLEAN(0, "sched", &trace.sched, "show blocking scheduler events"),
2851 	OPT_INCR('v', "verbose", &verbose, "be more verbose"),
2852 	OPT_BOOLEAN('T', "time", &trace.full_time,
2853 		    "Show full timestamp, not time relative to first start"),
2854 	OPT_BOOLEAN('s', "summary", &trace.summary_only,
2855 		    "Show only syscall summary with statistics"),
2856 	OPT_BOOLEAN('S', "with-summary", &trace.summary,
2857 		    "Show all syscalls and summary with statistics"),
2858 	OPT_CALLBACK_DEFAULT('F', "pf", &trace.trace_pgfaults, "all|maj|min",
2859 		     "Trace pagefaults", parse_pagefaults, "maj"),
2860 	OPT_BOOLEAN(0, "syscalls", &trace.trace_syscalls, "Trace syscalls"),
2861 	OPT_BOOLEAN('f', "force", &trace.force, "don't complain, do it"),
2862 	OPT_CALLBACK(0, "call-graph", &trace.opts,
2863 		     "record_mode[,record_size]", record_callchain_help,
2864 		     &record_parse_callchain_opt),
2865 	OPT_BOOLEAN(0, "kernel-syscall-graph", &trace.kernel_syscallchains,
2866 		    "Show the kernel callchains on the syscall exit path"),
2867 	OPT_UINTEGER(0, "min-stack", &trace.min_stack,
2868 		     "Set the minimum stack depth when parsing the callchain, "
2869 		     "anything below the specified depth will be ignored."),
2870 	OPT_UINTEGER(0, "max-stack", &trace.max_stack,
2871 		     "Set the maximum stack depth when parsing the callchain, "
2872 		     "anything beyond the specified depth will be ignored. "
2873 		     "Default: kernel.perf_event_max_stack or " __stringify(PERF_MAX_STACK_DEPTH)),
2874 	OPT_UINTEGER(0, "proc-map-timeout", &trace.opts.proc_map_timeout,
2875 			"per thread proc mmap processing timeout in ms"),
2876 	OPT_UINTEGER('D', "delay", &trace.opts.initial_delay,
2877 		     "ms to wait before starting measurement after program "
2878 		     "start"),
2879 	OPT_END()
2880 	};
2881 	bool __maybe_unused max_stack_user_set = true;
2882 	bool mmap_pages_user_set = true;
2883 	const char * const trace_subcommands[] = { "record", NULL };
2884 	int err;
2885 	char bf[BUFSIZ];
2886 
2887 	signal(SIGSEGV, sighandler_dump_stack);
2888 	signal(SIGFPE, sighandler_dump_stack);
2889 
2890 	trace.evlist = perf_evlist__new();
2891 	trace.sctbl = syscalltbl__new();
2892 
2893 	if (trace.evlist == NULL || trace.sctbl == NULL) {
2894 		pr_err("Not enough memory to run!\n");
2895 		err = -ENOMEM;
2896 		goto out;
2897 	}
2898 
2899 	argc = parse_options_subcommand(argc, argv, trace_options, trace_subcommands,
2900 				 trace_usage, PARSE_OPT_STOP_AT_NON_OPTION);
2901 
2902 	err = bpf__setup_stdout(trace.evlist);
2903 	if (err) {
2904 		bpf__strerror_setup_stdout(trace.evlist, err, bf, sizeof(bf));
2905 		pr_err("ERROR: Setup BPF stdout failed: %s\n", bf);
2906 		goto out;
2907 	}
2908 
2909 	err = -1;
2910 
2911 	if (trace.trace_pgfaults) {
2912 		trace.opts.sample_address = true;
2913 		trace.opts.sample_time = true;
2914 	}
2915 
2916 	if (trace.opts.mmap_pages == UINT_MAX)
2917 		mmap_pages_user_set = false;
2918 
2919 	if (trace.max_stack == UINT_MAX) {
2920 		trace.max_stack = input_name ? PERF_MAX_STACK_DEPTH : sysctl_perf_event_max_stack;
2921 		max_stack_user_set = false;
2922 	}
2923 
2924 #ifdef HAVE_DWARF_UNWIND_SUPPORT
2925 	if ((trace.min_stack || max_stack_user_set) && !callchain_param.enabled && trace.trace_syscalls)
2926 		record_opts__parse_callchain(&trace.opts, &callchain_param, "dwarf", false);
2927 #endif
2928 
2929 	if (callchain_param.enabled) {
2930 		if (!mmap_pages_user_set && geteuid() == 0)
2931 			trace.opts.mmap_pages = perf_event_mlock_kb_in_pages() * 4;
2932 
2933 		symbol_conf.use_callchain = true;
2934 	}
2935 
2936 	if (trace.evlist->nr_entries > 0)
2937 		evlist__set_evsel_handler(trace.evlist, trace__event_handler);
2938 
2939 	if ((argc >= 1) && (strcmp(argv[0], "record") == 0))
2940 		return trace__record(&trace, argc-1, &argv[1]);
2941 
2942 	/* summary_only implies summary option, but don't overwrite summary if set */
2943 	if (trace.summary_only)
2944 		trace.summary = trace.summary_only;
2945 
2946 	if (!trace.trace_syscalls && !trace.trace_pgfaults &&
2947 	    trace.evlist->nr_entries == 0 /* Was --events used? */) {
2948 		pr_err("Please specify something to trace.\n");
2949 		return -1;
2950 	}
2951 
2952 	if (!trace.trace_syscalls && trace.ev_qualifier) {
2953 		pr_err("The -e option can't be used with --no-syscalls.\n");
2954 		goto out;
2955 	}
2956 
2957 	if (output_name != NULL) {
2958 		err = trace__open_output(&trace, output_name);
2959 		if (err < 0) {
2960 			perror("failed to create output file");
2961 			goto out;
2962 		}
2963 	}
2964 
2965 	trace.open_id = syscalltbl__id(trace.sctbl, "open");
2966 
2967 	err = target__validate(&trace.opts.target);
2968 	if (err) {
2969 		target__strerror(&trace.opts.target, err, bf, sizeof(bf));
2970 		fprintf(trace.output, "%s", bf);
2971 		goto out_close;
2972 	}
2973 
2974 	err = target__parse_uid(&trace.opts.target);
2975 	if (err) {
2976 		target__strerror(&trace.opts.target, err, bf, sizeof(bf));
2977 		fprintf(trace.output, "%s", bf);
2978 		goto out_close;
2979 	}
2980 
2981 	if (!argc && target__none(&trace.opts.target))
2982 		trace.opts.target.system_wide = true;
2983 
2984 	if (input_name)
2985 		err = trace__replay(&trace);
2986 	else
2987 		err = trace__run(&trace, argc, argv);
2988 
2989 out_close:
2990 	if (output_name != NULL)
2991 		fclose(trace.output);
2992 out:
2993 	return err;
2994 }
2995