xref: /openbmc/linux/tools/perf/builtin-trace.c (revision f3a8b664)
1 /*
2  * builtin-trace.c
3  *
4  * Builtin 'trace' command:
5  *
6  * Display a continuously updated trace of any workload, CPU, specific PID,
7  * system wide, etc.  Default format is loosely strace like, but any other
8  * event may be specified using --event.
9  *
10  * Copyright (C) 2012, 2013, 2014, 2015 Red Hat Inc, Arnaldo Carvalho de Melo <acme@redhat.com>
11  *
12  * Initially based on the 'trace' prototype by Thomas Gleixner:
13  *
14  * http://lwn.net/Articles/415728/ ("Announcing a new utility: 'trace'")
15  *
16  * Released under the GPL v2. (and only v2, not any later version)
17  */
18 
19 #include <traceevent/event-parse.h>
20 #include <api/fs/tracing_path.h>
21 #include "builtin.h"
22 #include "util/color.h"
23 #include "util/debug.h"
24 #include "util/evlist.h"
25 #include <subcmd/exec-cmd.h>
26 #include "util/machine.h"
27 #include "util/session.h"
28 #include "util/thread.h"
29 #include <subcmd/parse-options.h>
30 #include "util/strlist.h"
31 #include "util/intlist.h"
32 #include "util/thread_map.h"
33 #include "util/stat.h"
34 #include "trace-event.h"
35 #include "util/parse-events.h"
36 #include "util/bpf-loader.h"
37 #include "callchain.h"
38 #include "syscalltbl.h"
39 #include "rb_resort.h"
40 
41 #include <libaudit.h> /* FIXME: Still needed for audit_errno_to_name */
42 #include <stdlib.h>
43 #include <linux/err.h>
44 #include <linux/filter.h>
45 #include <linux/audit.h>
46 #include <linux/random.h>
47 #include <linux/stringify.h>
48 #include <linux/time64.h>
49 
#ifndef O_CLOEXEC
/* Fallback for old libc headers that don't define O_CLOEXEC. */
# define O_CLOEXEC		02000000
#endif
53 
/*
 * Global state for one 'perf trace' session: the tool callbacks, the
 * per-syscall descriptor table, qualifier/filter lists, accounting counters
 * and the assorted output/behavior knobs set from the command line.
 */
struct trace {
	struct perf_tool	tool;
	struct syscalltbl	*sctbl;
	struct {
		int		max;	/* highest syscall id in 'table' */
		struct syscall  *table;	/* indexed by syscall id */
		struct {
			/* the raw_syscalls (or syscalls) enter/exit tracepoints */
			struct perf_evsel *sys_enter,
					  *sys_exit;
		}		events;
	} syscalls;
	struct record_opts	opts;
	struct perf_evlist	*evlist;
	struct machine		*host;
	struct thread		*current;	/* thread that emitted the last event */
	u64			base_time;	/* timestamp of the first sample */
	FILE			*output;
	unsigned long		nr_events;
	struct strlist		*ev_qualifier;	/* syscalls requested with -e */
	struct {
		size_t		nr;
		int		*entries;	/* syscall ids for ev_qualifier */
	}			ev_qualifier_ids;
	struct intlist		*tid_list;
	struct intlist		*pid_list;
	struct {
		size_t		nr;
		pid_t		*entries;	/* pids to drop from the output */
	}			filter_pids;
	double			duration_filter;	/* --duration, in ms */
	double			runtime_ms;
	struct {
		/* tool self-accounting for --stat-like summary */
		u64		vfs_getname,
				proc_getname;
	} stats;
	unsigned int		max_stack;
	unsigned int		min_stack;
	bool			not_ev_qualifier;	/* -e was negated ("!...") */
	bool			live;
	bool			full_time;
	bool			sched;
	bool			multiple_threads;
	bool			summary;
	bool			summary_only;
	bool			show_comm;
	bool			show_tool_stats;
	bool			trace_syscalls;
	bool			kernel_syscallchains;
	bool			force;
	bool			vfs_getname;
	int			trace_pgfaults;
	int			open_id;	/* syscall id of open(), for vfs_getname matching */
};
107 
/*
 * Accessor for one tracepoint payload field: 'offset' locates it inside
 * sample->raw_data and exactly one of the union members fetches it, either
 * as an integer or as a pointer into the raw payload.
 */
struct tp_field {
	int offset;
	union {
		u64 (*integer)(struct tp_field *field, struct perf_sample *sample);
		void *(*pointer)(struct tp_field *field, struct perf_sample *sample);
	};
};
115 
/*
 * TP_UINT_FIELD(bits) expands to tp_field__u<bits>(): read an unsigned
 * <bits>-bit value from the sample's raw payload at the field's offset.
 * The value is fetched with memcpy rather than a pointer cast, so payload
 * offsets that are not naturally aligned are handled safely.
 */
#define TP_UINT_FIELD(bits) \
static u64 tp_field__u##bits(struct tp_field *field, struct perf_sample *sample) \
{ \
	u##bits value; \
	memcpy(&value, sample->raw_data + field->offset, sizeof(value)); \
	return value;  \
}

TP_UINT_FIELD(8);
TP_UINT_FIELD(16);
TP_UINT_FIELD(32);
TP_UINT_FIELD(64);
128 
/*
 * Byte-swapping variants of the above, selected by tp_field__init_uint()
 * when the recorded data has the opposite endianness (evsel->needs_swap).
 * No 8-bit variant: a single byte has no endianness.
 */
#define TP_UINT_FIELD__SWAPPED(bits) \
static u64 tp_field__swapped_u##bits(struct tp_field *field, struct perf_sample *sample) \
{ \
	u##bits value; \
	memcpy(&value, sample->raw_data + field->offset, sizeof(value)); \
	return bswap_##bits(value);\
}

TP_UINT_FIELD__SWAPPED(16);
TP_UINT_FIELD__SWAPPED(32);
TP_UINT_FIELD__SWAPPED(64);
140 
141 static int tp_field__init_uint(struct tp_field *field,
142 			       struct format_field *format_field,
143 			       bool needs_swap)
144 {
145 	field->offset = format_field->offset;
146 
147 	switch (format_field->size) {
148 	case 1:
149 		field->integer = tp_field__u8;
150 		break;
151 	case 2:
152 		field->integer = needs_swap ? tp_field__swapped_u16 : tp_field__u16;
153 		break;
154 	case 4:
155 		field->integer = needs_swap ? tp_field__swapped_u32 : tp_field__u32;
156 		break;
157 	case 8:
158 		field->integer = needs_swap ? tp_field__swapped_u64 : tp_field__u64;
159 		break;
160 	default:
161 		return -1;
162 	}
163 
164 	return 0;
165 }
166 
167 static void *tp_field__ptr(struct tp_field *field, struct perf_sample *sample)
168 {
169 	return sample->raw_data + field->offset;
170 }
171 
172 static int tp_field__init_ptr(struct tp_field *field, struct format_field *format_field)
173 {
174 	field->offset = format_field->offset;
175 	field->pointer = tp_field__ptr;
176 	return 0;
177 }
178 
/*
 * Per-evsel private data for the sys_enter/sys_exit tracepoints: the syscall
 * 'id' plus either the entry arguments or the return value.  A given evsel is
 * only ever one of the two, hence the union.
 */
struct syscall_tp {
	struct tp_field id;
	union {
		struct tp_field args, ret;
	};
};
185 
186 static int perf_evsel__init_tp_uint_field(struct perf_evsel *evsel,
187 					  struct tp_field *field,
188 					  const char *name)
189 {
190 	struct format_field *format_field = perf_evsel__field(evsel, name);
191 
192 	if (format_field == NULL)
193 		return -1;
194 
195 	return tp_field__init_uint(field, format_field, evsel->needs_swap);
196 }
197 
/*
 * Initialize the tp_field member called 'name' in the evsel's syscall_tp
 * private area from the tracepoint format field of the same name.
 */
#define perf_evsel__init_sc_tp_uint_field(evsel, name) \
	({ struct syscall_tp *sc = evsel->priv;\
	   perf_evsel__init_tp_uint_field(evsel, &sc->name, #name); })
201 
/*
 * Look up the tracepoint format field called 'name' on this evsel and bind the
 * pointer-style fetcher for it.  Returns 0 on success, -1 when missing.
 */
static int perf_evsel__init_tp_ptr_field(struct perf_evsel *evsel,
					 struct tp_field *field,
					 const char *name)
{
	struct format_field *fmt = perf_evsel__field(evsel, name);

	return fmt ? tp_field__init_ptr(field, fmt) : -1;
}
213 
/* Pointer-field counterpart of perf_evsel__init_sc_tp_uint_field(). */
#define perf_evsel__init_sc_tp_ptr_field(evsel, name) \
	({ struct syscall_tp *sc = evsel->priv;\
	   perf_evsel__init_tp_ptr_field(evsel, &sc->name, #name); })
217 
/* Free the evsel's private syscall_tp area, then the evsel itself. */
static void perf_evsel__delete_priv(struct perf_evsel *evsel)
{
	zfree(&evsel->priv);
	perf_evsel__delete(evsel);
}
223 
224 static int perf_evsel__init_syscall_tp(struct perf_evsel *evsel, void *handler)
225 {
226 	evsel->priv = malloc(sizeof(struct syscall_tp));
227 	if (evsel->priv != NULL) {
228 		if (perf_evsel__init_sc_tp_uint_field(evsel, id))
229 			goto out_delete;
230 
231 		evsel->handler = handler;
232 		return 0;
233 	}
234 
235 	return -ENOMEM;
236 
237 out_delete:
238 	zfree(&evsel->priv);
239 	return -ENOENT;
240 }
241 
242 static struct perf_evsel *perf_evsel__syscall_newtp(const char *direction, void *handler)
243 {
244 	struct perf_evsel *evsel = perf_evsel__newtp("raw_syscalls", direction);
245 
246 	/* older kernel (e.g., RHEL6) use syscalls:{enter,exit} */
247 	if (IS_ERR(evsel))
248 		evsel = perf_evsel__newtp("syscalls", direction);
249 
250 	if (IS_ERR(evsel))
251 		return NULL;
252 
253 	if (perf_evsel__init_syscall_tp(evsel, handler))
254 		goto out_delete;
255 
256 	return evsel;
257 
258 out_delete:
259 	perf_evsel__delete_priv(evsel);
260 	return NULL;
261 }
262 
/* Fetch the named syscall_tp field from a sample as an integer... */
#define perf_evsel__sc_tp_uint(evsel, name, sample) \
	({ struct syscall_tp *fields = evsel->priv; \
	   fields->name.integer(&fields->name, sample); })

/* ...or as a pointer into the raw payload. */
#define perf_evsel__sc_tp_ptr(evsel, name, sample) \
	({ struct syscall_tp *fields = evsel->priv; \
	   fields->name.pointer(&fields->name, sample); })
270 
/*
 * Context handed to each argument beautifier (the SCA_* scnprintf'ers):
 * the raw value, the thread/trace it came from, an optional formatter
 * parameter (e.g. a strarray) and the argument's position.
 */
struct syscall_arg {
	unsigned long val;	/* raw syscall argument value */
	struct thread *thread;
	struct trace  *trace;
	void	      *parm;	/* formatter-private data, from syscall_fmt.arg_parm */
	u8	      idx;	/* zero-based argument index */
	u8	      mask;	/* NOTE(review): presumably marks args already consumed/suppressed — confirm against users */
};
279 
/*
 * A value -> name lookup table: entry i names the value (i + offset).
 * Used by the SCA_STRARRAY beautifier via syscall_fmt.arg_parm.
 */
struct strarray {
	int	    offset;	/* value of entries[0] */
	int	    nr_entries;
	const char **entries;
};

/* Define strarray__<array> over a string array starting at value 0... */
#define DEFINE_STRARRAY(array) struct strarray strarray__##array = { \
	.nr_entries = ARRAY_SIZE(array), \
	.entries = array, \
}

/* ...or starting at an arbitrary first value 'off'. */
#define DEFINE_STRARRAY_OFFSET(array, off) struct strarray strarray__##array = { \
	.offset	    = off, \
	.nr_entries = ARRAY_SIZE(array), \
	.entries = array, \
}
296 
297 static size_t __syscall_arg__scnprintf_strarray(char *bf, size_t size,
298 						const char *intfmt,
299 					        struct syscall_arg *arg)
300 {
301 	struct strarray *sa = arg->parm;
302 	int idx = arg->val - sa->offset;
303 
304 	if (idx < 0 || idx >= sa->nr_entries)
305 		return scnprintf(bf, size, intfmt, arg->val);
306 
307 	return scnprintf(bf, size, "%s", sa->entries[idx]);
308 }
309 
310 static size_t syscall_arg__scnprintf_strarray(char *bf, size_t size,
311 					      struct syscall_arg *arg)
312 {
313 	return __syscall_arg__scnprintf_strarray(bf, size, "%d", arg);
314 }
315 
316 #define SCA_STRARRAY syscall_arg__scnprintf_strarray
317 
#if defined(__i386__) || defined(__x86_64__)
/*
 * FIXME: Make this available to all arches as soon as the ioctl beautifier
 * 	  gets rewritten to support all arches.
 */
/* strarray beautifier printing unknown values in hex (used for ioctl cmds). */
static size_t syscall_arg__scnprintf_strhexarray(char *bf, size_t size,
						 struct syscall_arg *arg)
{
	return __syscall_arg__scnprintf_strarray(bf, size, "%#x", arg);
}

#define SCA_STRHEXARRAY syscall_arg__scnprintf_strhexarray
#endif /* defined(__i386__) || defined(__x86_64__) */
331 
/* Forward declaration: the fd beautifier is defined later in this file. */
static size_t syscall_arg__scnprintf_fd(char *bf, size_t size,
					struct syscall_arg *arg);

#define SCA_FD syscall_arg__scnprintf_fd

#ifndef AT_FDCWD
/* Fallback for old libc headers that don't define AT_FDCWD. */
#define AT_FDCWD	-100
#endif
340 
341 static size_t syscall_arg__scnprintf_fd_at(char *bf, size_t size,
342 					   struct syscall_arg *arg)
343 {
344 	int fd = arg->val;
345 
346 	if (fd == AT_FDCWD)
347 		return scnprintf(bf, size, "CWD");
348 
349 	return syscall_arg__scnprintf_fd(bf, size, arg);
350 }
351 
352 #define SCA_FDAT syscall_arg__scnprintf_fd_at
353 
/* Forward declaration: defined later, close() needs its own fd handling. */
static size_t syscall_arg__scnprintf_close_fd(char *bf, size_t size,
					      struct syscall_arg *arg);

#define SCA_CLOSE_FD syscall_arg__scnprintf_close_fd
358 
359 static size_t syscall_arg__scnprintf_hex(char *bf, size_t size,
360 					 struct syscall_arg *arg)
361 {
362 	return scnprintf(bf, size, "%#lx", arg->val);
363 }
364 
365 #define SCA_HEX syscall_arg__scnprintf_hex
366 
367 static size_t syscall_arg__scnprintf_int(char *bf, size_t size,
368 					 struct syscall_arg *arg)
369 {
370 	return scnprintf(bf, size, "%d", arg->val);
371 }
372 
373 #define SCA_INT syscall_arg__scnprintf_int
374 
/* bpf(2) cmd argument. */
static const char *bpf_cmd[] = {
	"MAP_CREATE", "MAP_LOOKUP_ELEM", "MAP_UPDATE_ELEM", "MAP_DELETE_ELEM",
	"MAP_GET_NEXT_KEY", "PROG_LOAD",
};
static DEFINE_STRARRAY(bpf_cmd);

/* epoll_ctl(2) op argument; EPOLL_CTL_ADD starts at 1, hence the offset. */
static const char *epoll_ctl_ops[] = { "ADD", "DEL", "MOD", };
static DEFINE_STRARRAY_OFFSET(epoll_ctl_ops, 1);

/* {get,set}itimer(2) which argument. */
static const char *itimers[] = { "REAL", "VIRTUAL", "PROF", };
static DEFINE_STRARRAY(itimers);

/* keyctl(2) option argument. */
static const char *keyctl_options[] = {
	"GET_KEYRING_ID", "JOIN_SESSION_KEYRING", "UPDATE", "REVOKE", "CHOWN",
	"SETPERM", "DESCRIBE", "CLEAR", "LINK", "UNLINK", "SEARCH", "READ",
	"INSTANTIATE", "NEGATE", "SET_REQKEY_KEYRING", "SET_TIMEOUT",
	"ASSUME_AUTHORITY", "GET_SECURITY", "SESSION_TO_PARENT", "REJECT",
	"INSTANTIATE_IOV", "INVALIDATE", "GET_PERSISTENT",
};
static DEFINE_STRARRAY(keyctl_options);

/* lseek(2) whence argument; DATA/HOLE only when the libc headers have them. */
static const char *whences[] = { "SET", "CUR", "END",
#ifdef SEEK_DATA
"DATA",
#endif
#ifdef SEEK_HOLE
"HOLE",
#endif
};
static DEFINE_STRARRAY(whences);

/* fcntl(2) cmd argument. */
static const char *fcntl_cmds[] = {
	"DUPFD", "GETFD", "SETFD", "GETFL", "SETFL", "GETLK", "SETLK",
	"SETLKW", "SETOWN", "GETOWN", "SETSIG", "GETSIG", "F_GETLK64",
	"F_SETLK64", "F_SETLKW64", "F_SETOWN_EX", "F_GETOWN_EX",
	"F_GETOWNER_UIDS",
};
static DEFINE_STRARRAY(fcntl_cmds);

/* {get,set}rlimit(2)/prlimit64(2) resource argument. */
static const char *rlimit_resources[] = {
	"CPU", "FSIZE", "DATA", "STACK", "CORE", "RSS", "NPROC", "NOFILE",
	"MEMLOCK", "AS", "LOCKS", "SIGPENDING", "MSGQUEUE", "NICE", "RTPRIO",
	"RTTIME",
};
static DEFINE_STRARRAY(rlimit_resources);

/* rt_sigprocmask(2) how argument. */
static const char *sighow[] = { "BLOCK", "UNBLOCK", "SETMASK", };
static DEFINE_STRARRAY(sighow);

/* clock_gettime(2) clk_id argument. */
static const char *clockid[] = {
	"REALTIME", "MONOTONIC", "PROCESS_CPUTIME_ID", "THREAD_CPUTIME_ID",
	"MONOTONIC_RAW", "REALTIME_COARSE", "MONOTONIC_COARSE", "BOOTTIME",
	"REALTIME_ALARM", "BOOTTIME_ALARM", "SGI_CYCLE", "TAI"
};
static DEFINE_STRARRAY(clockid);

/* socket(2)/socketpair(2) family argument. */
static const char *socket_families[] = {
	"UNSPEC", "LOCAL", "INET", "AX25", "IPX", "APPLETALK", "NETROM",
	"BRIDGE", "ATMPVC", "X25", "INET6", "ROSE", "DECnet", "NETBEUI",
	"SECURITY", "KEY", "NETLINK", "PACKET", "ASH", "ECONET", "ATMSVC",
	"RDS", "SNA", "IRDA", "PPPOX", "WANPIPE", "LLC", "IB", "CAN", "TIPC",
	"BLUETOOTH", "IUCV", "RXRPC", "ISDN", "PHONET", "IEEE802154", "CAIF",
	"ALG", "NFC", "VSOCK",
};
static DEFINE_STRARRAY(socket_families);
440 
441 static size_t syscall_arg__scnprintf_access_mode(char *bf, size_t size,
442 						 struct syscall_arg *arg)
443 {
444 	size_t printed = 0;
445 	int mode = arg->val;
446 
447 	if (mode == F_OK) /* 0 */
448 		return scnprintf(bf, size, "F");
449 #define	P_MODE(n) \
450 	if (mode & n##_OK) { \
451 		printed += scnprintf(bf + printed, size - printed, "%s", #n); \
452 		mode &= ~n##_OK; \
453 	}
454 
455 	P_MODE(R);
456 	P_MODE(W);
457 	P_MODE(X);
458 #undef P_MODE
459 
460 	if (mode)
461 		printed += scnprintf(bf + printed, size - printed, "|%#x", mode);
462 
463 	return printed;
464 }
465 
466 #define SCA_ACCMODE syscall_arg__scnprintf_access_mode
467 
/* Forward declaration: defined later, resolves pointers via vfs_getname. */
static size_t syscall_arg__scnprintf_filename(char *bf, size_t size,
					      struct syscall_arg *arg);

#define SCA_FILENAME syscall_arg__scnprintf_filename
472 
473 static size_t syscall_arg__scnprintf_pipe_flags(char *bf, size_t size,
474 						struct syscall_arg *arg)
475 {
476 	int printed = 0, flags = arg->val;
477 
478 #define	P_FLAG(n) \
479 	if (flags & O_##n) { \
480 		printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
481 		flags &= ~O_##n; \
482 	}
483 
484 	P_FLAG(CLOEXEC);
485 	P_FLAG(NONBLOCK);
486 #undef P_FLAG
487 
488 	if (flags)
489 		printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);
490 
491 	return printed;
492 }
493 
494 #define SCA_PIPE_FLAGS syscall_arg__scnprintf_pipe_flags
495 
#if defined(__i386__) || defined(__x86_64__)
/*
 * FIXME: Make this available to all arches.
 */
#define TCGETS		0x5401

/*
 * tty ioctl cmd names, indexed from TCGETS (see the DEFINE_STRARRAY_OFFSET
 * below).  The designated initializers ([0x27], [0x50], [0x60]) skip gaps in
 * the ioctl number space, leaving the skipped entries NULL.
 */
static const char *tioctls[] = {
	"TCGETS", "TCSETS", "TCSETSW", "TCSETSF", "TCGETA", "TCSETA", "TCSETAW",
	"TCSETAF", "TCSBRK", "TCXONC", "TCFLSH", "TIOCEXCL", "TIOCNXCL",
	"TIOCSCTTY", "TIOCGPGRP", "TIOCSPGRP", "TIOCOUTQ", "TIOCSTI",
	"TIOCGWINSZ", "TIOCSWINSZ", "TIOCMGET", "TIOCMBIS", "TIOCMBIC",
	"TIOCMSET", "TIOCGSOFTCAR", "TIOCSSOFTCAR", "FIONREAD", "TIOCLINUX",
	"TIOCCONS", "TIOCGSERIAL", "TIOCSSERIAL", "TIOCPKT", "FIONBIO",
	"TIOCNOTTY", "TIOCSETD", "TIOCGETD", "TCSBRKP", [0x27] = "TIOCSBRK",
	"TIOCCBRK", "TIOCGSID", "TCGETS2", "TCSETS2", "TCSETSW2", "TCSETSF2",
	"TIOCGRS485", "TIOCSRS485", "TIOCGPTN", "TIOCSPTLCK",
	"TIOCGDEV||TCGETX", "TCSETX", "TCSETXF", "TCSETXW", "TIOCSIG",
	"TIOCVHANGUP", "TIOCGPKT", "TIOCGPTLCK", "TIOCGEXCL",
	[0x50] = "FIONCLEX", "FIOCLEX", "FIOASYNC", "TIOCSERCONFIG",
	"TIOCSERGWILD", "TIOCSERSWILD", "TIOCGLCKTRMIOS", "TIOCSLCKTRMIOS",
	"TIOCSERGSTRUCT", "TIOCSERGETLSR", "TIOCSERGETMULTI", "TIOCSERSETMULTI",
	"TIOCMIWAIT", "TIOCGICOUNT", [0x60] = "FIOQSIZE",
};

static DEFINE_STRARRAY_OFFSET(tioctls, 0x5401);
#endif /* defined(__i386__) || defined(__x86_64__) */
522 
/* Fallbacks for old uapi headers lacking the getrandom(2) flags. */
#ifndef GRND_NONBLOCK
#define GRND_NONBLOCK	0x0001
#endif
#ifndef GRND_RANDOM
#define GRND_RANDOM	0x0002
#endif

/*
 * Beautifier for getrandom(2)'s flags argument: known GRND_* flags are
 * printed by name, '|'-separated, with any unrecognized leftover bits
 * appended in hex.
 */
static size_t syscall_arg__scnprintf_getrandom_flags(char *bf, size_t size,
						   struct syscall_arg *arg)
{
	int printed = 0, flags = arg->val;

/* Print flag 'n' if set, clear it so the leftover check below works. */
#define	P_FLAG(n) \
	if (flags & GRND_##n) { \
		printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
		flags &= ~GRND_##n; \
	}

	P_FLAG(RANDOM);
	P_FLAG(NONBLOCK);
#undef P_FLAG

	if (flags)
		printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);

	return printed;
}

#define SCA_GETRANDOM_FLAGS syscall_arg__scnprintf_getrandom_flags
552 
/*
 * Shorthand for syscall_fmts entries: beautify argument 'arg' via the
 * SCA_STRARRAY formatter parameterized with strarray__<array>.
 */
#define STRARRAY(arg, name, array) \
	  .arg_scnprintf = { [arg] = SCA_STRARRAY, }, \
	  .arg_parm	 = { [arg] = &strarray__##array, }
556 
557 #include "trace/beauty/eventfd.c"
558 #include "trace/beauty/flock.c"
559 #include "trace/beauty/futex_op.c"
560 #include "trace/beauty/mmap.c"
561 #include "trace/beauty/mode_t.c"
562 #include "trace/beauty/msg_flags.c"
563 #include "trace/beauty/open_flags.c"
564 #include "trace/beauty/perf_event_open.c"
565 #include "trace/beauty/pid.c"
566 #include "trace/beauty/sched_policy.c"
567 #include "trace/beauty/seccomp.c"
568 #include "trace/beauty/signum.c"
569 #include "trace/beauty/socket_type.c"
570 #include "trace/beauty/waitid_options.c"
571 
572 static struct syscall_fmt {
573 	const char *name;
574 	const char *alias;
575 	size_t	   (*arg_scnprintf[6])(char *bf, size_t size, struct syscall_arg *arg);
576 	void	   *arg_parm[6];
577 	bool	   errmsg;
578 	bool	   errpid;
579 	bool	   timeout;
580 	bool	   hexret;
581 } syscall_fmts[] = {
582 	{ .name	    = "access",	    .errmsg = true,
583 	  .arg_scnprintf = { [1] = SCA_ACCMODE,  /* mode */ }, },
584 	{ .name	    = "arch_prctl", .errmsg = true, .alias = "prctl", },
585 	{ .name	    = "bpf",	    .errmsg = true, STRARRAY(0, cmd, bpf_cmd), },
586 	{ .name	    = "brk",	    .hexret = true,
587 	  .arg_scnprintf = { [0] = SCA_HEX, /* brk */ }, },
588 	{ .name	    = "chdir",	    .errmsg = true, },
589 	{ .name	    = "chmod",	    .errmsg = true, },
590 	{ .name	    = "chroot",	    .errmsg = true, },
591 	{ .name     = "clock_gettime",  .errmsg = true, STRARRAY(0, clk_id, clockid), },
592 	{ .name	    = "clone",	    .errpid = true, },
593 	{ .name	    = "close",	    .errmsg = true,
594 	  .arg_scnprintf = { [0] = SCA_CLOSE_FD, /* fd */ }, },
595 	{ .name	    = "connect",    .errmsg = true, },
596 	{ .name	    = "creat",	    .errmsg = true, },
597 	{ .name	    = "dup",	    .errmsg = true, },
598 	{ .name	    = "dup2",	    .errmsg = true, },
599 	{ .name	    = "dup3",	    .errmsg = true, },
600 	{ .name	    = "epoll_ctl",  .errmsg = true, STRARRAY(1, op, epoll_ctl_ops), },
601 	{ .name	    = "eventfd2",   .errmsg = true,
602 	  .arg_scnprintf = { [1] = SCA_EFD_FLAGS, /* flags */ }, },
603 	{ .name	    = "faccessat",  .errmsg = true, },
604 	{ .name	    = "fadvise64",  .errmsg = true, },
605 	{ .name	    = "fallocate",  .errmsg = true, },
606 	{ .name	    = "fchdir",	    .errmsg = true, },
607 	{ .name	    = "fchmod",	    .errmsg = true, },
608 	{ .name	    = "fchmodat",   .errmsg = true,
609 	  .arg_scnprintf = { [0] = SCA_FDAT, /* fd */ }, },
610 	{ .name	    = "fchown",	    .errmsg = true, },
611 	{ .name	    = "fchownat",   .errmsg = true,
612 	  .arg_scnprintf = { [0] = SCA_FDAT, /* fd */ }, },
613 	{ .name	    = "fcntl",	    .errmsg = true,
614 	  .arg_scnprintf = { [1] = SCA_STRARRAY, /* cmd */ },
615 	  .arg_parm	 = { [1] = &strarray__fcntl_cmds, /* cmd */ }, },
616 	{ .name	    = "fdatasync",  .errmsg = true, },
617 	{ .name	    = "flock",	    .errmsg = true,
618 	  .arg_scnprintf = { [1] = SCA_FLOCK, /* cmd */ }, },
619 	{ .name	    = "fsetxattr",  .errmsg = true, },
620 	{ .name	    = "fstat",	    .errmsg = true, .alias = "newfstat", },
621 	{ .name	    = "fstatat",    .errmsg = true, .alias = "newfstatat", },
622 	{ .name	    = "fstatfs",    .errmsg = true, },
623 	{ .name	    = "fsync",    .errmsg = true, },
624 	{ .name	    = "ftruncate", .errmsg = true, },
625 	{ .name	    = "futex",	    .errmsg = true,
626 	  .arg_scnprintf = { [1] = SCA_FUTEX_OP, /* op */ }, },
627 	{ .name	    = "futimesat", .errmsg = true,
628 	  .arg_scnprintf = { [0] = SCA_FDAT, /* fd */ }, },
629 	{ .name	    = "getdents",   .errmsg = true, },
630 	{ .name	    = "getdents64", .errmsg = true, },
631 	{ .name	    = "getitimer",  .errmsg = true, STRARRAY(0, which, itimers), },
632 	{ .name	    = "getpid",	    .errpid = true, },
633 	{ .name	    = "getpgid",    .errpid = true, },
634 	{ .name	    = "getppid",    .errpid = true, },
635 	{ .name	    = "getrandom",  .errmsg = true,
636 	  .arg_scnprintf = { [2] = SCA_GETRANDOM_FLAGS, /* flags */ }, },
637 	{ .name	    = "getrlimit",  .errmsg = true, STRARRAY(0, resource, rlimit_resources), },
638 	{ .name	    = "getxattr",   .errmsg = true, },
639 	{ .name	    = "inotify_add_watch",	    .errmsg = true, },
640 	{ .name	    = "ioctl",	    .errmsg = true,
641 	  .arg_scnprintf = {
642 #if defined(__i386__) || defined(__x86_64__)
643 /*
644  * FIXME: Make this available to all arches.
645  */
646 			     [1] = SCA_STRHEXARRAY, /* cmd */
647 			     [2] = SCA_HEX, /* arg */ },
648 	  .arg_parm	 = { [1] = &strarray__tioctls, /* cmd */ }, },
649 #else
650 			     [2] = SCA_HEX, /* arg */ }, },
651 #endif
652 	{ .name	    = "keyctl",	    .errmsg = true, STRARRAY(0, option, keyctl_options), },
653 	{ .name	    = "kill",	    .errmsg = true,
654 	  .arg_scnprintf = { [1] = SCA_SIGNUM, /* sig */ }, },
655 	{ .name	    = "lchown",    .errmsg = true, },
656 	{ .name	    = "lgetxattr",  .errmsg = true, },
657 	{ .name	    = "linkat",	    .errmsg = true,
658 	  .arg_scnprintf = { [0] = SCA_FDAT, /* fd */ }, },
659 	{ .name	    = "listxattr",  .errmsg = true, },
660 	{ .name	    = "llistxattr", .errmsg = true, },
661 	{ .name	    = "lremovexattr",  .errmsg = true, },
662 	{ .name	    = "lseek",	    .errmsg = true,
663 	  .arg_scnprintf = { [2] = SCA_STRARRAY, /* whence */ },
664 	  .arg_parm	 = { [2] = &strarray__whences, /* whence */ }, },
665 	{ .name	    = "lsetxattr",  .errmsg = true, },
666 	{ .name	    = "lstat",	    .errmsg = true, .alias = "newlstat", },
667 	{ .name	    = "lsxattr",    .errmsg = true, },
668 	{ .name     = "madvise",    .errmsg = true,
669 	  .arg_scnprintf = { [0] = SCA_HEX,	 /* start */
670 			     [2] = SCA_MADV_BHV, /* behavior */ }, },
671 	{ .name	    = "mkdir",    .errmsg = true, },
672 	{ .name	    = "mkdirat",    .errmsg = true,
673 	  .arg_scnprintf = { [0] = SCA_FDAT, /* fd */ }, },
674 	{ .name	    = "mknod",      .errmsg = true, },
675 	{ .name	    = "mknodat",    .errmsg = true,
676 	  .arg_scnprintf = { [0] = SCA_FDAT, /* fd */ }, },
677 	{ .name	    = "mlock",	    .errmsg = true,
678 	  .arg_scnprintf = { [0] = SCA_HEX, /* addr */ }, },
679 	{ .name	    = "mlockall",   .errmsg = true,
680 	  .arg_scnprintf = { [0] = SCA_HEX, /* addr */ }, },
681 	{ .name	    = "mmap",	    .hexret = true,
682 	  .arg_scnprintf = { [0] = SCA_HEX,	  /* addr */
683 			     [2] = SCA_MMAP_PROT, /* prot */
684 			     [3] = SCA_MMAP_FLAGS, /* flags */ }, },
685 	{ .name	    = "mprotect",   .errmsg = true,
686 	  .arg_scnprintf = { [0] = SCA_HEX, /* start */
687 			     [2] = SCA_MMAP_PROT, /* prot */ }, },
688 	{ .name	    = "mq_unlink", .errmsg = true,
689 	  .arg_scnprintf = { [0] = SCA_FILENAME, /* u_name */ }, },
690 	{ .name	    = "mremap",	    .hexret = true,
691 	  .arg_scnprintf = { [0] = SCA_HEX, /* addr */
692 			     [3] = SCA_MREMAP_FLAGS, /* flags */
693 			     [4] = SCA_HEX, /* new_addr */ }, },
694 	{ .name	    = "munlock",    .errmsg = true,
695 	  .arg_scnprintf = { [0] = SCA_HEX, /* addr */ }, },
696 	{ .name	    = "munmap",	    .errmsg = true,
697 	  .arg_scnprintf = { [0] = SCA_HEX, /* addr */ }, },
698 	{ .name	    = "name_to_handle_at", .errmsg = true,
699 	  .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
700 	{ .name	    = "newfstatat", .errmsg = true,
701 	  .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
702 	{ .name	    = "open",	    .errmsg = true,
703 	  .arg_scnprintf = { [1] = SCA_OPEN_FLAGS, /* flags */ }, },
704 	{ .name	    = "open_by_handle_at", .errmsg = true,
705 	  .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */
706 			     [2] = SCA_OPEN_FLAGS, /* flags */ }, },
707 	{ .name	    = "openat",	    .errmsg = true,
708 	  .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */
709 			     [2] = SCA_OPEN_FLAGS, /* flags */ }, },
710 	{ .name	    = "perf_event_open", .errmsg = true,
711 	  .arg_scnprintf = { [2] = SCA_INT, /* cpu */
712 			     [3] = SCA_FD,  /* group_fd */
713 			     [4] = SCA_PERF_FLAGS,  /* flags */ }, },
714 	{ .name	    = "pipe2",	    .errmsg = true,
715 	  .arg_scnprintf = { [1] = SCA_PIPE_FLAGS, /* flags */ }, },
716 	{ .name	    = "poll",	    .errmsg = true, .timeout = true, },
717 	{ .name	    = "ppoll",	    .errmsg = true, .timeout = true, },
718 	{ .name	    = "pread",	    .errmsg = true, .alias = "pread64", },
719 	{ .name	    = "preadv",	    .errmsg = true, .alias = "pread", },
720 	{ .name	    = "prlimit64",  .errmsg = true, STRARRAY(1, resource, rlimit_resources), },
721 	{ .name	    = "pwrite",	    .errmsg = true, .alias = "pwrite64", },
722 	{ .name	    = "pwritev",    .errmsg = true, },
723 	{ .name	    = "read",	    .errmsg = true, },
724 	{ .name	    = "readlink",   .errmsg = true, },
725 	{ .name	    = "readlinkat", .errmsg = true,
726 	  .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
727 	{ .name	    = "readv",	    .errmsg = true, },
728 	{ .name	    = "recvfrom",   .errmsg = true,
729 	  .arg_scnprintf = { [3] = SCA_MSG_FLAGS, /* flags */ }, },
730 	{ .name	    = "recvmmsg",   .errmsg = true,
731 	  .arg_scnprintf = { [3] = SCA_MSG_FLAGS, /* flags */ }, },
732 	{ .name	    = "recvmsg",    .errmsg = true,
733 	  .arg_scnprintf = { [2] = SCA_MSG_FLAGS, /* flags */ }, },
734 	{ .name	    = "removexattr", .errmsg = true, },
735 	{ .name	    = "renameat",   .errmsg = true,
736 	  .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
737 	{ .name	    = "rmdir",    .errmsg = true, },
738 	{ .name	    = "rt_sigaction", .errmsg = true,
739 	  .arg_scnprintf = { [0] = SCA_SIGNUM, /* sig */ }, },
740 	{ .name	    = "rt_sigprocmask",  .errmsg = true, STRARRAY(0, how, sighow), },
741 	{ .name	    = "rt_sigqueueinfo", .errmsg = true,
742 	  .arg_scnprintf = { [1] = SCA_SIGNUM, /* sig */ }, },
743 	{ .name	    = "rt_tgsigqueueinfo", .errmsg = true,
744 	  .arg_scnprintf = { [2] = SCA_SIGNUM, /* sig */ }, },
745 	{ .name	    = "sched_getattr",	      .errmsg = true, },
746 	{ .name	    = "sched_setattr",	      .errmsg = true, },
747 	{ .name	    = "sched_setscheduler",   .errmsg = true,
748 	  .arg_scnprintf = { [1] = SCA_SCHED_POLICY, /* policy */ }, },
749 	{ .name	    = "seccomp", .errmsg = true,
750 	  .arg_scnprintf = { [0] = SCA_SECCOMP_OP, /* op */
751 			     [1] = SCA_SECCOMP_FLAGS, /* flags */ }, },
752 	{ .name	    = "select",	    .errmsg = true, .timeout = true, },
753 	{ .name	    = "sendmmsg",    .errmsg = true,
754 	  .arg_scnprintf = { [3] = SCA_MSG_FLAGS, /* flags */ }, },
755 	{ .name	    = "sendmsg",    .errmsg = true,
756 	  .arg_scnprintf = { [2] = SCA_MSG_FLAGS, /* flags */ }, },
757 	{ .name	    = "sendto",	    .errmsg = true,
758 	  .arg_scnprintf = { [3] = SCA_MSG_FLAGS, /* flags */ }, },
759 	{ .name	    = "set_tid_address", .errpid = true, },
760 	{ .name	    = "setitimer",  .errmsg = true, STRARRAY(0, which, itimers), },
761 	{ .name	    = "setpgid",    .errmsg = true, },
762 	{ .name	    = "setrlimit",  .errmsg = true, STRARRAY(0, resource, rlimit_resources), },
763 	{ .name	    = "setxattr",   .errmsg = true, },
764 	{ .name	    = "shutdown",   .errmsg = true, },
765 	{ .name	    = "socket",	    .errmsg = true,
766 	  .arg_scnprintf = { [0] = SCA_STRARRAY, /* family */
767 			     [1] = SCA_SK_TYPE, /* type */ },
768 	  .arg_parm	 = { [0] = &strarray__socket_families, /* family */ }, },
769 	{ .name	    = "socketpair", .errmsg = true,
770 	  .arg_scnprintf = { [0] = SCA_STRARRAY, /* family */
771 			     [1] = SCA_SK_TYPE, /* type */ },
772 	  .arg_parm	 = { [0] = &strarray__socket_families, /* family */ }, },
773 	{ .name	    = "stat",	    .errmsg = true, .alias = "newstat", },
774 	{ .name	    = "statfs",	    .errmsg = true, },
775 	{ .name	    = "swapoff",    .errmsg = true,
776 	  .arg_scnprintf = { [0] = SCA_FILENAME, /* specialfile */ }, },
777 	{ .name	    = "swapon",	    .errmsg = true,
778 	  .arg_scnprintf = { [0] = SCA_FILENAME, /* specialfile */ }, },
779 	{ .name	    = "symlinkat",  .errmsg = true,
780 	  .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
781 	{ .name	    = "tgkill",	    .errmsg = true,
782 	  .arg_scnprintf = { [2] = SCA_SIGNUM, /* sig */ }, },
783 	{ .name	    = "tkill",	    .errmsg = true,
784 	  .arg_scnprintf = { [1] = SCA_SIGNUM, /* sig */ }, },
785 	{ .name	    = "truncate",   .errmsg = true, },
786 	{ .name	    = "uname",	    .errmsg = true, .alias = "newuname", },
787 	{ .name	    = "unlinkat",   .errmsg = true,
788 	  .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
789 	{ .name	    = "utime",  .errmsg = true, },
790 	{ .name	    = "utimensat",  .errmsg = true,
791 	  .arg_scnprintf = { [0] = SCA_FDAT, /* dirfd */ }, },
792 	{ .name	    = "utimes",  .errmsg = true, },
793 	{ .name	    = "vmsplice",  .errmsg = true, },
794 	{ .name	    = "wait4",	    .errpid = true,
795 	  .arg_scnprintf = { [2] = SCA_WAITID_OPTIONS, /* options */ }, },
796 	{ .name	    = "waitid",	    .errpid = true,
797 	  .arg_scnprintf = { [3] = SCA_WAITID_OPTIONS, /* options */ }, },
798 	{ .name	    = "write",	    .errmsg = true, },
799 	{ .name	    = "writev",	    .errmsg = true, },
800 };
801 
802 static int syscall_fmt__cmp(const void *name, const void *fmtp)
803 {
804 	const struct syscall_fmt *fmt = fmtp;
805 	return strcmp(name, fmt->name);
806 }
807 
808 static struct syscall_fmt *syscall_fmt__find(const char *name)
809 {
810 	const int nmemb = ARRAY_SIZE(syscall_fmts);
811 	return bsearch(name, syscall_fmts, nmemb, sizeof(struct syscall_fmt), syscall_fmt__cmp);
812 }
813 
/*
 * Everything known about one syscall: its tracepoint format, argument list,
 * optional pretty-printing descriptor and the beautifier/parm slots resolved
 * from it.
 */
struct syscall {
	struct event_format *tp_format;
	int		    nr_args;
	struct format_field *args;	/* linked list of argument fields */
	const char	    *name;
	bool		    is_exit;	/* e.g. exit_group: no sys_exit will follow */
	struct syscall_fmt  *fmt;	/* NULL when not in syscall_fmts */
	size_t		    (**arg_scnprintf)(char *bf, size_t size, struct syscall_arg *arg);
	void		    **arg_parm;
};
824 
825 static size_t fprintf_duration(unsigned long t, FILE *fp)
826 {
827 	double duration = (double)t / NSEC_PER_MSEC;
828 	size_t printed = fprintf(fp, "(");
829 
830 	if (duration >= 1.0)
831 		printed += color_fprintf(fp, PERF_COLOR_RED, "%6.3f ms", duration);
832 	else if (duration >= 0.01)
833 		printed += color_fprintf(fp, PERF_COLOR_YELLOW, "%6.3f ms", duration);
834 	else
835 		printed += color_fprintf(fp, PERF_COLOR_NORMAL, "%6.3f ms", duration);
836 	return printed + fprintf(fp, "): ");
837 }
838 
839 /**
840  * filename.ptr: The filename char pointer that will be vfs_getname'd
841  * filename.entry_str_pos: Where to insert the string translated from
842  *                         filename.ptr by the vfs_getname tracepoint/kprobe.
843  */
844 struct thread_trace {
845 	u64		  entry_time;
846 	u64		  exit_time;
847 	bool		  entry_pending;
848 	unsigned long	  nr_events;
849 	unsigned long	  pfmaj, pfmin;
850 	char		  *entry_str;
851 	double		  runtime_ms;
852         struct {
853 		unsigned long ptr;
854 		short int     entry_str_pos;
855 		bool	      pending_open;
856 		unsigned int  namelen;
857 		char	      *name;
858 	} filename;
859 	struct {
860 		int	  max;
861 		char	  **table;
862 	} paths;
863 
864 	struct intlist *syscall_stats;
865 };
866 
867 static struct thread_trace *thread_trace__new(void)
868 {
869 	struct thread_trace *ttrace =  zalloc(sizeof(struct thread_trace));
870 
871 	if (ttrace)
872 		ttrace->paths.max = -1;
873 
874 	ttrace->syscall_stats = intlist__new(NULL);
875 
876 	return ttrace;
877 }
878 
/*
 * Lazily attach (and return) the per-thread trace state, bumping its
 * event counter.  Returns NULL -- after printing a warning to fp -- when
 * the thread is NULL or the state could not be allocated; callers then
 * drop the sample.
 */
static struct thread_trace *thread__trace(struct thread *thread, FILE *fp)
{
	struct thread_trace *ttrace;

	if (thread == NULL)
		goto fail;

	if (thread__priv(thread) == NULL)
		thread__set_priv(thread, thread_trace__new());

	/* still NULL: thread_trace__new() failed */
	if (thread__priv(thread) == NULL)
		goto fail;

	ttrace = thread__priv(thread);
	++ttrace->nr_events;

	return ttrace;
fail:
	color_fprintf(fp, PERF_COLOR_RED,
		      "WARNING: not enough memory, dropping samples!\n");
	return NULL;
}
901 
/* Page fault type selection bits -- presumably driven by a --pf style option; not visible in this chunk */
#define TRACE_PFMAJ		(1 << 0)
#define TRACE_PFMIN		(1 << 1)

/* Fixed capacity of thread_trace::entry_str, the buffered sys_enter line */
static const size_t trace__entry_str_size = 2048;
906 
/*
 * Cache 'pathname' for 'fd' in the thread's fd->path table, growing the
 * table and zeroing the newly exposed slots as needed.  Returns 0 on
 * success, -1 when an allocation fails.
 * NOTE(review): an existing entry at table[fd] is overwritten without
 * being freed -- confirm whether callers guarantee the slot is empty.
 */
static int trace__set_fd_pathname(struct thread *thread, int fd, const char *pathname)
{
	struct thread_trace *ttrace = thread__priv(thread);

	if (fd > ttrace->paths.max) {
		char **npath = realloc(ttrace->paths.table, (fd + 1) * sizeof(char *));

		if (npath == NULL)
			return -1;	/* old table is still valid, nothing lost */

		if (ttrace->paths.max != -1) {
			/* zero only the slots added by this growth: max+1 .. fd */
			memset(npath + ttrace->paths.max + 1, 0,
			       (fd - ttrace->paths.max) * sizeof(char *));
		} else {
			/* first allocation: zero the whole table */
			memset(npath, 0, (fd + 1) * sizeof(char *));
		}

		ttrace->paths.table = npath;
		ttrace->paths.max   = fd;
	}

	ttrace->paths.table[fd] = strdup(pathname);

	return ttrace->paths.table[fd] != NULL ? 0 : -1;
}
932 
/*
 * Resolve 'fd' to a pathname by readlink()ing the thread's
 * /proc/<pid>[/task/<tid>]/fd/<fd> entry and cache the result via
 * trace__set_fd_pathname().  Only meaningful for live sessions.
 * Returns 0 on success, -1 on any failure.
 */
static int thread__read_fd_path(struct thread *thread, int fd)
{
	char linkname[PATH_MAX], pathname[PATH_MAX];
	struct stat st;
	int ret;

	if (thread->pid_ == thread->tid) {
		/* group leader: the short /proc/<pid> form suffices */
		scnprintf(linkname, sizeof(linkname),
			  "/proc/%d/fd/%d", thread->pid_, fd);
	} else {
		scnprintf(linkname, sizeof(linkname),
			  "/proc/%d/task/%d/fd/%d", thread->pid_, thread->tid, fd);
	}

	/*
	 * st_size bounds the link target length, which keeps the
	 * pathname[ret] store below in range.
	 */
	if (lstat(linkname, &st) < 0 || st.st_size + 1 > (off_t)sizeof(pathname))
		return -1;

	ret = readlink(linkname, pathname, sizeof(pathname));

	if (ret < 0 || ret > st.st_size)
		return -1;

	/* readlink() does not NUL-terminate */
	pathname[ret] = '\0';
	return trace__set_fd_pathname(thread, fd, pathname);
}
958 
/*
 * Return the cached pathname for the thread's 'fd', reading it from
 * /proc on a cache miss (live sessions only).  Returns NULL when the
 * path is unknown or cannot be resolved.
 */
static const char *thread__fd_path(struct thread *thread, int fd,
				   struct trace *trace)
{
	struct thread_trace *ttrace = thread__priv(thread);

	if (ttrace == NULL)
		return NULL;

	if (fd < 0)
		return NULL;

	if ((fd > ttrace->paths.max || ttrace->paths.table[fd] == NULL)) {
		if (!trace->live)
			return NULL;	/* /proc is stale when replaying a perf.data file */
		++trace->stats.proc_getname;
		if (thread__read_fd_path(thread, fd))
			return NULL;
	}

	return ttrace->paths.table[fd];
}
980 
981 static size_t syscall_arg__scnprintf_fd(char *bf, size_t size,
982 					struct syscall_arg *arg)
983 {
984 	int fd = arg->val;
985 	size_t printed = scnprintf(bf, size, "%d", fd);
986 	const char *path = thread__fd_path(arg->thread, fd, arg->trace);
987 
988 	if (path)
989 		printed += scnprintf(bf + printed, size - printed, "<%s>", path);
990 
991 	return printed;
992 }
993 
994 static size_t syscall_arg__scnprintf_close_fd(char *bf, size_t size,
995 					      struct syscall_arg *arg)
996 {
997 	int fd = arg->val;
998 	size_t printed = syscall_arg__scnprintf_fd(bf, size, arg);
999 	struct thread_trace *ttrace = thread__priv(arg->thread);
1000 
1001 	if (ttrace && fd >= 0 && fd <= ttrace->paths.max)
1002 		zfree(&ttrace->paths.table[fd]);
1003 
1004 	return printed;
1005 }
1006 
1007 static void thread__set_filename_pos(struct thread *thread, const char *bf,
1008 				     unsigned long ptr)
1009 {
1010 	struct thread_trace *ttrace = thread__priv(thread);
1011 
1012 	ttrace->filename.ptr = ptr;
1013 	ttrace->filename.entry_str_pos = bf - ttrace->entry_str;
1014 }
1015 
1016 static size_t syscall_arg__scnprintf_filename(char *bf, size_t size,
1017 					      struct syscall_arg *arg)
1018 {
1019 	unsigned long ptr = arg->val;
1020 
1021 	if (!arg->trace->vfs_getname)
1022 		return scnprintf(bf, size, "%#x", ptr);
1023 
1024 	thread__set_filename_pos(arg->thread, bf, ptr);
1025 	return 0;
1026 }
1027 
1028 static bool trace__filter_duration(struct trace *trace, double t)
1029 {
1030 	return t < (trace->duration_filter * NSEC_PER_MSEC);
1031 }
1032 
1033 static size_t trace__fprintf_tstamp(struct trace *trace, u64 tstamp, FILE *fp)
1034 {
1035 	double ts = (double)(tstamp - trace->base_time) / NSEC_PER_MSEC;
1036 
1037 	return fprintf(fp, "%10.3f ", ts);
1038 }
1039 
/*
 * Main-loop termination flags, written from the signal handler.
 * NOTE(review): these are plain bools rather than volatile sig_atomic_t
 * as ISO C requires for objects modified in a signal handler; works in
 * practice on the targeted platforms, but worth confirming/cleaning up.
 */
static bool done = false;
static bool interrupted = false;

static void sig_handler(int sig)
{
	done = true;
	/* distinguish SIGINT from the other signals hooked to this handler */
	interrupted = sig == SIGINT;
}
1048 
1049 static size_t trace__fprintf_entry_head(struct trace *trace, struct thread *thread,
1050 					u64 duration, u64 tstamp, FILE *fp)
1051 {
1052 	size_t printed = trace__fprintf_tstamp(trace, tstamp, fp);
1053 	printed += fprintf_duration(duration, fp);
1054 
1055 	if (trace->multiple_threads) {
1056 		if (trace->show_comm)
1057 			printed += fprintf(fp, "%.14s/", thread__comm_str(thread));
1058 		printed += fprintf(fp, "%d ", thread->tid);
1059 	}
1060 
1061 	return printed;
1062 }
1063 
1064 static int trace__process_event(struct trace *trace, struct machine *machine,
1065 				union perf_event *event, struct perf_sample *sample)
1066 {
1067 	int ret = 0;
1068 
1069 	switch (event->header.type) {
1070 	case PERF_RECORD_LOST:
1071 		color_fprintf(trace->output, PERF_COLOR_RED,
1072 			      "LOST %" PRIu64 " events!\n", event->lost.lost);
1073 		ret = machine__process_lost_event(machine, event, sample);
1074 		break;
1075 	default:
1076 		ret = machine__process_event(machine, event, sample);
1077 		break;
1078 	}
1079 
1080 	return ret;
1081 }
1082 
1083 static int trace__tool_process(struct perf_tool *tool,
1084 			       union perf_event *event,
1085 			       struct perf_sample *sample,
1086 			       struct machine *machine)
1087 {
1088 	struct trace *trace = container_of(tool, struct trace, tool);
1089 	return trace__process_event(trace, machine, event, sample);
1090 }
1091 
/*
 * machine__resolve_kernel_addr() wrapper that, instead of producing
 * bogus resolutions when kptr_restrict hides kernel addresses, warns
 * once and returns NULL.
 */
static char *trace__machine__resolve_kernel_addr(void *vmachine, unsigned long long *addrp, char **modp)
{
	struct machine *machine = vmachine;

	/* already warned: stay quiet, keep returning NULL */
	if (machine->kptr_restrict_warned)
		return NULL;

	if (symbol_conf.kptr_restrict) {
		pr_warning("Kernel address maps (/proc/{kallsyms,modules}) are restricted.\n\n"
			   "Check /proc/sys/kernel/kptr_restrict.\n\n"
			   "Kernel samples will not be resolved.\n");
		machine->kptr_restrict_warned = true;
		return NULL;
	}

	return machine__resolve_kernel_addr(vmachine, addrp, modp);
}
1109 
/*
 * Initialize symbol resolution, create the host machine representation
 * and synthesize the pre-existing threads so their samples can be
 * attributed.  Returns 0 on success, a negative errno-style value on
 * failure.
 * NOTE(review): on the register-resolver failure path trace->host is
 * not torn down and symbol__exit() is not called -- confirm the caller
 * treats this as fatal anyway.
 */
static int trace__symbols_init(struct trace *trace, struct perf_evlist *evlist)
{
	int err = symbol__init(NULL);

	if (err)
		return err;

	trace->host = machine__new_host();
	if (trace->host == NULL)
		return -ENOMEM;

	if (trace_event__register_resolver(trace->host, trace__machine__resolve_kernel_addr) < 0)
		return -errno;

	err = __machine__synthesize_threads(trace->host, &trace->tool, &trace->opts.target,
					    evlist->threads, trace__tool_process, false,
					    trace->opts.proc_map_timeout);
	if (err)
		symbol__exit();

	return err;
}
1132 
/*
 * Choose a pretty-printer for each of the syscall's arguments: explicit
 * per-syscall overrides from the syscall_fmts table win, then
 * heuristics keyed on the tracepoint field's type/name (filenames, fds,
 * pids, modes, generic pointers).  Unmatched args get no formatter and
 * are printed as plain "%ld" later.  Returns 0 on success, -1 when the
 * formatter table cannot be allocated.
 */
static int syscall__set_arg_fmts(struct syscall *sc)
{
	struct format_field *field;
	int idx = 0, len;

	sc->arg_scnprintf = calloc(sc->nr_args, sizeof(void *));
	if (sc->arg_scnprintf == NULL)
		return -1;

	if (sc->fmt)
		sc->arg_parm = sc->fmt->arg_parm;

	for (field = sc->args; field; field = field->next) {
		if (sc->fmt && sc->fmt->arg_scnprintf[idx])
			sc->arg_scnprintf[idx] = sc->fmt->arg_scnprintf[idx];
		else if (strcmp(field->type, "const char *") == 0 &&
			 (strcmp(field->name, "filename") == 0 ||
			  strcmp(field->name, "path") == 0 ||
			  strcmp(field->name, "pathname") == 0))
			sc->arg_scnprintf[idx] = SCA_FILENAME;
		else if (field->flags & FIELD_IS_POINTER)
			sc->arg_scnprintf[idx] = syscall_arg__scnprintf_hex;
		else if (strcmp(field->type, "pid_t") == 0)
			sc->arg_scnprintf[idx] = SCA_PID;
		else if (strcmp(field->type, "umode_t") == 0)
			sc->arg_scnprintf[idx] = SCA_MODE_T;
		else if ((strcmp(field->type, "int") == 0 ||
			  strcmp(field->type, "unsigned int") == 0 ||
			  strcmp(field->type, "long") == 0) &&
			 (len = strlen(field->name)) >= 2 &&
			 strcmp(field->name + len - 2, "fd") == 0) {
			/*
			 * Integer fields whose name ends in "fd" are file
			 * descriptors.  Survey of the field types used:
			 *
			 * /sys/kernel/tracing/events/syscalls/sys_enter*
			 * egrep 'field:.*fd;' .../format|sed -r 's/.*field:([a-z ]+) [a-z_]*fd.+/\1/g'|sort|uniq -c
			 * 65 int
			 * 23 unsigned int
			 * 7 unsigned long
			 */
			sc->arg_scnprintf[idx] = SCA_FD;
		}
		++idx;
	}

	return 0;
}
1178 
/*
 * Fill trace->syscalls.table[id] on first use: resolve the syscall
 * name, look up its sys_enter tracepoint format (falling back to the
 * fmt table's alias for renamed syscalls), and set up per-argument
 * formatters.  Grows the table, zeroing new slots, as needed.
 * Returns 0 on success, -1 on failure.
 */
static int trace__read_syscall_info(struct trace *trace, int id)
{
	char tp_name[128];
	struct syscall *sc;
	const char *name = syscalltbl__name(trace->sctbl, id);

	if (name == NULL)
		return -1;

	if (id > trace->syscalls.max) {
		struct syscall *nsyscalls = realloc(trace->syscalls.table, (id + 1) * sizeof(*sc));

		if (nsyscalls == NULL)
			return -1;

		if (trace->syscalls.max != -1) {
			/* zero only the newly added slots: max+1 .. id */
			memset(nsyscalls + trace->syscalls.max + 1, 0,
			       (id - trace->syscalls.max) * sizeof(*sc));
		} else {
			/* first allocation: zero the whole table */
			memset(nsyscalls, 0, (id + 1) * sizeof(*sc));
		}

		trace->syscalls.table = nsyscalls;
		trace->syscalls.max   = id;
	}

	sc = trace->syscalls.table + id;
	sc->name = name;

	sc->fmt  = syscall_fmt__find(sc->name);

	snprintf(tp_name, sizeof(tp_name), "sys_enter_%s", sc->name);
	sc->tp_format = trace_event__tp_format("syscalls", tp_name);

	/* some syscalls are known by another name in the tracefs tree */
	if (IS_ERR(sc->tp_format) && sc->fmt && sc->fmt->alias) {
		snprintf(tp_name, sizeof(tp_name), "sys_enter_%s", sc->fmt->alias);
		sc->tp_format = trace_event__tp_format("syscalls", tp_name);
	}

	if (IS_ERR(sc->tp_format))
		return -1;

	sc->args = sc->tp_format->format.fields;
	sc->nr_args = sc->tp_format->format.nr_fields;
	/*
	 * The first field is the syscall number ('__syscall_nr', or 'nr' on
	 * older kernels).  It duplicates the event id, so drop it -- but it
	 * may be absent entirely, hence the check.
	 */
	if (sc->args && (!strcmp(sc->args->name, "__syscall_nr") || !strcmp(sc->args->name, "nr"))) {
		sc->args = sc->args->next;
		--sc->nr_args;
	}

	sc->is_exit = !strcmp(name, "exit_group") || !strcmp(name, "exit");

	return syscall__set_arg_fmts(sc);
}
1237 
/*
 * Translate the user's syscall-name qualifier list (trace->ev_qualifier)
 * into syscall ids in trace->ev_qualifier_ids, collecting every invalid
 * name into a single error message.  Returns 0 on success, -EINVAL on
 * invalid names or allocation failure (in which case the ids array is
 * released).
 */
static int trace__validate_ev_qualifier(struct trace *trace)
{
	int err = 0, i;
	struct str_node *pos;

	trace->ev_qualifier_ids.nr = strlist__nr_entries(trace->ev_qualifier);
	trace->ev_qualifier_ids.entries = malloc(trace->ev_qualifier_ids.nr *
						 sizeof(trace->ev_qualifier_ids.entries[0]));

	if (trace->ev_qualifier_ids.entries == NULL) {
		fputs("Error:\tNot enough memory for allocating events qualifier ids\n",
		       trace->output);
		err = -EINVAL;
		goto out;
	}

	i = 0;

	strlist__for_each_entry(pos, trace->ev_qualifier) {
		const char *sc = pos->s;
		int id = syscalltbl__id(trace->sctbl, sc);

		if (id < 0) {
			/* first bad name opens the error message, the rest are comma-joined */
			if (err == 0) {
				fputs("Error:\tInvalid syscall ", trace->output);
				err = -EINVAL;
			} else {
				fputs(", ", trace->output);
			}

			fputs(sc, trace->output);
		}

		trace->ev_qualifier_ids.entries[i++] = id;
	}

	if (err < 0) {
		fputs("\nHint:\ttry 'perf list syscalls:sys_enter_*'"
		      "\nHint:\tand: 'man syscalls'\n", trace->output);
		zfree(&trace->ev_qualifier_ids.entries);
		trace->ev_qualifier_ids.nr = 0;
	}
out:
	return err;
}
1283 
1284 /*
1285  * args is to be interpreted as a series of longs but we need to handle
1286  * 8-byte unaligned accesses. args points to raw_data within the event
1287  * and raw_data is guaranteed to be 8-byte unaligned because it is
1288  * preceded by raw_size which is a u32. So we need to copy args to a temp
1289  * variable to read it. Most notably this avoids extended load instructions
1290  * on unaligned addresses
1291  */
1292 
/*
 * Format all of a syscall's arguments into bf as "name: value, ...",
 * using the per-arg formatters chosen in syscall__set_arg_fmts().  When
 * the tracepoint format could not be read, fall back to dumping the six
 * raw longs.  Returns the number of characters printed.
 */
static size_t syscall__scnprintf_args(struct syscall *sc, char *bf, size_t size,
				      unsigned char *args, struct trace *trace,
				      struct thread *thread)
{
	size_t printed = 0;
	unsigned char *p;
	unsigned long val;

	if (sc->args != NULL) {
		struct format_field *field;
		u8 bit = 1;
		struct syscall_arg arg = {
			.idx	= 0,
			.mask	= 0,
			.trace  = trace,
			.thread = thread,
		};

		for (field = sc->args; field;
		     field = field->next, ++arg.idx, bit <<= 1) {
			/* a formatter may consume several args and mask them out */
			if (arg.mask & bit)
				continue;

			/* special care for unaligned accesses */
			p = args + sizeof(unsigned long) * arg.idx;
			memcpy(&val, p, sizeof(val));

			/*
			 * Suppress this argument if its value is zero and
			 * we don't have a string associated in an strarray
			 * for it.
			 */
			if (val == 0 &&
			    !(sc->arg_scnprintf &&
			      sc->arg_scnprintf[arg.idx] == SCA_STRARRAY &&
			      sc->arg_parm[arg.idx]))
				continue;

			printed += scnprintf(bf + printed, size - printed,
					     "%s%s: ", printed ? ", " : "", field->name);
			if (sc->arg_scnprintf && sc->arg_scnprintf[arg.idx]) {
				arg.val = val;
				if (sc->arg_parm)
					arg.parm = sc->arg_parm[arg.idx];
				printed += sc->arg_scnprintf[arg.idx](bf + printed,
								      size - printed, &arg);
			} else {
				printed += scnprintf(bf + printed, size - printed,
						     "%ld", val);
			}
		}
	} else if (IS_ERR(sc->tp_format)) {
		/*
		 * If we managed to read the tracepoint /format file, then we
		 * may end up not having any args, like with gettid(), so only
		 * print the raw args when we didn't manage to read it.
		 */
		int i = 0;

		while (i < 6) {
			/* special care for unaligned accesses */
			p = args + sizeof(unsigned long) * i;
			memcpy(&val, p, sizeof(val));
			printed += scnprintf(bf + printed, size - printed,
					     "%sarg%d: %ld",
					     printed ? ", " : "", i, val);
			++i;
		}
	}

	return printed;
}
1365 
/* Signature shared by all event handlers stashed in evsel->handler */
typedef int (*tracepoint_handler)(struct trace *trace, struct perf_evsel *evsel,
				  union perf_event *event,
				  struct perf_sample *sample);
1369 
/*
 * Return the descriptor for syscall 'id', lazily reading its info on
 * first use.  Returns NULL for invalid ids (negative ids do happen, see
 * the comment below) or when the syscall info cannot be read.
 */
static struct syscall *trace__syscall_info(struct trace *trace,
					   struct perf_evsel *evsel, int id)
{

	if (id < 0) {

		/*
		 * XXX: Noticed on x86_64, reproduced as far back as 3.0.36, haven't tried
		 * before that, leaving at a higher verbosity level till that is
		 * explained. Reproduced with plain ftrace with:
		 *
		 * echo 1 > /t/events/raw_syscalls/sys_exit/enable
		 * grep "NR -1 " /t/trace_pipe
		 *
		 * After generating some load on the machine.
		 */
		if (verbose > 1) {
			static u64 n;
			fprintf(trace->output, "Invalid syscall %d id, skipping (%s, %" PRIu64 ") ...\n",
				id, perf_evsel__name(evsel), ++n);
		}
		return NULL;
	}

	/* first sight of this id: grow the table and read its info */
	if ((id > trace->syscalls.max || trace->syscalls.table[id].name == NULL) &&
	    trace__read_syscall_info(trace, id))
		goto out_cant_read;

	/* re-check: the read may have failed to fill the slot */
	if ((id > trace->syscalls.max || trace->syscalls.table[id].name == NULL))
		goto out_cant_read;

	return &trace->syscalls.table[id];

out_cant_read:
	if (verbose) {
		fprintf(trace->output, "Problems reading syscall %d", id);
		if (id <= trace->syscalls.max && trace->syscalls.table[id].name != NULL)
			fprintf(trace->output, "(%s)", trace->syscalls.table[id].name);
		fputs(" information\n", trace->output);
	}
	return NULL;
}
1412 
/*
 * Account one completed syscall into the thread's per-syscall-id stats
 * (consumed by --summary), allocating the stats node on first use.
 * Allocation failures are silently ignored: the summary just misses
 * this event.
 */
static void thread__update_stats(struct thread_trace *ttrace,
				 int id, struct perf_sample *sample)
{
	struct int_node *inode;
	struct stats *stats;
	u64 duration = 0;

	inode = intlist__findnew(ttrace->syscall_stats, id);
	if (inode == NULL)
		return;

	stats = inode->priv;
	if (stats == NULL) {
		stats = malloc(sizeof(struct stats));
		if (stats == NULL)
			return;
		init_stats(stats);
		inode->priv = stats;
	}

	/* guard against clock skew / a missing sys_enter */
	if (ttrace->entry_time && sample->time > ttrace->entry_time)
		duration = sample->time - ttrace->entry_time;

	update_stats(stats, duration);
}
1438 
/*
 * When another event interleaves between a printed sys_enter and its
 * sys_exit, flush the pending entry line with a trailing "..." so the
 * output stays readable.  Returns the number of characters printed
 * (0 when nothing was pending).
 */
static int trace__printf_interrupted_entry(struct trace *trace, struct perf_sample *sample)
{
	struct thread_trace *ttrace;
	u64 duration;
	size_t printed;

	if (trace->current == NULL)
		return 0;

	ttrace = thread__priv(trace->current);

	if (!ttrace->entry_pending)
		return 0;

	duration = sample->time - ttrace->entry_time;

	printed  = trace__fprintf_entry_head(trace, trace->current, duration, sample->time, trace->output);
	printed += fprintf(trace->output, "%-70s) ...\n", ttrace->entry_str);
	ttrace->entry_pending = false;

	return printed;
}
1461 
/*
 * Handle raw_syscalls:sys_enter: format "<name>(<args>" into the
 * thread's entry_str.  For exit-like syscalls (which never see a
 * sys_exit) the line is printed immediately; otherwise printing is
 * deferred to trace__sys_exit() so the return value can be appended.
 * Returns 0 on success, -1 when per-thread state can't be set up.
 */
static int trace__sys_enter(struct trace *trace, struct perf_evsel *evsel,
			    union perf_event *event __maybe_unused,
			    struct perf_sample *sample)
{
	char *msg;
	void *args;
	size_t printed = 0;
	struct thread *thread;
	int id = perf_evsel__sc_tp_uint(evsel, id, sample), err = -1;
	struct syscall *sc = trace__syscall_info(trace, evsel, id);
	struct thread_trace *ttrace;

	if (sc == NULL)
		return -1;

	thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
	ttrace = thread__trace(thread, trace->output);
	if (ttrace == NULL)
		goto out_put;

	args = perf_evsel__sc_tp_ptr(evsel, args, sample);

	/* lazily allocate the line buffer for this thread */
	if (ttrace->entry_str == NULL) {
		ttrace->entry_str = malloc(trace__entry_str_size);
		if (!ttrace->entry_str)
			goto out_put;
	}

	if (!(trace->duration_filter || trace->summary_only || trace->min_stack))
		trace__printf_interrupted_entry(trace, sample);

	ttrace->entry_time = sample->time;
	msg = ttrace->entry_str;
	printed += scnprintf(msg + printed, trace__entry_str_size - printed, "%s(", sc->name);

	printed += syscall__scnprintf_args(sc, msg + printed, trace__entry_str_size - printed,
					   args, trace, thread);

	if (sc->is_exit) {
		/* no sys_exit will come: print the completed line now */
		if (!(trace->duration_filter || trace->summary_only || trace->min_stack)) {
			trace__fprintf_entry_head(trace, thread, 1, sample->time, trace->output);
			fprintf(trace->output, "%-70s)\n", ttrace->entry_str);
		}
	} else {
		ttrace->entry_pending = true;
		/* See trace__vfs_getname & trace__sys_exit */
		ttrace->filename.pending_open = false;
	}

	/* track the most recent thread, for interrupted-entry flushing */
	if (trace->current != thread) {
		thread__put(trace->current);
		trace->current = thread__get(thread);
	}
	err = 0;
out_put:
	thread__put(thread);
	return err;
}
1520 
1521 static int trace__resolve_callchain(struct trace *trace, struct perf_evsel *evsel,
1522 				    struct perf_sample *sample,
1523 				    struct callchain_cursor *cursor)
1524 {
1525 	struct addr_location al;
1526 
1527 	if (machine__resolve(trace->host, &al, sample) < 0 ||
1528 	    thread__resolve_callchain(al.thread, cursor, evsel, sample, NULL, NULL, trace->max_stack))
1529 		return -1;
1530 
1531 	return 0;
1532 }
1533 
1534 static int trace__fprintf_callchain(struct trace *trace, struct perf_sample *sample)
1535 {
1536 	/* TODO: user-configurable print_opts */
1537 	const unsigned int print_opts = EVSEL__PRINT_SYM |
1538 				        EVSEL__PRINT_DSO |
1539 				        EVSEL__PRINT_UNKNOWN_AS_ADDR;
1540 
1541 	return sample__fprintf_callchain(sample, 38, print_opts, &callchain_cursor, trace->output);
1542 }
1543 
/*
 * Handle raw_syscalls:sys_exit: compute the syscall duration, finish
 * the line started in trace__sys_enter() (or print a "continued" stub
 * when the entry line was already flushed) and pretty-print the return
 * value according to the syscall's fmt: errno name, child pid, hex,
 * timeout, or plain signed decimal.  Returns 0 on success, -1 when
 * per-thread state can't be set up.
 */
static int trace__sys_exit(struct trace *trace, struct perf_evsel *evsel,
			   union perf_event *event __maybe_unused,
			   struct perf_sample *sample)
{
	long ret;
	u64 duration = 0;
	struct thread *thread;
	int id = perf_evsel__sc_tp_uint(evsel, id, sample), err = -1, callchain_ret = 0;
	struct syscall *sc = trace__syscall_info(trace, evsel, id);
	struct thread_trace *ttrace;

	if (sc == NULL)
		return -1;

	thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
	ttrace = thread__trace(thread, trace->output);
	if (ttrace == NULL)
		goto out_put;

	if (trace->summary)
		thread__update_stats(ttrace, id, sample);

	ret = perf_evsel__sc_tp_uint(evsel, ret, sample);

	/* a successful open: remember the fd -> filename mapping */
	if (id == trace->open_id && ret >= 0 && ttrace->filename.pending_open) {
		trace__set_fd_pathname(thread, ret, ttrace->filename.name);
		ttrace->filename.pending_open = false;
		++trace->stats.vfs_getname;
	}

	ttrace->exit_time = sample->time;

	if (ttrace->entry_time) {
		duration = sample->time - ttrace->entry_time;
		if (trace__filter_duration(trace, duration))
			goto out;
	} else if (trace->duration_filter)
		goto out;	/* no entry seen, can't compute a duration to filter on */

	if (sample->callchain) {
		callchain_ret = trace__resolve_callchain(trace, evsel, sample, &callchain_cursor);
		if (callchain_ret == 0) {
			if (callchain_cursor.nr < trace->min_stack)
				goto out;
			callchain_ret = 1;
		}
	}

	if (trace->summary_only)
		goto out;

	trace__fprintf_entry_head(trace, thread, duration, sample->time, trace->output);

	if (ttrace->entry_pending) {
		fprintf(trace->output, "%-70s", ttrace->entry_str);
	} else {
		/* entry line was flushed by trace__printf_interrupted_entry() */
		fprintf(trace->output, " ... [");
		color_fprintf(trace->output, PERF_COLOR_YELLOW, "continued");
		fprintf(trace->output, "]: %s()", sc->name);
	}

	if (sc->fmt == NULL) {
signed_print:
		fprintf(trace->output, ") = %ld", ret);
	} else if (ret < 0 && (sc->fmt->errmsg || sc->fmt->errpid)) {
		char bf[STRERR_BUFSIZE];
		const char *emsg = str_error_r(-ret, bf, sizeof(bf)),
			   *e = audit_errno_to_name(-ret);

		fprintf(trace->output, ") = -1 %s %s", e, emsg);
	} else if (ret == 0 && sc->fmt->timeout)
		fprintf(trace->output, ") = 0 Timeout");
	else if (sc->fmt->hexret)
		fprintf(trace->output, ") = %#lx", ret);
	else if (sc->fmt->errpid) {
		/* return value is a pid: decorate it with the child's comm */
		struct thread *child = machine__find_thread(trace->host, ret, ret);

		if (child != NULL) {
			fprintf(trace->output, ") = %ld", ret);
			if (child->comm_set)
				fprintf(trace->output, " (%s)", thread__comm_str(child));
			thread__put(child);
		}
	} else
		goto signed_print;

	fputc('\n', trace->output);

	if (callchain_ret > 0)
		trace__fprintf_callchain(trace, sample);
	else if (callchain_ret < 0)
		pr_err("Problem processing %s callchain, skipping...\n", perf_evsel__name(evsel));
out:
	ttrace->entry_pending = false;
	err = 0;
out_put:
	thread__put(thread);
	return err;
}
1643 
/*
 * Handle the probe:vfs_getname event: cache the resolved filename on
 * the thread (for the fd->path mapping on open's return) and, when a
 * sys_enter line is pending with a filename placeholder (see
 * syscall_arg__scnprintf_filename), splice the string into entry_str at
 * the recorded position, truncating its head if the buffer is short.
 */
static int trace__vfs_getname(struct trace *trace, struct perf_evsel *evsel,
			      union perf_event *event __maybe_unused,
			      struct perf_sample *sample)
{
	struct thread *thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
	struct thread_trace *ttrace;
	size_t filename_len, entry_str_len, to_move;
	ssize_t remaining_space;
	char *pos;
	const char *filename = perf_evsel__rawptr(evsel, sample, "pathname");

	if (!thread)
		goto out;

	ttrace = thread__priv(thread);
	if (!ttrace)
		goto out;

	filename_len = strlen(filename);

	/* grow the cached name buffer as needed */
	if (ttrace->filename.namelen < filename_len) {
		char *f = realloc(ttrace->filename.name, filename_len + 1);

		if (f == NULL)
				goto out;

		ttrace->filename.namelen = filename_len;
		ttrace->filename.name = f;
	}

	strcpy(ttrace->filename.name, filename);
	ttrace->filename.pending_open = true;

	/* no placeholder recorded: nothing to splice into entry_str */
	if (!ttrace->filename.ptr)
		goto out;

	entry_str_len = strlen(ttrace->entry_str);
	remaining_space = trace__entry_str_size - entry_str_len - 1; /* \0 */
	if (remaining_space <= 0)
		goto out;

	/* keep the tail of the filename, which is the most informative part */
	if (filename_len > (size_t)remaining_space) {
		filename += filename_len - remaining_space;
		filename_len = remaining_space;
	}

	/* open a gap at the placeholder position and copy the name in */
	to_move = entry_str_len - ttrace->filename.entry_str_pos + 1; /* \0 */
	pos = ttrace->entry_str + ttrace->filename.entry_str_pos;
	memmove(pos + filename_len, pos, to_move);
	memcpy(pos, filename, filename_len);

	ttrace->filename.ptr = 0;
	ttrace->filename.entry_str_pos = 0;
out:
	return 0;
}
1700 
/*
 * Handle sched:sched_stat_runtime: accumulate on-CPU time per thread
 * and globally (reported by --summary).  When per-thread state can't be
 * allocated, dump the raw event fields instead.
 */
static int trace__sched_stat_runtime(struct trace *trace, struct perf_evsel *evsel,
				     union perf_event *event __maybe_unused,
				     struct perf_sample *sample)
{
        u64 runtime = perf_evsel__intval(evsel, sample, "runtime");
	double runtime_ms = (double)runtime / NSEC_PER_MSEC;
	struct thread *thread = machine__findnew_thread(trace->host,
							sample->pid,
							sample->tid);
	struct thread_trace *ttrace = thread__trace(thread, trace->output);

	if (ttrace == NULL)
		goto out_dump;

	ttrace->runtime_ms += runtime_ms;
	trace->runtime_ms += runtime_ms;
	thread__put(thread);
	return 0;

out_dump:
	fprintf(trace->output, "%s: comm=%s,pid=%u,runtime=%" PRIu64 ",vruntime=%" PRIu64 ")\n",
	       evsel->name,
	       perf_evsel__strval(evsel, sample, "comm"),
	       (pid_t)perf_evsel__intval(evsel, sample, "pid"),
	       runtime,
	       perf_evsel__intval(evsel, sample, "vruntime"));
	thread__put(thread);
	return 0;
}
1730 
1731 static void bpf_output__printer(enum binary_printer_ops op,
1732 				unsigned int val, void *extra)
1733 {
1734 	FILE *output = extra;
1735 	unsigned char ch = (unsigned char)val;
1736 
1737 	switch (op) {
1738 	case BINARY_PRINT_CHAR_DATA:
1739 		fprintf(output, "%c", isprint(ch) ? ch : '.');
1740 		break;
1741 	case BINARY_PRINT_DATA_BEGIN:
1742 	case BINARY_PRINT_LINE_BEGIN:
1743 	case BINARY_PRINT_ADDR:
1744 	case BINARY_PRINT_NUM_DATA:
1745 	case BINARY_PRINT_NUM_PAD:
1746 	case BINARY_PRINT_SEP:
1747 	case BINARY_PRINT_CHAR_PAD:
1748 	case BINARY_PRINT_LINE_END:
1749 	case BINARY_PRINT_DATA_END:
1750 	default:
1751 		break;
1752 	}
1753 }
1754 
1755 static void bpf_output__fprintf(struct trace *trace,
1756 				struct perf_sample *sample)
1757 {
1758 	print_binary(sample->raw_data, sample->raw_size, 8,
1759 		     bpf_output__printer, trace->output);
1760 }
1761 
/*
 * Generic handler for the non-syscall events added with --event: print
 * the timestamp plus either the BPF output payload or the
 * libtraceevent-formatted tracepoint fields, then the callchain when
 * requested and deep enough.
 */
static int trace__event_handler(struct trace *trace, struct perf_evsel *evsel,
				union perf_event *event __maybe_unused,
				struct perf_sample *sample)
{
	int callchain_ret = 0;

	if (sample->callchain) {
		callchain_ret = trace__resolve_callchain(trace, evsel, sample, &callchain_cursor);
		if (callchain_ret == 0) {
			if (callchain_cursor.nr < trace->min_stack)
				goto out;
			callchain_ret = 1;
		}
	}

	trace__printf_interrupted_entry(trace, sample);
	trace__fprintf_tstamp(trace, sample->time, trace->output);

	/* pad to align with the duration column used by syscall lines */
	if (trace->trace_syscalls)
		fprintf(trace->output, "(         ): ");

	fprintf(trace->output, "%s:", evsel->name);

	if (perf_evsel__is_bpf_output(evsel)) {
		bpf_output__fprintf(trace, sample);
	} else if (evsel->tp_format) {
		event_format__fprintf(evsel->tp_format, sample->cpu,
				      sample->raw_data, sample->raw_size,
				      trace->output);
	}

	fprintf(trace->output, ")\n");

	if (callchain_ret > 0)
		trace__fprintf_callchain(trace, sample);
	else if (callchain_ret < 0)
		pr_err("Problem processing %s callchain, skipping...\n", perf_evsel__name(evsel));
out:
	return 0;
}
1802 
1803 static void print_location(FILE *f, struct perf_sample *sample,
1804 			   struct addr_location *al,
1805 			   bool print_dso, bool print_sym)
1806 {
1807 
1808 	if ((verbose || print_dso) && al->map)
1809 		fprintf(f, "%s@", al->map->dso->long_name);
1810 
1811 	if ((verbose || print_sym) && al->sym)
1812 		fprintf(f, "%s+0x%" PRIx64, al->sym->name,
1813 			al->addr - al->sym->start);
1814 	else if (al->map)
1815 		fprintf(f, "0x%" PRIx64, al->addr);
1816 	else
1817 		fprintf(f, "0x%" PRIx64, sample->addr);
1818 }
1819 
/*
 * Handle major/minor page fault software events: bump the per-thread
 * fault counters and print "majfault/minfault [IP] => ADDR (TYPE)" with
 * symbol/dso resolution for both the faulting instruction and the
 * target data address.  Returns 0 on success, -1 when per-thread state
 * can't be set up.
 */
static int trace__pgfault(struct trace *trace,
			  struct perf_evsel *evsel,
			  union perf_event *event __maybe_unused,
			  struct perf_sample *sample)
{
	struct thread *thread;
	struct addr_location al;
	char map_type = 'd';	/* 'd'ata; becomes 'x' for exec maps, '?' when unmapped */
	struct thread_trace *ttrace;
	int err = -1;
	int callchain_ret = 0;

	thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);

	if (sample->callchain) {
		callchain_ret = trace__resolve_callchain(trace, evsel, sample, &callchain_cursor);
		if (callchain_ret == 0) {
			if (callchain_cursor.nr < trace->min_stack)
				goto out_put;
			callchain_ret = 1;
		}
	}

	ttrace = thread__trace(thread, trace->output);
	if (ttrace == NULL)
		goto out_put;

	if (evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ)
		ttrace->pfmaj++;
	else
		ttrace->pfmin++;

	if (trace->summary_only)
		goto out;

	/* resolve the faulting instruction pointer */
	thread__find_addr_location(thread, sample->cpumode, MAP__FUNCTION,
			      sample->ip, &al);

	trace__fprintf_entry_head(trace, thread, 0, sample->time, trace->output);

	fprintf(trace->output, "%sfault [",
		evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ ?
		"maj" : "min");

	print_location(trace->output, sample, &al, false, true);

	fprintf(trace->output, "] => ");

	/* resolve the faulted-on data address, trying data maps first */
	thread__find_addr_location(thread, sample->cpumode, MAP__VARIABLE,
				   sample->addr, &al);

	if (!al.map) {
		thread__find_addr_location(thread, sample->cpumode,
					   MAP__FUNCTION, sample->addr, &al);

		if (al.map)
			map_type = 'x';
		else
			map_type = '?';
	}

	print_location(trace->output, sample, &al, true, false);

	fprintf(trace->output, " (%c%c)\n", map_type, al.level);

	if (callchain_ret > 0)
		trace__fprintf_callchain(trace, sample);
	else if (callchain_ret < 0)
		pr_err("Problem processing %s callchain, skipping...\n", perf_evsel__name(evsel));
out:
	err = 0;
out_put:
	thread__put(thread);
	return err;
}
1895 
1896 static bool skip_sample(struct trace *trace, struct perf_sample *sample)
1897 {
1898 	if ((trace->pid_list && intlist__find(trace->pid_list, sample->pid)) ||
1899 	    (trace->tid_list && intlist__find(trace->tid_list, sample->tid)))
1900 		return false;
1901 
1902 	if (trace->pid_list || trace->tid_list)
1903 		return true;
1904 
1905 	return false;
1906 }
1907 
1908 static void trace__set_base_time(struct trace *trace,
1909 				 struct perf_evsel *evsel,
1910 				 struct perf_sample *sample)
1911 {
1912 	/*
1913 	 * BPF events were not setting PERF_SAMPLE_TIME, so be more robust
1914 	 * and don't use sample->time unconditionally, we may end up having
1915 	 * some other event in the future without PERF_SAMPLE_TIME for good
1916 	 * reason, i.e. we may not be interested in its timestamps, just in
1917 	 * it taking place, picking some piece of information when it
1918 	 * appears in our event stream (vfs_getname comes to mind).
1919 	 */
1920 	if (trace->base_time == 0 && !trace->full_time &&
1921 	    (evsel->attr.sample_type & PERF_SAMPLE_TIME))
1922 		trace->base_time = sample->time;
1923 }
1924 
1925 static int trace__process_sample(struct perf_tool *tool,
1926 				 union perf_event *event,
1927 				 struct perf_sample *sample,
1928 				 struct perf_evsel *evsel,
1929 				 struct machine *machine __maybe_unused)
1930 {
1931 	struct trace *trace = container_of(tool, struct trace, tool);
1932 	int err = 0;
1933 
1934 	tracepoint_handler handler = evsel->handler;
1935 
1936 	if (skip_sample(trace, sample))
1937 		return 0;
1938 
1939 	trace__set_base_time(trace, evsel, sample);
1940 
1941 	if (handler) {
1942 		++trace->nr_events;
1943 		handler(trace, evsel, event, sample);
1944 	}
1945 
1946 	return err;
1947 }
1948 
1949 static int parse_target_str(struct trace *trace)
1950 {
1951 	if (trace->opts.target.pid) {
1952 		trace->pid_list = intlist__new(trace->opts.target.pid);
1953 		if (trace->pid_list == NULL) {
1954 			pr_err("Error parsing process id string\n");
1955 			return -EINVAL;
1956 		}
1957 	}
1958 
1959 	if (trace->opts.target.tid) {
1960 		trace->tid_list = intlist__new(trace->opts.target.tid);
1961 		if (trace->tid_list == NULL) {
1962 			pr_err("Error parsing thread id string\n");
1963 			return -EINVAL;
1964 		}
1965 	}
1966 
1967 	return 0;
1968 }
1969 
1970 static int trace__record(struct trace *trace, int argc, const char **argv)
1971 {
1972 	unsigned int rec_argc, i, j;
1973 	const char **rec_argv;
1974 	const char * const record_args[] = {
1975 		"record",
1976 		"-R",
1977 		"-m", "1024",
1978 		"-c", "1",
1979 	};
1980 
1981 	const char * const sc_args[] = { "-e", };
1982 	unsigned int sc_args_nr = ARRAY_SIZE(sc_args);
1983 	const char * const majpf_args[] = { "-e", "major-faults" };
1984 	unsigned int majpf_args_nr = ARRAY_SIZE(majpf_args);
1985 	const char * const minpf_args[] = { "-e", "minor-faults" };
1986 	unsigned int minpf_args_nr = ARRAY_SIZE(minpf_args);
1987 
1988 	/* +1 is for the event string below */
1989 	rec_argc = ARRAY_SIZE(record_args) + sc_args_nr + 1 +
1990 		majpf_args_nr + minpf_args_nr + argc;
1991 	rec_argv = calloc(rec_argc + 1, sizeof(char *));
1992 
1993 	if (rec_argv == NULL)
1994 		return -ENOMEM;
1995 
1996 	j = 0;
1997 	for (i = 0; i < ARRAY_SIZE(record_args); i++)
1998 		rec_argv[j++] = record_args[i];
1999 
2000 	if (trace->trace_syscalls) {
2001 		for (i = 0; i < sc_args_nr; i++)
2002 			rec_argv[j++] = sc_args[i];
2003 
2004 		/* event string may be different for older kernels - e.g., RHEL6 */
2005 		if (is_valid_tracepoint("raw_syscalls:sys_enter"))
2006 			rec_argv[j++] = "raw_syscalls:sys_enter,raw_syscalls:sys_exit";
2007 		else if (is_valid_tracepoint("syscalls:sys_enter"))
2008 			rec_argv[j++] = "syscalls:sys_enter,syscalls:sys_exit";
2009 		else {
2010 			pr_err("Neither raw_syscalls nor syscalls events exist.\n");
2011 			return -1;
2012 		}
2013 	}
2014 
2015 	if (trace->trace_pgfaults & TRACE_PFMAJ)
2016 		for (i = 0; i < majpf_args_nr; i++)
2017 			rec_argv[j++] = majpf_args[i];
2018 
2019 	if (trace->trace_pgfaults & TRACE_PFMIN)
2020 		for (i = 0; i < minpf_args_nr; i++)
2021 			rec_argv[j++] = minpf_args[i];
2022 
2023 	for (i = 0; i < (unsigned int)argc; i++)
2024 		rec_argv[j++] = argv[i];
2025 
2026 	return cmd_record(j, rec_argv, NULL);
2027 }
2028 
2029 static size_t trace__fprintf_thread_summary(struct trace *trace, FILE *fp);
2030 
2031 static bool perf_evlist__add_vfs_getname(struct perf_evlist *evlist)
2032 {
2033 	struct perf_evsel *evsel = perf_evsel__newtp("probe", "vfs_getname");
2034 
2035 	if (IS_ERR(evsel))
2036 		return false;
2037 
2038 	if (perf_evsel__field(evsel, "pathname") == NULL) {
2039 		perf_evsel__delete(evsel);
2040 		return false;
2041 	}
2042 
2043 	evsel->handler = trace__vfs_getname;
2044 	perf_evlist__add(evlist, evsel);
2045 	return true;
2046 }
2047 
2048 static struct perf_evsel *perf_evsel__new_pgfault(u64 config)
2049 {
2050 	struct perf_evsel *evsel;
2051 	struct perf_event_attr attr = {
2052 		.type = PERF_TYPE_SOFTWARE,
2053 		.mmap_data = 1,
2054 	};
2055 
2056 	attr.config = config;
2057 	attr.sample_period = 1;
2058 
2059 	event_attr_init(&attr);
2060 
2061 	evsel = perf_evsel__new(&attr);
2062 	if (evsel)
2063 		evsel->handler = trace__pgfault;
2064 
2065 	return evsel;
2066 }
2067 
2068 static void trace__handle_event(struct trace *trace, union perf_event *event, struct perf_sample *sample)
2069 {
2070 	const u32 type = event->header.type;
2071 	struct perf_evsel *evsel;
2072 
2073 	if (type != PERF_RECORD_SAMPLE) {
2074 		trace__process_event(trace, trace->host, event, sample);
2075 		return;
2076 	}
2077 
2078 	evsel = perf_evlist__id2evsel(trace->evlist, sample->id);
2079 	if (evsel == NULL) {
2080 		fprintf(trace->output, "Unknown tp ID %" PRIu64 ", skipping...\n", sample->id);
2081 		return;
2082 	}
2083 
2084 	trace__set_base_time(trace, evsel, sample);
2085 
2086 	if (evsel->attr.type == PERF_TYPE_TRACEPOINT &&
2087 	    sample->raw_data == NULL) {
2088 		fprintf(trace->output, "%s sample with no payload for tid: %d, cpu %d, raw_size=%d, skipping...\n",
2089 		       perf_evsel__name(evsel), sample->tid,
2090 		       sample->cpu, sample->raw_size);
2091 	} else {
2092 		tracepoint_handler handler = evsel->handler;
2093 		handler(trace, evsel, event, sample);
2094 	}
2095 }
2096 
/*
 * Create the raw_syscalls:sys_enter/sys_exit tracepoint evsels, resolve
 * the fields we read from each (the "args" pointer on enter, the "ret"
 * integer on exit), add them to the evlist and remember them in
 * trace->syscalls.events. Returns 0 on success, -1 on any failure,
 * tearing down whatever was created so far (classic goto-cleanup chain).
 */
static int trace__add_syscall_newtp(struct trace *trace)
{
	int ret = -1;
	struct perf_evlist *evlist = trace->evlist;
	struct perf_evsel *sys_enter, *sys_exit;

	sys_enter = perf_evsel__syscall_newtp("sys_enter", trace__sys_enter);
	if (sys_enter == NULL)
		goto out;

	/* Cache the offset of the "args" payload field for fast access. */
	if (perf_evsel__init_sc_tp_ptr_field(sys_enter, args))
		goto out_delete_sys_enter;

	sys_exit = perf_evsel__syscall_newtp("sys_exit", trace__sys_exit);
	if (sys_exit == NULL)
		goto out_delete_sys_enter;

	if (perf_evsel__init_sc_tp_uint_field(sys_exit, ret))
		goto out_delete_sys_exit;

	perf_evlist__add(evlist, sys_enter);
	perf_evlist__add(evlist, sys_exit);

	if (callchain_param.enabled && !trace->kernel_syscallchains) {
		/*
		 * We're interested only in the user space callchain
		 * leading to the syscall, allow overriding that for
		 * debugging reasons using --kernel_syscall_callchains
		 */
		sys_exit->attr.exclude_callchain_kernel = 1;
	}

	trace->syscalls.events.sys_enter = sys_enter;
	trace->syscalls.events.sys_exit  = sys_exit;

	ret = 0;
out:
	return ret;

out_delete_sys_exit:
	perf_evsel__delete_priv(sys_exit);
out_delete_sys_enter:
	perf_evsel__delete_priv(sys_enter);
	goto out;
}
2142 
/*
 * Build a tracepoint filter expression ("id ..." over the syscall ids
 * selected with -e, possibly negated) and append it to both the
 * sys_enter and sys_exit evsels, so the kernel filters unwanted
 * syscalls before they reach the ring buffer.
 *
 * Returns 0 on success; on failure returns a negative value, with
 * errno set to ENOMEM when building the expression itself failed.
 */
static int trace__set_ev_qualifier_filter(struct trace *trace)
{
	int err = -1;
	struct perf_evsel *sys_exit;
	char *filter = asprintf_expr_inout_ints("id", !trace->not_ev_qualifier,
						trace->ev_qualifier_ids.nr,
						trace->ev_qualifier_ids.entries);

	if (filter == NULL)
		goto out_enomem;

	/* Only filter sys_exit once sys_enter accepted the expression (0 == success). */
	if (!perf_evsel__append_tp_filter(trace->syscalls.events.sys_enter,
					  filter)) {
		sys_exit = trace->syscalls.events.sys_exit;
		err = perf_evsel__append_tp_filter(sys_exit, filter);
	}

	free(filter);
out:
	return err;
out_enomem:
	errno = ENOMEM;
	goto out;
}
2167 
/*
 * Live tracing: set up the evlist (syscall tracepoints, vfs_getname,
 * page faults, optional sched_stat_runtime), configure callchains and
 * filters, open/mmap the events, optionally fork the workload, then
 * poll the ring buffers and dispatch each event until interrupted or
 * the workload finishes. Returns 0 on success, negative on error.
 */
static int trace__run(struct trace *trace, int argc, const char **argv)
{
	struct perf_evlist *evlist = trace->evlist;
	struct perf_evsel *evsel, *pgfault_maj = NULL, *pgfault_min = NULL;
	int err = -1, i;
	unsigned long before;
	const bool forks = argc > 0;
	bool draining = false;

	trace->live = true;

	if (trace->trace_syscalls && trace__add_syscall_newtp(trace))
		goto out_error_raw_syscalls;

	if (trace->trace_syscalls)
		trace->vfs_getname = perf_evlist__add_vfs_getname(evlist);

	if ((trace->trace_pgfaults & TRACE_PFMAJ)) {
		pgfault_maj = perf_evsel__new_pgfault(PERF_COUNT_SW_PAGE_FAULTS_MAJ);
		if (pgfault_maj == NULL)
			goto out_error_mem;
		perf_evlist__add(evlist, pgfault_maj);
	}

	if ((trace->trace_pgfaults & TRACE_PFMIN)) {
		pgfault_min = perf_evsel__new_pgfault(PERF_COUNT_SW_PAGE_FAULTS_MIN);
		if (pgfault_min == NULL)
			goto out_error_mem;
		perf_evlist__add(evlist, pgfault_min);
	}

	if (trace->sched &&
	    perf_evlist__add_newtp(evlist, "sched", "sched_stat_runtime",
				   trace__sched_stat_runtime))
		goto out_error_sched_stat_runtime;

	err = perf_evlist__create_maps(evlist, &trace->opts.target);
	if (err < 0) {
		fprintf(trace->output, "Problems parsing the target to trace, check your options!\n");
		goto out_delete_evlist;
	}

	err = trace__symbols_init(trace, evlist);
	if (err < 0) {
		fprintf(trace->output, "Problems initializing symbol libraries!\n");
		goto out_delete_evlist;
	}

	perf_evlist__config(evlist, &trace->opts, NULL);

	if (callchain_param.enabled) {
		bool use_identifier = false;

		if (trace->syscalls.events.sys_exit) {
			perf_evsel__config_callchain(trace->syscalls.events.sys_exit,
						     &trace->opts, &callchain_param);
			use_identifier = true;
		}

		if (pgfault_maj) {
			perf_evsel__config_callchain(pgfault_maj, &trace->opts, &callchain_param);
			use_identifier = true;
		}

		if (pgfault_min) {
			perf_evsel__config_callchain(pgfault_min, &trace->opts, &callchain_param);
			use_identifier = true;
		}

		if (use_identifier) {
		       /*
			* Now we have evsels with different sample_ids, use
			* PERF_SAMPLE_IDENTIFIER to map from sample to evsel
			* from a fixed position in each ring buffer record.
			*
			* As of this the changeset introducing this comment, this
			* isn't strictly needed, as the fields that can come before
			* PERF_SAMPLE_ID are all used, but we'll probably disable
			* some of those for things like copying the payload of
			* pointer syscall arguments, and for vfs_getname we don't
			* need PERF_SAMPLE_ADDR and PERF_SAMPLE_IP, so do this
			* here as a warning we need to use PERF_SAMPLE_IDENTIFIER.
			*/
			perf_evlist__set_sample_bit(evlist, IDENTIFIER);
			perf_evlist__reset_sample_bit(evlist, ID);
		}
	}

	signal(SIGCHLD, sig_handler);
	signal(SIGINT, sig_handler);

	if (forks) {
		err = perf_evlist__prepare_workload(evlist, &trace->opts.target,
						    argv, false, NULL);
		if (err < 0) {
			fprintf(trace->output, "Couldn't run the workload!\n");
			goto out_delete_evlist;
		}
	}

	err = perf_evlist__open(evlist);
	if (err < 0)
		goto out_error_open;

	err = bpf__apply_obj_config();
	if (err) {
		char errbuf[BUFSIZ];

		bpf__strerror_apply_obj_config(err, errbuf, sizeof(errbuf));
		pr_err("ERROR: Apply config to BPF failed: %s\n",
			 errbuf);
		goto out_error_open;
	}

	/*
	 * Better not use !target__has_task() here because we need to cover the
	 * case where no threads were specified in the command line, but a
	 * workload was, and in that case we will fill in the thread_map when
	 * we fork the workload in perf_evlist__prepare_workload.
	 */
	if (trace->filter_pids.nr > 0)
		err = perf_evlist__set_filter_pids(evlist, trace->filter_pids.nr, trace->filter_pids.entries);
	else if (thread_map__pid(evlist->threads, 0) == -1)
		err = perf_evlist__set_filter_pid(evlist, getpid());

	if (err < 0)
		goto out_error_mem;

	if (trace->ev_qualifier_ids.nr > 0) {
		err = trace__set_ev_qualifier_filter(trace);
		if (err < 0)
			goto out_errno;

		pr_debug("event qualifier tracepoint filter: %s\n",
			 trace->syscalls.events.sys_exit->filter);
	}

	err = perf_evlist__apply_filters(evlist, &evsel);
	if (err < 0)
		goto out_error_apply_filters;

	err = perf_evlist__mmap(evlist, trace->opts.mmap_pages, false);
	if (err < 0)
		goto out_error_mmap;

	if (!target__none(&trace->opts.target))
		perf_evlist__enable(evlist);

	if (forks)
		perf_evlist__start_workload(evlist);

	trace->multiple_threads = thread_map__pid(evlist->threads, 0) == -1 ||
				  evlist->threads->nr > 1 ||
				  perf_evlist__first(evlist)->attr.inherit;
	/* Main event loop: drain all mmaps, then poll and repeat. */
again:
	before = trace->nr_events;

	for (i = 0; i < evlist->nr_mmaps; i++) {
		union perf_event *event;

		while ((event = perf_evlist__mmap_read(evlist, i)) != NULL) {
			struct perf_sample sample;

			++trace->nr_events;

			err = perf_evlist__parse_sample(evlist, event, &sample);
			if (err) {
				fprintf(trace->output, "Can't parse sample, err = %d, skipping...\n", err);
				goto next_event;
			}

			trace__handle_event(trace, event, &sample);
next_event:
			perf_evlist__mmap_consume(evlist, i);

			if (interrupted)
				goto out_disable;

			/*
			 * SIGCHLD/SIGINT: stop producing new events but keep
			 * consuming what is already in the buffers.
			 */
			if (done && !draining) {
				perf_evlist__disable(evlist);
				draining = true;
			}
		}
	}

	if (trace->nr_events == before) {
		/* Nothing new this pass: block in poll (bounded once 'done'). */
		int timeout = done ? 100 : -1;

		if (!draining && perf_evlist__poll(evlist, timeout) > 0) {
			if (perf_evlist__filter_pollfd(evlist, POLLERR | POLLHUP) == 0)
				draining = true;

			goto again;
		}
	} else {
		goto again;
	}

out_disable:
	thread__zput(trace->current);

	perf_evlist__disable(evlist);

	if (!err) {
		if (trace->summary)
			trace__fprintf_thread_summary(trace, trace->output);

		if (trace->show_tool_stats) {
			fprintf(trace->output, "Stats:\n "
					       " vfs_getname : %" PRIu64 "\n"
					       " proc_getname: %" PRIu64 "\n",
				trace->stats.vfs_getname,
				trace->stats.proc_getname);
		}
	}

out_delete_evlist:
	perf_evlist__delete(evlist);
	trace->evlist = NULL;
	trace->live = false;
	return err;
/*
 * Unreachable fall-through; this brace opens a scope so the error labels
 * below can share one errbuf without it living for the whole function.
 */
{
	char errbuf[BUFSIZ];

out_error_sched_stat_runtime:
	tracing_path__strerror_open_tp(errno, errbuf, sizeof(errbuf), "sched", "sched_stat_runtime");
	goto out_error;

out_error_raw_syscalls:
	tracing_path__strerror_open_tp(errno, errbuf, sizeof(errbuf), "raw_syscalls", "sys_(enter|exit)");
	goto out_error;

out_error_mmap:
	perf_evlist__strerror_mmap(evlist, errno, errbuf, sizeof(errbuf));
	goto out_error;

out_error_open:
	perf_evlist__strerror_open(evlist, errno, errbuf, sizeof(errbuf));

out_error:
	fprintf(trace->output, "%s\n", errbuf);
	goto out_delete_evlist;

out_error_apply_filters:
	fprintf(trace->output,
		"Failed to set filter \"%s\" on event %s with %d (%s)\n",
		evsel->filter, perf_evsel__name(evsel), errno,
		str_error_r(errno, errbuf, sizeof(errbuf)));
	goto out_delete_evlist;
}
out_error_mem:
	fprintf(trace->output, "Not enough memory to run!\n");
	goto out_delete_evlist;

out_errno:
	fprintf(trace->output, "errno=%d,%s\n", errno, strerror(errno));
	goto out_delete_evlist;
}
2426 
/*
 * Replay mode ('perf trace -i file'): open a recorded perf.data session,
 * wire the trace handlers onto the recorded tracepoints (accepting the
 * older syscalls:* names alongside raw_syscalls:*), process all events
 * through trace__process_sample and optionally print the summary.
 */
static int trace__replay(struct trace *trace)
{
	const struct perf_evsel_str_handler handlers[] = {
		{ "probe:vfs_getname",	     trace__vfs_getname, },
	};
	struct perf_data_file file = {
		.path  = input_name,
		.mode  = PERF_DATA_MODE_READ,
		.force = trace->force,
	};
	struct perf_session *session;
	struct perf_evsel *evsel;
	int err = -1;

	trace->tool.sample	  = trace__process_sample;
	trace->tool.mmap	  = perf_event__process_mmap;
	trace->tool.mmap2	  = perf_event__process_mmap2;
	trace->tool.comm	  = perf_event__process_comm;
	trace->tool.exit	  = perf_event__process_exit;
	trace->tool.fork	  = perf_event__process_fork;
	trace->tool.attr	  = perf_event__process_attr;
	trace->tool.tracing_data = perf_event__process_tracing_data;
	trace->tool.build_id	  = perf_event__process_build_id;

	trace->tool.ordered_events = true;
	trace->tool.ordering_requires_timestamps = true;

	/* add tid to output */
	trace->multiple_threads = true;

	session = perf_session__new(&file, false, &trace->tool);
	if (session == NULL)
		return -1;

	if (symbol__init(&session->header.env) < 0)
		goto out;

	trace->host = &session->machines.host;

	err = perf_session__set_tracepoints_handlers(session, handlers);
	if (err)
		goto out;

	evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
						     "raw_syscalls:sys_enter");
	/* older kernels have syscalls tp versus raw_syscalls */
	if (evsel == NULL)
		evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
							     "syscalls:sys_enter");

	/* Missing syscall events is not an error: maybe only faults were recorded. */
	if (evsel &&
	    (perf_evsel__init_syscall_tp(evsel, trace__sys_enter) < 0 ||
	    perf_evsel__init_sc_tp_ptr_field(evsel, args))) {
		pr_err("Error during initialize raw_syscalls:sys_enter event\n");
		goto out;
	}

	evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
						     "raw_syscalls:sys_exit");
	if (evsel == NULL)
		evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
							     "syscalls:sys_exit");
	if (evsel &&
	    (perf_evsel__init_syscall_tp(evsel, trace__sys_exit) < 0 ||
	    perf_evsel__init_sc_tp_uint_field(evsel, ret))) {
		pr_err("Error during initialize raw_syscalls:sys_exit event\n");
		goto out;
	}

	/* Route any recorded software page-fault events to trace__pgfault. */
	evlist__for_each_entry(session->evlist, evsel) {
		if (evsel->attr.type == PERF_TYPE_SOFTWARE &&
		    (evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ ||
		     evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MIN ||
		     evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS))
			evsel->handler = trace__pgfault;
	}

	err = parse_target_str(trace);
	if (err != 0)
		goto out;

	setup_pager();

	err = perf_session__process_events(session);
	if (err)
		pr_err("Failed to process events, error %d", err);

	else if (trace->summary)
		trace__fprintf_thread_summary(trace, trace->output);

out:
	perf_session__delete(session);

	return err;
}
2522 
/* Print the heading that precedes the per-thread summary; returns chars written. */
static size_t trace__fprintf_threads_header(FILE *fp)
{
	return fprintf(fp, "\n Summary of events:\n\n");
}
2531 
/*
 * Re-sort the per-thread syscall_stats intlist by total time spent
 * (msecs, computed as call count * average duration). The body below is
 * the per-node copy routine run by the resort_rb machinery: it pulls the
 * syscall id and its stats out of each int_node.
 */
DEFINE_RESORT_RB(syscall_stats, a->msecs > b->msecs,
	struct stats 	*stats;
	double		msecs;
	int		syscall;
)
{
	struct int_node *source = rb_entry(nd, struct int_node, rb_node);
	struct stats *stats = source->priv;

	entry->syscall = source->i;
	entry->stats   = stats;
	/* stats may be NULL for ids seen but never timed; treat as 0 msecs. */
	entry->msecs   = stats ? (u64)stats->n * (avg_stats(stats) / NSEC_PER_MSEC) : 0;
}
2545 
/*
 * Print one thread's per-syscall statistics table (calls, total, min,
 * avg, max, stddev), ordered by the syscall_stats resort comparator
 * above. Returns the number of characters printed.
 */
static size_t thread__dump_stats(struct thread_trace *ttrace,
				 struct trace *trace, FILE *fp)
{
	size_t printed = 0;
	struct syscall *sc;
	struct rb_node *nd;
	DECLARE_RESORT_RB_INTLIST(syscall_stats, ttrace->syscall_stats);

	if (syscall_stats == NULL)
		return 0;

	printed += fprintf(fp, "\n");

	printed += fprintf(fp, "   syscall            calls    total       min       avg       max      stddev\n");
	printed += fprintf(fp, "                               (msec)    (msec)    (msec)    (msec)        (%%)\n");
	printed += fprintf(fp, "   --------------- -------- --------- --------- --------- ---------     ------\n");

	resort_rb__for_each_entry(nd, syscall_stats) {
		struct stats *stats = syscall_stats_entry->stats;
		if (stats) {
			/* Raw stats are in nanoseconds; columns are in msec. */
			double min = (double)(stats->min) / NSEC_PER_MSEC;
			double max = (double)(stats->max) / NSEC_PER_MSEC;
			double avg = avg_stats(stats);
			double pct;
			u64 n = (u64) stats->n;

			pct = avg ? 100.0 * stddev_stats(stats)/avg : 0.0;
			avg /= NSEC_PER_MSEC;

			sc = &trace->syscalls.table[syscall_stats_entry->syscall];
			printed += fprintf(fp, "   %-15s", sc->name);
			printed += fprintf(fp, " %8" PRIu64 " %9.3f %9.3f %9.3f",
					   n, syscall_stats_entry->msecs, min, avg);
			printed += fprintf(fp, " %9.3f %9.2f%%\n", max, pct);
		}
	}

	resort_rb__delete(syscall_stats);
	printed += fprintf(fp, "\n\n");

	return printed;
}
2588 
/*
 * Print the summary line for one thread: comm, tid, event count, share
 * of all events, fault counters, optional runtime (with --sched), then
 * its per-syscall stats table. Returns characters printed; 0 if the
 * thread never accumulated trace state.
 */
static size_t trace__fprintf_thread(FILE *fp, struct thread *thread, struct trace *trace)
{
	size_t printed = 0;
	struct thread_trace *ttrace = thread__priv(thread);
	double ratio;

	if (ttrace == NULL)
		return 0;

	/* This thread's share of all events seen in the session. */
	ratio = (double)ttrace->nr_events / trace->nr_events * 100.0;

	printed += fprintf(fp, " %s (%d), ", thread__comm_str(thread), thread->tid);
	printed += fprintf(fp, "%lu events, ", ttrace->nr_events);
	printed += fprintf(fp, "%.1f%%", ratio);
	if (ttrace->pfmaj)
		printed += fprintf(fp, ", %lu majfaults", ttrace->pfmaj);
	if (ttrace->pfmin)
		printed += fprintf(fp, ", %lu minfaults", ttrace->pfmin);
	if (trace->sched)
		printed += fprintf(fp, ", %.3f msec\n", ttrace->runtime_ms);
	else if (fputc('\n', fp) != EOF)
		++printed;

	printed += thread__dump_stats(ttrace, trace, fp);

	return printed;
}
2616 
2617 static unsigned long thread__nr_events(struct thread_trace *ttrace)
2618 {
2619 	return ttrace ? ttrace->nr_events : 0;
2620 }
2621 
/*
 * Re-sort the machine's thread rbtree by per-thread event count for the
 * summary; the body is the resort_rb copy routine extracting the thread
 * from each rb_node.
 */
DEFINE_RESORT_RB(threads, (thread__nr_events(a->thread->priv) < thread__nr_events(b->thread->priv)),
	struct thread *thread;
)
{
	entry->thread = rb_entry(nd, struct thread, rb_node);
}
2628 
/*
 * Print the end-of-session summary: a header followed by one entry per
 * thread, ordered via the 'threads' resort comparator above. Returns
 * characters printed (0 if the resort allocation failed).
 */
static size_t trace__fprintf_thread_summary(struct trace *trace, FILE *fp)
{
	DECLARE_RESORT_RB_MACHINE_THREADS(threads, trace->host);
	size_t printed = trace__fprintf_threads_header(fp);
	struct rb_node *nd;

	if (threads == NULL) {
		fprintf(fp, "%s", "Error sorting output by nr_events!\n");
		return 0;
	}

	resort_rb__for_each_entry(nd, threads)
		printed += trace__fprintf_thread(fp, threads_entry->thread, trace);

	resort_rb__delete(threads);

	return printed;
}
2647 
2648 static int trace__set_duration(const struct option *opt, const char *str,
2649 			       int unset __maybe_unused)
2650 {
2651 	struct trace *trace = opt->value;
2652 
2653 	trace->duration_filter = atof(str);
2654 	return 0;
2655 }
2656 
2657 static int trace__set_filter_pids(const struct option *opt, const char *str,
2658 				  int unset __maybe_unused)
2659 {
2660 	int ret = -1;
2661 	size_t i;
2662 	struct trace *trace = opt->value;
2663 	/*
2664 	 * FIXME: introduce a intarray class, plain parse csv and create a
2665 	 * { int nr, int entries[] } struct...
2666 	 */
2667 	struct intlist *list = intlist__new(str);
2668 
2669 	if (list == NULL)
2670 		return -1;
2671 
2672 	i = trace->filter_pids.nr = intlist__nr_entries(list) + 1;
2673 	trace->filter_pids.entries = calloc(i, sizeof(pid_t));
2674 
2675 	if (trace->filter_pids.entries == NULL)
2676 		goto out;
2677 
2678 	trace->filter_pids.entries[0] = getpid();
2679 
2680 	for (i = 1; i < trace->filter_pids.nr; ++i)
2681 		trace->filter_pids.entries[i] = intlist__entry(list, i - 1)->i;
2682 
2683 	intlist__delete(list);
2684 	ret = 0;
2685 out:
2686 	return ret;
2687 }
2688 
2689 static int trace__open_output(struct trace *trace, const char *filename)
2690 {
2691 	struct stat st;
2692 
2693 	if (!stat(filename, &st) && st.st_size) {
2694 		char oldname[PATH_MAX];
2695 
2696 		scnprintf(oldname, sizeof(oldname), "%s.old", filename);
2697 		unlink(oldname);
2698 		rename(filename, oldname);
2699 	}
2700 
2701 	trace->output = fopen(filename, "w");
2702 
2703 	return trace->output == NULL ? -errno : 0;
2704 }
2705 
2706 static int parse_pagefaults(const struct option *opt, const char *str,
2707 			    int unset __maybe_unused)
2708 {
2709 	int *trace_pgfaults = opt->value;
2710 
2711 	if (strcmp(str, "all") == 0)
2712 		*trace_pgfaults |= TRACE_PFMAJ | TRACE_PFMIN;
2713 	else if (strcmp(str, "maj") == 0)
2714 		*trace_pgfaults |= TRACE_PFMAJ;
2715 	else if (strcmp(str, "min") == 0)
2716 		*trace_pgfaults |= TRACE_PFMIN;
2717 	else
2718 		return -1;
2719 
2720 	return 0;
2721 }
2722 
/*
 * Install the same sample handler on every evsel in the list; used for
 * the events added via --event so they all share one dispatch routine.
 */
static void evlist__set_evsel_handler(struct perf_evlist *evlist, void *handler)
{
	struct perf_evsel *evsel;

	evlist__for_each_entry(evlist, evsel)
		evsel->handler = handler;
}
2730 
2731 int cmd_trace(int argc, const char **argv, const char *prefix __maybe_unused)
2732 {
2733 	const char *trace_usage[] = {
2734 		"perf trace [<options>] [<command>]",
2735 		"perf trace [<options>] -- <command> [<options>]",
2736 		"perf trace record [<options>] [<command>]",
2737 		"perf trace record [<options>] -- <command> [<options>]",
2738 		NULL
2739 	};
2740 	struct trace trace = {
2741 		.syscalls = {
2742 			. max = -1,
2743 		},
2744 		.opts = {
2745 			.target = {
2746 				.uid	   = UINT_MAX,
2747 				.uses_mmap = true,
2748 			},
2749 			.user_freq     = UINT_MAX,
2750 			.user_interval = ULLONG_MAX,
2751 			.no_buffering  = true,
2752 			.mmap_pages    = UINT_MAX,
2753 			.proc_map_timeout  = 500,
2754 		},
2755 		.output = stderr,
2756 		.show_comm = true,
2757 		.trace_syscalls = true,
2758 		.kernel_syscallchains = false,
2759 		.max_stack = UINT_MAX,
2760 	};
2761 	const char *output_name = NULL;
2762 	const char *ev_qualifier_str = NULL;
2763 	const struct option trace_options[] = {
2764 	OPT_CALLBACK(0, "event", &trace.evlist, "event",
2765 		     "event selector. use 'perf list' to list available events",
2766 		     parse_events_option),
2767 	OPT_BOOLEAN(0, "comm", &trace.show_comm,
2768 		    "show the thread COMM next to its id"),
2769 	OPT_BOOLEAN(0, "tool_stats", &trace.show_tool_stats, "show tool stats"),
2770 	OPT_STRING('e', "expr", &ev_qualifier_str, "expr", "list of syscalls to trace"),
2771 	OPT_STRING('o', "output", &output_name, "file", "output file name"),
2772 	OPT_STRING('i', "input", &input_name, "file", "Analyze events in file"),
2773 	OPT_STRING('p', "pid", &trace.opts.target.pid, "pid",
2774 		    "trace events on existing process id"),
2775 	OPT_STRING('t', "tid", &trace.opts.target.tid, "tid",
2776 		    "trace events on existing thread id"),
2777 	OPT_CALLBACK(0, "filter-pids", &trace, "CSV list of pids",
2778 		     "pids to filter (by the kernel)", trace__set_filter_pids),
2779 	OPT_BOOLEAN('a', "all-cpus", &trace.opts.target.system_wide,
2780 		    "system-wide collection from all CPUs"),
2781 	OPT_STRING('C', "cpu", &trace.opts.target.cpu_list, "cpu",
2782 		    "list of cpus to monitor"),
2783 	OPT_BOOLEAN(0, "no-inherit", &trace.opts.no_inherit,
2784 		    "child tasks do not inherit counters"),
2785 	OPT_CALLBACK('m', "mmap-pages", &trace.opts.mmap_pages, "pages",
2786 		     "number of mmap data pages",
2787 		     perf_evlist__parse_mmap_pages),
2788 	OPT_STRING('u', "uid", &trace.opts.target.uid_str, "user",
2789 		   "user to profile"),
2790 	OPT_CALLBACK(0, "duration", &trace, "float",
2791 		     "show only events with duration > N.M ms",
2792 		     trace__set_duration),
2793 	OPT_BOOLEAN(0, "sched", &trace.sched, "show blocking scheduler events"),
2794 	OPT_INCR('v', "verbose", &verbose, "be more verbose"),
2795 	OPT_BOOLEAN('T', "time", &trace.full_time,
2796 		    "Show full timestamp, not time relative to first start"),
2797 	OPT_BOOLEAN('s', "summary", &trace.summary_only,
2798 		    "Show only syscall summary with statistics"),
2799 	OPT_BOOLEAN('S', "with-summary", &trace.summary,
2800 		    "Show all syscalls and summary with statistics"),
2801 	OPT_CALLBACK_DEFAULT('F', "pf", &trace.trace_pgfaults, "all|maj|min",
2802 		     "Trace pagefaults", parse_pagefaults, "maj"),
2803 	OPT_BOOLEAN(0, "syscalls", &trace.trace_syscalls, "Trace syscalls"),
2804 	OPT_BOOLEAN('f', "force", &trace.force, "don't complain, do it"),
2805 	OPT_CALLBACK(0, "call-graph", &trace.opts,
2806 		     "record_mode[,record_size]", record_callchain_help,
2807 		     &record_parse_callchain_opt),
2808 	OPT_BOOLEAN(0, "kernel-syscall-graph", &trace.kernel_syscallchains,
2809 		    "Show the kernel callchains on the syscall exit path"),
2810 	OPT_UINTEGER(0, "min-stack", &trace.min_stack,
2811 		     "Set the minimum stack depth when parsing the callchain, "
2812 		     "anything below the specified depth will be ignored."),
2813 	OPT_UINTEGER(0, "max-stack", &trace.max_stack,
2814 		     "Set the maximum stack depth when parsing the callchain, "
2815 		     "anything beyond the specified depth will be ignored. "
2816 		     "Default: kernel.perf_event_max_stack or " __stringify(PERF_MAX_STACK_DEPTH)),
2817 	OPT_UINTEGER(0, "proc-map-timeout", &trace.opts.proc_map_timeout,
2818 			"per thread proc mmap processing timeout in ms"),
2819 	OPT_END()
2820 	};
	/*
	 * "user set" flags start true and are cleared below when the
	 * corresponding option still holds its UINT_MAX sentinel, i.e. the
	 * user never touched it on the command line.
	 */
2821 	bool __maybe_unused max_stack_user_set = true;
2822 	bool mmap_pages_user_set = true;
2823 	const char * const trace_subcommands[] = { "record", NULL };
2824 	int err;
2825 	char bf[BUFSIZ];
2826 
	/* Dump a stack trace instead of dying silently if we crash while tracing. */
2827 	signal(SIGSEGV, sighandler_dump_stack);
2828 	signal(SIGFPE, sighandler_dump_stack);
2829 
2830 	trace.evlist = perf_evlist__new();
2831 	trace.sctbl = syscalltbl__new();
2832 
2833 	if (trace.evlist == NULL || trace.sctbl == NULL) {
2834 		pr_err("Not enough memory to run!\n");
2835 		err = -ENOMEM;
2836 		goto out;
2837 	}
2838 
	/*
	 * Stop at the first non-option so that a traced workload's own
	 * arguments ("perf trace <cmd> <args>") are left alone.
	 */
2839 	argc = parse_options_subcommand(argc, argv, trace_options, trace_subcommands,
2840 				 trace_usage, PARSE_OPT_STOP_AT_NON_OPTION);
2841 
2842 	err = bpf__setup_stdout(trace.evlist);
2843 	if (err) {
2844 		bpf__strerror_setup_stdout(trace.evlist, err, bf, sizeof(bf));
2845 		pr_err("ERROR: Setup BPF stdout failed: %s\n", bf);
2846 		goto out;
2847 	}
2848 
	/* Default error code for the failure paths below. */
2849 	err = -1;
2850 
	/* Page fault tracing needs the faulting address and a timestamp. */
2851 	if (trace.trace_pgfaults) {
2852 		trace.opts.sample_address = true;
2853 		trace.opts.sample_time = true;
2854 	}
2855 
2856 	if (trace.opts.mmap_pages == UINT_MAX)
2857 		mmap_pages_user_set = false;
2858 
2859 	if (trace.max_stack == UINT_MAX) {
		/*
		 * Live sessions are bounded by the current
		 * kernel.perf_event_max_stack sysctl; when replaying a file
		 * (input_name set) fall back to the compile-time maximum,
		 * since the recording system's sysctl is unknown here.
		 */
2860 		trace.max_stack = input_name ? PERF_MAX_STACK_DEPTH : sysctl_perf_event_max_stack;
2861 		max_stack_user_set = false;
2862 	}
2863 
	/*
	 * Asking for stack depths implies callchains: default to DWARF
	 * unwinding when built with support and syscalls are being traced.
	 */
2864 #ifdef HAVE_DWARF_UNWIND_SUPPORT
2865 	if ((trace.min_stack || max_stack_user_set) && !callchain_param.enabled && trace.trace_syscalls)
2866 		record_opts__parse_callchain(&trace.opts, &callchain_param, "dwarf", false);
2867 #endif
2868 
2869 	if (callchain_param.enabled) {
		/*
		 * Callchains need bigger mmap buffers; only bump the default
		 * for root, who is not constrained by the mlock limit.
		 */
2870 		if (!mmap_pages_user_set && geteuid() == 0)
2871 			trace.opts.mmap_pages = perf_event_mlock_kb_in_pages() * 4;
2872 
2873 		symbol_conf.use_callchain = true;
2874 	}
2875 
	/* Route any extra --event evsels through the generic trace handler. */
2876 	if (trace.evlist->nr_entries > 0)
2877 		evlist__set_evsel_handler(trace.evlist, trace__event_handler);
2878 
	/*
	 * "perf trace record ..." is delegated entirely to trace__record(),
	 * hence the direct return with no cleanup of evlist/sctbl here —
	 * NOTE(review): presumably process lifetime ends there; confirm.
	 */
2879 	if ((argc >= 1) && (strcmp(argv[0], "record") == 0))
2880 		return trace__record(&trace, argc-1, &argv[1]);
2881 
2882 	/* summary_only implies summary option, but don't overwrite summary if set */
2883 	if (trace.summary_only)
2884 		trace.summary = trace.summary_only;
2885 
2886 	if (!trace.trace_syscalls && !trace.trace_pgfaults &&
2887 	    trace.evlist->nr_entries == 0 /* Was --events used? */) {
2888 		pr_err("Please specify something to trace.\n");
2889 		return -1;
2890 	}
2891 
	/* -e names syscalls, so it is meaningless together with --no-syscalls. */
2892 	if (!trace.trace_syscalls && ev_qualifier_str) {
2893 		pr_err("The -e option can't be used with --no-syscalls.\n");
2894 		goto out;
2895 	}
2896 
	/* -o: everything below writes to trace.output instead of stdout. */
2897 	if (output_name != NULL) {
2898 		err = trace__open_output(&trace, output_name);
2899 		if (err < 0) {
2900 			perror("failed to create output file");
2901 			goto out;
2902 		}
2903 	}
2904 
	/* Look up and cache the syscall table id of "open" for later use. */
2905 	trace.open_id = syscalltbl__id(trace.sctbl, "open");
2906 
	/*
	 * Parse the -e qualifier list: a leading '!' inverts the filter,
	 * the remainder becomes a strlist resolved against the strace
	 * groups directory (so group names expand from files there).
	 */
2907 	if (ev_qualifier_str != NULL) {
2908 		const char *s = ev_qualifier_str;
2909 		struct strlist_config slist_config = {
2910 			.dirname = system_path(STRACE_GROUPS_DIR),
2911 		};
2912 
2913 		trace.not_ev_qualifier = *s == '!';
2914 		if (trace.not_ev_qualifier)
2915 			++s;
2916 		trace.ev_qualifier = strlist__new(s, &slist_config);
2917 		if (trace.ev_qualifier == NULL) {
2918 			fputs("Not enough memory to parse event qualifier",
2919 			      trace.output);
2920 			err = -ENOMEM;
2921 			goto out_close;
2922 		}
2923 
2924 		err = trace__validate_ev_qualifier(&trace);
2925 		if (err)
2926 			goto out_close;
2927 	}
2928 
	/* Sanity-check and resolve the --pid/--tid/--uid/--cpu target spec. */
2929 	err = target__validate(&trace.opts.target);
2930 	if (err) {
2931 		target__strerror(&trace.opts.target, err, bf, sizeof(bf));
2932 		fprintf(trace.output, "%s", bf);
2933 		goto out_close;
2934 	}
2935 
2936 	err = target__parse_uid(&trace.opts.target);
2937 	if (err) {
2938 		target__strerror(&trace.opts.target, err, bf, sizeof(bf));
2939 		fprintf(trace.output, "%s", bf);
2940 		goto out_close;
2941 	}
2942 
	/* No workload and no explicit target: trace the whole system. */
2943 	if (!argc && target__none(&trace.opts.target))
2944 		trace.opts.target.system_wide = true;
2945 
	/* -i perf.data replays a recorded session, otherwise trace live. */
2946 	if (input_name)
2947 		err = trace__replay(&trace);
2948 	else
2949 		err = trace__run(&trace, argc, argv);
2950 
2951 out_close:
	/* trace.output was only fdopen'ed when -o was given. */
2952 	if (output_name != NULL)
2953 		fclose(trace.output);
2954 out:
2955 	return err;
2956 }
2957