xref: /openbmc/linux/tools/perf/builtin-trace.c (revision 5d0e4d78)
1 /*
2  * builtin-trace.c
3  *
4  * Builtin 'trace' command:
5  *
6  * Display a continuously updated trace of any workload, CPU, specific PID,
7  * system wide, etc.  Default format is loosely strace like, but any other
8  * event may be specified using --event.
9  *
10  * Copyright (C) 2012, 2013, 2014, 2015 Red Hat Inc, Arnaldo Carvalho de Melo <acme@redhat.com>
11  *
12  * Initially based on the 'trace' prototype by Thomas Gleixner:
13  *
14  * http://lwn.net/Articles/415728/ ("Announcing a new utility: 'trace'")
15  *
16  * Released under the GPL v2. (and only v2, not any later version)
17  */
18 
19 #include <traceevent/event-parse.h>
20 #include <api/fs/tracing_path.h>
21 #include "builtin.h"
22 #include "util/color.h"
23 #include "util/debug.h"
24 #include "util/event.h"
25 #include "util/evlist.h"
26 #include <subcmd/exec-cmd.h>
27 #include "util/machine.h"
28 #include "util/path.h"
29 #include "util/session.h"
30 #include "util/thread.h"
31 #include <subcmd/parse-options.h>
32 #include "util/strlist.h"
33 #include "util/intlist.h"
34 #include "util/thread_map.h"
35 #include "util/stat.h"
36 #include "trace/beauty/beauty.h"
37 #include "trace-event.h"
38 #include "util/parse-events.h"
39 #include "util/bpf-loader.h"
40 #include "callchain.h"
41 #include "print_binary.h"
42 #include "string2.h"
43 #include "syscalltbl.h"
44 #include "rb_resort.h"
45 
46 #include <errno.h>
47 #include <inttypes.h>
48 #include <libaudit.h> /* FIXME: Still needed for audit_errno_to_name */
49 #include <poll.h>
50 #include <signal.h>
51 #include <stdlib.h>
52 #include <string.h>
53 #include <linux/err.h>
54 #include <linux/filter.h>
55 #include <linux/audit.h>
56 #include <linux/kernel.h>
57 #include <linux/random.h>
58 #include <linux/stringify.h>
59 #include <linux/time64.h>
60 
61 #include "sane_ctype.h"
62 
63 #ifndef O_CLOEXEC
64 # define O_CLOEXEC		02000000
65 #endif
66 
/*
 * Global state for a 'perf trace' session: tool callbacks, the syscall
 * table and the raw_syscalls evsels, the monitored machine, plus all the
 * command line knobs.
 */
struct trace {
	struct perf_tool	tool;
	struct syscalltbl	*sctbl;
	struct {
		int		max;	/* highest syscall id in 'table' */
		struct syscall  *table;
		struct {
			struct perf_evsel *sys_enter,
					  *sys_exit;
		}		events;
	} syscalls;
	struct record_opts	opts;
	struct perf_evlist	*evlist;
	struct machine		*host;
	struct thread		*current;
	u64			base_time;
	FILE			*output;
	unsigned long		nr_events;
	struct strlist		*ev_qualifier;	/* user supplied syscall name list */
	struct {
		size_t		nr;
		int		*entries;	/* ev_qualifier resolved to syscall ids */
	}			ev_qualifier_ids;
	struct {
		size_t		nr;
		pid_t		*entries;	/* pids whose events are filtered out */
	}			filter_pids;
	double			duration_filter;
	double			runtime_ms;
	struct {
		u64		vfs_getname,
				proc_getname;
	} stats;
	unsigned int		max_stack;
	unsigned int		min_stack;
	bool			not_ev_qualifier;	/* invert ev_qualifier selection */
	bool			live;
	bool			full_time;
	bool			sched;
	bool			multiple_threads;
	bool			summary;
	bool			summary_only;
	bool			show_comm;
	bool			show_tool_stats;
	bool			trace_syscalls;
	bool			kernel_syscallchains;
	bool			force;
	bool			vfs_getname;
	int			trace_pgfaults;
	int			open_id;	/* syscall id of "open", see uses below -- TODO confirm */
};
118 
/*
 * Accessor for one tracepoint field: byte 'offset' into the raw sample
 * payload plus a reader returning the field either as an integer or as a
 * pointer into the payload — only one applies, hence the union.
 */
struct tp_field {
	int offset;
	union {
		u64 (*integer)(struct tp_field *field, struct perf_sample *sample);
		void *(*pointer)(struct tp_field *field, struct perf_sample *sample);
	};
};
126 
/*
 * TP_UINT_FIELD(bits) generates tp_field__u<bits>(): read an unsigned
 * <bits>-bit integer from the sample at the field's offset.  memcpy() is
 * used because raw_data + offset may not be suitably aligned.
 */
#define TP_UINT_FIELD(bits) \
static u64 tp_field__u##bits(struct tp_field *field, struct perf_sample *sample) \
{ \
	u##bits value; \
	memcpy(&value, sample->raw_data + field->offset, sizeof(value)); \
	return value;  \
}

TP_UINT_FIELD(8);
TP_UINT_FIELD(16);
TP_UINT_FIELD(32);
TP_UINT_FIELD(64);
139 
/*
 * Same as TP_UINT_FIELD(), but byte-swaps the value — used when the
 * perf.data file was recorded on a host of the opposite endianness.
 * No 8-bit variant: a single byte needs no swapping.
 */
#define TP_UINT_FIELD__SWAPPED(bits) \
static u64 tp_field__swapped_u##bits(struct tp_field *field, struct perf_sample *sample) \
{ \
	u##bits value; \
	memcpy(&value, sample->raw_data + field->offset, sizeof(value)); \
	return bswap_##bits(value);\
}

TP_UINT_FIELD__SWAPPED(16);
TP_UINT_FIELD__SWAPPED(32);
TP_UINT_FIELD__SWAPPED(64);
151 
152 static int tp_field__init_uint(struct tp_field *field,
153 			       struct format_field *format_field,
154 			       bool needs_swap)
155 {
156 	field->offset = format_field->offset;
157 
158 	switch (format_field->size) {
159 	case 1:
160 		field->integer = tp_field__u8;
161 		break;
162 	case 2:
163 		field->integer = needs_swap ? tp_field__swapped_u16 : tp_field__u16;
164 		break;
165 	case 4:
166 		field->integer = needs_swap ? tp_field__swapped_u32 : tp_field__u32;
167 		break;
168 	case 8:
169 		field->integer = needs_swap ? tp_field__swapped_u64 : tp_field__u64;
170 		break;
171 	default:
172 		return -1;
173 	}
174 
175 	return 0;
176 }
177 
/* Return a pointer into the sample's raw payload at the field's offset. */
static void *tp_field__ptr(struct tp_field *field, struct perf_sample *sample)
{
	return sample->raw_data + field->offset;
}

/* Bind 'field' as a pointer-style accessor; always succeeds (returns 0). */
static int tp_field__init_ptr(struct tp_field *field, struct format_field *format_field)
{
	field->offset = format_field->offset;
	field->pointer = tp_field__ptr;
	return 0;
}
189 
/*
 * Per-evsel parsing state for the syscall tracepoints: the syscall 'id'
 * field plus, depending on direction, the entry 'args' or the exit
 * 'ret' value — never both, hence the union.
 */
struct syscall_tp {
	struct tp_field id;
	union {
		struct tp_field args, ret;
	};
};
196 
197 static int perf_evsel__init_tp_uint_field(struct perf_evsel *evsel,
198 					  struct tp_field *field,
199 					  const char *name)
200 {
201 	struct format_field *format_field = perf_evsel__field(evsel, name);
202 
203 	if (format_field == NULL)
204 		return -1;
205 
206 	return tp_field__init_uint(field, format_field, evsel->needs_swap);
207 }
208 
/*
 * Bind tracepoint field 'name' to the like-named integer accessor in the
 * evsel's struct syscall_tp (stashed in evsel->priv).
 */
#define perf_evsel__init_sc_tp_uint_field(evsel, name) \
	({ struct syscall_tp *sc = evsel->priv;\
	   perf_evsel__init_tp_uint_field(evsel, &sc->name, #name); })
212 
/*
 * Look up tracepoint field 'name' on this evsel and bind it to 'field'
 * as a pointer-into-payload accessor.
 *
 * Return: 0 on success, -1 if the field doesn't exist.
 */
static int perf_evsel__init_tp_ptr_field(struct perf_evsel *evsel,
					 struct tp_field *field,
					 const char *name)
{
	struct format_field *fmt = perf_evsel__field(evsel, name);

	return fmt ? tp_field__init_ptr(field, fmt) : -1;
}
224 
/*
 * Bind tracepoint field 'name' to the like-named pointer accessor in the
 * evsel's struct syscall_tp (stashed in evsel->priv).
 */
#define perf_evsel__init_sc_tp_ptr_field(evsel, name) \
	({ struct syscall_tp *sc = evsel->priv;\
	   perf_evsel__init_tp_ptr_field(evsel, &sc->name, #name); })
228 
/* Free the evsel's priv area (its struct syscall_tp) before deleting it. */
static void perf_evsel__delete_priv(struct perf_evsel *evsel)
{
	zfree(&evsel->priv);
	perf_evsel__delete(evsel);
}
234 
235 static int perf_evsel__init_syscall_tp(struct perf_evsel *evsel, void *handler)
236 {
237 	evsel->priv = malloc(sizeof(struct syscall_tp));
238 	if (evsel->priv != NULL) {
239 		if (perf_evsel__init_sc_tp_uint_field(evsel, id))
240 			goto out_delete;
241 
242 		evsel->handler = handler;
243 		return 0;
244 	}
245 
246 	return -ENOMEM;
247 
248 out_delete:
249 	zfree(&evsel->priv);
250 	return -ENOENT;
251 }
252 
253 static struct perf_evsel *perf_evsel__syscall_newtp(const char *direction, void *handler)
254 {
255 	struct perf_evsel *evsel = perf_evsel__newtp("raw_syscalls", direction);
256 
257 	/* older kernel (e.g., RHEL6) use syscalls:{enter,exit} */
258 	if (IS_ERR(evsel))
259 		evsel = perf_evsel__newtp("syscalls", direction);
260 
261 	if (IS_ERR(evsel))
262 		return NULL;
263 
264 	if (perf_evsel__init_syscall_tp(evsel, handler))
265 		goto out_delete;
266 
267 	return evsel;
268 
269 out_delete:
270 	perf_evsel__delete_priv(evsel);
271 	return NULL;
272 }
273 
/* Read field 'name' of a syscall tracepoint sample as an integer. */
#define perf_evsel__sc_tp_uint(evsel, name, sample) \
	({ struct syscall_tp *fields = evsel->priv; \
	   fields->name.integer(&fields->name, sample); })

/* Get a pointer to field 'name' inside a syscall tracepoint sample. */
#define perf_evsel__sc_tp_ptr(evsel, name, sample) \
	({ struct syscall_tp *fields = evsel->priv; \
	   fields->name.pointer(&fields->name, sample); })
281 
/*
 * Maps integer syscall arg values to human readable names.  'offset'
 * supports tables whose first entry does not correspond to value 0
 * (e.g. epoll_ctl ops start at 1, tioctls at 0x5401).
 */
struct strarray {
	int	    offset;
	int	    nr_entries;
	const char **entries;
};

/* Define a strarray whose entries start at value 0. */
#define DEFINE_STRARRAY(array) struct strarray strarray__##array = { \
	.nr_entries = ARRAY_SIZE(array), \
	.entries = array, \
}

/* Define a strarray whose first entry corresponds to value 'off'. */
#define DEFINE_STRARRAY_OFFSET(array, off) struct strarray strarray__##array = { \
	.offset	    = off, \
	.nr_entries = ARRAY_SIZE(array), \
	.entries = array, \
}
298 
299 static size_t __syscall_arg__scnprintf_strarray(char *bf, size_t size,
300 						const char *intfmt,
301 					        struct syscall_arg *arg)
302 {
303 	struct strarray *sa = arg->parm;
304 	int idx = arg->val - sa->offset;
305 
306 	if (idx < 0 || idx >= sa->nr_entries)
307 		return scnprintf(bf, size, intfmt, arg->val);
308 
309 	return scnprintf(bf, size, "%s", sa->entries[idx]);
310 }
311 
312 static size_t syscall_arg__scnprintf_strarray(char *bf, size_t size,
313 					      struct syscall_arg *arg)
314 {
315 	return __syscall_arg__scnprintf_strarray(bf, size, "%d", arg);
316 }
317 
#define SCA_STRARRAY syscall_arg__scnprintf_strarray

#if defined(__i386__) || defined(__x86_64__)
/*
 * FIXME: Make this available to all arches as soon as the ioctl beautifier
 * 	  gets rewritten to support all arches.
 */
/* Like SCA_STRARRAY, but unknown values fall back to hex, not decimal. */
static size_t syscall_arg__scnprintf_strhexarray(char *bf, size_t size,
						 struct syscall_arg *arg)
{
	return __syscall_arg__scnprintf_strarray(bf, size, "%#x", arg);
}

#define SCA_STRHEXARRAY syscall_arg__scnprintf_strhexarray
#endif /* defined(__i386__) || defined(__x86_64__) */
333 
/* Forward declaration: pretty-print a file descriptor argument. */
static size_t syscall_arg__scnprintf_fd(char *bf, size_t size,
					struct syscall_arg *arg);

#define SCA_FD syscall_arg__scnprintf_fd

#ifndef AT_FDCWD
#define AT_FDCWD	-100
#endif
342 
343 static size_t syscall_arg__scnprintf_fd_at(char *bf, size_t size,
344 					   struct syscall_arg *arg)
345 {
346 	int fd = arg->val;
347 
348 	if (fd == AT_FDCWD)
349 		return scnprintf(bf, size, "CWD");
350 
351 	return syscall_arg__scnprintf_fd(bf, size, arg);
352 }
353 
354 #define SCA_FDAT syscall_arg__scnprintf_fd_at
355 
356 static size_t syscall_arg__scnprintf_close_fd(char *bf, size_t size,
357 					      struct syscall_arg *arg);
358 
359 #define SCA_CLOSE_FD syscall_arg__scnprintf_close_fd
360 
/* Print an argument in hex — used for addresses and opaque values. */
static size_t syscall_arg__scnprintf_hex(char *bf, size_t size,
					 struct syscall_arg *arg)
{
	return scnprintf(bf, size, "%#lx", arg->val);
}
366 
367 #define SCA_HEX syscall_arg__scnprintf_hex
368 
369 static size_t syscall_arg__scnprintf_int(char *bf, size_t size,
370 					 struct syscall_arg *arg)
371 {
372 	return scnprintf(bf, size, "%d", arg->val);
373 }
374 
375 #define SCA_INT syscall_arg__scnprintf_int
376 
/* bpf(2) command names, indexed by the BPF_* cmd value. */
static const char *bpf_cmd[] = {
	"MAP_CREATE", "MAP_LOOKUP_ELEM", "MAP_UPDATE_ELEM", "MAP_DELETE_ELEM",
	"MAP_GET_NEXT_KEY", "PROG_LOAD",
};
static DEFINE_STRARRAY(bpf_cmd);

/* epoll_ctl(2) op names; EPOLL_CTL_ADD is 1, hence the offset. */
static const char *epoll_ctl_ops[] = { "ADD", "DEL", "MOD", };
static DEFINE_STRARRAY_OFFSET(epoll_ctl_ops, 1);

/* {get,set}itimer(2) 'which' names (ITIMER_*). */
static const char *itimers[] = { "REAL", "VIRTUAL", "PROF", };
static DEFINE_STRARRAY(itimers);

/* keyctl(2) operation names (KEYCTL_*). */
static const char *keyctl_options[] = {
	"GET_KEYRING_ID", "JOIN_SESSION_KEYRING", "UPDATE", "REVOKE", "CHOWN",
	"SETPERM", "DESCRIBE", "CLEAR", "LINK", "UNLINK", "SEARCH", "READ",
	"INSTANTIATE", "NEGATE", "SET_REQKEY_KEYRING", "SET_TIMEOUT",
	"ASSUME_AUTHORITY", "GET_SECURITY", "SESSION_TO_PARENT", "REJECT",
	"INSTANTIATE_IOV", "INVALIDATE", "GET_PERSISTENT",
};
static DEFINE_STRARRAY(keyctl_options);

/* lseek(2) whence names (SEEK_*); DATA/HOLE only where libc defines them. */
static const char *whences[] = { "SET", "CUR", "END",
#ifdef SEEK_DATA
"DATA",
#endif
#ifdef SEEK_HOLE
"HOLE",
#endif
};
static DEFINE_STRARRAY(whences);

/* fcntl(2) command names (F_*). */
static const char *fcntl_cmds[] = {
	"DUPFD", "GETFD", "SETFD", "GETFL", "SETFL", "GETLK", "SETLK",
	"SETLKW", "SETOWN", "GETOWN", "SETSIG", "GETSIG", "F_GETLK64",
	"F_SETLK64", "F_SETLKW64", "F_SETOWN_EX", "F_GETOWN_EX",
	"F_GETOWNER_UIDS",
};
static DEFINE_STRARRAY(fcntl_cmds);

/* {get,set}rlimit(2)/prlimit64(2) resource names (RLIMIT_*). */
static const char *rlimit_resources[] = {
	"CPU", "FSIZE", "DATA", "STACK", "CORE", "RSS", "NPROC", "NOFILE",
	"MEMLOCK", "AS", "LOCKS", "SIGPENDING", "MSGQUEUE", "NICE", "RTPRIO",
	"RTTIME",
};
static DEFINE_STRARRAY(rlimit_resources);

/* rt_sigprocmask(2) 'how' names (SIG_*). */
static const char *sighow[] = { "BLOCK", "UNBLOCK", "SETMASK", };
static DEFINE_STRARRAY(sighow);

/* clock id names (CLOCK_*), e.g. for clock_gettime(2). */
static const char *clockid[] = {
	"REALTIME", "MONOTONIC", "PROCESS_CPUTIME_ID", "THREAD_CPUTIME_ID",
	"MONOTONIC_RAW", "REALTIME_COARSE", "MONOTONIC_COARSE", "BOOTTIME",
	"REALTIME_ALARM", "BOOTTIME_ALARM", "SGI_CYCLE", "TAI"
};
static DEFINE_STRARRAY(clockid);

/* socket(2)/socketpair(2) address family names (AF_*). */
static const char *socket_families[] = {
	"UNSPEC", "LOCAL", "INET", "AX25", "IPX", "APPLETALK", "NETROM",
	"BRIDGE", "ATMPVC", "X25", "INET6", "ROSE", "DECnet", "NETBEUI",
	"SECURITY", "KEY", "NETLINK", "PACKET", "ASH", "ECONET", "ATMSVC",
	"RDS", "SNA", "IRDA", "PPPOX", "WANPIPE", "LLC", "IB", "CAN", "TIPC",
	"BLUETOOTH", "IUCV", "RXRPC", "ISDN", "PHONET", "IEEE802154", "CAIF",
	"ALG", "NFC", "VSOCK",
};
static DEFINE_STRARRAY(socket_families);
442 
/*
 * Decode access(2)'s mode argument: F_OK alone prints as "F", otherwise
 * the R/W/X bits are concatenated and any leftover unknown bits are
 * appended in hex.
 */
static size_t syscall_arg__scnprintf_access_mode(char *bf, size_t size,
						 struct syscall_arg *arg)
{
	size_t printed = 0;
	int mode = arg->val;

	if (mode == F_OK) /* 0 */
		return scnprintf(bf, size, "F");
#define	P_MODE(n) \
	if (mode & n##_OK) { \
		printed += scnprintf(bf + printed, size - printed, "%s", #n); \
		mode &= ~n##_OK; \
	}

	P_MODE(R);
	P_MODE(W);
	P_MODE(X);
#undef P_MODE

	/* bits we don't have a name for */
	if (mode)
		printed += scnprintf(bf + printed, size - printed, "|%#x", mode);

	return printed;
}
467 
468 #define SCA_ACCMODE syscall_arg__scnprintf_access_mode
469 
470 static size_t syscall_arg__scnprintf_filename(char *bf, size_t size,
471 					      struct syscall_arg *arg);
472 
473 #define SCA_FILENAME syscall_arg__scnprintf_filename
474 
475 static size_t syscall_arg__scnprintf_pipe_flags(char *bf, size_t size,
476 						struct syscall_arg *arg)
477 {
478 	int printed = 0, flags = arg->val;
479 
480 #define	P_FLAG(n) \
481 	if (flags & O_##n) { \
482 		printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
483 		flags &= ~O_##n; \
484 	}
485 
486 	P_FLAG(CLOEXEC);
487 	P_FLAG(NONBLOCK);
488 #undef P_FLAG
489 
490 	if (flags)
491 		printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);
492 
493 	return printed;
494 }
495 
496 #define SCA_PIPE_FLAGS syscall_arg__scnprintf_pipe_flags
497 
#if defined(__i386__) || defined(__x86_64__)
/*
 * FIXME: Make this available to all arches.
 */
#define TCGETS		0x5401

/*
 * Terminal ioctl(2) request names, indexed from TCGETS (0x5401).  The
 * designated initializers ([0x27], [0x50], [0x60]) skip over gaps in the
 * request number space.
 */
static const char *tioctls[] = {
	"TCGETS", "TCSETS", "TCSETSW", "TCSETSF", "TCGETA", "TCSETA", "TCSETAW",
	"TCSETAF", "TCSBRK", "TCXONC", "TCFLSH", "TIOCEXCL", "TIOCNXCL",
	"TIOCSCTTY", "TIOCGPGRP", "TIOCSPGRP", "TIOCOUTQ", "TIOCSTI",
	"TIOCGWINSZ", "TIOCSWINSZ", "TIOCMGET", "TIOCMBIS", "TIOCMBIC",
	"TIOCMSET", "TIOCGSOFTCAR", "TIOCSSOFTCAR", "FIONREAD", "TIOCLINUX",
	"TIOCCONS", "TIOCGSERIAL", "TIOCSSERIAL", "TIOCPKT", "FIONBIO",
	"TIOCNOTTY", "TIOCSETD", "TIOCGETD", "TCSBRKP", [0x27] = "TIOCSBRK",
	"TIOCCBRK", "TIOCGSID", "TCGETS2", "TCSETS2", "TCSETSW2", "TCSETSF2",
	"TIOCGRS485", "TIOCSRS485", "TIOCGPTN", "TIOCSPTLCK",
	"TIOCGDEV||TCGETX", "TCSETX", "TCSETXF", "TCSETXW", "TIOCSIG",
	"TIOCVHANGUP", "TIOCGPKT", "TIOCGPTLCK", "TIOCGEXCL",
	[0x50] = "FIONCLEX", "FIOCLEX", "FIOASYNC", "TIOCSERCONFIG",
	"TIOCSERGWILD", "TIOCSERSWILD", "TIOCGLCKTRMIOS", "TIOCSLCKTRMIOS",
	"TIOCSERGSTRUCT", "TIOCSERGETLSR", "TIOCSERGETMULTI", "TIOCSERSETMULTI",
	"TIOCMIWAIT", "TIOCGICOUNT", [0x60] = "FIOQSIZE",
};

static DEFINE_STRARRAY_OFFSET(tioctls, 0x5401);
#endif /* defined(__i386__) || defined(__x86_64__) */
524 
525 #ifndef GRND_NONBLOCK
526 #define GRND_NONBLOCK	0x0001
527 #endif
528 #ifndef GRND_RANDOM
529 #define GRND_RANDOM	0x0002
530 #endif
531 
532 static size_t syscall_arg__scnprintf_getrandom_flags(char *bf, size_t size,
533 						   struct syscall_arg *arg)
534 {
535 	int printed = 0, flags = arg->val;
536 
537 #define	P_FLAG(n) \
538 	if (flags & GRND_##n) { \
539 		printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
540 		flags &= ~GRND_##n; \
541 	}
542 
543 	P_FLAG(RANDOM);
544 	P_FLAG(NONBLOCK);
545 #undef P_FLAG
546 
547 	if (flags)
548 		printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);
549 
550 	return printed;
551 }
552 
553 #define SCA_GETRANDOM_FLAGS syscall_arg__scnprintf_getrandom_flags
554 
/*
 * Initializer shorthand for syscall_fmts[] entries: beautify argument
 * number 'arg' via strarray__<array>.  'name' is documentation only.
 */
#define STRARRAY(arg, name, array) \
	  .arg_scnprintf = { [arg] = SCA_STRARRAY, }, \
	  .arg_parm	 = { [arg] = &strarray__##array, }
558 
559 #include "trace/beauty/eventfd.c"
560 #include "trace/beauty/flock.c"
561 #include "trace/beauty/futex_op.c"
562 #include "trace/beauty/mmap.c"
563 #include "trace/beauty/mode_t.c"
564 #include "trace/beauty/msg_flags.c"
565 #include "trace/beauty/open_flags.c"
566 #include "trace/beauty/perf_event_open.c"
567 #include "trace/beauty/pid.c"
568 #include "trace/beauty/sched_policy.c"
569 #include "trace/beauty/seccomp.c"
570 #include "trace/beauty/signum.c"
571 #include "trace/beauty/socket_type.c"
572 #include "trace/beauty/waitid_options.c"
573 
/*
 * Per-syscall formatting hints: optional per-argument beautifiers and
 * flags describing how to interpret the return value.
 *
 * Keep this table sorted by 'name': syscall_fmt__find() bsearch()es it.
 */
static struct syscall_fmt {
	const char *name;
	const char *alias;	/* alternate name, e.g. arch/kernel specific */
	size_t	   (*arg_scnprintf[6])(char *bf, size_t size, struct syscall_arg *arg);
	void	   *arg_parm[6];	/* opaque parm passed to the beautifier, e.g. a strarray */
	bool	   errmsg;	/* negative return is an errno, show its name */
	bool	   errpid;	/* return is a pid, negative still means error */
	bool	   timeout;	/* has a timeout argument */
	bool	   hexret;	/* show the return value in hex */
} syscall_fmts[] = {
	{ .name	    = "access",	    .errmsg = true,
	  .arg_scnprintf = { [1] = SCA_ACCMODE,  /* mode */ }, },
	{ .name	    = "arch_prctl", .errmsg = true, .alias = "prctl", },
	{ .name	    = "bpf",	    .errmsg = true, STRARRAY(0, cmd, bpf_cmd), },
	{ .name	    = "brk",	    .hexret = true,
	  .arg_scnprintf = { [0] = SCA_HEX, /* brk */ }, },
	{ .name	    = "chdir",	    .errmsg = true, },
	{ .name	    = "chmod",	    .errmsg = true, },
	{ .name	    = "chroot",	    .errmsg = true, },
	{ .name     = "clock_gettime",  .errmsg = true, STRARRAY(0, clk_id, clockid), },
	{ .name	    = "clone",	    .errpid = true, },
	{ .name	    = "close",	    .errmsg = true,
	  .arg_scnprintf = { [0] = SCA_CLOSE_FD, /* fd */ }, },
	{ .name	    = "connect",    .errmsg = true, },
	{ .name	    = "creat",	    .errmsg = true, },
	{ .name	    = "dup",	    .errmsg = true, },
	{ .name	    = "dup2",	    .errmsg = true, },
	{ .name	    = "dup3",	    .errmsg = true, },
	{ .name	    = "epoll_ctl",  .errmsg = true, STRARRAY(1, op, epoll_ctl_ops), },
	{ .name	    = "eventfd2",   .errmsg = true,
	  .arg_scnprintf = { [1] = SCA_EFD_FLAGS, /* flags */ }, },
	{ .name	    = "faccessat",  .errmsg = true, },
	{ .name	    = "fadvise64",  .errmsg = true, },
	{ .name	    = "fallocate",  .errmsg = true, },
	{ .name	    = "fchdir",	    .errmsg = true, },
	{ .name	    = "fchmod",	    .errmsg = true, },
	{ .name	    = "fchmodat",   .errmsg = true,
	  .arg_scnprintf = { [0] = SCA_FDAT, /* fd */ }, },
	{ .name	    = "fchown",	    .errmsg = true, },
	{ .name	    = "fchownat",   .errmsg = true,
	  .arg_scnprintf = { [0] = SCA_FDAT, /* fd */ }, },
	{ .name	    = "fcntl",	    .errmsg = true,
	  .arg_scnprintf = { [1] = SCA_STRARRAY, /* cmd */ },
	  .arg_parm	 = { [1] = &strarray__fcntl_cmds, /* cmd */ }, },
	{ .name	    = "fdatasync",  .errmsg = true, },
	{ .name	    = "flock",	    .errmsg = true,
	  .arg_scnprintf = { [1] = SCA_FLOCK, /* cmd */ }, },
	{ .name	    = "fsetxattr",  .errmsg = true, },
	{ .name	    = "fstat",	    .errmsg = true, .alias = "newfstat", },
	{ .name	    = "fstatat",    .errmsg = true, .alias = "newfstatat", },
	{ .name	    = "fstatfs",    .errmsg = true, },
	{ .name	    = "fsync",    .errmsg = true, },
	{ .name	    = "ftruncate", .errmsg = true, },
	{ .name	    = "futex",	    .errmsg = true,
	  .arg_scnprintf = { [1] = SCA_FUTEX_OP, /* op */ }, },
	{ .name	    = "futimesat", .errmsg = true,
	  .arg_scnprintf = { [0] = SCA_FDAT, /* fd */ }, },
	{ .name	    = "getdents",   .errmsg = true, },
	{ .name	    = "getdents64", .errmsg = true, },
	{ .name	    = "getitimer",  .errmsg = true, STRARRAY(0, which, itimers), },
	{ .name	    = "getpid",	    .errpid = true, },
	{ .name	    = "getpgid",    .errpid = true, },
	{ .name	    = "getppid",    .errpid = true, },
	{ .name	    = "getrandom",  .errmsg = true,
	  .arg_scnprintf = { [2] = SCA_GETRANDOM_FLAGS, /* flags */ }, },
	{ .name	    = "getrlimit",  .errmsg = true, STRARRAY(0, resource, rlimit_resources), },
	{ .name	    = "getxattr",   .errmsg = true, },
	{ .name	    = "inotify_add_watch",	    .errmsg = true, },
	{ .name	    = "ioctl",	    .errmsg = true,
	  .arg_scnprintf = {
#if defined(__i386__) || defined(__x86_64__)
/*
 * FIXME: Make this available to all arches.
 */
			     [1] = SCA_STRHEXARRAY, /* cmd */
			     [2] = SCA_HEX, /* arg */ },
	  .arg_parm	 = { [1] = &strarray__tioctls, /* cmd */ }, },
#else
			     [2] = SCA_HEX, /* arg */ }, },
#endif
	{ .name	    = "keyctl",	    .errmsg = true, STRARRAY(0, option, keyctl_options), },
	{ .name	    = "kill",	    .errmsg = true,
	  .arg_scnprintf = { [1] = SCA_SIGNUM, /* sig */ }, },
	{ .name	    = "lchown",    .errmsg = true, },
	{ .name	    = "lgetxattr",  .errmsg = true, },
	{ .name	    = "linkat",	    .errmsg = true,
	  .arg_scnprintf = { [0] = SCA_FDAT, /* fd */ }, },
	{ .name	    = "listxattr",  .errmsg = true, },
	{ .name	    = "llistxattr", .errmsg = true, },
	{ .name	    = "lremovexattr",  .errmsg = true, },
	{ .name	    = "lseek",	    .errmsg = true,
	  .arg_scnprintf = { [2] = SCA_STRARRAY, /* whence */ },
	  .arg_parm	 = { [2] = &strarray__whences, /* whence */ }, },
	{ .name	    = "lsetxattr",  .errmsg = true, },
	{ .name	    = "lstat",	    .errmsg = true, .alias = "newlstat", },
	{ .name	    = "lsxattr",    .errmsg = true, },
	{ .name     = "madvise",    .errmsg = true,
	  .arg_scnprintf = { [0] = SCA_HEX,	 /* start */
			     [2] = SCA_MADV_BHV, /* behavior */ }, },
	{ .name	    = "mkdir",    .errmsg = true, },
	{ .name	    = "mkdirat",    .errmsg = true,
	  .arg_scnprintf = { [0] = SCA_FDAT, /* fd */ }, },
	{ .name	    = "mknod",      .errmsg = true, },
	{ .name	    = "mknodat",    .errmsg = true,
	  .arg_scnprintf = { [0] = SCA_FDAT, /* fd */ }, },
	{ .name	    = "mlock",	    .errmsg = true,
	  .arg_scnprintf = { [0] = SCA_HEX, /* addr */ }, },
	{ .name	    = "mlockall",   .errmsg = true,
	  .arg_scnprintf = { [0] = SCA_HEX, /* addr */ }, },
	{ .name	    = "mmap",	    .hexret = true,
/* The standard mmap maps to old_mmap on s390x */
#if defined(__s390x__)
	.alias = "old_mmap",
#endif
	  .arg_scnprintf = { [0] = SCA_HEX,	  /* addr */
			     [2] = SCA_MMAP_PROT, /* prot */
			     [3] = SCA_MMAP_FLAGS, /* flags */ }, },
	{ .name	    = "mprotect",   .errmsg = true,
	  .arg_scnprintf = { [0] = SCA_HEX, /* start */
			     [2] = SCA_MMAP_PROT, /* prot */ }, },
	{ .name	    = "mq_unlink", .errmsg = true,
	  .arg_scnprintf = { [0] = SCA_FILENAME, /* u_name */ }, },
	{ .name	    = "mremap",	    .hexret = true,
	  .arg_scnprintf = { [0] = SCA_HEX, /* addr */
			     [3] = SCA_MREMAP_FLAGS, /* flags */
			     [4] = SCA_HEX, /* new_addr */ }, },
	{ .name	    = "munlock",    .errmsg = true,
	  .arg_scnprintf = { [0] = SCA_HEX, /* addr */ }, },
	{ .name	    = "munmap",	    .errmsg = true,
	  .arg_scnprintf = { [0] = SCA_HEX, /* addr */ }, },
	{ .name	    = "name_to_handle_at", .errmsg = true,
	  .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
	{ .name	    = "newfstatat", .errmsg = true,
	  .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
	{ .name	    = "open",	    .errmsg = true,
	  .arg_scnprintf = { [1] = SCA_OPEN_FLAGS, /* flags */ }, },
	{ .name	    = "open_by_handle_at", .errmsg = true,
	  .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */
			     [2] = SCA_OPEN_FLAGS, /* flags */ }, },
	{ .name	    = "openat",	    .errmsg = true,
	  .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */
			     [2] = SCA_OPEN_FLAGS, /* flags */ }, },
	{ .name	    = "perf_event_open", .errmsg = true,
	  .arg_scnprintf = { [2] = SCA_INT, /* cpu */
			     [3] = SCA_FD,  /* group_fd */
			     [4] = SCA_PERF_FLAGS,  /* flags */ }, },
	{ .name	    = "pipe2",	    .errmsg = true,
	  .arg_scnprintf = { [1] = SCA_PIPE_FLAGS, /* flags */ }, },
	{ .name	    = "poll",	    .errmsg = true, .timeout = true, },
	{ .name	    = "ppoll",	    .errmsg = true, .timeout = true, },
	{ .name	    = "pread",	    .errmsg = true, .alias = "pread64", },
	{ .name	    = "preadv",	    .errmsg = true, .alias = "pread", },
	{ .name	    = "prlimit64",  .errmsg = true, STRARRAY(1, resource, rlimit_resources), },
	{ .name	    = "pwrite",	    .errmsg = true, .alias = "pwrite64", },
	{ .name	    = "pwritev",    .errmsg = true, },
	{ .name	    = "read",	    .errmsg = true, },
	{ .name	    = "readlink",   .errmsg = true, },
	{ .name	    = "readlinkat", .errmsg = true,
	  .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
	{ .name	    = "readv",	    .errmsg = true, },
	{ .name	    = "recvfrom",   .errmsg = true,
	  .arg_scnprintf = { [3] = SCA_MSG_FLAGS, /* flags */ }, },
	{ .name	    = "recvmmsg",   .errmsg = true,
	  .arg_scnprintf = { [3] = SCA_MSG_FLAGS, /* flags */ }, },
	{ .name	    = "recvmsg",    .errmsg = true,
	  .arg_scnprintf = { [2] = SCA_MSG_FLAGS, /* flags */ }, },
	{ .name	    = "removexattr", .errmsg = true, },
	{ .name	    = "renameat",   .errmsg = true,
	  .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
	{ .name	    = "rmdir",    .errmsg = true, },
	{ .name	    = "rt_sigaction", .errmsg = true,
	  .arg_scnprintf = { [0] = SCA_SIGNUM, /* sig */ }, },
	{ .name	    = "rt_sigprocmask",  .errmsg = true, STRARRAY(0, how, sighow), },
	{ .name	    = "rt_sigqueueinfo", .errmsg = true,
	  .arg_scnprintf = { [1] = SCA_SIGNUM, /* sig */ }, },
	{ .name	    = "rt_tgsigqueueinfo", .errmsg = true,
	  .arg_scnprintf = { [2] = SCA_SIGNUM, /* sig */ }, },
	{ .name	    = "sched_getattr",	      .errmsg = true, },
	{ .name	    = "sched_setattr",	      .errmsg = true, },
	{ .name	    = "sched_setscheduler",   .errmsg = true,
	  .arg_scnprintf = { [1] = SCA_SCHED_POLICY, /* policy */ }, },
	{ .name	    = "seccomp", .errmsg = true,
	  .arg_scnprintf = { [0] = SCA_SECCOMP_OP, /* op */
			     [1] = SCA_SECCOMP_FLAGS, /* flags */ }, },
	{ .name	    = "select",	    .errmsg = true, .timeout = true, },
	{ .name	    = "sendmmsg",    .errmsg = true,
	  .arg_scnprintf = { [3] = SCA_MSG_FLAGS, /* flags */ }, },
	{ .name	    = "sendmsg",    .errmsg = true,
	  .arg_scnprintf = { [2] = SCA_MSG_FLAGS, /* flags */ }, },
	{ .name	    = "sendto",	    .errmsg = true,
	  .arg_scnprintf = { [3] = SCA_MSG_FLAGS, /* flags */ }, },
	{ .name	    = "set_tid_address", .errpid = true, },
	{ .name	    = "setitimer",  .errmsg = true, STRARRAY(0, which, itimers), },
	{ .name	    = "setpgid",    .errmsg = true, },
	{ .name	    = "setrlimit",  .errmsg = true, STRARRAY(0, resource, rlimit_resources), },
	{ .name	    = "setxattr",   .errmsg = true, },
	{ .name	    = "shutdown",   .errmsg = true, },
	{ .name	    = "socket",	    .errmsg = true,
	  .arg_scnprintf = { [0] = SCA_STRARRAY, /* family */
			     [1] = SCA_SK_TYPE, /* type */ },
	  .arg_parm	 = { [0] = &strarray__socket_families, /* family */ }, },
	{ .name	    = "socketpair", .errmsg = true,
	  .arg_scnprintf = { [0] = SCA_STRARRAY, /* family */
			     [1] = SCA_SK_TYPE, /* type */ },
	  .arg_parm	 = { [0] = &strarray__socket_families, /* family */ }, },
	{ .name	    = "stat",	    .errmsg = true, .alias = "newstat", },
	{ .name	    = "statfs",	    .errmsg = true, },
	{ .name	    = "statx",	    .errmsg = true,
	  .arg_scnprintf = { [0] = SCA_FDAT, /* flags */
			     [2] = SCA_STATX_FLAGS, /* flags */
			     [3] = SCA_STATX_MASK, /* mask */ }, },
	{ .name	    = "swapoff",    .errmsg = true,
	  .arg_scnprintf = { [0] = SCA_FILENAME, /* specialfile */ }, },
	{ .name	    = "swapon",	    .errmsg = true,
	  .arg_scnprintf = { [0] = SCA_FILENAME, /* specialfile */ }, },
	{ .name	    = "symlinkat",  .errmsg = true,
	  .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
	{ .name	    = "tgkill",	    .errmsg = true,
	  .arg_scnprintf = { [2] = SCA_SIGNUM, /* sig */ }, },
	{ .name	    = "tkill",	    .errmsg = true,
	  .arg_scnprintf = { [1] = SCA_SIGNUM, /* sig */ }, },
	{ .name	    = "truncate",   .errmsg = true, },
	{ .name	    = "uname",	    .errmsg = true, .alias = "newuname", },
	{ .name	    = "unlinkat",   .errmsg = true,
	  .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
	{ .name	    = "utime",  .errmsg = true, },
	{ .name	    = "utimensat",  .errmsg = true,
	  .arg_scnprintf = { [0] = SCA_FDAT, /* dirfd */ }, },
	{ .name	    = "utimes",  .errmsg = true, },
	{ .name	    = "vmsplice",  .errmsg = true, },
	{ .name	    = "wait4",	    .errpid = true,
	  .arg_scnprintf = { [2] = SCA_WAITID_OPTIONS, /* options */ }, },
	{ .name	    = "waitid",	    .errpid = true,
	  .arg_scnprintf = { [3] = SCA_WAITID_OPTIONS, /* options */ }, },
	{ .name	    = "write",	    .errmsg = true, },
	{ .name	    = "writev",	    .errmsg = true, },
};
811 
812 static int syscall_fmt__cmp(const void *name, const void *fmtp)
813 {
814 	const struct syscall_fmt *fmt = fmtp;
815 	return strcmp(name, fmt->name);
816 }
817 
818 static struct syscall_fmt *syscall_fmt__find(const char *name)
819 {
820 	const int nmemb = ARRAY_SIZE(syscall_fmts);
821 	return bsearch(name, syscall_fmts, nmemb, sizeof(struct syscall_fmt), syscall_fmt__cmp);
822 }
823 
/*
 * Everything needed to decode one syscall: its tracepoint format, the
 * argument fields and the optional per-arg beautifiers from syscall_fmts[].
 */
struct syscall {
	struct event_format *tp_format;
	int		    nr_args;
	struct format_field *args;
	const char	    *name;
	bool		    is_exit;	/* e.g. exit_group: no sys_exit event follows */
	struct syscall_fmt  *fmt;
	size_t		    (**arg_scnprintf)(char *bf, size_t size, struct syscall_arg *arg);
	void		    **arg_parm;
};
834 
835 /*
836  * We need to have this 'calculated' boolean because in some cases we really
837  * don't know what is the duration of a syscall, for instance, when we start
838  * a session and some threads are waiting for a syscall to finish, say 'poll',
839  * in which case all we can do is to print "( ? ) for duration and for the
840  * start timestamp.
841  */
842 static size_t fprintf_duration(unsigned long t, bool calculated, FILE *fp)
843 {
844 	double duration = (double)t / NSEC_PER_MSEC;
845 	size_t printed = fprintf(fp, "(");
846 
847 	if (!calculated)
848 		printed += fprintf(fp, "     ?   ");
849 	else if (duration >= 1.0)
850 		printed += color_fprintf(fp, PERF_COLOR_RED, "%6.3f ms", duration);
851 	else if (duration >= 0.01)
852 		printed += color_fprintf(fp, PERF_COLOR_YELLOW, "%6.3f ms", duration);
853 	else
854 		printed += color_fprintf(fp, PERF_COLOR_NORMAL, "%6.3f ms", duration);
855 	return printed + fprintf(fp, "): ");
856 }
857 
858 /**
859  * filename.ptr: The filename char pointer that will be vfs_getname'd
860  * filename.entry_str_pos: Where to insert the string translated from
861  *                         filename.ptr by the vfs_getname tracepoint/kprobe.
862  */
/* Per-thread state, stashed in the thread's priv pointer. */
struct thread_trace {
	u64		  entry_time;	/* timestamp of the pending sys_enter */
	bool		  entry_pending;	/* sys_enter seen, waiting for sys_exit */
	unsigned long	  nr_events;
	unsigned long	  pfmaj, pfmin;	/* major/minor page fault counts */
	char		  *entry_str;	/* formatted sys_enter line, printed at exit */
	double		  runtime_ms;
        struct {
		unsigned long ptr;	/* filename pointer awaiting vfs_getname */
		short int     entry_str_pos;	/* where to splice the resolved name */
		bool	      pending_open;
		unsigned int  namelen;
		char	      *name;
	} filename;
	struct {
		int	  max;	/* highest fd in 'table', -1 when empty */
		char	  **table;	/* fd -> pathname */
	} paths;

	struct intlist *syscall_stats;
};
884 
885 static struct thread_trace *thread_trace__new(void)
886 {
887 	struct thread_trace *ttrace =  zalloc(sizeof(struct thread_trace));
888 
889 	if (ttrace)
890 		ttrace->paths.max = -1;
891 
892 	ttrace->syscall_stats = intlist__new(NULL);
893 
894 	return ttrace;
895 }
896 
897 static struct thread_trace *thread__trace(struct thread *thread, FILE *fp)
898 {
899 	struct thread_trace *ttrace;
900 
901 	if (thread == NULL)
902 		goto fail;
903 
904 	if (thread__priv(thread) == NULL)
905 		thread__set_priv(thread, thread_trace__new());
906 
907 	if (thread__priv(thread) == NULL)
908 		goto fail;
909 
910 	ttrace = thread__priv(thread);
911 	++ttrace->nr_events;
912 
913 	return ttrace;
914 fail:
915 	color_fprintf(fp, PERF_COLOR_RED,
916 		      "WARNING: not enough memory, dropping samples!\n");
917 	return NULL;
918 }
919 
#define TRACE_PFMAJ		(1 << 0)	/* page fault selection flag: major faults */
#define TRACE_PFMIN		(1 << 1)	/* page fault selection flag: minor faults */

/* Size of the per-thread buffer used to format one syscall entry line. */
static const size_t trace__entry_str_size = 2048;
924 
/*
 * Record 'pathname' for 'fd' in the thread's fd->path table, growing the
 * table as needed. Returns 0 on success, -1 on allocation failure.
 */
static int trace__set_fd_pathname(struct thread *thread, int fd, const char *pathname)
{
	struct thread_trace *ttrace = thread__priv(thread);

	if (fd > ttrace->paths.max) {
		char **npath = realloc(ttrace->paths.table, (fd + 1) * sizeof(char *));

		if (npath == NULL)
			return -1;

		if (ttrace->paths.max != -1) {
			/* Zero only the newly appended slots (max+1 .. fd). */
			memset(npath + ttrace->paths.max + 1, 0,
			       (fd - ttrace->paths.max) * sizeof(char *));
		} else {
			/* First allocation: zero the whole table. */
			memset(npath, 0, (fd + 1) * sizeof(char *));
		}

		ttrace->paths.table = npath;
		ttrace->paths.max   = fd;
	}

	ttrace->paths.table[fd] = strdup(pathname);

	return ttrace->paths.table[fd] != NULL ? 0 : -1;
}
950 
951 static int thread__read_fd_path(struct thread *thread, int fd)
952 {
953 	char linkname[PATH_MAX], pathname[PATH_MAX];
954 	struct stat st;
955 	int ret;
956 
957 	if (thread->pid_ == thread->tid) {
958 		scnprintf(linkname, sizeof(linkname),
959 			  "/proc/%d/fd/%d", thread->pid_, fd);
960 	} else {
961 		scnprintf(linkname, sizeof(linkname),
962 			  "/proc/%d/task/%d/fd/%d", thread->pid_, thread->tid, fd);
963 	}
964 
965 	if (lstat(linkname, &st) < 0 || st.st_size + 1 > (off_t)sizeof(pathname))
966 		return -1;
967 
968 	ret = readlink(linkname, pathname, sizeof(pathname));
969 
970 	if (ret < 0 || ret > st.st_size)
971 		return -1;
972 
973 	pathname[ret] = '\0';
974 	return trace__set_fd_pathname(thread, fd, pathname);
975 }
976 
977 static const char *thread__fd_path(struct thread *thread, int fd,
978 				   struct trace *trace)
979 {
980 	struct thread_trace *ttrace = thread__priv(thread);
981 
982 	if (ttrace == NULL)
983 		return NULL;
984 
985 	if (fd < 0)
986 		return NULL;
987 
988 	if ((fd > ttrace->paths.max || ttrace->paths.table[fd] == NULL)) {
989 		if (!trace->live)
990 			return NULL;
991 		++trace->stats.proc_getname;
992 		if (thread__read_fd_path(thread, fd))
993 			return NULL;
994 	}
995 
996 	return ttrace->paths.table[fd];
997 }
998 
999 static size_t syscall_arg__scnprintf_fd(char *bf, size_t size,
1000 					struct syscall_arg *arg)
1001 {
1002 	int fd = arg->val;
1003 	size_t printed = scnprintf(bf, size, "%d", fd);
1004 	const char *path = thread__fd_path(arg->thread, fd, arg->trace);
1005 
1006 	if (path)
1007 		printed += scnprintf(bf + printed, size - printed, "<%s>", path);
1008 
1009 	return printed;
1010 }
1011 
1012 static size_t syscall_arg__scnprintf_close_fd(char *bf, size_t size,
1013 					      struct syscall_arg *arg)
1014 {
1015 	int fd = arg->val;
1016 	size_t printed = syscall_arg__scnprintf_fd(bf, size, arg);
1017 	struct thread_trace *ttrace = thread__priv(arg->thread);
1018 
1019 	if (ttrace && fd >= 0 && fd <= ttrace->paths.max)
1020 		zfree(&ttrace->paths.table[fd]);
1021 
1022 	return printed;
1023 }
1024 
1025 static void thread__set_filename_pos(struct thread *thread, const char *bf,
1026 				     unsigned long ptr)
1027 {
1028 	struct thread_trace *ttrace = thread__priv(thread);
1029 
1030 	ttrace->filename.ptr = ptr;
1031 	ttrace->filename.entry_str_pos = bf - ttrace->entry_str;
1032 }
1033 
1034 static size_t syscall_arg__scnprintf_filename(char *bf, size_t size,
1035 					      struct syscall_arg *arg)
1036 {
1037 	unsigned long ptr = arg->val;
1038 
1039 	if (!arg->trace->vfs_getname)
1040 		return scnprintf(bf, size, "%#x", ptr);
1041 
1042 	thread__set_filename_pos(arg->thread, bf, ptr);
1043 	return 0;
1044 }
1045 
1046 static bool trace__filter_duration(struct trace *trace, double t)
1047 {
1048 	return t < (trace->duration_filter * NSEC_PER_MSEC);
1049 }
1050 
1051 static size_t __trace__fprintf_tstamp(struct trace *trace, u64 tstamp, FILE *fp)
1052 {
1053 	double ts = (double)(tstamp - trace->base_time) / NSEC_PER_MSEC;
1054 
1055 	return fprintf(fp, "%10.3f ", ts);
1056 }
1057 
1058 /*
1059  * We're handling tstamp=0 as an undefined tstamp, i.e. like when we are
1060  * using ttrace->entry_time for a thread that receives a sys_exit without
1061  * first having received a sys_enter ("poll" issued before tracing session
1062  * starts, lost sys_enter exit due to ring buffer overflow).
1063  */
1064 static size_t trace__fprintf_tstamp(struct trace *trace, u64 tstamp, FILE *fp)
1065 {
1066 	if (tstamp > 0)
1067 		return __trace__fprintf_tstamp(trace, tstamp, fp);
1068 
1069 	return fprintf(fp, "         ? ");
1070 }
1071 
/* Flags set from the signal handler to tell the main loop to wind down. */
static bool done = false;
static bool interrupted = false;	/* true when termination was an explicit SIGINT */

/*
 * Signal handler: request termination; 'interrupted' records whether the
 * cause was SIGINT.
 * NOTE(review): these flags are written from signal context; 'volatile
 * sig_atomic_t' would be the strictly conforming type — confirm.
 */
static void sig_handler(int sig)
{
	done = true;
	interrupted = sig == SIGINT;
}
1080 
1081 static size_t trace__fprintf_entry_head(struct trace *trace, struct thread *thread,
1082 					u64 duration, bool duration_calculated, u64 tstamp, FILE *fp)
1083 {
1084 	size_t printed = trace__fprintf_tstamp(trace, tstamp, fp);
1085 	printed += fprintf_duration(duration, duration_calculated, fp);
1086 
1087 	if (trace->multiple_threads) {
1088 		if (trace->show_comm)
1089 			printed += fprintf(fp, "%.14s/", thread__comm_str(thread));
1090 		printed += fprintf(fp, "%d ", thread->tid);
1091 	}
1092 
1093 	return printed;
1094 }
1095 
1096 static int trace__process_event(struct trace *trace, struct machine *machine,
1097 				union perf_event *event, struct perf_sample *sample)
1098 {
1099 	int ret = 0;
1100 
1101 	switch (event->header.type) {
1102 	case PERF_RECORD_LOST:
1103 		color_fprintf(trace->output, PERF_COLOR_RED,
1104 			      "LOST %" PRIu64 " events!\n", event->lost.lost);
1105 		ret = machine__process_lost_event(machine, event, sample);
1106 		break;
1107 	default:
1108 		ret = machine__process_event(machine, event, sample);
1109 		break;
1110 	}
1111 
1112 	return ret;
1113 }
1114 
1115 static int trace__tool_process(struct perf_tool *tool,
1116 			       union perf_event *event,
1117 			       struct perf_sample *sample,
1118 			       struct machine *machine)
1119 {
1120 	struct trace *trace = container_of(tool, struct trace, tool);
1121 	return trace__process_event(trace, machine, event, sample);
1122 }
1123 
1124 static char *trace__machine__resolve_kernel_addr(void *vmachine, unsigned long long *addrp, char **modp)
1125 {
1126 	struct machine *machine = vmachine;
1127 
1128 	if (machine->kptr_restrict_warned)
1129 		return NULL;
1130 
1131 	if (symbol_conf.kptr_restrict) {
1132 		pr_warning("Kernel address maps (/proc/{kallsyms,modules}) are restricted.\n\n"
1133 			   "Check /proc/sys/kernel/kptr_restrict.\n\n"
1134 			   "Kernel samples will not be resolved.\n");
1135 		machine->kptr_restrict_warned = true;
1136 		return NULL;
1137 	}
1138 
1139 	return machine__resolve_kernel_addr(vmachine, addrp, modp);
1140 }
1141 
1142 static int trace__symbols_init(struct trace *trace, struct perf_evlist *evlist)
1143 {
1144 	int err = symbol__init(NULL);
1145 
1146 	if (err)
1147 		return err;
1148 
1149 	trace->host = machine__new_host();
1150 	if (trace->host == NULL)
1151 		return -ENOMEM;
1152 
1153 	if (trace_event__register_resolver(trace->host, trace__machine__resolve_kernel_addr) < 0)
1154 		return -errno;
1155 
1156 	err = __machine__synthesize_threads(trace->host, &trace->tool, &trace->opts.target,
1157 					    evlist->threads, trace__tool_process, false,
1158 					    trace->opts.proc_map_timeout);
1159 	if (err)
1160 		symbol__exit();
1161 
1162 	return err;
1163 }
1164 
/*
 * Pick a scnprintf formatter for each of the syscall's arguments: explicit
 * per-syscall overrides (sc->fmt) win, then heuristics based on the
 * tracepoint field type and name. The order of the checks below matters.
 * Returns 0 on success, -1 on allocation failure.
 */
static int syscall__set_arg_fmts(struct syscall *sc)
{
	struct format_field *field;
	int idx = 0, len;

	sc->arg_scnprintf = calloc(sc->nr_args, sizeof(void *));
	if (sc->arg_scnprintf == NULL)
		return -1;

	if (sc->fmt)
		sc->arg_parm = sc->fmt->arg_parm;

	for (field = sc->args; field; field = field->next) {
		if (sc->fmt && sc->fmt->arg_scnprintf[idx])
			sc->arg_scnprintf[idx] = sc->fmt->arg_scnprintf[idx];
		else if (strcmp(field->type, "const char *") == 0 &&
			 (strcmp(field->name, "filename") == 0 ||
			  strcmp(field->name, "path") == 0 ||
			  strcmp(field->name, "pathname") == 0))
			sc->arg_scnprintf[idx] = SCA_FILENAME;
		else if (field->flags & FIELD_IS_POINTER)
			sc->arg_scnprintf[idx] = syscall_arg__scnprintf_hex;
		else if (strcmp(field->type, "pid_t") == 0)
			sc->arg_scnprintf[idx] = SCA_PID;
		else if (strcmp(field->type, "umode_t") == 0)
			sc->arg_scnprintf[idx] = SCA_MODE_T;
		else if ((strcmp(field->type, "int") == 0 ||
			  strcmp(field->type, "unsigned int") == 0 ||
			  strcmp(field->type, "long") == 0) &&
			 (len = strlen(field->name)) >= 2 &&
			 strcmp(field->name + len - 2, "fd") == 0) {
			/*
			 * /sys/kernel/tracing/events/syscalls/sys_enter*
			 * egrep 'field:.*fd;' .../format|sed -r 's/.*field:([a-z ]+) [a-z_]*fd.+/\1/g'|sort|uniq -c
			 * 65 int
			 * 23 unsigned int
			 * 7 unsigned long
			 */
			sc->arg_scnprintf[idx] = SCA_FD;
		}
		++idx;
	}

	return 0;
}
1210 
/*
 * Fill in trace->syscalls.table[id]: name, pretty-printing overrides and
 * the syscalls:sys_enter_<name> tracepoint format, growing the table as
 * needed. Returns 0 on success, -1 if the syscall is unknown or its
 * tracepoint format can't be read.
 */
static int trace__read_syscall_info(struct trace *trace, int id)
{
	char tp_name[128];
	struct syscall *sc;
	const char *name = syscalltbl__name(trace->sctbl, id);

	if (name == NULL)
		return -1;

	if (id > trace->syscalls.max) {
		struct syscall *nsyscalls = realloc(trace->syscalls.table, (id + 1) * sizeof(*sc));

		if (nsyscalls == NULL)
			return -1;

		if (trace->syscalls.max != -1) {
			/* Zero only the newly appended slots (max+1 .. id). */
			memset(nsyscalls + trace->syscalls.max + 1, 0,
			       (id - trace->syscalls.max) * sizeof(*sc));
		} else {
			/* First allocation: zero the whole table. */
			memset(nsyscalls, 0, (id + 1) * sizeof(*sc));
		}

		trace->syscalls.table = nsyscalls;
		trace->syscalls.max   = id;
	}

	sc = trace->syscalls.table + id;
	sc->name = name;

	sc->fmt  = syscall_fmt__find(sc->name);

	snprintf(tp_name, sizeof(tp_name), "sys_enter_%s", sc->name);
	sc->tp_format = trace_event__tp_format("syscalls", tp_name);

	/* Some syscalls are traced under an alias, e.g. via the fmt table. */
	if (IS_ERR(sc->tp_format) && sc->fmt && sc->fmt->alias) {
		snprintf(tp_name, sizeof(tp_name), "sys_enter_%s", sc->fmt->alias);
		sc->tp_format = trace_event__tp_format("syscalls", tp_name);
	}

	if (IS_ERR(sc->tp_format))
		return -1;

	sc->args = sc->tp_format->format.fields;
	sc->nr_args = sc->tp_format->format.nr_fields;
	/*
	 * We need to check and discard the first variable '__syscall_nr'
	 * or 'nr' that mean the syscall number. It is needless here.
	 * So drop '__syscall_nr' or 'nr' field but does not exist on older kernels.
	 */
	if (sc->args && (!strcmp(sc->args->name, "__syscall_nr") || !strcmp(sc->args->name, "nr"))) {
		sc->args = sc->args->next;
		--sc->nr_args;
	}

	sc->is_exit = !strcmp(name, "exit_group") || !strcmp(name, "exit");

	return syscall__set_arg_fmts(sc);
}
1269 
/*
 * Translate the syscall name list in trace->ev_qualifier into syscall ids
 * in trace->ev_qualifier_ids. All invalid names are collected and reported
 * in one message before failing with -EINVAL.
 */
static int trace__validate_ev_qualifier(struct trace *trace)
{
	int err = 0, i;
	struct str_node *pos;

	trace->ev_qualifier_ids.nr = strlist__nr_entries(trace->ev_qualifier);
	trace->ev_qualifier_ids.entries = malloc(trace->ev_qualifier_ids.nr *
						 sizeof(trace->ev_qualifier_ids.entries[0]));

	if (trace->ev_qualifier_ids.entries == NULL) {
		fputs("Error:\tNot enough memory for allocating events qualifier ids\n",
		       trace->output);
		err = -EINVAL;
		goto out;
	}

	i = 0;

	strlist__for_each_entry(pos, trace->ev_qualifier) {
		const char *sc = pos->s;
		int id = syscalltbl__id(trace->sctbl, sc);

		if (id < 0) {
			/* First bad name opens the error message, the rest append. */
			if (err == 0) {
				fputs("Error:\tInvalid syscall ", trace->output);
				err = -EINVAL;
			} else {
				fputs(", ", trace->output);
			}

			fputs(sc, trace->output);
		}

		trace->ev_qualifier_ids.entries[i++] = id;
	}

	if (err < 0) {
		/* Discard the partially filled id list on any invalid name. */
		fputs("\nHint:\ttry 'perf list syscalls:sys_enter_*'"
		      "\nHint:\tand: 'man syscalls'\n", trace->output);
		zfree(&trace->ev_qualifier_ids.entries);
		trace->ev_qualifier_ids.nr = 0;
	}
out:
	return err;
}
1315 
1316 /*
1317  * args is to be interpreted as a series of longs but we need to handle
1318  * 8-byte unaligned accesses. args points to raw_data within the event
1319  * and raw_data is guaranteed to be 8-byte unaligned because it is
1320  * preceded by raw_size which is a u32. So we need to copy args to a temp
1321  * variable to read it. Most notably this avoids extended load instructions
1322  * on unaligned addresses
1323  */
1324 
/*
 * Format the syscall's arguments into 'bf'. Uses the per-argument
 * formatters chosen by syscall__set_arg_fmts() when available, falling
 * back to %ld; when the tracepoint format could not be read, dumps the
 * six raw args instead. See the comment above about the memcpy() for
 * unaligned access.
 */
static size_t syscall__scnprintf_args(struct syscall *sc, char *bf, size_t size,
				      unsigned char *args, struct trace *trace,
				      struct thread *thread)
{
	size_t printed = 0;
	unsigned char *p;
	unsigned long val;

	if (sc->args != NULL) {
		struct format_field *field;
		u8 bit = 1;	/* arg.mask bit for the current argument */
		struct syscall_arg arg = {
			.idx	= 0,
			.mask	= 0,
			.trace  = trace,
			.thread = thread,
		};

		for (field = sc->args; field;
		     field = field->next, ++arg.idx, bit <<= 1) {
			/* Formatters may consume later args by setting mask bits. */
			if (arg.mask & bit)
				continue;

			/* special care for unaligned accesses */
			p = args + sizeof(unsigned long) * arg.idx;
			memcpy(&val, p, sizeof(val));

			/*
			 * Suppress this argument if its value is zero and
			 * we don't have a string associated in an
			 * strarray for it.
			 */
			if (val == 0 &&
			    !(sc->arg_scnprintf &&
			      sc->arg_scnprintf[arg.idx] == SCA_STRARRAY &&
			      sc->arg_parm[arg.idx]))
				continue;

			printed += scnprintf(bf + printed, size - printed,
					     "%s%s: ", printed ? ", " : "", field->name);
			if (sc->arg_scnprintf && sc->arg_scnprintf[arg.idx]) {
				arg.val = val;
				if (sc->arg_parm)
					arg.parm = sc->arg_parm[arg.idx];
				printed += sc->arg_scnprintf[arg.idx](bf + printed,
								      size - printed, &arg);
			} else {
				printed += scnprintf(bf + printed, size - printed,
						     "%ld", val);
			}
		}
	} else if (IS_ERR(sc->tp_format)) {
		/*
		 * If we managed to read the tracepoint /format file, then we
		 * may end up not having any args, like with gettid(), so only
		 * print the raw args when we didn't manage to read it.
		 */
		int i = 0;

		while (i < 6) {
			/* special care for unaligned accesses */
			p = args + sizeof(unsigned long) * i;
			memcpy(&val, p, sizeof(val));
			printed += scnprintf(bf + printed, size - printed,
					     "%sarg%d: %ld",
					     printed ? ", " : "", i, val);
			++i;
		}
	}

	return printed;
}
1397 
/* Handler type for tracepoint samples (sys_enter, sys_exit, etc.). */
typedef int (*tracepoint_handler)(struct trace *trace, struct perf_evsel *evsel,
				  union perf_event *event,
				  struct perf_sample *sample);
1401 
/*
 * Return the syscall descriptor for 'id', reading it in on first use.
 * Returns NULL for invalid ids or when the syscall info can't be read,
 * complaining as appropriate for the verbosity level.
 */
static struct syscall *trace__syscall_info(struct trace *trace,
					   struct perf_evsel *evsel, int id)
{

	if (id < 0) {

		/*
		 * XXX: Noticed on x86_64, reproduced as far back as 3.0.36, haven't tried
		 * before that, leaving at a higher verbosity level till that is
		 * explained. Reproduced with plain ftrace with:
		 *
		 * echo 1 > /t/events/raw_syscalls/sys_exit/enable
		 * grep "NR -1 " /t/trace_pipe
		 *
		 * After generating some load on the machine.
		 */
		if (verbose > 1) {
			static u64 n;
			fprintf(trace->output, "Invalid syscall %d id, skipping (%s, %" PRIu64 ") ...\n",
				id, perf_evsel__name(evsel), ++n);
		}
		return NULL;
	}

	/* Lazily populate the table entry, then re-check it took effect. */
	if ((id > trace->syscalls.max || trace->syscalls.table[id].name == NULL) &&
	    trace__read_syscall_info(trace, id))
		goto out_cant_read;

	if ((id > trace->syscalls.max || trace->syscalls.table[id].name == NULL))
		goto out_cant_read;

	return &trace->syscalls.table[id];

out_cant_read:
	if (verbose > 0) {
		fprintf(trace->output, "Problems reading syscall %d", id);
		if (id <= trace->syscalls.max && trace->syscalls.table[id].name != NULL)
			fprintf(trace->output, "(%s)", trace->syscalls.table[id].name);
		fputs(" information\n", trace->output);
	}
	return NULL;
}
1444 
1445 static void thread__update_stats(struct thread_trace *ttrace,
1446 				 int id, struct perf_sample *sample)
1447 {
1448 	struct int_node *inode;
1449 	struct stats *stats;
1450 	u64 duration = 0;
1451 
1452 	inode = intlist__findnew(ttrace->syscall_stats, id);
1453 	if (inode == NULL)
1454 		return;
1455 
1456 	stats = inode->priv;
1457 	if (stats == NULL) {
1458 		stats = malloc(sizeof(struct stats));
1459 		if (stats == NULL)
1460 			return;
1461 		init_stats(stats);
1462 		inode->priv = stats;
1463 	}
1464 
1465 	if (ttrace->entry_time && sample->time > ttrace->entry_time)
1466 		duration = sample->time - ttrace->entry_time;
1467 
1468 	update_stats(stats, duration);
1469 }
1470 
1471 static int trace__printf_interrupted_entry(struct trace *trace, struct perf_sample *sample)
1472 {
1473 	struct thread_trace *ttrace;
1474 	u64 duration;
1475 	size_t printed;
1476 
1477 	if (trace->current == NULL)
1478 		return 0;
1479 
1480 	ttrace = thread__priv(trace->current);
1481 
1482 	if (!ttrace->entry_pending)
1483 		return 0;
1484 
1485 	duration = sample->time - ttrace->entry_time;
1486 
1487 	printed  = trace__fprintf_entry_head(trace, trace->current, duration, true, ttrace->entry_time, trace->output);
1488 	printed += fprintf(trace->output, "%-70s) ...\n", ttrace->entry_str);
1489 	ttrace->entry_pending = false;
1490 
1491 	return printed;
1492 }
1493 
/*
 * raw_syscalls:sys_enter handler: format "<name>(<args>" into the thread's
 * entry_str buffer. For exit/exit_group the line is printed immediately
 * rather than waiting for a sys_exit; otherwise printing is deferred to
 * trace__sys_exit so the return value can complete the line.
 */
static int trace__sys_enter(struct trace *trace, struct perf_evsel *evsel,
			    union perf_event *event __maybe_unused,
			    struct perf_sample *sample)
{
	char *msg;
	void *args;
	size_t printed = 0;
	struct thread *thread;
	int id = perf_evsel__sc_tp_uint(evsel, id, sample), err = -1;
	struct syscall *sc = trace__syscall_info(trace, evsel, id);
	struct thread_trace *ttrace;

	if (sc == NULL)
		return -1;

	thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
	ttrace = thread__trace(thread, trace->output);
	if (ttrace == NULL)
		goto out_put;

	args = perf_evsel__sc_tp_ptr(evsel, args, sample);

	/* Lazily allocate the per-thread formatting buffer. */
	if (ttrace->entry_str == NULL) {
		ttrace->entry_str = malloc(trace__entry_str_size);
		if (!ttrace->entry_str)
			goto out_put;
	}

	if (!(trace->duration_filter || trace->summary_only || trace->min_stack))
		trace__printf_interrupted_entry(trace, sample);

	ttrace->entry_time = sample->time;
	msg = ttrace->entry_str;
	printed += scnprintf(msg + printed, trace__entry_str_size - printed, "%s(", sc->name);

	printed += syscall__scnprintf_args(sc, msg + printed, trace__entry_str_size - printed,
					   args, trace, thread);

	if (sc->is_exit) {
		if (!(trace->duration_filter || trace->summary_only || trace->min_stack)) {
			trace__fprintf_entry_head(trace, thread, 0, false, ttrace->entry_time, trace->output);
			fprintf(trace->output, "%-70s)\n", ttrace->entry_str);
		}
	} else {
		ttrace->entry_pending = true;
		/* See trace__vfs_getname & trace__sys_exit */
		ttrace->filename.pending_open = false;
	}

	/* Track the most recent thread, used by trace__printf_interrupted_entry(). */
	if (trace->current != thread) {
		thread__put(trace->current);
		trace->current = thread__get(thread);
	}
	err = 0;
out_put:
	thread__put(thread);
	return err;
}
1552 
1553 static int trace__resolve_callchain(struct trace *trace, struct perf_evsel *evsel,
1554 				    struct perf_sample *sample,
1555 				    struct callchain_cursor *cursor)
1556 {
1557 	struct addr_location al;
1558 
1559 	if (machine__resolve(trace->host, &al, sample) < 0 ||
1560 	    thread__resolve_callchain(al.thread, cursor, evsel, sample, NULL, NULL, trace->max_stack))
1561 		return -1;
1562 
1563 	return 0;
1564 }
1565 
1566 static int trace__fprintf_callchain(struct trace *trace, struct perf_sample *sample)
1567 {
1568 	/* TODO: user-configurable print_opts */
1569 	const unsigned int print_opts = EVSEL__PRINT_SYM |
1570 				        EVSEL__PRINT_DSO |
1571 				        EVSEL__PRINT_UNKNOWN_AS_ADDR;
1572 
1573 	return sample__fprintf_callchain(sample, 38, print_opts, &callchain_cursor, trace->output);
1574 }
1575 
/*
 * raw_syscalls:sys_exit handler: complete the pending entry line (or print
 * a "continued" marker when the entry was never seen) with the syscall's
 * return value, formatted per the syscall_fmt flags (errmsg/errpid/hexret/
 * timeout), honoring the duration filter and --summary-only.
 */
static int trace__sys_exit(struct trace *trace, struct perf_evsel *evsel,
			   union perf_event *event __maybe_unused,
			   struct perf_sample *sample)
{
	long ret;
	u64 duration = 0;
	bool duration_calculated = false;
	struct thread *thread;
	int id = perf_evsel__sc_tp_uint(evsel, id, sample), err = -1, callchain_ret = 0;
	struct syscall *sc = trace__syscall_info(trace, evsel, id);
	struct thread_trace *ttrace;

	if (sc == NULL)
		return -1;

	thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
	ttrace = thread__trace(thread, trace->output);
	if (ttrace == NULL)
		goto out_put;

	if (trace->summary)
		thread__update_stats(ttrace, id, sample);

	ret = perf_evsel__sc_tp_uint(evsel, ret, sample);

	/* A successful open-like syscall: cache the fd -> pathname mapping. */
	if (id == trace->open_id && ret >= 0 && ttrace->filename.pending_open) {
		trace__set_fd_pathname(thread, ret, ttrace->filename.name);
		ttrace->filename.pending_open = false;
		++trace->stats.vfs_getname;
	}

	if (ttrace->entry_time) {
		duration = sample->time - ttrace->entry_time;
		if (trace__filter_duration(trace, duration))
			goto out;
		duration_calculated = true;
	} else if (trace->duration_filter)
		goto out;

	if (sample->callchain) {
		callchain_ret = trace__resolve_callchain(trace, evsel, sample, &callchain_cursor);
		if (callchain_ret == 0) {
			if (callchain_cursor.nr < trace->min_stack)
				goto out;
			callchain_ret = 1;
		}
	}

	if (trace->summary_only)
		goto out;

	trace__fprintf_entry_head(trace, thread, duration, duration_calculated, ttrace->entry_time, trace->output);

	if (ttrace->entry_pending) {
		fprintf(trace->output, "%-70s", ttrace->entry_str);
	} else {
		/* sys_enter was never seen (e.g. session started mid-syscall). */
		fprintf(trace->output, " ... [");
		color_fprintf(trace->output, PERF_COLOR_YELLOW, "continued");
		fprintf(trace->output, "]: %s()", sc->name);
	}

	if (sc->fmt == NULL) {
signed_print:
		fprintf(trace->output, ") = %ld", ret);
	} else if (ret < 0 && (sc->fmt->errmsg || sc->fmt->errpid)) {
		char bf[STRERR_BUFSIZE];
		const char *emsg = str_error_r(-ret, bf, sizeof(bf)),
			   *e = audit_errno_to_name(-ret);

		fprintf(trace->output, ") = -1 %s %s", e, emsg);
	} else if (ret == 0 && sc->fmt->timeout)
		fprintf(trace->output, ") = 0 Timeout");
	else if (sc->fmt->hexret)
		fprintf(trace->output, ") = %#lx", ret);
	else if (sc->fmt->errpid) {
		/* The return value is a pid: show the child's comm if known. */
		struct thread *child = machine__find_thread(trace->host, ret, ret);

		if (child != NULL) {
			fprintf(trace->output, ") = %ld", ret);
			if (child->comm_set)
				fprintf(trace->output, " (%s)", thread__comm_str(child));
			thread__put(child);
		}
	} else
		goto signed_print;

	fputc('\n', trace->output);

	if (callchain_ret > 0)
		trace__fprintf_callchain(trace, sample);
	else if (callchain_ret < 0)
		pr_err("Problem processing %s callchain, skipping...\n", perf_evsel__name(evsel));
out:
	ttrace->entry_pending = false;
	err = 0;
out_put:
	thread__put(thread);
	return err;
}
1675 
/*
 * probe:vfs_getname handler: capture the pathname the kernel resolved and
 * splice it into the pending entry_str at the position recorded earlier by
 * thread__set_filename_pos(). The name is also stashed in filename.name so
 * a following successful open can cache the fd -> path mapping.
 */
static int trace__vfs_getname(struct trace *trace, struct perf_evsel *evsel,
			      union perf_event *event __maybe_unused,
			      struct perf_sample *sample)
{
	struct thread *thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
	struct thread_trace *ttrace;
	size_t filename_len, entry_str_len, to_move;
	ssize_t remaining_space;
	char *pos;
	const char *filename = perf_evsel__rawptr(evsel, sample, "pathname");

	if (!thread)
		goto out;

	ttrace = thread__priv(thread);
	if (!ttrace)
		goto out_put;

	filename_len = strlen(filename);
	if (filename_len == 0)
		goto out_put;

	/* Grow the stash buffer if this name is longer than any before. */
	if (ttrace->filename.namelen < filename_len) {
		char *f = realloc(ttrace->filename.name, filename_len + 1);

		if (f == NULL)
			goto out_put;

		ttrace->filename.namelen = filename_len;
		ttrace->filename.name = f;
	}

	strcpy(ttrace->filename.name, filename);
	ttrace->filename.pending_open = true;

	/* No syscall arg is waiting for this name, nothing to splice. */
	if (!ttrace->filename.ptr)
		goto out_put;

	entry_str_len = strlen(ttrace->entry_str);
	remaining_space = trace__entry_str_size - entry_str_len - 1; /* \0 */
	if (remaining_space <= 0)
		goto out_put;

	/* Keep the tail of the name if it doesn't fit whole. */
	if (filename_len > (size_t)remaining_space) {
		filename += filename_len - remaining_space;
		filename_len = remaining_space;
	}

	/* Open a gap at entry_str_pos and copy the name into it. */
	to_move = entry_str_len - ttrace->filename.entry_str_pos + 1; /* \0 */
	pos = ttrace->entry_str + ttrace->filename.entry_str_pos;
	memmove(pos + filename_len, pos, to_move);
	memcpy(pos, filename, filename_len);

	ttrace->filename.ptr = 0;
	ttrace->filename.entry_str_pos = 0;
out_put:
	thread__put(thread);
out:
	return 0;
}
1736 
1737 static int trace__sched_stat_runtime(struct trace *trace, struct perf_evsel *evsel,
1738 				     union perf_event *event __maybe_unused,
1739 				     struct perf_sample *sample)
1740 {
1741         u64 runtime = perf_evsel__intval(evsel, sample, "runtime");
1742 	double runtime_ms = (double)runtime / NSEC_PER_MSEC;
1743 	struct thread *thread = machine__findnew_thread(trace->host,
1744 							sample->pid,
1745 							sample->tid);
1746 	struct thread_trace *ttrace = thread__trace(thread, trace->output);
1747 
1748 	if (ttrace == NULL)
1749 		goto out_dump;
1750 
1751 	ttrace->runtime_ms += runtime_ms;
1752 	trace->runtime_ms += runtime_ms;
1753 out_put:
1754 	thread__put(thread);
1755 	return 0;
1756 
1757 out_dump:
1758 	fprintf(trace->output, "%s: comm=%s,pid=%u,runtime=%" PRIu64 ",vruntime=%" PRIu64 ")\n",
1759 	       evsel->name,
1760 	       perf_evsel__strval(evsel, sample, "comm"),
1761 	       (pid_t)perf_evsel__intval(evsel, sample, "pid"),
1762 	       runtime,
1763 	       perf_evsel__intval(evsel, sample, "vruntime"));
1764 	goto out_put;
1765 }
1766 
1767 static void bpf_output__printer(enum binary_printer_ops op,
1768 				unsigned int val, void *extra)
1769 {
1770 	FILE *output = extra;
1771 	unsigned char ch = (unsigned char)val;
1772 
1773 	switch (op) {
1774 	case BINARY_PRINT_CHAR_DATA:
1775 		fprintf(output, "%c", isprint(ch) ? ch : '.');
1776 		break;
1777 	case BINARY_PRINT_DATA_BEGIN:
1778 	case BINARY_PRINT_LINE_BEGIN:
1779 	case BINARY_PRINT_ADDR:
1780 	case BINARY_PRINT_NUM_DATA:
1781 	case BINARY_PRINT_NUM_PAD:
1782 	case BINARY_PRINT_SEP:
1783 	case BINARY_PRINT_CHAR_PAD:
1784 	case BINARY_PRINT_LINE_END:
1785 	case BINARY_PRINT_DATA_END:
1786 	default:
1787 		break;
1788 	}
1789 }
1790 
1791 static void bpf_output__fprintf(struct trace *trace,
1792 				struct perf_sample *sample)
1793 {
1794 	print_binary(sample->raw_data, sample->raw_size, 8,
1795 		     bpf_output__printer, trace->output);
1796 }
1797 
/*
 * Handler for the other (--event) tracepoints and BPF output events:
 * print a timestamped line with the event's formatted payload, plus the
 * callchain when requested and deep enough.
 */
static int trace__event_handler(struct trace *trace, struct perf_evsel *evsel,
				union perf_event *event __maybe_unused,
				struct perf_sample *sample)
{
	int callchain_ret = 0;

	if (sample->callchain) {
		callchain_ret = trace__resolve_callchain(trace, evsel, sample, &callchain_cursor);
		if (callchain_ret == 0) {
			/* Too shallow for --min-stack: skip the event entirely. */
			if (callchain_cursor.nr < trace->min_stack)
				goto out;
			callchain_ret = 1;
		}
	}

	trace__printf_interrupted_entry(trace, sample);
	trace__fprintf_tstamp(trace, sample->time, trace->output);

	/* Keep columns aligned with the syscall lines' duration field. */
	if (trace->trace_syscalls)
		fprintf(trace->output, "(         ): ");

	fprintf(trace->output, "%s:", evsel->name);

	if (perf_evsel__is_bpf_output(evsel)) {
		bpf_output__fprintf(trace, sample);
	} else if (evsel->tp_format) {
		event_format__fprintf(evsel->tp_format, sample->cpu,
				      sample->raw_data, sample->raw_size,
				      trace->output);
	}

	fprintf(trace->output, ")\n");

	if (callchain_ret > 0)
		trace__fprintf_callchain(trace, sample);
	else if (callchain_ret < 0)
		pr_err("Problem processing %s callchain, skipping...\n", perf_evsel__name(evsel));
out:
	return 0;
}
1838 
1839 static void print_location(FILE *f, struct perf_sample *sample,
1840 			   struct addr_location *al,
1841 			   bool print_dso, bool print_sym)
1842 {
1843 
1844 	if ((verbose > 0 || print_dso) && al->map)
1845 		fprintf(f, "%s@", al->map->dso->long_name);
1846 
1847 	if ((verbose > 0 || print_sym) && al->sym)
1848 		fprintf(f, "%s+0x%" PRIx64, al->sym->name,
1849 			al->addr - al->sym->start);
1850 	else if (al->map)
1851 		fprintf(f, "0x%" PRIx64, al->addr);
1852 	else
1853 		fprintf(f, "0x%" PRIx64, sample->addr);
1854 }
1855 
/*
 * Page fault event handler: count major/minor faults per thread and print
 * a "maj/minfault [code-location] => data-location (type)" line, resolving
 * the faulting data address first in the variable maps, then in the
 * executable maps.
 */
static int trace__pgfault(struct trace *trace,
			  struct perf_evsel *evsel,
			  union perf_event *event __maybe_unused,
			  struct perf_sample *sample)
{
	struct thread *thread;
	struct addr_location al;
	char map_type = 'd';	/* 'd'ata, 'x'ecutable or '?' unknown */
	struct thread_trace *ttrace;
	int err = -1;
	int callchain_ret = 0;

	thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);

	if (sample->callchain) {
		callchain_ret = trace__resolve_callchain(trace, evsel, sample, &callchain_cursor);
		if (callchain_ret == 0) {
			if (callchain_cursor.nr < trace->min_stack)
				goto out_put;
			callchain_ret = 1;
		}
	}

	ttrace = thread__trace(thread, trace->output);
	if (ttrace == NULL)
		goto out_put;

	if (evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ)
		ttrace->pfmaj++;
	else
		ttrace->pfmin++;

	if (trace->summary_only)
		goto out;

	/* Where the faulting instruction is. */
	thread__find_addr_location(thread, sample->cpumode, MAP__FUNCTION,
			      sample->ip, &al);

	trace__fprintf_entry_head(trace, thread, 0, true, sample->time, trace->output);

	fprintf(trace->output, "%sfault [",
		evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ ?
		"maj" : "min");

	print_location(trace->output, sample, &al, false, true);

	fprintf(trace->output, "] => ");

	/* Where the faulting access landed: try data maps first. */
	thread__find_addr_location(thread, sample->cpumode, MAP__VARIABLE,
				   sample->addr, &al);

	if (!al.map) {
		thread__find_addr_location(thread, sample->cpumode,
					   MAP__FUNCTION, sample->addr, &al);

		if (al.map)
			map_type = 'x';
		else
			map_type = '?';
	}

	print_location(trace->output, sample, &al, true, false);

	fprintf(trace->output, " (%c%c)\n", map_type, al.level);

	if (callchain_ret > 0)
		trace__fprintf_callchain(trace, sample);
	else if (callchain_ret < 0)
		pr_err("Problem processing %s callchain, skipping...\n", perf_evsel__name(evsel));
out:
	err = 0;
out_put:
	thread__put(thread);
	return err;
}
1931 
1932 static void trace__set_base_time(struct trace *trace,
1933 				 struct perf_evsel *evsel,
1934 				 struct perf_sample *sample)
1935 {
1936 	/*
1937 	 * BPF events were not setting PERF_SAMPLE_TIME, so be more robust
1938 	 * and don't use sample->time unconditionally, we may end up having
1939 	 * some other event in the future without PERF_SAMPLE_TIME for good
1940 	 * reason, i.e. we may not be interested in its timestamps, just in
1941 	 * it taking place, picking some piece of information when it
1942 	 * appears in our event stream (vfs_getname comes to mind).
1943 	 */
1944 	if (trace->base_time == 0 && !trace->full_time &&
1945 	    (evsel->attr.sample_type & PERF_SAMPLE_TIME))
1946 		trace->base_time = sample->time;
1947 }
1948 
1949 static int trace__process_sample(struct perf_tool *tool,
1950 				 union perf_event *event,
1951 				 struct perf_sample *sample,
1952 				 struct perf_evsel *evsel,
1953 				 struct machine *machine __maybe_unused)
1954 {
1955 	struct trace *trace = container_of(tool, struct trace, tool);
1956 	struct thread *thread;
1957 	int err = 0;
1958 
1959 	tracepoint_handler handler = evsel->handler;
1960 
1961 	thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
1962 	if (thread && thread__is_filtered(thread))
1963 		goto out;
1964 
1965 	trace__set_base_time(trace, evsel, sample);
1966 
1967 	if (handler) {
1968 		++trace->nr_events;
1969 		handler(trace, evsel, event, sample);
1970 	}
1971 out:
1972 	thread__put(thread);
1973 	return err;
1974 }
1975 
1976 static int trace__record(struct trace *trace, int argc, const char **argv)
1977 {
1978 	unsigned int rec_argc, i, j;
1979 	const char **rec_argv;
1980 	const char * const record_args[] = {
1981 		"record",
1982 		"-R",
1983 		"-m", "1024",
1984 		"-c", "1",
1985 	};
1986 
1987 	const char * const sc_args[] = { "-e", };
1988 	unsigned int sc_args_nr = ARRAY_SIZE(sc_args);
1989 	const char * const majpf_args[] = { "-e", "major-faults" };
1990 	unsigned int majpf_args_nr = ARRAY_SIZE(majpf_args);
1991 	const char * const minpf_args[] = { "-e", "minor-faults" };
1992 	unsigned int minpf_args_nr = ARRAY_SIZE(minpf_args);
1993 
1994 	/* +1 is for the event string below */
1995 	rec_argc = ARRAY_SIZE(record_args) + sc_args_nr + 1 +
1996 		majpf_args_nr + minpf_args_nr + argc;
1997 	rec_argv = calloc(rec_argc + 1, sizeof(char *));
1998 
1999 	if (rec_argv == NULL)
2000 		return -ENOMEM;
2001 
2002 	j = 0;
2003 	for (i = 0; i < ARRAY_SIZE(record_args); i++)
2004 		rec_argv[j++] = record_args[i];
2005 
2006 	if (trace->trace_syscalls) {
2007 		for (i = 0; i < sc_args_nr; i++)
2008 			rec_argv[j++] = sc_args[i];
2009 
2010 		/* event string may be different for older kernels - e.g., RHEL6 */
2011 		if (is_valid_tracepoint("raw_syscalls:sys_enter"))
2012 			rec_argv[j++] = "raw_syscalls:sys_enter,raw_syscalls:sys_exit";
2013 		else if (is_valid_tracepoint("syscalls:sys_enter"))
2014 			rec_argv[j++] = "syscalls:sys_enter,syscalls:sys_exit";
2015 		else {
2016 			pr_err("Neither raw_syscalls nor syscalls events exist.\n");
2017 			return -1;
2018 		}
2019 	}
2020 
2021 	if (trace->trace_pgfaults & TRACE_PFMAJ)
2022 		for (i = 0; i < majpf_args_nr; i++)
2023 			rec_argv[j++] = majpf_args[i];
2024 
2025 	if (trace->trace_pgfaults & TRACE_PFMIN)
2026 		for (i = 0; i < minpf_args_nr; i++)
2027 			rec_argv[j++] = minpf_args[i];
2028 
2029 	for (i = 0; i < (unsigned int)argc; i++)
2030 		rec_argv[j++] = argv[i];
2031 
2032 	return cmd_record(j, rec_argv);
2033 }
2034 
2035 static size_t trace__fprintf_thread_summary(struct trace *trace, FILE *fp);
2036 
2037 static bool perf_evlist__add_vfs_getname(struct perf_evlist *evlist)
2038 {
2039 	struct perf_evsel *evsel = perf_evsel__newtp("probe", "vfs_getname");
2040 
2041 	if (IS_ERR(evsel))
2042 		return false;
2043 
2044 	if (perf_evsel__field(evsel, "pathname") == NULL) {
2045 		perf_evsel__delete(evsel);
2046 		return false;
2047 	}
2048 
2049 	evsel->handler = trace__vfs_getname;
2050 	perf_evlist__add(evlist, evsel);
2051 	return true;
2052 }
2053 
2054 static struct perf_evsel *perf_evsel__new_pgfault(u64 config)
2055 {
2056 	struct perf_evsel *evsel;
2057 	struct perf_event_attr attr = {
2058 		.type = PERF_TYPE_SOFTWARE,
2059 		.mmap_data = 1,
2060 	};
2061 
2062 	attr.config = config;
2063 	attr.sample_period = 1;
2064 
2065 	event_attr_init(&attr);
2066 
2067 	evsel = perf_evsel__new(&attr);
2068 	if (evsel)
2069 		evsel->handler = trace__pgfault;
2070 
2071 	return evsel;
2072 }
2073 
2074 static void trace__handle_event(struct trace *trace, union perf_event *event, struct perf_sample *sample)
2075 {
2076 	const u32 type = event->header.type;
2077 	struct perf_evsel *evsel;
2078 
2079 	if (type != PERF_RECORD_SAMPLE) {
2080 		trace__process_event(trace, trace->host, event, sample);
2081 		return;
2082 	}
2083 
2084 	evsel = perf_evlist__id2evsel(trace->evlist, sample->id);
2085 	if (evsel == NULL) {
2086 		fprintf(trace->output, "Unknown tp ID %" PRIu64 ", skipping...\n", sample->id);
2087 		return;
2088 	}
2089 
2090 	trace__set_base_time(trace, evsel, sample);
2091 
2092 	if (evsel->attr.type == PERF_TYPE_TRACEPOINT &&
2093 	    sample->raw_data == NULL) {
2094 		fprintf(trace->output, "%s sample with no payload for tid: %d, cpu %d, raw_size=%d, skipping...\n",
2095 		       perf_evsel__name(evsel), sample->tid,
2096 		       sample->cpu, sample->raw_size);
2097 	} else {
2098 		tracepoint_handler handler = evsel->handler;
2099 		handler(trace, evsel, event, sample);
2100 	}
2101 }
2102 
/*
 * Create the raw_syscalls:sys_{enter,exit} tracepoint events, wire up
 * their handlers and cache the tracepoint field offsets used by the
 * syscall pretty printers, then add both to the session evlist.
 *
 * Returns 0 on success, -1 on failure with any partially created
 * evsels deleted.
 */
static int trace__add_syscall_newtp(struct trace *trace)
{
	int ret = -1;
	struct perf_evlist *evlist = trace->evlist;
	struct perf_evsel *sys_enter, *sys_exit;

	sys_enter = perf_evsel__syscall_newtp("sys_enter", trace__sys_enter);
	if (sys_enter == NULL)
		goto out;

	/* Cache the offset of the "args" payload field */
	if (perf_evsel__init_sc_tp_ptr_field(sys_enter, args))
		goto out_delete_sys_enter;

	sys_exit = perf_evsel__syscall_newtp("sys_exit", trace__sys_exit);
	if (sys_exit == NULL)
		goto out_delete_sys_enter;

	/* Cache the offset of the "ret" (return value) field */
	if (perf_evsel__init_sc_tp_uint_field(sys_exit, ret))
		goto out_delete_sys_exit;

	perf_evlist__add(evlist, sys_enter);
	perf_evlist__add(evlist, sys_exit);

	if (callchain_param.enabled && !trace->kernel_syscallchains) {
		/*
		 * We're interested only in the user space callchain
		 * leading to the syscall, allow overriding that for
		 * debugging reasons using --kernel_syscall_callchains
		 */
		sys_exit->attr.exclude_callchain_kernel = 1;
	}

	trace->syscalls.events.sys_enter = sys_enter;
	trace->syscalls.events.sys_exit  = sys_exit;

	ret = 0;
out:
	return ret;

out_delete_sys_exit:
	perf_evsel__delete_priv(sys_exit);
out_delete_sys_enter:
	perf_evsel__delete_priv(sys_enter);
	goto out;
}
2148 
2149 static int trace__set_ev_qualifier_filter(struct trace *trace)
2150 {
2151 	int err = -1;
2152 	struct perf_evsel *sys_exit;
2153 	char *filter = asprintf_expr_inout_ints("id", !trace->not_ev_qualifier,
2154 						trace->ev_qualifier_ids.nr,
2155 						trace->ev_qualifier_ids.entries);
2156 
2157 	if (filter == NULL)
2158 		goto out_enomem;
2159 
2160 	if (!perf_evsel__append_tp_filter(trace->syscalls.events.sys_enter,
2161 					  filter)) {
2162 		sys_exit = trace->syscalls.events.sys_exit;
2163 		err = perf_evsel__append_tp_filter(sys_exit, filter);
2164 	}
2165 
2166 	free(filter);
2167 out:
2168 	return err;
2169 out_enomem:
2170 	errno = ENOMEM;
2171 	goto out;
2172 }
2173 
/*
 * Live tracing mode: set up the syscall/vfs_getname/page fault/sched
 * events, create maps for the target, fork or attach to the workload,
 * mmap the ring buffers and consume events until the workload exits or
 * the user interrupts.  Returns 0 on success, negative on error.
 */
static int trace__run(struct trace *trace, int argc, const char **argv)
{
	struct perf_evlist *evlist = trace->evlist;
	struct perf_evsel *evsel, *pgfault_maj = NULL, *pgfault_min = NULL;
	int err = -1, i;
	unsigned long before;
	const bool forks = argc > 0;	/* extra args == workload to fork */
	bool draining = false;

	trace->live = true;

	if (trace->trace_syscalls && trace__add_syscall_newtp(trace))
		goto out_error_raw_syscalls;

	if (trace->trace_syscalls)
		trace->vfs_getname = perf_evlist__add_vfs_getname(evlist);

	if ((trace->trace_pgfaults & TRACE_PFMAJ)) {
		pgfault_maj = perf_evsel__new_pgfault(PERF_COUNT_SW_PAGE_FAULTS_MAJ);
		if (pgfault_maj == NULL)
			goto out_error_mem;
		perf_evlist__add(evlist, pgfault_maj);
	}

	if ((trace->trace_pgfaults & TRACE_PFMIN)) {
		pgfault_min = perf_evsel__new_pgfault(PERF_COUNT_SW_PAGE_FAULTS_MIN);
		if (pgfault_min == NULL)
			goto out_error_mem;
		perf_evlist__add(evlist, pgfault_min);
	}

	if (trace->sched &&
	    perf_evlist__add_newtp(evlist, "sched", "sched_stat_runtime",
				   trace__sched_stat_runtime))
		goto out_error_sched_stat_runtime;

	err = perf_evlist__create_maps(evlist, &trace->opts.target);
	if (err < 0) {
		fprintf(trace->output, "Problems parsing the target to trace, check your options!\n");
		goto out_delete_evlist;
	}

	err = trace__symbols_init(trace, evlist);
	if (err < 0) {
		fprintf(trace->output, "Problems initializing symbol libraries!\n");
		goto out_delete_evlist;
	}

	perf_evlist__config(evlist, &trace->opts, NULL);

	if (callchain_param.enabled) {
		bool use_identifier = false;

		if (trace->syscalls.events.sys_exit) {
			perf_evsel__config_callchain(trace->syscalls.events.sys_exit,
						     &trace->opts, &callchain_param);
			use_identifier = true;
		}

		if (pgfault_maj) {
			perf_evsel__config_callchain(pgfault_maj, &trace->opts, &callchain_param);
			use_identifier = true;
		}

		if (pgfault_min) {
			perf_evsel__config_callchain(pgfault_min, &trace->opts, &callchain_param);
			use_identifier = true;
		}

		if (use_identifier) {
		       /*
			* Now we have evsels with different sample_ids, use
			* PERF_SAMPLE_IDENTIFIER to map from sample to evsel
			* from a fixed position in each ring buffer record.
			*
			* As of this the changeset introducing this comment, this
			* isn't strictly needed, as the fields that can come before
			* PERF_SAMPLE_ID are all used, but we'll probably disable
			* some of those for things like copying the payload of
			* pointer syscall arguments, and for vfs_getname we don't
			* need PERF_SAMPLE_ADDR and PERF_SAMPLE_IP, so do this
			* here as a warning we need to use PERF_SAMPLE_IDENTIFIER.
			*/
			perf_evlist__set_sample_bit(evlist, IDENTIFIER);
			perf_evlist__reset_sample_bit(evlist, ID);
		}
	}

	signal(SIGCHLD, sig_handler);
	signal(SIGINT, sig_handler);

	if (forks) {
		err = perf_evlist__prepare_workload(evlist, &trace->opts.target,
						    argv, false, NULL);
		if (err < 0) {
			fprintf(trace->output, "Couldn't run the workload!\n");
			goto out_delete_evlist;
		}
	}

	err = perf_evlist__open(evlist);
	if (err < 0)
		goto out_error_open;

	err = bpf__apply_obj_config();
	if (err) {
		char errbuf[BUFSIZ];

		bpf__strerror_apply_obj_config(err, errbuf, sizeof(errbuf));
		pr_err("ERROR: Apply config to BPF failed: %s\n",
			 errbuf);
		goto out_error_open;
	}

	/*
	 * Better not use !target__has_task() here because we need to cover the
	 * case where no threads were specified in the command line, but a
	 * workload was, and in that case we will fill in the thread_map when
	 * we fork the workload in perf_evlist__prepare_workload.
	 */
	if (trace->filter_pids.nr > 0)
		err = perf_evlist__set_filter_pids(evlist, trace->filter_pids.nr, trace->filter_pids.entries);
	else if (thread_map__pid(evlist->threads, 0) == -1)
		err = perf_evlist__set_filter_pid(evlist, getpid());

	if (err < 0)
		goto out_error_mem;

	if (trace->ev_qualifier_ids.nr > 0) {
		err = trace__set_ev_qualifier_filter(trace);
		if (err < 0)
			goto out_errno;

		pr_debug("event qualifier tracepoint filter: %s\n",
			 trace->syscalls.events.sys_exit->filter);
	}

	err = perf_evlist__apply_filters(evlist, &evsel);
	if (err < 0)
		goto out_error_apply_filters;

	err = perf_evlist__mmap(evlist, trace->opts.mmap_pages, false);
	if (err < 0)
		goto out_error_mmap;

	if (!target__none(&trace->opts.target) && !trace->opts.initial_delay)
		perf_evlist__enable(evlist);

	if (forks)
		perf_evlist__start_workload(evlist);

	if (trace->opts.initial_delay) {
		usleep(trace->opts.initial_delay * 1000);
		perf_evlist__enable(evlist);
	}

	trace->multiple_threads = thread_map__pid(evlist->threads, 0) == -1 ||
				  evlist->threads->nr > 1 ||
				  perf_evlist__first(evlist)->attr.inherit;
/* Main loop: drain whatever is in the ring buffers, then poll for more */
again:
	before = trace->nr_events;

	for (i = 0; i < evlist->nr_mmaps; i++) {
		union perf_event *event;

		while ((event = perf_evlist__mmap_read(evlist, i)) != NULL) {
			struct perf_sample sample;

			++trace->nr_events;

			err = perf_evlist__parse_sample(evlist, event, &sample);
			if (err) {
				fprintf(trace->output, "Can't parse sample, err = %d, skipping...\n", err);
				goto next_event;
			}

			trace__handle_event(trace, event, &sample);
next_event:
			perf_evlist__mmap_consume(evlist, i);

			if (interrupted)
				goto out_disable;

			/* On the first "done" signal, stop producing, keep draining */
			if (done && !draining) {
				perf_evlist__disable(evlist);
				draining = true;
			}
		}
	}

	if (trace->nr_events == before) {
		int timeout = done ? 100 : -1;

		if (!draining && perf_evlist__poll(evlist, timeout) > 0) {
			if (perf_evlist__filter_pollfd(evlist, POLLERR | POLLHUP) == 0)
				draining = true;

			goto again;
		}
	} else {
		goto again;
	}

out_disable:
	thread__zput(trace->current);

	perf_evlist__disable(evlist);

	if (!err) {
		if (trace->summary)
			trace__fprintf_thread_summary(trace, trace->output);

		if (trace->show_tool_stats) {
			fprintf(trace->output, "Stats:\n "
					       " vfs_getname : %" PRIu64 "\n"
					       " proc_getname: %" PRIu64 "\n",
				trace->stats.vfs_getname,
				trace->stats.proc_getname);
		}
	}

out_delete_evlist:
	perf_evlist__delete(evlist);
	trace->evlist = NULL;
	trace->live = false;
	return err;
/*
 * The error labels below share this block-scoped errbuf; control only
 * reaches them via the gotos above, never by falling through the
 * return just before this block.
 */
{
	char errbuf[BUFSIZ];

out_error_sched_stat_runtime:
	tracing_path__strerror_open_tp(errno, errbuf, sizeof(errbuf), "sched", "sched_stat_runtime");
	goto out_error;

out_error_raw_syscalls:
	tracing_path__strerror_open_tp(errno, errbuf, sizeof(errbuf), "raw_syscalls", "sys_(enter|exit)");
	goto out_error;

out_error_mmap:
	perf_evlist__strerror_mmap(evlist, errno, errbuf, sizeof(errbuf));
	goto out_error;

out_error_open:
	perf_evlist__strerror_open(evlist, errno, errbuf, sizeof(errbuf));

out_error:
	fprintf(trace->output, "%s\n", errbuf);
	goto out_delete_evlist;

out_error_apply_filters:
	fprintf(trace->output,
		"Failed to set filter \"%s\" on event %s with %d (%s)\n",
		evsel->filter, perf_evsel__name(evsel), errno,
		str_error_r(errno, errbuf, sizeof(errbuf)));
	goto out_delete_evlist;
}
out_error_mem:
	fprintf(trace->output, "Not enough memory to run!\n");
	goto out_delete_evlist;

out_errno:
	fprintf(trace->output, "errno=%d,%s\n", errno, strerror(errno));
	goto out_delete_evlist;
}
2437 
/*
 * Replay mode ('perf trace -i perf.data'): wire up the perf_tool
 * callbacks and the syscall enter/exit handlers, then feed the
 * recorded session through the same handlers used for live tracing.
 * Returns 0 on success, negative on error.
 */
static int trace__replay(struct trace *trace)
{
	const struct perf_evsel_str_handler handlers[] = {
		{ "probe:vfs_getname",	     trace__vfs_getname, },
	};
	struct perf_data_file file = {
		.path  = input_name,
		.mode  = PERF_DATA_MODE_READ,
		.force = trace->force,
	};
	struct perf_session *session;
	struct perf_evsel *evsel;
	int err = -1;

	trace->tool.sample	  = trace__process_sample;
	trace->tool.mmap	  = perf_event__process_mmap;
	trace->tool.mmap2	  = perf_event__process_mmap2;
	trace->tool.comm	  = perf_event__process_comm;
	trace->tool.exit	  = perf_event__process_exit;
	trace->tool.fork	  = perf_event__process_fork;
	trace->tool.attr	  = perf_event__process_attr;
	trace->tool.tracing_data  = perf_event__process_tracing_data;
	trace->tool.build_id	  = perf_event__process_build_id;
	trace->tool.namespaces	  = perf_event__process_namespaces;

	trace->tool.ordered_events = true;
	trace->tool.ordering_requires_timestamps = true;

	/* add tid to output */
	trace->multiple_threads = true;

	session = perf_session__new(&file, false, &trace->tool);
	if (session == NULL)
		return -1;

	if (trace->opts.target.pid)
		symbol_conf.pid_list_str = strdup(trace->opts.target.pid);

	if (trace->opts.target.tid)
		symbol_conf.tid_list_str = strdup(trace->opts.target.tid);

	if (symbol__init(&session->header.env) < 0)
		goto out;

	trace->host = &session->machines.host;

	err = perf_session__set_tracepoints_handlers(session, handlers);
	if (err)
		goto out;

	evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
						     "raw_syscalls:sys_enter");
	/* older kernels have syscalls tp versus raw_syscalls */
	if (evsel == NULL)
		evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
							     "syscalls:sys_enter");

	if (evsel &&
	    (perf_evsel__init_syscall_tp(evsel, trace__sys_enter) < 0 ||
	    perf_evsel__init_sc_tp_ptr_field(evsel, args))) {
		pr_err("Error during initialize raw_syscalls:sys_enter event\n");
		goto out;
	}

	evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
						     "raw_syscalls:sys_exit");
	if (evsel == NULL)
		evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
							     "syscalls:sys_exit");
	if (evsel &&
	    (perf_evsel__init_syscall_tp(evsel, trace__sys_exit) < 0 ||
	    perf_evsel__init_sc_tp_uint_field(evsel, ret))) {
		pr_err("Error during initialize raw_syscalls:sys_exit event\n");
		goto out;
	}

	/* Recorded page fault events get the same handler as in live mode */
	evlist__for_each_entry(session->evlist, evsel) {
		if (evsel->attr.type == PERF_TYPE_SOFTWARE &&
		    (evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ ||
		     evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MIN ||
		     evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS))
			evsel->handler = trace__pgfault;
	}

	setup_pager();

	err = perf_session__process_events(session);
	if (err)
		pr_err("Failed to process events, error %d", err);

	else if (trace->summary)
		trace__fprintf_thread_summary(trace, trace->output);

out:
	perf_session__delete(session);

	return err;
}
2536 
/*
 * Emit the banner that precedes the per-thread summary and return the
 * number of characters printed.
 */
static size_t trace__fprintf_threads_header(FILE *fp)
{
	return fprintf(fp, "\n Summary of events:\n\n");
}
2545 
/*
 * Resort rb for the per-thread syscall stats, ordered by total time
 * (msecs).  The body below is the entry-init callback: it fills one
 * resort entry from an intlist node whose ->priv holds the stats for
 * that syscall id.
 */
DEFINE_RESORT_RB(syscall_stats, a->msecs > b->msecs,
	struct stats 	*stats;
	double		msecs;
	int		syscall;
)
{
	struct int_node *source = rb_entry(nd, struct int_node, rb_node);
	struct stats *stats = source->priv;

	entry->syscall = source->i;
	entry->stats   = stats;
	/* total time = nr of calls * average duration, scaled ns -> ms */
	entry->msecs   = stats ? (u64)stats->n * (avg_stats(stats) / NSEC_PER_MSEC) : 0;
}
2559 
/*
 * Print the per-syscall summary table for one thread: calls, total,
 * min/avg/max and relative stddev, using the syscall_stats resort rb
 * defined above.  Returns the number of characters printed (0 when
 * the thread has no syscall stats or sorting failed).
 */
static size_t thread__dump_stats(struct thread_trace *ttrace,
				 struct trace *trace, FILE *fp)
{
	size_t printed = 0;
	struct syscall *sc;
	struct rb_node *nd;
	DECLARE_RESORT_RB_INTLIST(syscall_stats, ttrace->syscall_stats);

	if (syscall_stats == NULL)
		return 0;

	printed += fprintf(fp, "\n");

	printed += fprintf(fp, "   syscall            calls    total       min       avg       max      stddev\n");
	printed += fprintf(fp, "                               (msec)    (msec)    (msec)    (msec)        (%%)\n");
	printed += fprintf(fp, "   --------------- -------- --------- --------- --------- ---------     ------\n");

	resort_rb__for_each_entry(nd, syscall_stats) {
		struct stats *stats = syscall_stats_entry->stats;
		if (stats) {
			/* stats are kept in ns, the table shows msec */
			double min = (double)(stats->min) / NSEC_PER_MSEC;
			double max = (double)(stats->max) / NSEC_PER_MSEC;
			double avg = avg_stats(stats);
			double pct;
			u64 n = (u64) stats->n;

			/* stddev expressed as a percentage of the average */
			pct = avg ? 100.0 * stddev_stats(stats)/avg : 0.0;
			avg /= NSEC_PER_MSEC;

			sc = &trace->syscalls.table[syscall_stats_entry->syscall];
			printed += fprintf(fp, "   %-15s", sc->name);
			printed += fprintf(fp, " %8" PRIu64 " %9.3f %9.3f %9.3f",
					   n, syscall_stats_entry->msecs, min, avg);
			printed += fprintf(fp, " %9.3f %9.2f%%\n", max, pct);
		}
	}

	resort_rb__delete(syscall_stats);
	printed += fprintf(fp, "\n\n");

	return printed;
}
2602 
2603 static size_t trace__fprintf_thread(FILE *fp, struct thread *thread, struct trace *trace)
2604 {
2605 	size_t printed = 0;
2606 	struct thread_trace *ttrace = thread__priv(thread);
2607 	double ratio;
2608 
2609 	if (ttrace == NULL)
2610 		return 0;
2611 
2612 	ratio = (double)ttrace->nr_events / trace->nr_events * 100.0;
2613 
2614 	printed += fprintf(fp, " %s (%d), ", thread__comm_str(thread), thread->tid);
2615 	printed += fprintf(fp, "%lu events, ", ttrace->nr_events);
2616 	printed += fprintf(fp, "%.1f%%", ratio);
2617 	if (ttrace->pfmaj)
2618 		printed += fprintf(fp, ", %lu majfaults", ttrace->pfmaj);
2619 	if (ttrace->pfmin)
2620 		printed += fprintf(fp, ", %lu minfaults", ttrace->pfmin);
2621 	if (trace->sched)
2622 		printed += fprintf(fp, ", %.3f msec\n", ttrace->runtime_ms);
2623 	else if (fputc('\n', fp) != EOF)
2624 		++printed;
2625 
2626 	printed += thread__dump_stats(ttrace, trace, fp);
2627 
2628 	return printed;
2629 }
2630 
2631 static unsigned long thread__nr_events(struct thread_trace *ttrace)
2632 {
2633 	return ttrace ? ttrace->nr_events : 0;
2634 }
2635 
/*
 * Resort rb for the thread summary, comparing threads by how many
 * trace events each one saw (thread__nr_events() on the ttrace priv
 * data).  The body below copies the thread pointer into the resort
 * entry.
 */
DEFINE_RESORT_RB(threads, (thread__nr_events(a->thread->priv) < thread__nr_events(b->thread->priv)),
	struct thread *thread;
)
{
	entry->thread = rb_entry(nd, struct thread, rb_node);
}
2642 
/*
 * Print the "Summary of events" section: one entry per thread of the
 * traced machine, ordered via the threads resort rb above.  Returns
 * the number of characters printed, 0 when sorting failed.
 */
static size_t trace__fprintf_thread_summary(struct trace *trace, FILE *fp)
{
	DECLARE_RESORT_RB_MACHINE_THREADS(threads, trace->host);
	size_t printed = trace__fprintf_threads_header(fp);
	struct rb_node *nd;

	if (threads == NULL) {
		fprintf(fp, "%s", "Error sorting output by nr_events!\n");
		return 0;
	}

	resort_rb__for_each_entry(nd, threads)
		printed += trace__fprintf_thread(fp, threads_entry->thread, trace);

	resort_rb__delete(threads);

	return printed;
}
2661 
2662 static int trace__set_duration(const struct option *opt, const char *str,
2663 			       int unset __maybe_unused)
2664 {
2665 	struct trace *trace = opt->value;
2666 
2667 	trace->duration_filter = atof(str);
2668 	return 0;
2669 }
2670 
2671 static int trace__set_filter_pids(const struct option *opt, const char *str,
2672 				  int unset __maybe_unused)
2673 {
2674 	int ret = -1;
2675 	size_t i;
2676 	struct trace *trace = opt->value;
2677 	/*
2678 	 * FIXME: introduce a intarray class, plain parse csv and create a
2679 	 * { int nr, int entries[] } struct...
2680 	 */
2681 	struct intlist *list = intlist__new(str);
2682 
2683 	if (list == NULL)
2684 		return -1;
2685 
2686 	i = trace->filter_pids.nr = intlist__nr_entries(list) + 1;
2687 	trace->filter_pids.entries = calloc(i, sizeof(pid_t));
2688 
2689 	if (trace->filter_pids.entries == NULL)
2690 		goto out;
2691 
2692 	trace->filter_pids.entries[0] = getpid();
2693 
2694 	for (i = 1; i < trace->filter_pids.nr; ++i)
2695 		trace->filter_pids.entries[i] = intlist__entry(list, i - 1)->i;
2696 
2697 	intlist__delete(list);
2698 	ret = 0;
2699 out:
2700 	return ret;
2701 }
2702 
2703 static int trace__open_output(struct trace *trace, const char *filename)
2704 {
2705 	struct stat st;
2706 
2707 	if (!stat(filename, &st) && st.st_size) {
2708 		char oldname[PATH_MAX];
2709 
2710 		scnprintf(oldname, sizeof(oldname), "%s.old", filename);
2711 		unlink(oldname);
2712 		rename(filename, oldname);
2713 	}
2714 
2715 	trace->output = fopen(filename, "w");
2716 
2717 	return trace->output == NULL ? -errno : 0;
2718 }
2719 
2720 static int parse_pagefaults(const struct option *opt, const char *str,
2721 			    int unset __maybe_unused)
2722 {
2723 	int *trace_pgfaults = opt->value;
2724 
2725 	if (strcmp(str, "all") == 0)
2726 		*trace_pgfaults |= TRACE_PFMAJ | TRACE_PFMIN;
2727 	else if (strcmp(str, "maj") == 0)
2728 		*trace_pgfaults |= TRACE_PFMAJ;
2729 	else if (strcmp(str, "min") == 0)
2730 		*trace_pgfaults |= TRACE_PFMIN;
2731 	else
2732 		return -1;
2733 
2734 	return 0;
2735 }
2736 
2737 static void evlist__set_evsel_handler(struct perf_evlist *evlist, void *handler)
2738 {
2739 	struct perf_evsel *evsel;
2740 
2741 	evlist__for_each_entry(evlist, evsel)
2742 		evsel->handler = handler;
2743 }
2744 
2745 /*
2746  * XXX: Hackish, just splitting the combined -e+--event (syscalls
2747  * (raw_syscalls:{sys_{enter,exit}} + events (tracepoints, HW, SW, etc) to use
2748  * existing facilities unchanged (trace->ev_qualifier + parse_options()).
2749  *
2750  * It'd be better to introduce a parse_options() variant that would return a
2751  * list with the terms it didn't match to an event...
2752  */
2753 static int trace__parse_events_option(const struct option *opt, const char *str,
2754 				      int unset __maybe_unused)
2755 {
2756 	struct trace *trace = (struct trace *)opt->value;
2757 	const char *s = str;
2758 	char *sep = NULL, *lists[2] = { NULL, NULL, };
2759 	int len = strlen(str), err = -1, list;
2760 	char *strace_groups_dir = system_path(STRACE_GROUPS_DIR);
2761 	char group_name[PATH_MAX];
2762 
2763 	if (strace_groups_dir == NULL)
2764 		return -1;
2765 
2766 	if (*s == '!') {
2767 		++s;
2768 		trace->not_ev_qualifier = true;
2769 	}
2770 
2771 	while (1) {
2772 		if ((sep = strchr(s, ',')) != NULL)
2773 			*sep = '\0';
2774 
2775 		list = 0;
2776 		if (syscalltbl__id(trace->sctbl, s) >= 0) {
2777 			list = 1;
2778 		} else {
2779 			path__join(group_name, sizeof(group_name), strace_groups_dir, s);
2780 			if (access(group_name, R_OK) == 0)
2781 				list = 1;
2782 		}
2783 
2784 		if (lists[list]) {
2785 			sprintf(lists[list] + strlen(lists[list]), ",%s", s);
2786 		} else {
2787 			lists[list] = malloc(len);
2788 			if (lists[list] == NULL)
2789 				goto out;
2790 			strcpy(lists[list], s);
2791 		}
2792 
2793 		if (!sep)
2794 			break;
2795 
2796 		*sep = ',';
2797 		s = sep + 1;
2798 	}
2799 
2800 	if (lists[1] != NULL) {
2801 		struct strlist_config slist_config = {
2802 			.dirname = strace_groups_dir,
2803 		};
2804 
2805 		trace->ev_qualifier = strlist__new(lists[1], &slist_config);
2806 		if (trace->ev_qualifier == NULL) {
2807 			fputs("Not enough memory to parse event qualifier", trace->output);
2808 			goto out;
2809 		}
2810 
2811 		if (trace__validate_ev_qualifier(trace))
2812 			goto out;
2813 	}
2814 
2815 	err = 0;
2816 
2817 	if (lists[0]) {
2818 		struct option o = OPT_CALLBACK('e', "event", &trace->evlist, "event",
2819 					       "event selector. use 'perf list' to list available events",
2820 					       parse_events_option);
2821 		err = parse_events_option(&o, lists[0], 0);
2822 	}
2823 out:
2824 	if (sep)
2825 		*sep = ',';
2826 
2827 	return err;
2828 }
2829 
2830 int cmd_trace(int argc, const char **argv)
2831 {
2832 	const char *trace_usage[] = {
2833 		"perf trace [<options>] [<command>]",
2834 		"perf trace [<options>] -- <command> [<options>]",
2835 		"perf trace record [<options>] [<command>]",
2836 		"perf trace record [<options>] -- <command> [<options>]",
2837 		NULL
2838 	};
2839 	struct trace trace = {
2840 		.syscalls = {
2841 			. max = -1,
2842 		},
2843 		.opts = {
2844 			.target = {
2845 				.uid	   = UINT_MAX,
2846 				.uses_mmap = true,
2847 			},
2848 			.user_freq     = UINT_MAX,
2849 			.user_interval = ULLONG_MAX,
2850 			.no_buffering  = true,
2851 			.mmap_pages    = UINT_MAX,
2852 			.proc_map_timeout  = 500,
2853 		},
2854 		.output = stderr,
2855 		.show_comm = true,
2856 		.trace_syscalls = true,
2857 		.kernel_syscallchains = false,
2858 		.max_stack = UINT_MAX,
2859 	};
2860 	const char *output_name = NULL;
2861 	const struct option trace_options[] = {
2862 	OPT_CALLBACK('e', "event", &trace, "event",
2863 		     "event/syscall selector. use 'perf list' to list available events",
2864 		     trace__parse_events_option),
2865 	OPT_BOOLEAN(0, "comm", &trace.show_comm,
2866 		    "show the thread COMM next to its id"),
2867 	OPT_BOOLEAN(0, "tool_stats", &trace.show_tool_stats, "show tool stats"),
2868 	OPT_CALLBACK(0, "expr", &trace, "expr", "list of syscalls/events to trace",
2869 		     trace__parse_events_option),
2870 	OPT_STRING('o', "output", &output_name, "file", "output file name"),
2871 	OPT_STRING('i', "input", &input_name, "file", "Analyze events in file"),
2872 	OPT_STRING('p', "pid", &trace.opts.target.pid, "pid",
2873 		    "trace events on existing process id"),
2874 	OPT_STRING('t', "tid", &trace.opts.target.tid, "tid",
2875 		    "trace events on existing thread id"),
2876 	OPT_CALLBACK(0, "filter-pids", &trace, "CSV list of pids",
2877 		     "pids to filter (by the kernel)", trace__set_filter_pids),
2878 	OPT_BOOLEAN('a', "all-cpus", &trace.opts.target.system_wide,
2879 		    "system-wide collection from all CPUs"),
2880 	OPT_STRING('C', "cpu", &trace.opts.target.cpu_list, "cpu",
2881 		    "list of cpus to monitor"),
2882 	OPT_BOOLEAN(0, "no-inherit", &trace.opts.no_inherit,
2883 		    "child tasks do not inherit counters"),
2884 	OPT_CALLBACK('m', "mmap-pages", &trace.opts.mmap_pages, "pages",
2885 		     "number of mmap data pages",
2886 		     perf_evlist__parse_mmap_pages),
2887 	OPT_STRING('u', "uid", &trace.opts.target.uid_str, "user",
2888 		   "user to profile"),
2889 	OPT_CALLBACK(0, "duration", &trace, "float",
2890 		     "show only events with duration > N.M ms",
2891 		     trace__set_duration),
2892 	OPT_BOOLEAN(0, "sched", &trace.sched, "show blocking scheduler events"),
2893 	OPT_INCR('v', "verbose", &verbose, "be more verbose"),
2894 	OPT_BOOLEAN('T', "time", &trace.full_time,
2895 		    "Show full timestamp, not time relative to first start"),
2896 	OPT_BOOLEAN('s', "summary", &trace.summary_only,
2897 		    "Show only syscall summary with statistics"),
2898 	OPT_BOOLEAN('S', "with-summary", &trace.summary,
2899 		    "Show all syscalls and summary with statistics"),
2900 	OPT_CALLBACK_DEFAULT('F', "pf", &trace.trace_pgfaults, "all|maj|min",
2901 		     "Trace pagefaults", parse_pagefaults, "maj"),
2902 	OPT_BOOLEAN(0, "syscalls", &trace.trace_syscalls, "Trace syscalls"),
2903 	OPT_BOOLEAN('f', "force", &trace.force, "don't complain, do it"),
2904 	OPT_CALLBACK(0, "call-graph", &trace.opts,
2905 		     "record_mode[,record_size]", record_callchain_help,
2906 		     &record_parse_callchain_opt),
2907 	OPT_BOOLEAN(0, "kernel-syscall-graph", &trace.kernel_syscallchains,
2908 		    "Show the kernel callchains on the syscall exit path"),
2909 	OPT_UINTEGER(0, "min-stack", &trace.min_stack,
2910 		     "Set the minimum stack depth when parsing the callchain, "
2911 		     "anything below the specified depth will be ignored."),
2912 	OPT_UINTEGER(0, "max-stack", &trace.max_stack,
2913 		     "Set the maximum stack depth when parsing the callchain, "
2914 		     "anything beyond the specified depth will be ignored. "
2915 		     "Default: kernel.perf_event_max_stack or " __stringify(PERF_MAX_STACK_DEPTH)),
2916 	OPT_UINTEGER(0, "proc-map-timeout", &trace.opts.proc_map_timeout,
2917 			"per thread proc mmap processing timeout in ms"),
2918 	OPT_UINTEGER('D', "delay", &trace.opts.initial_delay,
2919 		     "ms to wait before starting measurement after program "
2920 		     "start"),
2921 	OPT_END()
2922 	};
2923 	bool __maybe_unused max_stack_user_set = true;
2924 	bool mmap_pages_user_set = true;
2925 	const char * const trace_subcommands[] = { "record", NULL };
2926 	int err;
2927 	char bf[BUFSIZ];
2928 
2929 	signal(SIGSEGV, sighandler_dump_stack);
2930 	signal(SIGFPE, sighandler_dump_stack);
2931 
2932 	trace.evlist = perf_evlist__new();
2933 	trace.sctbl = syscalltbl__new();
2934 
2935 	if (trace.evlist == NULL || trace.sctbl == NULL) {
2936 		pr_err("Not enough memory to run!\n");
2937 		err = -ENOMEM;
2938 		goto out;
2939 	}
2940 
2941 	argc = parse_options_subcommand(argc, argv, trace_options, trace_subcommands,
2942 				 trace_usage, PARSE_OPT_STOP_AT_NON_OPTION);
2943 
2944 	err = bpf__setup_stdout(trace.evlist);
2945 	if (err) {
2946 		bpf__strerror_setup_stdout(trace.evlist, err, bf, sizeof(bf));
2947 		pr_err("ERROR: Setup BPF stdout failed: %s\n", bf);
2948 		goto out;
2949 	}
2950 
2951 	err = -1;
2952 
2953 	if (trace.trace_pgfaults) {
2954 		trace.opts.sample_address = true;
2955 		trace.opts.sample_time = true;
2956 	}
2957 
2958 	if (trace.opts.mmap_pages == UINT_MAX)
2959 		mmap_pages_user_set = false;
2960 
2961 	if (trace.max_stack == UINT_MAX) {
2962 		trace.max_stack = input_name ? PERF_MAX_STACK_DEPTH : sysctl_perf_event_max_stack;
2963 		max_stack_user_set = false;
2964 	}
2965 
2966 #ifdef HAVE_DWARF_UNWIND_SUPPORT
2967 	if ((trace.min_stack || max_stack_user_set) && !callchain_param.enabled && trace.trace_syscalls)
2968 		record_opts__parse_callchain(&trace.opts, &callchain_param, "dwarf", false);
2969 #endif
2970 
2971 	if (callchain_param.enabled) {
2972 		if (!mmap_pages_user_set && geteuid() == 0)
2973 			trace.opts.mmap_pages = perf_event_mlock_kb_in_pages() * 4;
2974 
2975 		symbol_conf.use_callchain = true;
2976 	}
2977 
2978 	if (trace.evlist->nr_entries > 0)
2979 		evlist__set_evsel_handler(trace.evlist, trace__event_handler);
2980 
2981 	if ((argc >= 1) && (strcmp(argv[0], "record") == 0))
2982 		return trace__record(&trace, argc-1, &argv[1]);
2983 
2984 	/* summary_only implies summary option, but don't overwrite summary if set */
2985 	if (trace.summary_only)
2986 		trace.summary = trace.summary_only;
2987 
2988 	if (!trace.trace_syscalls && !trace.trace_pgfaults &&
2989 	    trace.evlist->nr_entries == 0 /* Was --events used? */) {
2990 		pr_err("Please specify something to trace.\n");
2991 		return -1;
2992 	}
2993 
2994 	if (!trace.trace_syscalls && trace.ev_qualifier) {
2995 		pr_err("The -e option can't be used with --no-syscalls.\n");
2996 		goto out;
2997 	}
2998 
2999 	if (output_name != NULL) {
3000 		err = trace__open_output(&trace, output_name);
3001 		if (err < 0) {
3002 			perror("failed to create output file");
3003 			goto out;
3004 		}
3005 	}
3006 
3007 	trace.open_id = syscalltbl__id(trace.sctbl, "open");
3008 
3009 	err = target__validate(&trace.opts.target);
3010 	if (err) {
3011 		target__strerror(&trace.opts.target, err, bf, sizeof(bf));
3012 		fprintf(trace.output, "%s", bf);
3013 		goto out_close;
3014 	}
3015 
3016 	err = target__parse_uid(&trace.opts.target);
3017 	if (err) {
3018 		target__strerror(&trace.opts.target, err, bf, sizeof(bf));
3019 		fprintf(trace.output, "%s", bf);
3020 		goto out_close;
3021 	}
3022 
3023 	if (!argc && target__none(&trace.opts.target))
3024 		trace.opts.target.system_wide = true;
3025 
3026 	if (input_name)
3027 		err = trace__replay(&trace);
3028 	else
3029 		err = trace__run(&trace, argc, argv);
3030 
3031 out_close:
3032 	if (output_name != NULL)
3033 		fclose(trace.output);
3034 out:
3035 	return err;
3036 }
3037