xref: /openbmc/linux/tools/perf/builtin-trace.c (revision d28bcd53)
1 /*
2  * builtin-trace.c
3  *
4  * Builtin 'trace' command:
5  *
6  * Display a continuously updated trace of any workload, CPU, specific PID,
7  * system wide, etc.  Default format is loosely strace like, but any other
8  * event may be specified using --event.
9  *
10  * Copyright (C) 2012, 2013, 2014, 2015 Red Hat Inc, Arnaldo Carvalho de Melo <acme@redhat.com>
11  *
12  * Initially based on the 'trace' prototype by Thomas Gleixner:
13  *
14  * http://lwn.net/Articles/415728/ ("Announcing a new utility: 'trace'")
15  *
16  * Released under the GPL v2. (and only v2, not any later version)
17  */
18 
19 #include <traceevent/event-parse.h>
20 #include <api/fs/tracing_path.h>
21 #include "builtin.h"
22 #include "util/color.h"
23 #include "util/debug.h"
24 #include "util/event.h"
25 #include "util/evlist.h"
26 #include <subcmd/exec-cmd.h>
27 #include "util/machine.h"
28 #include "util/path.h"
29 #include "util/session.h"
30 #include "util/thread.h"
31 #include <subcmd/parse-options.h>
32 #include "util/strlist.h"
33 #include "util/intlist.h"
34 #include "util/thread_map.h"
35 #include "util/stat.h"
36 #include "trace/beauty/beauty.h"
37 #include "trace-event.h"
38 #include "util/parse-events.h"
39 #include "util/bpf-loader.h"
40 #include "callchain.h"
41 #include "print_binary.h"
42 #include "string2.h"
43 #include "syscalltbl.h"
44 #include "rb_resort.h"
45 
46 #include <errno.h>
47 #include <inttypes.h>
48 #include <libaudit.h> /* FIXME: Still needed for audit_errno_to_name */
49 #include <poll.h>
50 #include <signal.h>
51 #include <stdlib.h>
52 #include <string.h>
53 #include <linux/err.h>
54 #include <linux/filter.h>
55 #include <linux/audit.h>
56 #include <linux/kernel.h>
57 #include <linux/random.h>
58 #include <linux/stringify.h>
59 #include <linux/time64.h>
60 
61 #include "sane_ctype.h"
62 
63 #ifndef O_CLOEXEC
64 # define O_CLOEXEC		02000000
65 #endif
66 
/*
 * Global state for one 'perf trace' session: the perf_tool callbacks,
 * the syscall id -> struct syscall table, the target evlist/machine and
 * every knob settable from the command line.
 */
struct trace {
	struct perf_tool	tool;
	struct syscalltbl	*sctbl;
	struct {
		int		max;	/* highest syscall id in 'table' */
		struct syscall  *table;	/* indexed by syscall id */
		struct {
			/* the raw_syscalls:{sys_enter,sys_exit} evsels */
			struct perf_evsel *sys_enter,
					  *sys_exit;
		}		events;
	} syscalls;
	struct record_opts	opts;
	struct perf_evlist	*evlist;
	struct machine		*host;
	struct thread		*current;	/* thread that emitted the last event */
	u64			base_time;	/* timestamp of the first sample */
	FILE			*output;
	unsigned long		nr_events;
	struct strlist		*ev_qualifier;	/* syscalls to trace (or skip, see not_ev_qualifier) */
	struct {
		size_t		nr;
		int		*entries;	/* ev_qualifier resolved to syscall ids */
	}			ev_qualifier_ids;
	struct {
		size_t		nr;
		pid_t		*entries;	/* PIDs to filter out (e.g. the tracer itself) */
	}			filter_pids;
	double			duration_filter;	/* only show syscalls longer than this (ms) */
	double			runtime_ms;
	struct {
		/* how many filename pointers were resolved via vfs_getname vs /proc */
		u64		vfs_getname,
				proc_getname;
	} stats;
	unsigned int		max_stack;
	unsigned int		min_stack;
	bool			not_ev_qualifier;	/* ev_qualifier lists syscalls to *skip* */
	bool			live;
	bool			full_time;
	bool			sched;
	bool			multiple_threads;
	bool			summary;
	bool			summary_only;
	bool			show_comm;
	bool			show_tool_stats;
	bool			trace_syscalls;
	bool			kernel_syscallchains;
	bool			force;
	bool			vfs_getname;	/* vfs_getname probe is in place */
	int			trace_pgfaults;	/* TRACE_PFMAJ | TRACE_PFMIN */
	int			open_id;	/* syscall id of open(), for pending_open handling */
};
118 
/*
 * Accessor for one tracepoint payload field: the byte offset into the
 * raw sample data plus a reader that returns either the (possibly
 * byte-swapped) integer value or a pointer into the payload.
 */
struct tp_field {
	int offset;
	union {
		u64 (*integer)(struct tp_field *field, struct perf_sample *sample);
		void *(*pointer)(struct tp_field *field, struct perf_sample *sample);
	};
};
126 
/*
 * Generate tp_field__u{8,16,32,64}(): read a host-endian unsigned
 * integer of the given width from the raw sample data at field->offset.
 * memcpy is used because the payload is not guaranteed to be naturally
 * aligned for the access width.
 */
#define TP_UINT_FIELD(bits) \
static u64 tp_field__u##bits(struct tp_field *field, struct perf_sample *sample) \
{ \
	u##bits value; \
	memcpy(&value, sample->raw_data + field->offset, sizeof(value)); \
	return value;  \
}

TP_UINT_FIELD(8);
TP_UINT_FIELD(16);
TP_UINT_FIELD(32);
TP_UINT_FIELD(64);
139 
/*
 * Generate tp_field__swapped_u{16,32,64}(): same as above but byte-swap
 * the value, for samples recorded on a machine of the opposite
 * endianness (no 8-bit variant: a single byte needs no swapping).
 */
#define TP_UINT_FIELD__SWAPPED(bits) \
static u64 tp_field__swapped_u##bits(struct tp_field *field, struct perf_sample *sample) \
{ \
	u##bits value; \
	memcpy(&value, sample->raw_data + field->offset, sizeof(value)); \
	return bswap_##bits(value);\
}

TP_UINT_FIELD__SWAPPED(16);
TP_UINT_FIELD__SWAPPED(32);
TP_UINT_FIELD__SWAPPED(64);
151 
152 static int tp_field__init_uint(struct tp_field *field,
153 			       struct format_field *format_field,
154 			       bool needs_swap)
155 {
156 	field->offset = format_field->offset;
157 
158 	switch (format_field->size) {
159 	case 1:
160 		field->integer = tp_field__u8;
161 		break;
162 	case 2:
163 		field->integer = needs_swap ? tp_field__swapped_u16 : tp_field__u16;
164 		break;
165 	case 4:
166 		field->integer = needs_swap ? tp_field__swapped_u32 : tp_field__u32;
167 		break;
168 	case 8:
169 		field->integer = needs_swap ? tp_field__swapped_u64 : tp_field__u64;
170 		break;
171 	default:
172 		return -1;
173 	}
174 
175 	return 0;
176 }
177 
178 static void *tp_field__ptr(struct tp_field *field, struct perf_sample *sample)
179 {
180 	return sample->raw_data + field->offset;
181 }
182 
183 static int tp_field__init_ptr(struct tp_field *field, struct format_field *format_field)
184 {
185 	field->offset = format_field->offset;
186 	field->pointer = tp_field__ptr;
187 	return 0;
188 }
189 
/*
 * Per-evsel (evsel->priv) field accessors for the raw_syscalls
 * tracepoints: 'id' is common to both, then either the entry 'args'
 * or the exit 'ret' — never both, hence the union.
 */
struct syscall_tp {
	struct tp_field id;
	union {
		struct tp_field args, ret;
	};
};
196 
197 static int perf_evsel__init_tp_uint_field(struct perf_evsel *evsel,
198 					  struct tp_field *field,
199 					  const char *name)
200 {
201 	struct format_field *format_field = perf_evsel__field(evsel, name);
202 
203 	if (format_field == NULL)
204 		return -1;
205 
206 	return tp_field__init_uint(field, format_field, evsel->needs_swap);
207 }
208 
/*
 * Initialize the syscall_tp member 'name' (id/args/ret) hanging off
 * evsel->priv from the tracepoint field of the same name.
 */
#define perf_evsel__init_sc_tp_uint_field(evsel, name) \
	({ struct syscall_tp *sc = evsel->priv;\
	   perf_evsel__init_tp_uint_field(evsel, &sc->name, #name); })
212 
/*
 * Look up the tracepoint field called 'name' on this evsel and set up
 * a pointer accessor for it. Returns 0 on success, -1 if the field
 * does not exist.
 */
static int perf_evsel__init_tp_ptr_field(struct perf_evsel *evsel,
					 struct tp_field *field,
					 const char *name)
{
	struct format_field *fmt = perf_evsel__field(evsel, name);

	return fmt ? tp_field__init_ptr(field, fmt) : -1;
}
224 
/* Pointer-field counterpart of perf_evsel__init_sc_tp_uint_field(). */
#define perf_evsel__init_sc_tp_ptr_field(evsel, name) \
	({ struct syscall_tp *sc = evsel->priv;\
	   perf_evsel__init_tp_ptr_field(evsel, &sc->name, #name); })
228 
/*
 * Free the per-evsel syscall_tp state, then the evsel itself.
 * priv must be released first: perf_evsel__delete() frees 'evsel'.
 */
static void perf_evsel__delete_priv(struct perf_evsel *evsel)
{
	zfree(&evsel->priv);
	perf_evsel__delete(evsel);
}
234 
235 static int perf_evsel__init_syscall_tp(struct perf_evsel *evsel, void *handler)
236 {
237 	evsel->priv = malloc(sizeof(struct syscall_tp));
238 	if (evsel->priv != NULL) {
239 		if (perf_evsel__init_sc_tp_uint_field(evsel, id))
240 			goto out_delete;
241 
242 		evsel->handler = handler;
243 		return 0;
244 	}
245 
246 	return -ENOMEM;
247 
248 out_delete:
249 	zfree(&evsel->priv);
250 	return -ENOENT;
251 }
252 
253 static struct perf_evsel *perf_evsel__syscall_newtp(const char *direction, void *handler)
254 {
255 	struct perf_evsel *evsel = perf_evsel__newtp("raw_syscalls", direction);
256 
257 	/* older kernel (e.g., RHEL6) use syscalls:{enter,exit} */
258 	if (IS_ERR(evsel))
259 		evsel = perf_evsel__newtp("syscalls", direction);
260 
261 	if (IS_ERR(evsel))
262 		return NULL;
263 
264 	if (perf_evsel__init_syscall_tp(evsel, handler))
265 		goto out_delete;
266 
267 	return evsel;
268 
269 out_delete:
270 	perf_evsel__delete_priv(evsel);
271 	return NULL;
272 }
273 
/* Read the syscall_tp member 'name' (id/args/ret) as an integer... */
#define perf_evsel__sc_tp_uint(evsel, name, sample) \
	({ struct syscall_tp *fields = evsel->priv; \
	   fields->name.integer(&fields->name, sample); })

/* ...or as a pointer into the raw sample payload. */
#define perf_evsel__sc_tp_ptr(evsel, name, sample) \
	({ struct syscall_tp *fields = evsel->priv; \
	   fields->name.pointer(&fields->name, sample); })
281 
/*
 * Table mapping small integer argument values to symbolic names;
 * 'offset' is subtracted from the value before indexing, for tables
 * whose first meaningful value is not zero (e.g. epoll_ctl ops).
 */
struct strarray {
	int	    offset;
	int	    nr_entries;
	const char **entries;
};

/* Wrap a string table whose values start at 0. */
#define DEFINE_STRARRAY(array) struct strarray strarray__##array = { \
	.nr_entries = ARRAY_SIZE(array), \
	.entries = array, \
}

/* Wrap a string table whose first entry corresponds to value 'off'. */
#define DEFINE_STRARRAY_OFFSET(array, off) struct strarray strarray__##array = { \
	.offset	    = off, \
	.nr_entries = ARRAY_SIZE(array), \
	.entries = array, \
}
298 
299 static size_t __syscall_arg__scnprintf_strarray(char *bf, size_t size,
300 						const char *intfmt,
301 					        struct syscall_arg *arg)
302 {
303 	struct strarray *sa = arg->parm;
304 	int idx = arg->val - sa->offset;
305 
306 	if (idx < 0 || idx >= sa->nr_entries)
307 		return scnprintf(bf, size, intfmt, arg->val);
308 
309 	return scnprintf(bf, size, "%s", sa->entries[idx]);
310 }
311 
/* strarray formatter using signed decimal for unnamed values. */
static size_t syscall_arg__scnprintf_strarray(char *bf, size_t size,
					      struct syscall_arg *arg)
{
	return __syscall_arg__scnprintf_strarray(bf, size, "%d", arg);
}
317 
318 #define SCA_STRARRAY syscall_arg__scnprintf_strarray
319 
#if defined(__i386__) || defined(__x86_64__)
/*
 * FIXME: Make this available to all arches as soon as the ioctl beautifier
 * 	  gets rewritten to support all arches.
 */
/* strarray formatter using hex for unnamed values (used for ioctl cmds). */
static size_t syscall_arg__scnprintf_strhexarray(char *bf, size_t size,
						 struct syscall_arg *arg)
{
	return __syscall_arg__scnprintf_strarray(bf, size, "%#x", arg);
}

#define SCA_STRHEXARRAY syscall_arg__scnprintf_strhexarray
#endif /* defined(__i386__) || defined(__x86_64__) */
333 
/* Defined later: resolves an fd to the pathname cached per thread. */
static size_t syscall_arg__scnprintf_fd(char *bf, size_t size,
					struct syscall_arg *arg);

#define SCA_FD syscall_arg__scnprintf_fd

/* Fallback for toolchains whose headers predate the *at() syscalls. */
#ifndef AT_FDCWD
#define AT_FDCWD	-100
#endif
341 #endif
342 
343 static size_t syscall_arg__scnprintf_fd_at(char *bf, size_t size,
344 					   struct syscall_arg *arg)
345 {
346 	int fd = arg->val;
347 
348 	if (fd == AT_FDCWD)
349 		return scnprintf(bf, size, "CWD");
350 
351 	return syscall_arg__scnprintf_fd(bf, size, arg);
352 }
353 
354 #define SCA_FDAT syscall_arg__scnprintf_fd_at
355 
356 static size_t syscall_arg__scnprintf_close_fd(char *bf, size_t size,
357 					      struct syscall_arg *arg);
358 
359 #define SCA_CLOSE_FD syscall_arg__scnprintf_close_fd
360 
/* Format an argument as hex, used for addresses and opaque values. */
static size_t syscall_arg__scnprintf_hex(char *bf, size_t size,
					 struct syscall_arg *arg)
{
	return scnprintf(bf, size, "%#lx", arg->val);
}

#define SCA_HEX syscall_arg__scnprintf_hex

/* Format an argument as a plain signed decimal integer. */
static size_t syscall_arg__scnprintf_int(char *bf, size_t size,
					 struct syscall_arg *arg)
{
	return scnprintf(bf, size, "%d", arg->val);
}

#define SCA_INT syscall_arg__scnprintf_int
376 
/* bpf(2) cmd argument, indexed by enum bpf_cmd value. */
static const char *bpf_cmd[] = {
	"MAP_CREATE", "MAP_LOOKUP_ELEM", "MAP_UPDATE_ELEM", "MAP_DELETE_ELEM",
	"MAP_GET_NEXT_KEY", "PROG_LOAD",
};
static DEFINE_STRARRAY(bpf_cmd);

/* epoll_ctl(2) op argument; EPOLL_CTL_ADD starts at 1, hence the offset. */
static const char *epoll_ctl_ops[] = { "ADD", "DEL", "MOD", };
static DEFINE_STRARRAY_OFFSET(epoll_ctl_ops, 1);

/* {get,set}itimer(2) which argument (ITIMER_*). */
static const char *itimers[] = { "REAL", "VIRTUAL", "PROF", };
static DEFINE_STRARRAY(itimers);

/* keyctl(2) option argument (KEYCTL_*). */
static const char *keyctl_options[] = {
	"GET_KEYRING_ID", "JOIN_SESSION_KEYRING", "UPDATE", "REVOKE", "CHOWN",
	"SETPERM", "DESCRIBE", "CLEAR", "LINK", "UNLINK", "SEARCH", "READ",
	"INSTANTIATE", "NEGATE", "SET_REQKEY_KEYRING", "SET_TIMEOUT",
	"ASSUME_AUTHORITY", "GET_SECURITY", "SESSION_TO_PARENT", "REJECT",
	"INSTANTIATE_IOV", "INVALIDATE", "GET_PERSISTENT",
};
static DEFINE_STRARRAY(keyctl_options);

/* lseek(2) whence argument (SEEK_*); DATA/HOLE only if libc defines them. */
static const char *whences[] = { "SET", "CUR", "END",
#ifdef SEEK_DATA
"DATA",
#endif
#ifdef SEEK_HOLE
"HOLE",
#endif
};
static DEFINE_STRARRAY(whences);

/* fcntl(2) cmd argument (F_*). */
static const char *fcntl_cmds[] = {
	"DUPFD", "GETFD", "SETFD", "GETFL", "SETFL", "GETLK", "SETLK",
	"SETLKW", "SETOWN", "GETOWN", "SETSIG", "GETSIG", "F_GETLK64",
	"F_SETLK64", "F_SETLKW64", "F_SETOWN_EX", "F_GETOWN_EX",
	"F_GETOWNER_UIDS",
};
static DEFINE_STRARRAY(fcntl_cmds);

/* {get,set}rlimit(2)/prlimit64(2) resource argument (RLIMIT_*). */
static const char *rlimit_resources[] = {
	"CPU", "FSIZE", "DATA", "STACK", "CORE", "RSS", "NPROC", "NOFILE",
	"MEMLOCK", "AS", "LOCKS", "SIGPENDING", "MSGQUEUE", "NICE", "RTPRIO",
	"RTTIME",
};
static DEFINE_STRARRAY(rlimit_resources);

/* rt_sigprocmask(2) how argument (SIG_*). */
static const char *sighow[] = { "BLOCK", "UNBLOCK", "SETMASK", };
static DEFINE_STRARRAY(sighow);

/* clock_gettime(2) clk_id argument (CLOCK_*). */
static const char *clockid[] = {
	"REALTIME", "MONOTONIC", "PROCESS_CPUTIME_ID", "THREAD_CPUTIME_ID",
	"MONOTONIC_RAW", "REALTIME_COARSE", "MONOTONIC_COARSE", "BOOTTIME",
	"REALTIME_ALARM", "BOOTTIME_ALARM", "SGI_CYCLE", "TAI"
};
static DEFINE_STRARRAY(clockid);

/* socket(2)/socketpair(2) family argument (AF_*). */
static const char *socket_families[] = {
	"UNSPEC", "LOCAL", "INET", "AX25", "IPX", "APPLETALK", "NETROM",
	"BRIDGE", "ATMPVC", "X25", "INET6", "ROSE", "DECnet", "NETBEUI",
	"SECURITY", "KEY", "NETLINK", "PACKET", "ASH", "ECONET", "ATMSVC",
	"RDS", "SNA", "IRDA", "PPPOX", "WANPIPE", "LLC", "IB", "CAN", "TIPC",
	"BLUETOOTH", "IUCV", "RXRPC", "ISDN", "PHONET", "IEEE802154", "CAIF",
	"ALG", "NFC", "VSOCK",
};
static DEFINE_STRARRAY(socket_families);
442 
443 static size_t syscall_arg__scnprintf_access_mode(char *bf, size_t size,
444 						 struct syscall_arg *arg)
445 {
446 	size_t printed = 0;
447 	int mode = arg->val;
448 
449 	if (mode == F_OK) /* 0 */
450 		return scnprintf(bf, size, "F");
451 #define	P_MODE(n) \
452 	if (mode & n##_OK) { \
453 		printed += scnprintf(bf + printed, size - printed, "%s", #n); \
454 		mode &= ~n##_OK; \
455 	}
456 
457 	P_MODE(R);
458 	P_MODE(W);
459 	P_MODE(X);
460 #undef P_MODE
461 
462 	if (mode)
463 		printed += scnprintf(bf + printed, size - printed, "|%#x", mode);
464 
465 	return printed;
466 }
467 
468 #define SCA_ACCMODE syscall_arg__scnprintf_access_mode
469 
470 static size_t syscall_arg__scnprintf_filename(char *bf, size_t size,
471 					      struct syscall_arg *arg);
472 
473 #define SCA_FILENAME syscall_arg__scnprintf_filename
474 
475 static size_t syscall_arg__scnprintf_pipe_flags(char *bf, size_t size,
476 						struct syscall_arg *arg)
477 {
478 	int printed = 0, flags = arg->val;
479 
480 #define	P_FLAG(n) \
481 	if (flags & O_##n) { \
482 		printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
483 		flags &= ~O_##n; \
484 	}
485 
486 	P_FLAG(CLOEXEC);
487 	P_FLAG(NONBLOCK);
488 #undef P_FLAG
489 
490 	if (flags)
491 		printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);
492 
493 	return printed;
494 }
495 
496 #define SCA_PIPE_FLAGS syscall_arg__scnprintf_pipe_flags
497 
#if defined(__i386__) || defined(__x86_64__)
/*
 * FIXME: Make this available to all arches.
 */
#define TCGETS		0x5401

/*
 * Terminal ioctl(2) request names, indexed by request number starting
 * at TCGETS (0x5401); the designated initializers skip the gaps in the
 * request number space.
 */
static const char *tioctls[] = {
	"TCGETS", "TCSETS", "TCSETSW", "TCSETSF", "TCGETA", "TCSETA", "TCSETAW",
	"TCSETAF", "TCSBRK", "TCXONC", "TCFLSH", "TIOCEXCL", "TIOCNXCL",
	"TIOCSCTTY", "TIOCGPGRP", "TIOCSPGRP", "TIOCOUTQ", "TIOCSTI",
	"TIOCGWINSZ", "TIOCSWINSZ", "TIOCMGET", "TIOCMBIS", "TIOCMBIC",
	"TIOCMSET", "TIOCGSOFTCAR", "TIOCSSOFTCAR", "FIONREAD", "TIOCLINUX",
	"TIOCCONS", "TIOCGSERIAL", "TIOCSSERIAL", "TIOCPKT", "FIONBIO",
	"TIOCNOTTY", "TIOCSETD", "TIOCGETD", "TCSBRKP", [0x27] = "TIOCSBRK",
	"TIOCCBRK", "TIOCGSID", "TCGETS2", "TCSETS2", "TCSETSW2", "TCSETSF2",
	"TIOCGRS485", "TIOCSRS485", "TIOCGPTN", "TIOCSPTLCK",
	"TIOCGDEV||TCGETX", "TCSETX", "TCSETXF", "TCSETXW", "TIOCSIG",
	"TIOCVHANGUP", "TIOCGPKT", "TIOCGPTLCK", "TIOCGEXCL",
	[0x50] = "FIONCLEX", "FIOCLEX", "FIOASYNC", "TIOCSERCONFIG",
	"TIOCSERGWILD", "TIOCSERSWILD", "TIOCGLCKTRMIOS", "TIOCSLCKTRMIOS",
	"TIOCSERGSTRUCT", "TIOCSERGETLSR", "TIOCSERGETMULTI", "TIOCSERSETMULTI",
	"TIOCMIWAIT", "TIOCGICOUNT", [0x60] = "FIOQSIZE",
};

static DEFINE_STRARRAY_OFFSET(tioctls, 0x5401);
#endif /* defined(__i386__) || defined(__x86_64__) */
524 
/* getrandom(2) flags; provide them for libcs whose headers lack them. */
#ifndef GRND_NONBLOCK
#define GRND_NONBLOCK	0x0001
#endif
#ifndef GRND_RANDOM
#define GRND_RANDOM	0x0002
#endif
531 
532 static size_t syscall_arg__scnprintf_getrandom_flags(char *bf, size_t size,
533 						   struct syscall_arg *arg)
534 {
535 	int printed = 0, flags = arg->val;
536 
537 #define	P_FLAG(n) \
538 	if (flags & GRND_##n) { \
539 		printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
540 		flags &= ~GRND_##n; \
541 	}
542 
543 	P_FLAG(RANDOM);
544 	P_FLAG(NONBLOCK);
545 #undef P_FLAG
546 
547 	if (flags)
548 		printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);
549 
550 	return printed;
551 }
552 
553 #define SCA_GETRANDOM_FLAGS syscall_arg__scnprintf_getrandom_flags
554 
/*
 * Shorthand for syscall_fmts entries: beautify argument 'arg' via the
 * given strarray ('name' is only documentation for the reader).
 */
#define STRARRAY(arg, name, array) \
	  .arg_scnprintf = { [arg] = SCA_STRARRAY, }, \
	  .arg_parm	 = { [arg] = &strarray__##array, }
558 
559 #include "trace/beauty/eventfd.c"
560 #include "trace/beauty/flock.c"
561 #include "trace/beauty/futex_op.c"
562 #include "trace/beauty/mmap.c"
563 #include "trace/beauty/mode_t.c"
564 #include "trace/beauty/msg_flags.c"
565 #include "trace/beauty/open_flags.c"
566 #include "trace/beauty/perf_event_open.c"
567 #include "trace/beauty/pid.c"
568 #include "trace/beauty/sched_policy.c"
569 #include "trace/beauty/seccomp.c"
570 #include "trace/beauty/signum.c"
571 #include "trace/beauty/socket_type.c"
572 #include "trace/beauty/waitid_options.c"
573 
574 static struct syscall_fmt {
575 	const char *name;
576 	const char *alias;
577 	size_t	   (*arg_scnprintf[6])(char *bf, size_t size, struct syscall_arg *arg);
578 	void	   *arg_parm[6];
579 	bool	   errmsg;
580 	bool	   errpid;
581 	bool	   timeout;
582 	bool	   hexret;
583 } syscall_fmts[] = {
584 	{ .name	    = "access",	    .errmsg = true,
585 	  .arg_scnprintf = { [1] = SCA_ACCMODE,  /* mode */ }, },
586 	{ .name	    = "arch_prctl", .errmsg = true, .alias = "prctl", },
587 	{ .name	    = "bpf",	    .errmsg = true, STRARRAY(0, cmd, bpf_cmd), },
588 	{ .name	    = "brk",	    .hexret = true,
589 	  .arg_scnprintf = { [0] = SCA_HEX, /* brk */ }, },
590 	{ .name	    = "chdir",	    .errmsg = true, },
591 	{ .name	    = "chmod",	    .errmsg = true, },
592 	{ .name	    = "chroot",	    .errmsg = true, },
593 	{ .name     = "clock_gettime",  .errmsg = true, STRARRAY(0, clk_id, clockid), },
594 	{ .name	    = "clone",	    .errpid = true, },
595 	{ .name	    = "close",	    .errmsg = true,
596 	  .arg_scnprintf = { [0] = SCA_CLOSE_FD, /* fd */ }, },
597 	{ .name	    = "connect",    .errmsg = true, },
598 	{ .name	    = "creat",	    .errmsg = true, },
599 	{ .name	    = "dup",	    .errmsg = true, },
600 	{ .name	    = "dup2",	    .errmsg = true, },
601 	{ .name	    = "dup3",	    .errmsg = true, },
602 	{ .name	    = "epoll_ctl",  .errmsg = true, STRARRAY(1, op, epoll_ctl_ops), },
603 	{ .name	    = "eventfd2",   .errmsg = true,
604 	  .arg_scnprintf = { [1] = SCA_EFD_FLAGS, /* flags */ }, },
605 	{ .name	    = "faccessat",  .errmsg = true, },
606 	{ .name	    = "fadvise64",  .errmsg = true, },
607 	{ .name	    = "fallocate",  .errmsg = true, },
608 	{ .name	    = "fchdir",	    .errmsg = true, },
609 	{ .name	    = "fchmod",	    .errmsg = true, },
610 	{ .name	    = "fchmodat",   .errmsg = true,
611 	  .arg_scnprintf = { [0] = SCA_FDAT, /* fd */ }, },
612 	{ .name	    = "fchown",	    .errmsg = true, },
613 	{ .name	    = "fchownat",   .errmsg = true,
614 	  .arg_scnprintf = { [0] = SCA_FDAT, /* fd */ }, },
615 	{ .name	    = "fcntl",	    .errmsg = true,
616 	  .arg_scnprintf = { [1] = SCA_STRARRAY, /* cmd */ },
617 	  .arg_parm	 = { [1] = &strarray__fcntl_cmds, /* cmd */ }, },
618 	{ .name	    = "fdatasync",  .errmsg = true, },
619 	{ .name	    = "flock",	    .errmsg = true,
620 	  .arg_scnprintf = { [1] = SCA_FLOCK, /* cmd */ }, },
621 	{ .name	    = "fsetxattr",  .errmsg = true, },
622 	{ .name	    = "fstat",	    .errmsg = true, .alias = "newfstat", },
623 	{ .name	    = "fstatat",    .errmsg = true, .alias = "newfstatat", },
624 	{ .name	    = "fstatfs",    .errmsg = true, },
625 	{ .name	    = "fsync",    .errmsg = true, },
626 	{ .name	    = "ftruncate", .errmsg = true, },
627 	{ .name	    = "futex",	    .errmsg = true,
628 	  .arg_scnprintf = { [1] = SCA_FUTEX_OP, /* op */ }, },
629 	{ .name	    = "futimesat", .errmsg = true,
630 	  .arg_scnprintf = { [0] = SCA_FDAT, /* fd */ }, },
631 	{ .name	    = "getdents",   .errmsg = true, },
632 	{ .name	    = "getdents64", .errmsg = true, },
633 	{ .name	    = "getitimer",  .errmsg = true, STRARRAY(0, which, itimers), },
634 	{ .name	    = "getpid",	    .errpid = true, },
635 	{ .name	    = "getpgid",    .errpid = true, },
636 	{ .name	    = "getppid",    .errpid = true, },
637 	{ .name	    = "getrandom",  .errmsg = true,
638 	  .arg_scnprintf = { [2] = SCA_GETRANDOM_FLAGS, /* flags */ }, },
639 	{ .name	    = "getrlimit",  .errmsg = true, STRARRAY(0, resource, rlimit_resources), },
640 	{ .name	    = "getxattr",   .errmsg = true, },
641 	{ .name	    = "inotify_add_watch",	    .errmsg = true, },
642 	{ .name	    = "ioctl",	    .errmsg = true,
643 	  .arg_scnprintf = {
644 #if defined(__i386__) || defined(__x86_64__)
645 /*
646  * FIXME: Make this available to all arches.
647  */
648 			     [1] = SCA_STRHEXARRAY, /* cmd */
649 			     [2] = SCA_HEX, /* arg */ },
650 	  .arg_parm	 = { [1] = &strarray__tioctls, /* cmd */ }, },
651 #else
652 			     [2] = SCA_HEX, /* arg */ }, },
653 #endif
654 	{ .name	    = "keyctl",	    .errmsg = true, STRARRAY(0, option, keyctl_options), },
655 	{ .name	    = "kill",	    .errmsg = true,
656 	  .arg_scnprintf = { [1] = SCA_SIGNUM, /* sig */ }, },
657 	{ .name	    = "lchown",    .errmsg = true, },
658 	{ .name	    = "lgetxattr",  .errmsg = true, },
659 	{ .name	    = "linkat",	    .errmsg = true,
660 	  .arg_scnprintf = { [0] = SCA_FDAT, /* fd */ }, },
661 	{ .name	    = "listxattr",  .errmsg = true, },
662 	{ .name	    = "llistxattr", .errmsg = true, },
663 	{ .name	    = "lremovexattr",  .errmsg = true, },
664 	{ .name	    = "lseek",	    .errmsg = true,
665 	  .arg_scnprintf = { [2] = SCA_STRARRAY, /* whence */ },
666 	  .arg_parm	 = { [2] = &strarray__whences, /* whence */ }, },
667 	{ .name	    = "lsetxattr",  .errmsg = true, },
668 	{ .name	    = "lstat",	    .errmsg = true, .alias = "newlstat", },
669 	{ .name	    = "lsxattr",    .errmsg = true, },
670 	{ .name     = "madvise",    .errmsg = true,
671 	  .arg_scnprintf = { [0] = SCA_HEX,	 /* start */
672 			     [2] = SCA_MADV_BHV, /* behavior */ }, },
673 	{ .name	    = "mkdir",    .errmsg = true, },
674 	{ .name	    = "mkdirat",    .errmsg = true,
675 	  .arg_scnprintf = { [0] = SCA_FDAT, /* fd */ }, },
676 	{ .name	    = "mknod",      .errmsg = true, },
677 	{ .name	    = "mknodat",    .errmsg = true,
678 	  .arg_scnprintf = { [0] = SCA_FDAT, /* fd */ }, },
679 	{ .name	    = "mlock",	    .errmsg = true,
680 	  .arg_scnprintf = { [0] = SCA_HEX, /* addr */ }, },
681 	{ .name	    = "mlockall",   .errmsg = true,
682 	  .arg_scnprintf = { [0] = SCA_HEX, /* addr */ }, },
683 	{ .name	    = "mmap",	    .hexret = true,
684 	  .arg_scnprintf = { [0] = SCA_HEX,	  /* addr */
685 			     [2] = SCA_MMAP_PROT, /* prot */
686 			     [3] = SCA_MMAP_FLAGS, /* flags */ }, },
687 	{ .name	    = "mprotect",   .errmsg = true,
688 	  .arg_scnprintf = { [0] = SCA_HEX, /* start */
689 			     [2] = SCA_MMAP_PROT, /* prot */ }, },
690 	{ .name	    = "mq_unlink", .errmsg = true,
691 	  .arg_scnprintf = { [0] = SCA_FILENAME, /* u_name */ }, },
692 	{ .name	    = "mremap",	    .hexret = true,
693 	  .arg_scnprintf = { [0] = SCA_HEX, /* addr */
694 			     [3] = SCA_MREMAP_FLAGS, /* flags */
695 			     [4] = SCA_HEX, /* new_addr */ }, },
696 	{ .name	    = "munlock",    .errmsg = true,
697 	  .arg_scnprintf = { [0] = SCA_HEX, /* addr */ }, },
698 	{ .name	    = "munmap",	    .errmsg = true,
699 	  .arg_scnprintf = { [0] = SCA_HEX, /* addr */ }, },
700 	{ .name	    = "name_to_handle_at", .errmsg = true,
701 	  .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
702 	{ .name	    = "newfstatat", .errmsg = true,
703 	  .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
704 	{ .name	    = "open",	    .errmsg = true,
705 	  .arg_scnprintf = { [1] = SCA_OPEN_FLAGS, /* flags */ }, },
706 	{ .name	    = "open_by_handle_at", .errmsg = true,
707 	  .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */
708 			     [2] = SCA_OPEN_FLAGS, /* flags */ }, },
709 	{ .name	    = "openat",	    .errmsg = true,
710 	  .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */
711 			     [2] = SCA_OPEN_FLAGS, /* flags */ }, },
712 	{ .name	    = "perf_event_open", .errmsg = true,
713 	  .arg_scnprintf = { [2] = SCA_INT, /* cpu */
714 			     [3] = SCA_FD,  /* group_fd */
715 			     [4] = SCA_PERF_FLAGS,  /* flags */ }, },
716 	{ .name	    = "pipe2",	    .errmsg = true,
717 	  .arg_scnprintf = { [1] = SCA_PIPE_FLAGS, /* flags */ }, },
718 	{ .name	    = "poll",	    .errmsg = true, .timeout = true, },
719 	{ .name	    = "ppoll",	    .errmsg = true, .timeout = true, },
720 	{ .name	    = "pread",	    .errmsg = true, .alias = "pread64", },
721 	{ .name	    = "preadv",	    .errmsg = true, .alias = "pread", },
722 	{ .name	    = "prlimit64",  .errmsg = true, STRARRAY(1, resource, rlimit_resources), },
723 	{ .name	    = "pwrite",	    .errmsg = true, .alias = "pwrite64", },
724 	{ .name	    = "pwritev",    .errmsg = true, },
725 	{ .name	    = "read",	    .errmsg = true, },
726 	{ .name	    = "readlink",   .errmsg = true, },
727 	{ .name	    = "readlinkat", .errmsg = true,
728 	  .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
729 	{ .name	    = "readv",	    .errmsg = true, },
730 	{ .name	    = "recvfrom",   .errmsg = true,
731 	  .arg_scnprintf = { [3] = SCA_MSG_FLAGS, /* flags */ }, },
732 	{ .name	    = "recvmmsg",   .errmsg = true,
733 	  .arg_scnprintf = { [3] = SCA_MSG_FLAGS, /* flags */ }, },
734 	{ .name	    = "recvmsg",    .errmsg = true,
735 	  .arg_scnprintf = { [2] = SCA_MSG_FLAGS, /* flags */ }, },
736 	{ .name	    = "removexattr", .errmsg = true, },
737 	{ .name	    = "renameat",   .errmsg = true,
738 	  .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
739 	{ .name	    = "rmdir",    .errmsg = true, },
740 	{ .name	    = "rt_sigaction", .errmsg = true,
741 	  .arg_scnprintf = { [0] = SCA_SIGNUM, /* sig */ }, },
742 	{ .name	    = "rt_sigprocmask",  .errmsg = true, STRARRAY(0, how, sighow), },
743 	{ .name	    = "rt_sigqueueinfo", .errmsg = true,
744 	  .arg_scnprintf = { [1] = SCA_SIGNUM, /* sig */ }, },
745 	{ .name	    = "rt_tgsigqueueinfo", .errmsg = true,
746 	  .arg_scnprintf = { [2] = SCA_SIGNUM, /* sig */ }, },
747 	{ .name	    = "sched_getattr",	      .errmsg = true, },
748 	{ .name	    = "sched_setattr",	      .errmsg = true, },
749 	{ .name	    = "sched_setscheduler",   .errmsg = true,
750 	  .arg_scnprintf = { [1] = SCA_SCHED_POLICY, /* policy */ }, },
751 	{ .name	    = "seccomp", .errmsg = true,
752 	  .arg_scnprintf = { [0] = SCA_SECCOMP_OP, /* op */
753 			     [1] = SCA_SECCOMP_FLAGS, /* flags */ }, },
754 	{ .name	    = "select",	    .errmsg = true, .timeout = true, },
755 	{ .name	    = "sendmmsg",    .errmsg = true,
756 	  .arg_scnprintf = { [3] = SCA_MSG_FLAGS, /* flags */ }, },
757 	{ .name	    = "sendmsg",    .errmsg = true,
758 	  .arg_scnprintf = { [2] = SCA_MSG_FLAGS, /* flags */ }, },
759 	{ .name	    = "sendto",	    .errmsg = true,
760 	  .arg_scnprintf = { [3] = SCA_MSG_FLAGS, /* flags */ }, },
761 	{ .name	    = "set_tid_address", .errpid = true, },
762 	{ .name	    = "setitimer",  .errmsg = true, STRARRAY(0, which, itimers), },
763 	{ .name	    = "setpgid",    .errmsg = true, },
764 	{ .name	    = "setrlimit",  .errmsg = true, STRARRAY(0, resource, rlimit_resources), },
765 	{ .name	    = "setxattr",   .errmsg = true, },
766 	{ .name	    = "shutdown",   .errmsg = true, },
767 	{ .name	    = "socket",	    .errmsg = true,
768 	  .arg_scnprintf = { [0] = SCA_STRARRAY, /* family */
769 			     [1] = SCA_SK_TYPE, /* type */ },
770 	  .arg_parm	 = { [0] = &strarray__socket_families, /* family */ }, },
771 	{ .name	    = "socketpair", .errmsg = true,
772 	  .arg_scnprintf = { [0] = SCA_STRARRAY, /* family */
773 			     [1] = SCA_SK_TYPE, /* type */ },
774 	  .arg_parm	 = { [0] = &strarray__socket_families, /* family */ }, },
775 	{ .name	    = "stat",	    .errmsg = true, .alias = "newstat", },
776 	{ .name	    = "statfs",	    .errmsg = true, },
777 	{ .name	    = "statx",	    .errmsg = true,
778 	  .arg_scnprintf = { [0] = SCA_FDAT, /* flags */
779 			     [2] = SCA_STATX_FLAGS, /* flags */
780 			     [3] = SCA_STATX_MASK, /* mask */ }, },
781 	{ .name	    = "swapoff",    .errmsg = true,
782 	  .arg_scnprintf = { [0] = SCA_FILENAME, /* specialfile */ }, },
783 	{ .name	    = "swapon",	    .errmsg = true,
784 	  .arg_scnprintf = { [0] = SCA_FILENAME, /* specialfile */ }, },
785 	{ .name	    = "symlinkat",  .errmsg = true,
786 	  .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
787 	{ .name	    = "tgkill",	    .errmsg = true,
788 	  .arg_scnprintf = { [2] = SCA_SIGNUM, /* sig */ }, },
789 	{ .name	    = "tkill",	    .errmsg = true,
790 	  .arg_scnprintf = { [1] = SCA_SIGNUM, /* sig */ }, },
791 	{ .name	    = "truncate",   .errmsg = true, },
792 	{ .name	    = "uname",	    .errmsg = true, .alias = "newuname", },
793 	{ .name	    = "unlinkat",   .errmsg = true,
794 	  .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
795 	{ .name	    = "utime",  .errmsg = true, },
796 	{ .name	    = "utimensat",  .errmsg = true,
797 	  .arg_scnprintf = { [0] = SCA_FDAT, /* dirfd */ }, },
798 	{ .name	    = "utimes",  .errmsg = true, },
799 	{ .name	    = "vmsplice",  .errmsg = true, },
800 	{ .name	    = "wait4",	    .errpid = true,
801 	  .arg_scnprintf = { [2] = SCA_WAITID_OPTIONS, /* options */ }, },
802 	{ .name	    = "waitid",	    .errpid = true,
803 	  .arg_scnprintf = { [3] = SCA_WAITID_OPTIONS, /* options */ }, },
804 	{ .name	    = "write",	    .errmsg = true, },
805 	{ .name	    = "writev",	    .errmsg = true, },
806 };
807 
808 static int syscall_fmt__cmp(const void *name, const void *fmtp)
809 {
810 	const struct syscall_fmt *fmt = fmtp;
811 	return strcmp(name, fmt->name);
812 }
813 
814 static struct syscall_fmt *syscall_fmt__find(const char *name)
815 {
816 	const int nmemb = ARRAY_SIZE(syscall_fmts);
817 	return bsearch(name, syscall_fmts, nmemb, sizeof(struct syscall_fmt), syscall_fmt__cmp);
818 }
819 
/*
 * Everything known about one syscall: its tracepoint format, argument
 * descriptions and the (optional) formatting hints from syscall_fmts.
 */
struct syscall {
	struct event_format *tp_format;
	int		    nr_args;
	struct format_field *args;	/* linked list of argument fields */
	const char	    *name;
	bool		    is_exit;	/* exit_group() etc: no sys_exit will follow */
	struct syscall_fmt  *fmt;	/* may be NULL */
	size_t		    (**arg_scnprintf)(char *bf, size_t size, struct syscall_arg *arg);
	void		    **arg_parm;
};
830 
831 /*
832  * We need to have this 'calculated' boolean because in some cases we really
833  * don't know what is the duration of a syscall, for instance, when we start
834  * a session and some threads are waiting for a syscall to finish, say 'poll',
835  * in which case all we can do is to print "( ? ) for duration and for the
836  * start timestamp.
837  */
838 static size_t fprintf_duration(unsigned long t, bool calculated, FILE *fp)
839 {
840 	double duration = (double)t / NSEC_PER_MSEC;
841 	size_t printed = fprintf(fp, "(");
842 
843 	if (!calculated)
844 		printed += fprintf(fp, "     ?   ");
845 	else if (duration >= 1.0)
846 		printed += color_fprintf(fp, PERF_COLOR_RED, "%6.3f ms", duration);
847 	else if (duration >= 0.01)
848 		printed += color_fprintf(fp, PERF_COLOR_YELLOW, "%6.3f ms", duration);
849 	else
850 		printed += color_fprintf(fp, PERF_COLOR_NORMAL, "%6.3f ms", duration);
851 	return printed + fprintf(fp, "): ");
852 }
853 
/**
 * Per-thread (thread->priv) tracing state.
 *
 * filename.ptr: The filename char pointer that will be vfs_getname'd
 * filename.entry_str_pos: Where to insert the string translated from
 *                         filename.ptr by the vfs_getname tracepoint/kprobe.
 */
struct thread_trace {
	u64		  entry_time;	/* timestamp of the pending sys_enter */
	bool		  entry_pending;	/* sys_enter printed, awaiting sys_exit */
	unsigned long	  nr_events;
	unsigned long	  pfmaj, pfmin;	/* major/minor page fault counts */
	char		  *entry_str;	/* formatted sys_enter line, trace__entry_str_size bytes */
	double		  runtime_ms;
        struct {
		unsigned long ptr;
		short int     entry_str_pos;
		bool	      pending_open;
		unsigned int  namelen;
		char	      *name;
	} filename;
	struct {
		/* fd -> pathname cache; 'max' is the highest fd seen, -1 when empty */
		int	  max;
		char	  **table;
	} paths;

	struct intlist *syscall_stats;
};
880 
881 static struct thread_trace *thread_trace__new(void)
882 {
883 	struct thread_trace *ttrace =  zalloc(sizeof(struct thread_trace));
884 
885 	if (ttrace)
886 		ttrace->paths.max = -1;
887 
888 	ttrace->syscall_stats = intlist__new(NULL);
889 
890 	return ttrace;
891 }
892 
893 static struct thread_trace *thread__trace(struct thread *thread, FILE *fp)
894 {
895 	struct thread_trace *ttrace;
896 
897 	if (thread == NULL)
898 		goto fail;
899 
900 	if (thread__priv(thread) == NULL)
901 		thread__set_priv(thread, thread_trace__new());
902 
903 	if (thread__priv(thread) == NULL)
904 		goto fail;
905 
906 	ttrace = thread__priv(thread);
907 	++ttrace->nr_events;
908 
909 	return ttrace;
910 fail:
911 	color_fprintf(fp, PERF_COLOR_RED,
912 		      "WARNING: not enough memory, dropping samples!\n");
913 	return NULL;
914 }
915 
916 #define TRACE_PFMAJ		(1 << 0)
917 #define TRACE_PFMIN		(1 << 1)
918 
919 static const size_t trace__entry_str_size = 2048;
920 
921 static int trace__set_fd_pathname(struct thread *thread, int fd, const char *pathname)
922 {
923 	struct thread_trace *ttrace = thread__priv(thread);
924 
925 	if (fd > ttrace->paths.max) {
926 		char **npath = realloc(ttrace->paths.table, (fd + 1) * sizeof(char *));
927 
928 		if (npath == NULL)
929 			return -1;
930 
931 		if (ttrace->paths.max != -1) {
932 			memset(npath + ttrace->paths.max + 1, 0,
933 			       (fd - ttrace->paths.max) * sizeof(char *));
934 		} else {
935 			memset(npath, 0, (fd + 1) * sizeof(char *));
936 		}
937 
938 		ttrace->paths.table = npath;
939 		ttrace->paths.max   = fd;
940 	}
941 
942 	ttrace->paths.table[fd] = strdup(pathname);
943 
944 	return ttrace->paths.table[fd] != NULL ? 0 : -1;
945 }
946 
947 static int thread__read_fd_path(struct thread *thread, int fd)
948 {
949 	char linkname[PATH_MAX], pathname[PATH_MAX];
950 	struct stat st;
951 	int ret;
952 
953 	if (thread->pid_ == thread->tid) {
954 		scnprintf(linkname, sizeof(linkname),
955 			  "/proc/%d/fd/%d", thread->pid_, fd);
956 	} else {
957 		scnprintf(linkname, sizeof(linkname),
958 			  "/proc/%d/task/%d/fd/%d", thread->pid_, thread->tid, fd);
959 	}
960 
961 	if (lstat(linkname, &st) < 0 || st.st_size + 1 > (off_t)sizeof(pathname))
962 		return -1;
963 
964 	ret = readlink(linkname, pathname, sizeof(pathname));
965 
966 	if (ret < 0 || ret > st.st_size)
967 		return -1;
968 
969 	pathname[ret] = '\0';
970 	return trace__set_fd_pathname(thread, fd, pathname);
971 }
972 
973 static const char *thread__fd_path(struct thread *thread, int fd,
974 				   struct trace *trace)
975 {
976 	struct thread_trace *ttrace = thread__priv(thread);
977 
978 	if (ttrace == NULL)
979 		return NULL;
980 
981 	if (fd < 0)
982 		return NULL;
983 
984 	if ((fd > ttrace->paths.max || ttrace->paths.table[fd] == NULL)) {
985 		if (!trace->live)
986 			return NULL;
987 		++trace->stats.proc_getname;
988 		if (thread__read_fd_path(thread, fd))
989 			return NULL;
990 	}
991 
992 	return ttrace->paths.table[fd];
993 }
994 
995 static size_t syscall_arg__scnprintf_fd(char *bf, size_t size,
996 					struct syscall_arg *arg)
997 {
998 	int fd = arg->val;
999 	size_t printed = scnprintf(bf, size, "%d", fd);
1000 	const char *path = thread__fd_path(arg->thread, fd, arg->trace);
1001 
1002 	if (path)
1003 		printed += scnprintf(bf + printed, size - printed, "<%s>", path);
1004 
1005 	return printed;
1006 }
1007 
1008 static size_t syscall_arg__scnprintf_close_fd(char *bf, size_t size,
1009 					      struct syscall_arg *arg)
1010 {
1011 	int fd = arg->val;
1012 	size_t printed = syscall_arg__scnprintf_fd(bf, size, arg);
1013 	struct thread_trace *ttrace = thread__priv(arg->thread);
1014 
1015 	if (ttrace && fd >= 0 && fd <= ttrace->paths.max)
1016 		zfree(&ttrace->paths.table[fd]);
1017 
1018 	return printed;
1019 }
1020 
1021 static void thread__set_filename_pos(struct thread *thread, const char *bf,
1022 				     unsigned long ptr)
1023 {
1024 	struct thread_trace *ttrace = thread__priv(thread);
1025 
1026 	ttrace->filename.ptr = ptr;
1027 	ttrace->filename.entry_str_pos = bf - ttrace->entry_str;
1028 }
1029 
1030 static size_t syscall_arg__scnprintf_filename(char *bf, size_t size,
1031 					      struct syscall_arg *arg)
1032 {
1033 	unsigned long ptr = arg->val;
1034 
1035 	if (!arg->trace->vfs_getname)
1036 		return scnprintf(bf, size, "%#x", ptr);
1037 
1038 	thread__set_filename_pos(arg->thread, bf, ptr);
1039 	return 0;
1040 }
1041 
1042 static bool trace__filter_duration(struct trace *trace, double t)
1043 {
1044 	return t < (trace->duration_filter * NSEC_PER_MSEC);
1045 }
1046 
1047 static size_t __trace__fprintf_tstamp(struct trace *trace, u64 tstamp, FILE *fp)
1048 {
1049 	double ts = (double)(tstamp - trace->base_time) / NSEC_PER_MSEC;
1050 
1051 	return fprintf(fp, "%10.3f ", ts);
1052 }
1053 
1054 /*
1055  * We're handling tstamp=0 as an undefined tstamp, i.e. like when we are
1056  * using ttrace->entry_time for a thread that receives a sys_exit without
1057  * first having received a sys_enter ("poll" issued before tracing session
1058  * starts, lost sys_enter exit due to ring buffer overflow).
1059  */
1060 static size_t trace__fprintf_tstamp(struct trace *trace, u64 tstamp, FILE *fp)
1061 {
1062 	if (tstamp > 0)
1063 		return __trace__fprintf_tstamp(trace, tstamp, fp);
1064 
1065 	return fprintf(fp, "         ? ");
1066 }
1067 
/*
 * Flags set from the signal handler and polled by the main loop.
 *
 * Fix: they must be 'volatile' so the compiler re-reads them on every
 * loop iteration instead of caching them in a register; without it the
 * main loop may never observe the handler's write.
 */
static volatile bool done = false;
static volatile bool interrupted = false;	/* SIGINT specifically (vs e.g. SIGCHLD) */

static void sig_handler(int sig)
{
	done = true;
	interrupted = sig == SIGINT;
}
1076 
1077 static size_t trace__fprintf_entry_head(struct trace *trace, struct thread *thread,
1078 					u64 duration, bool duration_calculated, u64 tstamp, FILE *fp)
1079 {
1080 	size_t printed = trace__fprintf_tstamp(trace, tstamp, fp);
1081 	printed += fprintf_duration(duration, duration_calculated, fp);
1082 
1083 	if (trace->multiple_threads) {
1084 		if (trace->show_comm)
1085 			printed += fprintf(fp, "%.14s/", thread__comm_str(thread));
1086 		printed += fprintf(fp, "%d ", thread->tid);
1087 	}
1088 
1089 	return printed;
1090 }
1091 
1092 static int trace__process_event(struct trace *trace, struct machine *machine,
1093 				union perf_event *event, struct perf_sample *sample)
1094 {
1095 	int ret = 0;
1096 
1097 	switch (event->header.type) {
1098 	case PERF_RECORD_LOST:
1099 		color_fprintf(trace->output, PERF_COLOR_RED,
1100 			      "LOST %" PRIu64 " events!\n", event->lost.lost);
1101 		ret = machine__process_lost_event(machine, event, sample);
1102 		break;
1103 	default:
1104 		ret = machine__process_event(machine, event, sample);
1105 		break;
1106 	}
1107 
1108 	return ret;
1109 }
1110 
1111 static int trace__tool_process(struct perf_tool *tool,
1112 			       union perf_event *event,
1113 			       struct perf_sample *sample,
1114 			       struct machine *machine)
1115 {
1116 	struct trace *trace = container_of(tool, struct trace, tool);
1117 	return trace__process_event(trace, machine, event, sample);
1118 }
1119 
1120 static char *trace__machine__resolve_kernel_addr(void *vmachine, unsigned long long *addrp, char **modp)
1121 {
1122 	struct machine *machine = vmachine;
1123 
1124 	if (machine->kptr_restrict_warned)
1125 		return NULL;
1126 
1127 	if (symbol_conf.kptr_restrict) {
1128 		pr_warning("Kernel address maps (/proc/{kallsyms,modules}) are restricted.\n\n"
1129 			   "Check /proc/sys/kernel/kptr_restrict.\n\n"
1130 			   "Kernel samples will not be resolved.\n");
1131 		machine->kptr_restrict_warned = true;
1132 		return NULL;
1133 	}
1134 
1135 	return machine__resolve_kernel_addr(vmachine, addrp, modp);
1136 }
1137 
/*
 * Set up symbol resolution and the host machine representation, then
 * synthesize events for already-running threads so their maps/comms can
 * be resolved. Returns 0 on success or a negative errno-style value;
 * symbol state is torn down again if thread synthesis fails.
 */
static int trace__symbols_init(struct trace *trace, struct perf_evlist *evlist)
{
	int err = symbol__init(NULL);

	if (err)
		return err;

	trace->host = machine__new_host();
	if (trace->host == NULL)
		return -ENOMEM;

	/* Route kernel address lookups through the kptr_restrict-aware wrapper. */
	if (trace_event__register_resolver(trace->host, trace__machine__resolve_kernel_addr) < 0)
		return -errno;

	err = __machine__synthesize_threads(trace->host, &trace->tool, &trace->opts.target,
					    evlist->threads, trace__tool_process, false,
					    trace->opts.proc_map_timeout);
	if (err)
		symbol__exit();

	return err;
}
1160 
/*
 * Choose a pretty-printer for each argument of 'sc', preferring explicit
 * syscall_fmts overrides and falling back to heuristics on the tracepoint
 * field types/names. Returns 0 on success, -1 on allocation failure.
 *
 * NOTE(review): 'idx' indexes fmt->arg_scnprintf without a bound check —
 * assumes tracepoints never exceed the 6-argument syscall maximum; confirm
 * against the syscall_fmt declaration.
 */
static int syscall__set_arg_fmts(struct syscall *sc)
{
	struct format_field *field;
	int idx = 0, len;

	sc->arg_scnprintf = calloc(sc->nr_args, sizeof(void *));
	if (sc->arg_scnprintf == NULL)
		return -1;

	if (sc->fmt)
		sc->arg_parm = sc->fmt->arg_parm;

	for (field = sc->args; field; field = field->next) {
		/* An explicit per-arg formatter from syscall_fmts wins. */
		if (sc->fmt && sc->fmt->arg_scnprintf[idx])
			sc->arg_scnprintf[idx] = sc->fmt->arg_scnprintf[idx];
		/* Filename-ish string pointers get the vfs_getname treatment. */
		else if (strcmp(field->type, "const char *") == 0 &&
			 (strcmp(field->name, "filename") == 0 ||
			  strcmp(field->name, "path") == 0 ||
			  strcmp(field->name, "pathname") == 0))
			sc->arg_scnprintf[idx] = SCA_FILENAME;
		else if (field->flags & FIELD_IS_POINTER)
			sc->arg_scnprintf[idx] = syscall_arg__scnprintf_hex;
		else if (strcmp(field->type, "pid_t") == 0)
			sc->arg_scnprintf[idx] = SCA_PID;
		else if (strcmp(field->type, "umode_t") == 0)
			sc->arg_scnprintf[idx] = SCA_MODE_T;
		else if ((strcmp(field->type, "int") == 0 ||
			  strcmp(field->type, "unsigned int") == 0 ||
			  strcmp(field->type, "long") == 0) &&
			 (len = strlen(field->name)) >= 2 &&
			 strcmp(field->name + len - 2, "fd") == 0) {
			/*
			 * Integer fields named "*fd" are file descriptors,
			 * per the sys_enter_* formats:
			 *
			 * /sys/kernel/tracing/events/syscalls/sys_enter*
			 * egrep 'field:.*fd;' .../format|sed -r 's/.*field:([a-z ]+) [a-z_]*fd.+/\1/g'|sort|uniq -c
			 * 65 int
			 * 23 unsigned int
			 * 7 unsigned long
			 */
			sc->arg_scnprintf[idx] = SCA_FD;
		}
		++idx;
	}

	return 0;
}
1206 
/*
 * Lazily populate trace->syscalls.table[id]: resolve the syscall name,
 * its sys_enter tracepoint format and per-argument pretty-printers,
 * growing the table as needed. Returns 0 on success, -1 on unknown id,
 * allocation failure or missing tracepoint.
 */
static int trace__read_syscall_info(struct trace *trace, int id)
{
	char tp_name[128];
	struct syscall *sc;
	const char *name = syscalltbl__name(trace->sctbl, id);

	if (name == NULL)
		return -1;

	/* Grow the table up to 'id', zeroing only the newly added slots. */
	if (id > trace->syscalls.max) {
		struct syscall *nsyscalls = realloc(trace->syscalls.table, (id + 1) * sizeof(*sc));

		if (nsyscalls == NULL)
			return -1;

		if (trace->syscalls.max != -1) {
			memset(nsyscalls + trace->syscalls.max + 1, 0,
			       (id - trace->syscalls.max) * sizeof(*sc));
		} else {
			/* max == -1: fresh table, clear everything. */
			memset(nsyscalls, 0, (id + 1) * sizeof(*sc));
		}

		trace->syscalls.table = nsyscalls;
		trace->syscalls.max   = id;
	}

	sc = trace->syscalls.table + id;
	sc->name = name;

	sc->fmt  = syscall_fmt__find(sc->name);

	/* Try syscalls:sys_enter_<name>, then the fmt alias as a fallback. */
	snprintf(tp_name, sizeof(tp_name), "sys_enter_%s", sc->name);
	sc->tp_format = trace_event__tp_format("syscalls", tp_name);

	if (IS_ERR(sc->tp_format) && sc->fmt && sc->fmt->alias) {
		snprintf(tp_name, sizeof(tp_name), "sys_enter_%s", sc->fmt->alias);
		sc->tp_format = trace_event__tp_format("syscalls", tp_name);
	}

	if (IS_ERR(sc->tp_format))
		return -1;

	sc->args = sc->tp_format->format.fields;
	sc->nr_args = sc->tp_format->format.nr_fields;
	/*
	 * We need to check and discard the first variable '__syscall_nr'
	 * or 'nr' that mean the syscall number. It is needless here.
	 * So drop '__syscall_nr' or 'nr' field but does not exist on older kernels.
	 */
	if (sc->args && (!strcmp(sc->args->name, "__syscall_nr") || !strcmp(sc->args->name, "nr"))) {
		sc->args = sc->args->next;
		--sc->nr_args;
	}

	sc->is_exit = !strcmp(name, "exit_group") || !strcmp(name, "exit");

	return syscall__set_arg_fmts(sc);
}
1265 
/*
 * Translate the -e syscall-name list (trace->ev_qualifier) into numeric ids
 * in trace->ev_qualifier_ids, collecting all invalid names into a single
 * error line. Returns 0 on success, -EINVAL on OOM or any invalid name
 * (in which case the ids array is freed again).
 */
static int trace__validate_ev_qualifier(struct trace *trace)
{
	int err = 0, i;
	struct str_node *pos;

	trace->ev_qualifier_ids.nr = strlist__nr_entries(trace->ev_qualifier);
	trace->ev_qualifier_ids.entries = malloc(trace->ev_qualifier_ids.nr *
						 sizeof(trace->ev_qualifier_ids.entries[0]));

	if (trace->ev_qualifier_ids.entries == NULL) {
		fputs("Error:\tNot enough memory for allocating events qualifier ids\n",
		       trace->output);
		err = -EINVAL;
		goto out;
	}

	i = 0;

	strlist__for_each_entry(pos, trace->ev_qualifier) {
		const char *sc = pos->s;
		int id = syscalltbl__id(trace->sctbl, sc);

		if (id < 0) {
			/* First bad name opens the message, later ones append. */
			if (err == 0) {
				fputs("Error:\tInvalid syscall ", trace->output);
				err = -EINVAL;
			} else {
				fputs(", ", trace->output);
			}

			fputs(sc, trace->output);
		}

		/* Invalid (negative) ids are stored too; discarded below on error. */
		trace->ev_qualifier_ids.entries[i++] = id;
	}

	if (err < 0) {
		fputs("\nHint:\ttry 'perf list syscalls:sys_enter_*'"
		      "\nHint:\tand: 'man syscalls'\n", trace->output);
		zfree(&trace->ev_qualifier_ids.entries);
		trace->ev_qualifier_ids.nr = 0;
	}
out:
	return err;
}
1311 
1312 /*
1313  * args is to be interpreted as a series of longs but we need to handle
1314  * 8-byte unaligned accesses. args points to raw_data within the event
1315  * and raw_data is guaranteed to be 8-byte unaligned because it is
1316  * preceded by raw_size which is a u32. So we need to copy args to a temp
1317  * variable to read it. Most notably this avoids extended load instructions
1318  * on unaligned addresses
1319  */
1320 
/*
 * Format the sys_enter arguments of 'sc' into 'bf'. Zero-valued arguments
 * with no strarray mapping are suppressed to keep lines short; when the
 * tracepoint format could not be read, all six raw longs are dumped.
 * Returns the number of characters written.
 */
static size_t syscall__scnprintf_args(struct syscall *sc, char *bf, size_t size,
				      unsigned char *args, struct trace *trace,
				      struct thread *thread)
{
	size_t printed = 0;
	unsigned char *p;
	unsigned long val;

	if (sc->args != NULL) {
		struct format_field *field;
		u8 bit = 1;	/* arg.mask bit for the current argument */
		struct syscall_arg arg = {
			.idx	= 0,
			.mask	= 0,
			.trace  = trace,
			.thread = thread,
		};

		for (field = sc->args; field;
		     field = field->next, ++arg.idx, bit <<= 1) {
			/* A formatter may have consumed this arg already. */
			if (arg.mask & bit)
				continue;

			/* special care for unaligned accesses */
			p = args + sizeof(unsigned long) * arg.idx;
			memcpy(&val, p, sizeof(val));

			/*
			 * Suppress this argument if its value is zero and
			 * we don't have a string associated in an
			 * strarray for it.
			 */
			if (val == 0 &&
			    !(sc->arg_scnprintf &&
			      sc->arg_scnprintf[arg.idx] == SCA_STRARRAY &&
			      sc->arg_parm[arg.idx]))
				continue;

			printed += scnprintf(bf + printed, size - printed,
					     "%s%s: ", printed ? ", " : "", field->name);
			if (sc->arg_scnprintf && sc->arg_scnprintf[arg.idx]) {
				arg.val = val;
				if (sc->arg_parm)
					arg.parm = sc->arg_parm[arg.idx];
				printed += sc->arg_scnprintf[arg.idx](bf + printed,
								      size - printed, &arg);
			} else {
				printed += scnprintf(bf + printed, size - printed,
						     "%ld", val);
			}
		}
	} else if (IS_ERR(sc->tp_format)) {
		/*
		 * If we managed to read the tracepoint /format file, then we
		 * may end up not having any args, like with gettid(), so only
		 * print the raw args when we didn't manage to read it.
		 */
		int i = 0;

		while (i < 6) {
			/* special care for unaligned accesses */
			p = args + sizeof(unsigned long) * i;
			memcpy(&val, p, sizeof(val));
			printed += scnprintf(bf + printed, size - printed,
					     "%sarg%d: %ld",
					     printed ? ", " : "", i, val);
			++i;
		}
	}

	return printed;
}
1393 
1394 typedef int (*tracepoint_handler)(struct trace *trace, struct perf_evsel *evsel,
1395 				  union perf_event *event,
1396 				  struct perf_sample *sample);
1397 
/*
 * Return the (lazily loaded) descriptor for syscall 'id', or NULL when the
 * id is invalid or its information cannot be read.
 */
static struct syscall *trace__syscall_info(struct trace *trace,
					   struct perf_evsel *evsel, int id)
{

	if (id < 0) {

		/*
		 * XXX: Noticed on x86_64, reproduced as far back as 3.0.36, haven't tried
		 * before that, leaving at a higher verbosity level till that is
		 * explained. Reproduced with plain ftrace with:
		 *
		 * echo 1 > /t/events/raw_syscalls/sys_exit/enable
		 * grep "NR -1 " /t/trace_pipe
		 *
		 * After generating some load on the machine.
 		 */
		if (verbose > 1) {
			static u64 n;
			fprintf(trace->output, "Invalid syscall %d id, skipping (%s, %" PRIu64 ") ...\n",
				id, perf_evsel__name(evsel), ++n);
		}
		return NULL;
	}

	/* Load on first use; re-check afterwards because loading may fail. */
	if ((id > trace->syscalls.max || trace->syscalls.table[id].name == NULL) &&
	    trace__read_syscall_info(trace, id))
		goto out_cant_read;

	if ((id > trace->syscalls.max || trace->syscalls.table[id].name == NULL))
		goto out_cant_read;

	return &trace->syscalls.table[id];

out_cant_read:
	if (verbose > 0) {
		fprintf(trace->output, "Problems reading syscall %d", id);
		if (id <= trace->syscalls.max && trace->syscalls.table[id].name != NULL)
			fprintf(trace->output, "(%s)", trace->syscalls.table[id].name);
		fputs(" information\n", trace->output);
	}
	return NULL;
}
1440 
1441 static void thread__update_stats(struct thread_trace *ttrace,
1442 				 int id, struct perf_sample *sample)
1443 {
1444 	struct int_node *inode;
1445 	struct stats *stats;
1446 	u64 duration = 0;
1447 
1448 	inode = intlist__findnew(ttrace->syscall_stats, id);
1449 	if (inode == NULL)
1450 		return;
1451 
1452 	stats = inode->priv;
1453 	if (stats == NULL) {
1454 		stats = malloc(sizeof(struct stats));
1455 		if (stats == NULL)
1456 			return;
1457 		init_stats(stats);
1458 		inode->priv = stats;
1459 	}
1460 
1461 	if (ttrace->entry_time && sample->time > ttrace->entry_time)
1462 		duration = sample->time - ttrace->entry_time;
1463 
1464 	update_stats(stats, duration);
1465 }
1466 
1467 static int trace__printf_interrupted_entry(struct trace *trace, struct perf_sample *sample)
1468 {
1469 	struct thread_trace *ttrace;
1470 	u64 duration;
1471 	size_t printed;
1472 
1473 	if (trace->current == NULL)
1474 		return 0;
1475 
1476 	ttrace = thread__priv(trace->current);
1477 
1478 	if (!ttrace->entry_pending)
1479 		return 0;
1480 
1481 	duration = sample->time - ttrace->entry_time;
1482 
1483 	printed  = trace__fprintf_entry_head(trace, trace->current, duration, true, ttrace->entry_time, trace->output);
1484 	printed += fprintf(trace->output, "%-70s) ...\n", ttrace->entry_str);
1485 	ttrace->entry_pending = false;
1486 
1487 	return printed;
1488 }
1489 
/*
 * raw_syscalls:sys_enter handler: format "name(args" into the thread's
 * entry_str. For exit-like syscalls (which never return) the line is
 * printed immediately; otherwise it stays pending until the matching
 * sys_exit. Returns 0 on success, -1 on failure.
 */
static int trace__sys_enter(struct trace *trace, struct perf_evsel *evsel,
			    union perf_event *event __maybe_unused,
			    struct perf_sample *sample)
{
	char *msg;
	void *args;
	size_t printed = 0;
	struct thread *thread;
	int id = perf_evsel__sc_tp_uint(evsel, id, sample), err = -1;
	struct syscall *sc = trace__syscall_info(trace, evsel, id);
	struct thread_trace *ttrace;

	if (sc == NULL)
		return -1;

	thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
	ttrace = thread__trace(thread, trace->output);
	if (ttrace == NULL)
		goto out_put;

	args = perf_evsel__sc_tp_ptr(evsel, args, sample);

	/* Lazily allocate the buffer that holds the formatted enter line. */
	if (ttrace->entry_str == NULL) {
		ttrace->entry_str = malloc(trace__entry_str_size);
		if (!ttrace->entry_str)
			goto out_put;
	}

	/* Flush any pending line from the previously-current thread first. */
	if (!(trace->duration_filter || trace->summary_only || trace->min_stack))
		trace__printf_interrupted_entry(trace, sample);

	ttrace->entry_time = sample->time;
	msg = ttrace->entry_str;
	printed += scnprintf(msg + printed, trace__entry_str_size - printed, "%s(", sc->name);

	printed += syscall__scnprintf_args(sc, msg + printed, trace__entry_str_size - printed,
					   args, trace, thread);

	if (sc->is_exit) {
		/* exit()/exit_group() never return: print the line right away. */
		if (!(trace->duration_filter || trace->summary_only || trace->min_stack)) {
			trace__fprintf_entry_head(trace, thread, 0, false, ttrace->entry_time, trace->output);
			fprintf(trace->output, "%-70s)\n", ttrace->entry_str);
		}
	} else {
		ttrace->entry_pending = true;
		/* See trace__vfs_getname & trace__sys_exit */
		ttrace->filename.pending_open = false;
	}

	/* Track the current thread so an interleaving enter can flush us. */
	if (trace->current != thread) {
		thread__put(trace->current);
		trace->current = thread__get(thread);
	}
	err = 0;
out_put:
	thread__put(thread);
	return err;
}
1548 
1549 static int trace__resolve_callchain(struct trace *trace, struct perf_evsel *evsel,
1550 				    struct perf_sample *sample,
1551 				    struct callchain_cursor *cursor)
1552 {
1553 	struct addr_location al;
1554 
1555 	if (machine__resolve(trace->host, &al, sample) < 0 ||
1556 	    thread__resolve_callchain(al.thread, cursor, evsel, sample, NULL, NULL, trace->max_stack))
1557 		return -1;
1558 
1559 	return 0;
1560 }
1561 
1562 static int trace__fprintf_callchain(struct trace *trace, struct perf_sample *sample)
1563 {
1564 	/* TODO: user-configurable print_opts */
1565 	const unsigned int print_opts = EVSEL__PRINT_SYM |
1566 				        EVSEL__PRINT_DSO |
1567 				        EVSEL__PRINT_UNKNOWN_AS_ADDR;
1568 
1569 	return sample__fprintf_callchain(sample, 38, print_opts, &callchain_cursor, trace->output);
1570 }
1571 
/*
 * raw_syscalls:sys_exit handler: compute the duration, print the pending
 * enter line (or a "continued" marker when the enter was flushed earlier),
 * format the return value per the syscall's fmt, and optionally the
 * callchain. Returns 0 on success, -1 on failure.
 */
static int trace__sys_exit(struct trace *trace, struct perf_evsel *evsel,
			   union perf_event *event __maybe_unused,
			   struct perf_sample *sample)
{
	long ret;
	u64 duration = 0;
	bool duration_calculated = false;
	struct thread *thread;
	int id = perf_evsel__sc_tp_uint(evsel, id, sample), err = -1, callchain_ret = 0;
	struct syscall *sc = trace__syscall_info(trace, evsel, id);
	struct thread_trace *ttrace;

	if (sc == NULL)
		return -1;

	thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
	ttrace = thread__trace(thread, trace->output);
	if (ttrace == NULL)
		goto out_put;

	if (trace->summary)
		thread__update_stats(ttrace, id, sample);

	ret = perf_evsel__sc_tp_uint(evsel, ret, sample);

	/* Successful open-like call: bind the vfs_getname'd path to the new fd. */
	if (id == trace->open_id && ret >= 0 && ttrace->filename.pending_open) {
		trace__set_fd_pathname(thread, ret, ttrace->filename.name);
		ttrace->filename.pending_open = false;
		++trace->stats.vfs_getname;
	}

	/* entry_time == 0: sys_enter never seen (see trace__fprintf_tstamp). */
	if (ttrace->entry_time) {
		duration = sample->time - ttrace->entry_time;
		if (trace__filter_duration(trace, duration))
			goto out;
		duration_calculated = true;
	} else if (trace->duration_filter)
		goto out;

	if (sample->callchain) {
		callchain_ret = trace__resolve_callchain(trace, evsel, sample, &callchain_cursor);
		if (callchain_ret == 0) {
			/* Too shallow for --min-stack: skip the whole line. */
			if (callchain_cursor.nr < trace->min_stack)
				goto out;
			callchain_ret = 1;
		}
	}

	if (trace->summary_only)
		goto out;

	trace__fprintf_entry_head(trace, thread, duration, duration_calculated, ttrace->entry_time, trace->output);

	if (ttrace->entry_pending) {
		fprintf(trace->output, "%-70s", ttrace->entry_str);
	} else {
		/* The enter line was already flushed by an interleaving thread. */
		fprintf(trace->output, " ... [");
		color_fprintf(trace->output, PERF_COLOR_YELLOW, "continued");
		fprintf(trace->output, "]: %s()", sc->name);
	}

	/* Return value: errno name+message, Timeout, hex, child pid, or plain. */
	if (sc->fmt == NULL) {
signed_print:
		fprintf(trace->output, ") = %ld", ret);
	} else if (ret < 0 && (sc->fmt->errmsg || sc->fmt->errpid)) {
		char bf[STRERR_BUFSIZE];
		const char *emsg = str_error_r(-ret, bf, sizeof(bf)),
			   *e = audit_errno_to_name(-ret);

		fprintf(trace->output, ") = -1 %s %s", e, emsg);
	} else if (ret == 0 && sc->fmt->timeout)
		fprintf(trace->output, ") = 0 Timeout");
	else if (sc->fmt->hexret)
		fprintf(trace->output, ") = %#lx", ret);
	else if (sc->fmt->errpid) {
		struct thread *child = machine__find_thread(trace->host, ret, ret);

		if (child != NULL) {
			fprintf(trace->output, ") = %ld", ret);
			if (child->comm_set)
				fprintf(trace->output, " (%s)", thread__comm_str(child));
			thread__put(child);
		}
	} else
		goto signed_print;

	fputc('\n', trace->output);

	if (callchain_ret > 0)
		trace__fprintf_callchain(trace, sample);
	else if (callchain_ret < 0)
		pr_err("Problem processing %s callchain, skipping...\n", perf_evsel__name(evsel));
out:
	ttrace->entry_pending = false;
	err = 0;
out_put:
	thread__put(thread);
	return err;
}
1671 
/*
 * probe:vfs_getname handler: cache the resolved pathname in the thread's
 * filename state and, when a formatted sys_enter line is waiting for it
 * (filename.ptr set by syscall_arg__scnprintf_filename), splice the string
 * (or its tail, if space is short) into entry_str at entry_str_pos.
 */
static int trace__vfs_getname(struct trace *trace, struct perf_evsel *evsel,
			      union perf_event *event __maybe_unused,
			      struct perf_sample *sample)
{
	struct thread *thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
	struct thread_trace *ttrace;
	size_t filename_len, entry_str_len, to_move;
	ssize_t remaining_space;
	char *pos;
	const char *filename = perf_evsel__rawptr(evsel, sample, "pathname");

	if (!thread)
		goto out;

	ttrace = thread__priv(thread);
	if (!ttrace)
		goto out_put;

	filename_len = strlen(filename);
	if (filename_len == 0)
		goto out_put;

	/* Grow the cached-name buffer if this path is longer than before. */
	if (ttrace->filename.namelen < filename_len) {
		char *f = realloc(ttrace->filename.name, filename_len + 1);

		if (f == NULL)
			goto out_put;

		ttrace->filename.namelen = filename_len;
		ttrace->filename.name = f;
	}

	strcpy(ttrace->filename.name, filename);
	ttrace->filename.pending_open = true;	/* consumed by trace__sys_exit */

	/* No enter line is waiting for this name: nothing to splice. */
	if (!ttrace->filename.ptr)
		goto out_put;

	entry_str_len = strlen(ttrace->entry_str);
	remaining_space = trace__entry_str_size - entry_str_len - 1; /* \0 */
	if (remaining_space <= 0)
		goto out_put;

	/* Keep only the tail of the path that still fits in entry_str. */
	if (filename_len > (size_t)remaining_space) {
		filename += filename_len - remaining_space;
		filename_len = remaining_space;
	}

	/* Open a gap at the insertion point, then drop the name into it. */
	to_move = entry_str_len - ttrace->filename.entry_str_pos + 1; /* \0 */
	pos = ttrace->entry_str + ttrace->filename.entry_str_pos;
	memmove(pos + filename_len, pos, to_move);
	memcpy(pos, filename, filename_len);

	ttrace->filename.ptr = 0;
	ttrace->filename.entry_str_pos = 0;
out_put:
	thread__put(thread);
out:
	return 0;
}
1732 
1733 static int trace__sched_stat_runtime(struct trace *trace, struct perf_evsel *evsel,
1734 				     union perf_event *event __maybe_unused,
1735 				     struct perf_sample *sample)
1736 {
1737         u64 runtime = perf_evsel__intval(evsel, sample, "runtime");
1738 	double runtime_ms = (double)runtime / NSEC_PER_MSEC;
1739 	struct thread *thread = machine__findnew_thread(trace->host,
1740 							sample->pid,
1741 							sample->tid);
1742 	struct thread_trace *ttrace = thread__trace(thread, trace->output);
1743 
1744 	if (ttrace == NULL)
1745 		goto out_dump;
1746 
1747 	ttrace->runtime_ms += runtime_ms;
1748 	trace->runtime_ms += runtime_ms;
1749 out_put:
1750 	thread__put(thread);
1751 	return 0;
1752 
1753 out_dump:
1754 	fprintf(trace->output, "%s: comm=%s,pid=%u,runtime=%" PRIu64 ",vruntime=%" PRIu64 ")\n",
1755 	       evsel->name,
1756 	       perf_evsel__strval(evsel, sample, "comm"),
1757 	       (pid_t)perf_evsel__intval(evsel, sample, "pid"),
1758 	       runtime,
1759 	       perf_evsel__intval(evsel, sample, "vruntime"));
1760 	goto out_put;
1761 }
1762 
1763 static void bpf_output__printer(enum binary_printer_ops op,
1764 				unsigned int val, void *extra)
1765 {
1766 	FILE *output = extra;
1767 	unsigned char ch = (unsigned char)val;
1768 
1769 	switch (op) {
1770 	case BINARY_PRINT_CHAR_DATA:
1771 		fprintf(output, "%c", isprint(ch) ? ch : '.');
1772 		break;
1773 	case BINARY_PRINT_DATA_BEGIN:
1774 	case BINARY_PRINT_LINE_BEGIN:
1775 	case BINARY_PRINT_ADDR:
1776 	case BINARY_PRINT_NUM_DATA:
1777 	case BINARY_PRINT_NUM_PAD:
1778 	case BINARY_PRINT_SEP:
1779 	case BINARY_PRINT_CHAR_PAD:
1780 	case BINARY_PRINT_LINE_END:
1781 	case BINARY_PRINT_DATA_END:
1782 	default:
1783 		break;
1784 	}
1785 }
1786 
1787 static void bpf_output__fprintf(struct trace *trace,
1788 				struct perf_sample *sample)
1789 {
1790 	print_binary(sample->raw_data, sample->raw_size, 8,
1791 		     bpf_output__printer, trace->output);
1792 }
1793 
1794 static int trace__event_handler(struct trace *trace, struct perf_evsel *evsel,
1795 				union perf_event *event __maybe_unused,
1796 				struct perf_sample *sample)
1797 {
1798 	int callchain_ret = 0;
1799 
1800 	if (sample->callchain) {
1801 		callchain_ret = trace__resolve_callchain(trace, evsel, sample, &callchain_cursor);
1802 		if (callchain_ret == 0) {
1803 			if (callchain_cursor.nr < trace->min_stack)
1804 				goto out;
1805 			callchain_ret = 1;
1806 		}
1807 	}
1808 
1809 	trace__printf_interrupted_entry(trace, sample);
1810 	trace__fprintf_tstamp(trace, sample->time, trace->output);
1811 
1812 	if (trace->trace_syscalls)
1813 		fprintf(trace->output, "(         ): ");
1814 
1815 	fprintf(trace->output, "%s:", evsel->name);
1816 
1817 	if (perf_evsel__is_bpf_output(evsel)) {
1818 		bpf_output__fprintf(trace, sample);
1819 	} else if (evsel->tp_format) {
1820 		event_format__fprintf(evsel->tp_format, sample->cpu,
1821 				      sample->raw_data, sample->raw_size,
1822 				      trace->output);
1823 	}
1824 
1825 	fprintf(trace->output, ")\n");
1826 
1827 	if (callchain_ret > 0)
1828 		trace__fprintf_callchain(trace, sample);
1829 	else if (callchain_ret < 0)
1830 		pr_err("Problem processing %s callchain, skipping...\n", perf_evsel__name(evsel));
1831 out:
1832 	return 0;
1833 }
1834 
1835 static void print_location(FILE *f, struct perf_sample *sample,
1836 			   struct addr_location *al,
1837 			   bool print_dso, bool print_sym)
1838 {
1839 
1840 	if ((verbose > 0 || print_dso) && al->map)
1841 		fprintf(f, "%s@", al->map->dso->long_name);
1842 
1843 	if ((verbose > 0 || print_sym) && al->sym)
1844 		fprintf(f, "%s+0x%" PRIx64, al->sym->name,
1845 			al->addr - al->sym->start);
1846 	else if (al->map)
1847 		fprintf(f, "0x%" PRIx64, al->addr);
1848 	else
1849 		fprintf(f, "0x%" PRIx64, sample->addr);
1850 }
1851 
/*
 * Page fault software event handler: bump the thread's maj/min counters
 * and print "<maj|min>fault [<ip location>] => <target location> (<type><level>)"
 * where <type> is 'd' (data map), 'x' (executable map) or '?' (unmapped).
 * Returns 0 on success, -1 when per-thread state cannot be obtained.
 */
static int trace__pgfault(struct trace *trace,
			  struct perf_evsel *evsel,
			  union perf_event *event __maybe_unused,
			  struct perf_sample *sample)
{
	struct thread *thread;
	struct addr_location al;
	char map_type = 'd';
	struct thread_trace *ttrace;
	int err = -1;
	int callchain_ret = 0;

	thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);

	if (sample->callchain) {
		callchain_ret = trace__resolve_callchain(trace, evsel, sample, &callchain_cursor);
		if (callchain_ret == 0) {
			/* Shallower than --min-stack: drop this event. */
			if (callchain_cursor.nr < trace->min_stack)
				goto out_put;
			callchain_ret = 1;
		}
	}

	ttrace = thread__trace(thread, trace->output);
	if (ttrace == NULL)
		goto out_put;

	if (evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ)
		ttrace->pfmaj++;
	else
		ttrace->pfmin++;

	if (trace->summary_only)
		goto out;

	/* Where the faulting instruction lives. */
	thread__find_addr_location(thread, sample->cpumode, MAP__FUNCTION,
			      sample->ip, &al);

	trace__fprintf_entry_head(trace, thread, 0, true, sample->time, trace->output);

	fprintf(trace->output, "%sfault [",
		evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ ?
		"maj" : "min");

	print_location(trace->output, sample, &al, false, true);

	fprintf(trace->output, "] => ");

	/* The faulted-on address: try data maps first, then executable ones. */
	thread__find_addr_location(thread, sample->cpumode, MAP__VARIABLE,
				   sample->addr, &al);

	if (!al.map) {
		thread__find_addr_location(thread, sample->cpumode,
					   MAP__FUNCTION, sample->addr, &al);

		if (al.map)
			map_type = 'x';
		else
			map_type = '?';
	}

	print_location(trace->output, sample, &al, true, false);

	fprintf(trace->output, " (%c%c)\n", map_type, al.level);

	if (callchain_ret > 0)
		trace__fprintf_callchain(trace, sample);
	else if (callchain_ret < 0)
		pr_err("Problem processing %s callchain, skipping...\n", perf_evsel__name(evsel));
out:
	err = 0;
out_put:
	thread__put(thread);
	return err;
}
1927 
1928 static void trace__set_base_time(struct trace *trace,
1929 				 struct perf_evsel *evsel,
1930 				 struct perf_sample *sample)
1931 {
1932 	/*
1933 	 * BPF events were not setting PERF_SAMPLE_TIME, so be more robust
1934 	 * and don't use sample->time unconditionally, we may end up having
1935 	 * some other event in the future without PERF_SAMPLE_TIME for good
1936 	 * reason, i.e. we may not be interested in its timestamps, just in
1937 	 * it taking place, picking some piece of information when it
1938 	 * appears in our event stream (vfs_getname comes to mind).
1939 	 */
1940 	if (trace->base_time == 0 && !trace->full_time &&
1941 	    (evsel->attr.sample_type & PERF_SAMPLE_TIME))
1942 		trace->base_time = sample->time;
1943 }
1944 
/*
 * perf_tool->sample callback used in replay ('perf trace -i') mode:
 * dispatches each sample to the handler attached to its evsel, skipping
 * samples coming from filtered threads.  Always returns 0.
 */
static int trace__process_sample(struct perf_tool *tool,
				 union perf_event *event,
				 struct perf_sample *sample,
				 struct perf_evsel *evsel,
				 struct machine *machine __maybe_unused)
{
	struct trace *trace = container_of(tool, struct trace, tool);
	struct thread *thread;
	int err = 0;

	tracepoint_handler handler = evsel->handler;

	thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
	if (thread && thread__is_filtered(thread))
		goto out;

	trace__set_base_time(trace, evsel, sample);

	if (handler) {
		++trace->nr_events;
		handler(trace, evsel, event, sample);
	}
out:
	/* thread__put() copes with a NULL thread */
	thread__put(thread);
	return err;
}
1971 
1972 static int trace__record(struct trace *trace, int argc, const char **argv)
1973 {
1974 	unsigned int rec_argc, i, j;
1975 	const char **rec_argv;
1976 	const char * const record_args[] = {
1977 		"record",
1978 		"-R",
1979 		"-m", "1024",
1980 		"-c", "1",
1981 	};
1982 
1983 	const char * const sc_args[] = { "-e", };
1984 	unsigned int sc_args_nr = ARRAY_SIZE(sc_args);
1985 	const char * const majpf_args[] = { "-e", "major-faults" };
1986 	unsigned int majpf_args_nr = ARRAY_SIZE(majpf_args);
1987 	const char * const minpf_args[] = { "-e", "minor-faults" };
1988 	unsigned int minpf_args_nr = ARRAY_SIZE(minpf_args);
1989 
1990 	/* +1 is for the event string below */
1991 	rec_argc = ARRAY_SIZE(record_args) + sc_args_nr + 1 +
1992 		majpf_args_nr + minpf_args_nr + argc;
1993 	rec_argv = calloc(rec_argc + 1, sizeof(char *));
1994 
1995 	if (rec_argv == NULL)
1996 		return -ENOMEM;
1997 
1998 	j = 0;
1999 	for (i = 0; i < ARRAY_SIZE(record_args); i++)
2000 		rec_argv[j++] = record_args[i];
2001 
2002 	if (trace->trace_syscalls) {
2003 		for (i = 0; i < sc_args_nr; i++)
2004 			rec_argv[j++] = sc_args[i];
2005 
2006 		/* event string may be different for older kernels - e.g., RHEL6 */
2007 		if (is_valid_tracepoint("raw_syscalls:sys_enter"))
2008 			rec_argv[j++] = "raw_syscalls:sys_enter,raw_syscalls:sys_exit";
2009 		else if (is_valid_tracepoint("syscalls:sys_enter"))
2010 			rec_argv[j++] = "syscalls:sys_enter,syscalls:sys_exit";
2011 		else {
2012 			pr_err("Neither raw_syscalls nor syscalls events exist.\n");
2013 			return -1;
2014 		}
2015 	}
2016 
2017 	if (trace->trace_pgfaults & TRACE_PFMAJ)
2018 		for (i = 0; i < majpf_args_nr; i++)
2019 			rec_argv[j++] = majpf_args[i];
2020 
2021 	if (trace->trace_pgfaults & TRACE_PFMIN)
2022 		for (i = 0; i < minpf_args_nr; i++)
2023 			rec_argv[j++] = minpf_args[i];
2024 
2025 	for (i = 0; i < (unsigned int)argc; i++)
2026 		rec_argv[j++] = argv[i];
2027 
2028 	return cmd_record(j, rec_argv);
2029 }
2030 
2031 static size_t trace__fprintf_thread_summary(struct trace *trace, FILE *fp);
2032 
2033 static bool perf_evlist__add_vfs_getname(struct perf_evlist *evlist)
2034 {
2035 	struct perf_evsel *evsel = perf_evsel__newtp("probe", "vfs_getname");
2036 
2037 	if (IS_ERR(evsel))
2038 		return false;
2039 
2040 	if (perf_evsel__field(evsel, "pathname") == NULL) {
2041 		perf_evsel__delete(evsel);
2042 		return false;
2043 	}
2044 
2045 	evsel->handler = trace__vfs_getname;
2046 	perf_evlist__add(evlist, evsel);
2047 	return true;
2048 }
2049 
2050 static struct perf_evsel *perf_evsel__new_pgfault(u64 config)
2051 {
2052 	struct perf_evsel *evsel;
2053 	struct perf_event_attr attr = {
2054 		.type = PERF_TYPE_SOFTWARE,
2055 		.mmap_data = 1,
2056 	};
2057 
2058 	attr.config = config;
2059 	attr.sample_period = 1;
2060 
2061 	event_attr_init(&attr);
2062 
2063 	evsel = perf_evsel__new(&attr);
2064 	if (evsel)
2065 		evsel->handler = trace__pgfault;
2066 
2067 	return evsel;
2068 }
2069 
/*
 * Demultiplex one event read from the ring buffers in live mode:
 * non-sample records update machine/thread state, samples are matched to
 * their evsel by sample id and forwarded to that evsel's handler.
 */
static void trace__handle_event(struct trace *trace, union perf_event *event, struct perf_sample *sample)
{
	const u32 type = event->header.type;
	struct perf_evsel *evsel;

	if (type != PERF_RECORD_SAMPLE) {
		/* MMAP/COMM/FORK/EXIT etc.: bookkeeping, not user output */
		trace__process_event(trace, trace->host, event, sample);
		return;
	}

	evsel = perf_evlist__id2evsel(trace->evlist, sample->id);
	if (evsel == NULL) {
		fprintf(trace->output, "Unknown tp ID %" PRIu64 ", skipping...\n", sample->id);
		return;
	}

	trace__set_base_time(trace, evsel, sample);

	if (evsel->attr.type == PERF_TYPE_TRACEPOINT &&
	    sample->raw_data == NULL) {
		/* Tracepoint handlers need the raw payload, skip if absent */
		fprintf(trace->output, "%s sample with no payload for tid: %d, cpu %d, raw_size=%d, skipping...\n",
		       perf_evsel__name(evsel), sample->tid,
		       sample->cpu, sample->raw_size);
	} else {
		tracepoint_handler handler = evsel->handler;
		handler(trace, evsel, event, sample);
	}
}
2098 
/*
 * Create the raw_syscalls:sys_{enter,exit} tracepoint evsels, set up the
 * accessors for the payload fields we use ("args" and "ret") and add both
 * to the session evlist.  Returns 0 on success, -1 on failure with any
 * partially created evsel freed via the goto chain below.
 */
static int trace__add_syscall_newtp(struct trace *trace)
{
	int ret = -1;
	struct perf_evlist *evlist = trace->evlist;
	struct perf_evsel *sys_enter, *sys_exit;

	sys_enter = perf_evsel__syscall_newtp("sys_enter", trace__sys_enter);
	if (sys_enter == NULL)
		goto out;

	if (perf_evsel__init_sc_tp_ptr_field(sys_enter, args))
		goto out_delete_sys_enter;

	sys_exit = perf_evsel__syscall_newtp("sys_exit", trace__sys_exit);
	if (sys_exit == NULL)
		goto out_delete_sys_enter;

	if (perf_evsel__init_sc_tp_uint_field(sys_exit, ret))
		goto out_delete_sys_exit;

	perf_evlist__add(evlist, sys_enter);
	perf_evlist__add(evlist, sys_exit);

	if (callchain_param.enabled && !trace->kernel_syscallchains) {
		/*
		 * We're interested only in the user space callchain
		 * leading to the syscall, allow overriding that for
		 * debugging reasons using --kernel_syscall_callchains
		 */
		sys_exit->attr.exclude_callchain_kernel = 1;
	}

	trace->syscalls.events.sys_enter = sys_enter;
	trace->syscalls.events.sys_exit  = sys_exit;

	ret = 0;
out:
	return ret;

out_delete_sys_exit:
	perf_evsel__delete_priv(sys_exit);
out_delete_sys_enter:
	perf_evsel__delete_priv(sys_enter);
	goto out;
}
2144 
/*
 * Turn the -e syscall qualifier list into an "id in (...)" (or negated,
 * for '!'-prefixed lists) tracepoint filter and append it to both the
 * sys_enter and sys_exit evsels.  Returns 0 on success, -1 on failure,
 * setting errno to ENOMEM when the filter string can't be built.
 */
static int trace__set_ev_qualifier_filter(struct trace *trace)
{
	int err = -1;
	struct perf_evsel *sys_exit;
	char *filter = asprintf_expr_inout_ints("id", !trace->not_ev_qualifier,
						trace->ev_qualifier_ids.nr,
						trace->ev_qualifier_ids.entries);

	if (filter == NULL)
		goto out_enomem;

	/* Only filter sys_exit if appending to sys_enter succeeded */
	if (!perf_evsel__append_tp_filter(trace->syscalls.events.sys_enter,
					  filter)) {
		sys_exit = trace->syscalls.events.sys_exit;
		err = perf_evsel__append_tp_filter(sys_exit, filter);
	}

	free(filter);
out:
	return err;
out_enomem:
	errno = ENOMEM;
	goto out;
}
2169 
/*
 * The live-mode main loop: sets up the evsels (syscalls, vfs_getname,
 * page faults, sched_stat_runtime), creates the cpu/thread maps, opens
 * and mmaps the events, optionally forks and starts the workload, then
 * consumes the ring buffers until interrupted or drained, finishing with
 * the optional summary.  Returns 0 on success, negative on error.
 */
static int trace__run(struct trace *trace, int argc, const char **argv)
{
	struct perf_evlist *evlist = trace->evlist;
	struct perf_evsel *evsel, *pgfault_maj = NULL, *pgfault_min = NULL;
	int err = -1, i;
	unsigned long before;
	const bool forks = argc > 0;
	bool draining = false;

	trace->live = true;

	if (trace->trace_syscalls && trace__add_syscall_newtp(trace))
		goto out_error_raw_syscalls;

	if (trace->trace_syscalls)
		trace->vfs_getname = perf_evlist__add_vfs_getname(evlist);

	if ((trace->trace_pgfaults & TRACE_PFMAJ)) {
		pgfault_maj = perf_evsel__new_pgfault(PERF_COUNT_SW_PAGE_FAULTS_MAJ);
		if (pgfault_maj == NULL)
			goto out_error_mem;
		perf_evlist__add(evlist, pgfault_maj);
	}

	if ((trace->trace_pgfaults & TRACE_PFMIN)) {
		pgfault_min = perf_evsel__new_pgfault(PERF_COUNT_SW_PAGE_FAULTS_MIN);
		if (pgfault_min == NULL)
			goto out_error_mem;
		perf_evlist__add(evlist, pgfault_min);
	}

	if (trace->sched &&
	    perf_evlist__add_newtp(evlist, "sched", "sched_stat_runtime",
				   trace__sched_stat_runtime))
		goto out_error_sched_stat_runtime;

	err = perf_evlist__create_maps(evlist, &trace->opts.target);
	if (err < 0) {
		fprintf(trace->output, "Problems parsing the target to trace, check your options!\n");
		goto out_delete_evlist;
	}

	err = trace__symbols_init(trace, evlist);
	if (err < 0) {
		fprintf(trace->output, "Problems initializing symbol libraries!\n");
		goto out_delete_evlist;
	}

	perf_evlist__config(evlist, &trace->opts, NULL);

	if (callchain_param.enabled) {
		bool use_identifier = false;

		if (trace->syscalls.events.sys_exit) {
			perf_evsel__config_callchain(trace->syscalls.events.sys_exit,
						     &trace->opts, &callchain_param);
			use_identifier = true;
		}

		if (pgfault_maj) {
			perf_evsel__config_callchain(pgfault_maj, &trace->opts, &callchain_param);
			use_identifier = true;
		}

		if (pgfault_min) {
			perf_evsel__config_callchain(pgfault_min, &trace->opts, &callchain_param);
			use_identifier = true;
		}

		if (use_identifier) {
		       /*
			* Now we have evsels with different sample_ids, use
			* PERF_SAMPLE_IDENTIFIER to map from sample to evsel
			* from a fixed position in each ring buffer record.
			*
			* As of this the changeset introducing this comment, this
			* isn't strictly needed, as the fields that can come before
			* PERF_SAMPLE_ID are all used, but we'll probably disable
			* some of those for things like copying the payload of
			* pointer syscall arguments, and for vfs_getname we don't
			* need PERF_SAMPLE_ADDR and PERF_SAMPLE_IP, so do this
			* here as a warning we need to use PERF_SAMPLE_IDENTIFIER.
			*/
			perf_evlist__set_sample_bit(evlist, IDENTIFIER);
			perf_evlist__reset_sample_bit(evlist, ID);
		}
	}

	signal(SIGCHLD, sig_handler);
	signal(SIGINT, sig_handler);

	if (forks) {
		/* Fork the workload stopped, started after the events are on */
		err = perf_evlist__prepare_workload(evlist, &trace->opts.target,
						    argv, false, NULL);
		if (err < 0) {
			fprintf(trace->output, "Couldn't run the workload!\n");
			goto out_delete_evlist;
		}
	}

	err = perf_evlist__open(evlist);
	if (err < 0)
		goto out_error_open;

	err = bpf__apply_obj_config();
	if (err) {
		char errbuf[BUFSIZ];

		bpf__strerror_apply_obj_config(err, errbuf, sizeof(errbuf));
		pr_err("ERROR: Apply config to BPF failed: %s\n",
			 errbuf);
		goto out_error_open;
	}

	/*
	 * Better not use !target__has_task() here because we need to cover the
	 * case where no threads were specified in the command line, but a
	 * workload was, and in that case we will fill in the thread_map when
	 * we fork the workload in perf_evlist__prepare_workload.
	 */
	if (trace->filter_pids.nr > 0)
		err = perf_evlist__set_filter_pids(evlist, trace->filter_pids.nr, trace->filter_pids.entries);
	else if (thread_map__pid(evlist->threads, 0) == -1)
		err = perf_evlist__set_filter_pid(evlist, getpid());

	if (err < 0)
		goto out_error_mem;

	if (trace->ev_qualifier_ids.nr > 0) {
		err = trace__set_ev_qualifier_filter(trace);
		if (err < 0)
			goto out_errno;

		pr_debug("event qualifier tracepoint filter: %s\n",
			 trace->syscalls.events.sys_exit->filter);
	}

	err = perf_evlist__apply_filters(evlist, &evsel);
	if (err < 0)
		goto out_error_apply_filters;

	err = perf_evlist__mmap(evlist, trace->opts.mmap_pages, false);
	if (err < 0)
		goto out_error_mmap;

	if (!target__none(&trace->opts.target) && !trace->opts.initial_delay)
		perf_evlist__enable(evlist);

	if (forks)
		perf_evlist__start_workload(evlist);

	if (trace->opts.initial_delay) {
		/* --delay: let the workload warm up before counting */
		usleep(trace->opts.initial_delay * 1000);
		perf_evlist__enable(evlist);
	}

	/* Print tids when tracing more than a single thread */
	trace->multiple_threads = thread_map__pid(evlist->threads, 0) == -1 ||
				  evlist->threads->nr > 1 ||
				  perf_evlist__first(evlist)->attr.inherit;
again:
	before = trace->nr_events;

	for (i = 0; i < evlist->nr_mmaps; i++) {
		union perf_event *event;

		while ((event = perf_evlist__mmap_read(evlist, i)) != NULL) {
			struct perf_sample sample;

			++trace->nr_events;

			err = perf_evlist__parse_sample(evlist, event, &sample);
			if (err) {
				fprintf(trace->output, "Can't parse sample, err = %d, skipping...\n", err);
				goto next_event;
			}

			trace__handle_event(trace, event, &sample);
next_event:
			perf_evlist__mmap_consume(evlist, i);

			if (interrupted)
				goto out_disable;

			/* Workload done: stop producing, keep draining */
			if (done && !draining) {
				perf_evlist__disable(evlist);
				draining = true;
			}
		}
	}

	/* Nothing new this pass: poll for more, or declare the buffers drained */
	if (trace->nr_events == before) {
		int timeout = done ? 100 : -1;

		if (!draining && perf_evlist__poll(evlist, timeout) > 0) {
			if (perf_evlist__filter_pollfd(evlist, POLLERR | POLLHUP) == 0)
				draining = true;

			goto again;
		}
	} else {
		goto again;
	}

out_disable:
	thread__zput(trace->current);

	perf_evlist__disable(evlist);

	if (!err) {
		if (trace->summary)
			trace__fprintf_thread_summary(trace, trace->output);

		if (trace->show_tool_stats) {
			fprintf(trace->output, "Stats:\n "
					       " vfs_getname : %" PRIu64 "\n"
					       " proc_getname: %" PRIu64 "\n",
				trace->stats.vfs_getname,
				trace->stats.proc_getname);
		}
	}

out_delete_evlist:
	perf_evlist__delete(evlist);
	trace->evlist = NULL;
	trace->live = false;
	return err;
/*
 * The error labels below sit after the function's normal return; this
 * brace block exists only so they can share one errbuf.  All of them
 * funnel into out_delete_evlist above.
 */
{
	char errbuf[BUFSIZ];

out_error_sched_stat_runtime:
	tracing_path__strerror_open_tp(errno, errbuf, sizeof(errbuf), "sched", "sched_stat_runtime");
	goto out_error;

out_error_raw_syscalls:
	tracing_path__strerror_open_tp(errno, errbuf, sizeof(errbuf), "raw_syscalls", "sys_(enter|exit)");
	goto out_error;

out_error_mmap:
	perf_evlist__strerror_mmap(evlist, errno, errbuf, sizeof(errbuf));
	goto out_error;

out_error_open:
	perf_evlist__strerror_open(evlist, errno, errbuf, sizeof(errbuf));

out_error:
	fprintf(trace->output, "%s\n", errbuf);
	goto out_delete_evlist;

out_error_apply_filters:
	fprintf(trace->output,
		"Failed to set filter \"%s\" on event %s with %d (%s)\n",
		evsel->filter, perf_evsel__name(evsel), errno,
		str_error_r(errno, errbuf, sizeof(errbuf)));
	goto out_delete_evlist;
}
out_error_mem:
	fprintf(trace->output, "Not enough memory to run!\n");
	goto out_delete_evlist;

out_errno:
	fprintf(trace->output, "errno=%d,%s\n", errno, strerror(errno));
	goto out_delete_evlist;
}
2433 
/*
 * Replay mode ('perf trace -i perf.data'): wire up the perf_tool
 * callbacks, open the recorded session, attach the syscall and page
 * fault handlers to the evsels found in the file and process all events,
 * optionally printing the per-thread summary at the end.
 */
static int trace__replay(struct trace *trace)
{
	const struct perf_evsel_str_handler handlers[] = {
		{ "probe:vfs_getname",	     trace__vfs_getname, },
	};
	struct perf_data_file file = {
		.path  = input_name,
		.mode  = PERF_DATA_MODE_READ,
		.force = trace->force,
	};
	struct perf_session *session;
	struct perf_evsel *evsel;
	int err = -1;

	trace->tool.sample	  = trace__process_sample;
	trace->tool.mmap	  = perf_event__process_mmap;
	trace->tool.mmap2	  = perf_event__process_mmap2;
	trace->tool.comm	  = perf_event__process_comm;
	trace->tool.exit	  = perf_event__process_exit;
	trace->tool.fork	  = perf_event__process_fork;
	trace->tool.attr	  = perf_event__process_attr;
	trace->tool.tracing_data  = perf_event__process_tracing_data;
	trace->tool.build_id	  = perf_event__process_build_id;
	trace->tool.namespaces	  = perf_event__process_namespaces;

	trace->tool.ordered_events = true;
	trace->tool.ordering_requires_timestamps = true;

	/* add tid to output */
	trace->multiple_threads = true;

	session = perf_session__new(&file, false, &trace->tool);
	if (session == NULL)
		return -1;

	if (trace->opts.target.pid)
		symbol_conf.pid_list_str = strdup(trace->opts.target.pid);

	if (trace->opts.target.tid)
		symbol_conf.tid_list_str = strdup(trace->opts.target.tid);

	if (symbol__init(&session->header.env) < 0)
		goto out;

	trace->host = &session->machines.host;

	err = perf_session__set_tracepoints_handlers(session, handlers);
	if (err)
		goto out;

	evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
						     "raw_syscalls:sys_enter");
	/* older kernels have syscalls tp versus raw_syscalls */
	if (evsel == NULL)
		evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
							     "syscalls:sys_enter");

	if (evsel &&
	    (perf_evsel__init_syscall_tp(evsel, trace__sys_enter) < 0 ||
	    perf_evsel__init_sc_tp_ptr_field(evsel, args))) {
		pr_err("Error during initialize raw_syscalls:sys_enter event\n");
		goto out;
	}

	evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
						     "raw_syscalls:sys_exit");
	if (evsel == NULL)
		evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
							     "syscalls:sys_exit");
	if (evsel &&
	    (perf_evsel__init_syscall_tp(evsel, trace__sys_exit) < 0 ||
	    perf_evsel__init_sc_tp_uint_field(evsel, ret))) {
		pr_err("Error during initialize raw_syscalls:sys_exit event\n");
		goto out;
	}

	/* Hook the page fault handler to any recorded pagefault sw events */
	evlist__for_each_entry(session->evlist, evsel) {
		if (evsel->attr.type == PERF_TYPE_SOFTWARE &&
		    (evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ ||
		     evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MIN ||
		     evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS))
			evsel->handler = trace__pgfault;
	}

	setup_pager();

	err = perf_session__process_events(session);
	if (err)
		pr_err("Failed to process events, error %d", err);

	else if (trace->summary)
		trace__fprintf_thread_summary(trace, trace->output);

out:
	perf_session__delete(session);

	return err;
}
2532 
/* Print the heading that precedes the per-thread summary sections. */
static size_t trace__fprintf_threads_header(FILE *fp)
{
	return fprintf(fp, "\n Summary of events:\n\n");
}
2541 
/*
 * DEFINE_RESORT_RB (from rb_resort.h) instantiates a helper rbtree that
 * re-sorts the per-thread syscall_stats intlist, here ordered by total
 * time spent (the a->msecs > b->msecs comparator).  The braced body below
 * fills one sorted entry from an intlist node.
 */
DEFINE_RESORT_RB(syscall_stats, a->msecs > b->msecs,
	struct stats 	*stats;
	double		msecs;
	int		syscall;
)
{
	struct int_node *source = rb_entry(nd, struct int_node, rb_node);
	struct stats *stats = source->priv;

	entry->syscall = source->i;
	entry->stats   = stats;
	/* total time = nr calls * average, converted from ns to msecs */
	entry->msecs   = stats ? (u64)stats->n * (avg_stats(stats) / NSEC_PER_MSEC) : 0;
}
2555 
/*
 * Print one thread's per-syscall statistics table (calls, total, min,
 * avg, max, stddev), sorted by total time via the syscall_stats resort
 * rbtree defined above.  Returns the number of characters printed.
 */
static size_t thread__dump_stats(struct thread_trace *ttrace,
				 struct trace *trace, FILE *fp)
{
	size_t printed = 0;
	struct syscall *sc;
	struct rb_node *nd;
	DECLARE_RESORT_RB_INTLIST(syscall_stats, ttrace->syscall_stats);

	if (syscall_stats == NULL)
		return 0;

	printed += fprintf(fp, "\n");

	printed += fprintf(fp, "   syscall            calls    total       min       avg       max      stddev\n");
	printed += fprintf(fp, "                               (msec)    (msec)    (msec)    (msec)        (%%)\n");
	printed += fprintf(fp, "   --------------- -------- --------- --------- --------- ---------     ------\n");

	resort_rb__for_each_entry(nd, syscall_stats) {
		struct stats *stats = syscall_stats_entry->stats;
		if (stats) {
			/* Convert the ns-based stats to msecs for display */
			double min = (double)(stats->min) / NSEC_PER_MSEC;
			double max = (double)(stats->max) / NSEC_PER_MSEC;
			double avg = avg_stats(stats);
			double pct;
			u64 n = (u64) stats->n;

			/* stddev as a percentage of the average */
			pct = avg ? 100.0 * stddev_stats(stats)/avg : 0.0;
			avg /= NSEC_PER_MSEC;

			sc = &trace->syscalls.table[syscall_stats_entry->syscall];
			printed += fprintf(fp, "   %-15s", sc->name);
			printed += fprintf(fp, " %8" PRIu64 " %9.3f %9.3f %9.3f",
					   n, syscall_stats_entry->msecs, min, avg);
			printed += fprintf(fp, " %9.3f %9.2f%%\n", max, pct);
		}
	}

	resort_rb__delete(syscall_stats);
	printed += fprintf(fp, "\n\n");

	return printed;
}
2598 
/*
 * Print one thread's summary line (comm, tid, event count and share,
 * fault counts, runtime when --sched is on) followed by its per-syscall
 * stats table.  Returns the number of characters printed, 0 when the
 * thread has no trace state.
 */
static size_t trace__fprintf_thread(FILE *fp, struct thread *thread, struct trace *trace)
{
	size_t printed = 0;
	struct thread_trace *ttrace = thread__priv(thread);
	double ratio;

	if (ttrace == NULL)
		return 0;

	/* This thread's share of all events seen in the session */
	ratio = (double)ttrace->nr_events / trace->nr_events * 100.0;

	printed += fprintf(fp, " %s (%d), ", thread__comm_str(thread), thread->tid);
	printed += fprintf(fp, "%lu events, ", ttrace->nr_events);
	printed += fprintf(fp, "%.1f%%", ratio);
	if (ttrace->pfmaj)
		printed += fprintf(fp, ", %lu majfaults", ttrace->pfmaj);
	if (ttrace->pfmin)
		printed += fprintf(fp, ", %lu minfaults", ttrace->pfmin);
	if (trace->sched)
		printed += fprintf(fp, ", %.3f msec\n", ttrace->runtime_ms);
	else if (fputc('\n', fp) != EOF)
		++printed;

	printed += thread__dump_stats(ttrace, trace, fp);

	return printed;
}
2626 
2627 static unsigned long thread__nr_events(struct thread_trace *ttrace)
2628 {
2629 	return ttrace ? ttrace->nr_events : 0;
2630 }
2631 
/*
 * Resort rbtree ordering the machine's threads by their per-thread event
 * count (thread__nr_events comparator); the braced body fills one sorted
 * entry from a machine rbtree node.
 */
DEFINE_RESORT_RB(threads, (thread__nr_events(a->thread->priv) < thread__nr_events(b->thread->priv)),
	struct thread *thread;
)
{
	entry->thread = rb_entry(nd, struct thread, rb_node);
}
2638 
/*
 * Print the end-of-run summary: header plus one section per thread,
 * walking the threads resort rbtree defined above (ordered by event
 * count).  Returns the number of characters printed, 0 when the resort
 * tree could not be built.
 */
static size_t trace__fprintf_thread_summary(struct trace *trace, FILE *fp)
{
	DECLARE_RESORT_RB_MACHINE_THREADS(threads, trace->host);
	size_t printed = trace__fprintf_threads_header(fp);
	struct rb_node *nd;

	if (threads == NULL) {
		fprintf(fp, "%s", "Error sorting output by nr_events!\n");
		return 0;
	}

	resort_rb__for_each_entry(nd, threads)
		printed += trace__fprintf_thread(fp, threads_entry->thread, trace);

	resort_rb__delete(threads);

	return printed;
}
2657 
2658 static int trace__set_duration(const struct option *opt, const char *str,
2659 			       int unset __maybe_unused)
2660 {
2661 	struct trace *trace = opt->value;
2662 
2663 	trace->duration_filter = atof(str);
2664 	return 0;
2665 }
2666 
2667 static int trace__set_filter_pids(const struct option *opt, const char *str,
2668 				  int unset __maybe_unused)
2669 {
2670 	int ret = -1;
2671 	size_t i;
2672 	struct trace *trace = opt->value;
2673 	/*
2674 	 * FIXME: introduce a intarray class, plain parse csv and create a
2675 	 * { int nr, int entries[] } struct...
2676 	 */
2677 	struct intlist *list = intlist__new(str);
2678 
2679 	if (list == NULL)
2680 		return -1;
2681 
2682 	i = trace->filter_pids.nr = intlist__nr_entries(list) + 1;
2683 	trace->filter_pids.entries = calloc(i, sizeof(pid_t));
2684 
2685 	if (trace->filter_pids.entries == NULL)
2686 		goto out;
2687 
2688 	trace->filter_pids.entries[0] = getpid();
2689 
2690 	for (i = 1; i < trace->filter_pids.nr; ++i)
2691 		trace->filter_pids.entries[i] = intlist__entry(list, i - 1)->i;
2692 
2693 	intlist__delete(list);
2694 	ret = 0;
2695 out:
2696 	return ret;
2697 }
2698 
2699 static int trace__open_output(struct trace *trace, const char *filename)
2700 {
2701 	struct stat st;
2702 
2703 	if (!stat(filename, &st) && st.st_size) {
2704 		char oldname[PATH_MAX];
2705 
2706 		scnprintf(oldname, sizeof(oldname), "%s.old", filename);
2707 		unlink(oldname);
2708 		rename(filename, oldname);
2709 	}
2710 
2711 	trace->output = fopen(filename, "w");
2712 
2713 	return trace->output == NULL ? -errno : 0;
2714 }
2715 
2716 static int parse_pagefaults(const struct option *opt, const char *str,
2717 			    int unset __maybe_unused)
2718 {
2719 	int *trace_pgfaults = opt->value;
2720 
2721 	if (strcmp(str, "all") == 0)
2722 		*trace_pgfaults |= TRACE_PFMAJ | TRACE_PFMIN;
2723 	else if (strcmp(str, "maj") == 0)
2724 		*trace_pgfaults |= TRACE_PFMAJ;
2725 	else if (strcmp(str, "min") == 0)
2726 		*trace_pgfaults |= TRACE_PFMIN;
2727 	else
2728 		return -1;
2729 
2730 	return 0;
2731 }
2732 
/* Attach the same sample handler to every evsel in the list. */
static void evlist__set_evsel_handler(struct perf_evlist *evlist, void *handler)
{
	struct perf_evsel *evsel;

	evlist__for_each_entry(evlist, evsel)
		evsel->handler = handler;
}
2740 
2741 /*
2742  * XXX: Hackish, just splitting the combined -e+--event (syscalls
2743  * (raw_syscalls:{sys_{enter,exit}} + events (tracepoints, HW, SW, etc) to use
2744  * existing facilities unchanged (trace->ev_qualifier + parse_options()).
2745  *
2746  * It'd be better to introduce a parse_options() variant that would return a
2747  * list with the terms it didn't match to an event...
2748  */
2749 static int trace__parse_events_option(const struct option *opt, const char *str,
2750 				      int unset __maybe_unused)
2751 {
2752 	struct trace *trace = (struct trace *)opt->value;
2753 	const char *s = str;
2754 	char *sep = NULL, *lists[2] = { NULL, NULL, };
2755 	int len = strlen(str), err = -1, list;
2756 	char *strace_groups_dir = system_path(STRACE_GROUPS_DIR);
2757 	char group_name[PATH_MAX];
2758 
2759 	if (strace_groups_dir == NULL)
2760 		return -1;
2761 
2762 	if (*s == '!') {
2763 		++s;
2764 		trace->not_ev_qualifier = true;
2765 	}
2766 
2767 	while (1) {
2768 		if ((sep = strchr(s, ',')) != NULL)
2769 			*sep = '\0';
2770 
2771 		list = 0;
2772 		if (syscalltbl__id(trace->sctbl, s) >= 0) {
2773 			list = 1;
2774 		} else {
2775 			path__join(group_name, sizeof(group_name), strace_groups_dir, s);
2776 			if (access(group_name, R_OK) == 0)
2777 				list = 1;
2778 		}
2779 
2780 		if (lists[list]) {
2781 			sprintf(lists[list] + strlen(lists[list]), ",%s", s);
2782 		} else {
2783 			lists[list] = malloc(len);
2784 			if (lists[list] == NULL)
2785 				goto out;
2786 			strcpy(lists[list], s);
2787 		}
2788 
2789 		if (!sep)
2790 			break;
2791 
2792 		*sep = ',';
2793 		s = sep + 1;
2794 	}
2795 
2796 	if (lists[1] != NULL) {
2797 		struct strlist_config slist_config = {
2798 			.dirname = strace_groups_dir,
2799 		};
2800 
2801 		trace->ev_qualifier = strlist__new(lists[1], &slist_config);
2802 		if (trace->ev_qualifier == NULL) {
2803 			fputs("Not enough memory to parse event qualifier", trace->output);
2804 			goto out;
2805 		}
2806 
2807 		if (trace__validate_ev_qualifier(trace))
2808 			goto out;
2809 	}
2810 
2811 	err = 0;
2812 
2813 	if (lists[0]) {
2814 		struct option o = OPT_CALLBACK('e', "event", &trace->evlist, "event",
2815 					       "event selector. use 'perf list' to list available events",
2816 					       parse_events_option);
2817 		err = parse_events_option(&o, lists[0], 0);
2818 	}
2819 out:
2820 	if (sep)
2821 		*sep = ',';
2822 
2823 	return err;
2824 }
2825 
2826 int cmd_trace(int argc, const char **argv)
2827 {
2828 	const char *trace_usage[] = {
2829 		"perf trace [<options>] [<command>]",
2830 		"perf trace [<options>] -- <command> [<options>]",
2831 		"perf trace record [<options>] [<command>]",
2832 		"perf trace record [<options>] -- <command> [<options>]",
2833 		NULL
2834 	};
2835 	struct trace trace = {
2836 		.syscalls = {
2837 			. max = -1,
2838 		},
2839 		.opts = {
2840 			.target = {
2841 				.uid	   = UINT_MAX,
2842 				.uses_mmap = true,
2843 			},
2844 			.user_freq     = UINT_MAX,
2845 			.user_interval = ULLONG_MAX,
2846 			.no_buffering  = true,
2847 			.mmap_pages    = UINT_MAX,
2848 			.proc_map_timeout  = 500,
2849 		},
2850 		.output = stderr,
2851 		.show_comm = true,
2852 		.trace_syscalls = true,
2853 		.kernel_syscallchains = false,
2854 		.max_stack = UINT_MAX,
2855 	};
2856 	const char *output_name = NULL;
2857 	const struct option trace_options[] = {
2858 	OPT_CALLBACK('e', "event", &trace, "event",
2859 		     "event/syscall selector. use 'perf list' to list available events",
2860 		     trace__parse_events_option),
2861 	OPT_BOOLEAN(0, "comm", &trace.show_comm,
2862 		    "show the thread COMM next to its id"),
2863 	OPT_BOOLEAN(0, "tool_stats", &trace.show_tool_stats, "show tool stats"),
2864 	OPT_CALLBACK(0, "expr", &trace, "expr", "list of syscalls/events to trace",
2865 		     trace__parse_events_option),
2866 	OPT_STRING('o', "output", &output_name, "file", "output file name"),
2867 	OPT_STRING('i', "input", &input_name, "file", "Analyze events in file"),
2868 	OPT_STRING('p', "pid", &trace.opts.target.pid, "pid",
2869 		    "trace events on existing process id"),
2870 	OPT_STRING('t', "tid", &trace.opts.target.tid, "tid",
2871 		    "trace events on existing thread id"),
2872 	OPT_CALLBACK(0, "filter-pids", &trace, "CSV list of pids",
2873 		     "pids to filter (by the kernel)", trace__set_filter_pids),
2874 	OPT_BOOLEAN('a', "all-cpus", &trace.opts.target.system_wide,
2875 		    "system-wide collection from all CPUs"),
2876 	OPT_STRING('C', "cpu", &trace.opts.target.cpu_list, "cpu",
2877 		    "list of cpus to monitor"),
2878 	OPT_BOOLEAN(0, "no-inherit", &trace.opts.no_inherit,
2879 		    "child tasks do not inherit counters"),
2880 	OPT_CALLBACK('m', "mmap-pages", &trace.opts.mmap_pages, "pages",
2881 		     "number of mmap data pages",
2882 		     perf_evlist__parse_mmap_pages),
2883 	OPT_STRING('u', "uid", &trace.opts.target.uid_str, "user",
2884 		   "user to profile"),
2885 	OPT_CALLBACK(0, "duration", &trace, "float",
2886 		     "show only events with duration > N.M ms",
2887 		     trace__set_duration),
2888 	OPT_BOOLEAN(0, "sched", &trace.sched, "show blocking scheduler events"),
2889 	OPT_INCR('v', "verbose", &verbose, "be more verbose"),
2890 	OPT_BOOLEAN('T', "time", &trace.full_time,
2891 		    "Show full timestamp, not time relative to first start"),
2892 	OPT_BOOLEAN('s', "summary", &trace.summary_only,
2893 		    "Show only syscall summary with statistics"),
2894 	OPT_BOOLEAN('S', "with-summary", &trace.summary,
2895 		    "Show all syscalls and summary with statistics"),
2896 	OPT_CALLBACK_DEFAULT('F', "pf", &trace.trace_pgfaults, "all|maj|min",
2897 		     "Trace pagefaults", parse_pagefaults, "maj"),
2898 	OPT_BOOLEAN(0, "syscalls", &trace.trace_syscalls, "Trace syscalls"),
2899 	OPT_BOOLEAN('f', "force", &trace.force, "don't complain, do it"),
2900 	OPT_CALLBACK(0, "call-graph", &trace.opts,
2901 		     "record_mode[,record_size]", record_callchain_help,
2902 		     &record_parse_callchain_opt),
2903 	OPT_BOOLEAN(0, "kernel-syscall-graph", &trace.kernel_syscallchains,
2904 		    "Show the kernel callchains on the syscall exit path"),
2905 	OPT_UINTEGER(0, "min-stack", &trace.min_stack,
2906 		     "Set the minimum stack depth when parsing the callchain, "
2907 		     "anything below the specified depth will be ignored."),
2908 	OPT_UINTEGER(0, "max-stack", &trace.max_stack,
2909 		     "Set the maximum stack depth when parsing the callchain, "
2910 		     "anything beyond the specified depth will be ignored. "
2911 		     "Default: kernel.perf_event_max_stack or " __stringify(PERF_MAX_STACK_DEPTH)),
2912 	OPT_UINTEGER(0, "proc-map-timeout", &trace.opts.proc_map_timeout,
2913 			"per thread proc mmap processing timeout in ms"),
2914 	OPT_UINTEGER('D', "delay", &trace.opts.initial_delay,
2915 		     "ms to wait before starting measurement after program "
2916 		     "start"),
2917 	OPT_END()
2918 	};
2919 	bool __maybe_unused max_stack_user_set = true;
2920 	bool mmap_pages_user_set = true;
2921 	const char * const trace_subcommands[] = { "record", NULL };
2922 	int err;
2923 	char bf[BUFSIZ];
2924 
2925 	signal(SIGSEGV, sighandler_dump_stack);
2926 	signal(SIGFPE, sighandler_dump_stack);
2927 
2928 	trace.evlist = perf_evlist__new();
2929 	trace.sctbl = syscalltbl__new();
2930 
2931 	if (trace.evlist == NULL || trace.sctbl == NULL) {
2932 		pr_err("Not enough memory to run!\n");
2933 		err = -ENOMEM;
2934 		goto out;
2935 	}
2936 
2937 	argc = parse_options_subcommand(argc, argv, trace_options, trace_subcommands,
2938 				 trace_usage, PARSE_OPT_STOP_AT_NON_OPTION);
2939 
2940 	err = bpf__setup_stdout(trace.evlist);
2941 	if (err) {
2942 		bpf__strerror_setup_stdout(trace.evlist, err, bf, sizeof(bf));
2943 		pr_err("ERROR: Setup BPF stdout failed: %s\n", bf);
2944 		goto out;
2945 	}
2946 
2947 	err = -1;
2948 
2949 	if (trace.trace_pgfaults) {
2950 		trace.opts.sample_address = true;
2951 		trace.opts.sample_time = true;
2952 	}
2953 
2954 	if (trace.opts.mmap_pages == UINT_MAX)
2955 		mmap_pages_user_set = false;
2956 
2957 	if (trace.max_stack == UINT_MAX) {
2958 		trace.max_stack = input_name ? PERF_MAX_STACK_DEPTH : sysctl_perf_event_max_stack;
2959 		max_stack_user_set = false;
2960 	}
2961 
2962 #ifdef HAVE_DWARF_UNWIND_SUPPORT
2963 	if ((trace.min_stack || max_stack_user_set) && !callchain_param.enabled && trace.trace_syscalls)
2964 		record_opts__parse_callchain(&trace.opts, &callchain_param, "dwarf", false);
2965 #endif
2966 
2967 	if (callchain_param.enabled) {
2968 		if (!mmap_pages_user_set && geteuid() == 0)
2969 			trace.opts.mmap_pages = perf_event_mlock_kb_in_pages() * 4;
2970 
2971 		symbol_conf.use_callchain = true;
2972 	}
2973 
2974 	if (trace.evlist->nr_entries > 0)
2975 		evlist__set_evsel_handler(trace.evlist, trace__event_handler);
2976 
2977 	if ((argc >= 1) && (strcmp(argv[0], "record") == 0))
2978 		return trace__record(&trace, argc-1, &argv[1]);
2979 
2980 	/* summary_only implies summary option, but don't overwrite summary if set */
2981 	if (trace.summary_only)
2982 		trace.summary = trace.summary_only;
2983 
2984 	if (!trace.trace_syscalls && !trace.trace_pgfaults &&
2985 	    trace.evlist->nr_entries == 0 /* Was --events used? */) {
2986 		pr_err("Please specify something to trace.\n");
2987 		return -1;
2988 	}
2989 
2990 	if (!trace.trace_syscalls && trace.ev_qualifier) {
2991 		pr_err("The -e option can't be used with --no-syscalls.\n");
2992 		goto out;
2993 	}
2994 
2995 	if (output_name != NULL) {
2996 		err = trace__open_output(&trace, output_name);
2997 		if (err < 0) {
2998 			perror("failed to create output file");
2999 			goto out;
3000 		}
3001 	}
3002 
3003 	trace.open_id = syscalltbl__id(trace.sctbl, "open");
3004 
3005 	err = target__validate(&trace.opts.target);
3006 	if (err) {
3007 		target__strerror(&trace.opts.target, err, bf, sizeof(bf));
3008 		fprintf(trace.output, "%s", bf);
3009 		goto out_close;
3010 	}
3011 
3012 	err = target__parse_uid(&trace.opts.target);
3013 	if (err) {
3014 		target__strerror(&trace.opts.target, err, bf, sizeof(bf));
3015 		fprintf(trace.output, "%s", bf);
3016 		goto out_close;
3017 	}
3018 
3019 	if (!argc && target__none(&trace.opts.target))
3020 		trace.opts.target.system_wide = true;
3021 
3022 	if (input_name)
3023 		err = trace__replay(&trace);
3024 	else
3025 		err = trace__run(&trace, argc, argv);
3026 
3027 out_close:
3028 	if (output_name != NULL)
3029 		fclose(trace.output);
3030 out:
3031 	return err;
3032 }
3033