xref: /openbmc/linux/tools/perf/builtin-trace.c (revision fb960bd2)
1 /*
2  * builtin-trace.c
3  *
4  * Builtin 'trace' command:
5  *
6  * Display a continuously updated trace of any workload, CPU, specific PID,
7  * system wide, etc.  Default format is loosely strace like, but any other
8  * event may be specified using --event.
9  *
10  * Copyright (C) 2012, 2013, 2014, 2015 Red Hat Inc, Arnaldo Carvalho de Melo <acme@redhat.com>
11  *
12  * Initially based on the 'trace' prototype by Thomas Gleixner:
13  *
14  * http://lwn.net/Articles/415728/ ("Announcing a new utility: 'trace'")
15  *
16  * Released under the GPL v2. (and only v2, not any later version)
17  */
18 
19 #include <traceevent/event-parse.h>
20 #include <api/fs/tracing_path.h>
21 #include "builtin.h"
22 #include "util/color.h"
23 #include "util/debug.h"
24 #include "util/event.h"
25 #include "util/evlist.h"
26 #include <subcmd/exec-cmd.h>
27 #include "util/machine.h"
28 #include "util/path.h"
29 #include "util/session.h"
30 #include "util/thread.h"
31 #include <subcmd/parse-options.h>
32 #include "util/strlist.h"
33 #include "util/intlist.h"
34 #include "util/thread_map.h"
35 #include "util/stat.h"
36 #include "trace/beauty/beauty.h"
37 #include "trace-event.h"
38 #include "util/parse-events.h"
39 #include "util/bpf-loader.h"
40 #include "callchain.h"
41 #include "print_binary.h"
42 #include "string2.h"
43 #include "syscalltbl.h"
44 #include "rb_resort.h"
45 
46 #include <errno.h>
47 #include <inttypes.h>
48 #include <libaudit.h> /* FIXME: Still needed for audit_errno_to_name */
49 #include <poll.h>
50 #include <signal.h>
51 #include <stdlib.h>
52 #include <string.h>
53 #include <linux/err.h>
54 #include <linux/filter.h>
55 #include <linux/audit.h>
56 #include <linux/kernel.h>
57 #include <linux/random.h>
58 #include <linux/stringify.h>
59 #include <linux/time64.h>
60 
61 #include "sane_ctype.h"
62 
#ifndef O_CLOEXEC
/* Fallback for libc headers that predate O_CLOEXEC; value from the Linux ABI. */
# define O_CLOEXEC		02000000
#endif

#ifndef F_LINUX_SPECIFIC_BASE
/* Base for the Linux-specific fcntl() commands (F_SETLEASE etc.). */
# define F_LINUX_SPECIFIC_BASE	1024
#endif
70 
/*
 * Global state for one 'perf trace' session: perf_tool callbacks, the
 * syscall table, target/filter configuration and output options.
 */
struct trace {
	struct perf_tool	tool;
	struct syscalltbl	*sctbl;
	struct {
		int		max;	/* highest id in 'table' -- TODO confirm against users */
		struct syscall  *table;	/* indexed by syscall id */
		struct {
			/* the raw_syscalls (or syscalls) enter/exit tracepoints */
			struct perf_evsel *sys_enter,
					  *sys_exit;
		}		events;
	} syscalls;
	struct record_opts	opts;
	struct perf_evlist	*evlist;
	struct machine		*host;
	struct thread		*current;
	u64			base_time;
	FILE			*output;
	unsigned long		nr_events;
	struct strlist		*ev_qualifier;		/* -e syscall name list */
	struct {
		size_t		nr;
		int		*entries;		/* ev_qualifier resolved to syscall ids */
	}			ev_qualifier_ids;
	struct {
		size_t		nr;
		pid_t		*entries;		/* pids to filter out */
	}			filter_pids;
	double			duration_filter;
	double			runtime_ms;
	struct {
		/* how often filenames came from the tracepoint vs /proc */
		u64		vfs_getname,
				proc_getname;
	} stats;
	unsigned int		max_stack;
	unsigned int		min_stack;
	bool			not_ev_qualifier;	/* ev_qualifier is an exclude list */
	bool			live;			/* tracing live vs processing a perf.data file */
	bool			full_time;
	bool			sched;
	bool			multiple_threads;
	bool			summary;
	bool			summary_only;
	bool			show_comm;
	bool			show_tool_stats;
	bool			trace_syscalls;
	bool			kernel_syscallchains;
	bool			force;
	bool			vfs_getname;
	int			trace_pgfaults;		/* TRACE_PFMAJ/TRACE_PFMIN bits */
	int			open_id;
};
122 
/*
 * Accessor for one tracepoint field: its offset into the sample's raw
 * payload plus a typed reader, either returning the value as a u64 or a
 * pointer into the payload (the union: a field is one or the other).
 */
struct tp_field {
	int offset;
	union {
		u64 (*integer)(struct tp_field *field, struct perf_sample *sample);
		void *(*pointer)(struct tp_field *field, struct perf_sample *sample);
	};
};
130 
/*
 * Generate tp_field__u{8,16,32,64}(): read an unsigned integer of the
 * given width from the sample payload at the field's offset.  memcpy()
 * rather than a direct cast, presumably to cope with unaligned offsets.
 */
#define TP_UINT_FIELD(bits) \
static u64 tp_field__u##bits(struct tp_field *field, struct perf_sample *sample) \
{ \
	u##bits value; \
	memcpy(&value, sample->raw_data + field->offset, sizeof(value)); \
	return value;  \
}

TP_UINT_FIELD(8);
TP_UINT_FIELD(16);
TP_UINT_FIELD(32);
TP_UINT_FIELD(64);
143 
/*
 * Same as TP_UINT_FIELD(), but byte-swapping the value, for samples
 * recorded on a host of the opposite endianness (see evsel->needs_swap
 * in tp_field__init_uint()).
 */
#define TP_UINT_FIELD__SWAPPED(bits) \
static u64 tp_field__swapped_u##bits(struct tp_field *field, struct perf_sample *sample) \
{ \
	u##bits value; \
	memcpy(&value, sample->raw_data + field->offset, sizeof(value)); \
	return bswap_##bits(value);\
}

TP_UINT_FIELD__SWAPPED(16);
TP_UINT_FIELD__SWAPPED(32);
TP_UINT_FIELD__SWAPPED(64);
155 
156 static int tp_field__init_uint(struct tp_field *field,
157 			       struct format_field *format_field,
158 			       bool needs_swap)
159 {
160 	field->offset = format_field->offset;
161 
162 	switch (format_field->size) {
163 	case 1:
164 		field->integer = tp_field__u8;
165 		break;
166 	case 2:
167 		field->integer = needs_swap ? tp_field__swapped_u16 : tp_field__u16;
168 		break;
169 	case 4:
170 		field->integer = needs_swap ? tp_field__swapped_u32 : tp_field__u32;
171 		break;
172 	case 8:
173 		field->integer = needs_swap ? tp_field__swapped_u64 : tp_field__u64;
174 		break;
175 	default:
176 		return -1;
177 	}
178 
179 	return 0;
180 }
181 
182 static void *tp_field__ptr(struct tp_field *field, struct perf_sample *sample)
183 {
184 	return sample->raw_data + field->offset;
185 }
186 
187 static int tp_field__init_ptr(struct tp_field *field, struct format_field *format_field)
188 {
189 	field->offset = format_field->offset;
190 	field->pointer = tp_field__ptr;
191 	return 0;
192 }
193 
/*
 * Field accessors for the sys_enter/sys_exit tracepoints: the syscall id
 * plus either the arguments array (sys_enter) or the return value
 * (sys_exit) -- never both, hence the union.
 */
struct syscall_tp {
	struct tp_field id;
	union {
		struct tp_field args, ret;
	};
};
200 
201 static int perf_evsel__init_tp_uint_field(struct perf_evsel *evsel,
202 					  struct tp_field *field,
203 					  const char *name)
204 {
205 	struct format_field *format_field = perf_evsel__field(evsel, name);
206 
207 	if (format_field == NULL)
208 		return -1;
209 
210 	return tp_field__init_uint(field, format_field, evsel->needs_swap);
211 }
212 
/*
 * Initialize the integer accessor in the evsel's private syscall_tp for
 * the tracepoint field of the same (stringified) name.
 */
#define perf_evsel__init_sc_tp_uint_field(evsel, name) \
	({ struct syscall_tp *sc = evsel->priv;\
	   perf_evsel__init_tp_uint_field(evsel, &sc->name, #name); })
216 
217 static int perf_evsel__init_tp_ptr_field(struct perf_evsel *evsel,
218 					 struct tp_field *field,
219 					 const char *name)
220 {
221 	struct format_field *format_field = perf_evsel__field(evsel, name);
222 
223 	if (format_field == NULL)
224 		return -1;
225 
226 	return tp_field__init_ptr(field, format_field);
227 }
228 
/*
 * Initialize the pointer accessor in the evsel's private syscall_tp for
 * the tracepoint field of the same (stringified) name.
 */
#define perf_evsel__init_sc_tp_ptr_field(evsel, name) \
	({ struct syscall_tp *sc = evsel->priv;\
	   perf_evsel__init_tp_ptr_field(evsel, &sc->name, #name); })
232 
/* Free the evsel's private syscall_tp state, then delete the evsel itself. */
static void perf_evsel__delete_priv(struct perf_evsel *evsel)
{
	zfree(&evsel->priv);
	perf_evsel__delete(evsel);
}
238 
239 static int perf_evsel__init_syscall_tp(struct perf_evsel *evsel, void *handler)
240 {
241 	evsel->priv = malloc(sizeof(struct syscall_tp));
242 	if (evsel->priv != NULL) {
243 		if (perf_evsel__init_sc_tp_uint_field(evsel, id))
244 			goto out_delete;
245 
246 		evsel->handler = handler;
247 		return 0;
248 	}
249 
250 	return -ENOMEM;
251 
252 out_delete:
253 	zfree(&evsel->priv);
254 	return -ENOENT;
255 }
256 
257 static struct perf_evsel *perf_evsel__syscall_newtp(const char *direction, void *handler)
258 {
259 	struct perf_evsel *evsel = perf_evsel__newtp("raw_syscalls", direction);
260 
261 	/* older kernel (e.g., RHEL6) use syscalls:{enter,exit} */
262 	if (IS_ERR(evsel))
263 		evsel = perf_evsel__newtp("syscalls", direction);
264 
265 	if (IS_ERR(evsel))
266 		return NULL;
267 
268 	if (perf_evsel__init_syscall_tp(evsel, handler))
269 		goto out_delete;
270 
271 	return evsel;
272 
273 out_delete:
274 	perf_evsel__delete_priv(evsel);
275 	return NULL;
276 }
277 
/*
 * Fetch the 'name' field (id/args/ret) of a syscall tracepoint sample
 * through the accessors installed by perf_evsel__init_syscall_tp().
 */
#define perf_evsel__sc_tp_uint(evsel, name, sample) \
	({ struct syscall_tp *fields = evsel->priv; \
	   fields->name.integer(&fields->name, sample); })

#define perf_evsel__sc_tp_ptr(evsel, name, sample) \
	({ struct syscall_tp *fields = evsel->priv; \
	   fields->name.pointer(&fields->name, sample); })
285 
286 size_t strarray__scnprintf(struct strarray *sa, char *bf, size_t size, const char *intfmt, int val)
287 {
288 	int idx = val - sa->offset;
289 
290 	if (idx < 0 || idx >= sa->nr_entries)
291 		return scnprintf(bf, size, intfmt, val);
292 
293 	return scnprintf(bf, size, "%s", sa->entries[idx]);
294 }
295 
/* strarray__scnprintf() adapter taking the table from arg->parm. */
static size_t __syscall_arg__scnprintf_strarray(char *bf, size_t size,
						const char *intfmt,
					        struct syscall_arg *arg)
{
	return strarray__scnprintf(arg->parm, bf, size, intfmt, arg->val);
}
302 
/* Default strarray formatter: out-of-table values printed as decimal. */
static size_t syscall_arg__scnprintf_strarray(char *bf, size_t size,
					      struct syscall_arg *arg)
{
	return __syscall_arg__scnprintf_strarray(bf, size, "%d", arg);
}

#define SCA_STRARRAY syscall_arg__scnprintf_strarray
310 
/*
 * A set of string tables with different offsets, searched in order by
 * syscall_arg__scnprintf_strarrays() (e.g. plain + Linux-specific fcntl
 * commands).
 */
struct strarrays {
	int		nr_entries;
	struct strarray **entries;
};

#define DEFINE_STRARRAYS(array) struct strarrays strarrays__##array = { \
	.nr_entries = ARRAY_SIZE(array), \
	.entries = array, \
}
320 
321 size_t syscall_arg__scnprintf_strarrays(char *bf, size_t size,
322 					struct syscall_arg *arg)
323 {
324 	struct strarrays *sas = arg->parm;
325 	int i;
326 
327 	for (i = 0; i < sas->nr_entries; ++i) {
328 		struct strarray *sa = sas->entries[i];
329 		int idx = arg->val - sa->offset;
330 
331 		if (idx >= 0 && idx < sa->nr_entries) {
332 			if (sa->entries[idx] == NULL)
333 				break;
334 			return scnprintf(bf, size, "%s", sa->entries[idx]);
335 		}
336 	}
337 
338 	return scnprintf(bf, size, "%d", arg->val);
339 }
340 
#ifndef AT_FDCWD
/* Fallback for old headers; the *at() "current working directory" pseudo-fd. */
#define AT_FDCWD	-100
#endif
344 
345 static size_t syscall_arg__scnprintf_fd_at(char *bf, size_t size,
346 					   struct syscall_arg *arg)
347 {
348 	int fd = arg->val;
349 
350 	if (fd == AT_FDCWD)
351 		return scnprintf(bf, size, "CWD");
352 
353 	return syscall_arg__scnprintf_fd(bf, size, arg);
354 }
355 
356 #define SCA_FDAT syscall_arg__scnprintf_fd_at
357 
/* Forward declaration: defined later in this file (it is static). */
static size_t syscall_arg__scnprintf_close_fd(char *bf, size_t size,
					      struct syscall_arg *arg);

#define SCA_CLOSE_FD syscall_arg__scnprintf_close_fd
362 
/* Generic fallback formatters: hexadecimal, signed int and signed long. */
size_t syscall_arg__scnprintf_hex(char *bf, size_t size, struct syscall_arg *arg)
{
	return scnprintf(bf, size, "%#lx", arg->val);
}

size_t syscall_arg__scnprintf_int(char *bf, size_t size, struct syscall_arg *arg)
{
	return scnprintf(bf, size, "%d", arg->val);
}

size_t syscall_arg__scnprintf_long(char *bf, size_t size, struct syscall_arg *arg)
{
	return scnprintf(bf, size, "%ld", arg->val);
}
377 
/*
 * String tables used (via SCA_STRARRAY & co) to decode enum-like syscall
 * arguments.  DEFINE_STRARRAY_OFFSET() is for tables whose first entry
 * maps to a value other than 0 (e.g. epoll_ctl ops start at 1).
 */
static const char *bpf_cmd[] = {
	"MAP_CREATE", "MAP_LOOKUP_ELEM", "MAP_UPDATE_ELEM", "MAP_DELETE_ELEM",
	"MAP_GET_NEXT_KEY", "PROG_LOAD",
};
static DEFINE_STRARRAY(bpf_cmd);

static const char *epoll_ctl_ops[] = { "ADD", "DEL", "MOD", };
static DEFINE_STRARRAY_OFFSET(epoll_ctl_ops, 1);

static const char *itimers[] = { "REAL", "VIRTUAL", "PROF", };
static DEFINE_STRARRAY(itimers);

static const char *keyctl_options[] = {
	"GET_KEYRING_ID", "JOIN_SESSION_KEYRING", "UPDATE", "REVOKE", "CHOWN",
	"SETPERM", "DESCRIBE", "CLEAR", "LINK", "UNLINK", "SEARCH", "READ",
	"INSTANTIATE", "NEGATE", "SET_REQKEY_KEYRING", "SET_TIMEOUT",
	"ASSUME_AUTHORITY", "GET_SECURITY", "SESSION_TO_PARENT", "REJECT",
	"INSTANTIATE_IOV", "INVALIDATE", "GET_PERSISTENT",
};
static DEFINE_STRARRAY(keyctl_options);

static const char *whences[] = { "SET", "CUR", "END",
#ifdef SEEK_DATA
"DATA",
#endif
#ifdef SEEK_HOLE
"HOLE",
#endif
};
static DEFINE_STRARRAY(whences);

static const char *fcntl_cmds[] = {
	"DUPFD", "GETFD", "SETFD", "GETFL", "SETFL", "GETLK", "SETLK",
	"SETLKW", "SETOWN", "GETOWN", "SETSIG", "GETSIG", "GETLK64",
	"SETLK64", "SETLKW64", "SETOWN_EX", "GETOWN_EX",
	"GETOWNER_UIDS",
};
static DEFINE_STRARRAY(fcntl_cmds);

static const char *fcntl_linux_specific_cmds[] = {
	"SETLEASE", "GETLEASE", "NOTIFY", [5] =	"CANCELLK", "DUPFD_CLOEXEC",
	"SETPIPE_SZ", "GETPIPE_SZ", "ADD_SEALS", "GET_SEALS",
	"GET_RW_HINT", "SET_RW_HINT", "GET_FILE_RW_HINT", "SET_FILE_RW_HINT",
};

static DEFINE_STRARRAY_OFFSET(fcntl_linux_specific_cmds, F_LINUX_SPECIFIC_BASE);

/* Both fcntl tables, searched in order by SCA_FCNTL_CMD. */
static struct strarray *fcntl_cmds_arrays[] = {
	&strarray__fcntl_cmds,
	&strarray__fcntl_linux_specific_cmds,
};

static DEFINE_STRARRAYS(fcntl_cmds_arrays);

static const char *rlimit_resources[] = {
	"CPU", "FSIZE", "DATA", "STACK", "CORE", "RSS", "NPROC", "NOFILE",
	"MEMLOCK", "AS", "LOCKS", "SIGPENDING", "MSGQUEUE", "NICE", "RTPRIO",
	"RTTIME",
};
static DEFINE_STRARRAY(rlimit_resources);

static const char *sighow[] = { "BLOCK", "UNBLOCK", "SETMASK", };
static DEFINE_STRARRAY(sighow);

static const char *clockid[] = {
	"REALTIME", "MONOTONIC", "PROCESS_CPUTIME_ID", "THREAD_CPUTIME_ID",
	"MONOTONIC_RAW", "REALTIME_COARSE", "MONOTONIC_COARSE", "BOOTTIME",
	"REALTIME_ALARM", "BOOTTIME_ALARM", "SGI_CYCLE", "TAI"
};
static DEFINE_STRARRAY(clockid);

static const char *socket_families[] = {
	"UNSPEC", "LOCAL", "INET", "AX25", "IPX", "APPLETALK", "NETROM",
	"BRIDGE", "ATMPVC", "X25", "INET6", "ROSE", "DECnet", "NETBEUI",
	"SECURITY", "KEY", "NETLINK", "PACKET", "ASH", "ECONET", "ATMSVC",
	"RDS", "SNA", "IRDA", "PPPOX", "WANPIPE", "LLC", "IB", "CAN", "TIPC",
	"BLUETOOTH", "IUCV", "RXRPC", "ISDN", "PHONET", "IEEE802154", "CAIF",
	"ALG", "NFC", "VSOCK",
};
static DEFINE_STRARRAY(socket_families);
458 
/*
 * Pretty-print an access(2) mode argument: "F" for F_OK, otherwise the
 * set R/W/X bits concatenated, with any unknown leftover bits in hex.
 */
static size_t syscall_arg__scnprintf_access_mode(char *bf, size_t size,
						 struct syscall_arg *arg)
{
	size_t printed = 0;
	int mode = arg->val;

	if (mode == F_OK) /* 0 */
		return scnprintf(bf, size, "F");
#define	P_MODE(n) \
	if (mode & n##_OK) { \
		printed += scnprintf(bf + printed, size - printed, "%s", #n); \
		mode &= ~n##_OK; \
	}

	P_MODE(R);
	P_MODE(W);
	P_MODE(X);
#undef P_MODE

	/* anything still set has no symbolic name */
	if (mode)
		printed += scnprintf(bf + printed, size - printed, "|%#x", mode);

	return printed;
}

#define SCA_ACCMODE syscall_arg__scnprintf_access_mode
485 
/* Forward declaration: defined later in this file (it is static). */
static size_t syscall_arg__scnprintf_filename(char *bf, size_t size,
					      struct syscall_arg *arg);

#define SCA_FILENAME syscall_arg__scnprintf_filename
490 
/*
 * Pretty-print a pipe2(2) flags argument as a '|'-separated list of the
 * known O_* flags, with any leftover bits shown in hex.
 */
static size_t syscall_arg__scnprintf_pipe_flags(char *bf, size_t size,
						struct syscall_arg *arg)
{
	int printed = 0, flags = arg->val;

#define	P_FLAG(n) \
	if (flags & O_##n) { \
		printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
		flags &= ~O_##n; \
	}

	P_FLAG(CLOEXEC);
	P_FLAG(NONBLOCK);
#undef P_FLAG

	/* anything still set has no symbolic name */
	if (flags)
		printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);

	return printed;
}

#define SCA_PIPE_FLAGS syscall_arg__scnprintf_pipe_flags
513 
/* Fallbacks for headers that predate getrandom(2); values from the uapi. */
#ifndef GRND_NONBLOCK
#define GRND_NONBLOCK	0x0001
#endif
#ifndef GRND_RANDOM
#define GRND_RANDOM	0x0002
#endif

/*
 * Pretty-print a getrandom(2) flags argument as a '|'-separated list of
 * the known GRND_* flags, with any leftover bits shown in hex.
 */
static size_t syscall_arg__scnprintf_getrandom_flags(char *bf, size_t size,
						   struct syscall_arg *arg)
{
	int printed = 0, flags = arg->val;

#define	P_FLAG(n) \
	if (flags & GRND_##n) { \
		printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
		flags &= ~GRND_##n; \
	}

	P_FLAG(RANDOM);
	P_FLAG(NONBLOCK);
#undef P_FLAG

	/* anything still set has no symbolic name */
	if (flags)
		printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);

	return printed;
}

#define SCA_GETRANDOM_FLAGS syscall_arg__scnprintf_getrandom_flags
543 
/*
 * Shorthand for a syscall_arg_fmt initializer that prints via
 * SCA_STRARRAY from the given DEFINE_STRARRAY() table.  The 'name'
 * argument is not expanded -- it only documents the call site.
 */
#define STRARRAY(name, array) \
	  { .scnprintf	= SCA_STRARRAY, \
	    .parm	= &strarray__##array, }
547 
548 #include "trace/beauty/eventfd.c"
549 #include "trace/beauty/flock.c"
550 #include "trace/beauty/futex_op.c"
551 #include "trace/beauty/mmap.c"
552 #include "trace/beauty/mode_t.c"
553 #include "trace/beauty/msg_flags.c"
554 #include "trace/beauty/open_flags.c"
555 #include "trace/beauty/perf_event_open.c"
556 #include "trace/beauty/pid.c"
557 #include "trace/beauty/sched_policy.c"
558 #include "trace/beauty/seccomp.c"
559 #include "trace/beauty/signum.c"
560 #include "trace/beauty/socket_type.c"
561 #include "trace/beauty/waitid_options.c"
562 
/*
 * How to format one syscall argument: a scnprintf-style pretty printer,
 * an opaque parameter for it (e.g. a strarray), the argument's name and
 * show_zero -- presumably forcing zero-valued args to be printed; its
 * users are elsewhere in this file.
 */
struct syscall_arg_fmt {
	size_t	   (*scnprintf)(char *bf, size_t size, struct syscall_arg *arg);
	void	   *parm;
	const char *name;
	bool	   show_zero;
};
569 
/*
 * Per-syscall formatting overrides.  Entries MUST be kept sorted by
 * ->name: syscall_fmt__find() looks them up with bsearch()/strcmp().
 *
 * .alias maps to a differently-named tracepoint (e.g. fstat/newfstat);
 * .nr_args with named .arg entries overrides the tracepoint's argument
 * list; .errpid/.timeout/.hexret tweak return-value presentation
 * (presumably pid-returning, timeout-returning and pointer-returning
 * syscalls respectively -- their users are elsewhere in this file).
 */
static struct syscall_fmt {
	const char *name;
	const char *alias;
	struct syscall_arg_fmt arg[6];
	u8	   nr_args;
	bool	   errpid;
	bool	   timeout;
	bool	   hexret;
} syscall_fmts[] = {
	{ .name	    = "access",
	  .arg = { [1] = { .scnprintf = SCA_ACCMODE,  /* mode */ }, }, },
	{ .name	    = "bpf",
	  .arg = { [0] = STRARRAY(cmd, bpf_cmd), }, },
	{ .name	    = "brk",	    .hexret = true,
	  .arg = { [0] = { .scnprintf = SCA_HEX, /* brk */ }, }, },
	{ .name     = "clock_gettime",
	  .arg = { [0] = STRARRAY(clk_id, clockid), }, },
	{ .name	    = "clone",	    .errpid = true, .nr_args = 5,
	  .arg = { [0] = { .name = "flags",	    .scnprintf = SCA_CLONE_FLAGS, },
		   [1] = { .name = "child_stack",   .scnprintf = SCA_HEX, },
		   [2] = { .name = "parent_tidptr", .scnprintf = SCA_HEX, },
		   [3] = { .name = "child_tidptr",  .scnprintf = SCA_HEX, },
		   [4] = { .name = "tls",	    .scnprintf = SCA_HEX, }, }, },
	{ .name	    = "close",
	  .arg = { [0] = { .scnprintf = SCA_CLOSE_FD, /* fd */ }, }, },
	{ .name	    = "epoll_ctl",
	  .arg = { [1] = STRARRAY(op, epoll_ctl_ops), }, },
	{ .name	    = "eventfd2",
	  .arg = { [1] = { .scnprintf = SCA_EFD_FLAGS, /* flags */ }, }, },
	{ .name	    = "fchmodat",
	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, },
	{ .name	    = "fchownat",
	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, },
	{ .name	    = "fcntl",
	  .arg = { [1] = { .scnprintf = SCA_FCNTL_CMD, /* cmd */
			   .parm      = &strarrays__fcntl_cmds_arrays,
			   .show_zero = true, },
		   [2] = { .scnprintf =  SCA_FCNTL_ARG, /* arg */ }, }, },
	{ .name	    = "flock",
	  .arg = { [1] = { .scnprintf = SCA_FLOCK, /* cmd */ }, }, },
	{ .name	    = "fstat", .alias = "newfstat", },
	{ .name	    = "fstatat", .alias = "newfstatat", },
	{ .name	    = "futex",
	  .arg = { [1] = { .scnprintf = SCA_FUTEX_OP, /* op */ }, }, },
	{ .name	    = "futimesat",
	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, },
	{ .name	    = "getitimer",
	  .arg = { [0] = STRARRAY(which, itimers), }, },
	{ .name	    = "getpid",	    .errpid = true, },
	{ .name	    = "getpgid",    .errpid = true, },
	{ .name	    = "getppid",    .errpid = true, },
	{ .name	    = "getrandom",
	  .arg = { [2] = { .scnprintf = SCA_GETRANDOM_FLAGS, /* flags */ }, }, },
	{ .name	    = "getrlimit",
	  .arg = { [0] = STRARRAY(resource, rlimit_resources), }, },
	{ .name	    = "ioctl",
	  .arg = {
#if defined(__i386__) || defined(__x86_64__)
/*
 * FIXME: Make this available to all arches.
 */
		   [1] = { .scnprintf = SCA_IOCTL_CMD, /* cmd */ },
		   [2] = { .scnprintf = SCA_HEX, /* arg */ }, }, },
#else
		   [2] = { .scnprintf = SCA_HEX, /* arg */ }, }, },
#endif
	{ .name	    = "kcmp",	    .nr_args = 5,
	  .arg = { [0] = { .name = "pid1",	.scnprintf = SCA_PID, },
		   [1] = { .name = "pid2",	.scnprintf = SCA_PID, },
		   [2] = { .name = "type",	.scnprintf = SCA_KCMP_TYPE, },
		   [3] = { .name = "idx1",	.scnprintf = SCA_KCMP_IDX, },
		   [4] = { .name = "idx2",	.scnprintf = SCA_KCMP_IDX, }, }, },
	{ .name	    = "keyctl",
	  .arg = { [0] = STRARRAY(option, keyctl_options), }, },
	{ .name	    = "kill",
	  .arg = { [1] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
	{ .name	    = "linkat",
	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, },
	{ .name	    = "lseek",
	  .arg = { [2] = STRARRAY(whence, whences), }, },
	{ .name	    = "lstat", .alias = "newlstat", },
	{ .name     = "madvise",
	  .arg = { [0] = { .scnprintf = SCA_HEX,      /* start */ },
		   [2] = { .scnprintf = SCA_MADV_BHV, /* behavior */ }, }, },
	{ .name	    = "mkdirat",
	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, },
	{ .name	    = "mknodat",
	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, },
	{ .name	    = "mlock",
	  .arg = { [0] = { .scnprintf = SCA_HEX, /* addr */ }, }, },
	{ .name	    = "mlockall",
	  .arg = { [0] = { .scnprintf = SCA_HEX, /* addr */ }, }, },
	{ .name	    = "mmap",	    .hexret = true,
/* The standard mmap maps to old_mmap on s390x */
#if defined(__s390x__)
	.alias = "old_mmap",
#endif
	  .arg = { [0] = { .scnprintf = SCA_HEX,	/* addr */ },
		   [2] = { .scnprintf = SCA_MMAP_PROT,	/* prot */ },
		   [3] = { .scnprintf = SCA_MMAP_FLAGS,	/* flags */ }, }, },
	{ .name	    = "mprotect",
	  .arg = { [0] = { .scnprintf = SCA_HEX,	/* start */ },
		   [2] = { .scnprintf = SCA_MMAP_PROT,	/* prot */ }, }, },
	{ .name	    = "mq_unlink",
	  .arg = { [0] = { .scnprintf = SCA_FILENAME, /* u_name */ }, }, },
	{ .name	    = "mremap",	    .hexret = true,
	  .arg = { [0] = { .scnprintf = SCA_HEX,	  /* addr */ },
		   [3] = { .scnprintf = SCA_MREMAP_FLAGS, /* flags */ },
		   [4] = { .scnprintf = SCA_HEX,	  /* new_addr */ }, }, },
	{ .name	    = "munlock",
	  .arg = { [0] = { .scnprintf = SCA_HEX, /* addr */ }, }, },
	{ .name	    = "munmap",
	  .arg = { [0] = { .scnprintf = SCA_HEX, /* addr */ }, }, },
	{ .name	    = "name_to_handle_at",
	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, },
	{ .name	    = "newfstatat",
	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, },
	{ .name	    = "open",
	  .arg = { [1] = { .scnprintf = SCA_OPEN_FLAGS, /* flags */ }, }, },
	{ .name	    = "open_by_handle_at",
	  .arg = { [0] = { .scnprintf = SCA_FDAT,	/* dfd */ },
		   [2] = { .scnprintf = SCA_OPEN_FLAGS, /* flags */ }, }, },
	{ .name	    = "openat",
	  .arg = { [0] = { .scnprintf = SCA_FDAT,	/* dfd */ },
		   [2] = { .scnprintf = SCA_OPEN_FLAGS, /* flags */ }, }, },
	{ .name	    = "perf_event_open",
	  .arg = { [2] = { .scnprintf = SCA_INT,	/* cpu */ },
		   [3] = { .scnprintf = SCA_FD,		/* group_fd */ },
		   [4] = { .scnprintf = SCA_PERF_FLAGS, /* flags */ }, }, },
	{ .name	    = "pipe2",
	  .arg = { [1] = { .scnprintf = SCA_PIPE_FLAGS, /* flags */ }, }, },
	{ .name	    = "pkey_alloc",
	  .arg = { [1] = { .scnprintf = SCA_PKEY_ALLOC_ACCESS_RIGHTS,	/* access_rights */ }, }, },
	{ .name	    = "pkey_free",
	  .arg = { [0] = { .scnprintf = SCA_INT,	/* key */ }, }, },
	{ .name	    = "pkey_mprotect",
	  .arg = { [0] = { .scnprintf = SCA_HEX,	/* start */ },
		   [2] = { .scnprintf = SCA_MMAP_PROT,	/* prot */ },
		   [3] = { .scnprintf = SCA_INT,	/* pkey */ }, }, },
	{ .name	    = "poll", .timeout = true, },
	{ .name	    = "ppoll", .timeout = true, },
	{ .name	    = "prctl", .alias = "arch_prctl",
	  .arg = { [0] = { .scnprintf = SCA_PRCTL_OPTION, /* option */ },
		   [1] = { .scnprintf = SCA_PRCTL_ARG2, /* arg2 */ },
		   [2] = { .scnprintf = SCA_PRCTL_ARG3, /* arg3 */ }, }, },
	{ .name	    = "pread", .alias = "pread64", },
	{ .name	    = "preadv", .alias = "pread", },
	{ .name	    = "prlimit64",
	  .arg = { [1] = STRARRAY(resource, rlimit_resources), }, },
	{ .name	    = "pwrite", .alias = "pwrite64", },
	{ .name	    = "readlinkat",
	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, },
	{ .name	    = "recvfrom",
	  .arg = { [3] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, },
	{ .name	    = "recvmmsg",
	  .arg = { [3] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, },
	{ .name	    = "recvmsg",
	  .arg = { [2] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, },
	{ .name	    = "renameat",
	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, },
	{ .name	    = "rt_sigaction",
	  .arg = { [0] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
	{ .name	    = "rt_sigprocmask",
	  .arg = { [0] = STRARRAY(how, sighow), }, },
	{ .name	    = "rt_sigqueueinfo",
	  .arg = { [1] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
	{ .name	    = "rt_tgsigqueueinfo",
	  .arg = { [2] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
	{ .name	    = "sched_setscheduler",
	  .arg = { [1] = { .scnprintf = SCA_SCHED_POLICY, /* policy */ }, }, },
	{ .name	    = "seccomp",
	  .arg = { [0] = { .scnprintf = SCA_SECCOMP_OP,	   /* op */ },
		   [1] = { .scnprintf = SCA_SECCOMP_FLAGS, /* flags */ }, }, },
	{ .name	    = "select", .timeout = true, },
	{ .name	    = "sendmmsg",
	  .arg = { [3] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, },
	{ .name	    = "sendmsg",
	  .arg = { [2] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, },
	{ .name	    = "sendto",
	  .arg = { [3] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, },
	{ .name	    = "set_tid_address", .errpid = true, },
	{ .name	    = "setitimer",
	  .arg = { [0] = STRARRAY(which, itimers), }, },
	{ .name	    = "setrlimit",
	  .arg = { [0] = STRARRAY(resource, rlimit_resources), }, },
	{ .name	    = "socket",
	  .arg = { [0] = STRARRAY(family, socket_families),
		   [1] = { .scnprintf = SCA_SK_TYPE, /* type */ }, }, },
	{ .name	    = "socketpair",
	  .arg = { [0] = STRARRAY(family, socket_families),
		   [1] = { .scnprintf = SCA_SK_TYPE, /* type */ }, }, },
	{ .name	    = "stat", .alias = "newstat", },
	{ .name	    = "statx",
	  .arg = { [0] = { .scnprintf = SCA_FDAT,	 /* fdat */ },
		   [2] = { .scnprintf = SCA_STATX_FLAGS, /* flags */ } ,
		   [3] = { .scnprintf = SCA_STATX_MASK,	 /* mask */ }, }, },
	{ .name	    = "swapoff",
	  .arg = { [0] = { .scnprintf = SCA_FILENAME, /* specialfile */ }, }, },
	{ .name	    = "swapon",
	  .arg = { [0] = { .scnprintf = SCA_FILENAME, /* specialfile */ }, }, },
	{ .name	    = "symlinkat",
	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, },
	{ .name	    = "tgkill",
	  .arg = { [2] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
	{ .name	    = "tkill",
	  .arg = { [1] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
	{ .name	    = "uname", .alias = "newuname", },
	{ .name	    = "unlinkat",
	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, },
	{ .name	    = "utimensat",
	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* dirfd */ }, }, },
	{ .name	    = "wait4",	    .errpid = true,
	  .arg = { [2] = { .scnprintf = SCA_WAITID_OPTIONS, /* options */ }, }, },
	{ .name	    = "waitid",	    .errpid = true,
	  .arg = { [3] = { .scnprintf = SCA_WAITID_OPTIONS, /* options */ }, }, },
};
786 
787 static int syscall_fmt__cmp(const void *name, const void *fmtp)
788 {
789 	const struct syscall_fmt *fmt = fmtp;
790 	return strcmp(name, fmt->name);
791 }
792 
793 static struct syscall_fmt *syscall_fmt__find(const char *name)
794 {
795 	const int nmemb = ARRAY_SIZE(syscall_fmts);
796 	return bsearch(name, syscall_fmts, nmemb, sizeof(struct syscall_fmt), syscall_fmt__cmp);
797 }
798 
/*
 * Per-syscall descriptor: the libtraceevent format for its tracepoint,
 * its argument count/list, and the formatting overrides from
 * syscall_fmts[] when present.
 */
struct syscall {
	struct event_format *tp_format;
	int		    nr_args;
	struct format_field *args;
	const char	    *name;
	bool		    is_exit;
	struct syscall_fmt  *fmt;
	struct syscall_arg_fmt *arg_fmt;
};
808 
809 /*
810  * We need to have this 'calculated' boolean because in some cases we really
811  * don't know what is the duration of a syscall, for instance, when we start
812  * a session and some threads are waiting for a syscall to finish, say 'poll',
813  * in which case all we can do is to print "( ? ) for duration and for the
814  * start timestamp.
815  */
816 static size_t fprintf_duration(unsigned long t, bool calculated, FILE *fp)
817 {
818 	double duration = (double)t / NSEC_PER_MSEC;
819 	size_t printed = fprintf(fp, "(");
820 
821 	if (!calculated)
822 		printed += fprintf(fp, "     ?   ");
823 	else if (duration >= 1.0)
824 		printed += color_fprintf(fp, PERF_COLOR_RED, "%6.3f ms", duration);
825 	else if (duration >= 0.01)
826 		printed += color_fprintf(fp, PERF_COLOR_YELLOW, "%6.3f ms", duration);
827 	else
828 		printed += color_fprintf(fp, PERF_COLOR_NORMAL, "%6.3f ms", duration);
829 	return printed + fprintf(fp, "): ");
830 }
831 
832 /**
833  * filename.ptr: The filename char pointer that will be vfs_getname'd
834  * filename.entry_str_pos: Where to insert the string translated from
835  *                         filename.ptr by the vfs_getname tracepoint/kprobe.
836  * ret_scnprintf: syscall args may set this to a different syscall return
837  *                formatter, for instance, fcntl may return fds, file flags, etc.
838  */
/* Per-thread tracing state, hung off thread->priv (see thread__trace()). */
struct thread_trace {
	u64		  entry_time;		/* timestamp of the pending sys_enter */
	bool		  entry_pending;	/* sys_enter seen, sys_exit not yet */
	unsigned long	  nr_events;
	unsigned long	  pfmaj, pfmin;		/* major/minor page fault counts */
	char		  *entry_str;		/* partially formatted syscall entry line */
	double		  runtime_ms;
	size_t		  (*ret_scnprintf)(char *bf, size_t size, struct syscall_arg *arg);
	struct {
		unsigned long ptr;
		short int     entry_str_pos;
		bool	      pending_open;
		unsigned int  namelen;
		char	      *name;
	} filename;
	struct {
		/* fd -> path cache; max is the highest slot, -1 while empty */
		int	  max;
		char	  **table;
	} paths;

	struct intlist *syscall_stats;
};
861 
862 static struct thread_trace *thread_trace__new(void)
863 {
864 	struct thread_trace *ttrace =  zalloc(sizeof(struct thread_trace));
865 
866 	if (ttrace)
867 		ttrace->paths.max = -1;
868 
869 	ttrace->syscall_stats = intlist__new(NULL);
870 
871 	return ttrace;
872 }
873 
/*
 * Return the thread's thread_trace, lazily allocating it on first use
 * and bumping its event count.  On NULL thread or allocation failure a
 * warning is printed to 'fp' and NULL is returned (the sample is dropped).
 */
static struct thread_trace *thread__trace(struct thread *thread, FILE *fp)
{
	struct thread_trace *ttrace;

	if (thread == NULL)
		goto fail;

	/* first event from this thread: attach fresh state */
	if (thread__priv(thread) == NULL)
		thread__set_priv(thread, thread_trace__new());

	/* still NULL here means thread_trace__new() failed */
	if (thread__priv(thread) == NULL)
		goto fail;

	ttrace = thread__priv(thread);
	++ttrace->nr_events;

	return ttrace;
fail:
	color_fprintf(fp, PERF_COLOR_RED,
		      "WARNING: not enough memory, dropping samples!\n");
	return NULL;
}
896 
897 
898 void syscall_arg__set_ret_scnprintf(struct syscall_arg *arg,
899 				    size_t (*ret_scnprintf)(char *bf, size_t size, struct syscall_arg *arg))
900 {
901 	struct thread_trace *ttrace = thread__priv(arg->thread);
902 
903 	ttrace->ret_scnprintf = ret_scnprintf;
904 }
905 
/* Bits for trace->trace_pgfaults: major and minor page faults. */
#define TRACE_PFMAJ		(1 << 0)
#define TRACE_PFMIN		(1 << 1)

/* Size of the buffer used to assemble a syscall line -- presumably
 * thread_trace->entry_str; its allocation is elsewhere in this file. */
static const size_t trace__entry_str_size = 2048;
910 
/*
 * Cache 'pathname' for 'fd' in the thread's fd->path table, growing the
 * table as needed (paths.max is the highest slot allocated so far, -1
 * while the table is empty).  Returns 0 on success, -1 on allocation
 * failure.
 */
static int trace__set_fd_pathname(struct thread *thread, int fd, const char *pathname)
{
	struct thread_trace *ttrace = thread__priv(thread);

	if (fd > ttrace->paths.max) {
		char **npath = realloc(ttrace->paths.table, (fd + 1) * sizeof(char *));

		if (npath == NULL)
			return -1;

		if (ttrace->paths.max != -1) {
			/* zero only the newly added slots */
			memset(npath + ttrace->paths.max + 1, 0,
			       (fd - ttrace->paths.max) * sizeof(char *));
		} else {
			/* first allocation: zero the whole table */
			memset(npath, 0, (fd + 1) * sizeof(char *));
		}

		ttrace->paths.table = npath;
		ttrace->paths.max   = fd;
	}

	ttrace->paths.table[fd] = strdup(pathname);

	return ttrace->paths.table[fd] != NULL ? 0 : -1;
}
936 
/*
 * Resolve 'fd' to a path by reading the /proc/<pid>/fd/<fd> symlink
 * (per-task variant when this isn't the thread-group leader) and cache
 * it via trace__set_fd_pathname().  Returns 0 on success, -1 otherwise.
 */
static int thread__read_fd_path(struct thread *thread, int fd)
{
	char linkname[PATH_MAX], pathname[PATH_MAX];
	struct stat st;
	int ret;

	if (thread->pid_ == thread->tid) {
		scnprintf(linkname, sizeof(linkname),
			  "/proc/%d/fd/%d", thread->pid_, fd);
	} else {
		scnprintf(linkname, sizeof(linkname),
			  "/proc/%d/task/%d/fd/%d", thread->pid_, thread->tid, fd);
	}

	/* the link target must fit in pathname, including the NUL */
	if (lstat(linkname, &st) < 0 || st.st_size + 1 > (off_t)sizeof(pathname))
		return -1;

	ret = readlink(linkname, pathname, sizeof(pathname));

	/* readlink() doesn't NUL-terminate; ret > st.st_size guards a race */
	if (ret < 0 || ret > st.st_size)
		return -1;

	pathname[ret] = '\0';
	return trace__set_fd_pathname(thread, fd, pathname);
}
962 
963 static const char *thread__fd_path(struct thread *thread, int fd,
964 				   struct trace *trace)
965 {
966 	struct thread_trace *ttrace = thread__priv(thread);
967 
968 	if (ttrace == NULL)
969 		return NULL;
970 
971 	if (fd < 0)
972 		return NULL;
973 
974 	if ((fd > ttrace->paths.max || ttrace->paths.table[fd] == NULL)) {
975 		if (!trace->live)
976 			return NULL;
977 		++trace->stats.proc_getname;
978 		if (thread__read_fd_path(thread, fd))
979 			return NULL;
980 	}
981 
982 	return ttrace->paths.table[fd];
983 }
984 
985 size_t syscall_arg__scnprintf_fd(char *bf, size_t size, struct syscall_arg *arg)
986 {
987 	int fd = arg->val;
988 	size_t printed = scnprintf(bf, size, "%d", fd);
989 	const char *path = thread__fd_path(arg->thread, fd, arg->trace);
990 
991 	if (path)
992 		printed += scnprintf(bf + printed, size - printed, "<%s>", path);
993 
994 	return printed;
995 }
996 
997 size_t pid__scnprintf_fd(struct trace *trace, pid_t pid, int fd, char *bf, size_t size)
998 {
999         size_t printed = scnprintf(bf, size, "%d", fd);
1000 	struct thread *thread = machine__find_thread(trace->host, pid, pid);
1001 
1002 	if (thread) {
1003 		const char *path = thread__fd_path(thread, fd, trace);
1004 
1005 		if (path)
1006 			printed += scnprintf(bf + printed, size - printed, "<%s>", path);
1007 
1008 		thread__put(thread);
1009 	}
1010 
1011         return printed;
1012 }
1013 
1014 static size_t syscall_arg__scnprintf_close_fd(char *bf, size_t size,
1015 					      struct syscall_arg *arg)
1016 {
1017 	int fd = arg->val;
1018 	size_t printed = syscall_arg__scnprintf_fd(bf, size, arg);
1019 	struct thread_trace *ttrace = thread__priv(arg->thread);
1020 
1021 	if (ttrace && fd >= 0 && fd <= ttrace->paths.max)
1022 		zfree(&ttrace->paths.table[fd]);
1023 
1024 	return printed;
1025 }
1026 
/*
 * Remember which userspace pointer a filename arg came from and where
 * in ttrace->entry_str its placeholder begins, so a later vfs_getname
 * tracepoint can splice the resolved name in (see trace__vfs_getname()).
 */
static void thread__set_filename_pos(struct thread *thread, const char *bf,
				     unsigned long ptr)
{
	struct thread_trace *ttrace = thread__priv(thread);

	ttrace->filename.ptr = ptr;
	ttrace->filename.entry_str_pos = bf - ttrace->entry_str;
}
1035 
1036 static size_t syscall_arg__scnprintf_filename(char *bf, size_t size,
1037 					      struct syscall_arg *arg)
1038 {
1039 	unsigned long ptr = arg->val;
1040 
1041 	if (!arg->trace->vfs_getname)
1042 		return scnprintf(bf, size, "%#x", ptr);
1043 
1044 	thread__set_filename_pos(arg->thread, bf, ptr);
1045 	return 0;
1046 }
1047 
1048 static bool trace__filter_duration(struct trace *trace, double t)
1049 {
1050 	return t < (trace->duration_filter * NSEC_PER_MSEC);
1051 }
1052 
1053 static size_t __trace__fprintf_tstamp(struct trace *trace, u64 tstamp, FILE *fp)
1054 {
1055 	double ts = (double)(tstamp - trace->base_time) / NSEC_PER_MSEC;
1056 
1057 	return fprintf(fp, "%10.3f ", ts);
1058 }
1059 
1060 /*
1061  * We're handling tstamp=0 as an undefined tstamp, i.e. like when we are
1062  * using ttrace->entry_time for a thread that receives a sys_exit without
1063  * first having received a sys_enter ("poll" issued before tracing session
1064  * starts, lost sys_enter exit due to ring buffer overflow).
1065  */
1066 static size_t trace__fprintf_tstamp(struct trace *trace, u64 tstamp, FILE *fp)
1067 {
1068 	if (tstamp > 0)
1069 		return __trace__fprintf_tstamp(trace, tstamp, fp);
1070 
1071 	return fprintf(fp, "         ? ");
1072 }
1073 
/*
 * Flags set from the signal handler and polled by the main loop.
 * Objects written from an asynchronous signal handler must be
 * volatile sig_atomic_t (C11 5.1.2.3p5); plain bool is not guaranteed
 * to be safe there.
 */
static volatile sig_atomic_t done;
static volatile sig_atomic_t interrupted;

static void sig_handler(int sig)
{
	done = 1;
	/* SIGINT means the user interrupted us; SIGCHLD etc. just end the run. */
	interrupted = sig == SIGINT;
}
1082 
1083 static size_t trace__fprintf_entry_head(struct trace *trace, struct thread *thread,
1084 					u64 duration, bool duration_calculated, u64 tstamp, FILE *fp)
1085 {
1086 	size_t printed = trace__fprintf_tstamp(trace, tstamp, fp);
1087 	printed += fprintf_duration(duration, duration_calculated, fp);
1088 
1089 	if (trace->multiple_threads) {
1090 		if (trace->show_comm)
1091 			printed += fprintf(fp, "%.14s/", thread__comm_str(thread));
1092 		printed += fprintf(fp, "%d ", thread->tid);
1093 	}
1094 
1095 	return printed;
1096 }
1097 
1098 static int trace__process_event(struct trace *trace, struct machine *machine,
1099 				union perf_event *event, struct perf_sample *sample)
1100 {
1101 	int ret = 0;
1102 
1103 	switch (event->header.type) {
1104 	case PERF_RECORD_LOST:
1105 		color_fprintf(trace->output, PERF_COLOR_RED,
1106 			      "LOST %" PRIu64 " events!\n", event->lost.lost);
1107 		ret = machine__process_lost_event(machine, event, sample);
1108 		break;
1109 	default:
1110 		ret = machine__process_event(machine, event, sample);
1111 		break;
1112 	}
1113 
1114 	return ret;
1115 }
1116 
1117 static int trace__tool_process(struct perf_tool *tool,
1118 			       union perf_event *event,
1119 			       struct perf_sample *sample,
1120 			       struct machine *machine)
1121 {
1122 	struct trace *trace = container_of(tool, struct trace, tool);
1123 	return trace__process_event(trace, machine, event, sample);
1124 }
1125 
1126 static char *trace__machine__resolve_kernel_addr(void *vmachine, unsigned long long *addrp, char **modp)
1127 {
1128 	struct machine *machine = vmachine;
1129 
1130 	if (machine->kptr_restrict_warned)
1131 		return NULL;
1132 
1133 	if (symbol_conf.kptr_restrict) {
1134 		pr_warning("Kernel address maps (/proc/{kallsyms,modules}) are restricted.\n\n"
1135 			   "Check /proc/sys/kernel/kptr_restrict.\n\n"
1136 			   "Kernel samples will not be resolved.\n");
1137 		machine->kptr_restrict_warned = true;
1138 		return NULL;
1139 	}
1140 
1141 	return machine__resolve_kernel_addr(vmachine, addrp, modp);
1142 }
1143 
/*
 * Initialize symbol resolution, create the host machine used to resolve
 * threads/maps, and synthesize already-running threads so their samples
 * can be resolved.  Returns 0 on success or a negative error.
 */
static int trace__symbols_init(struct trace *trace, struct perf_evlist *evlist)
{
	int err = symbol__init(NULL);

	if (err)
		return err;

	trace->host = machine__new_host();
	if (trace->host == NULL)
		return -ENOMEM;

	/*
	 * NOTE(review): trace->host is not torn down on the error paths
	 * below; presumably trace__symbols__exit() handles that later —
	 * confirm against the callers.
	 */
	if (trace_event__register_resolver(trace->host, trace__machine__resolve_kernel_addr) < 0)
		return -errno;

	err = __machine__synthesize_threads(trace->host, &trace->tool, &trace->opts.target,
					    evlist->threads, trace__tool_process, false,
					    trace->opts.proc_map_timeout, 1);
	/* Symbol state is only kept when thread synthesis worked. */
	if (err)
		symbol__exit();

	return err;
}
1166 
1167 static void trace__symbols__exit(struct trace *trace)
1168 {
1169 	machine__exit(trace->host);
1170 	trace->host = NULL;
1171 
1172 	symbol__exit();
1173 }
1174 
1175 static int syscall__alloc_arg_fmts(struct syscall *sc, int nr_args)
1176 {
1177 	int idx;
1178 
1179 	if (nr_args == 6 && sc->fmt && sc->fmt->nr_args != 0)
1180 		nr_args = sc->fmt->nr_args;
1181 
1182 	sc->arg_fmt = calloc(nr_args, sizeof(*sc->arg_fmt));
1183 	if (sc->arg_fmt == NULL)
1184 		return -1;
1185 
1186 	for (idx = 0; idx < nr_args; ++idx) {
1187 		if (sc->fmt)
1188 			sc->arg_fmt[idx] = sc->fmt->arg[idx];
1189 	}
1190 
1191 	sc->nr_args = nr_args;
1192 	return 0;
1193 }
1194 
/*
 * Pick default beautifiers for the args of 'sc' that don't already have
 * one from its static syscall_fmt entry, keying off the tracepoint
 * field type/name: filenames, raw pointers, pids, modes, and integer
 * fields whose name ends in "fd".
 */
static int syscall__set_arg_fmts(struct syscall *sc)
{
	struct format_field *field;
	int idx = 0, len;

	for (field = sc->args; field; field = field->next, ++idx) {
		/* An explicit beautifier from the fmt table wins. */
		if (sc->fmt && sc->fmt->arg[idx].scnprintf)
			continue;

		if (strcmp(field->type, "const char *") == 0 &&
			 (strcmp(field->name, "filename") == 0 ||
			  strcmp(field->name, "path") == 0 ||
			  strcmp(field->name, "pathname") == 0))
			sc->arg_fmt[idx].scnprintf = SCA_FILENAME;
		else if (field->flags & FIELD_IS_POINTER)
			sc->arg_fmt[idx].scnprintf = syscall_arg__scnprintf_hex;
		else if (strcmp(field->type, "pid_t") == 0)
			sc->arg_fmt[idx].scnprintf = SCA_PID;
		else if (strcmp(field->type, "umode_t") == 0)
			sc->arg_fmt[idx].scnprintf = SCA_MODE_T;
		else if ((strcmp(field->type, "int") == 0 ||
			  strcmp(field->type, "unsigned int") == 0 ||
			  strcmp(field->type, "long") == 0) &&
			 (len = strlen(field->name)) >= 2 &&
			 strcmp(field->name + len - 2, "fd") == 0) {
			/*
			 * /sys/kernel/tracing/events/syscalls/sys_enter*
			 * egrep 'field:.*fd;' .../format|sed -r 's/.*field:([a-z ]+) [a-z_]*fd.+/\1/g'|sort|uniq -c
			 * 65 int
			 * 23 unsigned int
			 * 7 unsigned long
			 */
			sc->arg_fmt[idx].scnprintf = SCA_FD;
		}
	}

	return 0;
}
1233 
/*
 * Lazily populate trace->syscalls.table[id]: resolve the syscall name,
 * look up its static formatter entry and tracepoint format, and set up
 * the per-argument beautifiers.  Returns 0 on success, -1 on failure.
 */
static int trace__read_syscall_info(struct trace *trace, int id)
{
	char tp_name[128];
	struct syscall *sc;
	const char *name = syscalltbl__name(trace->sctbl, id);

	if (name == NULL)
		return -1;

	/* Grow the id-indexed table on demand, zeroing the new slots. */
	if (id > trace->syscalls.max) {
		struct syscall *nsyscalls = realloc(trace->syscalls.table, (id + 1) * sizeof(*sc));

		if (nsyscalls == NULL)
			return -1;

		if (trace->syscalls.max != -1) {
			memset(nsyscalls + trace->syscalls.max + 1, 0,
			       (id - trace->syscalls.max) * sizeof(*sc));
		} else {
			memset(nsyscalls, 0, (id + 1) * sizeof(*sc));
		}

		trace->syscalls.table = nsyscalls;
		trace->syscalls.max   = id;
	}

	sc = trace->syscalls.table + id;
	sc->name = name;

	sc->fmt  = syscall_fmt__find(sc->name);

	snprintf(tp_name, sizeof(tp_name), "sys_enter_%s", sc->name);
	sc->tp_format = trace_event__tp_format("syscalls", tp_name);

	/* Retry under the fmt-provided alias name when the lookup failed. */
	if (IS_ERR(sc->tp_format) && sc->fmt && sc->fmt->alias) {
		snprintf(tp_name, sizeof(tp_name), "sys_enter_%s", sc->fmt->alias);
		sc->tp_format = trace_event__tp_format("syscalls", tp_name);
	}

	/* No /format file: fall back to 6 positional raw args. */
	if (syscall__alloc_arg_fmts(sc, IS_ERR(sc->tp_format) ? 6 : sc->tp_format->format.nr_fields))
		return -1;

	if (IS_ERR(sc->tp_format))
		return -1;

	sc->args = sc->tp_format->format.fields;
	/*
	 * We need to check and discard the first variable '__syscall_nr'
	 * or 'nr' that mean the syscall number. It is needless here.
	 * So drop '__syscall_nr' or 'nr' field but does not exist on older kernels.
	 */
	if (sc->args && (!strcmp(sc->args->name, "__syscall_nr") || !strcmp(sc->args->name, "nr"))) {
		sc->args = sc->args->next;
		--sc->nr_args;
	}

	sc->is_exit = !strcmp(name, "exit_group") || !strcmp(name, "exit");

	return syscall__set_arg_fmts(sc);
}
1294 
/*
 * Translate the -e syscall name list (trace->ev_qualifier) into syscall
 * ids in trace->ev_qualifier_ids, expanding glob patterns.  All invalid
 * names are collected into a single error message.  Returns 0 or a
 * negative error, freeing the ids array on failure.
 */
static int trace__validate_ev_qualifier(struct trace *trace)
{
	int err = 0, i;
	size_t nr_allocated;
	struct str_node *pos;

	trace->ev_qualifier_ids.nr = strlist__nr_entries(trace->ev_qualifier);
	trace->ev_qualifier_ids.entries = malloc(trace->ev_qualifier_ids.nr *
						 sizeof(trace->ev_qualifier_ids.entries[0]));

	if (trace->ev_qualifier_ids.entries == NULL) {
		fputs("Error:\tNot enough memory for allocating events qualifier ids\n",
		       trace->output);
		err = -EINVAL;
		goto out;
	}

	nr_allocated = trace->ev_qualifier_ids.nr;
	i = 0;

	strlist__for_each_entry(pos, trace->ev_qualifier) {
		const char *sc = pos->s;
		int id = syscalltbl__id(trace->sctbl, sc), match_next = -1;

		if (id < 0) {
			/* Not an exact name: maybe a glob pattern. */
			id = syscalltbl__strglobmatch_first(trace->sctbl, sc, &match_next);
			if (id >= 0)
				goto matches;

			if (err == 0) {
				fputs("Error:\tInvalid syscall ", trace->output);
				err = -EINVAL;
			} else {
				fputs(", ", trace->output);
			}

			fputs(sc, trace->output);
		}
matches:
		trace->ev_qualifier_ids.entries[i++] = id;
		if (match_next == -1)
			continue;

		/* A glob may match several syscalls: collect them all, growing the array as needed. */
		while (1) {
			id = syscalltbl__strglobmatch_next(trace->sctbl, sc, &match_next);
			if (id < 0)
				break;
			if (nr_allocated == trace->ev_qualifier_ids.nr) {
				void *entries;

				nr_allocated += 8;
				entries = realloc(trace->ev_qualifier_ids.entries,
						  nr_allocated * sizeof(trace->ev_qualifier_ids.entries[0]));
				if (entries == NULL) {
					err = -ENOMEM;
					fputs("\nError:\t Not enough memory for parsing\n", trace->output);
					goto out_free;
				}
				trace->ev_qualifier_ids.entries = entries;
			}
			trace->ev_qualifier_ids.nr++;
			trace->ev_qualifier_ids.entries[i++] = id;
		}
	}

	if (err < 0) {
		fputs("\nHint:\ttry 'perf list syscalls:sys_enter_*'"
		      "\nHint:\tand: 'man syscalls'\n", trace->output);
out_free:
		zfree(&trace->ev_qualifier_ids.entries);
		trace->ev_qualifier_ids.nr = 0;
	}
out:
	return err;
}
1370 
1371 /*
1372  * args is to be interpreted as a series of longs but we need to handle
1373  * 8-byte unaligned accesses. args points to raw_data within the event
1374  * and raw_data is guaranteed to be 8-byte unaligned because it is
1375  * preceded by raw_size which is a u32. So we need to copy args to a temp
1376  * variable to read it. Most notably this avoids extended load instructions
1377  * on unaligned addresses
1378  */
1379 unsigned long syscall_arg__val(struct syscall_arg *arg, u8 idx)
1380 {
1381 	unsigned long val;
1382 	unsigned char *p = arg->args + sizeof(unsigned long) * idx;
1383 
1384 	memcpy(&val, p, sizeof(val));
1385 	return val;
1386 }
1387 
1388 static size_t syscall__scnprintf_name(struct syscall *sc, char *bf, size_t size,
1389 				      struct syscall_arg *arg)
1390 {
1391 	if (sc->arg_fmt && sc->arg_fmt[arg->idx].name)
1392 		return scnprintf(bf, size, "%s: ", sc->arg_fmt[arg->idx].name);
1393 
1394 	return scnprintf(bf, size, "arg%d: ", arg->idx);
1395 }
1396 
1397 static size_t syscall__scnprintf_val(struct syscall *sc, char *bf, size_t size,
1398 				     struct syscall_arg *arg, unsigned long val)
1399 {
1400 	if (sc->arg_fmt && sc->arg_fmt[arg->idx].scnprintf) {
1401 		arg->val = val;
1402 		if (sc->arg_fmt[arg->idx].parm)
1403 			arg->parm = sc->arg_fmt[arg->idx].parm;
1404 		return sc->arg_fmt[arg->idx].scnprintf(bf, size, arg);
1405 	}
1406 	return scnprintf(bf, size, "%ld", val);
1407 }
1408 
/*
 * Format all arguments of syscall 'sc' from the raw 'args' blob into
 * 'bf'.  Uses the tracepoint field list when available; when the
 * /format file couldn't be read, falls back to positional raw values.
 * Returns the number of characters written.
 */
static size_t syscall__scnprintf_args(struct syscall *sc, char *bf, size_t size,
				      unsigned char *args, struct trace *trace,
				      struct thread *thread)
{
	size_t printed = 0;
	unsigned long val;
	u8 bit = 1;
	struct syscall_arg arg = {
		.args	= args,
		.idx	= 0,
		.mask	= 0,
		.trace  = trace,
		.thread = thread,
	};
	struct thread_trace *ttrace = thread__priv(thread);

	/*
	 * Things like fcntl will set this in its 'cmd' formatter to pick the
	 * right formatter for the return value (an fd? file flags?), which is
	 * not needed for syscalls that always return a given type, say an fd.
	 */
	ttrace->ret_scnprintf = NULL;

	if (sc->args != NULL) {
		struct format_field *field;

		for (field = sc->args; field;
		     field = field->next, ++arg.idx, bit <<= 1) {
			/* Skip args already consumed by a multi-arg beautifier. */
			if (arg.mask & bit)
				continue;

			val = syscall_arg__val(&arg, arg.idx);

			/*
			 * Suppress this argument if its value is zero and
			 * we don't have a string associated in an
			 * strarray for it.
			 */
			if (val == 0 &&
			    !(sc->arg_fmt &&
			      (sc->arg_fmt[arg.idx].show_zero ||
			       sc->arg_fmt[arg.idx].scnprintf == SCA_STRARRAY ||
			       sc->arg_fmt[arg.idx].scnprintf == SCA_STRARRAYS) &&
			      sc->arg_fmt[arg.idx].parm))
				continue;

			printed += scnprintf(bf + printed, size - printed,
					     "%s%s: ", printed ? ", " : "", field->name);
			printed += syscall__scnprintf_val(sc, bf + printed, size - printed, &arg, val);
		}
	} else if (IS_ERR(sc->tp_format)) {
		/*
		 * If we managed to read the tracepoint /format file, then we
		 * may end up not having any args, like with gettid(), so only
		 * print the raw args when we didn't manage to read it.
		 */
		while (arg.idx < sc->nr_args) {
			if (arg.mask & bit)
				goto next_arg;
			val = syscall_arg__val(&arg, arg.idx);
			if (printed)
				printed += scnprintf(bf + printed, size - printed, ", ");
			printed += syscall__scnprintf_name(sc, bf + printed, size - printed, &arg);
			printed += syscall__scnprintf_val(sc, bf + printed, size - printed, &arg, val);
next_arg:
			++arg.idx;
			bit <<= 1;
		}
	}

	return printed;
}
1481 
/* Signature shared by all tracepoint sample handlers (sys_enter, sys_exit, ...). */
typedef int (*tracepoint_handler)(struct trace *trace, struct perf_evsel *evsel,
				  union perf_event *event,
				  struct perf_sample *sample);
1485 
/*
 * Return the (lazily initialized) descriptor for syscall 'id', or NULL
 * when the id is invalid or its info can't be read.
 */
static struct syscall *trace__syscall_info(struct trace *trace,
					   struct perf_evsel *evsel, int id)
{

	if (id < 0) {

		/*
		 * XXX: Noticed on x86_64, reproduced as far back as 3.0.36, haven't tried
		 * before that, leaving at a higher verbosity level till that is
		 * explained. Reproduced with plain ftrace with:
		 *
		 * echo 1 > /t/events/raw_syscalls/sys_exit/enable
		 * grep "NR -1 " /t/trace_pipe
		 *
		 * After generating some load on the machine.
		 */
		if (verbose > 1) {
			static u64 n;
			fprintf(trace->output, "Invalid syscall %d id, skipping (%s, %" PRIu64 ") ...\n",
				id, perf_evsel__name(evsel), ++n);
		}
		return NULL;
	}

	/* First use of this id: read its name, format and beautifiers. */
	if ((id > trace->syscalls.max || trace->syscalls.table[id].name == NULL) &&
	    trace__read_syscall_info(trace, id))
		goto out_cant_read;

	/* Re-check: trace__read_syscall_info() may still have left it unset. */
	if ((id > trace->syscalls.max || trace->syscalls.table[id].name == NULL))
		goto out_cant_read;

	return &trace->syscalls.table[id];

out_cant_read:
	if (verbose > 0) {
		fprintf(trace->output, "Problems reading syscall %d", id);
		if (id <= trace->syscalls.max && trace->syscalls.table[id].name != NULL)
			fprintf(trace->output, "(%s)", trace->syscalls.table[id].name);
		fputs(" information\n", trace->output);
	}
	return NULL;
}
1528 
1529 static void thread__update_stats(struct thread_trace *ttrace,
1530 				 int id, struct perf_sample *sample)
1531 {
1532 	struct int_node *inode;
1533 	struct stats *stats;
1534 	u64 duration = 0;
1535 
1536 	inode = intlist__findnew(ttrace->syscall_stats, id);
1537 	if (inode == NULL)
1538 		return;
1539 
1540 	stats = inode->priv;
1541 	if (stats == NULL) {
1542 		stats = malloc(sizeof(struct stats));
1543 		if (stats == NULL)
1544 			return;
1545 		init_stats(stats);
1546 		inode->priv = stats;
1547 	}
1548 
1549 	if (ttrace->entry_time && sample->time > ttrace->entry_time)
1550 		duration = sample->time - ttrace->entry_time;
1551 
1552 	update_stats(stats, duration);
1553 }
1554 
1555 static int trace__printf_interrupted_entry(struct trace *trace, struct perf_sample *sample)
1556 {
1557 	struct thread_trace *ttrace;
1558 	u64 duration;
1559 	size_t printed;
1560 
1561 	if (trace->current == NULL)
1562 		return 0;
1563 
1564 	ttrace = thread__priv(trace->current);
1565 
1566 	if (!ttrace->entry_pending)
1567 		return 0;
1568 
1569 	duration = sample->time - ttrace->entry_time;
1570 
1571 	printed  = trace__fprintf_entry_head(trace, trace->current, duration, true, ttrace->entry_time, trace->output);
1572 	printed += fprintf(trace->output, "%-70s) ...\n", ttrace->entry_str);
1573 	ttrace->entry_pending = false;
1574 
1575 	return printed;
1576 }
1577 
/*
 * raw_syscalls:sys_enter handler: formats the syscall name and args
 * into the per-thread entry_str.  For most syscalls printing is
 * deferred until the matching sys_exit so the duration can be shown;
 * exit/exit_group never return, so they are printed right away.
 */
static int trace__sys_enter(struct trace *trace, struct perf_evsel *evsel,
			    union perf_event *event __maybe_unused,
			    struct perf_sample *sample)
{
	char *msg;
	void *args;
	size_t printed = 0;
	struct thread *thread;
	int id = perf_evsel__sc_tp_uint(evsel, id, sample), err = -1;
	struct syscall *sc = trace__syscall_info(trace, evsel, id);
	struct thread_trace *ttrace;

	if (sc == NULL)
		return -1;

	thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
	ttrace = thread__trace(thread, trace->output);
	if (ttrace == NULL)
		goto out_put;

	args = perf_evsel__sc_tp_ptr(evsel, args, sample);

	/* Lazily allocate the buffer where the entry line is assembled. */
	if (ttrace->entry_str == NULL) {
		ttrace->entry_str = malloc(trace__entry_str_size);
		if (!ttrace->entry_str)
			goto out_put;
	}

	if (!(trace->duration_filter || trace->summary_only || trace->min_stack))
		trace__printf_interrupted_entry(trace, sample);

	ttrace->entry_time = sample->time;
	msg = ttrace->entry_str;
	printed += scnprintf(msg + printed, trace__entry_str_size - printed, "%s(", sc->name);

	printed += syscall__scnprintf_args(sc, msg + printed, trace__entry_str_size - printed,
					   args, trace, thread);

	if (sc->is_exit) {
		if (!(trace->duration_filter || trace->summary_only || trace->min_stack)) {
			trace__fprintf_entry_head(trace, thread, 0, false, ttrace->entry_time, trace->output);
			fprintf(trace->output, "%-70s)\n", ttrace->entry_str);
		}
	} else {
		ttrace->entry_pending = true;
		/* See trace__vfs_getname & trace__sys_exit */
		ttrace->filename.pending_open = false;
	}

	/* Remember the last thread with a pending entry, for "..." flushing. */
	if (trace->current != thread) {
		thread__put(trace->current);
		trace->current = thread__get(thread);
	}
	err = 0;
out_put:
	thread__put(thread);
	return err;
}
1636 
1637 static int trace__resolve_callchain(struct trace *trace, struct perf_evsel *evsel,
1638 				    struct perf_sample *sample,
1639 				    struct callchain_cursor *cursor)
1640 {
1641 	struct addr_location al;
1642 
1643 	if (machine__resolve(trace->host, &al, sample) < 0 ||
1644 	    thread__resolve_callchain(al.thread, cursor, evsel, sample, NULL, NULL, trace->max_stack))
1645 		return -1;
1646 
1647 	return 0;
1648 }
1649 
1650 static int trace__fprintf_callchain(struct trace *trace, struct perf_sample *sample)
1651 {
1652 	/* TODO: user-configurable print_opts */
1653 	const unsigned int print_opts = EVSEL__PRINT_SYM |
1654 				        EVSEL__PRINT_DSO |
1655 				        EVSEL__PRINT_UNKNOWN_AS_ADDR;
1656 
1657 	return sample__fprintf_callchain(sample, 38, print_opts, &callchain_cursor, trace->output);
1658 }
1659 
/*
 * raw_syscalls:sys_exit handler: matches the return with the pending
 * sys_enter line (if any), computes the duration, applies the duration
 * and stack-depth filters, then pretty-prints the return value using
 * the syscall's formatter.  Returns 0 on success, -1 when the syscall
 * or thread can't be resolved.
 */
static int trace__sys_exit(struct trace *trace, struct perf_evsel *evsel,
			   union perf_event *event __maybe_unused,
			   struct perf_sample *sample)
{
	long ret;
	u64 duration = 0;
	bool duration_calculated = false;
	struct thread *thread;
	int id = perf_evsel__sc_tp_uint(evsel, id, sample), err = -1, callchain_ret = 0;
	struct syscall *sc = trace__syscall_info(trace, evsel, id);
	struct thread_trace *ttrace;

	if (sc == NULL)
		return -1;

	thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
	ttrace = thread__trace(thread, trace->output);
	if (ttrace == NULL)
		goto out_put;

	if (trace->summary)
		thread__update_stats(ttrace, id, sample);

	ret = perf_evsel__sc_tp_uint(evsel, ret, sample);

	/* Successful open-like syscall: cache the new fd -> pathname mapping. */
	if (id == trace->open_id && ret >= 0 && ttrace->filename.pending_open) {
		trace__set_fd_pathname(thread, ret, ttrace->filename.name);
		ttrace->filename.pending_open = false;
		++trace->stats.vfs_getname;
	}

	if (ttrace->entry_time) {
		duration = sample->time - ttrace->entry_time;
		if (trace__filter_duration(trace, duration))
			goto out;
		duration_calculated = true;
	} else if (trace->duration_filter)
		goto out;

	if (sample->callchain) {
		callchain_ret = trace__resolve_callchain(trace, evsel, sample, &callchain_cursor);
		if (callchain_ret == 0) {
			if (callchain_cursor.nr < trace->min_stack)
				goto out;
			callchain_ret = 1;
		}
	}

	if (trace->summary_only)
		goto out;

	trace__fprintf_entry_head(trace, thread, duration, duration_calculated, ttrace->entry_time, trace->output);

	if (ttrace->entry_pending) {
		fprintf(trace->output, "%-70s", ttrace->entry_str);
	} else {
		/* The matching sys_enter line was flushed earlier as "...". */
		fprintf(trace->output, " ... [");
		color_fprintf(trace->output, PERF_COLOR_YELLOW, "continued");
		fprintf(trace->output, "]: %s()", sc->name);
	}

	/*
	 * Return value formatting.  Note the goto labels woven into this
	 * if/else chain: 'signed_print' and 'errno_print' are shared exits.
	 */
	if (sc->fmt == NULL) {
		if (ret < 0)
			goto errno_print;
signed_print:
		fprintf(trace->output, ") = %ld", ret);
	} else if (ret < 0) {
errno_print: {
		char bf[STRERR_BUFSIZE];
		const char *emsg = str_error_r(-ret, bf, sizeof(bf)),
			   *e = audit_errno_to_name(-ret);

		fprintf(trace->output, ") = -1 %s %s", e, emsg);
	}
	} else if (ret == 0 && sc->fmt->timeout)
		fprintf(trace->output, ") = 0 Timeout");
	else if (ttrace->ret_scnprintf) {
		char bf[1024];
		struct syscall_arg arg = {
			.val	= ret,
			.thread	= thread,
			.trace	= trace,
		};
		/* One-shot formatter installed via syscall_arg__set_ret_scnprintf(). */
		ttrace->ret_scnprintf(bf, sizeof(bf), &arg);
		ttrace->ret_scnprintf = NULL;
		fprintf(trace->output, ") = %s", bf);
	} else if (sc->fmt->hexret)
		fprintf(trace->output, ") = %#lx", ret);
	else if (sc->fmt->errpid) {
		struct thread *child = machine__find_thread(trace->host, ret, ret);

		if (child != NULL) {
			fprintf(trace->output, ") = %ld", ret);
			if (child->comm_set)
				fprintf(trace->output, " (%s)", thread__comm_str(child));
			thread__put(child);
		}
	} else
		goto signed_print;

	fputc('\n', trace->output);

	if (callchain_ret > 0)
		trace__fprintf_callchain(trace, sample);
	else if (callchain_ret < 0)
		pr_err("Problem processing %s callchain, skipping...\n", perf_evsel__name(evsel));
out:
	ttrace->entry_pending = false;
	err = 0;
out_put:
	thread__put(thread);
	return err;
}
1773 
/*
 * probe:vfs_getname handler: captures the filename being resolved by
 * the kernel and splices it into the pending sys_enter line at the
 * position recorded by thread__set_filename_pos().
 */
static int trace__vfs_getname(struct trace *trace, struct perf_evsel *evsel,
			      union perf_event *event __maybe_unused,
			      struct perf_sample *sample)
{
	struct thread *thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
	struct thread_trace *ttrace;
	size_t filename_len, entry_str_len, to_move;
	ssize_t remaining_space;
	char *pos;
	const char *filename = perf_evsel__rawptr(evsel, sample, "pathname");

	if (!thread)
		goto out;

	ttrace = thread__priv(thread);
	if (!ttrace)
		goto out_put;

	filename_len = strlen(filename);
	if (filename_len == 0)
		goto out_put;

	/* Grow the per-thread name buffer, keeping the high-water mark. */
	if (ttrace->filename.namelen < filename_len) {
		char *f = realloc(ttrace->filename.name, filename_len + 1);

		if (f == NULL)
			goto out_put;

		ttrace->filename.namelen = filename_len;
		ttrace->filename.name = f;
	}

	strcpy(ttrace->filename.name, filename);
	ttrace->filename.pending_open = true;

	/* No sys_enter placeholder waiting for this name: nothing to splice. */
	if (!ttrace->filename.ptr)
		goto out_put;

	entry_str_len = strlen(ttrace->entry_str);
	remaining_space = trace__entry_str_size - entry_str_len - 1; /* \0 */
	if (remaining_space <= 0)
		goto out_put;

	/* Keep the tail of the filename when it doesn't fit whole. */
	if (filename_len > (size_t)remaining_space) {
		filename += filename_len - remaining_space;
		filename_len = remaining_space;
	}

	/* Open a gap at the placeholder position and copy the name in. */
	to_move = entry_str_len - ttrace->filename.entry_str_pos + 1; /* \0 */
	pos = ttrace->entry_str + ttrace->filename.entry_str_pos;
	memmove(pos + filename_len, pos, to_move);
	memcpy(pos, filename, filename_len);

	ttrace->filename.ptr = 0;
	ttrace->filename.entry_str_pos = 0;
out_put:
	thread__put(thread);
out:
	return 0;
}
1834 
1835 static int trace__sched_stat_runtime(struct trace *trace, struct perf_evsel *evsel,
1836 				     union perf_event *event __maybe_unused,
1837 				     struct perf_sample *sample)
1838 {
1839         u64 runtime = perf_evsel__intval(evsel, sample, "runtime");
1840 	double runtime_ms = (double)runtime / NSEC_PER_MSEC;
1841 	struct thread *thread = machine__findnew_thread(trace->host,
1842 							sample->pid,
1843 							sample->tid);
1844 	struct thread_trace *ttrace = thread__trace(thread, trace->output);
1845 
1846 	if (ttrace == NULL)
1847 		goto out_dump;
1848 
1849 	ttrace->runtime_ms += runtime_ms;
1850 	trace->runtime_ms += runtime_ms;
1851 out_put:
1852 	thread__put(thread);
1853 	return 0;
1854 
1855 out_dump:
1856 	fprintf(trace->output, "%s: comm=%s,pid=%u,runtime=%" PRIu64 ",vruntime=%" PRIu64 ")\n",
1857 	       evsel->name,
1858 	       perf_evsel__strval(evsel, sample, "comm"),
1859 	       (pid_t)perf_evsel__intval(evsel, sample, "pid"),
1860 	       runtime,
1861 	       perf_evsel__intval(evsel, sample, "vruntime"));
1862 	goto out_put;
1863 }
1864 
1865 static int bpf_output__printer(enum binary_printer_ops op,
1866 			       unsigned int val, void *extra __maybe_unused, FILE *fp)
1867 {
1868 	unsigned char ch = (unsigned char)val;
1869 
1870 	switch (op) {
1871 	case BINARY_PRINT_CHAR_DATA:
1872 		return fprintf(fp, "%c", isprint(ch) ? ch : '.');
1873 	case BINARY_PRINT_DATA_BEGIN:
1874 	case BINARY_PRINT_LINE_BEGIN:
1875 	case BINARY_PRINT_ADDR:
1876 	case BINARY_PRINT_NUM_DATA:
1877 	case BINARY_PRINT_NUM_PAD:
1878 	case BINARY_PRINT_SEP:
1879 	case BINARY_PRINT_CHAR_PAD:
1880 	case BINARY_PRINT_LINE_END:
1881 	case BINARY_PRINT_DATA_END:
1882 	default:
1883 		break;
1884 	}
1885 
1886 	return 0;
1887 }
1888 
/* Dump a bpf-output event's raw payload as printable chars, 8 per line. */
static void bpf_output__fprintf(struct trace *trace,
				struct perf_sample *sample)
{
	binary__fprintf(sample->raw_data, sample->raw_size, 8,
			bpf_output__printer, NULL, trace->output);
}
1895 
/*
 * Handler for non-syscall tracepoints and bpf-output events: prints a
 * timestamped line with the event name and its pretty-printed payload.
 */
static int trace__event_handler(struct trace *trace, struct perf_evsel *evsel,
				union perf_event *event __maybe_unused,
				struct perf_sample *sample)
{
	int callchain_ret = 0;

	if (sample->callchain) {
		callchain_ret = trace__resolve_callchain(trace, evsel, sample, &callchain_cursor);
		if (callchain_ret == 0) {
			/* Below the --min-stack threshold: suppress the event. */
			if (callchain_cursor.nr < trace->min_stack)
				goto out;
			callchain_ret = 1;
		}
	}

	trace__printf_interrupted_entry(trace, sample);
	trace__fprintf_tstamp(trace, sample->time, trace->output);

	/* Keep columns aligned with the syscall lines, which show a duration. */
	if (trace->trace_syscalls)
		fprintf(trace->output, "(         ): ");

	fprintf(trace->output, "%s:", evsel->name);

	if (perf_evsel__is_bpf_output(evsel)) {
		bpf_output__fprintf(trace, sample);
	} else if (evsel->tp_format) {
		event_format__fprintf(evsel->tp_format, sample->cpu,
				      sample->raw_data, sample->raw_size,
				      trace->output);
	}

	fprintf(trace->output, ")\n");

	if (callchain_ret > 0)
		trace__fprintf_callchain(trace, sample);
	else if (callchain_ret < 0)
		pr_err("Problem processing %s callchain, skipping...\n", perf_evsel__name(evsel));
out:
	return 0;
}
1936 
1937 static void print_location(FILE *f, struct perf_sample *sample,
1938 			   struct addr_location *al,
1939 			   bool print_dso, bool print_sym)
1940 {
1941 
1942 	if ((verbose > 0 || print_dso) && al->map)
1943 		fprintf(f, "%s@", al->map->dso->long_name);
1944 
1945 	if ((verbose > 0 || print_sym) && al->sym)
1946 		fprintf(f, "%s+0x%" PRIx64, al->sym->name,
1947 			al->addr - al->sym->start);
1948 	else if (al->map)
1949 		fprintf(f, "0x%" PRIx64, al->addr);
1950 	else
1951 		fprintf(f, "0x%" PRIx64, sample->addr);
1952 }
1953 
/*
 * Handler for the page fault software events (major and minor): accounts
 * the fault in the per-thread stats and, unless in summary-only mode,
 * prints a line showing the faulting IP and the target address, with the
 * kind of mapping it hit, optionally followed by a callchain.
 *
 * Returns 0 on success, -1 if per-thread state could not be set up.
 */
static int trace__pgfault(struct trace *trace,
			  struct perf_evsel *evsel,
			  union perf_event *event __maybe_unused,
			  struct perf_sample *sample)
{
	struct thread *thread;
	struct addr_location al;
	char map_type = 'd';	/* 'd'ata, 'x' executable, '?' unresolved */
	struct thread_trace *ttrace;
	int err = -1;
	int callchain_ret = 0;

	thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);

	if (sample->callchain) {
		callchain_ret = trace__resolve_callchain(trace, evsel, sample, &callchain_cursor);
		if (callchain_ret == 0) {
			/* Resolved but below --min-stack: skip the whole event. */
			if (callchain_cursor.nr < trace->min_stack)
				goto out_put;
			callchain_ret = 1;
		}
	}

	ttrace = thread__trace(thread, trace->output);
	if (ttrace == NULL)
		goto out_put;

	/* Account major vs minor faults separately for the summary. */
	if (evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ)
		ttrace->pfmaj++;
	else
		ttrace->pfmin++;

	if (trace->summary_only)
		goto out;

	/* Resolve the faulting instruction address. */
	thread__find_addr_location(thread, sample->cpumode, MAP__FUNCTION,
			      sample->ip, &al);

	trace__fprintf_entry_head(trace, thread, 0, true, sample->time, trace->output);

	fprintf(trace->output, "%sfault [",
		evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ ?
		"maj" : "min");

	print_location(trace->output, sample, &al, false, true);

	fprintf(trace->output, "] => ");

	/* Resolve the accessed address, first as data ... */
	thread__find_addr_location(thread, sample->cpumode, MAP__VARIABLE,
				   sample->addr, &al);

	if (!al.map) {
		/* ... then as code, e.g. a fault in an executable mapping. */
		thread__find_addr_location(thread, sample->cpumode,
					   MAP__FUNCTION, sample->addr, &al);

		if (al.map)
			map_type = 'x';
		else
			map_type = '?';
	}

	print_location(trace->output, sample, &al, true, false);

	fprintf(trace->output, " (%c%c)\n", map_type, al.level);

	if (callchain_ret > 0)
		trace__fprintf_callchain(trace, sample);
	else if (callchain_ret < 0)
		pr_err("Problem processing %s callchain, skipping...\n", perf_evsel__name(evsel));
out:
	err = 0;
out_put:
	thread__put(thread);
	return err;
}
2029 
2030 static void trace__set_base_time(struct trace *trace,
2031 				 struct perf_evsel *evsel,
2032 				 struct perf_sample *sample)
2033 {
2034 	/*
2035 	 * BPF events were not setting PERF_SAMPLE_TIME, so be more robust
2036 	 * and don't use sample->time unconditionally, we may end up having
2037 	 * some other event in the future without PERF_SAMPLE_TIME for good
2038 	 * reason, i.e. we may not be interested in its timestamps, just in
2039 	 * it taking place, picking some piece of information when it
2040 	 * appears in our event stream (vfs_getname comes to mind).
2041 	 */
2042 	if (trace->base_time == 0 && !trace->full_time &&
2043 	    (evsel->attr.sample_type & PERF_SAMPLE_TIME))
2044 		trace->base_time = sample->time;
2045 }
2046 
2047 static int trace__process_sample(struct perf_tool *tool,
2048 				 union perf_event *event,
2049 				 struct perf_sample *sample,
2050 				 struct perf_evsel *evsel,
2051 				 struct machine *machine __maybe_unused)
2052 {
2053 	struct trace *trace = container_of(tool, struct trace, tool);
2054 	struct thread *thread;
2055 	int err = 0;
2056 
2057 	tracepoint_handler handler = evsel->handler;
2058 
2059 	thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
2060 	if (thread && thread__is_filtered(thread))
2061 		goto out;
2062 
2063 	trace__set_base_time(trace, evsel, sample);
2064 
2065 	if (handler) {
2066 		++trace->nr_events;
2067 		handler(trace, evsel, event, sample);
2068 	}
2069 out:
2070 	thread__put(thread);
2071 	return err;
2072 }
2073 
2074 static int trace__record(struct trace *trace, int argc, const char **argv)
2075 {
2076 	unsigned int rec_argc, i, j;
2077 	const char **rec_argv;
2078 	const char * const record_args[] = {
2079 		"record",
2080 		"-R",
2081 		"-m", "1024",
2082 		"-c", "1",
2083 	};
2084 
2085 	const char * const sc_args[] = { "-e", };
2086 	unsigned int sc_args_nr = ARRAY_SIZE(sc_args);
2087 	const char * const majpf_args[] = { "-e", "major-faults" };
2088 	unsigned int majpf_args_nr = ARRAY_SIZE(majpf_args);
2089 	const char * const minpf_args[] = { "-e", "minor-faults" };
2090 	unsigned int minpf_args_nr = ARRAY_SIZE(minpf_args);
2091 
2092 	/* +1 is for the event string below */
2093 	rec_argc = ARRAY_SIZE(record_args) + sc_args_nr + 1 +
2094 		majpf_args_nr + minpf_args_nr + argc;
2095 	rec_argv = calloc(rec_argc + 1, sizeof(char *));
2096 
2097 	if (rec_argv == NULL)
2098 		return -ENOMEM;
2099 
2100 	j = 0;
2101 	for (i = 0; i < ARRAY_SIZE(record_args); i++)
2102 		rec_argv[j++] = record_args[i];
2103 
2104 	if (trace->trace_syscalls) {
2105 		for (i = 0; i < sc_args_nr; i++)
2106 			rec_argv[j++] = sc_args[i];
2107 
2108 		/* event string may be different for older kernels - e.g., RHEL6 */
2109 		if (is_valid_tracepoint("raw_syscalls:sys_enter"))
2110 			rec_argv[j++] = "raw_syscalls:sys_enter,raw_syscalls:sys_exit";
2111 		else if (is_valid_tracepoint("syscalls:sys_enter"))
2112 			rec_argv[j++] = "syscalls:sys_enter,syscalls:sys_exit";
2113 		else {
2114 			pr_err("Neither raw_syscalls nor syscalls events exist.\n");
2115 			free(rec_argv);
2116 			return -1;
2117 		}
2118 	}
2119 
2120 	if (trace->trace_pgfaults & TRACE_PFMAJ)
2121 		for (i = 0; i < majpf_args_nr; i++)
2122 			rec_argv[j++] = majpf_args[i];
2123 
2124 	if (trace->trace_pgfaults & TRACE_PFMIN)
2125 		for (i = 0; i < minpf_args_nr; i++)
2126 			rec_argv[j++] = minpf_args[i];
2127 
2128 	for (i = 0; i < (unsigned int)argc; i++)
2129 		rec_argv[j++] = argv[i];
2130 
2131 	return cmd_record(j, rec_argv);
2132 }
2133 
2134 static size_t trace__fprintf_thread_summary(struct trace *trace, FILE *fp);
2135 
2136 static bool perf_evlist__add_vfs_getname(struct perf_evlist *evlist)
2137 {
2138 	struct perf_evsel *evsel = perf_evsel__newtp("probe", "vfs_getname");
2139 
2140 	if (IS_ERR(evsel))
2141 		return false;
2142 
2143 	if (perf_evsel__field(evsel, "pathname") == NULL) {
2144 		perf_evsel__delete(evsel);
2145 		return false;
2146 	}
2147 
2148 	evsel->handler = trace__vfs_getname;
2149 	perf_evlist__add(evlist, evsel);
2150 	return true;
2151 }
2152 
2153 static struct perf_evsel *perf_evsel__new_pgfault(u64 config)
2154 {
2155 	struct perf_evsel *evsel;
2156 	struct perf_event_attr attr = {
2157 		.type = PERF_TYPE_SOFTWARE,
2158 		.mmap_data = 1,
2159 	};
2160 
2161 	attr.config = config;
2162 	attr.sample_period = 1;
2163 
2164 	event_attr_init(&attr);
2165 
2166 	evsel = perf_evsel__new(&attr);
2167 	if (evsel)
2168 		evsel->handler = trace__pgfault;
2169 
2170 	return evsel;
2171 }
2172 
/*
 * Dispatch one event read from the ring buffers: non-sample records go to
 * the generic trace__process_event() machinery, samples are matched to
 * their evsel by sample id and handed to that evsel's handler.
 */
static void trace__handle_event(struct trace *trace, union perf_event *event, struct perf_sample *sample)
{
	const u32 type = event->header.type;
	struct perf_evsel *evsel;

	if (type != PERF_RECORD_SAMPLE) {
		trace__process_event(trace, trace->host, event, sample);
		return;
	}

	evsel = perf_evlist__id2evsel(trace->evlist, sample->id);
	if (evsel == NULL) {
		fprintf(trace->output, "Unknown tp ID %" PRIu64 ", skipping...\n", sample->id);
		return;
	}

	trace__set_base_time(trace, evsel, sample);

	/* A tracepoint sample without a raw payload can't be parsed: warn and skip. */
	if (evsel->attr.type == PERF_TYPE_TRACEPOINT &&
	    sample->raw_data == NULL) {
		fprintf(trace->output, "%s sample with no payload for tid: %d, cpu %d, raw_size=%d, skipping...\n",
		       perf_evsel__name(evsel), sample->tid,
		       sample->cpu, sample->raw_size);
	} else {
		tracepoint_handler handler = evsel->handler;
		handler(trace, evsel, event, sample);
	}
}
2201 
/*
 * Create the raw_syscalls:sys_enter/sys_exit tracepoint events, cache the
 * offsets of their "args"/"ret" payload fields, add both to the evlist
 * and stash them in trace->syscalls.events for later filter setup.
 *
 * Returns 0 on success, -1 on failure; partially created evsels are
 * deleted via the out_delete_* labels.
 */
static int trace__add_syscall_newtp(struct trace *trace)
{
	int ret = -1;
	struct perf_evlist *evlist = trace->evlist;
	struct perf_evsel *sys_enter, *sys_exit;

	sys_enter = perf_evsel__syscall_newtp("sys_enter", trace__sys_enter);
	if (sys_enter == NULL)
		goto out;

	if (perf_evsel__init_sc_tp_ptr_field(sys_enter, args))
		goto out_delete_sys_enter;

	sys_exit = perf_evsel__syscall_newtp("sys_exit", trace__sys_exit);
	if (sys_exit == NULL)
		goto out_delete_sys_enter;

	if (perf_evsel__init_sc_tp_uint_field(sys_exit, ret))
		goto out_delete_sys_exit;

	perf_evlist__add(evlist, sys_enter);
	perf_evlist__add(evlist, sys_exit);

	if (callchain_param.enabled && !trace->kernel_syscallchains) {
		/*
		 * We're interested only in the user space callchain
		 * leading to the syscall, allow overriding that for
		 * debugging reasons using --kernel_syscall_callchains
		 */
		sys_exit->attr.exclude_callchain_kernel = 1;
	}

	trace->syscalls.events.sys_enter = sys_enter;
	trace->syscalls.events.sys_exit  = sys_exit;

	ret = 0;
out:
	return ret;

out_delete_sys_exit:
	perf_evsel__delete_priv(sys_exit);
out_delete_sys_enter:
	perf_evsel__delete_priv(sys_enter);
	goto out;
}
2247 
/*
 * Build an "id in (...)" (or negated, for '!'-prefixed qualifiers)
 * tracepoint filter expression from the syscall ids selected with -e and
 * append it to both the sys_enter and sys_exit events.
 *
 * Returns 0 on success; -1 on failure, with errno set to ENOMEM when the
 * filter string itself could not be allocated.
 */
static int trace__set_ev_qualifier_filter(struct trace *trace)
{
	int err = -1;
	struct perf_evsel *sys_exit;
	char *filter = asprintf_expr_inout_ints("id", !trace->not_ev_qualifier,
						trace->ev_qualifier_ids.nr,
						trace->ev_qualifier_ids.entries);

	if (filter == NULL)
		goto out_enomem;

	/* Only set the sys_exit filter if the sys_enter one was accepted. */
	if (!perf_evsel__append_tp_filter(trace->syscalls.events.sys_enter,
					  filter)) {
		sys_exit = trace->syscalls.events.sys_exit;
		err = perf_evsel__append_tp_filter(sys_exit, filter);
	}

	free(filter);
out:
	return err;
out_enomem:
	errno = ENOMEM;
	goto out;
}
2272 
/*
 * When tracing system wide, filter out our own pid to avoid a feedback
 * loop; additionally walk up our parent chain and, if we find an sshd
 * ancestor (the process shuttling our output back to the user), filter
 * that one too, for the same reason.
 *
 * Returns the result of installing the pid filter on the evlist.
 */
static int trace__set_filter_loop_pids(struct trace *trace)
{
	unsigned int nr = 1;
	pid_t pids[32] = {
		getpid(),
	};
	struct thread *thread = machine__find_thread(trace->host, pids[0], pids[0]);

	while (thread && nr < ARRAY_SIZE(pids)) {
		struct thread *parent = machine__find_thread(trace->host, thread->ppid, thread->ppid);

		if (parent == NULL)
			break;

		if (!strcmp(thread__comm_str(parent), "sshd")) {
			pids[nr++] = parent->tid;
			break;
		}
		thread = parent;
	}

	return perf_evlist__set_filter_pids(trace->evlist, nr, pids);
}
2296 
/*
 * The live trace main loop: set up the event list (syscall tracepoints,
 * optional vfs_getname probe, page fault and sched_stat_runtime events),
 * configure callchains and pid filters, mmap the ring buffers, optionally
 * fork/start the workload, then read and dispatch events via
 * trace__handle_event() until interrupted or the workload finishes,
 * finally printing the summary/tool stats if requested.
 *
 * Returns 0 on success, negative on error; all error paths print a human
 * readable message to trace->output before cleaning up.
 */
static int trace__run(struct trace *trace, int argc, const char **argv)
{
	struct perf_evlist *evlist = trace->evlist;
	struct perf_evsel *evsel, *pgfault_maj = NULL, *pgfault_min = NULL;
	int err = -1, i;
	unsigned long before;
	const bool forks = argc > 0;	/* a workload command was supplied */
	bool draining = false;

	trace->live = true;

	if (trace->trace_syscalls && trace__add_syscall_newtp(trace))
		goto out_error_raw_syscalls;

	if (trace->trace_syscalls)
		trace->vfs_getname = perf_evlist__add_vfs_getname(evlist);

	if ((trace->trace_pgfaults & TRACE_PFMAJ)) {
		pgfault_maj = perf_evsel__new_pgfault(PERF_COUNT_SW_PAGE_FAULTS_MAJ);
		if (pgfault_maj == NULL)
			goto out_error_mem;
		perf_evlist__add(evlist, pgfault_maj);
	}

	if ((trace->trace_pgfaults & TRACE_PFMIN)) {
		pgfault_min = perf_evsel__new_pgfault(PERF_COUNT_SW_PAGE_FAULTS_MIN);
		if (pgfault_min == NULL)
			goto out_error_mem;
		perf_evlist__add(evlist, pgfault_min);
	}

	if (trace->sched &&
	    perf_evlist__add_newtp(evlist, "sched", "sched_stat_runtime",
				   trace__sched_stat_runtime))
		goto out_error_sched_stat_runtime;

	err = perf_evlist__create_maps(evlist, &trace->opts.target);
	if (err < 0) {
		fprintf(trace->output, "Problems parsing the target to trace, check your options!\n");
		goto out_delete_evlist;
	}

	err = trace__symbols_init(trace, evlist);
	if (err < 0) {
		fprintf(trace->output, "Problems initializing symbol libraries!\n");
		goto out_delete_evlist;
	}

	perf_evlist__config(evlist, &trace->opts, NULL);

	if (callchain_param.enabled) {
		bool use_identifier = false;

		if (trace->syscalls.events.sys_exit) {
			perf_evsel__config_callchain(trace->syscalls.events.sys_exit,
						     &trace->opts, &callchain_param);
			use_identifier = true;
		}

		if (pgfault_maj) {
			perf_evsel__config_callchain(pgfault_maj, &trace->opts, &callchain_param);
			use_identifier = true;
		}

		if (pgfault_min) {
			perf_evsel__config_callchain(pgfault_min, &trace->opts, &callchain_param);
			use_identifier = true;
		}

		if (use_identifier) {
		       /*
			* Now we have evsels with different sample_ids, use
			* PERF_SAMPLE_IDENTIFIER to map from sample to evsel
			* from a fixed position in each ring buffer record.
			*
			* As of this the changeset introducing this comment, this
			* isn't strictly needed, as the fields that can come before
			* PERF_SAMPLE_ID are all used, but we'll probably disable
			* some of those for things like copying the payload of
			* pointer syscall arguments, and for vfs_getname we don't
			* need PERF_SAMPLE_ADDR and PERF_SAMPLE_IP, so do this
			* here as a warning we need to use PERF_SAMPLE_IDENTIFIER.
			*/
			perf_evlist__set_sample_bit(evlist, IDENTIFIER);
			perf_evlist__reset_sample_bit(evlist, ID);
		}
	}

	signal(SIGCHLD, sig_handler);
	signal(SIGINT, sig_handler);

	if (forks) {
		err = perf_evlist__prepare_workload(evlist, &trace->opts.target,
						    argv, false, NULL);
		if (err < 0) {
			fprintf(trace->output, "Couldn't run the workload!\n");
			goto out_delete_evlist;
		}
	}

	err = perf_evlist__open(evlist);
	if (err < 0)
		goto out_error_open;

	err = bpf__apply_obj_config();
	if (err) {
		char errbuf[BUFSIZ];

		bpf__strerror_apply_obj_config(err, errbuf, sizeof(errbuf));
		pr_err("ERROR: Apply config to BPF failed: %s\n",
			 errbuf);
		goto out_error_open;
	}

	/*
	 * Better not use !target__has_task() here because we need to cover the
	 * case where no threads were specified in the command line, but a
	 * workload was, and in that case we will fill in the thread_map when
	 * we fork the workload in perf_evlist__prepare_workload.
	 */
	if (trace->filter_pids.nr > 0)
		err = perf_evlist__set_filter_pids(evlist, trace->filter_pids.nr, trace->filter_pids.entries);
	else if (thread_map__pid(evlist->threads, 0) == -1)
		err = trace__set_filter_loop_pids(trace);

	if (err < 0)
		goto out_error_mem;

	if (trace->ev_qualifier_ids.nr > 0) {
		err = trace__set_ev_qualifier_filter(trace);
		if (err < 0)
			goto out_errno;

		pr_debug("event qualifier tracepoint filter: %s\n",
			 trace->syscalls.events.sys_exit->filter);
	}

	err = perf_evlist__apply_filters(evlist, &evsel);
	if (err < 0)
		goto out_error_apply_filters;

	err = perf_evlist__mmap(evlist, trace->opts.mmap_pages, false);
	if (err < 0)
		goto out_error_mmap;

	/* With --delay, events are enabled only after the workload starts. */
	if (!target__none(&trace->opts.target) && !trace->opts.initial_delay)
		perf_evlist__enable(evlist);

	if (forks)
		perf_evlist__start_workload(evlist);

	if (trace->opts.initial_delay) {
		usleep(trace->opts.initial_delay * 1000);
		perf_evlist__enable(evlist);
	}

	trace->multiple_threads = thread_map__pid(evlist->threads, 0) == -1 ||
				  evlist->threads->nr > 1 ||
				  perf_evlist__first(evlist)->attr.inherit;
again:
	before = trace->nr_events;

	for (i = 0; i < evlist->nr_mmaps; i++) {
		union perf_event *event;

		while ((event = perf_evlist__mmap_read(evlist, i)) != NULL) {
			struct perf_sample sample;

			++trace->nr_events;

			err = perf_evlist__parse_sample(evlist, event, &sample);
			if (err) {
				fprintf(trace->output, "Can't parse sample, err = %d, skipping...\n", err);
				goto next_event;
			}

			trace__handle_event(trace, event, &sample);
next_event:
			perf_evlist__mmap_consume(evlist, i);

			if (interrupted)
				goto out_disable;

			/* Workload done (SIGCHLD): stop producing, keep draining. */
			if (done && !draining) {
				perf_evlist__disable(evlist);
				draining = true;
			}
		}
	}

	/* Nothing new this pass: poll for more, with a short timeout once done. */
	if (trace->nr_events == before) {
		int timeout = done ? 100 : -1;

		if (!draining && perf_evlist__poll(evlist, timeout) > 0) {
			if (perf_evlist__filter_pollfd(evlist, POLLERR | POLLHUP) == 0)
				draining = true;

			goto again;
		}
	} else {
		goto again;
	}

out_disable:
	thread__zput(trace->current);

	perf_evlist__disable(evlist);

	if (!err) {
		if (trace->summary)
			trace__fprintf_thread_summary(trace, trace->output);

		if (trace->show_tool_stats) {
			fprintf(trace->output, "Stats:\n "
					       " vfs_getname : %" PRIu64 "\n"
					       " proc_getname: %" PRIu64 "\n",
				trace->stats.vfs_getname,
				trace->stats.proc_getname);
		}
	}

out_delete_evlist:
	trace__symbols__exit(trace);

	perf_evlist__delete(evlist);
	trace->evlist = NULL;
	trace->live = false;
	return err;
/*
 * Out-of-line error handling: the bare block below only exists to scope
 * errbuf for the labels inside it; it is reached exclusively via goto.
 */
{
	char errbuf[BUFSIZ];

out_error_sched_stat_runtime:
	tracing_path__strerror_open_tp(errno, errbuf, sizeof(errbuf), "sched", "sched_stat_runtime");
	goto out_error;

out_error_raw_syscalls:
	tracing_path__strerror_open_tp(errno, errbuf, sizeof(errbuf), "raw_syscalls", "sys_(enter|exit)");
	goto out_error;

out_error_mmap:
	perf_evlist__strerror_mmap(evlist, errno, errbuf, sizeof(errbuf));
	goto out_error;

out_error_open:
	perf_evlist__strerror_open(evlist, errno, errbuf, sizeof(errbuf));

out_error:
	fprintf(trace->output, "%s\n", errbuf);
	goto out_delete_evlist;

out_error_apply_filters:
	fprintf(trace->output,
		"Failed to set filter \"%s\" on event %s with %d (%s)\n",
		evsel->filter, perf_evsel__name(evsel), errno,
		str_error_r(errno, errbuf, sizeof(errbuf)));
	goto out_delete_evlist;
}
out_error_mem:
	fprintf(trace->output, "Not enough memory to run!\n");
	goto out_delete_evlist;

out_errno:
	fprintf(trace->output, "errno=%d,%s\n", errno, strerror(errno));
	goto out_delete_evlist;
}
2562 
/*
 * 'perf trace -i perf.data': set up a perf_session over a previously
 * recorded file, wire up the same syscall/page fault handlers used in
 * live mode, process all events and optionally print the thread summary.
 *
 * Returns 0 on success or a negative error.
 */
static int trace__replay(struct trace *trace)
{
	const struct perf_evsel_str_handler handlers[] = {
		{ "probe:vfs_getname",	     trace__vfs_getname, },
	};
	struct perf_data data = {
		.file      = {
			.path = input_name,
		},
		.mode      = PERF_DATA_MODE_READ,
		.force     = trace->force,
	};
	struct perf_session *session;
	struct perf_evsel *evsel;
	int err = -1;

	trace->tool.sample	  = trace__process_sample;
	trace->tool.mmap	  = perf_event__process_mmap;
	trace->tool.mmap2	  = perf_event__process_mmap2;
	trace->tool.comm	  = perf_event__process_comm;
	trace->tool.exit	  = perf_event__process_exit;
	trace->tool.fork	  = perf_event__process_fork;
	trace->tool.attr	  = perf_event__process_attr;
	trace->tool.tracing_data  = perf_event__process_tracing_data;
	trace->tool.build_id	  = perf_event__process_build_id;
	trace->tool.namespaces	  = perf_event__process_namespaces;

	trace->tool.ordered_events = true;
	trace->tool.ordering_requires_timestamps = true;

	/* add tid to output */
	trace->multiple_threads = true;

	session = perf_session__new(&data, false, &trace->tool);
	if (session == NULL)
		return -1;

	if (trace->opts.target.pid)
		symbol_conf.pid_list_str = strdup(trace->opts.target.pid);

	if (trace->opts.target.tid)
		symbol_conf.tid_list_str = strdup(trace->opts.target.tid);

	if (symbol__init(&session->header.env) < 0)
		goto out;

	trace->host = &session->machines.host;

	err = perf_session__set_tracepoints_handlers(session, handlers);
	if (err)
		goto out;

	evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
						     "raw_syscalls:sys_enter");
	/* older kernels have syscalls tp versus raw_syscalls */
	if (evsel == NULL)
		evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
							     "syscalls:sys_enter");

	if (evsel &&
	    (perf_evsel__init_syscall_tp(evsel, trace__sys_enter) < 0 ||
	    perf_evsel__init_sc_tp_ptr_field(evsel, args))) {
		pr_err("Error during initialize raw_syscalls:sys_enter event\n");
		goto out;
	}

	evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
						     "raw_syscalls:sys_exit");
	if (evsel == NULL)
		evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
							     "syscalls:sys_exit");
	if (evsel &&
	    (perf_evsel__init_syscall_tp(evsel, trace__sys_exit) < 0 ||
	    perf_evsel__init_sc_tp_uint_field(evsel, ret))) {
		pr_err("Error during initialize raw_syscalls:sys_exit event\n");
		goto out;
	}

	/* Route any recorded page fault software events to trace__pgfault(). */
	evlist__for_each_entry(session->evlist, evsel) {
		if (evsel->attr.type == PERF_TYPE_SOFTWARE &&
		    (evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ ||
		     evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MIN ||
		     evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS))
			evsel->handler = trace__pgfault;
	}

	setup_pager();

	err = perf_session__process_events(session);
	if (err)
		pr_err("Failed to process events, error %d", err);

	else if (trace->summary)
		trace__fprintf_thread_summary(trace, trace->output);

out:
	perf_session__delete(session);

	return err;
}
2663 
/* Emit the banner that precedes the per-thread summary; returns bytes written. */
static size_t trace__fprintf_threads_header(FILE *fp)
{
	return fprintf(fp, "\n Summary of events:\n\n");
}
2672 
/*
 * Re-sort the per-thread syscall stats (an intlist keyed by syscall id,
 * carrying a struct stats as private data) by total time spent, in msecs,
 * descending. The brace block is the per-node copy hook required by
 * DEFINE_RESORT_RB().
 */
DEFINE_RESORT_RB(syscall_stats, a->msecs > b->msecs,
	struct stats 	*stats;
	double		msecs;
	int		syscall;
)
{
	struct int_node *source = rb_entry(nd, struct int_node, rb_node);
	struct stats *stats = source->priv;

	entry->syscall = source->i;
	entry->stats   = stats;
	/* total time = number of calls * average duration, scaled to msecs */
	entry->msecs   = stats ? (u64)stats->n * (avg_stats(stats) / NSEC_PER_MSEC) : 0;
}
2686 
/*
 * Print the per-syscall statistics table (calls, total/min/avg/max msecs
 * and stddev percentage) for one thread, ordered by total time spent.
 *
 * Returns the number of bytes written, 0 if the resorted tree could not
 * be allocated.
 */
static size_t thread__dump_stats(struct thread_trace *ttrace,
				 struct trace *trace, FILE *fp)
{
	size_t printed = 0;
	struct syscall *sc;
	struct rb_node *nd;
	DECLARE_RESORT_RB_INTLIST(syscall_stats, ttrace->syscall_stats);

	if (syscall_stats == NULL)
		return 0;

	printed += fprintf(fp, "\n");

	printed += fprintf(fp, "   syscall            calls    total       min       avg       max      stddev\n");
	printed += fprintf(fp, "                               (msec)    (msec)    (msec)    (msec)        (%%)\n");
	printed += fprintf(fp, "   --------------- -------- --------- --------- --------- ---------     ------\n");

	resort_rb__for_each_entry(nd, syscall_stats) {
		struct stats *stats = syscall_stats_entry->stats;
		if (stats) {
			/* Raw stats are in nanoseconds; convert for display. */
			double min = (double)(stats->min) / NSEC_PER_MSEC;
			double max = (double)(stats->max) / NSEC_PER_MSEC;
			double avg = avg_stats(stats);
			double pct;
			u64 n = (u64) stats->n;

			/* Relative stddev, as a percentage of the average. */
			pct = avg ? 100.0 * stddev_stats(stats)/avg : 0.0;
			avg /= NSEC_PER_MSEC;

			sc = &trace->syscalls.table[syscall_stats_entry->syscall];
			printed += fprintf(fp, "   %-15s", sc->name);
			printed += fprintf(fp, " %8" PRIu64 " %9.3f %9.3f %9.3f",
					   n, syscall_stats_entry->msecs, min, avg);
			printed += fprintf(fp, " %9.3f %9.2f%%\n", max, pct);
		}
	}

	resort_rb__delete(syscall_stats);
	printed += fprintf(fp, "\n\n");

	return printed;
}
2729 
2730 static size_t trace__fprintf_thread(FILE *fp, struct thread *thread, struct trace *trace)
2731 {
2732 	size_t printed = 0;
2733 	struct thread_trace *ttrace = thread__priv(thread);
2734 	double ratio;
2735 
2736 	if (ttrace == NULL)
2737 		return 0;
2738 
2739 	ratio = (double)ttrace->nr_events / trace->nr_events * 100.0;
2740 
2741 	printed += fprintf(fp, " %s (%d), ", thread__comm_str(thread), thread->tid);
2742 	printed += fprintf(fp, "%lu events, ", ttrace->nr_events);
2743 	printed += fprintf(fp, "%.1f%%", ratio);
2744 	if (ttrace->pfmaj)
2745 		printed += fprintf(fp, ", %lu majfaults", ttrace->pfmaj);
2746 	if (ttrace->pfmin)
2747 		printed += fprintf(fp, ", %lu minfaults", ttrace->pfmin);
2748 	if (trace->sched)
2749 		printed += fprintf(fp, ", %.3f msec\n", ttrace->runtime_ms);
2750 	else if (fputc('\n', fp) != EOF)
2751 		++printed;
2752 
2753 	printed += thread__dump_stats(ttrace, trace, fp);
2754 
2755 	return printed;
2756 }
2757 
2758 static unsigned long thread__nr_events(struct thread_trace *ttrace)
2759 {
2760 	return ttrace ? ttrace->nr_events : 0;
2761 }
2762 
/*
 * Re-sort a machine's threads rbtree by per-thread event count; the brace
 * block is the per-node copy hook required by DEFINE_RESORT_RB().
 */
DEFINE_RESORT_RB(threads, (thread__nr_events(a->thread->priv) < thread__nr_events(b->thread->priv)),
	struct thread *thread;
)
{
	entry->thread = rb_entry(nd, struct thread, rb_node);
}
2769 
/*
 * Print the end-of-session summary: a header followed by one entry per
 * traced thread, iterating over every bucket of the host machine's
 * threads hash table, each bucket sorted by event count.
 *
 * Returns the number of bytes written, 0 if sorting failed.
 */
static size_t trace__fprintf_thread_summary(struct trace *trace, FILE *fp)
{
	size_t printed = trace__fprintf_threads_header(fp);
	struct rb_node *nd;
	int i;

	for (i = 0; i < THREADS__TABLE_SIZE; i++) {
		DECLARE_RESORT_RB_MACHINE_THREADS(threads, trace->host, i);

		if (threads == NULL) {
			fprintf(fp, "%s", "Error sorting output by nr_events!\n");
			return 0;
		}

		resort_rb__for_each_entry(nd, threads)
			printed += trace__fprintf_thread(fp, threads_entry->thread, trace);

		resort_rb__delete(threads);
	}
	return printed;
}
2791 
2792 static int trace__set_duration(const struct option *opt, const char *str,
2793 			       int unset __maybe_unused)
2794 {
2795 	struct trace *trace = opt->value;
2796 
2797 	trace->duration_filter = atof(str);
2798 	return 0;
2799 }
2800 
2801 static int trace__set_filter_pids(const struct option *opt, const char *str,
2802 				  int unset __maybe_unused)
2803 {
2804 	int ret = -1;
2805 	size_t i;
2806 	struct trace *trace = opt->value;
2807 	/*
2808 	 * FIXME: introduce a intarray class, plain parse csv and create a
2809 	 * { int nr, int entries[] } struct...
2810 	 */
2811 	struct intlist *list = intlist__new(str);
2812 
2813 	if (list == NULL)
2814 		return -1;
2815 
2816 	i = trace->filter_pids.nr = intlist__nr_entries(list) + 1;
2817 	trace->filter_pids.entries = calloc(i, sizeof(pid_t));
2818 
2819 	if (trace->filter_pids.entries == NULL)
2820 		goto out;
2821 
2822 	trace->filter_pids.entries[0] = getpid();
2823 
2824 	for (i = 1; i < trace->filter_pids.nr; ++i)
2825 		trace->filter_pids.entries[i] = intlist__entry(list, i - 1)->i;
2826 
2827 	intlist__delete(list);
2828 	ret = 0;
2829 out:
2830 	return ret;
2831 }
2832 
2833 static int trace__open_output(struct trace *trace, const char *filename)
2834 {
2835 	struct stat st;
2836 
2837 	if (!stat(filename, &st) && st.st_size) {
2838 		char oldname[PATH_MAX];
2839 
2840 		scnprintf(oldname, sizeof(oldname), "%s.old", filename);
2841 		unlink(oldname);
2842 		rename(filename, oldname);
2843 	}
2844 
2845 	trace->output = fopen(filename, "w");
2846 
2847 	return trace->output == NULL ? -errno : 0;
2848 }
2849 
2850 static int parse_pagefaults(const struct option *opt, const char *str,
2851 			    int unset __maybe_unused)
2852 {
2853 	int *trace_pgfaults = opt->value;
2854 
2855 	if (strcmp(str, "all") == 0)
2856 		*trace_pgfaults |= TRACE_PFMAJ | TRACE_PFMIN;
2857 	else if (strcmp(str, "maj") == 0)
2858 		*trace_pgfaults |= TRACE_PFMAJ;
2859 	else if (strcmp(str, "min") == 0)
2860 		*trace_pgfaults |= TRACE_PFMIN;
2861 	else
2862 		return -1;
2863 
2864 	return 0;
2865 }
2866 
/* Install the same sample handler on every event in the list. */
static void evlist__set_evsel_handler(struct perf_evlist *evlist, void *handler)
{
	struct perf_evsel *evsel;

	evlist__for_each_entry(evlist, evsel)
		evsel->handler = handler;
}
2874 
/*
 * XXX: Hackish, just splitting the combined -e+--event argument into its
 * syscall part (raw_syscalls:{sys_{enter,exit}}) and its event part
 * (tracepoints, HW, SW, etc.), so that the existing facilities
 * (trace->ev_qualifier + parse_options()) can be used unchanged.
 *
 * It'd be better to introduce a parse_options() variant that would return
 * a list with the terms it didn't match to any event...
 */
/*
 * parse_options() callback for -e/--event: split the CSV argument into
 * syscall names / strace group files (lists[1], becomes the event
 * qualifier) and everything else (lists[0], forwarded to the regular
 * parse_events_option()).
 *
 * Returns 0 on success, -1 on allocation or validation failure. The input
 * string is temporarily modified in place (commas turned into NULs) and
 * restored before returning.
 */
static int trace__parse_events_option(const struct option *opt, const char *str,
				      int unset __maybe_unused)
{
	struct trace *trace = (struct trace *)opt->value;
	const char *s = str;
	char *sep = NULL, *lists[2] = { NULL, NULL, };
	int len = strlen(str) + 1, err = -1, list, idx;
	char *strace_groups_dir = system_path(STRACE_GROUPS_DIR);
	char group_name[PATH_MAX];

	if (strace_groups_dir == NULL)
		return -1;

	/* A leading '!' negates the whole syscall qualifier. */
	if (*s == '!') {
		++s;
		trace->not_ev_qualifier = true;
	}

	while (1) {
		if ((sep = strchr(s, ',')) != NULL)
			*sep = '\0';

		/*
		 * list selects the destination: 1 for syscall names/globs
		 * and strace group files, 0 for everything else.
		 */
		list = 0;
		if (syscalltbl__id(trace->sctbl, s) >= 0 ||
		    syscalltbl__strglobmatch_first(trace->sctbl, s, &idx) >= 0) {
			list = 1;
		} else {
			path__join(group_name, sizeof(group_name), strace_groups_dir, s);
			if (access(group_name, R_OK) == 0)
				list = 1;
		}

		if (lists[list]) {
			/* len bounds the total: pieces plus commas come from str. */
			sprintf(lists[list] + strlen(lists[list]), ",%s", s);
		} else {
			lists[list] = malloc(len);
			if (lists[list] == NULL)
				goto out;
			strcpy(lists[list], s);
		}

		if (!sep)
			break;

		/* Restore the comma we NUL'ed and move to the next term. */
		*sep = ',';
		s = sep + 1;
	}

	if (lists[1] != NULL) {
		struct strlist_config slist_config = {
			.dirname = strace_groups_dir,
		};

		trace->ev_qualifier = strlist__new(lists[1], &slist_config);
		if (trace->ev_qualifier == NULL) {
			fputs("Not enough memory to parse event qualifier", trace->output);
			goto out;
		}

		if (trace__validate_ev_qualifier(trace))
			goto out;
	}

	err = 0;

	if (lists[0]) {
		struct option o = OPT_CALLBACK('e', "event", &trace->evlist, "event",
					       "event selector. use 'perf list' to list available events",
					       parse_events_option);
		err = parse_events_option(&o, lists[0], 0);
	}
out:
	/* Undo the in-place NUL of the last separator, if any. */
	if (sep)
		*sep = ',';

	return err;
}
2960 
/*
 * Entry point for 'perf trace': parses the command line, sets up the trace
 * session (event lists, callchains, output, target), then either replays a
 * recorded perf.data file (-i) or runs/attaches to a live workload.
 *
 * Returns 0 on success, negative on error.
 */
int cmd_trace(int argc, const char **argv)
{
	const char *trace_usage[] = {
		"perf trace [<options>] [<command>]",
		"perf trace [<options>] -- <command> [<options>]",
		"perf trace record [<options>] [<command>]",
		"perf trace record [<options>] -- <command> [<options>]",
		NULL
	};
	/*
	 * UINT_MAX/ULLONG_MAX act as "not set by the user" sentinels for
	 * uid, user_freq, user_interval, mmap_pages and max_stack; they are
	 * replaced with real defaults after option parsing below.
	 */
	struct trace trace = {
		.syscalls = {
			. max = -1,
		},
		.opts = {
			.target = {
				.uid	   = UINT_MAX,
				.uses_mmap = true,
			},
			.user_freq     = UINT_MAX,
			.user_interval = ULLONG_MAX,
			.no_buffering  = true,
			.mmap_pages    = UINT_MAX,
			.proc_map_timeout  = 500,
		},
		.output = stderr,
		.show_comm = true,
		.trace_syscalls = true,
		.kernel_syscallchains = false,
		.max_stack = UINT_MAX,
	};
	const char *output_name = NULL;
	const struct option trace_options[] = {
	OPT_CALLBACK('e', "event", &trace, "event",
		     "event/syscall selector. use 'perf list' to list available events",
		     trace__parse_events_option),
	OPT_BOOLEAN(0, "comm", &trace.show_comm,
		    "show the thread COMM next to its id"),
	OPT_BOOLEAN(0, "tool_stats", &trace.show_tool_stats, "show tool stats"),
	OPT_CALLBACK(0, "expr", &trace, "expr", "list of syscalls/events to trace",
		     trace__parse_events_option),
	OPT_STRING('o', "output", &output_name, "file", "output file name"),
	OPT_STRING('i', "input", &input_name, "file", "Analyze events in file"),
	OPT_STRING('p', "pid", &trace.opts.target.pid, "pid",
		    "trace events on existing process id"),
	OPT_STRING('t', "tid", &trace.opts.target.tid, "tid",
		    "trace events on existing thread id"),
	OPT_CALLBACK(0, "filter-pids", &trace, "CSV list of pids",
		     "pids to filter (by the kernel)", trace__set_filter_pids),
	OPT_BOOLEAN('a', "all-cpus", &trace.opts.target.system_wide,
		    "system-wide collection from all CPUs"),
	OPT_STRING('C', "cpu", &trace.opts.target.cpu_list, "cpu",
		    "list of cpus to monitor"),
	OPT_BOOLEAN(0, "no-inherit", &trace.opts.no_inherit,
		    "child tasks do not inherit counters"),
	OPT_CALLBACK('m', "mmap-pages", &trace.opts.mmap_pages, "pages",
		     "number of mmap data pages",
		     perf_evlist__parse_mmap_pages),
	OPT_STRING('u', "uid", &trace.opts.target.uid_str, "user",
		   "user to profile"),
	OPT_CALLBACK(0, "duration", &trace, "float",
		     "show only events with duration > N.M ms",
		     trace__set_duration),
	OPT_BOOLEAN(0, "sched", &trace.sched, "show blocking scheduler events"),
	OPT_INCR('v', "verbose", &verbose, "be more verbose"),
	OPT_BOOLEAN('T', "time", &trace.full_time,
		    "Show full timestamp, not time relative to first start"),
	OPT_BOOLEAN('s', "summary", &trace.summary_only,
		    "Show only syscall summary with statistics"),
	OPT_BOOLEAN('S', "with-summary", &trace.summary,
		    "Show all syscalls and summary with statistics"),
	OPT_CALLBACK_DEFAULT('F', "pf", &trace.trace_pgfaults, "all|maj|min",
		     "Trace pagefaults", parse_pagefaults, "maj"),
	OPT_BOOLEAN(0, "syscalls", &trace.trace_syscalls, "Trace syscalls"),
	OPT_BOOLEAN('f', "force", &trace.force, "don't complain, do it"),
	OPT_CALLBACK(0, "call-graph", &trace.opts,
		     "record_mode[,record_size]", record_callchain_help,
		     &record_parse_callchain_opt),
	OPT_BOOLEAN(0, "kernel-syscall-graph", &trace.kernel_syscallchains,
		    "Show the kernel callchains on the syscall exit path"),
	OPT_UINTEGER(0, "min-stack", &trace.min_stack,
		     "Set the minimum stack depth when parsing the callchain, "
		     "anything below the specified depth will be ignored."),
	OPT_UINTEGER(0, "max-stack", &trace.max_stack,
		     "Set the maximum stack depth when parsing the callchain, "
		     "anything beyond the specified depth will be ignored. "
		     "Default: kernel.perf_event_max_stack or " __stringify(PERF_MAX_STACK_DEPTH)),
	OPT_UINTEGER(0, "proc-map-timeout", &trace.opts.proc_map_timeout,
			"per thread proc mmap processing timeout in ms"),
	OPT_UINTEGER('D', "delay", &trace.opts.initial_delay,
		     "ms to wait before starting measurement after program "
		     "start"),
	OPT_END()
	};
	bool __maybe_unused max_stack_user_set = true;
	bool mmap_pages_user_set = true;
	const char * const trace_subcommands[] = { "record", NULL };
	int err;
	char bf[BUFSIZ];

	/* Dump a stack trace instead of dying silently on crashes */
	signal(SIGSEGV, sighandler_dump_stack);
	signal(SIGFPE, sighandler_dump_stack);

	trace.evlist = perf_evlist__new();
	trace.sctbl = syscalltbl__new();

	if (trace.evlist == NULL || trace.sctbl == NULL) {
		pr_err("Not enough memory to run!\n");
		err = -ENOMEM;
		goto out;
	}

	/* Stop at the first non-option so the workload's own args pass through */
	argc = parse_options_subcommand(argc, argv, trace_options, trace_subcommands,
				 trace_usage, PARSE_OPT_STOP_AT_NON_OPTION);

	err = bpf__setup_stdout(trace.evlist);
	if (err) {
		bpf__strerror_setup_stdout(trace.evlist, err, bf, sizeof(bf));
		pr_err("ERROR: Setup BPF stdout failed: %s\n", bf);
		goto out;
	}

	err = -1;

	/* Pagefault samples need address + timestamp to be meaningful */
	if (trace.trace_pgfaults) {
		trace.opts.sample_address = true;
		trace.opts.sample_time = true;
	}

	/* Still at the sentinel value? Then the user didn't pass -m */
	if (trace.opts.mmap_pages == UINT_MAX)
		mmap_pages_user_set = false;

	if (trace.max_stack == UINT_MAX) {
		trace.max_stack = input_name ? PERF_MAX_STACK_DEPTH : sysctl_perf_event_max_stack;
		max_stack_user_set = false;
	}

#ifdef HAVE_DWARF_UNWIND_SUPPORT
	/* Stack depth options imply callchains; default to DWARF unwinding */
	if ((trace.min_stack || max_stack_user_set) && !callchain_param.enabled && trace.trace_syscalls)
		record_opts__parse_callchain(&trace.opts, &callchain_param, "dwarf", false);
#endif

	if (callchain_param.enabled) {
		/*
		 * Callchains need bigger buffers; as root we can afford to
		 * bump the mmap size when the user didn't fix it with -m.
		 */
		if (!mmap_pages_user_set && geteuid() == 0)
			trace.opts.mmap_pages = perf_event_mlock_kb_in_pages() * 4;

		symbol_conf.use_callchain = true;
	}

	if (trace.evlist->nr_entries > 0)
		evlist__set_evsel_handler(trace.evlist, trace__event_handler);

	/* 'perf trace record' delegates to perf record with the right events */
	if ((argc >= 1) && (strcmp(argv[0], "record") == 0))
		return trace__record(&trace, argc-1, &argv[1]);

	/* summary_only implies summary option, but don't overwrite summary if set */
	if (trace.summary_only)
		trace.summary = trace.summary_only;

	if (!trace.trace_syscalls && !trace.trace_pgfaults &&
	    trace.evlist->nr_entries == 0 /* Was --events used? */) {
		pr_err("Please specify something to trace.\n");
		return -1;
	}

	if (!trace.trace_syscalls && trace.ev_qualifier) {
		pr_err("The -e option can't be used with --no-syscalls.\n");
		goto out;
	}

	/* Redirect from the default stderr when -o was given */
	if (output_name != NULL) {
		err = trace__open_output(&trace, output_name);
		if (err < 0) {
			perror("failed to create output file");
			goto out;
		}
	}

	/* Cached so the open/openat beautifiers don't look it up per event */
	trace.open_id = syscalltbl__id(trace.sctbl, "open");

	err = target__validate(&trace.opts.target);
	if (err) {
		target__strerror(&trace.opts.target, err, bf, sizeof(bf));
		fprintf(trace.output, "%s", bf);
		goto out_close;
	}

	err = target__parse_uid(&trace.opts.target);
	if (err) {
		target__strerror(&trace.opts.target, err, bf, sizeof(bf));
		fprintf(trace.output, "%s", bf);
		goto out_close;
	}

	/* No workload and no pid/tid/cpu target: trace the whole system */
	if (!argc && target__none(&trace.opts.target))
		trace.opts.target.system_wide = true;

	if (input_name)
		err = trace__replay(&trace);
	else
		err = trace__run(&trace, argc, argv);

out_close:
	if (output_name != NULL)
		fclose(trace.output);
out:
	return err;
}
3168