xref: /openbmc/linux/tools/perf/builtin-trace.c (revision 68198dca)
1 /*
2  * builtin-trace.c
3  *
4  * Builtin 'trace' command:
5  *
6  * Display a continuously updated trace of any workload, CPU, specific PID,
7  * system wide, etc.  Default format is loosely strace like, but any other
8  * event may be specified using --event.
9  *
10  * Copyright (C) 2012, 2013, 2014, 2015 Red Hat Inc, Arnaldo Carvalho de Melo <acme@redhat.com>
11  *
12  * Initially based on the 'trace' prototype by Thomas Gleixner:
13  *
14  * http://lwn.net/Articles/415728/ ("Announcing a new utility: 'trace'")
15  *
16  * Released under the GPL v2. (and only v2, not any later version)
17  */
18 
19 #include <traceevent/event-parse.h>
20 #include <api/fs/tracing_path.h>
21 #include "builtin.h"
22 #include "util/color.h"
23 #include "util/debug.h"
24 #include "util/event.h"
25 #include "util/evlist.h"
26 #include <subcmd/exec-cmd.h>
27 #include "util/machine.h"
28 #include "util/path.h"
29 #include "util/session.h"
30 #include "util/thread.h"
31 #include <subcmd/parse-options.h>
32 #include "util/strlist.h"
33 #include "util/intlist.h"
34 #include "util/thread_map.h"
35 #include "util/stat.h"
36 #include "trace/beauty/beauty.h"
37 #include "trace-event.h"
38 #include "util/parse-events.h"
39 #include "util/bpf-loader.h"
40 #include "callchain.h"
41 #include "print_binary.h"
42 #include "string2.h"
43 #include "syscalltbl.h"
44 #include "rb_resort.h"
45 
46 #include <errno.h>
47 #include <inttypes.h>
48 #include <libaudit.h> /* FIXME: Still needed for audit_errno_to_name */
49 #include <poll.h>
50 #include <signal.h>
51 #include <stdlib.h>
52 #include <string.h>
53 #include <linux/err.h>
54 #include <linux/filter.h>
55 #include <linux/audit.h>
56 #include <linux/kernel.h>
57 #include <linux/random.h>
58 #include <linux/stringify.h>
59 #include <linux/time64.h>
60 
61 #include "sane_ctype.h"
62 
63 #ifndef O_CLOEXEC
64 # define O_CLOEXEC		02000000
65 #endif
66 
67 #ifndef F_LINUX_SPECIFIC_BASE
68 # define F_LINUX_SPECIFIC_BASE	1024
69 #endif
70 
/*
 * Global state for one 'perf trace' session: perf_tool callbacks, the
 * syscall table and the evsels hooked to the sys_enter/sys_exit
 * tracepoints, the monitored machine, and the command line knobs.
 */
struct trace {
	struct perf_tool	tool;
	struct syscalltbl	*sctbl;
	struct {
		int		max;	/* highest syscall id in 'table' */
		struct syscall  *table;
		struct {
			struct perf_evsel *sys_enter,
					  *sys_exit;
		}		events;
	} syscalls;
	struct record_opts	opts;
	struct perf_evlist	*evlist;
	struct machine		*host;
	struct thread		*current;
	u64			base_time;
	FILE			*output;	/* where the trace lines go */
	unsigned long		nr_events;
	struct strlist		*ev_qualifier;	/* syscall names from -e */
	struct {
		size_t		nr;
		int		*entries;	/* ev_qualifier as syscall ids */
	}			ev_qualifier_ids;
	struct {
		size_t		nr;
		pid_t		*entries;
	}			filter_pids;
	double			duration_filter;	/* in milliseconds */
	double			runtime_ms;
	struct {
		/* tool self-stats, printed with show_tool_stats */
		u64		vfs_getname,
				proc_getname;
	} stats;
	unsigned int		max_stack;
	unsigned int		min_stack;
	bool			not_ev_qualifier;	/* -e list was negated */
	bool			live;	/* tracing live vs. processing a perf.data */
	bool			full_time;
	bool			sched;
	bool			multiple_threads;
	bool			summary;
	bool			summary_only;
	bool			show_comm;
	bool			show_tool_stats;
	bool			trace_syscalls;
	bool			kernel_syscallchains;
	bool			force;
	bool			vfs_getname;
	int			trace_pgfaults;
	int			open_id;	/* presumably the "open" syscall id - confirm at init site */
};
122 
/*
 * Accessor for one field in a tracepoint's raw payload: the byte offset
 * into sample->raw_data plus a reader, either returning the field as an
 * integer or as a pointer into the payload.
 */
struct tp_field {
	int offset;
	union {
		u64 (*integer)(struct tp_field *field, struct perf_sample *sample);
		void *(*pointer)(struct tp_field *field, struct perf_sample *sample);
	};
};
130 
/*
 * Generate tp_field__u{8,16,32,64}(): read a same-endian integer field
 * out of the raw sample.  memcpy() is used because raw_data offsets are
 * not guaranteed to be naturally aligned.
 */
#define TP_UINT_FIELD(bits) \
static u64 tp_field__u##bits(struct tp_field *field, struct perf_sample *sample) \
{ \
	u##bits value; \
	memcpy(&value, sample->raw_data + field->offset, sizeof(value)); \
	return value;  \
}

TP_UINT_FIELD(8);
TP_UINT_FIELD(16);
TP_UINT_FIELD(32);
TP_UINT_FIELD(64);
143 
/*
 * Generate tp_field__swapped_u{16,32,64}(): as above but byte-swapping,
 * for perf.data files recorded on a machine of the opposite endianness.
 */
#define TP_UINT_FIELD__SWAPPED(bits) \
static u64 tp_field__swapped_u##bits(struct tp_field *field, struct perf_sample *sample) \
{ \
	u##bits value; \
	memcpy(&value, sample->raw_data + field->offset, sizeof(value)); \
	return bswap_##bits(value);\
}

TP_UINT_FIELD__SWAPPED(16);
TP_UINT_FIELD__SWAPPED(32);
TP_UINT_FIELD__SWAPPED(64);
155 
156 static int tp_field__init_uint(struct tp_field *field,
157 			       struct format_field *format_field,
158 			       bool needs_swap)
159 {
160 	field->offset = format_field->offset;
161 
162 	switch (format_field->size) {
163 	case 1:
164 		field->integer = tp_field__u8;
165 		break;
166 	case 2:
167 		field->integer = needs_swap ? tp_field__swapped_u16 : tp_field__u16;
168 		break;
169 	case 4:
170 		field->integer = needs_swap ? tp_field__swapped_u32 : tp_field__u32;
171 		break;
172 	case 8:
173 		field->integer = needs_swap ? tp_field__swapped_u64 : tp_field__u64;
174 		break;
175 	default:
176 		return -1;
177 	}
178 
179 	return 0;
180 }
181 
182 static void *tp_field__ptr(struct tp_field *field, struct perf_sample *sample)
183 {
184 	return sample->raw_data + field->offset;
185 }
186 
187 static int tp_field__init_ptr(struct tp_field *field, struct format_field *format_field)
188 {
189 	field->offset = format_field->offset;
190 	field->pointer = tp_field__ptr;
191 	return 0;
192 }
193 
/*
 * Field accessors for the raw_syscalls tracepoints: 'id' is common to
 * both, while sys_enter carries 'args' and sys_exit carries 'ret' --
 * hence the union, an evsel only ever uses one of them.
 */
struct syscall_tp {
	struct tp_field id;
	union {
		struct tp_field args, ret;
	};
};
200 
201 static int perf_evsel__init_tp_uint_field(struct perf_evsel *evsel,
202 					  struct tp_field *field,
203 					  const char *name)
204 {
205 	struct format_field *format_field = perf_evsel__field(evsel, name);
206 
207 	if (format_field == NULL)
208 		return -1;
209 
210 	return tp_field__init_uint(field, format_field, evsel->needs_swap);
211 }
212 
/*
 * Init the accessor for the syscall_tp member 'name' (id/args/ret) in
 * evsel->priv, stringifying 'name' to find the tracepoint field.
 */
#define perf_evsel__init_sc_tp_uint_field(evsel, name) \
	({ struct syscall_tp *sc = evsel->priv;\
	   perf_evsel__init_tp_uint_field(evsel, &sc->name, #name); })
216 
217 static int perf_evsel__init_tp_ptr_field(struct perf_evsel *evsel,
218 					 struct tp_field *field,
219 					 const char *name)
220 {
221 	struct format_field *format_field = perf_evsel__field(evsel, name);
222 
223 	if (format_field == NULL)
224 		return -1;
225 
226 	return tp_field__init_ptr(field, format_field);
227 }
228 
/* Pointer-field counterpart of perf_evsel__init_sc_tp_uint_field(). */
#define perf_evsel__init_sc_tp_ptr_field(evsel, name) \
	({ struct syscall_tp *sc = evsel->priv;\
	   perf_evsel__init_tp_ptr_field(evsel, &sc->name, #name); })
232 
/* Free the evsel's ->priv (its syscall_tp) and then the evsel itself. */
static void perf_evsel__delete_priv(struct perf_evsel *evsel)
{
	zfree(&evsel->priv);
	perf_evsel__delete(evsel);
}
238 
239 static int perf_evsel__init_syscall_tp(struct perf_evsel *evsel, void *handler)
240 {
241 	evsel->priv = malloc(sizeof(struct syscall_tp));
242 	if (evsel->priv != NULL) {
243 		if (perf_evsel__init_sc_tp_uint_field(evsel, id))
244 			goto out_delete;
245 
246 		evsel->handler = handler;
247 		return 0;
248 	}
249 
250 	return -ENOMEM;
251 
252 out_delete:
253 	zfree(&evsel->priv);
254 	return -ENOENT;
255 }
256 
257 static struct perf_evsel *perf_evsel__syscall_newtp(const char *direction, void *handler)
258 {
259 	struct perf_evsel *evsel = perf_evsel__newtp("raw_syscalls", direction);
260 
261 	/* older kernel (e.g., RHEL6) use syscalls:{enter,exit} */
262 	if (IS_ERR(evsel))
263 		evsel = perf_evsel__newtp("syscalls", direction);
264 
265 	if (IS_ERR(evsel))
266 		return NULL;
267 
268 	if (perf_evsel__init_syscall_tp(evsel, handler))
269 		goto out_delete;
270 
271 	return evsel;
272 
273 out_delete:
274 	perf_evsel__delete_priv(evsel);
275 	return NULL;
276 }
277 
/*
 * Read the syscall_tp member 'name' (id/args/ret) from a sample, via the
 * accessors set up at evsel init time.
 */
#define perf_evsel__sc_tp_uint(evsel, name, sample) \
	({ struct syscall_tp *fields = evsel->priv; \
	   fields->name.integer(&fields->name, sample); })

#define perf_evsel__sc_tp_ptr(evsel, name, sample) \
	({ struct syscall_tp *fields = evsel->priv; \
	   fields->name.pointer(&fields->name, sample); })
285 
286 size_t strarray__scnprintf(struct strarray *sa, char *bf, size_t size, const char *intfmt, int val)
287 {
288 	int idx = val - sa->offset;
289 
290 	if (idx < 0 || idx >= sa->nr_entries)
291 		return scnprintf(bf, size, intfmt, val);
292 
293 	return scnprintf(bf, size, "%s", sa->entries[idx]);
294 }
295 
296 static size_t __syscall_arg__scnprintf_strarray(char *bf, size_t size,
297 						const char *intfmt,
298 					        struct syscall_arg *arg)
299 {
300 	return strarray__scnprintf(arg->parm, bf, size, intfmt, arg->val);
301 }
302 
303 static size_t syscall_arg__scnprintf_strarray(char *bf, size_t size,
304 					      struct syscall_arg *arg)
305 {
306 	return __syscall_arg__scnprintf_strarray(bf, size, "%d", arg);
307 }
308 
309 #define SCA_STRARRAY syscall_arg__scnprintf_strarray
310 
/*
 * A set of strarrays with different offsets, searched in order -- used
 * when one syscall arg's values come from disjoint ranges (e.g. fcntl
 * base commands plus the F_LINUX_SPECIFIC_BASE ones).
 */
struct strarrays {
	int		nr_entries;
	struct strarray **entries;
};

#define DEFINE_STRARRAYS(array) struct strarrays strarrays__##array = { \
	.nr_entries = ARRAY_SIZE(array), \
	.entries = array, \
}
320 
321 size_t syscall_arg__scnprintf_strarrays(char *bf, size_t size,
322 					struct syscall_arg *arg)
323 {
324 	struct strarrays *sas = arg->parm;
325 	int i;
326 
327 	for (i = 0; i < sas->nr_entries; ++i) {
328 		struct strarray *sa = sas->entries[i];
329 		int idx = arg->val - sa->offset;
330 
331 		if (idx >= 0 && idx < sa->nr_entries) {
332 			if (sa->entries[idx] == NULL)
333 				break;
334 			return scnprintf(bf, size, "%s", sa->entries[idx]);
335 		}
336 	}
337 
338 	return scnprintf(bf, size, "%d", arg->val);
339 }
340 
341 #ifndef AT_FDCWD
342 #define AT_FDCWD	-100
343 #endif
344 
345 static size_t syscall_arg__scnprintf_fd_at(char *bf, size_t size,
346 					   struct syscall_arg *arg)
347 {
348 	int fd = arg->val;
349 
350 	if (fd == AT_FDCWD)
351 		return scnprintf(bf, size, "CWD");
352 
353 	return syscall_arg__scnprintf_fd(bf, size, arg);
354 }
355 
356 #define SCA_FDAT syscall_arg__scnprintf_fd_at
357 
/* Forward declaration: defined after the fd-path bookkeeping it needs. */
static size_t syscall_arg__scnprintf_close_fd(char *bf, size_t size,
					      struct syscall_arg *arg);

#define SCA_CLOSE_FD syscall_arg__scnprintf_close_fd
362 
363 size_t syscall_arg__scnprintf_hex(char *bf, size_t size, struct syscall_arg *arg)
364 {
365 	return scnprintf(bf, size, "%#lx", arg->val);
366 }
367 
368 size_t syscall_arg__scnprintf_int(char *bf, size_t size, struct syscall_arg *arg)
369 {
370 	return scnprintf(bf, size, "%d", arg->val);
371 }
372 
373 size_t syscall_arg__scnprintf_long(char *bf, size_t size, struct syscall_arg *arg)
374 {
375 	return scnprintf(bf, size, "%ld", arg->val);
376 }
377 
/* bpf(2) 'cmd' values, indexed from BPF_MAP_CREATE == 0. */
static const char *bpf_cmd[] = {
	"MAP_CREATE", "MAP_LOOKUP_ELEM", "MAP_UPDATE_ELEM", "MAP_DELETE_ELEM",
	"MAP_GET_NEXT_KEY", "PROG_LOAD",
};
static DEFINE_STRARRAY(bpf_cmd);

/* epoll_ctl(2) ops; offset 1 because EPOLL_CTL_ADD == 1. */
static const char *epoll_ctl_ops[] = { "ADD", "DEL", "MOD", };
static DEFINE_STRARRAY_OFFSET(epoll_ctl_ops, 1);

/* {get,set}itimer(2) 'which' values. */
static const char *itimers[] = { "REAL", "VIRTUAL", "PROF", };
static DEFINE_STRARRAY(itimers);

/* keyctl(2) 'option' values. */
static const char *keyctl_options[] = {
	"GET_KEYRING_ID", "JOIN_SESSION_KEYRING", "UPDATE", "REVOKE", "CHOWN",
	"SETPERM", "DESCRIBE", "CLEAR", "LINK", "UNLINK", "SEARCH", "READ",
	"INSTANTIATE", "NEGATE", "SET_REQKEY_KEYRING", "SET_TIMEOUT",
	"ASSUME_AUTHORITY", "GET_SECURITY", "SESSION_TO_PARENT", "REJECT",
	"INSTANTIATE_IOV", "INVALIDATE", "GET_PERSISTENT",
};
static DEFINE_STRARRAY(keyctl_options);

/* lseek(2) 'whence' values; DATA/HOLE only where the headers define them. */
static const char *whences[] = { "SET", "CUR", "END",
#ifdef SEEK_DATA
"DATA",
#endif
#ifdef SEEK_HOLE
"HOLE",
#endif
};
static DEFINE_STRARRAY(whences);

/* fcntl(2) base commands, F_DUPFD == 0 onwards. */
static const char *fcntl_cmds[] = {
	"DUPFD", "GETFD", "SETFD", "GETFL", "SETFL", "GETLK", "SETLK",
	"SETLKW", "SETOWN", "GETOWN", "SETSIG", "GETSIG", "GETLK64",
	"SETLK64", "SETLKW64", "SETOWN_EX", "GETOWN_EX",
	"GETOWNER_UIDS",
};
static DEFINE_STRARRAY(fcntl_cmds);

/* fcntl(2) Linux-specific commands, starting at F_LINUX_SPECIFIC_BASE. */
static const char *fcntl_linux_specific_cmds[] = {
	"SETLEASE", "GETLEASE", "NOTIFY", [5] =	"CANCELLK", "DUPFD_CLOEXEC",
	"SETPIPE_SZ", "GETPIPE_SZ", "ADD_SEALS", "GET_SEALS",
	"GET_RW_HINT", "SET_RW_HINT", "GET_FILE_RW_HINT", "SET_FILE_RW_HINT",
};

static DEFINE_STRARRAY_OFFSET(fcntl_linux_specific_cmds, F_LINUX_SPECIFIC_BASE);

/* Both fcntl ranges, searched in order by SCA_FCNTL_CMD. */
static struct strarray *fcntl_cmds_arrays[] = {
	&strarray__fcntl_cmds,
	&strarray__fcntl_linux_specific_cmds,
};

static DEFINE_STRARRAYS(fcntl_cmds_arrays);

/* {get,set}rlimit(2)/prlimit64(2) 'resource' values. */
static const char *rlimit_resources[] = {
	"CPU", "FSIZE", "DATA", "STACK", "CORE", "RSS", "NPROC", "NOFILE",
	"MEMLOCK", "AS", "LOCKS", "SIGPENDING", "MSGQUEUE", "NICE", "RTPRIO",
	"RTTIME",
};
static DEFINE_STRARRAY(rlimit_resources);

/* rt_sigprocmask(2) 'how' values. */
static const char *sighow[] = { "BLOCK", "UNBLOCK", "SETMASK", };
static DEFINE_STRARRAY(sighow);

/* clock_gettime(2) et al clock ids. */
static const char *clockid[] = {
	"REALTIME", "MONOTONIC", "PROCESS_CPUTIME_ID", "THREAD_CPUTIME_ID",
	"MONOTONIC_RAW", "REALTIME_COARSE", "MONOTONIC_COARSE", "BOOTTIME",
	"REALTIME_ALARM", "BOOTTIME_ALARM", "SGI_CYCLE", "TAI"
};
static DEFINE_STRARRAY(clockid);

/* socket(2)/socketpair(2) address families, AF_UNSPEC == 0 onwards. */
static const char *socket_families[] = {
	"UNSPEC", "LOCAL", "INET", "AX25", "IPX", "APPLETALK", "NETROM",
	"BRIDGE", "ATMPVC", "X25", "INET6", "ROSE", "DECnet", "NETBEUI",
	"SECURITY", "KEY", "NETLINK", "PACKET", "ASH", "ECONET", "ATMSVC",
	"RDS", "SNA", "IRDA", "PPPOX", "WANPIPE", "LLC", "IB", "CAN", "TIPC",
	"BLUETOOTH", "IUCV", "RXRPC", "ISDN", "PHONET", "IEEE802154", "CAIF",
	"ALG", "NFC", "VSOCK",
};
static DEFINE_STRARRAY(socket_families);
458 
459 static size_t syscall_arg__scnprintf_access_mode(char *bf, size_t size,
460 						 struct syscall_arg *arg)
461 {
462 	size_t printed = 0;
463 	int mode = arg->val;
464 
465 	if (mode == F_OK) /* 0 */
466 		return scnprintf(bf, size, "F");
467 #define	P_MODE(n) \
468 	if (mode & n##_OK) { \
469 		printed += scnprintf(bf + printed, size - printed, "%s", #n); \
470 		mode &= ~n##_OK; \
471 	}
472 
473 	P_MODE(R);
474 	P_MODE(W);
475 	P_MODE(X);
476 #undef P_MODE
477 
478 	if (mode)
479 		printed += scnprintf(bf + printed, size - printed, "|%#x", mode);
480 
481 	return printed;
482 }
483 
484 #define SCA_ACCMODE syscall_arg__scnprintf_access_mode
485 
/* Forward declaration: needs the vfs_getname machinery defined later. */
static size_t syscall_arg__scnprintf_filename(char *bf, size_t size,
					      struct syscall_arg *arg);

#define SCA_FILENAME syscall_arg__scnprintf_filename
490 
491 static size_t syscall_arg__scnprintf_pipe_flags(char *bf, size_t size,
492 						struct syscall_arg *arg)
493 {
494 	int printed = 0, flags = arg->val;
495 
496 #define	P_FLAG(n) \
497 	if (flags & O_##n) { \
498 		printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
499 		flags &= ~O_##n; \
500 	}
501 
502 	P_FLAG(CLOEXEC);
503 	P_FLAG(NONBLOCK);
504 #undef P_FLAG
505 
506 	if (flags)
507 		printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);
508 
509 	return printed;
510 }
511 
512 #define SCA_PIPE_FLAGS syscall_arg__scnprintf_pipe_flags
513 
514 #ifndef GRND_NONBLOCK
515 #define GRND_NONBLOCK	0x0001
516 #endif
517 #ifndef GRND_RANDOM
518 #define GRND_RANDOM	0x0002
519 #endif
520 
521 static size_t syscall_arg__scnprintf_getrandom_flags(char *bf, size_t size,
522 						   struct syscall_arg *arg)
523 {
524 	int printed = 0, flags = arg->val;
525 
526 #define	P_FLAG(n) \
527 	if (flags & GRND_##n) { \
528 		printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
529 		flags &= ~GRND_##n; \
530 	}
531 
532 	P_FLAG(RANDOM);
533 	P_FLAG(NONBLOCK);
534 #undef P_FLAG
535 
536 	if (flags)
537 		printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);
538 
539 	return printed;
540 }
541 
542 #define SCA_GETRANDOM_FLAGS syscall_arg__scnprintf_getrandom_flags
543 
/*
 * Initializer for a syscall_arg_fmt whose values come straight from a
 * strarray; 'name' documents the arg and is not used in the expansion.
 */
#define STRARRAY(name, array) \
	  { .scnprintf	= SCA_STRARRAY, \
	    .parm	= &strarray__##array, }
547 
548 #include "trace/beauty/eventfd.c"
549 #include "trace/beauty/flock.c"
550 #include "trace/beauty/futex_op.c"
551 #include "trace/beauty/mmap.c"
552 #include "trace/beauty/mode_t.c"
553 #include "trace/beauty/msg_flags.c"
554 #include "trace/beauty/open_flags.c"
555 #include "trace/beauty/perf_event_open.c"
556 #include "trace/beauty/pid.c"
557 #include "trace/beauty/sched_policy.c"
558 #include "trace/beauty/seccomp.c"
559 #include "trace/beauty/signum.c"
560 #include "trace/beauty/socket_type.c"
561 #include "trace/beauty/waitid_options.c"
562 
/*
 * How to pretty-print one syscall argument: the formatter callback, an
 * opaque parameter for it (e.g. a strarray), the arg name, and whether
 * a zero value should still be printed.
 */
struct syscall_arg_fmt {
	size_t	   (*scnprintf)(char *bf, size_t size, struct syscall_arg *arg);
	void	   *parm;
	const char *name;
	bool	   show_zero;
};
569 
570 static struct syscall_fmt {
571 	const char *name;
572 	const char *alias;
573 	struct syscall_arg_fmt arg[6];
574 	u8	   nr_args;
575 	bool	   errpid;
576 	bool	   timeout;
577 	bool	   hexret;
578 } syscall_fmts[] = {
579 	{ .name	    = "access",
580 	  .arg = { [1] = { .scnprintf = SCA_ACCMODE,  /* mode */ }, }, },
581 	{ .name	    = "bpf",
582 	  .arg = { [0] = STRARRAY(cmd, bpf_cmd), }, },
583 	{ .name	    = "brk",	    .hexret = true,
584 	  .arg = { [0] = { .scnprintf = SCA_HEX, /* brk */ }, }, },
585 	{ .name     = "clock_gettime",
586 	  .arg = { [0] = STRARRAY(clk_id, clockid), }, },
587 	{ .name	    = "clone",	    .errpid = true, .nr_args = 5,
588 	  .arg = { [0] = { .name = "flags",	    .scnprintf = SCA_CLONE_FLAGS, },
589 		   [1] = { .name = "child_stack",   .scnprintf = SCA_HEX, },
590 		   [2] = { .name = "parent_tidptr", .scnprintf = SCA_HEX, },
591 		   [3] = { .name = "child_tidptr",  .scnprintf = SCA_HEX, },
592 		   [4] = { .name = "tls",	    .scnprintf = SCA_HEX, }, }, },
593 	{ .name	    = "close",
594 	  .arg = { [0] = { .scnprintf = SCA_CLOSE_FD, /* fd */ }, }, },
595 	{ .name	    = "epoll_ctl",
596 	  .arg = { [1] = STRARRAY(op, epoll_ctl_ops), }, },
597 	{ .name	    = "eventfd2",
598 	  .arg = { [1] = { .scnprintf = SCA_EFD_FLAGS, /* flags */ }, }, },
599 	{ .name	    = "fchmodat",
600 	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, },
601 	{ .name	    = "fchownat",
602 	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, },
603 	{ .name	    = "fcntl",
604 	  .arg = { [1] = { .scnprintf = SCA_FCNTL_CMD, /* cmd */
605 			   .parm      = &strarrays__fcntl_cmds_arrays,
606 			   .show_zero = true, },
607 		   [2] = { .scnprintf =  SCA_FCNTL_ARG, /* arg */ }, }, },
608 	{ .name	    = "flock",
609 	  .arg = { [1] = { .scnprintf = SCA_FLOCK, /* cmd */ }, }, },
610 	{ .name	    = "fstat", .alias = "newfstat", },
611 	{ .name	    = "fstatat", .alias = "newfstatat", },
612 	{ .name	    = "futex",
613 	  .arg = { [1] = { .scnprintf = SCA_FUTEX_OP, /* op */ }, }, },
614 	{ .name	    = "futimesat",
615 	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, },
616 	{ .name	    = "getitimer",
617 	  .arg = { [0] = STRARRAY(which, itimers), }, },
618 	{ .name	    = "getpid",	    .errpid = true, },
619 	{ .name	    = "getpgid",    .errpid = true, },
620 	{ .name	    = "getppid",    .errpid = true, },
621 	{ .name	    = "getrandom",
622 	  .arg = { [2] = { .scnprintf = SCA_GETRANDOM_FLAGS, /* flags */ }, }, },
623 	{ .name	    = "getrlimit",
624 	  .arg = { [0] = STRARRAY(resource, rlimit_resources), }, },
625 	{ .name	    = "ioctl",
626 	  .arg = {
627 #if defined(__i386__) || defined(__x86_64__)
628 /*
629  * FIXME: Make this available to all arches.
630  */
631 		   [1] = { .scnprintf = SCA_IOCTL_CMD, /* cmd */ },
632 		   [2] = { .scnprintf = SCA_HEX, /* arg */ }, }, },
633 #else
634 		   [2] = { .scnprintf = SCA_HEX, /* arg */ }, }, },
635 #endif
636 	{ .name	    = "kcmp",	    .nr_args = 5,
637 	  .arg = { [0] = { .name = "pid1",	.scnprintf = SCA_PID, },
638 		   [1] = { .name = "pid2",	.scnprintf = SCA_PID, },
639 		   [2] = { .name = "type",	.scnprintf = SCA_KCMP_TYPE, },
640 		   [3] = { .name = "idx1",	.scnprintf = SCA_KCMP_IDX, },
641 		   [4] = { .name = "idx2",	.scnprintf = SCA_KCMP_IDX, }, }, },
642 	{ .name	    = "keyctl",
643 	  .arg = { [0] = STRARRAY(option, keyctl_options), }, },
644 	{ .name	    = "kill",
645 	  .arg = { [1] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
646 	{ .name	    = "linkat",
647 	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, },
648 	{ .name	    = "lseek",
649 	  .arg = { [2] = STRARRAY(whence, whences), }, },
650 	{ .name	    = "lstat", .alias = "newlstat", },
651 	{ .name     = "madvise",
652 	  .arg = { [0] = { .scnprintf = SCA_HEX,      /* start */ },
653 		   [2] = { .scnprintf = SCA_MADV_BHV, /* behavior */ }, }, },
654 	{ .name	    = "mkdirat",
655 	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, },
656 	{ .name	    = "mknodat",
657 	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, },
658 	{ .name	    = "mlock",
659 	  .arg = { [0] = { .scnprintf = SCA_HEX, /* addr */ }, }, },
660 	{ .name	    = "mlockall",
661 	  .arg = { [0] = { .scnprintf = SCA_HEX, /* addr */ }, }, },
662 	{ .name	    = "mmap",	    .hexret = true,
663 /* The standard mmap maps to old_mmap on s390x */
664 #if defined(__s390x__)
665 	.alias = "old_mmap",
666 #endif
667 	  .arg = { [0] = { .scnprintf = SCA_HEX,	/* addr */ },
668 		   [2] = { .scnprintf = SCA_MMAP_PROT,	/* prot */ },
669 		   [3] = { .scnprintf = SCA_MMAP_FLAGS,	/* flags */ }, }, },
670 	{ .name	    = "mprotect",
671 	  .arg = { [0] = { .scnprintf = SCA_HEX,	/* start */ },
672 		   [2] = { .scnprintf = SCA_MMAP_PROT,	/* prot */ }, }, },
673 	{ .name	    = "mq_unlink",
674 	  .arg = { [0] = { .scnprintf = SCA_FILENAME, /* u_name */ }, }, },
675 	{ .name	    = "mremap",	    .hexret = true,
676 	  .arg = { [0] = { .scnprintf = SCA_HEX,	  /* addr */ },
677 		   [3] = { .scnprintf = SCA_MREMAP_FLAGS, /* flags */ },
678 		   [4] = { .scnprintf = SCA_HEX,	  /* new_addr */ }, }, },
679 	{ .name	    = "munlock",
680 	  .arg = { [0] = { .scnprintf = SCA_HEX, /* addr */ }, }, },
681 	{ .name	    = "munmap",
682 	  .arg = { [0] = { .scnprintf = SCA_HEX, /* addr */ }, }, },
683 	{ .name	    = "name_to_handle_at",
684 	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, },
685 	{ .name	    = "newfstatat",
686 	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, },
687 	{ .name	    = "open",
688 	  .arg = { [1] = { .scnprintf = SCA_OPEN_FLAGS, /* flags */ }, }, },
689 	{ .name	    = "open_by_handle_at",
690 	  .arg = { [0] = { .scnprintf = SCA_FDAT,	/* dfd */ },
691 		   [2] = { .scnprintf = SCA_OPEN_FLAGS, /* flags */ }, }, },
692 	{ .name	    = "openat",
693 	  .arg = { [0] = { .scnprintf = SCA_FDAT,	/* dfd */ },
694 		   [2] = { .scnprintf = SCA_OPEN_FLAGS, /* flags */ }, }, },
695 	{ .name	    = "perf_event_open",
696 	  .arg = { [2] = { .scnprintf = SCA_INT,	/* cpu */ },
697 		   [3] = { .scnprintf = SCA_FD,		/* group_fd */ },
698 		   [4] = { .scnprintf = SCA_PERF_FLAGS, /* flags */ }, }, },
699 	{ .name	    = "pipe2",
700 	  .arg = { [1] = { .scnprintf = SCA_PIPE_FLAGS, /* flags */ }, }, },
701 	{ .name	    = "pkey_alloc",
702 	  .arg = { [1] = { .scnprintf = SCA_PKEY_ALLOC_ACCESS_RIGHTS,	/* access_rights */ }, }, },
703 	{ .name	    = "pkey_free",
704 	  .arg = { [0] = { .scnprintf = SCA_INT,	/* key */ }, }, },
705 	{ .name	    = "pkey_mprotect",
706 	  .arg = { [0] = { .scnprintf = SCA_HEX,	/* start */ },
707 		   [2] = { .scnprintf = SCA_MMAP_PROT,	/* prot */ },
708 		   [3] = { .scnprintf = SCA_INT,	/* pkey */ }, }, },
709 	{ .name	    = "poll", .timeout = true, },
710 	{ .name	    = "ppoll", .timeout = true, },
711 	{ .name	    = "prctl", .alias = "arch_prctl",
712 	  .arg = { [0] = { .scnprintf = SCA_PRCTL_OPTION, /* option */ },
713 		   [1] = { .scnprintf = SCA_PRCTL_ARG2, /* arg2 */ },
714 		   [2] = { .scnprintf = SCA_PRCTL_ARG3, /* arg3 */ }, }, },
715 	{ .name	    = "pread", .alias = "pread64", },
716 	{ .name	    = "preadv", .alias = "pread", },
717 	{ .name	    = "prlimit64",
718 	  .arg = { [1] = STRARRAY(resource, rlimit_resources), }, },
719 	{ .name	    = "pwrite", .alias = "pwrite64", },
720 	{ .name	    = "readlinkat",
721 	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, },
722 	{ .name	    = "recvfrom",
723 	  .arg = { [3] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, },
724 	{ .name	    = "recvmmsg",
725 	  .arg = { [3] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, },
726 	{ .name	    = "recvmsg",
727 	  .arg = { [2] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, },
728 	{ .name	    = "renameat",
729 	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, },
730 	{ .name	    = "rt_sigaction",
731 	  .arg = { [0] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
732 	{ .name	    = "rt_sigprocmask",
733 	  .arg = { [0] = STRARRAY(how, sighow), }, },
734 	{ .name	    = "rt_sigqueueinfo",
735 	  .arg = { [1] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
736 	{ .name	    = "rt_tgsigqueueinfo",
737 	  .arg = { [2] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
738 	{ .name	    = "sched_setscheduler",
739 	  .arg = { [1] = { .scnprintf = SCA_SCHED_POLICY, /* policy */ }, }, },
740 	{ .name	    = "seccomp",
741 	  .arg = { [0] = { .scnprintf = SCA_SECCOMP_OP,	   /* op */ },
742 		   [1] = { .scnprintf = SCA_SECCOMP_FLAGS, /* flags */ }, }, },
743 	{ .name	    = "select", .timeout = true, },
744 	{ .name	    = "sendmmsg",
745 	  .arg = { [3] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, },
746 	{ .name	    = "sendmsg",
747 	  .arg = { [2] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, },
748 	{ .name	    = "sendto",
749 	  .arg = { [3] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, },
750 	{ .name	    = "set_tid_address", .errpid = true, },
751 	{ .name	    = "setitimer",
752 	  .arg = { [0] = STRARRAY(which, itimers), }, },
753 	{ .name	    = "setrlimit",
754 	  .arg = { [0] = STRARRAY(resource, rlimit_resources), }, },
755 	{ .name	    = "socket",
756 	  .arg = { [0] = STRARRAY(family, socket_families),
757 		   [1] = { .scnprintf = SCA_SK_TYPE, /* type */ }, }, },
758 	{ .name	    = "socketpair",
759 	  .arg = { [0] = STRARRAY(family, socket_families),
760 		   [1] = { .scnprintf = SCA_SK_TYPE, /* type */ }, }, },
761 	{ .name	    = "stat", .alias = "newstat", },
762 	{ .name	    = "statx",
763 	  .arg = { [0] = { .scnprintf = SCA_FDAT,	 /* fdat */ },
764 		   [2] = { .scnprintf = SCA_STATX_FLAGS, /* flags */ } ,
765 		   [3] = { .scnprintf = SCA_STATX_MASK,	 /* mask */ }, }, },
766 	{ .name	    = "swapoff",
767 	  .arg = { [0] = { .scnprintf = SCA_FILENAME, /* specialfile */ }, }, },
768 	{ .name	    = "swapon",
769 	  .arg = { [0] = { .scnprintf = SCA_FILENAME, /* specialfile */ }, }, },
770 	{ .name	    = "symlinkat",
771 	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, },
772 	{ .name	    = "tgkill",
773 	  .arg = { [2] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
774 	{ .name	    = "tkill",
775 	  .arg = { [1] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
776 	{ .name	    = "uname", .alias = "newuname", },
777 	{ .name	    = "unlinkat",
778 	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, },
779 	{ .name	    = "utimensat",
780 	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* dirfd */ }, }, },
781 	{ .name	    = "wait4",	    .errpid = true,
782 	  .arg = { [2] = { .scnprintf = SCA_WAITID_OPTIONS, /* options */ }, }, },
783 	{ .name	    = "waitid",	    .errpid = true,
784 	  .arg = { [3] = { .scnprintf = SCA_WAITID_OPTIONS, /* options */ }, }, },
785 };
786 
787 static int syscall_fmt__cmp(const void *name, const void *fmtp)
788 {
789 	const struct syscall_fmt *fmt = fmtp;
790 	return strcmp(name, fmt->name);
791 }
792 
793 static struct syscall_fmt *syscall_fmt__find(const char *name)
794 {
795 	const int nmemb = ARRAY_SIZE(syscall_fmts);
796 	return bsearch(name, syscall_fmts, nmemb, sizeof(struct syscall_fmt), syscall_fmt__cmp);
797 }
798 
/*
 * Per-syscall state assembled from the tracefs format file plus the
 * syscall_fmts[] overrides, indexed by syscall id in trace->syscalls.table.
 */
struct syscall {
	struct event_format *tp_format;
	int		    nr_args;
	struct format_field *args;	/* head of the tracepoint's arg fields */
	const char	    *name;
	bool		    is_exit;	/* presumably exit-like, never returns - confirm at init site */
	struct syscall_fmt  *fmt;	/* matching syscall_fmts[] entry, if any */
	struct syscall_arg_fmt *arg_fmt;
};
808 
809 /*
810  * We need to have this 'calculated' boolean because in some cases we really
811  * don't know what is the duration of a syscall, for instance, when we start
812  * a session and some threads are waiting for a syscall to finish, say 'poll',
813  * in which case all we can do is to print "( ? ) for duration and for the
814  * start timestamp.
815  */
816 static size_t fprintf_duration(unsigned long t, bool calculated, FILE *fp)
817 {
818 	double duration = (double)t / NSEC_PER_MSEC;
819 	size_t printed = fprintf(fp, "(");
820 
821 	if (!calculated)
822 		printed += fprintf(fp, "     ?   ");
823 	else if (duration >= 1.0)
824 		printed += color_fprintf(fp, PERF_COLOR_RED, "%6.3f ms", duration);
825 	else if (duration >= 0.01)
826 		printed += color_fprintf(fp, PERF_COLOR_YELLOW, "%6.3f ms", duration);
827 	else
828 		printed += color_fprintf(fp, PERF_COLOR_NORMAL, "%6.3f ms", duration);
829 	return printed + fprintf(fp, "): ");
830 }
831 
/**
 * filename.ptr: The filename char pointer that will be vfs_getname'd
 * filename.entry_str_pos: Where to insert the string translated from
 *                         filename.ptr by the vfs_getname tracepoint/kprobe.
 * ret_scnprintf: syscall args may set this to a different syscall return
 *                formatter, for instance, fcntl may return fds, file flags, etc.
 */
struct thread_trace {
	u64		  entry_time;	/* timestamp of the pending sys_enter */
	bool		  entry_pending;
	unsigned long	  nr_events;
	unsigned long	  pfmaj, pfmin;	/* major/minor page fault counts */
	char		  *entry_str;	/* staged sys_enter line, printed at sys_exit */
	double		  runtime_ms;
	size_t		  (*ret_scnprintf)(char *bf, size_t size, struct syscall_arg *arg);
	struct {
		unsigned long ptr;
		short int     entry_str_pos;
		bool	      pending_open;
		unsigned int  namelen;
		char	      *name;
	} filename;
	struct {
		/* fd -> pathname cache; max == -1 means not yet allocated */
		int	  max;
		char	  **table;
	} paths;

	struct intlist *syscall_stats;
};
861 
862 static struct thread_trace *thread_trace__new(void)
863 {
864 	struct thread_trace *ttrace =  zalloc(sizeof(struct thread_trace));
865 
866 	if (ttrace)
867 		ttrace->paths.max = -1;
868 
869 	ttrace->syscall_stats = intlist__new(NULL);
870 
871 	return ttrace;
872 }
873 
874 static struct thread_trace *thread__trace(struct thread *thread, FILE *fp)
875 {
876 	struct thread_trace *ttrace;
877 
878 	if (thread == NULL)
879 		goto fail;
880 
881 	if (thread__priv(thread) == NULL)
882 		thread__set_priv(thread, thread_trace__new());
883 
884 	if (thread__priv(thread) == NULL)
885 		goto fail;
886 
887 	ttrace = thread__priv(thread);
888 	++ttrace->nr_events;
889 
890 	return ttrace;
891 fail:
892 	color_fprintf(fp, PERF_COLOR_RED,
893 		      "WARNING: not enough memory, dropping samples!\n");
894 	return NULL;
895 }
896 
897 
898 void syscall_arg__set_ret_scnprintf(struct syscall_arg *arg,
899 				    size_t (*ret_scnprintf)(char *bf, size_t size, struct syscall_arg *arg))
900 {
901 	struct thread_trace *ttrace = thread__priv(arg->thread);
902 
903 	ttrace->ret_scnprintf = ret_scnprintf;
904 }
905 
/* Bits for trace->trace_pgfaults - presumably major/minor page faults; confirm at option parsing. */
#define TRACE_PFMAJ		(1 << 0)
#define TRACE_PFMIN		(1 << 1)

/* Size of the per-thread buffer where the sys_enter line is staged. */
static const size_t trace__entry_str_size = 2048;
910 
/*
 * Remember 'pathname' for 'fd' in the thread's fd->path cache, growing
 * the table as needed.  Returns 0 on success, -1 on allocation failure.
 */
static int trace__set_fd_pathname(struct thread *thread, int fd, const char *pathname)
{
	struct thread_trace *ttrace = thread__priv(thread);

	if (fd > ttrace->paths.max) {
		char **npath = realloc(ttrace->paths.table, (fd + 1) * sizeof(char *));

		if (npath == NULL)
			return -1;

		/* NULL-fill only the newly added slots (realloc doesn't zero). */
		if (ttrace->paths.max != -1) {
			memset(npath + ttrace->paths.max + 1, 0,
			       (fd - ttrace->paths.max) * sizeof(char *));
		} else {
			/* First allocation: paths.max == -1, zero the whole table. */
			memset(npath, 0, (fd + 1) * sizeof(char *));
		}

		ttrace->paths.table = npath;
		ttrace->paths.max   = fd;
	}

	/* NOTE(review): an existing entry is overwritten without being freed - possible leak on fd reuse. */
	ttrace->paths.table[fd] = strdup(pathname);

	return ttrace->paths.table[fd] != NULL ? 0 : -1;
}
936 
937 static int thread__read_fd_path(struct thread *thread, int fd)
938 {
939 	char linkname[PATH_MAX], pathname[PATH_MAX];
940 	struct stat st;
941 	int ret;
942 
943 	if (thread->pid_ == thread->tid) {
944 		scnprintf(linkname, sizeof(linkname),
945 			  "/proc/%d/fd/%d", thread->pid_, fd);
946 	} else {
947 		scnprintf(linkname, sizeof(linkname),
948 			  "/proc/%d/task/%d/fd/%d", thread->pid_, thread->tid, fd);
949 	}
950 
951 	if (lstat(linkname, &st) < 0 || st.st_size + 1 > (off_t)sizeof(pathname))
952 		return -1;
953 
954 	ret = readlink(linkname, pathname, sizeof(pathname));
955 
956 	if (ret < 0 || ret > st.st_size)
957 		return -1;
958 
959 	pathname[ret] = '\0';
960 	return trace__set_fd_pathname(thread, fd, pathname);
961 }
962 
963 static const char *thread__fd_path(struct thread *thread, int fd,
964 				   struct trace *trace)
965 {
966 	struct thread_trace *ttrace = thread__priv(thread);
967 
968 	if (ttrace == NULL)
969 		return NULL;
970 
971 	if (fd < 0)
972 		return NULL;
973 
974 	if ((fd > ttrace->paths.max || ttrace->paths.table[fd] == NULL)) {
975 		if (!trace->live)
976 			return NULL;
977 		++trace->stats.proc_getname;
978 		if (thread__read_fd_path(thread, fd))
979 			return NULL;
980 	}
981 
982 	return ttrace->paths.table[fd];
983 }
984 
985 size_t syscall_arg__scnprintf_fd(char *bf, size_t size, struct syscall_arg *arg)
986 {
987 	int fd = arg->val;
988 	size_t printed = scnprintf(bf, size, "%d", fd);
989 	const char *path = thread__fd_path(arg->thread, fd, arg->trace);
990 
991 	if (path)
992 		printed += scnprintf(bf + printed, size - printed, "<%s>", path);
993 
994 	return printed;
995 }
996 
997 size_t pid__scnprintf_fd(struct trace *trace, pid_t pid, int fd, char *bf, size_t size)
998 {
999         size_t printed = scnprintf(bf, size, "%d", fd);
1000 	struct thread *thread = machine__find_thread(trace->host, pid, pid);
1001 
1002 	if (thread) {
1003 		const char *path = thread__fd_path(thread, fd, trace);
1004 
1005 		if (path)
1006 			printed += scnprintf(bf + printed, size - printed, "<%s>", path);
1007 
1008 		thread__put(thread);
1009 	}
1010 
1011         return printed;
1012 }
1013 
1014 static size_t syscall_arg__scnprintf_close_fd(char *bf, size_t size,
1015 					      struct syscall_arg *arg)
1016 {
1017 	int fd = arg->val;
1018 	size_t printed = syscall_arg__scnprintf_fd(bf, size, arg);
1019 	struct thread_trace *ttrace = thread__priv(arg->thread);
1020 
1021 	if (ttrace && fd >= 0 && fd <= ttrace->paths.max)
1022 		zfree(&ttrace->paths.table[fd]);
1023 
1024 	return printed;
1025 }
1026 
1027 static void thread__set_filename_pos(struct thread *thread, const char *bf,
1028 				     unsigned long ptr)
1029 {
1030 	struct thread_trace *ttrace = thread__priv(thread);
1031 
1032 	ttrace->filename.ptr = ptr;
1033 	ttrace->filename.entry_str_pos = bf - ttrace->entry_str;
1034 }
1035 
1036 static size_t syscall_arg__scnprintf_filename(char *bf, size_t size,
1037 					      struct syscall_arg *arg)
1038 {
1039 	unsigned long ptr = arg->val;
1040 
1041 	if (!arg->trace->vfs_getname)
1042 		return scnprintf(bf, size, "%#x", ptr);
1043 
1044 	thread__set_filename_pos(arg->thread, bf, ptr);
1045 	return 0;
1046 }
1047 
1048 static bool trace__filter_duration(struct trace *trace, double t)
1049 {
1050 	return t < (trace->duration_filter * NSEC_PER_MSEC);
1051 }
1052 
1053 static size_t __trace__fprintf_tstamp(struct trace *trace, u64 tstamp, FILE *fp)
1054 {
1055 	double ts = (double)(tstamp - trace->base_time) / NSEC_PER_MSEC;
1056 
1057 	return fprintf(fp, "%10.3f ", ts);
1058 }
1059 
1060 /*
1061  * We're handling tstamp=0 as an undefined tstamp, i.e. like when we are
1062  * using ttrace->entry_time for a thread that receives a sys_exit without
1063  * first having received a sys_enter ("poll" issued before tracing session
1064  * starts, lost sys_enter exit due to ring buffer overflow).
1065  */
1066 static size_t trace__fprintf_tstamp(struct trace *trace, u64 tstamp, FILE *fp)
1067 {
1068 	if (tstamp > 0)
1069 		return __trace__fprintf_tstamp(trace, tstamp, fp);
1070 
1071 	return fprintf(fp, "         ? ");
1072 }
1073 
1074 static bool done = false;
1075 static bool interrupted = false;
1076 
1077 static void sig_handler(int sig)
1078 {
1079 	done = true;
1080 	interrupted = sig == SIGINT;
1081 }
1082 
1083 static size_t trace__fprintf_entry_head(struct trace *trace, struct thread *thread,
1084 					u64 duration, bool duration_calculated, u64 tstamp, FILE *fp)
1085 {
1086 	size_t printed = trace__fprintf_tstamp(trace, tstamp, fp);
1087 	printed += fprintf_duration(duration, duration_calculated, fp);
1088 
1089 	if (trace->multiple_threads) {
1090 		if (trace->show_comm)
1091 			printed += fprintf(fp, "%.14s/", thread__comm_str(thread));
1092 		printed += fprintf(fp, "%d ", thread->tid);
1093 	}
1094 
1095 	return printed;
1096 }
1097 
1098 static int trace__process_event(struct trace *trace, struct machine *machine,
1099 				union perf_event *event, struct perf_sample *sample)
1100 {
1101 	int ret = 0;
1102 
1103 	switch (event->header.type) {
1104 	case PERF_RECORD_LOST:
1105 		color_fprintf(trace->output, PERF_COLOR_RED,
1106 			      "LOST %" PRIu64 " events!\n", event->lost.lost);
1107 		ret = machine__process_lost_event(machine, event, sample);
1108 		break;
1109 	default:
1110 		ret = machine__process_event(machine, event, sample);
1111 		break;
1112 	}
1113 
1114 	return ret;
1115 }
1116 
1117 static int trace__tool_process(struct perf_tool *tool,
1118 			       union perf_event *event,
1119 			       struct perf_sample *sample,
1120 			       struct machine *machine)
1121 {
1122 	struct trace *trace = container_of(tool, struct trace, tool);
1123 	return trace__process_event(trace, machine, event, sample);
1124 }
1125 
1126 static char *trace__machine__resolve_kernel_addr(void *vmachine, unsigned long long *addrp, char **modp)
1127 {
1128 	struct machine *machine = vmachine;
1129 
1130 	if (machine->kptr_restrict_warned)
1131 		return NULL;
1132 
1133 	if (symbol_conf.kptr_restrict) {
1134 		pr_warning("Kernel address maps (/proc/{kallsyms,modules}) are restricted.\n\n"
1135 			   "Check /proc/sys/kernel/kptr_restrict.\n\n"
1136 			   "Kernel samples will not be resolved.\n");
1137 		machine->kptr_restrict_warned = true;
1138 		return NULL;
1139 	}
1140 
1141 	return machine__resolve_kernel_addr(vmachine, addrp, modp);
1142 }
1143 
/*
 * Initialize symbol resolution and create the "host" machine, registering
 * the kptr_restrict-aware kernel address resolver and synthesizing events
 * for already-running threads.  Undoes symbol__init() on any failure.
 */
static int trace__symbols_init(struct trace *trace, struct perf_evlist *evlist)
{
	int err = symbol__init(NULL);

	if (err)
		return err;

	trace->host = machine__new_host();
	if (trace->host == NULL)
		return -ENOMEM;

	err = trace_event__register_resolver(trace->host, trace__machine__resolve_kernel_addr);
	if (err < 0)
		goto out;

	/* Synthesize COMM/MMAP records for pre-existing tasks from /proc. */
	err = __machine__synthesize_threads(trace->host, &trace->tool, &trace->opts.target,
					    evlist->threads, trace__tool_process, false,
					    trace->opts.proc_map_timeout, 1);
out:
	if (err)
		symbol__exit();

	return err;
}
1168 
1169 static void trace__symbols__exit(struct trace *trace)
1170 {
1171 	machine__exit(trace->host);
1172 	trace->host = NULL;
1173 
1174 	symbol__exit();
1175 }
1176 
1177 static int syscall__alloc_arg_fmts(struct syscall *sc, int nr_args)
1178 {
1179 	int idx;
1180 
1181 	if (nr_args == 6 && sc->fmt && sc->fmt->nr_args != 0)
1182 		nr_args = sc->fmt->nr_args;
1183 
1184 	sc->arg_fmt = calloc(nr_args, sizeof(*sc->arg_fmt));
1185 	if (sc->arg_fmt == NULL)
1186 		return -1;
1187 
1188 	for (idx = 0; idx < nr_args; ++idx) {
1189 		if (sc->fmt)
1190 			sc->arg_fmt[idx] = sc->fmt->arg[idx];
1191 	}
1192 
1193 	sc->nr_args = nr_args;
1194 	return 0;
1195 }
1196 
/*
 * Pick a pretty-printer for each tracepoint argument that doesn't already
 * have one in the static syscall_fmts table, keying off the field's type
 * and name as exported in the tracefs format file.  Always returns 0.
 */
static int syscall__set_arg_fmts(struct syscall *sc)
{
	struct format_field *field;
	int idx = 0, len;

	for (field = sc->args; field; field = field->next, ++idx) {
		/* A formatter from the static table takes precedence. */
		if (sc->fmt && sc->fmt->arg[idx].scnprintf)
			continue;

		if (strcmp(field->type, "const char *") == 0 &&
			 (strcmp(field->name, "filename") == 0 ||
			  strcmp(field->name, "path") == 0 ||
			  strcmp(field->name, "pathname") == 0))
			sc->arg_fmt[idx].scnprintf = SCA_FILENAME;
		else if (field->flags & FIELD_IS_POINTER)
			sc->arg_fmt[idx].scnprintf = syscall_arg__scnprintf_hex;
		else if (strcmp(field->type, "pid_t") == 0)
			sc->arg_fmt[idx].scnprintf = SCA_PID;
		else if (strcmp(field->type, "umode_t") == 0)
			sc->arg_fmt[idx].scnprintf = SCA_MODE_T;
		else if ((strcmp(field->type, "int") == 0 ||
			  strcmp(field->type, "unsigned int") == 0 ||
			  strcmp(field->type, "long") == 0) &&
			 (len = strlen(field->name)) >= 2 &&
			 strcmp(field->name + len - 2, "fd") == 0) {
			/*
			 * Heuristic: integer fields whose name ends in "fd"
			 * are file descriptors.  Survey of the kernel's
			 * sys_enter* formats:
			 * egrep 'field:.*fd;' .../format|sed -r 's/.*field:([a-z ]+) [a-z_]*fd.+/\1/g'|sort|uniq -c
			 * 65 int
			 * 23 unsigned int
			 * 7 unsigned long
			 */
			sc->arg_fmt[idx].scnprintf = SCA_FD;
		}
	}

	return 0;
}
1235 
/*
 * Lazily fill trace->syscalls.table[id]: resolve the syscall's name, grow
 * the table if needed, look up its static fmt entry and its tracepoint
 * format, and install the per-argument printers.  Returns 0 or -1.
 */
static int trace__read_syscall_info(struct trace *trace, int id)
{
	char tp_name[128];
	struct syscall *sc;
	const char *name = syscalltbl__name(trace->sctbl, id);

	if (name == NULL)
		return -1;

	if (id > trace->syscalls.max) {
		struct syscall *nsyscalls = realloc(trace->syscalls.table, (id + 1) * sizeof(*sc));

		if (nsyscalls == NULL)
			return -1;

		if (trace->syscalls.max != -1) {
			/* Zero only the newly appended entries. */
			memset(nsyscalls + trace->syscalls.max + 1, 0,
			       (id - trace->syscalls.max) * sizeof(*sc));
		} else {
			/* First allocation (max == -1): clear everything. */
			memset(nsyscalls, 0, (id + 1) * sizeof(*sc));
		}

		trace->syscalls.table = nsyscalls;
		trace->syscalls.max   = id;
	}

	sc = trace->syscalls.table + id;
	sc->name = name;

	sc->fmt  = syscall_fmt__find(sc->name);

	snprintf(tp_name, sizeof(tp_name), "sys_enter_%s", sc->name);
	sc->tp_format = trace_event__tp_format("syscalls", tp_name);

	/* Some syscalls are only exposed under an alias, retry with that. */
	if (IS_ERR(sc->tp_format) && sc->fmt && sc->fmt->alias) {
		snprintf(tp_name, sizeof(tp_name), "sys_enter_%s", sc->fmt->alias);
		sc->tp_format = trace_event__tp_format("syscalls", tp_name);
	}

	/* Allocate arg_fmt first, even without a tp_format (assume 6 args). */
	if (syscall__alloc_arg_fmts(sc, IS_ERR(sc->tp_format) ? 6 : sc->tp_format->format.nr_fields))
		return -1;

	if (IS_ERR(sc->tp_format))
		return -1;

	sc->args = sc->tp_format->format.fields;
	/*
	 * We need to check and discard the first variable '__syscall_nr'
	 * or 'nr' that mean the syscall number. It is needless here.
	 * So drop '__syscall_nr' or 'nr' field but does not exist on older kernels.
	 */
	if (sc->args && (!strcmp(sc->args->name, "__syscall_nr") || !strcmp(sc->args->name, "nr"))) {
		sc->args = sc->args->next;
		--sc->nr_args;
	}

	sc->is_exit = !strcmp(name, "exit_group") || !strcmp(name, "exit");

	return syscall__set_arg_fmts(sc);
}
1296 
/*
 * Translate the -e/--expr strlist of syscall names (possibly globs) into
 * trace->ev_qualifier_ids.  Glob patterns may match several syscalls, so
 * the id array is grown on demand.  Returns 0, or a negative errno after
 * reporting every invalid name.
 */
static int trace__validate_ev_qualifier(struct trace *trace)
{
	int err = 0, i;
	size_t nr_allocated;
	struct str_node *pos;

	/* Start with one slot per qualifier entry; globs may need more. */
	trace->ev_qualifier_ids.nr = strlist__nr_entries(trace->ev_qualifier);
	trace->ev_qualifier_ids.entries = malloc(trace->ev_qualifier_ids.nr *
						 sizeof(trace->ev_qualifier_ids.entries[0]));

	if (trace->ev_qualifier_ids.entries == NULL) {
		fputs("Error:\tNot enough memory for allocating events qualifier ids\n",
		       trace->output);
		err = -EINVAL;
		goto out;
	}

	nr_allocated = trace->ev_qualifier_ids.nr;
	i = 0;

	strlist__for_each_entry(pos, trace->ev_qualifier) {
		const char *sc = pos->s;
		int id = syscalltbl__id(trace->sctbl, sc), match_next = -1;

		if (id < 0) {
			/* Not an exact name: try it as a glob pattern. */
			id = syscalltbl__strglobmatch_first(trace->sctbl, sc, &match_next);
			if (id >= 0)
				goto matches;

			/* Invalid name: keep going so all bad entries get listed. */
			if (err == 0) {
				fputs("Error:\tInvalid syscall ", trace->output);
				err = -EINVAL;
			} else {
				fputs(", ", trace->output);
			}

			fputs(sc, trace->output);
		}
matches:
		trace->ev_qualifier_ids.entries[i++] = id;
		if (match_next == -1)
			continue;

		/* Collect the remaining ids the glob matches, growing as needed. */
		while (1) {
			id = syscalltbl__strglobmatch_next(trace->sctbl, sc, &match_next);
			if (id < 0)
				break;
			if (nr_allocated == trace->ev_qualifier_ids.nr) {
				void *entries;

				nr_allocated += 8;
				entries = realloc(trace->ev_qualifier_ids.entries,
						  nr_allocated * sizeof(trace->ev_qualifier_ids.entries[0]));
				if (entries == NULL) {
					err = -ENOMEM;
					fputs("\nError:\t Not enough memory for parsing\n", trace->output);
					goto out_free;
				}
				trace->ev_qualifier_ids.entries = entries;
			}
			trace->ev_qualifier_ids.nr++;
			trace->ev_qualifier_ids.entries[i++] = id;
		}
	}

	if (err < 0) {
		fputs("\nHint:\ttry 'perf list syscalls:sys_enter_*'"
		      "\nHint:\tand: 'man syscalls'\n", trace->output);
out_free:
		zfree(&trace->ev_qualifier_ids.entries);
		trace->ev_qualifier_ids.nr = 0;
	}
out:
	return err;
}
1372 
1373 /*
1374  * args is to be interpreted as a series of longs but we need to handle
1375  * 8-byte unaligned accesses. args points to raw_data within the event
1376  * and raw_data is guaranteed to be 8-byte unaligned because it is
1377  * preceded by raw_size which is a u32. So we need to copy args to a temp
1378  * variable to read it. Most notably this avoids extended load instructions
1379  * on unaligned addresses
1380  */
1381 unsigned long syscall_arg__val(struct syscall_arg *arg, u8 idx)
1382 {
1383 	unsigned long val;
1384 	unsigned char *p = arg->args + sizeof(unsigned long) * idx;
1385 
1386 	memcpy(&val, p, sizeof(val));
1387 	return val;
1388 }
1389 
1390 static size_t syscall__scnprintf_name(struct syscall *sc, char *bf, size_t size,
1391 				      struct syscall_arg *arg)
1392 {
1393 	if (sc->arg_fmt && sc->arg_fmt[arg->idx].name)
1394 		return scnprintf(bf, size, "%s: ", sc->arg_fmt[arg->idx].name);
1395 
1396 	return scnprintf(bf, size, "arg%d: ", arg->idx);
1397 }
1398 
1399 static size_t syscall__scnprintf_val(struct syscall *sc, char *bf, size_t size,
1400 				     struct syscall_arg *arg, unsigned long val)
1401 {
1402 	if (sc->arg_fmt && sc->arg_fmt[arg->idx].scnprintf) {
1403 		arg->val = val;
1404 		if (sc->arg_fmt[arg->idx].parm)
1405 			arg->parm = sc->arg_fmt[arg->idx].parm;
1406 		return sc->arg_fmt[arg->idx].scnprintf(bf, size, arg);
1407 	}
1408 	return scnprintf(bf, size, "%ld", val);
1409 }
1410 
/*
 * Format all of a syscall's arguments into 'bf', using the per-argument
 * pretty-printers when available.  Zero-valued args without an associated
 * string-table entry are suppressed to keep the output compact.
 */
static size_t syscall__scnprintf_args(struct syscall *sc, char *bf, size_t size,
				      unsigned char *args, struct trace *trace,
				      struct thread *thread)
{
	size_t printed = 0;
	unsigned long val;
	u8 bit = 1;
	struct syscall_arg arg = {
		.args	= args,
		.idx	= 0,
		.mask	= 0,
		.trace  = trace,
		.thread = thread,
	};
	struct thread_trace *ttrace = thread__priv(thread);

	/*
	 * Things like fcntl will set this in its 'cmd' formatter to pick the
	 * right formatter for the return value (an fd? file flags?), which is
	 * not needed for syscalls that always return a given type, say an fd.
	 */
	ttrace->ret_scnprintf = NULL;

	if (sc->args != NULL) {
		struct format_field *field;

		for (field = sc->args; field;
		     field = field->next, ++arg.idx, bit <<= 1) {
			/* Skip args a prior formatter already consumed via arg.mask. */
			if (arg.mask & bit)
				continue;

			val = syscall_arg__val(&arg, arg.idx);

			/*
			 * Suppress this argument if its value is zero and
			 * we don't have a string associated in a strarray
			 * for it.
			 */
			if (val == 0 &&
			    !(sc->arg_fmt &&
			      (sc->arg_fmt[arg.idx].show_zero ||
			       sc->arg_fmt[arg.idx].scnprintf == SCA_STRARRAY ||
			       sc->arg_fmt[arg.idx].scnprintf == SCA_STRARRAYS) &&
			      sc->arg_fmt[arg.idx].parm))
				continue;

			printed += scnprintf(bf + printed, size - printed,
					     "%s%s: ", printed ? ", " : "", field->name);
			printed += syscall__scnprintf_val(sc, bf + printed, size - printed, &arg, val);
		}
	} else if (IS_ERR(sc->tp_format)) {
		/*
		 * If we managed to read the tracepoint /format file, then we
		 * may end up not having any args, like with gettid(), so only
		 * print the raw args when we didn't manage to read it.
		 */
		while (arg.idx < sc->nr_args) {
			if (arg.mask & bit)
				goto next_arg;
			val = syscall_arg__val(&arg, arg.idx);
			if (printed)
				printed += scnprintf(bf + printed, size - printed, ", ");
			printed += syscall__scnprintf_name(sc, bf + printed, size - printed, &arg);
			printed += syscall__scnprintf_val(sc, bf + printed, size - printed, &arg, val);
next_arg:
			++arg.idx;
			bit <<= 1;
		}
	}

	return printed;
}
1483 
1484 typedef int (*tracepoint_handler)(struct trace *trace, struct perf_evsel *evsel,
1485 				  union perf_event *event,
1486 				  struct perf_sample *sample);
1487 
/*
 * Map a raw syscall id to its struct syscall, lazily reading its info on
 * first use.  Returns NULL (with a diagnostic at sufficient verbosity)
 * for invalid ids or when the syscall info can't be read.
 */
static struct syscall *trace__syscall_info(struct trace *trace,
					   struct perf_evsel *evsel, int id)
{

	if (id < 0) {

		/*
		 * XXX: Noticed on x86_64, reproduced as far back as 3.0.36, haven't tried
		 * before that, leaving at a higher verbosity level till that is
		 * explained. Reproduced with plain ftrace with:
		 *
		 * echo 1 > /t/events/raw_syscalls/sys_exit/enable
		 * grep "NR -1 " /t/trace_pipe
		 *
		 * After generating some load on the machine.
		 */
		if (verbose > 1) {
			static u64 n;
			fprintf(trace->output, "Invalid syscall %d id, skipping (%s, %" PRIu64 ") ...\n",
				id, perf_evsel__name(evsel), ++n);
		}
		return NULL;
	}

	if ((id > trace->syscalls.max || trace->syscalls.table[id].name == NULL) &&
	    trace__read_syscall_info(trace, id))
		goto out_cant_read;

	/* Still unknown after the read attempt? Give up. */
	if ((id > trace->syscalls.max || trace->syscalls.table[id].name == NULL))
		goto out_cant_read;

	return &trace->syscalls.table[id];

out_cant_read:
	if (verbose > 0) {
		fprintf(trace->output, "Problems reading syscall %d", id);
		if (id <= trace->syscalls.max && trace->syscalls.table[id].name != NULL)
			fprintf(trace->output, "(%s)", trace->syscalls.table[id].name);
		fputs(" information\n", trace->output);
	}
	return NULL;
}
1530 
1531 static void thread__update_stats(struct thread_trace *ttrace,
1532 				 int id, struct perf_sample *sample)
1533 {
1534 	struct int_node *inode;
1535 	struct stats *stats;
1536 	u64 duration = 0;
1537 
1538 	inode = intlist__findnew(ttrace->syscall_stats, id);
1539 	if (inode == NULL)
1540 		return;
1541 
1542 	stats = inode->priv;
1543 	if (stats == NULL) {
1544 		stats = malloc(sizeof(struct stats));
1545 		if (stats == NULL)
1546 			return;
1547 		init_stats(stats);
1548 		inode->priv = stats;
1549 	}
1550 
1551 	if (ttrace->entry_time && sample->time > ttrace->entry_time)
1552 		duration = sample->time - ttrace->entry_time;
1553 
1554 	update_stats(stats, duration);
1555 }
1556 
1557 static int trace__printf_interrupted_entry(struct trace *trace, struct perf_sample *sample)
1558 {
1559 	struct thread_trace *ttrace;
1560 	u64 duration;
1561 	size_t printed;
1562 
1563 	if (trace->current == NULL)
1564 		return 0;
1565 
1566 	ttrace = thread__priv(trace->current);
1567 
1568 	if (!ttrace->entry_pending)
1569 		return 0;
1570 
1571 	duration = sample->time - ttrace->entry_time;
1572 
1573 	printed  = trace__fprintf_entry_head(trace, trace->current, duration, true, ttrace->entry_time, trace->output);
1574 	printed += fprintf(trace->output, "%-70s) ...\n", ttrace->entry_str);
1575 	ttrace->entry_pending = false;
1576 
1577 	return printed;
1578 }
1579 
/*
 * raw_syscalls:sys_enter handler: format the syscall name and arguments
 * into the thread's entry_str.  For non-exit syscalls the line is kept
 * pending until the matching sys_exit so the return value can be appended.
 */
static int trace__sys_enter(struct trace *trace, struct perf_evsel *evsel,
			    union perf_event *event __maybe_unused,
			    struct perf_sample *sample)
{
	char *msg;
	void *args;
	size_t printed = 0;
	struct thread *thread;
	int id = perf_evsel__sc_tp_uint(evsel, id, sample), err = -1;
	struct syscall *sc = trace__syscall_info(trace, evsel, id);
	struct thread_trace *ttrace;

	if (sc == NULL)
		return -1;

	thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
	ttrace = thread__trace(thread, trace->output);
	if (ttrace == NULL)
		goto out_put;

	args = perf_evsel__sc_tp_ptr(evsel, args, sample);

	/* Lazily allocate the buffer the formatted entry accumulates into. */
	if (ttrace->entry_str == NULL) {
		ttrace->entry_str = malloc(trace__entry_str_size);
		if (!ttrace->entry_str)
			goto out_put;
	}

	if (!(trace->duration_filter || trace->summary_only || trace->min_stack))
		trace__printf_interrupted_entry(trace, sample);

	ttrace->entry_time = sample->time;
	msg = ttrace->entry_str;
	printed += scnprintf(msg + printed, trace__entry_str_size - printed, "%s(", sc->name);

	printed += syscall__scnprintf_args(sc, msg + printed, trace__entry_str_size - printed,
					   args, trace, thread);

	if (sc->is_exit) {
		/* exit()/exit_group() never return: print the line right away. */
		if (!(trace->duration_filter || trace->summary_only || trace->min_stack)) {
			trace__fprintf_entry_head(trace, thread, 0, false, ttrace->entry_time, trace->output);
			fprintf(trace->output, "%-70s)\n", ttrace->entry_str);
		}
	} else {
		ttrace->entry_pending = true;
		/* See trace__vfs_getname & trace__sys_exit */
		ttrace->filename.pending_open = false;
	}

	/* Remember the last thread seen, for trace__printf_interrupted_entry(). */
	if (trace->current != thread) {
		thread__put(trace->current);
		trace->current = thread__get(thread);
	}
	err = 0;
out_put:
	thread__put(thread);
	return err;
}
1638 
1639 static int trace__resolve_callchain(struct trace *trace, struct perf_evsel *evsel,
1640 				    struct perf_sample *sample,
1641 				    struct callchain_cursor *cursor)
1642 {
1643 	struct addr_location al;
1644 
1645 	if (machine__resolve(trace->host, &al, sample) < 0 ||
1646 	    thread__resolve_callchain(al.thread, cursor, evsel, sample, NULL, NULL, trace->max_stack))
1647 		return -1;
1648 
1649 	return 0;
1650 }
1651 
1652 static int trace__fprintf_callchain(struct trace *trace, struct perf_sample *sample)
1653 {
1654 	/* TODO: user-configurable print_opts */
1655 	const unsigned int print_opts = EVSEL__PRINT_SYM |
1656 				        EVSEL__PRINT_DSO |
1657 				        EVSEL__PRINT_UNKNOWN_AS_ADDR;
1658 
1659 	return sample__fprintf_callchain(sample, 38, print_opts, &callchain_cursor, trace->output);
1660 }
1661 
/*
 * raw_syscalls:sys_exit handler: complete the pending sys_enter line (or
 * print a "continued" marker when there is none), then pretty-print the
 * return value per the syscall's fmt: errno name, hex, pid, timeout or a
 * one-shot formatter installed by an argument printer.
 */
static int trace__sys_exit(struct trace *trace, struct perf_evsel *evsel,
			   union perf_event *event __maybe_unused,
			   struct perf_sample *sample)
{
	long ret;
	u64 duration = 0;
	bool duration_calculated = false;
	struct thread *thread;
	int id = perf_evsel__sc_tp_uint(evsel, id, sample), err = -1, callchain_ret = 0;
	struct syscall *sc = trace__syscall_info(trace, evsel, id);
	struct thread_trace *ttrace;

	if (sc == NULL)
		return -1;

	thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
	ttrace = thread__trace(thread, trace->output);
	if (ttrace == NULL)
		goto out_put;

	if (trace->summary)
		thread__update_stats(ttrace, id, sample);

	ret = perf_evsel__sc_tp_uint(evsel, ret, sample);

	/* Successful open(): cache the fd -> pathname captured by vfs_getname. */
	if (id == trace->open_id && ret >= 0 && ttrace->filename.pending_open) {
		trace__set_fd_pathname(thread, ret, ttrace->filename.name);
		ttrace->filename.pending_open = false;
		++trace->stats.vfs_getname;
	}

	if (ttrace->entry_time) {
		duration = sample->time - ttrace->entry_time;
		if (trace__filter_duration(trace, duration))
			goto out;
		duration_calculated = true;
	} else if (trace->duration_filter)
		goto out;

	if (sample->callchain) {
		callchain_ret = trace__resolve_callchain(trace, evsel, sample, &callchain_cursor);
		if (callchain_ret == 0) {
			/* Shallower than --min-stack: skip printing this event. */
			if (callchain_cursor.nr < trace->min_stack)
				goto out;
			callchain_ret = 1;
		}
	}

	if (trace->summary_only)
		goto out;

	trace__fprintf_entry_head(trace, thread, duration, duration_calculated, ttrace->entry_time, trace->output);

	if (ttrace->entry_pending) {
		fprintf(trace->output, "%-70s", ttrace->entry_str);
	} else {
		/* The entry line was flushed earlier or never seen at all. */
		fprintf(trace->output, " ... [");
		color_fprintf(trace->output, PERF_COLOR_YELLOW, "continued");
		fprintf(trace->output, "]: %s()", sc->name);
	}

	if (sc->fmt == NULL) {
		if (ret < 0)
			goto errno_print;
signed_print:
		fprintf(trace->output, ") = %ld", ret);
	} else if (ret < 0) {
errno_print: {
		char bf[STRERR_BUFSIZE];
		const char *emsg = str_error_r(-ret, bf, sizeof(bf)),
			   *e = audit_errno_to_name(-ret);

		fprintf(trace->output, ") = -1 %s %s", e, emsg);
	}
	} else if (ret == 0 && sc->fmt->timeout)
		fprintf(trace->output, ") = 0 Timeout");
	else if (ttrace->ret_scnprintf) {
		char bf[1024];
		struct syscall_arg arg = {
			.val	= ret,
			.thread	= thread,
			.trace	= trace,
		};
		/* One-shot formatter set via syscall_arg__set_ret_scnprintf(). */
		ttrace->ret_scnprintf(bf, sizeof(bf), &arg);
		ttrace->ret_scnprintf = NULL;
		fprintf(trace->output, ") = %s", bf);
	} else if (sc->fmt->hexret)
		fprintf(trace->output, ") = %#lx", ret);
	else if (sc->fmt->errpid) {
		struct thread *child = machine__find_thread(trace->host, ret, ret);

		if (child != NULL) {
			fprintf(trace->output, ") = %ld", ret);
			if (child->comm_set)
				fprintf(trace->output, " (%s)", thread__comm_str(child));
			thread__put(child);
		}
	} else
		goto signed_print;

	fputc('\n', trace->output);

	if (callchain_ret > 0)
		trace__fprintf_callchain(trace, sample);
	else if (callchain_ret < 0)
		pr_err("Problem processing %s callchain, skipping...\n", perf_evsel__name(evsel));
out:
	ttrace->entry_pending = false;
	err = 0;
out_put:
	thread__put(thread);
	return err;
}
1775 
/*
 * probe:vfs_getname handler: capture the pathname being resolved, splice
 * it into the pending sys_enter line at the position recorded by
 * syscall_arg__scnprintf_filename(), and remember it for fd-path caching
 * on a successful open (see trace__sys_exit).  Always returns 0.
 */
static int trace__vfs_getname(struct trace *trace, struct perf_evsel *evsel,
			      union perf_event *event __maybe_unused,
			      struct perf_sample *sample)
{
	struct thread *thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
	struct thread_trace *ttrace;
	size_t filename_len, entry_str_len, to_move;
	ssize_t remaining_space;
	char *pos;
	const char *filename = perf_evsel__rawptr(evsel, sample, "pathname");

	if (!thread)
		goto out;

	ttrace = thread__priv(thread);
	if (!ttrace)
		goto out_put;

	filename_len = strlen(filename);
	if (filename_len == 0)
		goto out_put;

	/* Grow the per-thread filename copy when the new name is longer. */
	if (ttrace->filename.namelen < filename_len) {
		char *f = realloc(ttrace->filename.name, filename_len + 1);

		if (f == NULL)
			goto out_put;

		ttrace->filename.namelen = filename_len;
		ttrace->filename.name = f;
	}

	strcpy(ttrace->filename.name, filename);
	ttrace->filename.pending_open = true;

	/* No splice point recorded in entry_str: nothing more to do. */
	if (!ttrace->filename.ptr)
		goto out_put;

	entry_str_len = strlen(ttrace->entry_str);
	remaining_space = trace__entry_str_size - entry_str_len - 1; /* \0 */
	if (remaining_space <= 0)
		goto out_put;

	/* Keep the (more specific) tail when the full name doesn't fit. */
	if (filename_len > (size_t)remaining_space) {
		filename += filename_len - remaining_space;
		filename_len = remaining_space;
	}

	/* Open a gap at the recorded position and copy the name into it. */
	to_move = entry_str_len - ttrace->filename.entry_str_pos + 1; /* \0 */
	pos = ttrace->entry_str + ttrace->filename.entry_str_pos;
	memmove(pos + filename_len, pos, to_move);
	memcpy(pos, filename, filename_len);

	ttrace->filename.ptr = 0;
	ttrace->filename.entry_str_pos = 0;
out_put:
	thread__put(thread);
out:
	return 0;
}
1836 
1837 static int trace__sched_stat_runtime(struct trace *trace, struct perf_evsel *evsel,
1838 				     union perf_event *event __maybe_unused,
1839 				     struct perf_sample *sample)
1840 {
1841         u64 runtime = perf_evsel__intval(evsel, sample, "runtime");
1842 	double runtime_ms = (double)runtime / NSEC_PER_MSEC;
1843 	struct thread *thread = machine__findnew_thread(trace->host,
1844 							sample->pid,
1845 							sample->tid);
1846 	struct thread_trace *ttrace = thread__trace(thread, trace->output);
1847 
1848 	if (ttrace == NULL)
1849 		goto out_dump;
1850 
1851 	ttrace->runtime_ms += runtime_ms;
1852 	trace->runtime_ms += runtime_ms;
1853 out_put:
1854 	thread__put(thread);
1855 	return 0;
1856 
1857 out_dump:
1858 	fprintf(trace->output, "%s: comm=%s,pid=%u,runtime=%" PRIu64 ",vruntime=%" PRIu64 ")\n",
1859 	       evsel->name,
1860 	       perf_evsel__strval(evsel, sample, "comm"),
1861 	       (pid_t)perf_evsel__intval(evsel, sample, "pid"),
1862 	       runtime,
1863 	       perf_evsel__intval(evsel, sample, "vruntime"));
1864 	goto out_put;
1865 }
1866 
1867 static int bpf_output__printer(enum binary_printer_ops op,
1868 			       unsigned int val, void *extra __maybe_unused, FILE *fp)
1869 {
1870 	unsigned char ch = (unsigned char)val;
1871 
1872 	switch (op) {
1873 	case BINARY_PRINT_CHAR_DATA:
1874 		return fprintf(fp, "%c", isprint(ch) ? ch : '.');
1875 	case BINARY_PRINT_DATA_BEGIN:
1876 	case BINARY_PRINT_LINE_BEGIN:
1877 	case BINARY_PRINT_ADDR:
1878 	case BINARY_PRINT_NUM_DATA:
1879 	case BINARY_PRINT_NUM_PAD:
1880 	case BINARY_PRINT_SEP:
1881 	case BINARY_PRINT_CHAR_PAD:
1882 	case BINARY_PRINT_LINE_END:
1883 	case BINARY_PRINT_DATA_END:
1884 	default:
1885 		break;
1886 	}
1887 
1888 	return 0;
1889 }
1890 
1891 static void bpf_output__fprintf(struct trace *trace,
1892 				struct perf_sample *sample)
1893 {
1894 	binary__fprintf(sample->raw_data, sample->raw_size, 8,
1895 			bpf_output__printer, NULL, trace->output);
1896 }
1897 
/*
 * Handler for the non-syscall events requested via --event: print the
 * timestamp, the event name, and either the BPF output payload or the
 * tracepoint fields, plus the callchain when requested.
 */
static int trace__event_handler(struct trace *trace, struct perf_evsel *evsel,
				union perf_event *event __maybe_unused,
				struct perf_sample *sample)
{
	int callchain_ret = 0;

	if (sample->callchain) {
		callchain_ret = trace__resolve_callchain(trace, evsel, sample, &callchain_cursor);
		if (callchain_ret == 0) {
			/* Shallower than --min-stack: suppress the event. */
			if (callchain_cursor.nr < trace->min_stack)
				goto out;
			callchain_ret = 1;
		}
	}

	trace__printf_interrupted_entry(trace, sample);
	trace__fprintf_tstamp(trace, sample->time, trace->output);

	/* Keep columns aligned with the syscall lines' duration field. */
	if (trace->trace_syscalls)
		fprintf(trace->output, "(         ): ");

	fprintf(trace->output, "%s:", evsel->name);

	if (perf_evsel__is_bpf_output(evsel)) {
		bpf_output__fprintf(trace, sample);
	} else if (evsel->tp_format) {
		event_format__fprintf(evsel->tp_format, sample->cpu,
				      sample->raw_data, sample->raw_size,
				      trace->output);
	}

	fprintf(trace->output, ")\n");

	if (callchain_ret > 0)
		trace__fprintf_callchain(trace, sample);
	else if (callchain_ret < 0)
		pr_err("Problem processing %s callchain, skipping...\n", perf_evsel__name(evsel));
out:
	return 0;
}
1938 
1939 static void print_location(FILE *f, struct perf_sample *sample,
1940 			   struct addr_location *al,
1941 			   bool print_dso, bool print_sym)
1942 {
1943 
1944 	if ((verbose > 0 || print_dso) && al->map)
1945 		fprintf(f, "%s@", al->map->dso->long_name);
1946 
1947 	if ((verbose > 0 || print_sym) && al->sym)
1948 		fprintf(f, "%s+0x%" PRIx64, al->sym->name,
1949 			al->addr - al->sym->start);
1950 	else if (al->map)
1951 		fprintf(f, "0x%" PRIx64, al->addr);
1952 	else
1953 		fprintf(f, "0x%" PRIx64, sample->addr);
1954 }
1955 
/*
 * Handler for the page fault software events (see perf_evsel__new_pgfault):
 * bump the per-thread maj/min fault counters and, unless --summary-only,
 * print a "majfault [sym] => target (type)" line resolving both the
 * faulting IP and the faulted-on address.  Returns 0 on success, -1 when
 * the thread_trace can't be obtained or the callchain is below min_stack.
 */
static int trace__pgfault(struct trace *trace,
			  struct perf_evsel *evsel,
			  union perf_event *event __maybe_unused,
			  struct perf_sample *sample)
{
	struct thread *thread;
	struct addr_location al;
	char map_type = 'd';	/* 'd'ata; becomes 'x' (executable) or '?' below */
	struct thread_trace *ttrace;
	int err = -1;
	int callchain_ret = 0;

	thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);

	if (sample->callchain) {
		callchain_ret = trace__resolve_callchain(trace, evsel, sample, &callchain_cursor);
		if (callchain_ret == 0) {
			/* Too shallow for --min-stack: skip this sample entirely. */
			if (callchain_cursor.nr < trace->min_stack)
				goto out_put;
			callchain_ret = 1;
		}
	}

	ttrace = thread__trace(thread, trace->output);
	if (ttrace == NULL)
		goto out_put;

	if (evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ)
		ttrace->pfmaj++;
	else
		ttrace->pfmin++;

	/* Counters were updated; with --summary-only nothing gets printed. */
	if (trace->summary_only)
		goto out;

	/* Resolve the faulting instruction pointer to dso/symbol. */
	thread__find_addr_location(thread, sample->cpumode, MAP__FUNCTION,
			      sample->ip, &al);

	trace__fprintf_entry_head(trace, thread, 0, true, sample->time, trace->output);

	fprintf(trace->output, "%sfault [",
		evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ ?
		"maj" : "min");

	print_location(trace->output, sample, &al, false, true);

	fprintf(trace->output, "] => ");

	/* Now resolve the address that was faulted on, data map first. */
	thread__find_addr_location(thread, sample->cpumode, MAP__VARIABLE,
				   sample->addr, &al);

	if (!al.map) {
		/* Not in a data map: maybe it was a code (executable) fault. */
		thread__find_addr_location(thread, sample->cpumode,
					   MAP__FUNCTION, sample->addr, &al);

		if (al.map)
			map_type = 'x';
		else
			map_type = '?';
	}

	print_location(trace->output, sample, &al, true, false);

	fprintf(trace->output, " (%c%c)\n", map_type, al.level);

	if (callchain_ret > 0)
		trace__fprintf_callchain(trace, sample);
	else if (callchain_ret < 0)
		pr_err("Problem processing %s callchain, skipping...\n", perf_evsel__name(evsel));
out:
	err = 0;
out_put:
	thread__put(thread);
	return err;
}
2031 
2032 static void trace__set_base_time(struct trace *trace,
2033 				 struct perf_evsel *evsel,
2034 				 struct perf_sample *sample)
2035 {
2036 	/*
2037 	 * BPF events were not setting PERF_SAMPLE_TIME, so be more robust
2038 	 * and don't use sample->time unconditionally, we may end up having
2039 	 * some other event in the future without PERF_SAMPLE_TIME for good
2040 	 * reason, i.e. we may not be interested in its timestamps, just in
2041 	 * it taking place, picking some piece of information when it
2042 	 * appears in our event stream (vfs_getname comes to mind).
2043 	 */
2044 	if (trace->base_time == 0 && !trace->full_time &&
2045 	    (evsel->attr.sample_type & PERF_SAMPLE_TIME))
2046 		trace->base_time = sample->time;
2047 }
2048 
2049 static int trace__process_sample(struct perf_tool *tool,
2050 				 union perf_event *event,
2051 				 struct perf_sample *sample,
2052 				 struct perf_evsel *evsel,
2053 				 struct machine *machine __maybe_unused)
2054 {
2055 	struct trace *trace = container_of(tool, struct trace, tool);
2056 	struct thread *thread;
2057 	int err = 0;
2058 
2059 	tracepoint_handler handler = evsel->handler;
2060 
2061 	thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
2062 	if (thread && thread__is_filtered(thread))
2063 		goto out;
2064 
2065 	trace__set_base_time(trace, evsel, sample);
2066 
2067 	if (handler) {
2068 		++trace->nr_events;
2069 		handler(trace, evsel, event, sample);
2070 	}
2071 out:
2072 	thread__put(thread);
2073 	return err;
2074 }
2075 
2076 static int trace__record(struct trace *trace, int argc, const char **argv)
2077 {
2078 	unsigned int rec_argc, i, j;
2079 	const char **rec_argv;
2080 	const char * const record_args[] = {
2081 		"record",
2082 		"-R",
2083 		"-m", "1024",
2084 		"-c", "1",
2085 	};
2086 
2087 	const char * const sc_args[] = { "-e", };
2088 	unsigned int sc_args_nr = ARRAY_SIZE(sc_args);
2089 	const char * const majpf_args[] = { "-e", "major-faults" };
2090 	unsigned int majpf_args_nr = ARRAY_SIZE(majpf_args);
2091 	const char * const minpf_args[] = { "-e", "minor-faults" };
2092 	unsigned int minpf_args_nr = ARRAY_SIZE(minpf_args);
2093 
2094 	/* +1 is for the event string below */
2095 	rec_argc = ARRAY_SIZE(record_args) + sc_args_nr + 1 +
2096 		majpf_args_nr + minpf_args_nr + argc;
2097 	rec_argv = calloc(rec_argc + 1, sizeof(char *));
2098 
2099 	if (rec_argv == NULL)
2100 		return -ENOMEM;
2101 
2102 	j = 0;
2103 	for (i = 0; i < ARRAY_SIZE(record_args); i++)
2104 		rec_argv[j++] = record_args[i];
2105 
2106 	if (trace->trace_syscalls) {
2107 		for (i = 0; i < sc_args_nr; i++)
2108 			rec_argv[j++] = sc_args[i];
2109 
2110 		/* event string may be different for older kernels - e.g., RHEL6 */
2111 		if (is_valid_tracepoint("raw_syscalls:sys_enter"))
2112 			rec_argv[j++] = "raw_syscalls:sys_enter,raw_syscalls:sys_exit";
2113 		else if (is_valid_tracepoint("syscalls:sys_enter"))
2114 			rec_argv[j++] = "syscalls:sys_enter,syscalls:sys_exit";
2115 		else {
2116 			pr_err("Neither raw_syscalls nor syscalls events exist.\n");
2117 			free(rec_argv);
2118 			return -1;
2119 		}
2120 	}
2121 
2122 	if (trace->trace_pgfaults & TRACE_PFMAJ)
2123 		for (i = 0; i < majpf_args_nr; i++)
2124 			rec_argv[j++] = majpf_args[i];
2125 
2126 	if (trace->trace_pgfaults & TRACE_PFMIN)
2127 		for (i = 0; i < minpf_args_nr; i++)
2128 			rec_argv[j++] = minpf_args[i];
2129 
2130 	for (i = 0; i < (unsigned int)argc; i++)
2131 		rec_argv[j++] = argv[i];
2132 
2133 	return cmd_record(j, rec_argv);
2134 }
2135 
2136 static size_t trace__fprintf_thread_summary(struct trace *trace, FILE *fp);
2137 
2138 static bool perf_evlist__add_vfs_getname(struct perf_evlist *evlist)
2139 {
2140 	struct perf_evsel *evsel = perf_evsel__newtp("probe", "vfs_getname");
2141 
2142 	if (IS_ERR(evsel))
2143 		return false;
2144 
2145 	if (perf_evsel__field(evsel, "pathname") == NULL) {
2146 		perf_evsel__delete(evsel);
2147 		return false;
2148 	}
2149 
2150 	evsel->handler = trace__vfs_getname;
2151 	perf_evlist__add(evlist, evsel);
2152 	return true;
2153 }
2154 
2155 static struct perf_evsel *perf_evsel__new_pgfault(u64 config)
2156 {
2157 	struct perf_evsel *evsel;
2158 	struct perf_event_attr attr = {
2159 		.type = PERF_TYPE_SOFTWARE,
2160 		.mmap_data = 1,
2161 	};
2162 
2163 	attr.config = config;
2164 	attr.sample_period = 1;
2165 
2166 	event_attr_init(&attr);
2167 
2168 	evsel = perf_evsel__new(&attr);
2169 	if (evsel)
2170 		evsel->handler = trace__pgfault;
2171 
2172 	return evsel;
2173 }
2174 
2175 static void trace__handle_event(struct trace *trace, union perf_event *event, struct perf_sample *sample)
2176 {
2177 	const u32 type = event->header.type;
2178 	struct perf_evsel *evsel;
2179 
2180 	if (type != PERF_RECORD_SAMPLE) {
2181 		trace__process_event(trace, trace->host, event, sample);
2182 		return;
2183 	}
2184 
2185 	evsel = perf_evlist__id2evsel(trace->evlist, sample->id);
2186 	if (evsel == NULL) {
2187 		fprintf(trace->output, "Unknown tp ID %" PRIu64 ", skipping...\n", sample->id);
2188 		return;
2189 	}
2190 
2191 	trace__set_base_time(trace, evsel, sample);
2192 
2193 	if (evsel->attr.type == PERF_TYPE_TRACEPOINT &&
2194 	    sample->raw_data == NULL) {
2195 		fprintf(trace->output, "%s sample with no payload for tid: %d, cpu %d, raw_size=%d, skipping...\n",
2196 		       perf_evsel__name(evsel), sample->tid,
2197 		       sample->cpu, sample->raw_size);
2198 	} else {
2199 		tracepoint_handler handler = evsel->handler;
2200 		handler(trace, evsel, event, sample);
2201 	}
2202 }
2203 
/*
 * Create the sys_enter/sys_exit tracepoint evsels (raw_syscalls group,
 * per the out_error_raw_syscalls handling in trace__run), resolve the
 * payload fields the formatter needs ('args' pointer on enter, 'ret'
 * integer on exit) and add both to the evlist, caching them in
 * trace->syscalls.events.  Returns 0 on success, -1 on failure after
 * unwinding via the out_delete_* labels below the return.
 */
static int trace__add_syscall_newtp(struct trace *trace)
{
	int ret = -1;
	struct perf_evlist *evlist = trace->evlist;
	struct perf_evsel *sys_enter, *sys_exit;

	sys_enter = perf_evsel__syscall_newtp("sys_enter", trace__sys_enter);
	if (sys_enter == NULL)
		goto out;

	if (perf_evsel__init_sc_tp_ptr_field(sys_enter, args))
		goto out_delete_sys_enter;

	sys_exit = perf_evsel__syscall_newtp("sys_exit", trace__sys_exit);
	if (sys_exit == NULL)
		goto out_delete_sys_enter;

	if (perf_evsel__init_sc_tp_uint_field(sys_exit, ret))
		goto out_delete_sys_exit;

	perf_evlist__add(evlist, sys_enter);
	perf_evlist__add(evlist, sys_exit);

	if (callchain_param.enabled && !trace->kernel_syscallchains) {
		/*
		 * We're interested only in the user space callchain
		 * leading to the syscall, allow overriding that for
		 * debugging reasons using --kernel_syscall_callchains
		 */
		sys_exit->attr.exclude_callchain_kernel = 1;
	}

	trace->syscalls.events.sys_enter = sys_enter;
	trace->syscalls.events.sys_exit  = sys_exit;

	ret = 0;
out:
	return ret;

out_delete_sys_exit:
	perf_evsel__delete_priv(sys_exit);
out_delete_sys_enter:
	perf_evsel__delete_priv(sys_enter);
	goto out;
}
2249 
2250 static int trace__set_ev_qualifier_filter(struct trace *trace)
2251 {
2252 	int err = -1;
2253 	struct perf_evsel *sys_exit;
2254 	char *filter = asprintf_expr_inout_ints("id", !trace->not_ev_qualifier,
2255 						trace->ev_qualifier_ids.nr,
2256 						trace->ev_qualifier_ids.entries);
2257 
2258 	if (filter == NULL)
2259 		goto out_enomem;
2260 
2261 	if (!perf_evsel__append_tp_filter(trace->syscalls.events.sys_enter,
2262 					  filter)) {
2263 		sys_exit = trace->syscalls.events.sys_exit;
2264 		err = perf_evsel__append_tp_filter(sys_exit, filter);
2265 	}
2266 
2267 	free(filter);
2268 out:
2269 	return err;
2270 out_enomem:
2271 	errno = ENOMEM;
2272 	goto out;
2273 }
2274 
2275 static int trace__set_filter_loop_pids(struct trace *trace)
2276 {
2277 	unsigned int nr = 1;
2278 	pid_t pids[32] = {
2279 		getpid(),
2280 	};
2281 	struct thread *thread = machine__find_thread(trace->host, pids[0], pids[0]);
2282 
2283 	while (thread && nr < ARRAY_SIZE(pids)) {
2284 		struct thread *parent = machine__find_thread(trace->host, thread->ppid, thread->ppid);
2285 
2286 		if (parent == NULL)
2287 			break;
2288 
2289 		if (!strcmp(thread__comm_str(parent), "sshd")) {
2290 			pids[nr++] = parent->tid;
2291 			break;
2292 		}
2293 		thread = parent;
2294 	}
2295 
2296 	return perf_evlist__set_filter_pids(trace->evlist, nr, pids);
2297 }
2298 
/*
 * Live tracing mode: set up the evlist (syscall tracepoints, optional
 * vfs_getname probe, page fault and sched_stat_runtime events), open and
 * mmap the events, optionally fork the workload, then loop reading the
 * ring buffers and dispatching records until interrupted or the workload
 * is done.  Returns 0 on success or a negative error, after tearing the
 * evlist down.  Error reporting lives in the labels past the 'return'.
 */
static int trace__run(struct trace *trace, int argc, const char **argv)
{
	struct perf_evlist *evlist = trace->evlist;
	struct perf_evsel *evsel, *pgfault_maj = NULL, *pgfault_min = NULL;
	int err = -1, i;
	unsigned long before;
	const bool forks = argc > 0;	/* a workload command was given */
	bool draining = false;

	trace->live = true;

	if (trace->trace_syscalls && trace__add_syscall_newtp(trace))
		goto out_error_raw_syscalls;

	if (trace->trace_syscalls)
		trace->vfs_getname = perf_evlist__add_vfs_getname(evlist);

	if ((trace->trace_pgfaults & TRACE_PFMAJ)) {
		pgfault_maj = perf_evsel__new_pgfault(PERF_COUNT_SW_PAGE_FAULTS_MAJ);
		if (pgfault_maj == NULL)
			goto out_error_mem;
		perf_evlist__add(evlist, pgfault_maj);
	}

	if ((trace->trace_pgfaults & TRACE_PFMIN)) {
		pgfault_min = perf_evsel__new_pgfault(PERF_COUNT_SW_PAGE_FAULTS_MIN);
		if (pgfault_min == NULL)
			goto out_error_mem;
		perf_evlist__add(evlist, pgfault_min);
	}

	/* --sched: collect per-thread runtime for the summary. */
	if (trace->sched &&
	    perf_evlist__add_newtp(evlist, "sched", "sched_stat_runtime",
				   trace__sched_stat_runtime))
		goto out_error_sched_stat_runtime;

	err = perf_evlist__create_maps(evlist, &trace->opts.target);
	if (err < 0) {
		fprintf(trace->output, "Problems parsing the target to trace, check your options!\n");
		goto out_delete_evlist;
	}

	err = trace__symbols_init(trace, evlist);
	if (err < 0) {
		fprintf(trace->output, "Problems initializing symbol libraries!\n");
		goto out_delete_evlist;
	}

	perf_evlist__config(evlist, &trace->opts, NULL);

	if (callchain_param.enabled) {
		bool use_identifier = false;

		if (trace->syscalls.events.sys_exit) {
			perf_evsel__config_callchain(trace->syscalls.events.sys_exit,
						     &trace->opts, &callchain_param);
			use_identifier = true;
		}

		if (pgfault_maj) {
			perf_evsel__config_callchain(pgfault_maj, &trace->opts, &callchain_param);
			use_identifier = true;
		}

		if (pgfault_min) {
			perf_evsel__config_callchain(pgfault_min, &trace->opts, &callchain_param);
			use_identifier = true;
		}

		if (use_identifier) {
		       /*
			* Now we have evsels with different sample_ids, use
			* PERF_SAMPLE_IDENTIFIER to map from sample to evsel
			* from a fixed position in each ring buffer record.
			*
			* As of this the changeset introducing this comment, this
			* isn't strictly needed, as the fields that can come before
			* PERF_SAMPLE_ID are all used, but we'll probably disable
			* some of those for things like copying the payload of
			* pointer syscall arguments, and for vfs_getname we don't
			* need PERF_SAMPLE_ADDR and PERF_SAMPLE_IP, so do this
			* here as a warning we need to use PERF_SAMPLE_IDENTIFIER.
			*/
			perf_evlist__set_sample_bit(evlist, IDENTIFIER);
			perf_evlist__reset_sample_bit(evlist, ID);
		}
	}

	signal(SIGCHLD, sig_handler);
	signal(SIGINT, sig_handler);

	if (forks) {
		err = perf_evlist__prepare_workload(evlist, &trace->opts.target,
						    argv, false, NULL);
		if (err < 0) {
			fprintf(trace->output, "Couldn't run the workload!\n");
			goto out_delete_evlist;
		}
	}

	err = perf_evlist__open(evlist);
	if (err < 0)
		goto out_error_open;

	err = bpf__apply_obj_config();
	if (err) {
		char errbuf[BUFSIZ];

		bpf__strerror_apply_obj_config(err, errbuf, sizeof(errbuf));
		pr_err("ERROR: Apply config to BPF failed: %s\n",
			 errbuf);
		goto out_error_open;
	}

	/*
	 * Better not use !target__has_task() here because we need to cover the
	 * case where no threads were specified in the command line, but a
	 * workload was, and in that case we will fill in the thread_map when
	 * we fork the workload in perf_evlist__prepare_workload.
	 */
	if (trace->filter_pids.nr > 0)
		err = perf_evlist__set_filter_pids(evlist, trace->filter_pids.nr, trace->filter_pids.entries);
	else if (thread_map__pid(evlist->threads, 0) == -1)
		err = trace__set_filter_loop_pids(trace);

	if (err < 0)
		goto out_error_mem;

	if (trace->ev_qualifier_ids.nr > 0) {
		err = trace__set_ev_qualifier_filter(trace);
		if (err < 0)
			goto out_errno;

		pr_debug("event qualifier tracepoint filter: %s\n",
			 trace->syscalls.events.sys_exit->filter);
	}

	err = perf_evlist__apply_filters(evlist, &evsel);
	if (err < 0)
		goto out_error_apply_filters;

	err = perf_evlist__mmap(evlist, trace->opts.mmap_pages, false);
	if (err < 0)
		goto out_error_mmap;

	if (!target__none(&trace->opts.target) && !trace->opts.initial_delay)
		perf_evlist__enable(evlist);

	if (forks)
		perf_evlist__start_workload(evlist);

	/* --delay: let the workload warm up before counting. */
	if (trace->opts.initial_delay) {
		usleep(trace->opts.initial_delay * 1000);
		perf_evlist__enable(evlist);
	}

	trace->multiple_threads = thread_map__pid(evlist->threads, 0) == -1 ||
				  evlist->threads->nr > 1 ||
				  perf_evlist__first(evlist)->attr.inherit;
again:
	before = trace->nr_events;

	/* Drain every ring buffer, dispatching each record as we go. */
	for (i = 0; i < evlist->nr_mmaps; i++) {
		union perf_event *event;

		while ((event = perf_evlist__mmap_read(evlist, i)) != NULL) {
			struct perf_sample sample;

			++trace->nr_events;

			err = perf_evlist__parse_sample(evlist, event, &sample);
			if (err) {
				fprintf(trace->output, "Can't parse sample, err = %d, skipping...\n", err);
				goto next_event;
			}

			trace__handle_event(trace, event, &sample);
next_event:
			perf_evlist__mmap_consume(evlist, i);

			if (interrupted)
				goto out_disable;

			/*
			 * Workload finished (SIGCHLD): stop producing and keep
			 * draining what is already in the buffers.
			 */
			if (done && !draining) {
				perf_evlist__disable(evlist);
				draining = true;
			}
		}
	}

	/* Nothing new this pass: poll for more, or finish once drained. */
	if (trace->nr_events == before) {
		int timeout = done ? 100 : -1;

		if (!draining && perf_evlist__poll(evlist, timeout) > 0) {
			if (perf_evlist__filter_pollfd(evlist, POLLERR | POLLHUP) == 0)
				draining = true;

			goto again;
		}
	} else {
		goto again;
	}

out_disable:
	thread__zput(trace->current);

	perf_evlist__disable(evlist);

	if (!err) {
		if (trace->summary)
			trace__fprintf_thread_summary(trace, trace->output);

		if (trace->show_tool_stats) {
			fprintf(trace->output, "Stats:\n "
					       " vfs_getname : %" PRIu64 "\n"
					       " proc_getname: %" PRIu64 "\n",
				trace->stats.vfs_getname,
				trace->stats.proc_getname);
		}
	}

out_delete_evlist:
	trace__symbols__exit(trace);

	perf_evlist__delete(evlist);
	trace->evlist = NULL;
	trace->live = false;
	return err;
/*
 * Error handling labels, placed after the return in a bare block so the
 * errbuf scratch buffer is scoped to them only; all paths funnel back to
 * out_delete_evlist above.
 */
{
	char errbuf[BUFSIZ];

out_error_sched_stat_runtime:
	tracing_path__strerror_open_tp(errno, errbuf, sizeof(errbuf), "sched", "sched_stat_runtime");
	goto out_error;

out_error_raw_syscalls:
	tracing_path__strerror_open_tp(errno, errbuf, sizeof(errbuf), "raw_syscalls", "sys_(enter|exit)");
	goto out_error;

out_error_mmap:
	perf_evlist__strerror_mmap(evlist, errno, errbuf, sizeof(errbuf));
	goto out_error;

out_error_open:
	perf_evlist__strerror_open(evlist, errno, errbuf, sizeof(errbuf));

out_error:
	fprintf(trace->output, "%s\n", errbuf);
	goto out_delete_evlist;

out_error_apply_filters:
	fprintf(trace->output,
		"Failed to set filter \"%s\" on event %s with %d (%s)\n",
		evsel->filter, perf_evsel__name(evsel), errno,
		str_error_r(errno, errbuf, sizeof(errbuf)));
	goto out_delete_evlist;
}
out_error_mem:
	fprintf(trace->output, "Not enough memory to run!\n");
	goto out_delete_evlist;

out_errno:
	fprintf(trace->output, "errno=%d,%s\n", errno, strerror(errno));
	goto out_delete_evlist;
}
2564 
/*
 * Replay mode (-i): process a perf.data file recorded earlier (e.g. via
 * 'perf trace record'), wiring the session tool callbacks so samples go
 * through trace__process_sample and the syscall/page-fault handlers.
 * Returns 0 on success or a negative error.
 */
static int trace__replay(struct trace *trace)
{
	const struct perf_evsel_str_handler handlers[] = {
		{ "probe:vfs_getname",	     trace__vfs_getname, },
	};
	struct perf_data data = {
		.file      = {
			.path = input_name,
		},
		.mode      = PERF_DATA_MODE_READ,
		.force     = trace->force,
	};
	struct perf_session *session;
	struct perf_evsel *evsel;
	int err = -1;

	trace->tool.sample	  = trace__process_sample;
	trace->tool.mmap	  = perf_event__process_mmap;
	trace->tool.mmap2	  = perf_event__process_mmap2;
	trace->tool.comm	  = perf_event__process_comm;
	trace->tool.exit	  = perf_event__process_exit;
	trace->tool.fork	  = perf_event__process_fork;
	trace->tool.attr	  = perf_event__process_attr;
	trace->tool.tracing_data  = perf_event__process_tracing_data;
	trace->tool.build_id	  = perf_event__process_build_id;
	trace->tool.namespaces	  = perf_event__process_namespaces;

	trace->tool.ordered_events = true;
	trace->tool.ordering_requires_timestamps = true;

	/* add tid to output */
	trace->multiple_threads = true;

	session = perf_session__new(&data, false, &trace->tool);
	if (session == NULL)
		return -1;

	/* Honor -p/-t by restricting symbol resolution to those tasks. */
	if (trace->opts.target.pid)
		symbol_conf.pid_list_str = strdup(trace->opts.target.pid);

	if (trace->opts.target.tid)
		symbol_conf.tid_list_str = strdup(trace->opts.target.tid);

	if (symbol__init(&session->header.env) < 0)
		goto out;

	trace->host = &session->machines.host;

	err = perf_session__set_tracepoints_handlers(session, handlers);
	if (err)
		goto out;

	evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
						     "raw_syscalls:sys_enter");
	/* older kernels have syscalls tp versus raw_syscalls */
	if (evsel == NULL)
		evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
							     "syscalls:sys_enter");

	if (evsel &&
	    (perf_evsel__init_syscall_tp(evsel, trace__sys_enter) < 0 ||
	    perf_evsel__init_sc_tp_ptr_field(evsel, args))) {
		pr_err("Error during initialize raw_syscalls:sys_enter event\n");
		goto out;
	}

	evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
						     "raw_syscalls:sys_exit");
	if (evsel == NULL)
		evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
							     "syscalls:sys_exit");
	if (evsel &&
	    (perf_evsel__init_syscall_tp(evsel, trace__sys_exit) < 0 ||
	    perf_evsel__init_sc_tp_uint_field(evsel, ret))) {
		pr_err("Error during initialize raw_syscalls:sys_exit event\n");
		goto out;
	}

	/* Route any recorded page fault software events to our handler. */
	evlist__for_each_entry(session->evlist, evsel) {
		if (evsel->attr.type == PERF_TYPE_SOFTWARE &&
		    (evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ ||
		     evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MIN ||
		     evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS))
			evsel->handler = trace__pgfault;
	}

	setup_pager();

	err = perf_session__process_events(session);
	if (err)
		pr_err("Failed to process events, error %d", err);

	else if (trace->summary)
		trace__fprintf_thread_summary(trace, trace->output);

out:
	perf_session__delete(session);

	return err;
}
2665 
/* Print the banner preceding the per-thread summary; returns bytes written. */
static size_t trace__fprintf_threads_header(FILE *fp)
{
	return fprintf(fp, "\n Summary of events:\n\n");
}
2674 
/*
 * rb_resort helper: re-sort a thread's per-syscall stats intlist by the
 * 'msecs' key (comparison "a->msecs > b->msecs"; see rb_resort.h for the
 * resulting order).  The body below fills one sorted entry from each
 * intlist node, whose ->priv holds the accumulated struct stats.
 */
DEFINE_RESORT_RB(syscall_stats, a->msecs > b->msecs,
	struct stats 	*stats;
	double		msecs;
	int		syscall;
)
{
	struct int_node *source = rb_entry(nd, struct int_node, rb_node);
	struct stats *stats = source->priv;

	entry->syscall = source->i;
	entry->stats   = stats;
	/* total time = nr_samples * average, converted to msecs; guard NULL */
	entry->msecs   = stats ? (u64)stats->n * (avg_stats(stats) / NSEC_PER_MSEC) : 0;
}
2688 
/*
 * Print the per-syscall statistics table (calls, total/min/avg/max in
 * msecs, stddev as a percentage of the average) for one thread, sorted
 * via the syscall_stats resort helper above.  Returns the number of
 * characters printed; 0 when the resort table could not be built.
 */
static size_t thread__dump_stats(struct thread_trace *ttrace,
				 struct trace *trace, FILE *fp)
{
	size_t printed = 0;
	struct syscall *sc;
	struct rb_node *nd;
	DECLARE_RESORT_RB_INTLIST(syscall_stats, ttrace->syscall_stats);

	if (syscall_stats == NULL)
		return 0;

	printed += fprintf(fp, "\n");

	printed += fprintf(fp, "   syscall            calls    total       min       avg       max      stddev\n");
	printed += fprintf(fp, "                               (msec)    (msec)    (msec)    (msec)        (%%)\n");
	printed += fprintf(fp, "   --------------- -------- --------- --------- --------- ---------     ------\n");

	resort_rb__for_each_entry(nd, syscall_stats) {
		struct stats *stats = syscall_stats_entry->stats;
		if (stats) {
			/* stats are accumulated in nsecs; report in msecs */
			double min = (double)(stats->min) / NSEC_PER_MSEC;
			double max = (double)(stats->max) / NSEC_PER_MSEC;
			double avg = avg_stats(stats);
			double pct;
			u64 n = (u64) stats->n;

			/* relative stddev, computed before avg is rescaled */
			pct = avg ? 100.0 * stddev_stats(stats)/avg : 0.0;
			avg /= NSEC_PER_MSEC;

			sc = &trace->syscalls.table[syscall_stats_entry->syscall];
			printed += fprintf(fp, "   %-15s", sc->name);
			printed += fprintf(fp, " %8" PRIu64 " %9.3f %9.3f %9.3f",
					   n, syscall_stats_entry->msecs, min, avg);
			printed += fprintf(fp, " %9.3f %9.2f%%\n", max, pct);
		}
	}

	resort_rb__delete(syscall_stats);
	printed += fprintf(fp, "\n\n");

	return printed;
}
2731 
/*
 * Print one thread's summary line (comm, tid, event count and share of
 * all events, fault counters, runtime when --sched) followed by its
 * per-syscall stats table.  Returns characters printed; 0 for threads
 * without an associated thread_trace.
 */
static size_t trace__fprintf_thread(FILE *fp, struct thread *thread, struct trace *trace)
{
	size_t printed = 0;
	struct thread_trace *ttrace = thread__priv(thread);
	double ratio;

	if (ttrace == NULL)
		return 0;

	/* this thread's share of all events, in percent */
	ratio = (double)ttrace->nr_events / trace->nr_events * 100.0;

	printed += fprintf(fp, " %s (%d), ", thread__comm_str(thread), thread->tid);
	printed += fprintf(fp, "%lu events, ", ttrace->nr_events);
	printed += fprintf(fp, "%.1f%%", ratio);
	if (ttrace->pfmaj)
		printed += fprintf(fp, ", %lu majfaults", ttrace->pfmaj);
	if (ttrace->pfmin)
		printed += fprintf(fp, ", %lu minfaults", ttrace->pfmin);
	if (trace->sched)
		printed += fprintf(fp, ", %.3f msec\n", ttrace->runtime_ms);
	else if (fputc('\n', fp) != EOF)
		++printed;

	printed += thread__dump_stats(ttrace, trace, fp);

	return printed;
}
2759 
2760 static unsigned long thread__nr_events(struct thread_trace *ttrace)
2761 {
2762 	return ttrace ? ttrace->nr_events : 0;
2763 }
2764 
/*
 * rb_resort helper: re-sort machine threads by their event count
 * (comparison via thread__nr_events on each thread's priv thread_trace;
 * see rb_resort.h for the resulting order).
 */
DEFINE_RESORT_RB(threads, (thread__nr_events(a->thread->priv) < thread__nr_events(b->thread->priv)),
	struct thread *thread;
)
{
	entry->thread = rb_entry(nd, struct thread, rb_node);
}
2771 
/*
 * Print the --summary output: a header followed by one summary per
 * thread, walking each bucket of the host machine's thread table
 * through the 'threads' resort helper so threads come out ordered by
 * event count.  Returns characters printed (0 if sorting failed).
 */
static size_t trace__fprintf_thread_summary(struct trace *trace, FILE *fp)
{
	size_t printed = trace__fprintf_threads_header(fp);
	struct rb_node *nd;
	int i;

	for (i = 0; i < THREADS__TABLE_SIZE; i++) {
		DECLARE_RESORT_RB_MACHINE_THREADS(threads, trace->host, i);

		if (threads == NULL) {
			fprintf(fp, "%s", "Error sorting output by nr_events!\n");
			return 0;
		}

		resort_rb__for_each_entry(nd, threads)
			printed += trace__fprintf_thread(fp, threads_entry->thread, trace);

		resort_rb__delete(threads);
	}
	return printed;
}
2793 
/*
 * --duration option parser: store the threshold used to filter out fast
 * syscalls (presumably in msecs — confirm against where duration_filter
 * is compared).  Note atof() reports no errors, so malformed input
 * silently becomes 0.  Always returns 0.
 */
static int trace__set_duration(const struct option *opt, const char *str,
			       int unset __maybe_unused)
{
	struct trace *trace = opt->value;

	trace->duration_filter = atof(str);
	return 0;
}
2802 
2803 static int trace__set_filter_pids(const struct option *opt, const char *str,
2804 				  int unset __maybe_unused)
2805 {
2806 	int ret = -1;
2807 	size_t i;
2808 	struct trace *trace = opt->value;
2809 	/*
2810 	 * FIXME: introduce a intarray class, plain parse csv and create a
2811 	 * { int nr, int entries[] } struct...
2812 	 */
2813 	struct intlist *list = intlist__new(str);
2814 
2815 	if (list == NULL)
2816 		return -1;
2817 
2818 	i = trace->filter_pids.nr = intlist__nr_entries(list) + 1;
2819 	trace->filter_pids.entries = calloc(i, sizeof(pid_t));
2820 
2821 	if (trace->filter_pids.entries == NULL)
2822 		goto out;
2823 
2824 	trace->filter_pids.entries[0] = getpid();
2825 
2826 	for (i = 1; i < trace->filter_pids.nr; ++i)
2827 		trace->filter_pids.entries[i] = intlist__entry(list, i - 1)->i;
2828 
2829 	intlist__delete(list);
2830 	ret = 0;
2831 out:
2832 	return ret;
2833 }
2834 
2835 static int trace__open_output(struct trace *trace, const char *filename)
2836 {
2837 	struct stat st;
2838 
2839 	if (!stat(filename, &st) && st.st_size) {
2840 		char oldname[PATH_MAX];
2841 
2842 		scnprintf(oldname, sizeof(oldname), "%s.old", filename);
2843 		unlink(oldname);
2844 		rename(filename, oldname);
2845 	}
2846 
2847 	trace->output = fopen(filename, "w");
2848 
2849 	return trace->output == NULL ? -errno : 0;
2850 }
2851 
2852 static int parse_pagefaults(const struct option *opt, const char *str,
2853 			    int unset __maybe_unused)
2854 {
2855 	int *trace_pgfaults = opt->value;
2856 
2857 	if (strcmp(str, "all") == 0)
2858 		*trace_pgfaults |= TRACE_PFMAJ | TRACE_PFMIN;
2859 	else if (strcmp(str, "maj") == 0)
2860 		*trace_pgfaults |= TRACE_PFMAJ;
2861 	else if (strcmp(str, "min") == 0)
2862 		*trace_pgfaults |= TRACE_PFMIN;
2863 	else
2864 		return -1;
2865 
2866 	return 0;
2867 }
2868 
/* Point every evsel in the list at the same sample handler callback. */
static void evlist__set_evsel_handler(struct perf_evlist *evlist, void *handler)
{
	struct perf_evsel *evsel;

	evlist__for_each_entry(evlist, evsel)
		evsel->handler = handler;
}
2876 
2877 /*
2878  * XXX: Hackish, just splitting the combined -e+--event (syscalls
2879  * (raw_syscalls:{sys_{enter,exit}} + events (tracepoints, HW, SW, etc) to use
2880  * existing facilities unchanged (trace->ev_qualifier + parse_options()).
2881  *
2882  * It'd be better to introduce a parse_options() variant that would return a
2883  * list with the terms it didn't match to an event...
2884  */
2885 static int trace__parse_events_option(const struct option *opt, const char *str,
2886 				      int unset __maybe_unused)
2887 {
2888 	struct trace *trace = (struct trace *)opt->value;
2889 	const char *s = str;
2890 	char *sep = NULL, *lists[2] = { NULL, NULL, };
2891 	int len = strlen(str) + 1, err = -1, list, idx;
2892 	char *strace_groups_dir = system_path(STRACE_GROUPS_DIR);
2893 	char group_name[PATH_MAX];
2894 
2895 	if (strace_groups_dir == NULL)
2896 		return -1;
2897 
2898 	if (*s == '!') {
2899 		++s;
2900 		trace->not_ev_qualifier = true;
2901 	}
2902 
2903 	while (1) {
2904 		if ((sep = strchr(s, ',')) != NULL)
2905 			*sep = '\0';
2906 
2907 		list = 0;
2908 		if (syscalltbl__id(trace->sctbl, s) >= 0 ||
2909 		    syscalltbl__strglobmatch_first(trace->sctbl, s, &idx) >= 0) {
2910 			list = 1;
2911 		} else {
2912 			path__join(group_name, sizeof(group_name), strace_groups_dir, s);
2913 			if (access(group_name, R_OK) == 0)
2914 				list = 1;
2915 		}
2916 
2917 		if (lists[list]) {
2918 			sprintf(lists[list] + strlen(lists[list]), ",%s", s);
2919 		} else {
2920 			lists[list] = malloc(len);
2921 			if (lists[list] == NULL)
2922 				goto out;
2923 			strcpy(lists[list], s);
2924 		}
2925 
2926 		if (!sep)
2927 			break;
2928 
2929 		*sep = ',';
2930 		s = sep + 1;
2931 	}
2932 
2933 	if (lists[1] != NULL) {
2934 		struct strlist_config slist_config = {
2935 			.dirname = strace_groups_dir,
2936 		};
2937 
2938 		trace->ev_qualifier = strlist__new(lists[1], &slist_config);
2939 		if (trace->ev_qualifier == NULL) {
2940 			fputs("Not enough memory to parse event qualifier", trace->output);
2941 			goto out;
2942 		}
2943 
2944 		if (trace__validate_ev_qualifier(trace))
2945 			goto out;
2946 	}
2947 
2948 	err = 0;
2949 
2950 	if (lists[0]) {
2951 		struct option o = OPT_CALLBACK('e', "event", &trace->evlist, "event",
2952 					       "event selector. use 'perf list' to list available events",
2953 					       parse_events_option);
2954 		err = parse_events_option(&o, lists[0], 0);
2955 	}
2956 out:
2957 	if (sep)
2958 		*sep = ',';
2959 
2960 	return err;
2961 }
2962 
/*
 * Main entry point for 'perf trace': parse options, set up the evlist,
 * syscall table and callchain defaults, then either replay a perf.data
 * file (-i), delegate to 'perf trace record', or run/attach to a live
 * workload via trace__run().
 */
int cmd_trace(int argc, const char **argv)
{
	const char *trace_usage[] = {
		"perf trace [<options>] [<command>]",
		"perf trace [<options>] -- <command> [<options>]",
		"perf trace record [<options>] [<command>]",
		"perf trace record [<options>] -- <command> [<options>]",
		NULL
	};
	/*
	 * Defaults: strace-like syscall tracing to stderr; UINT_MAX/ULLONG_MAX
	 * act as "unset" sentinels that are resolved after option parsing.
	 */
	struct trace trace = {
		.syscalls = {
			. max = -1,
		},
		.opts = {
			.target = {
				.uid	   = UINT_MAX,
				.uses_mmap = true,
			},
			.user_freq     = UINT_MAX,
			.user_interval = ULLONG_MAX,
			.no_buffering  = true,
			.mmap_pages    = UINT_MAX,
			.proc_map_timeout  = 500,
		},
		.output = stderr,
		.show_comm = true,
		.trace_syscalls = true,
		.kernel_syscallchains = false,
		.max_stack = UINT_MAX,
	};
	const char *output_name = NULL;
	const struct option trace_options[] = {
	OPT_CALLBACK('e', "event", &trace, "event",
		     "event/syscall selector. use 'perf list' to list available events",
		     trace__parse_events_option),
	OPT_BOOLEAN(0, "comm", &trace.show_comm,
		    "show the thread COMM next to its id"),
	OPT_BOOLEAN(0, "tool_stats", &trace.show_tool_stats, "show tool stats"),
	OPT_CALLBACK(0, "expr", &trace, "expr", "list of syscalls/events to trace",
		     trace__parse_events_option),
	OPT_STRING('o', "output", &output_name, "file", "output file name"),
	OPT_STRING('i', "input", &input_name, "file", "Analyze events in file"),
	OPT_STRING('p', "pid", &trace.opts.target.pid, "pid",
		    "trace events on existing process id"),
	OPT_STRING('t', "tid", &trace.opts.target.tid, "tid",
		    "trace events on existing thread id"),
	OPT_CALLBACK(0, "filter-pids", &trace, "CSV list of pids",
		     "pids to filter (by the kernel)", trace__set_filter_pids),
	OPT_BOOLEAN('a', "all-cpus", &trace.opts.target.system_wide,
		    "system-wide collection from all CPUs"),
	OPT_STRING('C', "cpu", &trace.opts.target.cpu_list, "cpu",
		    "list of cpus to monitor"),
	OPT_BOOLEAN(0, "no-inherit", &trace.opts.no_inherit,
		    "child tasks do not inherit counters"),
	OPT_CALLBACK('m', "mmap-pages", &trace.opts.mmap_pages, "pages",
		     "number of mmap data pages",
		     perf_evlist__parse_mmap_pages),
	OPT_STRING('u', "uid", &trace.opts.target.uid_str, "user",
		   "user to profile"),
	OPT_CALLBACK(0, "duration", &trace, "float",
		     "show only events with duration > N.M ms",
		     trace__set_duration),
	OPT_BOOLEAN(0, "sched", &trace.sched, "show blocking scheduler events"),
	OPT_INCR('v', "verbose", &verbose, "be more verbose"),
	OPT_BOOLEAN('T', "time", &trace.full_time,
		    "Show full timestamp, not time relative to first start"),
	OPT_BOOLEAN('s', "summary", &trace.summary_only,
		    "Show only syscall summary with statistics"),
	OPT_BOOLEAN('S', "with-summary", &trace.summary,
		    "Show all syscalls and summary with statistics"),
	OPT_CALLBACK_DEFAULT('F', "pf", &trace.trace_pgfaults, "all|maj|min",
		     "Trace pagefaults", parse_pagefaults, "maj"),
	OPT_BOOLEAN(0, "syscalls", &trace.trace_syscalls, "Trace syscalls"),
	OPT_BOOLEAN('f', "force", &trace.force, "don't complain, do it"),
	OPT_CALLBACK(0, "call-graph", &trace.opts,
		     "record_mode[,record_size]", record_callchain_help,
		     &record_parse_callchain_opt),
	OPT_BOOLEAN(0, "kernel-syscall-graph", &trace.kernel_syscallchains,
		    "Show the kernel callchains on the syscall exit path"),
	OPT_UINTEGER(0, "min-stack", &trace.min_stack,
		     "Set the minimum stack depth when parsing the callchain, "
		     "anything below the specified depth will be ignored."),
	OPT_UINTEGER(0, "max-stack", &trace.max_stack,
		     "Set the maximum stack depth when parsing the callchain, "
		     "anything beyond the specified depth will be ignored. "
		     "Default: kernel.perf_event_max_stack or " __stringify(PERF_MAX_STACK_DEPTH)),
	OPT_UINTEGER(0, "proc-map-timeout", &trace.opts.proc_map_timeout,
			"per thread proc mmap processing timeout in ms"),
	OPT_UINTEGER('D', "delay", &trace.opts.initial_delay,
		     "ms to wait before starting measurement after program "
		     "start"),
	OPT_END()
	};
	bool __maybe_unused max_stack_user_set = true;
	bool mmap_pages_user_set = true;
	const char * const trace_subcommands[] = { "record", NULL };
	int err;
	char bf[BUFSIZ];

	/* Dump a stack trace instead of dying silently on crashes. */
	signal(SIGSEGV, sighandler_dump_stack);
	signal(SIGFPE, sighandler_dump_stack);

	trace.evlist = perf_evlist__new();
	trace.sctbl = syscalltbl__new();

	if (trace.evlist == NULL || trace.sctbl == NULL) {
		pr_err("Not enough memory to run!\n");
		err = -ENOMEM;
		goto out;
	}

	argc = parse_options_subcommand(argc, argv, trace_options, trace_subcommands,
				 trace_usage, PARSE_OPT_STOP_AT_NON_OPTION);

	/* Route BPF program output (if any) through the evlist. */
	err = bpf__setup_stdout(trace.evlist);
	if (err) {
		bpf__strerror_setup_stdout(trace.evlist, err, bf, sizeof(bf));
		pr_err("ERROR: Setup BPF stdout failed: %s\n", bf);
		goto out;
	}

	err = -1;

	/* Page fault events need address and timestamp samples to be useful. */
	if (trace.trace_pgfaults) {
		trace.opts.sample_address = true;
		trace.opts.sample_time = true;
	}

	/* UINT_MAX sentinels: detect whether the user set these explicitly. */
	if (trace.opts.mmap_pages == UINT_MAX)
		mmap_pages_user_set = false;

	if (trace.max_stack == UINT_MAX) {
		trace.max_stack = input_name ? PERF_MAX_STACK_DEPTH : sysctl_perf_event_max_stack;
		max_stack_user_set = false;
	}

#ifdef HAVE_DWARF_UNWIND_SUPPORT
	/* min/max-stack without --call-graph implies DWARF callchains for syscalls. */
	if ((trace.min_stack || max_stack_user_set) && !callchain_param.enabled && trace.trace_syscalls)
		record_opts__parse_callchain(&trace.opts, &callchain_param, "dwarf", false);
#endif

	if (callchain_param.enabled) {
		/* Callchains need bigger mmap buffers; root can afford 4x mlock limit. */
		if (!mmap_pages_user_set && geteuid() == 0)
			trace.opts.mmap_pages = perf_event_mlock_kb_in_pages() * 4;

		symbol_conf.use_callchain = true;
	}

	if (trace.evlist->nr_entries > 0)
		evlist__set_evsel_handler(trace.evlist, trace__event_handler);

	/* 'perf trace record ...' delegates to perf record with raw_syscalls. */
	if ((argc >= 1) && (strcmp(argv[0], "record") == 0))
		return trace__record(&trace, argc-1, &argv[1]);

	/* summary_only implies summary option, but don't overwrite summary if set */
	if (trace.summary_only)
		trace.summary = trace.summary_only;

	if (!trace.trace_syscalls && !trace.trace_pgfaults &&
	    trace.evlist->nr_entries == 0 /* Was --events used? */) {
		pr_err("Please specify something to trace.\n");
		return -1;
	}

	if (!trace.trace_syscalls && trace.ev_qualifier) {
		pr_err("The -e option can't be used with --no-syscalls.\n");
		goto out;
	}

	/* -o redirects all trace output away from the default stderr. */
	if (output_name != NULL) {
		err = trace__open_output(&trace, output_name);
		if (err < 0) {
			perror("failed to create output file");
			goto out;
		}
	}

	/* Cached for the filename beautifier on open()-like syscalls. */
	trace.open_id = syscalltbl__id(trace.sctbl, "open");

	err = target__validate(&trace.opts.target);
	if (err) {
		target__strerror(&trace.opts.target, err, bf, sizeof(bf));
		fprintf(trace.output, "%s", bf);
		goto out_close;
	}

	err = target__parse_uid(&trace.opts.target);
	if (err) {
		target__strerror(&trace.opts.target, err, bf, sizeof(bf));
		fprintf(trace.output, "%s", bf);
		goto out_close;
	}

	/* No workload and no pid/tid/cpu/uid target: trace the whole system. */
	if (!argc && target__none(&trace.opts.target))
		trace.opts.target.system_wide = true;

	if (input_name)
		err = trace__replay(&trace);
	else
		err = trace__run(&trace, argc, argv);

out_close:
	/* Only close trace.output when -o opened a file (not stderr). */
	if (output_name != NULL)
		fclose(trace.output);
out:
	return err;
}
3170