xref: /openbmc/linux/tools/perf/builtin-trace.c (revision ddc141e5)
1 /*
2  * builtin-trace.c
3  *
4  * Builtin 'trace' command:
5  *
6  * Display a continuously updated trace of any workload, CPU, specific PID,
7  * system wide, etc.  Default format is loosely strace like, but any other
8  * event may be specified using --event.
9  *
10  * Copyright (C) 2012, 2013, 2014, 2015 Red Hat Inc, Arnaldo Carvalho de Melo <acme@redhat.com>
11  *
12  * Initially based on the 'trace' prototype by Thomas Gleixner:
13  *
14  * http://lwn.net/Articles/415728/ ("Announcing a new utility: 'trace'")
15  *
16  * Released under the GPL v2. (and only v2, not any later version)
17  */
18 
19 #include <traceevent/event-parse.h>
20 #include <api/fs/tracing_path.h>
21 #include "builtin.h"
22 #include "util/color.h"
23 #include "util/debug.h"
24 #include "util/env.h"
25 #include "util/event.h"
26 #include "util/evlist.h"
27 #include <subcmd/exec-cmd.h>
28 #include "util/machine.h"
29 #include "util/path.h"
30 #include "util/session.h"
31 #include "util/thread.h"
32 #include <subcmd/parse-options.h>
33 #include "util/strlist.h"
34 #include "util/intlist.h"
35 #include "util/thread_map.h"
36 #include "util/stat.h"
37 #include "trace/beauty/beauty.h"
38 #include "trace-event.h"
39 #include "util/parse-events.h"
40 #include "util/bpf-loader.h"
41 #include "callchain.h"
42 #include "print_binary.h"
43 #include "string2.h"
44 #include "syscalltbl.h"
45 #include "rb_resort.h"
46 
47 #include <errno.h>
48 #include <inttypes.h>
49 #include <poll.h>
50 #include <signal.h>
51 #include <stdlib.h>
52 #include <string.h>
53 #include <linux/err.h>
54 #include <linux/filter.h>
55 #include <linux/kernel.h>
56 #include <linux/random.h>
57 #include <linux/stringify.h>
58 #include <linux/time64.h>
59 #include <fcntl.h>
60 
61 #include "sane_ctype.h"
62 
/* Fallback for build environments whose headers do not provide O_CLOEXEC. */
#ifndef O_CLOEXEC
#define O_CLOEXEC		02000000
#endif

/* First command number covered by the fcntl_linux_specific_cmds table. */
#ifndef F_LINUX_SPECIFIC_BASE
#define F_LINUX_SPECIFIC_BASE	1024
#endif
70 
71 struct trace {
72 	struct perf_tool	tool;
73 	struct syscalltbl	*sctbl;
74 	struct {
75 		int		max;
76 		struct syscall  *table;
77 		struct {
78 			struct perf_evsel *sys_enter,
79 					  *sys_exit;
80 		}		events;
81 	} syscalls;
82 	struct record_opts	opts;
83 	struct perf_evlist	*evlist;
84 	struct machine		*host;
85 	struct thread		*current;
86 	u64			base_time;
87 	FILE			*output;
88 	unsigned long		nr_events;
89 	struct strlist		*ev_qualifier;
90 	struct {
91 		size_t		nr;
92 		int		*entries;
93 	}			ev_qualifier_ids;
94 	struct {
95 		size_t		nr;
96 		pid_t		*entries;
97 	}			filter_pids;
98 	double			duration_filter;
99 	double			runtime_ms;
100 	struct {
101 		u64		vfs_getname,
102 				proc_getname;
103 	} stats;
104 	unsigned int		max_stack;
105 	unsigned int		min_stack;
106 	bool			not_ev_qualifier;
107 	bool			live;
108 	bool			full_time;
109 	bool			sched;
110 	bool			multiple_threads;
111 	bool			summary;
112 	bool			summary_only;
113 	bool			show_comm;
114 	bool			print_sample;
115 	bool			show_tool_stats;
116 	bool			trace_syscalls;
117 	bool			kernel_syscallchains;
118 	bool			force;
119 	bool			vfs_getname;
120 	int			trace_pgfaults;
121 	int			open_id;
122 };
123 
124 struct tp_field {
125 	int offset;
126 	union {
127 		u64 (*integer)(struct tp_field *field, struct perf_sample *sample);
128 		void *(*pointer)(struct tp_field *field, struct perf_sample *sample);
129 	};
130 };
131 
132 #define TP_UINT_FIELD(bits) \
133 static u64 tp_field__u##bits(struct tp_field *field, struct perf_sample *sample) \
134 { \
135 	u##bits value; \
136 	memcpy(&value, sample->raw_data + field->offset, sizeof(value)); \
137 	return value;  \
138 }
139 
140 TP_UINT_FIELD(8);
141 TP_UINT_FIELD(16);
142 TP_UINT_FIELD(32);
143 TP_UINT_FIELD(64);
144 
145 #define TP_UINT_FIELD__SWAPPED(bits) \
146 static u64 tp_field__swapped_u##bits(struct tp_field *field, struct perf_sample *sample) \
147 { \
148 	u##bits value; \
149 	memcpy(&value, sample->raw_data + field->offset, sizeof(value)); \
150 	return bswap_##bits(value);\
151 }
152 
153 TP_UINT_FIELD__SWAPPED(16);
154 TP_UINT_FIELD__SWAPPED(32);
155 TP_UINT_FIELD__SWAPPED(64);
156 
157 static int tp_field__init_uint(struct tp_field *field,
158 			       struct format_field *format_field,
159 			       bool needs_swap)
160 {
161 	field->offset = format_field->offset;
162 
163 	switch (format_field->size) {
164 	case 1:
165 		field->integer = tp_field__u8;
166 		break;
167 	case 2:
168 		field->integer = needs_swap ? tp_field__swapped_u16 : tp_field__u16;
169 		break;
170 	case 4:
171 		field->integer = needs_swap ? tp_field__swapped_u32 : tp_field__u32;
172 		break;
173 	case 8:
174 		field->integer = needs_swap ? tp_field__swapped_u64 : tp_field__u64;
175 		break;
176 	default:
177 		return -1;
178 	}
179 
180 	return 0;
181 }
182 
183 static void *tp_field__ptr(struct tp_field *field, struct perf_sample *sample)
184 {
185 	return sample->raw_data + field->offset;
186 }
187 
188 static int tp_field__init_ptr(struct tp_field *field, struct format_field *format_field)
189 {
190 	field->offset = format_field->offset;
191 	field->pointer = tp_field__ptr;
192 	return 0;
193 }
194 
195 struct syscall_tp {
196 	struct tp_field id;
197 	union {
198 		struct tp_field args, ret;
199 	};
200 };
201 
202 static int perf_evsel__init_tp_uint_field(struct perf_evsel *evsel,
203 					  struct tp_field *field,
204 					  const char *name)
205 {
206 	struct format_field *format_field = perf_evsel__field(evsel, name);
207 
208 	if (format_field == NULL)
209 		return -1;
210 
211 	return tp_field__init_uint(field, format_field, evsel->needs_swap);
212 }
213 
/*
 * Initialize the integer accessor called 'name' in the struct syscall_tp
 * hanging off evsel->priv, using the tracepoint field of the same name.
 */
#define perf_evsel__init_sc_tp_uint_field(evsel, name)			\
	({								\
		struct syscall_tp *sc = (evsel)->priv;			\
		perf_evsel__init_tp_uint_field(evsel, &sc->name, #name);\
	})
217 
/*
 * Look up the tracepoint field @name in @evsel and set @field up to return
 * its address inside a sample.
 *
 * Returns -1 if the field does not exist, otherwise the result of
 * tp_field__init_ptr().
 */
static int perf_evsel__init_tp_ptr_field(struct perf_evsel *evsel,
					 struct tp_field *field,
					 const char *name)
{
	struct format_field *ff = perf_evsel__field(evsel, name);

	return ff != NULL ? tp_field__init_ptr(field, ff) : -1;
}
229 
/*
 * Initialize the pointer accessor called 'name' in the struct syscall_tp
 * hanging off evsel->priv, using the tracepoint field of the same name.
 */
#define perf_evsel__init_sc_tp_ptr_field(evsel, name)			\
	({								\
		struct syscall_tp *sc = (evsel)->priv;			\
		perf_evsel__init_tp_ptr_field(evsel, &sc->name, #name);	\
	})
233 
234 static void perf_evsel__delete_priv(struct perf_evsel *evsel)
235 {
236 	zfree(&evsel->priv);
237 	perf_evsel__delete(evsel);
238 }
239 
240 static int perf_evsel__init_syscall_tp(struct perf_evsel *evsel, void *handler)
241 {
242 	evsel->priv = malloc(sizeof(struct syscall_tp));
243 	if (evsel->priv != NULL) {
244 		if (perf_evsel__init_sc_tp_uint_field(evsel, id))
245 			goto out_delete;
246 
247 		evsel->handler = handler;
248 		return 0;
249 	}
250 
251 	return -ENOMEM;
252 
253 out_delete:
254 	zfree(&evsel->priv);
255 	return -ENOENT;
256 }
257 
/*
 * Create the evsel for the "raw_syscalls:<direction>" tracepoint and prepare
 * it with perf_evsel__init_syscall_tp().
 *
 * Returns the new evsel, or NULL if the tracepoint cannot be created or
 * initialized.
 */
static struct perf_evsel *perf_evsel__syscall_newtp(const char *direction, void *handler)
{
	struct perf_evsel *evsel = perf_evsel__newtp("raw_syscalls", direction);

	/* Older kernels (e.g. RHEL6) expose these as syscalls:{enter,exit}. */
	if (IS_ERR(evsel))
		evsel = perf_evsel__newtp("syscalls", direction);

	if (IS_ERR(evsel))
		return NULL;

	if (perf_evsel__init_syscall_tp(evsel, handler) != 0) {
		perf_evsel__delete_priv(evsel);
		return NULL;
	}

	return evsel;
}
278 
/* Read the integer syscall tracepoint field 'name' of 'sample' via evsel->priv. */
#define perf_evsel__sc_tp_uint(evsel, name, sample)			\
	({								\
		struct syscall_tp *fields = (evsel)->priv;		\
		fields->name.integer(&fields->name, sample);		\
	})

/* Get the address of the syscall tracepoint field 'name' inside 'sample'. */
#define perf_evsel__sc_tp_ptr(evsel, name, sample)			\
	({								\
		struct syscall_tp *fields = (evsel)->priv;		\
		fields->name.pointer(&fields->name, sample);		\
	})
286 
287 size_t strarray__scnprintf(struct strarray *sa, char *bf, size_t size, const char *intfmt, int val)
288 {
289 	int idx = val - sa->offset;
290 
291 	if (idx < 0 || idx >= sa->nr_entries)
292 		return scnprintf(bf, size, intfmt, val);
293 
294 	return scnprintf(bf, size, "%s", sa->entries[idx]);
295 }
296 
297 static size_t __syscall_arg__scnprintf_strarray(char *bf, size_t size,
298 						const char *intfmt,
299 					        struct syscall_arg *arg)
300 {
301 	return strarray__scnprintf(arg->parm, bf, size, intfmt, arg->val);
302 }
303 
/*
 * Syscall argument formatter backed by a string table; values without a name
 * are printed as signed decimals.
 */
static size_t syscall_arg__scnprintf_strarray(char *bf, size_t size,
					      struct syscall_arg *arg)
{
	size_t printed = __syscall_arg__scnprintf_strarray(bf, size, "%d", arg);

	return printed;
}

#define SCA_STRARRAY syscall_arg__scnprintf_strarray
311 
/* A list of string tables that are consulted one after the other. */
struct strarrays {
	int		nr_entries;	/* number of tables in 'entries' */
	struct strarray **entries;
};

/*
 * Define 'strarrays__<array>' wrapping 'array', which must be a real array
 * of struct strarray pointers (ARRAY_SIZE() is applied to it).
 */
#define DEFINE_STRARRAYS(array)				\
	struct strarrays strarrays__##array = {		\
		.nr_entries = ARRAY_SIZE(array),	\
		.entries = array,			\
	}
321 
322 size_t syscall_arg__scnprintf_strarrays(char *bf, size_t size,
323 					struct syscall_arg *arg)
324 {
325 	struct strarrays *sas = arg->parm;
326 	int i;
327 
328 	for (i = 0; i < sas->nr_entries; ++i) {
329 		struct strarray *sa = sas->entries[i];
330 		int idx = arg->val - sa->offset;
331 
332 		if (idx >= 0 && idx < sa->nr_entries) {
333 			if (sa->entries[idx] == NULL)
334 				break;
335 			return scnprintf(bf, size, "%s", sa->entries[idx]);
336 		}
337 	}
338 
339 	return scnprintf(bf, size, "%d", arg->val);
340 }
341 
342 #ifndef AT_FDCWD
343 #define AT_FDCWD	-100
344 #endif
345 
346 static size_t syscall_arg__scnprintf_fd_at(char *bf, size_t size,
347 					   struct syscall_arg *arg)
348 {
349 	int fd = arg->val;
350 
351 	if (fd == AT_FDCWD)
352 		return scnprintf(bf, size, "CWD");
353 
354 	return syscall_arg__scnprintf_fd(bf, size, arg);
355 }
356 
357 #define SCA_FDAT syscall_arg__scnprintf_fd_at
358 
359 static size_t syscall_arg__scnprintf_close_fd(char *bf, size_t size,
360 					      struct syscall_arg *arg);
361 
362 #define SCA_CLOSE_FD syscall_arg__scnprintf_close_fd
363 
364 size_t syscall_arg__scnprintf_hex(char *bf, size_t size, struct syscall_arg *arg)
365 {
366 	return scnprintf(bf, size, "%#lx", arg->val);
367 }
368 
369 size_t syscall_arg__scnprintf_int(char *bf, size_t size, struct syscall_arg *arg)
370 {
371 	return scnprintf(bf, size, "%d", arg->val);
372 }
373 
374 size_t syscall_arg__scnprintf_long(char *bf, size_t size, struct syscall_arg *arg)
375 {
376 	return scnprintf(bf, size, "%ld", arg->val);
377 }
378 
379 static const char *bpf_cmd[] = {
380 	"MAP_CREATE", "MAP_LOOKUP_ELEM", "MAP_UPDATE_ELEM", "MAP_DELETE_ELEM",
381 	"MAP_GET_NEXT_KEY", "PROG_LOAD",
382 };
383 static DEFINE_STRARRAY(bpf_cmd);
384 
385 static const char *epoll_ctl_ops[] = { "ADD", "DEL", "MOD", };
386 static DEFINE_STRARRAY_OFFSET(epoll_ctl_ops, 1);
387 
388 static const char *itimers[] = { "REAL", "VIRTUAL", "PROF", };
389 static DEFINE_STRARRAY(itimers);
390 
391 static const char *keyctl_options[] = {
392 	"GET_KEYRING_ID", "JOIN_SESSION_KEYRING", "UPDATE", "REVOKE", "CHOWN",
393 	"SETPERM", "DESCRIBE", "CLEAR", "LINK", "UNLINK", "SEARCH", "READ",
394 	"INSTANTIATE", "NEGATE", "SET_REQKEY_KEYRING", "SET_TIMEOUT",
395 	"ASSUME_AUTHORITY", "GET_SECURITY", "SESSION_TO_PARENT", "REJECT",
396 	"INSTANTIATE_IOV", "INVALIDATE", "GET_PERSISTENT",
397 };
398 static DEFINE_STRARRAY(keyctl_options);
399 
/*
 * Names for the 'whence' argument of lseek(). "DATA" and "HOLE" are only
 * included when the build headers define SEEK_DATA and SEEK_HOLE.
 */
static const char *whences[] = {
	"SET",
	"CUR",
	"END",
#ifdef SEEK_DATA
	"DATA",
#endif
#ifdef SEEK_HOLE
	"HOLE",
#endif
};
static DEFINE_STRARRAY(whences);
409 
410 static const char *fcntl_cmds[] = {
411 	"DUPFD", "GETFD", "SETFD", "GETFL", "SETFL", "GETLK", "SETLK",
412 	"SETLKW", "SETOWN", "GETOWN", "SETSIG", "GETSIG", "GETLK64",
413 	"SETLK64", "SETLKW64", "SETOWN_EX", "GETOWN_EX",
414 	"GETOWNER_UIDS",
415 };
416 static DEFINE_STRARRAY(fcntl_cmds);
417 
418 static const char *fcntl_linux_specific_cmds[] = {
419 	"SETLEASE", "GETLEASE", "NOTIFY", [5] =	"CANCELLK", "DUPFD_CLOEXEC",
420 	"SETPIPE_SZ", "GETPIPE_SZ", "ADD_SEALS", "GET_SEALS",
421 	"GET_RW_HINT", "SET_RW_HINT", "GET_FILE_RW_HINT", "SET_FILE_RW_HINT",
422 };
423 
424 static DEFINE_STRARRAY_OFFSET(fcntl_linux_specific_cmds, F_LINUX_SPECIFIC_BASE);
425 
426 static struct strarray *fcntl_cmds_arrays[] = {
427 	&strarray__fcntl_cmds,
428 	&strarray__fcntl_linux_specific_cmds,
429 };
430 
431 static DEFINE_STRARRAYS(fcntl_cmds_arrays);
432 
/* Names for the 'resource' argument of the *rlimit() syscalls. */
static const char *rlimit_resources[] = {
	"CPU", "FSIZE", "DATA", "STACK",
	"CORE", "RSS", "NPROC", "NOFILE",
	"MEMLOCK", "AS", "LOCKS", "SIGPENDING",
	"MSGQUEUE", "NICE", "RTPRIO", "RTTIME",
};
static DEFINE_STRARRAY(rlimit_resources);

/* Names for the 'how' argument of rt_sigprocmask(). */
static const char *sighow[] = {
	"BLOCK",
	"UNBLOCK",
	"SETMASK",
};
static DEFINE_STRARRAY(sighow);

/* Names for clock ids, indexed by clock id number. */
static const char *clockid[] = {
	"REALTIME", "MONOTONIC", "PROCESS_CPUTIME_ID",
	"THREAD_CPUTIME_ID", "MONOTONIC_RAW", "REALTIME_COARSE",
	"MONOTONIC_COARSE", "BOOTTIME", "REALTIME_ALARM",
	"BOOTTIME_ALARM", "SGI_CYCLE", "TAI"
};
static DEFINE_STRARRAY(clockid);
449 
/*
 * Names for socket address families, indexed by the AF_* number.
 *
 * The position of every string must equal its AF_* value. "MPLS" (28) has to
 * sit between "IB" (27) and "CAN" (29); without it every family from CAN
 * onwards is reported with the name of its successor. The families after
 * VSOCK (KCM, QIPCRTR, SMC) are listed so they are not printed as bare
 * numbers.
 */
static const char *socket_families[] = {
	/*  0 */ "UNSPEC", "LOCAL", "INET", "AX25", "IPX",
	/*  5 */ "APPLETALK", "NETROM", "BRIDGE", "ATMPVC", "X25",
	/* 10 */ "INET6", "ROSE", "DECnet", "NETBEUI", "SECURITY",
	/* 15 */ "KEY", "NETLINK", "PACKET", "ASH", "ECONET",
	/* 20 */ "ATMSVC", "RDS", "SNA", "IRDA", "PPPOX",
	/* 25 */ "WANPIPE", "LLC", "IB", "MPLS", "CAN",
	/* 30 */ "TIPC", "BLUETOOTH", "IUCV", "RXRPC", "ISDN",
	/* 35 */ "PHONET", "IEEE802154", "CAIF", "ALG", "NFC",
	/* 40 */ "VSOCK", "KCM", "QIPCRTR", "SMC",
};
static DEFINE_STRARRAY(socket_families);
459 
460 static size_t syscall_arg__scnprintf_access_mode(char *bf, size_t size,
461 						 struct syscall_arg *arg)
462 {
463 	size_t printed = 0;
464 	int mode = arg->val;
465 
466 	if (mode == F_OK) /* 0 */
467 		return scnprintf(bf, size, "F");
468 #define	P_MODE(n) \
469 	if (mode & n##_OK) { \
470 		printed += scnprintf(bf + printed, size - printed, "%s", #n); \
471 		mode &= ~n##_OK; \
472 	}
473 
474 	P_MODE(R);
475 	P_MODE(W);
476 	P_MODE(X);
477 #undef P_MODE
478 
479 	if (mode)
480 		printed += scnprintf(bf + printed, size - printed, "|%#x", mode);
481 
482 	return printed;
483 }
484 
485 #define SCA_ACCMODE syscall_arg__scnprintf_access_mode
486 
487 static size_t syscall_arg__scnprintf_filename(char *bf, size_t size,
488 					      struct syscall_arg *arg);
489 
490 #define SCA_FILENAME syscall_arg__scnprintf_filename
491 
492 static size_t syscall_arg__scnprintf_pipe_flags(char *bf, size_t size,
493 						struct syscall_arg *arg)
494 {
495 	int printed = 0, flags = arg->val;
496 
497 #define	P_FLAG(n) \
498 	if (flags & O_##n) { \
499 		printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
500 		flags &= ~O_##n; \
501 	}
502 
503 	P_FLAG(CLOEXEC);
504 	P_FLAG(NONBLOCK);
505 #undef P_FLAG
506 
507 	if (flags)
508 		printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);
509 
510 	return printed;
511 }
512 
513 #define SCA_PIPE_FLAGS syscall_arg__scnprintf_pipe_flags
514 
515 #ifndef GRND_NONBLOCK
516 #define GRND_NONBLOCK	0x0001
517 #endif
518 #ifndef GRND_RANDOM
519 #define GRND_RANDOM	0x0002
520 #endif
521 
522 static size_t syscall_arg__scnprintf_getrandom_flags(char *bf, size_t size,
523 						   struct syscall_arg *arg)
524 {
525 	int printed = 0, flags = arg->val;
526 
527 #define	P_FLAG(n) \
528 	if (flags & GRND_##n) { \
529 		printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
530 		flags &= ~GRND_##n; \
531 	}
532 
533 	P_FLAG(RANDOM);
534 	P_FLAG(NONBLOCK);
535 #undef P_FLAG
536 
537 	if (flags)
538 		printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);
539 
540 	return printed;
541 }
542 
543 #define SCA_GETRANDOM_FLAGS syscall_arg__scnprintf_getrandom_flags
544 
/*
 * Initializer for a syscall argument formatted through 'strarray__<array>'.
 * The 'name' parameter is not used by the expansion; it only labels the
 * argument at the call site.
 */
#define STRARRAY(name, array)			\
	{					\
		.scnprintf = SCA_STRARRAY,	\
		.parm	   = &strarray__##array,\
	}
548 
549 #include "trace/beauty/arch_errno_names.c"
550 #include "trace/beauty/eventfd.c"
551 #include "trace/beauty/futex_op.c"
552 #include "trace/beauty/futex_val3.c"
553 #include "trace/beauty/mmap.c"
554 #include "trace/beauty/mode_t.c"
555 #include "trace/beauty/msg_flags.c"
556 #include "trace/beauty/open_flags.c"
557 #include "trace/beauty/perf_event_open.c"
558 #include "trace/beauty/pid.c"
559 #include "trace/beauty/sched_policy.c"
560 #include "trace/beauty/seccomp.c"
561 #include "trace/beauty/signum.c"
562 #include "trace/beauty/socket_type.c"
563 #include "trace/beauty/waitid_options.c"
564 
/* How one syscall argument is to be formatted. */
struct syscall_arg_fmt {
	/* Formatter for the argument value */
	size_t	   (*scnprintf)(char *bf, size_t size, struct syscall_arg *arg);
	/* Extra data handed to the formatter, e.g. a string table */
	void	   *parm;
	const char *name;
	bool	   show_zero;
};
571 
572 static struct syscall_fmt {
573 	const char *name;
574 	const char *alias;
575 	struct syscall_arg_fmt arg[6];
576 	u8	   nr_args;
577 	bool	   errpid;
578 	bool	   timeout;
579 	bool	   hexret;
580 } syscall_fmts[] = {
581 	{ .name	    = "access",
582 	  .arg = { [1] = { .scnprintf = SCA_ACCMODE,  /* mode */ }, }, },
583 	{ .name	    = "bpf",
584 	  .arg = { [0] = STRARRAY(cmd, bpf_cmd), }, },
585 	{ .name	    = "brk",	    .hexret = true,
586 	  .arg = { [0] = { .scnprintf = SCA_HEX, /* brk */ }, }, },
587 	{ .name     = "clock_gettime",
588 	  .arg = { [0] = STRARRAY(clk_id, clockid), }, },
589 	{ .name	    = "clone",	    .errpid = true, .nr_args = 5,
590 	  .arg = { [0] = { .name = "flags",	    .scnprintf = SCA_CLONE_FLAGS, },
591 		   [1] = { .name = "child_stack",   .scnprintf = SCA_HEX, },
592 		   [2] = { .name = "parent_tidptr", .scnprintf = SCA_HEX, },
593 		   [3] = { .name = "child_tidptr",  .scnprintf = SCA_HEX, },
594 		   [4] = { .name = "tls",	    .scnprintf = SCA_HEX, }, }, },
595 	{ .name	    = "close",
596 	  .arg = { [0] = { .scnprintf = SCA_CLOSE_FD, /* fd */ }, }, },
597 	{ .name	    = "epoll_ctl",
598 	  .arg = { [1] = STRARRAY(op, epoll_ctl_ops), }, },
599 	{ .name	    = "eventfd2",
600 	  .arg = { [1] = { .scnprintf = SCA_EFD_FLAGS, /* flags */ }, }, },
601 	{ .name	    = "fchmodat",
602 	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, },
603 	{ .name	    = "fchownat",
604 	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, },
605 	{ .name	    = "fcntl",
606 	  .arg = { [1] = { .scnprintf = SCA_FCNTL_CMD, /* cmd */
607 			   .parm      = &strarrays__fcntl_cmds_arrays,
608 			   .show_zero = true, },
609 		   [2] = { .scnprintf =  SCA_FCNTL_ARG, /* arg */ }, }, },
610 	{ .name	    = "flock",
611 	  .arg = { [1] = { .scnprintf = SCA_FLOCK, /* cmd */ }, }, },
612 	{ .name	    = "fstat", .alias = "newfstat", },
613 	{ .name	    = "fstatat", .alias = "newfstatat", },
614 	{ .name	    = "futex",
615 	  .arg = { [1] = { .scnprintf = SCA_FUTEX_OP, /* op */ },
616 		   [5] = { .scnprintf = SCA_FUTEX_VAL3, /* val3 */ }, }, },
617 	{ .name	    = "futimesat",
618 	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, },
619 	{ .name	    = "getitimer",
620 	  .arg = { [0] = STRARRAY(which, itimers), }, },
621 	{ .name	    = "getpid",	    .errpid = true, },
622 	{ .name	    = "getpgid",    .errpid = true, },
623 	{ .name	    = "getppid",    .errpid = true, },
624 	{ .name	    = "getrandom",
625 	  .arg = { [2] = { .scnprintf = SCA_GETRANDOM_FLAGS, /* flags */ }, }, },
626 	{ .name	    = "getrlimit",
627 	  .arg = { [0] = STRARRAY(resource, rlimit_resources), }, },
628 	{ .name	    = "gettid",	    .errpid = true, },
629 	{ .name	    = "ioctl",
630 	  .arg = {
631 #if defined(__i386__) || defined(__x86_64__)
632 /*
633  * FIXME: Make this available to all arches.
634  */
635 		   [1] = { .scnprintf = SCA_IOCTL_CMD, /* cmd */ },
636 		   [2] = { .scnprintf = SCA_HEX, /* arg */ }, }, },
637 #else
638 		   [2] = { .scnprintf = SCA_HEX, /* arg */ }, }, },
639 #endif
640 	{ .name	    = "kcmp",	    .nr_args = 5,
641 	  .arg = { [0] = { .name = "pid1",	.scnprintf = SCA_PID, },
642 		   [1] = { .name = "pid2",	.scnprintf = SCA_PID, },
643 		   [2] = { .name = "type",	.scnprintf = SCA_KCMP_TYPE, },
644 		   [3] = { .name = "idx1",	.scnprintf = SCA_KCMP_IDX, },
645 		   [4] = { .name = "idx2",	.scnprintf = SCA_KCMP_IDX, }, }, },
646 	{ .name	    = "keyctl",
647 	  .arg = { [0] = STRARRAY(option, keyctl_options), }, },
648 	{ .name	    = "kill",
649 	  .arg = { [1] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
650 	{ .name	    = "linkat",
651 	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, },
652 	{ .name	    = "lseek",
653 	  .arg = { [2] = STRARRAY(whence, whences), }, },
654 	{ .name	    = "lstat", .alias = "newlstat", },
655 	{ .name     = "madvise",
656 	  .arg = { [0] = { .scnprintf = SCA_HEX,      /* start */ },
657 		   [2] = { .scnprintf = SCA_MADV_BHV, /* behavior */ }, }, },
658 	{ .name	    = "mkdirat",
659 	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, },
660 	{ .name	    = "mknodat",
661 	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, },
662 	{ .name	    = "mlock",
663 	  .arg = { [0] = { .scnprintf = SCA_HEX, /* addr */ }, }, },
664 	{ .name	    = "mlockall",
665 	  .arg = { [0] = { .scnprintf = SCA_HEX, /* addr */ }, }, },
666 	{ .name	    = "mmap",	    .hexret = true,
667 /* The standard mmap maps to old_mmap on s390x */
668 #if defined(__s390x__)
669 	.alias = "old_mmap",
670 #endif
671 	  .arg = { [0] = { .scnprintf = SCA_HEX,	/* addr */ },
672 		   [2] = { .scnprintf = SCA_MMAP_PROT,	/* prot */ },
673 		   [3] = { .scnprintf = SCA_MMAP_FLAGS,	/* flags */ }, }, },
674 	{ .name	    = "mprotect",
675 	  .arg = { [0] = { .scnprintf = SCA_HEX,	/* start */ },
676 		   [2] = { .scnprintf = SCA_MMAP_PROT,	/* prot */ }, }, },
677 	{ .name	    = "mq_unlink",
678 	  .arg = { [0] = { .scnprintf = SCA_FILENAME, /* u_name */ }, }, },
679 	{ .name	    = "mremap",	    .hexret = true,
680 	  .arg = { [0] = { .scnprintf = SCA_HEX,	  /* addr */ },
681 		   [3] = { .scnprintf = SCA_MREMAP_FLAGS, /* flags */ },
682 		   [4] = { .scnprintf = SCA_HEX,	  /* new_addr */ }, }, },
683 	{ .name	    = "munlock",
684 	  .arg = { [0] = { .scnprintf = SCA_HEX, /* addr */ }, }, },
685 	{ .name	    = "munmap",
686 	  .arg = { [0] = { .scnprintf = SCA_HEX, /* addr */ }, }, },
687 	{ .name	    = "name_to_handle_at",
688 	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, },
689 	{ .name	    = "newfstatat",
690 	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, },
691 	{ .name	    = "open",
692 	  .arg = { [1] = { .scnprintf = SCA_OPEN_FLAGS, /* flags */ }, }, },
693 	{ .name	    = "open_by_handle_at",
694 	  .arg = { [0] = { .scnprintf = SCA_FDAT,	/* dfd */ },
695 		   [2] = { .scnprintf = SCA_OPEN_FLAGS, /* flags */ }, }, },
696 	{ .name	    = "openat",
697 	  .arg = { [0] = { .scnprintf = SCA_FDAT,	/* dfd */ },
698 		   [2] = { .scnprintf = SCA_OPEN_FLAGS, /* flags */ }, }, },
699 	{ .name	    = "perf_event_open",
700 	  .arg = { [2] = { .scnprintf = SCA_INT,	/* cpu */ },
701 		   [3] = { .scnprintf = SCA_FD,		/* group_fd */ },
702 		   [4] = { .scnprintf = SCA_PERF_FLAGS, /* flags */ }, }, },
703 	{ .name	    = "pipe2",
704 	  .arg = { [1] = { .scnprintf = SCA_PIPE_FLAGS, /* flags */ }, }, },
705 	{ .name	    = "pkey_alloc",
706 	  .arg = { [1] = { .scnprintf = SCA_PKEY_ALLOC_ACCESS_RIGHTS,	/* access_rights */ }, }, },
707 	{ .name	    = "pkey_free",
708 	  .arg = { [0] = { .scnprintf = SCA_INT,	/* key */ }, }, },
709 	{ .name	    = "pkey_mprotect",
710 	  .arg = { [0] = { .scnprintf = SCA_HEX,	/* start */ },
711 		   [2] = { .scnprintf = SCA_MMAP_PROT,	/* prot */ },
712 		   [3] = { .scnprintf = SCA_INT,	/* pkey */ }, }, },
713 	{ .name	    = "poll", .timeout = true, },
714 	{ .name	    = "ppoll", .timeout = true, },
715 	{ .name	    = "prctl", .alias = "arch_prctl",
716 	  .arg = { [0] = { .scnprintf = SCA_PRCTL_OPTION, /* option */ },
717 		   [1] = { .scnprintf = SCA_PRCTL_ARG2, /* arg2 */ },
718 		   [2] = { .scnprintf = SCA_PRCTL_ARG3, /* arg3 */ }, }, },
719 	{ .name	    = "pread", .alias = "pread64", },
720 	{ .name	    = "preadv", .alias = "pread", },
721 	{ .name	    = "prlimit64",
722 	  .arg = { [1] = STRARRAY(resource, rlimit_resources), }, },
723 	{ .name	    = "pwrite", .alias = "pwrite64", },
724 	{ .name	    = "readlinkat",
725 	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, },
726 	{ .name	    = "recvfrom",
727 	  .arg = { [3] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, },
728 	{ .name	    = "recvmmsg",
729 	  .arg = { [3] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, },
730 	{ .name	    = "recvmsg",
731 	  .arg = { [2] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, },
732 	{ .name	    = "renameat",
733 	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, },
734 	{ .name	    = "rt_sigaction",
735 	  .arg = { [0] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
736 	{ .name	    = "rt_sigprocmask",
737 	  .arg = { [0] = STRARRAY(how, sighow), }, },
738 	{ .name	    = "rt_sigqueueinfo",
739 	  .arg = { [1] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
740 	{ .name	    = "rt_tgsigqueueinfo",
741 	  .arg = { [2] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
742 	{ .name	    = "sched_setscheduler",
743 	  .arg = { [1] = { .scnprintf = SCA_SCHED_POLICY, /* policy */ }, }, },
744 	{ .name	    = "seccomp",
745 	  .arg = { [0] = { .scnprintf = SCA_SECCOMP_OP,	   /* op */ },
746 		   [1] = { .scnprintf = SCA_SECCOMP_FLAGS, /* flags */ }, }, },
747 	{ .name	    = "select", .timeout = true, },
748 	{ .name	    = "sendmmsg",
749 	  .arg = { [3] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, },
750 	{ .name	    = "sendmsg",
751 	  .arg = { [2] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, },
752 	{ .name	    = "sendto",
753 	  .arg = { [3] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, },
754 	{ .name	    = "set_tid_address", .errpid = true, },
755 	{ .name	    = "setitimer",
756 	  .arg = { [0] = STRARRAY(which, itimers), }, },
757 	{ .name	    = "setrlimit",
758 	  .arg = { [0] = STRARRAY(resource, rlimit_resources), }, },
759 	{ .name	    = "socket",
760 	  .arg = { [0] = STRARRAY(family, socket_families),
761 		   [1] = { .scnprintf = SCA_SK_TYPE, /* type */ }, }, },
762 	{ .name	    = "socketpair",
763 	  .arg = { [0] = STRARRAY(family, socket_families),
764 		   [1] = { .scnprintf = SCA_SK_TYPE, /* type */ }, }, },
765 	{ .name	    = "stat", .alias = "newstat", },
766 	{ .name	    = "statx",
767 	  .arg = { [0] = { .scnprintf = SCA_FDAT,	 /* fdat */ },
768 		   [2] = { .scnprintf = SCA_STATX_FLAGS, /* flags */ } ,
769 		   [3] = { .scnprintf = SCA_STATX_MASK,	 /* mask */ }, }, },
770 	{ .name	    = "swapoff",
771 	  .arg = { [0] = { .scnprintf = SCA_FILENAME, /* specialfile */ }, }, },
772 	{ .name	    = "swapon",
773 	  .arg = { [0] = { .scnprintf = SCA_FILENAME, /* specialfile */ }, }, },
774 	{ .name	    = "symlinkat",
775 	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, },
776 	{ .name	    = "tgkill",
777 	  .arg = { [2] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
778 	{ .name	    = "tkill",
779 	  .arg = { [1] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
780 	{ .name	    = "uname", .alias = "newuname", },
781 	{ .name	    = "unlinkat",
782 	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, },
783 	{ .name	    = "utimensat",
784 	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* dirfd */ }, }, },
785 	{ .name	    = "wait4",	    .errpid = true,
786 	  .arg = { [2] = { .scnprintf = SCA_WAITID_OPTIONS, /* options */ }, }, },
787 	{ .name	    = "waitid",	    .errpid = true,
788 	  .arg = { [3] = { .scnprintf = SCA_WAITID_OPTIONS, /* options */ }, }, },
789 };
790 
791 static int syscall_fmt__cmp(const void *name, const void *fmtp)
792 {
793 	const struct syscall_fmt *fmt = fmtp;
794 	return strcmp(name, fmt->name);
795 }
796 
797 static struct syscall_fmt *syscall_fmt__find(const char *name)
798 {
799 	const int nmemb = ARRAY_SIZE(syscall_fmts);
800 	return bsearch(name, syscall_fmts, nmemb, sizeof(struct syscall_fmt), syscall_fmt__cmp);
801 }
802 
/* What is known about one syscall: its tracepoint format and how to print it. */
struct syscall {
	struct event_format	*tp_format;
	int			nr_args;
	struct format_field	*args;
	const char		*name;
	bool			is_exit;
	struct syscall_fmt	*fmt;		/* entry from syscall_fmts, if any */
	struct syscall_arg_fmt	*arg_fmt;
};
812 
813 /*
814  * We need to have this 'calculated' boolean because in some cases we really
815  * don't know what is the duration of a syscall, for instance, when we start
816  * a session and some threads are waiting for a syscall to finish, say 'poll',
817  * in which case all we can do is to print "( ? ) for duration and for the
818  * start timestamp.
819  */
820 static size_t fprintf_duration(unsigned long t, bool calculated, FILE *fp)
821 {
822 	double duration = (double)t / NSEC_PER_MSEC;
823 	size_t printed = fprintf(fp, "(");
824 
825 	if (!calculated)
826 		printed += fprintf(fp, "         ");
827 	else if (duration >= 1.0)
828 		printed += color_fprintf(fp, PERF_COLOR_RED, "%6.3f ms", duration);
829 	else if (duration >= 0.01)
830 		printed += color_fprintf(fp, PERF_COLOR_YELLOW, "%6.3f ms", duration);
831 	else
832 		printed += color_fprintf(fp, PERF_COLOR_NORMAL, "%6.3f ms", duration);
833 	return printed + fprintf(fp, "): ");
834 }
835 
836 /**
837  * filename.ptr: The filename char pointer that will be vfs_getname'd
838  * filename.entry_str_pos: Where to insert the string translated from
839  *                         filename.ptr by the vfs_getname tracepoint/kprobe.
840  * ret_scnprintf: syscall args may set this to a different syscall return
841  *                formatter, for instance, fcntl may return fds, file flags, etc.
842  */
843 struct thread_trace {
844 	u64		  entry_time;
845 	bool		  entry_pending;
846 	unsigned long	  nr_events;
847 	unsigned long	  pfmaj, pfmin;
848 	char		  *entry_str;
849 	double		  runtime_ms;
850 	size_t		  (*ret_scnprintf)(char *bf, size_t size, struct syscall_arg *arg);
851         struct {
852 		unsigned long ptr;
853 		short int     entry_str_pos;
854 		bool	      pending_open;
855 		unsigned int  namelen;
856 		char	      *name;
857 	} filename;
858 	struct {
859 		int	  max;
860 		char	  **table;
861 	} paths;
862 
863 	struct intlist *syscall_stats;
864 };
865 
866 static struct thread_trace *thread_trace__new(void)
867 {
868 	struct thread_trace *ttrace =  zalloc(sizeof(struct thread_trace));
869 
870 	if (ttrace)
871 		ttrace->paths.max = -1;
872 
873 	ttrace->syscall_stats = intlist__new(NULL);
874 
875 	return ttrace;
876 }
877 
878 static struct thread_trace *thread__trace(struct thread *thread, FILE *fp)
879 {
880 	struct thread_trace *ttrace;
881 
882 	if (thread == NULL)
883 		goto fail;
884 
885 	if (thread__priv(thread) == NULL)
886 		thread__set_priv(thread, thread_trace__new());
887 
888 	if (thread__priv(thread) == NULL)
889 		goto fail;
890 
891 	ttrace = thread__priv(thread);
892 	++ttrace->nr_events;
893 
894 	return ttrace;
895 fail:
896 	color_fprintf(fp, PERF_COLOR_RED,
897 		      "WARNING: not enough memory, dropping samples!\n");
898 	return NULL;
899 }
900 
901 
902 void syscall_arg__set_ret_scnprintf(struct syscall_arg *arg,
903 				    size_t (*ret_scnprintf)(char *bf, size_t size, struct syscall_arg *arg))
904 {
905 	struct thread_trace *ttrace = thread__priv(arg->thread);
906 
907 	ttrace->ret_scnprintf = ret_scnprintf;
908 }
909 
/* Bit flags; presumably combined in trace->trace_pgfaults (users not shown here). */
#define TRACE_PFMAJ		0x1
#define TRACE_PFMIN		0x2

/* Size in bytes used for per-thread entry string buffers. */
static const size_t trace__entry_str_size = 2048;
914 
915 static int trace__set_fd_pathname(struct thread *thread, int fd, const char *pathname)
916 {
917 	struct thread_trace *ttrace = thread__priv(thread);
918 
919 	if (fd > ttrace->paths.max) {
920 		char **npath = realloc(ttrace->paths.table, (fd + 1) * sizeof(char *));
921 
922 		if (npath == NULL)
923 			return -1;
924 
925 		if (ttrace->paths.max != -1) {
926 			memset(npath + ttrace->paths.max + 1, 0,
927 			       (fd - ttrace->paths.max) * sizeof(char *));
928 		} else {
929 			memset(npath, 0, (fd + 1) * sizeof(char *));
930 		}
931 
932 		ttrace->paths.table = npath;
933 		ttrace->paths.max   = fd;
934 	}
935 
936 	ttrace->paths.table[fd] = strdup(pathname);
937 
938 	return ttrace->paths.table[fd] != NULL ? 0 : -1;
939 }
940 
941 static int thread__read_fd_path(struct thread *thread, int fd)
942 {
943 	char linkname[PATH_MAX], pathname[PATH_MAX];
944 	struct stat st;
945 	int ret;
946 
947 	if (thread->pid_ == thread->tid) {
948 		scnprintf(linkname, sizeof(linkname),
949 			  "/proc/%d/fd/%d", thread->pid_, fd);
950 	} else {
951 		scnprintf(linkname, sizeof(linkname),
952 			  "/proc/%d/task/%d/fd/%d", thread->pid_, thread->tid, fd);
953 	}
954 
955 	if (lstat(linkname, &st) < 0 || st.st_size + 1 > (off_t)sizeof(pathname))
956 		return -1;
957 
958 	ret = readlink(linkname, pathname, sizeof(pathname));
959 
960 	if (ret < 0 || ret > st.st_size)
961 		return -1;
962 
963 	pathname[ret] = '\0';
964 	return trace__set_fd_pathname(thread, fd, pathname);
965 }
966 
967 static const char *thread__fd_path(struct thread *thread, int fd,
968 				   struct trace *trace)
969 {
970 	struct thread_trace *ttrace = thread__priv(thread);
971 
972 	if (ttrace == NULL)
973 		return NULL;
974 
975 	if (fd < 0)
976 		return NULL;
977 
978 	if ((fd > ttrace->paths.max || ttrace->paths.table[fd] == NULL)) {
979 		if (!trace->live)
980 			return NULL;
981 		++trace->stats.proc_getname;
982 		if (thread__read_fd_path(thread, fd))
983 			return NULL;
984 	}
985 
986 	return ttrace->paths.table[fd];
987 }
988 
989 size_t syscall_arg__scnprintf_fd(char *bf, size_t size, struct syscall_arg *arg)
990 {
991 	int fd = arg->val;
992 	size_t printed = scnprintf(bf, size, "%d", fd);
993 	const char *path = thread__fd_path(arg->thread, fd, arg->trace);
994 
995 	if (path)
996 		printed += scnprintf(bf + printed, size - printed, "<%s>", path);
997 
998 	return printed;
999 }
1000 
1001 size_t pid__scnprintf_fd(struct trace *trace, pid_t pid, int fd, char *bf, size_t size)
1002 {
1003         size_t printed = scnprintf(bf, size, "%d", fd);
1004 	struct thread *thread = machine__find_thread(trace->host, pid, pid);
1005 
1006 	if (thread) {
1007 		const char *path = thread__fd_path(thread, fd, trace);
1008 
1009 		if (path)
1010 			printed += scnprintf(bf + printed, size - printed, "<%s>", path);
1011 
1012 		thread__put(thread);
1013 	}
1014 
1015         return printed;
1016 }
1017 
1018 static size_t syscall_arg__scnprintf_close_fd(char *bf, size_t size,
1019 					      struct syscall_arg *arg)
1020 {
1021 	int fd = arg->val;
1022 	size_t printed = syscall_arg__scnprintf_fd(bf, size, arg);
1023 	struct thread_trace *ttrace = thread__priv(arg->thread);
1024 
1025 	if (ttrace && fd >= 0 && fd <= ttrace->paths.max)
1026 		zfree(&ttrace->paths.table[fd]);
1027 
1028 	return printed;
1029 }
1030 
1031 static void thread__set_filename_pos(struct thread *thread, const char *bf,
1032 				     unsigned long ptr)
1033 {
1034 	struct thread_trace *ttrace = thread__priv(thread);
1035 
1036 	ttrace->filename.ptr = ptr;
1037 	ttrace->filename.entry_str_pos = bf - ttrace->entry_str;
1038 }
1039 
1040 static size_t syscall_arg__scnprintf_filename(char *bf, size_t size,
1041 					      struct syscall_arg *arg)
1042 {
1043 	unsigned long ptr = arg->val;
1044 
1045 	if (!arg->trace->vfs_getname)
1046 		return scnprintf(bf, size, "%#x", ptr);
1047 
1048 	thread__set_filename_pos(arg->thread, bf, ptr);
1049 	return 0;
1050 }
1051 
1052 static bool trace__filter_duration(struct trace *trace, double t)
1053 {
1054 	return t < (trace->duration_filter * NSEC_PER_MSEC);
1055 }
1056 
1057 static size_t __trace__fprintf_tstamp(struct trace *trace, u64 tstamp, FILE *fp)
1058 {
1059 	double ts = (double)(tstamp - trace->base_time) / NSEC_PER_MSEC;
1060 
1061 	return fprintf(fp, "%10.3f ", ts);
1062 }
1063 
1064 /*
1065  * We're handling tstamp=0 as an undefined tstamp, i.e. like when we are
1066  * using ttrace->entry_time for a thread that receives a sys_exit without
1067  * first having received a sys_enter ("poll" issued before tracing session
1068  * starts, lost sys_enter exit due to ring buffer overflow).
1069  */
1070 static size_t trace__fprintf_tstamp(struct trace *trace, u64 tstamp, FILE *fp)
1071 {
1072 	if (tstamp > 0)
1073 		return __trace__fprintf_tstamp(trace, tstamp, fp);
1074 
1075 	return fprintf(fp, "         ? ");
1076 }
1077 
/*
 * Flags set from the signal handler below. Objects written by an
 * asynchronous signal handler and read by the main flow must be
 * volatile sig_atomic_t for the access to be well defined.
 */
static volatile sig_atomic_t done = false;
static volatile sig_atomic_t interrupted = false;

/*
 * Signal handler: request termination and remember whether the request
 * came from SIGINT.
 */
static void sig_handler(int sig)
{
	done = true;
	interrupted = sig == SIGINT;
}
1086 
1087 static size_t trace__fprintf_entry_head(struct trace *trace, struct thread *thread,
1088 					u64 duration, bool duration_calculated, u64 tstamp, FILE *fp)
1089 {
1090 	size_t printed = trace__fprintf_tstamp(trace, tstamp, fp);
1091 	printed += fprintf_duration(duration, duration_calculated, fp);
1092 
1093 	if (trace->multiple_threads) {
1094 		if (trace->show_comm)
1095 			printed += fprintf(fp, "%.14s/", thread__comm_str(thread));
1096 		printed += fprintf(fp, "%d ", thread->tid);
1097 	}
1098 
1099 	return printed;
1100 }
1101 
1102 static int trace__process_event(struct trace *trace, struct machine *machine,
1103 				union perf_event *event, struct perf_sample *sample)
1104 {
1105 	int ret = 0;
1106 
1107 	switch (event->header.type) {
1108 	case PERF_RECORD_LOST:
1109 		color_fprintf(trace->output, PERF_COLOR_RED,
1110 			      "LOST %" PRIu64 " events!\n", event->lost.lost);
1111 		ret = machine__process_lost_event(machine, event, sample);
1112 		break;
1113 	default:
1114 		ret = machine__process_event(machine, event, sample);
1115 		break;
1116 	}
1117 
1118 	return ret;
1119 }
1120 
1121 static int trace__tool_process(struct perf_tool *tool,
1122 			       union perf_event *event,
1123 			       struct perf_sample *sample,
1124 			       struct machine *machine)
1125 {
1126 	struct trace *trace = container_of(tool, struct trace, tool);
1127 	return trace__process_event(trace, machine, event, sample);
1128 }
1129 
1130 static char *trace__machine__resolve_kernel_addr(void *vmachine, unsigned long long *addrp, char **modp)
1131 {
1132 	struct machine *machine = vmachine;
1133 
1134 	if (machine->kptr_restrict_warned)
1135 		return NULL;
1136 
1137 	if (symbol_conf.kptr_restrict) {
1138 		pr_warning("Kernel address maps (/proc/{kallsyms,modules}) are restricted.\n\n"
1139 			   "Check /proc/sys/kernel/kptr_restrict.\n\n"
1140 			   "Kernel samples will not be resolved.\n");
1141 		machine->kptr_restrict_warned = true;
1142 		return NULL;
1143 	}
1144 
1145 	return machine__resolve_kernel_addr(vmachine, addrp, modp);
1146 }
1147 
1148 static int trace__symbols_init(struct trace *trace, struct perf_evlist *evlist)
1149 {
1150 	int err = symbol__init(NULL);
1151 
1152 	if (err)
1153 		return err;
1154 
1155 	trace->host = machine__new_host();
1156 	if (trace->host == NULL)
1157 		return -ENOMEM;
1158 
1159 	err = trace_event__register_resolver(trace->host, trace__machine__resolve_kernel_addr);
1160 	if (err < 0)
1161 		goto out;
1162 
1163 	err = __machine__synthesize_threads(trace->host, &trace->tool, &trace->opts.target,
1164 					    evlist->threads, trace__tool_process, false,
1165 					    trace->opts.proc_map_timeout, 1);
1166 out:
1167 	if (err)
1168 		symbol__exit();
1169 
1170 	return err;
1171 }
1172 
1173 static void trace__symbols__exit(struct trace *trace)
1174 {
1175 	machine__exit(trace->host);
1176 	trace->host = NULL;
1177 
1178 	symbol__exit();
1179 }
1180 
1181 static int syscall__alloc_arg_fmts(struct syscall *sc, int nr_args)
1182 {
1183 	int idx;
1184 
1185 	if (nr_args == 6 && sc->fmt && sc->fmt->nr_args != 0)
1186 		nr_args = sc->fmt->nr_args;
1187 
1188 	sc->arg_fmt = calloc(nr_args, sizeof(*sc->arg_fmt));
1189 	if (sc->arg_fmt == NULL)
1190 		return -1;
1191 
1192 	for (idx = 0; idx < nr_args; ++idx) {
1193 		if (sc->fmt)
1194 			sc->arg_fmt[idx] = sc->fmt->arg[idx];
1195 	}
1196 
1197 	sc->nr_args = nr_args;
1198 	return 0;
1199 }
1200 
1201 static int syscall__set_arg_fmts(struct syscall *sc)
1202 {
1203 	struct format_field *field;
1204 	int idx = 0, len;
1205 
1206 	for (field = sc->args; field; field = field->next, ++idx) {
1207 		if (sc->fmt && sc->fmt->arg[idx].scnprintf)
1208 			continue;
1209 
1210 		if (strcmp(field->type, "const char *") == 0 &&
1211 			 (strcmp(field->name, "filename") == 0 ||
1212 			  strcmp(field->name, "path") == 0 ||
1213 			  strcmp(field->name, "pathname") == 0))
1214 			sc->arg_fmt[idx].scnprintf = SCA_FILENAME;
1215 		else if (field->flags & FIELD_IS_POINTER)
1216 			sc->arg_fmt[idx].scnprintf = syscall_arg__scnprintf_hex;
1217 		else if (strcmp(field->type, "pid_t") == 0)
1218 			sc->arg_fmt[idx].scnprintf = SCA_PID;
1219 		else if (strcmp(field->type, "umode_t") == 0)
1220 			sc->arg_fmt[idx].scnprintf = SCA_MODE_T;
1221 		else if ((strcmp(field->type, "int") == 0 ||
1222 			  strcmp(field->type, "unsigned int") == 0 ||
1223 			  strcmp(field->type, "long") == 0) &&
1224 			 (len = strlen(field->name)) >= 2 &&
1225 			 strcmp(field->name + len - 2, "fd") == 0) {
1226 			/*
1227 			 * /sys/kernel/tracing/events/syscalls/sys_enter*
1228 			 * egrep 'field:.*fd;' .../format|sed -r 's/.*field:([a-z ]+) [a-z_]*fd.+/\1/g'|sort|uniq -c
1229 			 * 65 int
1230 			 * 23 unsigned int
1231 			 * 7 unsigned long
1232 			 */
1233 			sc->arg_fmt[idx].scnprintf = SCA_FD;
1234 		}
1235 	}
1236 
1237 	return 0;
1238 }
1239 
1240 static int trace__read_syscall_info(struct trace *trace, int id)
1241 {
1242 	char tp_name[128];
1243 	struct syscall *sc;
1244 	const char *name = syscalltbl__name(trace->sctbl, id);
1245 
1246 	if (name == NULL)
1247 		return -1;
1248 
1249 	if (id > trace->syscalls.max) {
1250 		struct syscall *nsyscalls = realloc(trace->syscalls.table, (id + 1) * sizeof(*sc));
1251 
1252 		if (nsyscalls == NULL)
1253 			return -1;
1254 
1255 		if (trace->syscalls.max != -1) {
1256 			memset(nsyscalls + trace->syscalls.max + 1, 0,
1257 			       (id - trace->syscalls.max) * sizeof(*sc));
1258 		} else {
1259 			memset(nsyscalls, 0, (id + 1) * sizeof(*sc));
1260 		}
1261 
1262 		trace->syscalls.table = nsyscalls;
1263 		trace->syscalls.max   = id;
1264 	}
1265 
1266 	sc = trace->syscalls.table + id;
1267 	sc->name = name;
1268 
1269 	sc->fmt  = syscall_fmt__find(sc->name);
1270 
1271 	snprintf(tp_name, sizeof(tp_name), "sys_enter_%s", sc->name);
1272 	sc->tp_format = trace_event__tp_format("syscalls", tp_name);
1273 
1274 	if (IS_ERR(sc->tp_format) && sc->fmt && sc->fmt->alias) {
1275 		snprintf(tp_name, sizeof(tp_name), "sys_enter_%s", sc->fmt->alias);
1276 		sc->tp_format = trace_event__tp_format("syscalls", tp_name);
1277 	}
1278 
1279 	if (syscall__alloc_arg_fmts(sc, IS_ERR(sc->tp_format) ? 6 : sc->tp_format->format.nr_fields))
1280 		return -1;
1281 
1282 	if (IS_ERR(sc->tp_format))
1283 		return -1;
1284 
1285 	sc->args = sc->tp_format->format.fields;
1286 	/*
1287 	 * We need to check and discard the first variable '__syscall_nr'
1288 	 * or 'nr' that mean the syscall number. It is needless here.
1289 	 * So drop '__syscall_nr' or 'nr' field but does not exist on older kernels.
1290 	 */
1291 	if (sc->args && (!strcmp(sc->args->name, "__syscall_nr") || !strcmp(sc->args->name, "nr"))) {
1292 		sc->args = sc->args->next;
1293 		--sc->nr_args;
1294 	}
1295 
1296 	sc->is_exit = !strcmp(name, "exit_group") || !strcmp(name, "exit");
1297 
1298 	return syscall__set_arg_fmts(sc);
1299 }
1300 
1301 static int trace__validate_ev_qualifier(struct trace *trace)
1302 {
1303 	int err = 0, i;
1304 	size_t nr_allocated;
1305 	struct str_node *pos;
1306 
1307 	trace->ev_qualifier_ids.nr = strlist__nr_entries(trace->ev_qualifier);
1308 	trace->ev_qualifier_ids.entries = malloc(trace->ev_qualifier_ids.nr *
1309 						 sizeof(trace->ev_qualifier_ids.entries[0]));
1310 
1311 	if (trace->ev_qualifier_ids.entries == NULL) {
1312 		fputs("Error:\tNot enough memory for allocating events qualifier ids\n",
1313 		       trace->output);
1314 		err = -EINVAL;
1315 		goto out;
1316 	}
1317 
1318 	nr_allocated = trace->ev_qualifier_ids.nr;
1319 	i = 0;
1320 
1321 	strlist__for_each_entry(pos, trace->ev_qualifier) {
1322 		const char *sc = pos->s;
1323 		int id = syscalltbl__id(trace->sctbl, sc), match_next = -1;
1324 
1325 		if (id < 0) {
1326 			id = syscalltbl__strglobmatch_first(trace->sctbl, sc, &match_next);
1327 			if (id >= 0)
1328 				goto matches;
1329 
1330 			if (err == 0) {
1331 				fputs("Error:\tInvalid syscall ", trace->output);
1332 				err = -EINVAL;
1333 			} else {
1334 				fputs(", ", trace->output);
1335 			}
1336 
1337 			fputs(sc, trace->output);
1338 		}
1339 matches:
1340 		trace->ev_qualifier_ids.entries[i++] = id;
1341 		if (match_next == -1)
1342 			continue;
1343 
1344 		while (1) {
1345 			id = syscalltbl__strglobmatch_next(trace->sctbl, sc, &match_next);
1346 			if (id < 0)
1347 				break;
1348 			if (nr_allocated == trace->ev_qualifier_ids.nr) {
1349 				void *entries;
1350 
1351 				nr_allocated += 8;
1352 				entries = realloc(trace->ev_qualifier_ids.entries,
1353 						  nr_allocated * sizeof(trace->ev_qualifier_ids.entries[0]));
1354 				if (entries == NULL) {
1355 					err = -ENOMEM;
1356 					fputs("\nError:\t Not enough memory for parsing\n", trace->output);
1357 					goto out_free;
1358 				}
1359 				trace->ev_qualifier_ids.entries = entries;
1360 			}
1361 			trace->ev_qualifier_ids.nr++;
1362 			trace->ev_qualifier_ids.entries[i++] = id;
1363 		}
1364 	}
1365 
1366 	if (err < 0) {
1367 		fputs("\nHint:\ttry 'perf list syscalls:sys_enter_*'"
1368 		      "\nHint:\tand: 'man syscalls'\n", trace->output);
1369 out_free:
1370 		zfree(&trace->ev_qualifier_ids.entries);
1371 		trace->ev_qualifier_ids.nr = 0;
1372 	}
1373 out:
1374 	return err;
1375 }
1376 
1377 /*
1378  * args is to be interpreted as a series of longs but we need to handle
1379  * 8-byte unaligned accesses. args points to raw_data within the event
1380  * and raw_data is guaranteed to be 8-byte unaligned because it is
1381  * preceded by raw_size which is a u32. So we need to copy args to a temp
1382  * variable to read it. Most notably this avoids extended load instructions
1383  * on unaligned addresses
1384  */
1385 unsigned long syscall_arg__val(struct syscall_arg *arg, u8 idx)
1386 {
1387 	unsigned long val;
1388 	unsigned char *p = arg->args + sizeof(unsigned long) * idx;
1389 
1390 	memcpy(&val, p, sizeof(val));
1391 	return val;
1392 }
1393 
1394 static size_t syscall__scnprintf_name(struct syscall *sc, char *bf, size_t size,
1395 				      struct syscall_arg *arg)
1396 {
1397 	if (sc->arg_fmt && sc->arg_fmt[arg->idx].name)
1398 		return scnprintf(bf, size, "%s: ", sc->arg_fmt[arg->idx].name);
1399 
1400 	return scnprintf(bf, size, "arg%d: ", arg->idx);
1401 }
1402 
1403 static size_t syscall__scnprintf_val(struct syscall *sc, char *bf, size_t size,
1404 				     struct syscall_arg *arg, unsigned long val)
1405 {
1406 	if (sc->arg_fmt && sc->arg_fmt[arg->idx].scnprintf) {
1407 		arg->val = val;
1408 		if (sc->arg_fmt[arg->idx].parm)
1409 			arg->parm = sc->arg_fmt[arg->idx].parm;
1410 		return sc->arg_fmt[arg->idx].scnprintf(bf, size, arg);
1411 	}
1412 	return scnprintf(bf, size, "%ld", val);
1413 }
1414 
1415 static size_t syscall__scnprintf_args(struct syscall *sc, char *bf, size_t size,
1416 				      unsigned char *args, struct trace *trace,
1417 				      struct thread *thread)
1418 {
1419 	size_t printed = 0;
1420 	unsigned long val;
1421 	u8 bit = 1;
1422 	struct syscall_arg arg = {
1423 		.args	= args,
1424 		.idx	= 0,
1425 		.mask	= 0,
1426 		.trace  = trace,
1427 		.thread = thread,
1428 	};
1429 	struct thread_trace *ttrace = thread__priv(thread);
1430 
1431 	/*
1432 	 * Things like fcntl will set this in its 'cmd' formatter to pick the
1433 	 * right formatter for the return value (an fd? file flags?), which is
1434 	 * not needed for syscalls that always return a given type, say an fd.
1435 	 */
1436 	ttrace->ret_scnprintf = NULL;
1437 
1438 	if (sc->args != NULL) {
1439 		struct format_field *field;
1440 
1441 		for (field = sc->args; field;
1442 		     field = field->next, ++arg.idx, bit <<= 1) {
1443 			if (arg.mask & bit)
1444 				continue;
1445 
1446 			val = syscall_arg__val(&arg, arg.idx);
1447 
1448 			/*
1449  			 * Suppress this argument if its value is zero and
1450  			 * and we don't have a string associated in an
1451  			 * strarray for it.
1452  			 */
1453 			if (val == 0 &&
1454 			    !(sc->arg_fmt &&
1455 			      (sc->arg_fmt[arg.idx].show_zero ||
1456 			       sc->arg_fmt[arg.idx].scnprintf == SCA_STRARRAY ||
1457 			       sc->arg_fmt[arg.idx].scnprintf == SCA_STRARRAYS) &&
1458 			      sc->arg_fmt[arg.idx].parm))
1459 				continue;
1460 
1461 			printed += scnprintf(bf + printed, size - printed,
1462 					     "%s%s: ", printed ? ", " : "", field->name);
1463 			printed += syscall__scnprintf_val(sc, bf + printed, size - printed, &arg, val);
1464 		}
1465 	} else if (IS_ERR(sc->tp_format)) {
1466 		/*
1467 		 * If we managed to read the tracepoint /format file, then we
1468 		 * may end up not having any args, like with gettid(), so only
1469 		 * print the raw args when we didn't manage to read it.
1470 		 */
1471 		while (arg.idx < sc->nr_args) {
1472 			if (arg.mask & bit)
1473 				goto next_arg;
1474 			val = syscall_arg__val(&arg, arg.idx);
1475 			if (printed)
1476 				printed += scnprintf(bf + printed, size - printed, ", ");
1477 			printed += syscall__scnprintf_name(sc, bf + printed, size - printed, &arg);
1478 			printed += syscall__scnprintf_val(sc, bf + printed, size - printed, &arg, val);
1479 next_arg:
1480 			++arg.idx;
1481 			bit <<= 1;
1482 		}
1483 	}
1484 
1485 	return printed;
1486 }
1487 
/*
 * Signature shared by the per-event handlers in this file (for example
 * trace__sys_enter(), trace__sys_exit() and trace__vfs_getname()): they
 * receive the trace session, the evsel the sample belongs to, the raw
 * event and the parsed sample, and return an int status.
 */
typedef int (*tracepoint_handler)(struct trace *trace, struct perf_evsel *evsel,
				  union perf_event *event,
				  struct perf_sample *sample);
1491 
1492 static struct syscall *trace__syscall_info(struct trace *trace,
1493 					   struct perf_evsel *evsel, int id)
1494 {
1495 
1496 	if (id < 0) {
1497 
1498 		/*
1499 		 * XXX: Noticed on x86_64, reproduced as far back as 3.0.36, haven't tried
1500 		 * before that, leaving at a higher verbosity level till that is
1501 		 * explained. Reproduced with plain ftrace with:
1502 		 *
1503 		 * echo 1 > /t/events/raw_syscalls/sys_exit/enable
1504 		 * grep "NR -1 " /t/trace_pipe
1505 		 *
1506 		 * After generating some load on the machine.
1507  		 */
1508 		if (verbose > 1) {
1509 			static u64 n;
1510 			fprintf(trace->output, "Invalid syscall %d id, skipping (%s, %" PRIu64 ") ...\n",
1511 				id, perf_evsel__name(evsel), ++n);
1512 		}
1513 		return NULL;
1514 	}
1515 
1516 	if ((id > trace->syscalls.max || trace->syscalls.table[id].name == NULL) &&
1517 	    trace__read_syscall_info(trace, id))
1518 		goto out_cant_read;
1519 
1520 	if ((id > trace->syscalls.max || trace->syscalls.table[id].name == NULL))
1521 		goto out_cant_read;
1522 
1523 	return &trace->syscalls.table[id];
1524 
1525 out_cant_read:
1526 	if (verbose > 0) {
1527 		fprintf(trace->output, "Problems reading syscall %d", id);
1528 		if (id <= trace->syscalls.max && trace->syscalls.table[id].name != NULL)
1529 			fprintf(trace->output, "(%s)", trace->syscalls.table[id].name);
1530 		fputs(" information\n", trace->output);
1531 	}
1532 	return NULL;
1533 }
1534 
1535 static void thread__update_stats(struct thread_trace *ttrace,
1536 				 int id, struct perf_sample *sample)
1537 {
1538 	struct int_node *inode;
1539 	struct stats *stats;
1540 	u64 duration = 0;
1541 
1542 	inode = intlist__findnew(ttrace->syscall_stats, id);
1543 	if (inode == NULL)
1544 		return;
1545 
1546 	stats = inode->priv;
1547 	if (stats == NULL) {
1548 		stats = malloc(sizeof(struct stats));
1549 		if (stats == NULL)
1550 			return;
1551 		init_stats(stats);
1552 		inode->priv = stats;
1553 	}
1554 
1555 	if (ttrace->entry_time && sample->time > ttrace->entry_time)
1556 		duration = sample->time - ttrace->entry_time;
1557 
1558 	update_stats(stats, duration);
1559 }
1560 
1561 static int trace__printf_interrupted_entry(struct trace *trace)
1562 {
1563 	struct thread_trace *ttrace;
1564 	size_t printed;
1565 
1566 	if (trace->current == NULL)
1567 		return 0;
1568 
1569 	ttrace = thread__priv(trace->current);
1570 
1571 	if (!ttrace->entry_pending)
1572 		return 0;
1573 
1574 	printed  = trace__fprintf_entry_head(trace, trace->current, 0, false, ttrace->entry_time, trace->output);
1575 	printed += fprintf(trace->output, "%-70s) ...\n", ttrace->entry_str);
1576 	ttrace->entry_pending = false;
1577 
1578 	return printed;
1579 }
1580 
1581 static int trace__fprintf_sample(struct trace *trace, struct perf_evsel *evsel,
1582 				 struct perf_sample *sample, struct thread *thread)
1583 {
1584 	int printed = 0;
1585 
1586 	if (trace->print_sample) {
1587 		double ts = (double)sample->time / NSEC_PER_MSEC;
1588 
1589 		printed += fprintf(trace->output, "%22s %10.3f %s %d/%d [%d]\n",
1590 				   perf_evsel__name(evsel), ts,
1591 				   thread__comm_str(thread),
1592 				   sample->pid, sample->tid, sample->cpu);
1593 	}
1594 
1595 	return printed;
1596 }
1597 
1598 static int trace__sys_enter(struct trace *trace, struct perf_evsel *evsel,
1599 			    union perf_event *event __maybe_unused,
1600 			    struct perf_sample *sample)
1601 {
1602 	char *msg;
1603 	void *args;
1604 	size_t printed = 0;
1605 	struct thread *thread;
1606 	int id = perf_evsel__sc_tp_uint(evsel, id, sample), err = -1;
1607 	struct syscall *sc = trace__syscall_info(trace, evsel, id);
1608 	struct thread_trace *ttrace;
1609 
1610 	if (sc == NULL)
1611 		return -1;
1612 
1613 	thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
1614 	ttrace = thread__trace(thread, trace->output);
1615 	if (ttrace == NULL)
1616 		goto out_put;
1617 
1618 	trace__fprintf_sample(trace, evsel, sample, thread);
1619 
1620 	args = perf_evsel__sc_tp_ptr(evsel, args, sample);
1621 
1622 	if (ttrace->entry_str == NULL) {
1623 		ttrace->entry_str = malloc(trace__entry_str_size);
1624 		if (!ttrace->entry_str)
1625 			goto out_put;
1626 	}
1627 
1628 	if (!(trace->duration_filter || trace->summary_only || trace->min_stack))
1629 		trace__printf_interrupted_entry(trace);
1630 
1631 	ttrace->entry_time = sample->time;
1632 	msg = ttrace->entry_str;
1633 	printed += scnprintf(msg + printed, trace__entry_str_size - printed, "%s(", sc->name);
1634 
1635 	printed += syscall__scnprintf_args(sc, msg + printed, trace__entry_str_size - printed,
1636 					   args, trace, thread);
1637 
1638 	if (sc->is_exit) {
1639 		if (!(trace->duration_filter || trace->summary_only || trace->min_stack)) {
1640 			trace__fprintf_entry_head(trace, thread, 0, false, ttrace->entry_time, trace->output);
1641 			fprintf(trace->output, "%-70s)\n", ttrace->entry_str);
1642 		}
1643 	} else {
1644 		ttrace->entry_pending = true;
1645 		/* See trace__vfs_getname & trace__sys_exit */
1646 		ttrace->filename.pending_open = false;
1647 	}
1648 
1649 	if (trace->current != thread) {
1650 		thread__put(trace->current);
1651 		trace->current = thread__get(thread);
1652 	}
1653 	err = 0;
1654 out_put:
1655 	thread__put(thread);
1656 	return err;
1657 }
1658 
1659 static int trace__resolve_callchain(struct trace *trace, struct perf_evsel *evsel,
1660 				    struct perf_sample *sample,
1661 				    struct callchain_cursor *cursor)
1662 {
1663 	struct addr_location al;
1664 	int max_stack = evsel->attr.sample_max_stack ?
1665 			evsel->attr.sample_max_stack :
1666 			trace->max_stack;
1667 
1668 	if (machine__resolve(trace->host, &al, sample) < 0 ||
1669 	    thread__resolve_callchain(al.thread, cursor, evsel, sample, NULL, NULL, max_stack))
1670 		return -1;
1671 
1672 	return 0;
1673 }
1674 
1675 static int trace__fprintf_callchain(struct trace *trace, struct perf_sample *sample)
1676 {
1677 	/* TODO: user-configurable print_opts */
1678 	const unsigned int print_opts = EVSEL__PRINT_SYM |
1679 				        EVSEL__PRINT_DSO |
1680 				        EVSEL__PRINT_UNKNOWN_AS_ADDR;
1681 
1682 	return sample__fprintf_callchain(sample, 38, print_opts, &callchain_cursor, trace->output);
1683 }
1684 
/*
 * Map errno value @err to its name for the architecture recorded in the
 * environment of @evsel.
 */
static const char *errno_to_name(struct perf_evsel *evsel, int err)
{
	const char *arch_name = perf_env__arch(perf_evsel__env(evsel));

	return arch_syscalls__strerrno(arch_name, err);
}
1692 
1693 static int trace__sys_exit(struct trace *trace, struct perf_evsel *evsel,
1694 			   union perf_event *event __maybe_unused,
1695 			   struct perf_sample *sample)
1696 {
1697 	long ret;
1698 	u64 duration = 0;
1699 	bool duration_calculated = false;
1700 	struct thread *thread;
1701 	int id = perf_evsel__sc_tp_uint(evsel, id, sample), err = -1, callchain_ret = 0;
1702 	struct syscall *sc = trace__syscall_info(trace, evsel, id);
1703 	struct thread_trace *ttrace;
1704 
1705 	if (sc == NULL)
1706 		return -1;
1707 
1708 	thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
1709 	ttrace = thread__trace(thread, trace->output);
1710 	if (ttrace == NULL)
1711 		goto out_put;
1712 
1713 	trace__fprintf_sample(trace, evsel, sample, thread);
1714 
1715 	if (trace->summary)
1716 		thread__update_stats(ttrace, id, sample);
1717 
1718 	ret = perf_evsel__sc_tp_uint(evsel, ret, sample);
1719 
1720 	if (id == trace->open_id && ret >= 0 && ttrace->filename.pending_open) {
1721 		trace__set_fd_pathname(thread, ret, ttrace->filename.name);
1722 		ttrace->filename.pending_open = false;
1723 		++trace->stats.vfs_getname;
1724 	}
1725 
1726 	if (ttrace->entry_time) {
1727 		duration = sample->time - ttrace->entry_time;
1728 		if (trace__filter_duration(trace, duration))
1729 			goto out;
1730 		duration_calculated = true;
1731 	} else if (trace->duration_filter)
1732 		goto out;
1733 
1734 	if (sample->callchain) {
1735 		callchain_ret = trace__resolve_callchain(trace, evsel, sample, &callchain_cursor);
1736 		if (callchain_ret == 0) {
1737 			if (callchain_cursor.nr < trace->min_stack)
1738 				goto out;
1739 			callchain_ret = 1;
1740 		}
1741 	}
1742 
1743 	if (trace->summary_only)
1744 		goto out;
1745 
1746 	trace__fprintf_entry_head(trace, thread, duration, duration_calculated, ttrace->entry_time, trace->output);
1747 
1748 	if (ttrace->entry_pending) {
1749 		fprintf(trace->output, "%-70s", ttrace->entry_str);
1750 	} else {
1751 		fprintf(trace->output, " ... [");
1752 		color_fprintf(trace->output, PERF_COLOR_YELLOW, "continued");
1753 		fprintf(trace->output, "]: %s()", sc->name);
1754 	}
1755 
1756 	if (sc->fmt == NULL) {
1757 		if (ret < 0)
1758 			goto errno_print;
1759 signed_print:
1760 		fprintf(trace->output, ") = %ld", ret);
1761 	} else if (ret < 0) {
1762 errno_print: {
1763 		char bf[STRERR_BUFSIZE];
1764 		const char *emsg = str_error_r(-ret, bf, sizeof(bf)),
1765 			   *e = errno_to_name(evsel, -ret);
1766 
1767 		fprintf(trace->output, ") = -1 %s %s", e, emsg);
1768 	}
1769 	} else if (ret == 0 && sc->fmt->timeout)
1770 		fprintf(trace->output, ") = 0 Timeout");
1771 	else if (ttrace->ret_scnprintf) {
1772 		char bf[1024];
1773 		struct syscall_arg arg = {
1774 			.val	= ret,
1775 			.thread	= thread,
1776 			.trace	= trace,
1777 		};
1778 		ttrace->ret_scnprintf(bf, sizeof(bf), &arg);
1779 		ttrace->ret_scnprintf = NULL;
1780 		fprintf(trace->output, ") = %s", bf);
1781 	} else if (sc->fmt->hexret)
1782 		fprintf(trace->output, ") = %#lx", ret);
1783 	else if (sc->fmt->errpid) {
1784 		struct thread *child = machine__find_thread(trace->host, ret, ret);
1785 
1786 		if (child != NULL) {
1787 			fprintf(trace->output, ") = %ld", ret);
1788 			if (child->comm_set)
1789 				fprintf(trace->output, " (%s)", thread__comm_str(child));
1790 			thread__put(child);
1791 		}
1792 	} else
1793 		goto signed_print;
1794 
1795 	fputc('\n', trace->output);
1796 
1797 	if (callchain_ret > 0)
1798 		trace__fprintf_callchain(trace, sample);
1799 	else if (callchain_ret < 0)
1800 		pr_err("Problem processing %s callchain, skipping...\n", perf_evsel__name(evsel));
1801 out:
1802 	ttrace->entry_pending = false;
1803 	err = 0;
1804 out_put:
1805 	thread__put(thread);
1806 	return err;
1807 }
1808 
1809 static int trace__vfs_getname(struct trace *trace, struct perf_evsel *evsel,
1810 			      union perf_event *event __maybe_unused,
1811 			      struct perf_sample *sample)
1812 {
1813 	struct thread *thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
1814 	struct thread_trace *ttrace;
1815 	size_t filename_len, entry_str_len, to_move;
1816 	ssize_t remaining_space;
1817 	char *pos;
1818 	const char *filename = perf_evsel__rawptr(evsel, sample, "pathname");
1819 
1820 	if (!thread)
1821 		goto out;
1822 
1823 	ttrace = thread__priv(thread);
1824 	if (!ttrace)
1825 		goto out_put;
1826 
1827 	filename_len = strlen(filename);
1828 	if (filename_len == 0)
1829 		goto out_put;
1830 
1831 	if (ttrace->filename.namelen < filename_len) {
1832 		char *f = realloc(ttrace->filename.name, filename_len + 1);
1833 
1834 		if (f == NULL)
1835 			goto out_put;
1836 
1837 		ttrace->filename.namelen = filename_len;
1838 		ttrace->filename.name = f;
1839 	}
1840 
1841 	strcpy(ttrace->filename.name, filename);
1842 	ttrace->filename.pending_open = true;
1843 
1844 	if (!ttrace->filename.ptr)
1845 		goto out_put;
1846 
1847 	entry_str_len = strlen(ttrace->entry_str);
1848 	remaining_space = trace__entry_str_size - entry_str_len - 1; /* \0 */
1849 	if (remaining_space <= 0)
1850 		goto out_put;
1851 
1852 	if (filename_len > (size_t)remaining_space) {
1853 		filename += filename_len - remaining_space;
1854 		filename_len = remaining_space;
1855 	}
1856 
1857 	to_move = entry_str_len - ttrace->filename.entry_str_pos + 1; /* \0 */
1858 	pos = ttrace->entry_str + ttrace->filename.entry_str_pos;
1859 	memmove(pos + filename_len, pos, to_move);
1860 	memcpy(pos, filename, filename_len);
1861 
1862 	ttrace->filename.ptr = 0;
1863 	ttrace->filename.entry_str_pos = 0;
1864 out_put:
1865 	thread__put(thread);
1866 out:
1867 	return 0;
1868 }
1869 
1870 static int trace__sched_stat_runtime(struct trace *trace, struct perf_evsel *evsel,
1871 				     union perf_event *event __maybe_unused,
1872 				     struct perf_sample *sample)
1873 {
1874         u64 runtime = perf_evsel__intval(evsel, sample, "runtime");
1875 	double runtime_ms = (double)runtime / NSEC_PER_MSEC;
1876 	struct thread *thread = machine__findnew_thread(trace->host,
1877 							sample->pid,
1878 							sample->tid);
1879 	struct thread_trace *ttrace = thread__trace(thread, trace->output);
1880 
1881 	if (ttrace == NULL)
1882 		goto out_dump;
1883 
1884 	ttrace->runtime_ms += runtime_ms;
1885 	trace->runtime_ms += runtime_ms;
1886 out_put:
1887 	thread__put(thread);
1888 	return 0;
1889 
1890 out_dump:
1891 	fprintf(trace->output, "%s: comm=%s,pid=%u,runtime=%" PRIu64 ",vruntime=%" PRIu64 ")\n",
1892 	       evsel->name,
1893 	       perf_evsel__strval(evsel, sample, "comm"),
1894 	       (pid_t)perf_evsel__intval(evsel, sample, "pid"),
1895 	       runtime,
1896 	       perf_evsel__intval(evsel, sample, "vruntime"));
1897 	goto out_put;
1898 }
1899 
1900 static int bpf_output__printer(enum binary_printer_ops op,
1901 			       unsigned int val, void *extra __maybe_unused, FILE *fp)
1902 {
1903 	unsigned char ch = (unsigned char)val;
1904 
1905 	switch (op) {
1906 	case BINARY_PRINT_CHAR_DATA:
1907 		return fprintf(fp, "%c", isprint(ch) ? ch : '.');
1908 	case BINARY_PRINT_DATA_BEGIN:
1909 	case BINARY_PRINT_LINE_BEGIN:
1910 	case BINARY_PRINT_ADDR:
1911 	case BINARY_PRINT_NUM_DATA:
1912 	case BINARY_PRINT_NUM_PAD:
1913 	case BINARY_PRINT_SEP:
1914 	case BINARY_PRINT_CHAR_PAD:
1915 	case BINARY_PRINT_LINE_END:
1916 	case BINARY_PRINT_DATA_END:
1917 	default:
1918 		break;
1919 	}
1920 
1921 	return 0;
1922 }
1923 
1924 static void bpf_output__fprintf(struct trace *trace,
1925 				struct perf_sample *sample)
1926 {
1927 	binary__fprintf(sample->raw_data, sample->raw_size, 8,
1928 			bpf_output__printer, NULL, trace->output);
1929 }
1930 
1931 static int trace__event_handler(struct trace *trace, struct perf_evsel *evsel,
1932 				union perf_event *event __maybe_unused,
1933 				struct perf_sample *sample)
1934 {
1935 	int callchain_ret = 0;
1936 
1937 	if (sample->callchain) {
1938 		callchain_ret = trace__resolve_callchain(trace, evsel, sample, &callchain_cursor);
1939 		if (callchain_ret == 0) {
1940 			if (callchain_cursor.nr < trace->min_stack)
1941 				goto out;
1942 			callchain_ret = 1;
1943 		}
1944 	}
1945 
1946 	trace__printf_interrupted_entry(trace);
1947 	trace__fprintf_tstamp(trace, sample->time, trace->output);
1948 
1949 	if (trace->trace_syscalls)
1950 		fprintf(trace->output, "(         ): ");
1951 
1952 	fprintf(trace->output, "%s:", evsel->name);
1953 
1954 	if (perf_evsel__is_bpf_output(evsel)) {
1955 		bpf_output__fprintf(trace, sample);
1956 	} else if (evsel->tp_format) {
1957 		event_format__fprintf(evsel->tp_format, sample->cpu,
1958 				      sample->raw_data, sample->raw_size,
1959 				      trace->output);
1960 	}
1961 
1962 	fprintf(trace->output, ")\n");
1963 
1964 	if (callchain_ret > 0)
1965 		trace__fprintf_callchain(trace, sample);
1966 	else if (callchain_ret < 0)
1967 		pr_err("Problem processing %s callchain, skipping...\n", perf_evsel__name(evsel));
1968 out:
1969 	return 0;
1970 }
1971 
1972 static void print_location(FILE *f, struct perf_sample *sample,
1973 			   struct addr_location *al,
1974 			   bool print_dso, bool print_sym)
1975 {
1976 
1977 	if ((verbose > 0 || print_dso) && al->map)
1978 		fprintf(f, "%s@", al->map->dso->long_name);
1979 
1980 	if ((verbose > 0 || print_sym) && al->sym)
1981 		fprintf(f, "%s+0x%" PRIx64, al->sym->name,
1982 			al->addr - al->sym->start);
1983 	else if (al->map)
1984 		fprintf(f, "0x%" PRIx64, al->addr);
1985 	else
1986 		fprintf(f, "0x%" PRIx64, sample->addr);
1987 }
1988 
1989 static int trace__pgfault(struct trace *trace,
1990 			  struct perf_evsel *evsel,
1991 			  union perf_event *event __maybe_unused,
1992 			  struct perf_sample *sample)
1993 {
1994 	struct thread *thread;
1995 	struct addr_location al;
1996 	char map_type = 'd';
1997 	struct thread_trace *ttrace;
1998 	int err = -1;
1999 	int callchain_ret = 0;
2000 
2001 	thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
2002 
2003 	if (sample->callchain) {
2004 		callchain_ret = trace__resolve_callchain(trace, evsel, sample, &callchain_cursor);
2005 		if (callchain_ret == 0) {
2006 			if (callchain_cursor.nr < trace->min_stack)
2007 				goto out_put;
2008 			callchain_ret = 1;
2009 		}
2010 	}
2011 
2012 	ttrace = thread__trace(thread, trace->output);
2013 	if (ttrace == NULL)
2014 		goto out_put;
2015 
2016 	if (evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ)
2017 		ttrace->pfmaj++;
2018 	else
2019 		ttrace->pfmin++;
2020 
2021 	if (trace->summary_only)
2022 		goto out;
2023 
2024 	thread__find_addr_location(thread, sample->cpumode, MAP__FUNCTION,
2025 			      sample->ip, &al);
2026 
2027 	trace__fprintf_entry_head(trace, thread, 0, true, sample->time, trace->output);
2028 
2029 	fprintf(trace->output, "%sfault [",
2030 		evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ ?
2031 		"maj" : "min");
2032 
2033 	print_location(trace->output, sample, &al, false, true);
2034 
2035 	fprintf(trace->output, "] => ");
2036 
2037 	thread__find_addr_location(thread, sample->cpumode, MAP__VARIABLE,
2038 				   sample->addr, &al);
2039 
2040 	if (!al.map) {
2041 		thread__find_addr_location(thread, sample->cpumode,
2042 					   MAP__FUNCTION, sample->addr, &al);
2043 
2044 		if (al.map)
2045 			map_type = 'x';
2046 		else
2047 			map_type = '?';
2048 	}
2049 
2050 	print_location(trace->output, sample, &al, true, false);
2051 
2052 	fprintf(trace->output, " (%c%c)\n", map_type, al.level);
2053 
2054 	if (callchain_ret > 0)
2055 		trace__fprintf_callchain(trace, sample);
2056 	else if (callchain_ret < 0)
2057 		pr_err("Problem processing %s callchain, skipping...\n", perf_evsel__name(evsel));
2058 out:
2059 	err = 0;
2060 out_put:
2061 	thread__put(thread);
2062 	return err;
2063 }
2064 
2065 static void trace__set_base_time(struct trace *trace,
2066 				 struct perf_evsel *evsel,
2067 				 struct perf_sample *sample)
2068 {
2069 	/*
2070 	 * BPF events were not setting PERF_SAMPLE_TIME, so be more robust
2071 	 * and don't use sample->time unconditionally, we may end up having
2072 	 * some other event in the future without PERF_SAMPLE_TIME for good
2073 	 * reason, i.e. we may not be interested in its timestamps, just in
2074 	 * it taking place, picking some piece of information when it
2075 	 * appears in our event stream (vfs_getname comes to mind).
2076 	 */
2077 	if (trace->base_time == 0 && !trace->full_time &&
2078 	    (evsel->attr.sample_type & PERF_SAMPLE_TIME))
2079 		trace->base_time = sample->time;
2080 }
2081 
2082 static int trace__process_sample(struct perf_tool *tool,
2083 				 union perf_event *event,
2084 				 struct perf_sample *sample,
2085 				 struct perf_evsel *evsel,
2086 				 struct machine *machine __maybe_unused)
2087 {
2088 	struct trace *trace = container_of(tool, struct trace, tool);
2089 	struct thread *thread;
2090 	int err = 0;
2091 
2092 	tracepoint_handler handler = evsel->handler;
2093 
2094 	thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
2095 	if (thread && thread__is_filtered(thread))
2096 		goto out;
2097 
2098 	trace__set_base_time(trace, evsel, sample);
2099 
2100 	if (handler) {
2101 		++trace->nr_events;
2102 		handler(trace, evsel, event, sample);
2103 	}
2104 out:
2105 	thread__put(thread);
2106 	return err;
2107 }
2108 
2109 static int trace__record(struct trace *trace, int argc, const char **argv)
2110 {
2111 	unsigned int rec_argc, i, j;
2112 	const char **rec_argv;
2113 	const char * const record_args[] = {
2114 		"record",
2115 		"-R",
2116 		"-m", "1024",
2117 		"-c", "1",
2118 	};
2119 
2120 	const char * const sc_args[] = { "-e", };
2121 	unsigned int sc_args_nr = ARRAY_SIZE(sc_args);
2122 	const char * const majpf_args[] = { "-e", "major-faults" };
2123 	unsigned int majpf_args_nr = ARRAY_SIZE(majpf_args);
2124 	const char * const minpf_args[] = { "-e", "minor-faults" };
2125 	unsigned int minpf_args_nr = ARRAY_SIZE(minpf_args);
2126 
2127 	/* +1 is for the event string below */
2128 	rec_argc = ARRAY_SIZE(record_args) + sc_args_nr + 1 +
2129 		majpf_args_nr + minpf_args_nr + argc;
2130 	rec_argv = calloc(rec_argc + 1, sizeof(char *));
2131 
2132 	if (rec_argv == NULL)
2133 		return -ENOMEM;
2134 
2135 	j = 0;
2136 	for (i = 0; i < ARRAY_SIZE(record_args); i++)
2137 		rec_argv[j++] = record_args[i];
2138 
2139 	if (trace->trace_syscalls) {
2140 		for (i = 0; i < sc_args_nr; i++)
2141 			rec_argv[j++] = sc_args[i];
2142 
2143 		/* event string may be different for older kernels - e.g., RHEL6 */
2144 		if (is_valid_tracepoint("raw_syscalls:sys_enter"))
2145 			rec_argv[j++] = "raw_syscalls:sys_enter,raw_syscalls:sys_exit";
2146 		else if (is_valid_tracepoint("syscalls:sys_enter"))
2147 			rec_argv[j++] = "syscalls:sys_enter,syscalls:sys_exit";
2148 		else {
2149 			pr_err("Neither raw_syscalls nor syscalls events exist.\n");
2150 			free(rec_argv);
2151 			return -1;
2152 		}
2153 	}
2154 
2155 	if (trace->trace_pgfaults & TRACE_PFMAJ)
2156 		for (i = 0; i < majpf_args_nr; i++)
2157 			rec_argv[j++] = majpf_args[i];
2158 
2159 	if (trace->trace_pgfaults & TRACE_PFMIN)
2160 		for (i = 0; i < minpf_args_nr; i++)
2161 			rec_argv[j++] = minpf_args[i];
2162 
2163 	for (i = 0; i < (unsigned int)argc; i++)
2164 		rec_argv[j++] = argv[i];
2165 
2166 	return cmd_record(j, rec_argv);
2167 }
2168 
/* Defined further down, needed by trace__run() and trace__replay() */
static size_t trace__fprintf_thread_summary(struct trace *trace,
					    FILE *fp);
2170 
2171 static bool perf_evlist__add_vfs_getname(struct perf_evlist *evlist)
2172 {
2173 	struct perf_evsel *evsel = perf_evsel__newtp("probe", "vfs_getname");
2174 
2175 	if (IS_ERR(evsel))
2176 		return false;
2177 
2178 	if (perf_evsel__field(evsel, "pathname") == NULL) {
2179 		perf_evsel__delete(evsel);
2180 		return false;
2181 	}
2182 
2183 	evsel->handler = trace__vfs_getname;
2184 	perf_evlist__add(evlist, evsel);
2185 	return true;
2186 }
2187 
2188 static struct perf_evsel *perf_evsel__new_pgfault(u64 config)
2189 {
2190 	struct perf_evsel *evsel;
2191 	struct perf_event_attr attr = {
2192 		.type = PERF_TYPE_SOFTWARE,
2193 		.mmap_data = 1,
2194 	};
2195 
2196 	attr.config = config;
2197 	attr.sample_period = 1;
2198 
2199 	event_attr_init(&attr);
2200 
2201 	evsel = perf_evsel__new(&attr);
2202 	if (evsel)
2203 		evsel->handler = trace__pgfault;
2204 
2205 	return evsel;
2206 }
2207 
2208 static void trace__handle_event(struct trace *trace, union perf_event *event, struct perf_sample *sample)
2209 {
2210 	const u32 type = event->header.type;
2211 	struct perf_evsel *evsel;
2212 
2213 	if (type != PERF_RECORD_SAMPLE) {
2214 		trace__process_event(trace, trace->host, event, sample);
2215 		return;
2216 	}
2217 
2218 	evsel = perf_evlist__id2evsel(trace->evlist, sample->id);
2219 	if (evsel == NULL) {
2220 		fprintf(trace->output, "Unknown tp ID %" PRIu64 ", skipping...\n", sample->id);
2221 		return;
2222 	}
2223 
2224 	trace__set_base_time(trace, evsel, sample);
2225 
2226 	if (evsel->attr.type == PERF_TYPE_TRACEPOINT &&
2227 	    sample->raw_data == NULL) {
2228 		fprintf(trace->output, "%s sample with no payload for tid: %d, cpu %d, raw_size=%d, skipping...\n",
2229 		       perf_evsel__name(evsel), sample->tid,
2230 		       sample->cpu, sample->raw_size);
2231 	} else {
2232 		tracepoint_handler handler = evsel->handler;
2233 		handler(trace, evsel, event, sample);
2234 	}
2235 }
2236 
2237 static int trace__add_syscall_newtp(struct trace *trace)
2238 {
2239 	int ret = -1;
2240 	struct perf_evlist *evlist = trace->evlist;
2241 	struct perf_evsel *sys_enter, *sys_exit;
2242 
2243 	sys_enter = perf_evsel__syscall_newtp("sys_enter", trace__sys_enter);
2244 	if (sys_enter == NULL)
2245 		goto out;
2246 
2247 	if (perf_evsel__init_sc_tp_ptr_field(sys_enter, args))
2248 		goto out_delete_sys_enter;
2249 
2250 	sys_exit = perf_evsel__syscall_newtp("sys_exit", trace__sys_exit);
2251 	if (sys_exit == NULL)
2252 		goto out_delete_sys_enter;
2253 
2254 	if (perf_evsel__init_sc_tp_uint_field(sys_exit, ret))
2255 		goto out_delete_sys_exit;
2256 
2257 	perf_evsel__config_callchain(sys_enter, &trace->opts, &callchain_param);
2258 	perf_evsel__config_callchain(sys_exit, &trace->opts, &callchain_param);
2259 
2260 	perf_evlist__add(evlist, sys_enter);
2261 	perf_evlist__add(evlist, sys_exit);
2262 
2263 	if (callchain_param.enabled && !trace->kernel_syscallchains) {
2264 		/*
2265 		 * We're interested only in the user space callchain
2266 		 * leading to the syscall, allow overriding that for
2267 		 * debugging reasons using --kernel_syscall_callchains
2268 		 */
2269 		sys_exit->attr.exclude_callchain_kernel = 1;
2270 	}
2271 
2272 	trace->syscalls.events.sys_enter = sys_enter;
2273 	trace->syscalls.events.sys_exit  = sys_exit;
2274 
2275 	ret = 0;
2276 out:
2277 	return ret;
2278 
2279 out_delete_sys_exit:
2280 	perf_evsel__delete_priv(sys_exit);
2281 out_delete_sys_enter:
2282 	perf_evsel__delete_priv(sys_enter);
2283 	goto out;
2284 }
2285 
2286 static int trace__set_ev_qualifier_filter(struct trace *trace)
2287 {
2288 	int err = -1;
2289 	struct perf_evsel *sys_exit;
2290 	char *filter = asprintf_expr_inout_ints("id", !trace->not_ev_qualifier,
2291 						trace->ev_qualifier_ids.nr,
2292 						trace->ev_qualifier_ids.entries);
2293 
2294 	if (filter == NULL)
2295 		goto out_enomem;
2296 
2297 	if (!perf_evsel__append_tp_filter(trace->syscalls.events.sys_enter,
2298 					  filter)) {
2299 		sys_exit = trace->syscalls.events.sys_exit;
2300 		err = perf_evsel__append_tp_filter(sys_exit, filter);
2301 	}
2302 
2303 	free(filter);
2304 out:
2305 	return err;
2306 out_enomem:
2307 	errno = ENOMEM;
2308 	goto out;
2309 }
2310 
2311 static int trace__set_filter_loop_pids(struct trace *trace)
2312 {
2313 	unsigned int nr = 1;
2314 	pid_t pids[32] = {
2315 		getpid(),
2316 	};
2317 	struct thread *thread = machine__find_thread(trace->host, pids[0], pids[0]);
2318 
2319 	while (thread && nr < ARRAY_SIZE(pids)) {
2320 		struct thread *parent = machine__find_thread(trace->host, thread->ppid, thread->ppid);
2321 
2322 		if (parent == NULL)
2323 			break;
2324 
2325 		if (!strcmp(thread__comm_str(parent), "sshd")) {
2326 			pids[nr++] = parent->tid;
2327 			break;
2328 		}
2329 		thread = parent;
2330 	}
2331 
2332 	return perf_evlist__set_filter_pids(trace->evlist, nr, pids);
2333 }
2334 
2335 static int trace__run(struct trace *trace, int argc, const char **argv)
2336 {
2337 	struct perf_evlist *evlist = trace->evlist;
2338 	struct perf_evsel *evsel, *pgfault_maj = NULL, *pgfault_min = NULL;
2339 	int err = -1, i;
2340 	unsigned long before;
2341 	const bool forks = argc > 0;
2342 	bool draining = false;
2343 
2344 	trace->live = true;
2345 
2346 	if (trace->trace_syscalls && trace__add_syscall_newtp(trace))
2347 		goto out_error_raw_syscalls;
2348 
2349 	if (trace->trace_syscalls)
2350 		trace->vfs_getname = perf_evlist__add_vfs_getname(evlist);
2351 
2352 	if ((trace->trace_pgfaults & TRACE_PFMAJ)) {
2353 		pgfault_maj = perf_evsel__new_pgfault(PERF_COUNT_SW_PAGE_FAULTS_MAJ);
2354 		if (pgfault_maj == NULL)
2355 			goto out_error_mem;
2356 		perf_evsel__config_callchain(pgfault_maj, &trace->opts, &callchain_param);
2357 		perf_evlist__add(evlist, pgfault_maj);
2358 	}
2359 
2360 	if ((trace->trace_pgfaults & TRACE_PFMIN)) {
2361 		pgfault_min = perf_evsel__new_pgfault(PERF_COUNT_SW_PAGE_FAULTS_MIN);
2362 		if (pgfault_min == NULL)
2363 			goto out_error_mem;
2364 		perf_evsel__config_callchain(pgfault_min, &trace->opts, &callchain_param);
2365 		perf_evlist__add(evlist, pgfault_min);
2366 	}
2367 
2368 	if (trace->sched &&
2369 	    perf_evlist__add_newtp(evlist, "sched", "sched_stat_runtime",
2370 				   trace__sched_stat_runtime))
2371 		goto out_error_sched_stat_runtime;
2372 
2373 	err = perf_evlist__create_maps(evlist, &trace->opts.target);
2374 	if (err < 0) {
2375 		fprintf(trace->output, "Problems parsing the target to trace, check your options!\n");
2376 		goto out_delete_evlist;
2377 	}
2378 
2379 	err = trace__symbols_init(trace, evlist);
2380 	if (err < 0) {
2381 		fprintf(trace->output, "Problems initializing symbol libraries!\n");
2382 		goto out_delete_evlist;
2383 	}
2384 
2385 	perf_evlist__config(evlist, &trace->opts, &callchain_param);
2386 
2387 	signal(SIGCHLD, sig_handler);
2388 	signal(SIGINT, sig_handler);
2389 
2390 	if (forks) {
2391 		err = perf_evlist__prepare_workload(evlist, &trace->opts.target,
2392 						    argv, false, NULL);
2393 		if (err < 0) {
2394 			fprintf(trace->output, "Couldn't run the workload!\n");
2395 			goto out_delete_evlist;
2396 		}
2397 	}
2398 
2399 	err = perf_evlist__open(evlist);
2400 	if (err < 0)
2401 		goto out_error_open;
2402 
2403 	err = bpf__apply_obj_config();
2404 	if (err) {
2405 		char errbuf[BUFSIZ];
2406 
2407 		bpf__strerror_apply_obj_config(err, errbuf, sizeof(errbuf));
2408 		pr_err("ERROR: Apply config to BPF failed: %s\n",
2409 			 errbuf);
2410 		goto out_error_open;
2411 	}
2412 
2413 	/*
2414 	 * Better not use !target__has_task() here because we need to cover the
2415 	 * case where no threads were specified in the command line, but a
2416 	 * workload was, and in that case we will fill in the thread_map when
2417 	 * we fork the workload in perf_evlist__prepare_workload.
2418 	 */
2419 	if (trace->filter_pids.nr > 0)
2420 		err = perf_evlist__set_filter_pids(evlist, trace->filter_pids.nr, trace->filter_pids.entries);
2421 	else if (thread_map__pid(evlist->threads, 0) == -1)
2422 		err = trace__set_filter_loop_pids(trace);
2423 
2424 	if (err < 0)
2425 		goto out_error_mem;
2426 
2427 	if (trace->ev_qualifier_ids.nr > 0) {
2428 		err = trace__set_ev_qualifier_filter(trace);
2429 		if (err < 0)
2430 			goto out_errno;
2431 
2432 		pr_debug("event qualifier tracepoint filter: %s\n",
2433 			 trace->syscalls.events.sys_exit->filter);
2434 	}
2435 
2436 	err = perf_evlist__apply_filters(evlist, &evsel);
2437 	if (err < 0)
2438 		goto out_error_apply_filters;
2439 
2440 	err = perf_evlist__mmap(evlist, trace->opts.mmap_pages);
2441 	if (err < 0)
2442 		goto out_error_mmap;
2443 
2444 	if (!target__none(&trace->opts.target) && !trace->opts.initial_delay)
2445 		perf_evlist__enable(evlist);
2446 
2447 	if (forks)
2448 		perf_evlist__start_workload(evlist);
2449 
2450 	if (trace->opts.initial_delay) {
2451 		usleep(trace->opts.initial_delay * 1000);
2452 		perf_evlist__enable(evlist);
2453 	}
2454 
2455 	trace->multiple_threads = thread_map__pid(evlist->threads, 0) == -1 ||
2456 				  evlist->threads->nr > 1 ||
2457 				  perf_evlist__first(evlist)->attr.inherit;
2458 
2459 	/*
2460 	 * Now that we already used evsel->attr to ask the kernel to setup the
2461 	 * events, lets reuse evsel->attr.sample_max_stack as the limit in
2462 	 * trace__resolve_callchain(), allowing per-event max-stack settings
2463 	 * to override an explicitely set --max-stack global setting.
2464 	 */
2465 	evlist__for_each_entry(evlist, evsel) {
2466 		if ((evsel->attr.sample_type & PERF_SAMPLE_CALLCHAIN) &&
2467 		    evsel->attr.sample_max_stack == 0)
2468 			evsel->attr.sample_max_stack = trace->max_stack;
2469 	}
2470 again:
2471 	before = trace->nr_events;
2472 
2473 	for (i = 0; i < evlist->nr_mmaps; i++) {
2474 		union perf_event *event;
2475 
2476 		while ((event = perf_evlist__mmap_read(evlist, i)) != NULL) {
2477 			struct perf_sample sample;
2478 
2479 			++trace->nr_events;
2480 
2481 			err = perf_evlist__parse_sample(evlist, event, &sample);
2482 			if (err) {
2483 				fprintf(trace->output, "Can't parse sample, err = %d, skipping...\n", err);
2484 				goto next_event;
2485 			}
2486 
2487 			trace__handle_event(trace, event, &sample);
2488 next_event:
2489 			perf_evlist__mmap_consume(evlist, i);
2490 
2491 			if (interrupted)
2492 				goto out_disable;
2493 
2494 			if (done && !draining) {
2495 				perf_evlist__disable(evlist);
2496 				draining = true;
2497 			}
2498 		}
2499 	}
2500 
2501 	if (trace->nr_events == before) {
2502 		int timeout = done ? 100 : -1;
2503 
2504 		if (!draining && perf_evlist__poll(evlist, timeout) > 0) {
2505 			if (perf_evlist__filter_pollfd(evlist, POLLERR | POLLHUP) == 0)
2506 				draining = true;
2507 
2508 			goto again;
2509 		}
2510 	} else {
2511 		goto again;
2512 	}
2513 
2514 out_disable:
2515 	thread__zput(trace->current);
2516 
2517 	perf_evlist__disable(evlist);
2518 
2519 	if (!err) {
2520 		if (trace->summary)
2521 			trace__fprintf_thread_summary(trace, trace->output);
2522 
2523 		if (trace->show_tool_stats) {
2524 			fprintf(trace->output, "Stats:\n "
2525 					       " vfs_getname : %" PRIu64 "\n"
2526 					       " proc_getname: %" PRIu64 "\n",
2527 				trace->stats.vfs_getname,
2528 				trace->stats.proc_getname);
2529 		}
2530 	}
2531 
2532 out_delete_evlist:
2533 	trace__symbols__exit(trace);
2534 
2535 	perf_evlist__delete(evlist);
2536 	trace->evlist = NULL;
2537 	trace->live = false;
2538 	return err;
2539 {
2540 	char errbuf[BUFSIZ];
2541 
2542 out_error_sched_stat_runtime:
2543 	tracing_path__strerror_open_tp(errno, errbuf, sizeof(errbuf), "sched", "sched_stat_runtime");
2544 	goto out_error;
2545 
2546 out_error_raw_syscalls:
2547 	tracing_path__strerror_open_tp(errno, errbuf, sizeof(errbuf), "raw_syscalls", "sys_(enter|exit)");
2548 	goto out_error;
2549 
2550 out_error_mmap:
2551 	perf_evlist__strerror_mmap(evlist, errno, errbuf, sizeof(errbuf));
2552 	goto out_error;
2553 
2554 out_error_open:
2555 	perf_evlist__strerror_open(evlist, errno, errbuf, sizeof(errbuf));
2556 
2557 out_error:
2558 	fprintf(trace->output, "%s\n", errbuf);
2559 	goto out_delete_evlist;
2560 
2561 out_error_apply_filters:
2562 	fprintf(trace->output,
2563 		"Failed to set filter \"%s\" on event %s with %d (%s)\n",
2564 		evsel->filter, perf_evsel__name(evsel), errno,
2565 		str_error_r(errno, errbuf, sizeof(errbuf)));
2566 	goto out_delete_evlist;
2567 }
2568 out_error_mem:
2569 	fprintf(trace->output, "Not enough memory to run!\n");
2570 	goto out_delete_evlist;
2571 
2572 out_errno:
2573 	fprintf(trace->output, "errno=%d,%s\n", errno, strerror(errno));
2574 	goto out_delete_evlist;
2575 }
2576 
2577 static int trace__replay(struct trace *trace)
2578 {
2579 	const struct perf_evsel_str_handler handlers[] = {
2580 		{ "probe:vfs_getname",	     trace__vfs_getname, },
2581 	};
2582 	struct perf_data data = {
2583 		.file      = {
2584 			.path = input_name,
2585 		},
2586 		.mode      = PERF_DATA_MODE_READ,
2587 		.force     = trace->force,
2588 	};
2589 	struct perf_session *session;
2590 	struct perf_evsel *evsel;
2591 	int err = -1;
2592 
2593 	trace->tool.sample	  = trace__process_sample;
2594 	trace->tool.mmap	  = perf_event__process_mmap;
2595 	trace->tool.mmap2	  = perf_event__process_mmap2;
2596 	trace->tool.comm	  = perf_event__process_comm;
2597 	trace->tool.exit	  = perf_event__process_exit;
2598 	trace->tool.fork	  = perf_event__process_fork;
2599 	trace->tool.attr	  = perf_event__process_attr;
2600 	trace->tool.tracing_data  = perf_event__process_tracing_data;
2601 	trace->tool.build_id	  = perf_event__process_build_id;
2602 	trace->tool.namespaces	  = perf_event__process_namespaces;
2603 
2604 	trace->tool.ordered_events = true;
2605 	trace->tool.ordering_requires_timestamps = true;
2606 
2607 	/* add tid to output */
2608 	trace->multiple_threads = true;
2609 
2610 	session = perf_session__new(&data, false, &trace->tool);
2611 	if (session == NULL)
2612 		return -1;
2613 
2614 	if (trace->opts.target.pid)
2615 		symbol_conf.pid_list_str = strdup(trace->opts.target.pid);
2616 
2617 	if (trace->opts.target.tid)
2618 		symbol_conf.tid_list_str = strdup(trace->opts.target.tid);
2619 
2620 	if (symbol__init(&session->header.env) < 0)
2621 		goto out;
2622 
2623 	trace->host = &session->machines.host;
2624 
2625 	err = perf_session__set_tracepoints_handlers(session, handlers);
2626 	if (err)
2627 		goto out;
2628 
2629 	evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
2630 						     "raw_syscalls:sys_enter");
2631 	/* older kernels have syscalls tp versus raw_syscalls */
2632 	if (evsel == NULL)
2633 		evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
2634 							     "syscalls:sys_enter");
2635 
2636 	if (evsel &&
2637 	    (perf_evsel__init_syscall_tp(evsel, trace__sys_enter) < 0 ||
2638 	    perf_evsel__init_sc_tp_ptr_field(evsel, args))) {
2639 		pr_err("Error during initialize raw_syscalls:sys_enter event\n");
2640 		goto out;
2641 	}
2642 
2643 	evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
2644 						     "raw_syscalls:sys_exit");
2645 	if (evsel == NULL)
2646 		evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
2647 							     "syscalls:sys_exit");
2648 	if (evsel &&
2649 	    (perf_evsel__init_syscall_tp(evsel, trace__sys_exit) < 0 ||
2650 	    perf_evsel__init_sc_tp_uint_field(evsel, ret))) {
2651 		pr_err("Error during initialize raw_syscalls:sys_exit event\n");
2652 		goto out;
2653 	}
2654 
2655 	evlist__for_each_entry(session->evlist, evsel) {
2656 		if (evsel->attr.type == PERF_TYPE_SOFTWARE &&
2657 		    (evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ ||
2658 		     evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MIN ||
2659 		     evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS))
2660 			evsel->handler = trace__pgfault;
2661 	}
2662 
2663 	setup_pager();
2664 
2665 	err = perf_session__process_events(session);
2666 	if (err)
2667 		pr_err("Failed to process events, error %d", err);
2668 
2669 	else if (trace->summary)
2670 		trace__fprintf_thread_summary(trace, trace->output);
2671 
2672 out:
2673 	perf_session__delete(session);
2674 
2675 	return err;
2676 }
2677 
/*
 * Print the heading of the per thread summary to @fp.
 *
 * Returns the value returned by fprintf(), converted to size_t.
 */
static size_t trace__fprintf_threads_header(FILE *fp)
{
	return fprintf(fp, "\n Summary of events:\n\n");
}
2686 
2687 DEFINE_RESORT_RB(syscall_stats, a->msecs > b->msecs,
2688 	struct stats 	*stats;
2689 	double		msecs;
2690 	int		syscall;
2691 )
2692 {
2693 	struct int_node *source = rb_entry(nd, struct int_node, rb_node);
2694 	struct stats *stats = source->priv;
2695 
2696 	entry->syscall = source->i;
2697 	entry->stats   = stats;
2698 	entry->msecs   = stats ? (u64)stats->n * (avg_stats(stats) / NSEC_PER_MSEC) : 0;
2699 }
2700 
2701 static size_t thread__dump_stats(struct thread_trace *ttrace,
2702 				 struct trace *trace, FILE *fp)
2703 {
2704 	size_t printed = 0;
2705 	struct syscall *sc;
2706 	struct rb_node *nd;
2707 	DECLARE_RESORT_RB_INTLIST(syscall_stats, ttrace->syscall_stats);
2708 
2709 	if (syscall_stats == NULL)
2710 		return 0;
2711 
2712 	printed += fprintf(fp, "\n");
2713 
2714 	printed += fprintf(fp, "   syscall            calls    total       min       avg       max      stddev\n");
2715 	printed += fprintf(fp, "                               (msec)    (msec)    (msec)    (msec)        (%%)\n");
2716 	printed += fprintf(fp, "   --------------- -------- --------- --------- --------- ---------     ------\n");
2717 
2718 	resort_rb__for_each_entry(nd, syscall_stats) {
2719 		struct stats *stats = syscall_stats_entry->stats;
2720 		if (stats) {
2721 			double min = (double)(stats->min) / NSEC_PER_MSEC;
2722 			double max = (double)(stats->max) / NSEC_PER_MSEC;
2723 			double avg = avg_stats(stats);
2724 			double pct;
2725 			u64 n = (u64) stats->n;
2726 
2727 			pct = avg ? 100.0 * stddev_stats(stats)/avg : 0.0;
2728 			avg /= NSEC_PER_MSEC;
2729 
2730 			sc = &trace->syscalls.table[syscall_stats_entry->syscall];
2731 			printed += fprintf(fp, "   %-15s", sc->name);
2732 			printed += fprintf(fp, " %8" PRIu64 " %9.3f %9.3f %9.3f",
2733 					   n, syscall_stats_entry->msecs, min, avg);
2734 			printed += fprintf(fp, " %9.3f %9.2f%%\n", max, pct);
2735 		}
2736 	}
2737 
2738 	resort_rb__delete(syscall_stats);
2739 	printed += fprintf(fp, "\n\n");
2740 
2741 	return printed;
2742 }
2743 
2744 static size_t trace__fprintf_thread(FILE *fp, struct thread *thread, struct trace *trace)
2745 {
2746 	size_t printed = 0;
2747 	struct thread_trace *ttrace = thread__priv(thread);
2748 	double ratio;
2749 
2750 	if (ttrace == NULL)
2751 		return 0;
2752 
2753 	ratio = (double)ttrace->nr_events / trace->nr_events * 100.0;
2754 
2755 	printed += fprintf(fp, " %s (%d), ", thread__comm_str(thread), thread->tid);
2756 	printed += fprintf(fp, "%lu events, ", ttrace->nr_events);
2757 	printed += fprintf(fp, "%.1f%%", ratio);
2758 	if (ttrace->pfmaj)
2759 		printed += fprintf(fp, ", %lu majfaults", ttrace->pfmaj);
2760 	if (ttrace->pfmin)
2761 		printed += fprintf(fp, ", %lu minfaults", ttrace->pfmin);
2762 	if (trace->sched)
2763 		printed += fprintf(fp, ", %.3f msec\n", ttrace->runtime_ms);
2764 	else if (fputc('\n', fp) != EOF)
2765 		++printed;
2766 
2767 	printed += thread__dump_stats(ttrace, trace, fp);
2768 
2769 	return printed;
2770 }
2771 
2772 static unsigned long thread__nr_events(struct thread_trace *ttrace)
2773 {
2774 	return ttrace ? ttrace->nr_events : 0;
2775 }
2776 
2777 DEFINE_RESORT_RB(threads, (thread__nr_events(a->thread->priv) < thread__nr_events(b->thread->priv)),
2778 	struct thread *thread;
2779 )
2780 {
2781 	entry->thread = rb_entry(nd, struct thread, rb_node);
2782 }
2783 
2784 static size_t trace__fprintf_thread_summary(struct trace *trace, FILE *fp)
2785 {
2786 	size_t printed = trace__fprintf_threads_header(fp);
2787 	struct rb_node *nd;
2788 	int i;
2789 
2790 	for (i = 0; i < THREADS__TABLE_SIZE; i++) {
2791 		DECLARE_RESORT_RB_MACHINE_THREADS(threads, trace->host, i);
2792 
2793 		if (threads == NULL) {
2794 			fprintf(fp, "%s", "Error sorting output by nr_events!\n");
2795 			return 0;
2796 		}
2797 
2798 		resort_rb__for_each_entry(nd, threads)
2799 			printed += trace__fprintf_thread(fp, threads_entry->thread, trace);
2800 
2801 		resort_rb__delete(threads);
2802 	}
2803 	return printed;
2804 }
2805 
2806 static int trace__set_duration(const struct option *opt, const char *str,
2807 			       int unset __maybe_unused)
2808 {
2809 	struct trace *trace = opt->value;
2810 
2811 	trace->duration_filter = atof(str);
2812 	return 0;
2813 }
2814 
2815 static int trace__set_filter_pids(const struct option *opt, const char *str,
2816 				  int unset __maybe_unused)
2817 {
2818 	int ret = -1;
2819 	size_t i;
2820 	struct trace *trace = opt->value;
2821 	/*
2822 	 * FIXME: introduce a intarray class, plain parse csv and create a
2823 	 * { int nr, int entries[] } struct...
2824 	 */
2825 	struct intlist *list = intlist__new(str);
2826 
2827 	if (list == NULL)
2828 		return -1;
2829 
2830 	i = trace->filter_pids.nr = intlist__nr_entries(list) + 1;
2831 	trace->filter_pids.entries = calloc(i, sizeof(pid_t));
2832 
2833 	if (trace->filter_pids.entries == NULL)
2834 		goto out;
2835 
2836 	trace->filter_pids.entries[0] = getpid();
2837 
2838 	for (i = 1; i < trace->filter_pids.nr; ++i)
2839 		trace->filter_pids.entries[i] = intlist__entry(list, i - 1)->i;
2840 
2841 	intlist__delete(list);
2842 	ret = 0;
2843 out:
2844 	return ret;
2845 }
2846 
2847 static int trace__open_output(struct trace *trace, const char *filename)
2848 {
2849 	struct stat st;
2850 
2851 	if (!stat(filename, &st) && st.st_size) {
2852 		char oldname[PATH_MAX];
2853 
2854 		scnprintf(oldname, sizeof(oldname), "%s.old", filename);
2855 		unlink(oldname);
2856 		rename(filename, oldname);
2857 	}
2858 
2859 	trace->output = fopen(filename, "w");
2860 
2861 	return trace->output == NULL ? -errno : 0;
2862 }
2863 
2864 static int parse_pagefaults(const struct option *opt, const char *str,
2865 			    int unset __maybe_unused)
2866 {
2867 	int *trace_pgfaults = opt->value;
2868 
2869 	if (strcmp(str, "all") == 0)
2870 		*trace_pgfaults |= TRACE_PFMAJ | TRACE_PFMIN;
2871 	else if (strcmp(str, "maj") == 0)
2872 		*trace_pgfaults |= TRACE_PFMAJ;
2873 	else if (strcmp(str, "min") == 0)
2874 		*trace_pgfaults |= TRACE_PFMIN;
2875 	else
2876 		return -1;
2877 
2878 	return 0;
2879 }
2880 
2881 static void evlist__set_evsel_handler(struct perf_evlist *evlist, void *handler)
2882 {
2883 	struct perf_evsel *evsel;
2884 
2885 	evlist__for_each_entry(evlist, evsel)
2886 		evsel->handler = handler;
2887 }
2888 
2889 /*
2890  * XXX: Hackish, just splitting the combined -e+--event (syscalls
2891  * (raw_syscalls:{sys_{enter,exit}} + events (tracepoints, HW, SW, etc) to use
2892  * existing facilities unchanged (trace->ev_qualifier + parse_options()).
2893  *
2894  * It'd be better to introduce a parse_options() variant that would return a
2895  * list with the terms it didn't match to an event...
2896  */
2897 static int trace__parse_events_option(const struct option *opt, const char *str,
2898 				      int unset __maybe_unused)
2899 {
2900 	struct trace *trace = (struct trace *)opt->value;
2901 	const char *s = str;
2902 	char *sep = NULL, *lists[2] = { NULL, NULL, };
2903 	int len = strlen(str) + 1, err = -1, list, idx;
2904 	char *strace_groups_dir = system_path(STRACE_GROUPS_DIR);
2905 	char group_name[PATH_MAX];
2906 
2907 	if (strace_groups_dir == NULL)
2908 		return -1;
2909 
2910 	if (*s == '!') {
2911 		++s;
2912 		trace->not_ev_qualifier = true;
2913 	}
2914 
2915 	while (1) {
2916 		if ((sep = strchr(s, ',')) != NULL)
2917 			*sep = '\0';
2918 
2919 		list = 0;
2920 		if (syscalltbl__id(trace->sctbl, s) >= 0 ||
2921 		    syscalltbl__strglobmatch_first(trace->sctbl, s, &idx) >= 0) {
2922 			list = 1;
2923 		} else {
2924 			path__join(group_name, sizeof(group_name), strace_groups_dir, s);
2925 			if (access(group_name, R_OK) == 0)
2926 				list = 1;
2927 		}
2928 
2929 		if (lists[list]) {
2930 			sprintf(lists[list] + strlen(lists[list]), ",%s", s);
2931 		} else {
2932 			lists[list] = malloc(len);
2933 			if (lists[list] == NULL)
2934 				goto out;
2935 			strcpy(lists[list], s);
2936 		}
2937 
2938 		if (!sep)
2939 			break;
2940 
2941 		*sep = ',';
2942 		s = sep + 1;
2943 	}
2944 
2945 	if (lists[1] != NULL) {
2946 		struct strlist_config slist_config = {
2947 			.dirname = strace_groups_dir,
2948 		};
2949 
2950 		trace->ev_qualifier = strlist__new(lists[1], &slist_config);
2951 		if (trace->ev_qualifier == NULL) {
2952 			fputs("Not enough memory to parse event qualifier", trace->output);
2953 			goto out;
2954 		}
2955 
2956 		if (trace__validate_ev_qualifier(trace))
2957 			goto out;
2958 	}
2959 
2960 	err = 0;
2961 
2962 	if (lists[0]) {
2963 		struct option o = OPT_CALLBACK('e', "event", &trace->evlist, "event",
2964 					       "event selector. use 'perf list' to list available events",
2965 					       parse_events_option);
2966 		err = parse_events_option(&o, lists[0], 0);
2967 	}
2968 out:
2969 	if (sep)
2970 		*sep = ',';
2971 
2972 	return err;
2973 }
2974 
2975 int cmd_trace(int argc, const char **argv)
2976 {
2977 	const char *trace_usage[] = {
2978 		"perf trace [<options>] [<command>]",
2979 		"perf trace [<options>] -- <command> [<options>]",
2980 		"perf trace record [<options>] [<command>]",
2981 		"perf trace record [<options>] -- <command> [<options>]",
2982 		NULL
2983 	};
2984 	struct trace trace = {
2985 		.syscalls = {
2986 			. max = -1,
2987 		},
2988 		.opts = {
2989 			.target = {
2990 				.uid	   = UINT_MAX,
2991 				.uses_mmap = true,
2992 			},
2993 			.user_freq     = UINT_MAX,
2994 			.user_interval = ULLONG_MAX,
2995 			.no_buffering  = true,
2996 			.mmap_pages    = UINT_MAX,
2997 			.proc_map_timeout  = 500,
2998 		},
2999 		.output = stderr,
3000 		.show_comm = true,
3001 		.trace_syscalls = true,
3002 		.kernel_syscallchains = false,
3003 		.max_stack = UINT_MAX,
3004 	};
3005 	const char *output_name = NULL;
3006 	const struct option trace_options[] = {
3007 	OPT_CALLBACK('e', "event", &trace, "event",
3008 		     "event/syscall selector. use 'perf list' to list available events",
3009 		     trace__parse_events_option),
3010 	OPT_BOOLEAN(0, "comm", &trace.show_comm,
3011 		    "show the thread COMM next to its id"),
3012 	OPT_BOOLEAN(0, "tool_stats", &trace.show_tool_stats, "show tool stats"),
3013 	OPT_CALLBACK(0, "expr", &trace, "expr", "list of syscalls/events to trace",
3014 		     trace__parse_events_option),
3015 	OPT_STRING('o', "output", &output_name, "file", "output file name"),
3016 	OPT_STRING('i', "input", &input_name, "file", "Analyze events in file"),
3017 	OPT_STRING('p', "pid", &trace.opts.target.pid, "pid",
3018 		    "trace events on existing process id"),
3019 	OPT_STRING('t', "tid", &trace.opts.target.tid, "tid",
3020 		    "trace events on existing thread id"),
3021 	OPT_CALLBACK(0, "filter-pids", &trace, "CSV list of pids",
3022 		     "pids to filter (by the kernel)", trace__set_filter_pids),
3023 	OPT_BOOLEAN('a', "all-cpus", &trace.opts.target.system_wide,
3024 		    "system-wide collection from all CPUs"),
3025 	OPT_STRING('C', "cpu", &trace.opts.target.cpu_list, "cpu",
3026 		    "list of cpus to monitor"),
3027 	OPT_BOOLEAN(0, "no-inherit", &trace.opts.no_inherit,
3028 		    "child tasks do not inherit counters"),
3029 	OPT_CALLBACK('m', "mmap-pages", &trace.opts.mmap_pages, "pages",
3030 		     "number of mmap data pages",
3031 		     perf_evlist__parse_mmap_pages),
3032 	OPT_STRING('u', "uid", &trace.opts.target.uid_str, "user",
3033 		   "user to profile"),
3034 	OPT_CALLBACK(0, "duration", &trace, "float",
3035 		     "show only events with duration > N.M ms",
3036 		     trace__set_duration),
3037 	OPT_BOOLEAN(0, "sched", &trace.sched, "show blocking scheduler events"),
3038 	OPT_INCR('v', "verbose", &verbose, "be more verbose"),
3039 	OPT_BOOLEAN('T', "time", &trace.full_time,
3040 		    "Show full timestamp, not time relative to first start"),
3041 	OPT_BOOLEAN('s', "summary", &trace.summary_only,
3042 		    "Show only syscall summary with statistics"),
3043 	OPT_BOOLEAN('S', "with-summary", &trace.summary,
3044 		    "Show all syscalls and summary with statistics"),
3045 	OPT_CALLBACK_DEFAULT('F', "pf", &trace.trace_pgfaults, "all|maj|min",
3046 		     "Trace pagefaults", parse_pagefaults, "maj"),
3047 	OPT_BOOLEAN(0, "syscalls", &trace.trace_syscalls, "Trace syscalls"),
3048 	OPT_BOOLEAN('f', "force", &trace.force, "don't complain, do it"),
3049 	OPT_CALLBACK(0, "call-graph", &trace.opts,
3050 		     "record_mode[,record_size]", record_callchain_help,
3051 		     &record_parse_callchain_opt),
3052 	OPT_BOOLEAN(0, "kernel-syscall-graph", &trace.kernel_syscallchains,
3053 		    "Show the kernel callchains on the syscall exit path"),
3054 	OPT_UINTEGER(0, "min-stack", &trace.min_stack,
3055 		     "Set the minimum stack depth when parsing the callchain, "
3056 		     "anything below the specified depth will be ignored."),
3057 	OPT_UINTEGER(0, "max-stack", &trace.max_stack,
3058 		     "Set the maximum stack depth when parsing the callchain, "
3059 		     "anything beyond the specified depth will be ignored. "
3060 		     "Default: kernel.perf_event_max_stack or " __stringify(PERF_MAX_STACK_DEPTH)),
3061 	OPT_BOOLEAN(0, "print-sample", &trace.print_sample,
3062 			"print the PERF_RECORD_SAMPLE PERF_SAMPLE_ info, for debugging"),
3063 	OPT_UINTEGER(0, "proc-map-timeout", &trace.opts.proc_map_timeout,
3064 			"per thread proc mmap processing timeout in ms"),
3065 	OPT_UINTEGER('D', "delay", &trace.opts.initial_delay,
3066 		     "ms to wait before starting measurement after program "
3067 		     "start"),
3068 	OPT_END()
3069 	};
3070 	bool __maybe_unused max_stack_user_set = true;
3071 	bool mmap_pages_user_set = true;
3072 	const char * const trace_subcommands[] = { "record", NULL };
3073 	int err;
3074 	char bf[BUFSIZ];
3075 
3076 	signal(SIGSEGV, sighandler_dump_stack);
3077 	signal(SIGFPE, sighandler_dump_stack);
3078 
3079 	trace.evlist = perf_evlist__new();
3080 	trace.sctbl = syscalltbl__new();
3081 
3082 	if (trace.evlist == NULL || trace.sctbl == NULL) {
3083 		pr_err("Not enough memory to run!\n");
3084 		err = -ENOMEM;
3085 		goto out;
3086 	}
3087 
3088 	argc = parse_options_subcommand(argc, argv, trace_options, trace_subcommands,
3089 				 trace_usage, PARSE_OPT_STOP_AT_NON_OPTION);
3090 
3091 	err = bpf__setup_stdout(trace.evlist);
3092 	if (err) {
3093 		bpf__strerror_setup_stdout(trace.evlist, err, bf, sizeof(bf));
3094 		pr_err("ERROR: Setup BPF stdout failed: %s\n", bf);
3095 		goto out;
3096 	}
3097 
3098 	err = -1;
3099 
3100 	if (trace.trace_pgfaults) {
3101 		trace.opts.sample_address = true;
3102 		trace.opts.sample_time = true;
3103 	}
3104 
3105 	if (trace.opts.mmap_pages == UINT_MAX)
3106 		mmap_pages_user_set = false;
3107 
3108 	if (trace.max_stack == UINT_MAX) {
3109 		trace.max_stack = input_name ? PERF_MAX_STACK_DEPTH : sysctl_perf_event_max_stack;
3110 		max_stack_user_set = false;
3111 	}
3112 
3113 #ifdef HAVE_DWARF_UNWIND_SUPPORT
3114 	if ((trace.min_stack || max_stack_user_set) && !callchain_param.enabled) {
3115 		record_opts__parse_callchain(&trace.opts, &callchain_param, "dwarf", false);
3116 	}
3117 #endif
3118 
3119 	if (callchain_param.enabled) {
3120 		if (!mmap_pages_user_set && geteuid() == 0)
3121 			trace.opts.mmap_pages = perf_event_mlock_kb_in_pages() * 4;
3122 
3123 		symbol_conf.use_callchain = true;
3124 	}
3125 
3126 	if (trace.evlist->nr_entries > 0)
3127 		evlist__set_evsel_handler(trace.evlist, trace__event_handler);
3128 
3129 	if ((argc >= 1) && (strcmp(argv[0], "record") == 0))
3130 		return trace__record(&trace, argc-1, &argv[1]);
3131 
3132 	/* summary_only implies summary option, but don't overwrite summary if set */
3133 	if (trace.summary_only)
3134 		trace.summary = trace.summary_only;
3135 
3136 	if (!trace.trace_syscalls && !trace.trace_pgfaults &&
3137 	    trace.evlist->nr_entries == 0 /* Was --events used? */) {
3138 		pr_err("Please specify something to trace.\n");
3139 		return -1;
3140 	}
3141 
3142 	if (!trace.trace_syscalls && trace.ev_qualifier) {
3143 		pr_err("The -e option can't be used with --no-syscalls.\n");
3144 		goto out;
3145 	}
3146 
3147 	if (output_name != NULL) {
3148 		err = trace__open_output(&trace, output_name);
3149 		if (err < 0) {
3150 			perror("failed to create output file");
3151 			goto out;
3152 		}
3153 	}
3154 
3155 	trace.open_id = syscalltbl__id(trace.sctbl, "open");
3156 
3157 	err = target__validate(&trace.opts.target);
3158 	if (err) {
3159 		target__strerror(&trace.opts.target, err, bf, sizeof(bf));
3160 		fprintf(trace.output, "%s", bf);
3161 		goto out_close;
3162 	}
3163 
3164 	err = target__parse_uid(&trace.opts.target);
3165 	if (err) {
3166 		target__strerror(&trace.opts.target, err, bf, sizeof(bf));
3167 		fprintf(trace.output, "%s", bf);
3168 		goto out_close;
3169 	}
3170 
3171 	if (!argc && target__none(&trace.opts.target))
3172 		trace.opts.target.system_wide = true;
3173 
3174 	if (input_name)
3175 		err = trace__replay(&trace);
3176 	else
3177 		err = trace__run(&trace, argc, argv);
3178 
3179 out_close:
3180 	if (output_name != NULL)
3181 		fclose(trace.output);
3182 out:
3183 	return err;
3184 }
3185