xref: /openbmc/linux/tools/perf/builtin-trace.c (revision a99237af)
1 /*
2  * builtin-trace.c
3  *
4  * Builtin 'trace' command:
5  *
6  * Display a continuously updated trace of any workload, CPU, specific PID,
7  * system wide, etc.  Default format is loosely strace like, but any other
8  * event may be specified using --event.
9  *
10  * Copyright (C) 2012, 2013, 2014, 2015 Red Hat Inc, Arnaldo Carvalho de Melo <acme@redhat.com>
11  *
12  * Initially based on the 'trace' prototype by Thomas Gleixner:
13  *
14  * http://lwn.net/Articles/415728/ ("Announcing a new utility: 'trace'")
15  *
16  * Released under the GPL v2. (and only v2, not any later version)
17  */
18 
19 #include <traceevent/event-parse.h>
20 #include <api/fs/tracing_path.h>
21 #include "builtin.h"
22 #include "util/cgroup.h"
23 #include "util/color.h"
24 #include "util/debug.h"
25 #include "util/env.h"
26 #include "util/event.h"
27 #include "util/evlist.h"
28 #include <subcmd/exec-cmd.h>
29 #include "util/machine.h"
30 #include "util/path.h"
31 #include "util/session.h"
32 #include "util/thread.h"
33 #include <subcmd/parse-options.h>
34 #include "util/strlist.h"
35 #include "util/intlist.h"
36 #include "util/thread_map.h"
37 #include "util/stat.h"
38 #include "trace/beauty/beauty.h"
39 #include "trace-event.h"
40 #include "util/parse-events.h"
41 #include "util/bpf-loader.h"
42 #include "callchain.h"
43 #include "print_binary.h"
44 #include "string2.h"
45 #include "syscalltbl.h"
46 #include "rb_resort.h"
47 
48 #include <errno.h>
49 #include <inttypes.h>
50 #include <poll.h>
51 #include <signal.h>
52 #include <stdlib.h>
53 #include <string.h>
54 #include <linux/err.h>
55 #include <linux/filter.h>
56 #include <linux/kernel.h>
57 #include <linux/random.h>
58 #include <linux/stringify.h>
59 #include <linux/time64.h>
60 #include <fcntl.h>
61 
62 #include "sane_ctype.h"
63 
64 #ifndef O_CLOEXEC
65 # define O_CLOEXEC		02000000
66 #endif
67 
68 #ifndef F_LINUX_SPECIFIC_BASE
69 # define F_LINUX_SPECIFIC_BASE	1024
70 #endif
71 
/*
 * Global state for one 'perf trace' session: the perf tool callbacks, the
 * per-arch syscall table, the evlist being monitored, and all the
 * output/filtering knobs parsed from the command line.
 */
struct trace {
	struct perf_tool	tool;
	struct syscalltbl	*sctbl;		/* arch syscall number <-> name table */
	struct {
		int		max;		/* highest syscall nr seen in 'table' */
		struct syscall  *table;		/* indexed by syscall number */
		struct {
			/* raw_syscalls:sys_{enter,exit} (or older syscalls: variants) */
			struct perf_evsel *sys_enter,
					  *sys_exit;
		}		events;
	} syscalls;
	struct record_opts	opts;
	struct perf_evlist	*evlist;
	struct machine		*host;
	struct thread		*current;	/* thread whose syscall is in flight */
	struct cgroup		*cgroup;
	u64			base_time;	/* first sample time, for relative timestamps */
	FILE			*output;
	unsigned long		nr_events;
	struct strlist		*ev_qualifier;	/* -e syscall name list */
	struct {
		size_t		nr;
		int		*entries;	/* ev_qualifier resolved to syscall ids */
	}			ev_qualifier_ids;
	struct {
		size_t		nr;
		pid_t		*entries;	/* PIDs whose events are filtered out */
	}			filter_pids;
	double			duration_filter;	/* only show syscalls longer than this (ms) */
	double			runtime_ms;
	struct {
		/* tool self-stats: how often each filename source was used */
		u64		vfs_getname,
				proc_getname;
	} stats;
	unsigned int		max_stack;
	unsigned int		min_stack;
	bool			not_ev_qualifier;	/* -e was negated: trace all BUT these */
	bool			live;			/* live mode (vs perf.data replay) */
	bool			full_time;
	bool			sched;
	bool			multiple_threads;
	bool			summary;
	bool			summary_only;
	bool			failure_only;
	bool			show_comm;
	bool			print_sample;
	bool			show_tool_stats;
	bool			trace_syscalls;
	bool			kernel_syscallchains;
	bool			force;
	bool			vfs_getname;		/* vfs_getname probe is available */
	int			trace_pgfaults;		/* TRACE_PFMAJ/TRACE_PFMIN bits */
	int			open_id;		/* NOTE(review): appears to cache a syscall id; set outside this chunk — confirm */
};
126 
/*
 * Accessor for one tracepoint payload field: an offset into
 * sample->raw_data plus a reader returning the field either as an
 * integer or as a pointer into the raw payload.
 */
struct tp_field {
	int offset;
	union {
		u64 (*integer)(struct tp_field *field, struct perf_sample *sample);
		void *(*pointer)(struct tp_field *field, struct perf_sample *sample);
	};
};
134 
/*
 * Generate tp_field__u{8,16,32,64}() readers: memcpy is used instead of a
 * direct cast so unaligned fields in raw_data are read safely.
 */
#define TP_UINT_FIELD(bits) \
static u64 tp_field__u##bits(struct tp_field *field, struct perf_sample *sample) \
{ \
	u##bits value; \
	memcpy(&value, sample->raw_data + field->offset, sizeof(value)); \
	return value;  \
}

TP_UINT_FIELD(8);
TP_UINT_FIELD(16);
TP_UINT_FIELD(32);
TP_UINT_FIELD(64);
147 
/*
 * Byte-swapping variants of the above, for perf.data files recorded on a
 * host with the opposite endianness (no 8-bit variant: a byte needs no swap).
 */
#define TP_UINT_FIELD__SWAPPED(bits) \
static u64 tp_field__swapped_u##bits(struct tp_field *field, struct perf_sample *sample) \
{ \
	u##bits value; \
	memcpy(&value, sample->raw_data + field->offset, sizeof(value)); \
	return bswap_##bits(value);\
}

TP_UINT_FIELD__SWAPPED(16);
TP_UINT_FIELD__SWAPPED(32);
TP_UINT_FIELD__SWAPPED(64);
159 
160 static int tp_field__init_uint(struct tp_field *field,
161 			       struct format_field *format_field,
162 			       bool needs_swap)
163 {
164 	field->offset = format_field->offset;
165 
166 	switch (format_field->size) {
167 	case 1:
168 		field->integer = tp_field__u8;
169 		break;
170 	case 2:
171 		field->integer = needs_swap ? tp_field__swapped_u16 : tp_field__u16;
172 		break;
173 	case 4:
174 		field->integer = needs_swap ? tp_field__swapped_u32 : tp_field__u32;
175 		break;
176 	case 8:
177 		field->integer = needs_swap ? tp_field__swapped_u64 : tp_field__u64;
178 		break;
179 	default:
180 		return -1;
181 	}
182 
183 	return 0;
184 }
185 
186 static void *tp_field__ptr(struct tp_field *field, struct perf_sample *sample)
187 {
188 	return sample->raw_data + field->offset;
189 }
190 
191 static int tp_field__init_ptr(struct tp_field *field, struct format_field *format_field)
192 {
193 	field->offset = format_field->offset;
194 	field->pointer = tp_field__ptr;
195 	return 0;
196 }
197 
/*
 * Private area for the sys_enter/sys_exit evsels: the syscall 'id' field
 * plus, depending on direction, the entry 'args' or the exit 'ret' field
 * (hence the union — each evsel uses exactly one of the two).
 */
struct syscall_tp {
	struct tp_field id;
	union {
		struct tp_field args, ret;
	};
};
204 
205 static int perf_evsel__init_tp_uint_field(struct perf_evsel *evsel,
206 					  struct tp_field *field,
207 					  const char *name)
208 {
209 	struct format_field *format_field = perf_evsel__field(evsel, name);
210 
211 	if (format_field == NULL)
212 		return -1;
213 
214 	return tp_field__init_uint(field, format_field, evsel->needs_swap);
215 }
216 
/*
 * Initialize the syscall_tp member called 'name' (in the evsel's private
 * area) from the tracepoint field of the same name — #name stringifies
 * the member identifier to do the lookup.
 */
#define perf_evsel__init_sc_tp_uint_field(evsel, name) \
	({ struct syscall_tp *sc = evsel->priv;\
	   perf_evsel__init_tp_uint_field(evsel, &sc->name, #name); })
220 
221 static int perf_evsel__init_tp_ptr_field(struct perf_evsel *evsel,
222 					 struct tp_field *field,
223 					 const char *name)
224 {
225 	struct format_field *format_field = perf_evsel__field(evsel, name);
226 
227 	if (format_field == NULL)
228 		return -1;
229 
230 	return tp_field__init_ptr(field, format_field);
231 }
232 
/* Pointer-field counterpart of perf_evsel__init_sc_tp_uint_field(). */
#define perf_evsel__init_sc_tp_ptr_field(evsel, name) \
	({ struct syscall_tp *sc = evsel->priv;\
	   perf_evsel__init_tp_ptr_field(evsel, &sc->name, #name); })
236 
/* Free the evsel's private syscall_tp area, then the evsel itself. */
static void perf_evsel__delete_priv(struct perf_evsel *evsel)
{
	zfree(&evsel->priv);
	perf_evsel__delete(evsel);
}
242 
243 static int perf_evsel__init_syscall_tp(struct perf_evsel *evsel, void *handler)
244 {
245 	evsel->priv = malloc(sizeof(struct syscall_tp));
246 	if (evsel->priv != NULL) {
247 		if (perf_evsel__init_sc_tp_uint_field(evsel, id))
248 			goto out_delete;
249 
250 		evsel->handler = handler;
251 		return 0;
252 	}
253 
254 	return -ENOMEM;
255 
256 out_delete:
257 	zfree(&evsel->priv);
258 	return -ENOENT;
259 }
260 
261 static struct perf_evsel *perf_evsel__syscall_newtp(const char *direction, void *handler)
262 {
263 	struct perf_evsel *evsel = perf_evsel__newtp("raw_syscalls", direction);
264 
265 	/* older kernel (e.g., RHEL6) use syscalls:{enter,exit} */
266 	if (IS_ERR(evsel))
267 		evsel = perf_evsel__newtp("syscalls", direction);
268 
269 	if (IS_ERR(evsel))
270 		return NULL;
271 
272 	if (perf_evsel__init_syscall_tp(evsel, handler))
273 		goto out_delete;
274 
275 	return evsel;
276 
277 out_delete:
278 	perf_evsel__delete_priv(evsel);
279 	return NULL;
280 }
281 
/* Read syscall_tp member 'name' from a sample as an integer. */
#define perf_evsel__sc_tp_uint(evsel, name, sample) \
	({ struct syscall_tp *fields = evsel->priv; \
	   fields->name.integer(&fields->name, sample); })

/* Read syscall_tp member 'name' as a pointer into the sample's raw_data. */
#define perf_evsel__sc_tp_ptr(evsel, name, sample) \
	({ struct syscall_tp *fields = evsel->priv; \
	   fields->name.pointer(&fields->name, sample); })
289 
290 size_t strarray__scnprintf(struct strarray *sa, char *bf, size_t size, const char *intfmt, int val)
291 {
292 	int idx = val - sa->offset;
293 
294 	if (idx < 0 || idx >= sa->nr_entries || sa->entries[idx] == NULL)
295 		return scnprintf(bf, size, intfmt, val);
296 
297 	return scnprintf(bf, size, "%s", sa->entries[idx]);
298 }
299 
/* Format arg->val via the strarray stashed in arg->parm. */
static size_t __syscall_arg__scnprintf_strarray(char *bf, size_t size,
						const char *intfmt,
					        struct syscall_arg *arg)
{
	return strarray__scnprintf(arg->parm, bf, size, intfmt, arg->val);
}
306 
/* Strarray formatter with a plain decimal fallback for unknown values. */
static size_t syscall_arg__scnprintf_strarray(char *bf, size_t size,
					      struct syscall_arg *arg)
{
	return __syscall_arg__scnprintf_strarray(bf, size, "%d", arg);
}

#define SCA_STRARRAY syscall_arg__scnprintf_strarray
314 
/*
 * A set of strarrays tried in order, for value spaces split across ranges
 * (e.g. plain fcntl commands plus the F_LINUX_SPECIFIC_BASE ones).
 */
struct strarrays {
	int		nr_entries;
	struct strarray **entries;
};

/* Define 'strarrays__<array>' wrapping an array of strarray pointers. */
#define DEFINE_STRARRAYS(array) struct strarrays strarrays__##array = { \
	.nr_entries = ARRAY_SIZE(array), \
	.entries = array, \
}
324 
325 size_t syscall_arg__scnprintf_strarrays(char *bf, size_t size,
326 					struct syscall_arg *arg)
327 {
328 	struct strarrays *sas = arg->parm;
329 	int i;
330 
331 	for (i = 0; i < sas->nr_entries; ++i) {
332 		struct strarray *sa = sas->entries[i];
333 		int idx = arg->val - sa->offset;
334 
335 		if (idx >= 0 && idx < sa->nr_entries) {
336 			if (sa->entries[idx] == NULL)
337 				break;
338 			return scnprintf(bf, size, "%s", sa->entries[idx]);
339 		}
340 	}
341 
342 	return scnprintf(bf, size, "%d", arg->val);
343 }
344 
345 #ifndef AT_FDCWD
346 #define AT_FDCWD	-100
347 #endif
348 
349 static size_t syscall_arg__scnprintf_fd_at(char *bf, size_t size,
350 					   struct syscall_arg *arg)
351 {
352 	int fd = arg->val;
353 
354 	if (fd == AT_FDCWD)
355 		return scnprintf(bf, size, "CWD");
356 
357 	return syscall_arg__scnprintf_fd(bf, size, arg);
358 }
359 
360 #define SCA_FDAT syscall_arg__scnprintf_fd_at
361 
362 static size_t syscall_arg__scnprintf_close_fd(char *bf, size_t size,
363 					      struct syscall_arg *arg);
364 
365 #define SCA_CLOSE_FD syscall_arg__scnprintf_close_fd
366 
/* Format an argument as hex ("%#lx") — used for addresses and opaque values. */
size_t syscall_arg__scnprintf_hex(char *bf, size_t size, struct syscall_arg *arg)
{
	return scnprintf(bf, size, "%#lx", arg->val);
}

/* Format an argument as a signed decimal int. */
size_t syscall_arg__scnprintf_int(char *bf, size_t size, struct syscall_arg *arg)
{
	return scnprintf(bf, size, "%d", arg->val);
}

/* Format an argument as a signed decimal long. */
size_t syscall_arg__scnprintf_long(char *bf, size_t size, struct syscall_arg *arg)
{
	return scnprintf(bf, size, "%ld", arg->val);
}
381 
/* bpf() 'cmd' argument, in BPF_* command order. */
static const char *bpf_cmd[] = {
	"MAP_CREATE", "MAP_LOOKUP_ELEM", "MAP_UPDATE_ELEM", "MAP_DELETE_ELEM",
	"MAP_GET_NEXT_KEY", "PROG_LOAD",
};
static DEFINE_STRARRAY(bpf_cmd);

/* epoll_ctl() 'op': EPOLL_CTL_ADD starts at 1, hence the offset. */
static const char *epoll_ctl_ops[] = { "ADD", "DEL", "MOD", };
static DEFINE_STRARRAY_OFFSET(epoll_ctl_ops, 1);

/* {get,set}itimer() 'which': ITIMER_*. */
static const char *itimers[] = { "REAL", "VIRTUAL", "PROF", };
static DEFINE_STRARRAY(itimers);

/* keyctl() 'option': KEYCTL_*. */
static const char *keyctl_options[] = {
	"GET_KEYRING_ID", "JOIN_SESSION_KEYRING", "UPDATE", "REVOKE", "CHOWN",
	"SETPERM", "DESCRIBE", "CLEAR", "LINK", "UNLINK", "SEARCH", "READ",
	"INSTANTIATE", "NEGATE", "SET_REQKEY_KEYRING", "SET_TIMEOUT",
	"ASSUME_AUTHORITY", "GET_SECURITY", "SESSION_TO_PARENT", "REJECT",
	"INSTANTIATE_IOV", "INVALIDATE", "GET_PERSISTENT",
};
static DEFINE_STRARRAY(keyctl_options);

/* lseek() 'whence': SEEK_* (DATA/HOLE only where the libc defines them). */
static const char *whences[] = { "SET", "CUR", "END",
#ifdef SEEK_DATA
"DATA",
#endif
#ifdef SEEK_HOLE
"HOLE",
#endif
};
static DEFINE_STRARRAY(whences);

/* fcntl() 'cmd': the classic F_* range starting at 0. */
static const char *fcntl_cmds[] = {
	"DUPFD", "GETFD", "SETFD", "GETFL", "SETFL", "GETLK", "SETLK",
	"SETLKW", "SETOWN", "GETOWN", "SETSIG", "GETSIG", "GETLK64",
	"SETLK64", "SETLKW64", "SETOWN_EX", "GETOWN_EX",
	"GETOWNER_UIDS",
};
static DEFINE_STRARRAY(fcntl_cmds);

/* fcntl() 'cmd': Linux-specific range at F_LINUX_SPECIFIC_BASE ([5] skips a hole). */
static const char *fcntl_linux_specific_cmds[] = {
	"SETLEASE", "GETLEASE", "NOTIFY", [5] =	"CANCELLK", "DUPFD_CLOEXEC",
	"SETPIPE_SZ", "GETPIPE_SZ", "ADD_SEALS", "GET_SEALS",
	"GET_RW_HINT", "SET_RW_HINT", "GET_FILE_RW_HINT", "SET_FILE_RW_HINT",
};

static DEFINE_STRARRAY_OFFSET(fcntl_linux_specific_cmds, F_LINUX_SPECIFIC_BASE);

/* Both fcntl ranges, tried in order by SCA_FCNTL_CMD. */
static struct strarray *fcntl_cmds_arrays[] = {
	&strarray__fcntl_cmds,
	&strarray__fcntl_linux_specific_cmds,
};

static DEFINE_STRARRAYS(fcntl_cmds_arrays);

/* {get,set}rlimit()/prlimit64() 'resource': RLIMIT_*. */
static const char *rlimit_resources[] = {
	"CPU", "FSIZE", "DATA", "STACK", "CORE", "RSS", "NPROC", "NOFILE",
	"MEMLOCK", "AS", "LOCKS", "SIGPENDING", "MSGQUEUE", "NICE", "RTPRIO",
	"RTTIME",
};
static DEFINE_STRARRAY(rlimit_resources);

/* rt_sigprocmask() 'how': SIG_*. */
static const char *sighow[] = { "BLOCK", "UNBLOCK", "SETMASK", };
static DEFINE_STRARRAY(sighow);

/* clock_gettime() 'clk_id': CLOCK_*. */
static const char *clockid[] = {
	"REALTIME", "MONOTONIC", "PROCESS_CPUTIME_ID", "THREAD_CPUTIME_ID",
	"MONOTONIC_RAW", "REALTIME_COARSE", "MONOTONIC_COARSE", "BOOTTIME",
	"REALTIME_ALARM", "BOOTTIME_ALARM", "SGI_CYCLE", "TAI"
};
static DEFINE_STRARRAY(clockid);

/* socket()/socketpair() 'family': AF_*/PF_*. */
static const char *socket_families[] = {
	"UNSPEC", "LOCAL", "INET", "AX25", "IPX", "APPLETALK", "NETROM",
	"BRIDGE", "ATMPVC", "X25", "INET6", "ROSE", "DECnet", "NETBEUI",
	"SECURITY", "KEY", "NETLINK", "PACKET", "ASH", "ECONET", "ATMSVC",
	"RDS", "SNA", "IRDA", "PPPOX", "WANPIPE", "LLC", "IB", "CAN", "TIPC",
	"BLUETOOTH", "IUCV", "RXRPC", "ISDN", "PHONET", "IEEE802154", "CAIF",
	"ALG", "NFC", "VSOCK",
};
static DEFINE_STRARRAY(socket_families);
462 
463 static size_t syscall_arg__scnprintf_access_mode(char *bf, size_t size,
464 						 struct syscall_arg *arg)
465 {
466 	size_t printed = 0;
467 	int mode = arg->val;
468 
469 	if (mode == F_OK) /* 0 */
470 		return scnprintf(bf, size, "F");
471 #define	P_MODE(n) \
472 	if (mode & n##_OK) { \
473 		printed += scnprintf(bf + printed, size - printed, "%s", #n); \
474 		mode &= ~n##_OK; \
475 	}
476 
477 	P_MODE(R);
478 	P_MODE(W);
479 	P_MODE(X);
480 #undef P_MODE
481 
482 	if (mode)
483 		printed += scnprintf(bf + printed, size - printed, "|%#x", mode);
484 
485 	return printed;
486 }
487 
488 #define SCA_ACCMODE syscall_arg__scnprintf_access_mode
489 
490 static size_t syscall_arg__scnprintf_filename(char *bf, size_t size,
491 					      struct syscall_arg *arg);
492 
493 #define SCA_FILENAME syscall_arg__scnprintf_filename
494 
495 static size_t syscall_arg__scnprintf_pipe_flags(char *bf, size_t size,
496 						struct syscall_arg *arg)
497 {
498 	int printed = 0, flags = arg->val;
499 
500 #define	P_FLAG(n) \
501 	if (flags & O_##n) { \
502 		printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
503 		flags &= ~O_##n; \
504 	}
505 
506 	P_FLAG(CLOEXEC);
507 	P_FLAG(NONBLOCK);
508 #undef P_FLAG
509 
510 	if (flags)
511 		printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);
512 
513 	return printed;
514 }
515 
516 #define SCA_PIPE_FLAGS syscall_arg__scnprintf_pipe_flags
517 
518 #ifndef GRND_NONBLOCK
519 #define GRND_NONBLOCK	0x0001
520 #endif
521 #ifndef GRND_RANDOM
522 #define GRND_RANDOM	0x0002
523 #endif
524 
525 static size_t syscall_arg__scnprintf_getrandom_flags(char *bf, size_t size,
526 						   struct syscall_arg *arg)
527 {
528 	int printed = 0, flags = arg->val;
529 
530 #define	P_FLAG(n) \
531 	if (flags & GRND_##n) { \
532 		printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
533 		flags &= ~GRND_##n; \
534 	}
535 
536 	P_FLAG(RANDOM);
537 	P_FLAG(NONBLOCK);
538 #undef P_FLAG
539 
540 	if (flags)
541 		printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);
542 
543 	return printed;
544 }
545 
546 #define SCA_GETRANDOM_FLAGS syscall_arg__scnprintf_getrandom_flags
547 
/* syscall_arg_fmt initializer: print the arg via strarray__<array>. */
#define STRARRAY(name, array) \
	  { .scnprintf	= SCA_STRARRAY, \
	    .parm	= &strarray__##array, }
551 
552 #include "trace/beauty/arch_errno_names.c"
553 #include "trace/beauty/eventfd.c"
554 #include "trace/beauty/futex_op.c"
555 #include "trace/beauty/futex_val3.c"
556 #include "trace/beauty/mmap.c"
557 #include "trace/beauty/mode_t.c"
558 #include "trace/beauty/msg_flags.c"
559 #include "trace/beauty/open_flags.c"
560 #include "trace/beauty/perf_event_open.c"
561 #include "trace/beauty/pid.c"
562 #include "trace/beauty/sched_policy.c"
563 #include "trace/beauty/seccomp.c"
564 #include "trace/beauty/signum.c"
565 #include "trace/beauty/socket_type.c"
566 #include "trace/beauty/waitid_options.c"
567 
/*
 * How to pretty-print one syscall argument: an scnprintf-style formatter,
 * an opaque parameter for it (e.g. a strarray), an optional name override
 * and whether a zero value should still be printed.
 */
struct syscall_arg_fmt {
	size_t	   (*scnprintf)(char *bf, size_t size, struct syscall_arg *arg);
	void	   *parm;
	const char *name;
	bool	   show_zero;
};
574 
575 static struct syscall_fmt {
576 	const char *name;
577 	const char *alias;
578 	struct syscall_arg_fmt arg[6];
579 	u8	   nr_args;
580 	bool	   errpid;
581 	bool	   timeout;
582 	bool	   hexret;
583 } syscall_fmts[] = {
584 	{ .name	    = "access",
585 	  .arg = { [1] = { .scnprintf = SCA_ACCMODE,  /* mode */ }, }, },
586 	{ .name	    = "bpf",
587 	  .arg = { [0] = STRARRAY(cmd, bpf_cmd), }, },
588 	{ .name	    = "brk",	    .hexret = true,
589 	  .arg = { [0] = { .scnprintf = SCA_HEX, /* brk */ }, }, },
590 	{ .name     = "clock_gettime",
591 	  .arg = { [0] = STRARRAY(clk_id, clockid), }, },
592 	{ .name	    = "clone",	    .errpid = true, .nr_args = 5,
593 	  .arg = { [0] = { .name = "flags",	    .scnprintf = SCA_CLONE_FLAGS, },
594 		   [1] = { .name = "child_stack",   .scnprintf = SCA_HEX, },
595 		   [2] = { .name = "parent_tidptr", .scnprintf = SCA_HEX, },
596 		   [3] = { .name = "child_tidptr",  .scnprintf = SCA_HEX, },
597 		   [4] = { .name = "tls",	    .scnprintf = SCA_HEX, }, }, },
598 	{ .name	    = "close",
599 	  .arg = { [0] = { .scnprintf = SCA_CLOSE_FD, /* fd */ }, }, },
600 	{ .name	    = "epoll_ctl",
601 	  .arg = { [1] = STRARRAY(op, epoll_ctl_ops), }, },
602 	{ .name	    = "eventfd2",
603 	  .arg = { [1] = { .scnprintf = SCA_EFD_FLAGS, /* flags */ }, }, },
604 	{ .name	    = "fchmodat",
605 	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, },
606 	{ .name	    = "fchownat",
607 	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, },
608 	{ .name	    = "fcntl",
609 	  .arg = { [1] = { .scnprintf = SCA_FCNTL_CMD, /* cmd */
610 			   .parm      = &strarrays__fcntl_cmds_arrays,
611 			   .show_zero = true, },
612 		   [2] = { .scnprintf =  SCA_FCNTL_ARG, /* arg */ }, }, },
613 	{ .name	    = "flock",
614 	  .arg = { [1] = { .scnprintf = SCA_FLOCK, /* cmd */ }, }, },
615 	{ .name	    = "fstat", .alias = "newfstat", },
616 	{ .name	    = "fstatat", .alias = "newfstatat", },
617 	{ .name	    = "futex",
618 	  .arg = { [1] = { .scnprintf = SCA_FUTEX_OP, /* op */ },
619 		   [5] = { .scnprintf = SCA_FUTEX_VAL3, /* val3 */ }, }, },
620 	{ .name	    = "futimesat",
621 	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, },
622 	{ .name	    = "getitimer",
623 	  .arg = { [0] = STRARRAY(which, itimers), }, },
624 	{ .name	    = "getpid",	    .errpid = true, },
625 	{ .name	    = "getpgid",    .errpid = true, },
626 	{ .name	    = "getppid",    .errpid = true, },
627 	{ .name	    = "getrandom",
628 	  .arg = { [2] = { .scnprintf = SCA_GETRANDOM_FLAGS, /* flags */ }, }, },
629 	{ .name	    = "getrlimit",
630 	  .arg = { [0] = STRARRAY(resource, rlimit_resources), }, },
631 	{ .name	    = "gettid",	    .errpid = true, },
632 	{ .name	    = "ioctl",
633 	  .arg = {
634 #if defined(__i386__) || defined(__x86_64__)
635 /*
636  * FIXME: Make this available to all arches.
637  */
638 		   [1] = { .scnprintf = SCA_IOCTL_CMD, /* cmd */ },
639 		   [2] = { .scnprintf = SCA_HEX, /* arg */ }, }, },
640 #else
641 		   [2] = { .scnprintf = SCA_HEX, /* arg */ }, }, },
642 #endif
643 	{ .name	    = "kcmp",	    .nr_args = 5,
644 	  .arg = { [0] = { .name = "pid1",	.scnprintf = SCA_PID, },
645 		   [1] = { .name = "pid2",	.scnprintf = SCA_PID, },
646 		   [2] = { .name = "type",	.scnprintf = SCA_KCMP_TYPE, },
647 		   [3] = { .name = "idx1",	.scnprintf = SCA_KCMP_IDX, },
648 		   [4] = { .name = "idx2",	.scnprintf = SCA_KCMP_IDX, }, }, },
649 	{ .name	    = "keyctl",
650 	  .arg = { [0] = STRARRAY(option, keyctl_options), }, },
651 	{ .name	    = "kill",
652 	  .arg = { [1] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
653 	{ .name	    = "linkat",
654 	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, },
655 	{ .name	    = "lseek",
656 	  .arg = { [2] = STRARRAY(whence, whences), }, },
657 	{ .name	    = "lstat", .alias = "newlstat", },
658 	{ .name     = "madvise",
659 	  .arg = { [0] = { .scnprintf = SCA_HEX,      /* start */ },
660 		   [2] = { .scnprintf = SCA_MADV_BHV, /* behavior */ }, }, },
661 	{ .name	    = "mkdirat",
662 	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, },
663 	{ .name	    = "mknodat",
664 	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, },
665 	{ .name	    = "mlock",
666 	  .arg = { [0] = { .scnprintf = SCA_HEX, /* addr */ }, }, },
667 	{ .name	    = "mlockall",
668 	  .arg = { [0] = { .scnprintf = SCA_HEX, /* addr */ }, }, },
669 	{ .name	    = "mmap",	    .hexret = true,
670 /* The standard mmap maps to old_mmap on s390x */
671 #if defined(__s390x__)
672 	.alias = "old_mmap",
673 #endif
674 	  .arg = { [0] = { .scnprintf = SCA_HEX,	/* addr */ },
675 		   [2] = { .scnprintf = SCA_MMAP_PROT,	/* prot */ },
676 		   [3] = { .scnprintf = SCA_MMAP_FLAGS,	/* flags */ }, }, },
677 	{ .name	    = "mprotect",
678 	  .arg = { [0] = { .scnprintf = SCA_HEX,	/* start */ },
679 		   [2] = { .scnprintf = SCA_MMAP_PROT,	/* prot */ }, }, },
680 	{ .name	    = "mq_unlink",
681 	  .arg = { [0] = { .scnprintf = SCA_FILENAME, /* u_name */ }, }, },
682 	{ .name	    = "mremap",	    .hexret = true,
683 	  .arg = { [0] = { .scnprintf = SCA_HEX,	  /* addr */ },
684 		   [3] = { .scnprintf = SCA_MREMAP_FLAGS, /* flags */ },
685 		   [4] = { .scnprintf = SCA_HEX,	  /* new_addr */ }, }, },
686 	{ .name	    = "munlock",
687 	  .arg = { [0] = { .scnprintf = SCA_HEX, /* addr */ }, }, },
688 	{ .name	    = "munmap",
689 	  .arg = { [0] = { .scnprintf = SCA_HEX, /* addr */ }, }, },
690 	{ .name	    = "name_to_handle_at",
691 	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, },
692 	{ .name	    = "newfstatat",
693 	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, },
694 	{ .name	    = "open",
695 	  .arg = { [1] = { .scnprintf = SCA_OPEN_FLAGS, /* flags */ }, }, },
696 	{ .name	    = "open_by_handle_at",
697 	  .arg = { [0] = { .scnprintf = SCA_FDAT,	/* dfd */ },
698 		   [2] = { .scnprintf = SCA_OPEN_FLAGS, /* flags */ }, }, },
699 	{ .name	    = "openat",
700 	  .arg = { [0] = { .scnprintf = SCA_FDAT,	/* dfd */ },
701 		   [2] = { .scnprintf = SCA_OPEN_FLAGS, /* flags */ }, }, },
702 	{ .name	    = "perf_event_open",
703 	  .arg = { [2] = { .scnprintf = SCA_INT,	/* cpu */ },
704 		   [3] = { .scnprintf = SCA_FD,		/* group_fd */ },
705 		   [4] = { .scnprintf = SCA_PERF_FLAGS, /* flags */ }, }, },
706 	{ .name	    = "pipe2",
707 	  .arg = { [1] = { .scnprintf = SCA_PIPE_FLAGS, /* flags */ }, }, },
708 	{ .name	    = "pkey_alloc",
709 	  .arg = { [1] = { .scnprintf = SCA_PKEY_ALLOC_ACCESS_RIGHTS,	/* access_rights */ }, }, },
710 	{ .name	    = "pkey_free",
711 	  .arg = { [0] = { .scnprintf = SCA_INT,	/* key */ }, }, },
712 	{ .name	    = "pkey_mprotect",
713 	  .arg = { [0] = { .scnprintf = SCA_HEX,	/* start */ },
714 		   [2] = { .scnprintf = SCA_MMAP_PROT,	/* prot */ },
715 		   [3] = { .scnprintf = SCA_INT,	/* pkey */ }, }, },
716 	{ .name	    = "poll", .timeout = true, },
717 	{ .name	    = "ppoll", .timeout = true, },
718 	{ .name	    = "prctl", .alias = "arch_prctl",
719 	  .arg = { [0] = { .scnprintf = SCA_PRCTL_OPTION, /* option */ },
720 		   [1] = { .scnprintf = SCA_PRCTL_ARG2, /* arg2 */ },
721 		   [2] = { .scnprintf = SCA_PRCTL_ARG3, /* arg3 */ }, }, },
722 	{ .name	    = "pread", .alias = "pread64", },
723 	{ .name	    = "preadv", .alias = "pread", },
724 	{ .name	    = "prlimit64",
725 	  .arg = { [1] = STRARRAY(resource, rlimit_resources), }, },
726 	{ .name	    = "pwrite", .alias = "pwrite64", },
727 	{ .name	    = "readlinkat",
728 	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, },
729 	{ .name	    = "recvfrom",
730 	  .arg = { [3] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, },
731 	{ .name	    = "recvmmsg",
732 	  .arg = { [3] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, },
733 	{ .name	    = "recvmsg",
734 	  .arg = { [2] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, },
735 	{ .name	    = "renameat",
736 	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, },
737 	{ .name	    = "rt_sigaction",
738 	  .arg = { [0] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
739 	{ .name	    = "rt_sigprocmask",
740 	  .arg = { [0] = STRARRAY(how, sighow), }, },
741 	{ .name	    = "rt_sigqueueinfo",
742 	  .arg = { [1] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
743 	{ .name	    = "rt_tgsigqueueinfo",
744 	  .arg = { [2] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
745 	{ .name	    = "sched_setscheduler",
746 	  .arg = { [1] = { .scnprintf = SCA_SCHED_POLICY, /* policy */ }, }, },
747 	{ .name	    = "seccomp",
748 	  .arg = { [0] = { .scnprintf = SCA_SECCOMP_OP,	   /* op */ },
749 		   [1] = { .scnprintf = SCA_SECCOMP_FLAGS, /* flags */ }, }, },
750 	{ .name	    = "select", .timeout = true, },
751 	{ .name	    = "sendmmsg",
752 	  .arg = { [3] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, },
753 	{ .name	    = "sendmsg",
754 	  .arg = { [2] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, },
755 	{ .name	    = "sendto",
756 	  .arg = { [3] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, },
757 	{ .name	    = "set_tid_address", .errpid = true, },
758 	{ .name	    = "setitimer",
759 	  .arg = { [0] = STRARRAY(which, itimers), }, },
760 	{ .name	    = "setrlimit",
761 	  .arg = { [0] = STRARRAY(resource, rlimit_resources), }, },
762 	{ .name	    = "socket",
763 	  .arg = { [0] = STRARRAY(family, socket_families),
764 		   [1] = { .scnprintf = SCA_SK_TYPE, /* type */ },
765 		   [2] = { .scnprintf = SCA_SK_PROTO, /* protocol */ }, }, },
766 	{ .name	    = "socketpair",
767 	  .arg = { [0] = STRARRAY(family, socket_families),
768 		   [1] = { .scnprintf = SCA_SK_TYPE, /* type */ },
769 		   [2] = { .scnprintf = SCA_SK_PROTO, /* protocol */ }, }, },
770 	{ .name	    = "stat", .alias = "newstat", },
771 	{ .name	    = "statx",
772 	  .arg = { [0] = { .scnprintf = SCA_FDAT,	 /* fdat */ },
773 		   [2] = { .scnprintf = SCA_STATX_FLAGS, /* flags */ } ,
774 		   [3] = { .scnprintf = SCA_STATX_MASK,	 /* mask */ }, }, },
775 	{ .name	    = "swapoff",
776 	  .arg = { [0] = { .scnprintf = SCA_FILENAME, /* specialfile */ }, }, },
777 	{ .name	    = "swapon",
778 	  .arg = { [0] = { .scnprintf = SCA_FILENAME, /* specialfile */ }, }, },
779 	{ .name	    = "symlinkat",
780 	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, },
781 	{ .name	    = "tgkill",
782 	  .arg = { [2] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
783 	{ .name	    = "tkill",
784 	  .arg = { [1] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
785 	{ .name	    = "uname", .alias = "newuname", },
786 	{ .name	    = "unlinkat",
787 	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, },
788 	{ .name	    = "utimensat",
789 	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* dirfd */ }, }, },
790 	{ .name	    = "wait4",	    .errpid = true,
791 	  .arg = { [2] = { .scnprintf = SCA_WAITID_OPTIONS, /* options */ }, }, },
792 	{ .name	    = "waitid",	    .errpid = true,
793 	  .arg = { [3] = { .scnprintf = SCA_WAITID_OPTIONS, /* options */ }, }, },
794 };
795 
796 static int syscall_fmt__cmp(const void *name, const void *fmtp)
797 {
798 	const struct syscall_fmt *fmt = fmtp;
799 	return strcmp(name, fmt->name);
800 }
801 
802 static struct syscall_fmt *syscall_fmt__find(const char *name)
803 {
804 	const int nmemb = ARRAY_SIZE(syscall_fmts);
805 	return bsearch(name, syscall_fmts, nmemb, sizeof(struct syscall_fmt), syscall_fmt__cmp);
806 }
807 
/*
 * Everything needed to format one syscall: its libtraceevent format,
 * the argument field list and the optional syscall_fmt with per-argument
 * beautifiers.
 */
struct syscall {
	struct event_format *tp_format;
	int		    nr_args;
	struct format_field *args;	/* head of the tracepoint's argument fields */
	const char	    *name;
	bool		    is_exit;	/* NOTE(review): presumably set for never-returning syscalls (exit/exit_group) where the table is built — confirm */
	struct syscall_fmt  *fmt;	/* matching syscall_fmts[] entry, may be NULL */
	struct syscall_arg_fmt *arg_fmt;
};
817 
818 /*
819  * We need to have this 'calculated' boolean because in some cases we really
820  * don't know what is the duration of a syscall, for instance, when we start
821  * a session and some threads are waiting for a syscall to finish, say 'poll',
 * in which case all we can do is to print "( ? )" for the duration and the
 * start timestamp.
824  */
825 static size_t fprintf_duration(unsigned long t, bool calculated, FILE *fp)
826 {
827 	double duration = (double)t / NSEC_PER_MSEC;
828 	size_t printed = fprintf(fp, "(");
829 
830 	if (!calculated)
831 		printed += fprintf(fp, "         ");
832 	else if (duration >= 1.0)
833 		printed += color_fprintf(fp, PERF_COLOR_RED, "%6.3f ms", duration);
834 	else if (duration >= 0.01)
835 		printed += color_fprintf(fp, PERF_COLOR_YELLOW, "%6.3f ms", duration);
836 	else
837 		printed += color_fprintf(fp, PERF_COLOR_NORMAL, "%6.3f ms", duration);
838 	return printed + fprintf(fp, "): ");
839 }
840 
841 /**
842  * filename.ptr: The filename char pointer that will be vfs_getname'd
843  * filename.entry_str_pos: Where to insert the string translated from
844  *                         filename.ptr by the vfs_getname tracepoint/kprobe.
845  * ret_scnprintf: syscall args may set this to a different syscall return
846  *                formatter, for instance, fcntl may return fds, file flags, etc.
847  */
/* Per-thread state, hung off struct thread's priv pointer. */
struct thread_trace {
	u64		  entry_time;		/* timestamp of the current sys_enter */
	bool		  entry_pending;	/* entry printed but exit not yet seen */
	unsigned long	  nr_events;
	unsigned long	  pfmaj, pfmin;		/* major/minor page fault counts */
	char		  *entry_str;		/* formatted sys_enter line, completed at exit */
	double		  runtime_ms;
	size_t		  (*ret_scnprintf)(char *bf, size_t size, struct syscall_arg *arg);
        struct {
		unsigned long ptr;		/* see the filename.* docblock above the struct */
		short int     entry_str_pos;
		bool	      pending_open;
		unsigned int  namelen;
		char	      *name;
	} filename;
	struct {
		int	  max;			/* highest fd cached, -1 when table unallocated */
		char	  **table;		/* fd -> pathname cache */
	} paths;

	struct intlist *syscall_stats;		/* per-syscall-id stats, keyed by syscall nr */
};
870 
871 static struct thread_trace *thread_trace__new(void)
872 {
873 	struct thread_trace *ttrace =  zalloc(sizeof(struct thread_trace));
874 
875 	if (ttrace)
876 		ttrace->paths.max = -1;
877 
878 	ttrace->syscall_stats = intlist__new(NULL);
879 
880 	return ttrace;
881 }
882 
883 static struct thread_trace *thread__trace(struct thread *thread, FILE *fp)
884 {
885 	struct thread_trace *ttrace;
886 
887 	if (thread == NULL)
888 		goto fail;
889 
890 	if (thread__priv(thread) == NULL)
891 		thread__set_priv(thread, thread_trace__new());
892 
893 	if (thread__priv(thread) == NULL)
894 		goto fail;
895 
896 	ttrace = thread__priv(thread);
897 	++ttrace->nr_events;
898 
899 	return ttrace;
900 fail:
901 	color_fprintf(fp, PERF_COLOR_RED,
902 		      "WARNING: not enough memory, dropping samples!\n");
903 	return NULL;
904 }
905 
906 
/*
 * Let an argument beautifier override how this syscall's return value is
 * formatted (e.g. fcntl may return an fd, flags, ...); stored per-thread
 * so it applies to the matching sys_exit.
 */
void syscall_arg__set_ret_scnprintf(struct syscall_arg *arg,
				    size_t (*ret_scnprintf)(char *bf, size_t size, struct syscall_arg *arg))
{
	struct thread_trace *ttrace = thread__priv(arg->thread);

	ttrace->ret_scnprintf = ret_scnprintf;
}
914 
915 #define TRACE_PFMAJ		(1 << 0)
916 #define TRACE_PFMIN		(1 << 1)
917 
918 static const size_t trace__entry_str_size = 2048;
919 
/*
 * Record 'pathname' for 'fd' in the thread's fd->path table, growing the
 * table as needed.  Returns 0 on success, -1 on allocation failure.
 */
static int trace__set_fd_pathname(struct thread *thread, int fd, const char *pathname)
{
	struct thread_trace *ttrace = thread__priv(thread);

	if (fd > ttrace->paths.max) {
		char **npath = realloc(ttrace->paths.table, (fd + 1) * sizeof(char *));

		if (npath == NULL)
			return -1;

		/* Zero only the slots added by this grow operation */
		if (ttrace->paths.max != -1) {
			memset(npath + ttrace->paths.max + 1, 0,
			       (fd - ttrace->paths.max) * sizeof(char *));
		} else {
			memset(npath, 0, (fd + 1) * sizeof(char *));
		}

		ttrace->paths.table = npath;
		ttrace->paths.max   = fd;
	}

	/*
	 * NOTE(review): an existing entry at 'fd' is overwritten without being
	 * freed — presumably cleared via the close() beautifier; verify.
	 */
	ttrace->paths.table[fd] = strdup(pathname);

	return ttrace->paths.table[fd] != NULL ? 0 : -1;
}
945 
/*
 * Resolve 'fd' by reading the /proc/<pid>(/task/<tid>)/fd/<fd> symlink and
 * cache the result in the thread's fd->path table.  Returns 0 on success.
 */
static int thread__read_fd_path(struct thread *thread, int fd)
{
	char linkname[PATH_MAX], pathname[PATH_MAX];
	struct stat st;
	int ret;

	if (thread->pid_ == thread->tid) {
		scnprintf(linkname, sizeof(linkname),
			  "/proc/%d/fd/%d", thread->pid_, fd);
	} else {
		scnprintf(linkname, sizeof(linkname),
			  "/proc/%d/task/%d/fd/%d", thread->pid_, thread->tid, fd);
	}

	/* lstat() first so we know the link target fits in pathname */
	if (lstat(linkname, &st) < 0 || st.st_size + 1 > (off_t)sizeof(pathname))
		return -1;

	ret = readlink(linkname, pathname, sizeof(pathname));

	if (ret < 0 || ret > st.st_size)
		return -1;

	/* readlink() does not NUL-terminate */
	pathname[ret] = '\0';
	return trace__set_fd_pathname(thread, fd, pathname);
}
971 
/*
 * Return the cached path for 'fd', reading it from /proc on demand when in
 * live mode.  Returns NULL when unknown or unresolvable.
 */
static const char *thread__fd_path(struct thread *thread, int fd,
				   struct trace *trace)
{
	struct thread_trace *ttrace = thread__priv(thread);

	if (ttrace == NULL)
		return NULL;

	if (fd < 0)
		return NULL;

	if ((fd > ttrace->paths.max || ttrace->paths.table[fd] == NULL)) {
		/* /proc is only meaningful while the workload is alive */
		if (!trace->live)
			return NULL;
		++trace->stats.proc_getname;
		if (thread__read_fd_path(thread, fd))
			return NULL;
	}

	return ttrace->paths.table[fd];
}
993 
/* Beautify an fd argument as "N<path>" when the path is known */
size_t syscall_arg__scnprintf_fd(char *bf, size_t size, struct syscall_arg *arg)
{
	int fd = arg->val;
	size_t printed = scnprintf(bf, size, "%d", fd);
	const char *path = thread__fd_path(arg->thread, fd, arg->trace);

	if (path)
		printed += scnprintf(bf + printed, size - printed, "<%s>", path);

	return printed;
}
1005 
1006 size_t pid__scnprintf_fd(struct trace *trace, pid_t pid, int fd, char *bf, size_t size)
1007 {
1008         size_t printed = scnprintf(bf, size, "%d", fd);
1009 	struct thread *thread = machine__find_thread(trace->host, pid, pid);
1010 
1011 	if (thread) {
1012 		const char *path = thread__fd_path(thread, fd, trace);
1013 
1014 		if (path)
1015 			printed += scnprintf(bf + printed, size - printed, "<%s>", path);
1016 
1017 		thread__put(thread);
1018 	}
1019 
1020         return printed;
1021 }
1022 
/*
 * close(fd) beautifier: print the fd like SCA_FD, then drop the cached path
 * since the fd number may be reused for a different file afterwards.
 */
static size_t syscall_arg__scnprintf_close_fd(char *bf, size_t size,
					      struct syscall_arg *arg)
{
	int fd = arg->val;
	size_t printed = syscall_arg__scnprintf_fd(bf, size, arg);
	struct thread_trace *ttrace = thread__priv(arg->thread);

	if (ttrace && fd >= 0 && fd <= ttrace->paths.max)
		zfree(&ttrace->paths.table[fd]);

	return printed;
}
1035 
/*
 * Remember the userspace pointer of a filename argument and where in the
 * staged entry string its resolved name must later be spliced in by
 * trace__vfs_getname().
 */
static void thread__set_filename_pos(struct thread *thread, const char *bf,
				     unsigned long ptr)
{
	struct thread_trace *ttrace = thread__priv(thread);

	ttrace->filename.ptr = ptr;
	ttrace->filename.entry_str_pos = bf - ttrace->entry_str;
}
1044 
1045 static size_t syscall_arg__scnprintf_filename(char *bf, size_t size,
1046 					      struct syscall_arg *arg)
1047 {
1048 	unsigned long ptr = arg->val;
1049 
1050 	if (!arg->trace->vfs_getname)
1051 		return scnprintf(bf, size, "%#x", ptr);
1052 
1053 	thread__set_filename_pos(arg->thread, bf, ptr);
1054 	return 0;
1055 }
1056 
1057 static bool trace__filter_duration(struct trace *trace, double t)
1058 {
1059 	return t < (trace->duration_filter * NSEC_PER_MSEC);
1060 }
1061 
/* Print the timestamp in milliseconds, relative to the first event seen */
static size_t __trace__fprintf_tstamp(struct trace *trace, u64 tstamp, FILE *fp)
{
	double ts = (double)(tstamp - trace->base_time) / NSEC_PER_MSEC;

	return fprintf(fp, "%10.3f ", ts);
}
1068 
1069 /*
1070  * We're handling tstamp=0 as an undefined tstamp, i.e. like when we are
1071  * using ttrace->entry_time for a thread that receives a sys_exit without
1072  * first having received a sys_enter ("poll" issued before tracing session
1073  * starts, lost sys_enter exit due to ring buffer overflow).
1074  */
1075 static size_t trace__fprintf_tstamp(struct trace *trace, u64 tstamp, FILE *fp)
1076 {
1077 	if (tstamp > 0)
1078 		return __trace__fprintf_tstamp(trace, tstamp, fp);
1079 
1080 	return fprintf(fp, "         ? ");
1081 }
1082 
/* Set from the signal handler to terminate the main event loop */
static bool done = false;
static bool interrupted = false;

static void sig_handler(int sig)
{
	done = true;
	/* Only SIGINT counts as a user interruption */
	interrupted = sig == SIGINT;
}
1091 
/*
 * Print the common line prefix: timestamp, duration and, when tracing
 * multiple threads, the comm/tid of the thread issuing the syscall.
 */
static size_t trace__fprintf_entry_head(struct trace *trace, struct thread *thread,
					u64 duration, bool duration_calculated, u64 tstamp, FILE *fp)
{
	size_t printed = trace__fprintf_tstamp(trace, tstamp, fp);
	printed += fprintf_duration(duration, duration_calculated, fp);

	if (trace->multiple_threads) {
		if (trace->show_comm)
			printed += fprintf(fp, "%.14s/", thread__comm_str(thread));
		printed += fprintf(fp, "%d ", thread->tid);
	}

	return printed;
}
1106 
1107 static int trace__process_event(struct trace *trace, struct machine *machine,
1108 				union perf_event *event, struct perf_sample *sample)
1109 {
1110 	int ret = 0;
1111 
1112 	switch (event->header.type) {
1113 	case PERF_RECORD_LOST:
1114 		color_fprintf(trace->output, PERF_COLOR_RED,
1115 			      "LOST %" PRIu64 " events!\n", event->lost.lost);
1116 		ret = machine__process_lost_event(machine, event, sample);
1117 		break;
1118 	default:
1119 		ret = machine__process_event(machine, event, sample);
1120 		break;
1121 	}
1122 
1123 	return ret;
1124 }
1125 
/* perf_tool callback: thin adapter from the embedded tool to struct trace */
static int trace__tool_process(struct perf_tool *tool,
			       union perf_event *event,
			       struct perf_sample *sample,
			       struct machine *machine)
{
	struct trace *trace = container_of(tool, struct trace, tool);
	return trace__process_event(trace, machine, event, sample);
}
1134 
/*
 * Kernel address resolver that warns once and bails out when kptr_restrict
 * prevents resolving kernel symbols.
 */
static char *trace__machine__resolve_kernel_addr(void *vmachine, unsigned long long *addrp, char **modp)
{
	struct machine *machine = vmachine;

	if (machine->kptr_restrict_warned)
		return NULL;

	if (symbol_conf.kptr_restrict) {
		pr_warning("Kernel address maps (/proc/{kallsyms,modules}) are restricted.\n\n"
			   "Check /proc/sys/kernel/kptr_restrict.\n\n"
			   "Kernel samples will not be resolved.\n");
		machine->kptr_restrict_warned = true;
		return NULL;
	}

	return machine__resolve_kernel_addr(vmachine, addrp, modp);
}
1152 
/*
 * Set up symbol resolution and the host machine representation, then
 * synthesize pre-existing threads so their samples can be resolved.
 * On any failure, symbol state is torn down again.
 */
static int trace__symbols_init(struct trace *trace, struct perf_evlist *evlist)
{
	int err = symbol__init(NULL);

	if (err)
		return err;

	trace->host = machine__new_host();
	if (trace->host == NULL)
		return -ENOMEM;

	err = trace_event__register_resolver(trace->host, trace__machine__resolve_kernel_addr);
	if (err < 0)
		goto out;

	err = __machine__synthesize_threads(trace->host, &trace->tool, &trace->opts.target,
					    evlist->threads, trace__tool_process, false,
					    trace->opts.proc_map_timeout, 1);
out:
	if (err)
		symbol__exit();

	return err;
}
1177 
/* Undo trace__symbols_init(): drop the host machine and symbol state */
static void trace__symbols__exit(struct trace *trace)
{
	machine__exit(trace->host);
	trace->host = NULL;

	symbol__exit();
}
1185 
1186 static int syscall__alloc_arg_fmts(struct syscall *sc, int nr_args)
1187 {
1188 	int idx;
1189 
1190 	if (nr_args == 6 && sc->fmt && sc->fmt->nr_args != 0)
1191 		nr_args = sc->fmt->nr_args;
1192 
1193 	sc->arg_fmt = calloc(nr_args, sizeof(*sc->arg_fmt));
1194 	if (sc->arg_fmt == NULL)
1195 		return -1;
1196 
1197 	for (idx = 0; idx < nr_args; ++idx) {
1198 		if (sc->fmt)
1199 			sc->arg_fmt[idx] = sc->fmt->arg[idx];
1200 	}
1201 
1202 	sc->nr_args = nr_args;
1203 	return 0;
1204 }
1205 
/*
 * Pick default beautifiers for each tracepoint argument based on its
 * type/name, unless the static syscall_fmt table already provides one.
 */
static int syscall__set_arg_fmts(struct syscall *sc)
{
	struct format_field *field;
	int idx = 0, len;

	for (field = sc->args; field; field = field->next, ++idx) {
		/* A hand-written formatter from the fmt table wins */
		if (sc->fmt && sc->fmt->arg[idx].scnprintf)
			continue;

		if (strcmp(field->type, "const char *") == 0 &&
			 (strcmp(field->name, "filename") == 0 ||
			  strcmp(field->name, "path") == 0 ||
			  strcmp(field->name, "pathname") == 0))
			sc->arg_fmt[idx].scnprintf = SCA_FILENAME;
		else if (field->flags & FIELD_IS_POINTER)
			sc->arg_fmt[idx].scnprintf = syscall_arg__scnprintf_hex;
		else if (strcmp(field->type, "pid_t") == 0)
			sc->arg_fmt[idx].scnprintf = SCA_PID;
		else if (strcmp(field->type, "umode_t") == 0)
			sc->arg_fmt[idx].scnprintf = SCA_MODE_T;
		else if ((strcmp(field->type, "int") == 0 ||
			  strcmp(field->type, "unsigned int") == 0 ||
			  strcmp(field->type, "long") == 0) &&
			 (len = strlen(field->name)) >= 2 &&
			 strcmp(field->name + len - 2, "fd") == 0) {
			/*
			 * /sys/kernel/tracing/events/syscalls/sys_enter*
			 * egrep 'field:.*fd;' .../format|sed -r 's/.*field:([a-z ]+) [a-z_]*fd.+/\1/g'|sort|uniq -c
			 * 65 int
			 * 23 unsigned int
			 * 7 unsigned long
			 */
			sc->arg_fmt[idx].scnprintf = SCA_FD;
		}
	}

	return 0;
}
1244 
/*
 * Lazily fill trace->syscalls.table[id]: name, fmt table entry, tracepoint
 * format and per-argument beautifiers.  Grows the table as needed.
 * Returns 0 on success, -1 on failure.
 */
static int trace__read_syscall_info(struct trace *trace, int id)
{
	char tp_name[128];
	struct syscall *sc;
	const char *name = syscalltbl__name(trace->sctbl, id);

	if (name == NULL)
		return -1;

	if (id > trace->syscalls.max) {
		struct syscall *nsyscalls = realloc(trace->syscalls.table, (id + 1) * sizeof(*sc));

		if (nsyscalls == NULL)
			return -1;

		/* Zero only the entries added by this grow operation */
		if (trace->syscalls.max != -1) {
			memset(nsyscalls + trace->syscalls.max + 1, 0,
			       (id - trace->syscalls.max) * sizeof(*sc));
		} else {
			memset(nsyscalls, 0, (id + 1) * sizeof(*sc));
		}

		trace->syscalls.table = nsyscalls;
		trace->syscalls.max   = id;
	}

	sc = trace->syscalls.table + id;
	sc->name = name;

	sc->fmt  = syscall_fmt__find(sc->name);

	snprintf(tp_name, sizeof(tp_name), "sys_enter_%s", sc->name);
	sc->tp_format = trace_event__tp_format("syscalls", tp_name);

	/* Some syscalls are only found under an alias tracepoint name */
	if (IS_ERR(sc->tp_format) && sc->fmt && sc->fmt->alias) {
		snprintf(tp_name, sizeof(tp_name), "sys_enter_%s", sc->fmt->alias);
		sc->tp_format = trace_event__tp_format("syscalls", tp_name);
	}

	/* Without a tp_format, fall back to the generic 6-arg layout */
	if (syscall__alloc_arg_fmts(sc, IS_ERR(sc->tp_format) ? 6 : sc->tp_format->format.nr_fields))
		return -1;

	if (IS_ERR(sc->tp_format))
		return -1;

	sc->args = sc->tp_format->format.fields;
	/*
	 * We need to check and discard the first variable '__syscall_nr'
	 * or 'nr' that mean the syscall number. It is needless here.
	 * So drop '__syscall_nr' or 'nr' field but does not exist on older kernels.
	 */
	if (sc->args && (!strcmp(sc->args->name, "__syscall_nr") || !strcmp(sc->args->name, "nr"))) {
		sc->args = sc->args->next;
		--sc->nr_args;
	}

	sc->is_exit = !strcmp(name, "exit_group") || !strcmp(name, "exit");

	return syscall__set_arg_fmts(sc);
}
1305 
/*
 * Turn the -e syscall name list (which may contain globs) into an array of
 * syscall ids in trace->ev_qualifier_ids, reporting every name that matches
 * nothing.  Returns 0 on success, -EINVAL/-ENOMEM on failure.
 */
static int trace__validate_ev_qualifier(struct trace *trace)
{
	int err = 0, i;
	size_t nr_allocated;
	struct str_node *pos;

	trace->ev_qualifier_ids.nr = strlist__nr_entries(trace->ev_qualifier);
	trace->ev_qualifier_ids.entries = malloc(trace->ev_qualifier_ids.nr *
						 sizeof(trace->ev_qualifier_ids.entries[0]));

	if (trace->ev_qualifier_ids.entries == NULL) {
		fputs("Error:\tNot enough memory for allocating events qualifier ids\n",
		       trace->output);
		err = -EINVAL;
		goto out;
	}

	nr_allocated = trace->ev_qualifier_ids.nr;
	i = 0;

	strlist__for_each_entry(pos, trace->ev_qualifier) {
		const char *sc = pos->s;
		int id = syscalltbl__id(trace->sctbl, sc), match_next = -1;

		if (id < 0) {
			/* Not an exact syscall name, maybe it is a glob */
			id = syscalltbl__strglobmatch_first(trace->sctbl, sc, &match_next);
			if (id >= 0)
				goto matches;

			/* First invalid name starts the error message */
			if (err == 0) {
				fputs("Error:\tInvalid syscall ", trace->output);
				err = -EINVAL;
			} else {
				fputs(", ", trace->output);
			}

			fputs(sc, trace->output);
		}
matches:
		trace->ev_qualifier_ids.entries[i++] = id;
		if (match_next == -1)
			continue;

		/* A glob may match several syscalls: collect them all */
		while (1) {
			id = syscalltbl__strglobmatch_next(trace->sctbl, sc, &match_next);
			if (id < 0)
				break;
			if (nr_allocated == trace->ev_qualifier_ids.nr) {
				void *entries;

				nr_allocated += 8;
				entries = realloc(trace->ev_qualifier_ids.entries,
						  nr_allocated * sizeof(trace->ev_qualifier_ids.entries[0]));
				if (entries == NULL) {
					err = -ENOMEM;
					fputs("\nError:\t Not enough memory for parsing\n", trace->output);
					goto out_free;
				}
				trace->ev_qualifier_ids.entries = entries;
			}
			trace->ev_qualifier_ids.nr++;
			trace->ev_qualifier_ids.entries[i++] = id;
		}
	}

	if (err < 0) {
		fputs("\nHint:\ttry 'perf list syscalls:sys_enter_*'"
		      "\nHint:\tand: 'man syscalls'\n", trace->output);
out_free:
		zfree(&trace->ev_qualifier_ids.entries);
		trace->ev_qualifier_ids.nr = 0;
	}
out:
	return err;
}
1381 
1382 /*
1383  * args is to be interpreted as a series of longs but we need to handle
1384  * 8-byte unaligned accesses. args points to raw_data within the event
1385  * and raw_data is guaranteed to be 8-byte unaligned because it is
1386  * preceded by raw_size which is a u32. So we need to copy args to a temp
1387  * variable to read it. Most notably this avoids extended load instructions
1388  * on unaligned addresses
1389  */
1390 unsigned long syscall_arg__val(struct syscall_arg *arg, u8 idx)
1391 {
1392 	unsigned long val;
1393 	unsigned char *p = arg->args + sizeof(unsigned long) * idx;
1394 
1395 	memcpy(&val, p, sizeof(val));
1396 	return val;
1397 }
1398 
/* Print "name: " for a raw argument, falling back to "argN: " when unnamed */
static size_t syscall__scnprintf_name(struct syscall *sc, char *bf, size_t size,
				      struct syscall_arg *arg)
{
	if (sc->arg_fmt && sc->arg_fmt[arg->idx].name)
		return scnprintf(bf, size, "%s: ", sc->arg_fmt[arg->idx].name);

	return scnprintf(bf, size, "arg%d: ", arg->idx);
}
1407 
/*
 * Print one argument value, using its registered beautifier when there is
 * one, otherwise as a plain signed long.
 */
static size_t syscall__scnprintf_val(struct syscall *sc, char *bf, size_t size,
				     struct syscall_arg *arg, unsigned long val)
{
	if (sc->arg_fmt && sc->arg_fmt[arg->idx].scnprintf) {
		arg->val = val;
		/* Extra beautifier parameter, e.g. a strarray table */
		if (sc->arg_fmt[arg->idx].parm)
			arg->parm = sc->arg_fmt[arg->idx].parm;
		return sc->arg_fmt[arg->idx].scnprintf(bf, size, arg);
	}
	return scnprintf(bf, size, "%ld", val);
}
1419 
/*
 * Format all syscall arguments into 'bf'.  Uses the tracepoint field list
 * when available, otherwise falls back to printing sc->nr_args raw values.
 */
static size_t syscall__scnprintf_args(struct syscall *sc, char *bf, size_t size,
				      unsigned char *args, struct trace *trace,
				      struct thread *thread)
{
	size_t printed = 0;
	unsigned long val;
	u8 bit = 1;
	struct syscall_arg arg = {
		.args	= args,
		.idx	= 0,
		.mask	= 0,
		.trace  = trace,
		.thread = thread,
	};
	struct thread_trace *ttrace = thread__priv(thread);

	/*
	 * Things like fcntl will set this in its 'cmd' formatter to pick the
	 * right formatter for the return value (an fd? file flags?), which is
	 * not needed for syscalls that always return a given type, say an fd.
	 */
	ttrace->ret_scnprintf = NULL;

	if (sc->args != NULL) {
		struct format_field *field;

		for (field = sc->args; field;
		     field = field->next, ++arg.idx, bit <<= 1) {
			/* Arguments consumed by a previous beautifier are skipped */
			if (arg.mask & bit)
				continue;

			val = syscall_arg__val(&arg, arg.idx);

			/*
			 * Suppress this argument if its value is zero and we
			 * don't have a string associated in an strarray for it.
			 */
			if (val == 0 &&
			    !(sc->arg_fmt &&
			      (sc->arg_fmt[arg.idx].show_zero ||
			       sc->arg_fmt[arg.idx].scnprintf == SCA_STRARRAY ||
			       sc->arg_fmt[arg.idx].scnprintf == SCA_STRARRAYS) &&
			      sc->arg_fmt[arg.idx].parm))
				continue;

			printed += scnprintf(bf + printed, size - printed,
					     "%s%s: ", printed ? ", " : "", field->name);
			printed += syscall__scnprintf_val(sc, bf + printed, size - printed, &arg, val);
		}
	} else if (IS_ERR(sc->tp_format)) {
		/*
		 * If we managed to read the tracepoint /format file, then we
		 * may end up not having any args, like with gettid(), so only
		 * print the raw args when we didn't manage to read it.
		 */
		while (arg.idx < sc->nr_args) {
			if (arg.mask & bit)
				goto next_arg;
			val = syscall_arg__val(&arg, arg.idx);
			if (printed)
				printed += scnprintf(bf + printed, size - printed, ", ");
			printed += syscall__scnprintf_name(sc, bf + printed, size - printed, &arg);
			printed += syscall__scnprintf_val(sc, bf + printed, size - printed, &arg, val);
next_arg:
			++arg.idx;
			bit <<= 1;
		}
	}

	return printed;
}
1492 
/* Signature of the per-tracepoint handlers installed on each evsel */
typedef int (*tracepoint_handler)(struct trace *trace, struct perf_evsel *evsel,
				  union perf_event *event,
				  struct perf_sample *sample);
1496 
/*
 * Map a syscall id to its struct syscall, reading its info on first use.
 * Returns NULL (with diagnostics at higher verbosity levels) for invalid
 * ids or when the syscall information cannot be read.
 */
static struct syscall *trace__syscall_info(struct trace *trace,
					   struct perf_evsel *evsel, int id)
{

	if (id < 0) {

		/*
		 * XXX: Noticed on x86_64, reproduced as far back as 3.0.36, haven't tried
		 * before that, leaving at a higher verbosity level till that is
		 * explained. Reproduced with plain ftrace with:
		 *
		 * echo 1 > /t/events/raw_syscalls/sys_exit/enable
		 * grep "NR -1 " /t/trace_pipe
		 *
		 * After generating some load on the machine.
		 */
		if (verbose > 1) {
			static u64 n;
			fprintf(trace->output, "Invalid syscall %d id, skipping (%s, %" PRIu64 ") ...\n",
				id, perf_evsel__name(evsel), ++n);
		}
		return NULL;
	}

	/* Read on demand; the name being set marks an initialized entry */
	if ((id > trace->syscalls.max || trace->syscalls.table[id].name == NULL) &&
	    trace__read_syscall_info(trace, id))
		goto out_cant_read;

	if ((id > trace->syscalls.max || trace->syscalls.table[id].name == NULL))
		goto out_cant_read;

	return &trace->syscalls.table[id];

out_cant_read:
	if (verbose > 0) {
		fprintf(trace->output, "Problems reading syscall %d", id);
		if (id <= trace->syscalls.max && trace->syscalls.table[id].name != NULL)
			fprintf(trace->output, "(%s)", trace->syscalls.table[id].name);
		fputs(" information\n", trace->output);
	}
	return NULL;
}
1539 
/*
 * Update the per-thread, per-syscall duration statistics used by --summary.
 * Allocation failures are silently ignored (the sample is just not counted).
 */
static void thread__update_stats(struct thread_trace *ttrace,
				 int id, struct perf_sample *sample)
{
	struct int_node *inode;
	struct stats *stats;
	u64 duration = 0;

	inode = intlist__findnew(ttrace->syscall_stats, id);
	if (inode == NULL)
		return;

	/* First exit seen for this syscall id: allocate its stats */
	stats = inode->priv;
	if (stats == NULL) {
		stats = malloc(sizeof(struct stats));
		if (stats == NULL)
			return;
		init_stats(stats);
		inode->priv = stats;
	}

	/* No (or out-of-order) sys_enter: account a zero duration */
	if (ttrace->entry_time && sample->time > ttrace->entry_time)
		duration = sample->time - ttrace->entry_time;

	update_stats(stats, duration);
}
1565 
/*
 * A new event arrived while the current thread's syscall entry line is
 * still pending (no matching sys_exit yet): flush it with a "...\n"
 * continuation marker.  Returns the number of characters printed.
 */
static int trace__printf_interrupted_entry(struct trace *trace)
{
	struct thread_trace *ttrace;
	size_t printed;

	if (trace->failure_only || trace->current == NULL)
		return 0;

	ttrace = thread__priv(trace->current);

	if (!ttrace->entry_pending)
		return 0;

	printed  = trace__fprintf_entry_head(trace, trace->current, 0, false, ttrace->entry_time, trace->output);
	printed += fprintf(trace->output, "%-70s) ...\n", ttrace->entry_str);
	ttrace->entry_pending = false;

	return printed;
}
1585 
/* With --print-sample: dump raw sample info (evsel, time, comm, pid/tid, cpu) */
static int trace__fprintf_sample(struct trace *trace, struct perf_evsel *evsel,
				 struct perf_sample *sample, struct thread *thread)
{
	int printed = 0;

	if (trace->print_sample) {
		double ts = (double)sample->time / NSEC_PER_MSEC;

		printed += fprintf(trace->output, "%22s %10.3f %s %d/%d [%d]\n",
				   perf_evsel__name(evsel), ts,
				   thread__comm_str(thread),
				   sample->pid, sample->tid, sample->cpu);
	}

	return printed;
}
1602 
/*
 * raw_syscalls:sys_enter handler: format the syscall name + arguments into
 * the thread's entry_str.  Exit-like syscalls are printed right away;
 * everything else is left pending until the matching sys_exit arrives.
 */
static int trace__sys_enter(struct trace *trace, struct perf_evsel *evsel,
			    union perf_event *event __maybe_unused,
			    struct perf_sample *sample)
{
	char *msg;
	void *args;
	size_t printed = 0;
	struct thread *thread;
	int id = perf_evsel__sc_tp_uint(evsel, id, sample), err = -1;
	struct syscall *sc = trace__syscall_info(trace, evsel, id);
	struct thread_trace *ttrace;

	if (sc == NULL)
		return -1;

	thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
	ttrace = thread__trace(thread, trace->output);
	if (ttrace == NULL)
		goto out_put;

	trace__fprintf_sample(trace, evsel, sample, thread);

	args = perf_evsel__sc_tp_ptr(evsel, args, sample);

	/* Lazily allocate the buffer where the entry line is staged */
	if (ttrace->entry_str == NULL) {
		ttrace->entry_str = malloc(trace__entry_str_size);
		if (!ttrace->entry_str)
			goto out_put;
	}

	if (!(trace->duration_filter || trace->summary_only || trace->min_stack))
		trace__printf_interrupted_entry(trace);

	ttrace->entry_time = sample->time;
	msg = ttrace->entry_str;
	printed += scnprintf(msg + printed, trace__entry_str_size - printed, "%s(", sc->name);

	printed += syscall__scnprintf_args(sc, msg + printed, trace__entry_str_size - printed,
					   args, trace, thread);

	if (sc->is_exit) {
		/* exit()/exit_group() never return, so print the entry now */
		if (!(trace->duration_filter || trace->summary_only || trace->failure_only || trace->min_stack)) {
			trace__fprintf_entry_head(trace, thread, 0, false, ttrace->entry_time, trace->output);
			fprintf(trace->output, "%-70s)\n", ttrace->entry_str);
		}
	} else {
		ttrace->entry_pending = true;
		/* See trace__vfs_getname & trace__sys_exit */
		ttrace->filename.pending_open = false;
	}

	/* Keep a reference on the most recently active thread */
	if (trace->current != thread) {
		thread__put(trace->current);
		trace->current = thread__get(thread);
	}
	err = 0;
out_put:
	thread__put(thread);
	return err;
}
1663 
/*
 * Resolve the sample's callchain into 'cursor', honouring a per-event
 * max stack depth when one was requested on this evsel.
 */
static int trace__resolve_callchain(struct trace *trace, struct perf_evsel *evsel,
				    struct perf_sample *sample,
				    struct callchain_cursor *cursor)
{
	struct addr_location al;
	int max_stack = evsel->attr.sample_max_stack ?
			evsel->attr.sample_max_stack :
			trace->max_stack;

	if (machine__resolve(trace->host, &al, sample) < 0 ||
	    thread__resolve_callchain(al.thread, cursor, evsel, sample, NULL, NULL, max_stack))
		return -1;

	return 0;
}
1679 
/* Print the previously resolved callchain_cursor to trace->output */
static int trace__fprintf_callchain(struct trace *trace, struct perf_sample *sample)
{
	/* TODO: user-configurable print_opts */
	const unsigned int print_opts = EVSEL__PRINT_SYM |
				        EVSEL__PRINT_DSO |
				        EVSEL__PRINT_UNKNOWN_AS_ADDR;

	return sample__fprintf_callchain(sample, 38, print_opts, &callchain_cursor, trace->output);
}
1689 
/*
 * Map an errno value to its name (e.g. "ENOENT") for the architecture the
 * perf data came from, which may differ from the host's.
 */
static const char *errno_to_name(struct perf_evsel *evsel, int err)
{
	struct perf_env *env = perf_evsel__env(evsel);
	const char *arch_name = perf_env__arch(env);

	return arch_syscalls__strerrno(arch_name, err);
}
1697 
/*
 * raw_syscalls:sys_exit handler: complete the pending entry line (or print
 * a "continued" marker), compute the duration and format the return value,
 * picking the right formatter (errno, hex, timeout, pid, custom) for it.
 */
static int trace__sys_exit(struct trace *trace, struct perf_evsel *evsel,
			   union perf_event *event __maybe_unused,
			   struct perf_sample *sample)
{
	long ret;
	u64 duration = 0;
	bool duration_calculated = false;
	struct thread *thread;
	int id = perf_evsel__sc_tp_uint(evsel, id, sample), err = -1, callchain_ret = 0;
	struct syscall *sc = trace__syscall_info(trace, evsel, id);
	struct thread_trace *ttrace;

	if (sc == NULL)
		return -1;

	thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
	ttrace = thread__trace(thread, trace->output);
	if (ttrace == NULL)
		goto out_put;

	trace__fprintf_sample(trace, evsel, sample, thread);

	if (trace->summary)
		thread__update_stats(ttrace, id, sample);

	ret = perf_evsel__sc_tp_uint(evsel, ret, sample);

	/* A successful open-like syscall: cache the fd -> pathname mapping */
	if (id == trace->open_id && ret >= 0 && ttrace->filename.pending_open) {
		trace__set_fd_pathname(thread, ret, ttrace->filename.name);
		ttrace->filename.pending_open = false;
		++trace->stats.vfs_getname;
	}

	/* entry_time == 0: no matching sys_enter was seen (see tstamp comment) */
	if (ttrace->entry_time) {
		duration = sample->time - ttrace->entry_time;
		if (trace__filter_duration(trace, duration))
			goto out;
		duration_calculated = true;
	} else if (trace->duration_filter)
		goto out;

	if (sample->callchain) {
		callchain_ret = trace__resolve_callchain(trace, evsel, sample, &callchain_cursor);
		if (callchain_ret == 0) {
			if (callchain_cursor.nr < trace->min_stack)
				goto out;
			callchain_ret = 1;
		}
	}

	if (trace->summary_only || (ret >= 0 && trace->failure_only))
		goto out;

	trace__fprintf_entry_head(trace, thread, duration, duration_calculated, ttrace->entry_time, trace->output);

	if (ttrace->entry_pending) {
		fprintf(trace->output, "%-70s", ttrace->entry_str);
	} else {
		/* Entry was already flushed by trace__printf_interrupted_entry() */
		fprintf(trace->output, " ... [");
		color_fprintf(trace->output, PERF_COLOR_YELLOW, "continued");
		fprintf(trace->output, "]: %s()", sc->name);
	}

	/*
	 * Return value formatting: note the goto into the else-if branch,
	 * so syscalls without a fmt entry share the errno printing code.
	 */
	if (sc->fmt == NULL) {
		if (ret < 0)
			goto errno_print;
signed_print:
		fprintf(trace->output, ") = %ld", ret);
	} else if (ret < 0) {
errno_print: {
		char bf[STRERR_BUFSIZE];
		const char *emsg = str_error_r(-ret, bf, sizeof(bf)),
			   *e = errno_to_name(evsel, -ret);

		fprintf(trace->output, ") = -1 %s %s", e, emsg);
	}
	} else if (ret == 0 && sc->fmt->timeout)
		fprintf(trace->output, ") = 0 Timeout");
	else if (ttrace->ret_scnprintf) {
		char bf[1024];
		struct syscall_arg arg = {
			.val	= ret,
			.thread	= thread,
			.trace	= trace,
		};
		/* One-shot formatter set by an arg beautifier, see fcntl */
		ttrace->ret_scnprintf(bf, sizeof(bf), &arg);
		ttrace->ret_scnprintf = NULL;
		fprintf(trace->output, ") = %s", bf);
	} else if (sc->fmt->hexret)
		fprintf(trace->output, ") = %#lx", ret);
	else if (sc->fmt->errpid) {
		struct thread *child = machine__find_thread(trace->host, ret, ret);

		if (child != NULL) {
			fprintf(trace->output, ") = %ld", ret);
			if (child->comm_set)
				fprintf(trace->output, " (%s)", thread__comm_str(child));
			thread__put(child);
		}
	} else
		goto signed_print;

	fputc('\n', trace->output);

	if (callchain_ret > 0)
		trace__fprintf_callchain(trace, sample);
	else if (callchain_ret < 0)
		pr_err("Problem processing %s callchain, skipping...\n", perf_evsel__name(evsel));
out:
	ttrace->entry_pending = false;
	err = 0;
out_put:
	thread__put(thread);
	return err;
}
1813 
/*
 * probe:vfs_getname handler: cache the resolved pathname for the syscall in
 * flight and, when a filename argument position was recorded by the
 * SCA_FILENAME beautifier, splice the name into the staged entry string.
 */
static int trace__vfs_getname(struct trace *trace, struct perf_evsel *evsel,
			      union perf_event *event __maybe_unused,
			      struct perf_sample *sample)
{
	struct thread *thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
	struct thread_trace *ttrace;
	size_t filename_len, entry_str_len, to_move;
	ssize_t remaining_space;
	char *pos;
	const char *filename = perf_evsel__rawptr(evsel, sample, "pathname");

	if (!thread)
		goto out;

	ttrace = thread__priv(thread);
	if (!ttrace)
		goto out_put;

	filename_len = strlen(filename);
	if (filename_len == 0)
		goto out_put;

	/* Grow the per-thread filename buffer as needed */
	if (ttrace->filename.namelen < filename_len) {
		char *f = realloc(ttrace->filename.name, filename_len + 1);

		if (f == NULL)
			goto out_put;

		ttrace->filename.namelen = filename_len;
		ttrace->filename.name = f;
	}

	strcpy(ttrace->filename.name, filename);
	ttrace->filename.pending_open = true;

	/* No filename argument position recorded: nothing to splice */
	if (!ttrace->filename.ptr)
		goto out_put;

	entry_str_len = strlen(ttrace->entry_str);
	remaining_space = trace__entry_str_size - entry_str_len - 1; /* \0 */
	if (remaining_space <= 0)
		goto out_put;

	/* Keep the tail of the name when it doesn't fit entirely */
	if (filename_len > (size_t)remaining_space) {
		filename += filename_len - remaining_space;
		filename_len = remaining_space;
	}

	/* Open a gap at the recorded position and insert the name */
	to_move = entry_str_len - ttrace->filename.entry_str_pos + 1; /* \0 */
	pos = ttrace->entry_str + ttrace->filename.entry_str_pos;
	memmove(pos + filename_len, pos, to_move);
	memcpy(pos, filename, filename_len);

	ttrace->filename.ptr = 0;
	ttrace->filename.entry_str_pos = 0;
out_put:
	thread__put(thread);
out:
	return 0;
}
1874 
/*
 * sched:sched_stat_runtime handler: accumulate per-thread and global
 * runtime (ms) for the summary.  When no thread state can be allocated,
 * the raw event is dumped instead.
 */
static int trace__sched_stat_runtime(struct trace *trace, struct perf_evsel *evsel,
				     union perf_event *event __maybe_unused,
				     struct perf_sample *sample)
{
        u64 runtime = perf_evsel__intval(evsel, sample, "runtime");
	double runtime_ms = (double)runtime / NSEC_PER_MSEC;
	struct thread *thread = machine__findnew_thread(trace->host,
							sample->pid,
							sample->tid);
	struct thread_trace *ttrace = thread__trace(thread, trace->output);

	if (ttrace == NULL)
		goto out_dump;

	ttrace->runtime_ms += runtime_ms;
	trace->runtime_ms += runtime_ms;
out_put:
	thread__put(thread);
	return 0;

out_dump:
	fprintf(trace->output, "%s: comm=%s,pid=%u,runtime=%" PRIu64 ",vruntime=%" PRIu64 ")\n",
	       evsel->name,
	       perf_evsel__strval(evsel, sample, "comm"),
	       (pid_t)perf_evsel__intval(evsel, sample, "pid"),
	       runtime,
	       perf_evsel__intval(evsel, sample, "vruntime"));
	goto out_put;
}
1904 
1905 static int bpf_output__printer(enum binary_printer_ops op,
1906 			       unsigned int val, void *extra __maybe_unused, FILE *fp)
1907 {
1908 	unsigned char ch = (unsigned char)val;
1909 
1910 	switch (op) {
1911 	case BINARY_PRINT_CHAR_DATA:
1912 		return fprintf(fp, "%c", isprint(ch) ? ch : '.');
1913 	case BINARY_PRINT_DATA_BEGIN:
1914 	case BINARY_PRINT_LINE_BEGIN:
1915 	case BINARY_PRINT_ADDR:
1916 	case BINARY_PRINT_NUM_DATA:
1917 	case BINARY_PRINT_NUM_PAD:
1918 	case BINARY_PRINT_SEP:
1919 	case BINARY_PRINT_CHAR_PAD:
1920 	case BINARY_PRINT_LINE_END:
1921 	case BINARY_PRINT_DATA_END:
1922 	default:
1923 		break;
1924 	}
1925 
1926 	return 0;
1927 }
1928 
/* Print a BPF_OUTPUT event's raw payload via bpf_output__printer() */
static void bpf_output__fprintf(struct trace *trace,
				struct perf_sample *sample)
{
	binary__fprintf(sample->raw_data, sample->raw_size, 8,
			bpf_output__printer, NULL, trace->output);
}
1935 
1936 static int trace__event_handler(struct trace *trace, struct perf_evsel *evsel,
1937 				union perf_event *event __maybe_unused,
1938 				struct perf_sample *sample)
1939 {
1940 	int callchain_ret = 0;
1941 
1942 	if (sample->callchain) {
1943 		callchain_ret = trace__resolve_callchain(trace, evsel, sample, &callchain_cursor);
1944 		if (callchain_ret == 0) {
1945 			if (callchain_cursor.nr < trace->min_stack)
1946 				goto out;
1947 			callchain_ret = 1;
1948 		}
1949 	}
1950 
1951 	trace__printf_interrupted_entry(trace);
1952 	trace__fprintf_tstamp(trace, sample->time, trace->output);
1953 
1954 	if (trace->trace_syscalls)
1955 		fprintf(trace->output, "(         ): ");
1956 
1957 	fprintf(trace->output, "%s:", evsel->name);
1958 
1959 	if (perf_evsel__is_bpf_output(evsel)) {
1960 		bpf_output__fprintf(trace, sample);
1961 	} else if (evsel->tp_format) {
1962 		event_format__fprintf(evsel->tp_format, sample->cpu,
1963 				      sample->raw_data, sample->raw_size,
1964 				      trace->output);
1965 	}
1966 
1967 	fprintf(trace->output, "\n");
1968 
1969 	if (callchain_ret > 0)
1970 		trace__fprintf_callchain(trace, sample);
1971 	else if (callchain_ret < 0)
1972 		pr_err("Problem processing %s callchain, skipping...\n", perf_evsel__name(evsel));
1973 out:
1974 	return 0;
1975 }
1976 
1977 static void print_location(FILE *f, struct perf_sample *sample,
1978 			   struct addr_location *al,
1979 			   bool print_dso, bool print_sym)
1980 {
1981 
1982 	if ((verbose > 0 || print_dso) && al->map)
1983 		fprintf(f, "%s@", al->map->dso->long_name);
1984 
1985 	if ((verbose > 0 || print_sym) && al->sym)
1986 		fprintf(f, "%s+0x%" PRIx64, al->sym->name,
1987 			al->addr - al->sym->start);
1988 	else if (al->map)
1989 		fprintf(f, "0x%" PRIx64, al->addr);
1990 	else
1991 		fprintf(f, "0x%" PRIx64, sample->addr);
1992 }
1993 
1994 static int trace__pgfault(struct trace *trace,
1995 			  struct perf_evsel *evsel,
1996 			  union perf_event *event __maybe_unused,
1997 			  struct perf_sample *sample)
1998 {
1999 	struct thread *thread;
2000 	struct addr_location al;
2001 	char map_type = 'd';
2002 	struct thread_trace *ttrace;
2003 	int err = -1;
2004 	int callchain_ret = 0;
2005 
2006 	thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
2007 
2008 	if (sample->callchain) {
2009 		callchain_ret = trace__resolve_callchain(trace, evsel, sample, &callchain_cursor);
2010 		if (callchain_ret == 0) {
2011 			if (callchain_cursor.nr < trace->min_stack)
2012 				goto out_put;
2013 			callchain_ret = 1;
2014 		}
2015 	}
2016 
2017 	ttrace = thread__trace(thread, trace->output);
2018 	if (ttrace == NULL)
2019 		goto out_put;
2020 
2021 	if (evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ)
2022 		ttrace->pfmaj++;
2023 	else
2024 		ttrace->pfmin++;
2025 
2026 	if (trace->summary_only)
2027 		goto out;
2028 
2029 	thread__find_symbol(thread, sample->cpumode, sample->ip, &al);
2030 
2031 	trace__fprintf_entry_head(trace, thread, 0, true, sample->time, trace->output);
2032 
2033 	fprintf(trace->output, "%sfault [",
2034 		evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ ?
2035 		"maj" : "min");
2036 
2037 	print_location(trace->output, sample, &al, false, true);
2038 
2039 	fprintf(trace->output, "] => ");
2040 
2041 	thread__find_symbol(thread, sample->cpumode, sample->addr, &al);
2042 
2043 	if (!al.map) {
2044 		thread__find_symbol(thread, sample->cpumode, sample->addr, &al);
2045 
2046 		if (al.map)
2047 			map_type = 'x';
2048 		else
2049 			map_type = '?';
2050 	}
2051 
2052 	print_location(trace->output, sample, &al, true, false);
2053 
2054 	fprintf(trace->output, " (%c%c)\n", map_type, al.level);
2055 
2056 	if (callchain_ret > 0)
2057 		trace__fprintf_callchain(trace, sample);
2058 	else if (callchain_ret < 0)
2059 		pr_err("Problem processing %s callchain, skipping...\n", perf_evsel__name(evsel));
2060 out:
2061 	err = 0;
2062 out_put:
2063 	thread__put(thread);
2064 	return err;
2065 }
2066 
2067 static void trace__set_base_time(struct trace *trace,
2068 				 struct perf_evsel *evsel,
2069 				 struct perf_sample *sample)
2070 {
2071 	/*
2072 	 * BPF events were not setting PERF_SAMPLE_TIME, so be more robust
2073 	 * and don't use sample->time unconditionally, we may end up having
2074 	 * some other event in the future without PERF_SAMPLE_TIME for good
2075 	 * reason, i.e. we may not be interested in its timestamps, just in
2076 	 * it taking place, picking some piece of information when it
2077 	 * appears in our event stream (vfs_getname comes to mind).
2078 	 */
2079 	if (trace->base_time == 0 && !trace->full_time &&
2080 	    (evsel->attr.sample_type & PERF_SAMPLE_TIME))
2081 		trace->base_time = sample->time;
2082 }
2083 
2084 static int trace__process_sample(struct perf_tool *tool,
2085 				 union perf_event *event,
2086 				 struct perf_sample *sample,
2087 				 struct perf_evsel *evsel,
2088 				 struct machine *machine __maybe_unused)
2089 {
2090 	struct trace *trace = container_of(tool, struct trace, tool);
2091 	struct thread *thread;
2092 	int err = 0;
2093 
2094 	tracepoint_handler handler = evsel->handler;
2095 
2096 	thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
2097 	if (thread && thread__is_filtered(thread))
2098 		goto out;
2099 
2100 	trace__set_base_time(trace, evsel, sample);
2101 
2102 	if (handler) {
2103 		++trace->nr_events;
2104 		handler(trace, evsel, event, sample);
2105 	}
2106 out:
2107 	thread__put(thread);
2108 	return err;
2109 }
2110 
2111 static int trace__record(struct trace *trace, int argc, const char **argv)
2112 {
2113 	unsigned int rec_argc, i, j;
2114 	const char **rec_argv;
2115 	const char * const record_args[] = {
2116 		"record",
2117 		"-R",
2118 		"-m", "1024",
2119 		"-c", "1",
2120 	};
2121 
2122 	const char * const sc_args[] = { "-e", };
2123 	unsigned int sc_args_nr = ARRAY_SIZE(sc_args);
2124 	const char * const majpf_args[] = { "-e", "major-faults" };
2125 	unsigned int majpf_args_nr = ARRAY_SIZE(majpf_args);
2126 	const char * const minpf_args[] = { "-e", "minor-faults" };
2127 	unsigned int minpf_args_nr = ARRAY_SIZE(minpf_args);
2128 
2129 	/* +1 is for the event string below */
2130 	rec_argc = ARRAY_SIZE(record_args) + sc_args_nr + 1 +
2131 		majpf_args_nr + minpf_args_nr + argc;
2132 	rec_argv = calloc(rec_argc + 1, sizeof(char *));
2133 
2134 	if (rec_argv == NULL)
2135 		return -ENOMEM;
2136 
2137 	j = 0;
2138 	for (i = 0; i < ARRAY_SIZE(record_args); i++)
2139 		rec_argv[j++] = record_args[i];
2140 
2141 	if (trace->trace_syscalls) {
2142 		for (i = 0; i < sc_args_nr; i++)
2143 			rec_argv[j++] = sc_args[i];
2144 
2145 		/* event string may be different for older kernels - e.g., RHEL6 */
2146 		if (is_valid_tracepoint("raw_syscalls:sys_enter"))
2147 			rec_argv[j++] = "raw_syscalls:sys_enter,raw_syscalls:sys_exit";
2148 		else if (is_valid_tracepoint("syscalls:sys_enter"))
2149 			rec_argv[j++] = "syscalls:sys_enter,syscalls:sys_exit";
2150 		else {
2151 			pr_err("Neither raw_syscalls nor syscalls events exist.\n");
2152 			free(rec_argv);
2153 			return -1;
2154 		}
2155 	}
2156 
2157 	if (trace->trace_pgfaults & TRACE_PFMAJ)
2158 		for (i = 0; i < majpf_args_nr; i++)
2159 			rec_argv[j++] = majpf_args[i];
2160 
2161 	if (trace->trace_pgfaults & TRACE_PFMIN)
2162 		for (i = 0; i < minpf_args_nr; i++)
2163 			rec_argv[j++] = minpf_args[i];
2164 
2165 	for (i = 0; i < (unsigned int)argc; i++)
2166 		rec_argv[j++] = argv[i];
2167 
2168 	return cmd_record(j, rec_argv);
2169 }
2170 
2171 static size_t trace__fprintf_thread_summary(struct trace *trace, FILE *fp);
2172 
2173 static bool perf_evlist__add_vfs_getname(struct perf_evlist *evlist)
2174 {
2175 	struct perf_evsel *evsel = perf_evsel__newtp("probe", "vfs_getname");
2176 
2177 	if (IS_ERR(evsel))
2178 		return false;
2179 
2180 	if (perf_evsel__field(evsel, "pathname") == NULL) {
2181 		perf_evsel__delete(evsel);
2182 		return false;
2183 	}
2184 
2185 	evsel->handler = trace__vfs_getname;
2186 	perf_evlist__add(evlist, evsel);
2187 	return true;
2188 }
2189 
2190 static struct perf_evsel *perf_evsel__new_pgfault(u64 config)
2191 {
2192 	struct perf_evsel *evsel;
2193 	struct perf_event_attr attr = {
2194 		.type = PERF_TYPE_SOFTWARE,
2195 		.mmap_data = 1,
2196 	};
2197 
2198 	attr.config = config;
2199 	attr.sample_period = 1;
2200 
2201 	event_attr_init(&attr);
2202 
2203 	evsel = perf_evsel__new(&attr);
2204 	if (evsel)
2205 		evsel->handler = trace__pgfault;
2206 
2207 	return evsel;
2208 }
2209 
2210 static void trace__handle_event(struct trace *trace, union perf_event *event, struct perf_sample *sample)
2211 {
2212 	const u32 type = event->header.type;
2213 	struct perf_evsel *evsel;
2214 
2215 	if (type != PERF_RECORD_SAMPLE) {
2216 		trace__process_event(trace, trace->host, event, sample);
2217 		return;
2218 	}
2219 
2220 	evsel = perf_evlist__id2evsel(trace->evlist, sample->id);
2221 	if (evsel == NULL) {
2222 		fprintf(trace->output, "Unknown tp ID %" PRIu64 ", skipping...\n", sample->id);
2223 		return;
2224 	}
2225 
2226 	trace__set_base_time(trace, evsel, sample);
2227 
2228 	if (evsel->attr.type == PERF_TYPE_TRACEPOINT &&
2229 	    sample->raw_data == NULL) {
2230 		fprintf(trace->output, "%s sample with no payload for tid: %d, cpu %d, raw_size=%d, skipping...\n",
2231 		       perf_evsel__name(evsel), sample->tid,
2232 		       sample->cpu, sample->raw_size);
2233 	} else {
2234 		tracepoint_handler handler = evsel->handler;
2235 		handler(trace, evsel, event, sample);
2236 	}
2237 }
2238 
/*
 * Create and add the raw_syscalls:sys_enter/sys_exit tracepoint events,
 * wiring up their handlers, the args/ret payload field accessors and the
 * callchain configuration. On success both evsels are cached in
 * trace->syscalls.events for later filter manipulation.
 *
 * Returns 0 on success, -1 on any failure (evsels created so far are
 * deleted via the goto cleanup chain below).
 */
static int trace__add_syscall_newtp(struct trace *trace)
{
	int ret = -1;
	struct perf_evlist *evlist = trace->evlist;
	struct perf_evsel *sys_enter, *sys_exit;

	sys_enter = perf_evsel__syscall_newtp("sys_enter", trace__sys_enter);
	if (sys_enter == NULL)
		goto out;

	if (perf_evsel__init_sc_tp_ptr_field(sys_enter, args))
		goto out_delete_sys_enter;

	sys_exit = perf_evsel__syscall_newtp("sys_exit", trace__sys_exit);
	if (sys_exit == NULL)
		goto out_delete_sys_enter;

	if (perf_evsel__init_sc_tp_uint_field(sys_exit, ret))
		goto out_delete_sys_exit;

	perf_evsel__config_callchain(sys_enter, &trace->opts, &callchain_param);
	perf_evsel__config_callchain(sys_exit, &trace->opts, &callchain_param);

	perf_evlist__add(evlist, sys_enter);
	perf_evlist__add(evlist, sys_exit);

	if (callchain_param.enabled && !trace->kernel_syscallchains) {
		/*
		 * We're interested only in the user space callchain
		 * leading to the syscall, allow overriding that for
		 * debugging reasons using --kernel_syscall_callchains
		 */
		sys_exit->attr.exclude_callchain_kernel = 1;
	}

	trace->syscalls.events.sys_enter = sys_enter;
	trace->syscalls.events.sys_exit  = sys_exit;

	ret = 0;
out:
	return ret;

out_delete_sys_exit:
	perf_evsel__delete_priv(sys_exit);
out_delete_sys_enter:
	perf_evsel__delete_priv(sys_enter);
	goto out;
}
2287 
2288 static int trace__set_ev_qualifier_filter(struct trace *trace)
2289 {
2290 	int err = -1;
2291 	struct perf_evsel *sys_exit;
2292 	char *filter = asprintf_expr_inout_ints("id", !trace->not_ev_qualifier,
2293 						trace->ev_qualifier_ids.nr,
2294 						trace->ev_qualifier_ids.entries);
2295 
2296 	if (filter == NULL)
2297 		goto out_enomem;
2298 
2299 	if (!perf_evsel__append_tp_filter(trace->syscalls.events.sys_enter,
2300 					  filter)) {
2301 		sys_exit = trace->syscalls.events.sys_exit;
2302 		err = perf_evsel__append_tp_filter(sys_exit, filter);
2303 	}
2304 
2305 	free(filter);
2306 out:
2307 	return err;
2308 out_enomem:
2309 	errno = ENOMEM;
2310 	goto out;
2311 }
2312 
2313 static int trace__set_filter_loop_pids(struct trace *trace)
2314 {
2315 	unsigned int nr = 1;
2316 	pid_t pids[32] = {
2317 		getpid(),
2318 	};
2319 	struct thread *thread = machine__find_thread(trace->host, pids[0], pids[0]);
2320 
2321 	while (thread && nr < ARRAY_SIZE(pids)) {
2322 		struct thread *parent = machine__find_thread(trace->host, thread->ppid, thread->ppid);
2323 
2324 		if (parent == NULL)
2325 			break;
2326 
2327 		if (!strcmp(thread__comm_str(parent), "sshd")) {
2328 			pids[nr++] = parent->tid;
2329 			break;
2330 		}
2331 		thread = parent;
2332 	}
2333 
2334 	return perf_evlist__set_filter_pids(trace->evlist, nr, pids);
2335 }
2336 
/*
 * Live tracing main entry point: set up the requested events (syscalls,
 * page faults, sched_stat_runtime, vfs_getname), create the target maps,
 * open/mmap the events, optionally fork the workload, then loop reading
 * samples from the ring buffers until done/interrupted, finally printing
 * the summary. Returns 0 on success or a negative errno-style value.
 */
static int trace__run(struct trace *trace, int argc, const char **argv)
{
	struct perf_evlist *evlist = trace->evlist;
	struct perf_evsel *evsel, *pgfault_maj = NULL, *pgfault_min = NULL;
	int err = -1, i;
	unsigned long before;
	const bool forks = argc > 0;
	bool draining = false;

	trace->live = true;

	if (trace->trace_syscalls && trace__add_syscall_newtp(trace))
		goto out_error_raw_syscalls;

	if (trace->trace_syscalls)
		trace->vfs_getname = perf_evlist__add_vfs_getname(evlist);

	if ((trace->trace_pgfaults & TRACE_PFMAJ)) {
		pgfault_maj = perf_evsel__new_pgfault(PERF_COUNT_SW_PAGE_FAULTS_MAJ);
		if (pgfault_maj == NULL)
			goto out_error_mem;
		perf_evsel__config_callchain(pgfault_maj, &trace->opts, &callchain_param);
		perf_evlist__add(evlist, pgfault_maj);
	}

	if ((trace->trace_pgfaults & TRACE_PFMIN)) {
		pgfault_min = perf_evsel__new_pgfault(PERF_COUNT_SW_PAGE_FAULTS_MIN);
		if (pgfault_min == NULL)
			goto out_error_mem;
		perf_evsel__config_callchain(pgfault_min, &trace->opts, &callchain_param);
		perf_evlist__add(evlist, pgfault_min);
	}

	if (trace->sched &&
	    perf_evlist__add_newtp(evlist, "sched", "sched_stat_runtime",
				   trace__sched_stat_runtime))
		goto out_error_sched_stat_runtime;

	/*
	 * If a global cgroup was set, apply it to all the events without an
	 * explicit cgroup. I.e.:
	 *
	 * 	trace -G A -e sched:*switch
	 *
	 * Will set all raw_syscalls:sys_{enter,exit}, pgfault, vfs_getname, etc
	 * _and_ sched:sched_switch to the 'A' cgroup, while:
	 *
	 * trace -e sched:*switch -G A
	 *
	 * will only set the sched:sched_switch event to the 'A' cgroup, all the
	 * other events (raw_syscalls:sys_{enter,exit}, etc are left "without"
	 * a cgroup (on the root cgroup, sys wide, etc).
	 *
	 * Multiple cgroups:
	 *
	 * trace -G A -e sched:*switch -G B
	 *
	 * the syscall ones go to the 'A' cgroup, the sched:sched_switch goes
	 * to the 'B' cgroup.
	 *
	 * evlist__set_default_cgroup() grabs a reference of the passed cgroup
	 * only for the evsels still without a cgroup, i.e. evsel->cgroup == NULL.
	 */
	if (trace->cgroup)
		evlist__set_default_cgroup(trace->evlist, trace->cgroup);

	err = perf_evlist__create_maps(evlist, &trace->opts.target);
	if (err < 0) {
		fprintf(trace->output, "Problems parsing the target to trace, check your options!\n");
		goto out_delete_evlist;
	}

	err = trace__symbols_init(trace, evlist);
	if (err < 0) {
		fprintf(trace->output, "Problems initializing symbol libraries!\n");
		goto out_delete_evlist;
	}

	perf_evlist__config(evlist, &trace->opts, &callchain_param);

	signal(SIGCHLD, sig_handler);
	signal(SIGINT, sig_handler);

	if (forks) {
		/* Workload is forked but kept stopped until start_workload() */
		err = perf_evlist__prepare_workload(evlist, &trace->opts.target,
						    argv, false, NULL);
		if (err < 0) {
			fprintf(trace->output, "Couldn't run the workload!\n");
			goto out_delete_evlist;
		}
	}

	err = perf_evlist__open(evlist);
	if (err < 0)
		goto out_error_open;

	err = bpf__apply_obj_config();
	if (err) {
		char errbuf[BUFSIZ];

		bpf__strerror_apply_obj_config(err, errbuf, sizeof(errbuf));
		pr_err("ERROR: Apply config to BPF failed: %s\n",
			 errbuf);
		goto out_error_open;
	}

	/*
	 * Better not use !target__has_task() here because we need to cover the
	 * case where no threads were specified in the command line, but a
	 * workload was, and in that case we will fill in the thread_map when
	 * we fork the workload in perf_evlist__prepare_workload.
	 */
	if (trace->filter_pids.nr > 0)
		err = perf_evlist__set_filter_pids(evlist, trace->filter_pids.nr, trace->filter_pids.entries);
	else if (thread_map__pid(evlist->threads, 0) == -1)
		err = trace__set_filter_loop_pids(trace);

	if (err < 0)
		goto out_error_mem;

	if (trace->ev_qualifier_ids.nr > 0) {
		err = trace__set_ev_qualifier_filter(trace);
		if (err < 0)
			goto out_errno;

		pr_debug("event qualifier tracepoint filter: %s\n",
			 trace->syscalls.events.sys_exit->filter);
	}

	err = perf_evlist__apply_filters(evlist, &evsel);
	if (err < 0)
		goto out_error_apply_filters;

	err = perf_evlist__mmap(evlist, trace->opts.mmap_pages);
	if (err < 0)
		goto out_error_mmap;

	if (!target__none(&trace->opts.target) && !trace->opts.initial_delay)
		perf_evlist__enable(evlist);

	if (forks)
		perf_evlist__start_workload(evlist);

	if (trace->opts.initial_delay) {
		/* --delay: let the workload warm up before counting */
		usleep(trace->opts.initial_delay * 1000);
		perf_evlist__enable(evlist);
	}

	trace->multiple_threads = thread_map__pid(evlist->threads, 0) == -1 ||
				  evlist->threads->nr > 1 ||
				  perf_evlist__first(evlist)->attr.inherit;

	/*
	 * Now that we already used evsel->attr to ask the kernel to setup the
	 * events, lets reuse evsel->attr.sample_max_stack as the limit in
	 * trace__resolve_callchain(), allowing per-event max-stack settings
	 * to override an explicitely set --max-stack global setting.
	 */
	evlist__for_each_entry(evlist, evsel) {
		if (evsel__has_callchain(evsel) &&
		    evsel->attr.sample_max_stack == 0)
			evsel->attr.sample_max_stack = trace->max_stack;
	}
again:
	/* Main event loop: drain every mmap'ed ring buffer, then poll. */
	before = trace->nr_events;

	for (i = 0; i < evlist->nr_mmaps; i++) {
		union perf_event *event;
		struct perf_mmap *md;

		md = &evlist->mmap[i];
		if (perf_mmap__read_init(md) < 0)
			continue;

		while ((event = perf_mmap__read_event(md)) != NULL) {
			struct perf_sample sample;

			++trace->nr_events;

			err = perf_evlist__parse_sample(evlist, event, &sample);
			if (err) {
				fprintf(trace->output, "Can't parse sample, err = %d, skipping...\n", err);
				goto next_event;
			}

			trace__handle_event(trace, event, &sample);
next_event:
			perf_mmap__consume(md);

			if (interrupted)
				goto out_disable;

			if (done && !draining) {
				/* Stop producing but keep draining what's buffered */
				perf_evlist__disable(evlist);
				draining = true;
			}
		}
		perf_mmap__read_done(md);
	}

	if (trace->nr_events == before) {
		int timeout = done ? 100 : -1;

		if (!draining && perf_evlist__poll(evlist, timeout) > 0) {
			if (perf_evlist__filter_pollfd(evlist, POLLERR | POLLHUP) == 0)
				draining = true;

			goto again;
		}
	} else {
		goto again;
	}

out_disable:
	thread__zput(trace->current);

	perf_evlist__disable(evlist);

	if (!err) {
		if (trace->summary)
			trace__fprintf_thread_summary(trace, trace->output);

		if (trace->show_tool_stats) {
			fprintf(trace->output, "Stats:\n "
					       " vfs_getname : %" PRIu64 "\n"
					       " proc_getname: %" PRIu64 "\n",
				trace->stats.vfs_getname,
				trace->stats.proc_getname);
		}
	}

out_delete_evlist:
	trace__symbols__exit(trace);

	perf_evlist__delete(evlist);
	cgroup__put(trace->cgroup);
	trace->evlist = NULL;
	trace->live = false;
	return err;
/*
 * Error labels live in a brace-scope after the return so they can share
 * a single errbuf without it occupying stack space on the normal path.
 */
{
	char errbuf[BUFSIZ];

out_error_sched_stat_runtime:
	tracing_path__strerror_open_tp(errno, errbuf, sizeof(errbuf), "sched", "sched_stat_runtime");
	goto out_error;

out_error_raw_syscalls:
	tracing_path__strerror_open_tp(errno, errbuf, sizeof(errbuf), "raw_syscalls", "sys_(enter|exit)");
	goto out_error;

out_error_mmap:
	perf_evlist__strerror_mmap(evlist, errno, errbuf, sizeof(errbuf));
	goto out_error;

out_error_open:
	perf_evlist__strerror_open(evlist, errno, errbuf, sizeof(errbuf));

out_error:
	fprintf(trace->output, "%s\n", errbuf);
	goto out_delete_evlist;

out_error_apply_filters:
	fprintf(trace->output,
		"Failed to set filter \"%s\" on event %s with %d (%s)\n",
		evsel->filter, perf_evsel__name(evsel), errno,
		str_error_r(errno, errbuf, sizeof(errbuf)));
	goto out_delete_evlist;
}
out_error_mem:
	fprintf(trace->output, "Not enough memory to run!\n");
	goto out_delete_evlist;

out_errno:
	fprintf(trace->output, "errno=%d,%s\n", errno, strerror(errno));
	goto out_delete_evlist;
}
2613 
/*
 * Replay mode: process a previously recorded perf.data file. Sets up the
 * perf_tool callbacks, resolves the syscall enter/exit tracepoints
 * (raw_syscalls, falling back to the older syscalls names) and page fault
 * software events, then processes all events in timestamp order.
 */
static int trace__replay(struct trace *trace)
{
	const struct perf_evsel_str_handler handlers[] = {
		{ "probe:vfs_getname",	     trace__vfs_getname, },
	};
	struct perf_data data = {
		.file      = {
			.path = input_name,
		},
		.mode      = PERF_DATA_MODE_READ,
		.force     = trace->force,
	};
	struct perf_session *session;
	struct perf_evsel *evsel;
	int err = -1;

	trace->tool.sample	  = trace__process_sample;
	trace->tool.mmap	  = perf_event__process_mmap;
	trace->tool.mmap2	  = perf_event__process_mmap2;
	trace->tool.comm	  = perf_event__process_comm;
	trace->tool.exit	  = perf_event__process_exit;
	trace->tool.fork	  = perf_event__process_fork;
	trace->tool.attr	  = perf_event__process_attr;
	trace->tool.tracing_data  = perf_event__process_tracing_data;
	trace->tool.build_id	  = perf_event__process_build_id;
	trace->tool.namespaces	  = perf_event__process_namespaces;

	trace->tool.ordered_events = true;
	trace->tool.ordering_requires_timestamps = true;

	/* add tid to output */
	trace->multiple_threads = true;

	session = perf_session__new(&data, false, &trace->tool);
	if (session == NULL)
		return -1;

	if (trace->opts.target.pid)
		symbol_conf.pid_list_str = strdup(trace->opts.target.pid);

	if (trace->opts.target.tid)
		symbol_conf.tid_list_str = strdup(trace->opts.target.tid);

	if (symbol__init(&session->header.env) < 0)
		goto out;

	trace->host = &session->machines.host;

	err = perf_session__set_tracepoints_handlers(session, handlers);
	if (err)
		goto out;

	evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
						     "raw_syscalls:sys_enter");
	/* older kernels have syscalls tp versus raw_syscalls */
	if (evsel == NULL)
		evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
							     "syscalls:sys_enter");

	if (evsel &&
	    (perf_evsel__init_syscall_tp(evsel, trace__sys_enter) < 0 ||
	    perf_evsel__init_sc_tp_ptr_field(evsel, args))) {
		pr_err("Error during initialize raw_syscalls:sys_enter event\n");
		goto out;
	}

	evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
						     "raw_syscalls:sys_exit");
	if (evsel == NULL)
		evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
							     "syscalls:sys_exit");
	if (evsel &&
	    (perf_evsel__init_syscall_tp(evsel, trace__sys_exit) < 0 ||
	    perf_evsel__init_sc_tp_uint_field(evsel, ret))) {
		pr_err("Error during initialize raw_syscalls:sys_exit event\n");
		goto out;
	}

	/* Route any recorded page fault software events to trace__pgfault() */
	evlist__for_each_entry(session->evlist, evsel) {
		if (evsel->attr.type == PERF_TYPE_SOFTWARE &&
		    (evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ ||
		     evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MIN ||
		     evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS))
			evsel->handler = trace__pgfault;
	}

	setup_pager();

	err = perf_session__process_events(session);
	if (err)
		pr_err("Failed to process events, error %d", err);

	else if (trace->summary)
		trace__fprintf_thread_summary(trace, trace->output);

out:
	perf_session__delete(session);

	return err;
}
2714 
/* Emit the header line introducing the per-thread summary section. */
static size_t trace__fprintf_threads_header(FILE *fp)
{
	return fprintf(fp, "\n Summary of events:\n\n");
}
2723 
/*
 * Resorted rb tree of per-syscall stats ordered by total elapsed msecs.
 * The body below is the DEFINE_RESORT_RB entry filler: each source node is
 * an intlist entry keyed by syscall id with ->priv pointing to its
 * struct stats (may be NULL, hence the guard on msecs).
 */
DEFINE_RESORT_RB(syscall_stats, a->msecs > b->msecs,
	struct stats 	*stats;
	double		msecs;
	int		syscall;
)
{
	struct int_node *source = rb_entry(nd, struct int_node, rb_node);
	struct stats *stats = source->priv;

	entry->syscall = source->i;
	entry->stats   = stats;
	/* total = n samples * average, converted from ns to msec */
	entry->msecs   = stats ? (u64)stats->n * (avg_stats(stats) / NSEC_PER_MSEC) : 0;
}
2737 
/*
 * Print one thread's syscall statistics table: calls, total/min/avg/max
 * times in msec and stddev as a percentage of the average, ordered by
 * total elapsed time via the syscall_stats resort rb tree.
 * Returns the number of characters printed.
 */
static size_t thread__dump_stats(struct thread_trace *ttrace,
				 struct trace *trace, FILE *fp)
{
	size_t printed = 0;
	struct syscall *sc;
	struct rb_node *nd;
	DECLARE_RESORT_RB_INTLIST(syscall_stats, ttrace->syscall_stats);

	if (syscall_stats == NULL)
		return 0;

	printed += fprintf(fp, "\n");

	printed += fprintf(fp, "   syscall            calls    total       min       avg       max      stddev\n");
	printed += fprintf(fp, "                               (msec)    (msec)    (msec)    (msec)        (%%)\n");
	printed += fprintf(fp, "   --------------- -------- --------- --------- --------- ---------     ------\n");

	resort_rb__for_each_entry(nd, syscall_stats) {
		struct stats *stats = syscall_stats_entry->stats;
		if (stats) {
			/* stats times are in ns; convert to msec for display */
			double min = (double)(stats->min) / NSEC_PER_MSEC;
			double max = (double)(stats->max) / NSEC_PER_MSEC;
			double avg = avg_stats(stats);
			double pct;
			u64 n = (u64) stats->n;

			/* stddev as a percentage of the average, 0 when avg is 0 */
			pct = avg ? 100.0 * stddev_stats(stats)/avg : 0.0;
			avg /= NSEC_PER_MSEC;

			sc = &trace->syscalls.table[syscall_stats_entry->syscall];
			printed += fprintf(fp, "   %-15s", sc->name);
			printed += fprintf(fp, " %8" PRIu64 " %9.3f %9.3f %9.3f",
					   n, syscall_stats_entry->msecs, min, avg);
			printed += fprintf(fp, " %9.3f %9.2f%%\n", max, pct);
		}
	}

	resort_rb__delete(syscall_stats);
	printed += fprintf(fp, "\n\n");

	return printed;
}
2780 
2781 static size_t trace__fprintf_thread(FILE *fp, struct thread *thread, struct trace *trace)
2782 {
2783 	size_t printed = 0;
2784 	struct thread_trace *ttrace = thread__priv(thread);
2785 	double ratio;
2786 
2787 	if (ttrace == NULL)
2788 		return 0;
2789 
2790 	ratio = (double)ttrace->nr_events / trace->nr_events * 100.0;
2791 
2792 	printed += fprintf(fp, " %s (%d), ", thread__comm_str(thread), thread->tid);
2793 	printed += fprintf(fp, "%lu events, ", ttrace->nr_events);
2794 	printed += fprintf(fp, "%.1f%%", ratio);
2795 	if (ttrace->pfmaj)
2796 		printed += fprintf(fp, ", %lu majfaults", ttrace->pfmaj);
2797 	if (ttrace->pfmin)
2798 		printed += fprintf(fp, ", %lu minfaults", ttrace->pfmin);
2799 	if (trace->sched)
2800 		printed += fprintf(fp, ", %.3f msec\n", ttrace->runtime_ms);
2801 	else if (fputc('\n', fp) != EOF)
2802 		++printed;
2803 
2804 	printed += thread__dump_stats(ttrace, trace, fp);
2805 
2806 	return printed;
2807 }
2808 
2809 static unsigned long thread__nr_events(struct thread_trace *ttrace)
2810 {
2811 	return ttrace ? ttrace->nr_events : 0;
2812 }
2813 
/*
 * Resorted rb tree of a machine's threads ordered by their number of
 * traced events, used for the --summary output; the DEFINE_RESORT_RB
 * entry filler below just records the thread pointer per node.
 */
DEFINE_RESORT_RB(threads, (thread__nr_events(a->thread->priv) < thread__nr_events(b->thread->priv)),
	struct thread *thread;
)
{
	entry->thread = rb_entry(nd, struct thread, rb_node);
}
2820 
/*
 * Print the end-of-run summary: the header followed by one section per
 * thread, walking each bucket of the machine's threads table resorted by
 * number of events. Returns the number of characters printed (0 when the
 * resort tree could not be allocated).
 */
static size_t trace__fprintf_thread_summary(struct trace *trace, FILE *fp)
{
	size_t printed = trace__fprintf_threads_header(fp);
	struct rb_node *nd;
	int i;

	for (i = 0; i < THREADS__TABLE_SIZE; i++) {
		DECLARE_RESORT_RB_MACHINE_THREADS(threads, trace->host, i);

		if (threads == NULL) {
			fprintf(fp, "%s", "Error sorting output by nr_events!\n");
			return 0;
		}

		resort_rb__for_each_entry(nd, threads)
			printed += trace__fprintf_thread(fp, threads_entry->thread, trace);

		resort_rb__delete(threads);
	}
	return printed;
}
2842 
2843 static int trace__set_duration(const struct option *opt, const char *str,
2844 			       int unset __maybe_unused)
2845 {
2846 	struct trace *trace = opt->value;
2847 
2848 	trace->duration_filter = atof(str);
2849 	return 0;
2850 }
2851 
2852 static int trace__set_filter_pids(const struct option *opt, const char *str,
2853 				  int unset __maybe_unused)
2854 {
2855 	int ret = -1;
2856 	size_t i;
2857 	struct trace *trace = opt->value;
2858 	/*
2859 	 * FIXME: introduce a intarray class, plain parse csv and create a
2860 	 * { int nr, int entries[] } struct...
2861 	 */
2862 	struct intlist *list = intlist__new(str);
2863 
2864 	if (list == NULL)
2865 		return -1;
2866 
2867 	i = trace->filter_pids.nr = intlist__nr_entries(list) + 1;
2868 	trace->filter_pids.entries = calloc(i, sizeof(pid_t));
2869 
2870 	if (trace->filter_pids.entries == NULL)
2871 		goto out;
2872 
2873 	trace->filter_pids.entries[0] = getpid();
2874 
2875 	for (i = 1; i < trace->filter_pids.nr; ++i)
2876 		trace->filter_pids.entries[i] = intlist__entry(list, i - 1)->i;
2877 
2878 	intlist__delete(list);
2879 	ret = 0;
2880 out:
2881 	return ret;
2882 }
2883 
2884 static int trace__open_output(struct trace *trace, const char *filename)
2885 {
2886 	struct stat st;
2887 
2888 	if (!stat(filename, &st) && st.st_size) {
2889 		char oldname[PATH_MAX];
2890 
2891 		scnprintf(oldname, sizeof(oldname), "%s.old", filename);
2892 		unlink(oldname);
2893 		rename(filename, oldname);
2894 	}
2895 
2896 	trace->output = fopen(filename, "w");
2897 
2898 	return trace->output == NULL ? -errno : 0;
2899 }
2900 
2901 static int parse_pagefaults(const struct option *opt, const char *str,
2902 			    int unset __maybe_unused)
2903 {
2904 	int *trace_pgfaults = opt->value;
2905 
2906 	if (strcmp(str, "all") == 0)
2907 		*trace_pgfaults |= TRACE_PFMAJ | TRACE_PFMIN;
2908 	else if (strcmp(str, "maj") == 0)
2909 		*trace_pgfaults |= TRACE_PFMAJ;
2910 	else if (strcmp(str, "min") == 0)
2911 		*trace_pgfaults |= TRACE_PFMIN;
2912 	else
2913 		return -1;
2914 
2915 	return 0;
2916 }
2917 
/* Set the same sample handler on every evsel in the list. */
static void evlist__set_evsel_handler(struct perf_evlist *evlist, void *handler)
{
	struct perf_evsel *evsel;

	evlist__for_each_entry(evlist, evsel)
		evsel->handler = handler;
}
2925 
/*
 * XXX: Hackish: split the combined -e/--event argument into syscalls
 * (handled via raw_syscalls:{sys_enter,sys_exit}) and other events
 * (tracepoints, HW, SW, etc.) so that the existing facilities
 * (trace->ev_qualifier + parse_options()) can be used unchanged.
 *
 * It'd be better to introduce a parse_options() variant that would return a
 * list with the terms it didn't match to an event...
 */
2934 static int trace__parse_events_option(const struct option *opt, const char *str,
2935 				      int unset __maybe_unused)
2936 {
2937 	struct trace *trace = (struct trace *)opt->value;
2938 	const char *s = str;
2939 	char *sep = NULL, *lists[2] = { NULL, NULL, };
2940 	int len = strlen(str) + 1, err = -1, list, idx;
2941 	char *strace_groups_dir = system_path(STRACE_GROUPS_DIR);
2942 	char group_name[PATH_MAX];
2943 
2944 	if (strace_groups_dir == NULL)
2945 		return -1;
2946 
2947 	if (*s == '!') {
2948 		++s;
2949 		trace->not_ev_qualifier = true;
2950 	}
2951 
2952 	while (1) {
2953 		if ((sep = strchr(s, ',')) != NULL)
2954 			*sep = '\0';
2955 
2956 		list = 0;
2957 		if (syscalltbl__id(trace->sctbl, s) >= 0 ||
2958 		    syscalltbl__strglobmatch_first(trace->sctbl, s, &idx) >= 0) {
2959 			list = 1;
2960 		} else {
2961 			path__join(group_name, sizeof(group_name), strace_groups_dir, s);
2962 			if (access(group_name, R_OK) == 0)
2963 				list = 1;
2964 		}
2965 
2966 		if (lists[list]) {
2967 			sprintf(lists[list] + strlen(lists[list]), ",%s", s);
2968 		} else {
2969 			lists[list] = malloc(len);
2970 			if (lists[list] == NULL)
2971 				goto out;
2972 			strcpy(lists[list], s);
2973 		}
2974 
2975 		if (!sep)
2976 			break;
2977 
2978 		*sep = ',';
2979 		s = sep + 1;
2980 	}
2981 
2982 	if (lists[1] != NULL) {
2983 		struct strlist_config slist_config = {
2984 			.dirname = strace_groups_dir,
2985 		};
2986 
2987 		trace->ev_qualifier = strlist__new(lists[1], &slist_config);
2988 		if (trace->ev_qualifier == NULL) {
2989 			fputs("Not enough memory to parse event qualifier", trace->output);
2990 			goto out;
2991 		}
2992 
2993 		if (trace__validate_ev_qualifier(trace))
2994 			goto out;
2995 		trace->trace_syscalls = true;
2996 	}
2997 
2998 	err = 0;
2999 
3000 	if (lists[0]) {
3001 		struct option o = OPT_CALLBACK('e', "event", &trace->evlist, "event",
3002 					       "event selector. use 'perf list' to list available events",
3003 					       parse_events_option);
3004 		err = parse_events_option(&o, lists[0], 0);
3005 	}
3006 out:
3007 	if (sep)
3008 		*sep = ',';
3009 
3010 	return err;
3011 }
3012 
3013 static int trace__parse_cgroups(const struct option *opt, const char *str, int unset)
3014 {
3015 	struct trace *trace = opt->value;
3016 
3017 	if (!list_empty(&trace->evlist->entries))
3018 		return parse_cgroups(opt, str, unset);
3019 
3020 	trace->cgroup = evlist__findnew_cgroup(trace->evlist, str);
3021 
3022 	return 0;
3023 }
3024 
/*
 * Entry point for 'perf trace': parse options, set up the evlist and the
 * syscall table, then either replay a recorded session (-i) or run/attach
 * to a live workload via trace__run().
 */
int cmd_trace(int argc, const char **argv)
{
	const char *trace_usage[] = {
		"perf trace [<options>] [<command>]",
		"perf trace [<options>] -- <command> [<options>]",
		"perf trace record [<options>] [<command>]",
		"perf trace record [<options>] -- <command> [<options>]",
		NULL
	};
	struct trace trace = {
		.syscalls = {
			. max = -1,
		},
		.opts = {
			.target = {
				.uid	   = UINT_MAX,
				.uses_mmap = true,
			},
			.user_freq     = UINT_MAX,
			.user_interval = ULLONG_MAX,
			.no_buffering  = true,
			.mmap_pages    = UINT_MAX,
			.proc_map_timeout  = 500,
		},
		.output = stderr,	/* until -o is processed, if given */
		.show_comm = true,
		.trace_syscalls = false,
		.kernel_syscallchains = false,
		.max_stack = UINT_MAX,	/* sentinel: "user didn't set it" */
	};
	const char *output_name = NULL;
	const struct option trace_options[] = {
	OPT_CALLBACK('e', "event", &trace, "event",
		     "event/syscall selector. use 'perf list' to list available events",
		     trace__parse_events_option),
	OPT_BOOLEAN(0, "comm", &trace.show_comm,
		    "show the thread COMM next to its id"),
	OPT_BOOLEAN(0, "tool_stats", &trace.show_tool_stats, "show tool stats"),
	OPT_CALLBACK(0, "expr", &trace, "expr", "list of syscalls/events to trace",
		     trace__parse_events_option),
	OPT_STRING('o', "output", &output_name, "file", "output file name"),
	OPT_STRING('i', "input", &input_name, "file", "Analyze events in file"),
	OPT_STRING('p', "pid", &trace.opts.target.pid, "pid",
		    "trace events on existing process id"),
	OPT_STRING('t', "tid", &trace.opts.target.tid, "tid",
		    "trace events on existing thread id"),
	OPT_CALLBACK(0, "filter-pids", &trace, "CSV list of pids",
		     "pids to filter (by the kernel)", trace__set_filter_pids),
	OPT_BOOLEAN('a', "all-cpus", &trace.opts.target.system_wide,
		    "system-wide collection from all CPUs"),
	OPT_STRING('C', "cpu", &trace.opts.target.cpu_list, "cpu",
		    "list of cpus to monitor"),
	OPT_BOOLEAN(0, "no-inherit", &trace.opts.no_inherit,
		    "child tasks do not inherit counters"),
	OPT_CALLBACK('m', "mmap-pages", &trace.opts.mmap_pages, "pages",
		     "number of mmap data pages",
		     perf_evlist__parse_mmap_pages),
	OPT_STRING('u', "uid", &trace.opts.target.uid_str, "user",
		   "user to profile"),
	OPT_CALLBACK(0, "duration", &trace, "float",
		     "show only events with duration > N.M ms",
		     trace__set_duration),
	OPT_BOOLEAN(0, "sched", &trace.sched, "show blocking scheduler events"),
	OPT_INCR('v', "verbose", &verbose, "be more verbose"),
	OPT_BOOLEAN('T', "time", &trace.full_time,
		    "Show full timestamp, not time relative to first start"),
	OPT_BOOLEAN(0, "failure", &trace.failure_only,
		    "Show only syscalls that failed"),
	OPT_BOOLEAN('s', "summary", &trace.summary_only,
		    "Show only syscall summary with statistics"),
	OPT_BOOLEAN('S', "with-summary", &trace.summary,
		    "Show all syscalls and summary with statistics"),
	OPT_CALLBACK_DEFAULT('F', "pf", &trace.trace_pgfaults, "all|maj|min",
		     "Trace pagefaults", parse_pagefaults, "maj"),
	OPT_BOOLEAN(0, "syscalls", &trace.trace_syscalls, "Trace syscalls"),
	OPT_BOOLEAN('f', "force", &trace.force, "don't complain, do it"),
	OPT_CALLBACK(0, "call-graph", &trace.opts,
		     "record_mode[,record_size]", record_callchain_help,
		     &record_parse_callchain_opt),
	OPT_BOOLEAN(0, "kernel-syscall-graph", &trace.kernel_syscallchains,
		    "Show the kernel callchains on the syscall exit path"),
	OPT_UINTEGER(0, "min-stack", &trace.min_stack,
		     "Set the minimum stack depth when parsing the callchain, "
		     "anything below the specified depth will be ignored."),
	OPT_UINTEGER(0, "max-stack", &trace.max_stack,
		     "Set the maximum stack depth when parsing the callchain, "
		     "anything beyond the specified depth will be ignored. "
		     "Default: kernel.perf_event_max_stack or " __stringify(PERF_MAX_STACK_DEPTH)),
	OPT_BOOLEAN(0, "print-sample", &trace.print_sample,
			"print the PERF_RECORD_SAMPLE PERF_SAMPLE_ info, for debugging"),
	OPT_UINTEGER(0, "proc-map-timeout", &trace.opts.proc_map_timeout,
			"per thread proc mmap processing timeout in ms"),
	OPT_CALLBACK('G', "cgroup", &trace, "name", "monitor event in cgroup name only",
		     trace__parse_cgroups),
	OPT_UINTEGER('D', "delay", &trace.opts.initial_delay,
		     "ms to wait before starting measurement after program "
		     "start"),
	OPT_END()
	};
	bool __maybe_unused max_stack_user_set = true;
	bool mmap_pages_user_set = true;
	const char * const trace_subcommands[] = { "record", NULL };
	int err;
	char bf[BUFSIZ];

	signal(SIGSEGV, sighandler_dump_stack);
	signal(SIGFPE, sighandler_dump_stack);

	trace.evlist = perf_evlist__new();
	trace.sctbl = syscalltbl__new();

	if (trace.evlist == NULL || trace.sctbl == NULL) {
		pr_err("Not enough memory to run!\n");
		err = -ENOMEM;
		goto out;
	}

	argc = parse_options_subcommand(argc, argv, trace_options, trace_subcommands,
				 trace_usage, PARSE_OPT_STOP_AT_NON_OPTION);

	/* Cgroup filtering only makes sense for a system-wide session. */
	if ((nr_cgroups || trace.cgroup) && !trace.opts.target.system_wide) {
		usage_with_options_msg(trace_usage, trace_options,
				       "cgroup monitoring only available in system-wide mode");
	}

	err = bpf__setup_stdout(trace.evlist);
	if (err) {
		bpf__strerror_setup_stdout(trace.evlist, err, bf, sizeof(bf));
		pr_err("ERROR: Setup BPF stdout failed: %s\n", bf);
		goto out;
	}

	err = -1;

	/* Page fault tracing needs the faulting address and a timestamp. */
	if (trace.trace_pgfaults) {
		trace.opts.sample_address = true;
		trace.opts.sample_time = true;
	}

	/* UINT_MAX sentinels mean the user didn't pass the option. */
	if (trace.opts.mmap_pages == UINT_MAX)
		mmap_pages_user_set = false;

	if (trace.max_stack == UINT_MAX) {
		trace.max_stack = input_name ? PERF_MAX_STACK_DEPTH : sysctl__max_stack();
		max_stack_user_set = false;
	}

#ifdef HAVE_DWARF_UNWIND_SUPPORT
	/*
	 * --min-stack/--max-stack without an explicit --call-graph: default
	 * to DWARF-based unwinding.
	 */
	if ((trace.min_stack || max_stack_user_set) && !callchain_param.enabled) {
		record_opts__parse_callchain(&trace.opts, &callchain_param, "dwarf", false);
	}
#endif

	if (callchain_param.enabled) {
		/* Callchains need bigger mmap rings; only root may exceed the mlock limit. */
		if (!mmap_pages_user_set && geteuid() == 0)
			trace.opts.mmap_pages = perf_event_mlock_kb_in_pages() * 4;

		symbol_conf.use_callchain = true;
	}

	if (trace.evlist->nr_entries > 0)
		evlist__set_evsel_handler(trace.evlist, trace__event_handler);

	/* 'perf trace record' delegates to 'perf record' with trace events. */
	if ((argc >= 1) && (strcmp(argv[0], "record") == 0))
		return trace__record(&trace, argc-1, &argv[1]);

	/* summary_only implies summary option, but don't overwrite summary if set */
	if (trace.summary_only)
		trace.summary = trace.summary_only;

	/* No events requested at all: default to strace-like syscall tracing. */
	if (!trace.trace_syscalls && !trace.trace_pgfaults &&
	    trace.evlist->nr_entries == 0 /* Was --events used? */) {
		trace.trace_syscalls = true;
	}

	if (output_name != NULL) {
		err = trace__open_output(&trace, output_name);
		if (err < 0) {
			perror("failed to create output file");
			goto out;
		}
	}

	/* Cache the "open" syscall id for the pathname beautifiers. */
	trace.open_id = syscalltbl__id(trace.sctbl, "open");

	err = target__validate(&trace.opts.target);
	if (err) {
		target__strerror(&trace.opts.target, err, bf, sizeof(bf));
		fprintf(trace.output, "%s", bf);
		goto out_close;
	}

	err = target__parse_uid(&trace.opts.target);
	if (err) {
		target__strerror(&trace.opts.target, err, bf, sizeof(bf));
		fprintf(trace.output, "%s", bf);
		goto out_close;
	}

	/* Neither a workload nor a pid/tid/uid/cpu target: go system wide. */
	if (!argc && target__none(&trace.opts.target))
		trace.opts.target.system_wide = true;

	if (input_name)
		err = trace__replay(&trace);
	else
		err = trace__run(&trace, argc, argv);

out_close:
	if (output_name != NULL)
		fclose(trace.output);
out:
	return err;
}
3238