/*
 * builtin-trace.c
 *
 * Builtin 'trace' command:
 *
 * Display a continuously updated trace of any workload, CPU, specific PID,
 * system wide, etc.  Default format is loosely strace like, but any other
 * event may be specified using --event.
 *
 * Copyright (C) 2012, 2013, 2014, 2015 Red Hat Inc, Arnaldo Carvalho de Melo <acme@redhat.com>
 *
 * Initially based on the 'trace' prototype by Thomas Gleixner:
 *
 * http://lwn.net/Articles/415728/ ("Announcing a new utility: 'trace'")
 *
 * Released under the GPL v2. (and only v2, not any later version)
 */

#include <traceevent/event-parse.h>
#include <api/fs/tracing_path.h>
#include "builtin.h"
#include "util/cgroup.h"
#include "util/color.h"
#include "util/debug.h"
#include "util/env.h"
#include "util/event.h"
#include "util/evlist.h"
#include <subcmd/exec-cmd.h>
#include "util/machine.h"
#include "util/path.h"
#include "util/session.h"
#include "util/thread.h"
#include <subcmd/parse-options.h>
#include "util/strlist.h"
#include "util/intlist.h"
#include "util/thread_map.h"
#include "util/stat.h"
#include "trace/beauty/beauty.h"
#include "trace-event.h"
#include "util/parse-events.h"
#include "util/bpf-loader.h"
#include "callchain.h"
#include "print_binary.h"
#include "string2.h"
#include "syscalltbl.h"
#include "rb_resort.h"

#include <errno.h>
#include <inttypes.h>
#include <poll.h>
#include <signal.h>
#include <stdlib.h>
#include <string.h>
#include <linux/err.h>
#include <linux/filter.h>
#include <linux/kernel.h>
#include <linux/random.h>
#include <linux/stringify.h>
#include <linux/time64.h>
#include <fcntl.h>

#include "sane_ctype.h"

#ifndef O_CLOEXEC
# define O_CLOEXEC		02000000
#endif

#ifndef F_LINUX_SPECIFIC_BASE
# define F_LINUX_SPECIFIC_BASE	1024
#endif

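/*
 * Global state for one 'perf trace' run: the perf_tool callbacks, the
 * syscall table, the host machine and current thread, and the knobs set
 * from the command line (filters, summary/failure-only modes, callchain
 * depth limits, etc.).
 */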
struct trace {
	struct perf_tool	tool;
	struct syscalltbl	*sctbl;
	struct {
		int		max;
		struct syscall  *table;
		struct {
			struct perf_evsel *sys_enter,
					  *sys_exit;
		}		events;
	} syscalls;
	struct record_opts	opts;
	struct perf_evlist	*evlist;
	struct machine		*host;
	struct thread		*current;
	struct cgroup		*cgroup;
	u64			base_time;
	FILE			*output;
	unsigned long		nr_events;
	struct strlist		*ev_qualifier;
	struct {
		size_t		nr;
		int		*entries;
	}			ev_qualifier_ids;
	struct {
		size_t		nr;
		pid_t		*entries;
	}			filter_pids;
	double			duration_filter;
	double			runtime_ms;
	struct {
		u64		vfs_getname,
				proc_getname;
	} stats;
	unsigned int		max_stack;
	unsigned int		min_stack;
	bool			not_ev_qualifier;
	bool			live;
	bool			full_time;
	bool			sched;
	bool			multiple_threads;
	bool			summary;
	bool			summary_only;
	bool			failure_only;
	bool			show_comm;
	bool			print_sample;
	bool			show_tool_stats;
	bool			trace_syscalls;
	bool			kernel_syscallchains;
	bool			force;
	bool			vfs_getname;
	int			trace_pgfaults;
	int			open_id;
};

struct tp_field {
	int offset;
	union {
		u64 (*integer)(struct tp_field *field, struct perf_sample *sample);
		void *(*pointer)(struct tp_field *field, struct perf_sample *sample);
	};
};

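/*
 * Generate tp_field__u{8,16,32,64}() helpers that read an unsigned integer
 * of the given width from the tracepoint raw data, copying it out with
 * memcpy() since raw_data + offset may not be suitably aligned.
 */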
#define TP_UINT_FIELD(bits) \
static u64 tp_field__u##bits(struct tp_field *field, struct perf_sample *sample) \
{ \
	u##bits value; \
	memcpy(&value, sample->raw_data + field->offset, sizeof(value)); \
	return value;  \
}

TP_UINT_FIELD(8);
TP_UINT_FIELD(16);
TP_UINT_FIELD(32);
TP_UINT_FIELD(64);

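/*
 * Same as TP_UINT_FIELD(), but byte-swapping the value, for samples
 * recorded in the opposite byte order of the host.
 */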
#define TP_UINT_FIELD__SWAPPED(bits) \
static u64 tp_field__swapped_u##bits(struct tp_field *field, struct perf_sample *sample) \
{ \
	u##bits value; \
	memcpy(&value, sample->raw_data + field->offset, sizeof(value)); \
	return bswap_##bits(value);\
}

TP_UINT_FIELD__SWAPPED(16);
TP_UINT_FIELD__SWAPPED(32);
TP_UINT_FIELD__SWAPPED(64);

static int tp_field__init_uint(struct tp_field *field,
			       struct format_field *format_field,
			       bool needs_swap)
{
	field->offset = format_field->offset;

	switch (format_field->size) {
	case 1:
		field->integer = tp_field__u8;
		break;
	case 2:
		field->integer = needs_swap ? tp_field__swapped_u16 : tp_field__u16;
		break;
	case 4:
		field->integer = needs_swap ? tp_field__swapped_u32 : tp_field__u32;
		break;
	case 8:
		field->integer = needs_swap ? tp_field__swapped_u64 : tp_field__u64;
		break;
	default:
		return -1;
	}

	return 0;
}

static void *tp_field__ptr(struct tp_field *field, struct perf_sample *sample)
{
	return sample->raw_data + field->offset;
}

static int tp_field__init_ptr(struct tp_field *field, struct format_field *format_field)
{
	field->offset = format_field->offset;
	field->pointer = tp_field__ptr;
	return 0;
}

struct syscall_tp {
	struct tp_field id;
	union {
		struct tp_field args, ret;
	};
};

static int perf_evsel__init_tp_uint_field(struct perf_evsel *evsel,
					  struct tp_field *field,
					  const char *name)
{
	struct format_field *format_field = perf_evsel__field(evsel, name);

	if (format_field == NULL)
		return -1;

	return tp_field__init_uint(field, format_field, evsel->needs_swap);
}

#define perf_evsel__init_sc_tp_uint_field(evsel, name) \
	({ struct syscall_tp *sc = evsel->priv;\
	   perf_evsel__init_tp_uint_field(evsel, &sc->name, #name); })

static int perf_evsel__init_tp_ptr_field(struct perf_evsel *evsel,
					 struct tp_field *field,
					 const char *name)
{
	struct format_field *format_field = perf_evsel__field(evsel, name);

	if (format_field == NULL)
		return -1;

	return tp_field__init_ptr(field, format_field);
}

#define perf_evsel__init_sc_tp_ptr_field(evsel, name) \
	({ struct syscall_tp *sc = evsel->priv;\
	   perf_evsel__init_tp_ptr_field(evsel, &sc->name, #name); })

static void perf_evsel__delete_priv(struct perf_evsel *evsel)
{
	zfree(&evsel->priv);
	perf_evsel__delete(evsel);
}

static int perf_evsel__init_syscall_tp(struct perf_evsel *evsel, void *handler)
{
	evsel->priv = malloc(sizeof(struct syscall_tp));
	if (evsel->priv != NULL) {
		if (perf_evsel__init_sc_tp_uint_field(evsel, id))
			goto out_delete;

		evsel->handler = handler;
		return 0;
	}

	return -ENOMEM;

out_delete:
	zfree(&evsel->priv);
	return -ENOENT;
}

static struct perf_evsel *perf_evsel__syscall_newtp(const char *direction, void *handler)
{
	struct perf_evsel *evsel = perf_evsel__newtp("raw_syscalls", direction);

	/* older kernels (e.g., RHEL6) use syscalls:{enter,exit} */
	if (IS_ERR(evsel))
		evsel = perf_evsel__newtp("syscalls", direction);

	if (IS_ERR(evsel))
		return NULL;

	if (perf_evsel__init_syscall_tp(evsel, handler))
		goto out_delete;

	return evsel;

out_delete:
	perf_evsel__delete_priv(evsel);
	return NULL;
}

#define perf_evsel__sc_tp_uint(evsel, name, sample) \
	({ struct syscall_tp *fields = evsel->priv; \
	   fields->name.integer(&fields->name, sample); })

#define perf_evsel__sc_tp_ptr(evsel, name, sample) \
	({ struct syscall_tp *fields = evsel->priv; \
	   fields->name.pointer(&fields->name, sample); })

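/*
 * Print the string for 'val' from the strarray, falling back to intfmt
 * when the value falls outside the table.
 */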
size_t strarray__scnprintf(struct strarray *sa, char *bf, size_t size, const char *intfmt, int val)
{
	int idx = val - sa->offset;

	if (idx < 0 || idx >= sa->nr_entries)
		return scnprintf(bf, size, intfmt, val);

	return scnprintf(bf, size, "%s", sa->entries[idx]);
}

static size_t __syscall_arg__scnprintf_strarray(char *bf, size_t size,
						const char *intfmt,
						struct syscall_arg *arg)
{
	return strarray__scnprintf(arg->parm, bf, size, intfmt, arg->val);
}

static size_t syscall_arg__scnprintf_strarray(char *bf, size_t size,
					      struct syscall_arg *arg)
{
	return __syscall_arg__scnprintf_strarray(bf, size, "%d", arg);
}

#define SCA_STRARRAY syscall_arg__scnprintf_strarray

struct strarrays {
	int		nr_entries;
	struct strarray **entries;
};

#define DEFINE_STRARRAYS(array) struct strarrays strarrays__##array = { \
	.nr_entries = ARRAY_SIZE(array), \
	.entries = array, \
}

size_t syscall_arg__scnprintf_strarrays(char *bf, size_t size,
					struct syscall_arg *arg)
{
	struct strarrays *sas = arg->parm;
	int i;

	for (i = 0; i < sas->nr_entries; ++i) {
		struct strarray *sa = sas->entries[i];
		int idx = arg->val - sa->offset;

		if (idx >= 0 && idx < sa->nr_entries) {
			if (sa->entries[idx] == NULL)
				break;
			return scnprintf(bf, size, "%s", sa->entries[idx]);
		}
	}

	return scnprintf(bf, size, "%d", arg->val);
}

#ifndef AT_FDCWD
#define AT_FDCWD	-100
#endif

static size_t syscall_arg__scnprintf_fd_at(char *bf, size_t size,
					   struct syscall_arg *arg)
{
	int fd = arg->val;

	if (fd == AT_FDCWD)
		return scnprintf(bf, size, "CWD");

	return syscall_arg__scnprintf_fd(bf, size, arg);
}

#define SCA_FDAT syscall_arg__scnprintf_fd_at

static size_t syscall_arg__scnprintf_close_fd(char *bf, size_t size,
					      struct syscall_arg *arg);

#define SCA_CLOSE_FD syscall_arg__scnprintf_close_fd

size_t syscall_arg__scnprintf_hex(char *bf, size_t size, struct syscall_arg *arg)
{
	return scnprintf(bf, size, "%#lx", arg->val);
}

size_t syscall_arg__scnprintf_int(char *bf, size_t size, struct syscall_arg *arg)
{
	return scnprintf(bf, size, "%d", arg->val);
}

size_t syscall_arg__scnprintf_long(char *bf, size_t size, struct syscall_arg *arg)
{
	return scnprintf(bf, size, "%ld", arg->val);
}

static const char *bpf_cmd[] = {
	"MAP_CREATE", "MAP_LOOKUP_ELEM", "MAP_UPDATE_ELEM", "MAP_DELETE_ELEM",
	"MAP_GET_NEXT_KEY", "PROG_LOAD",
};
static DEFINE_STRARRAY(bpf_cmd);

static const char *epoll_ctl_ops[] = { "ADD", "DEL", "MOD", };
static DEFINE_STRARRAY_OFFSET(epoll_ctl_ops, 1);

static const char *itimers[] = { "REAL", "VIRTUAL", "PROF", };
static DEFINE_STRARRAY(itimers);

static const char *keyctl_options[] = {
	"GET_KEYRING_ID", "JOIN_SESSION_KEYRING", "UPDATE", "REVOKE", "CHOWN",
	"SETPERM", "DESCRIBE", "CLEAR", "LINK", "UNLINK", "SEARCH", "READ",
	"INSTANTIATE", "NEGATE", "SET_REQKEY_KEYRING", "SET_TIMEOUT",
	"ASSUME_AUTHORITY", "GET_SECURITY", "SESSION_TO_PARENT", "REJECT",
	"INSTANTIATE_IOV", "INVALIDATE", "GET_PERSISTENT",
};
static DEFINE_STRARRAY(keyctl_options);

static const char *whences[] = { "SET", "CUR", "END",
#ifdef SEEK_DATA
"DATA",
#endif
#ifdef SEEK_HOLE
"HOLE",
#endif
};
static DEFINE_STRARRAY(whences);

static const char *fcntl_cmds[] = {
	"DUPFD", "GETFD", "SETFD", "GETFL", "SETFL", "GETLK", "SETLK",
	"SETLKW", "SETOWN", "GETOWN", "SETSIG", "GETSIG", "GETLK64",
	"SETLK64", "SETLKW64", "SETOWN_EX", "GETOWN_EX",
	"GETOWNER_UIDS",
};
static DEFINE_STRARRAY(fcntl_cmds);

static const char *fcntl_linux_specific_cmds[] = {
	"SETLEASE", "GETLEASE", "NOTIFY", [5] =	"CANCELLK", "DUPFD_CLOEXEC",
	"SETPIPE_SZ", "GETPIPE_SZ", "ADD_SEALS", "GET_SEALS",
	"GET_RW_HINT", "SET_RW_HINT", "GET_FILE_RW_HINT", "SET_FILE_RW_HINT",
};

static DEFINE_STRARRAY_OFFSET(fcntl_linux_specific_cmds, F_LINUX_SPECIFIC_BASE);

static struct strarray *fcntl_cmds_arrays[] = {
	&strarray__fcntl_cmds,
	&strarray__fcntl_linux_specific_cmds,
};

static DEFINE_STRARRAYS(fcntl_cmds_arrays);

static const char *rlimit_resources[] = {
	"CPU", "FSIZE", "DATA", "STACK", "CORE", "RSS", "NPROC", "NOFILE",
	"MEMLOCK", "AS", "LOCKS", "SIGPENDING", "MSGQUEUE", "NICE", "RTPRIO",
	"RTTIME",
};
static DEFINE_STRARRAY(rlimit_resources);

static const char *sighow[] = { "BLOCK", "UNBLOCK", "SETMASK", };
static DEFINE_STRARRAY(sighow);

static const char *clockid[] = {
	"REALTIME", "MONOTONIC", "PROCESS_CPUTIME_ID", "THREAD_CPUTIME_ID",
	"MONOTONIC_RAW", "REALTIME_COARSE", "MONOTONIC_COARSE", "BOOTTIME",
	"REALTIME_ALARM", "BOOTTIME_ALARM", "SGI_CYCLE", "TAI"
};
static DEFINE_STRARRAY(clockid);

static const char *socket_families[] = {
	"UNSPEC", "LOCAL", "INET", "AX25", "IPX", "APPLETALK", "NETROM",
	"BRIDGE", "ATMPVC", "X25", "INET6", "ROSE", "DECnet", "NETBEUI",
	"SECURITY", "KEY", "NETLINK", "PACKET", "ASH", "ECONET", "ATMSVC",
	"RDS", "SNA", "IRDA", "PPPOX", "WANPIPE", "LLC", "IB", "CAN", "TIPC",
	"BLUETOOTH", "IUCV", "RXRPC", "ISDN", "PHONET", "IEEE802154", "CAIF",
	"ALG", "NFC", "VSOCK",
};
static DEFINE_STRARRAY(socket_families);

static size_t syscall_arg__scnprintf_access_mode(char *bf, size_t size,
						 struct syscall_arg *arg)
{
	size_t printed = 0;
	int mode = arg->val;

	if (mode == F_OK) /* 0 */
		return scnprintf(bf, size, "F");
#define	P_MODE(n) \
	if (mode & n##_OK) { \
		printed += scnprintf(bf + printed, size - printed, "%s", #n); \
		mode &= ~n##_OK; \
	}

	P_MODE(R);
	P_MODE(W);
	P_MODE(X);
#undef P_MODE

	if (mode)
		printed += scnprintf(bf + printed, size - printed, "|%#x", mode);

	return printed;
}

#define SCA_ACCMODE syscall_arg__scnprintf_access_mode

static size_t syscall_arg__scnprintf_filename(char *bf, size_t size,
					      struct syscall_arg *arg);

#define SCA_FILENAME syscall_arg__scnprintf_filename

static size_t syscall_arg__scnprintf_pipe_flags(char *bf, size_t size,
						struct syscall_arg *arg)
{
	int printed = 0, flags = arg->val;

#define	P_FLAG(n) \
	if (flags & O_##n) { \
		printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
		flags &= ~O_##n; \
	}

	P_FLAG(CLOEXEC);
	P_FLAG(NONBLOCK);
#undef P_FLAG

	if (flags)
		printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);

	return printed;
}

#define SCA_PIPE_FLAGS syscall_arg__scnprintf_pipe_flags

#ifndef GRND_NONBLOCK
#define GRND_NONBLOCK	0x0001
#endif
#ifndef GRND_RANDOM
#define GRND_RANDOM	0x0002
#endif

static size_t syscall_arg__scnprintf_getrandom_flags(char *bf, size_t size,
						   struct syscall_arg *arg)
{
	int printed = 0, flags = arg->val;

#define	P_FLAG(n) \
	if (flags & GRND_##n) { \
		printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
		flags &= ~GRND_##n; \
	}

	P_FLAG(RANDOM);
	P_FLAG(NONBLOCK);
#undef P_FLAG

	if (flags)
		printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);

	return printed;
}

#define SCA_GETRANDOM_FLAGS syscall_arg__scnprintf_getrandom_flags

#define STRARRAY(name, array) \
	  { .scnprintf	= SCA_STRARRAY, \
	    .parm	= &strarray__##array, }

#include "trace/beauty/arch_errno_names.c"
#include "trace/beauty/eventfd.c"
#include "trace/beauty/futex_op.c"
#include "trace/beauty/futex_val3.c"
#include "trace/beauty/mmap.c"
#include "trace/beauty/mode_t.c"
#include "trace/beauty/msg_flags.c"
#include "trace/beauty/open_flags.c"
#include "trace/beauty/perf_event_open.c"
#include "trace/beauty/pid.c"
#include "trace/beauty/sched_policy.c"
#include "trace/beauty/seccomp.c"
#include "trace/beauty/signum.c"
#include "trace/beauty/socket_type.c"
#include "trace/beauty/waitid_options.c"

struct syscall_arg_fmt {
	size_t	   (*scnprintf)(char *bf, size_t size, struct syscall_arg *arg);
	void	   *parm;
	const char *name;
	bool	   show_zero;
};

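/*
 * The entries below are looked up with bsearch() in syscall_fmt__find(),
 * so this table must be kept sorted by syscall name.
 */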
static struct syscall_fmt {
	const char *name;
	const char *alias;
	struct syscall_arg_fmt arg[6];
	u8	   nr_args;
	bool	   errpid;
	bool	   timeout;
	bool	   hexret;
} syscall_fmts[] = {
	{ .name	    = "access",
	  .arg = { [1] = { .scnprintf = SCA_ACCMODE,  /* mode */ }, }, },
	{ .name	    = "bpf",
	  .arg = { [0] = STRARRAY(cmd, bpf_cmd), }, },
	{ .name	    = "brk",	    .hexret = true,
	  .arg = { [0] = { .scnprintf = SCA_HEX, /* brk */ }, }, },
	{ .name     = "clock_gettime",
	  .arg = { [0] = STRARRAY(clk_id, clockid), }, },
	{ .name	    = "clone",	    .errpid = true, .nr_args = 5,
	  .arg = { [0] = { .name = "flags",	    .scnprintf = SCA_CLONE_FLAGS, },
		   [1] = { .name = "child_stack",   .scnprintf = SCA_HEX, },
		   [2] = { .name = "parent_tidptr", .scnprintf = SCA_HEX, },
		   [3] = { .name = "child_tidptr",  .scnprintf = SCA_HEX, },
		   [4] = { .name = "tls",	    .scnprintf = SCA_HEX, }, }, },
	{ .name	    = "close",
	  .arg = { [0] = { .scnprintf = SCA_CLOSE_FD, /* fd */ }, }, },
	{ .name	    = "epoll_ctl",
	  .arg = { [1] = STRARRAY(op, epoll_ctl_ops), }, },
	{ .name	    = "eventfd2",
	  .arg = { [1] = { .scnprintf = SCA_EFD_FLAGS, /* flags */ }, }, },
	{ .name	    = "fchmodat",
	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, },
	{ .name	    = "fchownat",
	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, },
	{ .name	    = "fcntl",
	  .arg = { [1] = { .scnprintf = SCA_FCNTL_CMD, /* cmd */
			   .parm      = &strarrays__fcntl_cmds_arrays,
			   .show_zero = true, },
		   [2] = { .scnprintf = SCA_FCNTL_ARG, /* arg */ }, }, },
	{ .name	    = "flock",
	  .arg = { [1] = { .scnprintf = SCA_FLOCK, /* cmd */ }, }, },
	{ .name	    = "fstat", .alias = "newfstat", },
	{ .name	    = "fstatat", .alias = "newfstatat", },
	{ .name	    = "futex",
	  .arg = { [1] = { .scnprintf = SCA_FUTEX_OP, /* op */ },
		   [5] = { .scnprintf = SCA_FUTEX_VAL3, /* val3 */ }, }, },
	{ .name	    = "futimesat",
	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, },
	{ .name	    = "getitimer",
	  .arg = { [0] = STRARRAY(which, itimers), }, },
	{ .name	    = "getpid",	    .errpid = true, },
	{ .name	    = "getpgid",    .errpid = true, },
	{ .name	    = "getppid",    .errpid = true, },
	{ .name	    = "getrandom",
	  .arg = { [2] = { .scnprintf = SCA_GETRANDOM_FLAGS, /* flags */ }, }, },
	{ .name	    = "getrlimit",
	  .arg = { [0] = STRARRAY(resource, rlimit_resources), }, },
	{ .name	    = "gettid",	    .errpid = true, },
	{ .name	    = "ioctl",
	  .arg = {
#if defined(__i386__) || defined(__x86_64__)
/*
 * FIXME: Make this available to all arches.
 */
		   [1] = { .scnprintf = SCA_IOCTL_CMD, /* cmd */ },
		   [2] = { .scnprintf = SCA_HEX, /* arg */ }, }, },
#else
		   [2] = { .scnprintf = SCA_HEX, /* arg */ }, }, },
#endif
	{ .name	    = "kcmp",	    .nr_args = 5,
	  .arg = { [0] = { .name = "pid1",	.scnprintf = SCA_PID, },
		   [1] = { .name = "pid2",	.scnprintf = SCA_PID, },
		   [2] = { .name = "type",	.scnprintf = SCA_KCMP_TYPE, },
		   [3] = { .name = "idx1",	.scnprintf = SCA_KCMP_IDX, },
		   [4] = { .name = "idx2",	.scnprintf = SCA_KCMP_IDX, }, }, },
	{ .name	    = "keyctl",
	  .arg = { [0] = STRARRAY(option, keyctl_options), }, },
	{ .name	    = "kill",
	  .arg = { [1] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
	{ .name	    = "linkat",
	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, },
	{ .name	    = "lseek",
	  .arg = { [2] = STRARRAY(whence, whences), }, },
	{ .name	    = "lstat", .alias = "newlstat", },
	{ .name     = "madvise",
	  .arg = { [0] = { .scnprintf = SCA_HEX,      /* start */ },
		   [2] = { .scnprintf = SCA_MADV_BHV, /* behavior */ }, }, },
	{ .name	    = "mkdirat",
	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, },
	{ .name	    = "mknodat",
	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, },
	{ .name	    = "mlock",
	  .arg = { [0] = { .scnprintf = SCA_HEX, /* addr */ }, }, },
	{ .name	    = "mlockall",
	  .arg = { [0] = { .scnprintf = SCA_HEX, /* addr */ }, }, },
	{ .name	    = "mmap",	    .hexret = true,
/* The standard mmap maps to old_mmap on s390x */
#if defined(__s390x__)
	.alias = "old_mmap",
#endif
	  .arg = { [0] = { .scnprintf = SCA_HEX,	/* addr */ },
		   [2] = { .scnprintf = SCA_MMAP_PROT,	/* prot */ },
		   [3] = { .scnprintf = SCA_MMAP_FLAGS,	/* flags */ }, }, },
	{ .name	    = "mprotect",
	  .arg = { [0] = { .scnprintf = SCA_HEX,	/* start */ },
		   [2] = { .scnprintf = SCA_MMAP_PROT,	/* prot */ }, }, },
	{ .name	    = "mq_unlink",
	  .arg = { [0] = { .scnprintf = SCA_FILENAME, /* u_name */ }, }, },
	{ .name	    = "mremap",	    .hexret = true,
	  .arg = { [0] = { .scnprintf = SCA_HEX,	  /* addr */ },
		   [3] = { .scnprintf = SCA_MREMAP_FLAGS, /* flags */ },
		   [4] = { .scnprintf = SCA_HEX,	  /* new_addr */ }, }, },
	{ .name	    = "munlock",
	  .arg = { [0] = { .scnprintf = SCA_HEX, /* addr */ }, }, },
	{ .name	    = "munmap",
	  .arg = { [0] = { .scnprintf = SCA_HEX, /* addr */ }, }, },
	{ .name	    = "name_to_handle_at",
	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, },
	{ .name	    = "newfstatat",
	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, },
	{ .name	    = "open",
	  .arg = { [1] = { .scnprintf = SCA_OPEN_FLAGS, /* flags */ }, }, },
	{ .name	    = "open_by_handle_at",
	  .arg = { [0] = { .scnprintf = SCA_FDAT,	/* dfd */ },
		   [2] = { .scnprintf = SCA_OPEN_FLAGS, /* flags */ }, }, },
	{ .name	    = "openat",
	  .arg = { [0] = { .scnprintf = SCA_FDAT,	/* dfd */ },
		   [2] = { .scnprintf = SCA_OPEN_FLAGS, /* flags */ }, }, },
	{ .name	    = "perf_event_open",
	  .arg = { [2] = { .scnprintf = SCA_INT,	/* cpu */ },
		   [3] = { .scnprintf = SCA_FD,		/* group_fd */ },
		   [4] = { .scnprintf = SCA_PERF_FLAGS, /* flags */ }, }, },
	{ .name	    = "pipe2",
	  .arg = { [1] = { .scnprintf = SCA_PIPE_FLAGS, /* flags */ }, }, },
	{ .name	    = "pkey_alloc",
	  .arg = { [1] = { .scnprintf = SCA_PKEY_ALLOC_ACCESS_RIGHTS,	/* access_rights */ }, }, },
	{ .name	    = "pkey_free",
	  .arg = { [0] = { .scnprintf = SCA_INT,	/* key */ }, }, },
	{ .name	    = "pkey_mprotect",
	  .arg = { [0] = { .scnprintf = SCA_HEX,	/* start */ },
		   [2] = { .scnprintf = SCA_MMAP_PROT,	/* prot */ },
		   [3] = { .scnprintf = SCA_INT,	/* pkey */ }, }, },
	{ .name	    = "poll", .timeout = true, },
	{ .name	    = "ppoll", .timeout = true, },
	{ .name	    = "prctl", .alias = "arch_prctl",
	  .arg = { [0] = { .scnprintf = SCA_PRCTL_OPTION, /* option */ },
		   [1] = { .scnprintf = SCA_PRCTL_ARG2, /* arg2 */ },
		   [2] = { .scnprintf = SCA_PRCTL_ARG3, /* arg3 */ }, }, },
	{ .name	    = "pread", .alias = "pread64", },
	{ .name	    = "preadv", .alias = "pread", },
	{ .name	    = "prlimit64",
	  .arg = { [1] = STRARRAY(resource, rlimit_resources), }, },
	{ .name	    = "pwrite", .alias = "pwrite64", },
	{ .name	    = "readlinkat",
	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, },
	{ .name	    = "recvfrom",
	  .arg = { [3] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, },
	{ .name	    = "recvmmsg",
	  .arg = { [3] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, },
	{ .name	    = "recvmsg",
	  .arg = { [2] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, },
	{ .name	    = "renameat",
	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, },
	{ .name	    = "rt_sigaction",
	  .arg = { [0] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
	{ .name	    = "rt_sigprocmask",
	  .arg = { [0] = STRARRAY(how, sighow), }, },
	{ .name	    = "rt_sigqueueinfo",
	  .arg = { [1] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
	{ .name	    = "rt_tgsigqueueinfo",
	  .arg = { [2] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
	{ .name	    = "sched_setscheduler",
	  .arg = { [1] = { .scnprintf = SCA_SCHED_POLICY, /* policy */ }, }, },
	{ .name	    = "seccomp",
	  .arg = { [0] = { .scnprintf = SCA_SECCOMP_OP,	   /* op */ },
		   [1] = { .scnprintf = SCA_SECCOMP_FLAGS, /* flags */ }, }, },
	{ .name	    = "select", .timeout = true, },
	{ .name	    = "sendmmsg",
	  .arg = { [3] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, },
	{ .name	    = "sendmsg",
	  .arg = { [2] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, },
	{ .name	    = "sendto",
	  .arg = { [3] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, },
	{ .name	    = "set_tid_address", .errpid = true, },
	{ .name	    = "setitimer",
	  .arg = { [0] = STRARRAY(which, itimers), }, },
	{ .name	    = "setrlimit",
	  .arg = { [0] = STRARRAY(resource, rlimit_resources), }, },
	{ .name	    = "socket",
	  .arg = { [0] = STRARRAY(family, socket_families),
		   [1] = { .scnprintf = SCA_SK_TYPE, /* type */ }, }, },
	{ .name	    = "socketpair",
	  .arg = { [0] = STRARRAY(family, socket_families),
		   [1] = { .scnprintf = SCA_SK_TYPE, /* type */ }, }, },
	{ .name	    = "stat", .alias = "newstat", },
	{ .name	    = "statx",
	  .arg = { [0] = { .scnprintf = SCA_FDAT,	 /* fdat */ },
		   [2] = { .scnprintf = SCA_STATX_FLAGS, /* flags */ },
		   [3] = { .scnprintf = SCA_STATX_MASK,	 /* mask */ }, }, },
	{ .name	    = "swapoff",
	  .arg = { [0] = { .scnprintf = SCA_FILENAME, /* specialfile */ }, }, },
	{ .name	    = "swapon",
	  .arg = { [0] = { .scnprintf = SCA_FILENAME, /* specialfile */ }, }, },
	{ .name	    = "symlinkat",
	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, },
	{ .name	    = "tgkill",
	  .arg = { [2] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
	{ .name	    = "tkill",
	  .arg = { [1] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
	{ .name	    = "uname", .alias = "newuname", },
	{ .name	    = "unlinkat",
	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, },
	{ .name	    = "utimensat",
	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* dirfd */ }, }, },
	{ .name	    = "wait4",	    .errpid = true,
	  .arg = { [2] = { .scnprintf = SCA_WAITID_OPTIONS, /* options */ }, }, },
	{ .name	    = "waitid",	    .errpid = true,
	  .arg = { [3] = { .scnprintf = SCA_WAITID_OPTIONS, /* options */ }, }, },
};

static int syscall_fmt__cmp(const void *name, const void *fmtp)
{
	const struct syscall_fmt *fmt = fmtp;
	return strcmp(name, fmt->name);
}

static struct syscall_fmt *syscall_fmt__find(const char *name)
{
	const int nmemb = ARRAY_SIZE(syscall_fmts);
	return bsearch(name, syscall_fmts, nmemb, sizeof(struct syscall_fmt), syscall_fmt__cmp);
}

struct syscall {
	struct event_format *tp_format;
	int		    nr_args;
	struct format_field *args;
	const char	    *name;
	bool		    is_exit;
	struct syscall_fmt  *fmt;
	struct syscall_arg_fmt *arg_fmt;
};

/*
 * We need to have this 'calculated' boolean because in some cases we really
 * don't know what the duration of a syscall is, for instance, when we start
 * a session and some threads are waiting for a syscall to finish, say 'poll',
 * in which case all we can do is to print "( ? )" for the duration and the
 * start timestamp.
 */
static size_t fprintf_duration(unsigned long t, bool calculated, FILE *fp)
{
	double duration = (double)t / NSEC_PER_MSEC;
	size_t printed = fprintf(fp, "(");

	if (!calculated)
		printed += fprintf(fp, "         ");
	else if (duration >= 1.0)
		printed += color_fprintf(fp, PERF_COLOR_RED, "%6.3f ms", duration);
	else if (duration >= 0.01)
		printed += color_fprintf(fp, PERF_COLOR_YELLOW, "%6.3f ms", duration);
	else
		printed += color_fprintf(fp, PERF_COLOR_NORMAL, "%6.3f ms", duration);
	return printed + fprintf(fp, "): ");
}

/**
 * filename.ptr: The filename char pointer that will be vfs_getname'd
 * filename.entry_str_pos: Where to insert the string translated from
 *                         filename.ptr by the vfs_getname tracepoint/kprobe.
 * ret_scnprintf: syscall args may set this to a different syscall return
 *                formatter, for instance, fcntl may return fds, file flags, etc.
 */
struct thread_trace {
	u64		  entry_time;
	bool		  entry_pending;
	unsigned long	  nr_events;
	unsigned long	  pfmaj, pfmin;
	char		  *entry_str;
	double		  runtime_ms;
	size_t		  (*ret_scnprintf)(char *bf, size_t size, struct syscall_arg *arg);
	struct {
		unsigned long ptr;
		short int     entry_str_pos;
		bool	      pending_open;
		unsigned int  namelen;
		char	      *name;
	} filename;
	struct {
		int	  max;
		char	  **table;
	} paths;

	struct intlist *syscall_stats;
};

static struct thread_trace *thread_trace__new(void)
{
	struct thread_trace *ttrace = zalloc(sizeof(struct thread_trace));

	/* Only touch the struct if zalloc() succeeded. */
	if (ttrace) {
		ttrace->paths.max = -1;
		ttrace->syscall_stats = intlist__new(NULL);
	}

	return ttrace;
}

static struct thread_trace *thread__trace(struct thread *thread, FILE *fp)
{
	struct thread_trace *ttrace;

	if (thread == NULL)
		goto fail;

	if (thread__priv(thread) == NULL)
		thread__set_priv(thread, thread_trace__new());

	if (thread__priv(thread) == NULL)
		goto fail;

	ttrace = thread__priv(thread);
	++ttrace->nr_events;

	return ttrace;
fail:
	color_fprintf(fp, PERF_COLOR_RED,
		      "WARNING: not enough memory, dropping samples!\n");
	return NULL;
}

void syscall_arg__set_ret_scnprintf(struct syscall_arg *arg,
				    size_t (*ret_scnprintf)(char *bf, size_t size, struct syscall_arg *arg))
{
	struct thread_trace *ttrace = thread__priv(arg->thread);

	ttrace->ret_scnprintf = ret_scnprintf;
}

#define TRACE_PFMAJ		(1 << 0)
#define TRACE_PFMIN		(1 << 1)

static const size_t trace__entry_str_size = 2048;

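/*
 * Grow the per-thread fd -> pathname table as needed and remember
 * 'pathname' for 'fd'.
 */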
static int trace__set_fd_pathname(struct thread *thread, int fd, const char *pathname)
{
	struct thread_trace *ttrace = thread__priv(thread);

	if (fd > ttrace->paths.max) {
		char **npath = realloc(ttrace->paths.table, (fd + 1) * sizeof(char *));

		if (npath == NULL)
			return -1;

		if (ttrace->paths.max != -1) {
			memset(npath + ttrace->paths.max + 1, 0,
			       (fd - ttrace->paths.max) * sizeof(char *));
		} else {
			memset(npath, 0, (fd + 1) * sizeof(char *));
		}

		ttrace->paths.table = npath;
		ttrace->paths.max   = fd;
	}

	ttrace->paths.table[fd] = strdup(pathname);

	return ttrace->paths.table[fd] != NULL ? 0 : -1;
}

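/* Read the path for 'fd' from its /proc/PID(/task/TID)/fd/FD symlink. */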
static int thread__read_fd_path(struct thread *thread, int fd)
{
	char linkname[PATH_MAX], pathname[PATH_MAX];
	struct stat st;
	int ret;

	if (thread->pid_ == thread->tid) {
		scnprintf(linkname, sizeof(linkname),
			  "/proc/%d/fd/%d", thread->pid_, fd);
	} else {
		scnprintf(linkname, sizeof(linkname),
			  "/proc/%d/task/%d/fd/%d", thread->pid_, thread->tid, fd);
	}

	if (lstat(linkname, &st) < 0 || st.st_size + 1 > (off_t)sizeof(pathname))
		return -1;

	ret = readlink(linkname, pathname, sizeof(pathname));

	if (ret < 0 || ret > st.st_size)
		return -1;

	pathname[ret] = '\0';
	return trace__set_fd_pathname(thread, fd, pathname);
}

static const char *thread__fd_path(struct thread *thread, int fd,
				   struct trace *trace)
{
	struct thread_trace *ttrace = thread__priv(thread);

	if (ttrace == NULL)
		return NULL;

	if (fd < 0)
		return NULL;

	if ((fd > ttrace->paths.max || ttrace->paths.table[fd] == NULL)) {
		if (!trace->live)
			return NULL;
		++trace->stats.proc_getname;
		if (thread__read_fd_path(thread, fd))
			return NULL;
	}

	return ttrace->paths.table[fd];
}

size_t syscall_arg__scnprintf_fd(char *bf, size_t size, struct syscall_arg *arg)
{
	int fd = arg->val;
	size_t printed = scnprintf(bf, size, "%d", fd);
	const char *path = thread__fd_path(arg->thread, fd, arg->trace);

	if (path)
		printed += scnprintf(bf + printed, size - printed, "<%s>", path);

	return printed;
}

size_t pid__scnprintf_fd(struct trace *trace, pid_t pid, int fd, char *bf, size_t size)
{
	size_t printed = scnprintf(bf, size, "%d", fd);
	struct thread *thread = machine__find_thread(trace->host, pid, pid);

	if (thread) {
		const char *path = thread__fd_path(thread, fd, trace);

		if (path)
			printed += scnprintf(bf + printed, size - printed, "<%s>", path);

		thread__put(thread);
	}

	return printed;
}

static size_t syscall_arg__scnprintf_close_fd(char *bf, size_t size,
					      struct syscall_arg *arg)
{
	int fd = arg->val;
	size_t printed = syscall_arg__scnprintf_fd(bf, size, arg);
	struct thread_trace *ttrace = thread__priv(arg->thread);

	if (ttrace && fd >= 0 && fd <= ttrace->paths.max)
		zfree(&ttrace->paths.table[fd]);

	return printed;
}

static void thread__set_filename_pos(struct thread *thread, const char *bf,
				     unsigned long ptr)
{
	struct thread_trace *ttrace = thread__priv(thread);

	ttrace->filename.ptr = ptr;
	ttrace->filename.entry_str_pos = bf - ttrace->entry_str;
}

static size_t syscall_arg__scnprintf_filename(char *bf, size_t size,
					      struct syscall_arg *arg)
{
	unsigned long ptr = arg->val;

	if (!arg->trace->vfs_getname)
		return scnprintf(bf, size, "%#lx", ptr);

	thread__set_filename_pos(arg->thread, bf, ptr);
	return 0;
}

static bool trace__filter_duration(struct trace *trace, double t)
{
	return t < (trace->duration_filter * NSEC_PER_MSEC);
}

static size_t __trace__fprintf_tstamp(struct trace *trace, u64 tstamp, FILE *fp)
{
	double ts = (double)(tstamp - trace->base_time) / NSEC_PER_MSEC;

	return fprintf(fp, "%10.3f ", ts);
}

/*
 * We're handling tstamp=0 as an undefined tstamp, i.e. like when we are
 * using ttrace->entry_time for a thread that receives a sys_exit without
 * first having received a sys_enter ("poll" issued before the tracing
 * session starts, or a sys_enter lost due to ring buffer overflow).
 */
static size_t trace__fprintf_tstamp(struct trace *trace, u64 tstamp, FILE *fp)
{
	if (tstamp > 0)
		return __trace__fprintf_tstamp(trace, tstamp, fp);

	return fprintf(fp, "         ? ");
}

static bool done = false;
static bool interrupted = false;

static void sig_handler(int sig)
{
	done = true;
	interrupted = sig == SIGINT;
}

static size_t trace__fprintf_entry_head(struct trace *trace, struct thread *thread,
					u64 duration, bool duration_calculated, u64 tstamp, FILE *fp)
{
	size_t printed = trace__fprintf_tstamp(trace, tstamp, fp);
	printed += fprintf_duration(duration, duration_calculated, fp);

	if (trace->multiple_threads) {
		if (trace->show_comm)
			printed += fprintf(fp, "%.14s/", thread__comm_str(thread));
		printed += fprintf(fp, "%d ", thread->tid);
	}

	return printed;
}

static int trace__process_event(struct trace *trace, struct machine *machine,
				union perf_event *event, struct perf_sample *sample)
{
	int ret = 0;

	switch (event->header.type) {
	case PERF_RECORD_LOST:
		color_fprintf(trace->output, PERF_COLOR_RED,
			      "LOST %" PRIu64 " events!\n", event->lost.lost);
		ret = machine__process_lost_event(machine, event, sample);
		break;
	default:
		ret = machine__process_event(machine, event, sample);
		break;
	}

	return ret;
}

static int trace__tool_process(struct perf_tool *tool,
			       union perf_event *event,
			       struct perf_sample *sample,
			       struct machine *machine)
{
	struct trace *trace = container_of(tool, struct trace, tool);
	return trace__process_event(trace, machine, event, sample);
}

static char *trace__machine__resolve_kernel_addr(void *vmachine, unsigned long long *addrp, char **modp)
{
	struct machine *machine = vmachine;

	if (machine->kptr_restrict_warned)
		return NULL;

	if (symbol_conf.kptr_restrict) {
		pr_warning("Kernel address maps (/proc/{kallsyms,modules}) are restricted.\n\n"
			   "Check /proc/sys/kernel/kptr_restrict.\n\n"
			   "Kernel samples will not be resolved.\n");
		machine->kptr_restrict_warned = true;
		return NULL;
	}

	return machine__resolve_kernel_addr(vmachine, addrp, modp);
}

static int trace__symbols_init(struct trace *trace, struct perf_evlist *evlist)
{
	int err = symbol__init(NULL);

	if (err)
		return err;

	trace->host = machine__new_host();
	if (trace->host == NULL)
		return -ENOMEM;

	err = trace_event__register_resolver(trace->host, trace__machine__resolve_kernel_addr);
	if (err < 0)
		goto out;

	err = __machine__synthesize_threads(trace->host, &trace->tool, &trace->opts.target,
					    evlist->threads, trace__tool_process, false,
					    trace->opts.proc_map_timeout, 1);
out:
	if (err)
		symbol__exit();

	return err;
}

static void trace__symbols__exit(struct trace *trace)
{
	machine__exit(trace->host);
	trace->host = NULL;

	symbol__exit();
}

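/*
 * Allocate the per-argument formatter array for 'sc', seeding it from the
 * hand-written syscall_fmts entry, when there is one.
 */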
static int syscall__alloc_arg_fmts(struct syscall *sc, int nr_args)
{
	int idx;

	if (nr_args == 6 && sc->fmt && sc->fmt->nr_args != 0)
		nr_args = sc->fmt->nr_args;

	sc->arg_fmt = calloc(nr_args, sizeof(*sc->arg_fmt));
	if (sc->arg_fmt == NULL)
		return -1;

	for (idx = 0; idx < nr_args; ++idx) {
		if (sc->fmt)
			sc->arg_fmt[idx] = sc->fmt->arg[idx];
	}

	sc->nr_args = nr_args;
	return 0;
}

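/*
 * Pick default formatters for the args that don't have a hand-written one,
 * keying off the C type and field name in the tracepoint format: filenames,
 * pointers, pids, modes and fd-like integers.
 */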
static int syscall__set_arg_fmts(struct syscall *sc)
{
	struct format_field *field;
	int idx = 0, len;

	for (field = sc->args; field; field = field->next, ++idx) {
		if (sc->fmt && sc->fmt->arg[idx].scnprintf)
			continue;

		if (strcmp(field->type, "const char *") == 0 &&
			 (strcmp(field->name, "filename") == 0 ||
			  strcmp(field->name, "path") == 0 ||
			  strcmp(field->name, "pathname") == 0))
			sc->arg_fmt[idx].scnprintf = SCA_FILENAME;
		else if (field->flags & FIELD_IS_POINTER)
			sc->arg_fmt[idx].scnprintf = syscall_arg__scnprintf_hex;
		else if (strcmp(field->type, "pid_t") == 0)
			sc->arg_fmt[idx].scnprintf = SCA_PID;
		else if (strcmp(field->type, "umode_t") == 0)
			sc->arg_fmt[idx].scnprintf = SCA_MODE_T;
		else if ((strcmp(field->type, "int") == 0 ||
			  strcmp(field->type, "unsigned int") == 0 ||
			  strcmp(field->type, "long") == 0) &&
			 (len = strlen(field->name)) >= 2 &&
			 strcmp(field->name + len - 2, "fd") == 0) {
			/*
			 * /sys/kernel/tracing/events/syscalls/sys_enter*
			 * egrep 'field:.*fd;' .../format|sed -r 's/.*field:([a-z ]+) [a-z_]*fd.+/\1/g'|sort|uniq -c
			 * 65 int
			 * 23 unsigned int
			 * 7 unsigned long
			 */
			sc->arg_fmt[idx].scnprintf = SCA_FD;
		}
	}

	return 0;
}

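/*
 * Lazily grow trace->syscalls.table as needed and fill in the entry for
 * 'id': its name, a hand-written format, if any, and the sys_enter
 * tracepoint format.
 */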
static int trace__read_syscall_info(struct trace *trace, int id)
{
	char tp_name[128];
	struct syscall *sc;
	const char *name = syscalltbl__name(trace->sctbl, id);

	if (name == NULL)
		return -1;

	if (id > trace->syscalls.max) {
		struct syscall *nsyscalls = realloc(trace->syscalls.table, (id + 1) * sizeof(*sc));

		if (nsyscalls == NULL)
			return -1;

		if (trace->syscalls.max != -1) {
			memset(nsyscalls + trace->syscalls.max + 1, 0,
			       (id - trace->syscalls.max) * sizeof(*sc));
		} else {
			memset(nsyscalls, 0, (id + 1) * sizeof(*sc));
		}

		trace->syscalls.table = nsyscalls;
		trace->syscalls.max   = id;
	}

	sc = trace->syscalls.table + id;
	sc->name = name;

	sc->fmt  = syscall_fmt__find(sc->name);

	snprintf(tp_name, sizeof(tp_name), "sys_enter_%s", sc->name);
	sc->tp_format = trace_event__tp_format("syscalls", tp_name);

	if (IS_ERR(sc->tp_format) && sc->fmt && sc->fmt->alias) {
		snprintf(tp_name, sizeof(tp_name), "sys_enter_%s", sc->fmt->alias);
		sc->tp_format = trace_event__tp_format("syscalls", tp_name);
	}

	if (syscall__alloc_arg_fmts(sc, IS_ERR(sc->tp_format) ? 6 : sc->tp_format->format.nr_fields))
		return -1;

	if (IS_ERR(sc->tp_format))
		return -1;

	sc->args = sc->tp_format->format.fields;
	/*
	 * We need to check and discard the first field, '__syscall_nr' or
	 * 'nr', which holds the syscall number and is needless here; note
	 * that older kernels name it 'nr'.
	 */
	if (sc->args && (!strcmp(sc->args->name, "__syscall_nr") || !strcmp(sc->args->name, "nr"))) {
		sc->args = sc->args->next;
		--sc->nr_args;
	}

	sc->is_exit = !strcmp(name, "exit_group") || !strcmp(name, "exit");

	return syscall__set_arg_fmts(sc);
}

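/*
 * Translate the event qualifier list of syscall names, possibly containing
 * globs, into an array of syscall ids, complaining about any invalid ones.
 */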
static int trace__validate_ev_qualifier(struct trace *trace)
{
	int err = 0, i;
	size_t nr_allocated;
	struct str_node *pos;

	trace->ev_qualifier_ids.nr = strlist__nr_entries(trace->ev_qualifier);
	trace->ev_qualifier_ids.entries = malloc(trace->ev_qualifier_ids.nr *
						 sizeof(trace->ev_qualifier_ids.entries[0]));

	if (trace->ev_qualifier_ids.entries == NULL) {
		fputs("Error:\tNot enough memory for allocating events qualifier ids\n",
		       trace->output);
		err = -EINVAL;
		goto out;
	}

	nr_allocated = trace->ev_qualifier_ids.nr;
	i = 0;

	strlist__for_each_entry(pos, trace->ev_qualifier) {
		const char *sc = pos->s;
		int id = syscalltbl__id(trace->sctbl, sc), match_next = -1;

		if (id < 0) {
			id = syscalltbl__strglobmatch_first(trace->sctbl, sc, &match_next);
			if (id >= 0)
				goto matches;

			if (err == 0) {
				fputs("Error:\tInvalid syscall ", trace->output);
				err = -EINVAL;
			} else {
				fputs(", ", trace->output);
			}

			fputs(sc, trace->output);
		}
matches:
		trace->ev_qualifier_ids.entries[i++] = id;
		if (match_next == -1)
			continue;

		while (1) {
			id = syscalltbl__strglobmatch_next(trace->sctbl, sc, &match_next);
			if (id < 0)
				break;
			if (nr_allocated == trace->ev_qualifier_ids.nr) {
				void *entries;

				nr_allocated += 8;
				entries = realloc(trace->ev_qualifier_ids.entries,
						  nr_allocated * sizeof(trace->ev_qualifier_ids.entries[0]));
				if (entries == NULL) {
					err = -ENOMEM;
					fputs("\nError:\t Not enough memory for parsing\n", trace->output);
					goto out_free;
				}
				trace->ev_qualifier_ids.entries = entries;
			}
			trace->ev_qualifier_ids.nr++;
			trace->ev_qualifier_ids.entries[i++] = id;
		}
	}

	if (err < 0) {
		fputs("\nHint:\ttry 'perf list syscalls:sys_enter_*'"
		      "\nHint:\tand: 'man syscalls'\n", trace->output);
out_free:
		zfree(&trace->ev_qualifier_ids.entries);
		trace->ev_qualifier_ids.nr = 0;
	}
out:
	return err;
}

/*
 * args is to be interpreted as a series of longs but we need to handle
 * 8-byte unaligned accesses. args points to raw_data within the event
 * and raw_data is guaranteed to be 8-byte unaligned because it is
 * preceded by raw_size which is a u32. So we need to copy args to a temp
 * variable to read it. Most notably this avoids extended load instructions
 * on unaligned addresses.
 */
unsigned long syscall_arg__val(struct syscall_arg *arg, u8 idx)
{
	unsigned long val;
	unsigned char *p = arg->args + sizeof(unsigned long) * idx;

	memcpy(&val, p, sizeof(val));
	return val;
}

static size_t syscall__scnprintf_name(struct syscall *sc, char *bf, size_t size,
				      struct syscall_arg *arg)
{
	if (sc->arg_fmt && sc->arg_fmt[arg->idx].name)
		return scnprintf(bf, size, "%s: ", sc->arg_fmt[arg->idx].name);

	return scnprintf(bf, size, "arg%d: ", arg->idx);
}

static size_t syscall__scnprintf_val(struct syscall *sc, char *bf, size_t size,
				     struct syscall_arg *arg, unsigned long val)
{
	if (sc->arg_fmt && sc->arg_fmt[arg->idx].scnprintf) {
		arg->val = val;
		if (sc->arg_fmt[arg->idx].parm)
			arg->parm = sc->arg_fmt[arg->idx].parm;
		return sc->arg_fmt[arg->idx].scnprintf(bf, size, arg);
	}
	return scnprintf(bf, size, "%ld", val);
}

static size_t syscall__scnprintf_args(struct syscall *sc, char *bf, size_t size,
				      unsigned char *args, struct trace *trace,
				      struct thread *thread)
{
	size_t printed = 0;
	unsigned long val;
	u8 bit = 1;
	struct syscall_arg arg = {
		.args	= args,
		.idx	= 0,
		.mask	= 0,
		.trace  = trace,
		.thread = thread,
	};
	struct thread_trace *ttrace = thread__priv(thread);

	/*
	 * Things like fcntl will set this in its 'cmd' formatter to pick the
	 * right formatter for the return value (an fd? file flags?), which is
	 * not needed for syscalls that always return a given type, say an fd.
	 */
	ttrace->ret_scnprintf = NULL;

	if (sc->args != NULL) {
		struct format_field *field;

		for (field = sc->args; field;
		     field = field->next, ++arg.idx, bit <<= 1) {
			if (arg.mask & bit)
				continue;

			val = syscall_arg__val(&arg, arg.idx);

			/*
			 * Suppress this argument if its value is zero and
			 * we don't have a string associated in a strarray
			 * for it.
			 */
			if (val == 0 &&
			    !(sc->arg_fmt &&
			      (sc->arg_fmt[arg.idx].show_zero ||
			       sc->arg_fmt[arg.idx].scnprintf == SCA_STRARRAY ||
			       sc->arg_fmt[arg.idx].scnprintf == SCA_STRARRAYS) &&
			      sc->arg_fmt[arg.idx].parm))
				continue;

			printed += scnprintf(bf + printed, size - printed,
					     "%s%s: ", printed ? ", " : "", field->name);
			printed += syscall__scnprintf_val(sc, bf + printed, size - printed, &arg, val);
		}
	} else if (IS_ERR(sc->tp_format)) {
		/*
		 * If we managed to read the tracepoint /format file, then we
		 * may end up not having any args, like with gettid(), so only
		 * print the raw args when we didn't manage to read it.
		 */
		while (arg.idx < sc->nr_args) {
			if (arg.mask & bit)
				goto next_arg;
			val = syscall_arg__val(&arg, arg.idx);
			if (printed)
				printed += scnprintf(bf + printed, size - printed, ", ");
			printed += syscall__scnprintf_name(sc, bf + printed, size - printed, &arg);
			printed += syscall__scnprintf_val(sc, bf + printed, size - printed, &arg, val);
next_arg:
			++arg.idx;
			bit <<= 1;
		}
	}

	return printed;
}

typedef int (*tracepoint_handler)(struct trace *trace, struct perf_evsel *evsel,
				  union perf_event *event,
				  struct perf_sample *sample);

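/*
 * Return the syscall table entry for 'id', reading its info on first use,
 * or NULL when the id is invalid or its info can't be read.
 */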
static struct syscall *trace__syscall_info(struct trace *trace,
					   struct perf_evsel *evsel, int id)
{
	if (id < 0) {
		/*
		 * XXX: Noticed on x86_64, reproduced as far back as 3.0.36, haven't tried
		 * before that, leaving at a higher verbosity level till that is
		 * explained. Reproduced with plain ftrace with:
		 *
		 * echo 1 > /t/events/raw_syscalls/sys_exit/enable
		 * grep "NR -1 " /t/trace_pipe
		 *
		 * After generating some load on the machine.
		 */
		if (verbose > 1) {
			static u64 n;
			fprintf(trace->output, "Invalid syscall %d id, skipping (%s, %" PRIu64 ") ...\n",
				id, perf_evsel__name(evsel), ++n);
		}
		return NULL;
	}

	if ((id > trace->syscalls.max || trace->syscalls.table[id].name == NULL) &&
	    trace__read_syscall_info(trace, id))
		goto out_cant_read;

	if ((id > trace->syscalls.max || trace->syscalls.table[id].name == NULL))
		goto out_cant_read;

	return &trace->syscalls.table[id];

out_cant_read:
	if (verbose > 0) {
		fprintf(trace->output, "Problems reading syscall %d", id);
		if (id <= trace->syscalls.max && trace->syscalls.table[id].name != NULL)
			fprintf(trace->output, "(%s)", trace->syscalls.table[id].name);
		fputs(" information\n", trace->output);
	}
	return NULL;
}

static void thread__update_stats(struct thread_trace *ttrace,
				 int id, struct perf_sample *sample)
{
	struct int_node *inode;
	struct stats *stats;
	u64 duration = 0;

	inode = intlist__findnew(ttrace->syscall_stats, id);
	if (inode == NULL)
		return;

	stats = inode->priv;
	if (stats == NULL) {
		stats = malloc(sizeof(struct stats));
		if (stats == NULL)
			return;
		init_stats(stats);
		inode->priv = stats;
	}

	if (ttrace->entry_time && sample->time > ttrace->entry_time)
		duration = sample->time - ttrace->entry_time;

	update_stats(stats, duration);
}

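/*
 * The current thread's syscall entry is still pending (no sys_exit seen
 * yet) when something else has to be printed: finish the pending line with
 * ") ..." so the two don't get mixed up.
 */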
static int trace__printf_interrupted_entry(struct trace *trace)
{
	struct thread_trace *ttrace;
	size_t printed;

	if (trace->failure_only || trace->current == NULL)
		return 0;

	ttrace = thread__priv(trace->current);

	if (!ttrace->entry_pending)
		return 0;

	printed  = trace__fprintf_entry_head(trace, trace->current, 0, false, ttrace->entry_time, trace->output);
	printed += fprintf(trace->output, "%-70s) ...\n", ttrace->entry_str);
	ttrace->entry_pending = false;

	return printed;
}

static int trace__fprintf_sample(struct trace *trace, struct perf_evsel *evsel,
				 struct perf_sample *sample, struct thread *thread)
{
	int printed = 0;

	if (trace->print_sample) {
		double ts = (double)sample->time / NSEC_PER_MSEC;

		printed += fprintf(trace->output, "%22s %10.3f %s %d/%d [%d]\n",
				   perf_evsel__name(evsel), ts,
				   thread__comm_str(thread),
				   sample->pid, sample->tid, sample->cpu);
	}

	return printed;
}

static int trace__sys_enter(struct trace *trace, struct perf_evsel *evsel,
			    union perf_event *event __maybe_unused,
			    struct perf_sample *sample)
{
	char *msg;
	void *args;
	size_t printed = 0;
	struct thread *thread;
	int id = perf_evsel__sc_tp_uint(evsel, id, sample), err = -1;
	struct syscall *sc = trace__syscall_info(trace, evsel, id);
	struct thread_trace *ttrace;

	if (sc == NULL)
		return -1;

	thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
	ttrace = thread__trace(thread, trace->output);
	if (ttrace == NULL)
		goto out_put;

	trace__fprintf_sample(trace, evsel, sample, thread);

	args = perf_evsel__sc_tp_ptr(evsel, args, sample);

	if (ttrace->entry_str == NULL) {
		ttrace->entry_str = malloc(trace__entry_str_size);
		if (!ttrace->entry_str)
			goto out_put;
	}

	if (!(trace->duration_filter || trace->summary_only || trace->min_stack))
		trace__printf_interrupted_entry(trace);

	ttrace->entry_time = sample->time;
	msg = ttrace->entry_str;
	printed += scnprintf(msg + printed, trace__entry_str_size - printed, "%s(", sc->name);

	printed += syscall__scnprintf_args(sc, msg + printed, trace__entry_str_size - printed,
					   args, trace, thread);

	if (sc->is_exit) {
		if (!(trace->duration_filter || trace->summary_only || trace->failure_only || trace->min_stack)) {
			trace__fprintf_entry_head(trace, thread, 0, false, ttrace->entry_time, trace->output);
			fprintf(trace->output, "%-70s)\n", ttrace->entry_str);
		}
	} else {
		ttrace->entry_pending = true;
		/* See trace__vfs_getname & trace__sys_exit */
		ttrace->filename.pending_open = false;
	}

	if (trace->current != thread) {
		thread__put(trace->current);
		trace->current = thread__get(thread);
	}
	err = 0;
out_put:
	thread__put(thread);
	return err;
}

static int trace__resolve_callchain(struct trace *trace, struct perf_evsel *evsel,
				    struct perf_sample *sample,
				    struct callchain_cursor *cursor)
{
	struct addr_location al;
	int max_stack = evsel->attr.sample_max_stack ?
			evsel->attr.sample_max_stack :
			trace->max_stack;

	if (machine__resolve(trace->host, &al, sample) < 0 ||
	    thread__resolve_callchain(al.thread, cursor, evsel, sample, NULL, NULL, max_stack))
		return -1;

	return 0;
}

static int trace__fprintf_callchain(struct trace *trace, struct perf_sample *sample)
{
	/* TODO: user-configurable print_opts */
	const unsigned int print_opts = EVSEL__PRINT_SYM |
				        EVSEL__PRINT_DSO |
				        EVSEL__PRINT_UNKNOWN_AS_ADDR;

	return sample__fprintf_callchain(sample, 38, print_opts, &callchain_cursor, trace->output);
}

static const char *errno_to_name(struct perf_evsel *evsel, int err)
{
	struct perf_env *env = perf_evsel__env(evsel);
	const char *arch_name = perf_env__arch(env);

	return arch_syscalls__strerrno(arch_name, err);
}

1696 static int trace__sys_exit(struct trace *trace, struct perf_evsel *evsel,
1697 			   union perf_event *event __maybe_unused,
1698 			   struct perf_sample *sample)
1699 {
1700 	long ret;
1701 	u64 duration = 0;
1702 	bool duration_calculated = false;
1703 	struct thread *thread;
1704 	int id = perf_evsel__sc_tp_uint(evsel, id, sample), err = -1, callchain_ret = 0;
1705 	struct syscall *sc = trace__syscall_info(trace, evsel, id);
1706 	struct thread_trace *ttrace;
1707 
1708 	if (sc == NULL)
1709 		return -1;
1710 
1711 	thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
1712 	ttrace = thread__trace(thread, trace->output);
1713 	if (ttrace == NULL)
1714 		goto out_put;
1715 
1716 	trace__fprintf_sample(trace, evsel, sample, thread);
1717 
1718 	if (trace->summary)
1719 		thread__update_stats(ttrace, id, sample);
1720 
1721 	ret = perf_evsel__sc_tp_uint(evsel, ret, sample);
1722 
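	/*
	 * An "open" that returned a file descriptor: associate it with the
	 * pathname collected via probe:vfs_getname while the syscall was in
	 * flight, so later syscalls on this fd can be beautified.
	 */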
1723 	if (id == trace->open_id && ret >= 0 && ttrace->filename.pending_open) {
1724 		trace__set_fd_pathname(thread, ret, ttrace->filename.name);
1725 		ttrace->filename.pending_open = false;
1726 		++trace->stats.vfs_getname;
1727 	}
1728 
1729 	if (ttrace->entry_time) {
1730 		duration = sample->time - ttrace->entry_time;
1731 		if (trace__filter_duration(trace, duration))
1732 			goto out;
1733 		duration_calculated = true;
1734 	} else if (trace->duration_filter)
1735 		goto out;
1736 
1737 	if (sample->callchain) {
1738 		callchain_ret = trace__resolve_callchain(trace, evsel, sample, &callchain_cursor);
1739 		if (callchain_ret == 0) {
1740 			if (callchain_cursor.nr < trace->min_stack)
1741 				goto out;
1742 			callchain_ret = 1;
1743 		}
1744 	}
1745 
1746 	if (trace->summary_only || (ret >= 0 && trace->failure_only))
1747 		goto out;
1748 
1749 	trace__fprintf_entry_head(trace, thread, duration, duration_calculated, ttrace->entry_time, trace->output);
1750 
1751 	if (ttrace->entry_pending) {
1752 		fprintf(trace->output, "%-70s", ttrace->entry_str);
1753 	} else {
1754 		fprintf(trace->output, " ... [");
1755 		color_fprintf(trace->output, PERF_COLOR_YELLOW, "continued");
1756 		fprintf(trace->output, "]: %s()", sc->name);
1757 	}
1758 
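	/*
	 * Return value pretty printing: the plain (sc->fmt == NULL) and the
	 * formatted cases share the signed_print/errno_print labels, hence
	 * the gotos criss-crossing the ladder below.
	 */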
1759 	if (sc->fmt == NULL) {
1760 		if (ret < 0)
1761 			goto errno_print;
1762 signed_print:
1763 		fprintf(trace->output, ") = %ld", ret);
1764 	} else if (ret < 0) {
1765 errno_print: {
1766 		char bf[STRERR_BUFSIZE];
1767 		const char *emsg = str_error_r(-ret, bf, sizeof(bf)),
1768 			   *e = errno_to_name(evsel, -ret);
1769 
1770 		fprintf(trace->output, ") = -1 %s %s", e, emsg);
1771 	}
1772 	} else if (ret == 0 && sc->fmt->timeout)
1773 		fprintf(trace->output, ") = 0 Timeout");
1774 	else if (ttrace->ret_scnprintf) {
1775 		char bf[1024];
1776 		struct syscall_arg arg = {
1777 			.val	= ret,
1778 			.thread	= thread,
1779 			.trace	= trace,
1780 		};
1781 		ttrace->ret_scnprintf(bf, sizeof(bf), &arg);
1782 		ttrace->ret_scnprintf = NULL;
1783 		fprintf(trace->output, ") = %s", bf);
1784 	} else if (sc->fmt->hexret)
1785 		fprintf(trace->output, ") = %#lx", ret);
1786 	else if (sc->fmt->errpid) {
1787 		struct thread *child = machine__find_thread(trace->host, ret, ret);
1788 
1789 		if (child != NULL) {
1790 			fprintf(trace->output, ") = %ld", ret);
1791 			if (child->comm_set)
1792 				fprintf(trace->output, " (%s)", thread__comm_str(child));
1793 			thread__put(child);
1794 		}
1795 	} else
1796 		goto signed_print;
1797 
1798 	fputc('\n', trace->output);
1799 
1800 	if (callchain_ret > 0)
1801 		trace__fprintf_callchain(trace, sample);
1802 	else if (callchain_ret < 0)
1803 		pr_err("Problem processing %s callchain, skipping...\n", perf_evsel__name(evsel));
1804 out:
1805 	ttrace->entry_pending = false;
1806 	err = 0;
1807 out_put:
1808 	thread__put(thread);
1809 	return err;
1810 }
1811 
1812 static int trace__vfs_getname(struct trace *trace, struct perf_evsel *evsel,
1813 			      union perf_event *event __maybe_unused,
1814 			      struct perf_sample *sample)
1815 {
1816 	struct thread *thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
1817 	struct thread_trace *ttrace;
1818 	size_t filename_len, entry_str_len, to_move;
1819 	ssize_t remaining_space;
1820 	char *pos;
1821 	const char *filename = perf_evsel__rawptr(evsel, sample, "pathname");
1822 
1823 	if (!thread)
1824 		goto out;
1825 
1826 	ttrace = thread__priv(thread);
1827 	if (!ttrace)
1828 		goto out_put;
1829 
1830 	filename_len = strlen(filename);
1831 	if (filename_len == 0)
1832 		goto out_put;
1833 
1834 	if (ttrace->filename.namelen < filename_len) {
1835 		char *f = realloc(ttrace->filename.name, filename_len + 1);
1836 
1837 		if (f == NULL)
1838 			goto out_put;
1839 
1840 		ttrace->filename.namelen = filename_len;
1841 		ttrace->filename.name = f;
1842 	}
1843 
1844 	strcpy(ttrace->filename.name, filename);
1845 	ttrace->filename.pending_open = true;
1846 
1847 	if (!ttrace->filename.ptr)
1848 		goto out_put;
1849 
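	/*
	 * sys_enter left a placeholder for this pathname pointer argument in
	 * entry_str (see filename.ptr/filename.entry_str_pos): splice the
	 * name we just got into that spot, truncating its start if needed.
	 */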
1850 	entry_str_len = strlen(ttrace->entry_str);
1851 	remaining_space = trace__entry_str_size - entry_str_len - 1; /* \0 */
1852 	if (remaining_space <= 0)
1853 		goto out_put;
1854 
1855 	if (filename_len > (size_t)remaining_space) {
1856 		filename += filename_len - remaining_space;
1857 		filename_len = remaining_space;
1858 	}
1859 
1860 	to_move = entry_str_len - ttrace->filename.entry_str_pos + 1; /* \0 */
1861 	pos = ttrace->entry_str + ttrace->filename.entry_str_pos;
1862 	memmove(pos + filename_len, pos, to_move);
1863 	memcpy(pos, filename, filename_len);
1864 
1865 	ttrace->filename.ptr = 0;
1866 	ttrace->filename.entry_str_pos = 0;
1867 out_put:
1868 	thread__put(thread);
1869 out:
1870 	return 0;
1871 }
1872 
1873 static int trace__sched_stat_runtime(struct trace *trace, struct perf_evsel *evsel,
1874 				     union perf_event *event __maybe_unused,
1875 				     struct perf_sample *sample)
1876 {
1877 	u64 runtime = perf_evsel__intval(evsel, sample, "runtime");
1878 	double runtime_ms = (double)runtime / NSEC_PER_MSEC;
1879 	struct thread *thread = machine__findnew_thread(trace->host,
1880 							sample->pid,
1881 							sample->tid);
1882 	struct thread_trace *ttrace = thread__trace(thread, trace->output);
1883 
1884 	if (ttrace == NULL)
1885 		goto out_dump;
1886 
1887 	ttrace->runtime_ms += runtime_ms;
1888 	trace->runtime_ms += runtime_ms;
1889 out_put:
1890 	thread__put(thread);
1891 	return 0;
1892 
1893 out_dump:
1894 	fprintf(trace->output, "%s: comm=%s,pid=%u,runtime=%" PRIu64 ",vruntime=%" PRIu64 "\n",
1895 	       evsel->name,
1896 	       perf_evsel__strval(evsel, sample, "comm"),
1897 	       (pid_t)perf_evsel__intval(evsel, sample, "pid"),
1898 	       runtime,
1899 	       perf_evsel__intval(evsel, sample, "vruntime"));
1900 	goto out_put;
1901 }
1902 
1903 static int bpf_output__printer(enum binary_printer_ops op,
1904 			       unsigned int val, void *extra __maybe_unused, FILE *fp)
1905 {
1906 	unsigned char ch = (unsigned char)val;
1907 
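	/* Emit just the payload characters, suppress the hexdump decoration */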
1908 	switch (op) {
1909 	case BINARY_PRINT_CHAR_DATA:
1910 		return fprintf(fp, "%c", isprint(ch) ? ch : '.');
1911 	case BINARY_PRINT_DATA_BEGIN:
1912 	case BINARY_PRINT_LINE_BEGIN:
1913 	case BINARY_PRINT_ADDR:
1914 	case BINARY_PRINT_NUM_DATA:
1915 	case BINARY_PRINT_NUM_PAD:
1916 	case BINARY_PRINT_SEP:
1917 	case BINARY_PRINT_CHAR_PAD:
1918 	case BINARY_PRINT_LINE_END:
1919 	case BINARY_PRINT_DATA_END:
1920 	default:
1921 		break;
1922 	}
1923 
1924 	return 0;
1925 }
1926 
1927 static void bpf_output__fprintf(struct trace *trace,
1928 				struct perf_sample *sample)
1929 {
1930 	binary__fprintf(sample->raw_data, sample->raw_size, 8,
1931 			bpf_output__printer, NULL, trace->output);
1932 }
1933 
1934 static int trace__event_handler(struct trace *trace, struct perf_evsel *evsel,
1935 				union perf_event *event __maybe_unused,
1936 				struct perf_sample *sample)
1937 {
1938 	int callchain_ret = 0;
1939 
1940 	if (sample->callchain) {
1941 		callchain_ret = trace__resolve_callchain(trace, evsel, sample, &callchain_cursor);
1942 		if (callchain_ret == 0) {
1943 			if (callchain_cursor.nr < trace->min_stack)
1944 				goto out;
1945 			callchain_ret = 1;
1946 		}
1947 	}
1948 
1949 	trace__printf_interrupted_entry(trace);
1950 	trace__fprintf_tstamp(trace, sample->time, trace->output);
1951 
1952 	if (trace->trace_syscalls)
1953 		fprintf(trace->output, "(         ): ");
1954 
1955 	fprintf(trace->output, "%s:", evsel->name);
1956 
1957 	if (perf_evsel__is_bpf_output(evsel)) {
1958 		bpf_output__fprintf(trace, sample);
1959 	} else if (evsel->tp_format) {
1960 		event_format__fprintf(evsel->tp_format, sample->cpu,
1961 				      sample->raw_data, sample->raw_size,
1962 				      trace->output);
1963 	}
1964 
1965 	fprintf(trace->output, "\n");
1966 
1967 	if (callchain_ret > 0)
1968 		trace__fprintf_callchain(trace, sample);
1969 	else if (callchain_ret < 0)
1970 		pr_err("Problem processing %s callchain, skipping...\n", perf_evsel__name(evsel));
1971 out:
1972 	return 0;
1973 }
1974 
1975 static void print_location(FILE *f, struct perf_sample *sample,
1976 			   struct addr_location *al,
1977 			   bool print_dso, bool print_sym)
1978 {
1980 	if ((verbose > 0 || print_dso) && al->map)
1981 		fprintf(f, "%s@", al->map->dso->long_name);
1982 
1983 	if ((verbose > 0 || print_sym) && al->sym)
1984 		fprintf(f, "%s+0x%" PRIx64, al->sym->name,
1985 			al->addr - al->sym->start);
1986 	else if (al->map)
1987 		fprintf(f, "0x%" PRIx64, al->addr);
1988 	else
1989 		fprintf(f, "0x%" PRIx64, sample->addr);
1990 }
1991 
1992 static int trace__pgfault(struct trace *trace,
1993 			  struct perf_evsel *evsel,
1994 			  union perf_event *event __maybe_unused,
1995 			  struct perf_sample *sample)
1996 {
1997 	struct thread *thread;
1998 	struct addr_location al;
1999 	char map_type = 'd';
2000 	struct thread_trace *ttrace;
2001 	int err = -1;
2002 	int callchain_ret = 0;
2003 
2004 	thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
2005 
2006 	if (sample->callchain) {
2007 		callchain_ret = trace__resolve_callchain(trace, evsel, sample, &callchain_cursor);
2008 		if (callchain_ret == 0) {
2009 			if (callchain_cursor.nr < trace->min_stack)
2010 				goto out_put;
2011 			callchain_ret = 1;
2012 		}
2013 	}
2014 
2015 	ttrace = thread__trace(thread, trace->output);
2016 	if (ttrace == NULL)
2017 		goto out_put;
2018 
2019 	if (evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ)
2020 		ttrace->pfmaj++;
2021 	else
2022 		ttrace->pfmin++;
2023 
2024 	if (trace->summary_only)
2025 		goto out;
2026 
2027 	thread__find_symbol(thread, sample->cpumode, sample->ip, &al);
2028 
2029 	trace__fprintf_entry_head(trace, thread, 0, true, sample->time, trace->output);
2030 
2031 	fprintf(trace->output, "%sfault [",
2032 		evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ ?
2033 		"maj" : "min");
2034 
2035 	print_location(trace->output, sample, &al, false, true);
2036 
2037 	fprintf(trace->output, "] => ");
2038 
2039 	thread__find_symbol(thread, sample->cpumode, sample->addr, &al);
2040 
2041 	/* the lookup above already searched every map, no point in retrying */
2042 	if (!al.map)
2043 		map_type = '?';
2049 
2050 	print_location(trace->output, sample, &al, true, false);
2051 
2052 	fprintf(trace->output, " (%c%c)\n", map_type, al.level);
2053 
2054 	if (callchain_ret > 0)
2055 		trace__fprintf_callchain(trace, sample);
2056 	else if (callchain_ret < 0)
2057 		pr_err("Problem processing %s callchain, skipping...\n", perf_evsel__name(evsel));
2058 out:
2059 	err = 0;
2060 out_put:
2061 	thread__put(thread);
2062 	return err;
2063 }
2064 
2065 static void trace__set_base_time(struct trace *trace,
2066 				 struct perf_evsel *evsel,
2067 				 struct perf_sample *sample)
2068 {
2069 	/*
2070 	 * BPF events were not setting PERF_SAMPLE_TIME, so be more robust
2071 	 * and don't use sample->time unconditionally, we may end up having
2072 	 * some other event in the future without PERF_SAMPLE_TIME for good
2073 	 * reason, i.e. we may not be interested in its timestamps, just in
2074 	 * it taking place, picking some piece of information when it
2075 	 * appears in our event stream (vfs_getname comes to mind).
2076 	 */
2077 	if (trace->base_time == 0 && !trace->full_time &&
2078 	    (evsel->attr.sample_type & PERF_SAMPLE_TIME))
2079 		trace->base_time = sample->time;
2080 }
2081 
2082 static int trace__process_sample(struct perf_tool *tool,
2083 				 union perf_event *event,
2084 				 struct perf_sample *sample,
2085 				 struct perf_evsel *evsel,
2086 				 struct machine *machine __maybe_unused)
2087 {
2088 	struct trace *trace = container_of(tool, struct trace, tool);
2089 	struct thread *thread;
2090 	int err = 0;
2091 
2092 	tracepoint_handler handler = evsel->handler;
2093 
2094 	thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
2095 	if (thread && thread__is_filtered(thread))
2096 		goto out;
2097 
2098 	trace__set_base_time(trace, evsel, sample);
2099 
2100 	if (handler) {
2101 		++trace->nr_events;
2102 		handler(trace, evsel, event, sample);
2103 	}
2104 out:
2105 	thread__put(thread);
2106 	return err;
2107 }
2108 
2109 static int trace__record(struct trace *trace, int argc, const char **argv)
2110 {
2111 	unsigned int rec_argc, i, j;
2112 	const char **rec_argv;
2113 	const char * const record_args[] = {
2114 		"record",
2115 		"-R",
2116 		"-m", "1024",
2117 		"-c", "1",
2118 	};
2119 
2120 	const char * const sc_args[] = { "-e", };
2121 	unsigned int sc_args_nr = ARRAY_SIZE(sc_args);
2122 	const char * const majpf_args[] = { "-e", "major-faults" };
2123 	unsigned int majpf_args_nr = ARRAY_SIZE(majpf_args);
2124 	const char * const minpf_args[] = { "-e", "minor-faults" };
2125 	unsigned int minpf_args_nr = ARRAY_SIZE(minpf_args);
2126 
2127 	/* +1 is for the event string below */
2128 	rec_argc = ARRAY_SIZE(record_args) + sc_args_nr + 1 +
2129 		majpf_args_nr + minpf_args_nr + argc;
2130 	rec_argv = calloc(rec_argc + 1, sizeof(char *));
2131 
2132 	if (rec_argv == NULL)
2133 		return -ENOMEM;
2134 
2135 	j = 0;
2136 	for (i = 0; i < ARRAY_SIZE(record_args); i++)
2137 		rec_argv[j++] = record_args[i];
2138 
2139 	if (trace->trace_syscalls) {
2140 		for (i = 0; i < sc_args_nr; i++)
2141 			rec_argv[j++] = sc_args[i];
2142 
2143 		/* event string may be different for older kernels - e.g., RHEL6 */
2144 		if (is_valid_tracepoint("raw_syscalls:sys_enter"))
2145 			rec_argv[j++] = "raw_syscalls:sys_enter,raw_syscalls:sys_exit";
2146 		else if (is_valid_tracepoint("syscalls:sys_enter"))
2147 			rec_argv[j++] = "syscalls:sys_enter,syscalls:sys_exit";
2148 		else {
2149 			pr_err("Neither raw_syscalls nor syscalls events exist.\n");
2150 			free(rec_argv);
2151 			return -1;
2152 		}
2153 	}
2154 
2155 	if (trace->trace_pgfaults & TRACE_PFMAJ)
2156 		for (i = 0; i < majpf_args_nr; i++)
2157 			rec_argv[j++] = majpf_args[i];
2158 
2159 	if (trace->trace_pgfaults & TRACE_PFMIN)
2160 		for (i = 0; i < minpf_args_nr; i++)
2161 			rec_argv[j++] = minpf_args[i];
2162 
2163 	for (i = 0; i < (unsigned int)argc; i++)
2164 		rec_argv[j++] = argv[i];
2165 
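	/*
	 * E.g. "perf trace record -- sleep 1" ends up doing the equivalent of:
	 *
	 *   perf record -R -m 1024 -c 1 -e raw_syscalls:sys_enter,raw_syscalls:sys_exit sleep 1
	 *
	 * plus the major/minor-faults events when -F/--pf was specified.
	 */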
2166 	return cmd_record(j, rec_argv);
2167 }
2168 
2169 static size_t trace__fprintf_thread_summary(struct trace *trace, FILE *fp);
2170 
2171 static bool perf_evlist__add_vfs_getname(struct perf_evlist *evlist)
2172 {
2173 	struct perf_evsel *evsel = perf_evsel__newtp("probe", "vfs_getname");
2174 
2175 	if (IS_ERR(evsel))
2176 		return false;
2177 
2178 	if (perf_evsel__field(evsel, "pathname") == NULL) {
2179 		perf_evsel__delete(evsel);
2180 		return false;
2181 	}
2182 
2183 	evsel->handler = trace__vfs_getname;
2184 	perf_evlist__add(evlist, evsel);
2185 	return true;
2186 }
2187 
2188 static struct perf_evsel *perf_evsel__new_pgfault(u64 config)
2189 {
2190 	struct perf_evsel *evsel;
2191 	struct perf_event_attr attr = {
2192 		.type = PERF_TYPE_SOFTWARE,
2193 		.mmap_data = 1,
2194 	};
2195 
2196 	attr.config = config;
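	/* a period of 1, i.e. take a sample on every single fault */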
2197 	attr.sample_period = 1;
2198 
2199 	event_attr_init(&attr);
2200 
2201 	evsel = perf_evsel__new(&attr);
2202 	if (evsel)
2203 		evsel->handler = trace__pgfault;
2204 
2205 	return evsel;
2206 }
2207 
2208 static void trace__handle_event(struct trace *trace, union perf_event *event, struct perf_sample *sample)
2209 {
2210 	const u32 type = event->header.type;
2211 	struct perf_evsel *evsel;
2212 
2213 	if (type != PERF_RECORD_SAMPLE) {
2214 		trace__process_event(trace, trace->host, event, sample);
2215 		return;
2216 	}
2217 
2218 	evsel = perf_evlist__id2evsel(trace->evlist, sample->id);
2219 	if (evsel == NULL) {
2220 		fprintf(trace->output, "Unknown tp ID %" PRIu64 ", skipping...\n", sample->id);
2221 		return;
2222 	}
2223 
2224 	trace__set_base_time(trace, evsel, sample);
2225 
2226 	if (evsel->attr.type == PERF_TYPE_TRACEPOINT &&
2227 	    sample->raw_data == NULL) {
2228 		fprintf(trace->output, "%s sample with no payload for tid: %d, cpu %d, raw_size=%d, skipping...\n",
2229 		       perf_evsel__name(evsel), sample->tid,
2230 		       sample->cpu, sample->raw_size);
2231 	} else {
2232 		tracepoint_handler handler = evsel->handler;
2233 		handler(trace, evsel, event, sample);
2234 	}
2235 }
2236 
2237 static int trace__add_syscall_newtp(struct trace *trace)
2238 {
2239 	int ret = -1;
2240 	struct perf_evlist *evlist = trace->evlist;
2241 	struct perf_evsel *sys_enter, *sys_exit;
2242 
2243 	sys_enter = perf_evsel__syscall_newtp("sys_enter", trace__sys_enter);
2244 	if (sys_enter == NULL)
2245 		goto out;
2246 
2247 	if (perf_evsel__init_sc_tp_ptr_field(sys_enter, args))
2248 		goto out_delete_sys_enter;
2249 
2250 	sys_exit = perf_evsel__syscall_newtp("sys_exit", trace__sys_exit);
2251 	if (sys_exit == NULL)
2252 		goto out_delete_sys_enter;
2253 
2254 	if (perf_evsel__init_sc_tp_uint_field(sys_exit, ret))
2255 		goto out_delete_sys_exit;
2256 
2257 	perf_evsel__config_callchain(sys_enter, &trace->opts, &callchain_param);
2258 	perf_evsel__config_callchain(sys_exit, &trace->opts, &callchain_param);
2259 
2260 	perf_evlist__add(evlist, sys_enter);
2261 	perf_evlist__add(evlist, sys_exit);
2262 
2263 	if (callchain_param.enabled && !trace->kernel_syscallchains) {
2264 		/*
2265 		 * We're interested only in the user space callchain
2266 		 * leading to the syscall, allow overriding that for
2267 		 * debugging reasons using --kernel-syscall-graph.
2268 		 */
2269 		sys_exit->attr.exclude_callchain_kernel = 1;
2270 	}
2271 
2272 	trace->syscalls.events.sys_enter = sys_enter;
2273 	trace->syscalls.events.sys_exit  = sys_exit;
2274 
2275 	ret = 0;
2276 out:
2277 	return ret;
2278 
2279 out_delete_sys_exit:
2280 	perf_evsel__delete_priv(sys_exit);
2281 out_delete_sys_enter:
2282 	perf_evsel__delete_priv(sys_enter);
2283 	goto out;
2284 }
2285 
2286 static int trace__set_ev_qualifier_filter(struct trace *trace)
2287 {
2288 	int err = -1;
2289 	struct perf_evsel *sys_exit;
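	/*
	 * Builds a tracepoint filter expression from the syscall ids, along
	 * the lines of "id == 2 || id == 257" ("id != 2 && id != 257" when
	 * the qualifier was negated with '!').
	 */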
2290 	char *filter = asprintf_expr_inout_ints("id", !trace->not_ev_qualifier,
2291 						trace->ev_qualifier_ids.nr,
2292 						trace->ev_qualifier_ids.entries);
2293 
2294 	if (filter == NULL)
2295 		goto out_enomem;
2296 
2297 	if (!perf_evsel__append_tp_filter(trace->syscalls.events.sys_enter,
2298 					  filter)) {
2299 		sys_exit = trace->syscalls.events.sys_exit;
2300 		err = perf_evsel__append_tp_filter(sys_exit, filter);
2301 	}
2302 
2303 	free(filter);
2304 out:
2305 	return err;
2306 out_enomem:
2307 	errno = ENOMEM;
2308 	goto out;
2309 }
2310 
2311 static int trace__set_filter_loop_pids(struct trace *trace)
2312 {
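	/*
	 * Always filter out our own pid and, when running over an ssh
	 * session, the sshd ancestor relaying our output: tracing either
	 * would make every line we print generate more events, i.e. a
	 * feedback loop.
	 */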
2313 	unsigned int nr = 1;
2314 	pid_t pids[32] = {
2315 		getpid(),
2316 	};
2317 	struct thread *thread = machine__find_thread(trace->host, pids[0], pids[0]);
2318 
2319 	while (thread && nr < ARRAY_SIZE(pids)) {
2320 		struct thread *parent = machine__find_thread(trace->host, thread->ppid, thread->ppid);
2321 
2322 		if (parent == NULL)
2323 			break;
2324 
2325 		if (!strcmp(thread__comm_str(parent), "sshd")) {
2326 			pids[nr++] = parent->tid;
2327 			break;
2328 		}
2329 		thread = parent;
2330 	}
2331 
2332 	return perf_evlist__set_filter_pids(trace->evlist, nr, pids);
2333 }
2334 
2335 static int trace__run(struct trace *trace, int argc, const char **argv)
2336 {
2337 	struct perf_evlist *evlist = trace->evlist;
2338 	struct perf_evsel *evsel, *pgfault_maj = NULL, *pgfault_min = NULL;
2339 	int err = -1, i;
2340 	unsigned long before;
2341 	const bool forks = argc > 0;
2342 	bool draining = false;
2343 
2344 	trace->live = true;
2345 
2346 	if (trace->trace_syscalls && trace__add_syscall_newtp(trace))
2347 		goto out_error_raw_syscalls;
2348 
2349 	if (trace->trace_syscalls)
2350 		trace->vfs_getname = perf_evlist__add_vfs_getname(evlist);
2351 
2352 	if ((trace->trace_pgfaults & TRACE_PFMAJ)) {
2353 		pgfault_maj = perf_evsel__new_pgfault(PERF_COUNT_SW_PAGE_FAULTS_MAJ);
2354 		if (pgfault_maj == NULL)
2355 			goto out_error_mem;
2356 		perf_evsel__config_callchain(pgfault_maj, &trace->opts, &callchain_param);
2357 		perf_evlist__add(evlist, pgfault_maj);
2358 	}
2359 
2360 	if ((trace->trace_pgfaults & TRACE_PFMIN)) {
2361 		pgfault_min = perf_evsel__new_pgfault(PERF_COUNT_SW_PAGE_FAULTS_MIN);
2362 		if (pgfault_min == NULL)
2363 			goto out_error_mem;
2364 		perf_evsel__config_callchain(pgfault_min, &trace->opts, &callchain_param);
2365 		perf_evlist__add(evlist, pgfault_min);
2366 	}
2367 
2368 	if (trace->sched &&
2369 	    perf_evlist__add_newtp(evlist, "sched", "sched_stat_runtime",
2370 				   trace__sched_stat_runtime))
2371 		goto out_error_sched_stat_runtime;
2372 
2373 	/*
2374 	 * If a global cgroup was set, apply it to all the events without an
2375 	 * explicit cgroup. I.e.:
2376 	 *
2377 	 * trace -G A -e sched:*switch
2378 	 *
2379 	 * Will set all raw_syscalls:sys_{enter,exit}, pgfault, vfs_getname, etc
2380 	 * _and_ sched:sched_switch to the 'A' cgroup, while:
2381 	 *
2382 	 * trace -e sched:*switch -G A
2383 	 *
2384 	 * will only set the sched:sched_switch event to the 'A' cgroup, all the
2385 	 * other events (raw_syscalls:sys_{enter,exit}, etc) are left "without"
2386 	 * a cgroup (on the root cgroup, sys wide, etc).
2387 	 *
2388 	 * Multiple cgroups:
2389 	 *
2390 	 * trace -G A -e sched:*switch -G B
2391 	 *
2392 	 * the syscall ones go to the 'A' cgroup, the sched:sched_switch goes
2393 	 * to the 'B' cgroup.
2394 	 *
2395 	 * evlist__set_default_cgroup() grabs a reference of the passed cgroup
2396 	 * only for the evsels still without a cgroup, i.e. evsel->cgroup == NULL.
2397 	 */
2398 	if (trace->cgroup)
2399 		evlist__set_default_cgroup(trace->evlist, trace->cgroup);
2400 
2401 	err = perf_evlist__create_maps(evlist, &trace->opts.target);
2402 	if (err < 0) {
2403 		fprintf(trace->output, "Problems parsing the target to trace, check your options!\n");
2404 		goto out_delete_evlist;
2405 	}
2406 
2407 	err = trace__symbols_init(trace, evlist);
2408 	if (err < 0) {
2409 		fprintf(trace->output, "Problems initializing symbol libraries!\n");
2410 		goto out_delete_evlist;
2411 	}
2412 
2413 	perf_evlist__config(evlist, &trace->opts, &callchain_param);
2414 
2415 	signal(SIGCHLD, sig_handler);
2416 	signal(SIGINT, sig_handler);
2417 
2418 	if (forks) {
2419 		err = perf_evlist__prepare_workload(evlist, &trace->opts.target,
2420 						    argv, false, NULL);
2421 		if (err < 0) {
2422 			fprintf(trace->output, "Couldn't run the workload!\n");
2423 			goto out_delete_evlist;
2424 		}
2425 	}
2426 
2427 	err = perf_evlist__open(evlist);
2428 	if (err < 0)
2429 		goto out_error_open;
2430 
2431 	err = bpf__apply_obj_config();
2432 	if (err) {
2433 		char errbuf[BUFSIZ];
2434 
2435 		bpf__strerror_apply_obj_config(err, errbuf, sizeof(errbuf));
2436 		pr_err("ERROR: Apply config to BPF failed: %s\n",
2437 			 errbuf);
2438 		goto out_error_open;
2439 	}
2440 
2441 	/*
2442 	 * Better not use !target__has_task() here because we need to cover the
2443 	 * case where no threads were specified in the command line, but a
2444 	 * workload was, and in that case we will fill in the thread_map when
2445 	 * we fork the workload in perf_evlist__prepare_workload.
2446 	 */
2447 	if (trace->filter_pids.nr > 0)
2448 		err = perf_evlist__set_filter_pids(evlist, trace->filter_pids.nr, trace->filter_pids.entries);
2449 	else if (thread_map__pid(evlist->threads, 0) == -1)
2450 		err = trace__set_filter_loop_pids(trace);
2451 
2452 	if (err < 0)
2453 		goto out_error_mem;
2454 
2455 	if (trace->ev_qualifier_ids.nr > 0) {
2456 		err = trace__set_ev_qualifier_filter(trace);
2457 		if (err < 0)
2458 			goto out_errno;
2459 
2460 		pr_debug("event qualifier tracepoint filter: %s\n",
2461 			 trace->syscalls.events.sys_exit->filter);
2462 	}
2463 
2464 	err = perf_evlist__apply_filters(evlist, &evsel);
2465 	if (err < 0)
2466 		goto out_error_apply_filters;
2467 
2468 	err = perf_evlist__mmap(evlist, trace->opts.mmap_pages);
2469 	if (err < 0)
2470 		goto out_error_mmap;
2471 
2472 	if (!target__none(&trace->opts.target) && !trace->opts.initial_delay)
2473 		perf_evlist__enable(evlist);
2474 
2475 	if (forks)
2476 		perf_evlist__start_workload(evlist);
2477 
2478 	if (trace->opts.initial_delay) {
2479 		usleep(trace->opts.initial_delay * 1000);
2480 		perf_evlist__enable(evlist);
2481 	}
2482 
2483 	trace->multiple_threads = thread_map__pid(evlist->threads, 0) == -1 ||
2484 				  evlist->threads->nr > 1 ||
2485 				  perf_evlist__first(evlist)->attr.inherit;
2486 
2487 	/*
2488 	 * Now that we already used evsel->attr to ask the kernel to setup the
2489 	 * events, lets reuse evsel->attr.sample_max_stack as the limit in
2490 	 * trace__resolve_callchain(), allowing per-event max-stack settings
2491 	 * to override an explicitly set --max-stack global setting.
2492 	 */
2493 	evlist__for_each_entry(evlist, evsel) {
2494 		if (evsel__has_callchain(evsel) &&
2495 		    evsel->attr.sample_max_stack == 0)
2496 			evsel->attr.sample_max_stack = trace->max_stack;
2497 	}
2498 again:
2499 	before = trace->nr_events;
2500 
2501 	for (i = 0; i < evlist->nr_mmaps; i++) {
2502 		union perf_event *event;
2503 		struct perf_mmap *md;
2504 
2505 		md = &evlist->mmap[i];
2506 		if (perf_mmap__read_init(md) < 0)
2507 			continue;
2508 
2509 		while ((event = perf_mmap__read_event(md)) != NULL) {
2510 			struct perf_sample sample;
2511 
2512 			++trace->nr_events;
2513 
2514 			err = perf_evlist__parse_sample(evlist, event, &sample);
2515 			if (err) {
2516 				fprintf(trace->output, "Can't parse sample, err = %d, skipping...\n", err);
2517 				goto next_event;
2518 			}
2519 
2520 			trace__handle_event(trace, event, &sample);
2521 next_event:
2522 			perf_mmap__consume(md);
2523 
2524 			if (interrupted)
2525 				goto out_disable;
2526 
2527 			if (done && !draining) {
2528 				perf_evlist__disable(evlist);
2529 				draining = true;
2530 			}
2531 		}
2532 		perf_mmap__read_done(md);
2533 	}
2534 
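	/*
	 * Nothing consumed in this pass: poll for more. Once the workload
	 * is done ('done' set by the signal handlers) give it 100ms to
	 * drain; when all the mmap pollfds have hung up, stop re-arming so
	 * the next empty pass falls through to out_disable.
	 */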
2535 	if (trace->nr_events == before) {
2536 		int timeout = done ? 100 : -1;
2537 
2538 		if (!draining && perf_evlist__poll(evlist, timeout) > 0) {
2539 			if (perf_evlist__filter_pollfd(evlist, POLLERR | POLLHUP) == 0)
2540 				draining = true;
2541 
2542 			goto again;
2543 		}
2544 	} else {
2545 		goto again;
2546 	}
2547 
2548 out_disable:
2549 	thread__zput(trace->current);
2550 
2551 	perf_evlist__disable(evlist);
2552 
2553 	if (!err) {
2554 		if (trace->summary)
2555 			trace__fprintf_thread_summary(trace, trace->output);
2556 
2557 		if (trace->show_tool_stats) {
2558 			fprintf(trace->output, "Stats:\n "
2559 					       " vfs_getname : %" PRIu64 "\n"
2560 					       " proc_getname: %" PRIu64 "\n",
2561 				trace->stats.vfs_getname,
2562 				trace->stats.proc_getname);
2563 		}
2564 	}
2565 
2566 out_delete_evlist:
2567 	trace__symbols__exit(trace);
2568 
2569 	perf_evlist__delete(evlist);
2570 	cgroup__put(trace->cgroup);
2571 	trace->evlist = NULL;
2572 	trace->live = false;
2573 	return err;
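	/*
	 * The error labels below share an on-stack errbuf, hence this odd
	 * looking block placed after the function's normal return.
	 */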
2574 {
2575 	char errbuf[BUFSIZ];
2576 
2577 out_error_sched_stat_runtime:
2578 	tracing_path__strerror_open_tp(errno, errbuf, sizeof(errbuf), "sched", "sched_stat_runtime");
2579 	goto out_error;
2580 
2581 out_error_raw_syscalls:
2582 	tracing_path__strerror_open_tp(errno, errbuf, sizeof(errbuf), "raw_syscalls", "sys_(enter|exit)");
2583 	goto out_error;
2584 
2585 out_error_mmap:
2586 	perf_evlist__strerror_mmap(evlist, errno, errbuf, sizeof(errbuf));
2587 	goto out_error;
2588 
2589 out_error_open:
2590 	perf_evlist__strerror_open(evlist, errno, errbuf, sizeof(errbuf));
2591 
2592 out_error:
2593 	fprintf(trace->output, "%s\n", errbuf);
2594 	goto out_delete_evlist;
2595 
2596 out_error_apply_filters:
2597 	fprintf(trace->output,
2598 		"Failed to set filter \"%s\" on event %s with %d (%s)\n",
2599 		evsel->filter, perf_evsel__name(evsel), errno,
2600 		str_error_r(errno, errbuf, sizeof(errbuf)));
2601 	goto out_delete_evlist;
2602 }
2603 out_error_mem:
2604 	fprintf(trace->output, "Not enough memory to run!\n");
2605 	goto out_delete_evlist;
2606 
2607 out_errno:
2608 	fprintf(trace->output, "errno=%d,%s\n", errno, strerror(errno));
2609 	goto out_delete_evlist;
2610 }
2611 
2612 static int trace__replay(struct trace *trace)
2613 {
2614 	const struct perf_evsel_str_handler handlers[] = {
2615 		{ "probe:vfs_getname",	     trace__vfs_getname, },
2616 	};
2617 	struct perf_data data = {
2618 		.file      = {
2619 			.path = input_name,
2620 		},
2621 		.mode      = PERF_DATA_MODE_READ,
2622 		.force     = trace->force,
2623 	};
2624 	struct perf_session *session;
2625 	struct perf_evsel *evsel;
2626 	int err = -1;
2627 
2628 	trace->tool.sample	  = trace__process_sample;
2629 	trace->tool.mmap	  = perf_event__process_mmap;
2630 	trace->tool.mmap2	  = perf_event__process_mmap2;
2631 	trace->tool.comm	  = perf_event__process_comm;
2632 	trace->tool.exit	  = perf_event__process_exit;
2633 	trace->tool.fork	  = perf_event__process_fork;
2634 	trace->tool.attr	  = perf_event__process_attr;
2635 	trace->tool.tracing_data  = perf_event__process_tracing_data;
2636 	trace->tool.build_id	  = perf_event__process_build_id;
2637 	trace->tool.namespaces	  = perf_event__process_namespaces;
2638 
2639 	trace->tool.ordered_events = true;
2640 	trace->tool.ordering_requires_timestamps = true;
2641 
2642 	/* add tid to output */
2643 	trace->multiple_threads = true;
2644 
2645 	session = perf_session__new(&data, false, &trace->tool);
2646 	if (session == NULL)
2647 		return -1;
2648 
2649 	if (trace->opts.target.pid)
2650 		symbol_conf.pid_list_str = strdup(trace->opts.target.pid);
2651 
2652 	if (trace->opts.target.tid)
2653 		symbol_conf.tid_list_str = strdup(trace->opts.target.tid);
2654 
2655 	if (symbol__init(&session->header.env) < 0)
2656 		goto out;
2657 
2658 	trace->host = &session->machines.host;
2659 
2660 	err = perf_session__set_tracepoints_handlers(session, handlers);
2661 	if (err)
2662 		goto out;
2663 
2664 	evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
2665 						     "raw_syscalls:sys_enter");
2666 	/* older kernels have syscalls tp versus raw_syscalls */
2667 	if (evsel == NULL)
2668 		evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
2669 							     "syscalls:sys_enter");
2670 
2671 	if (evsel &&
2672 	    (perf_evsel__init_syscall_tp(evsel, trace__sys_enter) < 0 ||
2673 	    perf_evsel__init_sc_tp_ptr_field(evsel, args))) {
2674 		pr_err("Error initializing raw_syscalls:sys_enter event\n");
2675 		goto out;
2676 	}
2677 
2678 	evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
2679 						     "raw_syscalls:sys_exit");
2680 	if (evsel == NULL)
2681 		evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
2682 							     "syscalls:sys_exit");
2683 	if (evsel &&
2684 	    (perf_evsel__init_syscall_tp(evsel, trace__sys_exit) < 0 ||
2685 	    perf_evsel__init_sc_tp_uint_field(evsel, ret))) {
2686 		pr_err("Error initializing raw_syscalls:sys_exit event\n");
2687 		goto out;
2688 	}
2689 
2690 	evlist__for_each_entry(session->evlist, evsel) {
2691 		if (evsel->attr.type == PERF_TYPE_SOFTWARE &&
2692 		    (evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ ||
2693 		     evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MIN ||
2694 		     evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS))
2695 			evsel->handler = trace__pgfault;
2696 	}
2697 
2698 	setup_pager();
2699 
2700 	err = perf_session__process_events(session);
2701 	if (err)
2702 		pr_err("Failed to process events, error %d\n", err);
2704 	else if (trace->summary)
2705 		trace__fprintf_thread_summary(trace, trace->output);
2706 
2707 out:
2708 	perf_session__delete(session);
2709 
2710 	return err;
2711 }
2712 
2713 static size_t trace__fprintf_threads_header(FILE *fp)
2714 {
2715 	return fprintf(fp, "\n Summary of events:\n\n");
2720 }
2721 
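/*
 * DEFINE_RESORT_RB() (see rb_resort.h) declares helpers that re-sort an
 * existing rb tree by the comparison given here, i.e. by total time spent
 * per syscall; the block below fills in each re-sorted entry from the
 * originating intlist node.
 */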
2722 DEFINE_RESORT_RB(syscall_stats, a->msecs > b->msecs,
2723 	struct stats 	*stats;
2724 	double		msecs;
2725 	int		syscall;
2726 )
2727 {
2728 	struct int_node *source = rb_entry(nd, struct int_node, rb_node);
2729 	struct stats *stats = source->priv;
2730 
2731 	entry->syscall = source->i;
2732 	entry->stats   = stats;
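	/* total time = number of calls * average call duration, in msecs */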
2733 	entry->msecs   = stats ? (u64)stats->n * (avg_stats(stats) / NSEC_PER_MSEC) : 0;
2734 }
2735 
2736 static size_t thread__dump_stats(struct thread_trace *ttrace,
2737 				 struct trace *trace, FILE *fp)
2738 {
2739 	size_t printed = 0;
2740 	struct syscall *sc;
2741 	struct rb_node *nd;
2742 	DECLARE_RESORT_RB_INTLIST(syscall_stats, ttrace->syscall_stats);
2743 
2744 	if (syscall_stats == NULL)
2745 		return 0;
2746 
2747 	printed += fprintf(fp, "\n");
2748 
2749 	printed += fprintf(fp, "   syscall            calls    total       min       avg       max      stddev\n");
2750 	printed += fprintf(fp, "                               (msec)    (msec)    (msec)    (msec)        (%%)\n");
2751 	printed += fprintf(fp, "   --------------- -------- --------- --------- --------- ---------     ------\n");
2752 
2753 	resort_rb__for_each_entry(nd, syscall_stats) {
2754 		struct stats *stats = syscall_stats_entry->stats;
2755 		if (stats) {
2756 			double min = (double)(stats->min) / NSEC_PER_MSEC;
2757 			double max = (double)(stats->max) / NSEC_PER_MSEC;
2758 			double avg = avg_stats(stats);
2759 			double pct;
2760 			u64 n = (u64) stats->n;
2761 
2762 			pct = avg ? 100.0 * stddev_stats(stats)/avg : 0.0;
2763 			avg /= NSEC_PER_MSEC;
2764 
2765 			sc = &trace->syscalls.table[syscall_stats_entry->syscall];
2766 			printed += fprintf(fp, "   %-15s", sc->name);
2767 			printed += fprintf(fp, " %8" PRIu64 " %9.3f %9.3f %9.3f",
2768 					   n, syscall_stats_entry->msecs, min, avg);
2769 			printed += fprintf(fp, " %9.3f %9.2f%%\n", max, pct);
2770 		}
2771 	}
2772 
2773 	resort_rb__delete(syscall_stats);
2774 	printed += fprintf(fp, "\n\n");
2775 
2776 	return printed;
2777 }
2778 
2779 static size_t trace__fprintf_thread(FILE *fp, struct thread *thread, struct trace *trace)
2780 {
2781 	size_t printed = 0;
2782 	struct thread_trace *ttrace = thread__priv(thread);
2783 	double ratio;
2784 
2785 	if (ttrace == NULL)
2786 		return 0;
2787 
2788 	ratio = (double)ttrace->nr_events / trace->nr_events * 100.0;
2789 
2790 	printed += fprintf(fp, " %s (%d), ", thread__comm_str(thread), thread->tid);
2791 	printed += fprintf(fp, "%lu events, ", ttrace->nr_events);
2792 	printed += fprintf(fp, "%.1f%%", ratio);
2793 	if (ttrace->pfmaj)
2794 		printed += fprintf(fp, ", %lu majfaults", ttrace->pfmaj);
2795 	if (ttrace->pfmin)
2796 		printed += fprintf(fp, ", %lu minfaults", ttrace->pfmin);
2797 	if (trace->sched)
2798 		printed += fprintf(fp, ", %.3f msec\n", ttrace->runtime_ms);
2799 	else if (fputc('\n', fp) != EOF)
2800 		++printed;
2801 
2802 	printed += thread__dump_stats(ttrace, trace, fp);
2803 
2804 	return printed;
2805 }
2806 
2807 static unsigned long thread__nr_events(struct thread_trace *ttrace)
2808 {
2809 	return ttrace ? ttrace->nr_events : 0;
2810 }
2811 
2812 DEFINE_RESORT_RB(threads, (thread__nr_events(a->thread->priv) < thread__nr_events(b->thread->priv)),
2813 	struct thread *thread;
2814 )
2815 {
2816 	entry->thread = rb_entry(nd, struct thread, rb_node);
2817 }
2818 
2819 static size_t trace__fprintf_thread_summary(struct trace *trace, FILE *fp)
2820 {
2821 	size_t printed = trace__fprintf_threads_header(fp);
2822 	struct rb_node *nd;
2823 	int i;
2824 
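	/*
	 * The host's threads live in THREADS__TABLE_SIZE rb trees, hashed
	 * by tid: re-sort each bucket by number of events and print it.
	 */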
2825 	for (i = 0; i < THREADS__TABLE_SIZE; i++) {
2826 		DECLARE_RESORT_RB_MACHINE_THREADS(threads, trace->host, i);
2827 
2828 		if (threads == NULL) {
2829 			fprintf(fp, "%s", "Error sorting output by nr_events!\n");
2830 			return 0;
2831 		}
2832 
2833 		resort_rb__for_each_entry(nd, threads)
2834 			printed += trace__fprintf_thread(fp, threads_entry->thread, trace);
2835 
2836 		resort_rb__delete(threads);
2837 	}
2838 	return printed;
2839 }
2840 
2841 static int trace__set_duration(const struct option *opt, const char *str,
2842 			       int unset __maybe_unused)
2843 {
2844 	struct trace *trace = opt->value;
2845 
2846 	trace->duration_filter = atof(str);
2847 	return 0;
2848 }
2849 
2850 static int trace__set_filter_pids(const struct option *opt, const char *str,
2851 				  int unset __maybe_unused)
2852 {
2853 	int ret = -1;
2854 	size_t i;
2855 	struct trace *trace = opt->value;
2856 	/*
2857 	 * FIXME: introduce an intarray class, parse the csv directly into a
2858 	 * { int nr, int entries[] } struct...
2859 	 */
2860 	struct intlist *list = intlist__new(str);
2861 
2862 	if (list == NULL)
2863 		return -1;
2864 
2865 	i = trace->filter_pids.nr = intlist__nr_entries(list) + 1;
2866 	trace->filter_pids.entries = calloc(i, sizeof(pid_t));
2867 
2868 	if (trace->filter_pids.entries == NULL)
2869 		goto out;
2870 
2871 	trace->filter_pids.entries[0] = getpid();
2872 
2873 	for (i = 1; i < trace->filter_pids.nr; ++i)
2874 		trace->filter_pids.entries[i] = intlist__entry(list, i - 1)->i;
2875 
2876 	intlist__delete(list);
2877 	ret = 0;
2878 out:
2879 	return ret;
2880 }
2881 
2882 static int trace__open_output(struct trace *trace, const char *filename)
2883 {
2884 	struct stat st;
2885 
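	/* keep one previous run around: rename a non-empty file to <name>.old */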
2886 	if (!stat(filename, &st) && st.st_size) {
2887 		char oldname[PATH_MAX];
2888 
2889 		scnprintf(oldname, sizeof(oldname), "%s.old", filename);
2890 		unlink(oldname);
2891 		rename(filename, oldname);
2892 	}
2893 
2894 	trace->output = fopen(filename, "w");
2895 
2896 	return trace->output == NULL ? -errno : 0;
2897 }
2898 
2899 static int parse_pagefaults(const struct option *opt, const char *str,
2900 			    int unset __maybe_unused)
2901 {
2902 	int *trace_pgfaults = opt->value;
2903 
2904 	if (strcmp(str, "all") == 0)
2905 		*trace_pgfaults |= TRACE_PFMAJ | TRACE_PFMIN;
2906 	else if (strcmp(str, "maj") == 0)
2907 		*trace_pgfaults |= TRACE_PFMAJ;
2908 	else if (strcmp(str, "min") == 0)
2909 		*trace_pgfaults |= TRACE_PFMIN;
2910 	else
2911 		return -1;
2912 
2913 	return 0;
2914 }
2915 
2916 static void evlist__set_evsel_handler(struct perf_evlist *evlist, void *handler)
2917 {
2918 	struct perf_evsel *evsel;
2919 
2920 	evlist__for_each_entry(evlist, evsel)
2921 		evsel->handler = handler;
2922 }
2923 
2924 /*
2925  * XXX: Hackish, just splitting the combined -e+--event (syscalls
2926  * (raw_syscalls:sys_{enter,exit}) + events (tracepoints, HW, SW, etc)) to use
2927  * existing facilities unchanged (trace->ev_qualifier + parse_options()).
2928  *
2929  * It'd be better to introduce a parse_options() variant that would return a
2930  * list with the terms it didn't match to an event...
2931  */
2932 static int trace__parse_events_option(const struct option *opt, const char *str,
2933 				      int unset __maybe_unused)
2934 {
2935 	struct trace *trace = (struct trace *)opt->value;
2936 	const char *s = str;
2937 	char *sep = NULL, *lists[2] = { NULL, NULL, };
2938 	int len = strlen(str) + 1, err = -1, list, idx;
2939 	char *strace_groups_dir = system_path(STRACE_GROUPS_DIR);
2940 	char group_name[PATH_MAX];
2941 
2942 	if (strace_groups_dir == NULL)
2943 		return -1;
2944 
2945 	if (*s == '!') {
2946 		++s;
2947 		trace->not_ev_qualifier = true;
2948 	}
2949 
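	/*
	 * Walk the comma separated list: entries that resolve to a syscall
	 * name/glob or to a strace(1)-like group file go to lists[1] (the
	 * syscall qualifier), everything else to lists[0], handed to the
	 * stock parse_events_option().
	 */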
2950 	while (1) {
2951 		if ((sep = strchr(s, ',')) != NULL)
2952 			*sep = '\0';
2953 
2954 		list = 0;
2955 		if (syscalltbl__id(trace->sctbl, s) >= 0 ||
2956 		    syscalltbl__strglobmatch_first(trace->sctbl, s, &idx) >= 0) {
2957 			list = 1;
2958 		} else {
2959 			path__join(group_name, sizeof(group_name), strace_groups_dir, s);
2960 			if (access(group_name, R_OK) == 0)
2961 				list = 1;
2962 		}
2963 
2964 		if (lists[list]) {
2965 			sprintf(lists[list] + strlen(lists[list]), ",%s", s);
2966 		} else {
2967 			lists[list] = malloc(len);
2968 			if (lists[list] == NULL)
2969 				goto out;
2970 			strcpy(lists[list], s);
2971 		}
2972 
2973 		if (!sep)
2974 			break;
2975 
2976 		*sep = ',';
2977 		s = sep + 1;
2978 	}
2979 
2980 	if (lists[1] != NULL) {
2981 		struct strlist_config slist_config = {
2982 			.dirname = strace_groups_dir,
2983 		};
2984 
2985 		trace->ev_qualifier = strlist__new(lists[1], &slist_config);
2986 		if (trace->ev_qualifier == NULL) {
2987 			fputs("Not enough memory to parse event qualifier\n", trace->output);
2988 			goto out;
2989 		}
2990 
2991 		if (trace__validate_ev_qualifier(trace))
2992 			goto out;
2993 	}
2994 
2995 	err = 0;
2996 
2997 	if (lists[0]) {
2998 		struct option o = OPT_CALLBACK('e', "event", &trace->evlist, "event",
2999 					       "event selector. use 'perf list' to list available events",
3000 					       parse_events_option);
3001 		err = parse_events_option(&o, lists[0], 0);
3002 	}
3003 out:
3004 	if (sep)
3005 		*sep = ',';
3006 
3007 	return err;
3008 }
3009 
3010 static int trace__parse_cgroups(const struct option *opt, const char *str, int unset)
3011 {
3012 	struct trace *trace = opt->value;
3013 
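	/*
	 * -G after some -e events: let parse_cgroups() tie the cgroup to
	 * those events; -G before any event: remember it as the default
	 * cgroup, applied in trace__run() to events still without one.
	 */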
3014 	if (!list_empty(&trace->evlist->entries))
3015 		return parse_cgroups(opt, str, unset);
3016 
3017 	trace->cgroup = evlist__findnew_cgroup(trace->evlist, str);
3018 
3019 	return 0;
3020 }
3021 
3022 int cmd_trace(int argc, const char **argv)
3023 {
3024 	const char *trace_usage[] = {
3025 		"perf trace [<options>] [<command>]",
3026 		"perf trace [<options>] -- <command> [<options>]",
3027 		"perf trace record [<options>] [<command>]",
3028 		"perf trace record [<options>] -- <command> [<options>]",
3029 		NULL
3030 	};
3031 	struct trace trace = {
3032 		.syscalls = {
3033 			.max = -1,
3034 		},
3035 		.opts = {
3036 			.target = {
3037 				.uid	   = UINT_MAX,
3038 				.uses_mmap = true,
3039 			},
3040 			.user_freq     = UINT_MAX,
3041 			.user_interval = ULLONG_MAX,
3042 			.no_buffering  = true,
3043 			.mmap_pages    = UINT_MAX,
3044 			.proc_map_timeout  = 500,
3045 		},
3046 		.output = stderr,
3047 		.show_comm = true,
3048 		.trace_syscalls = true,
3049 		.kernel_syscallchains = false,
3050 		.max_stack = UINT_MAX,
3051 	};
3052 	const char *output_name = NULL;
3053 	const struct option trace_options[] = {
3054 	OPT_CALLBACK('e', "event", &trace, "event",
3055 		     "event/syscall selector. use 'perf list' to list available events",
3056 		     trace__parse_events_option),
3057 	OPT_BOOLEAN(0, "comm", &trace.show_comm,
3058 		    "show the thread COMM next to its id"),
3059 	OPT_BOOLEAN(0, "tool_stats", &trace.show_tool_stats, "show tool stats"),
3060 	OPT_CALLBACK(0, "expr", &trace, "expr", "list of syscalls/events to trace",
3061 		     trace__parse_events_option),
3062 	OPT_STRING('o', "output", &output_name, "file", "output file name"),
3063 	OPT_STRING('i', "input", &input_name, "file", "Analyze events in file"),
3064 	OPT_STRING('p', "pid", &trace.opts.target.pid, "pid",
3065 		    "trace events on existing process id"),
3066 	OPT_STRING('t', "tid", &trace.opts.target.tid, "tid",
3067 		    "trace events on existing thread id"),
3068 	OPT_CALLBACK(0, "filter-pids", &trace, "CSV list of pids",
3069 		     "pids to filter (by the kernel)", trace__set_filter_pids),
3070 	OPT_BOOLEAN('a', "all-cpus", &trace.opts.target.system_wide,
3071 		    "system-wide collection from all CPUs"),
3072 	OPT_STRING('C', "cpu", &trace.opts.target.cpu_list, "cpu",
3073 		    "list of cpus to monitor"),
3074 	OPT_BOOLEAN(0, "no-inherit", &trace.opts.no_inherit,
3075 		    "child tasks do not inherit counters"),
3076 	OPT_CALLBACK('m', "mmap-pages", &trace.opts.mmap_pages, "pages",
3077 		     "number of mmap data pages",
3078 		     perf_evlist__parse_mmap_pages),
3079 	OPT_STRING('u', "uid", &trace.opts.target.uid_str, "user",
3080 		   "user to profile"),
3081 	OPT_CALLBACK(0, "duration", &trace, "float",
3082 		     "show only events with duration > N.M ms",
3083 		     trace__set_duration),
3084 	OPT_BOOLEAN(0, "sched", &trace.sched, "show blocking scheduler events"),
3085 	OPT_INCR('v', "verbose", &verbose, "be more verbose"),
3086 	OPT_BOOLEAN('T', "time", &trace.full_time,
3087 		    "Show full timestamp, not time relative to first start"),
3088 	OPT_BOOLEAN(0, "failure", &trace.failure_only,
3089 		    "Show only syscalls that failed"),
3090 	OPT_BOOLEAN('s', "summary", &trace.summary_only,
3091 		    "Show only syscall summary with statistics"),
3092 	OPT_BOOLEAN('S', "with-summary", &trace.summary,
3093 		    "Show all syscalls and summary with statistics"),
3094 	OPT_CALLBACK_DEFAULT('F', "pf", &trace.trace_pgfaults, "all|maj|min",
3095 		     "Trace pagefaults", parse_pagefaults, "maj"),
3096 	OPT_BOOLEAN(0, "syscalls", &trace.trace_syscalls, "Trace syscalls"),
3097 	OPT_BOOLEAN('f', "force", &trace.force, "don't complain, do it"),
3098 	OPT_CALLBACK(0, "call-graph", &trace.opts,
3099 		     "record_mode[,record_size]", record_callchain_help,
3100 		     &record_parse_callchain_opt),
3101 	OPT_BOOLEAN(0, "kernel-syscall-graph", &trace.kernel_syscallchains,
3102 		    "Show the kernel callchains on the syscall exit path"),
3103 	OPT_UINTEGER(0, "min-stack", &trace.min_stack,
3104 		     "Set the minimum stack depth when parsing the callchain, "
3105 		     "anything below the specified depth will be ignored."),
3106 	OPT_UINTEGER(0, "max-stack", &trace.max_stack,
3107 		     "Set the maximum stack depth when parsing the callchain, "
3108 		     "anything beyond the specified depth will be ignored. "
3109 		     "Default: kernel.perf_event_max_stack or " __stringify(PERF_MAX_STACK_DEPTH)),
3110 	OPT_BOOLEAN(0, "print-sample", &trace.print_sample,
3111 			"print the PERF_RECORD_SAMPLE PERF_SAMPLE_ info, for debugging"),
3112 	OPT_UINTEGER(0, "proc-map-timeout", &trace.opts.proc_map_timeout,
3113 			"per thread proc mmap processing timeout in ms"),
3114 	OPT_CALLBACK('G', "cgroup", &trace, "name", "monitor event in cgroup name only",
3115 		     trace__parse_cgroups),
3116 	OPT_UINTEGER('D', "delay", &trace.opts.initial_delay,
3117 		     "ms to wait before starting measurement after program "
3118 		     "start"),
3119 	OPT_END()
3120 	};
3121 	bool __maybe_unused max_stack_user_set = true;
3122 	bool mmap_pages_user_set = true;
3123 	const char * const trace_subcommands[] = { "record", NULL };
3124 	int err;
3125 	char bf[BUFSIZ];
3126 
3127 	signal(SIGSEGV, sighandler_dump_stack);
3128 	signal(SIGFPE, sighandler_dump_stack);
3129 
3130 	trace.evlist = perf_evlist__new();
3131 	trace.sctbl = syscalltbl__new();
3132 
3133 	if (trace.evlist == NULL || trace.sctbl == NULL) {
3134 		pr_err("Not enough memory to run!\n");
3135 		err = -ENOMEM;
3136 		goto out;
3137 	}
3138 
3139 	argc = parse_options_subcommand(argc, argv, trace_options, trace_subcommands,
3140 				 trace_usage, PARSE_OPT_STOP_AT_NON_OPTION);
3141 
3142 	if ((nr_cgroups || trace.cgroup) && !trace.opts.target.system_wide) {
3143 		usage_with_options_msg(trace_usage, trace_options,
3144 				       "cgroup monitoring only available in system-wide mode");
3145 	}
3146 
3147 	err = bpf__setup_stdout(trace.evlist);
3148 	if (err) {
3149 		bpf__strerror_setup_stdout(trace.evlist, err, bf, sizeof(bf));
3150 		pr_err("ERROR: Setup BPF stdout failed: %s\n", bf);
3151 		goto out;
3152 	}
3153 
3154 	err = -1;
3155 
3156 	if (trace.trace_pgfaults) {
3157 		trace.opts.sample_address = true;
3158 		trace.opts.sample_time = true;
3159 	}
3160 
3161 	if (trace.opts.mmap_pages == UINT_MAX)
3162 		mmap_pages_user_set = false;
3163 
3164 	if (trace.max_stack == UINT_MAX) {
3165 		trace.max_stack = input_name ? PERF_MAX_STACK_DEPTH : sysctl__max_stack();
3166 		max_stack_user_set = false;
3167 	}
3168 
3169 #ifdef HAVE_DWARF_UNWIND_SUPPORT
3170 	if ((trace.min_stack || max_stack_user_set) && !callchain_param.enabled) {
3171 		record_opts__parse_callchain(&trace.opts, &callchain_param, "dwarf", false);
3172 	}
3173 #endif
3174 
3175 	if (callchain_param.enabled) {
3176 		if (!mmap_pages_user_set && geteuid() == 0)
3177 			trace.opts.mmap_pages = perf_event_mlock_kb_in_pages() * 4;
3178 
3179 		symbol_conf.use_callchain = true;
3180 	}
3181 
3182 	if (trace.evlist->nr_entries > 0)
3183 		evlist__set_evsel_handler(trace.evlist, trace__event_handler);
3184 
3185 	if ((argc >= 1) && (strcmp(argv[0], "record") == 0))
3186 		return trace__record(&trace, argc-1, &argv[1]);
3187 
3188 	/* summary_only implies summary option, but don't overwrite summary if set */
3189 	if (trace.summary_only)
3190 		trace.summary = trace.summary_only;
3191 
3192 	if (!trace.trace_syscalls && !trace.trace_pgfaults &&
3193 	    trace.evlist->nr_entries == 0 /* Was --events used? */) {
3194 		pr_err("Please specify something to trace.\n");
3195 		return -1;
3196 	}
3197 
3198 	if (!trace.trace_syscalls && trace.ev_qualifier) {
3199 		pr_err("The -e option can't be used with --no-syscalls.\n");
3200 		goto out;
3201 	}
3202 
3203 	if (output_name != NULL) {
3204 		err = trace__open_output(&trace, output_name);
3205 		if (err < 0) {
3206 			perror("failed to create output file");
3207 			goto out;
3208 		}
3209 	}
3210 
3211 	trace.open_id = syscalltbl__id(trace.sctbl, "open");
3212 
3213 	err = target__validate(&trace.opts.target);
3214 	if (err) {
3215 		target__strerror(&trace.opts.target, err, bf, sizeof(bf));
3216 		fprintf(trace.output, "%s", bf);
3217 		goto out_close;
3218 	}
3219 
3220 	err = target__parse_uid(&trace.opts.target);
3221 	if (err) {
3222 		target__strerror(&trace.opts.target, err, bf, sizeof(bf));
3223 		fprintf(trace.output, "%s", bf);
3224 		goto out_close;
3225 	}
3226 
3227 	if (!argc && target__none(&trace.opts.target))
3228 		trace.opts.target.system_wide = true;
3229 
3230 	if (input_name)
3231 		err = trace__replay(&trace);
3232 	else
3233 		err = trace__run(&trace, argc, argv);
3234 
3235 out_close:
3236 	if (output_name != NULL)
3237 		fclose(trace.output);
3238 out:
3239 	return err;
3240 }
3241