xref: /openbmc/linux/tools/perf/builtin-trace.c (revision 82e6fdd6)
1 /*
2  * builtin-trace.c
3  *
4  * Builtin 'trace' command:
5  *
6  * Display a continuously updated trace of any workload, CPU, specific PID,
7  * system wide, etc.  Default format is loosely strace like, but any other
8  * event may be specified using --event.
9  *
10  * Copyright (C) 2012, 2013, 2014, 2015 Red Hat Inc, Arnaldo Carvalho de Melo <acme@redhat.com>
11  *
12  * Initially based on the 'trace' prototype by Thomas Gleixner:
13  *
14  * http://lwn.net/Articles/415728/ ("Announcing a new utility: 'trace'")
15  *
16  * Released under the GPL v2. (and only v2, not any later version)
17  */
18 
19 #include <traceevent/event-parse.h>
20 #include <api/fs/tracing_path.h>
21 #include "builtin.h"
22 #include "util/cgroup.h"
23 #include "util/color.h"
24 #include "util/debug.h"
25 #include "util/env.h"
26 #include "util/event.h"
27 #include "util/evlist.h"
28 #include <subcmd/exec-cmd.h>
29 #include "util/machine.h"
30 #include "util/path.h"
31 #include "util/session.h"
32 #include "util/thread.h"
33 #include <subcmd/parse-options.h>
34 #include "util/strlist.h"
35 #include "util/intlist.h"
36 #include "util/thread_map.h"
37 #include "util/stat.h"
38 #include "trace/beauty/beauty.h"
39 #include "trace-event.h"
40 #include "util/parse-events.h"
41 #include "util/bpf-loader.h"
42 #include "callchain.h"
43 #include "print_binary.h"
44 #include "string2.h"
45 #include "syscalltbl.h"
46 #include "rb_resort.h"
47 
48 #include <errno.h>
49 #include <inttypes.h>
50 #include <poll.h>
51 #include <signal.h>
52 #include <stdlib.h>
53 #include <string.h>
54 #include <linux/err.h>
55 #include <linux/filter.h>
56 #include <linux/kernel.h>
57 #include <linux/random.h>
58 #include <linux/stringify.h>
59 #include <linux/time64.h>
60 #include <fcntl.h>
61 
62 #include "sane_ctype.h"
63 
64 #ifndef O_CLOEXEC
65 # define O_CLOEXEC		02000000
66 #endif
67 
68 #ifndef F_LINUX_SPECIFIC_BASE
69 # define F_LINUX_SPECIFIC_BASE	1024
70 #endif
71 
72 struct trace {
73 	struct perf_tool	tool;
74 	struct syscalltbl	*sctbl;
75 	struct {
76 		int		max;
77 		struct syscall  *table;
78 		struct {
79 			struct perf_evsel *sys_enter,
80 					  *sys_exit;
81 		}		events;
82 	} syscalls;
83 	struct record_opts	opts;
84 	struct perf_evlist	*evlist;
85 	struct machine		*host;
86 	struct thread		*current;
87 	struct cgroup		*cgroup;
88 	u64			base_time;
89 	FILE			*output;
90 	unsigned long		nr_events;
91 	struct strlist		*ev_qualifier;
92 	struct {
93 		size_t		nr;
94 		int		*entries;
95 	}			ev_qualifier_ids;
96 	struct {
97 		size_t		nr;
98 		pid_t		*entries;
99 	}			filter_pids;
100 	double			duration_filter;
101 	double			runtime_ms;
102 	struct {
103 		u64		vfs_getname,
104 				proc_getname;
105 	} stats;
106 	unsigned int		max_stack;
107 	unsigned int		min_stack;
108 	bool			not_ev_qualifier;
109 	bool			live;
110 	bool			full_time;
111 	bool			sched;
112 	bool			multiple_threads;
113 	bool			summary;
114 	bool			summary_only;
115 	bool			show_comm;
116 	bool			print_sample;
117 	bool			show_tool_stats;
118 	bool			trace_syscalls;
119 	bool			kernel_syscallchains;
120 	bool			force;
121 	bool			vfs_getname;
122 	int			trace_pgfaults;
123 	int			open_id;
124 };
125 
126 struct tp_field {
127 	int offset;
128 	union {
129 		u64 (*integer)(struct tp_field *field, struct perf_sample *sample);
130 		void *(*pointer)(struct tp_field *field, struct perf_sample *sample);
131 	};
132 };
133 
134 #define TP_UINT_FIELD(bits) \
135 static u64 tp_field__u##bits(struct tp_field *field, struct perf_sample *sample) \
136 { \
137 	u##bits value; \
138 	memcpy(&value, sample->raw_data + field->offset, sizeof(value)); \
139 	return value;  \
140 }
141 
142 TP_UINT_FIELD(8);
143 TP_UINT_FIELD(16);
144 TP_UINT_FIELD(32);
145 TP_UINT_FIELD(64);
146 
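/*
 * For reference, TP_UINT_FIELD(16) expands to roughly:
 *
 *	static u64 tp_field__u16(struct tp_field *field, struct perf_sample *sample)
 *	{
 *		u16 value;
 *		memcpy(&value, sample->raw_data + field->offset, sizeof(value));
 *		return value;
 *	}
 *
 * i.e. one fixed-width accessor per field size, from which
 * tp_field__init_uint() below picks based on the format field's size.
 */
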
147 #define TP_UINT_FIELD__SWAPPED(bits) \
148 static u64 tp_field__swapped_u##bits(struct tp_field *field, struct perf_sample *sample) \
149 { \
150 	u##bits value; \
151 	memcpy(&value, sample->raw_data + field->offset, sizeof(value)); \
152 	return bswap_##bits(value);\
153 }
154 
155 TP_UINT_FIELD__SWAPPED(16);
156 TP_UINT_FIELD__SWAPPED(32);
157 TP_UINT_FIELD__SWAPPED(64);
158 
159 static int tp_field__init_uint(struct tp_field *field,
160 			       struct format_field *format_field,
161 			       bool needs_swap)
162 {
163 	field->offset = format_field->offset;
164 
165 	switch (format_field->size) {
166 	case 1:
167 		field->integer = tp_field__u8;
168 		break;
169 	case 2:
170 		field->integer = needs_swap ? tp_field__swapped_u16 : tp_field__u16;
171 		break;
172 	case 4:
173 		field->integer = needs_swap ? tp_field__swapped_u32 : tp_field__u32;
174 		break;
175 	case 8:
176 		field->integer = needs_swap ? tp_field__swapped_u64 : tp_field__u64;
177 		break;
178 	default:
179 		return -1;
180 	}
181 
182 	return 0;
183 }
184 
185 static void *tp_field__ptr(struct tp_field *field, struct perf_sample *sample)
186 {
187 	return sample->raw_data + field->offset;
188 }
189 
190 static int tp_field__init_ptr(struct tp_field *field, struct format_field *format_field)
191 {
192 	field->offset = format_field->offset;
193 	field->pointer = tp_field__ptr;
194 	return 0;
195 }
196 
197 struct syscall_tp {
198 	struct tp_field id;
199 	union {
200 		struct tp_field args, ret;
201 	};
202 };
203 
204 static int perf_evsel__init_tp_uint_field(struct perf_evsel *evsel,
205 					  struct tp_field *field,
206 					  const char *name)
207 {
208 	struct format_field *format_field = perf_evsel__field(evsel, name);
209 
210 	if (format_field == NULL)
211 		return -1;
212 
213 	return tp_field__init_uint(field, format_field, evsel->needs_swap);
214 }
215 
216 #define perf_evsel__init_sc_tp_uint_field(evsel, name) \
217 	({ struct syscall_tp *sc = evsel->priv;\
218 	   perf_evsel__init_tp_uint_field(evsel, &sc->name, #name); })
219 
220 static int perf_evsel__init_tp_ptr_field(struct perf_evsel *evsel,
221 					 struct tp_field *field,
222 					 const char *name)
223 {
224 	struct format_field *format_field = perf_evsel__field(evsel, name);
225 
226 	if (format_field == NULL)
227 		return -1;
228 
229 	return tp_field__init_ptr(field, format_field);
230 }
231 
232 #define perf_evsel__init_sc_tp_ptr_field(evsel, name) \
233 	({ struct syscall_tp *sc = evsel->priv;\
234 	   perf_evsel__init_tp_ptr_field(evsel, &sc->name, #name); })
235 
236 static void perf_evsel__delete_priv(struct perf_evsel *evsel)
237 {
238 	zfree(&evsel->priv);
239 	perf_evsel__delete(evsel);
240 }
241 
242 static int perf_evsel__init_syscall_tp(struct perf_evsel *evsel, void *handler)
243 {
244 	evsel->priv = malloc(sizeof(struct syscall_tp));
245 	if (evsel->priv != NULL) {
246 		if (perf_evsel__init_sc_tp_uint_field(evsel, id))
247 			goto out_delete;
248 
249 		evsel->handler = handler;
250 		return 0;
251 	}
252 
253 	return -ENOMEM;
254 
255 out_delete:
256 	zfree(&evsel->priv);
257 	return -ENOENT;
258 }
259 
260 static struct perf_evsel *perf_evsel__syscall_newtp(const char *direction, void *handler)
261 {
262 	struct perf_evsel *evsel = perf_evsel__newtp("raw_syscalls", direction);
263 
264 	/* older kernels (e.g., RHEL6) use syscalls:{enter,exit} */
265 	if (IS_ERR(evsel))
266 		evsel = perf_evsel__newtp("syscalls", direction);
267 
268 	if (IS_ERR(evsel))
269 		return NULL;
270 
271 	if (perf_evsel__init_syscall_tp(evsel, handler))
272 		goto out_delete;
273 
274 	return evsel;
275 
276 out_delete:
277 	perf_evsel__delete_priv(evsel);
278 	return NULL;
279 }
280 
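/*
 * Usage sketch: the event setup later in this file creates the pair
 * roughly as:
 *
 *	sys_enter = perf_evsel__syscall_newtp("sys_enter", trace__sys_enter);
 *	sys_exit  = perf_evsel__syscall_newtp("sys_exit",  trace__sys_exit);
 *
 * preferring raw_syscalls:sys_{enter,exit} and falling back to the
 * syscalls: variants on older kernels.
 */
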
281 #define perf_evsel__sc_tp_uint(evsel, name, sample) \
282 	({ struct syscall_tp *fields = evsel->priv; \
283 	   fields->name.integer(&fields->name, sample); })
284 
285 #define perf_evsel__sc_tp_ptr(evsel, name, sample) \
286 	({ struct syscall_tp *fields = evsel->priv; \
287 	   fields->name.pointer(&fields->name, sample); })
288 
289 size_t strarray__scnprintf(struct strarray *sa, char *bf, size_t size, const char *intfmt, int val)
290 {
291 	int idx = val - sa->offset;
292 
293 	if (idx < 0 || idx >= sa->nr_entries)
294 		return scnprintf(bf, size, intfmt, val);
295 
296 	return scnprintf(bf, size, "%s", sa->entries[idx]);
297 }
298 
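/*
 * Example: with strarray__epoll_ctl_ops defined below (offset = 1,
 * entries = { "ADD", "DEL", "MOD" }), val == 2 yields idx 1 and prints
 * "DEL", while an out-of-range val such as 9 falls back to intfmt and
 * prints "9".
 */
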
299 static size_t __syscall_arg__scnprintf_strarray(char *bf, size_t size,
300 						const char *intfmt,
301 					        struct syscall_arg *arg)
302 {
303 	return strarray__scnprintf(arg->parm, bf, size, intfmt, arg->val);
304 }
305 
306 static size_t syscall_arg__scnprintf_strarray(char *bf, size_t size,
307 					      struct syscall_arg *arg)
308 {
309 	return __syscall_arg__scnprintf_strarray(bf, size, "%d", arg);
310 }
311 
312 #define SCA_STRARRAY syscall_arg__scnprintf_strarray
313 
314 struct strarrays {
315 	int		nr_entries;
316 	struct strarray **entries;
317 };
318 
319 #define DEFINE_STRARRAYS(array) struct strarrays strarrays__##array = { \
320 	.nr_entries = ARRAY_SIZE(array), \
321 	.entries = array, \
322 }
323 
324 size_t syscall_arg__scnprintf_strarrays(char *bf, size_t size,
325 					struct syscall_arg *arg)
326 {
327 	struct strarrays *sas = arg->parm;
328 	int i;
329 
330 	for (i = 0; i < sas->nr_entries; ++i) {
331 		struct strarray *sa = sas->entries[i];
332 		int idx = arg->val - sa->offset;
333 
334 		if (idx >= 0 && idx < sa->nr_entries) {
335 			if (sa->entries[idx] == NULL)
336 				break;
337 			return scnprintf(bf, size, "%s", sa->entries[idx]);
338 		}
339 	}
340 
341 	return scnprintf(bf, size, "%d", arg->val);
342 }
343 
344 #ifndef AT_FDCWD
345 #define AT_FDCWD	-100
346 #endif
347 
348 static size_t syscall_arg__scnprintf_fd_at(char *bf, size_t size,
349 					   struct syscall_arg *arg)
350 {
351 	int fd = arg->val;
352 
353 	if (fd == AT_FDCWD)
354 		return scnprintf(bf, size, "CWD");
355 
356 	return syscall_arg__scnprintf_fd(bf, size, arg);
357 }
358 
359 #define SCA_FDAT syscall_arg__scnprintf_fd_at
360 
361 static size_t syscall_arg__scnprintf_close_fd(char *bf, size_t size,
362 					      struct syscall_arg *arg);
363 
364 #define SCA_CLOSE_FD syscall_arg__scnprintf_close_fd
365 
366 size_t syscall_arg__scnprintf_hex(char *bf, size_t size, struct syscall_arg *arg)
367 {
368 	return scnprintf(bf, size, "%#lx", arg->val);
369 }
370 
371 size_t syscall_arg__scnprintf_int(char *bf, size_t size, struct syscall_arg *arg)
372 {
373 	return scnprintf(bf, size, "%d", arg->val);
374 }
375 
376 size_t syscall_arg__scnprintf_long(char *bf, size_t size, struct syscall_arg *arg)
377 {
378 	return scnprintf(bf, size, "%ld", arg->val);
379 }
380 
381 static const char *bpf_cmd[] = {
382 	"MAP_CREATE", "MAP_LOOKUP_ELEM", "MAP_UPDATE_ELEM", "MAP_DELETE_ELEM",
383 	"MAP_GET_NEXT_KEY", "PROG_LOAD",
384 };
385 static DEFINE_STRARRAY(bpf_cmd);
386 
387 static const char *epoll_ctl_ops[] = { "ADD", "DEL", "MOD", };
388 static DEFINE_STRARRAY_OFFSET(epoll_ctl_ops, 1);
389 
390 static const char *itimers[] = { "REAL", "VIRTUAL", "PROF", };
391 static DEFINE_STRARRAY(itimers);
392 
393 static const char *keyctl_options[] = {
394 	"GET_KEYRING_ID", "JOIN_SESSION_KEYRING", "UPDATE", "REVOKE", "CHOWN",
395 	"SETPERM", "DESCRIBE", "CLEAR", "LINK", "UNLINK", "SEARCH", "READ",
396 	"INSTANTIATE", "NEGATE", "SET_REQKEY_KEYRING", "SET_TIMEOUT",
397 	"ASSUME_AUTHORITY", "GET_SECURITY", "SESSION_TO_PARENT", "REJECT",
398 	"INSTANTIATE_IOV", "INVALIDATE", "GET_PERSISTENT",
399 };
400 static DEFINE_STRARRAY(keyctl_options);
401 
402 static const char *whences[] = { "SET", "CUR", "END",
403 #ifdef SEEK_DATA
404 "DATA",
405 #endif
406 #ifdef SEEK_HOLE
407 "HOLE",
408 #endif
409 };
410 static DEFINE_STRARRAY(whences);
411 
412 static const char *fcntl_cmds[] = {
413 	"DUPFD", "GETFD", "SETFD", "GETFL", "SETFL", "GETLK", "SETLK",
414 	"SETLKW", "SETOWN", "GETOWN", "SETSIG", "GETSIG", "GETLK64",
415 	"SETLK64", "SETLKW64", "SETOWN_EX", "GETOWN_EX",
416 	"GETOWNER_UIDS",
417 };
418 static DEFINE_STRARRAY(fcntl_cmds);
419 
420 static const char *fcntl_linux_specific_cmds[] = {
421 	"SETLEASE", "GETLEASE", "NOTIFY", [5] =	"CANCELLK", "DUPFD_CLOEXEC",
422 	"SETPIPE_SZ", "GETPIPE_SZ", "ADD_SEALS", "GET_SEALS",
423 	"GET_RW_HINT", "SET_RW_HINT", "GET_FILE_RW_HINT", "SET_FILE_RW_HINT",
424 };
425 
426 static DEFINE_STRARRAY_OFFSET(fcntl_linux_specific_cmds, F_LINUX_SPECIFIC_BASE);
427 
428 static struct strarray *fcntl_cmds_arrays[] = {
429 	&strarray__fcntl_cmds,
430 	&strarray__fcntl_linux_specific_cmds,
431 };
432 
433 static DEFINE_STRARRAYS(fcntl_cmds_arrays);
434 
435 static const char *rlimit_resources[] = {
436 	"CPU", "FSIZE", "DATA", "STACK", "CORE", "RSS", "NPROC", "NOFILE",
437 	"MEMLOCK", "AS", "LOCKS", "SIGPENDING", "MSGQUEUE", "NICE", "RTPRIO",
438 	"RTTIME",
439 };
440 static DEFINE_STRARRAY(rlimit_resources);
441 
442 static const char *sighow[] = { "BLOCK", "UNBLOCK", "SETMASK", };
443 static DEFINE_STRARRAY(sighow);
444 
445 static const char *clockid[] = {
446 	"REALTIME", "MONOTONIC", "PROCESS_CPUTIME_ID", "THREAD_CPUTIME_ID",
447 	"MONOTONIC_RAW", "REALTIME_COARSE", "MONOTONIC_COARSE", "BOOTTIME",
448 	"REALTIME_ALARM", "BOOTTIME_ALARM", "SGI_CYCLE", "TAI"
449 };
450 static DEFINE_STRARRAY(clockid);
451 
452 static const char *socket_families[] = {
453 	"UNSPEC", "LOCAL", "INET", "AX25", "IPX", "APPLETALK", "NETROM",
454 	"BRIDGE", "ATMPVC", "X25", "INET6", "ROSE", "DECnet", "NETBEUI",
455 	"SECURITY", "KEY", "NETLINK", "PACKET", "ASH", "ECONET", "ATMSVC",
456 	"RDS", "SNA", "IRDA", "PPPOX", "WANPIPE", "LLC", "IB", "CAN", "TIPC",
457 	"BLUETOOTH", "IUCV", "RXRPC", "ISDN", "PHONET", "IEEE802154", "CAIF",
458 	"ALG", "NFC", "VSOCK",
459 };
460 static DEFINE_STRARRAY(socket_families);
461 
462 static size_t syscall_arg__scnprintf_access_mode(char *bf, size_t size,
463 						 struct syscall_arg *arg)
464 {
465 	size_t printed = 0;
466 	int mode = arg->val;
467 
468 	if (mode == F_OK) /* 0 */
469 		return scnprintf(bf, size, "F");
470 #define	P_MODE(n) \
471 	if (mode & n##_OK) { \
472 		printed += scnprintf(bf + printed, size - printed, "%s", #n); \
473 		mode &= ~n##_OK; \
474 	}
475 
476 	P_MODE(R);
477 	P_MODE(W);
478 	P_MODE(X);
479 #undef P_MODE
480 
481 	if (mode)
482 		printed += scnprintf(bf + printed, size - printed, "|%#x", mode);
483 
484 	return printed;
485 }
486 
487 #define SCA_ACCMODE syscall_arg__scnprintf_access_mode
488 
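/*
 * Example: a mode of R_OK|W_OK is printed by the P_MODE expansions above
 * as "RW", F_OK (0) as "F", and any leftover unknown bits are appended in
 * hex, e.g. "R|0x80".
 */
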
489 static size_t syscall_arg__scnprintf_filename(char *bf, size_t size,
490 					      struct syscall_arg *arg);
491 
492 #define SCA_FILENAME syscall_arg__scnprintf_filename
493 
494 static size_t syscall_arg__scnprintf_pipe_flags(char *bf, size_t size,
495 						struct syscall_arg *arg)
496 {
497 	int printed = 0, flags = arg->val;
498 
499 #define	P_FLAG(n) \
500 	if (flags & O_##n) { \
501 		printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
502 		flags &= ~O_##n; \
503 	}
504 
505 	P_FLAG(CLOEXEC);
506 	P_FLAG(NONBLOCK);
507 #undef P_FLAG
508 
509 	if (flags)
510 		printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);
511 
512 	return printed;
513 }
514 
515 #define SCA_PIPE_FLAGS syscall_arg__scnprintf_pipe_flags
516 
517 #ifndef GRND_NONBLOCK
518 #define GRND_NONBLOCK	0x0001
519 #endif
520 #ifndef GRND_RANDOM
521 #define GRND_RANDOM	0x0002
522 #endif
523 
524 static size_t syscall_arg__scnprintf_getrandom_flags(char *bf, size_t size,
525 						   struct syscall_arg *arg)
526 {
527 	int printed = 0, flags = arg->val;
528 
529 #define	P_FLAG(n) \
530 	if (flags & GRND_##n) { \
531 		printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
532 		flags &= ~GRND_##n; \
533 	}
534 
535 	P_FLAG(RANDOM);
536 	P_FLAG(NONBLOCK);
537 #undef P_FLAG
538 
539 	if (flags)
540 		printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);
541 
542 	return printed;
543 }
544 
545 #define SCA_GETRANDOM_FLAGS syscall_arg__scnprintf_getrandom_flags
546 
547 #define STRARRAY(name, array) \
548 	  { .scnprintf	= SCA_STRARRAY, \
549 	    .parm	= &strarray__##array, }
550 
551 #include "trace/beauty/arch_errno_names.c"
552 #include "trace/beauty/eventfd.c"
553 #include "trace/beauty/futex_op.c"
554 #include "trace/beauty/futex_val3.c"
555 #include "trace/beauty/mmap.c"
556 #include "trace/beauty/mode_t.c"
557 #include "trace/beauty/msg_flags.c"
558 #include "trace/beauty/open_flags.c"
559 #include "trace/beauty/perf_event_open.c"
560 #include "trace/beauty/pid.c"
561 #include "trace/beauty/sched_policy.c"
562 #include "trace/beauty/seccomp.c"
563 #include "trace/beauty/signum.c"
564 #include "trace/beauty/socket_type.c"
565 #include "trace/beauty/waitid_options.c"
566 
567 struct syscall_arg_fmt {
568 	size_t	   (*scnprintf)(char *bf, size_t size, struct syscall_arg *arg);
569 	void	   *parm;
570 	const char *name;
571 	bool	   show_zero;
572 };
573 
574 static struct syscall_fmt {
575 	const char *name;
576 	const char *alias;
577 	struct syscall_arg_fmt arg[6];
578 	u8	   nr_args;
579 	bool	   errpid;
580 	bool	   timeout;
581 	bool	   hexret;
582 } syscall_fmts[] = {
583 	{ .name	    = "access",
584 	  .arg = { [1] = { .scnprintf = SCA_ACCMODE,  /* mode */ }, }, },
585 	{ .name	    = "bpf",
586 	  .arg = { [0] = STRARRAY(cmd, bpf_cmd), }, },
587 	{ .name	    = "brk",	    .hexret = true,
588 	  .arg = { [0] = { .scnprintf = SCA_HEX, /* brk */ }, }, },
589 	{ .name     = "clock_gettime",
590 	  .arg = { [0] = STRARRAY(clk_id, clockid), }, },
591 	{ .name	    = "clone",	    .errpid = true, .nr_args = 5,
592 	  .arg = { [0] = { .name = "flags",	    .scnprintf = SCA_CLONE_FLAGS, },
593 		   [1] = { .name = "child_stack",   .scnprintf = SCA_HEX, },
594 		   [2] = { .name = "parent_tidptr", .scnprintf = SCA_HEX, },
595 		   [3] = { .name = "child_tidptr",  .scnprintf = SCA_HEX, },
596 		   [4] = { .name = "tls",	    .scnprintf = SCA_HEX, }, }, },
597 	{ .name	    = "close",
598 	  .arg = { [0] = { .scnprintf = SCA_CLOSE_FD, /* fd */ }, }, },
599 	{ .name	    = "epoll_ctl",
600 	  .arg = { [1] = STRARRAY(op, epoll_ctl_ops), }, },
601 	{ .name	    = "eventfd2",
602 	  .arg = { [1] = { .scnprintf = SCA_EFD_FLAGS, /* flags */ }, }, },
603 	{ .name	    = "fchmodat",
604 	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, },
605 	{ .name	    = "fchownat",
606 	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, },
607 	{ .name	    = "fcntl",
608 	  .arg = { [1] = { .scnprintf = SCA_FCNTL_CMD, /* cmd */
609 			   .parm      = &strarrays__fcntl_cmds_arrays,
610 			   .show_zero = true, },
611 		   [2] = { .scnprintf =  SCA_FCNTL_ARG, /* arg */ }, }, },
612 	{ .name	    = "flock",
613 	  .arg = { [1] = { .scnprintf = SCA_FLOCK, /* cmd */ }, }, },
614 	{ .name	    = "fstat", .alias = "newfstat", },
615 	{ .name	    = "fstatat", .alias = "newfstatat", },
616 	{ .name	    = "futex",
617 	  .arg = { [1] = { .scnprintf = SCA_FUTEX_OP, /* op */ },
618 		   [5] = { .scnprintf = SCA_FUTEX_VAL3, /* val3 */ }, }, },
619 	{ .name	    = "futimesat",
620 	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, },
621 	{ .name	    = "getitimer",
622 	  .arg = { [0] = STRARRAY(which, itimers), }, },
623 	{ .name	    = "getpgid",    .errpid = true, },
624 	{ .name	    = "getpid",	    .errpid = true, },
625 	{ .name	    = "getppid",    .errpid = true, },
626 	{ .name	    = "getrandom",
627 	  .arg = { [2] = { .scnprintf = SCA_GETRANDOM_FLAGS, /* flags */ }, }, },
628 	{ .name	    = "getrlimit",
629 	  .arg = { [0] = STRARRAY(resource, rlimit_resources), }, },
630 	{ .name	    = "gettid",	    .errpid = true, },
631 	{ .name	    = "ioctl",
632 	  .arg = {
633 #if defined(__i386__) || defined(__x86_64__)
634 /*
635  * FIXME: Make this available to all arches.
636  */
637 		   [1] = { .scnprintf = SCA_IOCTL_CMD, /* cmd */ },
638 		   [2] = { .scnprintf = SCA_HEX, /* arg */ }, }, },
639 #else
640 		   [2] = { .scnprintf = SCA_HEX, /* arg */ }, }, },
641 #endif
642 	{ .name	    = "kcmp",	    .nr_args = 5,
643 	  .arg = { [0] = { .name = "pid1",	.scnprintf = SCA_PID, },
644 		   [1] = { .name = "pid2",	.scnprintf = SCA_PID, },
645 		   [2] = { .name = "type",	.scnprintf = SCA_KCMP_TYPE, },
646 		   [3] = { .name = "idx1",	.scnprintf = SCA_KCMP_IDX, },
647 		   [4] = { .name = "idx2",	.scnprintf = SCA_KCMP_IDX, }, }, },
648 	{ .name	    = "keyctl",
649 	  .arg = { [0] = STRARRAY(option, keyctl_options), }, },
650 	{ .name	    = "kill",
651 	  .arg = { [1] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
652 	{ .name	    = "linkat",
653 	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, },
654 	{ .name	    = "lseek",
655 	  .arg = { [2] = STRARRAY(whence, whences), }, },
656 	{ .name	    = "lstat", .alias = "newlstat", },
657 	{ .name     = "madvise",
658 	  .arg = { [0] = { .scnprintf = SCA_HEX,      /* start */ },
659 		   [2] = { .scnprintf = SCA_MADV_BHV, /* behavior */ }, }, },
660 	{ .name	    = "mkdirat",
661 	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, },
662 	{ .name	    = "mknodat",
663 	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, },
664 	{ .name	    = "mlock",
665 	  .arg = { [0] = { .scnprintf = SCA_HEX, /* addr */ }, }, },
666 	{ .name	    = "mlockall",
667 	  .arg = { [0] = { .scnprintf = SCA_HEX, /* addr */ }, }, },
668 	{ .name	    = "mmap",	    .hexret = true,
669 /* The standard mmap maps to old_mmap on s390x */
670 #if defined(__s390x__)
671 	.alias = "old_mmap",
672 #endif
673 	  .arg = { [0] = { .scnprintf = SCA_HEX,	/* addr */ },
674 		   [2] = { .scnprintf = SCA_MMAP_PROT,	/* prot */ },
675 		   [3] = { .scnprintf = SCA_MMAP_FLAGS,	/* flags */ }, }, },
676 	{ .name	    = "mprotect",
677 	  .arg = { [0] = { .scnprintf = SCA_HEX,	/* start */ },
678 		   [2] = { .scnprintf = SCA_MMAP_PROT,	/* prot */ }, }, },
679 	{ .name	    = "mq_unlink",
680 	  .arg = { [0] = { .scnprintf = SCA_FILENAME, /* u_name */ }, }, },
681 	{ .name	    = "mremap",	    .hexret = true,
682 	  .arg = { [0] = { .scnprintf = SCA_HEX,	  /* addr */ },
683 		   [3] = { .scnprintf = SCA_MREMAP_FLAGS, /* flags */ },
684 		   [4] = { .scnprintf = SCA_HEX,	  /* new_addr */ }, }, },
685 	{ .name	    = "munlock",
686 	  .arg = { [0] = { .scnprintf = SCA_HEX, /* addr */ }, }, },
687 	{ .name	    = "munmap",
688 	  .arg = { [0] = { .scnprintf = SCA_HEX, /* addr */ }, }, },
689 	{ .name	    = "name_to_handle_at",
690 	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, },
691 	{ .name	    = "newfstatat",
692 	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, },
693 	{ .name	    = "open",
694 	  .arg = { [1] = { .scnprintf = SCA_OPEN_FLAGS, /* flags */ }, }, },
695 	{ .name	    = "open_by_handle_at",
696 	  .arg = { [0] = { .scnprintf = SCA_FDAT,	/* dfd */ },
697 		   [2] = { .scnprintf = SCA_OPEN_FLAGS, /* flags */ }, }, },
698 	{ .name	    = "openat",
699 	  .arg = { [0] = { .scnprintf = SCA_FDAT,	/* dfd */ },
700 		   [2] = { .scnprintf = SCA_OPEN_FLAGS, /* flags */ }, }, },
701 	{ .name	    = "perf_event_open",
702 	  .arg = { [2] = { .scnprintf = SCA_INT,	/* cpu */ },
703 		   [3] = { .scnprintf = SCA_FD,		/* group_fd */ },
704 		   [4] = { .scnprintf = SCA_PERF_FLAGS, /* flags */ }, }, },
705 	{ .name	    = "pipe2",
706 	  .arg = { [1] = { .scnprintf = SCA_PIPE_FLAGS, /* flags */ }, }, },
707 	{ .name	    = "pkey_alloc",
708 	  .arg = { [1] = { .scnprintf = SCA_PKEY_ALLOC_ACCESS_RIGHTS,	/* access_rights */ }, }, },
709 	{ .name	    = "pkey_free",
710 	  .arg = { [0] = { .scnprintf = SCA_INT,	/* key */ }, }, },
711 	{ .name	    = "pkey_mprotect",
712 	  .arg = { [0] = { .scnprintf = SCA_HEX,	/* start */ },
713 		   [2] = { .scnprintf = SCA_MMAP_PROT,	/* prot */ },
714 		   [3] = { .scnprintf = SCA_INT,	/* pkey */ }, }, },
715 	{ .name	    = "poll", .timeout = true, },
716 	{ .name	    = "ppoll", .timeout = true, },
717 	{ .name	    = "prctl", .alias = "arch_prctl",
718 	  .arg = { [0] = { .scnprintf = SCA_PRCTL_OPTION, /* option */ },
719 		   [1] = { .scnprintf = SCA_PRCTL_ARG2, /* arg2 */ },
720 		   [2] = { .scnprintf = SCA_PRCTL_ARG3, /* arg3 */ }, }, },
721 	{ .name	    = "pread", .alias = "pread64", },
722 	{ .name	    = "preadv", .alias = "pread", },
723 	{ .name	    = "prlimit64",
724 	  .arg = { [1] = STRARRAY(resource, rlimit_resources), }, },
725 	{ .name	    = "pwrite", .alias = "pwrite64", },
726 	{ .name	    = "readlinkat",
727 	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, },
728 	{ .name	    = "recvfrom",
729 	  .arg = { [3] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, },
730 	{ .name	    = "recvmmsg",
731 	  .arg = { [3] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, },
732 	{ .name	    = "recvmsg",
733 	  .arg = { [2] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, },
734 	{ .name	    = "renameat",
735 	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, },
736 	{ .name	    = "rt_sigaction",
737 	  .arg = { [0] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
738 	{ .name	    = "rt_sigprocmask",
739 	  .arg = { [0] = STRARRAY(how, sighow), }, },
740 	{ .name	    = "rt_sigqueueinfo",
741 	  .arg = { [1] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
742 	{ .name	    = "rt_tgsigqueueinfo",
743 	  .arg = { [2] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
744 	{ .name	    = "sched_setscheduler",
745 	  .arg = { [1] = { .scnprintf = SCA_SCHED_POLICY, /* policy */ }, }, },
746 	{ .name	    = "seccomp",
747 	  .arg = { [0] = { .scnprintf = SCA_SECCOMP_OP,	   /* op */ },
748 		   [1] = { .scnprintf = SCA_SECCOMP_FLAGS, /* flags */ }, }, },
749 	{ .name	    = "select", .timeout = true, },
750 	{ .name	    = "sendmmsg",
751 	  .arg = { [3] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, },
752 	{ .name	    = "sendmsg",
753 	  .arg = { [2] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, },
754 	{ .name	    = "sendto",
755 	  .arg = { [3] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, },
756 	{ .name	    = "set_tid_address", .errpid = true, },
757 	{ .name	    = "setitimer",
758 	  .arg = { [0] = STRARRAY(which, itimers), }, },
759 	{ .name	    = "setrlimit",
760 	  .arg = { [0] = STRARRAY(resource, rlimit_resources), }, },
761 	{ .name	    = "socket",
762 	  .arg = { [0] = STRARRAY(family, socket_families),
763 		   [1] = { .scnprintf = SCA_SK_TYPE, /* type */ }, }, },
764 	{ .name	    = "socketpair",
765 	  .arg = { [0] = STRARRAY(family, socket_families),
766 		   [1] = { .scnprintf = SCA_SK_TYPE, /* type */ }, }, },
767 	{ .name	    = "stat", .alias = "newstat", },
768 	{ .name	    = "statx",
769 	  .arg = { [0] = { .scnprintf = SCA_FDAT,	 /* dfd */ },
770 		   [2] = { .scnprintf = SCA_STATX_FLAGS, /* flags */ },
771 		   [3] = { .scnprintf = SCA_STATX_MASK,	 /* mask */ }, }, },
772 	{ .name	    = "swapoff",
773 	  .arg = { [0] = { .scnprintf = SCA_FILENAME, /* specialfile */ }, }, },
774 	{ .name	    = "swapon",
775 	  .arg = { [0] = { .scnprintf = SCA_FILENAME, /* specialfile */ }, }, },
776 	{ .name	    = "symlinkat",
777 	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, },
778 	{ .name	    = "tgkill",
779 	  .arg = { [2] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
780 	{ .name	    = "tkill",
781 	  .arg = { [1] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
782 	{ .name	    = "uname", .alias = "newuname", },
783 	{ .name	    = "unlinkat",
784 	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, },
785 	{ .name	    = "utimensat",
786 	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* dirfd */ }, }, },
787 	{ .name	    = "wait4",	    .errpid = true,
788 	  .arg = { [2] = { .scnprintf = SCA_WAITID_OPTIONS, /* options */ }, }, },
789 	{ .name	    = "waitid",	    .errpid = true,
790 	  .arg = { [3] = { .scnprintf = SCA_WAITID_OPTIONS, /* options */ }, }, },
791 };
792 
793 static int syscall_fmt__cmp(const void *name, const void *fmtp)
794 {
795 	const struct syscall_fmt *fmt = fmtp;
796 	return strcmp(name, fmt->name);
797 }
798 
799 static struct syscall_fmt *syscall_fmt__find(const char *name)
800 {
801 	const int nmemb = ARRAY_SIZE(syscall_fmts);
802 	return bsearch(name, syscall_fmts, nmemb, sizeof(struct syscall_fmt), syscall_fmt__cmp);
803 }
804 
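/*
 * Note: syscall_fmt__find() bsearch()es syscall_fmts[], so the table must
 * be kept strictly sorted by ->name; e.g. syscall_fmt__find("open") lands
 * on the entry whose flags arg is beautified by SCA_OPEN_FLAGS.
 */
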
805 struct syscall {
806 	struct event_format *tp_format;
807 	int		    nr_args;
808 	struct format_field *args;
809 	const char	    *name;
810 	bool		    is_exit;
811 	struct syscall_fmt  *fmt;
812 	struct syscall_arg_fmt *arg_fmt;
813 };
814 
815 /*
816  * We need to have this 'calculated' boolean because in some cases we really
817  * don't know what the duration of a syscall is, for instance, when we start
818  * a session and some threads are waiting for a syscall to finish, say 'poll',
819  * in which case all we can do is print "( ? )" for the duration and for the
820  * start timestamp.
821  */
822 static size_t fprintf_duration(unsigned long t, bool calculated, FILE *fp)
823 {
824 	double duration = (double)t / NSEC_PER_MSEC;
825 	size_t printed = fprintf(fp, "(");
826 
827 	if (!calculated)
828 		printed += fprintf(fp, "         ");
829 	else if (duration >= 1.0)
830 		printed += color_fprintf(fp, PERF_COLOR_RED, "%6.3f ms", duration);
831 	else if (duration >= 0.01)
832 		printed += color_fprintf(fp, PERF_COLOR_YELLOW, "%6.3f ms", duration);
833 	else
834 		printed += color_fprintf(fp, PERF_COLOR_NORMAL, "%6.3f ms", duration);
835 	return printed + fprintf(fp, "): ");
836 }
837 
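/*
 * Example output: a calculated duration of 2.345 ms prints as
 * "( 2.345 ms): " in red (>= 1 ms), 0.042 ms in yellow, and an
 * uncalculated one as "(         ): ", padded to the same width.
 */
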
838 /**
839  * filename.ptr: The filename char pointer that will be vfs_getname'd
840  * filename.entry_str_pos: Where to insert the string translated from
841  *                         filename.ptr by the vfs_getname tracepoint/kprobe.
842  * ret_scnprintf: syscall args may set this to a different syscall return
843  *                formatter, for instance, fcntl may return fds, file flags, etc.
844  */
845 struct thread_trace {
846 	u64		  entry_time;
847 	bool		  entry_pending;
848 	unsigned long	  nr_events;
849 	unsigned long	  pfmaj, pfmin;
850 	char		  *entry_str;
851 	double		  runtime_ms;
852 	size_t		  (*ret_scnprintf)(char *bf, size_t size, struct syscall_arg *arg);
853 	struct {
854 		unsigned long ptr;
855 		short int     entry_str_pos;
856 		bool	      pending_open;
857 		unsigned int  namelen;
858 		char	      *name;
859 	} filename;
860 	struct {
861 		int	  max;
862 		char	  **table;
863 	} paths;
864 
865 	struct intlist *syscall_stats;
866 };
867 
868 static struct thread_trace *thread_trace__new(void)
869 {
870 	struct thread_trace *ttrace = zalloc(sizeof(struct thread_trace));
871 
872 	if (ttrace) {
873 		ttrace->paths.max = -1;
874 		ttrace->syscall_stats = intlist__new(NULL);
875 	}
876 
877 	return ttrace;
878 }
879 
880 static struct thread_trace *thread__trace(struct thread *thread, FILE *fp)
881 {
882 	struct thread_trace *ttrace;
883 
884 	if (thread == NULL)
885 		goto fail;
886 
887 	if (thread__priv(thread) == NULL)
888 		thread__set_priv(thread, thread_trace__new());
889 
890 	if (thread__priv(thread) == NULL)
891 		goto fail;
892 
893 	ttrace = thread__priv(thread);
894 	++ttrace->nr_events;
895 
896 	return ttrace;
897 fail:
898 	color_fprintf(fp, PERF_COLOR_RED,
899 		      "WARNING: not enough memory, dropping samples!\n");
900 	return NULL;
901 }
902 
903 
904 void syscall_arg__set_ret_scnprintf(struct syscall_arg *arg,
905 				    size_t (*ret_scnprintf)(char *bf, size_t size, struct syscall_arg *arg))
906 {
907 	struct thread_trace *ttrace = thread__priv(arg->thread);
908 
909 	ttrace->ret_scnprintf = ret_scnprintf;
910 }
911 
912 #define TRACE_PFMAJ		(1 << 0)
913 #define TRACE_PFMIN		(1 << 1)
914 
915 static const size_t trace__entry_str_size = 2048;
916 
917 static int trace__set_fd_pathname(struct thread *thread, int fd, const char *pathname)
918 {
919 	struct thread_trace *ttrace = thread__priv(thread);
920 
921 	if (fd > ttrace->paths.max) {
922 		char **npath = realloc(ttrace->paths.table, (fd + 1) * sizeof(char *));
923 
924 		if (npath == NULL)
925 			return -1;
926 
927 		if (ttrace->paths.max != -1) {
928 			memset(npath + ttrace->paths.max + 1, 0,
929 			       (fd - ttrace->paths.max) * sizeof(char *));
930 		} else {
931 			memset(npath, 0, (fd + 1) * sizeof(char *));
932 		}
933 
934 		ttrace->paths.table = npath;
935 		ttrace->paths.max   = fd;
936 	}
937 
938 	ttrace->paths.table[fd] = strdup(pathname);
939 
940 	return ttrace->paths.table[fd] != NULL ? 0 : -1;
941 }
942 
943 static int thread__read_fd_path(struct thread *thread, int fd)
944 {
945 	char linkname[PATH_MAX], pathname[PATH_MAX];
946 	struct stat st;
947 	int ret;
948 
949 	if (thread->pid_ == thread->tid) {
950 		scnprintf(linkname, sizeof(linkname),
951 			  "/proc/%d/fd/%d", thread->pid_, fd);
952 	} else {
953 		scnprintf(linkname, sizeof(linkname),
954 			  "/proc/%d/task/%d/fd/%d", thread->pid_, thread->tid, fd);
955 	}
956 
957 	if (lstat(linkname, &st) < 0 || st.st_size + 1 > (off_t)sizeof(pathname))
958 		return -1;
959 
960 	ret = readlink(linkname, pathname, sizeof(pathname));
961 
962 	if (ret < 0 || ret > st.st_size)
963 		return -1;
964 
965 	pathname[ret] = '\0';
966 	return trace__set_fd_pathname(thread, fd, pathname);
967 }
968 
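/*
 * E.g. for fd 3 of single-threaded pid 1234 this amounts to:
 *
 *	readlink("/proc/1234/fd/3", pathname, sizeof(pathname))
 *
 * with the result ("/tmp/foo.txt", say) cached via trace__set_fd_pathname()
 * so that later pretty-printing of this fd doesn't hit /proc again.
 */
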
969 static const char *thread__fd_path(struct thread *thread, int fd,
970 				   struct trace *trace)
971 {
972 	struct thread_trace *ttrace = thread__priv(thread);
973 
974 	if (ttrace == NULL)
975 		return NULL;
976 
977 	if (fd < 0)
978 		return NULL;
979 
980 	if ((fd > ttrace->paths.max || ttrace->paths.table[fd] == NULL)) {
981 		if (!trace->live)
982 			return NULL;
983 		++trace->stats.proc_getname;
984 		if (thread__read_fd_path(thread, fd))
985 			return NULL;
986 	}
987 
988 	return ttrace->paths.table[fd];
989 }
990 
991 size_t syscall_arg__scnprintf_fd(char *bf, size_t size, struct syscall_arg *arg)
992 {
993 	int fd = arg->val;
994 	size_t printed = scnprintf(bf, size, "%d", fd);
995 	const char *path = thread__fd_path(arg->thread, fd, arg->trace);
996 
997 	if (path)
998 		printed += scnprintf(bf + printed, size - printed, "<%s>", path);
999 
1000 	return printed;
1001 }
1002 
1003 size_t pid__scnprintf_fd(struct trace *trace, pid_t pid, int fd, char *bf, size_t size)
1004 {
1005 	size_t printed = scnprintf(bf, size, "%d", fd);
1006 	struct thread *thread = machine__find_thread(trace->host, pid, pid);
1007 
1008 	if (thread) {
1009 		const char *path = thread__fd_path(thread, fd, trace);
1010 
1011 		if (path)
1012 			printed += scnprintf(bf + printed, size - printed, "<%s>", path);
1013 
1014 		thread__put(thread);
1015 	}
1016 
1017         return printed;
1018 	return printed;
1019 
1020 static size_t syscall_arg__scnprintf_close_fd(char *bf, size_t size,
1021 					      struct syscall_arg *arg)
1022 {
1023 	int fd = arg->val;
1024 	size_t printed = syscall_arg__scnprintf_fd(bf, size, arg);
1025 	struct thread_trace *ttrace = thread__priv(arg->thread);
1026 
1027 	if (ttrace && fd >= 0 && fd <= ttrace->paths.max)
1028 		zfree(&ttrace->paths.table[fd]);
1029 
1030 	return printed;
1031 }
1032 
1033 static void thread__set_filename_pos(struct thread *thread, const char *bf,
1034 				     unsigned long ptr)
1035 {
1036 	struct thread_trace *ttrace = thread__priv(thread);
1037 
1038 	ttrace->filename.ptr = ptr;
1039 	ttrace->filename.entry_str_pos = bf - ttrace->entry_str;
1040 }
1041 
1042 static size_t syscall_arg__scnprintf_filename(char *bf, size_t size,
1043 					      struct syscall_arg *arg)
1044 {
1045 	unsigned long ptr = arg->val;
1046 
1047 	if (!arg->trace->vfs_getname)
1048 		return scnprintf(bf, size, "%#lx", ptr);
1049 
1050 	thread__set_filename_pos(arg->thread, bf, ptr);
1051 	return 0;
1052 }
1053 
1054 static bool trace__filter_duration(struct trace *trace, double t)
1055 {
1056 	return t < (trace->duration_filter * NSEC_PER_MSEC);
1057 }
1058 
1059 static size_t __trace__fprintf_tstamp(struct trace *trace, u64 tstamp, FILE *fp)
1060 {
1061 	double ts = (double)(tstamp - trace->base_time) / NSEC_PER_MSEC;
1062 
1063 	return fprintf(fp, "%10.3f ", ts);
1064 }
1065 
1066 /*
1067  * We're handling tstamp=0 as an undefined tstamp, i.e. like when we are
1068  * using ttrace->entry_time for a thread that receives a sys_exit without
1069  * first having received a sys_enter ("poll" issued before tracing session
1070  * starts, or a sys_enter lost due to ring buffer overflow).
1071  */
1072 static size_t trace__fprintf_tstamp(struct trace *trace, u64 tstamp, FILE *fp)
1073 {
1074 	if (tstamp > 0)
1075 		return __trace__fprintf_tstamp(trace, tstamp, fp);
1076 
1077 	return fprintf(fp, "         ? ");
1078 }
1079 
1080 static bool done = false;
1081 static bool interrupted = false;
1082 
1083 static void sig_handler(int sig)
1084 {
1085 	done = true;
1086 	interrupted = sig == SIGINT;
1087 }
1088 
1089 static size_t trace__fprintf_entry_head(struct trace *trace, struct thread *thread,
1090 					u64 duration, bool duration_calculated, u64 tstamp, FILE *fp)
1091 {
1092 	size_t printed = trace__fprintf_tstamp(trace, tstamp, fp);
1093 	printed += fprintf_duration(duration, duration_calculated, fp);
1094 
1095 	if (trace->multiple_threads) {
1096 		if (trace->show_comm)
1097 			printed += fprintf(fp, "%.14s/", thread__comm_str(thread));
1098 		printed += fprintf(fp, "%d ", thread->tid);
1099 	}
1100 
1101 	return printed;
1102 }
1103 
1104 static int trace__process_event(struct trace *trace, struct machine *machine,
1105 				union perf_event *event, struct perf_sample *sample)
1106 {
1107 	int ret = 0;
1108 
1109 	switch (event->header.type) {
1110 	case PERF_RECORD_LOST:
1111 		color_fprintf(trace->output, PERF_COLOR_RED,
1112 			      "LOST %" PRIu64 " events!\n", event->lost.lost);
1113 		ret = machine__process_lost_event(machine, event, sample);
1114 		break;
1115 	default:
1116 		ret = machine__process_event(machine, event, sample);
1117 		break;
1118 	}
1119 
1120 	return ret;
1121 }
1122 
1123 static int trace__tool_process(struct perf_tool *tool,
1124 			       union perf_event *event,
1125 			       struct perf_sample *sample,
1126 			       struct machine *machine)
1127 {
1128 	struct trace *trace = container_of(tool, struct trace, tool);
1129 	return trace__process_event(trace, machine, event, sample);
1130 }
1131 
1132 static char *trace__machine__resolve_kernel_addr(void *vmachine, unsigned long long *addrp, char **modp)
1133 {
1134 	struct machine *machine = vmachine;
1135 
1136 	if (machine->kptr_restrict_warned)
1137 		return NULL;
1138 
1139 	if (symbol_conf.kptr_restrict) {
1140 		pr_warning("Kernel address maps (/proc/{kallsyms,modules}) are restricted.\n\n"
1141 			   "Check /proc/sys/kernel/kptr_restrict.\n\n"
1142 			   "Kernel samples will not be resolved.\n");
1143 		machine->kptr_restrict_warned = true;
1144 		return NULL;
1145 	}
1146 
1147 	return machine__resolve_kernel_addr(vmachine, addrp, modp);
1148 }
1149 
1150 static int trace__symbols_init(struct trace *trace, struct perf_evlist *evlist)
1151 {
1152 	int err = symbol__init(NULL);
1153 
1154 	if (err)
1155 		return err;
1156 
1157 	trace->host = machine__new_host();
1158 	if (trace->host == NULL)
1159 		return -ENOMEM;
1160 
1161 	err = trace_event__register_resolver(trace->host, trace__machine__resolve_kernel_addr);
1162 	if (err < 0)
1163 		goto out;
1164 
1165 	err = __machine__synthesize_threads(trace->host, &trace->tool, &trace->opts.target,
1166 					    evlist->threads, trace__tool_process, false,
1167 					    trace->opts.proc_map_timeout, 1);
1168 out:
1169 	if (err)
1170 		symbol__exit();
1171 
1172 	return err;
1173 }
1174 
1175 static void trace__symbols__exit(struct trace *trace)
1176 {
1177 	machine__exit(trace->host);
1178 	trace->host = NULL;
1179 
1180 	symbol__exit();
1181 }
1182 
1183 static int syscall__alloc_arg_fmts(struct syscall *sc, int nr_args)
1184 {
1185 	int idx;
1186 
1187 	if (nr_args == 6 && sc->fmt && sc->fmt->nr_args != 0)
1188 		nr_args = sc->fmt->nr_args;
1189 
1190 	sc->arg_fmt = calloc(nr_args, sizeof(*sc->arg_fmt));
1191 	if (sc->arg_fmt == NULL)
1192 		return -1;
1193 
1194 	for (idx = 0; idx < nr_args; ++idx) {
1195 		if (sc->fmt)
1196 			sc->arg_fmt[idx] = sc->fmt->arg[idx];
1197 	}
1198 
1199 	sc->nr_args = nr_args;
1200 	return 0;
1201 }
1202 
1203 static int syscall__set_arg_fmts(struct syscall *sc)
1204 {
1205 	struct format_field *field;
1206 	int idx = 0, len;
1207 
1208 	for (field = sc->args; field; field = field->next, ++idx) {
1209 		if (sc->fmt && sc->fmt->arg[idx].scnprintf)
1210 			continue;
1211 
1212 		if (strcmp(field->type, "const char *") == 0 &&
1213 			 (strcmp(field->name, "filename") == 0 ||
1214 			  strcmp(field->name, "path") == 0 ||
1215 			  strcmp(field->name, "pathname") == 0))
1216 			sc->arg_fmt[idx].scnprintf = SCA_FILENAME;
1217 		else if (field->flags & FIELD_IS_POINTER)
1218 			sc->arg_fmt[idx].scnprintf = syscall_arg__scnprintf_hex;
1219 		else if (strcmp(field->type, "pid_t") == 0)
1220 			sc->arg_fmt[idx].scnprintf = SCA_PID;
1221 		else if (strcmp(field->type, "umode_t") == 0)
1222 			sc->arg_fmt[idx].scnprintf = SCA_MODE_T;
1223 		else if ((strcmp(field->type, "int") == 0 ||
1224 			  strcmp(field->type, "unsigned int") == 0 ||
1225 			  strcmp(field->type, "long") == 0) &&
1226 			 (len = strlen(field->name)) >= 2 &&
1227 			 strcmp(field->name + len - 2, "fd") == 0) {
1228 			/*
1229 			 * /sys/kernel/tracing/events/syscalls/sys_enter*
1230 			 * egrep 'field:.*fd;' .../format|sed -r 's/.*field:([a-z ]+) [a-z_]*fd.+/\1/g'|sort|uniq -c
1231 			 * 65 int
1232 			 * 23 unsigned int
1233 			 * 7 unsigned long
1234 			 */
1235 			sc->arg_fmt[idx].scnprintf = SCA_FD;
1236 		}
1237 	}
1238 
1239 	return 0;
1240 }
1241 
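/*
 * The heuristics above mean that, for instance, read(2)'s "unsigned int fd"
 * argument gets SCA_FD (and thus "3</etc/passwd>" style output) without
 * needing an explicit syscall_fmts[] entry for "read".
 */
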
1242 static int trace__read_syscall_info(struct trace *trace, int id)
1243 {
1244 	char tp_name[128];
1245 	struct syscall *sc;
1246 	const char *name = syscalltbl__name(trace->sctbl, id);
1247 
1248 	if (name == NULL)
1249 		return -1;
1250 
1251 	if (id > trace->syscalls.max) {
1252 		struct syscall *nsyscalls = realloc(trace->syscalls.table, (id + 1) * sizeof(*sc));
1253 
1254 		if (nsyscalls == NULL)
1255 			return -1;
1256 
1257 		if (trace->syscalls.max != -1) {
1258 			memset(nsyscalls + trace->syscalls.max + 1, 0,
1259 			       (id - trace->syscalls.max) * sizeof(*sc));
1260 		} else {
1261 			memset(nsyscalls, 0, (id + 1) * sizeof(*sc));
1262 		}
1263 
1264 		trace->syscalls.table = nsyscalls;
1265 		trace->syscalls.max   = id;
1266 	}
1267 
1268 	sc = trace->syscalls.table + id;
1269 	sc->name = name;
1270 
1271 	sc->fmt  = syscall_fmt__find(sc->name);
1272 
1273 	snprintf(tp_name, sizeof(tp_name), "sys_enter_%s", sc->name);
1274 	sc->tp_format = trace_event__tp_format("syscalls", tp_name);
1275 
1276 	if (IS_ERR(sc->tp_format) && sc->fmt && sc->fmt->alias) {
1277 		snprintf(tp_name, sizeof(tp_name), "sys_enter_%s", sc->fmt->alias);
1278 		sc->tp_format = trace_event__tp_format("syscalls", tp_name);
1279 	}
1280 
1281 	if (syscall__alloc_arg_fmts(sc, IS_ERR(sc->tp_format) ? 6 : sc->tp_format->format.nr_fields))
1282 		return -1;
1283 
1284 	if (IS_ERR(sc->tp_format))
1285 		return -1;
1286 
1287 	sc->args = sc->tp_format->format.fields;
1288 	/*
1289 	 * We need to check for and discard the first field, '__syscall_nr'
1290 	 * or 'nr', which holds the syscall number and is not needed here
1291 	 * ('nr' is the name this field has on older kernels).
1292 	 */
1293 	if (sc->args && (!strcmp(sc->args->name, "__syscall_nr") || !strcmp(sc->args->name, "nr"))) {
1294 		sc->args = sc->args->next;
1295 		--sc->nr_args;
1296 	}
1297 
1298 	sc->is_exit = !strcmp(name, "exit_group") || !strcmp(name, "exit");
1299 
1300 	return syscall__set_arg_fmts(sc);
1301 }
1302 
1303 static int trace__validate_ev_qualifier(struct trace *trace)
1304 {
1305 	int err = 0, i;
1306 	size_t nr_allocated;
1307 	struct str_node *pos;
1308 
1309 	trace->ev_qualifier_ids.nr = strlist__nr_entries(trace->ev_qualifier);
1310 	trace->ev_qualifier_ids.entries = malloc(trace->ev_qualifier_ids.nr *
1311 						 sizeof(trace->ev_qualifier_ids.entries[0]));
1312 
1313 	if (trace->ev_qualifier_ids.entries == NULL) {
1314 		fputs("Error:\tNot enough memory for allocating event qualifier ids\n",
1315 		       trace->output);
1316 		err = -EINVAL;
1317 		goto out;
1318 	}
1319 
1320 	nr_allocated = trace->ev_qualifier_ids.nr;
1321 	i = 0;
1322 
1323 	strlist__for_each_entry(pos, trace->ev_qualifier) {
1324 		const char *sc = pos->s;
1325 		int id = syscalltbl__id(trace->sctbl, sc), match_next = -1;
1326 
1327 		if (id < 0) {
1328 			id = syscalltbl__strglobmatch_first(trace->sctbl, sc, &match_next);
1329 			if (id >= 0)
1330 				goto matches;
1331 
1332 			if (err == 0) {
1333 				fputs("Error:\tInvalid syscall ", trace->output);
1334 				err = -EINVAL;
1335 			} else {
1336 				fputs(", ", trace->output);
1337 			}
1338 
1339 			fputs(sc, trace->output);
1340 		}
1341 matches:
1342 		trace->ev_qualifier_ids.entries[i++] = id;
1343 		if (match_next == -1)
1344 			continue;
1345 
1346 		while (1) {
1347 			id = syscalltbl__strglobmatch_next(trace->sctbl, sc, &match_next);
1348 			if (id < 0)
1349 				break;
1350 			if (nr_allocated == trace->ev_qualifier_ids.nr) {
1351 				void *entries;
1352 
1353 				nr_allocated += 8;
1354 				entries = realloc(trace->ev_qualifier_ids.entries,
1355 						  nr_allocated * sizeof(trace->ev_qualifier_ids.entries[0]));
1356 				if (entries == NULL) {
1357 					err = -ENOMEM;
1358 					fputs("\nError:\t Not enough memory for parsing\n", trace->output);
1359 					goto out_free;
1360 				}
1361 				trace->ev_qualifier_ids.entries = entries;
1362 			}
1363 			trace->ev_qualifier_ids.nr++;
1364 			trace->ev_qualifier_ids.entries[i++] = id;
1365 		}
1366 	}
1367 
1368 	if (err < 0) {
1369 		fputs("\nHint:\ttry 'perf list syscalls:sys_enter_*'"
1370 		      "\nHint:\tand: 'man syscalls'\n", trace->output);
1371 out_free:
1372 		zfree(&trace->ev_qualifier_ids.entries);
1373 		trace->ev_qualifier_ids.nr = 0;
1374 	}
1375 out:
1376 	return err;
1377 }
1378 
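/*
 * E.g. '-e open*,close' resolves "close" directly via syscalltbl__id(),
 * while "open*" glob-matches open, openat, open_by_handle_at, etc., with
 * the ids array grown in chunks of 8 as matches accumulate.
 */
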
1379 /*
1380  * args is to be interpreted as a series of longs but we need to handle
1381  * 8-byte unaligned accesses. args points to raw_data within the event
1382  * and raw_data is guaranteed to be 8-byte unaligned because it is
1383  * preceded by raw_size which is a u32. So we need to copy args to a temp
1384  * variable to read it. Most notably this avoids extended load instructions
1385  * on unaligned addresses.
1386  */
1387 unsigned long syscall_arg__val(struct syscall_arg *arg, u8 idx)
1388 {
1389 	unsigned long val;
1390 	unsigned char *p = arg->args + sizeof(unsigned long) * idx;
1391 
1392 	memcpy(&val, p, sizeof(val));
1393 	return val;
1394 }
1395 
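/*
 * E.g. syscall_arg__val(arg, 1) copies bytes 8..15 of the raw args area;
 * a direct "*(unsigned long *)(arg->args + 8)" could instead trap or take
 * an alignment fixup on strict-alignment architectures.
 */
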
1396 static size_t syscall__scnprintf_name(struct syscall *sc, char *bf, size_t size,
1397 				      struct syscall_arg *arg)
1398 {
1399 	if (sc->arg_fmt && sc->arg_fmt[arg->idx].name)
1400 		return scnprintf(bf, size, "%s: ", sc->arg_fmt[arg->idx].name);
1401 
1402 	return scnprintf(bf, size, "arg%d: ", arg->idx);
1403 }
1404 
1405 static size_t syscall__scnprintf_val(struct syscall *sc, char *bf, size_t size,
1406 				     struct syscall_arg *arg, unsigned long val)
1407 {
1408 	if (sc->arg_fmt && sc->arg_fmt[arg->idx].scnprintf) {
1409 		arg->val = val;
1410 		if (sc->arg_fmt[arg->idx].parm)
1411 			arg->parm = sc->arg_fmt[arg->idx].parm;
1412 		return sc->arg_fmt[arg->idx].scnprintf(bf, size, arg);
1413 	}
1414 	return scnprintf(bf, size, "%ld", val);
1415 }
1416 
1417 static size_t syscall__scnprintf_args(struct syscall *sc, char *bf, size_t size,
1418 				      unsigned char *args, struct trace *trace,
1419 				      struct thread *thread)
1420 {
1421 	size_t printed = 0;
1422 	unsigned long val;
1423 	u8 bit = 1;
1424 	struct syscall_arg arg = {
1425 		.args	= args,
1426 		.idx	= 0,
1427 		.mask	= 0,
1428 		.trace  = trace,
1429 		.thread = thread,
1430 	};
1431 	struct thread_trace *ttrace = thread__priv(thread);
1432 
1433 	/*
1434 	 * Things like fcntl will set this in its 'cmd' formatter to pick the
1435 	 * right formatter for the return value (an fd? file flags?), which is
1436 	 * not needed for syscalls that always return a given type, say an fd.
1437 	 */
1438 	ttrace->ret_scnprintf = NULL;
1439 
1440 	if (sc->args != NULL) {
1441 		struct format_field *field;
1442 
1443 		for (field = sc->args; field;
1444 		     field = field->next, ++arg.idx, bit <<= 1) {
1445 			if (arg.mask & bit)
1446 				continue;
1447 
1448 			val = syscall_arg__val(&arg, arg.idx);
1449 
1450 			/*
1451 			 * Suppress this argument if its value is zero and
1452 			 * we don't have a string associated with it in a
1453 			 * strarray.
1454 			 */
1455 			if (val == 0 &&
1456 			    !(sc->arg_fmt &&
1457 			      (sc->arg_fmt[arg.idx].show_zero ||
1458 			       sc->arg_fmt[arg.idx].scnprintf == SCA_STRARRAY ||
1459 			       sc->arg_fmt[arg.idx].scnprintf == SCA_STRARRAYS) &&
1460 			      sc->arg_fmt[arg.idx].parm))
1461 				continue;
1462 
1463 			printed += scnprintf(bf + printed, size - printed,
1464 					     "%s%s: ", printed ? ", " : "", field->name);
1465 			printed += syscall__scnprintf_val(sc, bf + printed, size - printed, &arg, val);
1466 		}
1467 	} else if (IS_ERR(sc->tp_format)) {
1468 		/*
1469 		 * If we managed to read the tracepoint /format file, then we
1470 		 * may end up not having any args, like with gettid(), so only
1471 		 * print the raw args when we didn't manage to read it.
1472 		 */
1473 		while (arg.idx < sc->nr_args) {
1474 			if (arg.mask & bit)
1475 				goto next_arg;
1476 			val = syscall_arg__val(&arg, arg.idx);
1477 			if (printed)
1478 				printed += scnprintf(bf + printed, size - printed, ", ");
1479 			printed += syscall__scnprintf_name(sc, bf + printed, size - printed, &arg);
1480 			printed += syscall__scnprintf_val(sc, bf + printed, size - printed, &arg, val);
1481 next_arg:
1482 			++arg.idx;
1483 			bit <<= 1;
1484 		}
1485 	}
1486 
1487 	return printed;
1488 }
1489 
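/*
 * With the formatters above, a typical args string comes out as, say:
 *
 *	dfd: CWD, filename: /etc/ld.so.cache, flags: CLOEXEC
 *
 * (for openat), zero-valued args being suppressed unless show_zero or a
 * strarray mapping applies.
 */
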
1490 typedef int (*tracepoint_handler)(struct trace *trace, struct perf_evsel *evsel,
1491 				  union perf_event *event,
1492 				  struct perf_sample *sample);
1493 
1494 static struct syscall *trace__syscall_info(struct trace *trace,
1495 					   struct perf_evsel *evsel, int id)
1496 {
1497 
1498 	if (id < 0) {
1499 
1500 		/*
1501 		 * XXX: Noticed on x86_64, reproduced as far back as 3.0.36, haven't tried
1502 		 * before that, leaving at a higher verbosity level till that is
1503 		 * explained. Reproduced with plain ftrace with:
1504 		 *
1505 		 * echo 1 > /t/events/raw_syscalls/sys_exit/enable
1506 		 * grep "NR -1 " /t/trace_pipe
1507 		 *
1508 		 * After generating some load on the machine.
1509 		 */
1510 		if (verbose > 1) {
1511 			static u64 n;
1512 			fprintf(trace->output, "Invalid syscall %d id, skipping (%s, %" PRIu64 ") ...\n",
1513 				id, perf_evsel__name(evsel), ++n);
1514 		}
1515 		return NULL;
1516 	}
1517 
1518 	if ((id > trace->syscalls.max || trace->syscalls.table[id].name == NULL) &&
1519 	    trace__read_syscall_info(trace, id))
1520 		goto out_cant_read;
1521 
1522 	if ((id > trace->syscalls.max || trace->syscalls.table[id].name == NULL))
1523 		goto out_cant_read;
1524 
1525 	return &trace->syscalls.table[id];
1526 
1527 out_cant_read:
1528 	if (verbose > 0) {
1529 		fprintf(trace->output, "Problems reading syscall %d", id);
1530 		if (id <= trace->syscalls.max && trace->syscalls.table[id].name != NULL)
1531 			fprintf(trace->output, "(%s)", trace->syscalls.table[id].name);
1532 		fputs(" information\n", trace->output);
1533 	}
1534 	return NULL;
1535 }
1536 
1537 static void thread__update_stats(struct thread_trace *ttrace,
1538 				 int id, struct perf_sample *sample)
1539 {
1540 	struct int_node *inode;
1541 	struct stats *stats;
1542 	u64 duration = 0;
1543 
1544 	inode = intlist__findnew(ttrace->syscall_stats, id);
1545 	if (inode == NULL)
1546 		return;
1547 
1548 	stats = inode->priv;
1549 	if (stats == NULL) {
1550 		stats = malloc(sizeof(struct stats));
1551 		if (stats == NULL)
1552 			return;
1553 		init_stats(stats);
1554 		inode->priv = stats;
1555 	}
1556 
1557 	if (ttrace->entry_time && sample->time > ttrace->entry_time)
1558 		duration = sample->time - ttrace->entry_time;
1559 
1560 	update_stats(stats, duration);
1561 }
1562 
1563 static int trace__printf_interrupted_entry(struct trace *trace)
1564 {
1565 	struct thread_trace *ttrace;
1566 	size_t printed;
1567 
1568 	if (trace->current == NULL)
1569 		return 0;
1570 
1571 	ttrace = thread__priv(trace->current);
1572 
1573 	if (!ttrace->entry_pending)
1574 		return 0;
1575 
1576 	printed  = trace__fprintf_entry_head(trace, trace->current, 0, false, ttrace->entry_time, trace->output);
1577 	printed += fprintf(trace->output, "%-70s) ...\n", ttrace->entry_str);
1578 	ttrace->entry_pending = false;
1579 
1580 	return printed;
1581 }
1582 
1583 static int trace__fprintf_sample(struct trace *trace, struct perf_evsel *evsel,
1584 				 struct perf_sample *sample, struct thread *thread)
1585 {
1586 	int printed = 0;
1587 
1588 	if (trace->print_sample) {
1589 		double ts = (double)sample->time / NSEC_PER_MSEC;
1590 
1591 		printed += fprintf(trace->output, "%22s %10.3f %s %d/%d [%d]\n",
1592 				   perf_evsel__name(evsel), ts,
1593 				   thread__comm_str(thread),
1594 				   sample->pid, sample->tid, sample->cpu);
1595 	}
1596 
1597 	return printed;
1598 }
1599 
1600 static int trace__sys_enter(struct trace *trace, struct perf_evsel *evsel,
1601 			    union perf_event *event __maybe_unused,
1602 			    struct perf_sample *sample)
1603 {
1604 	char *msg;
1605 	void *args;
1606 	size_t printed = 0;
1607 	struct thread *thread;
1608 	int id = perf_evsel__sc_tp_uint(evsel, id, sample), err = -1;
1609 	struct syscall *sc = trace__syscall_info(trace, evsel, id);
1610 	struct thread_trace *ttrace;
1611 
1612 	if (sc == NULL)
1613 		return -1;
1614 
1615 	thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
1616 	ttrace = thread__trace(thread, trace->output);
1617 	if (ttrace == NULL)
1618 		goto out_put;
1619 
1620 	trace__fprintf_sample(trace, evsel, sample, thread);
1621 
1622 	args = perf_evsel__sc_tp_ptr(evsel, args, sample);
1623 
1624 	if (ttrace->entry_str == NULL) {
1625 		ttrace->entry_str = malloc(trace__entry_str_size);
1626 		if (!ttrace->entry_str)
1627 			goto out_put;
1628 	}
1629 
1630 	if (!(trace->duration_filter || trace->summary_only || trace->min_stack))
1631 		trace__printf_interrupted_entry(trace);
1632 
1633 	ttrace->entry_time = sample->time;
1634 	msg = ttrace->entry_str;
1635 	printed += scnprintf(msg + printed, trace__entry_str_size - printed, "%s(", sc->name);
1636 
1637 	printed += syscall__scnprintf_args(sc, msg + printed, trace__entry_str_size - printed,
1638 					   args, trace, thread);
1639 
1640 	if (sc->is_exit) {
1641 		if (!(trace->duration_filter || trace->summary_only || trace->min_stack)) {
1642 			trace__fprintf_entry_head(trace, thread, 0, false, ttrace->entry_time, trace->output);
1643 			fprintf(trace->output, "%-70s)\n", ttrace->entry_str);
1644 		}
1645 	} else {
1646 		ttrace->entry_pending = true;
1647 		/* See trace__vfs_getname & trace__sys_exit */
1648 		ttrace->filename.pending_open = false;
1649 	}
1650 
1651 	if (trace->current != thread) {
1652 		thread__put(trace->current);
1653 		trace->current = thread__get(thread);
1654 	}
1655 	err = 0;
1656 out_put:
1657 	thread__put(thread);
1658 	return err;
1659 }
1660 
1661 static int trace__resolve_callchain(struct trace *trace, struct perf_evsel *evsel,
1662 				    struct perf_sample *sample,
1663 				    struct callchain_cursor *cursor)
1664 {
1665 	struct addr_location al;
1666 	int max_stack = evsel->attr.sample_max_stack ?
1667 			evsel->attr.sample_max_stack :
1668 			trace->max_stack;
1669 
1670 	if (machine__resolve(trace->host, &al, sample) < 0 ||
1671 	    thread__resolve_callchain(al.thread, cursor, evsel, sample, NULL, NULL, max_stack))
1672 		return -1;
1673 
1674 	return 0;
1675 }
1676 
1677 static int trace__fprintf_callchain(struct trace *trace, struct perf_sample *sample)
1678 {
1679 	/* TODO: user-configurable print_opts */
1680 	const unsigned int print_opts = EVSEL__PRINT_SYM |
1681 				        EVSEL__PRINT_DSO |
1682 				        EVSEL__PRINT_UNKNOWN_AS_ADDR;
1683 
1684 	return sample__fprintf_callchain(sample, 38, print_opts, &callchain_cursor, trace->output);
1685 }
1686 
1687 static const char *errno_to_name(struct perf_evsel *evsel, int err)
1688 {
1689 	struct perf_env *env = perf_evsel__env(evsel);
1690 	const char *arch_name = perf_env__arch(env);
1691 
1692 	return arch_syscalls__strerrno(arch_name, err);
1693 }
1694 
1695 static int trace__sys_exit(struct trace *trace, struct perf_evsel *evsel,
1696 			   union perf_event *event __maybe_unused,
1697 			   struct perf_sample *sample)
1698 {
1699 	long ret;
1700 	u64 duration = 0;
1701 	bool duration_calculated = false;
1702 	struct thread *thread;
1703 	int id = perf_evsel__sc_tp_uint(evsel, id, sample), err = -1, callchain_ret = 0;
1704 	struct syscall *sc = trace__syscall_info(trace, evsel, id);
1705 	struct thread_trace *ttrace;
1706 
1707 	if (sc == NULL)
1708 		return -1;
1709 
1710 	thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
1711 	ttrace = thread__trace(thread, trace->output);
1712 	if (ttrace == NULL)
1713 		goto out_put;
1714 
1715 	trace__fprintf_sample(trace, evsel, sample, thread);
1716 
1717 	if (trace->summary)
1718 		thread__update_stats(ttrace, id, sample);
1719 
1720 	ret = perf_evsel__sc_tp_uint(evsel, ret, sample);
1721 
1722 	if (id == trace->open_id && ret >= 0 && ttrace->filename.pending_open) {
1723 		trace__set_fd_pathname(thread, ret, ttrace->filename.name);
1724 		ttrace->filename.pending_open = false;
1725 		++trace->stats.vfs_getname;
1726 	}
1727 
1728 	if (ttrace->entry_time) {
1729 		duration = sample->time - ttrace->entry_time;
1730 		if (trace__filter_duration(trace, duration))
1731 			goto out;
1732 		duration_calculated = true;
1733 	} else if (trace->duration_filter)
1734 		goto out;
1735 
1736 	if (sample->callchain) {
1737 		callchain_ret = trace__resolve_callchain(trace, evsel, sample, &callchain_cursor);
1738 		if (callchain_ret == 0) {
1739 			if (callchain_cursor.nr < trace->min_stack)
1740 				goto out;
1741 			callchain_ret = 1;
1742 		}
1743 	}
1744 
1745 	if (trace->summary_only)
1746 		goto out;
1747 
1748 	trace__fprintf_entry_head(trace, thread, duration, duration_calculated, ttrace->entry_time, trace->output);
1749 
1750 	if (ttrace->entry_pending) {
1751 		fprintf(trace->output, "%-70s", ttrace->entry_str);
1752 	} else {
1753 		fprintf(trace->output, " ... [");
1754 		color_fprintf(trace->output, PERF_COLOR_YELLOW, "continued");
1755 		fprintf(trace->output, "]: %s()", sc->name);
1756 	}
1757 
1758 	if (sc->fmt == NULL) {
1759 		if (ret < 0)
1760 			goto errno_print;
1761 signed_print:
1762 		fprintf(trace->output, ") = %ld", ret);
1763 	} else if (ret < 0) {
1764 errno_print: {
1765 		char bf[STRERR_BUFSIZE];
1766 		const char *emsg = str_error_r(-ret, bf, sizeof(bf)),
1767 			   *e = errno_to_name(evsel, -ret);
1768 
1769 		fprintf(trace->output, ") = -1 %s %s", e, emsg);
1770 	}
1771 	} else if (ret == 0 && sc->fmt->timeout)
1772 		fprintf(trace->output, ") = 0 Timeout");
1773 	else if (ttrace->ret_scnprintf) {
1774 		char bf[1024];
1775 		struct syscall_arg arg = {
1776 			.val	= ret,
1777 			.thread	= thread,
1778 			.trace	= trace,
1779 		};
1780 		ttrace->ret_scnprintf(bf, sizeof(bf), &arg);
1781 		ttrace->ret_scnprintf = NULL;
1782 		fprintf(trace->output, ") = %s", bf);
1783 	} else if (sc->fmt->hexret)
1784 		fprintf(trace->output, ") = %#lx", ret);
1785 	else if (sc->fmt->errpid) {
1786 		struct thread *child = machine__find_thread(trace->host, ret, ret);
1787 
1788 		if (child != NULL) {
1789 			fprintf(trace->output, ") = %ld", ret);
1790 			if (child->comm_set)
1791 				fprintf(trace->output, " (%s)", thread__comm_str(child));
1792 			thread__put(child);
1793 		}
1794 	} else
1795 		goto signed_print;
1796 
1797 	fputc('\n', trace->output);
1798 
1799 	if (callchain_ret > 0)
1800 		trace__fprintf_callchain(trace, sample);
1801 	else if (callchain_ret < 0)
1802 		pr_err("Problem processing %s callchain, skipping...\n", perf_evsel__name(evsel));
1803 out:
1804 	ttrace->entry_pending = false;
1805 	err = 0;
1806 out_put:
1807 	thread__put(thread);
1808 	return err;
1809 }
1810 
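/*
 * probe:vfs_getname handler: copies the pathname being resolved so that
 * trace__sys_exit can associate it with the returned fd and, if a
 * sys_enter line is still pending, splices the pathname into it in place
 * of the raw pointer argument.
 */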
1811 static int trace__vfs_getname(struct trace *trace, struct perf_evsel *evsel,
1812 			      union perf_event *event __maybe_unused,
1813 			      struct perf_sample *sample)
1814 {
1815 	struct thread *thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
1816 	struct thread_trace *ttrace;
1817 	size_t filename_len, entry_str_len, to_move;
1818 	ssize_t remaining_space;
1819 	char *pos;
1820 	const char *filename = perf_evsel__rawptr(evsel, sample, "pathname");
1821 
1822 	if (!thread)
1823 		goto out;
1824 
1825 	ttrace = thread__priv(thread);
1826 	if (!ttrace)
1827 		goto out_put;
1828 
1829 	filename_len = strlen(filename);
1830 	if (filename_len == 0)
1831 		goto out_put;
1832 
1833 	if (ttrace->filename.namelen < filename_len) {
1834 		char *f = realloc(ttrace->filename.name, filename_len + 1);
1835 
1836 		if (f == NULL)
1837 			goto out_put;
1838 
1839 		ttrace->filename.namelen = filename_len;
1840 		ttrace->filename.name = f;
1841 	}
1842 
1843 	strcpy(ttrace->filename.name, filename);
1844 	ttrace->filename.pending_open = true;
1845 
1846 	if (!ttrace->filename.ptr)
1847 		goto out_put;
1848 
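	/*
	 * Splice the pathname into the pending sys_enter string, right where
	 * the raw pointer argument was printed, keeping only its trailing
	 * bytes when there isn't enough space left.
	 */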
1849 	entry_str_len = strlen(ttrace->entry_str);
1850 	remaining_space = trace__entry_str_size - entry_str_len - 1; /* \0 */
1851 	if (remaining_space <= 0)
1852 		goto out_put;
1853 
1854 	if (filename_len > (size_t)remaining_space) {
1855 		filename += filename_len - remaining_space;
1856 		filename_len = remaining_space;
1857 	}
1858 
1859 	to_move = entry_str_len - ttrace->filename.entry_str_pos + 1; /* \0 */
1860 	pos = ttrace->entry_str + ttrace->filename.entry_str_pos;
1861 	memmove(pos + filename_len, pos, to_move);
1862 	memcpy(pos, filename, filename_len);
1863 
1864 	ttrace->filename.ptr = 0;
1865 	ttrace->filename.entry_str_pos = 0;
1866 out_put:
1867 	thread__put(thread);
1868 out:
1869 	return 0;
1870 }
1871 
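/*
 * sched:sched_stat_runtime handler: accumulates on-CPU time per thread and
 * globally, for the "msec" column printed with --sched.
 */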
1872 static int trace__sched_stat_runtime(struct trace *trace, struct perf_evsel *evsel,
1873 				     union perf_event *event __maybe_unused,
1874 				     struct perf_sample *sample)
1875 {
1876 	u64 runtime = perf_evsel__intval(evsel, sample, "runtime");
1877 	double runtime_ms = (double)runtime / NSEC_PER_MSEC;
1878 	struct thread *thread = machine__findnew_thread(trace->host,
1879 							sample->pid,
1880 							sample->tid);
1881 	struct thread_trace *ttrace = thread__trace(thread, trace->output);
1882 
1883 	if (ttrace == NULL)
1884 		goto out_dump;
1885 
1886 	ttrace->runtime_ms += runtime_ms;
1887 	trace->runtime_ms += runtime_ms;
1888 out_put:
1889 	thread__put(thread);
1890 	return 0;
1891 
1892 out_dump:
1893 	fprintf(trace->output, "%s: comm=%s,pid=%d,runtime=%" PRIu64 ",vruntime=%" PRIu64 "\n",
1894 	       evsel->name,
1895 	       perf_evsel__strval(evsel, sample, "comm"),
1896 	       (pid_t)perf_evsel__intval(evsel, sample, "pid"),
1897 	       runtime,
1898 	       perf_evsel__intval(evsel, sample, "vruntime"));
1899 	goto out_put;
1900 }
1901 
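/*
 * binary__fprintf() callback: prints only the printable-character view of
 * each byte, skipping all the hexdump parts of the output.
 */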
1902 static int bpf_output__printer(enum binary_printer_ops op,
1903 			       unsigned int val, void *extra __maybe_unused, FILE *fp)
1904 {
1905 	unsigned char ch = (unsigned char)val;
1906 
1907 	switch (op) {
1908 	case BINARY_PRINT_CHAR_DATA:
1909 		return fprintf(fp, "%c", isprint(ch) ? ch : '.');
1910 	case BINARY_PRINT_DATA_BEGIN:
1911 	case BINARY_PRINT_LINE_BEGIN:
1912 	case BINARY_PRINT_ADDR:
1913 	case BINARY_PRINT_NUM_DATA:
1914 	case BINARY_PRINT_NUM_PAD:
1915 	case BINARY_PRINT_SEP:
1916 	case BINARY_PRINT_CHAR_PAD:
1917 	case BINARY_PRINT_LINE_END:
1918 	case BINARY_PRINT_DATA_END:
1919 	default:
1920 		break;
1921 	}
1922 
1923 	return 0;
1924 }
1925 
1926 static void bpf_output__fprintf(struct trace *trace,
1927 				struct perf_sample *sample)
1928 {
1929 	binary__fprintf(sample->raw_data, sample->raw_size, 8,
1930 			bpf_output__printer, NULL, trace->output);
1931 }
1932 
1933 static int trace__event_handler(struct trace *trace, struct perf_evsel *evsel,
1934 				union perf_event *event __maybe_unused,
1935 				struct perf_sample *sample)
1936 {
1937 	int callchain_ret = 0;
1938 
1939 	if (sample->callchain) {
1940 		callchain_ret = trace__resolve_callchain(trace, evsel, sample, &callchain_cursor);
1941 		if (callchain_ret == 0) {
1942 			if (callchain_cursor.nr < trace->min_stack)
1943 				goto out;
1944 			callchain_ret = 1;
1945 		}
1946 	}
1947 
1948 	trace__printf_interrupted_entry(trace);
1949 	trace__fprintf_tstamp(trace, sample->time, trace->output);
1950 
1951 	if (trace->trace_syscalls)
1952 		fprintf(trace->output, "(         ): ");
1953 
1954 	fprintf(trace->output, "%s:", evsel->name);
1955 
1956 	if (perf_evsel__is_bpf_output(evsel)) {
1957 		bpf_output__fprintf(trace, sample);
1958 	} else if (evsel->tp_format) {
1959 		event_format__fprintf(evsel->tp_format, sample->cpu,
1960 				      sample->raw_data, sample->raw_size,
1961 				      trace->output);
1962 	}
1963 
1964 	fprintf(trace->output, ")\n");
1965 
1966 	if (callchain_ret > 0)
1967 		trace__fprintf_callchain(trace, sample);
1968 	else if (callchain_ret < 0)
1969 		pr_err("Problem processing %s callchain, skipping...\n", perf_evsel__name(evsel));
1970 out:
1971 	return 0;
1972 }
1973 
1974 static void print_location(FILE *f, struct perf_sample *sample,
1975 			   struct addr_location *al,
1976 			   bool print_dso, bool print_sym)
1977 {
1978 
1979 	if ((verbose > 0 || print_dso) && al->map)
1980 		fprintf(f, "%s@", al->map->dso->long_name);
1981 
1982 	if ((verbose > 0 || print_sym) && al->sym)
1983 		fprintf(f, "%s+0x%" PRIx64, al->sym->name,
1984 			al->addr - al->sym->start);
1985 	else if (al->map)
1986 		fprintf(f, "0x%" PRIx64, al->addr);
1987 	else
1988 		fprintf(f, "0x%" PRIx64, sample->addr);
1989 }
1990 
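/*
 * Page fault handler (software events): counts major/minor faults per
 * thread and prints the faulting IP and the target address, encoding the
 * target map type as d/x/? (data, executable, unknown).
 */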
1991 static int trace__pgfault(struct trace *trace,
1992 			  struct perf_evsel *evsel,
1993 			  union perf_event *event __maybe_unused,
1994 			  struct perf_sample *sample)
1995 {
1996 	struct thread *thread;
1997 	struct addr_location al;
1998 	char map_type = 'd';
1999 	struct thread_trace *ttrace;
2000 	int err = -1;
2001 	int callchain_ret = 0;
2002 
2003 	thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
2004 
2005 	if (sample->callchain) {
2006 		callchain_ret = trace__resolve_callchain(trace, evsel, sample, &callchain_cursor);
2007 		if (callchain_ret == 0) {
2008 			if (callchain_cursor.nr < trace->min_stack)
2009 				goto out_put;
2010 			callchain_ret = 1;
2011 		}
2012 	}
2013 
2014 	ttrace = thread__trace(thread, trace->output);
2015 	if (ttrace == NULL)
2016 		goto out_put;
2017 
2018 	if (evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ)
2019 		ttrace->pfmaj++;
2020 	else
2021 		ttrace->pfmin++;
2022 
2023 	if (trace->summary_only)
2024 		goto out;
2025 
2026 	thread__find_addr_location(thread, sample->cpumode, MAP__FUNCTION,
2027 			      sample->ip, &al);
2028 
2029 	trace__fprintf_entry_head(trace, thread, 0, true, sample->time, trace->output);
2030 
2031 	fprintf(trace->output, "%sfault [",
2032 		evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ ?
2033 		"maj" : "min");
2034 
2035 	print_location(trace->output, sample, &al, false, true);
2036 
2037 	fprintf(trace->output, "] => ");
2038 
2039 	thread__find_addr_location(thread, sample->cpumode, MAP__VARIABLE,
2040 				   sample->addr, &al);
2041 
2042 	if (!al.map) {
2043 		thread__find_addr_location(thread, sample->cpumode,
2044 					   MAP__FUNCTION, sample->addr, &al);
2045 
2046 		if (al.map)
2047 			map_type = 'x';
2048 		else
2049 			map_type = '?';
2050 	}
2051 
2052 	print_location(trace->output, sample, &al, true, false);
2053 
2054 	fprintf(trace->output, " (%c%c)\n", map_type, al.level);
2055 
2056 	if (callchain_ret > 0)
2057 		trace__fprintf_callchain(trace, sample);
2058 	else if (callchain_ret < 0)
2059 		pr_err("Problem processing %s callchain, skipping...\n", perf_evsel__name(evsel));
2060 out:
2061 	err = 0;
2062 out_put:
2063 	thread__put(thread);
2064 	return err;
2065 }
2066 
2067 static void trace__set_base_time(struct trace *trace,
2068 				 struct perf_evsel *evsel,
2069 				 struct perf_sample *sample)
2070 {
2071 	/*
2072 	 * BPF events were not setting PERF_SAMPLE_TIME, so be more robust
2073 	 * and don't use sample->time unconditionally, we may end up having
2074 	 * some other event in the future without PERF_SAMPLE_TIME for good
2075 	 * reason, i.e. we may not be interested in its timestamps, just in
2076 	 * it taking place, picking some piece of information when it
2077 	 * appears in our event stream (vfs_getname comes to mind).
2078 	 */
2079 	if (trace->base_time == 0 && !trace->full_time &&
2080 	    (evsel->attr.sample_type & PERF_SAMPLE_TIME))
2081 		trace->base_time = sample->time;
2082 }
2083 
2084 static int trace__process_sample(struct perf_tool *tool,
2085 				 union perf_event *event,
2086 				 struct perf_sample *sample,
2087 				 struct perf_evsel *evsel,
2088 				 struct machine *machine __maybe_unused)
2089 {
2090 	struct trace *trace = container_of(tool, struct trace, tool);
2091 	struct thread *thread;
2092 	int err = 0;
2093 
2094 	tracepoint_handler handler = evsel->handler;
2095 
2096 	thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
2097 	if (thread && thread__is_filtered(thread))
2098 		goto out;
2099 
2100 	trace__set_base_time(trace, evsel, sample);
2101 
2102 	if (handler) {
2103 		++trace->nr_events;
2104 		handler(trace, evsel, event, sample);
2105 	}
2106 out:
2107 	thread__put(thread);
2108 	return err;
2109 }
2110 
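/*
 * 'perf trace record': synthesizes a 'perf record' command line with the
 * raw_syscalls/page fault events plus whatever the user passed, then hands
 * it to cmd_record().
 */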
2111 static int trace__record(struct trace *trace, int argc, const char **argv)
2112 {
2113 	unsigned int rec_argc, i, j;
2114 	const char **rec_argv;
2115 	const char * const record_args[] = {
2116 		"record",
2117 		"-R",
2118 		"-m", "1024",
2119 		"-c", "1",
2120 	};
2121 
2122 	const char * const sc_args[] = { "-e", };
2123 	unsigned int sc_args_nr = ARRAY_SIZE(sc_args);
2124 	const char * const majpf_args[] = { "-e", "major-faults" };
2125 	unsigned int majpf_args_nr = ARRAY_SIZE(majpf_args);
2126 	const char * const minpf_args[] = { "-e", "minor-faults" };
2127 	unsigned int minpf_args_nr = ARRAY_SIZE(minpf_args);
2128 
2129 	/* +1 is for the event string below */
2130 	rec_argc = ARRAY_SIZE(record_args) + sc_args_nr + 1 +
2131 		majpf_args_nr + minpf_args_nr + argc;
2132 	rec_argv = calloc(rec_argc + 1, sizeof(char *));
2133 
2134 	if (rec_argv == NULL)
2135 		return -ENOMEM;
2136 
2137 	j = 0;
2138 	for (i = 0; i < ARRAY_SIZE(record_args); i++)
2139 		rec_argv[j++] = record_args[i];
2140 
2141 	if (trace->trace_syscalls) {
2142 		for (i = 0; i < sc_args_nr; i++)
2143 			rec_argv[j++] = sc_args[i];
2144 
2145 		/* event string may be different for older kernels - e.g., RHEL6 */
2146 		if (is_valid_tracepoint("raw_syscalls:sys_enter"))
2147 			rec_argv[j++] = "raw_syscalls:sys_enter,raw_syscalls:sys_exit";
2148 		else if (is_valid_tracepoint("syscalls:sys_enter"))
2149 			rec_argv[j++] = "syscalls:sys_enter,syscalls:sys_exit";
2150 		else {
2151 			pr_err("Neither raw_syscalls nor syscalls events exist.\n");
2152 			free(rec_argv);
2153 			return -1;
2154 		}
2155 	}
2156 
2157 	if (trace->trace_pgfaults & TRACE_PFMAJ)
2158 		for (i = 0; i < majpf_args_nr; i++)
2159 			rec_argv[j++] = majpf_args[i];
2160 
2161 	if (trace->trace_pgfaults & TRACE_PFMIN)
2162 		for (i = 0; i < minpf_args_nr; i++)
2163 			rec_argv[j++] = minpf_args[i];
2164 
2165 	for (i = 0; i < (unsigned int)argc; i++)
2166 		rec_argv[j++] = argv[i];
2167 
2168 	return cmd_record(j, rec_argv);
2169 }
2170 
2171 static size_t trace__fprintf_thread_summary(struct trace *trace, FILE *fp);
2172 
2173 static bool perf_evlist__add_vfs_getname(struct perf_evlist *evlist)
2174 {
2175 	struct perf_evsel *evsel = perf_evsel__newtp("probe", "vfs_getname");
2176 
2177 	if (IS_ERR(evsel))
2178 		return false;
2179 
2180 	if (perf_evsel__field(evsel, "pathname") == NULL) {
2181 		perf_evsel__delete(evsel);
2182 		return false;
2183 	}
2184 
2185 	evsel->handler = trace__vfs_getname;
2186 	perf_evlist__add(evlist, evsel);
2187 	return true;
2188 }
2189 
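/*
 * Software page fault event: sample_period = 1 so every single fault is
 * sampled, mmap_data set so the faulting data address can be resolved.
 */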
2190 static struct perf_evsel *perf_evsel__new_pgfault(u64 config)
2191 {
2192 	struct perf_evsel *evsel;
2193 	struct perf_event_attr attr = {
2194 		.type = PERF_TYPE_SOFTWARE,
2195 		.mmap_data = 1,
2196 	};
2197 
2198 	attr.config = config;
2199 	attr.sample_period = 1;
2200 
2201 	event_attr_init(&attr);
2202 
2203 	evsel = perf_evsel__new(&attr);
2204 	if (evsel)
2205 		evsel->handler = trace__pgfault;
2206 
2207 	return evsel;
2208 }
2209 
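/*
 * Demultiplexes one mmap'ed event: non-samples go to the generic side-band
 * handling, samples are routed to the handler of the evsel they belong to.
 */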
2210 static void trace__handle_event(struct trace *trace, union perf_event *event, struct perf_sample *sample)
2211 {
2212 	const u32 type = event->header.type;
2213 	struct perf_evsel *evsel;
2214 
2215 	if (type != PERF_RECORD_SAMPLE) {
2216 		trace__process_event(trace, trace->host, event, sample);
2217 		return;
2218 	}
2219 
2220 	evsel = perf_evlist__id2evsel(trace->evlist, sample->id);
2221 	if (evsel == NULL) {
2222 		fprintf(trace->output, "Unknown tp ID %" PRIu64 ", skipping...\n", sample->id);
2223 		return;
2224 	}
2225 
2226 	trace__set_base_time(trace, evsel, sample);
2227 
2228 	if (evsel->attr.type == PERF_TYPE_TRACEPOINT &&
2229 	    sample->raw_data == NULL) {
2230 		fprintf(trace->output, "%s sample with no payload for tid: %d, cpu %d, raw_size=%d, skipping...\n",
2231 		       perf_evsel__name(evsel), sample->tid,
2232 		       sample->cpu, sample->raw_size);
2233 	} else {
2234 		tracepoint_handler handler = evsel->handler;
2235 		handler(trace, evsel, event, sample);
2236 	}
2237 }
2238 
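/*
 * Creates the raw_syscalls:sys_{enter,exit} tracepoint events, wires up
 * their payload accessors and callchain config, and adds them to the
 * evlist; sys_exit kernel callchains are suppressed unless requested.
 */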
2239 static int trace__add_syscall_newtp(struct trace *trace)
2240 {
2241 	int ret = -1;
2242 	struct perf_evlist *evlist = trace->evlist;
2243 	struct perf_evsel *sys_enter, *sys_exit;
2244 
2245 	sys_enter = perf_evsel__syscall_newtp("sys_enter", trace__sys_enter);
2246 	if (sys_enter == NULL)
2247 		goto out;
2248 
2249 	if (perf_evsel__init_sc_tp_ptr_field(sys_enter, args))
2250 		goto out_delete_sys_enter;
2251 
2252 	sys_exit = perf_evsel__syscall_newtp("sys_exit", trace__sys_exit);
2253 	if (sys_exit == NULL)
2254 		goto out_delete_sys_enter;
2255 
2256 	if (perf_evsel__init_sc_tp_uint_field(sys_exit, ret))
2257 		goto out_delete_sys_exit;
2258 
2259 	perf_evsel__config_callchain(sys_enter, &trace->opts, &callchain_param);
2260 	perf_evsel__config_callchain(sys_exit, &trace->opts, &callchain_param);
2261 
2262 	perf_evlist__add(evlist, sys_enter);
2263 	perf_evlist__add(evlist, sys_exit);
2264 
2265 	if (callchain_param.enabled && !trace->kernel_syscallchains) {
2266 		/*
2267 		 * We're interested only in the user space callchain
2268 		 * leading to the syscall; allow overriding that for
2269 		 * debugging reasons using --kernel-syscall-graph.
2270 		 */
2271 		sys_exit->attr.exclude_callchain_kernel = 1;
2272 	}
2273 
2274 	trace->syscalls.events.sys_enter = sys_enter;
2275 	trace->syscalls.events.sys_exit  = sys_exit;
2276 
2277 	ret = 0;
2278 out:
2279 	return ret;
2280 
2281 out_delete_sys_exit:
2282 	perf_evsel__delete_priv(sys_exit);
2283 out_delete_sys_enter:
2284 	perf_evsel__delete_priv(sys_enter);
2285 	goto out;
2286 }
2287 
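/*
 * Builds an "id == A || id == B ..." (or negated) tracepoint filter from
 * the syscall qualifier and appends it to both sys_enter and sys_exit.
 */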
2288 static int trace__set_ev_qualifier_filter(struct trace *trace)
2289 {
2290 	int err = -1;
2291 	struct perf_evsel *sys_exit;
2292 	char *filter = asprintf_expr_inout_ints("id", !trace->not_ev_qualifier,
2293 						trace->ev_qualifier_ids.nr,
2294 						trace->ev_qualifier_ids.entries);
2295 
2296 	if (filter == NULL)
2297 		goto out_enomem;
2298 
2299 	if (!perf_evsel__append_tp_filter(trace->syscalls.events.sys_enter,
2300 					  filter)) {
2301 		sys_exit = trace->syscalls.events.sys_exit;
2302 		err = perf_evsel__append_tp_filter(sys_exit, filter);
2303 	}
2304 
2305 	free(filter);
2306 out:
2307 	return err;
2308 out_enomem:
2309 	errno = ENOMEM;
2310 	goto out;
2311 }
2312 
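/*
 * Filter out the tracer itself and, when running over ssh, the sshd
 * ancestor delivering our output, to avoid a feedback loop of events
 * generated by printing events.
 */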
2313 static int trace__set_filter_loop_pids(struct trace *trace)
2314 {
2315 	unsigned int nr = 1;
2316 	pid_t pids[32] = {
2317 		getpid(),
2318 	};
2319 	struct thread *thread = machine__find_thread(trace->host, pids[0], pids[0]);
2320 
2321 	while (thread && nr < ARRAY_SIZE(pids)) {
2322 		struct thread *parent = machine__find_thread(trace->host, thread->ppid, thread->ppid);
2323 
2324 		if (parent == NULL)
2325 			break;
2326 
2327 		if (!strcmp(thread__comm_str(parent), "sshd")) {
2328 			pids[nr++] = parent->tid;
2329 			break;
2330 		}
2331 		thread = parent;
2332 	}
2333 
2334 	return perf_evlist__set_filter_pids(trace->evlist, nr, pids);
2335 }
2336 
2337 static int trace__run(struct trace *trace, int argc, const char **argv)
2338 {
2339 	struct perf_evlist *evlist = trace->evlist;
2340 	struct perf_evsel *evsel, *pgfault_maj = NULL, *pgfault_min = NULL;
2341 	int err = -1, i;
2342 	unsigned long before;
2343 	const bool forks = argc > 0;
2344 	bool draining = false;
2345 
2346 	trace->live = true;
2347 
2348 	if (trace->trace_syscalls && trace__add_syscall_newtp(trace))
2349 		goto out_error_raw_syscalls;
2350 
2351 	if (trace->trace_syscalls)
2352 		trace->vfs_getname = perf_evlist__add_vfs_getname(evlist);
2353 
2354 	if ((trace->trace_pgfaults & TRACE_PFMAJ)) {
2355 		pgfault_maj = perf_evsel__new_pgfault(PERF_COUNT_SW_PAGE_FAULTS_MAJ);
2356 		if (pgfault_maj == NULL)
2357 			goto out_error_mem;
2358 		perf_evsel__config_callchain(pgfault_maj, &trace->opts, &callchain_param);
2359 		perf_evlist__add(evlist, pgfault_maj);
2360 	}
2361 
2362 	if ((trace->trace_pgfaults & TRACE_PFMIN)) {
2363 		pgfault_min = perf_evsel__new_pgfault(PERF_COUNT_SW_PAGE_FAULTS_MIN);
2364 		if (pgfault_min == NULL)
2365 			goto out_error_mem;
2366 		perf_evsel__config_callchain(pgfault_min, &trace->opts, &callchain_param);
2367 		perf_evlist__add(evlist, pgfault_min);
2368 	}
2369 
2370 	if (trace->sched &&
2371 	    perf_evlist__add_newtp(evlist, "sched", "sched_stat_runtime",
2372 				   trace__sched_stat_runtime))
2373 		goto out_error_sched_stat_runtime;
2374 
2375 	/*
2376 	 * If a global cgroup was set, apply it to all the events without an
2377 	 * explicit cgroup. I.e.:
2378 	 *
2379 	 * 	trace -G A -e sched:*switch
2380 	 *
2381 	 * Will set all raw_syscalls:sys_{enter,exit}, pgfault, vfs_getname, etc
2382 	 * _and_ sched:sched_switch to the 'A' cgroup, while:
2383 	 *
2384 	 * trace -e sched:*switch -G A
2385 	 *
2386 	 * will only set the sched:sched_switch event to the 'A' cgroup, all the
2387 	 * other events (raw_syscalls:sys_{enter,exit}, etc) are left "without"
2388 	 * a cgroup (on the root cgroup, sys wide, etc).
2389 	 *
2390 	 * Multiple cgroups:
2391 	 *
2392 	 * trace -G A -e sched:*switch -G B
2393 	 *
2394 	 * the syscall ones go to the 'A' cgroup, the sched:sched_switch goes
2395 	 * to the 'B' cgroup.
2396 	 *
2397 	 * evlist__set_default_cgroup() grabs a reference to the passed cgroup
2398 	 * only for the evsels still without a cgroup, i.e. evsel->cgroup == NULL.
2399 	 */
2400 	if (trace->cgroup)
2401 		evlist__set_default_cgroup(trace->evlist, trace->cgroup);
2402 
2403 	err = perf_evlist__create_maps(evlist, &trace->opts.target);
2404 	if (err < 0) {
2405 		fprintf(trace->output, "Problems parsing the target to trace, check your options!\n");
2406 		goto out_delete_evlist;
2407 	}
2408 
2409 	err = trace__symbols_init(trace, evlist);
2410 	if (err < 0) {
2411 		fprintf(trace->output, "Problems initializing symbol libraries!\n");
2412 		goto out_delete_evlist;
2413 	}
2414 
2415 	perf_evlist__config(evlist, &trace->opts, &callchain_param);
2416 
2417 	signal(SIGCHLD, sig_handler);
2418 	signal(SIGINT, sig_handler);
2419 
2420 	if (forks) {
2421 		err = perf_evlist__prepare_workload(evlist, &trace->opts.target,
2422 						    argv, false, NULL);
2423 		if (err < 0) {
2424 			fprintf(trace->output, "Couldn't run the workload!\n");
2425 			goto out_delete_evlist;
2426 		}
2427 	}
2428 
2429 	err = perf_evlist__open(evlist);
2430 	if (err < 0)
2431 		goto out_error_open;
2432 
2433 	err = bpf__apply_obj_config();
2434 	if (err) {
2435 		char errbuf[BUFSIZ];
2436 
2437 		bpf__strerror_apply_obj_config(err, errbuf, sizeof(errbuf));
2438 		pr_err("ERROR: Apply config to BPF failed: %s\n",
2439 			 errbuf);
2440 		goto out_error_open;
2441 	}
2442 
2443 	/*
2444 	 * Better not use !target__has_task() here because we need to cover the
2445 	 * case where no threads were specified in the command line, but a
2446 	 * workload was, and in that case we will fill in the thread_map when
2447 	 * we fork the workload in perf_evlist__prepare_workload.
2448 	 */
2449 	if (trace->filter_pids.nr > 0)
2450 		err = perf_evlist__set_filter_pids(evlist, trace->filter_pids.nr, trace->filter_pids.entries);
2451 	else if (thread_map__pid(evlist->threads, 0) == -1)
2452 		err = trace__set_filter_loop_pids(trace);
2453 
2454 	if (err < 0)
2455 		goto out_error_mem;
2456 
2457 	if (trace->ev_qualifier_ids.nr > 0) {
2458 		err = trace__set_ev_qualifier_filter(trace);
2459 		if (err < 0)
2460 			goto out_errno;
2461 
2462 		pr_debug("event qualifier tracepoint filter: %s\n",
2463 			 trace->syscalls.events.sys_exit->filter);
2464 	}
2465 
2466 	err = perf_evlist__apply_filters(evlist, &evsel);
2467 	if (err < 0)
2468 		goto out_error_apply_filters;
2469 
2470 	err = perf_evlist__mmap(evlist, trace->opts.mmap_pages);
2471 	if (err < 0)
2472 		goto out_error_mmap;
2473 
2474 	if (!target__none(&trace->opts.target) && !trace->opts.initial_delay)
2475 		perf_evlist__enable(evlist);
2476 
2477 	if (forks)
2478 		perf_evlist__start_workload(evlist);
2479 
2480 	if (trace->opts.initial_delay) {
2481 		usleep(trace->opts.initial_delay * 1000);
2482 		perf_evlist__enable(evlist);
2483 	}
2484 
2485 	trace->multiple_threads = thread_map__pid(evlist->threads, 0) == -1 ||
2486 				  evlist->threads->nr > 1 ||
2487 				  perf_evlist__first(evlist)->attr.inherit;
2488 
2489 	/*
2490 	 * Now that we already used evsel->attr to ask the kernel to set up the
2491 	 * events, let's reuse evsel->attr.sample_max_stack as the limit in
2492 	 * trace__resolve_callchain(), allowing per-event max-stack settings
2493 	 * to override an explicitly set --max-stack global setting.
2494 	 */
2495 	evlist__for_each_entry(evlist, evsel) {
2496 		if ((evsel->attr.sample_type & PERF_SAMPLE_CALLCHAIN) &&
2497 		    evsel->attr.sample_max_stack == 0)
2498 			evsel->attr.sample_max_stack = trace->max_stack;
2499 	}
2500 again:
2501 	before = trace->nr_events;
2502 
2503 	for (i = 0; i < evlist->nr_mmaps; i++) {
2504 		union perf_event *event;
2505 		struct perf_mmap *md;
2506 
2507 		md = &evlist->mmap[i];
2508 		if (perf_mmap__read_init(md) < 0)
2509 			continue;
2510 
2511 		while ((event = perf_mmap__read_event(md)) != NULL) {
2512 			struct perf_sample sample;
2513 
2514 			++trace->nr_events;
2515 
2516 			err = perf_evlist__parse_sample(evlist, event, &sample);
2517 			if (err) {
2518 				fprintf(trace->output, "Can't parse sample, err = %d, skipping...\n", err);
2519 				goto next_event;
2520 			}
2521 
2522 			trace__handle_event(trace, event, &sample);
2523 next_event:
2524 			perf_mmap__consume(md);
2525 
2526 			if (interrupted)
2527 				goto out_disable;
2528 
2529 			if (done && !draining) {
2530 				perf_evlist__disable(evlist);
2531 				draining = true;
2532 			}
2533 		}
2534 		perf_mmap__read_done(md);
2535 	}
2536 
2537 	if (trace->nr_events == before) {
2538 		int timeout = done ? 100 : -1;
2539 
2540 		if (!draining && perf_evlist__poll(evlist, timeout) > 0) {
2541 			if (perf_evlist__filter_pollfd(evlist, POLLERR | POLLHUP) == 0)
2542 				draining = true;
2543 
2544 			goto again;
2545 		}
2546 	} else {
2547 		goto again;
2548 	}
2549 
2550 out_disable:
2551 	thread__zput(trace->current);
2552 
2553 	perf_evlist__disable(evlist);
2554 
2555 	if (!err) {
2556 		if (trace->summary)
2557 			trace__fprintf_thread_summary(trace, trace->output);
2558 
2559 		if (trace->show_tool_stats) {
2560 			fprintf(trace->output, "Stats:\n "
2561 					       " vfs_getname : %" PRIu64 "\n"
2562 					       " proc_getname: %" PRIu64 "\n",
2563 				trace->stats.vfs_getname,
2564 				trace->stats.proc_getname);
2565 		}
2566 	}
2567 
2568 out_delete_evlist:
2569 	trace__symbols__exit(trace);
2570 
2571 	perf_evlist__delete(evlist);
2572 	cgroup__put(trace->cgroup);
2573 	trace->evlist = NULL;
2574 	trace->live = false;
2575 	return err;
2576 {
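	/*
	 * Only reachable via the gotos above; the error labels live inside
	 * this block so that they can share errbuf.
	 */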
2577 	char errbuf[BUFSIZ];
2578 
2579 out_error_sched_stat_runtime:
2580 	tracing_path__strerror_open_tp(errno, errbuf, sizeof(errbuf), "sched", "sched_stat_runtime");
2581 	goto out_error;
2582 
2583 out_error_raw_syscalls:
2584 	tracing_path__strerror_open_tp(errno, errbuf, sizeof(errbuf), "raw_syscalls", "sys_(enter|exit)");
2585 	goto out_error;
2586 
2587 out_error_mmap:
2588 	perf_evlist__strerror_mmap(evlist, errno, errbuf, sizeof(errbuf));
2589 	goto out_error;
2590 
2591 out_error_open:
2592 	perf_evlist__strerror_open(evlist, errno, errbuf, sizeof(errbuf));
2593 
2594 out_error:
2595 	fprintf(trace->output, "%s\n", errbuf);
2596 	goto out_delete_evlist;
2597 
2598 out_error_apply_filters:
2599 	fprintf(trace->output,
2600 		"Failed to set filter \"%s\" on event %s with %d (%s)\n",
2601 		evsel->filter, perf_evsel__name(evsel), errno,
2602 		str_error_r(errno, errbuf, sizeof(errbuf)));
2603 	goto out_delete_evlist;
2604 }
2605 out_error_mem:
2606 	fprintf(trace->output, "Not enough memory to run!\n");
2607 	goto out_delete_evlist;
2608 
2609 out_errno:
2610 	fprintf(trace->output, "errno=%d,%s\n", errno, strerror(errno));
2611 	goto out_delete_evlist;
2612 }
2613 
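/*
 * 'perf trace -i perf.data': replays a previously recorded session through
 * the same tracepoint handlers used in live mode.
 */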
2614 static int trace__replay(struct trace *trace)
2615 {
2616 	const struct perf_evsel_str_handler handlers[] = {
2617 		{ "probe:vfs_getname",	     trace__vfs_getname, },
2618 	};
2619 	struct perf_data data = {
2620 		.file      = {
2621 			.path = input_name,
2622 		},
2623 		.mode      = PERF_DATA_MODE_READ,
2624 		.force     = trace->force,
2625 	};
2626 	struct perf_session *session;
2627 	struct perf_evsel *evsel;
2628 	int err = -1;
2629 
2630 	trace->tool.sample	  = trace__process_sample;
2631 	trace->tool.mmap	  = perf_event__process_mmap;
2632 	trace->tool.mmap2	  = perf_event__process_mmap2;
2633 	trace->tool.comm	  = perf_event__process_comm;
2634 	trace->tool.exit	  = perf_event__process_exit;
2635 	trace->tool.fork	  = perf_event__process_fork;
2636 	trace->tool.attr	  = perf_event__process_attr;
2637 	trace->tool.tracing_data  = perf_event__process_tracing_data;
2638 	trace->tool.build_id	  = perf_event__process_build_id;
2639 	trace->tool.namespaces	  = perf_event__process_namespaces;
2640 
2641 	trace->tool.ordered_events = true;
2642 	trace->tool.ordering_requires_timestamps = true;
2643 
2644 	/* add tid to output */
2645 	trace->multiple_threads = true;
2646 
2647 	session = perf_session__new(&data, false, &trace->tool);
2648 	if (session == NULL)
2649 		return -1;
2650 
2651 	if (trace->opts.target.pid)
2652 		symbol_conf.pid_list_str = strdup(trace->opts.target.pid);
2653 
2654 	if (trace->opts.target.tid)
2655 		symbol_conf.tid_list_str = strdup(trace->opts.target.tid);
2656 
2657 	if (symbol__init(&session->header.env) < 0)
2658 		goto out;
2659 
2660 	trace->host = &session->machines.host;
2661 
2662 	err = perf_session__set_tracepoints_handlers(session, handlers);
2663 	if (err)
2664 		goto out;
2665 
2666 	evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
2667 						     "raw_syscalls:sys_enter");
2668 	/* older kernels have syscalls tp versus raw_syscalls */
2669 	if (evsel == NULL)
2670 		evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
2671 							     "syscalls:sys_enter");
2672 
2673 	if (evsel &&
2674 	    (perf_evsel__init_syscall_tp(evsel, trace__sys_enter) < 0 ||
2675 	    perf_evsel__init_sc_tp_ptr_field(evsel, args))) {
2676 		pr_err("Error initializing raw_syscalls:sys_enter event\n");
2677 		goto out;
2678 	}
2679 
2680 	evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
2681 						     "raw_syscalls:sys_exit");
2682 	if (evsel == NULL)
2683 		evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
2684 							     "syscalls:sys_exit");
2685 	if (evsel &&
2686 	    (perf_evsel__init_syscall_tp(evsel, trace__sys_exit) < 0 ||
2687 	    perf_evsel__init_sc_tp_uint_field(evsel, ret))) {
2688 		pr_err("Error initializing raw_syscalls:sys_exit event\n");
2689 		goto out;
2690 	}
2691 
2692 	evlist__for_each_entry(session->evlist, evsel) {
2693 		if (evsel->attr.type == PERF_TYPE_SOFTWARE &&
2694 		    (evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ ||
2695 		     evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MIN ||
2696 		     evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS))
2697 			evsel->handler = trace__pgfault;
2698 	}
2699 
2700 	setup_pager();
2701 
2702 	err = perf_session__process_events(session);
2703 	if (err)
2704 		pr_err("Failed to process events, error %d\n", err);
2706 	else if (trace->summary)
2707 		trace__fprintf_thread_summary(trace, trace->output);
2708 
2709 out:
2710 	perf_session__delete(session);
2711 
2712 	return err;
2713 }
2714 
2715 static size_t trace__fprintf_threads_header(FILE *fp)
2716 {
2717 	size_t printed;
2718 
2719 	printed  = fprintf(fp, "\n Summary of events:\n\n");
2720 
2721 	return printed;
2722 }
2723 
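/*
 * Resorts the per-thread syscall stats intlist into an rbtree ordered by
 * total time spent (msecs == calls * average), most expensive first.
 */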
2724 DEFINE_RESORT_RB(syscall_stats, a->msecs > b->msecs,
2725 	struct stats 	*stats;
2726 	double		msecs;
2727 	int		syscall;
2728 )
2729 {
2730 	struct int_node *source = rb_entry(nd, struct int_node, rb_node);
2731 	struct stats *stats = source->priv;
2732 
2733 	entry->syscall = source->i;
2734 	entry->stats   = stats;
2735 	entry->msecs   = stats ? (u64)stats->n * (avg_stats(stats) / NSEC_PER_MSEC) : 0;
2736 }
2737 
2738 static size_t thread__dump_stats(struct thread_trace *ttrace,
2739 				 struct trace *trace, FILE *fp)
2740 {
2741 	size_t printed = 0;
2742 	struct syscall *sc;
2743 	struct rb_node *nd;
2744 	DECLARE_RESORT_RB_INTLIST(syscall_stats, ttrace->syscall_stats);
2745 
2746 	if (syscall_stats == NULL)
2747 		return 0;
2748 
2749 	printed += fprintf(fp, "\n");
2750 
2751 	printed += fprintf(fp, "   syscall            calls    total       min       avg       max      stddev\n");
2752 	printed += fprintf(fp, "                               (msec)    (msec)    (msec)    (msec)        (%%)\n");
2753 	printed += fprintf(fp, "   --------------- -------- --------- --------- --------- ---------     ------\n");
2754 
2755 	resort_rb__for_each_entry(nd, syscall_stats) {
2756 		struct stats *stats = syscall_stats_entry->stats;
2757 		if (stats) {
2758 			double min = (double)(stats->min) / NSEC_PER_MSEC;
2759 			double max = (double)(stats->max) / NSEC_PER_MSEC;
2760 			double avg = avg_stats(stats);
2761 			double pct;
2762 			u64 n = (u64) stats->n;
2763 
2764 			pct = avg ? 100.0 * stddev_stats(stats)/avg : 0.0;
2765 			avg /= NSEC_PER_MSEC;
2766 
2767 			sc = &trace->syscalls.table[syscall_stats_entry->syscall];
2768 			printed += fprintf(fp, "   %-15s", sc->name);
2769 			printed += fprintf(fp, " %8" PRIu64 " %9.3f %9.3f %9.3f",
2770 					   n, syscall_stats_entry->msecs, min, avg);
2771 			printed += fprintf(fp, " %9.3f %9.2f%%\n", max, pct);
2772 		}
2773 	}
2774 
2775 	resort_rb__delete(syscall_stats);
2776 	printed += fprintf(fp, "\n\n");
2777 
2778 	return printed;
2779 }
2780 
2781 static size_t trace__fprintf_thread(FILE *fp, struct thread *thread, struct trace *trace)
2782 {
2783 	size_t printed = 0;
2784 	struct thread_trace *ttrace = thread__priv(thread);
2785 	double ratio;
2786 
2787 	if (ttrace == NULL)
2788 		return 0;
2789 
2790 	ratio = (double)ttrace->nr_events / trace->nr_events * 100.0;
2791 
2792 	printed += fprintf(fp, " %s (%d), ", thread__comm_str(thread), thread->tid);
2793 	printed += fprintf(fp, "%lu events, ", ttrace->nr_events);
2794 	printed += fprintf(fp, "%.1f%%", ratio);
2795 	if (ttrace->pfmaj)
2796 		printed += fprintf(fp, ", %lu majfaults", ttrace->pfmaj);
2797 	if (ttrace->pfmin)
2798 		printed += fprintf(fp, ", %lu minfaults", ttrace->pfmin);
2799 	if (trace->sched)
2800 		printed += fprintf(fp, ", %.3f msec\n", ttrace->runtime_ms);
2801 	else if (fputc('\n', fp) != EOF)
2802 		++printed;
2803 
2804 	printed += thread__dump_stats(ttrace, trace, fp);
2805 
2806 	return printed;
2807 }
2808 
2809 static unsigned long thread__nr_events(struct thread_trace *ttrace)
2810 {
2811 	return ttrace ? ttrace->nr_events : 0;
2812 }
2813 
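/*
 * Resorts the machine's threads by number of events, so that the busiest
 * threads come out last in the summary.
 */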
2814 DEFINE_RESORT_RB(threads, (thread__nr_events(a->thread->priv) < thread__nr_events(b->thread->priv)),
2815 	struct thread *thread;
2816 )
2817 {
2818 	entry->thread = rb_entry(nd, struct thread, rb_node);
2819 }
2820 
2821 static size_t trace__fprintf_thread_summary(struct trace *trace, FILE *fp)
2822 {
2823 	size_t printed = trace__fprintf_threads_header(fp);
2824 	struct rb_node *nd;
2825 	int i;
2826 
2827 	for (i = 0; i < THREADS__TABLE_SIZE; i++) {
2828 		DECLARE_RESORT_RB_MACHINE_THREADS(threads, trace->host, i);
2829 
2830 		if (threads == NULL) {
2831 			fprintf(fp, "%s", "Error sorting output by nr_events!\n");
2832 			return 0;
2833 		}
2834 
2835 		resort_rb__for_each_entry(nd, threads)
2836 			printed += trace__fprintf_thread(fp, threads_entry->thread, trace);
2837 
2838 		resort_rb__delete(threads);
2839 	}
2840 	return printed;
2841 }
2842 
2843 static int trace__set_duration(const struct option *opt, const char *str,
2844 			       int unset __maybe_unused)
2845 {
2846 	struct trace *trace = opt->value;
2847 
2848 	trace->duration_filter = atof(str);
2849 	return 0;
2850 }
2851 
2852 static int trace__set_filter_pids(const struct option *opt, const char *str,
2853 				  int unset __maybe_unused)
2854 {
2855 	int ret = -1;
2856 	size_t i;
2857 	struct trace *trace = opt->value;
2858 	/*
2859 	 * FIXME: introduce an intarray class, parse the plain csv and create a
2860 	 * { int nr, int entries[] } struct...
2861 	 */
2862 	struct intlist *list = intlist__new(str);
2863 
2864 	if (list == NULL)
2865 		return -1;
2866 
2867 	i = trace->filter_pids.nr = intlist__nr_entries(list) + 1;
2868 	trace->filter_pids.entries = calloc(i, sizeof(pid_t));
2869 
2870 	if (trace->filter_pids.entries == NULL)
2871 		goto out;
2872 
2873 	trace->filter_pids.entries[0] = getpid();
2874 
2875 	for (i = 1; i < trace->filter_pids.nr; ++i)
2876 		trace->filter_pids.entries[i] = intlist__entry(list, i - 1)->i;
2877 
2878 	intlist__delete(list);
2879 	ret = 0;
2880 out:
2881 	return ret;
2882 }
2883 
2884 static int trace__open_output(struct trace *trace, const char *filename)
2885 {
2886 	struct stat st;
2887 
2888 	if (!stat(filename, &st) && st.st_size) {
2889 		char oldname[PATH_MAX];
2890 
2891 		scnprintf(oldname, sizeof(oldname), "%s.old", filename);
2892 		unlink(oldname);
2893 		rename(filename, oldname);
2894 	}
2895 
2896 	trace->output = fopen(filename, "w");
2897 
2898 	return trace->output == NULL ? -errno : 0;
2899 }
2900 
2901 static int parse_pagefaults(const struct option *opt, const char *str,
2902 			    int unset __maybe_unused)
2903 {
2904 	int *trace_pgfaults = opt->value;
2905 
2906 	if (strcmp(str, "all") == 0)
2907 		*trace_pgfaults |= TRACE_PFMAJ | TRACE_PFMIN;
2908 	else if (strcmp(str, "maj") == 0)
2909 		*trace_pgfaults |= TRACE_PFMAJ;
2910 	else if (strcmp(str, "min") == 0)
2911 		*trace_pgfaults |= TRACE_PFMIN;
2912 	else
2913 		return -1;
2914 
2915 	return 0;
2916 }
2917 
2918 static void evlist__set_evsel_handler(struct perf_evlist *evlist, void *handler)
2919 {
2920 	struct perf_evsel *evsel;
2921 
2922 	evlist__for_each_entry(evlist, evsel)
2923 		evsel->handler = handler;
2924 }
2925 
2926 /*
2927  * XXX: Hackish, just splitting the combined -e+--event (syscalls
2928  * (raw_syscalls:{sys_{enter,exit}}) + events (tracepoints, HW, SW, etc)) to use
2929  * existing facilities unchanged (trace->ev_qualifier + parse_options()).
2930  *
2931  * It'd be better to introduce a parse_options() variant that would return a
2932  * list with the terms it didn't match to an event...
2933  */
2934 static int trace__parse_events_option(const struct option *opt, const char *str,
2935 				      int unset __maybe_unused)
2936 {
2937 	struct trace *trace = (struct trace *)opt->value;
2938 	const char *s = str;
2939 	char *sep = NULL, *lists[2] = { NULL, NULL, };
2940 	int len = strlen(str) + 1, err = -1, list, idx;
2941 	char *strace_groups_dir = system_path(STRACE_GROUPS_DIR);
2942 	char group_name[PATH_MAX];
2943 
2944 	if (strace_groups_dir == NULL)
2945 		return -1;
2946 
2947 	if (*s == '!') {
2948 		++s;
2949 		trace->not_ev_qualifier = true;
2950 	}
2951 
2952 	while (1) {
2953 		if ((sep = strchr(s, ',')) != NULL)
2954 			*sep = '\0';
2955 
2956 		list = 0;
2957 		if (syscalltbl__id(trace->sctbl, s) >= 0 ||
2958 		    syscalltbl__strglobmatch_first(trace->sctbl, s, &idx) >= 0) {
2959 			list = 1;
2960 		} else {
2961 			path__join(group_name, sizeof(group_name), strace_groups_dir, s);
2962 			if (access(group_name, R_OK) == 0)
2963 				list = 1;
2964 		}
2965 
2966 		if (lists[list]) {
2967 			sprintf(lists[list] + strlen(lists[list]), ",%s", s);
2968 		} else {
2969 			lists[list] = malloc(len);
2970 			if (lists[list] == NULL)
2971 				goto out;
2972 			strcpy(lists[list], s);
2973 		}
2974 
2975 		if (!sep)
2976 			break;
2977 
2978 		*sep = ',';
2979 		s = sep + 1;
2980 	}
2981 
2982 	if (lists[1] != NULL) {
2983 		struct strlist_config slist_config = {
2984 			.dirname = strace_groups_dir,
2985 		};
2986 
2987 		trace->ev_qualifier = strlist__new(lists[1], &slist_config);
2988 		if (trace->ev_qualifier == NULL) {
2989 			fputs("Not enough memory to parse event qualifier\n", trace->output);
2990 			goto out;
2991 		}
2992 
2993 		if (trace__validate_ev_qualifier(trace))
2994 			goto out;
2995 	}
2996 
2997 	err = 0;
2998 
2999 	if (lists[0]) {
3000 		struct option o = OPT_CALLBACK('e', "event", &trace->evlist, "event",
3001 					       "event selector. use 'perf list' to list available events",
3002 					       parse_events_option);
3003 		err = parse_events_option(&o, lists[0], 0);
3004 	}
3005 out:
3006 	if (sep)
3007 		*sep = ',';
3008 
3009 	return err;
3010 }
3011 
3012 static int trace__parse_cgroups(const struct option *opt, const char *str, int unset)
3013 {
3014 	struct trace *trace = opt->value;
3015 
3016 	if (!list_empty(&trace->evlist->entries))
3017 		return parse_cgroups(opt, str, unset);
3018 
3019 	trace->cgroup = evlist__findnew_cgroup(trace->evlist, str);
3020 
3021 	return 0;
3022 }
3023 
3024 int cmd_trace(int argc, const char **argv)
3025 {
3026 	const char *trace_usage[] = {
3027 		"perf trace [<options>] [<command>]",
3028 		"perf trace [<options>] -- <command> [<options>]",
3029 		"perf trace record [<options>] [<command>]",
3030 		"perf trace record [<options>] -- <command> [<options>]",
3031 		NULL
3032 	};
3033 	struct trace trace = {
3034 		.syscalls = {
3035 			.max = -1,
3036 		},
3037 		.opts = {
3038 			.target = {
3039 				.uid	   = UINT_MAX,
3040 				.uses_mmap = true,
3041 			},
3042 			.user_freq     = UINT_MAX,
3043 			.user_interval = ULLONG_MAX,
3044 			.no_buffering  = true,
3045 			.mmap_pages    = UINT_MAX,
3046 			.proc_map_timeout  = 500,
3047 		},
3048 		.output = stderr,
3049 		.show_comm = true,
3050 		.trace_syscalls = true,
3051 		.kernel_syscallchains = false,
3052 		.max_stack = UINT_MAX,
3053 	};
3054 	const char *output_name = NULL;
3055 	const struct option trace_options[] = {
3056 	OPT_CALLBACK('e', "event", &trace, "event",
3057 		     "event/syscall selector. use 'perf list' to list available events",
3058 		     trace__parse_events_option),
3059 	OPT_BOOLEAN(0, "comm", &trace.show_comm,
3060 		    "show the thread COMM next to its id"),
3061 	OPT_BOOLEAN(0, "tool_stats", &trace.show_tool_stats, "show tool stats"),
3062 	OPT_CALLBACK(0, "expr", &trace, "expr", "list of syscalls/events to trace",
3063 		     trace__parse_events_option),
3064 	OPT_STRING('o', "output", &output_name, "file", "output file name"),
3065 	OPT_STRING('i', "input", &input_name, "file", "Analyze events in file"),
3066 	OPT_STRING('p', "pid", &trace.opts.target.pid, "pid",
3067 		    "trace events on existing process id"),
3068 	OPT_STRING('t', "tid", &trace.opts.target.tid, "tid",
3069 		    "trace events on existing thread id"),
3070 	OPT_CALLBACK(0, "filter-pids", &trace, "CSV list of pids",
3071 		     "pids to filter (by the kernel)", trace__set_filter_pids),
3072 	OPT_BOOLEAN('a', "all-cpus", &trace.opts.target.system_wide,
3073 		    "system-wide collection from all CPUs"),
3074 	OPT_STRING('C', "cpu", &trace.opts.target.cpu_list, "cpu",
3075 		    "list of cpus to monitor"),
3076 	OPT_BOOLEAN(0, "no-inherit", &trace.opts.no_inherit,
3077 		    "child tasks do not inherit counters"),
3078 	OPT_CALLBACK('m', "mmap-pages", &trace.opts.mmap_pages, "pages",
3079 		     "number of mmap data pages",
3080 		     perf_evlist__parse_mmap_pages),
3081 	OPT_STRING('u', "uid", &trace.opts.target.uid_str, "user",
3082 		   "user to profile"),
3083 	OPT_CALLBACK(0, "duration", &trace, "float",
3084 		     "show only events with duration > N.M ms",
3085 		     trace__set_duration),
3086 	OPT_BOOLEAN(0, "sched", &trace.sched, "show blocking scheduler events"),
3087 	OPT_INCR('v', "verbose", &verbose, "be more verbose"),
3088 	OPT_BOOLEAN('T', "time", &trace.full_time,
3089 		    "Show full timestamp, not time relative to first start"),
3090 	OPT_BOOLEAN('s', "summary", &trace.summary_only,
3091 		    "Show only syscall summary with statistics"),
3092 	OPT_BOOLEAN('S', "with-summary", &trace.summary,
3093 		    "Show all syscalls and summary with statistics"),
3094 	OPT_CALLBACK_DEFAULT('F', "pf", &trace.trace_pgfaults, "all|maj|min",
3095 		     "Trace pagefaults", parse_pagefaults, "maj"),
3096 	OPT_BOOLEAN(0, "syscalls", &trace.trace_syscalls, "Trace syscalls"),
3097 	OPT_BOOLEAN('f', "force", &trace.force, "don't complain, do it"),
3098 	OPT_CALLBACK(0, "call-graph", &trace.opts,
3099 		     "record_mode[,record_size]", record_callchain_help,
3100 		     &record_parse_callchain_opt),
3101 	OPT_BOOLEAN(0, "kernel-syscall-graph", &trace.kernel_syscallchains,
3102 		    "Show the kernel callchains on the syscall exit path"),
3103 	OPT_UINTEGER(0, "min-stack", &trace.min_stack,
3104 		     "Set the minimum stack depth when parsing the callchain, "
3105 		     "anything below the specified depth will be ignored."),
3106 	OPT_UINTEGER(0, "max-stack", &trace.max_stack,
3107 		     "Set the maximum stack depth when parsing the callchain, "
3108 		     "anything beyond the specified depth will be ignored. "
3109 		     "Default: kernel.perf_event_max_stack or " __stringify(PERF_MAX_STACK_DEPTH)),
3110 	OPT_BOOLEAN(0, "print-sample", &trace.print_sample,
3111 			"print the PERF_RECORD_SAMPLE PERF_SAMPLE_ info, for debugging"),
3112 	OPT_UINTEGER(0, "proc-map-timeout", &trace.opts.proc_map_timeout,
3113 			"per thread proc mmap processing timeout in ms"),
3114 	OPT_CALLBACK('G', "cgroup", &trace, "name", "monitor event in cgroup name only",
3115 		     trace__parse_cgroups),
3116 	OPT_UINTEGER('D', "delay", &trace.opts.initial_delay,
3117 		     "ms to wait before starting measurement after program "
3118 		     "start"),
3119 	OPT_END()
3120 	};
3121 	bool __maybe_unused max_stack_user_set = true;
3122 	bool mmap_pages_user_set = true;
3123 	const char * const trace_subcommands[] = { "record", NULL };
3124 	int err;
3125 	char bf[BUFSIZ];
3126 
3127 	signal(SIGSEGV, sighandler_dump_stack);
3128 	signal(SIGFPE, sighandler_dump_stack);
3129 
3130 	trace.evlist = perf_evlist__new();
3131 	trace.sctbl = syscalltbl__new();
3132 
3133 	if (trace.evlist == NULL || trace.sctbl == NULL) {
3134 		pr_err("Not enough memory to run!\n");
3135 		err = -ENOMEM;
3136 		goto out;
3137 	}
3138 
3139 	argc = parse_options_subcommand(argc, argv, trace_options, trace_subcommands,
3140 				 trace_usage, PARSE_OPT_STOP_AT_NON_OPTION);
3141 
3142 	if ((nr_cgroups || trace.cgroup) && !trace.opts.target.system_wide) {
3143 		usage_with_options_msg(trace_usage, trace_options,
3144 				       "cgroup monitoring only available in system-wide mode");
3145 	}
3146 
3147 	err = bpf__setup_stdout(trace.evlist);
3148 	if (err) {
3149 		bpf__strerror_setup_stdout(trace.evlist, err, bf, sizeof(bf));
3150 		pr_err("ERROR: Setup BPF stdout failed: %s\n", bf);
3151 		goto out;
3152 	}
3153 
3154 	err = -1;
3155 
3156 	if (trace.trace_pgfaults) {
3157 		trace.opts.sample_address = true;
3158 		trace.opts.sample_time = true;
3159 	}
3160 
3161 	if (trace.opts.mmap_pages == UINT_MAX)
3162 		mmap_pages_user_set = false;
3163 
3164 	if (trace.max_stack == UINT_MAX) {
3165 		trace.max_stack = input_name ? PERF_MAX_STACK_DEPTH : sysctl_perf_event_max_stack;
3166 		max_stack_user_set = false;
3167 	}
3168 
3169 #ifdef HAVE_DWARF_UNWIND_SUPPORT
3170 	if ((trace.min_stack || max_stack_user_set) && !callchain_param.enabled) {
3171 		record_opts__parse_callchain(&trace.opts, &callchain_param, "dwarf", false);
3172 	}
3173 #endif
3174 
3175 	if (callchain_param.enabled) {
3176 		if (!mmap_pages_user_set && geteuid() == 0)
3177 			trace.opts.mmap_pages = perf_event_mlock_kb_in_pages() * 4;
3178 
3179 		symbol_conf.use_callchain = true;
3180 	}
3181 
3182 	if (trace.evlist->nr_entries > 0)
3183 		evlist__set_evsel_handler(trace.evlist, trace__event_handler);
3184 
3185 	if ((argc >= 1) && (strcmp(argv[0], "record") == 0))
3186 		return trace__record(&trace, argc-1, &argv[1]);
3187 
3188 	/* summary_only implies summary option, but don't overwrite summary if set */
3189 	if (trace.summary_only)
3190 		trace.summary = trace.summary_only;
3191 
3192 	if (!trace.trace_syscalls && !trace.trace_pgfaults &&
3193 	    trace.evlist->nr_entries == 0 /* Was --events used? */) {
3194 		pr_err("Please specify something to trace.\n");
3195 		return -1;
3196 	}
3197 
3198 	if (!trace.trace_syscalls && trace.ev_qualifier) {
3199 		pr_err("The -e option can't be used with --no-syscalls.\n");
3200 		goto out;
3201 	}
3202 
3203 	if (output_name != NULL) {
3204 		err = trace__open_output(&trace, output_name);
3205 		if (err < 0) {
3206 			perror("failed to create output file");
3207 			goto out;
3208 		}
3209 	}
3210 
3211 	trace.open_id = syscalltbl__id(trace.sctbl, "open");
3212 
3213 	err = target__validate(&trace.opts.target);
3214 	if (err) {
3215 		target__strerror(&trace.opts.target, err, bf, sizeof(bf));
3216 		fprintf(trace.output, "%s", bf);
3217 		goto out_close;
3218 	}
3219 
3220 	err = target__parse_uid(&trace.opts.target);
3221 	if (err) {
3222 		target__strerror(&trace.opts.target, err, bf, sizeof(bf));
3223 		fprintf(trace.output, "%s", bf);
3224 		goto out_close;
3225 	}
3226 
3227 	if (!argc && target__none(&trace.opts.target))
3228 		trace.opts.target.system_wide = true;
3229 
3230 	if (input_name)
3231 		err = trace__replay(&trace);
3232 	else
3233 		err = trace__run(&trace, argc, argv);
3234 
3235 out_close:
3236 	if (output_name != NULL)
3237 		fclose(trace.output);
3238 out:
3239 	return err;
3240 }
3241