1 /*
2  * builtin-trace.c
3  *
4  * Builtin 'trace' command:
5  *
6  * Display a continuously updated trace of any workload, CPU, specific PID,
7  * system wide, etc.  Default format is loosely strace-like, but any other
8  * event may be specified using --event.
9  *
10  * Copyright (C) 2012, 2013, 2014, 2015 Red Hat Inc, Arnaldo Carvalho de Melo <acme@redhat.com>
11  *
12  * Initially based on the 'trace' prototype by Thomas Gleixner:
13  *
14  * http://lwn.net/Articles/415728/ ("Announcing a new utility: 'trace'")
15  *
16  * Released under the GPL v2. (and only v2, not any later version)
17  */
18 
19 #include <traceevent/event-parse.h>
20 #include <api/fs/tracing_path.h>
21 #include "builtin.h"
22 #include "util/cgroup.h"
23 #include "util/color.h"
24 #include "util/debug.h"
25 #include "util/env.h"
26 #include "util/event.h"
27 #include "util/evlist.h"
28 #include <subcmd/exec-cmd.h>
29 #include "util/machine.h"
30 #include "util/path.h"
31 #include "util/session.h"
32 #include "util/thread.h"
33 #include <subcmd/parse-options.h>
34 #include "util/strlist.h"
35 #include "util/intlist.h"
36 #include "util/thread_map.h"
37 #include "util/stat.h"
38 #include "trace/beauty/beauty.h"
39 #include "trace-event.h"
40 #include "util/parse-events.h"
41 #include "util/bpf-loader.h"
42 #include "callchain.h"
43 #include "print_binary.h"
44 #include "string2.h"
45 #include "syscalltbl.h"
46 #include "rb_resort.h"
47 
48 #include <errno.h>
49 #include <inttypes.h>
50 #include <poll.h>
51 #include <signal.h>
52 #include <stdlib.h>
53 #include <string.h>
54 #include <linux/err.h>
55 #include <linux/filter.h>
56 #include <linux/kernel.h>
57 #include <linux/random.h>
58 #include <linux/stringify.h>
59 #include <linux/time64.h>
60 #include <fcntl.h>
61 
62 #include "sane_ctype.h"
63 
64 #ifndef O_CLOEXEC
65 # define O_CLOEXEC		02000000
66 #endif
67 
68 #ifndef F_LINUX_SPECIFIC_BASE
69 # define F_LINUX_SPECIFIC_BASE	1024
70 #endif
71 
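/*
 * Global state for one 'perf trace' session: the evsels being traced, the
 * target (workload/PIDs/CPUs/system wide), qualifiers and filters, output
 * destination and assorted option flags.
 */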
72 struct trace {
73 	struct perf_tool	tool;
74 	struct syscalltbl	*sctbl;
75 	struct {
76 		int		max;
77 		struct syscall  *table;
78 		struct {
79 			struct perf_evsel *sys_enter,
80 					  *sys_exit,
81 					  *augmented;
82 		}		events;
83 	} syscalls;
84 	struct record_opts	opts;
85 	struct perf_evlist	*evlist;
86 	struct machine		*host;
87 	struct thread		*current;
88 	struct cgroup		*cgroup;
89 	u64			base_time;
90 	FILE			*output;
91 	unsigned long		nr_events;
92 	struct strlist		*ev_qualifier;
93 	struct {
94 		size_t		nr;
95 		int		*entries;
96 	}			ev_qualifier_ids;
97 	struct {
98 		size_t		nr;
99 		pid_t		*entries;
100 	}			filter_pids;
101 	double			duration_filter;
102 	double			runtime_ms;
103 	struct {
104 		u64		vfs_getname,
105 				proc_getname;
106 	} stats;
107 	unsigned int		max_stack;
108 	unsigned int		min_stack;
109 	bool			not_ev_qualifier;
110 	bool			live;
111 	bool			full_time;
112 	bool			sched;
113 	bool			multiple_threads;
114 	bool			summary;
115 	bool			summary_only;
116 	bool			failure_only;
117 	bool			show_comm;
118 	bool			print_sample;
119 	bool			show_tool_stats;
120 	bool			trace_syscalls;
121 	bool			kernel_syscallchains;
122 	bool			force;
123 	bool			vfs_getname;
124 	int			trace_pgfaults;
125 };
126 
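/*
 * One tracepoint field accessor: the field's offset into the raw sample plus
 * a reader callback matching its kind (fixed-width integer or pointer).
 */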
127 struct tp_field {
128 	int offset;
129 	union {
130 		u64 (*integer)(struct tp_field *field, struct perf_sample *sample);
131 		void *(*pointer)(struct tp_field *field, struct perf_sample *sample);
132 	};
133 };
134 
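/* Generates tp_field__u{8,16,32,64}(); memcpy() keeps the read safe on unaligned raw_data. */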
135 #define TP_UINT_FIELD(bits) \
136 static u64 tp_field__u##bits(struct tp_field *field, struct perf_sample *sample) \
137 { \
138 	u##bits value; \
139 	memcpy(&value, sample->raw_data + field->offset, sizeof(value)); \
140 	return value;  \
141 }
142 
143 TP_UINT_FIELD(8);
144 TP_UINT_FIELD(16);
145 TP_UINT_FIELD(32);
146 TP_UINT_FIELD(64);
147 
148 #define TP_UINT_FIELD__SWAPPED(bits) \
149 static u64 tp_field__swapped_u##bits(struct tp_field *field, struct perf_sample *sample) \
150 { \
151 	u##bits value; \
152 	memcpy(&value, sample->raw_data + field->offset, sizeof(value)); \
153 	return bswap_##bits(value);\
154 }
155 
156 TP_UINT_FIELD__SWAPPED(16);
157 TP_UINT_FIELD__SWAPPED(32);
158 TP_UINT_FIELD__SWAPPED(64);
159 
160 static int __tp_field__init_uint(struct tp_field *field, int size, int offset, bool needs_swap)
161 {
162 	field->offset = offset;
163 
164 	switch (size) {
165 	case 1:
166 		field->integer = tp_field__u8;
167 		break;
168 	case 2:
169 		field->integer = needs_swap ? tp_field__swapped_u16 : tp_field__u16;
170 		break;
171 	case 4:
172 		field->integer = needs_swap ? tp_field__swapped_u32 : tp_field__u32;
173 		break;
174 	case 8:
175 		field->integer = needs_swap ? tp_field__swapped_u64 : tp_field__u64;
176 		break;
177 	default:
178 		return -1;
179 	}
180 
181 	return 0;
182 }
183 
184 static int tp_field__init_uint(struct tp_field *field, struct format_field *format_field, bool needs_swap)
185 {
186 	return __tp_field__init_uint(field, format_field->size, format_field->offset, needs_swap);
187 }
188 
189 static void *tp_field__ptr(struct tp_field *field, struct perf_sample *sample)
190 {
191 	return sample->raw_data + field->offset;
192 }
193 
194 static int __tp_field__init_ptr(struct tp_field *field, int offset)
195 {
196 	field->offset = offset;
197 	field->pointer = tp_field__ptr;
198 	return 0;
199 }
200 
201 static int tp_field__init_ptr(struct tp_field *field, struct format_field *format_field)
202 {
203 	return __tp_field__init_ptr(field, format_field->offset);
204 }
205 
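/*
 * Per-evsel view of a raw_syscalls tracepoint: the syscall id field plus,
 * depending on the direction, the args payload (sys_enter) or the return
 * value (sys_exit) -- hence the union.
 */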
206 struct syscall_tp {
207 	struct tp_field id;
208 	union {
209 		struct tp_field args, ret;
210 	};
211 };
212 
213 static int perf_evsel__init_tp_uint_field(struct perf_evsel *evsel,
214 					  struct tp_field *field,
215 					  const char *name)
216 {
217 	struct format_field *format_field = perf_evsel__field(evsel, name);
218 
219 	if (format_field == NULL)
220 		return -1;
221 
222 	return tp_field__init_uint(field, format_field, evsel->needs_swap);
223 }
224 
225 #define perf_evsel__init_sc_tp_uint_field(evsel, name) \
226 	({ struct syscall_tp *sc = evsel->priv;\
227 	   perf_evsel__init_tp_uint_field(evsel, &sc->name, #name); })
228 
229 static int perf_evsel__init_tp_ptr_field(struct perf_evsel *evsel,
230 					 struct tp_field *field,
231 					 const char *name)
232 {
233 	struct format_field *format_field = perf_evsel__field(evsel, name);
234 
235 	if (format_field == NULL)
236 		return -1;
237 
238 	return tp_field__init_ptr(field, format_field);
239 }
240 
241 #define perf_evsel__init_sc_tp_ptr_field(evsel, name) \
242 	({ struct syscall_tp *sc = evsel->priv;\
243 	   perf_evsel__init_tp_ptr_field(evsel, &sc->name, #name); })
244 
245 static void perf_evsel__delete_priv(struct perf_evsel *evsel)
246 {
247 	zfree(&evsel->priv);
248 	perf_evsel__delete(evsel);
249 }
250 
251 static int perf_evsel__init_syscall_tp(struct perf_evsel *evsel)
252 {
253 	struct syscall_tp *sc = evsel->priv = malloc(sizeof(struct syscall_tp));
254 
255 	if (evsel->priv != NULL) {
256 		if (perf_evsel__init_tp_uint_field(evsel, &sc->id, "__syscall_nr"))
257 			goto out_delete;
258 		return 0;
259 	}
260 
261 	return -ENOMEM;
262 out_delete:
263 	zfree(&evsel->priv);
264 	return -ENOENT;
265 }
266 
267 static int perf_evsel__init_augmented_syscall_tp(struct perf_evsel *evsel)
268 {
269 	struct syscall_tp *sc = evsel->priv = malloc(sizeof(struct syscall_tp));
270 
271 	if (evsel->priv != NULL) {       /* field, sizeof_field, offsetof_field */
272 		if (__tp_field__init_uint(&sc->id, sizeof(long), sizeof(long long), evsel->needs_swap))
273 			goto out_delete;
274 
275 		return 0;
276 	}
277 
278 	return -ENOMEM;
279 out_delete:
280 	zfree(&evsel->priv);
281 	return -EINVAL;
282 }
283 
284 static int perf_evsel__init_augmented_syscall_tp_args(struct perf_evsel *evsel)
285 {
286 	struct syscall_tp *sc = evsel->priv;
287 
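	/* The augmented args payload starts right after the u64 syscall id. */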
288 	return __tp_field__init_ptr(&sc->args, sc->id.offset + sizeof(u64));
289 }
290 
291 static int perf_evsel__init_raw_syscall_tp(struct perf_evsel *evsel, void *handler)
292 {
293 	evsel->priv = malloc(sizeof(struct syscall_tp));
294 	if (evsel->priv != NULL) {
295 		if (perf_evsel__init_sc_tp_uint_field(evsel, id))
296 			goto out_delete;
297 
298 		evsel->handler = handler;
299 		return 0;
300 	}
301 
302 	return -ENOMEM;
303 
304 out_delete:
305 	zfree(&evsel->priv);
306 	return -ENOENT;
307 }
308 
309 static struct perf_evsel *perf_evsel__raw_syscall_newtp(const char *direction, void *handler)
310 {
311 	struct perf_evsel *evsel = perf_evsel__newtp("raw_syscalls", direction);
312 
313 	/* older kernels (e.g., RHEL6) use syscalls:{enter,exit} */
314 	if (IS_ERR(evsel))
315 		evsel = perf_evsel__newtp("syscalls", direction);
316 
317 	if (IS_ERR(evsel))
318 		return NULL;
319 
320 	if (perf_evsel__init_raw_syscall_tp(evsel, handler))
321 		goto out_delete;
322 
323 	return evsel;
324 
325 out_delete:
326 	perf_evsel__delete_priv(evsel);
327 	return NULL;
328 }
329 
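/* Statement-expression accessors: read a syscall_tp field out of this sample via evsel->priv. */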
330 #define perf_evsel__sc_tp_uint(evsel, name, sample) \
331 	({ struct syscall_tp *fields = evsel->priv; \
332 	   fields->name.integer(&fields->name, sample); })
333 
334 #define perf_evsel__sc_tp_ptr(evsel, name, sample) \
335 	({ struct syscall_tp *fields = evsel->priv; \
336 	   fields->name.pointer(&fields->name, sample); })
337 
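/*
 * Map an integer value to its string in a strarray; e.g. with the itimers
 * array below, a val of 1 prints "VIRTUAL", while out-of-range or NULL
 * entries fall back to the intfmt (usually "%d").
 */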
338 size_t strarray__scnprintf(struct strarray *sa, char *bf, size_t size, const char *intfmt, int val)
339 {
340 	int idx = val - sa->offset;
341 
342 	if (idx < 0 || idx >= sa->nr_entries || sa->entries[idx] == NULL)
343 		return scnprintf(bf, size, intfmt, val);
344 
345 	return scnprintf(bf, size, "%s", sa->entries[idx]);
346 }
347 
348 static size_t __syscall_arg__scnprintf_strarray(char *bf, size_t size,
349 						const char *intfmt,
350 					        struct syscall_arg *arg)
351 {
352 	return strarray__scnprintf(arg->parm, bf, size, intfmt, arg->val);
353 }
354 
355 static size_t syscall_arg__scnprintf_strarray(char *bf, size_t size,
356 					      struct syscall_arg *arg)
357 {
358 	return __syscall_arg__scnprintf_strarray(bf, size, "%d", arg);
359 }
360 
361 #define SCA_STRARRAY syscall_arg__scnprintf_strarray
362 
363 struct strarrays {
364 	int		nr_entries;
365 	struct strarray **entries;
366 };
367 
368 #define DEFINE_STRARRAYS(array) struct strarrays strarrays__##array = { \
369 	.nr_entries = ARRAY_SIZE(array), \
370 	.entries = array, \
371 }
372 
373 size_t syscall_arg__scnprintf_strarrays(char *bf, size_t size,
374 					struct syscall_arg *arg)
375 {
376 	struct strarrays *sas = arg->parm;
377 	int i;
378 
379 	for (i = 0; i < sas->nr_entries; ++i) {
380 		struct strarray *sa = sas->entries[i];
381 		int idx = arg->val - sa->offset;
382 
383 		if (idx >= 0 && idx < sa->nr_entries) {
384 			if (sa->entries[idx] == NULL)
385 				break;
386 			return scnprintf(bf, size, "%s", sa->entries[idx]);
387 		}
388 	}
389 
390 	return scnprintf(bf, size, "%d", arg->val);
391 }
392 
393 #ifndef AT_FDCWD
394 #define AT_FDCWD	-100
395 #endif
396 
397 static size_t syscall_arg__scnprintf_fd_at(char *bf, size_t size,
398 					   struct syscall_arg *arg)
399 {
400 	int fd = arg->val;
401 
402 	if (fd == AT_FDCWD)
403 		return scnprintf(bf, size, "CWD");
404 
405 	return syscall_arg__scnprintf_fd(bf, size, arg);
406 }
407 
408 #define SCA_FDAT syscall_arg__scnprintf_fd_at
409 
410 static size_t syscall_arg__scnprintf_close_fd(char *bf, size_t size,
411 					      struct syscall_arg *arg);
412 
413 #define SCA_CLOSE_FD syscall_arg__scnprintf_close_fd
414 
415 size_t syscall_arg__scnprintf_hex(char *bf, size_t size, struct syscall_arg *arg)
416 {
417 	return scnprintf(bf, size, "%#lx", arg->val);
418 }
419 
420 size_t syscall_arg__scnprintf_int(char *bf, size_t size, struct syscall_arg *arg)
421 {
422 	return scnprintf(bf, size, "%d", arg->val);
423 }
424 
425 size_t syscall_arg__scnprintf_long(char *bf, size_t size, struct syscall_arg *arg)
426 {
427 	return scnprintf(bf, size, "%ld", arg->val);
428 }
429 
430 static const char *bpf_cmd[] = {
431 	"MAP_CREATE", "MAP_LOOKUP_ELEM", "MAP_UPDATE_ELEM", "MAP_DELETE_ELEM",
432 	"MAP_GET_NEXT_KEY", "PROG_LOAD",
433 };
434 static DEFINE_STRARRAY(bpf_cmd);
435 
436 static const char *epoll_ctl_ops[] = { "ADD", "DEL", "MOD", };
437 static DEFINE_STRARRAY_OFFSET(epoll_ctl_ops, 1);
438 
439 static const char *itimers[] = { "REAL", "VIRTUAL", "PROF", };
440 static DEFINE_STRARRAY(itimers);
441 
442 static const char *keyctl_options[] = {
443 	"GET_KEYRING_ID", "JOIN_SESSION_KEYRING", "UPDATE", "REVOKE", "CHOWN",
444 	"SETPERM", "DESCRIBE", "CLEAR", "LINK", "UNLINK", "SEARCH", "READ",
445 	"INSTANTIATE", "NEGATE", "SET_REQKEY_KEYRING", "SET_TIMEOUT",
446 	"ASSUME_AUTHORITY", "GET_SECURITY", "SESSION_TO_PARENT", "REJECT",
447 	"INSTANTIATE_IOV", "INVALIDATE", "GET_PERSISTENT",
448 };
449 static DEFINE_STRARRAY(keyctl_options);
450 
451 static const char *whences[] = { "SET", "CUR", "END",
452 #ifdef SEEK_DATA
453 "DATA",
454 #endif
455 #ifdef SEEK_HOLE
456 "HOLE",
457 #endif
458 };
459 static DEFINE_STRARRAY(whences);
460 
461 static const char *fcntl_cmds[] = {
462 	"DUPFD", "GETFD", "SETFD", "GETFL", "SETFL", "GETLK", "SETLK",
463 	"SETLKW", "SETOWN", "GETOWN", "SETSIG", "GETSIG", "GETLK64",
464 	"SETLK64", "SETLKW64", "SETOWN_EX", "GETOWN_EX",
465 	"GETOWNER_UIDS",
466 };
467 static DEFINE_STRARRAY(fcntl_cmds);
468 
469 static const char *fcntl_linux_specific_cmds[] = {
470 	"SETLEASE", "GETLEASE", "NOTIFY", [5] =	"CANCELLK", "DUPFD_CLOEXEC",
471 	"SETPIPE_SZ", "GETPIPE_SZ", "ADD_SEALS", "GET_SEALS",
472 	"GET_RW_HINT", "SET_RW_HINT", "GET_FILE_RW_HINT", "SET_FILE_RW_HINT",
473 };
474 
475 static DEFINE_STRARRAY_OFFSET(fcntl_linux_specific_cmds, F_LINUX_SPECIFIC_BASE);
476 
477 static struct strarray *fcntl_cmds_arrays[] = {
478 	&strarray__fcntl_cmds,
479 	&strarray__fcntl_linux_specific_cmds,
480 };
481 
482 static DEFINE_STRARRAYS(fcntl_cmds_arrays);
483 
484 static const char *rlimit_resources[] = {
485 	"CPU", "FSIZE", "DATA", "STACK", "CORE", "RSS", "NPROC", "NOFILE",
486 	"MEMLOCK", "AS", "LOCKS", "SIGPENDING", "MSGQUEUE", "NICE", "RTPRIO",
487 	"RTTIME",
488 };
489 static DEFINE_STRARRAY(rlimit_resources);
490 
491 static const char *sighow[] = { "BLOCK", "UNBLOCK", "SETMASK", };
492 static DEFINE_STRARRAY(sighow);
493 
494 static const char *clockid[] = {
495 	"REALTIME", "MONOTONIC", "PROCESS_CPUTIME_ID", "THREAD_CPUTIME_ID",
496 	"MONOTONIC_RAW", "REALTIME_COARSE", "MONOTONIC_COARSE", "BOOTTIME",
497 	"REALTIME_ALARM", "BOOTTIME_ALARM", "SGI_CYCLE", "TAI"
498 };
499 static DEFINE_STRARRAY(clockid);
500 
501 static const char *socket_families[] = {
502 	"UNSPEC", "LOCAL", "INET", "AX25", "IPX", "APPLETALK", "NETROM",
503 	"BRIDGE", "ATMPVC", "X25", "INET6", "ROSE", "DECnet", "NETBEUI",
504 	"SECURITY", "KEY", "NETLINK", "PACKET", "ASH", "ECONET", "ATMSVC",
505 	"RDS", "SNA", "IRDA", "PPPOX", "WANPIPE", "LLC", "IB", "CAN", "TIPC",
506 	"BLUETOOTH", "IUCV", "RXRPC", "ISDN", "PHONET", "IEEE802154", "CAIF",
507 	"ALG", "NFC", "VSOCK",
508 };
509 static DEFINE_STRARRAY(socket_families);
510 
511 static size_t syscall_arg__scnprintf_access_mode(char *bf, size_t size,
512 						 struct syscall_arg *arg)
513 {
514 	size_t printed = 0;
515 	int mode = arg->val;
516 
517 	if (mode == F_OK) /* 0 */
518 		return scnprintf(bf, size, "F");
519 #define	P_MODE(n) \
520 	if (mode & n##_OK) { \
521 		printed += scnprintf(bf + printed, size - printed, "%s", #n); \
522 		mode &= ~n##_OK; \
523 	}
524 
525 	P_MODE(R);
526 	P_MODE(W);
527 	P_MODE(X);
528 #undef P_MODE
529 
530 	if (mode)
531 		printed += scnprintf(bf + printed, size - printed, "|%#x", mode);
532 
533 	return printed;
534 }
535 
536 #define SCA_ACCMODE syscall_arg__scnprintf_access_mode
537 
538 static size_t syscall_arg__scnprintf_filename(char *bf, size_t size,
539 					      struct syscall_arg *arg);
540 
541 #define SCA_FILENAME syscall_arg__scnprintf_filename
542 
543 static size_t syscall_arg__scnprintf_pipe_flags(char *bf, size_t size,
544 						struct syscall_arg *arg)
545 {
546 	int printed = 0, flags = arg->val;
547 
548 #define	P_FLAG(n) \
549 	if (flags & O_##n) { \
550 		printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
551 		flags &= ~O_##n; \
552 	}
553 
554 	P_FLAG(CLOEXEC);
555 	P_FLAG(NONBLOCK);
556 #undef P_FLAG
557 
558 	if (flags)
559 		printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);
560 
561 	return printed;
562 }
563 
564 #define SCA_PIPE_FLAGS syscall_arg__scnprintf_pipe_flags
565 
566 #ifndef GRND_NONBLOCK
567 #define GRND_NONBLOCK	0x0001
568 #endif
569 #ifndef GRND_RANDOM
570 #define GRND_RANDOM	0x0002
571 #endif
572 
573 static size_t syscall_arg__scnprintf_getrandom_flags(char *bf, size_t size,
574 						   struct syscall_arg *arg)
575 {
576 	int printed = 0, flags = arg->val;
577 
578 #define	P_FLAG(n) \
579 	if (flags & GRND_##n) { \
580 		printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
581 		flags &= ~GRND_##n; \
582 	}
583 
584 	P_FLAG(RANDOM);
585 	P_FLAG(NONBLOCK);
586 #undef P_FLAG
587 
588 	if (flags)
589 		printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);
590 
591 	return printed;
592 }
593 
594 #define SCA_GETRANDOM_FLAGS syscall_arg__scnprintf_getrandom_flags
595 
596 #define STRARRAY(name, array) \
597 	  { .scnprintf	= SCA_STRARRAY, \
598 	    .parm	= &strarray__##array, }
599 
600 #include "trace/beauty/arch_errno_names.c"
601 #include "trace/beauty/eventfd.c"
602 #include "trace/beauty/futex_op.c"
603 #include "trace/beauty/futex_val3.c"
604 #include "trace/beauty/mmap.c"
605 #include "trace/beauty/mode_t.c"
606 #include "trace/beauty/msg_flags.c"
607 #include "trace/beauty/open_flags.c"
608 #include "trace/beauty/perf_event_open.c"
609 #include "trace/beauty/pid.c"
610 #include "trace/beauty/sched_policy.c"
611 #include "trace/beauty/seccomp.c"
612 #include "trace/beauty/signum.c"
613 #include "trace/beauty/socket_type.c"
614 #include "trace/beauty/waitid_options.c"
615 
616 struct syscall_arg_fmt {
617 	size_t	   (*scnprintf)(char *bf, size_t size, struct syscall_arg *arg);
618 	void	   *parm;
619 	const char *name;
620 	bool	   show_zero;
621 };
622 
623 static struct syscall_fmt {
624 	const char *name;
625 	const char *alias;
626 	struct syscall_arg_fmt arg[6];
627 	u8	   nr_args;
628 	bool	   errpid;
629 	bool	   timeout;
630 	bool	   hexret;
631 } syscall_fmts[] = {
632 	{ .name	    = "access",
633 	  .arg = { [1] = { .scnprintf = SCA_ACCMODE,  /* mode */ }, }, },
634 	{ .name	    = "bpf",
635 	  .arg = { [0] = STRARRAY(cmd, bpf_cmd), }, },
636 	{ .name	    = "brk",	    .hexret = true,
637 	  .arg = { [0] = { .scnprintf = SCA_HEX, /* brk */ }, }, },
638 	{ .name     = "clock_gettime",
639 	  .arg = { [0] = STRARRAY(clk_id, clockid), }, },
640 	{ .name	    = "clone",	    .errpid = true, .nr_args = 5,
641 	  .arg = { [0] = { .name = "flags",	    .scnprintf = SCA_CLONE_FLAGS, },
642 		   [1] = { .name = "child_stack",   .scnprintf = SCA_HEX, },
643 		   [2] = { .name = "parent_tidptr", .scnprintf = SCA_HEX, },
644 		   [3] = { .name = "child_tidptr",  .scnprintf = SCA_HEX, },
645 		   [4] = { .name = "tls",	    .scnprintf = SCA_HEX, }, }, },
646 	{ .name	    = "close",
647 	  .arg = { [0] = { .scnprintf = SCA_CLOSE_FD, /* fd */ }, }, },
648 	{ .name	    = "epoll_ctl",
649 	  .arg = { [1] = STRARRAY(op, epoll_ctl_ops), }, },
650 	{ .name	    = "eventfd2",
651 	  .arg = { [1] = { .scnprintf = SCA_EFD_FLAGS, /* flags */ }, }, },
652 	{ .name	    = "fchmodat",
653 	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, },
654 	{ .name	    = "fchownat",
655 	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, },
656 	{ .name	    = "fcntl",
657 	  .arg = { [1] = { .scnprintf = SCA_FCNTL_CMD, /* cmd */
658 			   .parm      = &strarrays__fcntl_cmds_arrays,
659 			   .show_zero = true, },
660 		   [2] = { .scnprintf =  SCA_FCNTL_ARG, /* arg */ }, }, },
661 	{ .name	    = "flock",
662 	  .arg = { [1] = { .scnprintf = SCA_FLOCK, /* cmd */ }, }, },
663 	{ .name	    = "fstat", .alias = "newfstat", },
664 	{ .name	    = "fstatat", .alias = "newfstatat", },
665 	{ .name	    = "futex",
666 	  .arg = { [1] = { .scnprintf = SCA_FUTEX_OP, /* op */ },
667 		   [5] = { .scnprintf = SCA_FUTEX_VAL3, /* val3 */ }, }, },
668 	{ .name	    = "futimesat",
669 	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, },
670 	{ .name	    = "getitimer",
671 	  .arg = { [0] = STRARRAY(which, itimers), }, },
672 	{ .name	    = "getpid",	    .errpid = true, },
673 	{ .name	    = "getpgid",    .errpid = true, },
674 	{ .name	    = "getppid",    .errpid = true, },
675 	{ .name	    = "getrandom",
676 	  .arg = { [2] = { .scnprintf = SCA_GETRANDOM_FLAGS, /* flags */ }, }, },
677 	{ .name	    = "getrlimit",
678 	  .arg = { [0] = STRARRAY(resource, rlimit_resources), }, },
679 	{ .name	    = "gettid",	    .errpid = true, },
680 	{ .name	    = "ioctl",
681 	  .arg = {
682 #if defined(__i386__) || defined(__x86_64__)
683 /*
684  * FIXME: Make this available to all arches.
685  */
686 		   [1] = { .scnprintf = SCA_IOCTL_CMD, /* cmd */ },
687 		   [2] = { .scnprintf = SCA_HEX, /* arg */ }, }, },
688 #else
689 		   [2] = { .scnprintf = SCA_HEX, /* arg */ }, }, },
690 #endif
691 	{ .name	    = "kcmp",	    .nr_args = 5,
692 	  .arg = { [0] = { .name = "pid1",	.scnprintf = SCA_PID, },
693 		   [1] = { .name = "pid2",	.scnprintf = SCA_PID, },
694 		   [2] = { .name = "type",	.scnprintf = SCA_KCMP_TYPE, },
695 		   [3] = { .name = "idx1",	.scnprintf = SCA_KCMP_IDX, },
696 		   [4] = { .name = "idx2",	.scnprintf = SCA_KCMP_IDX, }, }, },
697 	{ .name	    = "keyctl",
698 	  .arg = { [0] = STRARRAY(option, keyctl_options), }, },
699 	{ .name	    = "kill",
700 	  .arg = { [1] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
701 	{ .name	    = "linkat",
702 	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, },
703 	{ .name	    = "lseek",
704 	  .arg = { [2] = STRARRAY(whence, whences), }, },
705 	{ .name	    = "lstat", .alias = "newlstat", },
706 	{ .name     = "madvise",
707 	  .arg = { [0] = { .scnprintf = SCA_HEX,      /* start */ },
708 		   [2] = { .scnprintf = SCA_MADV_BHV, /* behavior */ }, }, },
709 	{ .name	    = "mkdirat",
710 	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, },
711 	{ .name	    = "mknodat",
712 	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, },
713 	{ .name	    = "mlock",
714 	  .arg = { [0] = { .scnprintf = SCA_HEX, /* addr */ }, }, },
715 	{ .name	    = "mlockall",
716 	  .arg = { [0] = { .scnprintf = SCA_HEX, /* addr */ }, }, },
717 	{ .name	    = "mmap",	    .hexret = true,
718 /* The standard mmap maps to old_mmap on s390x */
719 #if defined(__s390x__)
720 	.alias = "old_mmap",
721 #endif
722 	  .arg = { [0] = { .scnprintf = SCA_HEX,	/* addr */ },
723 		   [2] = { .scnprintf = SCA_MMAP_PROT,	/* prot */ },
724 		   [3] = { .scnprintf = SCA_MMAP_FLAGS,	/* flags */ }, }, },
725 	{ .name	    = "mprotect",
726 	  .arg = { [0] = { .scnprintf = SCA_HEX,	/* start */ },
727 		   [2] = { .scnprintf = SCA_MMAP_PROT,	/* prot */ }, }, },
728 	{ .name	    = "mq_unlink",
729 	  .arg = { [0] = { .scnprintf = SCA_FILENAME, /* u_name */ }, }, },
730 	{ .name	    = "mremap",	    .hexret = true,
731 	  .arg = { [0] = { .scnprintf = SCA_HEX,	  /* addr */ },
732 		   [3] = { .scnprintf = SCA_MREMAP_FLAGS, /* flags */ },
733 		   [4] = { .scnprintf = SCA_HEX,	  /* new_addr */ }, }, },
734 	{ .name	    = "munlock",
735 	  .arg = { [0] = { .scnprintf = SCA_HEX, /* addr */ }, }, },
736 	{ .name	    = "munmap",
737 	  .arg = { [0] = { .scnprintf = SCA_HEX, /* addr */ }, }, },
738 	{ .name	    = "name_to_handle_at",
739 	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, },
740 	{ .name	    = "newfstatat",
741 	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, },
742 	{ .name	    = "open",
743 	  .arg = { [1] = { .scnprintf = SCA_OPEN_FLAGS, /* flags */ }, }, },
744 	{ .name	    = "open_by_handle_at",
745 	  .arg = { [0] = { .scnprintf = SCA_FDAT,	/* dfd */ },
746 		   [2] = { .scnprintf = SCA_OPEN_FLAGS, /* flags */ }, }, },
747 	{ .name	    = "openat",
748 	  .arg = { [0] = { .scnprintf = SCA_FDAT,	/* dfd */ },
749 		   [2] = { .scnprintf = SCA_OPEN_FLAGS, /* flags */ }, }, },
750 	{ .name	    = "perf_event_open",
751 	  .arg = { [2] = { .scnprintf = SCA_INT,	/* cpu */ },
752 		   [3] = { .scnprintf = SCA_FD,		/* group_fd */ },
753 		   [4] = { .scnprintf = SCA_PERF_FLAGS, /* flags */ }, }, },
754 	{ .name	    = "pipe2",
755 	  .arg = { [1] = { .scnprintf = SCA_PIPE_FLAGS, /* flags */ }, }, },
756 	{ .name	    = "pkey_alloc",
757 	  .arg = { [1] = { .scnprintf = SCA_PKEY_ALLOC_ACCESS_RIGHTS,	/* access_rights */ }, }, },
758 	{ .name	    = "pkey_free",
759 	  .arg = { [0] = { .scnprintf = SCA_INT,	/* key */ }, }, },
760 	{ .name	    = "pkey_mprotect",
761 	  .arg = { [0] = { .scnprintf = SCA_HEX,	/* start */ },
762 		   [2] = { .scnprintf = SCA_MMAP_PROT,	/* prot */ },
763 		   [3] = { .scnprintf = SCA_INT,	/* pkey */ }, }, },
764 	{ .name	    = "poll", .timeout = true, },
765 	{ .name	    = "ppoll", .timeout = true, },
766 	{ .name	    = "prctl", .alias = "arch_prctl",
767 	  .arg = { [0] = { .scnprintf = SCA_PRCTL_OPTION, /* option */ },
768 		   [1] = { .scnprintf = SCA_PRCTL_ARG2, /* arg2 */ },
769 		   [2] = { .scnprintf = SCA_PRCTL_ARG3, /* arg3 */ }, }, },
770 	{ .name	    = "pread", .alias = "pread64", },
771 	{ .name	    = "preadv", .alias = "pread", },
772 	{ .name	    = "prlimit64",
773 	  .arg = { [1] = STRARRAY(resource, rlimit_resources), }, },
774 	{ .name	    = "pwrite", .alias = "pwrite64", },
775 	{ .name	    = "readlinkat",
776 	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, },
777 	{ .name	    = "recvfrom",
778 	  .arg = { [3] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, },
779 	{ .name	    = "recvmmsg",
780 	  .arg = { [3] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, },
781 	{ .name	    = "recvmsg",
782 	  .arg = { [2] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, },
783 	{ .name	    = "renameat",
784 	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, },
785 	{ .name	    = "rt_sigaction",
786 	  .arg = { [0] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
787 	{ .name	    = "rt_sigprocmask",
788 	  .arg = { [0] = STRARRAY(how, sighow), }, },
789 	{ .name	    = "rt_sigqueueinfo",
790 	  .arg = { [1] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
791 	{ .name	    = "rt_tgsigqueueinfo",
792 	  .arg = { [2] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
793 	{ .name	    = "sched_setscheduler",
794 	  .arg = { [1] = { .scnprintf = SCA_SCHED_POLICY, /* policy */ }, }, },
795 	{ .name	    = "seccomp",
796 	  .arg = { [0] = { .scnprintf = SCA_SECCOMP_OP,	   /* op */ },
797 		   [1] = { .scnprintf = SCA_SECCOMP_FLAGS, /* flags */ }, }, },
798 	{ .name	    = "select", .timeout = true, },
799 	{ .name	    = "sendmmsg",
800 	  .arg = { [3] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, },
801 	{ .name	    = "sendmsg",
802 	  .arg = { [2] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, },
803 	{ .name	    = "sendto",
804 	  .arg = { [3] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, },
805 	{ .name	    = "set_tid_address", .errpid = true, },
806 	{ .name	    = "setitimer",
807 	  .arg = { [0] = STRARRAY(which, itimers), }, },
808 	{ .name	    = "setrlimit",
809 	  .arg = { [0] = STRARRAY(resource, rlimit_resources), }, },
810 	{ .name	    = "socket",
811 	  .arg = { [0] = STRARRAY(family, socket_families),
812 		   [1] = { .scnprintf = SCA_SK_TYPE, /* type */ },
813 		   [2] = { .scnprintf = SCA_SK_PROTO, /* protocol */ }, }, },
814 	{ .name	    = "socketpair",
815 	  .arg = { [0] = STRARRAY(family, socket_families),
816 		   [1] = { .scnprintf = SCA_SK_TYPE, /* type */ },
817 		   [2] = { .scnprintf = SCA_SK_PROTO, /* protocol */ }, }, },
818 	{ .name	    = "stat", .alias = "newstat", },
819 	{ .name	    = "statx",
820 	  .arg = { [0] = { .scnprintf = SCA_FDAT,	 /* fdat */ },
821 		   [2] = { .scnprintf = SCA_STATX_FLAGS, /* flags */ } ,
822 		   [3] = { .scnprintf = SCA_STATX_MASK,	 /* mask */ }, }, },
823 	{ .name	    = "swapoff",
824 	  .arg = { [0] = { .scnprintf = SCA_FILENAME, /* specialfile */ }, }, },
825 	{ .name	    = "swapon",
826 	  .arg = { [0] = { .scnprintf = SCA_FILENAME, /* specialfile */ }, }, },
827 	{ .name	    = "symlinkat",
828 	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, },
829 	{ .name	    = "tgkill",
830 	  .arg = { [2] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
831 	{ .name	    = "tkill",
832 	  .arg = { [1] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
833 	{ .name	    = "uname", .alias = "newuname", },
834 	{ .name	    = "unlinkat",
835 	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, },
836 	{ .name	    = "utimensat",
837 	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* dirfd */ }, }, },
838 	{ .name	    = "wait4",	    .errpid = true,
839 	  .arg = { [2] = { .scnprintf = SCA_WAITID_OPTIONS, /* options */ }, }, },
840 	{ .name	    = "waitid",	    .errpid = true,
841 	  .arg = { [3] = { .scnprintf = SCA_WAITID_OPTIONS, /* options */ }, }, },
842 };
843 
844 static int syscall_fmt__cmp(const void *name, const void *fmtp)
845 {
846 	const struct syscall_fmt *fmt = fmtp;
847 	return strcmp(name, fmt->name);
848 }
849 
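/* syscall_fmts[] above must stay sorted by name: bsearch() depends on it. */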
850 static struct syscall_fmt *syscall_fmt__find(const char *name)
851 {
852 	const int nmemb = ARRAY_SIZE(syscall_fmts);
853 	return bsearch(name, syscall_fmts, nmemb, sizeof(struct syscall_fmt), syscall_fmt__cmp);
854 }
855 
856 /*
857  * is_exit: is this "exit" or "exit_group"?
858  * is_open: is this "open" or "openat"? To associate the fd returned in sys_exit with the pathname in sys_enter.
859  */
860 struct syscall {
861 	struct event_format *tp_format;
862 	int		    nr_args;
863 	bool		    is_exit;
864 	bool		    is_open;
865 	struct format_field *args;
866 	const char	    *name;
867 	struct syscall_fmt  *fmt;
868 	struct syscall_arg_fmt *arg_fmt;
869 };
870 
871 /*
872  * We need to have this 'calculated' boolean because in some cases we really
873  * don't know the duration of a syscall, for instance, when we start a
874  * session and some threads are already waiting for a syscall to finish,
875  * say 'poll', in which case all we can do is print "( ? )" for the duration
876  * and for the start timestamp.
877  */
878 static size_t fprintf_duration(unsigned long t, bool calculated, FILE *fp)
879 {
880 	double duration = (double)t / NSEC_PER_MSEC;
881 	size_t printed = fprintf(fp, "(");
882 
883 	if (!calculated)
884 		printed += fprintf(fp, "         ");
885 	else if (duration >= 1.0)
886 		printed += color_fprintf(fp, PERF_COLOR_RED, "%6.3f ms", duration);
887 	else if (duration >= 0.01)
888 		printed += color_fprintf(fp, PERF_COLOR_YELLOW, "%6.3f ms", duration);
889 	else
890 		printed += color_fprintf(fp, PERF_COLOR_NORMAL, "%6.3f ms", duration);
891 	return printed + fprintf(fp, "): ");
892 }
893 
894 /**
895  * filename.ptr: The filename char pointer that will be vfs_getname'd
896  * filename.entry_str_pos: Where to insert the string translated from
897  *                         filename.ptr by the vfs_getname tracepoint/kprobe.
898  * ret_scnprintf: syscall args may set this to a different syscall return
899  *                formatter, for instance, fcntl may return fds, file flags, etc.
900  */
901 struct thread_trace {
902 	u64		  entry_time;
903 	bool		  entry_pending;
904 	unsigned long	  nr_events;
905 	unsigned long	  pfmaj, pfmin;
906 	char		  *entry_str;
907 	double		  runtime_ms;
908 	size_t		  (*ret_scnprintf)(char *bf, size_t size, struct syscall_arg *arg);
909 	struct {
910 		unsigned long ptr;
911 		short int     entry_str_pos;
912 		bool	      pending_open;
913 		unsigned int  namelen;
914 		char	      *name;
915 	} filename;
916 	struct {
917 		int	  max;
918 		char	  **table;
919 	} paths;
920 
921 	struct intlist *syscall_stats;
922 };
923 
924 static struct thread_trace *thread_trace__new(void)
925 {
926 	struct thread_trace *ttrace = zalloc(sizeof(struct thread_trace));
927 
928 	if (ttrace) {
929 		ttrace->paths.max = -1;
930 		ttrace->syscall_stats = intlist__new(NULL);
931 	}
932 
933 	return ttrace;
934 }
935 
936 static struct thread_trace *thread__trace(struct thread *thread, FILE *fp)
937 {
938 	struct thread_trace *ttrace;
939 
940 	if (thread == NULL)
941 		goto fail;
942 
943 	if (thread__priv(thread) == NULL)
944 		thread__set_priv(thread, thread_trace__new());
945 
946 	if (thread__priv(thread) == NULL)
947 		goto fail;
948 
949 	ttrace = thread__priv(thread);
950 	++ttrace->nr_events;
951 
952 	return ttrace;
953 fail:
954 	color_fprintf(fp, PERF_COLOR_RED,
955 		      "WARNING: not enough memory, dropping samples!\n");
956 	return NULL;
957 }
958 
960 void syscall_arg__set_ret_scnprintf(struct syscall_arg *arg,
961 				    size_t (*ret_scnprintf)(char *bf, size_t size, struct syscall_arg *arg))
962 {
963 	struct thread_trace *ttrace = thread__priv(arg->thread);
964 
965 	ttrace->ret_scnprintf = ret_scnprintf;
966 }
967 
968 #define TRACE_PFMAJ		(1 << 0)
969 #define TRACE_PFMIN		(1 << 1)
970 
971 static const size_t trace__entry_str_size = 2048;
972 
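/* Record fd -> pathname for a thread, growing its table as needed and NULL-filling any gap. */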
973 static int trace__set_fd_pathname(struct thread *thread, int fd, const char *pathname)
974 {
975 	struct thread_trace *ttrace = thread__priv(thread);
976 
977 	if (fd > ttrace->paths.max) {
978 		char **npath = realloc(ttrace->paths.table, (fd + 1) * sizeof(char *));
979 
980 		if (npath == NULL)
981 			return -1;
982 
983 		if (ttrace->paths.max != -1) {
984 			memset(npath + ttrace->paths.max + 1, 0,
985 			       (fd - ttrace->paths.max) * sizeof(char *));
986 		} else {
987 			memset(npath, 0, (fd + 1) * sizeof(char *));
988 		}
989 
990 		ttrace->paths.table = npath;
991 		ttrace->paths.max   = fd;
992 	}
993 
994 	ttrace->paths.table[fd] = strdup(pathname);
995 
996 	return ttrace->paths.table[fd] != NULL ? 0 : -1;
997 }
998 
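/* Resolve an fd to its pathname by readlink()ing /proc/<pid>[/task/<tid>]/fd/<fd>. */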
999 static int thread__read_fd_path(struct thread *thread, int fd)
1000 {
1001 	char linkname[PATH_MAX], pathname[PATH_MAX];
1002 	struct stat st;
1003 	int ret;
1004 
1005 	if (thread->pid_ == thread->tid) {
1006 		scnprintf(linkname, sizeof(linkname),
1007 			  "/proc/%d/fd/%d", thread->pid_, fd);
1008 	} else {
1009 		scnprintf(linkname, sizeof(linkname),
1010 			  "/proc/%d/task/%d/fd/%d", thread->pid_, thread->tid, fd);
1011 	}
1012 
1013 	if (lstat(linkname, &st) < 0 || st.st_size + 1 > (off_t)sizeof(pathname))
1014 		return -1;
1015 
1016 	ret = readlink(linkname, pathname, sizeof(pathname));
1017 
1018 	if (ret < 0 || ret > st.st_size)
1019 		return -1;
1020 
1021 	pathname[ret] = '\0';
1022 	return trace__set_fd_pathname(thread, fd, pathname);
1023 }
1024 
1025 static const char *thread__fd_path(struct thread *thread, int fd,
1026 				   struct trace *trace)
1027 {
1028 	struct thread_trace *ttrace = thread__priv(thread);
1029 
1030 	if (ttrace == NULL)
1031 		return NULL;
1032 
1033 	if (fd < 0)
1034 		return NULL;
1035 
1036 	if (fd > ttrace->paths.max || ttrace->paths.table[fd] == NULL) {
1037 		if (!trace->live)
1038 			return NULL;
1039 		++trace->stats.proc_getname;
1040 		if (thread__read_fd_path(thread, fd))
1041 			return NULL;
1042 	}
1043 
1044 	return ttrace->paths.table[fd];
1045 }
1046 
1047 size_t syscall_arg__scnprintf_fd(char *bf, size_t size, struct syscall_arg *arg)
1048 {
1049 	int fd = arg->val;
1050 	size_t printed = scnprintf(bf, size, "%d", fd);
1051 	const char *path = thread__fd_path(arg->thread, fd, arg->trace);
1052 
1053 	if (path)
1054 		printed += scnprintf(bf + printed, size - printed, "<%s>", path);
1055 
1056 	return printed;
1057 }
1058 
1059 size_t pid__scnprintf_fd(struct trace *trace, pid_t pid, int fd, char *bf, size_t size)
1060 {
1061 	size_t printed = scnprintf(bf, size, "%d", fd);
1062 	struct thread *thread = machine__find_thread(trace->host, pid, pid);
1063 
1064 	if (thread) {
1065 		const char *path = thread__fd_path(thread, fd, trace);
1066 
1067 		if (path)
1068 			printed += scnprintf(bf + printed, size - printed, "<%s>", path);
1069 
1070 		thread__put(thread);
1071 	}
1072 
1073 	return printed;
1074 }
1075 
1076 static size_t syscall_arg__scnprintf_close_fd(char *bf, size_t size,
1077 					      struct syscall_arg *arg)
1078 {
1079 	int fd = arg->val;
1080 	size_t printed = syscall_arg__scnprintf_fd(bf, size, arg);
1081 	struct thread_trace *ttrace = thread__priv(arg->thread);
1082 
1083 	if (ttrace && fd >= 0 && fd <= ttrace->paths.max)
1084 		zfree(&ttrace->paths.table[fd]);
1085 
1086 	return printed;
1087 }
1088 
1089 static void thread__set_filename_pos(struct thread *thread, const char *bf,
1090 				     unsigned long ptr)
1091 {
1092 	struct thread_trace *ttrace = thread__priv(thread);
1093 
1094 	ttrace->filename.ptr = ptr;
1095 	ttrace->filename.entry_str_pos = bf - ttrace->entry_str;
1096 }
1097 
1098 static size_t syscall_arg__scnprintf_filename(char *bf, size_t size,
1099 					      struct syscall_arg *arg)
1100 {
1101 	unsigned long ptr = arg->val;
1102 
1103 	if (!arg->trace->vfs_getname)
1104 		return scnprintf(bf, size, "%#lx", ptr);
1105 
1106 	thread__set_filename_pos(arg->thread, bf, ptr);
1107 	return 0;
1108 }
1109 
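/* True when the syscall took less than the --duration threshold (in ms) and should be skipped. */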
1110 static bool trace__filter_duration(struct trace *trace, double t)
1111 {
1112 	return t < (trace->duration_filter * NSEC_PER_MSEC);
1113 }
1114 
1115 static size_t __trace__fprintf_tstamp(struct trace *trace, u64 tstamp, FILE *fp)
1116 {
1117 	double ts = (double)(tstamp - trace->base_time) / NSEC_PER_MSEC;
1118 
1119 	return fprintf(fp, "%10.3f ", ts);
1120 }
1121 
1122 /*
1123  * We're handling tstamp=0 as an undefined tstamp, i.e. like when we are
1124  * using ttrace->entry_time for a thread that receives a sys_exit without
1125  * first having received a sys_enter ("poll" issued before the tracing
1126  * session starts, or its sys_enter lost to a ring buffer overflow).
1127  */
1128 static size_t trace__fprintf_tstamp(struct trace *trace, u64 tstamp, FILE *fp)
1129 {
1130 	if (tstamp > 0)
1131 		return __trace__fprintf_tstamp(trace, tstamp, fp);
1132 
1133 	return fprintf(fp, "         ? ");
1134 }
1135 
1136 static bool done = false;
1137 static bool interrupted = false;
1138 
1139 static void sig_handler(int sig)
1140 {
1141 	done = true;
1142 	interrupted = sig == SIGINT;
1143 }
1144 
1145 static size_t trace__fprintf_entry_head(struct trace *trace, struct thread *thread,
1146 					u64 duration, bool duration_calculated, u64 tstamp, FILE *fp)
1147 {
1148 	size_t printed = trace__fprintf_tstamp(trace, tstamp, fp);
1149 	printed += fprintf_duration(duration, duration_calculated, fp);
1150 
1151 	if (trace->multiple_threads) {
1152 		if (trace->show_comm)
1153 			printed += fprintf(fp, "%.14s/", thread__comm_str(thread));
1154 		printed += fprintf(fp, "%d ", thread->tid);
1155 	}
1156 
1157 	return printed;
1158 }
1159 
1160 static int trace__process_event(struct trace *trace, struct machine *machine,
1161 				union perf_event *event, struct perf_sample *sample)
1162 {
1163 	int ret = 0;
1164 
1165 	switch (event->header.type) {
1166 	case PERF_RECORD_LOST:
1167 		color_fprintf(trace->output, PERF_COLOR_RED,
1168 			      "LOST %" PRIu64 " events!\n", event->lost.lost);
1169 		ret = machine__process_lost_event(machine, event, sample);
1170 		break;
1171 	default:
1172 		ret = machine__process_event(machine, event, sample);
1173 		break;
1174 	}
1175 
1176 	return ret;
1177 }
1178 
1179 static int trace__tool_process(struct perf_tool *tool,
1180 			       union perf_event *event,
1181 			       struct perf_sample *sample,
1182 			       struct machine *machine)
1183 {
1184 	struct trace *trace = container_of(tool, struct trace, tool);
1185 	return trace__process_event(trace, machine, event, sample);
1186 }
1187 
1188 static char *trace__machine__resolve_kernel_addr(void *vmachine, unsigned long long *addrp, char **modp)
1189 {
1190 	struct machine *machine = vmachine;
1191 
1192 	if (machine->kptr_restrict_warned)
1193 		return NULL;
1194 
1195 	if (symbol_conf.kptr_restrict) {
1196 		pr_warning("Kernel address maps (/proc/{kallsyms,modules}) are restricted.\n\n"
1197 			   "Check /proc/sys/kernel/kptr_restrict.\n\n"
1198 			   "Kernel samples will not be resolved.\n");
1199 		machine->kptr_restrict_warned = true;
1200 		return NULL;
1201 	}
1202 
1203 	return machine__resolve_kernel_addr(vmachine, addrp, modp);
1204 }
1205 
1206 static int trace__symbols_init(struct trace *trace, struct perf_evlist *evlist)
1207 {
1208 	int err = symbol__init(NULL);
1209 
1210 	if (err)
1211 		return err;
1212 
1213 	trace->host = machine__new_host();
1214 	if (trace->host == NULL)
1215 		return -ENOMEM;
1216 
1217 	err = trace_event__register_resolver(trace->host, trace__machine__resolve_kernel_addr);
1218 	if (err < 0)
1219 		goto out;
1220 
1221 	err = __machine__synthesize_threads(trace->host, &trace->tool, &trace->opts.target,
1222 					    evlist->threads, trace__tool_process, false,
1223 					    trace->opts.proc_map_timeout, 1);
1224 out:
1225 	if (err)
1226 		symbol__exit();
1227 
1228 	return err;
1229 }
1230 
1231 static void trace__symbols__exit(struct trace *trace)
1232 {
1233 	machine__exit(trace->host);
1234 	trace->host = NULL;
1235 
1236 	symbol__exit();
1237 }
1238 
1239 static int syscall__alloc_arg_fmts(struct syscall *sc, int nr_args)
1240 {
1241 	int idx;
1242 
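	/*
	 * nr_args == 6 (the syscall argument maximum) is the catch-all passed
	 * in when the tracepoint format couldn't be read; a hand-written
	 * syscall_fmt argument count, when provided, overrides it.
	 */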
1243 	if (nr_args == 6 && sc->fmt && sc->fmt->nr_args != 0)
1244 		nr_args = sc->fmt->nr_args;
1245 
1246 	sc->arg_fmt = calloc(nr_args, sizeof(*sc->arg_fmt));
1247 	if (sc->arg_fmt == NULL)
1248 		return -1;
1249 
1250 	for (idx = 0; idx < nr_args; ++idx) {
1251 		if (sc->fmt)
1252 			sc->arg_fmt[idx] = sc->fmt->arg[idx];
1253 	}
1254 
1255 	sc->nr_args = nr_args;
1256 	return 0;
1257 }
1258 
1259 static int syscall__set_arg_fmts(struct syscall *sc)
1260 {
1261 	struct format_field *field;
1262 	int idx = 0, len;
1263 
1264 	for (field = sc->args; field; field = field->next, ++idx) {
1265 		if (sc->fmt && sc->fmt->arg[idx].scnprintf)
1266 			continue;
1267 
1268 		if (strcmp(field->type, "const char *") == 0 &&
1269 			 (strcmp(field->name, "filename") == 0 ||
1270 			  strcmp(field->name, "path") == 0 ||
1271 			  strcmp(field->name, "pathname") == 0))
1272 			sc->arg_fmt[idx].scnprintf = SCA_FILENAME;
1273 		else if (field->flags & FIELD_IS_POINTER)
1274 			sc->arg_fmt[idx].scnprintf = syscall_arg__scnprintf_hex;
1275 		else if (strcmp(field->type, "pid_t") == 0)
1276 			sc->arg_fmt[idx].scnprintf = SCA_PID;
1277 		else if (strcmp(field->type, "umode_t") == 0)
1278 			sc->arg_fmt[idx].scnprintf = SCA_MODE_T;
1279 		else if ((strcmp(field->type, "int") == 0 ||
1280 			  strcmp(field->type, "unsigned int") == 0 ||
1281 			  strcmp(field->type, "long") == 0) &&
1282 			 (len = strlen(field->name)) >= 2 &&
1283 			 strcmp(field->name + len - 2, "fd") == 0) {
1284 			/*
1285 			 * /sys/kernel/tracing/events/syscalls/sys_enter*
1286 			 * egrep 'field:.*fd;' .../format|sed -r 's/.*field:([a-z ]+) [a-z_]*fd.+/\1/g'|sort|uniq -c
1287 			 * 65 int
1288 			 * 23 unsigned int
1289 			 * 7 unsigned long
1290 			 */
1291 			sc->arg_fmt[idx].scnprintf = SCA_FD;
1292 		}
1293 	}
1294 
1295 	return 0;
1296 }
1297 
1298 static int trace__read_syscall_info(struct trace *trace, int id)
1299 {
1300 	char tp_name[128];
1301 	struct syscall *sc;
1302 	const char *name = syscalltbl__name(trace->sctbl, id);
1303 
1304 	if (name == NULL)
1305 		return -1;
1306 
1307 	if (id > trace->syscalls.max) {
1308 		struct syscall *nsyscalls = realloc(trace->syscalls.table, (id + 1) * sizeof(*sc));
1309 
1310 		if (nsyscalls == NULL)
1311 			return -1;
1312 
1313 		if (trace->syscalls.max != -1) {
1314 			memset(nsyscalls + trace->syscalls.max + 1, 0,
1315 			       (id - trace->syscalls.max) * sizeof(*sc));
1316 		} else {
1317 			memset(nsyscalls, 0, (id + 1) * sizeof(*sc));
1318 		}
1319 
1320 		trace->syscalls.table = nsyscalls;
1321 		trace->syscalls.max   = id;
1322 	}
1323 
1324 	sc = trace->syscalls.table + id;
1325 	sc->name = name;
1326 
1327 	sc->fmt  = syscall_fmt__find(sc->name);
1328 
1329 	snprintf(tp_name, sizeof(tp_name), "sys_enter_%s", sc->name);
1330 	sc->tp_format = trace_event__tp_format("syscalls", tp_name);
1331 
1332 	if (IS_ERR(sc->tp_format) && sc->fmt && sc->fmt->alias) {
1333 		snprintf(tp_name, sizeof(tp_name), "sys_enter_%s", sc->fmt->alias);
1334 		sc->tp_format = trace_event__tp_format("syscalls", tp_name);
1335 	}
1336 
1337 	if (syscall__alloc_arg_fmts(sc, IS_ERR(sc->tp_format) ? 6 : sc->tp_format->format.nr_fields))
1338 		return -1;
1339 
1340 	if (IS_ERR(sc->tp_format))
1341 		return -1;
1342 
1343 	sc->args = sc->tp_format->format.fields;
1344 	/*
1345 	 * We need to check and discard the first field, '__syscall_nr' or
1346 	 * 'nr' ('nr' is the name used on older kernels), which holds the
1347 	 * syscall number and is not needed here.
1348 	 */
1349 	if (sc->args && (!strcmp(sc->args->name, "__syscall_nr") || !strcmp(sc->args->name, "nr"))) {
1350 		sc->args = sc->args->next;
1351 		--sc->nr_args;
1352 	}
1353 
1354 	sc->is_exit = !strcmp(name, "exit_group") || !strcmp(name, "exit");
1355 	sc->is_open = !strcmp(name, "open") || !strcmp(name, "openat");
1356 
1357 	return syscall__set_arg_fmts(sc);
1358 }
1359 
1360 static int trace__validate_ev_qualifier(struct trace *trace)
1361 {
1362 	int err = 0, i;
1363 	size_t nr_allocated;
1364 	struct str_node *pos;
1365 
1366 	trace->ev_qualifier_ids.nr = strlist__nr_entries(trace->ev_qualifier);
1367 	trace->ev_qualifier_ids.entries = malloc(trace->ev_qualifier_ids.nr *
1368 						 sizeof(trace->ev_qualifier_ids.entries[0]));
1369 
1370 	if (trace->ev_qualifier_ids.entries == NULL) {
1371 		fputs("Error:\tNot enough memory for allocating events qualifier ids\n",
1372 		       trace->output);
1373 		err = -EINVAL;
1374 		goto out;
1375 	}
1376 
1377 	nr_allocated = trace->ev_qualifier_ids.nr;
1378 	i = 0;
1379 
1380 	strlist__for_each_entry(pos, trace->ev_qualifier) {
1381 		const char *sc = pos->s;
1382 		int id = syscalltbl__id(trace->sctbl, sc), match_next = -1;
1383 
1384 		if (id < 0) {
1385 			id = syscalltbl__strglobmatch_first(trace->sctbl, sc, &match_next);
1386 			if (id >= 0)
1387 				goto matches;
1388 
1389 			if (err == 0) {
1390 				fputs("Error:\tInvalid syscall ", trace->output);
1391 				err = -EINVAL;
1392 			} else {
1393 				fputs(", ", trace->output);
1394 			}
1395 
1396 			fputs(sc, trace->output);
1397 		}
1398 matches:
1399 		trace->ev_qualifier_ids.entries[i++] = id;
1400 		if (match_next == -1)
1401 			continue;
1402 
1403 		while (1) {
1404 			id = syscalltbl__strglobmatch_next(trace->sctbl, sc, &match_next);
1405 			if (id < 0)
1406 				break;
1407 			if (nr_allocated == trace->ev_qualifier_ids.nr) {
1408 				void *entries;
1409 
1410 				nr_allocated += 8;
1411 				entries = realloc(trace->ev_qualifier_ids.entries,
1412 						  nr_allocated * sizeof(trace->ev_qualifier_ids.entries[0]));
1413 				if (entries == NULL) {
1414 					err = -ENOMEM;
1415 					fputs("\nError:\tNot enough memory for parsing\n", trace->output);
1416 					goto out_free;
1417 				}
1418 				trace->ev_qualifier_ids.entries = entries;
1419 			}
1420 			trace->ev_qualifier_ids.nr++;
1421 			trace->ev_qualifier_ids.entries[i++] = id;
1422 		}
1423 	}
1424 
1425 	if (err < 0) {
1426 		fputs("\nHint:\ttry 'perf list syscalls:sys_enter_*'"
1427 		      "\nHint:\tand: 'man syscalls'\n", trace->output);
1428 out_free:
1429 		zfree(&trace->ev_qualifier_ids.entries);
1430 		trace->ev_qualifier_ids.nr = 0;
1431 	}
1432 out:
1433 	return err;
1434 }
1435 
1436 /*
1437  * args is to be interpreted as a series of longs but we need to handle
1438  * 8-byte unaligned accesses. args points to raw_data within the event
1439  * and raw_data is guaranteed to be 8-byte unaligned because it is
1440  * preceded by raw_size, which is a u32. So we need to copy args to a temp
1441  * variable to read it. Most notably this avoids extended load instructions
1442  * on unaligned addresses.
1443  */
1444 unsigned long syscall_arg__val(struct syscall_arg *arg, u8 idx)
1445 {
1446 	unsigned long val;
1447 	unsigned char *p = arg->args + sizeof(unsigned long) * idx;
1448 
1449 	memcpy(&val, p, sizeof(val));
1450 	return val;
1451 }
1452 
1453 static size_t syscall__scnprintf_name(struct syscall *sc, char *bf, size_t size,
1454 				      struct syscall_arg *arg)
1455 {
1456 	if (sc->arg_fmt && sc->arg_fmt[arg->idx].name)
1457 		return scnprintf(bf, size, "%s: ", sc->arg_fmt[arg->idx].name);
1458 
1459 	return scnprintf(bf, size, "arg%d: ", arg->idx);
1460 }
1461 
1462 static size_t syscall__scnprintf_val(struct syscall *sc, char *bf, size_t size,
1463 				     struct syscall_arg *arg, unsigned long val)
1464 {
1465 	if (sc->arg_fmt && sc->arg_fmt[arg->idx].scnprintf) {
1466 		arg->val = val;
1467 		if (sc->arg_fmt[arg->idx].parm)
1468 			arg->parm = sc->arg_fmt[arg->idx].parm;
1469 		return sc->arg_fmt[arg->idx].scnprintf(bf, size, arg);
1470 	}
1471 	return scnprintf(bf, size, "%ld", val);
1472 }
1473 
1474 static size_t syscall__scnprintf_args(struct syscall *sc, char *bf, size_t size,
1475 				      unsigned char *args, struct trace *trace,
1476 				      struct thread *thread)
1477 {
1478 	size_t printed = 0;
1479 	unsigned long val;
1480 	u8 bit = 1;
1481 	struct syscall_arg arg = {
1482 		.args	= args,
1483 		.idx	= 0,
1484 		.mask	= 0,
1485 		.trace  = trace,
1486 		.thread = thread,
1487 	};
1488 	struct thread_trace *ttrace = thread__priv(thread);
1489 
1490 	/*
1491 	 * Things like fcntl will set this in its 'cmd' formatter to pick the
1492 	 * right formatter for the return value (an fd? file flags?), which is
1493 	 * not needed for syscalls that always return a given type, say an fd.
1494 	 */
1495 	ttrace->ret_scnprintf = NULL;
1496 
1497 	if (sc->args != NULL) {
1498 		struct format_field *field;
1499 
1500 		for (field = sc->args; field;
1501 		     field = field->next, ++arg.idx, bit <<= 1) {
1502 			if (arg.mask & bit)
1503 				continue;
1504 
1505 			val = syscall_arg__val(&arg, arg.idx);
1506 
1507 			/*
1508 			 * Suppress this argument if its value is zero and
1509 			 * we don't have a string associated in a strarray
1510 			 * for it.
1511 			 */
1512 			if (val == 0 &&
1513 			    !(sc->arg_fmt &&
1514 			      (sc->arg_fmt[arg.idx].show_zero ||
1515 			       sc->arg_fmt[arg.idx].scnprintf == SCA_STRARRAY ||
1516 			       sc->arg_fmt[arg.idx].scnprintf == SCA_STRARRAYS) &&
1517 			      sc->arg_fmt[arg.idx].parm))
1518 				continue;
1519 
1520 			printed += scnprintf(bf + printed, size - printed,
1521 					     "%s%s: ", printed ? ", " : "", field->name);
1522 			printed += syscall__scnprintf_val(sc, bf + printed, size - printed, &arg, val);
1523 		}
1524 	} else if (IS_ERR(sc->tp_format)) {
1525 		/*
1526 		 * If we managed to read the tracepoint /format file, then we
1527 		 * may end up not having any args, like with gettid(), so only
1528 		 * print the raw args when we didn't manage to read it.
1529 		 */
1530 		while (arg.idx < sc->nr_args) {
1531 			if (arg.mask & bit)
1532 				goto next_arg;
1533 			val = syscall_arg__val(&arg, arg.idx);
1534 			if (printed)
1535 				printed += scnprintf(bf + printed, size - printed, ", ");
1536 			printed += syscall__scnprintf_name(sc, bf + printed, size - printed, &arg);
1537 			printed += syscall__scnprintf_val(sc, bf + printed, size - printed, &arg, val);
1538 next_arg:
1539 			++arg.idx;
1540 			bit <<= 1;
1541 		}
1542 	}
1543 
1544 	return printed;
1545 }
1546 
1547 typedef int (*tracepoint_handler)(struct trace *trace, struct perf_evsel *evsel,
1548 				  union perf_event *event,
1549 				  struct perf_sample *sample);
1550 
1551 static struct syscall *trace__syscall_info(struct trace *trace,
1552 					   struct perf_evsel *evsel, int id)
1553 {
1555 	if (id < 0) {
1557 		/*
1558 		 * XXX: Noticed on x86_64, reproduced as far back as 3.0.36, haven't tried
1559 		 * before that, leaving at a higher verbosity level till that is
1560 		 * explained. Reproduced with plain ftrace with:
1561 		 *
1562 		 * echo 1 > /t/events/raw_syscalls/sys_exit/enable
1563 		 * grep "NR -1 " /t/trace_pipe
1564 		 *
1565 		 * After generating some load on the machine.
1566  		 */
1567 		if (verbose > 1) {
1568 			static u64 n;
1569 			fprintf(trace->output, "Invalid syscall %d id, skipping (%s, %" PRIu64 ") ...\n",
1570 				id, perf_evsel__name(evsel), ++n);
1571 		}
1572 		return NULL;
1573 	}
1574 
1575 	if ((id > trace->syscalls.max || trace->syscalls.table[id].name == NULL) &&
1576 	    trace__read_syscall_info(trace, id))
1577 		goto out_cant_read;
1578 
1579 	if (id > trace->syscalls.max || trace->syscalls.table[id].name == NULL)
1580 		goto out_cant_read;
1581 
1582 	return &trace->syscalls.table[id];
1583 
1584 out_cant_read:
1585 	if (verbose > 0) {
1586 		fprintf(trace->output, "Problems reading syscall %d", id);
1587 		if (id <= trace->syscalls.max && trace->syscalls.table[id].name != NULL)
1588 			fprintf(trace->output, " (%s)", trace->syscalls.table[id].name);
1589 		fputs(" information\n", trace->output);
1590 	}
1591 	return NULL;
1592 }
1593 
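/* Fold this syscall's duration into per-thread, per-syscall-id running stats (used by --summary). */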
1594 static void thread__update_stats(struct thread_trace *ttrace,
1595 				 int id, struct perf_sample *sample)
1596 {
1597 	struct int_node *inode;
1598 	struct stats *stats;
1599 	u64 duration = 0;
1600 
1601 	inode = intlist__findnew(ttrace->syscall_stats, id);
1602 	if (inode == NULL)
1603 		return;
1604 
1605 	stats = inode->priv;
1606 	if (stats == NULL) {
1607 		stats = malloc(sizeof(struct stats));
1608 		if (stats == NULL)
1609 			return;
1610 		init_stats(stats);
1611 		inode->priv = stats;
1612 	}
1613 
1614 	if (ttrace->entry_time && sample->time > ttrace->entry_time)
1615 		duration = sample->time - ttrace->entry_time;
1616 
1617 	update_stats(stats, duration);
1618 }
1619 
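/* When another event interleaves, flush the current thread's still-pending sys_enter line as "name(args) ...". */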
1620 static int trace__printf_interrupted_entry(struct trace *trace)
1621 {
1622 	struct thread_trace *ttrace;
1623 	size_t printed;
1624 
1625 	if (trace->failure_only || trace->current == NULL)
1626 		return 0;
1627 
1628 	ttrace = thread__priv(trace->current);
1629 
1630 	if (!ttrace->entry_pending)
1631 		return 0;
1632 
1633 	printed  = trace__fprintf_entry_head(trace, trace->current, 0, false, ttrace->entry_time, trace->output);
1634 	printed += fprintf(trace->output, "%-70s) ...\n", ttrace->entry_str);
1635 	ttrace->entry_pending = false;
1636 
1637 	return printed;
1638 }
1639 
1640 static int trace__fprintf_sample(struct trace *trace, struct perf_evsel *evsel,
1641 				 struct perf_sample *sample, struct thread *thread)
1642 {
1643 	int printed = 0;
1644 
1645 	if (trace->print_sample) {
1646 		double ts = (double)sample->time / NSEC_PER_MSEC;
1647 
1648 		printed += fprintf(trace->output, "%22s %10.3f %s %d/%d [%d]\n",
1649 				   perf_evsel__name(evsel), ts,
1650 				   thread__comm_str(thread),
1651 				   sample->pid, sample->tid, sample->cpu);
1652 	}
1653 
1654 	return printed;
1655 }
1656 
1657 static int trace__sys_enter(struct trace *trace, struct perf_evsel *evsel,
1658 			    union perf_event *event __maybe_unused,
1659 			    struct perf_sample *sample)
1660 {
1661 	char *msg;
1662 	void *args;
1663 	size_t printed = 0;
1664 	struct thread *thread;
1665 	int id = perf_evsel__sc_tp_uint(evsel, id, sample), err = -1;
1666 	struct syscall *sc = trace__syscall_info(trace, evsel, id);
1667 	struct thread_trace *ttrace;
1668 
1669 	if (sc == NULL)
1670 		return -1;
1671 
1672 	thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
1673 	ttrace = thread__trace(thread, trace->output);
1674 	if (ttrace == NULL)
1675 		goto out_put;
1676 
1677 	trace__fprintf_sample(trace, evsel, sample, thread);
1678 
1679 	args = perf_evsel__sc_tp_ptr(evsel, args, sample);
1680 
1681 	if (ttrace->entry_str == NULL) {
1682 		ttrace->entry_str = malloc(trace__entry_str_size);
1683 		if (!ttrace->entry_str)
1684 			goto out_put;
1685 	}
1686 
1687 	if (!(trace->duration_filter || trace->summary_only || trace->min_stack))
1688 		trace__printf_interrupted_entry(trace);
1689 
1690 	ttrace->entry_time = sample->time;
1691 	msg = ttrace->entry_str;
1692 	printed += scnprintf(msg + printed, trace__entry_str_size - printed, "%s(", sc->name);
1693 
1694 	printed += syscall__scnprintf_args(sc, msg + printed, trace__entry_str_size - printed,
1695 					   args, trace, thread);
1696 
1697 	if (sc->is_exit) {
1698 		if (!(trace->duration_filter || trace->summary_only || trace->failure_only || trace->min_stack)) {
1699 			trace__fprintf_entry_head(trace, thread, 0, false, ttrace->entry_time, trace->output);
1700 			fprintf(trace->output, "%-70s)\n", ttrace->entry_str);
1701 		}
1702 	} else {
1703 		ttrace->entry_pending = true;
1704 		/* See trace__vfs_getname & trace__sys_exit */
1705 		ttrace->filename.pending_open = false;
1706 	}
1707 
1708 	if (trace->current != thread) {
1709 		thread__put(trace->current);
1710 		trace->current = thread__get(thread);
1711 	}
1712 	err = 0;
1713 out_put:
1714 	thread__put(thread);
1715 	return err;
1716 }
1717 
1718 static int trace__fprintf_sys_enter(struct trace *trace, struct perf_evsel *evsel,
1719 				    struct perf_sample *sample)
1720 {
1721 	struct thread_trace *ttrace;
1722 	struct thread *thread;
1723 	int id = perf_evsel__sc_tp_uint(evsel, id, sample), err = -1;
1724 	struct syscall *sc = trace__syscall_info(trace, evsel, id);
1725 	char msg[1024];
1726 	void *args;
1727 
1728 	if (sc == NULL)
1729 		return -1;
1730 
1731 	thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
1732 	ttrace = thread__trace(thread, trace->output);
1733 	/*
	 * We need to get ttrace just to make sure it is there when syscall__scnprintf_args()
	 * and the rest of the beautifiers access it via struct syscall_arg.
1736 	 */
1737 	if (ttrace == NULL)
1738 		goto out_put;
1739 
1740 	args = perf_evsel__sc_tp_ptr(evsel, args, sample);
1741 	syscall__scnprintf_args(sc, msg, sizeof(msg), args, trace, thread);
1742 	fprintf(trace->output, "%s", msg);
1743 	err = 0;
1744 out_put:
1745 	thread__put(thread);
1746 	return err;
1747 }
1748 
1749 static int trace__resolve_callchain(struct trace *trace, struct perf_evsel *evsel,
1750 				    struct perf_sample *sample,
1751 				    struct callchain_cursor *cursor)
1752 {
1753 	struct addr_location al;
1754 	int max_stack = evsel->attr.sample_max_stack ?
1755 			evsel->attr.sample_max_stack :
1756 			trace->max_stack;
1757 
1758 	if (machine__resolve(trace->host, &al, sample) < 0 ||
1759 	    thread__resolve_callchain(al.thread, cursor, evsel, sample, NULL, NULL, max_stack))
1760 		return -1;
1761 
1762 	return 0;
1763 }
1764 
1765 static int trace__fprintf_callchain(struct trace *trace, struct perf_sample *sample)
1766 {
1767 	/* TODO: user-configurable print_opts */
1768 	const unsigned int print_opts = EVSEL__PRINT_SYM |
1769 				        EVSEL__PRINT_DSO |
1770 				        EVSEL__PRINT_UNKNOWN_AS_ADDR;
1771 
1772 	return sample__fprintf_callchain(sample, 38, print_opts, &callchain_cursor, trace->output);
1773 }
1774 
1775 static const char *errno_to_name(struct perf_evsel *evsel, int err)
1776 {
1777 	struct perf_env *env = perf_evsel__env(evsel);
1778 	const char *arch_name = perf_env__arch(env);
1779 
1780 	return arch_syscalls__strerrno(arch_name, err);
1781 }
1782 
1783 static int trace__sys_exit(struct trace *trace, struct perf_evsel *evsel,
1784 			   union perf_event *event __maybe_unused,
1785 			   struct perf_sample *sample)
1786 {
1787 	long ret;
1788 	u64 duration = 0;
1789 	bool duration_calculated = false;
1790 	struct thread *thread;
1791 	int id = perf_evsel__sc_tp_uint(evsel, id, sample), err = -1, callchain_ret = 0;
1792 	struct syscall *sc = trace__syscall_info(trace, evsel, id);
1793 	struct thread_trace *ttrace;
1794 
1795 	if (sc == NULL)
1796 		return -1;
1797 
1798 	thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
1799 	ttrace = thread__trace(thread, trace->output);
1800 	if (ttrace == NULL)
1801 		goto out_put;
1802 
1803 	trace__fprintf_sample(trace, evsel, sample, thread);
1804 
1805 	if (trace->summary)
1806 		thread__update_stats(ttrace, id, sample);
1807 
1808 	ret = perf_evsel__sc_tp_uint(evsel, ret, sample);
1809 
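	/*
	 * If an open-like syscall succeeded and a probe:vfs_getname event left
	 * a pathname pending, associate that name with the returned fd so that
	 * later syscalls on this fd can be beautified.
	 */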
1810 	if (sc->is_open && ret >= 0 && ttrace->filename.pending_open) {
1811 		trace__set_fd_pathname(thread, ret, ttrace->filename.name);
1812 		ttrace->filename.pending_open = false;
1813 		++trace->stats.vfs_getname;
1814 	}
1815 
1816 	if (ttrace->entry_time) {
1817 		duration = sample->time - ttrace->entry_time;
1818 		if (trace__filter_duration(trace, duration))
1819 			goto out;
1820 		duration_calculated = true;
1821 	} else if (trace->duration_filter)
1822 		goto out;
1823 
1824 	if (sample->callchain) {
1825 		callchain_ret = trace__resolve_callchain(trace, evsel, sample, &callchain_cursor);
1826 		if (callchain_ret == 0) {
1827 			if (callchain_cursor.nr < trace->min_stack)
1828 				goto out;
1829 			callchain_ret = 1;
1830 		}
1831 	}
1832 
1833 	if (trace->summary_only || (ret >= 0 && trace->failure_only))
1834 		goto out;
1835 
1836 	trace__fprintf_entry_head(trace, thread, duration, duration_calculated, ttrace->entry_time, trace->output);
1837 
1838 	if (ttrace->entry_pending) {
1839 		fprintf(trace->output, "%-70s", ttrace->entry_str);
1840 	} else {
1841 		fprintf(trace->output, " ... [");
1842 		color_fprintf(trace->output, PERF_COLOR_YELLOW, "continued");
1843 		fprintf(trace->output, "]: %s()", sc->name);
1844 	}
1845 
1846 	if (sc->fmt == NULL) {
1847 		if (ret < 0)
1848 			goto errno_print;
1849 signed_print:
1850 		fprintf(trace->output, ") = %ld", ret);
1851 	} else if (ret < 0) {
1852 errno_print: {
1853 		char bf[STRERR_BUFSIZE];
1854 		const char *emsg = str_error_r(-ret, bf, sizeof(bf)),
1855 			   *e = errno_to_name(evsel, -ret);
1856 
1857 		fprintf(trace->output, ") = -1 %s %s", e, emsg);
1858 	}
1859 	} else if (ret == 0 && sc->fmt->timeout)
1860 		fprintf(trace->output, ") = 0 Timeout");
1861 	else if (ttrace->ret_scnprintf) {
1862 		char bf[1024];
1863 		struct syscall_arg arg = {
1864 			.val	= ret,
1865 			.thread	= thread,
1866 			.trace	= trace,
1867 		};
1868 		ttrace->ret_scnprintf(bf, sizeof(bf), &arg);
1869 		ttrace->ret_scnprintf = NULL;
1870 		fprintf(trace->output, ") = %s", bf);
1871 	} else if (sc->fmt->hexret)
1872 		fprintf(trace->output, ") = %#lx", ret);
1873 	else if (sc->fmt->errpid) {
1874 		struct thread *child = machine__find_thread(trace->host, ret, ret);
1875 
1876 		if (child != NULL) {
1877 			fprintf(trace->output, ") = %ld", ret);
1878 			if (child->comm_set)
1879 				fprintf(trace->output, " (%s)", thread__comm_str(child));
1880 			thread__put(child);
1881 		}
1882 	} else
1883 		goto signed_print;
1884 
1885 	fputc('\n', trace->output);
1886 
1887 	if (callchain_ret > 0)
1888 		trace__fprintf_callchain(trace, sample);
1889 	else if (callchain_ret < 0)
1890 		pr_err("Problem processing %s callchain, skipping...\n", perf_evsel__name(evsel));
1891 out:
1892 	ttrace->entry_pending = false;
1893 	err = 0;
1894 out_put:
1895 	thread__put(thread);
1896 	return err;
1897 }
1898 
1899 static int trace__vfs_getname(struct trace *trace, struct perf_evsel *evsel,
1900 			      union perf_event *event __maybe_unused,
1901 			      struct perf_sample *sample)
1902 {
1903 	struct thread *thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
1904 	struct thread_trace *ttrace;
1905 	size_t filename_len, entry_str_len, to_move;
1906 	ssize_t remaining_space;
1907 	char *pos;
1908 	const char *filename = perf_evsel__rawptr(evsel, sample, "pathname");
1909 
1910 	if (!thread)
1911 		goto out;
1912 
1913 	ttrace = thread__priv(thread);
1914 	if (!ttrace)
1915 		goto out_put;
1916 
1917 	filename_len = strlen(filename);
1918 	if (filename_len == 0)
1919 		goto out_put;
1920 
1921 	if (ttrace->filename.namelen < filename_len) {
1922 		char *f = realloc(ttrace->filename.name, filename_len + 1);
1923 
1924 		if (f == NULL)
1925 			goto out_put;
1926 
1927 		ttrace->filename.namelen = filename_len;
1928 		ttrace->filename.name = f;
1929 	}
1930 
1931 	strcpy(ttrace->filename.name, filename);
1932 	ttrace->filename.pending_open = true;
1933 
1934 	if (!ttrace->filename.ptr)
1935 		goto out_put;
1936 
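	/*
	 * The entry string was formatted with a placeholder at entry_str_pos
	 * for a pathname we couldn't resolve at sys_enter time: splice in the
	 * name this vfs_getname resolved, shifting the tail of the string and,
	 * when the name doesn't fit, keeping only its trailing bytes.
	 */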
1937 	entry_str_len = strlen(ttrace->entry_str);
1938 	remaining_space = trace__entry_str_size - entry_str_len - 1; /* \0 */
1939 	if (remaining_space <= 0)
1940 		goto out_put;
1941 
1942 	if (filename_len > (size_t)remaining_space) {
1943 		filename += filename_len - remaining_space;
1944 		filename_len = remaining_space;
1945 	}
1946 
1947 	to_move = entry_str_len - ttrace->filename.entry_str_pos + 1; /* \0 */
1948 	pos = ttrace->entry_str + ttrace->filename.entry_str_pos;
1949 	memmove(pos + filename_len, pos, to_move);
1950 	memcpy(pos, filename, filename_len);
1951 
1952 	ttrace->filename.ptr = 0;
1953 	ttrace->filename.entry_str_pos = 0;
1954 out_put:
1955 	thread__put(thread);
1956 out:
1957 	return 0;
1958 }
1959 
1960 static int trace__sched_stat_runtime(struct trace *trace, struct perf_evsel *evsel,
1961 				     union perf_event *event __maybe_unused,
1962 				     struct perf_sample *sample)
1963 {
	u64 runtime = perf_evsel__intval(evsel, sample, "runtime");
1965 	double runtime_ms = (double)runtime / NSEC_PER_MSEC;
1966 	struct thread *thread = machine__findnew_thread(trace->host,
1967 							sample->pid,
1968 							sample->tid);
1969 	struct thread_trace *ttrace = thread__trace(thread, trace->output);
1970 
1971 	if (ttrace == NULL)
1972 		goto out_dump;
1973 
1974 	ttrace->runtime_ms += runtime_ms;
1975 	trace->runtime_ms += runtime_ms;
1976 out_put:
1977 	thread__put(thread);
1978 	return 0;
1979 
1980 out_dump:
	fprintf(trace->output, "%s: comm=%s,pid=%u,runtime=%" PRIu64 ",vruntime=%" PRIu64 "\n",
1982 	       evsel->name,
1983 	       perf_evsel__strval(evsel, sample, "comm"),
1984 	       (pid_t)perf_evsel__intval(evsel, sample, "pid"),
1985 	       runtime,
1986 	       perf_evsel__intval(evsel, sample, "vruntime"));
1987 	goto out_put;
1988 }
1989 
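/*
 * Print only the character data of a BPF output sample, one '.' per
 * unprintable byte, suppressing all the hex dump decoration that
 * binary__fprintf() would otherwise emit.
 */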
1990 static int bpf_output__printer(enum binary_printer_ops op,
1991 			       unsigned int val, void *extra __maybe_unused, FILE *fp)
1992 {
1993 	unsigned char ch = (unsigned char)val;
1994 
1995 	switch (op) {
1996 	case BINARY_PRINT_CHAR_DATA:
1997 		return fprintf(fp, "%c", isprint(ch) ? ch : '.');
1998 	case BINARY_PRINT_DATA_BEGIN:
1999 	case BINARY_PRINT_LINE_BEGIN:
2000 	case BINARY_PRINT_ADDR:
2001 	case BINARY_PRINT_NUM_DATA:
2002 	case BINARY_PRINT_NUM_PAD:
2003 	case BINARY_PRINT_SEP:
2004 	case BINARY_PRINT_CHAR_PAD:
2005 	case BINARY_PRINT_LINE_END:
2006 	case BINARY_PRINT_DATA_END:
2007 	default:
2008 		break;
2009 	}
2010 
2011 	return 0;
2012 }
2013 
2014 static void bpf_output__fprintf(struct trace *trace,
2015 				struct perf_sample *sample)
2016 {
2017 	binary__fprintf(sample->raw_data, sample->raw_size, 8,
2018 			bpf_output__printer, NULL, trace->output);
2019 }
2020 
2021 static int trace__event_handler(struct trace *trace, struct perf_evsel *evsel,
2022 				union perf_event *event __maybe_unused,
2023 				struct perf_sample *sample)
2024 {
2025 	int callchain_ret = 0;
2026 
2027 	if (sample->callchain) {
2028 		callchain_ret = trace__resolve_callchain(trace, evsel, sample, &callchain_cursor);
2029 		if (callchain_ret == 0) {
2030 			if (callchain_cursor.nr < trace->min_stack)
2031 				goto out;
2032 			callchain_ret = 1;
2033 		}
2034 	}
2035 
2036 	trace__printf_interrupted_entry(trace);
2037 	trace__fprintf_tstamp(trace, sample->time, trace->output);
2038 
2039 	if (trace->trace_syscalls)
2040 		fprintf(trace->output, "(         ): ");
2041 
2042 	fprintf(trace->output, "%s:", evsel->name);
2043 
2044 	if (perf_evsel__is_bpf_output(evsel)) {
2045 		if (evsel == trace->syscalls.events.augmented)
2046 			trace__fprintf_sys_enter(trace, evsel, sample);
2047 		else
2048 			bpf_output__fprintf(trace, sample);
2049 	} else if (evsel->tp_format) {
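		/*
		 * Pretty-print syscalls:sys_enter_* tracepoints strace-like;
		 * for anything else, or if that fails, fall back to the
		 * generic tracepoint field formatter.
		 */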
2050 		if (strncmp(evsel->tp_format->name, "sys_enter_", 10) ||
2051 		    trace__fprintf_sys_enter(trace, evsel, sample)) {
2052 			event_format__fprintf(evsel->tp_format, sample->cpu,
2053 					      sample->raw_data, sample->raw_size,
2054 					      trace->output);
2055 		}
2056 	}
2057 
2058 	fprintf(trace->output, "\n");
2059 
2060 	if (callchain_ret > 0)
2061 		trace__fprintf_callchain(trace, sample);
2062 	else if (callchain_ret < 0)
2063 		pr_err("Problem processing %s callchain, skipping...\n", perf_evsel__name(evsel));
2064 out:
2065 	return 0;
2066 }
2067 
2068 static void print_location(FILE *f, struct perf_sample *sample,
2069 			   struct addr_location *al,
2070 			   bool print_dso, bool print_sym)
2071 {
2072 
2073 	if ((verbose > 0 || print_dso) && al->map)
2074 		fprintf(f, "%s@", al->map->dso->long_name);
2075 
2076 	if ((verbose > 0 || print_sym) && al->sym)
2077 		fprintf(f, "%s+0x%" PRIx64, al->sym->name,
2078 			al->addr - al->sym->start);
2079 	else if (al->map)
2080 		fprintf(f, "0x%" PRIx64, al->addr);
2081 	else
2082 		fprintf(f, "0x%" PRIx64, sample->addr);
2083 }
2084 
2085 static int trace__pgfault(struct trace *trace,
2086 			  struct perf_evsel *evsel,
2087 			  union perf_event *event __maybe_unused,
2088 			  struct perf_sample *sample)
2089 {
2090 	struct thread *thread;
2091 	struct addr_location al;
2092 	char map_type = 'd';
2093 	struct thread_trace *ttrace;
2094 	int err = -1;
2095 	int callchain_ret = 0;
2096 
2097 	thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
2098 
2099 	if (sample->callchain) {
2100 		callchain_ret = trace__resolve_callchain(trace, evsel, sample, &callchain_cursor);
2101 		if (callchain_ret == 0) {
2102 			if (callchain_cursor.nr < trace->min_stack)
2103 				goto out_put;
2104 			callchain_ret = 1;
2105 		}
2106 	}
2107 
2108 	ttrace = thread__trace(thread, trace->output);
2109 	if (ttrace == NULL)
2110 		goto out_put;
2111 
2112 	if (evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ)
2113 		ttrace->pfmaj++;
2114 	else
2115 		ttrace->pfmin++;
2116 
2117 	if (trace->summary_only)
2118 		goto out;
2119 
2120 	thread__find_symbol(thread, sample->cpumode, sample->ip, &al);
2121 
2122 	trace__fprintf_entry_head(trace, thread, 0, true, sample->time, trace->output);
2123 
2124 	fprintf(trace->output, "%sfault [",
2125 		evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ ?
2126 		"maj" : "min");
2127 
2128 	print_location(trace->output, sample, &al, false, true);
2129 
2130 	fprintf(trace->output, "] => ");
2131 
2132 	thread__find_symbol(thread, sample->cpumode, sample->addr, &al);
2133 
	/*
	 * With unified function/variable maps there is no second map type to
	 * fall back to, so an address that didn't resolve to a map stays
	 * unresolved.
	 */
	if (!al.map)
		map_type = '?';
2142 
2143 	print_location(trace->output, sample, &al, true, false);
2144 
2145 	fprintf(trace->output, " (%c%c)\n", map_type, al.level);
2146 
2147 	if (callchain_ret > 0)
2148 		trace__fprintf_callchain(trace, sample);
2149 	else if (callchain_ret < 0)
2150 		pr_err("Problem processing %s callchain, skipping...\n", perf_evsel__name(evsel));
2151 out:
2152 	err = 0;
2153 out_put:
2154 	thread__put(thread);
2155 	return err;
2156 }
2157 
2158 static void trace__set_base_time(struct trace *trace,
2159 				 struct perf_evsel *evsel,
2160 				 struct perf_sample *sample)
2161 {
2162 	/*
	 * BPF events were not setting PERF_SAMPLE_TIME, so be more robust and
	 * don't use sample->time unconditionally: we may end up having some
	 * other event in the future without PERF_SAMPLE_TIME for a good
	 * reason, i.e. we may not be interested in its timestamps, just in the
	 * fact that it took place, picking some piece of information when it
	 * appears in our event stream (vfs_getname comes to mind).
2169 	 */
2170 	if (trace->base_time == 0 && !trace->full_time &&
2171 	    (evsel->attr.sample_type & PERF_SAMPLE_TIME))
2172 		trace->base_time = sample->time;
2173 }
2174 
2175 static int trace__process_sample(struct perf_tool *tool,
2176 				 union perf_event *event,
2177 				 struct perf_sample *sample,
2178 				 struct perf_evsel *evsel,
2179 				 struct machine *machine __maybe_unused)
2180 {
2181 	struct trace *trace = container_of(tool, struct trace, tool);
2182 	struct thread *thread;
2183 	int err = 0;
2184 
2185 	tracepoint_handler handler = evsel->handler;
2186 
2187 	thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
2188 	if (thread && thread__is_filtered(thread))
2189 		goto out;
2190 
2191 	trace__set_base_time(trace, evsel, sample);
2192 
2193 	if (handler) {
2194 		++trace->nr_events;
2195 		handler(trace, evsel, event, sample);
2196 	}
2197 out:
2198 	thread__put(thread);
2199 	return err;
2200 }
2201 
2202 static int trace__record(struct trace *trace, int argc, const char **argv)
2203 {
2204 	unsigned int rec_argc, i, j;
2205 	const char **rec_argv;
2206 	const char * const record_args[] = {
2207 		"record",
2208 		"-R",
2209 		"-m", "1024",
2210 		"-c", "1",
2211 	};
2212 
2213 	const char * const sc_args[] = { "-e", };
2214 	unsigned int sc_args_nr = ARRAY_SIZE(sc_args);
2215 	const char * const majpf_args[] = { "-e", "major-faults" };
2216 	unsigned int majpf_args_nr = ARRAY_SIZE(majpf_args);
2217 	const char * const minpf_args[] = { "-e", "minor-faults" };
2218 	unsigned int minpf_args_nr = ARRAY_SIZE(minpf_args);
2219 
2220 	/* +1 is for the event string below */
2221 	rec_argc = ARRAY_SIZE(record_args) + sc_args_nr + 1 +
2222 		majpf_args_nr + minpf_args_nr + argc;
2223 	rec_argv = calloc(rec_argc + 1, sizeof(char *));
2224 
2225 	if (rec_argv == NULL)
2226 		return -ENOMEM;
2227 
2228 	j = 0;
2229 	for (i = 0; i < ARRAY_SIZE(record_args); i++)
2230 		rec_argv[j++] = record_args[i];
2231 
2232 	if (trace->trace_syscalls) {
2233 		for (i = 0; i < sc_args_nr; i++)
2234 			rec_argv[j++] = sc_args[i];
2235 
2236 		/* event string may be different for older kernels - e.g., RHEL6 */
2237 		if (is_valid_tracepoint("raw_syscalls:sys_enter"))
2238 			rec_argv[j++] = "raw_syscalls:sys_enter,raw_syscalls:sys_exit";
2239 		else if (is_valid_tracepoint("syscalls:sys_enter"))
2240 			rec_argv[j++] = "syscalls:sys_enter,syscalls:sys_exit";
2241 		else {
2242 			pr_err("Neither raw_syscalls nor syscalls events exist.\n");
2243 			free(rec_argv);
2244 			return -1;
2245 		}
2246 	}
2247 
2248 	if (trace->trace_pgfaults & TRACE_PFMAJ)
2249 		for (i = 0; i < majpf_args_nr; i++)
2250 			rec_argv[j++] = majpf_args[i];
2251 
2252 	if (trace->trace_pgfaults & TRACE_PFMIN)
2253 		for (i = 0; i < minpf_args_nr; i++)
2254 			rec_argv[j++] = minpf_args[i];
2255 
2256 	for (i = 0; i < (unsigned int)argc; i++)
2257 		rec_argv[j++] = argv[i];
2258 
2259 	return cmd_record(j, rec_argv);
2260 }
2261 
2262 static size_t trace__fprintf_thread_summary(struct trace *trace, FILE *fp);
2263 
2264 static bool perf_evlist__add_vfs_getname(struct perf_evlist *evlist)
2265 {
2266 	struct perf_evsel *evsel = perf_evsel__newtp("probe", "vfs_getname");
2267 
2268 	if (IS_ERR(evsel))
2269 		return false;
2270 
2271 	if (perf_evsel__field(evsel, "pathname") == NULL) {
2272 		perf_evsel__delete(evsel);
2273 		return false;
2274 	}
2275 
2276 	evsel->handler = trace__vfs_getname;
2277 	perf_evlist__add(evlist, evsel);
2278 	return true;
2279 }
2280 
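/*
 * Software page fault events: mmap_data is set so that the kernel also
 * reports non-executable (data) mmaps, letting the faulting address be
 * resolved to a map, and sample_period = 1 means every single fault is
 * recorded.
 */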
2281 static struct perf_evsel *perf_evsel__new_pgfault(u64 config)
2282 {
2283 	struct perf_evsel *evsel;
2284 	struct perf_event_attr attr = {
2285 		.type = PERF_TYPE_SOFTWARE,
2286 		.mmap_data = 1,
2287 	};
2288 
2289 	attr.config = config;
2290 	attr.sample_period = 1;
2291 
2292 	event_attr_init(&attr);
2293 
2294 	evsel = perf_evsel__new(&attr);
2295 	if (evsel)
2296 		evsel->handler = trace__pgfault;
2297 
2298 	return evsel;
2299 }
2300 
2301 static void trace__handle_event(struct trace *trace, union perf_event *event, struct perf_sample *sample)
2302 {
2303 	const u32 type = event->header.type;
2304 	struct perf_evsel *evsel;
2305 
2306 	if (type != PERF_RECORD_SAMPLE) {
2307 		trace__process_event(trace, trace->host, event, sample);
2308 		return;
2309 	}
2310 
2311 	evsel = perf_evlist__id2evsel(trace->evlist, sample->id);
2312 	if (evsel == NULL) {
2313 		fprintf(trace->output, "Unknown tp ID %" PRIu64 ", skipping...\n", sample->id);
2314 		return;
2315 	}
2316 
2317 	trace__set_base_time(trace, evsel, sample);
2318 
2319 	if (evsel->attr.type == PERF_TYPE_TRACEPOINT &&
2320 	    sample->raw_data == NULL) {
2321 		fprintf(trace->output, "%s sample with no payload for tid: %d, cpu %d, raw_size=%d, skipping...\n",
2322 		       perf_evsel__name(evsel), sample->tid,
2323 		       sample->cpu, sample->raw_size);
2324 	} else {
2325 		tracepoint_handler handler = evsel->handler;
2326 		handler(trace, evsel, event, sample);
2327 	}
2328 }
2329 
2330 static int trace__add_syscall_newtp(struct trace *trace)
2331 {
2332 	int ret = -1;
2333 	struct perf_evlist *evlist = trace->evlist;
2334 	struct perf_evsel *sys_enter, *sys_exit;
2335 
2336 	sys_enter = perf_evsel__raw_syscall_newtp("sys_enter", trace__sys_enter);
2337 	if (sys_enter == NULL)
2338 		goto out;
2339 
2340 	if (perf_evsel__init_sc_tp_ptr_field(sys_enter, args))
2341 		goto out_delete_sys_enter;
2342 
2343 	sys_exit = perf_evsel__raw_syscall_newtp("sys_exit", trace__sys_exit);
2344 	if (sys_exit == NULL)
2345 		goto out_delete_sys_enter;
2346 
2347 	if (perf_evsel__init_sc_tp_uint_field(sys_exit, ret))
2348 		goto out_delete_sys_exit;
2349 
2350 	perf_evsel__config_callchain(sys_enter, &trace->opts, &callchain_param);
2351 	perf_evsel__config_callchain(sys_exit, &trace->opts, &callchain_param);
2352 
2353 	perf_evlist__add(evlist, sys_enter);
2354 	perf_evlist__add(evlist, sys_exit);
2355 
2356 	if (callchain_param.enabled && !trace->kernel_syscallchains) {
2357 		/*
2358 		 * We're interested only in the user space callchain
2359 		 * leading to the syscall, allow overriding that for
		 * debugging reasons using --kernel-syscall-graph
2361 		 */
2362 		sys_exit->attr.exclude_callchain_kernel = 1;
2363 	}
2364 
2365 	trace->syscalls.events.sys_enter = sys_enter;
2366 	trace->syscalls.events.sys_exit  = sys_exit;
2367 
2368 	ret = 0;
2369 out:
2370 	return ret;
2371 
2372 out_delete_sys_exit:
2373 	perf_evsel__delete_priv(sys_exit);
2374 out_delete_sys_enter:
2375 	perf_evsel__delete_priv(sys_enter);
2376 	goto out;
2377 }
2378 
2379 static int trace__set_ev_qualifier_filter(struct trace *trace)
2380 {
2381 	int err = -1;
2382 	struct perf_evsel *sys_exit;
2383 	char *filter = asprintf_expr_inout_ints("id", !trace->not_ev_qualifier,
2384 						trace->ev_qualifier_ids.nr,
2385 						trace->ev_qualifier_ids.entries);
2386 
2387 	if (filter == NULL)
2388 		goto out_enomem;
2389 
2390 	if (!perf_evsel__append_tp_filter(trace->syscalls.events.sys_enter,
2391 					  filter)) {
2392 		sys_exit = trace->syscalls.events.sys_exit;
2393 		err = perf_evsel__append_tp_filter(sys_exit, filter);
2394 	}
2395 
2396 	free(filter);
2397 out:
2398 	return err;
2399 out_enomem:
2400 	errno = ENOMEM;
2401 	goto out;
2402 }
2403 
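/*
 * Always filter out at least our own pid and, when running over ssh, the
 * sshd ancestor handling the connection, presumably to avoid the feedback
 * loop where writing our own output over that connection generates yet more
 * events to trace.
 */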
2404 static int trace__set_filter_loop_pids(struct trace *trace)
2405 {
2406 	unsigned int nr = 1;
2407 	pid_t pids[32] = {
2408 		getpid(),
2409 	};
2410 	struct thread *thread = machine__find_thread(trace->host, pids[0], pids[0]);
2411 
2412 	while (thread && nr < ARRAY_SIZE(pids)) {
2413 		struct thread *parent = machine__find_thread(trace->host, thread->ppid, thread->ppid);
2414 
2415 		if (parent == NULL)
2416 			break;
2417 
2418 		if (!strcmp(thread__comm_str(parent), "sshd")) {
2419 			pids[nr++] = parent->tid;
2420 			break;
2421 		}
2422 		thread = parent;
2423 	}
2424 
2425 	return perf_evlist__set_filter_pids(trace->evlist, nr, pids);
2426 }
2427 
2428 static int trace__run(struct trace *trace, int argc, const char **argv)
2429 {
2430 	struct perf_evlist *evlist = trace->evlist;
2431 	struct perf_evsel *evsel, *pgfault_maj = NULL, *pgfault_min = NULL;
2432 	int err = -1, i;
2433 	unsigned long before;
2434 	const bool forks = argc > 0;
2435 	bool draining = false;
2436 
2437 	trace->live = true;
2438 
2439 	if (trace->trace_syscalls && trace__add_syscall_newtp(trace))
2440 		goto out_error_raw_syscalls;
2441 
2442 	if (trace->trace_syscalls)
2443 		trace->vfs_getname = perf_evlist__add_vfs_getname(evlist);
2444 
2445 	if ((trace->trace_pgfaults & TRACE_PFMAJ)) {
2446 		pgfault_maj = perf_evsel__new_pgfault(PERF_COUNT_SW_PAGE_FAULTS_MAJ);
2447 		if (pgfault_maj == NULL)
2448 			goto out_error_mem;
2449 		perf_evsel__config_callchain(pgfault_maj, &trace->opts, &callchain_param);
2450 		perf_evlist__add(evlist, pgfault_maj);
2451 	}
2452 
2453 	if ((trace->trace_pgfaults & TRACE_PFMIN)) {
2454 		pgfault_min = perf_evsel__new_pgfault(PERF_COUNT_SW_PAGE_FAULTS_MIN);
2455 		if (pgfault_min == NULL)
2456 			goto out_error_mem;
2457 		perf_evsel__config_callchain(pgfault_min, &trace->opts, &callchain_param);
2458 		perf_evlist__add(evlist, pgfault_min);
2459 	}
2460 
2461 	if (trace->sched &&
2462 	    perf_evlist__add_newtp(evlist, "sched", "sched_stat_runtime",
2463 				   trace__sched_stat_runtime))
2464 		goto out_error_sched_stat_runtime;
2465 
2466 	/*
2467 	 * If a global cgroup was set, apply it to all the events without an
2468 	 * explicit cgroup. I.e.:
2469 	 *
2470 	 * 	trace -G A -e sched:*switch
2471 	 *
2472 	 * Will set all raw_syscalls:sys_{enter,exit}, pgfault, vfs_getname, etc
2473 	 * _and_ sched:sched_switch to the 'A' cgroup, while:
2474 	 *
	 * 	trace -e sched:*switch -G A
2476 	 *
2477 	 * will only set the sched:sched_switch event to the 'A' cgroup, all the
	 * other events (raw_syscalls:sys_{enter,exit}, etc) are left "without"
2479 	 * a cgroup (on the root cgroup, sys wide, etc).
2480 	 *
2481 	 * Multiple cgroups:
2482 	 *
	 * 	trace -G A -e sched:*switch -G B
2484 	 *
2485 	 * the syscall ones go to the 'A' cgroup, the sched:sched_switch goes
2486 	 * to the 'B' cgroup.
2487 	 *
2488 	 * evlist__set_default_cgroup() grabs a reference of the passed cgroup
2489 	 * only for the evsels still without a cgroup, i.e. evsel->cgroup == NULL.
2490 	 */
2491 	if (trace->cgroup)
2492 		evlist__set_default_cgroup(trace->evlist, trace->cgroup);
2493 
2494 	err = perf_evlist__create_maps(evlist, &trace->opts.target);
2495 	if (err < 0) {
2496 		fprintf(trace->output, "Problems parsing the target to trace, check your options!\n");
2497 		goto out_delete_evlist;
2498 	}
2499 
2500 	err = trace__symbols_init(trace, evlist);
2501 	if (err < 0) {
2502 		fprintf(trace->output, "Problems initializing symbol libraries!\n");
2503 		goto out_delete_evlist;
2504 	}
2505 
2506 	perf_evlist__config(evlist, &trace->opts, &callchain_param);
2507 
2508 	signal(SIGCHLD, sig_handler);
2509 	signal(SIGINT, sig_handler);
2510 
2511 	if (forks) {
2512 		err = perf_evlist__prepare_workload(evlist, &trace->opts.target,
2513 						    argv, false, NULL);
2514 		if (err < 0) {
2515 			fprintf(trace->output, "Couldn't run the workload!\n");
2516 			goto out_delete_evlist;
2517 		}
2518 	}
2519 
2520 	err = perf_evlist__open(evlist);
2521 	if (err < 0)
2522 		goto out_error_open;
2523 
2524 	err = bpf__apply_obj_config();
2525 	if (err) {
2526 		char errbuf[BUFSIZ];
2527 
2528 		bpf__strerror_apply_obj_config(err, errbuf, sizeof(errbuf));
2529 		pr_err("ERROR: Apply config to BPF failed: %s\n",
2530 			 errbuf);
2531 		goto out_error_open;
2532 	}
2533 
2534 	/*
2535 	 * Better not use !target__has_task() here because we need to cover the
2536 	 * case where no threads were specified in the command line, but a
2537 	 * workload was, and in that case we will fill in the thread_map when
2538 	 * we fork the workload in perf_evlist__prepare_workload.
2539 	 */
2540 	if (trace->filter_pids.nr > 0)
2541 		err = perf_evlist__set_filter_pids(evlist, trace->filter_pids.nr, trace->filter_pids.entries);
2542 	else if (thread_map__pid(evlist->threads, 0) == -1)
2543 		err = trace__set_filter_loop_pids(trace);
2544 
2545 	if (err < 0)
2546 		goto out_error_mem;
2547 
2548 	if (trace->ev_qualifier_ids.nr > 0) {
2549 		err = trace__set_ev_qualifier_filter(trace);
2550 		if (err < 0)
2551 			goto out_errno;
2552 
2553 		pr_debug("event qualifier tracepoint filter: %s\n",
2554 			 trace->syscalls.events.sys_exit->filter);
2555 	}
2556 
2557 	err = perf_evlist__apply_filters(evlist, &evsel);
2558 	if (err < 0)
2559 		goto out_error_apply_filters;
2560 
2561 	err = perf_evlist__mmap(evlist, trace->opts.mmap_pages);
2562 	if (err < 0)
2563 		goto out_error_mmap;
2564 
2565 	if (!target__none(&trace->opts.target) && !trace->opts.initial_delay)
2566 		perf_evlist__enable(evlist);
2567 
2568 	if (forks)
2569 		perf_evlist__start_workload(evlist);
2570 
2571 	if (trace->opts.initial_delay) {
2572 		usleep(trace->opts.initial_delay * 1000);
2573 		perf_evlist__enable(evlist);
2574 	}
2575 
2576 	trace->multiple_threads = thread_map__pid(evlist->threads, 0) == -1 ||
2577 				  evlist->threads->nr > 1 ||
2578 				  perf_evlist__first(evlist)->attr.inherit;
2579 
2580 	/*
	 * Now that we have already used evsel->attr to ask the kernel to set
	 * up the events, let's reuse evsel->attr.sample_max_stack as the limit
	 * in trace__resolve_callchain(), allowing per-event max-stack settings
	 * to override an explicitly set --max-stack global setting.
2585 	 */
2586 	evlist__for_each_entry(evlist, evsel) {
2587 		if (evsel__has_callchain(evsel) &&
2588 		    evsel->attr.sample_max_stack == 0)
2589 			evsel->attr.sample_max_stack = trace->max_stack;
2590 	}
2591 again:
2592 	before = trace->nr_events;
2593 
2594 	for (i = 0; i < evlist->nr_mmaps; i++) {
2595 		union perf_event *event;
2596 		struct perf_mmap *md;
2597 
2598 		md = &evlist->mmap[i];
2599 		if (perf_mmap__read_init(md) < 0)
2600 			continue;
2601 
2602 		while ((event = perf_mmap__read_event(md)) != NULL) {
2603 			struct perf_sample sample;
2604 
2605 			++trace->nr_events;
2606 
2607 			err = perf_evlist__parse_sample(evlist, event, &sample);
2608 			if (err) {
2609 				fprintf(trace->output, "Can't parse sample, err = %d, skipping...\n", err);
2610 				goto next_event;
2611 			}
2612 
2613 			trace__handle_event(trace, event, &sample);
2614 next_event:
2615 			perf_mmap__consume(md);
2616 
2617 			if (interrupted)
2618 				goto out_disable;
2619 
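			/*
			 * 'done' is set by the signal handler: disable the
			 * events but keep reading until the mmap buffers
			 * drain, so the tail of the trace isn't lost.
			 */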
2620 			if (done && !draining) {
2621 				perf_evlist__disable(evlist);
2622 				draining = true;
2623 			}
2624 		}
2625 		perf_mmap__read_done(md);
2626 	}
2627 
2628 	if (trace->nr_events == before) {
2629 		int timeout = done ? 100 : -1;
2630 
2631 		if (!draining && perf_evlist__poll(evlist, timeout) > 0) {
2632 			if (perf_evlist__filter_pollfd(evlist, POLLERR | POLLHUP) == 0)
2633 				draining = true;
2634 
2635 			goto again;
2636 		}
2637 	} else {
2638 		goto again;
2639 	}
2640 
2641 out_disable:
2642 	thread__zput(trace->current);
2643 
2644 	perf_evlist__disable(evlist);
2645 
2646 	if (!err) {
2647 		if (trace->summary)
2648 			trace__fprintf_thread_summary(trace, trace->output);
2649 
2650 		if (trace->show_tool_stats) {
2651 			fprintf(trace->output, "Stats:\n "
2652 					       " vfs_getname : %" PRIu64 "\n"
2653 					       " proc_getname: %" PRIu64 "\n",
2654 				trace->stats.vfs_getname,
2655 				trace->stats.proc_getname);
2656 		}
2657 	}
2658 
2659 out_delete_evlist:
2660 	trace__symbols__exit(trace);
2661 
2662 	perf_evlist__delete(evlist);
2663 	cgroup__put(trace->cgroup);
2664 	trace->evlist = NULL;
2665 	trace->live = false;
2666 	return err;
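/*
 * Out of line error handling: the block below merely scopes errbuf to the
 * error labels, which are reachable only via goto.
 */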
2667 {
2668 	char errbuf[BUFSIZ];
2669 
2670 out_error_sched_stat_runtime:
2671 	tracing_path__strerror_open_tp(errno, errbuf, sizeof(errbuf), "sched", "sched_stat_runtime");
2672 	goto out_error;
2673 
2674 out_error_raw_syscalls:
2675 	tracing_path__strerror_open_tp(errno, errbuf, sizeof(errbuf), "raw_syscalls", "sys_(enter|exit)");
2676 	goto out_error;
2677 
2678 out_error_mmap:
2679 	perf_evlist__strerror_mmap(evlist, errno, errbuf, sizeof(errbuf));
2680 	goto out_error;
2681 
2682 out_error_open:
2683 	perf_evlist__strerror_open(evlist, errno, errbuf, sizeof(errbuf));
2684 
2685 out_error:
2686 	fprintf(trace->output, "%s\n", errbuf);
2687 	goto out_delete_evlist;
2688 
2689 out_error_apply_filters:
2690 	fprintf(trace->output,
2691 		"Failed to set filter \"%s\" on event %s with %d (%s)\n",
2692 		evsel->filter, perf_evsel__name(evsel), errno,
2693 		str_error_r(errno, errbuf, sizeof(errbuf)));
2694 	goto out_delete_evlist;
2695 }
2696 out_error_mem:
2697 	fprintf(trace->output, "Not enough memory to run!\n");
2698 	goto out_delete_evlist;
2699 
2700 out_errno:
2701 	fprintf(trace->output, "errno=%d,%s\n", errno, strerror(errno));
2702 	goto out_delete_evlist;
2703 }
2704 
2705 static int trace__replay(struct trace *trace)
2706 {
2707 	const struct perf_evsel_str_handler handlers[] = {
2708 		{ "probe:vfs_getname",	     trace__vfs_getname, },
2709 	};
2710 	struct perf_data data = {
2711 		.file      = {
2712 			.path = input_name,
2713 		},
2714 		.mode      = PERF_DATA_MODE_READ,
2715 		.force     = trace->force,
2716 	};
2717 	struct perf_session *session;
2718 	struct perf_evsel *evsel;
2719 	int err = -1;
2720 
2721 	trace->tool.sample	  = trace__process_sample;
2722 	trace->tool.mmap	  = perf_event__process_mmap;
2723 	trace->tool.mmap2	  = perf_event__process_mmap2;
2724 	trace->tool.comm	  = perf_event__process_comm;
2725 	trace->tool.exit	  = perf_event__process_exit;
2726 	trace->tool.fork	  = perf_event__process_fork;
2727 	trace->tool.attr	  = perf_event__process_attr;
2728 	trace->tool.tracing_data  = perf_event__process_tracing_data;
2729 	trace->tool.build_id	  = perf_event__process_build_id;
2730 	trace->tool.namespaces	  = perf_event__process_namespaces;
2731 
2732 	trace->tool.ordered_events = true;
2733 	trace->tool.ordering_requires_timestamps = true;
2734 
2735 	/* add tid to output */
2736 	trace->multiple_threads = true;
2737 
2738 	session = perf_session__new(&data, false, &trace->tool);
2739 	if (session == NULL)
2740 		return -1;
2741 
2742 	if (trace->opts.target.pid)
2743 		symbol_conf.pid_list_str = strdup(trace->opts.target.pid);
2744 
2745 	if (trace->opts.target.tid)
2746 		symbol_conf.tid_list_str = strdup(trace->opts.target.tid);
2747 
2748 	if (symbol__init(&session->header.env) < 0)
2749 		goto out;
2750 
2751 	trace->host = &session->machines.host;
2752 
2753 	err = perf_session__set_tracepoints_handlers(session, handlers);
2754 	if (err)
2755 		goto out;
2756 
2757 	evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
2758 						     "raw_syscalls:sys_enter");
2759 	/* older kernels have syscalls tp versus raw_syscalls */
2760 	if (evsel == NULL)
2761 		evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
2762 							     "syscalls:sys_enter");
2763 
2764 	if (evsel &&
2765 	    (perf_evsel__init_raw_syscall_tp(evsel, trace__sys_enter) < 0 ||
2766 	    perf_evsel__init_sc_tp_ptr_field(evsel, args))) {
		pr_err("Error initializing the raw_syscalls:sys_enter event\n");
2768 		goto out;
2769 	}
2770 
2771 	evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
2772 						     "raw_syscalls:sys_exit");
2773 	if (evsel == NULL)
2774 		evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
2775 							     "syscalls:sys_exit");
2776 	if (evsel &&
2777 	    (perf_evsel__init_raw_syscall_tp(evsel, trace__sys_exit) < 0 ||
2778 	    perf_evsel__init_sc_tp_uint_field(evsel, ret))) {
		pr_err("Error initializing the raw_syscalls:sys_exit event\n");
2780 		goto out;
2781 	}
2782 
2783 	evlist__for_each_entry(session->evlist, evsel) {
2784 		if (evsel->attr.type == PERF_TYPE_SOFTWARE &&
2785 		    (evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ ||
2786 		     evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MIN ||
2787 		     evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS))
2788 			evsel->handler = trace__pgfault;
2789 	}
2790 
2791 	setup_pager();
2792 
2793 	err = perf_session__process_events(session);
	if (err)
		pr_err("Failed to process events, error %d\n", err);
	else if (trace->summary)
2798 		trace__fprintf_thread_summary(trace, trace->output);
2799 
2800 out:
2801 	perf_session__delete(session);
2802 
2803 	return err;
2804 }
2805 
2806 static size_t trace__fprintf_threads_header(FILE *fp)
2807 {
2808 	size_t printed;
2809 
2810 	printed  = fprintf(fp, "\n Summary of events:\n\n");
2811 
2812 	return printed;
2813 }
2814 
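/*
 * Re-sort the per-thread syscall stats intlist by total time spent in each
 * syscall (msecs = number of calls * average duration), descending.
 */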
2815 DEFINE_RESORT_RB(syscall_stats, a->msecs > b->msecs,
2816 	struct stats 	*stats;
2817 	double		msecs;
2818 	int		syscall;
2819 )
2820 {
2821 	struct int_node *source = rb_entry(nd, struct int_node, rb_node);
2822 	struct stats *stats = source->priv;
2823 
2824 	entry->syscall = source->i;
2825 	entry->stats   = stats;
2826 	entry->msecs   = stats ? (u64)stats->n * (avg_stats(stats) / NSEC_PER_MSEC) : 0;
2827 }
2828 
2829 static size_t thread__dump_stats(struct thread_trace *ttrace,
2830 				 struct trace *trace, FILE *fp)
2831 {
2832 	size_t printed = 0;
2833 	struct syscall *sc;
2834 	struct rb_node *nd;
2835 	DECLARE_RESORT_RB_INTLIST(syscall_stats, ttrace->syscall_stats);
2836 
2837 	if (syscall_stats == NULL)
2838 		return 0;
2839 
2840 	printed += fprintf(fp, "\n");
2841 
2842 	printed += fprintf(fp, "   syscall            calls    total       min       avg       max      stddev\n");
2843 	printed += fprintf(fp, "                               (msec)    (msec)    (msec)    (msec)        (%%)\n");
2844 	printed += fprintf(fp, "   --------------- -------- --------- --------- --------- ---------     ------\n");
2845 
2846 	resort_rb__for_each_entry(nd, syscall_stats) {
2847 		struct stats *stats = syscall_stats_entry->stats;
2848 		if (stats) {
2849 			double min = (double)(stats->min) / NSEC_PER_MSEC;
2850 			double max = (double)(stats->max) / NSEC_PER_MSEC;
2851 			double avg = avg_stats(stats);
2852 			double pct;
2853 			u64 n = (u64) stats->n;
2854 
2855 			pct = avg ? 100.0 * stddev_stats(stats)/avg : 0.0;
2856 			avg /= NSEC_PER_MSEC;
2857 
2858 			sc = &trace->syscalls.table[syscall_stats_entry->syscall];
2859 			printed += fprintf(fp, "   %-15s", sc->name);
2860 			printed += fprintf(fp, " %8" PRIu64 " %9.3f %9.3f %9.3f",
2861 					   n, syscall_stats_entry->msecs, min, avg);
2862 			printed += fprintf(fp, " %9.3f %9.2f%%\n", max, pct);
2863 		}
2864 	}
2865 
2866 	resort_rb__delete(syscall_stats);
2867 	printed += fprintf(fp, "\n\n");
2868 
2869 	return printed;
2870 }
2871 
2872 static size_t trace__fprintf_thread(FILE *fp, struct thread *thread, struct trace *trace)
2873 {
2874 	size_t printed = 0;
2875 	struct thread_trace *ttrace = thread__priv(thread);
2876 	double ratio;
2877 
2878 	if (ttrace == NULL)
2879 		return 0;
2880 
2881 	ratio = (double)ttrace->nr_events / trace->nr_events * 100.0;
2882 
2883 	printed += fprintf(fp, " %s (%d), ", thread__comm_str(thread), thread->tid);
2884 	printed += fprintf(fp, "%lu events, ", ttrace->nr_events);
2885 	printed += fprintf(fp, "%.1f%%", ratio);
2886 	if (ttrace->pfmaj)
2887 		printed += fprintf(fp, ", %lu majfaults", ttrace->pfmaj);
2888 	if (ttrace->pfmin)
2889 		printed += fprintf(fp, ", %lu minfaults", ttrace->pfmin);
2890 	if (trace->sched)
2891 		printed += fprintf(fp, ", %.3f msec\n", ttrace->runtime_ms);
2892 	else if (fputc('\n', fp) != EOF)
2893 		++printed;
2894 
2895 	printed += thread__dump_stats(ttrace, trace, fp);
2896 
2897 	return printed;
2898 }
2899 
2900 static unsigned long thread__nr_events(struct thread_trace *ttrace)
2901 {
2902 	return ttrace ? ttrace->nr_events : 0;
2903 }
2904 
2905 DEFINE_RESORT_RB(threads, (thread__nr_events(a->thread->priv) < thread__nr_events(b->thread->priv)),
2906 	struct thread *thread;
2907 )
2908 {
2909 	entry->thread = rb_entry(nd, struct thread, rb_node);
2910 }
2911 
2912 static size_t trace__fprintf_thread_summary(struct trace *trace, FILE *fp)
2913 {
2914 	size_t printed = trace__fprintf_threads_header(fp);
2915 	struct rb_node *nd;
2916 	int i;
2917 
2918 	for (i = 0; i < THREADS__TABLE_SIZE; i++) {
2919 		DECLARE_RESORT_RB_MACHINE_THREADS(threads, trace->host, i);
2920 
2921 		if (threads == NULL) {
2922 			fprintf(fp, "%s", "Error sorting output by nr_events!\n");
2923 			return 0;
2924 		}
2925 
2926 		resort_rb__for_each_entry(nd, threads)
2927 			printed += trace__fprintf_thread(fp, threads_entry->thread, trace);
2928 
2929 		resort_rb__delete(threads);
2930 	}
2931 	return printed;
2932 }
2933 
2934 static int trace__set_duration(const struct option *opt, const char *str,
2935 			       int unset __maybe_unused)
2936 {
2937 	struct trace *trace = opt->value;
2938 
2939 	trace->duration_filter = atof(str);
2940 	return 0;
2941 }
2942 
2943 static int trace__set_filter_pids(const struct option *opt, const char *str,
2944 				  int unset __maybe_unused)
2945 {
2946 	int ret = -1;
2947 	size_t i;
2948 	struct trace *trace = opt->value;
2949 	/*
2950 	 * FIXME: introduce a intarray class, plain parse csv and create a
2951 	 * { int nr, int entries[] } struct...
2952 	 */
2953 	struct intlist *list = intlist__new(str);
2954 
2955 	if (list == NULL)
2956 		return -1;
2957 
2958 	i = trace->filter_pids.nr = intlist__nr_entries(list) + 1;
2959 	trace->filter_pids.entries = calloc(i, sizeof(pid_t));
2960 
2961 	if (trace->filter_pids.entries == NULL)
2962 		goto out;
2963 
2964 	trace->filter_pids.entries[0] = getpid();
2965 
2966 	for (i = 1; i < trace->filter_pids.nr; ++i)
2967 		trace->filter_pids.entries[i] = intlist__entry(list, i - 1)->i;
2968 
2969 	intlist__delete(list);
2970 	ret = 0;
2971 out:
2972 	return ret;
2973 }
2974 
2975 static int trace__open_output(struct trace *trace, const char *filename)
2976 {
2977 	struct stat st;
2978 
2979 	if (!stat(filename, &st) && st.st_size) {
2980 		char oldname[PATH_MAX];
2981 
2982 		scnprintf(oldname, sizeof(oldname), "%s.old", filename);
2983 		unlink(oldname);
2984 		rename(filename, oldname);
2985 	}
2986 
2987 	trace->output = fopen(filename, "w");
2988 
2989 	return trace->output == NULL ? -errno : 0;
2990 }
2991 
2992 static int parse_pagefaults(const struct option *opt, const char *str,
2993 			    int unset __maybe_unused)
2994 {
2995 	int *trace_pgfaults = opt->value;
2996 
2997 	if (strcmp(str, "all") == 0)
2998 		*trace_pgfaults |= TRACE_PFMAJ | TRACE_PFMIN;
2999 	else if (strcmp(str, "maj") == 0)
3000 		*trace_pgfaults |= TRACE_PFMAJ;
3001 	else if (strcmp(str, "min") == 0)
3002 		*trace_pgfaults |= TRACE_PFMIN;
3003 	else
3004 		return -1;
3005 
3006 	return 0;
3007 }
3008 
3009 static void evlist__set_evsel_handler(struct perf_evlist *evlist, void *handler)
3010 {
3011 	struct perf_evsel *evsel;
3012 
3013 	evlist__for_each_entry(evlist, evsel)
3014 		evsel->handler = handler;
3015 }
3016 
3017 static int evlist__set_syscall_tp_fields(struct perf_evlist *evlist)
3018 {
3019 	struct perf_evsel *evsel;
3020 
3021 	evlist__for_each_entry(evlist, evsel) {
3022 		if (evsel->priv || !evsel->tp_format)
3023 			continue;
3024 
3025 		if (strcmp(evsel->tp_format->system, "syscalls"))
3026 			continue;
3027 
3028 		if (perf_evsel__init_syscall_tp(evsel))
3029 			return -1;
3030 
3031 		if (!strncmp(evsel->tp_format->name, "sys_enter_", 10)) {
3032 			struct syscall_tp *sc = evsel->priv;
3033 
3034 			if (__tp_field__init_ptr(&sc->args, sc->id.offset + sizeof(u64)))
3035 				return -1;
3036 		} else if (!strncmp(evsel->tp_format->name, "sys_exit_", 9)) {
3037 			struct syscall_tp *sc = evsel->priv;
3038 
3039 			if (__tp_field__init_uint(&sc->ret, sizeof(u64), sc->id.offset + sizeof(u64), evsel->needs_swap))
3040 				return -1;
3041 		}
3042 	}
3043 
3044 	return 0;
3045 }
3046 
3047 /*
 * XXX: Hackish, just splitting the combined -e/--event (syscalls
 * (raw_syscalls:sys_{enter,exit}) + events (tracepoints, HW, SW, etc.)) to use
3050  * existing facilities unchanged (trace->ev_qualifier + parse_options()).
3051  *
3052  * It'd be better to introduce a parse_options() variant that would return a
3053  * list with the terms it didn't match to an event...
3054  */
3055 static int trace__parse_events_option(const struct option *opt, const char *str,
3056 				      int unset __maybe_unused)
3057 {
3058 	struct trace *trace = (struct trace *)opt->value;
3059 	const char *s = str;
3060 	char *sep = NULL, *lists[2] = { NULL, NULL, };
3061 	int len = strlen(str) + 1, err = -1, list, idx;
3062 	char *strace_groups_dir = system_path(STRACE_GROUPS_DIR);
3063 	char group_name[PATH_MAX];
3064 
3065 	if (strace_groups_dir == NULL)
3066 		return -1;
3067 
3068 	if (*s == '!') {
3069 		++s;
3070 		trace->not_ev_qualifier = true;
3071 	}
3072 
3073 	while (1) {
3074 		if ((sep = strchr(s, ',')) != NULL)
3075 			*sep = '\0';
3076 
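		/*
		 * lists[1] collects syscall names and strace group files for
		 * the event qualifier, lists[0] everything else, to be handed
		 * to parse_events_option() below.
		 */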
3077 		list = 0;
3078 		if (syscalltbl__id(trace->sctbl, s) >= 0 ||
3079 		    syscalltbl__strglobmatch_first(trace->sctbl, s, &idx) >= 0) {
3080 			list = 1;
3081 		} else {
3082 			path__join(group_name, sizeof(group_name), strace_groups_dir, s);
3083 			if (access(group_name, R_OK) == 0)
3084 				list = 1;
3085 		}
3086 
3087 		if (lists[list]) {
3088 			sprintf(lists[list] + strlen(lists[list]), ",%s", s);
3089 		} else {
3090 			lists[list] = malloc(len);
3091 			if (lists[list] == NULL)
3092 				goto out;
3093 			strcpy(lists[list], s);
3094 		}
3095 
3096 		if (!sep)
3097 			break;
3098 
3099 		*sep = ',';
3100 		s = sep + 1;
3101 	}
3102 
3103 	if (lists[1] != NULL) {
3104 		struct strlist_config slist_config = {
3105 			.dirname = strace_groups_dir,
3106 		};
3107 
3108 		trace->ev_qualifier = strlist__new(lists[1], &slist_config);
3109 		if (trace->ev_qualifier == NULL) {
			fputs("Not enough memory to parse event qualifier\n", trace->output);
3111 			goto out;
3112 		}
3113 
3114 		if (trace__validate_ev_qualifier(trace))
3115 			goto out;
3116 		trace->trace_syscalls = true;
3117 	}
3118 
3119 	err = 0;
3120 
3121 	if (lists[0]) {
3122 		struct option o = OPT_CALLBACK('e', "event", &trace->evlist, "event",
3123 					       "event selector. use 'perf list' to list available events",
3124 					       parse_events_option);
3125 		err = parse_events_option(&o, lists[0], 0);
3126 	}
3127 out:
3128 	if (sep)
3129 		*sep = ',';
3130 
3131 	return err;
3132 }
3133 
3134 static int trace__parse_cgroups(const struct option *opt, const char *str, int unset)
3135 {
3136 	struct trace *trace = opt->value;
3137 
3138 	if (!list_empty(&trace->evlist->entries))
3139 		return parse_cgroups(opt, str, unset);
3140 
3141 	trace->cgroup = evlist__findnew_cgroup(trace->evlist, str);
3142 
3143 	return 0;
3144 }
3145 
3146 int cmd_trace(int argc, const char **argv)
3147 {
3148 	const char *trace_usage[] = {
3149 		"perf trace [<options>] [<command>]",
3150 		"perf trace [<options>] -- <command> [<options>]",
3151 		"perf trace record [<options>] [<command>]",
3152 		"perf trace record [<options>] -- <command> [<options>]",
3153 		NULL
3154 	};
3155 	struct trace trace = {
3156 		.syscalls = {
			.max = -1,
3158 		},
3159 		.opts = {
3160 			.target = {
3161 				.uid	   = UINT_MAX,
3162 				.uses_mmap = true,
3163 			},
3164 			.user_freq     = UINT_MAX,
3165 			.user_interval = ULLONG_MAX,
3166 			.no_buffering  = true,
3167 			.mmap_pages    = UINT_MAX,
3168 			.proc_map_timeout  = 500,
3169 		},
3170 		.output = stderr,
3171 		.show_comm = true,
3172 		.trace_syscalls = false,
3173 		.kernel_syscallchains = false,
3174 		.max_stack = UINT_MAX,
3175 	};
3176 	const char *output_name = NULL;
3177 	const struct option trace_options[] = {
3178 	OPT_CALLBACK('e', "event", &trace, "event",
3179 		     "event/syscall selector. use 'perf list' to list available events",
3180 		     trace__parse_events_option),
3181 	OPT_BOOLEAN(0, "comm", &trace.show_comm,
3182 		    "show the thread COMM next to its id"),
3183 	OPT_BOOLEAN(0, "tool_stats", &trace.show_tool_stats, "show tool stats"),
3184 	OPT_CALLBACK(0, "expr", &trace, "expr", "list of syscalls/events to trace",
3185 		     trace__parse_events_option),
3186 	OPT_STRING('o', "output", &output_name, "file", "output file name"),
3187 	OPT_STRING('i', "input", &input_name, "file", "Analyze events in file"),
3188 	OPT_STRING('p', "pid", &trace.opts.target.pid, "pid",
3189 		    "trace events on existing process id"),
3190 	OPT_STRING('t', "tid", &trace.opts.target.tid, "tid",
3191 		    "trace events on existing thread id"),
3192 	OPT_CALLBACK(0, "filter-pids", &trace, "CSV list of pids",
3193 		     "pids to filter (by the kernel)", trace__set_filter_pids),
3194 	OPT_BOOLEAN('a', "all-cpus", &trace.opts.target.system_wide,
3195 		    "system-wide collection from all CPUs"),
3196 	OPT_STRING('C', "cpu", &trace.opts.target.cpu_list, "cpu",
3197 		    "list of cpus to monitor"),
3198 	OPT_BOOLEAN(0, "no-inherit", &trace.opts.no_inherit,
3199 		    "child tasks do not inherit counters"),
3200 	OPT_CALLBACK('m', "mmap-pages", &trace.opts.mmap_pages, "pages",
3201 		     "number of mmap data pages",
3202 		     perf_evlist__parse_mmap_pages),
3203 	OPT_STRING('u', "uid", &trace.opts.target.uid_str, "user",
3204 		   "user to profile"),
3205 	OPT_CALLBACK(0, "duration", &trace, "float",
3206 		     "show only events with duration > N.M ms",
3207 		     trace__set_duration),
3208 	OPT_BOOLEAN(0, "sched", &trace.sched, "show blocking scheduler events"),
3209 	OPT_INCR('v', "verbose", &verbose, "be more verbose"),
3210 	OPT_BOOLEAN('T', "time", &trace.full_time,
3211 		    "Show full timestamp, not time relative to first start"),
3212 	OPT_BOOLEAN(0, "failure", &trace.failure_only,
3213 		    "Show only syscalls that failed"),
3214 	OPT_BOOLEAN('s', "summary", &trace.summary_only,
3215 		    "Show only syscall summary with statistics"),
3216 	OPT_BOOLEAN('S', "with-summary", &trace.summary,
3217 		    "Show all syscalls and summary with statistics"),
3218 	OPT_CALLBACK_DEFAULT('F', "pf", &trace.trace_pgfaults, "all|maj|min",
3219 		     "Trace pagefaults", parse_pagefaults, "maj"),
3220 	OPT_BOOLEAN(0, "syscalls", &trace.trace_syscalls, "Trace syscalls"),
3221 	OPT_BOOLEAN('f', "force", &trace.force, "don't complain, do it"),
3222 	OPT_CALLBACK(0, "call-graph", &trace.opts,
3223 		     "record_mode[,record_size]", record_callchain_help,
3224 		     &record_parse_callchain_opt),
3225 	OPT_BOOLEAN(0, "kernel-syscall-graph", &trace.kernel_syscallchains,
3226 		    "Show the kernel callchains on the syscall exit path"),
3227 	OPT_UINTEGER(0, "min-stack", &trace.min_stack,
3228 		     "Set the minimum stack depth when parsing the callchain, "
3229 		     "anything below the specified depth will be ignored."),
3230 	OPT_UINTEGER(0, "max-stack", &trace.max_stack,
3231 		     "Set the maximum stack depth when parsing the callchain, "
3232 		     "anything beyond the specified depth will be ignored. "
3233 		     "Default: kernel.perf_event_max_stack or " __stringify(PERF_MAX_STACK_DEPTH)),
3234 	OPT_BOOLEAN(0, "print-sample", &trace.print_sample,
3235 			"print the PERF_RECORD_SAMPLE PERF_SAMPLE_ info, for debugging"),
3236 	OPT_UINTEGER(0, "proc-map-timeout", &trace.opts.proc_map_timeout,
3237 			"per thread proc mmap processing timeout in ms"),
3238 	OPT_CALLBACK('G', "cgroup", &trace, "name", "monitor event in cgroup name only",
3239 		     trace__parse_cgroups),
3240 	OPT_UINTEGER('D', "delay", &trace.opts.initial_delay,
3241 		     "ms to wait before starting measurement after program "
3242 		     "start"),
3243 	OPT_END()
3244 	};
3245 	bool __maybe_unused max_stack_user_set = true;
3246 	bool mmap_pages_user_set = true;
3247 	struct perf_evsel *evsel;
3248 	const char * const trace_subcommands[] = { "record", NULL };
3249 	int err = -1;
3250 	char bf[BUFSIZ];
3251 
3252 	signal(SIGSEGV, sighandler_dump_stack);
3253 	signal(SIGFPE, sighandler_dump_stack);
3254 
3255 	trace.evlist = perf_evlist__new();
3256 	trace.sctbl = syscalltbl__new();
3257 
3258 	if (trace.evlist == NULL || trace.sctbl == NULL) {
3259 		pr_err("Not enough memory to run!\n");
3260 		err = -ENOMEM;
3261 		goto out;
3262 	}
3263 
3264 	argc = parse_options_subcommand(argc, argv, trace_options, trace_subcommands,
3265 				 trace_usage, PARSE_OPT_STOP_AT_NON_OPTION);
3266 
3267 	if ((nr_cgroups || trace.cgroup) && !trace.opts.target.system_wide) {
3268 		usage_with_options_msg(trace_usage, trace_options,
3269 				       "cgroup monitoring only available in system-wide mode");
3270 	}
3271 
3272 	evsel = bpf__setup_output_event(trace.evlist, "__augmented_syscalls__");
3273 	if (IS_ERR(evsel)) {
3274 		bpf__strerror_setup_output_event(trace.evlist, PTR_ERR(evsel), bf, sizeof(bf));
3275 		pr_err("ERROR: Setup trace syscalls enter failed: %s\n", bf);
3276 		goto out;
3277 	}
3278 
3279 	if (evsel) {
3280 		if (perf_evsel__init_augmented_syscall_tp(evsel) ||
3281 		    perf_evsel__init_augmented_syscall_tp_args(evsel))
3282 			goto out;
3283 		trace.syscalls.events.augmented = evsel;
3284 	}
3285 
3286 	err = bpf__setup_stdout(trace.evlist);
3287 	if (err) {
3288 		bpf__strerror_setup_stdout(trace.evlist, err, bf, sizeof(bf));
3289 		pr_err("ERROR: Setup BPF stdout failed: %s\n", bf);
3290 		goto out;
3291 	}
3292 
3293 	err = -1;
3294 
3295 	if (trace.trace_pgfaults) {
3296 		trace.opts.sample_address = true;
3297 		trace.opts.sample_time = true;
3298 	}
3299 
3300 	if (trace.opts.mmap_pages == UINT_MAX)
3301 		mmap_pages_user_set = false;
3302 
3303 	if (trace.max_stack == UINT_MAX) {
3304 		trace.max_stack = input_name ? PERF_MAX_STACK_DEPTH : sysctl__max_stack();
3305 		max_stack_user_set = false;
3306 	}
3307 
3308 #ifdef HAVE_DWARF_UNWIND_SUPPORT
3309 	if ((trace.min_stack || max_stack_user_set) && !callchain_param.enabled) {
3310 		record_opts__parse_callchain(&trace.opts, &callchain_param, "dwarf", false);
3311 	}
3312 #endif
3313 
3314 	if (callchain_param.enabled) {
3315 		if (!mmap_pages_user_set && geteuid() == 0)
3316 			trace.opts.mmap_pages = perf_event_mlock_kb_in_pages() * 4;
3317 
3318 		symbol_conf.use_callchain = true;
3319 	}
3320 
3321 	if (trace.evlist->nr_entries > 0) {
3322 		evlist__set_evsel_handler(trace.evlist, trace__event_handler);
3323 		if (evlist__set_syscall_tp_fields(trace.evlist)) {
3324 			perror("failed to set syscalls:* tracepoint fields");
3325 			goto out;
3326 		}
3327 	}
3328 
3329 	if ((argc >= 1) && (strcmp(argv[0], "record") == 0))
3330 		return trace__record(&trace, argc-1, &argv[1]);
3331 
3332 	/* summary_only implies summary option, but don't overwrite summary if set */
3333 	if (trace.summary_only)
3334 		trace.summary = trace.summary_only;
3335 
3336 	if (!trace.trace_syscalls && !trace.trace_pgfaults &&
	    trace.evlist->nr_entries == 0 /* Was -e/--event used? */) {
3338 		trace.trace_syscalls = true;
3339 	}
3340 
3341 	if (output_name != NULL) {
3342 		err = trace__open_output(&trace, output_name);
3343 		if (err < 0) {
3344 			perror("failed to create output file");
3345 			goto out;
3346 		}
3347 	}
3348 
3349 	err = target__validate(&trace.opts.target);
3350 	if (err) {
3351 		target__strerror(&trace.opts.target, err, bf, sizeof(bf));
3352 		fprintf(trace.output, "%s", bf);
3353 		goto out_close;
3354 	}
3355 
3356 	err = target__parse_uid(&trace.opts.target);
3357 	if (err) {
3358 		target__strerror(&trace.opts.target, err, bf, sizeof(bf));
3359 		fprintf(trace.output, "%s", bf);
3360 		goto out_close;
3361 	}
3362 
3363 	if (!argc && target__none(&trace.opts.target))
3364 		trace.opts.target.system_wide = true;
3365 
3366 	if (input_name)
3367 		err = trace__replay(&trace);
3368 	else
3369 		err = trace__run(&trace, argc, argv);
3370 
3371 out_close:
3372 	if (output_name != NULL)
3373 		fclose(trace.output);
3374 out:
3375 	return err;
3376 }
3377