xref: /openbmc/linux/tools/perf/builtin-trace.c (revision 4f3db074)
1 #include <traceevent/event-parse.h>
2 #include "builtin.h"
3 #include "util/color.h"
4 #include "util/debug.h"
5 #include "util/evlist.h"
6 #include "util/machine.h"
7 #include "util/session.h"
8 #include "util/thread.h"
9 #include "util/parse-options.h"
10 #include "util/strlist.h"
11 #include "util/intlist.h"
12 #include "util/thread_map.h"
13 #include "util/stat.h"
14 #include "trace-event.h"
15 #include "util/parse-events.h"
16 
17 #include <libaudit.h>
18 #include <stdlib.h>
19 #include <sys/eventfd.h>
20 #include <sys/mman.h>
21 #include <linux/futex.h>
22 
/*
 * Fallbacks for older distros: each constant below is only defined here
 * when the system headers did not already provide it.
 */
#if !defined(MAP_STACK)
#define MAP_STACK		0x20000
#endif

#if !defined(MADV_HWPOISON)
#define MADV_HWPOISON		100
#endif

#if !defined(MADV_MERGEABLE)
#define MADV_MERGEABLE		12
#endif

#if !defined(MADV_UNMERGEABLE)
#define MADV_UNMERGEABLE	13
#endif

#if !defined(EFD_SEMAPHORE)
#define EFD_SEMAPHORE		1
#endif
43 
44 struct tp_field {
45 	int offset;
46 	union {
47 		u64 (*integer)(struct tp_field *field, struct perf_sample *sample);
48 		void *(*pointer)(struct tp_field *field, struct perf_sample *sample);
49 	};
50 };
51 
52 #define TP_UINT_FIELD(bits) \
53 static u64 tp_field__u##bits(struct tp_field *field, struct perf_sample *sample) \
54 { \
55 	u##bits value; \
56 	memcpy(&value, sample->raw_data + field->offset, sizeof(value)); \
57 	return value;  \
58 }
59 
60 TP_UINT_FIELD(8);
61 TP_UINT_FIELD(16);
62 TP_UINT_FIELD(32);
63 TP_UINT_FIELD(64);
64 
65 #define TP_UINT_FIELD__SWAPPED(bits) \
66 static u64 tp_field__swapped_u##bits(struct tp_field *field, struct perf_sample *sample) \
67 { \
68 	u##bits value; \
69 	memcpy(&value, sample->raw_data + field->offset, sizeof(value)); \
70 	return bswap_##bits(value);\
71 }
72 
73 TP_UINT_FIELD__SWAPPED(16);
74 TP_UINT_FIELD__SWAPPED(32);
75 TP_UINT_FIELD__SWAPPED(64);
76 
77 static int tp_field__init_uint(struct tp_field *field,
78 			       struct format_field *format_field,
79 			       bool needs_swap)
80 {
81 	field->offset = format_field->offset;
82 
83 	switch (format_field->size) {
84 	case 1:
85 		field->integer = tp_field__u8;
86 		break;
87 	case 2:
88 		field->integer = needs_swap ? tp_field__swapped_u16 : tp_field__u16;
89 		break;
90 	case 4:
91 		field->integer = needs_swap ? tp_field__swapped_u32 : tp_field__u32;
92 		break;
93 	case 8:
94 		field->integer = needs_swap ? tp_field__swapped_u64 : tp_field__u64;
95 		break;
96 	default:
97 		return -1;
98 	}
99 
100 	return 0;
101 }
102 
103 static void *tp_field__ptr(struct tp_field *field, struct perf_sample *sample)
104 {
105 	return sample->raw_data + field->offset;
106 }
107 
108 static int tp_field__init_ptr(struct tp_field *field, struct format_field *format_field)
109 {
110 	field->offset = format_field->offset;
111 	field->pointer = tp_field__ptr;
112 	return 0;
113 }
114 
115 struct syscall_tp {
116 	struct tp_field id;
117 	union {
118 		struct tp_field args, ret;
119 	};
120 };
121 
122 static int perf_evsel__init_tp_uint_field(struct perf_evsel *evsel,
123 					  struct tp_field *field,
124 					  const char *name)
125 {
126 	struct format_field *format_field = perf_evsel__field(evsel, name);
127 
128 	if (format_field == NULL)
129 		return -1;
130 
131 	return tp_field__init_uint(field, format_field, evsel->needs_swap);
132 }
133 
134 #define perf_evsel__init_sc_tp_uint_field(evsel, name) \
135 	({ struct syscall_tp *sc = evsel->priv;\
136 	   perf_evsel__init_tp_uint_field(evsel, &sc->name, #name); })
137 
/*
 * Look up the tracepoint field called @name in @evsel and initialise @field
 * as a pointer accessor for it.
 *
 * Returns -1 when the field does not exist, otherwise whatever
 * tp_field__init_ptr() returns.
 */
static int perf_evsel__init_tp_ptr_field(struct perf_evsel *evsel,
					 struct tp_field *field,
					 const char *name)
{
	struct format_field *ff = perf_evsel__field(evsel, name);

	return ff ? tp_field__init_ptr(field, ff) : -1;
}

/*
 * Convenience wrapper: initialise the 'name' member of the struct syscall_tp
 * hanging off evsel->priv from the tracepoint field of the same name.
 */
#define perf_evsel__init_sc_tp_ptr_field(evsel, name) \
	({ struct syscall_tp *sc = evsel->priv;\
	   perf_evsel__init_tp_ptr_field(evsel, &sc->name, #name); })
153 
154 static void perf_evsel__delete_priv(struct perf_evsel *evsel)
155 {
156 	zfree(&evsel->priv);
157 	perf_evsel__delete(evsel);
158 }
159 
160 static int perf_evsel__init_syscall_tp(struct perf_evsel *evsel, void *handler)
161 {
162 	evsel->priv = malloc(sizeof(struct syscall_tp));
163 	if (evsel->priv != NULL) {
164 		if (perf_evsel__init_sc_tp_uint_field(evsel, id))
165 			goto out_delete;
166 
167 		evsel->handler = handler;
168 		return 0;
169 	}
170 
171 	return -ENOMEM;
172 
173 out_delete:
174 	zfree(&evsel->priv);
175 	return -ENOENT;
176 }
177 
/*
 * Create the evsel for the syscall tracepoint named @direction and prepare
 * it with perf_evsel__init_syscall_tp().
 *
 * Returns the new evsel, or NULL when the tracepoint cannot be created or
 * initialised.
 */
static struct perf_evsel *perf_evsel__syscall_newtp(const char *direction, void *handler)
{
	struct perf_evsel *evsel = perf_evsel__newtp("raw_syscalls", direction);

	/* older kernels (e.g., RHEL6) use syscalls:{enter,exit} */
	if (evsel == NULL)
		evsel = perf_evsel__newtp("syscalls", direction);

	if (evsel == NULL)
		return NULL;

	if (perf_evsel__init_syscall_tp(evsel, handler)) {
		perf_evsel__delete_priv(evsel);
		return NULL;
	}

	return evsel;
}
197 
/*
 * Read the 'name' member of the struct syscall_tp attached to evsel->priv
 * out of 'sample', through the integer accessor installed for it.
 */
#define perf_evsel__sc_tp_uint(evsel, name, sample) \
	({ struct syscall_tp *fields = evsel->priv; \
	   fields->name.integer(&fields->name, sample); })

/* Same as above, but through the pointer accessor. */
#define perf_evsel__sc_tp_ptr(evsel, name, sample) \
	({ struct syscall_tp *fields = evsel->priv; \
	   fields->name.pointer(&fields->name, sample); })
205 
206 static int perf_evlist__add_syscall_newtp(struct perf_evlist *evlist,
207 					  void *sys_enter_handler,
208 					  void *sys_exit_handler)
209 {
210 	int ret = -1;
211 	struct perf_evsel *sys_enter, *sys_exit;
212 
213 	sys_enter = perf_evsel__syscall_newtp("sys_enter", sys_enter_handler);
214 	if (sys_enter == NULL)
215 		goto out;
216 
217 	if (perf_evsel__init_sc_tp_ptr_field(sys_enter, args))
218 		goto out_delete_sys_enter;
219 
220 	sys_exit = perf_evsel__syscall_newtp("sys_exit", sys_exit_handler);
221 	if (sys_exit == NULL)
222 		goto out_delete_sys_enter;
223 
224 	if (perf_evsel__init_sc_tp_uint_field(sys_exit, ret))
225 		goto out_delete_sys_exit;
226 
227 	perf_evlist__add(evlist, sys_enter);
228 	perf_evlist__add(evlist, sys_exit);
229 
230 	ret = 0;
231 out:
232 	return ret;
233 
234 out_delete_sys_exit:
235 	perf_evsel__delete_priv(sys_exit);
236 out_delete_sys_enter:
237 	perf_evsel__delete_priv(sys_enter);
238 	goto out;
239 }
240 
241 
242 struct syscall_arg {
243 	unsigned long val;
244 	struct thread *thread;
245 	struct trace  *trace;
246 	void	      *parm;
247 	u8	      idx;
248 	u8	      mask;
249 };
250 
/*
 * Table mapping a range of integer values to names: value 'v' is looked up
 * as entries[v - offset], valid for indexes in [0, nr_entries).
 */
struct strarray {
	/* value that corresponds to entries[0] */
	int	    offset;
	/* number of slots in entries[] */
	int	    nr_entries;
	/* the names themselves */
	const char **entries;
};
256 
/*
 * Define 'strarray__<array>', a struct strarray wrapping the string table
 * 'array', whose first entry stands for value 0.
 */
#define DEFINE_STRARRAY(array) struct strarray strarray__##array = { \
	.nr_entries = ARRAY_SIZE(array), \
	.entries    = array, \
}

/* Same, for tables whose first entry stands for value 'off'. */
#define DEFINE_STRARRAY_OFFSET(array, off) struct strarray strarray__##array = { \
	.offset	    = off, \
	.nr_entries = ARRAY_SIZE(array), \
	.entries    = array, \
}
267 
268 static size_t __syscall_arg__scnprintf_strarray(char *bf, size_t size,
269 						const char *intfmt,
270 					        struct syscall_arg *arg)
271 {
272 	struct strarray *sa = arg->parm;
273 	int idx = arg->val - sa->offset;
274 
275 	if (idx < 0 || idx >= sa->nr_entries)
276 		return scnprintf(bf, size, intfmt, arg->val);
277 
278 	return scnprintf(bf, size, "%s", sa->entries[idx]);
279 }
280 
/*
 * Strarray formatter that falls back to printing the value in decimal
 * when it has no name in the table.
 */
static size_t syscall_arg__scnprintf_strarray(char *bf, size_t size,
					      struct syscall_arg *arg)
{
	const char *fallback_fmt = "%d";

	return __syscall_arg__scnprintf_strarray(bf, size, fallback_fmt, arg);
}

#define SCA_STRARRAY syscall_arg__scnprintf_strarray
288 
#if defined(__i386__) || defined(__x86_64__)
/*
 * FIXME: Make this available to all arches as soon as the ioctl beautifier
 * 	  gets rewritten to support all arches.
 *
 * Strarray formatter that falls back to printing the value in hex when it
 * has no name in the table.
 */
static size_t syscall_arg__scnprintf_strhexarray(char *bf, size_t size,
						 struct syscall_arg *arg)
{
	const char *fallback_fmt = "%#x";

	return __syscall_arg__scnprintf_strarray(bf, size, fallback_fmt, arg);
}

#define SCA_STRHEXARRAY syscall_arg__scnprintf_strhexarray
#endif /* defined(__i386__) || defined(__x86_64__) */
302 
/*
 * File descriptor formatter.  Only declared here so that the formatters and
 * tables below can refer to it; the definition is not in this part of the
 * file.
 */
static size_t syscall_arg__scnprintf_fd(char *bf, size_t size, struct syscall_arg *arg);

#define SCA_FD syscall_arg__scnprintf_fd
307 
308 static size_t syscall_arg__scnprintf_fd_at(char *bf, size_t size,
309 					   struct syscall_arg *arg)
310 {
311 	int fd = arg->val;
312 
313 	if (fd == AT_FDCWD)
314 		return scnprintf(bf, size, "CWD");
315 
316 	return syscall_arg__scnprintf_fd(bf, size, arg);
317 }
318 
319 #define SCA_FDAT syscall_arg__scnprintf_fd_at
320 
/*
 * Formatter for the fd argument of close().  Only declared here; the
 * definition is not in this part of the file.
 */
static size_t syscall_arg__scnprintf_close_fd(char *bf, size_t size, struct syscall_arg *arg);

#define SCA_CLOSE_FD syscall_arg__scnprintf_close_fd
325 
326 static size_t syscall_arg__scnprintf_hex(char *bf, size_t size,
327 					 struct syscall_arg *arg)
328 {
329 	return scnprintf(bf, size, "%#lx", arg->val);
330 }
331 
332 #define SCA_HEX syscall_arg__scnprintf_hex
333 
334 static size_t syscall_arg__scnprintf_mmap_prot(char *bf, size_t size,
335 					       struct syscall_arg *arg)
336 {
337 	int printed = 0, prot = arg->val;
338 
339 	if (prot == PROT_NONE)
340 		return scnprintf(bf, size, "NONE");
341 #define	P_MMAP_PROT(n) \
342 	if (prot & PROT_##n) { \
343 		printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
344 		prot &= ~PROT_##n; \
345 	}
346 
347 	P_MMAP_PROT(EXEC);
348 	P_MMAP_PROT(READ);
349 	P_MMAP_PROT(WRITE);
350 #ifdef PROT_SEM
351 	P_MMAP_PROT(SEM);
352 #endif
353 	P_MMAP_PROT(GROWSDOWN);
354 	P_MMAP_PROT(GROWSUP);
355 #undef P_MMAP_PROT
356 
357 	if (prot)
358 		printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", prot);
359 
360 	return printed;
361 }
362 
363 #define SCA_MMAP_PROT syscall_arg__scnprintf_mmap_prot
364 
365 static size_t syscall_arg__scnprintf_mmap_flags(char *bf, size_t size,
366 						struct syscall_arg *arg)
367 {
368 	int printed = 0, flags = arg->val;
369 
370 #define	P_MMAP_FLAG(n) \
371 	if (flags & MAP_##n) { \
372 		printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
373 		flags &= ~MAP_##n; \
374 	}
375 
376 	P_MMAP_FLAG(SHARED);
377 	P_MMAP_FLAG(PRIVATE);
378 #ifdef MAP_32BIT
379 	P_MMAP_FLAG(32BIT);
380 #endif
381 	P_MMAP_FLAG(ANONYMOUS);
382 	P_MMAP_FLAG(DENYWRITE);
383 	P_MMAP_FLAG(EXECUTABLE);
384 	P_MMAP_FLAG(FILE);
385 	P_MMAP_FLAG(FIXED);
386 	P_MMAP_FLAG(GROWSDOWN);
387 #ifdef MAP_HUGETLB
388 	P_MMAP_FLAG(HUGETLB);
389 #endif
390 	P_MMAP_FLAG(LOCKED);
391 	P_MMAP_FLAG(NONBLOCK);
392 	P_MMAP_FLAG(NORESERVE);
393 	P_MMAP_FLAG(POPULATE);
394 	P_MMAP_FLAG(STACK);
395 #ifdef MAP_UNINITIALIZED
396 	P_MMAP_FLAG(UNINITIALIZED);
397 #endif
398 #undef P_MMAP_FLAG
399 
400 	if (flags)
401 		printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);
402 
403 	return printed;
404 }
405 
406 #define SCA_MMAP_FLAGS syscall_arg__scnprintf_mmap_flags
407 
408 static size_t syscall_arg__scnprintf_mremap_flags(char *bf, size_t size,
409 						  struct syscall_arg *arg)
410 {
411 	int printed = 0, flags = arg->val;
412 
413 #define P_MREMAP_FLAG(n) \
414 	if (flags & MREMAP_##n) { \
415 		printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
416 		flags &= ~MREMAP_##n; \
417 	}
418 
419 	P_MREMAP_FLAG(MAYMOVE);
420 #ifdef MREMAP_FIXED
421 	P_MREMAP_FLAG(FIXED);
422 #endif
423 #undef P_MREMAP_FLAG
424 
425 	if (flags)
426 		printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);
427 
428 	return printed;
429 }
430 
431 #define SCA_MREMAP_FLAGS syscall_arg__scnprintf_mremap_flags
432 
433 static size_t syscall_arg__scnprintf_madvise_behavior(char *bf, size_t size,
434 						      struct syscall_arg *arg)
435 {
436 	int behavior = arg->val;
437 
438 	switch (behavior) {
439 #define	P_MADV_BHV(n) case MADV_##n: return scnprintf(bf, size, #n)
440 	P_MADV_BHV(NORMAL);
441 	P_MADV_BHV(RANDOM);
442 	P_MADV_BHV(SEQUENTIAL);
443 	P_MADV_BHV(WILLNEED);
444 	P_MADV_BHV(DONTNEED);
445 	P_MADV_BHV(REMOVE);
446 	P_MADV_BHV(DONTFORK);
447 	P_MADV_BHV(DOFORK);
448 	P_MADV_BHV(HWPOISON);
449 #ifdef MADV_SOFT_OFFLINE
450 	P_MADV_BHV(SOFT_OFFLINE);
451 #endif
452 	P_MADV_BHV(MERGEABLE);
453 	P_MADV_BHV(UNMERGEABLE);
454 #ifdef MADV_HUGEPAGE
455 	P_MADV_BHV(HUGEPAGE);
456 #endif
457 #ifdef MADV_NOHUGEPAGE
458 	P_MADV_BHV(NOHUGEPAGE);
459 #endif
460 #ifdef MADV_DONTDUMP
461 	P_MADV_BHV(DONTDUMP);
462 #endif
463 #ifdef MADV_DODUMP
464 	P_MADV_BHV(DODUMP);
465 #endif
466 #undef P_MADV_PHV
467 	default: break;
468 	}
469 
470 	return scnprintf(bf, size, "%#x", behavior);
471 }
472 
473 #define SCA_MADV_BHV syscall_arg__scnprintf_madvise_behavior
474 
475 static size_t syscall_arg__scnprintf_flock(char *bf, size_t size,
476 					   struct syscall_arg *arg)
477 {
478 	int printed = 0, op = arg->val;
479 
480 	if (op == 0)
481 		return scnprintf(bf, size, "NONE");
482 #define	P_CMD(cmd) \
483 	if ((op & LOCK_##cmd) == LOCK_##cmd) { \
484 		printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #cmd); \
485 		op &= ~LOCK_##cmd; \
486 	}
487 
488 	P_CMD(SH);
489 	P_CMD(EX);
490 	P_CMD(NB);
491 	P_CMD(UN);
492 	P_CMD(MAND);
493 	P_CMD(RW);
494 	P_CMD(READ);
495 	P_CMD(WRITE);
496 #undef P_OP
497 
498 	if (op)
499 		printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", op);
500 
501 	return printed;
502 }
503 
504 #define SCA_FLOCK syscall_arg__scnprintf_flock
505 
506 static size_t syscall_arg__scnprintf_futex_op(char *bf, size_t size, struct syscall_arg *arg)
507 {
508 	enum syscall_futex_args {
509 		SCF_UADDR   = (1 << 0),
510 		SCF_OP	    = (1 << 1),
511 		SCF_VAL	    = (1 << 2),
512 		SCF_TIMEOUT = (1 << 3),
513 		SCF_UADDR2  = (1 << 4),
514 		SCF_VAL3    = (1 << 5),
515 	};
516 	int op = arg->val;
517 	int cmd = op & FUTEX_CMD_MASK;
518 	size_t printed = 0;
519 
520 	switch (cmd) {
521 #define	P_FUTEX_OP(n) case FUTEX_##n: printed = scnprintf(bf, size, #n);
522 	P_FUTEX_OP(WAIT);	    arg->mask |= SCF_VAL3|SCF_UADDR2;		  break;
523 	P_FUTEX_OP(WAKE);	    arg->mask |= SCF_VAL3|SCF_UADDR2|SCF_TIMEOUT; break;
524 	P_FUTEX_OP(FD);		    arg->mask |= SCF_VAL3|SCF_UADDR2|SCF_TIMEOUT; break;
525 	P_FUTEX_OP(REQUEUE);	    arg->mask |= SCF_VAL3|SCF_TIMEOUT;	          break;
526 	P_FUTEX_OP(CMP_REQUEUE);    arg->mask |= SCF_TIMEOUT;			  break;
527 	P_FUTEX_OP(CMP_REQUEUE_PI); arg->mask |= SCF_TIMEOUT;			  break;
528 	P_FUTEX_OP(WAKE_OP);							  break;
529 	P_FUTEX_OP(LOCK_PI);	    arg->mask |= SCF_VAL3|SCF_UADDR2|SCF_TIMEOUT; break;
530 	P_FUTEX_OP(UNLOCK_PI);	    arg->mask |= SCF_VAL3|SCF_UADDR2|SCF_TIMEOUT; break;
531 	P_FUTEX_OP(TRYLOCK_PI);	    arg->mask |= SCF_VAL3|SCF_UADDR2;		  break;
532 	P_FUTEX_OP(WAIT_BITSET);    arg->mask |= SCF_UADDR2;			  break;
533 	P_FUTEX_OP(WAKE_BITSET);    arg->mask |= SCF_UADDR2;			  break;
534 	P_FUTEX_OP(WAIT_REQUEUE_PI);						  break;
535 	default: printed = scnprintf(bf, size, "%#x", cmd);			  break;
536 	}
537 
538 	if (op & FUTEX_PRIVATE_FLAG)
539 		printed += scnprintf(bf + printed, size - printed, "|PRIV");
540 
541 	if (op & FUTEX_CLOCK_REALTIME)
542 		printed += scnprintf(bf + printed, size - printed, "|CLKRT");
543 
544 	return printed;
545 }
546 
547 #define SCA_FUTEX_OP  syscall_arg__scnprintf_futex_op
548 
549 static const char *epoll_ctl_ops[] = { "ADD", "DEL", "MOD", };
550 static DEFINE_STRARRAY_OFFSET(epoll_ctl_ops, 1);
551 
/* Names for the 'which' argument of getitimer(), indexed from 0. */
static const char *itimers[] = {
	"REAL",
	"VIRTUAL",
	"PROF",
};
static DEFINE_STRARRAY(itimers);
554 
/*
 * Names for the 'whence' argument of lseek(), indexed from 0.  The last two
 * are only present when the headers know about them.
 */
static const char *whences[] = {
	"SET",
	"CUR",
	"END",
#ifdef SEEK_DATA
	"DATA",
#endif
#ifdef SEEK_HOLE
	"HOLE",
#endif
};
static DEFINE_STRARRAY(whences);
564 
/* Names for the 'cmd' argument of fcntl(), indexed from 0. */
static const char *fcntl_cmds[] = {
	"DUPFD",
	"GETFD",
	"SETFD",
	"GETFL",
	"SETFL",
	"GETLK",
	"SETLK",
	"SETLKW",
	"SETOWN",
	"GETOWN",
	"SETSIG",
	"GETSIG",
	"F_GETLK64",
	"F_SETLK64",
	"F_SETLKW64",
	"F_SETOWN_EX",
	"F_GETOWN_EX",
	"F_GETOWNER_UIDS",
};
static DEFINE_STRARRAY(fcntl_cmds);
572 
/* Names for the 'resource' argument of getrlimit()/prlimit64(), indexed from 0. */
static const char *rlimit_resources[] = {
	"CPU",
	"FSIZE",
	"DATA",
	"STACK",
	"CORE",
	"RSS",
	"NPROC",
	"NOFILE",
	"MEMLOCK",
	"AS",
	"LOCKS",
	"SIGPENDING",
	"MSGQUEUE",
	"NICE",
	"RTPRIO",
	"RTTIME",
};
static DEFINE_STRARRAY(rlimit_resources);
579 
/* Names for a signal mask 'how' argument, indexed from 0. */
static const char *sighow[] = {
	"BLOCK",
	"UNBLOCK",
	"SETMASK",
};
static DEFINE_STRARRAY(sighow);
582 
/* Names for the clock id argument of clock_gettime(), indexed from 0. */
static const char *clockid[] = {
	"REALTIME",
	"MONOTONIC",
	"PROCESS_CPUTIME_ID",
	"THREAD_CPUTIME_ID",
	"MONOTONIC_RAW",
	"REALTIME_COARSE",
	"MONOTONIC_COARSE",
};
static DEFINE_STRARRAY(clockid);
588 
/* Names of the socket address families, indexed from 0. */
static const char *socket_families[] = {
	/*  0 */ "UNSPEC", "LOCAL", "INET", "AX25",
	/*  4 */ "IPX", "APPLETALK", "NETROM", "BRIDGE",
	/*  8 */ "ATMPVC", "X25", "INET6", "ROSE",
	/* 12 */ "DECnet", "NETBEUI", "SECURITY", "KEY",
	/* 16 */ "NETLINK", "PACKET", "ASH", "ECONET",
	/* 20 */ "ATMSVC", "RDS", "SNA", "IRDA",
	/* 24 */ "PPPOX", "WANPIPE", "LLC", "IB",
	/* 28 */ "CAN", "TIPC", "BLUETOOTH", "IUCV",
	/* 32 */ "RXRPC", "ISDN", "PHONET", "IEEE802154",
	/* 36 */ "CAIF", "ALG", "NFC", "VSOCK",
};
static DEFINE_STRARRAY(socket_families);
598 
599 #ifndef SOCK_TYPE_MASK
600 #define SOCK_TYPE_MASK 0xf
601 #endif
602 
603 static size_t syscall_arg__scnprintf_socket_type(char *bf, size_t size,
604 						      struct syscall_arg *arg)
605 {
606 	size_t printed;
607 	int type = arg->val,
608 	    flags = type & ~SOCK_TYPE_MASK;
609 
610 	type &= SOCK_TYPE_MASK;
611 	/*
612  	 * Can't use a strarray, MIPS may override for ABI reasons.
613  	 */
614 	switch (type) {
615 #define	P_SK_TYPE(n) case SOCK_##n: printed = scnprintf(bf, size, #n); break;
616 	P_SK_TYPE(STREAM);
617 	P_SK_TYPE(DGRAM);
618 	P_SK_TYPE(RAW);
619 	P_SK_TYPE(RDM);
620 	P_SK_TYPE(SEQPACKET);
621 	P_SK_TYPE(DCCP);
622 	P_SK_TYPE(PACKET);
623 #undef P_SK_TYPE
624 	default:
625 		printed = scnprintf(bf, size, "%#x", type);
626 	}
627 
628 #define	P_SK_FLAG(n) \
629 	if (flags & SOCK_##n) { \
630 		printed += scnprintf(bf + printed, size - printed, "|%s", #n); \
631 		flags &= ~SOCK_##n; \
632 	}
633 
634 	P_SK_FLAG(CLOEXEC);
635 	P_SK_FLAG(NONBLOCK);
636 #undef P_SK_FLAG
637 
638 	if (flags)
639 		printed += scnprintf(bf + printed, size - printed, "|%#x", flags);
640 
641 	return printed;
642 }
643 
644 #define SCA_SK_TYPE syscall_arg__scnprintf_socket_type
645 
646 #ifndef MSG_PROBE
647 #define MSG_PROBE	     0x10
648 #endif
649 #ifndef MSG_WAITFORONE
650 #define MSG_WAITFORONE	0x10000
651 #endif
652 #ifndef MSG_SENDPAGE_NOTLAST
653 #define MSG_SENDPAGE_NOTLAST 0x20000
654 #endif
655 #ifndef MSG_FASTOPEN
656 #define MSG_FASTOPEN	     0x20000000
657 #endif
658 
659 static size_t syscall_arg__scnprintf_msg_flags(char *bf, size_t size,
660 					       struct syscall_arg *arg)
661 {
662 	int printed = 0, flags = arg->val;
663 
664 	if (flags == 0)
665 		return scnprintf(bf, size, "NONE");
666 #define	P_MSG_FLAG(n) \
667 	if (flags & MSG_##n) { \
668 		printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
669 		flags &= ~MSG_##n; \
670 	}
671 
672 	P_MSG_FLAG(OOB);
673 	P_MSG_FLAG(PEEK);
674 	P_MSG_FLAG(DONTROUTE);
675 	P_MSG_FLAG(TRYHARD);
676 	P_MSG_FLAG(CTRUNC);
677 	P_MSG_FLAG(PROBE);
678 	P_MSG_FLAG(TRUNC);
679 	P_MSG_FLAG(DONTWAIT);
680 	P_MSG_FLAG(EOR);
681 	P_MSG_FLAG(WAITALL);
682 	P_MSG_FLAG(FIN);
683 	P_MSG_FLAG(SYN);
684 	P_MSG_FLAG(CONFIRM);
685 	P_MSG_FLAG(RST);
686 	P_MSG_FLAG(ERRQUEUE);
687 	P_MSG_FLAG(NOSIGNAL);
688 	P_MSG_FLAG(MORE);
689 	P_MSG_FLAG(WAITFORONE);
690 	P_MSG_FLAG(SENDPAGE_NOTLAST);
691 	P_MSG_FLAG(FASTOPEN);
692 	P_MSG_FLAG(CMSG_CLOEXEC);
693 #undef P_MSG_FLAG
694 
695 	if (flags)
696 		printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);
697 
698 	return printed;
699 }
700 
701 #define SCA_MSG_FLAGS syscall_arg__scnprintf_msg_flags
702 
703 static size_t syscall_arg__scnprintf_access_mode(char *bf, size_t size,
704 						 struct syscall_arg *arg)
705 {
706 	size_t printed = 0;
707 	int mode = arg->val;
708 
709 	if (mode == F_OK) /* 0 */
710 		return scnprintf(bf, size, "F");
711 #define	P_MODE(n) \
712 	if (mode & n##_OK) { \
713 		printed += scnprintf(bf + printed, size - printed, "%s", #n); \
714 		mode &= ~n##_OK; \
715 	}
716 
717 	P_MODE(R);
718 	P_MODE(W);
719 	P_MODE(X);
720 #undef P_MODE
721 
722 	if (mode)
723 		printed += scnprintf(bf + printed, size - printed, "|%#x", mode);
724 
725 	return printed;
726 }
727 
728 #define SCA_ACCMODE syscall_arg__scnprintf_access_mode
729 
730 static size_t syscall_arg__scnprintf_open_flags(char *bf, size_t size,
731 					       struct syscall_arg *arg)
732 {
733 	int printed = 0, flags = arg->val;
734 
735 	if (!(flags & O_CREAT))
736 		arg->mask |= 1 << (arg->idx + 1); /* Mask the mode parm */
737 
738 	if (flags == 0)
739 		return scnprintf(bf, size, "RDONLY");
740 #define	P_FLAG(n) \
741 	if (flags & O_##n) { \
742 		printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
743 		flags &= ~O_##n; \
744 	}
745 
746 	P_FLAG(APPEND);
747 	P_FLAG(ASYNC);
748 	P_FLAG(CLOEXEC);
749 	P_FLAG(CREAT);
750 	P_FLAG(DIRECT);
751 	P_FLAG(DIRECTORY);
752 	P_FLAG(EXCL);
753 	P_FLAG(LARGEFILE);
754 	P_FLAG(NOATIME);
755 	P_FLAG(NOCTTY);
756 #ifdef O_NONBLOCK
757 	P_FLAG(NONBLOCK);
758 #elif O_NDELAY
759 	P_FLAG(NDELAY);
760 #endif
761 #ifdef O_PATH
762 	P_FLAG(PATH);
763 #endif
764 	P_FLAG(RDWR);
765 #ifdef O_DSYNC
766 	if ((flags & O_SYNC) == O_SYNC)
767 		printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", "SYNC");
768 	else {
769 		P_FLAG(DSYNC);
770 	}
771 #else
772 	P_FLAG(SYNC);
773 #endif
774 	P_FLAG(TRUNC);
775 	P_FLAG(WRONLY);
776 #undef P_FLAG
777 
778 	if (flags)
779 		printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);
780 
781 	return printed;
782 }
783 
784 #define SCA_OPEN_FLAGS syscall_arg__scnprintf_open_flags
785 
786 static size_t syscall_arg__scnprintf_eventfd_flags(char *bf, size_t size,
787 						   struct syscall_arg *arg)
788 {
789 	int printed = 0, flags = arg->val;
790 
791 	if (flags == 0)
792 		return scnprintf(bf, size, "NONE");
793 #define	P_FLAG(n) \
794 	if (flags & EFD_##n) { \
795 		printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
796 		flags &= ~EFD_##n; \
797 	}
798 
799 	P_FLAG(SEMAPHORE);
800 	P_FLAG(CLOEXEC);
801 	P_FLAG(NONBLOCK);
802 #undef P_FLAG
803 
804 	if (flags)
805 		printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);
806 
807 	return printed;
808 }
809 
810 #define SCA_EFD_FLAGS syscall_arg__scnprintf_eventfd_flags
811 
812 static size_t syscall_arg__scnprintf_pipe_flags(char *bf, size_t size,
813 						struct syscall_arg *arg)
814 {
815 	int printed = 0, flags = arg->val;
816 
817 #define	P_FLAG(n) \
818 	if (flags & O_##n) { \
819 		printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
820 		flags &= ~O_##n; \
821 	}
822 
823 	P_FLAG(CLOEXEC);
824 	P_FLAG(NONBLOCK);
825 #undef P_FLAG
826 
827 	if (flags)
828 		printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);
829 
830 	return printed;
831 }
832 
833 #define SCA_PIPE_FLAGS syscall_arg__scnprintf_pipe_flags
834 
835 static size_t syscall_arg__scnprintf_signum(char *bf, size_t size, struct syscall_arg *arg)
836 {
837 	int sig = arg->val;
838 
839 	switch (sig) {
840 #define	P_SIGNUM(n) case SIG##n: return scnprintf(bf, size, #n)
841 	P_SIGNUM(HUP);
842 	P_SIGNUM(INT);
843 	P_SIGNUM(QUIT);
844 	P_SIGNUM(ILL);
845 	P_SIGNUM(TRAP);
846 	P_SIGNUM(ABRT);
847 	P_SIGNUM(BUS);
848 	P_SIGNUM(FPE);
849 	P_SIGNUM(KILL);
850 	P_SIGNUM(USR1);
851 	P_SIGNUM(SEGV);
852 	P_SIGNUM(USR2);
853 	P_SIGNUM(PIPE);
854 	P_SIGNUM(ALRM);
855 	P_SIGNUM(TERM);
856 	P_SIGNUM(CHLD);
857 	P_SIGNUM(CONT);
858 	P_SIGNUM(STOP);
859 	P_SIGNUM(TSTP);
860 	P_SIGNUM(TTIN);
861 	P_SIGNUM(TTOU);
862 	P_SIGNUM(URG);
863 	P_SIGNUM(XCPU);
864 	P_SIGNUM(XFSZ);
865 	P_SIGNUM(VTALRM);
866 	P_SIGNUM(PROF);
867 	P_SIGNUM(WINCH);
868 	P_SIGNUM(IO);
869 	P_SIGNUM(PWR);
870 	P_SIGNUM(SYS);
871 #ifdef SIGEMT
872 	P_SIGNUM(EMT);
873 #endif
874 #ifdef SIGSTKFLT
875 	P_SIGNUM(STKFLT);
876 #endif
877 #ifdef SIGSWI
878 	P_SIGNUM(SWI);
879 #endif
880 	default: break;
881 	}
882 
883 	return scnprintf(bf, size, "%#x", sig);
884 }
885 
886 #define SCA_SIGNUM syscall_arg__scnprintf_signum
887 
888 #if defined(__i386__) || defined(__x86_64__)
889 /*
890  * FIXME: Make this available to all arches.
891  */
892 #define TCGETS		0x5401
893 
894 static const char *tioctls[] = {
895 	"TCGETS", "TCSETS", "TCSETSW", "TCSETSF", "TCGETA", "TCSETA", "TCSETAW",
896 	"TCSETAF", "TCSBRK", "TCXONC", "TCFLSH", "TIOCEXCL", "TIOCNXCL",
897 	"TIOCSCTTY", "TIOCGPGRP", "TIOCSPGRP", "TIOCOUTQ", "TIOCSTI",
898 	"TIOCGWINSZ", "TIOCSWINSZ", "TIOCMGET", "TIOCMBIS", "TIOCMBIC",
899 	"TIOCMSET", "TIOCGSOFTCAR", "TIOCSSOFTCAR", "FIONREAD", "TIOCLINUX",
900 	"TIOCCONS", "TIOCGSERIAL", "TIOCSSERIAL", "TIOCPKT", "FIONBIO",
901 	"TIOCNOTTY", "TIOCSETD", "TIOCGETD", "TCSBRKP", [0x27] = "TIOCSBRK",
902 	"TIOCCBRK", "TIOCGSID", "TCGETS2", "TCSETS2", "TCSETSW2", "TCSETSF2",
903 	"TIOCGRS485", "TIOCSRS485", "TIOCGPTN", "TIOCSPTLCK",
904 	"TIOCGDEV||TCGETX", "TCSETX", "TCSETXF", "TCSETXW", "TIOCSIG",
905 	"TIOCVHANGUP", "TIOCGPKT", "TIOCGPTLCK", "TIOCGEXCL",
906 	[0x50] = "FIONCLEX", "FIOCLEX", "FIOASYNC", "TIOCSERCONFIG",
907 	"TIOCSERGWILD", "TIOCSERSWILD", "TIOCGLCKTRMIOS", "TIOCSLCKTRMIOS",
908 	"TIOCSERGSTRUCT", "TIOCSERGETLSR", "TIOCSERGETMULTI", "TIOCSERSETMULTI",
909 	"TIOCMIWAIT", "TIOCGICOUNT", [0x60] = "FIOQSIZE",
910 };
911 
912 static DEFINE_STRARRAY_OFFSET(tioctls, 0x5401);
913 #endif /* defined(__i386__) || defined(__x86_64__) */
914 
/*
 * Initializer fragment for a struct syscall_fmt entry: format argument
 * number 'arg' through the strarray built from 'array'.  'name' does not
 * appear in the expansion, it only documents which argument this is at the
 * point of use.
 */
#define STRARRAY(arg, name, array) \
	  .arg_scnprintf = { [arg] = SCA_STRARRAY, }, \
	  .arg_parm	 = { [arg] = &strarray__##array, }
918 
919 static struct syscall_fmt {
920 	const char *name;
921 	const char *alias;
922 	size_t	   (*arg_scnprintf[6])(char *bf, size_t size, struct syscall_arg *arg);
923 	void	   *arg_parm[6];
924 	bool	   errmsg;
925 	bool	   timeout;
926 	bool	   hexret;
927 } syscall_fmts[] = {
928 	{ .name	    = "access",	    .errmsg = true,
929 	  .arg_scnprintf = { [1] = SCA_ACCMODE, /* mode */ }, },
930 	{ .name	    = "arch_prctl", .errmsg = true, .alias = "prctl", },
931 	{ .name	    = "brk",	    .hexret = true,
932 	  .arg_scnprintf = { [0] = SCA_HEX, /* brk */ }, },
933 	{ .name     = "clock_gettime",  .errmsg = true, STRARRAY(0, clk_id, clockid), },
934 	{ .name	    = "close",	    .errmsg = true,
935 	  .arg_scnprintf = { [0] = SCA_CLOSE_FD, /* fd */ }, },
936 	{ .name	    = "connect",    .errmsg = true, },
937 	{ .name	    = "dup",	    .errmsg = true,
938 	  .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
939 	{ .name	    = "dup2",	    .errmsg = true,
940 	  .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
941 	{ .name	    = "dup3",	    .errmsg = true,
942 	  .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
943 	{ .name	    = "epoll_ctl",  .errmsg = true, STRARRAY(1, op, epoll_ctl_ops), },
944 	{ .name	    = "eventfd2",   .errmsg = true,
945 	  .arg_scnprintf = { [1] = SCA_EFD_FLAGS, /* flags */ }, },
946 	{ .name	    = "faccessat",  .errmsg = true,
947 	  .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
948 	{ .name	    = "fadvise64",  .errmsg = true,
949 	  .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
950 	{ .name	    = "fallocate",  .errmsg = true,
951 	  .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
952 	{ .name	    = "fchdir",	    .errmsg = true,
953 	  .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
954 	{ .name	    = "fchmod",	    .errmsg = true,
955 	  .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
956 	{ .name	    = "fchmodat",   .errmsg = true,
957 	  .arg_scnprintf = { [0] = SCA_FDAT, /* fd */ }, },
958 	{ .name	    = "fchown",	    .errmsg = true,
959 	  .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
960 	{ .name	    = "fchownat",   .errmsg = true,
961 	  .arg_scnprintf = { [0] = SCA_FDAT, /* fd */ }, },
962 	{ .name	    = "fcntl",	    .errmsg = true,
963 	  .arg_scnprintf = { [0] = SCA_FD, /* fd */
964 			     [1] = SCA_STRARRAY, /* cmd */ },
965 	  .arg_parm	 = { [1] = &strarray__fcntl_cmds, /* cmd */ }, },
966 	{ .name	    = "fdatasync",  .errmsg = true,
967 	  .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
968 	{ .name	    = "flock",	    .errmsg = true,
969 	  .arg_scnprintf = { [0] = SCA_FD, /* fd */
970 			     [1] = SCA_FLOCK, /* cmd */ }, },
971 	{ .name	    = "fsetxattr",  .errmsg = true,
972 	  .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
973 	{ .name	    = "fstat",	    .errmsg = true, .alias = "newfstat",
974 	  .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
975 	{ .name	    = "fstatat",    .errmsg = true, .alias = "newfstatat",
976 	  .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
977 	{ .name	    = "fstatfs",    .errmsg = true,
978 	  .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
979 	{ .name	    = "fsync",    .errmsg = true,
980 	  .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
981 	{ .name	    = "ftruncate", .errmsg = true,
982 	  .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
983 	{ .name	    = "futex",	    .errmsg = true,
984 	  .arg_scnprintf = { [1] = SCA_FUTEX_OP, /* op */ }, },
985 	{ .name	    = "futimesat", .errmsg = true,
986 	  .arg_scnprintf = { [0] = SCA_FDAT, /* fd */ }, },
987 	{ .name	    = "getdents",   .errmsg = true,
988 	  .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
989 	{ .name	    = "getdents64", .errmsg = true,
990 	  .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
991 	{ .name	    = "getitimer",  .errmsg = true, STRARRAY(0, which, itimers), },
992 	{ .name	    = "getrlimit",  .errmsg = true, STRARRAY(0, resource, rlimit_resources), },
993 	{ .name	    = "ioctl",	    .errmsg = true,
994 	  .arg_scnprintf = { [0] = SCA_FD, /* fd */
995 #if defined(__i386__) || defined(__x86_64__)
996 /*
997  * FIXME: Make this available to all arches.
998  */
999 			     [1] = SCA_STRHEXARRAY, /* cmd */
1000 			     [2] = SCA_HEX, /* arg */ },
1001 	  .arg_parm	 = { [1] = &strarray__tioctls, /* cmd */ }, },
1002 #else
1003 			     [2] = SCA_HEX, /* arg */ }, },
1004 #endif
1005 	{ .name	    = "kill",	    .errmsg = true,
1006 	  .arg_scnprintf = { [1] = SCA_SIGNUM, /* sig */ }, },
1007 	{ .name	    = "linkat",	    .errmsg = true,
1008 	  .arg_scnprintf = { [0] = SCA_FDAT, /* fd */ }, },
1009 	{ .name	    = "lseek",	    .errmsg = true,
1010 	  .arg_scnprintf = { [0] = SCA_FD, /* fd */
1011 			     [2] = SCA_STRARRAY, /* whence */ },
1012 	  .arg_parm	 = { [2] = &strarray__whences, /* whence */ }, },
1013 	{ .name	    = "lstat",	    .errmsg = true, .alias = "newlstat", },
1014 	{ .name     = "madvise",    .errmsg = true,
1015 	  .arg_scnprintf = { [0] = SCA_HEX,	 /* start */
1016 			     [2] = SCA_MADV_BHV, /* behavior */ }, },
1017 	{ .name	    = "mkdirat",    .errmsg = true,
1018 	  .arg_scnprintf = { [0] = SCA_FDAT, /* fd */ }, },
1019 	{ .name	    = "mknodat",    .errmsg = true,
1020 	  .arg_scnprintf = { [0] = SCA_FDAT, /* fd */ }, },
1021 	{ .name	    = "mlock",	    .errmsg = true,
1022 	  .arg_scnprintf = { [0] = SCA_HEX, /* addr */ }, },
1023 	{ .name	    = "mlockall",   .errmsg = true,
1024 	  .arg_scnprintf = { [0] = SCA_HEX, /* addr */ }, },
1025 	{ .name	    = "mmap",	    .hexret = true,
1026 	  .arg_scnprintf = { [0] = SCA_HEX,	  /* addr */
1027 			     [2] = SCA_MMAP_PROT, /* prot */
1028 			     [3] = SCA_MMAP_FLAGS, /* flags */
1029 			     [4] = SCA_FD, 	  /* fd */ }, },
1030 	{ .name	    = "mprotect",   .errmsg = true,
1031 	  .arg_scnprintf = { [0] = SCA_HEX, /* start */
1032 			     [2] = SCA_MMAP_PROT, /* prot */ }, },
1033 	{ .name	    = "mremap",	    .hexret = true,
1034 	  .arg_scnprintf = { [0] = SCA_HEX, /* addr */
1035 			     [3] = SCA_MREMAP_FLAGS, /* flags */
1036 			     [4] = SCA_HEX, /* new_addr */ }, },
1037 	{ .name	    = "munlock",    .errmsg = true,
1038 	  .arg_scnprintf = { [0] = SCA_HEX, /* addr */ }, },
1039 	{ .name	    = "munmap",	    .errmsg = true,
1040 	  .arg_scnprintf = { [0] = SCA_HEX, /* addr */ }, },
1041 	{ .name	    = "name_to_handle_at", .errmsg = true,
1042 	  .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
1043 	{ .name	    = "newfstatat", .errmsg = true,
1044 	  .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
1045 	{ .name	    = "open",	    .errmsg = true,
1046 	  .arg_scnprintf = { [1] = SCA_OPEN_FLAGS, /* flags */ }, },
1047 	{ .name	    = "open_by_handle_at", .errmsg = true,
1048 	  .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */
1049 			     [2] = SCA_OPEN_FLAGS, /* flags */ }, },
1050 	{ .name	    = "openat",	    .errmsg = true,
1051 	  .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */
1052 			     [2] = SCA_OPEN_FLAGS, /* flags */ }, },
1053 	{ .name	    = "pipe2",	    .errmsg = true,
1054 	  .arg_scnprintf = { [1] = SCA_PIPE_FLAGS, /* flags */ }, },
1055 	{ .name	    = "poll",	    .errmsg = true, .timeout = true, },
1056 	{ .name	    = "ppoll",	    .errmsg = true, .timeout = true, },
1057 	{ .name	    = "pread",	    .errmsg = true, .alias = "pread64",
1058 	  .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1059 	{ .name	    = "preadv",	    .errmsg = true, .alias = "pread",
1060 	  .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1061 	{ .name	    = "prlimit64",  .errmsg = true, STRARRAY(1, resource, rlimit_resources), },
1062 	{ .name	    = "pwrite",	    .errmsg = true, .alias = "pwrite64",
1063 	  .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1064 	{ .name	    = "pwritev",    .errmsg = true,
1065 	  .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1066 	{ .name	    = "read",	    .errmsg = true,
1067 	  .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1068 	{ .name	    = "readlinkat", .errmsg = true,
1069 	  .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
1070 	{ .name	    = "readv",	    .errmsg = true,
1071 	  .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1072 	{ .name	    = "recvfrom",   .errmsg = true,
1073 	  .arg_scnprintf = { [3] = SCA_MSG_FLAGS, /* flags */ }, },
1074 	{ .name	    = "recvmmsg",   .errmsg = true,
1075 	  .arg_scnprintf = { [3] = SCA_MSG_FLAGS, /* flags */ }, },
1076 	{ .name	    = "recvmsg",    .errmsg = true,
1077 	  .arg_scnprintf = { [2] = SCA_MSG_FLAGS, /* flags */ }, },
1078 	{ .name	    = "renameat",   .errmsg = true,
1079 	  .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
1080 	{ .name	    = "rt_sigaction", .errmsg = true,
1081 	  .arg_scnprintf = { [0] = SCA_SIGNUM, /* sig */ }, },
1082 	{ .name	    = "rt_sigprocmask",  .errmsg = true, STRARRAY(0, how, sighow), },
1083 	{ .name	    = "rt_sigqueueinfo", .errmsg = true,
1084 	  .arg_scnprintf = { [1] = SCA_SIGNUM, /* sig */ }, },
1085 	{ .name	    = "rt_tgsigqueueinfo", .errmsg = true,
1086 	  .arg_scnprintf = { [2] = SCA_SIGNUM, /* sig */ }, },
1087 	{ .name	    = "select",	    .errmsg = true, .timeout = true, },
1088 	{ .name	    = "sendmmsg",    .errmsg = true,
1089 	  .arg_scnprintf = { [3] = SCA_MSG_FLAGS, /* flags */ }, },
1090 	{ .name	    = "sendmsg",    .errmsg = true,
1091 	  .arg_scnprintf = { [2] = SCA_MSG_FLAGS, /* flags */ }, },
1092 	{ .name	    = "sendto",	    .errmsg = true,
1093 	  .arg_scnprintf = { [3] = SCA_MSG_FLAGS, /* flags */ }, },
1094 	{ .name	    = "setitimer",  .errmsg = true, STRARRAY(0, which, itimers), },
1095 	{ .name	    = "setrlimit",  .errmsg = true, STRARRAY(0, resource, rlimit_resources), },
1096 	{ .name	    = "shutdown",   .errmsg = true,
1097 	  .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1098 	{ .name	    = "socket",	    .errmsg = true,
1099 	  .arg_scnprintf = { [0] = SCA_STRARRAY, /* family */
1100 			     [1] = SCA_SK_TYPE, /* type */ },
1101 	  .arg_parm	 = { [0] = &strarray__socket_families, /* family */ }, },
1102 	{ .name	    = "socketpair", .errmsg = true,
1103 	  .arg_scnprintf = { [0] = SCA_STRARRAY, /* family */
1104 			     [1] = SCA_SK_TYPE, /* type */ },
1105 	  .arg_parm	 = { [0] = &strarray__socket_families, /* family */ }, },
1106 	{ .name	    = "stat",	    .errmsg = true, .alias = "newstat", },
1107 	{ .name	    = "symlinkat",  .errmsg = true,
1108 	  .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
1109 	{ .name	    = "tgkill",	    .errmsg = true,
1110 	  .arg_scnprintf = { [2] = SCA_SIGNUM, /* sig */ }, },
1111 	{ .name	    = "tkill",	    .errmsg = true,
1112 	  .arg_scnprintf = { [1] = SCA_SIGNUM, /* sig */ }, },
1113 	{ .name	    = "uname",	    .errmsg = true, .alias = "newuname", },
1114 	{ .name	    = "unlinkat",   .errmsg = true,
1115 	  .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
1116 	{ .name	    = "utimensat",  .errmsg = true,
1117 	  .arg_scnprintf = { [0] = SCA_FDAT, /* dirfd */ }, },
1118 	{ .name	    = "write",	    .errmsg = true,
1119 	  .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1120 	{ .name	    = "writev",	    .errmsg = true,
1121 	  .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1122 };
1123 
1124 static int syscall_fmt__cmp(const void *name, const void *fmtp)
1125 {
1126 	const struct syscall_fmt *fmt = fmtp;
1127 	return strcmp(name, fmt->name);
1128 }
1129 
1130 static struct syscall_fmt *syscall_fmt__find(const char *name)
1131 {
1132 	const int nmemb = ARRAY_SIZE(syscall_fmts);
1133 	return bsearch(name, syscall_fmts, nmemb, sizeof(struct syscall_fmt), syscall_fmt__cmp);
1134 }
1135 
/*
 * Per-syscall information, stored in trace->syscalls.table[] indexed by
 * syscall id and filled in lazily by trace__read_syscall_info().
 *
 * tp_format:     format of the "syscalls:sys_enter_<name>" tracepoint
 *                (or of its alias), as returned by trace_event__tp_format()
 * nr_args:       number of argument fields, not counting a leading "nr" field
 * args:          first argument field of tp_format (after any "nr" field)
 * name:          syscall name as returned by audit_syscall_to_name()
 * filtered:      set when the event qualifier excludes this syscall
 * is_exit:       set for "exit" and "exit_group"
 * fmt:           matching syscall_fmts[] entry, NULL if there is none
 * arg_scnprintf: per-argument formatter array allocated by
 *                syscall__set_arg_fmts(); a NULL slot means "print as %ld"
 * arg_parm:      per-argument formatter parameters, taken from fmt->arg_parm
 */
struct syscall {
	struct event_format *tp_format;
	int		    nr_args;
	struct format_field *args;
	const char	    *name;
	bool		    filtered;
	bool		    is_exit;
	struct syscall_fmt  *fmt;
	size_t		    (**arg_scnprintf)(char *bf, size_t size, struct syscall_arg *arg);
	void		    **arg_parm;
};
1147 
1148 static size_t fprintf_duration(unsigned long t, FILE *fp)
1149 {
1150 	double duration = (double)t / NSEC_PER_MSEC;
1151 	size_t printed = fprintf(fp, "(");
1152 
1153 	if (duration >= 1.0)
1154 		printed += color_fprintf(fp, PERF_COLOR_RED, "%6.3f ms", duration);
1155 	else if (duration >= 0.01)
1156 		printed += color_fprintf(fp, PERF_COLOR_YELLOW, "%6.3f ms", duration);
1157 	else
1158 		printed += color_fprintf(fp, PERF_COLOR_NORMAL, "%6.3f ms", duration);
1159 	return printed + fprintf(fp, "): ");
1160 }
1161 
/*
 * Per-thread tracing state, attached to a struct thread through
 * thread__set_priv() (see thread__trace()).
 *
 * entry_time:    timestamp of the last sys_enter seen for this thread
 * exit_time:     timestamp of the last sys_exit seen for this thread
 * entry_pending: a sys_enter line was formatted into entry_str but its
 *                sys_exit has not been printed yet
 * nr_events:     bumped on every thread__trace() lookup for this thread
 * pfmaj, pfmin:  major / minor page fault counters
 * entry_str:     1024-byte buffer holding the formatted "name(args" text
 * runtime_ms:    runtime accumulated from sched_stat_runtime events
 * paths:         fd -> pathname cache; table has max + 1 slots and
 *                max == -1 means the table is empty
 * syscall_stats: intlist keyed by syscall id; each node's priv points to a
 *                struct stats of durations
 */
struct thread_trace {
	u64		  entry_time;
	u64		  exit_time;
	bool		  entry_pending;
	unsigned long	  nr_events;
	unsigned long	  pfmaj, pfmin;
	char		  *entry_str;
	double		  runtime_ms;
	struct {
		int	  max;
		char	  **table;
	} paths;

	struct intlist *syscall_stats;
};
1177 
1178 static struct thread_trace *thread_trace__new(void)
1179 {
1180 	struct thread_trace *ttrace =  zalloc(sizeof(struct thread_trace));
1181 
1182 	if (ttrace)
1183 		ttrace->paths.max = -1;
1184 
1185 	ttrace->syscall_stats = intlist__new(NULL);
1186 
1187 	return ttrace;
1188 }
1189 
1190 static struct thread_trace *thread__trace(struct thread *thread, FILE *fp)
1191 {
1192 	struct thread_trace *ttrace;
1193 
1194 	if (thread == NULL)
1195 		goto fail;
1196 
1197 	if (thread__priv(thread) == NULL)
1198 		thread__set_priv(thread, thread_trace__new());
1199 
1200 	if (thread__priv(thread) == NULL)
1201 		goto fail;
1202 
1203 	ttrace = thread__priv(thread);
1204 	++ttrace->nr_events;
1205 
1206 	return ttrace;
1207 fail:
1208 	color_fprintf(fp, PERF_COLOR_RED,
1209 		      "WARNING: not enough memory, dropping samples!\n");
1210 	return NULL;
1211 }
1212 
/* Bit flags for trace->trace_pgfaults: trace major and/or minor page faults. */
#define TRACE_PFMAJ		(1 << 0)
#define TRACE_PFMIN		(1 << 1)
1215 
/*
 * Global state of one 'perf trace' session.
 *
 * tool:             embedded perf_tool; callbacks recover the enclosing
 *                   struct trace from it with container_of()
 * audit.machine:    machine id handed to audit_syscall_to_name()
 * audit.open_id:    syscall id compared against sys_exit ids to attach the
 *                   last vfs_getname pathname to the returned fd
 * syscalls:         table of struct syscall indexed by syscall id, grown on
 *                   demand; max is the highest valid index
 * host:             machine created by machine__new_host()
 * current:          thread of the most recently processed sys_enter
 * base_time:        subtracted from timestamps before printing; taken from
 *                   the first sample unless full_time is set
 * output:           stream all trace output is written to
 * nr_events:        number of samples dispatched to a handler
 * ev_qualifier:     list of syscall names to select; not_ev_qualifier
 *                   inverts the selection
 * last_vfs_getname: pathname pointer saved by trace__vfs_getname()
 * tid_list/pid_list: ids parsed from the target strings, used by
 *                   skip_sample()
 * duration_filter:  threshold in milliseconds; shorter syscalls are not
 *                   printed
 * runtime_ms:       total runtime accumulated from sched_stat_runtime
 * stats:            counters of fd pathnames obtained via vfs_getname and
 *                   via /proc
 * live:             when false, fd pathnames are not looked up in /proc
 * multiple_threads: prefix lines with the tid (and comm if show_comm)
 * summary:          collect per-thread syscall statistics
 * summary_only:     suppress per-event output
 * trace_syscalls:   syscall tracing is enabled
 * trace_pgfaults:   mask of TRACE_PFMAJ / TRACE_PFMIN
 *
 * The remaining members (opts, evlist, filter_pids, sched, show_tool_stats,
 * force) are used by code outside this part of the file.
 */
struct trace {
	struct perf_tool	tool;
	struct {
		int		machine;
		int		open_id;
	}			audit;
	struct {
		int		max;
		struct syscall  *table;
	} syscalls;
	struct record_opts	opts;
	struct perf_evlist	*evlist;
	struct machine		*host;
	struct thread		*current;
	u64			base_time;
	FILE			*output;
	unsigned long		nr_events;
	struct strlist		*ev_qualifier;
	const char 		*last_vfs_getname;
	struct intlist		*tid_list;
	struct intlist		*pid_list;
	struct {
		size_t		nr;
		pid_t		*entries;
	}			filter_pids;
	double			duration_filter;
	double			runtime_ms;
	struct {
		u64		vfs_getname,
				proc_getname;
	} stats;
	bool			not_ev_qualifier;
	bool			live;
	bool			full_time;
	bool			sched;
	bool			multiple_threads;
	bool			summary;
	bool			summary_only;
	bool			show_comm;
	bool			show_tool_stats;
	bool			trace_syscalls;
	bool			force;
	int			trace_pgfaults;
};
1260 
1261 static int trace__set_fd_pathname(struct thread *thread, int fd, const char *pathname)
1262 {
1263 	struct thread_trace *ttrace = thread__priv(thread);
1264 
1265 	if (fd > ttrace->paths.max) {
1266 		char **npath = realloc(ttrace->paths.table, (fd + 1) * sizeof(char *));
1267 
1268 		if (npath == NULL)
1269 			return -1;
1270 
1271 		if (ttrace->paths.max != -1) {
1272 			memset(npath + ttrace->paths.max + 1, 0,
1273 			       (fd - ttrace->paths.max) * sizeof(char *));
1274 		} else {
1275 			memset(npath, 0, (fd + 1) * sizeof(char *));
1276 		}
1277 
1278 		ttrace->paths.table = npath;
1279 		ttrace->paths.max   = fd;
1280 	}
1281 
1282 	ttrace->paths.table[fd] = strdup(pathname);
1283 
1284 	return ttrace->paths.table[fd] != NULL ? 0 : -1;
1285 }
1286 
1287 static int thread__read_fd_path(struct thread *thread, int fd)
1288 {
1289 	char linkname[PATH_MAX], pathname[PATH_MAX];
1290 	struct stat st;
1291 	int ret;
1292 
1293 	if (thread->pid_ == thread->tid) {
1294 		scnprintf(linkname, sizeof(linkname),
1295 			  "/proc/%d/fd/%d", thread->pid_, fd);
1296 	} else {
1297 		scnprintf(linkname, sizeof(linkname),
1298 			  "/proc/%d/task/%d/fd/%d", thread->pid_, thread->tid, fd);
1299 	}
1300 
1301 	if (lstat(linkname, &st) < 0 || st.st_size + 1 > (off_t)sizeof(pathname))
1302 		return -1;
1303 
1304 	ret = readlink(linkname, pathname, sizeof(pathname));
1305 
1306 	if (ret < 0 || ret > st.st_size)
1307 		return -1;
1308 
1309 	pathname[ret] = '\0';
1310 	return trace__set_fd_pathname(thread, fd, pathname);
1311 }
1312 
1313 static const char *thread__fd_path(struct thread *thread, int fd,
1314 				   struct trace *trace)
1315 {
1316 	struct thread_trace *ttrace = thread__priv(thread);
1317 
1318 	if (ttrace == NULL)
1319 		return NULL;
1320 
1321 	if (fd < 0)
1322 		return NULL;
1323 
1324 	if ((fd > ttrace->paths.max || ttrace->paths.table[fd] == NULL)) {
1325 		if (!trace->live)
1326 			return NULL;
1327 		++trace->stats.proc_getname;
1328 		if (thread__read_fd_path(thread, fd))
1329 			return NULL;
1330 	}
1331 
1332 	return ttrace->paths.table[fd];
1333 }
1334 
1335 static size_t syscall_arg__scnprintf_fd(char *bf, size_t size,
1336 					struct syscall_arg *arg)
1337 {
1338 	int fd = arg->val;
1339 	size_t printed = scnprintf(bf, size, "%d", fd);
1340 	const char *path = thread__fd_path(arg->thread, fd, arg->trace);
1341 
1342 	if (path)
1343 		printed += scnprintf(bf + printed, size - printed, "<%s>", path);
1344 
1345 	return printed;
1346 }
1347 
1348 static size_t syscall_arg__scnprintf_close_fd(char *bf, size_t size,
1349 					      struct syscall_arg *arg)
1350 {
1351 	int fd = arg->val;
1352 	size_t printed = syscall_arg__scnprintf_fd(bf, size, arg);
1353 	struct thread_trace *ttrace = thread__priv(arg->thread);
1354 
1355 	if (ttrace && fd >= 0 && fd <= ttrace->paths.max)
1356 		zfree(&ttrace->paths.table[fd]);
1357 
1358 	return printed;
1359 }
1360 
1361 static bool trace__filter_duration(struct trace *trace, double t)
1362 {
1363 	return t < (trace->duration_filter * NSEC_PER_MSEC);
1364 }
1365 
1366 static size_t trace__fprintf_tstamp(struct trace *trace, u64 tstamp, FILE *fp)
1367 {
1368 	double ts = (double)(tstamp - trace->base_time) / NSEC_PER_MSEC;
1369 
1370 	return fprintf(fp, "%10.3f ", ts);
1371 }
1372 
/* Set by sig_handler(): a signal asked the session to stop. */
static bool done = false;
/* Set by sig_handler(): the signal that stopped the session was SIGINT. */
static bool interrupted = false;

/*
 * Signal handler: flag the session as finished and record whether the
 * signal was SIGINT.
 */
static void sig_handler(int sig)
{
	interrupted = (sig == SIGINT);
	done = true;
}
1381 
1382 static size_t trace__fprintf_entry_head(struct trace *trace, struct thread *thread,
1383 					u64 duration, u64 tstamp, FILE *fp)
1384 {
1385 	size_t printed = trace__fprintf_tstamp(trace, tstamp, fp);
1386 	printed += fprintf_duration(duration, fp);
1387 
1388 	if (trace->multiple_threads) {
1389 		if (trace->show_comm)
1390 			printed += fprintf(fp, "%.14s/", thread__comm_str(thread));
1391 		printed += fprintf(fp, "%d ", thread->tid);
1392 	}
1393 
1394 	return printed;
1395 }
1396 
1397 static int trace__process_event(struct trace *trace, struct machine *machine,
1398 				union perf_event *event, struct perf_sample *sample)
1399 {
1400 	int ret = 0;
1401 
1402 	switch (event->header.type) {
1403 	case PERF_RECORD_LOST:
1404 		color_fprintf(trace->output, PERF_COLOR_RED,
1405 			      "LOST %" PRIu64 " events!\n", event->lost.lost);
1406 		ret = machine__process_lost_event(machine, event, sample);
1407 	default:
1408 		ret = machine__process_event(machine, event, sample);
1409 		break;
1410 	}
1411 
1412 	return ret;
1413 }
1414 
1415 static int trace__tool_process(struct perf_tool *tool,
1416 			       union perf_event *event,
1417 			       struct perf_sample *sample,
1418 			       struct machine *machine)
1419 {
1420 	struct trace *trace = container_of(tool, struct trace, tool);
1421 	return trace__process_event(trace, machine, event, sample);
1422 }
1423 
1424 static int trace__symbols_init(struct trace *trace, struct perf_evlist *evlist)
1425 {
1426 	int err = symbol__init(NULL);
1427 
1428 	if (err)
1429 		return err;
1430 
1431 	trace->host = machine__new_host();
1432 	if (trace->host == NULL)
1433 		return -ENOMEM;
1434 
1435 	err = __machine__synthesize_threads(trace->host, &trace->tool, &trace->opts.target,
1436 					    evlist->threads, trace__tool_process, false);
1437 	if (err)
1438 		symbol__exit();
1439 
1440 	return err;
1441 }
1442 
1443 static int syscall__set_arg_fmts(struct syscall *sc)
1444 {
1445 	struct format_field *field;
1446 	int idx = 0;
1447 
1448 	sc->arg_scnprintf = calloc(sc->nr_args, sizeof(void *));
1449 	if (sc->arg_scnprintf == NULL)
1450 		return -1;
1451 
1452 	if (sc->fmt)
1453 		sc->arg_parm = sc->fmt->arg_parm;
1454 
1455 	for (field = sc->args; field; field = field->next) {
1456 		if (sc->fmt && sc->fmt->arg_scnprintf[idx])
1457 			sc->arg_scnprintf[idx] = sc->fmt->arg_scnprintf[idx];
1458 		else if (field->flags & FIELD_IS_POINTER)
1459 			sc->arg_scnprintf[idx] = syscall_arg__scnprintf_hex;
1460 		++idx;
1461 	}
1462 
1463 	return 0;
1464 }
1465 
1466 static int trace__read_syscall_info(struct trace *trace, int id)
1467 {
1468 	char tp_name[128];
1469 	struct syscall *sc;
1470 	const char *name = audit_syscall_to_name(id, trace->audit.machine);
1471 
1472 	if (name == NULL)
1473 		return -1;
1474 
1475 	if (id > trace->syscalls.max) {
1476 		struct syscall *nsyscalls = realloc(trace->syscalls.table, (id + 1) * sizeof(*sc));
1477 
1478 		if (nsyscalls == NULL)
1479 			return -1;
1480 
1481 		if (trace->syscalls.max != -1) {
1482 			memset(nsyscalls + trace->syscalls.max + 1, 0,
1483 			       (id - trace->syscalls.max) * sizeof(*sc));
1484 		} else {
1485 			memset(nsyscalls, 0, (id + 1) * sizeof(*sc));
1486 		}
1487 
1488 		trace->syscalls.table = nsyscalls;
1489 		trace->syscalls.max   = id;
1490 	}
1491 
1492 	sc = trace->syscalls.table + id;
1493 	sc->name = name;
1494 
1495 	if (trace->ev_qualifier) {
1496 		bool in = strlist__find(trace->ev_qualifier, name) != NULL;
1497 
1498 		if (!(in ^ trace->not_ev_qualifier)) {
1499 			sc->filtered = true;
1500 			/*
1501 			 * No need to do read tracepoint information since this will be
1502 			 * filtered out.
1503 			 */
1504 			return 0;
1505 		}
1506 	}
1507 
1508 	sc->fmt  = syscall_fmt__find(sc->name);
1509 
1510 	snprintf(tp_name, sizeof(tp_name), "sys_enter_%s", sc->name);
1511 	sc->tp_format = trace_event__tp_format("syscalls", tp_name);
1512 
1513 	if (sc->tp_format == NULL && sc->fmt && sc->fmt->alias) {
1514 		snprintf(tp_name, sizeof(tp_name), "sys_enter_%s", sc->fmt->alias);
1515 		sc->tp_format = trace_event__tp_format("syscalls", tp_name);
1516 	}
1517 
1518 	if (sc->tp_format == NULL)
1519 		return -1;
1520 
1521 	sc->args = sc->tp_format->format.fields;
1522 	sc->nr_args = sc->tp_format->format.nr_fields;
1523 	/* drop nr field - not relevant here; does not exist on older kernels */
1524 	if (sc->args && strcmp(sc->args->name, "nr") == 0) {
1525 		sc->args = sc->args->next;
1526 		--sc->nr_args;
1527 	}
1528 
1529 	sc->is_exit = !strcmp(name, "exit_group") || !strcmp(name, "exit");
1530 
1531 	return syscall__set_arg_fmts(sc);
1532 }
1533 
1534 /*
1535  * args is to be interpreted as a series of longs but we need to handle
1536  * 8-byte unaligned accesses. args points to raw_data within the event
1537  * and raw_data is guaranteed to be 8-byte unaligned because it is
1538  * preceded by raw_size which is a u32. So we need to copy args to a temp
1539  * variable to read it. Most notably this avoids extended load instructions
1540  * on unaligned addresses
1541  */
1542 
1543 static size_t syscall__scnprintf_args(struct syscall *sc, char *bf, size_t size,
1544 				      unsigned char *args, struct trace *trace,
1545 				      struct thread *thread)
1546 {
1547 	size_t printed = 0;
1548 	unsigned char *p;
1549 	unsigned long val;
1550 
1551 	if (sc->args != NULL) {
1552 		struct format_field *field;
1553 		u8 bit = 1;
1554 		struct syscall_arg arg = {
1555 			.idx	= 0,
1556 			.mask	= 0,
1557 			.trace  = trace,
1558 			.thread = thread,
1559 		};
1560 
1561 		for (field = sc->args; field;
1562 		     field = field->next, ++arg.idx, bit <<= 1) {
1563 			if (arg.mask & bit)
1564 				continue;
1565 
1566 			/* special care for unaligned accesses */
1567 			p = args + sizeof(unsigned long) * arg.idx;
1568 			memcpy(&val, p, sizeof(val));
1569 
1570 			/*
1571  			 * Suppress this argument if its value is zero and
1572  			 * and we don't have a string associated in an
1573  			 * strarray for it.
1574  			 */
1575 			if (val == 0 &&
1576 			    !(sc->arg_scnprintf &&
1577 			      sc->arg_scnprintf[arg.idx] == SCA_STRARRAY &&
1578 			      sc->arg_parm[arg.idx]))
1579 				continue;
1580 
1581 			printed += scnprintf(bf + printed, size - printed,
1582 					     "%s%s: ", printed ? ", " : "", field->name);
1583 			if (sc->arg_scnprintf && sc->arg_scnprintf[arg.idx]) {
1584 				arg.val = val;
1585 				if (sc->arg_parm)
1586 					arg.parm = sc->arg_parm[arg.idx];
1587 				printed += sc->arg_scnprintf[arg.idx](bf + printed,
1588 								      size - printed, &arg);
1589 			} else {
1590 				printed += scnprintf(bf + printed, size - printed,
1591 						     "%ld", val);
1592 			}
1593 		}
1594 	} else {
1595 		int i = 0;
1596 
1597 		while (i < 6) {
1598 			/* special care for unaligned accesses */
1599 			p = args + sizeof(unsigned long) * i;
1600 			memcpy(&val, p, sizeof(val));
1601 			printed += scnprintf(bf + printed, size - printed,
1602 					     "%sarg%d: %ld",
1603 					     printed ? ", " : "", i, val);
1604 			++i;
1605 		}
1606 	}
1607 
1608 	return printed;
1609 }
1610 
/*
 * Signature of the per-event handlers stored in evsel->handler and invoked
 * from trace__process_sample().
 */
typedef int (*tracepoint_handler)(struct trace *trace, struct perf_evsel *evsel,
				  union perf_event *event,
				  struct perf_sample *sample);
1614 
1615 static struct syscall *trace__syscall_info(struct trace *trace,
1616 					   struct perf_evsel *evsel, int id)
1617 {
1618 
1619 	if (id < 0) {
1620 
1621 		/*
1622 		 * XXX: Noticed on x86_64, reproduced as far back as 3.0.36, haven't tried
1623 		 * before that, leaving at a higher verbosity level till that is
1624 		 * explained. Reproduced with plain ftrace with:
1625 		 *
1626 		 * echo 1 > /t/events/raw_syscalls/sys_exit/enable
1627 		 * grep "NR -1 " /t/trace_pipe
1628 		 *
1629 		 * After generating some load on the machine.
1630  		 */
1631 		if (verbose > 1) {
1632 			static u64 n;
1633 			fprintf(trace->output, "Invalid syscall %d id, skipping (%s, %" PRIu64 ") ...\n",
1634 				id, perf_evsel__name(evsel), ++n);
1635 		}
1636 		return NULL;
1637 	}
1638 
1639 	if ((id > trace->syscalls.max || trace->syscalls.table[id].name == NULL) &&
1640 	    trace__read_syscall_info(trace, id))
1641 		goto out_cant_read;
1642 
1643 	if ((id > trace->syscalls.max || trace->syscalls.table[id].name == NULL))
1644 		goto out_cant_read;
1645 
1646 	return &trace->syscalls.table[id];
1647 
1648 out_cant_read:
1649 	if (verbose) {
1650 		fprintf(trace->output, "Problems reading syscall %d", id);
1651 		if (id <= trace->syscalls.max && trace->syscalls.table[id].name != NULL)
1652 			fprintf(trace->output, "(%s)", trace->syscalls.table[id].name);
1653 		fputs(" information\n", trace->output);
1654 	}
1655 	return NULL;
1656 }
1657 
1658 static void thread__update_stats(struct thread_trace *ttrace,
1659 				 int id, struct perf_sample *sample)
1660 {
1661 	struct int_node *inode;
1662 	struct stats *stats;
1663 	u64 duration = 0;
1664 
1665 	inode = intlist__findnew(ttrace->syscall_stats, id);
1666 	if (inode == NULL)
1667 		return;
1668 
1669 	stats = inode->priv;
1670 	if (stats == NULL) {
1671 		stats = malloc(sizeof(struct stats));
1672 		if (stats == NULL)
1673 			return;
1674 		init_stats(stats);
1675 		inode->priv = stats;
1676 	}
1677 
1678 	if (ttrace->entry_time && sample->time > ttrace->entry_time)
1679 		duration = sample->time - ttrace->entry_time;
1680 
1681 	update_stats(stats, duration);
1682 }
1683 
1684 static int trace__printf_interrupted_entry(struct trace *trace, struct perf_sample *sample)
1685 {
1686 	struct thread_trace *ttrace;
1687 	u64 duration;
1688 	size_t printed;
1689 
1690 	if (trace->current == NULL)
1691 		return 0;
1692 
1693 	ttrace = thread__priv(trace->current);
1694 
1695 	if (!ttrace->entry_pending)
1696 		return 0;
1697 
1698 	duration = sample->time - ttrace->entry_time;
1699 
1700 	printed  = trace__fprintf_entry_head(trace, trace->current, duration, sample->time, trace->output);
1701 	printed += fprintf(trace->output, "%-70s) ...\n", ttrace->entry_str);
1702 	ttrace->entry_pending = false;
1703 
1704 	return printed;
1705 }
1706 
1707 static int trace__sys_enter(struct trace *trace, struct perf_evsel *evsel,
1708 			    union perf_event *event __maybe_unused,
1709 			    struct perf_sample *sample)
1710 {
1711 	char *msg;
1712 	void *args;
1713 	size_t printed = 0;
1714 	struct thread *thread;
1715 	int id = perf_evsel__sc_tp_uint(evsel, id, sample);
1716 	struct syscall *sc = trace__syscall_info(trace, evsel, id);
1717 	struct thread_trace *ttrace;
1718 
1719 	if (sc == NULL)
1720 		return -1;
1721 
1722 	if (sc->filtered)
1723 		return 0;
1724 
1725 	thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
1726 	ttrace = thread__trace(thread, trace->output);
1727 	if (ttrace == NULL)
1728 		return -1;
1729 
1730 	args = perf_evsel__sc_tp_ptr(evsel, args, sample);
1731 
1732 	if (ttrace->entry_str == NULL) {
1733 		ttrace->entry_str = malloc(1024);
1734 		if (!ttrace->entry_str)
1735 			return -1;
1736 	}
1737 
1738 	if (!trace->summary_only)
1739 		trace__printf_interrupted_entry(trace, sample);
1740 
1741 	ttrace->entry_time = sample->time;
1742 	msg = ttrace->entry_str;
1743 	printed += scnprintf(msg + printed, 1024 - printed, "%s(", sc->name);
1744 
1745 	printed += syscall__scnprintf_args(sc, msg + printed, 1024 - printed,
1746 					   args, trace, thread);
1747 
1748 	if (sc->is_exit) {
1749 		if (!trace->duration_filter && !trace->summary_only) {
1750 			trace__fprintf_entry_head(trace, thread, 1, sample->time, trace->output);
1751 			fprintf(trace->output, "%-70s\n", ttrace->entry_str);
1752 		}
1753 	} else
1754 		ttrace->entry_pending = true;
1755 
1756 	if (trace->current != thread) {
1757 		thread__put(trace->current);
1758 		trace->current = thread__get(thread);
1759 	}
1760 
1761 	return 0;
1762 }
1763 
1764 static int trace__sys_exit(struct trace *trace, struct perf_evsel *evsel,
1765 			   union perf_event *event __maybe_unused,
1766 			   struct perf_sample *sample)
1767 {
1768 	long ret;
1769 	u64 duration = 0;
1770 	struct thread *thread;
1771 	int id = perf_evsel__sc_tp_uint(evsel, id, sample);
1772 	struct syscall *sc = trace__syscall_info(trace, evsel, id);
1773 	struct thread_trace *ttrace;
1774 
1775 	if (sc == NULL)
1776 		return -1;
1777 
1778 	if (sc->filtered)
1779 		return 0;
1780 
1781 	thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
1782 	ttrace = thread__trace(thread, trace->output);
1783 	if (ttrace == NULL)
1784 		return -1;
1785 
1786 	if (trace->summary)
1787 		thread__update_stats(ttrace, id, sample);
1788 
1789 	ret = perf_evsel__sc_tp_uint(evsel, ret, sample);
1790 
1791 	if (id == trace->audit.open_id && ret >= 0 && trace->last_vfs_getname) {
1792 		trace__set_fd_pathname(thread, ret, trace->last_vfs_getname);
1793 		trace->last_vfs_getname = NULL;
1794 		++trace->stats.vfs_getname;
1795 	}
1796 
1797 	ttrace->exit_time = sample->time;
1798 
1799 	if (ttrace->entry_time) {
1800 		duration = sample->time - ttrace->entry_time;
1801 		if (trace__filter_duration(trace, duration))
1802 			goto out;
1803 	} else if (trace->duration_filter)
1804 		goto out;
1805 
1806 	if (trace->summary_only)
1807 		goto out;
1808 
1809 	trace__fprintf_entry_head(trace, thread, duration, sample->time, trace->output);
1810 
1811 	if (ttrace->entry_pending) {
1812 		fprintf(trace->output, "%-70s", ttrace->entry_str);
1813 	} else {
1814 		fprintf(trace->output, " ... [");
1815 		color_fprintf(trace->output, PERF_COLOR_YELLOW, "continued");
1816 		fprintf(trace->output, "]: %s()", sc->name);
1817 	}
1818 
1819 	if (sc->fmt == NULL) {
1820 signed_print:
1821 		fprintf(trace->output, ") = %ld", ret);
1822 	} else if (ret < 0 && sc->fmt->errmsg) {
1823 		char bf[STRERR_BUFSIZE];
1824 		const char *emsg = strerror_r(-ret, bf, sizeof(bf)),
1825 			   *e = audit_errno_to_name(-ret);
1826 
1827 		fprintf(trace->output, ") = -1 %s %s", e, emsg);
1828 	} else if (ret == 0 && sc->fmt->timeout)
1829 		fprintf(trace->output, ") = 0 Timeout");
1830 	else if (sc->fmt->hexret)
1831 		fprintf(trace->output, ") = %#lx", ret);
1832 	else
1833 		goto signed_print;
1834 
1835 	fputc('\n', trace->output);
1836 out:
1837 	ttrace->entry_pending = false;
1838 
1839 	return 0;
1840 }
1841 
1842 static int trace__vfs_getname(struct trace *trace, struct perf_evsel *evsel,
1843 			      union perf_event *event __maybe_unused,
1844 			      struct perf_sample *sample)
1845 {
1846 	trace->last_vfs_getname = perf_evsel__rawptr(evsel, sample, "pathname");
1847 	return 0;
1848 }
1849 
1850 static int trace__sched_stat_runtime(struct trace *trace, struct perf_evsel *evsel,
1851 				     union perf_event *event __maybe_unused,
1852 				     struct perf_sample *sample)
1853 {
1854         u64 runtime = perf_evsel__intval(evsel, sample, "runtime");
1855 	double runtime_ms = (double)runtime / NSEC_PER_MSEC;
1856 	struct thread *thread = machine__findnew_thread(trace->host,
1857 							sample->pid,
1858 							sample->tid);
1859 	struct thread_trace *ttrace = thread__trace(thread, trace->output);
1860 
1861 	if (ttrace == NULL)
1862 		goto out_dump;
1863 
1864 	ttrace->runtime_ms += runtime_ms;
1865 	trace->runtime_ms += runtime_ms;
1866 	return 0;
1867 
1868 out_dump:
1869 	fprintf(trace->output, "%s: comm=%s,pid=%u,runtime=%" PRIu64 ",vruntime=%" PRIu64 ")\n",
1870 	       evsel->name,
1871 	       perf_evsel__strval(evsel, sample, "comm"),
1872 	       (pid_t)perf_evsel__intval(evsel, sample, "pid"),
1873 	       runtime,
1874 	       perf_evsel__intval(evsel, sample, "vruntime"));
1875 	return 0;
1876 }
1877 
1878 static int trace__event_handler(struct trace *trace, struct perf_evsel *evsel,
1879 				union perf_event *event __maybe_unused,
1880 				struct perf_sample *sample)
1881 {
1882 	trace__printf_interrupted_entry(trace, sample);
1883 	trace__fprintf_tstamp(trace, sample->time, trace->output);
1884 
1885 	if (trace->trace_syscalls)
1886 		fprintf(trace->output, "(         ): ");
1887 
1888 	fprintf(trace->output, "%s:", evsel->name);
1889 
1890 	if (evsel->tp_format) {
1891 		event_format__fprintf(evsel->tp_format, sample->cpu,
1892 				      sample->raw_data, sample->raw_size,
1893 				      trace->output);
1894 	}
1895 
1896 	fprintf(trace->output, ")\n");
1897 	return 0;
1898 }
1899 
1900 static void print_location(FILE *f, struct perf_sample *sample,
1901 			   struct addr_location *al,
1902 			   bool print_dso, bool print_sym)
1903 {
1904 
1905 	if ((verbose || print_dso) && al->map)
1906 		fprintf(f, "%s@", al->map->dso->long_name);
1907 
1908 	if ((verbose || print_sym) && al->sym)
1909 		fprintf(f, "%s+0x%" PRIx64, al->sym->name,
1910 			al->addr - al->sym->start);
1911 	else if (al->map)
1912 		fprintf(f, "0x%" PRIx64, al->addr);
1913 	else
1914 		fprintf(f, "0x%" PRIx64, sample->addr);
1915 }
1916 
1917 static int trace__pgfault(struct trace *trace,
1918 			  struct perf_evsel *evsel,
1919 			  union perf_event *event,
1920 			  struct perf_sample *sample)
1921 {
1922 	struct thread *thread;
1923 	u8 cpumode = event->header.misc & PERF_RECORD_MISC_CPUMODE_MASK;
1924 	struct addr_location al;
1925 	char map_type = 'd';
1926 	struct thread_trace *ttrace;
1927 
1928 	thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
1929 	ttrace = thread__trace(thread, trace->output);
1930 	if (ttrace == NULL)
1931 		return -1;
1932 
1933 	if (evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ)
1934 		ttrace->pfmaj++;
1935 	else
1936 		ttrace->pfmin++;
1937 
1938 	if (trace->summary_only)
1939 		return 0;
1940 
1941 	thread__find_addr_location(thread, cpumode, MAP__FUNCTION,
1942 			      sample->ip, &al);
1943 
1944 	trace__fprintf_entry_head(trace, thread, 0, sample->time, trace->output);
1945 
1946 	fprintf(trace->output, "%sfault [",
1947 		evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ ?
1948 		"maj" : "min");
1949 
1950 	print_location(trace->output, sample, &al, false, true);
1951 
1952 	fprintf(trace->output, "] => ");
1953 
1954 	thread__find_addr_location(thread, cpumode, MAP__VARIABLE,
1955 				   sample->addr, &al);
1956 
1957 	if (!al.map) {
1958 		thread__find_addr_location(thread, cpumode,
1959 					   MAP__FUNCTION, sample->addr, &al);
1960 
1961 		if (al.map)
1962 			map_type = 'x';
1963 		else
1964 			map_type = '?';
1965 	}
1966 
1967 	print_location(trace->output, sample, &al, true, false);
1968 
1969 	fprintf(trace->output, " (%c%c)\n", map_type, al.level);
1970 
1971 	return 0;
1972 }
1973 
1974 static bool skip_sample(struct trace *trace, struct perf_sample *sample)
1975 {
1976 	if ((trace->pid_list && intlist__find(trace->pid_list, sample->pid)) ||
1977 	    (trace->tid_list && intlist__find(trace->tid_list, sample->tid)))
1978 		return false;
1979 
1980 	if (trace->pid_list || trace->tid_list)
1981 		return true;
1982 
1983 	return false;
1984 }
1985 
1986 static int trace__process_sample(struct perf_tool *tool,
1987 				 union perf_event *event,
1988 				 struct perf_sample *sample,
1989 				 struct perf_evsel *evsel,
1990 				 struct machine *machine __maybe_unused)
1991 {
1992 	struct trace *trace = container_of(tool, struct trace, tool);
1993 	int err = 0;
1994 
1995 	tracepoint_handler handler = evsel->handler;
1996 
1997 	if (skip_sample(trace, sample))
1998 		return 0;
1999 
2000 	if (!trace->full_time && trace->base_time == 0)
2001 		trace->base_time = sample->time;
2002 
2003 	if (handler) {
2004 		++trace->nr_events;
2005 		handler(trace, evsel, event, sample);
2006 	}
2007 
2008 	return err;
2009 }
2010 
2011 static int parse_target_str(struct trace *trace)
2012 {
2013 	if (trace->opts.target.pid) {
2014 		trace->pid_list = intlist__new(trace->opts.target.pid);
2015 		if (trace->pid_list == NULL) {
2016 			pr_err("Error parsing process id string\n");
2017 			return -EINVAL;
2018 		}
2019 	}
2020 
2021 	if (trace->opts.target.tid) {
2022 		trace->tid_list = intlist__new(trace->opts.target.tid);
2023 		if (trace->tid_list == NULL) {
2024 			pr_err("Error parsing thread id string\n");
2025 			return -EINVAL;
2026 		}
2027 	}
2028 
2029 	return 0;
2030 }
2031 
2032 static int trace__record(struct trace *trace, int argc, const char **argv)
2033 {
2034 	unsigned int rec_argc, i, j;
2035 	const char **rec_argv;
2036 	const char * const record_args[] = {
2037 		"record",
2038 		"-R",
2039 		"-m", "1024",
2040 		"-c", "1",
2041 	};
2042 
2043 	const char * const sc_args[] = { "-e", };
2044 	unsigned int sc_args_nr = ARRAY_SIZE(sc_args);
2045 	const char * const majpf_args[] = { "-e", "major-faults" };
2046 	unsigned int majpf_args_nr = ARRAY_SIZE(majpf_args);
2047 	const char * const minpf_args[] = { "-e", "minor-faults" };
2048 	unsigned int minpf_args_nr = ARRAY_SIZE(minpf_args);
2049 
2050 	/* +1 is for the event string below */
2051 	rec_argc = ARRAY_SIZE(record_args) + sc_args_nr + 1 +
2052 		majpf_args_nr + minpf_args_nr + argc;
2053 	rec_argv = calloc(rec_argc + 1, sizeof(char *));
2054 
2055 	if (rec_argv == NULL)
2056 		return -ENOMEM;
2057 
2058 	j = 0;
2059 	for (i = 0; i < ARRAY_SIZE(record_args); i++)
2060 		rec_argv[j++] = record_args[i];
2061 
2062 	if (trace->trace_syscalls) {
2063 		for (i = 0; i < sc_args_nr; i++)
2064 			rec_argv[j++] = sc_args[i];
2065 
2066 		/* event string may be different for older kernels - e.g., RHEL6 */
2067 		if (is_valid_tracepoint("raw_syscalls:sys_enter"))
2068 			rec_argv[j++] = "raw_syscalls:sys_enter,raw_syscalls:sys_exit";
2069 		else if (is_valid_tracepoint("syscalls:sys_enter"))
2070 			rec_argv[j++] = "syscalls:sys_enter,syscalls:sys_exit";
2071 		else {
2072 			pr_err("Neither raw_syscalls nor syscalls events exist.\n");
2073 			return -1;
2074 		}
2075 	}
2076 
2077 	if (trace->trace_pgfaults & TRACE_PFMAJ)
2078 		for (i = 0; i < majpf_args_nr; i++)
2079 			rec_argv[j++] = majpf_args[i];
2080 
2081 	if (trace->trace_pgfaults & TRACE_PFMIN)
2082 		for (i = 0; i < minpf_args_nr; i++)
2083 			rec_argv[j++] = minpf_args[i];
2084 
2085 	for (i = 0; i < (unsigned int)argc; i++)
2086 		rec_argv[j++] = argv[i];
2087 
2088 	return cmd_record(j, rec_argv, NULL);
2089 }
2090 
2091 static size_t trace__fprintf_thread_summary(struct trace *trace, FILE *fp);
2092 
2093 static void perf_evlist__add_vfs_getname(struct perf_evlist *evlist)
2094 {
2095 	struct perf_evsel *evsel = perf_evsel__newtp("probe", "vfs_getname");
2096 	if (evsel == NULL)
2097 		return;
2098 
2099 	if (perf_evsel__field(evsel, "pathname") == NULL) {
2100 		perf_evsel__delete(evsel);
2101 		return;
2102 	}
2103 
2104 	evsel->handler = trace__vfs_getname;
2105 	perf_evlist__add(evlist, evsel);
2106 }
2107 
2108 static int perf_evlist__add_pgfault(struct perf_evlist *evlist,
2109 				    u64 config)
2110 {
2111 	struct perf_evsel *evsel;
2112 	struct perf_event_attr attr = {
2113 		.type = PERF_TYPE_SOFTWARE,
2114 		.mmap_data = 1,
2115 	};
2116 
2117 	attr.config = config;
2118 	attr.sample_period = 1;
2119 
2120 	event_attr_init(&attr);
2121 
2122 	evsel = perf_evsel__new(&attr);
2123 	if (!evsel)
2124 		return -ENOMEM;
2125 
2126 	evsel->handler = trace__pgfault;
2127 	perf_evlist__add(evlist, evsel);
2128 
2129 	return 0;
2130 }
2131 
2132 static void trace__handle_event(struct trace *trace, union perf_event *event, struct perf_sample *sample)
2133 {
2134 	const u32 type = event->header.type;
2135 	struct perf_evsel *evsel;
2136 
2137 	if (!trace->full_time && trace->base_time == 0)
2138 		trace->base_time = sample->time;
2139 
2140 	if (type != PERF_RECORD_SAMPLE) {
2141 		trace__process_event(trace, trace->host, event, sample);
2142 		return;
2143 	}
2144 
2145 	evsel = perf_evlist__id2evsel(trace->evlist, sample->id);
2146 	if (evsel == NULL) {
2147 		fprintf(trace->output, "Unknown tp ID %" PRIu64 ", skipping...\n", sample->id);
2148 		return;
2149 	}
2150 
2151 	if (evsel->attr.type == PERF_TYPE_TRACEPOINT &&
2152 	    sample->raw_data == NULL) {
2153 		fprintf(trace->output, "%s sample with no payload for tid: %d, cpu %d, raw_size=%d, skipping...\n",
2154 		       perf_evsel__name(evsel), sample->tid,
2155 		       sample->cpu, sample->raw_size);
2156 	} else {
2157 		tracepoint_handler handler = evsel->handler;
2158 		handler(trace, evsel, event, sample);
2159 	}
2160 }
2161 
2162 static int trace__run(struct trace *trace, int argc, const char **argv)
2163 {
2164 	struct perf_evlist *evlist = trace->evlist;
2165 	int err = -1, i;
2166 	unsigned long before;
2167 	const bool forks = argc > 0;
2168 	bool draining = false;
2169 
2170 	trace->live = true;
2171 
2172 	if (trace->trace_syscalls &&
2173 	    perf_evlist__add_syscall_newtp(evlist, trace__sys_enter,
2174 					   trace__sys_exit))
2175 		goto out_error_raw_syscalls;
2176 
2177 	if (trace->trace_syscalls)
2178 		perf_evlist__add_vfs_getname(evlist);
2179 
2180 	if ((trace->trace_pgfaults & TRACE_PFMAJ) &&
2181 	    perf_evlist__add_pgfault(evlist, PERF_COUNT_SW_PAGE_FAULTS_MAJ)) {
2182 		goto out_error_mem;
2183 	}
2184 
2185 	if ((trace->trace_pgfaults & TRACE_PFMIN) &&
2186 	    perf_evlist__add_pgfault(evlist, PERF_COUNT_SW_PAGE_FAULTS_MIN))
2187 		goto out_error_mem;
2188 
2189 	if (trace->sched &&
2190 	    perf_evlist__add_newtp(evlist, "sched", "sched_stat_runtime",
2191 				   trace__sched_stat_runtime))
2192 		goto out_error_sched_stat_runtime;
2193 
2194 	err = perf_evlist__create_maps(evlist, &trace->opts.target);
2195 	if (err < 0) {
2196 		fprintf(trace->output, "Problems parsing the target to trace, check your options!\n");
2197 		goto out_delete_evlist;
2198 	}
2199 
2200 	err = trace__symbols_init(trace, evlist);
2201 	if (err < 0) {
2202 		fprintf(trace->output, "Problems initializing symbol libraries!\n");
2203 		goto out_delete_evlist;
2204 	}
2205 
2206 	perf_evlist__config(evlist, &trace->opts);
2207 
2208 	signal(SIGCHLD, sig_handler);
2209 	signal(SIGINT, sig_handler);
2210 
2211 	if (forks) {
2212 		err = perf_evlist__prepare_workload(evlist, &trace->opts.target,
2213 						    argv, false, NULL);
2214 		if (err < 0) {
2215 			fprintf(trace->output, "Couldn't run the workload!\n");
2216 			goto out_delete_evlist;
2217 		}
2218 	}
2219 
2220 	err = perf_evlist__open(evlist);
2221 	if (err < 0)
2222 		goto out_error_open;
2223 
2224 	/*
2225 	 * Better not use !target__has_task() here because we need to cover the
2226 	 * case where no threads were specified in the command line, but a
2227 	 * workload was, and in that case we will fill in the thread_map when
2228 	 * we fork the workload in perf_evlist__prepare_workload.
2229 	 */
2230 	if (trace->filter_pids.nr > 0)
2231 		err = perf_evlist__set_filter_pids(evlist, trace->filter_pids.nr, trace->filter_pids.entries);
2232 	else if (evlist->threads->map[0] == -1)
2233 		err = perf_evlist__set_filter_pid(evlist, getpid());
2234 
2235 	if (err < 0) {
2236 		printf("err=%d,%s\n", -err, strerror(-err));
2237 		exit(1);
2238 	}
2239 
2240 	err = perf_evlist__mmap(evlist, trace->opts.mmap_pages, false);
2241 	if (err < 0)
2242 		goto out_error_mmap;
2243 
2244 	if (forks)
2245 		perf_evlist__start_workload(evlist);
2246 	else
2247 		perf_evlist__enable(evlist);
2248 
2249 	trace->multiple_threads = evlist->threads->map[0] == -1 ||
2250 				  evlist->threads->nr > 1 ||
2251 				  perf_evlist__first(evlist)->attr.inherit;
2252 again:
2253 	before = trace->nr_events;
2254 
2255 	for (i = 0; i < evlist->nr_mmaps; i++) {
2256 		union perf_event *event;
2257 
2258 		while ((event = perf_evlist__mmap_read(evlist, i)) != NULL) {
2259 			struct perf_sample sample;
2260 
2261 			++trace->nr_events;
2262 
2263 			err = perf_evlist__parse_sample(evlist, event, &sample);
2264 			if (err) {
2265 				fprintf(trace->output, "Can't parse sample, err = %d, skipping...\n", err);
2266 				goto next_event;
2267 			}
2268 
2269 			trace__handle_event(trace, event, &sample);
2270 next_event:
2271 			perf_evlist__mmap_consume(evlist, i);
2272 
2273 			if (interrupted)
2274 				goto out_disable;
2275 		}
2276 	}
2277 
2278 	if (trace->nr_events == before) {
2279 		int timeout = done ? 100 : -1;
2280 
2281 		if (!draining && perf_evlist__poll(evlist, timeout) > 0) {
2282 			if (perf_evlist__filter_pollfd(evlist, POLLERR | POLLHUP) == 0)
2283 				draining = true;
2284 
2285 			goto again;
2286 		}
2287 	} else {
2288 		goto again;
2289 	}
2290 
2291 out_disable:
2292 	thread__zput(trace->current);
2293 
2294 	perf_evlist__disable(evlist);
2295 
2296 	if (!err) {
2297 		if (trace->summary)
2298 			trace__fprintf_thread_summary(trace, trace->output);
2299 
2300 		if (trace->show_tool_stats) {
2301 			fprintf(trace->output, "Stats:\n "
2302 					       " vfs_getname : %" PRIu64 "\n"
2303 					       " proc_getname: %" PRIu64 "\n",
2304 				trace->stats.vfs_getname,
2305 				trace->stats.proc_getname);
2306 		}
2307 	}
2308 
2309 out_delete_evlist:
2310 	perf_evlist__delete(evlist);
2311 	trace->evlist = NULL;
2312 	trace->live = false;
2313 	return err;
2314 {
2315 	char errbuf[BUFSIZ];
2316 
2317 out_error_sched_stat_runtime:
2318 	debugfs__strerror_open_tp(errno, errbuf, sizeof(errbuf), "sched", "sched_stat_runtime");
2319 	goto out_error;
2320 
2321 out_error_raw_syscalls:
2322 	debugfs__strerror_open_tp(errno, errbuf, sizeof(errbuf), "raw_syscalls", "sys_(enter|exit)");
2323 	goto out_error;
2324 
2325 out_error_mmap:
2326 	perf_evlist__strerror_mmap(evlist, errno, errbuf, sizeof(errbuf));
2327 	goto out_error;
2328 
2329 out_error_open:
2330 	perf_evlist__strerror_open(evlist, errno, errbuf, sizeof(errbuf));
2331 
2332 out_error:
2333 	fprintf(trace->output, "%s\n", errbuf);
2334 	goto out_delete_evlist;
2335 }
2336 out_error_mem:
2337 	fprintf(trace->output, "Not enough memory to run!\n");
2338 	goto out_delete_evlist;
2339 }
2340 
2341 static int trace__replay(struct trace *trace)
2342 {
2343 	const struct perf_evsel_str_handler handlers[] = {
2344 		{ "probe:vfs_getname",	     trace__vfs_getname, },
2345 	};
2346 	struct perf_data_file file = {
2347 		.path  = input_name,
2348 		.mode  = PERF_DATA_MODE_READ,
2349 		.force = trace->force,
2350 	};
2351 	struct perf_session *session;
2352 	struct perf_evsel *evsel;
2353 	int err = -1;
2354 
2355 	trace->tool.sample	  = trace__process_sample;
2356 	trace->tool.mmap	  = perf_event__process_mmap;
2357 	trace->tool.mmap2	  = perf_event__process_mmap2;
2358 	trace->tool.comm	  = perf_event__process_comm;
2359 	trace->tool.exit	  = perf_event__process_exit;
2360 	trace->tool.fork	  = perf_event__process_fork;
2361 	trace->tool.attr	  = perf_event__process_attr;
2362 	trace->tool.tracing_data = perf_event__process_tracing_data;
2363 	trace->tool.build_id	  = perf_event__process_build_id;
2364 
2365 	trace->tool.ordered_events = true;
2366 	trace->tool.ordering_requires_timestamps = true;
2367 
2368 	/* add tid to output */
2369 	trace->multiple_threads = true;
2370 
2371 	session = perf_session__new(&file, false, &trace->tool);
2372 	if (session == NULL)
2373 		return -1;
2374 
2375 	if (symbol__init(&session->header.env) < 0)
2376 		goto out;
2377 
2378 	trace->host = &session->machines.host;
2379 
2380 	err = perf_session__set_tracepoints_handlers(session, handlers);
2381 	if (err)
2382 		goto out;
2383 
2384 	evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
2385 						     "raw_syscalls:sys_enter");
2386 	/* older kernels have syscalls tp versus raw_syscalls */
2387 	if (evsel == NULL)
2388 		evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
2389 							     "syscalls:sys_enter");
2390 
2391 	if (evsel &&
2392 	    (perf_evsel__init_syscall_tp(evsel, trace__sys_enter) < 0 ||
2393 	    perf_evsel__init_sc_tp_ptr_field(evsel, args))) {
2394 		pr_err("Error during initialize raw_syscalls:sys_enter event\n");
2395 		goto out;
2396 	}
2397 
2398 	evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
2399 						     "raw_syscalls:sys_exit");
2400 	if (evsel == NULL)
2401 		evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
2402 							     "syscalls:sys_exit");
2403 	if (evsel &&
2404 	    (perf_evsel__init_syscall_tp(evsel, trace__sys_exit) < 0 ||
2405 	    perf_evsel__init_sc_tp_uint_field(evsel, ret))) {
2406 		pr_err("Error during initialize raw_syscalls:sys_exit event\n");
2407 		goto out;
2408 	}
2409 
2410 	evlist__for_each(session->evlist, evsel) {
2411 		if (evsel->attr.type == PERF_TYPE_SOFTWARE &&
2412 		    (evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ ||
2413 		     evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MIN ||
2414 		     evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS))
2415 			evsel->handler = trace__pgfault;
2416 	}
2417 
2418 	err = parse_target_str(trace);
2419 	if (err != 0)
2420 		goto out;
2421 
2422 	setup_pager();
2423 
2424 	err = perf_session__process_events(session);
2425 	if (err)
2426 		pr_err("Failed to process events, error %d", err);
2427 
2428 	else if (trace->summary)
2429 		trace__fprintf_thread_summary(trace, trace->output);
2430 
2431 out:
2432 	perf_session__delete(session);
2433 
2434 	return err;
2435 }
2436 
/*
 * Print the heading that precedes the per-thread summary.
 *
 * Returns the value reported by fprintf() for the heading.
 */
static size_t trace__fprintf_threads_header(FILE *fp)
{
	return fprintf(fp, "\n Summary of events:\n\n");
}
2445 
2446 static size_t thread__dump_stats(struct thread_trace *ttrace,
2447 				 struct trace *trace, FILE *fp)
2448 {
2449 	struct stats *stats;
2450 	size_t printed = 0;
2451 	struct syscall *sc;
2452 	struct int_node *inode = intlist__first(ttrace->syscall_stats);
2453 
2454 	if (inode == NULL)
2455 		return 0;
2456 
2457 	printed += fprintf(fp, "\n");
2458 
2459 	printed += fprintf(fp, "   syscall            calls      min       avg       max      stddev\n");
2460 	printed += fprintf(fp, "                               (msec)    (msec)    (msec)        (%%)\n");
2461 	printed += fprintf(fp, "   --------------- -------- --------- --------- ---------     ------\n");
2462 
2463 	/* each int_node is a syscall */
2464 	while (inode) {
2465 		stats = inode->priv;
2466 		if (stats) {
2467 			double min = (double)(stats->min) / NSEC_PER_MSEC;
2468 			double max = (double)(stats->max) / NSEC_PER_MSEC;
2469 			double avg = avg_stats(stats);
2470 			double pct;
2471 			u64 n = (u64) stats->n;
2472 
2473 			pct = avg ? 100.0 * stddev_stats(stats)/avg : 0.0;
2474 			avg /= NSEC_PER_MSEC;
2475 
2476 			sc = &trace->syscalls.table[inode->i];
2477 			printed += fprintf(fp, "   %-15s", sc->name);
2478 			printed += fprintf(fp, " %8" PRIu64 " %9.3f %9.3f",
2479 					   n, min, avg);
2480 			printed += fprintf(fp, " %9.3f %9.2f%%\n", max, pct);
2481 		}
2482 
2483 		inode = intlist__next(inode);
2484 	}
2485 
2486 	printed += fprintf(fp, "\n\n");
2487 
2488 	return printed;
2489 }
2490 
/*
 * struct used to pass data to per-thread function
 *
 * Filled in by trace__fprintf_thread_summary() and handed, as the opaque
 * priv pointer, to trace__fprintf_one_thread() for every thread visited.
 */
struct summary_data {
	FILE *fp;		/* stream the summary is written to */
	struct trace *trace;	/* tracer state the summary is computed from */
	size_t printed;		/* running count of characters printed */
};
2497 
2498 static int trace__fprintf_one_thread(struct thread *thread, void *priv)
2499 {
2500 	struct summary_data *data = priv;
2501 	FILE *fp = data->fp;
2502 	size_t printed = data->printed;
2503 	struct trace *trace = data->trace;
2504 	struct thread_trace *ttrace = thread__priv(thread);
2505 	double ratio;
2506 
2507 	if (ttrace == NULL)
2508 		return 0;
2509 
2510 	ratio = (double)ttrace->nr_events / trace->nr_events * 100.0;
2511 
2512 	printed += fprintf(fp, " %s (%d), ", thread__comm_str(thread), thread->tid);
2513 	printed += fprintf(fp, "%lu events, ", ttrace->nr_events);
2514 	printed += fprintf(fp, "%.1f%%", ratio);
2515 	if (ttrace->pfmaj)
2516 		printed += fprintf(fp, ", %lu majfaults", ttrace->pfmaj);
2517 	if (ttrace->pfmin)
2518 		printed += fprintf(fp, ", %lu minfaults", ttrace->pfmin);
2519 	printed += fprintf(fp, ", %.3f msec\n", ttrace->runtime_ms);
2520 	printed += thread__dump_stats(ttrace, trace, fp);
2521 
2522 	data->printed += printed;
2523 
2524 	return 0;
2525 }
2526 
2527 static size_t trace__fprintf_thread_summary(struct trace *trace, FILE *fp)
2528 {
2529 	struct summary_data data = {
2530 		.fp = fp,
2531 		.trace = trace
2532 	};
2533 	data.printed = trace__fprintf_threads_header(fp);
2534 
2535 	machine__for_each_thread(trace->host, trace__fprintf_one_thread, &data);
2536 
2537 	return data.printed;
2538 }
2539 
2540 static int trace__set_duration(const struct option *opt, const char *str,
2541 			       int unset __maybe_unused)
2542 {
2543 	struct trace *trace = opt->value;
2544 
2545 	trace->duration_filter = atof(str);
2546 	return 0;
2547 }
2548 
2549 static int trace__set_filter_pids(const struct option *opt, const char *str,
2550 				  int unset __maybe_unused)
2551 {
2552 	int ret = -1;
2553 	size_t i;
2554 	struct trace *trace = opt->value;
2555 	/*
2556 	 * FIXME: introduce a intarray class, plain parse csv and create a
2557 	 * { int nr, int entries[] } struct...
2558 	 */
2559 	struct intlist *list = intlist__new(str);
2560 
2561 	if (list == NULL)
2562 		return -1;
2563 
2564 	i = trace->filter_pids.nr = intlist__nr_entries(list) + 1;
2565 	trace->filter_pids.entries = calloc(i, sizeof(pid_t));
2566 
2567 	if (trace->filter_pids.entries == NULL)
2568 		goto out;
2569 
2570 	trace->filter_pids.entries[0] = getpid();
2571 
2572 	for (i = 1; i < trace->filter_pids.nr; ++i)
2573 		trace->filter_pids.entries[i] = intlist__entry(list, i - 1)->i;
2574 
2575 	intlist__delete(list);
2576 	ret = 0;
2577 out:
2578 	return ret;
2579 }
2580 
2581 static int trace__open_output(struct trace *trace, const char *filename)
2582 {
2583 	struct stat st;
2584 
2585 	if (!stat(filename, &st) && st.st_size) {
2586 		char oldname[PATH_MAX];
2587 
2588 		scnprintf(oldname, sizeof(oldname), "%s.old", filename);
2589 		unlink(oldname);
2590 		rename(filename, oldname);
2591 	}
2592 
2593 	trace->output = fopen(filename, "w");
2594 
2595 	return trace->output == NULL ? -errno : 0;
2596 }
2597 
2598 static int parse_pagefaults(const struct option *opt, const char *str,
2599 			    int unset __maybe_unused)
2600 {
2601 	int *trace_pgfaults = opt->value;
2602 
2603 	if (strcmp(str, "all") == 0)
2604 		*trace_pgfaults |= TRACE_PFMAJ | TRACE_PFMIN;
2605 	else if (strcmp(str, "maj") == 0)
2606 		*trace_pgfaults |= TRACE_PFMAJ;
2607 	else if (strcmp(str, "min") == 0)
2608 		*trace_pgfaults |= TRACE_PFMIN;
2609 	else
2610 		return -1;
2611 
2612 	return 0;
2613 }
2614 
2615 static void evlist__set_evsel_handler(struct perf_evlist *evlist, void *handler)
2616 {
2617 	struct perf_evsel *evsel;
2618 
2619 	evlist__for_each(evlist, evsel)
2620 		evsel->handler = handler;
2621 }
2622 
2623 int cmd_trace(int argc, const char **argv, const char *prefix __maybe_unused)
2624 {
2625 	const char *trace_usage[] = {
2626 		"perf trace [<options>] [<command>]",
2627 		"perf trace [<options>] -- <command> [<options>]",
2628 		"perf trace record [<options>] [<command>]",
2629 		"perf trace record [<options>] -- <command> [<options>]",
2630 		NULL
2631 	};
2632 	struct trace trace = {
2633 		.audit = {
2634 			.machine = audit_detect_machine(),
2635 			.open_id = audit_name_to_syscall("open", trace.audit.machine),
2636 		},
2637 		.syscalls = {
2638 			. max = -1,
2639 		},
2640 		.opts = {
2641 			.target = {
2642 				.uid	   = UINT_MAX,
2643 				.uses_mmap = true,
2644 			},
2645 			.user_freq     = UINT_MAX,
2646 			.user_interval = ULLONG_MAX,
2647 			.no_buffering  = true,
2648 			.mmap_pages    = UINT_MAX,
2649 		},
2650 		.output = stdout,
2651 		.show_comm = true,
2652 		.trace_syscalls = true,
2653 	};
2654 	const char *output_name = NULL;
2655 	const char *ev_qualifier_str = NULL;
2656 	const struct option trace_options[] = {
2657 	OPT_CALLBACK(0, "event", &trace.evlist, "event",
2658 		     "event selector. use 'perf list' to list available events",
2659 		     parse_events_option),
2660 	OPT_BOOLEAN(0, "comm", &trace.show_comm,
2661 		    "show the thread COMM next to its id"),
2662 	OPT_BOOLEAN(0, "tool_stats", &trace.show_tool_stats, "show tool stats"),
2663 	OPT_STRING('e', "expr", &ev_qualifier_str, "expr", "list of syscalls to trace"),
2664 	OPT_STRING('o', "output", &output_name, "file", "output file name"),
2665 	OPT_STRING('i', "input", &input_name, "file", "Analyze events in file"),
2666 	OPT_STRING('p', "pid", &trace.opts.target.pid, "pid",
2667 		    "trace events on existing process id"),
2668 	OPT_STRING('t', "tid", &trace.opts.target.tid, "tid",
2669 		    "trace events on existing thread id"),
2670 	OPT_CALLBACK(0, "filter-pids", &trace, "CSV list of pids",
2671 		     "pids to filter (by the kernel)", trace__set_filter_pids),
2672 	OPT_BOOLEAN('a', "all-cpus", &trace.opts.target.system_wide,
2673 		    "system-wide collection from all CPUs"),
2674 	OPT_STRING('C', "cpu", &trace.opts.target.cpu_list, "cpu",
2675 		    "list of cpus to monitor"),
2676 	OPT_BOOLEAN(0, "no-inherit", &trace.opts.no_inherit,
2677 		    "child tasks do not inherit counters"),
2678 	OPT_CALLBACK('m', "mmap-pages", &trace.opts.mmap_pages, "pages",
2679 		     "number of mmap data pages",
2680 		     perf_evlist__parse_mmap_pages),
2681 	OPT_STRING('u', "uid", &trace.opts.target.uid_str, "user",
2682 		   "user to profile"),
2683 	OPT_CALLBACK(0, "duration", &trace, "float",
2684 		     "show only events with duration > N.M ms",
2685 		     trace__set_duration),
2686 	OPT_BOOLEAN(0, "sched", &trace.sched, "show blocking scheduler events"),
2687 	OPT_INCR('v', "verbose", &verbose, "be more verbose"),
2688 	OPT_BOOLEAN('T', "time", &trace.full_time,
2689 		    "Show full timestamp, not time relative to first start"),
2690 	OPT_BOOLEAN('s', "summary", &trace.summary_only,
2691 		    "Show only syscall summary with statistics"),
2692 	OPT_BOOLEAN('S', "with-summary", &trace.summary,
2693 		    "Show all syscalls and summary with statistics"),
2694 	OPT_CALLBACK_DEFAULT('F', "pf", &trace.trace_pgfaults, "all|maj|min",
2695 		     "Trace pagefaults", parse_pagefaults, "maj"),
2696 	OPT_BOOLEAN(0, "syscalls", &trace.trace_syscalls, "Trace syscalls"),
2697 	OPT_BOOLEAN('f', "force", &trace.force, "don't complain, do it"),
2698 	OPT_END()
2699 	};
2700 	const char * const trace_subcommands[] = { "record", NULL };
2701 	int err;
2702 	char bf[BUFSIZ];
2703 
2704 	signal(SIGSEGV, sighandler_dump_stack);
2705 	signal(SIGFPE, sighandler_dump_stack);
2706 
2707 	trace.evlist = perf_evlist__new();
2708 	if (trace.evlist == NULL)
2709 		return -ENOMEM;
2710 
2711 	if (trace.evlist == NULL) {
2712 		pr_err("Not enough memory to run!\n");
2713 		goto out;
2714 	}
2715 
2716 	argc = parse_options_subcommand(argc, argv, trace_options, trace_subcommands,
2717 				 trace_usage, PARSE_OPT_STOP_AT_NON_OPTION);
2718 
2719 	if (trace.trace_pgfaults) {
2720 		trace.opts.sample_address = true;
2721 		trace.opts.sample_time = true;
2722 	}
2723 
2724 	if (trace.evlist->nr_entries > 0)
2725 		evlist__set_evsel_handler(trace.evlist, trace__event_handler);
2726 
2727 	if ((argc >= 1) && (strcmp(argv[0], "record") == 0))
2728 		return trace__record(&trace, argc-1, &argv[1]);
2729 
2730 	/* summary_only implies summary option, but don't overwrite summary if set */
2731 	if (trace.summary_only)
2732 		trace.summary = trace.summary_only;
2733 
2734 	if (!trace.trace_syscalls && !trace.trace_pgfaults &&
2735 	    trace.evlist->nr_entries == 0 /* Was --events used? */) {
2736 		pr_err("Please specify something to trace.\n");
2737 		return -1;
2738 	}
2739 
2740 	if (output_name != NULL) {
2741 		err = trace__open_output(&trace, output_name);
2742 		if (err < 0) {
2743 			perror("failed to create output file");
2744 			goto out;
2745 		}
2746 	}
2747 
2748 	if (ev_qualifier_str != NULL) {
2749 		const char *s = ev_qualifier_str;
2750 
2751 		trace.not_ev_qualifier = *s == '!';
2752 		if (trace.not_ev_qualifier)
2753 			++s;
2754 		trace.ev_qualifier = strlist__new(true, s);
2755 		if (trace.ev_qualifier == NULL) {
2756 			fputs("Not enough memory to parse event qualifier",
2757 			      trace.output);
2758 			err = -ENOMEM;
2759 			goto out_close;
2760 		}
2761 	}
2762 
2763 	err = target__validate(&trace.opts.target);
2764 	if (err) {
2765 		target__strerror(&trace.opts.target, err, bf, sizeof(bf));
2766 		fprintf(trace.output, "%s", bf);
2767 		goto out_close;
2768 	}
2769 
2770 	err = target__parse_uid(&trace.opts.target);
2771 	if (err) {
2772 		target__strerror(&trace.opts.target, err, bf, sizeof(bf));
2773 		fprintf(trace.output, "%s", bf);
2774 		goto out_close;
2775 	}
2776 
2777 	if (!argc && target__none(&trace.opts.target))
2778 		trace.opts.target.system_wide = true;
2779 
2780 	if (input_name)
2781 		err = trace__replay(&trace);
2782 	else
2783 		err = trace__run(&trace, argc, argv);
2784 
2785 out_close:
2786 	if (output_name != NULL)
2787 		fclose(trace.output);
2788 out:
2789 	return err;
2790 }
2791