xref: /openbmc/linux/tools/perf/builtin-trace.c (revision d7a3d85e)
1 #include <traceevent/event-parse.h>
2 #include "builtin.h"
3 #include "util/color.h"
4 #include "util/debug.h"
5 #include "util/evlist.h"
6 #include "util/machine.h"
7 #include "util/session.h"
8 #include "util/thread.h"
9 #include "util/parse-options.h"
10 #include "util/strlist.h"
11 #include "util/intlist.h"
12 #include "util/thread_map.h"
13 #include "util/stat.h"
14 #include "trace-event.h"
15 #include "util/parse-events.h"
16 
17 #include <libaudit.h>
18 #include <stdlib.h>
19 #include <sys/mman.h>
20 #include <linux/futex.h>
21 
/*
 * For older distros: fallback values for constants that the system headers
 * included above may not provide. Each one is defined only when it is still
 * missing, so a header-provided definition always wins.
 */

/* mmap() flag */
#ifndef MAP_STACK
# define MAP_STACK		0x20000
#endif

/* madvise() behaviors */
#ifndef MADV_HWPOISON
# define MADV_HWPOISON		100
#endif

#ifndef MADV_MERGEABLE
# define MADV_MERGEABLE		12
#endif

#ifndef MADV_UNMERGEABLE
# define MADV_UNMERGEABLE	13
#endif

/* eventfd flags */
#ifndef EFD_SEMAPHORE
# define EFD_SEMAPHORE		1
#endif

#ifndef EFD_NONBLOCK
# define EFD_NONBLOCK		00004000
#endif

#ifndef EFD_CLOEXEC
# define EFD_CLOEXEC		02000000
#endif

/* open()/pipe2() flag */
#ifndef O_CLOEXEC
# define O_CLOEXEC		02000000
#endif

/* socket() type and type flags */
#ifndef SOCK_DCCP
# define SOCK_DCCP		6
#endif

#ifndef SOCK_CLOEXEC
# define SOCK_CLOEXEC		02000000
#endif

#ifndef SOCK_NONBLOCK
# define SOCK_NONBLOCK		00004000
#endif

/* recvmsg() flag */
#ifndef MSG_CMSG_CLOEXEC
# define MSG_CMSG_CLOEXEC	0x40000000
#endif
70 
/*
 * Accessor for one tracepoint field: where the field lives inside
 * sample->raw_data and which function reads it. Only one of the two
 * readers is set, depending on how the field was initialised
 * (tp_field__init_uint() or tp_field__init_ptr()).
 */
struct tp_field {
	int offset;	/* byte offset of the field inside sample->raw_data */
	union {
		/* returns the field value widened to u64 */
		u64 (*integer)(struct tp_field *field, struct perf_sample *sample);
		/* returns a pointer to the field inside the raw data */
		void *(*pointer)(struct tp_field *field, struct perf_sample *sample);
	};
};
78 
79 #define TP_UINT_FIELD(bits) \
80 static u64 tp_field__u##bits(struct tp_field *field, struct perf_sample *sample) \
81 { \
82 	u##bits value; \
83 	memcpy(&value, sample->raw_data + field->offset, sizeof(value)); \
84 	return value;  \
85 }
86 
87 TP_UINT_FIELD(8);
88 TP_UINT_FIELD(16);
89 TP_UINT_FIELD(32);
90 TP_UINT_FIELD(64);
91 
92 #define TP_UINT_FIELD__SWAPPED(bits) \
93 static u64 tp_field__swapped_u##bits(struct tp_field *field, struct perf_sample *sample) \
94 { \
95 	u##bits value; \
96 	memcpy(&value, sample->raw_data + field->offset, sizeof(value)); \
97 	return bswap_##bits(value);\
98 }
99 
100 TP_UINT_FIELD__SWAPPED(16);
101 TP_UINT_FIELD__SWAPPED(32);
102 TP_UINT_FIELD__SWAPPED(64);
103 
104 static int tp_field__init_uint(struct tp_field *field,
105 			       struct format_field *format_field,
106 			       bool needs_swap)
107 {
108 	field->offset = format_field->offset;
109 
110 	switch (format_field->size) {
111 	case 1:
112 		field->integer = tp_field__u8;
113 		break;
114 	case 2:
115 		field->integer = needs_swap ? tp_field__swapped_u16 : tp_field__u16;
116 		break;
117 	case 4:
118 		field->integer = needs_swap ? tp_field__swapped_u32 : tp_field__u32;
119 		break;
120 	case 8:
121 		field->integer = needs_swap ? tp_field__swapped_u64 : tp_field__u64;
122 		break;
123 	default:
124 		return -1;
125 	}
126 
127 	return 0;
128 }
129 
130 static void *tp_field__ptr(struct tp_field *field, struct perf_sample *sample)
131 {
132 	return sample->raw_data + field->offset;
133 }
134 
135 static int tp_field__init_ptr(struct tp_field *field, struct format_field *format_field)
136 {
137 	field->offset = format_field->offset;
138 	field->pointer = tp_field__ptr;
139 	return 0;
140 }
141 
/*
 * Per-evsel private data (stored in evsel->priv) for the syscall enter/exit
 * tracepoints: an accessor for the "id" field plus one more accessor that is
 * "args" for sys_enter or "ret" for sys_exit. The two share storage, since a
 * given evsel only uses one of them.
 */
struct syscall_tp {
	struct tp_field id;
	union {
		struct tp_field args, ret;
	};
};
148 
149 static int perf_evsel__init_tp_uint_field(struct perf_evsel *evsel,
150 					  struct tp_field *field,
151 					  const char *name)
152 {
153 	struct format_field *format_field = perf_evsel__field(evsel, name);
154 
155 	if (format_field == NULL)
156 		return -1;
157 
158 	return tp_field__init_uint(field, format_field, evsel->needs_swap);
159 }
160 
161 #define perf_evsel__init_sc_tp_uint_field(evsel, name) \
162 	({ struct syscall_tp *sc = evsel->priv;\
163 	   perf_evsel__init_tp_uint_field(evsel, &sc->name, #name); })
164 
/*
 * Look up the tracepoint field called @name in @evsel and initialise @field
 * as a pointer reader for it.
 *
 * Returns -1 when the field does not exist, otherwise whatever
 * tp_field__init_ptr() returns.
 */
static int perf_evsel__init_tp_ptr_field(struct perf_evsel *evsel,
					 struct tp_field *field,
					 const char *name)
{
	struct format_field *fmt = perf_evsel__field(evsel, name);

	if (!fmt)
		return -1;

	return tp_field__init_ptr(field, fmt);
}

/*
 * Initialise the struct syscall_tp member called @name (hanging off
 * evsel->priv) as a pointer to the tracepoint field with that same name.
 */
#define perf_evsel__init_sc_tp_ptr_field(evsel, name) \
	({ struct syscall_tp *sc = evsel->priv;\
	   perf_evsel__init_tp_ptr_field(evsel, &sc->name, #name); })
180 
/*
 * Release the private data attached to @evsel and then the evsel itself.
 * evsel->priv must be released first, as it is reached through @evsel.
 */
static void perf_evsel__delete_priv(struct perf_evsel *evsel)
{
	zfree(&evsel->priv);
	perf_evsel__delete(evsel);
}
186 
187 static int perf_evsel__init_syscall_tp(struct perf_evsel *evsel, void *handler)
188 {
189 	evsel->priv = malloc(sizeof(struct syscall_tp));
190 	if (evsel->priv != NULL) {
191 		if (perf_evsel__init_sc_tp_uint_field(evsel, id))
192 			goto out_delete;
193 
194 		evsel->handler = handler;
195 		return 0;
196 	}
197 
198 	return -ENOMEM;
199 
200 out_delete:
201 	zfree(&evsel->priv);
202 	return -ENOENT;
203 }
204 
/*
 * Create an evsel for the syscall tracepoint named @direction and prepare it
 * with perf_evsel__init_syscall_tp().
 *
 * The "raw_syscalls" subsystem is tried first, then "syscalls".
 * Returns the new evsel, or NULL when neither tracepoint could be created or
 * the initialisation failed (the evsel is deleted in the latter case).
 */
static struct perf_evsel *perf_evsel__syscall_newtp(const char *direction, void *handler)
{
	struct perf_evsel *evsel = perf_evsel__newtp("raw_syscalls", direction);

	/* older kernel (e.g., RHEL6) use syscalls:{enter,exit} */
	if (evsel == NULL)
		evsel = perf_evsel__newtp("syscalls", direction);

	if (evsel == NULL)
		return NULL;

	if (perf_evsel__init_syscall_tp(evsel, handler)) {
		perf_evsel__delete_priv(evsel);
		return NULL;
	}

	return evsel;
}
224 
/*
 * Read the struct syscall_tp member @name of @evsel from @sample through its
 * integer reader; evaluates to the u64 the reader returns.
 */
#define perf_evsel__sc_tp_uint(evsel, name, sample) \
	({ struct syscall_tp *fields = evsel->priv; \
	   fields->name.integer(&fields->name, sample); })

/*
 * Same as above, but through the pointer reader; evaluates to the void *
 * the reader returns.
 */
#define perf_evsel__sc_tp_ptr(evsel, name, sample) \
	({ struct syscall_tp *fields = evsel->priv; \
	   fields->name.pointer(&fields->name, sample); })
232 
233 static int perf_evlist__add_syscall_newtp(struct perf_evlist *evlist,
234 					  void *sys_enter_handler,
235 					  void *sys_exit_handler)
236 {
237 	int ret = -1;
238 	struct perf_evsel *sys_enter, *sys_exit;
239 
240 	sys_enter = perf_evsel__syscall_newtp("sys_enter", sys_enter_handler);
241 	if (sys_enter == NULL)
242 		goto out;
243 
244 	if (perf_evsel__init_sc_tp_ptr_field(sys_enter, args))
245 		goto out_delete_sys_enter;
246 
247 	sys_exit = perf_evsel__syscall_newtp("sys_exit", sys_exit_handler);
248 	if (sys_exit == NULL)
249 		goto out_delete_sys_enter;
250 
251 	if (perf_evsel__init_sc_tp_uint_field(sys_exit, ret))
252 		goto out_delete_sys_exit;
253 
254 	perf_evlist__add(evlist, sys_enter);
255 	perf_evlist__add(evlist, sys_exit);
256 
257 	ret = 0;
258 out:
259 	return ret;
260 
261 out_delete_sys_exit:
262 	perf_evsel__delete_priv(sys_exit);
263 out_delete_sys_enter:
264 	perf_evsel__delete_priv(sys_enter);
265 	goto out;
266 }
267 
268 
/*
 * Everything an argument formatter (the syscall_arg__scnprintf_*() helpers)
 * gets to see about the argument it is asked to print.
 */
struct syscall_arg {
	unsigned long val;	/* raw value of the argument */
	struct thread *thread;
	struct trace  *trace;
	void	      *parm;	/* formatter specific data, e.g. a struct strarray */
	u8	      idx;	/* position of this argument in the syscall */
	/*
	 * One bit per argument position; formatters may set bits for other
	 * arguments (see the open flags and futex op formatters).
	 * NOTE(review): presumably the caller skips printing masked
	 * arguments - confirm against the code that consumes it.
	 */
	u8	      mask;
};
277 
/*
 * Table used to turn an integer into a name: value v maps to
 * entries[v - offset], as long as that index is within nr_entries.
 */
struct strarray {
	int	    offset;	/* value that corresponds to entries[0] */
	int	    nr_entries;	/* number of elements in entries */
	const char **entries;
};

/* Define strarray__<array>, wrapping the string array @array, offset 0. */
#define DEFINE_STRARRAY(array) struct strarray strarray__##array = { \
	.nr_entries = ARRAY_SIZE(array), \
	.entries = array, \
}

/* Same as DEFINE_STRARRAY(), but entries[0] stands for the value @off. */
#define DEFINE_STRARRAY_OFFSET(array, off) struct strarray strarray__##array = { \
	.offset	    = off, \
	.nr_entries = ARRAY_SIZE(array), \
	.entries = array, \
}
294 
295 static size_t __syscall_arg__scnprintf_strarray(char *bf, size_t size,
296 						const char *intfmt,
297 					        struct syscall_arg *arg)
298 {
299 	struct strarray *sa = arg->parm;
300 	int idx = arg->val - sa->offset;
301 
302 	if (idx < 0 || idx >= sa->nr_entries)
303 		return scnprintf(bf, size, intfmt, arg->val);
304 
305 	return scnprintf(bf, size, "%s", sa->entries[idx]);
306 }
307 
/*
 * strarray formatter that prints values without a name in decimal.
 * arg->parm must point to the struct strarray to use.
 */
static size_t syscall_arg__scnprintf_strarray(char *bf, size_t size,
					      struct syscall_arg *arg)
{
	const char *fallback_fmt = "%d";

	return __syscall_arg__scnprintf_strarray(bf, size, fallback_fmt, arg);
}

#define SCA_STRARRAY syscall_arg__scnprintf_strarray
315 
#if defined(__i386__) || defined(__x86_64__)
/*
 * strarray formatter that prints values without a name in hexadecimal.
 * arg->parm must point to the struct strarray to use.
 *
 * FIXME: Make this available to all arches as soon as the ioctl beautifier
 * 	  gets rewritten to support all arches.
 */
static size_t syscall_arg__scnprintf_strhexarray(char *bf, size_t size,
						 struct syscall_arg *arg)
{
	const char *fallback_fmt = "%#x";

	return __syscall_arg__scnprintf_strarray(bf, size, fallback_fmt, arg);
}

#define SCA_STRHEXARRAY syscall_arg__scnprintf_strhexarray
#endif /* defined(__i386__) || defined(__x86_64__) */
329 
330 static size_t syscall_arg__scnprintf_fd(char *bf, size_t size,
331 					struct syscall_arg *arg);
332 
333 #define SCA_FD syscall_arg__scnprintf_fd
334 
335 static size_t syscall_arg__scnprintf_fd_at(char *bf, size_t size,
336 					   struct syscall_arg *arg)
337 {
338 	int fd = arg->val;
339 
340 	if (fd == AT_FDCWD)
341 		return scnprintf(bf, size, "CWD");
342 
343 	return syscall_arg__scnprintf_fd(bf, size, arg);
344 }
345 
346 #define SCA_FDAT syscall_arg__scnprintf_fd_at
347 
/*
 * Formatter for the fd argument of close(); only declared here, the
 * definition is not in this part of the file.
 */
static size_t syscall_arg__scnprintf_close_fd(char *bf, size_t size,
					      struct syscall_arg *arg);

#define SCA_CLOSE_FD syscall_arg__scnprintf_close_fd
352 
353 static size_t syscall_arg__scnprintf_hex(char *bf, size_t size,
354 					 struct syscall_arg *arg)
355 {
356 	return scnprintf(bf, size, "%#lx", arg->val);
357 }
358 
359 #define SCA_HEX syscall_arg__scnprintf_hex
360 
361 static size_t syscall_arg__scnprintf_mmap_prot(char *bf, size_t size,
362 					       struct syscall_arg *arg)
363 {
364 	int printed = 0, prot = arg->val;
365 
366 	if (prot == PROT_NONE)
367 		return scnprintf(bf, size, "NONE");
368 #define	P_MMAP_PROT(n) \
369 	if (prot & PROT_##n) { \
370 		printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
371 		prot &= ~PROT_##n; \
372 	}
373 
374 	P_MMAP_PROT(EXEC);
375 	P_MMAP_PROT(READ);
376 	P_MMAP_PROT(WRITE);
377 #ifdef PROT_SEM
378 	P_MMAP_PROT(SEM);
379 #endif
380 	P_MMAP_PROT(GROWSDOWN);
381 	P_MMAP_PROT(GROWSUP);
382 #undef P_MMAP_PROT
383 
384 	if (prot)
385 		printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", prot);
386 
387 	return printed;
388 }
389 
390 #define SCA_MMAP_PROT syscall_arg__scnprintf_mmap_prot
391 
392 static size_t syscall_arg__scnprintf_mmap_flags(char *bf, size_t size,
393 						struct syscall_arg *arg)
394 {
395 	int printed = 0, flags = arg->val;
396 
397 #define	P_MMAP_FLAG(n) \
398 	if (flags & MAP_##n) { \
399 		printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
400 		flags &= ~MAP_##n; \
401 	}
402 
403 	P_MMAP_FLAG(SHARED);
404 	P_MMAP_FLAG(PRIVATE);
405 #ifdef MAP_32BIT
406 	P_MMAP_FLAG(32BIT);
407 #endif
408 	P_MMAP_FLAG(ANONYMOUS);
409 	P_MMAP_FLAG(DENYWRITE);
410 	P_MMAP_FLAG(EXECUTABLE);
411 	P_MMAP_FLAG(FILE);
412 	P_MMAP_FLAG(FIXED);
413 	P_MMAP_FLAG(GROWSDOWN);
414 #ifdef MAP_HUGETLB
415 	P_MMAP_FLAG(HUGETLB);
416 #endif
417 	P_MMAP_FLAG(LOCKED);
418 	P_MMAP_FLAG(NONBLOCK);
419 	P_MMAP_FLAG(NORESERVE);
420 	P_MMAP_FLAG(POPULATE);
421 	P_MMAP_FLAG(STACK);
422 #ifdef MAP_UNINITIALIZED
423 	P_MMAP_FLAG(UNINITIALIZED);
424 #endif
425 #undef P_MMAP_FLAG
426 
427 	if (flags)
428 		printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);
429 
430 	return printed;
431 }
432 
433 #define SCA_MMAP_FLAGS syscall_arg__scnprintf_mmap_flags
434 
435 static size_t syscall_arg__scnprintf_mremap_flags(char *bf, size_t size,
436 						  struct syscall_arg *arg)
437 {
438 	int printed = 0, flags = arg->val;
439 
440 #define P_MREMAP_FLAG(n) \
441 	if (flags & MREMAP_##n) { \
442 		printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
443 		flags &= ~MREMAP_##n; \
444 	}
445 
446 	P_MREMAP_FLAG(MAYMOVE);
447 #ifdef MREMAP_FIXED
448 	P_MREMAP_FLAG(FIXED);
449 #endif
450 #undef P_MREMAP_FLAG
451 
452 	if (flags)
453 		printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);
454 
455 	return printed;
456 }
457 
458 #define SCA_MREMAP_FLAGS syscall_arg__scnprintf_mremap_flags
459 
460 static size_t syscall_arg__scnprintf_madvise_behavior(char *bf, size_t size,
461 						      struct syscall_arg *arg)
462 {
463 	int behavior = arg->val;
464 
465 	switch (behavior) {
466 #define	P_MADV_BHV(n) case MADV_##n: return scnprintf(bf, size, #n)
467 	P_MADV_BHV(NORMAL);
468 	P_MADV_BHV(RANDOM);
469 	P_MADV_BHV(SEQUENTIAL);
470 	P_MADV_BHV(WILLNEED);
471 	P_MADV_BHV(DONTNEED);
472 	P_MADV_BHV(REMOVE);
473 	P_MADV_BHV(DONTFORK);
474 	P_MADV_BHV(DOFORK);
475 	P_MADV_BHV(HWPOISON);
476 #ifdef MADV_SOFT_OFFLINE
477 	P_MADV_BHV(SOFT_OFFLINE);
478 #endif
479 	P_MADV_BHV(MERGEABLE);
480 	P_MADV_BHV(UNMERGEABLE);
481 #ifdef MADV_HUGEPAGE
482 	P_MADV_BHV(HUGEPAGE);
483 #endif
484 #ifdef MADV_NOHUGEPAGE
485 	P_MADV_BHV(NOHUGEPAGE);
486 #endif
487 #ifdef MADV_DONTDUMP
488 	P_MADV_BHV(DONTDUMP);
489 #endif
490 #ifdef MADV_DODUMP
491 	P_MADV_BHV(DODUMP);
492 #endif
493 #undef P_MADV_PHV
494 	default: break;
495 	}
496 
497 	return scnprintf(bf, size, "%#x", behavior);
498 }
499 
500 #define SCA_MADV_BHV syscall_arg__scnprintf_madvise_behavior
501 
502 static size_t syscall_arg__scnprintf_flock(char *bf, size_t size,
503 					   struct syscall_arg *arg)
504 {
505 	int printed = 0, op = arg->val;
506 
507 	if (op == 0)
508 		return scnprintf(bf, size, "NONE");
509 #define	P_CMD(cmd) \
510 	if ((op & LOCK_##cmd) == LOCK_##cmd) { \
511 		printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #cmd); \
512 		op &= ~LOCK_##cmd; \
513 	}
514 
515 	P_CMD(SH);
516 	P_CMD(EX);
517 	P_CMD(NB);
518 	P_CMD(UN);
519 	P_CMD(MAND);
520 	P_CMD(RW);
521 	P_CMD(READ);
522 	P_CMD(WRITE);
523 #undef P_OP
524 
525 	if (op)
526 		printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", op);
527 
528 	return printed;
529 }
530 
531 #define SCA_FLOCK syscall_arg__scnprintf_flock
532 
533 static size_t syscall_arg__scnprintf_futex_op(char *bf, size_t size, struct syscall_arg *arg)
534 {
535 	enum syscall_futex_args {
536 		SCF_UADDR   = (1 << 0),
537 		SCF_OP	    = (1 << 1),
538 		SCF_VAL	    = (1 << 2),
539 		SCF_TIMEOUT = (1 << 3),
540 		SCF_UADDR2  = (1 << 4),
541 		SCF_VAL3    = (1 << 5),
542 	};
543 	int op = arg->val;
544 	int cmd = op & FUTEX_CMD_MASK;
545 	size_t printed = 0;
546 
547 	switch (cmd) {
548 #define	P_FUTEX_OP(n) case FUTEX_##n: printed = scnprintf(bf, size, #n);
549 	P_FUTEX_OP(WAIT);	    arg->mask |= SCF_VAL3|SCF_UADDR2;		  break;
550 	P_FUTEX_OP(WAKE);	    arg->mask |= SCF_VAL3|SCF_UADDR2|SCF_TIMEOUT; break;
551 	P_FUTEX_OP(FD);		    arg->mask |= SCF_VAL3|SCF_UADDR2|SCF_TIMEOUT; break;
552 	P_FUTEX_OP(REQUEUE);	    arg->mask |= SCF_VAL3|SCF_TIMEOUT;	          break;
553 	P_FUTEX_OP(CMP_REQUEUE);    arg->mask |= SCF_TIMEOUT;			  break;
554 	P_FUTEX_OP(CMP_REQUEUE_PI); arg->mask |= SCF_TIMEOUT;			  break;
555 	P_FUTEX_OP(WAKE_OP);							  break;
556 	P_FUTEX_OP(LOCK_PI);	    arg->mask |= SCF_VAL3|SCF_UADDR2|SCF_TIMEOUT; break;
557 	P_FUTEX_OP(UNLOCK_PI);	    arg->mask |= SCF_VAL3|SCF_UADDR2|SCF_TIMEOUT; break;
558 	P_FUTEX_OP(TRYLOCK_PI);	    arg->mask |= SCF_VAL3|SCF_UADDR2;		  break;
559 	P_FUTEX_OP(WAIT_BITSET);    arg->mask |= SCF_UADDR2;			  break;
560 	P_FUTEX_OP(WAKE_BITSET);    arg->mask |= SCF_UADDR2;			  break;
561 	P_FUTEX_OP(WAIT_REQUEUE_PI);						  break;
562 	default: printed = scnprintf(bf, size, "%#x", cmd);			  break;
563 	}
564 
565 	if (op & FUTEX_PRIVATE_FLAG)
566 		printed += scnprintf(bf + printed, size - printed, "|PRIV");
567 
568 	if (op & FUTEX_CLOCK_REALTIME)
569 		printed += scnprintf(bf + printed, size - printed, "|CLKRT");
570 
571 	return printed;
572 }
573 
574 #define SCA_FUTEX_OP  syscall_arg__scnprintf_futex_op
575 
/*
 * Name tables for integer syscall arguments. The position of a string in
 * its array is the value it stands for (minus the offset, where one is
 * given); each array is wrapped in a strarray__<name> for SCA_STRARRAY.
 */

/* epoll_ctl() op; the first entry stands for the value 1 */
static const char *epoll_ctl_ops[] = { "ADD", "DEL", "MOD", };
static DEFINE_STRARRAY_OFFSET(epoll_ctl_ops, 1);

/* interval timer selector, see the "getitimer" entry in syscall_fmts */
static const char *itimers[] = { "REAL", "VIRTUAL", "PROF", };
static DEFINE_STRARRAY(itimers);

/* lseek() whence; DATA and HOLE only when the headers know about them */
static const char *whences[] = { "SET", "CUR", "END",
#ifdef SEEK_DATA
"DATA",
#endif
#ifdef SEEK_HOLE
"HOLE",
#endif
};
static DEFINE_STRARRAY(whences);

/* fcntl() cmd */
static const char *fcntl_cmds[] = {
	"DUPFD", "GETFD", "SETFD", "GETFL", "SETFL", "GETLK", "SETLK",
	"SETLKW", "SETOWN", "GETOWN", "SETSIG", "GETSIG", "F_GETLK64",
	"F_SETLK64", "F_SETLKW64", "F_SETOWN_EX", "F_GETOWN_EX",
	"F_GETOWNER_UIDS",
};
static DEFINE_STRARRAY(fcntl_cmds);

/* resource limit selector, see "getrlimit" and "prlimit64" in syscall_fmts */
static const char *rlimit_resources[] = {
	"CPU", "FSIZE", "DATA", "STACK", "CORE", "RSS", "NPROC", "NOFILE",
	"MEMLOCK", "AS", "LOCKS", "SIGPENDING", "MSGQUEUE", "NICE", "RTPRIO",
	"RTTIME",
};
static DEFINE_STRARRAY(rlimit_resources);

/* NOTE(review): looks like the 'how' of sigprocmask() - no user is visible here */
static const char *sighow[] = { "BLOCK", "UNBLOCK", "SETMASK", };
static DEFINE_STRARRAY(sighow);

/* clock id, see the "clock_gettime" entry in syscall_fmts */
static const char *clockid[] = {
	"REALTIME", "MONOTONIC", "PROCESS_CPUTIME_ID", "THREAD_CPUTIME_ID",
	"MONOTONIC_RAW", "REALTIME_COARSE", "MONOTONIC_COARSE",
};
static DEFINE_STRARRAY(clockid);
615 
/*
 * Socket address family names, indexed by the AF_* value (AF_UNSPEC == 0 up
 * to AF_VSOCK == 40). Every index needs an entry, even for families that are
 * rarely seen, or all the names after the gap end up on the wrong value:
 * "MPLS" is AF_MPLS (28), which sits between AF_IB (27) and AF_CAN (29).
 */
static const char *socket_families[] = {
	"UNSPEC", "LOCAL", "INET", "AX25", "IPX", "APPLETALK", "NETROM",
	"BRIDGE", "ATMPVC", "X25", "INET6", "ROSE", "DECnet", "NETBEUI",
	"SECURITY", "KEY", "NETLINK", "PACKET", "ASH", "ECONET", "ATMSVC",
	"RDS", "SNA", "IRDA", "PPPOX", "WANPIPE", "LLC", "IB", "MPLS", "CAN",
	"TIPC", "BLUETOOTH", "IUCV", "RXRPC", "ISDN", "PHONET", "IEEE802154",
	"CAIF", "ALG", "NFC", "VSOCK",
};
/* strarray__socket_families: socket_families wrapped for SCA_STRARRAY. */
static DEFINE_STRARRAY(socket_families);
625 
626 #ifndef SOCK_TYPE_MASK
627 #define SOCK_TYPE_MASK 0xf
628 #endif
629 
630 static size_t syscall_arg__scnprintf_socket_type(char *bf, size_t size,
631 						      struct syscall_arg *arg)
632 {
633 	size_t printed;
634 	int type = arg->val,
635 	    flags = type & ~SOCK_TYPE_MASK;
636 
637 	type &= SOCK_TYPE_MASK;
638 	/*
639  	 * Can't use a strarray, MIPS may override for ABI reasons.
640  	 */
641 	switch (type) {
642 #define	P_SK_TYPE(n) case SOCK_##n: printed = scnprintf(bf, size, #n); break;
643 	P_SK_TYPE(STREAM);
644 	P_SK_TYPE(DGRAM);
645 	P_SK_TYPE(RAW);
646 	P_SK_TYPE(RDM);
647 	P_SK_TYPE(SEQPACKET);
648 	P_SK_TYPE(DCCP);
649 	P_SK_TYPE(PACKET);
650 #undef P_SK_TYPE
651 	default:
652 		printed = scnprintf(bf, size, "%#x", type);
653 	}
654 
655 #define	P_SK_FLAG(n) \
656 	if (flags & SOCK_##n) { \
657 		printed += scnprintf(bf + printed, size - printed, "|%s", #n); \
658 		flags &= ~SOCK_##n; \
659 	}
660 
661 	P_SK_FLAG(CLOEXEC);
662 	P_SK_FLAG(NONBLOCK);
663 #undef P_SK_FLAG
664 
665 	if (flags)
666 		printed += scnprintf(bf + printed, size - printed, "|%#x", flags);
667 
668 	return printed;
669 }
670 
671 #define SCA_SK_TYPE syscall_arg__scnprintf_socket_type
672 
673 #ifndef MSG_PROBE
674 #define MSG_PROBE	     0x10
675 #endif
676 #ifndef MSG_WAITFORONE
677 #define MSG_WAITFORONE	0x10000
678 #endif
679 #ifndef MSG_SENDPAGE_NOTLAST
680 #define MSG_SENDPAGE_NOTLAST 0x20000
681 #endif
682 #ifndef MSG_FASTOPEN
683 #define MSG_FASTOPEN	     0x20000000
684 #endif
685 
686 static size_t syscall_arg__scnprintf_msg_flags(char *bf, size_t size,
687 					       struct syscall_arg *arg)
688 {
689 	int printed = 0, flags = arg->val;
690 
691 	if (flags == 0)
692 		return scnprintf(bf, size, "NONE");
693 #define	P_MSG_FLAG(n) \
694 	if (flags & MSG_##n) { \
695 		printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
696 		flags &= ~MSG_##n; \
697 	}
698 
699 	P_MSG_FLAG(OOB);
700 	P_MSG_FLAG(PEEK);
701 	P_MSG_FLAG(DONTROUTE);
702 	P_MSG_FLAG(TRYHARD);
703 	P_MSG_FLAG(CTRUNC);
704 	P_MSG_FLAG(PROBE);
705 	P_MSG_FLAG(TRUNC);
706 	P_MSG_FLAG(DONTWAIT);
707 	P_MSG_FLAG(EOR);
708 	P_MSG_FLAG(WAITALL);
709 	P_MSG_FLAG(FIN);
710 	P_MSG_FLAG(SYN);
711 	P_MSG_FLAG(CONFIRM);
712 	P_MSG_FLAG(RST);
713 	P_MSG_FLAG(ERRQUEUE);
714 	P_MSG_FLAG(NOSIGNAL);
715 	P_MSG_FLAG(MORE);
716 	P_MSG_FLAG(WAITFORONE);
717 	P_MSG_FLAG(SENDPAGE_NOTLAST);
718 	P_MSG_FLAG(FASTOPEN);
719 	P_MSG_FLAG(CMSG_CLOEXEC);
720 #undef P_MSG_FLAG
721 
722 	if (flags)
723 		printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);
724 
725 	return printed;
726 }
727 
728 #define SCA_MSG_FLAGS syscall_arg__scnprintf_msg_flags
729 
730 static size_t syscall_arg__scnprintf_access_mode(char *bf, size_t size,
731 						 struct syscall_arg *arg)
732 {
733 	size_t printed = 0;
734 	int mode = arg->val;
735 
736 	if (mode == F_OK) /* 0 */
737 		return scnprintf(bf, size, "F");
738 #define	P_MODE(n) \
739 	if (mode & n##_OK) { \
740 		printed += scnprintf(bf + printed, size - printed, "%s", #n); \
741 		mode &= ~n##_OK; \
742 	}
743 
744 	P_MODE(R);
745 	P_MODE(W);
746 	P_MODE(X);
747 #undef P_MODE
748 
749 	if (mode)
750 		printed += scnprintf(bf + printed, size - printed, "|%#x", mode);
751 
752 	return printed;
753 }
754 
755 #define SCA_ACCMODE syscall_arg__scnprintf_access_mode
756 
757 static size_t syscall_arg__scnprintf_open_flags(char *bf, size_t size,
758 					       struct syscall_arg *arg)
759 {
760 	int printed = 0, flags = arg->val;
761 
762 	if (!(flags & O_CREAT))
763 		arg->mask |= 1 << (arg->idx + 1); /* Mask the mode parm */
764 
765 	if (flags == 0)
766 		return scnprintf(bf, size, "RDONLY");
767 #define	P_FLAG(n) \
768 	if (flags & O_##n) { \
769 		printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
770 		flags &= ~O_##n; \
771 	}
772 
773 	P_FLAG(APPEND);
774 	P_FLAG(ASYNC);
775 	P_FLAG(CLOEXEC);
776 	P_FLAG(CREAT);
777 	P_FLAG(DIRECT);
778 	P_FLAG(DIRECTORY);
779 	P_FLAG(EXCL);
780 	P_FLAG(LARGEFILE);
781 	P_FLAG(NOATIME);
782 	P_FLAG(NOCTTY);
783 #ifdef O_NONBLOCK
784 	P_FLAG(NONBLOCK);
785 #elif O_NDELAY
786 	P_FLAG(NDELAY);
787 #endif
788 #ifdef O_PATH
789 	P_FLAG(PATH);
790 #endif
791 	P_FLAG(RDWR);
792 #ifdef O_DSYNC
793 	if ((flags & O_SYNC) == O_SYNC)
794 		printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", "SYNC");
795 	else {
796 		P_FLAG(DSYNC);
797 	}
798 #else
799 	P_FLAG(SYNC);
800 #endif
801 	P_FLAG(TRUNC);
802 	P_FLAG(WRONLY);
803 #undef P_FLAG
804 
805 	if (flags)
806 		printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);
807 
808 	return printed;
809 }
810 
811 #define SCA_OPEN_FLAGS syscall_arg__scnprintf_open_flags
812 
813 static size_t syscall_arg__scnprintf_eventfd_flags(char *bf, size_t size,
814 						   struct syscall_arg *arg)
815 {
816 	int printed = 0, flags = arg->val;
817 
818 	if (flags == 0)
819 		return scnprintf(bf, size, "NONE");
820 #define	P_FLAG(n) \
821 	if (flags & EFD_##n) { \
822 		printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
823 		flags &= ~EFD_##n; \
824 	}
825 
826 	P_FLAG(SEMAPHORE);
827 	P_FLAG(CLOEXEC);
828 	P_FLAG(NONBLOCK);
829 #undef P_FLAG
830 
831 	if (flags)
832 		printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);
833 
834 	return printed;
835 }
836 
837 #define SCA_EFD_FLAGS syscall_arg__scnprintf_eventfd_flags
838 
839 static size_t syscall_arg__scnprintf_pipe_flags(char *bf, size_t size,
840 						struct syscall_arg *arg)
841 {
842 	int printed = 0, flags = arg->val;
843 
844 #define	P_FLAG(n) \
845 	if (flags & O_##n) { \
846 		printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
847 		flags &= ~O_##n; \
848 	}
849 
850 	P_FLAG(CLOEXEC);
851 	P_FLAG(NONBLOCK);
852 #undef P_FLAG
853 
854 	if (flags)
855 		printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);
856 
857 	return printed;
858 }
859 
860 #define SCA_PIPE_FLAGS syscall_arg__scnprintf_pipe_flags
861 
862 static size_t syscall_arg__scnprintf_signum(char *bf, size_t size, struct syscall_arg *arg)
863 {
864 	int sig = arg->val;
865 
866 	switch (sig) {
867 #define	P_SIGNUM(n) case SIG##n: return scnprintf(bf, size, #n)
868 	P_SIGNUM(HUP);
869 	P_SIGNUM(INT);
870 	P_SIGNUM(QUIT);
871 	P_SIGNUM(ILL);
872 	P_SIGNUM(TRAP);
873 	P_SIGNUM(ABRT);
874 	P_SIGNUM(BUS);
875 	P_SIGNUM(FPE);
876 	P_SIGNUM(KILL);
877 	P_SIGNUM(USR1);
878 	P_SIGNUM(SEGV);
879 	P_SIGNUM(USR2);
880 	P_SIGNUM(PIPE);
881 	P_SIGNUM(ALRM);
882 	P_SIGNUM(TERM);
883 	P_SIGNUM(CHLD);
884 	P_SIGNUM(CONT);
885 	P_SIGNUM(STOP);
886 	P_SIGNUM(TSTP);
887 	P_SIGNUM(TTIN);
888 	P_SIGNUM(TTOU);
889 	P_SIGNUM(URG);
890 	P_SIGNUM(XCPU);
891 	P_SIGNUM(XFSZ);
892 	P_SIGNUM(VTALRM);
893 	P_SIGNUM(PROF);
894 	P_SIGNUM(WINCH);
895 	P_SIGNUM(IO);
896 	P_SIGNUM(PWR);
897 	P_SIGNUM(SYS);
898 #ifdef SIGEMT
899 	P_SIGNUM(EMT);
900 #endif
901 #ifdef SIGSTKFLT
902 	P_SIGNUM(STKFLT);
903 #endif
904 #ifdef SIGSWI
905 	P_SIGNUM(SWI);
906 #endif
907 	default: break;
908 	}
909 
910 	return scnprintf(bf, size, "%#x", sig);
911 }
912 
913 #define SCA_SIGNUM syscall_arg__scnprintf_signum
914 
915 #if defined(__i386__) || defined(__x86_64__)
916 /*
917  * FIXME: Make this available to all arches.
918  */
919 #define TCGETS		0x5401
920 
921 static const char *tioctls[] = {
922 	"TCGETS", "TCSETS", "TCSETSW", "TCSETSF", "TCGETA", "TCSETA", "TCSETAW",
923 	"TCSETAF", "TCSBRK", "TCXONC", "TCFLSH", "TIOCEXCL", "TIOCNXCL",
924 	"TIOCSCTTY", "TIOCGPGRP", "TIOCSPGRP", "TIOCOUTQ", "TIOCSTI",
925 	"TIOCGWINSZ", "TIOCSWINSZ", "TIOCMGET", "TIOCMBIS", "TIOCMBIC",
926 	"TIOCMSET", "TIOCGSOFTCAR", "TIOCSSOFTCAR", "FIONREAD", "TIOCLINUX",
927 	"TIOCCONS", "TIOCGSERIAL", "TIOCSSERIAL", "TIOCPKT", "FIONBIO",
928 	"TIOCNOTTY", "TIOCSETD", "TIOCGETD", "TCSBRKP", [0x27] = "TIOCSBRK",
929 	"TIOCCBRK", "TIOCGSID", "TCGETS2", "TCSETS2", "TCSETSW2", "TCSETSF2",
930 	"TIOCGRS485", "TIOCSRS485", "TIOCGPTN", "TIOCSPTLCK",
931 	"TIOCGDEV||TCGETX", "TCSETX", "TCSETXF", "TCSETXW", "TIOCSIG",
932 	"TIOCVHANGUP", "TIOCGPKT", "TIOCGPTLCK", "TIOCGEXCL",
933 	[0x50] = "FIONCLEX", "FIOCLEX", "FIOASYNC", "TIOCSERCONFIG",
934 	"TIOCSERGWILD", "TIOCSERSWILD", "TIOCGLCKTRMIOS", "TIOCSLCKTRMIOS",
935 	"TIOCSERGSTRUCT", "TIOCSERGETLSR", "TIOCSERGETMULTI", "TIOCSERSETMULTI",
936 	"TIOCMIWAIT", "TIOCGICOUNT", [0x60] = "FIOQSIZE",
937 };
938 
939 static DEFINE_STRARRAY_OFFSET(tioctls, 0x5401);
940 #endif /* defined(__i386__) || defined(__x86_64__) */
941 
/*
 * Shorthand for a struct syscall_fmt initializer: format argument number
 * @arg with SCA_STRARRAY, using strarray__<array> as its table. @name is not
 * used by the expansion, it only documents the argument at the use site.
 */
#define STRARRAY(arg, name, array) \
	  .arg_scnprintf = { [arg] = SCA_STRARRAY, }, \
	  .arg_parm	 = { [arg] = &strarray__##array, }
945 
946 static struct syscall_fmt {
947 	const char *name;
948 	const char *alias;
949 	size_t	   (*arg_scnprintf[6])(char *bf, size_t size, struct syscall_arg *arg);
950 	void	   *arg_parm[6];
951 	bool	   errmsg;
952 	bool	   timeout;
953 	bool	   hexret;
954 } syscall_fmts[] = {
955 	{ .name	    = "access",	    .errmsg = true,
956 	  .arg_scnprintf = { [1] = SCA_ACCMODE, /* mode */ }, },
957 	{ .name	    = "arch_prctl", .errmsg = true, .alias = "prctl", },
958 	{ .name	    = "brk",	    .hexret = true,
959 	  .arg_scnprintf = { [0] = SCA_HEX, /* brk */ }, },
960 	{ .name     = "clock_gettime",  .errmsg = true, STRARRAY(0, clk_id, clockid), },
961 	{ .name	    = "close",	    .errmsg = true,
962 	  .arg_scnprintf = { [0] = SCA_CLOSE_FD, /* fd */ }, },
963 	{ .name	    = "connect",    .errmsg = true, },
964 	{ .name	    = "dup",	    .errmsg = true,
965 	  .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
966 	{ .name	    = "dup2",	    .errmsg = true,
967 	  .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
968 	{ .name	    = "dup3",	    .errmsg = true,
969 	  .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
970 	{ .name	    = "epoll_ctl",  .errmsg = true, STRARRAY(1, op, epoll_ctl_ops), },
971 	{ .name	    = "eventfd2",   .errmsg = true,
972 	  .arg_scnprintf = { [1] = SCA_EFD_FLAGS, /* flags */ }, },
973 	{ .name	    = "faccessat",  .errmsg = true,
974 	  .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
975 	{ .name	    = "fadvise64",  .errmsg = true,
976 	  .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
977 	{ .name	    = "fallocate",  .errmsg = true,
978 	  .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
979 	{ .name	    = "fchdir",	    .errmsg = true,
980 	  .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
981 	{ .name	    = "fchmod",	    .errmsg = true,
982 	  .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
983 	{ .name	    = "fchmodat",   .errmsg = true,
984 	  .arg_scnprintf = { [0] = SCA_FDAT, /* fd */ }, },
985 	{ .name	    = "fchown",	    .errmsg = true,
986 	  .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
987 	{ .name	    = "fchownat",   .errmsg = true,
988 	  .arg_scnprintf = { [0] = SCA_FDAT, /* fd */ }, },
989 	{ .name	    = "fcntl",	    .errmsg = true,
990 	  .arg_scnprintf = { [0] = SCA_FD, /* fd */
991 			     [1] = SCA_STRARRAY, /* cmd */ },
992 	  .arg_parm	 = { [1] = &strarray__fcntl_cmds, /* cmd */ }, },
993 	{ .name	    = "fdatasync",  .errmsg = true,
994 	  .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
995 	{ .name	    = "flock",	    .errmsg = true,
996 	  .arg_scnprintf = { [0] = SCA_FD, /* fd */
997 			     [1] = SCA_FLOCK, /* cmd */ }, },
998 	{ .name	    = "fsetxattr",  .errmsg = true,
999 	  .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1000 	{ .name	    = "fstat",	    .errmsg = true, .alias = "newfstat",
1001 	  .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1002 	{ .name	    = "fstatat",    .errmsg = true, .alias = "newfstatat",
1003 	  .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
1004 	{ .name	    = "fstatfs",    .errmsg = true,
1005 	  .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1006 	{ .name	    = "fsync",    .errmsg = true,
1007 	  .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1008 	{ .name	    = "ftruncate", .errmsg = true,
1009 	  .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1010 	{ .name	    = "futex",	    .errmsg = true,
1011 	  .arg_scnprintf = { [1] = SCA_FUTEX_OP, /* op */ }, },
1012 	{ .name	    = "futimesat", .errmsg = true,
1013 	  .arg_scnprintf = { [0] = SCA_FDAT, /* fd */ }, },
1014 	{ .name	    = "getdents",   .errmsg = true,
1015 	  .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1016 	{ .name	    = "getdents64", .errmsg = true,
1017 	  .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1018 	{ .name	    = "getitimer",  .errmsg = true, STRARRAY(0, which, itimers), },
1019 	{ .name	    = "getrlimit",  .errmsg = true, STRARRAY(0, resource, rlimit_resources), },
1020 	{ .name	    = "ioctl",	    .errmsg = true,
1021 	  .arg_scnprintf = { [0] = SCA_FD, /* fd */
1022 #if defined(__i386__) || defined(__x86_64__)
1023 /*
1024  * FIXME: Make this available to all arches.
1025  */
1026 			     [1] = SCA_STRHEXARRAY, /* cmd */
1027 			     [2] = SCA_HEX, /* arg */ },
1028 	  .arg_parm	 = { [1] = &strarray__tioctls, /* cmd */ }, },
1029 #else
1030 			     [2] = SCA_HEX, /* arg */ }, },
1031 #endif
1032 	{ .name	    = "kill",	    .errmsg = true,
1033 	  .arg_scnprintf = { [1] = SCA_SIGNUM, /* sig */ }, },
1034 	{ .name	    = "linkat",	    .errmsg = true,
1035 	  .arg_scnprintf = { [0] = SCA_FDAT, /* fd */ }, },
1036 	{ .name	    = "lseek",	    .errmsg = true,
1037 	  .arg_scnprintf = { [0] = SCA_FD, /* fd */
1038 			     [2] = SCA_STRARRAY, /* whence */ },
1039 	  .arg_parm	 = { [2] = &strarray__whences, /* whence */ }, },
1040 	{ .name	    = "lstat",	    .errmsg = true, .alias = "newlstat", },
1041 	{ .name     = "madvise",    .errmsg = true,
1042 	  .arg_scnprintf = { [0] = SCA_HEX,	 /* start */
1043 			     [2] = SCA_MADV_BHV, /* behavior */ }, },
1044 	{ .name	    = "mkdirat",    .errmsg = true,
1045 	  .arg_scnprintf = { [0] = SCA_FDAT, /* fd */ }, },
1046 	{ .name	    = "mknodat",    .errmsg = true,
1047 	  .arg_scnprintf = { [0] = SCA_FDAT, /* fd */ }, },
1048 	{ .name	    = "mlock",	    .errmsg = true,
1049 	  .arg_scnprintf = { [0] = SCA_HEX, /* addr */ }, },
1050 	{ .name	    = "mlockall",   .errmsg = true,
1051 	  .arg_scnprintf = { [0] = SCA_HEX, /* addr */ }, },
1052 	{ .name	    = "mmap",	    .hexret = true,
1053 	  .arg_scnprintf = { [0] = SCA_HEX,	  /* addr */
1054 			     [2] = SCA_MMAP_PROT, /* prot */
1055 			     [3] = SCA_MMAP_FLAGS, /* flags */
1056 			     [4] = SCA_FD, 	  /* fd */ }, },
1057 	{ .name	    = "mprotect",   .errmsg = true,
1058 	  .arg_scnprintf = { [0] = SCA_HEX, /* start */
1059 			     [2] = SCA_MMAP_PROT, /* prot */ }, },
1060 	{ .name	    = "mremap",	    .hexret = true,
1061 	  .arg_scnprintf = { [0] = SCA_HEX, /* addr */
1062 			     [3] = SCA_MREMAP_FLAGS, /* flags */
1063 			     [4] = SCA_HEX, /* new_addr */ }, },
1064 	{ .name	    = "munlock",    .errmsg = true,
1065 	  .arg_scnprintf = { [0] = SCA_HEX, /* addr */ }, },
1066 	{ .name	    = "munmap",	    .errmsg = true,
1067 	  .arg_scnprintf = { [0] = SCA_HEX, /* addr */ }, },
1068 	{ .name	    = "name_to_handle_at", .errmsg = true,
1069 	  .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
1070 	{ .name	    = "newfstatat", .errmsg = true,
1071 	  .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
1072 	{ .name	    = "open",	    .errmsg = true,
1073 	  .arg_scnprintf = { [1] = SCA_OPEN_FLAGS, /* flags */ }, },
1074 	{ .name	    = "open_by_handle_at", .errmsg = true,
1075 	  .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */
1076 			     [2] = SCA_OPEN_FLAGS, /* flags */ }, },
1077 	{ .name	    = "openat",	    .errmsg = true,
1078 	  .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */
1079 			     [2] = SCA_OPEN_FLAGS, /* flags */ }, },
1080 	{ .name	    = "pipe2",	    .errmsg = true,
1081 	  .arg_scnprintf = { [1] = SCA_PIPE_FLAGS, /* flags */ }, },
1082 	{ .name	    = "poll",	    .errmsg = true, .timeout = true, },
1083 	{ .name	    = "ppoll",	    .errmsg = true, .timeout = true, },
1084 	{ .name	    = "pread",	    .errmsg = true, .alias = "pread64",
1085 	  .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1086 	{ .name	    = "preadv",	    .errmsg = true, .alias = "pread",
1087 	  .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1088 	{ .name	    = "prlimit64",  .errmsg = true, STRARRAY(1, resource, rlimit_resources), },
1089 	{ .name	    = "pwrite",	    .errmsg = true, .alias = "pwrite64",
1090 	  .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1091 	{ .name	    = "pwritev",    .errmsg = true,
1092 	  .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1093 	{ .name	    = "read",	    .errmsg = true,
1094 	  .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1095 	{ .name	    = "readlinkat", .errmsg = true,
1096 	  .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
1097 	{ .name	    = "readv",	    .errmsg = true,
1098 	  .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1099 	{ .name	    = "recvfrom",   .errmsg = true,
1100 	  .arg_scnprintf = { [3] = SCA_MSG_FLAGS, /* flags */ }, },
1101 	{ .name	    = "recvmmsg",   .errmsg = true,
1102 	  .arg_scnprintf = { [3] = SCA_MSG_FLAGS, /* flags */ }, },
1103 	{ .name	    = "recvmsg",    .errmsg = true,
1104 	  .arg_scnprintf = { [2] = SCA_MSG_FLAGS, /* flags */ }, },
1105 	{ .name	    = "renameat",   .errmsg = true,
1106 	  .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
1107 	{ .name	    = "rt_sigaction", .errmsg = true,
1108 	  .arg_scnprintf = { [0] = SCA_SIGNUM, /* sig */ }, },
1109 	{ .name	    = "rt_sigprocmask",  .errmsg = true, STRARRAY(0, how, sighow), },
1110 	{ .name	    = "rt_sigqueueinfo", .errmsg = true,
1111 	  .arg_scnprintf = { [1] = SCA_SIGNUM, /* sig */ }, },
1112 	{ .name	    = "rt_tgsigqueueinfo", .errmsg = true,
1113 	  .arg_scnprintf = { [2] = SCA_SIGNUM, /* sig */ }, },
1114 	{ .name	    = "select",	    .errmsg = true, .timeout = true, },
1115 	{ .name	    = "sendmmsg",    .errmsg = true,
1116 	  .arg_scnprintf = { [3] = SCA_MSG_FLAGS, /* flags */ }, },
1117 	{ .name	    = "sendmsg",    .errmsg = true,
1118 	  .arg_scnprintf = { [2] = SCA_MSG_FLAGS, /* flags */ }, },
1119 	{ .name	    = "sendto",	    .errmsg = true,
1120 	  .arg_scnprintf = { [3] = SCA_MSG_FLAGS, /* flags */ }, },
1121 	{ .name	    = "setitimer",  .errmsg = true, STRARRAY(0, which, itimers), },
1122 	{ .name	    = "setrlimit",  .errmsg = true, STRARRAY(0, resource, rlimit_resources), },
1123 	{ .name	    = "shutdown",   .errmsg = true,
1124 	  .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1125 	{ .name	    = "socket",	    .errmsg = true,
1126 	  .arg_scnprintf = { [0] = SCA_STRARRAY, /* family */
1127 			     [1] = SCA_SK_TYPE, /* type */ },
1128 	  .arg_parm	 = { [0] = &strarray__socket_families, /* family */ }, },
1129 	{ .name	    = "socketpair", .errmsg = true,
1130 	  .arg_scnprintf = { [0] = SCA_STRARRAY, /* family */
1131 			     [1] = SCA_SK_TYPE, /* type */ },
1132 	  .arg_parm	 = { [0] = &strarray__socket_families, /* family */ }, },
1133 	{ .name	    = "stat",	    .errmsg = true, .alias = "newstat", },
1134 	{ .name	    = "symlinkat",  .errmsg = true,
1135 	  .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
1136 	{ .name	    = "tgkill",	    .errmsg = true,
1137 	  .arg_scnprintf = { [2] = SCA_SIGNUM, /* sig */ }, },
1138 	{ .name	    = "tkill",	    .errmsg = true,
1139 	  .arg_scnprintf = { [1] = SCA_SIGNUM, /* sig */ }, },
1140 	{ .name	    = "uname",	    .errmsg = true, .alias = "newuname", },
1141 	{ .name	    = "unlinkat",   .errmsg = true,
1142 	  .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
1143 	{ .name	    = "utimensat",  .errmsg = true,
1144 	  .arg_scnprintf = { [0] = SCA_FDAT, /* dirfd */ }, },
1145 	{ .name	    = "write",	    .errmsg = true,
1146 	  .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1147 	{ .name	    = "writev",	    .errmsg = true,
1148 	  .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1149 };
1150 
1151 static int syscall_fmt__cmp(const void *name, const void *fmtp)
1152 {
1153 	const struct syscall_fmt *fmt = fmtp;
1154 	return strcmp(name, fmt->name);
1155 }
1156 
1157 static struct syscall_fmt *syscall_fmt__find(const char *name)
1158 {
1159 	const int nmemb = ARRAY_SIZE(syscall_fmts);
1160 	return bsearch(name, syscall_fmts, nmemb, sizeof(struct syscall_fmt), syscall_fmt__cmp);
1161 }
1162 
/*
 * Per-syscall-id state, stored in trace->syscalls.table[id] and populated by
 * trace__read_syscall_info() the first time that id is seen.
 */
struct syscall {
	/* Format of the sys_enter_<name> (or alias) tracepoint. */
	struct event_format *tp_format;
	/* Number of entries in @args ("nr" field already dropped, if present). */
	int nr_args;
	/* First argument field of @tp_format, linked through ->next. */
	struct format_field *args;
	/* Name returned by audit_syscall_to_name(); non-NULL once the slot is read. */
	const char *name;
	/* Set when the event qualifier excludes this syscall. */
	bool filtered;
	/* True for "exit" and "exit_group". */
	bool is_exit;
	/* Matching syscall_fmts entry, NULL when the syscall has none. */
	struct syscall_fmt *fmt;
	/* Per-argument formatter, indexed by argument position; NULL entries print "%ld". */
	size_t (**arg_scnprintf)(char *bf, size_t size, struct syscall_arg *arg);
	/* Per-argument formatter parameter, taken from fmt->arg_parm when @fmt is set. */
	void **arg_parm;
};
1174 
1175 static size_t fprintf_duration(unsigned long t, FILE *fp)
1176 {
1177 	double duration = (double)t / NSEC_PER_MSEC;
1178 	size_t printed = fprintf(fp, "(");
1179 
1180 	if (duration >= 1.0)
1181 		printed += color_fprintf(fp, PERF_COLOR_RED, "%6.3f ms", duration);
1182 	else if (duration >= 0.01)
1183 		printed += color_fprintf(fp, PERF_COLOR_YELLOW, "%6.3f ms", duration);
1184 	else
1185 		printed += color_fprintf(fp, PERF_COLOR_NORMAL, "%6.3f ms", duration);
1186 	return printed + fprintf(fp, "): ");
1187 }
1188 
1189 struct thread_trace {
1190 	u64		  entry_time;
1191 	u64		  exit_time;
1192 	bool		  entry_pending;
1193 	unsigned long	  nr_events;
1194 	unsigned long	  pfmaj, pfmin;
1195 	char		  *entry_str;
1196 	double		  runtime_ms;
1197 	struct {
1198 		int	  max;
1199 		char	  **table;
1200 	} paths;
1201 
1202 	struct intlist *syscall_stats;
1203 };
1204 
1205 static struct thread_trace *thread_trace__new(void)
1206 {
1207 	struct thread_trace *ttrace =  zalloc(sizeof(struct thread_trace));
1208 
1209 	if (ttrace)
1210 		ttrace->paths.max = -1;
1211 
1212 	ttrace->syscall_stats = intlist__new(NULL);
1213 
1214 	return ttrace;
1215 }
1216 
1217 static struct thread_trace *thread__trace(struct thread *thread, FILE *fp)
1218 {
1219 	struct thread_trace *ttrace;
1220 
1221 	if (thread == NULL)
1222 		goto fail;
1223 
1224 	if (thread__priv(thread) == NULL)
1225 		thread__set_priv(thread, thread_trace__new());
1226 
1227 	if (thread__priv(thread) == NULL)
1228 		goto fail;
1229 
1230 	ttrace = thread__priv(thread);
1231 	++ttrace->nr_events;
1232 
1233 	return ttrace;
1234 fail:
1235 	color_fprintf(fp, PERF_COLOR_RED,
1236 		      "WARNING: not enough memory, dropping samples!\n");
1237 	return NULL;
1238 }
1239 
1240 #define TRACE_PFMAJ		(1 << 0)
1241 #define TRACE_PFMIN		(1 << 1)
1242 
1243 struct trace {
1244 	struct perf_tool	tool;
1245 	struct {
1246 		int		machine;
1247 		int		open_id;
1248 	}			audit;
1249 	struct {
1250 		int		max;
1251 		struct syscall  *table;
1252 	} syscalls;
1253 	struct record_opts	opts;
1254 	struct perf_evlist	*evlist;
1255 	struct machine		*host;
1256 	struct thread		*current;
1257 	u64			base_time;
1258 	FILE			*output;
1259 	unsigned long		nr_events;
1260 	struct strlist		*ev_qualifier;
1261 	const char 		*last_vfs_getname;
1262 	struct intlist		*tid_list;
1263 	struct intlist		*pid_list;
1264 	struct {
1265 		size_t		nr;
1266 		pid_t		*entries;
1267 	}			filter_pids;
1268 	double			duration_filter;
1269 	double			runtime_ms;
1270 	struct {
1271 		u64		vfs_getname,
1272 				proc_getname;
1273 	} stats;
1274 	bool			not_ev_qualifier;
1275 	bool			live;
1276 	bool			full_time;
1277 	bool			sched;
1278 	bool			multiple_threads;
1279 	bool			summary;
1280 	bool			summary_only;
1281 	bool			show_comm;
1282 	bool			show_tool_stats;
1283 	bool			trace_syscalls;
1284 	bool			force;
1285 	int			trace_pgfaults;
1286 };
1287 
1288 static int trace__set_fd_pathname(struct thread *thread, int fd, const char *pathname)
1289 {
1290 	struct thread_trace *ttrace = thread__priv(thread);
1291 
1292 	if (fd > ttrace->paths.max) {
1293 		char **npath = realloc(ttrace->paths.table, (fd + 1) * sizeof(char *));
1294 
1295 		if (npath == NULL)
1296 			return -1;
1297 
1298 		if (ttrace->paths.max != -1) {
1299 			memset(npath + ttrace->paths.max + 1, 0,
1300 			       (fd - ttrace->paths.max) * sizeof(char *));
1301 		} else {
1302 			memset(npath, 0, (fd + 1) * sizeof(char *));
1303 		}
1304 
1305 		ttrace->paths.table = npath;
1306 		ttrace->paths.max   = fd;
1307 	}
1308 
1309 	ttrace->paths.table[fd] = strdup(pathname);
1310 
1311 	return ttrace->paths.table[fd] != NULL ? 0 : -1;
1312 }
1313 
1314 static int thread__read_fd_path(struct thread *thread, int fd)
1315 {
1316 	char linkname[PATH_MAX], pathname[PATH_MAX];
1317 	struct stat st;
1318 	int ret;
1319 
1320 	if (thread->pid_ == thread->tid) {
1321 		scnprintf(linkname, sizeof(linkname),
1322 			  "/proc/%d/fd/%d", thread->pid_, fd);
1323 	} else {
1324 		scnprintf(linkname, sizeof(linkname),
1325 			  "/proc/%d/task/%d/fd/%d", thread->pid_, thread->tid, fd);
1326 	}
1327 
1328 	if (lstat(linkname, &st) < 0 || st.st_size + 1 > (off_t)sizeof(pathname))
1329 		return -1;
1330 
1331 	ret = readlink(linkname, pathname, sizeof(pathname));
1332 
1333 	if (ret < 0 || ret > st.st_size)
1334 		return -1;
1335 
1336 	pathname[ret] = '\0';
1337 	return trace__set_fd_pathname(thread, fd, pathname);
1338 }
1339 
1340 static const char *thread__fd_path(struct thread *thread, int fd,
1341 				   struct trace *trace)
1342 {
1343 	struct thread_trace *ttrace = thread__priv(thread);
1344 
1345 	if (ttrace == NULL)
1346 		return NULL;
1347 
1348 	if (fd < 0)
1349 		return NULL;
1350 
1351 	if ((fd > ttrace->paths.max || ttrace->paths.table[fd] == NULL)) {
1352 		if (!trace->live)
1353 			return NULL;
1354 		++trace->stats.proc_getname;
1355 		if (thread__read_fd_path(thread, fd))
1356 			return NULL;
1357 	}
1358 
1359 	return ttrace->paths.table[fd];
1360 }
1361 
1362 static size_t syscall_arg__scnprintf_fd(char *bf, size_t size,
1363 					struct syscall_arg *arg)
1364 {
1365 	int fd = arg->val;
1366 	size_t printed = scnprintf(bf, size, "%d", fd);
1367 	const char *path = thread__fd_path(arg->thread, fd, arg->trace);
1368 
1369 	if (path)
1370 		printed += scnprintf(bf + printed, size - printed, "<%s>", path);
1371 
1372 	return printed;
1373 }
1374 
1375 static size_t syscall_arg__scnprintf_close_fd(char *bf, size_t size,
1376 					      struct syscall_arg *arg)
1377 {
1378 	int fd = arg->val;
1379 	size_t printed = syscall_arg__scnprintf_fd(bf, size, arg);
1380 	struct thread_trace *ttrace = thread__priv(arg->thread);
1381 
1382 	if (ttrace && fd >= 0 && fd <= ttrace->paths.max)
1383 		zfree(&ttrace->paths.table[fd]);
1384 
1385 	return printed;
1386 }
1387 
1388 static bool trace__filter_duration(struct trace *trace, double t)
1389 {
1390 	return t < (trace->duration_filter * NSEC_PER_MSEC);
1391 }
1392 
1393 static size_t trace__fprintf_tstamp(struct trace *trace, u64 tstamp, FILE *fp)
1394 {
1395 	double ts = (double)(tstamp - trace->base_time) / NSEC_PER_MSEC;
1396 
1397 	return fprintf(fp, "%10.3f ", ts);
1398 }
1399 
/*
 * Flags written by sig_handler().  They are modified asynchronously from a
 * signal handler, so they are declared volatile sig_atomic_t, the only
 * object type the C standard guarantees can be safely written there.
 */
static volatile sig_atomic_t done = false;
static volatile sig_atomic_t interrupted = false;

/*
 * Signal handler: request termination, and record whether the request came
 * from SIGINT.
 */
static void sig_handler(int sig)
{
	done = true;
	interrupted = sig == SIGINT;
}
1408 
1409 static size_t trace__fprintf_entry_head(struct trace *trace, struct thread *thread,
1410 					u64 duration, u64 tstamp, FILE *fp)
1411 {
1412 	size_t printed = trace__fprintf_tstamp(trace, tstamp, fp);
1413 	printed += fprintf_duration(duration, fp);
1414 
1415 	if (trace->multiple_threads) {
1416 		if (trace->show_comm)
1417 			printed += fprintf(fp, "%.14s/", thread__comm_str(thread));
1418 		printed += fprintf(fp, "%d ", thread->tid);
1419 	}
1420 
1421 	return printed;
1422 }
1423 
1424 static int trace__process_event(struct trace *trace, struct machine *machine,
1425 				union perf_event *event, struct perf_sample *sample)
1426 {
1427 	int ret = 0;
1428 
1429 	switch (event->header.type) {
1430 	case PERF_RECORD_LOST:
1431 		color_fprintf(trace->output, PERF_COLOR_RED,
1432 			      "LOST %" PRIu64 " events!\n", event->lost.lost);
1433 		ret = machine__process_lost_event(machine, event, sample);
1434 	default:
1435 		ret = machine__process_event(machine, event, sample);
1436 		break;
1437 	}
1438 
1439 	return ret;
1440 }
1441 
1442 static int trace__tool_process(struct perf_tool *tool,
1443 			       union perf_event *event,
1444 			       struct perf_sample *sample,
1445 			       struct machine *machine)
1446 {
1447 	struct trace *trace = container_of(tool, struct trace, tool);
1448 	return trace__process_event(trace, machine, event, sample);
1449 }
1450 
1451 static int trace__symbols_init(struct trace *trace, struct perf_evlist *evlist)
1452 {
1453 	int err = symbol__init(NULL);
1454 
1455 	if (err)
1456 		return err;
1457 
1458 	trace->host = machine__new_host();
1459 	if (trace->host == NULL)
1460 		return -ENOMEM;
1461 
1462 	err = __machine__synthesize_threads(trace->host, &trace->tool, &trace->opts.target,
1463 					    evlist->threads, trace__tool_process, false);
1464 	if (err)
1465 		symbol__exit();
1466 
1467 	return err;
1468 }
1469 
1470 static int syscall__set_arg_fmts(struct syscall *sc)
1471 {
1472 	struct format_field *field;
1473 	int idx = 0;
1474 
1475 	sc->arg_scnprintf = calloc(sc->nr_args, sizeof(void *));
1476 	if (sc->arg_scnprintf == NULL)
1477 		return -1;
1478 
1479 	if (sc->fmt)
1480 		sc->arg_parm = sc->fmt->arg_parm;
1481 
1482 	for (field = sc->args; field; field = field->next) {
1483 		if (sc->fmt && sc->fmt->arg_scnprintf[idx])
1484 			sc->arg_scnprintf[idx] = sc->fmt->arg_scnprintf[idx];
1485 		else if (field->flags & FIELD_IS_POINTER)
1486 			sc->arg_scnprintf[idx] = syscall_arg__scnprintf_hex;
1487 		++idx;
1488 	}
1489 
1490 	return 0;
1491 }
1492 
1493 static int trace__read_syscall_info(struct trace *trace, int id)
1494 {
1495 	char tp_name[128];
1496 	struct syscall *sc;
1497 	const char *name = audit_syscall_to_name(id, trace->audit.machine);
1498 
1499 	if (name == NULL)
1500 		return -1;
1501 
1502 	if (id > trace->syscalls.max) {
1503 		struct syscall *nsyscalls = realloc(trace->syscalls.table, (id + 1) * sizeof(*sc));
1504 
1505 		if (nsyscalls == NULL)
1506 			return -1;
1507 
1508 		if (trace->syscalls.max != -1) {
1509 			memset(nsyscalls + trace->syscalls.max + 1, 0,
1510 			       (id - trace->syscalls.max) * sizeof(*sc));
1511 		} else {
1512 			memset(nsyscalls, 0, (id + 1) * sizeof(*sc));
1513 		}
1514 
1515 		trace->syscalls.table = nsyscalls;
1516 		trace->syscalls.max   = id;
1517 	}
1518 
1519 	sc = trace->syscalls.table + id;
1520 	sc->name = name;
1521 
1522 	if (trace->ev_qualifier) {
1523 		bool in = strlist__find(trace->ev_qualifier, name) != NULL;
1524 
1525 		if (!(in ^ trace->not_ev_qualifier)) {
1526 			sc->filtered = true;
1527 			/*
1528 			 * No need to do read tracepoint information since this will be
1529 			 * filtered out.
1530 			 */
1531 			return 0;
1532 		}
1533 	}
1534 
1535 	sc->fmt  = syscall_fmt__find(sc->name);
1536 
1537 	snprintf(tp_name, sizeof(tp_name), "sys_enter_%s", sc->name);
1538 	sc->tp_format = trace_event__tp_format("syscalls", tp_name);
1539 
1540 	if (sc->tp_format == NULL && sc->fmt && sc->fmt->alias) {
1541 		snprintf(tp_name, sizeof(tp_name), "sys_enter_%s", sc->fmt->alias);
1542 		sc->tp_format = trace_event__tp_format("syscalls", tp_name);
1543 	}
1544 
1545 	if (sc->tp_format == NULL)
1546 		return -1;
1547 
1548 	sc->args = sc->tp_format->format.fields;
1549 	sc->nr_args = sc->tp_format->format.nr_fields;
1550 	/* drop nr field - not relevant here; does not exist on older kernels */
1551 	if (sc->args && strcmp(sc->args->name, "nr") == 0) {
1552 		sc->args = sc->args->next;
1553 		--sc->nr_args;
1554 	}
1555 
1556 	sc->is_exit = !strcmp(name, "exit_group") || !strcmp(name, "exit");
1557 
1558 	return syscall__set_arg_fmts(sc);
1559 }
1560 
1561 /*
1562  * args is to be interpreted as a series of longs but we need to handle
1563  * 8-byte unaligned accesses. args points to raw_data within the event
1564  * and raw_data is guaranteed to be 8-byte unaligned because it is
1565  * preceded by raw_size which is a u32. So we need to copy args to a temp
1566  * variable to read it. Most notably this avoids extended load instructions
1567  * on unaligned addresses
1568  */
1569 
1570 static size_t syscall__scnprintf_args(struct syscall *sc, char *bf, size_t size,
1571 				      unsigned char *args, struct trace *trace,
1572 				      struct thread *thread)
1573 {
1574 	size_t printed = 0;
1575 	unsigned char *p;
1576 	unsigned long val;
1577 
1578 	if (sc->args != NULL) {
1579 		struct format_field *field;
1580 		u8 bit = 1;
1581 		struct syscall_arg arg = {
1582 			.idx	= 0,
1583 			.mask	= 0,
1584 			.trace  = trace,
1585 			.thread = thread,
1586 		};
1587 
1588 		for (field = sc->args; field;
1589 		     field = field->next, ++arg.idx, bit <<= 1) {
1590 			if (arg.mask & bit)
1591 				continue;
1592 
1593 			/* special care for unaligned accesses */
1594 			p = args + sizeof(unsigned long) * arg.idx;
1595 			memcpy(&val, p, sizeof(val));
1596 
1597 			/*
1598  			 * Suppress this argument if its value is zero and
1599  			 * and we don't have a string associated in an
1600  			 * strarray for it.
1601  			 */
1602 			if (val == 0 &&
1603 			    !(sc->arg_scnprintf &&
1604 			      sc->arg_scnprintf[arg.idx] == SCA_STRARRAY &&
1605 			      sc->arg_parm[arg.idx]))
1606 				continue;
1607 
1608 			printed += scnprintf(bf + printed, size - printed,
1609 					     "%s%s: ", printed ? ", " : "", field->name);
1610 			if (sc->arg_scnprintf && sc->arg_scnprintf[arg.idx]) {
1611 				arg.val = val;
1612 				if (sc->arg_parm)
1613 					arg.parm = sc->arg_parm[arg.idx];
1614 				printed += sc->arg_scnprintf[arg.idx](bf + printed,
1615 								      size - printed, &arg);
1616 			} else {
1617 				printed += scnprintf(bf + printed, size - printed,
1618 						     "%ld", val);
1619 			}
1620 		}
1621 	} else {
1622 		int i = 0;
1623 
1624 		while (i < 6) {
1625 			/* special care for unaligned accesses */
1626 			p = args + sizeof(unsigned long) * i;
1627 			memcpy(&val, p, sizeof(val));
1628 			printed += scnprintf(bf + printed, size - printed,
1629 					     "%sarg%d: %ld",
1630 					     printed ? ", " : "", i, val);
1631 			++i;
1632 		}
1633 	}
1634 
1635 	return printed;
1636 }
1637 
/*
 * Signature of the per-event handlers stored in evsel->handler and invoked
 * from trace__process_sample().
 */
typedef int (*tracepoint_handler)(struct trace *trace,
				  struct perf_evsel *evsel,
				  union perf_event *event,
				  struct perf_sample *sample);
1641 
1642 static struct syscall *trace__syscall_info(struct trace *trace,
1643 					   struct perf_evsel *evsel, int id)
1644 {
1645 
1646 	if (id < 0) {
1647 
1648 		/*
1649 		 * XXX: Noticed on x86_64, reproduced as far back as 3.0.36, haven't tried
1650 		 * before that, leaving at a higher verbosity level till that is
1651 		 * explained. Reproduced with plain ftrace with:
1652 		 *
1653 		 * echo 1 > /t/events/raw_syscalls/sys_exit/enable
1654 		 * grep "NR -1 " /t/trace_pipe
1655 		 *
1656 		 * After generating some load on the machine.
1657  		 */
1658 		if (verbose > 1) {
1659 			static u64 n;
1660 			fprintf(trace->output, "Invalid syscall %d id, skipping (%s, %" PRIu64 ") ...\n",
1661 				id, perf_evsel__name(evsel), ++n);
1662 		}
1663 		return NULL;
1664 	}
1665 
1666 	if ((id > trace->syscalls.max || trace->syscalls.table[id].name == NULL) &&
1667 	    trace__read_syscall_info(trace, id))
1668 		goto out_cant_read;
1669 
1670 	if ((id > trace->syscalls.max || trace->syscalls.table[id].name == NULL))
1671 		goto out_cant_read;
1672 
1673 	return &trace->syscalls.table[id];
1674 
1675 out_cant_read:
1676 	if (verbose) {
1677 		fprintf(trace->output, "Problems reading syscall %d", id);
1678 		if (id <= trace->syscalls.max && trace->syscalls.table[id].name != NULL)
1679 			fprintf(trace->output, "(%s)", trace->syscalls.table[id].name);
1680 		fputs(" information\n", trace->output);
1681 	}
1682 	return NULL;
1683 }
1684 
1685 static void thread__update_stats(struct thread_trace *ttrace,
1686 				 int id, struct perf_sample *sample)
1687 {
1688 	struct int_node *inode;
1689 	struct stats *stats;
1690 	u64 duration = 0;
1691 
1692 	inode = intlist__findnew(ttrace->syscall_stats, id);
1693 	if (inode == NULL)
1694 		return;
1695 
1696 	stats = inode->priv;
1697 	if (stats == NULL) {
1698 		stats = malloc(sizeof(struct stats));
1699 		if (stats == NULL)
1700 			return;
1701 		init_stats(stats);
1702 		inode->priv = stats;
1703 	}
1704 
1705 	if (ttrace->entry_time && sample->time > ttrace->entry_time)
1706 		duration = sample->time - ttrace->entry_time;
1707 
1708 	update_stats(stats, duration);
1709 }
1710 
1711 static int trace__printf_interrupted_entry(struct trace *trace, struct perf_sample *sample)
1712 {
1713 	struct thread_trace *ttrace;
1714 	u64 duration;
1715 	size_t printed;
1716 
1717 	if (trace->current == NULL)
1718 		return 0;
1719 
1720 	ttrace = thread__priv(trace->current);
1721 
1722 	if (!ttrace->entry_pending)
1723 		return 0;
1724 
1725 	duration = sample->time - ttrace->entry_time;
1726 
1727 	printed  = trace__fprintf_entry_head(trace, trace->current, duration, sample->time, trace->output);
1728 	printed += fprintf(trace->output, "%-70s) ...\n", ttrace->entry_str);
1729 	ttrace->entry_pending = false;
1730 
1731 	return printed;
1732 }
1733 
1734 static int trace__sys_enter(struct trace *trace, struct perf_evsel *evsel,
1735 			    union perf_event *event __maybe_unused,
1736 			    struct perf_sample *sample)
1737 {
1738 	char *msg;
1739 	void *args;
1740 	size_t printed = 0;
1741 	struct thread *thread;
1742 	int id = perf_evsel__sc_tp_uint(evsel, id, sample), err = -1;
1743 	struct syscall *sc = trace__syscall_info(trace, evsel, id);
1744 	struct thread_trace *ttrace;
1745 
1746 	if (sc == NULL)
1747 		return -1;
1748 
1749 	if (sc->filtered)
1750 		return 0;
1751 
1752 	thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
1753 	ttrace = thread__trace(thread, trace->output);
1754 	if (ttrace == NULL)
1755 		goto out_put;
1756 
1757 	args = perf_evsel__sc_tp_ptr(evsel, args, sample);
1758 
1759 	if (ttrace->entry_str == NULL) {
1760 		ttrace->entry_str = malloc(1024);
1761 		if (!ttrace->entry_str)
1762 			goto out_put;
1763 	}
1764 
1765 	if (!trace->summary_only)
1766 		trace__printf_interrupted_entry(trace, sample);
1767 
1768 	ttrace->entry_time = sample->time;
1769 	msg = ttrace->entry_str;
1770 	printed += scnprintf(msg + printed, 1024 - printed, "%s(", sc->name);
1771 
1772 	printed += syscall__scnprintf_args(sc, msg + printed, 1024 - printed,
1773 					   args, trace, thread);
1774 
1775 	if (sc->is_exit) {
1776 		if (!trace->duration_filter && !trace->summary_only) {
1777 			trace__fprintf_entry_head(trace, thread, 1, sample->time, trace->output);
1778 			fprintf(trace->output, "%-70s\n", ttrace->entry_str);
1779 		}
1780 	} else
1781 		ttrace->entry_pending = true;
1782 
1783 	if (trace->current != thread) {
1784 		thread__put(trace->current);
1785 		trace->current = thread__get(thread);
1786 	}
1787 	err = 0;
1788 out_put:
1789 	thread__put(thread);
1790 	return err;
1791 }
1792 
1793 static int trace__sys_exit(struct trace *trace, struct perf_evsel *evsel,
1794 			   union perf_event *event __maybe_unused,
1795 			   struct perf_sample *sample)
1796 {
1797 	long ret;
1798 	u64 duration = 0;
1799 	struct thread *thread;
1800 	int id = perf_evsel__sc_tp_uint(evsel, id, sample), err = -1;
1801 	struct syscall *sc = trace__syscall_info(trace, evsel, id);
1802 	struct thread_trace *ttrace;
1803 
1804 	if (sc == NULL)
1805 		return -1;
1806 
1807 	if (sc->filtered)
1808 		return 0;
1809 
1810 	thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
1811 	ttrace = thread__trace(thread, trace->output);
1812 	if (ttrace == NULL)
1813 		goto out_put;
1814 
1815 	if (trace->summary)
1816 		thread__update_stats(ttrace, id, sample);
1817 
1818 	ret = perf_evsel__sc_tp_uint(evsel, ret, sample);
1819 
1820 	if (id == trace->audit.open_id && ret >= 0 && trace->last_vfs_getname) {
1821 		trace__set_fd_pathname(thread, ret, trace->last_vfs_getname);
1822 		trace->last_vfs_getname = NULL;
1823 		++trace->stats.vfs_getname;
1824 	}
1825 
1826 	ttrace->exit_time = sample->time;
1827 
1828 	if (ttrace->entry_time) {
1829 		duration = sample->time - ttrace->entry_time;
1830 		if (trace__filter_duration(trace, duration))
1831 			goto out;
1832 	} else if (trace->duration_filter)
1833 		goto out;
1834 
1835 	if (trace->summary_only)
1836 		goto out;
1837 
1838 	trace__fprintf_entry_head(trace, thread, duration, sample->time, trace->output);
1839 
1840 	if (ttrace->entry_pending) {
1841 		fprintf(trace->output, "%-70s", ttrace->entry_str);
1842 	} else {
1843 		fprintf(trace->output, " ... [");
1844 		color_fprintf(trace->output, PERF_COLOR_YELLOW, "continued");
1845 		fprintf(trace->output, "]: %s()", sc->name);
1846 	}
1847 
1848 	if (sc->fmt == NULL) {
1849 signed_print:
1850 		fprintf(trace->output, ") = %ld", ret);
1851 	} else if (ret < 0 && sc->fmt->errmsg) {
1852 		char bf[STRERR_BUFSIZE];
1853 		const char *emsg = strerror_r(-ret, bf, sizeof(bf)),
1854 			   *e = audit_errno_to_name(-ret);
1855 
1856 		fprintf(trace->output, ") = -1 %s %s", e, emsg);
1857 	} else if (ret == 0 && sc->fmt->timeout)
1858 		fprintf(trace->output, ") = 0 Timeout");
1859 	else if (sc->fmt->hexret)
1860 		fprintf(trace->output, ") = %#lx", ret);
1861 	else
1862 		goto signed_print;
1863 
1864 	fputc('\n', trace->output);
1865 out:
1866 	ttrace->entry_pending = false;
1867 	err = 0;
1868 out_put:
1869 	thread__put(thread);
1870 	return err;
1871 }
1872 
1873 static int trace__vfs_getname(struct trace *trace, struct perf_evsel *evsel,
1874 			      union perf_event *event __maybe_unused,
1875 			      struct perf_sample *sample)
1876 {
1877 	trace->last_vfs_getname = perf_evsel__rawptr(evsel, sample, "pathname");
1878 	return 0;
1879 }
1880 
1881 static int trace__sched_stat_runtime(struct trace *trace, struct perf_evsel *evsel,
1882 				     union perf_event *event __maybe_unused,
1883 				     struct perf_sample *sample)
1884 {
1885         u64 runtime = perf_evsel__intval(evsel, sample, "runtime");
1886 	double runtime_ms = (double)runtime / NSEC_PER_MSEC;
1887 	struct thread *thread = machine__findnew_thread(trace->host,
1888 							sample->pid,
1889 							sample->tid);
1890 	struct thread_trace *ttrace = thread__trace(thread, trace->output);
1891 
1892 	if (ttrace == NULL)
1893 		goto out_dump;
1894 
1895 	ttrace->runtime_ms += runtime_ms;
1896 	trace->runtime_ms += runtime_ms;
1897 	thread__put(thread);
1898 	return 0;
1899 
1900 out_dump:
1901 	fprintf(trace->output, "%s: comm=%s,pid=%u,runtime=%" PRIu64 ",vruntime=%" PRIu64 ")\n",
1902 	       evsel->name,
1903 	       perf_evsel__strval(evsel, sample, "comm"),
1904 	       (pid_t)perf_evsel__intval(evsel, sample, "pid"),
1905 	       runtime,
1906 	       perf_evsel__intval(evsel, sample, "vruntime"));
1907 	thread__put(thread);
1908 	return 0;
1909 }
1910 
1911 static int trace__event_handler(struct trace *trace, struct perf_evsel *evsel,
1912 				union perf_event *event __maybe_unused,
1913 				struct perf_sample *sample)
1914 {
1915 	trace__printf_interrupted_entry(trace, sample);
1916 	trace__fprintf_tstamp(trace, sample->time, trace->output);
1917 
1918 	if (trace->trace_syscalls)
1919 		fprintf(trace->output, "(         ): ");
1920 
1921 	fprintf(trace->output, "%s:", evsel->name);
1922 
1923 	if (evsel->tp_format) {
1924 		event_format__fprintf(evsel->tp_format, sample->cpu,
1925 				      sample->raw_data, sample->raw_size,
1926 				      trace->output);
1927 	}
1928 
1929 	fprintf(trace->output, ")\n");
1930 	return 0;
1931 }
1932 
1933 static void print_location(FILE *f, struct perf_sample *sample,
1934 			   struct addr_location *al,
1935 			   bool print_dso, bool print_sym)
1936 {
1937 
1938 	if ((verbose || print_dso) && al->map)
1939 		fprintf(f, "%s@", al->map->dso->long_name);
1940 
1941 	if ((verbose || print_sym) && al->sym)
1942 		fprintf(f, "%s+0x%" PRIx64, al->sym->name,
1943 			al->addr - al->sym->start);
1944 	else if (al->map)
1945 		fprintf(f, "0x%" PRIx64, al->addr);
1946 	else
1947 		fprintf(f, "0x%" PRIx64, sample->addr);
1948 }
1949 
1950 static int trace__pgfault(struct trace *trace,
1951 			  struct perf_evsel *evsel,
1952 			  union perf_event *event,
1953 			  struct perf_sample *sample)
1954 {
1955 	struct thread *thread;
1956 	u8 cpumode = event->header.misc & PERF_RECORD_MISC_CPUMODE_MASK;
1957 	struct addr_location al;
1958 	char map_type = 'd';
1959 	struct thread_trace *ttrace;
1960 	int err = -1;
1961 
1962 	thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
1963 	ttrace = thread__trace(thread, trace->output);
1964 	if (ttrace == NULL)
1965 		goto out_put;
1966 
1967 	if (evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ)
1968 		ttrace->pfmaj++;
1969 	else
1970 		ttrace->pfmin++;
1971 
1972 	if (trace->summary_only)
1973 		goto out;
1974 
1975 	thread__find_addr_location(thread, cpumode, MAP__FUNCTION,
1976 			      sample->ip, &al);
1977 
1978 	trace__fprintf_entry_head(trace, thread, 0, sample->time, trace->output);
1979 
1980 	fprintf(trace->output, "%sfault [",
1981 		evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ ?
1982 		"maj" : "min");
1983 
1984 	print_location(trace->output, sample, &al, false, true);
1985 
1986 	fprintf(trace->output, "] => ");
1987 
1988 	thread__find_addr_location(thread, cpumode, MAP__VARIABLE,
1989 				   sample->addr, &al);
1990 
1991 	if (!al.map) {
1992 		thread__find_addr_location(thread, cpumode,
1993 					   MAP__FUNCTION, sample->addr, &al);
1994 
1995 		if (al.map)
1996 			map_type = 'x';
1997 		else
1998 			map_type = '?';
1999 	}
2000 
2001 	print_location(trace->output, sample, &al, true, false);
2002 
2003 	fprintf(trace->output, " (%c%c)\n", map_type, al.level);
2004 out:
2005 	err = 0;
2006 out_put:
2007 	thread__put(thread);
2008 	return err;
2009 }
2010 
2011 static bool skip_sample(struct trace *trace, struct perf_sample *sample)
2012 {
2013 	if ((trace->pid_list && intlist__find(trace->pid_list, sample->pid)) ||
2014 	    (trace->tid_list && intlist__find(trace->tid_list, sample->tid)))
2015 		return false;
2016 
2017 	if (trace->pid_list || trace->tid_list)
2018 		return true;
2019 
2020 	return false;
2021 }
2022 
2023 static int trace__process_sample(struct perf_tool *tool,
2024 				 union perf_event *event,
2025 				 struct perf_sample *sample,
2026 				 struct perf_evsel *evsel,
2027 				 struct machine *machine __maybe_unused)
2028 {
2029 	struct trace *trace = container_of(tool, struct trace, tool);
2030 	int err = 0;
2031 
2032 	tracepoint_handler handler = evsel->handler;
2033 
2034 	if (skip_sample(trace, sample))
2035 		return 0;
2036 
2037 	if (!trace->full_time && trace->base_time == 0)
2038 		trace->base_time = sample->time;
2039 
2040 	if (handler) {
2041 		++trace->nr_events;
2042 		handler(trace, evsel, event, sample);
2043 	}
2044 
2045 	return err;
2046 }
2047 
2048 static int parse_target_str(struct trace *trace)
2049 {
2050 	if (trace->opts.target.pid) {
2051 		trace->pid_list = intlist__new(trace->opts.target.pid);
2052 		if (trace->pid_list == NULL) {
2053 			pr_err("Error parsing process id string\n");
2054 			return -EINVAL;
2055 		}
2056 	}
2057 
2058 	if (trace->opts.target.tid) {
2059 		trace->tid_list = intlist__new(trace->opts.target.tid);
2060 		if (trace->tid_list == NULL) {
2061 			pr_err("Error parsing thread id string\n");
2062 			return -EINVAL;
2063 		}
2064 	}
2065 
2066 	return 0;
2067 }
2068 
2069 static int trace__record(struct trace *trace, int argc, const char **argv)
2070 {
2071 	unsigned int rec_argc, i, j;
2072 	const char **rec_argv;
2073 	const char * const record_args[] = {
2074 		"record",
2075 		"-R",
2076 		"-m", "1024",
2077 		"-c", "1",
2078 	};
2079 
2080 	const char * const sc_args[] = { "-e", };
2081 	unsigned int sc_args_nr = ARRAY_SIZE(sc_args);
2082 	const char * const majpf_args[] = { "-e", "major-faults" };
2083 	unsigned int majpf_args_nr = ARRAY_SIZE(majpf_args);
2084 	const char * const minpf_args[] = { "-e", "minor-faults" };
2085 	unsigned int minpf_args_nr = ARRAY_SIZE(minpf_args);
2086 
2087 	/* +1 is for the event string below */
2088 	rec_argc = ARRAY_SIZE(record_args) + sc_args_nr + 1 +
2089 		majpf_args_nr + minpf_args_nr + argc;
2090 	rec_argv = calloc(rec_argc + 1, sizeof(char *));
2091 
2092 	if (rec_argv == NULL)
2093 		return -ENOMEM;
2094 
2095 	j = 0;
2096 	for (i = 0; i < ARRAY_SIZE(record_args); i++)
2097 		rec_argv[j++] = record_args[i];
2098 
2099 	if (trace->trace_syscalls) {
2100 		for (i = 0; i < sc_args_nr; i++)
2101 			rec_argv[j++] = sc_args[i];
2102 
2103 		/* event string may be different for older kernels - e.g., RHEL6 */
2104 		if (is_valid_tracepoint("raw_syscalls:sys_enter"))
2105 			rec_argv[j++] = "raw_syscalls:sys_enter,raw_syscalls:sys_exit";
2106 		else if (is_valid_tracepoint("syscalls:sys_enter"))
2107 			rec_argv[j++] = "syscalls:sys_enter,syscalls:sys_exit";
2108 		else {
2109 			pr_err("Neither raw_syscalls nor syscalls events exist.\n");
2110 			return -1;
2111 		}
2112 	}
2113 
2114 	if (trace->trace_pgfaults & TRACE_PFMAJ)
2115 		for (i = 0; i < majpf_args_nr; i++)
2116 			rec_argv[j++] = majpf_args[i];
2117 
2118 	if (trace->trace_pgfaults & TRACE_PFMIN)
2119 		for (i = 0; i < minpf_args_nr; i++)
2120 			rec_argv[j++] = minpf_args[i];
2121 
2122 	for (i = 0; i < (unsigned int)argc; i++)
2123 		rec_argv[j++] = argv[i];
2124 
2125 	return cmd_record(j, rec_argv, NULL);
2126 }
2127 
2128 static size_t trace__fprintf_thread_summary(struct trace *trace, FILE *fp);
2129 
2130 static void perf_evlist__add_vfs_getname(struct perf_evlist *evlist)
2131 {
2132 	struct perf_evsel *evsel = perf_evsel__newtp("probe", "vfs_getname");
2133 	if (evsel == NULL)
2134 		return;
2135 
2136 	if (perf_evsel__field(evsel, "pathname") == NULL) {
2137 		perf_evsel__delete(evsel);
2138 		return;
2139 	}
2140 
2141 	evsel->handler = trace__vfs_getname;
2142 	perf_evlist__add(evlist, evsel);
2143 }
2144 
2145 static int perf_evlist__add_pgfault(struct perf_evlist *evlist,
2146 				    u64 config)
2147 {
2148 	struct perf_evsel *evsel;
2149 	struct perf_event_attr attr = {
2150 		.type = PERF_TYPE_SOFTWARE,
2151 		.mmap_data = 1,
2152 	};
2153 
2154 	attr.config = config;
2155 	attr.sample_period = 1;
2156 
2157 	event_attr_init(&attr);
2158 
2159 	evsel = perf_evsel__new(&attr);
2160 	if (!evsel)
2161 		return -ENOMEM;
2162 
2163 	evsel->handler = trace__pgfault;
2164 	perf_evlist__add(evlist, evsel);
2165 
2166 	return 0;
2167 }
2168 
2169 static void trace__handle_event(struct trace *trace, union perf_event *event, struct perf_sample *sample)
2170 {
2171 	const u32 type = event->header.type;
2172 	struct perf_evsel *evsel;
2173 
2174 	if (!trace->full_time && trace->base_time == 0)
2175 		trace->base_time = sample->time;
2176 
2177 	if (type != PERF_RECORD_SAMPLE) {
2178 		trace__process_event(trace, trace->host, event, sample);
2179 		return;
2180 	}
2181 
2182 	evsel = perf_evlist__id2evsel(trace->evlist, sample->id);
2183 	if (evsel == NULL) {
2184 		fprintf(trace->output, "Unknown tp ID %" PRIu64 ", skipping...\n", sample->id);
2185 		return;
2186 	}
2187 
2188 	if (evsel->attr.type == PERF_TYPE_TRACEPOINT &&
2189 	    sample->raw_data == NULL) {
2190 		fprintf(trace->output, "%s sample with no payload for tid: %d, cpu %d, raw_size=%d, skipping...\n",
2191 		       perf_evsel__name(evsel), sample->tid,
2192 		       sample->cpu, sample->raw_size);
2193 	} else {
2194 		tracepoint_handler handler = evsel->handler;
2195 		handler(trace, evsel, event, sample);
2196 	}
2197 }
2198 
2199 static int trace__run(struct trace *trace, int argc, const char **argv)
2200 {
2201 	struct perf_evlist *evlist = trace->evlist;
2202 	int err = -1, i;
2203 	unsigned long before;
2204 	const bool forks = argc > 0;
2205 	bool draining = false;
2206 
2207 	trace->live = true;
2208 
2209 	if (trace->trace_syscalls &&
2210 	    perf_evlist__add_syscall_newtp(evlist, trace__sys_enter,
2211 					   trace__sys_exit))
2212 		goto out_error_raw_syscalls;
2213 
2214 	if (trace->trace_syscalls)
2215 		perf_evlist__add_vfs_getname(evlist);
2216 
2217 	if ((trace->trace_pgfaults & TRACE_PFMAJ) &&
2218 	    perf_evlist__add_pgfault(evlist, PERF_COUNT_SW_PAGE_FAULTS_MAJ)) {
2219 		goto out_error_mem;
2220 	}
2221 
2222 	if ((trace->trace_pgfaults & TRACE_PFMIN) &&
2223 	    perf_evlist__add_pgfault(evlist, PERF_COUNT_SW_PAGE_FAULTS_MIN))
2224 		goto out_error_mem;
2225 
2226 	if (trace->sched &&
2227 	    perf_evlist__add_newtp(evlist, "sched", "sched_stat_runtime",
2228 				   trace__sched_stat_runtime))
2229 		goto out_error_sched_stat_runtime;
2230 
2231 	err = perf_evlist__create_maps(evlist, &trace->opts.target);
2232 	if (err < 0) {
2233 		fprintf(trace->output, "Problems parsing the target to trace, check your options!\n");
2234 		goto out_delete_evlist;
2235 	}
2236 
2237 	err = trace__symbols_init(trace, evlist);
2238 	if (err < 0) {
2239 		fprintf(trace->output, "Problems initializing symbol libraries!\n");
2240 		goto out_delete_evlist;
2241 	}
2242 
2243 	perf_evlist__config(evlist, &trace->opts);
2244 
2245 	signal(SIGCHLD, sig_handler);
2246 	signal(SIGINT, sig_handler);
2247 
2248 	if (forks) {
2249 		err = perf_evlist__prepare_workload(evlist, &trace->opts.target,
2250 						    argv, false, NULL);
2251 		if (err < 0) {
2252 			fprintf(trace->output, "Couldn't run the workload!\n");
2253 			goto out_delete_evlist;
2254 		}
2255 	}
2256 
2257 	err = perf_evlist__open(evlist);
2258 	if (err < 0)
2259 		goto out_error_open;
2260 
2261 	/*
2262 	 * Better not use !target__has_task() here because we need to cover the
2263 	 * case where no threads were specified in the command line, but a
2264 	 * workload was, and in that case we will fill in the thread_map when
2265 	 * we fork the workload in perf_evlist__prepare_workload.
2266 	 */
2267 	if (trace->filter_pids.nr > 0)
2268 		err = perf_evlist__set_filter_pids(evlist, trace->filter_pids.nr, trace->filter_pids.entries);
2269 	else if (evlist->threads->map[0] == -1)
2270 		err = perf_evlist__set_filter_pid(evlist, getpid());
2271 
2272 	if (err < 0) {
2273 		printf("err=%d,%s\n", -err, strerror(-err));
2274 		exit(1);
2275 	}
2276 
2277 	err = perf_evlist__mmap(evlist, trace->opts.mmap_pages, false);
2278 	if (err < 0)
2279 		goto out_error_mmap;
2280 
2281 	if (!target__none(&trace->opts.target))
2282 		perf_evlist__enable(evlist);
2283 
2284 	if (forks)
2285 		perf_evlist__start_workload(evlist);
2286 
2287 	trace->multiple_threads = evlist->threads->map[0] == -1 ||
2288 				  evlist->threads->nr > 1 ||
2289 				  perf_evlist__first(evlist)->attr.inherit;
2290 again:
2291 	before = trace->nr_events;
2292 
2293 	for (i = 0; i < evlist->nr_mmaps; i++) {
2294 		union perf_event *event;
2295 
2296 		while ((event = perf_evlist__mmap_read(evlist, i)) != NULL) {
2297 			struct perf_sample sample;
2298 
2299 			++trace->nr_events;
2300 
2301 			err = perf_evlist__parse_sample(evlist, event, &sample);
2302 			if (err) {
2303 				fprintf(trace->output, "Can't parse sample, err = %d, skipping...\n", err);
2304 				goto next_event;
2305 			}
2306 
2307 			trace__handle_event(trace, event, &sample);
2308 next_event:
2309 			perf_evlist__mmap_consume(evlist, i);
2310 
2311 			if (interrupted)
2312 				goto out_disable;
2313 
2314 			if (done && !draining) {
2315 				perf_evlist__disable(evlist);
2316 				draining = true;
2317 			}
2318 		}
2319 	}
2320 
2321 	if (trace->nr_events == before) {
2322 		int timeout = done ? 100 : -1;
2323 
2324 		if (!draining && perf_evlist__poll(evlist, timeout) > 0) {
2325 			if (perf_evlist__filter_pollfd(evlist, POLLERR | POLLHUP) == 0)
2326 				draining = true;
2327 
2328 			goto again;
2329 		}
2330 	} else {
2331 		goto again;
2332 	}
2333 
2334 out_disable:
2335 	thread__zput(trace->current);
2336 
2337 	perf_evlist__disable(evlist);
2338 
2339 	if (!err) {
2340 		if (trace->summary)
2341 			trace__fprintf_thread_summary(trace, trace->output);
2342 
2343 		if (trace->show_tool_stats) {
2344 			fprintf(trace->output, "Stats:\n "
2345 					       " vfs_getname : %" PRIu64 "\n"
2346 					       " proc_getname: %" PRIu64 "\n",
2347 				trace->stats.vfs_getname,
2348 				trace->stats.proc_getname);
2349 		}
2350 	}
2351 
2352 out_delete_evlist:
2353 	perf_evlist__delete(evlist);
2354 	trace->evlist = NULL;
2355 	trace->live = false;
2356 	return err;
2357 {
2358 	char errbuf[BUFSIZ];
2359 
2360 out_error_sched_stat_runtime:
2361 	debugfs__strerror_open_tp(errno, errbuf, sizeof(errbuf), "sched", "sched_stat_runtime");
2362 	goto out_error;
2363 
2364 out_error_raw_syscalls:
2365 	debugfs__strerror_open_tp(errno, errbuf, sizeof(errbuf), "raw_syscalls", "sys_(enter|exit)");
2366 	goto out_error;
2367 
2368 out_error_mmap:
2369 	perf_evlist__strerror_mmap(evlist, errno, errbuf, sizeof(errbuf));
2370 	goto out_error;
2371 
2372 out_error_open:
2373 	perf_evlist__strerror_open(evlist, errno, errbuf, sizeof(errbuf));
2374 
2375 out_error:
2376 	fprintf(trace->output, "%s\n", errbuf);
2377 	goto out_delete_evlist;
2378 }
2379 out_error_mem:
2380 	fprintf(trace->output, "Not enough memory to run!\n");
2381 	goto out_delete_evlist;
2382 }
2383 
2384 static int trace__replay(struct trace *trace)
2385 {
2386 	const struct perf_evsel_str_handler handlers[] = {
2387 		{ "probe:vfs_getname",	     trace__vfs_getname, },
2388 	};
2389 	struct perf_data_file file = {
2390 		.path  = input_name,
2391 		.mode  = PERF_DATA_MODE_READ,
2392 		.force = trace->force,
2393 	};
2394 	struct perf_session *session;
2395 	struct perf_evsel *evsel;
2396 	int err = -1;
2397 
2398 	trace->tool.sample	  = trace__process_sample;
2399 	trace->tool.mmap	  = perf_event__process_mmap;
2400 	trace->tool.mmap2	  = perf_event__process_mmap2;
2401 	trace->tool.comm	  = perf_event__process_comm;
2402 	trace->tool.exit	  = perf_event__process_exit;
2403 	trace->tool.fork	  = perf_event__process_fork;
2404 	trace->tool.attr	  = perf_event__process_attr;
2405 	trace->tool.tracing_data = perf_event__process_tracing_data;
2406 	trace->tool.build_id	  = perf_event__process_build_id;
2407 
2408 	trace->tool.ordered_events = true;
2409 	trace->tool.ordering_requires_timestamps = true;
2410 
2411 	/* add tid to output */
2412 	trace->multiple_threads = true;
2413 
2414 	session = perf_session__new(&file, false, &trace->tool);
2415 	if (session == NULL)
2416 		return -1;
2417 
2418 	if (symbol__init(&session->header.env) < 0)
2419 		goto out;
2420 
2421 	trace->host = &session->machines.host;
2422 
2423 	err = perf_session__set_tracepoints_handlers(session, handlers);
2424 	if (err)
2425 		goto out;
2426 
2427 	evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
2428 						     "raw_syscalls:sys_enter");
2429 	/* older kernels have syscalls tp versus raw_syscalls */
2430 	if (evsel == NULL)
2431 		evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
2432 							     "syscalls:sys_enter");
2433 
2434 	if (evsel &&
2435 	    (perf_evsel__init_syscall_tp(evsel, trace__sys_enter) < 0 ||
2436 	    perf_evsel__init_sc_tp_ptr_field(evsel, args))) {
2437 		pr_err("Error during initialize raw_syscalls:sys_enter event\n");
2438 		goto out;
2439 	}
2440 
2441 	evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
2442 						     "raw_syscalls:sys_exit");
2443 	if (evsel == NULL)
2444 		evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
2445 							     "syscalls:sys_exit");
2446 	if (evsel &&
2447 	    (perf_evsel__init_syscall_tp(evsel, trace__sys_exit) < 0 ||
2448 	    perf_evsel__init_sc_tp_uint_field(evsel, ret))) {
2449 		pr_err("Error during initialize raw_syscalls:sys_exit event\n");
2450 		goto out;
2451 	}
2452 
2453 	evlist__for_each(session->evlist, evsel) {
2454 		if (evsel->attr.type == PERF_TYPE_SOFTWARE &&
2455 		    (evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ ||
2456 		     evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MIN ||
2457 		     evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS))
2458 			evsel->handler = trace__pgfault;
2459 	}
2460 
2461 	err = parse_target_str(trace);
2462 	if (err != 0)
2463 		goto out;
2464 
2465 	setup_pager();
2466 
2467 	err = perf_session__process_events(session);
2468 	if (err)
2469 		pr_err("Failed to process events, error %d", err);
2470 
2471 	else if (trace->summary)
2472 		trace__fprintf_thread_summary(trace, trace->output);
2473 
2474 out:
2475 	perf_session__delete(session);
2476 
2477 	return err;
2478 }
2479 
/*
 * Print the banner that precedes the per-thread summary.
 *
 * Returns the value fprintf() returned, converted to size_t.
 */
static size_t trace__fprintf_threads_header(FILE *fp)
{
	return fprintf(fp, "\n Summary of events:\n\n");
}
2488 
2489 static size_t thread__dump_stats(struct thread_trace *ttrace,
2490 				 struct trace *trace, FILE *fp)
2491 {
2492 	struct stats *stats;
2493 	size_t printed = 0;
2494 	struct syscall *sc;
2495 	struct int_node *inode = intlist__first(ttrace->syscall_stats);
2496 
2497 	if (inode == NULL)
2498 		return 0;
2499 
2500 	printed += fprintf(fp, "\n");
2501 
2502 	printed += fprintf(fp, "   syscall            calls      min       avg       max      stddev\n");
2503 	printed += fprintf(fp, "                               (msec)    (msec)    (msec)        (%%)\n");
2504 	printed += fprintf(fp, "   --------------- -------- --------- --------- ---------     ------\n");
2505 
2506 	/* each int_node is a syscall */
2507 	while (inode) {
2508 		stats = inode->priv;
2509 		if (stats) {
2510 			double min = (double)(stats->min) / NSEC_PER_MSEC;
2511 			double max = (double)(stats->max) / NSEC_PER_MSEC;
2512 			double avg = avg_stats(stats);
2513 			double pct;
2514 			u64 n = (u64) stats->n;
2515 
2516 			pct = avg ? 100.0 * stddev_stats(stats)/avg : 0.0;
2517 			avg /= NSEC_PER_MSEC;
2518 
2519 			sc = &trace->syscalls.table[inode->i];
2520 			printed += fprintf(fp, "   %-15s", sc->name);
2521 			printed += fprintf(fp, " %8" PRIu64 " %9.3f %9.3f",
2522 					   n, min, avg);
2523 			printed += fprintf(fp, " %9.3f %9.2f%%\n", max, pct);
2524 		}
2525 
2526 		inode = intlist__next(inode);
2527 	}
2528 
2529 	printed += fprintf(fp, "\n\n");
2530 
2531 	return printed;
2532 }
2533 
/*
 * struct used to pass data to per-thread function
 *
 * Filled in by trace__fprintf_thread_summary() and handed, as the opaque
 * priv pointer, to trace__fprintf_one_thread() for every thread.
 */
struct summary_data {
	FILE *fp;		/* stream the summary is written to */
	struct trace *trace;	/* tracer whose threads are being summarized */
	size_t printed;		/* running total of the fprintf() return values */
};
2540 
2541 static int trace__fprintf_one_thread(struct thread *thread, void *priv)
2542 {
2543 	struct summary_data *data = priv;
2544 	FILE *fp = data->fp;
2545 	size_t printed = data->printed;
2546 	struct trace *trace = data->trace;
2547 	struct thread_trace *ttrace = thread__priv(thread);
2548 	double ratio;
2549 
2550 	if (ttrace == NULL)
2551 		return 0;
2552 
2553 	ratio = (double)ttrace->nr_events / trace->nr_events * 100.0;
2554 
2555 	printed += fprintf(fp, " %s (%d), ", thread__comm_str(thread), thread->tid);
2556 	printed += fprintf(fp, "%lu events, ", ttrace->nr_events);
2557 	printed += fprintf(fp, "%.1f%%", ratio);
2558 	if (ttrace->pfmaj)
2559 		printed += fprintf(fp, ", %lu majfaults", ttrace->pfmaj);
2560 	if (ttrace->pfmin)
2561 		printed += fprintf(fp, ", %lu minfaults", ttrace->pfmin);
2562 	printed += fprintf(fp, ", %.3f msec\n", ttrace->runtime_ms);
2563 	printed += thread__dump_stats(ttrace, trace, fp);
2564 
2565 	data->printed += printed;
2566 
2567 	return 0;
2568 }
2569 
2570 static size_t trace__fprintf_thread_summary(struct trace *trace, FILE *fp)
2571 {
2572 	struct summary_data data = {
2573 		.fp = fp,
2574 		.trace = trace
2575 	};
2576 	data.printed = trace__fprintf_threads_header(fp);
2577 
2578 	machine__for_each_thread(trace->host, trace__fprintf_one_thread, &data);
2579 
2580 	return data.printed;
2581 }
2582 
2583 static int trace__set_duration(const struct option *opt, const char *str,
2584 			       int unset __maybe_unused)
2585 {
2586 	struct trace *trace = opt->value;
2587 
2588 	trace->duration_filter = atof(str);
2589 	return 0;
2590 }
2591 
2592 static int trace__set_filter_pids(const struct option *opt, const char *str,
2593 				  int unset __maybe_unused)
2594 {
2595 	int ret = -1;
2596 	size_t i;
2597 	struct trace *trace = opt->value;
2598 	/*
2599 	 * FIXME: introduce a intarray class, plain parse csv and create a
2600 	 * { int nr, int entries[] } struct...
2601 	 */
2602 	struct intlist *list = intlist__new(str);
2603 
2604 	if (list == NULL)
2605 		return -1;
2606 
2607 	i = trace->filter_pids.nr = intlist__nr_entries(list) + 1;
2608 	trace->filter_pids.entries = calloc(i, sizeof(pid_t));
2609 
2610 	if (trace->filter_pids.entries == NULL)
2611 		goto out;
2612 
2613 	trace->filter_pids.entries[0] = getpid();
2614 
2615 	for (i = 1; i < trace->filter_pids.nr; ++i)
2616 		trace->filter_pids.entries[i] = intlist__entry(list, i - 1)->i;
2617 
2618 	intlist__delete(list);
2619 	ret = 0;
2620 out:
2621 	return ret;
2622 }
2623 
2624 static int trace__open_output(struct trace *trace, const char *filename)
2625 {
2626 	struct stat st;
2627 
2628 	if (!stat(filename, &st) && st.st_size) {
2629 		char oldname[PATH_MAX];
2630 
2631 		scnprintf(oldname, sizeof(oldname), "%s.old", filename);
2632 		unlink(oldname);
2633 		rename(filename, oldname);
2634 	}
2635 
2636 	trace->output = fopen(filename, "w");
2637 
2638 	return trace->output == NULL ? -errno : 0;
2639 }
2640 
2641 static int parse_pagefaults(const struct option *opt, const char *str,
2642 			    int unset __maybe_unused)
2643 {
2644 	int *trace_pgfaults = opt->value;
2645 
2646 	if (strcmp(str, "all") == 0)
2647 		*trace_pgfaults |= TRACE_PFMAJ | TRACE_PFMIN;
2648 	else if (strcmp(str, "maj") == 0)
2649 		*trace_pgfaults |= TRACE_PFMAJ;
2650 	else if (strcmp(str, "min") == 0)
2651 		*trace_pgfaults |= TRACE_PFMIN;
2652 	else
2653 		return -1;
2654 
2655 	return 0;
2656 }
2657 
2658 static void evlist__set_evsel_handler(struct perf_evlist *evlist, void *handler)
2659 {
2660 	struct perf_evsel *evsel;
2661 
2662 	evlist__for_each(evlist, evsel)
2663 		evsel->handler = handler;
2664 }
2665 
2666 int cmd_trace(int argc, const char **argv, const char *prefix __maybe_unused)
2667 {
2668 	const char *trace_usage[] = {
2669 		"perf trace [<options>] [<command>]",
2670 		"perf trace [<options>] -- <command> [<options>]",
2671 		"perf trace record [<options>] [<command>]",
2672 		"perf trace record [<options>] -- <command> [<options>]",
2673 		NULL
2674 	};
2675 	struct trace trace = {
2676 		.audit = {
2677 			.machine = audit_detect_machine(),
2678 			.open_id = audit_name_to_syscall("open", trace.audit.machine),
2679 		},
2680 		.syscalls = {
2681 			. max = -1,
2682 		},
2683 		.opts = {
2684 			.target = {
2685 				.uid	   = UINT_MAX,
2686 				.uses_mmap = true,
2687 			},
2688 			.user_freq     = UINT_MAX,
2689 			.user_interval = ULLONG_MAX,
2690 			.no_buffering  = true,
2691 			.mmap_pages    = UINT_MAX,
2692 		},
2693 		.output = stdout,
2694 		.show_comm = true,
2695 		.trace_syscalls = true,
2696 	};
2697 	const char *output_name = NULL;
2698 	const char *ev_qualifier_str = NULL;
2699 	const struct option trace_options[] = {
2700 	OPT_CALLBACK(0, "event", &trace.evlist, "event",
2701 		     "event selector. use 'perf list' to list available events",
2702 		     parse_events_option),
2703 	OPT_BOOLEAN(0, "comm", &trace.show_comm,
2704 		    "show the thread COMM next to its id"),
2705 	OPT_BOOLEAN(0, "tool_stats", &trace.show_tool_stats, "show tool stats"),
2706 	OPT_STRING('e', "expr", &ev_qualifier_str, "expr", "list of syscalls to trace"),
2707 	OPT_STRING('o', "output", &output_name, "file", "output file name"),
2708 	OPT_STRING('i', "input", &input_name, "file", "Analyze events in file"),
2709 	OPT_STRING('p', "pid", &trace.opts.target.pid, "pid",
2710 		    "trace events on existing process id"),
2711 	OPT_STRING('t', "tid", &trace.opts.target.tid, "tid",
2712 		    "trace events on existing thread id"),
2713 	OPT_CALLBACK(0, "filter-pids", &trace, "CSV list of pids",
2714 		     "pids to filter (by the kernel)", trace__set_filter_pids),
2715 	OPT_BOOLEAN('a', "all-cpus", &trace.opts.target.system_wide,
2716 		    "system-wide collection from all CPUs"),
2717 	OPT_STRING('C', "cpu", &trace.opts.target.cpu_list, "cpu",
2718 		    "list of cpus to monitor"),
2719 	OPT_BOOLEAN(0, "no-inherit", &trace.opts.no_inherit,
2720 		    "child tasks do not inherit counters"),
2721 	OPT_CALLBACK('m', "mmap-pages", &trace.opts.mmap_pages, "pages",
2722 		     "number of mmap data pages",
2723 		     perf_evlist__parse_mmap_pages),
2724 	OPT_STRING('u', "uid", &trace.opts.target.uid_str, "user",
2725 		   "user to profile"),
2726 	OPT_CALLBACK(0, "duration", &trace, "float",
2727 		     "show only events with duration > N.M ms",
2728 		     trace__set_duration),
2729 	OPT_BOOLEAN(0, "sched", &trace.sched, "show blocking scheduler events"),
2730 	OPT_INCR('v', "verbose", &verbose, "be more verbose"),
2731 	OPT_BOOLEAN('T', "time", &trace.full_time,
2732 		    "Show full timestamp, not time relative to first start"),
2733 	OPT_BOOLEAN('s', "summary", &trace.summary_only,
2734 		    "Show only syscall summary with statistics"),
2735 	OPT_BOOLEAN('S', "with-summary", &trace.summary,
2736 		    "Show all syscalls and summary with statistics"),
2737 	OPT_CALLBACK_DEFAULT('F', "pf", &trace.trace_pgfaults, "all|maj|min",
2738 		     "Trace pagefaults", parse_pagefaults, "maj"),
2739 	OPT_BOOLEAN(0, "syscalls", &trace.trace_syscalls, "Trace syscalls"),
2740 	OPT_BOOLEAN('f', "force", &trace.force, "don't complain, do it"),
2741 	OPT_END()
2742 	};
2743 	const char * const trace_subcommands[] = { "record", NULL };
2744 	int err;
2745 	char bf[BUFSIZ];
2746 
2747 	signal(SIGSEGV, sighandler_dump_stack);
2748 	signal(SIGFPE, sighandler_dump_stack);
2749 
2750 	trace.evlist = perf_evlist__new();
2751 
2752 	if (trace.evlist == NULL) {
2753 		pr_err("Not enough memory to run!\n");
2754 		err = -ENOMEM;
2755 		goto out;
2756 	}
2757 
2758 	argc = parse_options_subcommand(argc, argv, trace_options, trace_subcommands,
2759 				 trace_usage, PARSE_OPT_STOP_AT_NON_OPTION);
2760 
2761 	if (trace.trace_pgfaults) {
2762 		trace.opts.sample_address = true;
2763 		trace.opts.sample_time = true;
2764 	}
2765 
2766 	if (trace.evlist->nr_entries > 0)
2767 		evlist__set_evsel_handler(trace.evlist, trace__event_handler);
2768 
2769 	if ((argc >= 1) && (strcmp(argv[0], "record") == 0))
2770 		return trace__record(&trace, argc-1, &argv[1]);
2771 
2772 	/* summary_only implies summary option, but don't overwrite summary if set */
2773 	if (trace.summary_only)
2774 		trace.summary = trace.summary_only;
2775 
2776 	if (!trace.trace_syscalls && !trace.trace_pgfaults &&
2777 	    trace.evlist->nr_entries == 0 /* Was --events used? */) {
2778 		pr_err("Please specify something to trace.\n");
2779 		return -1;
2780 	}
2781 
2782 	if (output_name != NULL) {
2783 		err = trace__open_output(&trace, output_name);
2784 		if (err < 0) {
2785 			perror("failed to create output file");
2786 			goto out;
2787 		}
2788 	}
2789 
2790 	if (ev_qualifier_str != NULL) {
2791 		const char *s = ev_qualifier_str;
2792 
2793 		trace.not_ev_qualifier = *s == '!';
2794 		if (trace.not_ev_qualifier)
2795 			++s;
2796 		trace.ev_qualifier = strlist__new(true, s);
2797 		if (trace.ev_qualifier == NULL) {
2798 			fputs("Not enough memory to parse event qualifier",
2799 			      trace.output);
2800 			err = -ENOMEM;
2801 			goto out_close;
2802 		}
2803 	}
2804 
2805 	err = target__validate(&trace.opts.target);
2806 	if (err) {
2807 		target__strerror(&trace.opts.target, err, bf, sizeof(bf));
2808 		fprintf(trace.output, "%s", bf);
2809 		goto out_close;
2810 	}
2811 
2812 	err = target__parse_uid(&trace.opts.target);
2813 	if (err) {
2814 		target__strerror(&trace.opts.target, err, bf, sizeof(bf));
2815 		fprintf(trace.output, "%s", bf);
2816 		goto out_close;
2817 	}
2818 
2819 	if (!argc && target__none(&trace.opts.target))
2820 		trace.opts.target.system_wide = true;
2821 
2822 	if (input_name)
2823 		err = trace__replay(&trace);
2824 	else
2825 		err = trace__run(&trace, argc, argv);
2826 
2827 out_close:
2828 	if (output_name != NULL)
2829 		fclose(trace.output);
2830 out:
2831 	return err;
2832 }
2833