xref: /openbmc/linux/tools/perf/builtin-trace.c (revision 45471cd98decae5fced8b38e46c223f54a924814)
1 #include <traceevent/event-parse.h>
2 #include "builtin.h"
3 #include "util/color.h"
4 #include "util/debug.h"
5 #include "util/evlist.h"
6 #include "util/machine.h"
7 #include "util/session.h"
8 #include "util/thread.h"
9 #include "util/parse-options.h"
10 #include "util/strlist.h"
11 #include "util/intlist.h"
12 #include "util/thread_map.h"
13 #include "util/stat.h"
14 #include "trace-event.h"
15 #include "util/parse-events.h"
16 
17 #include <libaudit.h>
18 #include <stdlib.h>
19 #include <sys/mman.h>
20 #include <linux/futex.h>
21 
22 /* For older distros: */
23 #ifndef MAP_STACK
24 # define MAP_STACK		0x20000
25 #endif
26 
27 #ifndef MADV_HWPOISON
28 # define MADV_HWPOISON		100
29 #endif
30 
31 #ifndef MADV_MERGEABLE
32 # define MADV_MERGEABLE		12
33 #endif
34 
35 #ifndef MADV_UNMERGEABLE
36 # define MADV_UNMERGEABLE	13
37 #endif
38 
39 #ifndef EFD_SEMAPHORE
40 # define EFD_SEMAPHORE		1
41 #endif
42 
43 #ifndef EFD_NONBLOCK
44 # define EFD_NONBLOCK		00004000
45 #endif
46 
47 #ifndef EFD_CLOEXEC
48 # define EFD_CLOEXEC		02000000
49 #endif
50 
51 #ifndef O_CLOEXEC
52 # define O_CLOEXEC		02000000
53 #endif
54 
55 #ifndef SOCK_DCCP
56 # define SOCK_DCCP		6
57 #endif
58 
59 #ifndef SOCK_CLOEXEC
60 # define SOCK_CLOEXEC		02000000
61 #endif
62 
63 #ifndef SOCK_NONBLOCK
64 # define SOCK_NONBLOCK		00004000
65 #endif
66 
67 #ifndef MSG_CMSG_CLOEXEC
68 # define MSG_CMSG_CLOEXEC	0x40000000
69 #endif
70 
71 #ifndef PERF_FLAG_FD_NO_GROUP
72 # define PERF_FLAG_FD_NO_GROUP		(1UL << 0)
73 #endif
74 
75 #ifndef PERF_FLAG_FD_OUTPUT
76 # define PERF_FLAG_FD_OUTPUT		(1UL << 1)
77 #endif
78 
79 #ifndef PERF_FLAG_PID_CGROUP
80 # define PERF_FLAG_PID_CGROUP		(1UL << 2) /* pid=cgroup id, per-cpu mode only */
81 #endif
82 
83 #ifndef PERF_FLAG_FD_CLOEXEC
84 # define PERF_FLAG_FD_CLOEXEC		(1UL << 3) /* O_CLOEXEC */
85 #endif
86 
87 
88 struct tp_field {
89 	int offset;
90 	union {
91 		u64 (*integer)(struct tp_field *field, struct perf_sample *sample);
92 		void *(*pointer)(struct tp_field *field, struct perf_sample *sample);
93 	};
94 };
95 
96 #define TP_UINT_FIELD(bits) \
97 static u64 tp_field__u##bits(struct tp_field *field, struct perf_sample *sample) \
98 { \
99 	u##bits value; \
100 	memcpy(&value, sample->raw_data + field->offset, sizeof(value)); \
101 	return value;  \
102 }
103 
104 TP_UINT_FIELD(8);
105 TP_UINT_FIELD(16);
106 TP_UINT_FIELD(32);
107 TP_UINT_FIELD(64);
108 
109 #define TP_UINT_FIELD__SWAPPED(bits) \
110 static u64 tp_field__swapped_u##bits(struct tp_field *field, struct perf_sample *sample) \
111 { \
112 	u##bits value; \
113 	memcpy(&value, sample->raw_data + field->offset, sizeof(value)); \
114 	return bswap_##bits(value);\
115 }
116 
117 TP_UINT_FIELD__SWAPPED(16);
118 TP_UINT_FIELD__SWAPPED(32);
119 TP_UINT_FIELD__SWAPPED(64);
120 
121 static int tp_field__init_uint(struct tp_field *field,
122 			       struct format_field *format_field,
123 			       bool needs_swap)
124 {
125 	field->offset = format_field->offset;
126 
127 	switch (format_field->size) {
128 	case 1:
129 		field->integer = tp_field__u8;
130 		break;
131 	case 2:
132 		field->integer = needs_swap ? tp_field__swapped_u16 : tp_field__u16;
133 		break;
134 	case 4:
135 		field->integer = needs_swap ? tp_field__swapped_u32 : tp_field__u32;
136 		break;
137 	case 8:
138 		field->integer = needs_swap ? tp_field__swapped_u64 : tp_field__u64;
139 		break;
140 	default:
141 		return -1;
142 	}
143 
144 	return 0;
145 }
146 
147 static void *tp_field__ptr(struct tp_field *field, struct perf_sample *sample)
148 {
149 	return sample->raw_data + field->offset;
150 }
151 
152 static int tp_field__init_ptr(struct tp_field *field, struct format_field *format_field)
153 {
154 	field->offset = format_field->offset;
155 	field->pointer = tp_field__ptr;
156 	return 0;
157 }
158 
159 struct syscall_tp {
160 	struct tp_field id;
161 	union {
162 		struct tp_field args, ret;
163 	};
164 };
165 
166 static int perf_evsel__init_tp_uint_field(struct perf_evsel *evsel,
167 					  struct tp_field *field,
168 					  const char *name)
169 {
170 	struct format_field *format_field = perf_evsel__field(evsel, name);
171 
172 	if (format_field == NULL)
173 		return -1;
174 
175 	return tp_field__init_uint(field, format_field, evsel->needs_swap);
176 }
177 
178 #define perf_evsel__init_sc_tp_uint_field(evsel, name) \
179 	({ struct syscall_tp *sc = evsel->priv;\
180 	   perf_evsel__init_tp_uint_field(evsel, &sc->name, #name); })
181 
182 static int perf_evsel__init_tp_ptr_field(struct perf_evsel *evsel,
183 					 struct tp_field *field,
184 					 const char *name)
185 {
186 	struct format_field *format_field = perf_evsel__field(evsel, name);
187 
188 	if (format_field == NULL)
189 		return -1;
190 
191 	return tp_field__init_ptr(field, format_field);
192 }
193 
194 #define perf_evsel__init_sc_tp_ptr_field(evsel, name) \
195 	({ struct syscall_tp *sc = evsel->priv;\
196 	   perf_evsel__init_tp_ptr_field(evsel, &sc->name, #name); })
197 
198 static void perf_evsel__delete_priv(struct perf_evsel *evsel)
199 {
200 	zfree(&evsel->priv);
201 	perf_evsel__delete(evsel);
202 }
203 
204 static int perf_evsel__init_syscall_tp(struct perf_evsel *evsel, void *handler)
205 {
206 	evsel->priv = malloc(sizeof(struct syscall_tp));
207 	if (evsel->priv != NULL) {
208 		if (perf_evsel__init_sc_tp_uint_field(evsel, id))
209 			goto out_delete;
210 
211 		evsel->handler = handler;
212 		return 0;
213 	}
214 
215 	return -ENOMEM;
216 
217 out_delete:
218 	zfree(&evsel->priv);
219 	return -ENOENT;
220 }
221 
222 static struct perf_evsel *perf_evsel__syscall_newtp(const char *direction, void *handler)
223 {
224 	struct perf_evsel *evsel = perf_evsel__newtp("raw_syscalls", direction);
225 
226 	/* older kernels (e.g., RHEL6) use syscalls:{enter,exit} */
227 	if (evsel == NULL)
228 		evsel = perf_evsel__newtp("syscalls", direction);
229 
230 	if (evsel) {
231 		if (perf_evsel__init_syscall_tp(evsel, handler))
232 			goto out_delete;
233 	}
234 
235 	return evsel;
236 
237 out_delete:
238 	perf_evsel__delete_priv(evsel);
239 	return NULL;
240 }
241 
242 #define perf_evsel__sc_tp_uint(evsel, name, sample) \
243 	({ struct syscall_tp *fields = evsel->priv; \
244 	   fields->name.integer(&fields->name, sample); })
245 
246 #define perf_evsel__sc_tp_ptr(evsel, name, sample) \
247 	({ struct syscall_tp *fields = evsel->priv; \
248 	   fields->name.pointer(&fields->name, sample); })
249 
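/*
 * Illustrative use of the accessors above: the sys_enter/sys_exit handlers
 * further down read tracepoint fields via, e.g.,
 *
 *	int id     = perf_evsel__sc_tp_uint(evsel, id, sample);
 *	void *args = perf_evsel__sc_tp_ptr(evsel, args, sample);
 *
 * with the offsets and (possibly byte-swapped) readers resolved once at
 * evsel creation time (see perf_evlist__add_syscall_newtp()).
 */
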
250 static int perf_evlist__add_syscall_newtp(struct perf_evlist *evlist,
251 					  void *sys_enter_handler,
252 					  void *sys_exit_handler)
253 {
254 	int ret = -1;
255 	struct perf_evsel *sys_enter, *sys_exit;
256 
257 	sys_enter = perf_evsel__syscall_newtp("sys_enter", sys_enter_handler);
258 	if (sys_enter == NULL)
259 		goto out;
260 
261 	if (perf_evsel__init_sc_tp_ptr_field(sys_enter, args))
262 		goto out_delete_sys_enter;
263 
264 	sys_exit = perf_evsel__syscall_newtp("sys_exit", sys_exit_handler);
265 	if (sys_exit == NULL)
266 		goto out_delete_sys_enter;
267 
268 	if (perf_evsel__init_sc_tp_uint_field(sys_exit, ret))
269 		goto out_delete_sys_exit;
270 
271 	perf_evlist__add(evlist, sys_enter);
272 	perf_evlist__add(evlist, sys_exit);
273 
274 	ret = 0;
275 out:
276 	return ret;
277 
278 out_delete_sys_exit:
279 	perf_evsel__delete_priv(sys_exit);
280 out_delete_sys_enter:
281 	perf_evsel__delete_priv(sys_enter);
282 	goto out;
283 }
284 
285 
286 struct syscall_arg {
287 	unsigned long val;
288 	struct thread *thread;
289 	struct trace  *trace;
290 	void	      *parm;
291 	u8	      idx;
292 	u8	      mask;
293 };
294 
295 struct strarray {
296 	int	    offset;
297 	int	    nr_entries;
298 	const char **entries;
299 };
300 
301 #define DEFINE_STRARRAY(array) struct strarray strarray__##array = { \
302 	.nr_entries = ARRAY_SIZE(array), \
303 	.entries = array, \
304 }
305 
306 #define DEFINE_STRARRAY_OFFSET(array, off) struct strarray strarray__##array = { \
307 	.offset	    = off, \
308 	.nr_entries = ARRAY_SIZE(array), \
309 	.entries = array, \
310 }
311 
312 static size_t __syscall_arg__scnprintf_strarray(char *bf, size_t size,
313 						const char *intfmt,
314 					        struct syscall_arg *arg)
315 {
316 	struct strarray *sa = arg->parm;
317 	int idx = arg->val - sa->offset;
318 
319 	if (idx < 0 || idx >= sa->nr_entries)
320 		return scnprintf(bf, size, intfmt, arg->val);
321 
322 	return scnprintf(bf, size, "%s", sa->entries[idx]);
323 }
324 
325 static size_t syscall_arg__scnprintf_strarray(char *bf, size_t size,
326 					      struct syscall_arg *arg)
327 {
328 	return __syscall_arg__scnprintf_strarray(bf, size, "%d", arg);
329 }
330 
331 #define SCA_STRARRAY syscall_arg__scnprintf_strarray
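
/*
 * Example (illustrative): with the epoll_ctl_ops strarray defined further
 * down,
 *
 *	static const char *epoll_ctl_ops[] = { "ADD", "DEL", "MOD", };
 *	static DEFINE_STRARRAY_OFFSET(epoll_ctl_ops, 1);
 *
 * an op value of 1 (EPOLL_CTL_ADD) is printed as "ADD", while values outside
 * [offset, offset + nr_entries) fall back to the plain integer format.
 */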
332 
333 #if defined(__i386__) || defined(__x86_64__)
334 /*
335  * FIXME: Make this available to all arches as soon as the ioctl beautifier
336  * 	  gets rewritten to support all arches.
337  */
338 static size_t syscall_arg__scnprintf_strhexarray(char *bf, size_t size,
339 						 struct syscall_arg *arg)
340 {
341 	return __syscall_arg__scnprintf_strarray(bf, size, "%#x", arg);
342 }
343 
344 #define SCA_STRHEXARRAY syscall_arg__scnprintf_strhexarray
345 #endif /* defined(__i386__) || defined(__x86_64__) */
346 
347 static size_t syscall_arg__scnprintf_fd(char *bf, size_t size,
348 					struct syscall_arg *arg);
349 
350 #define SCA_FD syscall_arg__scnprintf_fd
351 
352 static size_t syscall_arg__scnprintf_fd_at(char *bf, size_t size,
353 					   struct syscall_arg *arg)
354 {
355 	int fd = arg->val;
356 
357 	if (fd == AT_FDCWD)
358 		return scnprintf(bf, size, "CWD");
359 
360 	return syscall_arg__scnprintf_fd(bf, size, arg);
361 }
362 
363 #define SCA_FDAT syscall_arg__scnprintf_fd_at
364 
365 static size_t syscall_arg__scnprintf_close_fd(char *bf, size_t size,
366 					      struct syscall_arg *arg);
367 
368 #define SCA_CLOSE_FD syscall_arg__scnprintf_close_fd
369 
370 static size_t syscall_arg__scnprintf_hex(char *bf, size_t size,
371 					 struct syscall_arg *arg)
372 {
373 	return scnprintf(bf, size, "%#lx", arg->val);
374 }
375 
376 #define SCA_HEX syscall_arg__scnprintf_hex
377 
378 static size_t syscall_arg__scnprintf_int(char *bf, size_t size,
379 					 struct syscall_arg *arg)
380 {
381 	return scnprintf(bf, size, "%d", arg->val);
382 }
383 
384 #define SCA_INT syscall_arg__scnprintf_int
385 
386 static size_t syscall_arg__scnprintf_mmap_prot(char *bf, size_t size,
387 					       struct syscall_arg *arg)
388 {
389 	int printed = 0, prot = arg->val;
390 
391 	if (prot == PROT_NONE)
392 		return scnprintf(bf, size, "NONE");
393 #define	P_MMAP_PROT(n) \
394 	if (prot & PROT_##n) { \
395 		printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
396 		prot &= ~PROT_##n; \
397 	}
398 
399 	P_MMAP_PROT(EXEC);
400 	P_MMAP_PROT(READ);
401 	P_MMAP_PROT(WRITE);
402 #ifdef PROT_SEM
403 	P_MMAP_PROT(SEM);
404 #endif
405 	P_MMAP_PROT(GROWSDOWN);
406 	P_MMAP_PROT(GROWSUP);
407 #undef P_MMAP_PROT
408 
409 	if (prot)
410 		printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", prot);
411 
412 	return printed;
413 }
414 
415 #define SCA_MMAP_PROT syscall_arg__scnprintf_mmap_prot
416 
417 static size_t syscall_arg__scnprintf_mmap_flags(char *bf, size_t size,
418 						struct syscall_arg *arg)
419 {
420 	int printed = 0, flags = arg->val;
421 
422 #define	P_MMAP_FLAG(n) \
423 	if (flags & MAP_##n) { \
424 		printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
425 		flags &= ~MAP_##n; \
426 	}
427 
428 	P_MMAP_FLAG(SHARED);
429 	P_MMAP_FLAG(PRIVATE);
430 #ifdef MAP_32BIT
431 	P_MMAP_FLAG(32BIT);
432 #endif
433 	P_MMAP_FLAG(ANONYMOUS);
434 	P_MMAP_FLAG(DENYWRITE);
435 	P_MMAP_FLAG(EXECUTABLE);
436 	P_MMAP_FLAG(FILE);
437 	P_MMAP_FLAG(FIXED);
438 	P_MMAP_FLAG(GROWSDOWN);
439 #ifdef MAP_HUGETLB
440 	P_MMAP_FLAG(HUGETLB);
441 #endif
442 	P_MMAP_FLAG(LOCKED);
443 	P_MMAP_FLAG(NONBLOCK);
444 	P_MMAP_FLAG(NORESERVE);
445 	P_MMAP_FLAG(POPULATE);
446 	P_MMAP_FLAG(STACK);
447 #ifdef MAP_UNINITIALIZED
448 	P_MMAP_FLAG(UNINITIALIZED);
449 #endif
450 #undef P_MMAP_FLAG
451 
452 	if (flags)
453 		printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);
454 
455 	return printed;
456 }
457 
458 #define SCA_MMAP_FLAGS syscall_arg__scnprintf_mmap_flags
459 
460 static size_t syscall_arg__scnprintf_mremap_flags(char *bf, size_t size,
461 						  struct syscall_arg *arg)
462 {
463 	int printed = 0, flags = arg->val;
464 
465 #define P_MREMAP_FLAG(n) \
466 	if (flags & MREMAP_##n) { \
467 		printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
468 		flags &= ~MREMAP_##n; \
469 	}
470 
471 	P_MREMAP_FLAG(MAYMOVE);
472 #ifdef MREMAP_FIXED
473 	P_MREMAP_FLAG(FIXED);
474 #endif
475 #undef P_MREMAP_FLAG
476 
477 	if (flags)
478 		printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);
479 
480 	return printed;
481 }
482 
483 #define SCA_MREMAP_FLAGS syscall_arg__scnprintf_mremap_flags
484 
485 static size_t syscall_arg__scnprintf_madvise_behavior(char *bf, size_t size,
486 						      struct syscall_arg *arg)
487 {
488 	int behavior = arg->val;
489 
490 	switch (behavior) {
491 #define	P_MADV_BHV(n) case MADV_##n: return scnprintf(bf, size, #n)
492 	P_MADV_BHV(NORMAL);
493 	P_MADV_BHV(RANDOM);
494 	P_MADV_BHV(SEQUENTIAL);
495 	P_MADV_BHV(WILLNEED);
496 	P_MADV_BHV(DONTNEED);
497 	P_MADV_BHV(REMOVE);
498 	P_MADV_BHV(DONTFORK);
499 	P_MADV_BHV(DOFORK);
500 	P_MADV_BHV(HWPOISON);
501 #ifdef MADV_SOFT_OFFLINE
502 	P_MADV_BHV(SOFT_OFFLINE);
503 #endif
504 	P_MADV_BHV(MERGEABLE);
505 	P_MADV_BHV(UNMERGEABLE);
506 #ifdef MADV_HUGEPAGE
507 	P_MADV_BHV(HUGEPAGE);
508 #endif
509 #ifdef MADV_NOHUGEPAGE
510 	P_MADV_BHV(NOHUGEPAGE);
511 #endif
512 #ifdef MADV_DONTDUMP
513 	P_MADV_BHV(DONTDUMP);
514 #endif
515 #ifdef MADV_DODUMP
516 	P_MADV_BHV(DODUMP);
517 #endif
518 #undef P_MADV_BHV
519 	default: break;
520 	}
521 
522 	return scnprintf(bf, size, "%#x", behavior);
523 }
524 
525 #define SCA_MADV_BHV syscall_arg__scnprintf_madvise_behavior
526 
527 static size_t syscall_arg__scnprintf_flock(char *bf, size_t size,
528 					   struct syscall_arg *arg)
529 {
530 	int printed = 0, op = arg->val;
531 
532 	if (op == 0)
533 		return scnprintf(bf, size, "NONE");
534 #define	P_CMD(cmd) \
535 	if ((op & LOCK_##cmd) == LOCK_##cmd) { \
536 		printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #cmd); \
537 		op &= ~LOCK_##cmd; \
538 	}
539 
540 	P_CMD(SH);
541 	P_CMD(EX);
542 	P_CMD(NB);
543 	P_CMD(UN);
544 	P_CMD(MAND);
545 	P_CMD(RW);
546 	P_CMD(READ);
547 	P_CMD(WRITE);
548 #undef P_CMD
549 
550 	if (op)
551 		printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", op);
552 
553 	return printed;
554 }
555 
556 #define SCA_FLOCK syscall_arg__scnprintf_flock
557 
558 static size_t syscall_arg__scnprintf_futex_op(char *bf, size_t size, struct syscall_arg *arg)
559 {
560 	enum syscall_futex_args {
561 		SCF_UADDR   = (1 << 0),
562 		SCF_OP	    = (1 << 1),
563 		SCF_VAL	    = (1 << 2),
564 		SCF_TIMEOUT = (1 << 3),
565 		SCF_UADDR2  = (1 << 4),
566 		SCF_VAL3    = (1 << 5),
567 	};
568 	int op = arg->val;
569 	int cmd = op & FUTEX_CMD_MASK;
570 	size_t printed = 0;
571 
572 	switch (cmd) {
573 #define	P_FUTEX_OP(n) case FUTEX_##n: printed = scnprintf(bf, size, #n);
574 	P_FUTEX_OP(WAIT);	    arg->mask |= SCF_VAL3|SCF_UADDR2;		  break;
575 	P_FUTEX_OP(WAKE);	    arg->mask |= SCF_VAL3|SCF_UADDR2|SCF_TIMEOUT; break;
576 	P_FUTEX_OP(FD);		    arg->mask |= SCF_VAL3|SCF_UADDR2|SCF_TIMEOUT; break;
577 	P_FUTEX_OP(REQUEUE);	    arg->mask |= SCF_VAL3|SCF_TIMEOUT;	          break;
578 	P_FUTEX_OP(CMP_REQUEUE);    arg->mask |= SCF_TIMEOUT;			  break;
579 	P_FUTEX_OP(CMP_REQUEUE_PI); arg->mask |= SCF_TIMEOUT;			  break;
580 	P_FUTEX_OP(WAKE_OP);							  break;
581 	P_FUTEX_OP(LOCK_PI);	    arg->mask |= SCF_VAL3|SCF_UADDR2|SCF_TIMEOUT; break;
582 	P_FUTEX_OP(UNLOCK_PI);	    arg->mask |= SCF_VAL3|SCF_UADDR2|SCF_TIMEOUT; break;
583 	P_FUTEX_OP(TRYLOCK_PI);	    arg->mask |= SCF_VAL3|SCF_UADDR2;		  break;
584 	P_FUTEX_OP(WAIT_BITSET);    arg->mask |= SCF_UADDR2;			  break;
585 	P_FUTEX_OP(WAKE_BITSET);    arg->mask |= SCF_UADDR2;			  break;
586 	P_FUTEX_OP(WAIT_REQUEUE_PI);						  break;
587 	default: printed = scnprintf(bf, size, "%#x", cmd);			  break;
588 	}
589 
590 	if (op & FUTEX_PRIVATE_FLAG)
591 		printed += scnprintf(bf + printed, size - printed, "|PRIV");
592 
593 	if (op & FUTEX_CLOCK_REALTIME)
594 		printed += scnprintf(bf + printed, size - printed, "|CLKRT");
595 
596 	return printed;
597 }
598 
599 #define SCA_FUTEX_OP  syscall_arg__scnprintf_futex_op
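
/*
 * Note on arg->mask (illustrative): each bit corresponds to an argument
 * index, and syscall__scnprintf_args() skips any argument whose bit is set.
 * E.g. for FUTEX_WAKE the mask gains SCF_TIMEOUT|SCF_UADDR2|SCF_VAL3, so
 * only uaddr, op and val end up in the formatted string.
 */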
600 
601 static const char *epoll_ctl_ops[] = { "ADD", "DEL", "MOD", };
602 static DEFINE_STRARRAY_OFFSET(epoll_ctl_ops, 1);
603 
604 static const char *itimers[] = { "REAL", "VIRTUAL", "PROF", };
605 static DEFINE_STRARRAY(itimers);
606 
607 static const char *whences[] = { "SET", "CUR", "END",
608 #ifdef SEEK_DATA
609 "DATA",
610 #endif
611 #ifdef SEEK_HOLE
612 "HOLE",
613 #endif
614 };
615 static DEFINE_STRARRAY(whences);
616 
617 static const char *fcntl_cmds[] = {
618 	"DUPFD", "GETFD", "SETFD", "GETFL", "SETFL", "GETLK", "SETLK",
619 	"SETLKW", "SETOWN", "GETOWN", "SETSIG", "GETSIG", "F_GETLK64",
620 	"F_SETLK64", "F_SETLKW64", "F_SETOWN_EX", "F_GETOWN_EX",
621 	"F_GETOWNER_UIDS",
622 };
623 static DEFINE_STRARRAY(fcntl_cmds);
624 
625 static const char *rlimit_resources[] = {
626 	"CPU", "FSIZE", "DATA", "STACK", "CORE", "RSS", "NPROC", "NOFILE",
627 	"MEMLOCK", "AS", "LOCKS", "SIGPENDING", "MSGQUEUE", "NICE", "RTPRIO",
628 	"RTTIME",
629 };
630 static DEFINE_STRARRAY(rlimit_resources);
631 
632 static const char *sighow[] = { "BLOCK", "UNBLOCK", "SETMASK", };
633 static DEFINE_STRARRAY(sighow);
634 
635 static const char *clockid[] = {
636 	"REALTIME", "MONOTONIC", "PROCESS_CPUTIME_ID", "THREAD_CPUTIME_ID",
637 	"MONOTONIC_RAW", "REALTIME_COARSE", "MONOTONIC_COARSE",
638 };
639 static DEFINE_STRARRAY(clockid);
640 
641 static const char *socket_families[] = {
642 	"UNSPEC", "LOCAL", "INET", "AX25", "IPX", "APPLETALK", "NETROM",
643 	"BRIDGE", "ATMPVC", "X25", "INET6", "ROSE", "DECnet", "NETBEUI",
644 	"SECURITY", "KEY", "NETLINK", "PACKET", "ASH", "ECONET", "ATMSVC",
645 	"RDS", "SNA", "IRDA", "PPPOX", "WANPIPE", "LLC", "IB", "CAN", "TIPC",
646 	"BLUETOOTH", "IUCV", "RXRPC", "ISDN", "PHONET", "IEEE802154", "CAIF",
647 	"ALG", "NFC", "VSOCK",
648 };
649 static DEFINE_STRARRAY(socket_families);
650 
651 #ifndef SOCK_TYPE_MASK
652 #define SOCK_TYPE_MASK 0xf
653 #endif
654 
655 static size_t syscall_arg__scnprintf_socket_type(char *bf, size_t size,
656 						      struct syscall_arg *arg)
657 {
658 	size_t printed;
659 	int type = arg->val,
660 	    flags = type & ~SOCK_TYPE_MASK;
661 
662 	type &= SOCK_TYPE_MASK;
663 	/*
664 	 * Can't use a strarray: MIPS may override SOCK_* values for ABI reasons.
665 	 */
666 	switch (type) {
667 #define	P_SK_TYPE(n) case SOCK_##n: printed = scnprintf(bf, size, #n); break;
668 	P_SK_TYPE(STREAM);
669 	P_SK_TYPE(DGRAM);
670 	P_SK_TYPE(RAW);
671 	P_SK_TYPE(RDM);
672 	P_SK_TYPE(SEQPACKET);
673 	P_SK_TYPE(DCCP);
674 	P_SK_TYPE(PACKET);
675 #undef P_SK_TYPE
676 	default:
677 		printed = scnprintf(bf, size, "%#x", type);
678 	}
679 
680 #define	P_SK_FLAG(n) \
681 	if (flags & SOCK_##n) { \
682 		printed += scnprintf(bf + printed, size - printed, "|%s", #n); \
683 		flags &= ~SOCK_##n; \
684 	}
685 
686 	P_SK_FLAG(CLOEXEC);
687 	P_SK_FLAG(NONBLOCK);
688 #undef P_SK_FLAG
689 
690 	if (flags)
691 		printed += scnprintf(bf + printed, size - printed, "|%#x", flags);
692 
693 	return printed;
694 }
695 
696 #define SCA_SK_TYPE syscall_arg__scnprintf_socket_type
697 
698 #ifndef MSG_PROBE
699 #define MSG_PROBE	     0x10
700 #endif
701 #ifndef MSG_WAITFORONE
702 #define MSG_WAITFORONE	0x10000
703 #endif
704 #ifndef MSG_SENDPAGE_NOTLAST
705 #define MSG_SENDPAGE_NOTLAST 0x20000
706 #endif
707 #ifndef MSG_FASTOPEN
708 #define MSG_FASTOPEN	     0x20000000
709 #endif
710 
711 static size_t syscall_arg__scnprintf_msg_flags(char *bf, size_t size,
712 					       struct syscall_arg *arg)
713 {
714 	int printed = 0, flags = arg->val;
715 
716 	if (flags == 0)
717 		return scnprintf(bf, size, "NONE");
718 #define	P_MSG_FLAG(n) \
719 	if (flags & MSG_##n) { \
720 		printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
721 		flags &= ~MSG_##n; \
722 	}
723 
724 	P_MSG_FLAG(OOB);
725 	P_MSG_FLAG(PEEK);
726 	P_MSG_FLAG(DONTROUTE);
727 	P_MSG_FLAG(TRYHARD);
728 	P_MSG_FLAG(CTRUNC);
729 	P_MSG_FLAG(PROBE);
730 	P_MSG_FLAG(TRUNC);
731 	P_MSG_FLAG(DONTWAIT);
732 	P_MSG_FLAG(EOR);
733 	P_MSG_FLAG(WAITALL);
734 	P_MSG_FLAG(FIN);
735 	P_MSG_FLAG(SYN);
736 	P_MSG_FLAG(CONFIRM);
737 	P_MSG_FLAG(RST);
738 	P_MSG_FLAG(ERRQUEUE);
739 	P_MSG_FLAG(NOSIGNAL);
740 	P_MSG_FLAG(MORE);
741 	P_MSG_FLAG(WAITFORONE);
742 	P_MSG_FLAG(SENDPAGE_NOTLAST);
743 	P_MSG_FLAG(FASTOPEN);
744 	P_MSG_FLAG(CMSG_CLOEXEC);
745 #undef P_MSG_FLAG
746 
747 	if (flags)
748 		printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);
749 
750 	return printed;
751 }
752 
753 #define SCA_MSG_FLAGS syscall_arg__scnprintf_msg_flags
754 
755 static size_t syscall_arg__scnprintf_access_mode(char *bf, size_t size,
756 						 struct syscall_arg *arg)
757 {
758 	size_t printed = 0;
759 	int mode = arg->val;
760 
761 	if (mode == F_OK) /* 0 */
762 		return scnprintf(bf, size, "F");
763 #define	P_MODE(n) \
764 	if (mode & n##_OK) { \
765 		printed += scnprintf(bf + printed, size - printed, "%s", #n); \
766 		mode &= ~n##_OK; \
767 	}
768 
769 	P_MODE(R);
770 	P_MODE(W);
771 	P_MODE(X);
772 #undef P_MODE
773 
774 	if (mode)
775 		printed += scnprintf(bf + printed, size - printed, "|%#x", mode);
776 
777 	return printed;
778 }
779 
780 #define SCA_ACCMODE syscall_arg__scnprintf_access_mode
781 
782 static size_t syscall_arg__scnprintf_open_flags(char *bf, size_t size,
783 					       struct syscall_arg *arg)
784 {
785 	int printed = 0, flags = arg->val;
786 
787 	if (!(flags & O_CREAT))
788 		arg->mask |= 1 << (arg->idx + 1); /* Mask the mode parm */
789 
790 	if (flags == 0)
791 		return scnprintf(bf, size, "RDONLY");
792 #define	P_FLAG(n) \
793 	if (flags & O_##n) { \
794 		printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
795 		flags &= ~O_##n; \
796 	}
797 
798 	P_FLAG(APPEND);
799 	P_FLAG(ASYNC);
800 	P_FLAG(CLOEXEC);
801 	P_FLAG(CREAT);
802 	P_FLAG(DIRECT);
803 	P_FLAG(DIRECTORY);
804 	P_FLAG(EXCL);
805 	P_FLAG(LARGEFILE);
806 	P_FLAG(NOATIME);
807 	P_FLAG(NOCTTY);
808 #ifdef O_NONBLOCK
809 	P_FLAG(NONBLOCK);
810 #elif O_NDELAY
811 	P_FLAG(NDELAY);
812 #endif
813 #ifdef O_PATH
814 	P_FLAG(PATH);
815 #endif
816 	P_FLAG(RDWR);
817 #ifdef O_DSYNC
818 	if ((flags & O_SYNC) == O_SYNC)
819 		printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", "SYNC");
820 	else {
821 		P_FLAG(DSYNC);
822 	}
823 #else
824 	P_FLAG(SYNC);
825 #endif
826 	P_FLAG(TRUNC);
827 	P_FLAG(WRONLY);
828 #undef P_FLAG
829 
830 	if (flags)
831 		printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);
832 
833 	return printed;
834 }
835 
836 #define SCA_OPEN_FLAGS syscall_arg__scnprintf_open_flags
837 
838 static size_t syscall_arg__scnprintf_perf_flags(char *bf, size_t size,
839 						struct syscall_arg *arg)
840 {
841 	int printed = 0, flags = arg->val;
842 
843 	if (flags == 0)
844 		return 0;
845 
846 #define	P_FLAG(n) \
847 	if (flags & PERF_FLAG_##n) { \
848 		printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
849 		flags &= ~PERF_FLAG_##n; \
850 	}
851 
852 	P_FLAG(FD_NO_GROUP);
853 	P_FLAG(FD_OUTPUT);
854 	P_FLAG(PID_CGROUP);
855 	P_FLAG(FD_CLOEXEC);
856 #undef P_FLAG
857 
858 	if (flags)
859 		printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);
860 
861 	return printed;
862 }
863 
864 #define SCA_PERF_FLAGS syscall_arg__scnprintf_perf_flags
865 
866 static size_t syscall_arg__scnprintf_eventfd_flags(char *bf, size_t size,
867 						   struct syscall_arg *arg)
868 {
869 	int printed = 0, flags = arg->val;
870 
871 	if (flags == 0)
872 		return scnprintf(bf, size, "NONE");
873 #define	P_FLAG(n) \
874 	if (flags & EFD_##n) { \
875 		printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
876 		flags &= ~EFD_##n; \
877 	}
878 
879 	P_FLAG(SEMAPHORE);
880 	P_FLAG(CLOEXEC);
881 	P_FLAG(NONBLOCK);
882 #undef P_FLAG
883 
884 	if (flags)
885 		printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);
886 
887 	return printed;
888 }
889 
890 #define SCA_EFD_FLAGS syscall_arg__scnprintf_eventfd_flags
891 
892 static size_t syscall_arg__scnprintf_pipe_flags(char *bf, size_t size,
893 						struct syscall_arg *arg)
894 {
895 	int printed = 0, flags = arg->val;
896 
897 #define	P_FLAG(n) \
898 	if (flags & O_##n) { \
899 		printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
900 		flags &= ~O_##n; \
901 	}
902 
903 	P_FLAG(CLOEXEC);
904 	P_FLAG(NONBLOCK);
905 #undef P_FLAG
906 
907 	if (flags)
908 		printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);
909 
910 	return printed;
911 }
912 
913 #define SCA_PIPE_FLAGS syscall_arg__scnprintf_pipe_flags
914 
915 static size_t syscall_arg__scnprintf_signum(char *bf, size_t size, struct syscall_arg *arg)
916 {
917 	int sig = arg->val;
918 
919 	switch (sig) {
920 #define	P_SIGNUM(n) case SIG##n: return scnprintf(bf, size, #n)
921 	P_SIGNUM(HUP);
922 	P_SIGNUM(INT);
923 	P_SIGNUM(QUIT);
924 	P_SIGNUM(ILL);
925 	P_SIGNUM(TRAP);
926 	P_SIGNUM(ABRT);
927 	P_SIGNUM(BUS);
928 	P_SIGNUM(FPE);
929 	P_SIGNUM(KILL);
930 	P_SIGNUM(USR1);
931 	P_SIGNUM(SEGV);
932 	P_SIGNUM(USR2);
933 	P_SIGNUM(PIPE);
934 	P_SIGNUM(ALRM);
935 	P_SIGNUM(TERM);
936 	P_SIGNUM(CHLD);
937 	P_SIGNUM(CONT);
938 	P_SIGNUM(STOP);
939 	P_SIGNUM(TSTP);
940 	P_SIGNUM(TTIN);
941 	P_SIGNUM(TTOU);
942 	P_SIGNUM(URG);
943 	P_SIGNUM(XCPU);
944 	P_SIGNUM(XFSZ);
945 	P_SIGNUM(VTALRM);
946 	P_SIGNUM(PROF);
947 	P_SIGNUM(WINCH);
948 	P_SIGNUM(IO);
949 	P_SIGNUM(PWR);
950 	P_SIGNUM(SYS);
951 #ifdef SIGEMT
952 	P_SIGNUM(EMT);
953 #endif
954 #ifdef SIGSTKFLT
955 	P_SIGNUM(STKFLT);
956 #endif
957 #ifdef SIGSWI
958 	P_SIGNUM(SWI);
959 #endif
960 	default: break;
961 	}
962 
963 	return scnprintf(bf, size, "%#x", sig);
964 }
965 
966 #define SCA_SIGNUM syscall_arg__scnprintf_signum
967 
968 #if defined(__i386__) || defined(__x86_64__)
969 /*
970  * FIXME: Make this available to all arches.
971  */
972 #define TCGETS		0x5401
973 
974 static const char *tioctls[] = {
975 	"TCGETS", "TCSETS", "TCSETSW", "TCSETSF", "TCGETA", "TCSETA", "TCSETAW",
976 	"TCSETAF", "TCSBRK", "TCXONC", "TCFLSH", "TIOCEXCL", "TIOCNXCL",
977 	"TIOCSCTTY", "TIOCGPGRP", "TIOCSPGRP", "TIOCOUTQ", "TIOCSTI",
978 	"TIOCGWINSZ", "TIOCSWINSZ", "TIOCMGET", "TIOCMBIS", "TIOCMBIC",
979 	"TIOCMSET", "TIOCGSOFTCAR", "TIOCSSOFTCAR", "FIONREAD", "TIOCLINUX",
980 	"TIOCCONS", "TIOCGSERIAL", "TIOCSSERIAL", "TIOCPKT", "FIONBIO",
981 	"TIOCNOTTY", "TIOCSETD", "TIOCGETD", "TCSBRKP", [0x27] = "TIOCSBRK",
982 	"TIOCCBRK", "TIOCGSID", "TCGETS2", "TCSETS2", "TCSETSW2", "TCSETSF2",
983 	"TIOCGRS485", "TIOCSRS485", "TIOCGPTN", "TIOCSPTLCK",
984 	"TIOCGDEV||TCGETX", "TCSETX", "TCSETXF", "TCSETXW", "TIOCSIG",
985 	"TIOCVHANGUP", "TIOCGPKT", "TIOCGPTLCK", "TIOCGEXCL",
986 	[0x50] = "FIONCLEX", "FIOCLEX", "FIOASYNC", "TIOCSERCONFIG",
987 	"TIOCSERGWILD", "TIOCSERSWILD", "TIOCGLCKTRMIOS", "TIOCSLCKTRMIOS",
988 	"TIOCSERGSTRUCT", "TIOCSERGETLSR", "TIOCSERGETMULTI", "TIOCSERSETMULTI",
989 	"TIOCMIWAIT", "TIOCGICOUNT", [0x60] = "FIOQSIZE",
990 };
991 
992 static DEFINE_STRARRAY_OFFSET(tioctls, 0x5401);
993 #endif /* defined(__i386__) || defined(__x86_64__) */
994 
995 #define STRARRAY(arg, name, array) \
996 	  .arg_scnprintf = { [arg] = SCA_STRARRAY, }, \
997 	  .arg_parm	 = { [arg] = &strarray__##array, }
998 
999 static struct syscall_fmt {
1000 	const char *name;
1001 	const char *alias;
1002 	size_t	   (*arg_scnprintf[6])(char *bf, size_t size, struct syscall_arg *arg);
1003 	void	   *arg_parm[6];
1004 	bool	   errmsg;
1005 	bool	   timeout;
1006 	bool	   hexret;
1007 } syscall_fmts[] = {
1008 	{ .name	    = "access",	    .errmsg = true,
1009 	  .arg_scnprintf = { [1] = SCA_ACCMODE, /* mode */ }, },
1010 	{ .name	    = "arch_prctl", .errmsg = true, .alias = "prctl", },
1011 	{ .name	    = "brk",	    .hexret = true,
1012 	  .arg_scnprintf = { [0] = SCA_HEX, /* brk */ }, },
1013 	{ .name     = "clock_gettime",  .errmsg = true, STRARRAY(0, clk_id, clockid), },
1014 	{ .name	    = "close",	    .errmsg = true,
1015 	  .arg_scnprintf = { [0] = SCA_CLOSE_FD, /* fd */ }, },
1016 	{ .name	    = "connect",    .errmsg = true, },
1017 	{ .name	    = "dup",	    .errmsg = true,
1018 	  .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1019 	{ .name	    = "dup2",	    .errmsg = true,
1020 	  .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1021 	{ .name	    = "dup3",	    .errmsg = true,
1022 	  .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1023 	{ .name	    = "epoll_ctl",  .errmsg = true, STRARRAY(1, op, epoll_ctl_ops), },
1024 	{ .name	    = "eventfd2",   .errmsg = true,
1025 	  .arg_scnprintf = { [1] = SCA_EFD_FLAGS, /* flags */ }, },
1026 	{ .name	    = "faccessat",  .errmsg = true,
1027 	  .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
1028 	{ .name	    = "fadvise64",  .errmsg = true,
1029 	  .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1030 	{ .name	    = "fallocate",  .errmsg = true,
1031 	  .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1032 	{ .name	    = "fchdir",	    .errmsg = true,
1033 	  .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1034 	{ .name	    = "fchmod",	    .errmsg = true,
1035 	  .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1036 	{ .name	    = "fchmodat",   .errmsg = true,
1037 	  .arg_scnprintf = { [0] = SCA_FDAT, /* fd */ }, },
1038 	{ .name	    = "fchown",	    .errmsg = true,
1039 	  .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1040 	{ .name	    = "fchownat",   .errmsg = true,
1041 	  .arg_scnprintf = { [0] = SCA_FDAT, /* fd */ }, },
1042 	{ .name	    = "fcntl",	    .errmsg = true,
1043 	  .arg_scnprintf = { [0] = SCA_FD, /* fd */
1044 			     [1] = SCA_STRARRAY, /* cmd */ },
1045 	  .arg_parm	 = { [1] = &strarray__fcntl_cmds, /* cmd */ }, },
1046 	{ .name	    = "fdatasync",  .errmsg = true,
1047 	  .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1048 	{ .name	    = "flock",	    .errmsg = true,
1049 	  .arg_scnprintf = { [0] = SCA_FD, /* fd */
1050 			     [1] = SCA_FLOCK, /* cmd */ }, },
1051 	{ .name	    = "fsetxattr",  .errmsg = true,
1052 	  .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1053 	{ .name	    = "fstat",	    .errmsg = true, .alias = "newfstat",
1054 	  .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1055 	{ .name	    = "fstatat",    .errmsg = true, .alias = "newfstatat",
1056 	  .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
1057 	{ .name	    = "fstatfs",    .errmsg = true,
1058 	  .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1059 	{ .name	    = "fsync",    .errmsg = true,
1060 	  .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1061 	{ .name	    = "ftruncate", .errmsg = true,
1062 	  .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1063 	{ .name	    = "futex",	    .errmsg = true,
1064 	  .arg_scnprintf = { [1] = SCA_FUTEX_OP, /* op */ }, },
1065 	{ .name	    = "futimesat", .errmsg = true,
1066 	  .arg_scnprintf = { [0] = SCA_FDAT, /* fd */ }, },
1067 	{ .name	    = "getdents",   .errmsg = true,
1068 	  .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1069 	{ .name	    = "getdents64", .errmsg = true,
1070 	  .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1071 	{ .name	    = "getitimer",  .errmsg = true, STRARRAY(0, which, itimers), },
1072 	{ .name	    = "getrlimit",  .errmsg = true, STRARRAY(0, resource, rlimit_resources), },
1073 	{ .name	    = "ioctl",	    .errmsg = true,
1074 	  .arg_scnprintf = { [0] = SCA_FD, /* fd */
1075 #if defined(__i386__) || defined(__x86_64__)
1076 /*
1077  * FIXME: Make this available to all arches.
1078  */
1079 			     [1] = SCA_STRHEXARRAY, /* cmd */
1080 			     [2] = SCA_HEX, /* arg */ },
1081 	  .arg_parm	 = { [1] = &strarray__tioctls, /* cmd */ }, },
1082 #else
1083 			     [2] = SCA_HEX, /* arg */ }, },
1084 #endif
1085 	{ .name	    = "kill",	    .errmsg = true,
1086 	  .arg_scnprintf = { [1] = SCA_SIGNUM, /* sig */ }, },
1087 	{ .name	    = "linkat",	    .errmsg = true,
1088 	  .arg_scnprintf = { [0] = SCA_FDAT, /* fd */ }, },
1089 	{ .name	    = "lseek",	    .errmsg = true,
1090 	  .arg_scnprintf = { [0] = SCA_FD, /* fd */
1091 			     [2] = SCA_STRARRAY, /* whence */ },
1092 	  .arg_parm	 = { [2] = &strarray__whences, /* whence */ }, },
1093 	{ .name	    = "lstat",	    .errmsg = true, .alias = "newlstat", },
1094 	{ .name     = "madvise",    .errmsg = true,
1095 	  .arg_scnprintf = { [0] = SCA_HEX,	 /* start */
1096 			     [2] = SCA_MADV_BHV, /* behavior */ }, },
1097 	{ .name	    = "mkdirat",    .errmsg = true,
1098 	  .arg_scnprintf = { [0] = SCA_FDAT, /* fd */ }, },
1099 	{ .name	    = "mknodat",    .errmsg = true,
1100 	  .arg_scnprintf = { [0] = SCA_FDAT, /* fd */ }, },
1101 	{ .name	    = "mlock",	    .errmsg = true,
1102 	  .arg_scnprintf = { [0] = SCA_HEX, /* addr */ }, },
1103 	{ .name	    = "mlockall",   .errmsg = true,
1104 	  .arg_scnprintf = { [0] = SCA_HEX, /* addr */ }, },
1105 	{ .name	    = "mmap",	    .hexret = true,
1106 	  .arg_scnprintf = { [0] = SCA_HEX,	  /* addr */
1107 			     [2] = SCA_MMAP_PROT, /* prot */
1108 			     [3] = SCA_MMAP_FLAGS, /* flags */
1109 			     [4] = SCA_FD, 	  /* fd */ }, },
1110 	{ .name	    = "mprotect",   .errmsg = true,
1111 	  .arg_scnprintf = { [0] = SCA_HEX, /* start */
1112 			     [2] = SCA_MMAP_PROT, /* prot */ }, },
1113 	{ .name	    = "mremap",	    .hexret = true,
1114 	  .arg_scnprintf = { [0] = SCA_HEX, /* addr */
1115 			     [3] = SCA_MREMAP_FLAGS, /* flags */
1116 			     [4] = SCA_HEX, /* new_addr */ }, },
1117 	{ .name	    = "munlock",    .errmsg = true,
1118 	  .arg_scnprintf = { [0] = SCA_HEX, /* addr */ }, },
1119 	{ .name	    = "munmap",	    .errmsg = true,
1120 	  .arg_scnprintf = { [0] = SCA_HEX, /* addr */ }, },
1121 	{ .name	    = "name_to_handle_at", .errmsg = true,
1122 	  .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
1123 	{ .name	    = "newfstatat", .errmsg = true,
1124 	  .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
1125 	{ .name	    = "open",	    .errmsg = true,
1126 	  .arg_scnprintf = { [1] = SCA_OPEN_FLAGS, /* flags */ }, },
1127 	{ .name	    = "open_by_handle_at", .errmsg = true,
1128 	  .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */
1129 			     [2] = SCA_OPEN_FLAGS, /* flags */ }, },
1130 	{ .name	    = "openat",	    .errmsg = true,
1131 	  .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */
1132 			     [2] = SCA_OPEN_FLAGS, /* flags */ }, },
1133 	{ .name	    = "perf_event_open", .errmsg = true,
1134 	  .arg_scnprintf = { [1] = SCA_INT, /* pid */
1135 			     [2] = SCA_INT, /* cpu */
1136 			     [3] = SCA_FD,  /* group_fd */
1137 			     [4] = SCA_PERF_FLAGS,  /* flags */ }, },
1138 	{ .name	    = "pipe2",	    .errmsg = true,
1139 	  .arg_scnprintf = { [1] = SCA_PIPE_FLAGS, /* flags */ }, },
1140 	{ .name	    = "poll",	    .errmsg = true, .timeout = true, },
1141 	{ .name	    = "ppoll",	    .errmsg = true, .timeout = true, },
1142 	{ .name	    = "pread",	    .errmsg = true, .alias = "pread64",
1143 	  .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1144 	{ .name	    = "preadv",	    .errmsg = true, .alias = "pread",
1145 	  .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1146 	{ .name	    = "prlimit64",  .errmsg = true, STRARRAY(1, resource, rlimit_resources), },
1147 	{ .name	    = "pwrite",	    .errmsg = true, .alias = "pwrite64",
1148 	  .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1149 	{ .name	    = "pwritev",    .errmsg = true,
1150 	  .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1151 	{ .name	    = "read",	    .errmsg = true,
1152 	  .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1153 	{ .name	    = "readlinkat", .errmsg = true,
1154 	  .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
1155 	{ .name	    = "readv",	    .errmsg = true,
1156 	  .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1157 	{ .name	    = "recvfrom",   .errmsg = true,
1158 	  .arg_scnprintf = { [3] = SCA_MSG_FLAGS, /* flags */ }, },
1159 	{ .name	    = "recvmmsg",   .errmsg = true,
1160 	  .arg_scnprintf = { [3] = SCA_MSG_FLAGS, /* flags */ }, },
1161 	{ .name	    = "recvmsg",    .errmsg = true,
1162 	  .arg_scnprintf = { [2] = SCA_MSG_FLAGS, /* flags */ }, },
1163 	{ .name	    = "renameat",   .errmsg = true,
1164 	  .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
1165 	{ .name	    = "rt_sigaction", .errmsg = true,
1166 	  .arg_scnprintf = { [0] = SCA_SIGNUM, /* sig */ }, },
1167 	{ .name	    = "rt_sigprocmask",  .errmsg = true, STRARRAY(0, how, sighow), },
1168 	{ .name	    = "rt_sigqueueinfo", .errmsg = true,
1169 	  .arg_scnprintf = { [1] = SCA_SIGNUM, /* sig */ }, },
1170 	{ .name	    = "rt_tgsigqueueinfo", .errmsg = true,
1171 	  .arg_scnprintf = { [2] = SCA_SIGNUM, /* sig */ }, },
1172 	{ .name	    = "select",	    .errmsg = true, .timeout = true, },
1173 	{ .name	    = "sendmmsg",    .errmsg = true,
1174 	  .arg_scnprintf = { [3] = SCA_MSG_FLAGS, /* flags */ }, },
1175 	{ .name	    = "sendmsg",    .errmsg = true,
1176 	  .arg_scnprintf = { [2] = SCA_MSG_FLAGS, /* flags */ }, },
1177 	{ .name	    = "sendto",	    .errmsg = true,
1178 	  .arg_scnprintf = { [3] = SCA_MSG_FLAGS, /* flags */ }, },
1179 	{ .name	    = "setitimer",  .errmsg = true, STRARRAY(0, which, itimers), },
1180 	{ .name	    = "setrlimit",  .errmsg = true, STRARRAY(0, resource, rlimit_resources), },
1181 	{ .name	    = "shutdown",   .errmsg = true,
1182 	  .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1183 	{ .name	    = "socket",	    .errmsg = true,
1184 	  .arg_scnprintf = { [0] = SCA_STRARRAY, /* family */
1185 			     [1] = SCA_SK_TYPE, /* type */ },
1186 	  .arg_parm	 = { [0] = &strarray__socket_families, /* family */ }, },
1187 	{ .name	    = "socketpair", .errmsg = true,
1188 	  .arg_scnprintf = { [0] = SCA_STRARRAY, /* family */
1189 			     [1] = SCA_SK_TYPE, /* type */ },
1190 	  .arg_parm	 = { [0] = &strarray__socket_families, /* family */ }, },
1191 	{ .name	    = "stat",	    .errmsg = true, .alias = "newstat", },
1192 	{ .name	    = "symlinkat",  .errmsg = true,
1193 	  .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
1194 	{ .name	    = "tgkill",	    .errmsg = true,
1195 	  .arg_scnprintf = { [2] = SCA_SIGNUM, /* sig */ }, },
1196 	{ .name	    = "tkill",	    .errmsg = true,
1197 	  .arg_scnprintf = { [1] = SCA_SIGNUM, /* sig */ }, },
1198 	{ .name	    = "uname",	    .errmsg = true, .alias = "newuname", },
1199 	{ .name	    = "unlinkat",   .errmsg = true,
1200 	  .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
1201 	{ .name	    = "utimensat",  .errmsg = true,
1202 	  .arg_scnprintf = { [0] = SCA_FDAT, /* dirfd */ }, },
1203 	{ .name	    = "write",	    .errmsg = true,
1204 	  .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1205 	{ .name	    = "writev",	    .errmsg = true,
1206 	  .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1207 };
1208 
1209 static int syscall_fmt__cmp(const void *name, const void *fmtp)
1210 {
1211 	const struct syscall_fmt *fmt = fmtp;
1212 	return strcmp(name, fmt->name);
1213 }
1214 
1215 static struct syscall_fmt *syscall_fmt__find(const char *name)
1216 {
1217 	const int nmemb = ARRAY_SIZE(syscall_fmts);
1218 	return bsearch(name, syscall_fmts, nmemb, sizeof(struct syscall_fmt), syscall_fmt__cmp);
1219 }
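
/*
 * syscall_fmts[] is searched with bsearch(), so it must be kept sorted by
 * ->name.  Wiring up a beautifier for a new syscall boils down to adding an
 * entry there, e.g. (hypothetical):
 *
 *	{ .name = "foo", .errmsg = true,
 *	  .arg_scnprintf = { [0] = SCA_FD, }, },
 */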
1220 
1221 struct syscall {
1222 	struct event_format *tp_format;
1223 	int		    nr_args;
1224 	struct format_field *args;
1225 	const char	    *name;
1226 	bool		    filtered;
1227 	bool		    is_exit;
1228 	struct syscall_fmt  *fmt;
1229 	size_t		    (**arg_scnprintf)(char *bf, size_t size, struct syscall_arg *arg);
1230 	void		    **arg_parm;
1231 };
1232 
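/*
 * Syscall durations at or above 1 ms are printed in red, at or above 0.01 ms
 * in yellow, everything else in the normal color.
 */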
1233 static size_t fprintf_duration(unsigned long t, FILE *fp)
1234 {
1235 	double duration = (double)t / NSEC_PER_MSEC;
1236 	size_t printed = fprintf(fp, "(");
1237 
1238 	if (duration >= 1.0)
1239 		printed += color_fprintf(fp, PERF_COLOR_RED, "%6.3f ms", duration);
1240 	else if (duration >= 0.01)
1241 		printed += color_fprintf(fp, PERF_COLOR_YELLOW, "%6.3f ms", duration);
1242 	else
1243 		printed += color_fprintf(fp, PERF_COLOR_NORMAL, "%6.3f ms", duration);
1244 	return printed + fprintf(fp, "): ");
1245 }
1246 
1247 struct thread_trace {
1248 	u64		  entry_time;
1249 	u64		  exit_time;
1250 	bool		  entry_pending;
1251 	unsigned long	  nr_events;
1252 	unsigned long	  pfmaj, pfmin;
1253 	char		  *entry_str;
1254 	double		  runtime_ms;
1255 	struct {
1256 		int	  max;
1257 		char	  **table;
1258 	} paths;
1259 
1260 	struct intlist *syscall_stats;
1261 };
1262 
1263 static struct thread_trace *thread_trace__new(void)
1264 {
1265 	struct thread_trace *ttrace =  zalloc(sizeof(struct thread_trace));
1266 
1267 	if (ttrace) {
1268 		ttrace->paths.max = -1;
1269 		ttrace->syscall_stats = intlist__new(NULL);
1270 	}
1271 
1272 	return ttrace;
1273 }
1274 
1275 static struct thread_trace *thread__trace(struct thread *thread, FILE *fp)
1276 {
1277 	struct thread_trace *ttrace;
1278 
1279 	if (thread == NULL)
1280 		goto fail;
1281 
1282 	if (thread__priv(thread) == NULL)
1283 		thread__set_priv(thread, thread_trace__new());
1284 
1285 	if (thread__priv(thread) == NULL)
1286 		goto fail;
1287 
1288 	ttrace = thread__priv(thread);
1289 	++ttrace->nr_events;
1290 
1291 	return ttrace;
1292 fail:
1293 	color_fprintf(fp, PERF_COLOR_RED,
1294 		      "WARNING: not enough memory, dropping samples!\n");
1295 	return NULL;
1296 }
1297 
1298 #define TRACE_PFMAJ		(1 << 0)
1299 #define TRACE_PFMIN		(1 << 1)
1300 
1301 struct trace {
1302 	struct perf_tool	tool;
1303 	struct {
1304 		int		machine;
1305 		int		open_id;
1306 	}			audit;
1307 	struct {
1308 		int		max;
1309 		struct syscall  *table;
1310 	} syscalls;
1311 	struct record_opts	opts;
1312 	struct perf_evlist	*evlist;
1313 	struct machine		*host;
1314 	struct thread		*current;
1315 	u64			base_time;
1316 	FILE			*output;
1317 	unsigned long		nr_events;
1318 	struct strlist		*ev_qualifier;
1319 	const char 		*last_vfs_getname;
1320 	struct intlist		*tid_list;
1321 	struct intlist		*pid_list;
1322 	struct {
1323 		size_t		nr;
1324 		pid_t		*entries;
1325 	}			filter_pids;
1326 	double			duration_filter;
1327 	double			runtime_ms;
1328 	struct {
1329 		u64		vfs_getname,
1330 				proc_getname;
1331 	} stats;
1332 	bool			not_ev_qualifier;
1333 	bool			live;
1334 	bool			full_time;
1335 	bool			sched;
1336 	bool			multiple_threads;
1337 	bool			summary;
1338 	bool			summary_only;
1339 	bool			show_comm;
1340 	bool			show_tool_stats;
1341 	bool			trace_syscalls;
1342 	bool			force;
1343 	int			trace_pgfaults;
1344 };
1345 
1346 static int trace__set_fd_pathname(struct thread *thread, int fd, const char *pathname)
1347 {
1348 	struct thread_trace *ttrace = thread__priv(thread);
1349 
1350 	if (fd > ttrace->paths.max) {
1351 		char **npath = realloc(ttrace->paths.table, (fd + 1) * sizeof(char *));
1352 
1353 		if (npath == NULL)
1354 			return -1;
1355 
1356 		if (ttrace->paths.max != -1) {
1357 			memset(npath + ttrace->paths.max + 1, 0,
1358 			       (fd - ttrace->paths.max) * sizeof(char *));
1359 		} else {
1360 			memset(npath, 0, (fd + 1) * sizeof(char *));
1361 		}
1362 
1363 		ttrace->paths.table = npath;
1364 		ttrace->paths.max   = fd;
1365 	}
1366 
1367 	ttrace->paths.table[fd] = strdup(pathname);
1368 
1369 	return ttrace->paths.table[fd] != NULL ? 0 : -1;
1370 }
1371 
1372 static int thread__read_fd_path(struct thread *thread, int fd)
1373 {
1374 	char linkname[PATH_MAX], pathname[PATH_MAX];
1375 	struct stat st;
1376 	int ret;
1377 
1378 	if (thread->pid_ == thread->tid) {
1379 		scnprintf(linkname, sizeof(linkname),
1380 			  "/proc/%d/fd/%d", thread->pid_, fd);
1381 	} else {
1382 		scnprintf(linkname, sizeof(linkname),
1383 			  "/proc/%d/task/%d/fd/%d", thread->pid_, thread->tid, fd);
1384 	}
1385 
1386 	if (lstat(linkname, &st) < 0 || st.st_size + 1 > (off_t)sizeof(pathname))
1387 		return -1;
1388 
1389 	ret = readlink(linkname, pathname, sizeof(pathname));
1390 
1391 	if (ret < 0 || ret > st.st_size)
1392 		return -1;
1393 
1394 	pathname[ret] = '\0';
1395 	return trace__set_fd_pathname(thread, fd, pathname);
1396 }
1397 
1398 static const char *thread__fd_path(struct thread *thread, int fd,
1399 				   struct trace *trace)
1400 {
1401 	struct thread_trace *ttrace = thread__priv(thread);
1402 
1403 	if (ttrace == NULL)
1404 		return NULL;
1405 
1406 	if (fd < 0)
1407 		return NULL;
1408 
1409 	if ((fd > ttrace->paths.max || ttrace->paths.table[fd] == NULL)) {
1410 		if (!trace->live)
1411 			return NULL;
1412 		++trace->stats.proc_getname;
1413 		if (thread__read_fd_path(thread, fd))
1414 			return NULL;
1415 	}
1416 
1417 	return ttrace->paths.table[fd];
1418 }
1419 
1420 static size_t syscall_arg__scnprintf_fd(char *bf, size_t size,
1421 					struct syscall_arg *arg)
1422 {
1423 	int fd = arg->val;
1424 	size_t printed = scnprintf(bf, size, "%d", fd);
1425 	const char *path = thread__fd_path(arg->thread, fd, arg->trace);
1426 
1427 	if (path)
1428 		printed += scnprintf(bf + printed, size - printed, "<%s>", path);
1429 
1430 	return printed;
1431 }
1432 
1433 static size_t syscall_arg__scnprintf_close_fd(char *bf, size_t size,
1434 					      struct syscall_arg *arg)
1435 {
1436 	int fd = arg->val;
1437 	size_t printed = syscall_arg__scnprintf_fd(bf, size, arg);
1438 	struct thread_trace *ttrace = thread__priv(arg->thread);
1439 
1440 	if (ttrace && fd >= 0 && fd <= ttrace->paths.max)
1441 		zfree(&ttrace->paths.table[fd]);
1442 
1443 	return printed;
1444 }
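
/*
 * The fd beautifiers above print the descriptor followed by the path it
 * resolves to when one is known, e.g. "3</etc/passwd>" (illustrative).
 * Paths are read lazily from /proc/<pid>/fd/<fd> by thread__read_fd_path()
 * and cached in ttrace->paths.table; the close() variant drops the cached
 * entry for the fd being closed.
 */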
1445 
1446 static bool trace__filter_duration(struct trace *trace, double t)
1447 {
1448 	return t < (trace->duration_filter * NSEC_PER_MSEC);
1449 }
1450 
1451 static size_t trace__fprintf_tstamp(struct trace *trace, u64 tstamp, FILE *fp)
1452 {
1453 	double ts = (double)(tstamp - trace->base_time) / NSEC_PER_MSEC;
1454 
1455 	return fprintf(fp, "%10.3f ", ts);
1456 }
1457 
1458 static bool done = false;
1459 static bool interrupted = false;
1460 
1461 static void sig_handler(int sig)
1462 {
1463 	done = true;
1464 	interrupted = sig == SIGINT;
1465 }
1466 
1467 static size_t trace__fprintf_entry_head(struct trace *trace, struct thread *thread,
1468 					u64 duration, u64 tstamp, FILE *fp)
1469 {
1470 	size_t printed = trace__fprintf_tstamp(trace, tstamp, fp);
1471 	printed += fprintf_duration(duration, fp);
1472 
1473 	if (trace->multiple_threads) {
1474 		if (trace->show_comm)
1475 			printed += fprintf(fp, "%.14s/", thread__comm_str(thread));
1476 		printed += fprintf(fp, "%d ", thread->tid);
1477 	}
1478 
1479 	return printed;
1480 }
1481 
1482 static int trace__process_event(struct trace *trace, struct machine *machine,
1483 				union perf_event *event, struct perf_sample *sample)
1484 {
1485 	int ret = 0;
1486 
1487 	switch (event->header.type) {
1488 	case PERF_RECORD_LOST:
1489 		color_fprintf(trace->output, PERF_COLOR_RED,
1490 			      "LOST %" PRIu64 " events!\n", event->lost.lost);
1491 		ret = machine__process_lost_event(machine, event, sample);
		break;
1492 	default:
1493 		ret = machine__process_event(machine, event, sample);
1494 		break;
1495 	}
1496 
1497 	return ret;
1498 }
1499 
1500 static int trace__tool_process(struct perf_tool *tool,
1501 			       union perf_event *event,
1502 			       struct perf_sample *sample,
1503 			       struct machine *machine)
1504 {
1505 	struct trace *trace = container_of(tool, struct trace, tool);
1506 	return trace__process_event(trace, machine, event, sample);
1507 }
1508 
1509 static int trace__symbols_init(struct trace *trace, struct perf_evlist *evlist)
1510 {
1511 	int err = symbol__init(NULL);
1512 
1513 	if (err)
1514 		return err;
1515 
1516 	trace->host = machine__new_host();
1517 	if (trace->host == NULL)
1518 		return -ENOMEM;
1519 
1520 	err = __machine__synthesize_threads(trace->host, &trace->tool, &trace->opts.target,
1521 					    evlist->threads, trace__tool_process, false,
1522 					    trace->opts.proc_map_timeout);
1523 	if (err)
1524 		symbol__exit();
1525 
1526 	return err;
1527 }
1528 
1529 static int syscall__set_arg_fmts(struct syscall *sc)
1530 {
1531 	struct format_field *field;
1532 	int idx = 0;
1533 
1534 	sc->arg_scnprintf = calloc(sc->nr_args, sizeof(void *));
1535 	if (sc->arg_scnprintf == NULL)
1536 		return -1;
1537 
1538 	if (sc->fmt)
1539 		sc->arg_parm = sc->fmt->arg_parm;
1540 
1541 	for (field = sc->args; field; field = field->next) {
1542 		if (sc->fmt && sc->fmt->arg_scnprintf[idx])
1543 			sc->arg_scnprintf[idx] = sc->fmt->arg_scnprintf[idx];
1544 		else if (field->flags & FIELD_IS_POINTER)
1545 			sc->arg_scnprintf[idx] = syscall_arg__scnprintf_hex;
1546 		++idx;
1547 	}
1548 
1549 	return 0;
1550 }
1551 
1552 static int trace__read_syscall_info(struct trace *trace, int id)
1553 {
1554 	char tp_name[128];
1555 	struct syscall *sc;
1556 	const char *name = audit_syscall_to_name(id, trace->audit.machine);
1557 
1558 	if (name == NULL)
1559 		return -1;
1560 
1561 	if (id > trace->syscalls.max) {
1562 		struct syscall *nsyscalls = realloc(trace->syscalls.table, (id + 1) * sizeof(*sc));
1563 
1564 		if (nsyscalls == NULL)
1565 			return -1;
1566 
1567 		if (trace->syscalls.max != -1) {
1568 			memset(nsyscalls + trace->syscalls.max + 1, 0,
1569 			       (id - trace->syscalls.max) * sizeof(*sc));
1570 		} else {
1571 			memset(nsyscalls, 0, (id + 1) * sizeof(*sc));
1572 		}
1573 
1574 		trace->syscalls.table = nsyscalls;
1575 		trace->syscalls.max   = id;
1576 	}
1577 
1578 	sc = trace->syscalls.table + id;
1579 	sc->name = name;
1580 
1581 	if (trace->ev_qualifier) {
1582 		bool in = strlist__find(trace->ev_qualifier, name) != NULL;
1583 
1584 		if (!(in ^ trace->not_ev_qualifier)) {
1585 			sc->filtered = true;
1586 			/*
1587 			 * No need to read tracepoint information since this will be
1588 			 * filtered out.
1589 			 */
1590 			return 0;
1591 		}
1592 	}
1593 
1594 	sc->fmt  = syscall_fmt__find(sc->name);
1595 
1596 	snprintf(tp_name, sizeof(tp_name), "sys_enter_%s", sc->name);
1597 	sc->tp_format = trace_event__tp_format("syscalls", tp_name);
1598 
1599 	if (sc->tp_format == NULL && sc->fmt && sc->fmt->alias) {
1600 		snprintf(tp_name, sizeof(tp_name), "sys_enter_%s", sc->fmt->alias);
1601 		sc->tp_format = trace_event__tp_format("syscalls", tp_name);
1602 	}
1603 
1604 	if (sc->tp_format == NULL)
1605 		return -1;
1606 
1607 	sc->args = sc->tp_format->format.fields;
1608 	sc->nr_args = sc->tp_format->format.nr_fields;
1609 	/* drop nr field - not relevant here; does not exist on older kernels */
1610 	if (sc->args && strcmp(sc->args->name, "nr") == 0) {
1611 		sc->args = sc->args->next;
1612 		--sc->nr_args;
1613 	}
1614 
1615 	sc->is_exit = !strcmp(name, "exit_group") || !strcmp(name, "exit");
1616 
1617 	return syscall__set_arg_fmts(sc);
1618 }
1619 
1620 /*
1621  * args is to be interpreted as a series of longs, but we need to handle
1622  * 8-byte unaligned accesses: args points to raw_data within the event,
1623  * and raw_data is guaranteed to be 8-byte unaligned because it is
1624  * preceded by raw_size, which is a u32.  So we need to copy args to a temp
1625  * variable to read it.  Most notably this avoids extended load instructions
1626  * on unaligned addresses.
1627  */
1628 
1629 static size_t syscall__scnprintf_args(struct syscall *sc, char *bf, size_t size,
1630 				      unsigned char *args, struct trace *trace,
1631 				      struct thread *thread)
1632 {
1633 	size_t printed = 0;
1634 	unsigned char *p;
1635 	unsigned long val;
1636 
1637 	if (sc->args != NULL) {
1638 		struct format_field *field;
1639 		u8 bit = 1;
1640 		struct syscall_arg arg = {
1641 			.idx	= 0,
1642 			.mask	= 0,
1643 			.trace  = trace,
1644 			.thread = thread,
1645 		};
1646 
1647 		for (field = sc->args; field;
1648 		     field = field->next, ++arg.idx, bit <<= 1) {
1649 			if (arg.mask & bit)
1650 				continue;
1651 
1652 			/* special care for unaligned accesses */
1653 			p = args + sizeof(unsigned long) * arg.idx;
1654 			memcpy(&val, p, sizeof(val));
1655 
1656 			/*
1657 			 * Suppress this argument if its value is zero and
1658 			 * we don't have a string associated with it in a
1659 			 * strarray.
1660 			 */
1661 			if (val == 0 &&
1662 			    !(sc->arg_scnprintf &&
1663 			      sc->arg_scnprintf[arg.idx] == SCA_STRARRAY &&
1664 			      sc->arg_parm[arg.idx]))
1665 				continue;
1666 
1667 			printed += scnprintf(bf + printed, size - printed,
1668 					     "%s%s: ", printed ? ", " : "", field->name);
1669 			if (sc->arg_scnprintf && sc->arg_scnprintf[arg.idx]) {
1670 				arg.val = val;
1671 				if (sc->arg_parm)
1672 					arg.parm = sc->arg_parm[arg.idx];
1673 				printed += sc->arg_scnprintf[arg.idx](bf + printed,
1674 								      size - printed, &arg);
1675 			} else {
1676 				printed += scnprintf(bf + printed, size - printed,
1677 						     "%ld", val);
1678 			}
1679 		}
1680 	} else {
1681 		int i = 0;
1682 
1683 		while (i < 6) {
1684 			/* special care for unaligned accesses */
1685 			p = args + sizeof(unsigned long) * i;
1686 			memcpy(&val, p, sizeof(val));
1687 			printed += scnprintf(bf + printed, size - printed,
1688 					     "%sarg%d: %ld",
1689 					     printed ? ", " : "", i, val);
1690 			++i;
1691 		}
1692 	}
1693 
1694 	return printed;
1695 }
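
/*
 * The buffer ends up with a comma separated "name: value" list, using the
 * registered SCA_* beautifier when there is one, e.g. for an open() call
 * something like (illustrative):
 *
 *	filename: 0x7ffd0815, flags: CREAT|WRONLY, mode: 438
 */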
1696 
1697 typedef int (*tracepoint_handler)(struct trace *trace, struct perf_evsel *evsel,
1698 				  union perf_event *event,
1699 				  struct perf_sample *sample);
1700 
1701 static struct syscall *trace__syscall_info(struct trace *trace,
1702 					   struct perf_evsel *evsel, int id)
1703 {
1704 
1705 	if (id < 0) {
1706 
1707 		/*
1708 		 * XXX: Noticed on x86_64, reproduced as far back as 3.0.36, haven't tried
1709 		 * before that, leaving at a higher verbosity level till that is
1710 		 * explained. Reproduced with plain ftrace with:
1711 		 *
1712 		 * echo 1 > /t/events/raw_syscalls/sys_exit/enable
1713 		 * grep "NR -1 " /t/trace_pipe
1714 		 *
1715 		 * After generating some load on the machine.
1716  		 */
1717 		if (verbose > 1) {
1718 			static u64 n;
1719 			fprintf(trace->output, "Invalid syscall %d id, skipping (%s, %" PRIu64 ") ...\n",
1720 				id, perf_evsel__name(evsel), ++n);
1721 		}
1722 		return NULL;
1723 	}
1724 
1725 	if ((id > trace->syscalls.max || trace->syscalls.table[id].name == NULL) &&
1726 	    trace__read_syscall_info(trace, id))
1727 		goto out_cant_read;
1728 
1729 	if ((id > trace->syscalls.max || trace->syscalls.table[id].name == NULL))
1730 		goto out_cant_read;
1731 
1732 	return &trace->syscalls.table[id];
1733 
1734 out_cant_read:
1735 	if (verbose) {
1736 		fprintf(trace->output, "Problems reading syscall %d", id);
1737 		if (id <= trace->syscalls.max && trace->syscalls.table[id].name != NULL)
1738 			fprintf(trace->output, "(%s)", trace->syscalls.table[id].name);
1739 		fputs(" information\n", trace->output);
1740 	}
1741 	return NULL;
1742 }
1743 
1744 static void thread__update_stats(struct thread_trace *ttrace,
1745 				 int id, struct perf_sample *sample)
1746 {
1747 	struct int_node *inode;
1748 	struct stats *stats;
1749 	u64 duration = 0;
1750 
1751 	inode = intlist__findnew(ttrace->syscall_stats, id);
1752 	if (inode == NULL)
1753 		return;
1754 
1755 	stats = inode->priv;
1756 	if (stats == NULL) {
1757 		stats = malloc(sizeof(struct stats));
1758 		if (stats == NULL)
1759 			return;
1760 		init_stats(stats);
1761 		inode->priv = stats;
1762 	}
1763 
1764 	if (ttrace->entry_time && sample->time > ttrace->entry_time)
1765 		duration = sample->time - ttrace->entry_time;
1766 
1767 	update_stats(stats, duration);
1768 }
1769 
1770 static int trace__printf_interrupted_entry(struct trace *trace, struct perf_sample *sample)
1771 {
1772 	struct thread_trace *ttrace;
1773 	u64 duration;
1774 	size_t printed;
1775 
1776 	if (trace->current == NULL)
1777 		return 0;
1778 
1779 	ttrace = thread__priv(trace->current);
1780 
1781 	if (!ttrace->entry_pending)
1782 		return 0;
1783 
1784 	duration = sample->time - ttrace->entry_time;
1785 
1786 	printed  = trace__fprintf_entry_head(trace, trace->current, duration, sample->time, trace->output);
1787 	printed += fprintf(trace->output, "%-70s) ...\n", ttrace->entry_str);
1788 	ttrace->entry_pending = false;
1789 
1790 	return printed;
1791 }
1792 
1793 static int trace__sys_enter(struct trace *trace, struct perf_evsel *evsel,
1794 			    union perf_event *event __maybe_unused,
1795 			    struct perf_sample *sample)
1796 {
1797 	char *msg;
1798 	void *args;
1799 	size_t printed = 0;
1800 	struct thread *thread;
1801 	int id = perf_evsel__sc_tp_uint(evsel, id, sample), err = -1;
1802 	struct syscall *sc = trace__syscall_info(trace, evsel, id);
1803 	struct thread_trace *ttrace;
1804 
1805 	if (sc == NULL)
1806 		return -1;
1807 
1808 	if (sc->filtered)
1809 		return 0;
1810 
1811 	thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
1812 	ttrace = thread__trace(thread, trace->output);
1813 	if (ttrace == NULL)
1814 		goto out_put;
1815 
1816 	args = perf_evsel__sc_tp_ptr(evsel, args, sample);
1817 
1818 	if (ttrace->entry_str == NULL) {
1819 		ttrace->entry_str = malloc(1024);
1820 		if (!ttrace->entry_str)
1821 			goto out_put;
1822 	}
1823 
1824 	if (!trace->summary_only)
1825 		trace__printf_interrupted_entry(trace, sample);
1826 
1827 	ttrace->entry_time = sample->time;
1828 	msg = ttrace->entry_str;
1829 	printed += scnprintf(msg + printed, 1024 - printed, "%s(", sc->name);
1830 
1831 	printed += syscall__scnprintf_args(sc, msg + printed, 1024 - printed,
1832 					   args, trace, thread);
1833 
1834 	if (sc->is_exit) {
1835 		if (!trace->duration_filter && !trace->summary_only) {
1836 			trace__fprintf_entry_head(trace, thread, 1, sample->time, trace->output);
1837 			fprintf(trace->output, "%-70s\n", ttrace->entry_str);
1838 		}
1839 	} else
1840 		ttrace->entry_pending = true;
1841 
1842 	if (trace->current != thread) {
1843 		thread__put(trace->current);
1844 		trace->current = thread__get(thread);
1845 	}
1846 	err = 0;
1847 out_put:
1848 	thread__put(thread);
1849 	return err;
1850 }
1851 
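/*
 * raw_syscalls:sys_exit handler: compute the syscall duration, apply the
 * --duration filter, then print either the pending "name(args) = ret" line
 * or a "continued" completion if the entry was already flushed.  Return
 * values are decorated using the per-syscall fmt (errno names, "Timeout",
 * hex) when one is available.
 */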
1852 static int trace__sys_exit(struct trace *trace, struct perf_evsel *evsel,
1853 			   union perf_event *event __maybe_unused,
1854 			   struct perf_sample *sample)
1855 {
1856 	long ret;
1857 	u64 duration = 0;
1858 	struct thread *thread;
1859 	int id = perf_evsel__sc_tp_uint(evsel, id, sample), err = -1;
1860 	struct syscall *sc = trace__syscall_info(trace, evsel, id);
1861 	struct thread_trace *ttrace;
1862 
1863 	if (sc == NULL)
1864 		return -1;
1865 
1866 	if (sc->filtered)
1867 		return 0;
1868 
1869 	thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
1870 	ttrace = thread__trace(thread, trace->output);
1871 	if (ttrace == NULL)
1872 		goto out_put;
1873 
1874 	if (trace->summary)
1875 		thread__update_stats(ttrace, id, sample);
1876 
1877 	ret = perf_evsel__sc_tp_uint(evsel, ret, sample);
1878 
1879 	if (id == trace->audit.open_id && ret >= 0 && trace->last_vfs_getname) {
1880 		trace__set_fd_pathname(thread, ret, trace->last_vfs_getname);
1881 		trace->last_vfs_getname = NULL;
1882 		++trace->stats.vfs_getname;
1883 	}
1884 
1885 	ttrace->exit_time = sample->time;
1886 
1887 	if (ttrace->entry_time) {
1888 		duration = sample->time - ttrace->entry_time;
1889 		if (trace__filter_duration(trace, duration))
1890 			goto out;
1891 	} else if (trace->duration_filter)
1892 		goto out;
1893 
1894 	if (trace->summary_only)
1895 		goto out;
1896 
1897 	trace__fprintf_entry_head(trace, thread, duration, sample->time, trace->output);
1898 
1899 	if (ttrace->entry_pending) {
1900 		fprintf(trace->output, "%-70s", ttrace->entry_str);
1901 	} else {
1902 		fprintf(trace->output, " ... [");
1903 		color_fprintf(trace->output, PERF_COLOR_YELLOW, "continued");
1904 		fprintf(trace->output, "]: %s()", sc->name);
1905 	}
1906 
1907 	if (sc->fmt == NULL) {
1908 signed_print:
1909 		fprintf(trace->output, ") = %ld", ret);
1910 	} else if (ret < 0 && sc->fmt->errmsg) {
1911 		char bf[STRERR_BUFSIZE];
1912 		const char *emsg = strerror_r(-ret, bf, sizeof(bf)),
1913 			   *e = audit_errno_to_name(-ret);
1914 
1915 		fprintf(trace->output, ") = -1 %s %s", e, emsg);
1916 	} else if (ret == 0 && sc->fmt->timeout)
1917 		fprintf(trace->output, ") = 0 Timeout");
1918 	else if (sc->fmt->hexret)
1919 		fprintf(trace->output, ") = %#lx", ret);
1920 	else
1921 		goto signed_print;
1922 
1923 	fputc('\n', trace->output);
1924 out:
1925 	ttrace->entry_pending = false;
1926 	err = 0;
1927 out_put:
1928 	thread__put(thread);
1929 	return err;
1930 }
1931 
1932 static int trace__vfs_getname(struct trace *trace, struct perf_evsel *evsel,
1933 			      union perf_event *event __maybe_unused,
1934 			      struct perf_sample *sample)
1935 {
1936 	trace->last_vfs_getname = perf_evsel__rawptr(evsel, sample, "pathname");
1937 	return 0;
1938 }
1939 
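/*
 * sched:sched_stat_runtime handler (--sched): accumulate the reported
 * runtime, in milliseconds, both per thread and globally for the summary.
 */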
1940 static int trace__sched_stat_runtime(struct trace *trace, struct perf_evsel *evsel,
1941 				     union perf_event *event __maybe_unused,
1942 				     struct perf_sample *sample)
1943 {
1944 	u64 runtime = perf_evsel__intval(evsel, sample, "runtime");
1945 	double runtime_ms = (double)runtime / NSEC_PER_MSEC;
1946 	struct thread *thread = machine__findnew_thread(trace->host,
1947 							sample->pid,
1948 							sample->tid);
1949 	struct thread_trace *ttrace = thread__trace(thread, trace->output);
1950 
1951 	if (ttrace == NULL)
1952 		goto out_dump;
1953 
1954 	ttrace->runtime_ms += runtime_ms;
1955 	trace->runtime_ms += runtime_ms;
1956 	thread__put(thread);
1957 	return 0;
1958 
1959 out_dump:
1960 	fprintf(trace->output, "%s: comm=%s,pid=%u,runtime=%" PRIu64 ",vruntime=%" PRIu64 "\n",
1961 	       evsel->name,
1962 	       perf_evsel__strval(evsel, sample, "comm"),
1963 	       (pid_t)perf_evsel__intval(evsel, sample, "pid"),
1964 	       runtime,
1965 	       perf_evsel__intval(evsel, sample, "vruntime"));
1966 	thread__put(thread);
1967 	return 0;
1968 }
1969 
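/*
 * Handler for events added with --event: print a timestamp plus the
 * libtraceevent-formatted payload, interleaved with the syscall lines.
 */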
1970 static int trace__event_handler(struct trace *trace, struct perf_evsel *evsel,
1971 				union perf_event *event __maybe_unused,
1972 				struct perf_sample *sample)
1973 {
1974 	trace__printf_interrupted_entry(trace, sample);
1975 	trace__fprintf_tstamp(trace, sample->time, trace->output);
1976 
1977 	if (trace->trace_syscalls)
1978 		fprintf(trace->output, "(         ): ");
1979 
1980 	fprintf(trace->output, "%s:", evsel->name);
1981 
1982 	if (evsel->tp_format) {
1983 		event_format__fprintf(evsel->tp_format, sample->cpu,
1984 				      sample->raw_data, sample->raw_size,
1985 				      trace->output);
1986 	}
1987 
1988 	fprintf(trace->output, ")\n");
1989 	return 0;
1990 }
1991 
1992 static void print_location(FILE *f, struct perf_sample *sample,
1993 			   struct addr_location *al,
1994 			   bool print_dso, bool print_sym)
1995 {
1997 	if ((verbose || print_dso) && al->map)
1998 		fprintf(f, "%s@", al->map->dso->long_name);
1999 
2000 	if ((verbose || print_sym) && al->sym)
2001 		fprintf(f, "%s+0x%" PRIx64, al->sym->name,
2002 			al->addr - al->sym->start);
2003 	else if (al->map)
2004 		fprintf(f, "0x%" PRIx64, al->addr);
2005 	else
2006 		fprintf(f, "0x%" PRIx64, sample->addr);
2007 }
2008 
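/*
 * Software page-fault event handler (--pf): count major/minor faults per
 * thread and, unless only the summary was requested (-s), resolve the
 * faulting IP and the accessed address to map/symbol locations for output.
 */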
2009 static int trace__pgfault(struct trace *trace,
2010 			  struct perf_evsel *evsel,
2011 			  union perf_event *event,
2012 			  struct perf_sample *sample)
2013 {
2014 	struct thread *thread;
2015 	u8 cpumode = event->header.misc & PERF_RECORD_MISC_CPUMODE_MASK;
2016 	struct addr_location al;
2017 	char map_type = 'd';
2018 	struct thread_trace *ttrace;
2019 	int err = -1;
2020 
2021 	thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
2022 	ttrace = thread__trace(thread, trace->output);
2023 	if (ttrace == NULL)
2024 		goto out_put;
2025 
2026 	if (evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ)
2027 		ttrace->pfmaj++;
2028 	else
2029 		ttrace->pfmin++;
2030 
2031 	if (trace->summary_only)
2032 		goto out;
2033 
2034 	thread__find_addr_location(thread, cpumode, MAP__FUNCTION,
2035 			      sample->ip, &al);
2036 
2037 	trace__fprintf_entry_head(trace, thread, 0, sample->time, trace->output);
2038 
2039 	fprintf(trace->output, "%sfault [",
2040 		evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ ?
2041 		"maj" : "min");
2042 
2043 	print_location(trace->output, sample, &al, false, true);
2044 
2045 	fprintf(trace->output, "] => ");
2046 
2047 	thread__find_addr_location(thread, cpumode, MAP__VARIABLE,
2048 				   sample->addr, &al);
2049 
2050 	if (!al.map) {
2051 		thread__find_addr_location(thread, cpumode,
2052 					   MAP__FUNCTION, sample->addr, &al);
2053 
2054 		if (al.map)
2055 			map_type = 'x';
2056 		else
2057 			map_type = '?';
2058 	}
2059 
2060 	print_location(trace->output, sample, &al, true, false);
2061 
2062 	fprintf(trace->output, " (%c%c)\n", map_type, al.level);
2063 out:
2064 	err = 0;
2065 out_put:
2066 	thread__put(thread);
2067 	return err;
2068 }
2069 
2070 static bool skip_sample(struct trace *trace, struct perf_sample *sample)
2071 {
2072 	if ((trace->pid_list && intlist__find(trace->pid_list, sample->pid)) ||
2073 	    (trace->tid_list && intlist__find(trace->tid_list, sample->tid)))
2074 		return false;
2075 
2076 	if (trace->pid_list || trace->tid_list)
2077 		return true;
2078 
2079 	return false;
2080 }
2081 
2082 static int trace__process_sample(struct perf_tool *tool,
2083 				 union perf_event *event,
2084 				 struct perf_sample *sample,
2085 				 struct perf_evsel *evsel,
2086 				 struct machine *machine __maybe_unused)
2087 {
2088 	struct trace *trace = container_of(tool, struct trace, tool);
2089 	int err = 0;
2090 
2091 	tracepoint_handler handler = evsel->handler;
2092 
2093 	if (skip_sample(trace, sample))
2094 		return 0;
2095 
2096 	if (!trace->full_time && trace->base_time == 0)
2097 		trace->base_time = sample->time;
2098 
2099 	if (handler) {
2100 		++trace->nr_events;
2101 		handler(trace, evsel, event, sample);
2102 	}
2103 
2104 	return err;
2105 }
2106 
2107 static int parse_target_str(struct trace *trace)
2108 {
2109 	if (trace->opts.target.pid) {
2110 		trace->pid_list = intlist__new(trace->opts.target.pid);
2111 		if (trace->pid_list == NULL) {
2112 			pr_err("Error parsing process id string\n");
2113 			return -EINVAL;
2114 		}
2115 	}
2116 
2117 	if (trace->opts.target.tid) {
2118 		trace->tid_list = intlist__new(trace->opts.target.tid);
2119 		if (trace->tid_list == NULL) {
2120 			pr_err("Error parsing thread id string\n");
2121 			return -EINVAL;
2122 		}
2123 	}
2124 
2125 	return 0;
2126 }
2127 
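/*
 * 'perf trace record' front end: synthesize a 'perf record' command line
 * with the raw_syscalls (or, on older kernels, syscalls) tracepoints and
 * the requested page-fault events, then hand control to cmd_record().
 */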
2128 static int trace__record(struct trace *trace, int argc, const char **argv)
2129 {
2130 	unsigned int rec_argc, i, j;
2131 	const char **rec_argv;
2132 	const char * const record_args[] = {
2133 		"record",
2134 		"-R",
2135 		"-m", "1024",
2136 		"-c", "1",
2137 	};
2138 
2139 	const char * const sc_args[] = { "-e", };
2140 	unsigned int sc_args_nr = ARRAY_SIZE(sc_args);
2141 	const char * const majpf_args[] = { "-e", "major-faults" };
2142 	unsigned int majpf_args_nr = ARRAY_SIZE(majpf_args);
2143 	const char * const minpf_args[] = { "-e", "minor-faults" };
2144 	unsigned int minpf_args_nr = ARRAY_SIZE(minpf_args);
2145 
2146 	/* +1 is for the event string below */
2147 	rec_argc = ARRAY_SIZE(record_args) + sc_args_nr + 1 +
2148 		majpf_args_nr + minpf_args_nr + argc;
2149 	rec_argv = calloc(rec_argc + 1, sizeof(char *));
2150 
2151 	if (rec_argv == NULL)
2152 		return -ENOMEM;
2153 
2154 	j = 0;
2155 	for (i = 0; i < ARRAY_SIZE(record_args); i++)
2156 		rec_argv[j++] = record_args[i];
2157 
2158 	if (trace->trace_syscalls) {
2159 		for (i = 0; i < sc_args_nr; i++)
2160 			rec_argv[j++] = sc_args[i];
2161 
2162 		/* event string may be different for older kernels - e.g., RHEL6 */
2163 		if (is_valid_tracepoint("raw_syscalls:sys_enter"))
2164 			rec_argv[j++] = "raw_syscalls:sys_enter,raw_syscalls:sys_exit";
2165 		else if (is_valid_tracepoint("syscalls:sys_enter"))
2166 			rec_argv[j++] = "syscalls:sys_enter,syscalls:sys_exit";
2167 		else {
2168 			pr_err("Neither raw_syscalls nor syscalls events exist.\n");
			free(rec_argv);
2169 			return -1;
2170 		}
2171 	}
2172 
2173 	if (trace->trace_pgfaults & TRACE_PFMAJ)
2174 		for (i = 0; i < majpf_args_nr; i++)
2175 			rec_argv[j++] = majpf_args[i];
2176 
2177 	if (trace->trace_pgfaults & TRACE_PFMIN)
2178 		for (i = 0; i < minpf_args_nr; i++)
2179 			rec_argv[j++] = minpf_args[i];
2180 
2181 	for (i = 0; i < (unsigned int)argc; i++)
2182 		rec_argv[j++] = argv[i];
2183 
2184 	return cmd_record(j, rec_argv, NULL);
2185 }
2186 
2187 static size_t trace__fprintf_thread_summary(struct trace *trace, FILE *fp);
2188 
2189 static void perf_evlist__add_vfs_getname(struct perf_evlist *evlist)
2190 {
2191 	struct perf_evsel *evsel = perf_evsel__newtp("probe", "vfs_getname");
2192 	if (evsel == NULL)
2193 		return;
2194 
2195 	if (perf_evsel__field(evsel, "pathname") == NULL) {
2196 		perf_evsel__delete(evsel);
2197 		return;
2198 	}
2199 
2200 	evsel->handler = trace__vfs_getname;
2201 	perf_evlist__add(evlist, evsel);
2202 }
2203 
2204 static int perf_evlist__add_pgfault(struct perf_evlist *evlist,
2205 				    u64 config)
2206 {
2207 	struct perf_evsel *evsel;
2208 	struct perf_event_attr attr = {
2209 		.type = PERF_TYPE_SOFTWARE,
2210 		.mmap_data = 1,
2211 	};
2212 
2213 	attr.config = config;
2214 	attr.sample_period = 1;
2215 
2216 	event_attr_init(&attr);
2217 
2218 	evsel = perf_evsel__new(&attr);
2219 	if (!evsel)
2220 		return -ENOMEM;
2221 
2222 	evsel->handler = trace__pgfault;
2223 	perf_evlist__add(evlist, evsel);
2224 
2225 	return 0;
2226 }
2227 
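/*
 * Dispatch one mmap'ed event: non-sample records go to the generic
 * machine/thread bookkeeping, samples are routed to the handler installed
 * on the evsel that produced them.
 */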
2228 static void trace__handle_event(struct trace *trace, union perf_event *event, struct perf_sample *sample)
2229 {
2230 	const u32 type = event->header.type;
2231 	struct perf_evsel *evsel;
2232 
2233 	if (!trace->full_time && trace->base_time == 0)
2234 		trace->base_time = sample->time;
2235 
2236 	if (type != PERF_RECORD_SAMPLE) {
2237 		trace__process_event(trace, trace->host, event, sample);
2238 		return;
2239 	}
2240 
2241 	evsel = perf_evlist__id2evsel(trace->evlist, sample->id);
2242 	if (evsel == NULL) {
2243 		fprintf(trace->output, "Unknown tp ID %" PRIu64 ", skipping...\n", sample->id);
2244 		return;
2245 	}
2246 
2247 	if (evsel->attr.type == PERF_TYPE_TRACEPOINT &&
2248 	    sample->raw_data == NULL) {
2249 		fprintf(trace->output, "%s sample with no payload for tid: %d, cpu %d, raw_size=%d, skipping...\n",
2250 		       perf_evsel__name(evsel), sample->tid,
2251 		       sample->cpu, sample->raw_size);
2252 	} else {
2253 		tracepoint_handler handler = evsel->handler;
2254 		handler(trace, evsel, event, sample);
2255 	}
2256 }
2257 
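/*
 * Live tracing main loop: add the requested tracepoints and software events
 * to the evlist, create the thread/cpu maps, open and mmap the events,
 * optionally start the forked workload, then consume the ring buffers until
 * interrupted or the workload exits.
 */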
2258 static int trace__run(struct trace *trace, int argc, const char **argv)
2259 {
2260 	struct perf_evlist *evlist = trace->evlist;
2261 	int err = -1, i;
2262 	unsigned long before;
2263 	const bool forks = argc > 0;
2264 	bool draining = false;
2265 
2266 	trace->live = true;
2267 
2268 	if (trace->trace_syscalls &&
2269 	    perf_evlist__add_syscall_newtp(evlist, trace__sys_enter,
2270 					   trace__sys_exit))
2271 		goto out_error_raw_syscalls;
2272 
2273 	if (trace->trace_syscalls)
2274 		perf_evlist__add_vfs_getname(evlist);
2275 
2276 	if ((trace->trace_pgfaults & TRACE_PFMAJ) &&
2277 	    perf_evlist__add_pgfault(evlist, PERF_COUNT_SW_PAGE_FAULTS_MAJ)) {
2278 		goto out_error_mem;
2279 	}
2280 
2281 	if ((trace->trace_pgfaults & TRACE_PFMIN) &&
2282 	    perf_evlist__add_pgfault(evlist, PERF_COUNT_SW_PAGE_FAULTS_MIN))
2283 		goto out_error_mem;
2284 
2285 	if (trace->sched &&
2286 	    perf_evlist__add_newtp(evlist, "sched", "sched_stat_runtime",
2287 				   trace__sched_stat_runtime))
2288 		goto out_error_sched_stat_runtime;
2289 
2290 	err = perf_evlist__create_maps(evlist, &trace->opts.target);
2291 	if (err < 0) {
2292 		fprintf(trace->output, "Problems parsing the target to trace, check your options!\n");
2293 		goto out_delete_evlist;
2294 	}
2295 
2296 	err = trace__symbols_init(trace, evlist);
2297 	if (err < 0) {
2298 		fprintf(trace->output, "Problems initializing symbol libraries!\n");
2299 		goto out_delete_evlist;
2300 	}
2301 
2302 	perf_evlist__config(evlist, &trace->opts);
2303 
2304 	signal(SIGCHLD, sig_handler);
2305 	signal(SIGINT, sig_handler);
2306 
2307 	if (forks) {
2308 		err = perf_evlist__prepare_workload(evlist, &trace->opts.target,
2309 						    argv, false, NULL);
2310 		if (err < 0) {
2311 			fprintf(trace->output, "Couldn't run the workload!\n");
2312 			goto out_delete_evlist;
2313 		}
2314 	}
2315 
2316 	err = perf_evlist__open(evlist);
2317 	if (err < 0)
2318 		goto out_error_open;
2319 
2320 	/*
2321 	 * Better not use !target__has_task() here because we need to cover the
2322 	 * case where no threads were specified in the command line, but a
2323 	 * workload was, and in that case we will fill in the thread_map when
2324 	 * we fork the workload in perf_evlist__prepare_workload.
2325 	 */
2326 	if (trace->filter_pids.nr > 0)
2327 		err = perf_evlist__set_filter_pids(evlist, trace->filter_pids.nr, trace->filter_pids.entries);
2328 	else if (evlist->threads->map[0] == -1)
2329 		err = perf_evlist__set_filter_pid(evlist, getpid());
2330 
2331 	if (err < 0) {
2332 		printf("err=%d,%s\n", -err, strerror(-err));
2333 		exit(1);
2334 	}
2335 
2336 	err = perf_evlist__mmap(evlist, trace->opts.mmap_pages, false);
2337 	if (err < 0)
2338 		goto out_error_mmap;
2339 
2340 	if (!target__none(&trace->opts.target))
2341 		perf_evlist__enable(evlist);
2342 
2343 	if (forks)
2344 		perf_evlist__start_workload(evlist);
2345 
2346 	trace->multiple_threads = evlist->threads->map[0] == -1 ||
2347 				  evlist->threads->nr > 1 ||
2348 				  perf_evlist__first(evlist)->attr.inherit;
2349 again:
2350 	before = trace->nr_events;
2351 
2352 	for (i = 0; i < evlist->nr_mmaps; i++) {
2353 		union perf_event *event;
2354 
2355 		while ((event = perf_evlist__mmap_read(evlist, i)) != NULL) {
2356 			struct perf_sample sample;
2357 
2358 			++trace->nr_events;
2359 
2360 			err = perf_evlist__parse_sample(evlist, event, &sample);
2361 			if (err) {
2362 				fprintf(trace->output, "Can't parse sample, err = %d, skipping...\n", err);
2363 				goto next_event;
2364 			}
2365 
2366 			trace__handle_event(trace, event, &sample);
2367 next_event:
2368 			perf_evlist__mmap_consume(evlist, i);
2369 
2370 			if (interrupted)
2371 				goto out_disable;
2372 
2373 			if (done && !draining) {
2374 				perf_evlist__disable(evlist);
2375 				draining = true;
2376 			}
2377 		}
2378 	}
2379 
2380 	if (trace->nr_events == before) {
2381 		int timeout = done ? 100 : -1;
2382 
2383 		if (!draining && perf_evlist__poll(evlist, timeout) > 0) {
2384 			if (perf_evlist__filter_pollfd(evlist, POLLERR | POLLHUP) == 0)
2385 				draining = true;
2386 
2387 			goto again;
2388 		}
2389 	} else {
2390 		goto again;
2391 	}
2392 
2393 out_disable:
2394 	thread__zput(trace->current);
2395 
2396 	perf_evlist__disable(evlist);
2397 
2398 	if (!err) {
2399 		if (trace->summary)
2400 			trace__fprintf_thread_summary(trace, trace->output);
2401 
2402 		if (trace->show_tool_stats) {
2403 			fprintf(trace->output, "Stats:\n "
2404 					       " vfs_getname : %" PRIu64 "\n"
2405 					       " proc_getname: %" PRIu64 "\n",
2406 				trace->stats.vfs_getname,
2407 				trace->stats.proc_getname);
2408 		}
2409 	}
2410 
2411 out_delete_evlist:
2412 	perf_evlist__delete(evlist);
2413 	trace->evlist = NULL;
2414 	trace->live = false;
2415 	return err;
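/*
 * The block below only scopes errbuf for the error labels; it is entered
 * exclusively via the goto out_error_* statements above and is never
 * reached by falling through the return just before it.
 */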
2416 {
2417 	char errbuf[BUFSIZ];
2418 
2419 out_error_sched_stat_runtime:
2420 	debugfs__strerror_open_tp(errno, errbuf, sizeof(errbuf), "sched", "sched_stat_runtime");
2421 	goto out_error;
2422 
2423 out_error_raw_syscalls:
2424 	debugfs__strerror_open_tp(errno, errbuf, sizeof(errbuf), "raw_syscalls", "sys_(enter|exit)");
2425 	goto out_error;
2426 
2427 out_error_mmap:
2428 	perf_evlist__strerror_mmap(evlist, errno, errbuf, sizeof(errbuf));
2429 	goto out_error;
2430 
2431 out_error_open:
2432 	perf_evlist__strerror_open(evlist, errno, errbuf, sizeof(errbuf));
2433 
2434 out_error:
2435 	fprintf(trace->output, "%s\n", errbuf);
2436 	goto out_delete_evlist;
2437 }
2438 out_error_mem:
2439 	fprintf(trace->output, "Not enough memory to run!\n");
2440 	goto out_delete_evlist;
2441 }
2442 
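/*
 * 'perf trace -i': replay a previously recorded perf.data file, wiring the
 * same sys_enter/sys_exit/page-fault handlers onto the events found in the
 * session instead of onto live mmap'ed buffers.
 */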
2443 static int trace__replay(struct trace *trace)
2444 {
2445 	const struct perf_evsel_str_handler handlers[] = {
2446 		{ "probe:vfs_getname",	     trace__vfs_getname, },
2447 	};
2448 	struct perf_data_file file = {
2449 		.path  = input_name,
2450 		.mode  = PERF_DATA_MODE_READ,
2451 		.force = trace->force,
2452 	};
2453 	struct perf_session *session;
2454 	struct perf_evsel *evsel;
2455 	int err = -1;
2456 
2457 	trace->tool.sample	  = trace__process_sample;
2458 	trace->tool.mmap	  = perf_event__process_mmap;
2459 	trace->tool.mmap2	  = perf_event__process_mmap2;
2460 	trace->tool.comm	  = perf_event__process_comm;
2461 	trace->tool.exit	  = perf_event__process_exit;
2462 	trace->tool.fork	  = perf_event__process_fork;
2463 	trace->tool.attr	  = perf_event__process_attr;
2464 	trace->tool.tracing_data = perf_event__process_tracing_data;
2465 	trace->tool.build_id	  = perf_event__process_build_id;
2466 
2467 	trace->tool.ordered_events = true;
2468 	trace->tool.ordering_requires_timestamps = true;
2469 
2470 	/* add tid to output */
2471 	trace->multiple_threads = true;
2472 
2473 	session = perf_session__new(&file, false, &trace->tool);
2474 	if (session == NULL)
2475 		return -1;
2476 
2477 	if (symbol__init(&session->header.env) < 0)
2478 		goto out;
2479 
2480 	trace->host = &session->machines.host;
2481 
2482 	err = perf_session__set_tracepoints_handlers(session, handlers);
2483 	if (err)
2484 		goto out;
2485 
2486 	evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
2487 						     "raw_syscalls:sys_enter");
2488 	/* older kernels have syscalls tp versus raw_syscalls */
2489 	if (evsel == NULL)
2490 		evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
2491 							     "syscalls:sys_enter");
2492 
2493 	if (evsel &&
2494 	    (perf_evsel__init_syscall_tp(evsel, trace__sys_enter) < 0 ||
2495 	    perf_evsel__init_sc_tp_ptr_field(evsel, args))) {
2496 		pr_err("Error initializing the raw_syscalls:sys_enter event\n");
2497 		goto out;
2498 	}
2499 
2500 	evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
2501 						     "raw_syscalls:sys_exit");
2502 	if (evsel == NULL)
2503 		evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
2504 							     "syscalls:sys_exit");
2505 	if (evsel &&
2506 	    (perf_evsel__init_syscall_tp(evsel, trace__sys_exit) < 0 ||
2507 	    perf_evsel__init_sc_tp_uint_field(evsel, ret))) {
2508 		pr_err("Error initializing the raw_syscalls:sys_exit event\n");
2509 		goto out;
2510 	}
2511 
2512 	evlist__for_each(session->evlist, evsel) {
2513 		if (evsel->attr.type == PERF_TYPE_SOFTWARE &&
2514 		    (evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ ||
2515 		     evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MIN ||
2516 		     evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS))
2517 			evsel->handler = trace__pgfault;
2518 	}
2519 
2520 	err = parse_target_str(trace);
2521 	if (err != 0)
2522 		goto out;
2523 
2524 	setup_pager();
2525 
2526 	err = perf_session__process_events(session);
2527 	if (err)
2528 		pr_err("Failed to process events, error %d\n", err);
2530 	else if (trace->summary)
2531 		trace__fprintf_thread_summary(trace, trace->output);
2532 
2533 out:
2534 	perf_session__delete(session);
2535 
2536 	return err;
2537 }
2538 
2539 static size_t trace__fprintf_threads_header(FILE *fp)
2540 {
2541 	size_t printed;
2542 
2543 	printed  = fprintf(fp, "\n Summary of events:\n\n");
2544 
2545 	return printed;
2546 }
2547 
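/*
 * Print one thread's summary table: one line per syscall with the call
 * count, min/avg/max duration in msec and the relative stddev in percent.
 */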
2548 static size_t thread__dump_stats(struct thread_trace *ttrace,
2549 				 struct trace *trace, FILE *fp)
2550 {
2551 	struct stats *stats;
2552 	size_t printed = 0;
2553 	struct syscall *sc;
2554 	struct int_node *inode = intlist__first(ttrace->syscall_stats);
2555 
2556 	if (inode == NULL)
2557 		return 0;
2558 
2559 	printed += fprintf(fp, "\n");
2560 
2561 	printed += fprintf(fp, "   syscall            calls      min       avg       max      stddev\n");
2562 	printed += fprintf(fp, "                               (msec)    (msec)    (msec)        (%%)\n");
2563 	printed += fprintf(fp, "   --------------- -------- --------- --------- ---------     ------\n");
2564 
2565 	/* each int_node is a syscall */
2566 	while (inode) {
2567 		stats = inode->priv;
2568 		if (stats) {
2569 			double min = (double)(stats->min) / NSEC_PER_MSEC;
2570 			double max = (double)(stats->max) / NSEC_PER_MSEC;
2571 			double avg = avg_stats(stats);
2572 			double pct;
2573 			u64 n = (u64) stats->n;
2574 
2575 			pct = avg ? 100.0 * stddev_stats(stats)/avg : 0.0;
2576 			avg /= NSEC_PER_MSEC;
2577 
2578 			sc = &trace->syscalls.table[inode->i];
2579 			printed += fprintf(fp, "   %-15s", sc->name);
2580 			printed += fprintf(fp, " %8" PRIu64 " %9.3f %9.3f",
2581 					   n, min, avg);
2582 			printed += fprintf(fp, " %9.3f %9.2f%%\n", max, pct);
2583 		}
2584 
2585 		inode = intlist__next(inode);
2586 	}
2587 
2588 	printed += fprintf(fp, "\n\n");
2589 
2590 	return printed;
2591 }
2592 
2593 /* struct used to pass data to per-thread function */
2594 struct summary_data {
2595 	FILE *fp;
2596 	struct trace *trace;
2597 	size_t printed;
2598 };
2599 
2600 static int trace__fprintf_one_thread(struct thread *thread, void *priv)
2601 {
2602 	struct summary_data *data = priv;
2603 	FILE *fp = data->fp;
2604 	size_t printed = data->printed;
2605 	struct trace *trace = data->trace;
2606 	struct thread_trace *ttrace = thread__priv(thread);
2607 	double ratio;
2608 
2609 	if (ttrace == NULL)
2610 		return 0;
2611 
2612 	ratio = (double)ttrace->nr_events / trace->nr_events * 100.0;
2613 
2614 	printed += fprintf(fp, " %s (%d), ", thread__comm_str(thread), thread->tid);
2615 	printed += fprintf(fp, "%lu events, ", ttrace->nr_events);
2616 	printed += fprintf(fp, "%.1f%%", ratio);
2617 	if (ttrace->pfmaj)
2618 		printed += fprintf(fp, ", %lu majfaults", ttrace->pfmaj);
2619 	if (ttrace->pfmin)
2620 		printed += fprintf(fp, ", %lu minfaults", ttrace->pfmin);
2621 	printed += fprintf(fp, ", %.3f msec\n", ttrace->runtime_ms);
2622 	printed += thread__dump_stats(ttrace, trace, fp);
2623 
2624 	data->printed += printed;
2625 
2626 	return 0;
2627 }
2628 
2629 static size_t trace__fprintf_thread_summary(struct trace *trace, FILE *fp)
2630 {
2631 	struct summary_data data = {
2632 		.fp = fp,
2633 		.trace = trace
2634 	};
2635 	data.printed = trace__fprintf_threads_header(fp);
2636 
2637 	machine__for_each_thread(trace->host, trace__fprintf_one_thread, &data);
2638 
2639 	return data.printed;
2640 }
2641 
2642 static int trace__set_duration(const struct option *opt, const char *str,
2643 			       int unset __maybe_unused)
2644 {
2645 	struct trace *trace = opt->value;
2646 
2647 	trace->duration_filter = atof(str);
2648 	return 0;
2649 }
2650 
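/*
 * --filter-pids parsing: entry 0 is always the tracer's own pid, so that
 * perf trace does not trace itself, followed by the user supplied list.
 */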
2651 static int trace__set_filter_pids(const struct option *opt, const char *str,
2652 				  int unset __maybe_unused)
2653 {
2654 	int ret = -1;
2655 	size_t i;
2656 	struct trace *trace = opt->value;
2657 	/*
2658 	 * FIXME: introduce a intarray class, plain parse csv and create a
2659 	 * { int nr, int entries[] } struct...
2660 	 */
2661 	struct intlist *list = intlist__new(str);
2662 
2663 	if (list == NULL)
2664 		return -1;
2665 
2666 	i = trace->filter_pids.nr = intlist__nr_entries(list) + 1;
2667 	trace->filter_pids.entries = calloc(i, sizeof(pid_t));
2668 
2669 	if (trace->filter_pids.entries == NULL)
2670 		goto out;
2671 
2672 	trace->filter_pids.entries[0] = getpid();
2673 
2674 	for (i = 1; i < trace->filter_pids.nr; ++i)
2675 		trace->filter_pids.entries[i] = intlist__entry(list, i - 1)->i;
2676 
2677 	intlist__delete(list);
2678 	ret = 0;
2679 out:
2680 	return ret;
2681 }
2682 
2683 static int trace__open_output(struct trace *trace, const char *filename)
2684 {
2685 	struct stat st;
2686 
2687 	if (!stat(filename, &st) && st.st_size) {
2688 		char oldname[PATH_MAX];
2689 
2690 		scnprintf(oldname, sizeof(oldname), "%s.old", filename);
2691 		unlink(oldname);
2692 		rename(filename, oldname);
2693 	}
2694 
2695 	trace->output = fopen(filename, "w");
2696 
2697 	return trace->output == NULL ? -errno : 0;
2698 }
2699 
2700 static int parse_pagefaults(const struct option *opt, const char *str,
2701 			    int unset __maybe_unused)
2702 {
2703 	int *trace_pgfaults = opt->value;
2704 
2705 	if (strcmp(str, "all") == 0)
2706 		*trace_pgfaults |= TRACE_PFMAJ | TRACE_PFMIN;
2707 	else if (strcmp(str, "maj") == 0)
2708 		*trace_pgfaults |= TRACE_PFMAJ;
2709 	else if (strcmp(str, "min") == 0)
2710 		*trace_pgfaults |= TRACE_PFMIN;
2711 	else
2712 		return -1;
2713 
2714 	return 0;
2715 }
2716 
2717 static void evlist__set_evsel_handler(struct perf_evlist *evlist, void *handler)
2718 {
2719 	struct perf_evsel *evsel;
2720 
2721 	evlist__for_each(evlist, evsel)
2722 		evsel->handler = handler;
2723 }
2724 
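/*
 * Entry point for 'perf trace'.  Illustrative invocations only, built from
 * the options declared below:
 *
 *   perf trace -p 1234 --duration 1.0   # attach to a pid, syscalls > 1 ms
 *   perf trace -e open,close -- ls      # selected syscalls of a workload
 *   perf trace -i perf.data -s          # summarize a recorded session
 */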
2725 int cmd_trace(int argc, const char **argv, const char *prefix __maybe_unused)
2726 {
2727 	const char *trace_usage[] = {
2728 		"perf trace [<options>] [<command>]",
2729 		"perf trace [<options>] -- <command> [<options>]",
2730 		"perf trace record [<options>] [<command>]",
2731 		"perf trace record [<options>] -- <command> [<options>]",
2732 		NULL
2733 	};
2734 	struct trace trace = {
2735 		.audit = {
2736 			.machine = audit_detect_machine(),
2737 			.open_id = audit_name_to_syscall("open", trace.audit.machine),
2738 		},
2739 		.syscalls = {
2740 			.max = -1,
2741 		},
2742 		.opts = {
2743 			.target = {
2744 				.uid	   = UINT_MAX,
2745 				.uses_mmap = true,
2746 			},
2747 			.user_freq     = UINT_MAX,
2748 			.user_interval = ULLONG_MAX,
2749 			.no_buffering  = true,
2750 			.mmap_pages    = UINT_MAX,
2751 			.proc_map_timeout  = 500,
2752 		},
2753 		.output = stdout,
2754 		.show_comm = true,
2755 		.trace_syscalls = true,
2756 	};
2757 	const char *output_name = NULL;
2758 	const char *ev_qualifier_str = NULL;
2759 	const struct option trace_options[] = {
2760 	OPT_CALLBACK(0, "event", &trace.evlist, "event",
2761 		     "event selector. use 'perf list' to list available events",
2762 		     parse_events_option),
2763 	OPT_BOOLEAN(0, "comm", &trace.show_comm,
2764 		    "show the thread COMM next to its id"),
2765 	OPT_BOOLEAN(0, "tool_stats", &trace.show_tool_stats, "show tool stats"),
2766 	OPT_STRING('e', "expr", &ev_qualifier_str, "expr", "list of syscalls to trace"),
2767 	OPT_STRING('o', "output", &output_name, "file", "output file name"),
2768 	OPT_STRING('i', "input", &input_name, "file", "Analyze events in file"),
2769 	OPT_STRING('p', "pid", &trace.opts.target.pid, "pid",
2770 		    "trace events on existing process id"),
2771 	OPT_STRING('t', "tid", &trace.opts.target.tid, "tid",
2772 		    "trace events on existing thread id"),
2773 	OPT_CALLBACK(0, "filter-pids", &trace, "CSV list of pids",
2774 		     "pids to filter (by the kernel)", trace__set_filter_pids),
2775 	OPT_BOOLEAN('a', "all-cpus", &trace.opts.target.system_wide,
2776 		    "system-wide collection from all CPUs"),
2777 	OPT_STRING('C', "cpu", &trace.opts.target.cpu_list, "cpu",
2778 		    "list of cpus to monitor"),
2779 	OPT_BOOLEAN(0, "no-inherit", &trace.opts.no_inherit,
2780 		    "child tasks do not inherit counters"),
2781 	OPT_CALLBACK('m', "mmap-pages", &trace.opts.mmap_pages, "pages",
2782 		     "number of mmap data pages",
2783 		     perf_evlist__parse_mmap_pages),
2784 	OPT_STRING('u', "uid", &trace.opts.target.uid_str, "user",
2785 		   "user to profile"),
2786 	OPT_CALLBACK(0, "duration", &trace, "float",
2787 		     "show only events with duration > N.M ms",
2788 		     trace__set_duration),
2789 	OPT_BOOLEAN(0, "sched", &trace.sched, "show blocking scheduler events"),
2790 	OPT_INCR('v', "verbose", &verbose, "be more verbose"),
2791 	OPT_BOOLEAN('T', "time", &trace.full_time,
2792 		    "Show full timestamp, not time relative to first start"),
2793 	OPT_BOOLEAN('s', "summary", &trace.summary_only,
2794 		    "Show only syscall summary with statistics"),
2795 	OPT_BOOLEAN('S', "with-summary", &trace.summary,
2796 		    "Show all syscalls and summary with statistics"),
2797 	OPT_CALLBACK_DEFAULT('F', "pf", &trace.trace_pgfaults, "all|maj|min",
2798 		     "Trace pagefaults", parse_pagefaults, "maj"),
2799 	OPT_BOOLEAN(0, "syscalls", &trace.trace_syscalls, "Trace syscalls"),
2800 	OPT_BOOLEAN('f', "force", &trace.force, "don't complain, do it"),
2801 	OPT_UINTEGER(0, "proc-map-timeout", &trace.opts.proc_map_timeout,
2802 			"per thread proc mmap processing timeout in ms"),
2803 	OPT_END()
2804 	};
2805 	const char * const trace_subcommands[] = { "record", NULL };
2806 	int err;
2807 	char bf[BUFSIZ];
2808 
2809 	signal(SIGSEGV, sighandler_dump_stack);
2810 	signal(SIGFPE, sighandler_dump_stack);
2811 
2812 	trace.evlist = perf_evlist__new();
2813 
2814 	if (trace.evlist == NULL) {
2815 		pr_err("Not enough memory to run!\n");
2816 		err = -ENOMEM;
2817 		goto out;
2818 	}
2819 
2820 	argc = parse_options_subcommand(argc, argv, trace_options, trace_subcommands,
2821 				 trace_usage, PARSE_OPT_STOP_AT_NON_OPTION);
2822 
2823 	if (trace.trace_pgfaults) {
2824 		trace.opts.sample_address = true;
2825 		trace.opts.sample_time = true;
2826 	}
2827 
2828 	if (trace.evlist->nr_entries > 0)
2829 		evlist__set_evsel_handler(trace.evlist, trace__event_handler);
2830 
2831 	if ((argc >= 1) && (strcmp(argv[0], "record") == 0))
2832 		return trace__record(&trace, argc-1, &argv[1]);
2833 
2834 	/* summary_only implies summary option, but don't overwrite summary if set */
2835 	if (trace.summary_only)
2836 		trace.summary = trace.summary_only;
2837 
2838 	if (!trace.trace_syscalls && !trace.trace_pgfaults &&
2839 	    trace.evlist->nr_entries == 0 /* Was --events used? */) {
2840 		pr_err("Please specify something to trace.\n");
2841 		return -1;
2842 	}
2843 
2844 	if (output_name != NULL) {
2845 		err = trace__open_output(&trace, output_name);
2846 		if (err < 0) {
2847 			perror("failed to create output file");
2848 			goto out;
2849 		}
2850 	}
2851 
2852 	if (ev_qualifier_str != NULL) {
2853 		const char *s = ev_qualifier_str;
2854 
2855 		trace.not_ev_qualifier = *s == '!';
2856 		if (trace.not_ev_qualifier)
2857 			++s;
2858 		trace.ev_qualifier = strlist__new(true, s);
2859 		if (trace.ev_qualifier == NULL) {
2860 			fputs("Not enough memory to parse event qualifier",
2861 			      trace.output);
2862 			err = -ENOMEM;
2863 			goto out_close;
2864 		}
2865 	}
2866 
2867 	err = target__validate(&trace.opts.target);
2868 	if (err) {
2869 		target__strerror(&trace.opts.target, err, bf, sizeof(bf));
2870 		fprintf(trace.output, "%s", bf);
2871 		goto out_close;
2872 	}
2873 
2874 	err = target__parse_uid(&trace.opts.target);
2875 	if (err) {
2876 		target__strerror(&trace.opts.target, err, bf, sizeof(bf));
2877 		fprintf(trace.output, "%s", bf);
2878 		goto out_close;
2879 	}
2880 
2881 	if (!argc && target__none(&trace.opts.target))
2882 		trace.opts.target.system_wide = true;
2883 
2884 	if (input_name)
2885 		err = trace__replay(&trace);
2886 	else
2887 		err = trace__run(&trace, argc, argv);
2888 
2889 out_close:
2890 	if (output_name != NULL)
2891 		fclose(trace.output);
2892 out:
2893 	return err;
2894 }
2895