xref: /openbmc/linux/tools/perf/builtin-trace.c (revision e2f1cf25)
1 #include <traceevent/event-parse.h>
2 #include "builtin.h"
3 #include "util/color.h"
4 #include "util/debug.h"
5 #include "util/evlist.h"
6 #include "util/machine.h"
7 #include "util/session.h"
8 #include "util/thread.h"
9 #include "util/parse-options.h"
10 #include "util/strlist.h"
11 #include "util/intlist.h"
12 #include "util/thread_map.h"
13 #include "util/stat.h"
14 #include "trace-event.h"
15 #include "util/parse-events.h"
16 
17 #include <libaudit.h>
18 #include <stdlib.h>
19 #include <sys/mman.h>
20 #include <linux/futex.h>
21 
/*
 * Fallback definitions: every constant in this section is wrapped in
 * #ifndef, so it is only defined here when the headers included above did
 * not already provide it.
 */

/* For older distros: */
#ifndef MAP_STACK
# define MAP_STACK		0x20000
#endif

/* madvise() behaviors */
#ifndef MADV_HWPOISON
# define MADV_HWPOISON		100
#endif

#ifndef MADV_MERGEABLE
# define MADV_MERGEABLE		12
#endif

#ifndef MADV_UNMERGEABLE
# define MADV_UNMERGEABLE	13
#endif

/* eventfd flags */
#ifndef EFD_SEMAPHORE
# define EFD_SEMAPHORE		1
#endif

#ifndef EFD_NONBLOCK
# define EFD_NONBLOCK		00004000
#endif

#ifndef EFD_CLOEXEC
# define EFD_CLOEXEC		02000000
#endif

/* open/pipe flags */
#ifndef O_CLOEXEC
# define O_CLOEXEC		02000000
#endif

/* socket types and socket type flags */
#ifndef SOCK_DCCP
# define SOCK_DCCP		6
#endif

#ifndef SOCK_CLOEXEC
# define SOCK_CLOEXEC		02000000
#endif

#ifndef SOCK_NONBLOCK
# define SOCK_NONBLOCK		00004000
#endif

/* send/recv message flags */
#ifndef MSG_CMSG_CLOEXEC
# define MSG_CMSG_CLOEXEC	0x40000000
#endif

/* PERF_FLAG_* bits, decoded by the perf flags beautifier below */
#ifndef PERF_FLAG_FD_NO_GROUP
# define PERF_FLAG_FD_NO_GROUP		(1UL << 0)
#endif

#ifndef PERF_FLAG_FD_OUTPUT
# define PERF_FLAG_FD_OUTPUT		(1UL << 1)
#endif

#ifndef PERF_FLAG_PID_CGROUP
# define PERF_FLAG_PID_CGROUP		(1UL << 2) /* pid=cgroup id, per-cpu mode only */
#endif

#ifndef PERF_FLAG_FD_CLOEXEC
# define PERF_FLAG_FD_CLOEXEC		(1UL << 3) /* O_CLOEXEC */
#endif
86 
87 
/*
 * Accessor for one field inside a tracepoint sample's raw data: the byte
 * offset of the field plus the function used to read it.  Only one of the
 * two accessors is valid for a given field, hence the anonymous union.
 */
struct tp_field {
	int offset;	/* added to sample->raw_data by the accessors below */
	union {
		/* reads the field as an unsigned integer, widened to u64 */
		u64 (*integer)(struct tp_field *field, struct perf_sample *sample);
		/* returns the address of the field inside the raw data */
		void *(*pointer)(struct tp_field *field, struct perf_sample *sample);
	};
};
95 
96 #define TP_UINT_FIELD(bits) \
97 static u64 tp_field__u##bits(struct tp_field *field, struct perf_sample *sample) \
98 { \
99 	u##bits value; \
100 	memcpy(&value, sample->raw_data + field->offset, sizeof(value)); \
101 	return value;  \
102 }
103 
104 TP_UINT_FIELD(8);
105 TP_UINT_FIELD(16);
106 TP_UINT_FIELD(32);
107 TP_UINT_FIELD(64);
108 
109 #define TP_UINT_FIELD__SWAPPED(bits) \
110 static u64 tp_field__swapped_u##bits(struct tp_field *field, struct perf_sample *sample) \
111 { \
112 	u##bits value; \
113 	memcpy(&value, sample->raw_data + field->offset, sizeof(value)); \
114 	return bswap_##bits(value);\
115 }
116 
117 TP_UINT_FIELD__SWAPPED(16);
118 TP_UINT_FIELD__SWAPPED(32);
119 TP_UINT_FIELD__SWAPPED(64);
120 
121 static int tp_field__init_uint(struct tp_field *field,
122 			       struct format_field *format_field,
123 			       bool needs_swap)
124 {
125 	field->offset = format_field->offset;
126 
127 	switch (format_field->size) {
128 	case 1:
129 		field->integer = tp_field__u8;
130 		break;
131 	case 2:
132 		field->integer = needs_swap ? tp_field__swapped_u16 : tp_field__u16;
133 		break;
134 	case 4:
135 		field->integer = needs_swap ? tp_field__swapped_u32 : tp_field__u32;
136 		break;
137 	case 8:
138 		field->integer = needs_swap ? tp_field__swapped_u64 : tp_field__u64;
139 		break;
140 	default:
141 		return -1;
142 	}
143 
144 	return 0;
145 }
146 
/*
 * Pointer accessor: return the address of the field inside the sample's raw
 * data, i.e. raw_data advanced by field->offset.  Nothing is copied.
 */
static void *tp_field__ptr(struct tp_field *field, struct perf_sample *sample)
{
	return sample->raw_data + field->offset;
}
151 
152 static int tp_field__init_ptr(struct tp_field *field, struct format_field *format_field)
153 {
154 	field->offset = format_field->offset;
155 	field->pointer = tp_field__ptr;
156 	return 0;
157 }
158 
/*
 * Per-evsel private data for the syscall enter/exit tracepoints, stored in
 * evsel->priv.
 */
struct syscall_tp {
	struct tp_field id;	/* the "id" field, initialized for both events */
	union {
		/*
		 * Below, "args" is initialized (as a pointer field) for the
		 * sys_enter event and "ret" (as an integer field) for the
		 * sys_exit event, so the two can share storage.
		 */
		struct tp_field args, ret;
	};
};
165 
166 static int perf_evsel__init_tp_uint_field(struct perf_evsel *evsel,
167 					  struct tp_field *field,
168 					  const char *name)
169 {
170 	struct format_field *format_field = perf_evsel__field(evsel, name);
171 
172 	if (format_field == NULL)
173 		return -1;
174 
175 	return tp_field__init_uint(field, format_field, evsel->needs_swap);
176 }
177 
178 #define perf_evsel__init_sc_tp_uint_field(evsel, name) \
179 	({ struct syscall_tp *sc = evsel->priv;\
180 	   perf_evsel__init_tp_uint_field(evsel, &sc->name, #name); })
181 
/*
 * Look up the tracepoint field called @name on @evsel and initialize @field
 * as a pointer accessor for it.
 *
 * Returns -1 when the field does not exist, otherwise the result of
 * tp_field__init_ptr().
 */
static int perf_evsel__init_tp_ptr_field(struct perf_evsel *evsel,
					 struct tp_field *field,
					 const char *name)
{
	struct format_field *fmt = perf_evsel__field(evsel, name);

	return fmt ? tp_field__init_ptr(field, fmt) : -1;
}

/*
 * Convenience wrapper: initialize the struct syscall_tp member called 'name'
 * (in evsel->priv) as a pointer to the tracepoint field of the same name.
 */
#define perf_evsel__init_sc_tp_ptr_field(evsel, name) \
	({ struct syscall_tp *sc = evsel->priv;\
	   perf_evsel__init_tp_ptr_field(evsel, &sc->name, #name); })
197 
/*
 * Release an evsel together with its private data: evsel->priv is freed
 * first, then the evsel itself is deleted.
 */
static void perf_evsel__delete_priv(struct perf_evsel *evsel)
{
	zfree(&evsel->priv);
	perf_evsel__delete(evsel);
}
203 
204 static int perf_evsel__init_syscall_tp(struct perf_evsel *evsel, void *handler)
205 {
206 	evsel->priv = malloc(sizeof(struct syscall_tp));
207 	if (evsel->priv != NULL) {
208 		if (perf_evsel__init_sc_tp_uint_field(evsel, id))
209 			goto out_delete;
210 
211 		evsel->handler = handler;
212 		return 0;
213 	}
214 
215 	return -ENOMEM;
216 
217 out_delete:
218 	zfree(&evsel->priv);
219 	return -ENOENT;
220 }
221 
/*
 * Create the evsel for the syscall tracepoint named by @direction and
 * prepare it with perf_evsel__init_syscall_tp().
 *
 * Returns the new evsel, or NULL when the tracepoint cannot be created or
 * its initialization fails (the evsel is deleted in the latter case).
 */
static struct perf_evsel *perf_evsel__syscall_newtp(const char *direction, void *handler)
{
	struct perf_evsel *evsel = perf_evsel__newtp("raw_syscalls", direction);

	/* older kernel (e.g., RHEL6) use syscalls:{enter,exit} */
	if (!evsel)
		evsel = perf_evsel__newtp("syscalls", direction);

	if (!evsel)
		return NULL;

	if (perf_evsel__init_syscall_tp(evsel, handler)) {
		perf_evsel__delete_priv(evsel);
		return NULL;
	}

	return evsel;
}
241 
/*
 * Read the struct syscall_tp member called 'name' (from evsel->priv) out of
 * 'sample' through its integer accessor.
 */
#define perf_evsel__sc_tp_uint(evsel, name, sample) \
	({ struct syscall_tp *fields = evsel->priv; \
	   fields->name.integer(&fields->name, sample); })

/*
 * Same as above, but through the pointer accessor: yields the address of
 * the field inside the sample's raw data.
 */
#define perf_evsel__sc_tp_ptr(evsel, name, sample) \
	({ struct syscall_tp *fields = evsel->priv; \
	   fields->name.pointer(&fields->name, sample); })
249 
250 static int perf_evlist__add_syscall_newtp(struct perf_evlist *evlist,
251 					  void *sys_enter_handler,
252 					  void *sys_exit_handler)
253 {
254 	int ret = -1;
255 	struct perf_evsel *sys_enter, *sys_exit;
256 
257 	sys_enter = perf_evsel__syscall_newtp("sys_enter", sys_enter_handler);
258 	if (sys_enter == NULL)
259 		goto out;
260 
261 	if (perf_evsel__init_sc_tp_ptr_field(sys_enter, args))
262 		goto out_delete_sys_enter;
263 
264 	sys_exit = perf_evsel__syscall_newtp("sys_exit", sys_exit_handler);
265 	if (sys_exit == NULL)
266 		goto out_delete_sys_enter;
267 
268 	if (perf_evsel__init_sc_tp_uint_field(sys_exit, ret))
269 		goto out_delete_sys_exit;
270 
271 	perf_evlist__add(evlist, sys_enter);
272 	perf_evlist__add(evlist, sys_exit);
273 
274 	ret = 0;
275 out:
276 	return ret;
277 
278 out_delete_sys_exit:
279 	perf_evsel__delete_priv(sys_exit);
280 out_delete_sys_enter:
281 	perf_evsel__delete_priv(sys_enter);
282 	goto out;
283 }
284 
285 
/*
 * Everything an argument beautifier (the syscall_arg__scnprintf_*()
 * functions) gets to know about the argument it is formatting.
 */
struct syscall_arg {
	unsigned long val;	/* raw value of the argument */
	struct thread *thread;	/* not used by the beautifiers visible here */
	struct trace  *trace;	/* not used by the beautifiers visible here */
	void	      *parm;	/* beautifier specific, e.g. a struct strarray */
	u8	      idx;	/* position of this argument in the syscall */
	/*
	 * Bit N refers to argument N.  Beautifiers set bits for other
	 * arguments (see the open flags and futex op ones); presumably a
	 * set bit suppresses printing of that argument -- confirm against
	 * the code that consumes the mask.
	 */
	u8	      mask;
};
294 
/*
 * Table used to translate an integer argument into a name:
 * entries[value - offset] is the name for 'value'.
 */
struct strarray {
	int	    offset;	/* value that maps to entries[0] */
	int	    nr_entries;	/* number of elements in entries[] */
	const char **entries;
};

/*
 * Define 'strarray__<array>' describing the string array 'array', with
 * entries[0] standing for the value 0 (offset is left zero-initialized).
 */
#define DEFINE_STRARRAY(array) struct strarray strarray__##array = { \
	.nr_entries = ARRAY_SIZE(array), \
	.entries = array, \
}

/* Same, for tables whose first entry stands for the value 'off'. */
#define DEFINE_STRARRAY_OFFSET(array, off) struct strarray strarray__##array = { \
	.offset	    = off, \
	.nr_entries = ARRAY_SIZE(array), \
	.entries = array, \
}
311 
312 static size_t __syscall_arg__scnprintf_strarray(char *bf, size_t size,
313 						const char *intfmt,
314 					        struct syscall_arg *arg)
315 {
316 	struct strarray *sa = arg->parm;
317 	int idx = arg->val - sa->offset;
318 
319 	if (idx < 0 || idx >= sa->nr_entries)
320 		return scnprintf(bf, size, intfmt, arg->val);
321 
322 	return scnprintf(bf, size, "%s", sa->entries[idx]);
323 }
324 
325 static size_t syscall_arg__scnprintf_strarray(char *bf, size_t size,
326 					      struct syscall_arg *arg)
327 {
328 	return __syscall_arg__scnprintf_strarray(bf, size, "%d", arg);
329 }
330 
331 #define SCA_STRARRAY syscall_arg__scnprintf_strarray
332 
#if defined(__i386__) || defined(__x86_64__)
/*
 * strarray beautifier that prints values without a name in hexadecimal
 * ("%#x") instead of decimal.
 *
 * FIXME: Make this available to all arches as soon as the ioctl beautifier
 * 	  gets rewritten to support all arches.
 */
static size_t syscall_arg__scnprintf_strhexarray(char *bf, size_t size,
						 struct syscall_arg *arg)
{
	return __syscall_arg__scnprintf_strarray(bf, size, "%#x", arg);
}

#define SCA_STRHEXARRAY syscall_arg__scnprintf_strhexarray
#endif /* defined(__i386__) || defined(__x86_64__) */
346 
347 static size_t syscall_arg__scnprintf_fd(char *bf, size_t size,
348 					struct syscall_arg *arg);
349 
350 #define SCA_FD syscall_arg__scnprintf_fd
351 
352 static size_t syscall_arg__scnprintf_fd_at(char *bf, size_t size,
353 					   struct syscall_arg *arg)
354 {
355 	int fd = arg->val;
356 
357 	if (fd == AT_FDCWD)
358 		return scnprintf(bf, size, "CWD");
359 
360 	return syscall_arg__scnprintf_fd(bf, size, arg);
361 }
362 
363 #define SCA_FDAT syscall_arg__scnprintf_fd_at
364 
/*
 * Beautifier used for the fd argument of close(); only declared here so
 * that the syscall format table can reference it, the body is not in view.
 */
static size_t syscall_arg__scnprintf_close_fd(char *bf, size_t size,
					      struct syscall_arg *arg);

#define SCA_CLOSE_FD syscall_arg__scnprintf_close_fd
369 
/* Print the full unsigned long argument value in hexadecimal ("%#lx"). */
static size_t syscall_arg__scnprintf_hex(char *bf, size_t size,
					 struct syscall_arg *arg)
{
	return scnprintf(bf, size, "%#lx", arg->val);
}

#define SCA_HEX syscall_arg__scnprintf_hex
377 
378 static size_t syscall_arg__scnprintf_int(char *bf, size_t size,
379 					 struct syscall_arg *arg)
380 {
381 	return scnprintf(bf, size, "%d", arg->val);
382 }
383 
384 #define SCA_INT syscall_arg__scnprintf_int
385 
386 static size_t syscall_arg__scnprintf_mmap_prot(char *bf, size_t size,
387 					       struct syscall_arg *arg)
388 {
389 	int printed = 0, prot = arg->val;
390 
391 	if (prot == PROT_NONE)
392 		return scnprintf(bf, size, "NONE");
393 #define	P_MMAP_PROT(n) \
394 	if (prot & PROT_##n) { \
395 		printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
396 		prot &= ~PROT_##n; \
397 	}
398 
399 	P_MMAP_PROT(EXEC);
400 	P_MMAP_PROT(READ);
401 	P_MMAP_PROT(WRITE);
402 #ifdef PROT_SEM
403 	P_MMAP_PROT(SEM);
404 #endif
405 	P_MMAP_PROT(GROWSDOWN);
406 	P_MMAP_PROT(GROWSUP);
407 #undef P_MMAP_PROT
408 
409 	if (prot)
410 		printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", prot);
411 
412 	return printed;
413 }
414 
415 #define SCA_MMAP_PROT syscall_arg__scnprintf_mmap_prot
416 
417 static size_t syscall_arg__scnprintf_mmap_flags(char *bf, size_t size,
418 						struct syscall_arg *arg)
419 {
420 	int printed = 0, flags = arg->val;
421 
422 #define	P_MMAP_FLAG(n) \
423 	if (flags & MAP_##n) { \
424 		printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
425 		flags &= ~MAP_##n; \
426 	}
427 
428 	P_MMAP_FLAG(SHARED);
429 	P_MMAP_FLAG(PRIVATE);
430 #ifdef MAP_32BIT
431 	P_MMAP_FLAG(32BIT);
432 #endif
433 	P_MMAP_FLAG(ANONYMOUS);
434 	P_MMAP_FLAG(DENYWRITE);
435 	P_MMAP_FLAG(EXECUTABLE);
436 	P_MMAP_FLAG(FILE);
437 	P_MMAP_FLAG(FIXED);
438 	P_MMAP_FLAG(GROWSDOWN);
439 #ifdef MAP_HUGETLB
440 	P_MMAP_FLAG(HUGETLB);
441 #endif
442 	P_MMAP_FLAG(LOCKED);
443 	P_MMAP_FLAG(NONBLOCK);
444 	P_MMAP_FLAG(NORESERVE);
445 	P_MMAP_FLAG(POPULATE);
446 	P_MMAP_FLAG(STACK);
447 #ifdef MAP_UNINITIALIZED
448 	P_MMAP_FLAG(UNINITIALIZED);
449 #endif
450 #undef P_MMAP_FLAG
451 
452 	if (flags)
453 		printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);
454 
455 	return printed;
456 }
457 
458 #define SCA_MMAP_FLAGS syscall_arg__scnprintf_mmap_flags
459 
460 static size_t syscall_arg__scnprintf_mremap_flags(char *bf, size_t size,
461 						  struct syscall_arg *arg)
462 {
463 	int printed = 0, flags = arg->val;
464 
465 #define P_MREMAP_FLAG(n) \
466 	if (flags & MREMAP_##n) { \
467 		printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
468 		flags &= ~MREMAP_##n; \
469 	}
470 
471 	P_MREMAP_FLAG(MAYMOVE);
472 #ifdef MREMAP_FIXED
473 	P_MREMAP_FLAG(FIXED);
474 #endif
475 #undef P_MREMAP_FLAG
476 
477 	if (flags)
478 		printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);
479 
480 	return printed;
481 }
482 
483 #define SCA_MREMAP_FLAGS syscall_arg__scnprintf_mremap_flags
484 
485 static size_t syscall_arg__scnprintf_madvise_behavior(char *bf, size_t size,
486 						      struct syscall_arg *arg)
487 {
488 	int behavior = arg->val;
489 
490 	switch (behavior) {
491 #define	P_MADV_BHV(n) case MADV_##n: return scnprintf(bf, size, #n)
492 	P_MADV_BHV(NORMAL);
493 	P_MADV_BHV(RANDOM);
494 	P_MADV_BHV(SEQUENTIAL);
495 	P_MADV_BHV(WILLNEED);
496 	P_MADV_BHV(DONTNEED);
497 	P_MADV_BHV(REMOVE);
498 	P_MADV_BHV(DONTFORK);
499 	P_MADV_BHV(DOFORK);
500 	P_MADV_BHV(HWPOISON);
501 #ifdef MADV_SOFT_OFFLINE
502 	P_MADV_BHV(SOFT_OFFLINE);
503 #endif
504 	P_MADV_BHV(MERGEABLE);
505 	P_MADV_BHV(UNMERGEABLE);
506 #ifdef MADV_HUGEPAGE
507 	P_MADV_BHV(HUGEPAGE);
508 #endif
509 #ifdef MADV_NOHUGEPAGE
510 	P_MADV_BHV(NOHUGEPAGE);
511 #endif
512 #ifdef MADV_DONTDUMP
513 	P_MADV_BHV(DONTDUMP);
514 #endif
515 #ifdef MADV_DODUMP
516 	P_MADV_BHV(DODUMP);
517 #endif
518 #undef P_MADV_PHV
519 	default: break;
520 	}
521 
522 	return scnprintf(bf, size, "%#x", behavior);
523 }
524 
525 #define SCA_MADV_BHV syscall_arg__scnprintf_madvise_behavior
526 
527 static size_t syscall_arg__scnprintf_flock(char *bf, size_t size,
528 					   struct syscall_arg *arg)
529 {
530 	int printed = 0, op = arg->val;
531 
532 	if (op == 0)
533 		return scnprintf(bf, size, "NONE");
534 #define	P_CMD(cmd) \
535 	if ((op & LOCK_##cmd) == LOCK_##cmd) { \
536 		printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #cmd); \
537 		op &= ~LOCK_##cmd; \
538 	}
539 
540 	P_CMD(SH);
541 	P_CMD(EX);
542 	P_CMD(NB);
543 	P_CMD(UN);
544 	P_CMD(MAND);
545 	P_CMD(RW);
546 	P_CMD(READ);
547 	P_CMD(WRITE);
548 #undef P_OP
549 
550 	if (op)
551 		printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", op);
552 
553 	return printed;
554 }
555 
556 #define SCA_FLOCK syscall_arg__scnprintf_flock
557 
558 static size_t syscall_arg__scnprintf_futex_op(char *bf, size_t size, struct syscall_arg *arg)
559 {
560 	enum syscall_futex_args {
561 		SCF_UADDR   = (1 << 0),
562 		SCF_OP	    = (1 << 1),
563 		SCF_VAL	    = (1 << 2),
564 		SCF_TIMEOUT = (1 << 3),
565 		SCF_UADDR2  = (1 << 4),
566 		SCF_VAL3    = (1 << 5),
567 	};
568 	int op = arg->val;
569 	int cmd = op & FUTEX_CMD_MASK;
570 	size_t printed = 0;
571 
572 	switch (cmd) {
573 #define	P_FUTEX_OP(n) case FUTEX_##n: printed = scnprintf(bf, size, #n);
574 	P_FUTEX_OP(WAIT);	    arg->mask |= SCF_VAL3|SCF_UADDR2;		  break;
575 	P_FUTEX_OP(WAKE);	    arg->mask |= SCF_VAL3|SCF_UADDR2|SCF_TIMEOUT; break;
576 	P_FUTEX_OP(FD);		    arg->mask |= SCF_VAL3|SCF_UADDR2|SCF_TIMEOUT; break;
577 	P_FUTEX_OP(REQUEUE);	    arg->mask |= SCF_VAL3|SCF_TIMEOUT;	          break;
578 	P_FUTEX_OP(CMP_REQUEUE);    arg->mask |= SCF_TIMEOUT;			  break;
579 	P_FUTEX_OP(CMP_REQUEUE_PI); arg->mask |= SCF_TIMEOUT;			  break;
580 	P_FUTEX_OP(WAKE_OP);							  break;
581 	P_FUTEX_OP(LOCK_PI);	    arg->mask |= SCF_VAL3|SCF_UADDR2|SCF_TIMEOUT; break;
582 	P_FUTEX_OP(UNLOCK_PI);	    arg->mask |= SCF_VAL3|SCF_UADDR2|SCF_TIMEOUT; break;
583 	P_FUTEX_OP(TRYLOCK_PI);	    arg->mask |= SCF_VAL3|SCF_UADDR2;		  break;
584 	P_FUTEX_OP(WAIT_BITSET);    arg->mask |= SCF_UADDR2;			  break;
585 	P_FUTEX_OP(WAKE_BITSET);    arg->mask |= SCF_UADDR2;			  break;
586 	P_FUTEX_OP(WAIT_REQUEUE_PI);						  break;
587 	default: printed = scnprintf(bf, size, "%#x", cmd);			  break;
588 	}
589 
590 	if (op & FUTEX_PRIVATE_FLAG)
591 		printed += scnprintf(bf + printed, size - printed, "|PRIV");
592 
593 	if (op & FUTEX_CLOCK_REALTIME)
594 		printed += scnprintf(bf + printed, size - printed, "|CLKRT");
595 
596 	return printed;
597 }
598 
599 #define SCA_FUTEX_OP  syscall_arg__scnprintf_futex_op
600 
/*
 * Name tables for the strarray beautifier.  Each table is paired with a
 * 'strarray__<name>' descriptor; the name for a value is found at index
 * (value - offset), with offset 0 unless DEFINE_STRARRAY_OFFSET is used.
 */

/* epoll_ctl() op; the first entry stands for the value 1 */
static const char *epoll_ctl_ops[] = { "ADD", "DEL", "MOD", };
static DEFINE_STRARRAY_OFFSET(epoll_ctl_ops, 1);

/* getitimer() 'which' */
static const char *itimers[] = { "REAL", "VIRTUAL", "PROF", };
static DEFINE_STRARRAY(itimers);

/* lseek() whence; DATA and HOLE only when the headers know about them */
static const char *whences[] = { "SET", "CUR", "END",
#ifdef SEEK_DATA
"DATA",
#endif
#ifdef SEEK_HOLE
"HOLE",
#endif
};
static DEFINE_STRARRAY(whences);

/* fcntl() cmd */
static const char *fcntl_cmds[] = {
	"DUPFD", "GETFD", "SETFD", "GETFL", "SETFL", "GETLK", "SETLK",
	"SETLKW", "SETOWN", "GETOWN", "SETSIG", "GETSIG", "F_GETLK64",
	"F_SETLK64", "F_SETLKW64", "F_SETOWN_EX", "F_GETOWN_EX",
	"F_GETOWNER_UIDS",
};
static DEFINE_STRARRAY(fcntl_cmds);

/* getrlimit() resource */
static const char *rlimit_resources[] = {
	"CPU", "FSIZE", "DATA", "STACK", "CORE", "RSS", "NPROC", "NOFILE",
	"MEMLOCK", "AS", "LOCKS", "SIGPENDING", "MSGQUEUE", "NICE", "RTPRIO",
	"RTTIME",
};
static DEFINE_STRARRAY(rlimit_resources);

/* signal mask 'how' values */
static const char *sighow[] = { "BLOCK", "UNBLOCK", "SETMASK", };
static DEFINE_STRARRAY(sighow);

/* clock_gettime() clk_id */
static const char *clockid[] = {
	"REALTIME", "MONOTONIC", "PROCESS_CPUTIME_ID", "THREAD_CPUTIME_ID",
	"MONOTONIC_RAW", "REALTIME_COARSE", "MONOTONIC_COARSE",
};
static DEFINE_STRARRAY(clockid);

/* socket address/protocol families */
static const char *socket_families[] = {
	"UNSPEC", "LOCAL", "INET", "AX25", "IPX", "APPLETALK", "NETROM",
	"BRIDGE", "ATMPVC", "X25", "INET6", "ROSE", "DECnet", "NETBEUI",
	"SECURITY", "KEY", "NETLINK", "PACKET", "ASH", "ECONET", "ATMSVC",
	"RDS", "SNA", "IRDA", "PPPOX", "WANPIPE", "LLC", "IB", "CAN", "TIPC",
	"BLUETOOTH", "IUCV", "RXRPC", "ISDN", "PHONET", "IEEE802154", "CAIF",
	"ALG", "NFC", "VSOCK",
};
static DEFINE_STRARRAY(socket_families);
650 
651 #ifndef SOCK_TYPE_MASK
652 #define SOCK_TYPE_MASK 0xf
653 #endif
654 
655 static size_t syscall_arg__scnprintf_socket_type(char *bf, size_t size,
656 						      struct syscall_arg *arg)
657 {
658 	size_t printed;
659 	int type = arg->val,
660 	    flags = type & ~SOCK_TYPE_MASK;
661 
662 	type &= SOCK_TYPE_MASK;
663 	/*
664  	 * Can't use a strarray, MIPS may override for ABI reasons.
665  	 */
666 	switch (type) {
667 #define	P_SK_TYPE(n) case SOCK_##n: printed = scnprintf(bf, size, #n); break;
668 	P_SK_TYPE(STREAM);
669 	P_SK_TYPE(DGRAM);
670 	P_SK_TYPE(RAW);
671 	P_SK_TYPE(RDM);
672 	P_SK_TYPE(SEQPACKET);
673 	P_SK_TYPE(DCCP);
674 	P_SK_TYPE(PACKET);
675 #undef P_SK_TYPE
676 	default:
677 		printed = scnprintf(bf, size, "%#x", type);
678 	}
679 
680 #define	P_SK_FLAG(n) \
681 	if (flags & SOCK_##n) { \
682 		printed += scnprintf(bf + printed, size - printed, "|%s", #n); \
683 		flags &= ~SOCK_##n; \
684 	}
685 
686 	P_SK_FLAG(CLOEXEC);
687 	P_SK_FLAG(NONBLOCK);
688 #undef P_SK_FLAG
689 
690 	if (flags)
691 		printed += scnprintf(bf + printed, size - printed, "|%#x", flags);
692 
693 	return printed;
694 }
695 
696 #define SCA_SK_TYPE syscall_arg__scnprintf_socket_type
697 
698 #ifndef MSG_PROBE
699 #define MSG_PROBE	     0x10
700 #endif
701 #ifndef MSG_WAITFORONE
702 #define MSG_WAITFORONE	0x10000
703 #endif
704 #ifndef MSG_SENDPAGE_NOTLAST
705 #define MSG_SENDPAGE_NOTLAST 0x20000
706 #endif
707 #ifndef MSG_FASTOPEN
708 #define MSG_FASTOPEN	     0x20000000
709 #endif
710 
711 static size_t syscall_arg__scnprintf_msg_flags(char *bf, size_t size,
712 					       struct syscall_arg *arg)
713 {
714 	int printed = 0, flags = arg->val;
715 
716 	if (flags == 0)
717 		return scnprintf(bf, size, "NONE");
718 #define	P_MSG_FLAG(n) \
719 	if (flags & MSG_##n) { \
720 		printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
721 		flags &= ~MSG_##n; \
722 	}
723 
724 	P_MSG_FLAG(OOB);
725 	P_MSG_FLAG(PEEK);
726 	P_MSG_FLAG(DONTROUTE);
727 	P_MSG_FLAG(TRYHARD);
728 	P_MSG_FLAG(CTRUNC);
729 	P_MSG_FLAG(PROBE);
730 	P_MSG_FLAG(TRUNC);
731 	P_MSG_FLAG(DONTWAIT);
732 	P_MSG_FLAG(EOR);
733 	P_MSG_FLAG(WAITALL);
734 	P_MSG_FLAG(FIN);
735 	P_MSG_FLAG(SYN);
736 	P_MSG_FLAG(CONFIRM);
737 	P_MSG_FLAG(RST);
738 	P_MSG_FLAG(ERRQUEUE);
739 	P_MSG_FLAG(NOSIGNAL);
740 	P_MSG_FLAG(MORE);
741 	P_MSG_FLAG(WAITFORONE);
742 	P_MSG_FLAG(SENDPAGE_NOTLAST);
743 	P_MSG_FLAG(FASTOPEN);
744 	P_MSG_FLAG(CMSG_CLOEXEC);
745 #undef P_MSG_FLAG
746 
747 	if (flags)
748 		printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);
749 
750 	return printed;
751 }
752 
753 #define SCA_MSG_FLAGS syscall_arg__scnprintf_msg_flags
754 
755 static size_t syscall_arg__scnprintf_access_mode(char *bf, size_t size,
756 						 struct syscall_arg *arg)
757 {
758 	size_t printed = 0;
759 	int mode = arg->val;
760 
761 	if (mode == F_OK) /* 0 */
762 		return scnprintf(bf, size, "F");
763 #define	P_MODE(n) \
764 	if (mode & n##_OK) { \
765 		printed += scnprintf(bf + printed, size - printed, "%s", #n); \
766 		mode &= ~n##_OK; \
767 	}
768 
769 	P_MODE(R);
770 	P_MODE(W);
771 	P_MODE(X);
772 #undef P_MODE
773 
774 	if (mode)
775 		printed += scnprintf(bf + printed, size - printed, "|%#x", mode);
776 
777 	return printed;
778 }
779 
780 #define SCA_ACCMODE syscall_arg__scnprintf_access_mode
781 
782 static size_t syscall_arg__scnprintf_open_flags(char *bf, size_t size,
783 					       struct syscall_arg *arg)
784 {
785 	int printed = 0, flags = arg->val;
786 
787 	if (!(flags & O_CREAT))
788 		arg->mask |= 1 << (arg->idx + 1); /* Mask the mode parm */
789 
790 	if (flags == 0)
791 		return scnprintf(bf, size, "RDONLY");
792 #define	P_FLAG(n) \
793 	if (flags & O_##n) { \
794 		printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
795 		flags &= ~O_##n; \
796 	}
797 
798 	P_FLAG(APPEND);
799 	P_FLAG(ASYNC);
800 	P_FLAG(CLOEXEC);
801 	P_FLAG(CREAT);
802 	P_FLAG(DIRECT);
803 	P_FLAG(DIRECTORY);
804 	P_FLAG(EXCL);
805 	P_FLAG(LARGEFILE);
806 	P_FLAG(NOATIME);
807 	P_FLAG(NOCTTY);
808 #ifdef O_NONBLOCK
809 	P_FLAG(NONBLOCK);
810 #elif O_NDELAY
811 	P_FLAG(NDELAY);
812 #endif
813 #ifdef O_PATH
814 	P_FLAG(PATH);
815 #endif
816 	P_FLAG(RDWR);
817 #ifdef O_DSYNC
818 	if ((flags & O_SYNC) == O_SYNC)
819 		printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", "SYNC");
820 	else {
821 		P_FLAG(DSYNC);
822 	}
823 #else
824 	P_FLAG(SYNC);
825 #endif
826 	P_FLAG(TRUNC);
827 	P_FLAG(WRONLY);
828 #undef P_FLAG
829 
830 	if (flags)
831 		printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);
832 
833 	return printed;
834 }
835 
836 #define SCA_OPEN_FLAGS syscall_arg__scnprintf_open_flags
837 
838 static size_t syscall_arg__scnprintf_perf_flags(char *bf, size_t size,
839 						struct syscall_arg *arg)
840 {
841 	int printed = 0, flags = arg->val;
842 
843 	if (flags == 0)
844 		return 0;
845 
846 #define	P_FLAG(n) \
847 	if (flags & PERF_FLAG_##n) { \
848 		printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
849 		flags &= ~PERF_FLAG_##n; \
850 	}
851 
852 	P_FLAG(FD_NO_GROUP);
853 	P_FLAG(FD_OUTPUT);
854 	P_FLAG(PID_CGROUP);
855 	P_FLAG(FD_CLOEXEC);
856 #undef P_FLAG
857 
858 	if (flags)
859 		printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);
860 
861 	return printed;
862 }
863 
864 #define SCA_PERF_FLAGS syscall_arg__scnprintf_perf_flags
865 
866 static size_t syscall_arg__scnprintf_eventfd_flags(char *bf, size_t size,
867 						   struct syscall_arg *arg)
868 {
869 	int printed = 0, flags = arg->val;
870 
871 	if (flags == 0)
872 		return scnprintf(bf, size, "NONE");
873 #define	P_FLAG(n) \
874 	if (flags & EFD_##n) { \
875 		printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
876 		flags &= ~EFD_##n; \
877 	}
878 
879 	P_FLAG(SEMAPHORE);
880 	P_FLAG(CLOEXEC);
881 	P_FLAG(NONBLOCK);
882 #undef P_FLAG
883 
884 	if (flags)
885 		printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);
886 
887 	return printed;
888 }
889 
890 #define SCA_EFD_FLAGS syscall_arg__scnprintf_eventfd_flags
891 
892 static size_t syscall_arg__scnprintf_pipe_flags(char *bf, size_t size,
893 						struct syscall_arg *arg)
894 {
895 	int printed = 0, flags = arg->val;
896 
897 #define	P_FLAG(n) \
898 	if (flags & O_##n) { \
899 		printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
900 		flags &= ~O_##n; \
901 	}
902 
903 	P_FLAG(CLOEXEC);
904 	P_FLAG(NONBLOCK);
905 #undef P_FLAG
906 
907 	if (flags)
908 		printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);
909 
910 	return printed;
911 }
912 
913 #define SCA_PIPE_FLAGS syscall_arg__scnprintf_pipe_flags
914 
915 static size_t syscall_arg__scnprintf_signum(char *bf, size_t size, struct syscall_arg *arg)
916 {
917 	int sig = arg->val;
918 
919 	switch (sig) {
920 #define	P_SIGNUM(n) case SIG##n: return scnprintf(bf, size, #n)
921 	P_SIGNUM(HUP);
922 	P_SIGNUM(INT);
923 	P_SIGNUM(QUIT);
924 	P_SIGNUM(ILL);
925 	P_SIGNUM(TRAP);
926 	P_SIGNUM(ABRT);
927 	P_SIGNUM(BUS);
928 	P_SIGNUM(FPE);
929 	P_SIGNUM(KILL);
930 	P_SIGNUM(USR1);
931 	P_SIGNUM(SEGV);
932 	P_SIGNUM(USR2);
933 	P_SIGNUM(PIPE);
934 	P_SIGNUM(ALRM);
935 	P_SIGNUM(TERM);
936 	P_SIGNUM(CHLD);
937 	P_SIGNUM(CONT);
938 	P_SIGNUM(STOP);
939 	P_SIGNUM(TSTP);
940 	P_SIGNUM(TTIN);
941 	P_SIGNUM(TTOU);
942 	P_SIGNUM(URG);
943 	P_SIGNUM(XCPU);
944 	P_SIGNUM(XFSZ);
945 	P_SIGNUM(VTALRM);
946 	P_SIGNUM(PROF);
947 	P_SIGNUM(WINCH);
948 	P_SIGNUM(IO);
949 	P_SIGNUM(PWR);
950 	P_SIGNUM(SYS);
951 #ifdef SIGEMT
952 	P_SIGNUM(EMT);
953 #endif
954 #ifdef SIGSTKFLT
955 	P_SIGNUM(STKFLT);
956 #endif
957 #ifdef SIGSWI
958 	P_SIGNUM(SWI);
959 #endif
960 	default: break;
961 	}
962 
963 	return scnprintf(bf, size, "%#x", sig);
964 }
965 
966 #define SCA_SIGNUM syscall_arg__scnprintf_signum
967 
968 #if defined(__i386__) || defined(__x86_64__)
969 /*
970  * FIXME: Make this available to all arches.
971  */
972 #define TCGETS		0x5401
973 
974 static const char *tioctls[] = {
975 	"TCGETS", "TCSETS", "TCSETSW", "TCSETSF", "TCGETA", "TCSETA", "TCSETAW",
976 	"TCSETAF", "TCSBRK", "TCXONC", "TCFLSH", "TIOCEXCL", "TIOCNXCL",
977 	"TIOCSCTTY", "TIOCGPGRP", "TIOCSPGRP", "TIOCOUTQ", "TIOCSTI",
978 	"TIOCGWINSZ", "TIOCSWINSZ", "TIOCMGET", "TIOCMBIS", "TIOCMBIC",
979 	"TIOCMSET", "TIOCGSOFTCAR", "TIOCSSOFTCAR", "FIONREAD", "TIOCLINUX",
980 	"TIOCCONS", "TIOCGSERIAL", "TIOCSSERIAL", "TIOCPKT", "FIONBIO",
981 	"TIOCNOTTY", "TIOCSETD", "TIOCGETD", "TCSBRKP", [0x27] = "TIOCSBRK",
982 	"TIOCCBRK", "TIOCGSID", "TCGETS2", "TCSETS2", "TCSETSW2", "TCSETSF2",
983 	"TIOCGRS485", "TIOCSRS485", "TIOCGPTN", "TIOCSPTLCK",
984 	"TIOCGDEV||TCGETX", "TCSETX", "TCSETXF", "TCSETXW", "TIOCSIG",
985 	"TIOCVHANGUP", "TIOCGPKT", "TIOCGPTLCK", "TIOCGEXCL",
986 	[0x50] = "FIONCLEX", "FIOCLEX", "FIOASYNC", "TIOCSERCONFIG",
987 	"TIOCSERGWILD", "TIOCSERSWILD", "TIOCGLCKTRMIOS", "TIOCSLCKTRMIOS",
988 	"TIOCSERGSTRUCT", "TIOCSERGETLSR", "TIOCSERGETMULTI", "TIOCSERSETMULTI",
989 	"TIOCMIWAIT", "TIOCGICOUNT", [0x60] = "FIOQSIZE",
990 };
991 
992 static DEFINE_STRARRAY_OFFSET(tioctls, 0x5401);
993 #endif /* defined(__i386__) || defined(__x86_64__) */
994 
/*
 * Shorthand for a syscall_fmt initializer: format argument number 'arg'
 * with the strarray beautifier, using the table 'strarray__<array>'.
 * The 'name' parameter is not used by the expansion; call sites pass the
 * argument's name there, so it only serves as documentation.
 */
#define STRARRAY(arg, name, array) \
	  .arg_scnprintf = { [arg] = SCA_STRARRAY, }, \
	  .arg_parm	 = { [arg] = &strarray__##array, }
998 
999 static struct syscall_fmt {
1000 	const char *name;
1001 	const char *alias;
1002 	size_t	   (*arg_scnprintf[6])(char *bf, size_t size, struct syscall_arg *arg);
1003 	void	   *arg_parm[6];
1004 	bool	   errmsg;
1005 	bool	   timeout;
1006 	bool	   hexret;
1007 } syscall_fmts[] = {
1008 	{ .name	    = "access",	    .errmsg = true,
1009 	  .arg_scnprintf = { [1] = SCA_ACCMODE, /* mode */ }, },
1010 	{ .name	    = "arch_prctl", .errmsg = true, .alias = "prctl", },
1011 	{ .name	    = "brk",	    .hexret = true,
1012 	  .arg_scnprintf = { [0] = SCA_HEX, /* brk */ }, },
1013 	{ .name     = "clock_gettime",  .errmsg = true, STRARRAY(0, clk_id, clockid), },
1014 	{ .name	    = "close",	    .errmsg = true,
1015 	  .arg_scnprintf = { [0] = SCA_CLOSE_FD, /* fd */ }, },
1016 	{ .name	    = "connect",    .errmsg = true, },
1017 	{ .name	    = "dup",	    .errmsg = true,
1018 	  .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1019 	{ .name	    = "dup2",	    .errmsg = true,
1020 	  .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1021 	{ .name	    = "dup3",	    .errmsg = true,
1022 	  .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1023 	{ .name	    = "epoll_ctl",  .errmsg = true, STRARRAY(1, op, epoll_ctl_ops), },
1024 	{ .name	    = "eventfd2",   .errmsg = true,
1025 	  .arg_scnprintf = { [1] = SCA_EFD_FLAGS, /* flags */ }, },
1026 	{ .name	    = "faccessat",  .errmsg = true,
1027 	  .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
1028 	{ .name	    = "fadvise64",  .errmsg = true,
1029 	  .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1030 	{ .name	    = "fallocate",  .errmsg = true,
1031 	  .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1032 	{ .name	    = "fchdir",	    .errmsg = true,
1033 	  .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1034 	{ .name	    = "fchmod",	    .errmsg = true,
1035 	  .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1036 	{ .name	    = "fchmodat",   .errmsg = true,
1037 	  .arg_scnprintf = { [0] = SCA_FDAT, /* fd */ }, },
1038 	{ .name	    = "fchown",	    .errmsg = true,
1039 	  .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1040 	{ .name	    = "fchownat",   .errmsg = true,
1041 	  .arg_scnprintf = { [0] = SCA_FDAT, /* fd */ }, },
1042 	{ .name	    = "fcntl",	    .errmsg = true,
1043 	  .arg_scnprintf = { [0] = SCA_FD, /* fd */
1044 			     [1] = SCA_STRARRAY, /* cmd */ },
1045 	  .arg_parm	 = { [1] = &strarray__fcntl_cmds, /* cmd */ }, },
1046 	{ .name	    = "fdatasync",  .errmsg = true,
1047 	  .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1048 	{ .name	    = "flock",	    .errmsg = true,
1049 	  .arg_scnprintf = { [0] = SCA_FD, /* fd */
1050 			     [1] = SCA_FLOCK, /* cmd */ }, },
1051 	{ .name	    = "fsetxattr",  .errmsg = true,
1052 	  .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1053 	{ .name	    = "fstat",	    .errmsg = true, .alias = "newfstat",
1054 	  .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1055 	{ .name	    = "fstatat",    .errmsg = true, .alias = "newfstatat",
1056 	  .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
1057 	{ .name	    = "fstatfs",    .errmsg = true,
1058 	  .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1059 	{ .name	    = "fsync",    .errmsg = true,
1060 	  .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1061 	{ .name	    = "ftruncate", .errmsg = true,
1062 	  .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1063 	{ .name	    = "futex",	    .errmsg = true,
1064 	  .arg_scnprintf = { [1] = SCA_FUTEX_OP, /* op */ }, },
1065 	{ .name	    = "futimesat", .errmsg = true,
1066 	  .arg_scnprintf = { [0] = SCA_FDAT, /* fd */ }, },
1067 	{ .name	    = "getdents",   .errmsg = true,
1068 	  .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1069 	{ .name	    = "getdents64", .errmsg = true,
1070 	  .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1071 	{ .name	    = "getitimer",  .errmsg = true, STRARRAY(0, which, itimers), },
1072 	{ .name	    = "getrlimit",  .errmsg = true, STRARRAY(0, resource, rlimit_resources), },
1073 	{ .name	    = "ioctl",	    .errmsg = true,
1074 	  .arg_scnprintf = { [0] = SCA_FD, /* fd */
1075 #if defined(__i386__) || defined(__x86_64__)
1076 /*
1077  * FIXME: Make this available to all arches.
1078  */
1079 			     [1] = SCA_STRHEXARRAY, /* cmd */
1080 			     [2] = SCA_HEX, /* arg */ },
1081 	  .arg_parm	 = { [1] = &strarray__tioctls, /* cmd */ }, },
1082 #else
1083 			     [2] = SCA_HEX, /* arg */ }, },
1084 #endif
1085 	{ .name	    = "kill",	    .errmsg = true,
1086 	  .arg_scnprintf = { [1] = SCA_SIGNUM, /* sig */ }, },
1087 	{ .name	    = "linkat",	    .errmsg = true,
1088 	  .arg_scnprintf = { [0] = SCA_FDAT, /* fd */ }, },
1089 	{ .name	    = "lseek",	    .errmsg = true,
1090 	  .arg_scnprintf = { [0] = SCA_FD, /* fd */
1091 			     [2] = SCA_STRARRAY, /* whence */ },
1092 	  .arg_parm	 = { [2] = &strarray__whences, /* whence */ }, },
1093 	{ .name	    = "lstat",	    .errmsg = true, .alias = "newlstat", },
1094 	{ .name     = "madvise",    .errmsg = true,
1095 	  .arg_scnprintf = { [0] = SCA_HEX,	 /* start */
1096 			     [2] = SCA_MADV_BHV, /* behavior */ }, },
1097 	{ .name	    = "mkdirat",    .errmsg = true,
1098 	  .arg_scnprintf = { [0] = SCA_FDAT, /* fd */ }, },
1099 	{ .name	    = "mknodat",    .errmsg = true,
1100 	  .arg_scnprintf = { [0] = SCA_FDAT, /* fd */ }, },
1101 	{ .name	    = "mlock",	    .errmsg = true,
1102 	  .arg_scnprintf = { [0] = SCA_HEX, /* addr */ }, },
1103 	{ .name	    = "mlockall",   .errmsg = true,
1104 	  .arg_scnprintf = { [0] = SCA_HEX, /* addr */ }, },
1105 	{ .name	    = "mmap",	    .hexret = true,
1106 	  .arg_scnprintf = { [0] = SCA_HEX,	  /* addr */
1107 			     [2] = SCA_MMAP_PROT, /* prot */
1108 			     [3] = SCA_MMAP_FLAGS, /* flags */
1109 			     [4] = SCA_FD, 	  /* fd */ }, },
1110 	{ .name	    = "mprotect",   .errmsg = true,
1111 	  .arg_scnprintf = { [0] = SCA_HEX, /* start */
1112 			     [2] = SCA_MMAP_PROT, /* prot */ }, },
1113 	{ .name	    = "mremap",	    .hexret = true,
1114 	  .arg_scnprintf = { [0] = SCA_HEX, /* addr */
1115 			     [3] = SCA_MREMAP_FLAGS, /* flags */
1116 			     [4] = SCA_HEX, /* new_addr */ }, },
1117 	{ .name	    = "munlock",    .errmsg = true,
1118 	  .arg_scnprintf = { [0] = SCA_HEX, /* addr */ }, },
1119 	{ .name	    = "munmap",	    .errmsg = true,
1120 	  .arg_scnprintf = { [0] = SCA_HEX, /* addr */ }, },
1121 	{ .name	    = "name_to_handle_at", .errmsg = true,
1122 	  .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
1123 	{ .name	    = "newfstatat", .errmsg = true,
1124 	  .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
1125 	{ .name	    = "open",	    .errmsg = true,
1126 	  .arg_scnprintf = { [1] = SCA_OPEN_FLAGS, /* flags */ }, },
1127 	{ .name	    = "open_by_handle_at", .errmsg = true,
1128 	  .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */
1129 			     [2] = SCA_OPEN_FLAGS, /* flags */ }, },
1130 	{ .name	    = "openat",	    .errmsg = true,
1131 	  .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */
1132 			     [2] = SCA_OPEN_FLAGS, /* flags */ }, },
1133 	{ .name	    = "perf_event_open", .errmsg = true,
1134 	  .arg_scnprintf = { [1] = SCA_INT, /* pid */
1135 			     [2] = SCA_INT, /* cpu */
1136 			     [3] = SCA_FD,  /* group_fd */
1137 			     [4] = SCA_PERF_FLAGS,  /* flags */ }, },
1138 	{ .name	    = "pipe2",	    .errmsg = true,
1139 	  .arg_scnprintf = { [1] = SCA_PIPE_FLAGS, /* flags */ }, },
1140 	{ .name	    = "poll",	    .errmsg = true, .timeout = true, },
1141 	{ .name	    = "ppoll",	    .errmsg = true, .timeout = true, },
1142 	{ .name	    = "pread",	    .errmsg = true, .alias = "pread64",
1143 	  .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1144 	{ .name	    = "preadv",	    .errmsg = true, .alias = "pread",
1145 	  .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1146 	{ .name	    = "prlimit64",  .errmsg = true, STRARRAY(1, resource, rlimit_resources), },
1147 	{ .name	    = "pwrite",	    .errmsg = true, .alias = "pwrite64",
1148 	  .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1149 	{ .name	    = "pwritev",    .errmsg = true,
1150 	  .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1151 	{ .name	    = "read",	    .errmsg = true,
1152 	  .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1153 	{ .name	    = "readlinkat", .errmsg = true,
1154 	  .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
1155 	{ .name	    = "readv",	    .errmsg = true,
1156 	  .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1157 	{ .name	    = "recvfrom",   .errmsg = true,
1158 	  .arg_scnprintf = { [3] = SCA_MSG_FLAGS, /* flags */ }, },
1159 	{ .name	    = "recvmmsg",   .errmsg = true,
1160 	  .arg_scnprintf = { [3] = SCA_MSG_FLAGS, /* flags */ }, },
1161 	{ .name	    = "recvmsg",    .errmsg = true,
1162 	  .arg_scnprintf = { [2] = SCA_MSG_FLAGS, /* flags */ }, },
1163 	{ .name	    = "renameat",   .errmsg = true,
1164 	  .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
1165 	{ .name	    = "rt_sigaction", .errmsg = true,
1166 	  .arg_scnprintf = { [0] = SCA_SIGNUM, /* sig */ }, },
1167 	{ .name	    = "rt_sigprocmask",  .errmsg = true, STRARRAY(0, how, sighow), },
1168 	{ .name	    = "rt_sigqueueinfo", .errmsg = true,
1169 	  .arg_scnprintf = { [1] = SCA_SIGNUM, /* sig */ }, },
1170 	{ .name	    = "rt_tgsigqueueinfo", .errmsg = true,
1171 	  .arg_scnprintf = { [2] = SCA_SIGNUM, /* sig */ }, },
1172 	{ .name	    = "select",	    .errmsg = true, .timeout = true, },
1173 	{ .name	    = "sendmmsg",    .errmsg = true,
1174 	  .arg_scnprintf = { [3] = SCA_MSG_FLAGS, /* flags */ }, },
1175 	{ .name	    = "sendmsg",    .errmsg = true,
1176 	  .arg_scnprintf = { [2] = SCA_MSG_FLAGS, /* flags */ }, },
1177 	{ .name	    = "sendto",	    .errmsg = true,
1178 	  .arg_scnprintf = { [3] = SCA_MSG_FLAGS, /* flags */ }, },
1179 	{ .name	    = "setitimer",  .errmsg = true, STRARRAY(0, which, itimers), },
1180 	{ .name	    = "setrlimit",  .errmsg = true, STRARRAY(0, resource, rlimit_resources), },
1181 	{ .name	    = "shutdown",   .errmsg = true,
1182 	  .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1183 	{ .name	    = "socket",	    .errmsg = true,
1184 	  .arg_scnprintf = { [0] = SCA_STRARRAY, /* family */
1185 			     [1] = SCA_SK_TYPE, /* type */ },
1186 	  .arg_parm	 = { [0] = &strarray__socket_families, /* family */ }, },
1187 	{ .name	    = "socketpair", .errmsg = true,
1188 	  .arg_scnprintf = { [0] = SCA_STRARRAY, /* family */
1189 			     [1] = SCA_SK_TYPE, /* type */ },
1190 	  .arg_parm	 = { [0] = &strarray__socket_families, /* family */ }, },
1191 	{ .name	    = "stat",	    .errmsg = true, .alias = "newstat", },
1192 	{ .name	    = "symlinkat",  .errmsg = true,
1193 	  .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
1194 	{ .name	    = "tgkill",	    .errmsg = true,
1195 	  .arg_scnprintf = { [2] = SCA_SIGNUM, /* sig */ }, },
1196 	{ .name	    = "tkill",	    .errmsg = true,
1197 	  .arg_scnprintf = { [1] = SCA_SIGNUM, /* sig */ }, },
1198 	{ .name	    = "uname",	    .errmsg = true, .alias = "newuname", },
1199 	{ .name	    = "unlinkat",   .errmsg = true,
1200 	  .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
1201 	{ .name	    = "utimensat",  .errmsg = true,
1202 	  .arg_scnprintf = { [0] = SCA_FDAT, /* dirfd */ }, },
1203 	{ .name	    = "write",	    .errmsg = true,
1204 	  .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1205 	{ .name	    = "writev",	    .errmsg = true,
1206 	  .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1207 };
1208 
1209 static int syscall_fmt__cmp(const void *name, const void *fmtp)
1210 {
1211 	const struct syscall_fmt *fmt = fmtp;
1212 	return strcmp(name, fmt->name);
1213 }
1214 
1215 static struct syscall_fmt *syscall_fmt__find(const char *name)
1216 {
1217 	const int nmemb = ARRAY_SIZE(syscall_fmts);
1218 	return bsearch(name, syscall_fmts, nmemb, sizeof(struct syscall_fmt), syscall_fmt__cmp);
1219 }
1220 
/*
 * Per syscall information, one entry per syscall id in trace->syscalls.table.
 * Filled in lazily by trace__read_syscall_info().
 */
struct syscall {
	/* "syscalls:sys_enter_<name>" (or alias) tracepoint format */
	struct event_format *tp_format;
	/* number of tracepoint fields, not counting a leading "nr" field */
	int		    nr_args;
	/* first argument field of tp_format, NULL if the format is unknown */
	struct format_field *args;
	/* name as returned by audit_syscall_to_name() */
	const char	    *name;
	/* excluded by the event qualifier list, handlers ignore it */
	bool		    filtered;
	/* name is "exit" or "exit_group" */
	bool		    is_exit;
	/* matching syscall_fmts entry, NULL when there is none */
	struct syscall_fmt  *fmt;
	/* per argument formatter, nr_args entries, NULL entry means default */
	size_t		    (**arg_scnprintf)(char *bf, size_t size, struct syscall_arg *arg);
	/* per argument formatter parameter, taken from fmt->arg_parm */
	void		    **arg_parm;
};
1232 
1233 static size_t fprintf_duration(unsigned long t, FILE *fp)
1234 {
1235 	double duration = (double)t / NSEC_PER_MSEC;
1236 	size_t printed = fprintf(fp, "(");
1237 
1238 	if (duration >= 1.0)
1239 		printed += color_fprintf(fp, PERF_COLOR_RED, "%6.3f ms", duration);
1240 	else if (duration >= 0.01)
1241 		printed += color_fprintf(fp, PERF_COLOR_YELLOW, "%6.3f ms", duration);
1242 	else
1243 		printed += color_fprintf(fp, PERF_COLOR_NORMAL, "%6.3f ms", duration);
1244 	return printed + fprintf(fp, "): ");
1245 }
1246 
1247 struct thread_trace {
1248 	u64		  entry_time;
1249 	u64		  exit_time;
1250 	bool		  entry_pending;
1251 	unsigned long	  nr_events;
1252 	unsigned long	  pfmaj, pfmin;
1253 	char		  *entry_str;
1254 	double		  runtime_ms;
1255 	struct {
1256 		int	  max;
1257 		char	  **table;
1258 	} paths;
1259 
1260 	struct intlist *syscall_stats;
1261 };
1262 
1263 static struct thread_trace *thread_trace__new(void)
1264 {
1265 	struct thread_trace *ttrace =  zalloc(sizeof(struct thread_trace));
1266 
1267 	if (ttrace)
1268 		ttrace->paths.max = -1;
1269 
1270 	ttrace->syscall_stats = intlist__new(NULL);
1271 
1272 	return ttrace;
1273 }
1274 
1275 static struct thread_trace *thread__trace(struct thread *thread, FILE *fp)
1276 {
1277 	struct thread_trace *ttrace;
1278 
1279 	if (thread == NULL)
1280 		goto fail;
1281 
1282 	if (thread__priv(thread) == NULL)
1283 		thread__set_priv(thread, thread_trace__new());
1284 
1285 	if (thread__priv(thread) == NULL)
1286 		goto fail;
1287 
1288 	ttrace = thread__priv(thread);
1289 	++ttrace->nr_events;
1290 
1291 	return ttrace;
1292 fail:
1293 	color_fprintf(fp, PERF_COLOR_RED,
1294 		      "WARNING: not enough memory, dropping samples!\n");
1295 	return NULL;
1296 }
1297 
1298 #define TRACE_PFMAJ		(1 << 0)
1299 #define TRACE_PFMIN		(1 << 1)
1300 
1301 struct trace {
1302 	struct perf_tool	tool;
1303 	struct {
1304 		int		machine;
1305 		int		open_id;
1306 	}			audit;
1307 	struct {
1308 		int		max;
1309 		struct syscall  *table;
1310 	} syscalls;
1311 	struct record_opts	opts;
1312 	struct perf_evlist	*evlist;
1313 	struct machine		*host;
1314 	struct thread		*current;
1315 	u64			base_time;
1316 	FILE			*output;
1317 	unsigned long		nr_events;
1318 	struct strlist		*ev_qualifier;
1319 	const char 		*last_vfs_getname;
1320 	struct intlist		*tid_list;
1321 	struct intlist		*pid_list;
1322 	struct {
1323 		size_t		nr;
1324 		pid_t		*entries;
1325 	}			filter_pids;
1326 	double			duration_filter;
1327 	double			runtime_ms;
1328 	struct {
1329 		u64		vfs_getname,
1330 				proc_getname;
1331 	} stats;
1332 	bool			not_ev_qualifier;
1333 	bool			live;
1334 	bool			full_time;
1335 	bool			sched;
1336 	bool			multiple_threads;
1337 	bool			summary;
1338 	bool			summary_only;
1339 	bool			show_comm;
1340 	bool			show_tool_stats;
1341 	bool			trace_syscalls;
1342 	bool			force;
1343 	int			trace_pgfaults;
1344 };
1345 
1346 static int trace__set_fd_pathname(struct thread *thread, int fd, const char *pathname)
1347 {
1348 	struct thread_trace *ttrace = thread__priv(thread);
1349 
1350 	if (fd > ttrace->paths.max) {
1351 		char **npath = realloc(ttrace->paths.table, (fd + 1) * sizeof(char *));
1352 
1353 		if (npath == NULL)
1354 			return -1;
1355 
1356 		if (ttrace->paths.max != -1) {
1357 			memset(npath + ttrace->paths.max + 1, 0,
1358 			       (fd - ttrace->paths.max) * sizeof(char *));
1359 		} else {
1360 			memset(npath, 0, (fd + 1) * sizeof(char *));
1361 		}
1362 
1363 		ttrace->paths.table = npath;
1364 		ttrace->paths.max   = fd;
1365 	}
1366 
1367 	ttrace->paths.table[fd] = strdup(pathname);
1368 
1369 	return ttrace->paths.table[fd] != NULL ? 0 : -1;
1370 }
1371 
1372 static int thread__read_fd_path(struct thread *thread, int fd)
1373 {
1374 	char linkname[PATH_MAX], pathname[PATH_MAX];
1375 	struct stat st;
1376 	int ret;
1377 
1378 	if (thread->pid_ == thread->tid) {
1379 		scnprintf(linkname, sizeof(linkname),
1380 			  "/proc/%d/fd/%d", thread->pid_, fd);
1381 	} else {
1382 		scnprintf(linkname, sizeof(linkname),
1383 			  "/proc/%d/task/%d/fd/%d", thread->pid_, thread->tid, fd);
1384 	}
1385 
1386 	if (lstat(linkname, &st) < 0 || st.st_size + 1 > (off_t)sizeof(pathname))
1387 		return -1;
1388 
1389 	ret = readlink(linkname, pathname, sizeof(pathname));
1390 
1391 	if (ret < 0 || ret > st.st_size)
1392 		return -1;
1393 
1394 	pathname[ret] = '\0';
1395 	return trace__set_fd_pathname(thread, fd, pathname);
1396 }
1397 
1398 static const char *thread__fd_path(struct thread *thread, int fd,
1399 				   struct trace *trace)
1400 {
1401 	struct thread_trace *ttrace = thread__priv(thread);
1402 
1403 	if (ttrace == NULL)
1404 		return NULL;
1405 
1406 	if (fd < 0)
1407 		return NULL;
1408 
1409 	if ((fd > ttrace->paths.max || ttrace->paths.table[fd] == NULL)) {
1410 		if (!trace->live)
1411 			return NULL;
1412 		++trace->stats.proc_getname;
1413 		if (thread__read_fd_path(thread, fd))
1414 			return NULL;
1415 	}
1416 
1417 	return ttrace->paths.table[fd];
1418 }
1419 
1420 static size_t syscall_arg__scnprintf_fd(char *bf, size_t size,
1421 					struct syscall_arg *arg)
1422 {
1423 	int fd = arg->val;
1424 	size_t printed = scnprintf(bf, size, "%d", fd);
1425 	const char *path = thread__fd_path(arg->thread, fd, arg->trace);
1426 
1427 	if (path)
1428 		printed += scnprintf(bf + printed, size - printed, "<%s>", path);
1429 
1430 	return printed;
1431 }
1432 
1433 static size_t syscall_arg__scnprintf_close_fd(char *bf, size_t size,
1434 					      struct syscall_arg *arg)
1435 {
1436 	int fd = arg->val;
1437 	size_t printed = syscall_arg__scnprintf_fd(bf, size, arg);
1438 	struct thread_trace *ttrace = thread__priv(arg->thread);
1439 
1440 	if (ttrace && fd >= 0 && fd <= ttrace->paths.max)
1441 		zfree(&ttrace->paths.table[fd]);
1442 
1443 	return printed;
1444 }
1445 
1446 static bool trace__filter_duration(struct trace *trace, double t)
1447 {
1448 	return t < (trace->duration_filter * NSEC_PER_MSEC);
1449 }
1450 
1451 static size_t trace__fprintf_tstamp(struct trace *trace, u64 tstamp, FILE *fp)
1452 {
1453 	double ts = (double)(tstamp - trace->base_time) / NSEC_PER_MSEC;
1454 
1455 	return fprintf(fp, "%10.3f ", ts);
1456 }
1457 
/*
 * Flags written from sig_handler(). They are volatile sig_atomic_t because
 * that is the only object type the C standard allows a signal handler to
 * store to while the interrupted code reads it.
 */
static volatile sig_atomic_t done = false;
static volatile sig_atomic_t interrupted = false;

/*
 * Signal handler: request termination and record whether the signal
 * was SIGINT.
 */
static void sig_handler(int sig)
{
	done = true;
	interrupted = sig == SIGINT;
}
1466 
1467 static size_t trace__fprintf_entry_head(struct trace *trace, struct thread *thread,
1468 					u64 duration, u64 tstamp, FILE *fp)
1469 {
1470 	size_t printed = trace__fprintf_tstamp(trace, tstamp, fp);
1471 	printed += fprintf_duration(duration, fp);
1472 
1473 	if (trace->multiple_threads) {
1474 		if (trace->show_comm)
1475 			printed += fprintf(fp, "%.14s/", thread__comm_str(thread));
1476 		printed += fprintf(fp, "%d ", thread->tid);
1477 	}
1478 
1479 	return printed;
1480 }
1481 
1482 static int trace__process_event(struct trace *trace, struct machine *machine,
1483 				union perf_event *event, struct perf_sample *sample)
1484 {
1485 	int ret = 0;
1486 
1487 	switch (event->header.type) {
1488 	case PERF_RECORD_LOST:
1489 		color_fprintf(trace->output, PERF_COLOR_RED,
1490 			      "LOST %" PRIu64 " events!\n", event->lost.lost);
1491 		ret = machine__process_lost_event(machine, event, sample);
1492 	default:
1493 		ret = machine__process_event(machine, event, sample);
1494 		break;
1495 	}
1496 
1497 	return ret;
1498 }
1499 
1500 static int trace__tool_process(struct perf_tool *tool,
1501 			       union perf_event *event,
1502 			       struct perf_sample *sample,
1503 			       struct machine *machine)
1504 {
1505 	struct trace *trace = container_of(tool, struct trace, tool);
1506 	return trace__process_event(trace, machine, event, sample);
1507 }
1508 
1509 static int trace__symbols_init(struct trace *trace, struct perf_evlist *evlist)
1510 {
1511 	int err = symbol__init(NULL);
1512 
1513 	if (err)
1514 		return err;
1515 
1516 	trace->host = machine__new_host();
1517 	if (trace->host == NULL)
1518 		return -ENOMEM;
1519 
1520 	err = __machine__synthesize_threads(trace->host, &trace->tool, &trace->opts.target,
1521 					    evlist->threads, trace__tool_process, false,
1522 					    trace->opts.proc_map_timeout);
1523 	if (err)
1524 		symbol__exit();
1525 
1526 	return err;
1527 }
1528 
1529 static int syscall__set_arg_fmts(struct syscall *sc)
1530 {
1531 	struct format_field *field;
1532 	int idx = 0;
1533 
1534 	sc->arg_scnprintf = calloc(sc->nr_args, sizeof(void *));
1535 	if (sc->arg_scnprintf == NULL)
1536 		return -1;
1537 
1538 	if (sc->fmt)
1539 		sc->arg_parm = sc->fmt->arg_parm;
1540 
1541 	for (field = sc->args; field; field = field->next) {
1542 		if (sc->fmt && sc->fmt->arg_scnprintf[idx])
1543 			sc->arg_scnprintf[idx] = sc->fmt->arg_scnprintf[idx];
1544 		else if (field->flags & FIELD_IS_POINTER)
1545 			sc->arg_scnprintf[idx] = syscall_arg__scnprintf_hex;
1546 		++idx;
1547 	}
1548 
1549 	return 0;
1550 }
1551 
1552 static int trace__read_syscall_info(struct trace *trace, int id)
1553 {
1554 	char tp_name[128];
1555 	struct syscall *sc;
1556 	const char *name = audit_syscall_to_name(id, trace->audit.machine);
1557 
1558 	if (name == NULL)
1559 		return -1;
1560 
1561 	if (id > trace->syscalls.max) {
1562 		struct syscall *nsyscalls = realloc(trace->syscalls.table, (id + 1) * sizeof(*sc));
1563 
1564 		if (nsyscalls == NULL)
1565 			return -1;
1566 
1567 		if (trace->syscalls.max != -1) {
1568 			memset(nsyscalls + trace->syscalls.max + 1, 0,
1569 			       (id - trace->syscalls.max) * sizeof(*sc));
1570 		} else {
1571 			memset(nsyscalls, 0, (id + 1) * sizeof(*sc));
1572 		}
1573 
1574 		trace->syscalls.table = nsyscalls;
1575 		trace->syscalls.max   = id;
1576 	}
1577 
1578 	sc = trace->syscalls.table + id;
1579 	sc->name = name;
1580 
1581 	if (trace->ev_qualifier) {
1582 		bool in = strlist__find(trace->ev_qualifier, name) != NULL;
1583 
1584 		if (!(in ^ trace->not_ev_qualifier)) {
1585 			sc->filtered = true;
1586 			/*
1587 			 * No need to do read tracepoint information since this will be
1588 			 * filtered out.
1589 			 */
1590 			return 0;
1591 		}
1592 	}
1593 
1594 	sc->fmt  = syscall_fmt__find(sc->name);
1595 
1596 	snprintf(tp_name, sizeof(tp_name), "sys_enter_%s", sc->name);
1597 	sc->tp_format = trace_event__tp_format("syscalls", tp_name);
1598 
1599 	if (sc->tp_format == NULL && sc->fmt && sc->fmt->alias) {
1600 		snprintf(tp_name, sizeof(tp_name), "sys_enter_%s", sc->fmt->alias);
1601 		sc->tp_format = trace_event__tp_format("syscalls", tp_name);
1602 	}
1603 
1604 	if (sc->tp_format == NULL)
1605 		return -1;
1606 
1607 	sc->args = sc->tp_format->format.fields;
1608 	sc->nr_args = sc->tp_format->format.nr_fields;
1609 	/* drop nr field - not relevant here; does not exist on older kernels */
1610 	if (sc->args && strcmp(sc->args->name, "nr") == 0) {
1611 		sc->args = sc->args->next;
1612 		--sc->nr_args;
1613 	}
1614 
1615 	sc->is_exit = !strcmp(name, "exit_group") || !strcmp(name, "exit");
1616 
1617 	return syscall__set_arg_fmts(sc);
1618 }
1619 
1620 static int trace__validate_ev_qualifier(struct trace *trace)
1621 {
1622 	int err = 0;
1623 	struct str_node *pos;
1624 
1625 	strlist__for_each(pos, trace->ev_qualifier) {
1626 		const char *sc = pos->s;
1627 
1628 		if (audit_name_to_syscall(sc, trace->audit.machine) < 0) {
1629 			if (err == 0) {
1630 				fputs("Error:\tInvalid syscall ", trace->output);
1631 				err = -EINVAL;
1632 			} else {
1633 				fputs(", ", trace->output);
1634 			}
1635 
1636 			fputs(sc, trace->output);
1637 		}
1638 	}
1639 
1640 	if (err < 0) {
1641 		fputs("\nHint:\ttry 'perf list syscalls:sys_enter_*'"
1642 		      "\nHint:\tand: 'man syscalls'\n", trace->output);
1643 	}
1644 
1645 	return err;
1646 }
1647 
1648 /*
1649  * args is to be interpreted as a series of longs but we need to handle
1650  * 8-byte unaligned accesses. args points to raw_data within the event
1651  * and raw_data is guaranteed to be 8-byte unaligned because it is
1652  * preceded by raw_size which is a u32. So we need to copy args to a temp
1653  * variable to read it. Most notably this avoids extended load instructions
1654  * on unaligned addresses
1655  */
1656 
1657 static size_t syscall__scnprintf_args(struct syscall *sc, char *bf, size_t size,
1658 				      unsigned char *args, struct trace *trace,
1659 				      struct thread *thread)
1660 {
1661 	size_t printed = 0;
1662 	unsigned char *p;
1663 	unsigned long val;
1664 
1665 	if (sc->args != NULL) {
1666 		struct format_field *field;
1667 		u8 bit = 1;
1668 		struct syscall_arg arg = {
1669 			.idx	= 0,
1670 			.mask	= 0,
1671 			.trace  = trace,
1672 			.thread = thread,
1673 		};
1674 
1675 		for (field = sc->args; field;
1676 		     field = field->next, ++arg.idx, bit <<= 1) {
1677 			if (arg.mask & bit)
1678 				continue;
1679 
1680 			/* special care for unaligned accesses */
1681 			p = args + sizeof(unsigned long) * arg.idx;
1682 			memcpy(&val, p, sizeof(val));
1683 
1684 			/*
1685  			 * Suppress this argument if its value is zero and
1686  			 * and we don't have a string associated in an
1687  			 * strarray for it.
1688  			 */
1689 			if (val == 0 &&
1690 			    !(sc->arg_scnprintf &&
1691 			      sc->arg_scnprintf[arg.idx] == SCA_STRARRAY &&
1692 			      sc->arg_parm[arg.idx]))
1693 				continue;
1694 
1695 			printed += scnprintf(bf + printed, size - printed,
1696 					     "%s%s: ", printed ? ", " : "", field->name);
1697 			if (sc->arg_scnprintf && sc->arg_scnprintf[arg.idx]) {
1698 				arg.val = val;
1699 				if (sc->arg_parm)
1700 					arg.parm = sc->arg_parm[arg.idx];
1701 				printed += sc->arg_scnprintf[arg.idx](bf + printed,
1702 								      size - printed, &arg);
1703 			} else {
1704 				printed += scnprintf(bf + printed, size - printed,
1705 						     "%ld", val);
1706 			}
1707 		}
1708 	} else {
1709 		int i = 0;
1710 
1711 		while (i < 6) {
1712 			/* special care for unaligned accesses */
1713 			p = args + sizeof(unsigned long) * i;
1714 			memcpy(&val, p, sizeof(val));
1715 			printed += scnprintf(bf + printed, size - printed,
1716 					     "%sarg%d: %ld",
1717 					     printed ? ", " : "", i, val);
1718 			++i;
1719 		}
1720 	}
1721 
1722 	return printed;
1723 }
1724 
/*
 * Signature shared by the per event handlers in this file (syscall
 * enter/exit, vfs_getname, sched_stat_runtime, page faults, ...).
 */
typedef int (*tracepoint_handler)(struct trace *trace,
				  struct perf_evsel *evsel,
				  union perf_event *event,
				  struct perf_sample *sample);
1728 
1729 static struct syscall *trace__syscall_info(struct trace *trace,
1730 					   struct perf_evsel *evsel, int id)
1731 {
1732 
1733 	if (id < 0) {
1734 
1735 		/*
1736 		 * XXX: Noticed on x86_64, reproduced as far back as 3.0.36, haven't tried
1737 		 * before that, leaving at a higher verbosity level till that is
1738 		 * explained. Reproduced with plain ftrace with:
1739 		 *
1740 		 * echo 1 > /t/events/raw_syscalls/sys_exit/enable
1741 		 * grep "NR -1 " /t/trace_pipe
1742 		 *
1743 		 * After generating some load on the machine.
1744  		 */
1745 		if (verbose > 1) {
1746 			static u64 n;
1747 			fprintf(trace->output, "Invalid syscall %d id, skipping (%s, %" PRIu64 ") ...\n",
1748 				id, perf_evsel__name(evsel), ++n);
1749 		}
1750 		return NULL;
1751 	}
1752 
1753 	if ((id > trace->syscalls.max || trace->syscalls.table[id].name == NULL) &&
1754 	    trace__read_syscall_info(trace, id))
1755 		goto out_cant_read;
1756 
1757 	if ((id > trace->syscalls.max || trace->syscalls.table[id].name == NULL))
1758 		goto out_cant_read;
1759 
1760 	return &trace->syscalls.table[id];
1761 
1762 out_cant_read:
1763 	if (verbose) {
1764 		fprintf(trace->output, "Problems reading syscall %d", id);
1765 		if (id <= trace->syscalls.max && trace->syscalls.table[id].name != NULL)
1766 			fprintf(trace->output, "(%s)", trace->syscalls.table[id].name);
1767 		fputs(" information\n", trace->output);
1768 	}
1769 	return NULL;
1770 }
1771 
1772 static void thread__update_stats(struct thread_trace *ttrace,
1773 				 int id, struct perf_sample *sample)
1774 {
1775 	struct int_node *inode;
1776 	struct stats *stats;
1777 	u64 duration = 0;
1778 
1779 	inode = intlist__findnew(ttrace->syscall_stats, id);
1780 	if (inode == NULL)
1781 		return;
1782 
1783 	stats = inode->priv;
1784 	if (stats == NULL) {
1785 		stats = malloc(sizeof(struct stats));
1786 		if (stats == NULL)
1787 			return;
1788 		init_stats(stats);
1789 		inode->priv = stats;
1790 	}
1791 
1792 	if (ttrace->entry_time && sample->time > ttrace->entry_time)
1793 		duration = sample->time - ttrace->entry_time;
1794 
1795 	update_stats(stats, duration);
1796 }
1797 
1798 static int trace__printf_interrupted_entry(struct trace *trace, struct perf_sample *sample)
1799 {
1800 	struct thread_trace *ttrace;
1801 	u64 duration;
1802 	size_t printed;
1803 
1804 	if (trace->current == NULL)
1805 		return 0;
1806 
1807 	ttrace = thread__priv(trace->current);
1808 
1809 	if (!ttrace->entry_pending)
1810 		return 0;
1811 
1812 	duration = sample->time - ttrace->entry_time;
1813 
1814 	printed  = trace__fprintf_entry_head(trace, trace->current, duration, sample->time, trace->output);
1815 	printed += fprintf(trace->output, "%-70s) ...\n", ttrace->entry_str);
1816 	ttrace->entry_pending = false;
1817 
1818 	return printed;
1819 }
1820 
1821 static int trace__sys_enter(struct trace *trace, struct perf_evsel *evsel,
1822 			    union perf_event *event __maybe_unused,
1823 			    struct perf_sample *sample)
1824 {
1825 	char *msg;
1826 	void *args;
1827 	size_t printed = 0;
1828 	struct thread *thread;
1829 	int id = perf_evsel__sc_tp_uint(evsel, id, sample), err = -1;
1830 	struct syscall *sc = trace__syscall_info(trace, evsel, id);
1831 	struct thread_trace *ttrace;
1832 
1833 	if (sc == NULL)
1834 		return -1;
1835 
1836 	if (sc->filtered)
1837 		return 0;
1838 
1839 	thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
1840 	ttrace = thread__trace(thread, trace->output);
1841 	if (ttrace == NULL)
1842 		goto out_put;
1843 
1844 	args = perf_evsel__sc_tp_ptr(evsel, args, sample);
1845 
1846 	if (ttrace->entry_str == NULL) {
1847 		ttrace->entry_str = malloc(1024);
1848 		if (!ttrace->entry_str)
1849 			goto out_put;
1850 	}
1851 
1852 	if (!trace->summary_only)
1853 		trace__printf_interrupted_entry(trace, sample);
1854 
1855 	ttrace->entry_time = sample->time;
1856 	msg = ttrace->entry_str;
1857 	printed += scnprintf(msg + printed, 1024 - printed, "%s(", sc->name);
1858 
1859 	printed += syscall__scnprintf_args(sc, msg + printed, 1024 - printed,
1860 					   args, trace, thread);
1861 
1862 	if (sc->is_exit) {
1863 		if (!trace->duration_filter && !trace->summary_only) {
1864 			trace__fprintf_entry_head(trace, thread, 1, sample->time, trace->output);
1865 			fprintf(trace->output, "%-70s\n", ttrace->entry_str);
1866 		}
1867 	} else
1868 		ttrace->entry_pending = true;
1869 
1870 	if (trace->current != thread) {
1871 		thread__put(trace->current);
1872 		trace->current = thread__get(thread);
1873 	}
1874 	err = 0;
1875 out_put:
1876 	thread__put(thread);
1877 	return err;
1878 }
1879 
1880 static int trace__sys_exit(struct trace *trace, struct perf_evsel *evsel,
1881 			   union perf_event *event __maybe_unused,
1882 			   struct perf_sample *sample)
1883 {
1884 	long ret;
1885 	u64 duration = 0;
1886 	struct thread *thread;
1887 	int id = perf_evsel__sc_tp_uint(evsel, id, sample), err = -1;
1888 	struct syscall *sc = trace__syscall_info(trace, evsel, id);
1889 	struct thread_trace *ttrace;
1890 
1891 	if (sc == NULL)
1892 		return -1;
1893 
1894 	if (sc->filtered)
1895 		return 0;
1896 
1897 	thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
1898 	ttrace = thread__trace(thread, trace->output);
1899 	if (ttrace == NULL)
1900 		goto out_put;
1901 
1902 	if (trace->summary)
1903 		thread__update_stats(ttrace, id, sample);
1904 
1905 	ret = perf_evsel__sc_tp_uint(evsel, ret, sample);
1906 
1907 	if (id == trace->audit.open_id && ret >= 0 && trace->last_vfs_getname) {
1908 		trace__set_fd_pathname(thread, ret, trace->last_vfs_getname);
1909 		trace->last_vfs_getname = NULL;
1910 		++trace->stats.vfs_getname;
1911 	}
1912 
1913 	ttrace->exit_time = sample->time;
1914 
1915 	if (ttrace->entry_time) {
1916 		duration = sample->time - ttrace->entry_time;
1917 		if (trace__filter_duration(trace, duration))
1918 			goto out;
1919 	} else if (trace->duration_filter)
1920 		goto out;
1921 
1922 	if (trace->summary_only)
1923 		goto out;
1924 
1925 	trace__fprintf_entry_head(trace, thread, duration, sample->time, trace->output);
1926 
1927 	if (ttrace->entry_pending) {
1928 		fprintf(trace->output, "%-70s", ttrace->entry_str);
1929 	} else {
1930 		fprintf(trace->output, " ... [");
1931 		color_fprintf(trace->output, PERF_COLOR_YELLOW, "continued");
1932 		fprintf(trace->output, "]: %s()", sc->name);
1933 	}
1934 
1935 	if (sc->fmt == NULL) {
1936 signed_print:
1937 		fprintf(trace->output, ") = %ld", ret);
1938 	} else if (ret < 0 && sc->fmt->errmsg) {
1939 		char bf[STRERR_BUFSIZE];
1940 		const char *emsg = strerror_r(-ret, bf, sizeof(bf)),
1941 			   *e = audit_errno_to_name(-ret);
1942 
1943 		fprintf(trace->output, ") = -1 %s %s", e, emsg);
1944 	} else if (ret == 0 && sc->fmt->timeout)
1945 		fprintf(trace->output, ") = 0 Timeout");
1946 	else if (sc->fmt->hexret)
1947 		fprintf(trace->output, ") = %#lx", ret);
1948 	else
1949 		goto signed_print;
1950 
1951 	fputc('\n', trace->output);
1952 out:
1953 	ttrace->entry_pending = false;
1954 	err = 0;
1955 out_put:
1956 	thread__put(thread);
1957 	return err;
1958 }
1959 
1960 static int trace__vfs_getname(struct trace *trace, struct perf_evsel *evsel,
1961 			      union perf_event *event __maybe_unused,
1962 			      struct perf_sample *sample)
1963 {
1964 	trace->last_vfs_getname = perf_evsel__rawptr(evsel, sample, "pathname");
1965 	return 0;
1966 }
1967 
1968 static int trace__sched_stat_runtime(struct trace *trace, struct perf_evsel *evsel,
1969 				     union perf_event *event __maybe_unused,
1970 				     struct perf_sample *sample)
1971 {
1972         u64 runtime = perf_evsel__intval(evsel, sample, "runtime");
1973 	double runtime_ms = (double)runtime / NSEC_PER_MSEC;
1974 	struct thread *thread = machine__findnew_thread(trace->host,
1975 							sample->pid,
1976 							sample->tid);
1977 	struct thread_trace *ttrace = thread__trace(thread, trace->output);
1978 
1979 	if (ttrace == NULL)
1980 		goto out_dump;
1981 
1982 	ttrace->runtime_ms += runtime_ms;
1983 	trace->runtime_ms += runtime_ms;
1984 	thread__put(thread);
1985 	return 0;
1986 
1987 out_dump:
1988 	fprintf(trace->output, "%s: comm=%s,pid=%u,runtime=%" PRIu64 ",vruntime=%" PRIu64 ")\n",
1989 	       evsel->name,
1990 	       perf_evsel__strval(evsel, sample, "comm"),
1991 	       (pid_t)perf_evsel__intval(evsel, sample, "pid"),
1992 	       runtime,
1993 	       perf_evsel__intval(evsel, sample, "vruntime"));
1994 	thread__put(thread);
1995 	return 0;
1996 }
1997 
1998 static int trace__event_handler(struct trace *trace, struct perf_evsel *evsel,
1999 				union perf_event *event __maybe_unused,
2000 				struct perf_sample *sample)
2001 {
2002 	trace__printf_interrupted_entry(trace, sample);
2003 	trace__fprintf_tstamp(trace, sample->time, trace->output);
2004 
2005 	if (trace->trace_syscalls)
2006 		fprintf(trace->output, "(         ): ");
2007 
2008 	fprintf(trace->output, "%s:", evsel->name);
2009 
2010 	if (evsel->tp_format) {
2011 		event_format__fprintf(evsel->tp_format, sample->cpu,
2012 				      sample->raw_data, sample->raw_size,
2013 				      trace->output);
2014 	}
2015 
2016 	fprintf(trace->output, ")\n");
2017 	return 0;
2018 }
2019 
2020 static void print_location(FILE *f, struct perf_sample *sample,
2021 			   struct addr_location *al,
2022 			   bool print_dso, bool print_sym)
2023 {
2024 
2025 	if ((verbose || print_dso) && al->map)
2026 		fprintf(f, "%s@", al->map->dso->long_name);
2027 
2028 	if ((verbose || print_sym) && al->sym)
2029 		fprintf(f, "%s+0x%" PRIx64, al->sym->name,
2030 			al->addr - al->sym->start);
2031 	else if (al->map)
2032 		fprintf(f, "0x%" PRIx64, al->addr);
2033 	else
2034 		fprintf(f, "0x%" PRIx64, sample->addr);
2035 }
2036 
2037 static int trace__pgfault(struct trace *trace,
2038 			  struct perf_evsel *evsel,
2039 			  union perf_event *event,
2040 			  struct perf_sample *sample)
2041 {
2042 	struct thread *thread;
2043 	u8 cpumode = event->header.misc & PERF_RECORD_MISC_CPUMODE_MASK;
2044 	struct addr_location al;
2045 	char map_type = 'd';
2046 	struct thread_trace *ttrace;
2047 	int err = -1;
2048 
2049 	thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
2050 	ttrace = thread__trace(thread, trace->output);
2051 	if (ttrace == NULL)
2052 		goto out_put;
2053 
2054 	if (evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ)
2055 		ttrace->pfmaj++;
2056 	else
2057 		ttrace->pfmin++;
2058 
2059 	if (trace->summary_only)
2060 		goto out;
2061 
2062 	thread__find_addr_location(thread, cpumode, MAP__FUNCTION,
2063 			      sample->ip, &al);
2064 
2065 	trace__fprintf_entry_head(trace, thread, 0, sample->time, trace->output);
2066 
2067 	fprintf(trace->output, "%sfault [",
2068 		evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ ?
2069 		"maj" : "min");
2070 
2071 	print_location(trace->output, sample, &al, false, true);
2072 
2073 	fprintf(trace->output, "] => ");
2074 
2075 	thread__find_addr_location(thread, cpumode, MAP__VARIABLE,
2076 				   sample->addr, &al);
2077 
2078 	if (!al.map) {
2079 		thread__find_addr_location(thread, cpumode,
2080 					   MAP__FUNCTION, sample->addr, &al);
2081 
2082 		if (al.map)
2083 			map_type = 'x';
2084 		else
2085 			map_type = '?';
2086 	}
2087 
2088 	print_location(trace->output, sample, &al, true, false);
2089 
2090 	fprintf(trace->output, " (%c%c)\n", map_type, al.level);
2091 out:
2092 	err = 0;
2093 out_put:
2094 	thread__put(thread);
2095 	return err;
2096 }
2097 
2098 static bool skip_sample(struct trace *trace, struct perf_sample *sample)
2099 {
2100 	if ((trace->pid_list && intlist__find(trace->pid_list, sample->pid)) ||
2101 	    (trace->tid_list && intlist__find(trace->tid_list, sample->tid)))
2102 		return false;
2103 
2104 	if (trace->pid_list || trace->tid_list)
2105 		return true;
2106 
2107 	return false;
2108 }
2109 
2110 static int trace__process_sample(struct perf_tool *tool,
2111 				 union perf_event *event,
2112 				 struct perf_sample *sample,
2113 				 struct perf_evsel *evsel,
2114 				 struct machine *machine __maybe_unused)
2115 {
2116 	struct trace *trace = container_of(tool, struct trace, tool);
2117 	int err = 0;
2118 
2119 	tracepoint_handler handler = evsel->handler;
2120 
2121 	if (skip_sample(trace, sample))
2122 		return 0;
2123 
2124 	if (!trace->full_time && trace->base_time == 0)
2125 		trace->base_time = sample->time;
2126 
2127 	if (handler) {
2128 		++trace->nr_events;
2129 		handler(trace, evsel, event, sample);
2130 	}
2131 
2132 	return err;
2133 }
2134 
2135 static int parse_target_str(struct trace *trace)
2136 {
2137 	if (trace->opts.target.pid) {
2138 		trace->pid_list = intlist__new(trace->opts.target.pid);
2139 		if (trace->pid_list == NULL) {
2140 			pr_err("Error parsing process id string\n");
2141 			return -EINVAL;
2142 		}
2143 	}
2144 
2145 	if (trace->opts.target.tid) {
2146 		trace->tid_list = intlist__new(trace->opts.target.tid);
2147 		if (trace->tid_list == NULL) {
2148 			pr_err("Error parsing thread id string\n");
2149 			return -EINVAL;
2150 		}
2151 	}
2152 
2153 	return 0;
2154 }
2155 
2156 static int trace__record(struct trace *trace, int argc, const char **argv)
2157 {
2158 	unsigned int rec_argc, i, j;
2159 	const char **rec_argv;
2160 	const char * const record_args[] = {
2161 		"record",
2162 		"-R",
2163 		"-m", "1024",
2164 		"-c", "1",
2165 	};
2166 
2167 	const char * const sc_args[] = { "-e", };
2168 	unsigned int sc_args_nr = ARRAY_SIZE(sc_args);
2169 	const char * const majpf_args[] = { "-e", "major-faults" };
2170 	unsigned int majpf_args_nr = ARRAY_SIZE(majpf_args);
2171 	const char * const minpf_args[] = { "-e", "minor-faults" };
2172 	unsigned int minpf_args_nr = ARRAY_SIZE(minpf_args);
2173 
2174 	/* +1 is for the event string below */
2175 	rec_argc = ARRAY_SIZE(record_args) + sc_args_nr + 1 +
2176 		majpf_args_nr + minpf_args_nr + argc;
2177 	rec_argv = calloc(rec_argc + 1, sizeof(char *));
2178 
2179 	if (rec_argv == NULL)
2180 		return -ENOMEM;
2181 
2182 	j = 0;
2183 	for (i = 0; i < ARRAY_SIZE(record_args); i++)
2184 		rec_argv[j++] = record_args[i];
2185 
2186 	if (trace->trace_syscalls) {
2187 		for (i = 0; i < sc_args_nr; i++)
2188 			rec_argv[j++] = sc_args[i];
2189 
2190 		/* event string may be different for older kernels - e.g., RHEL6 */
2191 		if (is_valid_tracepoint("raw_syscalls:sys_enter"))
2192 			rec_argv[j++] = "raw_syscalls:sys_enter,raw_syscalls:sys_exit";
2193 		else if (is_valid_tracepoint("syscalls:sys_enter"))
2194 			rec_argv[j++] = "syscalls:sys_enter,syscalls:sys_exit";
2195 		else {
2196 			pr_err("Neither raw_syscalls nor syscalls events exist.\n");
2197 			return -1;
2198 		}
2199 	}
2200 
2201 	if (trace->trace_pgfaults & TRACE_PFMAJ)
2202 		for (i = 0; i < majpf_args_nr; i++)
2203 			rec_argv[j++] = majpf_args[i];
2204 
2205 	if (trace->trace_pgfaults & TRACE_PFMIN)
2206 		for (i = 0; i < minpf_args_nr; i++)
2207 			rec_argv[j++] = minpf_args[i];
2208 
2209 	for (i = 0; i < (unsigned int)argc; i++)
2210 		rec_argv[j++] = argv[i];
2211 
2212 	return cmd_record(j, rec_argv, NULL);
2213 }
2214 
2215 static size_t trace__fprintf_thread_summary(struct trace *trace, FILE *fp);
2216 
2217 static void perf_evlist__add_vfs_getname(struct perf_evlist *evlist)
2218 {
2219 	struct perf_evsel *evsel = perf_evsel__newtp("probe", "vfs_getname");
2220 	if (evsel == NULL)
2221 		return;
2222 
2223 	if (perf_evsel__field(evsel, "pathname") == NULL) {
2224 		perf_evsel__delete(evsel);
2225 		return;
2226 	}
2227 
2228 	evsel->handler = trace__vfs_getname;
2229 	perf_evlist__add(evlist, evsel);
2230 }
2231 
2232 static int perf_evlist__add_pgfault(struct perf_evlist *evlist,
2233 				    u64 config)
2234 {
2235 	struct perf_evsel *evsel;
2236 	struct perf_event_attr attr = {
2237 		.type = PERF_TYPE_SOFTWARE,
2238 		.mmap_data = 1,
2239 	};
2240 
2241 	attr.config = config;
2242 	attr.sample_period = 1;
2243 
2244 	event_attr_init(&attr);
2245 
2246 	evsel = perf_evsel__new(&attr);
2247 	if (!evsel)
2248 		return -ENOMEM;
2249 
2250 	evsel->handler = trace__pgfault;
2251 	perf_evlist__add(evlist, evsel);
2252 
2253 	return 0;
2254 }
2255 
2256 static void trace__handle_event(struct trace *trace, union perf_event *event, struct perf_sample *sample)
2257 {
2258 	const u32 type = event->header.type;
2259 	struct perf_evsel *evsel;
2260 
2261 	if (!trace->full_time && trace->base_time == 0)
2262 		trace->base_time = sample->time;
2263 
2264 	if (type != PERF_RECORD_SAMPLE) {
2265 		trace__process_event(trace, trace->host, event, sample);
2266 		return;
2267 	}
2268 
2269 	evsel = perf_evlist__id2evsel(trace->evlist, sample->id);
2270 	if (evsel == NULL) {
2271 		fprintf(trace->output, "Unknown tp ID %" PRIu64 ", skipping...\n", sample->id);
2272 		return;
2273 	}
2274 
2275 	if (evsel->attr.type == PERF_TYPE_TRACEPOINT &&
2276 	    sample->raw_data == NULL) {
2277 		fprintf(trace->output, "%s sample with no payload for tid: %d, cpu %d, raw_size=%d, skipping...\n",
2278 		       perf_evsel__name(evsel), sample->tid,
2279 		       sample->cpu, sample->raw_size);
2280 	} else {
2281 		tracepoint_handler handler = evsel->handler;
2282 		handler(trace, evsel, event, sample);
2283 	}
2284 }
2285 
2286 static int trace__run(struct trace *trace, int argc, const char **argv)
2287 {
2288 	struct perf_evlist *evlist = trace->evlist;
2289 	int err = -1, i;
2290 	unsigned long before;
2291 	const bool forks = argc > 0;
2292 	bool draining = false;
2293 
2294 	trace->live = true;
2295 
2296 	if (trace->trace_syscalls &&
2297 	    perf_evlist__add_syscall_newtp(evlist, trace__sys_enter,
2298 					   trace__sys_exit))
2299 		goto out_error_raw_syscalls;
2300 
2301 	if (trace->trace_syscalls)
2302 		perf_evlist__add_vfs_getname(evlist);
2303 
2304 	if ((trace->trace_pgfaults & TRACE_PFMAJ) &&
2305 	    perf_evlist__add_pgfault(evlist, PERF_COUNT_SW_PAGE_FAULTS_MAJ)) {
2306 		goto out_error_mem;
2307 	}
2308 
2309 	if ((trace->trace_pgfaults & TRACE_PFMIN) &&
2310 	    perf_evlist__add_pgfault(evlist, PERF_COUNT_SW_PAGE_FAULTS_MIN))
2311 		goto out_error_mem;
2312 
2313 	if (trace->sched &&
2314 	    perf_evlist__add_newtp(evlist, "sched", "sched_stat_runtime",
2315 				   trace__sched_stat_runtime))
2316 		goto out_error_sched_stat_runtime;
2317 
2318 	err = perf_evlist__create_maps(evlist, &trace->opts.target);
2319 	if (err < 0) {
2320 		fprintf(trace->output, "Problems parsing the target to trace, check your options!\n");
2321 		goto out_delete_evlist;
2322 	}
2323 
2324 	err = trace__symbols_init(trace, evlist);
2325 	if (err < 0) {
2326 		fprintf(trace->output, "Problems initializing symbol libraries!\n");
2327 		goto out_delete_evlist;
2328 	}
2329 
2330 	perf_evlist__config(evlist, &trace->opts);
2331 
2332 	signal(SIGCHLD, sig_handler);
2333 	signal(SIGINT, sig_handler);
2334 
2335 	if (forks) {
2336 		err = perf_evlist__prepare_workload(evlist, &trace->opts.target,
2337 						    argv, false, NULL);
2338 		if (err < 0) {
2339 			fprintf(trace->output, "Couldn't run the workload!\n");
2340 			goto out_delete_evlist;
2341 		}
2342 	}
2343 
2344 	err = perf_evlist__open(evlist);
2345 	if (err < 0)
2346 		goto out_error_open;
2347 
2348 	/*
2349 	 * Better not use !target__has_task() here because we need to cover the
2350 	 * case where no threads were specified in the command line, but a
2351 	 * workload was, and in that case we will fill in the thread_map when
2352 	 * we fork the workload in perf_evlist__prepare_workload.
2353 	 */
2354 	if (trace->filter_pids.nr > 0)
2355 		err = perf_evlist__set_filter_pids(evlist, trace->filter_pids.nr, trace->filter_pids.entries);
2356 	else if (thread_map__pid(evlist->threads, 0) == -1)
2357 		err = perf_evlist__set_filter_pid(evlist, getpid());
2358 
2359 	if (err < 0) {
2360 		printf("err=%d,%s\n", -err, strerror(-err));
2361 		exit(1);
2362 	}
2363 
2364 	err = perf_evlist__mmap(evlist, trace->opts.mmap_pages, false);
2365 	if (err < 0)
2366 		goto out_error_mmap;
2367 
2368 	if (!target__none(&trace->opts.target))
2369 		perf_evlist__enable(evlist);
2370 
2371 	if (forks)
2372 		perf_evlist__start_workload(evlist);
2373 
2374 	trace->multiple_threads = thread_map__pid(evlist->threads, 0) == -1 ||
2375 				  evlist->threads->nr > 1 ||
2376 				  perf_evlist__first(evlist)->attr.inherit;
2377 again:
2378 	before = trace->nr_events;
2379 
2380 	for (i = 0; i < evlist->nr_mmaps; i++) {
2381 		union perf_event *event;
2382 
2383 		while ((event = perf_evlist__mmap_read(evlist, i)) != NULL) {
2384 			struct perf_sample sample;
2385 
2386 			++trace->nr_events;
2387 
2388 			err = perf_evlist__parse_sample(evlist, event, &sample);
2389 			if (err) {
2390 				fprintf(trace->output, "Can't parse sample, err = %d, skipping...\n", err);
2391 				goto next_event;
2392 			}
2393 
2394 			trace__handle_event(trace, event, &sample);
2395 next_event:
2396 			perf_evlist__mmap_consume(evlist, i);
2397 
2398 			if (interrupted)
2399 				goto out_disable;
2400 
2401 			if (done && !draining) {
2402 				perf_evlist__disable(evlist);
2403 				draining = true;
2404 			}
2405 		}
2406 	}
2407 
2408 	if (trace->nr_events == before) {
2409 		int timeout = done ? 100 : -1;
2410 
2411 		if (!draining && perf_evlist__poll(evlist, timeout) > 0) {
2412 			if (perf_evlist__filter_pollfd(evlist, POLLERR | POLLHUP) == 0)
2413 				draining = true;
2414 
2415 			goto again;
2416 		}
2417 	} else {
2418 		goto again;
2419 	}
2420 
2421 out_disable:
2422 	thread__zput(trace->current);
2423 
2424 	perf_evlist__disable(evlist);
2425 
2426 	if (!err) {
2427 		if (trace->summary)
2428 			trace__fprintf_thread_summary(trace, trace->output);
2429 
2430 		if (trace->show_tool_stats) {
2431 			fprintf(trace->output, "Stats:\n "
2432 					       " vfs_getname : %" PRIu64 "\n"
2433 					       " proc_getname: %" PRIu64 "\n",
2434 				trace->stats.vfs_getname,
2435 				trace->stats.proc_getname);
2436 		}
2437 	}
2438 
2439 out_delete_evlist:
2440 	perf_evlist__delete(evlist);
2441 	trace->evlist = NULL;
2442 	trace->live = false;
2443 	return err;
2444 {
2445 	char errbuf[BUFSIZ];
2446 
2447 out_error_sched_stat_runtime:
2448 	debugfs__strerror_open_tp(errno, errbuf, sizeof(errbuf), "sched", "sched_stat_runtime");
2449 	goto out_error;
2450 
2451 out_error_raw_syscalls:
2452 	debugfs__strerror_open_tp(errno, errbuf, sizeof(errbuf), "raw_syscalls", "sys_(enter|exit)");
2453 	goto out_error;
2454 
2455 out_error_mmap:
2456 	perf_evlist__strerror_mmap(evlist, errno, errbuf, sizeof(errbuf));
2457 	goto out_error;
2458 
2459 out_error_open:
2460 	perf_evlist__strerror_open(evlist, errno, errbuf, sizeof(errbuf));
2461 
2462 out_error:
2463 	fprintf(trace->output, "%s\n", errbuf);
2464 	goto out_delete_evlist;
2465 }
2466 out_error_mem:
2467 	fprintf(trace->output, "Not enough memory to run!\n");
2468 	goto out_delete_evlist;
2469 }
2470 
2471 static int trace__replay(struct trace *trace)
2472 {
2473 	const struct perf_evsel_str_handler handlers[] = {
2474 		{ "probe:vfs_getname",	     trace__vfs_getname, },
2475 	};
2476 	struct perf_data_file file = {
2477 		.path  = input_name,
2478 		.mode  = PERF_DATA_MODE_READ,
2479 		.force = trace->force,
2480 	};
2481 	struct perf_session *session;
2482 	struct perf_evsel *evsel;
2483 	int err = -1;
2484 
2485 	trace->tool.sample	  = trace__process_sample;
2486 	trace->tool.mmap	  = perf_event__process_mmap;
2487 	trace->tool.mmap2	  = perf_event__process_mmap2;
2488 	trace->tool.comm	  = perf_event__process_comm;
2489 	trace->tool.exit	  = perf_event__process_exit;
2490 	trace->tool.fork	  = perf_event__process_fork;
2491 	trace->tool.attr	  = perf_event__process_attr;
2492 	trace->tool.tracing_data = perf_event__process_tracing_data;
2493 	trace->tool.build_id	  = perf_event__process_build_id;
2494 
2495 	trace->tool.ordered_events = true;
2496 	trace->tool.ordering_requires_timestamps = true;
2497 
2498 	/* add tid to output */
2499 	trace->multiple_threads = true;
2500 
2501 	session = perf_session__new(&file, false, &trace->tool);
2502 	if (session == NULL)
2503 		return -1;
2504 
2505 	if (symbol__init(&session->header.env) < 0)
2506 		goto out;
2507 
2508 	trace->host = &session->machines.host;
2509 
2510 	err = perf_session__set_tracepoints_handlers(session, handlers);
2511 	if (err)
2512 		goto out;
2513 
2514 	evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
2515 						     "raw_syscalls:sys_enter");
2516 	/* older kernels have syscalls tp versus raw_syscalls */
2517 	if (evsel == NULL)
2518 		evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
2519 							     "syscalls:sys_enter");
2520 
2521 	if (evsel &&
2522 	    (perf_evsel__init_syscall_tp(evsel, trace__sys_enter) < 0 ||
2523 	    perf_evsel__init_sc_tp_ptr_field(evsel, args))) {
2524 		pr_err("Error during initialize raw_syscalls:sys_enter event\n");
2525 		goto out;
2526 	}
2527 
2528 	evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
2529 						     "raw_syscalls:sys_exit");
2530 	if (evsel == NULL)
2531 		evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
2532 							     "syscalls:sys_exit");
2533 	if (evsel &&
2534 	    (perf_evsel__init_syscall_tp(evsel, trace__sys_exit) < 0 ||
2535 	    perf_evsel__init_sc_tp_uint_field(evsel, ret))) {
2536 		pr_err("Error during initialize raw_syscalls:sys_exit event\n");
2537 		goto out;
2538 	}
2539 
2540 	evlist__for_each(session->evlist, evsel) {
2541 		if (evsel->attr.type == PERF_TYPE_SOFTWARE &&
2542 		    (evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ ||
2543 		     evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MIN ||
2544 		     evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS))
2545 			evsel->handler = trace__pgfault;
2546 	}
2547 
2548 	err = parse_target_str(trace);
2549 	if (err != 0)
2550 		goto out;
2551 
2552 	setup_pager();
2553 
2554 	err = perf_session__process_events(session);
2555 	if (err)
2556 		pr_err("Failed to process events, error %d", err);
2557 
2558 	else if (trace->summary)
2559 		trace__fprintf_thread_summary(trace, trace->output);
2560 
2561 out:
2562 	perf_session__delete(session);
2563 
2564 	return err;
2565 }
2566 
/*
 * Write the heading of the per-thread summary to @fp.
 *
 * Returns the fprintf() result converted to size_t.
 */
static size_t trace__fprintf_threads_header(FILE *fp)
{
	return fprintf(fp, "\n Summary of events:\n\n");
}
2575 
2576 static size_t thread__dump_stats(struct thread_trace *ttrace,
2577 				 struct trace *trace, FILE *fp)
2578 {
2579 	struct stats *stats;
2580 	size_t printed = 0;
2581 	struct syscall *sc;
2582 	struct int_node *inode = intlist__first(ttrace->syscall_stats);
2583 
2584 	if (inode == NULL)
2585 		return 0;
2586 
2587 	printed += fprintf(fp, "\n");
2588 
2589 	printed += fprintf(fp, "   syscall            calls      min       avg       max      stddev\n");
2590 	printed += fprintf(fp, "                               (msec)    (msec)    (msec)        (%%)\n");
2591 	printed += fprintf(fp, "   --------------- -------- --------- --------- ---------     ------\n");
2592 
2593 	/* each int_node is a syscall */
2594 	while (inode) {
2595 		stats = inode->priv;
2596 		if (stats) {
2597 			double min = (double)(stats->min) / NSEC_PER_MSEC;
2598 			double max = (double)(stats->max) / NSEC_PER_MSEC;
2599 			double avg = avg_stats(stats);
2600 			double pct;
2601 			u64 n = (u64) stats->n;
2602 
2603 			pct = avg ? 100.0 * stddev_stats(stats)/avg : 0.0;
2604 			avg /= NSEC_PER_MSEC;
2605 
2606 			sc = &trace->syscalls.table[inode->i];
2607 			printed += fprintf(fp, "   %-15s", sc->name);
2608 			printed += fprintf(fp, " %8" PRIu64 " %9.3f %9.3f",
2609 					   n, min, avg);
2610 			printed += fprintf(fp, " %9.3f %9.2f%%\n", max, pct);
2611 		}
2612 
2613 		inode = intlist__next(inode);
2614 	}
2615 
2616 	printed += fprintf(fp, "\n\n");
2617 
2618 	return printed;
2619 }
2620 
/*
 * Context handed to trace__fprintf_one_thread() through the opaque pointer
 * argument of machine__for_each_thread().
 */
struct summary_data {
	FILE		*fp;		/* stream the summary is written to */
	struct trace	*trace;		/* tracer whose threads are summarized */
	size_t		printed;	/* accumulated fprintf() results */
};
2627 
2628 static int trace__fprintf_one_thread(struct thread *thread, void *priv)
2629 {
2630 	struct summary_data *data = priv;
2631 	FILE *fp = data->fp;
2632 	size_t printed = data->printed;
2633 	struct trace *trace = data->trace;
2634 	struct thread_trace *ttrace = thread__priv(thread);
2635 	double ratio;
2636 
2637 	if (ttrace == NULL)
2638 		return 0;
2639 
2640 	ratio = (double)ttrace->nr_events / trace->nr_events * 100.0;
2641 
2642 	printed += fprintf(fp, " %s (%d), ", thread__comm_str(thread), thread->tid);
2643 	printed += fprintf(fp, "%lu events, ", ttrace->nr_events);
2644 	printed += fprintf(fp, "%.1f%%", ratio);
2645 	if (ttrace->pfmaj)
2646 		printed += fprintf(fp, ", %lu majfaults", ttrace->pfmaj);
2647 	if (ttrace->pfmin)
2648 		printed += fprintf(fp, ", %lu minfaults", ttrace->pfmin);
2649 	printed += fprintf(fp, ", %.3f msec\n", ttrace->runtime_ms);
2650 	printed += thread__dump_stats(ttrace, trace, fp);
2651 
2652 	data->printed += printed;
2653 
2654 	return 0;
2655 }
2656 
2657 static size_t trace__fprintf_thread_summary(struct trace *trace, FILE *fp)
2658 {
2659 	struct summary_data data = {
2660 		.fp = fp,
2661 		.trace = trace
2662 	};
2663 	data.printed = trace__fprintf_threads_header(fp);
2664 
2665 	machine__for_each_thread(trace->host, trace__fprintf_one_thread, &data);
2666 
2667 	return data.printed;
2668 }
2669 
2670 static int trace__set_duration(const struct option *opt, const char *str,
2671 			       int unset __maybe_unused)
2672 {
2673 	struct trace *trace = opt->value;
2674 
2675 	trace->duration_filter = atof(str);
2676 	return 0;
2677 }
2678 
2679 static int trace__set_filter_pids(const struct option *opt, const char *str,
2680 				  int unset __maybe_unused)
2681 {
2682 	int ret = -1;
2683 	size_t i;
2684 	struct trace *trace = opt->value;
2685 	/*
2686 	 * FIXME: introduce a intarray class, plain parse csv and create a
2687 	 * { int nr, int entries[] } struct...
2688 	 */
2689 	struct intlist *list = intlist__new(str);
2690 
2691 	if (list == NULL)
2692 		return -1;
2693 
2694 	i = trace->filter_pids.nr = intlist__nr_entries(list) + 1;
2695 	trace->filter_pids.entries = calloc(i, sizeof(pid_t));
2696 
2697 	if (trace->filter_pids.entries == NULL)
2698 		goto out;
2699 
2700 	trace->filter_pids.entries[0] = getpid();
2701 
2702 	for (i = 1; i < trace->filter_pids.nr; ++i)
2703 		trace->filter_pids.entries[i] = intlist__entry(list, i - 1)->i;
2704 
2705 	intlist__delete(list);
2706 	ret = 0;
2707 out:
2708 	return ret;
2709 }
2710 
2711 static int trace__open_output(struct trace *trace, const char *filename)
2712 {
2713 	struct stat st;
2714 
2715 	if (!stat(filename, &st) && st.st_size) {
2716 		char oldname[PATH_MAX];
2717 
2718 		scnprintf(oldname, sizeof(oldname), "%s.old", filename);
2719 		unlink(oldname);
2720 		rename(filename, oldname);
2721 	}
2722 
2723 	trace->output = fopen(filename, "w");
2724 
2725 	return trace->output == NULL ? -errno : 0;
2726 }
2727 
2728 static int parse_pagefaults(const struct option *opt, const char *str,
2729 			    int unset __maybe_unused)
2730 {
2731 	int *trace_pgfaults = opt->value;
2732 
2733 	if (strcmp(str, "all") == 0)
2734 		*trace_pgfaults |= TRACE_PFMAJ | TRACE_PFMIN;
2735 	else if (strcmp(str, "maj") == 0)
2736 		*trace_pgfaults |= TRACE_PFMAJ;
2737 	else if (strcmp(str, "min") == 0)
2738 		*trace_pgfaults |= TRACE_PFMIN;
2739 	else
2740 		return -1;
2741 
2742 	return 0;
2743 }
2744 
2745 static void evlist__set_evsel_handler(struct perf_evlist *evlist, void *handler)
2746 {
2747 	struct perf_evsel *evsel;
2748 
2749 	evlist__for_each(evlist, evsel)
2750 		evsel->handler = handler;
2751 }
2752 
2753 int cmd_trace(int argc, const char **argv, const char *prefix __maybe_unused)
2754 {
2755 	const char *trace_usage[] = {
2756 		"perf trace [<options>] [<command>]",
2757 		"perf trace [<options>] -- <command> [<options>]",
2758 		"perf trace record [<options>] [<command>]",
2759 		"perf trace record [<options>] -- <command> [<options>]",
2760 		NULL
2761 	};
2762 	struct trace trace = {
2763 		.audit = {
2764 			.machine = audit_detect_machine(),
2765 			.open_id = audit_name_to_syscall("open", trace.audit.machine),
2766 		},
2767 		.syscalls = {
2768 			. max = -1,
2769 		},
2770 		.opts = {
2771 			.target = {
2772 				.uid	   = UINT_MAX,
2773 				.uses_mmap = true,
2774 			},
2775 			.user_freq     = UINT_MAX,
2776 			.user_interval = ULLONG_MAX,
2777 			.no_buffering  = true,
2778 			.mmap_pages    = UINT_MAX,
2779 			.proc_map_timeout  = 500,
2780 		},
2781 		.output = stdout,
2782 		.show_comm = true,
2783 		.trace_syscalls = true,
2784 	};
2785 	const char *output_name = NULL;
2786 	const char *ev_qualifier_str = NULL;
2787 	const struct option trace_options[] = {
2788 	OPT_CALLBACK(0, "event", &trace.evlist, "event",
2789 		     "event selector. use 'perf list' to list available events",
2790 		     parse_events_option),
2791 	OPT_BOOLEAN(0, "comm", &trace.show_comm,
2792 		    "show the thread COMM next to its id"),
2793 	OPT_BOOLEAN(0, "tool_stats", &trace.show_tool_stats, "show tool stats"),
2794 	OPT_STRING('e', "expr", &ev_qualifier_str, "expr", "list of syscalls to trace"),
2795 	OPT_STRING('o', "output", &output_name, "file", "output file name"),
2796 	OPT_STRING('i', "input", &input_name, "file", "Analyze events in file"),
2797 	OPT_STRING('p', "pid", &trace.opts.target.pid, "pid",
2798 		    "trace events on existing process id"),
2799 	OPT_STRING('t', "tid", &trace.opts.target.tid, "tid",
2800 		    "trace events on existing thread id"),
2801 	OPT_CALLBACK(0, "filter-pids", &trace, "CSV list of pids",
2802 		     "pids to filter (by the kernel)", trace__set_filter_pids),
2803 	OPT_BOOLEAN('a', "all-cpus", &trace.opts.target.system_wide,
2804 		    "system-wide collection from all CPUs"),
2805 	OPT_STRING('C', "cpu", &trace.opts.target.cpu_list, "cpu",
2806 		    "list of cpus to monitor"),
2807 	OPT_BOOLEAN(0, "no-inherit", &trace.opts.no_inherit,
2808 		    "child tasks do not inherit counters"),
2809 	OPT_CALLBACK('m', "mmap-pages", &trace.opts.mmap_pages, "pages",
2810 		     "number of mmap data pages",
2811 		     perf_evlist__parse_mmap_pages),
2812 	OPT_STRING('u', "uid", &trace.opts.target.uid_str, "user",
2813 		   "user to profile"),
2814 	OPT_CALLBACK(0, "duration", &trace, "float",
2815 		     "show only events with duration > N.M ms",
2816 		     trace__set_duration),
2817 	OPT_BOOLEAN(0, "sched", &trace.sched, "show blocking scheduler events"),
2818 	OPT_INCR('v', "verbose", &verbose, "be more verbose"),
2819 	OPT_BOOLEAN('T', "time", &trace.full_time,
2820 		    "Show full timestamp, not time relative to first start"),
2821 	OPT_BOOLEAN('s', "summary", &trace.summary_only,
2822 		    "Show only syscall summary with statistics"),
2823 	OPT_BOOLEAN('S', "with-summary", &trace.summary,
2824 		    "Show all syscalls and summary with statistics"),
2825 	OPT_CALLBACK_DEFAULT('F', "pf", &trace.trace_pgfaults, "all|maj|min",
2826 		     "Trace pagefaults", parse_pagefaults, "maj"),
2827 	OPT_BOOLEAN(0, "syscalls", &trace.trace_syscalls, "Trace syscalls"),
2828 	OPT_BOOLEAN('f', "force", &trace.force, "don't complain, do it"),
2829 	OPT_UINTEGER(0, "proc-map-timeout", &trace.opts.proc_map_timeout,
2830 			"per thread proc mmap processing timeout in ms"),
2831 	OPT_END()
2832 	};
2833 	const char * const trace_subcommands[] = { "record", NULL };
2834 	int err;
2835 	char bf[BUFSIZ];
2836 
2837 	signal(SIGSEGV, sighandler_dump_stack);
2838 	signal(SIGFPE, sighandler_dump_stack);
2839 
2840 	trace.evlist = perf_evlist__new();
2841 
2842 	if (trace.evlist == NULL) {
2843 		pr_err("Not enough memory to run!\n");
2844 		err = -ENOMEM;
2845 		goto out;
2846 	}
2847 
2848 	argc = parse_options_subcommand(argc, argv, trace_options, trace_subcommands,
2849 				 trace_usage, PARSE_OPT_STOP_AT_NON_OPTION);
2850 
2851 	if (trace.trace_pgfaults) {
2852 		trace.opts.sample_address = true;
2853 		trace.opts.sample_time = true;
2854 	}
2855 
2856 	if (trace.evlist->nr_entries > 0)
2857 		evlist__set_evsel_handler(trace.evlist, trace__event_handler);
2858 
2859 	if ((argc >= 1) && (strcmp(argv[0], "record") == 0))
2860 		return trace__record(&trace, argc-1, &argv[1]);
2861 
2862 	/* summary_only implies summary option, but don't overwrite summary if set */
2863 	if (trace.summary_only)
2864 		trace.summary = trace.summary_only;
2865 
2866 	if (!trace.trace_syscalls && !trace.trace_pgfaults &&
2867 	    trace.evlist->nr_entries == 0 /* Was --events used? */) {
2868 		pr_err("Please specify something to trace.\n");
2869 		return -1;
2870 	}
2871 
2872 	if (output_name != NULL) {
2873 		err = trace__open_output(&trace, output_name);
2874 		if (err < 0) {
2875 			perror("failed to create output file");
2876 			goto out;
2877 		}
2878 	}
2879 
2880 	if (ev_qualifier_str != NULL) {
2881 		const char *s = ev_qualifier_str;
2882 
2883 		trace.not_ev_qualifier = *s == '!';
2884 		if (trace.not_ev_qualifier)
2885 			++s;
2886 		trace.ev_qualifier = strlist__new(true, s);
2887 		if (trace.ev_qualifier == NULL) {
2888 			fputs("Not enough memory to parse event qualifier",
2889 			      trace.output);
2890 			err = -ENOMEM;
2891 			goto out_close;
2892 		}
2893 
2894 		err = trace__validate_ev_qualifier(&trace);
2895 		if (err)
2896 			goto out_close;
2897 	}
2898 
2899 	err = target__validate(&trace.opts.target);
2900 	if (err) {
2901 		target__strerror(&trace.opts.target, err, bf, sizeof(bf));
2902 		fprintf(trace.output, "%s", bf);
2903 		goto out_close;
2904 	}
2905 
2906 	err = target__parse_uid(&trace.opts.target);
2907 	if (err) {
2908 		target__strerror(&trace.opts.target, err, bf, sizeof(bf));
2909 		fprintf(trace.output, "%s", bf);
2910 		goto out_close;
2911 	}
2912 
2913 	if (!argc && target__none(&trace.opts.target))
2914 		trace.opts.target.system_wide = true;
2915 
2916 	if (input_name)
2917 		err = trace__replay(&trace);
2918 	else
2919 		err = trace__run(&trace, argc, argv);
2920 
2921 out_close:
2922 	if (output_name != NULL)
2923 		fclose(trace.output);
2924 out:
2925 	return err;
2926 }
2927