/*
 * builtin-trace.c
 *
 * Builtin 'trace' command:
 *
 * Display a continuously updated trace of any workload, CPU, specific PID,
 * system wide, etc. Default format is loosely strace like, but any other
 * event may be specified using --event.
 *
 * Copyright (C) 2012, 2013, 2014, 2015 Red Hat Inc, Arnaldo Carvalho de Melo <acme@redhat.com>
 *
 * Initially based on the 'trace' prototype by Thomas Gleixner:
 *
 * http://lwn.net/Articles/415728/ ("Announcing a new utility: 'trace'")
 *
 * Released under the GPL v2. (and only v2, not any later version)
 */

#include <traceevent/event-parse.h>
#include <api/fs/tracing_path.h>
#include "builtin.h"
#include "util/color.h"
#include "util/debug.h"
#include "util/evlist.h"
#include <subcmd/exec-cmd.h>
#include "util/machine.h"
#include "util/session.h"
#include "util/thread.h"
#include <subcmd/parse-options.h>
#include "util/strlist.h"
#include "util/intlist.h"
#include "util/thread_map.h"
#include "util/stat.h"
#include "trace-event.h"
#include "util/parse-events.h"
#include "util/bpf-loader.h"
#include "callchain.h"
#include "syscalltbl.h"
#include "rb_resort.h"

#include <libaudit.h> /* FIXME: Still needed for audit_errno_to_name */
#include <stdlib.h>
#include <linux/err.h>
#include <linux/filter.h>
#include <linux/audit.h>
#include <sys/ptrace.h>
#include <linux/random.h>
#include <linux/stringify.h>

/* Fallback for libcs whose fcntl.h predates O_CLOEXEC (value per Linux ABI) */
#ifndef O_CLOEXEC
# define O_CLOEXEC 02000000
#endif

/*
 * Global state for one 'perf trace' session: the monitored evlist/machine,
 * the per-syscall descriptor table, the pid/tid/event qualifier filters and
 * the assorted output/summary knobs parsed from the command line.
 */
struct trace {
	struct perf_tool tool;
	struct syscalltbl *sctbl;	/* syscall id <-> name table */
	struct {
		int max;		/* highest syscall id seen so far */
		struct syscall *table;	/* indexed by syscall id */
		struct {
			/* the raw_syscalls (or syscalls) enter/exit tracepoints */
			struct perf_evsel *sys_enter,
					  *sys_exit;
		} events;
	} syscalls;
	struct record_opts opts;
	struct perf_evlist *evlist;
	struct machine *host;
	struct thread *current;		/* thread whose sample was last seen */
	u64 base_time;			/* timestamp of the first sample */
	FILE *output;
	unsigned long nr_events;
	struct strlist *ev_qualifier;	/* -e syscall name list */
	struct {
		size_t nr;
		int *entries;		/* syscall ids for ev_qualifier */
	} ev_qualifier_ids;
	struct intlist *tid_list;
	struct intlist *pid_list;
	struct {
		size_t nr;
		pid_t *entries;		/* pids to filter out (e.g. trace itself) */
	} filter_pids;
	double duration_filter;		/* --duration, in ms */
	double runtime_ms;
	struct {
		/* tool self-stats, shown with --tool-stats */
		u64 vfs_getname,
		    proc_getname;
	} stats;
	unsigned int max_stack;
	unsigned int min_stack;
	bool not_ev_qualifier;		/* set when the -e list is negated */
	bool live;
	bool full_time;
	bool sched;
	bool multiple_threads;
	bool summary;
	bool summary_only;
	bool show_comm;
	bool show_tool_stats;
	bool trace_syscalls;
	bool kernel_syscallchains;
	bool force;
	bool vfs_getname;
	int trace_pgfaults;
	int open_id;			/* syscall id of open, for vfs_getname matching */
};

/*
 * Accessor for one tracepoint payload field: the byte offset inside
 * sample->raw_data plus a fetch callback (integer- or pointer-valued).
 */
struct tp_field {
	int offset;
	union {
		u64 (*integer)(struct tp_field *field, struct perf_sample *sample);
		void *(*pointer)(struct tp_field *field, struct perf_sample *sample);
	};
};

/* Generate tp_field__u{8,16,32,64}(): fetch a host-endian uint from raw_data */
#define TP_UINT_FIELD(bits) \
static u64 tp_field__u##bits(struct tp_field *field, struct perf_sample *sample) \
{ \
	u##bits value; \
	memcpy(&value, sample->raw_data + field->offset, sizeof(value)); \
	return value; \
}

TP_UINT_FIELD(8);
TP_UINT_FIELD(16);
TP_UINT_FIELD(32);
TP_UINT_FIELD(64);

/* Same, but byte-swapping: used when the perf.data endianness differs */
#define TP_UINT_FIELD__SWAPPED(bits) \
static u64 tp_field__swapped_u##bits(struct tp_field *field, struct perf_sample *sample) \
{ \
	u##bits value; \
	memcpy(&value, sample->raw_data + field->offset, sizeof(value)); \
	return bswap_##bits(value);\
}

TP_UINT_FIELD__SWAPPED(16);
TP_UINT_FIELD__SWAPPED(32);
TP_UINT_FIELD__SWAPPED(64);

/*
 * Bind an integer accessor matching the field's size (1/2/4/8 bytes),
 * choosing the byte-swapping variant when needs_swap.  Returns 0 on
 * success, -1 for unsupported sizes.
 */
static int tp_field__init_uint(struct tp_field *field,
			       struct format_field *format_field,
			       bool needs_swap)
{
	field->offset = format_field->offset;

	switch (format_field->size) {
	case 1:
		field->integer = tp_field__u8;
		break;
	case 2:
		field->integer = needs_swap ? tp_field__swapped_u16 : tp_field__u16;
		break;
	case 4:
		field->integer = needs_swap ?
tp_field__swapped_u32 : tp_field__u32;
		break;
	case 8:
		field->integer = needs_swap ? tp_field__swapped_u64 : tp_field__u64;
		break;
	default:
		return -1;
	}

	return 0;
}

/* Pointer accessor: the field's bytes are returned in place, not copied */
static void *tp_field__ptr(struct tp_field *field, struct perf_sample *sample)
{
	return sample->raw_data + field->offset;
}

static int tp_field__init_ptr(struct tp_field *field, struct format_field *format_field)
{
	field->offset = format_field->offset;
	field->pointer = tp_field__ptr;
	return 0;
}

/*
 * Per-evsel private state for the raw_syscalls:sys_{enter,exit} tracepoints:
 * the syscall id field plus either the args array (enter) or the return
 * value (exit) -- hence the union.
 */
struct syscall_tp {
	struct tp_field id;
	union {
		struct tp_field args, ret;
	};
};

/* Look up tracepoint field "name" and bind an integer accessor for it */
static int perf_evsel__init_tp_uint_field(struct perf_evsel *evsel,
					  struct tp_field *field,
					  const char *name)
{
	struct format_field *format_field = perf_evsel__field(evsel, name);

	if (format_field == NULL)
		return -1;

	return tp_field__init_uint(field, format_field, evsel->needs_swap);
}

/* Convenience: field name doubles as the syscall_tp member name */
#define perf_evsel__init_sc_tp_uint_field(evsel, name) \
	({ struct syscall_tp *sc = evsel->priv;\
	   perf_evsel__init_tp_uint_field(evsel, &sc->name, #name); })

/* Look up tracepoint field "name" and bind a pointer accessor for it */
static int perf_evsel__init_tp_ptr_field(struct perf_evsel *evsel,
					 struct tp_field *field,
					 const char *name)
{
	struct format_field *format_field = perf_evsel__field(evsel, name);

	if (format_field == NULL)
		return -1;

	return tp_field__init_ptr(field, format_field);
}

#define perf_evsel__init_sc_tp_ptr_field(evsel, name) \
	({ struct syscall_tp *sc = evsel->priv;\
	   perf_evsel__init_tp_ptr_field(evsel, &sc->name, #name); })

/* Free the evsel->priv syscall_tp before deleting the evsel itself */
static void perf_evsel__delete_priv(struct perf_evsel *evsel)
{
	zfree(&evsel->priv);
	perf_evsel__delete(evsel);
}

/*
 * Allocate the syscall_tp priv area, bind the "id" field accessor and set
 * the sample handler.  Returns 0 on success, -ENOMEM when the allocation
 * fails, -ENOENT when the tracepoint has no "id" field.
 */
static int perf_evsel__init_syscall_tp(struct perf_evsel *evsel, void *handler)
{
	evsel->priv = malloc(sizeof(struct syscall_tp));
	if (evsel->priv != NULL) {
		if (perf_evsel__init_sc_tp_uint_field(evsel, id))
			goto out_delete;

		evsel->handler = handler;
		return 0;
	}

	return -ENOMEM;

out_delete:
	zfree(&evsel->priv);
	return -ENOENT;
}

/*
 * Create the sys_enter or sys_exit tracepoint evsel ("direction" is the
 * tracepoint name), trying the legacy "syscalls" subsystem when
 * "raw_syscalls" is absent.  Returns NULL on any failure.
 */
static struct perf_evsel *perf_evsel__syscall_newtp(const char *direction, void *handler)
{
	struct perf_evsel *evsel = perf_evsel__newtp("raw_syscalls", direction);

	/* older kernel (e.g., RHEL6) use syscalls:{enter,exit} */
	if (IS_ERR(evsel))
		evsel = perf_evsel__newtp("syscalls", direction);

	if (IS_ERR(evsel))
		return NULL;

	if (perf_evsel__init_syscall_tp(evsel, handler))
		goto out_delete;

	return evsel;

out_delete:
	perf_evsel__delete_priv(evsel);
	return NULL;
}

/* Fetch the integer-valued field "name" from a sample of this evsel */
#define perf_evsel__sc_tp_uint(evsel, name, sample) \
	({ struct syscall_tp *fields = evsel->priv; \
	   fields->name.integer(&fields->name, sample); })

/* Fetch the pointer-valued field "name" from a sample of this evsel */
#define perf_evsel__sc_tp_ptr(evsel, name, sample) \
	({ struct syscall_tp *fields = evsel->priv; \
	   fields->name.pointer(&fields->name, sample); })

/*
 * One syscall argument as handed to a beautifier ("scnprintf") callback:
 * its raw value, position (idx), the owning thread/trace and an optional
 * per-formatter parameter (e.g. a strarray).  "mask" flags args already
 * consumed by a previous formatter.
 */
struct syscall_arg {
	unsigned long val;
	struct thread *thread;
	struct trace  *trace;
	void	      *parm;
	u8	      idx;
	u8	      mask;
};

/* A value -> string table; "offset" is subtracted from the value first */
struct strarray {
	int	    offset;
	int	    nr_entries;
	const char **entries;
};

#define DEFINE_STRARRAY(array) struct strarray strarray__##array = { \
	.nr_entries = ARRAY_SIZE(array), \
	.entries = array, \
}

#define DEFINE_STRARRAY_OFFSET(array, off) struct strarray strarray__##array = { \
	.offset	    = off, \
	.nr_entries = ARRAY_SIZE(array), \
	.entries = array, \
}

/*
 * Print the string for arg->val from the strarray in arg->parm, falling
 * back to printing the raw value with "intfmt" when it is out of range.
 */
static size_t __syscall_arg__scnprintf_strarray(char *bf, size_t size,
						const char *intfmt,
						struct syscall_arg *arg)
{
	struct strarray *sa = arg->parm;
	int idx = arg->val - sa->offset;

	if (idx < 0 || idx >= sa->nr_entries)
		return scnprintf(bf, size, intfmt, arg->val);

	return
scnprintf(bf, size, "%s", sa->entries[idx]); 308 } 309 310 static size_t syscall_arg__scnprintf_strarray(char *bf, size_t size, 311 struct syscall_arg *arg) 312 { 313 return __syscall_arg__scnprintf_strarray(bf, size, "%d", arg); 314 } 315 316 #define SCA_STRARRAY syscall_arg__scnprintf_strarray 317 318 #if defined(__i386__) || defined(__x86_64__) 319 /* 320 * FIXME: Make this available to all arches as soon as the ioctl beautifier 321 * gets rewritten to support all arches. 322 */ 323 static size_t syscall_arg__scnprintf_strhexarray(char *bf, size_t size, 324 struct syscall_arg *arg) 325 { 326 return __syscall_arg__scnprintf_strarray(bf, size, "%#x", arg); 327 } 328 329 #define SCA_STRHEXARRAY syscall_arg__scnprintf_strhexarray 330 #endif /* defined(__i386__) || defined(__x86_64__) */ 331 332 static size_t syscall_arg__scnprintf_fd(char *bf, size_t size, 333 struct syscall_arg *arg); 334 335 #define SCA_FD syscall_arg__scnprintf_fd 336 337 static size_t syscall_arg__scnprintf_fd_at(char *bf, size_t size, 338 struct syscall_arg *arg) 339 { 340 int fd = arg->val; 341 342 if (fd == AT_FDCWD) 343 return scnprintf(bf, size, "CWD"); 344 345 return syscall_arg__scnprintf_fd(bf, size, arg); 346 } 347 348 #define SCA_FDAT syscall_arg__scnprintf_fd_at 349 350 static size_t syscall_arg__scnprintf_close_fd(char *bf, size_t size, 351 struct syscall_arg *arg); 352 353 #define SCA_CLOSE_FD syscall_arg__scnprintf_close_fd 354 355 static size_t syscall_arg__scnprintf_hex(char *bf, size_t size, 356 struct syscall_arg *arg) 357 { 358 return scnprintf(bf, size, "%#lx", arg->val); 359 } 360 361 #define SCA_HEX syscall_arg__scnprintf_hex 362 363 static size_t syscall_arg__scnprintf_int(char *bf, size_t size, 364 struct syscall_arg *arg) 365 { 366 return scnprintf(bf, size, "%d", arg->val); 367 } 368 369 #define SCA_INT syscall_arg__scnprintf_int 370 371 static const char *bpf_cmd[] = { 372 "MAP_CREATE", "MAP_LOOKUP_ELEM", "MAP_UPDATE_ELEM", "MAP_DELETE_ELEM", 373 "MAP_GET_NEXT_KEY", 
"PROG_LOAD", 374 }; 375 static DEFINE_STRARRAY(bpf_cmd); 376 377 static const char *epoll_ctl_ops[] = { "ADD", "DEL", "MOD", }; 378 static DEFINE_STRARRAY_OFFSET(epoll_ctl_ops, 1); 379 380 static const char *itimers[] = { "REAL", "VIRTUAL", "PROF", }; 381 static DEFINE_STRARRAY(itimers); 382 383 static const char *keyctl_options[] = { 384 "GET_KEYRING_ID", "JOIN_SESSION_KEYRING", "UPDATE", "REVOKE", "CHOWN", 385 "SETPERM", "DESCRIBE", "CLEAR", "LINK", "UNLINK", "SEARCH", "READ", 386 "INSTANTIATE", "NEGATE", "SET_REQKEY_KEYRING", "SET_TIMEOUT", 387 "ASSUME_AUTHORITY", "GET_SECURITY", "SESSION_TO_PARENT", "REJECT", 388 "INSTANTIATE_IOV", "INVALIDATE", "GET_PERSISTENT", 389 }; 390 static DEFINE_STRARRAY(keyctl_options); 391 392 static const char *whences[] = { "SET", "CUR", "END", 393 #ifdef SEEK_DATA 394 "DATA", 395 #endif 396 #ifdef SEEK_HOLE 397 "HOLE", 398 #endif 399 }; 400 static DEFINE_STRARRAY(whences); 401 402 static const char *fcntl_cmds[] = { 403 "DUPFD", "GETFD", "SETFD", "GETFL", "SETFL", "GETLK", "SETLK", 404 "SETLKW", "SETOWN", "GETOWN", "SETSIG", "GETSIG", "F_GETLK64", 405 "F_SETLK64", "F_SETLKW64", "F_SETOWN_EX", "F_GETOWN_EX", 406 "F_GETOWNER_UIDS", 407 }; 408 static DEFINE_STRARRAY(fcntl_cmds); 409 410 static const char *rlimit_resources[] = { 411 "CPU", "FSIZE", "DATA", "STACK", "CORE", "RSS", "NPROC", "NOFILE", 412 "MEMLOCK", "AS", "LOCKS", "SIGPENDING", "MSGQUEUE", "NICE", "RTPRIO", 413 "RTTIME", 414 }; 415 static DEFINE_STRARRAY(rlimit_resources); 416 417 static const char *sighow[] = { "BLOCK", "UNBLOCK", "SETMASK", }; 418 static DEFINE_STRARRAY(sighow); 419 420 static const char *clockid[] = { 421 "REALTIME", "MONOTONIC", "PROCESS_CPUTIME_ID", "THREAD_CPUTIME_ID", 422 "MONOTONIC_RAW", "REALTIME_COARSE", "MONOTONIC_COARSE", "BOOTTIME", 423 "REALTIME_ALARM", "BOOTTIME_ALARM", "SGI_CYCLE", "TAI" 424 }; 425 static DEFINE_STRARRAY(clockid); 426 427 static const char *socket_families[] = { 428 "UNSPEC", "LOCAL", "INET", "AX25", "IPX", "APPLETALK", 
"NETROM",
	"BRIDGE", "ATMPVC", "X25", "INET6", "ROSE", "DECnet", "NETBEUI",
	"SECURITY", "KEY", "NETLINK", "PACKET", "ASH", "ECONET", "ATMSVC",
	"RDS", "SNA", "IRDA", "PPPOX", "WANPIPE", "LLC", "IB", "CAN", "TIPC",
	"BLUETOOTH", "IUCV", "RXRPC", "ISDN", "PHONET", "IEEE802154", "CAIF",
	"ALG", "NFC", "VSOCK",
};
static DEFINE_STRARRAY(socket_families);

/*
 * Beautify access(2)'s mode: "F" for F_OK (0), otherwise a concatenation
 * of R/W/X for the set *_OK bits, with any leftover bits printed in hex.
 */
static size_t syscall_arg__scnprintf_access_mode(char *bf, size_t size,
						 struct syscall_arg *arg)
{
	size_t printed = 0;
	int mode = arg->val;

	if (mode == F_OK) /* 0 */
		return scnprintf(bf, size, "F");
#define P_MODE(n) \
	if (mode & n##_OK) { \
		printed += scnprintf(bf + printed, size - printed, "%s", #n); \
		mode &= ~n##_OK; \
	}

	P_MODE(R);
	P_MODE(W);
	P_MODE(X);
#undef P_MODE

	if (mode)
		printed += scnprintf(bf + printed, size - printed, "|%#x", mode);

	return printed;
}

#define SCA_ACCMODE syscall_arg__scnprintf_access_mode

static size_t syscall_arg__scnprintf_filename(char *bf, size_t size,
					      struct syscall_arg *arg);

#define SCA_FILENAME syscall_arg__scnprintf_filename

/*
 * Beautify pipe2(2)'s flags as a '|'-separated list (CLOEXEC, NONBLOCK),
 * with any unrecognized bits appended in hex.
 */
static size_t syscall_arg__scnprintf_pipe_flags(char *bf, size_t size,
						struct syscall_arg *arg)
{
	int printed = 0, flags = arg->val;

#define P_FLAG(n) \
	if (flags & O_##n) { \
		printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
		flags &= ~O_##n; \
	}

	P_FLAG(CLOEXEC);
	P_FLAG(NONBLOCK);
#undef P_FLAG

	if (flags)
		printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);

	return printed;
}

#define SCA_PIPE_FLAGS syscall_arg__scnprintf_pipe_flags

#if defined(__i386__) || defined(__x86_64__)
/*
 * FIXME: Make this available to all arches.
 */
#define TCGETS 0x5401

/*
 * tty ioctl request names, indexed by request number minus the TCGETS
 * base (see the DEFINE_STRARRAY_OFFSET below); designated initializers
 * skip the holes in the numbering.
 */
static const char *tioctls[] = {
	"TCGETS", "TCSETS", "TCSETSW", "TCSETSF", "TCGETA", "TCSETA", "TCSETAW",
	"TCSETAF", "TCSBRK", "TCXONC", "TCFLSH", "TIOCEXCL", "TIOCNXCL",
	"TIOCSCTTY", "TIOCGPGRP", "TIOCSPGRP", "TIOCOUTQ", "TIOCSTI",
	"TIOCGWINSZ", "TIOCSWINSZ", "TIOCMGET", "TIOCMBIS", "TIOCMBIC",
	"TIOCMSET", "TIOCGSOFTCAR", "TIOCSSOFTCAR", "FIONREAD", "TIOCLINUX",
	"TIOCCONS", "TIOCGSERIAL", "TIOCSSERIAL", "TIOCPKT", "FIONBIO",
	"TIOCNOTTY", "TIOCSETD", "TIOCGETD", "TCSBRKP", [0x27] = "TIOCSBRK",
	"TIOCCBRK", "TIOCGSID", "TCGETS2", "TCSETS2", "TCSETSW2", "TCSETSF2",
	"TIOCGRS485", "TIOCSRS485", "TIOCGPTN", "TIOCSPTLCK",
	"TIOCGDEV||TCGETX", "TCSETX", "TCSETXF", "TCSETXW", "TIOCSIG",
	"TIOCVHANGUP", "TIOCGPKT", "TIOCGPTLCK", "TIOCGEXCL",
	[0x50] = "FIONCLEX", "FIOCLEX", "FIOASYNC", "TIOCSERCONFIG",
	"TIOCSERGWILD", "TIOCSERSWILD", "TIOCGLCKTRMIOS", "TIOCSLCKTRMIOS",
	"TIOCSERGSTRUCT", "TIOCSERGETLSR", "TIOCSERGETMULTI", "TIOCSERSETMULTI",
	"TIOCMIWAIT", "TIOCGICOUNT", [0x60] = "FIOQSIZE",
};

static DEFINE_STRARRAY_OFFSET(tioctls, 0x5401);
#endif /* defined(__i386__) || defined(__x86_64__) */

/* Fallbacks for headers that predate getrandom(2); values per the Linux ABI */
#ifndef GRND_NONBLOCK
#define GRND_NONBLOCK	0x0001
#endif
#ifndef GRND_RANDOM
#define GRND_RANDOM	0x0002
#endif

/*
 * Beautify getrandom(2)'s flags as a '|'-separated list (RANDOM,
 * NONBLOCK), with any unrecognized bits appended in hex.
 */
static size_t syscall_arg__scnprintf_getrandom_flags(char *bf, size_t size,
						     struct syscall_arg *arg)
{
	int printed = 0, flags = arg->val;

#define P_FLAG(n) \
	if (flags & GRND_##n) { \
		printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
		flags &= ~GRND_##n; \
	}

	P_FLAG(RANDOM);
	P_FLAG(NONBLOCK);
#undef P_FLAG

	if (flags)
		printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ?
"|" : "", flags);

	return printed;
}

#define SCA_GETRANDOM_FLAGS syscall_arg__scnprintf_getrandom_flags

/* Shorthand for a syscall_fmt entry whose arg "arg" is a strarray lookup */
#define STRARRAY(arg, name, array) \
	  .arg_scnprintf = { [arg] = SCA_STRARRAY, }, \
	  .arg_parm	 = { [arg] = &strarray__##array, }

#include "trace/beauty/eventfd.c"
#include "trace/beauty/flock.c"
#include "trace/beauty/futex_op.c"
#include "trace/beauty/mmap.c"
#include "trace/beauty/mode_t.c"
#include "trace/beauty/msg_flags.c"
#include "trace/beauty/open_flags.c"
#include "trace/beauty/perf_event_open.c"
#include "trace/beauty/pid.c"
#include "trace/beauty/sched_policy.c"
#include "trace/beauty/seccomp.c"
#include "trace/beauty/signum.c"
#include "trace/beauty/socket_type.c"
#include "trace/beauty/waitid_options.c"

/*
 * Per-syscall formatting recipe: optional alias (kernel name differs from
 * the libc one), one beautifier callback + optional parameter per argument
 * slot, and flags controlling how the return value is rendered (errno
 * message, pid, timeout, hex).
 *
 * NOTE: syscall_fmt__find() below looks entries up with bsearch(), so this
 * table must stay sorted by .name.
 */
static struct syscall_fmt {
	const char *name;
	const char *alias;
	size_t	   (*arg_scnprintf[6])(char *bf, size_t size, struct syscall_arg *arg);
	void	   *arg_parm[6];
	bool	   errmsg;
	bool	   errpid;
	bool	   timeout;
	bool	   hexret;
} syscall_fmts[] = {
	{ .name	    = "access",	    .errmsg = true,
	  .arg_scnprintf = { [1] = SCA_ACCMODE, /* mode */ }, },
	{ .name	    = "arch_prctl", .errmsg = true, .alias = "prctl", },
	{ .name	    = "bpf",	    .errmsg = true, STRARRAY(0, cmd, bpf_cmd), },
	{ .name	    = "brk",	    .hexret = true,
	  .arg_scnprintf = { [0] = SCA_HEX, /* brk */ }, },
	{ .name	    = "chdir",	    .errmsg = true, },
	{ .name	    = "chmod",	    .errmsg = true, },
	{ .name	    = "chroot",	    .errmsg = true, },
	{ .name	    = "clock_gettime", .errmsg = true, STRARRAY(0, clk_id, clockid), },
	{ .name	    = "clone",	    .errpid = true, },
	{ .name	    = "close",	    .errmsg = true,
	  .arg_scnprintf = { [0] = SCA_CLOSE_FD, /* fd */ }, },
	{ .name	    = "connect",    .errmsg = true, },
	{ .name	    = "creat",	    .errmsg = true, },
	{ .name	    = "dup",	    .errmsg = true, },
	{ .name	    = "dup2",	    .errmsg = true, },
	{ .name	    = "dup3",	    .errmsg = true, },
	{ .name	    = "epoll_ctl",  .errmsg = true, STRARRAY(1, op, epoll_ctl_ops), },
	{ .name	    = "eventfd2",   .errmsg = true,
	  .arg_scnprintf = { [1] = SCA_EFD_FLAGS, /* flags */ }, },
	{ .name	    = "faccessat",  .errmsg = true, },
	{ .name	    = "fadvise64",  .errmsg = true, },
	{ .name	    = "fallocate",  .errmsg = true, },
	{ .name	    = "fchdir",	    .errmsg = true, },
	{ .name	    = "fchmod",	    .errmsg = true, },
	{ .name	    = "fchmodat",   .errmsg = true,
	  .arg_scnprintf = { [0] = SCA_FDAT, /* fd */ }, },
	{ .name	    = "fchown",	    .errmsg = true, },
	{ .name	    = "fchownat",   .errmsg = true,
	  .arg_scnprintf = { [0] = SCA_FDAT, /* fd */ }, },
	{ .name	    = "fcntl",	    .errmsg = true,
	  .arg_scnprintf = { [1] = SCA_STRARRAY, /* cmd */ },
	  .arg_parm	 = { [1] = &strarray__fcntl_cmds, /* cmd */ }, },
	{ .name	    = "fdatasync",  .errmsg = true, },
	{ .name	    = "flock",	    .errmsg = true,
	  .arg_scnprintf = { [1] = SCA_FLOCK, /* cmd */ }, },
	{ .name	    = "fsetxattr",  .errmsg = true, },
	{ .name	    = "fstat",	    .errmsg = true, .alias = "newfstat", },
	{ .name	    = "fstatat",    .errmsg = true, .alias = "newfstatat", },
	{ .name	    = "fstatfs",    .errmsg = true, },
	{ .name	    = "fsync",	    .errmsg = true, },
	{ .name	    = "ftruncate",  .errmsg = true, },
	{ .name	    = "futex",	    .errmsg = true,
	  .arg_scnprintf = { [1] = SCA_FUTEX_OP, /* op */ }, },
	{ .name	    = "futimesat",  .errmsg = true,
	  .arg_scnprintf = { [0] = SCA_FDAT, /* fd */ }, },
	{ .name	    = "getdents",   .errmsg = true, },
	{ .name	    = "getdents64", .errmsg = true, },
	{ .name	    = "getitimer",  .errmsg = true, STRARRAY(0, which, itimers), },
	{ .name	    = "getpid",	    .errpid = true, },
	{ .name	    = "getpgid",    .errpid = true, },
	{ .name	    = "getppid",    .errpid = true, },
	{ .name	    = "getrandom",  .errmsg = true,
	  .arg_scnprintf = { [2] = SCA_GETRANDOM_FLAGS, /* flags */ }, },
	{ .name	    = "getrlimit",  .errmsg = true, STRARRAY(0, resource, rlimit_resources), },
	{ .name	    = "getxattr",   .errmsg = true, },
	{ .name	    = "inotify_add_watch", .errmsg = true, },
	{ .name	    = "ioctl",	    .errmsg = true,
	  .arg_scnprintf = {
#if defined(__i386__) || defined(__x86_64__)
/*
 * FIXME: Make this available to all arches.
 */
			     [1] = SCA_STRHEXARRAY, /* cmd */
			     [2] = SCA_HEX, /* arg */ },
	  .arg_parm	 = { [1] = &strarray__tioctls, /* cmd */ }, },
#else
			     [2] = SCA_HEX, /* arg */ }, },
#endif
	{ .name	    = "keyctl",	    .errmsg = true, STRARRAY(0, option, keyctl_options), },
	{ .name	    = "kill",	    .errmsg = true,
	  .arg_scnprintf = { [1] = SCA_SIGNUM, /* sig */ }, },
	{ .name	    = "lchown",	    .errmsg = true, },
	{ .name	    = "lgetxattr",  .errmsg = true, },
	{ .name	    = "linkat",	    .errmsg = true,
	  .arg_scnprintf = { [0] = SCA_FDAT, /* fd */ }, },
	{ .name	    = "listxattr",  .errmsg = true, },
	{ .name	    = "llistxattr", .errmsg = true, },
	{ .name	    = "lremovexattr", .errmsg = true, },
	{ .name	    = "lseek",	    .errmsg = true,
	  .arg_scnprintf = { [2] = SCA_STRARRAY, /* whence */ },
	  .arg_parm	 = { [2] = &strarray__whences, /* whence */ }, },
	{ .name	    = "lsetxattr",  .errmsg = true, },
	{ .name	    = "lstat",	    .errmsg = true, .alias = "newlstat", },
	{ .name	    = "lsxattr",    .errmsg = true, },
	{ .name	    = "madvise",    .errmsg = true,
	  .arg_scnprintf = { [0] = SCA_HEX,	 /* start */
			     [2] = SCA_MADV_BHV, /* behavior */ }, },
	{ .name	    = "mkdir",	    .errmsg = true, },
	{ .name	    = "mkdirat",    .errmsg = true,
	  .arg_scnprintf = { [0] = SCA_FDAT, /* fd */ }, },
	{ .name	    = "mknod",	    .errmsg = true, },
	{ .name	    = "mknodat",    .errmsg = true,
	  .arg_scnprintf = { [0] = SCA_FDAT, /* fd */ }, },
	{ .name	    = "mlock",	    .errmsg = true,
	  .arg_scnprintf = { [0] = SCA_HEX, /* addr */ }, },
	{ .name	    = "mlockall",   .errmsg = true,
	  .arg_scnprintf = { [0] = SCA_HEX, /* addr */ }, },
	{ .name	    = "mmap",	    .hexret = true,
	  .arg_scnprintf = { [0] = SCA_HEX,	  /* addr */
			     [2] = SCA_MMAP_PROT, /* prot */
			     [3] = SCA_MMAP_FLAGS, /* flags */ }, },
	{ .name	    = "mprotect",   .errmsg = true,
	  .arg_scnprintf = { [0] = SCA_HEX, /* start */
			     [2] = SCA_MMAP_PROT, /* prot */ }, },
	{ .name	    = "mq_unlink", .errmsg = true,
	  .arg_scnprintf = { [0] = SCA_FILENAME, /* u_name */ }, },
	{ .name	    = "mremap",	    .hexret = true,
	  .arg_scnprintf = { [0] = SCA_HEX, /* addr */
			     [3] = SCA_MREMAP_FLAGS, /* flags */
			     [4] = SCA_HEX, /* new_addr */ }, },
	{ .name	    = "munlock",    .errmsg = true,
	  .arg_scnprintf = { [0] = SCA_HEX, /* addr */ }, },
	{ .name	    = "munmap",	    .errmsg = true,
	  .arg_scnprintf = { [0] = SCA_HEX, /* addr */ }, },
	{ .name	    = "name_to_handle_at", .errmsg = true,
	  .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
	{ .name	    = "newfstatat", .errmsg = true,
	  .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
	{ .name	    = "open",	    .errmsg = true,
	  .arg_scnprintf = { [1] = SCA_OPEN_FLAGS, /* flags */ }, },
	{ .name	    = "open_by_handle_at", .errmsg = true,
	  .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */
			     [2] = SCA_OPEN_FLAGS, /* flags */ }, },
	{ .name	    = "openat",	    .errmsg = true,
	  .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */
			     [2] = SCA_OPEN_FLAGS, /* flags */ }, },
	{ .name	    = "perf_event_open", .errmsg = true,
	  .arg_scnprintf = { [2] = SCA_INT, /* cpu */
			     [3] = SCA_FD,  /* group_fd */
			     [4] = SCA_PERF_FLAGS,  /* flags */ }, },
	{ .name	    = "pipe2",	    .errmsg = true,
	  .arg_scnprintf = { [1] = SCA_PIPE_FLAGS, /* flags */ }, },
	{ .name	    = "poll",	    .errmsg = true, .timeout = true, },
	{ .name	    = "ppoll",	    .errmsg = true, .timeout = true, },
	{ .name	    = "pread",	    .errmsg = true, .alias = "pread64", },
	{ .name	    = "preadv",	    .errmsg = true, .alias = "pread", },
	{ .name	    = "prlimit64",  .errmsg = true, STRARRAY(1, resource, rlimit_resources), },
	{ .name	    = "pwrite",	    .errmsg = true, .alias = "pwrite64", },
	{ .name	    = "pwritev",    .errmsg = true, },
	{ .name	    = "read",	    .errmsg = true, },
	{ .name	    = "readlink",   .errmsg = true, },
	{ .name	    = "readlinkat", .errmsg = true,
	  .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
	{ .name	    = "readv",	    .errmsg = true, },
	{ .name	    = "recvfrom",   .errmsg = true,
	  .arg_scnprintf = { [3] = SCA_MSG_FLAGS, /* flags */ }, },
	{ .name	    = "recvmmsg",   .errmsg = true,
	  .arg_scnprintf = { [3] = SCA_MSG_FLAGS, /* flags */ }, },
	{ .name	    = "recvmsg",    .errmsg = true,
	  .arg_scnprintf = { [2] = SCA_MSG_FLAGS, /* flags */ }, },
	{ .name	    = "removexattr", .errmsg = true, },
	{ .name	    = "renameat",   .errmsg = true,
	  .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
	{ .name	    = "rmdir",	    .errmsg = true, },
	{ .name	    = "rt_sigaction", .errmsg = true,
	  .arg_scnprintf = { [0] = SCA_SIGNUM, /* sig */ }, },
	{ .name	    = "rt_sigprocmask",  .errmsg = true, STRARRAY(0, how, sighow), },
	{ .name	    = "rt_sigqueueinfo", .errmsg = true,
	  .arg_scnprintf = { [1] = SCA_SIGNUM, /* sig */ }, },
	{ .name	    = "rt_tgsigqueueinfo", .errmsg = true,
	  .arg_scnprintf = { [2] = SCA_SIGNUM, /* sig */ }, },
	{ .name	    = "sched_setscheduler",   .errmsg = true,
	  .arg_scnprintf = { [1] = SCA_SCHED_POLICY, /* policy */ }, },
	{ .name	    = "seccomp", .errmsg = true,
	  .arg_scnprintf = { [0] = SCA_SECCOMP_OP, /* op */
			     [1] = SCA_SECCOMP_FLAGS, /* flags */ }, },
	{ .name	    = "select",	    .errmsg = true, .timeout = true, },
	{ .name	    = "sendmmsg",    .errmsg = true,
	  .arg_scnprintf = { [3] = SCA_MSG_FLAGS, /* flags */ }, },
	{ .name	    = "sendmsg",    .errmsg = true,
	  .arg_scnprintf = { [2] = SCA_MSG_FLAGS, /* flags */ }, },
	{ .name	    = "sendto",	    .errmsg = true,
	  .arg_scnprintf = { [3] = SCA_MSG_FLAGS, /* flags */ }, },
	{ .name	    = "set_tid_address", .errpid = true, },
	{ .name	    = "setitimer",  .errmsg = true, STRARRAY(0, which, itimers), },
	{ .name	    = "setpgid",    .errmsg = true, },
	{ .name	    = "setrlimit",  .errmsg = true, STRARRAY(0, resource, rlimit_resources), },
	{ .name	    = "setxattr",   .errmsg = true, },
	{ .name	    = "shutdown",   .errmsg = true, },
	{ .name	    = "socket",	    .errmsg = true,
	  .arg_scnprintf = { [0] = SCA_STRARRAY, /* family */
			     [1] = SCA_SK_TYPE, /* type */ },
	  .arg_parm	 = { [0] = &strarray__socket_families, /* family */ }, },
	{ .name	    = "socketpair", .errmsg = true,
	  .arg_scnprintf = { [0] = SCA_STRARRAY, /* family */
			     [1] = SCA_SK_TYPE, /* type */ },
	  .arg_parm	 = { [0] = &strarray__socket_families, /* family */ }, },
	{ .name	    = "stat",	    .errmsg = true, .alias = "newstat", },
	{ .name	    = "statfs",	    .errmsg = true, },
	{ .name	    = "swapoff",    .errmsg = true,
	  .arg_scnprintf = { [0] = SCA_FILENAME, /* specialfile */ }, },
	{ .name	    = "swapon",	    .errmsg = true,
	  .arg_scnprintf = { [0] = SCA_FILENAME, /* specialfile */ }, },
	{ .name	    = "symlinkat",  .errmsg = true,
	  .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
	{ .name	    = "tgkill",	    .errmsg = true,
	  .arg_scnprintf = { [2] = SCA_SIGNUM, /* sig */ }, },
	{ .name	    = "tkill",	    .errmsg = true,
	  .arg_scnprintf = { [1] = SCA_SIGNUM, /* sig */ }, },
	{ .name	    = "truncate",   .errmsg = true, },
	{ .name	    = "uname",	    .errmsg = true, .alias = "newuname", },
	{ .name	    = "unlinkat",   .errmsg = true,
	  .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
	{ .name	    = "utime",	    .errmsg = true, },
	{ .name	    = "utimensat",  .errmsg = true,
	  .arg_scnprintf = { [0] = SCA_FDAT, /* dirfd */ }, },
	{ .name	    = "utimes",	    .errmsg = true, },
	{ .name	    = "vmsplice",   .errmsg = true, },
	{ .name	    = "wait4",	    .errpid = true,
	  .arg_scnprintf = { [2] = SCA_WAITID_OPTIONS, /* options */ }, },
	{ .name	    = "waitid",	    .errpid = true,
	  .arg_scnprintf = { [3] = SCA_WAITID_OPTIONS, /* options */ }, },
	{ .name	    = "write",	    .errmsg = true, },
	{ .name	    = "writev",	    .errmsg = true, },
};

/* bsearch comparator: key is the syscall name string */
static int syscall_fmt__cmp(const void *name, const void *fmtp)
{
	const struct syscall_fmt *fmt = fmtp;
	return strcmp(name, fmt->name);
}

/* Binary-search syscall_fmts[] by name; NULL when there is no entry */
static struct syscall_fmt *syscall_fmt__find(const char *name)
{
const int nmemb = ARRAY_SIZE(syscall_fmts); 805 return bsearch(name, syscall_fmts, nmemb, sizeof(struct syscall_fmt), syscall_fmt__cmp); 806 } 807 808 struct syscall { 809 struct event_format *tp_format; 810 int nr_args; 811 struct format_field *args; 812 const char *name; 813 bool is_exit; 814 struct syscall_fmt *fmt; 815 size_t (**arg_scnprintf)(char *bf, size_t size, struct syscall_arg *arg); 816 void **arg_parm; 817 }; 818 819 static size_t fprintf_duration(unsigned long t, FILE *fp) 820 { 821 double duration = (double)t / NSEC_PER_MSEC; 822 size_t printed = fprintf(fp, "("); 823 824 if (duration >= 1.0) 825 printed += color_fprintf(fp, PERF_COLOR_RED, "%6.3f ms", duration); 826 else if (duration >= 0.01) 827 printed += color_fprintf(fp, PERF_COLOR_YELLOW, "%6.3f ms", duration); 828 else 829 printed += color_fprintf(fp, PERF_COLOR_NORMAL, "%6.3f ms", duration); 830 return printed + fprintf(fp, "): "); 831 } 832 833 /** 834 * filename.ptr: The filename char pointer that will be vfs_getname'd 835 * filename.entry_str_pos: Where to insert the string translated from 836 * filename.ptr by the vfs_getname tracepoint/kprobe. 
837 */ 838 struct thread_trace { 839 u64 entry_time; 840 u64 exit_time; 841 bool entry_pending; 842 unsigned long nr_events; 843 unsigned long pfmaj, pfmin; 844 char *entry_str; 845 double runtime_ms; 846 struct { 847 unsigned long ptr; 848 short int entry_str_pos; 849 bool pending_open; 850 unsigned int namelen; 851 char *name; 852 } filename; 853 struct { 854 int max; 855 char **table; 856 } paths; 857 858 struct intlist *syscall_stats; 859 }; 860 861 static struct thread_trace *thread_trace__new(void) 862 { 863 struct thread_trace *ttrace = zalloc(sizeof(struct thread_trace)); 864 865 if (ttrace) 866 ttrace->paths.max = -1; 867 868 ttrace->syscall_stats = intlist__new(NULL); 869 870 return ttrace; 871 } 872 873 static struct thread_trace *thread__trace(struct thread *thread, FILE *fp) 874 { 875 struct thread_trace *ttrace; 876 877 if (thread == NULL) 878 goto fail; 879 880 if (thread__priv(thread) == NULL) 881 thread__set_priv(thread, thread_trace__new()); 882 883 if (thread__priv(thread) == NULL) 884 goto fail; 885 886 ttrace = thread__priv(thread); 887 ++ttrace->nr_events; 888 889 return ttrace; 890 fail: 891 color_fprintf(fp, PERF_COLOR_RED, 892 "WARNING: not enough memory, dropping samples!\n"); 893 return NULL; 894 } 895 896 #define TRACE_PFMAJ (1 << 0) 897 #define TRACE_PFMIN (1 << 1) 898 899 static const size_t trace__entry_str_size = 2048; 900 901 static int trace__set_fd_pathname(struct thread *thread, int fd, const char *pathname) 902 { 903 struct thread_trace *ttrace = thread__priv(thread); 904 905 if (fd > ttrace->paths.max) { 906 char **npath = realloc(ttrace->paths.table, (fd + 1) * sizeof(char *)); 907 908 if (npath == NULL) 909 return -1; 910 911 if (ttrace->paths.max != -1) { 912 memset(npath + ttrace->paths.max + 1, 0, 913 (fd - ttrace->paths.max) * sizeof(char *)); 914 } else { 915 memset(npath, 0, (fd + 1) * sizeof(char *)); 916 } 917 918 ttrace->paths.table = npath; 919 ttrace->paths.max = fd; 920 } 921 922 ttrace->paths.table[fd] = 
strdup(pathname); 923 924 return ttrace->paths.table[fd] != NULL ? 0 : -1; 925 } 926 927 static int thread__read_fd_path(struct thread *thread, int fd) 928 { 929 char linkname[PATH_MAX], pathname[PATH_MAX]; 930 struct stat st; 931 int ret; 932 933 if (thread->pid_ == thread->tid) { 934 scnprintf(linkname, sizeof(linkname), 935 "/proc/%d/fd/%d", thread->pid_, fd); 936 } else { 937 scnprintf(linkname, sizeof(linkname), 938 "/proc/%d/task/%d/fd/%d", thread->pid_, thread->tid, fd); 939 } 940 941 if (lstat(linkname, &st) < 0 || st.st_size + 1 > (off_t)sizeof(pathname)) 942 return -1; 943 944 ret = readlink(linkname, pathname, sizeof(pathname)); 945 946 if (ret < 0 || ret > st.st_size) 947 return -1; 948 949 pathname[ret] = '\0'; 950 return trace__set_fd_pathname(thread, fd, pathname); 951 } 952 953 static const char *thread__fd_path(struct thread *thread, int fd, 954 struct trace *trace) 955 { 956 struct thread_trace *ttrace = thread__priv(thread); 957 958 if (ttrace == NULL) 959 return NULL; 960 961 if (fd < 0) 962 return NULL; 963 964 if ((fd > ttrace->paths.max || ttrace->paths.table[fd] == NULL)) { 965 if (!trace->live) 966 return NULL; 967 ++trace->stats.proc_getname; 968 if (thread__read_fd_path(thread, fd)) 969 return NULL; 970 } 971 972 return ttrace->paths.table[fd]; 973 } 974 975 static size_t syscall_arg__scnprintf_fd(char *bf, size_t size, 976 struct syscall_arg *arg) 977 { 978 int fd = arg->val; 979 size_t printed = scnprintf(bf, size, "%d", fd); 980 const char *path = thread__fd_path(arg->thread, fd, arg->trace); 981 982 if (path) 983 printed += scnprintf(bf + printed, size - printed, "<%s>", path); 984 985 return printed; 986 } 987 988 static size_t syscall_arg__scnprintf_close_fd(char *bf, size_t size, 989 struct syscall_arg *arg) 990 { 991 int fd = arg->val; 992 size_t printed = syscall_arg__scnprintf_fd(bf, size, arg); 993 struct thread_trace *ttrace = thread__priv(arg->thread); 994 995 if (ttrace && fd >= 0 && fd <= ttrace->paths.max) 996 
zfree(&ttrace->paths.table[fd]); 997 998 return printed; 999 } 1000 1001 static void thread__set_filename_pos(struct thread *thread, const char *bf, 1002 unsigned long ptr) 1003 { 1004 struct thread_trace *ttrace = thread__priv(thread); 1005 1006 ttrace->filename.ptr = ptr; 1007 ttrace->filename.entry_str_pos = bf - ttrace->entry_str; 1008 } 1009 1010 static size_t syscall_arg__scnprintf_filename(char *bf, size_t size, 1011 struct syscall_arg *arg) 1012 { 1013 unsigned long ptr = arg->val; 1014 1015 if (!arg->trace->vfs_getname) 1016 return scnprintf(bf, size, "%#x", ptr); 1017 1018 thread__set_filename_pos(arg->thread, bf, ptr); 1019 return 0; 1020 } 1021 1022 static bool trace__filter_duration(struct trace *trace, double t) 1023 { 1024 return t < (trace->duration_filter * NSEC_PER_MSEC); 1025 } 1026 1027 static size_t trace__fprintf_tstamp(struct trace *trace, u64 tstamp, FILE *fp) 1028 { 1029 double ts = (double)(tstamp - trace->base_time) / NSEC_PER_MSEC; 1030 1031 return fprintf(fp, "%10.3f ", ts); 1032 } 1033 1034 static bool done = false; 1035 static bool interrupted = false; 1036 1037 static void sig_handler(int sig) 1038 { 1039 done = true; 1040 interrupted = sig == SIGINT; 1041 } 1042 1043 static size_t trace__fprintf_entry_head(struct trace *trace, struct thread *thread, 1044 u64 duration, u64 tstamp, FILE *fp) 1045 { 1046 size_t printed = trace__fprintf_tstamp(trace, tstamp, fp); 1047 printed += fprintf_duration(duration, fp); 1048 1049 if (trace->multiple_threads) { 1050 if (trace->show_comm) 1051 printed += fprintf(fp, "%.14s/", thread__comm_str(thread)); 1052 printed += fprintf(fp, "%d ", thread->tid); 1053 } 1054 1055 return printed; 1056 } 1057 1058 static int trace__process_event(struct trace *trace, struct machine *machine, 1059 union perf_event *event, struct perf_sample *sample) 1060 { 1061 int ret = 0; 1062 1063 switch (event->header.type) { 1064 case PERF_RECORD_LOST: 1065 color_fprintf(trace->output, PERF_COLOR_RED, 1066 "LOST %" PRIu64 " 
events!\n", event->lost.lost); 1067 ret = machine__process_lost_event(machine, event, sample); 1068 break; 1069 default: 1070 ret = machine__process_event(machine, event, sample); 1071 break; 1072 } 1073 1074 return ret; 1075 } 1076 1077 static int trace__tool_process(struct perf_tool *tool, 1078 union perf_event *event, 1079 struct perf_sample *sample, 1080 struct machine *machine) 1081 { 1082 struct trace *trace = container_of(tool, struct trace, tool); 1083 return trace__process_event(trace, machine, event, sample); 1084 } 1085 1086 static char *trace__machine__resolve_kernel_addr(void *vmachine, unsigned long long *addrp, char **modp) 1087 { 1088 struct machine *machine = vmachine; 1089 1090 if (machine->kptr_restrict_warned) 1091 return NULL; 1092 1093 if (symbol_conf.kptr_restrict) { 1094 pr_warning("Kernel address maps (/proc/{kallsyms,modules}) are restricted.\n\n" 1095 "Check /proc/sys/kernel/kptr_restrict.\n\n" 1096 "Kernel samples will not be resolved.\n"); 1097 machine->kptr_restrict_warned = true; 1098 return NULL; 1099 } 1100 1101 return machine__resolve_kernel_addr(vmachine, addrp, modp); 1102 } 1103 1104 static int trace__symbols_init(struct trace *trace, struct perf_evlist *evlist) 1105 { 1106 int err = symbol__init(NULL); 1107 1108 if (err) 1109 return err; 1110 1111 trace->host = machine__new_host(); 1112 if (trace->host == NULL) 1113 return -ENOMEM; 1114 1115 if (trace_event__register_resolver(trace->host, trace__machine__resolve_kernel_addr) < 0) 1116 return -errno; 1117 1118 err = __machine__synthesize_threads(trace->host, &trace->tool, &trace->opts.target, 1119 evlist->threads, trace__tool_process, false, 1120 trace->opts.proc_map_timeout); 1121 if (err) 1122 symbol__exit(); 1123 1124 return err; 1125 } 1126 1127 static int syscall__set_arg_fmts(struct syscall *sc) 1128 { 1129 struct format_field *field; 1130 int idx = 0, len; 1131 1132 sc->arg_scnprintf = calloc(sc->nr_args, sizeof(void *)); 1133 if (sc->arg_scnprintf == NULL) 1134 return 
-1; 1135 1136 if (sc->fmt) 1137 sc->arg_parm = sc->fmt->arg_parm; 1138 1139 for (field = sc->args; field; field = field->next) { 1140 if (sc->fmt && sc->fmt->arg_scnprintf[idx]) 1141 sc->arg_scnprintf[idx] = sc->fmt->arg_scnprintf[idx]; 1142 else if (strcmp(field->type, "const char *") == 0 && 1143 (strcmp(field->name, "filename") == 0 || 1144 strcmp(field->name, "path") == 0 || 1145 strcmp(field->name, "pathname") == 0)) 1146 sc->arg_scnprintf[idx] = SCA_FILENAME; 1147 else if (field->flags & FIELD_IS_POINTER) 1148 sc->arg_scnprintf[idx] = syscall_arg__scnprintf_hex; 1149 else if (strcmp(field->type, "pid_t") == 0) 1150 sc->arg_scnprintf[idx] = SCA_PID; 1151 else if (strcmp(field->type, "umode_t") == 0) 1152 sc->arg_scnprintf[idx] = SCA_MODE_T; 1153 else if ((strcmp(field->type, "int") == 0 || 1154 strcmp(field->type, "unsigned int") == 0 || 1155 strcmp(field->type, "long") == 0) && 1156 (len = strlen(field->name)) >= 2 && 1157 strcmp(field->name + len - 2, "fd") == 0) { 1158 /* 1159 * /sys/kernel/tracing/events/syscalls/sys_enter* 1160 * egrep 'field:.*fd;' .../format|sed -r 's/.*field:([a-z ]+) [a-z_]*fd.+/\1/g'|sort|uniq -c 1161 * 65 int 1162 * 23 unsigned int 1163 * 7 unsigned long 1164 */ 1165 sc->arg_scnprintf[idx] = SCA_FD; 1166 } 1167 ++idx; 1168 } 1169 1170 return 0; 1171 } 1172 1173 static int trace__read_syscall_info(struct trace *trace, int id) 1174 { 1175 char tp_name[128]; 1176 struct syscall *sc; 1177 const char *name = syscalltbl__name(trace->sctbl, id); 1178 1179 if (name == NULL) 1180 return -1; 1181 1182 if (id > trace->syscalls.max) { 1183 struct syscall *nsyscalls = realloc(trace->syscalls.table, (id + 1) * sizeof(*sc)); 1184 1185 if (nsyscalls == NULL) 1186 return -1; 1187 1188 if (trace->syscalls.max != -1) { 1189 memset(nsyscalls + trace->syscalls.max + 1, 0, 1190 (id - trace->syscalls.max) * sizeof(*sc)); 1191 } else { 1192 memset(nsyscalls, 0, (id + 1) * sizeof(*sc)); 1193 } 1194 1195 trace->syscalls.table = nsyscalls; 1196 
trace->syscalls.max = id; 1197 } 1198 1199 sc = trace->syscalls.table + id; 1200 sc->name = name; 1201 1202 sc->fmt = syscall_fmt__find(sc->name); 1203 1204 snprintf(tp_name, sizeof(tp_name), "sys_enter_%s", sc->name); 1205 sc->tp_format = trace_event__tp_format("syscalls", tp_name); 1206 1207 if (IS_ERR(sc->tp_format) && sc->fmt && sc->fmt->alias) { 1208 snprintf(tp_name, sizeof(tp_name), "sys_enter_%s", sc->fmt->alias); 1209 sc->tp_format = trace_event__tp_format("syscalls", tp_name); 1210 } 1211 1212 if (IS_ERR(sc->tp_format)) 1213 return -1; 1214 1215 sc->args = sc->tp_format->format.fields; 1216 sc->nr_args = sc->tp_format->format.nr_fields; 1217 /* 1218 * We need to check and discard the first variable '__syscall_nr' 1219 * or 'nr' that mean the syscall number. It is needless here. 1220 * So drop '__syscall_nr' or 'nr' field but does not exist on older kernels. 1221 */ 1222 if (sc->args && (!strcmp(sc->args->name, "__syscall_nr") || !strcmp(sc->args->name, "nr"))) { 1223 sc->args = sc->args->next; 1224 --sc->nr_args; 1225 } 1226 1227 sc->is_exit = !strcmp(name, "exit_group") || !strcmp(name, "exit"); 1228 1229 return syscall__set_arg_fmts(sc); 1230 } 1231 1232 static int trace__validate_ev_qualifier(struct trace *trace) 1233 { 1234 int err = 0, i; 1235 struct str_node *pos; 1236 1237 trace->ev_qualifier_ids.nr = strlist__nr_entries(trace->ev_qualifier); 1238 trace->ev_qualifier_ids.entries = malloc(trace->ev_qualifier_ids.nr * 1239 sizeof(trace->ev_qualifier_ids.entries[0])); 1240 1241 if (trace->ev_qualifier_ids.entries == NULL) { 1242 fputs("Error:\tNot enough memory for allocating events qualifier ids\n", 1243 trace->output); 1244 err = -EINVAL; 1245 goto out; 1246 } 1247 1248 i = 0; 1249 1250 strlist__for_each(pos, trace->ev_qualifier) { 1251 const char *sc = pos->s; 1252 int id = syscalltbl__id(trace->sctbl, sc); 1253 1254 if (id < 0) { 1255 if (err == 0) { 1256 fputs("Error:\tInvalid syscall ", trace->output); 1257 err = -EINVAL; 1258 } else { 1259 
fputs(", ", trace->output); 1260 } 1261 1262 fputs(sc, trace->output); 1263 } 1264 1265 trace->ev_qualifier_ids.entries[i++] = id; 1266 } 1267 1268 if (err < 0) { 1269 fputs("\nHint:\ttry 'perf list syscalls:sys_enter_*'" 1270 "\nHint:\tand: 'man syscalls'\n", trace->output); 1271 zfree(&trace->ev_qualifier_ids.entries); 1272 trace->ev_qualifier_ids.nr = 0; 1273 } 1274 out: 1275 return err; 1276 } 1277 1278 /* 1279 * args is to be interpreted as a series of longs but we need to handle 1280 * 8-byte unaligned accesses. args points to raw_data within the event 1281 * and raw_data is guaranteed to be 8-byte unaligned because it is 1282 * preceded by raw_size which is a u32. So we need to copy args to a temp 1283 * variable to read it. Most notably this avoids extended load instructions 1284 * on unaligned addresses 1285 */ 1286 1287 static size_t syscall__scnprintf_args(struct syscall *sc, char *bf, size_t size, 1288 unsigned char *args, struct trace *trace, 1289 struct thread *thread) 1290 { 1291 size_t printed = 0; 1292 unsigned char *p; 1293 unsigned long val; 1294 1295 if (sc->args != NULL) { 1296 struct format_field *field; 1297 u8 bit = 1; 1298 struct syscall_arg arg = { 1299 .idx = 0, 1300 .mask = 0, 1301 .trace = trace, 1302 .thread = thread, 1303 }; 1304 1305 for (field = sc->args; field; 1306 field = field->next, ++arg.idx, bit <<= 1) { 1307 if (arg.mask & bit) 1308 continue; 1309 1310 /* special care for unaligned accesses */ 1311 p = args + sizeof(unsigned long) * arg.idx; 1312 memcpy(&val, p, sizeof(val)); 1313 1314 /* 1315 * Suppress this argument if its value is zero and 1316 * and we don't have a string associated in an 1317 * strarray for it. 1318 */ 1319 if (val == 0 && 1320 !(sc->arg_scnprintf && 1321 sc->arg_scnprintf[arg.idx] == SCA_STRARRAY && 1322 sc->arg_parm[arg.idx])) 1323 continue; 1324 1325 printed += scnprintf(bf + printed, size - printed, 1326 "%s%s: ", printed ? 
", " : "", field->name); 1327 if (sc->arg_scnprintf && sc->arg_scnprintf[arg.idx]) { 1328 arg.val = val; 1329 if (sc->arg_parm) 1330 arg.parm = sc->arg_parm[arg.idx]; 1331 printed += sc->arg_scnprintf[arg.idx](bf + printed, 1332 size - printed, &arg); 1333 } else { 1334 printed += scnprintf(bf + printed, size - printed, 1335 "%ld", val); 1336 } 1337 } 1338 } else if (IS_ERR(sc->tp_format)) { 1339 /* 1340 * If we managed to read the tracepoint /format file, then we 1341 * may end up not having any args, like with gettid(), so only 1342 * print the raw args when we didn't manage to read it. 1343 */ 1344 int i = 0; 1345 1346 while (i < 6) { 1347 /* special care for unaligned accesses */ 1348 p = args + sizeof(unsigned long) * i; 1349 memcpy(&val, p, sizeof(val)); 1350 printed += scnprintf(bf + printed, size - printed, 1351 "%sarg%d: %ld", 1352 printed ? ", " : "", i, val); 1353 ++i; 1354 } 1355 } 1356 1357 return printed; 1358 } 1359 1360 typedef int (*tracepoint_handler)(struct trace *trace, struct perf_evsel *evsel, 1361 union perf_event *event, 1362 struct perf_sample *sample); 1363 1364 static struct syscall *trace__syscall_info(struct trace *trace, 1365 struct perf_evsel *evsel, int id) 1366 { 1367 1368 if (id < 0) { 1369 1370 /* 1371 * XXX: Noticed on x86_64, reproduced as far back as 3.0.36, haven't tried 1372 * before that, leaving at a higher verbosity level till that is 1373 * explained. Reproduced with plain ftrace with: 1374 * 1375 * echo 1 > /t/events/raw_syscalls/sys_exit/enable 1376 * grep "NR -1 " /t/trace_pipe 1377 * 1378 * After generating some load on the machine. 
1379 */ 1380 if (verbose > 1) { 1381 static u64 n; 1382 fprintf(trace->output, "Invalid syscall %d id, skipping (%s, %" PRIu64 ") ...\n", 1383 id, perf_evsel__name(evsel), ++n); 1384 } 1385 return NULL; 1386 } 1387 1388 if ((id > trace->syscalls.max || trace->syscalls.table[id].name == NULL) && 1389 trace__read_syscall_info(trace, id)) 1390 goto out_cant_read; 1391 1392 if ((id > trace->syscalls.max || trace->syscalls.table[id].name == NULL)) 1393 goto out_cant_read; 1394 1395 return &trace->syscalls.table[id]; 1396 1397 out_cant_read: 1398 if (verbose) { 1399 fprintf(trace->output, "Problems reading syscall %d", id); 1400 if (id <= trace->syscalls.max && trace->syscalls.table[id].name != NULL) 1401 fprintf(trace->output, "(%s)", trace->syscalls.table[id].name); 1402 fputs(" information\n", trace->output); 1403 } 1404 return NULL; 1405 } 1406 1407 static void thread__update_stats(struct thread_trace *ttrace, 1408 int id, struct perf_sample *sample) 1409 { 1410 struct int_node *inode; 1411 struct stats *stats; 1412 u64 duration = 0; 1413 1414 inode = intlist__findnew(ttrace->syscall_stats, id); 1415 if (inode == NULL) 1416 return; 1417 1418 stats = inode->priv; 1419 if (stats == NULL) { 1420 stats = malloc(sizeof(struct stats)); 1421 if (stats == NULL) 1422 return; 1423 init_stats(stats); 1424 inode->priv = stats; 1425 } 1426 1427 if (ttrace->entry_time && sample->time > ttrace->entry_time) 1428 duration = sample->time - ttrace->entry_time; 1429 1430 update_stats(stats, duration); 1431 } 1432 1433 static int trace__printf_interrupted_entry(struct trace *trace, struct perf_sample *sample) 1434 { 1435 struct thread_trace *ttrace; 1436 u64 duration; 1437 size_t printed; 1438 1439 if (trace->current == NULL) 1440 return 0; 1441 1442 ttrace = thread__priv(trace->current); 1443 1444 if (!ttrace->entry_pending) 1445 return 0; 1446 1447 duration = sample->time - ttrace->entry_time; 1448 1449 printed = trace__fprintf_entry_head(trace, trace->current, duration, sample->time, 
trace->output); 1450 printed += fprintf(trace->output, "%-70s) ...\n", ttrace->entry_str); 1451 ttrace->entry_pending = false; 1452 1453 return printed; 1454 } 1455 1456 static int trace__sys_enter(struct trace *trace, struct perf_evsel *evsel, 1457 union perf_event *event __maybe_unused, 1458 struct perf_sample *sample) 1459 { 1460 char *msg; 1461 void *args; 1462 size_t printed = 0; 1463 struct thread *thread; 1464 int id = perf_evsel__sc_tp_uint(evsel, id, sample), err = -1; 1465 struct syscall *sc = trace__syscall_info(trace, evsel, id); 1466 struct thread_trace *ttrace; 1467 1468 if (sc == NULL) 1469 return -1; 1470 1471 thread = machine__findnew_thread(trace->host, sample->pid, sample->tid); 1472 ttrace = thread__trace(thread, trace->output); 1473 if (ttrace == NULL) 1474 goto out_put; 1475 1476 args = perf_evsel__sc_tp_ptr(evsel, args, sample); 1477 1478 if (ttrace->entry_str == NULL) { 1479 ttrace->entry_str = malloc(trace__entry_str_size); 1480 if (!ttrace->entry_str) 1481 goto out_put; 1482 } 1483 1484 if (!(trace->duration_filter || trace->summary_only || trace->min_stack)) 1485 trace__printf_interrupted_entry(trace, sample); 1486 1487 ttrace->entry_time = sample->time; 1488 msg = ttrace->entry_str; 1489 printed += scnprintf(msg + printed, trace__entry_str_size - printed, "%s(", sc->name); 1490 1491 printed += syscall__scnprintf_args(sc, msg + printed, trace__entry_str_size - printed, 1492 args, trace, thread); 1493 1494 if (sc->is_exit) { 1495 if (!(trace->duration_filter || trace->summary_only || trace->min_stack)) { 1496 trace__fprintf_entry_head(trace, thread, 1, sample->time, trace->output); 1497 fprintf(trace->output, "%-70s)\n", ttrace->entry_str); 1498 } 1499 } else { 1500 ttrace->entry_pending = true; 1501 /* See trace__vfs_getname & trace__sys_exit */ 1502 ttrace->filename.pending_open = false; 1503 } 1504 1505 if (trace->current != thread) { 1506 thread__put(trace->current); 1507 trace->current = thread__get(thread); 1508 } 1509 err = 0; 1510 
out_put: 1511 thread__put(thread); 1512 return err; 1513 } 1514 1515 static int trace__resolve_callchain(struct trace *trace, struct perf_evsel *evsel, 1516 struct perf_sample *sample, 1517 struct callchain_cursor *cursor) 1518 { 1519 struct addr_location al; 1520 1521 if (machine__resolve(trace->host, &al, sample) < 0 || 1522 thread__resolve_callchain(al.thread, cursor, evsel, sample, NULL, NULL, trace->max_stack)) 1523 return -1; 1524 1525 return 0; 1526 } 1527 1528 static int trace__fprintf_callchain(struct trace *trace, struct perf_sample *sample) 1529 { 1530 /* TODO: user-configurable print_opts */ 1531 const unsigned int print_opts = EVSEL__PRINT_SYM | 1532 EVSEL__PRINT_DSO | 1533 EVSEL__PRINT_UNKNOWN_AS_ADDR; 1534 1535 return sample__fprintf_callchain(sample, 38, print_opts, &callchain_cursor, trace->output); 1536 } 1537 1538 static int trace__sys_exit(struct trace *trace, struct perf_evsel *evsel, 1539 union perf_event *event __maybe_unused, 1540 struct perf_sample *sample) 1541 { 1542 long ret; 1543 u64 duration = 0; 1544 struct thread *thread; 1545 int id = perf_evsel__sc_tp_uint(evsel, id, sample), err = -1, callchain_ret = 0; 1546 struct syscall *sc = trace__syscall_info(trace, evsel, id); 1547 struct thread_trace *ttrace; 1548 1549 if (sc == NULL) 1550 return -1; 1551 1552 thread = machine__findnew_thread(trace->host, sample->pid, sample->tid); 1553 ttrace = thread__trace(thread, trace->output); 1554 if (ttrace == NULL) 1555 goto out_put; 1556 1557 if (trace->summary) 1558 thread__update_stats(ttrace, id, sample); 1559 1560 ret = perf_evsel__sc_tp_uint(evsel, ret, sample); 1561 1562 if (id == trace->open_id && ret >= 0 && ttrace->filename.pending_open) { 1563 trace__set_fd_pathname(thread, ret, ttrace->filename.name); 1564 ttrace->filename.pending_open = false; 1565 ++trace->stats.vfs_getname; 1566 } 1567 1568 ttrace->exit_time = sample->time; 1569 1570 if (ttrace->entry_time) { 1571 duration = sample->time - ttrace->entry_time; 1572 if 
(trace__filter_duration(trace, duration)) 1573 goto out; 1574 } else if (trace->duration_filter) 1575 goto out; 1576 1577 if (sample->callchain) { 1578 callchain_ret = trace__resolve_callchain(trace, evsel, sample, &callchain_cursor); 1579 if (callchain_ret == 0) { 1580 if (callchain_cursor.nr < trace->min_stack) 1581 goto out; 1582 callchain_ret = 1; 1583 } 1584 } 1585 1586 if (trace->summary_only) 1587 goto out; 1588 1589 trace__fprintf_entry_head(trace, thread, duration, sample->time, trace->output); 1590 1591 if (ttrace->entry_pending) { 1592 fprintf(trace->output, "%-70s", ttrace->entry_str); 1593 } else { 1594 fprintf(trace->output, " ... ["); 1595 color_fprintf(trace->output, PERF_COLOR_YELLOW, "continued"); 1596 fprintf(trace->output, "]: %s()", sc->name); 1597 } 1598 1599 if (sc->fmt == NULL) { 1600 signed_print: 1601 fprintf(trace->output, ") = %ld", ret); 1602 } else if (ret < 0 && (sc->fmt->errmsg || sc->fmt->errpid)) { 1603 char bf[STRERR_BUFSIZE]; 1604 const char *emsg = strerror_r(-ret, bf, sizeof(bf)), 1605 *e = audit_errno_to_name(-ret); 1606 1607 fprintf(trace->output, ") = -1 %s %s", e, emsg); 1608 } else if (ret == 0 && sc->fmt->timeout) 1609 fprintf(trace->output, ") = 0 Timeout"); 1610 else if (sc->fmt->hexret) 1611 fprintf(trace->output, ") = %#lx", ret); 1612 else if (sc->fmt->errpid) { 1613 struct thread *child = machine__find_thread(trace->host, ret, ret); 1614 1615 if (child != NULL) { 1616 fprintf(trace->output, ") = %ld", ret); 1617 if (child->comm_set) 1618 fprintf(trace->output, " (%s)", thread__comm_str(child)); 1619 thread__put(child); 1620 } 1621 } else 1622 goto signed_print; 1623 1624 fputc('\n', trace->output); 1625 1626 if (callchain_ret > 0) 1627 trace__fprintf_callchain(trace, sample); 1628 else if (callchain_ret < 0) 1629 pr_err("Problem processing %s callchain, skipping...\n", perf_evsel__name(evsel)); 1630 out: 1631 ttrace->entry_pending = false; 1632 err = 0; 1633 out_put: 1634 thread__put(thread); 1635 return err; 1636 } 

/*
 * probe:vfs_getname handler: remember the resolved pathname for the
 * thread and, when a syscall entry line is pending, splice the name
 * into entry_str at the position recorded by
 * syscall_arg__scnprintf_filename(), truncating from the left if the
 * buffer would overflow.
 */
static int trace__vfs_getname(struct trace *trace, struct perf_evsel *evsel,
			      union perf_event *event __maybe_unused,
			      struct perf_sample *sample)
{
	struct thread *thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
	struct thread_trace *ttrace;
	size_t filename_len, entry_str_len, to_move;
	ssize_t remaining_space;
	char *pos;
	const char *filename = perf_evsel__rawptr(evsel, sample, "pathname");

	if (!thread)
		goto out;

	ttrace = thread__priv(thread);
	if (!ttrace)
		goto out;

	filename_len = strlen(filename);

	if (ttrace->filename.namelen < filename_len) {
		char *f = realloc(ttrace->filename.name, filename_len + 1);

		if (f == NULL)
			goto out;

		ttrace->filename.namelen = filename_len;
		ttrace->filename.name = f;
	}

	strcpy(ttrace->filename.name, filename);
	ttrace->filename.pending_open = true;

	if (!ttrace->filename.ptr)
		goto out;

	entry_str_len = strlen(ttrace->entry_str);
	remaining_space = trace__entry_str_size - entry_str_len - 1; /* \0 */
	if (remaining_space <= 0)
		goto out;

	if (filename_len > (size_t)remaining_space) {
		filename += filename_len - remaining_space;
		filename_len = remaining_space;
	}

	/* make room, then insert the filename at the recorded position */
	to_move = entry_str_len - ttrace->filename.entry_str_pos + 1; /* \0 */
	pos = ttrace->entry_str + ttrace->filename.entry_str_pos;
	memmove(pos + filename_len, pos, to_move);
	memcpy(pos, filename, filename_len);

	ttrace->filename.ptr = 0;
	ttrace->filename.entry_str_pos = 0;
out:
	return 0;
}

/*
 * sched:sched_stat_runtime handler: accumulate on-CPU time per thread
 * and globally; dumps the raw fields when per-thread state is missing.
 */
static int trace__sched_stat_runtime(struct trace *trace, struct perf_evsel *evsel,
				     union perf_event *event __maybe_unused,
				     struct perf_sample *sample)
{
        u64 runtime = perf_evsel__intval(evsel, sample, "runtime");
	double runtime_ms = (double)runtime / NSEC_PER_MSEC;
	struct thread *thread = machine__findnew_thread(trace->host,
							sample->pid,
							sample->tid);
	struct thread_trace *ttrace = thread__trace(thread, trace->output);

	if (ttrace == NULL)
		goto out_dump;

	ttrace->runtime_ms += runtime_ms;
	trace->runtime_ms += runtime_ms;
	thread__put(thread);
	return 0;

out_dump:
	fprintf(trace->output, "%s: comm=%s,pid=%u,runtime=%" PRIu64 ",vruntime=%" PRIu64 ")\n",
	       evsel->name,
	       perf_evsel__strval(evsel, sample, "comm"),
	       (pid_t)perf_evsel__intval(evsel, sample, "pid"),
	       runtime,
	       perf_evsel__intval(evsel, sample, "vruntime"));
	thread__put(thread);
	return 0;
}

/*
 * print_binary() callback: emit only the printable-char view of BPF
 * output payloads, replacing non-printables with '.'.
 */
static void bpf_output__printer(enum binary_printer_ops op,
				unsigned int val, void *extra)
{
	FILE *output = extra;
	unsigned char ch = (unsigned char)val;

	switch (op) {
	case BINARY_PRINT_CHAR_DATA:
		fprintf(output, "%c", isprint(ch) ? ch : '.');
		break;
	case BINARY_PRINT_DATA_BEGIN:
	case BINARY_PRINT_LINE_BEGIN:
	case BINARY_PRINT_ADDR:
	case BINARY_PRINT_NUM_DATA:
	case BINARY_PRINT_NUM_PAD:
	case BINARY_PRINT_SEP:
	case BINARY_PRINT_CHAR_PAD:
	case BINARY_PRINT_LINE_END:
	case BINARY_PRINT_DATA_END:
	default:
		break;
	}
}

static void bpf_output__fprintf(struct trace *trace,
				struct perf_sample *sample)
{
	print_binary(sample->raw_data, sample->raw_size, 8,
		     bpf_output__printer, trace->output);
}

/*
 * Handler for --event tracepoints and BPF output events: prints a
 * timestamped line with the event's formatted payload and optional
 * callchain.
 */
static int trace__event_handler(struct trace *trace, struct perf_evsel *evsel,
				union perf_event *event __maybe_unused,
				struct perf_sample *sample)
{
	int callchain_ret = 0;

	if (sample->callchain) {
		callchain_ret = trace__resolve_callchain(trace, evsel, sample, &callchain_cursor);
		if (callchain_ret == 0) {
			if (callchain_cursor.nr < trace->min_stack)
				goto out;
			callchain_ret = 1;
		}
	}

	trace__printf_interrupted_entry(trace, sample);
	trace__fprintf_tstamp(trace, sample->time, trace->output);

	if (trace->trace_syscalls)
		fprintf(trace->output, "(         ): ");

	fprintf(trace->output, "%s:", evsel->name);

	if (perf_evsel__is_bpf_output(evsel)) {
		bpf_output__fprintf(trace, sample);
	} else if (evsel->tp_format) {
		event_format__fprintf(evsel->tp_format, sample->cpu,
				      sample->raw_data, sample->raw_size,
				      trace->output);
	}

	fprintf(trace->output, ")\n");

	if (callchain_ret > 0)
		trace__fprintf_callchain(trace, sample);
	else if (callchain_ret < 0)
		pr_err("Problem processing %s callchain, skipping...\n", perf_evsel__name(evsel));
out:
	return 0;
}

/* Print "dso@sym+off" / raw address for a fault location, as requested. */
static void print_location(FILE *f, struct perf_sample *sample,
			   struct addr_location *al,
			   bool print_dso, bool print_sym)
{

	if ((verbose || print_dso) && al->map)
		fprintf(f, "%s@", al->map->dso->long_name);

	if ((verbose || print_sym) && al->sym)
		fprintf(f, "%s+0x%" PRIx64, al->sym->name,
			al->addr - al->sym->start);
	else if (al->map)
		fprintf(f, "0x%" PRIx64, al->addr);
	else
		fprintf(f, "0x%" PRIx64, sample->addr);
}

/*
 * Page fault (maj/min) handler: counts per-thread faults and prints
 * "[maj|min]fault [ip] => addr (map_type:level)" with the faulting
 * address classified as data ('d'), executable ('x') or unknown ('?').
 */
static int trace__pgfault(struct trace *trace,
			  struct perf_evsel *evsel,
			  union perf_event *event __maybe_unused,
			  struct perf_sample *sample)
{
	struct thread *thread;
	struct addr_location al;
	char map_type = 'd';
	struct thread_trace *ttrace;
	int err = -1;
	int callchain_ret = 0;

	thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);

	if (sample->callchain) {
		callchain_ret = trace__resolve_callchain(trace, evsel, sample, &callchain_cursor);
		if (callchain_ret == 0) {
			if (callchain_cursor.nr < trace->min_stack)
				goto out_put;
			callchain_ret = 1;
		}
	}

	ttrace = thread__trace(thread, trace->output);
	if (ttrace == NULL)
		goto out_put;

	if (evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ)
		ttrace->pfmaj++;
	else
		ttrace->pfmin++;

	if (trace->summary_only)
		goto out;

	thread__find_addr_location(thread, sample->cpumode, MAP__FUNCTION,
			      sample->ip, &al);

	trace__fprintf_entry_head(trace, thread, 0, sample->time, trace->output);

	fprintf(trace->output, "%sfault [",
		evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ ?
		"maj" : "min");

	print_location(trace->output, sample, &al, false, true);

	fprintf(trace->output, "] => ");

	thread__find_addr_location(thread, sample->cpumode, MAP__VARIABLE,
				   sample->addr, &al);

	if (!al.map) {
		thread__find_addr_location(thread, sample->cpumode,
					   MAP__FUNCTION, sample->addr, &al);

		if (al.map)
			map_type = 'x';
		else
			map_type = '?';
	}

	print_location(trace->output, sample, &al, true, false);

	fprintf(trace->output, " (%c%c)\n", map_type, al.level);

	if (callchain_ret > 0)
		trace__fprintf_callchain(trace, sample);
	else if (callchain_ret < 0)
		pr_err("Problem processing %s callchain, skipping...\n", perf_evsel__name(evsel));
out:
	err = 0;
out_put:
	thread__put(thread);
	return err;
}

/* true when the sample matches neither the --pid nor the --tid filters. */
static bool skip_sample(struct trace *trace, struct perf_sample *sample)
{
	if ((trace->pid_list && intlist__find(trace->pid_list, sample->pid)) ||
	    (trace->tid_list && intlist__find(trace->tid_list, sample->tid)))
		return false;

	if (trace->pid_list || trace->tid_list)
		return true;

	return false;
}

static void trace__set_base_time(struct trace *trace,
				 struct perf_evsel *evsel,
				 struct perf_sample *sample)
{
	/*
	 * BPF events were not setting PERF_SAMPLE_TIME, so be more robust
	 * and don't use
sample->time unconditionally, we may end up having 1909 * some other event in the future without PERF_SAMPLE_TIME for good 1910 * reason, i.e. we may not be interested in its timestamps, just in 1911 * it taking place, picking some piece of information when it 1912 * appears in our event stream (vfs_getname comes to mind). 1913 */ 1914 if (trace->base_time == 0 && !trace->full_time && 1915 (evsel->attr.sample_type & PERF_SAMPLE_TIME)) 1916 trace->base_time = sample->time; 1917 } 1918 1919 static int trace__process_sample(struct perf_tool *tool, 1920 union perf_event *event, 1921 struct perf_sample *sample, 1922 struct perf_evsel *evsel, 1923 struct machine *machine __maybe_unused) 1924 { 1925 struct trace *trace = container_of(tool, struct trace, tool); 1926 int err = 0; 1927 1928 tracepoint_handler handler = evsel->handler; 1929 1930 if (skip_sample(trace, sample)) 1931 return 0; 1932 1933 trace__set_base_time(trace, evsel, sample); 1934 1935 if (handler) { 1936 ++trace->nr_events; 1937 handler(trace, evsel, event, sample); 1938 } 1939 1940 return err; 1941 } 1942 1943 static int parse_target_str(struct trace *trace) 1944 { 1945 if (trace->opts.target.pid) { 1946 trace->pid_list = intlist__new(trace->opts.target.pid); 1947 if (trace->pid_list == NULL) { 1948 pr_err("Error parsing process id string\n"); 1949 return -EINVAL; 1950 } 1951 } 1952 1953 if (trace->opts.target.tid) { 1954 trace->tid_list = intlist__new(trace->opts.target.tid); 1955 if (trace->tid_list == NULL) { 1956 pr_err("Error parsing thread id string\n"); 1957 return -EINVAL; 1958 } 1959 } 1960 1961 return 0; 1962 } 1963 1964 static int trace__record(struct trace *trace, int argc, const char **argv) 1965 { 1966 unsigned int rec_argc, i, j; 1967 const char **rec_argv; 1968 const char * const record_args[] = { 1969 "record", 1970 "-R", 1971 "-m", "1024", 1972 "-c", "1", 1973 }; 1974 1975 const char * const sc_args[] = { "-e", }; 1976 unsigned int sc_args_nr = ARRAY_SIZE(sc_args); 1977 const char * 
const majpf_args[] = { "-e", "major-faults" }; 1978 unsigned int majpf_args_nr = ARRAY_SIZE(majpf_args); 1979 const char * const minpf_args[] = { "-e", "minor-faults" }; 1980 unsigned int minpf_args_nr = ARRAY_SIZE(minpf_args); 1981 1982 /* +1 is for the event string below */ 1983 rec_argc = ARRAY_SIZE(record_args) + sc_args_nr + 1 + 1984 majpf_args_nr + minpf_args_nr + argc; 1985 rec_argv = calloc(rec_argc + 1, sizeof(char *)); 1986 1987 if (rec_argv == NULL) 1988 return -ENOMEM; 1989 1990 j = 0; 1991 for (i = 0; i < ARRAY_SIZE(record_args); i++) 1992 rec_argv[j++] = record_args[i]; 1993 1994 if (trace->trace_syscalls) { 1995 for (i = 0; i < sc_args_nr; i++) 1996 rec_argv[j++] = sc_args[i]; 1997 1998 /* event string may be different for older kernels - e.g., RHEL6 */ 1999 if (is_valid_tracepoint("raw_syscalls:sys_enter")) 2000 rec_argv[j++] = "raw_syscalls:sys_enter,raw_syscalls:sys_exit"; 2001 else if (is_valid_tracepoint("syscalls:sys_enter")) 2002 rec_argv[j++] = "syscalls:sys_enter,syscalls:sys_exit"; 2003 else { 2004 pr_err("Neither raw_syscalls nor syscalls events exist.\n"); 2005 return -1; 2006 } 2007 } 2008 2009 if (trace->trace_pgfaults & TRACE_PFMAJ) 2010 for (i = 0; i < majpf_args_nr; i++) 2011 rec_argv[j++] = majpf_args[i]; 2012 2013 if (trace->trace_pgfaults & TRACE_PFMIN) 2014 for (i = 0; i < minpf_args_nr; i++) 2015 rec_argv[j++] = minpf_args[i]; 2016 2017 for (i = 0; i < (unsigned int)argc; i++) 2018 rec_argv[j++] = argv[i]; 2019 2020 return cmd_record(j, rec_argv, NULL); 2021 } 2022 2023 static size_t trace__fprintf_thread_summary(struct trace *trace, FILE *fp); 2024 2025 static bool perf_evlist__add_vfs_getname(struct perf_evlist *evlist) 2026 { 2027 struct perf_evsel *evsel = perf_evsel__newtp("probe", "vfs_getname"); 2028 2029 if (IS_ERR(evsel)) 2030 return false; 2031 2032 if (perf_evsel__field(evsel, "pathname") == NULL) { 2033 perf_evsel__delete(evsel); 2034 return false; 2035 } 2036 2037 evsel->handler = trace__vfs_getname; 2038 
	perf_evlist__add(evlist, evsel);
	return true;
}

/*
 * Create a software page fault event (major or minor depending on
 * 'config') with mmap_data set so the faulting address can be resolved,
 * sampling every fault (period 1).  Returns NULL on allocation failure.
 */
static struct perf_evsel *perf_evsel__new_pgfault(u64 config)
{
	struct perf_evsel *evsel;
	struct perf_event_attr attr = {
		.type = PERF_TYPE_SOFTWARE,
		.mmap_data = 1,
	};

	attr.config = config;
	attr.sample_period = 1;

	event_attr_init(&attr);

	evsel = perf_evsel__new(&attr);
	if (evsel)
		evsel->handler = trace__pgfault;

	return evsel;
}

/*
 * Demultiplex one mmap'ed event: non-sample records go to the generic
 * event machinery; samples are matched back to their evsel by sample id
 * and handed to that evsel's handler.
 */
static void trace__handle_event(struct trace *trace, union perf_event *event, struct perf_sample *sample)
{
	const u32 type = event->header.type;
	struct perf_evsel *evsel;

	if (type != PERF_RECORD_SAMPLE) {
		trace__process_event(trace, trace->host, event, sample);
		return;
	}

	evsel = perf_evlist__id2evsel(trace->evlist, sample->id);
	if (evsel == NULL) {
		fprintf(trace->output, "Unknown tp ID %" PRIu64 ", skipping...\n", sample->id);
		return;
	}

	trace__set_base_time(trace, evsel, sample);

	/* A tracepoint sample without a payload can't be handed to a handler. */
	if (evsel->attr.type == PERF_TYPE_TRACEPOINT &&
	    sample->raw_data == NULL) {
		fprintf(trace->output, "%s sample with no payload for tid: %d, cpu %d, raw_size=%d, skipping...\n",
			perf_evsel__name(evsel), sample->tid,
			sample->cpu, sample->raw_size);
	} else {
		tracepoint_handler handler = evsel->handler;
		handler(trace, evsel, event, sample);
	}
}

/*
 * Set up the raw_syscalls:sys_enter/sys_exit tracepoints and add them to
 * the evlist.  On failure everything allocated so far is released via the
 * goto-cleanup chain below.
 */
static int trace__add_syscall_newtp(struct trace *trace)
{
	int ret = -1;
	struct perf_evlist *evlist = trace->evlist;
	struct perf_evsel *sys_enter, *sys_exit;

	sys_enter = perf_evsel__syscall_newtp("sys_enter", trace__sys_enter);
	if (sys_enter == NULL)
		goto out;

	if (perf_evsel__init_sc_tp_ptr_field(sys_enter, args))
		goto out_delete_sys_enter;

	sys_exit = perf_evsel__syscall_newtp("sys_exit", trace__sys_exit);
	if (sys_exit == NULL)
		goto out_delete_sys_enter;

	if (perf_evsel__init_sc_tp_uint_field(sys_exit, ret))
		goto out_delete_sys_exit;

	perf_evlist__add(evlist, sys_enter);
	perf_evlist__add(evlist, sys_exit);

	if (callchain_param.enabled && !trace->kernel_syscallchains) {
		/*
		 * We're interested only in the user space callchain
		 * leading to the syscall, allow overriding that for
		 * debugging reasons using --kernel_syscall_callchains
		 */
		sys_exit->attr.exclude_callchain_kernel = 1;
	}

	trace->syscalls.events.sys_enter = sys_enter;
	trace->syscalls.events.sys_exit = sys_exit;

	ret = 0;
out:
	return ret;

out_delete_sys_exit:
	perf_evsel__delete_priv(sys_exit);
out_delete_sys_enter:
	perf_evsel__delete_priv(sys_enter);
	goto out;
}

/*
 * Build an "id in (...)"/"id not in (...)" tracepoint filter from the -e
 * qualifier list and append it to both sys_enter and sys_exit.
 */
static int trace__set_ev_qualifier_filter(struct trace *trace)
{
	int err = -1;
	char *filter = asprintf_expr_inout_ints("id", !trace->not_ev_qualifier,
						trace->ev_qualifier_ids.nr,
						trace->ev_qualifier_ids.entries);

	if (filter == NULL)
		goto out_enomem;

	/* append_filter returns 0 on success, so only chain on success. */
	if (!perf_evsel__append_filter(trace->syscalls.events.sys_enter, "&&", filter))
		err = perf_evsel__append_filter(trace->syscalls.events.sys_exit, "&&", filter);

	free(filter);
out:
	return err;
out_enomem:
	errno = ENOMEM;
	goto out;
}

/*
 * The main live-tracing loop: create the events, maps and filters, start
 * the workload if one was given and consume the ring buffers until done.
 */
static int trace__run(struct trace *trace, int argc, const char **argv)
{
	struct perf_evlist *evlist = trace->evlist;
	struct perf_evsel *evsel, *pgfault_maj = NULL, *pgfault_min = NULL;
	int err = -1, i;
	unsigned long before;
	const bool forks = argc > 0;
	bool draining = false;

	trace->live = true;

	if (trace->trace_syscalls && trace__add_syscall_newtp(trace))
		goto out_error_raw_syscalls;

	if (trace->trace_syscalls)
		trace->vfs_getname = perf_evlist__add_vfs_getname(evlist);

	if
((trace->trace_pgfaults & TRACE_PFMAJ)) {
		pgfault_maj = perf_evsel__new_pgfault(PERF_COUNT_SW_PAGE_FAULTS_MAJ);
		if (pgfault_maj == NULL)
			goto out_error_mem;
		perf_evlist__add(evlist, pgfault_maj);
	}

	if ((trace->trace_pgfaults & TRACE_PFMIN)) {
		pgfault_min = perf_evsel__new_pgfault(PERF_COUNT_SW_PAGE_FAULTS_MIN);
		if (pgfault_min == NULL)
			goto out_error_mem;
		perf_evlist__add(evlist, pgfault_min);
	}

	if (trace->sched &&
	    perf_evlist__add_newtp(evlist, "sched", "sched_stat_runtime",
				   trace__sched_stat_runtime))
		goto out_error_sched_stat_runtime;

	err = perf_evlist__create_maps(evlist, &trace->opts.target);
	if (err < 0) {
		fprintf(trace->output, "Problems parsing the target to trace, check your options!\n");
		goto out_delete_evlist;
	}

	err = trace__symbols_init(trace, evlist);
	if (err < 0) {
		fprintf(trace->output, "Problems initializing symbol libraries!\n");
		goto out_delete_evlist;
	}

	perf_evlist__config(evlist, &trace->opts, NULL);

	if (callchain_param.enabled) {
		bool use_identifier = false;

		if (trace->syscalls.events.sys_exit) {
			perf_evsel__config_callchain(trace->syscalls.events.sys_exit,
						     &trace->opts, &callchain_param);
			use_identifier = true;
		}

		if (pgfault_maj) {
			perf_evsel__config_callchain(pgfault_maj, &trace->opts, &callchain_param);
			use_identifier = true;
		}

		if (pgfault_min) {
			perf_evsel__config_callchain(pgfault_min, &trace->opts, &callchain_param);
			use_identifier = true;
		}

		if (use_identifier) {
		       /*
			* Now we have evsels with different sample_ids, use
			* PERF_SAMPLE_IDENTIFIER to map from sample to evsel
			* from a fixed position in each ring buffer record.
			*
			* As of this the changeset introducing this comment, this
			* isn't strictly needed, as the fields that can come before
			* PERF_SAMPLE_ID are all used, but we'll probably disable
			* some of those for things like copying the payload of
			* pointer syscall arguments, and for vfs_getname we don't
			* need PERF_SAMPLE_ADDR and PERF_SAMPLE_IP, so do this
			* here as a warning we need to use PERF_SAMPLE_IDENTIFIER.
			*/
			perf_evlist__set_sample_bit(evlist, IDENTIFIER);
			perf_evlist__reset_sample_bit(evlist, ID);
		}
	}

	signal(SIGCHLD, sig_handler);
	signal(SIGINT, sig_handler);

	if (forks) {
		err = perf_evlist__prepare_workload(evlist, &trace->opts.target,
						    argv, false, NULL);
		if (err < 0) {
			fprintf(trace->output, "Couldn't run the workload!\n");
			goto out_delete_evlist;
		}
	}

	err = perf_evlist__open(evlist);
	if (err < 0)
		goto out_error_open;

	err = bpf__apply_obj_config();
	if (err) {
		char errbuf[BUFSIZ];

		bpf__strerror_apply_obj_config(err, errbuf, sizeof(errbuf));
		pr_err("ERROR: Apply config to BPF failed: %s\n",
			errbuf);
		goto out_error_open;
	}

	/*
	 * Better not use !target__has_task() here because we need to cover the
	 * case where no threads were specified in the command line, but a
	 * workload was, and in that case we will fill in the thread_map when
	 * we fork the workload in perf_evlist__prepare_workload.
	 */
	if (trace->filter_pids.nr > 0)
		err = perf_evlist__set_filter_pids(evlist, trace->filter_pids.nr, trace->filter_pids.entries);
	else if (thread_map__pid(evlist->threads, 0) == -1)
		err = perf_evlist__set_filter_pid(evlist, getpid());

	if (err < 0)
		goto out_error_mem;

	if (trace->ev_qualifier_ids.nr > 0) {
		err = trace__set_ev_qualifier_filter(trace);
		if (err < 0)
			goto out_errno;

		pr_debug("event qualifier tracepoint filter: %s\n",
			 trace->syscalls.events.sys_exit->filter);
	}

	err = perf_evlist__apply_filters(evlist, &evsel);
	if (err < 0)
		goto out_error_apply_filters;

	err = perf_evlist__mmap(evlist, trace->opts.mmap_pages, false);
	if (err < 0)
		goto out_error_mmap;

	if (!target__none(&trace->opts.target))
		perf_evlist__enable(evlist);

	if (forks)
		perf_evlist__start_workload(evlist);

	/* Show tids in the output when more than one thread may appear. */
	trace->multiple_threads = thread_map__pid(evlist->threads, 0) == -1 ||
				  evlist->threads->nr > 1 ||
				  perf_evlist__first(evlist)->attr.inherit;
again:
	before = trace->nr_events;

	for (i = 0; i < evlist->nr_mmaps; i++) {
		union perf_event *event;

		while ((event = perf_evlist__mmap_read(evlist, i)) != NULL) {
			struct perf_sample sample;

			++trace->nr_events;

			err = perf_evlist__parse_sample(evlist, event, &sample);
			if (err) {
				fprintf(trace->output, "Can't parse sample, err = %d, skipping...\n", err);
				goto next_event;
			}

			trace__handle_event(trace, event, &sample);
next_event:
			perf_evlist__mmap_consume(evlist, i);

			if (interrupted)
				goto out_disable;

			if (done && !draining) {
				perf_evlist__disable(evlist);
				draining = true;
			}
		}
	}

	/*
	 * No new events this pass: poll (blocking unless we're finishing),
	 * and once all pollfds have hung up, drain what remains and stop.
	 */
	if (trace->nr_events == before) {
		int timeout = done ? 100 : -1;

		if (!draining && perf_evlist__poll(evlist, timeout) > 0) {
			if (perf_evlist__filter_pollfd(evlist, POLLERR | POLLHUP) == 0)
				draining = true;

			goto again;
		}
	} else {
		goto again;
	}

out_disable:
	thread__zput(trace->current);

	perf_evlist__disable(evlist);

	if (!err) {
		if (trace->summary)
			trace__fprintf_thread_summary(trace, trace->output);

		if (trace->show_tool_stats) {
			fprintf(trace->output, "Stats:\n "
					       " vfs_getname : %" PRIu64 "\n"
					       " proc_getname: %" PRIu64 "\n",
				trace->stats.vfs_getname,
				trace->stats.proc_getname);
		}
	}

out_delete_evlist:
	perf_evlist__delete(evlist);
	trace->evlist = NULL;
	trace->live = false;
	return err;
/*
 * Error labels below share errbuf; the bare block after the return keeps
 * it out of scope for the normal paths.
 */
{
	char errbuf[BUFSIZ];

out_error_sched_stat_runtime:
	tracing_path__strerror_open_tp(errno, errbuf, sizeof(errbuf), "sched", "sched_stat_runtime");
	goto out_error;

out_error_raw_syscalls:
	tracing_path__strerror_open_tp(errno, errbuf, sizeof(errbuf), "raw_syscalls", "sys_(enter|exit)");
	goto out_error;

out_error_mmap:
	perf_evlist__strerror_mmap(evlist, errno, errbuf, sizeof(errbuf));
	goto out_error;

out_error_open:
	perf_evlist__strerror_open(evlist, errno, errbuf, sizeof(errbuf));

out_error:
	fprintf(trace->output, "%s\n", errbuf);
	goto out_delete_evlist;

out_error_apply_filters:
	fprintf(trace->output,
		"Failed to set filter \"%s\" on event %s with %d (%s)\n",
		evsel->filter, perf_evsel__name(evsel), errno,
		strerror_r(errno, errbuf, sizeof(errbuf)));
	goto out_delete_evlist;
}
out_error_mem:
	fprintf(trace->output, "Not enough memory to run!\n");
	goto out_delete_evlist;

out_errno:
	fprintf(trace->output, "errno=%d,%s\n", errno, strerror(errno));
	goto out_delete_evlist;
}

static int
trace__replay(struct trace *trace)
{
	/* Replay mode: read a perf.data file (-i) instead of tracing live. */
	const struct perf_evsel_str_handler handlers[] = {
		{ "probe:vfs_getname", trace__vfs_getname, },
	};
	struct perf_data_file file = {
		.path  = input_name,
		.mode  = PERF_DATA_MODE_READ,
		.force = trace->force,
	};
	struct perf_session *session;
	struct perf_evsel *evsel;
	int err = -1;

	trace->tool.sample	  = trace__process_sample;
	trace->tool.mmap	  = perf_event__process_mmap;
	trace->tool.mmap2	  = perf_event__process_mmap2;
	trace->tool.comm	  = perf_event__process_comm;
	trace->tool.exit	  = perf_event__process_exit;
	trace->tool.fork	  = perf_event__process_fork;
	trace->tool.attr	  = perf_event__process_attr;
	trace->tool.tracing_data = perf_event__process_tracing_data;
	trace->tool.build_id	  = perf_event__process_build_id;

	trace->tool.ordered_events = true;
	trace->tool.ordering_requires_timestamps = true;

	/* add tid to output */
	trace->multiple_threads = true;

	session = perf_session__new(&file, false, &trace->tool);
	if (session == NULL)
		return -1;

	if (symbol__init(&session->header.env) < 0)
		goto out;

	trace->host = &session->machines.host;

	err = perf_session__set_tracepoints_handlers(session, handlers);
	if (err)
		goto out;

	evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
						     "raw_syscalls:sys_enter");
	/* older kernels have syscalls tp versus raw_syscalls */
	if (evsel == NULL)
		evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
							     "syscalls:sys_enter");

	if (evsel &&
	    (perf_evsel__init_syscall_tp(evsel, trace__sys_enter) < 0 ||
	     perf_evsel__init_sc_tp_ptr_field(evsel, args))) {
		pr_err("Error during initialize raw_syscalls:sys_enter event\n");
		goto out;
	}

	evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
						     "raw_syscalls:sys_exit");
	if (evsel == NULL)
		evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
							     "syscalls:sys_exit");
	if (evsel &&
	    (perf_evsel__init_syscall_tp(evsel, trace__sys_exit) < 0 ||
	     perf_evsel__init_sc_tp_uint_field(evsel, ret))) {
		pr_err("Error during initialize raw_syscalls:sys_exit event\n");
		goto out;
	}

	/* Route any recorded page fault software events to the fault printer. */
	evlist__for_each(session->evlist, evsel) {
		if (evsel->attr.type == PERF_TYPE_SOFTWARE &&
		    (evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ ||
		     evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MIN ||
		     evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS))
			evsel->handler = trace__pgfault;
	}

	err = parse_target_str(trace);
	if (err != 0)
		goto out;

	setup_pager();

	err = perf_session__process_events(session);
	if (err)
		pr_err("Failed to process events, error %d", err);

	else if (trace->summary)
		trace__fprintf_thread_summary(trace, trace->output);

out:
	perf_session__delete(session);

	return err;
}

static size_t trace__fprintf_threads_header(FILE *fp)
{
	size_t printed;

	printed = fprintf(fp, "\n Summary of events:\n\n");

	return printed;
}

/*
 * Re-sort the per-thread syscall stats by total time spent (msecs),
 * descending; see rb_resort.h for the DEFINE_RESORT_RB machinery.
 */
DEFINE_RESORT_RB(syscall_stats, a->msecs > b->msecs,
	struct stats	*stats;
	double		msecs;
	int		syscall;
)
{
	struct int_node *source = rb_entry(nd, struct int_node, rb_node);
	struct stats *stats = source->priv;

	entry->syscall = source->i;
	entry->stats   = stats;
	entry->msecs   = stats ?
(u64)stats->n * (avg_stats(stats) / NSEC_PER_MSEC) : 0;
}

/*
 * Print the per-thread syscall statistics table, sorted by total time.
 * Returns the number of characters printed, 0 if there are no stats.
 */
static size_t thread__dump_stats(struct thread_trace *ttrace,
				 struct trace *trace, FILE *fp)
{
	size_t printed = 0;
	struct syscall *sc;
	struct rb_node *nd;
	DECLARE_RESORT_RB_INTLIST(syscall_stats, ttrace->syscall_stats);

	if (syscall_stats == NULL)
		return 0;

	printed += fprintf(fp, "\n");

	printed += fprintf(fp, "   syscall            calls    total       min       avg       max      stddev\n");
	printed += fprintf(fp, "                               (msec)    (msec)    (msec)    (msec)        (%%)\n");
	printed += fprintf(fp, "   --------------- -------- --------- --------- --------- ---------     ------\n");

	resort_rb__for_each(nd, syscall_stats) {
		struct stats *stats = syscall_stats_entry->stats;
		if (stats) {
			/* stats are kept in nsecs; convert to msecs for display. */
			double min = (double)(stats->min) / NSEC_PER_MSEC;
			double max = (double)(stats->max) / NSEC_PER_MSEC;
			double avg = avg_stats(stats);
			double pct;
			u64 n = (u64) stats->n;

			/* stddev as a percentage of the mean */
			pct = avg ? 100.0 * stddev_stats(stats)/avg : 0.0;
			avg /= NSEC_PER_MSEC;

			sc = &trace->syscalls.table[syscall_stats_entry->syscall];
			printed += fprintf(fp, "   %-15s", sc->name);
			printed += fprintf(fp, " %8" PRIu64 " %9.3f %9.3f %9.3f",
					   n, syscall_stats_entry->msecs, min, avg);
			printed += fprintf(fp, " %9.3f %9.2f%%\n", max, pct);
		}
	}

	resort_rb__delete(syscall_stats);
	printed += fprintf(fp, "\n\n");

	return printed;
}

/*
 * One line per thread in the summary: comm, tid, event count, share of
 * all events, fault counts and (with --sched) accumulated runtime, then
 * that thread's syscall stats table.
 */
static size_t trace__fprintf_thread(FILE *fp, struct thread *thread, struct trace *trace)
{
	size_t printed = 0;
	struct thread_trace *ttrace = thread__priv(thread);
	double ratio;

	if (ttrace == NULL)
		return 0;

	ratio = (double)ttrace->nr_events / trace->nr_events * 100.0;

	printed += fprintf(fp, " %s (%d), ", thread__comm_str(thread), thread->tid);
	printed += fprintf(fp, "%lu events, ", ttrace->nr_events);
	printed += fprintf(fp, "%.1f%%", ratio);
	if (ttrace->pfmaj)
		printed += fprintf(fp, ", %lu majfaults", ttrace->pfmaj);
	if (ttrace->pfmin)
		printed += fprintf(fp, ", %lu minfaults", ttrace->pfmin);
	if (trace->sched)
		printed += fprintf(fp, ", %.3f msec\n", ttrace->runtime_ms);
	else if (fputc('\n', fp) != EOF)
		++printed;

	printed += thread__dump_stats(ttrace, trace, fp);

	return printed;
}

/* NULL-safe accessor used as the sort key for the threads summary. */
static unsigned long thread__nr_events(struct thread_trace *ttrace)
{
	return ttrace ?
ttrace->nr_events : 0; 2610 } 2611 2612 DEFINE_RESORT_RB(threads, (thread__nr_events(a->thread->priv) < thread__nr_events(b->thread->priv)), 2613 struct thread *thread; 2614 ) 2615 { 2616 entry->thread = rb_entry(nd, struct thread, rb_node); 2617 } 2618 2619 static size_t trace__fprintf_thread_summary(struct trace *trace, FILE *fp) 2620 { 2621 DECLARE_RESORT_RB_MACHINE_THREADS(threads, trace->host); 2622 size_t printed = trace__fprintf_threads_header(fp); 2623 struct rb_node *nd; 2624 2625 if (threads == NULL) { 2626 fprintf(fp, "%s", "Error sorting output by nr_events!\n"); 2627 return 0; 2628 } 2629 2630 resort_rb__for_each(nd, threads) 2631 printed += trace__fprintf_thread(fp, threads_entry->thread, trace); 2632 2633 resort_rb__delete(threads); 2634 2635 return printed; 2636 } 2637 2638 static int trace__set_duration(const struct option *opt, const char *str, 2639 int unset __maybe_unused) 2640 { 2641 struct trace *trace = opt->value; 2642 2643 trace->duration_filter = atof(str); 2644 return 0; 2645 } 2646 2647 static int trace__set_filter_pids(const struct option *opt, const char *str, 2648 int unset __maybe_unused) 2649 { 2650 int ret = -1; 2651 size_t i; 2652 struct trace *trace = opt->value; 2653 /* 2654 * FIXME: introduce a intarray class, plain parse csv and create a 2655 * { int nr, int entries[] } struct... 
2656 */ 2657 struct intlist *list = intlist__new(str); 2658 2659 if (list == NULL) 2660 return -1; 2661 2662 i = trace->filter_pids.nr = intlist__nr_entries(list) + 1; 2663 trace->filter_pids.entries = calloc(i, sizeof(pid_t)); 2664 2665 if (trace->filter_pids.entries == NULL) 2666 goto out; 2667 2668 trace->filter_pids.entries[0] = getpid(); 2669 2670 for (i = 1; i < trace->filter_pids.nr; ++i) 2671 trace->filter_pids.entries[i] = intlist__entry(list, i - 1)->i; 2672 2673 intlist__delete(list); 2674 ret = 0; 2675 out: 2676 return ret; 2677 } 2678 2679 static int trace__open_output(struct trace *trace, const char *filename) 2680 { 2681 struct stat st; 2682 2683 if (!stat(filename, &st) && st.st_size) { 2684 char oldname[PATH_MAX]; 2685 2686 scnprintf(oldname, sizeof(oldname), "%s.old", filename); 2687 unlink(oldname); 2688 rename(filename, oldname); 2689 } 2690 2691 trace->output = fopen(filename, "w"); 2692 2693 return trace->output == NULL ? -errno : 0; 2694 } 2695 2696 static int parse_pagefaults(const struct option *opt, const char *str, 2697 int unset __maybe_unused) 2698 { 2699 int *trace_pgfaults = opt->value; 2700 2701 if (strcmp(str, "all") == 0) 2702 *trace_pgfaults |= TRACE_PFMAJ | TRACE_PFMIN; 2703 else if (strcmp(str, "maj") == 0) 2704 *trace_pgfaults |= TRACE_PFMAJ; 2705 else if (strcmp(str, "min") == 0) 2706 *trace_pgfaults |= TRACE_PFMIN; 2707 else 2708 return -1; 2709 2710 return 0; 2711 } 2712 2713 static void evlist__set_evsel_handler(struct perf_evlist *evlist, void *handler) 2714 { 2715 struct perf_evsel *evsel; 2716 2717 evlist__for_each(evlist, evsel) 2718 evsel->handler = handler; 2719 } 2720 2721 int cmd_trace(int argc, const char **argv, const char *prefix __maybe_unused) 2722 { 2723 const char *trace_usage[] = { 2724 "perf trace [<options>] [<command>]", 2725 "perf trace [<options>] -- <command> [<options>]", 2726 "perf trace record [<options>] [<command>]", 2727 "perf trace record [<options>] -- <command> [<options>]", 2728 NULL 2729 }; 
struct trace trace = {
		.syscalls = {
			.max = -1,	/* syscall table not filled in yet */
		},
		.opts = {
			.target = {
				.uid	   = UINT_MAX,
				.uses_mmap = true,
			},
			.user_freq     = UINT_MAX,
			.user_interval = ULLONG_MAX,
			.no_buffering  = true,
			.mmap_pages    = UINT_MAX,
			.proc_map_timeout = 500,
		},
		.output = stderr,
		.show_comm = true,
		.trace_syscalls = true,
		.kernel_syscallchains = false,
		.max_stack = UINT_MAX,
	};
	const char *output_name = NULL;
	const char *ev_qualifier_str = NULL;
	const struct option trace_options[] = {
	OPT_CALLBACK(0, "event", &trace.evlist, "event",
		     "event selector. use 'perf list' to list available events",
		     parse_events_option),
	OPT_BOOLEAN(0, "comm", &trace.show_comm,
		    "show the thread COMM next to its id"),
	OPT_BOOLEAN(0, "tool_stats", &trace.show_tool_stats, "show tool stats"),
	OPT_STRING('e', "expr", &ev_qualifier_str, "expr", "list of syscalls to trace"),
	OPT_STRING('o', "output", &output_name, "file", "output file name"),
	OPT_STRING('i', "input", &input_name, "file", "Analyze events in file"),
	OPT_STRING('p', "pid", &trace.opts.target.pid, "pid",
		    "trace events on existing process id"),
	OPT_STRING('t', "tid", &trace.opts.target.tid, "tid",
		    "trace events on existing thread id"),
	OPT_CALLBACK(0, "filter-pids", &trace, "CSV list of pids",
		     "pids to filter (by the kernel)", trace__set_filter_pids),
	OPT_BOOLEAN('a', "all-cpus", &trace.opts.target.system_wide,
		    "system-wide collection from all CPUs"),
	OPT_STRING('C', "cpu", &trace.opts.target.cpu_list, "cpu",
		    "list of cpus to monitor"),
	OPT_BOOLEAN(0, "no-inherit", &trace.opts.no_inherit,
		    "child tasks do not inherit counters"),
	OPT_CALLBACK('m', "mmap-pages", &trace.opts.mmap_pages, "pages",
		     "number of mmap data pages",
		     perf_evlist__parse_mmap_pages),
	OPT_STRING('u', "uid", &trace.opts.target.uid_str, "user",
		   "user to profile"),
	OPT_CALLBACK(0, "duration", &trace, "float",
		     "show only events with duration > N.M ms",
		     trace__set_duration),
	OPT_BOOLEAN(0, "sched", &trace.sched, "show blocking scheduler events"),
	OPT_INCR('v', "verbose", &verbose, "be more verbose"),
	OPT_BOOLEAN('T', "time", &trace.full_time,
		    "Show full timestamp, not time relative to first start"),
	OPT_BOOLEAN('s', "summary", &trace.summary_only,
		    "Show only syscall summary with statistics"),
	OPT_BOOLEAN('S', "with-summary", &trace.summary,
		    "Show all syscalls and summary with statistics"),
	OPT_CALLBACK_DEFAULT('F', "pf", &trace.trace_pgfaults, "all|maj|min",
		     "Trace pagefaults", parse_pagefaults, "maj"),
	OPT_BOOLEAN(0, "syscalls", &trace.trace_syscalls, "Trace syscalls"),
	OPT_BOOLEAN('f', "force", &trace.force, "don't complain, do it"),
	OPT_CALLBACK(0, "call-graph", &trace.opts,
		     "record_mode[,record_size]", record_callchain_help,
		     &record_parse_callchain_opt),
	OPT_BOOLEAN(0, "kernel-syscall-graph", &trace.kernel_syscallchains,
		    "Show the kernel callchains on the syscall exit path"),
	OPT_UINTEGER(0, "min-stack", &trace.min_stack,
		     "Set the minimum stack depth when parsing the callchain, "
		     "anything below the specified depth will be ignored."),
	OPT_UINTEGER(0, "max-stack", &trace.max_stack,
		     "Set the maximum stack depth when parsing the callchain, "
		     "anything beyond the specified depth will be ignored. "
		     "Default: kernel.perf_event_max_stack or " __stringify(PERF_MAX_STACK_DEPTH)),
	OPT_UINTEGER(0, "proc-map-timeout", &trace.opts.proc_map_timeout,
			"per thread proc mmap processing timeout in ms"),
	OPT_END()
	};
	bool __maybe_unused max_stack_user_set = true;
	bool mmap_pages_user_set = true;
	const char * const trace_subcommands[] = { "record", NULL };
	int err;
	char bf[BUFSIZ];

	signal(SIGSEGV, sighandler_dump_stack);
	signal(SIGFPE, sighandler_dump_stack);

	trace.evlist = perf_evlist__new();
	trace.sctbl = syscalltbl__new();

	if (trace.evlist == NULL || trace.sctbl == NULL) {
		pr_err("Not enough memory to run!\n");
		err = -ENOMEM;
		goto out;
	}

	argc = parse_options_subcommand(argc, argv, trace_options, trace_subcommands,
				 trace_usage, PARSE_OPT_STOP_AT_NON_OPTION);

	err = bpf__setup_stdout(trace.evlist);
	if (err) {
		bpf__strerror_setup_stdout(trace.evlist, err, bf, sizeof(bf));
		pr_err("ERROR: Setup BPF stdout failed: %s\n", bf);
		goto out;
	}

	err = -1;

	if (trace.trace_pgfaults) {
		trace.opts.sample_address = true;
		trace.opts.sample_time = true;
	}

	if (trace.opts.mmap_pages == UINT_MAX)
		mmap_pages_user_set = false;

	if (trace.max_stack == UINT_MAX) {
		trace.max_stack = input_name ? PERF_MAX_STACK_DEPTH : sysctl_perf_event_max_stack;
		max_stack_user_set = false;
	}

#ifdef HAVE_DWARF_UNWIND_SUPPORT
	if ((trace.min_stack || max_stack_user_set) && !callchain_param.enabled && trace.trace_syscalls)
		record_opts__parse_callchain(&trace.opts, &callchain_param, "dwarf", false);
#endif

	if (callchain_param.enabled) {
		if (!mmap_pages_user_set && geteuid() == 0)
			trace.opts.mmap_pages = perf_event_mlock_kb_in_pages() * 4;

		symbol_conf.use_callchain = true;
	}

	if (trace.evlist->nr_entries > 0)
		evlist__set_evsel_handler(trace.evlist, trace__event_handler);

	if ((argc >= 1) && (strcmp(argv[0], "record") == 0))
		return trace__record(&trace, argc-1, &argv[1]);

	/* summary_only implies summary option, but don't overwrite summary if set */
	if (trace.summary_only)
		trace.summary = trace.summary_only;

	/* NOTE(review): these early returns skip the 'out' cleanup; evlist/sctbl
	 * are process-lifetime here, but a goto out would be more consistent. */
	if (!trace.trace_syscalls && !trace.trace_pgfaults &&
	    trace.evlist->nr_entries == 0 /* Was --events used? */) {
		pr_err("Please specify something to trace.\n");
		return -1;
	}

	if (!trace.trace_syscalls && ev_qualifier_str) {
		pr_err("The -e option can't be used with --no-syscalls.\n");
		goto out;
	}

	if (output_name != NULL) {
		err = trace__open_output(&trace, output_name);
		if (err < 0) {
			perror("failed to create output file");
			goto out;
		}
	}

	trace.open_id = syscalltbl__id(trace.sctbl, "open");

	if (ev_qualifier_str != NULL) {
		const char *s = ev_qualifier_str;
		struct strlist_config slist_config = {
			.dirname = system_path(STRACE_GROUPS_DIR),
		};

		/* a leading '!' negates the qualifier list */
		trace.not_ev_qualifier = *s == '!';
		if (trace.not_ev_qualifier)
			++s;
		trace.ev_qualifier = strlist__new(s, &slist_config);
		if (trace.ev_qualifier == NULL) {
			fputs("Not enough memory to parse event qualifier",
			      trace.output);
			err = -ENOMEM;
			goto out_close;
		}

		err = trace__validate_ev_qualifier(&trace);
		if (err)
			goto out_close;
	}

	err = target__validate(&trace.opts.target);
	if (err) {
		target__strerror(&trace.opts.target, err, bf, sizeof(bf));
		fprintf(trace.output, "%s", bf);
		goto out_close;
	}

	err = target__parse_uid(&trace.opts.target);
	if (err) {
		target__strerror(&trace.opts.target, err, bf, sizeof(bf));
		fprintf(trace.output, "%s", bf);
		goto out_close;
	}

	/* No workload and no target: trace the whole system. */
	if (!argc && target__none(&trace.opts.target))
		trace.opts.target.system_wide = true;

	if (input_name)
		err = trace__replay(&trace);
	else
		err = trace__run(&trace, argc, argv);

out_close:
	if (output_name != NULL)
		fclose(trace.output);
out:
	return err;
}