/*
 * builtin-trace.c
 *
 * Builtin 'trace' command:
 *
 * Display a continuously updated trace of any workload, CPU, specific PID,
 * system wide, etc.  Default format is loosely strace like, but any other
 * event may be specified using --event.
 *
 * Copyright (C) 2012, 2013, 2014, 2015 Red Hat Inc, Arnaldo Carvalho de Melo <acme@redhat.com>
 *
 * Initially based on the 'trace' prototype by Thomas Gleixner:
 *
 * http://lwn.net/Articles/415728/ ("Announcing a new utility: 'trace'")
 *
 * Released under the GPL v2. (and only v2, not any later version)
 */

#include <traceevent/event-parse.h>
#include <api/fs/tracing_path.h>
#include "builtin.h"
#include "util/color.h"
#include "util/debug.h"
#include "util/event.h"
#include "util/evlist.h"
#include <subcmd/exec-cmd.h>
#include "util/machine.h"
#include "util/path.h"
#include "util/session.h"
#include "util/thread.h"
#include <subcmd/parse-options.h>
#include "util/strlist.h"
#include "util/intlist.h"
#include "util/thread_map.h"
#include "util/stat.h"
#include "trace/beauty/beauty.h"
#include "trace-event.h"
#include "util/parse-events.h"
#include "util/bpf-loader.h"
#include "callchain.h"
#include "print_binary.h"
#include "string2.h"
#include "syscalltbl.h"
#include "rb_resort.h"

#include <errno.h>
#include <inttypes.h>
#include <libaudit.h> /* FIXME: Still needed for audit_errno_to_name */
#include <poll.h>
#include <signal.h>
#include <stdlib.h>
#include <string.h>
#include <linux/err.h>
#include <linux/filter.h>
#include <linux/audit.h>
#include <linux/kernel.h>
#include <linux/random.h>
#include <linux/stringify.h>
#include <linux/time64.h>

#include "sane_ctype.h"

#ifndef O_CLOEXEC
# define O_CLOEXEC		02000000
#endif

/*
 * Global state for one 'perf trace' session: the tool callbacks, the
 * syscall table, the evlist being monitored, filters and output options.
 */
struct trace {
	struct perf_tool	tool;
	struct syscalltbl	*sctbl;
	struct {
		int		max;		/* highest syscall id seen, bounds 'table' */
		struct syscall  *table;		/* indexed by syscall id */
		struct {
			struct perf_evsel *sys_enter,
					  *sys_exit;
		}		events;		/* raw_syscalls:sys_{enter,exit} evsels */
	} syscalls;
	struct record_opts	opts;
	struct perf_evlist	*evlist;
	struct machine		*host;
	struct thread		*current;	/* last thread that emitted an event */
	u64			base_time;	/* first sample timestamp, for relative tstamps */
	FILE			*output;
	unsigned long		nr_events;
	struct strlist		*ev_qualifier;	/* -e syscall name list */
	struct {
		size_t		nr;
		int		*entries;	/* syscall ids resolved from ev_qualifier */
	}			ev_qualifier_ids;
	struct {
		size_t		nr;
		pid_t		*entries;	/* PIDs to filter out (e.g. trace's own pid) */
	}			filter_pids;
	double			duration_filter;	/* --duration, in ms */
	double			runtime_ms;
	struct {
		u64		vfs_getname,
				proc_getname;	/* fd->path resolution stats */
	} stats;
	unsigned int		max_stack;
	unsigned int		min_stack;
	bool			not_ev_qualifier;	/* ev_qualifier is an exclude list */
	bool			live;
	bool			full_time;
	bool			sched;
	bool			multiple_threads;
	bool			summary;
	bool			summary_only;
	bool			show_comm;
	bool			show_tool_stats;
	bool			trace_syscalls;
	bool			kernel_syscallchains;
	bool			force;
	bool			vfs_getname;	/* is the vfs_getname probe in use? */
	int			trace_pgfaults;
	int			open_id;	/* syscall id for "open", for vfs_getname matching */
};

/*
 * Accessor for one tracepoint payload field: 'offset' into raw_data plus a
 * typed reader, either as an integer (possibly byte-swapped) or a pointer.
 */
struct tp_field {
	int offset;
	union {
		u64 (*integer)(struct tp_field *field, struct perf_sample *sample);
		void *(*pointer)(struct tp_field *field, struct perf_sample *sample);
	};
};

/* Generate tp_field__u{8,16,32,64}: read a native-endian uint from raw_data. */
#define TP_UINT_FIELD(bits) \
static u64 tp_field__u##bits(struct tp_field *field, struct perf_sample *sample) \
{ \
	u##bits value; \
	memcpy(&value, sample->raw_data + field->offset, sizeof(value)); \
	return value;  \
}

TP_UINT_FIELD(8);
TP_UINT_FIELD(16);
TP_UINT_FIELD(32);
TP_UINT_FIELD(64);

/* Same as above but byte-swapping, for cross-endian perf.data files. */
#define TP_UINT_FIELD__SWAPPED(bits) \
static u64 tp_field__swapped_u##bits(struct tp_field *field, struct perf_sample *sample) \
{ \
	u##bits value; \
	memcpy(&value, sample->raw_data + field->offset, sizeof(value)); \
	return bswap_##bits(value);\
}

TP_UINT_FIELD__SWAPPED(16);
TP_UINT_FIELD__SWAPPED(32);
TP_UINT_FIELD__SWAPPED(64);

/*
 * Bind the right-sized (and possibly swapped) integer reader to 'field'.
 * Returns -1 for field sizes we don't handle.
 */
static int tp_field__init_uint(struct tp_field *field,
			       struct format_field *format_field,
			       bool needs_swap)
{
	field->offset = format_field->offset;

	switch (format_field->size) {
	case 1:
		field->integer = tp_field__u8;
		break;
	case 2:
		field->integer = needs_swap ? tp_field__swapped_u16 : tp_field__u16;
		break;
	case 4:
		field->integer = needs_swap ? tp_field__swapped_u32 : tp_field__u32;
		break;
	case 8:
		field->integer = needs_swap ? tp_field__swapped_u64 : tp_field__u64;
		break;
	default:
		return -1;
	}

	return 0;
}

static void *tp_field__ptr(struct tp_field *field, struct perf_sample *sample)
{
	return sample->raw_data + field->offset;
}

static int tp_field__init_ptr(struct tp_field *field, struct format_field *format_field)
{
	field->offset = format_field->offset;
	field->pointer = tp_field__ptr;
	return 0;
}

/*
 * Per-evsel private data for the raw_syscalls tracepoints: the syscall 'id'
 * field plus either the entry 'args' or the exit 'ret' field.
 */
struct syscall_tp {
	struct tp_field id;
	union {
		struct tp_field args, ret;
	};
};

static int perf_evsel__init_tp_uint_field(struct perf_evsel *evsel,
					  struct tp_field *field,
					  const char *name)
{
	struct format_field *format_field = perf_evsel__field(evsel, name);

	if (format_field == NULL)
		return -1;

	return tp_field__init_uint(field, format_field, evsel->needs_swap);
}

#define perf_evsel__init_sc_tp_uint_field(evsel, name) \
	({ struct syscall_tp *sc = evsel->priv;\
	   perf_evsel__init_tp_uint_field(evsel, &sc->name, #name); })

static int perf_evsel__init_tp_ptr_field(struct perf_evsel *evsel,
					 struct tp_field *field,
					 const char *name)
{
	struct format_field *format_field = perf_evsel__field(evsel, name);

	if (format_field == NULL)
		return -1;

	return tp_field__init_ptr(field, format_field);
}

#define perf_evsel__init_sc_tp_ptr_field(evsel, name) \
	({ struct syscall_tp *sc = evsel->priv;\
	   perf_evsel__init_tp_ptr_field(evsel, &sc->name, #name); })

/* Free the evsel's syscall_tp private data before deleting the evsel. */
static void perf_evsel__delete_priv(struct perf_evsel *evsel)
{
	zfree(&evsel->priv);
	perf_evsel__delete(evsel);
}

/*
 * Allocate the syscall_tp priv and resolve its 'id' field.
 * Returns 0 on success, -ENOMEM if allocation failed, -ENOENT if the
 * tracepoint has no 'id' field.
 */
static int perf_evsel__init_syscall_tp(struct perf_evsel *evsel, void *handler)
{
	evsel->priv = malloc(sizeof(struct syscall_tp));
	if (evsel->priv != NULL) {
		if (perf_evsel__init_sc_tp_uint_field(evsel, id))
			goto out_delete;

		evsel->handler = handler;
		return 0;
	}

	return -ENOMEM;

out_delete:
	zfree(&evsel->priv);
	return -ENOENT;
}

/*
 * Create the sys_enter or sys_exit evsel, preferring the raw_syscalls
 * tracepoints and falling back to the older syscalls:* names.
 * Returns NULL on failure.
 */
static struct perf_evsel *perf_evsel__syscall_newtp(const char *direction, void *handler)
{
	struct perf_evsel *evsel = perf_evsel__newtp("raw_syscalls", direction);

	/* older kernel (e.g., RHEL6) use syscalls:{enter,exit} */
	if (IS_ERR(evsel))
		evsel = perf_evsel__newtp("syscalls", direction);

	if (IS_ERR(evsel))
		return NULL;

	if (perf_evsel__init_syscall_tp(evsel, handler))
		goto out_delete;

	return evsel;

out_delete:
	perf_evsel__delete_priv(evsel);
	return NULL;
}

#define perf_evsel__sc_tp_uint(evsel, name, sample) \
	({ struct syscall_tp *fields = evsel->priv; \
	   fields->name.integer(&fields->name, sample); })

#define perf_evsel__sc_tp_ptr(evsel, name, sample) \
	({ struct syscall_tp *fields = evsel->priv; \
	   fields->name.pointer(&fields->name, sample); })

/*
 * A table mapping integer values (minus 'offset') to string names, used to
 * beautify enum-like syscall arguments.
 */
struct strarray {
	int	    offset;
	int	    nr_entries;
	const char **entries;
};

#define DEFINE_STRARRAY(array) struct strarray strarray__##array = { \
	.nr_entries = ARRAY_SIZE(array), \
	.entries = array, \
}

#define DEFINE_STRARRAY_OFFSET(array, off) struct strarray strarray__##array = { \
	.offset	    = off, \
	.nr_entries = ARRAY_SIZE(array), \
	.entries = array, \
}

/*
 * Print arg->val as a name from the strarray in arg->parm, falling back to
 * 'intfmt' for out-of-range values.
 */
static size_t __syscall_arg__scnprintf_strarray(char *bf, size_t size,
						const char *intfmt,
					        struct syscall_arg *arg)
{
	struct strarray *sa = arg->parm;
	int idx = arg->val - sa->offset;

	if (idx < 0 || idx >= sa->nr_entries)
		return scnprintf(bf, size, intfmt, arg->val);

	return scnprintf(bf, size, "%s", sa->entries[idx]);
}

static size_t syscall_arg__scnprintf_strarray(char *bf, size_t size,
					      struct syscall_arg *arg)
{
	return __syscall_arg__scnprintf_strarray(bf, size, "%d", arg);
}

#define SCA_STRARRAY syscall_arg__scnprintf_strarray

#if defined(__i386__) || defined(__x86_64__)
/*
 * FIXME: Make this available to all arches as soon as the ioctl beautifier
 * 	  gets rewritten to support all arches.
 */
static size_t syscall_arg__scnprintf_strhexarray(char *bf, size_t size,
						 struct syscall_arg *arg)
{
	return __syscall_arg__scnprintf_strarray(bf, size, "%#x", arg);
}

#define SCA_STRHEXARRAY syscall_arg__scnprintf_strhexarray
#endif /* defined(__i386__) || defined(__x86_64__) */

static size_t syscall_arg__scnprintf_fd(char *bf, size_t size,
					struct syscall_arg *arg);

#define SCA_FD syscall_arg__scnprintf_fd

#ifndef AT_FDCWD
#define AT_FDCWD	-100
#endif

/* Like SCA_FD but prints "CWD" for the AT_FDCWD sentinel of *at() syscalls. */
static size_t syscall_arg__scnprintf_fd_at(char *bf, size_t size,
					   struct syscall_arg *arg)
{
	int fd = arg->val;

	if (fd == AT_FDCWD)
		return scnprintf(bf, size, "CWD");

	return syscall_arg__scnprintf_fd(bf, size, arg);
}

#define SCA_FDAT syscall_arg__scnprintf_fd_at

static size_t syscall_arg__scnprintf_close_fd(char *bf, size_t size,
					      struct syscall_arg *arg);

#define SCA_CLOSE_FD syscall_arg__scnprintf_close_fd

static size_t syscall_arg__scnprintf_hex(char *bf, size_t size,
					 struct syscall_arg *arg)
{
	return scnprintf(bf, size, "%#lx", arg->val);
}

#define SCA_HEX syscall_arg__scnprintf_hex

static size_t syscall_arg__scnprintf_int(char *bf, size_t size,
					 struct syscall_arg *arg)
{
	return scnprintf(bf, size, "%d", arg->val);
}

#define SCA_INT syscall_arg__scnprintf_int

/* String tables for enum-like syscall arguments, in kernel value order. */

static const char *bpf_cmd[] = {
	"MAP_CREATE", "MAP_LOOKUP_ELEM", "MAP_UPDATE_ELEM", "MAP_DELETE_ELEM",
	"MAP_GET_NEXT_KEY", "PROG_LOAD",
};
static DEFINE_STRARRAY(bpf_cmd);

/* EPOLL_CTL_ADD starts at 1, hence the offset. */
static const char *epoll_ctl_ops[] = { "ADD", "DEL", "MOD", };
static DEFINE_STRARRAY_OFFSET(epoll_ctl_ops, 1);

static const char *itimers[] = { "REAL", "VIRTUAL", "PROF", };
static DEFINE_STRARRAY(itimers);

static const char *keyctl_options[] = {
	"GET_KEYRING_ID", "JOIN_SESSION_KEYRING", "UPDATE", "REVOKE", "CHOWN",
	"SETPERM", "DESCRIBE", "CLEAR", "LINK", "UNLINK", "SEARCH", "READ",
	"INSTANTIATE", "NEGATE", "SET_REQKEY_KEYRING", "SET_TIMEOUT",
	"ASSUME_AUTHORITY", "GET_SECURITY", "SESSION_TO_PARENT", "REJECT",
	"INSTANTIATE_IOV", "INVALIDATE", "GET_PERSISTENT",
};
static DEFINE_STRARRAY(keyctl_options);

static const char *whences[] = { "SET", "CUR", "END",
#ifdef SEEK_DATA
"DATA",
#endif
#ifdef SEEK_HOLE
"HOLE",
#endif
};
static DEFINE_STRARRAY(whences);

static const char *fcntl_cmds[] = {
	"DUPFD", "GETFD", "SETFD", "GETFL", "SETFL", "GETLK", "SETLK",
	"SETLKW", "SETOWN", "GETOWN", "SETSIG", "GETSIG", "F_GETLK64",
	"F_SETLK64", "F_SETLKW64", "F_SETOWN_EX", "F_GETOWN_EX",
	"F_GETOWNER_UIDS",
};
static DEFINE_STRARRAY(fcntl_cmds);

static const char *rlimit_resources[] = {
	"CPU", "FSIZE", "DATA", "STACK", "CORE", "RSS", "NPROC", "NOFILE",
	"MEMLOCK", "AS", "LOCKS", "SIGPENDING", "MSGQUEUE", "NICE", "RTPRIO",
	"RTTIME",
};
static DEFINE_STRARRAY(rlimit_resources);

static const char *sighow[] = { "BLOCK", "UNBLOCK", "SETMASK", };
static DEFINE_STRARRAY(sighow);

static const char *clockid[] = {
	"REALTIME", "MONOTONIC", "PROCESS_CPUTIME_ID", "THREAD_CPUTIME_ID",
	"MONOTONIC_RAW", "REALTIME_COARSE", "MONOTONIC_COARSE", "BOOTTIME",
	"REALTIME_ALARM", "BOOTTIME_ALARM", "SGI_CYCLE", "TAI"
};
static DEFINE_STRARRAY(clockid);

static const char *socket_families[] = {
	"UNSPEC", "LOCAL", "INET", "AX25", "IPX", "APPLETALK", "NETROM",
	"BRIDGE", "ATMPVC", "X25", "INET6", "ROSE", "DECnet", "NETBEUI",
	"SECURITY", "KEY", "NETLINK", "PACKET", "ASH", "ECONET", "ATMSVC",
	"RDS", "SNA", "IRDA", "PPPOX", "WANPIPE", "LLC", "IB", "CAN", "TIPC",
	"BLUETOOTH", "IUCV", "RXRPC", "ISDN", "PHONET", "IEEE802154", "CAIF",
	"ALG", "NFC", "VSOCK",
};
static DEFINE_STRARRAY(socket_families);

/* Beautify access(2)'s mode: "F" for F_OK, else a '|'-less R/W/X string. */
static size_t syscall_arg__scnprintf_access_mode(char *bf, size_t size,
						 struct syscall_arg *arg)
{
	size_t printed = 0;
	int mode = arg->val;

	if (mode == F_OK) /* 0 */
		return scnprintf(bf, size, "F");
#define	P_MODE(n) \
	if (mode & n##_OK) { \
		printed += scnprintf(bf + printed, size - printed, "%s", #n); \
		mode &= ~n##_OK; \
	}

	P_MODE(R);
	P_MODE(W);
	P_MODE(X);
#undef P_MODE

	/* Any leftover unknown bits are shown in hex. */
	if (mode)
		printed += scnprintf(bf + printed, size - printed, "|%#x", mode);

	return printed;
}

#define SCA_ACCMODE syscall_arg__scnprintf_access_mode

static size_t syscall_arg__scnprintf_filename(char *bf, size_t size,
					      struct syscall_arg *arg);

#define SCA_FILENAME syscall_arg__scnprintf_filename

/* Beautify pipe2(2)'s flags: CLOEXEC|NONBLOCK, leftovers in hex. */
static size_t syscall_arg__scnprintf_pipe_flags(char *bf, size_t size,
						struct syscall_arg *arg)
{
	int printed = 0, flags = arg->val;

#define	P_FLAG(n) \
	if (flags & O_##n) { \
		printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
		flags &= ~O_##n; \
	}

	P_FLAG(CLOEXEC);
	P_FLAG(NONBLOCK);
#undef P_FLAG

	if (flags)
		printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);

	return printed;
}

#define SCA_PIPE_FLAGS syscall_arg__scnprintf_pipe_flags

#if defined(__i386__) || defined(__x86_64__)
/*
 * FIXME: Make this available to all arches.
 */
#define TCGETS		0x5401

/* tty ioctl names, indexed from TCGETS (0x5401); gaps use designated init. */
static const char *tioctls[] = {
	"TCGETS", "TCSETS", "TCSETSW", "TCSETSF", "TCGETA", "TCSETA", "TCSETAW",
	"TCSETAF", "TCSBRK", "TCXONC", "TCFLSH", "TIOCEXCL", "TIOCNXCL",
	"TIOCSCTTY", "TIOCGPGRP", "TIOCSPGRP", "TIOCOUTQ", "TIOCSTI",
	"TIOCGWINSZ", "TIOCSWINSZ", "TIOCMGET", "TIOCMBIS", "TIOCMBIC",
	"TIOCMSET", "TIOCGSOFTCAR", "TIOCSSOFTCAR", "FIONREAD", "TIOCLINUX",
	"TIOCCONS", "TIOCGSERIAL", "TIOCSSERIAL", "TIOCPKT", "FIONBIO",
	"TIOCNOTTY", "TIOCSETD", "TIOCGETD", "TCSBRKP", [0x27] = "TIOCSBRK",
	"TIOCCBRK", "TIOCGSID", "TCGETS2", "TCSETS2", "TCSETSW2", "TCSETSF2",
	"TIOCGRS485", "TIOCSRS485", "TIOCGPTN", "TIOCSPTLCK",
	"TIOCGDEV||TCGETX", "TCSETX", "TCSETXF", "TCSETXW", "TIOCSIG",
	"TIOCVHANGUP", "TIOCGPKT", "TIOCGPTLCK", "TIOCGEXCL",
	[0x50] = "FIONCLEX", "FIOCLEX", "FIOASYNC", "TIOCSERCONFIG",
	"TIOCSERGWILD", "TIOCSERSWILD", "TIOCGLCKTRMIOS", "TIOCSLCKTRMIOS",
	"TIOCSERGSTRUCT", "TIOCSERGETLSR", "TIOCSERGETMULTI", "TIOCSERSETMULTI",
	"TIOCMIWAIT", "TIOCGICOUNT", [0x60] = "FIOQSIZE",
};

static DEFINE_STRARRAY_OFFSET(tioctls, 0x5401);
#endif /* defined(__i386__) || defined(__x86_64__) */

#ifndef GRND_NONBLOCK
#define GRND_NONBLOCK	0x0001
#endif
#ifndef GRND_RANDOM
#define GRND_RANDOM	0x0002
#endif

/* Beautify getrandom(2)'s flags: RANDOM|NONBLOCK, leftovers in hex. */
static size_t syscall_arg__scnprintf_getrandom_flags(char *bf, size_t size,
						   struct syscall_arg *arg)
{
	int printed = 0, flags = arg->val;

#define	P_FLAG(n) \
	if (flags & GRND_##n) { \
		printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
		flags &= ~GRND_##n; \
	}

	P_FLAG(RANDOM);
	P_FLAG(NONBLOCK);
#undef P_FLAG

	if (flags)
		printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);

	return printed;
}

#define SCA_GETRANDOM_FLAGS syscall_arg__scnprintf_getrandom_flags

/* Shorthand for a syscall_fmt entry whose arg 'arg' is a plain strarray. */
#define STRARRAY(arg, name, array) \
	  .arg_scnprintf = { [arg] = SCA_STRARRAY, }, \
	  .arg_parm	 = { [arg] = &strarray__##array, }

#include "trace/beauty/eventfd.c"
#include "trace/beauty/flock.c"
#include "trace/beauty/futex_op.c"
#include "trace/beauty/mmap.c"
#include "trace/beauty/mode_t.c"
#include "trace/beauty/msg_flags.c"
#include "trace/beauty/open_flags.c"
#include "trace/beauty/perf_event_open.c"
#include "trace/beauty/pid.c"
#include "trace/beauty/sched_policy.c"
#include "trace/beauty/seccomp.c"
#include "trace/beauty/signum.c"
#include "trace/beauty/socket_type.c"
#include "trace/beauty/waitid_options.c"

/*
 * Per-syscall formatting hints: argument beautifiers (by position), return
 * value semantics (errno message / pid / hex / timeout) and display alias.
 * NOTE: the table must stay sorted by .name — syscall_fmt__find() bsearches it.
 */
static struct syscall_fmt {
	const char *name;
	const char *alias;
	size_t	   (*arg_scnprintf[6])(char *bf, size_t size, struct syscall_arg *arg);
	void	   *arg_parm[6];
	bool	   errmsg;	/* negative return is -errno, print its name */
	bool	   errpid;	/* negative return is an error, else a pid */
	bool	   timeout;	/* 0 return means timeout, not error */
	bool	   hexret;	/* print the return value in hex (e.g. mmap) */
} syscall_fmts[] = {
	{ .name	    = "access",	    .errmsg = true,
	  .arg_scnprintf = { [1] = SCA_ACCMODE,  /* mode */ }, },
	{ .name	    = "arch_prctl", .errmsg = true, .alias = "prctl", },
	{ .name	    = "bpf",	    .errmsg = true, STRARRAY(0, cmd, bpf_cmd), },
	{ .name	    = "brk",	    .hexret = true,
	  .arg_scnprintf = { [0] = SCA_HEX, /* brk */ }, },
	{ .name	    = "chdir",	    .errmsg = true, },
	{ .name	    = "chmod",	    .errmsg = true, },
	{ .name	    = "chroot",	    .errmsg = true, },
	{ .name     = "clock_gettime",  .errmsg = true, STRARRAY(0, clk_id, clockid), },
	{ .name	    = "clone",	    .errpid = true, },
	{ .name	    = "close",	    .errmsg = true,
	  .arg_scnprintf = { [0] = SCA_CLOSE_FD, /* fd */ }, },
	{ .name	    = "connect",    .errmsg = true, },
	{ .name	    = "creat",	    .errmsg = true, },
	{ .name	    = "dup",	    .errmsg = true, },
	{ .name	    = "dup2",	    .errmsg = true, },
	{ .name	    = "dup3",	    .errmsg = true, },
	{ .name	    = "epoll_ctl",  .errmsg = true, STRARRAY(1, op, epoll_ctl_ops), },
	{ .name	    = "eventfd2",   .errmsg = true,
	  .arg_scnprintf = { [1] = SCA_EFD_FLAGS, /* flags */ }, },
	{ .name	    = "faccessat",  .errmsg = true, },
	{ .name	    = "fadvise64",  .errmsg = true, },
	{ .name	    = "fallocate",  .errmsg = true, },
	{ .name	    = "fchdir",	    .errmsg = true, },
	{ .name	    = "fchmod",	    .errmsg = true, },
	{ .name	    = "fchmodat",   .errmsg = true,
	  .arg_scnprintf = { [0] = SCA_FDAT, /* fd */ }, },
	{ .name	    = "fchown",	    .errmsg = true, },
	{ .name	    = "fchownat",   .errmsg = true,
	  .arg_scnprintf = { [0] = SCA_FDAT, /* fd */ }, },
	{ .name	    = "fcntl",	    .errmsg = true,
	  .arg_scnprintf = { [1] = SCA_STRARRAY, /* cmd */ },
	  .arg_parm	 = { [1] = &strarray__fcntl_cmds, /* cmd */ }, },
	{ .name	    = "fdatasync",  .errmsg = true, },
	{ .name	    = "flock",	    .errmsg = true,
	  .arg_scnprintf = { [1] = SCA_FLOCK, /* cmd */ }, },
	{ .name	    = "fsetxattr",  .errmsg = true, },
	{ .name	    = "fstat",	    .errmsg = true, .alias = "newfstat", },
	{ .name	    = "fstatat",    .errmsg = true, .alias = "newfstatat", },
	{ .name	    = "fstatfs",    .errmsg = true, },
	{ .name	    = "fsync",    .errmsg = true, },
	{ .name	    = "ftruncate", .errmsg = true, },
	{ .name	    = "futex",	    .errmsg = true,
	  .arg_scnprintf = { [1] = SCA_FUTEX_OP, /* op */ }, },
	{ .name	    = "futimesat", .errmsg = true,
	  .arg_scnprintf = { [0] = SCA_FDAT, /* fd */ }, },
	{ .name	    = "getdents",   .errmsg = true, },
	{ .name	    = "getdents64", .errmsg = true, },
	{ .name	    = "getitimer",  .errmsg = true, STRARRAY(0, which, itimers), },
	{ .name	    = "getpid",	    .errpid = true, },
	{ .name	    = "getpgid",    .errpid = true, },
	{ .name	    = "getppid",    .errpid = true, },
	{ .name	    = "getrandom",  .errmsg = true,
	  .arg_scnprintf = { [2] = SCA_GETRANDOM_FLAGS, /* flags */ }, },
	{ .name	    = "getrlimit",  .errmsg = true, STRARRAY(0, resource, rlimit_resources), },
	{ .name	    = "getxattr",   .errmsg = true, },
	{ .name	    = "inotify_add_watch",	    .errmsg = true, },
	{ .name	    = "ioctl",	    .errmsg = true,
	  .arg_scnprintf = {
#if defined(__i386__) || defined(__x86_64__)
/*
 * FIXME: Make this available to all arches.
 */
			     [1] = SCA_STRHEXARRAY, /* cmd */
			     [2] = SCA_HEX, /* arg */ },
	  .arg_parm	 = { [1] = &strarray__tioctls, /* cmd */ }, },
#else
			     [2] = SCA_HEX, /* arg */ }, },
#endif
	{ .name	    = "keyctl",	    .errmsg = true, STRARRAY(0, option, keyctl_options), },
	{ .name	    = "kill",	    .errmsg = true,
	  .arg_scnprintf = { [1] = SCA_SIGNUM, /* sig */ }, },
	{ .name	    = "lchown",    .errmsg = true, },
	{ .name	    = "lgetxattr",  .errmsg = true, },
	{ .name	    = "linkat",	    .errmsg = true,
	  .arg_scnprintf = { [0] = SCA_FDAT, /* fd */ }, },
	{ .name	    = "listxattr",  .errmsg = true, },
	{ .name	    = "llistxattr", .errmsg = true, },
	{ .name	    = "lremovexattr",  .errmsg = true, },
	{ .name	    = "lseek",	    .errmsg = true,
	  .arg_scnprintf = { [2] = SCA_STRARRAY, /* whence */ },
	  .arg_parm	 = { [2] = &strarray__whences, /* whence */ }, },
	{ .name	    = "lsetxattr",  .errmsg = true, },
	{ .name	    = "lstat",	    .errmsg = true, .alias = "newlstat", },
	{ .name	    = "lsxattr",    .errmsg = true, },
	{ .name     = "madvise",    .errmsg = true,
	  .arg_scnprintf = { [0] = SCA_HEX,	 /* start */
			     [2] = SCA_MADV_BHV, /* behavior */ }, },
	{ .name	    = "mkdir",    .errmsg = true, },
	{ .name	    = "mkdirat",    .errmsg = true,
	  .arg_scnprintf = { [0] = SCA_FDAT, /* fd */ }, },
	{ .name	    = "mknod",      .errmsg = true, },
	{ .name	    = "mknodat",    .errmsg = true,
	  .arg_scnprintf = { [0] = SCA_FDAT, /* fd */ }, },
	{ .name	    = "mlock",	    .errmsg = true,
	  .arg_scnprintf = { [0] = SCA_HEX, /* addr */ }, },
	{ .name	    = "mlockall",   .errmsg = true,
	  .arg_scnprintf = { [0] = SCA_HEX, /* addr */ }, },
	{ .name	    = "mmap",	    .hexret = true,
	  .arg_scnprintf = { [0] = SCA_HEX,	  /* addr */
			     [2] = SCA_MMAP_PROT, /* prot */
			     [3] = SCA_MMAP_FLAGS, /* flags */ }, },
	{ .name	    = "mprotect",   .errmsg = true,
	  .arg_scnprintf = { [0] = SCA_HEX, /* start */
			     [2] = SCA_MMAP_PROT, /* prot */ }, },
	{ .name	    = "mq_unlink", .errmsg = true,
	  .arg_scnprintf = { [0] = SCA_FILENAME, /* u_name */ }, },
	{ .name	    = "mremap",	    .hexret = true,
	  .arg_scnprintf = { [0] = SCA_HEX, /* addr */
			     [3] = SCA_MREMAP_FLAGS, /* flags */
			     [4] = SCA_HEX, /* new_addr */ }, },
	{ .name	    = "munlock",    .errmsg = true,
	  .arg_scnprintf = { [0] = SCA_HEX, /* addr */ }, },
	{ .name	    = "munmap",	    .errmsg = true,
	  .arg_scnprintf = { [0] = SCA_HEX, /* addr */ }, },
	{ .name	    = "name_to_handle_at", .errmsg = true,
	  .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
	{ .name	    = "newfstatat", .errmsg = true,
	  .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
	{ .name	    = "open",	    .errmsg = true,
	  .arg_scnprintf = { [1] = SCA_OPEN_FLAGS, /* flags */ }, },
	{ .name	    = "open_by_handle_at", .errmsg = true,
	  .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */
			     [2] = SCA_OPEN_FLAGS, /* flags */ }, },
	{ .name	    = "openat",	    .errmsg = true,
	  .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */
			     [2] = SCA_OPEN_FLAGS, /* flags */ }, },
	{ .name	    = "perf_event_open", .errmsg = true,
	  .arg_scnprintf = { [2] = SCA_INT, /* cpu */
			     [3] = SCA_FD,  /* group_fd */
			     [4] = SCA_PERF_FLAGS, /* flags */ }, },
	{ .name	    = "pipe2",	    .errmsg = true,
	  .arg_scnprintf = { [1] = SCA_PIPE_FLAGS, /* flags */ }, },
	{ .name	    = "poll",	    .errmsg = true, .timeout = true, },
	{ .name	    = "ppoll",	    .errmsg = true, .timeout = true, },
	{ .name	    = "pread",	    .errmsg = true, .alias = "pread64", },
	{ .name	    = "preadv",	    .errmsg = true, .alias = "pread", },
	{ .name	    = "prlimit64",  .errmsg = true, STRARRAY(1, resource, rlimit_resources), },
	{ .name	    = "pwrite",	    .errmsg = true, .alias = "pwrite64", },
	{ .name	    = "pwritev",    .errmsg = true, },
	{ .name	    = "read",	    .errmsg = true, },
	{ .name	    = "readlink",   .errmsg = true, },
	{ .name	    = "readlinkat", .errmsg = true,
	  .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
	{ .name	    = "readv",	    .errmsg = true, },
	{ .name	    = "recvfrom",   .errmsg = true,
	  .arg_scnprintf = { [3] = SCA_MSG_FLAGS, /* flags */ }, },
	{ .name	    = "recvmmsg",   .errmsg = true,
	  .arg_scnprintf = { [3] = SCA_MSG_FLAGS, /* flags */ }, },
	{ .name	    = "recvmsg",    .errmsg = true,
	  .arg_scnprintf = { [2] = SCA_MSG_FLAGS, /* flags */ }, },
	{ .name	    = "removexattr", .errmsg = true, },
	{ .name	    = "renameat",   .errmsg = true,
	  .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
	{ .name	    = "rmdir",    .errmsg = true, },
	{ .name	    = "rt_sigaction", .errmsg = true,
	  .arg_scnprintf = { [0] = SCA_SIGNUM, /* sig */ }, },
	{ .name	    = "rt_sigprocmask",  .errmsg = true, STRARRAY(0, how, sighow), },
	{ .name	    = "rt_sigqueueinfo", .errmsg = true,
	  .arg_scnprintf = { [1] = SCA_SIGNUM, /* sig */ }, },
	{ .name	    = "rt_tgsigqueueinfo", .errmsg = true,
	  .arg_scnprintf = { [2] = SCA_SIGNUM, /* sig */ }, },
	{ .name	    = "sched_getattr",        .errmsg = true, },
	{ .name	    = "sched_setattr",        .errmsg = true, },
	{ .name	    = "sched_setscheduler",   .errmsg = true,
	  .arg_scnprintf = { [1] = SCA_SCHED_POLICY, /* policy */ }, },
	{ .name	    = "seccomp", .errmsg = true,
	  .arg_scnprintf = { [0] = SCA_SECCOMP_OP, /* op */
			     [1] = SCA_SECCOMP_FLAGS, /* flags */ }, },
	{ .name	    = "select",	    .errmsg = true, .timeout = true, },
	{ .name	    = "sendmmsg",    .errmsg = true,
	  .arg_scnprintf = { [3] = SCA_MSG_FLAGS, /* flags */ }, },
	{ .name	    = "sendmsg",    .errmsg = true,
	  .arg_scnprintf = { [2] = SCA_MSG_FLAGS, /* flags */ }, },
	{ .name	    = "sendto",	    .errmsg = true,
	  .arg_scnprintf = { [3] = SCA_MSG_FLAGS, /* flags */ }, },
	{ .name	    = "set_tid_address", .errpid = true, },
	{ .name	    = "setitimer",  .errmsg = true, STRARRAY(0, which, itimers), },
	{ .name	    = "setpgid",    .errmsg = true, },
	{ .name	    = "setrlimit",  .errmsg = true, STRARRAY(0, resource, rlimit_resources), },
	{ .name	    = "setxattr",   .errmsg = true, },
	{ .name	    = "shutdown",   .errmsg = true, },
	{ .name	    = "socket",	    .errmsg = true,
	  .arg_scnprintf = { [0] = SCA_STRARRAY, /* family */
			     [1] = SCA_SK_TYPE, /* type */ },
	  .arg_parm	 = { [0] = &strarray__socket_families, /* family */ }, },
	{ .name	    = "socketpair", .errmsg = true,
	  .arg_scnprintf = { [0] = SCA_STRARRAY, /* family */
			     [1] = SCA_SK_TYPE, /* type */ },
	  .arg_parm	 = { [0] = &strarray__socket_families, /* family */ }, },
	{ .name	    = "stat",	    .errmsg = true, .alias = "newstat", },
	{ .name	    = "statfs",	    .errmsg = true, },
	{ .name	    = "statx",	    .errmsg = true,
	  .arg_scnprintf = { [0] = SCA_FDAT, /* flags */
			     [2] = SCA_STATX_FLAGS, /* flags */
			     [3] = SCA_STATX_MASK, /* mask */ }, },
	{ .name	    = "swapoff",    .errmsg = true,
	  .arg_scnprintf = { [0] = SCA_FILENAME, /* specialfile */ }, },
	{ .name	    = "swapon",	    .errmsg = true,
	  .arg_scnprintf = { [0] = SCA_FILENAME, /* specialfile */ }, },
	{ .name	    = "symlinkat",  .errmsg = true,
	  .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
	{ .name	    = "tgkill",	    .errmsg = true,
	  .arg_scnprintf = { [2] = SCA_SIGNUM, /* sig */ }, },
	{ .name	    = "tkill",	    .errmsg = true,
	  .arg_scnprintf = { [1] = SCA_SIGNUM, /* sig */ }, },
	{ .name	    = "truncate",   .errmsg = true, },
	{ .name	    = "uname",	    .errmsg = true, .alias = "newuname", },
	{ .name	    = "unlinkat",   .errmsg = true,
	  .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
	{ .name	    = "utime",  .errmsg = true, },
	{ .name	    = "utimensat",  .errmsg = true,
	  .arg_scnprintf = { [0] = SCA_FDAT, /* dirfd */ }, },
	{ .name	    = "utimes",  .errmsg = true, },
	{ .name	    = "vmsplice",  .errmsg = true, },
	{ .name	    = "wait4",	    .errpid = true,
	  .arg_scnprintf = { [2] = SCA_WAITID_OPTIONS, /* options */ }, },
	{ .name	    = "waitid",	    .errpid = true,
	  .arg_scnprintf = { [3] = SCA_WAITID_OPTIONS, /* options */ }, },
	{ .name	    = "write",	    .errmsg = true, },
	{ .name	    = "writev",	    .errmsg = true, },
};

/* bsearch comparator: 'name' key vs a syscall_fmts[] entry. */
static int syscall_fmt__cmp(const void *name, const void *fmtp)
{
	const struct syscall_fmt *fmt = fmtp;
	return strcmp(name, fmt->name);
}

static struct syscall_fmt *syscall_fmt__find(const char *name)
{
	const int nmemb = ARRAY_SIZE(syscall_fmts);
	return bsearch(name, syscall_fmts, nmemb, sizeof(struct syscall_fmt), syscall_fmt__cmp);
}

/* Runtime per-syscall state: resolved tracepoint format plus its fmt entry. */
struct syscall {
	struct event_format *tp_format;
	int		    nr_args;
	struct format_field *args;
	const char	    *name;
	bool		    is_exit;
	struct syscall_fmt  *fmt;
	size_t		    (**arg_scnprintf)(char *bf, size_t size, struct syscall_arg *arg);
	void		    **arg_parm;
};

/*
 * We need to have this 'calculated' boolean because in some cases we really
 * don't know what is the duration of a syscall, for instance, when we start
 * a session and some threads are waiting for a syscall to finish, say 'poll',
 * in which case all we can do is to print "( ? ) for duration and for the
 * start timestamp.
 */
static size_t fprintf_duration(unsigned long t, bool calculated, FILE *fp)
{
	double duration = (double)t / NSEC_PER_MSEC;
	size_t printed = fprintf(fp, "(");

	if (!calculated)
		printed += fprintf(fp, " ? ");
	else if (duration >= 1.0)
		printed += color_fprintf(fp, PERF_COLOR_RED, "%6.3f ms", duration);
	else if (duration >= 0.01)
		printed += color_fprintf(fp, PERF_COLOR_YELLOW, "%6.3f ms", duration);
	else
		printed += color_fprintf(fp, PERF_COLOR_NORMAL, "%6.3f ms", duration);
	return printed + fprintf(fp, "): ");
}

/**
 * filename.ptr: The filename char pointer that will be vfs_getname'd
 * filename.entry_str_pos: Where to insert the string translated from
 *                         filename.ptr by the vfs_getname tracepoint/kprobe.
 */
struct thread_trace {
	u64		  entry_time;
	bool		  entry_pending;
	unsigned long	  nr_events;
	unsigned long	  pfmaj, pfmin;		/* major/minor page fault counts */
	char		  *entry_str;		/* pending sys_enter line, printed at sys_exit */
	double		  runtime_ms;
	struct {
		unsigned long ptr;
		short int     entry_str_pos;
		bool	      pending_open;
		unsigned int  namelen;
		char	      *name;
	} filename;
	struct {
		int	  max;			/* highest fd cached, -1 when table unallocated */
		char	  **table;		/* fd -> pathname cache */
	} paths;

	struct intlist *syscall_stats;
};

static struct thread_trace *thread_trace__new(void)
{
	struct thread_trace *ttrace =  zalloc(sizeof(struct thread_trace));

	if (ttrace)
		ttrace->paths.max = -1;

	/*
	 * NOTE(review): this dereference is unconditional — if zalloc()
	 * failed above, 'ttrace' is NULL here and this is a NULL pointer
	 * dereference.  The assignment should be inside the if () above;
	 * confirm against upstream before relying on OOM behavior.
	 */
	ttrace->syscall_stats = intlist__new(NULL);

	return ttrace;
}

/*
 * Get (allocating on first use) the thread's thread_trace and bump its event
 * count.  Prints a warning and returns NULL when memory is exhausted.
 */
static struct thread_trace *thread__trace(struct thread *thread, FILE *fp)
{
	struct thread_trace *ttrace;

	if (thread == NULL)
		goto fail;

	if (thread__priv(thread) == NULL)
		thread__set_priv(thread, thread_trace__new());

	if (thread__priv(thread) == NULL)
		goto fail;

	ttrace = thread__priv(thread);
	++ttrace->nr_events;

	return ttrace;
fail:
	color_fprintf(fp, PERF_COLOR_RED,
		      "WARNING: not enough memory, dropping samples!\n");
	return NULL;
}

#define TRACE_PFMAJ		(1 << 0)
#define TRACE_PFMIN		(1 << 1)

static const size_t trace__entry_str_size = 2048;

/*
 * Cache 'pathname' for 'fd' in the thread's paths table, growing (and
 * zero-filling the new slots of) the table as needed.  Returns 0/-1.
 */
static int trace__set_fd_pathname(struct thread *thread, int fd, const char *pathname)
{
	struct thread_trace *ttrace = thread__priv(thread);

	if (fd > ttrace->paths.max) {
		char **npath = realloc(ttrace->paths.table, (fd + 1) * sizeof(char *));

		if (npath == NULL)
			return -1;

		if (ttrace->paths.max != -1) {
			memset(npath + ttrace->paths.max + 1, 0,
			       (fd - ttrace->paths.max) * sizeof(char *));
		} else {
			memset(npath, 0, (fd + 1) * sizeof(char *));
		}

		ttrace->paths.table = npath;
		ttrace->paths.max   = fd;
	}

	ttrace->paths.table[fd] = strdup(pathname);

	return ttrace->paths.table[fd] != NULL ? 0 : -1;
}

/* Resolve 'fd' via /proc/<pid>[/task/<tid>]/fd/<fd> readlink and cache it. */
static int thread__read_fd_path(struct thread *thread, int fd)
{
	char linkname[PATH_MAX], pathname[PATH_MAX];
	struct stat st;
	int ret;

	if (thread->pid_ == thread->tid) {
		scnprintf(linkname, sizeof(linkname),
			  "/proc/%d/fd/%d", thread->pid_, fd);
	} else {
		scnprintf(linkname, sizeof(linkname),
			  "/proc/%d/task/%d/fd/%d", thread->pid_, thread->tid, fd);
	}

	if (lstat(linkname, &st) < 0 || st.st_size + 1 > (off_t)sizeof(pathname))
		return -1;

	ret = readlink(linkname, pathname, sizeof(pathname));

	if (ret < 0 || ret > st.st_size)
		return -1;

	pathname[ret] = '\0';
	return trace__set_fd_pathname(thread, fd, pathname);
}

/*
 * Look up the cached pathname for 'fd', resolving it lazily via /proc when
 * tracing live.  Returns NULL when unknown/unresolvable.
 */
static const char *thread__fd_path(struct thread *thread, int fd,
				   struct trace *trace)
{
	struct thread_trace *ttrace = thread__priv(thread);

	if (ttrace == NULL)
		return NULL;

	if (fd < 0)
		return NULL;

	if ((fd > ttrace->paths.max || ttrace->paths.table[fd] == NULL)) {
		if (!trace->live)
			return NULL;
		++trace->stats.proc_getname;
		if (thread__read_fd_path(thread, fd))
			return NULL;
	}

	return ttrace->paths.table[fd];
}

/* Print an fd argument as "N<path>" when the path is known, else just "N". */
static size_t syscall_arg__scnprintf_fd(char *bf, size_t size,
					struct syscall_arg *arg)
{
	int fd = arg->val;
	size_t printed = scnprintf(bf, size, "%d", fd);
	const char *path = thread__fd_path(arg->thread, fd, arg->trace);

	if (path)
		printed += scnprintf(bf + printed, size - printed, "<%s>", path);

	return printed;
}

/* Like SCA_FD, but also evicts the fd from the path cache: it's close(2). */
static size_t syscall_arg__scnprintf_close_fd(char *bf, size_t size,
					      struct syscall_arg *arg)
{
	int fd = arg->val;
	size_t printed = syscall_arg__scnprintf_fd(bf, size, arg);
	struct thread_trace *ttrace = thread__priv(arg->thread);

	if (ttrace && fd >= 0 && fd <= ttrace->paths.max)
		zfree(&ttrace->paths.table[fd]);

	return printed;
}

/*
 * Remember where in entry_str the filename beautification should be spliced
 * once the matching vfs_getname event for 'ptr' arrives.
 */
static void thread__set_filename_pos(struct thread *thread, const char *bf,
				     unsigned long ptr)
{
	struct thread_trace *ttrace = thread__priv(thread);

	ttrace->filename.ptr = ptr;
	ttrace->filename.entry_str_pos = bf - ttrace->entry_str;
}

/*
 * Defer filename printing to the vfs_getname handler when available,
 * otherwise just print the user pointer in hex.
 */
static size_t syscall_arg__scnprintf_filename(char *bf, size_t size,
					      struct syscall_arg *arg)
{
	unsigned long ptr = arg->val;

	if (!arg->trace->vfs_getname)
		return scnprintf(bf, size, "%#x", ptr);

	thread__set_filename_pos(arg->thread, bf, ptr);
	return 0;
}

/* true when the syscall (duration t ns) is below the --duration threshold. */
static bool trace__filter_duration(struct trace *trace, double t)
{
	return t < (trace->duration_filter * NSEC_PER_MSEC);
}

static size_t __trace__fprintf_tstamp(struct trace *trace, u64 tstamp, FILE *fp)
{
	double ts = (double)(tstamp - trace->base_time) / NSEC_PER_MSEC;

	return fprintf(fp, "%10.3f ", ts);
}

/*
 * We're handling tstamp=0 as an undefined tstamp, i.e. like when we are
 * using ttrace->entry_time for a thread that receives a sys_exit without
 * first having received a sys_enter ("poll" issued before tracing session
 * starts, lost sys_enter exit due to ring buffer overflow).
 */
static size_t trace__fprintf_tstamp(struct trace *trace, u64 tstamp, FILE *fp)
{
	if (tstamp > 0)
		return __trace__fprintf_tstamp(trace, tstamp, fp);

	return fprintf(fp, " ? ");
}

static bool done = false;
static bool interrupted = false;

/* SIGINT/SIGCHLD handler: request loop exit; SIGINT also marks 'interrupted'. */
static void sig_handler(int sig)
{
	done = true;
	interrupted = sig == SIGINT;
}

/* Print the "<tstamp> (<duration>): [comm/]tid " prefix of a trace line. */
static size_t trace__fprintf_entry_head(struct trace *trace, struct thread *thread,
					u64 duration, bool duration_calculated, u64 tstamp, FILE *fp)
{
	size_t printed = trace__fprintf_tstamp(trace, tstamp, fp);
	printed += fprintf_duration(duration, duration_calculated, fp);

	if (trace->multiple_threads) {
		if (trace->show_comm)
			printed += fprintf(fp, "%.14s/", thread__comm_str(thread));
		printed += fprintf(fp, "%d ", thread->tid);
	}

	return printed;
}

/*
 * Side-band (non-sample) event handler: report lost events in red, hand
 * everything to the machine layer to keep thread/map state up to date.
 */
static int trace__process_event(struct trace *trace, struct machine *machine,
				union perf_event *event, struct perf_sample *sample)
{
	int ret = 0;

	switch (event->header.type) {
	case PERF_RECORD_LOST:
		color_fprintf(trace->output, PERF_COLOR_RED,
			      "LOST %" PRIu64 " events!\n", event->lost.lost);
		ret = machine__process_lost_event(machine, event, sample);
		break;
	default:
		ret = machine__process_event(machine, event, sample);
		break;
	}

	return ret;
}

/* perf_tool callback adapter for trace__process_event(). */
static int trace__tool_process(struct perf_tool *tool,
			       union perf_event *event,
			       struct perf_sample *sample,
			       struct machine *machine)
{
	struct trace *trace = container_of(tool, struct trace, tool);
	return trace__process_event(trace, machine, event, sample);
}

static char *trace__machine__resolve_kernel_addr(void *vmachine, unsigned long long *addrp, char **modp)
{
	struct machine *machine = vmachine;

	if (machine->kptr_restrict_warned)
		return NULL;
1126 1127 if (symbol_conf.kptr_restrict) { 1128 pr_warning("Kernel address maps (/proc/{kallsyms,modules}) are restricted.\n\n" 1129 "Check /proc/sys/kernel/kptr_restrict.\n\n" 1130 "Kernel samples will not be resolved.\n"); 1131 machine->kptr_restrict_warned = true; 1132 return NULL; 1133 } 1134 1135 return machine__resolve_kernel_addr(vmachine, addrp, modp); 1136 } 1137 1138 static int trace__symbols_init(struct trace *trace, struct perf_evlist *evlist) 1139 { 1140 int err = symbol__init(NULL); 1141 1142 if (err) 1143 return err; 1144 1145 trace->host = machine__new_host(); 1146 if (trace->host == NULL) 1147 return -ENOMEM; 1148 1149 if (trace_event__register_resolver(trace->host, trace__machine__resolve_kernel_addr) < 0) 1150 return -errno; 1151 1152 err = __machine__synthesize_threads(trace->host, &trace->tool, &trace->opts.target, 1153 evlist->threads, trace__tool_process, false, 1154 trace->opts.proc_map_timeout); 1155 if (err) 1156 symbol__exit(); 1157 1158 return err; 1159 } 1160 1161 static int syscall__set_arg_fmts(struct syscall *sc) 1162 { 1163 struct format_field *field; 1164 int idx = 0, len; 1165 1166 sc->arg_scnprintf = calloc(sc->nr_args, sizeof(void *)); 1167 if (sc->arg_scnprintf == NULL) 1168 return -1; 1169 1170 if (sc->fmt) 1171 sc->arg_parm = sc->fmt->arg_parm; 1172 1173 for (field = sc->args; field; field = field->next) { 1174 if (sc->fmt && sc->fmt->arg_scnprintf[idx]) 1175 sc->arg_scnprintf[idx] = sc->fmt->arg_scnprintf[idx]; 1176 else if (strcmp(field->type, "const char *") == 0 && 1177 (strcmp(field->name, "filename") == 0 || 1178 strcmp(field->name, "path") == 0 || 1179 strcmp(field->name, "pathname") == 0)) 1180 sc->arg_scnprintf[idx] = SCA_FILENAME; 1181 else if (field->flags & FIELD_IS_POINTER) 1182 sc->arg_scnprintf[idx] = syscall_arg__scnprintf_hex; 1183 else if (strcmp(field->type, "pid_t") == 0) 1184 sc->arg_scnprintf[idx] = SCA_PID; 1185 else if (strcmp(field->type, "umode_t") == 0) 1186 sc->arg_scnprintf[idx] = SCA_MODE_T; 1187 
else if ((strcmp(field->type, "int") == 0 || 1188 strcmp(field->type, "unsigned int") == 0 || 1189 strcmp(field->type, "long") == 0) && 1190 (len = strlen(field->name)) >= 2 && 1191 strcmp(field->name + len - 2, "fd") == 0) { 1192 /* 1193 * /sys/kernel/tracing/events/syscalls/sys_enter* 1194 * egrep 'field:.*fd;' .../format|sed -r 's/.*field:([a-z ]+) [a-z_]*fd.+/\1/g'|sort|uniq -c 1195 * 65 int 1196 * 23 unsigned int 1197 * 7 unsigned long 1198 */ 1199 sc->arg_scnprintf[idx] = SCA_FD; 1200 } 1201 ++idx; 1202 } 1203 1204 return 0; 1205 } 1206 1207 static int trace__read_syscall_info(struct trace *trace, int id) 1208 { 1209 char tp_name[128]; 1210 struct syscall *sc; 1211 const char *name = syscalltbl__name(trace->sctbl, id); 1212 1213 if (name == NULL) 1214 return -1; 1215 1216 if (id > trace->syscalls.max) { 1217 struct syscall *nsyscalls = realloc(trace->syscalls.table, (id + 1) * sizeof(*sc)); 1218 1219 if (nsyscalls == NULL) 1220 return -1; 1221 1222 if (trace->syscalls.max != -1) { 1223 memset(nsyscalls + trace->syscalls.max + 1, 0, 1224 (id - trace->syscalls.max) * sizeof(*sc)); 1225 } else { 1226 memset(nsyscalls, 0, (id + 1) * sizeof(*sc)); 1227 } 1228 1229 trace->syscalls.table = nsyscalls; 1230 trace->syscalls.max = id; 1231 } 1232 1233 sc = trace->syscalls.table + id; 1234 sc->name = name; 1235 1236 sc->fmt = syscall_fmt__find(sc->name); 1237 1238 snprintf(tp_name, sizeof(tp_name), "sys_enter_%s", sc->name); 1239 sc->tp_format = trace_event__tp_format("syscalls", tp_name); 1240 1241 if (IS_ERR(sc->tp_format) && sc->fmt && sc->fmt->alias) { 1242 snprintf(tp_name, sizeof(tp_name), "sys_enter_%s", sc->fmt->alias); 1243 sc->tp_format = trace_event__tp_format("syscalls", tp_name); 1244 } 1245 1246 if (IS_ERR(sc->tp_format)) 1247 return -1; 1248 1249 sc->args = sc->tp_format->format.fields; 1250 sc->nr_args = sc->tp_format->format.nr_fields; 1251 /* 1252 * We need to check and discard the first variable '__syscall_nr' 1253 * or 'nr' that mean the syscall 
number. It is needless here. 1254 * So drop '__syscall_nr' or 'nr' field but does not exist on older kernels. 1255 */ 1256 if (sc->args && (!strcmp(sc->args->name, "__syscall_nr") || !strcmp(sc->args->name, "nr"))) { 1257 sc->args = sc->args->next; 1258 --sc->nr_args; 1259 } 1260 1261 sc->is_exit = !strcmp(name, "exit_group") || !strcmp(name, "exit"); 1262 1263 return syscall__set_arg_fmts(sc); 1264 } 1265 1266 static int trace__validate_ev_qualifier(struct trace *trace) 1267 { 1268 int err = 0, i; 1269 struct str_node *pos; 1270 1271 trace->ev_qualifier_ids.nr = strlist__nr_entries(trace->ev_qualifier); 1272 trace->ev_qualifier_ids.entries = malloc(trace->ev_qualifier_ids.nr * 1273 sizeof(trace->ev_qualifier_ids.entries[0])); 1274 1275 if (trace->ev_qualifier_ids.entries == NULL) { 1276 fputs("Error:\tNot enough memory for allocating events qualifier ids\n", 1277 trace->output); 1278 err = -EINVAL; 1279 goto out; 1280 } 1281 1282 i = 0; 1283 1284 strlist__for_each_entry(pos, trace->ev_qualifier) { 1285 const char *sc = pos->s; 1286 int id = syscalltbl__id(trace->sctbl, sc); 1287 1288 if (id < 0) { 1289 if (err == 0) { 1290 fputs("Error:\tInvalid syscall ", trace->output); 1291 err = -EINVAL; 1292 } else { 1293 fputs(", ", trace->output); 1294 } 1295 1296 fputs(sc, trace->output); 1297 } 1298 1299 trace->ev_qualifier_ids.entries[i++] = id; 1300 } 1301 1302 if (err < 0) { 1303 fputs("\nHint:\ttry 'perf list syscalls:sys_enter_*'" 1304 "\nHint:\tand: 'man syscalls'\n", trace->output); 1305 zfree(&trace->ev_qualifier_ids.entries); 1306 trace->ev_qualifier_ids.nr = 0; 1307 } 1308 out: 1309 return err; 1310 } 1311 1312 /* 1313 * args is to be interpreted as a series of longs but we need to handle 1314 * 8-byte unaligned accesses. args points to raw_data within the event 1315 * and raw_data is guaranteed to be 8-byte unaligned because it is 1316 * preceded by raw_size which is a u32. So we need to copy args to a temp 1317 * variable to read it. 
Most notably this avoids extended load instructions 1318 * on unaligned addresses 1319 */ 1320 1321 static size_t syscall__scnprintf_args(struct syscall *sc, char *bf, size_t size, 1322 unsigned char *args, struct trace *trace, 1323 struct thread *thread) 1324 { 1325 size_t printed = 0; 1326 unsigned char *p; 1327 unsigned long val; 1328 1329 if (sc->args != NULL) { 1330 struct format_field *field; 1331 u8 bit = 1; 1332 struct syscall_arg arg = { 1333 .idx = 0, 1334 .mask = 0, 1335 .trace = trace, 1336 .thread = thread, 1337 }; 1338 1339 for (field = sc->args; field; 1340 field = field->next, ++arg.idx, bit <<= 1) { 1341 if (arg.mask & bit) 1342 continue; 1343 1344 /* special care for unaligned accesses */ 1345 p = args + sizeof(unsigned long) * arg.idx; 1346 memcpy(&val, p, sizeof(val)); 1347 1348 /* 1349 * Suppress this argument if its value is zero and 1350 * and we don't have a string associated in an 1351 * strarray for it. 1352 */ 1353 if (val == 0 && 1354 !(sc->arg_scnprintf && 1355 sc->arg_scnprintf[arg.idx] == SCA_STRARRAY && 1356 sc->arg_parm[arg.idx])) 1357 continue; 1358 1359 printed += scnprintf(bf + printed, size - printed, 1360 "%s%s: ", printed ? ", " : "", field->name); 1361 if (sc->arg_scnprintf && sc->arg_scnprintf[arg.idx]) { 1362 arg.val = val; 1363 if (sc->arg_parm) 1364 arg.parm = sc->arg_parm[arg.idx]; 1365 printed += sc->arg_scnprintf[arg.idx](bf + printed, 1366 size - printed, &arg); 1367 } else { 1368 printed += scnprintf(bf + printed, size - printed, 1369 "%ld", val); 1370 } 1371 } 1372 } else if (IS_ERR(sc->tp_format)) { 1373 /* 1374 * If we managed to read the tracepoint /format file, then we 1375 * may end up not having any args, like with gettid(), so only 1376 * print the raw args when we didn't manage to read it. 
1377 */ 1378 int i = 0; 1379 1380 while (i < 6) { 1381 /* special care for unaligned accesses */ 1382 p = args + sizeof(unsigned long) * i; 1383 memcpy(&val, p, sizeof(val)); 1384 printed += scnprintf(bf + printed, size - printed, 1385 "%sarg%d: %ld", 1386 printed ? ", " : "", i, val); 1387 ++i; 1388 } 1389 } 1390 1391 return printed; 1392 } 1393 1394 typedef int (*tracepoint_handler)(struct trace *trace, struct perf_evsel *evsel, 1395 union perf_event *event, 1396 struct perf_sample *sample); 1397 1398 static struct syscall *trace__syscall_info(struct trace *trace, 1399 struct perf_evsel *evsel, int id) 1400 { 1401 1402 if (id < 0) { 1403 1404 /* 1405 * XXX: Noticed on x86_64, reproduced as far back as 3.0.36, haven't tried 1406 * before that, leaving at a higher verbosity level till that is 1407 * explained. Reproduced with plain ftrace with: 1408 * 1409 * echo 1 > /t/events/raw_syscalls/sys_exit/enable 1410 * grep "NR -1 " /t/trace_pipe 1411 * 1412 * After generating some load on the machine. 
1413 */ 1414 if (verbose > 1) { 1415 static u64 n; 1416 fprintf(trace->output, "Invalid syscall %d id, skipping (%s, %" PRIu64 ") ...\n", 1417 id, perf_evsel__name(evsel), ++n); 1418 } 1419 return NULL; 1420 } 1421 1422 if ((id > trace->syscalls.max || trace->syscalls.table[id].name == NULL) && 1423 trace__read_syscall_info(trace, id)) 1424 goto out_cant_read; 1425 1426 if ((id > trace->syscalls.max || trace->syscalls.table[id].name == NULL)) 1427 goto out_cant_read; 1428 1429 return &trace->syscalls.table[id]; 1430 1431 out_cant_read: 1432 if (verbose > 0) { 1433 fprintf(trace->output, "Problems reading syscall %d", id); 1434 if (id <= trace->syscalls.max && trace->syscalls.table[id].name != NULL) 1435 fprintf(trace->output, "(%s)", trace->syscalls.table[id].name); 1436 fputs(" information\n", trace->output); 1437 } 1438 return NULL; 1439 } 1440 1441 static void thread__update_stats(struct thread_trace *ttrace, 1442 int id, struct perf_sample *sample) 1443 { 1444 struct int_node *inode; 1445 struct stats *stats; 1446 u64 duration = 0; 1447 1448 inode = intlist__findnew(ttrace->syscall_stats, id); 1449 if (inode == NULL) 1450 return; 1451 1452 stats = inode->priv; 1453 if (stats == NULL) { 1454 stats = malloc(sizeof(struct stats)); 1455 if (stats == NULL) 1456 return; 1457 init_stats(stats); 1458 inode->priv = stats; 1459 } 1460 1461 if (ttrace->entry_time && sample->time > ttrace->entry_time) 1462 duration = sample->time - ttrace->entry_time; 1463 1464 update_stats(stats, duration); 1465 } 1466 1467 static int trace__printf_interrupted_entry(struct trace *trace, struct perf_sample *sample) 1468 { 1469 struct thread_trace *ttrace; 1470 u64 duration; 1471 size_t printed; 1472 1473 if (trace->current == NULL) 1474 return 0; 1475 1476 ttrace = thread__priv(trace->current); 1477 1478 if (!ttrace->entry_pending) 1479 return 0; 1480 1481 duration = sample->time - ttrace->entry_time; 1482 1483 printed = trace__fprintf_entry_head(trace, trace->current, duration, true, 
ttrace->entry_time, trace->output); 1484 printed += fprintf(trace->output, "%-70s) ...\n", ttrace->entry_str); 1485 ttrace->entry_pending = false; 1486 1487 return printed; 1488 } 1489 1490 static int trace__sys_enter(struct trace *trace, struct perf_evsel *evsel, 1491 union perf_event *event __maybe_unused, 1492 struct perf_sample *sample) 1493 { 1494 char *msg; 1495 void *args; 1496 size_t printed = 0; 1497 struct thread *thread; 1498 int id = perf_evsel__sc_tp_uint(evsel, id, sample), err = -1; 1499 struct syscall *sc = trace__syscall_info(trace, evsel, id); 1500 struct thread_trace *ttrace; 1501 1502 if (sc == NULL) 1503 return -1; 1504 1505 thread = machine__findnew_thread(trace->host, sample->pid, sample->tid); 1506 ttrace = thread__trace(thread, trace->output); 1507 if (ttrace == NULL) 1508 goto out_put; 1509 1510 args = perf_evsel__sc_tp_ptr(evsel, args, sample); 1511 1512 if (ttrace->entry_str == NULL) { 1513 ttrace->entry_str = malloc(trace__entry_str_size); 1514 if (!ttrace->entry_str) 1515 goto out_put; 1516 } 1517 1518 if (!(trace->duration_filter || trace->summary_only || trace->min_stack)) 1519 trace__printf_interrupted_entry(trace, sample); 1520 1521 ttrace->entry_time = sample->time; 1522 msg = ttrace->entry_str; 1523 printed += scnprintf(msg + printed, trace__entry_str_size - printed, "%s(", sc->name); 1524 1525 printed += syscall__scnprintf_args(sc, msg + printed, trace__entry_str_size - printed, 1526 args, trace, thread); 1527 1528 if (sc->is_exit) { 1529 if (!(trace->duration_filter || trace->summary_only || trace->min_stack)) { 1530 trace__fprintf_entry_head(trace, thread, 0, false, ttrace->entry_time, trace->output); 1531 fprintf(trace->output, "%-70s)\n", ttrace->entry_str); 1532 } 1533 } else { 1534 ttrace->entry_pending = true; 1535 /* See trace__vfs_getname & trace__sys_exit */ 1536 ttrace->filename.pending_open = false; 1537 } 1538 1539 if (trace->current != thread) { 1540 thread__put(trace->current); 1541 trace->current = 
thread__get(thread); 1542 } 1543 err = 0; 1544 out_put: 1545 thread__put(thread); 1546 return err; 1547 } 1548 1549 static int trace__resolve_callchain(struct trace *trace, struct perf_evsel *evsel, 1550 struct perf_sample *sample, 1551 struct callchain_cursor *cursor) 1552 { 1553 struct addr_location al; 1554 1555 if (machine__resolve(trace->host, &al, sample) < 0 || 1556 thread__resolve_callchain(al.thread, cursor, evsel, sample, NULL, NULL, trace->max_stack)) 1557 return -1; 1558 1559 return 0; 1560 } 1561 1562 static int trace__fprintf_callchain(struct trace *trace, struct perf_sample *sample) 1563 { 1564 /* TODO: user-configurable print_opts */ 1565 const unsigned int print_opts = EVSEL__PRINT_SYM | 1566 EVSEL__PRINT_DSO | 1567 EVSEL__PRINT_UNKNOWN_AS_ADDR; 1568 1569 return sample__fprintf_callchain(sample, 38, print_opts, &callchain_cursor, trace->output); 1570 } 1571 1572 static int trace__sys_exit(struct trace *trace, struct perf_evsel *evsel, 1573 union perf_event *event __maybe_unused, 1574 struct perf_sample *sample) 1575 { 1576 long ret; 1577 u64 duration = 0; 1578 bool duration_calculated = false; 1579 struct thread *thread; 1580 int id = perf_evsel__sc_tp_uint(evsel, id, sample), err = -1, callchain_ret = 0; 1581 struct syscall *sc = trace__syscall_info(trace, evsel, id); 1582 struct thread_trace *ttrace; 1583 1584 if (sc == NULL) 1585 return -1; 1586 1587 thread = machine__findnew_thread(trace->host, sample->pid, sample->tid); 1588 ttrace = thread__trace(thread, trace->output); 1589 if (ttrace == NULL) 1590 goto out_put; 1591 1592 if (trace->summary) 1593 thread__update_stats(ttrace, id, sample); 1594 1595 ret = perf_evsel__sc_tp_uint(evsel, ret, sample); 1596 1597 if (id == trace->open_id && ret >= 0 && ttrace->filename.pending_open) { 1598 trace__set_fd_pathname(thread, ret, ttrace->filename.name); 1599 ttrace->filename.pending_open = false; 1600 ++trace->stats.vfs_getname; 1601 } 1602 1603 if (ttrace->entry_time) { 1604 duration = sample->time - 
ttrace->entry_time; 1605 if (trace__filter_duration(trace, duration)) 1606 goto out; 1607 duration_calculated = true; 1608 } else if (trace->duration_filter) 1609 goto out; 1610 1611 if (sample->callchain) { 1612 callchain_ret = trace__resolve_callchain(trace, evsel, sample, &callchain_cursor); 1613 if (callchain_ret == 0) { 1614 if (callchain_cursor.nr < trace->min_stack) 1615 goto out; 1616 callchain_ret = 1; 1617 } 1618 } 1619 1620 if (trace->summary_only) 1621 goto out; 1622 1623 trace__fprintf_entry_head(trace, thread, duration, duration_calculated, ttrace->entry_time, trace->output); 1624 1625 if (ttrace->entry_pending) { 1626 fprintf(trace->output, "%-70s", ttrace->entry_str); 1627 } else { 1628 fprintf(trace->output, " ... ["); 1629 color_fprintf(trace->output, PERF_COLOR_YELLOW, "continued"); 1630 fprintf(trace->output, "]: %s()", sc->name); 1631 } 1632 1633 if (sc->fmt == NULL) { 1634 signed_print: 1635 fprintf(trace->output, ") = %ld", ret); 1636 } else if (ret < 0 && (sc->fmt->errmsg || sc->fmt->errpid)) { 1637 char bf[STRERR_BUFSIZE]; 1638 const char *emsg = str_error_r(-ret, bf, sizeof(bf)), 1639 *e = audit_errno_to_name(-ret); 1640 1641 fprintf(trace->output, ") = -1 %s %s", e, emsg); 1642 } else if (ret == 0 && sc->fmt->timeout) 1643 fprintf(trace->output, ") = 0 Timeout"); 1644 else if (sc->fmt->hexret) 1645 fprintf(trace->output, ") = %#lx", ret); 1646 else if (sc->fmt->errpid) { 1647 struct thread *child = machine__find_thread(trace->host, ret, ret); 1648 1649 if (child != NULL) { 1650 fprintf(trace->output, ") = %ld", ret); 1651 if (child->comm_set) 1652 fprintf(trace->output, " (%s)", thread__comm_str(child)); 1653 thread__put(child); 1654 } 1655 } else 1656 goto signed_print; 1657 1658 fputc('\n', trace->output); 1659 1660 if (callchain_ret > 0) 1661 trace__fprintf_callchain(trace, sample); 1662 else if (callchain_ret < 0) 1663 pr_err("Problem processing %s callchain, skipping...\n", perf_evsel__name(evsel)); 1664 out: 1665 
ttrace->entry_pending = false; 1666 err = 0; 1667 out_put: 1668 thread__put(thread); 1669 return err; 1670 } 1671 1672 static int trace__vfs_getname(struct trace *trace, struct perf_evsel *evsel, 1673 union perf_event *event __maybe_unused, 1674 struct perf_sample *sample) 1675 { 1676 struct thread *thread = machine__findnew_thread(trace->host, sample->pid, sample->tid); 1677 struct thread_trace *ttrace; 1678 size_t filename_len, entry_str_len, to_move; 1679 ssize_t remaining_space; 1680 char *pos; 1681 const char *filename = perf_evsel__rawptr(evsel, sample, "pathname"); 1682 1683 if (!thread) 1684 goto out; 1685 1686 ttrace = thread__priv(thread); 1687 if (!ttrace) 1688 goto out_put; 1689 1690 filename_len = strlen(filename); 1691 if (filename_len == 0) 1692 goto out_put; 1693 1694 if (ttrace->filename.namelen < filename_len) { 1695 char *f = realloc(ttrace->filename.name, filename_len + 1); 1696 1697 if (f == NULL) 1698 goto out_put; 1699 1700 ttrace->filename.namelen = filename_len; 1701 ttrace->filename.name = f; 1702 } 1703 1704 strcpy(ttrace->filename.name, filename); 1705 ttrace->filename.pending_open = true; 1706 1707 if (!ttrace->filename.ptr) 1708 goto out_put; 1709 1710 entry_str_len = strlen(ttrace->entry_str); 1711 remaining_space = trace__entry_str_size - entry_str_len - 1; /* \0 */ 1712 if (remaining_space <= 0) 1713 goto out_put; 1714 1715 if (filename_len > (size_t)remaining_space) { 1716 filename += filename_len - remaining_space; 1717 filename_len = remaining_space; 1718 } 1719 1720 to_move = entry_str_len - ttrace->filename.entry_str_pos + 1; /* \0 */ 1721 pos = ttrace->entry_str + ttrace->filename.entry_str_pos; 1722 memmove(pos + filename_len, pos, to_move); 1723 memcpy(pos, filename, filename_len); 1724 1725 ttrace->filename.ptr = 0; 1726 ttrace->filename.entry_str_pos = 0; 1727 out_put: 1728 thread__put(thread); 1729 out: 1730 return 0; 1731 } 1732 1733 static int trace__sched_stat_runtime(struct trace *trace, struct perf_evsel *evsel, 1734 
union perf_event *event __maybe_unused, 1735 struct perf_sample *sample) 1736 { 1737 u64 runtime = perf_evsel__intval(evsel, sample, "runtime"); 1738 double runtime_ms = (double)runtime / NSEC_PER_MSEC; 1739 struct thread *thread = machine__findnew_thread(trace->host, 1740 sample->pid, 1741 sample->tid); 1742 struct thread_trace *ttrace = thread__trace(thread, trace->output); 1743 1744 if (ttrace == NULL) 1745 goto out_dump; 1746 1747 ttrace->runtime_ms += runtime_ms; 1748 trace->runtime_ms += runtime_ms; 1749 out_put: 1750 thread__put(thread); 1751 return 0; 1752 1753 out_dump: 1754 fprintf(trace->output, "%s: comm=%s,pid=%u,runtime=%" PRIu64 ",vruntime=%" PRIu64 ")\n", 1755 evsel->name, 1756 perf_evsel__strval(evsel, sample, "comm"), 1757 (pid_t)perf_evsel__intval(evsel, sample, "pid"), 1758 runtime, 1759 perf_evsel__intval(evsel, sample, "vruntime")); 1760 goto out_put; 1761 } 1762 1763 static void bpf_output__printer(enum binary_printer_ops op, 1764 unsigned int val, void *extra) 1765 { 1766 FILE *output = extra; 1767 unsigned char ch = (unsigned char)val; 1768 1769 switch (op) { 1770 case BINARY_PRINT_CHAR_DATA: 1771 fprintf(output, "%c", isprint(ch) ? 
ch : '.'); 1772 break; 1773 case BINARY_PRINT_DATA_BEGIN: 1774 case BINARY_PRINT_LINE_BEGIN: 1775 case BINARY_PRINT_ADDR: 1776 case BINARY_PRINT_NUM_DATA: 1777 case BINARY_PRINT_NUM_PAD: 1778 case BINARY_PRINT_SEP: 1779 case BINARY_PRINT_CHAR_PAD: 1780 case BINARY_PRINT_LINE_END: 1781 case BINARY_PRINT_DATA_END: 1782 default: 1783 break; 1784 } 1785 } 1786 1787 static void bpf_output__fprintf(struct trace *trace, 1788 struct perf_sample *sample) 1789 { 1790 print_binary(sample->raw_data, sample->raw_size, 8, 1791 bpf_output__printer, trace->output); 1792 } 1793 1794 static int trace__event_handler(struct trace *trace, struct perf_evsel *evsel, 1795 union perf_event *event __maybe_unused, 1796 struct perf_sample *sample) 1797 { 1798 int callchain_ret = 0; 1799 1800 if (sample->callchain) { 1801 callchain_ret = trace__resolve_callchain(trace, evsel, sample, &callchain_cursor); 1802 if (callchain_ret == 0) { 1803 if (callchain_cursor.nr < trace->min_stack) 1804 goto out; 1805 callchain_ret = 1; 1806 } 1807 } 1808 1809 trace__printf_interrupted_entry(trace, sample); 1810 trace__fprintf_tstamp(trace, sample->time, trace->output); 1811 1812 if (trace->trace_syscalls) 1813 fprintf(trace->output, "( ): "); 1814 1815 fprintf(trace->output, "%s:", evsel->name); 1816 1817 if (perf_evsel__is_bpf_output(evsel)) { 1818 bpf_output__fprintf(trace, sample); 1819 } else if (evsel->tp_format) { 1820 event_format__fprintf(evsel->tp_format, sample->cpu, 1821 sample->raw_data, sample->raw_size, 1822 trace->output); 1823 } 1824 1825 fprintf(trace->output, ")\n"); 1826 1827 if (callchain_ret > 0) 1828 trace__fprintf_callchain(trace, sample); 1829 else if (callchain_ret < 0) 1830 pr_err("Problem processing %s callchain, skipping...\n", perf_evsel__name(evsel)); 1831 out: 1832 return 0; 1833 } 1834 1835 static void print_location(FILE *f, struct perf_sample *sample, 1836 struct addr_location *al, 1837 bool print_dso, bool print_sym) 1838 { 1839 1840 if ((verbose > 0 || print_dso) && 
al->map) 1841 fprintf(f, "%s@", al->map->dso->long_name); 1842 1843 if ((verbose > 0 || print_sym) && al->sym) 1844 fprintf(f, "%s+0x%" PRIx64, al->sym->name, 1845 al->addr - al->sym->start); 1846 else if (al->map) 1847 fprintf(f, "0x%" PRIx64, al->addr); 1848 else 1849 fprintf(f, "0x%" PRIx64, sample->addr); 1850 } 1851 1852 static int trace__pgfault(struct trace *trace, 1853 struct perf_evsel *evsel, 1854 union perf_event *event __maybe_unused, 1855 struct perf_sample *sample) 1856 { 1857 struct thread *thread; 1858 struct addr_location al; 1859 char map_type = 'd'; 1860 struct thread_trace *ttrace; 1861 int err = -1; 1862 int callchain_ret = 0; 1863 1864 thread = machine__findnew_thread(trace->host, sample->pid, sample->tid); 1865 1866 if (sample->callchain) { 1867 callchain_ret = trace__resolve_callchain(trace, evsel, sample, &callchain_cursor); 1868 if (callchain_ret == 0) { 1869 if (callchain_cursor.nr < trace->min_stack) 1870 goto out_put; 1871 callchain_ret = 1; 1872 } 1873 } 1874 1875 ttrace = thread__trace(thread, trace->output); 1876 if (ttrace == NULL) 1877 goto out_put; 1878 1879 if (evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ) 1880 ttrace->pfmaj++; 1881 else 1882 ttrace->pfmin++; 1883 1884 if (trace->summary_only) 1885 goto out; 1886 1887 thread__find_addr_location(thread, sample->cpumode, MAP__FUNCTION, 1888 sample->ip, &al); 1889 1890 trace__fprintf_entry_head(trace, thread, 0, true, sample->time, trace->output); 1891 1892 fprintf(trace->output, "%sfault [", 1893 evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ ? 
1894 "maj" : "min"); 1895 1896 print_location(trace->output, sample, &al, false, true); 1897 1898 fprintf(trace->output, "] => "); 1899 1900 thread__find_addr_location(thread, sample->cpumode, MAP__VARIABLE, 1901 sample->addr, &al); 1902 1903 if (!al.map) { 1904 thread__find_addr_location(thread, sample->cpumode, 1905 MAP__FUNCTION, sample->addr, &al); 1906 1907 if (al.map) 1908 map_type = 'x'; 1909 else 1910 map_type = '?'; 1911 } 1912 1913 print_location(trace->output, sample, &al, true, false); 1914 1915 fprintf(trace->output, " (%c%c)\n", map_type, al.level); 1916 1917 if (callchain_ret > 0) 1918 trace__fprintf_callchain(trace, sample); 1919 else if (callchain_ret < 0) 1920 pr_err("Problem processing %s callchain, skipping...\n", perf_evsel__name(evsel)); 1921 out: 1922 err = 0; 1923 out_put: 1924 thread__put(thread); 1925 return err; 1926 } 1927 1928 static void trace__set_base_time(struct trace *trace, 1929 struct perf_evsel *evsel, 1930 struct perf_sample *sample) 1931 { 1932 /* 1933 * BPF events were not setting PERF_SAMPLE_TIME, so be more robust 1934 * and don't use sample->time unconditionally, we may end up having 1935 * some other event in the future without PERF_SAMPLE_TIME for good 1936 * reason, i.e. we may not be interested in its timestamps, just in 1937 * it taking place, picking some piece of information when it 1938 * appears in our event stream (vfs_getname comes to mind). 
1939 */ 1940 if (trace->base_time == 0 && !trace->full_time && 1941 (evsel->attr.sample_type & PERF_SAMPLE_TIME)) 1942 trace->base_time = sample->time; 1943 } 1944 1945 static int trace__process_sample(struct perf_tool *tool, 1946 union perf_event *event, 1947 struct perf_sample *sample, 1948 struct perf_evsel *evsel, 1949 struct machine *machine __maybe_unused) 1950 { 1951 struct trace *trace = container_of(tool, struct trace, tool); 1952 struct thread *thread; 1953 int err = 0; 1954 1955 tracepoint_handler handler = evsel->handler; 1956 1957 thread = machine__findnew_thread(trace->host, sample->pid, sample->tid); 1958 if (thread && thread__is_filtered(thread)) 1959 goto out; 1960 1961 trace__set_base_time(trace, evsel, sample); 1962 1963 if (handler) { 1964 ++trace->nr_events; 1965 handler(trace, evsel, event, sample); 1966 } 1967 out: 1968 thread__put(thread); 1969 return err; 1970 } 1971 1972 static int trace__record(struct trace *trace, int argc, const char **argv) 1973 { 1974 unsigned int rec_argc, i, j; 1975 const char **rec_argv; 1976 const char * const record_args[] = { 1977 "record", 1978 "-R", 1979 "-m", "1024", 1980 "-c", "1", 1981 }; 1982 1983 const char * const sc_args[] = { "-e", }; 1984 unsigned int sc_args_nr = ARRAY_SIZE(sc_args); 1985 const char * const majpf_args[] = { "-e", "major-faults" }; 1986 unsigned int majpf_args_nr = ARRAY_SIZE(majpf_args); 1987 const char * const minpf_args[] = { "-e", "minor-faults" }; 1988 unsigned int minpf_args_nr = ARRAY_SIZE(minpf_args); 1989 1990 /* +1 is for the event string below */ 1991 rec_argc = ARRAY_SIZE(record_args) + sc_args_nr + 1 + 1992 majpf_args_nr + minpf_args_nr + argc; 1993 rec_argv = calloc(rec_argc + 1, sizeof(char *)); 1994 1995 if (rec_argv == NULL) 1996 return -ENOMEM; 1997 1998 j = 0; 1999 for (i = 0; i < ARRAY_SIZE(record_args); i++) 2000 rec_argv[j++] = record_args[i]; 2001 2002 if (trace->trace_syscalls) { 2003 for (i = 0; i < sc_args_nr; i++) 2004 rec_argv[j++] = sc_args[i]; 2005 2006 
/* event string may be different for older kernels - e.g., RHEL6 */ 2007 if (is_valid_tracepoint("raw_syscalls:sys_enter")) 2008 rec_argv[j++] = "raw_syscalls:sys_enter,raw_syscalls:sys_exit"; 2009 else if (is_valid_tracepoint("syscalls:sys_enter")) 2010 rec_argv[j++] = "syscalls:sys_enter,syscalls:sys_exit"; 2011 else { 2012 pr_err("Neither raw_syscalls nor syscalls events exist.\n"); 2013 return -1; 2014 } 2015 } 2016 2017 if (trace->trace_pgfaults & TRACE_PFMAJ) 2018 for (i = 0; i < majpf_args_nr; i++) 2019 rec_argv[j++] = majpf_args[i]; 2020 2021 if (trace->trace_pgfaults & TRACE_PFMIN) 2022 for (i = 0; i < minpf_args_nr; i++) 2023 rec_argv[j++] = minpf_args[i]; 2024 2025 for (i = 0; i < (unsigned int)argc; i++) 2026 rec_argv[j++] = argv[i]; 2027 2028 return cmd_record(j, rec_argv); 2029 } 2030 2031 static size_t trace__fprintf_thread_summary(struct trace *trace, FILE *fp); 2032 2033 static bool perf_evlist__add_vfs_getname(struct perf_evlist *evlist) 2034 { 2035 struct perf_evsel *evsel = perf_evsel__newtp("probe", "vfs_getname"); 2036 2037 if (IS_ERR(evsel)) 2038 return false; 2039 2040 if (perf_evsel__field(evsel, "pathname") == NULL) { 2041 perf_evsel__delete(evsel); 2042 return false; 2043 } 2044 2045 evsel->handler = trace__vfs_getname; 2046 perf_evlist__add(evlist, evsel); 2047 return true; 2048 } 2049 2050 static struct perf_evsel *perf_evsel__new_pgfault(u64 config) 2051 { 2052 struct perf_evsel *evsel; 2053 struct perf_event_attr attr = { 2054 .type = PERF_TYPE_SOFTWARE, 2055 .mmap_data = 1, 2056 }; 2057 2058 attr.config = config; 2059 attr.sample_period = 1; 2060 2061 event_attr_init(&attr); 2062 2063 evsel = perf_evsel__new(&attr); 2064 if (evsel) 2065 evsel->handler = trace__pgfault; 2066 2067 return evsel; 2068 } 2069 2070 static void trace__handle_event(struct trace *trace, union perf_event *event, struct perf_sample *sample) 2071 { 2072 const u32 type = event->header.type; 2073 struct perf_evsel *evsel; 2074 2075 if (type != PERF_RECORD_SAMPLE) 
	{
		trace__process_event(trace, trace->host, event, sample);
		return;
	}

	evsel = perf_evlist__id2evsel(trace->evlist, sample->id);
	if (evsel == NULL) {
		fprintf(trace->output, "Unknown tp ID %" PRIu64 ", skipping...\n", sample->id);
		return;
	}

	trace__set_base_time(trace, evsel, sample);

	/* a tracepoint sample without raw payload cannot be decoded; warn and skip */
	if (evsel->attr.type == PERF_TYPE_TRACEPOINT &&
	    sample->raw_data == NULL) {
		fprintf(trace->output, "%s sample with no payload for tid: %d, cpu %d, raw_size=%d, skipping...\n",
			perf_evsel__name(evsel), sample->tid,
			sample->cpu, sample->raw_size);
	} else {
		tracepoint_handler handler = evsel->handler;
		handler(trace, evsel, event, sample);
	}
}

/*
 * Create and add the raw_syscalls:sys_enter/sys_exit tracepoint evsels,
 * caching the offsets of the "args" and "ret" payload fields.  On any
 * failure everything allocated so far is torn down and -1 is returned.
 */
static int trace__add_syscall_newtp(struct trace *trace)
{
	int ret = -1;
	struct perf_evlist *evlist = trace->evlist;
	struct perf_evsel *sys_enter, *sys_exit;

	sys_enter = perf_evsel__syscall_newtp("sys_enter", trace__sys_enter);
	if (sys_enter == NULL)
		goto out;

	if (perf_evsel__init_sc_tp_ptr_field(sys_enter, args))
		goto out_delete_sys_enter;

	sys_exit = perf_evsel__syscall_newtp("sys_exit", trace__sys_exit);
	if (sys_exit == NULL)
		goto out_delete_sys_enter;

	if (perf_evsel__init_sc_tp_uint_field(sys_exit, ret))
		goto out_delete_sys_exit;

	perf_evlist__add(evlist, sys_enter);
	perf_evlist__add(evlist, sys_exit);

	if (callchain_param.enabled && !trace->kernel_syscallchains) {
		/*
		 * We're interested only in the user space callchain
		 * leading to the syscall, allow overriding that for
		 * debugging reasons using --kernel_syscall_callchains
		 */
		sys_exit->attr.exclude_callchain_kernel = 1;
	}

	trace->syscalls.events.sys_enter = sys_enter;
	trace->syscalls.events.sys_exit = sys_exit;

	ret = 0;
out:
	return ret;

out_delete_sys_exit:
	perf_evsel__delete_priv(sys_exit);
out_delete_sys_enter:
	perf_evsel__delete_priv(sys_enter);
	goto out;
}

/*
 * Translate the -e syscall qualifier list into a tracepoint "id in (...)"
 * (or "id not in (...)") filter and append it to both the sys_enter and
 * sys_exit evsels.  Returns 0 on success, -1 with errno set on ENOMEM.
 */
static int trace__set_ev_qualifier_filter(struct trace *trace)
{
	int err = -1;
	struct perf_evsel *sys_exit;
	char *filter = asprintf_expr_inout_ints("id", !trace->not_ev_qualifier,
						trace->ev_qualifier_ids.nr,
						trace->ev_qualifier_ids.entries);

	if (filter == NULL)
		goto out_enomem;

	/* append_tp_filter() returning 0 means success, then do sys_exit too */
	if (!perf_evsel__append_tp_filter(trace->syscalls.events.sys_enter,
					  filter)) {
		sys_exit = trace->syscalls.events.sys_exit;
		err = perf_evsel__append_tp_filter(sys_exit, filter);
	}

	free(filter);
out:
	return err;
out_enomem:
	errno = ENOMEM;
	goto out;
}

/*
 * Live tracing main loop: set up all requested evsels (syscalls, vfs_getname,
 * pagefaults, sched), open/mmap the evlist, optionally fork the workload,
 * then consume ring buffer events until interrupted or drained.
 */
static int trace__run(struct trace *trace, int argc, const char **argv)
{
	struct perf_evlist *evlist = trace->evlist;
	struct perf_evsel *evsel, *pgfault_maj = NULL, *pgfault_min = NULL;
	int err = -1, i;
	unsigned long before;
	const bool forks = argc > 0;
	bool draining = false;

	trace->live = true;

	if (trace->trace_syscalls && trace__add_syscall_newtp(trace))
		goto out_error_raw_syscalls;

	if (trace->trace_syscalls)
		trace->vfs_getname = perf_evlist__add_vfs_getname(evlist);

	if ((trace->trace_pgfaults & TRACE_PFMAJ)) {
		pgfault_maj = perf_evsel__new_pgfault(PERF_COUNT_SW_PAGE_FAULTS_MAJ);
		if (pgfault_maj == NULL)
			goto out_error_mem;
		perf_evlist__add(evlist, pgfault_maj);
	}

	if ((trace->trace_pgfaults & TRACE_PFMIN)) {
		pgfault_min = perf_evsel__new_pgfault(PERF_COUNT_SW_PAGE_FAULTS_MIN);
		if (pgfault_min == NULL)
			goto out_error_mem;
		perf_evlist__add(evlist, pgfault_min);
	}

	if (trace->sched &&
	    perf_evlist__add_newtp(evlist, "sched", "sched_stat_runtime",
				   trace__sched_stat_runtime))
		goto out_error_sched_stat_runtime;

	err =
perf_evlist__create_maps(evlist, &trace->opts.target);
	if (err < 0) {
		fprintf(trace->output, "Problems parsing the target to trace, check your options!\n");
		goto out_delete_evlist;
	}

	err = trace__symbols_init(trace, evlist);
	if (err < 0) {
		fprintf(trace->output, "Problems initializing symbol libraries!\n");
		goto out_delete_evlist;
	}

	perf_evlist__config(evlist, &trace->opts, NULL);

	if (callchain_param.enabled) {
		bool use_identifier = false;

		if (trace->syscalls.events.sys_exit) {
			perf_evsel__config_callchain(trace->syscalls.events.sys_exit,
						     &trace->opts, &callchain_param);
			use_identifier = true;
		}

		if (pgfault_maj) {
			perf_evsel__config_callchain(pgfault_maj, &trace->opts, &callchain_param);
			use_identifier = true;
		}

		if (pgfault_min) {
			perf_evsel__config_callchain(pgfault_min, &trace->opts, &callchain_param);
			use_identifier = true;
		}

		if (use_identifier) {
			/*
			 * Now we have evsels with different sample_ids, use
			 * PERF_SAMPLE_IDENTIFIER to map from sample to evsel
			 * from a fixed position in each ring buffer record.
			 *
			 * As of this the changeset introducing this comment, this
			 * isn't strictly needed, as the fields that can come before
			 * PERF_SAMPLE_ID are all used, but we'll probably disable
			 * some of those for things like copying the payload of
			 * pointer syscall arguments, and for vfs_getname we don't
			 * need PERF_SAMPLE_ADDR and PERF_SAMPLE_IP, so do this
			 * here as a warning we need to use PERF_SAMPLE_IDENTIFIER.
			 */
			perf_evlist__set_sample_bit(evlist, IDENTIFIER);
			perf_evlist__reset_sample_bit(evlist, ID);
		}
	}

	signal(SIGCHLD, sig_handler);
	signal(SIGINT, sig_handler);

	if (forks) {
		err = perf_evlist__prepare_workload(evlist, &trace->opts.target,
						    argv, false, NULL);
		if (err < 0) {
			fprintf(trace->output, "Couldn't run the workload!\n");
			goto out_delete_evlist;
		}
	}

	err = perf_evlist__open(evlist);
	if (err < 0)
		goto out_error_open;

	err = bpf__apply_obj_config();
	if (err) {
		char errbuf[BUFSIZ];

		bpf__strerror_apply_obj_config(err, errbuf, sizeof(errbuf));
		pr_err("ERROR: Apply config to BPF failed: %s\n",
		       errbuf);
		goto out_error_open;
	}

	/*
	 * Better not use !target__has_task() here because we need to cover the
	 * case where no threads were specified in the command line, but a
	 * workload was, and in that case we will fill in the thread_map when
	 * we fork the workload in perf_evlist__prepare_workload.
	 */
	if (trace->filter_pids.nr > 0)
		err = perf_evlist__set_filter_pids(evlist, trace->filter_pids.nr, trace->filter_pids.entries);
	else if (thread_map__pid(evlist->threads, 0) == -1)
		err = perf_evlist__set_filter_pid(evlist, getpid());

	if (err < 0)
		goto out_error_mem;

	if (trace->ev_qualifier_ids.nr > 0) {
		err = trace__set_ev_qualifier_filter(trace);
		if (err < 0)
			goto out_errno;

		pr_debug("event qualifier tracepoint filter: %s\n",
			 trace->syscalls.events.sys_exit->filter);
	}

	err = perf_evlist__apply_filters(evlist, &evsel);
	if (err < 0)
		goto out_error_apply_filters;

	err = perf_evlist__mmap(evlist, trace->opts.mmap_pages, false);
	if (err < 0)
		goto out_error_mmap;

	/* with --delay, enabling is postponed until after the workload starts */
	if (!target__none(&trace->opts.target) && !trace->opts.initial_delay)
		perf_evlist__enable(evlist);

	if (forks)
		perf_evlist__start_workload(evlist);

	if (trace->opts.initial_delay) {
		usleep(trace->opts.initial_delay * 1000);
		perf_evlist__enable(evlist);
	}

	trace->multiple_threads = thread_map__pid(evlist->threads, 0) == -1 ||
				  evlist->threads->nr > 1 ||
				  perf_evlist__first(evlist)->attr.inherit;
again:
	before = trace->nr_events;

	/* drain every per-cpu ring buffer, parsing and dispatching each record */
	for (i = 0; i < evlist->nr_mmaps; i++) {
		union perf_event *event;

		while ((event = perf_evlist__mmap_read(evlist, i)) != NULL) {
			struct perf_sample sample;

			++trace->nr_events;

			err = perf_evlist__parse_sample(evlist, event, &sample);
			if (err) {
				fprintf(trace->output, "Can't parse sample, err = %d, skipping...\n", err);
				goto next_event;
			}

			trace__handle_event(trace, event, &sample);
next_event:
			perf_evlist__mmap_consume(evlist, i);

			if (interrupted)
				goto out_disable;

			/* on the first 'done' signal, disable and drain what's left */
			if (done && !draining) {
				perf_evlist__disable(evlist);
				draining = true;
			}
		}
	}

	if (trace->nr_events == before) {
		/* nothing new: poll, but don't block forever once 'done' is set */
		int timeout = done ? 100 : -1;

		if (!draining && perf_evlist__poll(evlist, timeout) > 0) {
			if (perf_evlist__filter_pollfd(evlist, POLLERR | POLLHUP) == 0)
				draining = true;

			goto again;
		}
	} else {
		goto again;
	}

out_disable:
	thread__zput(trace->current);

	perf_evlist__disable(evlist);

	if (!err) {
		if (trace->summary)
			trace__fprintf_thread_summary(trace, trace->output);

		if (trace->show_tool_stats) {
			fprintf(trace->output, "Stats:\n "
					       " vfs_getname : %" PRIu64 "\n"
					       " proc_getname: %" PRIu64 "\n",
				trace->stats.vfs_getname,
				trace->stats.proc_getname);
		}
	}

out_delete_evlist:
	perf_evlist__delete(evlist);
	trace->evlist = NULL;
	trace->live = false;
	return err;
	/*
	 * Out-of-line error handling: this block sits after the return and is
	 * reachable only via the goto labels above; the inner scope exists
	 * just to give those labels a shared errbuf.
	 */
{
	char errbuf[BUFSIZ];

out_error_sched_stat_runtime:
	tracing_path__strerror_open_tp(errno, errbuf, sizeof(errbuf), "sched", "sched_stat_runtime");
	goto out_error;

out_error_raw_syscalls:
	tracing_path__strerror_open_tp(errno, errbuf, sizeof(errbuf), "raw_syscalls", "sys_(enter|exit)");
	goto out_error;

out_error_mmap:
	perf_evlist__strerror_mmap(evlist, errno, errbuf, sizeof(errbuf));
	goto out_error;

out_error_open:
	perf_evlist__strerror_open(evlist, errno, errbuf, sizeof(errbuf));

out_error:
	fprintf(trace->output, "%s\n", errbuf);
	goto out_delete_evlist;

out_error_apply_filters:
	fprintf(trace->output,
		"Failed to set filter \"%s\" on event %s with %d (%s)\n",
		evsel->filter, perf_evsel__name(evsel), errno,
		str_error_r(errno, errbuf, sizeof(errbuf)));
	goto out_delete_evlist;
}
out_error_mem:
	fprintf(trace->output, "Not enough memory to run!\n");
	goto out_delete_evlist;

out_errno:
	fprintf(trace->output, "errno=%d,%s\n", errno, strerror(errno));
	goto out_delete_evlist;
}

/*
 * Replay mode (-i): process a previously recorded perf.data file through the
 * same sys_enter/sys_exit/pagefault handlers used for live tracing.
 */
static int trace__replay(struct trace *trace)
{
	const struct perf_evsel_str_handler handlers[] = {
		{ "probe:vfs_getname", trace__vfs_getname, },
	};
	struct perf_data_file file = {
		.path = input_name,
		.mode = PERF_DATA_MODE_READ,
		.force = trace->force,
	};
	struct perf_session *session;
	struct perf_evsel *evsel;
	int err = -1;

	trace->tool.sample = trace__process_sample;
	trace->tool.mmap = perf_event__process_mmap;
	trace->tool.mmap2 = perf_event__process_mmap2;
	trace->tool.comm = perf_event__process_comm;
	trace->tool.exit = perf_event__process_exit;
	trace->tool.fork = perf_event__process_fork;
	trace->tool.attr = perf_event__process_attr;
	trace->tool.tracing_data = perf_event__process_tracing_data;
	trace->tool.build_id = perf_event__process_build_id;
	trace->tool.namespaces = perf_event__process_namespaces;

	trace->tool.ordered_events = true;
	trace->tool.ordering_requires_timestamps = true;

	/* add tid to output */
	trace->multiple_threads = true;

	session = perf_session__new(&file, false, &trace->tool);
	if (session == NULL)
		return -1;

	if (trace->opts.target.pid)
		symbol_conf.pid_list_str = strdup(trace->opts.target.pid);

	if (trace->opts.target.tid)
		symbol_conf.tid_list_str = strdup(trace->opts.target.tid);

	if (symbol__init(&session->header.env) < 0)
		goto out;

	trace->host = &session->machines.host;

	err = perf_session__set_tracepoints_handlers(session, handlers);
	if (err)
		goto out;

	evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
						     "raw_syscalls:sys_enter");
	/* older kernels have syscalls tp versus raw_syscalls */
	if (evsel == NULL)
		evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
							     "syscalls:sys_enter");

	if (evsel &&
	    (perf_evsel__init_syscall_tp(evsel, trace__sys_enter) < 0 ||
	     perf_evsel__init_sc_tp_ptr_field(evsel, args))) {
		pr_err("Error during initialize raw_syscalls:sys_enter event\n");
		goto out;
	}

	evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
						     "raw_syscalls:sys_exit");
	if (evsel == NULL)
		evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
							     "syscalls:sys_exit");
	if (evsel &&
	    (perf_evsel__init_syscall_tp(evsel, trace__sys_exit) < 0 ||
	     perf_evsel__init_sc_tp_uint_field(evsel, ret))) {
		pr_err("Error during initialize raw_syscalls:sys_exit event\n");
		goto out;
	}

	/* route any recorded software pagefault events to the pagefault handler */
	evlist__for_each_entry(session->evlist, evsel) {
		if (evsel->attr.type == PERF_TYPE_SOFTWARE &&
		    (evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ ||
		     evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MIN ||
		     evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS))
			evsel->handler = trace__pgfault;
	}

	setup_pager();

	err = perf_session__process_events(session);
	if (err)
		pr_err("Failed to process events, error %d", err);

	else if (trace->summary)
		trace__fprintf_thread_summary(trace, trace->output);

out:
	perf_session__delete(session);

	return err;
}

/* Print the banner preceding the per-thread summary; returns chars printed. */
static size_t trace__fprintf_threads_header(FILE *fp)
{
	size_t printed;

	printed = fprintf(fp, "\n Summary of events:\n\n");

	return printed;
}

/*
 * Resort the per-thread intlist of syscall stats by total msecs, descending.
 * The body below fills one sorted entry from an intlist node.
 */
DEFINE_RESORT_RB(syscall_stats, a->msecs > b->msecs,
	struct stats *stats;
	double msecs;
	int syscall;
)
{
	struct int_node *source = rb_entry(nd, struct int_node, rb_node);
	struct stats *stats = source->priv;

	entry->syscall = source->i;
	entry->stats = stats;
	entry->msecs = stats ?
(u64)stats->n * (avg_stats(stats) / NSEC_PER_MSEC) : 0;
}

/*
 * Print a per-thread table of syscall statistics (calls, total/min/avg/max
 * msec, stddev), sorted by total time.  Returns the number of chars printed.
 */
static size_t thread__dump_stats(struct thread_trace *ttrace,
				 struct trace *trace, FILE *fp)
{
	size_t printed = 0;
	struct syscall *sc;
	struct rb_node *nd;
	DECLARE_RESORT_RB_INTLIST(syscall_stats, ttrace->syscall_stats);

	if (syscall_stats == NULL)
		return 0;

	printed += fprintf(fp, "\n");

	printed += fprintf(fp, " syscall calls total min avg max stddev\n");
	printed += fprintf(fp, " (msec) (msec) (msec) (msec) (%%)\n");
	printed += fprintf(fp, " --------------- -------- --------- --------- --------- ------\n");

	resort_rb__for_each_entry(nd, syscall_stats) {
		struct stats *stats = syscall_stats_entry->stats;
		if (stats) {
			/* stats are kept in nanoseconds; convert for display */
			double min = (double)(stats->min) / NSEC_PER_MSEC;
			double max = (double)(stats->max) / NSEC_PER_MSEC;
			double avg = avg_stats(stats);
			double pct;
			u64 n = (u64) stats->n;

			pct = avg ?
100.0 * stddev_stats(stats)/avg : 0.0;
			avg /= NSEC_PER_MSEC;

			sc = &trace->syscalls.table[syscall_stats_entry->syscall];
			printed += fprintf(fp, " %-15s", sc->name);
			printed += fprintf(fp, " %8" PRIu64 " %9.3f %9.3f %9.3f",
					   n, syscall_stats_entry->msecs, min, avg);
			printed += fprintf(fp, " %9.3f %9.2f%%\n", max, pct);
		}
	}

	resort_rb__delete(syscall_stats);
	printed += fprintf(fp, "\n\n");

	return printed;
}

/*
 * Print one thread's summary line (comm, tid, event count, share of all
 * events, fault counts, runtime) followed by its syscall stats table.
 */
static size_t trace__fprintf_thread(FILE *fp, struct thread *thread, struct trace *trace)
{
	size_t printed = 0;
	struct thread_trace *ttrace = thread__priv(thread);
	double ratio;

	if (ttrace == NULL)
		return 0;

	/* this thread's share of all events seen, in percent */
	ratio = (double)ttrace->nr_events / trace->nr_events * 100.0;

	printed += fprintf(fp, " %s (%d), ", thread__comm_str(thread), thread->tid);
	printed += fprintf(fp, "%lu events, ", ttrace->nr_events);
	printed += fprintf(fp, "%.1f%%", ratio);
	if (ttrace->pfmaj)
		printed += fprintf(fp, ", %lu majfaults", ttrace->pfmaj);
	if (ttrace->pfmin)
		printed += fprintf(fp, ", %lu minfaults", ttrace->pfmin);
	if (trace->sched)
		printed += fprintf(fp, ", %.3f msec\n", ttrace->runtime_ms);
	else if (fputc('\n', fp) != EOF)
		++printed;

	printed += thread__dump_stats(ttrace, trace, fp);

	return printed;
}

/* Event count for a thread, tolerating threads we never saw an event for. */
static unsigned long thread__nr_events(struct thread_trace *ttrace)
{
	return ttrace ?
ttrace->nr_events : 0;
}

/* Resort the host machine's threads by event count, busiest first. */
DEFINE_RESORT_RB(threads, (thread__nr_events(a->thread->priv) < thread__nr_events(b->thread->priv)),
	struct thread *thread;
)
{
	entry->thread = rb_entry(nd, struct thread, rb_node);
}

/*
 * Print the whole end-of-run summary: header plus one section per thread,
 * ordered by number of events.  Returns the number of chars printed.
 */
static size_t trace__fprintf_thread_summary(struct trace *trace, FILE *fp)
{
	DECLARE_RESORT_RB_MACHINE_THREADS(threads, trace->host);
	size_t printed = trace__fprintf_threads_header(fp);
	struct rb_node *nd;

	if (threads == NULL) {
		fprintf(fp, "%s", "Error sorting output by nr_events!\n");
		return 0;
	}

	resort_rb__for_each_entry(nd, threads)
		printed += trace__fprintf_thread(fp, threads_entry->thread, trace);

	resort_rb__delete(threads);

	return printed;
}

/* --duration option parser: store the threshold (in ms) as a double. */
static int trace__set_duration(const struct option *opt, const char *str,
			       int unset __maybe_unused)
{
	struct trace *trace = opt->value;

	trace->duration_filter = atof(str);
	return 0;
}

/*
 * --filter-pids option parser: build filter_pids.entries from a CSV list,
 * always prepending our own pid so perf doesn't trace itself.
 */
static int trace__set_filter_pids(const struct option *opt, const char *str,
				  int unset __maybe_unused)
{
	int ret = -1;
	size_t i;
	struct trace *trace = opt->value;
	/*
	 * FIXME: introduce a intarray class, plain parse csv and create a
	 * { int nr, int entries[] } struct...
	 */
	struct intlist *list = intlist__new(str);

	if (list == NULL)
		return -1;

	/* +1 slot for our own pid, stored at index 0 below */
	i = trace->filter_pids.nr = intlist__nr_entries(list) + 1;
	trace->filter_pids.entries = calloc(i, sizeof(pid_t));

	if (trace->filter_pids.entries == NULL)
		goto out;

	trace->filter_pids.entries[0] = getpid();

	for (i = 1; i < trace->filter_pids.nr; ++i)
		trace->filter_pids.entries[i] = intlist__entry(list, i - 1)->i;

	intlist__delete(list);
	ret = 0;
out:
	return ret;
}

/*
 * Open the -o output file for writing, first renaming any non-empty existing
 * file to "<name>.old".  Returns 0 or -errno on fopen failure.
 */
static int trace__open_output(struct trace *trace, const char *filename)
{
	struct stat st;

	if (!stat(filename, &st) && st.st_size) {
		char oldname[PATH_MAX];

		scnprintf(oldname, sizeof(oldname), "%s.old", filename);
		unlink(oldname);
		rename(filename, oldname);
	}

	trace->output = fopen(filename, "w");

	return trace->output == NULL ? -errno : 0;
}

/* -F/--pf option parser: "all", "maj" or "min" -> TRACE_PF* bits (OR'ed in). */
static int parse_pagefaults(const struct option *opt, const char *str,
			    int unset __maybe_unused)
{
	int *trace_pgfaults = opt->value;

	if (strcmp(str, "all") == 0)
		*trace_pgfaults |= TRACE_PFMAJ | TRACE_PFMIN;
	else if (strcmp(str, "maj") == 0)
		*trace_pgfaults |= TRACE_PFMAJ;
	else if (strcmp(str, "min") == 0)
		*trace_pgfaults |= TRACE_PFMIN;
	else
		return -1;

	return 0;
}

/* Set the same sample handler on every evsel in the list. */
static void evlist__set_evsel_handler(struct perf_evlist *evlist, void *handler)
{
	struct perf_evsel *evsel;

	evlist__for_each_entry(evlist, evsel)
		evsel->handler = handler;
}

/*
 * XXX: Hackish, just splitting the combined -e+--event (syscalls
 * (raw_syscalls:{sys_{enter,exit}} + events (tracepoints, HW, SW, etc) to use
 * existing facilities unchanged (trace->ev_qualifier + parse_options()).
2745 * 2746 * It'd be better to introduce a parse_options() variant that would return a 2747 * list with the terms it didn't match to an event... 2748 */ 2749 static int trace__parse_events_option(const struct option *opt, const char *str, 2750 int unset __maybe_unused) 2751 { 2752 struct trace *trace = (struct trace *)opt->value; 2753 const char *s = str; 2754 char *sep = NULL, *lists[2] = { NULL, NULL, }; 2755 int len = strlen(str), err = -1, list; 2756 char *strace_groups_dir = system_path(STRACE_GROUPS_DIR); 2757 char group_name[PATH_MAX]; 2758 2759 if (strace_groups_dir == NULL) 2760 return -1; 2761 2762 if (*s == '!') { 2763 ++s; 2764 trace->not_ev_qualifier = true; 2765 } 2766 2767 while (1) { 2768 if ((sep = strchr(s, ',')) != NULL) 2769 *sep = '\0'; 2770 2771 list = 0; 2772 if (syscalltbl__id(trace->sctbl, s) >= 0) { 2773 list = 1; 2774 } else { 2775 path__join(group_name, sizeof(group_name), strace_groups_dir, s); 2776 if (access(group_name, R_OK) == 0) 2777 list = 1; 2778 } 2779 2780 if (lists[list]) { 2781 sprintf(lists[list] + strlen(lists[list]), ",%s", s); 2782 } else { 2783 lists[list] = malloc(len); 2784 if (lists[list] == NULL) 2785 goto out; 2786 strcpy(lists[list], s); 2787 } 2788 2789 if (!sep) 2790 break; 2791 2792 *sep = ','; 2793 s = sep + 1; 2794 } 2795 2796 if (lists[1] != NULL) { 2797 struct strlist_config slist_config = { 2798 .dirname = strace_groups_dir, 2799 }; 2800 2801 trace->ev_qualifier = strlist__new(lists[1], &slist_config); 2802 if (trace->ev_qualifier == NULL) { 2803 fputs("Not enough memory to parse event qualifier", trace->output); 2804 goto out; 2805 } 2806 2807 if (trace__validate_ev_qualifier(trace)) 2808 goto out; 2809 } 2810 2811 err = 0; 2812 2813 if (lists[0]) { 2814 struct option o = OPT_CALLBACK('e', "event", &trace->evlist, "event", 2815 "event selector. 
use 'perf list' to list available events", 2816 parse_events_option); 2817 err = parse_events_option(&o, lists[0], 0); 2818 } 2819 out: 2820 if (sep) 2821 *sep = ','; 2822 2823 return err; 2824 } 2825 2826 int cmd_trace(int argc, const char **argv) 2827 { 2828 const char *trace_usage[] = { 2829 "perf trace [<options>] [<command>]", 2830 "perf trace [<options>] -- <command> [<options>]", 2831 "perf trace record [<options>] [<command>]", 2832 "perf trace record [<options>] -- <command> [<options>]", 2833 NULL 2834 }; 2835 struct trace trace = { 2836 .syscalls = { 2837 . max = -1, 2838 }, 2839 .opts = { 2840 .target = { 2841 .uid = UINT_MAX, 2842 .uses_mmap = true, 2843 }, 2844 .user_freq = UINT_MAX, 2845 .user_interval = ULLONG_MAX, 2846 .no_buffering = true, 2847 .mmap_pages = UINT_MAX, 2848 .proc_map_timeout = 500, 2849 }, 2850 .output = stderr, 2851 .show_comm = true, 2852 .trace_syscalls = true, 2853 .kernel_syscallchains = false, 2854 .max_stack = UINT_MAX, 2855 }; 2856 const char *output_name = NULL; 2857 const struct option trace_options[] = { 2858 OPT_CALLBACK('e', "event", &trace, "event", 2859 "event/syscall selector. 
use 'perf list' to list available events", 2860 trace__parse_events_option), 2861 OPT_BOOLEAN(0, "comm", &trace.show_comm, 2862 "show the thread COMM next to its id"), 2863 OPT_BOOLEAN(0, "tool_stats", &trace.show_tool_stats, "show tool stats"), 2864 OPT_CALLBACK(0, "expr", &trace, "expr", "list of syscalls/events to trace", 2865 trace__parse_events_option), 2866 OPT_STRING('o', "output", &output_name, "file", "output file name"), 2867 OPT_STRING('i', "input", &input_name, "file", "Analyze events in file"), 2868 OPT_STRING('p', "pid", &trace.opts.target.pid, "pid", 2869 "trace events on existing process id"), 2870 OPT_STRING('t', "tid", &trace.opts.target.tid, "tid", 2871 "trace events on existing thread id"), 2872 OPT_CALLBACK(0, "filter-pids", &trace, "CSV list of pids", 2873 "pids to filter (by the kernel)", trace__set_filter_pids), 2874 OPT_BOOLEAN('a', "all-cpus", &trace.opts.target.system_wide, 2875 "system-wide collection from all CPUs"), 2876 OPT_STRING('C', "cpu", &trace.opts.target.cpu_list, "cpu", 2877 "list of cpus to monitor"), 2878 OPT_BOOLEAN(0, "no-inherit", &trace.opts.no_inherit, 2879 "child tasks do not inherit counters"), 2880 OPT_CALLBACK('m', "mmap-pages", &trace.opts.mmap_pages, "pages", 2881 "number of mmap data pages", 2882 perf_evlist__parse_mmap_pages), 2883 OPT_STRING('u', "uid", &trace.opts.target.uid_str, "user", 2884 "user to profile"), 2885 OPT_CALLBACK(0, "duration", &trace, "float", 2886 "show only events with duration > N.M ms", 2887 trace__set_duration), 2888 OPT_BOOLEAN(0, "sched", &trace.sched, "show blocking scheduler events"), 2889 OPT_INCR('v', "verbose", &verbose, "be more verbose"), 2890 OPT_BOOLEAN('T', "time", &trace.full_time, 2891 "Show full timestamp, not time relative to first start"), 2892 OPT_BOOLEAN('s', "summary", &trace.summary_only, 2893 "Show only syscall summary with statistics"), 2894 OPT_BOOLEAN('S', "with-summary", &trace.summary, 2895 "Show all syscalls and summary with statistics"), 2896 
	OPT_CALLBACK_DEFAULT('F', "pf", &trace.trace_pgfaults, "all|maj|min",
			     "Trace pagefaults", parse_pagefaults, "maj"),
	OPT_BOOLEAN(0, "syscalls", &trace.trace_syscalls, "Trace syscalls"),
	OPT_BOOLEAN('f', "force", &trace.force, "don't complain, do it"),
	OPT_CALLBACK(0, "call-graph", &trace.opts,
		     "record_mode[,record_size]", record_callchain_help,
		     &record_parse_callchain_opt),
	OPT_BOOLEAN(0, "kernel-syscall-graph", &trace.kernel_syscallchains,
		    "Show the kernel callchains on the syscall exit path"),
	OPT_UINTEGER(0, "min-stack", &trace.min_stack,
		     "Set the minimum stack depth when parsing the callchain, "
		     "anything below the specified depth will be ignored."),
	OPT_UINTEGER(0, "max-stack", &trace.max_stack,
		     "Set the maximum stack depth when parsing the callchain, "
		     "anything beyond the specified depth will be ignored. "
		     "Default: kernel.perf_event_max_stack or " __stringify(PERF_MAX_STACK_DEPTH)),
	OPT_UINTEGER(0, "proc-map-timeout", &trace.opts.proc_map_timeout,
		     "per thread proc mmap processing timeout in ms"),
	OPT_UINTEGER('D', "delay", &trace.opts.initial_delay,
		     "ms to wait before starting measurement after program "
		     "start"),
	OPT_END()
	};
	bool __maybe_unused max_stack_user_set = true;
	bool mmap_pages_user_set = true;
	const char * const trace_subcommands[] = { "record", NULL };
	int err;
	char bf[BUFSIZ];

	signal(SIGSEGV, sighandler_dump_stack);
	signal(SIGFPE, sighandler_dump_stack);

	trace.evlist = perf_evlist__new();
	trace.sctbl = syscalltbl__new();

	if (trace.evlist == NULL || trace.sctbl == NULL) {
		pr_err("Not enough memory to run!\n");
		err = -ENOMEM;
		goto out;
	}

	argc = parse_options_subcommand(argc, argv, trace_options, trace_subcommands,
					trace_usage, PARSE_OPT_STOP_AT_NON_OPTION);

	err = bpf__setup_stdout(trace.evlist);
	if (err) {
		bpf__strerror_setup_stdout(trace.evlist, err, bf, sizeof(bf));
		pr_err("ERROR: Setup BPF stdout failed: %s\n", bf);
		goto out;
	}

	err = -1;

	if (trace.trace_pgfaults) {
		/* pagefault samples carry the faulting address and timestamp */
		trace.opts.sample_address = true;
		trace.opts.sample_time = true;
	}

	/* UINT_MAX sentinels mean "the user didn't set it on the command line" */
	if (trace.opts.mmap_pages == UINT_MAX)
		mmap_pages_user_set = false;

	if (trace.max_stack == UINT_MAX) {
		trace.max_stack = input_name ? PERF_MAX_STACK_DEPTH : sysctl_perf_event_max_stack;
		max_stack_user_set = false;
	}

#ifdef HAVE_DWARF_UNWIND_SUPPORT
	/* asking for stack depths implies callchains; default to DWARF unwinding */
	if ((trace.min_stack || max_stack_user_set) && !callchain_param.enabled && trace.trace_syscalls)
		record_opts__parse_callchain(&trace.opts, &callchain_param, "dwarf", false);
#endif

	if (callchain_param.enabled) {
		if (!mmap_pages_user_set && geteuid() == 0)
			trace.opts.mmap_pages = perf_event_mlock_kb_in_pages() * 4;

		symbol_conf.use_callchain = true;
	}

	if (trace.evlist->nr_entries > 0)
		evlist__set_evsel_handler(trace.evlist, trace__event_handler);

	if ((argc >= 1) && (strcmp(argv[0], "record") == 0))
		return trace__record(&trace, argc-1, &argv[1]);

	/* summary_only implies summary option, but don't overwrite summary if set */
	if (trace.summary_only)
		trace.summary = trace.summary_only;

	if (!trace.trace_syscalls && !trace.trace_pgfaults &&
	    trace.evlist->nr_entries == 0 /* Was --events used? */) {
		pr_err("Please specify something to trace.\n");
		return -1;
	}

	if (!trace.trace_syscalls && trace.ev_qualifier) {
		pr_err("The -e option can't be used with --no-syscalls.\n");
		goto out;
	}

	if (output_name != NULL) {
		err = trace__open_output(&trace, output_name);
		if (err < 0) {
			perror("failed to create output file");
			goto out;
		}
	}

	trace.open_id = syscalltbl__id(trace.sctbl, "open");

	err = target__validate(&trace.opts.target);
	if (err) {
		target__strerror(&trace.opts.target, err, bf, sizeof(bf));
		fprintf(trace.output, "%s", bf);
		goto out_close;
	}

	err = target__parse_uid(&trace.opts.target);
	if (err) {
		target__strerror(&trace.opts.target, err, bf, sizeof(bf));
		fprintf(trace.output, "%s", bf);
		goto out_close;
	}

	/* no workload and no target: default to system-wide tracing */
	if (!argc && target__none(&trace.opts.target))
		trace.opts.target.system_wide = true;

	if (input_name)
		err = trace__replay(&trace);
	else
		err = trace__run(&trace, argc, argv);

out_close:
	if (output_name != NULL)
		fclose(trace.output);
out:
	return err;
}