1 /* 2 * builtin-trace.c 3 * 4 * Builtin 'trace' command: 5 * 6 * Display a continuously updated trace of any workload, CPU, specific PID, 7 * system wide, etc. Default format is loosely strace like, but any other 8 * event may be specified using --event. 9 * 10 * Copyright (C) 2012, 2013, 2014, 2015 Red Hat Inc, Arnaldo Carvalho de Melo <acme@redhat.com> 11 * 12 * Initially based on the 'trace' prototype by Thomas Gleixner: 13 * 14 * http://lwn.net/Articles/415728/ ("Announcing a new utility: 'trace'") 15 * 16 * Released under the GPL v2. (and only v2, not any later version) 17 */ 18 19 #include <traceevent/event-parse.h> 20 #include <api/fs/tracing_path.h> 21 #include "builtin.h" 22 #include "util/color.h" 23 #include "util/debug.h" 24 #include "util/evlist.h" 25 #include <subcmd/exec-cmd.h> 26 #include "util/machine.h" 27 #include "util/session.h" 28 #include "util/thread.h" 29 #include <subcmd/parse-options.h> 30 #include "util/strlist.h" 31 #include "util/intlist.h" 32 #include "util/thread_map.h" 33 #include "util/stat.h" 34 #include "trace-event.h" 35 #include "util/parse-events.h" 36 #include "util/bpf-loader.h" 37 #include "callchain.h" 38 #include "syscalltbl.h" 39 #include "rb_resort.h" 40 41 #include <libaudit.h> /* FIXME: Still needed for audit_errno_to_name */ 42 #include <stdlib.h> 43 #include <linux/err.h> 44 #include <linux/filter.h> 45 #include <linux/audit.h> 46 #include <linux/random.h> 47 #include <linux/stringify.h> 48 #include <linux/time64.h> 49 50 #ifndef O_CLOEXEC 51 # define O_CLOEXEC 02000000 52 #endif 53 54 struct trace { 55 struct perf_tool tool; 56 struct syscalltbl *sctbl; 57 struct { 58 int max; 59 struct syscall *table; 60 struct { 61 struct perf_evsel *sys_enter, 62 *sys_exit; 63 } events; 64 } syscalls; 65 struct record_opts opts; 66 struct perf_evlist *evlist; 67 struct machine *host; 68 struct thread *current; 69 u64 base_time; 70 FILE *output; 71 unsigned long nr_events; 72 struct strlist *ev_qualifier; 73 struct { 74 size_t nr; 75 int *entries; 76 } ev_qualifier_ids; 77 struct intlist *tid_list; 78 struct intlist *pid_list; 79 struct { 80 size_t nr; 81 pid_t *entries; 82 } filter_pids; 83 double duration_filter; 84 double runtime_ms; 85 struct { 86 u64 vfs_getname, 87 proc_getname; 88 } stats; 89 unsigned int max_stack; 90 unsigned int min_stack; 91 bool not_ev_qualifier; 92 bool live; 93 bool full_time; 94 bool sched; 95 bool multiple_threads; 96 bool summary; 97 bool summary_only; 98 bool show_comm; 99 bool show_tool_stats; 100 bool trace_syscalls; 101 bool kernel_syscallchains; 102 bool force; 103 bool vfs_getname; 104 int trace_pgfaults; 105 int open_id; 106 }; 107 108 struct tp_field { 109 int offset; 110 union { 111 u64 (*integer)(struct tp_field *field, struct perf_sample *sample); 112 void *(*pointer)(struct tp_field *field, struct perf_sample *sample); 113 }; 114 }; 115 116 #define TP_UINT_FIELD(bits) \ 117 static u64 tp_field__u##bits(struct tp_field *field, struct perf_sample *sample) \ 118 { \ 119 u##bits value; \ 120 memcpy(&value, sample->raw_data + field->offset, sizeof(value)); \ 121 return value; \ 122 } 123 124 TP_UINT_FIELD(8); 125 TP_UINT_FIELD(16); 126 TP_UINT_FIELD(32); 127 TP_UINT_FIELD(64); 128 129 #define TP_UINT_FIELD__SWAPPED(bits) \ 130 static u64 tp_field__swapped_u##bits(struct tp_field *field, struct perf_sample *sample) \ 131 { \ 132 u##bits value; \ 133 memcpy(&value, sample->raw_data + field->offset, sizeof(value)); \ 134 return bswap_##bits(value);\ 135 } 136 137 TP_UINT_FIELD__SWAPPED(16); 138 TP_UINT_FIELD__SWAPPED(32); 139 TP_UINT_FIELD__SWAPPED(64); 140 141 static int tp_field__init_uint(struct tp_field *field, 142 struct format_field *format_field, 143 bool needs_swap) 144 { 145 field->offset = format_field->offset; 146 147 switch (format_field->size) { 148 case 1: 149 field->integer = tp_field__u8; 150 break; 151 case 2: 152 field->integer = needs_swap ? tp_field__swapped_u16 : tp_field__u16; 153 break; 154 case 4: 155 field->integer = needs_swap ? tp_field__swapped_u32 : tp_field__u32; 156 break; 157 case 8: 158 field->integer = needs_swap ? tp_field__swapped_u64 : tp_field__u64; 159 break; 160 default: 161 return -1; 162 } 163 164 return 0; 165 } 166 167 static void *tp_field__ptr(struct tp_field *field, struct perf_sample *sample) 168 { 169 return sample->raw_data + field->offset; 170 } 171 172 static int tp_field__init_ptr(struct tp_field *field, struct format_field *format_field) 173 { 174 field->offset = format_field->offset; 175 field->pointer = tp_field__ptr; 176 return 0; 177 } 178 179 struct syscall_tp { 180 struct tp_field id; 181 union { 182 struct tp_field args, ret; 183 }; 184 }; 185 186 static int perf_evsel__init_tp_uint_field(struct perf_evsel *evsel, 187 struct tp_field *field, 188 const char *name) 189 { 190 struct format_field *format_field = perf_evsel__field(evsel, name); 191 192 if (format_field == NULL) 193 return -1; 194 195 return tp_field__init_uint(field, format_field, evsel->needs_swap); 196 } 197 198 #define perf_evsel__init_sc_tp_uint_field(evsel, name) \ 199 ({ struct syscall_tp *sc = evsel->priv;\ 200 perf_evsel__init_tp_uint_field(evsel, &sc->name, #name); }) 201 202 static int perf_evsel__init_tp_ptr_field(struct perf_evsel *evsel, 203 struct tp_field *field, 204 const char *name) 205 { 206 struct format_field *format_field = perf_evsel__field(evsel, name); 207 208 if (format_field == NULL) 209 return -1; 210 211 return tp_field__init_ptr(field, format_field); 212 } 213 214 #define perf_evsel__init_sc_tp_ptr_field(evsel, name) \ 215 ({ struct syscall_tp *sc = evsel->priv;\ 216 perf_evsel__init_tp_ptr_field(evsel, &sc->name, #name); }) 217 218 static void perf_evsel__delete_priv(struct perf_evsel *evsel) 219 { 220 zfree(&evsel->priv); 221 perf_evsel__delete(evsel); 222 } 223 224 static int perf_evsel__init_syscall_tp(struct perf_evsel *evsel, void *handler) 225 { 226 evsel->priv = malloc(sizeof(struct syscall_tp)); 227 if (evsel->priv != NULL) { 228 if (perf_evsel__init_sc_tp_uint_field(evsel, id)) 229 goto out_delete; 230 231 evsel->handler = handler; 232 return 0; 233 } 234 235 return -ENOMEM; 236 237 out_delete: 238 zfree(&evsel->priv); 239 return -ENOENT; 240 } 241 242 static struct perf_evsel *perf_evsel__syscall_newtp(const char *direction, void *handler) 243 { 244 struct perf_evsel *evsel = perf_evsel__newtp("raw_syscalls", direction); 245 246 /* older kernel (e.g., RHEL6) use syscalls:{enter,exit} */ 247 if (IS_ERR(evsel)) 248 evsel = perf_evsel__newtp("syscalls", direction); 249 250 if (IS_ERR(evsel)) 251 return NULL; 252 253 if (perf_evsel__init_syscall_tp(evsel, handler)) 254 goto out_delete; 255 256 return evsel; 257 258 out_delete: 259 perf_evsel__delete_priv(evsel); 260 return NULL; 261 } 262 263 #define perf_evsel__sc_tp_uint(evsel, name, sample) \ 264 ({ struct syscall_tp *fields = evsel->priv; \ 265 fields->name.integer(&fields->name, sample); }) 266 267 #define perf_evsel__sc_tp_ptr(evsel, name, sample) \ 268 ({ struct syscall_tp *fields = evsel->priv; \ 269 fields->name.pointer(&fields->name, sample); }) 270 271 struct syscall_arg { 272 unsigned long val; 273 struct thread *thread; 274 struct trace *trace; 275 void *parm; 276 u8 idx; 277 u8 mask; 278 }; 279 280 struct strarray { 281 int offset; 282 int nr_entries; 283 const char **entries; 284 }; 285 286 #define DEFINE_STRARRAY(array) struct strarray strarray__##array = { \ 287 .nr_entries = ARRAY_SIZE(array), \ 288 .entries = array, \ 289 } 290 291 #define DEFINE_STRARRAY_OFFSET(array, off) struct strarray strarray__##array = { \ 292 .offset = off, \ 293 .nr_entries = ARRAY_SIZE(array), \ 294 .entries = array, \ 295 } 296 297 static size_t __syscall_arg__scnprintf_strarray(char *bf, size_t size, 298 const char *intfmt, 299 struct syscall_arg *arg) 300 { 301 struct strarray *sa = arg->parm; 302 int idx = arg->val - sa->offset; 303 304 if (idx < 0 || idx >= sa->nr_entries) 305 return scnprintf(bf, size, intfmt, arg->val); 306 307 return scnprintf(bf, size, "%s", sa->entries[idx]); 308 } 309 310 static size_t syscall_arg__scnprintf_strarray(char *bf, size_t size, 311 struct syscall_arg *arg) 312 { 313 return __syscall_arg__scnprintf_strarray(bf, size, "%d", arg); 314 } 315 316 #define SCA_STRARRAY syscall_arg__scnprintf_strarray 317 318 #if defined(__i386__) || defined(__x86_64__) 319 /* 320 * FIXME: Make this available to all arches as soon as the ioctl beautifier 321 * gets rewritten to support all arches. 322 */ 323 static size_t syscall_arg__scnprintf_strhexarray(char *bf, size_t size, 324 struct syscall_arg *arg) 325 { 326 return __syscall_arg__scnprintf_strarray(bf, size, "%#x", arg); 327 } 328 329 #define SCA_STRHEXARRAY syscall_arg__scnprintf_strhexarray 330 #endif /* defined(__i386__) || defined(__x86_64__) */ 331 332 static size_t syscall_arg__scnprintf_fd(char *bf, size_t size, 333 struct syscall_arg *arg); 334 335 #define SCA_FD syscall_arg__scnprintf_fd 336 337 #ifndef AT_FDCWD 338 #define AT_FDCWD -100 339 #endif 340 341 static size_t syscall_arg__scnprintf_fd_at(char *bf, size_t size, 342 struct syscall_arg *arg) 343 { 344 int fd = arg->val; 345 346 if (fd == AT_FDCWD) 347 return scnprintf(bf, size, "CWD"); 348 349 return syscall_arg__scnprintf_fd(bf, size, arg); 350 } 351 352 #define SCA_FDAT syscall_arg__scnprintf_fd_at 353 354 static size_t syscall_arg__scnprintf_close_fd(char *bf, size_t size, 355 struct syscall_arg *arg); 356 357 #define SCA_CLOSE_FD syscall_arg__scnprintf_close_fd 358 359 static size_t syscall_arg__scnprintf_hex(char *bf, size_t size, 360 struct syscall_arg *arg) 361 { 362 return scnprintf(bf, size, "%#lx", arg->val); 363 } 364 365 #define SCA_HEX syscall_arg__scnprintf_hex 366 367 static size_t syscall_arg__scnprintf_int(char *bf, size_t size, 368 struct syscall_arg *arg) 369 { 370 return scnprintf(bf, size, "%d", arg->val); 371 } 372 373 #define SCA_INT syscall_arg__scnprintf_int 374 375 static const char *bpf_cmd[] = { 376 "MAP_CREATE", "MAP_LOOKUP_ELEM", "MAP_UPDATE_ELEM", "MAP_DELETE_ELEM", 377 "MAP_GET_NEXT_KEY", "PROG_LOAD", 378 }; 379 static DEFINE_STRARRAY(bpf_cmd); 380 381 static const char *epoll_ctl_ops[] = { "ADD", "DEL", "MOD", }; 382 static DEFINE_STRARRAY_OFFSET(epoll_ctl_ops, 1); 383 384 static const char *itimers[] = { "REAL", "VIRTUAL", "PROF", }; 385 static DEFINE_STRARRAY(itimers); 386 387 static const char *keyctl_options[] = { 388 "GET_KEYRING_ID", "JOIN_SESSION_KEYRING", "UPDATE", "REVOKE", "CHOWN", 389 "SETPERM", "DESCRIBE", "CLEAR", "LINK", "UNLINK", "SEARCH", "READ", 390 "INSTANTIATE", "NEGATE", "SET_REQKEY_KEYRING", "SET_TIMEOUT", 391 "ASSUME_AUTHORITY", "GET_SECURITY", "SESSION_TO_PARENT", "REJECT", 392 "INSTANTIATE_IOV", "INVALIDATE", "GET_PERSISTENT", 393 }; 394 static DEFINE_STRARRAY(keyctl_options); 395 396 static const char *whences[] = { "SET", "CUR", "END", 397 #ifdef SEEK_DATA 398 "DATA", 399 #endif 400 #ifdef SEEK_HOLE 401 "HOLE", 402 #endif 403 }; 404 static DEFINE_STRARRAY(whences); 405 406 static const char *fcntl_cmds[] = { 407 "DUPFD", "GETFD", "SETFD", "GETFL", "SETFL", "GETLK", "SETLK", 408 "SETLKW", "SETOWN", "GETOWN", "SETSIG", "GETSIG", "F_GETLK64", 409 "F_SETLK64", "F_SETLKW64", "F_SETOWN_EX", "F_GETOWN_EX", 410 "F_GETOWNER_UIDS", 411 }; 412 static DEFINE_STRARRAY(fcntl_cmds); 413 414 static const char *rlimit_resources[] = { 415 "CPU", "FSIZE", "DATA", "STACK", "CORE", "RSS", "NPROC", "NOFILE", 416 "MEMLOCK", "AS", "LOCKS", "SIGPENDING", "MSGQUEUE", "NICE", "RTPRIO", 417 "RTTIME", 418 }; 419 static DEFINE_STRARRAY(rlimit_resources); 420 421 static const char *sighow[] = { "BLOCK", "UNBLOCK", "SETMASK", }; 422 static DEFINE_STRARRAY(sighow); 423 424 static const char *clockid[] = { 425 "REALTIME", "MONOTONIC", "PROCESS_CPUTIME_ID", "THREAD_CPUTIME_ID", 426 "MONOTONIC_RAW", "REALTIME_COARSE", "MONOTONIC_COARSE", "BOOTTIME", 427 "REALTIME_ALARM", "BOOTTIME_ALARM", "SGI_CYCLE", "TAI" 428 }; 429 static DEFINE_STRARRAY(clockid); 430 431 static const char *socket_families[] = { 432 "UNSPEC", "LOCAL", "INET", "AX25", "IPX", "APPLETALK", "NETROM", 433 "BRIDGE", "ATMPVC", "X25", "INET6", "ROSE", "DECnet", "NETBEUI", 434 "SECURITY", "KEY", "NETLINK", "PACKET", "ASH", "ECONET", "ATMSVC", 435 "RDS", "SNA", "IRDA", "PPPOX", "WANPIPE", "LLC", "IB", "CAN", "TIPC", 436 "BLUETOOTH", "IUCV", "RXRPC", "ISDN", "PHONET", "IEEE802154", "CAIF", 437 "ALG", "NFC", "VSOCK", 438 }; 439 static DEFINE_STRARRAY(socket_families); 440 441 static size_t syscall_arg__scnprintf_access_mode(char *bf, size_t size, 442 struct syscall_arg *arg) 443 { 444 size_t printed = 0; 445 int mode = arg->val; 446 447 if (mode == F_OK) /* 0 */ 448 return scnprintf(bf, size, "F"); 449 #define P_MODE(n) \ 450 if (mode & n##_OK) { \ 451 printed += scnprintf(bf + printed, size - printed, "%s", #n); \ 452 mode &= ~n##_OK; \ 453 } 454 455 P_MODE(R); 456 P_MODE(W); 457 P_MODE(X); 458 #undef P_MODE 459 460 if (mode) 461 printed += scnprintf(bf + printed, size - printed, "|%#x", mode); 462 463 return printed; 464 } 465 466 #define SCA_ACCMODE syscall_arg__scnprintf_access_mode 467 468 static size_t syscall_arg__scnprintf_filename(char *bf, size_t size, 469 struct syscall_arg *arg); 470 471 #define SCA_FILENAME syscall_arg__scnprintf_filename 472 473 static size_t syscall_arg__scnprintf_pipe_flags(char *bf, size_t size, 474 struct syscall_arg *arg) 475 { 476 int printed = 0, flags = arg->val; 477 478 #define P_FLAG(n) \ 479 if (flags & O_##n) { \ 480 printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \ 481 flags &= ~O_##n; \ 482 } 483 484 P_FLAG(CLOEXEC); 485 P_FLAG(NONBLOCK); 486 #undef P_FLAG 487 488 if (flags) 489 printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags); 490 491 return printed; 492 } 493 494 #define SCA_PIPE_FLAGS syscall_arg__scnprintf_pipe_flags 495 496 #if defined(__i386__) || defined(__x86_64__) 497 /* 498 * FIXME: Make this available to all arches. 499 */ 500 #define TCGETS 0x5401 501 502 static const char *tioctls[] = { 503 "TCGETS", "TCSETS", "TCSETSW", "TCSETSF", "TCGETA", "TCSETA", "TCSETAW", 504 "TCSETAF", "TCSBRK", "TCXONC", "TCFLSH", "TIOCEXCL", "TIOCNXCL", 505 "TIOCSCTTY", "TIOCGPGRP", "TIOCSPGRP", "TIOCOUTQ", "TIOCSTI", 506 "TIOCGWINSZ", "TIOCSWINSZ", "TIOCMGET", "TIOCMBIS", "TIOCMBIC", 507 "TIOCMSET", "TIOCGSOFTCAR", "TIOCSSOFTCAR", "FIONREAD", "TIOCLINUX", 508 "TIOCCONS", "TIOCGSERIAL", "TIOCSSERIAL", "TIOCPKT", "FIONBIO", 509 "TIOCNOTTY", "TIOCSETD", "TIOCGETD", "TCSBRKP", [0x27] = "TIOCSBRK", 510 "TIOCCBRK", "TIOCGSID", "TCGETS2", "TCSETS2", "TCSETSW2", "TCSETSF2", 511 "TIOCGRS485", "TIOCSRS485", "TIOCGPTN", "TIOCSPTLCK", 512 "TIOCGDEV||TCGETX", "TCSETX", "TCSETXF", "TCSETXW", "TIOCSIG", 513 "TIOCVHANGUP", "TIOCGPKT", "TIOCGPTLCK", "TIOCGEXCL", 514 [0x50] = "FIONCLEX", "FIOCLEX", "FIOASYNC", "TIOCSERCONFIG", 515 "TIOCSERGWILD", "TIOCSERSWILD", "TIOCGLCKTRMIOS", "TIOCSLCKTRMIOS", 516 "TIOCSERGSTRUCT", "TIOCSERGETLSR", "TIOCSERGETMULTI", "TIOCSERSETMULTI", 517 "TIOCMIWAIT", "TIOCGICOUNT", [0x60] = "FIOQSIZE", 518 }; 519 520 static DEFINE_STRARRAY_OFFSET(tioctls, 0x5401); 521 #endif /* defined(__i386__) || defined(__x86_64__) */ 522 523 #ifndef GRND_NONBLOCK 524 #define GRND_NONBLOCK 0x0001 525 #endif 526 #ifndef GRND_RANDOM 527 #define GRND_RANDOM 0x0002 528 #endif 529 530 static size_t syscall_arg__scnprintf_getrandom_flags(char *bf, size_t size, 531 struct syscall_arg *arg) 532 { 533 int printed = 0, flags = arg->val; 534 535 #define P_FLAG(n) \ 536 if (flags & GRND_##n) { \ 537 printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \ 538 flags &= ~GRND_##n; \ 539 } 540 541 P_FLAG(RANDOM); 542 P_FLAG(NONBLOCK); 543 #undef P_FLAG 544 545 if (flags) 546 printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags); 547 548 return printed; 549 } 550 551 #define SCA_GETRANDOM_FLAGS syscall_arg__scnprintf_getrandom_flags 552 553 #define STRARRAY(arg, name, array) \ 554 .arg_scnprintf = { [arg] = SCA_STRARRAY, }, \ 555 .arg_parm = { [arg] = &strarray__##array, } 556 557 #include "trace/beauty/eventfd.c" 558 #include "trace/beauty/flock.c" 559 #include "trace/beauty/futex_op.c" 560 #include "trace/beauty/mmap.c" 561 #include "trace/beauty/mode_t.c" 562 #include "trace/beauty/msg_flags.c" 563 #include "trace/beauty/open_flags.c" 564 #include "trace/beauty/perf_event_open.c" 565 #include "trace/beauty/pid.c" 566 #include "trace/beauty/sched_policy.c" 567 #include "trace/beauty/seccomp.c" 568 #include "trace/beauty/signum.c" 569 #include "trace/beauty/socket_type.c" 570 #include "trace/beauty/waitid_options.c" 571 572 static struct syscall_fmt { 573 const char *name; 574 const char *alias; 575 size_t (*arg_scnprintf[6])(char *bf, size_t size, struct syscall_arg *arg); 576 void *arg_parm[6]; 577 bool errmsg; 578 bool errpid; 579 bool timeout; 580 bool hexret; 581 } syscall_fmts[] = { 582 { .name = "access", .errmsg = true, 583 .arg_scnprintf = { [1] = SCA_ACCMODE, /* mode */ }, }, 584 { .name = "arch_prctl", .errmsg = true, .alias = "prctl", }, 585 { .name = "bpf", .errmsg = true, STRARRAY(0, cmd, bpf_cmd), }, 586 { .name = "brk", .hexret = true, 587 .arg_scnprintf = { [0] = SCA_HEX, /* brk */ }, }, 588 { .name = "chdir", .errmsg = true, }, 589 { .name = "chmod", .errmsg = true, }, 590 { .name = "chroot", .errmsg = true, }, 591 { .name = "clock_gettime", .errmsg = true, STRARRAY(0, clk_id, clockid), }, 592 { .name = "clone", .errpid = true, }, 593 { .name = "close", .errmsg = true, 594 .arg_scnprintf = { [0] = SCA_CLOSE_FD, /* fd */ }, }, 595 { .name = "connect", .errmsg = true, }, 596 { .name = "creat", .errmsg = true, }, 597 { .name = "dup", .errmsg = true, }, 598 { .name = "dup2", .errmsg = true, }, 599 { .name = "dup3", .errmsg = true, }, 600 { .name = "epoll_ctl", .errmsg = true, STRARRAY(1, op, epoll_ctl_ops), }, 601 { .name = "eventfd2", .errmsg = true, 602 .arg_scnprintf = { [1] = SCA_EFD_FLAGS, /* flags */ }, }, 603 { .name = "faccessat", .errmsg = true, }, 604 { .name = "fadvise64", .errmsg = true, }, 605 { .name = "fallocate", .errmsg = true, }, 606 { .name = "fchdir", .errmsg = true, }, 607 { .name = "fchmod", .errmsg = true, }, 608 { .name = "fchmodat", .errmsg = true, 609 .arg_scnprintf = { [0] = SCA_FDAT, /* fd */ }, }, 610 { .name = "fchown", .errmsg = true, }, 611 { .name = "fchownat", .errmsg = true, 612 .arg_scnprintf = { [0] = SCA_FDAT, /* fd */ }, }, 613 { .name = "fcntl", .errmsg = true, 614 .arg_scnprintf = { [1] = SCA_STRARRAY, /* cmd */ }, 615 .arg_parm = { [1] = &strarray__fcntl_cmds, /* cmd */ }, }, 616 { .name = "fdatasync", .errmsg = true, }, 617 { .name = "flock", .errmsg = true, 618 .arg_scnprintf = { [1] = SCA_FLOCK, /* cmd */ }, }, 619 { .name = "fsetxattr", .errmsg = true, }, 620 { .name = "fstat", .errmsg = true, .alias = "newfstat", }, 621 { .name = "fstatat", .errmsg = true, .alias = "newfstatat", }, 622 { .name = "fstatfs", .errmsg = true, }, 623 { .name = "fsync", .errmsg = true, }, 624 { .name = "ftruncate", .errmsg = true, }, 625 { .name = "futex", .errmsg = true, 626 .arg_scnprintf = { [1] = SCA_FUTEX_OP, /* op */ }, }, 627 { .name = "futimesat", .errmsg = true, 628 .arg_scnprintf = { [0] = SCA_FDAT, /* fd */ }, }, 629 { .name = "getdents", .errmsg = true, }, 630 { .name = "getdents64", .errmsg = true, }, 631 { .name = "getitimer", .errmsg = true, STRARRAY(0, which, itimers), }, 632 { .name = "getpid", .errpid = true, }, 633 { .name = "getpgid", .errpid = true, }, 634 { .name = "getppid", .errpid = true, }, 635 { .name = "getrandom", .errmsg = true, 636 .arg_scnprintf = { [2] = SCA_GETRANDOM_FLAGS, /* flags */ }, }, 637 { .name = "getrlimit", .errmsg = true, STRARRAY(0, resource, rlimit_resources), }, 638 { .name = "getxattr", .errmsg = true, }, 639 { .name = "inotify_add_watch", .errmsg = true, }, 640 { .name = "ioctl", .errmsg = true, 641 .arg_scnprintf = { 642 #if defined(__i386__) || defined(__x86_64__) 643 /* 644 * FIXME: Make this available to all arches. 645 */ 646 [1] = SCA_STRHEXARRAY, /* cmd */ 647 [2] = SCA_HEX, /* arg */ }, 648 .arg_parm = { [1] = &strarray__tioctls, /* cmd */ }, }, 649 #else 650 [2] = SCA_HEX, /* arg */ }, }, 651 #endif 652 { .name = "keyctl", .errmsg = true, STRARRAY(0, option, keyctl_options), }, 653 { .name = "kill", .errmsg = true, 654 .arg_scnprintf = { [1] = SCA_SIGNUM, /* sig */ }, }, 655 { .name = "lchown", .errmsg = true, }, 656 { .name = "lgetxattr", .errmsg = true, }, 657 { .name = "linkat", .errmsg = true, 658 .arg_scnprintf = { [0] = SCA_FDAT, /* fd */ }, }, 659 { .name = "listxattr", .errmsg = true, }, 660 { .name = "llistxattr", .errmsg = true, }, 661 { .name = "lremovexattr", .errmsg = true, }, 662 { .name = "lseek", .errmsg = true, 663 .arg_scnprintf = { [2] = SCA_STRARRAY, /* whence */ }, 664 .arg_parm = { [2] = &strarray__whences, /* whence */ }, }, 665 { .name = "lsetxattr", .errmsg = true, }, 666 { .name = "lstat", .errmsg = true, .alias = "newlstat", }, 667 { .name = "lsxattr", .errmsg = true, }, 668 { .name = "madvise", .errmsg = true, 669 .arg_scnprintf = { [0] = SCA_HEX, /* start */ 670 [2] = SCA_MADV_BHV, /* behavior */ }, }, 671 { .name = "mkdir", .errmsg = true, }, 672 { .name = "mkdirat", .errmsg = true, 673 .arg_scnprintf = { [0] = SCA_FDAT, /* fd */ }, }, 674 { .name = "mknod", .errmsg = true, }, 675 { .name = "mknodat", .errmsg = true, 676 .arg_scnprintf = { [0] = SCA_FDAT, /* fd */ }, }, 677 { .name = "mlock", .errmsg = true, 678 .arg_scnprintf = { [0] = SCA_HEX, /* addr */ }, }, 679 { .name = "mlockall", .errmsg = true, 680 .arg_scnprintf = { [0] = SCA_HEX, /* addr */ }, }, 681 { .name = "mmap", .hexret = true, 682 .arg_scnprintf = { [0] = SCA_HEX, /* addr */ 683 [2] = SCA_MMAP_PROT, /* prot */ 684 [3] = SCA_MMAP_FLAGS, /* flags */ }, }, 685 { .name = "mprotect", .errmsg = true, 686 .arg_scnprintf = { [0] = SCA_HEX, /* start */ 687 [2] = SCA_MMAP_PROT, /* prot */ }, }, 688 { .name = "mq_unlink", .errmsg = true, 689 .arg_scnprintf = { [0] = SCA_FILENAME, /* u_name */ }, }, 690 { .name = "mremap", .hexret = true, 691 .arg_scnprintf = { [0] = SCA_HEX, /* addr */ 692 [3] = SCA_MREMAP_FLAGS, /* flags */ 693 [4] = SCA_HEX, /* new_addr */ }, }, 694 { .name = "munlock", .errmsg = true, 695 .arg_scnprintf = { [0] = SCA_HEX, /* addr */ }, }, 696 { .name = "munmap", .errmsg = true, 697 .arg_scnprintf = { [0] = SCA_HEX, /* addr */ }, }, 698 { .name = "name_to_handle_at", .errmsg = true, 699 .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, }, 700 { .name = "newfstatat", .errmsg = true, 701 .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, }, 702 { .name = "open", .errmsg = true, 703 .arg_scnprintf = { [1] = SCA_OPEN_FLAGS, /* flags */ }, }, 704 { .name = "open_by_handle_at", .errmsg = true, 705 .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ 706 [2] = SCA_OPEN_FLAGS, /* flags */ }, }, 707 { .name = "openat", .errmsg = true, 708 .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ 709 [2] = SCA_OPEN_FLAGS, /* flags */ }, }, 710 { .name = "perf_event_open", .errmsg = true, 711 .arg_scnprintf = { [2] = SCA_INT, /* cpu */ 712 [3] = SCA_FD, /* group_fd */ 713 [4] = SCA_PERF_FLAGS, /* flags */ }, }, 714 { .name = "pipe2", .errmsg = true, 715 .arg_scnprintf = { [1] = SCA_PIPE_FLAGS, /* flags */ }, }, 716 { .name = "poll", .errmsg = true, .timeout = true, }, 717 { .name = "ppoll", .errmsg = true, .timeout = true, }, 718 { .name = "pread", .errmsg = true, .alias = "pread64", }, 719 { .name = "preadv", .errmsg = true, .alias = "pread", }, 720 { .name = "prlimit64", .errmsg = true, STRARRAY(1, resource, rlimit_resources), }, 721 { .name = "pwrite", .errmsg = true, .alias = "pwrite64", }, 722 { .name = "pwritev", .errmsg = true, }, 723 { .name = "read", .errmsg = true, }, 724 { .name = "readlink", .errmsg = true, }, 725 { .name = "readlinkat", .errmsg = true, 726 .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, }, 727 { .name = "readv", .errmsg = true, }, 728 { .name = "recvfrom", .errmsg = true, 729 .arg_scnprintf = { [3] = SCA_MSG_FLAGS, /* flags */ }, }, 730 { .name = "recvmmsg", .errmsg = true, 731 .arg_scnprintf = { [3] = SCA_MSG_FLAGS, /* flags */ }, }, 732 { .name = "recvmsg", .errmsg = true, 733 .arg_scnprintf = { [2] = SCA_MSG_FLAGS, /* flags */ }, }, 734 { .name = "removexattr", .errmsg = true, }, 735 { .name = "renameat", .errmsg = true, 736 .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, }, 737 { .name = "rmdir", .errmsg = true, }, 738 { .name = "rt_sigaction", .errmsg = true, 739 .arg_scnprintf = { [0] = SCA_SIGNUM, /* sig */ }, }, 740 { .name = "rt_sigprocmask", .errmsg = true, STRARRAY(0, how, sighow), }, 741 { .name = "rt_sigqueueinfo", .errmsg = true, 742 .arg_scnprintf = { [1] = SCA_SIGNUM, /* sig */ }, }, 743 { .name = "rt_tgsigqueueinfo", .errmsg = true, 744 .arg_scnprintf = { [2] = SCA_SIGNUM, /* sig */ }, }, 745 { .name = "sched_setscheduler", .errmsg = true, 746 .arg_scnprintf = { [1] = SCA_SCHED_POLICY, /* policy */ }, }, 747 { .name = "seccomp", .errmsg = true, 748 .arg_scnprintf = { [0] = SCA_SECCOMP_OP, /* op */ 749 [1] = SCA_SECCOMP_FLAGS, /* flags */ }, }, 750 { .name = "select", .errmsg = true, .timeout = true, }, 751 { .name = "sendmmsg", .errmsg = true, 752 .arg_scnprintf = { [3] = SCA_MSG_FLAGS, /* flags */ }, }, 753 { .name = "sendmsg", .errmsg = true, 754 .arg_scnprintf = { [2] = SCA_MSG_FLAGS, /* flags */ }, }, 755 { .name = "sendto", .errmsg = true, 756 .arg_scnprintf = { [3] = SCA_MSG_FLAGS, /* flags */ }, }, 757 { .name = "set_tid_address", .errpid = true, }, 758 { .name = "setitimer", .errmsg = true, STRARRAY(0, which, itimers), }, 759 { .name = "setpgid", .errmsg = true, }, 760 { .name = "setrlimit", .errmsg = true, STRARRAY(0, resource, rlimit_resources), }, 761 { .name = "setxattr", .errmsg = true, }, 762 { .name = "shutdown", .errmsg = true, }, 763 { .name = "socket", .errmsg = true, 764 .arg_scnprintf = { [0] = SCA_STRARRAY, /* family */ 765 [1] = SCA_SK_TYPE, /* type */ }, 766 .arg_parm = { [0] = &strarray__socket_families, /* family */ }, }, 767 { .name = "socketpair", .errmsg = true, 768 .arg_scnprintf = { [0] = SCA_STRARRAY, /* family */ 769 [1] = SCA_SK_TYPE, /* type */ }, 770 .arg_parm = { [0] = &strarray__socket_families, /* family */ }, }, 771 { .name = "stat", .errmsg = true, .alias = "newstat", }, 772 { .name = "statfs", .errmsg = true, }, 773 { .name = "swapoff", .errmsg = true, 774 .arg_scnprintf = { [0] = SCA_FILENAME, /* specialfile */ }, }, 775 { .name = "swapon", .errmsg = true, 776 .arg_scnprintf = { [0] = SCA_FILENAME, /* specialfile */ }, }, 777 { .name = "symlinkat", .errmsg = true, 778 .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, }, 779 { .name = "tgkill", .errmsg = true, 780 .arg_scnprintf = { [2] = SCA_SIGNUM, /* sig */ }, }, 781 { .name = "tkill", .errmsg = true, 782 .arg_scnprintf = { [1] = SCA_SIGNUM, /* sig */ }, }, 783 { .name = "truncate", .errmsg = true, }, 784 { .name = "uname", .errmsg = true, .alias = "newuname", }, 785 { .name = "unlinkat", .errmsg = true, 786 .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, }, 787 { .name = "utime", .errmsg = true, }, 788 { .name = "utimensat", .errmsg = true, 789 .arg_scnprintf = { [0] = SCA_FDAT, /* dirfd */ }, }, 790 { .name = "utimes", .errmsg = true, }, 791 { .name = "vmsplice", .errmsg = true, }, 792 { .name = "wait4", .errpid = true, 793 .arg_scnprintf = { [2] = SCA_WAITID_OPTIONS, /* options */ }, }, 794 { .name = "waitid", .errpid = true, 795 .arg_scnprintf = { [3] = SCA_WAITID_OPTIONS, /* options */ }, }, 796 { .name = "write", .errmsg = true, }, 797 { .name = "writev", .errmsg = true, }, 798 }; 799 800 static int syscall_fmt__cmp(const void *name, const void *fmtp) 801 { 802 const struct syscall_fmt *fmt = fmtp; 803 return strcmp(name, fmt->name); 804 } 805 806 static struct syscall_fmt *syscall_fmt__find(const char *name) 807 { 808 const int nmemb = ARRAY_SIZE(syscall_fmts); 809 return bsearch(name, syscall_fmts, nmemb, sizeof(struct syscall_fmt), syscall_fmt__cmp); 810 } 811 812 struct syscall { 813 struct event_format *tp_format; 814 int nr_args; 815 struct format_field *args; 816 const char *name; 817 bool is_exit; 818 struct syscall_fmt *fmt; 819 size_t (**arg_scnprintf)(char *bf, size_t size, struct syscall_arg *arg); 820 void **arg_parm; 821 }; 822 823 static size_t fprintf_duration(unsigned long t, FILE *fp) 824 { 825 double duration = (double)t / NSEC_PER_MSEC; 826 size_t printed = fprintf(fp, "("); 827 828 if (duration >= 1.0) 829 printed += color_fprintf(fp, PERF_COLOR_RED, "%6.3f ms", duration); 830 else if (duration >= 0.01) 831 printed += color_fprintf(fp, PERF_COLOR_YELLOW, "%6.3f ms", duration); 832 else 833 printed += color_fprintf(fp, PERF_COLOR_NORMAL, "%6.3f ms", duration); 834 return printed + fprintf(fp, "): "); 835 } 836 837 /** 838 * filename.ptr: The filename char pointer that will be vfs_getname'd 839 * filename.entry_str_pos: Where to insert the string translated from 840 * filename.ptr by the vfs_getname tracepoint/kprobe. 841 */ 842 struct thread_trace { 843 u64 entry_time; 844 u64 exit_time; 845 bool entry_pending; 846 unsigned long nr_events; 847 unsigned long pfmaj, pfmin; 848 char *entry_str; 849 double runtime_ms; 850 struct { 851 unsigned long ptr; 852 short int entry_str_pos; 853 bool pending_open; 854 unsigned int namelen; 855 char *name; 856 } filename; 857 struct { 858 int max; 859 char **table; 860 } paths; 861 862 struct intlist *syscall_stats; 863 }; 864 865 static struct thread_trace *thread_trace__new(void) 866 { 867 struct thread_trace *ttrace = zalloc(sizeof(struct thread_trace)); 868 869 if (ttrace) 870 ttrace->paths.max = -1; 871 872 ttrace->syscall_stats = intlist__new(NULL); 873 874 return ttrace; 875 } 876 877 static struct thread_trace *thread__trace(struct thread *thread, FILE *fp) 878 { 879 struct thread_trace *ttrace; 880 881 if (thread == NULL) 882 goto fail; 883 884 if (thread__priv(thread) == NULL) 885 thread__set_priv(thread, thread_trace__new()); 886 887 if (thread__priv(thread) == NULL) 888 goto fail; 889 890 ttrace = thread__priv(thread); 891 ++ttrace->nr_events; 892 893 return ttrace; 894 fail: 895 color_fprintf(fp, PERF_COLOR_RED, 896 "WARNING: not enough memory, dropping samples!\n"); 897 return NULL; 898 } 899 900 #define TRACE_PFMAJ (1 << 0) 901 #define TRACE_PFMIN (1 << 1) 902 903 static const size_t trace__entry_str_size = 2048; 904 905 static int trace__set_fd_pathname(struct thread *thread, int fd, const char *pathname) 906 { 907 struct thread_trace *ttrace = thread__priv(thread); 908 909 if (fd > ttrace->paths.max) { 910 char **npath = realloc(ttrace->paths.table, (fd + 1) * sizeof(char *)); 911 912 if (npath == NULL) 913 return -1; 914 915 if (ttrace->paths.max != -1) { 916 memset(npath + ttrace->paths.max + 1, 0, 917 (fd - ttrace->paths.max) * sizeof(char *)); 918 } else { 919 memset(npath, 0, (fd + 1) * sizeof(char *)); 920 } 921 922 ttrace->paths.table = npath; 923 ttrace->paths.max = fd; 924 } 925 926 ttrace->paths.table[fd] = strdup(pathname); 927 928 return ttrace->paths.table[fd] != NULL ? 0 : -1; 929 } 930 931 static int thread__read_fd_path(struct thread *thread, int fd) 932 { 933 char linkname[PATH_MAX], pathname[PATH_MAX]; 934 struct stat st; 935 int ret; 936 937 if (thread->pid_ == thread->tid) { 938 scnprintf(linkname, sizeof(linkname), 939 "/proc/%d/fd/%d", thread->pid_, fd); 940 } else { 941 scnprintf(linkname, sizeof(linkname), 942 "/proc/%d/task/%d/fd/%d", thread->pid_, thread->tid, fd); 943 } 944 945 if (lstat(linkname, &st) < 0 || st.st_size + 1 > (off_t)sizeof(pathname)) 946 return -1; 947 948 ret = readlink(linkname, pathname, sizeof(pathname)); 949 950 if (ret < 0 || ret > st.st_size) 951 return -1; 952 953 pathname[ret] = '\0'; 954 return trace__set_fd_pathname(thread, fd, pathname); 955 } 956 957 static const char *thread__fd_path(struct thread *thread, int fd, 958 struct trace *trace) 959 { 960 struct thread_trace *ttrace = thread__priv(thread); 961 962 if (ttrace == NULL) 963 return NULL; 964 965 if (fd < 0) 966 return NULL; 967 968 if ((fd > ttrace->paths.max || ttrace->paths.table[fd] == NULL)) { 969 if (!trace->live) 970 return NULL; 971 ++trace->stats.proc_getname; 972 if (thread__read_fd_path(thread, fd)) 973 return NULL; 974 } 975 976 return ttrace->paths.table[fd]; 977 } 978 979 static size_t syscall_arg__scnprintf_fd(char *bf, size_t size, 980 struct syscall_arg *arg) 981 { 982 int fd = arg->val; 983 size_t printed = scnprintf(bf, size, "%d", fd); 984 const char *path = thread__fd_path(arg->thread, fd, arg->trace); 985 986 if (path) 987 printed += scnprintf(bf + printed, size - printed, "<%s>", path); 988 989 return printed; 990 } 991 992 static size_t syscall_arg__scnprintf_close_fd(char *bf, size_t size, 993 struct syscall_arg *arg) 994 { 995 int fd = arg->val; 996 size_t printed = syscall_arg__scnprintf_fd(bf, size, arg); 997 struct thread_trace *ttrace = thread__priv(arg->thread); 998 999 if (ttrace && fd >= 0 && fd <= ttrace->paths.max) 1000 zfree(&ttrace->paths.table[fd]); 1001 1002 return printed; 1003 } 1004 1005 static void thread__set_filename_pos(struct thread *thread, const char *bf, 1006 unsigned long ptr) 1007 { 1008 struct thread_trace *ttrace = thread__priv(thread); 1009 1010 ttrace->filename.ptr = ptr; 1011 ttrace->filename.entry_str_pos = bf - ttrace->entry_str; 1012 } 1013 1014 static size_t syscall_arg__scnprintf_filename(char *bf, size_t size, 1015 struct syscall_arg *arg) 1016 { 1017 unsigned long ptr = arg->val; 1018 1019 if (!arg->trace->vfs_getname) 1020 return scnprintf(bf, size, "%#x", ptr); 1021 1022 thread__set_filename_pos(arg->thread, bf, ptr); 1023 return 0; 1024 } 1025 1026 static bool trace__filter_duration(struct trace *trace, double t) 1027 { 1028 return t < (trace->duration_filter * NSEC_PER_MSEC); 1029 } 1030 1031 static size_t trace__fprintf_tstamp(struct trace *trace, u64 tstamp, FILE *fp) 1032 { 1033 double ts = (double)(tstamp - trace->base_time) / NSEC_PER_MSEC; 1034 1035 return fprintf(fp, "%10.3f ", ts); 1036 } 1037 1038 static bool done = false; 1039 static bool interrupted = false; 1040 1041 static void sig_handler(int sig) 1042 { 1043 done = true; 1044 interrupted = sig == SIGINT; 1045 } 1046 1047 static size_t trace__fprintf_entry_head(struct trace *trace, struct thread *thread, 1048 u64 duration, u64 tstamp, FILE *fp) 1049 { 1050 size_t printed = trace__fprintf_tstamp(trace, tstamp, fp); 1051 printed += fprintf_duration(duration, fp); 1052 1053 if (trace->multiple_threads) { 1054 if (trace->show_comm) 1055 printed += fprintf(fp, "%.14s/", thread__comm_str(thread)); 1056 printed += fprintf(fp, "%d ", thread->tid); 1057 } 1058 1059 return printed; 1060 } 1061 1062 static int trace__process_event(struct trace *trace, struct machine *machine, 1063 union perf_event *event, struct perf_sample *sample) 1064 { 1065 int ret = 0; 1066 1067 switch (event->header.type) { 1068 case PERF_RECORD_LOST: 1069 color_fprintf(trace->output, PERF_COLOR_RED, 1070 "LOST %" PRIu64 " events!\n", event->lost.lost); 1071 ret = machine__process_lost_event(machine, event, sample); 1072 break; 1073 default: 1074 ret = machine__process_event(machine, event, sample); 1075 break; 1076 } 1077 1078 return ret; 1079 } 1080 1081 static int trace__tool_process(struct perf_tool *tool, 1082 union perf_event *event, 1083 struct perf_sample *sample, 1084 struct machine *machine) 1085 { 1086 struct trace *trace = container_of(tool, struct trace, tool); 1087 return trace__process_event(trace, machine, event, sample); 1088 } 1089 1090 static char *trace__machine__resolve_kernel_addr(void *vmachine, unsigned long long *addrp, char **modp) 1091 { 1092 struct machine *machine = vmachine; 1093 1094 if (machine->kptr_restrict_warned) 1095 return NULL; 1096 1097 if (symbol_conf.kptr_restrict) { 1098 pr_warning("Kernel address maps (/proc/{kallsyms,modules}) are restricted.\n\n" 1099 "Check /proc/sys/kernel/kptr_restrict.\n\n" 1100 "Kernel samples will not be resolved.\n"); 1101 machine->kptr_restrict_warned = true; 1102 return NULL; 1103 } 1104 1105 return machine__resolve_kernel_addr(vmachine, addrp, modp); 1106 } 1107 1108 static int trace__symbols_init(struct trace *trace, struct perf_evlist *evlist) 1109 { 1110 int err = symbol__init(NULL); 1111 1112 if (err) 1113 return err; 1114 1115 trace->host = machine__new_host(); 1116 if (trace->host == NULL) 1117 return -ENOMEM; 1118 1119 if (trace_event__register_resolver(trace->host, trace__machine__resolve_kernel_addr) < 0) 1120 return -errno; 1121 1122 err = __machine__synthesize_threads(trace->host, &trace->tool, &trace->opts.target, 1123 evlist->threads, trace__tool_process, false, 1124 trace->opts.proc_map_timeout); 1125 if (err) 1126 symbol__exit(); 1127 1128 return err; 1129 } 1130 1131 static int syscall__set_arg_fmts(struct syscall *sc) 1132 { 1133 struct format_field *field; 1134 int idx = 0, len; 1135 1136 sc->arg_scnprintf = calloc(sc->nr_args, sizeof(void *)); 1137 if (sc->arg_scnprintf == NULL) 1138 return -1; 1139 1140 if (sc->fmt) 1141 sc->arg_parm = sc->fmt->arg_parm; 1142 1143 for (field = sc->args; field; field = field->next) { 1144 if (sc->fmt && sc->fmt->arg_scnprintf[idx]) 1145 sc->arg_scnprintf[idx] = sc->fmt->arg_scnprintf[idx]; 1146 else if (strcmp(field->type, "const char *") == 0 && 1147 (strcmp(field->name, "filename") == 0 || 1148 strcmp(field->name, "path") == 0 || 1149 strcmp(field->name, "pathname") == 0)) 1150 sc->arg_scnprintf[idx] = SCA_FILENAME; 1151 else if (field->flags & FIELD_IS_POINTER) 1152 sc->arg_scnprintf[idx] = syscall_arg__scnprintf_hex; 1153 else if (strcmp(field->type, "pid_t") == 0) 1154 sc->arg_scnprintf[idx] = SCA_PID; 1155 else if (strcmp(field->type, "umode_t") == 0) 1156 sc->arg_scnprintf[idx] = SCA_MODE_T; 1157 else if ((strcmp(field->type, "int") == 0 || 1158 strcmp(field->type, "unsigned int") == 0 || 1159 strcmp(field->type, "long") == 0) && 1160 (len = strlen(field->name)) >= 2 && 1161 strcmp(field->name + len - 2, "fd") == 0) { 1162 /* 1163 * /sys/kernel/tracing/events/syscalls/sys_enter* 1164 * egrep 'field:.*fd;' .../format|sed -r 's/.*field:([a-z ]+) [a-z_]*fd.+/\1/g'|sort|uniq -c 1165 * 65 int 1166 * 23 unsigned int 1167 * 7 unsigned long 1168 */ 1169 sc->arg_scnprintf[idx] = SCA_FD; 1170 } 1171 ++idx; 1172 } 1173 1174 return 0; 1175 } 1176 1177 static int trace__read_syscall_info(struct trace *trace, int id) 1178 { 1179 char tp_name[128]; 1180 struct syscall *sc; 1181 const char *name = syscalltbl__name(trace->sctbl, id); 1182 1183 if (name == NULL) 1184 return -1; 1185 1186 if (id > trace->syscalls.max) { 1187 struct syscall *nsyscalls = realloc(trace->syscalls.table, (id + 1) * sizeof(*sc)); 1188 1189 if (nsyscalls == NULL) 1190 return -1; 1191 1192 if (trace->syscalls.max != -1) { 1193 memset(nsyscalls + trace->syscalls.max + 1, 0, 1194 (id - trace->syscalls.max) * sizeof(*sc)); 1195 } else { 1196 memset(nsyscalls, 0, (id + 1) * sizeof(*sc)); 1197 } 1198 1199 trace->syscalls.table = nsyscalls; 1200 trace->syscalls.max = id; 1201 } 1202 1203 sc = trace->syscalls.table + id; 1204 sc->name = name; 1205 1206 sc->fmt = syscall_fmt__find(sc->name); 1207 1208 snprintf(tp_name, sizeof(tp_name), "sys_enter_%s", sc->name); 1209 sc->tp_format = trace_event__tp_format("syscalls", tp_name); 1210 1211 if (IS_ERR(sc->tp_format) && sc->fmt && sc->fmt->alias) { 1212 snprintf(tp_name, sizeof(tp_name), "sys_enter_%s", sc->fmt->alias); 1213 sc->tp_format = trace_event__tp_format("syscalls", tp_name); 1214 } 1215 1216 if (IS_ERR(sc->tp_format)) 1217 return -1; 1218 1219 sc->args = sc->tp_format->format.fields; 1220 sc->nr_args = sc->tp_format->format.nr_fields; 1221 /* 1222 * We need to check and discard the first variable '__syscall_nr' 1223 * or 'nr' that mean the syscall number. It is needless here. 1224 * So drop '__syscall_nr' or 'nr' field but does not exist on older kernels. 1225 */ 1226 if (sc->args && (!strcmp(sc->args->name, "__syscall_nr") || !strcmp(sc->args->name, "nr"))) { 1227 sc->args = sc->args->next; 1228 --sc->nr_args; 1229 } 1230 1231 sc->is_exit = !strcmp(name, "exit_group") || !strcmp(name, "exit"); 1232 1233 return syscall__set_arg_fmts(sc); 1234 } 1235 1236 static int trace__validate_ev_qualifier(struct trace *trace) 1237 { 1238 int err = 0, i; 1239 struct str_node *pos; 1240 1241 trace->ev_qualifier_ids.nr = strlist__nr_entries(trace->ev_qualifier); 1242 trace->ev_qualifier_ids.entries = malloc(trace->ev_qualifier_ids.nr * 1243 sizeof(trace->ev_qualifier_ids.entries[0])); 1244 1245 if (trace->ev_qualifier_ids.entries == NULL) { 1246 fputs("Error:\tNot enough memory for allocating events qualifier ids\n", 1247 trace->output); 1248 err = -EINVAL; 1249 goto out; 1250 } 1251 1252 i = 0; 1253 1254 strlist__for_each_entry(pos, trace->ev_qualifier) { 1255 const char *sc = pos->s; 1256 int id = syscalltbl__id(trace->sctbl, sc); 1257 1258 if (id < 0) { 1259 if (err == 0) { 1260 fputs("Error:\tInvalid syscall ", trace->output); 1261 err = -EINVAL; 1262 } else { 1263 fputs(", ", trace->output); 1264 } 1265 1266 fputs(sc, trace->output); 1267 } 1268 1269 trace->ev_qualifier_ids.entries[i++] = id; 1270 } 1271 1272 if (err < 0) { 1273 fputs("\nHint:\ttry 'perf list syscalls:sys_enter_*'" 1274 "\nHint:\tand: 'man syscalls'\n", trace->output); 1275 zfree(&trace->ev_qualifier_ids.entries); 1276 trace->ev_qualifier_ids.nr = 0; 1277 } 1278 out: 1279 return err; 1280 } 1281 1282 /* 1283 * args is to be interpreted as a series of longs but we need to handle 1284 * 8-byte unaligned accesses. args points to raw_data within the event 1285 * and raw_data is guaranteed to be 8-byte unaligned because it is 1286 * preceded by raw_size which is a u32. So we need to copy args to a temp 1287 * variable to read it. Most notably this avoids extended load instructions 1288 * on unaligned addresses 1289 */ 1290 1291 static size_t syscall__scnprintf_args(struct syscall *sc, char *bf, size_t size, 1292 unsigned char *args, struct trace *trace, 1293 struct thread *thread) 1294 { 1295 size_t printed = 0; 1296 unsigned char *p; 1297 unsigned long val; 1298 1299 if (sc->args != NULL) { 1300 struct format_field *field; 1301 u8 bit = 1; 1302 struct syscall_arg arg = { 1303 .idx = 0, 1304 .mask = 0, 1305 .trace = trace, 1306 .thread = thread, 1307 }; 1308 1309 for (field = sc->args; field; 1310 field = field->next, ++arg.idx, bit <<= 1) { 1311 if (arg.mask & bit) 1312 continue; 1313 1314 /* special care for unaligned accesses */ 1315 p = args + sizeof(unsigned long) * arg.idx; 1316 memcpy(&val, p, sizeof(val)); 1317 1318 /* 1319 * Suppress this argument if its value is zero and 1320 * and we don't have a string associated in an 1321 * strarray for it. 1322 */ 1323 if (val == 0 && 1324 !(sc->arg_scnprintf && 1325 sc->arg_scnprintf[arg.idx] == SCA_STRARRAY && 1326 sc->arg_parm[arg.idx])) 1327 continue; 1328 1329 printed += scnprintf(bf + printed, size - printed, 1330 "%s%s: ", printed ? ", " : "", field->name); 1331 if (sc->arg_scnprintf && sc->arg_scnprintf[arg.idx]) { 1332 arg.val = val; 1333 if (sc->arg_parm) 1334 arg.parm = sc->arg_parm[arg.idx]; 1335 printed += sc->arg_scnprintf[arg.idx](bf + printed, 1336 size - printed, &arg); 1337 } else { 1338 printed += scnprintf(bf + printed, size - printed, 1339 "%ld", val); 1340 } 1341 } 1342 } else if (IS_ERR(sc->tp_format)) { 1343 /* 1344 * If we managed to read the tracepoint /format file, then we 1345 * may end up not having any args, like with gettid(), so only 1346 * print the raw args when we didn't manage to read it. 1347 */ 1348 int i = 0; 1349 1350 while (i < 6) { 1351 /* special care for unaligned accesses */ 1352 p = args + sizeof(unsigned long) * i; 1353 memcpy(&val, p, sizeof(val)); 1354 printed += scnprintf(bf + printed, size - printed, 1355 "%sarg%d: %ld", 1356 printed ? ", " : "", i, val); 1357 ++i; 1358 } 1359 } 1360 1361 return printed; 1362 } 1363 1364 typedef int (*tracepoint_handler)(struct trace *trace, struct perf_evsel *evsel, 1365 union perf_event *event, 1366 struct perf_sample *sample); 1367 1368 static struct syscall *trace__syscall_info(struct trace *trace, 1369 struct perf_evsel *evsel, int id) 1370 { 1371 1372 if (id < 0) { 1373 1374 /* 1375 * XXX: Noticed on x86_64, reproduced as far back as 3.0.36, haven't tried 1376 * before that, leaving at a higher verbosity level till that is 1377 * explained. Reproduced with plain ftrace with: 1378 * 1379 * echo 1 > /t/events/raw_syscalls/sys_exit/enable 1380 * grep "NR -1 " /t/trace_pipe 1381 * 1382 * After generating some load on the machine. 1383 */ 1384 if (verbose > 1) { 1385 static u64 n; 1386 fprintf(trace->output, "Invalid syscall %d id, skipping (%s, %" PRIu64 ") ...\n", 1387 id, perf_evsel__name(evsel), ++n); 1388 } 1389 return NULL; 1390 } 1391 1392 if ((id > trace->syscalls.max || trace->syscalls.table[id].name == NULL) && 1393 trace__read_syscall_info(trace, id)) 1394 goto out_cant_read; 1395 1396 if ((id > trace->syscalls.max || trace->syscalls.table[id].name == NULL)) 1397 goto out_cant_read; 1398 1399 return &trace->syscalls.table[id]; 1400 1401 out_cant_read: 1402 if (verbose) { 1403 fprintf(trace->output, "Problems reading syscall %d", id); 1404 if (id <= trace->syscalls.max && trace->syscalls.table[id].name != NULL) 1405 fprintf(trace->output, "(%s)", trace->syscalls.table[id].name); 1406 fputs(" information\n", trace->output); 1407 } 1408 return NULL; 1409 } 1410 1411 static void thread__update_stats(struct thread_trace *ttrace, 1412 int id, struct perf_sample *sample) 1413 { 1414 struct int_node *inode; 1415 struct stats *stats; 1416 u64 duration = 0; 1417 1418 inode = intlist__findnew(ttrace->syscall_stats, id); 1419 if (inode == NULL) 1420 return; 1421 1422 stats = inode->priv; 1423 if (stats == NULL) { 1424 stats = malloc(sizeof(struct stats)); 1425 if (stats == NULL) 1426 return; 1427 init_stats(stats); 1428 inode->priv = stats; 1429 } 1430 1431 if (ttrace->entry_time && sample->time > ttrace->entry_time) 1432 duration = sample->time - ttrace->entry_time; 1433 1434 update_stats(stats, duration); 1435 } 1436 1437 static int trace__printf_interrupted_entry(struct trace *trace, struct perf_sample *sample) 1438 { 1439 struct thread_trace *ttrace; 1440 u64 duration; 1441 size_t printed; 1442 1443 if (trace->current == NULL) 1444 return 0; 1445 1446 ttrace = thread__priv(trace->current); 1447 1448 if (!ttrace->entry_pending) 1449 return 0; 1450 1451 duration = sample->time - ttrace->entry_time; 1452 1453 printed = trace__fprintf_entry_head(trace, trace->current, duration, sample->time, trace->output); 1454 printed += fprintf(trace->output, "%-70s) ...\n", ttrace->entry_str); 1455 ttrace->entry_pending = false; 1456 1457 return printed; 1458 } 1459 1460 static int trace__sys_enter(struct trace *trace, struct perf_evsel *evsel, 1461 union perf_event *event __maybe_unused, 1462 struct perf_sample *sample) 1463 { 1464 char *msg; 1465 void *args; 1466 size_t printed = 0; 1467 struct thread *thread; 1468 int id = perf_evsel__sc_tp_uint(evsel, id, sample), err = -1; 1469 struct syscall *sc = trace__syscall_info(trace, evsel, id); 1470 struct thread_trace *ttrace; 1471 1472 if (sc == NULL) 1473 return -1; 1474 1475 thread = machine__findnew_thread(trace->host, sample->pid, sample->tid); 1476 ttrace = thread__trace(thread, trace->output); 1477 if (ttrace == NULL) 1478 goto out_put; 1479 1480 args = perf_evsel__sc_tp_ptr(evsel, args, sample); 1481 1482 if (ttrace->entry_str == NULL) { 1483 ttrace->entry_str = malloc(trace__entry_str_size); 1484 if (!ttrace->entry_str) 1485 goto out_put; 1486 } 1487 1488 if (!(trace->duration_filter || trace->summary_only || trace->min_stack)) 1489 trace__printf_interrupted_entry(trace, sample); 1490 1491 ttrace->entry_time = sample->time; 1492 msg = ttrace->entry_str; 1493 printed += scnprintf(msg + printed, trace__entry_str_size - printed, "%s(", sc->name); 1494 1495 printed += syscall__scnprintf_args(sc, msg + printed, trace__entry_str_size - printed, 1496 args, trace, thread); 1497 1498 if (sc->is_exit) { 1499 if (!(trace->duration_filter || trace->summary_only || trace->min_stack)) { 1500 trace__fprintf_entry_head(trace, thread, 1, sample->time, trace->output); 1501 fprintf(trace->output, "%-70s)\n", ttrace->entry_str); 1502 } 1503 } else { 1504 ttrace->entry_pending = true; 1505 /* See trace__vfs_getname & trace__sys_exit */ 1506 ttrace->filename.pending_open = false; 1507 } 1508 1509 if (trace->current != thread) { 1510 thread__put(trace->current); 1511 trace->current = thread__get(thread); 1512 } 1513 err = 0; 1514 out_put: 1515 thread__put(thread); 1516 return err; 1517 } 1518 1519 static int trace__resolve_callchain(struct trace *trace, struct perf_evsel *evsel, 1520 struct perf_sample *sample, 1521 struct callchain_cursor *cursor) 1522 { 1523 struct addr_location al; 1524 1525 if (machine__resolve(trace->host, &al, sample) < 0 || 1526 thread__resolve_callchain(al.thread, cursor, evsel, sample, NULL, NULL, trace->max_stack)) 1527 return -1; 1528 1529 return 0; 1530 } 1531 1532 static int trace__fprintf_callchain(struct trace *trace, struct perf_sample *sample) 1533 { 1534 /* TODO: user-configurable print_opts */ 1535 const unsigned int print_opts = EVSEL__PRINT_SYM | 1536 EVSEL__PRINT_DSO | 1537 EVSEL__PRINT_UNKNOWN_AS_ADDR; 1538 1539 return sample__fprintf_callchain(sample, 38, print_opts, &callchain_cursor, trace->output); 1540 } 1541 1542 static int trace__sys_exit(struct trace *trace, struct perf_evsel *evsel, 1543 union perf_event *event __maybe_unused, 1544 struct perf_sample *sample) 1545 { 1546 long ret; 1547 u64 duration = 0; 1548 struct thread *thread; 1549 int id = perf_evsel__sc_tp_uint(evsel, id, sample), err = -1, callchain_ret = 0; 1550 struct syscall *sc = trace__syscall_info(trace, evsel, id); 1551 struct thread_trace *ttrace; 1552 1553 if (sc == NULL) 1554 return -1; 1555 1556 thread = machine__findnew_thread(trace->host, sample->pid, sample->tid); 1557 ttrace = thread__trace(thread, trace->output); 1558 if (ttrace == NULL) 1559 goto out_put; 1560 1561 if (trace->summary) 1562 thread__update_stats(ttrace, id, sample); 1563 1564 ret = perf_evsel__sc_tp_uint(evsel, ret, sample); 1565 1566 if (id == trace->open_id && ret >= 0 && ttrace->filename.pending_open) { 1567 trace__set_fd_pathname(thread, ret, ttrace->filename.name); 1568 ttrace->filename.pending_open = false; 1569 ++trace->stats.vfs_getname; 1570 } 1571 1572 ttrace->exit_time = sample->time; 1573 1574 if (ttrace->entry_time) { 1575 duration = sample->time - ttrace->entry_time; 1576 if (trace__filter_duration(trace, duration)) 1577 goto out; 1578 } else if (trace->duration_filter) 1579 goto out; 1580 1581 if (sample->callchain) { 1582 callchain_ret = trace__resolve_callchain(trace, evsel, sample, &callchain_cursor); 1583 if (callchain_ret == 0) { 1584 if (callchain_cursor.nr < trace->min_stack) 1585 goto out; 1586 callchain_ret = 1; 1587 } 1588 } 1589 1590 if (trace->summary_only) 1591 goto out; 1592 1593 trace__fprintf_entry_head(trace, thread, duration, sample->time, trace->output); 1594 1595 if (ttrace->entry_pending) { 1596 fprintf(trace->output, "%-70s", ttrace->entry_str); 1597 } else { 1598 fprintf(trace->output, " ... ["); 1599 color_fprintf(trace->output, PERF_COLOR_YELLOW, "continued"); 1600 fprintf(trace->output, "]: %s()", sc->name); 1601 } 1602 1603 if (sc->fmt == NULL) { 1604 signed_print: 1605 fprintf(trace->output, ") = %ld", ret); 1606 } else if (ret < 0 && (sc->fmt->errmsg || sc->fmt->errpid)) { 1607 char bf[STRERR_BUFSIZE]; 1608 const char *emsg = str_error_r(-ret, bf, sizeof(bf)), 1609 *e = audit_errno_to_name(-ret); 1610 1611 fprintf(trace->output, ") = -1 %s %s", e, emsg); 1612 } else if (ret == 0 && sc->fmt->timeout) 1613 fprintf(trace->output, ") = 0 Timeout"); 1614 else if (sc->fmt->hexret) 1615 fprintf(trace->output, ") = %#lx", ret); 1616 else if (sc->fmt->errpid) { 1617 struct thread *child = machine__find_thread(trace->host, ret, ret); 1618 1619 if (child != NULL) { 1620 fprintf(trace->output, ") = %ld", ret); 1621 if (child->comm_set) 1622 fprintf(trace->output, " (%s)", thread__comm_str(child)); 1623 thread__put(child); 1624 } 1625 } else 1626 goto signed_print; 1627 1628 fputc('\n', trace->output); 1629 1630 if (callchain_ret > 0) 1631 trace__fprintf_callchain(trace, sample); 1632 else if (callchain_ret < 0) 1633 pr_err("Problem processing %s callchain, skipping...\n", perf_evsel__name(evsel)); 1634 out: 1635 ttrace->entry_pending = false; 1636 err = 0; 1637 out_put: 1638 thread__put(thread); 1639 return err; 1640 } 1641 1642 static int trace__vfs_getname(struct trace *trace, struct perf_evsel *evsel, 1643 union perf_event *event __maybe_unused, 1644 struct perf_sample *sample) 1645 { 1646 struct thread *thread = machine__findnew_thread(trace->host, sample->pid, sample->tid); 1647 struct thread_trace *ttrace; 1648 size_t filename_len, entry_str_len, to_move; 1649 ssize_t remaining_space; 1650 char *pos; 1651 const char *filename = perf_evsel__rawptr(evsel, sample, "pathname"); 1652 1653 if (!thread) 1654 goto out; 1655 1656 ttrace = thread__priv(thread); 1657 if (!ttrace) 1658 goto out; 1659 1660 filename_len = strlen(filename); 1661 1662 if (ttrace->filename.namelen < filename_len) { 1663 char *f = realloc(ttrace->filename.name, filename_len + 1); 1664 1665 if (f == NULL) 1666 goto out; 1667 1668 ttrace->filename.namelen = filename_len; 1669 ttrace->filename.name = f; 1670 } 1671 1672 strcpy(ttrace->filename.name, filename); 1673 ttrace->filename.pending_open = true; 1674 1675 if (!ttrace->filename.ptr) 1676 goto out; 1677 1678 entry_str_len = strlen(ttrace->entry_str); 1679 remaining_space = trace__entry_str_size - entry_str_len - 1; /* \0 */ 1680 if (remaining_space <= 0) 1681 goto out; 1682 1683 if (filename_len > (size_t)remaining_space) { 1684 filename += filename_len - remaining_space; 1685 filename_len = remaining_space; 1686 } 1687 1688 to_move = entry_str_len - ttrace->filename.entry_str_pos + 1; /* \0 */ 1689 pos = ttrace->entry_str + ttrace->filename.entry_str_pos; 1690 memmove(pos + filename_len, pos, to_move); 1691 memcpy(pos, filename, filename_len); 1692 1693 ttrace->filename.ptr = 0; 1694 ttrace->filename.entry_str_pos = 0; 1695 out: 1696 return 0; 1697 } 1698 1699 static int trace__sched_stat_runtime(struct trace *trace, struct perf_evsel *evsel, 1700 union perf_event *event __maybe_unused, 1701 struct perf_sample *sample) 1702 { 1703 u64 runtime = perf_evsel__intval(evsel, sample, "runtime"); 1704 double runtime_ms = (double)runtime / NSEC_PER_MSEC; 1705 struct thread *thread = machine__findnew_thread(trace->host, 1706 sample->pid, 1707 sample->tid); 1708 struct thread_trace *ttrace = thread__trace(thread, trace->output); 1709 1710 if (ttrace == NULL) 1711 goto out_dump; 1712 1713 ttrace->runtime_ms += runtime_ms; 1714 trace->runtime_ms += runtime_ms; 1715 thread__put(thread); 1716 return 0; 1717 1718 out_dump: 1719 fprintf(trace->output, "%s: comm=%s,pid=%u,runtime=%" PRIu64 ",vruntime=%" PRIu64 ")\n", 1720 evsel->name, 1721 perf_evsel__strval(evsel, sample, "comm"), 1722 (pid_t)perf_evsel__intval(evsel, sample, "pid"), 1723 runtime, 1724 perf_evsel__intval(evsel, sample, "vruntime")); 1725 thread__put(thread); 1726 return 0; 1727 } 1728 1729 static void bpf_output__printer(enum binary_printer_ops op, 1730 unsigned int val, void *extra) 1731 { 1732 FILE *output = extra; 1733 unsigned char ch = (unsigned char)val; 1734 1735 switch (op) { 1736 case BINARY_PRINT_CHAR_DATA: 1737 fprintf(output, "%c", isprint(ch) ? ch : '.'); 1738 break; 1739 case BINARY_PRINT_DATA_BEGIN: 1740 case BINARY_PRINT_LINE_BEGIN: 1741 case BINARY_PRINT_ADDR: 1742 case BINARY_PRINT_NUM_DATA: 1743 case BINARY_PRINT_NUM_PAD: 1744 case BINARY_PRINT_SEP: 1745 case BINARY_PRINT_CHAR_PAD: 1746 case BINARY_PRINT_LINE_END: 1747 case BINARY_PRINT_DATA_END: 1748 default: 1749 break; 1750 } 1751 } 1752 1753 static void bpf_output__fprintf(struct trace *trace, 1754 struct perf_sample *sample) 1755 { 1756 print_binary(sample->raw_data, sample->raw_size, 8, 1757 bpf_output__printer, trace->output); 1758 } 1759 1760 static int trace__event_handler(struct trace *trace, struct perf_evsel *evsel, 1761 union perf_event *event __maybe_unused, 1762 struct perf_sample *sample) 1763 { 1764 int callchain_ret = 0; 1765 1766 if (sample->callchain) { 1767 callchain_ret = trace__resolve_callchain(trace, evsel, sample, &callchain_cursor); 1768 if (callchain_ret == 0) { 1769 if (callchain_cursor.nr < trace->min_stack) 1770 goto out; 1771 callchain_ret = 1; 1772 } 1773 } 1774 1775 trace__printf_interrupted_entry(trace, sample); 1776 trace__fprintf_tstamp(trace, sample->time, trace->output); 1777 1778 if (trace->trace_syscalls) 1779 fprintf(trace->output, "( ): "); 1780 1781 fprintf(trace->output, "%s:", evsel->name); 1782 1783 if (perf_evsel__is_bpf_output(evsel)) { 1784 bpf_output__fprintf(trace, sample); 1785 } else if (evsel->tp_format) { 1786 event_format__fprintf(evsel->tp_format, sample->cpu, 1787 sample->raw_data, sample->raw_size, 1788 trace->output); 1789 } 1790 1791 fprintf(trace->output, ")\n"); 1792 1793 if (callchain_ret > 0) 1794 trace__fprintf_callchain(trace, sample); 1795 else if (callchain_ret < 0) 1796 pr_err("Problem processing %s callchain, skipping...\n", perf_evsel__name(evsel)); 1797 out: 1798 return 0; 1799 } 1800 1801 static void print_location(FILE *f, struct perf_sample *sample, 1802 struct addr_location *al, 1803 bool print_dso, bool print_sym) 1804 { 1805 1806 if ((verbose || print_dso) && al->map) 1807 fprintf(f, "%s@", al->map->dso->long_name); 1808 1809 if ((verbose || print_sym) && al->sym) 1810 fprintf(f, "%s+0x%" PRIx64, al->sym->name, 1811 al->addr - al->sym->start); 1812 else if (al->map) 1813 fprintf(f, "0x%" PRIx64, al->addr); 1814 else 1815 fprintf(f, "0x%" PRIx64, sample->addr); 1816 } 1817 1818 static int trace__pgfault(struct trace *trace, 1819 struct perf_evsel *evsel, 1820 union perf_event *event __maybe_unused, 1821 struct perf_sample *sample) 1822 { 1823 struct thread *thread; 1824 struct addr_location al; 1825 char map_type = 'd'; 1826 struct thread_trace *ttrace; 1827 int err = -1; 1828 int callchain_ret = 0; 1829 1830 thread = machine__findnew_thread(trace->host, sample->pid, sample->tid); 1831 1832 if (sample->callchain) { 1833 callchain_ret = trace__resolve_callchain(trace, evsel, sample, &callchain_cursor); 1834 if (callchain_ret == 0) { 1835 if (callchain_cursor.nr < trace->min_stack) 1836 goto out_put; 1837 callchain_ret = 1; 1838 } 1839 } 1840 1841 ttrace = thread__trace(thread, trace->output); 1842 if (ttrace == NULL) 1843 goto out_put; 1844 1845 if (evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ) 1846 ttrace->pfmaj++; 1847 else 1848 ttrace->pfmin++; 1849 1850 if (trace->summary_only) 1851 goto out; 1852 1853 thread__find_addr_location(thread, sample->cpumode, MAP__FUNCTION, 1854 sample->ip, &al); 1855 1856 trace__fprintf_entry_head(trace, thread, 0, sample->time, trace->output); 1857 1858 fprintf(trace->output, "%sfault [", 1859 evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ ? 1860 "maj" : "min"); 1861 1862 print_location(trace->output, sample, &al, false, true); 1863 1864 fprintf(trace->output, "] => "); 1865 1866 thread__find_addr_location(thread, sample->cpumode, MAP__VARIABLE, 1867 sample->addr, &al); 1868 1869 if (!al.map) { 1870 thread__find_addr_location(thread, sample->cpumode, 1871 MAP__FUNCTION, sample->addr, &al); 1872 1873 if (al.map) 1874 map_type = 'x'; 1875 else 1876 map_type = '?'; 1877 } 1878 1879 print_location(trace->output, sample, &al, true, false); 1880 1881 fprintf(trace->output, " (%c%c)\n", map_type, al.level); 1882 1883 if (callchain_ret > 0) 1884 trace__fprintf_callchain(trace, sample); 1885 else if (callchain_ret < 0) 1886 pr_err("Problem processing %s callchain, skipping...\n", perf_evsel__name(evsel)); 1887 out: 1888 err = 0; 1889 out_put: 1890 thread__put(thread); 1891 return err; 1892 } 1893 1894 static bool skip_sample(struct trace *trace, struct perf_sample *sample) 1895 { 1896 if ((trace->pid_list && intlist__find(trace->pid_list, sample->pid)) || 1897 (trace->tid_list && intlist__find(trace->tid_list, sample->tid))) 1898 return false; 1899 1900 if (trace->pid_list || trace->tid_list) 1901 return true; 1902 1903 return false; 1904 } 1905 1906 static void trace__set_base_time(struct trace *trace, 1907 struct perf_evsel *evsel, 1908 struct perf_sample *sample) 1909 { 1910 /* 1911 * BPF events were not setting PERF_SAMPLE_TIME, so be more robust 1912 * and don't use sample->time unconditionally, we may end up having 1913 * some other event in the future without PERF_SAMPLE_TIME for good 1914 * reason, i.e. we may not be interested in its timestamps, just in 1915 * it taking place, picking some piece of information when it 1916 * appears in our event stream (vfs_getname comes to mind). 1917 */ 1918 if (trace->base_time == 0 && !trace->full_time && 1919 (evsel->attr.sample_type & PERF_SAMPLE_TIME)) 1920 trace->base_time = sample->time; 1921 } 1922 1923 static int trace__process_sample(struct perf_tool *tool, 1924 union perf_event *event, 1925 struct perf_sample *sample, 1926 struct perf_evsel *evsel, 1927 struct machine *machine __maybe_unused) 1928 { 1929 struct trace *trace = container_of(tool, struct trace, tool); 1930 int err = 0; 1931 1932 tracepoint_handler handler = evsel->handler; 1933 1934 if (skip_sample(trace, sample)) 1935 return 0; 1936 1937 trace__set_base_time(trace, evsel, sample); 1938 1939 if (handler) { 1940 ++trace->nr_events; 1941 handler(trace, evsel, event, sample); 1942 } 1943 1944 return err; 1945 } 1946 1947 static int parse_target_str(struct trace *trace) 1948 { 1949 if (trace->opts.target.pid) { 1950 trace->pid_list = intlist__new(trace->opts.target.pid); 1951 if (trace->pid_list == NULL) { 1952 pr_err("Error parsing process id string\n"); 1953 return -EINVAL; 1954 } 1955 } 1956 1957 if (trace->opts.target.tid) { 1958 trace->tid_list = intlist__new(trace->opts.target.tid); 1959 if (trace->tid_list == NULL) { 1960 pr_err("Error parsing thread id string\n"); 1961 return -EINVAL; 1962 } 1963 } 1964 1965 return 0; 1966 } 1967 1968 static int trace__record(struct trace *trace, int argc, const char **argv) 1969 { 1970 unsigned int rec_argc, i, j; 1971 const char **rec_argv; 1972 const char * const record_args[] = { 1973 "record", 1974 "-R", 1975 "-m", "1024", 1976 "-c", "1", 1977 }; 1978 1979 const char * const sc_args[] = { "-e", }; 1980 unsigned int sc_args_nr = ARRAY_SIZE(sc_args); 1981 const char * const majpf_args[] = { "-e", "major-faults" }; 1982 unsigned int majpf_args_nr = ARRAY_SIZE(majpf_args); 1983 const char * const minpf_args[] = { "-e", "minor-faults" }; 1984 unsigned int minpf_args_nr = ARRAY_SIZE(minpf_args); 1985 1986 /* +1 is for the event string below */ 1987 rec_argc = ARRAY_SIZE(record_args) + sc_args_nr + 1 + 1988 majpf_args_nr + minpf_args_nr + argc; 1989 rec_argv = calloc(rec_argc + 1, sizeof(char *)); 1990 1991 if (rec_argv == NULL) 1992 return -ENOMEM; 1993 1994 j = 0; 1995 for (i = 0; i < ARRAY_SIZE(record_args); i++) 1996 rec_argv[j++] = record_args[i]; 1997 1998 if (trace->trace_syscalls) { 1999 for (i = 0; i < sc_args_nr; i++) 2000 rec_argv[j++] = sc_args[i]; 2001 2002 /* event string may be different for older kernels - e.g., RHEL6 */ 2003 if (is_valid_tracepoint("raw_syscalls:sys_enter")) 2004 rec_argv[j++] = "raw_syscalls:sys_enter,raw_syscalls:sys_exit"; 2005 else if (is_valid_tracepoint("syscalls:sys_enter")) 2006 rec_argv[j++] = "syscalls:sys_enter,syscalls:sys_exit"; 2007 else { 2008 pr_err("Neither raw_syscalls nor syscalls events exist.\n"); 2009 return -1; 2010 } 2011 } 2012 2013 if (trace->trace_pgfaults & TRACE_PFMAJ) 2014 for (i = 0; i < majpf_args_nr; i++) 2015 rec_argv[j++] = majpf_args[i]; 2016 2017 if (trace->trace_pgfaults & TRACE_PFMIN) 2018 for (i = 0; i < minpf_args_nr; i++) 2019 rec_argv[j++] = minpf_args[i]; 2020 2021 for (i = 0; i < (unsigned int)argc; i++) 2022 rec_argv[j++] = argv[i]; 2023 2024 return cmd_record(j, rec_argv, NULL); 2025 } 2026 2027 static size_t trace__fprintf_thread_summary(struct trace *trace, FILE *fp); 2028 2029 static bool perf_evlist__add_vfs_getname(struct perf_evlist *evlist) 2030 { 2031 struct perf_evsel *evsel = perf_evsel__newtp("probe", "vfs_getname"); 2032 2033 if (IS_ERR(evsel)) 2034 return false; 2035 2036 if (perf_evsel__field(evsel, "pathname") == NULL) { 2037 perf_evsel__delete(evsel); 2038 return false; 2039 } 2040 2041 evsel->handler = trace__vfs_getname; 2042 perf_evlist__add(evlist, evsel); 2043 return true; 2044 } 2045 2046 static struct perf_evsel *perf_evsel__new_pgfault(u64 config) 2047 { 2048 struct perf_evsel *evsel; 2049 struct perf_event_attr attr = { 2050 .type = PERF_TYPE_SOFTWARE, 2051 .mmap_data = 1, 2052 }; 2053 2054 attr.config = config; 2055 attr.sample_period = 1; 2056 2057 event_attr_init(&attr); 2058 2059 evsel = perf_evsel__new(&attr); 2060 if (evsel) 2061 evsel->handler = trace__pgfault; 2062 2063 return evsel; 2064 } 2065 2066 static void trace__handle_event(struct trace *trace, union perf_event *event, struct perf_sample *sample) 2067 { 2068 const u32 type = event->header.type; 2069 struct perf_evsel *evsel; 2070 2071 if (type != PERF_RECORD_SAMPLE) { 2072 trace__process_event(trace, trace->host, event, sample); 2073 return; 2074 } 2075 2076 evsel = perf_evlist__id2evsel(trace->evlist, sample->id); 2077 if (evsel == NULL) { 2078 fprintf(trace->output, "Unknown tp ID %" PRIu64 ", skipping...\n", sample->id); 2079 return; 2080 } 2081 2082 trace__set_base_time(trace, evsel, sample); 2083 2084 if (evsel->attr.type == PERF_TYPE_TRACEPOINT && 2085 sample->raw_data == NULL) { 2086 fprintf(trace->output, "%s sample with no payload for tid: %d, cpu %d, raw_size=%d, skipping...\n", 2087 perf_evsel__name(evsel), sample->tid, 2088 sample->cpu, sample->raw_size); 2089 } else { 2090 tracepoint_handler handler = evsel->handler; 2091 handler(trace, evsel, event, sample); 2092 } 2093 } 2094 2095 static int trace__add_syscall_newtp(struct trace *trace) 2096 { 2097 int ret = -1; 2098 struct perf_evlist *evlist = trace->evlist; 2099 struct perf_evsel *sys_enter, *sys_exit; 2100 2101 sys_enter = perf_evsel__syscall_newtp("sys_enter", trace__sys_enter); 2102 if (sys_enter == NULL) 2103 goto out; 2104 2105 if (perf_evsel__init_sc_tp_ptr_field(sys_enter, args)) 2106 goto out_delete_sys_enter; 2107 2108 sys_exit = perf_evsel__syscall_newtp("sys_exit", trace__sys_exit); 2109 if (sys_exit == NULL) 2110 goto out_delete_sys_enter; 2111 2112 if (perf_evsel__init_sc_tp_uint_field(sys_exit, ret)) 2113 goto out_delete_sys_exit; 2114 2115 perf_evlist__add(evlist, sys_enter); 2116 perf_evlist__add(evlist, sys_exit); 2117 2118 if (callchain_param.enabled && !trace->kernel_syscallchains) { 2119 /* 2120 * We're interested only in the user space callchain 2121 * leading to the syscall, allow overriding that for 2122 * debugging reasons using --kernel_syscall_callchains 2123 */ 2124 sys_exit->attr.exclude_callchain_kernel = 1; 2125 } 2126 2127 trace->syscalls.events.sys_enter = sys_enter; 2128 trace->syscalls.events.sys_exit = sys_exit; 2129 2130 ret = 0; 2131 out: 2132 return ret; 2133 2134 out_delete_sys_exit: 2135 perf_evsel__delete_priv(sys_exit); 2136 out_delete_sys_enter: 2137 perf_evsel__delete_priv(sys_enter); 2138 goto out; 2139 } 2140 2141 static int trace__set_ev_qualifier_filter(struct trace *trace) 2142 { 2143 int err = -1; 2144 char *filter = asprintf_expr_inout_ints("id", !trace->not_ev_qualifier, 2145 trace->ev_qualifier_ids.nr, 2146 trace->ev_qualifier_ids.entries); 2147 2148 if (filter == NULL) 2149 goto out_enomem; 2150 2151 if (!perf_evsel__append_filter(trace->syscalls.events.sys_enter, "&&", filter)) 2152 err = perf_evsel__append_filter(trace->syscalls.events.sys_exit, "&&", filter); 2153 2154 free(filter); 2155 out: 2156 return err; 2157 out_enomem: 2158 errno = ENOMEM; 2159 goto out; 2160 } 2161 2162 static int trace__run(struct trace *trace, int argc, const char **argv) 2163 { 2164 struct perf_evlist *evlist = trace->evlist; 2165 struct perf_evsel *evsel, *pgfault_maj = NULL, *pgfault_min = NULL; 2166 int err = -1, i; 2167 unsigned long before; 2168 const bool forks = argc > 0; 2169 bool draining = false; 2170 2171 trace->live = true; 2172 2173 if (trace->trace_syscalls && trace__add_syscall_newtp(trace)) 2174 goto out_error_raw_syscalls; 2175 2176 if (trace->trace_syscalls) 2177 trace->vfs_getname = perf_evlist__add_vfs_getname(evlist); 2178 2179 if ((trace->trace_pgfaults & TRACE_PFMAJ)) { 2180 pgfault_maj = perf_evsel__new_pgfault(PERF_COUNT_SW_PAGE_FAULTS_MAJ); 2181 if (pgfault_maj == NULL) 2182 goto out_error_mem; 2183 perf_evlist__add(evlist, pgfault_maj); 2184 } 2185 2186 if ((trace->trace_pgfaults & TRACE_PFMIN)) { 2187 pgfault_min = perf_evsel__new_pgfault(PERF_COUNT_SW_PAGE_FAULTS_MIN); 2188 if (pgfault_min == NULL) 2189 goto out_error_mem; 2190 perf_evlist__add(evlist, pgfault_min); 2191 } 2192 2193 if (trace->sched && 2194 perf_evlist__add_newtp(evlist, "sched", "sched_stat_runtime", 2195 trace__sched_stat_runtime)) 2196 goto out_error_sched_stat_runtime; 2197 2198 err = perf_evlist__create_maps(evlist, &trace->opts.target); 2199 if (err < 0) { 2200 fprintf(trace->output, "Problems parsing the target to trace, check your options!\n"); 2201 goto out_delete_evlist; 2202 } 2203 2204 err = trace__symbols_init(trace, evlist); 2205 if (err < 0) { 2206 fprintf(trace->output, "Problems initializing symbol libraries!\n"); 2207 goto out_delete_evlist; 2208 } 2209 2210 perf_evlist__config(evlist, &trace->opts, NULL); 2211 2212 if (callchain_param.enabled) { 2213 bool use_identifier = false; 2214 2215 if (trace->syscalls.events.sys_exit) { 2216 perf_evsel__config_callchain(trace->syscalls.events.sys_exit, 2217 &trace->opts, &callchain_param); 2218 use_identifier = true; 2219 } 2220 2221 if (pgfault_maj) { 2222 perf_evsel__config_callchain(pgfault_maj, &trace->opts, &callchain_param); 2223 use_identifier = true; 2224 } 2225 2226 if (pgfault_min) { 2227 perf_evsel__config_callchain(pgfault_min, &trace->opts, &callchain_param); 2228 use_identifier = true; 2229 } 2230 2231 if (use_identifier) { 2232 /* 2233 * Now we have evsels with different sample_ids, use 2234 * PERF_SAMPLE_IDENTIFIER to map from sample to evsel 2235 * from a fixed position in each ring buffer record. 2236 * 2237 * As of this the changeset introducing this comment, this 2238 * isn't strictly needed, as the fields that can come before 2239 * PERF_SAMPLE_ID are all used, but we'll probably disable 2240 * some of those for things like copying the payload of 2241 * pointer syscall arguments, and for vfs_getname we don't 2242 * need PERF_SAMPLE_ADDR and PERF_SAMPLE_IP, so do this 2243 * here as a warning we need to use PERF_SAMPLE_IDENTIFIER. 2244 */ 2245 perf_evlist__set_sample_bit(evlist, IDENTIFIER); 2246 perf_evlist__reset_sample_bit(evlist, ID); 2247 } 2248 } 2249 2250 signal(SIGCHLD, sig_handler); 2251 signal(SIGINT, sig_handler); 2252 2253 if (forks) { 2254 err = perf_evlist__prepare_workload(evlist, &trace->opts.target, 2255 argv, false, NULL); 2256 if (err < 0) { 2257 fprintf(trace->output, "Couldn't run the workload!\n"); 2258 goto out_delete_evlist; 2259 } 2260 } 2261 2262 err = perf_evlist__open(evlist); 2263 if (err < 0) 2264 goto out_error_open; 2265 2266 err = bpf__apply_obj_config(); 2267 if (err) { 2268 char errbuf[BUFSIZ]; 2269 2270 bpf__strerror_apply_obj_config(err, errbuf, sizeof(errbuf)); 2271 pr_err("ERROR: Apply config to BPF failed: %s\n", 2272 errbuf); 2273 goto out_error_open; 2274 } 2275 2276 /* 2277 * Better not use !target__has_task() here because we need to cover the 2278 * case where no threads were specified in the command line, but a 2279 * workload was, and in that case we will fill in the thread_map when 2280 * we fork the workload in perf_evlist__prepare_workload. 2281 */ 2282 if (trace->filter_pids.nr > 0) 2283 err = perf_evlist__set_filter_pids(evlist, trace->filter_pids.nr, trace->filter_pids.entries); 2284 else if (thread_map__pid(evlist->threads, 0) == -1) 2285 err = perf_evlist__set_filter_pid(evlist, getpid()); 2286 2287 if (err < 0) 2288 goto out_error_mem; 2289 2290 if (trace->ev_qualifier_ids.nr > 0) { 2291 err = trace__set_ev_qualifier_filter(trace); 2292 if (err < 0) 2293 goto out_errno; 2294 2295 pr_debug("event qualifier tracepoint filter: %s\n", 2296 trace->syscalls.events.sys_exit->filter); 2297 } 2298 2299 err = perf_evlist__apply_filters(evlist, &evsel); 2300 if (err < 0) 2301 goto out_error_apply_filters; 2302 2303 err = perf_evlist__mmap(evlist, trace->opts.mmap_pages, false); 2304 if (err < 0) 2305 goto out_error_mmap; 2306 2307 if (!target__none(&trace->opts.target)) 2308 perf_evlist__enable(evlist); 2309 2310 if (forks) 2311 perf_evlist__start_workload(evlist); 2312 2313 trace->multiple_threads = thread_map__pid(evlist->threads, 0) == -1 || 2314 evlist->threads->nr > 1 || 2315 perf_evlist__first(evlist)->attr.inherit; 2316 again: 2317 before = trace->nr_events; 2318 2319 for (i = 0; i < evlist->nr_mmaps; i++) { 2320 union perf_event *event; 2321 2322 while ((event = perf_evlist__mmap_read(evlist, i)) != NULL) { 2323 struct perf_sample sample; 2324 2325 ++trace->nr_events; 2326 2327 err = perf_evlist__parse_sample(evlist, event, &sample); 2328 if (err) { 2329 fprintf(trace->output, "Can't parse sample, err = %d, skipping...\n", err); 2330 goto next_event; 2331 } 2332 2333 trace__handle_event(trace, event, &sample); 2334 next_event: 2335 perf_evlist__mmap_consume(evlist, i); 2336 2337 if (interrupted) 2338 goto out_disable; 2339 2340 if (done && !draining) { 2341 perf_evlist__disable(evlist); 2342 draining = true; 2343 } 2344 } 2345 } 2346 2347 if (trace->nr_events == before) { 2348 int timeout = done ? 100 : -1; 2349 2350 if (!draining && perf_evlist__poll(evlist, timeout) > 0) { 2351 if (perf_evlist__filter_pollfd(evlist, POLLERR | POLLHUP) == 0) 2352 draining = true; 2353 2354 goto again; 2355 } 2356 } else { 2357 goto again; 2358 } 2359 2360 out_disable: 2361 thread__zput(trace->current); 2362 2363 perf_evlist__disable(evlist); 2364 2365 if (!err) { 2366 if (trace->summary) 2367 trace__fprintf_thread_summary(trace, trace->output); 2368 2369 if (trace->show_tool_stats) { 2370 fprintf(trace->output, "Stats:\n " 2371 " vfs_getname : %" PRIu64 "\n" 2372 " proc_getname: %" PRIu64 "\n", 2373 trace->stats.vfs_getname, 2374 trace->stats.proc_getname); 2375 } 2376 } 2377 2378 out_delete_evlist: 2379 perf_evlist__delete(evlist); 2380 trace->evlist = NULL; 2381 trace->live = false; 2382 return err; 2383 { 2384 char errbuf[BUFSIZ]; 2385 2386 out_error_sched_stat_runtime: 2387 tracing_path__strerror_open_tp(errno, errbuf, sizeof(errbuf), "sched", "sched_stat_runtime"); 2388 goto out_error; 2389 2390 out_error_raw_syscalls: 2391 tracing_path__strerror_open_tp(errno, errbuf, sizeof(errbuf), "raw_syscalls", "sys_(enter|exit)"); 2392 goto out_error; 2393 2394 out_error_mmap: 2395 perf_evlist__strerror_mmap(evlist, errno, errbuf, sizeof(errbuf)); 2396 goto out_error; 2397 2398 out_error_open: 2399 perf_evlist__strerror_open(evlist, errno, errbuf, sizeof(errbuf)); 2400 2401 out_error: 2402 fprintf(trace->output, "%s\n", errbuf); 2403 goto out_delete_evlist; 2404 2405 out_error_apply_filters: 2406 fprintf(trace->output, 2407 "Failed to set filter \"%s\" on event %s with %d (%s)\n", 2408 evsel->filter, perf_evsel__name(evsel), errno, 2409 str_error_r(errno, errbuf, sizeof(errbuf))); 2410 goto out_delete_evlist; 2411 } 2412 out_error_mem: 2413 fprintf(trace->output, "Not enough memory to run!\n"); 2414 goto out_delete_evlist; 2415 2416 out_errno: 2417 fprintf(trace->output, "errno=%d,%s\n", errno, strerror(errno)); 2418 goto out_delete_evlist; 2419 } 2420 2421 static int trace__replay(struct trace *trace) 2422 { 2423 const struct perf_evsel_str_handler handlers[] = { 2424 { "probe:vfs_getname", trace__vfs_getname, }, 2425 }; 2426 struct perf_data_file file = { 2427 .path = input_name, 2428 .mode = PERF_DATA_MODE_READ, 2429 .force = trace->force, 2430 }; 2431 struct perf_session *session; 2432 struct perf_evsel *evsel; 2433 int err = -1; 2434 2435 trace->tool.sample = trace__process_sample; 2436 trace->tool.mmap = perf_event__process_mmap; 2437 trace->tool.mmap2 = perf_event__process_mmap2; 2438 trace->tool.comm = perf_event__process_comm; 2439 trace->tool.exit = perf_event__process_exit; 2440 trace->tool.fork = perf_event__process_fork; 2441 trace->tool.attr = perf_event__process_attr; 2442 trace->tool.tracing_data = perf_event__process_tracing_data; 2443 trace->tool.build_id = perf_event__process_build_id; 2444 2445 trace->tool.ordered_events = true; 2446 trace->tool.ordering_requires_timestamps = true; 2447 2448 /* add tid to output */ 2449 trace->multiple_threads = true; 2450 2451 session = perf_session__new(&file, false, &trace->tool); 2452 if (session == NULL) 2453 return -1; 2454 2455 if (symbol__init(&session->header.env) < 0) 2456 goto out; 2457 2458 trace->host = &session->machines.host; 2459 2460 err = perf_session__set_tracepoints_handlers(session, handlers); 2461 if (err) 2462 goto out; 2463 2464 evsel = perf_evlist__find_tracepoint_by_name(session->evlist, 2465 "raw_syscalls:sys_enter"); 2466 /* older kernels have syscalls tp versus raw_syscalls */ 2467 if (evsel == NULL) 2468 evsel = perf_evlist__find_tracepoint_by_name(session->evlist, 2469 "syscalls:sys_enter"); 2470 2471 if (evsel && 2472 (perf_evsel__init_syscall_tp(evsel, trace__sys_enter) < 0 || 2473 perf_evsel__init_sc_tp_ptr_field(evsel, args))) { 2474 pr_err("Error during initialize raw_syscalls:sys_enter event\n"); 2475 goto out; 2476 } 2477 2478 evsel = perf_evlist__find_tracepoint_by_name(session->evlist, 2479 "raw_syscalls:sys_exit"); 2480 if (evsel == NULL) 2481 evsel = perf_evlist__find_tracepoint_by_name(session->evlist, 2482 "syscalls:sys_exit"); 2483 if (evsel && 2484 (perf_evsel__init_syscall_tp(evsel, trace__sys_exit) < 0 || 2485 perf_evsel__init_sc_tp_uint_field(evsel, ret))) { 2486 pr_err("Error during initialize raw_syscalls:sys_exit event\n"); 2487 goto out; 2488 } 2489 2490 evlist__for_each_entry(session->evlist, evsel) { 2491 if (evsel->attr.type == PERF_TYPE_SOFTWARE && 2492 (evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ || 2493 evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MIN || 2494 evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS)) 2495 evsel->handler = trace__pgfault; 2496 } 2497 2498 err = parse_target_str(trace); 2499 if (err != 0) 2500 goto out; 2501 2502 setup_pager(); 2503 2504 err = perf_session__process_events(session); 2505 if (err) 2506 pr_err("Failed to process events, error %d", err); 2507 2508 else if (trace->summary) 2509 trace__fprintf_thread_summary(trace, trace->output); 2510 2511 out: 2512 perf_session__delete(session); 2513 2514 return err; 2515 } 2516 2517 static size_t trace__fprintf_threads_header(FILE *fp) 2518 { 2519 size_t printed; 2520 2521 printed = fprintf(fp, "\n Summary of events:\n\n"); 2522 2523 return printed; 2524 } 2525 2526 DEFINE_RESORT_RB(syscall_stats, a->msecs > b->msecs, 2527 struct stats *stats; 2528 double msecs; 2529 int syscall; 2530 ) 2531 { 2532 struct int_node *source = rb_entry(nd, struct int_node, rb_node); 2533 struct stats *stats = source->priv; 2534 2535 entry->syscall = source->i; 2536 entry->stats = stats; 2537 entry->msecs = stats ? (u64)stats->n * (avg_stats(stats) / NSEC_PER_MSEC) : 0; 2538 } 2539 2540 static size_t thread__dump_stats(struct thread_trace *ttrace, 2541 struct trace *trace, FILE *fp) 2542 { 2543 size_t printed = 0; 2544 struct syscall *sc; 2545 struct rb_node *nd; 2546 DECLARE_RESORT_RB_INTLIST(syscall_stats, ttrace->syscall_stats); 2547 2548 if (syscall_stats == NULL) 2549 return 0; 2550 2551 printed += fprintf(fp, "\n"); 2552 2553 printed += fprintf(fp, " syscall calls total min avg max stddev\n"); 2554 printed += fprintf(fp, " (msec) (msec) (msec) (msec) (%%)\n"); 2555 printed += fprintf(fp, " --------------- -------- --------- --------- --------- --------- ------\n"); 2556 2557 resort_rb__for_each_entry(nd, syscall_stats) { 2558 struct stats *stats = syscall_stats_entry->stats; 2559 if (stats) { 2560 double min = (double)(stats->min) / NSEC_PER_MSEC; 2561 double max = (double)(stats->max) / NSEC_PER_MSEC; 2562 double avg = avg_stats(stats); 2563 double pct; 2564 u64 n = (u64) stats->n; 2565 2566 pct = avg ? 100.0 * stddev_stats(stats)/avg : 0.0; 2567 avg /= NSEC_PER_MSEC; 2568 2569 sc = &trace->syscalls.table[syscall_stats_entry->syscall]; 2570 printed += fprintf(fp, " %-15s", sc->name); 2571 printed += fprintf(fp, " %8" PRIu64 " %9.3f %9.3f %9.3f", 2572 n, syscall_stats_entry->msecs, min, avg); 2573 printed += fprintf(fp, " %9.3f %9.2f%%\n", max, pct); 2574 } 2575 } 2576 2577 resort_rb__delete(syscall_stats); 2578 printed += fprintf(fp, "\n\n"); 2579 2580 return printed; 2581 } 2582 2583 static size_t trace__fprintf_thread(FILE *fp, struct thread *thread, struct trace *trace) 2584 { 2585 size_t printed = 0; 2586 struct thread_trace *ttrace = thread__priv(thread); 2587 double ratio; 2588 2589 if (ttrace == NULL) 2590 return 0; 2591 2592 ratio = (double)ttrace->nr_events / trace->nr_events * 100.0; 2593 2594 printed += fprintf(fp, " %s (%d), ", thread__comm_str(thread), thread->tid); 2595 printed += fprintf(fp, "%lu events, ", ttrace->nr_events); 2596 printed += fprintf(fp, "%.1f%%", ratio); 2597 if (ttrace->pfmaj) 2598 printed += fprintf(fp, ", %lu majfaults", ttrace->pfmaj); 2599 if (ttrace->pfmin) 2600 printed += fprintf(fp, ", %lu minfaults", ttrace->pfmin); 2601 if (trace->sched) 2602 printed += fprintf(fp, ", %.3f msec\n", ttrace->runtime_ms); 2603 else if (fputc('\n', fp) != EOF) 2604 ++printed; 2605 2606 printed += thread__dump_stats(ttrace, trace, fp); 2607 2608 return printed; 2609 } 2610 2611 static unsigned long thread__nr_events(struct thread_trace *ttrace) 2612 { 2613 return ttrace ? ttrace->nr_events : 0; 2614 } 2615 2616 DEFINE_RESORT_RB(threads, (thread__nr_events(a->thread->priv) < thread__nr_events(b->thread->priv)), 2617 struct thread *thread; 2618 ) 2619 { 2620 entry->thread = rb_entry(nd, struct thread, rb_node); 2621 } 2622 2623 static size_t trace__fprintf_thread_summary(struct trace *trace, FILE *fp) 2624 { 2625 DECLARE_RESORT_RB_MACHINE_THREADS(threads, trace->host); 2626 size_t printed = trace__fprintf_threads_header(fp); 2627 struct rb_node *nd; 2628 2629 if (threads == NULL) { 2630 fprintf(fp, "%s", "Error sorting output by nr_events!\n"); 2631 return 0; 2632 } 2633 2634 resort_rb__for_each_entry(nd, threads) 2635 printed += trace__fprintf_thread(fp, threads_entry->thread, trace); 2636 2637 resort_rb__delete(threads); 2638 2639 return printed; 2640 } 2641 2642 static int trace__set_duration(const struct option *opt, const char *str, 2643 int unset __maybe_unused) 2644 { 2645 struct trace *trace = opt->value; 2646 2647 trace->duration_filter = atof(str); 2648 return 0; 2649 } 2650 2651 static int trace__set_filter_pids(const struct option *opt, const char *str, 2652 int unset __maybe_unused) 2653 { 2654 int ret = -1; 2655 size_t i; 2656 struct trace *trace = opt->value; 2657 /* 2658 * FIXME: introduce a intarray class, plain parse csv and create a 2659 * { int nr, int entries[] } struct... 2660 */ 2661 struct intlist *list = intlist__new(str); 2662 2663 if (list == NULL) 2664 return -1; 2665 2666 i = trace->filter_pids.nr = intlist__nr_entries(list) + 1; 2667 trace->filter_pids.entries = calloc(i, sizeof(pid_t)); 2668 2669 if (trace->filter_pids.entries == NULL) 2670 goto out; 2671 2672 trace->filter_pids.entries[0] = getpid(); 2673 2674 for (i = 1; i < trace->filter_pids.nr; ++i) 2675 trace->filter_pids.entries[i] = intlist__entry(list, i - 1)->i; 2676 2677 intlist__delete(list); 2678 ret = 0; 2679 out: 2680 return ret; 2681 } 2682 2683 static int trace__open_output(struct trace *trace, const char *filename) 2684 { 2685 struct stat st; 2686 2687 if (!stat(filename, &st) && st.st_size) { 2688 char oldname[PATH_MAX]; 2689 2690 scnprintf(oldname, sizeof(oldname), "%s.old", filename); 2691 unlink(oldname); 2692 rename(filename, oldname); 2693 } 2694 2695 trace->output = fopen(filename, "w"); 2696 2697 return trace->output == NULL ? -errno : 0; 2698 } 2699 2700 static int parse_pagefaults(const struct option *opt, const char *str, 2701 int unset __maybe_unused) 2702 { 2703 int *trace_pgfaults = opt->value; 2704 2705 if (strcmp(str, "all") == 0) 2706 *trace_pgfaults |= TRACE_PFMAJ | TRACE_PFMIN; 2707 else if (strcmp(str, "maj") == 0) 2708 *trace_pgfaults |= TRACE_PFMAJ; 2709 else if (strcmp(str, "min") == 0) 2710 *trace_pgfaults |= TRACE_PFMIN; 2711 else 2712 return -1; 2713 2714 return 0; 2715 } 2716 2717 static void evlist__set_evsel_handler(struct perf_evlist *evlist, void *handler) 2718 { 2719 struct perf_evsel *evsel; 2720 2721 evlist__for_each_entry(evlist, evsel) 2722 evsel->handler = handler; 2723 } 2724 2725 int cmd_trace(int argc, const char **argv, const char *prefix __maybe_unused) 2726 { 2727 const char *trace_usage[] = { 2728 "perf trace [<options>] [<command>]", 2729 "perf trace [<options>] -- <command> [<options>]", 2730 "perf trace record [<options>] [<command>]", 2731 "perf trace record [<options>] -- <command> [<options>]", 2732 NULL 2733 }; 2734 struct trace trace = { 2735 .syscalls = { 2736 . max = -1, 2737 }, 2738 .opts = { 2739 .target = { 2740 .uid = UINT_MAX, 2741 .uses_mmap = true, 2742 }, 2743 .user_freq = UINT_MAX, 2744 .user_interval = ULLONG_MAX, 2745 .no_buffering = true, 2746 .mmap_pages = UINT_MAX, 2747 .proc_map_timeout = 500, 2748 }, 2749 .output = stderr, 2750 .show_comm = true, 2751 .trace_syscalls = true, 2752 .kernel_syscallchains = false, 2753 .max_stack = UINT_MAX, 2754 }; 2755 const char *output_name = NULL; 2756 const char *ev_qualifier_str = NULL; 2757 const struct option trace_options[] = { 2758 OPT_CALLBACK(0, "event", &trace.evlist, "event", 2759 "event selector. use 'perf list' to list available events", 2760 parse_events_option), 2761 OPT_BOOLEAN(0, "comm", &trace.show_comm, 2762 "show the thread COMM next to its id"), 2763 OPT_BOOLEAN(0, "tool_stats", &trace.show_tool_stats, "show tool stats"), 2764 OPT_STRING('e', "expr", &ev_qualifier_str, "expr", "list of syscalls to trace"), 2765 OPT_STRING('o', "output", &output_name, "file", "output file name"), 2766 OPT_STRING('i', "input", &input_name, "file", "Analyze events in file"), 2767 OPT_STRING('p', "pid", &trace.opts.target.pid, "pid", 2768 "trace events on existing process id"), 2769 OPT_STRING('t', "tid", &trace.opts.target.tid, "tid", 2770 "trace events on existing thread id"), 2771 OPT_CALLBACK(0, "filter-pids", &trace, "CSV list of pids", 2772 "pids to filter (by the kernel)", trace__set_filter_pids), 2773 OPT_BOOLEAN('a', "all-cpus", &trace.opts.target.system_wide, 2774 "system-wide collection from all CPUs"), 2775 OPT_STRING('C', "cpu", &trace.opts.target.cpu_list, "cpu", 2776 "list of cpus to monitor"), 2777 OPT_BOOLEAN(0, "no-inherit", &trace.opts.no_inherit, 2778 "child tasks do not inherit counters"), 2779 OPT_CALLBACK('m', "mmap-pages", &trace.opts.mmap_pages, "pages", 2780 "number of mmap data pages", 2781 perf_evlist__parse_mmap_pages), 2782 OPT_STRING('u', "uid", &trace.opts.target.uid_str, "user", 2783 "user to profile"), 2784 OPT_CALLBACK(0, "duration", &trace, "float", 2785 "show only events with duration > N.M ms", 2786 trace__set_duration), 2787 OPT_BOOLEAN(0, "sched", &trace.sched, "show blocking scheduler events"), 2788 OPT_INCR('v', "verbose", &verbose, "be more verbose"), 2789 OPT_BOOLEAN('T', "time", &trace.full_time, 2790 "Show full timestamp, not time relative to first start"), 2791 OPT_BOOLEAN('s', "summary", &trace.summary_only, 2792 "Show only syscall summary with statistics"), 2793 OPT_BOOLEAN('S', "with-summary", &trace.summary, 2794 "Show all syscalls and summary with statistics"), 2795 OPT_CALLBACK_DEFAULT('F', "pf", &trace.trace_pgfaults, "all|maj|min", 2796 "Trace pagefaults", parse_pagefaults, "maj"), 2797 OPT_BOOLEAN(0, "syscalls", &trace.trace_syscalls, "Trace syscalls"), 2798 OPT_BOOLEAN('f', "force", &trace.force, "don't complain, do it"), 2799 OPT_CALLBACK(0, "call-graph", &trace.opts, 2800 "record_mode[,record_size]", record_callchain_help, 2801 &record_parse_callchain_opt), 2802 OPT_BOOLEAN(0, "kernel-syscall-graph", &trace.kernel_syscallchains, 2803 "Show the kernel callchains on the syscall exit path"), 2804 OPT_UINTEGER(0, "min-stack", &trace.min_stack, 2805 "Set the minimum stack depth when parsing the callchain, " 2806 "anything below the specified depth will be ignored."), 2807 OPT_UINTEGER(0, "max-stack", &trace.max_stack, 2808 "Set the maximum stack depth when parsing the callchain, " 2809 "anything beyond the specified depth will be ignored. " 2810 "Default: kernel.perf_event_max_stack or " __stringify(PERF_MAX_STACK_DEPTH)), 2811 OPT_UINTEGER(0, "proc-map-timeout", &trace.opts.proc_map_timeout, 2812 "per thread proc mmap processing timeout in ms"), 2813 OPT_END() 2814 }; 2815 bool __maybe_unused max_stack_user_set = true; 2816 bool mmap_pages_user_set = true; 2817 const char * const trace_subcommands[] = { "record", NULL }; 2818 int err; 2819 char bf[BUFSIZ]; 2820 2821 signal(SIGSEGV, sighandler_dump_stack); 2822 signal(SIGFPE, sighandler_dump_stack); 2823 2824 trace.evlist = perf_evlist__new(); 2825 trace.sctbl = syscalltbl__new(); 2826 2827 if (trace.evlist == NULL || trace.sctbl == NULL) { 2828 pr_err("Not enough memory to run!\n"); 2829 err = -ENOMEM; 2830 goto out; 2831 } 2832 2833 argc = parse_options_subcommand(argc, argv, trace_options, trace_subcommands, 2834 trace_usage, PARSE_OPT_STOP_AT_NON_OPTION); 2835 2836 err = bpf__setup_stdout(trace.evlist); 2837 if (err) { 2838 bpf__strerror_setup_stdout(trace.evlist, err, bf, sizeof(bf)); 2839 pr_err("ERROR: Setup BPF stdout failed: %s\n", bf); 2840 goto out; 2841 } 2842 2843 err = -1; 2844 2845 if (trace.trace_pgfaults) { 2846 trace.opts.sample_address = true; 2847 trace.opts.sample_time = true; 2848 } 2849 2850 if (trace.opts.mmap_pages == UINT_MAX) 2851 mmap_pages_user_set = false; 2852 2853 if (trace.max_stack == UINT_MAX) { 2854 trace.max_stack = input_name ? PERF_MAX_STACK_DEPTH : sysctl_perf_event_max_stack; 2855 max_stack_user_set = false; 2856 } 2857 2858 #ifdef HAVE_DWARF_UNWIND_SUPPORT 2859 if ((trace.min_stack || max_stack_user_set) && !callchain_param.enabled && trace.trace_syscalls) 2860 record_opts__parse_callchain(&trace.opts, &callchain_param, "dwarf", false); 2861 #endif 2862 2863 if (callchain_param.enabled) { 2864 if (!mmap_pages_user_set && geteuid() == 0) 2865 trace.opts.mmap_pages = perf_event_mlock_kb_in_pages() * 4; 2866 2867 symbol_conf.use_callchain = true; 2868 } 2869 2870 if (trace.evlist->nr_entries > 0) 2871 evlist__set_evsel_handler(trace.evlist, trace__event_handler); 2872 2873 if ((argc >= 1) && (strcmp(argv[0], "record") == 0)) 2874 return trace__record(&trace, argc-1, &argv[1]); 2875 2876 /* summary_only implies summary option, but don't overwrite summary if set */ 2877 if (trace.summary_only) 2878 trace.summary = trace.summary_only; 2879 2880 if (!trace.trace_syscalls && !trace.trace_pgfaults && 2881 trace.evlist->nr_entries == 0 /* Was --events used? */) { 2882 pr_err("Please specify something to trace.\n"); 2883 return -1; 2884 } 2885 2886 if (!trace.trace_syscalls && ev_qualifier_str) { 2887 pr_err("The -e option can't be used with --no-syscalls.\n"); 2888 goto out; 2889 } 2890 2891 if (output_name != NULL) { 2892 err = trace__open_output(&trace, output_name); 2893 if (err < 0) { 2894 perror("failed to create output file"); 2895 goto out; 2896 } 2897 } 2898 2899 trace.open_id = syscalltbl__id(trace.sctbl, "open"); 2900 2901 if (ev_qualifier_str != NULL) { 2902 const char *s = ev_qualifier_str; 2903 struct strlist_config slist_config = { 2904 .dirname = system_path(STRACE_GROUPS_DIR), 2905 }; 2906 2907 trace.not_ev_qualifier = *s == '!'; 2908 if (trace.not_ev_qualifier) 2909 ++s; 2910 trace.ev_qualifier = strlist__new(s, &slist_config); 2911 if (trace.ev_qualifier == NULL) { 2912 fputs("Not enough memory to parse event qualifier", 2913 trace.output); 2914 err = -ENOMEM; 2915 goto out_close; 2916 } 2917 2918 err = trace__validate_ev_qualifier(&trace); 2919 if (err) 2920 goto out_close; 2921 } 2922 2923 err = target__validate(&trace.opts.target); 2924 if (err) { 2925 target__strerror(&trace.opts.target, err, bf, sizeof(bf)); 2926 fprintf(trace.output, "%s", bf); 2927 goto out_close; 2928 } 2929 2930 err = target__parse_uid(&trace.opts.target); 2931 if (err) { 2932 target__strerror(&trace.opts.target, err, bf, sizeof(bf)); 2933 fprintf(trace.output, "%s", bf); 2934 goto out_close; 2935 } 2936 2937 if (!argc && target__none(&trace.opts.target)) 2938 trace.opts.target.system_wide = true; 2939 2940 if (input_name) 2941 err = trace__replay(&trace); 2942 else 2943 err = trace__run(&trace, argc, argv); 2944 2945 out_close: 2946 if (output_name != NULL) 2947 fclose(trace.output); 2948 out: 2949 return err; 2950 } 2951