/*
 * builtin-trace.c
 *
 * Builtin 'trace' command:
 *
 * Display a continuously updated trace of any workload, CPU, specific PID,
 * system wide, etc. Default format is loosely strace like, but any other
 * event may be specified using --event.
 *
 * Copyright (C) 2012, 2013, 2014, 2015 Red Hat Inc, Arnaldo Carvalho de Melo <acme@redhat.com>
 *
 * Initially based on the 'trace' prototype by Thomas Gleixner:
 *
 * http://lwn.net/Articles/415728/ ("Announcing a new utility: 'trace'")
 *
 * Released under the GPL v2. (and only v2, not any later version)
 */

#include <traceevent/event-parse.h>
#include <api/fs/tracing_path.h>
#include "builtin.h"
#include "util/cgroup.h"
#include "util/color.h"
#include "util/debug.h"
#include "util/env.h"
#include "util/event.h"
#include "util/evlist.h"
#include <subcmd/exec-cmd.h>
#include "util/machine.h"
#include "util/path.h"
#include "util/session.h"
#include "util/thread.h"
#include <subcmd/parse-options.h>
#include "util/strlist.h"
#include "util/intlist.h"
#include "util/thread_map.h"
#include "util/stat.h"
#include "trace/beauty/beauty.h"
#include "trace-event.h"
#include "util/parse-events.h"
#include "util/bpf-loader.h"
#include "callchain.h"
#include "print_binary.h"
#include "string2.h"
#include "syscalltbl.h"
#include "rb_resort.h"

#include <errno.h>
#include <inttypes.h>
#include <poll.h>
#include <signal.h>
#include <stdlib.h>
#include <string.h>
#include <linux/err.h>
#include <linux/filter.h>
#include <linux/kernel.h>
#include <linux/random.h>
#include <linux/stringify.h>
#include <linux/time64.h>
#include <fcntl.h>

#include "sane_ctype.h"

/* Fallbacks for libc headers that predate these definitions. */
#ifndef O_CLOEXEC
# define O_CLOEXEC		02000000
#endif

#ifndef F_LINUX_SPECIFIC_BASE
# define F_LINUX_SPECIFIC_BASE	1024
#endif

/*
 * Per-session state for 'perf trace': the perf_tool callbacks, the syscall
 * table and per-syscall formatters, the evlist being monitored, and every
 * command line option that shapes the output.
 */
struct trace {
	struct perf_tool	tool;
	struct syscalltbl	*sctbl;		/* syscall number <-> name table */
	struct {
		int		max;		/* highest syscall nr seen */
		struct syscall  *table;		/* indexed by syscall nr */
		struct {	/* the raw_syscalls (or syscalls) tracepoint evsels */
			struct perf_evsel *sys_enter,
					  *sys_exit,
					  *augmented;
		}		events;
	} syscalls;
	struct record_opts	opts;
	struct perf_evlist	*evlist;
	struct machine		*host;
	struct thread		*current;	/* thread of the last event handled */
	struct cgroup		*cgroup;
	u64			base_time;	/* first sample tstamp, for relative timestamps */
	FILE			*output;
	unsigned long		nr_events;
	struct strlist		*ev_qualifier;	/* -e syscall name filter */
	struct {
		size_t		nr;
		int		*entries;	/* ev_qualifier resolved to syscall ids */
	}			ev_qualifier_ids;
	struct {
		size_t		nr;
		pid_t		*entries;	/* pids to filter out (e.g. our own) */
	}			filter_pids;
	double			duration_filter;	/* --duration, in ms */
	double			runtime_ms;
	struct {		/* tool self-stats, printed with --stat */
		u64		vfs_getname,
				proc_getname;
	} stats;
	unsigned int		max_stack;
	unsigned int		min_stack;
	bool			not_ev_qualifier;	/* -e was negated ("!...") */
	bool			live;
	bool			full_time;
	bool			sched;
	bool			multiple_threads;
	bool			summary;
	bool			summary_only;
	bool			failure_only;
	bool			show_comm;
	bool			print_sample;
	bool			show_tool_stats;
	bool			trace_syscalls;
	bool			kernel_syscallchains;
	bool			force;
	bool			vfs_getname;	/* is the vfs_getname probe in place? */
	int			trace_pgfaults;	/* TRACE_PFMAJ/TRACE_PFMIN bits */
};

/*
 * Accessor for one tracepoint payload field: records the field's offset in
 * the raw data and a typed getter (integer or pointer, chosen at init time).
 */
struct tp_field {
	int offset;
	union {
		u64 (*integer)(struct tp_field *field, struct perf_sample *sample);
		void *(*pointer)(struct tp_field *field, struct perf_sample *sample);
	};
};

/* Generate a fixed-width unsigned integer getter (memcpy avoids unaligned loads). */
#define TP_UINT_FIELD(bits) \
static u64 tp_field__u##bits(struct tp_field *field, struct perf_sample *sample) \
{ \
	u##bits value; \
	memcpy(&value, sample->raw_data + field->offset, sizeof(value)); \
	return value; \
}

TP_UINT_FIELD(8);
TP_UINT_FIELD(16);
TP_UINT_FIELD(32);
TP_UINT_FIELD(64);

/* Same as TP_UINT_FIELD, but byte-swapping for cross-endian perf.data files. */
#define TP_UINT_FIELD__SWAPPED(bits) \
static u64 tp_field__swapped_u##bits(struct tp_field *field, struct perf_sample *sample) \
{ \
	u##bits value; \
	memcpy(&value, sample->raw_data + field->offset, sizeof(value)); \
	return bswap_##bits(value);\
}

TP_UINT_FIELD__SWAPPED(16);
TP_UINT_FIELD__SWAPPED(32);
TP_UINT_FIELD__SWAPPED(64);
/*
 * Bind an integer getter matching 'size' (1/2/4/8 bytes) to 'field', picking
 * the byte-swapping variant when the perf.data file has foreign endianness.
 * Returns 0 on success, -1 for an unsupported field size.
 */
static int __tp_field__init_uint(struct tp_field *field, int size, int offset, bool needs_swap)
{
	field->offset = offset;

	switch (size) {
	case 1:
		field->integer = tp_field__u8;
		break;
	case 2:
		field->integer = needs_swap ? tp_field__swapped_u16 : tp_field__u16;
		break;
	case 4:
		field->integer = needs_swap ? tp_field__swapped_u32 : tp_field__u32;
		break;
	case 8:
		field->integer = needs_swap ? tp_field__swapped_u64 : tp_field__u64;
		break;
	default:
		return -1;
	}

	return 0;
}

/* Convenience wrapper: take size/offset from a libtraceevent format_field. */
static int tp_field__init_uint(struct tp_field *field, struct format_field *format_field, bool needs_swap)
{
	return __tp_field__init_uint(field, format_field->size, format_field->offset, needs_swap);
}

/* Pointer getter: returns the address of the field inside the raw payload. */
static void *tp_field__ptr(struct tp_field *field, struct perf_sample *sample)
{
	return sample->raw_data + field->offset;
}

static int __tp_field__init_ptr(struct tp_field *field, int offset)
{
	field->offset = offset;
	field->pointer = tp_field__ptr;
	return 0;
}

static int tp_field__init_ptr(struct tp_field *field, struct format_field *format_field)
{
	return __tp_field__init_ptr(field, format_field->offset);
}

/*
 * Accessors for the sys_enter/sys_exit tracepoints: the syscall id plus
 * either the args array (sys_enter) or the return value (sys_exit).
 */
struct syscall_tp {
	struct tp_field id;
	union {
		struct tp_field args, ret;
	};
};

/* Look up tracepoint field 'name' on 'evsel' and init an integer accessor for it. */
static int perf_evsel__init_tp_uint_field(struct perf_evsel *evsel,
					  struct tp_field *field,
					  const char *name)
{
	struct format_field *format_field = perf_evsel__field(evsel, name);

	if (format_field == NULL)
		return -1;

	return tp_field__init_uint(field, format_field, evsel->needs_swap);
}

#define perf_evsel__init_sc_tp_uint_field(evsel, name) \
	({ struct syscall_tp *sc = evsel->priv;\
	   perf_evsel__init_tp_uint_field(evsel, &sc->name, #name); })

/* Same as above for pointer-typed fields (arrays/strings in the payload). */
static int perf_evsel__init_tp_ptr_field(struct perf_evsel *evsel,
					 struct tp_field *field,
					 const char *name)
{
	struct format_field *format_field = perf_evsel__field(evsel, name);

	if (format_field == NULL)
		return -1;

	return tp_field__init_ptr(field, format_field);
}

#define perf_evsel__init_sc_tp_ptr_field(evsel, name) \
	({ struct syscall_tp *sc = evsel->priv;\
	   perf_evsel__init_tp_ptr_field(evsel, &sc->name, #name); })

/* Free the evsel's private syscall_tp before deleting the evsel itself. */
static void perf_evsel__delete_priv(struct perf_evsel *evsel)
{
	zfree(&evsel->priv);
	perf_evsel__delete(evsel);
}

/*
 * Allocate the evsel's syscall_tp and bind the "__syscall_nr" id field.
 * Returns 0 on success, -ENOMEM on allocation failure, -ENOENT when the
 * tracepoint has no such field.
 */
static int perf_evsel__init_syscall_tp(struct perf_evsel *evsel)
{
	struct syscall_tp *sc = evsel->priv = malloc(sizeof(struct syscall_tp));

	if (evsel->priv != NULL) {
		if (perf_evsel__init_tp_uint_field(evsel, &sc->id, "__syscall_nr"))
			goto out_delete;
		return 0;
	}

	return -ENOMEM;
out_delete:
	zfree(&evsel->priv);
	return -ENOENT;
}

/*
 * For BPF-augmented events: the syscall id sits right after the common
 * fields, at a fixed offset instead of one described by the format file.
 */
static int perf_evsel__init_augmented_syscall_tp(struct perf_evsel *evsel)
{
	struct syscall_tp *sc = evsel->priv = malloc(sizeof(struct syscall_tp));

	if (evsel->priv != NULL) {	/* field, sizeof_field, offsetof_field */
		if (__tp_field__init_uint(&sc->id, sizeof(long), sizeof(long long), evsel->needs_swap))
			goto out_delete;

		return 0;
	}

	return -ENOMEM;
out_delete:
	zfree(&evsel->priv);
	return -EINVAL;
}

/*
 * Bind the args pointer for an augmented event; assumes ->priv was already
 * set up by perf_evsel__init_augmented_syscall_tp() — NOTE(review): no NULL
 * check here, callers must guarantee that ordering.
 */
static int perf_evsel__init_augmented_syscall_tp_args(struct perf_evsel *evsel)
{
	struct syscall_tp *sc = evsel->priv;

	return __tp_field__init_ptr(&sc->args, sc->id.offset + sizeof(u64));
}

/* Allocate ->priv, bind the id field and install the sample handler. */
static int perf_evsel__init_raw_syscall_tp(struct perf_evsel *evsel, void *handler)
{
	evsel->priv = malloc(sizeof(struct syscall_tp));
	if (evsel->priv != NULL) {
		if (perf_evsel__init_sc_tp_uint_field(evsel, id))
			goto out_delete;

		evsel->handler = handler;
		return 0;
	}

	return -ENOMEM;

out_delete:
	zfree(&evsel->priv);
	return -ENOENT;
}

/*
 * Create an evsel for the raw_syscalls:sys_{enter,exit} tracepoint named by
 * 'direction', falling back to the older syscalls: group, and wire up its
 * handler. Returns NULL on any failure.
 */
static struct perf_evsel *perf_evsel__raw_syscall_newtp(const char *direction, void *handler)
{
	struct perf_evsel *evsel = perf_evsel__newtp("raw_syscalls", direction);

	/* older kernel (e.g., RHEL6) use syscalls:{enter,exit} */
	if (IS_ERR(evsel))
		evsel = perf_evsel__newtp("syscalls", direction);

	if (IS_ERR(evsel))
		return NULL;

	if (perf_evsel__init_raw_syscall_tp(evsel, handler))
		goto out_delete;

	return evsel;

out_delete:
	perf_evsel__delete_priv(evsel);
	return NULL;
}

/* Read a named field from a sample via the accessors set up at init time. */
#define perf_evsel__sc_tp_uint(evsel, name, sample) \
	({ struct syscall_tp *fields = evsel->priv; \
	   fields->name.integer(&fields->name, sample); })

#define perf_evsel__sc_tp_ptr(evsel, name, sample) \
	({ struct syscall_tp *fields = evsel->priv; \
	   fields->name.pointer(&fields->name, sample); })

/*
 * Print the symbolic name for 'val' from a string table, falling back to
 * 'intfmt' when the value is out of range or has no entry.
 */
size_t strarray__scnprintf(struct strarray *sa, char *bf, size_t size, const char *intfmt, int val)
{
	int idx = val - sa->offset;

	if (idx < 0 || idx >= sa->nr_entries || sa->entries[idx] == NULL)
		return scnprintf(bf, size, intfmt, val);

	return scnprintf(bf, size, "%s", sa->entries[idx]);
}

static size_t __syscall_arg__scnprintf_strarray(char *bf, size_t size,
						const char *intfmt,
						struct syscall_arg *arg)
{
	return strarray__scnprintf(arg->parm, bf, size, intfmt, arg->val);
}

static size_t syscall_arg__scnprintf_strarray(char *bf, size_t size,
					      struct syscall_arg *arg)
{
	return __syscall_arg__scnprintf_strarray(bf, size, "%d", arg);
}

#define SCA_STRARRAY syscall_arg__scnprintf_strarray

/* A set of string tables tried in order (e.g. fcntl base + linux-specific cmds). */
struct strarrays {
	int		nr_entries;
	struct strarray **entries;
};

#define DEFINE_STRARRAYS(array) struct strarrays strarrays__##array = { \
	.nr_entries = ARRAY_SIZE(array), \
	.entries = array, \
}

/* Like SCA_STRARRAY, but searching several tables; numeric fallback if none match. */
size_t syscall_arg__scnprintf_strarrays(char *bf, size_t size,
					struct syscall_arg *arg)
{
	struct strarrays *sas = arg->parm;
	int i;

	for (i = 0; i < sas->nr_entries; ++i) {
		struct strarray *sa = sas->entries[i];
		int idx = arg->val - sa->offset;

		if (idx >= 0 && idx < sa->nr_entries) {
			if (sa->entries[idx] == NULL)
				break;
			return scnprintf(bf, size, "%s", sa->entries[idx]);
		}
	}

	return scnprintf(bf, size, "%d", arg->val);
}

#ifndef AT_FDCWD
#define AT_FDCWD	-100
#endif

/* Print dirfd arguments: the AT_FDCWD sentinel becomes "CWD", else a plain fd. */
static size_t syscall_arg__scnprintf_fd_at(char *bf, size_t size,
					   struct syscall_arg *arg)
{
	int fd = arg->val;

	if (fd == AT_FDCWD)
		return scnprintf(bf, size, "CWD");

	return syscall_arg__scnprintf_fd(bf, size, arg);
}

#define SCA_FDAT syscall_arg__scnprintf_fd_at

static size_t syscall_arg__scnprintf_close_fd(char *bf, size_t size,
					      struct syscall_arg *arg);

#define SCA_CLOSE_FD syscall_arg__scnprintf_close_fd

size_t syscall_arg__scnprintf_hex(char *bf, size_t size, struct syscall_arg *arg)
{
	return scnprintf(bf, size, "%#lx", arg->val);
}

size_t syscall_arg__scnprintf_int(char *bf, size_t size, struct syscall_arg *arg)
{
	return scnprintf(bf, size, "%d", arg->val);
}

size_t syscall_arg__scnprintf_long(char *bf, size_t size, struct syscall_arg *arg)
{
	return scnprintf(bf, size, "%ld", arg->val);
}

/*
 * String tables for enum-like syscall arguments. Entry order mirrors the
 * kernel ABI values — do not reorder.
 */
static const char *bpf_cmd[] = {
	"MAP_CREATE", "MAP_LOOKUP_ELEM", "MAP_UPDATE_ELEM", "MAP_DELETE_ELEM",
	"MAP_GET_NEXT_KEY", "PROG_LOAD",
};
static DEFINE_STRARRAY(bpf_cmd);

static const char *epoll_ctl_ops[] = { "ADD", "DEL", "MOD", };
static DEFINE_STRARRAY_OFFSET(epoll_ctl_ops, 1);

static const char *itimers[] = { "REAL", "VIRTUAL", "PROF", };
static DEFINE_STRARRAY(itimers);

static const char *keyctl_options[] = {
	"GET_KEYRING_ID", "JOIN_SESSION_KEYRING", "UPDATE", "REVOKE", "CHOWN",
	"SETPERM", "DESCRIBE", "CLEAR", "LINK", "UNLINK", "SEARCH", "READ",
	"INSTANTIATE", "NEGATE", "SET_REQKEY_KEYRING", "SET_TIMEOUT",
	"ASSUME_AUTHORITY", "GET_SECURITY", "SESSION_TO_PARENT", "REJECT",
	"INSTANTIATE_IOV", "INVALIDATE", "GET_PERSISTENT",
};
static DEFINE_STRARRAY(keyctl_options);

static const char *whences[] = { "SET", "CUR", "END",
#ifdef SEEK_DATA
"DATA",
#endif
#ifdef SEEK_HOLE
"HOLE",
#endif
};
static DEFINE_STRARRAY(whences);

static const char *fcntl_cmds[] = {
	"DUPFD", "GETFD", "SETFD", "GETFL", "SETFL", "GETLK", "SETLK",
	"SETLKW", "SETOWN", "GETOWN", "SETSIG", "GETSIG", "GETLK64",
	"SETLK64", "SETLKW64", "SETOWN_EX", "GETOWN_EX",
	"GETOWNER_UIDS",
};
static DEFINE_STRARRAY(fcntl_cmds);

static const char *fcntl_linux_specific_cmds[] = {
	"SETLEASE", "GETLEASE", "NOTIFY", [5] =	"CANCELLK", "DUPFD_CLOEXEC",
	"SETPIPE_SZ", "GETPIPE_SZ", "ADD_SEALS", "GET_SEALS",
	"GET_RW_HINT", "SET_RW_HINT", "GET_FILE_RW_HINT", "SET_FILE_RW_HINT",
};

static DEFINE_STRARRAY_OFFSET(fcntl_linux_specific_cmds, F_LINUX_SPECIFIC_BASE);

static struct strarray *fcntl_cmds_arrays[] = {
	&strarray__fcntl_cmds,
	&strarray__fcntl_linux_specific_cmds,
};

static DEFINE_STRARRAYS(fcntl_cmds_arrays);

static const char *rlimit_resources[] = {
	"CPU", "FSIZE", "DATA", "STACK", "CORE", "RSS", "NPROC", "NOFILE",
	"MEMLOCK", "AS", "LOCKS", "SIGPENDING", "MSGQUEUE", "NICE", "RTPRIO",
	"RTTIME",
};
static DEFINE_STRARRAY(rlimit_resources);

static const char *sighow[] = { "BLOCK", "UNBLOCK", "SETMASK", };
static DEFINE_STRARRAY(sighow);

static const char *clockid[] = {
	"REALTIME", "MONOTONIC", "PROCESS_CPUTIME_ID", "THREAD_CPUTIME_ID",
	"MONOTONIC_RAW", "REALTIME_COARSE", "MONOTONIC_COARSE", "BOOTTIME",
	"REALTIME_ALARM", "BOOTTIME_ALARM", "SGI_CYCLE", "TAI"
};
static DEFINE_STRARRAY(clockid);

static const char *socket_families[] = {
	"UNSPEC", "LOCAL", "INET", "AX25", "IPX", "APPLETALK", "NETROM",
	"BRIDGE", "ATMPVC", "X25", "INET6", "ROSE", "DECnet", "NETBEUI",
	"SECURITY", "KEY", "NETLINK", "PACKET", "ASH", "ECONET", "ATMSVC",
	"RDS", "SNA", "IRDA", "PPPOX", "WANPIPE", "LLC", "IB", "CAN", "TIPC",
	"BLUETOOTH", "IUCV", "RXRPC", "ISDN", "PHONET", "IEEE802154", "CAIF",
	"ALG", "NFC", "VSOCK",
};
static DEFINE_STRARRAY(socket_families);

/* access(2) mode: "F" for F_OK, else a "|"-joined subset of R/W/X. */
static size_t syscall_arg__scnprintf_access_mode(char *bf, size_t size,
						 struct syscall_arg *arg)
{
	size_t printed = 0;
	int mode = arg->val;

	if (mode == F_OK) /* 0 */
		return scnprintf(bf, size, "F");
#define	P_MODE(n) \
	if (mode & n##_OK) { \
		printed += scnprintf(bf + printed, size - printed, "%s", #n); \
		mode &= ~n##_OK; \
	}

	P_MODE(R);
	P_MODE(W);
	P_MODE(X);
#undef P_MODE

	if (mode)
		printed += scnprintf(bf + printed, size - printed, "|%#x", mode);

	return printed;
}

#define SCA_ACCMODE syscall_arg__scnprintf_access_mode

static size_t syscall_arg__scnprintf_filename(char *bf, size_t size,
					      struct syscall_arg *arg);

#define SCA_FILENAME syscall_arg__scnprintf_filename

/* pipe2(2) flags: decode O_CLOEXEC/O_NONBLOCK, print any residue in hex. */
static size_t syscall_arg__scnprintf_pipe_flags(char *bf, size_t size,
						struct syscall_arg *arg)
{
	int printed = 0, flags = arg->val;

#define	P_FLAG(n) \
	if (flags & O_##n) { \
		printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
		flags &= ~O_##n; \
	}

	P_FLAG(CLOEXEC);
	P_FLAG(NONBLOCK);
#undef P_FLAG

	if (flags)
		printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ?
"|" : "", flags); 560 561 return printed; 562 } 563 564 #define SCA_PIPE_FLAGS syscall_arg__scnprintf_pipe_flags 565 566 #ifndef GRND_NONBLOCK 567 #define GRND_NONBLOCK 0x0001 568 #endif 569 #ifndef GRND_RANDOM 570 #define GRND_RANDOM 0x0002 571 #endif 572 573 static size_t syscall_arg__scnprintf_getrandom_flags(char *bf, size_t size, 574 struct syscall_arg *arg) 575 { 576 int printed = 0, flags = arg->val; 577 578 #define P_FLAG(n) \ 579 if (flags & GRND_##n) { \ 580 printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \ 581 flags &= ~GRND_##n; \ 582 } 583 584 P_FLAG(RANDOM); 585 P_FLAG(NONBLOCK); 586 #undef P_FLAG 587 588 if (flags) 589 printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags); 590 591 return printed; 592 } 593 594 #define SCA_GETRANDOM_FLAGS syscall_arg__scnprintf_getrandom_flags 595 596 #define STRARRAY(name, array) \ 597 { .scnprintf = SCA_STRARRAY, \ 598 .parm = &strarray__##array, } 599 600 #include "trace/beauty/arch_errno_names.c" 601 #include "trace/beauty/eventfd.c" 602 #include "trace/beauty/futex_op.c" 603 #include "trace/beauty/futex_val3.c" 604 #include "trace/beauty/mmap.c" 605 #include "trace/beauty/mode_t.c" 606 #include "trace/beauty/msg_flags.c" 607 #include "trace/beauty/open_flags.c" 608 #include "trace/beauty/perf_event_open.c" 609 #include "trace/beauty/pid.c" 610 #include "trace/beauty/sched_policy.c" 611 #include "trace/beauty/seccomp.c" 612 #include "trace/beauty/signum.c" 613 #include "trace/beauty/socket_type.c" 614 #include "trace/beauty/waitid_options.c" 615 616 struct syscall_arg_fmt { 617 size_t (*scnprintf)(char *bf, size_t size, struct syscall_arg *arg); 618 void *parm; 619 const char *name; 620 bool show_zero; 621 }; 622 623 static struct syscall_fmt { 624 const char *name; 625 const char *alias; 626 struct syscall_arg_fmt arg[6]; 627 u8 nr_args; 628 bool errpid; 629 bool timeout; 630 bool hexret; 631 } syscall_fmts[] = { 632 { .name = "access", 633 .arg 
= { [1] = { .scnprintf = SCA_ACCMODE, /* mode */ }, }, }, 634 { .name = "bpf", 635 .arg = { [0] = STRARRAY(cmd, bpf_cmd), }, }, 636 { .name = "brk", .hexret = true, 637 .arg = { [0] = { .scnprintf = SCA_HEX, /* brk */ }, }, }, 638 { .name = "clock_gettime", 639 .arg = { [0] = STRARRAY(clk_id, clockid), }, }, 640 { .name = "clone", .errpid = true, .nr_args = 5, 641 .arg = { [0] = { .name = "flags", .scnprintf = SCA_CLONE_FLAGS, }, 642 [1] = { .name = "child_stack", .scnprintf = SCA_HEX, }, 643 [2] = { .name = "parent_tidptr", .scnprintf = SCA_HEX, }, 644 [3] = { .name = "child_tidptr", .scnprintf = SCA_HEX, }, 645 [4] = { .name = "tls", .scnprintf = SCA_HEX, }, }, }, 646 { .name = "close", 647 .arg = { [0] = { .scnprintf = SCA_CLOSE_FD, /* fd */ }, }, }, 648 { .name = "epoll_ctl", 649 .arg = { [1] = STRARRAY(op, epoll_ctl_ops), }, }, 650 { .name = "eventfd2", 651 .arg = { [1] = { .scnprintf = SCA_EFD_FLAGS, /* flags */ }, }, }, 652 { .name = "fchmodat", 653 .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, }, 654 { .name = "fchownat", 655 .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, }, 656 { .name = "fcntl", 657 .arg = { [1] = { .scnprintf = SCA_FCNTL_CMD, /* cmd */ 658 .parm = &strarrays__fcntl_cmds_arrays, 659 .show_zero = true, }, 660 [2] = { .scnprintf = SCA_FCNTL_ARG, /* arg */ }, }, }, 661 { .name = "flock", 662 .arg = { [1] = { .scnprintf = SCA_FLOCK, /* cmd */ }, }, }, 663 { .name = "fstat", .alias = "newfstat", }, 664 { .name = "fstatat", .alias = "newfstatat", }, 665 { .name = "futex", 666 .arg = { [1] = { .scnprintf = SCA_FUTEX_OP, /* op */ }, 667 [5] = { .scnprintf = SCA_FUTEX_VAL3, /* val3 */ }, }, }, 668 { .name = "futimesat", 669 .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, }, 670 { .name = "getitimer", 671 .arg = { [0] = STRARRAY(which, itimers), }, }, 672 { .name = "getpid", .errpid = true, }, 673 { .name = "getpgid", .errpid = true, }, 674 { .name = "getppid", .errpid = true, }, 675 { .name = "getrandom", 676 .arg = { [2] = 
{ .scnprintf = SCA_GETRANDOM_FLAGS, /* flags */ }, }, }, 677 { .name = "getrlimit", 678 .arg = { [0] = STRARRAY(resource, rlimit_resources), }, }, 679 { .name = "gettid", .errpid = true, }, 680 { .name = "ioctl", 681 .arg = { 682 #if defined(__i386__) || defined(__x86_64__) 683 /* 684 * FIXME: Make this available to all arches. 685 */ 686 [1] = { .scnprintf = SCA_IOCTL_CMD, /* cmd */ }, 687 [2] = { .scnprintf = SCA_HEX, /* arg */ }, }, }, 688 #else 689 [2] = { .scnprintf = SCA_HEX, /* arg */ }, }, }, 690 #endif 691 { .name = "kcmp", .nr_args = 5, 692 .arg = { [0] = { .name = "pid1", .scnprintf = SCA_PID, }, 693 [1] = { .name = "pid2", .scnprintf = SCA_PID, }, 694 [2] = { .name = "type", .scnprintf = SCA_KCMP_TYPE, }, 695 [3] = { .name = "idx1", .scnprintf = SCA_KCMP_IDX, }, 696 [4] = { .name = "idx2", .scnprintf = SCA_KCMP_IDX, }, }, }, 697 { .name = "keyctl", 698 .arg = { [0] = STRARRAY(option, keyctl_options), }, }, 699 { .name = "kill", 700 .arg = { [1] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, }, 701 { .name = "linkat", 702 .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, }, 703 { .name = "lseek", 704 .arg = { [2] = STRARRAY(whence, whences), }, }, 705 { .name = "lstat", .alias = "newlstat", }, 706 { .name = "madvise", 707 .arg = { [0] = { .scnprintf = SCA_HEX, /* start */ }, 708 [2] = { .scnprintf = SCA_MADV_BHV, /* behavior */ }, }, }, 709 { .name = "mkdirat", 710 .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, }, 711 { .name = "mknodat", 712 .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, }, 713 { .name = "mlock", 714 .arg = { [0] = { .scnprintf = SCA_HEX, /* addr */ }, }, }, 715 { .name = "mlockall", 716 .arg = { [0] = { .scnprintf = SCA_HEX, /* addr */ }, }, }, 717 { .name = "mmap", .hexret = true, 718 /* The standard mmap maps to old_mmap on s390x */ 719 #if defined(__s390x__) 720 .alias = "old_mmap", 721 #endif 722 .arg = { [0] = { .scnprintf = SCA_HEX, /* addr */ }, 723 [2] = { .scnprintf = SCA_MMAP_PROT, /* prot */ }, 724 [3] = { 
.scnprintf = SCA_MMAP_FLAGS, /* flags */ }, }, }, 725 { .name = "mprotect", 726 .arg = { [0] = { .scnprintf = SCA_HEX, /* start */ }, 727 [2] = { .scnprintf = SCA_MMAP_PROT, /* prot */ }, }, }, 728 { .name = "mq_unlink", 729 .arg = { [0] = { .scnprintf = SCA_FILENAME, /* u_name */ }, }, }, 730 { .name = "mremap", .hexret = true, 731 .arg = { [0] = { .scnprintf = SCA_HEX, /* addr */ }, 732 [3] = { .scnprintf = SCA_MREMAP_FLAGS, /* flags */ }, 733 [4] = { .scnprintf = SCA_HEX, /* new_addr */ }, }, }, 734 { .name = "munlock", 735 .arg = { [0] = { .scnprintf = SCA_HEX, /* addr */ }, }, }, 736 { .name = "munmap", 737 .arg = { [0] = { .scnprintf = SCA_HEX, /* addr */ }, }, }, 738 { .name = "name_to_handle_at", 739 .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, }, 740 { .name = "newfstatat", 741 .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, }, 742 { .name = "open", 743 .arg = { [1] = { .scnprintf = SCA_OPEN_FLAGS, /* flags */ }, }, }, 744 { .name = "open_by_handle_at", 745 .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, 746 [2] = { .scnprintf = SCA_OPEN_FLAGS, /* flags */ }, }, }, 747 { .name = "openat", 748 .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, 749 [2] = { .scnprintf = SCA_OPEN_FLAGS, /* flags */ }, }, }, 750 { .name = "perf_event_open", 751 .arg = { [2] = { .scnprintf = SCA_INT, /* cpu */ }, 752 [3] = { .scnprintf = SCA_FD, /* group_fd */ }, 753 [4] = { .scnprintf = SCA_PERF_FLAGS, /* flags */ }, }, }, 754 { .name = "pipe2", 755 .arg = { [1] = { .scnprintf = SCA_PIPE_FLAGS, /* flags */ }, }, }, 756 { .name = "pkey_alloc", 757 .arg = { [1] = { .scnprintf = SCA_PKEY_ALLOC_ACCESS_RIGHTS, /* access_rights */ }, }, }, 758 { .name = "pkey_free", 759 .arg = { [0] = { .scnprintf = SCA_INT, /* key */ }, }, }, 760 { .name = "pkey_mprotect", 761 .arg = { [0] = { .scnprintf = SCA_HEX, /* start */ }, 762 [2] = { .scnprintf = SCA_MMAP_PROT, /* prot */ }, 763 [3] = { .scnprintf = SCA_INT, /* pkey */ }, }, }, 764 { .name = "poll", .timeout = true, 
}, 765 { .name = "ppoll", .timeout = true, }, 766 { .name = "prctl", .alias = "arch_prctl", 767 .arg = { [0] = { .scnprintf = SCA_PRCTL_OPTION, /* option */ }, 768 [1] = { .scnprintf = SCA_PRCTL_ARG2, /* arg2 */ }, 769 [2] = { .scnprintf = SCA_PRCTL_ARG3, /* arg3 */ }, }, }, 770 { .name = "pread", .alias = "pread64", }, 771 { .name = "preadv", .alias = "pread", }, 772 { .name = "prlimit64", 773 .arg = { [1] = STRARRAY(resource, rlimit_resources), }, }, 774 { .name = "pwrite", .alias = "pwrite64", }, 775 { .name = "readlinkat", 776 .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, }, 777 { .name = "recvfrom", 778 .arg = { [3] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, }, 779 { .name = "recvmmsg", 780 .arg = { [3] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, }, 781 { .name = "recvmsg", 782 .arg = { [2] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, }, 783 { .name = "renameat", 784 .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, }, 785 { .name = "rt_sigaction", 786 .arg = { [0] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, }, 787 { .name = "rt_sigprocmask", 788 .arg = { [0] = STRARRAY(how, sighow), }, }, 789 { .name = "rt_sigqueueinfo", 790 .arg = { [1] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, }, 791 { .name = "rt_tgsigqueueinfo", 792 .arg = { [2] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, }, 793 { .name = "sched_setscheduler", 794 .arg = { [1] = { .scnprintf = SCA_SCHED_POLICY, /* policy */ }, }, }, 795 { .name = "seccomp", 796 .arg = { [0] = { .scnprintf = SCA_SECCOMP_OP, /* op */ }, 797 [1] = { .scnprintf = SCA_SECCOMP_FLAGS, /* flags */ }, }, }, 798 { .name = "select", .timeout = true, }, 799 { .name = "sendmmsg", 800 .arg = { [3] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, }, 801 { .name = "sendmsg", 802 .arg = { [2] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, }, 803 { .name = "sendto", 804 .arg = { [3] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, }, 805 { .name = "set_tid_address", .errpid = true, }, 806 { .name 
= "setitimer", 807 .arg = { [0] = STRARRAY(which, itimers), }, }, 808 { .name = "setrlimit", 809 .arg = { [0] = STRARRAY(resource, rlimit_resources), }, }, 810 { .name = "socket", 811 .arg = { [0] = STRARRAY(family, socket_families), 812 [1] = { .scnprintf = SCA_SK_TYPE, /* type */ }, 813 [2] = { .scnprintf = SCA_SK_PROTO, /* protocol */ }, }, }, 814 { .name = "socketpair", 815 .arg = { [0] = STRARRAY(family, socket_families), 816 [1] = { .scnprintf = SCA_SK_TYPE, /* type */ }, 817 [2] = { .scnprintf = SCA_SK_PROTO, /* protocol */ }, }, }, 818 { .name = "stat", .alias = "newstat", }, 819 { .name = "statx", 820 .arg = { [0] = { .scnprintf = SCA_FDAT, /* fdat */ }, 821 [2] = { .scnprintf = SCA_STATX_FLAGS, /* flags */ } , 822 [3] = { .scnprintf = SCA_STATX_MASK, /* mask */ }, }, }, 823 { .name = "swapoff", 824 .arg = { [0] = { .scnprintf = SCA_FILENAME, /* specialfile */ }, }, }, 825 { .name = "swapon", 826 .arg = { [0] = { .scnprintf = SCA_FILENAME, /* specialfile */ }, }, }, 827 { .name = "symlinkat", 828 .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, }, 829 { .name = "tgkill", 830 .arg = { [2] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, }, 831 { .name = "tkill", 832 .arg = { [1] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, }, 833 { .name = "uname", .alias = "newuname", }, 834 { .name = "unlinkat", 835 .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, }, 836 { .name = "utimensat", 837 .arg = { [0] = { .scnprintf = SCA_FDAT, /* dirfd */ }, }, }, 838 { .name = "wait4", .errpid = true, 839 .arg = { [2] = { .scnprintf = SCA_WAITID_OPTIONS, /* options */ }, }, }, 840 { .name = "waitid", .errpid = true, 841 .arg = { [3] = { .scnprintf = SCA_WAITID_OPTIONS, /* options */ }, }, }, 842 }; 843 844 static int syscall_fmt__cmp(const void *name, const void *fmtp) 845 { 846 const struct syscall_fmt *fmt = fmtp; 847 return strcmp(name, fmt->name); 848 } 849 850 static struct syscall_fmt *syscall_fmt__find(const char *name) 851 { 852 const int nmemb = 
ARRAY_SIZE(syscall_fmts); 853 return bsearch(name, syscall_fmts, nmemb, sizeof(struct syscall_fmt), syscall_fmt__cmp); 854 } 855 856 /* 857 * is_exit: is this "exit" or "exit_group"? 858 * is_open: is this "open" or "openat"? To associate the fd returned in sys_exit with the pathname in sys_enter. 859 */ 860 struct syscall { 861 struct event_format *tp_format; 862 int nr_args; 863 bool is_exit; 864 bool is_open; 865 struct format_field *args; 866 const char *name; 867 struct syscall_fmt *fmt; 868 struct syscall_arg_fmt *arg_fmt; 869 }; 870 871 /* 872 * We need to have this 'calculated' boolean because in some cases we really 873 * don't know what is the duration of a syscall, for instance, when we start 874 * a session and some threads are waiting for a syscall to finish, say 'poll', 875 * in which case all we can do is to print "( ? ) for duration and for the 876 * start timestamp. 877 */ 878 static size_t fprintf_duration(unsigned long t, bool calculated, FILE *fp) 879 { 880 double duration = (double)t / NSEC_PER_MSEC; 881 size_t printed = fprintf(fp, "("); 882 883 if (!calculated) 884 printed += fprintf(fp, " "); 885 else if (duration >= 1.0) 886 printed += color_fprintf(fp, PERF_COLOR_RED, "%6.3f ms", duration); 887 else if (duration >= 0.01) 888 printed += color_fprintf(fp, PERF_COLOR_YELLOW, "%6.3f ms", duration); 889 else 890 printed += color_fprintf(fp, PERF_COLOR_NORMAL, "%6.3f ms", duration); 891 return printed + fprintf(fp, "): "); 892 } 893 894 /** 895 * filename.ptr: The filename char pointer that will be vfs_getname'd 896 * filename.entry_str_pos: Where to insert the string translated from 897 * filename.ptr by the vfs_getname tracepoint/kprobe. 898 * ret_scnprintf: syscall args may set this to a different syscall return 899 * formatter, for instance, fcntl may return fds, file flags, etc. 
900 */ 901 struct thread_trace { 902 u64 entry_time; 903 bool entry_pending; 904 unsigned long nr_events; 905 unsigned long pfmaj, pfmin; 906 char *entry_str; 907 double runtime_ms; 908 size_t (*ret_scnprintf)(char *bf, size_t size, struct syscall_arg *arg); 909 struct { 910 unsigned long ptr; 911 short int entry_str_pos; 912 bool pending_open; 913 unsigned int namelen; 914 char *name; 915 } filename; 916 struct { 917 int max; 918 char **table; 919 } paths; 920 921 struct intlist *syscall_stats; 922 }; 923 924 static struct thread_trace *thread_trace__new(void) 925 { 926 struct thread_trace *ttrace = zalloc(sizeof(struct thread_trace)); 927 928 if (ttrace) 929 ttrace->paths.max = -1; 930 931 ttrace->syscall_stats = intlist__new(NULL); 932 933 return ttrace; 934 } 935 936 static struct thread_trace *thread__trace(struct thread *thread, FILE *fp) 937 { 938 struct thread_trace *ttrace; 939 940 if (thread == NULL) 941 goto fail; 942 943 if (thread__priv(thread) == NULL) 944 thread__set_priv(thread, thread_trace__new()); 945 946 if (thread__priv(thread) == NULL) 947 goto fail; 948 949 ttrace = thread__priv(thread); 950 ++ttrace->nr_events; 951 952 return ttrace; 953 fail: 954 color_fprintf(fp, PERF_COLOR_RED, 955 "WARNING: not enough memory, dropping samples!\n"); 956 return NULL; 957 } 958 959 960 void syscall_arg__set_ret_scnprintf(struct syscall_arg *arg, 961 size_t (*ret_scnprintf)(char *bf, size_t size, struct syscall_arg *arg)) 962 { 963 struct thread_trace *ttrace = thread__priv(arg->thread); 964 965 ttrace->ret_scnprintf = ret_scnprintf; 966 } 967 968 #define TRACE_PFMAJ (1 << 0) 969 #define TRACE_PFMIN (1 << 1) 970 971 static const size_t trace__entry_str_size = 2048; 972 973 static int trace__set_fd_pathname(struct thread *thread, int fd, const char *pathname) 974 { 975 struct thread_trace *ttrace = thread__priv(thread); 976 977 if (fd > ttrace->paths.max) { 978 char **npath = realloc(ttrace->paths.table, (fd + 1) * sizeof(char *)); 979 980 if (npath == NULL) 
981 return -1; 982 983 if (ttrace->paths.max != -1) { 984 memset(npath + ttrace->paths.max + 1, 0, 985 (fd - ttrace->paths.max) * sizeof(char *)); 986 } else { 987 memset(npath, 0, (fd + 1) * sizeof(char *)); 988 } 989 990 ttrace->paths.table = npath; 991 ttrace->paths.max = fd; 992 } 993 994 ttrace->paths.table[fd] = strdup(pathname); 995 996 return ttrace->paths.table[fd] != NULL ? 0 : -1; 997 } 998 999 static int thread__read_fd_path(struct thread *thread, int fd) 1000 { 1001 char linkname[PATH_MAX], pathname[PATH_MAX]; 1002 struct stat st; 1003 int ret; 1004 1005 if (thread->pid_ == thread->tid) { 1006 scnprintf(linkname, sizeof(linkname), 1007 "/proc/%d/fd/%d", thread->pid_, fd); 1008 } else { 1009 scnprintf(linkname, sizeof(linkname), 1010 "/proc/%d/task/%d/fd/%d", thread->pid_, thread->tid, fd); 1011 } 1012 1013 if (lstat(linkname, &st) < 0 || st.st_size + 1 > (off_t)sizeof(pathname)) 1014 return -1; 1015 1016 ret = readlink(linkname, pathname, sizeof(pathname)); 1017 1018 if (ret < 0 || ret > st.st_size) 1019 return -1; 1020 1021 pathname[ret] = '\0'; 1022 return trace__set_fd_pathname(thread, fd, pathname); 1023 } 1024 1025 static const char *thread__fd_path(struct thread *thread, int fd, 1026 struct trace *trace) 1027 { 1028 struct thread_trace *ttrace = thread__priv(thread); 1029 1030 if (ttrace == NULL) 1031 return NULL; 1032 1033 if (fd < 0) 1034 return NULL; 1035 1036 if ((fd > ttrace->paths.max || ttrace->paths.table[fd] == NULL)) { 1037 if (!trace->live) 1038 return NULL; 1039 ++trace->stats.proc_getname; 1040 if (thread__read_fd_path(thread, fd)) 1041 return NULL; 1042 } 1043 1044 return ttrace->paths.table[fd]; 1045 } 1046 1047 size_t syscall_arg__scnprintf_fd(char *bf, size_t size, struct syscall_arg *arg) 1048 { 1049 int fd = arg->val; 1050 size_t printed = scnprintf(bf, size, "%d", fd); 1051 const char *path = thread__fd_path(arg->thread, fd, arg->trace); 1052 1053 if (path) 1054 printed += scnprintf(bf + printed, size - printed, "<%s>", path); 
1055 1056 return printed; 1057 } 1058 1059 size_t pid__scnprintf_fd(struct trace *trace, pid_t pid, int fd, char *bf, size_t size) 1060 { 1061 size_t printed = scnprintf(bf, size, "%d", fd); 1062 struct thread *thread = machine__find_thread(trace->host, pid, pid); 1063 1064 if (thread) { 1065 const char *path = thread__fd_path(thread, fd, trace); 1066 1067 if (path) 1068 printed += scnprintf(bf + printed, size - printed, "<%s>", path); 1069 1070 thread__put(thread); 1071 } 1072 1073 return printed; 1074 } 1075 1076 static size_t syscall_arg__scnprintf_close_fd(char *bf, size_t size, 1077 struct syscall_arg *arg) 1078 { 1079 int fd = arg->val; 1080 size_t printed = syscall_arg__scnprintf_fd(bf, size, arg); 1081 struct thread_trace *ttrace = thread__priv(arg->thread); 1082 1083 if (ttrace && fd >= 0 && fd <= ttrace->paths.max) 1084 zfree(&ttrace->paths.table[fd]); 1085 1086 return printed; 1087 } 1088 1089 static void thread__set_filename_pos(struct thread *thread, const char *bf, 1090 unsigned long ptr) 1091 { 1092 struct thread_trace *ttrace = thread__priv(thread); 1093 1094 ttrace->filename.ptr = ptr; 1095 ttrace->filename.entry_str_pos = bf - ttrace->entry_str; 1096 } 1097 1098 static size_t syscall_arg__scnprintf_filename(char *bf, size_t size, 1099 struct syscall_arg *arg) 1100 { 1101 unsigned long ptr = arg->val; 1102 1103 if (!arg->trace->vfs_getname) 1104 return scnprintf(bf, size, "%#x", ptr); 1105 1106 thread__set_filename_pos(arg->thread, bf, ptr); 1107 return 0; 1108 } 1109 1110 static bool trace__filter_duration(struct trace *trace, double t) 1111 { 1112 return t < (trace->duration_filter * NSEC_PER_MSEC); 1113 } 1114 1115 static size_t __trace__fprintf_tstamp(struct trace *trace, u64 tstamp, FILE *fp) 1116 { 1117 double ts = (double)(tstamp - trace->base_time) / NSEC_PER_MSEC; 1118 1119 return fprintf(fp, "%10.3f ", ts); 1120 } 1121 1122 /* 1123 * We're handling tstamp=0 as an undefined tstamp, i.e. 
 * like when we are
 * using ttrace->entry_time for a thread that receives a sys_exit without
 * first having received a sys_enter ("poll" issued before tracing session
 * starts, lost sys_enter exit due to ring buffer overflow).
 */
static size_t trace__fprintf_tstamp(struct trace *trace, u64 tstamp, FILE *fp)
{
	if (tstamp > 0)
		return __trace__fprintf_tstamp(trace, tstamp, fp);

	return fprintf(fp, " ? ");
}

/* set by sig_handler(); polled by the main event loop */
static bool done = false;
static bool interrupted = false;

/*
 * Signal handler: only raises flags.
 * NOTE(review): plain bool rather than volatile sig_atomic_t — works in
 * practice on the supported targets, but worth confirming.
 */
static void sig_handler(int sig)
{
	done = true;
	interrupted = sig == SIGINT;
}

/*
 * Print the common line prefix: timestamp, duration and — when tracing
 * more than one thread — the comm/tid the event belongs to.
 */
static size_t trace__fprintf_entry_head(struct trace *trace, struct thread *thread,
					u64 duration, bool duration_calculated, u64 tstamp, FILE *fp)
{
	size_t printed = trace__fprintf_tstamp(trace, tstamp, fp);
	printed += fprintf_duration(duration, duration_calculated, fp);

	if (trace->multiple_threads) {
		if (trace->show_comm)
			printed += fprintf(fp, "%.14s/", thread__comm_str(thread));
		printed += fprintf(fp, "%d ", thread->tid);
	}

	return printed;
}

/*
 * Handle non-sample (side-band) events: complain loudly about lost events,
 * delegate everything to the machine layer for state bookkeeping.
 */
static int trace__process_event(struct trace *trace, struct machine *machine,
				union perf_event *event, struct perf_sample *sample)
{
	int ret = 0;

	switch (event->header.type) {
	case PERF_RECORD_LOST:
		color_fprintf(trace->output, PERF_COLOR_RED,
			      "LOST %" PRIu64 " events!\n", event->lost.lost);
		ret = machine__process_lost_event(machine, event, sample);
		break;
	default:
		ret = machine__process_event(machine, event, sample);
		break;
	}

	return ret;
}

/* perf_tool callback shim around trace__process_event() */
static int trace__tool_process(struct perf_tool *tool,
			       union perf_event *event,
			       struct perf_sample *sample,
			       struct machine *machine)
{
	struct trace *trace = container_of(tool, struct trace, tool);
	return trace__process_event(trace, machine, event,
				    sample);
}

/*
 * Kernel address resolver wrapper: while kptr_restrict hides
 * /proc/{kallsyms,modules}, warn once and resolve nothing.
 */
static char *trace__machine__resolve_kernel_addr(void *vmachine, unsigned long long *addrp, char **modp)
{
	struct machine *machine = vmachine;

	if (machine->kptr_restrict_warned)
		return NULL;

	if (symbol_conf.kptr_restrict) {
		pr_warning("Kernel address maps (/proc/{kallsyms,modules}) are restricted.\n\n"
			   "Check /proc/sys/kernel/kptr_restrict.\n\n"
			   "Kernel samples will not be resolved.\n");
		machine->kptr_restrict_warned = true;
		return NULL;
	}

	return machine__resolve_kernel_addr(vmachine, addrp, modp);
}

/*
 * Initialize symbol resolution and the host machine object, then synthesize
 * events for already-running threads so their maps are known.
 * On any failure after symbol__init(), undo it via symbol__exit().
 */
static int trace__symbols_init(struct trace *trace, struct perf_evlist *evlist)
{
	int err = symbol__init(NULL);

	if (err)
		return err;

	trace->host = machine__new_host();
	if (trace->host == NULL)
		return -ENOMEM;

	err = trace_event__register_resolver(trace->host, trace__machine__resolve_kernel_addr);
	if (err < 0)
		goto out;

	err = __machine__synthesize_threads(trace->host, &trace->tool, &trace->opts.target,
					    evlist->threads, trace__tool_process, false,
					    trace->opts.proc_map_timeout, 1);
out:
	if (err)
		symbol__exit();

	return err;
}

/* Tear down what trace__symbols_init() set up */
static void trace__symbols__exit(struct trace *trace)
{
	machine__exit(trace->host);
	trace->host = NULL;

	symbol__exit();
}

/*
 * Allocate the per-argument formatter array for a syscall, seeding each slot
 * from the static syscall_fmt table when one exists.
 */
static int syscall__alloc_arg_fmts(struct syscall *sc, int nr_args)
{
	int idx;

	/* nr_args == 6 means the tracepoint format was unreadable; trust sc->fmt instead */
	if (nr_args == 6 && sc->fmt && sc->fmt->nr_args != 0)
		nr_args = sc->fmt->nr_args;

	sc->arg_fmt = calloc(nr_args, sizeof(*sc->arg_fmt));
	if (sc->arg_fmt == NULL)
		return -1;

	for (idx = 0; idx < nr_args; ++idx) {
		if (sc->fmt)
			sc->arg_fmt[idx] = sc->fmt->arg[idx];
	}

	sc->nr_args = nr_args;
	return 0;
}

/*
 * Pick a default beautifier for each argument from the field type/name in
 * the tracepoint format, unless one was hand-picked in the syscall_fmt
 * table.
 */
static int syscall__set_arg_fmts(struct syscall *sc)
{
	struct format_field
			   *field;
	int idx = 0, len;

	for (field = sc->args; field; field = field->next, ++idx) {
		/* a hand-picked formatter always wins */
		if (sc->fmt && sc->fmt->arg[idx].scnprintf)
			continue;

		if (strcmp(field->type, "const char *") == 0 &&
		    (strcmp(field->name, "filename") == 0 ||
		     strcmp(field->name, "path") == 0 ||
		     strcmp(field->name, "pathname") == 0))
			sc->arg_fmt[idx].scnprintf = SCA_FILENAME;
		else if (field->flags & FIELD_IS_POINTER)
			sc->arg_fmt[idx].scnprintf = syscall_arg__scnprintf_hex;
		else if (strcmp(field->type, "pid_t") == 0)
			sc->arg_fmt[idx].scnprintf = SCA_PID;
		else if (strcmp(field->type, "umode_t") == 0)
			sc->arg_fmt[idx].scnprintf = SCA_MODE_T;
		else if ((strcmp(field->type, "int") == 0 ||
			  strcmp(field->type, "unsigned int") == 0 ||
			  strcmp(field->type, "long") == 0) &&
			 (len = strlen(field->name)) >= 2 &&
			 strcmp(field->name + len - 2, "fd") == 0) {
			/*
			 * /sys/kernel/tracing/events/syscalls/sys_enter*
			 * egrep 'field:.*fd;' .../format|sed -r 's/.*field:([a-z ]+) [a-z_]*fd.+/\1/g'|sort|uniq -c
			 * 65 int
			 * 23 unsigned int
			 * 7 unsigned long
			 */
			sc->arg_fmt[idx].scnprintf = SCA_FD;
		}
	}

	return 0;
}

/*
 * Lazily fill trace->syscalls.table[id]: resolve the syscall name, find its
 * static fmt entry, read the sys_enter_<name> tracepoint format and install
 * per-argument beautifiers. Grows the table on demand.
 */
static int trace__read_syscall_info(struct trace *trace, int id)
{
	char tp_name[128];
	struct syscall *sc;
	const char *name = syscalltbl__name(trace->sctbl, id);

	if (name == NULL)
		return -1;

	if (id > trace->syscalls.max) {
		struct syscall *nsyscalls = realloc(trace->syscalls.table, (id + 1) * sizeof(*sc));

		if (nsyscalls == NULL)
			return -1;

		/* zero only the newly grown tail (or all of a fresh table) */
		if (trace->syscalls.max != -1) {
			memset(nsyscalls + trace->syscalls.max + 1, 0,
			       (id - trace->syscalls.max) * sizeof(*sc));
		} else {
			memset(nsyscalls, 0, (id + 1) * sizeof(*sc));
		}

		trace->syscalls.table = nsyscalls;
		trace->syscalls.max = id;
	}

	sc = trace->syscalls.table + id;
	sc->name = name;

	sc->fmt = syscall_fmt__find(sc->name);

	snprintf(tp_name, sizeof(tp_name), "sys_enter_%s", sc->name);
	sc->tp_format = trace_event__tp_format("syscalls", tp_name);

	/* some syscalls are wired up under an alias, e.g. "stat" vs "newstat" */
	if (IS_ERR(sc->tp_format) && sc->fmt && sc->fmt->alias) {
		snprintf(tp_name, sizeof(tp_name), "sys_enter_%s", sc->fmt->alias);
		sc->tp_format = trace_event__tp_format("syscalls", tp_name);
	}

	/* fall back to the 6 generic syscall args when the format is unreadable */
	if (syscall__alloc_arg_fmts(sc, IS_ERR(sc->tp_format) ? 6 : sc->tp_format->format.nr_fields))
		return -1;

	if (IS_ERR(sc->tp_format))
		return -1;

	sc->args = sc->tp_format->format.fields;
	/*
	 * We need to check and discard the first variable '__syscall_nr'
	 * or 'nr' that mean the syscall number. It is needless here.
	 * So drop '__syscall_nr' or 'nr' field but does not exist on older kernels.
	 */
	if (sc->args && (!strcmp(sc->args->name, "__syscall_nr") || !strcmp(sc->args->name, "nr"))) {
		sc->args = sc->args->next;
		--sc->nr_args;
	}

	sc->is_exit = !strcmp(name, "exit_group") || !strcmp(name, "exit");
	sc->is_open = !strcmp(name, "open") || !strcmp(name, "openat");

	return syscall__set_arg_fmts(sc);
}

/*
 * Translate the -e syscall name qualifiers into syscall ids, expanding glob
 * patterns, growing the ids array on demand; all invalid names are collected
 * into a single error message before bailing out.
 */
static int trace__validate_ev_qualifier(struct trace *trace)
{
	int err = 0, i;
	size_t nr_allocated;
	struct str_node *pos;

	trace->ev_qualifier_ids.nr = strlist__nr_entries(trace->ev_qualifier);
	trace->ev_qualifier_ids.entries = malloc(trace->ev_qualifier_ids.nr *
						 sizeof(trace->ev_qualifier_ids.entries[0]));

	if (trace->ev_qualifier_ids.entries == NULL) {
		fputs("Error:\tNot enough memory for allocating events qualifier ids\n",
		      trace->output);
		err = -EINVAL;
		goto out;
	}

	nr_allocated = trace->ev_qualifier_ids.nr;
	i = 0;

	strlist__for_each_entry(pos, trace->ev_qualifier) {
		const char *sc = pos->s;
		int id = syscalltbl__id(trace->sctbl, sc), match_next = -1;

		if (id < 0) {
			/* not an exact name, maybe a glob like "futex*" */
			id = syscalltbl__strglobmatch_first(trace->sctbl, sc, &match_next);
			if (id >= 0)
				goto matches;

			if (err == 0) {
				fputs("Error:\tInvalid syscall ", trace->output);
				err = -EINVAL;
			} else {
				fputs(", ", trace->output);
			}

			fputs(sc, trace->output);
		}
matches:
		trace->ev_qualifier_ids.entries[i++] = id;
		if (match_next == -1)
			continue;

		/* a glob may match several syscalls: collect them all */
		while (1) {
			id = syscalltbl__strglobmatch_next(trace->sctbl, sc, &match_next);
			if (id < 0)
				break;
			if (nr_allocated == trace->ev_qualifier_ids.nr) {
				void *entries;

				nr_allocated += 8;
				entries = realloc(trace->ev_qualifier_ids.entries,
						  nr_allocated * sizeof(trace->ev_qualifier_ids.entries[0]));
				if (entries == NULL) {
					err = -ENOMEM;
					fputs("\nError:\t Not enough memory for parsing\n", trace->output);
					goto out_free;
				}
				trace->ev_qualifier_ids.entries = entries;
			}
			trace->ev_qualifier_ids.nr++;
			trace->ev_qualifier_ids.entries[i++] = id;
		}
	}

	if (err < 0) {
		fputs("\nHint:\ttry 'perf list syscalls:sys_enter_*'"
		      "\nHint:\tand: 'man syscalls'\n", trace->output);
out_free:
		zfree(&trace->ev_qualifier_ids.entries);
		trace->ev_qualifier_ids.nr = 0;
	}
out:
	return err;
}

/*
 * args is to be interpreted as a series of longs but we need to handle
 * 8-byte unaligned accesses. args points to raw_data within the event
 * and raw_data is guaranteed to be 8-byte unaligned because it is
 * preceded by raw_size which is a u32. So we need to copy args to a temp
 * variable to read it.
 * Most notably this avoids extended load instructions
 * on unaligned addresses
 */
unsigned long syscall_arg__val(struct syscall_arg *arg, u8 idx)
{
	unsigned long val;
	unsigned char *p = arg->args + sizeof(unsigned long) * idx;

	memcpy(&val, p, sizeof(val));
	return val;
}

/* Print "name: " for an argument, falling back to "argN: " when unnamed */
static size_t syscall__scnprintf_name(struct syscall *sc, char *bf, size_t size,
				      struct syscall_arg *arg)
{
	if (sc->arg_fmt && sc->arg_fmt[arg->idx].name)
		return scnprintf(bf, size, "%s: ", sc->arg_fmt[arg->idx].name);

	return scnprintf(bf, size, "arg%d: ", arg->idx);
}

/* Print one argument value through its beautifier, or raw as a long */
static size_t syscall__scnprintf_val(struct syscall *sc, char *bf, size_t size,
				     struct syscall_arg *arg, unsigned long val)
{
	if (sc->arg_fmt && sc->arg_fmt[arg->idx].scnprintf) {
		arg->val = val;
		if (sc->arg_fmt[arg->idx].parm)
			arg->parm = sc->arg_fmt[arg->idx].parm;
		return sc->arg_fmt[arg->idx].scnprintf(bf, size, arg);
	}
	return scnprintf(bf, size, "%ld", val);
}

/*
 * Format every argument of a sys_enter record into 'bf', honoring per-arg
 * masks (bits a beautifier may set to suppress consumed args) and
 * suppressing zero-valued args without a strarray mapping.
 */
static size_t syscall__scnprintf_args(struct syscall *sc, char *bf, size_t size,
				      unsigned char *args, struct trace *trace,
				      struct thread *thread)
{
	size_t printed = 0;
	unsigned long val;
	u8 bit = 1;
	struct syscall_arg arg = {
		.args	= args,
		.idx	= 0,
		.mask	= 0,
		.trace	= trace,
		.thread	= thread,
	};
	struct thread_trace *ttrace = thread__priv(thread);

	/*
	 * Things like fcntl will set this in its 'cmd' formatter to pick the
	 * right formatter for the return value (an fd? file flags?), which is
	 * not needed for syscalls that always return a given type, say an fd.
	 */
	ttrace->ret_scnprintf = NULL;

	if (sc->args != NULL) {
		struct format_field *field;

		for (field = sc->args; field;
		     field = field->next, ++arg.idx, bit <<= 1) {
			if (arg.mask & bit)
				continue;

			val = syscall_arg__val(&arg, arg.idx);

			/*
			 * Suppress this argument if its value is zero and
			 * and we don't have a string associated in an
			 * strarray for it.
			 */
			if (val == 0 &&
			    !(sc->arg_fmt &&
			      (sc->arg_fmt[arg.idx].show_zero ||
			       sc->arg_fmt[arg.idx].scnprintf == SCA_STRARRAY ||
			       sc->arg_fmt[arg.idx].scnprintf == SCA_STRARRAYS) &&
			      sc->arg_fmt[arg.idx].parm))
				continue;

			printed += scnprintf(bf + printed, size - printed,
					     "%s%s: ", printed ? ", " : "", field->name);
			printed += syscall__scnprintf_val(sc, bf + printed, size - printed, &arg, val);
		}
	} else if (IS_ERR(sc->tp_format)) {
		/*
		 * If we managed to read the tracepoint /format file, then we
		 * may end up not having any args, like with gettid(), so only
		 * print the raw args when we didn't manage to read it.
		 */
		while (arg.idx < sc->nr_args) {
			if (arg.mask & bit)
				goto next_arg;
			val = syscall_arg__val(&arg, arg.idx);
			if (printed)
				printed += scnprintf(bf + printed, size - printed, ", ");
			printed += syscall__scnprintf_name(sc, bf + printed, size - printed, &arg);
			printed += syscall__scnprintf_val(sc, bf + printed, size - printed, &arg, val);
next_arg:
			++arg.idx;
			bit <<= 1;
		}
	}

	return printed;
}

typedef int (*tracepoint_handler)(struct trace *trace, struct perf_evsel *evsel,
				  union perf_event *event,
				  struct perf_sample *sample);

/*
 * Return the (lazily initialized) descriptor for syscall 'id', or NULL for
 * bogus ids or unreadable tracepoint info.
 */
static struct syscall *trace__syscall_info(struct trace *trace,
					   struct perf_evsel *evsel, int id)
{

	if (id < 0) {

		/*
		 * XXX: Noticed on x86_64, reproduced as far back as 3.0.36, haven't tried
		 * before that, leaving at a higher verbosity level till that is
		 * explained. Reproduced with plain ftrace with:
		 *
		 * echo 1 > /t/events/raw_syscalls/sys_exit/enable
		 * grep "NR -1 " /t/trace_pipe
		 *
		 * After generating some load on the machine.
		 */
		if (verbose > 1) {
			static u64 n;
			fprintf(trace->output, "Invalid syscall %d id, skipping (%s, %" PRIu64 ") ...\n",
				id, perf_evsel__name(evsel), ++n);
		}
		return NULL;
	}

	if ((id > trace->syscalls.max || trace->syscalls.table[id].name == NULL) &&
	    trace__read_syscall_info(trace, id))
		goto out_cant_read;

	/* recheck: trace__read_syscall_info() may have partially failed */
	if ((id > trace->syscalls.max || trace->syscalls.table[id].name == NULL))
		goto out_cant_read;

	return &trace->syscalls.table[id];

out_cant_read:
	if (verbose > 0) {
		fprintf(trace->output, "Problems reading syscall %d", id);
		if (id <= trace->syscalls.max && trace->syscalls.table[id].name != NULL)
			fprintf(trace->output, "(%s)", trace->syscalls.table[id].name);
		fputs(" information\n", trace->output);
	}
	return NULL;
}

/* Accumulate per-syscall duration statistics for --summary */
static void thread__update_stats(struct thread_trace *ttrace,
				 int id, struct perf_sample *sample)
{
	struct int_node *inode;
	struct stats *stats;
	u64 duration = 0;

	inode = intlist__findnew(ttrace->syscall_stats, id);
	if (inode == NULL)
		return;

	/* lazily allocate the stats bucket on first use of this syscall id */
	stats = inode->priv;
	if (stats == NULL) {
		stats = malloc(sizeof(struct stats));
		if (stats == NULL)
			return;
		init_stats(stats);
		inode->priv = stats;
	}

	if (ttrace->entry_time && sample->time > ttrace->entry_time)
		duration = sample->time - ttrace->entry_time;

	update_stats(stats, duration);
}

/*
 * When another event interrupts a still-pending sys_enter line, flush it
 * with a trailing ") ..." so the output stays readable.
 */
static int trace__printf_interrupted_entry(struct trace *trace)
{
	struct thread_trace *ttrace;
	size_t printed;

	if (trace->failure_only || trace->current == NULL)
		return 0;

	ttrace = thread__priv(trace->current);

	if (!ttrace->entry_pending)
		return 0;

	printed = trace__fprintf_entry_head(trace, trace->current, 0, false, ttrace->entry_time, trace->output);
	printed += fprintf(trace->output, "%-70s) ...\n",
			   ttrace->entry_str);
	ttrace->entry_pending = false;

	return printed;
}

/* --print-sample: dump evsel name, timestamp, comm, pid/tid and cpu */
static int trace__fprintf_sample(struct trace *trace, struct perf_evsel *evsel,
				 struct perf_sample *sample, struct thread *thread)
{
	int printed = 0;

	if (trace->print_sample) {
		double ts = (double)sample->time / NSEC_PER_MSEC;

		printed += fprintf(trace->output, "%22s %10.3f %s %d/%d [%d]\n",
				   perf_evsel__name(evsel), ts,
				   thread__comm_str(thread),
				   sample->pid, sample->tid, sample->cpu);
	}

	return printed;
}

/*
 * raw_syscalls:sys_enter handler: format "name(args" into the per-thread
 * entry_str. It is printed immediately for syscalls that never return
 * (is_exit), otherwise kept pending until the matching sys_exit arrives.
 */
static int trace__sys_enter(struct trace *trace, struct perf_evsel *evsel,
			    union perf_event *event __maybe_unused,
			    struct perf_sample *sample)
{
	char *msg;
	void *args;
	size_t printed = 0;
	struct thread *thread;
	int id = perf_evsel__sc_tp_uint(evsel, id, sample), err = -1;
	struct syscall *sc = trace__syscall_info(trace, evsel, id);
	struct thread_trace *ttrace;

	if (sc == NULL)
		return -1;

	thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
	ttrace = thread__trace(thread, trace->output);
	if (ttrace == NULL)
		goto out_put;

	trace__fprintf_sample(trace, evsel, sample, thread);

	args = perf_evsel__sc_tp_ptr(evsel, args, sample);

	/* entry_str is allocated lazily, once per thread */
	if (ttrace->entry_str == NULL) {
		ttrace->entry_str = malloc(trace__entry_str_size);
		if (!ttrace->entry_str)
			goto out_put;
	}

	if (!(trace->duration_filter || trace->summary_only || trace->min_stack))
		trace__printf_interrupted_entry(trace);

	ttrace->entry_time = sample->time;
	msg = ttrace->entry_str;
	printed += scnprintf(msg + printed, trace__entry_str_size - printed, "%s(", sc->name);

	printed += syscall__scnprintf_args(sc, msg + printed, trace__entry_str_size - printed,
					   args, trace, thread);

	if (sc->is_exit) {
		if (!(trace->duration_filter ||
		      trace->summary_only || trace->failure_only || trace->min_stack)) {
			/* exit/exit_group never return, so print the entry right away */
			trace__fprintf_entry_head(trace, thread, 0, false, ttrace->entry_time, trace->output);
			fprintf(trace->output, "%-70s)\n", ttrace->entry_str);
		}
	} else {
		ttrace->entry_pending = true;
		/* See trace__vfs_getname & trace__sys_exit */
		ttrace->filename.pending_open = false;
	}

	if (trace->current != thread) {
		thread__put(trace->current);
		trace->current = thread__get(thread);
	}
	err = 0;
out_put:
	thread__put(thread);
	return err;
}

/* Print a beautified args string for a sys_enter carried in a BPF output event */
static int trace__fprintf_sys_enter(struct trace *trace, struct perf_evsel *evsel,
				    struct perf_sample *sample)
{
	struct thread_trace *ttrace;
	struct thread *thread;
	int id = perf_evsel__sc_tp_uint(evsel, id, sample), err = -1;
	struct syscall *sc = trace__syscall_info(trace, evsel, id);
	char msg[1024];
	void *args;

	if (sc == NULL)
		return -1;

	thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
	ttrace = thread__trace(thread, trace->output);
	/*
	 * We need to get ttrace just to make sure it is there when syscall__scnprintf_args()
	 * and the rest of the beautifiers accessing it via struct syscall_arg touches it.
	 */
	if (ttrace == NULL)
		goto out_put;

	args = perf_evsel__sc_tp_ptr(evsel, args, sample);
	syscall__scnprintf_args(sc, msg, sizeof(msg), args, trace, thread);
	fprintf(trace->output, "%s", msg);
	err = 0;
out_put:
	thread__put(thread);
	return err;
}

/* Resolve the sample's callchain, honoring a per-event max-stack override */
static int trace__resolve_callchain(struct trace *trace, struct perf_evsel *evsel,
				    struct perf_sample *sample,
				    struct callchain_cursor *cursor)
{
	struct addr_location al;
	int max_stack = evsel->attr.sample_max_stack ?
			evsel->attr.sample_max_stack :
			trace->max_stack;

	if (machine__resolve(trace->host, &al, sample) < 0 ||
	    thread__resolve_callchain(al.thread, cursor, evsel, sample, NULL, NULL, max_stack))
		return -1;

	return 0;
}

static int trace__fprintf_callchain(struct trace *trace, struct perf_sample *sample)
{
	/* TODO: user-configurable print_opts */
	const unsigned int print_opts = EVSEL__PRINT_SYM |
					EVSEL__PRINT_DSO |
					EVSEL__PRINT_UNKNOWN_AS_ADDR;

	return sample__fprintf_callchain(sample, 38, print_opts, &callchain_cursor, trace->output);
}

/* Map an errno value to its name ("ENOENT", ...) for the evsel's architecture */
static const char *errno_to_name(struct perf_evsel *evsel, int err)
{
	struct perf_env *env = perf_evsel__env(evsel);
	const char *arch_name = perf_env__arch(env);

	return arch_syscalls__strerrno(arch_name, err);
}

/*
 * raw_syscalls:sys_exit handler: compute the syscall duration, print the
 * pending entry (or a "continued" marker if it was flushed), then beautify
 * the return value (errno, hex, timeout, child pid, or plain long).
 */
static int trace__sys_exit(struct trace *trace, struct perf_evsel *evsel,
			   union perf_event *event __maybe_unused,
			   struct perf_sample *sample)
{
	long ret;
	u64 duration = 0;
	bool duration_calculated = false;
	struct thread *thread;
	int id = perf_evsel__sc_tp_uint(evsel, id, sample), err = -1, callchain_ret = 0;
	struct syscall *sc = trace__syscall_info(trace, evsel, id);
	struct thread_trace *ttrace;

	if (sc == NULL)
		return -1;

	thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
	ttrace = thread__trace(thread, trace->output);
	if (ttrace == NULL)
		goto out_put;

	trace__fprintf_sample(trace, evsel, sample, thread);

	if (trace->summary)
		thread__update_stats(ttrace, id, sample);

	ret = perf_evsel__sc_tp_uint(evsel, ret, sample);

	/* open/openat: associate the vfs_getname-captured path with the new fd */
	if (sc->is_open && ret >= 0 && ttrace->filename.pending_open) {
		trace__set_fd_pathname(thread, ret, ttrace->filename.name);
		ttrace->filename.pending_open = false;
		++trace->stats.vfs_getname;
	}

	/* entry_time == 0: sys_enter was never seen, see trace__fprintf_tstamp() */
	if (ttrace->entry_time) {
		duration = sample->time - ttrace->entry_time;
		if (trace__filter_duration(trace, duration))
			goto out;
		duration_calculated = true;
	} else if (trace->duration_filter)
		goto out;

	if (sample->callchain) {
		callchain_ret = trace__resolve_callchain(trace, evsel, sample, &callchain_cursor);
		if (callchain_ret == 0) {
			if (callchain_cursor.nr < trace->min_stack)
				goto out;
			callchain_ret = 1;
		}
	}

	if (trace->summary_only || (ret >= 0 && trace->failure_only))
		goto out;

	trace__fprintf_entry_head(trace, thread, duration, duration_calculated, ttrace->entry_time, trace->output);

	if (ttrace->entry_pending) {
		fprintf(trace->output, "%-70s", ttrace->entry_str);
	} else {
		/* the entry line was already flushed by an interrupting event */
		fprintf(trace->output, " ... [");
		color_fprintf(trace->output, PERF_COLOR_YELLOW, "continued");
		fprintf(trace->output, "]: %s()", sc->name);
	}

	if (sc->fmt == NULL) {
		if (ret < 0)
			goto errno_print;
signed_print:
		fprintf(trace->output, ") = %ld", ret);
	} else if (ret < 0) {
errno_print: {
		char bf[STRERR_BUFSIZE];
		const char *emsg = str_error_r(-ret, bf, sizeof(bf)),
			   *e = errno_to_name(evsel, -ret);

		fprintf(trace->output, ") = -1 %s %s", e, emsg);
	}
	} else if (ret == 0 && sc->fmt->timeout)
		fprintf(trace->output, ") = 0 Timeout");
	else if (ttrace->ret_scnprintf) {
		/* one-shot return formatter picked by an arg beautifier (e.g. fcntl) */
		char bf[1024];
		struct syscall_arg arg = {
			.val = ret,
			.thread = thread,
			.trace = trace,
		};
		ttrace->ret_scnprintf(bf, sizeof(bf), &arg);
		ttrace->ret_scnprintf = NULL;
		fprintf(trace->output, ") = %s", bf);
	} else if (sc->fmt->hexret)
		fprintf(trace->output, ") = %#lx", ret);
	else if (sc->fmt->errpid) {
		/* return value is a pid (fork/clone/wait*): show the child's comm */
		struct thread *child = machine__find_thread(trace->host, ret, ret);

		if (child != NULL) {
			fprintf(trace->output, ") = %ld", ret);
			if
			    (child->comm_set)
				fprintf(trace->output, " (%s)", thread__comm_str(child));
			thread__put(child);
		}
	} else
		goto signed_print;

	fputc('\n', trace->output);

	if (callchain_ret > 0)
		trace__fprintf_callchain(trace, sample);
	else if (callchain_ret < 0)
		pr_err("Problem processing %s callchain, skipping...\n", perf_evsel__name(evsel));
out:
	ttrace->entry_pending = false;
	err = 0;
out_put:
	thread__put(thread);
	return err;
}

/*
 * probe:vfs_getname handler: cache the filename being opened and splice it
 * into the pending sys_enter line at the position recorded earlier by
 * syscall_arg__scnprintf_filename()/thread__set_filename_pos().
 */
static int trace__vfs_getname(struct trace *trace, struct perf_evsel *evsel,
			      union perf_event *event __maybe_unused,
			      struct perf_sample *sample)
{
	struct thread *thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
	struct thread_trace *ttrace;
	size_t filename_len, entry_str_len, to_move;
	ssize_t remaining_space;
	char *pos;
	const char *filename = perf_evsel__rawptr(evsel, sample, "pathname");

	if (!thread)
		goto out;

	ttrace = thread__priv(thread);
	if (!ttrace)
		goto out_put;

	filename_len = strlen(filename);
	if (filename_len == 0)
		goto out_put;

	/* grow the cached name buffer if this filename is longer than before */
	if (ttrace->filename.namelen < filename_len) {
		char *f = realloc(ttrace->filename.name, filename_len + 1);

		if (f == NULL)
			goto out_put;

		ttrace->filename.namelen = filename_len;
		ttrace->filename.name = f;
	}

	strcpy(ttrace->filename.name, filename);
	ttrace->filename.pending_open = true;

	/* no splice position recorded: nothing pending in entry_str */
	if (!ttrace->filename.ptr)
		goto out_put;

	entry_str_len = strlen(ttrace->entry_str);
	remaining_space = trace__entry_str_size - entry_str_len - 1; /* \0 */
	if (remaining_space <= 0)
		goto out_put;

	/* if the filename doesn't fit, keep only its tail */
	if (filename_len > (size_t)remaining_space) {
		filename += filename_len - remaining_space;
		filename_len = remaining_space;
	}

	/* shift the rest of entry_str right, then drop the filename in place */
	to_move = entry_str_len - ttrace->filename.entry_str_pos + 1; /* \0 */
	pos = ttrace->entry_str + ttrace->filename.entry_str_pos;
	memmove(pos + filename_len, pos, to_move);
	memcpy(pos, filename, filename_len);

	ttrace->filename.ptr = 0;
	ttrace->filename.entry_str_pos = 0;
out_put:
	thread__put(thread);
out:
	return 0;
}

/* sched:sched_stat_runtime handler: accumulate per-thread and global runtime */
static int trace__sched_stat_runtime(struct trace *trace, struct perf_evsel *evsel,
				     union perf_event *event __maybe_unused,
				     struct perf_sample *sample)
{
	u64 runtime = perf_evsel__intval(evsel, sample, "runtime");
	double runtime_ms = (double)runtime / NSEC_PER_MSEC;
	struct thread *thread = machine__findnew_thread(trace->host,
							sample->pid,
							sample->tid);
	struct thread_trace *ttrace = thread__trace(thread, trace->output);

	if (ttrace == NULL)
		goto out_dump;

	ttrace->runtime_ms += runtime_ms;
	trace->runtime_ms += runtime_ms;
out_put:
	thread__put(thread);
	return 0;

out_dump:
	fprintf(trace->output, "%s: comm=%s,pid=%u,runtime=%" PRIu64 ",vruntime=%" PRIu64 ")\n",
		evsel->name,
		perf_evsel__strval(evsel, sample, "comm"),
		(pid_t)perf_evsel__intval(evsel, sample, "pid"),
		runtime,
		perf_evsel__intval(evsel, sample, "vruntime"));
	goto out_put;
}

/* binary__fprintf() callback: printable bytes as-is, the rest as '.' */
static int bpf_output__printer(enum binary_printer_ops op,
			       unsigned int val, void *extra __maybe_unused, FILE *fp)
{
	unsigned char ch = (unsigned char)val;

	switch (op) {
	case BINARY_PRINT_CHAR_DATA:
		return fprintf(fp, "%c", isprint(ch) ?
			       ch : '.');
	/* only character data is printed by this dumper */
	case BINARY_PRINT_DATA_BEGIN:
	case BINARY_PRINT_LINE_BEGIN:
	case BINARY_PRINT_ADDR:
	case BINARY_PRINT_NUM_DATA:
	case BINARY_PRINT_NUM_PAD:
	case BINARY_PRINT_SEP:
	case BINARY_PRINT_CHAR_PAD:
	case BINARY_PRINT_LINE_END:
	case BINARY_PRINT_DATA_END:
	default:
		break;
	}

	return 0;
}

/* Dump a BPF output event's raw payload through bpf_output__printer() */
static void bpf_output__fprintf(struct trace *trace,
				struct perf_sample *sample)
{
	binary__fprintf(sample->raw_data, sample->raw_size, 8,
			bpf_output__printer, NULL, trace->output);
}

/*
 * Handler for the other (--event) tracepoints: print the timestamp and
 * event name, then either beautified augmented-syscall args, a raw BPF
 * payload dump, or the libtraceevent-formatted fields.
 */
static int trace__event_handler(struct trace *trace, struct perf_evsel *evsel,
				union perf_event *event __maybe_unused,
				struct perf_sample *sample)
{
	int callchain_ret = 0;

	if (sample->callchain) {
		callchain_ret = trace__resolve_callchain(trace, evsel, sample, &callchain_cursor);
		if (callchain_ret == 0) {
			if (callchain_cursor.nr < trace->min_stack)
				goto out;
			callchain_ret = 1;
		}
	}

	trace__printf_interrupted_entry(trace);
	trace__fprintf_tstamp(trace, sample->time, trace->output);

	/* keep columns aligned with the syscall lines' duration field */
	if (trace->trace_syscalls)
		fprintf(trace->output, "( ): ");

	fprintf(trace->output, "%s:", evsel->name);

	if (perf_evsel__is_bpf_output(evsel)) {
		if (evsel == trace->syscalls.events.augmented)
			trace__fprintf_sys_enter(trace, evsel, sample);
		else
			bpf_output__fprintf(trace, sample);
	} else if (evsel->tp_format) {
		/* for sys_enter_* tracepoints try the beautifiers first */
		if (strncmp(evsel->tp_format->name, "sys_enter_", 10) ||
		    trace__fprintf_sys_enter(trace, evsel, sample)) {
			event_format__fprintf(evsel->tp_format, sample->cpu,
					      sample->raw_data, sample->raw_size,
					      trace->output);
		}
	}

	fprintf(trace->output, "\n");

	if (callchain_ret > 0)
		trace__fprintf_callchain(trace, sample);
	else if (callchain_ret < 0)
		pr_err("Problem processing %s callchain, skipping...\n",
		       perf_evsel__name(evsel));
out:
	return 0;
}

/* Print "dso@sym+off" (or bare addresses) for a pagefault location */
static void print_location(FILE *f, struct perf_sample *sample,
			   struct addr_location *al,
			   bool print_dso, bool print_sym)
{

	if ((verbose > 0 || print_dso) && al->map)
		fprintf(f, "%s@", al->map->dso->long_name);

	if ((verbose > 0 || print_sym) && al->sym)
		fprintf(f, "%s+0x%" PRIx64, al->sym->name,
			al->addr - al->sym->start);
	else if (al->map)
		fprintf(f, "0x%" PRIx64, al->addr);
	else
		fprintf(f, "0x%" PRIx64, sample->addr);
}

/* Major/minor page-fault software event handler */
static int trace__pgfault(struct trace *trace,
			  struct perf_evsel *evsel,
			  union perf_event *event __maybe_unused,
			  struct perf_sample *sample)
{
	struct thread *thread;
	struct addr_location al;
	char map_type = 'd';
	struct thread_trace *ttrace;
	int err = -1;
	int callchain_ret = 0;

	thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);

	if (sample->callchain) {
		callchain_ret = trace__resolve_callchain(trace, evsel, sample, &callchain_cursor);
		if (callchain_ret == 0) {
			if (callchain_cursor.nr < trace->min_stack)
				goto out_put;
			callchain_ret = 1;
		}
	}

	ttrace = thread__trace(thread, trace->output);
	if (ttrace == NULL)
		goto out_put;

	if (evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ)
		ttrace->pfmaj++;
	else
		ttrace->pfmin++;

	if (trace->summary_only)
		goto out;

	/* where the fault happened (instruction pointer) */
	thread__find_symbol(thread, sample->cpumode, sample->ip, &al);

	trace__fprintf_entry_head(trace, thread, 0, true, sample->time, trace->output);

	fprintf(trace->output, "%sfault [",
		evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ ?
2126 "maj" : "min"); 2127 2128 print_location(trace->output, sample, &al, false, true); 2129 2130 fprintf(trace->output, "] => "); 2131 2132 thread__find_symbol(thread, sample->cpumode, sample->addr, &al); 2133 2134 if (!al.map) { 2135 thread__find_symbol(thread, sample->cpumode, sample->addr, &al); 2136 2137 if (al.map) 2138 map_type = 'x'; 2139 else 2140 map_type = '?'; 2141 } 2142 2143 print_location(trace->output, sample, &al, true, false); 2144 2145 fprintf(trace->output, " (%c%c)\n", map_type, al.level); 2146 2147 if (callchain_ret > 0) 2148 trace__fprintf_callchain(trace, sample); 2149 else if (callchain_ret < 0) 2150 pr_err("Problem processing %s callchain, skipping...\n", perf_evsel__name(evsel)); 2151 out: 2152 err = 0; 2153 out_put: 2154 thread__put(thread); 2155 return err; 2156 } 2157 2158 static void trace__set_base_time(struct trace *trace, 2159 struct perf_evsel *evsel, 2160 struct perf_sample *sample) 2161 { 2162 /* 2163 * BPF events were not setting PERF_SAMPLE_TIME, so be more robust 2164 * and don't use sample->time unconditionally, we may end up having 2165 * some other event in the future without PERF_SAMPLE_TIME for good 2166 * reason, i.e. we may not be interested in its timestamps, just in 2167 * it taking place, picking some piece of information when it 2168 * appears in our event stream (vfs_getname comes to mind). 
	 */
	if (trace->base_time == 0 && !trace->full_time &&
	    (evsel->attr.sample_type & PERF_SAMPLE_TIME))
		trace->base_time = sample->time;
}

/* perf_tool->sample callback used in replay mode ('perf trace -i file'). */
static int trace__process_sample(struct perf_tool *tool,
				 union perf_event *event,
				 struct perf_sample *sample,
				 struct perf_evsel *evsel,
				 struct machine *machine __maybe_unused)
{
	struct trace *trace = container_of(tool, struct trace, tool);
	struct thread *thread;
	int err = 0;

	tracepoint_handler handler = evsel->handler;

	thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
	if (thread && thread__is_filtered(thread))
		goto out;

	trace__set_base_time(trace, evsel, sample);

	if (handler) {
		++trace->nr_events;
		handler(trace, evsel, event, sample);
	}
out:
	thread__put(thread);
	return err;
}

/*
 * 'perf trace record': build an argv for cmd_record() adding the
 * raw_syscalls (or the older 'syscalls') tracepoints and page fault events
 * as configured, then append the user's own arguments.
 */
static int trace__record(struct trace *trace, int argc, const char **argv)
{
	unsigned int rec_argc, i, j;
	const char **rec_argv;
	const char * const record_args[] = {
		"record",
		"-R",
		"-m", "1024",
		"-c", "1",
	};

	const char * const sc_args[] = { "-e", };
	unsigned int sc_args_nr = ARRAY_SIZE(sc_args);
	const char * const majpf_args[] = { "-e", "major-faults" };
	unsigned int majpf_args_nr = ARRAY_SIZE(majpf_args);
	const char * const minpf_args[] = { "-e", "minor-faults" };
	unsigned int minpf_args_nr = ARRAY_SIZE(minpf_args);

	/* +1 is for the event string below */
	rec_argc = ARRAY_SIZE(record_args) + sc_args_nr + 1 +
		majpf_args_nr + minpf_args_nr + argc;
	rec_argv = calloc(rec_argc + 1, sizeof(char *));

	if (rec_argv == NULL)
		return -ENOMEM;

	j = 0;
	for (i = 0; i < ARRAY_SIZE(record_args); i++)
		rec_argv[j++] = record_args[i];

	if (trace->trace_syscalls) {
		for (i = 0; i < sc_args_nr; i++)
			rec_argv[j++] = sc_args[i];

		/* event string may be different for older kernels - e.g., RHEL6 */
		if (is_valid_tracepoint("raw_syscalls:sys_enter"))
			rec_argv[j++] = "raw_syscalls:sys_enter,raw_syscalls:sys_exit";
		else if (is_valid_tracepoint("syscalls:sys_enter"))
			rec_argv[j++] = "syscalls:sys_enter,syscalls:sys_exit";
		else {
			pr_err("Neither raw_syscalls nor syscalls events exist.\n");
			free(rec_argv);
			return -1;
		}
	}

	if (trace->trace_pgfaults & TRACE_PFMAJ)
		for (i = 0; i < majpf_args_nr; i++)
			rec_argv[j++] = majpf_args[i];

	if (trace->trace_pgfaults & TRACE_PFMIN)
		for (i = 0; i < minpf_args_nr; i++)
			rec_argv[j++] = minpf_args[i];

	for (i = 0; i < (unsigned int)argc; i++)
		rec_argv[j++] = argv[i];

	/*
	 * NOTE(review): rec_argv is not freed on the success path - harmless
	 * for a one-shot subcommand that exits afterwards, but worth noting.
	 */
	return cmd_record(j, rec_argv);
}

static size_t trace__fprintf_thread_summary(struct trace *trace, FILE *fp);

/*
 * Hook up the probe:vfs_getname probe, if it exists and exposes a
 * "pathname" field; returns true when the evsel was added to the evlist.
 */
static bool perf_evlist__add_vfs_getname(struct perf_evlist *evlist)
{
	struct perf_evsel *evsel = perf_evsel__newtp("probe", "vfs_getname");

	if (IS_ERR(evsel))
		return false;

	if (perf_evsel__field(evsel, "pathname") == NULL) {
		perf_evsel__delete(evsel);
		return false;
	}

	evsel->handler = trace__vfs_getname;
	perf_evlist__add(evlist, evsel);
	return true;
}

/* Create a software page fault evsel; 'config' selects maj vs min faults. */
static struct perf_evsel *perf_evsel__new_pgfault(u64 config)
{
	struct perf_evsel *evsel;
	struct perf_event_attr attr = {
		.type = PERF_TYPE_SOFTWARE,
		.mmap_data = 1,
	};

	attr.config = config;
	attr.sample_period = 1;

	event_attr_init(&attr);

	evsel = perf_evsel__new(&attr);
	if (evsel)
		evsel->handler = trace__pgfault;

	return evsel;
}

/* Live mode: route one mmap'ed event to the owning evsel's handler. */
static void trace__handle_event(struct trace *trace, union perf_event *event, struct perf_sample *sample)
{
	const u32 type = event->header.type;
	struct perf_evsel *evsel;

	/* Non-sample records (mmap, comm, ...) go to the generic machinery. */
	if (type != PERF_RECORD_SAMPLE) {
		trace__process_event(trace, trace->host, event, sample);
		return;
	}

	evsel = perf_evlist__id2evsel(trace->evlist, sample->id);
	if (evsel == NULL) {
		fprintf(trace->output, "Unknown tp ID %" PRIu64 ", skipping...\n", sample->id);
		return;
	}

	trace__set_base_time(trace, evsel, sample);

	if (evsel->attr.type == PERF_TYPE_TRACEPOINT &&
	    sample->raw_data == NULL) {
		fprintf(trace->output, "%s sample with no payload for tid: %d, cpu %d, raw_size=%d, skipping...\n",
		       perf_evsel__name(evsel), sample->tid,
		       sample->cpu, sample->raw_size);
	} else {
		tracepoint_handler handler = evsel->handler;
		handler(trace, evsel, event, sample);
	}
}

/*
 * Create the raw_syscalls:sys_{enter,exit} tracepoint evsels, set up their
 * args/ret field accessors and callchain config, and add them to the evlist.
 * Returns 0 on success, -1 on failure (with partial evsels deleted).
 */
static int trace__add_syscall_newtp(struct trace *trace)
{
	int ret = -1;
	struct perf_evlist *evlist = trace->evlist;
	struct perf_evsel *sys_enter, *sys_exit;

	sys_enter = perf_evsel__raw_syscall_newtp("sys_enter", trace__sys_enter);
	if (sys_enter == NULL)
		goto out;

	if (perf_evsel__init_sc_tp_ptr_field(sys_enter, args))
		goto out_delete_sys_enter;

	sys_exit = perf_evsel__raw_syscall_newtp("sys_exit", trace__sys_exit);
	if (sys_exit == NULL)
		goto out_delete_sys_enter;

	if (perf_evsel__init_sc_tp_uint_field(sys_exit, ret))
		goto out_delete_sys_exit;

	perf_evsel__config_callchain(sys_enter, &trace->opts, &callchain_param);
	perf_evsel__config_callchain(sys_exit, &trace->opts, &callchain_param);

	perf_evlist__add(evlist, sys_enter);
	perf_evlist__add(evlist, sys_exit);

	if (callchain_param.enabled && !trace->kernel_syscallchains) {
		/*
		 * We're interested only in the user space callchain
		 * leading to the syscall, allow overriding that for
		 * debugging reasons using --kernel_syscall_callchains
		 */
		sys_exit->attr.exclude_callchain_kernel = 1;
	}
	trace->syscalls.events.sys_enter = sys_enter;
	trace->syscalls.events.sys_exit = sys_exit;

	ret = 0;
out:
	return ret;

out_delete_sys_exit:
	perf_evsel__delete_priv(sys_exit);
out_delete_sys_enter:
	perf_evsel__delete_priv(sys_enter);
	goto out;
}

/*
 * Build an "id in (...)" (or negated) tracepoint filter from the syscall
 * qualifier id list and append it to both sys_enter and sys_exit, so the
 * kernel only emits the requested syscalls.
 */
static int trace__set_ev_qualifier_filter(struct trace *trace)
{
	int err = -1;
	struct perf_evsel *sys_exit;
	char *filter = asprintf_expr_inout_ints("id", !trace->not_ev_qualifier,
						trace->ev_qualifier_ids.nr,
						trace->ev_qualifier_ids.entries);

	if (filter == NULL)
		goto out_enomem;

	/* append_tp_filter() returns 0 on success. */
	if (!perf_evsel__append_tp_filter(trace->syscalls.events.sys_enter,
					  filter)) {
		sys_exit = trace->syscalls.events.sys_exit;
		err = perf_evsel__append_tp_filter(sys_exit, filter);
	}

	free(filter);
out:
	return err;
out_enomem:
	errno = ENOMEM;
	goto out;
}

/*
 * Build the pid filter used when tracing system wide: our own pid plus the
 * chain of parents up to an "sshd", so that the events generated by
 * displaying our own output over ssh don't feed back into the trace.
 */
static int trace__set_filter_loop_pids(struct trace *trace)
{
	unsigned int nr = 1;
	pid_t pids[32] = {
		getpid(),
	};
	struct thread *thread = machine__find_thread(trace->host, pids[0], pids[0]);

	while (thread && nr < ARRAY_SIZE(pids)) {
		struct thread *parent = machine__find_thread(trace->host, thread->ppid, thread->ppid);

		if (parent == NULL)
			break;

		if (!strcmp(thread__comm_str(parent), "sshd")) {
			pids[nr++] = parent->tid;
			break;
		}
		thread = parent;
	}

	return perf_evlist__set_filter_pids(trace->evlist, nr, pids);
}

/*
 * Live-mode main entry: set up the evlist (syscall tracepoints, page
 * faults, sched, cgroups), open/mmap the events, optionally fork the
 * workload, then consume and print events until done/interrupted.
 */
static int trace__run(struct trace *trace, int argc, const char **argv)
{
	struct perf_evlist *evlist = trace->evlist;
	struct perf_evsel *evsel, *pgfault_maj = NULL, *pgfault_min = NULL;
	int err = -1, i;
	unsigned long before;
	const bool forks = argc > 0;
	bool draining = false;

	trace->live = true;

	if (trace->trace_syscalls &&
	    trace__add_syscall_newtp(trace))
		goto out_error_raw_syscalls;

	if (trace->trace_syscalls)
		trace->vfs_getname = perf_evlist__add_vfs_getname(evlist);

	if ((trace->trace_pgfaults & TRACE_PFMAJ)) {
		pgfault_maj = perf_evsel__new_pgfault(PERF_COUNT_SW_PAGE_FAULTS_MAJ);
		if (pgfault_maj == NULL)
			goto out_error_mem;
		perf_evsel__config_callchain(pgfault_maj, &trace->opts, &callchain_param);
		perf_evlist__add(evlist, pgfault_maj);
	}

	if ((trace->trace_pgfaults & TRACE_PFMIN)) {
		pgfault_min = perf_evsel__new_pgfault(PERF_COUNT_SW_PAGE_FAULTS_MIN);
		if (pgfault_min == NULL)
			goto out_error_mem;
		perf_evsel__config_callchain(pgfault_min, &trace->opts, &callchain_param);
		perf_evlist__add(evlist, pgfault_min);
	}

	if (trace->sched &&
	    perf_evlist__add_newtp(evlist, "sched", "sched_stat_runtime",
				   trace__sched_stat_runtime))
		goto out_error_sched_stat_runtime;

	/*
	 * If a global cgroup was set, apply it to all the events without an
	 * explicit cgroup. I.e.:
	 *
	 * trace -G A -e sched:*switch
	 *
	 * Will set all raw_syscalls:sys_{enter,exit}, pgfault, vfs_getname, etc
	 * _and_ sched:sched_switch to the 'A' cgroup, while:
	 *
	 * trace -e sched:*switch -G A
	 *
	 * will only set the sched:sched_switch event to the 'A' cgroup, all the
	 * other events (raw_syscalls:sys_{enter,exit}, etc are left "without"
	 * a cgroup (on the root cgroup, sys wide, etc).
	 *
	 * Multiple cgroups:
	 *
	 * trace -G A -e sched:*switch -G B
	 *
	 * the syscall ones go to the 'A' cgroup, the sched:sched_switch goes
	 * to the 'B' cgroup.
	 *
	 * evlist__set_default_cgroup() grabs a reference of the passed cgroup
	 * only for the evsels still without a cgroup, i.e. evsel->cgroup == NULL.
	 */
	if (trace->cgroup)
		evlist__set_default_cgroup(trace->evlist, trace->cgroup);

	err = perf_evlist__create_maps(evlist, &trace->opts.target);
	if (err < 0) {
		fprintf(trace->output, "Problems parsing the target to trace, check your options!\n");
		goto out_delete_evlist;
	}

	err = trace__symbols_init(trace, evlist);
	if (err < 0) {
		fprintf(trace->output, "Problems initializing symbol libraries!\n");
		goto out_delete_evlist;
	}

	perf_evlist__config(evlist, &trace->opts, &callchain_param);

	signal(SIGCHLD, sig_handler);
	signal(SIGINT, sig_handler);

	if (forks) {
		err = perf_evlist__prepare_workload(evlist, &trace->opts.target,
						    argv, false, NULL);
		if (err < 0) {
			fprintf(trace->output, "Couldn't run the workload!\n");
			goto out_delete_evlist;
		}
	}

	err = perf_evlist__open(evlist);
	if (err < 0)
		goto out_error_open;

	err = bpf__apply_obj_config();
	if (err) {
		char errbuf[BUFSIZ];

		bpf__strerror_apply_obj_config(err, errbuf, sizeof(errbuf));
		pr_err("ERROR: Apply config to BPF failed: %s\n",
			errbuf);
		goto out_error_open;
	}

	/*
	 * Better not use !target__has_task() here because we need to cover the
	 * case where no threads were specified in the command line, but a
	 * workload was, and in that case we will fill in the thread_map when
	 * we fork the workload in perf_evlist__prepare_workload.
	 */
	if (trace->filter_pids.nr > 0)
		err = perf_evlist__set_filter_pids(evlist, trace->filter_pids.nr, trace->filter_pids.entries);
	else if (thread_map__pid(evlist->threads, 0) == -1)
		err = trace__set_filter_loop_pids(trace);

	if (err < 0)
		goto out_error_mem;

	if (trace->ev_qualifier_ids.nr > 0) {
		err = trace__set_ev_qualifier_filter(trace);
		if (err < 0)
			goto out_errno;

		pr_debug("event qualifier tracepoint filter: %s\n",
			 trace->syscalls.events.sys_exit->filter);
	}

	err = perf_evlist__apply_filters(evlist, &evsel);
	if (err < 0)
		goto out_error_apply_filters;

	err = perf_evlist__mmap(evlist, trace->opts.mmap_pages);
	if (err < 0)
		goto out_error_mmap;

	if (!target__none(&trace->opts.target) && !trace->opts.initial_delay)
		perf_evlist__enable(evlist);

	if (forks)
		perf_evlist__start_workload(evlist);

	if (trace->opts.initial_delay) {
		usleep(trace->opts.initial_delay * 1000);
		perf_evlist__enable(evlist);
	}

	/* Show tids in the output when more than one thread may appear. */
	trace->multiple_threads = thread_map__pid(evlist->threads, 0) == -1 ||
				  evlist->threads->nr > 1 ||
				  perf_evlist__first(evlist)->attr.inherit;

	/*
	 * Now that we already used evsel->attr to ask the kernel to setup the
	 * events, lets reuse evsel->attr.sample_max_stack as the limit in
	 * trace__resolve_callchain(), allowing per-event max-stack settings
	 * to override an explicitely set --max-stack global setting.
	 */
	evlist__for_each_entry(evlist, evsel) {
		if (evsel__has_callchain(evsel) &&
		    evsel->attr.sample_max_stack == 0)
			evsel->attr.sample_max_stack = trace->max_stack;
	}
again:
	before = trace->nr_events;

	/* Drain every ring buffer, one pass; loop back while events arrive. */
	for (i = 0; i < evlist->nr_mmaps; i++) {
		union perf_event *event;
		struct perf_mmap *md;

		md = &evlist->mmap[i];
		if (perf_mmap__read_init(md) < 0)
			continue;

		while ((event = perf_mmap__read_event(md)) != NULL) {
			struct perf_sample sample;

			++trace->nr_events;

			err = perf_evlist__parse_sample(evlist, event, &sample);
			if (err) {
				fprintf(trace->output, "Can't parse sample, err = %d, skipping...\n", err);
				goto next_event;
			}

			trace__handle_event(trace, event, &sample);
next_event:
			perf_mmap__consume(md);

			if (interrupted)
				goto out_disable;

			/* Workload exited: disable and drain what's left. */
			if (done && !draining) {
				perf_evlist__disable(evlist);
				draining = true;
			}
		}
		perf_mmap__read_done(md);
	}

	if (trace->nr_events == before) {
		/* Nothing new this pass: block in poll (bounded when done). */
		int timeout = done ? 100 : -1;

		if (!draining && perf_evlist__poll(evlist, timeout) > 0) {
			if (perf_evlist__filter_pollfd(evlist, POLLERR | POLLHUP) == 0)
				draining = true;

			goto again;
		}
	} else {
		goto again;
	}

out_disable:
	thread__zput(trace->current);

	perf_evlist__disable(evlist);

	if (!err) {
		if (trace->summary)
			trace__fprintf_thread_summary(trace, trace->output);

		if (trace->show_tool_stats) {
			fprintf(trace->output, "Stats:\n "
					       " vfs_getname : %" PRIu64 "\n"
					       " proc_getname: %" PRIu64 "\n",
					       trace->stats.vfs_getname,
					       trace->stats.proc_getname);
		}
	}

out_delete_evlist:
	trace__symbols__exit(trace);

	perf_evlist__delete(evlist);
	cgroup__put(trace->cgroup);
	trace->evlist = NULL;
	trace->live = false;
	return err;
/*
 * Error labels below share a scoped errbuf; the bare block after the
 * return above exists only to hold that buffer for the goto targets.
 */
{
	char errbuf[BUFSIZ];

out_error_sched_stat_runtime:
	tracing_path__strerror_open_tp(errno, errbuf, sizeof(errbuf), "sched", "sched_stat_runtime");
	goto out_error;

out_error_raw_syscalls:
	tracing_path__strerror_open_tp(errno, errbuf, sizeof(errbuf), "raw_syscalls", "sys_(enter|exit)");
	goto out_error;

out_error_mmap:
	perf_evlist__strerror_mmap(evlist, errno, errbuf, sizeof(errbuf));
	goto out_error;

out_error_open:
	perf_evlist__strerror_open(evlist, errno, errbuf, sizeof(errbuf));

out_error:
	fprintf(trace->output, "%s\n", errbuf);
	goto out_delete_evlist;

out_error_apply_filters:
	fprintf(trace->output,
		"Failed to set filter \"%s\" on event %s with %d (%s)\n",
		evsel->filter, perf_evsel__name(evsel), errno,
		str_error_r(errno, errbuf, sizeof(errbuf)));
	goto out_delete_evlist;
}
out_error_mem:
	fprintf(trace->output, "Not enough memory to run!\n");
	goto out_delete_evlist;

out_errno:
	fprintf(trace->output, "errno=%d,%s\n", errno,
		strerror(errno));
	goto out_delete_evlist;
}

/*
 * Replay mode ('perf trace -i perf.data'): set up perf_tool callbacks,
 * attach the syscall and page fault handlers to the evsels found in the
 * recorded session, then process the whole file.
 */
static int trace__replay(struct trace *trace)
{
	const struct perf_evsel_str_handler handlers[] = {
		{ "probe:vfs_getname", trace__vfs_getname, },
	};
	struct perf_data data = {
		.file = {
			.path = input_name,
		},
		.mode = PERF_DATA_MODE_READ,
		.force = trace->force,
	};
	struct perf_session *session;
	struct perf_evsel *evsel;
	int err = -1;

	trace->tool.sample = trace__process_sample;
	trace->tool.mmap = perf_event__process_mmap;
	trace->tool.mmap2 = perf_event__process_mmap2;
	trace->tool.comm = perf_event__process_comm;
	trace->tool.exit = perf_event__process_exit;
	trace->tool.fork = perf_event__process_fork;
	trace->tool.attr = perf_event__process_attr;
	trace->tool.tracing_data = perf_event__process_tracing_data;
	trace->tool.build_id = perf_event__process_build_id;
	trace->tool.namespaces = perf_event__process_namespaces;

	trace->tool.ordered_events = true;
	trace->tool.ordering_requires_timestamps = true;

	/* add tid to output */
	trace->multiple_threads = true;

	session = perf_session__new(&data, false, &trace->tool);
	if (session == NULL)
		return -1;

	if (trace->opts.target.pid)
		symbol_conf.pid_list_str = strdup(trace->opts.target.pid);

	if (trace->opts.target.tid)
		symbol_conf.tid_list_str = strdup(trace->opts.target.tid);

	if (symbol__init(&session->header.env) < 0)
		goto out;

	trace->host = &session->machines.host;

	err = perf_session__set_tracepoints_handlers(session, handlers);
	if (err)
		goto out;

	evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
						     "raw_syscalls:sys_enter");
	/* older kernels have syscalls tp versus raw_syscalls */
	if (evsel == NULL)
		evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
							     "syscalls:sys_enter");

	if (evsel &&
	    (perf_evsel__init_raw_syscall_tp(evsel, trace__sys_enter) < 0 ||
	    perf_evsel__init_sc_tp_ptr_field(evsel, args))) {
		pr_err("Error during initialize raw_syscalls:sys_enter event\n");
		goto out;
	}

	evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
						     "raw_syscalls:sys_exit");
	if (evsel == NULL)
		evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
							     "syscalls:sys_exit");
	if (evsel &&
	    (perf_evsel__init_raw_syscall_tp(evsel, trace__sys_exit) < 0 ||
	    perf_evsel__init_sc_tp_uint_field(evsel, ret))) {
		pr_err("Error during initialize raw_syscalls:sys_exit event\n");
		goto out;
	}

	/* Any recorded page fault software event gets the pgfault handler. */
	evlist__for_each_entry(session->evlist, evsel) {
		if (evsel->attr.type == PERF_TYPE_SOFTWARE &&
		    (evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ ||
		     evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MIN ||
		     evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS))
			evsel->handler = trace__pgfault;
	}

	setup_pager();

	err = perf_session__process_events(session);
	if (err)
		pr_err("Failed to process events, error %d", err);

	else if (trace->summary)
		trace__fprintf_thread_summary(trace, trace->output);

out:
	perf_session__delete(session);

	return err;
}

/* Header line preceding the end-of-run per-thread summary. */
static size_t trace__fprintf_threads_header(FILE *fp)
{
	size_t printed;

	printed = fprintf(fp, "\n Summary of events:\n\n");

	return printed;
}

/*
 * Resort a thread's per-syscall stats (an intlist keyed by syscall id,
 * priv pointing at struct stats) by total time spent, descending.
 */
DEFINE_RESORT_RB(syscall_stats, a->msecs > b->msecs,
	struct stats *stats;
	double msecs;
	int syscall;
)
{
	struct int_node *source = rb_entry(nd, struct int_node, rb_node);
	struct stats *stats = source->priv;

	entry->syscall = source->i;
	entry->stats = stats;
	entry->msecs = stats ?
		       (u64)stats->n * (avg_stats(stats) / NSEC_PER_MSEC) : 0;
}

/*
 * Print one thread's per-syscall stats table (calls, total/min/avg/max time
 * in msec and stddev as % of the mean), sorted by total time descending.
 */
static size_t thread__dump_stats(struct thread_trace *ttrace,
				 struct trace *trace, FILE *fp)
{
	size_t printed = 0;
	struct syscall *sc;
	struct rb_node *nd;
	DECLARE_RESORT_RB_INTLIST(syscall_stats, ttrace->syscall_stats);

	if (syscall_stats == NULL)
		return 0;

	printed += fprintf(fp, "\n");

	printed += fprintf(fp, " syscall calls total min avg max stddev\n");
	printed += fprintf(fp, " (msec) (msec) (msec) (msec) (%%)\n");
	printed += fprintf(fp, " --------------- -------- --------- --------- --------- --------- ------\n");

	resort_rb__for_each_entry(nd, syscall_stats) {
		struct stats *stats = syscall_stats_entry->stats;
		if (stats) {
			double min = (double)(stats->min) / NSEC_PER_MSEC;
			double max = (double)(stats->max) / NSEC_PER_MSEC;
			double avg = avg_stats(stats);
			double pct;
			u64 n = (u64) stats->n;

			/* Relative stddev, computed before avg is rescaled. */
			pct = avg ? 100.0 * stddev_stats(stats)/avg : 0.0;
			avg /= NSEC_PER_MSEC;

			sc = &trace->syscalls.table[syscall_stats_entry->syscall];
			printed += fprintf(fp, " %-15s", sc->name);
			printed += fprintf(fp, " %8" PRIu64 " %9.3f %9.3f %9.3f",
					   n, syscall_stats_entry->msecs, min, avg);
			printed += fprintf(fp, " %9.3f %9.2f%%\n", max, pct);
		}
	}

	resort_rb__delete(syscall_stats);
	printed += fprintf(fp, "\n\n");

	return printed;
}

/* One summary line per thread: comm, event count/share, faults, runtime. */
static size_t trace__fprintf_thread(FILE *fp, struct thread *thread, struct trace *trace)
{
	size_t printed = 0;
	struct thread_trace *ttrace = thread__priv(thread);
	double ratio;

	if (ttrace == NULL)
		return 0;

	/* This thread's share of all events seen in the session. */
	ratio = (double)ttrace->nr_events / trace->nr_events * 100.0;

	printed += fprintf(fp, " %s (%d), ", thread__comm_str(thread), thread->tid);
	printed += fprintf(fp, "%lu events, ", ttrace->nr_events);
	printed += fprintf(fp, "%.1f%%", ratio);
	if (ttrace->pfmaj)
		printed += fprintf(fp, ", %lu majfaults", ttrace->pfmaj);
	if (ttrace->pfmin)
		printed += fprintf(fp, ", %lu minfaults", ttrace->pfmin);
	if (trace->sched)
		printed += fprintf(fp, ", %.3f msec\n", ttrace->runtime_ms);
	else if (fputc('\n', fp) != EOF)
		++printed;

	printed += thread__dump_stats(ttrace, trace, fp);

	return printed;
}

/* NULL-tolerant event counter, used by the thread resort comparator. */
static unsigned long thread__nr_events(struct thread_trace *ttrace)
{
	return ttrace ?
		       ttrace->nr_events : 0;
}

/* Resort threads by number of events; busiest thread first. */
DEFINE_RESORT_RB(threads, (thread__nr_events(a->thread->priv) < thread__nr_events(b->thread->priv)),
	struct thread *thread;
)
{
	entry->thread = rb_entry(nd, struct thread, rb_node);
}

/* Print the full per-thread summary, one machine thread-table bucket at a time. */
static size_t trace__fprintf_thread_summary(struct trace *trace, FILE *fp)
{
	size_t printed = trace__fprintf_threads_header(fp);
	struct rb_node *nd;
	int i;

	for (i = 0; i < THREADS__TABLE_SIZE; i++) {
		DECLARE_RESORT_RB_MACHINE_THREADS(threads, trace->host, i);

		if (threads == NULL) {
			fprintf(fp, "%s", "Error sorting output by nr_events!\n");
			return 0;
		}

		resort_rb__for_each_entry(nd, threads)
			printed += trace__fprintf_thread(fp, threads_entry->thread, trace);

		resort_rb__delete(threads);
	}
	return printed;
}

/* --duration parser: minimum syscall duration (ms) to show. */
static int trace__set_duration(const struct option *opt, const char *str,
			       int unset __maybe_unused)
{
	struct trace *trace = opt->value;

	trace->duration_filter = atof(str);
	return 0;
}

/*
 * --filter-pids parser: CSV pid list; our own pid is always prepended so we
 * don't trace ourselves.
 */
static int trace__set_filter_pids(const struct option *opt, const char *str,
				  int unset __maybe_unused)
{
	int ret = -1;
	size_t i;
	struct trace *trace = opt->value;
	/*
	 * FIXME: introduce a intarray class, plain parse csv and create a
	 * { int nr, int entries[] } struct...
	 */
	struct intlist *list = intlist__new(str);

	if (list == NULL)
		return -1;

	/* +1 slot for getpid() at entries[0]. */
	i = trace->filter_pids.nr = intlist__nr_entries(list) + 1;
	trace->filter_pids.entries = calloc(i, sizeof(pid_t));

	if (trace->filter_pids.entries == NULL)
		goto out;

	trace->filter_pids.entries[0] = getpid();

	for (i = 1; i < trace->filter_pids.nr; ++i)
		trace->filter_pids.entries[i] = intlist__entry(list, i - 1)->i;

	intlist__delete(list);
	ret = 0;
out:
	return ret;
}

/* -o: open 'filename' for output, rotating any old copy to "<name>.old". */
static int trace__open_output(struct trace *trace, const char *filename)
{
	struct stat st;

	if (!stat(filename, &st) && st.st_size) {
		char oldname[PATH_MAX];

		scnprintf(oldname, sizeof(oldname), "%s.old", filename);
		unlink(oldname);
		rename(filename, oldname);
	}

	trace->output = fopen(filename, "w");

	return trace->output == NULL ? -errno : 0;
}

/* -F/--pf parser: "all"/"maj"/"min" -> TRACE_PFMAJ/TRACE_PFMIN bits. */
static int parse_pagefaults(const struct option *opt, const char *str,
			    int unset __maybe_unused)
{
	int *trace_pgfaults = opt->value;

	if (strcmp(str, "all") == 0)
		*trace_pgfaults |= TRACE_PFMAJ | TRACE_PFMIN;
	else if (strcmp(str, "maj") == 0)
		*trace_pgfaults |= TRACE_PFMAJ;
	else if (strcmp(str, "min") == 0)
		*trace_pgfaults |= TRACE_PFMIN;
	else
		return -1;

	return 0;
}

/* Set the same sample handler on every evsel in the list. */
static void evlist__set_evsel_handler(struct perf_evlist *evlist, void *handler)
{
	struct perf_evsel *evsel;

	evlist__for_each_entry(evlist, evsel)
		evsel->handler = handler;
}

/*
 * For syscalls:* tracepoints selected via -e, initialize the id/args/ret
 * field accessors so they can be beautified like raw_syscalls ones.
 */
static int evlist__set_syscall_tp_fields(struct perf_evlist *evlist)
{
	struct perf_evsel *evsel;

	evlist__for_each_entry(evlist, evsel) {
		if (evsel->priv || !evsel->tp_format)
			continue;

		if (strcmp(evsel->tp_format->system, "syscalls"))
			continue;

		if (perf_evsel__init_syscall_tp(evsel))
			return -1;

		if (!strncmp(evsel->tp_format->name, "sys_enter_", 10)) {
			struct syscall_tp *sc = evsel->priv;

			/* args array starts right after the u64 syscall id. */
			if (__tp_field__init_ptr(&sc->args, sc->id.offset + sizeof(u64)))
				return -1;
		} else if (!strncmp(evsel->tp_format->name, "sys_exit_", 9)) {
			struct syscall_tp *sc = evsel->priv;

			if (__tp_field__init_uint(&sc->ret, sizeof(u64), sc->id.offset + sizeof(u64), evsel->needs_swap))
				return -1;
		}
	}

	return 0;
}

/*
 * XXX: Hackish, just splitting the combined -e+--event (syscalls
 * (raw_syscalls:{sys_{enter,exit}} + events (tracepoints, HW, SW, etc) to use
 * existing facilities unchanged (trace->ev_qualifier + parse_options()).
 *
 * It'd be better to introduce a parse_options() variant that would return a
 * list with the terms it didn't match to an event...
 */
static int trace__parse_events_option(const struct option *opt, const char *str,
				      int unset __maybe_unused)
{
	struct trace *trace = (struct trace *)opt->value;
	const char *s = str;
	char *sep = NULL, *lists[2] = { NULL, NULL, };
	int len = strlen(str) + 1, err = -1, list, idx;
	char *strace_groups_dir = system_path(STRACE_GROUPS_DIR);
	char group_name[PATH_MAX];

	if (strace_groups_dir == NULL)
		return -1;

	/* Leading '!' negates the whole syscall qualifier list. */
	if (*s == '!') {
		++s;
		trace->not_ev_qualifier = true;
	}

	while (1) {
		if ((sep = strchr(s, ',')) != NULL)
			*sep = '\0';

		/*
		 * lists[1] collects syscall names and strace group files,
		 * lists[0] everything else (regular perf events).
		 */
		list = 0;
		if (syscalltbl__id(trace->sctbl, s) >= 0 ||
		    syscalltbl__strglobmatch_first(trace->sctbl, s, &idx) >= 0) {
			list = 1;
		} else {
			path__join(group_name, sizeof(group_name), strace_groups_dir, s);
			if (access(group_name, R_OK) == 0)
				list = 1;
		}

		if (lists[list]) {
			sprintf(lists[list] + strlen(lists[list]), ",%s", s);
		} else {
			/* len bounds the whole original string, so this fits. */
			lists[list] = malloc(len);
			if (lists[list] == NULL)
				goto out;
			strcpy(lists[list], s);
		}

		if (!sep)
			break;

		*sep = ',';
		s = sep + 1;
	}

	/* NOTE(review): lists[0]/lists[1] are never freed - small one-shot leak. */
	if (lists[1] != NULL) {
		struct strlist_config slist_config = {
			.dirname = strace_groups_dir,
		};

		trace->ev_qualifier = strlist__new(lists[1], &slist_config);
		if (trace->ev_qualifier == NULL) {
			fputs("Not enough memory to parse event qualifier", trace->output);
			goto out;
		}

		if (trace__validate_ev_qualifier(trace))
			goto out;
		trace->trace_syscalls = true;
	}

	err = 0;

	if (lists[0]) {
		struct option o = OPT_CALLBACK('e', "event", &trace->evlist, "event",
					       "event selector. use 'perf list' to list available events",
					       parse_events_option);
		err = parse_events_option(&o, lists[0], 0);
	}
out:
	if (sep)
		*sep = ',';

	return err;
}

/*
 * -G/--cgroup: before any -e, set the default cgroup for all events;
 * afterwards, defer to parse_cgroups() for the events parsed so far.
 */
static int trace__parse_cgroups(const struct option *opt, const char *str, int unset)
{
	struct trace *trace = opt->value;

	if (!list_empty(&trace->evlist->entries))
		return parse_cgroups(opt, str, unset);

	trace->cgroup = evlist__findnew_cgroup(trace->evlist, str);

	return 0;
}

int cmd_trace(int argc, const char **argv)
{
	const char *trace_usage[] = {
		"perf trace [<options>] [<command>]",
		"perf trace [<options>] -- <command> [<options>]",
		"perf trace record [<options>] [<command>]",
		"perf trace record [<options>] -- <command> [<options>]",
		NULL
	};
	struct trace trace = {
		.syscalls = {
			.
max = -1, 3158 }, 3159 .opts = { 3160 .target = { 3161 .uid = UINT_MAX, 3162 .uses_mmap = true, 3163 }, 3164 .user_freq = UINT_MAX, 3165 .user_interval = ULLONG_MAX, 3166 .no_buffering = true, 3167 .mmap_pages = UINT_MAX, 3168 .proc_map_timeout = 500, 3169 }, 3170 .output = stderr, 3171 .show_comm = true, 3172 .trace_syscalls = false, 3173 .kernel_syscallchains = false, 3174 .max_stack = UINT_MAX, 3175 }; 3176 const char *output_name = NULL; 3177 const struct option trace_options[] = { 3178 OPT_CALLBACK('e', "event", &trace, "event", 3179 "event/syscall selector. use 'perf list' to list available events", 3180 trace__parse_events_option), 3181 OPT_BOOLEAN(0, "comm", &trace.show_comm, 3182 "show the thread COMM next to its id"), 3183 OPT_BOOLEAN(0, "tool_stats", &trace.show_tool_stats, "show tool stats"), 3184 OPT_CALLBACK(0, "expr", &trace, "expr", "list of syscalls/events to trace", 3185 trace__parse_events_option), 3186 OPT_STRING('o', "output", &output_name, "file", "output file name"), 3187 OPT_STRING('i', "input", &input_name, "file", "Analyze events in file"), 3188 OPT_STRING('p', "pid", &trace.opts.target.pid, "pid", 3189 "trace events on existing process id"), 3190 OPT_STRING('t', "tid", &trace.opts.target.tid, "tid", 3191 "trace events on existing thread id"), 3192 OPT_CALLBACK(0, "filter-pids", &trace, "CSV list of pids", 3193 "pids to filter (by the kernel)", trace__set_filter_pids), 3194 OPT_BOOLEAN('a', "all-cpus", &trace.opts.target.system_wide, 3195 "system-wide collection from all CPUs"), 3196 OPT_STRING('C', "cpu", &trace.opts.target.cpu_list, "cpu", 3197 "list of cpus to monitor"), 3198 OPT_BOOLEAN(0, "no-inherit", &trace.opts.no_inherit, 3199 "child tasks do not inherit counters"), 3200 OPT_CALLBACK('m', "mmap-pages", &trace.opts.mmap_pages, "pages", 3201 "number of mmap data pages", 3202 perf_evlist__parse_mmap_pages), 3203 OPT_STRING('u', "uid", &trace.opts.target.uid_str, "user", 3204 "user to profile"), 3205 OPT_CALLBACK(0, "duration", 
&trace, "float", 3206 "show only events with duration > N.M ms", 3207 trace__set_duration), 3208 OPT_BOOLEAN(0, "sched", &trace.sched, "show blocking scheduler events"), 3209 OPT_INCR('v', "verbose", &verbose, "be more verbose"), 3210 OPT_BOOLEAN('T', "time", &trace.full_time, 3211 "Show full timestamp, not time relative to first start"), 3212 OPT_BOOLEAN(0, "failure", &trace.failure_only, 3213 "Show only syscalls that failed"), 3214 OPT_BOOLEAN('s', "summary", &trace.summary_only, 3215 "Show only syscall summary with statistics"), 3216 OPT_BOOLEAN('S', "with-summary", &trace.summary, 3217 "Show all syscalls and summary with statistics"), 3218 OPT_CALLBACK_DEFAULT('F', "pf", &trace.trace_pgfaults, "all|maj|min", 3219 "Trace pagefaults", parse_pagefaults, "maj"), 3220 OPT_BOOLEAN(0, "syscalls", &trace.trace_syscalls, "Trace syscalls"), 3221 OPT_BOOLEAN('f', "force", &trace.force, "don't complain, do it"), 3222 OPT_CALLBACK(0, "call-graph", &trace.opts, 3223 "record_mode[,record_size]", record_callchain_help, 3224 &record_parse_callchain_opt), 3225 OPT_BOOLEAN(0, "kernel-syscall-graph", &trace.kernel_syscallchains, 3226 "Show the kernel callchains on the syscall exit path"), 3227 OPT_UINTEGER(0, "min-stack", &trace.min_stack, 3228 "Set the minimum stack depth when parsing the callchain, " 3229 "anything below the specified depth will be ignored."), 3230 OPT_UINTEGER(0, "max-stack", &trace.max_stack, 3231 "Set the maximum stack depth when parsing the callchain, " 3232 "anything beyond the specified depth will be ignored. 
" 3233 "Default: kernel.perf_event_max_stack or " __stringify(PERF_MAX_STACK_DEPTH)), 3234 OPT_BOOLEAN(0, "print-sample", &trace.print_sample, 3235 "print the PERF_RECORD_SAMPLE PERF_SAMPLE_ info, for debugging"), 3236 OPT_UINTEGER(0, "proc-map-timeout", &trace.opts.proc_map_timeout, 3237 "per thread proc mmap processing timeout in ms"), 3238 OPT_CALLBACK('G', "cgroup", &trace, "name", "monitor event in cgroup name only", 3239 trace__parse_cgroups), 3240 OPT_UINTEGER('D', "delay", &trace.opts.initial_delay, 3241 "ms to wait before starting measurement after program " 3242 "start"), 3243 OPT_END() 3244 }; 3245 bool __maybe_unused max_stack_user_set = true; 3246 bool mmap_pages_user_set = true; 3247 struct perf_evsel *evsel; 3248 const char * const trace_subcommands[] = { "record", NULL }; 3249 int err = -1; 3250 char bf[BUFSIZ]; 3251 3252 signal(SIGSEGV, sighandler_dump_stack); 3253 signal(SIGFPE, sighandler_dump_stack); 3254 3255 trace.evlist = perf_evlist__new(); 3256 trace.sctbl = syscalltbl__new(); 3257 3258 if (trace.evlist == NULL || trace.sctbl == NULL) { 3259 pr_err("Not enough memory to run!\n"); 3260 err = -ENOMEM; 3261 goto out; 3262 } 3263 3264 argc = parse_options_subcommand(argc, argv, trace_options, trace_subcommands, 3265 trace_usage, PARSE_OPT_STOP_AT_NON_OPTION); 3266 3267 if ((nr_cgroups || trace.cgroup) && !trace.opts.target.system_wide) { 3268 usage_with_options_msg(trace_usage, trace_options, 3269 "cgroup monitoring only available in system-wide mode"); 3270 } 3271 3272 evsel = bpf__setup_output_event(trace.evlist, "__augmented_syscalls__"); 3273 if (IS_ERR(evsel)) { 3274 bpf__strerror_setup_output_event(trace.evlist, PTR_ERR(evsel), bf, sizeof(bf)); 3275 pr_err("ERROR: Setup trace syscalls enter failed: %s\n", bf); 3276 goto out; 3277 } 3278 3279 if (evsel) { 3280 if (perf_evsel__init_augmented_syscall_tp(evsel) || 3281 perf_evsel__init_augmented_syscall_tp_args(evsel)) 3282 goto out; 3283 trace.syscalls.events.augmented = evsel; 3284 } 3285 
3286 err = bpf__setup_stdout(trace.evlist); 3287 if (err) { 3288 bpf__strerror_setup_stdout(trace.evlist, err, bf, sizeof(bf)); 3289 pr_err("ERROR: Setup BPF stdout failed: %s\n", bf); 3290 goto out; 3291 } 3292 3293 err = -1; 3294 3295 if (trace.trace_pgfaults) { 3296 trace.opts.sample_address = true; 3297 trace.opts.sample_time = true; 3298 } 3299 3300 if (trace.opts.mmap_pages == UINT_MAX) 3301 mmap_pages_user_set = false; 3302 3303 if (trace.max_stack == UINT_MAX) { 3304 trace.max_stack = input_name ? PERF_MAX_STACK_DEPTH : sysctl__max_stack(); 3305 max_stack_user_set = false; 3306 } 3307 3308 #ifdef HAVE_DWARF_UNWIND_SUPPORT 3309 if ((trace.min_stack || max_stack_user_set) && !callchain_param.enabled) { 3310 record_opts__parse_callchain(&trace.opts, &callchain_param, "dwarf", false); 3311 } 3312 #endif 3313 3314 if (callchain_param.enabled) { 3315 if (!mmap_pages_user_set && geteuid() == 0) 3316 trace.opts.mmap_pages = perf_event_mlock_kb_in_pages() * 4; 3317 3318 symbol_conf.use_callchain = true; 3319 } 3320 3321 if (trace.evlist->nr_entries > 0) { 3322 evlist__set_evsel_handler(trace.evlist, trace__event_handler); 3323 if (evlist__set_syscall_tp_fields(trace.evlist)) { 3324 perror("failed to set syscalls:* tracepoint fields"); 3325 goto out; 3326 } 3327 } 3328 3329 if ((argc >= 1) && (strcmp(argv[0], "record") == 0)) 3330 return trace__record(&trace, argc-1, &argv[1]); 3331 3332 /* summary_only implies summary option, but don't overwrite summary if set */ 3333 if (trace.summary_only) 3334 trace.summary = trace.summary_only; 3335 3336 if (!trace.trace_syscalls && !trace.trace_pgfaults && 3337 trace.evlist->nr_entries == 0 /* Was --events used? 
*/) { 3338 trace.trace_syscalls = true; 3339 } 3340 3341 if (output_name != NULL) { 3342 err = trace__open_output(&trace, output_name); 3343 if (err < 0) { 3344 perror("failed to create output file"); 3345 goto out; 3346 } 3347 } 3348 3349 err = target__validate(&trace.opts.target); 3350 if (err) { 3351 target__strerror(&trace.opts.target, err, bf, sizeof(bf)); 3352 fprintf(trace.output, "%s", bf); 3353 goto out_close; 3354 } 3355 3356 err = target__parse_uid(&trace.opts.target); 3357 if (err) { 3358 target__strerror(&trace.opts.target, err, bf, sizeof(bf)); 3359 fprintf(trace.output, "%s", bf); 3360 goto out_close; 3361 } 3362 3363 if (!argc && target__none(&trace.opts.target)) 3364 trace.opts.target.system_wide = true; 3365 3366 if (input_name) 3367 err = trace__replay(&trace); 3368 else 3369 err = trace__run(&trace, argc, argv); 3370 3371 out_close: 3372 if (output_name != NULL) 3373 fclose(trace.output); 3374 out: 3375 return err; 3376 } 3377