1 #include <traceevent/event-parse.h> 2 #include "builtin.h" 3 #include "util/color.h" 4 #include "util/debug.h" 5 #include "util/evlist.h" 6 #include "util/machine.h" 7 #include "util/session.h" 8 #include "util/thread.h" 9 #include "util/parse-options.h" 10 #include "util/strlist.h" 11 #include "util/intlist.h" 12 #include "util/thread_map.h" 13 14 #include <libaudit.h> 15 #include <stdlib.h> 16 #include <sys/mman.h> 17 #include <linux/futex.h> 18 19 /* For older distros: */ 20 #ifndef MAP_STACK 21 # define MAP_STACK 0x20000 22 #endif 23 24 #ifndef MADV_HWPOISON 25 # define MADV_HWPOISON 100 26 #endif 27 28 #ifndef MADV_MERGEABLE 29 # define MADV_MERGEABLE 12 30 #endif 31 32 #ifndef MADV_UNMERGEABLE 33 # define MADV_UNMERGEABLE 13 34 #endif 35 36 static size_t syscall_arg__scnprintf_hex(char *bf, size_t size, 37 unsigned long arg, 38 u8 arg_idx __maybe_unused, 39 u8 *arg_mask __maybe_unused) 40 { 41 return scnprintf(bf, size, "%#lx", arg); 42 } 43 44 #define SCA_HEX syscall_arg__scnprintf_hex 45 46 static size_t syscall_arg__scnprintf_whence(char *bf, size_t size, 47 unsigned long arg, 48 u8 arg_idx __maybe_unused, 49 u8 *arg_mask __maybe_unused) 50 { 51 int whence = arg; 52 53 switch (whence) { 54 #define P_WHENCE(n) case SEEK_##n: return scnprintf(bf, size, #n) 55 P_WHENCE(SET); 56 P_WHENCE(CUR); 57 P_WHENCE(END); 58 #ifdef SEEK_DATA 59 P_WHENCE(DATA); 60 #endif 61 #ifdef SEEK_HOLE 62 P_WHENCE(HOLE); 63 #endif 64 #undef P_WHENCE 65 default: break; 66 } 67 68 return scnprintf(bf, size, "%#x", whence); 69 } 70 71 #define SCA_WHENCE syscall_arg__scnprintf_whence 72 73 static size_t syscall_arg__scnprintf_mmap_prot(char *bf, size_t size, 74 unsigned long arg, 75 u8 arg_idx __maybe_unused, 76 u8 *arg_mask __maybe_unused) 77 { 78 int printed = 0, prot = arg; 79 80 if (prot == PROT_NONE) 81 return scnprintf(bf, size, "NONE"); 82 #define P_MMAP_PROT(n) \ 83 if (prot & PROT_##n) { \ 84 printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \ 85 prot &= ~PROT_##n; \ 86 } 87 88 P_MMAP_PROT(EXEC); 89 P_MMAP_PROT(READ); 90 P_MMAP_PROT(WRITE); 91 #ifdef PROT_SEM 92 P_MMAP_PROT(SEM); 93 #endif 94 P_MMAP_PROT(GROWSDOWN); 95 P_MMAP_PROT(GROWSUP); 96 #undef P_MMAP_PROT 97 98 if (prot) 99 printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", prot); 100 101 return printed; 102 } 103 104 #define SCA_MMAP_PROT syscall_arg__scnprintf_mmap_prot 105 106 static size_t syscall_arg__scnprintf_mmap_flags(char *bf, size_t size, 107 unsigned long arg, u8 arg_idx __maybe_unused, 108 u8 *arg_mask __maybe_unused) 109 { 110 int printed = 0, flags = arg; 111 112 #define P_MMAP_FLAG(n) \ 113 if (flags & MAP_##n) { \ 114 printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \ 115 flags &= ~MAP_##n; \ 116 } 117 118 P_MMAP_FLAG(SHARED); 119 P_MMAP_FLAG(PRIVATE); 120 #ifdef MAP_32BIT 121 P_MMAP_FLAG(32BIT); 122 #endif 123 P_MMAP_FLAG(ANONYMOUS); 124 P_MMAP_FLAG(DENYWRITE); 125 P_MMAP_FLAG(EXECUTABLE); 126 P_MMAP_FLAG(FILE); 127 P_MMAP_FLAG(FIXED); 128 P_MMAP_FLAG(GROWSDOWN); 129 #ifdef MAP_HUGETLB 130 P_MMAP_FLAG(HUGETLB); 131 #endif 132 P_MMAP_FLAG(LOCKED); 133 P_MMAP_FLAG(NONBLOCK); 134 P_MMAP_FLAG(NORESERVE); 135 P_MMAP_FLAG(POPULATE); 136 P_MMAP_FLAG(STACK); 137 #ifdef MAP_UNINITIALIZED 138 P_MMAP_FLAG(UNINITIALIZED); 139 #endif 140 #undef P_MMAP_FLAG 141 142 if (flags) 143 printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags); 144 145 return printed; 146 } 147 148 #define SCA_MMAP_FLAGS syscall_arg__scnprintf_mmap_flags 149 150 static size_t syscall_arg__scnprintf_madvise_behavior(char *bf, size_t size, 151 unsigned long arg, u8 arg_idx __maybe_unused, 152 u8 *arg_mask __maybe_unused) 153 { 154 int behavior = arg; 155 156 switch (behavior) { 157 #define P_MADV_BHV(n) case MADV_##n: return scnprintf(bf, size, #n) 158 P_MADV_BHV(NORMAL); 159 P_MADV_BHV(RANDOM); 160 P_MADV_BHV(SEQUENTIAL); 161 P_MADV_BHV(WILLNEED); 162 P_MADV_BHV(DONTNEED); 163 P_MADV_BHV(REMOVE); 164 P_MADV_BHV(DONTFORK); 165 P_MADV_BHV(DOFORK); 166 P_MADV_BHV(HWPOISON); 167 #ifdef MADV_SOFT_OFFLINE 168 P_MADV_BHV(SOFT_OFFLINE); 169 #endif 170 P_MADV_BHV(MERGEABLE); 171 P_MADV_BHV(UNMERGEABLE); 172 #ifdef MADV_HUGEPAGE 173 P_MADV_BHV(HUGEPAGE); 174 #endif 175 #ifdef MADV_NOHUGEPAGE 176 P_MADV_BHV(NOHUGEPAGE); 177 #endif 178 #ifdef MADV_DONTDUMP 179 P_MADV_BHV(DONTDUMP); 180 #endif 181 #ifdef MADV_DODUMP 182 P_MADV_BHV(DODUMP); 183 #endif 184 #undef P_MADV_PHV 185 default: break; 186 } 187 188 return scnprintf(bf, size, "%#x", behavior); 189 } 190 191 #define SCA_MADV_BHV syscall_arg__scnprintf_madvise_behavior 192 193 static size_t syscall_arg__scnprintf_futex_op(char *bf, size_t size, unsigned long arg, 194 u8 arg_idx __maybe_unused, u8 *arg_mask) 195 { 196 enum syscall_futex_args { 197 SCF_UADDR = (1 << 0), 198 SCF_OP = (1 << 1), 199 SCF_VAL = (1 << 2), 200 SCF_TIMEOUT = (1 << 3), 201 SCF_UADDR2 = (1 << 4), 202 SCF_VAL3 = (1 << 5), 203 }; 204 int op = arg; 205 int cmd = op & FUTEX_CMD_MASK; 206 size_t printed = 0; 207 208 switch (cmd) { 209 #define P_FUTEX_OP(n) case FUTEX_##n: printed = scnprintf(bf, size, #n); 210 P_FUTEX_OP(WAIT); *arg_mask |= SCF_VAL3|SCF_UADDR2; break; 211 P_FUTEX_OP(WAKE); *arg_mask |= SCF_VAL3|SCF_UADDR2|SCF_TIMEOUT; break; 212 P_FUTEX_OP(FD); *arg_mask |= SCF_VAL3|SCF_UADDR2|SCF_TIMEOUT; break; 213 P_FUTEX_OP(REQUEUE); *arg_mask |= SCF_VAL3|SCF_TIMEOUT; break; 214 P_FUTEX_OP(CMP_REQUEUE); *arg_mask |= SCF_TIMEOUT; break; 215 P_FUTEX_OP(CMP_REQUEUE_PI); *arg_mask |= SCF_TIMEOUT; break; 216 P_FUTEX_OP(WAKE_OP); break; 217 P_FUTEX_OP(LOCK_PI); *arg_mask |= SCF_VAL3|SCF_UADDR2|SCF_TIMEOUT; break; 218 P_FUTEX_OP(UNLOCK_PI); *arg_mask |= SCF_VAL3|SCF_UADDR2|SCF_TIMEOUT; break; 219 P_FUTEX_OP(TRYLOCK_PI); *arg_mask |= SCF_VAL3|SCF_UADDR2; break; 220 P_FUTEX_OP(WAIT_BITSET); *arg_mask |= SCF_UADDR2; break; 221 P_FUTEX_OP(WAKE_BITSET); *arg_mask |= SCF_UADDR2; break; 222 P_FUTEX_OP(WAIT_REQUEUE_PI); break; 223 default: printed = scnprintf(bf, size, "%#x", cmd); break; 224 } 225 226 if (op & FUTEX_PRIVATE_FLAG) 227 printed += scnprintf(bf + printed, size - printed, "|PRIV"); 228 229 if (op & FUTEX_CLOCK_REALTIME) 230 printed += scnprintf(bf + printed, size - printed, "|CLKRT"); 231 232 return printed; 233 } 234 235 #define SCA_FUTEX_OP syscall_arg__scnprintf_futex_op 236 237 static size_t syscall_arg__scnprintf_open_flags(char *bf, size_t size, 238 unsigned long arg, 239 u8 arg_idx, u8 *arg_mask) 240 { 241 int printed = 0, flags = arg; 242 243 if (!(flags & O_CREAT)) 244 *arg_mask |= 1 << (arg_idx + 1); /* Mask the mode parm */ 245 246 if (flags == 0) 247 return scnprintf(bf, size, "RDONLY"); 248 #define P_FLAG(n) \ 249 if (flags & O_##n) { \ 250 printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \ 251 flags &= ~O_##n; \ 252 } 253 254 P_FLAG(APPEND); 255 P_FLAG(ASYNC); 256 P_FLAG(CLOEXEC); 257 P_FLAG(CREAT); 258 P_FLAG(DIRECT); 259 P_FLAG(DIRECTORY); 260 P_FLAG(EXCL); 261 P_FLAG(LARGEFILE); 262 P_FLAG(NOATIME); 263 P_FLAG(NOCTTY); 264 #ifdef O_NONBLOCK 265 P_FLAG(NONBLOCK); 266 #elif O_NDELAY 267 P_FLAG(NDELAY); 268 #endif 269 #ifdef O_PATH 270 P_FLAG(PATH); 271 #endif 272 P_FLAG(RDWR); 273 #ifdef O_DSYNC 274 if ((flags & O_SYNC) == O_SYNC) 275 printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", "SYNC"); 276 else { 277 P_FLAG(DSYNC); 278 } 279 #else 280 P_FLAG(SYNC); 281 #endif 282 P_FLAG(TRUNC); 283 P_FLAG(WRONLY); 284 #undef P_FLAG 285 286 if (flags) 287 printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags); 288 289 return printed; 290 } 291 292 #define SCA_OPEN_FLAGS syscall_arg__scnprintf_open_flags 293 294 static struct syscall_fmt { 295 const char *name; 296 const char *alias; 297 size_t (*arg_scnprintf[6])(char *bf, size_t size, unsigned long arg, u8 arg_idx, u8 *arg_mask); 298 bool errmsg; 299 bool timeout; 300 bool hexret; 301 } syscall_fmts[] = { 302 { .name = "access", .errmsg = true, }, 303 { .name = "arch_prctl", .errmsg = true, .alias = "prctl", }, 304 { .name = "brk", .hexret = true, 305 .arg_scnprintf = { [0] = SCA_HEX, /* brk */ }, }, 306 { .name = "mmap", .hexret = true, }, 307 { .name = "connect", .errmsg = true, }, 308 { .name = "fstat", .errmsg = true, .alias = "newfstat", }, 309 { .name = "fstatat", .errmsg = true, .alias = "newfstatat", }, 310 { .name = "futex", .errmsg = true, 311 .arg_scnprintf = { [1] = SCA_FUTEX_OP, /* op */ }, }, 312 { .name = "ioctl", .errmsg = true, 313 .arg_scnprintf = { [2] = SCA_HEX, /* arg */ }, }, 314 { .name = "lseek", .errmsg = true, 315 .arg_scnprintf = { [2] = SCA_WHENCE, /* whence */ }, }, 316 { .name = "lstat", .errmsg = true, .alias = "newlstat", }, 317 { .name = "madvise", .errmsg = true, 318 .arg_scnprintf = { [0] = SCA_HEX, /* start */ 319 [2] = SCA_MADV_BHV, /* behavior */ }, }, 320 { .name = "mmap", .hexret = true, 321 .arg_scnprintf = { [0] = SCA_HEX, /* addr */ 322 [2] = SCA_MMAP_PROT, /* prot */ 323 [3] = SCA_MMAP_FLAGS, /* flags */ }, }, 324 { .name = "mprotect", .errmsg = true, 325 .arg_scnprintf = { [0] = SCA_HEX, /* start */ 326 [2] = SCA_MMAP_PROT, /* prot */ }, }, 327 { .name = "mremap", .hexret = true, 328 .arg_scnprintf = { [0] = SCA_HEX, /* addr */ 329 [4] = SCA_HEX, /* new_addr */ }, }, 330 { .name = "munmap", .errmsg = true, 331 .arg_scnprintf = { [0] = SCA_HEX, /* addr */ }, }, 332 { .name = "open", .errmsg = true, 333 .arg_scnprintf = { [1] = SCA_OPEN_FLAGS, /* flags */ }, }, 334 { .name = "open_by_handle_at", .errmsg = true, 335 .arg_scnprintf = { [2] = SCA_OPEN_FLAGS, /* flags */ }, }, 336 { .name = "openat", .errmsg = true, 337 .arg_scnprintf = { [2] = SCA_OPEN_FLAGS, /* flags */ }, }, 338 { .name = "poll", .errmsg = true, .timeout = true, }, 339 { .name = "ppoll", .errmsg = true, .timeout = true, }, 340 { .name = "pread", .errmsg = true, .alias = "pread64", }, 341 { .name = "pwrite", .errmsg = true, .alias = "pwrite64", }, 342 { .name = "read", .errmsg = true, }, 343 { .name = "recvfrom", .errmsg = true, }, 344 { .name = "select", .errmsg = true, .timeout = true, }, 345 { .name = "socket", .errmsg = true, }, 346 { .name = "stat", .errmsg = true, .alias = "newstat", }, 347 { .name = "uname", .errmsg = true, .alias = "newuname", }, 348 }; 349 350 static int syscall_fmt__cmp(const void *name, const void *fmtp) 351 { 352 const struct syscall_fmt *fmt = fmtp; 353 return strcmp(name, fmt->name); 354 } 355 356 static struct syscall_fmt *syscall_fmt__find(const char *name) 357 { 358 const int nmemb = ARRAY_SIZE(syscall_fmts); 359 return bsearch(name, syscall_fmts, nmemb, sizeof(struct syscall_fmt), syscall_fmt__cmp); 360 } 361 362 struct syscall { 363 struct event_format *tp_format; 364 const char *name; 365 bool filtered; 366 struct syscall_fmt *fmt; 367 size_t (**arg_scnprintf)(char *bf, size_t size, 368 unsigned long arg, u8 arg_idx, u8 *args_mask); 369 }; 370 371 static size_t fprintf_duration(unsigned long t, FILE *fp) 372 { 373 double duration = (double)t / NSEC_PER_MSEC; 374 size_t printed = fprintf(fp, "("); 375 376 if (duration >= 1.0) 377 printed += color_fprintf(fp, PERF_COLOR_RED, "%6.3f ms", duration); 378 else if (duration >= 0.01) 379 printed += color_fprintf(fp, PERF_COLOR_YELLOW, "%6.3f ms", duration); 380 else 381 printed += color_fprintf(fp, PERF_COLOR_NORMAL, "%6.3f ms", duration); 382 return printed + fprintf(fp, "): "); 383 } 384 385 struct thread_trace { 386 u64 entry_time; 387 u64 exit_time; 388 bool entry_pending; 389 unsigned long nr_events; 390 char *entry_str; 391 double runtime_ms; 392 }; 393 394 static struct thread_trace *thread_trace__new(void) 395 { 396 return zalloc(sizeof(struct thread_trace)); 397 } 398 399 static struct thread_trace *thread__trace(struct thread *thread, FILE *fp) 400 { 401 struct thread_trace *ttrace; 402 403 if (thread == NULL) 404 goto fail; 405 406 if (thread->priv == NULL) 407 thread->priv = thread_trace__new(); 408 409 if (thread->priv == NULL) 410 goto fail; 411 412 ttrace = thread->priv; 413 ++ttrace->nr_events; 414 415 return ttrace; 416 fail: 417 color_fprintf(fp, PERF_COLOR_RED, 418 "WARNING: not enough memory, dropping samples!\n"); 419 return NULL; 420 } 421 422 struct trace { 423 struct perf_tool tool; 424 int audit_machine; 425 struct { 426 int max; 427 struct syscall *table; 428 } syscalls; 429 struct perf_record_opts opts; 430 struct machine host; 431 u64 base_time; 432 FILE *output; 433 unsigned long nr_events; 434 struct strlist *ev_qualifier; 435 bool not_ev_qualifier; 436 struct intlist *tid_list; 437 struct intlist *pid_list; 438 bool sched; 439 bool multiple_threads; 440 double duration_filter; 441 double runtime_ms; 442 }; 443 444 static bool trace__filter_duration(struct trace *trace, double t) 445 { 446 return t < (trace->duration_filter * NSEC_PER_MSEC); 447 } 448 449 static size_t trace__fprintf_tstamp(struct trace *trace, u64 tstamp, FILE *fp) 450 { 451 double ts = (double)(tstamp - trace->base_time) / NSEC_PER_MSEC; 452 453 return fprintf(fp, "%10.3f ", ts); 454 } 455 456 static bool done = false; 457 458 static void sig_handler(int sig __maybe_unused) 459 { 460 done = true; 461 } 462 463 static size_t trace__fprintf_entry_head(struct trace *trace, struct thread *thread, 464 u64 duration, u64 tstamp, FILE *fp) 465 { 466 size_t printed = trace__fprintf_tstamp(trace, tstamp, fp); 467 printed += fprintf_duration(duration, fp); 468 469 if (trace->multiple_threads) 470 printed += fprintf(fp, "%d ", thread->tid); 471 472 return printed; 473 } 474 475 static int trace__process_event(struct trace *trace, struct machine *machine, 476 union perf_event *event) 477 { 478 int ret = 0; 479 480 switch (event->header.type) { 481 case PERF_RECORD_LOST: 482 color_fprintf(trace->output, PERF_COLOR_RED, 483 "LOST %" PRIu64 " events!\n", event->lost.lost); 484 ret = machine__process_lost_event(machine, event); 485 default: 486 ret = machine__process_event(machine, event); 487 break; 488 } 489 490 return ret; 491 } 492 493 static int trace__tool_process(struct perf_tool *tool, 494 union perf_event *event, 495 struct perf_sample *sample __maybe_unused, 496 struct machine *machine) 497 { 498 struct trace *trace = container_of(tool, struct trace, tool); 499 return trace__process_event(trace, machine, event); 500 } 501 502 static int trace__symbols_init(struct trace *trace, struct perf_evlist *evlist) 503 { 504 int err = symbol__init(); 505 506 if (err) 507 return err; 508 509 machine__init(&trace->host, "", HOST_KERNEL_ID); 510 machine__create_kernel_maps(&trace->host); 511 512 if (perf_target__has_task(&trace->opts.target)) { 513 err = perf_event__synthesize_thread_map(&trace->tool, evlist->threads, 514 trace__tool_process, 515 &trace->host); 516 } else { 517 err = perf_event__synthesize_threads(&trace->tool, trace__tool_process, 518 &trace->host); 519 } 520 521 if (err) 522 symbol__exit(); 523 524 return err; 525 } 526 527 static int syscall__set_arg_fmts(struct syscall *sc) 528 { 529 struct format_field *field; 530 int idx = 0; 531 532 sc->arg_scnprintf = calloc(sc->tp_format->format.nr_fields - 1, sizeof(void *)); 533 if (sc->arg_scnprintf == NULL) 534 return -1; 535 536 for (field = sc->tp_format->format.fields->next; field; field = field->next) { 537 if (sc->fmt && sc->fmt->arg_scnprintf[idx]) 538 sc->arg_scnprintf[idx] = sc->fmt->arg_scnprintf[idx]; 539 else if (field->flags & FIELD_IS_POINTER) 540 sc->arg_scnprintf[idx] = syscall_arg__scnprintf_hex; 541 ++idx; 542 } 543 544 return 0; 545 } 546 547 static int trace__read_syscall_info(struct trace *trace, int id) 548 { 549 char tp_name[128]; 550 struct syscall *sc; 551 const char *name = audit_syscall_to_name(id, trace->audit_machine); 552 553 if (name == NULL) 554 return -1; 555 556 if (id > trace->syscalls.max) { 557 struct syscall *nsyscalls = realloc(trace->syscalls.table, (id + 1) * sizeof(*sc)); 558 559 if (nsyscalls == NULL) 560 return -1; 561 562 if (trace->syscalls.max != -1) { 563 memset(nsyscalls + trace->syscalls.max + 1, 0, 564 (id - trace->syscalls.max) * sizeof(*sc)); 565 } else { 566 memset(nsyscalls, 0, (id + 1) * sizeof(*sc)); 567 } 568 569 trace->syscalls.table = nsyscalls; 570 trace->syscalls.max = id; 571 } 572 573 sc = trace->syscalls.table + id; 574 sc->name = name; 575 576 if (trace->ev_qualifier) { 577 bool in = strlist__find(trace->ev_qualifier, name) != NULL; 578 579 if (!(in ^ trace->not_ev_qualifier)) { 580 sc->filtered = true; 581 /* 582 * No need to do read tracepoint information since this will be 583 * filtered out. 584 */ 585 return 0; 586 } 587 } 588 589 sc->fmt = syscall_fmt__find(sc->name); 590 591 snprintf(tp_name, sizeof(tp_name), "sys_enter_%s", sc->name); 592 sc->tp_format = event_format__new("syscalls", tp_name); 593 594 if (sc->tp_format == NULL && sc->fmt && sc->fmt->alias) { 595 snprintf(tp_name, sizeof(tp_name), "sys_enter_%s", sc->fmt->alias); 596 sc->tp_format = event_format__new("syscalls", tp_name); 597 } 598 599 if (sc->tp_format == NULL) 600 return -1; 601 602 return syscall__set_arg_fmts(sc); 603 } 604 605 static size_t syscall__scnprintf_args(struct syscall *sc, char *bf, size_t size, 606 unsigned long *args) 607 { 608 int i = 0; 609 size_t printed = 0; 610 611 if (sc->tp_format != NULL) { 612 struct format_field *field; 613 u8 mask = 0, bit = 1; 614 615 for (field = sc->tp_format->format.fields->next; field; 616 field = field->next, ++i, bit <<= 1) { 617 if (mask & bit) 618 continue; 619 620 printed += scnprintf(bf + printed, size - printed, 621 "%s%s: ", printed ? ", " : "", field->name); 622 623 if (sc->arg_scnprintf && sc->arg_scnprintf[i]) { 624 printed += sc->arg_scnprintf[i](bf + printed, size - printed, 625 args[i], i, &mask); 626 } else { 627 printed += scnprintf(bf + printed, size - printed, 628 "%ld", args[i]); 629 } 630 } 631 } else { 632 while (i < 6) { 633 printed += scnprintf(bf + printed, size - printed, 634 "%sarg%d: %ld", 635 printed ? ", " : "", i, args[i]); 636 ++i; 637 } 638 } 639 640 return printed; 641 } 642 643 typedef int (*tracepoint_handler)(struct trace *trace, struct perf_evsel *evsel, 644 struct perf_sample *sample); 645 646 static struct syscall *trace__syscall_info(struct trace *trace, 647 struct perf_evsel *evsel, 648 struct perf_sample *sample) 649 { 650 int id = perf_evsel__intval(evsel, sample, "id"); 651 652 if (id < 0) { 653 654 /* 655 * XXX: Noticed on x86_64, reproduced as far back as 3.0.36, haven't tried 656 * before that, leaving at a higher verbosity level till that is 657 * explained. Reproduced with plain ftrace with: 658 * 659 * echo 1 > /t/events/raw_syscalls/sys_exit/enable 660 * grep "NR -1 " /t/trace_pipe 661 * 662 * After generating some load on the machine. 663 */ 664 if (verbose > 1) { 665 static u64 n; 666 fprintf(trace->output, "Invalid syscall %d id, skipping (%s, %" PRIu64 ") ...\n", 667 id, perf_evsel__name(evsel), ++n); 668 } 669 return NULL; 670 } 671 672 if ((id > trace->syscalls.max || trace->syscalls.table[id].name == NULL) && 673 trace__read_syscall_info(trace, id)) 674 goto out_cant_read; 675 676 if ((id > trace->syscalls.max || trace->syscalls.table[id].name == NULL)) 677 goto out_cant_read; 678 679 return &trace->syscalls.table[id]; 680 681 out_cant_read: 682 if (verbose) { 683 fprintf(trace->output, "Problems reading syscall %d", id); 684 if (id <= trace->syscalls.max && trace->syscalls.table[id].name != NULL) 685 fprintf(trace->output, "(%s)", trace->syscalls.table[id].name); 686 fputs(" information\n", trace->output); 687 } 688 return NULL; 689 } 690 691 static int trace__sys_enter(struct trace *trace, struct perf_evsel *evsel, 692 struct perf_sample *sample) 693 { 694 char *msg; 695 void *args; 696 size_t printed = 0; 697 struct thread *thread; 698 struct syscall *sc = trace__syscall_info(trace, evsel, sample); 699 struct thread_trace *ttrace; 700 701 if (sc == NULL) 702 return -1; 703 704 if (sc->filtered) 705 return 0; 706 707 thread = machine__findnew_thread(&trace->host, sample->pid, 708 sample->tid); 709 ttrace = thread__trace(thread, trace->output); 710 if (ttrace == NULL) 711 return -1; 712 713 args = perf_evsel__rawptr(evsel, sample, "args"); 714 if (args == NULL) { 715 fprintf(trace->output, "Problems reading syscall arguments\n"); 716 return -1; 717 } 718 719 ttrace = thread->priv; 720 721 if (ttrace->entry_str == NULL) { 722 ttrace->entry_str = malloc(1024); 723 if (!ttrace->entry_str) 724 return -1; 725 } 726 727 ttrace->entry_time = sample->time; 728 msg = ttrace->entry_str; 729 printed += scnprintf(msg + printed, 1024 - printed, "%s(", sc->name); 730 731 printed += syscall__scnprintf_args(sc, msg + printed, 1024 - printed, args); 732 733 if (!strcmp(sc->name, "exit_group") || !strcmp(sc->name, "exit")) { 734 if (!trace->duration_filter) { 735 trace__fprintf_entry_head(trace, thread, 1, sample->time, trace->output); 736 fprintf(trace->output, "%-70s\n", ttrace->entry_str); 737 } 738 } else 739 ttrace->entry_pending = true; 740 741 return 0; 742 } 743 744 static int trace__sys_exit(struct trace *trace, struct perf_evsel *evsel, 745 struct perf_sample *sample) 746 { 747 int ret; 748 u64 duration = 0; 749 struct thread *thread; 750 struct syscall *sc = trace__syscall_info(trace, evsel, sample); 751 struct thread_trace *ttrace; 752 753 if (sc == NULL) 754 return -1; 755 756 if (sc->filtered) 757 return 0; 758 759 thread = machine__findnew_thread(&trace->host, sample->pid, 760 sample->tid); 761 ttrace = thread__trace(thread, trace->output); 762 if (ttrace == NULL) 763 return -1; 764 765 ret = perf_evsel__intval(evsel, sample, "ret"); 766 767 ttrace = thread->priv; 768 769 ttrace->exit_time = sample->time; 770 771 if (ttrace->entry_time) { 772 duration = sample->time - ttrace->entry_time; 773 if (trace__filter_duration(trace, duration)) 774 goto out; 775 } else if (trace->duration_filter) 776 goto out; 777 778 trace__fprintf_entry_head(trace, thread, duration, sample->time, trace->output); 779 780 if (ttrace->entry_pending) { 781 fprintf(trace->output, "%-70s", ttrace->entry_str); 782 } else { 783 fprintf(trace->output, " ... ["); 784 color_fprintf(trace->output, PERF_COLOR_YELLOW, "continued"); 785 fprintf(trace->output, "]: %s()", sc->name); 786 } 787 788 if (sc->fmt == NULL) { 789 signed_print: 790 fprintf(trace->output, ") = %d", ret); 791 } else if (ret < 0 && sc->fmt->errmsg) { 792 char bf[256]; 793 const char *emsg = strerror_r(-ret, bf, sizeof(bf)), 794 *e = audit_errno_to_name(-ret); 795 796 fprintf(trace->output, ") = -1 %s %s", e, emsg); 797 } else if (ret == 0 && sc->fmt->timeout) 798 fprintf(trace->output, ") = 0 Timeout"); 799 else if (sc->fmt->hexret) 800 fprintf(trace->output, ") = %#x", ret); 801 else 802 goto signed_print; 803 804 fputc('\n', trace->output); 805 out: 806 ttrace->entry_pending = false; 807 808 return 0; 809 } 810 811 static int trace__sched_stat_runtime(struct trace *trace, struct perf_evsel *evsel, 812 struct perf_sample *sample) 813 { 814 u64 runtime = perf_evsel__intval(evsel, sample, "runtime"); 815 double runtime_ms = (double)runtime / NSEC_PER_MSEC; 816 struct thread *thread = machine__findnew_thread(&trace->host, 817 sample->pid, 818 sample->tid); 819 struct thread_trace *ttrace = thread__trace(thread, trace->output); 820 821 if (ttrace == NULL) 822 goto out_dump; 823 824 ttrace->runtime_ms += runtime_ms; 825 trace->runtime_ms += runtime_ms; 826 return 0; 827 828 out_dump: 829 fprintf(trace->output, "%s: comm=%s,pid=%u,runtime=%" PRIu64 ",vruntime=%" PRIu64 ")\n", 830 evsel->name, 831 perf_evsel__strval(evsel, sample, "comm"), 832 (pid_t)perf_evsel__intval(evsel, sample, "pid"), 833 runtime, 834 perf_evsel__intval(evsel, sample, "vruntime")); 835 return 0; 836 } 837 838 static bool skip_sample(struct trace *trace, struct perf_sample *sample) 839 { 840 if ((trace->pid_list && intlist__find(trace->pid_list, sample->pid)) || 841 (trace->tid_list && intlist__find(trace->tid_list, sample->tid))) 842 return false; 843 844 if (trace->pid_list || trace->tid_list) 845 return true; 846 847 return false; 848 } 849 850 static int trace__process_sample(struct perf_tool *tool, 851 union perf_event *event __maybe_unused, 852 struct perf_sample *sample, 853 struct perf_evsel *evsel, 854 struct machine *machine __maybe_unused) 855 { 856 struct trace *trace = container_of(tool, struct trace, tool); 857 int err = 0; 858 859 tracepoint_handler handler = evsel->handler.func; 860 861 if (skip_sample(trace, sample)) 862 return 0; 863 864 if (trace->base_time == 0) 865 trace->base_time = sample->time; 866 867 if (handler) 868 handler(trace, evsel, sample); 869 870 return err; 871 } 872 873 static bool 874 perf_session__has_tp(struct perf_session *session, const char *name) 875 { 876 struct perf_evsel *evsel; 877 878 evsel = perf_evlist__find_tracepoint_by_name(session->evlist, name); 879 880 return evsel != NULL; 881 } 882 883 static int parse_target_str(struct trace *trace) 884 { 885 if (trace->opts.target.pid) { 886 trace->pid_list = intlist__new(trace->opts.target.pid); 887 if (trace->pid_list == NULL) { 888 pr_err("Error parsing process id string\n"); 889 return -EINVAL; 890 } 891 } 892 893 if (trace->opts.target.tid) { 894 trace->tid_list = intlist__new(trace->opts.target.tid); 895 if (trace->tid_list == NULL) { 896 pr_err("Error parsing thread id string\n"); 897 return -EINVAL; 898 } 899 } 900 901 return 0; 902 } 903 904 static int trace__run(struct trace *trace, int argc, const char **argv) 905 { 906 struct perf_evlist *evlist = perf_evlist__new(); 907 struct perf_evsel *evsel; 908 int err = -1, i; 909 unsigned long before; 910 const bool forks = argc > 0; 911 912 if (evlist == NULL) { 913 fprintf(trace->output, "Not enough memory to run!\n"); 914 goto out; 915 } 916 917 if (perf_evlist__add_newtp(evlist, "raw_syscalls", "sys_enter", trace__sys_enter) || 918 perf_evlist__add_newtp(evlist, "raw_syscalls", "sys_exit", trace__sys_exit)) { 919 fprintf(trace->output, "Couldn't read the raw_syscalls tracepoints information!\n"); 920 goto out_delete_evlist; 921 } 922 923 if (trace->sched && 924 perf_evlist__add_newtp(evlist, "sched", "sched_stat_runtime", 925 trace__sched_stat_runtime)) { 926 fprintf(trace->output, "Couldn't read the sched_stat_runtime tracepoint information!\n"); 927 goto out_delete_evlist; 928 } 929 930 err = perf_evlist__create_maps(evlist, &trace->opts.target); 931 if (err < 0) { 932 fprintf(trace->output, "Problems parsing the target to trace, check your options!\n"); 933 goto out_delete_evlist; 934 } 935 936 err = trace__symbols_init(trace, evlist); 937 if (err < 0) { 938 fprintf(trace->output, "Problems initializing symbol libraries!\n"); 939 goto out_delete_maps; 940 } 941 942 perf_evlist__config(evlist, &trace->opts); 943 944 signal(SIGCHLD, sig_handler); 945 signal(SIGINT, sig_handler); 946 947 if (forks) { 948 err = perf_evlist__prepare_workload(evlist, &trace->opts.target, 949 argv, false, false); 950 if (err < 0) { 951 fprintf(trace->output, "Couldn't run the workload!\n"); 952 goto out_delete_maps; 953 } 954 } 955 956 err = perf_evlist__open(evlist); 957 if (err < 0) { 958 fprintf(trace->output, "Couldn't create the events: %s\n", strerror(errno)); 959 goto out_delete_maps; 960 } 961 962 err = perf_evlist__mmap(evlist, UINT_MAX, false); 963 if (err < 0) { 964 fprintf(trace->output, "Couldn't mmap the events: %s\n", strerror(errno)); 965 goto out_close_evlist; 966 } 967 968 perf_evlist__enable(evlist); 969 970 if (forks) 971 perf_evlist__start_workload(evlist); 972 973 trace->multiple_threads = evlist->threads->map[0] == -1 || evlist->threads->nr > 1; 974 again: 975 before = trace->nr_events; 976 977 for (i = 0; i < evlist->nr_mmaps; i++) { 978 union perf_event *event; 979 980 while ((event = perf_evlist__mmap_read(evlist, i)) != NULL) { 981 const u32 type = event->header.type; 982 tracepoint_handler handler; 983 struct perf_sample sample; 984 985 ++trace->nr_events; 986 987 err = perf_evlist__parse_sample(evlist, event, &sample); 988 if (err) { 989 fprintf(trace->output, "Can't parse sample, err = %d, skipping...\n", err); 990 continue; 991 } 992 993 if (trace->base_time == 0) 994 trace->base_time = sample.time; 995 996 if (type != PERF_RECORD_SAMPLE) { 997 trace__process_event(trace, &trace->host, event); 998 continue; 999 } 1000 1001 evsel = perf_evlist__id2evsel(evlist, sample.id); 1002 if (evsel == NULL) { 1003 fprintf(trace->output, "Unknown tp ID %" PRIu64 ", skipping...\n", sample.id); 1004 continue; 1005 } 1006 1007 if (sample.raw_data == NULL) { 1008 fprintf(trace->output, "%s sample with no payload for tid: %d, cpu %d, raw_size=%d, skipping...\n", 1009 perf_evsel__name(evsel), sample.tid, 1010 sample.cpu, sample.raw_size); 1011 continue; 1012 } 1013 1014 handler = evsel->handler.func; 1015 handler(trace, evsel, &sample); 1016 1017 if (done) 1018 goto out_unmap_evlist; 1019 } 1020 } 1021 1022 if (trace->nr_events == before) { 1023 if (done) 1024 goto out_unmap_evlist; 1025 1026 poll(evlist->pollfd, evlist->nr_fds, -1); 1027 } 1028 1029 if (done) 1030 perf_evlist__disable(evlist); 1031 1032 goto again; 1033 1034 out_unmap_evlist: 1035 perf_evlist__munmap(evlist); 1036 out_close_evlist: 1037 perf_evlist__close(evlist); 1038 out_delete_maps: 1039 perf_evlist__delete_maps(evlist); 1040 out_delete_evlist: 1041 perf_evlist__delete(evlist); 1042 out: 1043 return err; 1044 } 1045 1046 static int trace__replay(struct trace *trace) 1047 { 1048 const struct perf_evsel_str_handler handlers[] = { 1049 { "raw_syscalls:sys_enter", trace__sys_enter, }, 1050 { "raw_syscalls:sys_exit", trace__sys_exit, }, 1051 }; 1052 1053 struct perf_session *session; 1054 int err = -1; 1055 1056 trace->tool.sample = trace__process_sample; 1057 trace->tool.mmap = perf_event__process_mmap; 1058 trace->tool.mmap2 = perf_event__process_mmap2; 1059 trace->tool.comm = perf_event__process_comm; 1060 trace->tool.exit = perf_event__process_exit; 1061 trace->tool.fork = perf_event__process_fork; 1062 trace->tool.attr = perf_event__process_attr; 1063 trace->tool.tracing_data = perf_event__process_tracing_data; 1064 trace->tool.build_id = perf_event__process_build_id; 1065 1066 trace->tool.ordered_samples = true; 1067 trace->tool.ordering_requires_timestamps = true; 1068 1069 /* add tid to output */ 1070 trace->multiple_threads = true; 1071 1072 if (symbol__init() < 0) 1073 return -1; 1074 1075 session = perf_session__new(input_name, O_RDONLY, 0, false, 1076 &trace->tool); 1077 if (session == NULL) 1078 return -ENOMEM; 1079 1080 err = perf_session__set_tracepoints_handlers(session, handlers); 1081 if (err) 1082 goto out; 1083 1084 if (!perf_session__has_tp(session, "raw_syscalls:sys_enter")) { 1085 pr_err("Data file does not have raw_syscalls:sys_enter events\n"); 1086 goto out; 1087 } 1088 1089 if (!perf_session__has_tp(session, "raw_syscalls:sys_exit")) { 1090 pr_err("Data file does not have raw_syscalls:sys_exit events\n"); 1091 goto out; 1092 } 1093 1094 err = parse_target_str(trace); 1095 if (err != 0) 1096 goto out; 1097 1098 setup_pager(); 1099 1100 err = perf_session__process_events(session, &trace->tool); 1101 if (err) 1102 pr_err("Failed to process events, error %d", err); 1103 1104 out: 1105 perf_session__delete(session); 1106 1107 return err; 1108 } 1109 1110 static size_t trace__fprintf_threads_header(FILE *fp) 1111 { 1112 size_t printed; 1113 1114 printed = fprintf(fp, "\n _____________________________________________________________________\n"); 1115 printed += fprintf(fp," __) Summary of events (__\n\n"); 1116 printed += fprintf(fp," [ task - pid ] [ events ] [ ratio ] [ runtime ]\n"); 1117 printed += fprintf(fp," _____________________________________________________________________\n\n"); 1118 1119 return printed; 1120 } 1121 1122 static size_t trace__fprintf_thread_summary(struct trace *trace, FILE *fp) 1123 { 1124 size_t printed = trace__fprintf_threads_header(fp); 1125 struct rb_node *nd; 1126 1127 for (nd = rb_first(&trace->host.threads); nd; nd = rb_next(nd)) { 1128 struct thread *thread = rb_entry(nd, struct thread, rb_node); 1129 struct thread_trace *ttrace = thread->priv; 1130 const char *color; 1131 double ratio; 1132 1133 if (ttrace == NULL) 1134 continue; 1135 1136 ratio = (double)ttrace->nr_events / trace->nr_events * 100.0; 1137 1138 color = PERF_COLOR_NORMAL; 1139 if (ratio > 50.0) 1140 color = PERF_COLOR_RED; 1141 else if (ratio > 25.0) 1142 color = PERF_COLOR_GREEN; 1143 else if (ratio > 5.0) 1144 color = PERF_COLOR_YELLOW; 1145 1146 printed += color_fprintf(fp, color, "%20s", thread->comm); 1147 printed += fprintf(fp, " - %-5d :%11lu [", thread->tid, ttrace->nr_events); 1148 printed += color_fprintf(fp, color, "%5.1f%%", ratio); 1149 printed += fprintf(fp, " ] %10.3f ms\n", ttrace->runtime_ms); 1150 } 1151 1152 return printed; 1153 } 1154 1155 static int trace__set_duration(const struct option *opt, const char *str, 1156 int unset __maybe_unused) 1157 { 1158 struct trace *trace = opt->value; 1159 1160 trace->duration_filter = atof(str); 1161 return 0; 1162 } 1163 1164 static int trace__open_output(struct trace *trace, const char *filename) 1165 { 1166 struct stat st; 1167 1168 if (!stat(filename, &st) && st.st_size) { 1169 char oldname[PATH_MAX]; 1170 1171 scnprintf(oldname, sizeof(oldname), "%s.old", filename); 1172 unlink(oldname); 1173 rename(filename, oldname); 1174 } 1175 1176 trace->output = fopen(filename, "w"); 1177 1178 return trace->output == NULL ? -errno : 0; 1179 } 1180 1181 int cmd_trace(int argc, const char **argv, const char *prefix __maybe_unused) 1182 { 1183 const char * const trace_usage[] = { 1184 "perf trace [<options>] [<command>]", 1185 "perf trace [<options>] -- <command> [<options>]", 1186 NULL 1187 }; 1188 struct trace trace = { 1189 .audit_machine = audit_detect_machine(), 1190 .syscalls = { 1191 . max = -1, 1192 }, 1193 .opts = { 1194 .target = { 1195 .uid = UINT_MAX, 1196 .uses_mmap = true, 1197 }, 1198 .user_freq = UINT_MAX, 1199 .user_interval = ULLONG_MAX, 1200 .no_delay = true, 1201 .mmap_pages = 1024, 1202 }, 1203 .output = stdout, 1204 }; 1205 const char *output_name = NULL; 1206 const char *ev_qualifier_str = NULL; 1207 const struct option trace_options[] = { 1208 OPT_STRING('e', "expr", &ev_qualifier_str, "expr", 1209 "list of events to trace"), 1210 OPT_STRING('o', "output", &output_name, "file", "output file name"), 1211 OPT_STRING('i', "input", &input_name, "file", "Analyze events in file"), 1212 OPT_STRING('p', "pid", &trace.opts.target.pid, "pid", 1213 "trace events on existing process id"), 1214 OPT_STRING('t', "tid", &trace.opts.target.tid, "tid", 1215 "trace events on existing thread id"), 1216 OPT_BOOLEAN('a', "all-cpus", &trace.opts.target.system_wide, 1217 "system-wide collection from all CPUs"), 1218 OPT_STRING('C', "cpu", &trace.opts.target.cpu_list, "cpu", 1219 "list of cpus to monitor"), 1220 OPT_BOOLEAN(0, "no-inherit", &trace.opts.no_inherit, 1221 "child tasks do not inherit counters"), 1222 OPT_UINTEGER('m', "mmap-pages", &trace.opts.mmap_pages, 1223 "number of mmap data pages"), 1224 OPT_STRING('u', "uid", &trace.opts.target.uid_str, "user", 1225 "user to profile"), 1226 OPT_CALLBACK(0, "duration", &trace, "float", 1227 "show only events with duration > N.M ms", 1228 trace__set_duration), 1229 OPT_BOOLEAN(0, "sched", &trace.sched, "show blocking scheduler events"), 1230 OPT_INCR('v', "verbose", &verbose, "be more verbose"), 1231 OPT_END() 1232 }; 1233 int err; 1234 char bf[BUFSIZ]; 1235 1236 argc = parse_options(argc, argv, trace_options, trace_usage, 0); 1237 1238 if (output_name != NULL) { 1239 err = trace__open_output(&trace, output_name); 1240 if (err < 0) { 1241 perror("failed to create output file"); 1242 goto out; 1243 } 1244 } 1245 1246 if (ev_qualifier_str != NULL) { 1247 const char *s = ev_qualifier_str; 1248 1249 trace.not_ev_qualifier = *s == '!'; 1250 if (trace.not_ev_qualifier) 1251 ++s; 1252 trace.ev_qualifier = strlist__new(true, s); 1253 if (trace.ev_qualifier == NULL) { 1254 fputs("Not enough memory to parse event qualifier", 1255 trace.output); 1256 err = -ENOMEM; 1257 goto out_close; 1258 } 1259 } 1260 1261 err = perf_target__validate(&trace.opts.target); 1262 if (err) { 1263 perf_target__strerror(&trace.opts.target, err, bf, sizeof(bf)); 1264 fprintf(trace.output, "%s", bf); 1265 goto out_close; 1266 } 1267 1268 err = perf_target__parse_uid(&trace.opts.target); 1269 if (err) { 1270 perf_target__strerror(&trace.opts.target, err, bf, sizeof(bf)); 1271 fprintf(trace.output, "%s", bf); 1272 goto out_close; 1273 } 1274 1275 if (!argc && perf_target__none(&trace.opts.target)) 1276 trace.opts.target.system_wide = true; 1277 1278 if (input_name) 1279 err = trace__replay(&trace); 1280 else 1281 err = trace__run(&trace, argc, argv); 1282 1283 if (trace.sched && !err) 1284 trace__fprintf_thread_summary(&trace, trace.output); 1285 1286 out_close: 1287 if (output_name != NULL) 1288 fclose(trace.output); 1289 out: 1290 return err; 1291 } 1292