1 #include <trace/syscall.h> 2 #include <trace/events/syscalls.h> 3 #include <linux/slab.h> 4 #include <linux/kernel.h> 5 #include <linux/ftrace.h> 6 #include <linux/perf_event.h> 7 #include <asm/syscall.h> 8 9 #include "trace_output.h" 10 #include "trace.h" 11 12 static DEFINE_MUTEX(syscall_trace_lock); 13 static int sys_refcount_enter; 14 static int sys_refcount_exit; 15 static DECLARE_BITMAP(enabled_enter_syscalls, NR_syscalls); 16 static DECLARE_BITMAP(enabled_exit_syscalls, NR_syscalls); 17 18 static int syscall_enter_register(struct ftrace_event_call *event, 19 enum trace_reg type); 20 static int syscall_exit_register(struct ftrace_event_call *event, 21 enum trace_reg type); 22 23 static int syscall_enter_define_fields(struct ftrace_event_call *call); 24 static int syscall_exit_define_fields(struct ftrace_event_call *call); 25 26 static struct list_head * 27 syscall_get_enter_fields(struct ftrace_event_call *call) 28 { 29 struct syscall_metadata *entry = call->data; 30 31 return &entry->enter_fields; 32 } 33 34 struct trace_event_functions enter_syscall_print_funcs = { 35 .trace = print_syscall_enter, 36 }; 37 38 struct trace_event_functions exit_syscall_print_funcs = { 39 .trace = print_syscall_exit, 40 }; 41 42 struct ftrace_event_class event_class_syscall_enter = { 43 .system = "syscalls", 44 .reg = syscall_enter_register, 45 .define_fields = syscall_enter_define_fields, 46 .get_fields = syscall_get_enter_fields, 47 .raw_init = init_syscall_trace, 48 }; 49 50 struct ftrace_event_class event_class_syscall_exit = { 51 .system = "syscalls", 52 .reg = syscall_exit_register, 53 .define_fields = syscall_exit_define_fields, 54 .fields = LIST_HEAD_INIT(event_class_syscall_exit.fields), 55 .raw_init = init_syscall_trace, 56 }; 57 58 extern struct syscall_metadata *__start_syscalls_metadata[]; 59 extern struct syscall_metadata *__stop_syscalls_metadata[]; 60 61 static struct syscall_metadata **syscalls_metadata; 62 63 #ifndef ARCH_HAS_SYSCALL_MATCH_SYM_NAME 64 static inline bool arch_syscall_match_sym_name(const char *sym, const char *name) 65 { 66 /* 67 * Only compare after the "sys" prefix. Archs that use 68 * syscall wrappers may have syscalls symbols aliases prefixed 69 * with "SyS" instead of "sys", leading to an unwanted 70 * mismatch. 71 */ 72 return !strcmp(sym + 3, name + 3); 73 } 74 #endif 75 76 static __init struct syscall_metadata * 77 find_syscall_meta(unsigned long syscall) 78 { 79 struct syscall_metadata **start; 80 struct syscall_metadata **stop; 81 char str[KSYM_SYMBOL_LEN]; 82 83 84 start = __start_syscalls_metadata; 85 stop = __stop_syscalls_metadata; 86 kallsyms_lookup(syscall, NULL, NULL, NULL, str); 87 88 if (arch_syscall_match_sym_name(str, "sys_ni_syscall")) 89 return NULL; 90 91 for ( ; start < stop; start++) { 92 if ((*start)->name && arch_syscall_match_sym_name(str, (*start)->name)) 93 return *start; 94 } 95 return NULL; 96 } 97 98 static struct syscall_metadata *syscall_nr_to_meta(int nr) 99 { 100 if (!syscalls_metadata || nr >= NR_syscalls || nr < 0) 101 return NULL; 102 103 return syscalls_metadata[nr]; 104 } 105 106 enum print_line_t 107 print_syscall_enter(struct trace_iterator *iter, int flags, 108 struct trace_event *event) 109 { 110 struct trace_seq *s = &iter->seq; 111 struct trace_entry *ent = iter->ent; 112 struct syscall_trace_enter *trace; 113 struct syscall_metadata *entry; 114 int i, ret, syscall; 115 116 trace = (typeof(trace))ent; 117 syscall = trace->nr; 118 entry = syscall_nr_to_meta(syscall); 119 120 if (!entry) 121 goto end; 122 123 if (entry->enter_event->event.type != ent->type) { 124 WARN_ON_ONCE(1); 125 goto end; 126 } 127 128 ret = trace_seq_printf(s, "%s(", entry->name); 129 if (!ret) 130 return TRACE_TYPE_PARTIAL_LINE; 131 132 for (i = 0; i < entry->nb_args; i++) { 133 /* parameter types */ 134 if (trace_flags & TRACE_ITER_VERBOSE) { 135 ret = trace_seq_printf(s, "%s ", entry->types[i]); 136 if (!ret) 137 return TRACE_TYPE_PARTIAL_LINE; 138 } 139 /* parameter values */ 140 ret = trace_seq_printf(s, "%s: %lx%s", entry->args[i], 141 trace->args[i], 142 i == entry->nb_args - 1 ? "" : ", "); 143 if (!ret) 144 return TRACE_TYPE_PARTIAL_LINE; 145 } 146 147 ret = trace_seq_putc(s, ')'); 148 if (!ret) 149 return TRACE_TYPE_PARTIAL_LINE; 150 151 end: 152 ret = trace_seq_putc(s, '\n'); 153 if (!ret) 154 return TRACE_TYPE_PARTIAL_LINE; 155 156 return TRACE_TYPE_HANDLED; 157 } 158 159 enum print_line_t 160 print_syscall_exit(struct trace_iterator *iter, int flags, 161 struct trace_event *event) 162 { 163 struct trace_seq *s = &iter->seq; 164 struct trace_entry *ent = iter->ent; 165 struct syscall_trace_exit *trace; 166 int syscall; 167 struct syscall_metadata *entry; 168 int ret; 169 170 trace = (typeof(trace))ent; 171 syscall = trace->nr; 172 entry = syscall_nr_to_meta(syscall); 173 174 if (!entry) { 175 trace_seq_printf(s, "\n"); 176 return TRACE_TYPE_HANDLED; 177 } 178 179 if (entry->exit_event->event.type != ent->type) { 180 WARN_ON_ONCE(1); 181 return TRACE_TYPE_UNHANDLED; 182 } 183 184 ret = trace_seq_printf(s, "%s -> 0x%lx\n", entry->name, 185 trace->ret); 186 if (!ret) 187 return TRACE_TYPE_PARTIAL_LINE; 188 189 return TRACE_TYPE_HANDLED; 190 } 191 192 extern char *__bad_type_size(void); 193 194 #define SYSCALL_FIELD(type, name) \ 195 sizeof(type) != sizeof(trace.name) ? \ 196 __bad_type_size() : \ 197 #type, #name, offsetof(typeof(trace), name), \ 198 sizeof(trace.name), is_signed_type(type) 199 200 static 201 int __set_enter_print_fmt(struct syscall_metadata *entry, char *buf, int len) 202 { 203 int i; 204 int pos = 0; 205 206 /* When len=0, we just calculate the needed length */ 207 #define LEN_OR_ZERO (len ? len - pos : 0) 208 209 pos += snprintf(buf + pos, LEN_OR_ZERO, "\""); 210 for (i = 0; i < entry->nb_args; i++) { 211 pos += snprintf(buf + pos, LEN_OR_ZERO, "%s: 0x%%0%zulx%s", 212 entry->args[i], sizeof(unsigned long), 213 i == entry->nb_args - 1 ? "" : ", "); 214 } 215 pos += snprintf(buf + pos, LEN_OR_ZERO, "\""); 216 217 for (i = 0; i < entry->nb_args; i++) { 218 pos += snprintf(buf + pos, LEN_OR_ZERO, 219 ", ((unsigned long)(REC->%s))", entry->args[i]); 220 } 221 222 #undef LEN_OR_ZERO 223 224 /* return the length of print_fmt */ 225 return pos; 226 } 227 228 static int set_syscall_print_fmt(struct ftrace_event_call *call) 229 { 230 char *print_fmt; 231 int len; 232 struct syscall_metadata *entry = call->data; 233 234 if (entry->enter_event != call) { 235 call->print_fmt = "\"0x%lx\", REC->ret"; 236 return 0; 237 } 238 239 /* First: called with 0 length to calculate the needed length */ 240 len = __set_enter_print_fmt(entry, NULL, 0); 241 242 print_fmt = kmalloc(len + 1, GFP_KERNEL); 243 if (!print_fmt) 244 return -ENOMEM; 245 246 /* Second: actually write the @print_fmt */ 247 __set_enter_print_fmt(entry, print_fmt, len + 1); 248 call->print_fmt = print_fmt; 249 250 return 0; 251 } 252 253 static void free_syscall_print_fmt(struct ftrace_event_call *call) 254 { 255 struct syscall_metadata *entry = call->data; 256 257 if (entry->enter_event == call) 258 kfree(call->print_fmt); 259 } 260 261 static int syscall_enter_define_fields(struct ftrace_event_call *call) 262 { 263 struct syscall_trace_enter trace; 264 struct syscall_metadata *meta = call->data; 265 int ret; 266 int i; 267 int offset = offsetof(typeof(trace), args); 268 269 ret = trace_define_field(call, SYSCALL_FIELD(int, nr), FILTER_OTHER); 270 if (ret) 271 return ret; 272 273 for (i = 0; i < meta->nb_args; i++) { 274 ret = trace_define_field(call, meta->types[i], 275 meta->args[i], offset, 276 sizeof(unsigned long), 0, 277 FILTER_OTHER); 278 offset += sizeof(unsigned long); 279 } 280 281 return ret; 282 } 283 284 static int syscall_exit_define_fields(struct ftrace_event_call *call) 285 { 286 struct syscall_trace_exit trace; 287 int ret; 288 289 ret = trace_define_field(call, SYSCALL_FIELD(int, nr), FILTER_OTHER); 290 if (ret) 291 return ret; 292 293 ret = trace_define_field(call, SYSCALL_FIELD(long, ret), 294 FILTER_OTHER); 295 296 return ret; 297 } 298 299 void ftrace_syscall_enter(void *ignore, struct pt_regs *regs, long id) 300 { 301 struct syscall_trace_enter *entry; 302 struct syscall_metadata *sys_data; 303 struct ring_buffer_event *event; 304 struct ring_buffer *buffer; 305 int size; 306 int syscall_nr; 307 308 syscall_nr = syscall_get_nr(current, regs); 309 if (syscall_nr < 0) 310 return; 311 if (!test_bit(syscall_nr, enabled_enter_syscalls)) 312 return; 313 314 sys_data = syscall_nr_to_meta(syscall_nr); 315 if (!sys_data) 316 return; 317 318 size = sizeof(*entry) + sizeof(unsigned long) * sys_data->nb_args; 319 320 event = trace_current_buffer_lock_reserve(&buffer, 321 sys_data->enter_event->event.type, size, 0, 0); 322 if (!event) 323 return; 324 325 entry = ring_buffer_event_data(event); 326 entry->nr = syscall_nr; 327 syscall_get_arguments(current, regs, 0, sys_data->nb_args, entry->args); 328 329 if (!filter_current_check_discard(buffer, sys_data->enter_event, 330 entry, event)) 331 trace_current_buffer_unlock_commit(buffer, event, 0, 0); 332 } 333 334 void ftrace_syscall_exit(void *ignore, struct pt_regs *regs, long ret) 335 { 336 struct syscall_trace_exit *entry; 337 struct syscall_metadata *sys_data; 338 struct ring_buffer_event *event; 339 struct ring_buffer *buffer; 340 int syscall_nr; 341 342 syscall_nr = syscall_get_nr(current, regs); 343 if (syscall_nr < 0) 344 return; 345 if (!test_bit(syscall_nr, enabled_exit_syscalls)) 346 return; 347 348 sys_data = syscall_nr_to_meta(syscall_nr); 349 if (!sys_data) 350 return; 351 352 event = trace_current_buffer_lock_reserve(&buffer, 353 sys_data->exit_event->event.type, sizeof(*entry), 0, 0); 354 if (!event) 355 return; 356 357 entry = ring_buffer_event_data(event); 358 entry->nr = syscall_nr; 359 entry->ret = syscall_get_return_value(current, regs); 360 361 if (!filter_current_check_discard(buffer, sys_data->exit_event, 362 entry, event)) 363 trace_current_buffer_unlock_commit(buffer, event, 0, 0); 364 } 365 366 int reg_event_syscall_enter(struct ftrace_event_call *call) 367 { 368 int ret = 0; 369 int num; 370 371 num = ((struct syscall_metadata *)call->data)->syscall_nr; 372 if (WARN_ON_ONCE(num < 0 || num >= NR_syscalls)) 373 return -ENOSYS; 374 mutex_lock(&syscall_trace_lock); 375 if (!sys_refcount_enter) 376 ret = register_trace_sys_enter(ftrace_syscall_enter, NULL); 377 if (!ret) { 378 set_bit(num, enabled_enter_syscalls); 379 sys_refcount_enter++; 380 } 381 mutex_unlock(&syscall_trace_lock); 382 return ret; 383 } 384 385 void unreg_event_syscall_enter(struct ftrace_event_call *call) 386 { 387 int num; 388 389 num = ((struct syscall_metadata *)call->data)->syscall_nr; 390 if (WARN_ON_ONCE(num < 0 || num >= NR_syscalls)) 391 return; 392 mutex_lock(&syscall_trace_lock); 393 sys_refcount_enter--; 394 clear_bit(num, enabled_enter_syscalls); 395 if (!sys_refcount_enter) 396 unregister_trace_sys_enter(ftrace_syscall_enter, NULL); 397 mutex_unlock(&syscall_trace_lock); 398 } 399 400 int reg_event_syscall_exit(struct ftrace_event_call *call) 401 { 402 int ret = 0; 403 int num; 404 405 num = ((struct syscall_metadata *)call->data)->syscall_nr; 406 if (WARN_ON_ONCE(num < 0 || num >= NR_syscalls)) 407 return -ENOSYS; 408 mutex_lock(&syscall_trace_lock); 409 if (!sys_refcount_exit) 410 ret = register_trace_sys_exit(ftrace_syscall_exit, NULL); 411 if (!ret) { 412 set_bit(num, enabled_exit_syscalls); 413 sys_refcount_exit++; 414 } 415 mutex_unlock(&syscall_trace_lock); 416 return ret; 417 } 418 419 void unreg_event_syscall_exit(struct ftrace_event_call *call) 420 { 421 int num; 422 423 num = ((struct syscall_metadata *)call->data)->syscall_nr; 424 if (WARN_ON_ONCE(num < 0 || num >= NR_syscalls)) 425 return; 426 mutex_lock(&syscall_trace_lock); 427 sys_refcount_exit--; 428 clear_bit(num, enabled_exit_syscalls); 429 if (!sys_refcount_exit) 430 unregister_trace_sys_exit(ftrace_syscall_exit, NULL); 431 mutex_unlock(&syscall_trace_lock); 432 } 433 434 int init_syscall_trace(struct ftrace_event_call *call) 435 { 436 int id; 437 int num; 438 439 num = ((struct syscall_metadata *)call->data)->syscall_nr; 440 if (num < 0 || num >= NR_syscalls) { 441 pr_debug("syscall %s metadata not mapped, disabling ftrace event\n", 442 ((struct syscall_metadata *)call->data)->name); 443 return -ENOSYS; 444 } 445 446 if (set_syscall_print_fmt(call) < 0) 447 return -ENOMEM; 448 449 id = trace_event_raw_init(call); 450 451 if (id < 0) { 452 free_syscall_print_fmt(call); 453 return id; 454 } 455 456 return id; 457 } 458 459 unsigned long __init __weak arch_syscall_addr(int nr) 460 { 461 return (unsigned long)sys_call_table[nr]; 462 } 463 464 int __init init_ftrace_syscalls(void) 465 { 466 struct syscall_metadata *meta; 467 unsigned long addr; 468 int i; 469 470 syscalls_metadata = kzalloc(sizeof(*syscalls_metadata) * 471 NR_syscalls, GFP_KERNEL); 472 if (!syscalls_metadata) { 473 WARN_ON(1); 474 return -ENOMEM; 475 } 476 477 for (i = 0; i < NR_syscalls; i++) { 478 addr = arch_syscall_addr(i); 479 meta = find_syscall_meta(addr); 480 if (!meta) 481 continue; 482 483 meta->syscall_nr = i; 484 syscalls_metadata[i] = meta; 485 } 486 487 return 0; 488 } 489 core_initcall(init_ftrace_syscalls); 490 491 #ifdef CONFIG_PERF_EVENTS 492 493 static DECLARE_BITMAP(enabled_perf_enter_syscalls, NR_syscalls); 494 static DECLARE_BITMAP(enabled_perf_exit_syscalls, NR_syscalls); 495 static int sys_perf_refcount_enter; 496 static int sys_perf_refcount_exit; 497 498 static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id) 499 { 500 struct syscall_metadata *sys_data; 501 struct syscall_trace_enter *rec; 502 struct hlist_head *head; 503 int syscall_nr; 504 int rctx; 505 int size; 506 507 syscall_nr = syscall_get_nr(current, regs); 508 if (!test_bit(syscall_nr, enabled_perf_enter_syscalls)) 509 return; 510 511 sys_data = syscall_nr_to_meta(syscall_nr); 512 if (!sys_data) 513 return; 514 515 /* get the size after alignment with the u32 buffer size field */ 516 size = sizeof(unsigned long) * sys_data->nb_args + sizeof(*rec); 517 size = ALIGN(size + sizeof(u32), sizeof(u64)); 518 size -= sizeof(u32); 519 520 if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE, 521 "perf buffer not large enough")) 522 return; 523 524 rec = (struct syscall_trace_enter *)perf_trace_buf_prepare(size, 525 sys_data->enter_event->event.type, regs, &rctx); 526 if (!rec) 527 return; 528 529 rec->nr = syscall_nr; 530 syscall_get_arguments(current, regs, 0, sys_data->nb_args, 531 (unsigned long *)&rec->args); 532 533 head = this_cpu_ptr(sys_data->enter_event->perf_events); 534 perf_trace_buf_submit(rec, size, rctx, 0, 1, regs, head); 535 } 536 537 int perf_sysenter_enable(struct ftrace_event_call *call) 538 { 539 int ret = 0; 540 int num; 541 542 num = ((struct syscall_metadata *)call->data)->syscall_nr; 543 544 mutex_lock(&syscall_trace_lock); 545 if (!sys_perf_refcount_enter) 546 ret = register_trace_sys_enter(perf_syscall_enter, NULL); 547 if (ret) { 548 pr_info("event trace: Could not activate" 549 "syscall entry trace point"); 550 } else { 551 set_bit(num, enabled_perf_enter_syscalls); 552 sys_perf_refcount_enter++; 553 } 554 mutex_unlock(&syscall_trace_lock); 555 return ret; 556 } 557 558 void perf_sysenter_disable(struct ftrace_event_call *call) 559 { 560 int num; 561 562 num = ((struct syscall_metadata *)call->data)->syscall_nr; 563 564 mutex_lock(&syscall_trace_lock); 565 sys_perf_refcount_enter--; 566 clear_bit(num, enabled_perf_enter_syscalls); 567 if (!sys_perf_refcount_enter) 568 unregister_trace_sys_enter(perf_syscall_enter, NULL); 569 mutex_unlock(&syscall_trace_lock); 570 } 571 572 static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret) 573 { 574 struct syscall_metadata *sys_data; 575 struct syscall_trace_exit *rec; 576 struct hlist_head *head; 577 int syscall_nr; 578 int rctx; 579 int size; 580 581 syscall_nr = syscall_get_nr(current, regs); 582 if (!test_bit(syscall_nr, enabled_perf_exit_syscalls)) 583 return; 584 585 sys_data = syscall_nr_to_meta(syscall_nr); 586 if (!sys_data) 587 return; 588 589 /* We can probably do that at build time */ 590 size = ALIGN(sizeof(*rec) + sizeof(u32), sizeof(u64)); 591 size -= sizeof(u32); 592 593 /* 594 * Impossible, but be paranoid with the future 595 * How to put this check outside runtime? 596 */ 597 if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE, 598 "exit event has grown above perf buffer size")) 599 return; 600 601 rec = (struct syscall_trace_exit *)perf_trace_buf_prepare(size, 602 sys_data->exit_event->event.type, regs, &rctx); 603 if (!rec) 604 return; 605 606 rec->nr = syscall_nr; 607 rec->ret = syscall_get_return_value(current, regs); 608 609 head = this_cpu_ptr(sys_data->exit_event->perf_events); 610 perf_trace_buf_submit(rec, size, rctx, 0, 1, regs, head); 611 } 612 613 int perf_sysexit_enable(struct ftrace_event_call *call) 614 { 615 int ret = 0; 616 int num; 617 618 num = ((struct syscall_metadata *)call->data)->syscall_nr; 619 620 mutex_lock(&syscall_trace_lock); 621 if (!sys_perf_refcount_exit) 622 ret = register_trace_sys_exit(perf_syscall_exit, NULL); 623 if (ret) { 624 pr_info("event trace: Could not activate" 625 "syscall exit trace point"); 626 } else { 627 set_bit(num, enabled_perf_exit_syscalls); 628 sys_perf_refcount_exit++; 629 } 630 mutex_unlock(&syscall_trace_lock); 631 return ret; 632 } 633 634 void perf_sysexit_disable(struct ftrace_event_call *call) 635 { 636 int num; 637 638 num = ((struct syscall_metadata *)call->data)->syscall_nr; 639 640 mutex_lock(&syscall_trace_lock); 641 sys_perf_refcount_exit--; 642 clear_bit(num, enabled_perf_exit_syscalls); 643 if (!sys_perf_refcount_exit) 644 unregister_trace_sys_exit(perf_syscall_exit, NULL); 645 mutex_unlock(&syscall_trace_lock); 646 } 647 648 #endif /* CONFIG_PERF_EVENTS */ 649 650 static int syscall_enter_register(struct ftrace_event_call *event, 651 enum trace_reg type) 652 { 653 switch (type) { 654 case TRACE_REG_REGISTER: 655 return reg_event_syscall_enter(event); 656 case TRACE_REG_UNREGISTER: 657 unreg_event_syscall_enter(event); 658 return 0; 659 660 #ifdef CONFIG_PERF_EVENTS 661 case TRACE_REG_PERF_REGISTER: 662 return perf_sysenter_enable(event); 663 case TRACE_REG_PERF_UNREGISTER: 664 perf_sysenter_disable(event); 665 return 0; 666 #endif 667 } 668 return 0; 669 } 670 671 static int syscall_exit_register(struct ftrace_event_call *event, 672 enum trace_reg type) 673 { 674 switch (type) { 675 case TRACE_REG_REGISTER: 676 return reg_event_syscall_exit(event); 677 case TRACE_REG_UNREGISTER: 678 unreg_event_syscall_exit(event); 679 return 0; 680 681 #ifdef CONFIG_PERF_EVENTS 682 case TRACE_REG_PERF_REGISTER: 683 return perf_sysexit_enable(event); 684 case TRACE_REG_PERF_UNREGISTER: 685 perf_sysexit_disable(event); 686 return 0; 687 #endif 688 } 689 return 0; 690 } 691