// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (c) 2021, Microsoft Corporation.
 *
 * Authors:
 *   Beau Belgrave <beaub@linux.microsoft.com>
 */

#include <linux/bitmap.h>
#include <linux/cdev.h>
#include <linux/hashtable.h>
#include <linux/list.h>
#include <linux/io.h>
#include <linux/uio.h>
#include <linux/ioctl.h>
#include <linux/jhash.h>
#include <linux/trace_events.h>
#include <linux/tracefs.h>
#include <linux/types.h>
#include <linux/uaccess.h>
#include <uapi/linux/user_events.h>
#include "trace.h"
#include "trace_dynevent.h"

#define USER_EVENTS_PREFIX_LEN (sizeof(USER_EVENTS_PREFIX)-1)

#define FIELD_DEPTH_TYPE 0
#define FIELD_DEPTH_NAME 1
#define FIELD_DEPTH_SIZE 2

/*
 * Limits how many trace_event calls user processes can create:
 * Must be a multiple of PAGE_SIZE.
 */
#define MAX_PAGES 1
#define MAX_EVENTS (MAX_PAGES * PAGE_SIZE)

/* Limit how long an event name plus args can be within the subsystem. */
#define MAX_EVENT_DESC 512
#define EVENT_NAME(user_event) ((user_event)->tracepoint.name)
#define MAX_FIELD_ARRAY_SIZE 1024
#define MAX_FIELD_ARG_NAME 256

#define MAX_BPF_COPY_SIZE PAGE_SIZE
#define MAX_STACK_BPF_DATA 512

static char *register_page_data;

static DEFINE_MUTEX(reg_mutex);
static DEFINE_HASHTABLE(register_table, 4);
static DECLARE_BITMAP(page_bitmap, MAX_EVENTS);

/*
 * Stores per-event properties. As users register events within a file,
 * a user_event might be created if it does not already exist. These are
 * globally used and their lifetime is tied to the refcnt member. They
 * cannot go away until the refcnt reaches zero.
 */
struct user_event {
	struct tracepoint tracepoint;
	struct trace_event_call call;
	struct trace_event_class class;
	struct dyn_event devent;
	struct hlist_node node;
	struct list_head fields;
	atomic_t refcnt;
	int index;
	int flags;
};

/*
 * Stores per-file event references. As users register events within a
 * file, this structure is modified and freed via RCU. The lifetime of
 * this struct is tied to the lifetime of the file. These are not shared
 * and are only accessible by the file that created them.
 */
struct user_event_refs {
	struct rcu_head rcu;
	int count;
	struct user_event *events[];
};

typedef void (*user_event_func_t) (struct user_event *user, struct iov_iter *i,
				   void *tpdata);

static int user_event_parse(char *name, char *args, char *flags,
			    struct user_event **newuser);

static u32 user_event_key(char *name)
{
	return jhash(name, strlen(name), 0);
}

static __always_inline __must_check
size_t copy_nofault(void *addr, size_t bytes, struct iov_iter *i)
{
	size_t ret;

	pagefault_disable();

	ret = copy_from_iter_nocache(addr, bytes, i);

	pagefault_enable();

	return ret;
}

static struct list_head *user_event_get_fields(struct trace_event_call *call)
{
	struct user_event *user = (struct user_event *)call->data;

	return &user->fields;
}

/*
 * Parses a register command for user_events
 * Format: event_name[:FLAG1[,FLAG2...]] [field1[;field2...]]
 *
 * Example event named 'test' with a 20 char 'msg' field with an unsigned int
 * 'id' field after:
 * test char[20] msg;unsigned int id
 *
 * NOTE: Offsets are from the user data perspective; they are not from the
 * trace_entry/buffer perspective.
 * We automatically add the common property sizes to the offset for the
 * user.
 */
static int user_event_parse_cmd(char *raw_command, struct user_event **newuser)
{
	char *name = raw_command;
	char *args = strpbrk(name, " ");
	char *flags;

	if (args)
		*args++ = '\0';

	flags = strpbrk(name, ":");

	if (flags)
		*flags++ = '\0';

	return user_event_parse(name, args, flags, newuser);
}

static int user_field_array_size(const char *type)
{
	const char *start = strchr(type, '[');
	char val[8];
	char *bracket;
	int size = 0;

	if (start == NULL)
		return -EINVAL;

	if (strscpy(val, start + 1, sizeof(val)) <= 0)
		return -EINVAL;

	bracket = strchr(val, ']');

	if (!bracket)
		return -EINVAL;

	*bracket = '\0';

	if (kstrtouint(val, 0, &size))
		return -EINVAL;

	if (size > MAX_FIELD_ARRAY_SIZE)
		return -EINVAL;

	return size;
}

static int user_field_size(const char *type)
{
	/* long is not allowed from a user, since it's ambiguous in size */
	if (strcmp(type, "s64") == 0)
		return sizeof(s64);
	if (strcmp(type, "u64") == 0)
		return sizeof(u64);
	if (strcmp(type, "s32") == 0)
		return sizeof(s32);
	if (strcmp(type, "u32") == 0)
		return sizeof(u32);
	if (strcmp(type, "int") == 0)
		return sizeof(int);
	if (strcmp(type, "unsigned int") == 0)
		return sizeof(unsigned int);
	if (strcmp(type, "s16") == 0)
		return sizeof(s16);
	if (strcmp(type, "u16") == 0)
		return sizeof(u16);
	if (strcmp(type, "short") == 0)
		return sizeof(short);
	if (strcmp(type, "unsigned short") == 0)
		return sizeof(unsigned short);
	if (strcmp(type, "s8") == 0)
		return sizeof(s8);
	if (strcmp(type, "u8") == 0)
		return sizeof(u8);
	if (strcmp(type, "char") == 0)
		return sizeof(char);
	if (strcmp(type, "unsigned char") == 0)
		return sizeof(unsigned char);
	if (str_has_prefix(type, "char["))
		return user_field_array_size(type);
	if (str_has_prefix(type, "unsigned char["))
		return user_field_array_size(type);
	if (str_has_prefix(type, "__data_loc "))
		return sizeof(u32);
	if (str_has_prefix(type, "__rel_loc "))
		return sizeof(u32);

	/* Unknown basic type, error */
	return -EINVAL;
}

static void user_event_destroy_fields(struct user_event *user)
{
	struct ftrace_event_field *field, *next;
	struct list_head *head = &user->fields;

	list_for_each_entry_safe(field, next, head, link) {
		list_del(&field->link);
		kfree(field);
	}
}

static int user_event_add_field(struct user_event *user, const char *type,
				const char *name, int offset, int size,
				int is_signed, int filter_type)
{
	struct ftrace_event_field *field;

	field = kmalloc(sizeof(*field), GFP_KERNEL);

	if (!field)
		return -ENOMEM;

	field->type = type;
	field->name = name;
	field->offset = offset;
	field->size = size;
	field->is_signed = is_signed;
	field->filter_type = filter_type;

	list_add(&field->link, &user->fields);

	return 0;
}

/*
 * Parses the values of a field within the description
 * Format: type name [size]
 */
static int user_event_parse_field(char *field, struct user_event *user,
				  u32 *offset)
{
	char *part, *type, *name;
	u32 depth = 0, saved_offset = *offset;
	int len, size = -EINVAL;
	bool is_struct = false;

	field = skip_spaces(field);

	if (*field == '\0')
		return 0;

	/* Handle types that have a space within */
	len = str_has_prefix(field, "unsigned ");
	if (len)
		goto skip_next;

	len = str_has_prefix(field, "struct ");
	if (len) {
		is_struct = true;
		goto skip_next;
	}

	len = str_has_prefix(field, "__data_loc unsigned ");
	if (len)
		goto skip_next;

	len = str_has_prefix(field, "__data_loc ");
	if (len)
		goto skip_next;

	len = str_has_prefix(field, "__rel_loc unsigned ");
	if (len)
		goto skip_next;

	len = str_has_prefix(field, "__rel_loc ");
	if (len)
		goto skip_next;

	goto parse;
skip_next:
	type = field;
	field = strpbrk(field + len, " ");

	if (field == NULL)
		return -EINVAL;

	*field++ = '\0';
	depth++;
parse:
	while ((part = strsep(&field, " ")) != NULL) {
		switch (depth++) {
		case FIELD_DEPTH_TYPE:
			type = part;
			break;
		case FIELD_DEPTH_NAME:
			name = part;
			break;
		case FIELD_DEPTH_SIZE:
			if (!is_struct)
				return -EINVAL;

			if (kstrtou32(part, 10, &size))
				return -EINVAL;
			break;
		default:
			return -EINVAL;
		}
	}

	if (depth < FIELD_DEPTH_SIZE)
		return -EINVAL;

	if (depth == FIELD_DEPTH_SIZE)
		size = user_field_size(type);

	if (size == 0)
		return -EINVAL;

	if (size < 0)
		return size;

	*offset = saved_offset + size;

	return user_event_add_field(user, type, name, saved_offset, size,
				    type[0] != 'u', FILTER_OTHER);
}

static void user_event_parse_flags(struct user_event *user, char *flags)
{
	char *flag;

	if (flags == NULL)
		return;

	while ((flag = strsep(&flags, ",")) != NULL) {
		if (strcmp(flag, "BPF_ITER") == 0)
			user->flags |= FLAG_BPF_ITER;
	}
}

static int user_event_parse_fields(struct user_event *user, char *args)
{
	char *field;
	u32 offset = sizeof(struct trace_entry);
	int ret = -EINVAL;

	if (args == NULL)
		return 0;

	while ((field = strsep(&args, ";")) != NULL) {
		ret = user_event_parse_field(field, user, &offset);

		if (ret)
			break;
	}

	return ret;
}

static struct trace_event_fields user_event_fields_array[1];

static const char *user_field_format(const char *type)
{
	if (strcmp(type, "s64") == 0)
		return "%lld";
	if (strcmp(type, "u64") == 0)
		return "%llu";
	if (strcmp(type, "s32") == 0)
		return "%d";
	if (strcmp(type, "u32") == 0)
		return "%u";
	if (strcmp(type, "int") == 0)
		return "%d";
	if (strcmp(type, "unsigned int") == 0)
		return "%u";
	if (strcmp(type, "s16") == 0)
		return "%d";
	if (strcmp(type, "u16") == 0)
		return "%u";
	if (strcmp(type, "short") == 0)
		return "%d";
	if (strcmp(type, "unsigned short") == 0)
		return "%u";
	if (strcmp(type, "s8") == 0)
		return "%d";
	if (strcmp(type, "u8") == 0)
		return "%u";
	if (strcmp(type, "char") == 0)
		return "%d";
	if (strcmp(type, "unsigned char") == 0)
		return "%u";
	if (strstr(type, "char[") != 0)
		return "%s";

	/* Unknown type (likely a struct); allowed, treat as 64-bit */
	return "%llu";
}

static bool user_field_is_dyn_string(const char *type, const char **str_func)
{
	if (str_has_prefix(type, "__data_loc ")) {
		*str_func = "__get_str";
		goto check;
	}

	if (str_has_prefix(type, "__rel_loc ")) {
		*str_func = "__get_rel_str";
		goto check;
	}

	return false;
check:
	return strstr(type, "char") != 0;
}

#define LEN_OR_ZERO (len ? len - pos : 0)
static int user_event_set_print_fmt(struct user_event *user, char *buf, int len)
{
	struct ftrace_event_field *field, *next;
	struct list_head *head = &user->fields;
	int pos = 0, depth = 0;
	const char *str_func;

	pos += snprintf(buf + pos, LEN_OR_ZERO, "\"");

	list_for_each_entry_safe_reverse(field, next, head, link) {
		if (depth != 0)
			pos += snprintf(buf + pos, LEN_OR_ZERO, " ");

		pos += snprintf(buf + pos, LEN_OR_ZERO, "%s=%s",
				field->name, user_field_format(field->type));

		depth++;
	}

	pos += snprintf(buf + pos, LEN_OR_ZERO, "\"");

	list_for_each_entry_safe_reverse(field, next, head, link) {
		if (user_field_is_dyn_string(field->type, &str_func))
			pos += snprintf(buf + pos, LEN_OR_ZERO,
					", %s(%s)", str_func, field->name);
		else
			pos += snprintf(buf + pos, LEN_OR_ZERO,
					", REC->%s", field->name);
	}

	return pos + 1;
}
#undef LEN_OR_ZERO

static int user_event_create_print_fmt(struct user_event *user)
{
	char *print_fmt;
	int len;

	len = user_event_set_print_fmt(user, NULL, 0);

	print_fmt = kmalloc(len, GFP_KERNEL);

	if (!print_fmt)
		return -ENOMEM;

	user_event_set_print_fmt(user, print_fmt, len);

	user->call.print_fmt = print_fmt;

	return 0;
}

static enum print_line_t user_event_print_trace(struct trace_iterator *iter,
						int flags,
						struct trace_event *event)
{
	/* Unsafe to try to decode user provided print_fmt, use hex */
	trace_print_hex_dump_seq(&iter->seq, "", DUMP_PREFIX_OFFSET, 16,
				 1, iter->ent, iter->ent_size, true);

	return trace_handle_return(&iter->seq);
}

static struct trace_event_functions user_event_funcs = {
	.trace = user_event_print_trace,
};

static int destroy_user_event(struct user_event *user)
{
	int ret = 0;

	/* Must destroy fields before call removal */
	user_event_destroy_fields(user);

	ret = trace_remove_event_call(&user->call);

	if (ret)
		return ret;

	dyn_event_remove(&user->devent);

	register_page_data[user->index] = 0;
	clear_bit(user->index, page_bitmap);
	hash_del(&user->node);

	kfree(user->call.print_fmt);
	kfree(EVENT_NAME(user));
	kfree(user);

	return ret;
}

static struct user_event *find_user_event(char *name, u32 *outkey)
{
	struct user_event *user;
	u32 key = user_event_key(name);

	*outkey = key;

	hash_for_each_possible(register_table, user, node, key)
		if (!strcmp(EVENT_NAME(user), name))
			return user;

	return NULL;
}

/*
 * Writes the user supplied payload out to a trace file.
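 * The payload is copied with page faults disabled; if the user memory
 * faults during the copy, the reserved ring buffer event is discarded
 * rather than committed.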
 */
static void user_event_ftrace(struct user_event *user, struct iov_iter *i,
			      void *tpdata)
{
	struct trace_event_file *file;
	struct trace_entry *entry;
	struct trace_event_buffer event_buffer;

	file = (struct trace_event_file *)tpdata;

	if (!file ||
	    !(file->flags & EVENT_FILE_FL_ENABLED) ||
	    trace_trigger_soft_disabled(file))
		return;

	/* Allocates and fills trace_entry; the data payload follows at entry + 1 */
	entry = trace_event_buffer_reserve(&event_buffer, file,
					   sizeof(*entry) + i->count);

	if (unlikely(!entry))
		return;

	if (unlikely(!copy_nofault(entry + 1, i->count, i)))
		__trace_event_discard_commit(event_buffer.buffer,
					     event_buffer.event);
	else
		trace_event_buffer_commit(&event_buffer);
}

#ifdef CONFIG_PERF_EVENTS
static void user_event_bpf(struct user_event *user, struct iov_iter *i)
{
	struct user_bpf_context context;
	struct user_bpf_iter bpf_i;
	char fast_data[MAX_STACK_BPF_DATA];
	void *temp = NULL;

	if ((user->flags & FLAG_BPF_ITER) && iter_is_iovec(i)) {
		/* Raw iterator */
		context.data_type = USER_BPF_DATA_ITER;
		context.data_len = i->count;
		context.iter = &bpf_i;

		bpf_i.iov_offset = i->iov_offset;
		bpf_i.iov = i->iov;
		bpf_i.nr_segs = i->nr_segs;
	} else if (i->nr_segs == 1 && iter_is_iovec(i)) {
		/* Single buffer from user */
		context.data_type = USER_BPF_DATA_USER;
		context.data_len = i->count;
		context.udata = i->iov->iov_base + i->iov_offset;
	} else {
		/* Multi buffer from user */
		struct iov_iter copy = *i;
		size_t copy_size = min_t(size_t, i->count, MAX_BPF_COPY_SIZE);

		context.data_type = USER_BPF_DATA_KERNEL;
		context.kdata = fast_data;

		if (unlikely(copy_size > sizeof(fast_data))) {
			temp = kmalloc(copy_size, GFP_NOWAIT);

			if (temp)
				context.kdata = temp;
			else
				copy_size = sizeof(fast_data);
		}

		context.data_len = copy_nofault(context.kdata,
						copy_size, &copy);
	}

	trace_call_bpf(&user->call, &context);

	kfree(temp);
}

/*
 * Writes the user supplied payload out to perf ring buffer or eBPF program.
 */
static void user_event_perf(struct user_event *user, struct iov_iter *i,
			    void *tpdata)
{
	struct hlist_head *perf_head;

	if (bpf_prog_array_valid(&user->call))
		user_event_bpf(user, i);

	perf_head = this_cpu_ptr(user->call.perf_events);

	if (perf_head && !hlist_empty(perf_head)) {
		struct trace_entry *perf_entry;
		struct pt_regs *regs;
		size_t size = sizeof(*perf_entry) + i->count;
		int context;

		perf_entry = perf_trace_buf_alloc(ALIGN(size, 8),
						  &regs, &context);

		if (unlikely(!perf_entry))
			return;

		perf_fetch_caller_regs(regs);

		if (unlikely(!copy_nofault(perf_entry + 1, i->count, i))) {
			perf_swevent_put_recursion_context(context);
			return;
		}

		perf_trace_buf_submit(perf_entry, size, context,
				      user->call.event.type, 1, regs,
				      perf_head, NULL);
	}
}
#endif

/*
 * Update the register page that is shared between user processes.
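 * The status byte for the event records which subsystems currently have
 * probes attached (ftrace, perf or other), so user processes can check
 * it and skip the write entirely when nothing is listening.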
 */
static void update_reg_page_for(struct user_event *user)
{
	struct tracepoint *tp = &user->tracepoint;
	char status = 0;

	if (atomic_read(&tp->key.enabled) > 0) {
		struct tracepoint_func *probe_func_ptr;
		user_event_func_t probe_func;

		rcu_read_lock_sched();

		probe_func_ptr = rcu_dereference_sched(tp->funcs);

		if (probe_func_ptr) {
			do {
				probe_func = probe_func_ptr->func;

				if (probe_func == user_event_ftrace)
					status |= EVENT_STATUS_FTRACE;
#ifdef CONFIG_PERF_EVENTS
				else if (probe_func == user_event_perf)
					status |= EVENT_STATUS_PERF;
#endif
				else
					status |= EVENT_STATUS_OTHER;
			} while ((++probe_func_ptr)->func);
		}

		rcu_read_unlock_sched();
	}

	register_page_data[user->index] = status;
}

/*
 * Register callback for our events from tracing sub-systems.
 */
static int user_event_reg(struct trace_event_call *call,
			  enum trace_reg type,
			  void *data)
{
	struct user_event *user = (struct user_event *)call->data;
	int ret = 0;

	if (!user)
		return -ENOENT;

	switch (type) {
	case TRACE_REG_REGISTER:
		ret = tracepoint_probe_register(call->tp,
						call->class->probe,
						data);
		if (!ret)
			goto inc;
		break;

	case TRACE_REG_UNREGISTER:
		tracepoint_probe_unregister(call->tp,
					    call->class->probe,
					    data);
		goto dec;

#ifdef CONFIG_PERF_EVENTS
	case TRACE_REG_PERF_REGISTER:
		ret = tracepoint_probe_register(call->tp,
						call->class->perf_probe,
						data);
		if (!ret)
			goto inc;
		break;

	case TRACE_REG_PERF_UNREGISTER:
		tracepoint_probe_unregister(call->tp,
					    call->class->perf_probe,
					    data);
		goto dec;

	case TRACE_REG_PERF_OPEN:
	case TRACE_REG_PERF_CLOSE:
	case TRACE_REG_PERF_ADD:
	case TRACE_REG_PERF_DEL:
		break;
#endif
	}

	return ret;
inc:
	atomic_inc(&user->refcnt);
	update_reg_page_for(user);
	return 0;
dec:
	update_reg_page_for(user);
	atomic_dec(&user->refcnt);
	return 0;
}

static int user_event_create(const char *raw_command)
{
	struct user_event *user;
	char *name;
	int ret;

	if (!str_has_prefix(raw_command, USER_EVENTS_PREFIX))
		return -ECANCELED;

	raw_command += USER_EVENTS_PREFIX_LEN;
	raw_command = skip_spaces(raw_command);

	name = kstrdup(raw_command, GFP_KERNEL);

	if (!name)
		return -ENOMEM;

	mutex_lock(&reg_mutex);
	ret = user_event_parse_cmd(name, &user);
	mutex_unlock(&reg_mutex);

	if (ret)
		kfree(name);

	return ret;
}

static int user_event_show(struct seq_file *m, struct dyn_event *ev)
{
	struct user_event *user = container_of(ev, struct user_event, devent);
	struct ftrace_event_field *field, *next;
	struct list_head *head;
	int depth = 0;

	seq_printf(m, "%s%s", USER_EVENTS_PREFIX, EVENT_NAME(user));

	head = trace_get_fields(&user->call);

	list_for_each_entry_safe_reverse(field, next, head, link) {
		if (depth == 0)
			seq_puts(m, " ");
		else
			seq_puts(m, "; ");

		seq_printf(m, "%s %s", field->type, field->name);

		if (str_has_prefix(field->type, "struct "))
			seq_printf(m, " %d", field->size);

		depth++;
	}

	seq_puts(m, "\n");

	return 0;
}

static bool user_event_is_busy(struct dyn_event *ev)
{
	struct user_event *user = container_of(ev, struct user_event, devent);

	return atomic_read(&user->refcnt) != 0;
}

static int user_event_free(struct dyn_event *ev)
{
	struct user_event *user = container_of(ev, struct user_event, devent);

	if (atomic_read(&user->refcnt) != 0)
		return -EBUSY;

	return destroy_user_event(user);
}

static bool user_field_match(struct ftrace_event_field *field, int argc,
			     const char **argv, int *iout)
{
	char *field_name, *arg_name;
	int len, pos, i = *iout;
	bool colon = false, match = false;

	if (i >= argc)
		return false;

	len = MAX_FIELD_ARG_NAME;
	field_name = kmalloc(len, GFP_KERNEL);
	arg_name = kmalloc(len, GFP_KERNEL);

	if (!arg_name || !field_name)
		goto out;

	pos = 0;

	for (; i < argc; ++i) {
		if (i != *iout)
			pos += snprintf(arg_name + pos, len - pos, " ");

		pos += snprintf(arg_name + pos, len - pos, argv[i]);

		if (strchr(argv[i], ';')) {
			++i;
			colon = true;
			break;
		}
	}

	pos = 0;

	pos += snprintf(field_name + pos, len - pos, field->type);
	pos += snprintf(field_name + pos, len - pos, " ");
	pos += snprintf(field_name + pos, len - pos, field->name);

	if (colon)
		pos += snprintf(field_name + pos, len - pos, ";");

	*iout = i;

	match = strcmp(arg_name, field_name) == 0;
out:
	kfree(arg_name);
	kfree(field_name);

	return match;
}

static bool user_fields_match(struct user_event *user, int argc,
			      const char **argv)
{
	struct ftrace_event_field *field, *next;
	struct list_head *head = &user->fields;
	int i = 0;

	list_for_each_entry_safe_reverse(field, next, head, link)
		if (!user_field_match(field, argc, argv, &i))
			return false;

	if (i != argc)
		return false;

	return true;
}

static bool user_event_match(const char *system, const char *event,
			     int argc, const char **argv, struct dyn_event *ev)
{
	struct user_event *user = container_of(ev, struct user_event, devent);
	bool match;

	match = strcmp(EVENT_NAME(user), event) == 0 &&
		(!system || strcmp(system, USER_EVENTS_SYSTEM) == 0);

	if (match && argc > 0)
		match = user_fields_match(user, argc, argv);

	return match;
}

static struct dyn_event_operations user_event_dops = {
	.create = user_event_create,
	.show = user_event_show,
	.is_busy = user_event_is_busy,
	.free = user_event_free,
	.match = user_event_match,
};

static int user_event_trace_register(struct user_event *user)
{
	int ret;

	ret = register_trace_event(&user->call.event);

	if (!ret)
		return -ENODEV;

	ret = trace_add_event_call(&user->call);

	if (ret)
		unregister_trace_event(&user->call.event);

	return ret;
}

/*
 * Parses the event name, arguments and flags, then registers if successful.
 * The name buffer lifetime is owned by this method for success cases only.
 */
static int user_event_parse(char *name, char *args, char *flags,
			    struct user_event **newuser)
{
	int ret;
	int index;
	u32 key;
	struct user_event *user = find_user_event(name, &key);

	if (user) {
		*newuser = user;
		/*
		 * Name is allocated by caller, free it since it already exists.
		 * Caller only worries about failure cases for freeing.
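		 * No reference is taken here; the refcnt is only incremented
		 * later when a file links to the event or a probe registers.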
		 */
		kfree(name);
		return 0;
	}

	index = find_first_zero_bit(page_bitmap, MAX_EVENTS);

	if (index == MAX_EVENTS)
		return -EMFILE;

	user = kzalloc(sizeof(*user), GFP_KERNEL);

	if (!user)
		return -ENOMEM;

	INIT_LIST_HEAD(&user->class.fields);
	INIT_LIST_HEAD(&user->fields);

	user->tracepoint.name = name;

	user_event_parse_flags(user, flags);

	ret = user_event_parse_fields(user, args);

	if (ret)
		goto put_user;

	ret = user_event_create_print_fmt(user);

	if (ret)
		goto put_user;

	user->call.data = user;
	user->call.class = &user->class;
	user->call.name = name;
	user->call.flags = TRACE_EVENT_FL_TRACEPOINT;
	user->call.tp = &user->tracepoint;
	user->call.event.funcs = &user_event_funcs;

	user->class.system = USER_EVENTS_SYSTEM;
	user->class.fields_array = user_event_fields_array;
	user->class.get_fields = user_event_get_fields;
	user->class.reg = user_event_reg;
	user->class.probe = user_event_ftrace;
#ifdef CONFIG_PERF_EVENTS
	user->class.perf_probe = user_event_perf;
#endif

	mutex_lock(&event_mutex);
	ret = user_event_trace_register(user);
	mutex_unlock(&event_mutex);

	if (ret)
		goto put_user;

	user->index = index;
	dyn_event_init(&user->devent, &user_event_dops);
	dyn_event_add(&user->devent, &user->call);
	set_bit(user->index, page_bitmap);
	hash_add(register_table, &user->node, key);

	*newuser = user;
	return 0;
put_user:
	user_event_destroy_fields(user);
	kfree(user);
	return ret;
}

/*
 * Deletes a previously created event if it is no longer being used.
 */
static int delete_user_event(char *name)
{
	u32 key;
	int ret;
	struct user_event *user = find_user_event(name, &key);

	if (!user)
		return -ENOENT;

	if (atomic_read(&user->refcnt) != 0)
		return -EBUSY;

	mutex_lock(&event_mutex);
	ret = destroy_user_event(user);
	mutex_unlock(&event_mutex);

	return ret;
}

/*
 * Validates the user payload and writes via iterator.
 */
static ssize_t user_events_write_core(struct file *file, struct iov_iter *i)
{
	struct user_event_refs *refs;
	struct user_event *user = NULL;
	struct tracepoint *tp;
	ssize_t ret = i->count;
	int idx;

	if (unlikely(copy_from_iter(&idx, sizeof(idx), i) != sizeof(idx)))
		return -EFAULT;

	rcu_read_lock_sched();

	refs = rcu_dereference_sched(file->private_data);

	/*
	 * The refs->events array is protected by RCU, and new items may be
	 * added. But the user retrieved from indexing into the events array
	 * shall be immutable while the file is opened.
	 */
	if (likely(refs && idx < refs->count))
		user = refs->events[idx];

	rcu_read_unlock_sched();

	if (unlikely(user == NULL))
		return -ENOENT;

	tp = &user->tracepoint;

	/*
	 * It's possible key.enabled gets disabled after this check; however,
	 * we don't mind if a few events are included in this condition.
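	 * Each probe receives its own copy of the iterator below, so one
	 * handler consuming the data does not affect the next one.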
	 */
	if (likely(atomic_read(&tp->key.enabled) > 0)) {
		struct tracepoint_func *probe_func_ptr;
		user_event_func_t probe_func;
		struct iov_iter copy;
		void *tpdata;

		if (unlikely(fault_in_iov_iter_readable(i, i->count)))
			return -EFAULT;

		rcu_read_lock_sched();

		probe_func_ptr = rcu_dereference_sched(tp->funcs);

		if (probe_func_ptr) {
			do {
				copy = *i;
				probe_func = probe_func_ptr->func;
				tpdata = probe_func_ptr->data;
				probe_func(user, &copy, tpdata);
			} while ((++probe_func_ptr)->func);
		}

		rcu_read_unlock_sched();
	}

	return ret;
}

static ssize_t user_events_write(struct file *file, const char __user *ubuf,
				 size_t count, loff_t *ppos)
{
	struct iovec iov;
	struct iov_iter i;

	if (unlikely(*ppos != 0))
		return -EFAULT;

	if (unlikely(import_single_range(READ, (char *)ubuf, count, &iov, &i)))
		return -EFAULT;

	return user_events_write_core(file, &i);
}

static ssize_t user_events_write_iter(struct kiocb *kp, struct iov_iter *i)
{
	return user_events_write_core(kp->ki_filp, i);
}

static int user_events_ref_add(struct file *file, struct user_event *user)
{
	struct user_event_refs *refs, *new_refs;
	int i, size, count = 0;

	refs = rcu_dereference_protected(file->private_data,
					 lockdep_is_held(&reg_mutex));

	if (refs) {
		count = refs->count;

		for (i = 0; i < count; ++i)
			if (refs->events[i] == user)
				return i;
	}

	size = struct_size(refs, events, count + 1);

	new_refs = kzalloc(size, GFP_KERNEL);

	if (!new_refs)
		return -ENOMEM;

	new_refs->count = count + 1;

	for (i = 0; i < count; ++i)
		new_refs->events[i] = refs->events[i];

	new_refs->events[i] = user;

	atomic_inc(&user->refcnt);

	rcu_assign_pointer(file->private_data, new_refs);

	if (refs)
		kfree_rcu(refs, rcu);

	return i;
}

static long user_reg_get(struct user_reg __user *ureg, struct user_reg *kreg)
{
	u32 size;
	long ret;

	ret = get_user(size, &ureg->size);

	if (ret)
		return ret;

	if (size > PAGE_SIZE)
		return -E2BIG;

	return copy_struct_from_user(kreg, sizeof(*kreg), ureg, size);
}

/*
 * Registers a user_event on behalf of a user process.
 */
static long user_events_ioctl_reg(struct file *file, unsigned long uarg)
{
	struct user_reg __user *ureg = (struct user_reg __user *)uarg;
	struct user_reg reg;
	struct user_event *user;
	char *name;
	long ret;

	ret = user_reg_get(ureg, &reg);

	if (ret)
		return ret;

	name = strndup_user((const char __user *)(uintptr_t)reg.name_args,
			    MAX_EVENT_DESC);

	if (IS_ERR(name)) {
		ret = PTR_ERR(name);
		return ret;
	}

	ret = user_event_parse_cmd(name, &user);

	if (ret) {
		kfree(name);
		return ret;
	}

	ret = user_events_ref_add(file, user);

	/* Positive number is index and valid */
	if (ret < 0)
		return ret;

	put_user((u32)ret, &ureg->write_index);
	put_user(user->index, &ureg->status_index);

	return 0;
}

/*
 * Deletes a user_event on behalf of a user process.
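 * The event is only removed if its refcnt is zero, so a name that is
 * still linked by an open file or an attached probe returns -EBUSY.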
 */
static long user_events_ioctl_del(struct file *file, unsigned long uarg)
{
	void __user *ubuf = (void __user *)uarg;
	char *name;
	long ret;

	name = strndup_user(ubuf, MAX_EVENT_DESC);

	if (IS_ERR(name))
		return PTR_ERR(name);

	ret = delete_user_event(name);

	kfree(name);

	return ret;
}

/*
 * Handles the ioctl from user mode to register or alter operations.
 */
static long user_events_ioctl(struct file *file, unsigned int cmd,
			      unsigned long uarg)
{
	long ret = -ENOTTY;

	switch (cmd) {
	case DIAG_IOCSREG:
		mutex_lock(&reg_mutex);
		ret = user_events_ioctl_reg(file, uarg);
		mutex_unlock(&reg_mutex);
		break;

	case DIAG_IOCSDEL:
		mutex_lock(&reg_mutex);
		ret = user_events_ioctl_del(file, uarg);
		mutex_unlock(&reg_mutex);
		break;
	}

	return ret;
}

/*
 * Handles the final close of the file from user mode.
 */
static int user_events_release(struct inode *node, struct file *file)
{
	struct user_event_refs *refs;
	struct user_event *user;
	int i;

	/*
	 * Ensure refs cannot change under any situation by taking the
	 * register mutex during the final freeing of the references.
	 */
	mutex_lock(&reg_mutex);

	refs = file->private_data;

	if (!refs)
		goto out;

	/*
	 * The lifetime of refs has reached an end, it's tied to this file.
	 * The underlying user_events are ref counted, and cannot be freed.
	 * After this decrement, the user_events may be freed elsewhere.
	 */
	for (i = 0; i < refs->count; ++i) {
		user = refs->events[i];

		if (user)
			atomic_dec(&user->refcnt);
	}
out:
	file->private_data = NULL;

	mutex_unlock(&reg_mutex);

	kfree(refs);

	return 0;
}

static const struct file_operations user_data_fops = {
	.write = user_events_write,
	.write_iter = user_events_write_iter,
	.unlocked_ioctl = user_events_ioctl,
	.release = user_events_release,
};

/*
 * Maps the shared page into the user process for checking if event is enabled.
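 * The mapping must cover exactly MAX_EVENTS bytes and the pages are
 * mapped read-only (vm_get_page_prot(VM_READ)).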
 */
static int user_status_mmap(struct file *file, struct vm_area_struct *vma)
{
	unsigned long size = vma->vm_end - vma->vm_start;

	if (size != MAX_EVENTS)
		return -EINVAL;

	return remap_pfn_range(vma, vma->vm_start,
			       virt_to_phys(register_page_data) >> PAGE_SHIFT,
			       size, vm_get_page_prot(VM_READ));
}

static void *user_seq_start(struct seq_file *m, loff_t *pos)
{
	if (*pos)
		return NULL;

	return (void *)1;
}

static void *user_seq_next(struct seq_file *m, void *p, loff_t *pos)
{
	++*pos;
	return NULL;
}

static void user_seq_stop(struct seq_file *m, void *p)
{
}

static int user_seq_show(struct seq_file *m, void *p)
{
	struct user_event *user;
	char status;
	int i, active = 0, busy = 0, flags;

	mutex_lock(&reg_mutex);

	hash_for_each(register_table, i, user, node) {
		status = register_page_data[user->index];
		flags = user->flags;

		seq_printf(m, "%d:%s", user->index, EVENT_NAME(user));

		if (flags != 0 || status != 0)
			seq_puts(m, " #");

		if (status != 0) {
			seq_puts(m, " Used by");
			if (status & EVENT_STATUS_FTRACE)
				seq_puts(m, " ftrace");
			if (status & EVENT_STATUS_PERF)
				seq_puts(m, " perf");
			if (status & EVENT_STATUS_OTHER)
				seq_puts(m, " other");
			busy++;
		}

		if (flags & FLAG_BPF_ITER)
			seq_puts(m, " FLAG:BPF_ITER");

		seq_puts(m, "\n");
		active++;
	}

	mutex_unlock(&reg_mutex);

	seq_puts(m, "\n");
	seq_printf(m, "Active: %d\n", active);
	seq_printf(m, "Busy: %d\n", busy);
	seq_printf(m, "Max: %ld\n", MAX_EVENTS);

	return 0;
}

static const struct seq_operations user_seq_ops = {
	.start = user_seq_start,
	.next = user_seq_next,
	.stop = user_seq_stop,
	.show = user_seq_show,
};

static int user_status_open(struct inode *node, struct file *file)
{
	return seq_open(file, &user_seq_ops);
}

static const struct file_operations user_status_fops = {
	.open = user_status_open,
	.mmap = user_status_mmap,
	.read = seq_read,
	.llseek = seq_lseek,
	.release = seq_release,
};

/*
 * Creates a set of tracefs files to allow user mode interactions.
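 * user_events_data provides the write/ioctl interface for registering and
 * emitting events, while user_events_status provides the mmap'able status
 * page along with a human readable summary via read().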
 */
static int create_user_tracefs(void)
{
	struct dentry *edata, *emmap;

	edata = tracefs_create_file("user_events_data", TRACE_MODE_WRITE,
				    NULL, NULL, &user_data_fops);

	if (!edata) {
		pr_warn("Could not create tracefs 'user_events_data' entry\n");
		goto err;
	}

	/* mmap with MAP_SHARED requires writable fd */
	emmap = tracefs_create_file("user_events_status", TRACE_MODE_WRITE,
				    NULL, NULL, &user_status_fops);

	if (!emmap) {
		tracefs_remove(edata);
		pr_warn("Could not create tracefs 'user_events_status' entry\n");
		goto err;
	}

	return 0;
err:
	return -ENODEV;
}

static void set_page_reservations(bool set)
{
	int page;

	for (page = 0; page < MAX_PAGES; ++page) {
		void *addr = register_page_data + (PAGE_SIZE * page);

		if (set)
			SetPageReserved(virt_to_page(addr));
		else
			ClearPageReserved(virt_to_page(addr));
	}
}

static int __init trace_events_user_init(void)
{
	int ret;

	/* Zero all bits beside 0 (which is reserved for failures) */
	bitmap_zero(page_bitmap, MAX_EVENTS);
	set_bit(0, page_bitmap);

	register_page_data = kzalloc(MAX_EVENTS, GFP_KERNEL);

	if (!register_page_data)
		return -ENOMEM;

	set_page_reservations(true);

	ret = create_user_tracefs();

	if (ret) {
		pr_warn("user_events could not register with tracefs\n");
		set_page_reservations(false);
		kfree(register_page_data);
		return ret;
	}

	if (dyn_event_register(&user_event_dops))
		pr_warn("user_events could not register with dyn_events\n");

	return 0;
}

fs_initcall(trace_events_user_init);