// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (c) 2021, Microsoft Corporation.
 *
 * Authors:
 *   Beau Belgrave <beaub@linux.microsoft.com>
 */

#include <linux/bitmap.h>
#include <linux/cdev.h>
#include <linux/hashtable.h>
#include <linux/list.h>
#include <linux/io.h>
#include <linux/uio.h>
#include <linux/ioctl.h>
#include <linux/jhash.h>
#include <linux/trace_events.h>
#include <linux/tracefs.h>
#include <linux/types.h>
#include <linux/uaccess.h>
#include <uapi/linux/user_events.h>
#include "trace.h"
#include "trace_dynevent.h"

#define USER_EVENTS_PREFIX_LEN (sizeof(USER_EVENTS_PREFIX)-1)

#define FIELD_DEPTH_TYPE 0
#define FIELD_DEPTH_NAME 1
#define FIELD_DEPTH_SIZE 2

/*
 * Limits how many trace_event calls user processes can create:
 * Must be a power-of-two multiple of PAGE_SIZE.
 */
#define MAX_PAGE_ORDER 0
#define MAX_PAGES (1 << MAX_PAGE_ORDER)
#define MAX_EVENTS (MAX_PAGES * PAGE_SIZE)

/* Limit how long an event name plus args can be within the subsystem. */
#define MAX_EVENT_DESC 512
#define EVENT_NAME(user_event) ((user_event)->tracepoint.name)
#define MAX_FIELD_ARRAY_SIZE 1024
#define MAX_FIELD_ARG_NAME 256

#define MAX_BPF_COPY_SIZE PAGE_SIZE
#define MAX_STACK_BPF_DATA 512

static char *register_page_data;

static DEFINE_MUTEX(reg_mutex);
static DEFINE_HASHTABLE(register_table, 4);
static DECLARE_BITMAP(page_bitmap, MAX_EVENTS);

/*
 * Stores per-event properties. As users register events within a file,
 * a user_event might be created if it does not already exist. These are
 * globally used and their lifetime is tied to the refcnt member. They
 * cannot go away until the refcnt reaches zero.
 */
struct user_event {
	struct tracepoint tracepoint;
	struct trace_event_call call;
	struct trace_event_class class;
	struct dyn_event devent;
	struct hlist_node node;
	struct list_head fields;
	struct list_head validators;
	atomic_t refcnt;
	int index;
	int flags;
	int min_size;
};

/*
 * Stores per-file event references. As users register events within a file,
 * this structure is modified and freed via RCU. The lifetime of this struct
 * is tied to the lifetime of the file. These are not shared and are only
 * accessible by the file that created them.
 */
struct user_event_refs {
	struct rcu_head rcu;
	int count;
	struct user_event *events[];
};

#define VALIDATOR_ENSURE_NULL (1 << 0)
#define VALIDATOR_REL (1 << 1)

struct user_event_validator {
	struct list_head link;
	int offset;
	int flags;
};

typedef void (*user_event_func_t) (struct user_event *user, struct iov_iter *i,
				   void *tpdata, bool *faulted);

static int user_event_parse(char *name, char *args, char *flags,
			    struct user_event **newuser);

static u32 user_event_key(char *name)
{
	return jhash(name, strlen(name), 0);
}

static __always_inline __must_check
size_t copy_nofault(void *addr, size_t bytes, struct iov_iter *i)
{
	size_t ret;

	pagefault_disable();

	ret = copy_from_iter_nocache(addr, bytes, i);

	pagefault_enable();

	return ret;
}

static struct list_head *user_event_get_fields(struct trace_event_call *call)
{
	struct user_event *user = (struct user_event *)call->data;

	return &user->fields;
}

/*
 * Parses a register command for user_events
 * Format: event_name[:FLAG1[,FLAG2...]] [field1[;field2...]]
 *
 * Example event named 'test' with a 20 char 'msg' field followed by an
 * unsigned int 'id' field:
 * test char[20] msg;unsigned int id
 *
 * NOTE: Offsets are from the user data perspective, they are not from the
 * trace_entry/buffer perspective. We automatically add the common properties
 * sizes to the offset for the user.
 *
 * Upon success user_event has its ref count increased by 1.
 */
static int user_event_parse_cmd(char *raw_command, struct user_event **newuser)
{
	char *name = raw_command;
	char *args = strpbrk(name, " ");
	char *flags;

	if (args)
		*args++ = '\0';

	flags = strpbrk(name, ":");

	if (flags)
		*flags++ = '\0';

	return user_event_parse(name, args, flags, newuser);
}

static int user_field_array_size(const char *type)
{
	const char *start = strchr(type, '[');
	char val[8];
	char *bracket;
	int size = 0;

	if (start == NULL)
		return -EINVAL;

	if (strscpy(val, start + 1, sizeof(val)) <= 0)
		return -EINVAL;

	bracket = strchr(val, ']');

	if (!bracket)
		return -EINVAL;

	*bracket = '\0';

	if (kstrtouint(val, 0, &size))
		return -EINVAL;

	if (size > MAX_FIELD_ARRAY_SIZE)
		return -EINVAL;

	return size;
}

static int user_field_size(const char *type)
{
	/* long is not allowed from a user, since it's ambiguous in size */
	if (strcmp(type, "s64") == 0)
		return sizeof(s64);
	if (strcmp(type, "u64") == 0)
		return sizeof(u64);
	if (strcmp(type, "s32") == 0)
		return sizeof(s32);
	if (strcmp(type, "u32") == 0)
		return sizeof(u32);
	if (strcmp(type, "int") == 0)
		return sizeof(int);
	if (strcmp(type, "unsigned int") == 0)
		return sizeof(unsigned int);
	if (strcmp(type, "s16") == 0)
		return sizeof(s16);
	if (strcmp(type, "u16") == 0)
		return sizeof(u16);
	if (strcmp(type, "short") == 0)
		return sizeof(short);
	if (strcmp(type, "unsigned short") == 0)
		return sizeof(unsigned short);
	if (strcmp(type, "s8") == 0)
		return sizeof(s8);
	if (strcmp(type, "u8") == 0)
		return sizeof(u8);
	if (strcmp(type, "char") == 0)
		return sizeof(char);
	if (strcmp(type, "unsigned char") == 0)
		return sizeof(unsigned char);
	if (str_has_prefix(type, "char["))
		return user_field_array_size(type);
	if (str_has_prefix(type, "unsigned char["))
		return user_field_array_size(type);
	if (str_has_prefix(type, "__data_loc "))
		return sizeof(u32);
	if (str_has_prefix(type, "__rel_loc "))
		return sizeof(u32);

	/* Unknown basic type, error */
	return -EINVAL;
}

static void user_event_destroy_validators(struct user_event *user)
{
	struct user_event_validator *validator, *next;
	struct list_head *head = &user->validators;

	list_for_each_entry_safe(validator, next, head, link) {
		list_del(&validator->link);
		kfree(validator);
	}
}

static void user_event_destroy_fields(struct user_event *user)
{
	struct ftrace_event_field *field, *next;
	struct list_head *head = &user->fields;

	list_for_each_entry_safe(field, next, head, link) {
		list_del(&field->link);
		kfree(field);
	}
}

static int user_event_add_field(struct user_event *user, const char *type,
				const char *name, int offset, int size,
				int is_signed, int filter_type)
{
	struct user_event_validator *validator;
	struct ftrace_event_field *field;
	int validator_flags = 0;

	field = kmalloc(sizeof(*field), GFP_KERNEL);

	if (!field)
		return -ENOMEM;

	if (str_has_prefix(type, "__data_loc "))
		goto add_validator;

	if (str_has_prefix(type, "__rel_loc ")) {
		validator_flags |= VALIDATOR_REL;
		goto add_validator;
	}

	goto add_field;

add_validator:
	if (strstr(type, "char") != 0)
		validator_flags |= VALIDATOR_ENSURE_NULL;

	validator = kmalloc(sizeof(*validator), GFP_KERNEL);

	if (!validator) {
		kfree(field);
		return -ENOMEM;
	}

	validator->flags = validator_flags;
	validator->offset = offset;

	/* Want sequential access when validating */
	list_add_tail(&validator->link, &user->validators);

add_field:
	field->type = type;
	field->name = name;
	field->offset = offset;
	field->size = size;
	field->is_signed = is_signed;
	field->filter_type = filter_type;

	list_add(&field->link, &user->fields);

	/*
	 * Minimum size required from user writes; this does not include
	 * the size of trace_entry (common fields).
	 */
	user->min_size = (offset + size) - sizeof(struct trace_entry);

	return 0;
}

/*
 * Parses the values of a field within the description
 * Format: type name [size]
 */
static int user_event_parse_field(char *field, struct user_event *user,
				  u32 *offset)
{
	char *part, *type, *name;
	u32 depth = 0, saved_offset = *offset;
	int len, size = -EINVAL;
	bool is_struct = false;

	field = skip_spaces(field);

	if (*field == '\0')
		return 0;

	/* Handle types that have a space within */
	len = str_has_prefix(field, "unsigned ");
	if (len)
		goto skip_next;

	len = str_has_prefix(field, "struct ");
	if (len) {
		is_struct = true;
		goto skip_next;
	}

	len = str_has_prefix(field, "__data_loc unsigned ");
	if (len)
		goto skip_next;

	len = str_has_prefix(field, "__data_loc ");
	if (len)
		goto skip_next;

	len = str_has_prefix(field, "__rel_loc unsigned ");
	if (len)
		goto skip_next;

	len = str_has_prefix(field, "__rel_loc ");
	if (len)
		goto skip_next;

	goto parse;
skip_next:
	type = field;
	field = strpbrk(field + len, " ");

	if (field == NULL)
		return -EINVAL;

	*field++ = '\0';
	depth++;
parse:
	name = NULL;

	while ((part = strsep(&field, " ")) != NULL) {
		switch (depth++) {
		case FIELD_DEPTH_TYPE:
			type = part;
			break;
		case FIELD_DEPTH_NAME:
			name = part;
			break;
		case FIELD_DEPTH_SIZE:
			if (!is_struct)
				return -EINVAL;

			if (kstrtou32(part, 10, &size))
				return -EINVAL;
			break;
		default:
			return -EINVAL;
		}
	}

	if (depth < FIELD_DEPTH_SIZE || !name)
		return -EINVAL;

	if (depth == FIELD_DEPTH_SIZE)
		size = user_field_size(type);

	if (size == 0)
		return -EINVAL;

	if (size < 0)
		return size;

	*offset = saved_offset + size;

	return user_event_add_field(user, type, name, saved_offset, size,
				    type[0] != 'u', FILTER_OTHER);
}

static void user_event_parse_flags(struct user_event *user, char *flags)
{
	char *flag;

	if (flags == NULL)
		return;

	while ((flag = strsep(&flags, ",")) != NULL) {
		if (strcmp(flag, "BPF_ITER") == 0)
			user->flags |= FLAG_BPF_ITER;
	}
}

static int user_event_parse_fields(struct user_event *user, char *args)
{
	char *field;
	u32 offset = sizeof(struct trace_entry);
	int ret = -EINVAL;

	if (args == NULL)
		return 0;

	while ((field = strsep(&args, ";")) != NULL) {
		ret = user_event_parse_field(field, user, &offset);

		if (ret)
			break;
	}

	return ret;
}

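/*
 * Illustration only (not part of the parser): for the example command
 * "test char[20] msg;unsigned int id", the loop above walks each ';'
 * separated field and accumulates offsets with no implicit padding,
 * starting at sizeof(struct trace_entry). The payload a process writes
 * after its write index therefore matches a packed layout such as the
 * hypothetical struct below, and user->min_size ends up as
 * 20 + sizeof(unsigned int) bytes.
 *
 *	struct test_payload {
 *		char msg[20];		(user data offset 0)
 *		unsigned int id;	(user data offset 20)
 *	} __attribute__((packed));
 */
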
static struct trace_event_fields user_event_fields_array[1];

static const char *user_field_format(const char *type)
{
	if (strcmp(type, "s64") == 0)
		return "%lld";
	if (strcmp(type, "u64") == 0)
		return "%llu";
	if (strcmp(type, "s32") == 0)
		return "%d";
	if (strcmp(type, "u32") == 0)
		return "%u";
	if (strcmp(type, "int") == 0)
		return "%d";
	if (strcmp(type, "unsigned int") == 0)
		return "%u";
	if (strcmp(type, "s16") == 0)
		return "%d";
	if (strcmp(type, "u16") == 0)
		return "%u";
	if (strcmp(type, "short") == 0)
		return "%d";
	if (strcmp(type, "unsigned short") == 0)
		return "%u";
	if (strcmp(type, "s8") == 0)
		return "%d";
	if (strcmp(type, "u8") == 0)
		return "%u";
	if (strcmp(type, "char") == 0)
		return "%d";
	if (strcmp(type, "unsigned char") == 0)
		return "%u";
	if (strstr(type, "char[") != 0)
		return "%s";

	/* Unknown, likely a struct, allowed and treated as 64-bit */
	return "%llu";
}

static bool user_field_is_dyn_string(const char *type, const char **str_func)
{
	if (str_has_prefix(type, "__data_loc ")) {
		*str_func = "__get_str";
		goto check;
	}

	if (str_has_prefix(type, "__rel_loc ")) {
		*str_func = "__get_rel_str";
		goto check;
	}

	return false;
check:
	return strstr(type, "char") != 0;
}

#define LEN_OR_ZERO (len ? len - pos : 0)
static int user_event_set_print_fmt(struct user_event *user, char *buf, int len)
{
	struct ftrace_event_field *field, *next;
	struct list_head *head = &user->fields;
	int pos = 0, depth = 0;
	const char *str_func;

	pos += snprintf(buf + pos, LEN_OR_ZERO, "\"");

	list_for_each_entry_safe_reverse(field, next, head, link) {
		if (depth != 0)
			pos += snprintf(buf + pos, LEN_OR_ZERO, " ");

		pos += snprintf(buf + pos, LEN_OR_ZERO, "%s=%s",
				field->name, user_field_format(field->type));

		depth++;
	}

	pos += snprintf(buf + pos, LEN_OR_ZERO, "\"");

	list_for_each_entry_safe_reverse(field, next, head, link) {
		if (user_field_is_dyn_string(field->type, &str_func))
			pos += snprintf(buf + pos, LEN_OR_ZERO,
					", %s(%s)", str_func, field->name);
		else
			pos += snprintf(buf + pos, LEN_OR_ZERO,
					", REC->%s", field->name);
	}

	return pos + 1;
}
#undef LEN_OR_ZERO

static int user_event_create_print_fmt(struct user_event *user)
{
	char *print_fmt;
	int len;

	len = user_event_set_print_fmt(user, NULL, 0);

	print_fmt = kmalloc(len, GFP_KERNEL);

	if (!print_fmt)
		return -ENOMEM;

	user_event_set_print_fmt(user, print_fmt, len);

	user->call.print_fmt = print_fmt;

	return 0;
}

static enum print_line_t user_event_print_trace(struct trace_iterator *iter,
						int flags,
						struct trace_event *event)
{
	/* Unsafe to try to decode user provided print_fmt, use hex */
	trace_print_hex_dump_seq(&iter->seq, "", DUMP_PREFIX_OFFSET, 16,
				 1, iter->ent, iter->ent_size, true);

	return trace_handle_return(&iter->seq);
}

static struct trace_event_functions user_event_funcs = {
	.trace = user_event_print_trace,
};

static int destroy_user_event(struct user_event *user)
{
	int ret = 0;

	/* Must destroy fields before call removal */
	user_event_destroy_fields(user);

	ret = trace_remove_event_call(&user->call);

	if (ret)
		return ret;

	dyn_event_remove(&user->devent);

	register_page_data[user->index] = 0;
	clear_bit(user->index, page_bitmap);
	hash_del(&user->node);

	user_event_destroy_validators(user);
	kfree(user->call.print_fmt);
	kfree(EVENT_NAME(user));
	kfree(user);

	return ret;
}

static struct user_event *find_user_event(char *name, u32 *outkey)
{
	struct user_event *user;
	u32 key = user_event_key(name);

	*outkey = key;

	hash_for_each_possible(register_table, user, node, key)
		if (!strcmp(EVENT_NAME(user), name)) {
			atomic_inc(&user->refcnt);
			return user;
		}

	return NULL;
}

static int user_event_validate(struct user_event *user, void *data, int len)
{
	struct list_head *head = &user->validators;
	struct user_event_validator *validator;
	void *pos, *end = data + len;
	u32 loc, offset, size;

	list_for_each_entry(validator, head, link) {
		pos = data + validator->offset;

		/* Already done min_size check, no bounds check here */
		loc = *(u32 *)pos;
		offset = loc & 0xffff;
		size = loc >> 16;

		if (likely(validator->flags & VALIDATOR_REL))
			pos += offset + sizeof(loc);
		else
			pos = data + offset;

		pos += size;

		if (unlikely(pos > end))
			return -EFAULT;

		if (likely(validator->flags & VALIDATOR_ENSURE_NULL))
			if (unlikely(*(char *)(pos - 1) != '\0'))
				return -EFAULT;
	}

	return 0;
}

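/*
 * Illustration only: each __data_loc/__rel_loc field is a 32-bit "loc"
 * word packing the dynamic data size in the upper 16 bits and its offset
 * in the lower 16 bits, which is what user_event_validate() unpacks above.
 * A writer could build that word with a hypothetical helper such as:
 *
 *	static inline u32 dyn_loc(u16 offset, u16 size)
 *	{
 *		return ((u32)size << 16) | offset;
 *	}
 *
 * For __rel_loc the offset is counted from the byte after the loc word
 * itself; for __data_loc it is counted from the start of the record
 * (the data argument above).
 */
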
/*
 * Writes the user supplied payload out to a trace file.
 */
static void user_event_ftrace(struct user_event *user, struct iov_iter *i,
			      void *tpdata, bool *faulted)
{
	struct trace_event_file *file;
	struct trace_entry *entry;
	struct trace_event_buffer event_buffer;
	size_t size = sizeof(*entry) + i->count;

	file = (struct trace_event_file *)tpdata;

	if (!file ||
	    !(file->flags & EVENT_FILE_FL_ENABLED) ||
	    trace_trigger_soft_disabled(file))
		return;

	/* Allocates and fills trace_entry, + 1 of this is data payload */
	entry = trace_event_buffer_reserve(&event_buffer, file, size);

	if (unlikely(!entry))
		return;

	if (unlikely(!copy_nofault(entry + 1, i->count, i)))
		goto discard;

	if (!list_empty(&user->validators) &&
	    unlikely(user_event_validate(user, entry, size)))
		goto discard;

	trace_event_buffer_commit(&event_buffer);

	return;
discard:
	*faulted = true;
	__trace_event_discard_commit(event_buffer.buffer,
				     event_buffer.event);
}

#ifdef CONFIG_PERF_EVENTS
static void user_event_bpf(struct user_event *user, struct iov_iter *i)
{
	struct user_bpf_context context;
	struct user_bpf_iter bpf_i;
	char fast_data[MAX_STACK_BPF_DATA];
	void *temp = NULL;

	if ((user->flags & FLAG_BPF_ITER) && iter_is_iovec(i)) {
		/* Raw iterator */
		context.data_type = USER_BPF_DATA_ITER;
		context.data_len = i->count;
		context.iter = &bpf_i;

		bpf_i.iov_offset = i->iov_offset;
		bpf_i.iov = i->iov;
		bpf_i.nr_segs = i->nr_segs;
	} else if (i->nr_segs == 1 && iter_is_iovec(i)) {
		/* Single buffer from user */
		context.data_type = USER_BPF_DATA_USER;
		context.data_len = i->count;
		context.udata = i->iov->iov_base + i->iov_offset;
	} else {
		/* Multi buffer from user */
		struct iov_iter copy = *i;
		size_t copy_size = min_t(size_t, i->count, MAX_BPF_COPY_SIZE);

		context.data_type = USER_BPF_DATA_KERNEL;
		context.kdata = fast_data;

		if (unlikely(copy_size > sizeof(fast_data))) {
			temp = kmalloc(copy_size, GFP_NOWAIT);

			if (temp)
				context.kdata = temp;
			else
				copy_size = sizeof(fast_data);
		}

		context.data_len = copy_nofault(context.kdata,
						copy_size, &copy);
	}

	trace_call_bpf(&user->call, &context);

	kfree(temp);
}

/*
 * Writes the user supplied payload out to perf ring buffer or eBPF program.
 */
static void user_event_perf(struct user_event *user, struct iov_iter *i,
			    void *tpdata, bool *faulted)
{
	struct hlist_head *perf_head;

	if (bpf_prog_array_valid(&user->call))
		user_event_bpf(user, i);

	perf_head = this_cpu_ptr(user->call.perf_events);

	if (perf_head && !hlist_empty(perf_head)) {
		struct trace_entry *perf_entry;
		struct pt_regs *regs;
		size_t size = sizeof(*perf_entry) + i->count;
		int context;

		perf_entry = perf_trace_buf_alloc(ALIGN(size, 8),
						  &regs, &context);

		if (unlikely(!perf_entry))
			return;

		perf_fetch_caller_regs(regs);

		if (unlikely(!copy_nofault(perf_entry + 1, i->count, i)))
			goto discard;

		if (!list_empty(&user->validators) &&
		    unlikely(user_event_validate(user, perf_entry, size)))
			goto discard;

		perf_trace_buf_submit(perf_entry, size, context,
				      user->call.event.type, 1, regs,
				      perf_head, NULL);

		return;
discard:
		*faulted = true;
		perf_swevent_put_recursion_context(context);
	}
}
#endif

/*
 * Update the register page that is shared between user processes.
 */
static void update_reg_page_for(struct user_event *user)
{
	struct tracepoint *tp = &user->tracepoint;
	char status = 0;

	if (atomic_read(&tp->key.enabled) > 0) {
		struct tracepoint_func *probe_func_ptr;
		user_event_func_t probe_func;

		rcu_read_lock_sched();

		probe_func_ptr = rcu_dereference_sched(tp->funcs);

		if (probe_func_ptr) {
			do {
				probe_func = probe_func_ptr->func;

				if (probe_func == user_event_ftrace)
					status |= EVENT_STATUS_FTRACE;
#ifdef CONFIG_PERF_EVENTS
				else if (probe_func == user_event_perf)
					status |= EVENT_STATUS_PERF;
#endif
				else
					status |= EVENT_STATUS_OTHER;
			} while ((++probe_func_ptr)->func);
		}

		rcu_read_unlock_sched();
	}

	register_page_data[user->index] = status;
}

/*
 * Register callback for our events from tracing sub-systems.
 */
static int user_event_reg(struct trace_event_call *call,
			  enum trace_reg type,
			  void *data)
{
	struct user_event *user = (struct user_event *)call->data;
	int ret = 0;

	if (!user)
		return -ENOENT;

	switch (type) {
	case TRACE_REG_REGISTER:
		ret = tracepoint_probe_register(call->tp,
						call->class->probe,
						data);
		if (!ret)
			goto inc;
		break;

	case TRACE_REG_UNREGISTER:
		tracepoint_probe_unregister(call->tp,
					    call->class->probe,
					    data);
		goto dec;

#ifdef CONFIG_PERF_EVENTS
	case TRACE_REG_PERF_REGISTER:
		ret = tracepoint_probe_register(call->tp,
						call->class->perf_probe,
						data);
		if (!ret)
			goto inc;
		break;

	case TRACE_REG_PERF_UNREGISTER:
		tracepoint_probe_unregister(call->tp,
					    call->class->perf_probe,
					    data);
		goto dec;

	case TRACE_REG_PERF_OPEN:
	case TRACE_REG_PERF_CLOSE:
	case TRACE_REG_PERF_ADD:
	case TRACE_REG_PERF_DEL:
		break;
#endif
	}

	return ret;
inc:
	atomic_inc(&user->refcnt);
	update_reg_page_for(user);
	return 0;
dec:
	update_reg_page_for(user);
	atomic_dec(&user->refcnt);
	return 0;
}

static int user_event_create(const char *raw_command)
{
	struct user_event *user;
	char *name;
	int ret;

	if (!str_has_prefix(raw_command, USER_EVENTS_PREFIX))
		return -ECANCELED;

	raw_command += USER_EVENTS_PREFIX_LEN;
	raw_command = skip_spaces(raw_command);

	name = kstrdup(raw_command, GFP_KERNEL);

	if (!name)
		return -ENOMEM;

	mutex_lock(&reg_mutex);

	ret = user_event_parse_cmd(name, &user);

	if (!ret)
		atomic_dec(&user->refcnt);

	mutex_unlock(&reg_mutex);

	if (ret)
		kfree(name);

	return ret;
}

static int user_event_show(struct seq_file *m, struct dyn_event *ev)
{
	struct user_event *user = container_of(ev, struct user_event, devent);
	struct ftrace_event_field *field, *next;
	struct list_head *head;
	int depth = 0;

	seq_printf(m, "%s%s", USER_EVENTS_PREFIX, EVENT_NAME(user));

	head = trace_get_fields(&user->call);

	list_for_each_entry_safe_reverse(field, next, head, link) {
		if (depth == 0)
			seq_puts(m, " ");
		else
			seq_puts(m, "; ");

		seq_printf(m, "%s %s", field->type, field->name);

		if (str_has_prefix(field->type, "struct "))
			seq_printf(m, " %d", field->size);

		depth++;
	}

	seq_puts(m, "\n");

	return 0;
}

static bool user_event_is_busy(struct dyn_event *ev)
{
	struct user_event *user = container_of(ev, struct user_event, devent);

	return atomic_read(&user->refcnt) != 0;
}

static int user_event_free(struct dyn_event *ev)
{
	struct user_event *user = container_of(ev, struct user_event, devent);

	if (atomic_read(&user->refcnt) != 0)
		return -EBUSY;

	return destroy_user_event(user);
}

static bool user_field_match(struct ftrace_event_field *field, int argc,
			     const char **argv, int *iout)
{
	char *field_name, *arg_name;
	int len, pos, i = *iout;
	bool colon = false, match = false;

	if (i >= argc)
		return false;

	len = MAX_FIELD_ARG_NAME;
	field_name = kmalloc(len, GFP_KERNEL);
	arg_name = kmalloc(len, GFP_KERNEL);

	if (!arg_name || !field_name)
		goto out;

	pos = 0;

	for (; i < argc; ++i) {
		if (i != *iout)
			pos += snprintf(arg_name + pos, len - pos, " ");

		pos += snprintf(arg_name + pos, len - pos, argv[i]);

		if (strchr(argv[i], ';')) {
			++i;
			colon = true;
			break;
		}
	}

	pos = 0;

	pos += snprintf(field_name + pos, len - pos, field->type);
	pos += snprintf(field_name + pos, len - pos, " ");
	pos += snprintf(field_name + pos, len - pos, field->name);

	if (colon)
		pos += snprintf(field_name + pos, len - pos, ";");

	*iout = i;

	match = strcmp(arg_name, field_name) == 0;
out:
	kfree(arg_name);
	kfree(field_name);

	return match;
}

static bool user_fields_match(struct user_event *user, int argc,
			      const char **argv)
{
	struct ftrace_event_field *field, *next;
	struct list_head *head = &user->fields;
	int i = 0;

	list_for_each_entry_safe_reverse(field, next, head, link)
		if (!user_field_match(field, argc, argv, &i))
			return false;

	if (i != argc)
		return false;

	return true;
}

static bool user_event_match(const char *system, const char *event,
			     int argc, const char **argv, struct dyn_event *ev)
{
	struct user_event *user = container_of(ev, struct user_event, devent);
	bool match;

	match = strcmp(EVENT_NAME(user), event) == 0 &&
		(!system || strcmp(system, USER_EVENTS_SYSTEM) == 0);

	if (match && argc > 0)
		match = user_fields_match(user, argc, argv);

	return match;
}

static struct dyn_event_operations user_event_dops = {
	.create = user_event_create,
	.show = user_event_show,
	.is_busy = user_event_is_busy,
	.free = user_event_free,
	.match = user_event_match,
};

static int user_event_trace_register(struct user_event *user)
{
	int ret;

	ret = register_trace_event(&user->call.event);

	if (!ret)
		return -ENODEV;

	ret = trace_add_event_call(&user->call);

	if (ret)
		unregister_trace_event(&user->call.event);

	return ret;
}

/*
 * Parses the event name, arguments and flags then registers if successful.
 * The name buffer lifetime is owned by this method for success cases only.
 * Upon success the returned user_event has its ref count increased by 1.
 */
static int user_event_parse(char *name, char *args, char *flags,
			    struct user_event **newuser)
{
	int ret;
	int index;
	u32 key;
	struct user_event *user;

	/* Prevent dyn_event from racing */
	mutex_lock(&event_mutex);
	user = find_user_event(name, &key);
	mutex_unlock(&event_mutex);

	if (user) {
		*newuser = user;
		/*
		 * Name is allocated by caller, free it since it already exists.
		 * Caller only worries about failure cases for freeing.
		 */
		kfree(name);
		return 0;
	}

	index = find_first_zero_bit(page_bitmap, MAX_EVENTS);

	if (index == MAX_EVENTS)
		return -EMFILE;

	user = kzalloc(sizeof(*user), GFP_KERNEL);

	if (!user)
		return -ENOMEM;

	INIT_LIST_HEAD(&user->class.fields);
	INIT_LIST_HEAD(&user->fields);
	INIT_LIST_HEAD(&user->validators);

	user->tracepoint.name = name;

	user_event_parse_flags(user, flags);

	ret = user_event_parse_fields(user, args);

	if (ret)
		goto put_user;

	ret = user_event_create_print_fmt(user);

	if (ret)
		goto put_user;

	user->call.data = user;
	user->call.class = &user->class;
	user->call.name = name;
	user->call.flags = TRACE_EVENT_FL_TRACEPOINT;
	user->call.tp = &user->tracepoint;
	user->call.event.funcs = &user_event_funcs;

	user->class.system = USER_EVENTS_SYSTEM;
	user->class.fields_array = user_event_fields_array;
	user->class.get_fields = user_event_get_fields;
	user->class.reg = user_event_reg;
	user->class.probe = user_event_ftrace;
#ifdef CONFIG_PERF_EVENTS
	user->class.perf_probe = user_event_perf;
#endif

	mutex_lock(&event_mutex);
	ret = user_event_trace_register(user);
	mutex_unlock(&event_mutex);

	if (ret)
		goto put_user;

	user->index = index;

	/* Ensure we track ref */
	atomic_inc(&user->refcnt);

	dyn_event_init(&user->devent, &user_event_dops);
	dyn_event_add(&user->devent, &user->call);
	set_bit(user->index, page_bitmap);
	hash_add(register_table, &user->node, key);

	*newuser = user;
	return 0;
put_user:
	user_event_destroy_fields(user);
	user_event_destroy_validators(user);
	kfree(user);
	return ret;
}

/*
 * Deletes a previously created event if it is no longer being used.
 */
static int delete_user_event(char *name)
{
	u32 key;
	int ret;
	struct user_event *user = find_user_event(name, &key);

	if (!user)
		return -ENOENT;

	/* Ensure we are the last ref */
	if (atomic_read(&user->refcnt) != 1) {
		ret = -EBUSY;
		goto put_ref;
	}

	ret = destroy_user_event(user);

	if (ret)
		goto put_ref;

	return ret;
put_ref:
	/* No longer have this ref */
	atomic_dec(&user->refcnt);

	return ret;
}

/*
 * Validates the user payload and writes via iterator.
 */
static ssize_t user_events_write_core(struct file *file, struct iov_iter *i)
{
	struct user_event_refs *refs;
	struct user_event *user = NULL;
	struct tracepoint *tp;
	ssize_t ret = i->count;
	int idx;

	if (unlikely(copy_from_iter(&idx, sizeof(idx), i) != sizeof(idx)))
		return -EFAULT;

	rcu_read_lock_sched();

	refs = rcu_dereference_sched(file->private_data);

	/*
	 * The refs->events array is protected by RCU, and new items may be
	 * added. But the user retrieved from indexing into the events array
	 * shall be immutable while the file is opened.
	 */
	if (likely(refs && idx < refs->count))
		user = refs->events[idx];

	rcu_read_unlock_sched();

	if (unlikely(user == NULL))
		return -ENOENT;

	if (unlikely(i->count < user->min_size))
		return -EINVAL;

	tp = &user->tracepoint;

	/*
	 * It's possible key.enabled is disabled after this check; however,
	 * we don't mind if a few events get through in that case.
	 */
	if (likely(atomic_read(&tp->key.enabled) > 0)) {
		struct tracepoint_func *probe_func_ptr;
		user_event_func_t probe_func;
		struct iov_iter copy;
		void *tpdata;
		bool faulted;

		if (unlikely(fault_in_iov_iter_readable(i, i->count)))
			return -EFAULT;

		faulted = false;

		rcu_read_lock_sched();

		probe_func_ptr = rcu_dereference_sched(tp->funcs);

		if (probe_func_ptr) {
			do {
				copy = *i;
				probe_func = probe_func_ptr->func;
				tpdata = probe_func_ptr->data;
				probe_func(user, &copy, tpdata, &faulted);
			} while ((++probe_func_ptr)->func);
		}

		rcu_read_unlock_sched();

		if (unlikely(faulted))
			return -EFAULT;
	}

	return ret;
}

static ssize_t user_events_write(struct file *file, const char __user *ubuf,
				 size_t count, loff_t *ppos)
{
	struct iovec iov;
	struct iov_iter i;

	if (unlikely(*ppos != 0))
		return -EFAULT;

	if (unlikely(import_single_range(READ, (char *)ubuf, count, &iov, &i)))
		return -EFAULT;

	return user_events_write_core(file, &i);
}

static ssize_t user_events_write_iter(struct kiocb *kp, struct iov_iter *i)
{
	return user_events_write_core(kp->ki_filp, i);
}

static int user_events_ref_add(struct file *file, struct user_event *user)
{
	struct user_event_refs *refs, *new_refs;
	int i, size, count = 0;

	refs = rcu_dereference_protected(file->private_data,
					 lockdep_is_held(&reg_mutex));

	if (refs) {
		count = refs->count;

		for (i = 0; i < count; ++i)
			if (refs->events[i] == user)
				return i;
	}

	size = struct_size(refs, events, count + 1);

	new_refs = kzalloc(size, GFP_KERNEL);

	if (!new_refs)
		return -ENOMEM;

	new_refs->count = count + 1;

	for (i = 0; i < count; ++i)
		new_refs->events[i] = refs->events[i];

	new_refs->events[i] = user;

	atomic_inc(&user->refcnt);

	rcu_assign_pointer(file->private_data, new_refs);

	if (refs)
		kfree_rcu(refs, rcu);

	return i;
}

static long user_reg_get(struct user_reg __user *ureg, struct user_reg *kreg)
{
	u32 size;
	long ret;

	ret = get_user(size, &ureg->size);

	if (ret)
		return ret;

	if (size > PAGE_SIZE)
		return -E2BIG;

	return copy_struct_from_user(kreg, sizeof(*kreg), ureg, size);
}

/*
 * Registers a user_event on behalf of a user process.
 */
static long user_events_ioctl_reg(struct file *file, unsigned long uarg)
{
	struct user_reg __user *ureg = (struct user_reg __user *)uarg;
	struct user_reg reg;
	struct user_event *user;
	char *name;
	long ret;

	ret = user_reg_get(ureg, &reg);

	if (ret)
		return ret;

	name = strndup_user((const char __user *)(uintptr_t)reg.name_args,
			    MAX_EVENT_DESC);

	if (IS_ERR(name)) {
		ret = PTR_ERR(name);
		return ret;
	}

	ret = user_event_parse_cmd(name, &user);

	if (ret) {
		kfree(name);
		return ret;
	}

	ret = user_events_ref_add(file, user);

	/* No longer need parse ref, ref_add either worked or not */
	atomic_dec(&user->refcnt);

	/* Positive number is index and valid */
	if (ret < 0)
		return ret;

	put_user((u32)ret, &ureg->write_index);
	put_user(user->index, &ureg->status_index);

	return 0;
}

/*
 * Deletes a user_event on behalf of a user process.
 */
static long user_events_ioctl_del(struct file *file, unsigned long uarg)
{
	void __user *ubuf = (void __user *)uarg;
	char *name;
	long ret;

	name = strndup_user(ubuf, MAX_EVENT_DESC);

	if (IS_ERR(name))
		return PTR_ERR(name);

	/* event_mutex prevents dyn_event from racing */
	mutex_lock(&event_mutex);
	ret = delete_user_event(name);
	mutex_unlock(&event_mutex);

	kfree(name);

	return ret;
}

/*
 * Handles the ioctl from user mode to register or alter operations.
 */
static long user_events_ioctl(struct file *file, unsigned int cmd,
			      unsigned long uarg)
{
	long ret = -ENOTTY;

	switch (cmd) {
	case DIAG_IOCSREG:
		mutex_lock(&reg_mutex);
		ret = user_events_ioctl_reg(file, uarg);
		mutex_unlock(&reg_mutex);
		break;

	case DIAG_IOCSDEL:
		mutex_lock(&reg_mutex);
		ret = user_events_ioctl_del(file, uarg);
		mutex_unlock(&reg_mutex);
		break;
	}

	return ret;
}

/*
 * Handles the final close of the file from user mode.
 */
static int user_events_release(struct inode *node, struct file *file)
{
	struct user_event_refs *refs;
	struct user_event *user;
	int i;

	/*
	 * Ensure refs cannot change under any situation by taking the
	 * register mutex during the final freeing of the references.
	 */
	mutex_lock(&reg_mutex);

	refs = file->private_data;

	if (!refs)
		goto out;

	/*
	 * The lifetime of refs has reached an end, it's tied to this file.
	 * The underlying user_events are ref counted, and cannot be freed.
	 * After this decrement, the user_events may be freed elsewhere.
	 */
	for (i = 0; i < refs->count; ++i) {
		user = refs->events[i];

		if (user)
			atomic_dec(&user->refcnt);
	}
out:
	file->private_data = NULL;

	mutex_unlock(&reg_mutex);

	kfree(refs);

	return 0;
}

static const struct file_operations user_data_fops = {
	.write = user_events_write,
	.write_iter = user_events_write_iter,
	.unlocked_ioctl	= user_events_ioctl,
	.release = user_events_release,
};

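/*
 * Illustration only: a minimal user-space sketch of the register and write
 * flow served by user_data_fops above. Error handling is trimmed and the
 * tracefs mount point is an assumption; see the DIAG_IOCSREG handling in
 * user_events_ioctl_reg() for the authoritative contract.
 *
 *	#include <fcntl.h>
 *	#include <stdint.h>
 *	#include <sys/ioctl.h>
 *	#include <sys/uio.h>
 *	#include <unistd.h>
 *	#include <linux/user_events.h>
 *
 *	int emit_test_event(void)
 *	{
 *		const char *cmd = "test char[20] msg;unsigned int id";
 *		struct user_reg reg = { .size = sizeof(reg) };
 *		char msg[20] = "hello";
 *		unsigned int id = 1;
 *		struct iovec io[3];
 *		int data_fd, idx;
 *
 *		data_fd = open("/sys/kernel/tracing/user_events_data", O_RDWR);
 *		reg.name_args = (__u64)(uintptr_t)cmd;
 *
 *		if (data_fd < 0 || ioctl(data_fd, DIAG_IOCSREG, &reg) == -1)
 *			return -1;
 *
 *		idx = reg.write_index;
 *		io[0].iov_base = &idx;
 *		io[0].iov_len = sizeof(idx);
 *		io[1].iov_base = msg;
 *		io[1].iov_len = sizeof(msg);
 *		io[2].iov_base = &id;
 *		io[2].iov_len = sizeof(id);
 *
 *		return writev(data_fd, io, 3) < 0 ? -1 : 0;
 *	}
 */
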
/*
 * Maps the shared page into the user process for checking if event is enabled.
 */
static int user_status_mmap(struct file *file, struct vm_area_struct *vma)
{
	unsigned long size = vma->vm_end - vma->vm_start;

	if (size != MAX_EVENTS)
		return -EINVAL;

	return remap_pfn_range(vma, vma->vm_start,
			       virt_to_phys(register_page_data) >> PAGE_SHIFT,
			       size, vm_get_page_prot(VM_READ));
}

static void *user_seq_start(struct seq_file *m, loff_t *pos)
{
	if (*pos)
		return NULL;

	return (void *)1;
}

static void *user_seq_next(struct seq_file *m, void *p, loff_t *pos)
{
	++*pos;
	return NULL;
}

static void user_seq_stop(struct seq_file *m, void *p)
{
}

static int user_seq_show(struct seq_file *m, void *p)
{
	struct user_event *user;
	char status;
	int i, active = 0, busy = 0, flags;

	mutex_lock(&reg_mutex);

	hash_for_each(register_table, i, user, node) {
		status = register_page_data[user->index];
		flags = user->flags;

		seq_printf(m, "%d:%s", user->index, EVENT_NAME(user));

		if (flags != 0 || status != 0)
			seq_puts(m, " #");

		if (status != 0) {
			seq_puts(m, " Used by");
			if (status & EVENT_STATUS_FTRACE)
				seq_puts(m, " ftrace");
			if (status & EVENT_STATUS_PERF)
				seq_puts(m, " perf");
			if (status & EVENT_STATUS_OTHER)
				seq_puts(m, " other");
			busy++;
		}

		if (flags & FLAG_BPF_ITER)
			seq_puts(m, " FLAG:BPF_ITER");

		seq_puts(m, "\n");
		active++;
	}

	mutex_unlock(&reg_mutex);

	seq_puts(m, "\n");
	seq_printf(m, "Active: %d\n", active);
	seq_printf(m, "Busy: %d\n", busy);
	seq_printf(m, "Max: %ld\n", MAX_EVENTS);

	return 0;
}

static const struct seq_operations user_seq_ops = {
	.start = user_seq_start,
	.next = user_seq_next,
	.stop = user_seq_stop,
	.show = user_seq_show,
};

static int user_status_open(struct inode *node, struct file *file)
{
	return seq_open(file, &user_seq_ops);
}

static const struct file_operations user_status_fops = {
	.open = user_status_open,
	.mmap = user_status_mmap,
	.read = seq_read,
	.llseek = seq_lseek,
	.release = seq_release,
};

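/*
 * Illustration only: the status page mapped by user_status_mmap() lets a
 * process check the byte at the status_index returned by DIAG_IOCSREG
 * before paying for a write. A hedged sketch, continuing the example
 * above (the mapping length must equal MAX_EVENTS, which is one page here
 * since MAX_PAGE_ORDER is 0):
 *
 *	#include <sys/mman.h>
 *
 *	char *status_page;
 *	int status_fd;
 *
 *	status_fd = open("/sys/kernel/tracing/user_events_status", O_RDWR);
 *	status_page = mmap(NULL, sysconf(_SC_PAGESIZE), PROT_READ,
 *			   MAP_SHARED, status_fd, 0);
 *
 *	if (status_page != MAP_FAILED && status_page[reg.status_index])
 *		emit_test_event();
 */
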
/*
 * Creates a set of tracefs files to allow user mode interactions.
 */
static int create_user_tracefs(void)
{
	struct dentry *edata, *emmap;

	edata = tracefs_create_file("user_events_data", TRACE_MODE_WRITE,
				    NULL, NULL, &user_data_fops);

	if (!edata) {
		pr_warn("Could not create tracefs 'user_events_data' entry\n");
		goto err;
	}

	/* mmap with MAP_SHARED requires writable fd */
	emmap = tracefs_create_file("user_events_status", TRACE_MODE_WRITE,
				    NULL, NULL, &user_status_fops);

	if (!emmap) {
		tracefs_remove(edata);
		pr_warn("Could not create tracefs 'user_events_status' entry\n");
		goto err;
	}

	return 0;
err:
	return -ENODEV;
}

static void set_page_reservations(bool set)
{
	int page;

	for (page = 0; page < MAX_PAGES; ++page) {
		void *addr = register_page_data + (PAGE_SIZE * page);

		if (set)
			SetPageReserved(virt_to_page(addr));
		else
			ClearPageReserved(virt_to_page(addr));
	}
}

static int __init trace_events_user_init(void)
{
	struct page *pages;
	int ret;

	/* Zero all bits beside 0 (which is reserved for failures) */
	bitmap_zero(page_bitmap, MAX_EVENTS);
	set_bit(0, page_bitmap);

	pages = alloc_pages(GFP_KERNEL | __GFP_ZERO, MAX_PAGE_ORDER);
	if (!pages)
		return -ENOMEM;
	register_page_data = page_address(pages);

	set_page_reservations(true);

	ret = create_user_tracefs();

	if (ret) {
		pr_warn("user_events could not register with tracefs\n");
		set_page_reservations(false);
		__free_pages(pages, MAX_PAGE_ORDER);
		return ret;
	}

	if (dyn_event_register(&user_event_dops))
		pr_warn("user_events could not register with dyn_events\n");

	return 0;
}

fs_initcall(trace_events_user_init);