xref: /openbmc/linux/kernel/trace/trace_events_user.c (revision 0279400ad38d858ed68f5d787385f6122d4170b2)
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * Copyright (c) 2021, Microsoft Corporation.
4  *
5  * Authors:
6  *   Beau Belgrave <beaub@linux.microsoft.com>
7  */
8 
9 #include <linux/bitmap.h>
10 #include <linux/cdev.h>
11 #include <linux/hashtable.h>
12 #include <linux/list.h>
13 #include <linux/io.h>
14 #include <linux/uio.h>
15 #include <linux/ioctl.h>
16 #include <linux/jhash.h>
17 #include <linux/trace_events.h>
18 #include <linux/tracefs.h>
19 #include <linux/types.h>
20 #include <linux/uaccess.h>
21 #include <uapi/linux/user_events.h>
22 #include "trace.h"
23 #include "trace_dynevent.h"
24 
25 #define USER_EVENTS_PREFIX_LEN (sizeof(USER_EVENTS_PREFIX)-1)
26 
27 #define FIELD_DEPTH_TYPE 0
28 #define FIELD_DEPTH_NAME 1
29 #define FIELD_DEPTH_SIZE 2
30 
31 /*
32  * Limits how many trace_event calls user processes can create.
33  * Must be a multiple of PAGE_SIZE.
34  */
35 #define MAX_PAGES 1
36 #define MAX_EVENTS (MAX_PAGES * PAGE_SIZE)
37 
38 /* Limits how long an event name plus args can be within the subsystem. */
39 #define MAX_EVENT_DESC 512
40 #define EVENT_NAME(user_event) ((user_event)->tracepoint.name)
41 #define MAX_FIELD_ARRAY_SIZE 1024
42 #define MAX_FIELD_ARG_NAME 256
43 
44 #define MAX_BPF_COPY_SIZE PAGE_SIZE
45 #define MAX_STACK_BPF_DATA 512
46 
47 static char *register_page_data;
48 
49 static DEFINE_MUTEX(reg_mutex);
50 static DEFINE_HASHTABLE(register_table, 4);
51 static DECLARE_BITMAP(page_bitmap, MAX_EVENTS);
52 
53 /*
54  * Stores per-event properties. As users register events
55  * within a file, a user_event might be created if it does not
56  * already exist. These are used globally and their lifetime
57  * is tied to the refcnt member. They cannot go away until the
58  * refcnt reaches zero.
59  */
60 struct user_event {
61 	struct tracepoint tracepoint;
62 	struct trace_event_call call;
63 	struct trace_event_class class;
64 	struct dyn_event devent;
65 	struct hlist_node node;
66 	struct list_head fields;
67 	atomic_t refcnt;
68 	int index;
69 	int flags;
70 };
71 
72 /*
73  * Stores per-file event references. As users register events
74  * within a file, this structure is modified and freed via RCU.
75  * The lifetime of this struct is tied to the lifetime of the file.
76  * These are not shared and only accessible by the file that created them.
77  */
78 struct user_event_refs {
79 	struct rcu_head rcu;
80 	int count;
81 	struct user_event *events[];
82 };
83 
84 typedef void (*user_event_func_t) (struct user_event *user, struct iov_iter *i,
85 				   void *tpdata);
86 
87 static int user_event_parse(char *name, char *args, char *flags,
88 			    struct user_event **newuser);
89 
90 static u32 user_event_key(char *name)
91 {
92 	return jhash(name, strlen(name), 0);
93 }
94 
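/*
 * Copies data from the iterator with page faults disabled, for use in
 * contexts where faulting is not allowed. Returns the number of bytes
 * copied, which callers check before committing an event.
 */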
95 static __always_inline __must_check
96 size_t copy_nofault(void *addr, size_t bytes, struct iov_iter *i)
97 {
98 	size_t ret;
99 
100 	pagefault_disable();
101 
102 	ret = copy_from_iter_nocache(addr, bytes, i);
103 
104 	pagefault_enable();
105 
106 	return ret;
107 }
108 
109 static struct list_head *user_event_get_fields(struct trace_event_call *call)
110 {
111 	struct user_event *user = (struct user_event *)call->data;
112 
113 	return &user->fields;
114 }
115 
116 /*
117  * Parses a register command for user_events
118  * Format: event_name[:FLAG1[,FLAG2...]] [field1[;field2...]]
119  *
120  * Example: an event named 'test' with a 20 char 'msg' field followed by
121  * an unsigned int 'id' field:
122  * test char[20] msg;unsigned int id
123  *
124  * NOTE: Offsets are from the user data perspective; they are not from the
125  * trace_entry/buffer perspective. We automatically add the sizes of the
126  * common properties to the offset for the user.
127  */
128 static int user_event_parse_cmd(char *raw_command, struct user_event **newuser)
129 {
130 	char *name = raw_command;
131 	char *args = strpbrk(name, " ");
132 	char *flags;
133 
134 	if (args)
135 		*args++ = '\0';
136 
137 	flags = strpbrk(name, ":");
138 
139 	if (flags)
140 		*flags++ = '\0';
141 
142 	return user_event_parse(name, args, flags, newuser);
143 }
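
/*
 * For example (illustrative only), the raw command
 * "test:BPF_ITER char[20] msg;unsigned int id" is split in place into
 * the name "test", the flags "BPF_ITER" and the args
 * "char[20] msg;unsigned int id" before user_event_parse() runs.
 */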
144 
145 static int user_field_array_size(const char *type)
146 {
147 	const char *start = strchr(type, '[');
148 	char val[8];
149 	char *bracket;
150 	int size = 0;
151 
152 	if (start == NULL)
153 		return -EINVAL;
154 
155 	if (strscpy(val, start + 1, sizeof(val)) <= 0)
156 		return -EINVAL;
157 
158 	bracket = strchr(val, ']');
159 
160 	if (!bracket)
161 		return -EINVAL;
162 
163 	*bracket = '\0';
164 
165 	if (kstrtouint(val, 0, &size))
166 		return -EINVAL;
167 
168 	if (size > MAX_FIELD_ARRAY_SIZE)
169 		return -EINVAL;
170 
171 	return size;
172 }
173 
174 static int user_field_size(const char *type)
175 {
176 	/* long is not allowed from a user, since it's ambiguous in size */
177 	if (strcmp(type, "s64") == 0)
178 		return sizeof(s64);
179 	if (strcmp(type, "u64") == 0)
180 		return sizeof(u64);
181 	if (strcmp(type, "s32") == 0)
182 		return sizeof(s32);
183 	if (strcmp(type, "u32") == 0)
184 		return sizeof(u32);
185 	if (strcmp(type, "int") == 0)
186 		return sizeof(int);
187 	if (strcmp(type, "unsigned int") == 0)
188 		return sizeof(unsigned int);
189 	if (strcmp(type, "s16") == 0)
190 		return sizeof(s16);
191 	if (strcmp(type, "u16") == 0)
192 		return sizeof(u16);
193 	if (strcmp(type, "short") == 0)
194 		return sizeof(short);
195 	if (strcmp(type, "unsigned short") == 0)
196 		return sizeof(unsigned short);
197 	if (strcmp(type, "s8") == 0)
198 		return sizeof(s8);
199 	if (strcmp(type, "u8") == 0)
200 		return sizeof(u8);
201 	if (strcmp(type, "char") == 0)
202 		return sizeof(char);
203 	if (strcmp(type, "unsigned char") == 0)
204 		return sizeof(unsigned char);
205 	if (str_has_prefix(type, "char["))
206 		return user_field_array_size(type);
207 	if (str_has_prefix(type, "unsigned char["))
208 		return user_field_array_size(type);
209 	if (str_has_prefix(type, "__data_loc "))
210 		return sizeof(u32);
211 	if (str_has_prefix(type, "__rel_loc "))
212 		return sizeof(u32);
213 
214 	/* Unknown basic type, error */
215 	return -EINVAL;
216 }
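
/*
 * For example, user_field_size("u32") is 4 and
 * user_field_size("char[20]") resolves through user_field_array_size()
 * to 20, while "long" intentionally fails with -EINVAL.
 */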
217 
218 static void user_event_destroy_fields(struct user_event *user)
219 {
220 	struct ftrace_event_field *field, *next;
221 	struct list_head *head = &user->fields;
222 
223 	list_for_each_entry_safe(field, next, head, link) {
224 		list_del(&field->link);
225 		kfree(field);
226 	}
227 }
228 
229 static int user_event_add_field(struct user_event *user, const char *type,
230 				const char *name, int offset, int size,
231 				int is_signed, int filter_type)
232 {
233 	struct ftrace_event_field *field;
234 
235 	field = kmalloc(sizeof(*field), GFP_KERNEL);
236 
237 	if (!field)
238 		return -ENOMEM;
239 
240 	field->type = type;
241 	field->name = name;
242 	field->offset = offset;
243 	field->size = size;
244 	field->is_signed = is_signed;
245 	field->filter_type = filter_type;
246 
247 	list_add(&field->link, &user->fields);
248 
249 	return 0;
250 }
251 
252 /*
253  * Parses the values of a field within the description
254  * Format: type name [size]
255  */
256 static int user_event_parse_field(char *field, struct user_event *user,
257 				  u32 *offset)
258 {
259 	char *part, *type, *name;
260 	u32 depth = 0, saved_offset = *offset;
261 	int len, size = -EINVAL;
262 	bool is_struct = false;
263 
264 	field = skip_spaces(field);
265 
266 	if (*field == '\0')
267 		return 0;
268 
269 	/* Handle types that have a space within */
270 	len = str_has_prefix(field, "unsigned ");
271 	if (len)
272 		goto skip_next;
273 
274 	len = str_has_prefix(field, "struct ");
275 	if (len) {
276 		is_struct = true;
277 		goto skip_next;
278 	}
279 
280 	len = str_has_prefix(field, "__data_loc unsigned ");
281 	if (len)
282 		goto skip_next;
283 
284 	len = str_has_prefix(field, "__data_loc ");
285 	if (len)
286 		goto skip_next;
287 
288 	len = str_has_prefix(field, "__rel_loc unsigned ");
289 	if (len)
290 		goto skip_next;
291 
292 	len = str_has_prefix(field, "__rel_loc ");
293 	if (len)
294 		goto skip_next;
295 
296 	goto parse;
297 skip_next:
298 	type = field;
299 	field = strpbrk(field + len, " ");
300 
301 	if (field == NULL)
302 		return -EINVAL;
303 
304 	*field++ = '\0';
305 	depth++;
306 parse:
307 	while ((part = strsep(&field, " ")) != NULL) {
308 		switch (depth++) {
309 		case FIELD_DEPTH_TYPE:
310 			type = part;
311 			break;
312 		case FIELD_DEPTH_NAME:
313 			name = part;
314 			break;
315 		case FIELD_DEPTH_SIZE:
316 			if (!is_struct)
317 				return -EINVAL;
318 
319 			if (kstrtou32(part, 10, &size))
320 				return -EINVAL;
321 			break;
322 		default:
323 			return -EINVAL;
324 		}
325 	}
326 
327 	if (depth < FIELD_DEPTH_SIZE)
328 		return -EINVAL;
329 
330 	if (depth == FIELD_DEPTH_SIZE)
331 		size = user_field_size(type);
332 
333 	if (size == 0)
334 		return -EINVAL;
335 
336 	if (size < 0)
337 		return size;
338 
339 	*offset = saved_offset + size;
340 
341 	return user_event_add_field(user, type, name, saved_offset, size,
342 				    type[0] != 'u', FILTER_OTHER);
343 }
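
/*
 * For example, "u32 id" parses as type "u32" and name "id" with the
 * size taken from user_field_size(), while "struct mystruct x 20"
 * keeps its explicit 20 byte size, which only structs may provide.
 */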
344 
345 static void user_event_parse_flags(struct user_event *user, char *flags)
346 {
347 	char *flag;
348 
349 	if (flags == NULL)
350 		return;
351 
352 	while ((flag = strsep(&flags, ",")) != NULL) {
353 		if (strcmp(flag, "BPF_ITER") == 0)
354 			user->flags |= FLAG_BPF_ITER;
355 	}
356 }
357 
358 static int user_event_parse_fields(struct user_event *user, char *args)
359 {
360 	char *field;
361 	u32 offset = sizeof(struct trace_entry);
362 	int ret = -EINVAL;
363 
364 	if (args == NULL)
365 		return 0;
366 
367 	while ((field = strsep(&args, ";")) != NULL) {
368 		ret = user_event_parse_field(field, user, &offset);
369 
370 		if (ret)
371 			break;
372 	}
373 
374 	return ret;
375 }
376 
377 static struct trace_event_fields user_event_fields_array[1];
378 
379 static const char *user_field_format(const char *type)
380 {
381 	if (strcmp(type, "s64") == 0)
382 		return "%lld";
383 	if (strcmp(type, "u64") == 0)
384 		return "%llu";
385 	if (strcmp(type, "s32") == 0)
386 		return "%d";
387 	if (strcmp(type, "u32") == 0)
388 		return "%u";
389 	if (strcmp(type, "int") == 0)
390 		return "%d";
391 	if (strcmp(type, "unsigned int") == 0)
392 		return "%u";
393 	if (strcmp(type, "s16") == 0)
394 		return "%d";
395 	if (strcmp(type, "u16") == 0)
396 		return "%u";
397 	if (strcmp(type, "short") == 0)
398 		return "%d";
399 	if (strcmp(type, "unsigned short") == 0)
400 		return "%u";
401 	if (strcmp(type, "s8") == 0)
402 		return "%d";
403 	if (strcmp(type, "u8") == 0)
404 		return "%u";
405 	if (strcmp(type, "char") == 0)
406 		return "%d";
407 	if (strcmp(type, "unsigned char") == 0)
408 		return "%u";
409 	if (strstr(type, "char[") != NULL)
410 		return "%s";
411 
412 	/* Unknown type, likely a struct; allowed, treat as 64-bit */
413 	return "%llu";
414 }
415 
416 static bool user_field_is_dyn_string(const char *type, const char **str_func)
417 {
418 	if (str_has_prefix(type, "__data_loc ")) {
419 		*str_func = "__get_str";
420 		goto check;
421 	}
422 
423 	if (str_has_prefix(type, "__rel_loc ")) {
424 		*str_func = "__get_rel_str";
425 		goto check;
426 	}
427 
428 	return false;
429 check:
430 	return strstr(type, "char") != NULL;
431 }
432 
433 #define LEN_OR_ZERO (len ? len - pos : 0)
434 static int user_event_set_print_fmt(struct user_event *user, char *buf, int len)
435 {
436 	struct ftrace_event_field *field, *next;
437 	struct list_head *head = &user->fields;
438 	int pos = 0, depth = 0;
439 	const char *str_func;
440 
441 	pos += snprintf(buf + pos, LEN_OR_ZERO, "\"");
442 
443 	list_for_each_entry_safe_reverse(field, next, head, link) {
444 		if (depth != 0)
445 			pos += snprintf(buf + pos, LEN_OR_ZERO, " ");
446 
447 		pos += snprintf(buf + pos, LEN_OR_ZERO, "%s=%s",
448 				field->name, user_field_format(field->type));
449 
450 		depth++;
451 	}
452 
453 	pos += snprintf(buf + pos, LEN_OR_ZERO, "\"");
454 
455 	list_for_each_entry_safe_reverse(field, next, head, link) {
456 		if (user_field_is_dyn_string(field->type, &str_func))
457 			pos += snprintf(buf + pos, LEN_OR_ZERO,
458 					", %s(%s)", str_func, field->name);
459 		else
460 			pos += snprintf(buf + pos, LEN_OR_ZERO,
461 					", REC->%s", field->name);
462 	}
463 
464 	return pos + 1;
465 }
466 #undef LEN_OR_ZERO
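
/*
 * For the example event "test char[20] msg;unsigned int id" the
 * resulting print_fmt is:
 *
 *	"msg=%s id=%u", REC->msg, REC->id
 */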
467 
468 static int user_event_create_print_fmt(struct user_event *user)
469 {
470 	char *print_fmt;
471 	int len;
472 
473 	len = user_event_set_print_fmt(user, NULL, 0);
474 
475 	print_fmt = kmalloc(len, GFP_KERNEL);
476 
477 	if (!print_fmt)
478 		return -ENOMEM;
479 
480 	user_event_set_print_fmt(user, print_fmt, len);
481 
482 	user->call.print_fmt = print_fmt;
483 
484 	return 0;
485 }
486 
487 static enum print_line_t user_event_print_trace(struct trace_iterator *iter,
488 						int flags,
489 						struct trace_event *event)
490 {
491 	/* Unsafe to try to decode a user-provided print_fmt, use hex */
492 	trace_print_hex_dump_seq(&iter->seq, "", DUMP_PREFIX_OFFSET, 16,
493 				 1, iter->ent, iter->ent_size, true);
494 
495 	return trace_handle_return(&iter->seq);
496 }
497 
498 static struct trace_event_functions user_event_funcs = {
499 	.trace = user_event_print_trace,
500 };
501 
502 static int destroy_user_event(struct user_event *user)
503 {
504 	int ret = 0;
505 
506 	/* Must destroy fields before call removal */
507 	user_event_destroy_fields(user);
508 
509 	ret = trace_remove_event_call(&user->call);
510 
511 	if (ret)
512 		return ret;
513 
514 	dyn_event_remove(&user->devent);
515 
516 	register_page_data[user->index] = 0;
517 	clear_bit(user->index, page_bitmap);
518 	hash_del(&user->node);
519 
520 	kfree(user->call.print_fmt);
521 	kfree(EVENT_NAME(user));
522 	kfree(user);
523 
524 	return ret;
525 }
526 
527 static struct user_event *find_user_event(char *name, u32 *outkey)
528 {
529 	struct user_event *user;
530 	u32 key = user_event_key(name);
531 
532 	*outkey = key;
533 
534 	hash_for_each_possible(register_table, user, node, key)
535 		if (!strcmp(EVENT_NAME(user), name))
536 			return user;
537 
538 	return NULL;
539 }
540 
541 /*
542  * Writes the user-supplied payload out to a trace file.
543  */
544 static void user_event_ftrace(struct user_event *user, struct iov_iter *i,
545 			      void *tpdata)
546 {
547 	struct trace_event_file *file;
548 	struct trace_entry *entry;
549 	struct trace_event_buffer event_buffer;
550 
551 	file = (struct trace_event_file *)tpdata;
552 
553 	if (!file ||
554 	    !(file->flags & EVENT_FILE_FL_ENABLED) ||
555 	    trace_trigger_soft_disabled(file))
556 		return;
557 
558 	/* Allocate and fill trace_entry; entry + 1 is the start of the data payload */
559 	entry = trace_event_buffer_reserve(&event_buffer, file,
560 					   sizeof(*entry) + i->count);
561 
562 	if (unlikely(!entry))
563 		return;
564 
565 	if (unlikely(!copy_nofault(entry + 1, i->count, i)))
566 		__trace_event_discard_commit(event_buffer.buffer,
567 					     event_buffer.event);
568 	else
569 		trace_event_buffer_commit(&event_buffer);
570 }
571 
572 #ifdef CONFIG_PERF_EVENTS
573 static void user_event_bpf(struct user_event *user, struct iov_iter *i)
574 {
575 	struct user_bpf_context context;
576 	struct user_bpf_iter bpf_i;
577 	char fast_data[MAX_STACK_BPF_DATA];
578 	void *temp = NULL;
579 
580 	if ((user->flags & FLAG_BPF_ITER) && iter_is_iovec(i)) {
581 		/* Raw iterator */
582 		context.data_type = USER_BPF_DATA_ITER;
583 		context.data_len = i->count;
584 		context.iter = &bpf_i;
585 
586 		bpf_i.iov_offset = i->iov_offset;
587 		bpf_i.iov = i->iov;
588 		bpf_i.nr_segs = i->nr_segs;
589 	} else if (i->nr_segs == 1 && iter_is_iovec(i)) {
590 		/* Single buffer from user */
591 		context.data_type = USER_BPF_DATA_USER;
592 		context.data_len = i->count;
593 		context.udata = i->iov->iov_base + i->iov_offset;
594 	} else {
595 		/* Multi buffer from user */
596 		struct iov_iter copy = *i;
597 		size_t copy_size = min_t(size_t, i->count, MAX_BPF_COPY_SIZE);
598 
599 		context.data_type = USER_BPF_DATA_KERNEL;
600 		context.kdata = fast_data;
601 
602 		if (unlikely(copy_size > sizeof(fast_data))) {
603 			temp = kmalloc(copy_size, GFP_NOWAIT);
604 
605 			if (temp)
606 				context.kdata = temp;
607 			else
608 				copy_size = sizeof(fast_data);
609 		}
610 
611 		context.data_len = copy_nofault(context.kdata,
612 						copy_size, &copy);
613 	}
614 
615 	trace_call_bpf(&user->call, &context);
616 
617 	kfree(temp);
618 }
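
/*
 * Summarizing the cases above: FLAG_BPF_ITER events hand the raw
 * iovecs to BPF (USER_BPF_DATA_ITER), single-segment writes pass the
 * user pointer through (USER_BPF_DATA_USER), and everything else is
 * copied into kernel memory first (USER_BPF_DATA_KERNEL), capped at
 * MAX_BPF_COPY_SIZE.
 */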
619 
620 /*
621  * Writes the user-supplied payload out to the perf ring buffer or an eBPF program.
622  */
623 static void user_event_perf(struct user_event *user, struct iov_iter *i,
624 			    void *tpdata)
625 {
626 	struct hlist_head *perf_head;
627 
628 	if (bpf_prog_array_valid(&user->call))
629 		user_event_bpf(user, i);
630 
631 	perf_head = this_cpu_ptr(user->call.perf_events);
632 
633 	if (perf_head && !hlist_empty(perf_head)) {
634 		struct trace_entry *perf_entry;
635 		struct pt_regs *regs;
636 		size_t size = sizeof(*perf_entry) + i->count;
637 		int context;
638 
639 		perf_entry = perf_trace_buf_alloc(ALIGN(size, 8),
640 						  &regs, &context);
641 
642 		if (unlikely(!perf_entry))
643 			return;
644 
645 		perf_fetch_caller_regs(regs);
646 
647 		if (unlikely(!copy_nofault(perf_entry + 1, i->count, i))) {
648 			perf_swevent_put_recursion_context(context);
649 			return;
650 		}
651 
652 		perf_trace_buf_submit(perf_entry, size, context,
653 				      user->call.event.type, 1, regs,
654 				      perf_head, NULL);
655 	}
656 }
657 #endif
658 
659 /*
660  * Update the register page that is shared between user processes.
661  */
662 static void update_reg_page_for(struct user_event *user)
663 {
664 	struct tracepoint *tp = &user->tracepoint;
665 	char status = 0;
666 
667 	if (atomic_read(&tp->key.enabled) > 0) {
668 		struct tracepoint_func *probe_func_ptr;
669 		user_event_func_t probe_func;
670 
671 		rcu_read_lock_sched();
672 
673 		probe_func_ptr = rcu_dereference_sched(tp->funcs);
674 
675 		if (probe_func_ptr) {
676 			do {
677 				probe_func = probe_func_ptr->func;
678 
679 				if (probe_func == user_event_ftrace)
680 					status |= EVENT_STATUS_FTRACE;
681 #ifdef CONFIG_PERF_EVENTS
682 				else if (probe_func == user_event_perf)
683 					status |= EVENT_STATUS_PERF;
684 #endif
685 				else
686 					status |= EVENT_STATUS_OTHER;
687 			} while ((++probe_func_ptr)->func);
688 		}
689 
690 		rcu_read_unlock_sched();
691 	}
692 
693 	register_page_data[user->index] = status;
694 }
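
/*
 * User processes that mapped the status page can test the byte at
 * their status_index (returned by DIAG_IOCSREG) and skip writing
 * entirely while it is zero; see user_status_mmap() below.
 */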
695 
696 /*
697  * Registration callback for our events, invoked by the tracing sub-systems.
698  */
699 static int user_event_reg(struct trace_event_call *call,
700 			  enum trace_reg type,
701 			  void *data)
702 {
703 	struct user_event *user = (struct user_event *)call->data;
704 	int ret = 0;
705 
706 	if (!user)
707 		return -ENOENT;
708 
709 	switch (type) {
710 	case TRACE_REG_REGISTER:
711 		ret = tracepoint_probe_register(call->tp,
712 						call->class->probe,
713 						data);
714 		if (!ret)
715 			goto inc;
716 		break;
717 
718 	case TRACE_REG_UNREGISTER:
719 		tracepoint_probe_unregister(call->tp,
720 					    call->class->probe,
721 					    data);
722 		goto dec;
723 
724 #ifdef CONFIG_PERF_EVENTS
725 	case TRACE_REG_PERF_REGISTER:
726 		ret = tracepoint_probe_register(call->tp,
727 						call->class->perf_probe,
728 						data);
729 		if (!ret)
730 			goto inc;
731 		break;
732 
733 	case TRACE_REG_PERF_UNREGISTER:
734 		tracepoint_probe_unregister(call->tp,
735 					    call->class->perf_probe,
736 					    data);
737 		goto dec;
738 
739 	case TRACE_REG_PERF_OPEN:
740 	case TRACE_REG_PERF_CLOSE:
741 	case TRACE_REG_PERF_ADD:
742 	case TRACE_REG_PERF_DEL:
743 		break;
744 #endif
745 	}
746 
747 	return ret;
748 inc:
749 	atomic_inc(&user->refcnt);
750 	update_reg_page_for(user);
751 	return 0;
752 dec:
753 	update_reg_page_for(user);
754 	atomic_dec(&user->refcnt);
755 	return 0;
756 }
757 
758 static int user_event_create(const char *raw_command)
759 {
760 	struct user_event *user;
761 	char *name;
762 	int ret;
763 
764 	if (!str_has_prefix(raw_command, USER_EVENTS_PREFIX))
765 		return -ECANCELED;
766 
767 	raw_command += USER_EVENTS_PREFIX_LEN;
768 	raw_command = skip_spaces(raw_command);
769 
770 	name = kstrdup(raw_command, GFP_KERNEL);
771 
772 	if (!name)
773 		return -ENOMEM;
774 
775 	mutex_lock(&reg_mutex);
776 	ret = user_event_parse_cmd(name, &user);
777 	mutex_unlock(&reg_mutex);
778 
779 	if (ret)
780 		kfree(name);
781 
782 	return ret;
783 }
784 
785 static int user_event_show(struct seq_file *m, struct dyn_event *ev)
786 {
787 	struct user_event *user = container_of(ev, struct user_event, devent);
788 	struct ftrace_event_field *field, *next;
789 	struct list_head *head;
790 	int depth = 0;
791 
792 	seq_printf(m, "%s%s", USER_EVENTS_PREFIX, EVENT_NAME(user));
793 
794 	head = trace_get_fields(&user->call);
795 
796 	list_for_each_entry_safe_reverse(field, next, head, link) {
797 		if (depth == 0)
798 			seq_puts(m, " ");
799 		else
800 			seq_puts(m, "; ");
801 
802 		seq_printf(m, "%s %s", field->type, field->name);
803 
804 		if (str_has_prefix(field->type, "struct "))
805 			seq_printf(m, " %d", field->size);
806 
807 		depth++;
808 	}
809 
810 	seq_puts(m, "\n");
811 
812 	return 0;
813 }
814 
815 static bool user_event_is_busy(struct dyn_event *ev)
816 {
817 	struct user_event *user = container_of(ev, struct user_event, devent);
818 
819 	return atomic_read(&user->refcnt) != 0;
820 }
821 
822 static int user_event_free(struct dyn_event *ev)
823 {
824 	struct user_event *user = container_of(ev, struct user_event, devent);
825 
826 	if (atomic_read(&user->refcnt) != 0)
827 		return -EBUSY;
828 
829 	return destroy_user_event(user);
830 }
831 
832 static bool user_field_match(struct ftrace_event_field *field, int argc,
833 			     const char **argv, int *iout)
834 {
835 	char *field_name, *arg_name;
836 	int len, pos, i = *iout;
837 	bool colon = false, match = false;
838 
839 	if (i >= argc)
840 		return false;
841 
842 	len = MAX_FIELD_ARG_NAME;
843 	field_name = kmalloc(len, GFP_KERNEL);
844 	arg_name = kmalloc(len, GFP_KERNEL);
845 
846 	if (!arg_name || !field_name)
847 		goto out;
848 
849 	pos = 0;
850 
851 	for (; i < argc; ++i) {
852 		if (i != *iout)
853 			pos += snprintf(arg_name + pos, len - pos, " ");
854 
855 		pos += snprintf(arg_name + pos, len - pos, "%s", argv[i]);
856 
857 		if (strchr(argv[i], ';')) {
858 			++i;
859 			colon = true;
860 			break;
861 		}
862 	}
863 
864 	pos = 0;
865 
866 	pos += snprintf(field_name + pos, len - pos, "%s", field->type);
867 	pos += snprintf(field_name + pos, len - pos, " ");
868 	pos += snprintf(field_name + pos, len - pos, "%s", field->name);
869 
870 	if (colon)
871 		pos += snprintf(field_name + pos, len - pos, ";");
872 
873 	*iout = i;
874 
875 	match = strcmp(arg_name, field_name) == 0;
876 out:
877 	kfree(arg_name);
878 	kfree(field_name);
879 
880 	return match;
881 }
882 
883 static bool user_fields_match(struct user_event *user, int argc,
884 			      const char **argv)
885 {
886 	struct ftrace_event_field *field, *next;
887 	struct list_head *head = &user->fields;
888 	int i = 0;
889 
890 	list_for_each_entry_safe_reverse(field, next, head, link)
891 		if (!user_field_match(field, argc, argv, &i))
892 			return false;
893 
894 	if (i != argc)
895 		return false;
896 
897 	return true;
898 }
899 
900 static bool user_event_match(const char *system, const char *event,
901 			     int argc, const char **argv, struct dyn_event *ev)
902 {
903 	struct user_event *user = container_of(ev, struct user_event, devent);
904 	bool match;
905 
906 	match = strcmp(EVENT_NAME(user), event) == 0 &&
907 		(!system || strcmp(system, USER_EVENTS_SYSTEM) == 0);
908 
909 	if (match && argc > 0)
910 		match = user_fields_match(user, argc, argv);
911 
912 	return match;
913 }
914 
915 static struct dyn_event_operations user_event_dops = {
916 	.create = user_event_create,
917 	.show = user_event_show,
918 	.is_busy = user_event_is_busy,
919 	.free = user_event_free,
920 	.match = user_event_match,
921 };
922 
923 static int user_event_trace_register(struct user_event *user)
924 {
925 	int ret;
926 
927 	ret = register_trace_event(&user->call.event);
928 
929 	if (!ret)
930 		return -ENODEV;
931 
932 	ret = trace_add_event_call(&user->call);
933 
934 	if (ret)
935 		unregister_trace_event(&user->call.event);
936 
937 	return ret;
938 }
939 
940 /*
941  * Parses the event name, arguments and flags, then registers if successful.
942  * On success this method takes ownership of the name buffer; the caller frees it only on failure.
943  */
944 static int user_event_parse(char *name, char *args, char *flags,
945 			    struct user_event **newuser)
946 {
947 	int ret;
948 	int index;
949 	u32 key;
950 	struct user_event *user = find_user_event(name, &key);
951 
952 	if (user) {
953 		*newuser = user;
954 		/*
955 		 * The name was allocated by the caller; free it here since the
956 		 * event already exists. The caller only frees it on failure.
957 		 */
958 		kfree(name);
959 		return 0;
960 	}
961 
962 	index = find_first_zero_bit(page_bitmap, MAX_EVENTS);
963 
964 	if (index == MAX_EVENTS)
965 		return -EMFILE;
966 
967 	user = kzalloc(sizeof(*user), GFP_KERNEL);
968 
969 	if (!user)
970 		return -ENOMEM;
971 
972 	INIT_LIST_HEAD(&user->class.fields);
973 	INIT_LIST_HEAD(&user->fields);
974 
975 	user->tracepoint.name = name;
976 
977 	user_event_parse_flags(user, flags);
978 
979 	ret = user_event_parse_fields(user, args);
980 
981 	if (ret)
982 		goto put_user;
983 
984 	ret = user_event_create_print_fmt(user);
985 
986 	if (ret)
987 		goto put_user;
988 
989 	user->call.data = user;
990 	user->call.class = &user->class;
991 	user->call.name = name;
992 	user->call.flags = TRACE_EVENT_FL_TRACEPOINT;
993 	user->call.tp = &user->tracepoint;
994 	user->call.event.funcs = &user_event_funcs;
995 
996 	user->class.system = USER_EVENTS_SYSTEM;
997 	user->class.fields_array = user_event_fields_array;
998 	user->class.get_fields = user_event_get_fields;
999 	user->class.reg = user_event_reg;
1000 	user->class.probe = user_event_ftrace;
1001 #ifdef CONFIG_PERF_EVENTS
1002 	user->class.perf_probe = user_event_perf;
1003 #endif
1004 
1005 	mutex_lock(&event_mutex);
1006 	ret = user_event_trace_register(user);
1007 	mutex_unlock(&event_mutex);
1008 
1009 	if (ret)
1010 		goto put_user;
1011 
1012 	user->index = index;
1013 	dyn_event_init(&user->devent, &user_event_dops);
1014 	dyn_event_add(&user->devent, &user->call);
1015 	set_bit(user->index, page_bitmap);
1016 	hash_add(register_table, &user->node, key);
1017 
1018 	*newuser = user;
1019 	return 0;
1020 put_user:
1021 	user_event_destroy_fields(user);
	kfree(user->call.print_fmt);
1022 	kfree(user);
1023 	return ret;
1024 }
1025 
1026 /*
1027  * Deletes a previously created event if it is no longer being used.
1028  */
1029 static int delete_user_event(char *name)
1030 {
1031 	u32 key;
1032 	int ret;
1033 	struct user_event *user = find_user_event(name, &key);
1034 
1035 	if (!user)
1036 		return -ENOENT;
1037 
1038 	if (atomic_read(&user->refcnt) != 0)
1039 		return -EBUSY;
1040 
1041 	mutex_lock(&event_mutex);
1042 	ret = destroy_user_event(user);
1043 	mutex_unlock(&event_mutex);
1044 
1045 	return ret;
1046 }
1047 
1048 /*
1049  * Validates the user payload and delivers it via the iterator to each attached probe.
1050  */
1051 static ssize_t user_events_write_core(struct file *file, struct iov_iter *i)
1052 {
1053 	struct user_event_refs *refs;
1054 	struct user_event *user = NULL;
1055 	struct tracepoint *tp;
1056 	ssize_t ret = i->count;
1057 	int idx;
1058 
1059 	if (unlikely(copy_from_iter(&idx, sizeof(idx), i) != sizeof(idx)))
1060 		return -EFAULT;
1061 
1062 	rcu_read_lock_sched();
1063 
1064 	refs = rcu_dereference_sched(file->private_data);
1065 
1066 	/*
1067 	 * The refs->events array is protected by RCU, and new items may be
1068 	 * added. But the user_event retrieved by indexing into the events
1069 	 * array remains immutable while the file is open.
1070 	 */
1071 	if (likely(refs && idx < refs->count))
1072 		user = refs->events[idx];
1073 
1074 	rcu_read_unlock_sched();
1075 
1076 	if (unlikely(user == NULL))
1077 		return -ENOENT;
1078 
1079 	tp = &user->tracepoint;
1080 
1081 	/*
1082 	 * It's possible key.enabled becomes disabled after this check, but
1083 	 * we don't mind if a few extra events slip through in that window.
1084 	 */
1085 	if (likely(atomic_read(&tp->key.enabled) > 0)) {
1086 		struct tracepoint_func *probe_func_ptr;
1087 		user_event_func_t probe_func;
1088 		struct iov_iter copy;
1089 		void *tpdata;
1090 
1091 		if (unlikely(fault_in_iov_iter_readable(i, i->count)))
1092 			return -EFAULT;
1093 
1094 		rcu_read_lock_sched();
1095 
1096 		probe_func_ptr = rcu_dereference_sched(tp->funcs);
1097 
1098 		if (probe_func_ptr) {
1099 			do {
1100 				copy = *i;
1101 				probe_func = probe_func_ptr->func;
1102 				tpdata = probe_func_ptr->data;
1103 				probe_func(user, &copy, tpdata);
1104 			} while ((++probe_func_ptr)->func);
1105 		}
1106 
1107 		rcu_read_unlock_sched();
1108 	}
1109 
1110 	return ret;
1111 }
1112 
1113 static ssize_t user_events_write(struct file *file, const char __user *ubuf,
1114 				 size_t count, loff_t *ppos)
1115 {
1116 	struct iovec iov;
1117 	struct iov_iter i;
1118 
1119 	if (unlikely(*ppos != 0))
1120 		return -EFAULT;
1121 
1122 	if (unlikely(import_single_range(READ, (char *)ubuf, count, &iov, &i)))
1123 		return -EFAULT;
1124 
1125 	return user_events_write_core(file, &i);
1126 }
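
/*
 * A minimal user-space sketch (illustrative, not part of this file) of
 * the payload layout this expects: the first 4 bytes of each write are
 * the write_index returned by DIAG_IOCSREG, followed by the event data.
 *
 *	struct iovec io[2];
 *	__u32 idx = reg.write_index;
 *	int id = 42;
 *
 *	io[0].iov_base = &idx;
 *	io[0].iov_len = sizeof(idx);
 *	io[1].iov_base = &id;
 *	io[1].iov_len = sizeof(id);
 *
 *	writev(data_fd, io, 2);
 *
 * data_fd here is an open fd for the user_events_data file.
 */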
1127 
1128 static ssize_t user_events_write_iter(struct kiocb *kp, struct iov_iter *i)
1129 {
1130 	return user_events_write_core(kp->ki_filp, i);
1131 }
1132 
1133 static int user_events_ref_add(struct file *file, struct user_event *user)
1134 {
1135 	struct user_event_refs *refs, *new_refs;
1136 	int i, size, count = 0;
1137 
1138 	refs = rcu_dereference_protected(file->private_data,
1139 					 lockdep_is_held(&reg_mutex));
1140 
1141 	if (refs) {
1142 		count = refs->count;
1143 
1144 		for (i = 0; i < count; ++i)
1145 			if (refs->events[i] == user)
1146 				return i;
1147 	}
1148 
1149 	size = struct_size(refs, events, count + 1);
1150 
1151 	new_refs = kzalloc(size, GFP_KERNEL);
1152 
1153 	if (!new_refs)
1154 		return -ENOMEM;
1155 
1156 	new_refs->count = count + 1;
1157 
1158 	for (i = 0; i < count; ++i)
1159 		new_refs->events[i] = refs->events[i];
1160 
1161 	new_refs->events[i] = user;
1162 
1163 	atomic_inc(&user->refcnt);
1164 
1165 	rcu_assign_pointer(file->private_data, new_refs);
1166 
1167 	if (refs)
1168 		kfree_rcu(refs, rcu);
1169 
1170 	return i;
1171 }
1172 
1173 static long user_reg_get(struct user_reg __user *ureg, struct user_reg *kreg)
1174 {
1175 	u32 size;
1176 	long ret;
1177 
1178 	ret = get_user(size, &ureg->size);
1179 
1180 	if (ret)
1181 		return ret;
1182 
1183 	if (size > PAGE_SIZE)
1184 		return -E2BIG;
1185 
1186 	return copy_struct_from_user(kreg, sizeof(*kreg), ureg, size);
1187 }
1188 
1189 /*
1190  * Registers a user_event on behalf of a user process.
1191  */
1192 static long user_events_ioctl_reg(struct file *file, unsigned long uarg)
1193 {
1194 	struct user_reg __user *ureg = (struct user_reg __user *)uarg;
1195 	struct user_reg reg;
1196 	struct user_event *user;
1197 	char *name;
1198 	long ret;
1199 
1200 	ret = user_reg_get(ureg, &reg);
1201 
1202 	if (ret)
1203 		return ret;
1204 
1205 	name = strndup_user((const char __user *)(uintptr_t)reg.name_args,
1206 			    MAX_EVENT_DESC);
1207 
1208 	if (IS_ERR(name)) {
1209 		ret = PTR_ERR(name);
1210 		return ret;
1211 	}
1212 
1213 	ret = user_event_parse_cmd(name, &user);
1214 
1215 	if (ret) {
1216 		kfree(name);
1217 		return ret;
1218 	}
1219 
1220 	ret = user_events_ref_add(file, user);
1221 
1222 	/* A non-negative return is a valid write index */
1223 	if (ret < 0)
1224 		return ret;
1225 
1226 	put_user((u32)ret, &ureg->write_index);
1227 	put_user(user->index, &ureg->status_index);
1228 
1229 	return 0;
1230 }
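
/*
 * A minimal user-space sketch (illustrative only) of driving this
 * ioctl, assuming <linux/user_events.h> for struct user_reg and
 * DIAG_IOCSREG:
 *
 *	struct user_reg reg = {0};
 *
 *	reg.size = sizeof(reg);
 *	reg.name_args = (__u64)(uintptr_t)"test char[20] msg;unsigned int id";
 *
 *	if (ioctl(data_fd, DIAG_IOCSREG, &reg) == -1)
 *		return errno;
 *
 * On success reg.write_index names the event in write() payloads and
 * reg.status_index selects its byte in the mmap'ed status page.
 */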
1231 
1232 /*
1233  * Deletes a user_event on behalf of a user process.
1234  */
1235 static long user_events_ioctl_del(struct file *file, unsigned long uarg)
1236 {
1237 	void __user *ubuf = (void __user *)uarg;
1238 	char *name;
1239 	long ret;
1240 
1241 	name = strndup_user(ubuf, MAX_EVENT_DESC);
1242 
1243 	if (IS_ERR(name))
1244 		return PTR_ERR(name);
1245 
1246 	ret = delete_user_event(name);
1247 
1248 	kfree(name);
1249 
1250 	return ret;
1251 }
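
/*
 * The matching user-space call (illustrative only) passes the event
 * name directly:
 *
 *	ioctl(data_fd, DIAG_IOCSDEL, "test");
 */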
1252 
1253 /*
1254  * Handles ioctls from user mode to register or delete events.
1255  */
1256 static long user_events_ioctl(struct file *file, unsigned int cmd,
1257 			      unsigned long uarg)
1258 {
1259 	long ret = -ENOTTY;
1260 
1261 	switch (cmd) {
1262 	case DIAG_IOCSREG:
1263 		mutex_lock(&reg_mutex);
1264 		ret = user_events_ioctl_reg(file, uarg);
1265 		mutex_unlock(&reg_mutex);
1266 		break;
1267 
1268 	case DIAG_IOCSDEL:
1269 		mutex_lock(&reg_mutex);
1270 		ret = user_events_ioctl_del(file, uarg);
1271 		mutex_unlock(&reg_mutex);
1272 		break;
1273 	}
1274 
1275 	return ret;
1276 }
1277 
1278 /*
1279  * Handles the final close of the file from user mode.
1280  */
1281 static int user_events_release(struct inode *node, struct file *file)
1282 {
1283 	struct user_event_refs *refs;
1284 	struct user_event *user;
1285 	int i;
1286 
1287 	/*
1288 	 * Ensure refs cannot change under any situation by taking the
1289 	 * register mutex during the final freeing of the references.
1290 	 */
1291 	mutex_lock(&reg_mutex);
1292 
1293 	refs = file->private_data;
1294 
1295 	if (!refs)
1296 		goto out;
1297 
1298 	/*
1299 	 * The lifetime of refs has reached an end; it's tied to this file.
1300 	 * The underlying user_events are refcounted and cannot be freed while
1301 	 * referenced here. After this decrement, they may be freed elsewhere.
1302 	 */
1303 	for (i = 0; i < refs->count; ++i) {
1304 		user = refs->events[i];
1305 
1306 		if (user)
1307 			atomic_dec(&user->refcnt);
1308 	}
1309 out:
1310 	file->private_data = NULL;
1311 
1312 	mutex_unlock(&reg_mutex);
1313 
1314 	kfree(refs);
1315 
1316 	return 0;
1317 }
1318 
1319 static const struct file_operations user_data_fops = {
1320 	.write = user_events_write,
1321 	.write_iter = user_events_write_iter,
1322 	.unlocked_ioctl	= user_events_ioctl,
1323 	.release = user_events_release,
1324 };
1325 
1326 /*
1327  * Maps the shared page into the user process for checking whether an event is enabled.
1328  */
1329 static int user_status_mmap(struct file *file, struct vm_area_struct *vma)
1330 {
1331 	unsigned long size = vma->vm_end - vma->vm_start;
1332 
1333 	if (size != MAX_EVENTS)
1334 		return -EINVAL;
1335 
1336 	return remap_pfn_range(vma, vma->vm_start,
1337 			       virt_to_phys(register_page_data) >> PAGE_SHIFT,
1338 			       size, vm_get_page_prot(VM_READ));
1339 }
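
/*
 * A minimal user-space sketch (illustrative only) of consuming this
 * mapping; the fd must be opened writable per the fops comment below:
 *
 *	int status_fd = open("/sys/kernel/tracing/user_events_status",
 *			     O_RDWR);
 *	char *status = mmap(NULL, 4096, PROT_READ, MAP_SHARED,
 *			    status_fd, 0);
 *
 *	if (status[reg.status_index])
 *		writev(data_fd, io, 2);
 *
 * The length passed to mmap() must equal MAX_EVENTS (one page here),
 * and 4096 assumes a 4K PAGE_SIZE.
 */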
1340 
1341 static void *user_seq_start(struct seq_file *m, loff_t *pos)
1342 {
1343 	if (*pos)
1344 		return NULL;
1345 
1346 	return (void *)1;
1347 }
1348 
1349 static void *user_seq_next(struct seq_file *m, void *p, loff_t *pos)
1350 {
1351 	++*pos;
1352 	return NULL;
1353 }
1354 
1355 static void user_seq_stop(struct seq_file *m, void *p)
1356 {
1357 }
1358 
1359 static int user_seq_show(struct seq_file *m, void *p)
1360 {
1361 	struct user_event *user;
1362 	char status;
1363 	int i, active = 0, busy = 0, flags;
1364 
1365 	mutex_lock(&reg_mutex);
1366 
1367 	hash_for_each(register_table, i, user, node) {
1368 		status = register_page_data[user->index];
1369 		flags = user->flags;
1370 
1371 		seq_printf(m, "%d:%s", user->index, EVENT_NAME(user));
1372 
1373 		if (flags != 0 || status != 0)
1374 			seq_puts(m, " #");
1375 
1376 		if (status != 0) {
1377 			seq_puts(m, " Used by");
1378 			if (status & EVENT_STATUS_FTRACE)
1379 				seq_puts(m, " ftrace");
1380 			if (status & EVENT_STATUS_PERF)
1381 				seq_puts(m, " perf");
1382 			if (status & EVENT_STATUS_OTHER)
1383 				seq_puts(m, " other");
1384 			busy++;
1385 		}
1386 
1387 		if (flags & FLAG_BPF_ITER)
1388 			seq_puts(m, " FLAG:BPF_ITER");
1389 
1390 		seq_puts(m, "\n");
1391 		active++;
1392 	}
1393 
1394 	mutex_unlock(&reg_mutex);
1395 
1396 	seq_puts(m, "\n");
1397 	seq_printf(m, "Active: %d\n", active);
1398 	seq_printf(m, "Busy: %d\n", busy);
1399 	seq_printf(m, "Max: %ld\n", MAX_EVENTS);
1400 
1401 	return 0;
1402 }
1403 
1404 static const struct seq_operations user_seq_ops = {
1405 	.start = user_seq_start,
1406 	.next  = user_seq_next,
1407 	.stop  = user_seq_stop,
1408 	.show  = user_seq_show,
1409 };
1410 
1411 static int user_status_open(struct inode *node, struct file *file)
1412 {
1413 	return seq_open(file, &user_seq_ops);
1414 }
1415 
1416 static const struct file_operations user_status_fops = {
1417 	.open = user_status_open,
1418 	.mmap = user_status_mmap,
1419 	.read = seq_read,
1420 	.llseek  = seq_lseek,
1421 	.release = seq_release,
1422 };
1423 
1424 /*
1425  * Creates a set of tracefs files to allow user mode interactions.
1426  */
1427 static int create_user_tracefs(void)
1428 {
1429 	struct dentry *edata, *emmap;
1430 
1431 	edata = tracefs_create_file("user_events_data", TRACE_MODE_WRITE,
1432 				    NULL, NULL, &user_data_fops);
1433 
1434 	if (!edata) {
1435 		pr_warn("Could not create tracefs 'user_events_data' entry\n");
1436 		goto err;
1437 	}
1438 
1439 	/* mmap with MAP_SHARED requires a writable fd */
1440 	emmap = tracefs_create_file("user_events_status", TRACE_MODE_WRITE,
1441 				    NULL, NULL, &user_status_fops);
1442 
1443 	if (!emmap) {
1444 		tracefs_remove(edata);
1445 		pr_warn("Could not create tracefs 'user_events_status' entry\n");
1446 		goto err;
1447 	}
1448 
1449 	return 0;
1450 err:
1451 	return -ENODEV;
1452 }
1453 
1454 static void set_page_reservations(bool set)
1455 {
1456 	int page;
1457 
1458 	for (page = 0; page < MAX_PAGES; ++page) {
1459 		void *addr = register_page_data + (PAGE_SIZE * page);
1460 
1461 		if (set)
1462 			SetPageReserved(virt_to_page(addr));
1463 		else
1464 			ClearPageReserved(virt_to_page(addr));
1465 	}
1466 }
1467 
1468 static int __init trace_events_user_init(void)
1469 {
1470 	int ret;
1471 
1472 	/* Clear all bits, then set bit 0 (which is reserved for failures) */
1473 	bitmap_zero(page_bitmap, MAX_EVENTS);
1474 	set_bit(0, page_bitmap);
1475 
1476 	register_page_data = kzalloc(MAX_EVENTS, GFP_KERNEL);
1477 
1478 	if (!register_page_data)
1479 		return -ENOMEM;
1480 
1481 	set_page_reservations(true);
1482 
1483 	ret = create_user_tracefs();
1484 
1485 	if (ret) {
1486 		pr_warn("user_events could not register with tracefs\n");
1487 		set_page_reservations(false);
1488 		kfree(register_page_data);
1489 		return ret;
1490 	}
1491 
1492 	if (dyn_event_register(&user_event_dops))
1493 		pr_warn("user_events could not register with dyn_events\n");
1494 
1495 	return 0;
1496 }
1497 
1498 fs_initcall(trace_events_user_init);
1499